From 1d5ae1026e831016fc29fd927877c86af904481f Mon Sep 17 00:00:00 2001
From: Dimitry Andric
Date: Wed, 23 Oct 2019 17:51:42 +0000
Subject: Vendor import of stripped llvm trunk r375505, the last commit before
 the upstream Subversion repository was made read-only, and the LLVM project
 migrated to GitHub:

 https://llvm.org/svn/llvm-project/llvm/trunk@375505

---
 include/llvm-c/Core.h | 22 +- include/llvm-c/DebugInfo.h | 47 +- include/llvm-c/Remarks.h | 17 +- include/llvm-c/Transforms/IPO.h | 18 + include/llvm-c/Transforms/Scalar.h | 6 + include/llvm-c/lto.h | 94 +- include/llvm/ADT/APFloat.h | 5 + include/llvm/ADT/APInt.h | 9 + include/llvm/ADT/Any.h | 4 +- include/llvm/ADT/ArrayRef.h | 6 + include/llvm/ADT/DenseMap.h | 57 +- include/llvm/ADT/DenseMapInfo.h | 13 +- include/llvm/ADT/DirectedGraph.h | 270 + include/llvm/ADT/Hashing.h | 1 - include/llvm/ADT/IntervalMap.h | 4 +- include/llvm/ADT/PointerIntPair.h | 11 +- include/llvm/ADT/PointerUnion.h | 30 +- include/llvm/ADT/STLExtras.h | 168 +- include/llvm/ADT/SmallBitVector.h | 2 +- include/llvm/ADT/Statistic.h | 102 +- include/llvm/ADT/StringExtras.h | 2 +- include/llvm/ADT/StringMap.h | 59 +- include/llvm/ADT/StringRef.h | 18 +- include/llvm/ADT/StringSet.h | 8 +- include/llvm/ADT/TinyPtrVector.h | 38 +- include/llvm/ADT/VariadicFunction.h | 330 -- include/llvm/ADT/iterator_range.h | 1 + include/llvm/Analysis/AliasAnalysis.h | 2 +- include/llvm/Analysis/AliasSetTracker.h | 5 +- include/llvm/Analysis/AssumptionCache.h | 4 +- include/llvm/Analysis/CFG.h | 2 + include/llvm/Analysis/CFLAndersAliasAnalysis.h | 5 +- include/llvm/Analysis/CFLSteensAliasAnalysis.h | 5 +- include/llvm/Analysis/CGSCCPassManager.h | 31 +- include/llvm/Analysis/CaptureTracking.h | 6 + include/llvm/Analysis/DDG.h | 430 ++ include/llvm/Analysis/DOTGraphTraitsPass.h | 4 +- include/llvm/Analysis/DependenceGraphBuilder.h | 119 + include/llvm/Analysis/DivergenceAnalysis.h | 16 +- include/llvm/Analysis/GlobalsModRef.h | 12 +- include/llvm/Analysis/InstructionSimplify.h | 36 +- include/llvm/Analysis/LazyCallGraph.h | 10 +- include/llvm/Analysis/LegacyDivergenceAnalysis.h | 16 +- include/llvm/Analysis/Loads.h | 22 +- include/llvm/Analysis/LoopAnalysisManager.h | 10 +- include/llvm/Analysis/LoopCacheAnalysis.h | 281 + include/llvm/Analysis/LoopInfo.h | 37 +- include/llvm/Analysis/LoopInfoImpl.h | 8 +- include/llvm/Analysis/MemoryBuiltins.h | 26 +- include/llvm/Analysis/MemoryDependenceAnalysis.h | 14 +- include/llvm/Analysis/MemorySSA.h | 4 +- include/llvm/Analysis/MemorySSAUpdater.h | 3 +- include/llvm/Analysis/MustExecute.h | 285 +- include/llvm/Analysis/Passes.h | 7 + include/llvm/Analysis/ProfileSummaryInfo.h | 23 + include/llvm/Analysis/RegionInfoImpl.h | 2 +- include/llvm/Analysis/ScalarEvolution.h | 6 +- include/llvm/Analysis/ScalarEvolutionExpander.h | 22 +- include/llvm/Analysis/TargetLibraryInfo.h | 17 +- include/llvm/Analysis/TargetTransformInfo.h | 180 +- include/llvm/Analysis/TargetTransformInfoImpl.h | 55 +- include/llvm/Analysis/TypeMetadataUtils.h | 2 + include/llvm/Analysis/Utils/Local.h | 22 +- include/llvm/Analysis/ValueTracking.h | 67 +- include/llvm/Analysis/VectorUtils.h | 144 +- include/llvm/BinaryFormat/Dwarf.def | 198 +- include/llvm/BinaryFormat/Dwarf.h | 125 +- include/llvm/BinaryFormat/ELF.h | 66 + include/llvm/BinaryFormat/ELFRelocs/AArch64.def | 7 +- include/llvm/BinaryFormat/MachO.h | 5 + include/llvm/BinaryFormat/Magic.h | 1 + include/llvm/BinaryFormat/Minidump.h | 68 + include/llvm/BinaryFormat/MinidumpConstants.def | 41 +- 
include/llvm/BinaryFormat/Wasm.h | 14 + include/llvm/BinaryFormat/XCOFF.h | 116 +- include/llvm/Bitcode/BitcodeAnalyzer.h | 1 + include/llvm/Bitcode/LLVMBitCodes.h | 2 +- include/llvm/Bitstream/BitCodes.h | 5 + include/llvm/Bitstream/BitstreamReader.h | 1 + include/llvm/CodeGen/AccelTable.h | 2 - include/llvm/CodeGen/AsmPrinter.h | 21 +- include/llvm/CodeGen/BasicTTIImpl.h | 73 +- include/llvm/CodeGen/CallingConvLower.h | 18 +- include/llvm/CodeGen/DFAPacketizer.h | 44 +- include/llvm/CodeGen/DIE.h | 12 + include/llvm/CodeGen/FastISel.h | 4 +- include/llvm/CodeGen/FunctionLoweringInfo.h | 2 +- include/llvm/CodeGen/GlobalISel/CallLowering.h | 127 +- include/llvm/CodeGen/GlobalISel/CombinerHelper.h | 127 +- include/llvm/CodeGen/GlobalISel/CombinerInfo.h | 15 +- .../CodeGen/GlobalISel/ConstantFoldingMIRBuilder.h | 11 + include/llvm/CodeGen/GlobalISel/GISelKnownBits.h | 111 + include/llvm/CodeGen/GlobalISel/IRTranslator.h | 12 +- .../llvm/CodeGen/GlobalISel/InstructionSelector.h | 34 +- .../CodeGen/GlobalISel/InstructionSelectorImpl.h | 66 +- .../GlobalISel/LegalizationArtifactCombiner.h | 92 +- include/llvm/CodeGen/GlobalISel/LegalizerHelper.h | 20 + include/llvm/CodeGen/GlobalISel/LegalizerInfo.h | 61 +- include/llvm/CodeGen/GlobalISel/MIPatternMatch.h | 20 +- include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h | 93 +- include/llvm/CodeGen/GlobalISel/Utils.h | 22 +- include/llvm/CodeGen/ISDOpcodes.h | 41 +- include/llvm/CodeGen/LiveInterval.h | 6 +- include/llvm/CodeGen/LiveIntervals.h | 21 +- include/llvm/CodeGen/LiveRangeCalc.h | 295 + include/llvm/CodeGen/LiveRegUnits.h | 4 +- include/llvm/CodeGen/MIRYamlMapping.h | 1 + include/llvm/CodeGen/MachineBasicBlock.h | 36 +- include/llvm/CodeGen/MachineCombinerPattern.h | 21 + include/llvm/CodeGen/MachineDominators.h | 63 +- include/llvm/CodeGen/MachineFrameInfo.h | 62 +- include/llvm/CodeGen/MachineFunction.h | 61 +- include/llvm/CodeGen/MachineInstr.h | 99 +- include/llvm/CodeGen/MachineInstrBuilder.h | 62 +- include/llvm/CodeGen/MachineLoopUtils.h | 41 + include/llvm/CodeGen/MachineMemOperand.h | 7 - include/llvm/CodeGen/MachineModuleInfo.h | 52 +- include/llvm/CodeGen/MachineOperand.h | 49 +- include/llvm/CodeGen/MachinePipeliner.h | 80 +- include/llvm/CodeGen/MachinePostDominators.h | 46 +- include/llvm/CodeGen/MachineRegionInfo.h | 2 +- include/llvm/CodeGen/MachineRegisterInfo.h | 70 +- include/llvm/CodeGen/MachineScheduler.h | 1 + include/llvm/CodeGen/ModuloSchedule.h | 367 ++ include/llvm/CodeGen/PBQP/Math.h | 12 +- include/llvm/CodeGen/Passes.h | 4 + include/llvm/CodeGen/Register.h | 118 +- include/llvm/CodeGen/RegisterClassInfo.h | 2 +- include/llvm/CodeGen/RegisterPressure.h | 9 +- include/llvm/CodeGen/RegisterScavenging.h | 24 +- include/llvm/CodeGen/ScheduleDAGInstrs.h | 12 +- include/llvm/CodeGen/SelectionDAG.h | 95 +- include/llvm/CodeGen/SelectionDAGISel.h | 36 +- include/llvm/CodeGen/SelectionDAGNodes.h | 105 +- include/llvm/CodeGen/StackProtector.h | 6 + include/llvm/CodeGen/SwitchLoweringUtils.h | 5 +- include/llvm/CodeGen/TargetCallingConv.h | 23 +- include/llvm/CodeGen/TargetFrameLowering.h | 30 +- include/llvm/CodeGen/TargetInstrInfo.h | 102 +- include/llvm/CodeGen/TargetLowering.h | 399 +- .../llvm/CodeGen/TargetLoweringObjectFileImpl.h | 34 +- include/llvm/CodeGen/TargetPassConfig.h | 2 +- include/llvm/CodeGen/TargetRegisterInfo.h | 94 +- include/llvm/CodeGen/TargetSubtargetInfo.h | 10 +- include/llvm/CodeGen/ValueTypes.h | 4 +- include/llvm/CodeGen/ValueTypes.td | 247 +- include/llvm/CodeGen/VirtRegMap.h | 43 +- 
include/llvm/DebugInfo/CodeView/CVTypeVisitor.h | 4 - include/llvm/DebugInfo/CodeView/CodeViewRecordIO.h | 13 +- .../llvm/DebugInfo/CodeView/CodeViewRegisters.def | 128 + include/llvm/DebugInfo/CodeView/EnumTables.h | 11 + .../llvm/DebugInfo/CodeView/SymbolDeserializer.h | 2 +- include/llvm/DebugInfo/CodeView/SymbolRecord.h | 304 +- include/llvm/DebugInfo/CodeView/TypeDeserializer.h | 2 +- .../llvm/DebugInfo/CodeView/TypeRecordMapping.h | 1 + .../CodeView/TypeVisitorCallbackPipeline.h | 5 - include/llvm/DebugInfo/DIContext.h | 14 +- .../DebugInfo/DWARF/DWARFAbbreviationDeclaration.h | 4 +- .../llvm/DebugInfo/DWARF/DWARFAcceleratorTable.h | 68 +- include/llvm/DebugInfo/DWARF/DWARFAttribute.h | 2 +- include/llvm/DebugInfo/DWARF/DWARFContext.h | 8 +- include/llvm/DebugInfo/DWARF/DWARFDataExtractor.h | 13 +- include/llvm/DebugInfo/DWARF/DWARFDebugAbbrev.h | 6 +- include/llvm/DebugInfo/DWARF/DWARFDebugAddr.h | 6 +- include/llvm/DebugInfo/DWARF/DWARFDebugArangeSet.h | 4 +- include/llvm/DebugInfo/DWARF/DWARFDebugAranges.h | 8 +- include/llvm/DebugInfo/DWARF/DWARFDebugFrame.h | 2 +- include/llvm/DebugInfo/DWARF/DWARFDebugInfoEntry.h | 10 +- include/llvm/DebugInfo/DWARF/DWARFDebugLine.h | 27 +- include/llvm/DebugInfo/DWARF/DWARFDebugLoc.h | 35 +- include/llvm/DebugInfo/DWARF/DWARFDebugPubTable.h | 4 +- include/llvm/DebugInfo/DWARF/DWARFDebugRangeList.h | 7 +- include/llvm/DebugInfo/DWARF/DWARFDebugRnglists.h | 2 +- include/llvm/DebugInfo/DWARF/DWARFDie.h | 2 +- include/llvm/DebugInfo/DWARF/DWARFExpression.h | 14 +- include/llvm/DebugInfo/DWARF/DWARFFormValue.h | 10 +- include/llvm/DebugInfo/DWARF/DWARFListTable.h | 77 +- include/llvm/DebugInfo/DWARF/DWARFObject.h | 30 +- include/llvm/DebugInfo/DWARF/DWARFTypeUnit.h | 2 +- include/llvm/DebugInfo/DWARF/DWARFUnit.h | 51 +- include/llvm/DebugInfo/DWARF/DWARFUnitIndex.h | 2 +- include/llvm/DebugInfo/DWARF/DWARFVerifier.h | 4 +- include/llvm/DebugInfo/GSYM/FileEntry.h | 7 +- include/llvm/DebugInfo/GSYM/FileWriter.h | 124 + include/llvm/DebugInfo/GSYM/FunctionInfo.h | 154 +- include/llvm/DebugInfo/GSYM/GsymCreator.h | 229 + include/llvm/DebugInfo/GSYM/GsymReader.h | 228 + include/llvm/DebugInfo/GSYM/Header.h | 129 + include/llvm/DebugInfo/GSYM/InlineInfo.h | 63 +- include/llvm/DebugInfo/GSYM/LineEntry.h | 7 +- include/llvm/DebugInfo/GSYM/LineTable.h | 198 + include/llvm/DebugInfo/GSYM/Range.h | 33 +- include/llvm/DebugInfo/GSYM/StringTable.h | 7 +- include/llvm/DebugInfo/PDB/GenericError.h | 2 +- include/llvm/DebugInfo/PDB/Native/SymbolCache.h | 2 +- include/llvm/DebugInfo/PDB/PDBSymbol.h | 2 +- include/llvm/DebugInfo/Symbolize/Symbolize.h | 1 + include/llvm/Demangle/Demangle.h | 9 +- include/llvm/Demangle/DemangleConfig.h | 7 - include/llvm/Demangle/ItaniumDemangle.h | 419 +- include/llvm/Demangle/MicrosoftDemangle.h | 1 + include/llvm/Demangle/MicrosoftDemangleNodes.h | 7 +- .../llvm/ExecutionEngine/JITLink/EHFrameSupport.h | 39 +- include/llvm/ExecutionEngine/JITLink/JITLink.h | 1244 ++-- .../ExecutionEngine/JITLink/JITLinkMemoryManager.h | 17 +- include/llvm/ExecutionEngine/JITLink/MachO_arm64.h | 60 + .../llvm/ExecutionEngine/JITLink/MachO_x86_64.h | 1 + include/llvm/ExecutionEngine/JITSymbol.h | 5 +- .../ExecutionEngine/Orc/CompileOnDemandLayer.h | 10 +- include/llvm/ExecutionEngine/Orc/Core.h | 137 +- include/llvm/ExecutionEngine/Orc/ExecutionUtils.h | 46 +- .../llvm/ExecutionEngine/Orc/IRTransformLayer.h | 3 + include/llvm/ExecutionEngine/Orc/LLJIT.h | 4 +- include/llvm/ExecutionEngine/Orc/LambdaResolver.h | 5 +- 
.../llvm/ExecutionEngine/Orc/LazyEmittingLayer.h | 40 +- include/llvm/ExecutionEngine/Orc/LazyReexports.h | 13 +- include/llvm/ExecutionEngine/Orc/Legacy.h | 2 +- .../llvm/ExecutionEngine/Orc/ObjectLinkingLayer.h | 23 +- .../ExecutionEngine/Orc/OrcRemoteTargetClient.h | 4 +- .../llvm/ExecutionEngine/Orc/RPCSerialization.h | 12 +- include/llvm/ExecutionEngine/Orc/RPCUtils.h | 65 +- .../ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h | 6 +- .../llvm/ExecutionEngine/Orc/RemoteObjectLayer.h | 21 +- .../llvm/ExecutionEngine/Orc/SpeculateAnalyses.h | 84 + include/llvm/ExecutionEngine/Orc/Speculation.h | 207 + .../llvm/ExecutionEngine/Orc/ThreadSafeModule.h | 53 +- include/llvm/ExecutionEngine/RuntimeDyld.h | 23 +- include/llvm/IR/Attributes.h | 49 +- include/llvm/IR/AutoUpgrade.h | 10 +- include/llvm/IR/BasicBlock.h | 5 + include/llvm/IR/CallSite.h | 9 + include/llvm/IR/CallingConv.h | 13 + include/llvm/IR/Constant.h | 6 + include/llvm/IR/ConstantRange.h | 10 +- include/llvm/IR/DataLayout.h | 125 +- include/llvm/IR/DebugInfoFlags.def | 6 +- include/llvm/IR/DebugInfoMetadata.h | 4 +- include/llvm/IR/DerivedTypes.h | 75 +- include/llvm/IR/DiagnosticInfo.h | 25 +- include/llvm/IR/FixedMetadataKinds.def | 43 + include/llvm/IR/Function.h | 15 +- include/llvm/IR/GlobalAlias.h | 4 - include/llvm/IR/GlobalIFunc.h | 4 - include/llvm/IR/GlobalIndirectSymbol.h | 8 +- include/llvm/IR/GlobalObject.h | 26 +- include/llvm/IR/GlobalVariable.h | 1 + include/llvm/IR/IRBuilder.h | 90 +- include/llvm/IR/InlineAsm.h | 1 + include/llvm/IR/InstrTypes.h | 12 +- include/llvm/IR/Instruction.h | 10 + include/llvm/IR/Instructions.h | 99 +- include/llvm/IR/IntrinsicInst.h | 23 +- include/llvm/IR/Intrinsics.h | 10 +- include/llvm/IR/Intrinsics.td | 295 +- include/llvm/IR/IntrinsicsAArch64.td | 125 +- include/llvm/IR/IntrinsicsAMDGPU.td | 121 +- include/llvm/IR/IntrinsicsARM.td | 9 + include/llvm/IR/IntrinsicsBPF.td | 3 + include/llvm/IR/IntrinsicsMips.td | 16 +- include/llvm/IR/IntrinsicsNVVM.td | 125 +- include/llvm/IR/IntrinsicsWebAssembly.td | 58 + include/llvm/IR/IntrinsicsX86.td | 12 +- include/llvm/IR/LLVMContext.h | 31 +- include/llvm/IR/MDBuilder.h | 5 + include/llvm/IR/Metadata.h | 4 +- include/llvm/IR/Module.h | 1 + include/llvm/IR/ModuleSummaryIndex.h | 18 +- include/llvm/IR/ModuleSummaryIndexYAML.h | 2 +- include/llvm/IR/Operator.h | 21 +- include/llvm/IR/PassManager.h | 5 +- include/llvm/IR/PassManagerInternal.h | 2 +- include/llvm/IR/PatternMatch.h | 155 +- include/llvm/IR/RemarkStreamer.h | 28 +- include/llvm/IR/Type.h | 15 +- include/llvm/IR/User.h | 2 +- include/llvm/IR/Value.h | 58 +- include/llvm/IR/ValueMap.h | 15 +- include/llvm/InitializePasses.h | 10 +- include/llvm/LTO/Config.h | 2 +- include/llvm/LTO/LTO.h | 10 +- include/llvm/LTO/legacy/LTOCodeGenerator.h | 2 +- include/llvm/LinkAllPasses.h | 2 + include/llvm/MC/MCAsmInfo.h | 18 + include/llvm/MC/MCAsmInfoXCOFF.h | 5 + include/llvm/MC/MCAsmMacro.h | 11 +- include/llvm/MC/MCContext.h | 23 +- include/llvm/MC/MCDirectives.h | 1 + include/llvm/MC/MCDwarf.h | 3 +- include/llvm/MC/MCExpr.h | 8 +- include/llvm/MC/MCFixup.h | 119 +- include/llvm/MC/MCFragment.h | 16 +- include/llvm/MC/MCInstPrinter.h | 2 - include/llvm/MC/MCInstrAnalysis.h | 6 + include/llvm/MC/MCInstrDesc.h | 23 +- include/llvm/MC/MCLinkerOptimizationHint.h | 2 + include/llvm/MC/MCRegister.h | 110 + include/llvm/MC/MCRegisterInfo.h | 98 +- include/llvm/MC/MCSection.h | 7 +- include/llvm/MC/MCSectionXCOFF.h | 22 +- include/llvm/MC/MCStreamer.h | 41 +- include/llvm/MC/MCSubtargetInfo.h | 46 + 
include/llvm/MC/MCSymbolWasm.h | 7 + include/llvm/MC/MCSymbolXCOFF.h | 32 + include/llvm/MC/MCWasmObjectWriter.h | 4 +- include/llvm/MC/MCXCOFFStreamer.h | 2 + include/llvm/MC/StringTableBuilder.h | 2 +- include/llvm/MC/SubtargetFeature.h | 139 +- include/llvm/MCA/CodeEmitter.h | 72 + include/llvm/MCA/Context.h | 5 +- include/llvm/MCA/HardwareUnits/LSUnit.h | 18 +- include/llvm/MCA/HardwareUnits/RegisterFile.h | 2 +- include/llvm/MCA/HardwareUnits/ResourceManager.h | 51 +- include/llvm/MCA/HardwareUnits/RetireControlUnit.h | 33 +- include/llvm/MCA/HardwareUnits/Scheduler.h | 13 +- include/llvm/MCA/Instruction.h | 51 +- include/llvm/MCA/SourceMgr.h | 5 +- include/llvm/MCA/Stages/RetireStage.h | 6 +- include/llvm/Object/Archive.h | 7 +- include/llvm/Object/Binary.h | 16 +- include/llvm/Object/COFF.h | 36 +- include/llvm/Object/ELF.h | 112 +- include/llvm/Object/ELFObjectFile.h | 31 +- include/llvm/Object/ELFTypes.h | 6 +- include/llvm/Object/MachO.h | 1 + include/llvm/Object/MachOUniversal.h | 14 +- include/llvm/Object/Minidump.h | 77 +- include/llvm/Object/ObjectFile.h | 21 +- include/llvm/Object/StackMapParser.h | 4 +- include/llvm/Object/TapiFile.h | 60 + include/llvm/Object/TapiUniversal.h | 109 + include/llvm/Object/WindowsResource.h | 55 +- include/llvm/Object/XCOFFObjectFile.h | 132 +- include/llvm/ObjectYAML/DWARFYAML.h | 2 +- include/llvm/ObjectYAML/ELFYAML.h | 116 +- include/llvm/ObjectYAML/MachOYAML.h | 3 + include/llvm/ObjectYAML/MinidumpYAML.h | 64 +- include/llvm/ObjectYAML/WasmYAML.h | 2 +- include/llvm/ObjectYAML/yaml2obj.h | 67 + include/llvm/Pass.h | 5 + include/llvm/Passes/PassBuilder.h | 7 +- .../llvm/ProfileData/Coverage/CoverageMapping.h | 16 +- .../ProfileData/Coverage/CoverageMappingWriter.h | 3 +- include/llvm/ProfileData/InstrProf.h | 18 +- include/llvm/ProfileData/InstrProfReader.h | 12 +- include/llvm/ProfileData/SampleProf.h | 178 +- include/llvm/ProfileData/SampleProfReader.h | 272 +- include/llvm/ProfileData/SampleProfWriter.h | 118 +- include/llvm/Remarks/BitstreamRemarkContainer.h | 106 + include/llvm/Remarks/BitstreamRemarkParser.h | 116 + include/llvm/Remarks/BitstreamRemarkSerializer.h | 196 + include/llvm/Remarks/Remark.h | 36 +- include/llvm/Remarks/RemarkFormat.h | 4 +- include/llvm/Remarks/RemarkParser.h | 38 +- include/llvm/Remarks/RemarkSerializer.h | 70 +- include/llvm/Remarks/RemarkStringTable.h | 24 +- include/llvm/Remarks/YAMLRemarkSerializer.h | 108 + include/llvm/Support/AArch64TargetParser.def | 72 +- include/llvm/Support/AArch64TargetParser.h | 3 +- include/llvm/Support/ARMTargetParser.def | 2 + include/llvm/Support/ARMTargetParser.h | 20 +- include/llvm/Support/AlignOf.h | 134 +- include/llvm/Support/Alignment.h | 403 ++ include/llvm/Support/Allocator.h | 22 +- include/llvm/Support/Automaton.h | 253 + include/llvm/Support/BinaryStreamArray.h | 2 +- include/llvm/Support/BinaryStreamReader.h | 2 +- include/llvm/Support/CRC.h | 45 +- include/llvm/Support/CommandLine.h | 3 + include/llvm/Support/Compiler.h | 81 +- include/llvm/Support/DataExtractor.h | 196 +- include/llvm/Support/Endian.h | 10 +- include/llvm/Support/Error.h | 42 +- include/llvm/Support/FileCheck.h | 604 +- include/llvm/Support/FileCollector.h | 79 + include/llvm/Support/FileSystem.h | 30 +- include/llvm/Support/FileUtilities.h | 38 + include/llvm/Support/Format.h | 5 +- include/llvm/Support/GenericDomTree.h | 6 +- include/llvm/Support/GenericDomTreeConstruction.h | 8 +- include/llvm/Support/GlobPattern.h | 2 +- include/llvm/Support/Host.h | 28 - include/llvm/Support/JamCRC.h | 
48 - include/llvm/Support/MachineValueType.h | 419 +- include/llvm/Support/MathExtras.h | 187 +- include/llvm/Support/Mutex.h | 105 +- include/llvm/Support/MutexGuard.h | 40 - include/llvm/Support/OnDiskHashTable.h | 3 +- include/llvm/Support/Parallel.h | 27 - include/llvm/Support/RWMutex.h | 321 +- include/llvm/Support/Regex.h | 18 +- include/llvm/Support/Registry.h | 2 +- include/llvm/Support/SHA1.h | 2 +- include/llvm/Support/ScalableSize.h | 43 - include/llvm/Support/Signals.h | 11 + include/llvm/Support/SwapByteOrder.h | 38 +- include/llvm/Support/TargetOpcodes.def | 26 +- include/llvm/Support/TargetRegistry.h | 4 +- include/llvm/Support/TimeProfiler.h | 2 +- include/llvm/Support/TrailingObjects.h | 18 +- include/llvm/Support/TypeSize.h | 201 + include/llvm/Support/UnicodeCharRanges.h | 3 - include/llvm/Support/UniqueLock.h | 68 - include/llvm/Support/VirtualFileSystem.h | 16 +- include/llvm/Support/Win64EH.h | 4 +- include/llvm/Support/X86TargetParser.def | 4 +- include/llvm/Support/YAMLTraits.h | 11 +- include/llvm/Support/circular_raw_ostream.h | 4 + include/llvm/Support/raw_ostream.h | 27 +- include/llvm/Support/type_traits.h | 18 - include/llvm/TableGen/Automaton.td | 95 + include/llvm/TableGen/Error.h | 1 + include/llvm/TableGen/Record.h | 14 +- include/llvm/Target/GenericOpcodes.td | 87 +- include/llvm/Target/GlobalISel/Combine.td | 103 + .../llvm/Target/GlobalISel/SelectionDAGCompat.td | 25 + include/llvm/Target/Target.td | 33 +- include/llvm/Target/TargetCallingConv.td | 6 + include/llvm/Target/TargetItinerary.td | 11 + include/llvm/Target/TargetLoweringObjectFile.h | 3 +- include/llvm/Target/TargetMachine.h | 28 +- include/llvm/Target/TargetSchedule.td | 8 +- include/llvm/Target/TargetSelectionDAG.td | 146 +- include/llvm/TextAPI/MachO/Architecture.h | 4 + include/llvm/TextAPI/MachO/ArchitectureSet.h | 4 + include/llvm/TextAPI/MachO/InterfaceFile.h | 240 +- include/llvm/TextAPI/MachO/Platform.h | 45 + include/llvm/TextAPI/MachO/Symbol.h | 35 +- include/llvm/TextAPI/MachO/Target.h | 68 + include/llvm/TextAPI/MachO/TextAPIReader.h | 5 +- include/llvm/Transforms/IPO/Attributor.h | 1731 +++++- include/llvm/Transforms/IPO/GlobalDCE.h | 14 + include/llvm/Transforms/IPO/HotColdSplitting.h | 39 + include/llvm/Transforms/IPO/LowerTypeTests.h | 2 + include/llvm/Transforms/IPO/WholeProgramDevirt.h | 26 + include/llvm/Transforms/Instrumentation.h | 4 - .../Transforms/Instrumentation/InstrProfiling.h | 5 +- .../Transforms/Instrumentation/MemorySanitizer.h | 12 +- .../Transforms/Instrumentation/SanitizerCoverage.h | 47 + .../Transforms/Instrumentation/ThreadSanitizer.h | 2 + include/llvm/Transforms/Scalar.h | 9 +- include/llvm/Transforms/Scalar/CallSiteSplitting.h | 5 - include/llvm/Transforms/Scalar/ConstantHoisting.h | 10 +- include/llvm/Transforms/Scalar/Float2Int.h | 6 +- include/llvm/Transforms/Scalar/GVN.h | 7 +- include/llvm/Transforms/Scalar/GVNExpression.h | 9 +- include/llvm/Transforms/Scalar/LoopPassManager.h | 24 +- include/llvm/Transforms/Scalar/LoopUnrollPass.h | 14 + .../Transforms/Scalar/LowerConstantIntrinsics.h | 41 + .../llvm/Transforms/Scalar/MergedLoadStoreMotion.h | 18 +- include/llvm/Transforms/Scalar/Reassociate.h | 4 +- include/llvm/Transforms/Scalar/SCCP.h | 3 +- include/llvm/Transforms/Utils/BasicBlockUtils.h | 11 +- include/llvm/Transforms/Utils/BuildLibCalls.h | 27 +- include/llvm/Transforms/Utils/BypassSlowDivision.h | 13 +- include/llvm/Transforms/Utils/CodeExtractor.h | 57 +- include/llvm/Transforms/Utils/Local.h | 16 +- 
include/llvm/Transforms/Utils/LoopUtils.h | 5 + include/llvm/Transforms/Utils/MisExpect.h | 43 + include/llvm/Transforms/Utils/PredicateInfo.h | 10 +- include/llvm/Transforms/Utils/SimplifyLibCalls.h | 10 + include/llvm/Transforms/Utils/UnrollLoop.h | 8 +- include/llvm/Transforms/Utils/ValueMapper.h | 9 +- .../Vectorize/LoopVectorizationLegality.h | 48 +- include/llvm/Transforms/Vectorize/LoopVectorize.h | 8 + include/llvm/Transforms/Vectorize/SLPVectorizer.h | 9 +- include/llvm/XRay/FDRRecordProducer.h | 4 +- include/llvm/XRay/FDRRecords.h | 6 +- include/llvm/XRay/FileHeaderReader.h | 2 +- include/llvm/module.modulemap | 2 + lib/Analysis/AliasAnalysis.cpp | 4 +- lib/Analysis/AliasSetTracker.cpp | 12 +- lib/Analysis/Analysis.cpp | 1 + lib/Analysis/AssumptionCache.cpp | 12 +- lib/Analysis/BasicAliasAnalysis.cpp | 42 +- lib/Analysis/BranchProbabilityInfo.cpp | 19 +- lib/Analysis/CFG.cpp | 11 +- lib/Analysis/CFGPrinter.cpp | 2 +- lib/Analysis/CFLAndersAliasAnalysis.cpp | 19 +- lib/Analysis/CFLSteensAliasAnalysis.cpp | 20 +- lib/Analysis/CallGraph.cpp | 4 +- lib/Analysis/CaptureTracking.cpp | 46 +- lib/Analysis/ConstantFolding.cpp | 405 +- lib/Analysis/DDG.cpp | 203 + lib/Analysis/DependenceAnalysis.cpp | 8 +- lib/Analysis/DependenceGraphBuilder.cpp | 228 + lib/Analysis/DivergenceAnalysis.cpp | 10 + lib/Analysis/GlobalsModRef.cpp | 37 +- lib/Analysis/IVDescriptors.cpp | 3 +- lib/Analysis/IndirectCallPromotionAnalysis.cpp | 2 +- lib/Analysis/InlineCost.cpp | 23 +- lib/Analysis/InstructionSimplify.cpp | 320 +- lib/Analysis/LazyBranchProbabilityInfo.cpp | 5 +- lib/Analysis/LazyCallGraph.cpp | 13 +- lib/Analysis/LazyValueInfo.cpp | 37 +- lib/Analysis/LegacyDivergenceAnalysis.cpp | 36 +- lib/Analysis/Lint.cpp | 2 +- lib/Analysis/Loads.cpp | 238 +- lib/Analysis/LoopAccessAnalysis.cpp | 45 +- lib/Analysis/LoopAnalysisManager.cpp | 2 +- lib/Analysis/LoopCacheAnalysis.cpp | 625 +++ lib/Analysis/LoopInfo.cpp | 39 + lib/Analysis/LoopUnrollAnalyzer.cpp | 2 +- lib/Analysis/MemDerefPrinter.cpp | 4 +- lib/Analysis/MemoryBuiltins.cpp | 51 +- lib/Analysis/MemoryDependenceAnalysis.cpp | 21 +- lib/Analysis/MemorySSA.cpp | 95 +- lib/Analysis/MemorySSAUpdater.cpp | 323 +- lib/Analysis/ModuleSummaryAnalysis.cpp | 16 +- lib/Analysis/MustExecute.cpp | 118 + lib/Analysis/OptimizationRemarkEmitter.cpp | 4 +- lib/Analysis/OrderedInstructions.cpp | 2 +- lib/Analysis/ProfileSummaryInfo.cpp | 67 + lib/Analysis/ScalarEvolution.cpp | 89 +- lib/Analysis/ScalarEvolutionExpander.cpp | 19 +- lib/Analysis/StackSafetyAnalysis.cpp | 4 +- lib/Analysis/SyncDependenceAnalysis.cpp | 61 +- lib/Analysis/TargetLibraryInfo.cpp | 44 +- lib/Analysis/TargetTransformInfo.cpp | 64 +- lib/Analysis/TypeMetadataUtils.cpp | 32 + lib/Analysis/VFABIDemangling.cpp | 418 ++ lib/Analysis/ValueTracking.cpp | 658 ++- lib/Analysis/VectorUtils.cpp | 20 +- lib/AsmParser/LLLexer.cpp | 1 + lib/AsmParser/LLParser.cpp | 81 +- lib/AsmParser/LLParser.h | 4 +- lib/AsmParser/LLToken.h | 1 + lib/AsmParser/Parser.cpp | 8 +- lib/BinaryFormat/Dwarf.cpp | 22 +- lib/BinaryFormat/Magic.cpp | 5 + lib/Bitcode/Reader/BitcodeAnalyzer.cpp | 10 + lib/Bitcode/Reader/BitcodeReader.cpp | 84 +- lib/Bitcode/Reader/MetadataLoader.cpp | 6 +- lib/Bitcode/Writer/BitWriter.cpp | 2 +- lib/Bitcode/Writer/BitcodeWriter.cpp | 9 +- lib/CodeGen/AggressiveAntiDepBreaker.cpp | 16 +- lib/CodeGen/Analysis.cpp | 12 +- lib/CodeGen/AsmPrinter/AsmPrinter.cpp | 255 +- lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp | 3 +- lib/CodeGen/AsmPrinter/ByteStreamer.h | 12 +- 
lib/CodeGen/AsmPrinter/CodeViewDebug.cpp | 116 +- lib/CodeGen/AsmPrinter/CodeViewDebug.h | 3 +- .../AsmPrinter/DbgEntityHistoryCalculator.cpp | 12 +- lib/CodeGen/AsmPrinter/DebugLocStream.h | 19 +- lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp | 176 +- lib/CodeGen/AsmPrinter/DwarfCompileUnit.h | 31 +- lib/CodeGen/AsmPrinter/DwarfDebug.cpp | 644 ++- lib/CodeGen/AsmPrinter/DwarfDebug.h | 22 +- lib/CodeGen/AsmPrinter/DwarfExpression.cpp | 95 +- lib/CodeGen/AsmPrinter/DwarfExpression.h | 95 +- lib/CodeGen/AsmPrinter/DwarfFile.h | 19 +- lib/CodeGen/AsmPrinter/DwarfUnit.cpp | 47 +- lib/CodeGen/AsmPrinter/DwarfUnit.h | 14 +- lib/CodeGen/AsmPrinter/EHStreamer.cpp | 6 +- lib/CodeGen/AsmPrinter/ErlangGCPrinter.cpp | 2 +- lib/CodeGen/AsmPrinter/OcamlGCPrinter.cpp | 4 +- lib/CodeGen/AsmPrinter/WinException.cpp | 3 +- lib/CodeGen/AtomicExpandPass.cpp | 12 +- lib/CodeGen/BranchFolding.cpp | 34 +- lib/CodeGen/BranchRelaxation.cpp | 22 +- lib/CodeGen/BreakFalseDeps.cpp | 23 +- lib/CodeGen/CalcSpillWeights.cpp | 22 +- lib/CodeGen/CallingConvLower.cpp | 42 +- lib/CodeGen/CodeGen.cpp | 5 +- lib/CodeGen/CodeGenPrepare.cpp | 122 +- lib/CodeGen/CriticalAntiDepBreaker.cpp | 9 +- lib/CodeGen/DFAPacketizer.cpp | 81 +- lib/CodeGen/DeadMachineInstructionElim.cpp | 12 +- lib/CodeGen/DetectDeadLanes.cpp | 56 +- lib/CodeGen/EarlyIfConversion.cpp | 345 +- lib/CodeGen/ExecutionDomainFix.cpp | 1 + lib/CodeGen/ExpandMemCmp.cpp | 2 +- lib/CodeGen/ExpandPostRAPseudos.cpp | 10 +- lib/CodeGen/GCMetadata.cpp | 2 +- lib/CodeGen/GCRootLowering.cpp | 4 +- lib/CodeGen/GlobalISel/CSEInfo.cpp | 7 +- lib/CodeGen/GlobalISel/CSEMIRBuilder.cpp | 11 + lib/CodeGen/GlobalISel/CallLowering.cpp | 288 +- lib/CodeGen/GlobalISel/Combiner.cpp | 14 +- lib/CodeGen/GlobalISel/CombinerHelper.cpp | 919 ++- lib/CodeGen/GlobalISel/GISelKnownBits.cpp | 383 ++ lib/CodeGen/GlobalISel/IRTranslator.cpp | 392 +- lib/CodeGen/GlobalISel/InstructionSelect.cpp | 38 +- lib/CodeGen/GlobalISel/InstructionSelector.cpp | 2 +- lib/CodeGen/GlobalISel/Legalizer.cpp | 35 +- lib/CodeGen/GlobalISel/LegalizerHelper.cpp | 978 +++- lib/CodeGen/GlobalISel/LegalizerInfo.cpp | 42 +- lib/CodeGen/GlobalISel/Localizer.cpp | 11 +- lib/CodeGen/GlobalISel/MachineIRBuilder.cpp | 93 +- lib/CodeGen/GlobalISel/RegBankSelect.cpp | 13 +- lib/CodeGen/GlobalISel/RegisterBank.cpp | 1 + lib/CodeGen/GlobalISel/RegisterBankInfo.cpp | 17 +- lib/CodeGen/GlobalISel/Utils.cpp | 98 +- lib/CodeGen/GlobalMerge.cpp | 8 +- lib/CodeGen/HardwareLoops.cpp | 2 +- lib/CodeGen/IfConversion.cpp | 200 +- lib/CodeGen/ImplicitNullChecks.cpp | 8 +- lib/CodeGen/InlineSpiller.cpp | 22 +- lib/CodeGen/InterleavedLoadCombinePass.cpp | 4 +- lib/CodeGen/LLVMTargetMachine.cpp | 34 +- lib/CodeGen/LazyMachineBlockFrequencyInfo.cpp | 6 +- lib/CodeGen/LexicalScopes.cpp | 1 + lib/CodeGen/LiveDebugValues.cpp | 510 +- lib/CodeGen/LiveDebugVariables.cpp | 257 +- lib/CodeGen/LiveInterval.cpp | 7 +- lib/CodeGen/LiveIntervals.cpp | 59 +- lib/CodeGen/LivePhysRegs.cpp | 20 +- lib/CodeGen/LiveRangeCalc.cpp | 5 +- lib/CodeGen/LiveRangeCalc.h | 297 - lib/CodeGen/LiveRangeEdit.cpp | 14 +- lib/CodeGen/LiveRangeShrink.cpp | 4 +- lib/CodeGen/LiveRegMatrix.cpp | 2 +- lib/CodeGen/LiveRegUnits.cpp | 12 +- lib/CodeGen/LiveStacks.cpp | 7 +- lib/CodeGen/LiveVariables.cpp | 29 +- lib/CodeGen/LocalStackSlotAllocation.cpp | 10 +- lib/CodeGen/LowerEmuTLS.cpp | 7 +- lib/CodeGen/MIRCanonicalizerPass.cpp | 359 +- lib/CodeGen/MIRNamerPass.cpp | 77 + lib/CodeGen/MIRParser/MILexer.cpp | 1 + lib/CodeGen/MIRParser/MILexer.h | 2 + lib/CodeGen/MIRParser/MIParser.cpp 
| 60 +- lib/CodeGen/MIRParser/MIRParser.cpp | 18 +- lib/CodeGen/MIRPrinter.cpp | 16 +- lib/CodeGen/MIRVRegNamerUtils.cpp | 348 ++ lib/CodeGen/MIRVRegNamerUtils.h | 91 + lib/CodeGen/MachineBasicBlock.cpp | 64 +- lib/CodeGen/MachineBlockPlacement.cpp | 28 +- lib/CodeGen/MachineCSE.cpp | 75 +- lib/CodeGen/MachineCombiner.cpp | 6 +- lib/CodeGen/MachineCopyPropagation.cpp | 78 +- lib/CodeGen/MachineDominators.cpp | 23 +- lib/CodeGen/MachineFrameInfo.cpp | 38 +- lib/CodeGen/MachineFunction.cpp | 58 +- lib/CodeGen/MachineFunctionPass.cpp | 6 +- lib/CodeGen/MachineInstr.cpp | 116 +- lib/CodeGen/MachineInstrBundle.cpp | 14 +- lib/CodeGen/MachineLICM.cpp | 61 +- lib/CodeGen/MachineLoopUtils.cpp | 132 + lib/CodeGen/MachineModuleInfo.cpp | 85 +- lib/CodeGen/MachineOperand.cpp | 70 +- lib/CodeGen/MachineOptimizationRemarkEmitter.cpp | 2 +- lib/CodeGen/MachineOutliner.cpp | 16 +- lib/CodeGen/MachinePipeliner.cpp | 1235 +--- lib/CodeGen/MachinePostDominators.cpp | 55 +- lib/CodeGen/MachineRegisterInfo.cpp | 12 +- lib/CodeGen/MachineSSAUpdater.cpp | 6 +- lib/CodeGen/MachineScheduler.cpp | 59 +- lib/CodeGen/MachineSink.cpp | 73 +- lib/CodeGen/MachineTraceMetrics.cpp | 24 +- lib/CodeGen/MachineVerifier.cpp | 163 +- lib/CodeGen/MacroFusion.cpp | 4 +- lib/CodeGen/ModuloSchedule.cpp | 2022 +++++++ lib/CodeGen/OptimizePHIs.cpp | 15 +- lib/CodeGen/PHIElimination.cpp | 43 +- lib/CodeGen/PatchableFunction.cpp | 2 +- lib/CodeGen/PeepholeOptimizer.cpp | 83 +- lib/CodeGen/PreISelIntrinsicLowering.cpp | 2 +- lib/CodeGen/ProcessImplicitDefs.cpp | 8 +- lib/CodeGen/PrologEpilogInserter.cpp | 2 +- lib/CodeGen/PseudoSourceValue.cpp | 6 +- lib/CodeGen/ReachingDefAnalysis.cpp | 1 + lib/CodeGen/RegAllocBase.cpp | 4 +- lib/CodeGen/RegAllocFast.cpp | 117 +- lib/CodeGen/RegAllocGreedy.cpp | 16 +- lib/CodeGen/RegAllocPBQP.cpp | 12 +- lib/CodeGen/RegUsageInfoCollector.cpp | 10 +- lib/CodeGen/RegUsageInfoPropagate.cpp | 6 +- lib/CodeGen/RegisterCoalescer.cpp | 71 +- lib/CodeGen/RegisterPressure.cpp | 36 +- lib/CodeGen/RegisterScavenging.cpp | 62 +- lib/CodeGen/RenameIndependentSubregs.cpp | 4 +- lib/CodeGen/SafeStack.cpp | 2 +- lib/CodeGen/ScalarizeMaskedMemIntrin.cpp | 167 +- lib/CodeGen/ScheduleDAGInstrs.cpp | 57 +- lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 1758 +++--- lib/CodeGen/SelectionDAG/FastISel.cpp | 67 +- lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp | 7 +- lib/CodeGen/SelectionDAG/InstrEmitter.cpp | 77 +- lib/CodeGen/SelectionDAG/LegalizeDAG.cpp | 222 +- lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp | 430 +- lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp | 510 +- lib/CodeGen/SelectionDAG/LegalizeTypes.cpp | 56 +- lib/CodeGen/SelectionDAG/LegalizeTypes.h | 61 +- lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp | 46 +- lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp | 50 +- lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp | 139 +- lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp | 2 +- lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp | 18 +- lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp | 18 +- lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.h | 3 +- lib/CodeGen/SelectionDAG/ScheduleDAGVLIW.cpp | 9 +- lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 283 +- .../SelectionDAG/SelectionDAGAddressAnalysis.cpp | 1 + lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp | 495 +- lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h | 2 +- lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp | 9 +- lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp | 35 +- lib/CodeGen/SelectionDAG/StatepointLowering.cpp | 34 +- lib/CodeGen/SelectionDAG/TargetLowering.cpp 
| 1406 ++++- lib/CodeGen/ShrinkWrap.cpp | 5 +- lib/CodeGen/SjLjEHPrepare.cpp | 5 +- lib/CodeGen/SplitKit.cpp | 6 +- lib/CodeGen/SplitKit.h | 2 +- lib/CodeGen/StackMaps.cpp | 8 +- lib/CodeGen/StackProtector.cpp | 67 +- lib/CodeGen/StackSlotColoring.cpp | 8 +- lib/CodeGen/SwiftErrorValueTracking.cpp | 3 +- lib/CodeGen/TailDuplicator.cpp | 22 +- lib/CodeGen/TargetFrameLoweringImpl.cpp | 19 +- lib/CodeGen/TargetInstrInfo.cpp | 82 +- lib/CodeGen/TargetLoweringBase.cpp | 95 +- lib/CodeGen/TargetLoweringObjectFileImpl.cpp | 107 +- lib/CodeGen/TargetPassConfig.cpp | 24 +- lib/CodeGen/TargetRegisterInfo.cpp | 60 +- lib/CodeGen/TargetSchedule.cpp | 2 +- lib/CodeGen/TwoAddressInstructionPass.cpp | 90 +- lib/CodeGen/UnreachableBlockElim.cpp | 15 +- lib/CodeGen/ValueTypes.cpp | 150 +- lib/CodeGen/VirtRegMap.cpp | 71 +- lib/CodeGen/XRayInstrumentation.cpp | 2 +- lib/DebugInfo/CodeView/CVTypeVisitor.cpp | 15 - lib/DebugInfo/CodeView/CodeViewRecordIO.cpp | 8 +- lib/DebugInfo/CodeView/EnumTables.cpp | 166 + lib/DebugInfo/CodeView/SymbolDumper.cpp | 2 +- lib/DebugInfo/CodeView/SymbolRecordMapping.cpp | 2 +- lib/DebugInfo/CodeView/TypeRecordMapping.cpp | 238 +- .../DWARF/DWARFAbbreviationDeclaration.cpp | 8 +- lib/DebugInfo/DWARF/DWARFAcceleratorTable.cpp | 105 +- lib/DebugInfo/DWARF/DWARFCompileUnit.cpp | 10 +- lib/DebugInfo/DWARF/DWARFContext.cpp | 335 +- lib/DebugInfo/DWARF/DWARFDataExtractor.cpp | 13 +- lib/DebugInfo/DWARF/DWARFDebugAbbrev.cpp | 10 +- lib/DebugInfo/DWARF/DWARFDebugAddr.cpp | 28 +- lib/DebugInfo/DWARF/DWARFDebugArangeSet.cpp | 4 +- lib/DebugInfo/DWARF/DWARFDebugAranges.cpp | 12 +- lib/DebugInfo/DWARF/DWARFDebugFrame.cpp | 74 +- lib/DebugInfo/DWARF/DWARFDebugInfoEntry.cpp | 8 +- lib/DebugInfo/DWARF/DWARFDebugLine.cpp | 159 +- lib/DebugInfo/DWARF/DWARFDebugLoc.cpp | 257 +- lib/DebugInfo/DWARF/DWARFDebugMacro.cpp | 2 +- lib/DebugInfo/DWARF/DWARFDebugPubTable.cpp | 6 +- lib/DebugInfo/DWARF/DWARFDebugRangeList.cpp | 18 +- lib/DebugInfo/DWARF/DWARFDebugRnglists.cpp | 30 +- lib/DebugInfo/DWARF/DWARFDie.cpp | 54 +- lib/DebugInfo/DWARF/DWARFExpression.cpp | 13 +- lib/DebugInfo/DWARF/DWARFFormValue.cpp | 9 +- lib/DebugInfo/DWARF/DWARFGdbIndex.cpp | 2 +- lib/DebugInfo/DWARF/DWARFListTable.cpp | 70 +- lib/DebugInfo/DWARF/DWARFTypeUnit.cpp | 14 +- lib/DebugInfo/DWARF/DWARFUnit.cpp | 231 +- lib/DebugInfo/DWARF/DWARFUnitIndex.cpp | 12 +- lib/DebugInfo/DWARF/DWARFVerifier.cpp | 120 +- lib/DebugInfo/GSYM/FileWriter.cpp | 78 + lib/DebugInfo/GSYM/FunctionInfo.cpp | 143 +- lib/DebugInfo/GSYM/GsymCreator.cpp | 275 + lib/DebugInfo/GSYM/GsymReader.cpp | 265 + lib/DebugInfo/GSYM/Header.cpp | 109 + lib/DebugInfo/GSYM/InlineInfo.cpp | 100 + lib/DebugInfo/GSYM/LineTable.cpp | 287 + lib/DebugInfo/GSYM/Range.cpp | 47 + lib/DebugInfo/MSF/MappedBlockStream.cpp | 6 +- lib/DebugInfo/PDB/DIA/DIARawSymbol.cpp | 28 +- lib/DebugInfo/PDB/DIA/DIASectionContrib.cpp | 2 +- lib/DebugInfo/PDB/DIA/DIASession.cpp | 46 +- lib/DebugInfo/PDB/GenericError.cpp | 4 +- .../PDB/Native/DbiModuleDescriptorBuilder.cpp | 4 +- lib/DebugInfo/PDB/Native/DbiStreamBuilder.cpp | 2 +- lib/DebugInfo/PDB/Native/GSIStreamBuilder.cpp | 4 +- lib/DebugInfo/PDB/Native/Hash.cpp | 5 +- .../PDB/Native/NativeEnumInjectedSources.cpp | 29 +- lib/DebugInfo/PDB/Native/NativeRawSymbol.cpp | 24 +- lib/DebugInfo/PDB/Native/NativeSession.cpp | 10 +- lib/DebugInfo/PDB/Native/NativeTypeEnum.cpp | 4 +- lib/DebugInfo/PDB/Native/NativeTypeFunctionSig.cpp | 6 +- lib/DebugInfo/PDB/Native/PDBFile.cpp | 18 +- lib/DebugInfo/PDB/Native/PDBFileBuilder.cpp | 17 +- 
lib/DebugInfo/PDB/Native/TpiHashing.cpp | 6 +- lib/DebugInfo/PDB/Native/TpiStream.cpp | 2 +- lib/DebugInfo/PDB/Native/TpiStreamBuilder.cpp | 2 +- lib/DebugInfo/PDB/PDBSymbolFunc.cpp | 2 +- lib/DebugInfo/PDB/PDBSymbolTypeFunctionSig.cpp | 2 +- lib/DebugInfo/PDB/UDTLayout.cpp | 14 +- lib/DebugInfo/Symbolize/DIPrinter.cpp | 17 +- lib/DebugInfo/Symbolize/SymbolizableObjectFile.cpp | 32 +- lib/DebugInfo/Symbolize/SymbolizableObjectFile.h | 7 +- lib/DebugInfo/Symbolize/Symbolize.cpp | 52 +- lib/Demangle/ItaniumDemangle.cpp | 10 + lib/Demangle/MicrosoftDemangle.cpp | 32 +- lib/Demangle/MicrosoftDemangleNodes.cpp | 51 +- lib/ExecutionEngine/ExecutionEngine.cpp | 34 +- lib/ExecutionEngine/GDBRegistrationListener.cpp | 8 +- .../Interpreter/ExternalFunctions.cpp | 4 +- .../JITLink/BasicGOTAndStubsBuilder.h | 35 +- lib/ExecutionEngine/JITLink/EHFrameSupport.cpp | 216 +- lib/ExecutionEngine/JITLink/EHFrameSupportImpl.h | 50 +- lib/ExecutionEngine/JITLink/JITLink.cpp | 158 +- lib/ExecutionEngine/JITLink/JITLinkGeneric.cpp | 385 +- lib/ExecutionEngine/JITLink/JITLinkGeneric.h | 185 +- .../JITLink/JITLinkMemoryManager.cpp | 63 +- lib/ExecutionEngine/JITLink/MachO.cpp | 3 + .../JITLink/MachOAtomGraphBuilder.cpp | 411 -- .../JITLink/MachOAtomGraphBuilder.h | 138 - .../JITLink/MachOLinkGraphBuilder.cpp | 535 ++ .../JITLink/MachOLinkGraphBuilder.h | 269 + lib/ExecutionEngine/JITLink/MachO_arm64.cpp | 736 +++ lib/ExecutionEngine/JITLink/MachO_x86_64.cpp | 279 +- lib/ExecutionEngine/MCJIT/MCJIT.cpp | 38 +- .../OProfileJIT/OProfileJITEventListener.cpp | 2 +- .../OProfileJIT/OProfileWrapper.cpp | 4 +- lib/ExecutionEngine/Orc/CompileOnDemandLayer.cpp | 127 +- lib/ExecutionEngine/Orc/CompileUtils.cpp | 2 +- lib/ExecutionEngine/Orc/Core.cpp | 506 +- lib/ExecutionEngine/Orc/ExecutionUtils.cpp | 92 +- lib/ExecutionEngine/Orc/IRCompileLayer.cpp | 4 +- lib/ExecutionEngine/Orc/IRTransformLayer.cpp | 2 +- lib/ExecutionEngine/Orc/IndirectionUtils.cpp | 27 +- .../Orc/JITTargetMachineBuilder.cpp | 17 +- lib/ExecutionEngine/Orc/LLJIT.cpp | 38 +- lib/ExecutionEngine/Orc/Layer.cpp | 26 +- lib/ExecutionEngine/Orc/LazyReexports.cpp | 18 +- lib/ExecutionEngine/Orc/Legacy.cpp | 5 +- lib/ExecutionEngine/Orc/ObjectLinkingLayer.cpp | 258 +- lib/ExecutionEngine/Orc/OrcCBindingsStack.h | 11 +- .../Orc/RTDyldObjectLinkingLayer.cpp | 24 +- lib/ExecutionEngine/Orc/SpeculateAnalyses.cpp | 307 + lib/ExecutionEngine/Orc/Speculation.cpp | 146 + lib/ExecutionEngine/Orc/ThreadSafeModule.cpp | 58 +- .../PerfJITEvents/PerfJITEventListener.cpp | 8 +- lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp | 64 +- .../RuntimeDyld/RuntimeDyldCOFF.cpp | 8 +- .../RuntimeDyld/RuntimeDyldChecker.cpp | 2 +- lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp | 54 +- lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h | 2 +- .../RuntimeDyld/RuntimeDyldMachO.cpp | 17 +- .../RuntimeDyld/Targets/RuntimeDyldCOFFX86_64.h | 8 +- .../RuntimeDyld/Targets/RuntimeDyldMachOARM.h | 5 +- .../RuntimeDyld/Targets/RuntimeDyldMachOI386.h | 5 +- lib/FuzzMutate/FuzzerCLI.cpp | 2 +- lib/IR/AsmWriter.cpp | 11 +- lib/IR/AttributeImpl.h | 6 +- lib/IR/Attributes.cpp | 84 +- lib/IR/AutoUpgrade.cpp | 157 +- lib/IR/BasicBlock.cpp | 7 + lib/IR/ConstantFold.cpp | 25 +- lib/IR/ConstantRange.cpp | 76 +- lib/IR/Constants.cpp | 56 +- lib/IR/ConstantsContext.h | 12 +- lib/IR/Core.cpp | 114 +- lib/IR/DIBuilder.cpp | 2 +- lib/IR/DataLayout.cpp | 163 +- lib/IR/DebugInfo.cpp | 22 +- lib/IR/DebugInfoMetadata.cpp | 34 +- lib/IR/DiagnosticInfo.cpp | 11 + lib/IR/Function.cpp | 103 +- lib/IR/Globals.cpp | 51 
+- lib/IR/IRBuilder.cpp | 8 +- lib/IR/IRPrintingPasses.cpp | 18 +- lib/IR/InlineAsm.cpp | 10 + lib/IR/Instruction.cpp | 2 +- lib/IR/Instructions.cpp | 167 +- lib/IR/IntrinsicInst.cpp | 15 +- lib/IR/LLVMContext.cpp | 31 +- lib/IR/LLVMContextImpl.cpp | 2 +- lib/IR/LegacyPassManager.cpp | 20 +- lib/IR/MDBuilder.cpp | 12 + lib/IR/Metadata.cpp | 18 + lib/IR/Module.cpp | 2 +- lib/IR/RemarkStreamer.cpp | 72 +- lib/IR/SafepointIRVerifier.cpp | 4 +- lib/IR/Type.cpp | 27 +- lib/IR/Value.cpp | 111 +- lib/IR/Verifier.cpp | 167 +- lib/LTO/Caching.cpp | 4 +- lib/LTO/LTO.cpp | 63 +- lib/LTO/LTOBackend.cpp | 18 +- lib/LTO/LTOCodeGenerator.cpp | 13 +- lib/LTO/LTOModule.cpp | 3 +- lib/LTO/SummaryBasedOptimizations.cpp | 2 +- lib/LTO/ThinLTOCodeGenerator.cpp | 139 +- lib/Linker/IRMover.cpp | 112 +- lib/Linker/LinkModules.cpp | 3 +- lib/MC/ELFObjectWriter.cpp | 86 +- lib/MC/MCAsmBackend.cpp | 5 +- lib/MC/MCAsmInfoXCOFF.cpp | 17 + lib/MC/MCAsmMacro.cpp | 2 + lib/MC/MCAsmStreamer.cpp | 140 +- lib/MC/MCAssembler.cpp | 38 +- lib/MC/MCContext.cpp | 29 +- lib/MC/MCDwarf.cpp | 44 +- lib/MC/MCELFStreamer.cpp | 7 +- lib/MC/MCExpr.cpp | 46 +- lib/MC/MCInstPrinter.cpp | 31 +- lib/MC/MCInstrAnalysis.cpp | 6 + lib/MC/MCMachOStreamer.cpp | 1 + lib/MC/MCObjectFileInfo.cpp | 14 +- lib/MC/MCObjectStreamer.cpp | 4 +- lib/MC/MCParser/AsmParser.cpp | 142 +- lib/MC/MCParser/COFFAsmParser.cpp | 155 +- lib/MC/MCParser/DarwinAsmParser.cpp | 4 +- lib/MC/MCParser/WasmAsmParser.cpp | 1 + lib/MC/MCRegisterInfo.cpp | 48 +- lib/MC/MCSectionXCOFF.cpp | 50 +- lib/MC/MCStreamer.cpp | 84 +- lib/MC/MCSubtargetInfo.cpp | 25 + lib/MC/MCWasmObjectTargetWriter.cpp | 5 +- lib/MC/MCWasmStreamer.cpp | 2 +- lib/MC/MCWinCOFFStreamer.cpp | 18 +- lib/MC/MCXCOFFStreamer.cpp | 54 +- lib/MC/MachObjectWriter.cpp | 14 +- lib/MC/StringTableBuilder.cpp | 10 +- lib/MC/WasmObjectWriter.cpp | 77 +- lib/MC/WinCOFFObjectWriter.cpp | 10 +- lib/MC/XCOFFObjectWriter.cpp | 533 +- lib/MCA/CodeEmitter.cpp | 37 + lib/MCA/Context.cpp | 23 +- lib/MCA/HardwareUnits/LSUnit.cpp | 28 +- lib/MCA/HardwareUnits/RegisterFile.cpp | 16 +- lib/MCA/HardwareUnits/ResourceManager.cpp | 59 +- lib/MCA/HardwareUnits/RetireControlUnit.cpp | 65 +- lib/MCA/HardwareUnits/Scheduler.cpp | 12 +- lib/MCA/InstrBuilder.cpp | 44 +- lib/MCA/Instruction.cpp | 4 +- lib/MCA/Stages/DispatchStage.cpp | 19 +- lib/MCA/Stages/EntryStage.cpp | 2 +- lib/MCA/Stages/ExecuteStage.cpp | 22 +- lib/MCA/Stages/RetireStage.cpp | 8 +- lib/Object/Archive.cpp | 6 +- lib/Object/ArchiveWriter.cpp | 35 +- lib/Object/Binary.cpp | 3 + lib/Object/COFFObjectFile.cpp | 198 +- lib/Object/Decompressor.cpp | 15 +- lib/Object/ELF.cpp | 2 + lib/Object/ELFObjectFile.cpp | 38 +- lib/Object/MachOObjectFile.cpp | 48 +- lib/Object/MachOUniversal.cpp | 38 +- lib/Object/Minidump.cpp | 46 +- lib/Object/Object.cpp | 10 +- lib/Object/ObjectFile.cpp | 11 +- lib/Object/RelocationResolver.cpp | 67 +- lib/Object/SymbolicFile.cpp | 1 + lib/Object/TapiFile.cpp | 104 + lib/Object/TapiUniversal.cpp | 54 + lib/Object/WasmObjectFile.cpp | 13 +- lib/Object/WindowsResource.cpp | 346 +- lib/Object/XCOFFObjectFile.cpp | 240 +- lib/ObjectYAML/COFFEmitter.cpp | 622 ++ lib/ObjectYAML/CodeViewYAMLSymbols.cpp | 2 +- lib/ObjectYAML/ELFEmitter.cpp | 1152 ++++ lib/ObjectYAML/ELFYAML.cpp | 325 +- lib/ObjectYAML/MachOEmitter.cpp | 580 ++ lib/ObjectYAML/MachOYAML.cpp | 9 + lib/ObjectYAML/MinidumpEmitter.cpp | 247 + lib/ObjectYAML/MinidumpYAML.cpp | 331 +- lib/ObjectYAML/WasmEmitter.cpp | 633 +++ lib/ObjectYAML/WasmYAML.cpp | 4 +- lib/ObjectYAML/yaml2obj.cpp | 77 + 
lib/Option/ArgList.cpp | 8 +- lib/Passes/PassBuilder.cpp | 164 +- lib/Passes/PassRegistry.def | 14 +- lib/ProfileData/Coverage/CoverageMapping.cpp | 60 +- lib/ProfileData/Coverage/CoverageMappingReader.cpp | 20 +- lib/ProfileData/Coverage/CoverageMappingWriter.cpp | 10 + lib/ProfileData/GCOV.cpp | 12 +- lib/ProfileData/InstrProf.cpp | 18 +- lib/ProfileData/InstrProfReader.cpp | 44 +- lib/ProfileData/InstrProfWriter.cpp | 2 +- lib/ProfileData/ProfileSummaryBuilder.cpp | 4 +- lib/ProfileData/SampleProf.cpp | 56 +- lib/ProfileData/SampleProfReader.cpp | 447 +- lib/ProfileData/SampleProfWriter.cpp | 279 +- lib/Remarks/BitstreamRemarkParser.cpp | 597 ++ lib/Remarks/BitstreamRemarkParser.h | 83 + lib/Remarks/BitstreamRemarkSerializer.cpp | 386 ++ lib/Remarks/RemarkFormat.cpp | 4 +- lib/Remarks/RemarkParser.cpp | 72 +- lib/Remarks/RemarkSerializer.cpp | 54 + lib/Remarks/RemarkStringTable.cpp | 28 +- lib/Remarks/YAMLRemarkParser.cpp | 165 +- lib/Remarks/YAMLRemarkParser.h | 38 +- lib/Remarks/YAMLRemarkSerializer.cpp | 134 +- lib/Support/AArch64TargetParser.cpp | 4 +- lib/Support/ABIBreak.cpp | 24 + lib/Support/APInt.cpp | 52 + lib/Support/ARMTargetParser.cpp | 8 +- lib/Support/CRC.cpp | 113 +- lib/Support/CachePruning.cpp | 2 +- lib/Support/CodeGenCoverage.cpp | 4 +- lib/Support/CommandLine.cpp | 2 +- lib/Support/CrashRecoveryContext.cpp | 8 +- lib/Support/DataExtractor.cpp | 160 +- lib/Support/Error.cpp | 17 +- lib/Support/FileCheck.cpp | 356 +- lib/Support/FileCheckImpl.h | 624 +++ lib/Support/FileCollector.cpp | 268 + lib/Support/FileOutputBuffer.cpp | 6 +- lib/Support/FileUtilities.cpp | 66 + lib/Support/GlobPattern.cpp | 23 +- lib/Support/Host.cpp | 34 +- lib/Support/JSON.cpp | 2 +- lib/Support/JamCRC.cpp | 96 - lib/Support/ManagedStatic.cpp | 13 +- lib/Support/MemoryBuffer.cpp | 31 +- lib/Support/Mutex.cpp | 123 - lib/Support/Parallel.cpp | 31 +- lib/Support/Path.cpp | 6 +- lib/Support/PrettyStackTrace.cpp | 64 +- lib/Support/RWMutex.cpp | 58 +- lib/Support/Regex.cpp | 39 +- lib/Support/Signposts.cpp | 2 + lib/Support/SpecialCaseList.cpp | 4 +- lib/Support/Statistic.cpp | 27 +- lib/Support/StringExtras.cpp | 4 +- lib/Support/TimeProfiler.cpp | 63 +- lib/Support/Timer.cpp | 10 +- lib/Support/Unix/Memory.inc | 6 +- lib/Support/Unix/Mutex.inc | 42 - lib/Support/Unix/Path.inc | 73 +- lib/Support/Unix/Process.inc | 7 +- lib/Support/Unix/Program.inc | 4 +- lib/Support/Unix/RWMutex.inc | 50 - lib/Support/Unix/Signals.inc | 15 +- lib/Support/VirtualFileSystem.cpp | 102 +- lib/Support/Windows/Mutex.inc | 56 - lib/Support/Windows/Path.inc | 93 +- lib/Support/Windows/Program.inc | 2 +- lib/Support/Windows/RWMutex.inc | 128 - lib/Support/Windows/Signals.inc | 3 + lib/Support/Windows/WindowsSupport.h | 1 + lib/Support/Windows/explicit_symbols.inc | 6 - lib/Support/YAMLTraits.cpp | 16 +- lib/Support/Z3Solver.cpp | 2 +- lib/Support/raw_ostream.cpp | 35 +- lib/Support/regcomp.c | 7 +- lib/TableGen/Error.cpp | 2 + lib/TableGen/Main.cpp | 21 +- lib/TableGen/Record.cpp | 11 +- lib/TableGen/SetTheory.cpp | 22 +- lib/TableGen/TGLexer.cpp | 4 +- lib/TableGen/TGParser.cpp | 28 +- lib/Target/AArch64/AArch64.h | 6 +- lib/Target/AArch64/AArch64.td | 80 +- lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp | 12 +- lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp | 16 +- lib/Target/AArch64/AArch64AsmPrinter.cpp | 280 +- lib/Target/AArch64/AArch64CallLowering.cpp | 632 ++- lib/Target/AArch64/AArch64CallLowering.h | 29 +- lib/Target/AArch64/AArch64CallingConvention.cpp | 38 +- lib/Target/AArch64/AArch64CallingConvention.h | 
3 + lib/Target/AArch64/AArch64CallingConvention.td | 88 +- lib/Target/AArch64/AArch64CollectLOH.cpp | 22 +- lib/Target/AArch64/AArch64Combine.td | 18 + lib/Target/AArch64/AArch64CondBrTuning.cpp | 4 +- lib/Target/AArch64/AArch64ConditionalCompares.cpp | 6 +- .../AArch64/AArch64DeadRegisterDefinitionsPass.cpp | 4 +- lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp | 76 +- lib/Target/AArch64/AArch64FalkorHWPFFix.cpp | 2 +- lib/Target/AArch64/AArch64FastISel.cpp | 75 +- lib/Target/AArch64/AArch64FrameLowering.cpp | 301 +- lib/Target/AArch64/AArch64FrameLowering.h | 28 +- lib/Target/AArch64/AArch64ISelDAGToDAG.cpp | 45 +- lib/Target/AArch64/AArch64ISelLowering.cpp | 535 +- lib/Target/AArch64/AArch64ISelLowering.h | 37 +- lib/Target/AArch64/AArch64InstrAtomics.td | 65 +- lib/Target/AArch64/AArch64InstrFormats.td | 220 +- lib/Target/AArch64/AArch64InstrInfo.cpp | 1054 ++-- lib/Target/AArch64/AArch64InstrInfo.h | 12 +- lib/Target/AArch64/AArch64InstrInfo.td | 253 +- lib/Target/AArch64/AArch64InstructionSelector.cpp | 1096 +++- lib/Target/AArch64/AArch64LegalizerInfo.cpp | 111 +- lib/Target/AArch64/AArch64LegalizerInfo.h | 3 + lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp | 160 +- lib/Target/AArch64/AArch64MCInstLower.cpp | 2 + lib/Target/AArch64/AArch64MachineFunctionInfo.h | 17 + lib/Target/AArch64/AArch64PBQPRegAlloc.cpp | 16 +- lib/Target/AArch64/AArch64PreLegalizerCombiner.cpp | 98 +- lib/Target/AArch64/AArch64RegisterBankInfo.cpp | 39 +- lib/Target/AArch64/AArch64RegisterInfo.cpp | 69 +- lib/Target/AArch64/AArch64SIMDInstrOpt.cpp | 8 +- lib/Target/AArch64/AArch64SVEInstrInfo.td | 264 +- lib/Target/AArch64/AArch64SelectionDAGInfo.cpp | 2 +- lib/Target/AArch64/AArch64SpeculationHardening.cpp | 13 +- lib/Target/AArch64/AArch64StackOffset.h | 138 + lib/Target/AArch64/AArch64StackTagging.cpp | 394 +- lib/Target/AArch64/AArch64StackTaggingPreRA.cpp | 209 + lib/Target/AArch64/AArch64StorePairSuppress.cpp | 2 +- lib/Target/AArch64/AArch64Subtarget.cpp | 50 +- lib/Target/AArch64/AArch64Subtarget.h | 48 +- lib/Target/AArch64/AArch64SystemOperands.td | 40 +- lib/Target/AArch64/AArch64TargetMachine.cpp | 35 +- lib/Target/AArch64/AArch64TargetObjectFile.cpp | 4 +- lib/Target/AArch64/AArch64TargetObjectFile.h | 3 +- lib/Target/AArch64/AArch64TargetTransformInfo.cpp | 29 +- lib/Target/AArch64/AArch64TargetTransformInfo.h | 14 +- lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp | 123 +- .../AArch64/MCTargetDesc/AArch64AsmBackend.cpp | 13 +- .../MCTargetDesc/AArch64ELFObjectWriter.cpp | 22 +- .../AArch64/MCTargetDesc/AArch64InstPrinter.cpp | 3 +- .../AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp | 5 +- lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h | 2 +- lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp | 7 + lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h | 20 +- .../AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp | 2 +- .../MCTargetDesc/AArch64MachObjectWriter.cpp | 4 +- .../MCTargetDesc/AArch64WinCOFFObjectWriter.cpp | 2 +- lib/Target/AArch64/SVEInstrFormats.td | 366 +- lib/Target/AArch64/Utils/AArch64BaseInfo.cpp | 2 +- lib/Target/AArch64/Utils/AArch64BaseInfo.h | 25 +- lib/Target/AMDGPU/AMDGPU.h | 4 + lib/Target/AMDGPU/AMDGPU.td | 16 + lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp | 21 +- lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h | 4 +- lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp | 78 +- lib/Target/AMDGPU/AMDGPUAsmPrinter.h | 10 +- lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp | 345 +- lib/Target/AMDGPU/AMDGPUCallLowering.cpp | 701 ++- lib/Target/AMDGPU/AMDGPUCallLowering.h | 29 +- 
lib/Target/AMDGPU/AMDGPUCallingConv.td | 27 +- lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp | 12 +- lib/Target/AMDGPU/AMDGPUFrameLowering.cpp | 6 +- lib/Target/AMDGPU/AMDGPUFrameLowering.h | 4 +- lib/Target/AMDGPU/AMDGPUGISel.td | 78 +- lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def | 80 +- lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp | 9 +- lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h | 2 +- lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 272 +- lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 250 +- lib/Target/AMDGPU/AMDGPUISelLowering.h | 18 +- lib/Target/AMDGPU/AMDGPUInline.cpp | 2 +- lib/Target/AMDGPU/AMDGPUInstrInfo.td | 126 +- lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp | 1144 ++-- lib/Target/AMDGPU/AMDGPUInstructionSelector.h | 48 +- lib/Target/AMDGPU/AMDGPUInstructions.td | 216 +- lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 1132 +++- lib/Target/AMDGPU/AMDGPULegalizerInfo.h | 52 +- lib/Target/AMDGPU/AMDGPULibCalls.cpp | 37 +- lib/Target/AMDGPU/AMDGPULibFunc.cpp | 14 +- lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp | 20 +- lib/Target/AMDGPU/AMDGPUMCInstLower.cpp | 4 + lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp | 38 +- lib/Target/AMDGPU/AMDGPUMachineFunction.cpp | 1 - lib/Target/AMDGPU/AMDGPUMachineFunction.h | 6 +- lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp | 592 ++ lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp | 2 +- lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 1234 +++- lib/Target/AMDGPU/AMDGPURegisterBankInfo.h | 56 +- lib/Target/AMDGPU/AMDGPURegisterBanks.td | 6 +- lib/Target/AMDGPU/AMDGPURegisterInfo.cpp | 66 +- lib/Target/AMDGPU/AMDGPURegisterInfo.h | 2 +- lib/Target/AMDGPU/AMDGPUSearchableTables.td | 4 + lib/Target/AMDGPU/AMDGPUSubtarget.cpp | 64 +- lib/Target/AMDGPU/AMDGPUSubtarget.h | 67 +- lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 29 +- lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp | 90 +- lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h | 20 +- lib/Target/AMDGPU/AMDILCFGStructurizer.cpp | 8 +- lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 595 +- lib/Target/AMDGPU/BUFInstructions.td | 694 ++- lib/Target/AMDGPU/DSInstructions.td | 92 +- .../AMDGPU/Disassembler/AMDGPUDisassembler.cpp | 4 +- lib/Target/AMDGPU/EvergreenInstructions.td | 60 +- lib/Target/AMDGPU/FLATInstructions.td | 196 +- lib/Target/AMDGPU/GCNDPPCombine.cpp | 88 +- lib/Target/AMDGPU/GCNHazardRecognizer.cpp | 21 +- lib/Target/AMDGPU/GCNILPSched.cpp | 1 + lib/Target/AMDGPU/GCNIterativeScheduler.cpp | 2 +- lib/Target/AMDGPU/GCNNSAReassign.cpp | 8 +- lib/Target/AMDGPU/GCNRegBankReassign.cpp | 14 +- lib/Target/AMDGPU/GCNRegPressure.cpp | 26 +- lib/Target/AMDGPU/GCNRegPressure.h | 2 +- lib/Target/AMDGPU/GCNSchedStrategy.cpp | 31 +- lib/Target/AMDGPU/GCNSchedStrategy.h | 3 + .../AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp | 2 +- .../AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp | 2 +- .../AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp | 37 +- lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h | 6 +- .../AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp | 4 +- lib/Target/AMDGPU/MIMGInstructions.td | 4 +- lib/Target/AMDGPU/R600AsmPrinter.cpp | 2 +- lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp | 4 +- lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp | 22 +- lib/Target/AMDGPU/R600FrameLowering.h | 6 +- lib/Target/AMDGPU/R600ISelLowering.cpp | 7 +- lib/Target/AMDGPU/R600InstrInfo.cpp | 22 +- lib/Target/AMDGPU/R600MachineScheduler.cpp | 8 +- lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp | 12 +- lib/Target/AMDGPU/R600Packetizer.cpp | 4 +- lib/Target/AMDGPU/R600RegisterInfo.cpp | 2 +- lib/Target/AMDGPU/SIAddIMGInit.cpp 
| 4 +- lib/Target/AMDGPU/SIDefines.h | 6 +- lib/Target/AMDGPU/SIFixSGPRCopies.cpp | 394 +- lib/Target/AMDGPU/SIFixupVectorISel.cpp | 3 +- lib/Target/AMDGPU/SIFoldOperands.cpp | 114 +- lib/Target/AMDGPU/SIFormMemoryClauses.cpp | 22 +- lib/Target/AMDGPU/SIFrameLowering.cpp | 34 +- lib/Target/AMDGPU/SIFrameLowering.h | 6 +- lib/Target/AMDGPU/SIISelLowering.cpp | 1052 ++-- lib/Target/AMDGPU/SIISelLowering.h | 54 +- lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 6 +- lib/Target/AMDGPU/SIInstrFormats.td | 5 + lib/Target/AMDGPU/SIInstrInfo.cpp | 558 +- lib/Target/AMDGPU/SIInstrInfo.h | 44 +- lib/Target/AMDGPU/SIInstrInfo.td | 320 +- lib/Target/AMDGPU/SIInstructions.td | 297 +- lib/Target/AMDGPU/SILoadStoreOptimizer.cpp | 1117 ++-- lib/Target/AMDGPU/SILowerControlFlow.cpp | 60 +- lib/Target/AMDGPU/SILowerI1Copies.cpp | 49 +- lib/Target/AMDGPU/SILowerSGPRSpills.cpp | 4 +- lib/Target/AMDGPU/SIMachineFunctionInfo.cpp | 9 +- lib/Target/AMDGPU/SIMachineFunctionInfo.h | 11 +- lib/Target/AMDGPU/SIMachineScheduler.cpp | 16 +- lib/Target/AMDGPU/SIMemoryLegalizer.cpp | 6 +- lib/Target/AMDGPU/SIModeRegister.cpp | 2 +- lib/Target/AMDGPU/SIOptimizeExecMasking.cpp | 2 +- lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp | 32 +- lib/Target/AMDGPU/SIPeepholeSDWA.cpp | 32 +- lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp | 12 +- lib/Target/AMDGPU/SIProgramInfo.h | 5 + lib/Target/AMDGPU/SIRegisterInfo.cpp | 445 +- lib/Target/AMDGPU/SIRegisterInfo.h | 15 +- lib/Target/AMDGPU/SIRegisterInfo.td | 464 +- lib/Target/AMDGPU/SIShrinkInstructions.cpp | 42 +- lib/Target/AMDGPU/SIWholeQuadMode.cpp | 35 +- lib/Target/AMDGPU/SMInstructions.td | 15 +- lib/Target/AMDGPU/SOPInstructions.td | 42 +- lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp | 71 +- lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 24 +- lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp | 2 + lib/Target/AMDGPU/VOP1Instructions.td | 65 +- lib/Target/AMDGPU/VOP2Instructions.td | 346 +- lib/Target/AMDGPU/VOP3Instructions.td | 69 +- lib/Target/AMDGPU/VOP3PInstructions.td | 2 + lib/Target/AMDGPU/VOPCInstructions.td | 30 +- lib/Target/AMDGPU/VOPInstructions.td | 12 +- lib/Target/ARC/ARCFrameLowering.h | 4 +- lib/Target/ARC/ARCISelLowering.cpp | 2 +- lib/Target/ARC/ARCMachineFunctionInfo.h | 4 +- lib/Target/ARC/ARCOptAddrMode.cpp | 16 +- lib/Target/ARC/ARCRegisterInfo.cpp | 2 +- lib/Target/ARC/ARCTargetMachine.cpp | 2 +- lib/Target/ARM/A15SDOptimizer.cpp | 54 +- lib/Target/ARM/ARM.h | 2 + lib/Target/ARM/ARM.td | 42 +- lib/Target/ARM/ARMAsmPrinter.cpp | 66 +- lib/Target/ARM/ARMBaseInstrInfo.cpp | 216 +- lib/Target/ARM/ARMBaseInstrInfo.h | 21 +- lib/Target/ARM/ARMBaseRegisterInfo.cpp | 49 +- lib/Target/ARM/ARMBaseRegisterInfo.h | 5 +- lib/Target/ARM/ARMBasicBlockInfo.cpp | 16 +- lib/Target/ARM/ARMBasicBlockInfo.h | 31 +- lib/Target/ARM/ARMCallLowering.cpp | 54 +- lib/Target/ARM/ARMCallLowering.h | 5 +- lib/Target/ARM/ARMCallingConv.cpp | 2 +- lib/Target/ARM/ARMCodeGenPrepare.cpp | 88 +- lib/Target/ARM/ARMConstantIslandPass.cpp | 289 +- lib/Target/ARM/ARMConstantPoolValue.cpp | 1 + lib/Target/ARM/ARMExpandPseudoInsts.cpp | 96 +- lib/Target/ARM/ARMFastISel.cpp | 88 +- lib/Target/ARM/ARMFrameLowering.cpp | 65 +- lib/Target/ARM/ARMFrameLowering.h | 5 + lib/Target/ARM/ARMISelDAGToDAG.cpp | 224 +- lib/Target/ARM/ARMISelLowering.cpp | 2073 +++++-- lib/Target/ARM/ARMISelLowering.h | 45 +- lib/Target/ARM/ARMInstrFormats.td | 23 +- lib/Target/ARM/ARMInstrInfo.cpp | 2 +- lib/Target/ARM/ARMInstrInfo.td | 127 +- lib/Target/ARM/ARMInstrMVE.td | 1430 ++++- lib/Target/ARM/ARMInstrNEON.td | 191 +- 
lib/Target/ARM/ARMInstrThumb.td | 16 +- lib/Target/ARM/ARMInstrThumb2.td | 98 +- lib/Target/ARM/ARMInstrVFP.td | 96 +- lib/Target/ARM/ARMInstructionSelector.cpp | 41 +- lib/Target/ARM/ARMLegalizerInfo.cpp | 2 + lib/Target/ARM/ARMLoadStoreOptimizer.cpp | 32 +- lib/Target/ARM/ARMLowOverheadLoops.cpp | 364 +- lib/Target/ARM/ARMMCInstLower.cpp | 4 +- lib/Target/ARM/ARMMachineFunctionInfo.h | 8 + lib/Target/ARM/ARMParallelDSP.cpp | 675 ++- lib/Target/ARM/ARMPredicates.td | 2 +- lib/Target/ARM/ARMRegisterInfo.td | 18 +- lib/Target/ARM/ARMScheduleA9.td | 4 +- lib/Target/ARM/ARMScheduleM4.td | 24 +- lib/Target/ARM/ARMSubtarget.cpp | 14 +- lib/Target/ARM/ARMSubtarget.h | 30 +- lib/Target/ARM/ARMTargetMachine.cpp | 13 +- lib/Target/ARM/ARMTargetTransformInfo.cpp | 370 +- lib/Target/ARM/ARMTargetTransformInfo.h | 24 +- lib/Target/ARM/AsmParser/ARMAsmParser.cpp | 251 +- lib/Target/ARM/Disassembler/ARMDisassembler.cpp | 31 +- lib/Target/ARM/MCTargetDesc/ARMAddressingModes.h | 20 +- lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp | 14 +- lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h | 3 + lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp | 6 +- lib/Target/ARM/MCTargetDesc/ARMInstPrinter.cpp | 12 +- lib/Target/ARM/MCTargetDesc/ARMInstPrinter.h | 5 +- lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp | 21 +- .../ARM/MCTargetDesc/ARMMachObjectWriter.cpp | 6 +- lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp | 4 +- .../ARM/MCTargetDesc/ARMWinCOFFObjectWriter.cpp | 2 +- lib/Target/ARM/MCTargetDesc/ARMWinCOFFStreamer.cpp | 10 - lib/Target/ARM/MLxExpansionPass.cpp | 42 +- lib/Target/ARM/MVETailPredication.cpp | 519 ++ lib/Target/ARM/MVEVPTBlockPass.cpp | 278 + lib/Target/ARM/Thumb1FrameLowering.cpp | 8 +- lib/Target/ARM/Thumb1InstrInfo.cpp | 17 +- lib/Target/ARM/Thumb2ITBlockPass.cpp | 134 +- lib/Target/ARM/Thumb2InstrInfo.cpp | 38 +- lib/Target/ARM/Thumb2SizeReduction.cpp | 28 +- lib/Target/ARM/ThumbRegisterInfo.cpp | 11 +- lib/Target/AVR/AVRAsmPrinter.cpp | 2 +- lib/Target/AVR/AVRExpandPseudoInsts.cpp | 12 +- lib/Target/AVR/AVRFrameLowering.cpp | 5 +- lib/Target/AVR/AVRISelDAGToDAG.cpp | 2 +- lib/Target/AVR/AVRISelLowering.cpp | 27 +- lib/Target/AVR/AVRISelLowering.h | 4 +- lib/Target/AVR/AVRRegisterInfo.cpp | 2 +- lib/Target/AVR/AVRTargetMachine.cpp | 2 +- lib/Target/AVR/AsmParser/AVRAsmParser.cpp | 8 +- lib/Target/AVR/MCTargetDesc/AVRELFObjectWriter.cpp | 2 +- lib/Target/BPF/AsmParser/BPFAsmParser.cpp | 6 +- lib/Target/BPF/BPF.h | 4 +- lib/Target/BPF/BPFAbstractMemberAccess.cpp | 708 ++- lib/Target/BPF/BPFAsmPrinter.cpp | 2 +- lib/Target/BPF/BPFCORE.h | 14 +- lib/Target/BPF/BPFFrameLowering.h | 2 +- lib/Target/BPF/BPFISelDAGToDAG.cpp | 170 +- lib/Target/BPF/BPFISelLowering.cpp | 21 +- lib/Target/BPF/BPFInstrInfo.cpp | 6 +- lib/Target/BPF/BPFInstrInfo.td | 2 +- lib/Target/BPF/BPFMIChecking.cpp | 1 + lib/Target/BPF/BPFMIPeephole.cpp | 206 +- lib/Target/BPF/BPFMISimplifyPatchable.cpp | 27 +- lib/Target/BPF/BPFRegisterInfo.cpp | 6 +- lib/Target/BPF/BPFTargetMachine.cpp | 16 +- lib/Target/BPF/BTF.h | 54 +- lib/Target/BPF/BTFDebug.cpp | 281 +- lib/Target/BPF/BTFDebug.h | 29 +- lib/Target/BPF/MCTargetDesc/BPFELFObjectWriter.cpp | 4 +- lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp | 2 +- lib/Target/Hexagon/BitTracker.cpp | 21 +- lib/Target/Hexagon/HexagonAsmPrinter.cpp | 2 +- lib/Target/Hexagon/HexagonBitSimplify.cpp | 71 +- lib/Target/Hexagon/HexagonBitTracker.cpp | 8 +- lib/Target/Hexagon/HexagonBlockRanges.cpp | 14 +- lib/Target/Hexagon/HexagonBranchRelaxation.cpp | 5 +- 
lib/Target/Hexagon/HexagonConstExtenders.cpp | 17 +- lib/Target/Hexagon/HexagonConstPropagation.cpp | 32 +- lib/Target/Hexagon/HexagonCopyToCombine.cpp | 32 +- lib/Target/Hexagon/HexagonDepMapAsm2Intrin.td | 696 +-- lib/Target/Hexagon/HexagonDepOperands.td | 83 +- lib/Target/Hexagon/HexagonEarlyIfConv.cpp | 24 +- lib/Target/Hexagon/HexagonExpandCondsets.cpp | 30 +- lib/Target/Hexagon/HexagonFixupHwLoops.cpp | 5 +- lib/Target/Hexagon/HexagonFrameLowering.cpp | 58 +- lib/Target/Hexagon/HexagonFrameLowering.h | 2 +- lib/Target/Hexagon/HexagonGenExtract.cpp | 2 +- lib/Target/Hexagon/HexagonGenInsert.cpp | 27 +- lib/Target/Hexagon/HexagonGenMux.cpp | 6 +- lib/Target/Hexagon/HexagonGenPredicate.cpp | 14 +- lib/Target/Hexagon/HexagonHardwareLoops.cpp | 56 +- lib/Target/Hexagon/HexagonISelDAGToDAG.cpp | 2 +- lib/Target/Hexagon/HexagonISelLowering.cpp | 156 +- lib/Target/Hexagon/HexagonISelLowering.h | 15 +- lib/Target/Hexagon/HexagonISelLoweringHVX.cpp | 24 + lib/Target/Hexagon/HexagonInstrInfo.cpp | 273 +- lib/Target/Hexagon/HexagonInstrInfo.h | 22 +- lib/Target/Hexagon/HexagonIntrinsics.td | 46 +- lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp | 15 +- lib/Target/Hexagon/HexagonNewValueJump.cpp | 10 +- lib/Target/Hexagon/HexagonOptAddrMode.cpp | 12 +- lib/Target/Hexagon/HexagonPatterns.td | 194 +- lib/Target/Hexagon/HexagonPatternsHVX.td | 40 +- lib/Target/Hexagon/HexagonPeephole.cpp | 38 +- lib/Target/Hexagon/HexagonRegisterInfo.cpp | 6 +- .../Hexagon/HexagonSplitConst32AndConst64.cpp | 8 +- lib/Target/Hexagon/HexagonSplitDouble.cpp | 60 +- lib/Target/Hexagon/HexagonStoreWidening.cpp | 2 +- lib/Target/Hexagon/HexagonSubtarget.cpp | 19 +- lib/Target/Hexagon/HexagonSubtarget.h | 2 +- lib/Target/Hexagon/HexagonTargetMachine.cpp | 12 +- lib/Target/Hexagon/HexagonTargetTransformInfo.cpp | 2 + lib/Target/Hexagon/HexagonTargetTransformInfo.h | 4 +- lib/Target/Hexagon/HexagonVExtract.cpp | 12 +- lib/Target/Hexagon/HexagonVLIWPacketizer.cpp | 37 +- lib/Target/Hexagon/HexagonVLIWPacketizer.h | 3 +- .../Hexagon/MCTargetDesc/HexagonAsmBackend.cpp | 6 +- .../MCTargetDesc/HexagonELFObjectWriter.cpp | 4 +- .../Hexagon/MCTargetDesc/HexagonMCChecker.cpp | 7 +- .../Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp | 4 +- .../Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp | 10 +- lib/Target/Hexagon/RDFCopy.cpp | 4 +- lib/Target/Hexagon/RDFDeadCode.cpp | 1 + lib/Target/Hexagon/RDFGraph.cpp | 16 +- lib/Target/Hexagon/RDFLiveness.cpp | 8 +- lib/Target/Hexagon/RDFRegisters.cpp | 8 +- lib/Target/Hexagon/RDFRegisters.h | 8 +- lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp | 23 +- lib/Target/Lanai/LanaiAsmPrinter.cpp | 2 +- lib/Target/Lanai/LanaiDelaySlotFiller.cpp | 2 +- lib/Target/Lanai/LanaiFrameLowering.cpp | 4 +- lib/Target/Lanai/LanaiFrameLowering.h | 2 +- lib/Target/Lanai/LanaiISelLowering.cpp | 15 +- lib/Target/Lanai/LanaiISelLowering.h | 4 +- lib/Target/Lanai/LanaiInstrInfo.cpp | 9 +- lib/Target/Lanai/LanaiInstrInfo.h | 3 +- lib/Target/Lanai/LanaiRegisterInfo.cpp | 2 +- .../Lanai/MCTargetDesc/LanaiELFObjectWriter.cpp | 2 +- lib/Target/MSP430/AsmParser/MSP430AsmParser.cpp | 12 +- .../MSP430/MCTargetDesc/MSP430ELFObjectWriter.cpp | 4 +- lib/Target/MSP430/MSP430AsmPrinter.cpp | 8 +- lib/Target/MSP430/MSP430BranchSelector.cpp | 1 + lib/Target/MSP430/MSP430FrameLowering.h | 3 +- lib/Target/MSP430/MSP430ISelLowering.cpp | 27 +- lib/Target/MSP430/MSP430ISelLowering.h | 2 + lib/Target/MSP430/MSP430RegisterInfo.cpp | 2 +- lib/Target/MSP430/MSP430TargetMachine.cpp | 2 +- lib/Target/Mips/AsmParser/MipsAsmParser.cpp | 696 ++- 
lib/Target/Mips/Disassembler/MipsDisassembler.cpp | 16 + lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp | 1 - lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h | 6 +- .../Mips/MCTargetDesc/MipsELFObjectWriter.cpp | 4 +- lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp | 7 +- lib/Target/Mips/MCTargetDesc/MipsMCNaCl.h | 5 +- lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp | 11 +- .../Mips/MCTargetDesc/MipsNaClELFStreamer.cpp | 2 +- lib/Target/Mips/MCTargetDesc/MipsOptionRecord.cpp | 4 +- .../Mips/MCTargetDesc/MipsTargetStreamer.cpp | 96 +- lib/Target/Mips/MicroMipsDSPInstrInfo.td | 4 +- lib/Target/Mips/MicroMipsInstrInfo.td | 9 +- lib/Target/Mips/MicroMipsSizeReduction.cpp | 18 +- lib/Target/Mips/Mips.td | 12 + lib/Target/Mips/Mips16ISelDAGToDAG.cpp | 2 +- lib/Target/Mips/Mips16ISelLowering.cpp | 16 +- lib/Target/Mips/Mips16InstrInfo.cpp | 2 +- lib/Target/Mips/Mips64InstrInfo.td | 36 + lib/Target/Mips/MipsAsmPrinter.cpp | 12 +- lib/Target/Mips/MipsCallLowering.cpp | 150 +- lib/Target/Mips/MipsCallLowering.h | 8 +- lib/Target/Mips/MipsConstantIslandPass.cpp | 63 +- lib/Target/Mips/MipsDSPInstrInfo.td | 19 +- lib/Target/Mips/MipsExpandPseudo.cpp | 54 +- lib/Target/Mips/MipsFastISel.cpp | 12 +- lib/Target/Mips/MipsFrameLowering.h | 5 +- lib/Target/Mips/MipsISelDAGToDAG.cpp | 53 +- lib/Target/Mips/MipsISelDAGToDAG.h | 5 + lib/Target/Mips/MipsISelLowering.cpp | 164 +- lib/Target/Mips/MipsISelLowering.h | 13 +- lib/Target/Mips/MipsInstrInfo.cpp | 3 +- lib/Target/Mips/MipsInstrInfo.h | 2 +- lib/Target/Mips/MipsInstrInfo.td | 30 +- lib/Target/Mips/MipsInstructionSelector.cpp | 206 +- lib/Target/Mips/MipsLegalizerInfo.cpp | 244 +- lib/Target/Mips/MipsLegalizerInfo.h | 3 + lib/Target/Mips/MipsMSAInstrInfo.td | 55 +- lib/Target/Mips/MipsOptimizePICCall.cpp | 5 +- lib/Target/Mips/MipsPfmCounters.td | 18 + lib/Target/Mips/MipsPreLegalizerCombiner.cpp | 3 +- lib/Target/Mips/MipsRegisterBankInfo.cpp | 322 +- lib/Target/Mips/MipsRegisterBankInfo.h | 9 + lib/Target/Mips/MipsRegisterBanks.td | 2 +- lib/Target/Mips/MipsSEFrameLowering.cpp | 55 +- lib/Target/Mips/MipsSEISelDAGToDAG.cpp | 54 +- lib/Target/Mips/MipsSEISelDAGToDAG.h | 6 +- lib/Target/Mips/MipsSEISelLowering.cpp | 124 +- lib/Target/Mips/MipsSEInstrInfo.cpp | 20 +- lib/Target/Mips/MipsSERegisterInfo.cpp | 8 +- lib/Target/Mips/MipsSubtarget.cpp | 17 +- lib/Target/Mips/MipsSubtarget.h | 15 +- lib/Target/Mips/MipsTargetMachine.cpp | 18 +- lib/Target/Mips/MipsTargetStreamer.h | 14 +- lib/Target/NVPTX/NVPTX.h | 2 +- lib/Target/NVPTX/NVPTXAsmPrinter.cpp | 34 +- lib/Target/NVPTX/NVPTXAsmPrinter.h | 2 +- lib/Target/NVPTX/NVPTXFrameLowering.cpp | 2 +- lib/Target/NVPTX/NVPTXISelLowering.cpp | 58 +- lib/Target/NVPTX/NVPTXInstrInfo.td | 13 +- lib/Target/NVPTX/NVPTXIntrinsics.td | 169 +- lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp | 2 +- lib/Target/NVPTX/NVPTXLowerAlloca.cpp | 97 +- lib/Target/NVPTX/NVPTXLowerArgs.cpp | 2 +- lib/Target/NVPTX/NVPTXPeephole.cpp | 2 +- lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp | 2 +- lib/Target/NVPTX/NVPTXTargetMachine.cpp | 2 +- lib/Target/NVPTX/NVPTXUtilities.cpp | 13 +- lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp | 10 +- .../PowerPC/Disassembler/PPCDisassembler.cpp | 6 - .../PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp | 6 +- lib/Target/PowerPC/MCTargetDesc/PPCInstPrinter.cpp | 25 + lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp | 1 + lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.cpp | 4 +- lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.h | 14 +- .../PowerPC/MCTargetDesc/PPCMachObjectWriter.cpp | 4 +- 
.../PowerPC/MCTargetDesc/PPCXCOFFObjectWriter.cpp | 2 +- lib/Target/PowerPC/P9InstrResources.td | 8 +- lib/Target/PowerPC/PPC.h | 8 +- lib/Target/PowerPC/PPCAsmPrinter.cpp | 482 +- lib/Target/PowerPC/PPCBranchCoalescing.cpp | 13 +- lib/Target/PowerPC/PPCBranchSelector.cpp | 29 +- lib/Target/PowerPC/PPCFastISel.cpp | 41 +- lib/Target/PowerPC/PPCFrameLowering.cpp | 71 +- lib/Target/PowerPC/PPCFrameLowering.h | 11 +- lib/Target/PowerPC/PPCISelDAGToDAG.cpp | 105 +- lib/Target/PowerPC/PPCISelLowering.cpp | 546 +- lib/Target/PowerPC/PPCISelLowering.h | 49 +- lib/Target/PowerPC/PPCInstr64Bit.td | 4 +- lib/Target/PowerPC/PPCInstrAltivec.td | 12 +- lib/Target/PowerPC/PPCInstrFormats.td | 9 +- lib/Target/PowerPC/PPCInstrInfo.cpp | 336 +- lib/Target/PowerPC/PPCInstrInfo.h | 42 +- lib/Target/PowerPC/PPCInstrInfo.td | 206 +- lib/Target/PowerPC/PPCInstrVSX.td | 180 +- lib/Target/PowerPC/PPCLoopPreIncPrep.cpp | 670 ++- lib/Target/PowerPC/PPCMCInstLower.cpp | 23 +- lib/Target/PowerPC/PPCMIPeephole.cpp | 82 +- lib/Target/PowerPC/PPCPreEmitPeephole.cpp | 106 +- lib/Target/PowerPC/PPCQPXLoadSplat.cpp | 6 +- lib/Target/PowerPC/PPCReduceCRLogicals.cpp | 15 +- lib/Target/PowerPC/PPCRegisterInfo.cpp | 39 +- lib/Target/PowerPC/PPCRegisterInfo.td | 22 +- lib/Target/PowerPC/PPCSubtarget.cpp | 18 +- lib/Target/PowerPC/PPCSubtarget.h | 24 +- lib/Target/PowerPC/PPCTLSDynamicCall.cpp | 4 +- lib/Target/PowerPC/PPCTOCRegDeps.cpp | 9 +- lib/Target/PowerPC/PPCTargetMachine.cpp | 38 +- lib/Target/PowerPC/PPCTargetTransformInfo.cpp | 68 +- lib/Target/PowerPC/PPCTargetTransformInfo.h | 12 +- lib/Target/PowerPC/PPCVSXCopy.cpp | 6 +- lib/Target/PowerPC/PPCVSXFMAMutate.cpp | 18 +- lib/Target/PowerPC/PPCVSXSwapRemoval.cpp | 32 +- lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp | 282 +- .../RISCV/Disassembler/RISCVDisassembler.cpp | 139 +- lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp | 17 +- .../RISCV/MCTargetDesc/RISCVELFObjectWriter.cpp | 13 +- lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp | 41 + lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.h | 8 +- lib/Target/RISCV/MCTargetDesc/RISCVMCAsmInfo.cpp | 20 + lib/Target/RISCV/MCTargetDesc/RISCVMCAsmInfo.h | 3 + .../RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp | 4 +- lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.h | 1 + .../RISCV/MCTargetDesc/RISCVMCTargetDesc.cpp | 4 +- lib/Target/RISCV/RISCV.h | 7 + lib/Target/RISCV/RISCV.td | 11 +- lib/Target/RISCV/RISCVCallLowering.cpp | 50 + lib/Target/RISCV/RISCVCallLowering.h | 42 + lib/Target/RISCV/RISCVCallingConv.td | 28 +- lib/Target/RISCV/RISCVExpandPseudoInsts.cpp | 54 +- lib/Target/RISCV/RISCVFrameLowering.cpp | 164 +- lib/Target/RISCV/RISCVFrameLowering.h | 9 +- lib/Target/RISCV/RISCVISelDAGToDAG.cpp | 5 +- lib/Target/RISCV/RISCVISelLowering.cpp | 323 +- lib/Target/RISCV/RISCVISelLowering.h | 6 + lib/Target/RISCV/RISCVInstrInfo.cpp | 118 +- lib/Target/RISCV/RISCVInstrInfo.h | 18 +- lib/Target/RISCV/RISCVInstrInfo.td | 22 + lib/Target/RISCV/RISCVInstrInfoA.td | 34 +- lib/Target/RISCV/RISCVInstrInfoC.td | 124 +- lib/Target/RISCV/RISCVInstrInfoF.td | 6 + lib/Target/RISCV/RISCVInstructionSelector.cpp | 103 + lib/Target/RISCV/RISCVLegalizerInfo.cpp | 23 + lib/Target/RISCV/RISCVLegalizerInfo.h | 28 + lib/Target/RISCV/RISCVMergeBaseOffset.cpp | 16 +- lib/Target/RISCV/RISCVRegisterBankInfo.cpp | 26 + lib/Target/RISCV/RISCVRegisterBankInfo.h | 37 + lib/Target/RISCV/RISCVRegisterBanks.td | 13 + lib/Target/RISCV/RISCVRegisterInfo.cpp | 13 +- lib/Target/RISCV/RISCVRegisterInfo.h | 6 + lib/Target/RISCV/RISCVRegisterInfo.td | 100 +- 
lib/Target/RISCV/RISCVSubtarget.cpp | 30 +- lib/Target/RISCV/RISCVSubtarget.h | 20 + lib/Target/RISCV/RISCVTargetMachine.cpp | 31 +- lib/Target/RISCV/Utils/RISCVBaseInfo.h | 16 + lib/Target/Sparc/AsmParser/SparcAsmParser.cpp | 8 +- lib/Target/Sparc/DelaySlotFiller.cpp | 10 +- .../Sparc/MCTargetDesc/SparcELFObjectWriter.cpp | 6 +- lib/Target/Sparc/SparcFrameLowering.cpp | 3 +- lib/Target/Sparc/SparcISelDAGToDAG.cpp | 4 +- lib/Target/Sparc/SparcISelLowering.cpp | 28 +- lib/Target/Sparc/SparcISelLowering.h | 4 +- lib/Target/Sparc/SparcInstr64Bit.td | 2 +- lib/Target/Sparc/SparcInstrInfo.cpp | 4 +- lib/Target/Sparc/SparcInstrInfo.td | 8 +- lib/Target/Sparc/SparcRegisterInfo.cpp | 12 +- lib/Target/Sparc/SparcTargetMachine.cpp | 4 +- lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp | 12 +- .../SystemZ/MCTargetDesc/SystemZMCObjectWriter.cpp | 2 +- lib/Target/SystemZ/SystemZ.h | 1 - lib/Target/SystemZ/SystemZAsmPrinter.cpp | 20 + lib/Target/SystemZ/SystemZAsmPrinter.h | 1 + lib/Target/SystemZ/SystemZElimCompare.cpp | 9 +- lib/Target/SystemZ/SystemZExpandPseudo.cpp | 152 - lib/Target/SystemZ/SystemZFrameLowering.cpp | 6 +- lib/Target/SystemZ/SystemZISelDAGToDAG.cpp | 11 +- lib/Target/SystemZ/SystemZISelLowering.cpp | 244 +- lib/Target/SystemZ/SystemZInstrFP.td | 32 +- lib/Target/SystemZ/SystemZInstrFormats.td | 166 +- lib/Target/SystemZ/SystemZInstrInfo.cpp | 168 +- lib/Target/SystemZ/SystemZInstrInfo.h | 29 +- lib/Target/SystemZ/SystemZInstrInfo.td | 22 +- lib/Target/SystemZ/SystemZInstrVector.td | 26 +- lib/Target/SystemZ/SystemZLongBranch.cpp | 26 +- lib/Target/SystemZ/SystemZMachineScheduler.cpp | 5 +- lib/Target/SystemZ/SystemZOperands.td | 121 +- lib/Target/SystemZ/SystemZOperators.td | 6 +- lib/Target/SystemZ/SystemZPatterns.td | 4 +- lib/Target/SystemZ/SystemZPostRewrite.cpp | 164 +- lib/Target/SystemZ/SystemZProcessors.td | 3 +- lib/Target/SystemZ/SystemZRegisterInfo.cpp | 19 +- lib/Target/SystemZ/SystemZRegisterInfo.h | 9 + lib/Target/SystemZ/SystemZSchedule.td | 2 +- lib/Target/SystemZ/SystemZScheduleArch13.td | 1695 ------ lib/Target/SystemZ/SystemZScheduleZ15.td | 1695 ++++++ lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp | 8 +- lib/Target/SystemZ/SystemZShortenInst.cpp | 4 +- lib/Target/SystemZ/SystemZTargetMachine.cpp | 11 +- lib/Target/SystemZ/SystemZTargetTransformInfo.cpp | 5 +- lib/Target/SystemZ/SystemZTargetTransformInfo.h | 8 +- lib/Target/TargetLoweringObjectFile.cpp | 1 + lib/Target/TargetMachine.cpp | 20 +- lib/Target/TargetMachineC.cpp | 2 +- .../WebAssembly/AsmParser/WebAssemblyAsmParser.cpp | 100 +- .../Disassembler/WebAssemblyDisassembler.cpp | 24 +- .../MCTargetDesc/WebAssemblyAsmBackend.cpp | 10 +- .../MCTargetDesc/WebAssemblyInstPrinter.cpp | 57 +- .../MCTargetDesc/WebAssemblyInstPrinter.h | 3 + .../MCTargetDesc/WebAssemblyMCCodeEmitter.cpp | 1 + .../MCTargetDesc/WebAssemblyMCTargetDesc.h | 74 +- .../MCTargetDesc/WebAssemblyTargetStreamer.cpp | 33 +- .../MCTargetDesc/WebAssemblyTargetStreamer.h | 3 - .../MCTargetDesc/WebAssemblyWasmObjectWriter.cpp | 11 +- lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp | 38 +- lib/Target/WebAssembly/WebAssemblyCFGSort.cpp | 5 +- lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp | 127 +- .../WebAssembly/WebAssemblyExplicitLocals.cpp | 22 +- lib/Target/WebAssembly/WebAssemblyFastISel.cpp | 36 +- .../WebAssembly/WebAssemblyFixFunctionBitcasts.cpp | 2 + .../WebAssemblyFixIrreducibleControlFlow.cpp | 3 +- .../WebAssembly/WebAssemblyFrameLowering.cpp | 8 +- lib/Target/WebAssembly/WebAssemblyFrameLowering.h | 4 +- 
lib/Target/WebAssembly/WebAssemblyISD.def | 2 + lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp | 131 +- lib/Target/WebAssembly/WebAssemblyISelLowering.cpp | 306 +- lib/Target/WebAssembly/WebAssemblyISelLowering.h | 2 +- lib/Target/WebAssembly/WebAssemblyInstrAtomics.td | 100 +- .../WebAssembly/WebAssemblyInstrBulkMemory.td | 4 +- lib/Target/WebAssembly/WebAssemblyInstrControl.td | 48 +- lib/Target/WebAssembly/WebAssemblyInstrConv.td | 17 + lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp | 4 +- lib/Target/WebAssembly/WebAssemblyInstrInfo.h | 2 +- lib/Target/WebAssembly/WebAssemblyInstrInfo.td | 3 +- lib/Target/WebAssembly/WebAssemblyInstrMemory.td | 51 - lib/Target/WebAssembly/WebAssemblyInstrSIMD.td | 221 +- .../WebAssembly/WebAssemblyLateEHPrepare.cpp | 12 +- .../WebAssembly/WebAssemblyLowerBrUnless.cpp | 4 +- .../WebAssemblyLowerEmscriptenEHSjLj.cpp | 119 +- .../WebAssembly/WebAssemblyLowerGlobalDtors.cpp | 2 +- lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp | 55 +- lib/Target/WebAssembly/WebAssemblyMCInstLower.h | 3 + .../WebAssembly/WebAssemblyMachineFunctionInfo.cpp | 12 +- .../WebAssembly/WebAssemblyMachineFunctionInfo.h | 13 +- .../WebAssembly/WebAssemblyMemIntrinsicResults.cpp | 7 +- .../WebAssemblyOptimizeLiveIntervals.cpp | 2 +- .../WebAssembly/WebAssemblyOptimizeReturned.cpp | 7 +- lib/Target/WebAssembly/WebAssemblyPeephole.cpp | 107 +- .../WebAssemblyPrepareForLiveIntervals.cpp | 2 +- lib/Target/WebAssembly/WebAssemblyRegColoring.cpp | 7 +- lib/Target/WebAssembly/WebAssemblyRegNumbering.cpp | 2 +- lib/Target/WebAssembly/WebAssemblyRegStackify.cpp | 24 +- lib/Target/WebAssembly/WebAssemblyRegisterInfo.cpp | 6 +- .../WebAssembly/WebAssemblyTargetMachine.cpp | 2 +- .../WebAssembly/WebAssemblyTargetTransformInfo.cpp | 5 +- .../WebAssembly/WebAssemblyTargetTransformInfo.h | 2 +- lib/Target/WebAssembly/WebAssemblyUtilities.cpp | 21 +- lib/Target/X86/AsmParser/X86AsmParser.cpp | 170 + lib/Target/X86/AsmParser/X86AsmParserCommon.h | 4 + lib/Target/X86/AsmParser/X86Operand.h | 25 +- .../X86/Disassembler/X86DisassemblerDecoder.cpp | 5 +- lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp | 6 +- lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp | 19 +- lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp | 2 + lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp | 3 + lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp | 61 +- lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h | 11 +- .../X86/MCTargetDesc/X86MachObjectWriter.cpp | 7 +- .../X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp | 2 +- lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp | 5 +- .../X86/MCTargetDesc/X86WinCOFFTargetStreamer.cpp | 2 +- lib/Target/X86/X86.h | 10 +- lib/Target/X86/X86.td | 56 +- lib/Target/X86/X86AsmPrinter.cpp | 8 +- lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp | 3 +- lib/Target/X86/X86AvoidTrailingCall.cpp | 108 + lib/Target/X86/X86CallFrameOptimization.cpp | 26 +- lib/Target/X86/X86CallLowering.cpp | 49 +- lib/Target/X86/X86CallLowering.h | 5 +- lib/Target/X86/X86CallingConv.td | 2 + lib/Target/X86/X86CmovConversion.cpp | 18 +- lib/Target/X86/X86CondBrFolding.cpp | 2 +- lib/Target/X86/X86DomainReassignment.cpp | 20 +- lib/Target/X86/X86EvexToVex.cpp | 2 +- lib/Target/X86/X86ExpandPseudo.cpp | 11 +- lib/Target/X86/X86FastISel.cpp | 15 +- lib/Target/X86/X86FixupBWInsts.cpp | 68 +- lib/Target/X86/X86FixupLEAs.cpp | 201 +- lib/Target/X86/X86FixupSetCC.cpp | 4 +- lib/Target/X86/X86FlagsCopyLowering.cpp | 13 +- lib/Target/X86/X86FloatingPoint.cpp | 6 +- lib/Target/X86/X86FrameLowering.cpp | 106 +- 
lib/Target/X86/X86FrameLowering.h | 4 +- lib/Target/X86/X86ISelDAGToDAG.cpp | 304 +- lib/Target/X86/X86ISelLowering.cpp | 5926 +++++++++++--------- lib/Target/X86/X86ISelLowering.h | 77 +- lib/Target/X86/X86IndirectBranchTracking.cpp | 2 +- lib/Target/X86/X86InsertPrefetch.cpp | 8 +- lib/Target/X86/X86InstrAVX512.td | 1457 ++--- lib/Target/X86/X86InstrArithmetic.td | 10 +- lib/Target/X86/X86InstrBuilder.h | 6 +- lib/Target/X86/X86InstrCMovSetCC.td | 33 +- lib/Target/X86/X86InstrCompiler.td | 139 +- lib/Target/X86/X86InstrControl.td | 85 +- lib/Target/X86/X86InstrExtension.td | 11 +- lib/Target/X86/X86InstrFoldTables.cpp | 287 + lib/Target/X86/X86InstrFoldTables.h | 39 +- lib/Target/X86/X86InstrFragmentsSIMD.td | 26 + lib/Target/X86/X86InstrInfo.cpp | 582 +- lib/Target/X86/X86InstrInfo.h | 28 +- lib/Target/X86/X86InstrInfo.td | 57 +- lib/Target/X86/X86InstrMMX.td | 33 +- lib/Target/X86/X86InstrMPX.td | 32 +- lib/Target/X86/X86InstrSSE.td | 551 +- lib/Target/X86/X86InstrSystem.td | 2 +- lib/Target/X86/X86InstrTSX.td | 2 +- lib/Target/X86/X86InstrXOP.td | 26 +- lib/Target/X86/X86InstructionSelector.cpp | 135 +- lib/Target/X86/X86IntrinsicsInfo.h | 6 +- lib/Target/X86/X86LegalizerInfo.cpp | 20 + lib/Target/X86/X86LegalizerInfo.h | 3 + lib/Target/X86/X86MCInstLower.cpp | 313 +- lib/Target/X86/X86MachineFunctionInfo.h | 8 + lib/Target/X86/X86OptimizeLEAs.cpp | 60 +- lib/Target/X86/X86RegisterBankInfo.cpp | 4 +- lib/Target/X86/X86RegisterInfo.cpp | 31 +- lib/Target/X86/X86RetpolineThunks.cpp | 8 +- lib/Target/X86/X86SchedBroadwell.td | 8 +- lib/Target/X86/X86SchedHaswell.td | 8 +- lib/Target/X86/X86SchedPredicates.td | 57 + lib/Target/X86/X86SchedSandyBridge.td | 8 +- lib/Target/X86/X86SchedSkylakeClient.td | 8 +- lib/Target/X86/X86SchedSkylakeServer.td | 8 +- lib/Target/X86/X86Schedule.td | 24 +- lib/Target/X86/X86ScheduleAtom.td | 6 +- lib/Target/X86/X86ScheduleBdVer2.td | 6 +- lib/Target/X86/X86ScheduleBtVer2.td | 257 +- lib/Target/X86/X86ScheduleSLM.td | 8 +- lib/Target/X86/X86ScheduleZnver1.td | 8 +- lib/Target/X86/X86SelectionDAGInfo.cpp | 2 +- lib/Target/X86/X86SpeculativeLoadHardening.cpp | 59 +- lib/Target/X86/X86Subtarget.cpp | 18 +- lib/Target/X86/X86Subtarget.h | 23 +- lib/Target/X86/X86TargetMachine.cpp | 49 +- lib/Target/X86/X86TargetMachine.h | 2 +- lib/Target/X86/X86TargetObjectFile.cpp | 4 +- lib/Target/X86/X86TargetObjectFile.h | 3 +- lib/Target/X86/X86TargetTransformInfo.cpp | 225 +- lib/Target/X86/X86TargetTransformInfo.h | 11 +- lib/Target/X86/X86VZeroUpper.cpp | 6 +- lib/Target/X86/X86WinAllocaExpander.cpp | 4 +- lib/Target/X86/X86WinEHState.cpp | 5 +- lib/Target/XCore/XCoreAsmPrinter.cpp | 4 +- lib/Target/XCore/XCoreFrameLowering.cpp | 6 +- lib/Target/XCore/XCoreFrameToArgsOffsetElim.cpp | 2 +- lib/Target/XCore/XCoreISelLowering.cpp | 21 +- lib/Target/XCore/XCoreRegisterInfo.cpp | 2 +- lib/Target/XCore/XCoreTargetMachine.cpp | 2 +- lib/Target/XCore/XCoreTargetTransformInfo.h | 3 +- lib/TextAPI/MachO/Architecture.cpp | 4 + lib/TextAPI/MachO/InterfaceFile.cpp | 80 +- lib/TextAPI/MachO/Platform.cpp | 91 + lib/TextAPI/MachO/Symbol.cpp | 9 + lib/TextAPI/MachO/Target.cpp | 75 + lib/TextAPI/MachO/TextStub.cpp | 606 +- lib/TextAPI/MachO/TextStubCommon.cpp | 93 +- lib/TextAPI/MachO/TextStubCommon.h | 8 +- lib/ToolDrivers/llvm-dlltool/DlltoolDriver.cpp | 26 +- lib/ToolDrivers/llvm-lib/LibDriver.cpp | 236 +- .../AggressiveInstCombine.cpp | 78 +- lib/Transforms/Coroutines/CoroCleanup.cpp | 7 +- lib/Transforms/Coroutines/CoroEarly.cpp | 26 +- lib/Transforms/Coroutines/CoroElide.cpp | 2 
+- lib/Transforms/Coroutines/CoroFrame.cpp | 652 ++- lib/Transforms/Coroutines/CoroInstr.h | 205 +- lib/Transforms/Coroutines/CoroInternal.h | 162 +- lib/Transforms/Coroutines/CoroSplit.cpp | 1166 +++- lib/Transforms/Coroutines/Coroutines.cpp | 342 +- lib/Transforms/IPO/ArgumentPromotion.cpp | 2 +- lib/Transforms/IPO/Attributor.cpp | 5055 ++++++++++++++--- lib/Transforms/IPO/BlockExtractor.cpp | 5 +- lib/Transforms/IPO/ConstantMerge.cpp | 4 +- lib/Transforms/IPO/CrossDSOCFI.cpp | 10 +- lib/Transforms/IPO/FunctionAttrs.cpp | 38 +- lib/Transforms/IPO/FunctionImport.cpp | 43 +- lib/Transforms/IPO/GlobalDCE.cpp | 156 +- lib/Transforms/IPO/GlobalOpt.cpp | 176 +- lib/Transforms/IPO/HotColdSplitting.cpp | 61 +- lib/Transforms/IPO/IPO.cpp | 13 + lib/Transforms/IPO/InferFunctionAttrs.cpp | 20 +- lib/Transforms/IPO/Inliner.cpp | 21 +- lib/Transforms/IPO/LoopExtractor.cpp | 6 +- lib/Transforms/IPO/LowerTypeTests.cpp | 305 +- lib/Transforms/IPO/MergeFunctions.cpp | 4 +- lib/Transforms/IPO/PartialInlining.cpp | 20 +- lib/Transforms/IPO/PassManagerBuilder.cpp | 1 + lib/Transforms/IPO/SCCP.cpp | 18 +- lib/Transforms/IPO/SampleProfile.cpp | 238 +- lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp | 21 +- lib/Transforms/IPO/WholeProgramDevirt.cpp | 389 +- lib/Transforms/InstCombine/InstCombineAddSub.cpp | 268 +- lib/Transforms/InstCombine/InstCombineAndOrXor.cpp | 278 +- .../InstCombine/InstCombineAtomicRMW.cpp | 4 +- lib/Transforms/InstCombine/InstCombineCalls.cpp | 121 +- lib/Transforms/InstCombine/InstCombineCasts.cpp | 102 +- lib/Transforms/InstCombine/InstCombineCompares.cpp | 870 ++- lib/Transforms/InstCombine/InstCombineInternal.h | 116 +- .../InstCombine/InstCombineLoadStoreAlloca.cpp | 93 +- .../InstCombine/InstCombineMulDivRem.cpp | 77 +- lib/Transforms/InstCombine/InstCombinePHI.cpp | 6 +- lib/Transforms/InstCombine/InstCombineSelect.cpp | 455 +- lib/Transforms/InstCombine/InstCombineShifts.cpp | 370 +- .../InstCombine/InstCombineSimplifyDemanded.cpp | 48 +- .../InstCombine/InstCombineVectorOps.cpp | 171 +- .../InstCombine/InstructionCombining.cpp | 67 +- .../Instrumentation/AddressSanitizer.cpp | 98 +- lib/Transforms/Instrumentation/BoundsChecking.cpp | 2 +- lib/Transforms/Instrumentation/CFGMST.h | 4 +- .../Instrumentation/ControlHeightReduction.cpp | 26 +- .../Instrumentation/DataFlowSanitizer.cpp | 2 +- lib/Transforms/Instrumentation/GCOVProfiling.cpp | 49 +- .../Instrumentation/HWAddressSanitizer.cpp | 376 +- .../Instrumentation/IndirectCallPromotion.cpp | 2 +- lib/Transforms/Instrumentation/InstrOrderFile.cpp | 3 +- lib/Transforms/Instrumentation/InstrProfiling.cpp | 65 +- lib/Transforms/Instrumentation/Instrumentation.cpp | 5 +- lib/Transforms/Instrumentation/MemorySanitizer.cpp | 89 +- .../Instrumentation/PGOInstrumentation.cpp | 220 +- lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp | 6 +- .../Instrumentation/SanitizerCoverage.cpp | 164 +- lib/Transforms/Instrumentation/ThreadSanitizer.cpp | 54 +- .../Instrumentation/ValueProfileCollector.cpp | 78 + .../Instrumentation/ValueProfileCollector.h | 79 + .../Instrumentation/ValueProfilePlugins.inc | 75 + lib/Transforms/ObjCARC/PtrState.cpp | 4 + lib/Transforms/Scalar/AlignmentFromAssumptions.cpp | 8 +- lib/Transforms/Scalar/CallSiteSplitting.cpp | 2 +- lib/Transforms/Scalar/ConstantHoisting.cpp | 24 +- lib/Transforms/Scalar/ConstantProp.cpp | 2 +- .../Scalar/CorrelatedValuePropagation.cpp | 180 +- lib/Transforms/Scalar/DCE.cpp | 31 +- lib/Transforms/Scalar/DeadStoreElimination.cpp | 7 +- lib/Transforms/Scalar/DivRemPairs.cpp | 219 +- 
lib/Transforms/Scalar/EarlyCSE.cpp | 22 +- lib/Transforms/Scalar/FlattenCFGPass.cpp | 24 +- lib/Transforms/Scalar/Float2Int.cpp | 47 +- lib/Transforms/Scalar/GVN.cpp | 201 +- lib/Transforms/Scalar/GVNHoist.cpp | 17 +- lib/Transforms/Scalar/GuardWidening.cpp | 2 +- lib/Transforms/Scalar/IndVarSimplify.cpp | 389 +- lib/Transforms/Scalar/InferAddressSpaces.cpp | 38 +- lib/Transforms/Scalar/InstSimplifyPass.cpp | 48 +- lib/Transforms/Scalar/JumpThreading.cpp | 18 +- lib/Transforms/Scalar/LICM.cpp | 55 +- lib/Transforms/Scalar/LoopDataPrefetch.cpp | 4 +- lib/Transforms/Scalar/LoopDeletion.cpp | 2 +- lib/Transforms/Scalar/LoopFuse.cpp | 640 ++- lib/Transforms/Scalar/LoopIdiomRecognize.cpp | 890 ++- lib/Transforms/Scalar/LoopInstSimplify.cpp | 5 +- lib/Transforms/Scalar/LoopInterchange.cpp | 62 +- lib/Transforms/Scalar/LoopLoadElimination.cpp | 3 +- lib/Transforms/Scalar/LoopPredication.cpp | 2 +- lib/Transforms/Scalar/LoopRerollPass.cpp | 3 +- lib/Transforms/Scalar/LoopRotation.cpp | 10 +- lib/Transforms/Scalar/LoopSimplifyCFG.cpp | 4 +- lib/Transforms/Scalar/LoopSink.cpp | 9 +- lib/Transforms/Scalar/LoopStrengthReduce.cpp | 20 +- lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp | 8 +- lib/Transforms/Scalar/LoopUnrollPass.cpp | 128 +- lib/Transforms/Scalar/LoopUnswitch.cpp | 87 +- lib/Transforms/Scalar/LoopVersioningLICM.cpp | 31 - lib/Transforms/Scalar/LowerConstantIntrinsics.cpp | 170 + lib/Transforms/Scalar/LowerExpectIntrinsic.cpp | 33 +- lib/Transforms/Scalar/MemCpyOptimizer.cpp | 110 +- lib/Transforms/Scalar/MergeICmps.cpp | 2 +- lib/Transforms/Scalar/MergedLoadStoreMotion.cpp | 167 +- lib/Transforms/Scalar/NaryReassociate.cpp | 2 +- lib/Transforms/Scalar/NewGVN.cpp | 25 +- lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp | 2 +- lib/Transforms/Scalar/PlaceSafepoints.cpp | 6 +- lib/Transforms/Scalar/Reassociate.cpp | 190 +- lib/Transforms/Scalar/RewriteStatepointsForGC.cpp | 6 +- lib/Transforms/Scalar/SCCP.cpp | 75 +- lib/Transforms/Scalar/SROA.cpp | 40 +- lib/Transforms/Scalar/Scalar.cpp | 9 + .../Scalar/SeparateConstOffsetFromGEP.cpp | 2 +- lib/Transforms/Scalar/SimpleLoopUnswitch.cpp | 25 +- lib/Transforms/Scalar/SpeculateAroundPHIs.cpp | 6 +- lib/Transforms/Scalar/StructurizeCFG.cpp | 2 +- lib/Transforms/Scalar/TailRecursionElimination.cpp | 2 +- lib/Transforms/Utils/BasicBlockUtils.cpp | 64 +- lib/Transforms/Utils/BuildLibCalls.cpp | 94 +- lib/Transforms/Utils/BypassSlowDivision.cpp | 8 +- lib/Transforms/Utils/CanonicalizeAliases.cpp | 1 + lib/Transforms/Utils/CloneFunction.cpp | 15 + lib/Transforms/Utils/CloneModule.cpp | 18 +- lib/Transforms/Utils/CodeExtractor.cpp | 309 +- lib/Transforms/Utils/EntryExitInstrumenter.cpp | 2 +- lib/Transforms/Utils/Evaluator.cpp | 2 +- lib/Transforms/Utils/FlattenCFG.cpp | 20 +- lib/Transforms/Utils/FunctionImportUtils.cpp | 2 +- .../Utils/ImportedFunctionsInliningStatistics.cpp | 6 +- lib/Transforms/Utils/LibCallsShrinkWrap.cpp | 2 +- lib/Transforms/Utils/Local.cpp | 209 +- lib/Transforms/Utils/LoopRotationUtils.cpp | 27 +- lib/Transforms/Utils/LoopSimplify.cpp | 15 +- lib/Transforms/Utils/LoopUnroll.cpp | 12 +- lib/Transforms/Utils/LoopUnrollAndJam.cpp | 6 +- lib/Transforms/Utils/LoopUnrollPeel.cpp | 161 +- lib/Transforms/Utils/LoopUtils.cpp | 56 + lib/Transforms/Utils/LoopVersioning.cpp | 4 +- lib/Transforms/Utils/MetaRenamer.cpp | 5 +- lib/Transforms/Utils/MisExpect.cpp | 177 + lib/Transforms/Utils/ModuleUtils.cpp | 2 +- lib/Transforms/Utils/PredicateInfo.cpp | 80 +- lib/Transforms/Utils/SimplifyCFG.cpp | 250 +- 
lib/Transforms/Utils/SimplifyLibCalls.cpp | 688 ++- lib/Transforms/Utils/SymbolRewriter.cpp | 12 +- lib/Transforms/Utils/VNCoercion.cpp | 2 +- lib/Transforms/Utils/ValueMapper.cpp | 60 +- lib/Transforms/Vectorize/LoadStoreVectorizer.cpp | 26 +- .../Vectorize/LoopVectorizationLegality.cpp | 186 +- .../Vectorize/LoopVectorizationPlanner.h | 4 +- lib/Transforms/Vectorize/LoopVectorize.cpp | 738 ++- lib/Transforms/Vectorize/SLPVectorizer.cpp | 820 ++- lib/Transforms/Vectorize/VPlan.cpp | 19 +- lib/Transforms/Vectorize/VPlan.h | 4 + lib/Transforms/Vectorize/VPlanHCFGTransforms.cpp | 2 +- lib/Transforms/Vectorize/VPlanSLP.cpp | 13 +- lib/WindowsManifest/WindowsManifestMerger.cpp | 4 +- lib/XRay/FDRRecordProducer.cpp | 37 +- lib/XRay/FileHeaderReader.cpp | 14 +- lib/XRay/InstrumentationMap.cpp | 17 +- lib/XRay/Profile.cpp | 10 +- lib/XRay/RecordInitializer.cpp | 202 +- lib/XRay/Trace.cpp | 43 +- tools/bugpoint/BugDriver.h | 7 +- tools/bugpoint/ExtractFunction.cpp | 3 +- tools/bugpoint/OptimizerDriver.cpp | 12 +- tools/bugpoint/ToolRunner.cpp | 16 +- tools/bugpoint/bugpoint.cpp | 46 +- tools/llc/llc.cpp | 27 +- tools/lli/lli.cpp | 71 +- tools/llvm-ar/llvm-ar.cpp | 192 +- tools/llvm-as/llvm-as.cpp | 2 +- tools/llvm-cov/CodeCoverage.cpp | 24 +- tools/llvm-cov/SourceCoverageView.cpp | 8 +- tools/llvm-cov/TestingSupport.cpp | 10 +- tools/llvm-cxxdump/llvm-cxxdump.cpp | 6 +- tools/llvm-cxxmap/llvm-cxxmap.cpp | 2 +- tools/llvm-dis/llvm-dis.cpp | 4 +- tools/llvm-dwarfdump/Statistics.cpp | 263 +- tools/llvm-dwarfdump/llvm-dwarfdump.cpp | 2 +- tools/llvm-extract/llvm-extract.cpp | 16 +- tools/llvm-ifs/CMakeLists.txt | 10 + tools/llvm-ifs/LLVMBuild.txt | 21 + tools/llvm-ifs/llvm-ifs.cpp | 532 ++ tools/llvm-link/llvm-link.cpp | 6 +- tools/llvm-lto/llvm-lto.cpp | 20 +- tools/llvm-lto2/llvm-lto2.cpp | 12 +- tools/llvm-mc/Disassembler.cpp | 15 +- tools/llvm-mc/Disassembler.h | 10 +- tools/llvm-mc/llvm-mc.cpp | 27 +- tools/llvm-mca/CodeRegion.cpp | 6 +- tools/llvm-mca/CodeRegionGenerator.cpp | 2 + tools/llvm-mca/Views/BottleneckAnalysis.cpp | 40 +- tools/llvm-mca/Views/BottleneckAnalysis.h | 8 +- tools/llvm-mca/Views/InstructionInfoView.cpp | 31 +- tools/llvm-mca/Views/InstructionInfoView.h | 13 +- tools/llvm-mca/Views/TimelineView.cpp | 50 +- tools/llvm-mca/Views/TimelineView.h | 1 + tools/llvm-mca/llvm-mca.cpp | 113 +- tools/llvm-modextract/llvm-modextract.cpp | 2 +- tools/llvm-nm/llvm-nm.cpp | 51 +- tools/llvm-objcopy/COFF/COFFObjcopy.cpp | 88 +- tools/llvm-objcopy/COFF/Reader.cpp | 18 +- tools/llvm-objcopy/COFF/Writer.cpp | 4 +- tools/llvm-objcopy/CommonOpts.td | 123 + tools/llvm-objcopy/CopyConfig.cpp | 370 +- tools/llvm-objcopy/CopyConfig.h | 110 +- tools/llvm-objcopy/ELF/ELFConfig.cpp | 133 + tools/llvm-objcopy/ELF/ELFConfig.h | 44 + tools/llvm-objcopy/ELF/ELFObjcopy.cpp | 169 +- tools/llvm-objcopy/ELF/Object.cpp | 252 +- tools/llvm-objcopy/ELF/Object.h | 56 +- tools/llvm-objcopy/MachO/MachOLayoutBuilder.cpp | 350 ++ tools/llvm-objcopy/MachO/MachOLayoutBuilder.h | 50 + tools/llvm-objcopy/MachO/MachOObjcopy.cpp | 30 +- tools/llvm-objcopy/MachO/MachOReader.cpp | 45 +- tools/llvm-objcopy/MachO/MachOReader.h | 3 + tools/llvm-objcopy/MachO/MachOWriter.cpp | 305 +- tools/llvm-objcopy/MachO/MachOWriter.h | 19 +- tools/llvm-objcopy/MachO/Object.h | 27 + tools/llvm-objcopy/ObjcopyOpts.td | 141 +- tools/llvm-objcopy/StripOpts.td | 103 +- tools/llvm-objcopy/llvm-objcopy.cpp | 53 +- tools/llvm-objdump/COFFDump.cpp | 77 +- tools/llvm-objdump/ELFDump.cpp | 2 +- tools/llvm-objdump/MachODump.cpp | 375 +- 
tools/llvm-objdump/llvm-objdump.cpp | 543 +- tools/llvm-objdump/llvm-objdump.h | 36 +- tools/llvm-pdbutil/BytesOutputStyle.cpp | 2 +- tools/llvm-pdbutil/DumpOutputStyle.cpp | 9 +- tools/llvm-pdbutil/ExplainOutputStyle.cpp | 2 +- tools/llvm-pdbutil/InputFile.cpp | 17 +- tools/llvm-pdbutil/MinimalSymbolDumper.cpp | 5 +- tools/llvm-pdbutil/PrettyTypeDumper.cpp | 4 +- tools/llvm-pdbutil/llvm-pdbutil.cpp | 10 +- tools/llvm-profdata/llvm-profdata.cpp | 287 +- tools/llvm-readobj/ARMEHABIPrinter.h | 19 +- tools/llvm-readobj/ARMWinEHPrinter.cpp | 9 +- tools/llvm-readobj/COFFDumper.cpp | 362 +- tools/llvm-readobj/DwarfCFIEHPrinter.h | 54 +- tools/llvm-readobj/ELFDumper.cpp | 1648 ++++-- tools/llvm-readobj/MachODumper.cpp | 61 +- tools/llvm-readobj/ObjDumper.cpp | 32 +- tools/llvm-readobj/ObjDumper.h | 11 +- tools/llvm-readobj/WasmDumper.cpp | 7 +- tools/llvm-readobj/Win64EHDumper.cpp | 13 +- tools/llvm-readobj/WindowsResourceDumper.cpp | 8 +- tools/llvm-readobj/XCOFFDumper.cpp | 402 +- tools/llvm-readobj/llvm-readobj.cpp | 148 +- tools/llvm-readobj/llvm-readobj.h | 25 +- tools/llvm-reduce/CMakeLists.txt | 26 + tools/llvm-reduce/DeltaManager.h | 36 + tools/llvm-reduce/LLVMBuild.txt | 24 + tools/llvm-reduce/TestRunner.cpp | 42 + tools/llvm-reduce/TestRunner.h | 46 + tools/llvm-reduce/deltas/Delta.cpp | 162 + tools/llvm-reduce/deltas/Delta.h | 76 + tools/llvm-reduce/deltas/ReduceArguments.cpp | 125 + tools/llvm-reduce/deltas/ReduceArguments.h | 21 + tools/llvm-reduce/deltas/ReduceBasicBlocks.cpp | 146 + tools/llvm-reduce/deltas/ReduceBasicBlocks.h | 20 + tools/llvm-reduce/deltas/ReduceFunctions.cpp | 77 + tools/llvm-reduce/deltas/ReduceFunctions.h | 20 + tools/llvm-reduce/deltas/ReduceGlobalVars.cpp | 74 + tools/llvm-reduce/deltas/ReduceGlobalVars.h | 20 + tools/llvm-reduce/deltas/ReduceInstructions.cpp | 65 + tools/llvm-reduce/deltas/ReduceInstructions.h | 20 + tools/llvm-reduce/deltas/ReduceMetadata.cpp | 138 + tools/llvm-reduce/deltas/ReduceMetadata.h | 18 + tools/llvm-reduce/llvm-reduce.cpp | 114 + tools/llvm-rtdyld/llvm-rtdyld.cpp | 104 +- tools/llvm-stress/llvm-stress.cpp | 4 +- tools/llvm-symbolizer/llvm-symbolizer.cpp | 6 + tools/llvm-xray/func-id-helper.cpp | 2 +- tools/llvm-xray/xray-account.cpp | 2 +- tools/llvm-xray/xray-converter.cpp | 4 +- tools/llvm-xray/xray-extract.cpp | 2 +- tools/llvm-xray/xray-fdr-dump.cpp | 2 +- tools/llvm-xray/xray-graph-diff.cpp | 2 +- tools/llvm-xray/xray-graph.cpp | 2 +- tools/opt/opt.cpp | 13 +- tools/vfabi-demangle-fuzzer/CMakeLists.txt | 7 + .../vfabi-demangler-fuzzer.cpp | 26 + utils/TableGen/AsmMatcherEmitter.cpp | 19 +- utils/TableGen/AsmWriterEmitter.cpp | 3 +- utils/TableGen/CallingConvEmitter.cpp | 4 + utils/TableGen/CodeEmitterGen.cpp | 305 +- utils/TableGen/CodeGenDAGPatterns.cpp | 42 +- utils/TableGen/CodeGenDAGPatterns.h | 4 + utils/TableGen/CodeGenInstruction.cpp | 1 + utils/TableGen/CodeGenInstruction.h | 1 + utils/TableGen/CodeGenIntrinsics.h | 8 + utils/TableGen/CodeGenMapTable.cpp | 12 +- utils/TableGen/CodeGenRegisters.cpp | 52 +- utils/TableGen/CodeGenRegisters.h | 26 +- utils/TableGen/CodeGenSchedule.cpp | 24 +- utils/TableGen/CodeGenTarget.cpp | 70 +- utils/TableGen/CodeGenTarget.h | 6 + utils/TableGen/DAGISelEmitter.cpp | 2 +- utils/TableGen/DAGISelMatcher.h | 8 +- utils/TableGen/DAGISelMatcherEmitter.cpp | 22 +- utils/TableGen/DAGISelMatcherGen.cpp | 10 +- utils/TableGen/DAGISelMatcherOpt.cpp | 9 +- utils/TableGen/DFAEmitter.cpp | 394 ++ utils/TableGen/DFAEmitter.h | 107 + utils/TableGen/DFAPacketizerEmitter.cpp | 657 +-- 
utils/TableGen/DisassemblerEmitter.cpp | 2 +- utils/TableGen/FixedLenDecoderEmitter.cpp | 93 +- utils/TableGen/GICombinerEmitter.cpp | 452 ++ utils/TableGen/GlobalISel/CMakeLists.txt | 7 + utils/TableGen/GlobalISel/CodeExpander.cpp | 93 + utils/TableGen/GlobalISel/CodeExpander.h | 55 + utils/TableGen/GlobalISel/CodeExpansions.h | 43 + utils/TableGen/GlobalISelEmitter.cpp | 775 ++- utils/TableGen/InfoByHwMode.cpp | 11 + utils/TableGen/InfoByHwMode.h | 5 + utils/TableGen/InstrDocsEmitter.cpp | 2 +- utils/TableGen/InstrInfoEmitter.cpp | 52 +- utils/TableGen/IntrinsicEmitter.cpp | 20 +- utils/TableGen/RISCVCompressInstEmitter.cpp | 13 +- utils/TableGen/RegisterInfoEmitter.cpp | 4 +- utils/TableGen/SearchableTableEmitter.cpp | 16 +- utils/TableGen/SubtargetEmitter.cpp | 8 +- utils/TableGen/SubtargetFeatureInfo.cpp | 12 +- utils/TableGen/TableGen.cpp | 157 +- utils/TableGen/TableGenBackends.h | 2 + utils/TableGen/WebAssemblyDisassemblerEmitter.cpp | 2 +- utils/TableGen/X86DisassemblerTables.cpp | 2 +- utils/TableGen/X86EVEX2VEXTablesEmitter.cpp | 1 + utils/TableGen/X86RecognizableInstr.cpp | 14 +- utils/add_argument_names.py | 82 + utils/llvm-locstats/CMakeLists.txt | 12 + utils/llvm-locstats/llvm-locstats.py | 209 + 2120 files changed, 122706 insertions(+), 50754 deletions(-) create mode 100644 include/llvm/ADT/DirectedGraph.h delete mode 100644 include/llvm/ADT/VariadicFunction.h create mode 100644 include/llvm/Analysis/DDG.h create mode 100644 include/llvm/Analysis/DependenceGraphBuilder.h create mode 100644 include/llvm/Analysis/LoopCacheAnalysis.h create mode 100644 include/llvm/CodeGen/GlobalISel/GISelKnownBits.h create mode 100644 include/llvm/CodeGen/LiveRangeCalc.h create mode 100644 include/llvm/CodeGen/MachineLoopUtils.h create mode 100644 include/llvm/CodeGen/ModuloSchedule.h create mode 100644 include/llvm/DebugInfo/GSYM/FileWriter.h create mode 100644 include/llvm/DebugInfo/GSYM/GsymCreator.h create mode 100644 include/llvm/DebugInfo/GSYM/GsymReader.h create mode 100644 include/llvm/DebugInfo/GSYM/Header.h create mode 100644 include/llvm/DebugInfo/GSYM/LineTable.h create mode 100644 include/llvm/ExecutionEngine/JITLink/MachO_arm64.h create mode 100644 include/llvm/ExecutionEngine/Orc/SpeculateAnalyses.h create mode 100644 include/llvm/ExecutionEngine/Orc/Speculation.h create mode 100644 include/llvm/IR/FixedMetadataKinds.def create mode 100644 include/llvm/MC/MCRegister.h create mode 100644 include/llvm/MCA/CodeEmitter.h create mode 100644 include/llvm/Object/TapiFile.h create mode 100644 include/llvm/Object/TapiUniversal.h create mode 100644 include/llvm/ObjectYAML/yaml2obj.h create mode 100644 include/llvm/Remarks/BitstreamRemarkContainer.h create mode 100644 include/llvm/Remarks/BitstreamRemarkParser.h create mode 100644 include/llvm/Remarks/BitstreamRemarkSerializer.h create mode 100644 include/llvm/Remarks/YAMLRemarkSerializer.h create mode 100644 include/llvm/Support/Alignment.h create mode 100644 include/llvm/Support/Automaton.h create mode 100644 include/llvm/Support/FileCollector.h delete mode 100644 include/llvm/Support/JamCRC.h delete mode 100644 include/llvm/Support/MutexGuard.h delete mode 100644 include/llvm/Support/ScalableSize.h create mode 100644 include/llvm/Support/TypeSize.h delete mode 100644 include/llvm/Support/UniqueLock.h create mode 100644 include/llvm/TableGen/Automaton.td create mode 100644 include/llvm/Target/GlobalISel/Combine.td create mode 100644 include/llvm/TextAPI/MachO/Platform.h create mode 100644 include/llvm/TextAPI/MachO/Target.h create mode 
100644 include/llvm/Transforms/Instrumentation/SanitizerCoverage.h create mode 100644 include/llvm/Transforms/Scalar/LowerConstantIntrinsics.h create mode 100644 include/llvm/Transforms/Utils/MisExpect.h create mode 100644 lib/Analysis/DDG.cpp create mode 100644 lib/Analysis/DependenceGraphBuilder.cpp create mode 100644 lib/Analysis/LoopCacheAnalysis.cpp create mode 100644 lib/Analysis/VFABIDemangling.cpp create mode 100644 lib/CodeGen/GlobalISel/GISelKnownBits.cpp delete mode 100644 lib/CodeGen/LiveRangeCalc.h create mode 100644 lib/CodeGen/MIRNamerPass.cpp create mode 100644 lib/CodeGen/MIRVRegNamerUtils.cpp create mode 100644 lib/CodeGen/MIRVRegNamerUtils.h create mode 100644 lib/CodeGen/MachineLoopUtils.cpp create mode 100644 lib/CodeGen/ModuloSchedule.cpp create mode 100644 lib/DebugInfo/GSYM/FileWriter.cpp create mode 100644 lib/DebugInfo/GSYM/GsymCreator.cpp create mode 100644 lib/DebugInfo/GSYM/GsymReader.cpp create mode 100644 lib/DebugInfo/GSYM/Header.cpp create mode 100644 lib/DebugInfo/GSYM/LineTable.cpp delete mode 100644 lib/ExecutionEngine/JITLink/MachOAtomGraphBuilder.cpp delete mode 100644 lib/ExecutionEngine/JITLink/MachOAtomGraphBuilder.h create mode 100644 lib/ExecutionEngine/JITLink/MachOLinkGraphBuilder.cpp create mode 100644 lib/ExecutionEngine/JITLink/MachOLinkGraphBuilder.h create mode 100644 lib/ExecutionEngine/JITLink/MachO_arm64.cpp create mode 100644 lib/ExecutionEngine/Orc/SpeculateAnalyses.cpp create mode 100644 lib/ExecutionEngine/Orc/Speculation.cpp create mode 100644 lib/MCA/CodeEmitter.cpp create mode 100644 lib/Object/TapiFile.cpp create mode 100644 lib/Object/TapiUniversal.cpp create mode 100644 lib/ObjectYAML/COFFEmitter.cpp create mode 100644 lib/ObjectYAML/ELFEmitter.cpp create mode 100644 lib/ObjectYAML/MachOEmitter.cpp create mode 100644 lib/ObjectYAML/MinidumpEmitter.cpp create mode 100644 lib/ObjectYAML/WasmEmitter.cpp create mode 100644 lib/ObjectYAML/yaml2obj.cpp create mode 100644 lib/Remarks/BitstreamRemarkParser.cpp create mode 100644 lib/Remarks/BitstreamRemarkParser.h create mode 100644 lib/Remarks/BitstreamRemarkSerializer.cpp create mode 100644 lib/Remarks/RemarkSerializer.cpp create mode 100644 lib/Support/ABIBreak.cpp create mode 100644 lib/Support/FileCheckImpl.h create mode 100644 lib/Support/FileCollector.cpp delete mode 100644 lib/Support/JamCRC.cpp delete mode 100644 lib/Support/Mutex.cpp delete mode 100644 lib/Support/Unix/Mutex.inc delete mode 100644 lib/Support/Unix/RWMutex.inc delete mode 100644 lib/Support/Windows/Mutex.inc delete mode 100644 lib/Support/Windows/RWMutex.inc create mode 100644 lib/Target/AArch64/AArch64Combine.td create mode 100644 lib/Target/AArch64/AArch64StackOffset.h create mode 100644 lib/Target/AArch64/AArch64StackTaggingPreRA.cpp create mode 100644 lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp create mode 100644 lib/Target/ARM/MVETailPredication.cpp create mode 100644 lib/Target/ARM/MVEVPTBlockPass.cpp create mode 100644 lib/Target/Mips/MipsPfmCounters.td create mode 100644 lib/Target/RISCV/RISCVCallLowering.cpp create mode 100644 lib/Target/RISCV/RISCVCallLowering.h create mode 100644 lib/Target/RISCV/RISCVInstructionSelector.cpp create mode 100644 lib/Target/RISCV/RISCVLegalizerInfo.cpp create mode 100644 lib/Target/RISCV/RISCVLegalizerInfo.h create mode 100644 lib/Target/RISCV/RISCVRegisterBankInfo.cpp create mode 100644 lib/Target/RISCV/RISCVRegisterBankInfo.h create mode 100644 lib/Target/RISCV/RISCVRegisterBanks.td delete mode 100644 lib/Target/SystemZ/SystemZExpandPseudo.cpp delete mode 100644 
lib/Target/SystemZ/SystemZScheduleArch13.td create mode 100644 lib/Target/SystemZ/SystemZScheduleZ15.td create mode 100644 lib/Target/X86/X86AvoidTrailingCall.cpp create mode 100644 lib/TextAPI/MachO/Platform.cpp create mode 100644 lib/TextAPI/MachO/Target.cpp create mode 100644 lib/Transforms/Instrumentation/ValueProfileCollector.cpp create mode 100644 lib/Transforms/Instrumentation/ValueProfileCollector.h create mode 100644 lib/Transforms/Instrumentation/ValueProfilePlugins.inc create mode 100644 lib/Transforms/Scalar/LowerConstantIntrinsics.cpp create mode 100644 lib/Transforms/Utils/MisExpect.cpp create mode 100644 tools/llvm-ifs/CMakeLists.txt create mode 100644 tools/llvm-ifs/LLVMBuild.txt create mode 100644 tools/llvm-ifs/llvm-ifs.cpp create mode 100644 tools/llvm-objcopy/CommonOpts.td create mode 100644 tools/llvm-objcopy/ELF/ELFConfig.cpp create mode 100644 tools/llvm-objcopy/ELF/ELFConfig.h create mode 100644 tools/llvm-objcopy/MachO/MachOLayoutBuilder.cpp create mode 100644 tools/llvm-objcopy/MachO/MachOLayoutBuilder.h create mode 100644 tools/llvm-reduce/CMakeLists.txt create mode 100644 tools/llvm-reduce/DeltaManager.h create mode 100644 tools/llvm-reduce/LLVMBuild.txt create mode 100644 tools/llvm-reduce/TestRunner.cpp create mode 100644 tools/llvm-reduce/TestRunner.h create mode 100644 tools/llvm-reduce/deltas/Delta.cpp create mode 100644 tools/llvm-reduce/deltas/Delta.h create mode 100644 tools/llvm-reduce/deltas/ReduceArguments.cpp create mode 100644 tools/llvm-reduce/deltas/ReduceArguments.h create mode 100644 tools/llvm-reduce/deltas/ReduceBasicBlocks.cpp create mode 100644 tools/llvm-reduce/deltas/ReduceBasicBlocks.h create mode 100644 tools/llvm-reduce/deltas/ReduceFunctions.cpp create mode 100644 tools/llvm-reduce/deltas/ReduceFunctions.h create mode 100644 tools/llvm-reduce/deltas/ReduceGlobalVars.cpp create mode 100644 tools/llvm-reduce/deltas/ReduceGlobalVars.h create mode 100644 tools/llvm-reduce/deltas/ReduceInstructions.cpp create mode 100644 tools/llvm-reduce/deltas/ReduceInstructions.h create mode 100644 tools/llvm-reduce/deltas/ReduceMetadata.cpp create mode 100644 tools/llvm-reduce/deltas/ReduceMetadata.h create mode 100644 tools/llvm-reduce/llvm-reduce.cpp create mode 100644 tools/vfabi-demangle-fuzzer/CMakeLists.txt create mode 100644 tools/vfabi-demangle-fuzzer/vfabi-demangler-fuzzer.cpp create mode 100644 utils/TableGen/DFAEmitter.cpp create mode 100644 utils/TableGen/DFAEmitter.h create mode 100644 utils/TableGen/GICombinerEmitter.cpp create mode 100644 utils/TableGen/GlobalISel/CMakeLists.txt create mode 100644 utils/TableGen/GlobalISel/CodeExpander.cpp create mode 100644 utils/TableGen/GlobalISel/CodeExpander.h create mode 100644 utils/TableGen/GlobalISel/CodeExpansions.h create mode 100755 utils/add_argument_names.py create mode 100644 utils/llvm-locstats/CMakeLists.txt create mode 100755 utils/llvm-locstats/llvm-locstats.py diff --git a/include/llvm-c/Core.h b/include/llvm-c/Core.h index cac2f297056d..b84970956666 100644 --- a/include/llvm-c/Core.h +++ b/include/llvm-c/Core.h @@ -370,9 +370,13 @@ typedef enum { LLVMAtomicRMWBinOpUMax, /**< Sets the value if it's greater than the original using an unsigned comparison and return the old one */ - LLVMAtomicRMWBinOpUMin /**< Sets the value if it's greater than the - original using an unsigned comparison and return - the old one */ + LLVMAtomicRMWBinOpUMin, /**< Sets the value if it's greater than the + original using an unsigned comparison and return + the old one */ + LLVMAtomicRMWBinOpFAdd, /**< Add a 
floating point value and return the + old one */ + LLVMAtomicRMWBinOpFSub /**< Subtract a floating point value and return the + old one */ } LLVMAtomicRMWBinOp; typedef enum { @@ -1539,6 +1543,7 @@ LLVMTypeRef LLVMX86MMXType(void); macro(GlobalVariable) \ macro(UndefValue) \ macro(Instruction) \ + macro(UnaryOperator) \ macro(BinaryOperator) \ macro(CallInst) \ macro(IntrinsicInst) \ @@ -1571,6 +1576,8 @@ LLVMTypeRef LLVMX86MMXType(void); macro(ResumeInst) \ macro(CleanupReturnInst) \ macro(CatchReturnInst) \ + macro(CatchSwitchInst) \ + macro(CallBrInst) \ macro(FuncletPadInst) \ macro(CatchPadInst) \ macro(CleanupPadInst) \ @@ -1592,7 +1599,10 @@ LLVMTypeRef LLVMX86MMXType(void); macro(ZExtInst) \ macro(ExtractValueInst) \ macro(LoadInst) \ - macro(VAArgInst) + macro(VAArgInst) \ + macro(AtomicCmpXchgInst) \ + macro(AtomicRMWInst) \ + macro(FenceInst) /** * @defgroup LLVMCCoreValueGeneral General APIs @@ -3807,8 +3817,12 @@ LLVMValueRef LLVMBuildGlobalStringPtr(LLVMBuilderRef B, const char *Str, const char *Name); LLVMBool LLVMGetVolatile(LLVMValueRef MemoryAccessInst); void LLVMSetVolatile(LLVMValueRef MemoryAccessInst, LLVMBool IsVolatile); +LLVMBool LLVMGetWeak(LLVMValueRef CmpXchgInst); +void LLVMSetWeak(LLVMValueRef CmpXchgInst, LLVMBool IsWeak); LLVMAtomicOrdering LLVMGetOrdering(LLVMValueRef MemoryAccessInst); void LLVMSetOrdering(LLVMValueRef MemoryAccessInst, LLVMAtomicOrdering Ordering); +LLVMAtomicRMWBinOp LLVMGetAtomicRMWBinOp(LLVMValueRef AtomicRMWInst); +void LLVMSetAtomicRMWBinOp(LLVMValueRef AtomicRMWInst, LLVMAtomicRMWBinOp BinOp); /* Casts */ LLVMValueRef LLVMBuildTrunc(LLVMBuilderRef, LLVMValueRef Val, diff --git a/include/llvm-c/DebugInfo.h b/include/llvm-c/DebugInfo.h index 33c8110a863c..41e9f96bbb92 100644 --- a/include/llvm-c/DebugInfo.h +++ b/include/llvm-c/DebugInfo.h @@ -32,7 +32,7 @@ typedef enum { LLVMDIFlagPublic = 3, LLVMDIFlagFwdDecl = 1 << 2, LLVMDIFlagAppleBlock = 1 << 3, - LLVMDIFlagBlockByrefStruct = 1 << 4, + LLVMDIFlagReservedBit4 = 1 << 4, LLVMDIFlagVirtual = 1 << 5, LLVMDIFlagArtificial = 1 << 6, LLVMDIFlagExplicit = 1 << 7, @@ -169,6 +169,19 @@ typedef unsigned LLVMMetadataKind; */ typedef unsigned LLVMDWARFTypeEncoding; +/** + * Describes the kind of macro declaration used for LLVMDIBuilderCreateMacro. + * @see llvm::dwarf::MacinfoRecordType + * @note Values are from DW_MACINFO_* constants in the DWARF specification. + */ +typedef enum { + LLVMDWARFMacinfoRecordTypeDefine = 0x01, + LLVMDWARFMacinfoRecordTypeMacro = 0x02, + LLVMDWARFMacinfoRecordTypeStartFile = 0x03, + LLVMDWARFMacinfoRecordTypeEndFile = 0x04, + LLVMDWARFMacinfoRecordTypeVendorExt = 0xff +} LLVMDWARFMacinfoRecordType; + /** * The current debug metadata version number. */ @@ -521,6 +534,38 @@ LLVMDIBuilderCreateSubroutineType(LLVMDIBuilderRef Builder, unsigned NumParameterTypes, LLVMDIFlags Flags); +/** + * Create debugging information entry for a macro. + * @param Builder The DIBuilder. + * @param ParentMacroFile Macro parent (could be NULL). + * @param Line Source line number where the macro is defined. + * @param RecordType DW_MACINFO_define or DW_MACINFO_undef. + * @param Name Macro name. + * @param NameLen Macro name length. + * @param Value Macro value. + * @param ValueLen Macro value length. 
+ */ +LLVMMetadataRef LLVMDIBuilderCreateMacro(LLVMDIBuilderRef Builder, + LLVMMetadataRef ParentMacroFile, + unsigned Line, + LLVMDWARFMacinfoRecordType RecordType, + const char *Name, size_t NameLen, + const char *Value, size_t ValueLen); + +/** + * Create debugging information temporary entry for a macro file. + * List of macro node direct children will be calculated by DIBuilder, + * using the \p ParentMacroFile relationship. + * @param Builder The DIBuilder. + * @param ParentMacroFile Macro parent (could be NULL). + * @param Line Source line number where the macro file is included. + * @param File File descriptor containing the name of the macro file. + */ +LLVMMetadataRef +LLVMDIBuilderCreateTempMacroFile(LLVMDIBuilderRef Builder, + LLVMMetadataRef ParentMacroFile, unsigned Line, + LLVMMetadataRef File); + /** * Create debugging information entry for an enumerator. * @param Builder The DIBuilder. diff --git a/include/llvm-c/Remarks.h b/include/llvm-c/Remarks.h index 88eb5120c57c..5444aebddd60 100644 --- a/include/llvm-c/Remarks.h +++ b/include/llvm-c/Remarks.h @@ -30,7 +30,8 @@ extern "C" { * @{ */ -#define REMARKS_API_VERSION 0 +// 0 -> 1: Bitstream remarks support. +#define REMARKS_API_VERSION 1 /** * The type of the emitted remark. @@ -240,6 +241,20 @@ typedef struct LLVMRemarkOpaqueParser *LLVMRemarkParserRef; extern LLVMRemarkParserRef LLVMRemarkParserCreateYAML(const void *Buf, uint64_t Size); +/** + * Creates a remark parser that can be used to parse the buffer located in \p + * Buf of size \p Size bytes. + * + * \p Buf cannot be `NULL`. + * + * This function should be paired with LLVMRemarkParserDispose() to avoid + * leaking resources. + * + * \since REMARKS_API_VERSION=1 + */ +extern LLVMRemarkParserRef LLVMRemarkParserCreateBitstream(const void *Buf, + uint64_t Size); + /** * Returns the next remark in the file. * diff --git a/include/llvm-c/Transforms/IPO.h b/include/llvm-c/Transforms/IPO.h index 7a82ed464141..51d007581283 100644 --- a/include/llvm-c/Transforms/IPO.h +++ b/include/llvm-c/Transforms/IPO.h @@ -34,6 +34,9 @@ void LLVMAddArgumentPromotionPass(LLVMPassManagerRef PM); /** See llvm::createConstantMergePass function. */ void LLVMAddConstantMergePass(LLVMPassManagerRef PM); +/** See llvm::createMergeFunctionsPass function. */ +void LLVMAddMergeFunctionsPass(LLVMPassManagerRef PM); + /** See llvm::createCalledValuePropagationPass function. */ void LLVMAddCalledValuePropagationPass(LLVMPassManagerRef PM); @@ -67,6 +70,21 @@ void LLVMAddIPSCCPPass(LLVMPassManagerRef PM); /** See llvm::createInternalizePass function. */ void LLVMAddInternalizePass(LLVMPassManagerRef, unsigned AllButMain); +/** + * Create and add the internalize pass to the given pass manager with the + * provided preservation callback. + * + * The context parameter is forwarded to the callback on each invocation. + * As such, it is the responsibility of the caller to extend its lifetime + * until execution of this pass has finished. + * + * @see llvm::createInternalizePass function. + */ +void LLVMAddInternalizePassWithMustPreservePredicate( + LLVMPassManagerRef PM, + void *Context, + LLVMBool (*MustPreserve)(LLVMValueRef, void *)); + /** See llvm::createStripDeadPrototypesPass function. 
*/ void LLVMAddStripDeadPrototypesPass(LLVMPassManagerRef PM); diff --git a/include/llvm-c/Transforms/Scalar.h b/include/llvm-c/Transforms/Scalar.h index 031cf98b2df2..6f3a3d8b3750 100644 --- a/include/llvm-c/Transforms/Scalar.h +++ b/include/llvm-c/Transforms/Scalar.h @@ -35,6 +35,9 @@ extern "C" { /** See llvm::createAggressiveDCEPass function. */ void LLVMAddAggressiveDCEPass(LLVMPassManagerRef PM); +/** See llvm::createDeadCodeEliminationPass function. */ +void LLVMAddDCEPass(LLVMPassManagerRef PM); + /** See llvm::createBitTrackingDCEPass function. */ void LLVMAddBitTrackingDCEPass(LLVMPassManagerRef PM); @@ -144,6 +147,9 @@ void LLVMAddEarlyCSEMemSSAPass(LLVMPassManagerRef PM); /** See llvm::createLowerExpectIntrinsicPass function */ void LLVMAddLowerExpectIntrinsicPass(LLVMPassManagerRef PM); +/** See llvm::createLowerConstantIntrinsicsPass function */ +void LLVMAddLowerConstantIntrinsicsPass(LLVMPassManagerRef PM); + /** See llvm::createTypeBasedAliasAnalysisPass function */ void LLVMAddTypeBasedAliasAnalysisPass(LLVMPassManagerRef PM); diff --git a/include/llvm-c/lto.h b/include/llvm-c/lto.h index 2467722b1954..41e6067cf44f 100644 --- a/include/llvm-c/lto.h +++ b/include/llvm-c/lto.h @@ -44,7 +44,7 @@ typedef bool lto_bool_t; * @{ */ -#define LTO_API_VERSION 24 +#define LTO_API_VERSION 25 /** * \since prior to LTO_API_VERSION=3 @@ -550,6 +550,56 @@ extern void lto_codegen_set_should_embed_uselists(lto_code_gen_t cg, lto_bool_t ShouldEmbedUselists); +/** Opaque reference to an LTO input file */ +typedef struct LLVMOpaqueLTOInput *lto_input_t; + +/** + * Creates an LTO input file from a buffer. The path + * argument is used for diagnotics as this function + * otherwise does not know which file the given buffer + * is associated with. + * + * \since LTO_API_VERSION=24 + */ +extern lto_input_t lto_input_create(const void *buffer, + size_t buffer_size, + const char *path); + +/** + * Frees all memory internally allocated by the LTO input file. + * Upon return the lto_module_t is no longer valid. + * + * \since LTO_API_VERSION=24 + */ +extern void lto_input_dispose(lto_input_t input); + +/** + * Returns the number of dependent library specifiers + * for the given LTO input file. + * + * \since LTO_API_VERSION=24 + */ +extern unsigned lto_input_get_num_dependent_libraries(lto_input_t input); + +/** + * Returns the ith dependent library specifier + * for the given LTO input file. The returned + * string is not null-terminated. + * + * \since LTO_API_VERSION=24 + */ +extern const char * lto_input_get_dependent_library(lto_input_t input, + size_t index, + size_t *size); + +/** + * Returns the list of libcall symbols that can be generated by LTO + * that might not be visible from the symbol table of bitcode files. + * + * \since prior to LTO_API_VERSION=25 + */ +extern const char *const *lto_runtime_lib_symbols_list(size_t *size); + /** * @} // endgoup LLVMCLTO * @defgroup LLVMCTLTO ThinLTO @@ -846,48 +896,6 @@ thinlto_codegen_set_cache_size_megabytes(thinlto_code_gen_t cg, extern void thinlto_codegen_set_cache_size_files(thinlto_code_gen_t cg, unsigned max_size_files); -/** Opaque reference to an LTO input file */ -typedef struct LLVMOpaqueLTOInput *lto_input_t; - -/** - * Creates an LTO input file from a buffer. The path - * argument is used for diagnotics as this function - * otherwise does not know which file the given buffer - * is associated with. 
- * - * \since LTO_API_VERSION=24 - */ -extern lto_input_t lto_input_create(const void *buffer, - size_t buffer_size, - const char *path); - -/** - * Frees all memory internally allocated by the LTO input file. - * Upon return the lto_module_t is no longer valid. - * - * \since LTO_API_VERSION=24 - */ -extern void lto_input_dispose(lto_input_t input); - -/** - * Returns the number of dependent library specifiers - * for the given LTO input file. - * - * \since LTO_API_VERSION=24 - */ -extern unsigned lto_input_get_num_dependent_libraries(lto_input_t input); - -/** - * Returns the ith dependent library specifier - * for the given LTO input file. The returned - * string is not null-terminated. - * - * \since LTO_API_VERSION=24 - */ -extern const char * lto_input_get_dependent_library(lto_input_t input, - size_t index, - size_t *size); - /** * @} // endgroup LLVMCTLTO_CACHING */ diff --git a/include/llvm/ADT/APFloat.h b/include/llvm/ADT/APFloat.h index a9648d35cf5d..1c4969733791 100644 --- a/include/llvm/ADT/APFloat.h +++ b/include/llvm/ADT/APFloat.h @@ -192,6 +192,11 @@ struct APFloatBase { /// IEEE-754R 7: Default exception handling. /// /// opUnderflow or opOverflow are always returned or-ed with opInexact. + /// + /// APFloat models this behavior specified by IEEE-754: + /// "For operations producing results in floating-point format, the default + /// result of an operation that signals the invalid operation exception + /// shall be a quiet NaN." enum opStatus { opOK = 0x00, opInvalidOp = 0x01, diff --git a/include/llvm/ADT/APInt.h b/include/llvm/ADT/APInt.h index 2381b75e08b1..8dce5a621bb3 100644 --- a/include/llvm/ADT/APInt.h +++ b/include/llvm/ADT/APInt.h @@ -1467,6 +1467,13 @@ public: U.pVal[whichWord(BitPosition)] &= Mask; } + /// Set bottom loBits bits to 0. + void clearLowBits(unsigned loBits) { + assert(loBits <= BitWidth && "More bits than bitwidth"); + APInt Keep = getHighBitsSet(BitWidth, BitWidth - loBits); + *this &= Keep; + } + /// Set the sign bit to 0. void clearSignBit() { clearBit(BitWidth - 1); @@ -1496,9 +1503,11 @@ public: /// Insert the bits from a smaller APInt starting at bitPosition. void insertBits(const APInt &SubBits, unsigned bitPosition); + void insertBits(uint64_t SubBits, unsigned bitPosition, unsigned numBits); /// Return an APInt with the extracted bits [bitPosition,bitPosition+numBits). APInt extractBits(unsigned numBits, unsigned bitPosition) const; + uint64_t extractBitsAsZExtValue(unsigned numBits, unsigned bitPosition) const; /// @} /// \name Value Characterization Functions diff --git a/include/llvm/ADT/Any.h b/include/llvm/ADT/Any.h index 5dcd6e73c54f..49657e02a991 100644 --- a/include/llvm/ADT/Any.h +++ b/include/llvm/ADT/Any.h @@ -38,7 +38,7 @@ class Any { explicit StorageImpl(T &&Value) : Value(std::move(Value)) {} std::unique_ptr clone() const override { - return llvm::make_unique>(Value); + return std::make_unique>(Value); } const void *id() const override { return &TypeId::Id; } @@ -78,7 +78,7 @@ public: int>::type = 0> Any(T &&Value) { using U = typename std::decay::type; - Storage = llvm::make_unique>(std::forward(Value)); + Storage = std::make_unique>(std::forward(Value)); } Any(Any &&Other) : Storage(std::move(Other.Storage)) {} diff --git a/include/llvm/ADT/ArrayRef.h b/include/llvm/ADT/ArrayRef.h index 773c88f7c9f9..f6455d3fa412 100644 --- a/include/llvm/ADT/ArrayRef.h +++ b/include/llvm/ADT/ArrayRef.h @@ -481,6 +481,12 @@ namespace llvm { return Vec; } + /// Construct an ArrayRef from a std::array. 
+ template + ArrayRef makeArrayRef(const std::array &Arr) { + return Arr; + } + /// Construct an ArrayRef from an ArrayRef (no-op) (const) template ArrayRef makeArrayRef(const ArrayRef &Vec) { return Vec; diff --git a/include/llvm/ADT/DenseMap.h b/include/llvm/ADT/DenseMap.h index a05cf8130d3c..948a6e6bfb38 100644 --- a/include/llvm/ADT/DenseMap.h +++ b/include/llvm/ADT/DenseMap.h @@ -38,33 +38,7 @@ namespace detail { // implementation without requiring two members. template struct DenseMapPair : public std::pair { - - // FIXME: Switch to inheriting constructors when we drop support for older - // clang versions. - // NOTE: This default constructor is declared with '{}' rather than - // '= default' to work around a separate bug in clang-3.8. This can - // also go when we switch to inheriting constructors. - DenseMapPair() {} - - DenseMapPair(const KeyT &Key, const ValueT &Value) - : std::pair(Key, Value) {} - - DenseMapPair(KeyT &&Key, ValueT &&Value) - : std::pair(std::move(Key), std::move(Value)) {} - - template - DenseMapPair(AltKeyT &&AltKey, AltValueT &&AltValue, - typename std::enable_if< - std::is_convertible::value && - std::is_convertible::value>::type * = 0) - : std::pair(std::forward(AltKey), - std::forward(AltValue)) {} - - template - DenseMapPair(AltPairT &&AltPair, - typename std::enable_if>::value>::type * = nullptr) - : std::pair(std::forward(AltPair)) {} + using std::pair::pair; KeyT &getFirst() { return std::pair::first; } const KeyT &getFirst() const { return std::pair::first; } @@ -748,7 +722,7 @@ public: ~DenseMap() { this->destroyAll(); - operator delete(Buckets); + deallocate_buffer(Buckets, sizeof(BucketT) * NumBuckets, alignof(BucketT)); } void swap(DenseMap& RHS) { @@ -768,7 +742,7 @@ public: DenseMap& operator=(DenseMap &&other) { this->destroyAll(); - operator delete(Buckets); + deallocate_buffer(Buckets, sizeof(BucketT) * NumBuckets, alignof(BucketT)); init(0); swap(other); return *this; @@ -776,7 +750,7 @@ public: void copyFrom(const DenseMap& other) { this->destroyAll(); - operator delete(Buckets); + deallocate_buffer(Buckets, sizeof(BucketT) * NumBuckets, alignof(BucketT)); if (allocateBuckets(other.NumBuckets)) { this->BaseT::copyFrom(other); } else { @@ -809,10 +783,12 @@ public: this->moveFromOldBuckets(OldBuckets, OldBuckets+OldNumBuckets); // Free the old table. - operator delete(OldBuckets); + deallocate_buffer(OldBuckets, sizeof(BucketT) * OldNumBuckets, + alignof(BucketT)); } void shrink_and_clear() { + unsigned OldNumBuckets = NumBuckets; unsigned OldNumEntries = NumEntries; this->destroyAll(); @@ -825,7 +801,8 @@ public: return; } - operator delete(Buckets); + deallocate_buffer(Buckets, sizeof(BucketT) * OldNumBuckets, + alignof(BucketT)); init(NewNumBuckets); } @@ -861,7 +838,8 @@ private: return false; } - Buckets = static_cast(operator new(sizeof(BucketT) * NumBuckets)); + Buckets = static_cast( + allocate_buffer(sizeof(BucketT) * NumBuckets, alignof(BucketT))); return true; } }; @@ -1076,7 +1054,8 @@ public: this->moveFromOldBuckets(OldRep.Buckets, OldRep.Buckets+OldRep.NumBuckets); // Free the old table. 
- operator delete(OldRep.Buckets); + deallocate_buffer(OldRep.Buckets, sizeof(BucketT) * OldRep.NumBuckets, + alignof(BucketT)); } void shrink_and_clear() { @@ -1160,15 +1139,17 @@ private: if (Small) return; - operator delete(getLargeRep()->Buckets); + deallocate_buffer(getLargeRep()->Buckets, + sizeof(BucketT) * getLargeRep()->NumBuckets, + alignof(BucketT)); getLargeRep()->~LargeRep(); } LargeRep allocateBuckets(unsigned Num) { assert(Num > InlineBuckets && "Must allocate more buckets than are inline"); - LargeRep Rep = { - static_cast(operator new(sizeof(BucketT) * Num)), Num - }; + LargeRep Rep = {static_cast(allocate_buffer( + sizeof(BucketT) * Num, alignof(BucketT))), + Num}; return Rep; } }; diff --git a/include/llvm/ADT/DenseMapInfo.h b/include/llvm/ADT/DenseMapInfo.h index 5ef6f3ad1b04..bd4c60c8f13e 100644 --- a/include/llvm/ADT/DenseMapInfo.h +++ b/include/llvm/ADT/DenseMapInfo.h @@ -17,7 +17,7 @@ #include "llvm/ADT/Hashing.h" #include "llvm/ADT/StringRef.h" #include "llvm/Support/PointerLikeTypeTraits.h" -#include "llvm/Support/ScalableSize.h" +#include "llvm/Support/TypeSize.h" #include #include #include @@ -67,6 +67,17 @@ template<> struct DenseMapInfo { } }; +// Provide DenseMapInfo for unsigned chars. +template <> struct DenseMapInfo { + static inline unsigned char getEmptyKey() { return ~0; } + static inline unsigned char getTombstoneKey() { return ~0 - 1; } + static unsigned getHashValue(const unsigned char &Val) { return Val * 37U; } + + static bool isEqual(const unsigned char &LHS, const unsigned char &RHS) { + return LHS == RHS; + } +}; + // Provide DenseMapInfo for unsigned shorts. template <> struct DenseMapInfo { static inline unsigned short getEmptyKey() { return 0xFFFF; } diff --git a/include/llvm/ADT/DirectedGraph.h b/include/llvm/ADT/DirectedGraph.h new file mode 100644 index 000000000000..f6a358d99cd2 --- /dev/null +++ b/include/llvm/ADT/DirectedGraph.h @@ -0,0 +1,270 @@ +//===- llvm/ADT/DirectedGraph.h - Directed Graph ----------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines the interface and a base class implementation for a +// directed graph. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_ADT_DIRECTEDGRAPH_H +#define LLVM_ADT_DIRECTEDGRAPH_H + +#include "llvm/ADT/GraphTraits.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" + +namespace llvm { + +/// Represent an edge in the directed graph. +/// The edge contains the target node it connects to. +template class DGEdge { +public: + DGEdge() = delete; + /// Create an edge pointing to the given node \p N. + explicit DGEdge(NodeType &N) : TargetNode(N) {} + explicit DGEdge(const DGEdge &E) + : TargetNode(E.TargetNode) {} + DGEdge &operator=(const DGEdge &E) { + TargetNode = E.TargetNode; + return *this; + } + + /// Static polymorphism: delegate implementation (via isEqualTo) to the + /// derived class. + bool operator==(const EdgeType &E) const { return getDerived().isEqualTo(E); } + bool operator!=(const EdgeType &E) const { return !operator==(E); } + + /// Retrieve the target node this edge connects to. 
+ const NodeType &getTargetNode() const { return TargetNode; } + NodeType &getTargetNode() { + return const_cast( + static_cast &>(*this).getTargetNode()); + } + +protected: + // As the default implementation use address comparison for equality. + bool isEqualTo(const EdgeType &E) const { return this == &E; } + + // Cast the 'this' pointer to the derived type and return a reference. + EdgeType &getDerived() { return *static_cast(this); } + const EdgeType &getDerived() const { + return *static_cast(this); + } + + // The target node this edge connects to. + NodeType &TargetNode; +}; + +/// Represent a node in the directed graph. +/// The node has a (possibly empty) list of outgoing edges. +template class DGNode { +public: + using EdgeListTy = SetVector; + using iterator = typename EdgeListTy::iterator; + using const_iterator = typename EdgeListTy::const_iterator; + + /// Create a node with a single outgoing edge \p E. + explicit DGNode(EdgeType &E) : Edges() { Edges.insert(&E); } + DGNode() = default; + + explicit DGNode(const DGNode &N) : Edges(N.Edges) {} + DGNode(DGNode &&N) : Edges(std::move(N.Edges)) {} + + DGNode &operator=(const DGNode &N) { + Edges = N.Edges; + return *this; + } + DGNode &operator=(const DGNode &&N) { + Edges = std::move(N.Edges); + return *this; + } + + /// Static polymorphism: delegate implementation (via isEqualTo) to the + /// derived class. + bool operator==(const NodeType &N) const { return getDerived().isEqualTo(N); } + bool operator!=(const NodeType &N) const { return !operator==(N); } + + const_iterator begin() const { return Edges.begin(); } + const_iterator end() const { return Edges.end(); } + iterator begin() { return Edges.begin(); } + iterator end() { return Edges.end(); } + const EdgeType &front() const { return *Edges.front(); } + EdgeType &front() { return *Edges.front(); } + const EdgeType &back() const { return *Edges.back(); } + EdgeType &back() { return *Edges.back(); } + + /// Collect in \p EL, all the edges from this node to \p N. + /// Return true if at least one edge was found, and false otherwise. + /// Note that this implementation allows more than one edge to connect + /// a given pair of nodes. + bool findEdgesTo(const NodeType &N, SmallVectorImpl &EL) const { + assert(EL.empty() && "Expected the list of edges to be empty."); + for (auto *E : Edges) + if (E->getTargetNode() == N) + EL.push_back(E); + return !EL.empty(); + } + + /// Add the given edge \p E to this node, if it doesn't exist already. Returns + /// true if the edge is added and false otherwise. + bool addEdge(EdgeType &E) { return Edges.insert(&E); } + + /// Remove the given edge \p E from this node, if it exists. + void removeEdge(EdgeType &E) { Edges.remove(&E); } + + /// Test whether there is an edge that goes from this node to \p N. + bool hasEdgeTo(const NodeType &N) const { + return (findEdgeTo(N) != Edges.end()); + } + + /// Retrieve the outgoing edges for the node. + const EdgeListTy &getEdges() const { return Edges; } + EdgeListTy &getEdges() { + return const_cast( + static_cast &>(*this).Edges); + } + + /// Clear the outgoing edges. + void clear() { Edges.clear(); } + +protected: + // As the default implementation use address comparison for equality. + bool isEqualTo(const NodeType &N) const { return this == &N; } + + // Cast the 'this' pointer to the derived type and return a reference. + NodeType &getDerived() { return *static_cast(this); } + const NodeType &getDerived() const { + return *static_cast(this); + } + + /// Find an edge to \p N. 
If more than one edge exists, this will return + /// the first one in the list of edges. + const_iterator findEdgeTo(const NodeType &N) const { + return llvm::find_if( + Edges, [&N](const EdgeType *E) { return E->getTargetNode() == N; }); + } + + // The list of outgoing edges. + EdgeListTy Edges; +}; + +/// Directed graph +/// +/// The graph is represented by a table of nodes. +/// Each node contains a (possibly empty) list of outgoing edges. +/// Each edge contains the target node it connects to. +template class DirectedGraph { +protected: + using NodeListTy = SmallVector; + using EdgeListTy = SmallVector; +public: + using iterator = typename NodeListTy::iterator; + using const_iterator = typename NodeListTy::const_iterator; + using DGraphType = DirectedGraph; + + DirectedGraph() = default; + explicit DirectedGraph(NodeType &N) : Nodes() { addNode(N); } + DirectedGraph(const DGraphType &G) : Nodes(G.Nodes) {} + DirectedGraph(DGraphType &&RHS) : Nodes(std::move(RHS.Nodes)) {} + DGraphType &operator=(const DGraphType &G) { + Nodes = G.Nodes; + return *this; + } + DGraphType &operator=(const DGraphType &&G) { + Nodes = std::move(G.Nodes); + return *this; + } + + const_iterator begin() const { return Nodes.begin(); } + const_iterator end() const { return Nodes.end(); } + iterator begin() { return Nodes.begin(); } + iterator end() { return Nodes.end(); } + const NodeType &front() const { return *Nodes.front(); } + NodeType &front() { return *Nodes.front(); } + const NodeType &back() const { return *Nodes.back(); } + NodeType &back() { return *Nodes.back(); } + + size_t size() const { return Nodes.size(); } + + /// Find the given node \p N in the table. + const_iterator findNode(const NodeType &N) const { + return llvm::find_if(Nodes, + [&N](const NodeType *Node) { return *Node == N; }); + } + iterator findNode(const NodeType &N) { + return const_cast( + static_cast(*this).findNode(N)); + } + + /// Add the given node \p N to the graph if it is not already present. + bool addNode(NodeType &N) { + if (findNode(N) != Nodes.end()) + return false; + Nodes.push_back(&N); + return true; + } + + /// Collect in \p EL all edges that are coming into node \p N. Return true + /// if at least one edge was found, and false otherwise. + bool findIncomingEdgesToNode(const NodeType &N, SmallVectorImpl &EL) const { + assert(EL.empty() && "Expected the list of edges to be empty."); + EdgeListTy TempList; + for (auto *Node : Nodes) { + if (*Node == N) + continue; + Node->findEdgesTo(N, TempList); + EL.insert(EL.end(), TempList.begin(), TempList.end()); + TempList.clear(); + } + return !EL.empty(); + } + + /// Remove the given node \p N from the graph. If the node has incoming or + /// outgoing edges, they are also removed. Return true if the node was found + /// and then removed, and false if the node was not found in the graph to + /// begin with. + bool removeNode(NodeType &N) { + iterator IT = findNode(N); + if (IT == Nodes.end()) + return false; + // Remove incoming edges. + EdgeListTy EL; + for (auto *Node : Nodes) { + if (*Node == N) + continue; + Node->findEdgesTo(N, EL); + for (auto *E : EL) + Node->removeEdge(*E); + EL.clear(); + } + N.clear(); + Nodes.erase(IT); + return true; + } + + /// Assuming nodes \p Src and \p Dst are already in the graph, connect node \p + /// Src to node \p Dst using the provided edge \p E. Return true if \p Src is + /// not already connected to \p Dst via \p E, and false otherwise. 
+ bool connect(NodeType &Src, NodeType &Dst, EdgeType &E) { + assert(findNode(Src) != Nodes.end() && "Src node should be present."); + assert(findNode(Dst) != Nodes.end() && "Dst node should be present."); + assert((E.getTargetNode() == Dst) && + "Target of the given edge does not match Dst."); + return Src.addEdge(E); + } + +protected: + // The list of nodes in the graph. + NodeListTy Nodes; +}; + +} // namespace llvm + +#endif // LLVM_ADT_DIRECTEDGRAPH_H diff --git a/include/llvm/ADT/Hashing.h b/include/llvm/ADT/Hashing.h index 008188bfa210..b22606bdb518 100644 --- a/include/llvm/ADT/Hashing.h +++ b/include/llvm/ADT/Hashing.h @@ -45,7 +45,6 @@ #define LLVM_ADT_HASHING_H #include "llvm/Support/DataTypes.h" -#include "llvm/Support/Host.h" #include "llvm/Support/SwapByteOrder.h" #include "llvm/Support/type_traits.h" #include diff --git a/include/llvm/ADT/IntervalMap.h b/include/llvm/ADT/IntervalMap.h index 12828c4cfdab..a02876ee77f3 100644 --- a/include/llvm/ADT/IntervalMap.h +++ b/include/llvm/ADT/IntervalMap.h @@ -963,8 +963,8 @@ public: private: // The root data is either a RootLeaf or a RootBranchData instance. - LLVM_ALIGNAS(RootLeaf) LLVM_ALIGNAS(RootBranchData) - AlignedCharArrayUnion data; + alignas(RootLeaf) alignas(RootBranchData) + AlignedCharArrayUnion data; // Tree height. // 0: Leaves in root. diff --git a/include/llvm/ADT/PointerIntPair.h b/include/llvm/ADT/PointerIntPair.h index 24a2bb67a36e..fa6bf1504469 100644 --- a/include/llvm/ADT/PointerIntPair.h +++ b/include/llvm/ADT/PointerIntPair.h @@ -13,6 +13,7 @@ #ifndef LLVM_ADT_POINTERINTPAIR_H #define LLVM_ADT_POINTERINTPAIR_H +#include "llvm/Support/Compiler.h" #include "llvm/Support/PointerLikeTypeTraits.h" #include "llvm/Support/type_traits.h" #include @@ -59,19 +60,19 @@ public: IntType getInt() const { return (IntType)Info::getInt(Value); } - void setPointer(PointerTy PtrVal) { + void setPointer(PointerTy PtrVal) LLVM_LVALUE_FUNCTION { Value = Info::updatePointer(Value, PtrVal); } - void setInt(IntType IntVal) { + void setInt(IntType IntVal) LLVM_LVALUE_FUNCTION { Value = Info::updateInt(Value, static_cast(IntVal)); } - void initWithPointer(PointerTy PtrVal) { + void initWithPointer(PointerTy PtrVal) LLVM_LVALUE_FUNCTION { Value = Info::updatePointer(0, PtrVal); } - void setPointerAndInt(PointerTy PtrVal, IntType IntVal) { + void setPointerAndInt(PointerTy PtrVal, IntType IntVal) LLVM_LVALUE_FUNCTION { Value = Info::updateInt(Info::updatePointer(0, PtrVal), static_cast(IntVal)); } @@ -89,7 +90,7 @@ public: void *getOpaqueValue() const { return reinterpret_cast(Value); } - void setFromOpaqueValue(void *Val) { + void setFromOpaqueValue(void *Val) LLVM_LVALUE_FUNCTION { Value = reinterpret_cast(Val); } diff --git a/include/llvm/ADT/PointerUnion.h b/include/llvm/ADT/PointerUnion.h index 2bcdf546c6e4..98c905775a77 100644 --- a/include/llvm/ADT/PointerUnion.h +++ b/include/llvm/ADT/PointerUnion.h @@ -54,21 +54,14 @@ struct PointerUnionTypeSelectorReturn< }; namespace pointer_union_detail { - constexpr int constexprMin(int a, int b) { return a < b ? a : b; } /// Determine the number of bits required to store integers with values < n. /// This is ceil(log2(n)). constexpr int bitsRequired(unsigned n) { return n > 1 ? 
1 + bitsRequired((n + 1) / 2) : 0; } - // FIXME: In C++14, replace this with - // std::min({PointerLikeTypeTraits::NumLowBitsAvailable...}) - template constexpr int lowBitsAvailable() { - return PointerLikeTypeTraits::NumLowBitsAvailable; - } - template - constexpr int lowBitsAvailable() { - return constexprMin(lowBitsAvailable(), lowBitsAvailable()); + template constexpr int lowBitsAvailable() { + return std::min({PointerLikeTypeTraits::NumLowBitsAvailable...}); } /// Find the index of a type in a list of types. TypeIndex::Index @@ -167,10 +160,11 @@ class PointerUnion void *, pointer_union_detail::bitsRequired(sizeof...(PTs)), int, pointer_union_detail::PointerUnionUIntTraits>, 0, PTs...> { - // The first type is special in some ways, but we don't want PointerUnion to - // be a 'template ' because it's much more - // convenient to have a name for the whole pack. So split off the first type - // here. + // The first type is special because we want to directly cast a pointer to a + // default-initialized union to a pointer to the first type. But we don't + // want PointerUnion to be a 'template ' + // because it's much more convenient to have a name for the whole pack. So + // split off the first type here. using First = typename pointer_union_detail::GetFirstType::type; using Base = typename PointerUnion::PointerUnionMembers; @@ -182,12 +176,7 @@ public: /// Test if the pointer held in the union is null, regardless of /// which type it is. - bool isNull() const { - // Convert from the void* to one of the pointer types, to make sure that - // we recursively strip off low bits if we have a nested PointerUnion. - return !PointerLikeTypeTraits::getFromVoidPointer( - this->Val.getPointer()); - } + bool isNull() const { return !this->Val.getPointer(); } explicit operator bool() const { return !isNull(); } @@ -226,7 +215,8 @@ public: First *getAddrOfPtr1() { assert(is() && "Val is not the first pointer"); assert( - get() == this->Val.getPointer() && + PointerLikeTypeTraits::getAsVoidPointer(get()) == + this->Val.getPointer() && "Can't get the address because PointerLikeTypeTraits changes the ptr"); return const_cast( reinterpret_cast(this->Val.getAddrOfPointer())); diff --git a/include/llvm/ADT/STLExtras.h b/include/llvm/ADT/STLExtras.h index 81dce0168c79..274933bc5204 100644 --- a/include/llvm/ADT/STLExtras.h +++ b/include/llvm/ADT/STLExtras.h @@ -95,18 +95,6 @@ template struct identity { } }; -template struct less_ptr { - bool operator()(const Ty* left, const Ty* right) const { - return *left < *right; - } -}; - -template struct greater_ptr { - bool operator()(const Ty* left, const Ty* right) const { - return *right < *left; - } -}; - /// An efficient, type-erasing, non-owning reference to a callable. This is /// intended for use as the type of a function parameter that is not used /// after the function in question returns. 
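// A short sketch of the PointerUnion behaviour the simplifications above rely
// on: a default-constructed (or null-assigned) union is null, and
// is/get/dyn_cast select the active member. The element types here are
// illustrative only.
#include "llvm/ADT/PointerUnion.h"

static int valueOf(llvm::PointerUnion<int *, double *> PU) {
  if (PU.isNull())                      // nothing stored at all
    return 0;
  if (int *IP = PU.dyn_cast<int *>())   // active member is int*
    return *IP;
  return static_cast<int>(*PU.get<double *>());
}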
@@ -530,10 +518,6 @@ bool all_of(R &&range, UnaryPredicate P); template bool any_of(R &&range, UnaryPredicate P); -template struct index_sequence; - -template struct index_sequence_for; - namespace detail { using std::declval; @@ -568,38 +552,38 @@ struct zip_common : public zip_traits { std::tuple iterators; protected: - template value_type deref(index_sequence) const { + template value_type deref(std::index_sequence) const { return value_type(*std::get(iterators)...); } template - decltype(iterators) tup_inc(index_sequence) const { + decltype(iterators) tup_inc(std::index_sequence) const { return std::tuple(std::next(std::get(iterators))...); } template - decltype(iterators) tup_dec(index_sequence) const { + decltype(iterators) tup_dec(std::index_sequence) const { return std::tuple(std::prev(std::get(iterators))...); } public: zip_common(Iters &&... ts) : iterators(std::forward(ts)...) {} - value_type operator*() { return deref(index_sequence_for{}); } + value_type operator*() { return deref(std::index_sequence_for{}); } const value_type operator*() const { - return deref(index_sequence_for{}); + return deref(std::index_sequence_for{}); } ZipType &operator++() { - iterators = tup_inc(index_sequence_for{}); + iterators = tup_inc(std::index_sequence_for{}); return *reinterpret_cast(this); } ZipType &operator--() { static_assert(Base::IsBidirectional, "All inner iterators must be at least bidirectional."); - iterators = tup_dec(index_sequence_for{}); + iterators = tup_dec(std::index_sequence_for{}); return *reinterpret_cast(this); } }; @@ -618,7 +602,8 @@ struct zip_first : public zip_common, Iters...> { template class zip_shortest : public zip_common, Iters...> { template - bool test(const zip_shortest &other, index_sequence) const { + bool test(const zip_shortest &other, + std::index_sequence) const { return all_of(std::initializer_list{std::get(this->iterators) != std::get(other.iterators)...}, identity{}); @@ -630,7 +615,7 @@ public: zip_shortest(Iters &&... ts) : Base(std::forward(ts)...) {} bool operator==(const zip_shortest &other) const { - return !test(other, index_sequence_for{}); + return !test(other, std::index_sequence_for{}); } }; @@ -646,18 +631,21 @@ public: private: std::tuple ts; - template iterator begin_impl(index_sequence) const { + template + iterator begin_impl(std::index_sequence) const { return iterator(std::begin(std::get(ts))...); } - template iterator end_impl(index_sequence) const { + template iterator end_impl(std::index_sequence) const { return iterator(std::end(std::get(ts))...); } public: zippy(Args &&... ts_) : ts(std::forward(ts_)...) 
{} - iterator begin() const { return begin_impl(index_sequence_for{}); } - iterator end() const { return end_impl(index_sequence_for{}); } + iterator begin() const { + return begin_impl(std::index_sequence_for{}); + } + iterator end() const { return end_impl(std::index_sequence_for{}); } }; } // end namespace detail @@ -727,20 +715,20 @@ private: template bool test(const zip_longest_iterator &other, - index_sequence) const { + std::index_sequence) const { return llvm::any_of( std::initializer_list{std::get(this->iterators) != std::get(other.iterators)...}, identity{}); } - template value_type deref(index_sequence) const { + template value_type deref(std::index_sequence) const { return value_type( deref_or_none(std::get(iterators), std::get(end_iterators))...); } template - decltype(iterators) tup_inc(index_sequence) const { + decltype(iterators) tup_inc(std::index_sequence) const { return std::tuple( next_or_end(std::get(iterators), std::get(end_iterators))...); } @@ -750,17 +738,19 @@ public: : iterators(std::forward(ts.first)...), end_iterators(std::forward(ts.second)...) {} - value_type operator*() { return deref(index_sequence_for{}); } + value_type operator*() { return deref(std::index_sequence_for{}); } - value_type operator*() const { return deref(index_sequence_for{}); } + value_type operator*() const { + return deref(std::index_sequence_for{}); + } zip_longest_iterator &operator++() { - iterators = tup_inc(index_sequence_for{}); + iterators = tup_inc(std::index_sequence_for{}); return *this; } bool operator==(const zip_longest_iterator &other) const { - return !test(other, index_sequence_for{}); + return !test(other, std::index_sequence_for{}); } }; @@ -777,12 +767,13 @@ public: private: std::tuple ts; - template iterator begin_impl(index_sequence) const { + template + iterator begin_impl(std::index_sequence) const { return iterator(std::make_pair(adl_begin(std::get(ts)), adl_end(std::get(ts)))...); } - template iterator end_impl(index_sequence) const { + template iterator end_impl(std::index_sequence) const { return iterator(std::make_pair(adl_end(std::get(ts)), adl_end(std::get(ts)))...); } @@ -790,8 +781,10 @@ private: public: zip_longest_range(Args &&... ts_) : ts(std::forward(ts_)...) {} - iterator begin() const { return begin_impl(index_sequence_for{}); } - iterator end() const { return end_impl(index_sequence_for{}); } + iterator begin() const { + return begin_impl(std::index_sequence_for{}); + } + iterator end() const { return end_impl(std::index_sequence_for{}); } }; } // namespace detail @@ -847,7 +840,7 @@ class concat_iterator /// Increments the first non-end iterator. /// /// It is an error to call this with all iterators at the end. - template void increment(index_sequence) { + template void increment(std::index_sequence) { // Build a sequence of functions to increment each iterator if possible. bool (concat_iterator::*IncrementHelperFns[])() = { &concat_iterator::incrementHelper...}; @@ -876,7 +869,7 @@ class concat_iterator /// reference. /// /// It is an error to call this with all iterators at the end. - template ValueT &get(index_sequence) const { + template ValueT &get(std::index_sequence) const { // Build a sequence of functions to get from iterator if possible. 
ValueT *(concat_iterator::*GetHelperFns[])() const = { &concat_iterator::getHelper...}; @@ -901,11 +894,13 @@ public: using BaseT::operator++; concat_iterator &operator++() { - increment(index_sequence_for()); + increment(std::index_sequence_for()); return *this; } - ValueT &operator*() const { return get(index_sequence_for()); } + ValueT &operator*() const { + return get(std::index_sequence_for()); + } bool operator==(const concat_iterator &RHS) const { return Begins == RHS.Begins && Ends == RHS.Ends; @@ -928,10 +923,10 @@ public: private: std::tuple Ranges; - template iterator begin_impl(index_sequence) { + template iterator begin_impl(std::index_sequence) { return iterator(std::get(Ranges)...); } - template iterator end_impl(index_sequence) { + template iterator end_impl(std::index_sequence) { return iterator(make_range(std::end(std::get(Ranges)), std::end(std::get(Ranges)))...); } @@ -940,8 +935,8 @@ public: concat_range(RangeTs &&... Ranges) : Ranges(std::forward(Ranges)...) {} - iterator begin() { return begin_impl(index_sequence_for{}); } - iterator end() { return end_impl(index_sequence_for{}); } + iterator begin() { return begin_impl(std::index_sequence_for{}); } + iterator end() { return end_impl(std::index_sequence_for{}); } }; } // end namespace detail @@ -990,28 +985,6 @@ struct on_first { } }; -// A subset of N3658. More stuff can be added as-needed. - -/// Represents a compile-time sequence of integers. -template struct integer_sequence { - using value_type = T; - - static constexpr size_t size() { return sizeof...(I); } -}; - -/// Alias for the common case of a sequence of size_ts. -template -struct index_sequence : integer_sequence {}; - -template -struct build_index_impl : build_index_impl {}; -template -struct build_index_impl<0, I...> : index_sequence {}; - -/// Creates a compile-time integer sequence for a parameter pack. -template -struct index_sequence_for : build_index_impl {}; - /// Utility type to build an inheritance chain that makes it easy to rank /// overload candidates. template struct rank : rank {}; @@ -1391,41 +1364,6 @@ void replace(Container &Cont, typename Container::iterator ContIt, // Extra additions to //===----------------------------------------------------------------------===// -// Implement make_unique according to N3656. - -/// Constructs a `new T()` with the given args and returns a -/// `unique_ptr` which owns the object. -/// -/// Example: -/// -/// auto p = make_unique(); -/// auto p = make_unique>(0, 1); -template -typename std::enable_if::value, std::unique_ptr>::type -make_unique(Args &&... args) { - return std::unique_ptr(new T(std::forward(args)...)); -} - -/// Constructs a `new T[n]` with the given args and returns a -/// `unique_ptr` which owns the object. -/// -/// \param n size of the new array. -/// -/// Example: -/// -/// auto p = make_unique(2); // value-initializes the array with 0's. -template -typename std::enable_if::value && std::extent::value == 0, - std::unique_ptr>::type -make_unique(size_t n) { - return std::unique_ptr(new typename std::remove_extent::type[n]()); -} - -/// This function isn't used and is only here to provide better compile errors. -template -typename std::enable_if::value != 0>::type -make_unique(Args &&...) = delete; - struct FreeDeleter { void operator()(void* v) { ::free(v); @@ -1439,20 +1377,6 @@ struct pair_hash { } }; -/// A functor like C++14's std::less in its absence. 
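// The STLExtras hunks above replace LLVM's homegrown index_sequence machinery
// and llvm::make_unique with the std:: equivalents; the zip/concat range
// adaptors keep their interface. A rough usage sketch with illustrative data:
#include "llvm/ADT/STLExtras.h"
#include <vector>

static int zipAndConcat() {
  std::vector<int> A = {1, 2, 3}, B = {4, 5, 6};
  int Sum = 0;
  for (auto P : llvm::zip(A, B))          // pairs (1,4), (2,5), (3,6)
    Sum += std::get<0>(P) + std::get<1>(P);
  for (int X : llvm::concat<int>(A, B))   // 1, 2, 3, 4, 5, 6
    Sum += X;
  return Sum;                             // 42
}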
-struct less { - template bool operator()(A &&a, B &&b) const { - return std::forward(a) < std::forward(b); - } -}; - -/// A functor like C++14's std::equal in its absence. -struct equal { - template bool operator()(A &&a, B &&b) const { - return std::forward(a) == std::forward(b); - } -}; - /// Binary functor that adapts to any other binary functor after dereferencing /// operands. template struct deref { @@ -1580,7 +1504,7 @@ template detail::enumerator enumerate(R &&TheRange) { namespace detail { template -auto apply_tuple_impl(F &&f, Tuple &&t, index_sequence) +auto apply_tuple_impl(F &&f, Tuple &&t, std::index_sequence) -> decltype(std::forward(f)(std::get(std::forward(t))...)) { return std::forward(f)(std::get(std::forward(t))...); } @@ -1593,9 +1517,9 @@ auto apply_tuple_impl(F &&f, Tuple &&t, index_sequence) template auto apply_tuple(F &&f, Tuple &&t) -> decltype(detail::apply_tuple_impl( std::forward(f), std::forward(t), - build_index_impl< + std::make_index_sequence< std::tuple_size::type>::value>{})) { - using Indices = build_index_impl< + using Indices = std::make_index_sequence< std::tuple_size::type>::value>; return detail::apply_tuple_impl(std::forward(f), std::forward(t), diff --git a/include/llvm/ADT/SmallBitVector.h b/include/llvm/ADT/SmallBitVector.h index 742450e6a951..61375c008022 100644 --- a/include/llvm/ADT/SmallBitVector.h +++ b/include/llvm/ADT/SmallBitVector.h @@ -290,7 +290,7 @@ public: ++Prev; uintptr_t Bits = getSmallBits(); // Mask in previous bits. - uintptr_t Mask = (1 << Prev) - 1; + uintptr_t Mask = (uintptr_t(1) << Prev) - 1; Bits |= Mask; if (Bits == ~uintptr_t(0) || Prev + 1 >= getSmallSize()) diff --git a/include/llvm/ADT/Statistic.h b/include/llvm/ADT/Statistic.h index 2ac59da596ef..b7387ddcf1c7 100644 --- a/include/llvm/ADT/Statistic.h +++ b/include/llvm/ADT/Statistic.h @@ -44,38 +44,39 @@ class raw_ostream; class raw_fd_ostream; class StringRef; -class Statistic { +class StatisticBase { public: const char *DebugType; const char *Name; const char *Desc; - std::atomic Value; - std::atomic Initialized; - unsigned getValue() const { return Value.load(std::memory_order_relaxed); } + StatisticBase(const char *DebugType, const char *Name, const char *Desc) + : DebugType(DebugType), Name(Name), Desc(Desc) {} + const char *getDebugType() const { return DebugType; } const char *getName() const { return Name; } const char *getDesc() const { return Desc; } +}; - /// construct - This should only be called for non-global statistics. - void construct(const char *debugtype, const char *name, const char *desc) { - DebugType = debugtype; - Name = name; - Desc = desc; - Value = 0; - Initialized = false; - } +class TrackingStatistic : public StatisticBase { +public: + std::atomic Value; + std::atomic Initialized; + + TrackingStatistic(const char *DebugType, const char *Name, const char *Desc) + : StatisticBase(DebugType, Name, Desc), Value(0), Initialized(false) {} + + unsigned getValue() const { return Value.load(std::memory_order_relaxed); } // Allow use of this class as the value itself. 
operator unsigned() const { return getValue(); } -#if LLVM_ENABLE_STATS - const Statistic &operator=(unsigned Val) { + const TrackingStatistic &operator=(unsigned Val) { Value.store(Val, std::memory_order_relaxed); return init(); } - const Statistic &operator++() { + const TrackingStatistic &operator++() { Value.fetch_add(1, std::memory_order_relaxed); return init(); } @@ -85,7 +86,7 @@ public: return Value.fetch_add(1, std::memory_order_relaxed); } - const Statistic &operator--() { + const TrackingStatistic &operator--() { Value.fetch_sub(1, std::memory_order_relaxed); return init(); } @@ -95,14 +96,14 @@ public: return Value.fetch_sub(1, std::memory_order_relaxed); } - const Statistic &operator+=(unsigned V) { + const TrackingStatistic &operator+=(unsigned V) { if (V == 0) return *this; Value.fetch_add(V, std::memory_order_relaxed); return init(); } - const Statistic &operator-=(unsigned V) { + const TrackingStatistic &operator-=(unsigned V) { if (V == 0) return *this; Value.fetch_sub(V, std::memory_order_relaxed); @@ -119,54 +120,57 @@ public: init(); } -#else // Statistics are disabled in release builds. - - const Statistic &operator=(unsigned Val) { +protected: + TrackingStatistic &init() { + if (!Initialized.load(std::memory_order_acquire)) + RegisterStatistic(); return *this; } - const Statistic &operator++() { - return *this; - } + void RegisterStatistic(); +}; - unsigned operator++(int) { - return 0; - } +class NoopStatistic : public StatisticBase { +public: + using StatisticBase::StatisticBase; - const Statistic &operator--() { - return *this; - } + unsigned getValue() const { return 0; } - unsigned operator--(int) { - return 0; - } + // Allow use of this class as the value itself. + operator unsigned() const { return 0; } - const Statistic &operator+=(const unsigned &V) { - return *this; - } + const NoopStatistic &operator=(unsigned Val) { return *this; } - const Statistic &operator-=(const unsigned &V) { - return *this; - } + const NoopStatistic &operator++() { return *this; } - void updateMax(unsigned V) {} + unsigned operator++(int) { return 0; } -#endif // LLVM_ENABLE_STATS + const NoopStatistic &operator--() { return *this; } -protected: - Statistic &init() { - if (!Initialized.load(std::memory_order_acquire)) - RegisterStatistic(); - return *this; - } + unsigned operator--(int) { return 0; } - void RegisterStatistic(); + const NoopStatistic &operator+=(const unsigned &V) { return *this; } + + const NoopStatistic &operator-=(const unsigned &V) { return *this; } + + void updateMax(unsigned V) {} }; +#if LLVM_ENABLE_STATS +using Statistic = TrackingStatistic; +#else +using Statistic = NoopStatistic; +#endif + // STATISTIC - A macro to make definition of statistics really simple. This // automatically passes the DEBUG_TYPE of the file into the statistic. #define STATISTIC(VARNAME, DESC) \ - static llvm::Statistic VARNAME = {DEBUG_TYPE, #VARNAME, DESC, {0}, {false}} + static llvm::Statistic VARNAME = {DEBUG_TYPE, #VARNAME, DESC} + +// ALWAYS_ENABLED_STATISTIC - A macro to define a statistic like STATISTIC but +// it is enabled even if LLVM_ENABLE_STATS is off. +#define ALWAYS_ENABLED_STATISTIC(VARNAME, DESC) \ + static llvm::TrackingStatistic VARNAME = {DEBUG_TYPE, #VARNAME, DESC} /// Enable the collection and printing of statistics. 
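// A brief usage sketch of the STATISTIC and new ALWAYS_ENABLED_STATISTIC
// macros defined above; the DEBUG_TYPE string and counter names are
// illustrative only.
#define DEBUG_TYPE "my-pass"
#include "llvm/ADT/Statistic.h"

STATISTIC(NumWidgetsFolded, "Number of widgets folded");
ALWAYS_ENABLED_STATISTIC(NumWidgetsSeen, "Number of widgets seen");

static void countWidget(bool Folded) {
  ++NumWidgetsSeen;       // counted even when LLVM_ENABLE_STATS is off
  if (Folded)
    ++NumWidgetsFolded;   // a no-op in builds without LLVM_ENABLE_STATS
}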
void EnableStatistics(bool PrintOnExit = true); diff --git a/include/llvm/ADT/StringExtras.h b/include/llvm/ADT/StringExtras.h index 16ac90bd6c89..ef1a11e0619b 100644 --- a/include/llvm/ADT/StringExtras.h +++ b/include/llvm/ADT/StringExtras.h @@ -345,7 +345,7 @@ inline void join_items_impl(std::string &Result, Sep Separator, const Arg1 &A1, join_items_impl(Result, Separator, std::forward(Items)...); } -inline size_t join_one_item_size(char C) { return 1; } +inline size_t join_one_item_size(char) { return 1; } inline size_t join_one_item_size(const char *S) { return S ? ::strlen(S) : 0; } template inline size_t join_one_item_size(const T &Str) { diff --git a/include/llvm/ADT/StringMap.h b/include/llvm/ADT/StringMap.h index 8a586fc26709..108185bd07b9 100644 --- a/include/llvm/ADT/StringMap.h +++ b/include/llvm/ADT/StringMap.h @@ -118,36 +118,59 @@ public: } }; -/// StringMapEntry - This is used to represent one value that is inserted into -/// a StringMap. It contains the Value itself and the key: the string length -/// and data. +/// StringMapEntryStorage - Holds the value in a StringMapEntry. +/// +/// Factored out into a separate base class to make it easier to specialize. +/// This is primarily intended to support StringSet, which doesn't need a value +/// stored at all. template -class StringMapEntry : public StringMapEntryBase { +class StringMapEntryStorage : public StringMapEntryBase { public: ValueTy second; - explicit StringMapEntry(size_t strLen) + explicit StringMapEntryStorage(size_t strLen) : StringMapEntryBase(strLen), second() {} template - StringMapEntry(size_t strLen, InitTy &&... InitVals) + StringMapEntryStorage(size_t strLen, InitTy &&... InitVals) : StringMapEntryBase(strLen), second(std::forward(InitVals)...) {} - StringMapEntry(StringMapEntry &E) = delete; - - StringRef getKey() const { - return StringRef(getKeyData(), getKeyLength()); - } + StringMapEntryStorage(StringMapEntryStorage &E) = delete; const ValueTy &getValue() const { return second; } ValueTy &getValue() { return second; } void setValue(const ValueTy &V) { second = V; } +}; + +template<> +class StringMapEntryStorage : public StringMapEntryBase { +public: + explicit StringMapEntryStorage(size_t strLen, NoneType none = None) + : StringMapEntryBase(strLen) {} + StringMapEntryStorage(StringMapEntryStorage &E) = delete; + + NoneType getValue() const { return None; } +}; + +/// StringMapEntry - This is used to represent one value that is inserted into +/// a StringMap. It contains the Value itself and the key: the string length +/// and data. +template +class StringMapEntry final : public StringMapEntryStorage { +public: + using StringMapEntryStorage::StringMapEntryStorage; + + StringRef getKey() const { + return StringRef(getKeyData(), this->getKeyLength()); + } /// getKeyData - Return the start of the string data that is the key for this /// value. The string data is always stored immediately after the /// StringMapEntry object. const char *getKeyData() const {return reinterpret_cast(this+1);} - StringRef first() const { return StringRef(getKeyData(), getKeyLength()); } + StringRef first() const { + return StringRef(getKeyData(), this->getKeyLength()); + } /// Create a StringMapEntry for the specified key construct the value using /// \p InitiVals. @@ -199,7 +222,7 @@ public: template void Destroy(AllocatorTy &Allocator) { // Free memory referenced by the item. 
- size_t AllocSize = sizeof(StringMapEntry) + getKeyLength() + 1; + size_t AllocSize = sizeof(StringMapEntry) + this->getKeyLength() + 1; this->~StringMapEntry(); Allocator.Deallocate(static_cast(this), AllocSize); } @@ -391,6 +414,16 @@ public: return try_emplace(KV.first, std::move(KV.second)); } + /// Inserts an element or assigns to the current element if the key already + /// exists. The return type is the same as try_emplace. + template + std::pair insert_or_assign(StringRef Key, V &&Val) { + auto Ret = try_emplace(Key, std::forward(Val)); + if (!Ret.second) + Ret.first->second = std::forward(Val); + return Ret; + } + /// Emplace a new element for the specified key into the map if the key isn't /// already in the map. The bool component of the returned pair is true /// if and only if the insertion takes place, and the iterator component of diff --git a/include/llvm/ADT/StringRef.h b/include/llvm/ADT/StringRef.h index 4661b1e68b2f..52baab17bede 100644 --- a/include/llvm/ADT/StringRef.h +++ b/include/llvm/ADT/StringRef.h @@ -67,6 +67,20 @@ namespace llvm { return ::memcmp(Lhs,Rhs,Length); } + // Constexpr version of std::strlen. + static constexpr size_t strLen(const char *Str) { +#if __cplusplus > 201402L + return std::char_traits::length(Str); +#elif __has_builtin(__builtin_strlen) || defined(__GNUC__) + return __builtin_strlen(Str); +#else + const char *Begin = Str; + while (*Str != '\0') + ++Str; + return Str - Begin; +#endif + } + public: /// @name Constructors /// @{ @@ -79,8 +93,8 @@ namespace llvm { StringRef(std::nullptr_t) = delete; /// Construct a string ref from a cstring. - /*implicit*/ StringRef(const char *Str) - : Data(Str), Length(Str ? ::strlen(Str) : 0) {} + /*implicit*/ constexpr StringRef(const char *Str) + : Data(Str), Length(Str ? strLen(Str) : 0) {} /// Construct a string ref from a pointer and length. /*implicit*/ constexpr StringRef(const char *data, size_t length) diff --git a/include/llvm/ADT/StringSet.h b/include/llvm/ADT/StringSet.h index af3a44a7b32c..60be09d3c326 100644 --- a/include/llvm/ADT/StringSet.h +++ b/include/llvm/ADT/StringSet.h @@ -24,8 +24,8 @@ namespace llvm { /// StringSet - A wrapper for StringMap that provides set-like functionality. template - class StringSet : public StringMap { - using base = StringMap; + class StringSet : public StringMap { + using base = StringMap; public: StringSet() = default; @@ -37,13 +37,13 @@ namespace llvm { std::pair insert(StringRef Key) { assert(!Key.empty()); - return base::insert(std::make_pair(Key, '\0')); + return base::insert(std::make_pair(Key, None)); } template void insert(const InputIt &Begin, const InputIt &End) { for (auto It = Begin; It != End; ++It) - base::insert(std::make_pair(*It, '\0')); + base::insert(std::make_pair(*It, None)); } template diff --git a/include/llvm/ADT/TinyPtrVector.h b/include/llvm/ADT/TinyPtrVector.h index ac82451a9b21..6b76d35d4e92 100644 --- a/include/llvm/ADT/TinyPtrVector.h +++ b/include/llvm/ADT/TinyPtrVector.h @@ -31,6 +31,10 @@ class TinyPtrVector { public: using VecTy = SmallVector; using value_type = typename VecTy::value_type; + // EltTy must be the first pointer type so that is is true for the + // default-constructed PtrUnion. This allows an empty TinyPtrVector to + // naturally vend a begin/end iterator of type EltTy* without an additional + // check for the empty state. 
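// A short sketch of the StringMap::insert_or_assign helper added above, and
// of StringSet, which per the changes above now stores NoneType values rather
// than a dummy char. Keys and values are illustrative.
#include "llvm/ADT/StringMap.h"
#include "llvm/ADT/StringSet.h"

static bool stringContainers() {
  llvm::StringMap<int> Counts;
  Counts.insert_or_assign("width", 2);   // inserts {"width", 2}
  Counts.insert_or_assign("width", 7);   // assigns over the existing entry
  llvm::StringSet<> Seen;
  Seen.insert("width");                  // set-like: keys only, no payload
  return Counts.lookup("width") == 7 && Seen.count("width") == 1;
}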
using PtrUnion = PointerUnion; private: @@ -96,14 +100,14 @@ public: if (RHS.Val.template is()) { V->clear(); V->push_back(RHS.front()); - RHS.Val = (EltTy)nullptr; + RHS.Val = EltTy(); return *this; } delete V; } Val = RHS.Val; - RHS.Val = (EltTy)nullptr; + RHS.Val = EltTy(); return *this; } @@ -213,9 +217,9 @@ public: EltTy operator[](unsigned i) const { assert(!Val.isNull() && "can't index into an empty vector"); - if (EltTy V = Val.template dyn_cast()) { + if (Val.template is()) { assert(i == 0 && "tinyvector index out of range"); - return V; + return Val.template get(); } assert(i < Val.template get()->size() && @@ -225,29 +229,29 @@ public: EltTy front() const { assert(!empty() && "vector empty"); - if (EltTy V = Val.template dyn_cast()) - return V; + if (Val.template is()) + return Val.template get(); return Val.template get()->front(); } EltTy back() const { assert(!empty() && "vector empty"); - if (EltTy V = Val.template dyn_cast()) - return V; + if (Val.template is()) + return Val.template get(); return Val.template get()->back(); } void push_back(EltTy NewVal) { - assert(NewVal && "Can't add a null value"); - // If we have nothing, add something. if (Val.isNull()) { Val = NewVal; + assert(!Val.isNull() && "Can't add a null value"); return; } // If we have a single value, convert to a vector. - if (EltTy V = Val.template dyn_cast()) { + if (Val.template is()) { + EltTy V = Val.template get(); Val = new VecTy(); Val.template get()->push_back(V); } @@ -267,7 +271,7 @@ public: void clear() { // If we have a single value, convert to empty. if (Val.template is()) { - Val = (EltTy)nullptr; + Val = EltTy(); } else if (VecTy *Vec = Val.template dyn_cast()) { // If we have a vector form, just clear it. Vec->clear(); @@ -282,7 +286,7 @@ public: // If we have a single value, convert to empty. if (Val.template is()) { if (I == begin()) - Val = (EltTy)nullptr; + Val = EltTy(); } else if (VecTy *Vec = Val.template dyn_cast()) { // multiple items in a vector; just do the erase, there is no // benefit to collapsing back to a pointer @@ -298,7 +302,7 @@ public: if (Val.template is()) { if (S == begin() && S != E) - Val = (EltTy)nullptr; + Val = EltTy(); } else if (VecTy *Vec = Val.template dyn_cast()) { return Vec->erase(S, E); } @@ -313,7 +317,8 @@ public: return std::prev(end()); } assert(!Val.isNull() && "Null value with non-end insert iterator."); - if (EltTy V = Val.template dyn_cast()) { + if (Val.template is()) { + EltTy V = Val.template get(); assert(I == begin()); Val = Elt; push_back(V); @@ -339,7 +344,8 @@ public: } Val = new VecTy(); - } else if (EltTy V = Val.template dyn_cast()) { + } else if (Val.template is()) { + EltTy V = Val.template get(); Val = new VecTy(); Val.template get()->push_back(V); } diff --git a/include/llvm/ADT/VariadicFunction.h b/include/llvm/ADT/VariadicFunction.h deleted file mode 100644 index 5aefb05ecdda..000000000000 --- a/include/llvm/ADT/VariadicFunction.h +++ /dev/null @@ -1,330 +0,0 @@ -//===- VariadicFunction.h - Variadic Functions ------------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file implements compile-time type-safe variadic functions. 
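// A small sketch of TinyPtrVector, whose empty/single-element representation
// is what the PointerUnion-related changes above adjust: an empty vector holds
// a null EltTy, one element is stored inline, and further elements promote the
// storage to a heap-allocated SmallVector.
#include "llvm/ADT/TinyPtrVector.h"

static int collectTwo() {
  static int X = 1, Y = 2;
  llvm::TinyPtrVector<int *> V;   // empty state
  V.push_back(&X);                // single inline element
  V.push_back(&Y);                // promoted to the vector form
  int Sum = 0;
  for (int *P : V)
    Sum += *P;
  return Sum;                     // 3
}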
-// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_ADT_VARIADICFUNCTION_H -#define LLVM_ADT_VARIADICFUNCTION_H - -#include "llvm/ADT/ArrayRef.h" - -namespace llvm { - -// Define macros to aid in expanding a comma separated series with the index of -// the series pasted onto the last token. -#define LLVM_COMMA_JOIN1(x) x ## 0 -#define LLVM_COMMA_JOIN2(x) LLVM_COMMA_JOIN1(x), x ## 1 -#define LLVM_COMMA_JOIN3(x) LLVM_COMMA_JOIN2(x), x ## 2 -#define LLVM_COMMA_JOIN4(x) LLVM_COMMA_JOIN3(x), x ## 3 -#define LLVM_COMMA_JOIN5(x) LLVM_COMMA_JOIN4(x), x ## 4 -#define LLVM_COMMA_JOIN6(x) LLVM_COMMA_JOIN5(x), x ## 5 -#define LLVM_COMMA_JOIN7(x) LLVM_COMMA_JOIN6(x), x ## 6 -#define LLVM_COMMA_JOIN8(x) LLVM_COMMA_JOIN7(x), x ## 7 -#define LLVM_COMMA_JOIN9(x) LLVM_COMMA_JOIN8(x), x ## 8 -#define LLVM_COMMA_JOIN10(x) LLVM_COMMA_JOIN9(x), x ## 9 -#define LLVM_COMMA_JOIN11(x) LLVM_COMMA_JOIN10(x), x ## 10 -#define LLVM_COMMA_JOIN12(x) LLVM_COMMA_JOIN11(x), x ## 11 -#define LLVM_COMMA_JOIN13(x) LLVM_COMMA_JOIN12(x), x ## 12 -#define LLVM_COMMA_JOIN14(x) LLVM_COMMA_JOIN13(x), x ## 13 -#define LLVM_COMMA_JOIN15(x) LLVM_COMMA_JOIN14(x), x ## 14 -#define LLVM_COMMA_JOIN16(x) LLVM_COMMA_JOIN15(x), x ## 15 -#define LLVM_COMMA_JOIN17(x) LLVM_COMMA_JOIN16(x), x ## 16 -#define LLVM_COMMA_JOIN18(x) LLVM_COMMA_JOIN17(x), x ## 17 -#define LLVM_COMMA_JOIN19(x) LLVM_COMMA_JOIN18(x), x ## 18 -#define LLVM_COMMA_JOIN20(x) LLVM_COMMA_JOIN19(x), x ## 19 -#define LLVM_COMMA_JOIN21(x) LLVM_COMMA_JOIN20(x), x ## 20 -#define LLVM_COMMA_JOIN22(x) LLVM_COMMA_JOIN21(x), x ## 21 -#define LLVM_COMMA_JOIN23(x) LLVM_COMMA_JOIN22(x), x ## 22 -#define LLVM_COMMA_JOIN24(x) LLVM_COMMA_JOIN23(x), x ## 23 -#define LLVM_COMMA_JOIN25(x) LLVM_COMMA_JOIN24(x), x ## 24 -#define LLVM_COMMA_JOIN26(x) LLVM_COMMA_JOIN25(x), x ## 25 -#define LLVM_COMMA_JOIN27(x) LLVM_COMMA_JOIN26(x), x ## 26 -#define LLVM_COMMA_JOIN28(x) LLVM_COMMA_JOIN27(x), x ## 27 -#define LLVM_COMMA_JOIN29(x) LLVM_COMMA_JOIN28(x), x ## 28 -#define LLVM_COMMA_JOIN30(x) LLVM_COMMA_JOIN29(x), x ## 29 -#define LLVM_COMMA_JOIN31(x) LLVM_COMMA_JOIN30(x), x ## 30 -#define LLVM_COMMA_JOIN32(x) LLVM_COMMA_JOIN31(x), x ## 31 - -/// Class which can simulate a type-safe variadic function. -/// -/// The VariadicFunction class template makes it easy to define -/// type-safe variadic functions where all arguments have the same -/// type. -/// -/// Suppose we need a variadic function like this: -/// -/// ResultT Foo(const ArgT &A_0, const ArgT &A_1, ..., const ArgT &A_N); -/// -/// Instead of many overloads of Foo(), we only need to define a helper -/// function that takes an array of arguments: -/// -/// ResultT FooImpl(ArrayRef Args) { -/// // 'Args[i]' is a pointer to the i-th argument passed to Foo(). -/// ... -/// } -/// -/// and then define Foo() like this: -/// -/// const VariadicFunction Foo; -/// -/// VariadicFunction takes care of defining the overloads of Foo(). -/// -/// Actually, Foo is a function object (i.e. functor) instead of a plain -/// function. This object is stateless and its constructor/destructor -/// does nothing, so it's safe to create global objects and call Foo(...) at -/// any time. -/// -/// Sometimes we need a variadic function to have some fixed leading -/// arguments whose types may be different from that of the optional -/// arguments. 
For example: -/// -/// bool FullMatch(const StringRef &S, const RE &Regex, -/// const ArgT &A_0, ..., const ArgT &A_N); -/// -/// VariadicFunctionN is for such cases, where N is the number of fixed -/// arguments. It is like VariadicFunction, except that it takes N more -/// template arguments for the types of the fixed arguments: -/// -/// bool FullMatchImpl(const StringRef &S, const RE &Regex, -/// ArrayRef Args) { ... } -/// const VariadicFunction2 -/// FullMatch; -/// -/// Currently VariadicFunction and friends support up-to 3 -/// fixed leading arguments and up-to 32 optional arguments. -template )> -struct VariadicFunction { - ResultT operator()() const { - return Func(None); - } - -#define LLVM_DEFINE_OVERLOAD(N) \ - ResultT operator()(LLVM_COMMA_JOIN ## N(const ArgT &A)) const { \ - const ArgT *const Args[] = { LLVM_COMMA_JOIN ## N(&A) }; \ - return Func(makeArrayRef(Args)); \ - } - LLVM_DEFINE_OVERLOAD(1) - LLVM_DEFINE_OVERLOAD(2) - LLVM_DEFINE_OVERLOAD(3) - LLVM_DEFINE_OVERLOAD(4) - LLVM_DEFINE_OVERLOAD(5) - LLVM_DEFINE_OVERLOAD(6) - LLVM_DEFINE_OVERLOAD(7) - LLVM_DEFINE_OVERLOAD(8) - LLVM_DEFINE_OVERLOAD(9) - LLVM_DEFINE_OVERLOAD(10) - LLVM_DEFINE_OVERLOAD(11) - LLVM_DEFINE_OVERLOAD(12) - LLVM_DEFINE_OVERLOAD(13) - LLVM_DEFINE_OVERLOAD(14) - LLVM_DEFINE_OVERLOAD(15) - LLVM_DEFINE_OVERLOAD(16) - LLVM_DEFINE_OVERLOAD(17) - LLVM_DEFINE_OVERLOAD(18) - LLVM_DEFINE_OVERLOAD(19) - LLVM_DEFINE_OVERLOAD(20) - LLVM_DEFINE_OVERLOAD(21) - LLVM_DEFINE_OVERLOAD(22) - LLVM_DEFINE_OVERLOAD(23) - LLVM_DEFINE_OVERLOAD(24) - LLVM_DEFINE_OVERLOAD(25) - LLVM_DEFINE_OVERLOAD(26) - LLVM_DEFINE_OVERLOAD(27) - LLVM_DEFINE_OVERLOAD(28) - LLVM_DEFINE_OVERLOAD(29) - LLVM_DEFINE_OVERLOAD(30) - LLVM_DEFINE_OVERLOAD(31) - LLVM_DEFINE_OVERLOAD(32) -#undef LLVM_DEFINE_OVERLOAD -}; - -template )> -struct VariadicFunction1 { - ResultT operator()(Param0T P0) const { - return Func(P0, None); - } - -#define LLVM_DEFINE_OVERLOAD(N) \ - ResultT operator()(Param0T P0, LLVM_COMMA_JOIN ## N(const ArgT &A)) const { \ - const ArgT *const Args[] = { LLVM_COMMA_JOIN ## N(&A) }; \ - return Func(P0, makeArrayRef(Args)); \ - } - LLVM_DEFINE_OVERLOAD(1) - LLVM_DEFINE_OVERLOAD(2) - LLVM_DEFINE_OVERLOAD(3) - LLVM_DEFINE_OVERLOAD(4) - LLVM_DEFINE_OVERLOAD(5) - LLVM_DEFINE_OVERLOAD(6) - LLVM_DEFINE_OVERLOAD(7) - LLVM_DEFINE_OVERLOAD(8) - LLVM_DEFINE_OVERLOAD(9) - LLVM_DEFINE_OVERLOAD(10) - LLVM_DEFINE_OVERLOAD(11) - LLVM_DEFINE_OVERLOAD(12) - LLVM_DEFINE_OVERLOAD(13) - LLVM_DEFINE_OVERLOAD(14) - LLVM_DEFINE_OVERLOAD(15) - LLVM_DEFINE_OVERLOAD(16) - LLVM_DEFINE_OVERLOAD(17) - LLVM_DEFINE_OVERLOAD(18) - LLVM_DEFINE_OVERLOAD(19) - LLVM_DEFINE_OVERLOAD(20) - LLVM_DEFINE_OVERLOAD(21) - LLVM_DEFINE_OVERLOAD(22) - LLVM_DEFINE_OVERLOAD(23) - LLVM_DEFINE_OVERLOAD(24) - LLVM_DEFINE_OVERLOAD(25) - LLVM_DEFINE_OVERLOAD(26) - LLVM_DEFINE_OVERLOAD(27) - LLVM_DEFINE_OVERLOAD(28) - LLVM_DEFINE_OVERLOAD(29) - LLVM_DEFINE_OVERLOAD(30) - LLVM_DEFINE_OVERLOAD(31) - LLVM_DEFINE_OVERLOAD(32) -#undef LLVM_DEFINE_OVERLOAD -}; - -template )> -struct VariadicFunction2 { - ResultT operator()(Param0T P0, Param1T P1) const { - return Func(P0, P1, None); - } - -#define LLVM_DEFINE_OVERLOAD(N) \ - ResultT operator()(Param0T P0, Param1T P1, \ - LLVM_COMMA_JOIN ## N(const ArgT &A)) const { \ - const ArgT *const Args[] = { LLVM_COMMA_JOIN ## N(&A) }; \ - return Func(P0, P1, makeArrayRef(Args)); \ - } - LLVM_DEFINE_OVERLOAD(1) - LLVM_DEFINE_OVERLOAD(2) - LLVM_DEFINE_OVERLOAD(3) - LLVM_DEFINE_OVERLOAD(4) - LLVM_DEFINE_OVERLOAD(5) - LLVM_DEFINE_OVERLOAD(6) - 
LLVM_DEFINE_OVERLOAD(7) - LLVM_DEFINE_OVERLOAD(8) - LLVM_DEFINE_OVERLOAD(9) - LLVM_DEFINE_OVERLOAD(10) - LLVM_DEFINE_OVERLOAD(11) - LLVM_DEFINE_OVERLOAD(12) - LLVM_DEFINE_OVERLOAD(13) - LLVM_DEFINE_OVERLOAD(14) - LLVM_DEFINE_OVERLOAD(15) - LLVM_DEFINE_OVERLOAD(16) - LLVM_DEFINE_OVERLOAD(17) - LLVM_DEFINE_OVERLOAD(18) - LLVM_DEFINE_OVERLOAD(19) - LLVM_DEFINE_OVERLOAD(20) - LLVM_DEFINE_OVERLOAD(21) - LLVM_DEFINE_OVERLOAD(22) - LLVM_DEFINE_OVERLOAD(23) - LLVM_DEFINE_OVERLOAD(24) - LLVM_DEFINE_OVERLOAD(25) - LLVM_DEFINE_OVERLOAD(26) - LLVM_DEFINE_OVERLOAD(27) - LLVM_DEFINE_OVERLOAD(28) - LLVM_DEFINE_OVERLOAD(29) - LLVM_DEFINE_OVERLOAD(30) - LLVM_DEFINE_OVERLOAD(31) - LLVM_DEFINE_OVERLOAD(32) -#undef LLVM_DEFINE_OVERLOAD -}; - -template )> -struct VariadicFunction3 { - ResultT operator()(Param0T P0, Param1T P1, Param2T P2) const { - return Func(P0, P1, P2, None); - } - -#define LLVM_DEFINE_OVERLOAD(N) \ - ResultT operator()(Param0T P0, Param1T P1, Param2T P2, \ - LLVM_COMMA_JOIN ## N(const ArgT &A)) const { \ - const ArgT *const Args[] = { LLVM_COMMA_JOIN ## N(&A) }; \ - return Func(P0, P1, P2, makeArrayRef(Args)); \ - } - LLVM_DEFINE_OVERLOAD(1) - LLVM_DEFINE_OVERLOAD(2) - LLVM_DEFINE_OVERLOAD(3) - LLVM_DEFINE_OVERLOAD(4) - LLVM_DEFINE_OVERLOAD(5) - LLVM_DEFINE_OVERLOAD(6) - LLVM_DEFINE_OVERLOAD(7) - LLVM_DEFINE_OVERLOAD(8) - LLVM_DEFINE_OVERLOAD(9) - LLVM_DEFINE_OVERLOAD(10) - LLVM_DEFINE_OVERLOAD(11) - LLVM_DEFINE_OVERLOAD(12) - LLVM_DEFINE_OVERLOAD(13) - LLVM_DEFINE_OVERLOAD(14) - LLVM_DEFINE_OVERLOAD(15) - LLVM_DEFINE_OVERLOAD(16) - LLVM_DEFINE_OVERLOAD(17) - LLVM_DEFINE_OVERLOAD(18) - LLVM_DEFINE_OVERLOAD(19) - LLVM_DEFINE_OVERLOAD(20) - LLVM_DEFINE_OVERLOAD(21) - LLVM_DEFINE_OVERLOAD(22) - LLVM_DEFINE_OVERLOAD(23) - LLVM_DEFINE_OVERLOAD(24) - LLVM_DEFINE_OVERLOAD(25) - LLVM_DEFINE_OVERLOAD(26) - LLVM_DEFINE_OVERLOAD(27) - LLVM_DEFINE_OVERLOAD(28) - LLVM_DEFINE_OVERLOAD(29) - LLVM_DEFINE_OVERLOAD(30) - LLVM_DEFINE_OVERLOAD(31) - LLVM_DEFINE_OVERLOAD(32) -#undef LLVM_DEFINE_OVERLOAD -}; - -// Cleanup the macro namespace. -#undef LLVM_COMMA_JOIN1 -#undef LLVM_COMMA_JOIN2 -#undef LLVM_COMMA_JOIN3 -#undef LLVM_COMMA_JOIN4 -#undef LLVM_COMMA_JOIN5 -#undef LLVM_COMMA_JOIN6 -#undef LLVM_COMMA_JOIN7 -#undef LLVM_COMMA_JOIN8 -#undef LLVM_COMMA_JOIN9 -#undef LLVM_COMMA_JOIN10 -#undef LLVM_COMMA_JOIN11 -#undef LLVM_COMMA_JOIN12 -#undef LLVM_COMMA_JOIN13 -#undef LLVM_COMMA_JOIN14 -#undef LLVM_COMMA_JOIN15 -#undef LLVM_COMMA_JOIN16 -#undef LLVM_COMMA_JOIN17 -#undef LLVM_COMMA_JOIN18 -#undef LLVM_COMMA_JOIN19 -#undef LLVM_COMMA_JOIN20 -#undef LLVM_COMMA_JOIN21 -#undef LLVM_COMMA_JOIN22 -#undef LLVM_COMMA_JOIN23 -#undef LLVM_COMMA_JOIN24 -#undef LLVM_COMMA_JOIN25 -#undef LLVM_COMMA_JOIN26 -#undef LLVM_COMMA_JOIN27 -#undef LLVM_COMMA_JOIN28 -#undef LLVM_COMMA_JOIN29 -#undef LLVM_COMMA_JOIN30 -#undef LLVM_COMMA_JOIN31 -#undef LLVM_COMMA_JOIN32 - -} // end namespace llvm - -#endif // LLVM_ADT_VARIADICFUNCTION_H diff --git a/include/llvm/ADT/iterator_range.h b/include/llvm/ADT/iterator_range.h index 774c7c4e3366..aa8830943cab 100644 --- a/include/llvm/ADT/iterator_range.h +++ b/include/llvm/ADT/iterator_range.h @@ -44,6 +44,7 @@ public: IteratorT begin() const { return begin_iterator; } IteratorT end() const { return end_iterator; } + bool empty() const { return begin_iterator == end_iterator; } }; /// Convenience function for iterating over sub-ranges. 
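// A minimal sketch of the new iterator_range::empty() helper shown above,
// using llvm::make_range to build the range; the half-range split is purely
// illustrative.
#include "llvm/ADT/iterator_range.h"
#include <vector>

static bool firstHalfEmpty(const std::vector<int> &V) {
  auto R = llvm::make_range(V.begin(), V.begin() + V.size() / 2);
  return R.empty();   // true when V has fewer than two elements
}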
diff --git a/include/llvm/Analysis/AliasAnalysis.h b/include/llvm/Analysis/AliasAnalysis.h index 948341554f23..282142f51bb3 100644 --- a/include/llvm/Analysis/AliasAnalysis.h +++ b/include/llvm/Analysis/AliasAnalysis.h @@ -949,7 +949,7 @@ template class AAResultBase { /// A pointer to the AAResults object that this AAResult is /// aggregated within. May be null if not aggregated. - AAResults *AAR; + AAResults *AAR = nullptr; /// Helper to dispatch calls back through the derived type. DerivedT &derived() { return static_cast(*this); } diff --git a/include/llvm/Analysis/AliasSetTracker.h b/include/llvm/Analysis/AliasSetTracker.h index 34a509b7f4bb..187317e3831b 100644 --- a/include/llvm/Analysis/AliasSetTracker.h +++ b/include/llvm/Analysis/AliasSetTracker.h @@ -87,10 +87,11 @@ class AliasSet : public ilist_node { AAInfo = NewAAInfo; else { AAMDNodes Intersection(AAInfo.intersect(NewAAInfo)); - if (!Intersection) { + if (!Intersection.TBAA || !Intersection.Scope || + !Intersection.NoAlias) { // NewAAInfo conflicts with AAInfo. AAInfo = DenseMapInfo::getTombstoneKey(); - return SizeChanged; + SizeChanged = true; } AAInfo = Intersection; } diff --git a/include/llvm/Analysis/AssumptionCache.h b/include/llvm/Analysis/AssumptionCache.h index b42846472f2e..0efbd59023d6 100644 --- a/include/llvm/Analysis/AssumptionCache.h +++ b/include/llvm/Analysis/AssumptionCache.h @@ -73,8 +73,8 @@ class AssumptionCache { /// Get the vector of assumptions which affect a value from the cache. SmallVector &getOrInsertAffectedValues(Value *V); - /// Copy affected values in the cache for OV to be affected values for NV. - void copyAffectedValuesInCache(Value *OV, Value *NV); + /// Move affected values in the cache for OV to be affected values for NV. + void transferAffectedValuesInCache(Value *OV, Value *NV); /// Flag tracking whether we have scanned the function yet. /// diff --git a/include/llvm/Analysis/CFG.h b/include/llvm/Analysis/CFG.h index bb55e76ac86a..68f137ba622c 100644 --- a/include/llvm/Analysis/CFG.h +++ b/include/llvm/Analysis/CFG.h @@ -46,6 +46,8 @@ unsigned GetSuccessorNumber(const BasicBlock *BB, const BasicBlock *Succ); /// bool isCriticalEdge(const Instruction *TI, unsigned SuccNum, bool AllowIdenticalEdges = false); +bool isCriticalEdge(const Instruction *TI, const BasicBlock *Succ, + bool AllowIdenticalEdges = false); /// Determine whether instruction 'To' is reachable from 'From', without passing /// through any blocks in ExclusionSet, returning true if uncertain. diff --git a/include/llvm/Analysis/CFLAndersAliasAnalysis.h b/include/llvm/Analysis/CFLAndersAliasAnalysis.h index 7c8b42b1d8d2..5f5e52af3d88 100644 --- a/include/llvm/Analysis/CFLAndersAliasAnalysis.h +++ b/include/llvm/Analysis/CFLAndersAliasAnalysis.h @@ -41,7 +41,8 @@ class CFLAndersAAResult : public AAResultBase { class FunctionInfo; public: - explicit CFLAndersAAResult(const TargetLibraryInfo &TLI); + explicit CFLAndersAAResult( + std::function GetTLI); CFLAndersAAResult(CFLAndersAAResult &&RHS); ~CFLAndersAAResult(); @@ -74,7 +75,7 @@ private: /// Build summary for a given function FunctionInfo buildInfoFrom(const Function &); - const TargetLibraryInfo &TLI; + std::function GetTLI; /// Cached mapping of Functions to their StratifiedSets. 
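// The CFL alias-analysis results above now take a callback mapping a Function
// to its TargetLibraryInfo instead of a single TLI reference. A sketch of the
// callback shape, assuming the std::function parameter (whose template
// arguments are elided in the hunk above) is const TargetLibraryInfo &
// (Function &); here one TLI object is reused for every function purely for
// illustration.
#include "llvm/Analysis/CFLAndersAliasAnalysis.h"
#include "llvm/Analysis/TargetLibraryInfo.h"

static llvm::CFLAndersAAResult
makeCFLAnders(llvm::TargetLibraryInfo &TLI) {
  auto GetTLI = [&TLI](llvm::Function &) -> const llvm::TargetLibraryInfo & {
    return TLI;
  };
  return llvm::CFLAndersAAResult(GetTLI);
}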
/// If a function's sets are currently being built, it is marked diff --git a/include/llvm/Analysis/CFLSteensAliasAnalysis.h b/include/llvm/Analysis/CFLSteensAliasAnalysis.h index cc7a47cd9a5f..135321616b7c 100644 --- a/include/llvm/Analysis/CFLSteensAliasAnalysis.h +++ b/include/llvm/Analysis/CFLSteensAliasAnalysis.h @@ -42,7 +42,8 @@ class CFLSteensAAResult : public AAResultBase { class FunctionInfo; public: - explicit CFLSteensAAResult(const TargetLibraryInfo &TLI); + explicit CFLSteensAAResult( + std::function GetTLI); CFLSteensAAResult(CFLSteensAAResult &&Arg); ~CFLSteensAAResult(); @@ -90,7 +91,7 @@ public: } private: - const TargetLibraryInfo &TLI; + std::function GetTLI; /// Cached mapping of Functions to their StratifiedSets. /// If a function's sets are currently being built, it is marked diff --git a/include/llvm/Analysis/CGSCCPassManager.h b/include/llvm/Analysis/CGSCCPassManager.h index 8af5fb86995a..933f2210dafc 100644 --- a/include/llvm/Analysis/CGSCCPassManager.h +++ b/include/llvm/Analysis/CGSCCPassManager.h @@ -88,6 +88,7 @@ #ifndef LLVM_ANALYSIS_CGSCCPASSMANAGER_H #define LLVM_ANALYSIS_CGSCCPASSMANAGER_H +#include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DenseSet.h" #include "llvm/ADT/PriorityWorklist.h" #include "llvm/ADT/STLExtras.h" @@ -583,10 +584,12 @@ public: SmallVectorImpl &CallHandles) { assert(CallHandles.empty() && "Must start with a clear set of handles."); - SmallVector CallCounts; + SmallDenseMap CallCounts; + CallCount CountLocal = {0, 0}; for (LazyCallGraph::Node &N : C) { - CallCounts.push_back({0, 0}); - CallCount &Count = CallCounts.back(); + CallCount &Count = + CallCounts.insert(std::make_pair(&N.getFunction(), CountLocal)) + .first->second; for (Instruction &I : instructions(N.getFunction())) if (auto CS = CallSite(&I)) { if (CS.getCalledFunction()) { @@ -626,8 +629,6 @@ public: // Check that we didn't miss any update scenario. assert(!UR.InvalidatedSCCs.count(C) && "Processing an invalid SCC!"); assert(C->begin() != C->end() && "Cannot have an empty SCC!"); - assert((int)CallCounts.size() == C->size() && - "Cannot have changed the size of the SCC!"); // Check whether any of the handles were devirtualized. auto IsDevirtualizedHandle = [&](WeakTrackingVH &CallH) { @@ -642,7 +643,7 @@ public: if (!F) return false; - LLVM_DEBUG(dbgs() << "Found devirutalized call from " + LLVM_DEBUG(dbgs() << "Found devirtualized call from " << CS.getParent()->getParent()->getName() << " to " << F->getName() << "\n"); @@ -664,12 +665,20 @@ public: // manner of transformations such as DCE and other things, but seems to // work well in practice. if (!Devirt) - for (int i = 0, Size = C->size(); i < Size; ++i) - if (CallCounts[i].Indirect > NewCallCounts[i].Indirect && - CallCounts[i].Direct < NewCallCounts[i].Direct) { - Devirt = true; - break; + // Iterate over the keys in NewCallCounts, if Function also exists in + // CallCounts, make the check below. 
+ for (auto &Pair : NewCallCounts) { + auto &CallCountNew = Pair.second; + auto CountIt = CallCounts.find(Pair.first); + if (CountIt != CallCounts.end()) { + const auto &CallCountOld = CountIt->second; + if (CallCountOld.Indirect > CallCountNew.Indirect && + CallCountOld.Direct < CallCountNew.Direct) { + Devirt = true; + break; + } } + } if (!Devirt) { PA.intersect(std::move(PassPA)); diff --git a/include/llvm/Analysis/CaptureTracking.h b/include/llvm/Analysis/CaptureTracking.h index ca7abd34fea2..29921a51d5be 100644 --- a/include/llvm/Analysis/CaptureTracking.h +++ b/include/llvm/Analysis/CaptureTracking.h @@ -17,6 +17,7 @@ namespace llvm { class Value; class Use; + class DataLayout; class Instruction; class DominatorTree; class OrderedBasicBlock; @@ -83,6 +84,11 @@ namespace llvm { /// use U. Return true to stop the traversal or false to continue looking /// for more capturing instructions. virtual bool captured(const Use *U) = 0; + + /// isDereferenceableOrNull - Overload to allow clients with additional + /// knowledge about pointer dereferenceability to provide it and thereby + /// avoid conservative responses when a pointer is compared to null. + virtual bool isDereferenceableOrNull(Value *O, const DataLayout &DL); }; /// PointerMayBeCaptured - Visit the value and the values derived from it and diff --git a/include/llvm/Analysis/DDG.h b/include/llvm/Analysis/DDG.h new file mode 100644 index 000000000000..0e1eb9d2cda3 --- /dev/null +++ b/include/llvm/Analysis/DDG.h @@ -0,0 +1,430 @@ +//===- llvm/Analysis/DDG.h --------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines the Data-Dependence Graph (DDG). +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_ANALYSIS_DDG_H +#define LLVM_ANALYSIS_DDG_H + +#include "llvm/ADT/DirectedGraph.h" +#include "llvm/Analysis/DependenceAnalysis.h" +#include "llvm/Analysis/DependenceGraphBuilder.h" +#include "llvm/Analysis/LoopAnalysisManager.h" +#include "llvm/IR/Instructions.h" +#include + +namespace llvm { +class DDGNode; +class DDGEdge; +using DDGNodeBase = DGNode; +using DDGEdgeBase = DGEdge; +using DDGBase = DirectedGraph; +class LPMUpdater; + +/// Data Dependence Graph Node +/// The graph can represent the following types of nodes: +/// 1. Single instruction node containing just one instruction. +/// 2. Multiple instruction node where two or more instructions from +/// the same basic block are merged into one node. +/// 3. Root node is a special node that connects to all components such that +/// there is always a path from it to any node in the graph. 
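The node taxonomy documented above is exposed through LLVM's usual isa<>/dyn_cast<> machinery (see the classof definitions further down in this file). Here is a sketch of how client code might classify the nodes of a built graph; the helper name is hypothetical, and obtaining the DataDependenceGraph (for instance from the DDGAnalysis pass declared later in this header) is assumed:

#include "llvm/Analysis/DDG.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

// Hypothetical helper: count node kinds of an already-built DDG. Iterating
// the graph yields DDGNode pointers; the root is a RootDDGNode, everything
// else is a single- or multi-instruction SimpleDDGNode.
static void summarizeDDG(const DataDependenceGraph &G) {
  unsigned Roots = 0, Simple = 0;
  for (const DDGNode *N : G) {
    if (isa<RootDDGNode>(N))
      ++Roots;
    else if (isa<SimpleDDGNode>(N))
      ++Simple;
  }
  errs() << G.getName() << ": " << Roots << " root and " << Simple
         << " simple node(s)\n";
}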
+class DDGNode : public DDGNodeBase { +public: + using InstructionListType = SmallVectorImpl; + + enum class NodeKind { + Unknown, + SingleInstruction, + MultiInstruction, + Root, + }; + + DDGNode() = delete; + DDGNode(const NodeKind K) : DDGNodeBase(), Kind(K) {} + DDGNode(const DDGNode &N) : DDGNodeBase(N), Kind(N.Kind) {} + DDGNode(DDGNode &&N) : DDGNodeBase(std::move(N)), Kind(N.Kind) {} + virtual ~DDGNode() = 0; + + DDGNode &operator=(const DDGNode &N) { + DGNode::operator=(N); + Kind = N.Kind; + return *this; + } + + DDGNode &operator=(DDGNode &&N) { + DGNode::operator=(std::move(N)); + Kind = N.Kind; + return *this; + } + + /// Getter for the kind of this node. + NodeKind getKind() const { return Kind; } + + /// Collect a list of instructions, in \p IList, for which predicate \p Pred + /// evaluates to true when iterating over instructions of this node. Return + /// true if at least one instruction was collected, and false otherwise. + bool collectInstructions(llvm::function_ref const &Pred, + InstructionListType &IList) const; + +protected: + /// Setter for the kind of this node. + void setKind(NodeKind K) { Kind = K; } + +private: + NodeKind Kind; +}; + +/// Subclass of DDGNode representing the root node of the graph. +/// There should only be one such node in a given graph. +class RootDDGNode : public DDGNode { +public: + RootDDGNode() : DDGNode(NodeKind::Root) {} + RootDDGNode(const RootDDGNode &N) = delete; + RootDDGNode(RootDDGNode &&N) : DDGNode(std::move(N)) {} + ~RootDDGNode() {} + + /// Define classof to be able to use isa<>, cast<>, dyn_cast<>, etc. + static bool classof(const DDGNode *N) { + return N->getKind() == NodeKind::Root; + } + static bool classof(const RootDDGNode *N) { return true; } +}; + +/// Subclass of DDGNode representing single or multi-instruction nodes. +class SimpleDDGNode : public DDGNode { +public: + SimpleDDGNode() = delete; + SimpleDDGNode(Instruction &I); + SimpleDDGNode(const SimpleDDGNode &N); + SimpleDDGNode(SimpleDDGNode &&N); + ~SimpleDDGNode(); + + SimpleDDGNode &operator=(const SimpleDDGNode &N) { + DDGNode::operator=(N); + InstList = N.InstList; + return *this; + } + + SimpleDDGNode &operator=(SimpleDDGNode &&N) { + DDGNode::operator=(std::move(N)); + InstList = std::move(N.InstList); + return *this; + } + + /// Get the list of instructions in this node. + const InstructionListType &getInstructions() const { + assert(!InstList.empty() && "Instruction List is empty."); + return InstList; + } + InstructionListType &getInstructions() { + return const_cast( + static_cast(this)->getInstructions()); + } + + /// Get the first/last instruction in the node. + Instruction *getFirstInstruction() const { return getInstructions().front(); } + Instruction *getLastInstruction() const { return getInstructions().back(); } + + /// Define classof to be able to use isa<>, cast<>, dyn_cast<>, etc. + static bool classof(const DDGNode *N) { + return N->getKind() == NodeKind::SingleInstruction || + N->getKind() == NodeKind::MultiInstruction; + } + static bool classof(const SimpleDDGNode *N) { return true; } + +private: + /// Append the list of instructions in \p Input to this node. + void appendInstructions(const InstructionListType &Input) { + setKind((InstList.size() == 0 && Input.size() == 1) + ? 
NodeKind::SingleInstruction + : NodeKind::MultiInstruction); + InstList.insert(InstList.end(), Input.begin(), Input.end()); + } + void appendInstructions(const SimpleDDGNode &Input) { + appendInstructions(Input.getInstructions()); + } + + /// List of instructions associated with a single or multi-instruction node. + SmallVector InstList; +}; + +/// Data Dependency Graph Edge. +/// An edge in the DDG can represent a def-use relationship or +/// a memory dependence based on the result of DependenceAnalysis. +/// A rooted edge connects the root node to one of the components +/// of the graph. +class DDGEdge : public DDGEdgeBase { +public: + /// The kind of edge in the DDG + enum class EdgeKind { Unknown, RegisterDefUse, MemoryDependence, Rooted }; + + explicit DDGEdge(DDGNode &N) = delete; + DDGEdge(DDGNode &N, EdgeKind K) : DDGEdgeBase(N), Kind(K) {} + DDGEdge(const DDGEdge &E) : DDGEdgeBase(E), Kind(E.getKind()) {} + DDGEdge(DDGEdge &&E) : DDGEdgeBase(std::move(E)), Kind(E.Kind) {} + DDGEdge &operator=(const DDGEdge &E) { + DDGEdgeBase::operator=(E); + Kind = E.Kind; + return *this; + } + + DDGEdge &operator=(DDGEdge &&E) { + DDGEdgeBase::operator=(std::move(E)); + Kind = E.Kind; + return *this; + } + + /// Get the edge kind + EdgeKind getKind() const { return Kind; }; + + /// Return true if this is a def-use edge, and false otherwise. + bool isDefUse() const { return Kind == EdgeKind::RegisterDefUse; } + + /// Return true if this is a memory dependence edge, and false otherwise. + bool isMemoryDependence() const { return Kind == EdgeKind::MemoryDependence; } + + /// Return true if this is an edge stemming from the root node, and false + /// otherwise. + bool isRooted() const { return Kind == EdgeKind::Rooted; } + +private: + EdgeKind Kind; +}; + +/// Encapsulate some common data and functionality needed for different +/// variations of data dependence graphs. +template class DependenceGraphInfo { +public: + using DependenceList = SmallVector, 1>; + + DependenceGraphInfo() = delete; + DependenceGraphInfo(const DependenceGraphInfo &G) = delete; + DependenceGraphInfo(const std::string &N, const DependenceInfo &DepInfo) + : Name(N), DI(DepInfo), Root(nullptr) {} + DependenceGraphInfo(DependenceGraphInfo &&G) + : Name(std::move(G.Name)), DI(std::move(G.DI)), Root(G.Root) {} + virtual ~DependenceGraphInfo() {} + + /// Return the label that is used to name this graph. + const StringRef getName() const { return Name; } + + /// Return the root node of the graph. + NodeType &getRoot() const { + assert(Root && "Root node is not available yet. Graph construction may " + "still be in progress\n"); + return *Root; + } + +protected: + // Name of the graph. + std::string Name; + + // Store a copy of DependenceInfo in the graph, so that individual memory + // dependencies don't need to be stored. Instead when the dependence is + // queried it is recomputed using @DI. + const DependenceInfo DI; + + // A special node in the graph that has an edge to every connected component of + // the graph, to ensure all nodes are reachable in a graph walk. 
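The edge kinds above can be queried in the same way. A small sketch, using only the API declared in this header, that counts the memory-dependence edges leaving a node; the helper itself is hypothetical:

#include "llvm/Analysis/DDG.h"

using namespace llvm;

// Hypothetical helper: a DDGNode iterates over its outgoing DDGEdge objects,
// so the edge-kind predicates can be used to filter them.
static unsigned countMemoryDepEdges(const DDGNode &N) {
  unsigned Count = 0;
  for (const DDGEdge *E : N)
    if (E->isMemoryDependence())
      ++Count;
  return Count;
}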
+ NodeType *Root = nullptr; +}; + +using DDGInfo = DependenceGraphInfo; + +/// Data Dependency Graph +class DataDependenceGraph : public DDGBase, public DDGInfo { + friend class DDGBuilder; + +public: + using NodeType = DDGNode; + using EdgeType = DDGEdge; + + DataDependenceGraph() = delete; + DataDependenceGraph(const DataDependenceGraph &G) = delete; + DataDependenceGraph(DataDependenceGraph &&G) + : DDGBase(std::move(G)), DDGInfo(std::move(G)) {} + DataDependenceGraph(Function &F, DependenceInfo &DI); + DataDependenceGraph(const Loop &L, DependenceInfo &DI); + ~DataDependenceGraph(); + +protected: + /// Add node \p N to the graph, if it's not added yet, and keep track of + /// the root node. Return true if node is successfully added. + bool addNode(NodeType &N); + +}; + +/// Concrete implementation of a pure data dependence graph builder. This class +/// provides custom implementation for the pure-virtual functions used in the +/// generic dependence graph build algorithm. +/// +/// For information about time complexity of the build algorithm see the +/// comments near the declaration of AbstractDependenceGraphBuilder. +class DDGBuilder : public AbstractDependenceGraphBuilder { +public: + DDGBuilder(DataDependenceGraph &G, DependenceInfo &D, + const BasicBlockListType &BBs) + : AbstractDependenceGraphBuilder(G, D, BBs) {} + DDGNode &createRootNode() final override { + auto *RN = new RootDDGNode(); + assert(RN && "Failed to allocate memory for DDG root node."); + Graph.addNode(*RN); + return *RN; + } + DDGNode &createFineGrainedNode(Instruction &I) final override { + auto *SN = new SimpleDDGNode(I); + assert(SN && "Failed to allocate memory for simple DDG node."); + Graph.addNode(*SN); + return *SN; + } + DDGEdge &createDefUseEdge(DDGNode &Src, DDGNode &Tgt) final override { + auto *E = new DDGEdge(Tgt, DDGEdge::EdgeKind::RegisterDefUse); + assert(E && "Failed to allocate memory for edge"); + Graph.connect(Src, Tgt, *E); + return *E; + } + DDGEdge &createMemoryEdge(DDGNode &Src, DDGNode &Tgt) final override { + auto *E = new DDGEdge(Tgt, DDGEdge::EdgeKind::MemoryDependence); + assert(E && "Failed to allocate memory for edge"); + Graph.connect(Src, Tgt, *E); + return *E; + } + DDGEdge &createRootedEdge(DDGNode &Src, DDGNode &Tgt) final override { + auto *E = new DDGEdge(Tgt, DDGEdge::EdgeKind::Rooted); + assert(E && "Failed to allocate memory for edge"); + assert(isa(Src) && "Expected root node"); + Graph.connect(Src, Tgt, *E); + return *E; + } + +}; + +raw_ostream &operator<<(raw_ostream &OS, const DDGNode &N); +raw_ostream &operator<<(raw_ostream &OS, const DDGNode::NodeKind K); +raw_ostream &operator<<(raw_ostream &OS, const DDGEdge &E); +raw_ostream &operator<<(raw_ostream &OS, const DDGEdge::EdgeKind K); +raw_ostream &operator<<(raw_ostream &OS, const DataDependenceGraph &G); + +//===--------------------------------------------------------------------===// +// DDG Analysis Passes +//===--------------------------------------------------------------------===// + +/// Analysis pass that builds the DDG for a loop. +class DDGAnalysis : public AnalysisInfoMixin { +public: + using Result = std::unique_ptr; + Result run(Loop &L, LoopAnalysisManager &AM, LoopStandardAnalysisResults &AR); + +private: + friend AnalysisInfoMixin; + static AnalysisKey Key; +}; + +/// Textual printer pass for the DDG of a loop. 
+class DDGAnalysisPrinterPass : public PassInfoMixin { +public: + explicit DDGAnalysisPrinterPass(raw_ostream &OS) : OS(OS) {} + PreservedAnalyses run(Loop &L, LoopAnalysisManager &AM, + LoopStandardAnalysisResults &AR, LPMUpdater &U); + +private: + raw_ostream &OS; +}; + +//===--------------------------------------------------------------------===// +// GraphTraits specializations for the DDG +//===--------------------------------------------------------------------===// + +/// non-const versions of the grapth trait specializations for DDG +template <> struct GraphTraits { + using NodeRef = DDGNode *; + + static DDGNode *DDGGetTargetNode(DGEdge *P) { + return &P->getTargetNode(); + } + + // Provide a mapped iterator so that the GraphTrait-based implementations can + // find the target nodes without having to explicitly go through the edges. + using ChildIteratorType = + mapped_iterator; + using ChildEdgeIteratorType = DDGNode::iterator; + + static NodeRef getEntryNode(NodeRef N) { return N; } + static ChildIteratorType child_begin(NodeRef N) { + return ChildIteratorType(N->begin(), &DDGGetTargetNode); + } + static ChildIteratorType child_end(NodeRef N) { + return ChildIteratorType(N->end(), &DDGGetTargetNode); + } + + static ChildEdgeIteratorType child_edge_begin(NodeRef N) { + return N->begin(); + } + static ChildEdgeIteratorType child_edge_end(NodeRef N) { return N->end(); } +}; + +template <> +struct GraphTraits : public GraphTraits { + using nodes_iterator = DataDependenceGraph::iterator; + static NodeRef getEntryNode(DataDependenceGraph *DG) { + return &DG->getRoot(); + } + static nodes_iterator nodes_begin(DataDependenceGraph *DG) { + return DG->begin(); + } + static nodes_iterator nodes_end(DataDependenceGraph *DG) { return DG->end(); } +}; + +/// const versions of the grapth trait specializations for DDG +template <> struct GraphTraits { + using NodeRef = const DDGNode *; + + static const DDGNode *DDGGetTargetNode(const DGEdge *P) { + return &P->getTargetNode(); + } + + // Provide a mapped iterator so that the GraphTrait-based implementations can + // find the target nodes without having to explicitly go through the edges. 
+ using ChildIteratorType = + mapped_iterator; + using ChildEdgeIteratorType = DDGNode::const_iterator; + + static NodeRef getEntryNode(NodeRef N) { return N; } + static ChildIteratorType child_begin(NodeRef N) { + return ChildIteratorType(N->begin(), &DDGGetTargetNode); + } + static ChildIteratorType child_end(NodeRef N) { + return ChildIteratorType(N->end(), &DDGGetTargetNode); + } + + static ChildEdgeIteratorType child_edge_begin(NodeRef N) { + return N->begin(); + } + static ChildEdgeIteratorType child_edge_end(NodeRef N) { return N->end(); } +}; + +template <> +struct GraphTraits + : public GraphTraits { + using nodes_iterator = DataDependenceGraph::const_iterator; + static NodeRef getEntryNode(const DataDependenceGraph *DG) { + return &DG->getRoot(); + } + static nodes_iterator nodes_begin(const DataDependenceGraph *DG) { + return DG->begin(); + } + static nodes_iterator nodes_end(const DataDependenceGraph *DG) { + return DG->end(); + } +}; + +} // namespace llvm + +#endif // LLVM_ANALYSIS_DDG_H diff --git a/include/llvm/Analysis/DOTGraphTraitsPass.h b/include/llvm/Analysis/DOTGraphTraitsPass.h index 0410a3314659..c9e8df5db1c2 100644 --- a/include/llvm/Analysis/DOTGraphTraitsPass.h +++ b/include/llvm/Analysis/DOTGraphTraitsPass.h @@ -99,7 +99,7 @@ public: errs() << "Writing '" << Filename << "'..."; - raw_fd_ostream File(Filename, EC, sys::fs::F_Text); + raw_fd_ostream File(Filename, EC, sys::fs::OF_Text); std::string GraphName = DOTGraphTraits::getGraphName(Graph); std::string Title = GraphName + " for '" + F.getName().str() + "' function"; @@ -162,7 +162,7 @@ public: errs() << "Writing '" << Filename << "'..."; - raw_fd_ostream File(Filename, EC, sys::fs::F_Text); + raw_fd_ostream File(Filename, EC, sys::fs::OF_Text); std::string Title = DOTGraphTraits::getGraphName(Graph); if (!EC) diff --git a/include/llvm/Analysis/DependenceGraphBuilder.h b/include/llvm/Analysis/DependenceGraphBuilder.h new file mode 100644 index 000000000000..5f4bdb47043b --- /dev/null +++ b/include/llvm/Analysis/DependenceGraphBuilder.h @@ -0,0 +1,119 @@ +//===- llvm/Analysis/DependenceGraphBuilder.h -------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines a builder interface that can be used to populate dependence +// graphs such as DDG and PDG. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_ANALYSIS_DEPENDENCE_GRAPH_BUILDER_H +#define LLVM_ANALYSIS_DEPENDENCE_GRAPH_BUILDER_H + +#include "llvm/ADT/EquivalenceClasses.h" +#include "llvm/Analysis/DependenceAnalysis.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/Instructions.h" + +namespace llvm { + +/// This abstract builder class defines a set of high-level steps for creating +/// DDG-like graphs. The client code is expected to inherit from this class and +/// define concrete implementation for each of the pure virtual functions used +/// in the high-level algorithm. 
+template class AbstractDependenceGraphBuilder { +protected: + using BasicBlockListType = SmallVectorImpl; + +private: + using NodeType = typename GraphType::NodeType; + using EdgeType = typename GraphType::EdgeType; + +public: + using ClassesType = EquivalenceClasses; + using NodeListType = SmallVector; + + AbstractDependenceGraphBuilder(GraphType &G, DependenceInfo &D, + const BasicBlockListType &BBs) + : Graph(G), DI(D), BBList(BBs) {} + virtual ~AbstractDependenceGraphBuilder() {} + + /// The main entry to the graph construction algorithm. It starts by + /// creating nodes in increasing order of granularity and then + /// adds def-use and memory edges. + /// + /// The algorithmic complexity of this implementation is O(V^2 * I^2), where V + /// is the number of vertecies (nodes) and I is the number of instructions in + /// each node. The total number of instructions, N, is equal to V * I, + /// therefore the worst-case time complexity is O(N^2). The average time + /// complexity is O((N^2)/2). + void populate() { + createFineGrainedNodes(); + createDefUseEdges(); + createMemoryDependencyEdges(); + createAndConnectRootNode(); + } + + /// Create fine grained nodes. These are typically atomic nodes that + /// consist of a single instruction. + void createFineGrainedNodes(); + + /// Analyze the def-use chains and create edges from the nodes containing + /// definitions to the nodes containing the uses. + void createDefUseEdges(); + + /// Analyze data dependencies that exist between memory loads or stores, + /// in the graph nodes and create edges between them. + void createMemoryDependencyEdges(); + + /// Create a root node and add edges such that each node in the graph is + /// reachable from the root. + void createAndConnectRootNode(); + +protected: + /// Create the root node of the graph. + virtual NodeType &createRootNode() = 0; + + /// Create an atomic node in the graph given a single instruction. + virtual NodeType &createFineGrainedNode(Instruction &I) = 0; + + /// Create a def-use edge going from \p Src to \p Tgt. + virtual EdgeType &createDefUseEdge(NodeType &Src, NodeType &Tgt) = 0; + + /// Create a memory dependence edge going from \p Src to \p Tgt. + virtual EdgeType &createMemoryEdge(NodeType &Src, NodeType &Tgt) = 0; + + /// Create a rooted edge going from \p Src to \p Tgt . + virtual EdgeType &createRootedEdge(NodeType &Src, NodeType &Tgt) = 0; + + /// Deallocate memory of edge \p E. + virtual void destroyEdge(EdgeType &E) { delete &E; } + + /// Deallocate memory of node \p N. + virtual void destroyNode(NodeType &N) { delete &N; } + + /// Map types to map instructions to nodes used when populating the graph. + using InstToNodeMap = DenseMap; + + /// Reference to the graph that gets built by a concrete implementation of + /// this builder. + GraphType &Graph; + + /// Dependence information used to create memory dependence edges in the + /// graph. + DependenceInfo &DI; + + /// The list of basic blocks to consider when building the graph. + const BasicBlockListType &BBList; + + /// A mapping from instructions to the corresponding nodes in the graph. 
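The populate() sequence documented above is what the concrete graph classes run internally. Below is a sketch of driving it through the DDGBuilder declared earlier in DDG.h; in-tree this is done by DataDependenceGraph's own constructors, so the free function and its parameters are assumptions for illustration only:

#include "llvm/Analysis/DDG.h"

using namespace llvm;

// Hypothetical driver mirroring what DataDependenceGraph's constructors do:
// create fine-grained nodes, add def-use and memory-dependence edges, then
// connect the root node so that every node is reachable from it.
static void buildDDG(DataDependenceGraph &G, DependenceInfo &DI,
                     const SmallVectorImpl<BasicBlock *> &BBs) {
  DDGBuilder(G, DI, BBs).populate();
}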
+ InstToNodeMap IMap; +}; + +} // namespace llvm + +#endif // LLVM_ANALYSIS_DEPENDENCE_GRAPH_BUILDER_H diff --git a/include/llvm/Analysis/DivergenceAnalysis.h b/include/llvm/Analysis/DivergenceAnalysis.h index 3cfb9d13df94..2fac9c8b4b34 100644 --- a/include/llvm/Analysis/DivergenceAnalysis.h +++ b/include/llvm/Analysis/DivergenceAnalysis.h @@ -73,9 +73,12 @@ public: /// operands bool isAlwaysUniform(const Value &Val) const; - /// \brief Whether \p Val is a divergent value + /// \brief Whether \p Val is divergent at its definition. bool isDivergent(const Value &Val) const; + /// \brief Whether \p U is divergent. Uses of a uniform value can be divergent. + bool isDivergentUse(const Use &U) const; + void print(raw_ostream &OS, const Module *) const; private: @@ -189,12 +192,19 @@ public: /// The GPU kernel this analysis result is for const Function &getFunction() const { return DA.getFunction(); } - /// Whether \p V is divergent. + /// Whether \p V is divergent at its definition. bool isDivergent(const Value &V) const; - /// Whether \p V is uniform/non-divergent + /// Whether \p U is divergent. Uses of a uniform value can be divergent. + bool isDivergentUse(const Use &U) const; + + /// Whether \p V is uniform/non-divergent. bool isUniform(const Value &V) const { return !isDivergent(V); } + /// Whether \p U is uniform/non-divergent. Uses of a uniform value can be + /// divergent. + bool isUniformUse(const Use &U) const { return !isDivergentUse(U); } + /// Print all divergent values in the kernel. void print(raw_ostream &OS, const Module *) const; }; diff --git a/include/llvm/Analysis/GlobalsModRef.h b/include/llvm/Analysis/GlobalsModRef.h index d3fcfc2d41ab..5d1c5a05206a 100644 --- a/include/llvm/Analysis/GlobalsModRef.h +++ b/include/llvm/Analysis/GlobalsModRef.h @@ -34,7 +34,7 @@ class GlobalsAAResult : public AAResultBase { class FunctionInfo; const DataLayout &DL; - const TargetLibraryInfo &TLI; + std::function GetTLI; /// The globals that do not have their addresses taken. SmallPtrSet NonAddressTakenGlobals; @@ -72,14 +72,18 @@ class GlobalsAAResult : public AAResultBase { /// could perform to the memory utilization here if this becomes a problem. std::list Handles; - explicit GlobalsAAResult(const DataLayout &DL, const TargetLibraryInfo &TLI); + explicit GlobalsAAResult( + const DataLayout &DL, + std::function GetTLI); public: GlobalsAAResult(GlobalsAAResult &&Arg); ~GlobalsAAResult(); - static GlobalsAAResult analyzeModule(Module &M, const TargetLibraryInfo &TLI, - CallGraph &CG); + static GlobalsAAResult + analyzeModule(Module &M, + std::function GetTLI, + CallGraph &CG); //------------------------------------------------ // Implement the AliasAnalysis API diff --git a/include/llvm/Analysis/InstructionSimplify.h b/include/llvm/Analysis/InstructionSimplify.h index 054ffca7215e..a5ffca13046b 100644 --- a/include/llvm/Analysis/InstructionSimplify.h +++ b/include/llvm/Analysis/InstructionSimplify.h @@ -31,6 +31,7 @@ #ifndef LLVM_ANALYSIS_INSTRUCTIONSIMPLIFY_H #define LLVM_ANALYSIS_INSTRUCTIONSIMPLIFY_H +#include "llvm/ADT/SetVector.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Operator.h" #include "llvm/IR/User.h" @@ -141,6 +142,13 @@ Value *SimplifyFSubInst(Value *LHS, Value *RHS, FastMathFlags FMF, Value *SimplifyFMulInst(Value *LHS, Value *RHS, FastMathFlags FMF, const SimplifyQuery &Q); +/// Given operands for the multiplication of a FMA, fold the result or return +/// null. 
In contrast to SimplifyFMulInst, this function will not perform +/// simplifications whose unrounded results differ when rounded to the argument +/// type. +Value *SimplifyFMAFMul(Value *LHS, Value *RHS, FastMathFlags FMF, + const SimplifyQuery &Q); + /// Given operands for a Mul, fold the result or return null. Value *SimplifyMulInst(Value *LHS, Value *RHS, const SimplifyQuery &Q); @@ -234,21 +242,19 @@ Value *SimplifyCmpInst(unsigned Predicate, Value *LHS, Value *RHS, /// Given operand for a UnaryOperator, fold the result or return null. Value *SimplifyUnOp(unsigned Opcode, Value *Op, const SimplifyQuery &Q); -/// Given operand for an FP UnaryOperator, fold the result or return null. -/// In contrast to SimplifyUnOp, try to use FastMathFlag when folding the -/// result. In case we don't need FastMathFlags, simply fall to SimplifyUnOp. -Value *SimplifyFPUnOp(unsigned Opcode, Value *Op, FastMathFlags FMF, - const SimplifyQuery &Q); +/// Given operand for a UnaryOperator, fold the result or return null. +/// Try to use FastMathFlags when folding the result. +Value *SimplifyUnOp(unsigned Opcode, Value *Op, FastMathFlags FMF, + const SimplifyQuery &Q); /// Given operands for a BinaryOperator, fold the result or return null. Value *SimplifyBinOp(unsigned Opcode, Value *LHS, Value *RHS, const SimplifyQuery &Q); -/// Given operands for an FP BinaryOperator, fold the result or return null. -/// In contrast to SimplifyBinOp, try to use FastMathFlag when folding the -/// result. In case we don't need FastMathFlags, simply fall to SimplifyBinOp. -Value *SimplifyFPBinOp(unsigned Opcode, Value *LHS, Value *RHS, - FastMathFlags FMF, const SimplifyQuery &Q); +/// Given operands for a BinaryOperator, fold the result or return null. +/// Try to use FastMathFlags when folding the result. +Value *SimplifyBinOp(unsigned Opcode, Value *LHS, Value *RHS, + FastMathFlags FMF, const SimplifyQuery &Q); /// Given a callsite, fold the result or return null. Value *SimplifyCall(CallBase *Call, const SimplifyQuery &Q); @@ -263,12 +269,14 @@ Value *SimplifyInstruction(Instruction *I, const SimplifyQuery &Q, /// This first performs a normal RAUW of I with SimpleV. It then recursively /// attempts to simplify those users updated by the operation. The 'I' /// instruction must not be equal to the simplified value 'SimpleV'. +/// If UnsimplifiedUsers is provided, instructions that could not be simplified +/// are added to it. /// /// The function returns true if any simplifications were performed. -bool replaceAndRecursivelySimplify(Instruction *I, Value *SimpleV, - const TargetLibraryInfo *TLI = nullptr, - const DominatorTree *DT = nullptr, - AssumptionCache *AC = nullptr); +bool replaceAndRecursivelySimplify( + Instruction *I, Value *SimpleV, const TargetLibraryInfo *TLI = nullptr, + const DominatorTree *DT = nullptr, AssumptionCache *AC = nullptr, + SmallSetVector *UnsimplifiedUsers = nullptr); /// Recursively attempt to simplify an instruction. /// diff --git a/include/llvm/Analysis/LazyCallGraph.h b/include/llvm/Analysis/LazyCallGraph.h index 2d83929211e2..20a35bef189b 100644 --- a/include/llvm/Analysis/LazyCallGraph.h +++ b/include/llvm/Analysis/LazyCallGraph.h @@ -931,7 +931,8 @@ public: /// This sets up the graph and computes all of the entry points of the graph. /// No function definitions are scanned until their nodes in the graph are /// requested during traversal. 
- LazyCallGraph(Module &M, TargetLibraryInfo &TLI); + LazyCallGraph(Module &M, + function_ref GetTLI); LazyCallGraph(LazyCallGraph &&G); LazyCallGraph &operator=(LazyCallGraph &&RHS); @@ -1267,7 +1268,12 @@ public: /// This just builds the set of entry points to the call graph. The rest is /// built lazily as it is walked. LazyCallGraph run(Module &M, ModuleAnalysisManager &AM) { - return LazyCallGraph(M, AM.getResult(M)); + FunctionAnalysisManager &FAM = + AM.getResult(M).getManager(); + auto GetTLI = [&FAM](Function &F) -> TargetLibraryInfo & { + return FAM.getResult(F); + }; + return LazyCallGraph(M, GetTLI); } }; diff --git a/include/llvm/Analysis/LegacyDivergenceAnalysis.h b/include/llvm/Analysis/LegacyDivergenceAnalysis.h index 0a338b816640..e33b8f4129f3 100644 --- a/include/llvm/Analysis/LegacyDivergenceAnalysis.h +++ b/include/llvm/Analysis/LegacyDivergenceAnalysis.h @@ -39,17 +39,18 @@ public: void print(raw_ostream &OS, const Module *) const override; // Returns true if V is divergent at its definition. - // - // Even if this function returns false, V may still be divergent when used - // in a different basic block. bool isDivergent(const Value *V) const; + // Returns true if U is divergent. Uses of a uniform value can be divergent. + bool isDivergentUse(const Use *U) const; + // Returns true if V is uniform/non-divergent. - // - // Even if this function returns true, V may still be divergent when used - // in a different basic block. bool isUniform(const Value *V) const { return !isDivergent(V); } + // Returns true if U is uniform/non-divergent. Uses of a uniform value can be + // divergent. + bool isUniformUse(const Use *U) const { return !isDivergentUse(U); } + // Keep the analysis results uptodate by removing an erased value. void removeValue(const Value *V) { DivergentValues.erase(V); } @@ -62,6 +63,9 @@ private: // Stores all divergent values. DenseSet DivergentValues; + + // Stores divergent uses of possibly uniform values. + DenseSet DivergentUses; }; } // End llvm namespace diff --git a/include/llvm/Analysis/Loads.h b/include/llvm/Analysis/Loads.h index 5df6bb02308d..9604b2521e89 100644 --- a/include/llvm/Analysis/Loads.h +++ b/include/llvm/Analysis/Loads.h @@ -20,7 +20,9 @@ namespace llvm { class DataLayout; +class Loop; class MDNode; +class ScalarEvolution; /// Return true if this is always a dereferenceable pointer. If the context /// instruction is specified perform context-sensitive analysis and return true @@ -35,7 +37,8 @@ bool isDereferenceablePointer(const Value *V, Type *Ty, /// performs context-sensitive analysis and returns true if the pointer is /// dereferenceable at the specified instruction. bool isDereferenceableAndAlignedPointer(const Value *V, Type *Ty, - unsigned Align, const DataLayout &DL, + MaybeAlign Alignment, + const DataLayout &DL, const Instruction *CtxI = nullptr, const DominatorTree *DT = nullptr); @@ -43,7 +46,7 @@ bool isDereferenceableAndAlignedPointer(const Value *V, Type *Ty, /// greater or equal than requested. If the context instruction is specified /// performs context-sensitive analysis and returns true if the pointer is /// dereferenceable at the specified instruction. 
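The dereferenceability helpers above now take the explicit Align/MaybeAlign types rather than a raw unsigned. A hedged usage sketch of the Type-based overload shown above; the helper name and the 4-byte alignment are illustrative assumptions:

#include "llvm/Analysis/Loads.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/Alignment.h"

using namespace llvm;

// Hypothetical helper: is a 4-byte-aligned i32 load through Ptr known to be
// dereferenceable regardless of context (no CtxI/DT passed)?
static bool isSafeI32Deref(const Value *Ptr, const DataLayout &DL) {
  Type *Int32Ty = Type::getInt32Ty(Ptr->getContext());
  return isDereferenceableAndAlignedPointer(Ptr, Int32Ty, MaybeAlign(4), DL);
}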
-bool isDereferenceableAndAlignedPointer(const Value *V, unsigned Align, +bool isDereferenceableAndAlignedPointer(const Value *V, Align Alignment, const APInt &Size, const DataLayout &DL, const Instruction *CtxI = nullptr, const DominatorTree *DT = nullptr); @@ -56,11 +59,22 @@ bool isDereferenceableAndAlignedPointer(const Value *V, unsigned Align, /// If it is not obviously safe to load from the specified pointer, we do a /// quick local scan of the basic block containing ScanFrom, to determine if /// the address is already accessed. -bool isSafeToLoadUnconditionally(Value *V, unsigned Align, APInt &Size, +bool isSafeToLoadUnconditionally(Value *V, MaybeAlign Alignment, APInt &Size, const DataLayout &DL, Instruction *ScanFrom = nullptr, const DominatorTree *DT = nullptr); +/// Return true if we can prove that the given load (which is assumed to be +/// within the specified loop) would access only dereferenceable memory, and +/// be properly aligned on every iteration of the specified loop regardless of +/// its placement within the loop. (i.e. does not require predication beyond +/// that required by the the header itself and could be hoisted into the header +/// if desired.) This is more powerful than the variants above when the +/// address loaded from is analyzeable by SCEV. +bool isDereferenceableAndAlignedInLoop(LoadInst *LI, Loop *L, + ScalarEvolution &SE, + DominatorTree &DT); + /// Return true if we know that executing a load from this value cannot trap. /// /// If DT and ScanFrom are specified this method performs context-sensitive @@ -69,7 +83,7 @@ bool isSafeToLoadUnconditionally(Value *V, unsigned Align, APInt &Size, /// If it is not obviously safe to load from the specified pointer, we do a /// quick local scan of the basic block containing ScanFrom, to determine if /// the address is already accessed. -bool isSafeToLoadUnconditionally(Value *V, Type *Ty, unsigned Align, +bool isSafeToLoadUnconditionally(Value *V, Type *Ty, MaybeAlign Alignment, const DataLayout &DL, Instruction *ScanFrom = nullptr, const DominatorTree *DT = nullptr); diff --git a/include/llvm/Analysis/LoopAnalysisManager.h b/include/llvm/Analysis/LoopAnalysisManager.h index 368a810cfa67..a2e65a7310af 100644 --- a/include/llvm/Analysis/LoopAnalysisManager.h +++ b/include/llvm/Analysis/LoopAnalysisManager.h @@ -86,8 +86,9 @@ typedef InnerAnalysisManagerProxy template <> class LoopAnalysisManagerFunctionProxy::Result { public: explicit Result(LoopAnalysisManager &InnerAM, LoopInfo &LI) - : InnerAM(&InnerAM), LI(&LI) {} - Result(Result &&Arg) : InnerAM(std::move(Arg.InnerAM)), LI(Arg.LI) { + : InnerAM(&InnerAM), LI(&LI), MSSAUsed(false) {} + Result(Result &&Arg) + : InnerAM(std::move(Arg.InnerAM)), LI(Arg.LI), MSSAUsed(Arg.MSSAUsed) { // We have to null out the analysis manager in the moved-from state // because we are taking ownership of the responsibilty to clear the // analysis state. @@ -96,6 +97,7 @@ public: Result &operator=(Result &&RHS) { InnerAM = RHS.InnerAM; LI = RHS.LI; + MSSAUsed = RHS.MSSAUsed; // We have to null out the analysis manager in the moved-from state // because we are taking ownership of the responsibilty to clear the // analysis state. @@ -112,6 +114,9 @@ public: InnerAM->clear(); } + /// Mark MemorySSA as used so we can invalidate self if MSSA is invalidated. + void markMSSAUsed() { MSSAUsed = true; } + /// Accessor for the analysis manager. 
LoopAnalysisManager &getManager() { return *InnerAM; } @@ -130,6 +135,7 @@ public: private: LoopAnalysisManager *InnerAM; LoopInfo *LI; + bool MSSAUsed; }; /// Provide a specialized run method for the \c LoopAnalysisManagerFunctionProxy diff --git a/include/llvm/Analysis/LoopCacheAnalysis.h b/include/llvm/Analysis/LoopCacheAnalysis.h new file mode 100644 index 000000000000..ffec78b6db2c --- /dev/null +++ b/include/llvm/Analysis/LoopCacheAnalysis.h @@ -0,0 +1,281 @@ +//===- llvm/Analysis/LoopCacheAnalysis.h ------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines the interface for the loop cache analysis. +/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_ANALYSIS_LOOPCACHEANALYSIS_H +#define LLVM_ANALYSIS_LOOPCACHEANALYSIS_H + +#include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/DependenceAnalysis.h" +#include "llvm/Analysis/LoopAnalysisManager.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/IR/Instructions.h" +#include "llvm/Pass.h" +#include "llvm/Support/raw_ostream.h" + +namespace llvm { + +class LPMUpdater; +using CacheCostTy = int64_t; +using LoopVectorTy = SmallVector; + +/// Represents a memory reference as a base pointer and a set of indexing +/// operations. For example given the array reference A[i][2j+1][3k+2] in a +/// 3-dim loop nest: +/// for(i=0;i A +/// Subscripts -> [{0,+,1}<%for.i>][{1,+,2}<%for.j>][{2,+,3}<%for.k>] +/// Sizes -> [m][o][4] +class IndexedReference { + friend raw_ostream &operator<<(raw_ostream &OS, const IndexedReference &R); + +public: + /// Construct an indexed reference given a \p StoreOrLoadInst instruction. + IndexedReference(Instruction &StoreOrLoadInst, const LoopInfo &LI, + ScalarEvolution &SE); + + bool isValid() const { return IsValid; } + const SCEV *getBasePointer() const { return BasePointer; } + size_t getNumSubscripts() const { return Subscripts.size(); } + const SCEV *getSubscript(unsigned SubNum) const { + assert(SubNum < getNumSubscripts() && "Invalid subscript number"); + return Subscripts[SubNum]; + } + const SCEV *getFirstSubscript() const { + assert(!Subscripts.empty() && "Expecting non-empty container"); + return Subscripts.front(); + } + const SCEV *getLastSubscript() const { + assert(!Subscripts.empty() && "Expecting non-empty container"); + return Subscripts.back(); + } + + /// Return true/false if the current object and the indexed reference \p Other + /// are/aren't in the same cache line of size \p CLS. Two references are in + /// the same chace line iff the distance between them in the innermost + /// dimension is less than the cache line size. Return None if unsure. + Optional hasSpacialReuse(const IndexedReference &Other, unsigned CLS, + AliasAnalysis &AA) const; + + /// Return true if the current object and the indexed reference \p Other + /// have distance smaller than \p MaxDistance in the dimension associated with + /// the given loop \p L. Return false if the distance is not smaller than \p + /// MaxDistance and None if unsure. 
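To make the reuse conditions above concrete: with a 64-byte cache line and 8-byte elements, A[i] and A[i+1] are 8 bytes apart in the innermost dimension, so they fall in the same cache line and exhibit spatial reuse; the same numbers plug into the (TripCount * stride) / cache_line_size reference-cost model described further down in this header. A tiny arithmetic sketch in which all concrete sizes are assumptions:

#include <cassert>
#include <cstdint>

int main() {
  const uint64_t CLS = 64;      // assumed cache line size in bytes
  const uint64_t ElemSize = 8;  // e.g. sizeof(double)

  // A[i] vs. A[i+1]: distance in the innermost dimension.
  const uint64_t Distance = 1 * ElemSize;
  assert(Distance < CLS && "same cache line, hence spatial reuse");

  // Cost of a consecutive reference: (TripCount * stride) / cache_line_size.
  const uint64_t TripCount = 1024;
  const uint64_t RefCost = (TripCount * ElemSize) / CLS;
  assert(RefCost == 128);       // 1024 * 8 / 64 cache lines touched
  return 0;
}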
+ Optional hasTemporalReuse(const IndexedReference &Other, + unsigned MaxDistance, const Loop &L, + DependenceInfo &DI, AliasAnalysis &AA) const; + + /// Compute the cost of the reference w.r.t. the given loop \p L when it is + /// considered in the innermost position in the loop nest. + /// The cost is defined as: + /// - equal to one if the reference is loop invariant, or + /// - equal to '(TripCount * stride) / cache_line_size' if: + /// + the reference stride is less than the cache line size, and + /// + the coefficient of this loop's index variable used in all other + /// subscripts is zero + /// - or otherwise equal to 'TripCount'. + CacheCostTy computeRefCost(const Loop &L, unsigned CLS) const; + +private: + /// Attempt to delinearize the indexed reference. + bool delinearize(const LoopInfo &LI); + + /// Return true if the index reference is invariant with respect to loop \p L. + bool isLoopInvariant(const Loop &L) const; + + /// Return true if the indexed reference is 'consecutive' in loop \p L. + /// An indexed reference is 'consecutive' if the only coefficient that uses + /// the loop induction variable is the rightmost one, and the access stride is + /// smaller than the cache line size \p CLS. + bool isConsecutive(const Loop &L, unsigned CLS) const; + + /// Return the coefficient used in the rightmost dimension. + const SCEV *getLastCoefficient() const; + + /// Return true if the coefficient corresponding to induction variable of + /// loop \p L in the given \p Subscript is zero or is loop invariant in \p L. + bool isCoeffForLoopZeroOrInvariant(const SCEV &Subscript, + const Loop &L) const; + + /// Verify that the given \p Subscript is 'well formed' (must be a simple add + /// recurrence). + bool isSimpleAddRecurrence(const SCEV &Subscript, const Loop &L) const; + + /// Return true if the given reference \p Other is definetely aliased with + /// the indexed reference represented by this class. + bool isAliased(const IndexedReference &Other, AliasAnalysis &AA) const; + +private: + /// True if the reference can be delinearized, false otherwise. + bool IsValid = false; + + /// Represent the memory reference instruction. + Instruction &StoreOrLoadInst; + + /// The base pointer of the memory reference. + const SCEV *BasePointer = nullptr; + + /// The subscript (indexes) of the memory reference. + SmallVector Subscripts; + + /// The dimensions of the memory reference. + SmallVector Sizes; + + ScalarEvolution &SE; +}; + +/// A reference group represents a set of memory references that exhibit +/// temporal or spacial reuse. Two references belong to the same +/// reference group with respect to a inner loop L iff: +/// 1. they have a loop independent dependency, or +/// 2. they have a loop carried dependence with a small dependence distance +/// (e.g. less than 2) carried by the inner loop, or +/// 3. they refer to the same array, and the subscript in their innermost +/// dimension is less than or equal to 'd' (where 'd' is less than the cache +/// line size) +/// +/// Intuitively a reference group represents memory references that access +/// the same cache line. Conditions 1,2 above account for temporal reuse, while +/// contition 3 accounts for spacial reuse. +using ReferenceGroupTy = SmallVector, 8>; +using ReferenceGroupsTy = SmallVector; + +/// \c CacheCost represents the estimated cost of a inner loop as the number of +/// cache lines used by the memory references it contains. 
+/// The 'cache cost' of a loop 'L' in a loop nest 'LN' is computed as the sum of +/// the cache costs of all of its reference groups when the loop is considered +/// to be in the innermost position in the nest. +/// A reference group represents memory references that fall into the same cache +/// line. Each reference group is analysed with respect to the innermost loop in +/// a loop nest. The cost of a reference is defined as follow: +/// - one if it is loop invariant w.r.t the innermost loop, +/// - equal to the loop trip count divided by the cache line times the +/// reference stride if the reference stride is less than the cache line +/// size (CLS), and the coefficient of this loop's index variable used in all +/// other subscripts is zero (e.g. RefCost = TripCount/(CLS/RefStride)) +/// - equal to the innermost loop trip count if the reference stride is greater +/// or equal to the cache line size CLS. +class CacheCost { + friend raw_ostream &operator<<(raw_ostream &OS, const CacheCost &CC); + using LoopTripCountTy = std::pair; + using LoopCacheCostTy = std::pair; + +public: + static CacheCostTy constexpr InvalidCost = -1; + + /// Construct a CacheCost object for the loop nest described by \p Loops. + /// The optional parameter \p TRT can be used to specify the max. distance + /// between array elements accessed in a loop so that the elements are + /// classified to have temporal reuse. + CacheCost(const LoopVectorTy &Loops, const LoopInfo &LI, ScalarEvolution &SE, + TargetTransformInfo &TTI, AliasAnalysis &AA, DependenceInfo &DI, + Optional TRT = None); + + /// Create a CacheCost for the loop nest rooted by \p Root. + /// The optional parameter \p TRT can be used to specify the max. distance + /// between array elements accessed in a loop so that the elements are + /// classified to have temporal reuse. + static std::unique_ptr + getCacheCost(Loop &Root, LoopStandardAnalysisResults &AR, DependenceInfo &DI, + Optional TRT = None); + + /// Return the estimated cost of loop \p L if the given loop is part of the + /// loop nest associated with this object. Return -1 otherwise. + CacheCostTy getLoopCost(const Loop &L) const { + auto IT = std::find_if( + LoopCosts.begin(), LoopCosts.end(), + [&L](const LoopCacheCostTy &LCC) { return LCC.first == &L; }); + return (IT != LoopCosts.end()) ? (*IT).second : -1; + } + + /// Return the estimated ordered loop costs. + const ArrayRef getLoopCosts() const { return LoopCosts; } + +private: + /// Calculate the cache footprint of each loop in the nest (when it is + /// considered to be in the innermost position). + void calculateCacheFootprint(); + + /// Partition store/load instructions in the loop nest into reference groups. + /// Two or more memory accesses belong in the same reference group if they + /// share the same cache line. + bool populateReferenceGroups(ReferenceGroupsTy &RefGroups) const; + + /// Calculate the cost of the given loop \p L assuming it is the innermost + /// loop in nest. + CacheCostTy computeLoopCacheCost(const Loop &L, + const ReferenceGroupsTy &RefGroups) const; + + /// Compute the cost of a representative reference in reference group \p RG + /// when the given loop \p L is considered as the innermost loop in the nest. + /// The computed cost is an estimate for the number of cache lines used by the + /// reference group. 
The representative reference cost is defined as: + /// - equal to one if the reference is loop invariant, or + /// - equal to '(TripCount * stride) / cache_line_size' if (a) loop \p L's + /// induction variable is used only in the reference subscript associated + /// with loop \p L, and (b) the reference stride is less than the cache + /// line size, or + /// - TripCount otherwise + CacheCostTy computeRefGroupCacheCost(const ReferenceGroupTy &RG, + const Loop &L) const; + + /// Sort the LoopCosts vector by decreasing cache cost. + void sortLoopCosts() { + sort(LoopCosts, [](const LoopCacheCostTy &A, const LoopCacheCostTy &B) { + return A.second > B.second; + }); + } + +private: + /// Loops in the loop nest associated with this object. + LoopVectorTy Loops; + + /// Trip counts for the loops in the loop nest associated with this object. + SmallVector TripCounts; + + /// Cache costs for the loops in the loop nest associated with this object. + SmallVector LoopCosts; + + /// The max. distance between array elements accessed in a loop so that the + /// elements are classified to have temporal reuse. + Optional TRT; + + const LoopInfo &LI; + ScalarEvolution &SE; + TargetTransformInfo &TTI; + AliasAnalysis &AA; + DependenceInfo &DI; +}; + +raw_ostream &operator<<(raw_ostream &OS, const IndexedReference &R); +raw_ostream &operator<<(raw_ostream &OS, const CacheCost &CC); + +/// Printer pass for the \c CacheCost results. +class LoopCachePrinterPass : public PassInfoMixin { + raw_ostream &OS; + +public: + explicit LoopCachePrinterPass(raw_ostream &OS) : OS(OS) {} + + PreservedAnalyses run(Loop &L, LoopAnalysisManager &AM, + LoopStandardAnalysisResults &AR, LPMUpdater &U); +}; + +} // namespace llvm + +#endif // LLVM_ANALYSIS_LOOPCACHEANALYSIS_H diff --git a/include/llvm/Analysis/LoopInfo.h b/include/llvm/Analysis/LoopInfo.h index 584eb3a8c854..abf3863b0601 100644 --- a/include/llvm/Analysis/LoopInfo.h +++ b/include/llvm/Analysis/LoopInfo.h @@ -30,6 +30,9 @@ // instance. In particular, a Loop might be inside such a non-loop SCC, or a // non-loop SCC might contain a sub-SCC which is a Loop. // +// For an overview of terminology used in this API (and thus all of our loop +// analyses or transforms), see docs/LoopTerminology.rst. +// //===----------------------------------------------------------------------===// #ifndef LLVM_ANALYSIS_LOOPINFO_H @@ -570,9 +573,9 @@ public: bool getIncomingAndBackEdge(BasicBlock *&Incoming, BasicBlock *&Backedge) const; - /// Below are some utilities to get loop bounds and induction variable, and - /// check if a given phinode is an auxiliary induction variable, as well as - /// checking if the loop is canonical. + /// Below are some utilities to get the loop guard, loop bounds and induction + /// variable, and to check if a given phinode is an auxiliary induction + /// variable, if the loop is guarded, and if the loop is canonical. /// /// Here is an example: /// \code @@ -604,6 +607,9 @@ public: /// /// - getInductionVariable --> i_1 /// - isAuxiliaryInductionVariable(x) --> true if x == i_1 + /// - getLoopGuardBranch() + /// --> `if (guardcmp) goto preheader; else goto afterloop` + /// - isGuarded() --> true /// - isCanonical --> false struct LoopBounds { /// Return the LoopBounds object if @@ -725,6 +731,31 @@ public: bool isAuxiliaryInductionVariable(PHINode &AuxIndVar, ScalarEvolution &SE) const; + /// Return the loop guard branch, if it exists. 
+ /// + /// This currently only works on simplified loop, as it requires a preheader + /// and a latch to identify the guard. It will work on loops of the form: + /// \code + /// GuardBB: + /// br cond1, Preheader, ExitSucc <== GuardBranch + /// Preheader: + /// br Header + /// Header: + /// ... + /// br Latch + /// Latch: + /// br cond2, Header, ExitBlock + /// ExitBlock: + /// br ExitSucc + /// ExitSucc: + /// \endcode + BranchInst *getLoopGuardBranch() const; + + /// Return true iff the loop is + /// - in simplify rotated form, and + /// - guarded by a loop guard branch. + bool isGuarded() const { return (getLoopGuardBranch() != nullptr); } + /// Return true if the loop induction variable starts at zero and increments /// by one each time through the loop. bool isCanonical(ScalarEvolution &SE) const; diff --git a/include/llvm/Analysis/LoopInfoImpl.h b/include/llvm/Analysis/LoopInfoImpl.h index 4c33dac9e21e..8b11e848a195 100644 --- a/include/llvm/Analysis/LoopInfoImpl.h +++ b/include/llvm/Analysis/LoopInfoImpl.h @@ -85,9 +85,9 @@ template bool LoopBase::hasDedicatedExits() const { // Each predecessor of each exit block of a normal loop is contained // within the loop. - SmallVector ExitBlocks; - getExitBlocks(ExitBlocks); - for (BlockT *EB : ExitBlocks) + SmallVector UniqueExitBlocks; + getUniqueExitBlocks(UniqueExitBlocks); + for (BlockT *EB : UniqueExitBlocks) for (BlockT *Predecessor : children>(EB)) if (!contains(Predecessor)) return false; @@ -200,8 +200,6 @@ BlockT *LoopBase::getLoopPredecessor() const { } } - // Make sure there is only one exit out of the preheader. - assert(Out && "Header of loop has no predecessors from outside loop?"); return Out; } diff --git a/include/llvm/Analysis/MemoryBuiltins.h b/include/llvm/Analysis/MemoryBuiltins.h index 49f9e58ffad7..a89d76b9e5bd 100644 --- a/include/llvm/Analysis/MemoryBuiltins.h +++ b/include/llvm/Analysis/MemoryBuiltins.h @@ -58,6 +58,9 @@ class Value; /// like). bool isAllocationFn(const Value *V, const TargetLibraryInfo *TLI, bool LookThroughBitCast = false); +bool isAllocationFn(const Value *V, + function_ref GetTLI, + bool LookThroughBitCast = false); /// Tests if a value is a call or invoke to a function that returns a /// NoAlias pointer (including malloc/calloc/realloc/strdup-like functions). @@ -68,6 +71,9 @@ bool isNoAliasFn(const Value *V, const TargetLibraryInfo *TLI, /// allocates uninitialized memory (such as malloc). bool isMallocLikeFn(const Value *V, const TargetLibraryInfo *TLI, bool LookThroughBitCast = false); +bool isMallocLikeFn(const Value *V, + function_ref GetTLI, + bool LookThroughBitCast = false); /// Tests if a value is a call or invoke to a library function that /// allocates zero-filled memory (such as calloc). @@ -93,6 +99,16 @@ bool isReallocLikeFn(const Value *V, const TargetLibraryInfo *TLI, /// reallocates memory (e.g., realloc). bool isReallocLikeFn(const Function *F, const TargetLibraryInfo *TLI); +/// Tests if a value is a call or invoke to a library function that +/// allocates memory and throws if an allocation failed (e.g., new). +bool isOpNewLikeFn(const Value *V, const TargetLibraryInfo *TLI, + bool LookThroughBitCast = false); + +/// Tests if a value is a call or invoke to a library function that +/// allocates memory (strdup, strndup). +bool isStrdupLikeFn(const Value *V, const TargetLibraryInfo *TLI, + bool LookThroughBitCast = false); + //===----------------------------------------------------------------------===// // malloc Call Utility Functions. 
// @@ -100,9 +116,13 @@ bool isReallocLikeFn(const Function *F, const TargetLibraryInfo *TLI); /// extractMallocCall - Returns the corresponding CallInst if the instruction /// is a malloc call. Since CallInst::CreateMalloc() only creates calls, we /// ignore InvokeInst here. -const CallInst *extractMallocCall(const Value *I, const TargetLibraryInfo *TLI); -inline CallInst *extractMallocCall(Value *I, const TargetLibraryInfo *TLI) { - return const_cast(extractMallocCall((const Value*)I, TLI)); +const CallInst * +extractMallocCall(const Value *I, + function_ref GetTLI); +inline CallInst * +extractMallocCall(Value *I, + function_ref GetTLI) { + return const_cast(extractMallocCall((const Value *)I, GetTLI)); } /// getMallocType - Returns the PointerType resulting from the malloc call. diff --git a/include/llvm/Analysis/MemoryDependenceAnalysis.h b/include/llvm/Analysis/MemoryDependenceAnalysis.h index e2669c2fa601..e89e5690fad0 100644 --- a/include/llvm/Analysis/MemoryDependenceAnalysis.h +++ b/include/llvm/Analysis/MemoryDependenceAnalysis.h @@ -362,11 +362,14 @@ private: PhiValues &PV; PredIteratorCache PredCache; + unsigned DefaultBlockScanLimit; + public: MemoryDependenceResults(AliasAnalysis &AA, AssumptionCache &AC, - const TargetLibraryInfo &TLI, - DominatorTree &DT, PhiValues &PV) - : AA(AA), AC(AC), TLI(TLI), DT(DT), PV(PV) {} + const TargetLibraryInfo &TLI, DominatorTree &DT, + PhiValues &PV, unsigned DefaultBlockScanLimit) + : AA(AA), AC(AC), TLI(TLI), DT(DT), PV(PV), + DefaultBlockScanLimit(DefaultBlockScanLimit) {} /// Handle invalidation in the new PM. bool invalidate(Function &F, const PreservedAnalyses &PA, @@ -511,9 +514,14 @@ class MemoryDependenceAnalysis static AnalysisKey Key; + unsigned DefaultBlockScanLimit; + public: using Result = MemoryDependenceResults; + MemoryDependenceAnalysis(); + MemoryDependenceAnalysis(unsigned DefaultBlockScanLimit) : DefaultBlockScanLimit(DefaultBlockScanLimit) { } + MemoryDependenceResults run(Function &F, FunctionAnalysisManager &AM); }; diff --git a/include/llvm/Analysis/MemorySSA.h b/include/llvm/Analysis/MemorySSA.h index b7730be75354..e89bf26a7234 100644 --- a/include/llvm/Analysis/MemorySSA.h +++ b/include/llvm/Analysis/MemorySSA.h @@ -793,6 +793,7 @@ protected: friend class MemorySSAPrinterLegacyPass; friend class MemorySSAUpdater; + void verifyPrevDefInPhis(Function &F) const; void verifyDefUses(Function &F) const; void verifyDomination(Function &F) const; void verifyOrdering(Function &F) const; @@ -830,7 +831,8 @@ protected: void insertIntoListsBefore(MemoryAccess *, const BasicBlock *, AccessList::iterator); MemoryUseOrDef *createDefinedAccess(Instruction *, MemoryAccess *, - const MemoryUseOrDef *Template = nullptr); + const MemoryUseOrDef *Template = nullptr, + bool CreationMustSucceed = true); private: template class ClobberWalkerBase; diff --git a/include/llvm/Analysis/MemorySSAUpdater.h b/include/llvm/Analysis/MemorySSAUpdater.h index d4d8040c1ff6..1d34663721e3 100644 --- a/include/llvm/Analysis/MemorySSAUpdater.h +++ b/include/llvm/Analysis/MemorySSAUpdater.h @@ -99,7 +99,7 @@ public: /// load a /// Where a mayalias b, *does* require RenameUses be set to true. void insertDef(MemoryDef *Def, bool RenameUses = false); - void insertUse(MemoryUse *Use); + void insertUse(MemoryUse *Use, bool RenameUses = false); /// Update the MemoryPhi in `To` following an edge deletion between `From` and /// `To`. If `To` becomes unreachable, a call to removeBlocks should be made. 
void removeEdge(BasicBlock *From, BasicBlock *To); @@ -275,6 +275,7 @@ private: getPreviousDefRecursive(BasicBlock *, DenseMap> &); MemoryAccess *recursePhi(MemoryAccess *Phi); + MemoryAccess *tryRemoveTrivialPhi(MemoryPhi *Phi); template MemoryAccess *tryRemoveTrivialPhi(MemoryPhi *Phi, RangeType &Operands); void tryRemoveTrivialPhis(ArrayRef UpdatedPHIs); diff --git a/include/llvm/Analysis/MustExecute.h b/include/llvm/Analysis/MustExecute.h index 3ef539c89d97..87cf9f85c7f1 100644 --- a/include/llvm/Analysis/MustExecute.h +++ b/include/llvm/Analysis/MustExecute.h @@ -7,10 +7,17 @@ //===----------------------------------------------------------------------===// /// \file /// Contains a collection of routines for determining if a given instruction is -/// guaranteed to execute if a given point in control flow is reached. The most +/// guaranteed to execute if a given point in control flow is reached. The most /// common example is an instruction within a loop being provably executed if we /// branch to the header of it's containing loop. /// +/// There are two interfaces available to determine if an instruction is +/// executed once a given point in the control flow is reached: +/// 1) A loop-centric one derived from LoopSafetyInfo. +/// 2) A "must be executed context"-based one implemented in the +/// MustBeExecutedContextExplorer. +/// Please refer to the class comments for more information. +/// //===----------------------------------------------------------------------===// #ifndef LLVM_ANALYSIS_MUSTEXECUTE_H @@ -164,6 +171,280 @@ public: virtual ~ICFLoopSafetyInfo() {}; }; -} +struct MustBeExecutedContextExplorer; + +/// Must be executed iterators visit stretches of instructions that are +/// guaranteed to be executed together, potentially with other instruction +/// executed in-between. +/// +/// Given the following code, and assuming all statements are single +/// instructions which transfer execution to the successor (see +/// isGuaranteedToTransferExecutionToSuccessor), there are two possible +/// outcomes. If we start the iterator at A, B, or E, we will visit only A, B, +/// and E. If we start at C or D, we will visit all instructions A-E. +/// +/// \code +/// A; +/// B; +/// if (...) { +/// C; +/// D; +/// } +/// E; +/// \endcode +/// +/// +/// Below is the example extneded with instructions F and G. Now we assume F +/// might not transfer execution to it's successor G. As a result we get the +/// following visit sets: +/// +/// Start Instruction | Visit Set +/// A | A, B, E, F +/// B | A, B, E, F +/// C | A, B, C, D, E, F +/// D | A, B, C, D, E, F +/// E | A, B, E, F +/// F | A, B, E, F +/// G | A, B, E, F, G +/// +/// +/// \code +/// A; +/// B; +/// if (...) { +/// C; +/// D; +/// } +/// E; +/// F; // Might not transfer execution to its successor G. +/// G; +/// \endcode +/// +/// +/// A more complex example involving conditionals, loops, break, and continue +/// is shown below. We again assume all instructions will transmit control to +/// the successor and we assume we can prove the inner loop to be finite. We +/// omit non-trivial branch conditions as the exploration is oblivious to them. +/// Constant branches are assumed to be unconditional in the CFG. The resulting +/// visist sets are shown in the table below. +/// +/// \code +/// A; +/// while (true) { +/// B; +/// if (...) +/// C; +/// if (...) +/// continue; +/// D; +/// if (...) +/// break; +/// do { +/// if (...) 
+/// continue; +/// E; +/// } while (...); +/// F; +/// } +/// G; +/// \endcode +/// +/// Start Instruction | Visit Set +/// A | A, B +/// B | A, B +/// C | A, B, C +/// D | A, B, D +/// E | A, B, D, E, F +/// F | A, B, D, F +/// G | A, B, D, G +/// +/// +/// Note that the examples show optimal visist sets but not necessarily the ones +/// derived by the explorer depending on the available CFG analyses (see +/// MustBeExecutedContextExplorer). Also note that we, depending on the options, +/// the visit set can contain instructions from other functions. +struct MustBeExecutedIterator { + /// Type declarations that make his class an input iterator. + ///{ + typedef const Instruction *value_type; + typedef std::ptrdiff_t difference_type; + typedef const Instruction **pointer; + typedef const Instruction *&reference; + typedef std::input_iterator_tag iterator_category; + ///} + + using ExplorerTy = MustBeExecutedContextExplorer; + + MustBeExecutedIterator(const MustBeExecutedIterator &Other) + : Visited(Other.Visited), Explorer(Other.Explorer), + CurInst(Other.CurInst) {} + + MustBeExecutedIterator(MustBeExecutedIterator &&Other) + : Visited(std::move(Other.Visited)), Explorer(Other.Explorer), + CurInst(Other.CurInst) {} + + MustBeExecutedIterator &operator=(MustBeExecutedIterator &&Other) { + if (this != &Other) { + std::swap(Visited, Other.Visited); + std::swap(CurInst, Other.CurInst); + } + return *this; + } + + ~MustBeExecutedIterator() {} + + /// Pre- and post-increment operators. + ///{ + MustBeExecutedIterator &operator++() { + CurInst = advance(); + return *this; + } + + MustBeExecutedIterator operator++(int) { + MustBeExecutedIterator tmp(*this); + operator++(); + return tmp; + } + ///} + + /// Equality and inequality operators. Note that we ignore the history here. + ///{ + bool operator==(const MustBeExecutedIterator &Other) const { + return CurInst == Other.CurInst; + } + + bool operator!=(const MustBeExecutedIterator &Other) const { + return !(*this == Other); + } + ///} + + /// Return the underlying instruction. + const Instruction *&operator*() { return CurInst; } + const Instruction *getCurrentInst() const { return CurInst; } + + /// Return true if \p I was encountered by this iterator already. + bool count(const Instruction *I) const { return Visited.count(I); } + +private: + using VisitedSetTy = DenseSet; + + /// Private constructors. + MustBeExecutedIterator(ExplorerTy &Explorer, const Instruction *I); + + /// Reset the iterator to its initial state pointing at \p I. + void reset(const Instruction *I); + + /// Try to advance one of the underlying positions (Head or Tail). + /// + /// \return The next instruction in the must be executed context, or nullptr + /// if none was found. + const Instruction *advance(); + + /// A set to track the visited instructions in order to deal with endless + /// loops and recursion. + VisitedSetTy Visited; + + /// A reference to the explorer that created this iterator. + ExplorerTy &Explorer; + + /// The instruction we are currently exposing to the user. There is always an + /// instruction that we know is executed with the given program point, + /// initially the program point itself. + const Instruction *CurInst; + + friend struct MustBeExecutedContextExplorer; +}; + +/// A "must be executed context" for a given program point PP is the set of +/// instructions, potentially before and after PP, that are executed always when +/// PP is reached. 
The MustBeExecutedContextExplorer an interface to explore +/// "must be executed contexts" in a module through the use of +/// MustBeExecutedIterator. +/// +/// The explorer exposes "must be executed iterators" that traverse the must be +/// executed context. There is little information sharing between iterators as +/// the expected use case involves few iterators for "far apart" instructions. +/// If that changes, we should consider caching more intermediate results. +struct MustBeExecutedContextExplorer { + + /// In the description of the parameters we use PP to denote a program point + /// for which the must be executed context is explored, or put differently, + /// for which the MustBeExecutedIterator is created. + /// + /// \param ExploreInterBlock Flag to indicate if instructions in blocks + /// other than the parent of PP should be + /// explored. + MustBeExecutedContextExplorer(bool ExploreInterBlock) + : ExploreInterBlock(ExploreInterBlock), EndIterator(*this, nullptr) {} + + /// Clean up the dynamically allocated iterators. + ~MustBeExecutedContextExplorer() { + DeleteContainerSeconds(InstructionIteratorMap); + } + + /// Iterator-based interface. \see MustBeExecutedIterator. + ///{ + using iterator = MustBeExecutedIterator; + using const_iterator = const MustBeExecutedIterator; + + /// Return an iterator to explore the context around \p PP. + iterator &begin(const Instruction *PP) { + auto *&It = InstructionIteratorMap[PP]; + if (!It) + It = new iterator(*this, PP); + return *It; + } + + /// Return an iterator to explore the cached context around \p PP. + const_iterator &begin(const Instruction *PP) const { + return *InstructionIteratorMap.lookup(PP); + } + + /// Return an universal end iterator. + ///{ + iterator &end() { return EndIterator; } + iterator &end(const Instruction *) { return EndIterator; } + + const_iterator &end() const { return EndIterator; } + const_iterator &end(const Instruction *) const { return EndIterator; } + ///} + + /// Return an iterator range to explore the context around \p PP. + llvm::iterator_range range(const Instruction *PP) { + return llvm::make_range(begin(PP), end(PP)); + } + + /// Return an iterator range to explore the cached context around \p PP. + llvm::iterator_range range(const Instruction *PP) const { + return llvm::make_range(begin(PP), end(PP)); + } + ///} + + /// Return the next instruction that is guaranteed to be executed after \p PP. + /// + /// \param It The iterator that is used to traverse the must be + /// executed context. + /// \param PP The program point for which the next instruction + /// that is guaranteed to execute is determined. + const Instruction * + getMustBeExecutedNextInstruction(MustBeExecutedIterator &It, + const Instruction *PP); + + /// Parameter that limit the performed exploration. See the constructor for + /// their meaning. + ///{ + const bool ExploreInterBlock; + ///} + +private: + /// Map from instructions to associated must be executed iterators. + DenseMap + InstructionIteratorMap; + + /// A unique end iterator. 
+ MustBeExecutedIterator EndIterator; +}; + +} // namespace llvm #endif diff --git a/include/llvm/Analysis/Passes.h b/include/llvm/Analysis/Passes.h index d9c97dff8c6e..8562519fa7b1 100644 --- a/include/llvm/Analysis/Passes.h +++ b/include/llvm/Analysis/Passes.h @@ -103,6 +103,13 @@ namespace llvm { // FunctionPass *createMustExecutePrinter(); + //===--------------------------------------------------------------------===// + // + // createMustBeExecutedContextPrinter - This pass prints information about which + // instructions are guaranteed to execute together (run with -analyze). + // + ModulePass *createMustBeExecutedContextPrinter(); + } #endif diff --git a/include/llvm/Analysis/ProfileSummaryInfo.h b/include/llvm/Analysis/ProfileSummaryInfo.h index f309d344b8d1..6693e40ccf22 100644 --- a/include/llvm/Analysis/ProfileSummaryInfo.h +++ b/include/llvm/Analysis/ProfileSummaryInfo.h @@ -52,6 +52,15 @@ private: // because the number of profile counts required to reach the hot // percentile is above a huge threshold. Optional HasHugeWorkingSetSize; + // True if the working set size of the code is considered large, + // because the number of profile counts required to reach the hot + // percentile is above a large threshold. + Optional HasLargeWorkingSetSize; + // Compute the threshold for a given cutoff. + Optional computeThreshold(int PercentileCutoff); + // The map that caches the threshold values. The keys are the percentile + // cutoff values and the values are the corresponding threshold values. + DenseMap ThresholdCache; public: ProfileSummaryInfo(Module &M) : M(M) {} @@ -96,6 +105,8 @@ public: bool AllowSynthetic = false); /// Returns true if the working set size of the code is considered huge. bool hasHugeWorkingSetSize(); + /// Returns true if the working set size of the code is considered large. + bool hasLargeWorkingSetSize(); /// Returns true if \p F has hot function entry. bool isFunctionEntryHot(const Function *F); /// Returns true if \p F contains hot code. @@ -104,14 +115,26 @@ public: bool isFunctionEntryCold(const Function *F); /// Returns true if \p F contains only cold code. bool isFunctionColdInCallGraph(const Function *F, BlockFrequencyInfo &BFI); + /// Returns true if \p F contains hot code with regard to a given hot + /// percentile cutoff value. + bool isFunctionHotInCallGraphNthPercentile(int PercentileCutoff, + const Function *F, + BlockFrequencyInfo &BFI); /// Returns true if count \p C is considered hot. bool isHotCount(uint64_t C); /// Returns true if count \p C is considered cold. bool isColdCount(uint64_t C); + /// Returns true if count \p C is considered hot with regard to a given + /// hot percentile cutoff value. + bool isHotCountNthPercentile(int PercentileCutoff, uint64_t C); /// Returns true if BasicBlock \p BB is considered hot. bool isHotBlock(const BasicBlock *BB, BlockFrequencyInfo *BFI); /// Returns true if BasicBlock \p BB is considered cold. bool isColdBlock(const BasicBlock *BB, BlockFrequencyInfo *BFI); + /// Returns true if BasicBlock \p BB is considered hot with regard to a given + /// hot percentile cutoff value. + bool isHotBlockNthPercentile(int PercentileCutoff, + const BasicBlock *BB, BlockFrequencyInfo *BFI); /// Returns true if CallSite \p CS is considered hot. bool isHotCallSite(const CallSite &CS, BlockFrequencyInfo *BFI); /// Returns true if Callsite \p CS is considered cold. 
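
As a quick orientation, the following is a minimal sketch of how the MustBeExecutedContextExplorer added in the MustExecute.h hunk above might be driven. It assumes only what that hunk itself declares (the ExploreInterBlock constructor flag, begin()/end(), and the range() helper whose iterators yield const Instruction pointers); the printMustBeExecutedContext helper name is illustrative and not part of the patch.

#include "llvm/Analysis/MustExecute.h"
#include "llvm/IR/Instruction.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

// Sketch only: walk the "must be executed context" of a program point PP and
// print every instruction that is known to execute whenever PP executes.
static void printMustBeExecutedContext(const Instruction &PP) {
  // Allow the exploration to cross basic block boundaries.
  MustBeExecutedContextExplorer Explorer(/* ExploreInterBlock */ true);

  // range(PP) visits the context starting at PP itself; the iterator's
  // value_type is const Instruction *.
  for (const Instruction *I : Explorer.range(&PP))
    errs() << "  executes with PP: " << *I << '\n';
}

Per the Passes.h hunk above, equivalent output is available in-tree through the printer pass created by createMustBeExecutedContextPrinter(), run with -analyze.
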
diff --git a/include/llvm/Analysis/RegionInfoImpl.h b/include/llvm/Analysis/RegionInfoImpl.h index c59c09dd2095..6b5936680c37 100644 --- a/include/llvm/Analysis/RegionInfoImpl.h +++ b/include/llvm/Analysis/RegionInfoImpl.h @@ -365,7 +365,7 @@ typename Tr::RegionNodeT *RegionBase::getBBNode(BlockT *BB) const { auto Deconst = const_cast *>(this); typename BBNodeMapT::value_type V = { BB, - llvm::make_unique(static_cast(Deconst), BB)}; + std::make_unique(static_cast(Deconst), BB)}; at = BBNodeMap.insert(std::move(V)).first; } return at->second.get(); diff --git a/include/llvm/Analysis/ScalarEvolution.h b/include/llvm/Analysis/ScalarEvolution.h index 0bd98ef37e7a..9c55f7a5090f 100644 --- a/include/llvm/Analysis/ScalarEvolution.h +++ b/include/llvm/Analysis/ScalarEvolution.h @@ -468,6 +468,8 @@ template <> struct DenseMapInfo { /// can't do much with the SCEV objects directly, they must ask this class /// for services. class ScalarEvolution { + friend class ScalarEvolutionsTest; + public: /// An enum describing the relationship between a SCEV and a loop. enum LoopDisposition { @@ -777,10 +779,10 @@ public: /// to (i.e. a "conservative over-approximation") of the value returend by /// getBackedgeTakenCount. If such a value cannot be computed, it returns the /// SCEVCouldNotCompute object. - const SCEV *getMaxBackedgeTakenCount(const Loop *L); + const SCEV *getConstantMaxBackedgeTakenCount(const Loop *L); /// Return true if the backedge taken count is either the value returned by - /// getMaxBackedgeTakenCount or zero. + /// getConstantMaxBackedgeTakenCount or zero. bool isBackedgeTakenCountMaxOrZero(const Loop *L); /// Return true if the specified loop has an analyzable loop-invariant diff --git a/include/llvm/Analysis/ScalarEvolutionExpander.h b/include/llvm/Analysis/ScalarEvolutionExpander.h index a519f93216b3..b4d727449fbe 100644 --- a/include/llvm/Analysis/ScalarEvolutionExpander.h +++ b/include/llvm/Analysis/ScalarEvolutionExpander.h @@ -77,9 +77,13 @@ namespace llvm { /// Phis that complete an IV chain. Reuse DenseSet> ChainedPhis; - /// When true, expressions are expanded in "canonical" form. In particular, - /// addrecs are expanded as arithmetic based on a canonical induction - /// variable. When false, expression are expanded in a more literal form. + /// When true, SCEVExpander tries to expand expressions in "canonical" form. + /// When false, expressions are expanded in a more literal form. + /// + /// In "canonical" form addrecs are expanded as arithmetic based on a + /// canonical induction variable. Note that CanonicalMode doesn't guarantee + /// that all expressions are expanded in "canonical" form. For some + /// expressions literal mode can be preferred. bool CanonicalMode; /// When invoked from LSR, the expander is in "strength reduction" mode. The @@ -275,8 +279,16 @@ namespace llvm { /// Clear the current insertion point. This is useful if the instruction /// that had been serving as the insertion point may have been deleted. - void clearInsertPoint() { - Builder.ClearInsertionPoint(); + void clearInsertPoint() { Builder.ClearInsertionPoint(); } + + /// Set location information used by debugging information. + void SetCurrentDebugLocation(DebugLoc L) { + Builder.SetCurrentDebugLocation(std::move(L)); + } + + /// Get location information used by debugging information. 
+ const DebugLoc &getCurrentDebugLocation() const { + return Builder.getCurrentDebugLocation(); } /// Return true if the specified instruction was inserted by the code diff --git a/include/llvm/Analysis/TargetLibraryInfo.h b/include/llvm/Analysis/TargetLibraryInfo.h index 4b5200f5a838..d4b223863c54 100644 --- a/include/llvm/Analysis/TargetLibraryInfo.h +++ b/include/llvm/Analysis/TargetLibraryInfo.h @@ -30,11 +30,12 @@ struct VecDesc { unsigned VectorizationFactor; }; - enum LibFunc { + enum LibFunc : unsigned { #define TLI_DEFINE_ENUM #include "llvm/Analysis/TargetLibraryInfo.def" - NumLibFuncs + NumLibFuncs, + NotLibFunc }; /// Implementation of the target library information. @@ -48,7 +49,7 @@ class TargetLibraryInfoImpl { unsigned char AvailableArray[(NumLibFuncs+3)/4]; llvm::DenseMap CustomNames; - static StringRef const StandardNames[NumLibFuncs]; + static StringLiteral const StandardNames[NumLibFuncs]; bool ShouldExtI32Param, ShouldExtI32Return, ShouldSignExtI32Param; enum AvailabilityState { @@ -359,7 +360,6 @@ public: TargetLibraryAnalysis(TargetLibraryInfoImpl PresetInfoImpl) : PresetInfoImpl(std::move(PresetInfoImpl)) {} - TargetLibraryInfo run(Module &M, ModuleAnalysisManager &); TargetLibraryInfo run(Function &F, FunctionAnalysisManager &); private: @@ -385,8 +385,13 @@ public: explicit TargetLibraryInfoWrapperPass(const Triple &T); explicit TargetLibraryInfoWrapperPass(const TargetLibraryInfoImpl &TLI); - TargetLibraryInfo &getTLI() { return TLI; } - const TargetLibraryInfo &getTLI() const { return TLI; } + TargetLibraryInfo &getTLI(const Function &F LLVM_ATTRIBUTE_UNUSED) { + return TLI; + } + const TargetLibraryInfo & + getTLI(const Function &F LLVM_ATTRIBUTE_UNUSED) const { + return TLI; + } }; } // end namespace llvm diff --git a/include/llvm/Analysis/TargetTransformInfo.h b/include/llvm/Analysis/TargetTransformInfo.h index 7574b811bc1c..d6fa88411654 100644 --- a/include/llvm/Analysis/TargetTransformInfo.h +++ b/include/llvm/Analysis/TargetTransformInfo.h @@ -368,6 +368,20 @@ public: /// optimize away. unsigned getFlatAddressSpace() const; + /// Return any intrinsic address operand indexes which may be rewritten if + /// they use a flat address space pointer. + /// + /// \returns true if the intrinsic was handled. + bool collectFlatAddressOperands(SmallVectorImpl &OpIndexes, + Intrinsic::ID IID) const; + + /// Rewrite intrinsic call \p II such that \p OldV will be replaced with \p + /// NewV, which has a different address space. This should happen for every + /// operand index that collectFlatAddressOperands returned for the intrinsic. + /// \returns true if the intrinsic /// was handled. + bool rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, + Value *OldV, Value *NewV) const; + /// Test whether calls to a function lower to actual program function /// calls. /// @@ -469,12 +483,17 @@ public: bool Force; /// Allow using trip count upper bound to unroll loops. bool UpperBound; - /// Allow peeling off loop iterations for loops with low dynamic tripcount. + /// Allow peeling off loop iterations. bool AllowPeeling; /// Allow unrolling of all the iterations of the runtime loop remainder. bool UnrollRemainder; /// Allow unroll and jam. Used to enable unroll and jam for the target. bool UnrollAndJam; + /// Allow peeling basing on profile. Uses to enable peeling off all + /// iterations basing on provided profile. + /// If the value is true the peeling cost model can decide to peel only + /// some iterations and in this case it will set this to false. 
+ bool PeelProfiledIterations; /// Threshold for unroll and jam, for inner loop size. The 'Threshold' /// value above is used during unroll and jam for the outer loop size. /// This value is used in the same manner to limit the size of the inner @@ -555,15 +574,15 @@ public: /// modes that operate across loop iterations. bool shouldFavorBackedgeIndex(const Loop *L) const; - /// Return true if the target supports masked load. - bool isLegalMaskedStore(Type *DataType) const; /// Return true if the target supports masked store. - bool isLegalMaskedLoad(Type *DataType) const; + bool isLegalMaskedStore(Type *DataType, MaybeAlign Alignment) const; + /// Return true if the target supports masked load. + bool isLegalMaskedLoad(Type *DataType, MaybeAlign Alignment) const; /// Return true if the target supports nontemporal store. - bool isLegalNTStore(Type *DataType, unsigned Alignment) const; + bool isLegalNTStore(Type *DataType, Align Alignment) const; /// Return true if the target supports nontemporal load. - bool isLegalNTLoad(Type *DataType, unsigned Alignment) const; + bool isLegalNTLoad(Type *DataType, Align Alignment) const; /// Return true if the target supports masked scatter. bool isLegalMaskedScatter(Type *DataType) const; @@ -622,12 +641,6 @@ public: /// Return true if this type is legal. bool isTypeLegal(Type *Ty) const; - /// Returns the target's jmp_buf alignment in bytes. - unsigned getJumpBufAlignment() const; - - /// Returns the target's jmp_buf size in bytes. - unsigned getJumpBufSize() const; - /// Return true if switches should be turned into lookup tables for the /// target. bool shouldBuildLookupTables() const; @@ -775,10 +788,23 @@ public: /// Additional properties of an operand's values. enum OperandValueProperties { OP_None = 0, OP_PowerOf2 = 1 }; - /// \return The number of scalar or vector registers that the target has. - /// If 'Vectors' is true, it returns the number of vector registers. If it is - /// set to false, it returns the number of scalar registers. - unsigned getNumberOfRegisters(bool Vector) const; + /// \return the number of registers in the target-provided register class. + unsigned getNumberOfRegisters(unsigned ClassID) const; + + /// \return the target-provided register class ID for the provided type, + /// accounting for type promotion and other type-legalization techniques that the target might apply. + /// However, it specifically does not account for the scalarization or splitting of vector types. + /// Should a vector type require scalarization or splitting into multiple underlying vector registers, + /// that type should be mapped to a register class containing no registers. + /// Specifically, this is designed to provide a simple, high-level view of the register allocation + /// later performed by the backend. These register classes don't necessarily map onto the + /// register classes used by the backend. + /// FIXME: It's not currently possible to determine how many registers + /// are used by the provided type. + unsigned getRegisterClassForType(bool Vector, Type *Ty = nullptr) const; + + /// \return the target-provided register class name + const char* getRegisterClassName(unsigned ClassID) const; /// \return The width of the largest scalar or vector register type. unsigned getRegisterBitWidth(bool Vector) const; @@ -824,18 +850,20 @@ public: /// \return The associativity of the cache level, if available. llvm::Optional getCacheAssociativity(CacheLevel Level) const; - /// \return How much before a load we should place the prefetch instruction. 
- /// This is currently measured in number of instructions. + /// \return How much before a load we should place the prefetch + /// instruction. This is currently measured in number of + /// instructions. unsigned getPrefetchDistance() const; - /// \return Some HW prefetchers can handle accesses up to a certain constant - /// stride. This is the minimum stride in bytes where it makes sense to start - /// adding SW prefetches. The default is 1, i.e. prefetch with any stride. + /// \return Some HW prefetchers can handle accesses up to a certain + /// constant stride. This is the minimum stride in bytes where it + /// makes sense to start adding SW prefetches. The default is 1, + /// i.e. prefetch with any stride. unsigned getMinPrefetchStride() const; - /// \return The maximum number of iterations to prefetch ahead. If the - /// required number of iterations is more than this number, no prefetching is - /// performed. + /// \return The maximum number of iterations to prefetch ahead. If + /// the required number of iterations is more than this number, no + /// prefetching is performed. unsigned getMaxPrefetchIterationsAhead() const; /// \return The maximum interleave factor that any transform should try to @@ -1155,6 +1183,10 @@ public: virtual bool isSourceOfDivergence(const Value *V) = 0; virtual bool isAlwaysUniform(const Value *V) = 0; virtual unsigned getFlatAddressSpace() = 0; + virtual bool collectFlatAddressOperands(SmallVectorImpl &OpIndexes, + Intrinsic::ID IID) const = 0; + virtual bool rewriteIntrinsicWithAddressSpace( + IntrinsicInst *II, Value *OldV, Value *NewV) const = 0; virtual bool isLoweredToCall(const Function *F) = 0; virtual void getUnrollingPreferences(Loop *L, ScalarEvolution &, UnrollingPreferences &UP) = 0; @@ -1177,10 +1209,10 @@ public: TargetLibraryInfo *LibInfo) = 0; virtual bool shouldFavorPostInc() const = 0; virtual bool shouldFavorBackedgeIndex(const Loop *L) const = 0; - virtual bool isLegalMaskedStore(Type *DataType) = 0; - virtual bool isLegalMaskedLoad(Type *DataType) = 0; - virtual bool isLegalNTStore(Type *DataType, unsigned Alignment) = 0; - virtual bool isLegalNTLoad(Type *DataType, unsigned Alignment) = 0; + virtual bool isLegalMaskedStore(Type *DataType, MaybeAlign Alignment) = 0; + virtual bool isLegalMaskedLoad(Type *DataType, MaybeAlign Alignment) = 0; + virtual bool isLegalNTStore(Type *DataType, Align Alignment) = 0; + virtual bool isLegalNTLoad(Type *DataType, Align Alignment) = 0; virtual bool isLegalMaskedScatter(Type *DataType) = 0; virtual bool isLegalMaskedGather(Type *DataType) = 0; virtual bool isLegalMaskedCompressStore(Type *DataType) = 0; @@ -1196,8 +1228,6 @@ public: virtual bool isProfitableToHoist(Instruction *I) = 0; virtual bool useAA() = 0; virtual bool isTypeLegal(Type *Ty) = 0; - virtual unsigned getJumpBufAlignment() = 0; - virtual unsigned getJumpBufSize() = 0; virtual bool shouldBuildLookupTables() = 0; virtual bool shouldBuildLookupTablesForConstant(Constant *C) = 0; virtual bool useColdCCForColdCall(Function &F) = 0; @@ -1228,19 +1258,35 @@ public: Type *Ty) = 0; virtual int getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, Type *Ty) = 0; - virtual unsigned getNumberOfRegisters(bool Vector) = 0; + virtual unsigned getNumberOfRegisters(unsigned ClassID) const = 0; + virtual unsigned getRegisterClassForType(bool Vector, Type *Ty = nullptr) const = 0; + virtual const char* getRegisterClassName(unsigned ClassID) const = 0; virtual unsigned getRegisterBitWidth(bool Vector) const = 0; virtual unsigned 
getMinVectorRegisterBitWidth() = 0; virtual bool shouldMaximizeVectorBandwidth(bool OptSize) const = 0; virtual unsigned getMinimumVF(unsigned ElemWidth) const = 0; virtual bool shouldConsiderAddressTypePromotion( const Instruction &I, bool &AllowPromotionWithoutCommonHeader) = 0; - virtual unsigned getCacheLineSize() = 0; - virtual llvm::Optional getCacheSize(CacheLevel Level) = 0; - virtual llvm::Optional getCacheAssociativity(CacheLevel Level) = 0; - virtual unsigned getPrefetchDistance() = 0; - virtual unsigned getMinPrefetchStride() = 0; - virtual unsigned getMaxPrefetchIterationsAhead() = 0; + virtual unsigned getCacheLineSize() const = 0; + virtual llvm::Optional getCacheSize(CacheLevel Level) const = 0; + virtual llvm::Optional getCacheAssociativity(CacheLevel Level) const = 0; + + /// \return How much before a load we should place the prefetch + /// instruction. This is currently measured in number of + /// instructions. + virtual unsigned getPrefetchDistance() const = 0; + + /// \return Some HW prefetchers can handle accesses up to a certain + /// constant stride. This is the minimum stride in bytes where it + /// makes sense to start adding SW prefetches. The default is 1, + /// i.e. prefetch with any stride. + virtual unsigned getMinPrefetchStride() const = 0; + + /// \return The maximum number of iterations to prefetch ahead. If + /// the required number of iterations is more than this number, no + /// prefetching is performed. + virtual unsigned getMaxPrefetchIterationsAhead() const = 0; + virtual unsigned getMaxInterleaveFactor(unsigned VF) = 0; virtual unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty, OperandValueKind Opd1Info, @@ -1395,6 +1441,16 @@ public: return Impl.getFlatAddressSpace(); } + bool collectFlatAddressOperands(SmallVectorImpl &OpIndexes, + Intrinsic::ID IID) const override { + return Impl.collectFlatAddressOperands(OpIndexes, IID); + } + + bool rewriteIntrinsicWithAddressSpace( + IntrinsicInst *II, Value *OldV, Value *NewV) const override { + return Impl.rewriteIntrinsicWithAddressSpace(II, OldV, NewV); + } + bool isLoweredToCall(const Function *F) override { return Impl.isLoweredToCall(F); } @@ -1440,16 +1496,16 @@ public: bool shouldFavorBackedgeIndex(const Loop *L) const override { return Impl.shouldFavorBackedgeIndex(L); } - bool isLegalMaskedStore(Type *DataType) override { - return Impl.isLegalMaskedStore(DataType); + bool isLegalMaskedStore(Type *DataType, MaybeAlign Alignment) override { + return Impl.isLegalMaskedStore(DataType, Alignment); } - bool isLegalMaskedLoad(Type *DataType) override { - return Impl.isLegalMaskedLoad(DataType); + bool isLegalMaskedLoad(Type *DataType, MaybeAlign Alignment) override { + return Impl.isLegalMaskedLoad(DataType, Alignment); } - bool isLegalNTStore(Type *DataType, unsigned Alignment) override { + bool isLegalNTStore(Type *DataType, Align Alignment) override { return Impl.isLegalNTStore(DataType, Alignment); } - bool isLegalNTLoad(Type *DataType, unsigned Alignment) override { + bool isLegalNTLoad(Type *DataType, Align Alignment) override { return Impl.isLegalNTLoad(DataType, Alignment); } bool isLegalMaskedScatter(Type *DataType) override { @@ -1490,8 +1546,6 @@ public: } bool useAA() override { return Impl.useAA(); } bool isTypeLegal(Type *Ty) override { return Impl.isTypeLegal(Ty); } - unsigned getJumpBufAlignment() override { return Impl.getJumpBufAlignment(); } - unsigned getJumpBufSize() override { return Impl.getJumpBufSize(); } bool shouldBuildLookupTables() override { return 
Impl.shouldBuildLookupTables(); } @@ -1563,8 +1617,14 @@ public: Type *Ty) override { return Impl.getIntImmCost(IID, Idx, Imm, Ty); } - unsigned getNumberOfRegisters(bool Vector) override { - return Impl.getNumberOfRegisters(Vector); + unsigned getNumberOfRegisters(unsigned ClassID) const override { + return Impl.getNumberOfRegisters(ClassID); + } + unsigned getRegisterClassForType(bool Vector, Type *Ty = nullptr) const override { + return Impl.getRegisterClassForType(Vector, Ty); + } + const char* getRegisterClassName(unsigned ClassID) const override { + return Impl.getRegisterClassName(ClassID); } unsigned getRegisterBitWidth(bool Vector) const override { return Impl.getRegisterBitWidth(Vector); @@ -1583,22 +1643,36 @@ public: return Impl.shouldConsiderAddressTypePromotion( I, AllowPromotionWithoutCommonHeader); } - unsigned getCacheLineSize() override { + unsigned getCacheLineSize() const override { return Impl.getCacheLineSize(); } - llvm::Optional getCacheSize(CacheLevel Level) override { + llvm::Optional getCacheSize(CacheLevel Level) const override { return Impl.getCacheSize(Level); } - llvm::Optional getCacheAssociativity(CacheLevel Level) override { + llvm::Optional getCacheAssociativity(CacheLevel Level) const override { return Impl.getCacheAssociativity(Level); } - unsigned getPrefetchDistance() override { return Impl.getPrefetchDistance(); } - unsigned getMinPrefetchStride() override { + + /// Return the preferred prefetch distance in terms of instructions. + /// + unsigned getPrefetchDistance() const override { + return Impl.getPrefetchDistance(); + } + + /// Return the minimum stride necessary to trigger software + /// prefetching. + /// + unsigned getMinPrefetchStride() const override { return Impl.getMinPrefetchStride(); } - unsigned getMaxPrefetchIterationsAhead() override { + + /// Return the maximum prefetch distance in terms of loop + /// iterations. + /// + unsigned getMaxPrefetchIterationsAhead() const override { return Impl.getMaxPrefetchIterationsAhead(); } + unsigned getMaxInterleaveFactor(unsigned VF) override { return Impl.getMaxInterleaveFactor(VF); } diff --git a/include/llvm/Analysis/TargetTransformInfoImpl.h b/include/llvm/Analysis/TargetTransformInfoImpl.h index b99e1eb9adf0..a431fa0d458b 100644 --- a/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -156,6 +156,16 @@ public: return -1; } + bool collectFlatAddressOperands(SmallVectorImpl &OpIndexes, + Intrinsic::ID IID) const { + return false; + } + + bool rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, + Value *OldV, Value *NewV) const { + return false; + } + bool isLoweredToCall(const Function *F) { assert(F && "A concrete function must be provided to this routine."); @@ -233,18 +243,18 @@ public: bool shouldFavorBackedgeIndex(const Loop *L) const { return false; } - bool isLegalMaskedStore(Type *DataType) { return false; } + bool isLegalMaskedStore(Type *DataType, MaybeAlign Alignment) { return false; } - bool isLegalMaskedLoad(Type *DataType) { return false; } + bool isLegalMaskedLoad(Type *DataType, MaybeAlign Alignment) { return false; } - bool isLegalNTStore(Type *DataType, unsigned Alignment) { + bool isLegalNTStore(Type *DataType, Align Alignment) { // By default, assume nontemporal memory stores are available for stores // that are aligned and have a size that is a power of 2. 
unsigned DataSize = DL.getTypeStoreSize(DataType); return Alignment >= DataSize && isPowerOf2_32(DataSize); } - bool isLegalNTLoad(Type *DataType, unsigned Alignment) { + bool isLegalNTLoad(Type *DataType, Align Alignment) { // By default, assume nontemporal memory loads are available for loads that // are aligned and have a size that is a power of 2. unsigned DataSize = DL.getTypeStoreSize(DataType); @@ -284,10 +294,6 @@ public: bool isTypeLegal(Type *Ty) { return false; } - unsigned getJumpBufAlignment() { return 0; } - - unsigned getJumpBufSize() { return 0; } - bool shouldBuildLookupTables() { return true; } bool shouldBuildLookupTablesForConstant(Constant *C) { return true; } @@ -348,7 +354,20 @@ public: return TTI::TCC_Free; } - unsigned getNumberOfRegisters(bool Vector) { return 8; } + unsigned getNumberOfRegisters(unsigned ClassID) const { return 8; } + + unsigned getRegisterClassForType(bool Vector, Type *Ty = nullptr) const { + return Vector ? 1 : 0; + }; + + const char* getRegisterClassName(unsigned ClassID) const { + switch (ClassID) { + default: + return "Generic::Unknown Register Class"; + case 0: return "Generic::ScalarRC"; + case 1: return "Generic::VectorRC"; + } + } unsigned getRegisterBitWidth(bool Vector) const { return 32; } @@ -365,21 +384,20 @@ public: return false; } - unsigned getCacheLineSize() { return 0; } + unsigned getCacheLineSize() const { return 0; } - llvm::Optional getCacheSize(TargetTransformInfo::CacheLevel Level) { + llvm::Optional getCacheSize(TargetTransformInfo::CacheLevel Level) const { switch (Level) { case TargetTransformInfo::CacheLevel::L1D: LLVM_FALLTHROUGH; case TargetTransformInfo::CacheLevel::L2D: return llvm::Optional(); } - llvm_unreachable("Unknown TargetTransformInfo::CacheLevel"); } llvm::Optional getCacheAssociativity( - TargetTransformInfo::CacheLevel Level) { + TargetTransformInfo::CacheLevel Level) const { switch (Level) { case TargetTransformInfo::CacheLevel::L1D: LLVM_FALLTHROUGH; @@ -390,11 +408,9 @@ public: llvm_unreachable("Unknown TargetTransformInfo::CacheLevel"); } - unsigned getPrefetchDistance() { return 0; } - - unsigned getMinPrefetchStride() { return 1; } - - unsigned getMaxPrefetchIterationsAhead() { return UINT_MAX; } + unsigned getPrefetchDistance() const { return 0; } + unsigned getMinPrefetchStride() const { return 1; } + unsigned getMaxPrefetchIterationsAhead() const { return UINT_MAX; } unsigned getMaxInterleaveFactor(unsigned VF) { return 1; } @@ -830,6 +846,9 @@ public: if (isa(U)) return TTI::TCC_Free; // Model all PHI nodes as free. + if (isa(U)) + return TTI::TCC_Free; // Model all ExtractValue nodes as free. + // Static alloca doesn't generate target instructions. 
if (auto *A = dyn_cast(U)) if (A->isStaticAlloca()) diff --git a/include/llvm/Analysis/TypeMetadataUtils.h b/include/llvm/Analysis/TypeMetadataUtils.h index 82cf8efeea54..43ce26147c2e 100644 --- a/include/llvm/Analysis/TypeMetadataUtils.h +++ b/include/llvm/Analysis/TypeMetadataUtils.h @@ -50,6 +50,8 @@ void findDevirtualizableCallsForTypeCheckedLoad( SmallVectorImpl &LoadedPtrs, SmallVectorImpl &Preds, bool &HasNonCallUses, const CallInst *CI, DominatorTree &DT); + +Constant *getPointerAtOffset(Constant *I, uint64_t Offset, Module &M); } #endif diff --git a/include/llvm/Analysis/Utils/Local.h b/include/llvm/Analysis/Utils/Local.h index acbdf5dca32c..a63bcec9bc41 100644 --- a/include/llvm/Analysis/Utils/Local.h +++ b/include/llvm/Analysis/Utils/Local.h @@ -32,7 +32,7 @@ Value *EmitGEPOffset(IRBuilderTy *Builder, const DataLayout &DL, User *GEP, Value *Result = Constant::getNullValue(IntPtrTy); // If the GEP is inbounds, we know that none of the addressing operations will - // overflow in an unsigned sense. + // overflow in a signed sense. bool isInBounds = GEPOp->isInBounds() && !NoAssumptions; // Build a mask for high order bits. @@ -51,10 +51,7 @@ Value *EmitGEPOffset(IRBuilderTy *Builder, const DataLayout &DL, User *GEP, // Handle a struct index, which adds its field offset to the pointer. if (StructType *STy = GTI.getStructTypeOrNull()) { - if (OpC->getType()->isVectorTy()) - OpC = OpC->getSplatValue(); - - uint64_t OpValue = cast(OpC)->getZExtValue(); + uint64_t OpValue = OpC->getUniqueInteger().getZExtValue(); Size = DL.getStructLayout(STy)->getElementOffset(OpValue); if (Size) @@ -63,20 +60,31 @@ Value *EmitGEPOffset(IRBuilderTy *Builder, const DataLayout &DL, User *GEP, continue; } + // Splat the constant if needed. + if (IntPtrTy->isVectorTy() && !OpC->getType()->isVectorTy()) + OpC = ConstantVector::getSplat(IntPtrTy->getVectorNumElements(), OpC); + Constant *Scale = ConstantInt::get(IntPtrTy, Size); Constant *OC = ConstantExpr::getIntegerCast(OpC, IntPtrTy, true /*SExt*/); - Scale = ConstantExpr::getMul(OC, Scale, isInBounds/*NUW*/); + Scale = + ConstantExpr::getMul(OC, Scale, false /*NUW*/, isInBounds /*NSW*/); // Emit an add instruction. Result = Builder->CreateAdd(Result, Scale, GEP->getName()+".offs"); continue; } + + // Splat the index if needed. + if (IntPtrTy->isVectorTy() && !Op->getType()->isVectorTy()) + Op = Builder->CreateVectorSplat(IntPtrTy->getVectorNumElements(), Op); + // Convert to correct type. if (Op->getType() != IntPtrTy) Op = Builder->CreateIntCast(Op, IntPtrTy, true, Op->getName()+".c"); if (Size != 1) { // We'll let instcombine(mul) convert this to a shl if possible. Op = Builder->CreateMul(Op, ConstantInt::get(IntPtrTy, Size), - GEP->getName()+".idx", isInBounds /*NUW*/); + GEP->getName() + ".idx", false /*NUW*/, + isInBounds /*NSW*/); } // Emit an add instruction. diff --git a/include/llvm/Analysis/ValueTracking.h b/include/llvm/Analysis/ValueTracking.h index fa7e0e0eef7e..33b064fcf9d2 100644 --- a/include/llvm/Analysis/ValueTracking.h +++ b/include/llvm/Analysis/ValueTracking.h @@ -242,19 +242,21 @@ class Value; /// This is a wrapper around Value::stripAndAccumulateConstantOffsets that /// creates and later unpacks the required APInt. 
inline Value *GetPointerBaseWithConstantOffset(Value *Ptr, int64_t &Offset, - const DataLayout &DL) { + const DataLayout &DL, + bool AllowNonInbounds = true) { APInt OffsetAPInt(DL.getIndexTypeSizeInBits(Ptr->getType()), 0); Value *Base = - Ptr->stripAndAccumulateConstantOffsets(DL, OffsetAPInt, - /* AllowNonInbounds */ true); + Ptr->stripAndAccumulateConstantOffsets(DL, OffsetAPInt, AllowNonInbounds); + Offset = OffsetAPInt.getSExtValue(); return Base; } - inline const Value *GetPointerBaseWithConstantOffset(const Value *Ptr, - int64_t &Offset, - const DataLayout &DL) { - return GetPointerBaseWithConstantOffset(const_cast(Ptr), Offset, - DL); + inline const Value * + GetPointerBaseWithConstantOffset(const Value *Ptr, int64_t &Offset, + const DataLayout &DL, + bool AllowNonInbounds = true) { + return GetPointerBaseWithConstantOffset(const_cast(Ptr), Offset, DL, + AllowNonInbounds); } /// Returns true if the GEP is based on a pointer to a string (array of @@ -307,20 +309,26 @@ class Value; uint64_t GetStringLength(const Value *V, unsigned CharSize = 8); /// This function returns call pointer argument that is considered the same by - /// aliasing rules. You CAN'T use it to replace one value with another. - const Value *getArgumentAliasingToReturnedPointer(const CallBase *Call); - inline Value *getArgumentAliasingToReturnedPointer(CallBase *Call) { + /// aliasing rules. You CAN'T use it to replace one value with another. If + /// \p MustPreserveNullness is true, the call must preserve the nullness of + /// the pointer. + const Value *getArgumentAliasingToReturnedPointer(const CallBase *Call, + bool MustPreserveNullness); + inline Value * + getArgumentAliasingToReturnedPointer(CallBase *Call, + bool MustPreserveNullness) { return const_cast(getArgumentAliasingToReturnedPointer( - const_cast(Call))); + const_cast(Call), MustPreserveNullness)); } - // {launder,strip}.invariant.group returns pointer that aliases its argument, - // and it only captures pointer by returning it. - // These intrinsics are not marked as nocapture, because returning is - // considered as capture. The arguments are not marked as returned neither, - // because it would make it useless. + /// {launder,strip}.invariant.group returns pointer that aliases its argument, + /// and it only captures pointer by returning it. + /// These intrinsics are not marked as nocapture, because returning is + /// considered as capture. The arguments are not marked as returned neither, + /// because it would make it useless. If \p MustPreserveNullness is true, + /// the intrinsic must preserve the nullness of the pointer. bool isIntrinsicReturningPointerAliasingArgumentWithoutCapturing( - const CallBase *Call); + const CallBase *Call, bool MustPreserveNullness); /// This method strips off any GEP address adjustments and pointer casts from /// the specified value, returning the original object being addressed. Note @@ -376,6 +384,13 @@ class Value; /// Return true if the only users of this pointer are lifetime markers. bool onlyUsedByLifetimeMarkers(const Value *V); + /// Return true if speculation of the given load must be suppressed to avoid + /// ordering or interfering with an active sanitizer. If not suppressed, + /// dereferenceability and alignment must be proven separately. Note: This + /// is only needed for raw reasoning; if you use the interface below + /// (isSafeToSpeculativelyExecute), this is handled internally. 
+ bool mustSuppressSpeculation(const LoadInst &LI); + /// Return true if the instruction does not have any effects besides /// calculating the result and does not have undefined behavior. /// @@ -605,12 +620,12 @@ class Value; SelectPatternResult matchSelectPattern(Value *V, Value *&LHS, Value *&RHS, Instruction::CastOps *CastOp = nullptr, unsigned Depth = 0); + inline SelectPatternResult - matchSelectPattern(const Value *V, const Value *&LHS, const Value *&RHS, - Instruction::CastOps *CastOp = nullptr) { - Value *L = const_cast(LHS); - Value *R = const_cast(RHS); - auto Result = matchSelectPattern(const_cast(V), L, R); + matchSelectPattern(const Value *V, const Value *&LHS, const Value *&RHS) { + Value *L = const_cast(LHS); + Value *R = const_cast(RHS); + auto Result = matchSelectPattern(const_cast(V), L, R); LHS = L; RHS = R; return Result; @@ -654,6 +669,12 @@ class Value; Optional isImpliedByDomCondition(const Value *Cond, const Instruction *ContextI, const DataLayout &DL); + + /// If Ptr1 is provably equal to Ptr2 plus a constant offset, return that + /// offset. For example, Ptr1 might be &A[42], and Ptr2 might be &A[40]. In + /// this case offset would be -8. + Optional isPointerOffset(const Value *Ptr1, const Value *Ptr2, + const DataLayout &DL); } // end namespace llvm #endif // LLVM_ANALYSIS_VALUETRACKING_H diff --git a/include/llvm/Analysis/VectorUtils.h b/include/llvm/Analysis/VectorUtils.h index d93d2bc4570b..4a61c2bc35c7 100644 --- a/include/llvm/Analysis/VectorUtils.h +++ b/include/llvm/Analysis/VectorUtils.h @@ -15,18 +15,129 @@ #include "llvm/ADT/MapVector.h" #include "llvm/Analysis/LoopAccessAnalysis.h" -#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/IR/IRBuilder.h" #include "llvm/Support/CheckedArithmetic.h" namespace llvm { +/// Describes the type of Parameters +enum class VFParamKind { + Vector, // No semantic information. + OMP_Linear, // declare simd linear(i) + OMP_LinearRef, // declare simd linear(ref(i)) + OMP_LinearVal, // declare simd linear(val(i)) + OMP_LinearUVal, // declare simd linear(uval(i)) + OMP_LinearPos, // declare simd linear(i:c) uniform(c) + OMP_LinearValPos, // declare simd linear(val(i:c)) uniform(c) + OMP_LinearRefPos, // declare simd linear(ref(i:c)) uniform(c) + OMP_LinearUValPos, // declare simd linear(uval(i:c)) uniform(c + OMP_Uniform, // declare simd uniform(i) + GlobalPredicate, // Global logical predicate that acts on all lanes + // of the input and output mask concurrently. For + // example, it is implied by the `M` token in the + // Vector Function ABI mangled name. + Unknown +}; + +/// Describes the type of Instruction Set Architecture +enum class VFISAKind { + AdvancedSIMD, // AArch64 Advanced SIMD (NEON) + SVE, // AArch64 Scalable Vector Extension + SSE, // x86 SSE + AVX, // x86 AVX + AVX2, // x86 AVX2 + AVX512, // x86 AVX512 + Unknown // Unknown ISA +}; + +/// Encapsulates information needed to describe a parameter. +/// +/// The description of the parameter is not linked directly to +/// OpenMP or any other vector function description. This structure +/// is extendible to handle other paradigms that describe vector +/// functions and their parameters. +struct VFParameter { + unsigned ParamPos; // Parameter Position in Scalar Function. + VFParamKind ParamKind; // Kind of Parameter. + int LinearStepOrPos = 0; // Step or Position of the Parameter. + Align Alignment = Align(); // Optional aligment in bytes, defaulted to 1. + + // Comparison operator. 
+ bool operator==(const VFParameter &Other) const { + return std::tie(ParamPos, ParamKind, LinearStepOrPos, Alignment) == + std::tie(Other.ParamPos, Other.ParamKind, Other.LinearStepOrPos, + Other.Alignment); + } +}; + +/// Contains the information about the kind of vectorization +/// available. +/// +/// This object in independent on the paradigm used to +/// represent vector functions. in particular, it is not attached to +/// any target-specific ABI. +struct VFShape { + unsigned VF; // Vectorization factor. + bool IsScalable; // True if the function is a scalable function. + VFISAKind ISA; // Instruction Set Architecture. + SmallVector Parameters; // List of parameter informations. + // Comparison operator. + bool operator==(const VFShape &Other) const { + return std::tie(VF, IsScalable, ISA, Parameters) == + std::tie(Other.VF, Other.IsScalable, Other.ISA, Other.Parameters); + } +}; + +/// Holds the VFShape for a specific scalar to vector function mapping. +struct VFInfo { + VFShape Shape; // Classification of the vector function. + StringRef ScalarName; // Scalar Function Name. + StringRef VectorName; // Vector Function Name associated to this VFInfo. + + // Comparison operator. + bool operator==(const VFInfo &Other) const { + return std::tie(Shape, ScalarName, VectorName) == + std::tie(Shape, Other.ScalarName, Other.VectorName); + } +}; + +namespace VFABI { +/// Function to contruct a VFInfo out of a mangled names in the +/// following format: +/// +/// {()} +/// +/// where is the name of the vector function, mangled according +/// to the rules described in the Vector Function ABI of the target vector +/// extentsion (or from now on). The is in the following +/// format: +/// +/// _ZGV_[()] +/// +/// This methods support demangling rules for the following : +/// +/// * AArch64: https://developer.arm.com/docs/101129/latest +/// +/// * x86 (libmvec): https://sourceware.org/glibc/wiki/libmvec and +/// https://sourceware.org/glibc/wiki/libmvec?action=AttachFile&do=view&target=VectorABI.txt +/// +/// +/// +/// \param MangledName -> input string in the format +/// _ZGV_[()]. +Optional tryDemangleForVFABI(StringRef MangledName); + +/// Retrieve the `VFParamKind` from a string token. +VFParamKind getVFParamKindFromString(const StringRef Token); +} // end namespace VFABI + template class ArrayRef; class DemandedBits; class GetElementPtrInst; template class InterleaveGroup; class Loop; class ScalarEvolution; +class TargetLibraryInfo; class TargetTransformInfo; class Type; class Value; @@ -270,13 +381,12 @@ APInt possiblyDemandedEltsInMask(Value *Mask); /// the interleaved store group doesn't allow gaps. 
template class InterleaveGroup { public: - InterleaveGroup(uint32_t Factor, bool Reverse, uint32_t Align) - : Factor(Factor), Reverse(Reverse), Align(Align), InsertPos(nullptr) {} - - InterleaveGroup(InstTy *Instr, int32_t Stride, uint32_t Align) - : Align(Align), InsertPos(Instr) { - assert(Align && "The alignment should be non-zero"); + InterleaveGroup(uint32_t Factor, bool Reverse, Align Alignment) + : Factor(Factor), Reverse(Reverse), Alignment(Alignment), + InsertPos(nullptr) {} + InterleaveGroup(InstTy *Instr, int32_t Stride, Align Alignment) + : Alignment(Alignment), InsertPos(Instr) { Factor = std::abs(Stride); assert(Factor > 1 && "Invalid interleave factor"); @@ -286,7 +396,7 @@ public: bool isReverse() const { return Reverse; } uint32_t getFactor() const { return Factor; } - uint32_t getAlignment() const { return Align; } + uint32_t getAlignment() const { return Alignment.value(); } uint32_t getNumMembers() const { return Members.size(); } /// Try to insert a new member \p Instr with index \p Index and @@ -294,9 +404,7 @@ public: /// negative if it is the new leader. /// /// \returns false if the instruction doesn't belong to the group. - bool insertMember(InstTy *Instr, int32_t Index, uint32_t NewAlign) { - assert(NewAlign && "The new member's alignment should be non-zero"); - + bool insertMember(InstTy *Instr, int32_t Index, Align NewAlign) { // Make sure the key fits in an int32_t. Optional MaybeKey = checkedAdd(Index, SmallestKey); if (!MaybeKey) @@ -328,7 +436,7 @@ public: } // It's always safe to select the minimum alignment. - Align = std::min(Align, NewAlign); + Alignment = std::min(Alignment, NewAlign); Members[Key] = Instr; return true; } @@ -387,7 +495,7 @@ public: private: uint32_t Factor; // Interleave Factor. bool Reverse; - uint32_t Align; + Align Alignment; DenseMap Members; int32_t SmallestKey = 0; int32_t LargestKey = 0; @@ -504,8 +612,8 @@ private: struct StrideDescriptor { StrideDescriptor() = default; StrideDescriptor(int64_t Stride, const SCEV *Scev, uint64_t Size, - unsigned Align) - : Stride(Stride), Scev(Scev), Size(Size), Align(Align) {} + Align Alignment) + : Stride(Stride), Scev(Scev), Size(Size), Alignment(Alignment) {} // The access's stride. It is negative for a reverse access. int64_t Stride = 0; @@ -517,7 +625,7 @@ private: uint64_t Size = 0; // The alignment of this access. - unsigned Align = 0; + Align Alignment; }; /// A type for holding instructions and their stride descriptors. @@ -528,11 +636,11 @@ private: /// /// \returns the newly created interleave group. 
InterleaveGroup * - createInterleaveGroup(Instruction *Instr, int Stride, unsigned Align) { + createInterleaveGroup(Instruction *Instr, int Stride, Align Alignment) { assert(!InterleaveGroupMap.count(Instr) && "Already in an interleaved access group"); InterleaveGroupMap[Instr] = - new InterleaveGroup(Instr, Stride, Align); + new InterleaveGroup(Instr, Stride, Alignment); InterleaveGroups.insert(InterleaveGroupMap[Instr]); return InterleaveGroupMap[Instr]; } diff --git a/include/llvm/BinaryFormat/Dwarf.def b/include/llvm/BinaryFormat/Dwarf.def index b0f78d0fd61f..34a7410f7474 100644 --- a/include/llvm/BinaryFormat/Dwarf.def +++ b/include/llvm/BinaryFormat/Dwarf.def @@ -17,7 +17,7 @@ defined HANDLE_DW_VIRTUALITY || defined HANDLE_DW_DEFAULTED || \ defined HANDLE_DW_CC || defined HANDLE_DW_LNS || defined HANDLE_DW_LNE || \ defined HANDLE_DW_LNCT || defined HANDLE_DW_MACRO || \ - defined HANDLE_DW_RLE || \ + defined HANDLE_DW_RLE || defined HANDLE_DW_LLE || \ (defined HANDLE_DW_CFA && defined HANDLE_DW_CFA_PRED) || \ defined HANDLE_DW_APPLE_PROPERTY || defined HANDLE_DW_UT || \ defined HANDLE_DWARF_SECTION || defined HANDLE_DW_IDX || \ @@ -26,7 +26,17 @@ #endif #ifndef HANDLE_DW_TAG -#define HANDLE_DW_TAG(ID, NAME, VERSION, VENDOR) +#define HANDLE_DW_TAG(ID, NAME, VERSION, VENDOR, KIND) +#endif + +// Note that DW_KIND is not a DWARF concept, but rather a way for us to +// generate a list of tags that belong together. +#ifndef DW_KIND_NONE +#define DW_KIND_NONE 0 +#endif + +#ifndef DW_KIND_TYPE +#define DW_KIND_TYPE 1 #endif #ifndef HANDLE_DW_AT @@ -81,6 +91,10 @@ #define HANDLE_DW_RLE(ID, NAME) #endif +#ifndef HANDLE_DW_LLE +#define HANDLE_DW_LLE(ID, NAME) +#endif + #ifndef HANDLE_DW_CFA #define HANDLE_DW_CFA(ID, NAME) #endif @@ -109,94 +123,94 @@ #define HANDLE_DW_END(ID, NAME) #endif -HANDLE_DW_TAG(0x0000, null, 2, DWARF) -HANDLE_DW_TAG(0x0001, array_type, 2, DWARF) -HANDLE_DW_TAG(0x0002, class_type, 2, DWARF) -HANDLE_DW_TAG(0x0003, entry_point, 2, DWARF) -HANDLE_DW_TAG(0x0004, enumeration_type, 2, DWARF) -HANDLE_DW_TAG(0x0005, formal_parameter, 2, DWARF) -HANDLE_DW_TAG(0x0008, imported_declaration, 2, DWARF) -HANDLE_DW_TAG(0x000a, label, 2, DWARF) -HANDLE_DW_TAG(0x000b, lexical_block, 2, DWARF) -HANDLE_DW_TAG(0x000d, member, 2, DWARF) -HANDLE_DW_TAG(0x000f, pointer_type, 2, DWARF) -HANDLE_DW_TAG(0x0010, reference_type, 2, DWARF) -HANDLE_DW_TAG(0x0011, compile_unit, 2, DWARF) -HANDLE_DW_TAG(0x0012, string_type, 2, DWARF) -HANDLE_DW_TAG(0x0013, structure_type, 2, DWARF) -HANDLE_DW_TAG(0x0015, subroutine_type, 2, DWARF) -HANDLE_DW_TAG(0x0016, typedef, 2, DWARF) -HANDLE_DW_TAG(0x0017, union_type, 2, DWARF) -HANDLE_DW_TAG(0x0018, unspecified_parameters, 2, DWARF) -HANDLE_DW_TAG(0x0019, variant, 2, DWARF) -HANDLE_DW_TAG(0x001a, common_block, 2, DWARF) -HANDLE_DW_TAG(0x001b, common_inclusion, 2, DWARF) -HANDLE_DW_TAG(0x001c, inheritance, 2, DWARF) -HANDLE_DW_TAG(0x001d, inlined_subroutine, 2, DWARF) -HANDLE_DW_TAG(0x001e, module, 2, DWARF) -HANDLE_DW_TAG(0x001f, ptr_to_member_type, 2, DWARF) -HANDLE_DW_TAG(0x0020, set_type, 2, DWARF) -HANDLE_DW_TAG(0x0021, subrange_type, 2, DWARF) -HANDLE_DW_TAG(0x0022, with_stmt, 2, DWARF) -HANDLE_DW_TAG(0x0023, access_declaration, 2, DWARF) -HANDLE_DW_TAG(0x0024, base_type, 2, DWARF) -HANDLE_DW_TAG(0x0025, catch_block, 2, DWARF) -HANDLE_DW_TAG(0x0026, const_type, 2, DWARF) -HANDLE_DW_TAG(0x0027, constant, 2, DWARF) -HANDLE_DW_TAG(0x0028, enumerator, 2, DWARF) -HANDLE_DW_TAG(0x0029, file_type, 2, DWARF) -HANDLE_DW_TAG(0x002a, friend, 2, DWARF) 
-HANDLE_DW_TAG(0x002b, namelist, 2, DWARF) -HANDLE_DW_TAG(0x002c, namelist_item, 2, DWARF) -HANDLE_DW_TAG(0x002d, packed_type, 2, DWARF) -HANDLE_DW_TAG(0x002e, subprogram, 2, DWARF) -HANDLE_DW_TAG(0x002f, template_type_parameter, 2, DWARF) -HANDLE_DW_TAG(0x0030, template_value_parameter, 2, DWARF) -HANDLE_DW_TAG(0x0031, thrown_type, 2, DWARF) -HANDLE_DW_TAG(0x0032, try_block, 2, DWARF) -HANDLE_DW_TAG(0x0033, variant_part, 2, DWARF) -HANDLE_DW_TAG(0x0034, variable, 2, DWARF) -HANDLE_DW_TAG(0x0035, volatile_type, 2, DWARF) +HANDLE_DW_TAG(0x0000, null, 2, DWARF, DW_KIND_NONE) +HANDLE_DW_TAG(0x0001, array_type, 2, DWARF, DW_KIND_TYPE) +HANDLE_DW_TAG(0x0002, class_type, 2, DWARF, DW_KIND_TYPE) +HANDLE_DW_TAG(0x0003, entry_point, 2, DWARF, DW_KIND_NONE) +HANDLE_DW_TAG(0x0004, enumeration_type, 2, DWARF, DW_KIND_TYPE) +HANDLE_DW_TAG(0x0005, formal_parameter, 2, DWARF, DW_KIND_NONE) +HANDLE_DW_TAG(0x0008, imported_declaration, 2, DWARF, DW_KIND_NONE) +HANDLE_DW_TAG(0x000a, label, 2, DWARF, DW_KIND_NONE) +HANDLE_DW_TAG(0x000b, lexical_block, 2, DWARF, DW_KIND_NONE) +HANDLE_DW_TAG(0x000d, member, 2, DWARF, DW_KIND_NONE) +HANDLE_DW_TAG(0x000f, pointer_type, 2, DWARF, DW_KIND_TYPE) +HANDLE_DW_TAG(0x0010, reference_type, 2, DWARF, DW_KIND_TYPE) +HANDLE_DW_TAG(0x0011, compile_unit, 2, DWARF, DW_KIND_NONE) +HANDLE_DW_TAG(0x0012, string_type, 2, DWARF, DW_KIND_TYPE) +HANDLE_DW_TAG(0x0013, structure_type, 2, DWARF, DW_KIND_TYPE) +HANDLE_DW_TAG(0x0015, subroutine_type, 2, DWARF, DW_KIND_TYPE) +HANDLE_DW_TAG(0x0016, typedef, 2, DWARF, DW_KIND_TYPE) +HANDLE_DW_TAG(0x0017, union_type, 2, DWARF, DW_KIND_TYPE) +HANDLE_DW_TAG(0x0018, unspecified_parameters, 2, DWARF, DW_KIND_NONE) +HANDLE_DW_TAG(0x0019, variant, 2, DWARF, DW_KIND_NONE) +HANDLE_DW_TAG(0x001a, common_block, 2, DWARF, DW_KIND_NONE) +HANDLE_DW_TAG(0x001b, common_inclusion, 2, DWARF, DW_KIND_NONE) +HANDLE_DW_TAG(0x001c, inheritance, 2, DWARF, DW_KIND_NONE) +HANDLE_DW_TAG(0x001d, inlined_subroutine, 2, DWARF, DW_KIND_NONE) +HANDLE_DW_TAG(0x001e, module, 2, DWARF, DW_KIND_NONE) +HANDLE_DW_TAG(0x001f, ptr_to_member_type, 2, DWARF, DW_KIND_TYPE) +HANDLE_DW_TAG(0x0020, set_type, 2, DWARF, DW_KIND_NONE) +HANDLE_DW_TAG(0x0021, subrange_type, 2, DWARF, DW_KIND_TYPE) +HANDLE_DW_TAG(0x0022, with_stmt, 2, DWARF, DW_KIND_NONE) +HANDLE_DW_TAG(0x0023, access_declaration, 2, DWARF, DW_KIND_NONE) +HANDLE_DW_TAG(0x0024, base_type, 2, DWARF, DW_KIND_TYPE) +HANDLE_DW_TAG(0x0025, catch_block, 2, DWARF, DW_KIND_NONE) +HANDLE_DW_TAG(0x0026, const_type, 2, DWARF, DW_KIND_TYPE) +HANDLE_DW_TAG(0x0027, constant, 2, DWARF, DW_KIND_NONE) +HANDLE_DW_TAG(0x0028, enumerator, 2, DWARF, DW_KIND_NONE) +HANDLE_DW_TAG(0x0029, file_type, 2, DWARF, DW_KIND_TYPE) +HANDLE_DW_TAG(0x002a, friend, 2, DWARF, DW_KIND_NONE) +HANDLE_DW_TAG(0x002b, namelist, 2, DWARF, DW_KIND_NONE) +HANDLE_DW_TAG(0x002c, namelist_item, 2, DWARF, DW_KIND_NONE) +HANDLE_DW_TAG(0x002d, packed_type, 2, DWARF, DW_KIND_TYPE) +HANDLE_DW_TAG(0x002e, subprogram, 2, DWARF, DW_KIND_NONE) +HANDLE_DW_TAG(0x002f, template_type_parameter, 2, DWARF, DW_KIND_NONE) +HANDLE_DW_TAG(0x0030, template_value_parameter, 2, DWARF, DW_KIND_NONE) +HANDLE_DW_TAG(0x0031, thrown_type, 2, DWARF, DW_KIND_TYPE) +HANDLE_DW_TAG(0x0032, try_block, 2, DWARF, DW_KIND_NONE) +HANDLE_DW_TAG(0x0033, variant_part, 2, DWARF, DW_KIND_NONE) +HANDLE_DW_TAG(0x0034, variable, 2, DWARF, DW_KIND_NONE) +HANDLE_DW_TAG(0x0035, volatile_type, 2, DWARF, DW_KIND_TYPE) // New in DWARF v3: -HANDLE_DW_TAG(0x0036, dwarf_procedure, 3, DWARF) -HANDLE_DW_TAG(0x0037, 
restrict_type, 3, DWARF) -HANDLE_DW_TAG(0x0038, interface_type, 3, DWARF) -HANDLE_DW_TAG(0x0039, namespace, 3, DWARF) -HANDLE_DW_TAG(0x003a, imported_module, 3, DWARF) -HANDLE_DW_TAG(0x003b, unspecified_type, 3, DWARF) -HANDLE_DW_TAG(0x003c, partial_unit, 3, DWARF) -HANDLE_DW_TAG(0x003d, imported_unit, 3, DWARF) -HANDLE_DW_TAG(0x003f, condition, 3, DWARF) -HANDLE_DW_TAG(0x0040, shared_type, 3, DWARF) +HANDLE_DW_TAG(0x0036, dwarf_procedure, 3, DWARF, DW_KIND_NONE) +HANDLE_DW_TAG(0x0037, restrict_type, 3, DWARF, DW_KIND_TYPE) +HANDLE_DW_TAG(0x0038, interface_type, 3, DWARF, DW_KIND_TYPE) +HANDLE_DW_TAG(0x0039, namespace, 3, DWARF, DW_KIND_NONE) +HANDLE_DW_TAG(0x003a, imported_module, 3, DWARF, DW_KIND_NONE) +HANDLE_DW_TAG(0x003b, unspecified_type, 3, DWARF, DW_KIND_TYPE) +HANDLE_DW_TAG(0x003c, partial_unit, 3, DWARF, DW_KIND_NONE) +HANDLE_DW_TAG(0x003d, imported_unit, 3, DWARF, DW_KIND_NONE) +HANDLE_DW_TAG(0x003f, condition, 3, DWARF, DW_KIND_NONE) +HANDLE_DW_TAG(0x0040, shared_type, 3, DWARF, DW_KIND_TYPE) // New in DWARF v4: -HANDLE_DW_TAG(0x0041, type_unit, 4, DWARF) -HANDLE_DW_TAG(0x0042, rvalue_reference_type, 4, DWARF) -HANDLE_DW_TAG(0x0043, template_alias, 4, DWARF) +HANDLE_DW_TAG(0x0041, type_unit, 4, DWARF, DW_KIND_NONE) +HANDLE_DW_TAG(0x0042, rvalue_reference_type, 4, DWARF, DW_KIND_TYPE) +HANDLE_DW_TAG(0x0043, template_alias, 4, DWARF, DW_KIND_NONE) // New in DWARF v5: -HANDLE_DW_TAG(0x0044, coarray_type, 5, DWARF) -HANDLE_DW_TAG(0x0045, generic_subrange, 5, DWARF) -HANDLE_DW_TAG(0x0046, dynamic_type, 5, DWARF) -HANDLE_DW_TAG(0x0047, atomic_type, 5, DWARF) -HANDLE_DW_TAG(0x0048, call_site, 5, DWARF) -HANDLE_DW_TAG(0x0049, call_site_parameter, 5, DWARF) -HANDLE_DW_TAG(0x004a, skeleton_unit, 5, DWARF) -HANDLE_DW_TAG(0x004b, immutable_type, 5, DWARF) +HANDLE_DW_TAG(0x0044, coarray_type, 5, DWARF, DW_KIND_TYPE) +HANDLE_DW_TAG(0x0045, generic_subrange, 5, DWARF, DW_KIND_NONE) +HANDLE_DW_TAG(0x0046, dynamic_type, 5, DWARF, DW_KIND_TYPE) +HANDLE_DW_TAG(0x0047, atomic_type, 5, DWARF, DW_KIND_TYPE) +HANDLE_DW_TAG(0x0048, call_site, 5, DWARF, DW_KIND_NONE) +HANDLE_DW_TAG(0x0049, call_site_parameter, 5, DWARF, DW_KIND_NONE) +HANDLE_DW_TAG(0x004a, skeleton_unit, 5, DWARF, DW_KIND_NONE) +HANDLE_DW_TAG(0x004b, immutable_type, 5, DWARF, DW_KIND_TYPE) // Vendor extensions: -HANDLE_DW_TAG(0x4081, MIPS_loop, 0, MIPS) -HANDLE_DW_TAG(0x4101, format_label, 0, GNU) -HANDLE_DW_TAG(0x4102, function_template, 0, GNU) -HANDLE_DW_TAG(0x4103, class_template, 0, GNU) -HANDLE_DW_TAG(0x4106, GNU_template_template_param, 0, GNU) -HANDLE_DW_TAG(0x4107, GNU_template_parameter_pack, 0, GNU) -HANDLE_DW_TAG(0x4108, GNU_formal_parameter_pack, 0, GNU) -HANDLE_DW_TAG(0x4109, GNU_call_site, 0, GNU) -HANDLE_DW_TAG(0x410a, GNU_call_site_parameter, 0, GNU) -HANDLE_DW_TAG(0x4200, APPLE_property, 0, APPLE) -HANDLE_DW_TAG(0xb000, BORLAND_property, 0, BORLAND) -HANDLE_DW_TAG(0xb001, BORLAND_Delphi_string, 0, BORLAND) -HANDLE_DW_TAG(0xb002, BORLAND_Delphi_dynamic_array, 0, BORLAND) -HANDLE_DW_TAG(0xb003, BORLAND_Delphi_set, 0, BORLAND) -HANDLE_DW_TAG(0xb004, BORLAND_Delphi_variant, 0, BORLAND) +HANDLE_DW_TAG(0x4081, MIPS_loop, 0, MIPS, DW_KIND_NONE) +HANDLE_DW_TAG(0x4101, format_label, 0, GNU, DW_KIND_NONE) +HANDLE_DW_TAG(0x4102, function_template, 0, GNU, DW_KIND_NONE) +HANDLE_DW_TAG(0x4103, class_template, 0, GNU, DW_KIND_NONE) +HANDLE_DW_TAG(0x4106, GNU_template_template_param, 0, GNU, DW_KIND_NONE) +HANDLE_DW_TAG(0x4107, GNU_template_parameter_pack, 0, GNU, DW_KIND_NONE) +HANDLE_DW_TAG(0x4108, GNU_formal_parameter_pack, 0, 
GNU, DW_KIND_NONE) +HANDLE_DW_TAG(0x4109, GNU_call_site, 0, GNU, DW_KIND_NONE) +HANDLE_DW_TAG(0x410a, GNU_call_site_parameter, 0, GNU, DW_KIND_NONE) +HANDLE_DW_TAG(0x4200, APPLE_property, 0, APPLE, DW_KIND_NONE) +HANDLE_DW_TAG(0xb000, BORLAND_property, 0, BORLAND, DW_KIND_NONE) +HANDLE_DW_TAG(0xb001, BORLAND_Delphi_string, 0, BORLAND, DW_KIND_TYPE) +HANDLE_DW_TAG(0xb002, BORLAND_Delphi_dynamic_array, 0, BORLAND, DW_KIND_TYPE) +HANDLE_DW_TAG(0xb003, BORLAND_Delphi_set, 0, BORLAND, DW_KIND_TYPE) +HANDLE_DW_TAG(0xb004, BORLAND_Delphi_variant, 0, BORLAND, DW_KIND_TYPE) // Attributes. HANDLE_DW_AT(0x01, sibling, 2, DWARF) @@ -815,6 +829,17 @@ HANDLE_DW_RLE(0x05, base_address) HANDLE_DW_RLE(0x06, start_end) HANDLE_DW_RLE(0x07, start_length) +// DWARF v5 Loc List Entry encoding values. +HANDLE_DW_LLE(0x00, end_of_list) +HANDLE_DW_LLE(0x01, base_addressx) +HANDLE_DW_LLE(0x02, startx_endx) +HANDLE_DW_LLE(0x03, startx_length) +HANDLE_DW_LLE(0x04, offset_pair) +HANDLE_DW_LLE(0x05, default_location) +HANDLE_DW_LLE(0x06, base_address) +HANDLE_DW_LLE(0x07, start_end) +HANDLE_DW_LLE(0x08, start_length) + // Call frame instruction encodings. HANDLE_DW_CFA(0x00, nop) HANDLE_DW_CFA(0x40, advance_loc) @@ -929,6 +954,7 @@ HANDLE_DW_IDX(0x05, type_hash) #undef HANDLE_DW_LNCT #undef HANDLE_DW_MACRO #undef HANDLE_DW_RLE +#undef HANDLE_DW_LLE #undef HANDLE_DW_CFA #undef HANDLE_DW_CFA_PRED #undef HANDLE_DW_APPLE_PROPERTY diff --git a/include/llvm/BinaryFormat/Dwarf.h b/include/llvm/BinaryFormat/Dwarf.h index 76d9c365c0a8..1c6aee48661c 100644 --- a/include/llvm/BinaryFormat/Dwarf.h +++ b/include/llvm/BinaryFormat/Dwarf.h @@ -46,6 +46,11 @@ enum LLVMConstants : uint32_t { DW_VIRTUALITY_invalid = ~0U, // Virtuality for invalid results. DW_MACINFO_invalid = ~0U, // Macinfo type for invalid results. + // Special values for an initial length field. + DW_LENGTH_lo_reserved = 0xfffffff0, // Lower bound of the reserved range. + DW_LENGTH_DWARF64 = 0xffffffff, // Indicator of 64-bit DWARF format. + DW_LENGTH_hi_reserved = 0xffffffff, // Upper bound of the reserved range. + // Other constants. DWARF_VERSION = 4, // Default dwarf version we output. DW_PUBTYPES_VERSION = 2, // Section version number for .debug_pubtypes. 
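
Illustration (not part of the imported patch): the reserved initial-length values added above are what a DWARF reader inspects to tell 32-bit and 64-bit units apart. A minimal standalone sketch of that check, with the constants copied locally for clarity; real consumers would use the llvm::dwarf enumerators.

#include <cstdint>
#include <cstdio>

// Local mirrors of DW_LENGTH_lo_reserved and DW_LENGTH_DWARF64 above.
constexpr uint32_t LengthLoReserved = 0xfffffff0;
constexpr uint32_t LengthDWARF64 = 0xffffffff;

// Size in bytes of the unit-length field itself: 4 for DWARF32, 12 for
// DWARF64 (the 0xffffffff escape plus an 8-byte length), 0 if the first
// word is one of the other reserved values (malformed input).
static unsigned initialLengthSize(uint32_t FirstWord) {
  if (FirstWord == LengthDWARF64)
    return 12;
  if (FirstWord >= LengthLoReserved)
    return 0;
  return 4;
}

int main() {
  std::printf("%u\n", initialLengthSize(0x00000042u)); // 4  (DWARF32)
  std::printf("%u\n", initialLengthSize(0xffffffffu)); // 12 (DWARF64)
  std::printf("%u\n", initialLengthSize(0xfffffff2u)); // 0  (reserved)
}

The getUnitLengthFieldByteSize() helper added further down in Dwarf.h encodes the same 4-versus-12 distinction once the format is already known.
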
@@ -75,7 +80,7 @@ const uint64_t DW64_CIE_ID = UINT64_MAX; const uint32_t DW_INVALID_OFFSET = UINT32_MAX; enum Tag : uint16_t { -#define HANDLE_DW_TAG(ID, NAME, VERSION, VENDOR) DW_TAG_##NAME = ID, +#define HANDLE_DW_TAG(ID, NAME, VERSION, VENDOR, KIND) DW_TAG_##NAME = ID, #include "llvm/BinaryFormat/Dwarf.def" DW_TAG_lo_user = 0x4080, DW_TAG_hi_user = 0xffff, @@ -84,29 +89,12 @@ enum Tag : uint16_t { inline bool isType(Tag T) { switch (T) { - case DW_TAG_array_type: - case DW_TAG_class_type: - case DW_TAG_interface_type: - case DW_TAG_enumeration_type: - case DW_TAG_pointer_type: - case DW_TAG_reference_type: - case DW_TAG_rvalue_reference_type: - case DW_TAG_string_type: - case DW_TAG_structure_type: - case DW_TAG_subroutine_type: - case DW_TAG_union_type: - case DW_TAG_ptr_to_member_type: - case DW_TAG_set_type: - case DW_TAG_subrange_type: - case DW_TAG_base_type: - case DW_TAG_const_type: - case DW_TAG_file_type: - case DW_TAG_packed_type: - case DW_TAG_volatile_type: - case DW_TAG_typedef: - return true; default: return false; +#define HANDLE_DW_TAG(ID, NAME, VERSION, VENDOR, KIND) \ + case DW_TAG_##NAME: \ + return (KIND == DW_KIND_TYPE); +#include "llvm/BinaryFormat/Dwarf.def" } } @@ -129,9 +117,10 @@ enum LocationAtom { #include "llvm/BinaryFormat/Dwarf.def" DW_OP_lo_user = 0xe0, DW_OP_hi_user = 0xff, - DW_OP_LLVM_fragment = 0x1000, ///< Only used in LLVM metadata. - DW_OP_LLVM_convert = 0x1001, ///< Only used in LLVM metadata. - DW_OP_LLVM_tag_offset = 0x1002, ///< Only used in LLVM metadata. + DW_OP_LLVM_fragment = 0x1000, ///< Only used in LLVM metadata. + DW_OP_LLVM_convert = 0x1001, ///< Only used in LLVM metadata. + DW_OP_LLVM_tag_offset = 0x1002, ///< Only used in LLVM metadata. + DW_OP_LLVM_entry_value = 0x1003, ///< Only used in LLVM metadata. }; enum TypeKind : uint8_t { @@ -192,6 +181,59 @@ enum SourceLanguage { DW_LANG_hi_user = 0xffff }; +inline bool isCPlusPlus(SourceLanguage S) { + // Deliberately enumerate all the language options so we get a warning when + // new language options are added (-Wswitch) that'll hopefully help keep this + // switch up-to-date when new C++ versions are added. + switch (S) { + case DW_LANG_C_plus_plus: + case DW_LANG_C_plus_plus_03: + case DW_LANG_C_plus_plus_11: + case DW_LANG_C_plus_plus_14: + return true; + case DW_LANG_C89: + case DW_LANG_C: + case DW_LANG_Ada83: + case DW_LANG_Cobol74: + case DW_LANG_Cobol85: + case DW_LANG_Fortran77: + case DW_LANG_Fortran90: + case DW_LANG_Pascal83: + case DW_LANG_Modula2: + case DW_LANG_Java: + case DW_LANG_C99: + case DW_LANG_Ada95: + case DW_LANG_Fortran95: + case DW_LANG_PLI: + case DW_LANG_ObjC: + case DW_LANG_ObjC_plus_plus: + case DW_LANG_UPC: + case DW_LANG_D: + case DW_LANG_Python: + case DW_LANG_OpenCL: + case DW_LANG_Go: + case DW_LANG_Modula3: + case DW_LANG_Haskell: + case DW_LANG_OCaml: + case DW_LANG_Rust: + case DW_LANG_C11: + case DW_LANG_Swift: + case DW_LANG_Julia: + case DW_LANG_Dylan: + case DW_LANG_Fortran03: + case DW_LANG_Fortran08: + case DW_LANG_RenderScript: + case DW_LANG_BLISS: + case DW_LANG_Mips_Assembler: + case DW_LANG_GOOGLE_RenderScript: + case DW_LANG_BORLAND_Delphi: + case DW_LANG_lo_user: + case DW_LANG_hi_user: + return false; + } + llvm_unreachable("Invalid source language"); +} + enum CaseSensitivity { // Identifier case codes DW_ID_case_sensitive = 0x00, @@ -267,11 +309,17 @@ enum MacroEntryType { }; /// DWARF v5 range list entry encoding values. 
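
Usage sketch (not part of the imported patch), assuming an LLVM build whose llvm/BinaryFormat/Dwarf.h contains the changes above:

#include "llvm/BinaryFormat/Dwarf.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

int main() {
  // isType() is now generated from the DW_KIND column in Dwarf.def, so it
  // stays in sync with the tag table automatically.
  const dwarf::Tag Tags[] = {dwarf::DW_TAG_base_type, dwarf::DW_TAG_subprogram,
                             dwarf::DW_TAG_rvalue_reference_type};
  for (dwarf::Tag T : Tags)
    outs() << dwarf::TagString(T) << " is a type tag: "
           << (dwarf::isType(T) ? "yes" : "no") << "\n";

  // isCPlusPlus() groups all of the DW_LANG_C_plus_plus* codes.
  outs() << "C++14 unit: "
         << (dwarf::isCPlusPlus(dwarf::DW_LANG_C_plus_plus_14) ? "yes" : "no")
         << "\n";
}
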
-enum RangeListEntries { +enum RnglistEntries { #define HANDLE_DW_RLE(ID, NAME) DW_RLE_##NAME = ID, #include "llvm/BinaryFormat/Dwarf.def" }; +/// DWARF v5 loc list entry encoding values. +enum LoclistEntries { +#define HANDLE_DW_LLE(ID, NAME) DW_LLE_##NAME = ID, +#include "llvm/BinaryFormat/Dwarf.def" +}; + /// Call frame instruction encodings. enum CallFrameInfo { #define HANDLE_DW_CFA(ID, NAME) DW_CFA_##NAME = ID, @@ -307,19 +355,6 @@ enum Constants { DW_EH_PE_indirect = 0x80 }; -/// Constants for location lists in DWARF v5. -enum LocationListEntry : unsigned char { - DW_LLE_end_of_list = 0x00, - DW_LLE_base_addressx = 0x01, - DW_LLE_startx_endx = 0x02, - DW_LLE_startx_length = 0x03, - DW_LLE_offset_pair = 0x04, - DW_LLE_default_location = 0x05, - DW_LLE_base_address = 0x06, - DW_LLE_start_end = 0x07, - DW_LLE_start_length = 0x08 -}; - /// Constants for the DW_APPLE_PROPERTY_attributes attribute. /// Keep this list in sync with clang's DeclSpec.h ObjCPropertyAttributeKind! enum ApplePropertyAttributes { @@ -434,6 +469,7 @@ StringRef LNStandardString(unsigned Standard); StringRef LNExtendedString(unsigned Encoding); StringRef MacinfoString(unsigned Encoding); StringRef RangeListEncodingString(unsigned Encoding); +StringRef LocListEncodingString(unsigned Encoding); StringRef CallFrameString(unsigned Encoding, Triple::ArchType Arch); StringRef ApplePropertyString(unsigned); StringRef UnitTypeString(unsigned); @@ -525,6 +561,17 @@ struct FormParams { explicit operator bool() const { return Version && AddrSize; } }; +/// Get the byte size of the unit length field depending on the DWARF format. +inline uint8_t getUnitLengthFieldByteSize(DwarfFormat Format) { + switch (Format) { + case DwarfFormat::DWARF32: + return 4; + case DwarfFormat::DWARF64: + return 12; + } + llvm_unreachable("Invalid Format value"); +} + /// Get the fixed byte size for a given form. /// /// If the form has a fixed byte size, then an Optional with a value will be diff --git a/include/llvm/BinaryFormat/ELF.h b/include/llvm/BinaryFormat/ELF.h index 2bd711137845..46edfb6260be 100644 --- a/include/llvm/BinaryFormat/ELF.h +++ b/include/llvm/BinaryFormat/ELF.h @@ -1356,6 +1356,72 @@ enum : unsigned { NT_GNU_BUILD_ATTRIBUTE_FUNC = 0x101, }; +// Core note types +enum : unsigned { + NT_PRSTATUS = 1, + NT_FPREGSET = 2, + NT_PRPSINFO = 3, + NT_TASKSTRUCT = 4, + NT_AUXV = 6, + NT_PSTATUS = 10, + NT_FPREGS = 12, + NT_PSINFO = 13, + NT_LWPSTATUS = 16, + NT_LWPSINFO = 17, + NT_WIN32PSTATUS = 18, + + NT_PPC_VMX = 0x100, + NT_PPC_VSX = 0x102, + NT_PPC_TAR = 0x103, + NT_PPC_PPR = 0x104, + NT_PPC_DSCR = 0x105, + NT_PPC_EBB = 0x106, + NT_PPC_PMU = 0x107, + NT_PPC_TM_CGPR = 0x108, + NT_PPC_TM_CFPR = 0x109, + NT_PPC_TM_CVMX = 0x10a, + NT_PPC_TM_CVSX = 0x10b, + NT_PPC_TM_SPR = 0x10c, + NT_PPC_TM_CTAR = 0x10d, + NT_PPC_TM_CPPR = 0x10e, + NT_PPC_TM_CDSCR = 0x10f, + + NT_386_TLS = 0x200, + NT_386_IOPERM = 0x201, + NT_X86_XSTATE = 0x202, + + NT_S390_HIGH_GPRS = 0x300, + NT_S390_TIMER = 0x301, + NT_S390_TODCMP = 0x302, + NT_S390_TODPREG = 0x303, + NT_S390_CTRS = 0x304, + NT_S390_PREFIX = 0x305, + NT_S390_LAST_BREAK = 0x306, + NT_S390_SYSTEM_CALL = 0x307, + NT_S390_TDB = 0x308, + NT_S390_VXRS_LOW = 0x309, + NT_S390_VXRS_HIGH = 0x30a, + NT_S390_GS_CB = 0x30b, + NT_S390_GS_BC = 0x30c, + + NT_ARM_VFP = 0x400, + NT_ARM_TLS = 0x401, + NT_ARM_HW_BREAK = 0x402, + NT_ARM_HW_WATCH = 0x403, + NT_ARM_SVE = 0x405, + NT_ARM_PAC_MASK = 0x406, + + NT_FILE = 0x46494c45, + NT_PRXFPREG = 0x46e62b7f, + NT_SIGINFO = 0x53494749, +}; + +// LLVM-specific notes. 
+enum { + NT_LLVM_HWASAN_GLOBALS = 3, +}; + +// GNU note types enum { NT_GNU_ABI_TAG = 1, NT_GNU_HWCAP = 2, diff --git a/include/llvm/BinaryFormat/ELFRelocs/AArch64.def b/include/llvm/BinaryFormat/ELFRelocs/AArch64.def index 4afcd7d1f093..c8364133e31f 100644 --- a/include/llvm/BinaryFormat/ELFRelocs/AArch64.def +++ b/include/llvm/BinaryFormat/ELFRelocs/AArch64.def @@ -124,8 +124,11 @@ ELF_RELOC(R_AARCH64_COPY, 0x400) ELF_RELOC(R_AARCH64_GLOB_DAT, 0x401) ELF_RELOC(R_AARCH64_JUMP_SLOT, 0x402) ELF_RELOC(R_AARCH64_RELATIVE, 0x403) -ELF_RELOC(R_AARCH64_TLS_DTPREL64, 0x404) -ELF_RELOC(R_AARCH64_TLS_DTPMOD64, 0x405) +// 0x404 and 0x405 are now R_AARCH64_TLS_IMPDEF1 and R_AARCH64_TLS_IMPDEF2 +// We follow GNU and define TLS_IMPDEF1 as TLS_DTPMOD64 and TLS_IMPDEF2 as +// TLS_DTPREL64 +ELF_RELOC(R_AARCH64_TLS_DTPMOD64, 0x404) +ELF_RELOC(R_AARCH64_TLS_DTPREL64, 0x405) ELF_RELOC(R_AARCH64_TLS_TPREL64, 0x406) ELF_RELOC(R_AARCH64_TLSDESC, 0x407) ELF_RELOC(R_AARCH64_IRELATIVE, 0x408) diff --git a/include/llvm/BinaryFormat/MachO.h b/include/llvm/BinaryFormat/MachO.h index a01393a3b303..fb50e549cb9d 100644 --- a/include/llvm/BinaryFormat/MachO.h +++ b/include/llvm/BinaryFormat/MachO.h @@ -581,6 +581,11 @@ struct section_64 { uint32_t reserved3; }; +inline bool isVirtualSection(uint8_t type) { + return (type == MachO::S_ZEROFILL || type == MachO::S_GB_ZEROFILL || + type == MachO::S_THREAD_LOCAL_ZEROFILL); +} + struct fvmlib { uint32_t name; uint32_t minor_version; diff --git a/include/llvm/BinaryFormat/Magic.h b/include/llvm/BinaryFormat/Magic.h index cd9833ec4d22..64c687262f4a 100644 --- a/include/llvm/BinaryFormat/Magic.h +++ b/include/llvm/BinaryFormat/Magic.h @@ -49,6 +49,7 @@ struct file_magic { xcoff_object_64, ///< 64-bit XCOFF object file wasm_object, ///< WebAssembly Object file pdb, ///< Windows PDB debug info file + tapi_file, ///< Text-based Dynamic Library Stub file }; bool is_object() const { return V != unknown; } diff --git a/include/llvm/BinaryFormat/Minidump.h b/include/llvm/BinaryFormat/Minidump.h index 65c17d1eb00c..89cd779951cf 100644 --- a/include/llvm/BinaryFormat/Minidump.h +++ b/include/llvm/BinaryFormat/Minidump.h @@ -18,12 +18,15 @@ #ifndef LLVM_BINARYFORMAT_MINIDUMP_H #define LLVM_BINARYFORMAT_MINIDUMP_H +#include "llvm/ADT/BitmaskEnum.h" #include "llvm/ADT/DenseMapInfo.h" #include "llvm/Support/Endian.h" namespace llvm { namespace minidump { +LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE(); + /// The minidump header is the first part of a minidump file. It identifies the /// file as a minidump file, and gives the location of the stream directory. 
struct Header { @@ -67,6 +70,50 @@ struct MemoryDescriptor { }; static_assert(sizeof(MemoryDescriptor) == 16, ""); +struct MemoryInfoListHeader { + support::ulittle32_t SizeOfHeader; + support::ulittle32_t SizeOfEntry; + support::ulittle64_t NumberOfEntries; + + MemoryInfoListHeader() = default; + MemoryInfoListHeader(uint32_t SizeOfHeader, uint32_t SizeOfEntry, + uint64_t NumberOfEntries) + : SizeOfHeader(SizeOfHeader), SizeOfEntry(SizeOfEntry), + NumberOfEntries(NumberOfEntries) {} +}; +static_assert(sizeof(MemoryInfoListHeader) == 16, ""); + +enum class MemoryProtection : uint32_t { +#define HANDLE_MDMP_PROTECT(CODE, NAME, NATIVENAME) NAME = CODE, +#include "llvm/BinaryFormat/MinidumpConstants.def" + LLVM_MARK_AS_BITMASK_ENUM(/*LargestValue=*/0xffffffffu), +}; + +enum class MemoryState : uint32_t { +#define HANDLE_MDMP_MEMSTATE(CODE, NAME, NATIVENAME) NAME = CODE, +#include "llvm/BinaryFormat/MinidumpConstants.def" + LLVM_MARK_AS_BITMASK_ENUM(/*LargestValue=*/0xffffffffu), +}; + +enum class MemoryType : uint32_t { +#define HANDLE_MDMP_MEMTYPE(CODE, NAME, NATIVENAME) NAME = CODE, +#include "llvm/BinaryFormat/MinidumpConstants.def" + LLVM_MARK_AS_BITMASK_ENUM(/*LargestValue=*/0xffffffffu), +}; + +struct MemoryInfo { + support::ulittle64_t BaseAddress; + support::ulittle64_t AllocationBase; + support::little_t AllocationProtect; + support::ulittle32_t Reserved0; + support::ulittle64_t RegionSize; + support::little_t State; + support::little_t Protect; + support::little_t Type; + support::ulittle32_t Reserved1; +}; +static_assert(sizeof(MemoryInfo) == 48, ""); + /// Specifies the location and type of a single stream in the minidump file. The /// minidump stream directory is an array of entries of this type, with its size /// given by Header.NumberOfStreams. 
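
Usage sketch (not part of the imported patch): the LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE and LLVM_MARK_AS_BITMASK_ENUM markers above are what make the bitwise operators below well-formed for these scoped enums; the particular flag combination shown is only an example.

#include "llvm/BinaryFormat/Minidump.h"
#include <cstdint>

using namespace llvm::minidump;

// True if the protection flags allow any form of write access.
static bool isWritable(MemoryProtection P) {
  const MemoryProtection WritableMask = MemoryProtection::ReadWrite |
                                        MemoryProtection::WriteCopy |
                                        MemoryProtection::ExecuteReadWrite;
  return static_cast<uint32_t>(P & WritableMask) != 0;
}

// True for a committed, writable region as described by a MemoryInfo entry.
static bool isUsableRegion(MemoryState State, MemoryProtection Protect) {
  return State == MemoryState::Commit && isWritable(Protect);
}
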
@@ -180,6 +227,27 @@ struct Thread { }; static_assert(sizeof(Thread) == 48, ""); +struct Exception { + static constexpr size_t MaxParameters = 15; + + support::ulittle32_t ExceptionCode; + support::ulittle32_t ExceptionFlags; + support::ulittle64_t ExceptionRecord; + support::ulittle64_t ExceptionAddress; + support::ulittle32_t NumberParameters; + support::ulittle32_t UnusedAlignment; + support::ulittle64_t ExceptionInformation[MaxParameters]; +}; +static_assert(sizeof(Exception) == 152, ""); + +struct ExceptionStream { + support::ulittle32_t ThreadId; + support::ulittle32_t UnusedAlignment; + Exception ExceptionRecord; + LocationDescriptor ThreadContext; +}; +static_assert(sizeof(ExceptionStream) == 168, ""); + } // namespace minidump template <> struct DenseMapInfo { diff --git a/include/llvm/BinaryFormat/MinidumpConstants.def b/include/llvm/BinaryFormat/MinidumpConstants.def index d4f13dd99217..aeef399af7a4 100644 --- a/include/llvm/BinaryFormat/MinidumpConstants.def +++ b/include/llvm/BinaryFormat/MinidumpConstants.def @@ -6,8 +6,9 @@ // //===----------------------------------------------------------------------===// -#if !(defined HANDLE_MDMP_STREAM_TYPE || defined HANDLE_MDMP_ARCH || \ - defined HANDLE_MDMP_PLATFORM) +#if !(defined(HANDLE_MDMP_STREAM_TYPE) || defined(HANDLE_MDMP_ARCH) || \ + defined(HANDLE_MDMP_PLATFORM) || defined(HANDLE_MDMP_PROTECT) || \ + defined(HANDLE_MDMP_MEMSTATE) || defined(HANDLE_MDMP_MEMTYPE)) #error "Missing HANDLE_MDMP definition" #endif @@ -23,6 +24,18 @@ #define HANDLE_MDMP_PLATFORM(CODE, NAME) #endif +#ifndef HANDLE_MDMP_PROTECT +#define HANDLE_MDMP_PROTECT(CODE, NAME, NATIVENAME) +#endif + +#ifndef HANDLE_MDMP_MEMSTATE +#define HANDLE_MDMP_MEMSTATE(CODE, NAME, NATIVENAME) +#endif + +#ifndef HANDLE_MDMP_MEMTYPE +#define HANDLE_MDMP_MEMTYPE(CODE, NAME, NATIVENAME) +#endif + HANDLE_MDMP_STREAM_TYPE(0x0003, ThreadList) HANDLE_MDMP_STREAM_TYPE(0x0004, ModuleList) HANDLE_MDMP_STREAM_TYPE(0x0005, MemoryList) @@ -102,6 +115,30 @@ HANDLE_MDMP_PLATFORM(0x8203, Android) // Android HANDLE_MDMP_PLATFORM(0x8204, PS3) // PS3 HANDLE_MDMP_PLATFORM(0x8205, NaCl) // Native Client (NaCl) +HANDLE_MDMP_PROTECT(0x01, NoAccess, PAGE_NO_ACCESS) +HANDLE_MDMP_PROTECT(0x02, ReadOnly, PAGE_READ_ONLY) +HANDLE_MDMP_PROTECT(0x04, ReadWrite, PAGE_READ_WRITE) +HANDLE_MDMP_PROTECT(0x08, WriteCopy, PAGE_WRITE_COPY) +HANDLE_MDMP_PROTECT(0x10, Execute, PAGE_EXECUTE) +HANDLE_MDMP_PROTECT(0x20, ExecuteRead, PAGE_EXECUTE_READ) +HANDLE_MDMP_PROTECT(0x40, ExecuteReadWrite, PAGE_EXECUTE_READ_WRITE) +HANDLE_MDMP_PROTECT(0x80, ExeciteWriteCopy, PAGE_EXECUTE_WRITE_COPY) +HANDLE_MDMP_PROTECT(0x100, Guard, PAGE_GUARD) +HANDLE_MDMP_PROTECT(0x200, NoCache, PAGE_NOCACHE) +HANDLE_MDMP_PROTECT(0x400, WriteCombine, PAGE_WRITECOMBINE) +HANDLE_MDMP_PROTECT(0x40000000, TargetsInvalid, PAGE_TARGETS_INVALID) + +HANDLE_MDMP_MEMSTATE(0x01000, Commit, MEM_COMMIT) +HANDLE_MDMP_MEMSTATE(0x02000, Reserve, MEM_RESERVE) +HANDLE_MDMP_MEMSTATE(0x10000, Free, MEM_FREE) + +HANDLE_MDMP_MEMTYPE(0x0020000, Private, MEM_PRIVATE) +HANDLE_MDMP_MEMTYPE(0x0040000, Mapped, MEM_MAPPED) +HANDLE_MDMP_MEMTYPE(0x1000000, Image, MEM_IMAGE) + #undef HANDLE_MDMP_STREAM_TYPE #undef HANDLE_MDMP_ARCH #undef HANDLE_MDMP_PLATFORM +#undef HANDLE_MDMP_PROTECT +#undef HANDLE_MDMP_MEMSTATE +#undef HANDLE_MDMP_MEMTYPE diff --git a/include/llvm/BinaryFormat/Wasm.h b/include/llvm/BinaryFormat/Wasm.h index 0f22bfe610c6..f550d880f68a 100644 --- a/include/llvm/BinaryFormat/Wasm.h +++ b/include/llvm/BinaryFormat/Wasm.h @@ -16,6 +16,7 @@ #include 
"llvm/ADT/ArrayRef.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringRef.h" namespace llvm { namespace wasm { @@ -251,9 +252,21 @@ enum : unsigned { WASM_OPCODE_F32_CONST = 0x43, WASM_OPCODE_F64_CONST = 0x44, WASM_OPCODE_I32_ADD = 0x6a, +}; + +// Opcodes used in synthetic functions. +enum : unsigned { + WASM_OPCODE_IF = 0x04, + WASM_OPCODE_ELSE = 0x05, + WASM_OPCODE_DROP = 0x1a, WASM_OPCODE_MISC_PREFIX = 0xfc, WASM_OPCODE_MEMORY_INIT = 0x08, WASM_OPCODE_DATA_DROP = 0x09, + WASM_OPCODE_ATOMICS_PREFIX = 0xfe, + WASM_OPCODE_ATOMIC_NOTIFY = 0x00, + WASM_OPCODE_I32_ATOMIC_WAIT = 0x01, + WASM_OPCODE_I32_ATOMIC_STORE = 0x17, + WASM_OPCODE_I32_RMW_CMPXCHG = 0x48, }; enum : unsigned { @@ -318,6 +331,7 @@ const unsigned WASM_SYMBOL_VISIBILITY_HIDDEN = 0x4; const unsigned WASM_SYMBOL_UNDEFINED = 0x10; const unsigned WASM_SYMBOL_EXPORTED = 0x20; const unsigned WASM_SYMBOL_EXPLICIT_NAME = 0x40; +const unsigned WASM_SYMBOL_NO_STRIP = 0x80; #define WASM_RELOC(name, value) name = value, diff --git a/include/llvm/BinaryFormat/XCOFF.h b/include/llvm/BinaryFormat/XCOFF.h index 7774ab3ed24a..20a0f446272f 100644 --- a/include/llvm/BinaryFormat/XCOFF.h +++ b/include/llvm/BinaryFormat/XCOFF.h @@ -19,12 +19,13 @@ namespace llvm { namespace XCOFF { // Constants used in the XCOFF definition. -enum { SectionNameSize = 8, SymbolNameSize = 8 }; +enum { FileNamePadSize = 6, NameSize = 8, SymbolTableEntrySize = 18 }; + enum ReservedSectionNum { N_DEBUG = -2, N_ABS = -1, N_UNDEF = 0 }; // x_smclas field of x_csect from system header: /usr/include/syms.h /// Storage Mapping Class definitions. -enum StorageMappingClass { +enum StorageMappingClass : uint8_t { // READ ONLY CLASSES XMC_PR = 0, ///< Program Code XMC_RO = 1, ///< Read Only Constant @@ -139,6 +140,117 @@ enum StorageClass : uint8_t { C_TCSYM = 134 // Reserved }; +enum SymbolType { + XTY_ER = 0, ///< External reference. + XTY_SD = 1, ///< Csect definition for initialized storage. + XTY_LD = 2, ///< Label definition. + ///< Defines an entry point to an initialized csect. + XTY_CM = 3 ///< Common csect definition. For uninitialized storage. +}; + +// Relocation types, defined in `/usr/include/reloc.h`. +enum RelocationType : uint8_t { + R_POS = 0x00, ///< Positive relocation. Provides the address of the referenced + ///< symbol. + R_RL = 0x0c, ///< Positive indirect load relocation. Modifiable instruction. + R_RLA = 0x0d, ///< Positive load address relocation. Modifiable instruction. + + R_NEG = 0x01, ///< Negative relocation. Provides the negative of the address + ///< of the referenced symbol. + R_REL = 0x02, ///< Relative to self relocation. Provides a displacement value + ///< between the address of the referenced symbol and the + ///< address being relocated. + + R_TOC = 0x03, ///< Relative to the TOC relocation. Provides a displacement + ///< that is the difference between the address of the + ///< referenced symbol and the TOC anchor csect. + R_TRL = 0x12, ///< TOC relative indirect load relocation. Similar to R_TOC, + ///< but not modifiable instruction. + + R_TRLA = + 0x13, ///< Relative to the TOC or to the thread-local storage base + ///< relocation. Compilers are not permitted to generate this + ///< relocation type. It is the result of a reversible + ///< transformation by the linker of an R_TOC relation that turned a + ///< load instruction into an add-immediate instruction. + + R_GL = 0x05, ///< Global linkage-external TOC address relocation. 
Provides the + ///< address of the external TOC associated with a defined + ///< external symbol. + R_TCL = 0x06, ///< Local object TOC address relocation. Provides the address + ///< of the local TOC entry of a defined external symbol. + + R_REF = 0x0f, ///< A non-relocating relocation. Used to prevent the binder + ///< from garbage collecting a csect (such as code used for + ///< dynamic initialization of non-local statics) for which + ///< another csect has an implicit dependency. + + R_BA = 0x08, ///< Branch absolute relocation. Provides the address of the + ///< referenced symbol. References a non-modifiable instruction. + R_BR = 0x0a, ///< Branch relative to self relocation. Provides the + ///< displacement that is the difference between the address of + ///< the referenced symbol and the address of the referenced + ///< branch instruction. References a non-modifiable instruction. + R_RBA = 0x18, ///< Branch absolute relocation. Similar to R_BA but + ///< references a modifiable instruction. + R_RBR = 0x1a, ///< Branch relative to self relocation. Similar to the R_BR + ///< relocation type, but references a modifiable instruction. + + R_TLS = 0x20, ///< General-dynamic reference to TLS symbol. + R_TLS_IE = 0x21, ///< Initial-exec reference to TLS symbol. + R_TLS_LD = 0x22, ///< Local-dynamic reference to TLS symbol. + R_TLS_LE = 0x23, ///< Local-exec reference to TLS symbol. + R_TLSM = 0x24, ///< Module reference to TLS. Provides a handle for the module + ///< containing the referenced symbol. + R_TLSML = 0x25, ///< Module reference to the local TLS storage. + + R_TOCU = 0x30, ///< Relative to TOC upper. Specifies the high-order 16 bits of + ///< a large code model TOC-relative relocation. + R_TOCL = 0x31 ///< Relative to TOC lower. Specifies the low-order 16 bits of a + ///< large code model TOC-relative relocation. +}; + +struct FileHeader32 { + uint16_t Magic; + uint16_t NumberOfSections; + int32_t TimeStamp; + uint32_t SymbolTableFileOffset; + int32_t NumberOfSymbolTableEntries; + uint16_t AuxiliaryHeaderSize; + uint16_t Flags; +}; + +struct SectionHeader32 { + char Name[XCOFF::NameSize]; + uint32_t PhysicalAddress; + uint32_t VirtualAddress; + uint32_t Size; + uint32_t FileOffsetToData; + uint32_t FileOffsetToRelocations; + uint32_t FileOffsetToLineNumbers; + uint16_t NumberOfRelocations; + uint16_t NumberOfLineNumbers; + int32_t Flags; +}; + +enum CFileStringType : uint8_t { + XFT_FN = 0, ///< Specifies the source-file name. + XFT_CT = 1, ///< Specifies the compiler time stamp. + XFT_CV = 2, ///< Specifies the compiler version number. + XFT_CD = 128 ///< Specifies compiler-defined information. +}; + +enum CFileLangId : uint8_t { + TB_C = 0, ///< C language. + TB_CPLUSPLUS = 9 ///< C++ language. +}; + +enum CFileCpuId : uint8_t { + TCPU_PPC64 = 2, ///< PowerPC common architecture 64-bit mode. + TCPU_COM = 3, ///< POWER and PowerPC architecture common. + TCPU_970 = 19 ///< PPC970 - PowerPC 64-bit architecture. 
+}; + } // end namespace XCOFF } // end namespace llvm diff --git a/include/llvm/Bitcode/BitcodeAnalyzer.h b/include/llvm/Bitcode/BitcodeAnalyzer.h index cfdebd6fe6cb..5fb8bb26f255 100644 --- a/include/llvm/Bitcode/BitcodeAnalyzer.h +++ b/include/llvm/Bitcode/BitcodeAnalyzer.h @@ -30,6 +30,7 @@ enum CurStreamTypeType { LLVMIRBitstream, ClangSerializedASTBitstream, ClangSerializedDiagnosticsBitstream, + LLVMBitstreamRemarks }; struct BCDumpOptions { diff --git a/include/llvm/Bitcode/LLVMBitCodes.h b/include/llvm/Bitcode/LLVMBitCodes.h index decd4dd3a965..1a397068caf0 100644 --- a/include/llvm/Bitcode/LLVMBitCodes.h +++ b/include/llvm/Bitcode/LLVMBitCodes.h @@ -391,7 +391,7 @@ enum CastOpcodes { /// have no fixed relation to the LLVM IR enum values. Changing these will /// break compatibility with old files. enum UnaryOpcodes { - UNOP_NEG = 0 + UNOP_FNEG = 0 }; /// BinaryOpcodes - These are values used in the bitcode files to encode which diff --git a/include/llvm/Bitstream/BitCodes.h b/include/llvm/Bitstream/BitCodes.h index adf54ba96396..41a3de3b20ef 100644 --- a/include/llvm/Bitstream/BitCodes.h +++ b/include/llvm/Bitstream/BitCodes.h @@ -168,6 +168,11 @@ class BitCodeAbbrev { SmallVector OperandList; public: + BitCodeAbbrev() = default; + + explicit BitCodeAbbrev(std::initializer_list OperandList) + : OperandList(OperandList) {} + unsigned getNumOperandInfos() const { return static_cast(OperandList.size()); } diff --git a/include/llvm/Bitstream/BitstreamReader.h b/include/llvm/Bitstream/BitstreamReader.h index ee82e7ec1ba2..b49a969a2d8b 100644 --- a/include/llvm/Bitstream/BitstreamReader.h +++ b/include/llvm/Bitstream/BitstreamReader.h @@ -379,6 +379,7 @@ public: using SimpleBitstreamCursor::ReadVBR; using SimpleBitstreamCursor::ReadVBR64; using SimpleBitstreamCursor::SizeInBytes; + using SimpleBitstreamCursor::skipToEnd; /// Return the number of bits used to encode an abbrev #. unsigned getAbbrevIDWidth() const { return CurCodeSize; } diff --git a/include/llvm/CodeGen/AccelTable.h b/include/llvm/CodeGen/AccelTable.h index 734531a65d50..f8f6b5448f3f 100644 --- a/include/llvm/CodeGen/AccelTable.h +++ b/include/llvm/CodeGen/AccelTable.h @@ -101,8 +101,6 @@ /// /// An Apple Accelerator Table can be serialized by calling emitAppleAccelTable /// function. -/// -/// TODO: Add DWARF v5 emission code. namespace llvm { diff --git a/include/llvm/CodeGen/AsmPrinter.h b/include/llvm/CodeGen/AsmPrinter.h index d110f8b01cb5..a4580da5aec9 100644 --- a/include/llvm/CodeGen/AsmPrinter.h +++ b/include/llvm/CodeGen/AsmPrinter.h @@ -111,6 +111,10 @@ public: /// of each call to runOnMachineFunction(). MCSymbol *CurrentFnSym = nullptr; + /// The symbol for the current function descriptor on AIX. This is created + /// at the beginning of each call to SetupMachineFunction(). + MCSymbol *CurrentFnDescSym = nullptr; + /// The symbol used to represent the start of the current function for the /// purpose of calculating its size (e.g. using the .size directive). By /// default, this is equal to CurrentFnSym. @@ -304,7 +308,7 @@ public: /// This should be called when a new MachineFunction is being processed from /// runOnMachineFunction. - void SetupMachineFunction(MachineFunction &MF); + virtual void SetupMachineFunction(MachineFunction &MF); /// This method emits the body and trailer for a function. void EmitFunctionBody(); @@ -342,12 +346,11 @@ public: /// so, emit it and return true, otherwise do nothing and return false. 
bool EmitSpecialLLVMGlobal(const GlobalVariable *GV); - /// Emit an alignment directive to the specified power of two boundary. For - /// example, if you pass in 3 here, you will get an 8 byte alignment. If a + /// Emit an alignment directive to the specified power of two boundary. If a /// global value is specified, and if that global has an explicit alignment /// requested, it will override the alignment request if required for /// correctness. - void EmitAlignment(unsigned NumBits, const GlobalObject *GV = nullptr) const; + void EmitAlignment(Align Alignment, const GlobalObject *GV = nullptr) const; /// Lower the specified LLVM Constant to an MCExpr. virtual const MCExpr *lowerConstant(const Constant *CV); @@ -400,7 +403,7 @@ public: /// By default, this method prints the label for the specified /// MachineBasicBlock, an alignment (if present) and a comment describing it /// if appropriate. - virtual void EmitBasicBlockStart(const MachineBasicBlock &MBB) const; + virtual void EmitBasicBlockStart(const MachineBasicBlock &MBB); /// Targets can override this to emit stuff at the end of a basic block. virtual void EmitBasicBlockEnd(const MachineBasicBlock &MBB); @@ -415,6 +418,10 @@ public: virtual void EmitFunctionEntryLabel(); + virtual void EmitFunctionDescriptor() { + llvm_unreachable("Function descriptor is target-specific."); + } + virtual void EmitMachineConstantPoolValue(MachineConstantPoolValue *MCPV); /// Targets can override this to change how global constants that are part of @@ -635,6 +642,10 @@ public: /// supported by the target. void EmitLinkage(const GlobalValue *GV, MCSymbol *GVSym) const; + /// Return the alignment for the specified \p GV. + static Align getGVAlignment(const GlobalValue *GV, const DataLayout &DL, + Align InAlign = Align::None()); + private: /// Private state for PrintSpecial() // Assign a unique ID to this machine instruction. diff --git a/include/llvm/CodeGen/BasicTTIImpl.h b/include/llvm/CodeGen/BasicTTIImpl.h index 70bf670fdf0b..2e57b4c9d332 100644 --- a/include/llvm/CodeGen/BasicTTIImpl.h +++ b/include/llvm/CodeGen/BasicTTIImpl.h @@ -190,6 +190,7 @@ private: protected: explicit BasicTTIImplBase(const TargetMachine *TM, const DataLayout &DL) : BaseT(DL) {} + virtual ~BasicTTIImplBase() = default; using TargetTransformInfoImplBase::DL; @@ -215,6 +216,16 @@ public: return -1; } + bool collectFlatAddressOperands(SmallVectorImpl &OpIndexes, + Intrinsic::ID IID) const { + return false; + } + + bool rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, + Value *OldV, Value *NewV) const { + return false; + } + bool isLegalAddImmediate(int64_t imm) { return getTLI()->isLegalAddImmediate(imm); } @@ -317,7 +328,7 @@ public: unsigned getEstimatedNumberOfCaseClusters(const SwitchInst &SI, unsigned &JumpTableSize) { /// Try to find the estimated number of clusters. Note that the number of - /// clusters identified in this function could be different from the actural + /// clusters identified in this function could be different from the actual /// numbers found in lowering. This function ignore switches that are /// lowered with a mix of jump table / bit test / BTree. 
This function was /// initially intended to be used when estimating the cost of switch in @@ -371,10 +382,6 @@ public: return N; } - unsigned getJumpBufAlignment() { return getTLI()->getJumpBufAlignment(); } - - unsigned getJumpBufSize() { return getTLI()->getJumpBufSize(); } - bool shouldBuildLookupTables() { const TargetLoweringBase *TLI = getTLI(); return TLI->isOperationLegalOrCustom(ISD::BR_JT, MVT::Other) || @@ -508,13 +515,44 @@ public: return BaseT::getInstructionLatency(I); } + virtual Optional + getCacheSize(TargetTransformInfo::CacheLevel Level) const { + return Optional( + getST()->getCacheSize(static_cast(Level))); + } + + virtual Optional + getCacheAssociativity(TargetTransformInfo::CacheLevel Level) const { + Optional TargetResult = + getST()->getCacheAssociativity(static_cast(Level)); + + if (TargetResult) + return TargetResult; + + return BaseT::getCacheAssociativity(Level); + } + + virtual unsigned getCacheLineSize() const { + return getST()->getCacheLineSize(); + } + + virtual unsigned getPrefetchDistance() const { + return getST()->getPrefetchDistance(); + } + + virtual unsigned getMinPrefetchStride() const { + return getST()->getMinPrefetchStride(); + } + + virtual unsigned getMaxPrefetchIterationsAhead() const { + return getST()->getMaxPrefetchIterationsAhead(); + } + /// @} /// \name Vector TTI Implementations /// @{ - unsigned getNumberOfRegisters(bool Vector) { return Vector ? 0 : 1; } - unsigned getRegisterBitWidth(bool Vector) const { return 32; } /// Estimate the overhead of scalarizing an instruction. Insert and Extract @@ -1111,9 +1149,7 @@ public: OpPropsBW); // For non-rotates (X != Y) we must add shift-by-zero handling costs. if (X != Y) { - Type *CondTy = Type::getInt1Ty(RetTy->getContext()); - if (RetVF > 1) - CondTy = VectorType::get(CondTy, RetVF); + Type *CondTy = RetTy->getWithNewBitWidth(1); Cost += ConcreteTTI->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy, nullptr); Cost += ConcreteTTI->getCmpSelInstrCost(BinaryOperator::Select, RetTy, @@ -1131,7 +1167,6 @@ public: unsigned getIntrinsicInstrCost( Intrinsic::ID IID, Type *RetTy, ArrayRef Tys, FastMathFlags FMF, unsigned ScalarizationCostPassed = std::numeric_limits::max()) { - unsigned RetVF = (RetTy->isVectorTy() ? RetTy->getVectorNumElements() : 1); auto *ConcreteTTI = static_cast(this); SmallVector ISDs; @@ -1288,9 +1323,7 @@ public: /*IsUnsigned=*/false); case Intrinsic::sadd_sat: case Intrinsic::ssub_sat: { - Type *CondTy = Type::getInt1Ty(RetTy->getContext()); - if (RetVF > 1) - CondTy = VectorType::get(CondTy, RetVF); + Type *CondTy = RetTy->getWithNewBitWidth(1); Type *OpTy = StructType::create({RetTy, CondTy}); Intrinsic::ID OverflowOp = IID == Intrinsic::sadd_sat @@ -1310,9 +1343,7 @@ public: } case Intrinsic::uadd_sat: case Intrinsic::usub_sat: { - Type *CondTy = Type::getInt1Ty(RetTy->getContext()); - if (RetVF > 1) - CondTy = VectorType::get(CondTy, RetVF); + Type *CondTy = RetTy->getWithNewBitWidth(1); Type *OpTy = StructType::create({RetTy, CondTy}); Intrinsic::ID OverflowOp = IID == Intrinsic::uadd_sat @@ -1329,9 +1360,7 @@ public: case Intrinsic::smul_fix: case Intrinsic::umul_fix: { unsigned ExtSize = RetTy->getScalarSizeInBits() * 2; - Type *ExtTy = Type::getIntNTy(RetTy->getContext(), ExtSize); - if (RetVF > 1) - ExtTy = VectorType::get(ExtTy, RetVF); + Type *ExtTy = RetTy->getWithNewBitWidth(ExtSize); unsigned ExtOp = IID == Intrinsic::smul_fix ? 
Instruction::SExt : Instruction::ZExt; @@ -1395,9 +1424,7 @@ public: Type *MulTy = RetTy->getContainedType(0); Type *OverflowTy = RetTy->getContainedType(1); unsigned ExtSize = MulTy->getScalarSizeInBits() * 2; - Type *ExtTy = Type::getIntNTy(RetTy->getContext(), ExtSize); - if (MulTy->isVectorTy()) - ExtTy = VectorType::get(ExtTy, MulTy->getVectorNumElements() ); + Type *ExtTy = MulTy->getWithNewBitWidth(ExtSize); unsigned ExtOp = IID == Intrinsic::smul_fix ? Instruction::SExt : Instruction::ZExt; diff --git a/include/llvm/CodeGen/CallingConvLower.h b/include/llvm/CodeGen/CallingConvLower.h index aa339e1cc913..a30ca638ee6d 100644 --- a/include/llvm/CodeGen/CallingConvLower.h +++ b/include/llvm/CodeGen/CallingConvLower.h @@ -20,6 +20,7 @@ #include "llvm/CodeGen/TargetCallingConv.h" #include "llvm/IR/CallingConv.h" #include "llvm/MC/MCRegisterInfo.h" +#include "llvm/Support/Alignment.h" namespace llvm { @@ -43,6 +44,7 @@ public: AExtUpper, // The value is in the upper bits of the location and should be // extended with undefined upper bits when retrieved. BCvt, // The value is bit-converted in the location. + Trunc, // The value is truncated in the location. VExt, // The value is vector-widened in the location. // FIXME: Not implemented yet. Code that uses AExt to mean // vector-widen should be fixed to use VExt instead. @@ -197,7 +199,7 @@ private: LLVMContext &Context; unsigned StackOffset; - unsigned MaxStackArgAlign; + Align MaxStackArgAlign; SmallVector UsedRegs; SmallVector PendingLocs; SmallVector PendingArgFlags; @@ -421,19 +423,19 @@ public: /// AllocateStack - Allocate a chunk of stack space with the specified size /// and alignment. - unsigned AllocateStack(unsigned Size, unsigned Align) { - assert(Align && ((Align - 1) & Align) == 0); // Align is power of 2. - StackOffset = alignTo(StackOffset, Align); + unsigned AllocateStack(unsigned Size, unsigned Alignment) { + const Align CheckedAlignment(Alignment); + StackOffset = alignTo(StackOffset, CheckedAlignment); unsigned Result = StackOffset; StackOffset += Size; - MaxStackArgAlign = std::max(Align, MaxStackArgAlign); - ensureMaxAlignment(Align); + MaxStackArgAlign = std::max(CheckedAlignment, MaxStackArgAlign); + ensureMaxAlignment(CheckedAlignment); return Result; } - void ensureMaxAlignment(unsigned Align) { + void ensureMaxAlignment(Align Alignment) { if (!AnalyzingMustTailForwardedRegs) - MF.getFrameInfo().ensureMaxAlignment(Align); + MF.getFrameInfo().ensureMaxAlignment(Alignment.value()); } /// Version of AllocateStack with extra register to be shadowed. diff --git a/include/llvm/CodeGen/DFAPacketizer.h b/include/llvm/CodeGen/DFAPacketizer.h index cf58ee0cabea..705465b15c4c 100644 --- a/include/llvm/CodeGen/DFAPacketizer.h +++ b/include/llvm/CodeGen/DFAPacketizer.h @@ -28,6 +28,7 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/ScheduleDAGMutation.h" +#include "llvm/Support/Automaton.h" #include #include #include @@ -76,26 +77,26 @@ using DFAStateInput = int64_t; class DFAPacketizer { private: - using UnsignPair = std::pair; - const InstrItineraryData *InstrItins; - int CurrentState = 0; - const DFAStateInput (*DFAStateInputTable)[2]; - const unsigned *DFAStateEntryTable; - - // CachedTable is a map from to ToState. - DenseMap CachedTable; - - // Read the DFA transition table and update CachedTable. 
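
Illustration (not part of the imported patch): the migration from raw unsigned alignments to the Align type, seen here in AllocateStack() and earlier in EmitAlignment() and createInterleaveGroup(), moves the power-of-two invariant into the type itself. A standalone sketch of the same rounding logic, assuming only llvm/Support/Alignment.h:

#include "llvm/Support/Alignment.h"
#include <cassert>

using namespace llvm;

// Align guarantees a power of two >= 1, so no manual assert is needed here,
// unlike the old unsigned-based AllocateStack().
static unsigned allocateSlot(unsigned &StackOffset, unsigned Size,
                             Align Alignment) {
  StackOffset = alignTo(StackOffset, Alignment); // round up to the boundary
  unsigned Result = StackOffset;
  StackOffset += Size;
  return Result;
}

int main() {
  unsigned Offset = 6;
  unsigned Slot = allocateSlot(Offset, 8, Align(4));
  assert(Slot == 8 && Offset == 16);
  (void)Slot;
}
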
- void ReadTable(unsigned state); + Automaton A; public: - DFAPacketizer(const InstrItineraryData *I, const DFAStateInput (*SIT)[2], - const unsigned *SET); + DFAPacketizer(const InstrItineraryData *InstrItins, Automaton a) : + InstrItins(InstrItins), A(std::move(a)) { + // Start off with resource tracking disabled. + A.enableTranscription(false); + } // Reset the current state to make all resources available. void clearResources() { - CurrentState = 0; + A.reset(); + } + + // Set whether this packetizer should track not just whether instructions + // can be packetized, but also which functional units each instruction ends up + // using after packetization. + void setTrackResources(bool Track) { + A.enableTranscription(Track); } // Return the DFAInput for an instruction class. @@ -120,6 +121,15 @@ public: // current state to reflect that change. void reserveResources(MachineInstr &MI); + // Return the resources used by the InstIdx'th instruction added to this + // packet. The resources are returned as a bitvector of functional units. + // + // Note that a bundle may be packed in multiple valid ways. This function + // returns one arbitary valid packing. + // + // Requires setTrackResources(true) to have been called. + unsigned getUsedResources(unsigned InstIdx); + const InstrItineraryData *getInstrItins() const { return InstrItins; } }; @@ -134,7 +144,7 @@ class VLIWPacketizerList { protected: MachineFunction &MF; const TargetInstrInfo *TII; - AliasAnalysis *AA; + AAResults *AA; // The VLIW Scheduler. DefaultVLIWScheduler *VLIWScheduler; @@ -146,9 +156,9 @@ protected: std::map MIToSUnit; public: - // The AliasAnalysis parameter can be nullptr. + // The AAResults parameter can be nullptr. VLIWPacketizerList(MachineFunction &MF, MachineLoopInfo &MLI, - AliasAnalysis *AA); + AAResults *AA); virtual ~VLIWPacketizerList(); diff --git a/include/llvm/CodeGen/DIE.h b/include/llvm/CodeGen/DIE.h index 684f9e40ca5a..e8e7504a6cda 100644 --- a/include/llvm/CodeGen/DIE.h +++ b/include/llvm/CodeGen/DIE.h @@ -550,6 +550,14 @@ public: return *static_cast(Last ? Last->Next.getPointer() : nullptr); } + void takeNodes(IntrusiveBackList &Other) { + for (auto &N : Other) { + N.Next.setPointerAndInt(&N, true); + push_back(N); + } + Other.Last = nullptr; + } + class const_iterator; class iterator : public iterator_facade_base { @@ -685,6 +693,10 @@ public: return addValue(Alloc, DIEValue(Attribute, Form, std::forward(Value))); } + /// Take ownership of the nodes in \p Other, and append them to the back of + /// the list. 
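
Usage sketch (not part of the imported patch) of the calling pattern the Automaton-backed packetizer API above appears to enable; the DFAPacketizer instance is assumed to come from the owning target, and the greedy packing policy is only illustrative.

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/DFAPacketizer.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

// Greedily fill one packet, then report which functional units each packed
// instruction ended up using (only meaningful after setTrackResources(true)).
static void packAndReport(DFAPacketizer &Packetizer,
                          ArrayRef<MachineInstr *> Candidates) {
  Packetizer.clearResources();
  Packetizer.setTrackResources(true);

  SmallVector<MachineInstr *, 4> Packet;
  for (MachineInstr *MI : Candidates) {
    if (!Packetizer.canReserveResources(*MI))
      break;                          // the current packet is full
    Packetizer.reserveResources(*MI); // commit MI to the packet
    Packet.push_back(MI);
  }

  for (unsigned I = 0, E = Packet.size(); I != E; ++I)
    errs() << "slot " << I << " uses FU mask "
           << Packetizer.getUsedResources(I) << "\n";
}
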
+ void takeValues(DIEValueList &Other) { List.takeNodes(Other.List); } + value_range values() { return make_range(value_iterator(List.begin()), value_iterator(List.end())); } diff --git a/include/llvm/CodeGen/FastISel.h b/include/llvm/CodeGen/FastISel.h index f09b59daf4dd..03d681feb7aa 100644 --- a/include/llvm/CodeGen/FastISel.h +++ b/include/llvm/CodeGen/FastISel.h @@ -93,9 +93,9 @@ public: SmallVector OutVals; SmallVector OutFlags; - SmallVector OutRegs; + SmallVector OutRegs; SmallVector Ins; - SmallVector InRegs; + SmallVector InRegs; CallLoweringInfo() : RetSExt(false), RetZExt(false), IsVarArg(false), IsInReg(false), diff --git a/include/llvm/CodeGen/FunctionLoweringInfo.h b/include/llvm/CodeGen/FunctionLoweringInfo.h index fb60191abd3a..f812a2f6c585 100644 --- a/include/llvm/CodeGen/FunctionLoweringInfo.h +++ b/include/llvm/CodeGen/FunctionLoweringInfo.h @@ -20,7 +20,6 @@ #include "llvm/ADT/Optional.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" -#include "llvm/Analysis/LegacyDivergenceAnalysis.h" #include "llvm/CodeGen/ISDOpcodes.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/TargetRegisterInfo.h" @@ -37,6 +36,7 @@ namespace llvm { class Argument; class BasicBlock; class BranchProbabilityInfo; +class LegacyDivergenceAnalysis; class Function; class Instruction; class MachineFunction; diff --git a/include/llvm/CodeGen/GlobalISel/CallLowering.h b/include/llvm/CodeGen/GlobalISel/CallLowering.h index d717121ad78e..4901a3748e4a 100644 --- a/include/llvm/CodeGen/GlobalISel/CallLowering.h +++ b/include/llvm/CodeGen/GlobalISel/CallLowering.h @@ -45,18 +45,62 @@ class CallLowering { public: struct ArgInfo { SmallVector Regs; + // If the argument had to be split into multiple parts according to the + // target calling convention, then this contains the original vregs + // if the argument was an incoming arg. + SmallVector OrigRegs; Type *Ty; - ISD::ArgFlagsTy Flags; + SmallVector Flags; bool IsFixed; ArgInfo(ArrayRef Regs, Type *Ty, - ISD::ArgFlagsTy Flags = ISD::ArgFlagsTy{}, bool IsFixed = true) - : Regs(Regs.begin(), Regs.end()), Ty(Ty), Flags(Flags), - IsFixed(IsFixed) { + ArrayRef Flags = ArrayRef(), + bool IsFixed = true) + : Regs(Regs.begin(), Regs.end()), Ty(Ty), + Flags(Flags.begin(), Flags.end()), IsFixed(IsFixed) { + if (!Regs.empty() && Flags.empty()) + this->Flags.push_back(ISD::ArgFlagsTy()); // FIXME: We should have just one way of saying "no register". assert((Ty->isVoidTy() == (Regs.empty() || Regs[0] == 0)) && "only void types should have no register"); } + + ArgInfo() : Ty(nullptr), IsFixed(false) {} + }; + + struct CallLoweringInfo { + /// Calling convention to be used for the call. + CallingConv::ID CallConv = CallingConv::C; + + /// Destination of the call. It should be either a register, globaladdress, + /// or externalsymbol. + MachineOperand Callee = MachineOperand::CreateImm(0); + + /// Descriptor for the return type of the function. + ArgInfo OrigRet; + + /// List of descriptors of the arguments passed to the function. + SmallVector OrigArgs; + + /// Valid if the call has a swifterror inout parameter, and contains the + /// vreg that the swifterror should be copied into after the call. + Register SwiftErrorVReg = 0; + + MDNode *KnownCallees = nullptr; + + /// True if the call must be tail call optimized. + bool IsMustTailCall = false; + + /// True if the call passes all target-independent checks for tail call + /// optimization. + bool IsTailCall = false; + + /// True if the call was lowered as a tail call. 
This is consumed by the + /// legalizer. This allows the legalizer to lower libcalls as tail calls. + bool LoweredTailCall = false; + + /// True if the call is to a vararg function. + bool IsVarArg = false; }; /// Argument handling is mostly uniform between the four places that @@ -72,9 +116,9 @@ public: virtual ~ValueHandler() = default; - /// Returns true if the handler is dealing with formal arguments, - /// not with return values etc. - virtual bool isArgumentHandler() const { return false; } + /// Returns true if the handler is dealing with incoming arguments, + /// i.e. those that move values from some physical location to vregs. + virtual bool isIncomingArgumentHandler() const = 0; /// Materialize a VReg containing the address of the specified /// stack-based object. This is either based on a FrameIndex or @@ -112,8 +156,8 @@ public: virtual bool assignArg(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, const ArgInfo &Info, - CCState &State) { - return AssignFn(ValNo, ValVT, LocVT, LocInfo, Info.Flags, State); + ISD::ArgFlagsTy Flags, CCState &State) { + return AssignFn(ValNo, ValVT, LocVT, LocInfo, Flags, State); } MachineIRBuilder &MIRBuilder; @@ -162,12 +206,42 @@ protected: /// \p Callback to move them to the assigned locations. /// /// \return True if everything has succeeded, false otherwise. - bool handleAssignments(MachineIRBuilder &MIRBuilder, ArrayRef Args, + bool handleAssignments(MachineIRBuilder &MIRBuilder, + SmallVectorImpl &Args, ValueHandler &Handler) const; bool handleAssignments(CCState &CCState, SmallVectorImpl &ArgLocs, - MachineIRBuilder &MIRBuilder, ArrayRef Args, + MachineIRBuilder &MIRBuilder, + SmallVectorImpl &Args, ValueHandler &Handler) const; + + /// Analyze passed or returned values from a call, supplied in \p ArgInfo, + /// incorporating info about the passed values into \p CCState. + /// + /// Used to check if arguments are suitable for tail call lowering. + bool analyzeArgInfo(CCState &CCState, SmallVectorImpl &Args, + CCAssignFn &AssignFnFixed, + CCAssignFn &AssignFnVarArg) const; + + /// \returns True if the calling convention for a callee and its caller pass + /// results in the same way. Typically used for tail call eligibility checks. + /// + /// \p Info is the CallLoweringInfo for the call. + /// \p MF is the MachineFunction for the caller. + /// \p InArgs contains the results of the call. + /// \p CalleeAssignFnFixed is the CCAssignFn to be used for the callee for + /// fixed arguments. + /// \p CalleeAssignFnVarArg is similar, but for varargs. + /// \p CallerAssignFnFixed is the CCAssignFn to be used for the caller for + /// fixed arguments. + /// \p CallerAssignFnVarArg is similar, but for varargs. + bool resultsCompatible(CallLoweringInfo &Info, MachineFunction &MF, + SmallVectorImpl &InArgs, + CCAssignFn &CalleeAssignFnFixed, + CCAssignFn &CalleeAssignFnVarArg, + CCAssignFn &CallerAssignFnFixed, + CCAssignFn &CallerAssignFnVarArg) const; + public: CallLowering(const TargetLowering *TLI) : TLI(TLI) {} virtual ~CallLowering() = default; @@ -223,37 +297,10 @@ public: /// This hook must be implemented to lower the given call instruction, /// including argument and return value marshalling. /// - /// \p CallConv is the calling convention to be used for the call. - /// - /// \p Callee is the destination of the call. It should be either a register, - /// globaladdress, or externalsymbol. - /// - /// \p OrigRet is a descriptor for the return type of the function. 
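
Illustrative sketch (not part of the imported patch) of how a target might fill in the consolidated CallLoweringInfo bundle that replaces the long lowerCall() parameter list; the "memcpy" symbol and the register/type values are placeholders.

#include "llvm/CodeGen/GlobalISel/CallLowering.h"
#include "llvm/CodeGen/MachineOperand.h"

using namespace llvm;

static bool emitSimpleLibcall(CallLowering &CL, MachineIRBuilder &MIRBuilder,
                              Register RetReg, Type *RetTy, Register ArgReg,
                              Type *ArgTy) {
  CallLowering::CallLoweringInfo Info;
  Info.CallConv = CallingConv::C;
  Info.Callee = MachineOperand::CreateES("memcpy"); // externalsymbol callee
  Info.OrigRet = CallLowering::ArgInfo({RetReg}, RetTy);
  Info.OrigArgs.push_back(CallLowering::ArgInfo({ArgReg}, ArgTy));
  Info.IsTailCall = false;
  return CL.lowerCall(MIRBuilder, Info);
}

Compared with the removed overloads, everything a backend needs for a call, including the swifterror vreg and the tail-call flags, now travels in one structure.
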
- /// - /// \p OrigArgs is a list of descriptors of the arguments passed to the - /// function. - /// - /// \p SwiftErrorVReg is non-zero if the call has a swifterror inout - /// parameter, and contains the vreg that the swifterror should be copied into - /// after the call. /// /// \return true if the lowering succeeded, false otherwise. - virtual bool lowerCall(MachineIRBuilder &MIRBuilder, CallingConv::ID CallConv, - const MachineOperand &Callee, const ArgInfo &OrigRet, - ArrayRef OrigArgs, - Register SwiftErrorVReg) const { - if (!supportSwiftError()) { - assert(SwiftErrorVReg == 0 && "trying to use unsupported swifterror"); - return lowerCall(MIRBuilder, CallConv, Callee, OrigRet, OrigArgs); - } - return false; - } - - /// This hook behaves as the extended lowerCall function, but for targets that - /// do not support swifterror value promotion. - virtual bool lowerCall(MachineIRBuilder &MIRBuilder, CallingConv::ID CallConv, - const MachineOperand &Callee, const ArgInfo &OrigRet, - ArrayRef OrigArgs) const { + virtual bool lowerCall(MachineIRBuilder &MIRBuilder, + CallLoweringInfo &Info) const { return false; } diff --git a/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/include/llvm/CodeGen/GlobalISel/CombinerHelper.h index 0c50c9c5e0cf..4c04dc52547d 100644 --- a/include/llvm/CodeGen/GlobalISel/CombinerHelper.h +++ b/include/llvm/CodeGen/GlobalISel/CombinerHelper.h @@ -27,6 +27,8 @@ class MachineIRBuilder; class MachineRegisterInfo; class MachineInstr; class MachineOperand; +class GISelKnownBits; +class MachineDominatorTree; struct PreferredTuple { LLT Ty; // The result type of the extend. @@ -35,12 +37,17 @@ struct PreferredTuple { }; class CombinerHelper { +protected: MachineIRBuilder &Builder; MachineRegisterInfo &MRI; GISelChangeObserver &Observer; + GISelKnownBits *KB; + MachineDominatorTree *MDT; public: - CombinerHelper(GISelChangeObserver &Observer, MachineIRBuilder &B); + CombinerHelper(GISelChangeObserver &Observer, MachineIRBuilder &B, + GISelKnownBits *KB = nullptr, + MachineDominatorTree *MDT = nullptr); /// MachineRegisterInfo::replaceRegWith() and inform the observer of the changes void replaceRegWith(MachineRegisterInfo &MRI, Register FromReg, Register ToReg) const; @@ -56,18 +63,132 @@ public: bool matchCombineCopy(MachineInstr &MI); void applyCombineCopy(MachineInstr &MI); + /// Returns true if \p DefMI precedes \p UseMI or they are the same + /// instruction. Both must be in the same basic block. + bool isPredecessor(MachineInstr &DefMI, MachineInstr &UseMI); + + /// Returns true if \p DefMI dominates \p UseMI. By definition an + /// instruction dominates itself. + /// + /// If we haven't been provided with a MachineDominatorTree during + /// construction, this function returns a conservative result that tracks just + /// a single basic block. + bool dominates(MachineInstr &DefMI, MachineInstr &UseMI); + /// If \p MI is extend that consumes the result of a load, try to combine it. /// Returns true if MI changed. bool tryCombineExtendingLoads(MachineInstr &MI); bool matchCombineExtendingLoads(MachineInstr &MI, PreferredTuple &MatchInfo); void applyCombineExtendingLoads(MachineInstr &MI, PreferredTuple &MatchInfo); - bool matchCombineBr(MachineInstr &MI); - bool tryCombineBr(MachineInstr &MI); + /// Combine \p MI into a pre-indexed or post-indexed load/store operation if + /// legal and the surrounding code makes it useful. 
+ bool tryCombineIndexedLoadStore(MachineInstr &MI); + + bool matchElideBrByInvertingCond(MachineInstr &MI); + void applyElideBrByInvertingCond(MachineInstr &MI); + bool tryElideBrByInvertingCond(MachineInstr &MI); + + /// If \p MI is G_CONCAT_VECTORS, try to combine it. + /// Returns true if MI changed. + /// Right now, we support: + /// - concat_vector(undef, undef) => undef + /// - concat_vector(build_vector(A, B), build_vector(C, D)) => + /// build_vector(A, B, C, D) + /// + /// \pre MI.getOpcode() == G_CONCAT_VECTORS. + bool tryCombineConcatVectors(MachineInstr &MI); + /// Check if the G_CONCAT_VECTORS \p MI is undef or if it + /// can be flattened into a build_vector. + /// In the first case \p IsUndef will be true. + /// In the second case \p Ops will contain the operands needed + /// to produce the flattened build_vector. + /// + /// \pre MI.getOpcode() == G_CONCAT_VECTORS. + bool matchCombineConcatVectors(MachineInstr &MI, bool &IsUndef, + SmallVectorImpl &Ops); + /// Replace \p MI with a flattened build_vector with \p Ops or an + /// implicit_def if IsUndef is true. + void applyCombineConcatVectors(MachineInstr &MI, bool IsUndef, + const ArrayRef Ops); + + /// Try to combine G_SHUFFLE_VECTOR into G_CONCAT_VECTORS. + /// Returns true if MI changed. + /// + /// \pre MI.getOpcode() == G_SHUFFLE_VECTOR. + bool tryCombineShuffleVector(MachineInstr &MI); + /// Check if the G_SHUFFLE_VECTOR \p MI can be replaced by a + /// concat_vectors. + /// \p Ops will contain the operands needed to produce the flattened + /// concat_vectors. + /// + /// \pre MI.getOpcode() == G_SHUFFLE_VECTOR. + bool matchCombineShuffleVector(MachineInstr &MI, + SmallVectorImpl &Ops); + /// Replace \p MI with a concat_vectors with \p Ops. + void applyCombineShuffleVector(MachineInstr &MI, + const ArrayRef Ops); + + /// Optimize memcpy intrinsics et al, e.g. constant len calls. + /// /p MaxLen if non-zero specifies the max length of a mem libcall to inline. + /// + /// For example (pre-indexed): + /// + /// $addr = G_GEP $base, $offset + /// [...] + /// $val = G_LOAD $addr + /// [...] + /// $whatever = COPY $addr + /// + /// --> + /// + /// $val, $addr = G_INDEXED_LOAD $base, $offset, 1 (IsPre) + /// [...] + /// $whatever = COPY $addr + /// + /// or (post-indexed): + /// + /// G_STORE $val, $base + /// [...] + /// $addr = G_GEP $base, $offset + /// [...] + /// $whatever = COPY $addr + /// + /// --> + /// + /// $addr = G_INDEXED_STORE $val, $base, $offset + /// [...] + /// $whatever = COPY $addr + bool tryCombineMemCpyFamily(MachineInstr &MI, unsigned MaxLen = 0); /// Try to transform \p MI by using all of the above /// combine functions. Returns true if changed. bool tryCombine(MachineInstr &MI); + +private: + // Memcpy family optimization helpers. + bool optimizeMemcpy(MachineInstr &MI, Register Dst, Register Src, + unsigned KnownLen, unsigned DstAlign, unsigned SrcAlign, + bool IsVolatile); + bool optimizeMemmove(MachineInstr &MI, Register Dst, Register Src, + unsigned KnownLen, unsigned DstAlign, unsigned SrcAlign, + bool IsVolatile); + bool optimizeMemset(MachineInstr &MI, Register Dst, Register Val, + unsigned KnownLen, unsigned DstAlign, bool IsVolatile); + + /// Given a non-indexed load or store instruction \p MI, find an offset that + /// can be usefully and legally folded into it as a post-indexing operation. + /// + /// \returns true if a candidate is found. 
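
Hedged sketch (not part of the imported patch) of how a target combiner might dispatch into the helpers declared above; the KB/MDT pointers and the opcode selection are assumed context, not a prescribed pattern.

#include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/TargetOpcodes.h"

using namespace llvm;

static bool combineOne(GISelChangeObserver &Observer, MachineIRBuilder &B,
                       MachineInstr &MI, GISelKnownBits *KB,
                       MachineDominatorTree *MDT) {
  CombinerHelper Helper(Observer, B, KB, MDT);
  switch (MI.getOpcode()) {
  case TargetOpcode::G_CONCAT_VECTORS:
    return Helper.tryCombineConcatVectors(MI);
  case TargetOpcode::G_SHUFFLE_VECTOR:
    return Helper.tryCombineShuffleVector(MI);
  case TargetOpcode::G_LOAD:
  case TargetOpcode::G_STORE:
    // Fold a constant-offset address computation into a pre-/post-indexed
    // access where the target supports it.
    return Helper.tryCombineIndexedLoadStore(MI);
  default:
    return Helper.tryCombine(MI); // e.g. the extending-load combine
  }
}
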
+ bool findPostIndexCandidate(MachineInstr &MI, Register &Addr, Register &Base, + Register &Offset); + + /// Given a non-indexed load or store instruction \p MI, find an offset that + /// can be usefully and legally folded into it as a pre-indexing operation. + /// + /// \returns true if a candidate is found. + bool findPreIndexCandidate(MachineInstr &MI, Register &Addr, Register &Base, + Register &Offset); }; } // namespace llvm diff --git a/include/llvm/CodeGen/GlobalISel/CombinerInfo.h b/include/llvm/CodeGen/GlobalISel/CombinerInfo.h index 3b09a8e2b479..ad645a46bbe6 100644 --- a/include/llvm/CodeGen/GlobalISel/CombinerInfo.h +++ b/include/llvm/CodeGen/GlobalISel/CombinerInfo.h @@ -27,9 +27,11 @@ class MachineRegisterInfo; class CombinerInfo { public: CombinerInfo(bool AllowIllegalOps, bool ShouldLegalizeIllegal, - LegalizerInfo *LInfo) + LegalizerInfo *LInfo, bool OptEnabled, bool OptSize, + bool MinSize) : IllegalOpsAllowed(AllowIllegalOps), - LegalizeIllegalOps(ShouldLegalizeIllegal), LInfo(LInfo) { + LegalizeIllegalOps(ShouldLegalizeIllegal), LInfo(LInfo), + EnableOpt(OptEnabled), EnableOptSize(OptSize), EnableMinSize(MinSize) { assert(((AllowIllegalOps || !LegalizeIllegalOps) || LInfo) && "Expecting legalizerInfo when illegalops not allowed"); } @@ -43,6 +45,15 @@ public: bool LegalizeIllegalOps; // TODO: Make use of this. const LegalizerInfo *LInfo; + /// Whether optimizations should be enabled. This is to distinguish between + /// uses of the combiner unconditionally and only when optimizations are + /// specifically enabled/ + bool EnableOpt; + /// Whether we're optimizing for size. + bool EnableOptSize; + /// Whether we're optimizing for minsize (-Oz). + bool EnableMinSize; + /// Attempt to combine instructions using MI as the root. /// /// Use Observer to report the creation, modification, and erasure of diff --git a/include/llvm/CodeGen/GlobalISel/ConstantFoldingMIRBuilder.h b/include/llvm/CodeGen/GlobalISel/ConstantFoldingMIRBuilder.h index e817d9b4550e..df196bfbd437 100644 --- a/include/llvm/CodeGen/GlobalISel/ConstantFoldingMIRBuilder.h +++ b/include/llvm/CodeGen/GlobalISel/ConstantFoldingMIRBuilder.h @@ -54,6 +54,17 @@ public: return buildConstant(Dst, MaybeCst->getSExtValue()); break; } + case TargetOpcode::G_SEXT_INREG: { + assert(DstOps.size() == 1 && "Invalid dst ops"); + assert(SrcOps.size() == 2 && "Invalid src ops"); + const DstOp &Dst = DstOps[0]; + const SrcOp &Src0 = SrcOps[0]; + const SrcOp &Src1 = SrcOps[1]; + if (auto MaybeCst = + ConstantFoldExtOp(Opc, Src0.getReg(), Src1.getImm(), *getMRI())) + return buildConstant(Dst, MaybeCst->getSExtValue()); + break; + } } return MachineIRBuilder::buildInstr(Opc, DstOps, SrcOps); } diff --git a/include/llvm/CodeGen/GlobalISel/GISelKnownBits.h b/include/llvm/CodeGen/GlobalISel/GISelKnownBits.h new file mode 100644 index 000000000000..dfe5a7f3177d --- /dev/null +++ b/include/llvm/CodeGen/GlobalISel/GISelKnownBits.h @@ -0,0 +1,111 @@ +//===- llvm/CodeGen/GlobalISel/GISelKnownBits.h ---------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// Provides analysis for querying information about KnownBits during GISel +/// passes. 
+// +//===----------------------------------------------------------------------===// +#ifndef LLVM_CODEGEN_GLOBALISEL_KNOWNBITSINFO_H +#define LLVM_CODEGEN_GLOBALISEL_KNOWNBITSINFO_H + +#include "llvm/CodeGen/GlobalISel/GISelChangeObserver.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/Register.h" +#include "llvm/IR/PassManager.h" +#include "llvm/InitializePasses.h" +#include "llvm/Pass.h" +#include "llvm/Support/KnownBits.h" + +namespace llvm { + +class TargetLowering; +class DataLayout; + +class GISelKnownBits : public GISelChangeObserver { + MachineFunction &MF; + MachineRegisterInfo &MRI; + const TargetLowering &TL; + const DataLayout &DL; + +public: + GISelKnownBits(MachineFunction &MF); + virtual ~GISelKnownBits() = default; + void setMF(MachineFunction &MF); + virtual void computeKnownBitsImpl(Register R, KnownBits &Known, + const APInt &DemandedElts, + unsigned Depth = 0); + + // KnownBitsAPI + KnownBits getKnownBits(Register R); + // Calls getKnownBits for first operand def of MI. + KnownBits getKnownBits(MachineInstr &MI); + APInt getKnownZeroes(Register R); + APInt getKnownOnes(Register R); + + /// \return true if 'V & Mask' is known to be zero in DemandedElts. We use + /// this predicate to simplify operations downstream. + /// Mask is known to be zero for bits that V cannot have. + bool maskedValueIsZero(Register Val, const APInt &Mask) { + return Mask.isSubsetOf(getKnownBits(Val).Zero); + } + + /// \return true if the sign bit of Op is known to be zero. We use this + /// predicate to simplify operations downstream. + bool signBitIsZero(Register Op); + + // FIXME: Is this the right place for G_FRAME_INDEX? Should it be in + // TargetLowering? + void computeKnownBitsForFrameIndex(Register R, KnownBits &Known, + const APInt &DemandedElts, + unsigned Depth = 0); + static Align inferAlignmentForFrameIdx(int FrameIdx, int Offset, + const MachineFunction &MF); + static void computeKnownBitsForAlignment(KnownBits &Known, + MaybeAlign Alignment); + + // Try to infer alignment for MI. + static MaybeAlign inferPtrAlignment(const MachineInstr &MI); + + // Observer API. No-op for non-caching implementation. + void erasingInstr(MachineInstr &MI) override{}; + void createdInstr(MachineInstr &MI) override{}; + void changingInstr(MachineInstr &MI) override{}; + void changedInstr(MachineInstr &MI) override{}; + +protected: + unsigned getMaxDepth() const { return 6; } +}; + +/// To use KnownBitsInfo analysis in a pass, +/// KnownBitsInfo &Info = getAnalysis().get(MF); +/// Add to observer if the Info is caching. +/// WrapperObserver.addObserver(Info); + +/// Eventually add other features such as caching/ser/deserializing +/// to MIR etc. Those implementations can derive from GISelKnownBits +/// and override computeKnownBitsImpl. 
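For illustration only, a minimal sketch of the usage pattern described in the comment above: a hypothetical MachineFunction pass (ExampleKnownBitsUser and its body are not part of this patch; only the GISelKnownBits/GISelKnownBitsAnalysis calls come from the header added here) that requests the wrapper and queries known bits for generic virtual registers.

#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
using namespace llvm;

namespace {
// Hypothetical consumer of the new analysis.
struct ExampleKnownBitsUser : public MachineFunctionPass {
  static char ID;
  ExampleKnownBitsUser() : MachineFunctionPass(ID) {}

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<GISelKnownBitsAnalysis>(); // request the wrapper pass
    AU.setPreservesAll();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

  bool runOnMachineFunction(MachineFunction &MF) override {
    GISelKnownBits &KB = getAnalysis<GISelKnownBitsAnalysis>().get(MF);
    MachineRegisterInfo &MRI = MF.getRegInfo();
    for (MachineBasicBlock &MBB : MF)
      for (MachineInstr &MI : MBB) {
        if (MI.getNumOperands() == 0 || !MI.getOperand(0).isReg())
          continue;
        Register Dst = MI.getOperand(0).getReg();
        // Only query generic virtual registers that carry an LLT.
        if (!Dst.isVirtual() || !MRI.getType(Dst).isValid())
          continue;
        // A combiner would use this to justify folds, e.g. rewriting a sign
        // extend as a zero extend when the sign bit is known to be clear.
        KnownBits Known = KB.getKnownBits(Dst);
        (void)Known;
      }
    return false; // analysis only; nothing was modified
  }
};
char ExampleKnownBitsUser::ID = 0;
} // end anonymous namespace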
+class GISelKnownBitsAnalysis : public MachineFunctionPass { + std::unique_ptr<GISelKnownBits> Info; + +public: + static char ID; + GISelKnownBitsAnalysis() : MachineFunctionPass(ID) { + initializeGISelKnownBitsAnalysisPass(*PassRegistry::getPassRegistry()); + } + GISelKnownBits &get(MachineFunction &MF) { + if (!Info) + Info = std::make_unique<GISelKnownBits>(MF); + return *Info.get(); + } + void getAnalysisUsage(AnalysisUsage &AU) const override; + bool runOnMachineFunction(MachineFunction &MF) override; + void releaseMemory() override { Info.reset(); } +}; +} // namespace llvm + +#endif // ifdef diff --git a/include/llvm/CodeGen/GlobalISel/IRTranslator.h b/include/llvm/CodeGen/GlobalISel/IRTranslator.h index 8654ba83f08d..bdb92aa4689d 100644 --- a/include/llvm/CodeGen/GlobalISel/IRTranslator.h +++ b/include/llvm/CodeGen/GlobalISel/IRTranslator.h @@ -213,8 +213,8 @@ private: bool translateStore(const User &U, MachineIRBuilder &MIRBuilder); /// Translate an LLVM string intrinsic (memcpy, memset, ...). - bool translateMemfunc(const CallInst &CI, MachineIRBuilder &MIRBuilder, - unsigned ID); + bool translateMemFunc(const CallInst &CI, MachineIRBuilder &MIRBuilder, + Intrinsic::ID ID); void getStackGuard(Register DstReg, MachineIRBuilder &MIRBuilder); @@ -243,6 +243,10 @@ private: bool valueIsSplit(const Value &V, SmallVectorImpl<uint64_t> *Offsets = nullptr); + /// Common code for translating normal calls or invokes. + bool translateCallSite(const ImmutableCallSite &CS, + MachineIRBuilder &MIRBuilder); + /// Translate call instruction. /// \pre \p U is a call instruction. bool translateCall(const User &U, MachineIRBuilder &MIRBuilder); @@ -514,6 +518,10 @@ private: // function has the optnone attribute. bool EnableOpts = false; + /// True when the block contains a tail call. This allows the IRTranslator to + /// stop translating such blocks early. + bool HasTailCall = false; + /// Switch analysis and optimization. class GISelSwitchLowering : public SwitchCG::SwitchLowering { public: diff --git a/include/llvm/CodeGen/GlobalISel/InstructionSelector.h b/include/llvm/CodeGen/GlobalISel/InstructionSelector.h index e9b93be76754..fd3dc743000b 100644 --- a/include/llvm/CodeGen/GlobalISel/InstructionSelector.h +++ b/include/llvm/CodeGen/GlobalISel/InstructionSelector.h @@ -31,6 +31,7 @@ namespace llvm { class APInt; class APFloat; +class GISelKnownBits; class MachineInstr; class MachineInstrBuilder; class MachineFunction; @@ -148,6 +149,13 @@ enum { /// - AddrSpaceN+1 ... GIM_CheckMemoryAddressSpace, + /// Check the minimum alignment of the memory access for the given machine + /// memory operand. + /// - InsnID - Instruction ID + /// - MMOIdx - MMO index + /// - MinAlign - Minimum acceptable alignment + GIM_CheckMemoryAlignment, + /// Check the size of the memory access for the given machine memory operand /// against the size of an operand. /// - InsnID - Instruction ID @@ -201,11 +209,22 @@ enum { /// - Expected Intrinsic ID GIM_CheckIntrinsicID, + /// Check the operand is a specific predicate + /// - InsnID - Instruction ID + /// - OpIdx - Operand index + /// - Expected predicate + GIM_CheckCmpPredicate, + /// Check the specified operand is an MBB /// - InsnID - Instruction ID /// - OpIdx - Operand index GIM_CheckIsMBB, + /// Check the specified operand is an Imm + /// - InsnID - Instruction ID + /// - OpIdx - Operand index + GIM_CheckIsImm, + /// Check if the specified operand is safe to fold into the current /// instruction.
/// - InsnID - Instruction ID @@ -365,7 +384,20 @@ public: /// if returns true: /// for I in all mutated/inserted instructions: /// !isPreISelGenericOpcode(I.getOpcode()) - virtual bool select(MachineInstr &I, CodeGenCoverage &CoverageInfo) const = 0; + virtual bool select(MachineInstr &I) = 0; + + CodeGenCoverage *CoverageInfo = nullptr; + GISelKnownBits *KnownBits = nullptr; + MachineFunction *MF = nullptr; + + /// Setup per-MF selector state. + virtual void setupMF(MachineFunction &mf, + GISelKnownBits &KB, + CodeGenCoverage &covinfo) { + CoverageInfo = &covinfo; + KnownBits = &KB; + MF = &mf; + } protected: using ComplexRendererFns = diff --git a/include/llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h b/include/llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h index e8ee4af0cb0b..08f2f54bcf90 100644 --- a/include/llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h +++ b/include/llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h @@ -98,7 +98,7 @@ bool InstructionSelector::executeMatchTable( return false; break; } - if (TRI.isPhysicalRegister(MO.getReg())) { + if (Register::isPhysicalRegister(MO.getReg())) { DEBUG_WITH_TYPE(TgtInstructionSelector::getName(), dbgs() << CurrentIdx << ": Is a physical register\n"); if (handleReject() == RejectAndGiveUp) @@ -409,6 +409,30 @@ bool InstructionSelector::executeMatchTable( return false; break; } + case GIM_CheckMemoryAlignment: { + int64_t InsnID = MatchTable[CurrentIdx++]; + int64_t MMOIdx = MatchTable[CurrentIdx++]; + unsigned MinAlign = MatchTable[CurrentIdx++]; + + assert(State.MIs[InsnID] != nullptr && "Used insn before defined"); + + if (State.MIs[InsnID]->getNumMemOperands() <= MMOIdx) { + if (handleReject() == RejectAndGiveUp) + return false; + break; + } + + MachineMemOperand *MMO + = *(State.MIs[InsnID]->memoperands_begin() + MMOIdx); + DEBUG_WITH_TYPE(TgtInstructionSelector::getName(), + dbgs() << CurrentIdx << ": GIM_CheckMemoryAlignment" + << "(MIs[" << InsnID << "]->memoperands() + " << MMOIdx + << ")->getAlignment() >= " << MinAlign << ")\n"); + if (MMO->getAlignment() < MinAlign && handleReject() == RejectAndGiveUp) + return false; + + break; + } case GIM_CheckMemorySizeEqualTo: { int64_t InsnID = MatchTable[CurrentIdx++]; int64_t MMOIdx = MatchTable[CurrentIdx++]; @@ -638,7 +662,21 @@ bool InstructionSelector::executeMatchTable( return false; break; } - + case GIM_CheckCmpPredicate: { + int64_t InsnID = MatchTable[CurrentIdx++]; + int64_t OpIdx = MatchTable[CurrentIdx++]; + int64_t Value = MatchTable[CurrentIdx++]; + DEBUG_WITH_TYPE(TgtInstructionSelector::getName(), + dbgs() << CurrentIdx << ": GIM_CheckCmpPredicate(MIs[" + << InsnID << "]->getOperand(" << OpIdx + << "), Value=" << Value << ")\n"); + assert(State.MIs[InsnID] != nullptr && "Used insn before defined"); + MachineOperand &MO = State.MIs[InsnID]->getOperand(OpIdx); + if (!MO.isPredicate() || MO.getPredicate() != Value) + if (handleReject() == RejectAndGiveUp) + return false; + break; + } case GIM_CheckIsMBB: { int64_t InsnID = MatchTable[CurrentIdx++]; int64_t OpIdx = MatchTable[CurrentIdx++]; @@ -652,7 +690,19 @@ bool InstructionSelector::executeMatchTable( } break; } - + case GIM_CheckIsImm: { + int64_t InsnID = MatchTable[CurrentIdx++]; + int64_t OpIdx = MatchTable[CurrentIdx++]; + DEBUG_WITH_TYPE(TgtInstructionSelector::getName(), + dbgs() << CurrentIdx << ": GIM_CheckIsImm(MIs[" << InsnID + << "]->getOperand(" << OpIdx << "))\n"); + assert(State.MIs[InsnID] != nullptr && "Used insn before defined"); + if (!State.MIs[InsnID]->getOperand(OpIdx).isImm()) { + if 
(handleReject() == RejectAndGiveUp) + return false; + } + break; + } case GIM_CheckIsSafeToFold: { int64_t InsnID = MatchTable[CurrentIdx++]; DEBUG_WITH_TYPE(TgtInstructionSelector::getName(), @@ -792,11 +842,13 @@ bool InstructionSelector::executeMatchTable( case GIR_AddRegister: { int64_t InsnID = MatchTable[CurrentIdx++]; int64_t RegNum = MatchTable[CurrentIdx++]; + uint64_t RegFlags = MatchTable[CurrentIdx++]; assert(OutMIs[InsnID] && "Attempted to add to undefined instruction"); - OutMIs[InsnID].addReg(RegNum); - DEBUG_WITH_TYPE(TgtInstructionSelector::getName(), - dbgs() << CurrentIdx << ": GIR_AddRegister(OutMIs[" - << InsnID << "], " << RegNum << ")\n"); + OutMIs[InsnID].addReg(RegNum, RegFlags); + DEBUG_WITH_TYPE( + TgtInstructionSelector::getName(), + dbgs() << CurrentIdx << ": GIR_AddRegister(OutMIs[" + << InsnID << "], " << RegNum << ", " << RegFlags << ")\n"); break; } diff --git a/include/llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h b/include/llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h index a22778b8848c..7f960e727846 100644 --- a/include/llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h +++ b/include/llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h @@ -47,8 +47,7 @@ public: bool tryCombineAnyExt(MachineInstr &MI, SmallVectorImpl &DeadInsts) { - if (MI.getOpcode() != TargetOpcode::G_ANYEXT) - return false; + assert(MI.getOpcode() == TargetOpcode::G_ANYEXT); Builder.setInstr(MI); Register DstReg = MI.getOperand(0).getReg(); @@ -93,9 +92,7 @@ public: bool tryCombineZExt(MachineInstr &MI, SmallVectorImpl &DeadInsts) { - - if (MI.getOpcode() != TargetOpcode::G_ZEXT) - return false; + assert(MI.getOpcode() == TargetOpcode::G_ZEXT); Builder.setInstr(MI); Register DstReg = MI.getOperand(0).getReg(); @@ -136,32 +133,24 @@ public: bool tryCombineSExt(MachineInstr &MI, SmallVectorImpl &DeadInsts) { - - if (MI.getOpcode() != TargetOpcode::G_SEXT) - return false; + assert(MI.getOpcode() == TargetOpcode::G_SEXT); Builder.setInstr(MI); Register DstReg = MI.getOperand(0).getReg(); Register SrcReg = lookThroughCopyInstrs(MI.getOperand(1).getReg()); - // sext(trunc x) - > ashr (shl (aext/copy/trunc x), c), c + // sext(trunc x) - > (sext_inreg (aext/copy/trunc x), c) Register TruncSrc; if (mi_match(SrcReg, MRI, m_GTrunc(m_Reg(TruncSrc)))) { LLT DstTy = MRI.getType(DstReg); - // Guess on the RHS shift amount type, which should be re-legalized if - // applicable. - if (isInstUnsupported({TargetOpcode::G_SHL, {DstTy, DstTy}}) || - isInstUnsupported({TargetOpcode::G_ASHR, {DstTy, DstTy}}) || - isConstantUnsupported(DstTy)) + if (isInstUnsupported({TargetOpcode::G_SEXT_INREG, {DstTy}})) return false; LLVM_DEBUG(dbgs() << ".. 
Combine MI: " << MI;); LLT SrcTy = MRI.getType(SrcReg); - unsigned ShAmt = DstTy.getScalarSizeInBits() - SrcTy.getScalarSizeInBits(); - auto MIBShAmt = Builder.buildConstant(DstTy, ShAmt); - auto MIBShl = Builder.buildInstr( - TargetOpcode::G_SHL, {DstTy}, - {Builder.buildAnyExtOrTrunc(DstTy, TruncSrc), MIBShAmt}); - Builder.buildInstr(TargetOpcode::G_ASHR, {DstReg}, {MIBShl, MIBShAmt}); + uint64_t SizeInBits = SrcTy.getScalarSizeInBits(); + Builder.buildInstr( + TargetOpcode::G_SEXT_INREG, {DstReg}, + {Builder.buildAnyExtOrTrunc(DstTy, TruncSrc), SizeInBits}); markInstAndDefDead(MI, *MRI.getVRegDef(SrcReg), DeadInsts); return true; } @@ -172,9 +161,8 @@ public: bool tryFoldImplicitDef(MachineInstr &MI, SmallVectorImpl &DeadInsts) { unsigned Opcode = MI.getOpcode(); - if (Opcode != TargetOpcode::G_ANYEXT && Opcode != TargetOpcode::G_ZEXT && - Opcode != TargetOpcode::G_SEXT) - return false; + assert(Opcode == TargetOpcode::G_ANYEXT || Opcode == TargetOpcode::G_ZEXT || + Opcode == TargetOpcode::G_SEXT); if (MachineInstr *DefMI = getOpcodeDef(TargetOpcode::G_IMPLICIT_DEF, MI.getOperand(1).getReg(), MRI)) { @@ -203,21 +191,38 @@ public: return false; } - static unsigned getMergeOpcode(LLT OpTy, LLT DestTy) { + static unsigned canFoldMergeOpcode(unsigned MergeOp, unsigned ConvertOp, + LLT OpTy, LLT DestTy) { if (OpTy.isVector() && DestTy.isVector()) - return TargetOpcode::G_CONCAT_VECTORS; + return MergeOp == TargetOpcode::G_CONCAT_VECTORS; + + if (OpTy.isVector() && !DestTy.isVector()) { + if (MergeOp == TargetOpcode::G_BUILD_VECTOR) + return true; - if (OpTy.isVector() && !DestTy.isVector()) - return TargetOpcode::G_BUILD_VECTOR; + if (MergeOp == TargetOpcode::G_CONCAT_VECTORS) { + if (ConvertOp == 0) + return true; - return TargetOpcode::G_MERGE_VALUES; + const unsigned OpEltSize = OpTy.getElementType().getSizeInBits(); + + // Don't handle scalarization with a cast that isn't in the same + // direction as the vector cast. This could be handled, but it would + // require more intermediate unmerges. + if (ConvertOp == TargetOpcode::G_TRUNC) + return DestTy.getSizeInBits() <= OpEltSize; + return DestTy.getSizeInBits() >= OpEltSize; + } + + return false; + } + + return MergeOp == TargetOpcode::G_MERGE_VALUES; } bool tryCombineMerges(MachineInstr &MI, SmallVectorImpl &DeadInsts) { - - if (MI.getOpcode() != TargetOpcode::G_UNMERGE_VALUES) - return false; + assert(MI.getOpcode() == TargetOpcode::G_UNMERGE_VALUES); unsigned NumDefs = MI.getNumOperands() - 1; MachineInstr *SrcDef = @@ -237,16 +242,14 @@ public: MergeI = getDefIgnoringCopies(SrcDef->getOperand(1).getReg(), MRI); } - // FIXME: Handle scalarizing concat_vectors (scalar result type with vector - // source) - unsigned MergingOpcode = getMergeOpcode(OpTy, DestTy); - if (!MergeI || MergeI->getOpcode() != MergingOpcode) + if (!MergeI || !canFoldMergeOpcode(MergeI->getOpcode(), + ConvertOp, OpTy, DestTy)) return false; const unsigned NumMergeRegs = MergeI->getNumOperands() - 1; if (NumMergeRegs < NumDefs) { - if (ConvertOp != 0 || NumDefs % NumMergeRegs != 0) + if (NumDefs % NumMergeRegs != 0) return false; Builder.setInstr(MI); @@ -264,7 +267,22 @@ public: ++j, ++DefIdx) DstRegs.push_back(MI.getOperand(DefIdx).getReg()); - Builder.buildUnmerge(DstRegs, MergeI->getOperand(Idx + 1).getReg()); + if (ConvertOp) { + SmallVector TmpRegs; + // This is a vector that is being scalarized and casted. Extract to + // the element type, and do the conversion on the scalars. 
+ LLT MergeEltTy + = MRI.getType(MergeI->getOperand(0).getReg()).getElementType(); + for (unsigned j = 0; j < NumMergeRegs; ++j) + TmpRegs.push_back(MRI.createGenericVirtualRegister(MergeEltTy)); + + Builder.buildUnmerge(TmpRegs, MergeI->getOperand(Idx + 1).getReg()); + + for (unsigned j = 0; j < NumMergeRegs; ++j) + Builder.buildInstr(ConvertOp, {DstRegs[j]}, {TmpRegs[j]}); + } else { + Builder.buildUnmerge(DstRegs, MergeI->getOperand(Idx + 1).getReg()); + } } } else if (NumMergeRegs > NumDefs) { diff --git a/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h b/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h index a0f21e8b19d7..fbfe71255a38 100644 --- a/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h +++ b/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h @@ -200,6 +200,13 @@ public: LegalizeResult moreElementsVectorPhi(MachineInstr &MI, unsigned TypeIdx, LLT MoreTy); + LegalizeResult fewerElementsVectorUnmergeValues(MachineInstr &MI, + unsigned TypeIdx, + LLT NarrowTy); + LegalizeResult fewerElementsVectorBuildVector(MachineInstr &MI, + unsigned TypeIdx, + LLT NarrowTy); + LegalizeResult reduceLoadStoreWidth(MachineInstr &MI, unsigned TypeIdx, LLT NarrowTy); @@ -219,9 +226,17 @@ public: LegalizeResult lowerU64ToF32BitOps(MachineInstr &MI); LegalizeResult lowerUITOFP(MachineInstr &MI, unsigned TypeIdx, LLT Ty); LegalizeResult lowerSITOFP(MachineInstr &MI, unsigned TypeIdx, LLT Ty); + LegalizeResult lowerFPTOUI(MachineInstr &MI, unsigned TypeIdx, LLT Ty); LegalizeResult lowerMinMax(MachineInstr &MI, unsigned TypeIdx, LLT Ty); LegalizeResult lowerFCopySign(MachineInstr &MI, unsigned TypeIdx, LLT Ty); LegalizeResult lowerFMinNumMaxNum(MachineInstr &MI); + LegalizeResult lowerFMad(MachineInstr &MI); + LegalizeResult lowerUnmergeValues(MachineInstr &MI); + LegalizeResult lowerShuffleVector(MachineInstr &MI); + LegalizeResult lowerDynStackAlloc(MachineInstr &MI); + LegalizeResult lowerExtract(MachineInstr &MI); + LegalizeResult lowerInsert(MachineInstr &MI); + LegalizeResult lowerSADDO_SSUBO(MachineInstr &MI); private: MachineRegisterInfo &MRI; @@ -236,6 +251,11 @@ createLibcall(MachineIRBuilder &MIRBuilder, RTLIB::Libcall Libcall, const CallLowering::ArgInfo &Result, ArrayRef Args); +/// Create a libcall to memcpy et al. +LegalizerHelper::LegalizeResult createMemLibcall(MachineIRBuilder &MIRBuilder, + MachineRegisterInfo &MRI, + MachineInstr &MI); + } // End namespace llvm. #endif diff --git a/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h b/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h index 513c98f2d23f..1cf62d1fde59 100644 --- a/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h +++ b/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h @@ -331,6 +331,8 @@ class LegalizeRuleSet { /// individually handled. 
SmallBitVector TypeIdxsCovered{MCOI::OPERAND_LAST_GENERIC - MCOI::OPERAND_FIRST_GENERIC + 2}; + SmallBitVector ImmIdxsCovered{MCOI::OPERAND_LAST_GENERIC_IMM - + MCOI::OPERAND_FIRST_GENERIC_IMM + 2}; #endif unsigned typeIdx(unsigned TypeIdx) { @@ -342,9 +344,21 @@ class LegalizeRuleSet { #endif return TypeIdx; } - void markAllTypeIdxsAsCovered() { + + unsigned immIdx(unsigned ImmIdx) { + assert(ImmIdx <= (MCOI::OPERAND_LAST_GENERIC_IMM - + MCOI::OPERAND_FIRST_GENERIC_IMM) && + "Imm Index is out of bounds"); +#ifndef NDEBUG + ImmIdxsCovered.set(ImmIdx); +#endif + return ImmIdx; + } + + void markAllIdxsAsCovered() { #ifndef NDEBUG TypeIdxsCovered.set(); + ImmIdxsCovered.set(); #endif } @@ -403,6 +417,15 @@ class LegalizeRuleSet { return actionIf(Action, typePairInSet(typeIdx(0), typeIdx(1), Types), Mutation); } + /// Use the given action when type index 0 is any type in the given list and + /// imm index 0 is anything. Action should not be an action that requires + /// mutation. + LegalizeRuleSet &actionForTypeWithAnyImm(LegalizeAction Action, + std::initializer_list Types) { + using namespace LegalityPredicates; + immIdx(0); // Inform verifier imm idx 0 is handled. + return actionIf(Action, typeInSet(typeIdx(0), Types)); + } /// Use the given action when type indexes 0 and 1 are both in the given list. /// That is, the type pair is in the cartesian product of the list. /// Action should not be an action that requires mutation. @@ -454,7 +477,7 @@ public: LegalizeRuleSet &legalIf(LegalityPredicate Predicate) { // We have no choice but conservatively assume that the free-form // user-provided Predicate properly handles all type indices: - markAllTypeIdxsAsCovered(); + markAllIdxsAsCovered(); return actionIf(LegalizeAction::Legal, Predicate); } /// The instruction is legal when type index 0 is any type in the given list. @@ -466,6 +489,12 @@ public: LegalizeRuleSet &legalFor(std::initializer_list> Types) { return actionFor(LegalizeAction::Legal, Types); } + /// The instruction is legal when type index 0 is any type in the given list + /// and imm index 0 is anything. + LegalizeRuleSet &legalForTypeWithAnyImm(std::initializer_list Types) { + markAllIdxsAsCovered(); + return actionForTypeWithAnyImm(LegalizeAction::Legal, Types); + } /// The instruction is legal when type indexes 0 and 1 along with the memory /// size and minimum alignment is any type and size tuple in the given list. LegalizeRuleSet &legalForTypesWithMemDesc( @@ -497,7 +526,7 @@ public: LegalizeRuleSet &alwaysLegal() { using namespace LegalizeMutations; - markAllTypeIdxsAsCovered(); + markAllIdxsAsCovered(); return actionIf(LegalizeAction::Legal, always); } @@ -506,7 +535,7 @@ public: using namespace LegalizeMutations; // We have no choice but conservatively assume that predicate-less lowering // properly handles all type indices by design: - markAllTypeIdxsAsCovered(); + markAllIdxsAsCovered(); return actionIf(LegalizeAction::Lower, always); } /// The instruction is lowered if predicate is true. Keep type index 0 as the @@ -515,7 +544,7 @@ public: using namespace LegalizeMutations; // We have no choice but conservatively assume that lowering with a // free-form user provided Predicate properly handles all type indices: - markAllTypeIdxsAsCovered(); + markAllIdxsAsCovered(); return actionIf(LegalizeAction::Lower, Predicate); } /// The instruction is lowered if predicate is true. 
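As a rough illustration of the imm-aware rules added above, this is how a target's legalizer info might declare G_SEXT_INREG legal for any immediate width; the surrounding class and type choices are hypothetical, only legalForTypeWithAnyImm and the usual builder calls come from this header.

#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/Support/LowLevelTypeImpl.h"
using namespace llvm;

// Hypothetical legalizer-info fragment: G_SEXT_INREG has one type index and
// one immediate operand (the source width), so the rule only needs to
// enumerate the types and can accept any immediate.
struct ExampleLegalizerInfo : public LegalizerInfo {
  ExampleLegalizerInfo() {
    const LLT s32 = LLT::scalar(32);
    const LLT s64 = LLT::scalar(64);
    getActionDefinitionsBuilder(TargetOpcode::G_SEXT_INREG)
        .legalForTypeWithAnyImm({s32, s64}) // legal for any immediate width
        .lower();                           // everything else gets lowered
    computeTables();
  }
};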
@@ -523,7 +552,7 @@ public: LegalizeMutation Mutation) { // We have no choice but conservatively assume that lowering with a // free-form user provided Predicate properly handles all type indices: - markAllTypeIdxsAsCovered(); + markAllIdxsAsCovered(); return actionIf(LegalizeAction::Lower, Predicate, Mutation); } /// The instruction is lowered when type index 0 is any type in the given @@ -571,7 +600,7 @@ public: LegalizeRuleSet &libcallIf(LegalityPredicate Predicate) { // We have no choice but conservatively assume that a libcall with a // free-form user provided Predicate properly handles all type indices: - markAllTypeIdxsAsCovered(); + markAllIdxsAsCovered(); return actionIf(LegalizeAction::Libcall, Predicate); } LegalizeRuleSet &libcallFor(std::initializer_list Types) { @@ -597,7 +626,7 @@ public: LegalizeMutation Mutation) { // We have no choice but conservatively assume that an action with a // free-form user provided Predicate properly handles all type indices: - markAllTypeIdxsAsCovered(); + markAllIdxsAsCovered(); return actionIf(LegalizeAction::WidenScalar, Predicate, Mutation); } /// Narrow the scalar to the one selected by the mutation if the predicate is @@ -606,7 +635,7 @@ public: LegalizeMutation Mutation) { // We have no choice but conservatively assume that an action with a // free-form user provided Predicate properly handles all type indices: - markAllTypeIdxsAsCovered(); + markAllIdxsAsCovered(); return actionIf(LegalizeAction::NarrowScalar, Predicate, Mutation); } @@ -616,7 +645,7 @@ public: LegalizeMutation Mutation) { // We have no choice but conservatively assume that an action with a // free-form user provided Predicate properly handles all type indices: - markAllTypeIdxsAsCovered(); + markAllIdxsAsCovered(); return actionIf(LegalizeAction::MoreElements, Predicate, Mutation); } /// Remove elements to reach the type selected by the mutation if the @@ -625,7 +654,7 @@ public: LegalizeMutation Mutation) { // We have no choice but conservatively assume that an action with a // free-form user provided Predicate properly handles all type indices: - markAllTypeIdxsAsCovered(); + markAllIdxsAsCovered(); return actionIf(LegalizeAction::FewerElements, Predicate, Mutation); } @@ -640,11 +669,15 @@ public: return actionIf(LegalizeAction::Unsupported, LegalityPredicates::memSizeInBytesNotPow2(0)); } + LegalizeRuleSet &lowerIfMemSizeNotPow2() { + return actionIf(LegalizeAction::Lower, + LegalityPredicates::memSizeInBytesNotPow2(0)); + } LegalizeRuleSet &customIf(LegalityPredicate Predicate) { // We have no choice but conservatively assume that a custom action with a // free-form user provided Predicate properly handles all type indices: - markAllTypeIdxsAsCovered(); + markAllIdxsAsCovered(); return actionIf(LegalizeAction::Custom, Predicate); } LegalizeRuleSet &customFor(std::initializer_list Types) { @@ -882,6 +915,10 @@ public: /// LegalizeRuleSet in any way at all. /// \pre Type indices of the opcode form a dense [0, \p NumTypeIdxs) set. bool verifyTypeIdxsCoverage(unsigned NumTypeIdxs) const; + /// Check if there is no imm index which is obviously not handled by the + /// LegalizeRuleSet in any way at all. + /// \pre Type indices of the opcode form a dense [0, \p NumTypeIdxs) set. + bool verifyImmIdxsCoverage(unsigned NumImmIdxs) const; /// Apply the ruleset to the given LegalityQuery. 
LegalizeActionStep apply(const LegalityQuery &Query) const; diff --git a/include/llvm/CodeGen/GlobalISel/MIPatternMatch.h b/include/llvm/CodeGen/GlobalISel/MIPatternMatch.h index 13eddd9539fa..be12341f5763 100644 --- a/include/llvm/CodeGen/GlobalISel/MIPatternMatch.h +++ b/include/llvm/CodeGen/GlobalISel/MIPatternMatch.h @@ -21,7 +21,7 @@ namespace llvm { namespace MIPatternMatch { template -bool mi_match(Reg R, MachineRegisterInfo &MRI, Pattern &&P) { +bool mi_match(Reg R, const MachineRegisterInfo &MRI, Pattern &&P) { return P.match(MRI, R); } @@ -30,7 +30,7 @@ template struct OneUse_match { SubPatternT SubPat; OneUse_match(const SubPatternT &SP) : SubPat(SP) {} - bool match(MachineRegisterInfo &MRI, unsigned Reg) { + bool match(const MachineRegisterInfo &MRI, unsigned Reg) { return MRI.hasOneUse(Reg) && SubPat.match(MRI, Reg); } }; @@ -71,7 +71,7 @@ inline operand_type_match m_Reg() { return operand_type_match(); } /// Matching combinators. template struct And { template - bool match(MachineRegisterInfo &MRI, MatchSrc &&src) { + bool match(const MachineRegisterInfo &MRI, MatchSrc &&src) { return true; } }; @@ -83,14 +83,14 @@ struct And : And { : And(std::forward(preds)...), P(std::forward(p)) { } template - bool match(MachineRegisterInfo &MRI, MatchSrc &&src) { + bool match(const MachineRegisterInfo &MRI, MatchSrc &&src) { return P.match(MRI, src) && And::match(MRI, src); } }; template struct Or { template - bool match(MachineRegisterInfo &MRI, MatchSrc &&src) { + bool match(const MachineRegisterInfo &MRI, MatchSrc &&src) { return false; } }; @@ -101,7 +101,7 @@ struct Or : Or { Or(Pred &&p, Preds &&... preds) : Or(std::forward(preds)...), P(std::forward(p)) {} template - bool match(MachineRegisterInfo &MRI, MatchSrc &&src) { + bool match(const MachineRegisterInfo &MRI, MatchSrc &&src) { return P.match(MRI, src) || Or::match(MRI, src); } }; @@ -175,7 +175,8 @@ struct BinaryOp_match { RHS_P R; BinaryOp_match(const LHS_P &LHS, const RHS_P &RHS) : L(LHS), R(RHS) {} - template bool match(MachineRegisterInfo &MRI, OpTy &&Op) { + template + bool match(const MachineRegisterInfo &MRI, OpTy &&Op) { MachineInstr *TmpMI; if (mi_match(Op, MRI, m_MInstr(TmpMI))) { if (TmpMI->getOpcode() == Opcode && TmpMI->getNumOperands() == 3) { @@ -242,7 +243,8 @@ template struct UnaryOp_match { SrcTy L; UnaryOp_match(const SrcTy &LHS) : L(LHS) {} - template bool match(MachineRegisterInfo &MRI, OpTy &&Op) { + template + bool match(const MachineRegisterInfo &MRI, OpTy &&Op) { MachineInstr *TmpMI; if (mi_match(Op, MRI, m_MInstr(TmpMI))) { if (TmpMI->getOpcode() == Opcode && TmpMI->getNumOperands() == 2) { @@ -323,7 +325,7 @@ struct CheckType { LLT Ty; CheckType(const LLT &Ty) : Ty(Ty) {} - bool match(MachineRegisterInfo &MRI, unsigned Reg) { + bool match(const MachineRegisterInfo &MRI, unsigned Reg) { return MRI.getType(Reg) == Ty; } }; diff --git a/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h b/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h index 10d712176b1b..416f9c19f794 100644 --- a/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h +++ b/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h @@ -122,14 +122,22 @@ class SrcOp { MachineInstrBuilder SrcMIB; Register Reg; CmpInst::Predicate Pred; + int64_t Imm; }; public: - enum class SrcType { Ty_Reg, Ty_MIB, Ty_Predicate }; + enum class SrcType { Ty_Reg, Ty_MIB, Ty_Predicate, Ty_Imm }; SrcOp(Register R) : Reg(R), Ty(SrcType::Ty_Reg) {} SrcOp(const MachineOperand &Op) : Reg(Op.getReg()), Ty(SrcType::Ty_Reg) {} SrcOp(const MachineInstrBuilder &MIB) : 
SrcMIB(MIB), Ty(SrcType::Ty_MIB) {} SrcOp(const CmpInst::Predicate P) : Pred(P), Ty(SrcType::Ty_Predicate) {} + /// Use of registers held in unsigned integer variables (or more rarely signed + /// integers) is no longer permitted to avoid ambiguity with upcoming support + /// for immediates. + SrcOp(unsigned) = delete; + SrcOp(int) = delete; + SrcOp(uint64_t V) : Imm(V), Ty(SrcType::Ty_Imm) {} + SrcOp(int64_t V) : Imm(V), Ty(SrcType::Ty_Imm) {} void addSrcToMIB(MachineInstrBuilder &MIB) const { switch (Ty) { @@ -142,12 +150,16 @@ public: case SrcType::Ty_MIB: MIB.addUse(SrcMIB->getOperand(0).getReg()); break; + case SrcType::Ty_Imm: + MIB.addImm(Imm); + break; } } LLT getLLTTy(const MachineRegisterInfo &MRI) const { switch (Ty) { case SrcType::Ty_Predicate: + case SrcType::Ty_Imm: llvm_unreachable("Not a register operand"); case SrcType::Ty_Reg: return MRI.getType(Reg); @@ -160,6 +172,7 @@ public: Register getReg() const { switch (Ty) { case SrcType::Ty_Predicate: + case SrcType::Ty_Imm: llvm_unreachable("Not a register operand"); case SrcType::Ty_Reg: return Reg; @@ -178,6 +191,15 @@ public: } } + int64_t getImm() const { + switch (Ty) { + case SrcType::Ty_Imm: + return Imm; + default: + llvm_unreachable("Not an immediate"); + } + } + SrcType getSrcOpKind() const { return Ty; } private: @@ -348,6 +370,17 @@ public: /// given. Convert "llvm.dbg.label Label" to "DBG_LABEL Label". MachineInstrBuilder buildDbgLabel(const MDNode *Label); + /// Build and insert \p Res = G_DYN_STACKALLOC \p Size, \p Align + /// + /// G_DYN_STACKALLOC does a dynamic stack allocation and writes the address of + /// the allocated memory into \p Res. + /// \pre setBasicBlock or setMI must have been called. + /// \pre \p Res must be a generic virtual register with pointer type. + /// + /// \return a MachineInstrBuilder for the newly created instruction. + MachineInstrBuilder buildDynStackAlloc(const DstOp &Res, const SrcOp &Size, + unsigned Align); + /// Build and insert \p Res = G_FRAME_INDEX \p Idx /// /// G_FRAME_INDEX materializes the address of an alloca value or other @@ -489,11 +522,21 @@ public: return buildInstr(TargetOpcode::G_PTRTOINT, {Dst}, {Src}); } + /// Build and insert a G_INTTOPTR instruction. + MachineInstrBuilder buildIntToPtr(const DstOp &Dst, const SrcOp &Src) { + return buildInstr(TargetOpcode::G_INTTOPTR, {Dst}, {Src}); + } + /// Build and insert \p Dst = G_BITCAST \p Src MachineInstrBuilder buildBitcast(const DstOp &Dst, const SrcOp &Src) { return buildInstr(TargetOpcode::G_BITCAST, {Dst}, {Src}); } + /// Build and insert \p Dst = G_ADDRSPACE_CAST \p Src + MachineInstrBuilder buildAddrSpaceCast(const DstOp &Dst, const SrcOp &Src) { + return buildInstr(TargetOpcode::G_ADDRSPACE_CAST, {Dst}, {Src}); + } + /// \return The opcode of the extension the target wants to use for boolean /// values. unsigned getBoolExtOp(bool IsVec, bool IsFP) const; @@ -867,7 +910,8 @@ public: /// /// \return a MachineInstrBuilder for the newly created instruction. MachineInstrBuilder buildFCmp(CmpInst::Predicate Pred, const DstOp &Res, - const SrcOp &Op0, const SrcOp &Op1); + const SrcOp &Op0, const SrcOp &Op1, + Optional Flags = None); /// Build and insert a \p Res = G_SELECT \p Tst, \p Op0, \p Op1 /// @@ -880,7 +924,8 @@ public: /// /// \return a MachineInstrBuilder for the newly created instruction. 
MachineInstrBuilder buildSelect(const DstOp &Res, const SrcOp &Tst, - const SrcOp &Op0, const SrcOp &Op1); + const SrcOp &Op0, const SrcOp &Op1, + Optional Flags = None); /// Build and insert \p Res = G_INSERT_VECTOR_ELT \p Val, /// \p Elt, \p Idx @@ -961,8 +1006,8 @@ public: /// same type. /// /// \return a MachineInstrBuilder for the newly created instruction. - MachineInstrBuilder buildAtomicRMW(unsigned Opcode, Register OldValRes, - Register Addr, Register Val, + MachineInstrBuilder buildAtomicRMW(unsigned Opcode, const DstOp &OldValRes, + const SrcOp &Addr, const SrcOp &Val, MachineMemOperand &MMO); /// Build and insert `OldValRes = G_ATOMICRMW_XCHG Addr, Val, MMO`. @@ -1135,6 +1180,16 @@ public: MachineInstrBuilder buildAtomicRMWUmin(Register OldValRes, Register Addr, Register Val, MachineMemOperand &MMO); + /// Build and insert `OldValRes = G_ATOMICRMW_FADD Addr, Val, MMO`. + MachineInstrBuilder buildAtomicRMWFAdd( + const DstOp &OldValRes, const SrcOp &Addr, const SrcOp &Val, + MachineMemOperand &MMO); + + /// Build and insert `OldValRes = G_ATOMICRMW_FSUB Addr, Val, MMO`. + MachineInstrBuilder buildAtomicRMWFSub( + const DstOp &OldValRes, const SrcOp &Addr, const SrcOp &Val, + MachineMemOperand &MMO); + /// Build and insert `G_FENCE Ordering, Scope`. MachineInstrBuilder buildFence(unsigned Ordering, unsigned Scope); @@ -1210,6 +1265,12 @@ public: return buildInstr(TargetOpcode::G_SMULH, {Dst}, {Src0, Src1}, Flags); } + MachineInstrBuilder buildFMul(const DstOp &Dst, const SrcOp &Src0, + const SrcOp &Src1, + Optional Flags = None) { + return buildInstr(TargetOpcode::G_FMUL, {Dst}, {Src0, Src1}, Flags); + } + MachineInstrBuilder buildShl(const DstOp &Dst, const SrcOp &Src0, const SrcOp &Src1, Optional Flags = None) { @@ -1300,8 +1361,9 @@ public: /// Build and insert \p Res = G_FADD \p Op0, \p Op1 MachineInstrBuilder buildFAdd(const DstOp &Dst, const SrcOp &Src0, - const SrcOp &Src1) { - return buildInstr(TargetOpcode::G_FADD, {Dst}, {Src0, Src1}); + const SrcOp &Src1, + Optional Flags = None) { + return buildInstr(TargetOpcode::G_FADD, {Dst}, {Src0, Src1}, Flags); } /// Build and insert \p Res = G_FSUB \p Op0, \p Op1 @@ -1316,14 +1378,23 @@ public: return buildInstr(TargetOpcode::G_FMA, {Dst}, {Src0, Src1, Src2}); } + /// Build and insert \p Res = G_FMAD \p Op0, \p Op1, \p Op2 + MachineInstrBuilder buildFMAD(const DstOp &Dst, const SrcOp &Src0, + const SrcOp &Src1, const SrcOp &Src2, + Optional Flags = None) { + return buildInstr(TargetOpcode::G_FMAD, {Dst}, {Src0, Src1, Src2}, Flags); + } + /// Build and insert \p Res = G_FNEG \p Op0 - MachineInstrBuilder buildFNeg(const DstOp &Dst, const SrcOp &Src0) { - return buildInstr(TargetOpcode::G_FNEG, {Dst}, {Src0}); + MachineInstrBuilder buildFNeg(const DstOp &Dst, const SrcOp &Src0, + Optional Flags = None) { + return buildInstr(TargetOpcode::G_FNEG, {Dst}, {Src0}, Flags); } /// Build and insert \p Res = G_FABS \p Op0 - MachineInstrBuilder buildFAbs(const DstOp &Dst, const SrcOp &Src0) { - return buildInstr(TargetOpcode::G_FABS, {Dst}, {Src0}); + MachineInstrBuilder buildFAbs(const DstOp &Dst, const SrcOp &Src0, + Optional Flags = None) { + return buildInstr(TargetOpcode::G_FABS, {Dst}, {Src0}, Flags); } /// Build and insert \p Dst = G_FCANONICALIZE \p Src0 diff --git a/include/llvm/CodeGen/GlobalISel/Utils.h b/include/llvm/CodeGen/GlobalISel/Utils.h index 4cdaa48fb689..8af2853473c2 100644 --- a/include/llvm/CodeGen/GlobalISel/Utils.h +++ b/include/llvm/CodeGen/GlobalISel/Utils.h @@ -16,6 +16,8 @@ #include "llvm/ADT/StringRef.h" 
#include "llvm/CodeGen/Register.h" +#include "llvm/Support/LowLevelTypeImpl.h" +#include "llvm/Support/MachineValueType.h" namespace llvm { @@ -117,14 +119,16 @@ struct ValueAndVReg { unsigned VReg; }; /// If \p VReg is defined by a statically evaluable chain of -/// instructions rooted on a G_CONSTANT (\p LookThroughInstrs == true) -/// and that constant fits in int64_t, returns its value as well as -/// the virtual register defined by this G_CONSTANT. -/// When \p LookThroughInstrs == false, this function behaves like +/// instructions rooted on a G_F/CONSTANT (\p LookThroughInstrs == true) +/// and that constant fits in int64_t, returns its value as well as the +/// virtual register defined by this G_F/CONSTANT. +/// When \p LookThroughInstrs == false this function behaves like /// getConstantVRegVal. +/// When \p HandleFConstants == false the function bails on G_FCONSTANTs. Optional getConstantVRegValWithLookThrough(unsigned VReg, const MachineRegisterInfo &MRI, - bool LookThroughInstrs = true); + bool LookThroughInstrs = true, + bool HandleFConstants = true); const ConstantFP* getConstantFPVRegVal(unsigned VReg, const MachineRegisterInfo &MRI); @@ -151,6 +155,9 @@ Optional ConstantFoldBinOp(unsigned Opcode, const unsigned Op1, const unsigned Op2, const MachineRegisterInfo &MRI); +Optional ConstantFoldExtOp(unsigned Opcode, const unsigned Op1, + uint64_t Imm, const MachineRegisterInfo &MRI); + /// Returns true if \p Val can be assumed to never be a NaN. If \p SNaN is true, /// this returns if \p Val can be assumed to never be a signaling NaN. bool isKnownNeverNaN(Register Val, const MachineRegisterInfo &MRI, @@ -161,5 +168,10 @@ inline bool isKnownNeverSNaN(Register Val, const MachineRegisterInfo &MRI) { return isKnownNeverNaN(Val, MRI, true); } +/// Get a rough equivalent of an MVT for a given LLT. +MVT getMVTForLLT(LLT Ty); +/// Get a rough equivalent of an LLT for a given MVT. +LLT getLLTForMVT(MVT Ty); + } // End namespace llvm. #endif diff --git a/include/llvm/CodeGen/ISDOpcodes.h b/include/llvm/CodeGen/ISDOpcodes.h index acf27dcc5fab..658ad31fa2a6 100644 --- a/include/llvm/CodeGen/ISDOpcodes.h +++ b/include/llvm/CodeGen/ISDOpcodes.h @@ -281,7 +281,7 @@ namespace ISD { /// Same as the corresponding unsaturated fixed point instructions, but the /// result is clamped between the min and max values representable by the /// bits of the first 2 operands. - SMULFIXSAT, + SMULFIXSAT, UMULFIXSAT, /// Simple binary floating point operators. FADD, FSUB, FMUL, FDIV, FREM, @@ -301,6 +301,14 @@ namespace ISD { STRICT_FEXP, STRICT_FEXP2, STRICT_FLOG, STRICT_FLOG10, STRICT_FLOG2, STRICT_FRINT, STRICT_FNEARBYINT, STRICT_FMAXNUM, STRICT_FMINNUM, STRICT_FCEIL, STRICT_FFLOOR, STRICT_FROUND, STRICT_FTRUNC, + STRICT_LROUND, STRICT_LLROUND, STRICT_LRINT, STRICT_LLRINT, + + /// STRICT_FP_TO_[US]INT - Convert a floating point value to a signed or + /// unsigned integer. These have the same semantics as fptosi and fptoui + /// in IR. + /// They are used to limit optimizations while the DAG is being optimized. + STRICT_FP_TO_SINT, + STRICT_FP_TO_UINT, /// X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating /// point type down to the precision of the destination VT. TRUNC is a @@ -398,6 +406,13 @@ namespace ISD { /// than the vector element type, and is implicitly truncated to it. SCALAR_TO_VECTOR, + /// SPLAT_VECTOR(VAL) - Returns a vector with the scalar value VAL + /// duplicated in all lanes. 
The type of the operand must match the vector + /// element type, except when they are integer types. In this case the + /// operand is allowed to be wider than the vector element type, and is + /// implicitly truncated to it. + SPLAT_VECTOR, + /// MULHU/MULHS - Multiply high - Multiply two integers of type iN, /// producing an unsigned/signed value of type i[2*N], then return the top /// part. @@ -569,13 +584,6 @@ namespace ISD { /// 3 Round to -inf FLT_ROUNDS_, - /// X = FP_ROUND_INREG(Y, VT) - This operator takes an FP register, and - /// rounds it to a floating point value. It then promotes it and returns it - /// in a register of the same size. This operation effectively just - /// discards excess precision. The type to round down to is specified by - /// the VT operand, a VTSDNode. - FP_ROUND_INREG, - /// X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type. FP_EXTEND, @@ -957,6 +965,23 @@ namespace ISD { static const int LAST_INDEXED_MODE = POST_DEC + 1; + //===--------------------------------------------------------------------===// + /// MemIndexType enum - This enum defines how to interpret MGATHER/SCATTER's + /// index parameter when calculating addresses. + /// + /// SIGNED_SCALED Addr = Base + ((signed)Index * sizeof(element)) + /// SIGNED_UNSCALED Addr = Base + (signed)Index + /// UNSIGNED_SCALED Addr = Base + ((unsigned)Index * sizeof(element)) + /// UNSIGNED_UNSCALED Addr = Base + (unsigned)Index + enum MemIndexType { + SIGNED_SCALED = 0, + SIGNED_UNSCALED, + UNSIGNED_SCALED, + UNSIGNED_UNSCALED + }; + + static const int LAST_MEM_INDEX_TYPE = UNSIGNED_UNSCALED + 1; + //===--------------------------------------------------------------------===// /// LoadExtType enum - This enum defines the three variants of LOADEXT /// (load with extension). diff --git a/include/llvm/CodeGen/LiveInterval.h b/include/llvm/CodeGen/LiveInterval.h index 8bb88165d3e1..290a2381d9c9 100644 --- a/include/llvm/CodeGen/LiveInterval.h +++ b/include/llvm/CodeGen/LiveInterval.h @@ -189,6 +189,10 @@ namespace llvm { return start == Other.start && end == Other.end; } + bool operator!=(const Segment &Other) const { + return !(*this == Other); + } + void dump() const; }; @@ -224,7 +228,7 @@ namespace llvm { /// Constructs a new LiveRange object. LiveRange(bool UseSegmentSet = false) - : segmentSet(UseSegmentSet ? llvm::make_unique() + : segmentSet(UseSegmentSet ? std::make_unique() : nullptr) {} /// Constructs a new LiveRange object by copying segments and valnos from diff --git a/include/llvm/CodeGen/LiveIntervals.h b/include/llvm/CodeGen/LiveIntervals.h index 588b0f9cf39c..888d72b87bd1 100644 --- a/include/llvm/CodeGen/LiveIntervals.h +++ b/include/llvm/CodeGen/LiveIntervals.h @@ -111,30 +111,31 @@ class VirtRegMap; const MachineBlockFrequencyInfo *MBFI, const MachineBasicBlock *MBB); - LiveInterval &getInterval(unsigned Reg) { + LiveInterval &getInterval(Register Reg) { if (hasInterval(Reg)) - return *VirtRegIntervals[Reg]; + return *VirtRegIntervals[Reg.id()]; else return createAndComputeVirtRegInterval(Reg); } - const LiveInterval &getInterval(unsigned Reg) const { + const LiveInterval &getInterval(Register Reg) const { return const_cast(this)->getInterval(Reg); } - bool hasInterval(unsigned Reg) const { - return VirtRegIntervals.inBounds(Reg) && VirtRegIntervals[Reg]; + bool hasInterval(Register Reg) const { + return VirtRegIntervals.inBounds(Reg.id()) && + VirtRegIntervals[Reg.id()]; } /// Interval creation. 
- LiveInterval &createEmptyInterval(unsigned Reg) { + LiveInterval &createEmptyInterval(Register Reg) { assert(!hasInterval(Reg) && "Interval already exists!"); - VirtRegIntervals.grow(Reg); - VirtRegIntervals[Reg] = createInterval(Reg); - return *VirtRegIntervals[Reg]; + VirtRegIntervals.grow(Reg.id()); + VirtRegIntervals[Reg.id()] = createInterval(Reg); + return *VirtRegIntervals[Reg.id()]; } - LiveInterval &createAndComputeVirtRegInterval(unsigned Reg) { + LiveInterval &createAndComputeVirtRegInterval(Register Reg) { LiveInterval &LI = createEmptyInterval(Reg); computeVirtRegInterval(LI); return LI; diff --git a/include/llvm/CodeGen/LiveRangeCalc.h b/include/llvm/CodeGen/LiveRangeCalc.h new file mode 100644 index 000000000000..08026c05733c --- /dev/null +++ b/include/llvm/CodeGen/LiveRangeCalc.h @@ -0,0 +1,295 @@ +//===- LiveRangeCalc.h - Calculate live ranges ------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// The LiveRangeCalc class can be used to compute live ranges from scratch. It +// caches information about values in the CFG to speed up repeated operations +// on the same live range. The cache can be shared by non-overlapping live +// ranges. SplitKit uses that when computing the live range of split products. +// +// A low-level interface is available to clients that know where a variable is +// live, but don't know which value it has at every point. LiveRangeCalc will +// propagate values down the dominator tree, and even insert PHI-defs where +// needed. SplitKit uses this faster interface when possible. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_CODEGEN_LIVERANGECALC_H +#define LLVM_LIB_CODEGEN_LIVERANGECALC_H + +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/BitVector.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/IndexedMap.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/CodeGen/LiveInterval.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/SlotIndexes.h" +#include "llvm/MC/LaneBitmask.h" +#include <utility> + +namespace llvm { + +template <class NodeT> class DomTreeNodeBase; +class MachineDominatorTree; +class MachineFunction; +class MachineRegisterInfo; + +using MachineDomTreeNode = DomTreeNodeBase<MachineBasicBlock>; + +class LiveRangeCalc { + const MachineFunction *MF = nullptr; + const MachineRegisterInfo *MRI = nullptr; + SlotIndexes *Indexes = nullptr; + MachineDominatorTree *DomTree = nullptr; + VNInfo::Allocator *Alloc = nullptr; + + /// LiveOutPair - A value and the block that defined it. The domtree node is + /// redundant, it can be computed as: MDT[Indexes.getMBBFromIndex(VNI->def)]. + using LiveOutPair = std::pair<VNInfo *, MachineDomTreeNode *>; + + /// LiveOutMap - Map basic blocks to the value leaving the block. + using LiveOutMap = IndexedMap<LiveOutPair, MBB2NumberFunctor>; + + /// Bit vector of active entries in LiveOut, also used as a visited set by + /// findReachingDefs. One entry per basic block, indexed by block number. + /// This is kept as a separate bit vector because it can be cleared quickly + /// when switching live ranges. + BitVector Seen; + + /// Map LiveRange to sets of blocks (represented by bit vectors) that + /// in the live range are defined on entry and undefined on entry.
+ /// A block is defined on entry if there is a path from at least one of + /// the defs in the live range to the entry of the block, and conversely, + /// a block is undefined on entry, if there is no such path (i.e. no + /// definition reaches the entry of the block). A single LiveRangeCalc + /// object is used to track live-out information for multiple registers + /// in live range splitting (which is ok, since the live ranges of these + /// registers do not overlap), but the defined/undefined information must + /// be kept separate for each individual range. + /// By convention, EntryInfoMap[&LR] = { Defined, Undefined }. + using EntryInfoMap = DenseMap<LiveRange *, std::pair<BitVector, BitVector>>; + EntryInfoMap EntryInfos; + + /// Map each basic block where a live range is live out to the live-out value + /// and its defining block. + /// + /// For every basic block, MBB, one of these conditions shall be true: + /// + /// 1. !Seen.count(MBB->getNumber()) + /// Blocks without a Seen bit are ignored. + /// 2. LiveOut[MBB].second.getNode() == MBB + /// The live-out value is defined in MBB. + /// 3. forall P in preds(MBB): LiveOut[P] == LiveOut[MBB] + /// The live-out value passes through MBB. All predecessors must carry + /// the same value. + /// + /// The domtree node may be null, it can be computed. + /// + /// The map can be shared by multiple live ranges as long as no two are + /// live-out of the same block. + LiveOutMap Map; + + /// LiveInBlock - Information about a basic block where a live range is known + /// to be live-in, but the value has not yet been determined. + struct LiveInBlock { + // The live range set that is live-in to this block. The algorithms can + // handle multiple non-overlapping live ranges simultaneously. + LiveRange &LR; + + // DomNode - Dominator tree node for the block. + // Cleared when the final value has been determined and LI has been updated. + MachineDomTreeNode *DomNode; + + // Position in block where the live-in range ends, or SlotIndex() if the + // range passes through the block. When the final value has been + // determined, the range from the block start to Kill will be added to LI. + SlotIndex Kill; + + // Live-in value filled in by updateSSA once it is known. + VNInfo *Value = nullptr; + + LiveInBlock(LiveRange &LR, MachineDomTreeNode *node, SlotIndex kill) + : LR(LR), DomNode(node), Kill(kill) {} + }; + + /// LiveIn - Work list of blocks where the live-in value has yet to be + /// determined. This list is typically computed by findReachingDefs() and + /// used as a work list by updateSSA(). The low-level interface may also be + /// used to add entries directly. + SmallVector<LiveInBlock, 16> LiveIn; + + /// Check if the entry to block @p MBB can be reached by any of the defs + /// in @p LR. Return true if none of the defs reach the entry to @p MBB. + bool isDefOnEntry(LiveRange &LR, ArrayRef<SlotIndex> Undefs, + MachineBasicBlock &MBB, BitVector &DefOnEntry, + BitVector &UndefOnEntry); + + /// Find the set of defs that can reach @p Kill. @p Kill must belong to + /// @p UseMBB. + /// + /// If exactly one def can reach @p UseMBB, and the def dominates @p Kill, + /// all paths from the def to @p UseMBB are added to @p LR, and the function + /// returns true. + /// + /// If multiple values can reach @p UseMBB, the blocks that need @p LR to be + /// live in are added to the LiveIn array, and the function returns false. + /// + /// The array @p Undef provides the locations where the range @p LR becomes + /// undefined by operands on other subranges.
If @p Undef + /// is non-empty and @p Kill is jointly dominated only by the entries of + /// @p Undef, the function returns false. + /// + /// PhysReg, when set, is used to verify live-in lists on basic blocks. + bool findReachingDefs(LiveRange &LR, MachineBasicBlock &UseMBB, SlotIndex Use, + unsigned PhysReg, ArrayRef Undefs); + + /// updateSSA - Compute the values that will be live in to all requested + /// blocks in LiveIn. Create PHI-def values as required to preserve SSA form. + /// + /// Every live-in block must be jointly dominated by the added live-out + /// blocks. No values are read from the live ranges. + void updateSSA(); + + /// Transfer information from the LiveIn vector to the live ranges and update + /// the given @p LiveOuts. + void updateFromLiveIns(); + + /// Extend the live range of @p LR to reach all uses of Reg. + /// + /// If @p LR is a main range, or if @p LI is null, then all uses must be + /// jointly dominated by the definitions from @p LR. If @p LR is a subrange + /// of the live interval @p LI, corresponding to lane mask @p LaneMask, + /// all uses must be jointly dominated by the definitions from @p LR + /// together with definitions of other lanes where @p LR becomes undefined + /// (via operands). + /// If @p LR is a main range, the @p LaneMask should be set to ~0, i.e. + /// LaneBitmask::getAll(). + void extendToUses(LiveRange &LR, unsigned Reg, LaneBitmask LaneMask, + LiveInterval *LI = nullptr); + + /// Reset Map and Seen fields. + void resetLiveOutMap(); + +public: + LiveRangeCalc() = default; + + //===--------------------------------------------------------------------===// + // High-level interface. + //===--------------------------------------------------------------------===// + // + // Calculate live ranges from scratch. + // + + /// reset - Prepare caches for a new set of non-overlapping live ranges. The + /// caches must be reset before attempting calculations with a live range + /// that may overlap a previously computed live range, and before the first + /// live range in a function. If live ranges are not known to be + /// non-overlapping, call reset before each. + void reset(const MachineFunction *mf, SlotIndexes *SI, + MachineDominatorTree *MDT, VNInfo::Allocator *VNIA); + + //===--------------------------------------------------------------------===// + // Mid-level interface. + //===--------------------------------------------------------------------===// + // + // Modify existing live ranges. + // + + /// Extend the live range of @p LR to reach @p Use. + /// + /// The existing values in @p LR must be live so they jointly dominate @p Use. + /// If @p Use is not dominated by a single existing value, PHI-defs are + /// inserted as required to preserve SSA form. + /// + /// PhysReg, when set, is used to verify live-in lists on basic blocks. + void extend(LiveRange &LR, SlotIndex Use, unsigned PhysReg, + ArrayRef Undefs); + + /// createDeadDefs - Create a dead def in LI for every def operand of Reg. + /// Each instruction defining Reg gets a new VNInfo with a corresponding + /// minimal live range. + void createDeadDefs(LiveRange &LR, unsigned Reg); + + /// Extend the live range of @p LR to reach all uses of Reg. + /// + /// All uses must be jointly dominated by existing liveness. PHI-defs are + /// inserted as needed to preserve SSA form. + void extendToUses(LiveRange &LR, unsigned PhysReg) { + extendToUses(LR, PhysReg, LaneBitmask::getAll()); + } + + /// Calculates liveness for the register specified in live interval @p LI. 
+ /// Creates subregister live ranges as needed if subreg liveness tracking is + /// enabled. + void calculate(LiveInterval &LI, bool TrackSubRegs); + + /// For live interval \p LI with correct SubRanges construct matching + /// information for the main live range. Expects the main live range to not + /// have any segments or value numbers. + void constructMainRangeFromSubranges(LiveInterval &LI); + + //===--------------------------------------------------------------------===// + // Low-level interface. + //===--------------------------------------------------------------------===// + // + // These functions can be used to compute live ranges where the live-in and + // live-out blocks are already known, but the SSA value in each block is + // unknown. + // + // After calling reset(), add known live-out values and known live-in blocks. + // Then call calculateValues() to compute the actual value that is + // live-in to each block, and add liveness to the live ranges. + // + + /// setLiveOutValue - Indicate that VNI is live out from MBB. The + /// calculateValues() function will not add liveness for MBB, the caller + /// should take care of that. + /// + /// VNI may be null only if MBB is a live-through block also passed to + /// addLiveInBlock(). + void setLiveOutValue(MachineBasicBlock *MBB, VNInfo *VNI) { + Seen.set(MBB->getNumber()); + Map[MBB] = LiveOutPair(VNI, nullptr); + } + + /// addLiveInBlock - Add a block with an unknown live-in value. This + /// function can only be called once per basic block. Once the live-in value + /// has been determined, calculateValues() will add liveness to LI. + /// + /// @param LR The live range that is live-in to the block. + /// @param DomNode The domtree node for the block. + /// @param Kill Index in block where LI is killed. If the value is + /// live-through, set Kill = SLotIndex() and also call + /// setLiveOutValue(MBB, 0). + void addLiveInBlock(LiveRange &LR, MachineDomTreeNode *DomNode, + SlotIndex Kill = SlotIndex()) { + LiveIn.push_back(LiveInBlock(LR, DomNode, Kill)); + } + + /// calculateValues - Calculate the value that will be live-in to each block + /// added with addLiveInBlock. Add PHI-def values as needed to preserve SSA + /// form. Add liveness to all live-in blocks up to the Kill point, or the + /// whole block for live-through blocks. + /// + /// Every predecessor of a live-in block must have been given a value with + /// setLiveOutValue, the value may be null for live-trough blocks. + void calculateValues(); + + /// A diagnostic function to check if the end of the block @p MBB is + /// jointly dominated by the blocks corresponding to the slot indices + /// in @p Defs. This function is mainly for use in self-verification + /// checks. + LLVM_ATTRIBUTE_UNUSED + static bool isJointlyDominated(const MachineBasicBlock *MBB, + ArrayRef Defs, + const SlotIndexes &Indexes); +}; + +} // end namespace llvm + +#endif // LLVM_LIB_CODEGEN_LIVERANGECALC_H diff --git a/include/llvm/CodeGen/LiveRegUnits.h b/include/llvm/CodeGen/LiveRegUnits.h index 7dbb2feab8bf..314afad92970 100644 --- a/include/llvm/CodeGen/LiveRegUnits.h +++ b/include/llvm/CodeGen/LiveRegUnits.h @@ -53,8 +53,8 @@ public: ModifiedRegUnits.addRegsInMask(O->getRegMask()); if (!O->isReg()) continue; - unsigned Reg = O->getReg(); - if (!TargetRegisterInfo::isPhysicalRegister(Reg)) + Register Reg = O->getReg(); + if (!Reg.isPhysical()) continue; if (O->isDef()) { // Some architectures (e.g. 
AArch64 XZR/WZR) have registers that are diff --git a/include/llvm/CodeGen/MIRYamlMapping.h b/include/llvm/CodeGen/MIRYamlMapping.h index 94e76a75e8da..069d0aa45095 100644 --- a/include/llvm/CodeGen/MIRYamlMapping.h +++ b/include/llvm/CodeGen/MIRYamlMapping.h @@ -314,6 +314,7 @@ struct ScalarEnumerationTraits { static void enumeration(yaml::IO &IO, TargetStackID::Value &ID) { IO.enumCase(ID, "default", TargetStackID::Default); IO.enumCase(ID, "sgpr-spill", TargetStackID::SGPRSpill); + IO.enumCase(ID, "sve-vec", TargetStackID::SVEVector); IO.enumCase(ID, "noalloc", TargetStackID::NoAlloc); } }; diff --git a/include/llvm/CodeGen/MachineBasicBlock.h b/include/llvm/CodeGen/MachineBasicBlock.h index 333d0a78618c..ccdde78a0b22 100644 --- a/include/llvm/CodeGen/MachineBasicBlock.h +++ b/include/llvm/CodeGen/MachineBasicBlock.h @@ -103,9 +103,9 @@ private: using LiveInVector = std::vector; LiveInVector LiveIns; - /// Alignment of the basic block. Zero if the basic block does not need to be - /// aligned. The alignment is specified as log2(bytes). - unsigned Alignment = 0; + /// Alignment of the basic block. One if the basic block does not need to be + /// aligned. + Align Alignment; /// Indicate that this basic block is entered via an exception handler. bool IsEHPad = false; @@ -312,7 +312,7 @@ public: /// Adds the specified register as a live in. Note that it is an error to add /// the same register to the same set more than once unless the intention is /// to call sortUniqueLiveIns after all registers are added. - void addLiveIn(MCPhysReg PhysReg, + void addLiveIn(MCRegister PhysReg, LaneBitmask LaneMask = LaneBitmask::getAll()) { LiveIns.push_back(RegisterMaskPair(PhysReg, LaneMask)); } @@ -331,7 +331,7 @@ public: /// Add PhysReg as live in to this block, and ensure that there is a copy of /// PhysReg to a virtual register of class RC. Return the virtual register /// that is a copy of the live in PhysReg. - unsigned addLiveIn(MCPhysReg PhysReg, const TargetRegisterClass *RC); + unsigned addLiveIn(MCRegister PhysReg, const TargetRegisterClass *RC); /// Remove the specified register from the live in set. void removeLiveIn(MCPhysReg Reg, @@ -372,13 +372,11 @@ public: /// \see getBeginClobberMask() const uint32_t *getEndClobberMask(const TargetRegisterInfo *TRI) const; - /// Return alignment of the basic block. The alignment is specified as - /// log2(bytes). - unsigned getAlignment() const { return Alignment; } + /// Return alignment of the basic block. + Align getAlignment() const { return Alignment; } - /// Set alignment of the basic block. The alignment is specified as - /// log2(bytes). - void setAlignment(unsigned Align) { Alignment = Align; } + /// Set alignment of the basic block. + void setAlignment(Align A) { Alignment = A; } /// Returns true if the block is a landing pad. That is this basic block is /// entered via an exception handler. @@ -636,6 +634,18 @@ public: return Insts.insertAfter(I.getInstrIterator(), MI); } + /// If I is bundled then insert MI into the instruction list after the end of + /// the bundle, otherwise insert MI immediately after I. + instr_iterator insertAfterBundle(instr_iterator I, MachineInstr *MI) { + assert((I == instr_end() || I->getParent() == this) && + "iterator points outside of basic block"); + assert(!MI->isBundledWithPred() && !MI->isBundledWithSucc() && + "Cannot insert instruction with bundle flags"); + while (I->isBundledWithSucc()) + ++I; + return Insts.insertAfter(I, MI); + } + /// Remove an instruction from the instruction list and delete it. 
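The MachineBasicBlock alignment above now carries an llvm::Align in bytes instead of a raw log2 value. A small sketch of the updated call; the 16-byte request is purely illustrative:

  #include "llvm/CodeGen/MachineBasicBlock.h"
  #include "llvm/Support/Alignment.h"
  #include <cassert>
  using namespace llvm;

  void alignBlock(MachineBasicBlock &MBB) {
    // Previously this took log2(bytes), e.g. setAlignment(4) for 16 bytes;
    // with llvm::Align the byte alignment is passed directly.
    MBB.setAlignment(Align(16));
    assert(MBB.getAlignment() == Align(16));
  }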
/// /// If the instruction is part of a bundle, the other instructions in the @@ -723,6 +733,10 @@ public: /// CFG so that it branches to 'New' instead. void ReplaceUsesOfBlockWith(MachineBasicBlock *Old, MachineBasicBlock *New); + /// Update all phi nodes in this basic block to refer to basic block \p New + /// instead of basic block \p Old. + void replacePhiUsesWith(MachineBasicBlock *Old, MachineBasicBlock *New); + /// Various pieces of code can cause excess edges in the CFG to be inserted. /// If we have proven that MBB can only branch to DestA and DestB, remove any /// other MBB successors from the CFG. DestA and DestB can be null. Besides diff --git a/include/llvm/CodeGen/MachineCombinerPattern.h b/include/llvm/CodeGen/MachineCombinerPattern.h index 4f4034baf801..503227222207 100644 --- a/include/llvm/CodeGen/MachineCombinerPattern.h +++ b/include/llvm/CodeGen/MachineCombinerPattern.h @@ -39,6 +39,10 @@ enum class MachineCombinerPattern { MULADDXI_OP1, MULSUBXI_OP1, // Floating Point + FMULADDH_OP1, + FMULADDH_OP2, + FMULSUBH_OP1, + FMULSUBH_OP2, FMULADDS_OP1, FMULADDS_OP2, FMULSUBS_OP1, @@ -47,16 +51,25 @@ enum class MachineCombinerPattern { FMULADDD_OP2, FMULSUBD_OP1, FMULSUBD_OP2, + FNMULSUBH_OP1, FNMULSUBS_OP1, FNMULSUBD_OP1, FMLAv1i32_indexed_OP1, FMLAv1i32_indexed_OP2, FMLAv1i64_indexed_OP1, FMLAv1i64_indexed_OP2, + FMLAv4f16_OP1, + FMLAv4f16_OP2, + FMLAv8f16_OP1, + FMLAv8f16_OP2, FMLAv2f32_OP2, FMLAv2f32_OP1, FMLAv2f64_OP1, FMLAv2f64_OP2, + FMLAv4i16_indexed_OP1, + FMLAv4i16_indexed_OP2, + FMLAv8i16_indexed_OP1, + FMLAv8i16_indexed_OP2, FMLAv2i32_indexed_OP1, FMLAv2i32_indexed_OP2, FMLAv2i64_indexed_OP1, @@ -67,10 +80,18 @@ enum class MachineCombinerPattern { FMLAv4i32_indexed_OP2, FMLSv1i32_indexed_OP2, FMLSv1i64_indexed_OP2, + FMLSv4f16_OP1, + FMLSv4f16_OP2, + FMLSv8f16_OP1, + FMLSv8f16_OP2, FMLSv2f32_OP1, FMLSv2f32_OP2, FMLSv2f64_OP1, FMLSv2f64_OP2, + FMLSv4i16_indexed_OP1, + FMLSv4i16_indexed_OP2, + FMLSv8i16_indexed_OP1, + FMLSv8i16_indexed_OP2, FMLSv2i32_indexed_OP1, FMLSv2i32_indexed_OP2, FMLSv2i64_indexed_OP1, diff --git a/include/llvm/CodeGen/MachineDominators.h b/include/llvm/CodeGen/MachineDominators.h index d2200080b897..e4d7a02f8c48 100644 --- a/include/llvm/CodeGen/MachineDominators.h +++ b/include/llvm/CodeGen/MachineDominators.h @@ -44,6 +44,8 @@ using MachineDomTreeNode = DomTreeNodeBase; /// compute a normal dominator tree. /// class MachineDominatorTree : public MachineFunctionPass { + using DomTreeT = DomTreeBase; + /// Helper structure used to hold all the basic blocks /// involved in the split of a critical edge. struct CriticalEdge { @@ -65,8 +67,8 @@ class MachineDominatorTree : public MachineFunctionPass { /// such as BB == elt.NewBB. mutable SmallSet NewBBs; - /// The DominatorTreeBase that is used to compute a normal dominator tree - std::unique_ptr> DT; + /// The DominatorTreeBase that is used to compute a normal dominator tree. + std::unique_ptr DT; /// Apply all the recorded critical edges to the DT. /// This updates the underlying DT information in a way that uses @@ -80,8 +82,8 @@ public: MachineDominatorTree(); - DomTreeBase &getBase() { - if (!DT) DT.reset(new DomTreeBase()); + DomTreeT &getBase() { + if (!DT) DT.reset(new DomTreeT()); applySplitCriticalEdges(); return *DT; } @@ -92,31 +94,30 @@ public: /// multiple blocks if we are computing post dominators. For forward /// dominators, this will always be a single block (the entry node). 
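MachineDominatorTree gains a DomTreeT alias here; the query interface, continued below, is unchanged apart from dropping redundant inline specifiers. For context, a typical dominance check, purely illustrative:

  #include "llvm/CodeGen/MachineDominators.h"
  using namespace llvm;

  // True when every path from the entry block to MBB passes through Header.
  bool isDominatedByHeader(const MachineDominatorTree &MDT,
                           MachineBasicBlock *Header, MachineBasicBlock *MBB) {
    return MDT.dominates(Header, MBB);
  }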
/// - inline const SmallVectorImpl &getRoots() const { + const SmallVectorImpl &getRoots() const { applySplitCriticalEdges(); return DT->getRoots(); } - inline MachineBasicBlock *getRoot() const { + MachineBasicBlock *getRoot() const { applySplitCriticalEdges(); return DT->getRoot(); } - inline MachineDomTreeNode *getRootNode() const { + MachineDomTreeNode *getRootNode() const { applySplitCriticalEdges(); return DT->getRootNode(); } bool runOnMachineFunction(MachineFunction &F) override; - inline bool dominates(const MachineDomTreeNode* A, - const MachineDomTreeNode* B) const { + bool dominates(const MachineDomTreeNode *A, + const MachineDomTreeNode *B) const { applySplitCriticalEdges(); return DT->dominates(A, B); } - inline bool dominates(const MachineBasicBlock* A, - const MachineBasicBlock* B) const { + bool dominates(const MachineBasicBlock *A, const MachineBasicBlock *B) const { applySplitCriticalEdges(); return DT->dominates(A, B); } @@ -133,36 +134,30 @@ public: for (; &*I != A && &*I != B; ++I) /*empty*/ ; - //if(!DT.IsPostDominators) { - // A dominates B if it is found first in the basic block. - return &*I == A; - //} else { - // // A post-dominates B if B is found first in the basic block. - // return &*I == B; - //} + return &*I == A; } - inline bool properlyDominates(const MachineDomTreeNode* A, - const MachineDomTreeNode* B) const { + bool properlyDominates(const MachineDomTreeNode *A, + const MachineDomTreeNode *B) const { applySplitCriticalEdges(); return DT->properlyDominates(A, B); } - inline bool properlyDominates(const MachineBasicBlock* A, - const MachineBasicBlock* B) const { + bool properlyDominates(const MachineBasicBlock *A, + const MachineBasicBlock *B) const { applySplitCriticalEdges(); return DT->properlyDominates(A, B); } /// findNearestCommonDominator - Find nearest common dominator basic block /// for basic block A and B. If there is no such block then return NULL. - inline MachineBasicBlock *findNearestCommonDominator(MachineBasicBlock *A, - MachineBasicBlock *B) { + MachineBasicBlock *findNearestCommonDominator(MachineBasicBlock *A, + MachineBasicBlock *B) { applySplitCriticalEdges(); return DT->findNearestCommonDominator(A, B); } - inline MachineDomTreeNode *operator[](MachineBasicBlock *BB) const { + MachineDomTreeNode *operator[](MachineBasicBlock *BB) const { applySplitCriticalEdges(); return DT->getNode(BB); } @@ -170,7 +165,7 @@ public: /// getNode - return the (Post)DominatorTree node for the specified basic /// block. This is the same as using operator[] on this class. /// - inline MachineDomTreeNode *getNode(MachineBasicBlock *BB) const { + MachineDomTreeNode *getNode(MachineBasicBlock *BB) const { applySplitCriticalEdges(); return DT->getNode(BB); } @@ -178,8 +173,8 @@ public: /// addNewBlock - Add a new node to the dominator tree information. This /// creates a new node as a child of DomBB dominator node,linking it into /// the children list of the immediate dominator. - inline MachineDomTreeNode *addNewBlock(MachineBasicBlock *BB, - MachineBasicBlock *DomBB) { + MachineDomTreeNode *addNewBlock(MachineBasicBlock *BB, + MachineBasicBlock *DomBB) { applySplitCriticalEdges(); return DT->addNewBlock(BB, DomBB); } @@ -187,14 +182,14 @@ public: /// changeImmediateDominator - This method is used to update the dominator /// tree information when a node's immediate dominator changes. 
/// - inline void changeImmediateDominator(MachineBasicBlock *N, - MachineBasicBlock* NewIDom) { + void changeImmediateDominator(MachineBasicBlock *N, + MachineBasicBlock *NewIDom) { applySplitCriticalEdges(); DT->changeImmediateDominator(N, NewIDom); } - inline void changeImmediateDominator(MachineDomTreeNode *N, - MachineDomTreeNode* NewIDom) { + void changeImmediateDominator(MachineDomTreeNode *N, + MachineDomTreeNode *NewIDom) { applySplitCriticalEdges(); DT->changeImmediateDominator(N, NewIDom); } @@ -202,14 +197,14 @@ public: /// eraseNode - Removes a node from the dominator tree. Block must not /// dominate any other blocks. Removes node from its immediate dominator's /// children list. Deletes dominator node associated with basic block BB. - inline void eraseNode(MachineBasicBlock *BB) { + void eraseNode(MachineBasicBlock *BB) { applySplitCriticalEdges(); DT->eraseNode(BB); } /// splitBlock - BB is split and now it has one successor. Update dominator /// tree to reflect this change. - inline void splitBlock(MachineBasicBlock* NewBB) { + void splitBlock(MachineBasicBlock* NewBB) { applySplitCriticalEdges(); DT->splitBlock(NewBB); } diff --git a/include/llvm/CodeGen/MachineFrameInfo.h b/include/llvm/CodeGen/MachineFrameInfo.h index 761735120a64..01fc50d14a7f 100644 --- a/include/llvm/CodeGen/MachineFrameInfo.h +++ b/include/llvm/CodeGen/MachineFrameInfo.h @@ -14,6 +14,7 @@ #define LLVM_CODEGEN_MACHINEFRAMEINFO_H #include "llvm/ADT/SmallVector.h" +#include "llvm/Support/Alignment.h" #include "llvm/Support/DataTypes.h" #include #include @@ -129,7 +130,7 @@ private: uint64_t Size; // The required alignment of this stack slot. - unsigned Alignment; + Align Alignment; // If true, the value of the stack object is set before // entering the function and is not modified inside the function. By @@ -180,17 +181,16 @@ private: uint8_t SSPLayout; - StackObject(uint64_t Size, unsigned Alignment, int64_t SPOffset, + StackObject(uint64_t Size, Align Alignment, int64_t SPOffset, bool IsImmutable, bool IsSpillSlot, const AllocaInst *Alloca, bool IsAliased, uint8_t StackID = 0) - : SPOffset(SPOffset), Size(Size), Alignment(Alignment), - isImmutable(IsImmutable), isSpillSlot(IsSpillSlot), - StackID(StackID), Alloca(Alloca), isAliased(IsAliased), - SSPLayout(SSPLK_None) {} + : SPOffset(SPOffset), Size(Size), Alignment(Alignment), + isImmutable(IsImmutable), isSpillSlot(IsSpillSlot), StackID(StackID), + Alloca(Alloca), isAliased(IsAliased), SSPLayout(SSPLK_None) {} }; /// The alignment of the stack. - unsigned StackAlignment; + Align StackAlignment; /// Can the stack be realigned. This can be false if the target does not /// support stack realignment, or if the user asks us not to realign the @@ -260,7 +260,7 @@ private: /// native alignment maintained by the compiler, dynamic alignment code will /// be needed. /// - unsigned MaxAlignment = 0; + Align MaxAlignment; /// Set to true if this function adjusts the stack -- e.g., /// when calling another function. This is only valid during and after @@ -304,7 +304,7 @@ private: /// Required alignment of the local object blob, which is the strictest /// alignment of any object in it. - unsigned LocalFrameMaxAlign = 0; + Align LocalFrameMaxAlign; /// Whether the local object blob needs to be allocated together. 
If not, /// PEI should ignore the isPreAllocated flags on the stack objects and @@ -338,8 +338,8 @@ private: public: explicit MachineFrameInfo(unsigned StackAlignment, bool StackRealignable, bool ForcedRealign) - : StackAlignment(StackAlignment), StackRealignable(StackRealignable), - ForcedRealign(ForcedRealign) {} + : StackAlignment(assumeAligned(StackAlignment)), + StackRealignable(StackRealignable), ForcedRealign(ForcedRealign) {} /// Return true if there are any stack objects in this function. bool hasStackObjects() const { return !Objects.empty(); } @@ -419,10 +419,12 @@ public: /// Required alignment of the local object blob, /// which is the strictest alignment of any object in it. - void setLocalFrameMaxAlign(unsigned Align) { LocalFrameMaxAlign = Align; } + void setLocalFrameMaxAlign(Align Alignment) { + LocalFrameMaxAlign = Alignment; + } /// Return the required alignment of the local object blob. - unsigned getLocalFrameMaxAlign() const { return LocalFrameMaxAlign; } + Align getLocalFrameMaxAlign() const { return LocalFrameMaxAlign; } /// Get whether the local allocation blob should be allocated together or /// let PEI allocate the locals in it directly. @@ -462,14 +464,14 @@ public: unsigned getObjectAlignment(int ObjectIdx) const { assert(unsigned(ObjectIdx+NumFixedObjects) < Objects.size() && "Invalid Object Idx!"); - return Objects[ObjectIdx+NumFixedObjects].Alignment; + return Objects[ObjectIdx + NumFixedObjects].Alignment.value(); } /// setObjectAlignment - Change the alignment of the specified stack object. void setObjectAlignment(int ObjectIdx, unsigned Align) { assert(unsigned(ObjectIdx+NumFixedObjects) < Objects.size() && "Invalid Object Idx!"); - Objects[ObjectIdx+NumFixedObjects].Alignment = Align; + Objects[ObjectIdx + NumFixedObjects].Alignment = assumeAligned(Align); // Only ensure max alignment for the default stack. if (getStackID(ObjectIdx) == 0) @@ -561,10 +563,14 @@ public: /// Return the alignment in bytes that this function must be aligned to, /// which is greater than the default stack alignment provided by the target. - unsigned getMaxAlignment() const { return MaxAlignment; } + unsigned getMaxAlignment() const { return MaxAlignment.value(); } /// Make sure the function is at least Align bytes aligned. - void ensureMaxAlignment(unsigned Align); + void ensureMaxAlignment(Align Alignment); + /// FIXME: Remove this once transition to Align is over. + inline void ensureMaxAlignment(unsigned Align) { + ensureMaxAlignment(assumeAligned(Align)); + } /// Return true if this function adjusts the stack -- e.g., /// when calling another function. This is only valid during and after @@ -728,12 +734,24 @@ public: /// Create a new statically sized stack object, returning /// a nonnegative identifier to represent it. - int CreateStackObject(uint64_t Size, unsigned Alignment, bool isSpillSlot, + int CreateStackObject(uint64_t Size, Align Alignment, bool isSpillSlot, const AllocaInst *Alloca = nullptr, uint8_t ID = 0); + /// FIXME: Remove this function when transition to Align is over. + inline int CreateStackObject(uint64_t Size, unsigned Alignment, + bool isSpillSlot, + const AllocaInst *Alloca = nullptr, + uint8_t ID = 0) { + return CreateStackObject(Size, assumeAligned(Alignment), isSpillSlot, + Alloca, ID); + } /// Create a new statically sized stack object that represents a spill slot, /// returning a nonnegative identifier to represent it. 
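CreateStackObject and ensureMaxAlignment now take llvm::Align, with the unsigned overloads kept only as transition shims. A minimal sketch of creating an aligned spill slot; the 16-byte figure and helper name are illustrative:

  #include "llvm/CodeGen/MachineFrameInfo.h"
  #include "llvm/Support/Alignment.h"
  #include <cstdint>
  using namespace llvm;

  int createAlignedSpillSlot(MachineFrameInfo &MFI, uint64_t Size) {
    // Prefer the Align overload; the unsigned form forwards via assumeAligned.
    int FI = MFI.CreateStackObject(Size, Align(16), /*isSpillSlot=*/true);
    MFI.ensureMaxAlignment(Align(16));
    return FI;
  }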
- int CreateSpillStackObject(uint64_t Size, unsigned Alignment); + int CreateSpillStackObject(uint64_t Size, Align Alignment); + /// FIXME: Remove this function when transition to Align is over. + inline int CreateSpillStackObject(uint64_t Size, unsigned Alignment) { + return CreateSpillStackObject(Size, assumeAligned(Alignment)); + } /// Remove or mark dead a statically sized stack object. void RemoveStackObject(int ObjectIdx) { @@ -744,7 +762,11 @@ public: /// Notify the MachineFrameInfo object that a variable sized object has been /// created. This must be created whenever a variable sized object is /// created, whether or not the index returned is actually used. - int CreateVariableSizedObject(unsigned Alignment, const AllocaInst *Alloca); + int CreateVariableSizedObject(Align Alignment, const AllocaInst *Alloca); + /// FIXME: Remove this function when transition to Align is over. + int CreateVariableSizedObject(unsigned Alignment, const AllocaInst *Alloca) { + return CreateVariableSizedObject(assumeAligned(Alignment), Alloca); + } /// Returns a reference to call saved info vector for the current function. const std::vector &getCalleeSavedInfo() const { diff --git a/include/llvm/CodeGen/MachineFunction.h b/include/llvm/CodeGen/MachineFunction.h index 201c126ee52e..3a3176e51c51 100644 --- a/include/llvm/CodeGen/MachineFunction.h +++ b/include/llvm/CodeGen/MachineFunction.h @@ -36,6 +36,7 @@ #include "llvm/Support/Compiler.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/Recycler.h" +#include "llvm/Target/TargetMachine.h" #include #include #include @@ -277,7 +278,7 @@ class MachineFunction { unsigned FunctionNumber; /// Alignment - The alignment of the function. - unsigned Alignment; + Align Alignment; /// ExposesReturnsTwice - True if the function calls setjmp or related /// functions with attribute "returns twice", but doesn't have @@ -322,7 +323,7 @@ class MachineFunction { std::vector> CodeViewAnnotations; /// CodeView heapallocsites. - std::vector> + std::vector> CodeViewHeapAllocSites; bool CallsEHReturn = false; @@ -400,6 +401,17 @@ private: /// Map a call instruction to call site arguments forwarding info. CallSiteInfoMap CallSitesInfo; + /// A helper function that returns call site info for a give call + /// instruction if debug entry value support is enabled. + CallSiteInfoMap::iterator getCallSiteInfo(const MachineInstr *MI) { + assert(MI->isCall() && + "Call site info refers only to call instructions!"); + + if (!Target.Options.EnableDebugEntryValues) + return CallSitesInfo.end(); + return CallSitesInfo.find(MI); + } + // Callbacks for insertion and removal. void handleInsertion(MachineInstr &MI); void handleRemoval(MachineInstr &MI); @@ -508,15 +520,16 @@ public: const WinEHFuncInfo *getWinEHFuncInfo() const { return WinEHInfo; } WinEHFuncInfo *getWinEHFuncInfo() { return WinEHInfo; } - /// getAlignment - Return the alignment (log2, not bytes) of the function. - unsigned getAlignment() const { return Alignment; } + /// getAlignment - Return the alignment of the function. + Align getAlignment() const { return Alignment; } - /// setAlignment - Set the alignment (log2, not bytes) of the function. - void setAlignment(unsigned A) { Alignment = A; } + /// setAlignment - Set the alignment of the function. + void setAlignment(Align A) { Alignment = A; } - /// ensureAlignment - Make sure the function is at least 1 << A bytes aligned. 
- void ensureAlignment(unsigned A) { - if (Alignment < A) Alignment = A; + /// ensureAlignment - Make sure the function is at least A bytes aligned. + void ensureAlignment(Align A) { + if (Alignment < A) + Alignment = A; } /// exposesReturnsTwice - Returns true if the function calls setjmp or @@ -935,10 +948,10 @@ public: } /// Record heapallocsites - void addCodeViewHeapAllocSite(MachineInstr *I, MDNode *MD); + void addCodeViewHeapAllocSite(MachineInstr *I, const MDNode *MD); - ArrayRef> - getCodeViewHeapAllocSites() const { + ArrayRef> + getCodeViewHeapAllocSites() const { return CodeViewHeapAllocSites; } @@ -976,12 +989,24 @@ public: return CallSitesInfo; } - /// Update call sites info by deleting entry for \p Old call instruction. - /// If \p New is present then transfer \p Old call info to it. This function - /// should be called before removing call instruction or before replacing - /// call instruction with new one. - void updateCallSiteInfo(const MachineInstr *Old, - const MachineInstr *New = nullptr); + /// Following functions update call site info. They should be called before + /// removing, replacing or copying call instruction. + + /// Move the call site info from \p Old to \New call site info. This function + /// is used when we are replacing one call instruction with another one to + /// the same callee. + void moveCallSiteInfo(const MachineInstr *Old, + const MachineInstr *New); + + /// Erase the call site info for \p MI. It is used to remove a call + /// instruction from the instruction stream. + void eraseCallSiteInfo(const MachineInstr *MI); + + /// Copy the call site info from \p Old to \ New. Its usage is when we are + /// making a copy of the instruction that will be inserted at different point + /// of the instruction stream. + void copyCallSiteInfo(const MachineInstr *Old, + const MachineInstr *New); }; //===--------------------------------------------------------------------===// diff --git a/include/llvm/CodeGen/MachineInstr.h b/include/llvm/CodeGen/MachineInstr.h index c82c5b137507..c94ad292ec96 100644 --- a/include/llvm/CodeGen/MachineInstr.h +++ b/include/llvm/CodeGen/MachineInstr.h @@ -20,11 +20,9 @@ #include "llvm/ADT/ilist.h" #include "llvm/ADT/ilist_node.h" #include "llvm/ADT/iterator_range.h" -#include "llvm/Analysis/AliasAnalysis.h" #include "llvm/CodeGen/MachineMemOperand.h" #include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/TargetOpcodes.h" -#include "llvm/IR/DebugInfoMetadata.h" #include "llvm/IR/DebugLoc.h" #include "llvm/IR/InlineAsm.h" #include "llvm/MC/MCInstrDesc.h" @@ -38,6 +36,7 @@ namespace llvm { +class AAResults; template class ArrayRef; class DIExpression; class DILocalVariable; @@ -427,6 +426,22 @@ public: return getNumExplicitDefs() + MCID->getNumImplicitDefs(); } + /// Returns true if the instruction has implicit definition. + bool hasImplicitDef() const { + for (unsigned I = getNumExplicitOperands(), E = getNumOperands(); + I != E; ++I) { + const MachineOperand &MO = getOperand(I); + if (MO.isDef() && MO.isImplicit()) + return true; + } + return false; + } + + /// Returns the implicit operands number. + unsigned getNumImplicitOperands() const { + return getNumOperands() - getNumExplicitOperands(); + } + /// Return true if operand \p OpIdx is a subregister index. 
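The single updateCallSiteInfo entry point is split above into move, erase and copy operations that must run before the corresponding call instruction is rewritten, removed or duplicated. A sketch of the replace-call case; replaceCallWith and the surrounding names are invented for illustration:

  #include "llvm/CodeGen/MachineFunction.h"
  #include "llvm/CodeGen/MachineInstr.h"
  using namespace llvm;

  // NewMI is assumed to be a call to the same callee as OldMI.
  void replaceCallWith(MachineFunction &MF, MachineInstr *OldMI,
                       MachineInstr *NewMI) {
    MF.moveCallSiteInfo(OldMI, NewMI); // keep entry-value debug info intact
    OldMI->eraseFromParent();
  }

When a call is simply deleted rather than replaced, eraseCallSiteInfo(MI) plays the same role before eraseFromParent.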
bool isOperandSubregIdx(unsigned OpIdx) const { assert(getOperand(OpIdx).getType() == MachineOperand::MO_Immediate && @@ -602,6 +617,12 @@ public: return hasPropertyInBundle(1ULL << MCFlag, Type); } + /// Return true if this is an instruction that should go through the usual + /// legalization steps. + bool isPreISelOpcode(QueryType Type = IgnoreBundle) const { + return hasProperty(MCID::PreISelOpcode, Type); + } + /// Return true if this instruction can have a variable number of operands. /// In this case, the variable operands will be after the normal /// operands but before the implicit definitions and uses (if any are @@ -1020,15 +1041,13 @@ public: } /// A DBG_VALUE is an entry value iff its debug expression contains the - /// DW_OP_entry_value DWARF operation. - bool isDebugEntryValue() const { - return isDebugValue() && getDebugExpression()->isEntryValue(); - } + /// DW_OP_LLVM_entry_value operation. + bool isDebugEntryValue() const; /// Return true if the instruction is a debug value which describes a part of /// a variable as unavailable. bool isUndefDebugValue() const { - return isDebugValue() && getOperand(0).isReg() && !getOperand(0).getReg(); + return isDebugValue() && getOperand(0).isReg() && !getOperand(0).getReg().isValid(); } bool isPHI() const { @@ -1140,7 +1159,7 @@ public: /// is a read of a super-register. /// This does not count partial redefines of virtual registers as reads: /// %reg1024:6 = OP. - bool readsRegister(unsigned Reg, + bool readsRegister(Register Reg, const TargetRegisterInfo *TRI = nullptr) const { return findRegisterUseOperandIdx(Reg, false, TRI) != -1; } @@ -1148,20 +1167,20 @@ public: /// Return true if the MachineInstr reads the specified virtual register. /// Take into account that a partial define is a /// read-modify-write operation. - bool readsVirtualRegister(unsigned Reg) const { + bool readsVirtualRegister(Register Reg) const { return readsWritesVirtualRegister(Reg).first; } /// Return a pair of bools (reads, writes) indicating if this instruction /// reads or writes Reg. This also considers partial defines. /// If Ops is not null, all operand indices for Reg are added. - std::pair readsWritesVirtualRegister(unsigned Reg, + std::pair readsWritesVirtualRegister(Register Reg, SmallVectorImpl *Ops = nullptr) const; /// Return true if the MachineInstr kills the specified register. /// If TargetRegisterInfo is passed, then it also checks if there is /// a kill of a super-register. - bool killsRegister(unsigned Reg, + bool killsRegister(Register Reg, const TargetRegisterInfo *TRI = nullptr) const { return findRegisterUseOperandIdx(Reg, true, TRI) != -1; } @@ -1170,7 +1189,7 @@ public: /// If TargetRegisterInfo is passed, then it also checks /// if there is a def of a super-register. /// NOTE: It's ignoring subreg indices on virtual registers. - bool definesRegister(unsigned Reg, + bool definesRegister(Register Reg, const TargetRegisterInfo *TRI = nullptr) const { return findRegisterDefOperandIdx(Reg, false, false, TRI) != -1; } @@ -1178,38 +1197,38 @@ public: /// Return true if the MachineInstr modifies (fully define or partially /// define) the specified register. /// NOTE: It's ignoring subreg indices on virtual registers. - bool modifiesRegister(unsigned Reg, const TargetRegisterInfo *TRI) const { + bool modifiesRegister(Register Reg, const TargetRegisterInfo *TRI) const { return findRegisterDefOperandIdx(Reg, false, true, TRI) != -1; } /// Returns true if the register is dead in this machine instruction. 
/// If TargetRegisterInfo is passed, then it also checks /// if there is a dead def of a super-register. - bool registerDefIsDead(unsigned Reg, + bool registerDefIsDead(Register Reg, const TargetRegisterInfo *TRI = nullptr) const { return findRegisterDefOperandIdx(Reg, true, false, TRI) != -1; } /// Returns true if the MachineInstr has an implicit-use operand of exactly /// the given register (not considering sub/super-registers). - bool hasRegisterImplicitUseOperand(unsigned Reg) const; + bool hasRegisterImplicitUseOperand(Register Reg) const; /// Returns the operand index that is a use of the specific register or -1 /// if it is not found. It further tightens the search criteria to a use /// that kills the register if isKill is true. - int findRegisterUseOperandIdx(unsigned Reg, bool isKill = false, + int findRegisterUseOperandIdx(Register Reg, bool isKill = false, const TargetRegisterInfo *TRI = nullptr) const; /// Wrapper for findRegisterUseOperandIdx, it returns /// a pointer to the MachineOperand rather than an index. - MachineOperand *findRegisterUseOperand(unsigned Reg, bool isKill = false, + MachineOperand *findRegisterUseOperand(Register Reg, bool isKill = false, const TargetRegisterInfo *TRI = nullptr) { int Idx = findRegisterUseOperandIdx(Reg, isKill, TRI); return (Idx == -1) ? nullptr : &getOperand(Idx); } const MachineOperand *findRegisterUseOperand( - unsigned Reg, bool isKill = false, + Register Reg, bool isKill = false, const TargetRegisterInfo *TRI = nullptr) const { return const_cast(this)-> findRegisterUseOperand(Reg, isKill, TRI); @@ -1221,14 +1240,14 @@ public: /// overlap the specified register. If TargetRegisterInfo is non-null, /// then it also checks if there is a def of a super-register. /// This may also return a register mask operand when Overlap is true. - int findRegisterDefOperandIdx(unsigned Reg, + int findRegisterDefOperandIdx(Register Reg, bool isDead = false, bool Overlap = false, const TargetRegisterInfo *TRI = nullptr) const; /// Wrapper for findRegisterDefOperandIdx, it returns /// a pointer to the MachineOperand rather than an index. MachineOperand * - findRegisterDefOperand(unsigned Reg, bool isDead = false, + findRegisterDefOperand(Register Reg, bool isDead = false, bool Overlap = false, const TargetRegisterInfo *TRI = nullptr) { int Idx = findRegisterDefOperandIdx(Reg, isDead, Overlap, TRI); @@ -1236,7 +1255,7 @@ public: } const MachineOperand * - findRegisterDefOperand(unsigned Reg, bool isDead = false, + findRegisterDefOperand(Register Reg, bool isDead = false, bool Overlap = false, const TargetRegisterInfo *TRI = nullptr) const { return const_cast(this)->findRegisterDefOperand( @@ -1283,7 +1302,7 @@ public: /// /// \pre CurRC must not be NULL. const TargetRegisterClass *getRegClassConstraintEffectForVReg( - unsigned Reg, const TargetRegisterClass *CurRC, + Register Reg, const TargetRegisterClass *CurRC, const TargetInstrInfo *TII, const TargetRegisterInfo *TRI, bool ExploreBundle = false) const; @@ -1346,39 +1365,39 @@ public: /// Replace all occurrences of FromReg with ToReg:SubIdx, /// properly composing subreg indices where necessary. - void substituteRegister(unsigned FromReg, unsigned ToReg, unsigned SubIdx, + void substituteRegister(Register FromReg, Register ToReg, unsigned SubIdx, const TargetRegisterInfo &RegInfo); /// We have determined MI kills a register. Look for the /// operand that uses it and mark it as IsKill. If AddIfNotFound is true, /// add a implicit operand if it's not found. 
Returns true if the operand /// exists / is added. - bool addRegisterKilled(unsigned IncomingReg, + bool addRegisterKilled(Register IncomingReg, const TargetRegisterInfo *RegInfo, bool AddIfNotFound = false); /// Clear all kill flags affecting Reg. If RegInfo is provided, this includes /// all aliasing registers. - void clearRegisterKills(unsigned Reg, const TargetRegisterInfo *RegInfo); + void clearRegisterKills(Register Reg, const TargetRegisterInfo *RegInfo); /// We have determined MI defined a register without a use. /// Look for the operand that defines it and mark it as IsDead. If /// AddIfNotFound is true, add a implicit operand if it's not found. Returns /// true if the operand exists / is added. - bool addRegisterDead(unsigned Reg, const TargetRegisterInfo *RegInfo, + bool addRegisterDead(Register Reg, const TargetRegisterInfo *RegInfo, bool AddIfNotFound = false); /// Clear all dead flags on operands defining register @p Reg. - void clearRegisterDeads(unsigned Reg); + void clearRegisterDeads(Register Reg); /// Mark all subregister defs of register @p Reg with the undef flag. /// This function is used when we determined to have a subregister def in an /// otherwise undefined super register. - void setRegisterDefReadUndef(unsigned Reg, bool IsUndef = true); + void setRegisterDefReadUndef(Register Reg, bool IsUndef = true); /// We have determined MI defines a register. Make sure there is an operand /// defining Reg. - void addRegisterDefined(unsigned Reg, + void addRegisterDefined(Register Reg, const TargetRegisterInfo *RegInfo = nullptr); /// Mark every physreg used by this instruction as @@ -1386,13 +1405,13 @@ public: /// /// On instructions with register mask operands, also add implicit-def /// operands for all registers in UsedRegs. - void setPhysRegsDeadExcept(ArrayRef UsedRegs, + void setPhysRegsDeadExcept(ArrayRef UsedRegs, const TargetRegisterInfo &TRI); /// Return true if it is safe to move this instruction. If /// SawStore is set to true, it means that there is a store (or call) between /// the instruction's location and its intended destination. - bool isSafeToMove(AliasAnalysis *AA, bool &SawStore) const; + bool isSafeToMove(AAResults *AA, bool &SawStore) const; /// Returns true if this instruction's memory access aliases the memory /// access of Other. @@ -1404,7 +1423,7 @@ public: /// @param AA Optional alias analysis, used to compare memory operands. /// @param Other MachineInstr to check aliasing against. /// @param UseTBAA Whether to pass TBAA information to alias analysis. - bool mayAlias(AliasAnalysis *AA, const MachineInstr &Other, bool UseTBAA) const; + bool mayAlias(AAResults *AA, const MachineInstr &Other, bool UseTBAA) const; /// Return true if this instruction may have an ordered /// or volatile memory reference, or if the information describing the memory @@ -1419,7 +1438,7 @@ public: /// argument area of a function (if it does not change). If the instruction /// does multiple loads, this returns true only if all of the loads are /// dereferenceable and invariant. - bool isDereferenceableInvariantLoad(AliasAnalysis *AA) const; + bool isDereferenceableInvariantLoad(AAResults *AA) const; /// If the specified instruction is a PHI that always merges together the /// same virtual register, return the register, otherwise return 0. @@ -1603,9 +1622,15 @@ public: /// Scan instructions following MI and collect any matching DBG_VALUEs. 
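The register queries above now take llvm::Register, and the alias-analysis parameter is spelled AAResults rather than the AliasAnalysis typedef; callers only need a recompile. For illustration (the helper name is invented):

  #include "llvm/Analysis/AliasAnalysis.h"
  #include "llvm/CodeGen/MachineInstr.h"
  using namespace llvm;

  // True if MI is safe to move and neither reads nor writes Reg.
  bool canMoveIgnoring(MachineInstr &MI, Register Reg, AAResults *AA,
                       const TargetRegisterInfo *TRI) {
    bool SawStore = false;
    return MI.isSafeToMove(AA, SawStore) && !MI.readsRegister(Reg, TRI) &&
           !MI.definesRegister(Reg, TRI);
  }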
void collectDebugValues(SmallVectorImpl &DbgValues); - /// Find all DBG_VALUEs immediately following this instruction that point - /// to a register def in this instruction and point them to \p Reg instead. - void changeDebugValuesDefReg(unsigned Reg); + /// Find all DBG_VALUEs that point to the register def in this instruction + /// and point them to \p Reg instead. + void changeDebugValuesDefReg(Register Reg); + + /// Returns the Intrinsic::ID for this instruction. + /// \pre Must have an intrinsic ID operand. + unsigned getIntrinsicID() const { + return getOperand(getNumExplicitDefs()).getIntrinsicID(); + } private: /// If this instruction is embedded into a MachineFunction, return the @@ -1630,7 +1655,7 @@ private: /// this MI and the given operand index \p OpIdx. /// If the related operand does not constrained Reg, this returns CurRC. const TargetRegisterClass *getRegClassConstraintEffectForVRegImpl( - unsigned OpIdx, unsigned Reg, const TargetRegisterClass *CurRC, + unsigned OpIdx, Register Reg, const TargetRegisterClass *CurRC, const TargetInstrInfo *TII, const TargetRegisterInfo *TRI) const; }; diff --git a/include/llvm/CodeGen/MachineInstrBuilder.h b/include/llvm/CodeGen/MachineInstrBuilder.h index 6d7fb72b6bd1..880d4829ac7e 100644 --- a/include/llvm/CodeGen/MachineInstrBuilder.h +++ b/include/llvm/CodeGen/MachineInstrBuilder.h @@ -85,7 +85,7 @@ public: Register getReg(unsigned Idx) const { return MI->getOperand(Idx).getReg(); } /// Add a new virtual register operand. - const MachineInstrBuilder &addReg(unsigned RegNo, unsigned flags = 0, + const MachineInstrBuilder &addReg(Register RegNo, unsigned flags = 0, unsigned SubReg = 0) const { assert((flags & 0x1) == 0 && "Passing in 'true' to addReg is forbidden! Use enums instead."); @@ -104,14 +104,14 @@ public: } /// Add a virtual register definition operand. - const MachineInstrBuilder &addDef(unsigned RegNo, unsigned Flags = 0, + const MachineInstrBuilder &addDef(Register RegNo, unsigned Flags = 0, unsigned SubReg = 0) const { return addReg(RegNo, Flags | RegState::Define, SubReg); } /// Add a virtual register use operand. It is an error for Flags to contain /// `RegState::Define` when calling this function. 
- const MachineInstrBuilder &addUse(unsigned RegNo, unsigned Flags = 0, + const MachineInstrBuilder &addUse(Register RegNo, unsigned Flags = 0, unsigned SubReg = 0) const { assert(!(Flags & RegState::Define) && "Misleading addUse defines register, use addReg instead."); @@ -135,7 +135,7 @@ public: } const MachineInstrBuilder &addMBB(MachineBasicBlock *MBB, - unsigned char TargetFlags = 0) const { + unsigned TargetFlags = 0) const { MI->addOperand(*MF, MachineOperand::CreateMBB(MBB, TargetFlags)); return *this; } @@ -145,42 +145,42 @@ public: return *this; } - const MachineInstrBuilder &addConstantPoolIndex(unsigned Idx, - int Offset = 0, - unsigned char TargetFlags = 0) const { + const MachineInstrBuilder & + addConstantPoolIndex(unsigned Idx, int Offset = 0, + unsigned TargetFlags = 0) const { MI->addOperand(*MF, MachineOperand::CreateCPI(Idx, Offset, TargetFlags)); return *this; } const MachineInstrBuilder &addTargetIndex(unsigned Idx, int64_t Offset = 0, - unsigned char TargetFlags = 0) const { + unsigned TargetFlags = 0) const { MI->addOperand(*MF, MachineOperand::CreateTargetIndex(Idx, Offset, TargetFlags)); return *this; } const MachineInstrBuilder &addJumpTableIndex(unsigned Idx, - unsigned char TargetFlags = 0) const { + unsigned TargetFlags = 0) const { MI->addOperand(*MF, MachineOperand::CreateJTI(Idx, TargetFlags)); return *this; } const MachineInstrBuilder &addGlobalAddress(const GlobalValue *GV, int64_t Offset = 0, - unsigned char TargetFlags = 0) const { + unsigned TargetFlags = 0) const { MI->addOperand(*MF, MachineOperand::CreateGA(GV, Offset, TargetFlags)); return *this; } const MachineInstrBuilder &addExternalSymbol(const char *FnName, - unsigned char TargetFlags = 0) const { + unsigned TargetFlags = 0) const { MI->addOperand(*MF, MachineOperand::CreateES(FnName, TargetFlags)); return *this; } const MachineInstrBuilder &addBlockAddress(const BlockAddress *BA, int64_t Offset = 0, - unsigned char TargetFlags = 0) const { + unsigned TargetFlags = 0) const { MI->addOperand(*MF, MachineOperand::CreateBA(BA, Offset, TargetFlags)); return *this; } @@ -250,6 +250,11 @@ public: return *this; } + const MachineInstrBuilder &addShuffleMask(const Constant *Val) const { + MI->addOperand(*MF, MachineOperand::CreateShuffleMask(Val)); + return *this; + } + const MachineInstrBuilder &addSym(MCSymbol *Sym, unsigned char TargetFlags = 0) const { MI->addOperand(*MF, MachineOperand::CreateMCSymbol(Sym, TargetFlags)); @@ -316,7 +321,7 @@ inline MachineInstrBuilder BuildMI(MachineFunction &MF, const DebugLoc &DL, /// This version of the builder sets up the first operand as a /// destination virtual register. 
inline MachineInstrBuilder BuildMI(MachineFunction &MF, const DebugLoc &DL, - const MCInstrDesc &MCID, unsigned DestReg) { + const MCInstrDesc &MCID, Register DestReg) { return MachineInstrBuilder(MF, MF.CreateMachineInstr(MCID, DL)) .addReg(DestReg, RegState::Define); } @@ -327,7 +332,7 @@ inline MachineInstrBuilder BuildMI(MachineFunction &MF, const DebugLoc &DL, inline MachineInstrBuilder BuildMI(MachineBasicBlock &BB, MachineBasicBlock::iterator I, const DebugLoc &DL, const MCInstrDesc &MCID, - unsigned DestReg) { + Register DestReg) { MachineFunction &MF = *BB.getParent(); MachineInstr *MI = MF.CreateMachineInstr(MCID, DL); BB.insert(I, MI); @@ -343,7 +348,7 @@ inline MachineInstrBuilder BuildMI(MachineBasicBlock &BB, inline MachineInstrBuilder BuildMI(MachineBasicBlock &BB, MachineBasicBlock::instr_iterator I, const DebugLoc &DL, const MCInstrDesc &MCID, - unsigned DestReg) { + Register DestReg) { MachineFunction &MF = *BB.getParent(); MachineInstr *MI = MF.CreateMachineInstr(MCID, DL); BB.insert(I, MI); @@ -352,7 +357,7 @@ inline MachineInstrBuilder BuildMI(MachineBasicBlock &BB, inline MachineInstrBuilder BuildMI(MachineBasicBlock &BB, MachineInstr &I, const DebugLoc &DL, const MCInstrDesc &MCID, - unsigned DestReg) { + Register DestReg) { // Calling the overload for instr_iterator is always correct. However, the // definition is not available in headers, so inline the check. if (I.isInsideBundle()) @@ -362,7 +367,7 @@ inline MachineInstrBuilder BuildMI(MachineBasicBlock &BB, MachineInstr &I, inline MachineInstrBuilder BuildMI(MachineBasicBlock &BB, MachineInstr *I, const DebugLoc &DL, const MCInstrDesc &MCID, - unsigned DestReg) { + Register DestReg) { return BuildMI(BB, *I, DL, MCID, DestReg); } @@ -416,7 +421,7 @@ inline MachineInstrBuilder BuildMI(MachineBasicBlock *BB, const DebugLoc &DL, /// end of the given MachineBasicBlock, and sets up the first operand as a /// destination virtual register. inline MachineInstrBuilder BuildMI(MachineBasicBlock *BB, const DebugLoc &DL, - const MCInstrDesc &MCID, unsigned DestReg) { + const MCInstrDesc &MCID, Register DestReg) { return BuildMI(*BB, BB->end(), DL, MCID, DestReg); } @@ -426,7 +431,7 @@ inline MachineInstrBuilder BuildMI(MachineBasicBlock *BB, const DebugLoc &DL, /// second operand is an immediate. MachineInstrBuilder BuildMI(MachineFunction &MF, const DebugLoc &DL, const MCInstrDesc &MCID, bool IsIndirect, - unsigned Reg, const MDNode *Variable, + Register Reg, const MDNode *Variable, const MDNode *Expr); /// This version of the builder builds a DBG_VALUE intrinsic @@ -442,7 +447,7 @@ MachineInstrBuilder BuildMI(MachineFunction &MF, const DebugLoc &DL, MachineInstrBuilder BuildMI(MachineBasicBlock &BB, MachineBasicBlock::iterator I, const DebugLoc &DL, const MCInstrDesc &MCID, bool IsIndirect, - unsigned Reg, const MDNode *Variable, + Register Reg, const MDNode *Variable, const MDNode *Expr); /// This version of the builder builds a DBG_VALUE intrinsic @@ -490,16 +495,13 @@ inline unsigned getRenamableRegState(bool B) { /// Get all register state flags from machine operand \p RegOp. 
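The BuildMI overloads above accept llvm::Register destinations, so freshly created virtual registers can be passed without narrowing to unsigned. A minimal sketch; emitCopy is an invented helper:

  #include "llvm/CodeGen/MachineInstrBuilder.h"
  #include "llvm/CodeGen/MachineRegisterInfo.h"
  #include "llvm/CodeGen/TargetInstrInfo.h"
  using namespace llvm;

  // Emit Dst = COPY Src before I and return the new virtual register.
  Register emitCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
                    const DebugLoc &DL, const TargetInstrInfo &TII,
                    MachineRegisterInfo &MRI, const TargetRegisterClass *RC,
                    Register Src) {
    Register Dst = MRI.createVirtualRegister(RC);
    BuildMI(MBB, I, DL, TII.get(TargetOpcode::COPY), Dst).addReg(Src);
    return Dst;
  }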
inline unsigned getRegState(const MachineOperand &RegOp) { assert(RegOp.isReg() && "Not a register operand"); - return getDefRegState(RegOp.isDef()) | - getImplRegState(RegOp.isImplicit()) | - getKillRegState(RegOp.isKill()) | - getDeadRegState(RegOp.isDead()) | - getUndefRegState(RegOp.isUndef()) | - getInternalReadRegState(RegOp.isInternalRead()) | - getDebugRegState(RegOp.isDebug()) | - getRenamableRegState( - TargetRegisterInfo::isPhysicalRegister(RegOp.getReg()) && - RegOp.isRenamable()); + return getDefRegState(RegOp.isDef()) | getImplRegState(RegOp.isImplicit()) | + getKillRegState(RegOp.isKill()) | getDeadRegState(RegOp.isDead()) | + getUndefRegState(RegOp.isUndef()) | + getInternalReadRegState(RegOp.isInternalRead()) | + getDebugRegState(RegOp.isDebug()) | + getRenamableRegState(Register::isPhysicalRegister(RegOp.getReg()) && + RegOp.isRenamable()); } /// Helper class for constructing bundles of MachineInstrs. diff --git a/include/llvm/CodeGen/MachineLoopUtils.h b/include/llvm/CodeGen/MachineLoopUtils.h new file mode 100644 index 000000000000..41379b75d00a --- /dev/null +++ b/include/llvm/CodeGen/MachineLoopUtils.h @@ -0,0 +1,41 @@ +//=- MachineLoopUtils.h - Helper functions for manipulating loops -*- C++ -*-=// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_CODEGEN_MACHINELOOPUTILS_H +#define LLVM_LIB_CODEGEN_MACHINELOOPUTILS_H + +namespace llvm { +class MachineBasicBlock; +class MachineRegisterInfo; +class TargetInstrInfo; + +enum LoopPeelDirection { + LPD_Front, ///< Peel the first iteration of the loop. + LPD_Back ///< Peel the last iteration of the loop. +}; + +/// Peels a single block loop. Loop must have two successors, one of which +/// must be itself. Similarly it must have two predecessors, one of which must +/// be itself. +/// +/// The loop block is copied and inserted into the CFG such that two copies of +/// the loop follow on from each other. The copy is inserted either before or +/// after the loop based on Direction. +/// +/// Phis are updated and an unconditional branch inserted at the end of the +/// clone so as to execute a single iteration. +/// +/// The trip count of Loop is not updated. +MachineBasicBlock *PeelSingleBlockLoop(LoopPeelDirection Direction, + MachineBasicBlock *Loop, + MachineRegisterInfo &MRI, + const TargetInstrInfo *TII); + +} // namespace llvm + +#endif // LLVM_LIB_CODEGEN_MACHINELOOPUTILS_H diff --git a/include/llvm/CodeGen/MachineMemOperand.h b/include/llvm/CodeGen/MachineMemOperand.h index 65f706302bc2..33a48a235e18 100644 --- a/include/llvm/CodeGen/MachineMemOperand.h +++ b/include/llvm/CodeGen/MachineMemOperand.h @@ -293,8 +293,6 @@ public: /// Support for operator<<. 
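MachineLoopUtils.h is new in this import; PeelSingleBlockLoop clones a single-block loop so that one iteration runs outside it. A sketch of peeling the first iteration, assuming LoopBB satisfies the single-block precondition stated in the header:

  #include "llvm/CodeGen/MachineLoopUtils.h"
  #include "llvm/CodeGen/MachineRegisterInfo.h"
  #include "llvm/CodeGen/TargetInstrInfo.h"
  using namespace llvm;

  MachineBasicBlock *peelFirstIteration(MachineBasicBlock *LoopBB,
                                        MachineRegisterInfo &MRI,
                                        const TargetInstrInfo *TII) {
    // The clone is inserted before the loop and ends in an unconditional
    // branch so it executes exactly one iteration; phis are rewritten by the
    // utility, and the trip count is not adjusted.
    return PeelSingleBlockLoop(LPD_Front, LoopBB, MRI, TII);
  }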
/// @{ - void print(raw_ostream &OS) const; - void print(raw_ostream &OS, ModuleSlotTracker &MST) const; void print(raw_ostream &OS, ModuleSlotTracker &MST, SmallVectorImpl &SSNs, const LLVMContext &Context, const MachineFrameInfo *MFI, const TargetInstrInfo *TII) const; @@ -319,11 +317,6 @@ public: } }; -inline raw_ostream &operator<<(raw_ostream &OS, const MachineMemOperand &MRO) { - MRO.print(OS); - return OS; -} - } // End llvm namespace #endif diff --git a/include/llvm/CodeGen/MachineModuleInfo.h b/include/llvm/CodeGen/MachineModuleInfo.h index 4ff5c7fd013a..6902dada2423 100644 --- a/include/llvm/CodeGen/MachineModuleInfo.h +++ b/include/llvm/CodeGen/MachineModuleInfo.h @@ -33,6 +33,7 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/PointerIntPair.h" +#include "llvm/IR/PassManager.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCSymbol.h" #include "llvm/Pass.h" @@ -74,7 +75,10 @@ protected: /// made by different debugging and exception handling schemes and reformated /// for specific use. /// -class MachineModuleInfo : public ImmutablePass { +class MachineModuleInfo { + friend class MachineModuleInfoWrapperPass; + friend class MachineModuleAnalysis; + const LLVMTargetMachine &TM; /// This is the MCContext used for the entire code generator. @@ -140,15 +144,17 @@ class MachineModuleInfo : public ImmutablePass { const Function *LastRequest = nullptr; ///< Used for shortcut/cache. MachineFunction *LastResult = nullptr; ///< Used for shortcut/cache. -public: - static char ID; // Pass identification, replacement for typeid + MachineModuleInfo &operator=(MachineModuleInfo &&MMII) = delete; +public: explicit MachineModuleInfo(const LLVMTargetMachine *TM = nullptr); - ~MachineModuleInfo() override; - // Initialization and Finalization - bool doInitialization(Module &) override; - bool doFinalization(Module &) override; + MachineModuleInfo(MachineModuleInfo &&MMII); + + ~MachineModuleInfo(); + + void initialize(); + void finalize(); const LLVMTargetMachine &getTarget() const { return TM; } @@ -254,6 +260,38 @@ public: /// \} }; // End class MachineModuleInfo +class MachineModuleInfoWrapperPass : public ImmutablePass { + MachineModuleInfo MMI; + +public: + static char ID; // Pass identification, replacement for typeid + explicit MachineModuleInfoWrapperPass(const LLVMTargetMachine *TM = nullptr); + + // Initialization and Finalization + bool doInitialization(Module &) override; + bool doFinalization(Module &) override; + + MachineModuleInfo &getMMI() { return MMI; } + const MachineModuleInfo &getMMI() const { return MMI; } +}; + +/// An analysis that produces \c MachineInfo for a module. +class MachineModuleAnalysis : public AnalysisInfoMixin { + friend AnalysisInfoMixin; + static AnalysisKey Key; + + const LLVMTargetMachine *TM; + +public: + /// Provide the result type for this analysis pass. + using Result = MachineModuleInfo; + + MachineModuleAnalysis(const LLVMTargetMachine *TM) : TM(TM) {} + + /// Run the analysis pass and produce machine module information. 
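MachineModuleInfo stops being an ImmutablePass above; legacy-PM code now reaches it through MachineModuleInfoWrapperPass, while the new pass manager uses MachineModuleAnalysis. A sketch of the legacy-PM side; MyModulePass is an invented placeholder:

  #include "llvm/CodeGen/MachineModuleInfo.h"
  #include "llvm/Pass.h"
  using namespace llvm;

  struct MyModulePass : ModulePass {
    static char ID;
    MyModulePass() : ModulePass(ID) {}

    void getAnalysisUsage(AnalysisUsage &AU) const override {
      AU.addRequired<MachineModuleInfoWrapperPass>();
      AU.setPreservesAll();
    }

    bool runOnModule(Module &M) override {
      MachineModuleInfo &MMI =
          getAnalysis<MachineModuleInfoWrapperPass>().getMMI();
      (void)MMI; // query per-function MachineFunctions, the MCContext, etc.
      return false;
    }
  };
  char MyModulePass::ID = 0;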
+ MachineModuleInfo run(Module &M, ModuleAnalysisManager &); +}; + } // end namespace llvm #endif // LLVM_CODEGEN_MACHINEMODULEINFO_H diff --git a/include/llvm/CodeGen/MachineOperand.h b/include/llvm/CodeGen/MachineOperand.h index 2152c7582e5a..df914dc2d85e 100644 --- a/include/llvm/CodeGen/MachineOperand.h +++ b/include/llvm/CodeGen/MachineOperand.h @@ -23,6 +23,7 @@ namespace llvm { class BlockAddress; +class Constant; class ConstantFP; class ConstantInt; class GlobalValue; @@ -68,7 +69,8 @@ public: MO_CFIIndex, ///< MCCFIInstruction index. MO_IntrinsicID, ///< Intrinsic ID for ISel MO_Predicate, ///< Generic predicate for ISel - MO_Last = MO_Predicate, + MO_ShuffleMask, ///< Other IR Constant for ISel (shuffle masks) + MO_Last = MO_ShuffleMask }; private: @@ -172,6 +174,7 @@ private: unsigned CFIIndex; // For MO_CFI. Intrinsic::ID IntrinsicID; // For MO_IntrinsicID. unsigned Pred; // For MO_Predicate + const Constant *ShuffleMask; // For MO_ShuffleMask struct { // For MO_Register. // Register number is in SmallContents.RegNo. @@ -341,6 +344,7 @@ public: bool isCFIIndex() const { return OpKind == MO_CFIIndex; } bool isIntrinsicID() const { return OpKind == MO_IntrinsicID; } bool isPredicate() const { return OpKind == MO_Predicate; } + bool isShuffleMask() const { return OpKind == MO_ShuffleMask; } //===--------------------------------------------------------------------===// // Accessors for Register Operands //===--------------------------------------------------------------------===// @@ -455,7 +459,7 @@ public: /// Change the register this operand corresponds to. /// - void setReg(unsigned Reg); + void setReg(Register Reg); void setSubReg(unsigned subReg) { assert(isReg() && "Wrong MachineOperand mutator"); @@ -468,13 +472,13 @@ public: /// using TargetRegisterInfo to compose the subreg indices if necessary. /// Reg must be a virtual register, SubIdx can be 0. /// - void substVirtReg(unsigned Reg, unsigned SubIdx, const TargetRegisterInfo&); + void substVirtReg(Register Reg, unsigned SubIdx, const TargetRegisterInfo&); /// substPhysReg - Substitute the current register with the physical register /// Reg, taking any existing SubReg into account. For instance, /// substPhysReg(%eax) will change %reg1024:sub_8bit to %al. /// - void substPhysReg(unsigned Reg, const TargetRegisterInfo&); + void substPhysReg(MCRegister Reg, const TargetRegisterInfo&); void setIsUse(bool Val = true) { setIsDef(!Val); } @@ -579,6 +583,11 @@ public: return Contents.Pred; } + const Constant *getShuffleMask() const { + assert(isShuffleMask() && "Wrong MachineOperand accessor"); + return Contents.ShuffleMask; + } + /// Return the offset from the symbol in this operand. This always returns 0 /// for ExternalSymbol operands. int64_t getOffset() const { @@ -717,11 +726,11 @@ public: void ChangeToFPImmediate(const ConstantFP *FPImm); /// ChangeToES - Replace this operand with a new external symbol operand. - void ChangeToES(const char *SymName, unsigned char TargetFlags = 0); + void ChangeToES(const char *SymName, unsigned TargetFlags = 0); /// ChangeToGA - Replace this operand with a new global address operand. void ChangeToGA(const GlobalValue *GV, int64_t Offset, - unsigned char TargetFlags = 0); + unsigned TargetFlags = 0); /// ChangeToMCSymbol - Replace this operand with a new MC symbol operand. void ChangeToMCSymbol(MCSymbol *Sym); @@ -731,12 +740,12 @@ public: /// Replace this operand with a target index. 
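MO_ShuffleMask above lets instruction selection carry an IR shuffle-mask constant as a first-class operand (see also addShuffleMask on MachineInstrBuilder earlier in the patch). A trivial accessor sketch; the helper name is invented:

  #include "llvm/CodeGen/MachineOperand.h"
  using namespace llvm;

  // Returns the IR mask constant if MO is a shuffle-mask operand, else null.
  const Constant *getShuffleMaskOrNull(const MachineOperand &MO) {
    return MO.isShuffleMask() ? MO.getShuffleMask() : nullptr;
  }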
void ChangeToTargetIndex(unsigned Idx, int64_t Offset, - unsigned char TargetFlags = 0); + unsigned TargetFlags = 0); /// ChangeToRegister - Replace this operand with a new register operand of /// the specified value. If an operand is known to be an register already, /// the setReg method should be used. - void ChangeToRegister(unsigned Reg, bool isDef, bool isImp = false, + void ChangeToRegister(Register Reg, bool isDef, bool isImp = false, bool isKill = false, bool isDead = false, bool isUndef = false, bool isDebug = false); @@ -762,7 +771,7 @@ public: return Op; } - static MachineOperand CreateReg(unsigned Reg, bool isDef, bool isImp = false, + static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp = false, bool isKill = false, bool isDead = false, bool isUndef = false, bool isEarlyClobber = false, @@ -788,7 +797,7 @@ public: return Op; } static MachineOperand CreateMBB(MachineBasicBlock *MBB, - unsigned char TargetFlags = 0) { + unsigned TargetFlags = 0) { MachineOperand Op(MachineOperand::MO_MachineBasicBlock); Op.setMBB(MBB); Op.setTargetFlags(TargetFlags); @@ -800,7 +809,7 @@ public: return Op; } static MachineOperand CreateCPI(unsigned Idx, int Offset, - unsigned char TargetFlags = 0) { + unsigned TargetFlags = 0) { MachineOperand Op(MachineOperand::MO_ConstantPoolIndex); Op.setIndex(Idx); Op.setOffset(Offset); @@ -808,21 +817,21 @@ public: return Op; } static MachineOperand CreateTargetIndex(unsigned Idx, int64_t Offset, - unsigned char TargetFlags = 0) { + unsigned TargetFlags = 0) { MachineOperand Op(MachineOperand::MO_TargetIndex); Op.setIndex(Idx); Op.setOffset(Offset); Op.setTargetFlags(TargetFlags); return Op; } - static MachineOperand CreateJTI(unsigned Idx, unsigned char TargetFlags = 0) { + static MachineOperand CreateJTI(unsigned Idx, unsigned TargetFlags = 0) { MachineOperand Op(MachineOperand::MO_JumpTableIndex); Op.setIndex(Idx); Op.setTargetFlags(TargetFlags); return Op; } static MachineOperand CreateGA(const GlobalValue *GV, int64_t Offset, - unsigned char TargetFlags = 0) { + unsigned TargetFlags = 0) { MachineOperand Op(MachineOperand::MO_GlobalAddress); Op.Contents.OffsetedInfo.Val.GV = GV; Op.setOffset(Offset); @@ -830,7 +839,7 @@ public: return Op; } static MachineOperand CreateES(const char *SymName, - unsigned char TargetFlags = 0) { + unsigned TargetFlags = 0) { MachineOperand Op(MachineOperand::MO_ExternalSymbol); Op.Contents.OffsetedInfo.Val.SymbolName = SymName; Op.setOffset(0); // Offset is always 0. 
@@ -838,7 +847,7 @@ public: return Op; } static MachineOperand CreateBA(const BlockAddress *BA, int64_t Offset, - unsigned char TargetFlags = 0) { + unsigned TargetFlags = 0) { MachineOperand Op(MachineOperand::MO_BlockAddress); Op.Contents.OffsetedInfo.Val.BA = BA; Op.setOffset(Offset); @@ -876,7 +885,7 @@ public: } static MachineOperand CreateMCSymbol(MCSymbol *Sym, - unsigned char TargetFlags = 0) { + unsigned TargetFlags = 0) { MachineOperand Op(MachineOperand::MO_MCSymbol); Op.Contents.Sym = Sym; Op.setOffset(0); @@ -902,6 +911,12 @@ public: return Op; } + static MachineOperand CreateShuffleMask(const Constant *C) { + MachineOperand Op(MachineOperand::MO_ShuffleMask); + Op.Contents.ShuffleMask = C; + return Op; + } + friend class MachineInstr; friend class MachineRegisterInfo; diff --git a/include/llvm/CodeGen/MachinePipeliner.h b/include/llvm/CodeGen/MachinePipeliner.h index 03ca53072685..e9cf7e115bff 100644 --- a/include/llvm/CodeGen/MachinePipeliner.h +++ b/include/llvm/CodeGen/MachinePipeliner.h @@ -40,6 +40,8 @@ #ifndef LLVM_LIB_CODEGEN_MACHINEPIPELINER_H #define LLVM_LIB_CODEGEN_MACHINEPIPELINER_H +#include "llvm/Analysis/AliasAnalysis.h" + #include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/RegisterClassInfo.h" #include "llvm/CodeGen/ScheduleDAGInstrs.h" @@ -148,7 +150,7 @@ class SwingSchedulerDAG : public ScheduleDAGInstrs { /// We may create a new instruction, so remember it because it /// must be deleted when the pass is finished. - SmallPtrSet NewMIs; + DenseMap NewMIs; /// Ordered list of DAG postprocessing steps. std::vector> Mutations; @@ -200,7 +202,7 @@ public: RegClassInfo(rci), II_setByPragma(II), Topo(SUnits, &ExitSU) { P.MF->getSubtarget().getSMSMutations(Mutations); if (SwpEnableCopyToPhi) - Mutations.push_back(llvm::make_unique()); + Mutations.push_back(std::make_unique()); } void schedule() override; @@ -297,53 +299,8 @@ private: void computeNodeOrder(NodeSetType &NodeSets); void checkValidNodeOrder(const NodeSetType &Circuits) const; bool schedulePipeline(SMSchedule &Schedule); - void generatePipelinedLoop(SMSchedule &Schedule); - void generateProlog(SMSchedule &Schedule, unsigned LastStage, - MachineBasicBlock *KernelBB, ValueMapTy *VRMap, - MBBVectorTy &PrologBBs); - void generateEpilog(SMSchedule &Schedule, unsigned LastStage, - MachineBasicBlock *KernelBB, ValueMapTy *VRMap, - MBBVectorTy &EpilogBBs, MBBVectorTy &PrologBBs); - void generateExistingPhis(MachineBasicBlock *NewBB, MachineBasicBlock *BB1, - MachineBasicBlock *BB2, MachineBasicBlock *KernelBB, - SMSchedule &Schedule, ValueMapTy *VRMap, - InstrMapTy &InstrMap, unsigned LastStageNum, - unsigned CurStageNum, bool IsLast); - void generatePhis(MachineBasicBlock *NewBB, MachineBasicBlock *BB1, - MachineBasicBlock *BB2, MachineBasicBlock *KernelBB, - SMSchedule &Schedule, ValueMapTy *VRMap, - InstrMapTy &InstrMap, unsigned LastStageNum, - unsigned CurStageNum, bool IsLast); - void removeDeadInstructions(MachineBasicBlock *KernelBB, - MBBVectorTy &EpilogBBs); - void splitLifetimes(MachineBasicBlock *KernelBB, MBBVectorTy &EpilogBBs, - SMSchedule &Schedule); - void addBranches(MachineBasicBlock &PreheaderBB, MBBVectorTy &PrologBBs, - MachineBasicBlock *KernelBB, MBBVectorTy &EpilogBBs, - SMSchedule &Schedule, ValueMapTy *VRMap); bool computeDelta(MachineInstr &MI, unsigned &Delta); - void updateMemOperands(MachineInstr &NewMI, MachineInstr &OldMI, - unsigned Num); - MachineInstr *cloneInstr(MachineInstr *OldMI, unsigned CurStageNum, - unsigned InstStageNum); - MachineInstr 
*cloneAndChangeInstr(MachineInstr *OldMI, unsigned CurStageNum, - unsigned InstStageNum, - SMSchedule &Schedule); - void updateInstruction(MachineInstr *NewMI, bool LastDef, - unsigned CurStageNum, unsigned InstrStageNum, - SMSchedule &Schedule, ValueMapTy *VRMap); MachineInstr *findDefInLoop(unsigned Reg); - unsigned getPrevMapVal(unsigned StageNum, unsigned PhiStage, unsigned LoopVal, - unsigned LoopStage, ValueMapTy *VRMap, - MachineBasicBlock *BB); - void rewritePhiValues(MachineBasicBlock *NewBB, unsigned StageNum, - SMSchedule &Schedule, ValueMapTy *VRMap, - InstrMapTy &InstrMap); - void rewriteScheduledInstr(MachineBasicBlock *BB, SMSchedule &Schedule, - InstrMapTy &InstrMap, unsigned CurStageNum, - unsigned PhiNum, MachineInstr *Phi, - unsigned OldReg, unsigned NewReg, - unsigned PrevReg = 0); bool canUseLastOffsetValue(MachineInstr *MI, unsigned &BasePos, unsigned &OffsetPos, unsigned &NewBase, int64_t &NewOffset); @@ -529,12 +486,6 @@ private: /// Map from instruction to execution cycle. std::map InstrToCycle; - /// Map for each register and the max difference between its uses and def. - /// The first element in the pair is the max difference in stages. The - /// second is true if the register defines a Phi value and loop value is - /// scheduled before the Phi. - std::map> RegToStageDiff; - /// Keep track of the first cycle value in the schedule. It starts /// as zero, but the algorithm allows negative values. int FirstCycle = 0; @@ -560,7 +511,6 @@ public: void reset() { ScheduledInstrs.clear(); InstrToCycle.clear(); - RegToStageDiff.clear(); FirstCycle = 0; LastCycle = 0; InitiationInterval = 0; @@ -620,28 +570,6 @@ public: return (LastCycle - FirstCycle) / InitiationInterval; } - /// Return the max. number of stages/iterations that can occur between a - /// register definition and its uses. - unsigned getStagesForReg(int Reg, unsigned CurStage) { - std::pair Stages = RegToStageDiff[Reg]; - if (CurStage > getMaxStageCount() && Stages.first == 0 && Stages.second) - return 1; - return Stages.first; - } - - /// The number of stages for a Phi is a little different than other - /// instructions. The minimum value computed in RegToStageDiff is 1 - /// because we assume the Phi is needed for at least 1 iteration. - /// This is not the case if the loop value is scheduled prior to the - /// Phi in the same stage. This function returns the number of stages - /// or iterations needed between the Phi definition and any uses. - unsigned getStagesForPhi(int Reg) { - std::pair Stages = RegToStageDiff[Reg]; - if (Stages.second) - return Stages.first; - return Stages.first - 1; - } - /// Return the instructions that are scheduled at the specified cycle. std::deque &getInstructions(int cycle) { return ScheduledInstrs[cycle]; diff --git a/include/llvm/CodeGen/MachinePostDominators.h b/include/llvm/CodeGen/MachinePostDominators.h index b67e6b52ac8f..cb258b5e7b21 100644 --- a/include/llvm/CodeGen/MachinePostDominators.h +++ b/include/llvm/CodeGen/MachinePostDominators.h @@ -16,68 +16,76 @@ #include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFunctionPass.h" +#include namespace llvm { /// -/// PostDominatorTree Class - Concrete subclass of DominatorTree that is used -/// to compute the post-dominator tree. +/// MachinePostDominatorTree - an analysis pass wrapper for DominatorTree +/// used to compute the post-dominator tree for MachineFunctions. 
/// -struct MachinePostDominatorTree : public MachineFunctionPass { -private: - PostDomTreeBase *DT; +class MachinePostDominatorTree : public MachineFunctionPass { + using PostDomTreeT = PostDomTreeBase; + std::unique_ptr PDT; public: static char ID; MachinePostDominatorTree(); - ~MachinePostDominatorTree() override; - FunctionPass *createMachinePostDominatorTreePass(); const SmallVectorImpl &getRoots() const { - return DT->getRoots(); + return PDT->getRoots(); } - MachineDomTreeNode *getRootNode() const { - return DT->getRootNode(); - } + MachineDomTreeNode *getRootNode() const { return PDT->getRootNode(); } MachineDomTreeNode *operator[](MachineBasicBlock *BB) const { - return DT->getNode(BB); + return PDT->getNode(BB); } MachineDomTreeNode *getNode(MachineBasicBlock *BB) const { - return DT->getNode(BB); + return PDT->getNode(BB); } bool dominates(const MachineDomTreeNode *A, const MachineDomTreeNode *B) const { - return DT->dominates(A, B); + return PDT->dominates(A, B); } bool dominates(const MachineBasicBlock *A, const MachineBasicBlock *B) const { - return DT->dominates(A, B); + return PDT->dominates(A, B); } bool properlyDominates(const MachineDomTreeNode *A, const MachineDomTreeNode *B) const { - return DT->properlyDominates(A, B); + return PDT->properlyDominates(A, B); } bool properlyDominates(const MachineBasicBlock *A, const MachineBasicBlock *B) const { - return DT->properlyDominates(A, B); + return PDT->properlyDominates(A, B); + } + + bool isVirtualRoot(const MachineDomTreeNode *Node) const { + return PDT->isVirtualRoot(Node); } MachineBasicBlock *findNearestCommonDominator(MachineBasicBlock *A, - MachineBasicBlock *B) { - return DT->findNearestCommonDominator(A, B); + MachineBasicBlock *B) const { + return PDT->findNearestCommonDominator(A, B); } + /// Returns the nearest common dominator of the given blocks. + /// If that tree node is a virtual root, a nullptr will be returned. + MachineBasicBlock * + findNearestCommonDominator(ArrayRef Blocks) const; + bool runOnMachineFunction(MachineFunction &MF) override; void getAnalysisUsage(AnalysisUsage &AU) const override; + void releaseMemory() override { PDT.reset(nullptr); } + void verifyAnalysis() const override; void print(llvm::raw_ostream &OS, const Module *M = nullptr) const override; }; } //end of namespace llvm diff --git a/include/llvm/CodeGen/MachineRegionInfo.h b/include/llvm/CodeGen/MachineRegionInfo.h index 6d9fb9b9100a..eeb69fef2c6b 100644 --- a/include/llvm/CodeGen/MachineRegionInfo.h +++ b/include/llvm/CodeGen/MachineRegionInfo.h @@ -22,7 +22,7 @@ namespace llvm { -struct MachinePostDominatorTree; +class MachinePostDominatorTree; class MachineRegion; class MachineRegionNode; class MachineRegionInfo; diff --git a/include/llvm/CodeGen/MachineRegisterInfo.h b/include/llvm/CodeGen/MachineRegisterInfo.h index b5deed1f5010..488a5a55a169 100644 --- a/include/llvm/CodeGen/MachineRegisterInfo.h +++ b/include/llvm/CodeGen/MachineRegisterInfo.h @@ -107,16 +107,16 @@ private: /// getRegUseDefListHead - Return the head pointer for the register use/def /// list for the specified virtual or physical register. 
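A rough sketch of how the reworked MachinePostDominatorTree wrapper above is usually consumed. Only the MachinePostDominatorTree calls are taken from the declarations in this patch; the pass name PostDomQuerySketch and the surrounding boilerplate are illustrative assumptions, and pass registration is omitted.

  #include "llvm/ADT/SmallVector.h"
  #include "llvm/CodeGen/MachineFunctionPass.h"
  #include "llvm/CodeGen/MachinePostDominators.h"

  using namespace llvm;

  namespace {
  // Hypothetical client pass, shown only to exercise the wrapper's API.
  struct PostDomQuerySketch : public MachineFunctionPass {
    static char ID;
    PostDomQuerySketch() : MachineFunctionPass(ID) {}

    void getAnalysisUsage(AnalysisUsage &AU) const override {
      AU.addRequired<MachinePostDominatorTree>();
      AU.setPreservesAll();
      MachineFunctionPass::getAnalysisUsage(AU);
    }

    bool runOnMachineFunction(MachineFunction &MF) override {
      auto &MPDT = getAnalysis<MachinePostDominatorTree>();
      MachineBasicBlock &Entry = MF.front();
      SmallVector<MachineBasicBlock *, 8> Blocks;
      unsigned NumPostDominatingEntry = 0;
      for (MachineBasicBlock &MBB : MF) {
        // dominates(A, B) on a post-dominator tree asks "does A post-dominate
        // B", i.e. every path from B to the function exit passes through A.
        if (MPDT.dominates(&MBB, &Entry))
          ++NumPostDominatingEntry;
        Blocks.push_back(&MBB);
      }
      // Overload added by this patch: nearest common post-dominator of a set
      // of blocks; nullptr means the answer is the virtual root.
      MachineBasicBlock *NCD = MPDT.findNearestCommonDominator(Blocks);
      (void)NumPostDominatingEntry;
      (void)NCD;
      return false;
    }
  };
  } // end anonymous namespace

  char PostDomQuerySketch::ID = 0;

Since the analysis now owns its tree through a std::unique_ptr and drops it in releaseMemory(), clients should query through the wrapper each time rather than caching a raw PostDomTreeT pointer across runs.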
- MachineOperand *&getRegUseDefListHead(unsigned RegNo) { - if (TargetRegisterInfo::isVirtualRegister(RegNo)) - return VRegInfo[RegNo].second; - return PhysRegUseDefLists[RegNo]; + MachineOperand *&getRegUseDefListHead(Register RegNo) { + if (RegNo.isVirtual()) + return VRegInfo[RegNo.id()].second; + return PhysRegUseDefLists[RegNo.id()]; } - MachineOperand *getRegUseDefListHead(unsigned RegNo) const { - if (TargetRegisterInfo::isVirtualRegister(RegNo)) - return VRegInfo[RegNo].second; - return PhysRegUseDefLists[RegNo]; + MachineOperand *getRegUseDefListHead(Register RegNo) const { + if (RegNo.isVirtual()) + return VRegInfo[RegNo.id()].second; + return PhysRegUseDefLists[RegNo.id()]; } /// Get the next element in the use-def chain. @@ -214,8 +214,8 @@ public: bool shouldTrackSubRegLiveness(const TargetRegisterClass &RC) const { return subRegLivenessEnabled() && RC.HasDisjunctSubRegs; } - bool shouldTrackSubRegLiveness(unsigned VReg) const { - assert(TargetRegisterInfo::isVirtualRegister(VReg) && "Must pass a VReg"); + bool shouldTrackSubRegLiveness(Register VReg) const { + assert(VReg.isVirtual() && "Must pass a VReg"); return shouldTrackSubRegLiveness(*getRegClass(VReg)); } bool subRegLivenessEnabled() const { @@ -326,7 +326,7 @@ public: /// of the specified register, skipping those marked as Debug. using reg_nodbg_iterator = defusechain_iterator; - reg_nodbg_iterator reg_nodbg_begin(unsigned RegNo) const { + reg_nodbg_iterator reg_nodbg_begin(Register RegNo) const { return reg_nodbg_iterator(getRegUseDefListHead(RegNo)); } static reg_nodbg_iterator reg_nodbg_end() { @@ -374,7 +374,7 @@ public: /// reg_nodbg_empty - Return true if the only instructions using or defining /// Reg are Debug instructions. - bool reg_nodbg_empty(unsigned RegNo) const { + bool reg_nodbg_empty(Register RegNo) const { return reg_nodbg_begin(RegNo) == reg_nodbg_end(); } @@ -628,10 +628,10 @@ public: /// Return the register class of the specified virtual register. /// This shouldn't be used directly unless \p Reg has a register class. /// \see getRegClassOrNull when this might happen. - const TargetRegisterClass *getRegClass(unsigned Reg) const { - assert(VRegInfo[Reg].first.is() && + const TargetRegisterClass *getRegClass(Register Reg) const { + assert(VRegInfo[Reg.id()].first.is() && "Register class not set, wrong accessor"); - return VRegInfo[Reg].first.get(); + return VRegInfo[Reg.id()].first.get(); } /// Return the register class of \p Reg, or null if Reg has not been assigned @@ -727,7 +727,7 @@ public: /// Get the low-level type of \p Reg or LLT{} if Reg is not a generic /// (target independent) virtual register. LLT getType(unsigned Reg) const { - if (TargetRegisterInfo::isVirtualRegister(Reg) && VRegToType.inBounds(Reg)) + if (Register::isVirtualRegister(Reg) && VRegToType.inBounds(Reg)) return VRegToType[Reg]; return LLT{}; } @@ -760,7 +760,7 @@ public: /// specified virtual register. This is typically used by target, and in case /// of an earlier hint it will be overwritten. void setRegAllocationHint(unsigned VReg, unsigned Type, unsigned PrefReg) { - assert(TargetRegisterInfo::isVirtualRegister(VReg)); + assert(Register::isVirtualRegister(VReg)); RegAllocHints[VReg].first = Type; RegAllocHints[VReg].second.clear(); RegAllocHints[VReg].second.push_back(PrefReg); @@ -769,7 +769,7 @@ public: /// addRegAllocationHint - Add a register allocation hint to the hints /// vector for VReg. 
void addRegAllocationHint(unsigned VReg, unsigned PrefReg) { - assert(TargetRegisterInfo::isVirtualRegister(VReg)); + assert(Register::isVirtualRegister(VReg)); RegAllocHints[VReg].second.push_back(PrefReg); } @@ -789,17 +789,18 @@ public: /// specified virtual register. If there are many hints, this returns the /// one with the greatest weight. std::pair - getRegAllocationHint(unsigned VReg) const { - assert(TargetRegisterInfo::isVirtualRegister(VReg)); - unsigned BestHint = (RegAllocHints[VReg].second.size() ? - RegAllocHints[VReg].second[0] : 0); - return std::pair(RegAllocHints[VReg].first, BestHint); + getRegAllocationHint(Register VReg) const { + assert(VReg.isVirtual()); + unsigned BestHint = (RegAllocHints[VReg.id()].second.size() ? + RegAllocHints[VReg.id()].second[0] : 0); + return std::pair(RegAllocHints[VReg.id()].first, + BestHint); } /// getSimpleHint - same as getRegAllocationHint except it will only return /// a target independent hint. - unsigned getSimpleHint(unsigned VReg) const { - assert(TargetRegisterInfo::isVirtualRegister(VReg)); + Register getSimpleHint(Register VReg) const { + assert(VReg.isVirtual()); std::pair Hint = getRegAllocationHint(VReg); return Hint.first ? 0 : Hint.second; } @@ -808,7 +809,7 @@ public: /// register allocation hints for VReg. const std::pair> &getRegAllocationHints(unsigned VReg) const { - assert(TargetRegisterInfo::isVirtualRegister(VReg)); + assert(Register::isVirtualRegister(VReg)); return RegAllocHints[VReg]; } @@ -817,6 +818,17 @@ public: /// deleted during LiveDebugVariables analysis. void markUsesInDebugValueAsUndef(unsigned Reg) const; + /// updateDbgUsersToReg - Update a collection of DBG_VALUE instructions + /// to refer to the designated register. + void updateDbgUsersToReg(unsigned Reg, + ArrayRef Users) const { + for (MachineInstr *MI : Users) { + assert(MI->isDebugInstr()); + assert(MI->getOperand(0).isReg()); + MI->getOperand(0).setReg(Reg); + } + } + /// Return true if the specified register is modified in this function. /// This checks that no defining machine operands exist for the register or /// any of its aliases. Definitions found on functions marked noreturn are @@ -882,8 +894,8 @@ public: /// /// Reserved registers may belong to an allocatable register class, but the /// target has explicitly requested that they are not used. - bool isReserved(unsigned PhysReg) const { - return getReservedRegs().test(PhysReg); + bool isReserved(Register PhysReg) const { + return getReservedRegs().test(PhysReg.id()); } /// Returns true when the given register unit is considered reserved. 
@@ -1164,7 +1176,7 @@ public: PSetIterator(unsigned RegUnit, const MachineRegisterInfo *MRI) { const TargetRegisterInfo *TRI = MRI->getTargetRegisterInfo(); - if (TargetRegisterInfo::isVirtualRegister(RegUnit)) { + if (Register::isVirtualRegister(RegUnit)) { const TargetRegisterClass *RC = MRI->getRegClass(RegUnit); PSet = TRI->getRegClassPressureSets(RC); Weight = TRI->getRegClassWeight(RC).RegWeight; diff --git a/include/llvm/CodeGen/MachineScheduler.h b/include/llvm/CodeGen/MachineScheduler.h index 75a334f61ad0..333367943ac0 100644 --- a/include/llvm/CodeGen/MachineScheduler.h +++ b/include/llvm/CodeGen/MachineScheduler.h @@ -100,6 +100,7 @@ namespace llvm { extern cl::opt ForceTopDown; extern cl::opt ForceBottomUp; +extern cl::opt VerifyScheduling; class LiveIntervals; class MachineDominatorTree; diff --git a/include/llvm/CodeGen/ModuloSchedule.h b/include/llvm/CodeGen/ModuloSchedule.h new file mode 100644 index 000000000000..81a9b63b64ca --- /dev/null +++ b/include/llvm/CodeGen/ModuloSchedule.h @@ -0,0 +1,367 @@ +//===- ModuloSchedule.h - Software pipeline schedule expansion ------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Software pipelining (SWP) is an instruction scheduling technique for loops +// that overlaps loop iterations and exploits ILP via compiler transformations. +// +// There are multiple methods for analyzing a loop and creating a schedule. +// An example algorithm is Swing Modulo Scheduling (implemented by the +// MachinePipeliner). The details of how a schedule is arrived at are irrelevant +// for the task of actually rewriting a loop to adhere to the schedule, which +// is what this file does. +// +// A schedule is, for every instruction in a block, a Cycle and a Stage. Note +// that we only support single-block loops, so "block" and "loop" can be used +// interchangably. +// +// The Cycle of an instruction defines a partial order of the instructions in +// the remapped loop. Instructions within a cycle must not consume the output +// of any instruction in the same cycle. Cycle information is assumed to have +// been calculated such that the processor will execute instructions in +// lock-step (for example in a VLIW ISA). +// +// The Stage of an instruction defines the mapping between logical loop +// iterations and pipelined loop iterations. An example (unrolled) pipeline +// may look something like: +// +// I0[0] Execute instruction I0 of iteration 0 +// I1[0], I0[1] Execute I0 of iteration 1 and I1 of iteration 1 +// I1[1], I0[2] +// I1[2], I0[3] +// +// In the schedule for this unrolled sequence we would say that I0 was scheduled +// in stage 0 and I1 in stage 1: +// +// loop: +// [stage 0] x = I0 +// [stage 1] I1 x (from stage 0) +// +// And to actually generate valid code we must insert a phi: +// +// loop: +// x' = phi(x) +// x = I0 +// I1 x' +// +// This is a simple example; the rules for how to generate correct code given +// an arbitrary schedule containing loop-carried values are complex. +// +// Note that these examples only mention the steady-state kernel of the +// generated loop; prologs and epilogs must be generated also that prime and +// flush the pipeline. Doing so is nontrivial. 
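The comment above describes the kernel, phi, prolog and epilog only schematically, so here is the same one-stage transformation written out at the C++ source level. This is purely an illustration of the shape of the rewrite: the expanders below operate on MachineInstrs, and the names f, g and pipelinedByOneStage are invented for the example.

  // f and g stand in for the two loop-body instructions I0 and I1 from the
  // comment above; their bodies are arbitrary placeholders.
  static int f(int i) { return 2 * i; } // "I0": produces a value
  static void g(int) {}                 // "I1": consumes the previous value

  // Original (unpipelined) loop:
  //   for (int i = 0; i < n; ++i) { int x = f(i); g(x); }
  //
  // The same loop pipelined by one stage:
  static void pipelinedByOneStage(int n) {
    if (n <= 0)
      return;
    int x = f(0);       // prolog: prime the pipeline with I0[0]
    for (int i = 1; i < n; ++i) {
      int x_prev = x;   // plays the role of the kernel phi (x' above)
      x = f(i);         // kernel, stage 0: I0 of iteration i
      g(x_prev);        // kernel, stage 1: I1 of iteration i - 1
    }
    g(x);               // epilog: flush the last in-flight value
  }

With two stages in flight, the value produced by stage 0 in iteration i is consumed by stage 1 only during iteration i + 1; that is exactly the loop-carried dependence the phi in the schematic models, and what the prolog and epilog prime and drain.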
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_CODEGEN_MODULOSCHEDULE_H +#define LLVM_LIB_CODEGEN_MODULOSCHEDULE_H + +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/CodeGen/MachineLoopUtils.h" +#include "llvm/CodeGen/TargetInstrInfo.h" +#include "llvm/CodeGen/TargetSubtargetInfo.h" +#include +#include + +namespace llvm { +class MachineBasicBlock; +class MachineInstr; +class LiveIntervals; + +/// Represents a schedule for a single-block loop. For every instruction we +/// maintain a Cycle and Stage. +class ModuloSchedule { +private: + /// The block containing the loop instructions. + MachineLoop *Loop; + + /// The instructions to be generated, in total order. Cycle provides a partial + /// order; the total order within cycles has been decided by the schedule + /// producer. + std::vector ScheduledInstrs; + + /// The cycle for each instruction. + DenseMap Cycle; + + /// The stage for each instruction. + DenseMap Stage; + + /// The number of stages in this schedule (Max(Stage) + 1). + int NumStages; + +public: + /// Create a new ModuloSchedule. + /// \arg ScheduledInstrs The new loop instructions, in total resequenced + /// order. + /// \arg Cycle Cycle index for all instructions in ScheduledInstrs. Cycle does + /// not need to start at zero. ScheduledInstrs must be partially ordered by + /// Cycle. + /// \arg Stage Stage index for all instructions in ScheduleInstrs. + ModuloSchedule(MachineFunction &MF, MachineLoop *Loop, + std::vector ScheduledInstrs, + DenseMap Cycle, + DenseMap Stage) + : Loop(Loop), ScheduledInstrs(ScheduledInstrs), Cycle(std::move(Cycle)), + Stage(std::move(Stage)) { + NumStages = 0; + for (auto &KV : this->Stage) + NumStages = std::max(NumStages, KV.second); + ++NumStages; + } + + /// Return the single-block loop being scheduled. + MachineLoop *getLoop() const { return Loop; } + + /// Return the number of stages contained in this schedule, which is the + /// largest stage index + 1. + int getNumStages() const { return NumStages; } + + /// Return the first cycle in the schedule, which is the cycle index of the + /// first instruction. + int getFirstCycle() { return Cycle[ScheduledInstrs.front()]; } + + /// Return the final cycle in the schedule, which is the cycle index of the + /// last instruction. + int getFinalCycle() { return Cycle[ScheduledInstrs.back()]; } + + /// Return the stage that MI is scheduled in, or -1. + int getStage(MachineInstr *MI) { + auto I = Stage.find(MI); + return I == Stage.end() ? -1 : I->second; + } + + /// Return the cycle that MI is scheduled at, or -1. + int getCycle(MachineInstr *MI) { + auto I = Cycle.find(MI); + return I == Cycle.end() ? -1 : I->second; + } + + /// Return the rescheduled instructions in order. + ArrayRef getInstructions() { return ScheduledInstrs; } + + void dump() { print(dbgs()); } + void print(raw_ostream &OS); +}; + +/// The ModuloScheduleExpander takes a ModuloSchedule and expands it in-place, +/// rewriting the old loop and inserting prologs and epilogs as required. 
+class ModuloScheduleExpander { +public: + using InstrChangesTy = DenseMap>; + +private: + using ValueMapTy = DenseMap; + using MBBVectorTy = SmallVectorImpl; + using InstrMapTy = DenseMap; + + ModuloSchedule &Schedule; + MachineFunction &MF; + const TargetSubtargetInfo &ST; + MachineRegisterInfo &MRI; + const TargetInstrInfo *TII; + LiveIntervals &LIS; + + MachineBasicBlock *BB; + MachineBasicBlock *Preheader; + MachineBasicBlock *NewKernel = nullptr; + std::unique_ptr LoopInfo; + + /// Map for each register and the max difference between its uses and def. + /// The first element in the pair is the max difference in stages. The + /// second is true if the register defines a Phi value and loop value is + /// scheduled before the Phi. + std::map> RegToStageDiff; + + /// Instructions to change when emitting the final schedule. + InstrChangesTy InstrChanges; + + void generatePipelinedLoop(); + void generateProlog(unsigned LastStage, MachineBasicBlock *KernelBB, + ValueMapTy *VRMap, MBBVectorTy &PrologBBs); + void generateEpilog(unsigned LastStage, MachineBasicBlock *KernelBB, + ValueMapTy *VRMap, MBBVectorTy &EpilogBBs, + MBBVectorTy &PrologBBs); + void generateExistingPhis(MachineBasicBlock *NewBB, MachineBasicBlock *BB1, + MachineBasicBlock *BB2, MachineBasicBlock *KernelBB, + ValueMapTy *VRMap, InstrMapTy &InstrMap, + unsigned LastStageNum, unsigned CurStageNum, + bool IsLast); + void generatePhis(MachineBasicBlock *NewBB, MachineBasicBlock *BB1, + MachineBasicBlock *BB2, MachineBasicBlock *KernelBB, + ValueMapTy *VRMap, InstrMapTy &InstrMap, + unsigned LastStageNum, unsigned CurStageNum, bool IsLast); + void removeDeadInstructions(MachineBasicBlock *KernelBB, + MBBVectorTy &EpilogBBs); + void splitLifetimes(MachineBasicBlock *KernelBB, MBBVectorTy &EpilogBBs); + void addBranches(MachineBasicBlock &PreheaderBB, MBBVectorTy &PrologBBs, + MachineBasicBlock *KernelBB, MBBVectorTy &EpilogBBs, + ValueMapTy *VRMap); + bool computeDelta(MachineInstr &MI, unsigned &Delta); + void updateMemOperands(MachineInstr &NewMI, MachineInstr &OldMI, + unsigned Num); + MachineInstr *cloneInstr(MachineInstr *OldMI, unsigned CurStageNum, + unsigned InstStageNum); + MachineInstr *cloneAndChangeInstr(MachineInstr *OldMI, unsigned CurStageNum, + unsigned InstStageNum); + void updateInstruction(MachineInstr *NewMI, bool LastDef, + unsigned CurStageNum, unsigned InstrStageNum, + ValueMapTy *VRMap); + MachineInstr *findDefInLoop(unsigned Reg); + unsigned getPrevMapVal(unsigned StageNum, unsigned PhiStage, unsigned LoopVal, + unsigned LoopStage, ValueMapTy *VRMap, + MachineBasicBlock *BB); + void rewritePhiValues(MachineBasicBlock *NewBB, unsigned StageNum, + ValueMapTy *VRMap, InstrMapTy &InstrMap); + void rewriteScheduledInstr(MachineBasicBlock *BB, InstrMapTy &InstrMap, + unsigned CurStageNum, unsigned PhiNum, + MachineInstr *Phi, unsigned OldReg, + unsigned NewReg, unsigned PrevReg = 0); + bool isLoopCarried(MachineInstr &Phi); + + /// Return the max. number of stages/iterations that can occur between a + /// register definition and its uses. + unsigned getStagesForReg(int Reg, unsigned CurStage) { + std::pair Stages = RegToStageDiff[Reg]; + if ((int)CurStage > Schedule.getNumStages() - 1 && Stages.first == 0 && + Stages.second) + return 1; + return Stages.first; + } + + /// The number of stages for a Phi is a little different than other + /// instructions. The minimum value computed in RegToStageDiff is 1 + /// because we assume the Phi is needed for at least 1 iteration. 
+ /// This is not the case if the loop value is scheduled prior to the + /// Phi in the same stage. This function returns the number of stages + /// or iterations needed between the Phi definition and any uses. + unsigned getStagesForPhi(int Reg) { + std::pair Stages = RegToStageDiff[Reg]; + if (Stages.second) + return Stages.first; + return Stages.first - 1; + } + +public: + /// Create a new ModuloScheduleExpander. + /// \arg InstrChanges Modifications to make to instructions with memory + /// operands. + /// FIXME: InstrChanges is opaque and is an implementation detail of an + /// optimization in MachinePipeliner that crosses abstraction boundaries. + ModuloScheduleExpander(MachineFunction &MF, ModuloSchedule &S, + LiveIntervals &LIS, InstrChangesTy InstrChanges) + : Schedule(S), MF(MF), ST(MF.getSubtarget()), MRI(MF.getRegInfo()), + TII(ST.getInstrInfo()), LIS(LIS), + InstrChanges(std::move(InstrChanges)) {} + + /// Performs the actual expansion. + void expand(); + /// Performs final cleanup after expansion. + void cleanup(); + + /// Returns the newly rewritten kernel block, or nullptr if this was + /// optimized away. + MachineBasicBlock *getRewrittenKernel() { return NewKernel; } +}; + +/// A reimplementation of ModuloScheduleExpander. It works by generating a +/// standalone kernel loop and peeling out the prologs and epilogs. +class PeelingModuloScheduleExpander { + ModuloSchedule &Schedule; + MachineFunction &MF; + const TargetSubtargetInfo &ST; + MachineRegisterInfo &MRI; + const TargetInstrInfo *TII; + LiveIntervals *LIS; + + /// The original loop block that gets rewritten in-place. + MachineBasicBlock *BB; + /// The original loop preheader. + MachineBasicBlock *Preheader; + /// All prolog and epilog blocks. + SmallVector Prologs, Epilogs; + /// For every block, the stages that are produced. + DenseMap LiveStages; + /// For every block, the stages that are available. A stage can be available + /// but not produced (in the epilog) or produced but not available (in the + /// prolog). + DenseMap AvailableStages; + + /// CanonicalMIs and BlockMIs form a bidirectional map between any of the + /// loop kernel clones. + DenseMap CanonicalMIs; + DenseMap, MachineInstr *> + BlockMIs; + + /// State passed from peelKernel to peelPrologAndEpilogs(). + std::deque PeeledFront, PeeledBack; + +public: + PeelingModuloScheduleExpander(MachineFunction &MF, ModuloSchedule &S, + LiveIntervals *LIS) + : Schedule(S), MF(MF), ST(MF.getSubtarget()), MRI(MF.getRegInfo()), + TII(ST.getInstrInfo()), LIS(LIS) {} + + void expand(); + + /// Runs ModuloScheduleExpander and treats it as a golden input to validate + /// aspects of the code generated by PeelingModuloScheduleExpander. + void validateAgainstModuloScheduleExpander(); + +protected: + /// Converts BB from the original loop body to the rewritten, pipelined + /// steady-state. + void rewriteKernel(); + +private: + /// Peels one iteration of the rewritten kernel (BB) in the specified + /// direction. + MachineBasicBlock *peelKernel(LoopPeelDirection LPD); + /// Peel the kernel forwards and backwards to produce prologs and epilogs, + /// and stitch them together. + void peelPrologAndEpilogs(); + /// All prolog and epilog blocks are clones of the kernel, so any produced + /// register in one block has an corollary in all other blocks. + Register getEquivalentRegisterIn(Register Reg, MachineBasicBlock *BB); + /// Change all users of MI, if MI is predicated out + /// (LiveStages[MI->getParent()] == false). 
+ void rewriteUsesOf(MachineInstr *MI); + /// Insert branches between prologs, kernel and epilogs. + void fixupBranches(); + /// Create a poor-man's LCSSA by cloning only the PHIs from the kernel block + /// to a block dominated by all prologs and epilogs. This allows us to treat + /// the loop exiting block as any other kernel clone. + MachineBasicBlock *CreateLCSSAExitingBlock(); + /// Helper to get the stage of an instruction in the schedule. + unsigned getStage(MachineInstr *MI) { + if (CanonicalMIs.count(MI)) + MI = CanonicalMIs[MI]; + return Schedule.getStage(MI); + } +}; + +/// Expander that simply annotates each scheduled instruction with a post-instr +/// symbol that can be consumed by the ModuloScheduleTest pass. +/// +/// The post-instr symbol is a way of annotating an instruction that can be +/// roundtripped in MIR. The syntax is: +/// MYINST %0, post-instr-symbol +class ModuloScheduleTestAnnotater { + MachineFunction &MF; + ModuloSchedule &S; + +public: + ModuloScheduleTestAnnotater(MachineFunction &MF, ModuloSchedule &S) + : MF(MF), S(S) {} + + /// Performs the annotation. + void annotate(); +}; + +} // end namespace llvm + +#endif // LLVM_LIB_CODEGEN_MODULOSCHEDULE_H diff --git a/include/llvm/CodeGen/PBQP/Math.h b/include/llvm/CodeGen/PBQP/Math.h index 8b014ccbb07b..099ba788e9a2 100644 --- a/include/llvm/CodeGen/PBQP/Math.h +++ b/include/llvm/CodeGen/PBQP/Math.h @@ -28,17 +28,17 @@ class Vector { public: /// Construct a PBQP vector of the given size. explicit Vector(unsigned Length) - : Length(Length), Data(llvm::make_unique(Length)) {} + : Length(Length), Data(std::make_unique(Length)) {} /// Construct a PBQP vector with initializer. Vector(unsigned Length, PBQPNum InitVal) - : Length(Length), Data(llvm::make_unique(Length)) { + : Length(Length), Data(std::make_unique(Length)) { std::fill(Data.get(), Data.get() + Length, InitVal); } /// Copy construct a PBQP vector. Vector(const Vector &V) - : Length(V.Length), Data(llvm::make_unique(Length)) { + : Length(V.Length), Data(std::make_unique(Length)) { std::copy(V.Data.get(), V.Data.get() + Length, Data.get()); } @@ -125,21 +125,21 @@ private: public: /// Construct a PBQP Matrix with the given dimensions. Matrix(unsigned Rows, unsigned Cols) : - Rows(Rows), Cols(Cols), Data(llvm::make_unique(Rows * Cols)) { + Rows(Rows), Cols(Cols), Data(std::make_unique(Rows * Cols)) { } /// Construct a PBQP Matrix with the given dimensions and initial /// value. Matrix(unsigned Rows, unsigned Cols, PBQPNum InitVal) : Rows(Rows), Cols(Cols), - Data(llvm::make_unique(Rows * Cols)) { + Data(std::make_unique(Rows * Cols)) { std::fill(Data.get(), Data.get() + (Rows * Cols), InitVal); } /// Copy construct a PBQP matrix. Matrix(const Matrix &M) : Rows(M.Rows), Cols(M.Cols), - Data(llvm::make_unique(Rows * Cols)) { + Data(std::make_unique(Rows * Cols)) { std::copy(M.Data.get(), M.Data.get() + (Rows * Cols), Data.get()); } diff --git a/include/llvm/CodeGen/Passes.h b/include/llvm/CodeGen/Passes.h index d92ee93268e7..1e765ce51e4a 100644 --- a/include/llvm/CodeGen/Passes.h +++ b/include/llvm/CodeGen/Passes.h @@ -226,6 +226,10 @@ namespace llvm { /// inserting cmov instructions. extern char &EarlyIfConverterID; + /// EarlyIfPredicator - This pass performs if-conversion on SSA form by + /// predicating if/else block and insert select at the join point. + extern char &EarlyIfPredicatorID; + /// This pass performs instruction combining using trace metrics to estimate /// critical-path and resource depth. 
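An illustration of how the pieces of ModuloSchedule.h above fit together once a scheduler has assigned cycles and stages. Everything named here (expandSchedule, NoChanges, the parameter list) is invented for the sketch, and the DenseMap and std::vector element types are assumptions rather than quotes from the header; only the ModuloSchedule and ModuloScheduleExpander APIs themselves come from this patch.

  #include "llvm/ADT/DenseMap.h"
  #include "llvm/CodeGen/LiveIntervals.h"
  #include "llvm/CodeGen/MachineLoopInfo.h"
  #include "llvm/CodeGen/ModuloSchedule.h"
  #include <utility>
  #include <vector>

  using namespace llvm;

  static void expandSchedule(MachineFunction &MF, MachineLoop *Loop,
                             std::vector<MachineInstr *> InstrsInCycleOrder,
                             DenseMap<MachineInstr *, int> Cycle,
                             DenseMap<MachineInstr *, int> Stage,
                             LiveIntervals &LIS) {
    // Bundle the scheduler's decisions into a ModuloSchedule.
    ModuloSchedule MS(MF, Loop, std::move(InstrsInCycleOrder), std::move(Cycle),
                      std::move(Stage));

    // Classic in-place expansion: rewrites the loop body and emits the prolog
    // and epilog blocks. No instruction changes are requested here.
    ModuloScheduleExpander::InstrChangesTy NoChanges;
    ModuloScheduleExpander MSE(MF, MS, LIS, std::move(NoChanges));
    MSE.expand();
    MSE.cleanup();
  }

The peeling variant is driven the same way but takes LiveIntervals by pointer and can cross-check itself against the classic expander via validateAgainstModuloScheduleExpander(); ModuloScheduleTestAnnotater only tags each instruction with a post-instr symbol so a schedule can be round-tripped through MIR.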
extern char &MachineCombinerID; diff --git a/include/llvm/CodeGen/Register.h b/include/llvm/CodeGen/Register.h index 907c1a99e56f..aa5173684e24 100644 --- a/include/llvm/CodeGen/Register.h +++ b/include/llvm/CodeGen/Register.h @@ -9,6 +9,7 @@ #ifndef LLVM_CODEGEN_REGISTER_H #define LLVM_CODEGEN_REGISTER_H +#include "llvm/MC/MCRegister.h" #include namespace llvm { @@ -20,41 +21,136 @@ class Register { public: Register(unsigned Val = 0): Reg(Val) {} + Register(MCRegister Val): Reg(Val) {} + + // Register numbers can represent physical registers, virtual registers, and + // sometimes stack slots. The unsigned values are divided into these ranges: + // + // 0 Not a register, can be used as a sentinel. + // [1;2^30) Physical registers assigned by TableGen. + // [2^30;2^31) Stack slots. (Rarely used.) + // [2^31;2^32) Virtual registers assigned by MachineRegisterInfo. + // + // Further sentinels can be allocated from the small negative integers. + // DenseMapInfo uses -1u and -2u. + + /// isStackSlot - Sometimes it is useful the be able to store a non-negative + /// frame index in a variable that normally holds a register. isStackSlot() + /// returns true if Reg is in the range used for stack slots. + /// + /// Note that isVirtualRegister() and isPhysicalRegister() cannot handle stack + /// slots, so if a variable may contains a stack slot, always check + /// isStackSlot() first. + /// + static bool isStackSlot(unsigned Reg) { + return MCRegister::isStackSlot(Reg); + } + + /// Compute the frame index from a register value representing a stack slot. + static int stackSlot2Index(unsigned Reg) { + assert(isStackSlot(Reg) && "Not a stack slot"); + return int(Reg - (1u << 30)); + } + + /// Convert a non-negative frame index to a stack slot register value. + static unsigned index2StackSlot(int FI) { + assert(FI >= 0 && "Cannot hold a negative frame index."); + return FI + (1u << 30); + } + + /// Return true if the specified register number is in + /// the physical register namespace. + static bool isPhysicalRegister(unsigned Reg) { + return MCRegister::isPhysicalRegister(Reg); + } + + /// Return true if the specified register number is in + /// the virtual register namespace. + static bool isVirtualRegister(unsigned Reg) { + assert(!isStackSlot(Reg) && "Not a register! Check isStackSlot() first."); + return int(Reg) < 0; + } + + /// Convert a virtual register number to a 0-based index. + /// The first virtual register in a function will get the index 0. + static unsigned virtReg2Index(unsigned Reg) { + assert(isVirtualRegister(Reg) && "Not a virtual register"); + return Reg & ~(1u << 31); + } + + /// Convert a 0-based index to a virtual register number. + /// This is the inverse operation of VirtReg2IndexFunctor below. + static unsigned index2VirtReg(unsigned Index) { + return Index | (1u << 31); + } /// Return true if the specified register number is in the virtual register /// namespace. bool isVirtual() const { - return int(Reg) < 0; + return isVirtualRegister(Reg); } /// Return true if the specified register number is in the physical register /// namespace. bool isPhysical() const { - return int(Reg) > 0; + return isPhysicalRegister(Reg); } /// Convert a virtual register number to a 0-based index. The first virtual /// register in a function will get the index 0. unsigned virtRegIndex() const { - assert(isVirtual() && "Not a virtual register"); - return Reg & ~(1u << 31); - } - - /// Convert a 0-based index to a virtual register number. 
- /// This is the inverse operation of VirtReg2IndexFunctor below. - static Register index2VirtReg(unsigned Index) { - return Register(Index | (1u << 31)); + return virtReg2Index(Reg); } operator unsigned() const { return Reg; } + unsigned id() const { return Reg; } + + operator MCRegister() const { + return MCRegister(Reg); + } + bool isValid() const { return Reg != 0; } + + /// Comparisons between register objects + bool operator==(const Register &Other) const { return Reg == Other.Reg; } + bool operator!=(const Register &Other) const { return Reg != Other.Reg; } + bool operator==(const MCRegister &Other) const { return Reg == Other.id(); } + bool operator!=(const MCRegister &Other) const { return Reg != Other.id(); } + + /// Comparisons against register constants. E.g. + /// * R == AArch64::WZR + /// * R == 0 + /// * R == VirtRegMap::NO_PHYS_REG + bool operator==(unsigned Other) const { return Reg == Other; } + bool operator!=(unsigned Other) const { return Reg != Other; } + bool operator==(int Other) const { return Reg == unsigned(Other); } + bool operator!=(int Other) const { return Reg != unsigned(Other); } + // MSVC requires that we explicitly declare these two as well. + bool operator==(MCPhysReg Other) const { return Reg == unsigned(Other); } + bool operator!=(MCPhysReg Other) const { return Reg != unsigned(Other); } +}; + +// Provide DenseMapInfo for Register +template<> struct DenseMapInfo { + static inline unsigned getEmptyKey() { + return DenseMapInfo::getEmptyKey(); + } + static inline unsigned getTombstoneKey() { + return DenseMapInfo::getTombstoneKey(); + } + static unsigned getHashValue(const Register &Val) { + return DenseMapInfo::getHashValue(Val.id()); + } + static bool isEqual(const Register &LHS, const Register &RHS) { + return DenseMapInfo::isEqual(LHS.id(), RHS.id()); + } }; } -#endif +#endif // ifndef LLVM_CODEGEN_REGISTER_H diff --git a/include/llvm/CodeGen/RegisterClassInfo.h b/include/llvm/CodeGen/RegisterClassInfo.h index 14af5c4d090d..25b310c47621 100644 --- a/include/llvm/CodeGen/RegisterClassInfo.h +++ b/include/llvm/CodeGen/RegisterClassInfo.h @@ -110,7 +110,7 @@ public: /// getLastCalleeSavedAlias - Returns the last callee saved register that /// overlaps PhysReg, or 0 if Reg doesn't overlap a CalleeSavedAliases. unsigned getLastCalleeSavedAlias(unsigned PhysReg) const { - assert(TargetRegisterInfo::isPhysicalRegister(PhysReg)); + assert(Register::isPhysicalRegister(PhysReg)); if (PhysReg < CalleeSavedAliases.size()) return CalleeSavedAliases[PhysReg]; return 0; diff --git a/include/llvm/CodeGen/RegisterPressure.h b/include/llvm/CodeGen/RegisterPressure.h index 5bbaa03fd751..92333b859f1b 100644 --- a/include/llvm/CodeGen/RegisterPressure.h +++ b/include/llvm/CodeGen/RegisterPressure.h @@ -129,6 +129,8 @@ public: bool operator==(const PressureChange &RHS) const { return PSetID == RHS.PSetID && UnitInc == RHS.UnitInc; } + + void dump() const; }; /// List of PressureChanges in order of increasing, unique PSetID. @@ -248,6 +250,7 @@ struct RegPressureDelta { bool operator!=(const RegPressureDelta &RHS) const { return !operator==(RHS); } + void dump() const; }; /// A set of live virtual registers and physical register units. 
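The comment block added to Register.h above pins down how one unsigned value encodes three different namespaces. The following snippet re-derives that layout with plain arithmetic so it can be compiled and run on its own; the local isVirtual/isStackSlot/index2VirtReg helpers are stand-ins that mirror the documented ranges, not the LLVM implementations themselves.

  #include <cassert>
  #include <cstdint>
  #include <cstdio>

  // Ranges documented in Register.h:
  //   0                  not a register (sentinel)
  //   [1, 2^30)          physical registers assigned by TableGen
  //   [2^30, 2^31)       stack slots (frame indexes stored in a register slot)
  //   [2^31, 2^32)       virtual registers assigned by MachineRegisterInfo
  static const uint32_t StackSlotBase = 1u << 30;
  static const uint32_t VirtRegBase = 1u << 31;

  static bool isStackSlot(uint32_t Reg) {
    return Reg >= StackSlotBase && Reg < VirtRegBase;
  }
  static bool isVirtual(uint32_t Reg) { return (Reg & VirtRegBase) != 0; }
  static bool isPhysical(uint32_t Reg) { return Reg >= 1 && Reg < StackSlotBase; }

  static uint32_t index2VirtReg(uint32_t Index) { return Index | VirtRegBase; }
  static uint32_t virtReg2Index(uint32_t Reg) {
    assert(isVirtual(Reg) && "not a virtual register");
    return Reg & ~VirtRegBase;
  }
  static uint32_t index2StackSlot(int FI) {
    assert(FI >= 0 && "negative frame index");
    return uint32_t(FI) + StackSlotBase;
  }

  int main() {
    uint32_t V7 = index2VirtReg(7); // 0x80000007: the 8th vreg in a function
    assert(isVirtual(V7) && virtReg2Index(V7) == 7);
    assert(isPhysical(42) && !isVirtual(42)); // small values are physregs
    assert(isStackSlot(index2StackSlot(3)));  // frame index 3 in "register" form
    std::printf("vreg7 = 0x%08x, index %u\n", unsigned(V7),
                unsigned(virtReg2Index(V7)));
    return 0;
  }

This also makes the ordering caveat from the comment concrete: a stack-slot value has bit 30 set but bit 31 clear, so code must test isStackSlot() before isVirtualRegister() or isPhysicalRegister() whenever a variable may hold a frame index.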
@@ -273,15 +276,15 @@ private: unsigned NumRegUnits; unsigned getSparseIndexFromReg(unsigned Reg) const { - if (TargetRegisterInfo::isVirtualRegister(Reg)) - return TargetRegisterInfo::virtReg2Index(Reg) + NumRegUnits; + if (Register::isVirtualRegister(Reg)) + return Register::virtReg2Index(Reg) + NumRegUnits; assert(Reg < NumRegUnits); return Reg; } unsigned getRegFromSparseIndex(unsigned SparseIndex) const { if (SparseIndex >= NumRegUnits) - return TargetRegisterInfo::index2VirtReg(SparseIndex-NumRegUnits); + return Register::index2VirtReg(SparseIndex-NumRegUnits); return SparseIndex; } diff --git a/include/llvm/CodeGen/RegisterScavenging.h b/include/llvm/CodeGen/RegisterScavenging.h index 9c48df82f07d..5b5a80a67e7f 100644 --- a/include/llvm/CodeGen/RegisterScavenging.h +++ b/include/llvm/CodeGen/RegisterScavenging.h @@ -51,7 +51,7 @@ class RegScavenger { /// If non-zero, the specific register is currently being /// scavenged. That is, it is spilled to this scavenging stack slot. - unsigned Reg = 0; + Register Reg; /// The instruction that restores the scavenged register from stack. const MachineInstr *Restore = nullptr; @@ -119,14 +119,14 @@ public: MachineBasicBlock::iterator getCurrentPosition() const { return MBBI; } /// Return if a specific register is currently used. - bool isRegUsed(unsigned Reg, bool includeReserved = true) const; + bool isRegUsed(Register Reg, bool includeReserved = true) const; /// Return all available registers in the register class in Mask. BitVector getRegsAvailable(const TargetRegisterClass *RC); /// Find an unused register of the specified register class. /// Return 0 if none is found. - unsigned FindUnusedReg(const TargetRegisterClass *RC) const; + Register FindUnusedReg(const TargetRegisterClass *RC) const; /// Add a scavenging frame index. void addScavengingFrameIndex(int FI) { @@ -160,10 +160,10 @@ public: /// /// If \p AllowSpill is false, fail if a spill is required to make the /// register available, and return NoRegister. - unsigned scavengeRegister(const TargetRegisterClass *RC, + Register scavengeRegister(const TargetRegisterClass *RC, MachineBasicBlock::iterator I, int SPAdj, bool AllowSpill = true); - unsigned scavengeRegister(const TargetRegisterClass *RegClass, int SPAdj, + Register scavengeRegister(const TargetRegisterClass *RegClass, int SPAdj, bool AllowSpill = true) { return scavengeRegister(RegClass, MBBI, SPAdj, AllowSpill); } @@ -177,17 +177,17 @@ public: /// /// If \p AllowSpill is false, fail if a spill is required to make the /// register available, and return NoRegister. - unsigned scavengeRegisterBackwards(const TargetRegisterClass &RC, + Register scavengeRegisterBackwards(const TargetRegisterClass &RC, MachineBasicBlock::iterator To, bool RestoreAfter, int SPAdj, bool AllowSpill = true); /// Tell the scavenger a register is used. - void setRegUsed(unsigned Reg, LaneBitmask LaneMask = LaneBitmask::getAll()); + void setRegUsed(Register Reg, LaneBitmask LaneMask = LaneBitmask::getAll()); private: /// Returns true if a register is reserved. It is never "unused". - bool isReserved(unsigned Reg) const { return MRI->isReserved(Reg); } + bool isReserved(Register Reg) const { return MRI->isReserved(Reg); } /// setUsed / setUnused - Mark the state of one or a number of register units. /// @@ -203,16 +203,16 @@ private: void determineKillsAndDefs(); /// Add all Reg Units that Reg contains to BV. 
- void addRegUnits(BitVector &BV, unsigned Reg); + void addRegUnits(BitVector &BV, Register Reg); /// Remove all Reg Units that \p Reg contains from \p BV. - void removeRegUnits(BitVector &BV, unsigned Reg); + void removeRegUnits(BitVector &BV, Register Reg); /// Return the candidate register that is unused for the longest after /// StartMI. UseMI is set to the instruction where the search stopped. /// /// No more than InstrLimit instructions are inspected. - unsigned findSurvivorReg(MachineBasicBlock::iterator StartMI, + Register findSurvivorReg(MachineBasicBlock::iterator StartMI, BitVector &Candidates, unsigned InstrLimit, MachineBasicBlock::iterator &UseMI); @@ -225,7 +225,7 @@ private: /// Spill a register after position \p After and reload it before position /// \p UseMI. - ScavengedInfo &spill(unsigned Reg, const TargetRegisterClass &RC, int SPAdj, + ScavengedInfo &spill(Register Reg, const TargetRegisterClass &RC, int SPAdj, MachineBasicBlock::iterator Before, MachineBasicBlock::iterator &UseMI); }; diff --git a/include/llvm/CodeGen/ScheduleDAGInstrs.h b/include/llvm/CodeGen/ScheduleDAGInstrs.h index 3e3b604acbac..1eb9b9f322ba 100644 --- a/include/llvm/CodeGen/ScheduleDAGInstrs.h +++ b/include/llvm/CodeGen/ScheduleDAGInstrs.h @@ -34,6 +34,7 @@ namespace llvm { + class AAResults; class LiveIntervals; class MachineFrameInfo; class MachineFunction; @@ -57,7 +58,7 @@ namespace llvm { : VirtReg(VReg), LaneMask(LaneMask), SU(SU) {} unsigned getSparseSetIndex() const { - return TargetRegisterInfo::virtReg2Index(VirtReg); + return Register::virtReg2Index(VirtReg); } }; @@ -173,7 +174,7 @@ namespace llvm { /// Tracks the last instructions in this region using each virtual register. VReg2SUnitOperIdxMultiMap CurrentVRegUses; - AliasAnalysis *AAForDep = nullptr; + AAResults *AAForDep = nullptr; /// Remember a generic side-effecting instruction as we proceed. /// No other SU ever gets scheduled around it (except in the special @@ -201,7 +202,7 @@ namespace llvm { Value2SUsMap &loads, unsigned N); /// Adds a chain edge between SUa and SUb, but only if both - /// AliasAnalysis and Target fail to deny the dependency. + /// AAResults and Target fail to deny the dependency. void addChainDependency(SUnit *SUa, SUnit *SUb, unsigned Latency = 0); @@ -306,7 +307,7 @@ namespace llvm { /// If \p RPTracker is non-null, compute register pressure as a side effect. /// The DAG builder is an efficient place to do it because it already visits /// operands. - void buildSchedGraph(AliasAnalysis *AA, + void buildSchedGraph(AAResults *AA, RegPressureTracker *RPTracker = nullptr, PressureDiffs *PDiffs = nullptr, LiveIntervals *LIS = nullptr, @@ -374,6 +375,9 @@ namespace llvm { /// Returns a mask for which lanes get read/written by the given (register) /// machine operand. LaneBitmask getLaneMaskForMO(const MachineOperand &MO) const; + + /// Returns true if the def register in \p MO has no uses. + bool deadDefHasNoUse(const MachineOperand &MO); }; /// Creates a new SUnit and return a ptr to it. 
diff --git a/include/llvm/CodeGen/SelectionDAG.h b/include/llvm/CodeGen/SelectionDAG.h index 12a970847021..6b8e2dd803ba 100644 --- a/include/llvm/CodeGen/SelectionDAG.h +++ b/include/llvm/CodeGen/SelectionDAG.h @@ -26,8 +26,6 @@ #include "llvm/ADT/ilist.h" #include "llvm/ADT/iterator.h" #include "llvm/ADT/iterator_range.h" -#include "llvm/Analysis/AliasAnalysis.h" -#include "llvm/Analysis/LegacyDivergenceAnalysis.h" #include "llvm/CodeGen/DAGCombine.h" #include "llvm/CodeGen/FunctionLoweringInfo.h" #include "llvm/CodeGen/ISDOpcodes.h" @@ -58,6 +56,7 @@ namespace llvm { +class AAResults; class BlockAddress; class Constant; class ConstantFP; @@ -66,6 +65,7 @@ class DataLayout; struct fltSemantics; class GlobalValue; struct KnownBits; +class LegacyDivergenceAnalysis; class LLVMContext; class MachineBasicBlock; class MachineConstantPoolValue; @@ -269,7 +269,13 @@ class SelectionDAG { using CallSiteInfo = MachineFunction::CallSiteInfo; using CallSiteInfoImpl = MachineFunction::CallSiteInfoImpl; - DenseMap SDCallSiteInfo; + + struct CallSiteDbgInfo { + CallSiteInfo CSInfo; + MDNode *HeapAllocSite = nullptr; + }; + + DenseMap SDCallSiteDbgInfo; uint16_t NextPersistentId = 0; @@ -382,7 +388,11 @@ private: Node->OperandList = nullptr; } void CreateTopologicalOrder(std::vector& Order); + public: + // Maximum depth for recursive analysis such as computeKnownBits, etc. + static constexpr unsigned MaxRecursionDepth = 6; + explicit SelectionDAG(const TargetMachine &TM, CodeGenOpt::Level); SelectionDAG(const SelectionDAG &) = delete; SelectionDAG &operator=(const SelectionDAG &) = delete; @@ -489,7 +499,7 @@ public: /// certain types of nodes together, or eliminating superfluous nodes. The /// Level argument controls whether Combine is allowed to produce nodes and /// types that are illegal on the target. 
- void Combine(CombineLevel Level, AliasAnalysis *AA, + void Combine(CombineLevel Level, AAResults *AA, CodeGenOpt::Level OptLevel); /// This transforms the SelectionDAG into a SelectionDAG that @@ -628,10 +638,9 @@ public: SDValue getGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset = 0, bool isTargetGA = false, - unsigned char TargetFlags = 0); + unsigned TargetFlags = 0); SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, - int64_t offset = 0, - unsigned char TargetFlags = 0) { + int64_t offset = 0, unsigned TargetFlags = 0) { return getGlobalAddress(GV, DL, VT, offset, true, TargetFlags); } SDValue getFrameIndex(int FI, EVT VT, bool isTarget = false); @@ -639,28 +648,27 @@ public: return getFrameIndex(FI, VT, true); } SDValue getJumpTable(int JTI, EVT VT, bool isTarget = false, - unsigned char TargetFlags = 0); - SDValue getTargetJumpTable(int JTI, EVT VT, unsigned char TargetFlags = 0) { + unsigned TargetFlags = 0); + SDValue getTargetJumpTable(int JTI, EVT VT, unsigned TargetFlags = 0) { return getJumpTable(JTI, VT, true, TargetFlags); } - SDValue getConstantPool(const Constant *C, EVT VT, - unsigned Align = 0, int Offs = 0, bool isT=false, - unsigned char TargetFlags = 0); - SDValue getTargetConstantPool(const Constant *C, EVT VT, - unsigned Align = 0, int Offset = 0, - unsigned char TargetFlags = 0) { + SDValue getConstantPool(const Constant *C, EVT VT, unsigned Align = 0, + int Offs = 0, bool isT = false, + unsigned TargetFlags = 0); + SDValue getTargetConstantPool(const Constant *C, EVT VT, unsigned Align = 0, + int Offset = 0, unsigned TargetFlags = 0) { return getConstantPool(C, VT, Align, Offset, true, TargetFlags); } SDValue getConstantPool(MachineConstantPoolValue *C, EVT VT, unsigned Align = 0, int Offs = 0, bool isT=false, - unsigned char TargetFlags = 0); - SDValue getTargetConstantPool(MachineConstantPoolValue *C, - EVT VT, unsigned Align = 0, - int Offset = 0, unsigned char TargetFlags=0) { + unsigned TargetFlags = 0); + SDValue getTargetConstantPool(MachineConstantPoolValue *C, EVT VT, + unsigned Align = 0, int Offset = 0, + unsigned TargetFlags = 0) { return getConstantPool(C, VT, Align, Offset, true, TargetFlags); } SDValue getTargetIndex(int Index, EVT VT, int64_t Offset = 0, - unsigned char TargetFlags = 0); + unsigned TargetFlags = 0); // When generating a branch to a BB, we don't in general know enough // to provide debug info for the BB at that time, so keep this one around. 
SDValue getBasicBlock(MachineBasicBlock *MBB); @@ -668,7 +676,7 @@ public: SDValue getExternalSymbol(const char *Sym, EVT VT); SDValue getExternalSymbol(const char *Sym, const SDLoc &dl, EVT VT); SDValue getTargetExternalSymbol(const char *Sym, EVT VT, - unsigned char TargetFlags = 0); + unsigned TargetFlags = 0); SDValue getMCSymbol(MCSymbol *Sym, EVT VT); SDValue getValueType(EVT); @@ -677,12 +685,10 @@ public: SDValue getEHLabel(const SDLoc &dl, SDValue Root, MCSymbol *Label); SDValue getLabelNode(unsigned Opcode, const SDLoc &dl, SDValue Root, MCSymbol *Label); - SDValue getBlockAddress(const BlockAddress *BA, EVT VT, - int64_t Offset = 0, bool isTarget = false, - unsigned char TargetFlags = 0); + SDValue getBlockAddress(const BlockAddress *BA, EVT VT, int64_t Offset = 0, + bool isTarget = false, unsigned TargetFlags = 0); SDValue getTargetBlockAddress(const BlockAddress *BA, EVT VT, - int64_t Offset = 0, - unsigned char TargetFlags = 0) { + int64_t Offset = 0, unsigned TargetFlags = 0) { return getBlockAddress(BA, VT, Offset, true, TargetFlags); } @@ -1035,7 +1041,7 @@ public: unsigned Align = 0, MachineMemOperand::Flags Flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore, - unsigned Size = 0, + uint64_t Size = 0, const AAMDNodes &AAInfo = AAMDNodes()); SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, @@ -1117,9 +1123,11 @@ public: MachineMemOperand *MMO, bool IsTruncating = false, bool IsCompressing = false); SDValue getMaskedGather(SDVTList VTs, EVT VT, const SDLoc &dl, - ArrayRef Ops, MachineMemOperand *MMO); + ArrayRef Ops, MachineMemOperand *MMO, + ISD::MemIndexType IndexType); SDValue getMaskedScatter(SDVTList VTs, EVT VT, const SDLoc &dl, - ArrayRef Ops, MachineMemOperand *MMO); + ArrayRef Ops, MachineMemOperand *MMO, + ISD::MemIndexType IndexType); /// Return (create a new or find existing) a target-specific node. /// TargetMemSDNode should be derived class from MemSDNode. @@ -1588,9 +1596,12 @@ public: /// Extract. The reduction must use one of the opcodes listed in /p /// CandidateBinOps and on success /p BinOp will contain the matching opcode. /// Returns the vector that is being reduced on, or SDValue() if a reduction - /// was not matched. + /// was not matched. If \p AllowPartials is set then in the case of a + /// reduction pattern that only matches the first few stages, the extracted + /// subvector of the start of the reduction is returned. SDValue matchBinOpReduction(SDNode *Extract, ISD::NodeType &BinOp, - ArrayRef CandidateBinOps); + ArrayRef CandidateBinOps, + bool AllowPartials = false); /// Utility function used by legalize and lowering to /// "unroll" a vector operation by splitting out the scalars and operating @@ -1664,16 +1675,28 @@ public: } void addCallSiteInfo(const SDNode *CallNode, CallSiteInfoImpl &&CallInfo) { - SDCallSiteInfo[CallNode] = std::move(CallInfo); + SDCallSiteDbgInfo[CallNode].CSInfo = std::move(CallInfo); } CallSiteInfo getSDCallSiteInfo(const SDNode *CallNode) { - auto I = SDCallSiteInfo.find(CallNode); - if (I != SDCallSiteInfo.end()) - return std::move(I->second); + auto I = SDCallSiteDbgInfo.find(CallNode); + if (I != SDCallSiteDbgInfo.end()) + return std::move(I->second).CSInfo; return CallSiteInfo(); } + void addHeapAllocSite(const SDNode *Node, MDNode *MD) { + SDCallSiteDbgInfo[Node].HeapAllocSite = MD; + } + + /// Return the HeapAllocSite type associated with the SDNode, if it exists. 
+ MDNode *getHeapAllocSite(const SDNode *Node) { + auto It = SDCallSiteDbgInfo.find(Node); + if (It == SDCallSiteDbgInfo.end()) + return nullptr; + return It->second.HeapAllocSite; + } + private: void InsertNode(SDNode *N); bool RemoveNodeFromCSEMaps(SDNode *N); @@ -1712,7 +1735,7 @@ private: std::map ExtendedValueTypeNodes; StringMap ExternalSymbols; - std::map,SDNode*> TargetExternalSymbols; + std::map, SDNode *> TargetExternalSymbols; DenseMap MCSymbols; }; diff --git a/include/llvm/CodeGen/SelectionDAGISel.h b/include/llvm/CodeGen/SelectionDAGISel.h index 147c325342fc..de71a21d4671 100644 --- a/include/llvm/CodeGen/SelectionDAGISel.h +++ b/include/llvm/CodeGen/SelectionDAGISel.h @@ -22,22 +22,23 @@ #include namespace llvm { - class FastISel; - class SelectionDAGBuilder; - class SDValue; - class MachineRegisterInfo; - class MachineBasicBlock; - class MachineFunction; - class MachineInstr; - class OptimizationRemarkEmitter; - class TargetLowering; - class TargetLibraryInfo; - class FunctionLoweringInfo; - class ScheduleHazardRecognizer; - class SwiftErrorValueTracking; - class GCFunctionInfo; - class ScheduleDAGSDNodes; - class LoadInst; +class AAResults; +class FastISel; +class SelectionDAGBuilder; +class SDValue; +class MachineRegisterInfo; +class MachineBasicBlock; +class MachineFunction; +class MachineInstr; +class OptimizationRemarkEmitter; +class TargetLowering; +class TargetLibraryInfo; +class FunctionLoweringInfo; +class ScheduleHazardRecognizer; +class SwiftErrorValueTracking; +class GCFunctionInfo; +class ScheduleDAGSDNodes; +class LoadInst; /// SelectionDAGISel - This is the common base class used for SelectionDAG-based /// pattern-matching instruction selectors. @@ -51,7 +52,7 @@ public: MachineRegisterInfo *RegInfo; SelectionDAG *CurDAG; SelectionDAGBuilder *SDB; - AliasAnalysis *AA; + AAResults *AA; GCFunctionInfo *GFI; CodeGenOpt::Level OptLevel; const TargetInstrInfo *TII; @@ -162,6 +163,7 @@ public: OPC_EmitMergeInputChains1_1, OPC_EmitMergeInputChains1_2, OPC_EmitCopyToReg, + OPC_EmitCopyToReg2, OPC_EmitNodeXForm, OPC_EmitNode, // Space-optimized forms that implicitly encode number of result VTs. diff --git a/include/llvm/CodeGen/SelectionDAGNodes.h b/include/llvm/CodeGen/SelectionDAGNodes.h index 5aab9643e09d..ceb8b72635a2 100644 --- a/include/llvm/CodeGen/SelectionDAGNodes.h +++ b/include/llvm/CodeGen/SelectionDAGNodes.h @@ -548,10 +548,15 @@ BEGIN_TWO_BYTE_PACK() class LSBaseSDNodeBitfields { friend class LSBaseSDNode; + friend class MaskedGatherScatterSDNode; uint16_t : NumMemSDNodeBits; - uint16_t AddressingMode : 3; // enum ISD::MemIndexedMode + // This storage is shared between disparate class hierarchies to hold an + // enumeration specific to the class hierarchy in use. + // LSBaseSDNode => enum ISD::MemIndexedMode + // MaskedGatherScatterSDNode => enum ISD::MemIndexType + uint16_t AddressingMode : 3; }; enum { NumLSBaseSDNodeBits = NumMemSDNodeBits + 3 }; @@ -696,14 +701,20 @@ public: case ISD::STRICT_FLOG: case ISD::STRICT_FLOG10: case ISD::STRICT_FLOG2: + case ISD::STRICT_LRINT: + case ISD::STRICT_LLRINT: case ISD::STRICT_FRINT: case ISD::STRICT_FNEARBYINT: case ISD::STRICT_FMAXNUM: case ISD::STRICT_FMINNUM: case ISD::STRICT_FCEIL: case ISD::STRICT_FFLOOR: + case ISD::STRICT_LROUND: + case ISD::STRICT_LLROUND: case ISD::STRICT_FROUND: case ISD::STRICT_FTRUNC: + case ISD::STRICT_FP_TO_SINT: + case ISD::STRICT_FP_TO_UINT: case ISD::STRICT_FP_ROUND: case ISD::STRICT_FP_EXTEND: return true; @@ -1346,6 +1357,17 @@ public: /// store occurs. 
AtomicOrdering getOrdering() const { return MMO->getOrdering(); } + /// Return true if the memory operation ordering is Unordered or higher. + bool isAtomic() const { return MMO->isAtomic(); } + + /// Returns true if the memory operation doesn't imply any ordering + /// constraints on surrounding memory operations beyond the normal memory + /// aliasing rules. + bool isUnordered() const { return MMO->isUnordered(); } + + /// Returns true if the memory operation is neither atomic or volatile. + bool isSimple() const { return !isAtomic() && !isVolatile(); } + /// Return the type of the in-memory value. EVT getMemoryVT() const { return MemoryVT; } @@ -1702,16 +1724,16 @@ class GlobalAddressSDNode : public SDNode { const GlobalValue *TheGlobal; int64_t Offset; - unsigned char TargetFlags; + unsigned TargetFlags; GlobalAddressSDNode(unsigned Opc, unsigned Order, const DebugLoc &DL, const GlobalValue *GA, EVT VT, int64_t o, - unsigned char TF); + unsigned TF); public: const GlobalValue *getGlobal() const { return TheGlobal; } int64_t getOffset() const { return Offset; } - unsigned char getTargetFlags() const { return TargetFlags; } + unsigned getTargetFlags() const { return TargetFlags; } // Return the address space this GlobalAddress belongs to. unsigned getAddressSpace() const; @@ -1778,16 +1800,16 @@ class JumpTableSDNode : public SDNode { friend class SelectionDAG; int JTI; - unsigned char TargetFlags; + unsigned TargetFlags; - JumpTableSDNode(int jti, EVT VT, bool isTarg, unsigned char TF) + JumpTableSDNode(int jti, EVT VT, bool isTarg, unsigned TF) : SDNode(isTarg ? ISD::TargetJumpTable : ISD::JumpTable, 0, DebugLoc(), getSDVTList(VT)), JTI(jti), TargetFlags(TF) { } public: int getIndex() const { return JTI; } - unsigned char getTargetFlags() const { return TargetFlags; } + unsigned getTargetFlags() const { return TargetFlags; } static bool classof(const SDNode *N) { return N->getOpcode() == ISD::JumpTable || @@ -1804,10 +1826,10 @@ class ConstantPoolSDNode : public SDNode { } Val; int Offset; // It's a MachineConstantPoolValue if top bit is set. unsigned Alignment; // Minimum alignment requirement of CP (not log2 value). - unsigned char TargetFlags; + unsigned TargetFlags; ConstantPoolSDNode(bool isTarget, const Constant *c, EVT VT, int o, - unsigned Align, unsigned char TF) + unsigned Align, unsigned TF) : SDNode(isTarget ? ISD::TargetConstantPool : ISD::ConstantPool, 0, DebugLoc(), getSDVTList(VT)), Offset(o), Alignment(Align), TargetFlags(TF) { @@ -1816,7 +1838,7 @@ class ConstantPoolSDNode : public SDNode { } ConstantPoolSDNode(bool isTarget, MachineConstantPoolValue *v, - EVT VT, int o, unsigned Align, unsigned char TF) + EVT VT, int o, unsigned Align, unsigned TF) : SDNode(isTarget ? ISD::TargetConstantPool : ISD::ConstantPool, 0, DebugLoc(), getSDVTList(VT)), Offset(o), Alignment(Align), TargetFlags(TF) { @@ -1847,7 +1869,7 @@ public: // Return the alignment of this constant pool object, which is either 0 (for // default alignment) or the desired value. 
unsigned getAlignment() const { return Alignment; } - unsigned char getTargetFlags() const { return TargetFlags; } + unsigned getTargetFlags() const { return TargetFlags; } Type *getType() const; @@ -1861,16 +1883,16 @@ public: class TargetIndexSDNode : public SDNode { friend class SelectionDAG; - unsigned char TargetFlags; + unsigned TargetFlags; int Index; int64_t Offset; public: - TargetIndexSDNode(int Idx, EVT VT, int64_t Ofs, unsigned char TF) - : SDNode(ISD::TargetIndex, 0, DebugLoc(), getSDVTList(VT)), - TargetFlags(TF), Index(Idx), Offset(Ofs) {} + TargetIndexSDNode(int Idx, EVT VT, int64_t Ofs, unsigned TF) + : SDNode(ISD::TargetIndex, 0, DebugLoc(), getSDVTList(VT)), + TargetFlags(TF), Index(Idx), Offset(Ofs) {} - unsigned char getTargetFlags() const { return TargetFlags; } + unsigned getTargetFlags() const { return TargetFlags; } int getIndex() const { return Index; } int64_t getOffset() const { return Offset; } @@ -2063,17 +2085,17 @@ class BlockAddressSDNode : public SDNode { const BlockAddress *BA; int64_t Offset; - unsigned char TargetFlags; + unsigned TargetFlags; BlockAddressSDNode(unsigned NodeTy, EVT VT, const BlockAddress *ba, - int64_t o, unsigned char Flags) + int64_t o, unsigned Flags) : SDNode(NodeTy, 0, DebugLoc(), getSDVTList(VT)), BA(ba), Offset(o), TargetFlags(Flags) {} public: const BlockAddress *getBlockAddress() const { return BA; } int64_t getOffset() const { return Offset; } - unsigned char getTargetFlags() const { return TargetFlags; } + unsigned getTargetFlags() const { return TargetFlags; } static bool classof(const SDNode *N) { return N->getOpcode() == ISD::BlockAddress || @@ -2104,15 +2126,16 @@ class ExternalSymbolSDNode : public SDNode { friend class SelectionDAG; const char *Symbol; - unsigned char TargetFlags; + unsigned TargetFlags; - ExternalSymbolSDNode(bool isTarget, const char *Sym, unsigned char TF, EVT VT) - : SDNode(isTarget ? ISD::TargetExternalSymbol : ISD::ExternalSymbol, - 0, DebugLoc(), getSDVTList(VT)), Symbol(Sym), TargetFlags(TF) {} + ExternalSymbolSDNode(bool isTarget, const char *Sym, unsigned TF, EVT VT) + : SDNode(isTarget ? ISD::TargetExternalSymbol : ISD::ExternalSymbol, 0, + DebugLoc(), getSDVTList(VT)), + Symbol(Sym), TargetFlags(TF) {} public: const char *getSymbol() const { return Symbol; } - unsigned char getTargetFlags() const { return TargetFlags; } + unsigned getTargetFlags() const { return TargetFlags; } static bool classof(const SDNode *N) { return N->getOpcode() == ISD::ExternalSymbol || @@ -2181,8 +2204,6 @@ public: : MemSDNode(NodeTy, Order, dl, VTs, MemVT, MMO) { LSBaseSDNodeBits.AddressingMode = AM; assert(getAddressingMode() == AM && "Value truncated"); - assert((!MMO->isAtomic() || MMO->isVolatile()) && - "use an AtomicSDNode instead for non-volatile atomics"); } const SDValue &getOffset() const { @@ -2362,8 +2383,24 @@ public: MaskedGatherScatterSDNode(ISD::NodeType NodeTy, unsigned Order, const DebugLoc &dl, SDVTList VTs, EVT MemVT, - MachineMemOperand *MMO) - : MemSDNode(NodeTy, Order, dl, VTs, MemVT, MMO) {} + MachineMemOperand *MMO, ISD::MemIndexType IndexType) + : MemSDNode(NodeTy, Order, dl, VTs, MemVT, MMO) { + LSBaseSDNodeBits.AddressingMode = IndexType; + assert(getIndexType() == IndexType && "Value truncated"); + } + + /// How is Index applied to BasePtr when computing addresses. 
+ ISD::MemIndexType getIndexType() const { + return static_cast(LSBaseSDNodeBits.AddressingMode); + } + bool isIndexScaled() const { + return (getIndexType() == ISD::SIGNED_SCALED) || + (getIndexType() == ISD::UNSIGNED_SCALED); + } + bool isIndexSigned() const { + return (getIndexType() == ISD::SIGNED_SCALED) || + (getIndexType() == ISD::SIGNED_UNSCALED); + } // In the both nodes address is Op1, mask is Op2: // MaskedGatherSDNode (Chain, passthru, mask, base, index, scale) @@ -2387,8 +2424,10 @@ public: friend class SelectionDAG; MaskedGatherSDNode(unsigned Order, const DebugLoc &dl, SDVTList VTs, - EVT MemVT, MachineMemOperand *MMO) - : MaskedGatherScatterSDNode(ISD::MGATHER, Order, dl, VTs, MemVT, MMO) {} + EVT MemVT, MachineMemOperand *MMO, + ISD::MemIndexType IndexType) + : MaskedGatherScatterSDNode(ISD::MGATHER, Order, dl, VTs, MemVT, MMO, + IndexType) {} const SDValue &getPassThru() const { return getOperand(1); } @@ -2404,8 +2443,10 @@ public: friend class SelectionDAG; MaskedScatterSDNode(unsigned Order, const DebugLoc &dl, SDVTList VTs, - EVT MemVT, MachineMemOperand *MMO) - : MaskedGatherScatterSDNode(ISD::MSCATTER, Order, dl, VTs, MemVT, MMO) {} + EVT MemVT, MachineMemOperand *MMO, + ISD::MemIndexType IndexType) + : MaskedGatherScatterSDNode(ISD::MSCATTER, Order, dl, VTs, MemVT, MMO, + IndexType) {} const SDValue &getValue() const { return getOperand(1); } diff --git a/include/llvm/CodeGen/StackProtector.h b/include/llvm/CodeGen/StackProtector.h index 2bdf4425e24a..ed52db3e6269 100644 --- a/include/llvm/CodeGen/StackProtector.h +++ b/include/llvm/CodeGen/StackProtector.h @@ -61,6 +61,12 @@ private: /// protection when -fstack-protection is used. unsigned SSPBufferSize = 0; + /// VisitedPHIs - The set of PHI nodes visited when determining + /// if a variable's reference has been taken. This set + /// is maintained to ensure we don't visit the same PHI node multiple + /// times. + SmallPtrSet VisitedPHIs; + // A prologue is generated. bool HasPrologue = false; diff --git a/include/llvm/CodeGen/SwitchLoweringUtils.h b/include/llvm/CodeGen/SwitchLoweringUtils.h index 62134dc792f7..b8adcf759b19 100644 --- a/include/llvm/CodeGen/SwitchLoweringUtils.h +++ b/include/llvm/CodeGen/SwitchLoweringUtils.h @@ -212,16 +212,17 @@ struct BitTestBlock { BitTestInfo Cases; BranchProbability Prob; BranchProbability DefaultProb; + bool OmitRangeCheck; BitTestBlock(APInt F, APInt R, const Value *SV, unsigned Rg, MVT RgVT, bool E, bool CR, MachineBasicBlock *P, MachineBasicBlock *D, BitTestInfo C, BranchProbability Pr) : First(std::move(F)), Range(std::move(R)), SValue(SV), Reg(Rg), RegVT(RgVT), Emitted(E), ContiguousRange(CR), Parent(P), Default(D), - Cases(std::move(C)), Prob(Pr) {} + Cases(std::move(C)), Prob(Pr), OmitRangeCheck(false) {} }; -/// Return the range of value within a range. +/// Return the range of values within a range. 
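Editorial sketch: the isIndexScaled()/isIndexSigned() accessors above describe how a gather/scatter's Index operand combines with BasePtr for each ISD::MemIndexType. The helper below is illustrative only (laneAddress, IndexBits and the flat-pointer arithmetic are not part of the patch); it spells out the four cases: SIGNED_SCALED = Base + sext(Index)*Scale, SIGNED_UNSCALED = Base + sext(Index), UNSIGNED_SCALED = Base + zext(Index)*Scale, UNSIGNED_UNSCALED = Base + zext(Index).

#include <cstdint>

// IndexSigned/IndexScaled stand in for N->isIndexSigned()/N->isIndexScaled();
// IndexBits is the width of one index element (1..64).
uint64_t laneAddress(uint64_t Base, uint64_t RawIndex, unsigned IndexBits,
                     uint64_t Scale, bool IndexSigned, bool IndexScaled) {
  uint64_t Mask = IndexBits >= 64 ? ~0ULL : ((1ULL << IndexBits) - 1);
  uint64_t Idx = RawIndex & Mask;                        // zero-extend
  if (IndexSigned && IndexBits >= 1 && IndexBits < 64 &&
      (Idx >> (IndexBits - 1)) != 0)
    Idx |= ~Mask;                                        // sign-extend instead
  return Base + Idx * (IndexScaled ? Scale : 1);
}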
uint64_t getJumpTableRange(const CaseClusterVector &Clusters, unsigned First, unsigned Last); diff --git a/include/llvm/CodeGen/TargetCallingConv.h b/include/llvm/CodeGen/TargetCallingConv.h index aebeeecbe506..db3d1175afee 100644 --- a/include/llvm/CodeGen/TargetCallingConv.h +++ b/include/llvm/CodeGen/TargetCallingConv.h @@ -14,6 +14,7 @@ #define LLVM_CODEGEN_TARGETCALLINGCONV_H #include "llvm/CodeGen/ValueTypes.h" +#include "llvm/Support/Alignment.h" #include "llvm/Support/MachineValueType.h" #include "llvm/Support/MathExtras.h" #include @@ -120,16 +121,22 @@ namespace ISD { bool isPointer() const { return IsPointer; } void setPointer() { IsPointer = 1; } - unsigned getByValAlign() const { return (1U << ByValAlign) / 2; } - void setByValAlign(unsigned A) { - ByValAlign = Log2_32(A) + 1; - assert(getByValAlign() == A && "bitfield overflow"); + unsigned getByValAlign() const { + MaybeAlign A = decodeMaybeAlign(ByValAlign); + return A ? A->value() : 0; + } + void setByValAlign(Align A) { + ByValAlign = encode(A); + assert(getByValAlign() == A.value() && "bitfield overflow"); } - unsigned getOrigAlign() const { return (1U << OrigAlign) / 2; } - void setOrigAlign(unsigned A) { - OrigAlign = Log2_32(A) + 1; - assert(getOrigAlign() == A && "bitfield overflow"); + unsigned getOrigAlign() const { + MaybeAlign A = decodeMaybeAlign(OrigAlign); + return A ? A->value() : 0; + } + void setOrigAlign(Align A) { + OrigAlign = encode(A); + assert(getOrigAlign() == A.value() && "bitfield overflow"); } unsigned getByValSize() const { return ByValSize; } diff --git a/include/llvm/CodeGen/TargetFrameLowering.h b/include/llvm/CodeGen/TargetFrameLowering.h index 878c9ffd2b51..72edb27964c4 100644 --- a/include/llvm/CodeGen/TargetFrameLowering.h +++ b/include/llvm/CodeGen/TargetFrameLowering.h @@ -28,6 +28,7 @@ namespace TargetStackID { enum Value { Default = 0, SGPRSpill = 1, + SVEVector = 2, NoAlloc = 255 }; } @@ -53,15 +54,15 @@ public: }; private: StackDirection StackDir; - unsigned StackAlignment; - unsigned TransientStackAlignment; + Align StackAlignment; + Align TransientStackAlignment; int LocalAreaOffset; bool StackRealignable; public: - TargetFrameLowering(StackDirection D, unsigned StackAl, int LAO, - unsigned TransAl = 1, bool StackReal = true) - : StackDir(D), StackAlignment(StackAl), TransientStackAlignment(TransAl), - LocalAreaOffset(LAO), StackRealignable(StackReal) {} + TargetFrameLowering(StackDirection D, Align StackAl, int LAO, + Align TransAl = Align::None(), bool StackReal = true) + : StackDir(D), StackAlignment(StackAl), TransientStackAlignment(TransAl), + LocalAreaOffset(LAO), StackRealignable(StackReal) {} virtual ~TargetFrameLowering(); @@ -76,7 +77,7 @@ public: /// stack pointer must be aligned on entry to a function. Typically, this /// is the largest alignment for any data object in the target. /// - unsigned getStackAlignment() const { return StackAlignment; } + unsigned getStackAlignment() const { return StackAlignment.value(); } /// alignSPAdjust - This method aligns the stack adjustment to the correct /// alignment. @@ -95,7 +96,7 @@ public: /// calls. /// unsigned getTransientStackAlignment() const { - return TransientStackAlignment; + return TransientStackAlignment.value(); } /// isStackRealignable - This method returns whether the stack can be @@ -366,15 +367,10 @@ public: /// Check if given function is safe for not having callee saved registers. /// This is used when interprocedural register allocation is enabled. 
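Editorial sketch: the getByValAlign()/setByValAlign(Align) pair above now round-trips through the Support/Alignment.h encoding, where 0 means "unset" and any other value stores log2(alignment) + 1, matching the old (1U << N) / 2 convention. The stand-alone functions below are illustrative stand-ins for llvm::encode / llvm::decodeMaybeAlign, not the real helpers.

#include <cstdint>

unsigned encodeAlign(uint64_t Alignment) {   // Alignment must be a power of two
  unsigned Log2 = 0;
  while ((uint64_t(1) << Log2) < Alignment)
    ++Log2;
  return Log2 + 1;                           // 0 is reserved for "unset"
}
uint64_t decodeAlign(unsigned Encoded) {
  return Encoded == 0 ? 0 : (uint64_t(1) << (Encoded - 1));
}
// Example: setByValAlign(Align(16)) stores encodeAlign(16) == 5 in the
// bit-field, and getByValAlign() recovers decodeAlign(5) == 16.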
- static bool isSafeForNoCSROpt(const Function &F) { - if (!F.hasLocalLinkage() || F.hasAddressTaken() || - !F.hasFnAttribute(Attribute::NoRecurse)) - return false; - // Function should not be optimized as tail call. - for (const User *U : F.users()) - if (auto CS = ImmutableCallSite(U)) - if (CS.isTailCall()) - return false; + static bool isSafeForNoCSROpt(const Function &F); + + /// Check if the no-CSR optimisation is profitable for the given function. + virtual bool isProfitableForNoCSROpt(const Function &F) const { return true; } diff --git a/include/llvm/CodeGen/TargetInstrInfo.h b/include/llvm/CodeGen/TargetInstrInfo.h index 25b04f8c019a..5011cf34c0ee 100644 --- a/include/llvm/CodeGen/TargetInstrInfo.h +++ b/include/llvm/CodeGen/TargetInstrInfo.h @@ -22,7 +22,7 @@ #include "llvm/CodeGen/MachineCombinerPattern.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstr.h" -#include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineOutliner.h" #include "llvm/CodeGen/PseudoSourceValue.h" @@ -38,10 +38,12 @@ namespace llvm { +class AAResults; class DFAPacketizer; class InstrItineraryData; class LiveIntervals; class LiveVariables; +class MachineLoop; class MachineMemOperand; class MachineRegisterInfo; class MCAsmInfo; @@ -60,6 +62,8 @@ class TargetSubtargetInfo; template class SmallVectorImpl; +using ParamLoadedValue = std::pair; + //--------------------------------------------------------------------------- /// /// TargetInstrInfo - Interface to description of machine instruction set @@ -92,7 +96,7 @@ public: /// registers so that the instructions result is independent of the place /// in the function. bool isTriviallyReMaterializable(const MachineInstr &MI, - AliasAnalysis *AA = nullptr) const { + AAResults *AA = nullptr) const { return MI.getOpcode() == TargetOpcode::IMPLICIT_DEF || (MI.getDesc().isRematerializable() && (isReallyTriviallyReMaterializable(MI, AA) || @@ -108,7 +112,7 @@ protected: /// not always available. /// Requirements must be check as stated in isTriviallyReMaterializable() . virtual bool isReallyTriviallyReMaterializable(const MachineInstr &MI, - AliasAnalysis *AA) const { + AAResults *AA) const { return false; } @@ -151,7 +155,7 @@ private: /// this function does target-independent tests to determine if the /// instruction is really trivially rematerializable. bool isReallyTriviallyReMaterializableGeneric(const MachineInstr &MI, - AliasAnalysis *AA) const; + AAResults *AA) const; public: /// These methods return the opcode of the frame setup/destroy instructions @@ -419,7 +423,8 @@ public: /// findCommutedOpIndices(MI, Op1, Op2); /// can be interpreted as a query asking to find an operand that would be /// commutable with the operand#1. - virtual bool findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx1, + virtual bool findCommutedOpIndices(const MachineInstr &MI, + unsigned &SrcOpIdx1, unsigned &SrcOpIdx2) const; /// A pair composed of a register and a sub-register index. @@ -659,6 +664,50 @@ public: BytesAdded); } + /// Object returned by analyzeLoopForPipelining. Allows software pipelining + /// implementations to query attributes of the loop being pipelined and to + /// apply target-specific updates to the loop once pipelining is complete. + class PipelinerLoopInfo { + public: + virtual ~PipelinerLoopInfo(); + /// Return true if the given instruction should not be pipelined and should + /// be ignored. 
An example could be a loop comparison, or induction variable + /// update with no users being pipelined. + virtual bool shouldIgnoreForPipelining(const MachineInstr *MI) const = 0; + + /// Create a condition to determine if the trip count of the loop is greater + /// than TC. + /// + /// If the trip count is statically known to be greater than TC, return + /// true. If the trip count is statically known to be not greater than TC, + /// return false. Otherwise return nullopt and fill out Cond with the test + /// condition. + virtual Optional + createTripCountGreaterCondition(int TC, MachineBasicBlock &MBB, + SmallVectorImpl &Cond) = 0; + + /// Modify the loop such that the trip count is + /// OriginalTC + TripCountAdjust. + virtual void adjustTripCount(int TripCountAdjust) = 0; + + /// Called when the loop's preheader has been modified to NewPreheader. + virtual void setPreheader(MachineBasicBlock *NewPreheader) = 0; + + /// Called when the loop is being removed. Any instructions in the preheader + /// should be removed. + /// + /// Once this function is called, no other functions on this object are + /// valid; the loop has been removed. + virtual void disposed() = 0; + }; + + /// Analyze loop L, which must be a single-basic-block loop, and if the + /// conditions can be understood enough produce a PipelinerLoopInfo object. + virtual std::unique_ptr + analyzeLoopForPipelining(MachineBasicBlock *LoopBB) const { + return nullptr; + } + /// Analyze the loop code, return true if it cannot be understoo. Upon /// success, this function returns false and returns information about the /// induction variable and compare instruction used at the end. @@ -730,6 +779,19 @@ public: return false; } + /// Return the increase in code size needed to predicate a contiguous run of + /// NumInsts instructions. + virtual unsigned extraSizeToPredicateInstructions(const MachineFunction &MF, + unsigned NumInsts) const { + return 0; + } + + /// Return an estimate for the code size reduction (in bytes) which will be + /// caused by removing the given branch instruction during if-conversion. + virtual unsigned predictBranchSizeForIfCvt(MachineInstr &MI) const { + return getInstSizeInBytes(MI); + } + /// Return true if it's profitable to unpredicate /// one side of a 'diamond', i.e. two sides of if-else predicated on mutually /// exclusive predicates. @@ -1558,8 +1620,7 @@ public: /// function. virtual bool areMemAccessesTriviallyDisjoint(const MachineInstr &MIa, - const MachineInstr &MIb, - AliasAnalysis *AA = nullptr) const { + const MachineInstr &MIb) const { assert((MIa.mayLoad() || MIa.mayStore()) && "MIa must load from or modify a memory location"); assert((MIb.mayLoad() || MIb.mayStore()) && @@ -1636,6 +1697,28 @@ public: return false; } + /// During PHI eleimination lets target to make necessary checks and + /// insert the copy to the PHI destination register in a target specific + /// manner. + virtual MachineInstr *createPHIDestinationCopy( + MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, + const DebugLoc &DL, Register Src, Register Dst) const { + return BuildMI(MBB, InsPt, DL, get(TargetOpcode::COPY), Dst) + .addReg(Src); + } + + /// During PHI eleimination lets target to make necessary checks and + /// insert the copy to the PHI destination register in a target specific + /// manner. 
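Editorial sketch of a target-side implementation of the PipelinerLoopInfo interface introduced above. MyTargetPipelinerLoopInfo, LoopCompare and IVUpdate are invented names, and the Optional<bool> / SmallVectorImpl<MachineOperand> parameter types are assumed; all loop bookkeeping is elided.

class MyTargetPipelinerLoopInfo : public TargetInstrInfo::PipelinerLoopInfo {
  MachineInstr *LoopCompare = nullptr; // backedge compare, found by analysis
  MachineInstr *IVUpdate = nullptr;    // induction-variable update

public:
  bool shouldIgnoreForPipelining(const MachineInstr *MI) const override {
    // Never pipeline the loop-control instructions themselves.
    return MI == LoopCompare || MI == IVUpdate;
  }

  Optional<bool>
  createTripCountGreaterCondition(int TC, MachineBasicBlock &MBB,
                                  SmallVectorImpl<MachineOperand> &Cond) override {
    // With a compile-time trip count this could return KnownTC > TC directly;
    // otherwise emit a compare into MBB, describe it in Cond, and return None.
    return None;
  }

  void adjustTripCount(int TripCountAdjust) override { /* rewrite IVUpdate */ }
  void setPreheader(MachineBasicBlock *NewPreheader) override {}
  void disposed() override {}
};

// A target would hand back std::make_unique<MyTargetPipelinerLoopInfo>(...)
// (or nullptr on failure) from its analyzeLoopForPipelining override.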
+ virtual MachineInstr *createPHISourceCopy(MachineBasicBlock &MBB, + MachineBasicBlock::iterator InsPt, + const DebugLoc &DL, Register Src, + Register SrcSubReg, + Register Dst) const { + return BuildMI(MBB, InsPt, DL, get(TargetOpcode::COPY), Dst) + .addReg(Src, 0, SrcSubReg); + } + /// Returns a \p outliner::OutlinedFunction struct containing target-specific /// information for a set of outlining candidates. virtual outliner::OutlinedFunction getOutliningCandidateInfo( @@ -1691,6 +1774,11 @@ public: return false; } + /// Produce the expression describing the \p MI loading a value into + /// the parameter's forwarding register. + virtual Optional + describeLoadedValue(const MachineInstr &MI) const; + private: unsigned CallFrameSetupOpcode, CallFrameDestroyOpcode; unsigned CatchRetOpcode; diff --git a/include/llvm/CodeGen/TargetLowering.h b/include/llvm/CodeGen/TargetLowering.h index d5cca60bb1b2..a58fca7e73f5 100644 --- a/include/llvm/CodeGen/TargetLowering.h +++ b/include/llvm/CodeGen/TargetLowering.h @@ -28,7 +28,6 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" -#include "llvm/Analysis/LegacyDivergenceAnalysis.h" #include "llvm/CodeGen/DAGCombine.h" #include "llvm/CodeGen/ISDOpcodes.h" #include "llvm/CodeGen/RuntimeLibcalls.h" @@ -48,6 +47,7 @@ #include "llvm/IR/Instructions.h" #include "llvm/IR/Type.h" #include "llvm/MC/MCRegisterInfo.h" +#include "llvm/Support/Alignment.h" #include "llvm/Support/AtomicOrdering.h" #include "llvm/Support/Casting.h" #include "llvm/Support/ErrorHandling.h" @@ -72,8 +72,10 @@ class Constant; class FastISel; class FunctionLoweringInfo; class GlobalValue; +class GISelKnownBits; class IntrinsicInst; struct KnownBits; +class LegacyDivergenceAnalysis; class LLVMContext; class MachineBasicBlock; class MachineFunction; @@ -122,8 +124,7 @@ public: TypeLegal, // The target natively supports this type. TypePromoteInteger, // Replace this integer with a larger one. TypeExpandInteger, // Split this integer into two of half the size. - TypeSoftenFloat, // Convert this float to a same size integer type, - // if an operation is not supported in target HW. + TypeSoftenFloat, // Convert this float to a same size integer type. TypeExpandFloat, // Split this float into two of half the size. TypeScalarizeVector, // Replace this one-element vector with its element. TypeSplitVector, // Split this vector into two of half the size. @@ -284,7 +285,7 @@ public: /// a constant pool load whose address depends on the select condition. The /// parameter may be used to differentiate a select with FP compare from /// integer compare. - virtual bool reduceSelectOfFPConstantLoads(bool IsFPSetCC) const { + virtual bool reduceSelectOfFPConstantLoads(EVT CmpOpVT) const { return true; } @@ -539,6 +540,12 @@ public: return hasAndNotCompare(X); } + /// Return true if the target has a bit-test instruction: + /// (X & (1 << Y)) ==/!= 0 + /// This knowledge can be used to prevent breaking the pattern, + /// or creating it if it could be recognized. + virtual bool hasBitTest(SDValue X, SDValue Y) const { return false; } + /// There are two ways to clear extreme bits (either low or high): /// Mask: x & (-1 << y) (the instcombine canonical form) /// Shifts: x >> y << y @@ -571,6 +578,38 @@ public: return false; } + /// Given the pattern + /// (X & (C l>>/<< Y)) ==/!= 0 + /// return true if it should be transformed into: + /// ((X <> Y) & C) ==/!= 0 + /// WARNING: if 'X' is a constant, the fold may deadlock! 
+ /// FIXME: we could avoid passing XC, but we can't use isConstOrConstSplat() + /// here because it can end up being not linked in. + virtual bool shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd( + SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y, + unsigned OldShiftOpcode, unsigned NewShiftOpcode, + SelectionDAG &DAG) const { + if (hasBitTest(X, Y)) { + // One interesting pattern that we'd want to form is 'bit test': + // ((1 << Y) & C) ==/!= 0 + // But we also need to be careful not to try to reverse that fold. + + // Is this '1 << Y' ? + if (OldShiftOpcode == ISD::SHL && CC->isOne()) + return false; // Keep the 'bit test' pattern. + + // Will it be '1 << Y' after the transform ? + if (XC && NewShiftOpcode == ISD::SHL && XC->isOne()) + return true; // Do form the 'bit test' pattern. + } + + // If 'X' is a constant, and we transform, then we will immediately + // try to undo the fold, thus causing endless combine loop. + // So by default, let's assume everyone prefers the fold + // iff 'X' is not a constant. + return !XC; + } + /// These two forms are equivalent: /// sub %y, (xor %x, -1) /// add (add %x, 1), %y @@ -798,9 +837,9 @@ public: PointerUnion ptrVal; int offset = 0; // offset off of ptrVal - unsigned size = 0; // the size of the memory location + uint64_t size = 0; // the size of the memory location // (taken from memVT if zero) - unsigned align = 1; // alignment + MaybeAlign align = Align::None(); // alignment MachineMemOperand::Flags flags = MachineMemOperand::MONone; IntrinsicInfo() = default; @@ -884,6 +923,7 @@ public: case ISD::SMULFIX: case ISD::SMULFIXSAT: case ISD::UMULFIX: + case ISD::UMULFIXSAT: Supported = isSupportedFixedPointOperation(Op, VT, Scale); break; } @@ -891,6 +931,8 @@ public: return Supported ? Action : Expand; } + // If Op is a strict floating-point operation, return the result + // of getOperationAction for the equivalent non-strict operation. LegalizeAction getStrictFPOperationAction(unsigned Op, EVT VT) const { unsigned EqOpc; switch (Op) { @@ -911,26 +953,25 @@ public: case ISD::STRICT_FLOG: EqOpc = ISD::FLOG; break; case ISD::STRICT_FLOG10: EqOpc = ISD::FLOG10; break; case ISD::STRICT_FLOG2: EqOpc = ISD::FLOG2; break; + case ISD::STRICT_LRINT: EqOpc = ISD::LRINT; break; + case ISD::STRICT_LLRINT: EqOpc = ISD::LLRINT; break; case ISD::STRICT_FRINT: EqOpc = ISD::FRINT; break; case ISD::STRICT_FNEARBYINT: EqOpc = ISD::FNEARBYINT; break; case ISD::STRICT_FMAXNUM: EqOpc = ISD::FMAXNUM; break; case ISD::STRICT_FMINNUM: EqOpc = ISD::FMINNUM; break; case ISD::STRICT_FCEIL: EqOpc = ISD::FCEIL; break; case ISD::STRICT_FFLOOR: EqOpc = ISD::FFLOOR; break; + case ISD::STRICT_LROUND: EqOpc = ISD::LROUND; break; + case ISD::STRICT_LLROUND: EqOpc = ISD::LLROUND; break; case ISD::STRICT_FROUND: EqOpc = ISD::FROUND; break; case ISD::STRICT_FTRUNC: EqOpc = ISD::FTRUNC; break; + case ISD::STRICT_FP_TO_SINT: EqOpc = ISD::FP_TO_SINT; break; + case ISD::STRICT_FP_TO_UINT: EqOpc = ISD::FP_TO_UINT; break; case ISD::STRICT_FP_ROUND: EqOpc = ISD::FP_ROUND; break; case ISD::STRICT_FP_EXTEND: EqOpc = ISD::FP_EXTEND; break; } - auto Action = getOperationAction(EqOpc, VT); - - // We don't currently handle Custom or Promote for strict FP pseudo-ops. - // For now, we just expand for those cases. 
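Editorial worked example for the hoisting hook above, on 8-bit values; sameBit is an invented checker, not part of the patch. With C = 0x80 and a logical shift right of the constant, (X & (0x80 l>> Y)) != 0 tests bit (7 - Y) of X, and the rewritten form ((X << Y) & 0x80) != 0 tests the same bit with the constant un-shifted. The default implementation keeps an existing '1 << Y' bit-test shape, forms one when the transform would create it, and otherwise folds only when X is not a constant, to avoid an endless combine loop.

constexpr bool sameBit(unsigned X, unsigned Y) {
  return ((X & (0x80u >> Y)) != 0) == (((X << Y) & 0x80u) != 0);
}
static_assert(sameBit(0x40, 1) && sameBit(0x01, 7) && sameBit(0xff, 3),
              "both forms test the same bit of X");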
- if (Action != Legal) - Action = Expand; - - return Action; + return getOperationAction(EqOpc, VT); } /// Return true if the specified operation is legal on this target or can be @@ -1206,7 +1247,7 @@ public: EltTy = PointerTy.getTypeForEVT(Ty->getContext()); } return EVT::getVectorVT(Ty->getContext(), EVT::getEVT(EltTy, false), - VTy->getNumElements()); + VTy->getElementCount()); } return EVT::getEVT(Ty, AllowUnknown); @@ -1316,9 +1357,9 @@ public: /// Certain targets have context senstive alignment requirements, where one /// type has the alignment requirement of another type. - virtual unsigned getABIAlignmentForCallingConv(Type *ArgTy, - DataLayout DL) const { - return DL.getABITypeAlignment(ArgTy); + virtual Align getABIAlignmentForCallingConv(Type *ArgTy, + DataLayout DL) const { + return Align(DL.getABITypeAlignment(ArgTy)); } /// If true, then instruction selection should seek to shrink the FP constant @@ -1426,11 +1467,38 @@ public: return false; } + /// LLT handling variant. + virtual bool allowsMisalignedMemoryAccesses( + LLT, unsigned AddrSpace = 0, unsigned Align = 1, + MachineMemOperand::Flags Flags = MachineMemOperand::MONone, + bool * /*Fast*/ = nullptr) const { + return false; + } + + /// This function returns true if the memory access is aligned or if the + /// target allows this specific unaligned memory access. If the access is + /// allowed, the optional final parameter returns if the access is also fast + /// (as defined by the target). + bool allowsMemoryAccessForAlignment( + LLVMContext &Context, const DataLayout &DL, EVT VT, + unsigned AddrSpace = 0, unsigned Alignment = 1, + MachineMemOperand::Flags Flags = MachineMemOperand::MONone, + bool *Fast = nullptr) const; + + /// Return true if the memory access of this type is aligned or if the target + /// allows this specific unaligned access for the given MachineMemOperand. + /// If the access is allowed, the optional final parameter returns if the + /// access is also fast (as defined by the target). + bool allowsMemoryAccessForAlignment(LLVMContext &Context, + const DataLayout &DL, EVT VT, + const MachineMemOperand &MMO, + bool *Fast = nullptr) const; + /// Return true if the target supports a memory access of this type for the /// given address space and alignment. If the access is allowed, the optional /// final parameter returns if the access is also fast (as defined by the /// target). - bool + virtual bool allowsMemoryAccess(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace = 0, unsigned Alignment = 1, MachineMemOperand::Flags Flags = MachineMemOperand::MONone, @@ -1463,6 +1531,16 @@ public: return MVT::Other; } + + /// LLT returning variant. + virtual LLT + getOptimalMemOpLLT(uint64_t /*Size*/, unsigned /*DstAlign*/, + unsigned /*SrcAlign*/, bool /*IsMemset*/, + bool /*ZeroMemset*/, bool /*MemcpyStrSrc*/, + const AttributeList & /*FuncAttributes*/) const { + return LLT(); + } + /// Returns true if it's safe to use load / store of the specified type to /// expand memcpy / memset inline. /// @@ -1522,35 +1600,19 @@ public: report_fatal_error("Funclet EH is not implemented for this target"); } - /// Returns the target's jmp_buf size in bytes (if never set, the default is - /// 200) - unsigned getJumpBufSize() const { - return JumpBufSize; - } - - /// Returns the target's jmp_buf alignment in bytes (if never set, the default - /// is 0) - unsigned getJumpBufAlignment() const { - return JumpBufAlignment; - } - /// Return the minimum stack alignment of an argument. 
- unsigned getMinStackArgumentAlignment() const { + Align getMinStackArgumentAlignment() const { return MinStackArgumentAlignment; } /// Return the minimum function alignment. - unsigned getMinFunctionAlignment() const { - return MinFunctionAlignment; - } + Align getMinFunctionAlignment() const { return MinFunctionAlignment; } /// Return the preferred function alignment. - unsigned getPrefFunctionAlignment() const { - return PrefFunctionAlignment; - } + Align getPrefFunctionAlignment() const { return PrefFunctionAlignment; } /// Return the preferred loop alignment. - virtual unsigned getPrefLoopAlignment(MachineLoop *ML = nullptr) const { + virtual Align getPrefLoopAlignment(MachineLoop *ML = nullptr) const { return PrefLoopAlignment; } @@ -1772,6 +1834,11 @@ public: return IsSigned; } + /// Returns true if arguments should be extended in lib calls. + virtual bool shouldExtendTypeInLibCall(EVT Type) const { + return true; + } + /// Returns how the given (atomic) load should be expanded by the /// IR-level AtomicExpand pass. virtual AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const { @@ -1848,7 +1915,8 @@ public: /// This may be true if the target does not directly support the /// multiplication operation for the specified type or the sequence of simpler /// ops is faster than the multiply. - virtual bool decomposeMulByConstant(EVT VT, SDValue C) const { + virtual bool decomposeMulByConstant(LLVMContext &Context, + EVT VT, SDValue C) const { return false; } @@ -2056,40 +2124,25 @@ protected: TargetDAGCombineArray[NT >> 3] |= 1 << (NT&7); } - /// Set the target's required jmp_buf buffer size (in bytes); default is 200 - void setJumpBufSize(unsigned Size) { - JumpBufSize = Size; - } - - /// Set the target's required jmp_buf buffer alignment (in bytes); default is - /// 0 - void setJumpBufAlignment(unsigned Align) { - JumpBufAlignment = Align; - } - - /// Set the target's minimum function alignment (in log2(bytes)) - void setMinFunctionAlignment(unsigned Align) { - MinFunctionAlignment = Align; + /// Set the target's minimum function alignment. + void setMinFunctionAlignment(Align Alignment) { + MinFunctionAlignment = Alignment; } /// Set the target's preferred function alignment. This should be set if - /// there is a performance benefit to higher-than-minimum alignment (in - /// log2(bytes)) - void setPrefFunctionAlignment(unsigned Align) { - PrefFunctionAlignment = Align; + /// there is a performance benefit to higher-than-minimum alignment + void setPrefFunctionAlignment(Align Alignment) { + PrefFunctionAlignment = Alignment; } - /// Set the target's preferred loop alignment. Default alignment is zero, it - /// means the target does not care about loop alignment. The alignment is - /// specified in log2(bytes). The target may also override - /// getPrefLoopAlignment to provide per-loop values. - void setPrefLoopAlignment(unsigned Align) { - PrefLoopAlignment = Align; - } + /// Set the target's preferred loop alignment. Default alignment is one, it + /// means the target does not care about loop alignment. The target may also + /// override getPrefLoopAlignment to provide per-loop values. + void setPrefLoopAlignment(Align Alignment) { PrefLoopAlignment = Alignment; } - /// Set the minimum stack alignment of an argument (in log2(bytes)). - void setMinStackArgumentAlignment(unsigned Align) { - MinStackArgumentAlignment = Align; + /// Set the minimum stack alignment of an argument. 
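Editorial sketch: since these hooks now take an Align rather than a log2(bytes) value, a target's TargetLowering constructor migrates roughly as below. The particular byte values and the old log2 arguments shown in the comments are illustrative only.

// Hypothetical MyTargetLowering constructor body:
setMinFunctionAlignment(Align(4));       // was setMinFunctionAlignment(2)
setPrefFunctionAlignment(Align(16));     // was setPrefFunctionAlignment(4)
setPrefLoopAlignment(Align(16));         // was setPrefLoopAlignment(4)
setMinStackArgumentAlignment(Align(8));  // was setMinStackArgumentAlignment(3)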
+ void setMinStackArgumentAlignment(Align Alignment) { + MinStackArgumentAlignment = Alignment; } /// Set the maximum atomic operation size supported by the @@ -2555,6 +2608,12 @@ public: // same blocks of its users. virtual bool shouldConsiderGEPOffsetSplit() const { return false; } + // Return the shift amount threshold for profitable transforms into shifts. + // Transforms creating shifts above the returned value will be avoided. + virtual unsigned getShiftAmountThreshold(EVT VT) const { + return VT.getScalarSizeInBits(); + } + //===--------------------------------------------------------------------===// // Runtime Library hooks // @@ -2650,25 +2709,19 @@ private: /// register usage. Sched::Preference SchedPreferenceInfo; - /// The size, in bytes, of the target's jmp_buf buffers - unsigned JumpBufSize; - - /// The alignment, in bytes, of the target's jmp_buf buffers - unsigned JumpBufAlignment; - /// The minimum alignment that any argument on the stack needs to have. - unsigned MinStackArgumentAlignment; + Align MinStackArgumentAlignment; /// The minimum function alignment (used when optimizing for size, and to /// prevent explicitly provided alignment from leading to incorrect code). - unsigned MinFunctionAlignment; + Align MinFunctionAlignment; /// The preferred function alignment (used when alignment unspecified and /// optimizing for speed). - unsigned PrefFunctionAlignment; + Align PrefFunctionAlignment; - /// The preferred loop alignment. - unsigned PrefLoopAlignment; + /// The preferred loop alignment (in log2 bot in bytes). + Align PrefLoopAlignment; /// Size in bits of the maximum atomics size the backend supports. /// Accesses larger than this will be expanded by AtomicExpandPass. @@ -2744,7 +2797,6 @@ private: /// up the MVT::LAST_VALUETYPE value to the next multiple of 8. uint32_t CondCodeActions[ISD::SETCC_INVALID][(MVT::LAST_VALUETYPE + 7) / 8]; -protected: ValueTypeActionImpl ValueTypeActions; private: @@ -2790,7 +2842,7 @@ protected: /// expected to be merged. unsigned GatherAllAliasesMaxDepth; - /// Specify maximum number of store instructions per memset call. + /// \brief Specify maximum number of store instructions per memset call. /// /// When lowering \@llvm.memset this field specifies the maximum number of /// store operations that may be substituted for the call to memset. Targets @@ -2801,12 +2853,10 @@ protected: /// with 16-bit alignment would result in four 2-byte stores and one 1-byte /// store. This only applies to setting a constant array of a constant size. unsigned MaxStoresPerMemset; - - /// Maximum number of stores operations that may be substituted for the call - /// to memset, used for functions with OptSize attribute. + /// Likewise for functions with the OptSize attribute. unsigned MaxStoresPerMemsetOptSize; - /// Specify maximum bytes of store instructions per memcpy call. + /// \brief Specify maximum number of store instructions per memcpy call. /// /// When lowering \@llvm.memcpy this field specifies the maximum number of /// store operations that may be substituted for a call to memcpy. Targets @@ -2818,8 +2868,8 @@ protected: /// and one 1-byte store. This only applies to copying a constant array of /// constant size. unsigned MaxStoresPerMemcpy; - - + /// Likewise for functions with the OptSize attribute. + unsigned MaxStoresPerMemcpyOptSize; /// \brief Specify max number of store instructions to glue in inlined memcpy. 
/// /// When memcpy is inlined based on MaxStoresPerMemcpy, specify maximum number @@ -2827,13 +2877,22 @@ protected: // vectorization later on. unsigned MaxGluedStoresPerMemcpy = 0; - /// Maximum number of store operations that may be substituted for a call to - /// memcpy, used for functions with OptSize attribute. - unsigned MaxStoresPerMemcpyOptSize; + /// \brief Specify maximum number of load instructions per memcmp call. + /// + /// When lowering \@llvm.memcmp this field specifies the maximum number of + /// pairs of load operations that may be substituted for a call to memcmp. + /// Targets must set this value based on the cost threshold for that target. + /// Targets should assume that the memcmp will be done using as many of the + /// largest load operations first, followed by smaller ones, if necessary, per + /// alignment restrictions. For example, loading 7 bytes on a 32-bit machine + /// with 32-bit alignment would result in one 4-byte load, a one 2-byte load + /// and one 1-byte load. This only applies to copying a constant array of + /// constant size. unsigned MaxLoadsPerMemcmp; + /// Likewise for functions with the OptSize attribute. unsigned MaxLoadsPerMemcmpOptSize; - /// Specify maximum bytes of store instructions per memmove call. + /// \brief Specify maximum number of store instructions per memmove call. /// /// When lowering \@llvm.memmove this field specifies the maximum number of /// store instructions that may be substituted for a call to memmove. Targets @@ -2844,9 +2903,7 @@ protected: /// with 8-bit alignment would result in nine 1-byte stores. This only /// applies to copying a constant array of constant size. unsigned MaxStoresPerMemmove; - - /// Maximum number of store instructions that may be substituted for a call to - /// memmove, used for functions with OptSize attribute. + /// Likewise for functions with the OptSize attribute. unsigned MaxStoresPerMemmoveOptSize; /// Tells the code generator that select is more expensive than a branch if @@ -2885,6 +2942,7 @@ protected: class TargetLowering : public TargetLoweringBase { public: struct DAGCombinerInfo; + struct MakeLibCallOptions; TargetLowering(const TargetLowering &) = delete; TargetLowering &operator=(const TargetLowering &) = delete; @@ -2925,6 +2983,14 @@ public: return false; } + /// Returns true if the specified base+offset is a legal indexed addressing + /// mode for this target. \p MI is the load or store instruction that is being + /// considered for transformation. + virtual bool isIndexingLegal(MachineInstr &MI, Register Base, Register Offset, + bool IsPre, MachineRegisterInfo &MRI) const { + return false; + } + /// Return the entry encoding for a jump table in the current function. The /// returned value is a member of the MachineJumpTableInfo::JTEntryKind enum. virtual unsigned getJumpTableEncoding() const; @@ -2955,14 +3021,15 @@ public: void softenSetCCOperands(SelectionDAG &DAG, EVT VT, SDValue &NewLHS, SDValue &NewRHS, ISD::CondCode &CCCode, - const SDLoc &DL) const; + const SDLoc &DL, const SDValue OldLHS, + const SDValue OldRHS) const; /// Returns a pair of (return value, chain). /// It is an error to pass RTLIB::UNKNOWN_LIBCALL as \p LC. 
- std::pair makeLibCall( - SelectionDAG &DAG, RTLIB::Libcall LC, EVT RetVT, ArrayRef Ops, - bool isSigned, const SDLoc &dl, bool doesNotReturn = false, - bool isReturnValueUsed = true, bool isPostTypeLegalization = false) const; + std::pair makeLibCall(SelectionDAG &DAG, RTLIB::Libcall LC, + EVT RetVT, ArrayRef Ops, + MakeLibCallOptions CallOptions, + const SDLoc &dl) const; /// Check whether parameters to a call that are passed in callee saved /// registers are the same as from the calling function. This needs to be @@ -3065,6 +3132,14 @@ public: bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedMask, DAGCombinerInfo &DCI) const; + /// More limited version of SimplifyDemandedBits that can be used to "look + /// through" ops that don't contribute to the DemandedBits/DemandedElts - + /// bitwise ops etc. + SDValue SimplifyMultipleUseDemandedBits(SDValue Op, const APInt &DemandedBits, + const APInt &DemandedElts, + SelectionDAG &DAG, + unsigned Depth) const; + /// Look at Vector Op. At this point, we know that only the DemandedElts /// elements of the result of Op are ever used downstream. If we can use /// this information to simplify Op, create a new simplified DAG node and @@ -3099,6 +3174,15 @@ public: const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth = 0) const; + /// Determine which of the bits specified in Mask are known to be either zero + /// or one and return them in the KnownZero/KnownOne bitsets. The DemandedElts + /// argument allows us to only collect the known bits that are shared by the + /// requested vector elements. This is for GISel. + virtual void computeKnownBitsForTargetInstr(GISelKnownBits &Analysis, + Register R, KnownBits &Known, + const APInt &DemandedElts, + const MachineRegisterInfo &MRI, + unsigned Depth = 0) const; /// Determine which of the bits of FrameIndex \p FIOp are known to be 0. /// Default implementation computes low bits based on alignment @@ -3139,6 +3223,21 @@ public: TargetLoweringOpt &TLO, unsigned Depth = 0) const; + /// More limited version of SimplifyDemandedBits that can be used to "look + /// through" ops that don't contribute to the DemandedBits/DemandedElts - + /// bitwise ops etc. + virtual SDValue SimplifyMultipleUseDemandedBitsForTargetNode( + SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, + SelectionDAG &DAG, unsigned Depth) const; + + /// Tries to build a legal vector shuffle using the provided parameters + /// or equivalent variations. The Mask argument maybe be modified as the + /// function tries different variations. + /// Returns an empty SDValue if the operation fails. + SDValue buildLegalVectorShuffle(EVT VT, const SDLoc &DL, SDValue N0, + SDValue N1, MutableArrayRef Mask, + SelectionDAG &DAG) const; + /// This method returns the constant pool value that will be loaded by LD. /// NOTE: You must check for implicit extensions of the constant by LD. virtual const Constant *getTargetConstantFromLoad(LoadSDNode *LD) const; @@ -3174,6 +3273,8 @@ public: SDValue CombineTo(SDNode *N, SDValue Res, bool AddTo = true); SDValue CombineTo(SDNode *N, SDValue Res0, SDValue Res1, bool AddTo = true); + bool recursivelyDeleteUnusedNodes(SDNode *N); + void CommitTargetLoweringOpt(const TargetLoweringOpt &TLO); }; @@ -3297,6 +3398,18 @@ public: llvm_unreachable("Not Implemented"); } + /// Return 1 if we can compute the negated form of the specified expression + /// for the same cost as the expression itself, or 2 if we can compute the + /// negated form more cheaply than the expression itself. 
Else return 0. + virtual char isNegatibleForFree(SDValue Op, SelectionDAG &DAG, + bool LegalOperations, bool ForCodeSize, + unsigned Depth = 0) const; + + /// If isNegatibleForFree returns true, return the newly negated expression. + virtual SDValue getNegatedExpression(SDValue Op, SelectionDAG &DAG, + bool LegalOperations, bool ForCodeSize, + unsigned Depth = 0) const; + //===--------------------------------------------------------------------===// // Lowering methods - These methods must be implemented by targets so that // the SelectionDAGBuilder code knows how to lower these. @@ -3468,6 +3581,51 @@ public: } }; + /// This structure is used to pass arguments to makeLibCall function. + struct MakeLibCallOptions { + // By passing type list before soften to makeLibCall, the target hook + // shouldExtendTypeInLibCall can get the original type before soften. + ArrayRef OpsVTBeforeSoften; + EVT RetVTBeforeSoften; + bool IsSExt : 1; + bool DoesNotReturn : 1; + bool IsReturnValueUsed : 1; + bool IsPostTypeLegalization : 1; + bool IsSoften : 1; + + MakeLibCallOptions() + : IsSExt(false), DoesNotReturn(false), IsReturnValueUsed(true), + IsPostTypeLegalization(false), IsSoften(false) {} + + MakeLibCallOptions &setSExt(bool Value = true) { + IsSExt = Value; + return *this; + } + + MakeLibCallOptions &setNoReturn(bool Value = true) { + DoesNotReturn = Value; + return *this; + } + + MakeLibCallOptions &setDiscardResult(bool Value = true) { + IsReturnValueUsed = !Value; + return *this; + } + + MakeLibCallOptions &setIsPostTypeLegalization(bool Value = true) { + IsPostTypeLegalization = Value; + return *this; + } + + MakeLibCallOptions &setTypeListBeforeSoften(ArrayRef OpsVT, EVT RetVT, + bool Value = true) { + OpsVTBeforeSoften = OpsVT; + RetVTBeforeSoften = RetVT; + IsSoften = Value; + return *this; + } + }; + /// This function lowers an abstract call to a function into an actual call. /// This returns a pair of operands. The first element is the return value /// for the function (if RetTy is not VoidTy). The second element is the @@ -3537,8 +3695,8 @@ public: /// Return the register ID of the name passed in. Used by named register /// global variables extension. There is no target-independent behaviour /// so the default action is to bail. - virtual unsigned getRegisterByName(const char* RegName, EVT VT, - SelectionDAG &DAG) const { + virtual Register getRegisterByName(const char* RegName, EVT VT, + const MachineFunction &MF) const { report_fatal_error("Named registers not implemented for this target"); } @@ -3597,6 +3755,25 @@ public: return MachineMemOperand::MONone; } + /// Should SelectionDAG lower an atomic store of the given kind as a normal + /// StoreSDNode (as opposed to an AtomicSDNode)? NOTE: The intention is to + /// eventually migrate all targets to the using StoreSDNodes, but porting is + /// being done target at a time. + virtual bool lowerAtomicStoreAsStoreSDNode(const StoreInst &SI) const { + assert(SI.isAtomic() && "violated precondition"); + return false; + } + + /// Should SelectionDAG lower an atomic load of the given kind as a normal + /// LoadSDNode (as opposed to an AtomicSDNode)? NOTE: The intention is to + /// eventually migrate all targets to the using LoadSDNodes, but porting is + /// being done target at a time. 
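Editorial sketch: the MakeLibCallOptions builder above replaces makeLibCall's previous list of boolean parameters. In the call site below, DAG, dl, TLI, the operand list Ops, its pre-softening type list OpsVT, and the choice of RTLIB::ADD_F64 are all illustrative assumptions.

TargetLowering::MakeLibCallOptions CallOptions;
CallOptions.setSExt(true)                       // sign-extend integer operands
    .setTypeListBeforeSoften(OpsVT, MVT::f64);  // original types, consulted by
                                                // shouldExtendTypeInLibCall
std::pair<SDValue, SDValue> CallResult =
    TLI.makeLibCall(DAG, RTLIB::ADD_F64, MVT::f64, Ops, CallOptions, dl);
SDValue Value = CallResult.first;   // the libcall's result
SDValue Chain = CallResult.second;  // the output chain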
+ virtual bool lowerAtomicLoadAsLoadSDNode(const LoadInst &LI) const { + assert(LI.isAtomic() && "violated precondition"); + return false; + } + + /// This callback is invoked by the type legalizer to legalize nodes with an /// illegal operand type but legal result types. It replaces the /// LowerOperation callback in the type Legalizer. The reason we can not do @@ -3665,6 +3842,7 @@ public: C_Register, // Constraint represents specific register(s). C_RegisterClass, // Constraint represents any of register(s) in class. C_Memory, // Memory constraint. + C_Immediate, // Requires an immediate. C_Other, // Something else. C_Unknown // Unsupported constraint. }; @@ -3905,7 +4083,7 @@ public: /// \param N Node to expand /// \param Result output after conversion /// \returns True, if the expansion was successful, false otherwise - bool expandFP_TO_UINT(SDNode *N, SDValue &Result, SelectionDAG &DAG) const; + bool expandFP_TO_UINT(SDNode *N, SDValue &Result, SDValue &Chain, SelectionDAG &DAG) const; /// Expand UINT(i64) to double(f64) conversion /// \param N Node to expand @@ -3986,8 +4164,8 @@ public: /// method accepts integers as its arguments. SDValue expandAddSubSat(SDNode *Node, SelectionDAG &DAG) const; - /// Method for building the DAG expansion of ISD::SMULFIX. This method accepts - /// integers as its arguments. + /// Method for building the DAG expansion of ISD::[U|S]MULFIX[SAT]. This + /// method accepts integers as its arguments. SDValue expandFixedPointMul(SDNode *Node, SelectionDAG &DAG) const; /// Method for building the DAG expansion of ISD::U(ADD|SUB)O. Expansion @@ -4070,6 +4248,11 @@ private: DAGCombinerInfo &DCI, const SDLoc &DL) const; + // (X & (C l>>/<< Y)) ==/!= 0 --> ((X <> Y) & C) ==/!= 0 + SDValue optimizeSetCCByHoistingAndByConstFromLogicalShift( + EVT SCCVT, SDValue N0, SDValue N1C, ISD::CondCode Cond, + DAGCombinerInfo &DCI, const SDLoc &DL) const; + SDValue prepareUREMEqFold(EVT SETCCVT, SDValue REMNode, SDValue CompTargetNode, ISD::CondCode Cond, DAGCombinerInfo &DCI, const SDLoc &DL, @@ -4077,6 +4260,14 @@ private: SDValue buildUREMEqFold(EVT SETCCVT, SDValue REMNode, SDValue CompTargetNode, ISD::CondCode Cond, DAGCombinerInfo &DCI, const SDLoc &DL) const; + + SDValue prepareSREMEqFold(EVT SETCCVT, SDValue REMNode, + SDValue CompTargetNode, ISD::CondCode Cond, + DAGCombinerInfo &DCI, const SDLoc &DL, + SmallVectorImpl &Created) const; + SDValue buildSREMEqFold(EVT SETCCVT, SDValue REMNode, SDValue CompTargetNode, + ISD::CondCode Cond, DAGCombinerInfo &DCI, + const SDLoc &DL) const; }; /// Given an LLVM IR type and return type attributes, compute the return value diff --git a/include/llvm/CodeGen/TargetLoweringObjectFileImpl.h b/include/llvm/CodeGen/TargetLoweringObjectFileImpl.h index a1fb81cb009d..59f5ddbd9dac 100644 --- a/include/llvm/CodeGen/TargetLoweringObjectFileImpl.h +++ b/include/llvm/CodeGen/TargetLoweringObjectFileImpl.h @@ -14,6 +14,7 @@ #ifndef LLVM_CODEGEN_TARGETLOWERINGOBJECTFILEIMPL_H #define LLVM_CODEGEN_TARGETLOWERINGOBJECTFILEIMPL_H +#include "llvm/BinaryFormat/XCOFF.h" #include "llvm/IR/Module.h" #include "llvm/MC/MCExpr.h" #include "llvm/Target/TargetLoweringObjectFile.h" @@ -35,7 +36,7 @@ class TargetLoweringObjectFileELF : public TargetLoweringObjectFile { protected: MCSymbolRefExpr::VariantKind PLTRelativeVariantKind = MCSymbolRefExpr::VK_None; - const TargetMachine *TM; + const TargetMachine *TM = nullptr; public: TargetLoweringObjectFileELF() = default; @@ -126,7 +127,8 @@ public: MachineModuleInfo *MMI) const override; /// Get MachO PC 
relative GOT entry relocation - const MCExpr *getIndirectSymViaGOTPCRel(const MCSymbol *Sym, + const MCExpr *getIndirectSymViaGOTPCRel(const GlobalValue *GV, + const MCSymbol *Sym, const MCValue &MV, int64_t Offset, MachineModuleInfo *MMI, MCStreamer &Streamer) const override; @@ -206,6 +208,34 @@ public: const TargetMachine &TM) const override; }; +class TargetLoweringObjectFileXCOFF : public TargetLoweringObjectFile { +public: + TargetLoweringObjectFileXCOFF() = default; + ~TargetLoweringObjectFileXCOFF() override = default; + + void Initialize(MCContext &Ctx, const TargetMachine &TM) override; + + bool shouldPutJumpTableInFunctionSection(bool UsesLabelDifference, + const Function &F) const override; + + MCSection *getExplicitSectionGlobal(const GlobalObject *GO, SectionKind Kind, + const TargetMachine &TM) const override; + + MCSection *getStaticCtorSection(unsigned Priority, + const MCSymbol *KeySym) const override; + MCSection *getStaticDtorSection(unsigned Priority, + const MCSymbol *KeySym) const override; + + const MCExpr *lowerRelativeReference(const GlobalValue *LHS, + const GlobalValue *RHS, + const TargetMachine &TM) const override; + + MCSection *SelectSectionForGlobal(const GlobalObject *GO, SectionKind Kind, + const TargetMachine &TM) const override; + + static XCOFF::StorageClass getStorageClassForGlobal(const GlobalObject *GO); +}; + } // end namespace llvm #endif // LLVM_CODEGEN_TARGETLOWERINGOBJECTFILEIMPL_H diff --git a/include/llvm/CodeGen/TargetPassConfig.h b/include/llvm/CodeGen/TargetPassConfig.h index 0bd82aafac37..d48fc664c1c3 100644 --- a/include/llvm/CodeGen/TargetPassConfig.h +++ b/include/llvm/CodeGen/TargetPassConfig.h @@ -280,7 +280,7 @@ public: /// /// This can also be used to plug a new MachineSchedStrategy into an instance /// of the standard ScheduleDAGMI: - /// return new ScheduleDAGMI(C, make_unique(C), /*RemoveKillFlags=*/false) + /// return new ScheduleDAGMI(C, std::make_unique(C), /*RemoveKillFlags=*/false) /// /// Return NULL to select the default (generic) machine scheduler. virtual ScheduleDAGInstrs * diff --git a/include/llvm/CodeGen/TargetRegisterInfo.h b/include/llvm/CodeGen/TargetRegisterInfo.h index ddbd677b3eaa..c42ca3ad6eb9 100644 --- a/include/llvm/CodeGen/TargetRegisterInfo.h +++ b/include/llvm/CodeGen/TargetRegisterInfo.h @@ -87,11 +87,20 @@ public: /// Return true if the specified register is included in this register class. /// This does not include virtual registers. bool contains(unsigned Reg) const { + /// FIXME: Historically this function has returned false when given vregs + /// but it should probably only receive physical registers + if (!Register::isPhysicalRegister(Reg)) + return false; return MC->contains(Reg); } /// Return true if both registers are in this class. bool contains(unsigned Reg1, unsigned Reg2) const { + /// FIXME: Historically this function has returned false when given a vregs + /// but it should probably only receive physical registers + if (!Register::isPhysicalRegister(Reg1) || + !Register::isPhysicalRegister(Reg2)) + return false; return MC->contains(Reg1, Reg2); } @@ -258,57 +267,6 @@ public: // Further sentinels can be allocated from the small negative integers. // DenseMapInfo uses -1u and -2u. - /// isStackSlot - Sometimes it is useful the be able to store a non-negative - /// frame index in a variable that normally holds a register. isStackSlot() - /// returns true if Reg is in the range used for stack slots. 
- /// - /// Note that isVirtualRegister() and isPhysicalRegister() cannot handle stack - /// slots, so if a variable may contains a stack slot, always check - /// isStackSlot() first. - /// - static bool isStackSlot(unsigned Reg) { - return int(Reg) >= (1 << 30); - } - - /// Compute the frame index from a register value representing a stack slot. - static int stackSlot2Index(unsigned Reg) { - assert(isStackSlot(Reg) && "Not a stack slot"); - return int(Reg - (1u << 30)); - } - - /// Convert a non-negative frame index to a stack slot register value. - static unsigned index2StackSlot(int FI) { - assert(FI >= 0 && "Cannot hold a negative frame index."); - return FI + (1u << 30); - } - - /// Return true if the specified register number is in - /// the physical register namespace. - static bool isPhysicalRegister(unsigned Reg) { - assert(!isStackSlot(Reg) && "Not a register! Check isStackSlot() first."); - return int(Reg) > 0; - } - - /// Return true if the specified register number is in - /// the virtual register namespace. - static bool isVirtualRegister(unsigned Reg) { - assert(!isStackSlot(Reg) && "Not a register! Check isStackSlot() first."); - return int(Reg) < 0; - } - - /// Convert a virtual register number to a 0-based index. - /// The first virtual register in a function will get the index 0. - static unsigned virtReg2Index(unsigned Reg) { - assert(isVirtualRegister(Reg) && "Not a virtual register"); - return Reg & ~(1u << 31); - } - - /// Convert a 0-based index to a virtual register number. - /// This is the inverse operation of VirtReg2IndexFunctor below. - static unsigned index2VirtReg(unsigned Index) { - return Index | (1u << 31); - } - /// Return the size in bits of a register from class RC. unsigned getRegSizeInBits(const TargetRegisterClass &RC) const { return getRegClassInfo(RC).RegSize; @@ -419,9 +377,9 @@ public: /// Returns true if the two registers are equal or alias each other. /// The registers may be virtual registers. - bool regsOverlap(unsigned regA, unsigned regB) const { + bool regsOverlap(Register regA, Register regB) const { if (regA == regB) return true; - if (isVirtualRegister(regA) || isVirtualRegister(regB)) + if (regA.isVirtual() || regB.isVirtual()) return false; // Regunits are numerically ordered. Find a common unit. @@ -489,6 +447,14 @@ public: llvm_unreachable("target does not provide no preserved mask"); } + /// Return a list of all of the registers which are clobbered "inside" a call + /// to the given function. For example, these might be needed for PLT + /// sequences of long-branch veneers. + virtual ArrayRef + getIntraCallClobberedRegs(const MachineFunction *MF) const { + return {}; + } + /// Return true if all bits that are set in mask \p mask0 are also set in /// \p mask1. bool regmaskSubsetEqual(const uint32_t *mask0, const uint32_t *mask1) const; @@ -535,6 +501,11 @@ public: return false; } + /// This is a wrapper around getCallPreservedMask(). + /// Return true if the register is preserved after the call. + virtual bool isCalleeSavedPhysReg(unsigned PhysReg, + const MachineFunction &MF) const; + /// Prior to adding the live-out mask to a stackmap or patchpoint /// instruction, provide the target the opportunity to adjust it (mainly to /// remove pseudo-registers that should be ignored). @@ -709,13 +680,9 @@ public: /// Find the largest common subclass of A and B. /// Return NULL if there is no common subclass. - /// The common subclass should contain - /// simple value type SVT if it is not the Any type. 
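Editorial note: the static stack-slot and virtual-register helpers removed here now live on the llvm::Register class, which is why the contains()/regsOverlap() hunks above and VirtReg2IndexFunctor below go through it. A sketch of the call-site migration; vregIndexOrZero is a made-up example function.

// Before: TargetRegisterInfo::isVirtualRegister(Reg) / virtReg2Index(Reg).
// After:
unsigned vregIndexOrZero(Register R) {
  if (R.isVirtual())
    return Register::virtReg2Index(R); // 0-based index, inverse of index2VirtReg
  return 0;                            // physical register, stack slot, or no register
}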
const TargetRegisterClass * getCommonSubClass(const TargetRegisterClass *A, - const TargetRegisterClass *B, - const MVT::SimpleValueType SVT = - MVT::SimpleValueType::Any) const; + const TargetRegisterClass *B) const; /// Returns a TargetRegisterClass used for pointer values. /// If a target supports multiple different pointer register classes, @@ -1005,6 +972,13 @@ public: const MachineRegisterInfo &MRI) const { return nullptr; } + + /// Returns the physical register number of sub-register "Index" + /// for physical register RegNo. Return zero if the sub-register does not + /// exist. + inline Register getSubReg(MCRegister Reg, unsigned Idx) const { + return static_cast(this)->getSubReg(Reg, Idx); + } }; //===----------------------------------------------------------------------===// @@ -1156,7 +1130,7 @@ public: struct VirtReg2IndexFunctor { using argument_type = unsigned; unsigned operator()(unsigned Reg) const { - return TargetRegisterInfo::virtReg2Index(Reg); + return Register::virtReg2Index(Reg); } }; @@ -1170,7 +1144,7 @@ struct VirtReg2IndexFunctor { /// %physreg17 - a physical register when no TRI instance given. /// /// Usage: OS << printReg(Reg, TRI, SubRegIdx) << '\n'; -Printable printReg(unsigned Reg, const TargetRegisterInfo *TRI = nullptr, +Printable printReg(Register Reg, const TargetRegisterInfo *TRI = nullptr, unsigned SubIdx = 0, const MachineRegisterInfo *MRI = nullptr); diff --git a/include/llvm/CodeGen/TargetSubtargetInfo.h b/include/llvm/CodeGen/TargetSubtargetInfo.h index 037fc3ed3243..56018eca8c27 100644 --- a/include/llvm/CodeGen/TargetSubtargetInfo.h +++ b/include/llvm/CodeGen/TargetSubtargetInfo.h @@ -106,12 +106,10 @@ public: // us do things like a dedicated avx512 selector). However, we might want // to also specialize selectors by MachineFunction, which would let us be // aware of optsize/optnone and such. - virtual const InstructionSelector *getInstructionSelector() const { + virtual InstructionSelector *getInstructionSelector() const { return nullptr; } - virtual unsigned getHwMode() const { return 0; } - /// Target can subclass this hook to select a different DAG scheduler. virtual RegisterScheduler::FunctionPassCtor getDAGScheduler(CodeGenOpt::Level) const { @@ -274,6 +272,12 @@ public: /// scheduling, DAGCombine, etc.). virtual bool useAA() const; + /// \brief Sink addresses into blocks using GEP instructions rather than + /// pointer casts and arithmetic. + virtual bool addrSinkUsingGEPs() const { + return useAA(); + } + /// Enable the use of the early if conversion pass. virtual bool enableEarlyIfConversion() const { return false; } diff --git a/include/llvm/CodeGen/ValueTypes.h b/include/llvm/CodeGen/ValueTypes.h index c540c94f79d9..cd4c4ca64081 100644 --- a/include/llvm/CodeGen/ValueTypes.h +++ b/include/llvm/CodeGen/ValueTypes.h @@ -81,7 +81,7 @@ namespace llvm { /// Returns the EVT that represents a vector EC.Min elements in length, /// where each element is of type VT. 
- static EVT getVectorVT(LLVMContext &Context, EVT VT, MVT::ElementCount EC) { + static EVT getVectorVT(LLVMContext &Context, EVT VT, ElementCount EC) { MVT M = MVT::getVectorVT(VT.V, EC); if (M.SimpleTy != MVT::INVALID_SIMPLE_VALUE_TYPE) return M; @@ -277,7 +277,7 @@ namespace llvm { } // Given a (possibly scalable) vector type, return the ElementCount - MVT::ElementCount getVectorElementCount() const { + ElementCount getVectorElementCount() const { assert((isVector()) && "Invalid vector type!"); if (isSimple()) return V.getVectorElementCount(); diff --git a/include/llvm/CodeGen/ValueTypes.td b/include/llvm/CodeGen/ValueTypes.td index 5818ac183fcc..16df565bc8b8 100644 --- a/include/llvm/CodeGen/ValueTypes.td +++ b/include/llvm/CodeGen/ValueTypes.td @@ -40,127 +40,132 @@ def v16i1 : ValueType<16, 18>; // 16 x i1 vector value def v32i1 : ValueType<32 , 19>; // 32 x i1 vector value def v64i1 : ValueType<64 , 20>; // 64 x i1 vector value def v128i1 : ValueType<128, 21>; // 128 x i1 vector value -def v512i1 : ValueType<512, 22>; // 512 x i1 vector value -def v1024i1: ValueType<1024,23>; //1024 x i1 vector value - -def v1i8 : ValueType<8, 24>; // 1 x i8 vector value -def v2i8 : ValueType<16 , 25>; // 2 x i8 vector value -def v4i8 : ValueType<32 , 26>; // 4 x i8 vector value -def v8i8 : ValueType<64 , 27>; // 8 x i8 vector value -def v16i8 : ValueType<128, 28>; // 16 x i8 vector value -def v32i8 : ValueType<256, 29>; // 32 x i8 vector value -def v64i8 : ValueType<512, 30>; // 64 x i8 vector value -def v128i8 : ValueType<1024,31>; //128 x i8 vector value -def v256i8 : ValueType<2048,32>; //256 x i8 vector value - -def v1i16 : ValueType<16 , 33>; // 1 x i16 vector value -def v2i16 : ValueType<32 , 34>; // 2 x i16 vector value -def v4i16 : ValueType<64 , 35>; // 4 x i16 vector value -def v8i16 : ValueType<128, 36>; // 8 x i16 vector value -def v16i16 : ValueType<256, 37>; // 16 x i16 vector value -def v32i16 : ValueType<512, 38>; // 32 x i16 vector value -def v64i16 : ValueType<1024,39>; // 64 x i16 vector value -def v128i16: ValueType<2048,40>; //128 x i16 vector value - -def v1i32 : ValueType<32 , 41>; // 1 x i32 vector value -def v2i32 : ValueType<64 , 42>; // 2 x i32 vector value -def v3i32 : ValueType<96 , 43>; // 3 x i32 vector value -def v4i32 : ValueType<128, 44>; // 4 x i32 vector value -def v5i32 : ValueType<160, 45>; // 5 x i32 vector value -def v8i32 : ValueType<256, 46>; // 8 x i32 vector value -def v16i32 : ValueType<512, 47>; // 16 x i32 vector value -def v32i32 : ValueType<1024,48>; // 32 x i32 vector value -def v64i32 : ValueType<2048,49>; // 64 x i32 vector value -def v128i32 : ValueType<4096,50>; // 128 x i32 vector value -def v256i32 : ValueType<8182,51>; // 256 x i32 vector value -def v512i32 : ValueType<16384,52>; // 512 x i32 vector value -def v1024i32 : ValueType<32768,53>; // 1024 x i32 vector value -def v2048i32 : ValueType<65536,54>; // 2048 x i32 vector value - -def v1i64 : ValueType<64 , 55>; // 1 x i64 vector value -def v2i64 : ValueType<128, 56>; // 2 x i64 vector value -def v4i64 : ValueType<256, 57>; // 4 x i64 vector value -def v8i64 : ValueType<512, 58>; // 8 x i64 vector value -def v16i64 : ValueType<1024,59>; // 16 x i64 vector value -def v32i64 : ValueType<2048,60>; // 32 x i64 vector value - -def v1i128 : ValueType<128, 61>; // 1 x i128 vector value - -def nxv1i1 : ValueType<1, 62>; // n x 1 x i1 vector value -def nxv2i1 : ValueType<2, 63>; // n x 2 x i1 vector value -def nxv4i1 : ValueType<4, 64>; // n x 4 x i1 vector value -def nxv8i1 : ValueType<8, 65>; // 
n x 8 x i1 vector value -def nxv16i1 : ValueType<16, 66>; // n x 16 x i1 vector value -def nxv32i1 : ValueType<32, 67>; // n x 32 x i1 vector value - -def nxv1i8 : ValueType<8, 68>; // n x 1 x i8 vector value -def nxv2i8 : ValueType<16, 69>; // n x 2 x i8 vector value -def nxv4i8 : ValueType<32, 70>; // n x 4 x i8 vector value -def nxv8i8 : ValueType<64, 71>; // n x 8 x i8 vector value -def nxv16i8 : ValueType<128, 72>; // n x 16 x i8 vector value -def nxv32i8 : ValueType<256, 73>; // n x 32 x i8 vector value - -def nxv1i16 : ValueType<16, 74>; // n x 1 x i16 vector value -def nxv2i16 : ValueType<32, 75>; // n x 2 x i16 vector value -def nxv4i16 : ValueType<64, 76>; // n x 4 x i16 vector value -def nxv8i16 : ValueType<128, 77>; // n x 8 x i16 vector value -def nxv16i16: ValueType<256, 78>; // n x 16 x i16 vector value -def nxv32i16: ValueType<512, 79>; // n x 32 x i16 vector value - -def nxv1i32 : ValueType<32, 80>; // n x 1 x i32 vector value -def nxv2i32 : ValueType<64, 81>; // n x 2 x i32 vector value -def nxv4i32 : ValueType<128, 82>; // n x 4 x i32 vector value -def nxv8i32 : ValueType<256, 83>; // n x 8 x i32 vector value -def nxv16i32: ValueType<512, 84>; // n x 16 x i32 vector value -def nxv32i32: ValueType<1024,85>; // n x 32 x i32 vector value - -def nxv1i64 : ValueType<64, 86>; // n x 1 x i64 vector value -def nxv2i64 : ValueType<128, 87>; // n x 2 x i64 vector value -def nxv4i64 : ValueType<256, 88>; // n x 4 x i64 vector value -def nxv8i64 : ValueType<512, 89>; // n x 8 x i64 vector value -def nxv16i64: ValueType<1024,90>; // n x 16 x i64 vector value -def nxv32i64: ValueType<2048,91>; // n x 32 x i64 vector value - -def v2f16 : ValueType<32 , 92>; // 2 x f16 vector value -def v4f16 : ValueType<64 , 93>; // 4 x f16 vector value -def v8f16 : ValueType<128, 94>; // 8 x f16 vector value -def v1f32 : ValueType<32 , 95>; // 1 x f32 vector value -def v2f32 : ValueType<64 , 96>; // 2 x f32 vector value -def v3f32 : ValueType<96 , 97>; // 3 x f32 vector value -def v4f32 : ValueType<128, 98>; // 4 x f32 vector value -def v5f32 : ValueType<160, 99>; // 5 x f32 vector value -def v8f32 : ValueType<256, 100>; // 8 x f32 vector value -def v16f32 : ValueType<512, 101>; // 16 x f32 vector value -def v32f32 : ValueType<1024, 102>; // 32 x f32 vector value -def v64f32 : ValueType<2048, 103>; // 64 x f32 vector value -def v128f32 : ValueType<4096, 104>; // 128 x f32 vector value -def v256f32 : ValueType<8182, 105>; // 256 x f32 vector value -def v512f32 : ValueType<16384, 106>; // 512 x f32 vector value -def v1024f32 : ValueType<32768, 107>; // 1024 x f32 vector value -def v2048f32 : ValueType<65536, 108>; // 2048 x f32 vector value -def v1f64 : ValueType<64, 109>; // 1 x f64 vector value -def v2f64 : ValueType<128, 110>; // 2 x f64 vector value -def v4f64 : ValueType<256, 111>; // 4 x f64 vector value -def v8f64 : ValueType<512, 112>; // 8 x f64 vector value - -def nxv2f16 : ValueType<32 , 113>; // n x 2 x f16 vector value -def nxv4f16 : ValueType<64 , 114>; // n x 4 x f16 vector value -def nxv8f16 : ValueType<128, 115>; // n x 8 x f16 vector value -def nxv1f32 : ValueType<32 , 116>; // n x 1 x f32 vector value -def nxv2f32 : ValueType<64 , 117>; // n x 2 x f32 vector value -def nxv4f32 : ValueType<128, 118>; // n x 4 x f32 vector value -def nxv8f32 : ValueType<256, 119>; // n x 8 x f32 vector value -def nxv16f32 : ValueType<512, 120>; // n x 16 x f32 vector value -def nxv1f64 : ValueType<64, 121>; // n x 1 x f64 vector value -def nxv2f64 : ValueType<128, 122>; // n x 2 x f64 vector value -def 
nxv4f64 : ValueType<256, 123>; // n x 4 x f64 vector value -def nxv8f64 : ValueType<512, 124>; // n x 8 x f64 vector value - -def x86mmx : ValueType<64 , 125>; // X86 MMX value -def FlagVT : ValueType<0 , 126>; // Pre-RA sched glue -def isVoid : ValueType<0 , 127>; // Produces no value -def untyped: ValueType<8 , 128>; // Produces an untyped value -def exnref: ValueType<0, 129>; // WebAssembly's exnref type +def v256i1 : ValueType<256, 22>; // 256 x i1 vector value +def v512i1 : ValueType<512, 23>; // 512 x i1 vector value +def v1024i1: ValueType<1024,24>; //1024 x i1 vector value + +def v1i8 : ValueType<8, 25>; // 1 x i8 vector value +def v2i8 : ValueType<16 , 26>; // 2 x i8 vector value +def v4i8 : ValueType<32 , 27>; // 4 x i8 vector value +def v8i8 : ValueType<64 , 28>; // 8 x i8 vector value +def v16i8 : ValueType<128, 29>; // 16 x i8 vector value +def v32i8 : ValueType<256, 30>; // 32 x i8 vector value +def v64i8 : ValueType<512, 31>; // 64 x i8 vector value +def v128i8 : ValueType<1024,32>; //128 x i8 vector value +def v256i8 : ValueType<2048,33>; //256 x i8 vector value + +def v1i16 : ValueType<16 , 34>; // 1 x i16 vector value +def v2i16 : ValueType<32 , 35>; // 2 x i16 vector value +def v3i16 : ValueType<48 , 36>; // 3 x i16 vector value +def v4i16 : ValueType<64 , 37>; // 4 x i16 vector value +def v8i16 : ValueType<128, 38>; // 8 x i16 vector value +def v16i16 : ValueType<256, 39>; // 16 x i16 vector value +def v32i16 : ValueType<512, 40>; // 32 x i16 vector value +def v64i16 : ValueType<1024,41>; // 64 x i16 vector value +def v128i16: ValueType<2048,42>; //128 x i16 vector value + +def v1i32 : ValueType<32 , 43>; // 1 x i32 vector value +def v2i32 : ValueType<64 , 44>; // 2 x i32 vector value +def v3i32 : ValueType<96 , 45>; // 3 x i32 vector value +def v4i32 : ValueType<128, 46>; // 4 x i32 vector value +def v5i32 : ValueType<160, 47>; // 5 x i32 vector value +def v8i32 : ValueType<256, 48>; // 8 x i32 vector value +def v16i32 : ValueType<512, 49>; // 16 x i32 vector value +def v32i32 : ValueType<1024,50>; // 32 x i32 vector value +def v64i32 : ValueType<2048,51>; // 64 x i32 vector value +def v128i32 : ValueType<4096,52>; // 128 x i32 vector value +def v256i32 : ValueType<8182,53>; // 256 x i32 vector value +def v512i32 : ValueType<16384,54>; // 512 x i32 vector value +def v1024i32 : ValueType<32768,55>; // 1024 x i32 vector value +def v2048i32 : ValueType<65536,56>; // 2048 x i32 vector value + +def v1i64 : ValueType<64 , 57>; // 1 x i64 vector value +def v2i64 : ValueType<128, 58>; // 2 x i64 vector value +def v4i64 : ValueType<256, 59>; // 4 x i64 vector value +def v8i64 : ValueType<512, 60>; // 8 x i64 vector value +def v16i64 : ValueType<1024,61>; // 16 x i64 vector value +def v32i64 : ValueType<2048,62>; // 32 x i64 vector value + +def v1i128 : ValueType<128, 63>; // 1 x i128 vector value + +def v2f16 : ValueType<32 , 64>; // 2 x f16 vector value +def v3f16 : ValueType<48 , 65>; // 3 x f16 vector value +def v4f16 : ValueType<64 , 66>; // 4 x f16 vector value +def v8f16 : ValueType<128, 67>; // 8 x f16 vector value +def v16f16 : ValueType<256, 68>; // 8 x f16 vector value +def v32f16 : ValueType<512, 69>; // 8 x f16 vector value +def v1f32 : ValueType<32 , 70>; // 1 x f32 vector value +def v2f32 : ValueType<64 , 71>; // 2 x f32 vector value +def v3f32 : ValueType<96 , 72>; // 3 x f32 vector value +def v4f32 : ValueType<128, 73>; // 4 x f32 vector value +def v5f32 : ValueType<160, 74>; // 5 x f32 vector value +def v8f32 : ValueType<256, 75>; // 8 x f32 vector value +def 
v16f32 : ValueType<512, 76>; // 16 x f32 vector value +def v32f32 : ValueType<1024, 77>; // 32 x f32 vector value +def v64f32 : ValueType<2048, 78>; // 64 x f32 vector value +def v128f32 : ValueType<4096, 79>; // 128 x f32 vector value +def v256f32 : ValueType<8182, 80>; // 256 x f32 vector value +def v512f32 : ValueType<16384, 81>; // 512 x f32 vector value +def v1024f32 : ValueType<32768, 82>; // 1024 x f32 vector value +def v2048f32 : ValueType<65536, 83>; // 2048 x f32 vector value +def v1f64 : ValueType<64, 84>; // 1 x f64 vector value +def v2f64 : ValueType<128, 85>; // 2 x f64 vector value +def v4f64 : ValueType<256, 86>; // 4 x f64 vector value +def v8f64 : ValueType<512, 87>; // 8 x f64 vector value + +def nxv1i1 : ValueType<1, 88>; // n x 1 x i1 vector value +def nxv2i1 : ValueType<2, 89>; // n x 2 x i1 vector value +def nxv4i1 : ValueType<4, 90>; // n x 4 x i1 vector value +def nxv8i1 : ValueType<8, 91>; // n x 8 x i1 vector value +def nxv16i1 : ValueType<16, 92>; // n x 16 x i1 vector value +def nxv32i1 : ValueType<32, 93>; // n x 32 x i1 vector value + +def nxv1i8 : ValueType<8, 94>; // n x 1 x i8 vector value +def nxv2i8 : ValueType<16, 95>; // n x 2 x i8 vector value +def nxv4i8 : ValueType<32, 96>; // n x 4 x i8 vector value +def nxv8i8 : ValueType<64, 97>; // n x 8 x i8 vector value +def nxv16i8 : ValueType<128, 98>; // n x 16 x i8 vector value +def nxv32i8 : ValueType<256, 99>; // n x 32 x i8 vector value + +def nxv1i16 : ValueType<16, 100>; // n x 1 x i16 vector value +def nxv2i16 : ValueType<32, 101>; // n x 2 x i16 vector value +def nxv4i16 : ValueType<64, 102>; // n x 4 x i16 vector value +def nxv8i16 : ValueType<128, 103>; // n x 8 x i16 vector value +def nxv16i16: ValueType<256, 104>; // n x 16 x i16 vector value +def nxv32i16: ValueType<512, 105>; // n x 32 x i16 vector value + +def nxv1i32 : ValueType<32, 106>; // n x 1 x i32 vector value +def nxv2i32 : ValueType<64, 107>; // n x 2 x i32 vector value +def nxv4i32 : ValueType<128, 108>; // n x 4 x i32 vector value +def nxv8i32 : ValueType<256, 109>; // n x 8 x i32 vector value +def nxv16i32: ValueType<512, 110>; // n x 16 x i32 vector value +def nxv32i32: ValueType<1024,111>; // n x 32 x i32 vector value + +def nxv1i64 : ValueType<64, 112>; // n x 1 x i64 vector value +def nxv2i64 : ValueType<128, 113>; // n x 2 x i64 vector value +def nxv4i64 : ValueType<256, 114>; // n x 4 x i64 vector value +def nxv8i64 : ValueType<512, 115>; // n x 8 x i64 vector value +def nxv16i64: ValueType<1024,116>; // n x 16 x i64 vector value +def nxv32i64: ValueType<2048,117>; // n x 32 x i64 vector value + +def nxv2f16 : ValueType<32 , 118>; // n x 2 x f16 vector value +def nxv4f16 : ValueType<64 , 119>; // n x 4 x f16 vector value +def nxv8f16 : ValueType<128, 120>; // n x 8 x f16 vector value +def nxv1f32 : ValueType<32 , 121>; // n x 1 x f32 vector value +def nxv2f32 : ValueType<64 , 122>; // n x 2 x f32 vector value +def nxv4f32 : ValueType<128, 123>; // n x 4 x f32 vector value +def nxv8f32 : ValueType<256, 124>; // n x 8 x f32 vector value +def nxv16f32 : ValueType<512, 125>; // n x 16 x f32 vector value +def nxv1f64 : ValueType<64, 126>; // n x 1 x f64 vector value +def nxv2f64 : ValueType<128, 127>; // n x 2 x f64 vector value +def nxv4f64 : ValueType<256, 128>; // n x 4 x f64 vector value +def nxv8f64 : ValueType<512, 129>; // n x 8 x f64 vector value + +def x86mmx : ValueType<64 , 130>; // X86 MMX value +def FlagVT : ValueType<0 , 131>; // Pre-RA sched glue +def isVoid : ValueType<0 , 132>; // Produces no value +def untyped: 
ValueType<8 , 133>; // Produces an untyped value +def exnref: ValueType<0, 134>; // WebAssembly's exnref type def token : ValueType<0 , 248>; // TokenTy def MetadataVT: ValueType<0, 249>; // Metadata diff --git a/include/llvm/CodeGen/VirtRegMap.h b/include/llvm/CodeGen/VirtRegMap.h index 70eb048f05eb..db25ed5c5116 100644 --- a/include/llvm/CodeGen/VirtRegMap.h +++ b/include/llvm/CodeGen/VirtRegMap.h @@ -49,7 +49,7 @@ class TargetInstrInfo; /// it; even spilled virtual registers (the register mapped to a /// spilled register is the temporary used to load it from the /// stack). - IndexedMap Virt2PhysMap; + IndexedMap Virt2PhysMap; /// Virt2StackSlotMap - This is virtual register to stack slot /// mapping. Each spilled virtual register has an entry in it @@ -93,7 +93,7 @@ class TargetInstrInfo; /// returns true if the specified virtual register is /// mapped to a physical register - bool hasPhys(unsigned virtReg) const { + bool hasPhys(Register virtReg) const { return getPhys(virtReg) != NO_PHYS_REG; } @@ -101,20 +101,20 @@ class TargetInstrInfo; /// virtual register Register getPhys(Register virtReg) const { assert(virtReg.isVirtual()); - return Virt2PhysMap[virtReg]; + return Virt2PhysMap[virtReg.id()]; } /// creates a mapping for the specified virtual register to /// the specified physical register - void assignVirt2Phys(unsigned virtReg, MCPhysReg physReg); + void assignVirt2Phys(Register virtReg, MCPhysReg physReg); /// clears the specified virtual register's, physical /// register mapping - void clearVirt(unsigned virtReg) { - assert(TargetRegisterInfo::isVirtualRegister(virtReg)); - assert(Virt2PhysMap[virtReg] != NO_PHYS_REG && + void clearVirt(Register virtReg) { + assert(virtReg.isVirtual()); + assert(Virt2PhysMap[virtReg.id()] != NO_PHYS_REG && "attempt to clear a not assigned virtual register"); - Virt2PhysMap[virtReg] = NO_PHYS_REG; + Virt2PhysMap[virtReg.id()] = NO_PHYS_REG; } /// clears all virtual to physical register mappings @@ -124,21 +124,21 @@ class TargetInstrInfo; } /// returns true if VirtReg is assigned to its preferred physreg. - bool hasPreferredPhys(unsigned VirtReg); + bool hasPreferredPhys(Register VirtReg); /// returns true if VirtReg has a known preferred register. /// This returns false if VirtReg has a preference that is a virtual /// register that hasn't been assigned yet. - bool hasKnownPreference(unsigned VirtReg); + bool hasKnownPreference(Register VirtReg); /// records virtReg is a split live interval from SReg. - void setIsSplitFromReg(unsigned virtReg, unsigned SReg) { - Virt2SplitMap[virtReg] = SReg; + void setIsSplitFromReg(Register virtReg, unsigned SReg) { + Virt2SplitMap[virtReg.id()] = SReg; } /// returns the live interval virtReg is split from. - unsigned getPreSplitReg(unsigned virtReg) const { - return Virt2SplitMap[virtReg]; + unsigned getPreSplitReg(Register virtReg) const { + return Virt2SplitMap[virtReg.id()]; } /// getOriginal - Return the original virtual register that VirtReg descends @@ -152,28 +152,29 @@ class TargetInstrInfo; /// returns true if the specified virtual register is not /// mapped to a stack slot or rematerialized. - bool isAssignedReg(unsigned virtReg) const { + bool isAssignedReg(Register virtReg) const { if (getStackSlot(virtReg) == NO_STACK_SLOT) return true; // Split register can be assigned a physical register as well as a // stack slot or remat id. 
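Taken together, the VirtRegMap hunks above switch every query to Register and index the underlying maps through Register::id(). A minimal consumer sketch, assuming a pass that already holds the MachineRegisterInfo and the computed VirtRegMap; classifyAssignments is illustrative only:

#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/Register.h"
#include "llvm/CodeGen/VirtRegMap.h"

// Walk every virtual register and classify its post-allocation location.
static void classifyAssignments(const llvm::MachineRegisterInfo &MRI,
                                const llvm::VirtRegMap &VRM) {
  for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
    llvm::Register VReg = llvm::Register::index2VirtReg(I);
    if (VRM.hasPhys(VReg)) {
      llvm::Register Phys = VRM.getPhys(VReg); // assigned a physical register
      (void)Phys;
    } else if (VRM.getStackSlot(VReg) != llvm::VirtRegMap::NO_STACK_SLOT) {
      // Spilled: VRM.getStackSlot(VReg) is the frame index.
    }
  }
}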
- return (Virt2SplitMap[virtReg] && Virt2PhysMap[virtReg] != NO_PHYS_REG); + return (Virt2SplitMap[virtReg.id()] && + Virt2PhysMap[virtReg.id()] != NO_PHYS_REG); } /// returns the stack slot mapped to the specified virtual /// register - int getStackSlot(unsigned virtReg) const { - assert(TargetRegisterInfo::isVirtualRegister(virtReg)); - return Virt2StackSlotMap[virtReg]; + int getStackSlot(Register virtReg) const { + assert(virtReg.isVirtual()); + return Virt2StackSlotMap[virtReg.id()]; } /// create a mapping for the specifed virtual register to /// the next available stack slot - int assignVirt2StackSlot(unsigned virtReg); + int assignVirt2StackSlot(Register virtReg); /// create a mapping for the specified virtual register to /// the specified stack slot - void assignVirt2StackSlot(unsigned virtReg, int SS); + void assignVirt2StackSlot(Register virtReg, int SS); void print(raw_ostream &OS, const Module* M = nullptr) const override; void dump() const; diff --git a/include/llvm/DebugInfo/CodeView/CVTypeVisitor.h b/include/llvm/DebugInfo/CodeView/CVTypeVisitor.h index 7d20bb0a7bde..7538cb2c2548 100644 --- a/include/llvm/DebugInfo/CodeView/CVTypeVisitor.h +++ b/include/llvm/DebugInfo/CodeView/CVTypeVisitor.h @@ -11,7 +11,6 @@ #include "llvm/DebugInfo/CodeView/CVRecord.h" #include "llvm/DebugInfo/CodeView/TypeRecord.h" -#include "llvm/DebugInfo/CodeView/TypeVisitorCallbackPipeline.h" #include "llvm/Support/Error.h" namespace llvm { @@ -31,9 +30,6 @@ enum VisitorDataSource { Error visitTypeRecord(CVType &Record, TypeIndex Index, TypeVisitorCallbacks &Callbacks, VisitorDataSource Source = VDS_BytesPresent); -Error visitTypeRecord(CVType &Record, TypeIndex Index, - TypeVisitorCallbackPipeline &Callbacks, - VisitorDataSource Source = VDS_BytesPresent); Error visitTypeRecord(CVType &Record, TypeVisitorCallbacks &Callbacks, VisitorDataSource Source = VDS_BytesPresent); diff --git a/include/llvm/DebugInfo/CodeView/CodeViewRecordIO.h b/include/llvm/DebugInfo/CodeView/CodeViewRecordIO.h index 00fb0cf4cc90..60829a51dc25 100644 --- a/include/llvm/DebugInfo/CodeView/CodeViewRecordIO.h +++ b/include/llvm/DebugInfo/CodeView/CodeViewRecordIO.h @@ -33,6 +33,9 @@ public: virtual void EmitIntValue(uint64_t Value, unsigned Size) = 0; virtual void EmitBinaryData(StringRef Data) = 0; virtual void AddComment(const Twine &T) = 0; + virtual void AddRawComment(const Twine &T) = 0; + virtual bool isVerboseAsm() = 0; + virtual std::string getTypeName(TypeIndex TI) = 0; virtual ~CodeViewRecordStreamer() = default; }; @@ -206,6 +209,11 @@ public: return 0; } + void emitRawComment(const Twine &T) { + if (isStreaming() && Streamer->isVerboseAsm()) + Streamer->AddRawComment(T); + } + private: void emitEncodedSignedInteger(const int64_t &Value, const Twine &Comment = ""); @@ -225,9 +233,10 @@ private: } void emitComment(const Twine &Comment) { - if (isStreaming()) { + if (isStreaming() && Streamer->isVerboseAsm()) { Twine TComment(Comment); - Streamer->AddComment(TComment); + if (!TComment.isTriviallyEmpty()) + Streamer->AddComment(TComment); } } diff --git a/include/llvm/DebugInfo/CodeView/CodeViewRegisters.def b/include/llvm/DebugInfo/CodeView/CodeViewRegisters.def index 9767e49c44f5..ed5c143818e6 100644 --- a/include/llvm/DebugInfo/CodeView/CodeViewRegisters.def +++ b/include/llvm/DebugInfo/CodeView/CodeViewRegisters.def @@ -366,8 +366,134 @@ CV_REGISTER(AMD64_K7, 765) #endif // defined(CV_REGISTERS_ALL) || defined(CV_REGISTERS_X86) +#if defined(CV_REGISTERS_ALL) || defined(CV_REGISTERS_ARM) + +// ARM registers + 
+CV_REGISTER(ARM_NOREG, 0) + +// General purpose 32-bit integer regisers + +CV_REGISTER(ARM_R0, 10) +CV_REGISTER(ARM_R1, 11) +CV_REGISTER(ARM_R2, 12) +CV_REGISTER(ARM_R3, 13) +CV_REGISTER(ARM_R4, 14) +CV_REGISTER(ARM_R5, 15) +CV_REGISTER(ARM_R6, 16) +CV_REGISTER(ARM_R7, 17) +CV_REGISTER(ARM_R8, 18) +CV_REGISTER(ARM_R9, 19) +CV_REGISTER(ARM_R10, 20) +CV_REGISTER(ARM_R11, 21) +CV_REGISTER(ARM_R12, 22) +CV_REGISTER(ARM_SP, 23) +CV_REGISTER(ARM_LR, 24) +CV_REGISTER(ARM_PC, 25) + +// Status register + +CV_REGISTER(ARM_CPSR, 25) + +// ARM VFPv1 registers + +CV_REGISTER(ARM_FPSCR, 40) +CV_REGISTER(ARM_FPEXC, 41) + +// ARM VFPv3/NEON registers + +CV_REGISTER(ARM_FS32, 200) +CV_REGISTER(ARM_FS33, 201) +CV_REGISTER(ARM_FS34, 202) +CV_REGISTER(ARM_FS35, 203) +CV_REGISTER(ARM_FS36, 204) +CV_REGISTER(ARM_FS37, 205) +CV_REGISTER(ARM_FS38, 206) +CV_REGISTER(ARM_FS39, 207) +CV_REGISTER(ARM_FS40, 208) +CV_REGISTER(ARM_FS41, 209) +CV_REGISTER(ARM_FS42, 210) +CV_REGISTER(ARM_FS43, 211) +CV_REGISTER(ARM_FS44, 212) +CV_REGISTER(ARM_FS45, 213) +CV_REGISTER(ARM_FS46, 214) +CV_REGISTER(ARM_FS47, 215) +CV_REGISTER(ARM_FS48, 216) +CV_REGISTER(ARM_FS49, 217) +CV_REGISTER(ARM_FS50, 218) +CV_REGISTER(ARM_FS51, 219) +CV_REGISTER(ARM_FS52, 220) +CV_REGISTER(ARM_FS53, 221) +CV_REGISTER(ARM_FS54, 222) +CV_REGISTER(ARM_FS55, 223) +CV_REGISTER(ARM_FS56, 224) +CV_REGISTER(ARM_FS57, 225) +CV_REGISTER(ARM_FS58, 226) +CV_REGISTER(ARM_FS59, 227) +CV_REGISTER(ARM_FS60, 228) +CV_REGISTER(ARM_FS61, 229) +CV_REGISTER(ARM_FS62, 230) +CV_REGISTER(ARM_FS63, 231) + +CV_REGISTER(ARM_ND0, 300) +CV_REGISTER(ARM_ND1, 301) +CV_REGISTER(ARM_ND2, 302) +CV_REGISTER(ARM_ND3, 303) +CV_REGISTER(ARM_ND4, 304) +CV_REGISTER(ARM_ND5, 305) +CV_REGISTER(ARM_ND6, 306) +CV_REGISTER(ARM_ND7, 307) +CV_REGISTER(ARM_ND8, 308) +CV_REGISTER(ARM_ND9, 309) +CV_REGISTER(ARM_ND10, 310) +CV_REGISTER(ARM_ND11, 311) +CV_REGISTER(ARM_ND12, 312) +CV_REGISTER(ARM_ND13, 313) +CV_REGISTER(ARM_ND14, 314) +CV_REGISTER(ARM_ND15, 315) +CV_REGISTER(ARM_ND16, 316) +CV_REGISTER(ARM_ND17, 317) +CV_REGISTER(ARM_ND18, 318) +CV_REGISTER(ARM_ND19, 319) +CV_REGISTER(ARM_ND20, 320) +CV_REGISTER(ARM_ND21, 321) +CV_REGISTER(ARM_ND22, 322) +CV_REGISTER(ARM_ND23, 323) +CV_REGISTER(ARM_ND24, 324) +CV_REGISTER(ARM_ND25, 325) +CV_REGISTER(ARM_ND26, 326) +CV_REGISTER(ARM_ND27, 327) +CV_REGISTER(ARM_ND28, 328) +CV_REGISTER(ARM_ND29, 329) +CV_REGISTER(ARM_ND30, 330) +CV_REGISTER(ARM_ND31, 331) + +CV_REGISTER(ARM_NQ0, 400) +CV_REGISTER(ARM_NQ1, 401) +CV_REGISTER(ARM_NQ2, 402) +CV_REGISTER(ARM_NQ3, 403) +CV_REGISTER(ARM_NQ4, 404) +CV_REGISTER(ARM_NQ5, 405) +CV_REGISTER(ARM_NQ6, 406) +CV_REGISTER(ARM_NQ7, 407) +CV_REGISTER(ARM_NQ8, 408) +CV_REGISTER(ARM_NQ9, 409) +CV_REGISTER(ARM_NQ10, 410) +CV_REGISTER(ARM_NQ11, 411) +CV_REGISTER(ARM_NQ12, 412) +CV_REGISTER(ARM_NQ13, 413) +CV_REGISTER(ARM_NQ14, 414) +CV_REGISTER(ARM_NQ15, 415) + +#endif // defined(CV_REGISTERS_ALL) || defined(CV_REGISTERS_ARM) + #if defined(CV_REGISTERS_ALL) || defined(CV_REGISTERS_ARM64) +// arm64intr.h from MSVC defines ARM64_FPSR, which conflicts with +// these declarations. 
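The push_macro/pop_macro guard added in the following lines is the standard way to suspend a colliding object-like macro around declarations that reuse its name. A generic, self-contained illustration (FOO is a placeholder, not part of this patch):

// Suspend the macro, declare the identifier, then restore the includer's view.
#define FOO 1                      // stand-in for a macro from a system header
#pragma push_macro("FOO")
#undef FOO
enum Regs { FOO = 42 };            // safe: the macro is suspended here
#pragma pop_macro("FOO")
static_assert(FOO == 1, "original definition restored");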
+#pragma push_macro("ARM64_FPSR") +#undef ARM64_FPSR + // ARM64 registers CV_REGISTER(ARM64_NOREG, 0) @@ -556,4 +682,6 @@ CV_REGISTER(ARM64_Q31, 211) CV_REGISTER(ARM64_FPSR, 220) +#pragma pop_macro("ARM64_FPSR") + #endif // defined(CV_REGISTERS_ALL) || defined(CV_REGISTERS_ARM64) diff --git a/include/llvm/DebugInfo/CodeView/EnumTables.h b/include/llvm/DebugInfo/CodeView/EnumTables.h index ed126ed9e2ff..270cd4b8330c 100644 --- a/include/llvm/DebugInfo/CodeView/EnumTables.h +++ b/include/llvm/DebugInfo/CodeView/EnumTables.h @@ -37,6 +37,17 @@ ArrayRef> getThunkOrdinalNames(); ArrayRef> getTrampolineNames(); ArrayRef> getImageSectionCharacteristicNames(); +ArrayRef> getClassOptionNames(); +ArrayRef> getMemberAccessNames(); +ArrayRef> getMethodOptionNames(); +ArrayRef> getMemberKindNames(); +ArrayRef> getPtrKindNames(); +ArrayRef> getPtrModeNames(); +ArrayRef> getPtrMemberRepNames(); +ArrayRef> getTypeModifierNames(); +ArrayRef> getCallingConventions(); +ArrayRef> getFunctionOptionEnum(); +ArrayRef> getLabelTypeEnum(); } // end namespace codeview } // end namespace llvm diff --git a/include/llvm/DebugInfo/CodeView/SymbolDeserializer.h b/include/llvm/DebugInfo/CodeView/SymbolDeserializer.h index 62761cb87c81..108abb291498 100644 --- a/include/llvm/DebugInfo/CodeView/SymbolDeserializer.h +++ b/include/llvm/DebugInfo/CodeView/SymbolDeserializer.h @@ -62,7 +62,7 @@ public: Error visitSymbolBegin(CVSymbol &Record) override { assert(!Mapping && "Already in a symbol mapping!"); - Mapping = llvm::make_unique(Record.content(), Container); + Mapping = std::make_unique(Record.content(), Container); return Mapping->Mapping.visitSymbolBegin(Record); } Error visitSymbolEnd(CVSymbol &Record) override { diff --git a/include/llvm/DebugInfo/CodeView/SymbolRecord.h b/include/llvm/DebugInfo/CodeView/SymbolRecord.h index 5e9a7432b9b6..1aafa3ca9f1d 100644 --- a/include/llvm/DebugInfo/CodeView/SymbolRecord.h +++ b/include/llvm/DebugInfo/CodeView/SymbolRecord.h @@ -73,17 +73,17 @@ public: Thunk32Sym(SymbolRecordKind Kind, uint32_t RecordOffset) : SymbolRecord(Kind), RecordOffset(RecordOffset) {} - uint32_t Parent; - uint32_t End; - uint32_t Next; - uint32_t Offset; - uint16_t Segment; - uint16_t Length; + uint32_t Parent = 0; + uint32_t End = 0; + uint32_t Next = 0; + uint32_t Offset = 0; + uint16_t Segment = 0; + uint16_t Length = 0; ThunkOrdinal Thunk; StringRef Name; ArrayRef VariantData; - uint32_t RecordOffset; + uint32_t RecordOffset = 0; }; // S_TRAMPOLINE @@ -94,13 +94,13 @@ public: : SymbolRecord(Kind), RecordOffset(RecordOffset) {} TrampolineType Type; - uint16_t Size; - uint32_t ThunkOffset; - uint32_t TargetOffset; - uint16_t ThunkSection; - uint16_t TargetSection; + uint16_t Size = 0; + uint32_t ThunkOffset = 0; + uint32_t TargetOffset = 0; + uint16_t ThunkSection = 0; + uint16_t TargetSection = 0; - uint32_t RecordOffset; + uint32_t RecordOffset = 0; }; // S_SECTION @@ -110,14 +110,14 @@ public: SectionSym(SymbolRecordKind Kind, uint32_t RecordOffset) : SymbolRecord(Kind), RecordOffset(RecordOffset) {} - uint16_t SectionNumber; - uint8_t Alignment; - uint32_t Rva; - uint32_t Length; - uint32_t Characteristics; + uint16_t SectionNumber = 0; + uint8_t Alignment = 0; + uint32_t Rva = 0; + uint32_t Length = 0; + uint32_t Characteristics = 0; StringRef Name; - uint32_t RecordOffset; + uint32_t RecordOffset = 0; }; // S_COFFGROUP @@ -127,13 +127,13 @@ public: CoffGroupSym(SymbolRecordKind Kind, uint32_t RecordOffset) : SymbolRecord(Kind), RecordOffset(RecordOffset) {} - uint32_t Size; - uint32_t 
Characteristics; - uint32_t Offset; - uint16_t Segment; + uint32_t Size = 0; + uint32_t Characteristics = 0; + uint32_t Offset = 0; + uint16_t Segment = 0; StringRef Name; - uint32_t RecordOffset; + uint32_t RecordOffset = 0; }; class ScopeEndSym : public SymbolRecord { @@ -142,7 +142,7 @@ public: ScopeEndSym(SymbolRecordKind Kind, uint32_t RecordOffset) : SymbolRecord(Kind), RecordOffset(RecordOffset) {} - uint32_t RecordOffset; + uint32_t RecordOffset = 0; }; class CallerSym : public SymbolRecord { @@ -153,7 +153,7 @@ public: std::vector Indices; - uint32_t RecordOffset; + uint32_t RecordOffset = 0; }; struct DecodedAnnotation { @@ -333,7 +333,7 @@ private: class InlineSiteSym : public SymbolRecord { public: explicit InlineSiteSym(SymbolRecordKind Kind) : SymbolRecord(Kind) {} - InlineSiteSym(uint32_t RecordOffset) + explicit InlineSiteSym(uint32_t RecordOffset) : SymbolRecord(SymbolRecordKind::InlineSiteSym), RecordOffset(RecordOffset) {} @@ -342,12 +342,12 @@ public: BinaryAnnotationIterator()); } - uint32_t Parent; - uint32_t End; + uint32_t Parent = 0; + uint32_t End = 0; TypeIndex Inlinee; std::vector AnnotationData; - uint32_t RecordOffset; + uint32_t RecordOffset = 0; }; // S_PUB32 @@ -371,7 +371,7 @@ public: class RegisterSym : public SymbolRecord { public: explicit RegisterSym(SymbolRecordKind Kind) : SymbolRecord(Kind) {} - RegisterSym(uint32_t RecordOffset) + explicit RegisterSym(uint32_t RecordOffset) : SymbolRecord(SymbolRecordKind::RegisterSym), RecordOffset(RecordOffset) {} @@ -379,7 +379,7 @@ public: RegisterId Register; StringRef Name; - uint32_t RecordOffset; + uint32_t RecordOffset = 0; }; // S_PROCREF, S_LPROCREF @@ -390,13 +390,13 @@ public: : SymbolRecord(SymbolRecordKind::ProcRefSym), RecordOffset(RecordOffset) { } - uint32_t SumName; - uint32_t SymOffset; - uint16_t Module; + uint32_t SumName = 0; + uint32_t SymOffset = 0; + uint16_t Module = 0; StringRef Name; uint16_t modi() const { return Module - 1; } - uint32_t RecordOffset; + uint32_t RecordOffset = 0; }; // S_LOCAL @@ -410,7 +410,7 @@ public: LocalSymFlags Flags; StringRef Name; - uint32_t RecordOffset; + uint32_t RecordOffset = 0; }; struct LocalVariableAddrRange { @@ -440,11 +440,11 @@ public: return RecordOffset + RelocationOffset; } - uint32_t Program; + uint32_t Program = 0; LocalVariableAddrRange Range; std::vector Gaps; - uint32_t RecordOffset; + uint32_t RecordOffset = 0; }; // S_DEFRANGE_SUBFIELD @@ -453,7 +453,7 @@ class DefRangeSubfieldSym : public SymbolRecord { public: explicit DefRangeSubfieldSym(SymbolRecordKind Kind) : SymbolRecord(Kind) {} - DefRangeSubfieldSym(uint32_t RecordOffset) + explicit DefRangeSubfieldSym(uint32_t RecordOffset) : SymbolRecord(SymbolRecordKind::DefRangeSubfieldSym), RecordOffset(RecordOffset) {} @@ -461,58 +461,62 @@ public: return RecordOffset + RelocationOffset; } - uint32_t Program; - uint16_t OffsetInParent; + uint32_t Program = 0; + uint16_t OffsetInParent = 0; LocalVariableAddrRange Range; std::vector Gaps; - uint32_t RecordOffset; + uint32_t RecordOffset = 0; +}; + +struct DefRangeRegisterHeader { + ulittle16_t Register; + ulittle16_t MayHaveNoName; }; // S_DEFRANGE_REGISTER class DefRangeRegisterSym : public SymbolRecord { public: - struct Header { - ulittle16_t Register; - ulittle16_t MayHaveNoName; - }; - explicit DefRangeRegisterSym(SymbolRecordKind Kind) : SymbolRecord(Kind) {} - DefRangeRegisterSym(uint32_t RecordOffset) + explicit DefRangeRegisterSym(uint32_t RecordOffset) : SymbolRecord(SymbolRecordKind::DefRangeRegisterSym), 
RecordOffset(RecordOffset) {} - uint32_t getRelocationOffset() const { return RecordOffset + sizeof(Header); } + uint32_t getRelocationOffset() const { return RecordOffset + sizeof(DefRangeRegisterHeader); } - Header Hdr; + DefRangeRegisterHeader Hdr; LocalVariableAddrRange Range; std::vector Gaps; - uint32_t RecordOffset; + uint32_t RecordOffset = 0; +}; + +struct DefRangeSubfieldRegisterHeader { + ulittle16_t Register; + ulittle16_t MayHaveNoName; + ulittle32_t OffsetInParent; }; // S_DEFRANGE_SUBFIELD_REGISTER class DefRangeSubfieldRegisterSym : public SymbolRecord { public: - struct Header { - ulittle16_t Register; - ulittle16_t MayHaveNoName; - ulittle32_t OffsetInParent; - }; - explicit DefRangeSubfieldRegisterSym(SymbolRecordKind Kind) : SymbolRecord(Kind) {} - DefRangeSubfieldRegisterSym(uint32_t RecordOffset) + explicit DefRangeSubfieldRegisterSym(uint32_t RecordOffset) : SymbolRecord(SymbolRecordKind::DefRangeSubfieldRegisterSym), RecordOffset(RecordOffset) {} - uint32_t getRelocationOffset() const { return RecordOffset + sizeof(Header); } + uint32_t getRelocationOffset() const { return RecordOffset + sizeof(DefRangeSubfieldRegisterHeader); } - Header Hdr; + DefRangeSubfieldRegisterHeader Hdr; LocalVariableAddrRange Range; std::vector Gaps; - uint32_t RecordOffset; + uint32_t RecordOffset = 0; +}; + +struct DefRangeFramePointerRelHeader { + little32_t Offset; }; // S_DEFRANGE_FRAMEPOINTER_REL @@ -522,7 +526,7 @@ class DefRangeFramePointerRelSym : public SymbolRecord { public: explicit DefRangeFramePointerRelSym(SymbolRecordKind Kind) : SymbolRecord(Kind) {} - DefRangeFramePointerRelSym(uint32_t RecordOffset) + explicit DefRangeFramePointerRelSym(uint32_t RecordOffset) : SymbolRecord(SymbolRecordKind::DefRangeFramePointerRelSym), RecordOffset(RecordOffset) {} @@ -530,22 +534,22 @@ public: return RecordOffset + RelocationOffset; } - int32_t Offset; + DefRangeFramePointerRelHeader Hdr; LocalVariableAddrRange Range; std::vector Gaps; - uint32_t RecordOffset; + uint32_t RecordOffset = 0; +}; + +struct DefRangeRegisterRelHeader { + ulittle16_t Register; + ulittle16_t Flags; + little32_t BasePointerOffset; }; // S_DEFRANGE_REGISTER_REL class DefRangeRegisterRelSym : public SymbolRecord { public: - struct Header { - ulittle16_t Register; - ulittle16_t Flags; - little32_t BasePointerOffset; - }; - explicit DefRangeRegisterRelSym(SymbolRecordKind Kind) : SymbolRecord(Kind) {} explicit DefRangeRegisterRelSym(uint32_t RecordOffset) : SymbolRecord(SymbolRecordKind::DefRangeRegisterRelSym), @@ -563,13 +567,13 @@ public: bool hasSpilledUDTMember() const { return Hdr.Flags & IsSubfieldFlag; } uint16_t offsetInParent() const { return Hdr.Flags >> OffsetInParentShift; } - uint32_t getRelocationOffset() const { return RecordOffset + sizeof(Header); } + uint32_t getRelocationOffset() const { return RecordOffset + sizeof(DefRangeRegisterRelHeader); } - Header Hdr; + DefRangeRegisterRelHeader Hdr; LocalVariableAddrRange Range; std::vector Gaps; - uint32_t RecordOffset; + uint32_t RecordOffset = 0; }; // S_DEFRANGE_FRAMEPOINTER_REL_FULL_SCOPE @@ -581,9 +585,9 @@ public: : SymbolRecord(SymbolRecordKind::DefRangeFramePointerRelFullScopeSym), RecordOffset(RecordOffset) {} - int32_t Offset; + int32_t Offset = 0; - uint32_t RecordOffset; + uint32_t RecordOffset = 0; }; // S_BLOCK32 @@ -599,14 +603,14 @@ public: return RecordOffset + RelocationOffset; } - uint32_t Parent; - uint32_t End; - uint32_t CodeSize; - uint32_t CodeOffset; - uint16_t Segment; + uint32_t Parent = 0; + uint32_t End = 0; + uint32_t 
CodeSize = 0; + uint32_t CodeOffset = 0; + uint16_t Segment = 0; StringRef Name; - uint32_t RecordOffset; + uint32_t RecordOffset = 0; }; // S_LABEL32 @@ -622,12 +626,12 @@ public: return RecordOffset + RelocationOffset; } - uint32_t CodeOffset; - uint16_t Segment; + uint32_t CodeOffset = 0; + uint16_t Segment = 0; ProcSymFlags Flags; StringRef Name; - uint32_t RecordOffset; + uint32_t RecordOffset = 0; }; // S_OBJNAME @@ -635,82 +639,82 @@ class ObjNameSym : public SymbolRecord { public: explicit ObjNameSym() : SymbolRecord(SymbolRecordKind::ObjNameSym) {} explicit ObjNameSym(SymbolRecordKind Kind) : SymbolRecord(Kind) {} - ObjNameSym(uint32_t RecordOffset) + explicit ObjNameSym(uint32_t RecordOffset) : SymbolRecord(SymbolRecordKind::ObjNameSym), RecordOffset(RecordOffset) { } - uint32_t Signature; + uint32_t Signature = 0; StringRef Name; - uint32_t RecordOffset; + uint32_t RecordOffset = 0; }; // S_ENVBLOCK class EnvBlockSym : public SymbolRecord { public: explicit EnvBlockSym(SymbolRecordKind Kind) : SymbolRecord(Kind) {} - EnvBlockSym(uint32_t RecordOffset) + explicit EnvBlockSym(uint32_t RecordOffset) : SymbolRecord(SymbolRecordKind::EnvBlockSym), RecordOffset(RecordOffset) {} std::vector Fields; - uint32_t RecordOffset; + uint32_t RecordOffset = 0; }; // S_EXPORT class ExportSym : public SymbolRecord { public: explicit ExportSym(SymbolRecordKind Kind) : SymbolRecord(Kind) {} - ExportSym(uint32_t RecordOffset) + explicit ExportSym(uint32_t RecordOffset) : SymbolRecord(SymbolRecordKind::ExportSym), RecordOffset(RecordOffset) {} - uint16_t Ordinal; + uint16_t Ordinal = 0; ExportFlags Flags; StringRef Name; - uint32_t RecordOffset; + uint32_t RecordOffset = 0; }; // S_FILESTATIC class FileStaticSym : public SymbolRecord { public: explicit FileStaticSym(SymbolRecordKind Kind) : SymbolRecord(Kind) {} - FileStaticSym(uint32_t RecordOffset) + explicit FileStaticSym(uint32_t RecordOffset) : SymbolRecord(SymbolRecordKind::FileStaticSym), RecordOffset(RecordOffset) {} TypeIndex Index; - uint32_t ModFilenameOffset; + uint32_t ModFilenameOffset = 0; LocalSymFlags Flags; StringRef Name; - uint32_t RecordOffset; + uint32_t RecordOffset = 0; }; // S_COMPILE2 class Compile2Sym : public SymbolRecord { public: explicit Compile2Sym(SymbolRecordKind Kind) : SymbolRecord(Kind) {} - Compile2Sym(uint32_t RecordOffset) + explicit Compile2Sym(uint32_t RecordOffset) : SymbolRecord(SymbolRecordKind::Compile2Sym), RecordOffset(RecordOffset) {} CompileSym2Flags Flags; CPUType Machine; - uint16_t VersionFrontendMajor; - uint16_t VersionFrontendMinor; - uint16_t VersionFrontendBuild; - uint16_t VersionBackendMajor; - uint16_t VersionBackendMinor; - uint16_t VersionBackendBuild; + uint16_t VersionFrontendMajor = 0; + uint16_t VersionFrontendMinor = 0; + uint16_t VersionFrontendBuild = 0; + uint16_t VersionBackendMajor = 0; + uint16_t VersionBackendMinor = 0; + uint16_t VersionBackendBuild = 0; StringRef Version; std::vector ExtraStrings; uint8_t getLanguage() const { return static_cast(Flags) & 0xFF; } uint32_t getFlags() const { return static_cast(Flags) & ~0xFF; } - uint32_t RecordOffset; + uint32_t RecordOffset = 0; }; // S_COMPILE3 @@ -718,20 +722,20 @@ class Compile3Sym : public SymbolRecord { public: Compile3Sym() : SymbolRecord(SymbolRecordKind::Compile3Sym) {} explicit Compile3Sym(SymbolRecordKind Kind) : SymbolRecord(Kind) {} - Compile3Sym(uint32_t RecordOffset) + explicit Compile3Sym(uint32_t RecordOffset) : SymbolRecord(SymbolRecordKind::Compile3Sym), RecordOffset(RecordOffset) {} CompileSym3Flags Flags; 
CPUType Machine; - uint16_t VersionFrontendMajor; - uint16_t VersionFrontendMinor; - uint16_t VersionFrontendBuild; - uint16_t VersionFrontendQFE; - uint16_t VersionBackendMajor; - uint16_t VersionBackendMinor; - uint16_t VersionBackendBuild; - uint16_t VersionBackendQFE; + uint16_t VersionFrontendMajor = 0; + uint16_t VersionFrontendMinor = 0; + uint16_t VersionFrontendBuild = 0; + uint16_t VersionFrontendQFE = 0; + uint16_t VersionBackendMajor = 0; + uint16_t VersionBackendMinor = 0; + uint16_t VersionBackendBuild = 0; + uint16_t VersionBackendQFE = 0; StringRef Version; void setLanguage(SourceLanguage Lang) { @@ -750,7 +754,7 @@ public: (getFlags() & (CompileSym3Flags::PGO | CompileSym3Flags::LTCG)); } - uint32_t RecordOffset; + uint32_t RecordOffset = 0; }; // S_FRAMEPROC @@ -761,12 +765,12 @@ public: : SymbolRecord(SymbolRecordKind::FrameProcSym), RecordOffset(RecordOffset) {} - uint32_t TotalFrameBytes; - uint32_t PaddingFrameBytes; - uint32_t OffsetToPadding; - uint32_t BytesOfCalleeSavedRegisters; - uint32_t OffsetOfExceptionHandler; - uint16_t SectionIdOfExceptionHandler; + uint32_t TotalFrameBytes = 0; + uint32_t PaddingFrameBytes = 0; + uint32_t OffsetToPadding = 0; + uint32_t BytesOfCalleeSavedRegisters = 0; + uint32_t OffsetOfExceptionHandler = 0; + uint16_t SectionIdOfExceptionHandler = 0; FrameProcedureOptions Flags; /// Extract the register this frame uses to refer to local variables. @@ -781,7 +785,7 @@ public: EncodedFramePtrReg((uint32_t(Flags) >> 16U) & 0x3U), CPU); } - uint32_t RecordOffset; + uint32_t RecordOffset = 0; private: }; @@ -799,11 +803,11 @@ public: return RecordOffset + RelocationOffset; } - uint32_t CodeOffset; - uint16_t Segment; + uint32_t CodeOffset = 0; + uint16_t Segment = 0; TypeIndex Type; - uint32_t RecordOffset; + uint32_t RecordOffset = 0; }; // S_HEAPALLOCSITE @@ -820,12 +824,12 @@ public: return RecordOffset + RelocationOffset; } - uint32_t CodeOffset; - uint16_t Segment; - uint16_t CallInstructionSize; + uint32_t CodeOffset = 0; + uint16_t Segment = 0; + uint16_t CallInstructionSize = 0; TypeIndex Type; - uint32_t RecordOffset; + uint32_t RecordOffset = 0; }; // S_FRAMECOOKIE @@ -841,12 +845,12 @@ public: return RecordOffset + RelocationOffset; } - uint32_t CodeOffset; - uint16_t Register; + uint32_t CodeOffset = 0; + uint16_t Register = 0; FrameCookieKind CookieKind; - uint8_t Flags; + uint8_t Flags = 0; - uint32_t RecordOffset; + uint32_t RecordOffset = 0; }; // S_UDT, S_COBOLUDT @@ -859,20 +863,20 @@ public: TypeIndex Type; StringRef Name; - uint32_t RecordOffset; + uint32_t RecordOffset = 0; }; // S_BUILDINFO class BuildInfoSym : public SymbolRecord { public: explicit BuildInfoSym(SymbolRecordKind Kind) : SymbolRecord(Kind) {} - BuildInfoSym(uint32_t RecordOffset) + explicit BuildInfoSym(uint32_t RecordOffset) : SymbolRecord(SymbolRecordKind::BuildInfoSym), RecordOffset(RecordOffset) {} TypeIndex BuildId; - uint32_t RecordOffset; + uint32_t RecordOffset = 0; }; // S_BPREL32 @@ -883,11 +887,11 @@ public: : SymbolRecord(SymbolRecordKind::BPRelativeSym), RecordOffset(RecordOffset) {} - int32_t Offset; + int32_t Offset = 0; TypeIndex Type; StringRef Name; - uint32_t RecordOffset; + uint32_t RecordOffset = 0; }; // S_REGREL32 @@ -898,19 +902,19 @@ public: : SymbolRecord(SymbolRecordKind::RegRelativeSym), RecordOffset(RecordOffset) {} - uint32_t Offset; + uint32_t Offset = 0; TypeIndex Type; RegisterId Register; StringRef Name; - uint32_t RecordOffset; + uint32_t RecordOffset = 0; }; // S_CONSTANT, S_MANCONSTANT class ConstantSym : public 
SymbolRecord { public: explicit ConstantSym(SymbolRecordKind Kind) : SymbolRecord(Kind) {} - ConstantSym(uint32_t RecordOffset) + explicit ConstantSym(uint32_t RecordOffset) : SymbolRecord(SymbolRecordKind::ConstantSym), RecordOffset(RecordOffset) {} @@ -918,7 +922,7 @@ public: APSInt Value; StringRef Name; - uint32_t RecordOffset; + uint32_t RecordOffset = 0; }; // S_LDATA32, S_GDATA32, S_LMANDATA, S_GMANDATA @@ -927,7 +931,7 @@ class DataSym : public SymbolRecord { public: explicit DataSym(SymbolRecordKind Kind) : SymbolRecord(Kind) {} - DataSym(uint32_t RecordOffset) + explicit DataSym(uint32_t RecordOffset) : SymbolRecord(SymbolRecordKind::DataSym), RecordOffset(RecordOffset) {} uint32_t getRelocationOffset() const { @@ -935,11 +939,11 @@ public: } TypeIndex Type; - uint32_t DataOffset; - uint16_t Segment; + uint32_t DataOffset = 0; + uint16_t Segment = 0; StringRef Name; - uint32_t RecordOffset; + uint32_t RecordOffset = 0; }; // S_LTHREAD32, S_GTHREAD32 @@ -957,11 +961,11 @@ public: } TypeIndex Type; - uint32_t DataOffset; - uint16_t Segment; + uint32_t DataOffset = 0; + uint16_t Segment = 0; StringRef Name; - uint32_t RecordOffset; + uint32_t RecordOffset = 0; }; // S_UNAMESPACE @@ -974,7 +978,7 @@ public: StringRef Name; - uint32_t RecordOffset; + uint32_t RecordOffset = 0; }; // S_ANNOTATION @@ -989,7 +993,7 @@ public: uint16_t Segment = 0; std::vector Strings; - uint32_t RecordOffset; + uint32_t RecordOffset = 0; }; using CVSymbol = CVRecord; diff --git a/include/llvm/DebugInfo/CodeView/TypeDeserializer.h b/include/llvm/DebugInfo/CodeView/TypeDeserializer.h index 081de32dd02c..2b17f5ccb13b 100644 --- a/include/llvm/DebugInfo/CodeView/TypeDeserializer.h +++ b/include/llvm/DebugInfo/CodeView/TypeDeserializer.h @@ -66,7 +66,7 @@ public: Error visitTypeBegin(CVType &Record) override { assert(!Mapping && "Already in a type mapping!"); - Mapping = llvm::make_unique(Record.content()); + Mapping = std::make_unique(Record.content()); return Mapping->Mapping.visitTypeBegin(Record); } diff --git a/include/llvm/DebugInfo/CodeView/TypeRecordMapping.h b/include/llvm/DebugInfo/CodeView/TypeRecordMapping.h index 4c309c10ff0c..c6044d5138a8 100644 --- a/include/llvm/DebugInfo/CodeView/TypeRecordMapping.h +++ b/include/llvm/DebugInfo/CodeView/TypeRecordMapping.h @@ -10,6 +10,7 @@ #define LLVM_DEBUGINFO_CODEVIEW_TYPERECORDMAPPING_H #include "llvm/ADT/Optional.h" +#include "llvm/DebugInfo/CodeView/CVTypeVisitor.h" #include "llvm/DebugInfo/CodeView/CodeViewRecordIO.h" #include "llvm/DebugInfo/CodeView/TypeVisitorCallbacks.h" #include "llvm/Support/Error.h" diff --git a/include/llvm/DebugInfo/CodeView/TypeVisitorCallbackPipeline.h b/include/llvm/DebugInfo/CodeView/TypeVisitorCallbackPipeline.h index 169715be2d52..fb0b579d6a06 100644 --- a/include/llvm/DebugInfo/CodeView/TypeVisitorCallbackPipeline.h +++ b/include/llvm/DebugInfo/CodeView/TypeVisitorCallbackPipeline.h @@ -82,11 +82,6 @@ public: Pipeline.push_back(&Callbacks); } - void addCallbackToPipelineFront(TypeVisitorCallbacks &Callbacks) { - auto CallBackItr = Pipeline.begin(); - Pipeline.insert(CallBackItr, &Callbacks); - } - #define TYPE_RECORD(EnumName, EnumVal, Name) \ Error visitKnownRecord(CVType &CVR, Name##Record &Record) override { \ return visitKnownRecordImpl(CVR, Record); \ diff --git a/include/llvm/DebugInfo/DIContext.h b/include/llvm/DebugInfo/DIContext.h index d2a5318179eb..fbebfe634b63 100644 --- a/include/llvm/DebugInfo/DIContext.h +++ b/include/llvm/DebugInfo/DIContext.h @@ -28,6 +28,10 @@ namespace llvm { /// A format-neutral 
container for source line information. struct DILineInfo { + // DILineInfo contains "" for function/filename it cannot fetch. + static constexpr const char *const BadString = ""; + // Use "??" instead of "" to make our output closer to addr2line. + static constexpr const char *const Addr2LineBadString = "??"; std::string FileName; std::string FunctionName; Optional Source; @@ -38,7 +42,7 @@ struct DILineInfo { // DWARF-specific. uint32_t Discriminator = 0; - DILineInfo() : FileName(""), FunctionName("") {} + DILineInfo() : FileName(BadString), FunctionName(BadString) {} bool operator==(const DILineInfo &RHS) const { return Line == RHS.Line && Column == RHS.Column && @@ -61,9 +65,9 @@ struct DILineInfo { void dump(raw_ostream &OS) { OS << "Line info: "; - if (FileName != "") + if (FileName != BadString) OS << "file '" << FileName << "', "; - if (FunctionName != "") + if (FunctionName != BadString) OS << "function '" << FunctionName << "', "; OS << "line " << Line << ", "; OS << "column " << Column << ", "; @@ -109,7 +113,7 @@ struct DIGlobal { uint64_t Start = 0; uint64_t Size = 0; - DIGlobal() : Name("") {} + DIGlobal() : Name(DILineInfo::BadString) {} }; struct DILocal { @@ -289,7 +293,7 @@ public: LoadedObjectInfoHelper(Ts &&... Args) : Base(std::forward(Args)...) {} std::unique_ptr clone() const override { - return llvm::make_unique(static_cast(*this)); + return std::make_unique(static_cast(*this)); } }; diff --git a/include/llvm/DebugInfo/DWARF/DWARFAbbreviationDeclaration.h b/include/llvm/DebugInfo/DWARF/DWARFAbbreviationDeclaration.h index ccf2891c2e21..39ae53c4e7fe 100644 --- a/include/llvm/DebugInfo/DWARF/DWARFAbbreviationDeclaration.h +++ b/include/llvm/DebugInfo/DWARF/DWARFAbbreviationDeclaration.h @@ -130,11 +130,11 @@ public: /// \param Attr DWARF attribute to search for. /// \param U the DWARFUnit the contains the DIE. /// \returns Optional DWARF form value if the attribute was extracted. - Optional getAttributeValue(const uint32_t DIEOffset, + Optional getAttributeValue(const uint64_t DIEOffset, const dwarf::Attribute Attr, const DWARFUnit &U) const; - bool extract(DataExtractor Data, uint32_t* OffsetPtr); + bool extract(DataExtractor Data, uint64_t* OffsetPtr); void dump(raw_ostream &OS) const; // Return an optional byte size of all attribute data in this abbreviation diff --git a/include/llvm/DebugInfo/DWARF/DWARFAcceleratorTable.h b/include/llvm/DebugInfo/DWARF/DWARFAcceleratorTable.h index 303375703d2e..c9042e593260 100644 --- a/include/llvm/DebugInfo/DWARF/DWARFAcceleratorTable.h +++ b/include/llvm/DebugInfo/DWARF/DWARFAcceleratorTable.h @@ -96,7 +96,7 @@ class AppleAcceleratorTable : public DWARFAcceleratorTable { using AtomType = uint16_t; using Form = dwarf::Form; - uint32_t DIEOffsetBase; + uint64_t DIEOffsetBase; SmallVector, 3> Atoms; Optional extractOffset(Optional Value) const; @@ -109,7 +109,7 @@ class AppleAcceleratorTable : public DWARFAcceleratorTable { /// Returns true if we should continue scanning for entries or false if we've /// reached the last (sentinel) entry of encountered a parsing error. bool dumpName(ScopedPrinter &W, SmallVectorImpl &AtomForms, - uint32_t *DataOffset) const; + uint64_t *DataOffset) const; public: /// Apple-specific implementation of an Accelerator Entry. 
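A recurring change in the DebugInfo/DWARF hunks here is widening section offsets from uint32_t to uint64_t, as needed for DWARF64 and very large debug sections. A minimal extraction loop under the new convention; walkULEBs and its input are illustrative, not part of the patch:

#include "llvm/ADT/StringRef.h"
#include "llvm/Support/DataExtractor.h"

// Decode consecutive ULEB128 values, carrying the cursor as a 64-bit offset.
static uint64_t walkULEBs(llvm::StringRef Contents) {
  llvm::DataExtractor Data(Contents, /*IsLittleEndian=*/true, /*AddressSize=*/8);
  uint64_t Offset = 0; // was a uint32_t cursor before this import
  uint64_t Sum = 0;
  while (Data.isValidOffset(Offset))
    Sum += Data.getULEB128(&Offset);
  return Sum;
}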
@@ -119,7 +119,7 @@ public: Entry(const HeaderData &Data); Entry() = default; - void extract(const AppleAcceleratorTable &AccelTable, uint32_t *Offset); + void extract(const AppleAcceleratorTable &AccelTable, uint64_t *Offset); public: Optional getCUOffset() const override; @@ -143,7 +143,7 @@ public: class ValueIterator : public std::iterator { const AppleAcceleratorTable *AccelTable = nullptr; Entry Current; ///< The current entry. - unsigned DataOffset = 0; ///< Offset into the section. + uint64_t DataOffset = 0; ///< Offset into the section. unsigned Data = 0; ///< Current data entry. unsigned NumData = 0; ///< Number of data entries. @@ -151,7 +151,7 @@ public: void Next(); public: /// Construct a new iterator for the entries at \p DataOffset. - ValueIterator(const AppleAcceleratorTable &AccelTable, unsigned DataOffset); + ValueIterator(const AppleAcceleratorTable &AccelTable, uint64_t DataOffset); /// End marker. ValueIterator() = default; @@ -193,7 +193,7 @@ public: /// DieOffset is the offset into the .debug_info section for the DIE /// related to the input hash data offset. /// DieTag is the tag of the DIE - std::pair readAtoms(uint32_t &HashDataOffset); + std::pair readAtoms(uint64_t *HashDataOffset); void dump(raw_ostream &OS) const override; /// Look up all entries in the accelerator table matching \c Key. @@ -245,7 +245,7 @@ public: struct Header : public HeaderPOD { SmallString<8> AugmentationString; - Error extract(const DWARFDataExtractor &AS, uint32_t *Offset); + Error extract(const DWARFDataExtractor &AS, uint64_t *Offset); void dump(ScopedPrinter &W) const; }; @@ -354,12 +354,12 @@ public: DataExtractor StrData; uint32_t Index; - uint32_t StringOffset; - uint32_t EntryOffset; + uint64_t StringOffset; + uint64_t EntryOffset; public: NameTableEntry(const DataExtractor &StrData, uint32_t Index, - uint32_t StringOffset, uint32_t EntryOffset) + uint64_t StringOffset, uint64_t EntryOffset) : StrData(StrData), Index(Index), StringOffset(StringOffset), EntryOffset(EntryOffset) {} @@ -367,17 +367,17 @@ public: uint32_t getIndex() const { return Index; } /// Returns the offset of the name of the described entities. - uint32_t getStringOffset() const { return StringOffset; } + uint64_t getStringOffset() const { return StringOffset; } /// Return the string referenced by this name table entry or nullptr if the /// string offset is not valid. const char *getString() const { - uint32_t Off = StringOffset; + uint64_t Off = StringOffset; return StrData.getCStr(&Off); } /// Returns the offset of the first Entry in the list. - uint32_t getEntryOffset() const { return EntryOffset; } + uint64_t getEntryOffset() const { return EntryOffset; } }; /// Represents a single accelerator table within the DWARF v5 .debug_names @@ -389,40 +389,40 @@ public: // Base of the whole unit and of various important tables, as offsets from // the start of the section. 
- uint32_t Base; - uint32_t CUsBase; - uint32_t BucketsBase; - uint32_t HashesBase; - uint32_t StringOffsetsBase; - uint32_t EntryOffsetsBase; - uint32_t EntriesBase; + uint64_t Base; + uint64_t CUsBase; + uint64_t BucketsBase; + uint64_t HashesBase; + uint64_t StringOffsetsBase; + uint64_t EntryOffsetsBase; + uint64_t EntriesBase; void dumpCUs(ScopedPrinter &W) const; void dumpLocalTUs(ScopedPrinter &W) const; void dumpForeignTUs(ScopedPrinter &W) const; void dumpAbbreviations(ScopedPrinter &W) const; - bool dumpEntry(ScopedPrinter &W, uint32_t *Offset) const; + bool dumpEntry(ScopedPrinter &W, uint64_t *Offset) const; void dumpName(ScopedPrinter &W, const NameTableEntry &NTE, Optional Hash) const; void dumpBucket(ScopedPrinter &W, uint32_t Bucket) const; - Expected extractAttributeEncoding(uint32_t *Offset); + Expected extractAttributeEncoding(uint64_t *Offset); Expected> - extractAttributeEncodings(uint32_t *Offset); + extractAttributeEncodings(uint64_t *Offset); - Expected extractAbbrev(uint32_t *Offset); + Expected extractAbbrev(uint64_t *Offset); public: - NameIndex(const DWARFDebugNames &Section, uint32_t Base) + NameIndex(const DWARFDebugNames &Section, uint64_t Base) : Section(Section), Base(Base) {} /// Reads offset of compilation unit CU. CU is 0-based. - uint32_t getCUOffset(uint32_t CU) const; + uint64_t getCUOffset(uint32_t CU) const; uint32_t getCUCount() const { return Hdr.CompUnitCount; } /// Reads offset of local type unit TU, TU is 0-based. - uint32_t getLocalTUOffset(uint32_t TU) const; + uint64_t getLocalTUOffset(uint32_t TU) const; uint32_t getLocalTUCount() const { return Hdr.LocalTypeUnitCount; } /// Reads signature of foreign type unit TU. TU is 0-based. @@ -451,7 +451,7 @@ public: return Abbrevs; } - Expected getEntry(uint32_t *Offset) const; + Expected getEntry(uint64_t *Offset) const; /// Look up all entries in this Name Index matching \c Key. iterator_range equal_range(StringRef Key) const; @@ -460,8 +460,8 @@ public: NameIterator end() const { return NameIterator(this, getNameCount() + 1); } Error extract(); - uint32_t getUnitOffset() const { return Base; } - uint32_t getNextUnitOffset() const { return Base + 4 + Hdr.UnitLength; } + uint64_t getUnitOffset() const { return Base; } + uint64_t getNextUnitOffset() const { return Base + 4 + Hdr.UnitLength; } void dump(ScopedPrinter &W) const; friend class DWARFDebugNames; @@ -479,12 +479,12 @@ public: bool IsLocal; Optional CurrentEntry; - unsigned DataOffset = 0; ///< Offset into the section. + uint64_t DataOffset = 0; ///< Offset into the section. std::string Key; ///< The Key we are searching for. Optional Hash; ///< Hash of Key, if it has been computed. bool getEntryAtCurrentOffset(); - Optional findEntryOffsetInCurrentIndex(); + Optional findEntryOffsetInCurrentIndex(); bool findInCurrentIndex(); void searchFromStartOfCurrentIndex(); void next(); @@ -572,7 +572,7 @@ public: private: SmallVector NameIndices; - DenseMap CUToNameIndex; + DenseMap CUToNameIndex; public: DWARFDebugNames(const DWARFDataExtractor &AccelSection, @@ -591,7 +591,7 @@ public: /// Return the Name Index covering the compile unit at CUOffset, or nullptr if /// there is no Name Index covering that unit. 
- const NameIndex *getCUNameIndex(uint32_t CUOffset); + const NameIndex *getCUNameIndex(uint64_t CUOffset); }; } // end namespace llvm diff --git a/include/llvm/DebugInfo/DWARF/DWARFAttribute.h b/include/llvm/DebugInfo/DWARF/DWARFAttribute.h index c8ad19ad6bf6..dfc778346dbe 100644 --- a/include/llvm/DebugInfo/DWARF/DWARFAttribute.h +++ b/include/llvm/DebugInfo/DWARF/DWARFAttribute.h @@ -23,7 +23,7 @@ namespace llvm { /// attributes in a DWARFDie. struct DWARFAttribute { /// The debug info/types offset for this attribute. - uint32_t Offset = 0; + uint64_t Offset = 0; /// The debug info/types section byte size of the data for this attribute. uint32_t ByteSize = 0; /// The attribute enumeration of this attribute. diff --git a/include/llvm/DebugInfo/DWARF/DWARFContext.h b/include/llvm/DebugInfo/DWARF/DWARFContext.h index 23cf21c3523f..fae163622edb 100644 --- a/include/llvm/DebugInfo/DWARF/DWARFContext.h +++ b/include/llvm/DebugInfo/DWARF/DWARFContext.h @@ -225,10 +225,10 @@ public: DWARFCompileUnit *getDWOCompileUnitForHash(uint64_t Hash); /// Return the compile unit that includes an offset (relative to .debug_info). - DWARFCompileUnit *getCompileUnitForOffset(uint32_t Offset); + DWARFCompileUnit *getCompileUnitForOffset(uint64_t Offset); /// Get a DIE given an exact offset. - DWARFDie getDIEForOffset(uint32_t Offset); + DWARFDie getDIEForOffset(uint64_t Offset); unsigned getMaxVersion() { // Ensure info units have been parsed to discover MaxVersion @@ -301,10 +301,10 @@ public: std::function RecoverableErrorCallback); DataExtractor getStringExtractor() const { - return DataExtractor(DObj->getStringSection(), false, 0); + return DataExtractor(DObj->getStrSection(), false, 0); } DataExtractor getLineStringExtractor() const { - return DataExtractor(DObj->getLineStringSection(), false, 0); + return DataExtractor(DObj->getLineStrSection(), false, 0); } /// Wraps the returned DIEs for a given address. diff --git a/include/llvm/DebugInfo/DWARF/DWARFDataExtractor.h b/include/llvm/DebugInfo/DWARF/DWARFDataExtractor.h index 7c2a159b71fa..980724c525d2 100644 --- a/include/llvm/DebugInfo/DWARF/DWARFDataExtractor.h +++ b/include/llvm/DebugInfo/DWARF/DWARFDataExtractor.h @@ -35,20 +35,25 @@ public: /// Extracts a value and applies a relocation to the result if /// one exists for the given offset. - uint64_t getRelocatedValue(uint32_t Size, uint32_t *Off, - uint64_t *SectionIndex = nullptr) const; + uint64_t getRelocatedValue(uint32_t Size, uint64_t *Off, + uint64_t *SectionIndex = nullptr, + Error *Err = nullptr) const; /// Extracts an address-sized value and applies a relocation to the result if /// one exists for the given offset. - uint64_t getRelocatedAddress(uint32_t *Off, uint64_t *SecIx = nullptr) const { + uint64_t getRelocatedAddress(uint64_t *Off, uint64_t *SecIx = nullptr) const { return getRelocatedValue(getAddressSize(), Off, SecIx); } + uint64_t getRelocatedAddress(Cursor &C, uint64_t *SecIx = nullptr) const { + return getRelocatedValue(getAddressSize(), &getOffset(C), SecIx, + &getError(C)); + } /// Extracts a DWARF-encoded pointer in \p Offset using \p Encoding. /// There is a DWARF encoding that uses a PC-relative adjustment. /// For these values, \p AbsPosOffset is used to fix them, which should /// reflect the absolute address of this pointer. - Optional getEncodedPointer(uint32_t *Offset, uint8_t Encoding, + Optional getEncodedPointer(uint64_t *Offset, uint8_t Encoding, uint64_t AbsPosOffset = 0) const; size_t size() const { return Section == nullptr ? 
0 : Section->Data.size(); } diff --git a/include/llvm/DebugInfo/DWARF/DWARFDebugAbbrev.h b/include/llvm/DebugInfo/DWARF/DWARFDebugAbbrev.h index 28fd8484b4a9..1398e16252a9 100644 --- a/include/llvm/DebugInfo/DWARF/DWARFDebugAbbrev.h +++ b/include/llvm/DebugInfo/DWARF/DWARFDebugAbbrev.h @@ -20,7 +20,7 @@ namespace llvm { class raw_ostream; class DWARFAbbreviationDeclarationSet { - uint32_t Offset; + uint64_t Offset; /// Code of the first abbreviation, if all abbreviations in the set have /// consecutive codes. UINT32_MAX otherwise. uint32_t FirstAbbrCode; @@ -32,9 +32,9 @@ class DWARFAbbreviationDeclarationSet { public: DWARFAbbreviationDeclarationSet(); - uint32_t getOffset() const { return Offset; } + uint64_t getOffset() const { return Offset; } void dump(raw_ostream &OS) const; - bool extract(DataExtractor Data, uint32_t *OffsetPtr); + bool extract(DataExtractor Data, uint64_t *OffsetPtr); const DWARFAbbreviationDeclaration * getAbbreviationDeclaration(uint32_t AbbrCode) const; diff --git a/include/llvm/DebugInfo/DWARF/DWARFDebugAddr.h b/include/llvm/DebugInfo/DWARF/DWARFDebugAddr.h index a98bf282fe7c..4539b9c9d581 100644 --- a/include/llvm/DebugInfo/DWARF/DWARFDebugAddr.h +++ b/include/llvm/DebugInfo/DWARF/DWARFDebugAddr.h @@ -45,7 +45,7 @@ public: private: dwarf::DwarfFormat Format; - uint32_t HeaderOffset; + uint64_t HeaderOffset; Header HeaderData; uint32_t DataSize = 0; std::vector Addrs; @@ -54,11 +54,11 @@ public: void clear(); /// Extract an entire table, including all addresses. - Error extract(DWARFDataExtractor Data, uint32_t *OffsetPtr, + Error extract(DWARFDataExtractor Data, uint64_t *OffsetPtr, uint16_t Version, uint8_t AddrSize, std::function WarnCallback); - uint32_t getHeaderOffset() const { return HeaderOffset; } + uint64_t getHeaderOffset() const { return HeaderOffset; } uint8_t getAddrSize() const { return HeaderData.AddrSize; } void dump(raw_ostream &OS, DIDumpOptions DumpOpts = {}) const; diff --git a/include/llvm/DebugInfo/DWARF/DWARFDebugArangeSet.h b/include/llvm/DebugInfo/DWARF/DWARFDebugArangeSet.h index 5b6c578bc3bf..ebe4ad6e24dd 100644 --- a/include/llvm/DebugInfo/DWARF/DWARFDebugArangeSet.h +++ b/include/llvm/DebugInfo/DWARF/DWARFDebugArangeSet.h @@ -49,7 +49,7 @@ private: using DescriptorColl = std::vector; using desc_iterator_range = iterator_range; - uint32_t Offset; + uint64_t Offset; Header HeaderData; DescriptorColl ArangeDescriptors; @@ -57,7 +57,7 @@ public: DWARFDebugArangeSet() { clear(); } void clear(); - bool extract(DataExtractor data, uint32_t *offset_ptr); + bool extract(DataExtractor data, uint64_t *offset_ptr); void dump(raw_ostream &OS) const; uint32_t getCompileUnitDIEOffset() const { return HeaderData.CuOffset; } diff --git a/include/llvm/DebugInfo/DWARF/DWARFDebugAranges.h b/include/llvm/DebugInfo/DWARF/DWARFDebugAranges.h index 03223fbc80a9..172f1d2c9dbe 100644 --- a/include/llvm/DebugInfo/DWARF/DWARFDebugAranges.h +++ b/include/llvm/DebugInfo/DWARF/DWARFDebugAranges.h @@ -28,7 +28,7 @@ private: void extract(DataExtractor DebugArangesData); /// Call appendRange multiple times and then call construct. 
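Most of the uint32_t-to-uint64_t churn in this import comes down to one DWARF fact: DWARF64 section offsets are 8 bytes wide, so 32-bit offset variables silently truncate on large debug sections. A small standalone sketch (plain C++, independent of the LLVM classes above) of decoding the DWARF "initial length" field shows where the two formats diverge; it assumes little-endian input for brevity.

#include <cstdint>
#include <cstring>

enum class DwarfFormat { DWARF32, DWARF64 };

struct InitialLength {
  DwarfFormat Format;
  uint64_t Length;    // Length of the unit/table that follows.
  uint8_t FieldSize;  // Bytes consumed by the length field: 4 or 12.
};

// DWARF32 stores a 4-byte length below 0xfffffff0. The escape value
// 0xffffffff announces DWARF64: the real length follows as 8 bytes, and all
// section offsets within that contribution are 8 bytes wide as well.
static InitialLength readInitialLength(const uint8_t *Data) {
  uint32_t U32;
  std::memcpy(&U32, Data, sizeof(U32));
  if (U32 == 0xffffffffU) {
    uint64_t U64;
    std::memcpy(&U64, Data + 4, sizeof(U64));
    return {DwarfFormat::DWARF64, U64, 12};
  }
  return {DwarfFormat::DWARF32, U32, 4};
}

This is also the reason for the getUnitLengthFieldByteSize() helper introduced further down: the unit length field occupies 4 bytes in DWARF32 and 12 in DWARF64.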
- void appendRange(uint32_t CUOffset, uint64_t LowPC, uint64_t HighPC); + void appendRange(uint64_t CUOffset, uint64_t LowPC, uint64_t HighPC); void construct(); struct Range { @@ -60,10 +60,10 @@ private: struct RangeEndpoint { uint64_t Address; - uint32_t CUOffset; + uint64_t CUOffset; bool IsRangeStart; - RangeEndpoint(uint64_t Address, uint32_t CUOffset, bool IsRangeStart) + RangeEndpoint(uint64_t Address, uint64_t CUOffset, bool IsRangeStart) : Address(Address), CUOffset(CUOffset), IsRangeStart(IsRangeStart) {} bool operator<(const RangeEndpoint &Other) const { @@ -76,7 +76,7 @@ private: std::vector Endpoints; RangeColl Aranges; - DenseSet ParsedCUOffsets; + DenseSet ParsedCUOffsets; }; } // end namespace llvm diff --git a/include/llvm/DebugInfo/DWARF/DWARFDebugFrame.h b/include/llvm/DebugInfo/DWARF/DWARFDebugFrame.h index d960f4bc9b1c..c6539df0d756 100644 --- a/include/llvm/DebugInfo/DWARF/DWARFDebugFrame.h +++ b/include/llvm/DebugInfo/DWARF/DWARFDebugFrame.h @@ -69,7 +69,7 @@ public: /// starting at *Offset and ending at EndOffset. *Offset is updated /// to EndOffset upon successful parsing, or indicates the offset /// where a problem occurred in case an error is returned. - Error parse(DataExtractor Data, uint32_t *Offset, uint32_t EndOffset); + Error parse(DWARFDataExtractor Data, uint64_t *Offset, uint64_t EndOffset); void dump(raw_ostream &OS, const MCRegisterInfo *MRI, bool IsEH, unsigned IndentLevel = 1) const; diff --git a/include/llvm/DebugInfo/DWARF/DWARFDebugInfoEntry.h b/include/llvm/DebugInfo/DWARF/DWARFDebugInfoEntry.h index f50063b24370..ded960337ec6 100644 --- a/include/llvm/DebugInfo/DWARF/DWARFDebugInfoEntry.h +++ b/include/llvm/DebugInfo/DWARF/DWARFDebugInfoEntry.h @@ -22,7 +22,7 @@ class DWARFUnit; /// DWARFDebugInfoEntry - A DIE with only the minimum required data. class DWARFDebugInfoEntry { /// Offset within the .debug_info of the start of this entry. - uint32_t Offset = 0; + uint64_t Offset = 0; /// The integer depth of this DIE within the compile unit DIEs where the /// compile/type unit DIE has a depth of zero. @@ -36,14 +36,14 @@ public: /// Extracts a debug info entry, which is a child of a given unit, /// starting at a given offset. If DIE can't be extracted, returns false and /// doesn't change OffsetPtr. - bool extractFast(const DWARFUnit &U, uint32_t *OffsetPtr); + bool extractFast(const DWARFUnit &U, uint64_t *OffsetPtr); /// High performance extraction should use this call. 
- bool extractFast(const DWARFUnit &U, uint32_t *OffsetPtr, - const DWARFDataExtractor &DebugInfoData, uint32_t UEndOffset, + bool extractFast(const DWARFUnit &U, uint64_t *OffsetPtr, + const DWARFDataExtractor &DebugInfoData, uint64_t UEndOffset, uint32_t Depth); - uint32_t getOffset() const { return Offset; } + uint64_t getOffset() const { return Offset; } uint32_t getDepth() const { return Depth; } dwarf::Tag getTag() const { diff --git a/include/llvm/DebugInfo/DWARF/DWARFDebugLine.h b/include/llvm/DebugInfo/DWARF/DWARFDebugLine.h index e7425c192373..c2be8304ad84 100644 --- a/include/llvm/DebugInfo/DWARF/DWARFDebugLine.h +++ b/include/llvm/DebugInfo/DWARF/DWARFDebugLine.h @@ -18,6 +18,7 @@ #include "llvm/DebugInfo/DWARF/DWARFRelocMap.h" #include "llvm/DebugInfo/DWARF/DWARFTypeUnit.h" #include "llvm/Support/MD5.h" +#include "llvm/Support/Path.h" #include #include #include @@ -128,13 +129,15 @@ public: bool hasFileAtIndex(uint64_t FileIndex) const; - bool getFileNameByIndex(uint64_t FileIndex, StringRef CompDir, - DILineInfoSpecifier::FileLineInfoKind Kind, - std::string &Result) const; + bool + getFileNameByIndex(uint64_t FileIndex, StringRef CompDir, + DILineInfoSpecifier::FileLineInfoKind Kind, + std::string &Result, + sys::path::Style Style = sys::path::Style::native) const; void clear(); void dump(raw_ostream &OS, DIDumpOptions DumpOptions) const; - Error parse(const DWARFDataExtractor &DebugLineData, uint32_t *OffsetPtr, + Error parse(const DWARFDataExtractor &DebugLineData, uint64_t *OffsetPtr, const DWARFContext &Ctx, const DWARFUnit *U = nullptr); }; @@ -278,7 +281,7 @@ public: /// Parse prologue and all rows. Error parse( - DWARFDataExtractor &DebugLineData, uint32_t *OffsetPtr, + DWARFDataExtractor &DebugLineData, uint64_t *OffsetPtr, const DWARFContext &Ctx, const DWARFUnit *U, std::function RecoverableErrorCallback, raw_ostream *OS = nullptr); @@ -305,9 +308,9 @@ public: std::vector &Result) const; }; - const LineTable *getLineTable(uint32_t Offset) const; + const LineTable *getLineTable(uint64_t Offset) const; Expected getOrParseLineTable( - DWARFDataExtractor &DebugLineData, uint32_t Offset, + DWARFDataExtractor &DebugLineData, uint64_t Offset, const DWARFContext &Ctx, const DWARFUnit *U, std::function RecoverableErrorCallback); @@ -350,17 +353,17 @@ public: bool done() const { return Done; } /// Get the offset the parser has reached. 
- uint32_t getOffset() const { return Offset; } + uint64_t getOffset() const { return Offset; } private: - DWARFUnit *prepareToParse(uint32_t Offset); - void moveToNextTable(uint32_t OldOffset, const Prologue &P); + DWARFUnit *prepareToParse(uint64_t Offset); + void moveToNextTable(uint64_t OldOffset, const Prologue &P); LineToUnitMap LineToUnit; DWARFDataExtractor &DebugLineData; const DWARFContext &Context; - uint32_t Offset = 0; + uint64_t Offset = 0; bool Done = false; }; @@ -377,7 +380,7 @@ private: struct Sequence Sequence; }; - using LineTableMapTy = std::map; + using LineTableMapTy = std::map; using LineTableIter = LineTableMapTy::iterator; using LineTableConstIter = LineTableMapTy::const_iterator; diff --git a/include/llvm/DebugInfo/DWARF/DWARFDebugLoc.h b/include/llvm/DebugInfo/DWARF/DWARFDebugLoc.h index cced6048e811..c79d98e34f6e 100644 --- a/include/llvm/DebugInfo/DWARF/DWARFDebugLoc.h +++ b/include/llvm/DebugInfo/DWARF/DWARFDebugLoc.h @@ -11,6 +11,7 @@ #include "llvm/ADT/Optional.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/DebugInfo/DIContext.h" #include "llvm/DebugInfo/DWARF/DWARFDataExtractor.h" #include "llvm/DebugInfo/DWARF/DWARFRelocMap.h" #include @@ -29,19 +30,20 @@ public: /// The ending address of the instruction range. uint64_t End; /// The location of the variable within the specified range. - SmallVector Loc; + SmallVector Loc; }; /// A list of locations that contain one variable. struct LocationList { /// The beginning offset where this location list is stored in the debug_loc /// section. - unsigned Offset; + uint64_t Offset; /// All the locations in which the variable is stored. SmallVector Entries; /// Dump this list on OS. - void dump(raw_ostream &OS, bool IsLittleEndian, unsigned AddressSize, - const MCRegisterInfo *MRI, DWARFUnit *U, uint64_t BaseAddress, + void dump(raw_ostream &OS, uint64_t BaseAddress, bool IsLittleEndian, + unsigned AddressSize, const MCRegisterInfo *MRI, DWARFUnit *U, + DIDumpOptions DumpOpts, unsigned Indent) const; }; @@ -58,7 +60,7 @@ private: public: /// Print the location lists found within the debug_loc section. - void dump(raw_ostream &OS, const MCRegisterInfo *RegInfo, + void dump(raw_ostream &OS, const MCRegisterInfo *RegInfo, DIDumpOptions DumpOpts, Optional Offset) const; /// Parse the debug_loc section accessible via the 'data' parameter using the @@ -68,25 +70,29 @@ public: /// Return the location list at the given offset or nullptr. 
LocationList const *getLocationListAtOffset(uint64_t Offset) const; - Optional parseOneLocationList(DWARFDataExtractor Data, - uint32_t *Offset); + Expected + parseOneLocationList(const DWARFDataExtractor &Data, uint64_t *Offset); }; class DWARFDebugLoclists { public: struct Entry { uint8_t Kind; + uint64_t Offset; uint64_t Value0; uint64_t Value1; - SmallVector Loc; + SmallVector Loc; + void dump(raw_ostream &OS, uint64_t &BaseAddr, bool IsLittleEndian, + unsigned AddressSize, const MCRegisterInfo *MRI, DWARFUnit *U, + DIDumpOptions DumpOpts, unsigned Indent, size_t MaxEncodingStringLength) const; }; struct LocationList { - unsigned Offset; + uint64_t Offset; SmallVector Entries; void dump(raw_ostream &OS, uint64_t BaseAddr, bool IsLittleEndian, unsigned AddressSize, const MCRegisterInfo *RegInfo, - DWARFUnit *U, unsigned Indent) const; + DWARFUnit *U, DIDumpOptions DumpOpts, unsigned Indent) const; }; private: @@ -99,15 +105,16 @@ private: bool IsLittleEndian; public: - void parse(DataExtractor data, unsigned Version); + void parse(DataExtractor data, uint64_t Offset, uint64_t EndOffset, uint16_t Version); void dump(raw_ostream &OS, uint64_t BaseAddr, const MCRegisterInfo *RegInfo, - Optional Offset) const; + DIDumpOptions DumpOpts, Optional Offset) const; /// Return the location list at the given offset or nullptr. LocationList const *getLocationListAtOffset(uint64_t Offset) const; - static Optional - parseOneLocationList(DataExtractor Data, unsigned *Offset, unsigned Version); + static Expected parseOneLocationList(const DataExtractor &Data, + uint64_t *Offset, + unsigned Version); }; } // end namespace llvm diff --git a/include/llvm/DebugInfo/DWARF/DWARFDebugPubTable.h b/include/llvm/DebugInfo/DWARF/DWARFDebugPubTable.h index 99e91ca90319..ae57306b90e1 100644 --- a/include/llvm/DebugInfo/DWARF/DWARFDebugPubTable.h +++ b/include/llvm/DebugInfo/DWARF/DWARFDebugPubTable.h @@ -25,7 +25,7 @@ class DWARFDebugPubTable { public: struct Entry { /// Section offset from the beginning of the compilation unit. - uint32_t SecOffset; + uint64_t SecOffset; /// An entry of the various gnu_pub* debug sections. dwarf::PubIndexEntryDescriptor Descriptor; @@ -50,7 +50,7 @@ public: /// The offset from the beginning of the .debug_info section of the /// compilation unit header referenced by the set. - uint32_t Offset; + uint64_t Offset; /// The size in bytes of the contents of the .debug_info section generated /// to represent that compilation unit. diff --git a/include/llvm/DebugInfo/DWARF/DWARFDebugRangeList.h b/include/llvm/DebugInfo/DWARF/DWARFDebugRangeList.h index a66f60292343..2f72c642a2d5 100644 --- a/include/llvm/DebugInfo/DWARF/DWARFDebugRangeList.h +++ b/include/llvm/DebugInfo/DWARF/DWARFDebugRangeList.h @@ -53,14 +53,13 @@ public: assert(AddressSize == 4 || AddressSize == 8); if (AddressSize == 4) return StartAddress == -1U; - else - return StartAddress == -1ULL; + return StartAddress == -1ULL; } }; private: /// Offset in .debug_ranges section. 
- uint32_t Offset; + uint64_t Offset; uint8_t AddressSize; std::vector Entries; @@ -69,7 +68,7 @@ public: void clear(); void dump(raw_ostream &OS) const; - Error extract(const DWARFDataExtractor &data, uint32_t *offset_ptr); + Error extract(const DWARFDataExtractor &data, uint64_t *offset_ptr); const std::vector &getEntries() { return Entries; } /// getAbsoluteRanges - Returns absolute address ranges defined by this range diff --git a/include/llvm/DebugInfo/DWARF/DWARFDebugRnglists.h b/include/llvm/DebugInfo/DWARF/DWARFDebugRnglists.h index 167ddde3ec3d..952c41e188c7 100644 --- a/include/llvm/DebugInfo/DWARF/DWARFDebugRnglists.h +++ b/include/llvm/DebugInfo/DWARF/DWARFDebugRnglists.h @@ -34,7 +34,7 @@ struct RangeListEntry : public DWARFListEntryBase { uint64_t Value0; uint64_t Value1; - Error extract(DWARFDataExtractor Data, uint32_t End, uint32_t *OffsetPtr); + Error extract(DWARFDataExtractor Data, uint64_t End, uint64_t *OffsetPtr); void dump(raw_ostream &OS, uint8_t AddrSize, uint8_t MaxEncodingStringLength, uint64_t &CurrentBase, DIDumpOptions DumpOpts, llvm::function_ref(uint32_t)> diff --git a/include/llvm/DebugInfo/DWARF/DWARFDie.h b/include/llvm/DebugInfo/DWARF/DWARFDie.h index 21e68f983bb3..f7f08b4a499d 100644 --- a/include/llvm/DebugInfo/DWARF/DWARFDie.h +++ b/include/llvm/DebugInfo/DWARF/DWARFDie.h @@ -63,7 +63,7 @@ public: /// Get the absolute offset into the debug info or types section. /// /// \returns the DIE offset or -1U if invalid. - uint32_t getOffset() const { + uint64_t getOffset() const { assert(isValid() && "must check validity prior to calling"); return Die->getOffset(); } diff --git a/include/llvm/DebugInfo/DWARF/DWARFExpression.h b/include/llvm/DebugInfo/DWARF/DWARFExpression.h index f066dd58d606..456d9df957ad 100644 --- a/include/llvm/DebugInfo/DWARF/DWARFExpression.h +++ b/include/llvm/DebugInfo/DWARF/DWARFExpression.h @@ -77,18 +77,18 @@ public: uint8_t Opcode; ///< The Op Opcode, DW_OP_. 
Description Desc; bool Error; - uint32_t EndOffset; + uint64_t EndOffset; uint64_t Operands[2]; - uint32_t OperandEndOffsets[2]; + uint64_t OperandEndOffsets[2]; public: Description &getDescription() { return Desc; } uint8_t getCode() { return Opcode; } uint64_t getRawOperand(unsigned Idx) { return Operands[Idx]; } - uint32_t getOperandEndOffset(unsigned Idx) { return OperandEndOffsets[Idx]; } - uint32_t getEndOffset() { return EndOffset; } + uint64_t getOperandEndOffset(unsigned Idx) { return OperandEndOffsets[Idx]; } + uint64_t getEndOffset() { return EndOffset; } bool extract(DataExtractor Data, uint16_t Version, uint8_t AddressSize, - uint32_t Offset); + uint64_t Offset); bool isError() { return Error; } bool print(raw_ostream &OS, const DWARFExpression *Expr, const MCRegisterInfo *RegInfo, DWARFUnit *U, bool isEH); @@ -101,9 +101,9 @@ public: Operation> { friend class DWARFExpression; const DWARFExpression *Expr; - uint32_t Offset; + uint64_t Offset; Operation Op; - iterator(const DWARFExpression *Expr, uint32_t Offset) + iterator(const DWARFExpression *Expr, uint64_t Offset) : Expr(Expr), Offset(Offset) { Op.Error = Offset >= Expr->Data.getData().size() || diff --git a/include/llvm/DebugInfo/DWARF/DWARFFormValue.h b/include/llvm/DebugInfo/DWARF/DWARFFormValue.h index 731e71ed9eae..6fec6fcb6b34 100644 --- a/include/llvm/DebugInfo/DWARF/DWARFFormValue.h +++ b/include/llvm/DebugInfo/DWARF/DWARFFormValue.h @@ -70,7 +70,7 @@ public: static DWARFFormValue createFromBlockValue(dwarf::Form F, ArrayRef D); static DWARFFormValue createFromUnit(dwarf::Form F, const DWARFUnit *Unit, - uint32_t *OffsetPtr); + uint64_t *OffsetPtr); dwarf::Form getForm() const { return Form; } uint64_t getRawUValue() const { return Value.uval; } @@ -87,12 +87,12 @@ public: /// in \p FormParams is needed to interpret some forms. The optional /// \p Context and \p Unit allows extracting information if the form refers /// to other sections (e.g., .debug_str). - bool extractValue(const DWARFDataExtractor &Data, uint32_t *OffsetPtr, + bool extractValue(const DWARFDataExtractor &Data, uint64_t *OffsetPtr, dwarf::FormParams FormParams, const DWARFContext *Context = nullptr, const DWARFUnit *Unit = nullptr); - bool extractValue(const DWARFDataExtractor &Data, uint32_t *OffsetPtr, + bool extractValue(const DWARFDataExtractor &Data, uint64_t *OffsetPtr, dwarf::FormParams FormParams, const DWARFUnit *U) { return extractValue(Data, OffsetPtr, FormParams, nullptr, U); } @@ -128,7 +128,7 @@ public: /// \param OffsetPtr A reference to the offset that will be updated. /// \param Params DWARF parameters to help interpret forms. /// \returns true on success, false if the form was not skipped. - bool skipValue(DataExtractor DebugInfoData, uint32_t *OffsetPtr, + bool skipValue(DataExtractor DebugInfoData, uint64_t *OffsetPtr, const dwarf::FormParams Params) const { return DWARFFormValue::skipValue(Form, DebugInfoData, OffsetPtr, Params); } @@ -144,7 +144,7 @@ public: /// \param FormParams DWARF parameters to help interpret forms. /// \returns true on success, false if the form was not skipped. 
static bool skipValue(dwarf::Form Form, DataExtractor DebugInfoData, - uint32_t *OffsetPtr, + uint64_t *OffsetPtr, const dwarf::FormParams FormParams); private: diff --git a/include/llvm/DebugInfo/DWARF/DWARFListTable.h b/include/llvm/DebugInfo/DWARF/DWARFListTable.h index a1ea69b040f0..496fdb2477f9 100644 --- a/include/llvm/DebugInfo/DWARF/DWARFListTable.h +++ b/include/llvm/DebugInfo/DWARF/DWARFListTable.h @@ -26,7 +26,7 @@ namespace llvm { /// entries. struct DWARFListEntryBase { /// The offset at which the entry is located in the section. - uint32_t Offset; + uint64_t Offset; /// The DWARF encoding (DW_RLE_* or DW_LLE_*). uint8_t EntryKind; /// The index of the section this entry belongs to. @@ -46,8 +46,8 @@ public: const ListEntries &getEntries() const { return Entries; } bool empty() const { return Entries.empty(); } void clear() { Entries.clear(); } - Error extract(DWARFDataExtractor Data, uint32_t HeaderOffset, uint32_t End, - uint32_t *OffsetPtr, StringRef SectionName, + Error extract(DWARFDataExtractor Data, uint64_t HeaderOffset, uint64_t End, + uint64_t *OffsetPtr, StringRef SectionName, StringRef ListStringName); }; @@ -57,7 +57,7 @@ class DWARFListTableHeader { struct Header { /// The total length of the entries for this table, not including the length /// field itself. - uint32_t Length = 0; + uint64_t Length = 0; /// The DWARF version number. uint16_t Version; /// The size in bytes of an address on the target architecture. For @@ -75,12 +75,12 @@ class DWARFListTableHeader { /// The offset table, which contains offsets to the individual list entries. /// It is used by forms such as DW_FORM_rnglistx. /// FIXME: Generate the table and use the appropriate forms. - std::vector Offsets; + std::vector Offsets; /// The table's format, either DWARF32 or DWARF64. dwarf::DwarfFormat Format; /// The offset at which the header (and hence the table) is located within /// its section. - uint32_t HeaderOffset; + uint64_t HeaderOffset; /// The name of the section the list is located in. StringRef SectionName; /// A characterization of the list for dumping purposes, e.g. "range" or @@ -95,28 +95,40 @@ public: HeaderData = {}; Offsets.clear(); } - uint32_t getHeaderOffset() const { return HeaderOffset; } + uint64_t getHeaderOffset() const { return HeaderOffset; } uint8_t getAddrSize() const { return HeaderData.AddrSize; } - uint32_t getLength() const { return HeaderData.Length; } + uint64_t getLength() const { return HeaderData.Length; } uint16_t getVersion() const { return HeaderData.Version; } StringRef getSectionName() const { return SectionName; } StringRef getListTypeString() const { return ListTypeString; } dwarf::DwarfFormat getFormat() const { return Format; } + /// Return the size of the table header including the length but not including + /// the offsets. + static uint8_t getHeaderSize(dwarf::DwarfFormat Format) { + switch (Format) { + case dwarf::DwarfFormat::DWARF32: + return 12; + case dwarf::DwarfFormat::DWARF64: + return 20; + } + llvm_unreachable("Invalid DWARF format (expected DWARF32 or DWARF64"); + } + void dump(raw_ostream &OS, DIDumpOptions DumpOpts = {}) const; - Optional getOffsetEntry(uint32_t Index) const { + Optional getOffsetEntry(uint32_t Index) const { if (Index < Offsets.size()) return Offsets[Index]; return None; } /// Extract the table header and the array of offsets. 
- Error extract(DWARFDataExtractor Data, uint32_t *OffsetPtr); + Error extract(DWARFDataExtractor Data, uint64_t *OffsetPtr); /// Returns the length of the table, including the length field, or 0 if the /// length has not been determined (e.g. because the table has not yet been /// parsed, or there was a problem in parsing). - uint32_t length() const; + uint64_t length() const; }; /// A class representing a table of lists as specified in the DWARF v5 @@ -128,7 +140,7 @@ template class DWARFListTableBase { DWARFListTableHeader Header; /// A mapping between file offsets and lists. It is used to find a particular /// list based on an offset (obtained from DW_AT_ranges, for example). - std::map ListMap; + std::map ListMap; /// This string is displayed as a heading before the list is dumped /// (e.g. "ranges:"). StringRef HeaderString; @@ -144,17 +156,18 @@ public: ListMap.clear(); } /// Extract the table header and the array of offsets. - Error extractHeaderAndOffsets(DWARFDataExtractor Data, uint32_t *OffsetPtr) { + Error extractHeaderAndOffsets(DWARFDataExtractor Data, uint64_t *OffsetPtr) { return Header.extract(Data, OffsetPtr); } /// Extract an entire table, including all list entries. - Error extract(DWARFDataExtractor Data, uint32_t *OffsetPtr); + Error extract(DWARFDataExtractor Data, uint64_t *OffsetPtr); /// Look up a list based on a given offset. Extract it and enter it into the /// list map if necessary. - Expected findList(DWARFDataExtractor Data, uint32_t Offset); + Expected findList(DWARFDataExtractor Data, uint64_t Offset); - uint32_t getHeaderOffset() const { return Header.getHeaderOffset(); } + uint64_t getHeaderOffset() const { return Header.getHeaderOffset(); } uint8_t getAddrSize() const { return Header.getAddrSize(); } + dwarf::DwarfFormat getFormat() const { return Header.getFormat(); } void dump(raw_ostream &OS, llvm::function_ref(uint32_t)> @@ -162,37 +175,31 @@ public: DIDumpOptions DumpOpts = {}) const; /// Return the contents of the offset entry designated by a given index. - Optional getOffsetEntry(uint32_t Index) const { + Optional getOffsetEntry(uint32_t Index) const { return Header.getOffsetEntry(Index); } /// Return the size of the table header including the length but not including /// the offsets. This is dependent on the table format, which is unambiguously /// derived from parsing the table. 
uint8_t getHeaderSize() const { - switch (Header.getFormat()) { - case dwarf::DwarfFormat::DWARF32: - return 12; - case dwarf::DwarfFormat::DWARF64: - return 20; - } - llvm_unreachable("Invalid DWARF format (expected DWARF32 or DWARF64"); + return DWARFListTableHeader::getHeaderSize(getFormat()); } - uint32_t length() { return Header.length(); } + uint64_t length() { return Header.length(); } }; template Error DWARFListTableBase::extract(DWARFDataExtractor Data, - uint32_t *OffsetPtr) { + uint64_t *OffsetPtr) { clear(); if (Error E = extractHeaderAndOffsets(Data, OffsetPtr)) return E; Data.setAddressSize(Header.getAddrSize()); - uint32_t End = getHeaderOffset() + Header.length(); + uint64_t End = getHeaderOffset() + Header.length(); while (*OffsetPtr < End) { DWARFListType CurrentList; - uint32_t Off = *OffsetPtr; + uint64_t Off = *OffsetPtr; if (Error E = CurrentList.extract(Data, getHeaderOffset(), End, OffsetPtr, Header.getSectionName(), Header.getListTypeString())) @@ -208,13 +215,13 @@ Error DWARFListTableBase::extract(DWARFDataExtractor Data, template Error DWARFListType::extract(DWARFDataExtractor Data, - uint32_t HeaderOffset, uint32_t End, - uint32_t *OffsetPtr, + uint64_t HeaderOffset, uint64_t End, + uint64_t *OffsetPtr, StringRef SectionName, StringRef ListTypeString) { if (*OffsetPtr < HeaderOffset || *OffsetPtr >= End) return createStringError(errc::invalid_argument, - "invalid %s list offset 0x%" PRIx32, + "invalid %s list offset 0x%" PRIx64, ListTypeString.data(), *OffsetPtr); Entries.clear(); while (*OffsetPtr < End) { @@ -227,7 +234,7 @@ Error DWARFListType::extract(DWARFDataExtractor Data, } return createStringError(errc::illegal_byte_sequence, "no end of list marker detected at end of %s table " - "starting at offset 0x%" PRIx32, + "starting at offset 0x%" PRIx64, SectionName.data(), HeaderOffset); } @@ -261,15 +268,15 @@ void DWARFListTableBase::dump( template Expected DWARFListTableBase::findList(DWARFDataExtractor Data, - uint32_t Offset) { + uint64_t Offset) { auto Entry = ListMap.find(Offset); if (Entry != ListMap.end()) return Entry->second; // Extract the list from the section and enter it into the list map. 
DWARFListType List; - uint32_t End = getHeaderOffset() + Header.length(); - uint32_t StartingOffset = Offset; + uint64_t End = getHeaderOffset() + Header.length(); + uint64_t StartingOffset = Offset; if (Error E = List.extract(Data, getHeaderOffset(), End, &Offset, Header.getSectionName(), Header.getListTypeString())) diff --git a/include/llvm/DebugInfo/DWARF/DWARFObject.h b/include/llvm/DebugInfo/DWARF/DWARFObject.h index 1bba74a25d0e..88fe3f434edc 100644 --- a/include/llvm/DebugInfo/DWARF/DWARFObject.h +++ b/include/llvm/DebugInfo/DWARF/DWARFObject.h @@ -39,20 +39,20 @@ public: virtual StringRef getAbbrevSection() const { return ""; } virtual const DWARFSection &getLocSection() const { return Dummy; } virtual const DWARFSection &getLoclistsSection() const { return Dummy; } - virtual StringRef getARangeSection() const { return ""; } - virtual StringRef getDebugFrameSection() const { return ""; } - virtual StringRef getEHFrameSection() const { return ""; } + virtual StringRef getArangesSection() const { return ""; } + virtual const DWARFSection &getFrameSection() const { return Dummy; } + virtual const DWARFSection &getEHFrameSection() const { return Dummy; } virtual const DWARFSection &getLineSection() const { return Dummy; } - virtual StringRef getLineStringSection() const { return ""; } - virtual StringRef getStringSection() const { return ""; } - virtual const DWARFSection &getRangeSection() const { return Dummy; } + virtual StringRef getLineStrSection() const { return ""; } + virtual StringRef getStrSection() const { return ""; } + virtual const DWARFSection &getRangesSection() const { return Dummy; } virtual const DWARFSection &getRnglistsSection() const { return Dummy; } virtual StringRef getMacinfoSection() const { return ""; } - virtual const DWARFSection &getPubNamesSection() const { return Dummy; } - virtual const DWARFSection &getPubTypesSection() const { return Dummy; } - virtual const DWARFSection &getGnuPubNamesSection() const { return Dummy; } - virtual const DWARFSection &getGnuPubTypesSection() const { return Dummy; } - virtual const DWARFSection &getStringOffsetSection() const { return Dummy; } + virtual const DWARFSection &getPubnamesSection() const { return Dummy; } + virtual const DWARFSection &getPubtypesSection() const { return Dummy; } + virtual const DWARFSection &getGnuPubnamesSection() const { return Dummy; } + virtual const DWARFSection &getGnuPubtypesSection() const { return Dummy; } + virtual const DWARFSection &getStrOffsetsSection() const { return Dummy; } virtual void forEachInfoDWOSections(function_ref F) const {} virtual void @@ -60,11 +60,11 @@ public: virtual StringRef getAbbrevDWOSection() const { return ""; } virtual const DWARFSection &getLineDWOSection() const { return Dummy; } virtual const DWARFSection &getLocDWOSection() const { return Dummy; } - virtual StringRef getStringDWOSection() const { return ""; } - virtual const DWARFSection &getStringOffsetDWOSection() const { + virtual StringRef getStrDWOSection() const { return ""; } + virtual const DWARFSection &getStrOffsetsDWOSection() const { return Dummy; } - virtual const DWARFSection &getRangeDWOSection() const { return Dummy; } + virtual const DWARFSection &getRangesDWOSection() const { return Dummy; } virtual const DWARFSection &getRnglistsDWOSection() const { return Dummy; } virtual const DWARFSection &getAddrSection() const { return Dummy; } virtual const DWARFSection &getAppleNamesSection() const { return Dummy; } @@ -72,7 +72,7 @@ public: virtual const DWARFSection 
&getAppleNamespacesSection() const { return Dummy; } - virtual const DWARFSection &getDebugNamesSection() const { return Dummy; } + virtual const DWARFSection &getNamesSection() const { return Dummy; } virtual const DWARFSection &getAppleObjCSection() const { return Dummy; } virtual StringRef getCUIndexSection() const { return ""; } virtual StringRef getGdbIndexSection() const { return ""; } diff --git a/include/llvm/DebugInfo/DWARF/DWARFTypeUnit.h b/include/llvm/DebugInfo/DWARF/DWARFTypeUnit.h index 90d89375fd35..c95bdcbd8a43 100644 --- a/include/llvm/DebugInfo/DWARF/DWARFTypeUnit.h +++ b/include/llvm/DebugInfo/DWARF/DWARFTypeUnit.h @@ -34,7 +34,7 @@ public: LS, LE, IsDWO, UnitVector) {} uint64_t getTypeHash() const { return getHeader().getTypeHash(); } - uint32_t getTypeOffset() const { return getHeader().getTypeOffset(); } + uint64_t getTypeOffset() const { return getHeader().getTypeOffset(); } void dump(raw_ostream &OS, DIDumpOptions DumpOpts = {}) override; // Enable LLVM-style RTTI. diff --git a/include/llvm/DebugInfo/DWARF/DWARFUnit.h b/include/llvm/DebugInfo/DWARF/DWARFUnit.h index f9f90db31890..51de114a3506 100644 --- a/include/llvm/DebugInfo/DWARF/DWARFUnit.h +++ b/include/llvm/DebugInfo/DWARF/DWARFUnit.h @@ -45,7 +45,7 @@ class DWARFUnit; /// parse the header before deciding what specific kind of unit to construct. class DWARFUnitHeader { // Offset within section. - uint32_t Offset = 0; + uint64_t Offset = 0; // Version, address size, and DWARF format. dwarf::FormParams FormParams; uint64_t Length = 0; @@ -56,7 +56,7 @@ class DWARFUnitHeader { // For type units only. uint64_t TypeHash = 0; - uint32_t TypeOffset = 0; + uint64_t TypeOffset = 0; // For v5 split or skeleton compile units only. Optional DWOId; @@ -70,10 +70,10 @@ class DWARFUnitHeader { public: /// Parse a unit header from \p debug_info starting at \p offset_ptr. bool extract(DWARFContext &Context, const DWARFDataExtractor &debug_info, - uint32_t *offset_ptr, DWARFSectionKind Kind = DW_SECT_INFO, + uint64_t *offset_ptr, DWARFSectionKind Kind = DW_SECT_INFO, const DWARFUnitIndex *Index = nullptr, const DWARFUnitIndex::Entry *Entry = nullptr); - uint32_t getOffset() const { return Offset; } + uint64_t getOffset() const { return Offset; } const dwarf::FormParams &getFormParams() const { return FormParams; } uint16_t getVersion() const { return FormParams.Version; } dwarf::DwarfFormat getFormat() const { return FormParams.Format; } @@ -91,16 +91,17 @@ public: } const DWARFUnitIndex::Entry *getIndexEntry() const { return IndexEntry; } uint64_t getTypeHash() const { return TypeHash; } - uint32_t getTypeOffset() const { return TypeOffset; } + uint64_t getTypeOffset() const { return TypeOffset; } uint8_t getUnitType() const { return UnitType; } bool isTypeUnit() const { return UnitType == dwarf::DW_UT_type || UnitType == dwarf::DW_UT_split_type; } uint8_t getSize() const { return Size; } - uint32_t getNextUnitOffset() const { - return Offset + Length + - (FormParams.Format == llvm::dwarf::DwarfFormat::DWARF64 ? 4 : 0) + - FormParams.getDwarfOffsetByteSize(); + uint8_t getUnitLengthFieldByteSize() const { + return dwarf::getUnitLengthFieldByteSize(FormParams.Format); + } + uint64_t getNextUnitOffset() const { + return Offset + Length + getUnitLengthFieldByteSize(); } }; @@ -110,7 +111,7 @@ const DWARFUnitIndex &getDWARFUnitIndex(DWARFContext &Context, /// Describe a collection of units. Intended to hold all units either from /// .debug_info and .debug_types, or from .debug_info.dwo and .debug_types.dwo. 
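The DWARFUnitVector declared next maps a 64-bit section offset back to the unit containing it. As a rough illustration of that kind of lookup (not the actual implementation, which works on the parsed units themselves), a binary search over sorted unit ranges might look like this:

#include <algorithm>
#include <cstdint>
#include <vector>

struct UnitRange {
  uint64_t Start;      // Offset of the unit header within the section.
  uint64_t NextOffset; // Offset of the following unit (start + total size).
};

// Illustrative only: find the unit whose half-open range [Start, NextOffset)
// contains Offset, given units sorted by Start.
static const UnitRange *findUnitForOffset(const std::vector<UnitRange> &Units,
                                          uint64_t Offset) {
  auto It = std::upper_bound(
      Units.begin(), Units.end(), Offset,
      [](uint64_t Off, const UnitRange &U) { return Off < U.NextOffset; });
  if (It != Units.end() && Offset >= It->Start)
    return &*It;
  return nullptr;
}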
class DWARFUnitVector final : public SmallVector, 1> { - std::function(uint32_t, DWARFSectionKind, + std::function(uint64_t, DWARFSectionKind, const DWARFSection *, const DWARFUnitIndex::Entry *)> Parser; @@ -121,7 +122,7 @@ public: using iterator = typename UnitVector::iterator; using iterator_range = llvm::iterator_range; - DWARFUnit *getUnitForOffset(uint32_t Offset) const; + DWARFUnit *getUnitForOffset(uint64_t Offset) const; DWARFUnit *getUnitForIndexEntry(const DWARFUnitIndex::Entry &E); /// Read units from a .debug_info or .debug_types section. Calls made @@ -197,7 +198,7 @@ class DWARFUnit { DWARFUnitHeader Header; const DWARFDebugAbbrev *Abbrev; const DWARFSection *RangeSection; - uint32_t RangeSectionBase; + uint64_t RangeSectionBase; /// We either keep track of the location list section or its data, depending /// on whether we are handling a split DWARF section or not. union { @@ -275,7 +276,7 @@ public: const DWARFSection &getInfoSection() const { return InfoSection; } const DWARFSection *getLocSection() const { return LocSection; } StringRef getLocSectionData() const { return LocSectionData; } - uint32_t getOffset() const { return Header.getOffset(); } + uint64_t getOffset() const { return Header.getOffset(); } const dwarf::FormParams &getFormParams() const { return Header.getFormParams(); } @@ -285,10 +286,10 @@ public: uint8_t getDwarfOffsetByteSize() const { return Header.getDwarfOffsetByteSize(); } - uint32_t getLength() const { return Header.getLength(); } + uint64_t getLength() const { return Header.getLength(); } uint8_t getUnitType() const { return Header.getUnitType(); } bool isTypeUnit() const { return Header.isTypeUnit(); } - uint32_t getNextUnitOffset() const { return Header.getNextUnitOffset(); } + uint64_t getNextUnitOffset() const { return Header.getNextUnitOffset(); } const DWARFSection &getLineSection() const { return LineSection; } StringRef getStringSection() const { return StringSection; } const DWARFSection &getStringOffsetSection() const { @@ -303,7 +304,7 @@ public: /// Recursively update address to Die map. void updateAddressDieMap(DWARFDie Die); - void setRangesSection(const DWARFSection *RS, uint32_t Base) { + void setRangesSection(const DWARFSection *RS, uint64_t Base) { RangeSection = RS; RangeSectionBase = Base; } @@ -322,7 +323,7 @@ public: /// .debug_ranges section. If the extraction is unsuccessful, an error /// is returned. Successful extraction requires that the compile unit /// has already been extracted. - Error extractRangeList(uint32_t RangeListOffset, + Error extractRangeList(uint64_t RangeListOffset, DWARFDebugRangeList &RangeList) const; void clear(); @@ -405,7 +406,7 @@ public: /// Return a vector of address ranges resulting from a (possibly encoded) /// range list starting at a given offset in the appropriate ranges section. - Expected findRnglistFromOffset(uint32_t Offset); + Expected findRnglistFromOffset(uint64_t Offset); /// Return a vector of address ranges retrieved from an encoded range /// list whose offset is found via a table lookup given an index (DWARF v5 @@ -415,7 +416,7 @@ public: /// Return a rangelist's offset based on an index. The index designates /// an entry in the rangelist table's offset array and is supplied by /// DW_FORM_rnglistx. - Optional getRnglistOffset(uint32_t Index) { + Optional getRnglistOffset(uint32_t Index) { if (RngListTable) return RngListTable->getOffsetEntry(Index); return None; @@ -470,7 +471,7 @@ public: /// unit's DIE vector. 
/// /// The unit needs to have its DIEs extracted for this method to work. - DWARFDie getDIEForOffset(uint32_t Offset) { + DWARFDie getDIEForOffset(uint64_t Offset) { extractDIEsIfNeeded(false); assert(!DieArray.empty()); auto It = @@ -495,15 +496,19 @@ public: } virtual void dump(raw_ostream &OS, DIDumpOptions DumpOpts) = 0; + + Error tryExtractDIEsIfNeeded(bool CUDieOnly); + private: /// Size in bytes of the .debug_info data associated with this compile unit. size_t getDebugInfoSize() const { - return Header.getLength() + 4 - getHeaderSize(); + return Header.getLength() + Header.getUnitLengthFieldByteSize() - + getHeaderSize(); } /// extractDIEsIfNeeded - Parses a compile unit and indexes its DIEs if it - /// hasn't already been done. Returns the number of DIEs parsed at this call. - size_t extractDIEsIfNeeded(bool CUDieOnly); + /// hasn't already been done + void extractDIEsIfNeeded(bool CUDieOnly); /// extractDIEsToVector - Appends all parsed DIEs to a vector. void extractDIEsToVector(bool AppendCUDie, bool AppendNonCUDIEs, diff --git a/include/llvm/DebugInfo/DWARF/DWARFUnitIndex.h b/include/llvm/DebugInfo/DWARF/DWARFUnitIndex.h index fc8c707c512e..684103aac2fc 100644 --- a/include/llvm/DebugInfo/DWARF/DWARFUnitIndex.h +++ b/include/llvm/DebugInfo/DWARF/DWARFUnitIndex.h @@ -37,7 +37,7 @@ class DWARFUnitIndex { uint32_t NumUnits; uint32_t NumBuckets = 0; - bool parse(DataExtractor IndexData, uint32_t *OffsetPtr); + bool parse(DataExtractor IndexData, uint64_t *OffsetPtr); void dump(raw_ostream &OS) const; }; diff --git a/include/llvm/DebugInfo/DWARF/DWARFVerifier.h b/include/llvm/DebugInfo/DWARF/DWARFVerifier.h index f1268f220272..a4a3a11d441b 100644 --- a/include/llvm/DebugInfo/DWARF/DWARFVerifier.h +++ b/include/llvm/DebugInfo/DWARF/DWARFVerifier.h @@ -94,7 +94,7 @@ private: /// A map that tracks all references (converted absolute references) so we /// can verify each reference points to a valid DIE and not an offset that /// lies between to valid DIEs. - std::map> ReferenceToDIEOffsets; + std::map> ReferenceToDIEOffsets; uint32_t NumDebugLineErrors = 0; // Used to relax some checks that do not currently work portably bool IsObjectFile; @@ -138,7 +138,7 @@ private: /// /// \returns true if the header is verified successfully, false otherwise. bool verifyUnitHeader(const DWARFDataExtractor DebugInfoData, - uint32_t *Offset, unsigned UnitIndex, uint8_t &UnitType, + uint64_t *Offset, unsigned UnitIndex, uint8_t &UnitType, bool &isUnitDWARF64); /// Verifies the header of a unit in a .debug_info or .debug_types section. diff --git a/include/llvm/DebugInfo/GSYM/FileEntry.h b/include/llvm/DebugInfo/GSYM/FileEntry.h index 228b4efa0656..49e7fc9c4291 100644 --- a/include/llvm/DebugInfo/GSYM/FileEntry.h +++ b/include/llvm/DebugInfo/GSYM/FileEntry.h @@ -1,9 +1,8 @@ //===- FileEntry.h ----------------------------------------------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/include/llvm/DebugInfo/GSYM/FileWriter.h b/include/llvm/DebugInfo/GSYM/FileWriter.h new file mode 100644 index 000000000000..cd568765a4f2 --- /dev/null +++ b/include/llvm/DebugInfo/GSYM/FileWriter.h @@ -0,0 +1,124 @@ +//===- FileWriter.h ---------------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_DEBUGINFO_GSYM_FILEWRITER_H +#define LLVM_DEBUGINFO_GSYM_FILEWRITER_H + +#include "llvm/ADT/ArrayRef.h" +#include "llvm/Support/Endian.h" + +#include +#include +#include + +namespace llvm { +class raw_pwrite_stream; + +namespace gsym { + +/// A simplified binary data writer class that doesn't require targets, target +/// definitions, architectures, or require any other optional compile time +/// libraries to be enabled via the build process. This class needs the ability +/// to seek to different spots in the binary stream that is produces to fixup +/// offsets and sizes. +class FileWriter { + llvm::raw_pwrite_stream &OS; + llvm::support::endianness ByteOrder; +public: + FileWriter(llvm::raw_pwrite_stream &S, llvm::support::endianness B) + : OS(S), ByteOrder(B) {} + ~FileWriter(); + /// Write a single uint8_t value into the stream at the current file + /// position. + /// + /// \param Value The value to write into the stream. + void writeU8(uint8_t Value); + + /// Write a single uint16_t value into the stream at the current file + /// position. The value will be byte swapped if needed to match the byte + /// order specified during construction. + /// + /// \param Value The value to write into the stream. + void writeU16(uint16_t Value); + + /// Write a single uint32_t value into the stream at the current file + /// position. The value will be byte swapped if needed to match the byte + /// order specified during construction. + /// + /// \param Value The value to write into the stream. + void writeU32(uint32_t Value); + + /// Write a single uint64_t value into the stream at the current file + /// position. The value will be byte swapped if needed to match the byte + /// order specified during construction. + /// + /// \param Value The value to write into the stream. + void writeU64(uint64_t Value); + + /// Write the value into the stream encoded using signed LEB128 at the + /// current file position. + /// + /// \param Value The value to write into the stream. + void writeSLEB(int64_t Value); + + /// Write the value into the stream encoded using unsigned LEB128 at the + /// current file position. + /// + /// \param Value The value to write into the stream. + void writeULEB(uint64_t Value); + + /// Write an array of uint8_t values into the stream at the current file + /// position. + /// + /// \param Data An array of values to write into the stream. + void writeData(llvm::ArrayRef Data); + + /// Write a NULL terminated C string into the stream at the current file + /// position. The entire contents of Str will be written into the steam at + /// the current file position and then an extra NULL termation byte will be + /// written. 
It is up to the user to ensure that Str doesn't contain any NULL + /// characters unless the additional NULL characters are desired. + /// + /// \param Str The value to write into the stream. + void writeNullTerminated(llvm::StringRef Str); + + /// Fixup a uint32_t value at the specified offset in the stream. This + /// function will save the current file position, seek to the specified + /// offset, overwrite the data using Value, and then restore the file + /// position to the previous file position. + /// + /// \param Value The value to write into the stream. + /// \param Offset The offset at which to write the Value within the stream. + void fixup32(uint32_t Value, uint64_t Offset); + + /// Pad with zeroes at the current file position until the current file + /// position matches the specified alignment. + /// + /// \param Align An integer speciying the desired alignment. This does not + /// need to be a power of two. + void alignTo(size_t Align); + + /// Return the current offset within the file. + /// + /// \return The unsigned offset from the start of the file of the current + /// file position. + uint64_t tell(); + + llvm::raw_pwrite_stream &get_stream() { + return OS; + } + +private: + FileWriter(const FileWriter &rhs) = delete; + void operator=(const FileWriter &rhs) = delete; +}; + +} // namespace gsym +} // namespace llvm + +#endif // #ifndef LLVM_DEBUGINFO_GSYM_FILEWRITER_H diff --git a/include/llvm/DebugInfo/GSYM/FunctionInfo.h b/include/llvm/DebugInfo/GSYM/FunctionInfo.h index eedb1e638fd1..63e18bb2ecd5 100644 --- a/include/llvm/DebugInfo/GSYM/FunctionInfo.h +++ b/include/llvm/DebugInfo/GSYM/FunctionInfo.h @@ -1,17 +1,17 @@ //===- FunctionInfo.h -------------------------------------------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// #ifndef LLVM_DEBUGINFO_GSYM_FUNCTIONINFO_H #define LLVM_DEBUGINFO_GSYM_FUNCTIONINFO_H +#include "llvm/ADT/Optional.h" #include "llvm/DebugInfo/GSYM/InlineInfo.h" -#include "llvm/DebugInfo/GSYM/LineEntry.h" +#include "llvm/DebugInfo/GSYM/LineTable.h" #include "llvm/DebugInfo/GSYM/Range.h" #include "llvm/DebugInfo/GSYM/StringTable.h" #include @@ -21,41 +21,125 @@ namespace llvm { class raw_ostream; namespace gsym { -/// Function information in GSYM files encodes information for one -/// contiguous address range. The name of the function is encoded as -/// a string table offset and allows multiple functions with the same -/// name to share the name string in the string table. Line tables are -/// stored in a sorted vector of gsym::LineEntry objects and are split -/// into line tables for each function. If a function has a discontiguous -/// range, it will be split into two gsym::FunctionInfo objects. If the -/// function has inline functions, the information will be encoded in -/// the "Inline" member, see gsym::InlineInfo for more information. +/// Function information in GSYM files encodes information for one contiguous +/// address range. If a function has discontiguous address ranges, they will +/// need to be encoded using multiple FunctionInfo objects. 
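Before getting to the FunctionInfo encoding, here is a usage sketch for the FileWriter added above. It relies only on the interface declared in that header, plus the assumption that raw_svector_ostream can serve as the required raw_pwrite_stream; the size-then-fixup pattern is the one the GSYM encoders build on.

#include "llvm/ADT/SmallString.h"
#include "llvm/DebugInfo/GSYM/FileWriter.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

// Reserve a 32-bit size field, emit the payload, then patch in the number of
// bytes that follow the size field (payload plus alignment padding).
static void emitSizedBlob(gsym::FileWriter &O, ArrayRef<uint8_t> Payload) {
  const uint64_t SizeFieldOffset = O.tell();
  O.writeU32(0);        // Placeholder for the size field.
  O.writeData(Payload); // Payload bytes at the current file position.
  O.alignTo(4);         // Keep the following data 4-byte aligned.
  const uint64_t End = O.tell();
  O.fixup32(static_cast<uint32_t>(End - SizeFieldOffset - 4), SizeFieldOffset);
}

static void example() {
  SmallString<64> Buffer;
  raw_svector_ostream OS(Buffer); // A raw_pwrite_stream backed by memory.
  gsym::FileWriter FW(OS, support::endianness::little);
  const uint8_t Bytes[] = {1, 2, 3};
  emitSizedBlob(FW, Bytes);
}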
+/// +/// ENCODING +/// +/// The function information gets the function start address as an argument +/// to the FunctionInfo::decode(...) function. This information is calculated +/// from the GSYM header and an address offset from the GSYM address offsets +/// table. The encoded FunctionInfo information must be alinged to a 4 byte +/// boundary. +/// +/// The encoded data for a FunctionInfo starts with fixed data that all +/// function info objects have: +/// +/// ENCODING NAME DESCRIPTION +/// ========= =========== ==================================================== +/// uint32_t Size The size in bytes of this function. +/// uint32_t Name The string table offset of the function name. +/// +/// The optional data in a FunctionInfo object follows this fixed information +/// and consists of a stream of tuples that consist of: +/// +/// ENCODING NAME DESCRIPTION +/// ========= =========== ==================================================== +/// uint32_t InfoType An "InfoType" enumeration that describes the type +/// of optional data that is encoded. +/// uint32_t InfoLength The size in bytes of the encoded data that +/// immediately follows this length if this value is +/// greater than zero. +/// uint8_t[] InfoData Encoded bytes that represent the data for the +/// "InfoType". These bytes are only present if +/// "InfoLength" is greater than zero. +/// +/// The "InfoType" is an enumeration: +/// +/// enum InfoType { +/// EndOfList = 0u, +/// LineTableInfo = 1u, +/// InlineInfo = 2u +/// }; +/// +/// This stream of tuples is terminated by a "InfoType" whose value is +/// InfoType::EndOfList and a zero for "InfoLength". This signifies the end of +/// the optional information list. This format allows us to add new optional +/// information data to a FunctionInfo object over time and allows older +/// clients to still parse the format and skip over any data that they don't +/// understand or want to parse. +/// +/// So the function information encoding essientially looks like: +/// +/// struct { +/// uint32_t Size; +/// uint32_t Name; +/// struct { +/// uint32_t InfoType; +/// uint32_t InfoLength; +/// uint8_t InfoData[InfoLength]; +/// }[N]; +/// } +/// +/// Where "N" is the number of tuples. struct FunctionInfo { AddressRange Range; uint32_t Name; ///< String table offset in the string table. - std::vector Lines; - InlineInfo Inline; + llvm::Optional OptLineTable; + llvm::Optional Inline; FunctionInfo(uint64_t Addr = 0, uint64_t Size = 0, uint32_t N = 0) : Range(Addr, Addr + Size), Name(N) {} + /// Query if a FunctionInfo has rich debug info. + /// + /// \returns A bool that indicates if this object has something else than + /// range and name. When converting information from a symbol table and from + /// debug info, we might end up with multiple FunctionInfo objects for the + /// same range and we need to be able to tell which one is the better object + /// to use. bool hasRichInfo() const { - /// Returns whether we have something else than range and name. When - /// converting information from a symbol table and from debug info, we - /// might end up with multiple FunctionInfo objects for the same range - /// and we need to be able to tell which one is the better object to use. - return !Lines.empty() || Inline.isValid(); + return OptLineTable.hasValue() || Inline.hasValue(); } + /// Query if a FunctionInfo object is valid. 
+ /// + /// Address and size can be zero and there can be no line entries for a + /// symbol so the only indication this entry is valid is if the name is + /// not zero. This can happen when extracting information from symbol + /// tables that do not encode symbol sizes. In that case only the + /// address and name will be filled in. + /// + /// \returns A boolean indicating if this FunctionInfo is valid. bool isValid() const { - /// Address and size can be zero and there can be no line entries for a - /// symbol so the only indication this entry is valid is if the name is - /// not zero. This can happen when extracting information from symbol - /// tables that do not encode symbol sizes. In that case only the - /// address and name will be filled in. return Name != 0; } + /// Decode an object from a binary data stream. + /// + /// \param Data The binary stream to read the data from. This object must + /// have the data for the object starting at offset zero. The data + /// can contain more data than needed. + /// + /// \param BaseAddr The FunctionInfo's start address and will be used as the + /// base address when decoding any contained information like the line table + /// and the inline info. + /// + /// \returns An FunctionInfo or an error describing the issue that was + /// encountered during decoding. + static llvm::Expected decode(DataExtractor &Data, + uint64_t BaseAddr); + + /// Encode this object into FileWriter stream. + /// + /// \param O The binary stream to write the data to at the current file + /// position. + /// + /// \returns An error object that indicates failure or the offset of the + /// function info that was successfully written into the stream. + llvm::Expected encode(FileWriter &O) const; + uint64_t startAddress() const { return Range.Start; } uint64_t endAddress() const { return Range.End; } uint64_t size() const { return Range.size(); } @@ -66,14 +150,14 @@ struct FunctionInfo { void clear() { Range = {0, 0}; Name = 0; - Lines.clear(); - Inline.clear(); + OptLineTable = None; + Inline = None; } }; inline bool operator==(const FunctionInfo &LHS, const FunctionInfo &RHS) { return LHS.Range == RHS.Range && LHS.Name == RHS.Name && - LHS.Lines == RHS.Lines && LHS.Inline == RHS.Inline; + LHS.OptLineTable == RHS.OptLineTable && LHS.Inline == RHS.Inline; } inline bool operator!=(const FunctionInfo &LHS, const FunctionInfo &RHS) { return !(LHS == RHS); @@ -89,14 +173,10 @@ inline bool operator<(const FunctionInfo &LHS, const FunctionInfo &RHS) { return LHS.Range < RHS.Range; // Then sort by inline - if (LHS.Inline.isValid() != RHS.Inline.isValid()) - return RHS.Inline.isValid(); - - // If the number of lines is the same, then compare line table entries - if (LHS.Lines.size() == RHS.Lines.size()) - return LHS.Lines < RHS.Lines; - // Then sort by number of line table entries (more is better) - return LHS.Lines.size() < RHS.Lines.size(); + if (LHS.Inline.hasValue() != RHS.Inline.hasValue()) + return RHS.Inline.hasValue(); + + return LHS.OptLineTable < RHS.OptLineTable; } raw_ostream &operator<<(raw_ostream &OS, const FunctionInfo &R); diff --git a/include/llvm/DebugInfo/GSYM/GsymCreator.h b/include/llvm/DebugInfo/GSYM/GsymCreator.h new file mode 100644 index 000000000000..12c8187132ba --- /dev/null +++ b/include/llvm/DebugInfo/GSYM/GsymCreator.h @@ -0,0 +1,229 @@ +//===- GsymCreator.h --------------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_DEBUGINFO_GSYM_GSYMCREATOR_H +#define LLVM_DEBUGINFO_GSYM_GSYMCREATOR_H + +#include +#include +#include +#include +#include + +#include "llvm/ADT/ArrayRef.h" +#include "llvm/DebugInfo/GSYM/FileEntry.h" +#include "llvm/DebugInfo/GSYM/FunctionInfo.h" +#include "llvm/DebugInfo/GSYM/Range.h" +#include "llvm/MC/StringTableBuilder.h" +#include "llvm/Support/Endian.h" +#include "llvm/Support/Error.h" +#include "llvm/Support/Path.h" + +namespace llvm { + +namespace gsym { +class FileWriter; + +/// GsymCreator is used to emit GSYM data to a stand alone file or section +/// within a file. +/// +/// The GsymCreator is designed to be used in 3 stages: +/// - Create FunctionInfo objects and add them +/// - Finalize the GsymCreator object +/// - Save to file or section +/// +/// The first stage involves creating FunctionInfo objects from another source +/// of information like compiler debug info metadata, DWARF or Breakpad files. +/// Any strings in the FunctionInfo or contained information, like InlineInfo +/// or LineTable objects, should get the string table offsets by calling +/// GsymCreator::insertString(...). Any file indexes that are needed should be +/// obtained by calling GsymCreator::insertFile(...). All of the function calls +/// in GsymCreator are thread safe. This allows multiple threads to create and +/// add FunctionInfo objects while parsing debug information. +/// +/// Once all of the FunctionInfo objects have been added, the +/// GsymCreator::finalize(...) must be called prior to saving. This function +/// will sort the FunctionInfo objects, finalize the string table, and do any +/// other passes on the information needed to prepare the information to be +/// saved. +/// +/// Once the object has been finalized, it can be saved to a file or section. +/// +/// ENCODING +/// +/// GSYM files are designed to be memory mapped into a process as shared, read +/// only data, and used as is. +/// +/// The GSYM file format when in a stand alone file consists of: +/// - Header +/// - Address Table +/// - Function Info Offsets +/// - File Table +/// - String Table +/// - Function Info Data +/// +/// HEADER +/// +/// The header is fully described in "llvm/DebugInfo/GSYM/Header.h". +/// +/// ADDRESS TABLE +/// +/// The address table immediately follows the header in the file and consists +/// of Header.NumAddresses address offsets. These offsets are sorted and can be +/// binary searched for efficient lookups. Addresses in the address table are +/// stored as offsets from a 64 bit base address found in Header.BaseAddress. +/// This allows the address table to contain 8, 16, or 32 offsets. This allows +/// the address table to not require full 64 bit addresses for each address. +/// The resulting GSYM size is smaller and causes fewer pages to be touched +/// during address lookups when the address table is smaller. The size of the +/// address offsets in the address table is specified in the header in +/// Header.AddrOffSize. The first offset in the address table is alinged to +/// Header.AddrOffSize alignement to ensure efficient access when loaded into +/// memory. 
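The address-table lookup sketched in the comment above boils down to a binary search over base-relative offsets. A standalone sketch follows, assuming the 4-byte offset flavour (the real reader also handles the other offset widths and validates the match against the function's range); the resulting index selects the entry to read from the function info offsets table described next.

#include <algorithm>
#include <cstdint>
#include <vector>

// Sketch only: return the index of the last address-table entry whose address
// is <= Addr, or -1 if Addr precedes the table or cannot be encoded as a
// 4-byte offset from the base address.
static int64_t findAddressIndex(uint64_t BaseAddress,
                                const std::vector<uint32_t> &AddrOffsets,
                                uint64_t Addr) {
  if (Addr < BaseAddress || Addr - BaseAddress > UINT32_MAX)
    return -1;
  const uint32_t Delta = static_cast<uint32_t>(Addr - BaseAddress);
  auto It = std::upper_bound(AddrOffsets.begin(), AddrOffsets.end(), Delta);
  if (It == AddrOffsets.begin())
    return -1; // Addr is below the first entry.
  return static_cast<int64_t>(std::distance(AddrOffsets.begin(), It)) - 1;
}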
+/// +/// FUNCTION INFO OFFSETS TABLE +/// +/// The function info offsets table immediately follows the address table and +/// consists of Header.NumAddresses 32 bit file offsets: one for each address +/// in the address table. This data is algined to a 4 byte boundary. The +/// offsets in this table are the relative offsets from the start offset of the +/// GSYM header and point to the function info data for each address in the +/// address table. Keeping this data separate from the address table helps to +/// reduce the number of pages that are touched when address lookups occur on a +/// GSYM file. +/// +/// FILE TABLE +/// +/// The file table immediately follows the function info offsets table. The +/// encoding of the FileTable is: +/// +/// struct FileTable { +/// uint32_t Count; +/// FileEntry Files[]; +/// }; +/// +/// The file table starts with a 32 bit count of the number of files that are +/// used in all of the function info, followed by that number of FileEntry +/// structures. The file table is aligned to a 4 byte boundary, Each file in +/// the file table is represented with a FileEntry structure. +/// See "llvm/DebugInfo/GSYM/FileEntry.h" for details. +/// +/// STRING TABLE +/// +/// The string table follows the file table in stand alone GSYM files and +/// contains all strings for everything contained in the GSYM file. Any string +/// data should be added to the string table and any references to strings +/// inside GSYM information must be stored as 32 bit string table offsets into +/// this string table. The string table always starts with an empty string at +/// offset zero and is followed by any strings needed by the GSYM information. +/// The start of the string table is not aligned to any boundary. +/// +/// FUNCTION INFO DATA +/// +/// The function info data is the payload that contains information about the +/// address that is being looked up. It contains all of the encoded +/// FunctionInfo objects. Each encoded FunctionInfo's data is pointed to by an +/// entry in the Function Info Offsets Table. For details on the exact encoding +/// of FunctionInfo objects, see "llvm/DebugInfo/GSYM/FunctionInfo.h". +class GsymCreator { + // Private member variables require Mutex protections + mutable std::recursive_mutex Mutex; + std::vector Funcs; + StringTableBuilder StrTab; + DenseMap FileEntryToIndex; + std::vector Files; + std::vector UUID; + bool Finalized = false; + +public: + + GsymCreator(); + + /// Save a GSYM file to a stand alone file. + /// + /// \param Path The file path to save the GSYM file to. + /// \param ByteOrder The endianness to use when saving the file. + /// \returns An error object that indicates success or failure of the save. + llvm::Error save(StringRef Path, llvm::support::endianness ByteOrder) const; + + /// Encode a GSYM into the file writer stream at the current position. + /// + /// \param O The stream to save the binary data to + /// \returns An error object that indicates success or failure of the save. + llvm::Error encode(FileWriter &O) const; + + /// Insert a string into the GSYM string table. + /// + /// All strings used by GSYM files must be uniqued by adding them to this + /// string pool and using the returned offset for any string values. + /// + /// \param S The string to insert into the string table. + /// \returns The unique 32 bit offset into the string table. + uint32_t insertString(StringRef S); + + /// Insert a file into this GSYM creator. 
+ /// + /// Inserts a file by adding a FileEntry into the "Files" member variable if + /// the file has not already been added. The file path is split into + /// directory and filename which are both added to the string table. This + /// allows paths to be stored efficiently by reusing the directories that are + /// common between multiple files. + /// + /// \param Path The path to the file to insert. + /// \param Style The path style for the "Path" parameter. + /// \returns The unique file index for the inserted file. + uint32_t insertFile(StringRef Path, + sys::path::Style Style = sys::path::Style::native); + + /// Add a function info to this GSYM creator. + /// + /// All information in the FunctionInfo object must use the + /// GsymCreator::insertString(...) function when creating string table + /// offsets for names and other strings. + /// + /// \param FI The function info object to emplace into our functions list. + void addFunctionInfo(FunctionInfo &&FI); + + /// Finalize the data in the GSYM creator prior to saving the data out. + /// + /// Finalize must be called after all FunctionInfo objects have been added + /// and before GsymCreator::save() is called. + /// + /// \param OS Output stream to report duplicate function infos, overlapping + /// function infos, and function infos that were merged or removed. + /// \returns An error object that indicates success or failure of the + /// finalize. + llvm::Error finalize(llvm::raw_ostream &OS); + + /// Set the UUID value. + /// + /// \param UUIDBytes The new UUID bytes. + void setUUID(llvm::ArrayRef UUIDBytes) { + UUID.assign(UUIDBytes.begin(), UUIDBytes.end()); + } + + /// Thread safe iteration over all function infos. + /// + /// \param Callback A callback function that will get called with each + /// FunctionInfo. If the callback returns false, stop iterating. + void forEachFunctionInfo( + std::function const &Callback); + + /// Thread safe const iteration over all function infos. + /// + /// \param Callback A callback function that will get called with each + /// FunctionInfo. If the callback returns false, stop iterating. + void forEachFunctionInfo( + std::function const &Callback) const; + +}; + +} // namespace gsym +} // namespace llvm + +#endif // #ifndef LLVM_DEBUGINFO_GSYM_GSYMCREATOR_H diff --git a/include/llvm/DebugInfo/GSYM/GsymReader.h b/include/llvm/DebugInfo/GSYM/GsymReader.h new file mode 100644 index 000000000000..113bcee9c9a3 --- /dev/null +++ b/include/llvm/DebugInfo/GSYM/GsymReader.h @@ -0,0 +1,228 @@ +//===- GsymReader.h ---------------------------------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_DEBUGINFO_GSYM_GSYMREADER_H +#define LLVM_DEBUGINFO_GSYM_GSYMREADER_H + + +#include "llvm/ADT/ArrayRef.h" +#include "llvm/DebugInfo/GSYM/FileEntry.h" +#include "llvm/DebugInfo/GSYM/FunctionInfo.h" +#include "llvm/DebugInfo/GSYM/Header.h" +#include "llvm/DebugInfo/GSYM/LineEntry.h" +#include "llvm/DebugInfo/GSYM/StringTable.h" +#include "llvm/Support/DataExtractor.h" +#include "llvm/Support/Endian.h" +#include "llvm/Support/ErrorOr.h" + +#include +#include +#include +#include +#include + +namespace llvm { +class MemoryBuffer; +class raw_ostream; + +namespace gsym { + +/// GsymReader is used to read GSYM data from a file or buffer. 
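Before the reader internals, a small end-to-end sketch of the creator/reader API declared in this patch may help; it follows the three stages documented for GsymCreator above. The address range, symbol name, file path, output path and endianness choice are made-up example values, and error handling is reduced to propagating llvm::Error.

#include "llvm/DebugInfo/GSYM/FunctionInfo.h"
#include "llvm/DebugInfo/GSYM/GsymCreator.h"
#include "llvm/DebugInfo/GSYM/GsymReader.h"
#include "llvm/DebugInfo/GSYM/LineEntry.h"
#include "llvm/DebugInfo/GSYM/LineTable.h"
#include "llvm/Support/Endian.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/raw_ostream.h"

static llvm::Error buildAndReadBack() {
  using namespace llvm;
  using namespace llvm::gsym;

  GsymCreator GC;
  // Stage 1: create FunctionInfo objects. All strings go through
  // insertString() and all paths through insertFile().
  FunctionInfo FI;
  FI.Range = {0x1000, 0x1050};
  FI.Name = GC.insertString("main");
  const uint32_t FileIdx = GC.insertFile("/tmp/main.c");
  LineTable LT;
  LT.push(LineEntry(0x1000, FileIdx, 10)); // addr, file index, source line
  FI.OptLineTable = LT;
  GC.addFunctionInfo(std::move(FI));

  // Stage 2: finalize (sorts function infos, finalizes the string table).
  if (Error Err = GC.finalize(errs()))
    return Err;

  // Stage 3: save to a stand alone GSYM file.
  if (Error Err = GC.save("/tmp/a.out.gsym", support::endianness::native))
    return Err;

  // Read it back and look up an address inside "main".
  Expected<GsymReader> GR = GsymReader::openFile("/tmp/a.out.gsym");
  if (!GR)
    return GR.takeError();
  Expected<FunctionInfo> LookedUp = GR->getFunctionInfo(0x1010);
  if (!LookedUp)
    return LookedUp.takeError();
  return Error::success();
}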
+///
+/// This class is optimized for very quick lookups when the endianness matches
+/// the host system. The Header, address table, address info offsets, and file
+/// table are designed to be mmap'ed as read only into memory and used without
+/// any parsing needed. If the endianness doesn't match, we swap these objects
+/// and tables into GsymReader::SwappedData and then point our header and
+/// ArrayRefs to this swapped internal data.
+///
+/// GsymReader objects must use one of the static functions to create an
+/// instance: GsymReader::openFile(...) and GsymReader::copyBuffer(...).
+
+class GsymReader {
+  GsymReader(std::unique_ptr<MemoryBuffer> Buffer);
+  llvm::Error parse();
+
+  std::unique_ptr<MemoryBuffer> MemBuffer;
+  StringRef GsymBytes;
+  llvm::support::endianness Endian;
+  const Header *Hdr = nullptr;
+  ArrayRef<uint8_t> AddrOffsets;
+  ArrayRef<uint32_t> AddrInfoOffsets;
+  ArrayRef<FileEntry> Files;
+  StringTable StrTab;
+  /// When the GSYM file's endianness doesn't match the host system then
+  /// we must decode all data structures that need to be swapped into
+  /// local storage and point the ArrayRef objects above to these swapped
+  /// copies.
+  struct SwappedData {
+    Header Hdr;
+    std::vector<uint8_t> AddrOffsets;
+    std::vector<uint32_t> AddrInfoOffsets;
+    std::vector<FileEntry> Files;
+  };
+  std::unique_ptr<SwappedData> Swap;
+
+public:
+  GsymReader(GsymReader &&RHS);
+  ~GsymReader();
+
+  /// Construct a GsymReader from a file on disk.
+  ///
+  /// \param Path The file path of the GSYM file to read.
+  /// \returns An expected GsymReader that contains the object or an error
+  /// object that indicates reason for failing to read the GSYM.
+  static llvm::Expected<GsymReader> openFile(StringRef Path);
+
+  /// Construct a GsymReader from a buffer.
+  ///
+  /// \param Bytes A set of bytes that will be copied and owned by the
+  /// returned object on success.
+  /// \returns An expected GsymReader that contains the object or an error
+  /// object that indicates reason for failing to read the GSYM.
+  static llvm::Expected<GsymReader> copyBuffer(StringRef Bytes);
+
+  /// Access the GSYM header.
+  /// \returns A native endian version of the GSYM header.
+  const Header &getHeader() const;
+
+  /// Get the full function info for an address.
+  ///
+  /// \param Addr A virtual address from the original object file to lookup.
+  /// \returns An expected FunctionInfo that contains the function info object
+  /// or an error object that indicates reason for failing to lookup the
+  /// address.
+  llvm::Expected<FunctionInfo> getFunctionInfo(uint64_t Addr) const;
+
+  /// Get a string from the string table.
+  ///
+  /// \param Offset The string table offset for the string to retrieve.
+  /// \returns The string from the string table.
+  StringRef getString(uint32_t Offset) const { return StrTab[Offset]; }
+
+protected:
+  /// Gets an address from the address table.
+  ///
+  /// Addresses are stored as offsets from the gsym::Header::BaseAddress.
+  ///
+  /// \param Index An index into the address table.
+  /// \returns A resolved virtual address for the address in the address table
+  /// or llvm::None if Index is out of bounds.
+  Optional<uint64_t> getAddress(size_t Index) const;
+
+  /// Get the file entry for the supplied file index.
+  ///
+  /// Used to convert any file indexes in the FunctionInfo data back into
+  /// files. This function can be used for iteration, but is more commonly used
+  /// for random access when doing lookups.
+  ///
+  /// \param Index An index into the file table.
+ /// \returns An optional FileInfo that will be valid if the file index is + /// valid, or llvm::None if the file index is out of bounds, + Optional getFile(uint32_t Index) const { + if (Index < Files.size()) + return Files[Index]; + return llvm::None; + } + + /// Get an appropriate address info offsets array. + /// + /// The address table in the GSYM file is stored as array of 1, 2, 4 or 8 + /// byte offsets from the The gsym::Header::BaseAddress. The table is stored + /// internally as a array of bytes that are in the correct endianness. When + /// we access this table we must get an array that matches those sizes. This + /// templatized helper function is used when accessing address offsets in the + /// AddrOffsets member variable. + /// + /// \returns An ArrayRef of an appropriate address offset size. + template ArrayRef + getAddrOffsets() const { + return ArrayRef(reinterpret_cast(AddrOffsets.data()), + AddrOffsets.size()/sizeof(T)); + } + + /// Get an appropriate address from the address table. + /// + /// The address table in the GSYM file is stored as array of 1, 2, 4 or 8 + /// byte address offsets from the The gsym::Header::BaseAddress. The table is + /// stored internally as a array of bytes that are in the correct endianness. + /// In order to extract an address from the address table we must access the + /// address offset using the correct size and then add it to the BaseAddress + /// in the header. + /// + /// \param Index An index into the AddrOffsets array. + /// \returns An virtual address that matches the original object file for the + /// address as the specified index, or llvm::None if Index is out of bounds. + template Optional + addressForIndex(size_t Index) const { + ArrayRef AIO = getAddrOffsets(); + if (Index < AIO.size()) + return AIO[Index] + Hdr->BaseAddress; + return llvm::None; + } + /// Lookup an address offset in the AddrOffsets table. + /// + /// Given an address offset, look it up using a binary search of the + /// AddrOffsets table. + /// + /// \param AddrOffset An address offset, that has already been computed by + /// subtracting the gsym::Header::BaseAddress. + /// \returns The matching address offset index. This index will be used to + /// extract the FunctionInfo data's offset from the AddrInfoOffsets array. + template + uint64_t getAddressOffsetIndex(const uint64_t AddrOffset) const { + ArrayRef AIO = getAddrOffsets(); + const auto Begin = AIO.begin(); + const auto End = AIO.end(); + auto Iter = std::lower_bound(Begin, End, AddrOffset); + if (Iter == End || AddrOffset < *Iter) + --Iter; + return std::distance(Begin, Iter); + } + + /// Create a GSYM from a memory buffer. + /// + /// Called by both openFile() and copyBuffer(), this function does all of the + /// work of parsing the GSYM file and returning an error. + /// + /// \param MemBuffer A memory buffer that will transfer ownership into the + /// GsymReader. + /// \returns An expected GsymReader that contains the object or an error + /// object that indicates reason for failing to read the GSYM. + static llvm::Expected + create(std::unique_ptr &MemBuffer); + + + /// Given an address, find the address index. + /// + /// Binary search the address table and find the matching address index. + /// + /// \param Addr A virtual address that matches the original object file + /// to lookup. + /// \returns An index into the address table. This index can be used to + /// extract the FunctionInfo data's offset from the AddrInfoOffsets array. 
+ /// Returns an error if the address isn't in the GSYM with details of why. + Expected getAddressIndex(const uint64_t Addr) const; + + /// Given an address index, get the offset for the FunctionInfo. + /// + /// Looking up an address is done by finding the corresponding address + /// index for the address. This index is then used to get the offset of the + /// FunctionInfo data that we will decode using this function. + /// + /// \param Index An index into the address table. + /// \returns An optional GSYM data offset for the offset of the FunctionInfo + /// that needs to be decoded. + Optional getAddressInfoOffset(size_t Index) const; +}; + +} // namespace gsym +} // namespace llvm + +#endif // #ifndef LLVM_DEBUGINFO_GSYM_GSYMREADER_H diff --git a/include/llvm/DebugInfo/GSYM/Header.h b/include/llvm/DebugInfo/GSYM/Header.h new file mode 100644 index 000000000000..6652c59c97a6 --- /dev/null +++ b/include/llvm/DebugInfo/GSYM/Header.h @@ -0,0 +1,129 @@ +//===- Header.h -------------------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_DEBUGINFO_GSYM_HEADER_H +#define LLVM_DEBUGINFO_GSYM_HEADER_H + +#include "llvm/Support/Error.h" + +#include +#include + +namespace llvm { +class raw_ostream; +class DataExtractor; + +namespace gsym { +class FileWriter; + +constexpr uint32_t GSYM_MAGIC = 0x4753594d; // 'GSYM' +constexpr uint32_t GSYM_CIGAM = 0x4d595347; // 'MYSG' +constexpr uint32_t GSYM_VERSION = 1; +constexpr size_t GSYM_MAX_UUID_SIZE = 20; + +/// The GSYM header. +/// +/// The GSYM header is found at the start of a stand alone GSYM file, or as +/// the first bytes in a section when GSYM is contained in a section of an +/// executable file (ELF, mach-o, COFF). +/// +/// The structure is encoded exactly as it appears in the structure definition +/// with no gaps between members. Alignment should not change from system to +/// system as the members were laid out so that they shouldn't align +/// differently on different architectures. +/// +/// When endianness of the system loading a GSYM file matches, the file can +/// be mmap'ed in and a pointer to the header can be cast to the first bytes +/// of the file (stand alone GSYM file) or section data (GSYM in a section). +/// When endianness is swapped, the Header::decode() function should be used to +/// decode the header. +struct Header { + /// The magic bytes should be set to GSYM_MAGIC. This helps detect if a file + /// is a GSYM file by scanning the first 4 bytes of a file or section. + /// This value might appear byte swapped + uint32_t Magic; + /// The version can number determines how the header is decoded and how each + /// InfoType in FunctionInfo is encoded/decoded. As version numbers increase, + /// "Magic" and "Version" members should always appear at offset zero and 4 + /// respectively to ensure clients figure out if they can parse the format. + uint16_t Version; + /// The size in bytes of each address offset in the address offsets table. + uint8_t AddrOffSize; + /// The size in bytes of the UUID encoded in the "UUID" member. + uint8_t UUIDSize; + /// The 64 bit base address that all address offsets in the address offsets + /// table are relative to. 
Storing a full 64 bit address allows our address
+  /// offsets table to be smaller on disk.
+  uint64_t BaseAddress;
+  /// The number of addresses stored in the address offsets table.
+  uint32_t NumAddresses;
+  /// The file relative offset of the start of the string table for strings
+  /// contained in the GSYM file. If the GSYM is contained in a stand alone
+  /// file this will be the file offset of the start of the string table. If
+  /// the GSYM is contained in a section within an executable file, this can
+  /// be the offset of the first string used in the GSYM file and can possibly
+  /// span one or more executable string tables. This allows the strings to
+  /// share string tables in an ELF or mach-o file.
+  uint32_t StrtabOffset;
+  /// The size in bytes of the string table. For a stand alone GSYM file, this
+  /// will be the exact size in bytes of the string table. When the GSYM data
+  /// is in a section within an executable file, this size can span one or more
+  /// sections that contain strings. This allows any strings that are already
+  /// stored in the executable file to be re-used, and any extra strings could
+  /// be added to another string table and the string table offset and size
+  /// can be set to span all needed string tables.
+  uint32_t StrtabSize;
+  /// The UUID of the original executable file. This is stored to allow
+  /// matching a GSYM file to an executable file when symbolication is
+  /// required. Only the first "UUIDSize" bytes of the UUID are valid. Any
+  /// bytes in the UUID value that appear after the first UUIDSize bytes should
+  /// be set to zero.
+  uint8_t UUID[GSYM_MAX_UUID_SIZE];
+
+  /// Check if a header is valid and return an error if anything is wrong.
+  ///
+  /// This function can be used prior to encoding a header to ensure it is
+  /// valid, or after decoding a header to ensure it is valid and supported.
+  ///
+  /// Check a correctly byte swapped header for errors:
+  ///   - check magic value
+  ///   - check that version number is supported
+  ///   - check that the address offset size is supported
+  ///   - check that the UUID size is valid
+  ///
+  /// \returns An error if anything is wrong in the header, or Error::success()
+  /// if there are no errors.
+  llvm::Error checkForError() const;
+
+  /// Decode an object from a binary data stream.
+  ///
+  /// \param Data The binary stream to read the data from. This object must
+  /// have the data for the object starting at offset zero. The data
+  /// can contain more data than needed.
+  ///
+  /// \returns A Header or an error describing the issue that was
+  /// encountered during decoding.
+  static llvm::Expected<Header>
decode(DataExtractor &Data); + + /// Encode this object into FileWriter stream. + /// + /// \param O The binary stream to write the data to at the current file + /// position. + /// + /// \returns An error object that indicates success or failure of the + /// encoding process. + llvm::Error encode(FileWriter &O) const; +}; + +bool operator==(const Header &LHS, const Header &RHS); +raw_ostream &operator<<(raw_ostream &OS, const llvm::gsym::Header &H); + +} // namespace gsym +} // namespace llvm + +#endif // #ifndef LLVM_DEBUGINFO_GSYM_HEADER_H diff --git a/include/llvm/DebugInfo/GSYM/InlineInfo.h b/include/llvm/DebugInfo/GSYM/InlineInfo.h index 222430622932..48fd9a7c1308 100644 --- a/include/llvm/DebugInfo/GSYM/InlineInfo.h +++ b/include/llvm/DebugInfo/GSYM/InlineInfo.h @@ -1,9 +1,8 @@ //===- InlineInfo.h ---------------------------------------------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// @@ -12,6 +11,7 @@ #include "llvm/ADT/Optional.h" #include "llvm/DebugInfo/GSYM/Range.h" +#include "llvm/Support/Error.h" #include #include @@ -31,6 +31,30 @@ namespace gsym { /// Any clients that encode information will need to ensure the ranges are /// all contined correctly or lookups could fail. Add ranges in these objects /// must be contained in the top level FunctionInfo address ranges as well. +/// +/// ENCODING +/// +/// When saved to disk, the inline info encodes all ranges to be relative to +/// a parent address range. This will be the FunctionInfo's start address if +/// the InlineInfo is directly contained in a FunctionInfo, or a the start +/// address of the containing parent InlineInfo's first "Ranges" member. This +/// allows address ranges to be efficiently encoded using ULEB128 encodings as +/// we encode the offset and size of each range instead of full addresses. This +/// also makes any encoded addresses easy to relocate as we just need to +/// relocate the FunctionInfo's start address. +/// +/// - The AddressRanges member "Ranges" is encoded using an approriate base +/// address as described above. +/// - UINT8 boolean value that specifies if the InlineInfo object has children. +/// - UINT32 string table offset that points to the name of the inline +/// function. +/// - ULEB128 integer that specifies the file of the call site that called +/// this function. +/// - ULEB128 integer that specifies the source line of the call site that +/// called this function. +/// - if this object has children, enocode each child InlineInfo using the +/// the first address range's start address as the base address. +/// struct InlineInfo { uint32_t Name; ///< String table offset in the string table. @@ -62,6 +86,37 @@ struct InlineInfo { /// \returns optional vector of InlineInfo objects that describe the /// inline call stack for a given address, false otherwise. llvm::Optional getInlineStack(uint64_t Addr) const; + + /// Decode an InlineInfo object from a binary data stream. + /// + /// \param Data The binary stream to read the data from. This object must + /// have the data for the InlineInfo object starting at offset zero. The data + /// can contain more data than needed. 
+ /// + /// \param BaseAddr The base address to use when decoding all address ranges. + /// This will be the FunctionInfo's start address if this object is directly + /// contained in a FunctionInfo object, or the start address of the first + /// address range in an InlineInfo object of this object is a child of + /// another InlineInfo object. + /// \returns An InlineInfo or an error describing the issue that was + /// encountered during decoding. + static llvm::Expected decode(DataExtractor &Data, + uint64_t BaseAddr); + + /// Encode this InlineInfo object into FileWriter stream. + /// + /// \param O The binary stream to write the data to at the current file + /// position. + /// + /// \param BaseAddr The base address to use when encoding all address ranges. + /// This will be the FunctionInfo's start address if this object is directly + /// contained in a FunctionInfo object, or the start address of the first + /// address range in an InlineInfo object of this object is a child of + /// another InlineInfo object. + /// + /// \returns An error object that indicates success or failure or the + /// encoding process. + llvm::Error encode(FileWriter &O, uint64_t BaseAddr) const; }; inline bool operator==(const InlineInfo &LHS, const InlineInfo &RHS) { diff --git a/include/llvm/DebugInfo/GSYM/LineEntry.h b/include/llvm/DebugInfo/GSYM/LineEntry.h index 6b9380940bd3..aac7c48e067e 100644 --- a/include/llvm/DebugInfo/GSYM/LineEntry.h +++ b/include/llvm/DebugInfo/GSYM/LineEntry.h @@ -1,9 +1,8 @@ //===- LineEntry.h ----------------------------------------------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/include/llvm/DebugInfo/GSYM/LineTable.h b/include/llvm/DebugInfo/GSYM/LineTable.h new file mode 100644 index 000000000000..3cdbccb08ced --- /dev/null +++ b/include/llvm/DebugInfo/GSYM/LineTable.h @@ -0,0 +1,198 @@ +//===- LineTable.h ----------------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_DEBUGINFO_GSYM_LINETABLE_H +#define LLVM_DEBUGINFO_GSYM_LINETABLE_H + +#include "llvm/DebugInfo/GSYM/LineEntry.h" +#include "llvm/Support/Error.h" +#include +#include + +namespace llvm { +namespace gsym { + +struct FunctionInfo; +class FileWriter; + +/// LineTable class contains deserialized versions of line tables for each +/// function's address ranges. +/// +/// When saved to disk, the line table is encoded using a modified version of +/// the DWARF line tables that only tracks address to source file and line. +/// +/// ENCODING +/// +/// The line table starts with a small prolog that contains the following +/// values: +/// +/// ENCODING NAME DESCRIPTION +/// ======== =========== ==================================================== +/// SLEB MinDelta The min line delta for special opcodes that advance +/// the address and line number. 
+/// SLEB MaxDelta The max line delta for single byte opcodes that +/// advance the address and line number. +/// ULEB FirstLine The value of the first source line number to +/// initialize the LineEntry with. +/// +/// Once these prolog items are read, we initialize a LineEntry struct with +/// the start address of the function from the FunctionInfo's address range, +/// a default file index of 1, and the line number set to "FirstLine" from +/// the prolog above: +/// +/// LineEntry Row(BaseAddr, 1, FirstLine); +/// +/// The line table state machine is now initialized and ready to be parsed. +/// The stream that follows this encodes the line entries in a compact +/// form. Some opcodes cause "Row" to be modified and some opcodes may also +/// push "Row" onto the end of the "LineTable.Lines" vector. The end result +/// is a vector of LineEntry structs that is sorted in ascending address +/// order. +/// +/// NORMAL OPCODES +/// +/// The opcodes 0 through 3 are normal in opcodes. Their encoding and +/// descriptions are listed below: +/// +/// ENCODING ENUMERATION VALUE DESCRIPTION +/// ======== ================ ===== ======================================== +/// LTOC_EndSequence 0x00 Parsing is done. +/// ULEB LTOC_SetFile 0x01 Row.File = ULEB +/// ULEB LTOC_AdvancePC 0x02 Row.Addr += ULEB, push "Row". +/// SLEB LTOC_AdvanceLine 0x03 Row.Line += SLEB +/// LTOC_FirstSpecial 0x04 First special opcode (see SPECIAL +/// OPCODES below). +/// +/// SPECIAL OPCODES +/// +/// Opcodes LTOC_FirstSpecial through 255 are special opcodes that always +/// increment both the Row.Addr and Row.Line and push "Row" onto the +/// LineEntry.Lines array. They do this by using some of the bits to +/// increment/decrement the source line number, and some of the bits to +/// increment the address. Line numbers can go up or down when making line +/// tables, where addresses always only increase since line tables are sorted +/// by address. +/// +/// In order to calculate the amount to increment the line and address for +/// these special opcodes, we calculate the number of values reserved for the +/// line increment/decrement using the "MinDelta" and "MaxDelta" from the +/// prolog: +/// +/// const int64_t LineRange = MaxDelta - MinDelta + 1; +/// +/// Then we can adjust the opcode to not include any of the normal opcodes: +/// +/// const uint8_t AdjustedOp = Opcode - LTOC_FirstSpecial; +/// +/// And we can calculate the line offset, and address offset: +/// +/// const int64_t LineDelta = MinDelta + (AdjustedOp % LineRange); +/// const uint64_t AddrDelta = (AdjustedOp / LineRange); +/// +/// And use these to modify our "Row": +/// +/// Row.Line += LineDelta; +/// Row.Addr += AddrDelta; +/// +/// And push a row onto the line table: +/// +/// Lines.push_back(Row); +/// +/// This is verify similar to the way that DWARF encodes its line tables. The +/// only difference is the DWARF line tables have more normal opcodes and the +/// "Row" contains more members, like source column number, bools for end of +/// prologue, beginnging of epilogue, is statement and many others. There are +/// also more complex rules that happen for the extra normal opcodes. By +/// leaving these extra opcodes out, we leave more bits for the special +/// opcodes that allows us to encode line tables in fewer bytes than standard +/// DWARF encodings. +/// +/// Opcodes that will push "Row" onto the LineEntry.Lines include the +/// LTOC_AdvancePC opcode and all special opcodes. 
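As a worked example of the special opcode math described above (the MinDelta and MaxDelta values here are invented for illustration; a real table reads them from the SLEB prolog):

#include <cstdint>

// Worked decode of one special opcode, following the formulas above.
struct ExampleRow { uint64_t Addr; uint32_t File; uint32_t Line; };

static void applySpecialOpcode(ExampleRow &Row, uint8_t Opcode,
                               int64_t MinDelta, int64_t MaxDelta) {
  const uint8_t LTOC_FirstSpecial = 0x04;
  const int64_t LineRange = MaxDelta - MinDelta + 1;
  const uint8_t AdjustedOp = Opcode - LTOC_FirstSpecial;
  const int64_t LineDelta = MinDelta + (AdjustedOp % LineRange);
  const uint64_t AddrDelta = AdjustedOp / LineRange;
  Row.Line = static_cast<uint32_t>(int64_t(Row.Line) + LineDelta);
  Row.Addr += AddrDelta;
  // Example: with MinDelta = -4 and MaxDelta = 10, LineRange is 15.
  // Opcode 0x33 gives AdjustedOp = 47, so LineDelta = -4 + (47 % 15) = -2
  // and AddrDelta = 47 / 15 = 3: the row advances 3 address units, moves
  // back 2 source lines, and is then pushed onto the line table.
}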
All other opcodes +/// only modify the current "Row", or cause the line table to end. +class LineTable { + typedef std::vector Collection; + Collection Lines; ///< All line entries in the line table. +public: + static LineEntry lookup(DataExtractor &Data, uint64_t BaseAddr, + uint64_t Addr); + + /// Decode an LineTable object from a binary data stream. + /// + /// \param Data The binary stream to read the data from. This object must + /// have the data for the LineTable object starting at offset zero. The data + /// can contain more data than needed. + /// + /// \param BaseAddr The base address to use when decoding the line table. + /// This will be the FunctionInfo's start address and will be used to + /// initialize the line table row prior to parsing any opcodes. + /// + /// \returns An LineTable or an error describing the issue that was + /// encountered during decoding. + static llvm::Expected decode(DataExtractor &Data, + uint64_t BaseAddr); + /// Encode this LineTable object into FileWriter stream. + /// + /// \param O The binary stream to write the data to at the current file + /// position. + /// + /// \param BaseAddr The base address to use when decoding the line table. + /// This will be the FunctionInfo's start address. + /// + /// \returns An error object that indicates success or failure or the + /// encoding process. + llvm::Error encode(FileWriter &O, uint64_t BaseAddr) const; + bool empty() const { return Lines.empty(); } + void clear() { Lines.clear(); } + void push(const LineEntry &LE) { + Lines.push_back(LE); + } + size_t isValid() const { + return !Lines.empty(); + } + size_t size() const { + return Lines.size(); + } + LineEntry &get(size_t i) { + assert(i < Lines.size()); + return Lines[i]; + } + const LineEntry &get(size_t i) const { + assert(i < Lines.size()); + return Lines[i]; + } + LineEntry &operator[](size_t i) { + return get(i); + } + const LineEntry &operator[](size_t i) const { + return get(i); + } + bool operator==(const LineTable &RHS) const { + return Lines == RHS.Lines; + } + bool operator!=(const LineTable &RHS) const { + return Lines != RHS.Lines; + } + bool operator<(const LineTable &RHS) const { + const auto LHSSize = Lines.size(); + const auto RHSSize = RHS.Lines.size(); + if (LHSSize == RHSSize) + return Lines < RHS.Lines; + return LHSSize < RHSSize; + } + Collection::const_iterator begin() const { return Lines.begin(); } + Collection::const_iterator end() const { return Lines.end(); } + +}; + +raw_ostream &operator<<(raw_ostream &OS, const gsym::LineTable <); + +} // namespace gsym +} // namespace llvm + +#endif // #ifndef LLVM_DEBUGINFO_GSYM_LINETABLE_H diff --git a/include/llvm/DebugInfo/GSYM/Range.h b/include/llvm/DebugInfo/GSYM/Range.h index 772ff244c5b7..37cfec713f26 100644 --- a/include/llvm/DebugInfo/GSYM/Range.h +++ b/include/llvm/DebugInfo/GSYM/Range.h @@ -1,9 +1,8 @@ -//===- AddressRange.h -------------------------------------------*- C++ -*-===// +//===- Range.h --------------------------------------------------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// @@ -21,10 +20,13 @@ #define HEX64(v) llvm::format_hex(v, 18) namespace llvm { +class DataExtractor; class raw_ostream; namespace gsym { +class FileWriter; + /// A class that represents an address range. The range is specified using /// a start and an end address. struct AddressRange { @@ -47,6 +49,18 @@ struct AddressRange { bool operator<(const AddressRange &R) const { return std::make_pair(Start, End) < std::make_pair(R.Start, R.End); } + /// AddressRange objects are encoded and decoded to be relative to a base + /// address. This will be the FunctionInfo's start address if the AddressRange + /// is directly contained in a FunctionInfo, or a base address of the + /// containing parent AddressRange or AddressRanges. This allows address + /// ranges to be efficiently encoded using ULEB128 encodings as we encode the + /// offset and size of each range instead of full addresses. This also makes + /// encoded addresses easy to relocate as we just need to relocate one base + /// address. + /// @{ + void decode(DataExtractor &Data, uint64_t BaseAddr, uint64_t &Offset); + void encode(FileWriter &O, uint64_t BaseAddr) const; + /// @} }; raw_ostream &operator<<(raw_ostream &OS, const AddressRange &R); @@ -66,6 +80,7 @@ public: void clear() { Ranges.clear(); } bool empty() const { return Ranges.empty(); } bool contains(uint64_t Addr) const; + bool contains(AddressRange Range) const; void insert(AddressRange Range); size_t size() const { return Ranges.size(); } bool operator==(const AddressRanges &RHS) const { @@ -77,6 +92,14 @@ public: } Collection::const_iterator begin() const { return Ranges.begin(); } Collection::const_iterator end() const { return Ranges.end(); } + + /// Address ranges are decoded and encoded to be relative to a base address. + /// See the AddressRange comment for the encode and decode methods for full + /// details. + /// @{ + void decode(DataExtractor &Data, uint64_t BaseAddr, uint64_t &Offset); + void encode(FileWriter &O, uint64_t BaseAddr) const; + /// @} }; raw_ostream &operator<<(raw_ostream &OS, const AddressRanges &AR); diff --git a/include/llvm/DebugInfo/GSYM/StringTable.h b/include/llvm/DebugInfo/GSYM/StringTable.h index 0001b8b82743..a96ae5899da3 100644 --- a/include/llvm/DebugInfo/GSYM/StringTable.h +++ b/include/llvm/DebugInfo/GSYM/StringTable.h @@ -1,9 +1,8 @@ //===- StringTable.h --------------------------------------------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/include/llvm/DebugInfo/PDB/GenericError.h b/include/llvm/DebugInfo/PDB/GenericError.h index ec85d92d2a92..af93be931b8e 100644 --- a/include/llvm/DebugInfo/PDB/GenericError.h +++ b/include/llvm/DebugInfo/PDB/GenericError.h @@ -20,7 +20,7 @@ enum class pdb_error_code { dia_sdk_not_present, dia_failed_loading, signature_out_of_date, - external_cmdline_ref, + no_matching_pch, unspecified, }; } // namespace pdb diff --git a/include/llvm/DebugInfo/PDB/Native/SymbolCache.h b/include/llvm/DebugInfo/PDB/Native/SymbolCache.h index 0b15ab474f71..4adf3b394c2e 100644 --- a/include/llvm/DebugInfo/PDB/Native/SymbolCache.h +++ b/include/llvm/DebugInfo/PDB/Native/SymbolCache.h @@ -87,7 +87,7 @@ public: // Initial construction must not access the cache, since it must be done // atomically. - auto Result = llvm::make_unique( + auto Result = std::make_unique( Session, Id, std::forward(ConstructorArgs)...); Result->SymbolId = Id; diff --git a/include/llvm/DebugInfo/PDB/PDBSymbol.h b/include/llvm/DebugInfo/PDB/PDBSymbol.h index d9004a8894d9..0d95a2467556 100644 --- a/include/llvm/DebugInfo/PDB/PDBSymbol.h +++ b/include/llvm/DebugInfo/PDB/PDBSymbol.h @@ -131,7 +131,7 @@ public: auto BaseIter = RawSymbol->findChildren(T::Tag); if (!BaseIter) return nullptr; - return llvm::make_unique>(std::move(BaseIter)); + return std::make_unique>(std::move(BaseIter)); } std::unique_ptr findAllChildren(PDB_SymType Type) const; std::unique_ptr findAllChildren() const; diff --git a/include/llvm/DebugInfo/Symbolize/Symbolize.h b/include/llvm/DebugInfo/Symbolize/Symbolize.h index d3da28ca0b7b..11599fc1797d 100644 --- a/include/llvm/DebugInfo/Symbolize/Symbolize.h +++ b/include/llvm/DebugInfo/Symbolize/Symbolize.h @@ -39,6 +39,7 @@ public: bool UseSymbolTable = true; bool Demangle = true; bool RelativeAddresses = false; + bool UntagAddresses = false; std::string DefaultArch; std::vector DsymHints; std::string FallbackDebugPath; diff --git a/include/llvm/Demangle/Demangle.h b/include/llvm/Demangle/Demangle.h index 6fea7ef13f11..7b85b9a9ccf7 100644 --- a/include/llvm/Demangle/Demangle.h +++ b/include/llvm/Demangle/Demangle.h @@ -32,7 +32,14 @@ char *itaniumDemangle(const char *mangled_name, char *buf, size_t *n, int *status); -enum MSDemangleFlags { MSDF_None = 0, MSDF_DumpBackrefs = 1 << 0 }; +enum MSDemangleFlags { + MSDF_None = 0, + MSDF_DumpBackrefs = 1 << 0, + MSDF_NoAccessSpecifier = 1 << 1, + MSDF_NoCallingConvention = 1 << 2, + MSDF_NoReturnType = 1 << 3, + MSDF_NoMemberType = 1 << 4, +}; char *microsoftDemangle(const char *mangled_name, char *buf, size_t *n, int *status, MSDemangleFlags Flags = MSDF_None); diff --git a/include/llvm/Demangle/DemangleConfig.h b/include/llvm/Demangle/DemangleConfig.h index 73f89d357c85..b7b7dbd24c7f 100644 --- a/include/llvm/Demangle/DemangleConfig.h +++ b/include/llvm/Demangle/DemangleConfig.h @@ -15,13 +15,6 @@ #ifndef LLVM_DEMANGLE_COMPILER_H #define LLVM_DEMANGLE_COMPILER_H -#ifdef _MSC_VER -// snprintf is implemented in VS 2015 -#if _MSC_VER < 1900 -#define snprintf _snprintf_s -#endif -#endif - #ifndef __has_feature #define __has_feature(x) 0 #endif diff --git a/include/llvm/Demangle/ItaniumDemangle.h b/include/llvm/Demangle/ItaniumDemangle.h index aaccb27e17a3..7784e842bfeb 100644 --- a/include/llvm/Demangle/ItaniumDemangle.h +++ b/include/llvm/Demangle/ItaniumDemangle.h @@ -57,6 +57,11 @@ X(LocalName) \ X(VectorType) \ 
X(PixelVectorType) \ + X(SyntheticTemplateParamName) \ + X(TypeTemplateParamDecl) \ + X(NonTypeTemplateParamDecl) \ + X(TemplateTemplateParamDecl) \ + X(TemplateParamPackDecl) \ X(ParameterPack) \ X(TemplateArgumentPack) \ X(ParameterPackExpansion) \ @@ -91,6 +96,8 @@ X(ThrowExpr) \ X(UUIDOfExpr) \ X(BoolExpr) \ + X(StringLiteral) \ + X(LambdaExpr) \ X(IntegerCastExpr) \ X(IntegerLiteral) \ X(FloatLiteral) \ @@ -303,7 +310,7 @@ inline Qualifiers operator|=(Qualifiers &Q1, Qualifiers Q2) { return Q1 = static_cast(Q1 | Q2); } -class QualType : public Node { +class QualType final : public Node { protected: const Qualifiers Quals; const Node *Child; @@ -964,6 +971,127 @@ public: } }; +enum class TemplateParamKind { Type, NonType, Template }; + +/// An invented name for a template parameter for which we don't have a +/// corresponding template argument. +/// +/// This node is created when parsing the for a lambda with +/// explicit template arguments, which might be referenced in the parameter +/// types appearing later in the . +class SyntheticTemplateParamName final : public Node { + TemplateParamKind Kind; + unsigned Index; + +public: + SyntheticTemplateParamName(TemplateParamKind Kind_, unsigned Index_) + : Node(KSyntheticTemplateParamName), Kind(Kind_), Index(Index_) {} + + template void match(Fn F) const { F(Kind, Index); } + + void printLeft(OutputStream &S) const override { + switch (Kind) { + case TemplateParamKind::Type: + S += "$T"; + break; + case TemplateParamKind::NonType: + S += "$N"; + break; + case TemplateParamKind::Template: + S += "$TT"; + break; + } + if (Index > 0) + S << Index - 1; + } +}; + +/// A template type parameter declaration, 'typename T'. +class TypeTemplateParamDecl final : public Node { + Node *Name; + +public: + TypeTemplateParamDecl(Node *Name_) + : Node(KTypeTemplateParamDecl, Cache::Yes), Name(Name_) {} + + template void match(Fn F) const { F(Name); } + + void printLeft(OutputStream &S) const override { + S += "typename "; + } + + void printRight(OutputStream &S) const override { + Name->print(S); + } +}; + +/// A non-type template parameter declaration, 'int N'. +class NonTypeTemplateParamDecl final : public Node { + Node *Name; + Node *Type; + +public: + NonTypeTemplateParamDecl(Node *Name_, Node *Type_) + : Node(KNonTypeTemplateParamDecl, Cache::Yes), Name(Name_), Type(Type_) {} + + template void match(Fn F) const { F(Name, Type); } + + void printLeft(OutputStream &S) const override { + Type->printLeft(S); + if (!Type->hasRHSComponent(S)) + S += " "; + } + + void printRight(OutputStream &S) const override { + Name->print(S); + Type->printRight(S); + } +}; + +/// A template template parameter declaration, +/// 'template typename N'. +class TemplateTemplateParamDecl final : public Node { + Node *Name; + NodeArray Params; + +public: + TemplateTemplateParamDecl(Node *Name_, NodeArray Params_) + : Node(KTemplateTemplateParamDecl, Cache::Yes), Name(Name_), + Params(Params_) {} + + template void match(Fn F) const { F(Name, Params); } + + void printLeft(OutputStream &S) const override { + S += "template<"; + Params.printWithComma(S); + S += "> typename "; + } + + void printRight(OutputStream &S) const override { + Name->print(S); + } +}; + +/// A template parameter pack declaration, 'typename ...T'. 
+class TemplateParamPackDecl final : public Node { + Node *Param; + +public: + TemplateParamPackDecl(Node *Param_) + : Node(KTemplateParamPackDecl, Cache::Yes), Param(Param_) {} + + template void match(Fn F) const { F(Param); } + + void printLeft(OutputStream &S) const override { + Param->printLeft(S); + S += "..."; + } + + void printRight(OutputStream &S) const override { + Param->printRight(S); + } +}; + /// An unexpanded parameter pack (either in the expression or type context). If /// this AST is correct, this node will have a ParameterPackExpansion node above /// it. @@ -1410,21 +1538,36 @@ public: }; class ClosureTypeName : public Node { + NodeArray TemplateParams; NodeArray Params; StringView Count; public: - ClosureTypeName(NodeArray Params_, StringView Count_) - : Node(KClosureTypeName), Params(Params_), Count(Count_) {} + ClosureTypeName(NodeArray TemplateParams_, NodeArray Params_, + StringView Count_) + : Node(KClosureTypeName), TemplateParams(TemplateParams_), + Params(Params_), Count(Count_) {} + + template void match(Fn F) const { + F(TemplateParams, Params, Count); + } - template void match(Fn F) const { F(Params, Count); } + void printDeclarator(OutputStream &S) const { + if (!TemplateParams.empty()) { + S += "<"; + TemplateParams.printWithComma(S); + S += ">"; + } + S += "("; + Params.printWithComma(S); + S += ")"; + } void printLeft(OutputStream &S) const override { S += "\'lambda"; S += Count; - S += "\'("; - Params.printWithComma(S); - S += ")"; + S += "\'"; + printDeclarator(S); } }; @@ -1902,6 +2045,37 @@ public: } }; +class StringLiteral : public Node { + const Node *Type; + +public: + StringLiteral(const Node *Type_) : Node(KStringLiteral), Type(Type_) {} + + template void match(Fn F) const { F(Type); } + + void printLeft(OutputStream &S) const override { + S += "\"<"; + Type->print(S); + S += ">\""; + } +}; + +class LambdaExpr : public Node { + const Node *Type; + +public: + LambdaExpr(const Node *Type_) : Node(KLambdaExpr), Type(Type_) {} + + template void match(Fn F) const { F(Type); } + + void printLeft(OutputStream &S) const override { + S += "[]"; + if (Type->getKind() == KClosureTypeName) + static_cast(Type)->printDeclarator(S); + S += "{...}"; + } +}; + class IntegerCastExpr : public Node { // ty(integer) const Node *Ty; @@ -2167,10 +2341,36 @@ template struct AbstractManglingParser { // table. PODSmallVector Subs; + using TemplateParamList = PODSmallVector; + + class ScopedTemplateParamList { + AbstractManglingParser *Parser; + size_t OldNumTemplateParamLists; + TemplateParamList Params; + + public: + ScopedTemplateParamList(AbstractManglingParser *Parser) + : Parser(Parser), + OldNumTemplateParamLists(Parser->TemplateParams.size()) { + Parser->TemplateParams.push_back(&Params); + } + ~ScopedTemplateParamList() { + assert(Parser->TemplateParams.size() >= OldNumTemplateParamLists); + Parser->TemplateParams.dropBack(OldNumTemplateParamLists); + } + }; + // Template parameter table. Like the above, but referenced like "T42_". // This has a smaller size compared to Subs and Names because it can be // stored on the stack. - PODSmallVector TemplateParams; + TemplateParamList OuterTemplateParams; + + // Lists of template parameters indexed by template parameter depth, + // referenced like "TL2_4_". If nonempty, element 0 is always + // OuterTemplateParams; inner elements are always template parameter lists of + // lambda expressions. For a generic lambda with no explicit template + // parameter list, the corresponding parameter list pointer will be null. 
+ PODSmallVector TemplateParams; // Set of unresolved forward references. These can occur in a // conversion operator's type, and are resolved in the enclosing . @@ -2178,7 +2378,9 @@ template struct AbstractManglingParser { bool TryToParseTemplateArgs = true; bool PermitForwardTemplateReferences = false; - bool ParsingLambdaParams = false; + size_t ParsingLambdaParamsAtLevel = (size_t)-1; + + unsigned NumSyntheticTemplateParameters[3] = {}; Alloc ASTAllocator; @@ -2193,9 +2395,11 @@ template struct AbstractManglingParser { Names.clear(); Subs.clear(); TemplateParams.clear(); - ParsingLambdaParams = false; + ParsingLambdaParamsAtLevel = (size_t)-1; TryToParseTemplateArgs = true; PermitForwardTemplateReferences = false; + for (int I = 0; I != 3; ++I) + NumSyntheticTemplateParameters[I] = 0; ASTAllocator.reset(); } @@ -2253,6 +2457,7 @@ template struct AbstractManglingParser { bool parseSeqId(size_t *Out); Node *parseSubstitution(); Node *parseTemplateParam(); + Node *parseTemplateParamDecl(); Node *parseTemplateArgs(bool TagTemplates = false); Node *parseTemplateArg(); @@ -2301,9 +2506,10 @@ template struct AbstractManglingParser { size_t E = ForwardTemplateRefs.size(); for (; I < E; ++I) { size_t Idx = ForwardTemplateRefs[I]->Index; - if (Idx >= TemplateParams.size()) + if (TemplateParams.empty() || !TemplateParams[0] || + Idx >= TemplateParams[0]->size()) return true; - ForwardTemplateRefs[I]->Ref = TemplateParams[Idx]; + ForwardTemplateRefs[I]->Ref = (*TemplateParams[0])[Idx]; } ForwardTemplateRefs.dropBack(State.ForwardTemplateRefsBegin); return false; @@ -2470,7 +2676,12 @@ AbstractManglingParser::parseUnqualifiedName(NameState *State) { // ::= + # Parameter types or "v" if the lambda has no parameters template Node * -AbstractManglingParser::parseUnnamedTypeName(NameState *) { +AbstractManglingParser::parseUnnamedTypeName(NameState *State) { + // refer to the innermost . Clear out any + // outer args that we may have inserted into TemplateParams. + if (State != nullptr) + TemplateParams.clear(); + if (consumeIf("Ut")) { StringView Count = parseNumber(); if (!consumeIf('_')) @@ -2478,22 +2689,59 @@ AbstractManglingParser::parseUnnamedTypeName(NameState *) { return make(Count); } if (consumeIf("Ul")) { - NodeArray Params; - SwapAndRestore SwapParams(ParsingLambdaParams, true); + SwapAndRestore SwapParams(ParsingLambdaParamsAtLevel, + TemplateParams.size()); + ScopedTemplateParamList LambdaTemplateParams(this); + + size_t ParamsBegin = Names.size(); + while (look() == 'T' && + StringView("yptn").find(look(1)) != StringView::npos) { + Node *T = parseTemplateParamDecl(); + if (!T) + return nullptr; + Names.push_back(T); + } + NodeArray TempParams = popTrailingNodeArray(ParamsBegin); + + // FIXME: If TempParams is empty and none of the function parameters + // includes 'auto', we should remove LambdaTemplateParams from the + // TemplateParams list. Unfortunately, we don't find out whether there are + // any 'auto' parameters until too late in an example such as: + // + // template void f( + // decltype([](decltype([](T v) {}), + // auto) {})) {} + // template void f( + // decltype([](decltype([](T w) {}), + // int) {})) {} + // + // Here, the type of v is at level 2 but the type of w is at level 1. We + // don't find this out until we encounter the type of the next parameter. + // + // However, compilers can't actually cope with the former example in + // practice, and it's likely to be made ill-formed in future, so we don't + // need to support it here. 
+ // + // If we encounter an 'auto' in the function parameter types, we will + // recreate a template parameter scope for it, but any intervening lambdas + // will be parsed in the 'wrong' template parameter depth. + if (TempParams.empty()) + TemplateParams.pop_back(); + if (!consumeIf("vE")) { - size_t ParamsBegin = Names.size(); do { Node *P = getDerived().parseType(); if (P == nullptr) return nullptr; Names.push_back(P); } while (!consumeIf('E')); - Params = popTrailingNodeArray(ParamsBegin); } + NodeArray Params = popTrailingNodeArray(ParamsBegin); + StringView Count = parseNumber(); if (!consumeIf('_')) return nullptr; - return make(Params, Count); + return make(TempParams, Params, Count); } if (consumeIf("Ub")) { (void)parseNumber(); @@ -3949,6 +4197,7 @@ Node *AbstractManglingParser::parseConversionExpr() { // ::= L E # floating literal // ::= L E # string literal // ::= L E # nullptr literal (i.e., "LDnE") +// ::= L E # lambda expression // FIXME: ::= L _ E # complex floating point literal (C 2000) // ::= L E # external name template @@ -4020,24 +4269,43 @@ Node *AbstractManglingParser::parseExprPrimary() { return R; } return nullptr; + case 'A': { + Node *T = getDerived().parseType(); + if (T == nullptr) + return nullptr; + // FIXME: We need to include the string contents in the mangling. + if (consumeIf('E')) + return make(T); + return nullptr; + } + case 'D': + if (consumeIf("DnE")) + return make("nullptr"); + return nullptr; case 'T': // Invalid mangled name per // http://sourcerytools.com/pipermail/cxx-abi-dev/2011-August/002422.html return nullptr; + case 'U': { + // FIXME: Should we support LUb... for block literals? + if (look(1) != 'l') + return nullptr; + Node *T = parseUnnamedTypeName(nullptr); + if (!T || !consumeIf('E')) + return nullptr; + return make(T); + } default: { // might be named type Node *T = getDerived().parseType(); if (T == nullptr) return nullptr; StringView N = parseNumber(); - if (!N.empty()) { - if (!consumeIf('E')) - return nullptr; - return make(T, N); - } - if (consumeIf('E')) - return T; - return nullptr; + if (N.empty()) + return nullptr; + if (!consumeIf('E')) + return nullptr; + return make(T, N); } } } @@ -5062,11 +5330,22 @@ Node *AbstractManglingParser::parseSubstitution() { // ::= T_ # first template parameter // ::= T _ +// ::= TL __ +// ::= TL _ _ template Node *AbstractManglingParser::parseTemplateParam() { if (!consumeIf('T')) return nullptr; + size_t Level = 0; + if (consumeIf('L')) { + if (parsePositiveInteger(&Level)) + return nullptr; + ++Level; + if (!consumeIf('_')) + return nullptr; + } + size_t Index = 0; if (!consumeIf('_')) { if (parsePositiveInteger(&Index)) @@ -5076,15 +5355,11 @@ Node *AbstractManglingParser::parseTemplateParam() { return nullptr; } - // Itanium ABI 5.1.8: In a generic lambda, uses of auto in the parameter list - // are mangled as the corresponding artificial template type parameter. - if (ParsingLambdaParams) - return make("auto"); - // If we're in a context where this refers to a // further ahead in the mangled name (currently just conversion // operator types), then we should only look it up in the right context. - if (PermitForwardTemplateReferences) { + // This can only happen at the outermost level. 
+ if (PermitForwardTemplateReferences && Level == 0) { Node *ForwardRef = make(Index); if (!ForwardRef) return nullptr; @@ -5094,9 +5369,78 @@ Node *AbstractManglingParser::parseTemplateParam() { return ForwardRef; } - if (Index >= TemplateParams.size()) + if (Level >= TemplateParams.size() || !TemplateParams[Level] || + Index >= TemplateParams[Level]->size()) { + // Itanium ABI 5.1.8: In a generic lambda, uses of auto in the parameter + // list are mangled as the corresponding artificial template type parameter. + if (ParsingLambdaParamsAtLevel == Level && Level <= TemplateParams.size()) { + // This will be popped by the ScopedTemplateParamList in + // parseUnnamedTypeName. + if (Level == TemplateParams.size()) + TemplateParams.push_back(nullptr); + return make("auto"); + } + return nullptr; - return TemplateParams[Index]; + } + + return (*TemplateParams[Level])[Index]; +} + +// ::= Ty # type parameter +// ::= Tn # non-type parameter +// ::= Tt * E # template parameter +// ::= Tp # parameter pack +template +Node *AbstractManglingParser::parseTemplateParamDecl() { + auto InventTemplateParamName = [&](TemplateParamKind Kind) { + unsigned Index = NumSyntheticTemplateParameters[(int)Kind]++; + Node *N = make(Kind, Index); + if (N) TemplateParams.back()->push_back(N); + return N; + }; + + if (consumeIf("Ty")) { + Node *Name = InventTemplateParamName(TemplateParamKind::Type); + if (!Name) + return nullptr; + return make(Name); + } + + if (consumeIf("Tn")) { + Node *Name = InventTemplateParamName(TemplateParamKind::NonType); + if (!Name) + return nullptr; + Node *Type = parseType(); + if (!Type) + return nullptr; + return make(Name, Type); + } + + if (consumeIf("Tt")) { + Node *Name = InventTemplateParamName(TemplateParamKind::Template); + if (!Name) + return nullptr; + size_t ParamsBegin = Names.size(); + ScopedTemplateParamList TemplateTemplateParamParams(this); + while (!consumeIf("E")) { + Node *P = parseTemplateParamDecl(); + if (!P) + return nullptr; + Names.push_back(P); + } + NodeArray Params = popTrailingNodeArray(ParamsBegin); + return make(Name, Params); + } + + if (consumeIf("Tp")) { + Node *P = parseTemplateParamDecl(); + if (!P) + return nullptr; + return make(P); + } + + return nullptr; } // ::= # type or template @@ -5153,8 +5497,11 @@ AbstractManglingParser::parseTemplateArgs(bool TagTemplates) { // refer to the innermost . Clear out any // outer args that we may have inserted into TemplateParams. 
- if (TagTemplates) + if (TagTemplates) { TemplateParams.clear(); + TemplateParams.push_back(&OuterTemplateParams); + OuterTemplateParams.clear(); + } size_t ArgsBegin = Names.size(); while (!consumeIf('E')) { @@ -5172,7 +5519,7 @@ AbstractManglingParser::parseTemplateArgs(bool TagTemplates) { if (!TableEntry) return nullptr; } - TemplateParams.push_back(TableEntry); + TemplateParams.back()->push_back(TableEntry); } else { Node *Arg = getDerived().parseTemplateArg(); if (Arg == nullptr) diff --git a/include/llvm/Demangle/MicrosoftDemangle.h b/include/llvm/Demangle/MicrosoftDemangle.h index 382e79401c43..c6f26061bedd 100644 --- a/include/llvm/Demangle/MicrosoftDemangle.h +++ b/include/llvm/Demangle/MicrosoftDemangle.h @@ -158,6 +158,7 @@ private: QualifiedNameNode *QN); SymbolNode *demangleDeclarator(StringView &MangledName); SymbolNode *demangleMD5Name(StringView &MangledName); + SymbolNode *demangleTypeinfoName(StringView &MangledName); VariableSymbolNode *demangleVariableEncoding(StringView &MangledName, StorageClass SC); diff --git a/include/llvm/Demangle/MicrosoftDemangleNodes.h b/include/llvm/Demangle/MicrosoftDemangleNodes.h index da9d9d5bfdc0..81b279fe237d 100644 --- a/include/llvm/Demangle/MicrosoftDemangleNodes.h +++ b/include/llvm/Demangle/MicrosoftDemangleNodes.h @@ -16,6 +16,8 @@ #include "llvm/Demangle/DemangleConfig.h" #include "llvm/Demangle/StringView.h" #include +#include +#include namespace llvm { namespace itanium_demangle { @@ -73,6 +75,9 @@ enum OutputFlags { OF_Default = 0, OF_NoCallingConvention = 1, OF_NoTagSpecifier = 2, + OF_NoAccessSpecifier = 4, + OF_NoMemberType = 8, + OF_NoReturnType = 16, }; // Types @@ -301,8 +306,6 @@ struct TypeNode : public Node { outputPost(OS, Flags); } - void outputQuals(bool SpaceBefore, bool SpaceAfter) const; - Qualifiers Quals = Q_None; }; diff --git a/include/llvm/ExecutionEngine/JITLink/EHFrameSupport.h b/include/llvm/ExecutionEngine/JITLink/EHFrameSupport.h index 8d2f641254b3..72687682f606 100644 --- a/include/llvm/ExecutionEngine/JITLink/EHFrameSupport.h +++ b/include/llvm/ExecutionEngine/JITLink/EHFrameSupport.h @@ -22,17 +22,21 @@ namespace llvm { namespace jitlink { /// Registers all FDEs in the given eh-frame section with the current process. -Error registerEHFrameSection(const void *EHFrameSectionAddr); +Error registerEHFrameSection(const void *EHFrameSectionAddr, + size_t EHFrameSectionSize); /// Deregisters all FDEs in the given eh-frame section with the current process. -Error deregisterEHFrameSection(const void *EHFrameSectionAddr); +Error deregisterEHFrameSection(const void *EHFrameSectionAddr, + size_t EHFrameSectionSize); /// Supports registration/deregistration of EH-frames in a target process. class EHFrameRegistrar { public: virtual ~EHFrameRegistrar(); - virtual Error registerEHFrames(JITTargetAddress EHFrameSectionAddr) = 0; - virtual Error deregisterEHFrames(JITTargetAddress EHFrameSectionAddr) = 0; + virtual Error registerEHFrames(JITTargetAddress EHFrameSectionAddr, + size_t EHFrameSectionSize) = 0; + virtual Error deregisterEHFrames(JITTargetAddress EHFrameSectionAddr, + size_t EHFrameSectionSize) = 0; }; /// Registers / Deregisters EH-frames in the current process. 
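Since EH-frame registration now carries both the section address and its size, out-of-tree registrars have to adopt the two-argument interface. Below is a hedged sketch of a custom EHFrameRegistrar written against the declarations above; the logging-only bodies are placeholders, not how InProcessEHFrameRegistrar is implemented.

#include "llvm/ExecutionEngine/JITLink/EHFrameSupport.h"
#include "llvm/Support/raw_ostream.h"

namespace {
// Sketch of adapting a custom registrar to the new (address, size) API.
class LoggingEHFrameRegistrar : public llvm::jitlink::EHFrameRegistrar {
public:
  llvm::Error registerEHFrames(llvm::JITTargetAddress EHFrameSectionAddr,
                               size_t EHFrameSectionSize) override {
    llvm::errs() << "register eh-frame [" << EHFrameSectionAddr << ", +"
                 << EHFrameSectionSize << ")\n";
    return llvm::Error::success();
  }
  llvm::Error deregisterEHFrames(llvm::JITTargetAddress EHFrameSectionAddr,
                                 size_t EHFrameSectionSize) override {
    llvm::errs() << "deregister eh-frame [" << EHFrameSectionAddr << ", +"
                 << EHFrameSectionSize << ")\n";
    return llvm::Error::success();
  }
};
} // end anonymous namespace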
@@ -48,31 +52,38 @@ public: InProcessEHFrameRegistrar(InProcessEHFrameRegistrar &&) = delete; InProcessEHFrameRegistrar &operator=(InProcessEHFrameRegistrar &&) = delete; - Error registerEHFrames(JITTargetAddress EHFrameSectionAddr) override { + Error registerEHFrames(JITTargetAddress EHFrameSectionAddr, + size_t EHFrameSectionSize) override { return registerEHFrameSection( - jitTargetAddressToPointer(EHFrameSectionAddr)); + jitTargetAddressToPointer(EHFrameSectionAddr), + EHFrameSectionSize); } - Error deregisterEHFrames(JITTargetAddress EHFrameSectionAddr) override { + Error deregisterEHFrames(JITTargetAddress EHFrameSectionAddr, + size_t EHFrameSectionSize) override { return deregisterEHFrameSection( - jitTargetAddressToPointer(EHFrameSectionAddr)); + jitTargetAddressToPointer(EHFrameSectionAddr), + EHFrameSectionSize); } private: InProcessEHFrameRegistrar(); }; -using StoreFrameAddressFunction = std::function; +using StoreFrameRangeFunction = + std::function; -/// Creates a pass that records the address of the EH frame section. If no -/// eh-frame section is found, it will set EHFrameAddr to zero. +/// Creates a pass that records the address and size of the EH frame section. +/// If no eh-frame section is found then the address and size will both be given +/// as zero. /// /// Authors of JITLinkContexts can use this function to register a post-fixup -/// pass that records the address of the eh-frame section. This address can +/// pass that records the range of the eh-frame section. This range can /// be used after finalization to register and deregister the frame. -AtomGraphPassFunction +LinkGraphPassFunction createEHFrameRecorderPass(const Triple &TT, - StoreFrameAddressFunction StoreFrameAddress); + StoreFrameRangeFunction StoreFrameRange); } // end namespace jitlink } // end namespace llvm diff --git a/include/llvm/ExecutionEngine/JITLink/JITLink.h b/include/llvm/ExecutionEngine/JITLink/JITLink.h index be80d44ccf51..b531127cf892 100644 --- a/include/llvm/ExecutionEngine/JITLink/JITLink.h +++ b/include/llvm/ExecutionEngine/JITLink/JITLink.h @@ -34,6 +34,9 @@ namespace llvm { namespace jitlink { +class Symbol; +class Section; + /// Base class for errors originating in JIT linker, e.g. missing relocation /// support. class JITLinkError : public ErrorInfo { @@ -50,27 +53,22 @@ private: std::string ErrMsg; }; -// Forward declare the Atom class. -class Atom; - -/// Edge class. Represents both object file relocations, as well as layout and -/// keep-alive constraints. +/// Represents fixups and constraints in the LinkGraph. class Edge { public: using Kind = uint8_t; - using GenericEdgeKind = enum : Kind { + enum GenericEdgeKind : Kind { Invalid, // Invalid edge value. FirstKeepAlive, // Keeps target alive. Offset/addend zero. KeepAlive = FirstKeepAlive, // Tag first edge kind that preserves liveness. - LayoutNext, // Layout constraint. Offset/Addend zero. FirstRelocation // First architecture specific relocation. 
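// [Editor's note] Hedged sketch, not part of the patch: appending the new
// eh-frame recorder pass, which now reports an (address, size) range, to a
// PassConfiguration. The Triple value and the two out-parameters are
// illustrative assumptions, not names defined by this header.
#include "llvm/ADT/Triple.h"
#include "llvm/ExecutionEngine/JITLink/EHFrameSupport.h"
#include "llvm/ExecutionEngine/JITLink/JITLink.h"

void addEHFrameRecorder(llvm::jitlink::PassConfiguration &Config,
                        llvm::JITTargetAddress &EHFrameAddr,
                        size_t &EHFrameSize) {
  llvm::Triple TT("x86_64-apple-darwin"); // assumed target, for illustration
  Config.PostFixupPasses.push_back(llvm::jitlink::createEHFrameRecorderPass(
      TT, [&](llvm::JITTargetAddress Addr, size_t Size) {
        // Both values are zero if the object has no eh-frame section.
        EHFrameAddr = Addr;
        EHFrameSize = Size;
      }));
}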
}; using OffsetT = uint32_t; using AddendT = int64_t; - Edge(Kind K, OffsetT Offset, Atom &Target, AddendT Addend) + Edge(Kind K, OffsetT Offset, Symbol &Target, AddendT Addend) : Target(&Target), Offset(Offset), Addend(Addend), K(K) {} OffsetT getOffset() const { return Offset; } @@ -82,461 +80,637 @@ public: return K - FirstRelocation; } bool isKeepAlive() const { return K >= FirstKeepAlive; } - Atom &getTarget() const { return *Target; } - void setTarget(Atom &Target) { this->Target = &Target; } + Symbol &getTarget() const { return *Target; } + void setTarget(Symbol &Target) { this->Target = &Target; } AddendT getAddend() const { return Addend; } void setAddend(AddendT Addend) { this->Addend = Addend; } private: - Atom *Target; - OffsetT Offset; - AddendT Addend; + Symbol *Target = nullptr; + OffsetT Offset = 0; + AddendT Addend = 0; Kind K = 0; }; -using EdgeVector = std::vector; +/// Returns the string name of the given generic edge kind, or "unknown" +/// otherwise. Useful for debugging. +const char *getGenericEdgeKindName(Edge::Kind K); -const StringRef getGenericEdgeKindName(Edge::Kind K); - -/// Base Atom class. Used by absolute and undefined atoms. -class Atom { - friend class AtomGraph; +/// Base class for Addressable entities (externals, absolutes, blocks). +class Addressable { + friend class LinkGraph; protected: - /// Create a named (as yet unresolved) atom. - Atom(StringRef Name) - : Name(Name), IsDefined(false), IsLive(false), ShouldDiscard(false), - IsGlobal(false), IsAbsolute(false), IsCallable(false), - IsExported(false), IsWeak(false), HasLayoutNext(false), - IsCommon(false) {} - - /// Create an absolute symbol atom. - Atom(StringRef Name, JITTargetAddress Address) - : Name(Name), Address(Address), IsDefined(true), IsLive(false), - ShouldDiscard(false), IsGlobal(false), IsAbsolute(false), - IsCallable(false), IsExported(false), IsWeak(false), - HasLayoutNext(false), IsCommon(false) {} + Addressable(JITTargetAddress Address, bool IsDefined) + : Address(Address), IsDefined(IsDefined), IsAbsolute(false) {} -public: - /// Returns true if this atom has a name. - bool hasName() const { return Name != StringRef(); } + Addressable(JITTargetAddress Address) + : Address(Address), IsDefined(false), IsAbsolute(true) { + assert(!(IsDefined && IsAbsolute) && + "Block cannot be both defined and absolute"); + } - /// Returns the name of this atom. - StringRef getName() const { return Name; } +public: + Addressable(const Addressable &) = delete; + Addressable &operator=(const Addressable &) = default; + Addressable(Addressable &&) = delete; + Addressable &operator=(Addressable &&) = default; - /// Returns the current target address of this atom. - /// The initial target address (for atoms that have one) will be taken from - /// the input object file's virtual address space. During the layout phase - /// of JIT linking the atom's address will be updated to point to its final - /// address in the JIT'd process. JITTargetAddress getAddress() const { return Address; } - - /// Set the current target address of this atom. void setAddress(JITTargetAddress Address) { this->Address = Address; } - /// Returns true if this is a defined atom. - bool isDefined() const { return IsDefined; } + /// Returns true if this is a defined addressable, in which case you + /// can downcast this to a . + bool isDefined() const { return static_cast(IsDefined); } + bool isAbsolute() const { return static_cast(IsAbsolute); } - /// Returns true if this atom is marked as live. 
- bool isLive() const { return IsLive; } +private: + JITTargetAddress Address = 0; + uint64_t IsDefined : 1; + uint64_t IsAbsolute : 1; +}; - /// Mark this atom as live. - /// - /// Note: Only defined and absolute atoms can be marked live. - void setLive(bool IsLive) { - assert((IsDefined || IsAbsolute || !IsLive) && - "Only defined and absolute atoms can be marked live"); - this->IsLive = IsLive; - } +using BlockOrdinal = unsigned; +using SectionOrdinal = unsigned; - /// Returns true if this atom should be discarded during pruning. - bool shouldDiscard() const { return ShouldDiscard; } +/// An Addressable with content and edges. +class Block : public Addressable { + friend class LinkGraph; - /// Mark this atom to be discarded. - /// - /// Note: Only defined and absolute atoms can be marked live. - void setShouldDiscard(bool ShouldDiscard) { - assert((IsDefined || IsAbsolute || !ShouldDiscard) && - "Only defined and absolute atoms can be marked live"); - this->ShouldDiscard = ShouldDiscard; +private: + /// Create a zero-fill defined addressable. + Block(Section &Parent, BlockOrdinal Ordinal, JITTargetAddress Size, + JITTargetAddress Address, uint64_t Alignment, uint64_t AlignmentOffset) + : Addressable(Address, true), Parent(Parent), Size(Size), + Ordinal(Ordinal) { + assert(isPowerOf2_64(Alignment) && "Alignment must be power of 2"); + assert(AlignmentOffset < Alignment && + "Alignment offset cannot exceed alignment"); + assert(AlignmentOffset <= MaxAlignmentOffset && + "Alignment offset exceeds maximum"); + P2Align = Alignment ? countTrailingZeros(Alignment) : 0; + this->AlignmentOffset = AlignmentOffset; } - /// Returns true if this definition is global (i.e. visible outside this - /// linkage unit). - /// - /// Note: This is distict from Exported, which means visibile outside the - /// JITDylib that this graph is being linked in to. - bool isGlobal() const { return IsGlobal; } + /// Create a defined addressable for the given content. + Block(Section &Parent, BlockOrdinal Ordinal, StringRef Content, + JITTargetAddress Address, uint64_t Alignment, uint64_t AlignmentOffset) + : Addressable(Address, true), Parent(Parent), Data(Content.data()), + Size(Content.size()), Ordinal(Ordinal) { + assert(isPowerOf2_64(Alignment) && "Alignment must be power of 2"); + assert(AlignmentOffset < Alignment && + "Alignment offset cannot exceed alignment"); + assert(AlignmentOffset <= MaxAlignmentOffset && + "Alignment offset exceeds maximum"); + P2Align = Alignment ? countTrailingZeros(Alignment) : 0; + this->AlignmentOffset = AlignmentOffset; + } - /// Mark this atom as global. - void setGlobal(bool IsGlobal) { this->IsGlobal = IsGlobal; } +public: + using EdgeVector = std::vector; + using edge_iterator = EdgeVector::iterator; + using const_edge_iterator = EdgeVector::const_iterator; - /// Returns true if this atom represents an absolute symbol. - bool isAbsolute() const { return IsAbsolute; } + Block(const Block &) = delete; + Block &operator=(const Block &) = delete; + Block(Block &&) = delete; + Block &operator=(Block &&) = delete; - /// Returns true if this atom is known to be callable. + /// Return the parent section for this block. + Section &getSection() const { return Parent; } + + /// Return the ordinal for this block. + BlockOrdinal getOrdinal() const { return Ordinal; } + + /// Returns true if this is a zero-fill block. /// - /// Primarily provided for easy interoperability with ORC, which uses the - /// JITSymbolFlags::Common flag to identify symbols that can be interposed - /// with stubs. 
- bool isCallable() const { return IsCallable; } + /// If true, getSize is callable but getContent is not (the content is + /// defined to be a sequence of zero bytes of length Size). + bool isZeroFill() const { return !Data; } + + /// Returns the size of this defined addressable. + size_t getSize() const { return Size; } + + /// Get the content for this block. Block must not be a zero-fill block. + StringRef getContent() const { + assert(Data && "Section does not contain content"); + return StringRef(Data, Size); + } - /// Mark this atom as callable. - void setCallable(bool IsCallable) { - assert((IsDefined || IsAbsolute || !IsCallable) && - "Callable atoms must be defined or absolute"); - this->IsCallable = IsCallable; + /// Set the content for this block. + /// Caller is responsible for ensuring the underlying bytes are not + /// deallocated while pointed to by this block. + void setContent(StringRef Content) { + Data = Content.data(); + Size = Content.size(); } - /// Returns true if this atom should appear in the symbol table of a final - /// linked image. - bool isExported() const { return IsExported; } + /// Get the alignment for this content. + uint64_t getAlignment() const { return 1ull << P2Align; } + + /// Get the alignment offset for this content. + uint64_t getAlignmentOffset() const { return AlignmentOffset; } - /// Mark this atom as exported. - void setExported(bool IsExported) { - assert((!IsExported || ((IsDefined || IsAbsolute) && hasName())) && - "Exported atoms must have names"); - this->IsExported = IsExported; + /// Add an edge to this block. + void addEdge(Edge::Kind K, Edge::OffsetT Offset, Symbol &Target, + Edge::AddendT Addend) { + Edges.push_back(Edge(K, Offset, Target, Addend)); } - /// Returns true if this is a weak symbol. - bool isWeak() const { return IsWeak; } + /// Return the list of edges attached to this content. + iterator_range edges() { + return make_range(Edges.begin(), Edges.end()); + } - /// Mark this atom as weak. - void setWeak(bool IsWeak) { this->IsWeak = IsWeak; } + /// Returns the list of edges attached to this content. + iterator_range edges() const { + return make_range(Edges.begin(), Edges.end()); + } -private: - StringRef Name; - JITTargetAddress Address = 0; + /// Return the size of the edges list. + size_t edges_size() const { return Edges.size(); } - bool IsDefined : 1; - bool IsLive : 1; - bool ShouldDiscard : 1; + /// Returns true if the list of edges is empty. + bool edges_empty() const { return Edges.empty(); } - bool IsGlobal : 1; - bool IsAbsolute : 1; - bool IsCallable : 1; - bool IsExported : 1; - bool IsWeak : 1; +private: + static constexpr uint64_t MaxAlignmentOffset = (1ULL << 57) - 1; -protected: - // These flags only make sense for DefinedAtom, but we can minimize the size - // of DefinedAtom by defining them here. - bool HasLayoutNext : 1; - bool IsCommon : 1; + uint64_t P2Align : 5; + uint64_t AlignmentOffset : 57; + Section &Parent; + const char *Data = nullptr; + size_t Size = 0; + BlockOrdinal Ordinal = 0; + std::vector Edges; }; -// Forward declare DefinedAtom. -class DefinedAtom; +/// Describes symbol linkage. This can be used to make resolve definition +/// clashes. +enum class Linkage : uint8_t { + Strong, + Weak, +}; -raw_ostream &operator<<(raw_ostream &OS, const Atom &A); -void printEdge(raw_ostream &OS, const Atom &FixupAtom, const Edge &E, - StringRef EdgeKindName); +/// For errors and debugging output. 
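// [Editor's note] Hedged sketch, not part of the patch: attaching and
// inspecting edges on a Block through the interface above. The Block and the
// target Symbol are assumed to come from an existing LinkGraph; KeepAlive is
// the generic liveness edge kind declared on Edge.
#include "llvm/ExecutionEngine/JITLink/JITLink.h"

void keepTargetAlive(llvm::jitlink::Block &B, llvm::jitlink::Symbol &Target) {
  // A keep-alive edge pins Target's liveness; it carries no offset or addend.
  B.addEdge(llvm::jitlink::Edge::KeepAlive, 0, Target, 0);
}

size_t countKeepAliveEdges(const llvm::jitlink::Block &B) {
  size_t N = 0;
  for (const llvm::jitlink::Edge &E : B.edges())
    if (E.isKeepAlive())
      ++N;
  return N;
}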
+const char *getLinkageName(Linkage L); + +/// Defines the scope in which this symbol should be visible: +/// Default -- Visible in the public interface of the linkage unit. +/// Hidden -- Visible within the linkage unit, but not exported from it. +/// Local -- Visible only within the LinkGraph. +enum class Scope : uint8_t { Default, Hidden, Local }; + +/// For debugging output. +const char *getScopeName(Scope S); + +raw_ostream &operator<<(raw_ostream &OS, const Block &B); + +/// Symbol representation. +/// +/// Symbols represent locations within Addressable objects. +/// They can be either Named or Anonymous. +/// Anonymous symbols have neither linkage nor visibility, and must point at +/// ContentBlocks. +/// Named symbols may be in one of four states: +/// - Null: Default initialized. Assignable, but otherwise unusable. +/// - Defined: Has both linkage and visibility and points to a ContentBlock +/// - Common: Has both linkage and visibility, points to a null Addressable. +/// - External: Has neither linkage nor visibility, points to an external +/// Addressable. +/// +class Symbol { + friend class LinkGraph; + +private: + Symbol(Addressable &Base, JITTargetAddress Offset, StringRef Name, + JITTargetAddress Size, Linkage L, Scope S, bool IsLive, + bool IsCallable) + : Name(Name), Base(&Base), Offset(Offset), Size(Size) { + setLinkage(L); + setScope(S); + setLive(IsLive); + setCallable(IsCallable); + } + + static Symbol &constructCommon(void *SymStorage, Block &Base, StringRef Name, + JITTargetAddress Size, Scope S, bool IsLive) { + assert(SymStorage && "Storage cannot be null"); + assert(!Name.empty() && "Common symbol name cannot be empty"); + assert(Base.isDefined() && + "Cannot create common symbol from undefined block"); + assert(static_cast(Base).getSize() == Size && + "Common symbol size should match underlying block size"); + auto *Sym = reinterpret_cast(SymStorage); + new (Sym) Symbol(Base, 0, Name, Size, Linkage::Weak, S, IsLive, false); + return *Sym; + } + + static Symbol &constructExternal(void *SymStorage, Addressable &Base, + StringRef Name, JITTargetAddress Size) { + assert(SymStorage && "Storage cannot be null"); + assert(!Base.isDefined() && + "Cannot create external symbol from defined block"); + assert(!Name.empty() && "External symbol name cannot be empty"); + auto *Sym = reinterpret_cast(SymStorage); + new (Sym) Symbol(Base, 0, Name, Size, Linkage::Strong, Scope::Default, + false, false); + return *Sym; + } + + static Symbol &constructAbsolute(void *SymStorage, Addressable &Base, + StringRef Name, JITTargetAddress Size, + Linkage L, Scope S, bool IsLive) { + assert(SymStorage && "Storage cannot be null"); + assert(!Base.isDefined() && + "Cannot create absolute symbol from a defined block"); + auto *Sym = reinterpret_cast(SymStorage); + new (Sym) Symbol(Base, 0, Name, Size, L, S, IsLive, false); + return *Sym; + } + + static Symbol &constructAnonDef(void *SymStorage, Block &Base, + JITTargetAddress Offset, + JITTargetAddress Size, bool IsCallable, + bool IsLive) { + assert(SymStorage && "Storage cannot be null"); + auto *Sym = reinterpret_cast(SymStorage); + new (Sym) Symbol(Base, Offset, StringRef(), Size, Linkage::Strong, + Scope::Local, IsLive, IsCallable); + return *Sym; + } + + static Symbol &constructNamedDef(void *SymStorage, Block &Base, + JITTargetAddress Offset, StringRef Name, + JITTargetAddress Size, Linkage L, Scope S, + bool IsLive, bool IsCallable) { + assert(SymStorage && "Storage cannot be null"); + assert(!Name.empty() && "Name cannot be empty"); + 
auto *Sym = reinterpret_cast(SymStorage); + new (Sym) Symbol(Base, Offset, Name, Size, L, S, IsLive, IsCallable); + return *Sym; + } -/// Represents a section address range via a pair of DefinedAtom pointers to -/// the first and last atoms in the section. -class SectionRange { public: - SectionRange() = default; - SectionRange(DefinedAtom *First, DefinedAtom *Last) - : First(First), Last(Last) {} - DefinedAtom *getFirstAtom() const { - assert((!Last || First) && "First can not be null if end is non-null"); - return First; + /// Create a null Symbol. This allows Symbols to be default initialized for + /// use in containers (e.g. as map values). Null symbols are only useful for + /// assigning to. + Symbol() = default; + + // Symbols are not movable or copyable. + Symbol(const Symbol &) = delete; + Symbol &operator=(const Symbol &) = delete; + Symbol(Symbol &&) = delete; + Symbol &operator=(Symbol &&) = delete; + + /// Returns true if this symbol has a name. + bool hasName() const { return !Name.empty(); } + + /// Returns the name of this symbol (empty if the symbol is anonymous). + StringRef getName() const { + assert((!Name.empty() || getScope() == Scope::Local) && + "Anonymous symbol has non-local scope"); + return Name; } - DefinedAtom *getLastAtom() const { - assert((First || !Last) && "Last can not be null if start is non-null"); - return Last; + + /// Returns true if this Symbol has content (potentially) defined within this + /// object file (i.e. is anything but an external or absolute symbol). + bool isDefined() const { + assert(Base && "Attempt to access null symbol"); + return Base->isDefined(); } - bool isEmpty() const { - assert((First || !Last) && "Last can not be null if start is non-null"); - return !First; + + /// Returns true if this symbol is live (i.e. should be treated as a root for + /// dead stripping). + bool isLive() const { + assert(Base && "Attempting to access null symbol"); + return IsLive; } - JITTargetAddress getStart() const; - JITTargetAddress getEnd() const; - uint64_t getSize() const; -private: - DefinedAtom *First = nullptr; - DefinedAtom *Last = nullptr; -}; + /// Set this symbol's live bit. + void setLive(bool IsLive) { this->IsLive = IsLive; } -/// Represents an object file section. -class Section { - friend class AtomGraph; + /// Returns true is this symbol is callable. + bool isCallable() const { return IsCallable; } -private: - Section(StringRef Name, uint32_t Alignment, sys::Memory::ProtectionFlags Prot, - unsigned Ordinal, bool IsZeroFill) - : Name(Name), Alignment(Alignment), Prot(Prot), Ordinal(Ordinal), - IsZeroFill(IsZeroFill) { - assert(isPowerOf2_32(Alignment) && "Alignments must be a power of 2"); + /// Set this symbol's callable bit. + void setCallable(bool IsCallable) { this->IsCallable = IsCallable; } + + /// Returns true if the underlying addressable is an unresolved external. + bool isExternal() const { + assert(Base && "Attempt to access null symbol"); + return !Base->isDefined() && !Base->isAbsolute(); } - using DefinedAtomSet = DenseSet; + /// Returns true if the underlying addressable is an absolute symbol. + bool isAbsolute() const { + assert(Base && "Attempt to access null symbol"); + return !Base->isDefined() && Base->isAbsolute(); + } -public: - using atom_iterator = DefinedAtomSet::iterator; - using const_atom_iterator = DefinedAtomSet::const_iterator; + /// Return the addressable that this symbol points to. 
+ Addressable &getAddressable() { + assert(Base && "Cannot get underlying addressable for null symbol"); + return *Base; + } - ~Section(); - StringRef getName() const { return Name; } - uint32_t getAlignment() const { return Alignment; } - sys::Memory::ProtectionFlags getProtectionFlags() const { return Prot; } - unsigned getSectionOrdinal() const { return Ordinal; } - size_t getNextAtomOrdinal() { return ++NextAtomOrdinal; } + /// Return the addressable that thsi symbol points to. + const Addressable &getAddressable() const { + assert(Base && "Cannot get underlying addressable for null symbol"); + return *Base; + } - bool isZeroFill() const { return IsZeroFill; } + /// Return the Block for this Symbol (Symbol must be defined). + Block &getBlock() { + assert(Base && "Cannot get block for null symbol"); + assert(Base->isDefined() && "Not a defined symbol"); + return static_cast(*Base); + } - /// Returns an iterator over the atoms in the section (in no particular - /// order). - iterator_range atoms() { - return make_range(DefinedAtoms.begin(), DefinedAtoms.end()); + /// Return the Block for this Symbol (Symbol must be defined). + const Block &getBlock() const { + assert(Base && "Cannot get block for null symbol"); + assert(Base->isDefined() && "Not a defined symbol"); + return static_cast(*Base); } - /// Returns an iterator over the atoms in the section (in no particular - /// order). - iterator_range atoms() const { - return make_range(DefinedAtoms.begin(), DefinedAtoms.end()); + /// Returns the offset for this symbol within the underlying addressable. + JITTargetAddress getOffset() const { return Offset; } + + /// Returns the address of this symbol. + JITTargetAddress getAddress() const { return Base->getAddress() + Offset; } + + /// Returns the size of this symbol. + JITTargetAddress getSize() const { return Size; } + + /// Returns true if this symbol is backed by a zero-fill block. + /// This method may only be called on defined symbols. + bool isSymbolZeroFill() const { return getBlock().isZeroFill(); } + + /// Returns the content in the underlying block covered by this symbol. + /// This method may only be called on defined non-zero-fill symbols. + StringRef getSymbolContent() const { + return getBlock().getContent().substr(Offset, Size); } - /// Return the number of atoms in this section. - DefinedAtomSet::size_type atoms_size() { return DefinedAtoms.size(); } + /// Get the linkage for this Symbol. + Linkage getLinkage() const { return static_cast(L); } - /// Return true if this section contains no atoms. - bool atoms_empty() const { return DefinedAtoms.empty(); } + /// Set the linkage for this Symbol. + void setLinkage(Linkage L) { + assert((L == Linkage::Strong || (Base->isDefined() && !Name.empty())) && + "Linkage can only be applied to defined named symbols"); + this->L = static_cast(L); + } - /// Returns the range of this section as the pair of atoms with the lowest - /// and highest target address. This operation is expensive, as it - /// must traverse all atoms in the section. - /// - /// Note: If the section is empty, both values will be null. The section - /// address will evaluate to null, and the size to zero. If the section - /// contains a single atom both values will point to it, the address will - /// evaluate to the address of that atom, and the size will be the size of - /// that atom. - SectionRange getRange() const; + /// Get the visibility for this Symbol. 
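// [Editor's note] Hedged sketch, not part of the patch: reading a Symbol
// through the accessors above. The Symbol is assumed to come from an existing
// LinkGraph; the output format is illustrative only.
#include "llvm/ExecutionEngine/JITLink/JITLink.h"
#include "llvm/Support/raw_ostream.h"

void describeSymbol(const llvm::jitlink::Symbol &Sym) {
  llvm::StringRef Name = Sym.hasName() ? Sym.getName() : llvm::StringRef("<anonymous>");
  llvm::outs() << Name << " addr=" << Sym.getAddress()
               << " size=" << Sym.getSize();
  if (Sym.isDefined() && !Sym.isSymbolZeroFill())
    llvm::outs() << " content-bytes=" << Sym.getSymbolContent().size();
  llvm::outs() << "\n";
}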
+ Scope getScope() const { return static_cast(S); } -private: - void addAtom(DefinedAtom &DA) { - assert(!DefinedAtoms.count(&DA) && "Atom is already in this section"); - DefinedAtoms.insert(&DA); + /// Set the visibility for this Symbol. + void setScope(Scope S) { + assert((S == Scope::Default || Base->isDefined() || Base->isAbsolute()) && + "Invalid visibility for symbol type"); + this->S = static_cast(S); } - void removeAtom(DefinedAtom &DA) { - assert(DefinedAtoms.count(&DA) && "Atom is not in this section"); - DefinedAtoms.erase(&DA); +private: + void makeExternal(Addressable &A) { + assert(!A.isDefined() && "Attempting to make external with defined block"); + Base = &A; + Offset = 0; + setLinkage(Linkage::Strong); + setScope(Scope::Default); + IsLive = 0; + // note: Size and IsCallable fields left unchanged. } + static constexpr uint64_t MaxOffset = (1ULL << 59) - 1; + + // FIXME: A char* or SymbolStringPtr may pack better. StringRef Name; - uint32_t Alignment = 0; - sys::Memory::ProtectionFlags Prot; - unsigned Ordinal = 0; - unsigned NextAtomOrdinal = 0; - bool IsZeroFill = false; - DefinedAtomSet DefinedAtoms; + Addressable *Base = nullptr; + uint64_t Offset : 59; + uint64_t L : 1; + uint64_t S : 2; + uint64_t IsLive : 1; + uint64_t IsCallable : 1; + JITTargetAddress Size = 0; }; -/// Defined atom class. Suitable for use by defined named and anonymous -/// atoms. -class DefinedAtom : public Atom { - friend class AtomGraph; +raw_ostream &operator<<(raw_ostream &OS, const Symbol &A); + +void printEdge(raw_ostream &OS, const Block &B, const Edge &E, + StringRef EdgeKindName); + +/// Represents an object file section. +class Section { + friend class LinkGraph; private: - DefinedAtom(Section &Parent, JITTargetAddress Address, uint32_t Alignment) - : Atom("", Address), Parent(Parent), Ordinal(Parent.getNextAtomOrdinal()), - Alignment(Alignment) { - assert(isPowerOf2_32(Alignment) && "Alignments must be a power of two"); - } + Section(StringRef Name, sys::Memory::ProtectionFlags Prot, + SectionOrdinal SecOrdinal) + : Name(Name), Prot(Prot), SecOrdinal(SecOrdinal) {} - DefinedAtom(Section &Parent, StringRef Name, JITTargetAddress Address, - uint32_t Alignment) - : Atom(Name, Address), Parent(Parent), - Ordinal(Parent.getNextAtomOrdinal()), Alignment(Alignment) { - assert(isPowerOf2_32(Alignment) && "Alignments must be a power of two"); - } + using SymbolSet = DenseSet; + using BlockSet = DenseSet; public: - using edge_iterator = EdgeVector::iterator; + using symbol_iterator = SymbolSet::iterator; + using const_symbol_iterator = SymbolSet::const_iterator; - Section &getSection() const { return Parent; } + using block_iterator = BlockSet::iterator; + using const_block_iterator = BlockSet::const_iterator; - uint64_t getSize() const { return Size; } + ~Section(); - StringRef getContent() const { - assert(!Parent.isZeroFill() && "Trying to get content for zero-fill atom"); - assert(Size <= std::numeric_limits::max() && - "Content size too large"); - return {ContentPtr, static_cast(Size)}; - } - void setContent(StringRef Content) { - assert(!Parent.isZeroFill() && "Calling setContent on zero-fill atom?"); - ContentPtr = Content.data(); - Size = Content.size(); - } + /// Returns the name of this section. + StringRef getName() const { return Name; } + + /// Returns the protection flags for this section. + sys::Memory::ProtectionFlags getProtectionFlags() const { return Prot; } - bool isZeroFill() const { return Parent.isZeroFill(); } + /// Returns the ordinal for this section. 
+ SectionOrdinal getOrdinal() const { return SecOrdinal; } - void setZeroFill(uint64_t Size) { - assert(Parent.isZeroFill() && !ContentPtr && - "Can't set zero-fill length of a non zero-fill atom"); - this->Size = Size; + /// Returns an iterator over the symbols defined in this section. + iterator_range symbols() { + return make_range(Symbols.begin(), Symbols.end()); } - uint64_t getZeroFillSize() const { - assert(Parent.isZeroFill() && - "Can't get zero-fill length of a non zero-fill atom"); - return Size; + /// Returns an iterator over the symbols defined in this section. + iterator_range symbols() const { + return make_range(Symbols.begin(), Symbols.end()); } - uint32_t getAlignment() const { return Alignment; } + /// Return the number of symbols in this section. + SymbolSet::size_type symbols_size() { return Symbols.size(); } - bool hasLayoutNext() const { return HasLayoutNext; } - void setLayoutNext(DefinedAtom &Next) { - assert(!HasLayoutNext && "Atom already has layout-next constraint"); - HasLayoutNext = true; - Edges.push_back(Edge(Edge::LayoutNext, 0, Next, 0)); - } - DefinedAtom &getLayoutNext() { - assert(HasLayoutNext && "Atom does not have a layout-next constraint"); - DefinedAtom *Next = nullptr; - for (auto &E : edges()) - if (E.getKind() == Edge::LayoutNext) { - assert(E.getTarget().isDefined() && - "layout-next target atom must be a defined atom"); - Next = static_cast(&E.getTarget()); - break; - } - assert(Next && "Missing LayoutNext edge"); - return *Next; - } + /// Return true if this section contains no symbols. + bool symbols_empty() const { return Symbols.empty(); } - bool isCommon() const { return IsCommon; } + /// Returns the ordinal for the next block. + BlockOrdinal getNextBlockOrdinal() { return NextBlockOrdinal++; } - void addEdge(Edge::Kind K, Edge::OffsetT Offset, Atom &Target, - Edge::AddendT Addend) { - assert(K != Edge::LayoutNext && - "Layout edges should be added via setLayoutNext"); - Edges.push_back(Edge(K, Offset, Target, Addend)); +private: + void addSymbol(Symbol &Sym) { + assert(!Symbols.count(&Sym) && "Symbol is already in this section"); + Symbols.insert(&Sym); } - iterator_range edges() { - return make_range(Edges.begin(), Edges.end()); + void removeSymbol(Symbol &Sym) { + assert(Symbols.count(&Sym) && "symbol is not in this section"); + Symbols.erase(&Sym); } - size_t edges_size() const { return Edges.size(); } - bool edges_empty() const { return Edges.empty(); } - unsigned getOrdinal() const { return Ordinal; } + StringRef Name; + sys::Memory::ProtectionFlags Prot; + SectionOrdinal SecOrdinal = 0; + BlockOrdinal NextBlockOrdinal = 0; + SymbolSet Symbols; +}; -private: - void setCommon(uint64_t Size) { - assert(ContentPtr == 0 && "Atom already has content?"); - IsCommon = true; - setZeroFill(Size); +/// Represents a section address range via a pair of Block pointers +/// to the first and last Blocks in the section. 
+class SectionRange { +public: + SectionRange() = default; + SectionRange(const Section &Sec) { + if (Sec.symbols_empty()) + return; + First = Last = *Sec.symbols().begin(); + for (auto *Sym : Sec.symbols()) { + if (Sym->getAddress() < First->getAddress()) + First = Sym; + if (Sym->getAddress() > Last->getAddress()) + Last = Sym; + } + } + Symbol *getFirstSymbol() const { + assert((!Last || First) && "First can not be null if end is non-null"); + return First; + } + Symbol *getLastSymbol() const { + assert((First || !Last) && "Last can not be null if start is non-null"); + return Last; + } + bool isEmpty() const { + assert((First || !Last) && "Last can not be null if start is non-null"); + return !First; + } + JITTargetAddress getStart() const { + return First ? First->getBlock().getAddress() : 0; + } + JITTargetAddress getEnd() const { + return Last ? Last->getBlock().getAddress() + Last->getBlock().getSize() + : 0; } + uint64_t getSize() const { return getEnd() - getStart(); } - EdgeVector Edges; - uint64_t Size = 0; - Section &Parent; - const char *ContentPtr = nullptr; - unsigned Ordinal = 0; - uint32_t Alignment = 0; +private: + Symbol *First = nullptr; + Symbol *Last = nullptr; }; -inline JITTargetAddress SectionRange::getStart() const { - return First ? First->getAddress() : 0; -} +class LinkGraph { +private: + using SectionList = std::vector>; + using ExternalSymbolSet = DenseSet; + using BlockSet = DenseSet; + + template + Addressable &createAddressable(ArgTs &&... Args) { + Addressable *A = + reinterpret_cast(Allocator.Allocate()); + new (A) Addressable(std::forward(Args)...); + return *A; + } -inline JITTargetAddress SectionRange::getEnd() const { - return Last ? Last->getAddress() + Last->getSize() : 0; -} + void destroyAddressable(Addressable &A) { + A.~Addressable(); + Allocator.Deallocate(&A); + } -inline uint64_t SectionRange::getSize() const { return getEnd() - getStart(); } + template Block &createBlock(ArgTs &&... Args) { + Block *B = reinterpret_cast(Allocator.Allocate()); + new (B) Block(std::forward(Args)...); + Blocks.insert(B); + return *B; + } -inline SectionRange Section::getRange() const { - if (atoms_empty()) - return SectionRange(); - DefinedAtom *First = *DefinedAtoms.begin(), *Last = *DefinedAtoms.begin(); - for (auto *DA : atoms()) { - if (DA->getAddress() < First->getAddress()) - First = DA; - if (DA->getAddress() > Last->getAddress()) - Last = DA; + void destroyBlock(Block &B) { + Blocks.erase(&B); + B.~Block(); + Allocator.Deallocate(&B); } - return SectionRange(First, Last); -} -class AtomGraph { -private: - using SectionList = std::vector>; - using AddressToAtomMap = std::map; - using NamedAtomMap = DenseMap; - using ExternalAtomSet = DenseSet; + void destroySymbol(Symbol &S) { + S.~Symbol(); + Allocator.Deallocate(&S); + } public: - using external_atom_iterator = ExternalAtomSet::iterator; + using external_symbol_iterator = ExternalSymbolSet::iterator; + + using block_iterator = BlockSet::iterator; using section_iterator = pointee_iterator; using const_section_iterator = pointee_iterator; - template - class defined_atom_iterator_impl + template + class defined_symbol_iterator_impl : public iterator_facade_base< - defined_atom_iterator_impl, + defined_symbol_iterator_impl, std::forward_iterator_tag, T> { public: - defined_atom_iterator_impl() = default; + defined_symbol_iterator_impl() = default; - defined_atom_iterator_impl(SecItrT SI, SecItrT SE) - : SI(SI), SE(SE), - AI(SI != SE ? 
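// [Editor's note] Hedged sketch, not part of the patch: computing the address
// range covered by a Section via the SectionRange helper above. The Section
// is assumed to come from an existing LinkGraph.
#include <cstdint>
#include "llvm/ExecutionEngine/JITLink/JITLink.h"

uint64_t sectionSizeInBytes(const llvm::jitlink::Section &Sec) {
  llvm::jitlink::SectionRange Range(Sec);
  if (Range.isEmpty())
    return 0;
  // getStart()/getEnd() are derived from the first and last symbols' blocks.
  return Range.getSize();
}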
SI->atoms().begin() : Section::atom_iterator()) { - moveToNextAtomOrEnd(); + defined_symbol_iterator_impl(SectionItrT SecI, SectionItrT SecE) + : SecI(SecI), SecE(SecE), + SymI(SecI != SecE ? SecI->symbols().begin() : SymbolItrT()) { + moveToNextSymbolOrEnd(); } - bool operator==(const defined_atom_iterator_impl &RHS) const { - return (SI == RHS.SI) && (AI == RHS.AI); + bool operator==(const defined_symbol_iterator_impl &RHS) const { + return (SecI == RHS.SecI) && (SymI == RHS.SymI); } T operator*() const { - assert(AI != SI->atoms().end() && "Dereferencing end?"); - return *AI; + assert(SymI != SecI->symbols().end() && "Dereferencing end?"); + return *SymI; } - defined_atom_iterator_impl operator++() { - ++AI; - moveToNextAtomOrEnd(); + defined_symbol_iterator_impl operator++() { + ++SymI; + moveToNextSymbolOrEnd(); return *this; } private: - void moveToNextAtomOrEnd() { - while (SI != SE && AI == SI->atoms().end()) { - ++SI; - if (SI == SE) - AI = Section::atom_iterator(); - else - AI = SI->atoms().begin(); + void moveToNextSymbolOrEnd() { + while (SecI != SecE && SymI == SecI->symbols().end()) { + ++SecI; + SymI = SecI == SecE ? SymbolItrT() : SecI->symbols().begin(); } } - SecItrT SI, SE; - AtomItrT AI; + SectionItrT SecI, SecE; + SymbolItrT SymI; }; - using defined_atom_iterator = - defined_atom_iterator_impl; + using defined_symbol_iterator = + defined_symbol_iterator_impl; - using const_defined_atom_iterator = - defined_atom_iterator_impl; + using const_defined_symbol_iterator = defined_symbol_iterator_impl< + const_section_iterator, Section::const_symbol_iterator, const Symbol *>; - AtomGraph(std::string Name, unsigned PointerSize, + LinkGraph(std::string Name, unsigned PointerSize, support::endianness Endianness) : Name(std::move(Name)), PointerSize(PointerSize), Endianness(Endianness) {} + ~LinkGraph(); + /// Returns the name of this graph (usually the name of the original /// underlying MemoryBuffer). const std::string &getName() { return Name; } @@ -544,84 +718,83 @@ public: /// Returns the pointer size for use in this graph. unsigned getPointerSize() const { return PointerSize; } - /// Returns the endianness of atom-content in this graph. + /// Returns the endianness of content in this graph. support::endianness getEndianness() const { return Endianness; } /// Create a section with the given name, protection flags, and alignment. - Section &createSection(StringRef Name, uint32_t Alignment, - sys::Memory::ProtectionFlags Prot, bool IsZeroFill) { - std::unique_ptr
Sec( - new Section(Name, Alignment, Prot, Sections.size(), IsZeroFill)); + Section &createSection(StringRef Name, sys::Memory::ProtectionFlags Prot) { + std::unique_ptr
Sec(new Section(Name, Prot, Sections.size())); Sections.push_back(std::move(Sec)); return *Sections.back(); } - /// Add an external atom representing an undefined symbol in this graph. - Atom &addExternalAtom(StringRef Name) { - assert(!NamedAtoms.count(Name) && "Duplicate named atom inserted"); - Atom *A = reinterpret_cast( - AtomAllocator.Allocate(sizeof(Atom), alignof(Atom))); - new (A) Atom(Name); - ExternalAtoms.insert(A); - NamedAtoms[Name] = A; - return *A; + /// Create a content block. + Block &createContentBlock(Section &Parent, StringRef Content, + uint64_t Address, uint64_t Alignment, + uint64_t AlignmentOffset) { + return createBlock(Parent, Parent.getNextBlockOrdinal(), Content, Address, + Alignment, AlignmentOffset); } - /// Add an external atom representing an absolute symbol. - Atom &addAbsoluteAtom(StringRef Name, JITTargetAddress Addr) { - assert(!NamedAtoms.count(Name) && "Duplicate named atom inserted"); - Atom *A = reinterpret_cast( - AtomAllocator.Allocate(sizeof(Atom), alignof(Atom))); - new (A) Atom(Name, Addr); - AbsoluteAtoms.insert(A); - NamedAtoms[Name] = A; - return *A; + /// Create a zero-fill block. + Block &createZeroFillBlock(Section &Parent, uint64_t Size, uint64_t Address, + uint64_t Alignment, uint64_t AlignmentOffset) { + return createBlock(Parent, Parent.getNextBlockOrdinal(), Size, Address, + Alignment, AlignmentOffset); } - /// Add an anonymous defined atom to the graph. - /// - /// Anonymous atoms have content but no name. They must have an address. - DefinedAtom &addAnonymousAtom(Section &Parent, JITTargetAddress Address, - uint32_t Alignment) { - DefinedAtom *A = reinterpret_cast( - AtomAllocator.Allocate(sizeof(DefinedAtom), alignof(DefinedAtom))); - new (A) DefinedAtom(Parent, Address, Alignment); - Parent.addAtom(*A); - getAddrToAtomMap()[A->getAddress()] = A; - return *A; + /// Add an external symbol. + /// Some formats (e.g. ELF) allow Symbols to have sizes. For Symbols whose + /// size is not known, you should substitute '0'. + Symbol &addExternalSymbol(StringRef Name, uint64_t Size) { + auto &Sym = Symbol::constructExternal( + Allocator.Allocate(), createAddressable(0, false), Name, Size); + ExternalSymbols.insert(&Sym); + return Sym; } - /// Add a defined atom to the graph. - /// - /// Allocates and constructs a DefinedAtom instance with the given parent, - /// name, address, and alignment. - DefinedAtom &addDefinedAtom(Section &Parent, StringRef Name, - JITTargetAddress Address, uint32_t Alignment) { - assert(!NamedAtoms.count(Name) && "Duplicate named atom inserted"); - DefinedAtom *A = reinterpret_cast( - AtomAllocator.Allocate(sizeof(DefinedAtom), alignof(DefinedAtom))); - new (A) DefinedAtom(Parent, Name, Address, Alignment); - Parent.addAtom(*A); - getAddrToAtomMap()[A->getAddress()] = A; - NamedAtoms[Name] = A; - return *A; + /// Add an absolute symbol. + Symbol &addAbsoluteSymbol(StringRef Name, JITTargetAddress Address, + uint64_t Size, Linkage L, Scope S, bool IsLive) { + auto &Sym = Symbol::constructAbsolute(Allocator.Allocate(), + createAddressable(Address), Name, + Size, L, S, IsLive); + AbsoluteSymbols.insert(&Sym); + return Sym; } - /// Add a common symbol atom to the graph. - /// - /// Adds a common-symbol atom to the graph with the given parent, name, - /// address, alignment and size. 
- DefinedAtom &addCommonAtom(Section &Parent, StringRef Name, - JITTargetAddress Address, uint32_t Alignment, - uint64_t Size) { - assert(!NamedAtoms.count(Name) && "Duplicate named atom inserted"); - DefinedAtom *A = reinterpret_cast( - AtomAllocator.Allocate(sizeof(DefinedAtom), alignof(DefinedAtom))); - new (A) DefinedAtom(Parent, Name, Address, Alignment); - A->setCommon(Size); - Parent.addAtom(*A); - NamedAtoms[Name] = A; - return *A; + /// Convenience method for adding a weak zero-fill symbol. + Symbol &addCommonSymbol(StringRef Name, Scope S, Section &Section, + JITTargetAddress Address, uint64_t Size, + uint64_t Alignment, bool IsLive) { + auto &Sym = Symbol::constructCommon( + Allocator.Allocate(), + createBlock(Section, Section.getNextBlockOrdinal(), Address, Size, + Alignment, 0), + Name, Size, S, IsLive); + Section.addSymbol(Sym); + return Sym; + } + + /// Add an anonymous symbol. + Symbol &addAnonymousSymbol(Block &Content, JITTargetAddress Offset, + JITTargetAddress Size, bool IsCallable, + bool IsLive) { + auto &Sym = Symbol::constructAnonDef(Allocator.Allocate(), Content, + Offset, Size, IsCallable, IsLive); + Content.getSection().addSymbol(Sym); + return Sym; + } + + /// Add a named symbol. + Symbol &addDefinedSymbol(Block &Content, JITTargetAddress Offset, + StringRef Name, JITTargetAddress Size, Linkage L, + Scope S, bool IsCallable, bool IsLive) { + auto &Sym = + Symbol::constructNamedDef(Allocator.Allocate(), Content, Offset, + Name, Size, L, S, IsLive, IsCallable); + Content.getSection().addSymbol(Sym); + return Sym; } iterator_range sections() { @@ -638,135 +811,79 @@ public: return nullptr; } - iterator_range external_atoms() { - return make_range(ExternalAtoms.begin(), ExternalAtoms.end()); + iterator_range external_symbols() { + return make_range(ExternalSymbols.begin(), ExternalSymbols.end()); } - iterator_range absolute_atoms() { - return make_range(AbsoluteAtoms.begin(), AbsoluteAtoms.end()); + iterator_range absolute_symbols() { + return make_range(AbsoluteSymbols.begin(), AbsoluteSymbols.end()); } - iterator_range defined_atoms() { - return make_range(defined_atom_iterator(Sections.begin(), Sections.end()), - defined_atom_iterator(Sections.end(), Sections.end())); + iterator_range defined_symbols() { + return make_range(defined_symbol_iterator(Sections.begin(), Sections.end()), + defined_symbol_iterator(Sections.end(), Sections.end())); } - iterator_range defined_atoms() const { + iterator_range defined_symbols() const { return make_range( - const_defined_atom_iterator(Sections.begin(), Sections.end()), - const_defined_atom_iterator(Sections.end(), Sections.end())); - } - - /// Returns the atom with the given name, which must exist in this graph. - Atom &getAtomByName(StringRef Name) { - auto I = NamedAtoms.find(Name); - assert(I != NamedAtoms.end() && "Name not in NamedAtoms map"); - return *I->second; - } - - /// Returns the atom with the given name, which must exist in this graph and - /// be a DefinedAtom. - DefinedAtom &getDefinedAtomByName(StringRef Name) { - auto &A = getAtomByName(Name); - assert(A.isDefined() && "Atom is not a defined atom"); - return static_cast(A); - } - - /// Search for the given atom by name. - /// Returns the atom (if found) or an error (if no atom with this name - /// exists). - Expected findAtomByName(StringRef Name) { - auto I = NamedAtoms.find(Name); - if (I == NamedAtoms.end()) - return make_error("No atom named " + Name); - return *I->second; - } - - /// Search for the given defined atom by name. 
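// [Editor's note] Hedged sketch, not part of the patch: building a tiny
// LinkGraph by hand with the methods above. The section name, content bytes,
// addresses, alignments and the use of the generic FirstRelocation edge kind
// are illustrative assumptions; real graphs are produced by the format
// specific linkers and use architecture specific edge kinds.
#include "llvm/ExecutionEngine/JITLink/JITLink.h"
#include "llvm/Support/Endian.h"
#include "llvm/Support/Memory.h"

void buildToyGraph() {
  using namespace llvm;
  using namespace llvm::jitlink;

  LinkGraph G("toy.o", /*PointerSize=*/8, support::little);

  auto Prot = static_cast<sys::Memory::ProtectionFlags>(sys::Memory::MF_READ |
                                                        sys::Memory::MF_EXEC);
  Section &Text = G.createSection("__text", Prot);

  // Content blocks only reference their bytes; the caller keeps them alive.
  static const char Code[] = "\xc3";
  Block &B = G.createContentBlock(Text, StringRef(Code, sizeof(Code) - 1),
                                  /*Address=*/0, /*Alignment=*/16,
                                  /*AlignmentOffset=*/0);

  Symbol &Main = G.addDefinedSymbol(B, /*Offset=*/0, "main", B.getSize(),
                                    Linkage::Strong, Scope::Default,
                                    /*IsCallable=*/true, /*IsLive=*/true);
  Symbol &Ext = G.addExternalSymbol("puts", /*Size=*/0);

  // Edges now target Symbols rather than atoms.
  B.addEdge(Edge::FirstRelocation, /*Offset=*/0, Ext, /*Addend=*/0);
  (void)Main;
}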
- /// Returns the defined atom (if found) or an error (if no atom with this - /// name exists, or if one exists but is not a defined atom). - Expected findDefinedAtomByName(StringRef Name) { - auto I = NamedAtoms.find(Name); - if (I == NamedAtoms.end()) - return make_error("No atom named " + Name); - if (!I->second->isDefined()) - return make_error("Atom " + Name + - " exists but is not a " - "defined atom"); - return static_cast(*I->second); - } - - /// Returns the atom covering the given address, or an error if no such atom - /// exists. - /// - /// Returns null if no atom exists at the given address. - DefinedAtom *getAtomByAddress(JITTargetAddress Address) { - refreshAddrToAtomCache(); - - // If there are no defined atoms, bail out early. - if (AddrToAtomCache->empty()) - return nullptr; - - // Find the atom *after* the given address. - auto I = AddrToAtomCache->upper_bound(Address); - - // If this address falls before any known atom, bail out. - if (I == AddrToAtomCache->begin()) - return nullptr; - - // The atom we're looking for is the one before the atom we found. - --I; - - // Otherwise range check the atom that was found. - assert(!I->second->getContent().empty() && "Atom content not set"); - if (Address >= I->second->getAddress() + I->second->getContent().size()) - return nullptr; + const_defined_symbol_iterator(Sections.begin(), Sections.end()), + const_defined_symbol_iterator(Sections.end(), Sections.end())); + } - return I->second; + iterator_range blocks() { + return make_range(Blocks.begin(), Blocks.end()); } - /// Like getAtomByAddress, but returns an Error if the given address is not - /// covered by an atom, rather than a null pointer. - Expected findAtomByAddress(JITTargetAddress Address) { - if (auto *DA = getAtomByAddress(Address)) - return *DA; - return make_error("No atom at address " + - formatv("{0:x16}", Address)); + /// Turn a defined symbol into an external one. + void makeExternal(Symbol &Sym) { + if (Sym.getAddressable().isAbsolute()) { + assert(AbsoluteSymbols.count(&Sym) && + "Sym is not in the absolute symbols set"); + AbsoluteSymbols.erase(&Sym); + } else { + assert(Sym.isDefined() && "Sym is not a defined symbol"); + Section &Sec = Sym.getBlock().getSection(); + Sec.removeSymbol(Sym); + } + Sym.makeExternal(createAddressable(false)); + ExternalSymbols.insert(&Sym); } - // Remove the given external atom from the graph. - void removeExternalAtom(Atom &A) { - assert(!A.isDefined() && !A.isAbsolute() && "A is not an external atom"); - assert(ExternalAtoms.count(&A) && "A is not in the external atoms set"); - ExternalAtoms.erase(&A); - A.~Atom(); + /// Removes an external symbol. Also removes the underlying Addressable. + void removeExternalSymbol(Symbol &Sym) { + assert(!Sym.isDefined() && !Sym.isAbsolute() && + "Sym is not an external symbol"); + assert(ExternalSymbols.count(&Sym) && "Symbol is not in the externals set"); + ExternalSymbols.erase(&Sym); + Addressable &Base = *Sym.Base; + destroySymbol(Sym); + destroyAddressable(Base); } - /// Remove the given absolute atom from the graph. - void removeAbsoluteAtom(Atom &A) { - assert(A.isAbsolute() && "A is not an absolute atom"); - assert(AbsoluteAtoms.count(&A) && "A is not in the absolute atoms set"); - AbsoluteAtoms.erase(&A); - A.~Atom(); + /// Remove an absolute symbol. Also removes the underlying Addressable. 
+ void removeAbsoluteSymbol(Symbol &Sym) { + assert(!Sym.isDefined() && Sym.isAbsolute() && + "Sym is not an absolute symbol"); + assert(AbsoluteSymbols.count(&Sym) && + "Symbol is not in the absolute symbols set"); + AbsoluteSymbols.erase(&Sym); + Addressable &Base = *Sym.Base; + destroySymbol(Sym); + destroyAddressable(Base); } - /// Remove the given defined atom from the graph. - void removeDefinedAtom(DefinedAtom &DA) { - if (AddrToAtomCache) { - assert(AddrToAtomCache->count(DA.getAddress()) && - "Cache exists, but does not contain atom"); - AddrToAtomCache->erase(DA.getAddress()); - } - if (DA.hasName()) { - assert(NamedAtoms.count(DA.getName()) && "Named atom not in map"); - NamedAtoms.erase(DA.getName()); - } - DA.getSection().removeAtom(DA); - DA.~DefinedAtom(); + /// Removes defined symbols. Does not remove the underlying block. + void removeDefinedSymbol(Symbol &Sym) { + assert(Sym.isDefined() && "Sym is not a defined symbol"); + Sym.getBlock().getSection().removeSymbol(Sym); + destroySymbol(Sym); } - /// Invalidate the atom-to-address map. - void invalidateAddrToAtomMap() { AddrToAtomCache = None; } + /// Remove a block. + void removeBlock(Block &B) { + Blocks.erase(&B); + destroyBlock(B); + } /// Dump the graph. /// @@ -778,87 +895,84 @@ public: std::function()); private: - AddressToAtomMap &getAddrToAtomMap() { - refreshAddrToAtomCache(); - return *AddrToAtomCache; - } - - const AddressToAtomMap &getAddrToAtomMap() const { - refreshAddrToAtomCache(); - return *AddrToAtomCache; - } - - void refreshAddrToAtomCache() const { - if (!AddrToAtomCache) { - AddrToAtomCache = AddressToAtomMap(); - for (auto *DA : defined_atoms()) - (*AddrToAtomCache)[DA->getAddress()] = const_cast(DA); - } - } - - // Put the BumpPtrAllocator first so that we don't free any of the atoms in - // it until all of their destructors have been run. - BumpPtrAllocator AtomAllocator; + // Put the BumpPtrAllocator first so that we don't free any of the underlying + // memory until the Symbol/Addressable destructors have been run. + BumpPtrAllocator Allocator; std::string Name; unsigned PointerSize; support::endianness Endianness; + BlockSet Blocks; SectionList Sections; - NamedAtomMap NamedAtoms; - ExternalAtomSet ExternalAtoms; - ExternalAtomSet AbsoluteAtoms; - mutable Optional AddrToAtomCache; + ExternalSymbolSet ExternalSymbols; + ExternalSymbolSet AbsoluteSymbols; }; -/// A function for mutating AtomGraphs. -using AtomGraphPassFunction = std::function; +/// A function for mutating LinkGraphs. +using LinkGraphPassFunction = std::function; -/// A list of atom graph passes. -using AtomGraphPassList = std::vector; +/// A list of LinkGraph passes. +using LinkGraphPassList = std::vector; -/// An atom graph pass configuration, consisting of a list of pre-prune, +/// An LinkGraph pass configuration, consisting of a list of pre-prune, /// post-prune, and post-fixup passes. struct PassConfiguration { /// Pre-prune passes. /// /// These passes are called on the graph after it is built, and before any - /// atoms have been pruned. + /// symbols have been pruned. /// - /// Notable use cases: Marking atoms live or should-discard. - AtomGraphPassList PrePrunePasses; + /// Notable use cases: Marking symbols live or should-discard. + LinkGraphPassList PrePrunePasses; /// Post-prune passes. /// - /// These passes are called on the graph after dead and should-discard atoms - /// have been removed, but before fixups are applied. 
+ /// These passes are called on the graph after dead stripping, but before + /// fixups are applied. /// - /// Notable use cases: Building GOT, stub, and TLV atoms. - AtomGraphPassList PostPrunePasses; + /// Notable use cases: Building GOT, stub, and TLV symbols. + LinkGraphPassList PostPrunePasses; /// Post-fixup passes. /// - /// These passes are called on the graph after atom contents has been copied + /// These passes are called on the graph after block contents has been copied /// to working memory, and fixups applied. /// /// Notable use cases: Testing and validation. - AtomGraphPassList PostFixupPasses; + LinkGraphPassList PostFixupPasses; }; /// A map of symbol names to resolved addresses. using AsyncLookupResult = DenseMap; -/// A function to call with a resolved symbol map (See AsyncLookupResult) or an -/// error if resolution failed. -using JITLinkAsyncLookupContinuation = - std::function LR)>; +/// A function object to call with a resolved symbol map (See AsyncLookupResult) +/// or an error if resolution failed. +class JITLinkAsyncLookupContinuation { +public: + virtual ~JITLinkAsyncLookupContinuation() {} + virtual void run(Expected LR) = 0; + +private: + virtual void anchor(); +}; + +/// Create a lookup continuation from a function object. +template +std::unique_ptr +createLookupContinuation(Continuation Cont) { -/// An asynchronous symbol lookup. Performs a search (possibly asynchronously) -/// for the given symbols, calling the given continuation with either the result -/// (if the lookup succeeds), or an error (if the lookup fails). -using JITLinkAsyncLookupFunction = - std::function &Symbols, - JITLinkAsyncLookupContinuation LookupContinuation)>; + class Impl final : public JITLinkAsyncLookupContinuation { + public: + Impl(Continuation C) : C(std::move(C)) {} + void run(Expected LR) override { C(std::move(LR)); } + + private: + Continuation C; + }; + + return std::make_unique(std::move(Cont)); +} /// Holds context for a single jitLink invocation. class JITLinkContext { @@ -881,13 +995,13 @@ public: /// lookup continutation which it must call with a result to continue the /// linking process. virtual void lookup(const DenseSet &Symbols, - JITLinkAsyncLookupContinuation LookupContinuation) = 0; + std::unique_ptr LC) = 0; - /// Called by JITLink once all defined atoms in the graph have been assigned - /// their final memory locations in the target process. At this point he - /// atom graph can be, inspected to build a symbol table however the atom + /// Called by JITLink once all defined symbols in the graph have been assigned + /// their final memory locations in the target process. At this point the + /// LinkGraph can be inspected to build a symbol table, however the block /// content will not generally have been copied to the target location yet. - virtual void notifyResolved(AtomGraph &G) = 0; + virtual void notifyResolved(LinkGraph &G) = 0; /// Called by JITLink to notify the context that the object has been /// finalized (i.e. emitted to memory and memory permissions set). If all of @@ -904,20 +1018,20 @@ public: /// Returns the mark-live pass to be used for this link. If no pass is /// returned (the default) then the target-specific linker implementation will - /// choose a conservative default (usually marking all atoms live). + /// choose a conservative default (usually marking all symbols live). 
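// [Editor's note] Hedged sketch, not part of the patch: wrapping a lambda as
// the new JITLinkAsyncLookupContinuation interface via createLookupContinuation.
// The logging body is illustrative; a real JITLinkContext implementation would
// hand the continuation to whatever performs its symbol resolution.
#include <memory>
#include "llvm/ExecutionEngine/JITLink/JITLink.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/raw_ostream.h"

std::unique_ptr<llvm::jitlink::JITLinkAsyncLookupContinuation>
makeLoggingContinuation() {
  using namespace llvm;
  return jitlink::createLookupContinuation(
      [](Expected<jitlink::AsyncLookupResult> LR) {
        if (!LR) {
          logAllUnhandledErrors(LR.takeError(), errs(), "lookup failed: ");
          return;
        }
        errs() << "lookup resolved " << LR->size() << " symbols\n";
      });
}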
/// This function is only called if shouldAddDefaultTargetPasses returns true, /// otherwise the JITContext is responsible for adding a mark-live pass in /// modifyPassConfig. - virtual AtomGraphPassFunction getMarkLivePass(const Triple &TT) const; + virtual LinkGraphPassFunction getMarkLivePass(const Triple &TT) const; /// Called by JITLink to modify the pass pipeline prior to linking. /// The default version performs no modification. virtual Error modifyPassConfig(const Triple &TT, PassConfiguration &Config); }; -/// Marks all atoms in a graph live. This can be used as a default, conservative -/// mark-live implementation. -Error markAllAtomsLive(AtomGraph &G); +/// Marks all symbols in a graph live. This can be used as a default, +/// conservative mark-live implementation. +Error markAllSymbolsLive(LinkGraph &G); /// Basic JITLink implementation. /// diff --git a/include/llvm/ExecutionEngine/JITLink/JITLinkMemoryManager.h b/include/llvm/ExecutionEngine/JITLink/JITLinkMemoryManager.h index 9d0b37fe4a4d..ac5a593bb77b 100644 --- a/include/llvm/ExecutionEngine/JITLink/JITLinkMemoryManager.h +++ b/include/llvm/ExecutionEngine/JITLink/JITLinkMemoryManager.h @@ -33,20 +33,19 @@ public: class SegmentRequest { public: SegmentRequest() = default; - SegmentRequest(size_t ContentSize, unsigned ContentAlign, - uint64_t ZeroFillSize, unsigned ZeroFillAlign) - : ContentSize(ContentSize), ZeroFillSize(ZeroFillSize), - ContentAlign(ContentAlign), ZeroFillAlign(ZeroFillAlign) {} + SegmentRequest(uint64_t Alignment, size_t ContentSize, + uint64_t ZeroFillSize) + : Alignment(Alignment), ContentSize(ContentSize), + ZeroFillSize(ZeroFillSize) { + assert(isPowerOf2_32(Alignment) && "Alignment must be power of 2"); + } + uint64_t getAlignment() const { return Alignment; } size_t getContentSize() const { return ContentSize; } - unsigned getContentAlignment() const { return ContentAlign; } uint64_t getZeroFillSize() const { return ZeroFillSize; } - unsigned getZeroFillAlignment() const { return ZeroFillAlign; } - private: + uint64_t Alignment = 0; size_t ContentSize = 0; uint64_t ZeroFillSize = 0; - unsigned ContentAlign = 0; - unsigned ZeroFillAlign = 0; }; using SegmentsRequestMap = DenseMap; diff --git a/include/llvm/ExecutionEngine/JITLink/MachO_arm64.h b/include/llvm/ExecutionEngine/JITLink/MachO_arm64.h new file mode 100644 index 000000000000..d70b545fff86 --- /dev/null +++ b/include/llvm/ExecutionEngine/JITLink/MachO_arm64.h @@ -0,0 +1,60 @@ +//===---- MachO_arm64.h - JIT link functions for MachO/arm64 ----*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// jit-link functions for MachO/arm64. 
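// [Editor's note] Hedged sketch, not part of the patch: the SegmentRequest
// constructor now takes (Alignment, ContentSize, ZeroFillSize), with one
// alignment shared by the content and zero-fill portions. The nested-name
// spelling below assumes SegmentRequest remains a member of
// JITLinkMemoryManager, as in this header.
#include "llvm/ExecutionEngine/JITLink/JITLinkMemoryManager.h"

llvm::jitlink::JITLinkMemoryManager::SegmentRequest makeTextSegmentRequest() {
  // A 4 KiB-aligned segment: 4096 bytes of content plus 128 bytes zero-fill.
  return llvm::jitlink::JITLinkMemoryManager::SegmentRequest(
      /*Alignment=*/4096, /*ContentSize=*/4096, /*ZeroFillSize=*/128);
}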
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_EXECUTIONENGINE_JITLINK_MACHO_ARM64_H +#define LLVM_EXECUTIONENGINE_JITLINK_MACHO_ARM64_H + +#include "llvm/ExecutionEngine/JITLink/JITLink.h" + +namespace llvm { +namespace jitlink { + +namespace MachO_arm64_Edges { + +enum MachOARM64RelocationKind : Edge::Kind { + Branch26 = Edge::FirstRelocation, + Pointer32, + Pointer64, + Pointer64Anon, + Page21, + PageOffset12, + GOTPage21, + GOTPageOffset12, + PointerToGOT, + PairedAddend, + LDRLiteral19, + Delta32, + Delta64, + NegDelta32, + NegDelta64, +}; + +} // namespace MachO_arm64_Edges + +/// jit-link the given object buffer, which must be a MachO arm64 object file. +/// +/// If PrePrunePasses is empty then a default mark-live pass will be inserted +/// that will mark all exported atoms live. If PrePrunePasses is not empty, the +/// caller is responsible for including a pass to mark atoms as live. +/// +/// If PostPrunePasses is empty then a default GOT-and-stubs insertion pass will +/// be inserted. If PostPrunePasses is not empty then the caller is responsible +/// for including a pass to insert GOT and stub edges. +void jitLink_MachO_arm64(std::unique_ptr Ctx); + +/// Return the string name of the given MachO arm64 edge kind. +StringRef getMachOARM64RelocationKindName(Edge::Kind R); + +} // end namespace jitlink +} // end namespace llvm + +#endif // LLVM_EXECUTIONENGINE_JITLINK_MACHO_ARM64_H diff --git a/include/llvm/ExecutionEngine/JITLink/MachO_x86_64.h b/include/llvm/ExecutionEngine/JITLink/MachO_x86_64.h index 1d5b586afc32..00a7feb86e83 100644 --- a/include/llvm/ExecutionEngine/JITLink/MachO_x86_64.h +++ b/include/llvm/ExecutionEngine/JITLink/MachO_x86_64.h @@ -22,6 +22,7 @@ namespace MachO_x86_64_Edges { enum MachOX86RelocationKind : Edge::Kind { Branch32 = Edge::FirstRelocation, + Pointer32, Pointer64, Pointer64Anon, PCRel32, diff --git a/include/llvm/ExecutionEngine/JITSymbol.h b/include/llvm/ExecutionEngine/JITSymbol.h index b14154c5b5e8..c0f1ca4b9876 100644 --- a/include/llvm/ExecutionEngine/JITSymbol.h +++ b/include/llvm/ExecutionEngine/JITSymbol.h @@ -23,6 +23,7 @@ #include #include "llvm/ADT/BitmaskEnum.h" +#include "llvm/ADT/FunctionExtras.h" #include "llvm/ADT/StringRef.h" #include "llvm/Support/Error.h" @@ -217,7 +218,7 @@ private: /// Represents a symbol in the JIT. class JITSymbol { public: - using GetAddressFtor = std::function()>; + using GetAddressFtor = unique_function()>; /// Create a 'null' symbol, used to represent a "symbol not found" /// result from a successful (non-erroneous) lookup. @@ -325,7 +326,7 @@ class JITSymbolResolver { public: using LookupSet = std::set; using LookupResult = std::map; - using OnResolvedFunction = std::function)>; + using OnResolvedFunction = unique_function)>; virtual ~JITSymbolResolver() = default; diff --git a/include/llvm/ExecutionEngine/Orc/CompileOnDemandLayer.h b/include/llvm/ExecutionEngine/Orc/CompileOnDemandLayer.h index 5f593a27cad6..7946b5b7b209 100644 --- a/include/llvm/ExecutionEngine/Orc/CompileOnDemandLayer.h +++ b/include/llvm/ExecutionEngine/Orc/CompileOnDemandLayer.h @@ -26,6 +26,7 @@ #include "llvm/ExecutionEngine/Orc/LazyReexports.h" #include "llvm/ExecutionEngine/Orc/Legacy.h" #include "llvm/ExecutionEngine/Orc/OrcError.h" +#include "llvm/ExecutionEngine/Orc/Speculation.h" #include "llvm/ExecutionEngine/RuntimeDyld.h" #include "llvm/IR/Attributes.h" #include "llvm/IR/Constant.h" @@ -91,6 +92,8 @@ public: /// Sets the partition function. 
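// [Editor's note] Hedged sketch, not part of the patch: mapping one of the
// MachO/arm64 edge kinds declared above to its printable name, e.g. when
// dumping a LinkGraph or reporting an unsupported relocation.
#include "llvm/ExecutionEngine/JITLink/MachO_arm64.h"
#include "llvm/Support/raw_ostream.h"

void printBranch26KindName() {
  llvm::StringRef Name = llvm::jitlink::getMachOARM64RelocationKindName(
      llvm::jitlink::MachO_arm64_Edges::Branch26);
  llvm::outs() << Name << "\n";
}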
void setPartitionFunction(PartitionFunction Partition); + /// Sets the ImplSymbolMap + void setImplMap(ImplSymbolMap *Imp); /// Emits the given module. This should not be called by clients: it will be /// called by the JIT when a definition added via the add method is requested. void emit(MaterializationResponsibility R, ThreadSafeModule TSM) override; @@ -128,6 +131,7 @@ private: PerDylibResourcesMap DylibResources; PartitionFunction Partition = compileRequested; SymbolLinkagePromoter PromoteSymbols; + ImplSymbolMap *AliaseeImpls = nullptr; }; /// Compile-on-demand layer. @@ -187,7 +191,7 @@ private: std::unique_ptr> wrapOwnership(ResourcePtrT ResourcePtr) { using RO = ResourceOwnerImpl; - return llvm::make_unique(std::move(ResourcePtr)); + return std::make_unique(std::move(ResourcePtr)); } struct LogicalDylib { @@ -440,7 +444,7 @@ private: return Error::success(); // Create the GlobalValues module. - auto GVsM = llvm::make_unique((SrcM.getName() + ".globals").str(), + auto GVsM = std::make_unique((SrcM.getName() + ".globals").str(), SrcM.getContext()); GVsM->setDataLayout(DL); @@ -633,7 +637,7 @@ private: NewName += F->getName(); } - auto M = llvm::make_unique(NewName, SrcM.getContext()); + auto M = std::make_unique(NewName, SrcM.getContext()); M->setDataLayout(SrcM.getDataLayout()); ValueToValueMapTy VMap; diff --git a/include/llvm/ExecutionEngine/Orc/Core.h b/include/llvm/ExecutionEngine/Orc/Core.h index 94a5618233e4..4f22a4c38796 100644 --- a/include/llvm/ExecutionEngine/Orc/Core.h +++ b/include/llvm/ExecutionEngine/Orc/Core.h @@ -14,6 +14,7 @@ #define LLVM_EXECUTIONENGINE_ORC_CORE_H #include "llvm/ADT/BitmaskEnum.h" +#include "llvm/ADT/FunctionExtras.h" #include "llvm/ExecutionEngine/JITSymbol.h" #include "llvm/ExecutionEngine/Orc/SymbolStringPool.h" #include "llvm/ExecutionEngine/OrcV1Deprecation.h" @@ -51,8 +52,7 @@ using SymbolMap = DenseMap; /// A map from symbol names (as SymbolStringPtrs) to JITSymbolFlags. using SymbolFlagsMap = DenseMap; -/// A base class for materialization failures that allows the failing -/// symbols to be obtained for logging. +/// A map from JITDylibs to sets of symbols. using SymbolDependenceMap = DenseMap; /// A list of (JITDylib*, bool) pairs. @@ -108,7 +108,7 @@ raw_ostream &operator<<(raw_ostream &OS, const SymbolAliasMap &Aliases); raw_ostream &operator<<(raw_ostream &OS, const SymbolState &S); /// Callback to notify client that symbols have been resolved. -using SymbolsResolvedCallback = std::function)>; +using SymbolsResolvedCallback = unique_function)>; /// Callback to register the dependencies for a given query. using RegisterDependenciesFunction = @@ -124,13 +124,13 @@ class FailedToMaterialize : public ErrorInfo { public: static char ID; - FailedToMaterialize(SymbolNameSet Symbols); + FailedToMaterialize(std::shared_ptr Symbols); std::error_code convertToErrorCode() const override; void log(raw_ostream &OS) const override; - const SymbolNameSet &getSymbols() const { return Symbols; } + const SymbolDependenceMap &getSymbols() const { return *Symbols; } private: - SymbolNameSet Symbols; + std::shared_ptr Symbols; }; /// Used to notify clients when symbols can not be found during a lookup. @@ -205,12 +205,26 @@ public: /// symbols must be ones covered by this MaterializationResponsibility /// instance. Individual calls to this method may resolve a subset of the /// symbols, but all symbols must have been resolved prior to calling emit. 
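The hunk that follows changes notifyResolved and notifyEmitted to return Error. A minimal sketch of driving the new API from inside a materializer, assuming R is the MaterializationResponsibility and ResolvedSymbols a SymbolMap it has produced (both names are placeholders):

#include "llvm/ExecutionEngine/Orc/Core.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/raw_ostream.h"

static void finalize(llvm::orc::MaterializationResponsibility &R,
                     const llvm::orc::SymbolMap &ResolvedSymbols) {
  using namespace llvm;
  // With no registered dependencies these calls are guaranteed to succeed and
  // could be wrapped in cantFail(); otherwise log and fail the materialization.
  if (auto Err = R.notifyResolved(ResolvedSymbols)) {
    logAllUnhandledErrors(std::move(Err), errs(), "notifyResolved: ");
    R.failMaterialization();
    return;
  }
  if (auto Err = R.notifyEmitted()) {
    logAllUnhandledErrors(std::move(Err), errs(), "notifyEmitted: ");
    R.failMaterialization();
    return;
  }
}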
- void notifyResolved(const SymbolMap &Symbols); + /// + /// This method will return an error if any symbols being resolved have been + /// moved to the error state due to the failure of a dependency. If this + /// method returns an error then clients should log it and call + /// failMaterialize. If no dependencies have been registered for the + /// symbols covered by this MaterializationResponsibiility then this method + /// is guaranteed to return Error::success() and can be wrapped with cantFail. + Error notifyResolved(const SymbolMap &Symbols); /// Notifies the target JITDylib (and any pending queries on that JITDylib) /// that all symbols covered by this MaterializationResponsibility instance /// have been emitted. - void notifyEmitted(); + /// + /// This method will return an error if any symbols being resolved have been + /// moved to the error state due to the failure of a dependency. If this + /// method returns an error then clients should log it and call + /// failMaterialize. If no dependencies have been registered for the + /// symbols covered by this MaterializationResponsibiility then this method + /// is guaranteed to return Error::success() and can be wrapped with cantFail. + Error notifyEmitted(); /// Adds new symbols to the JITDylib and this responsibility instance. /// JITDylib entries start out in the materializing state. @@ -346,7 +360,7 @@ private: /// inline std::unique_ptr absoluteSymbols(SymbolMap Symbols, VModuleKey K = VModuleKey()) { - return llvm::make_unique( + return std::make_unique( std::move(Symbols), std::move(K)); } @@ -390,7 +404,7 @@ private: /// \endcode inline std::unique_ptr symbolAliases(SymbolAliasMap Aliases, VModuleKey K = VModuleKey()) { - return llvm::make_unique( + return std::make_unique( nullptr, true, std::move(Aliases), std::move(K)); } @@ -402,7 +416,7 @@ symbolAliases(SymbolAliasMap Aliases, VModuleKey K = VModuleKey()) { inline std::unique_ptr reexports(JITDylib &SourceJD, SymbolAliasMap Aliases, bool MatchNonExported = false, VModuleKey K = VModuleKey()) { - return llvm::make_unique( + return std::make_unique( &SourceJD, MatchNonExported, std::move(Aliases), std::move(K)); } @@ -411,32 +425,13 @@ reexports(JITDylib &SourceJD, SymbolAliasMap Aliases, Expected buildSimpleReexportsAliasMap(JITDylib &SourceJD, const SymbolNameSet &Symbols); -/// ReexportsGenerator can be used with JITDylib::setGenerator to automatically -/// re-export a subset of the source JITDylib's symbols in the target. -class ReexportsGenerator { -public: - using SymbolPredicate = std::function; - - /// Create a reexports generator. If an Allow predicate is passed, only - /// symbols for which the predicate returns true will be reexported. If no - /// Allow predicate is passed, all symbols will be exported. - ReexportsGenerator(JITDylib &SourceJD, bool MatchNonExported = false, - SymbolPredicate Allow = SymbolPredicate()); - - Expected operator()(JITDylib &JD, const SymbolNameSet &Names); - -private: - JITDylib &SourceJD; - bool MatchNonExported = false; - SymbolPredicate Allow; -}; - /// Represents the state that a symbol has reached during materialization. enum class SymbolState : uint8_t { Invalid, /// No symbol should be in this state. NeverSearched, /// Added to the symbol table, never queried. Materializing, /// Queried, materialization begun. Resolved, /// Assigned address, still materializing. + Emitted, /// Emitted to memory, but waiting on transitive dependencies. Ready = 0x3f /// Ready and safe for clients to access. 
}; @@ -502,8 +497,12 @@ class JITDylib { friend class ExecutionSession; friend class MaterializationResponsibility; public: - using GeneratorFunction = std::function( - JITDylib &Parent, const SymbolNameSet &Names)>; + class DefinitionGenerator { + public: + virtual ~DefinitionGenerator(); + virtual Expected + tryToGenerate(JITDylib &Parent, const SymbolNameSet &Names) = 0; + }; using AsynchronousSymbolQuerySet = std::set>; @@ -519,13 +518,20 @@ public: /// Get a reference to the ExecutionSession for this JITDylib. ExecutionSession &getExecutionSession() const { return ES; } - /// Set a definition generator. If set, whenever a symbol fails to resolve - /// within this JITDylib, lookup and lookupFlags will pass the unresolved - /// symbols set to the definition generator. The generator can optionally - /// add a definition for the unresolved symbols to the dylib. - void setGenerator(GeneratorFunction DefGenerator) { - this->DefGenerator = std::move(DefGenerator); - } + /// Adds a definition generator to this JITDylib and returns a referenece to + /// it. + /// + /// When JITDylibs are searched during lookup, if no existing definition of + /// a symbol is found, then any generators that have been added are run (in + /// the order that they were added) to potentially generate a definition. + template + GeneratorT &addGenerator(std::unique_ptr DefGenerator); + + /// Remove a definition generator from this JITDylib. + /// + /// The given generator must exist in this JITDylib's generators list (i.e. + /// have been added and not yet removed). + void removeGenerator(DefinitionGenerator &G); /// Set the search order to be used when fixing up definitions in JITDylib. /// This will replace the previous search order, and apply to any symbol @@ -633,17 +639,17 @@ private: struct MaterializingInfo { SymbolDependenceMap Dependants; SymbolDependenceMap UnemittedDependencies; - bool IsEmitted = false; void addQuery(std::shared_ptr Q); void removeQuery(const AsynchronousSymbolQuery &Q); AsynchronousSymbolQueryList takeQueriesMeeting(SymbolState RequiredState); - AsynchronousSymbolQueryList takeAllQueries(); + AsynchronousSymbolQueryList takeAllPendingQueries() { + return std::move(PendingQueries); + } bool hasQueriesPending() const { return !PendingQueries.empty(); } const AsynchronousSymbolQueryList &pendingQueries() const { return PendingQueries; } - private: AsynchronousSymbolQueryList PendingQueries; }; @@ -710,9 +716,9 @@ private: SymbolNameSet &Unresolved, bool MatchNonExported, MaterializationUnitList &MUs); - void lodgeQueryImpl(std::shared_ptr &Q, - SymbolNameSet &Unresolved, bool MatchNonExported, - MaterializationUnitList &MUs); + Error lodgeQueryImpl(std::shared_ptr &Q, + SymbolNameSet &Unresolved, bool MatchNonExported, + MaterializationUnitList &MUs); bool lookupImpl(std::shared_ptr &Q, std::vector> &MUs, @@ -734,18 +740,20 @@ private: void addDependencies(const SymbolStringPtr &Name, const SymbolDependenceMap &Dependants); - void resolve(const SymbolMap &Resolved); + Error resolve(const SymbolMap &Resolved); - void emit(const SymbolFlagsMap &Emitted); + Error emit(const SymbolFlagsMap &Emitted); - void notifyFailed(const SymbolNameSet &FailedSymbols); + using FailedSymbolsWorklist = + std::vector>; + static void notifyFailed(FailedSymbolsWorklist FailedSymbols); ExecutionSession &ES; std::string JITDylibName; SymbolTable Symbols; UnmaterializedInfosMap UnmaterializedInfos; MaterializingInfosMap MaterializingInfos; - GeneratorFunction DefGenerator; + std::vector> DefGenerators; 
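The DefinitionGenerator interface above replaces the old GeneratorFunction std::function. A minimal generator sketch; the class name and its do-nothing behaviour are illustrative, and a real generator would add definitions to the JITDylib and return the subset of the requested names it now covers:

#include "llvm/ExecutionEngine/Orc/Core.h"

class NullGenerator : public llvm::orc::JITDylib::DefinitionGenerator {
public:
  llvm::Expected<llvm::orc::SymbolNameSet>
  tryToGenerate(llvm::orc::JITDylib &JD,
                const llvm::orc::SymbolNameSet &Names) override {
    // Generate nothing: lookups simply fall through to any later generators.
    // A real generator would call JD.define(...) for the names it can supply
    // and return that subset of Names.
    return llvm::orc::SymbolNameSet();
  }
};

// Registration and removal go through the new JITDylib members:
//   auto &G = JD.addGenerator(std::make_unique<NullGenerator>());
//   JD.removeGenerator(G);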
JITDylibSearchList SearchOrder; }; @@ -933,6 +941,14 @@ private: OutstandingMUs; }; +template +GeneratorT &JITDylib::addGenerator(std::unique_ptr DefGenerator) { + auto &G = *DefGenerator; + ES.runSessionLocked( + [&]() { DefGenerators.push_back(std::move(DefGenerator)); }); + return G; +} + template auto JITDylib::withSearchOrderDo(Func &&F) -> decltype(F(std::declval())) { @@ -972,6 +988,27 @@ Error JITDylib::define(std::unique_ptr &MU) { }); } +/// ReexportsGenerator can be used with JITDylib::setGenerator to automatically +/// re-export a subset of the source JITDylib's symbols in the target. +class ReexportsGenerator : public JITDylib::DefinitionGenerator { +public: + using SymbolPredicate = std::function; + + /// Create a reexports generator. If an Allow predicate is passed, only + /// symbols for which the predicate returns true will be reexported. If no + /// Allow predicate is passed, all symbols will be exported. + ReexportsGenerator(JITDylib &SourceJD, bool MatchNonExported = false, + SymbolPredicate Allow = SymbolPredicate()); + + Expected tryToGenerate(JITDylib &JD, + const SymbolNameSet &Names) override; + +private: + JITDylib &SourceJD; + bool MatchNonExported = false; + SymbolPredicate Allow; +}; + /// Mangles symbol names then uniques them in the context of an /// ExecutionSession. class MangleAndInterner { diff --git a/include/llvm/ExecutionEngine/Orc/ExecutionUtils.h b/include/llvm/ExecutionEngine/Orc/ExecutionUtils.h index 75865920c741..cf0a428662ef 100644 --- a/include/llvm/ExecutionEngine/Orc/ExecutionUtils.h +++ b/include/llvm/ExecutionEngine/Orc/ExecutionUtils.h @@ -19,6 +19,7 @@ #include "llvm/ExecutionEngine/Orc/Core.h" #include "llvm/ExecutionEngine/Orc/OrcError.h" #include "llvm/ExecutionEngine/RuntimeDyld.h" +#include "llvm/Object/Archive.h" #include "llvm/Support/DynamicLibrary.h" #include #include @@ -37,6 +38,8 @@ class Value; namespace orc { +class ObjectLayer; + /// This iterator provides a convenient way to iterate over the elements /// of an llvm.global_ctors/llvm.global_dtors instance. /// @@ -237,7 +240,7 @@ public: /// If an instance of this class is attached to a JITDylib as a fallback /// definition generator, then any symbol found in the given DynamicLibrary that /// passes the 'Allow' predicate will be added to the JITDylib. -class DynamicLibrarySearchGenerator { +class DynamicLibrarySearchGenerator : public JITDylib::DefinitionGenerator { public: using SymbolPredicate = std::function; @@ -253,19 +256,20 @@ public: /// Permanently loads the library at the given path and, on success, returns /// a DynamicLibrarySearchGenerator that will search it for symbol definitions /// in the library. On failure returns the reason the library failed to load. - static Expected + static Expected> Load(const char *FileName, char GlobalPrefix, SymbolPredicate Allow = SymbolPredicate()); /// Creates a DynamicLibrarySearchGenerator that searches for symbols in /// the current process. - static Expected + static Expected> GetForCurrentProcess(char GlobalPrefix, SymbolPredicate Allow = SymbolPredicate()) { return Load(nullptr, GlobalPrefix, std::move(Allow)); } - Expected operator()(JITDylib &JD, const SymbolNameSet &Names); + Expected tryToGenerate(JITDylib &JD, + const SymbolNameSet &Names) override; private: sys::DynamicLibrary Dylib; @@ -273,6 +277,40 @@ private: char GlobalPrefix; }; +/// A utility class to expose symbols from a static library. 
+/// +/// If an instance of this class is attached to a JITDylib as a fallback +/// definition generator, then any symbol found in the archive will result in +/// the containing object being added to the JITDylib. +class StaticLibraryDefinitionGenerator : public JITDylib::DefinitionGenerator { +public: + /// Try to create a StaticLibraryDefinitionGenerator from the given path. + /// + /// This call will succeed if the file at the given path is a valid static + /// library archive, otherwise it will return an error. + static Expected> + Load(ObjectLayer &L, const char *FileName); + + /// Try to create a StaticLibraryDefinitionGenerator from the given memory buffer. + /// This call will succeed if the buffer contains a valid archive, otherwise + /// it will return an error. + static Expected> + Create(ObjectLayer &L, std::unique_ptr ArchiveBuffer); + + Expected tryToGenerate(JITDylib &JD, + const SymbolNameSet &Names) override; + +private: + StaticLibraryDefinitionGenerator(ObjectLayer &L, + std::unique_ptr ArchiveBuffer, + Error &Err); + + ObjectLayer &L; + std::unique_ptr ArchiveBuffer; + object::Archive Archive; + size_t UnrealizedObjects = 0; +}; + +} // end namespace orc +} // end namespace llvm diff --git a/include/llvm/ExecutionEngine/Orc/IRTransformLayer.h b/include/llvm/ExecutionEngine/Orc/IRTransformLayer.h index 1b4c8b6cd95f..b71e5b339711 100644 --- a/include/llvm/ExecutionEngine/Orc/IRTransformLayer.h +++ b/include/llvm/ExecutionEngine/Orc/IRTransformLayer.h @@ -22,6 +22,9 @@ namespace llvm { class Module; namespace orc { +/// A layer that applies a transform to emitted modules. +/// The transform function is responsible for locking the ThreadSafeContext +/// before operating on the module. class IRTransformLayer : public IRLayer { public: using TransformFunction = std::function( diff --git a/include/llvm/ExecutionEngine/Orc/LLJIT.h b/include/llvm/ExecutionEngine/Orc/LLJIT.h index 0aac1916423f..b1e47d77557c 100644 --- a/include/llvm/ExecutionEngine/Orc/LLJIT.h +++ b/include/llvm/ExecutionEngine/Orc/LLJIT.h @@ -184,8 +184,8 @@ private: class LLJITBuilderState { public: - using ObjectLinkingLayerCreator = - std::function(ExecutionSession &)>; + using ObjectLinkingLayerCreator = std::function( + ExecutionSession &, const Triple &TT)>; using CompileFunctionCreator = std::function( diff --git a/include/llvm/ExecutionEngine/Orc/LambdaResolver.h b/include/llvm/ExecutionEngine/Orc/LambdaResolver.h index 855e31b33549..b31914f12a0d 100644 --- a/include/llvm/ExecutionEngine/Orc/LambdaResolver.h +++ b/include/llvm/ExecutionEngine/Orc/LambdaResolver.h @@ -16,6 +16,7 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/ExecutionEngine/JITSymbol.h" +#include "llvm/ExecutionEngine/OrcV1Deprecation.h" #include namespace llvm { @@ -62,7 +63,7 @@ std::shared_ptr> createLambdaResolver(DylibLookupFtorT DylibLookupFtor, ExternalLookupFtorT ExternalLookupFtor) { using LR = LambdaResolver; - return make_unique(std::move(DylibLookupFtor), + return std::make_unique(std::move(DylibLookupFtor), std::move(ExternalLookupFtor)); } @@ -72,7 +73,7 @@ createLambdaResolver(ORCv1DeprecationAcknowledgement, DylibLookupFtorT DylibLookupFtor, ExternalLookupFtorT ExternalLookupFtor) { using LR = LambdaResolver; - return make_unique(AcknowledgeORCv1Deprecation, + return std::make_unique(AcknowledgeORCv1Deprecation, std::move(DylibLookupFtor), std::move(ExternalLookupFtor)); } diff --git a/include/llvm/ExecutionEngine/Orc/LazyEmittingLayer.h b/include/llvm/ExecutionEngine/Orc/LazyEmittingLayer.h index
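Both stock generators above now come back as Expected<std::unique_ptr<...>>, so they can be handed straight to JITDylib::addGenerator. A wiring sketch, assuming JD is a JITDylib, ObjLayer an ObjectLayer, and DL the process DataLayout; the archive path is purely illustrative:

#include "llvm/ExecutionEngine/Orc/ExecutionUtils.h"
#include "llvm/IR/DataLayout.h"

static llvm::Error addProcessAndArchiveSymbols(llvm::orc::JITDylib &JD,
                                               llvm::orc::ObjectLayer &ObjLayer,
                                               const llvm::DataLayout &DL) {
  using namespace llvm;
  using namespace llvm::orc;

  // Expose symbols already present in the host process.
  auto ProcessSyms =
      DynamicLibrarySearchGenerator::GetForCurrentProcess(DL.getGlobalPrefix());
  if (!ProcessSyms)
    return ProcessSyms.takeError();
  JD.addGenerator(std::move(*ProcessSyms));

  // Pull objects out of a static archive on demand (the path is hypothetical).
  auto ArchiveGen =
      StaticLibraryDefinitionGenerator::Load(ObjLayer, "libexample.a");
  if (!ArchiveGen)
    return ArchiveGen.takeError();
  JD.addGenerator(std::move(*ArchiveGen));

  return Error::success();
}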
16202d89f861..b67a9feed523 100644 --- a/include/llvm/ExecutionEngine/Orc/LazyEmittingLayer.h +++ b/include/llvm/ExecutionEngine/Orc/LazyEmittingLayer.h @@ -49,28 +49,24 @@ private: switch (EmitState) { case NotEmitted: if (auto GV = searchGVs(Name, ExportedSymbolsOnly)) { - // Create a std::string version of Name to capture here - the argument - // (a StringRef) may go away before the lambda is executed. - // FIXME: Use capture-init when we move to C++14. - std::string PName = Name; JITSymbolFlags Flags = JITSymbolFlags::fromGlobalValue(*GV); - auto GetAddress = - [this, ExportedSymbolsOnly, PName, &B]() -> Expected { - if (this->EmitState == Emitting) - return 0; - else if (this->EmitState == NotEmitted) { - this->EmitState = Emitting; - if (auto Err = this->emitToBaseLayer(B)) - return std::move(Err); - this->EmitState = Emitted; - } - if (auto Sym = B.findSymbolIn(K, PName, ExportedSymbolsOnly)) - return Sym.getAddress(); - else if (auto Err = Sym.takeError()) + auto GetAddress = [this, ExportedSymbolsOnly, Name = Name.str(), + &B]() -> Expected { + if (this->EmitState == Emitting) + return 0; + else if (this->EmitState == NotEmitted) { + this->EmitState = Emitting; + if (auto Err = this->emitToBaseLayer(B)) return std::move(Err); - else - llvm_unreachable("Successful symbol lookup should return " - "definition address here"); + this->EmitState = Emitted; + } + if (auto Sym = B.findSymbolIn(K, Name, ExportedSymbolsOnly)) + return Sym.getAddress(); + else if (auto Err = Sym.takeError()) + return std::move(Err); + else + llvm_unreachable("Successful symbol lookup should return " + "definition address here"); }; return JITSymbol(std::move(GetAddress), Flags); } else @@ -171,7 +167,7 @@ private: bool ExportedSymbolsOnly) const { assert(!MangledSymbols && "Mangled symbols map already exists?"); - auto Symbols = llvm::make_unique>(); + auto Symbols = std::make_unique>(); Mangler Mang; @@ -209,7 +205,7 @@ public: Error addModule(VModuleKey K, std::unique_ptr M) { assert(!ModuleMap.count(K) && "VModuleKey K already in use"); ModuleMap[K] = - llvm::make_unique(std::move(K), std::move(M)); + std::make_unique(std::move(K), std::move(M)); return Error::success(); } diff --git a/include/llvm/ExecutionEngine/Orc/LazyReexports.h b/include/llvm/ExecutionEngine/Orc/LazyReexports.h index 9fdd1d15f782..311ed59b1549 100644 --- a/include/llvm/ExecutionEngine/Orc/LazyReexports.h +++ b/include/llvm/ExecutionEngine/Orc/LazyReexports.h @@ -18,6 +18,7 @@ #include "llvm/ExecutionEngine/Orc/Core.h" #include "llvm/ExecutionEngine/Orc/IndirectionUtils.h" +#include "llvm/ExecutionEngine/Orc/Speculation.h" namespace llvm { @@ -70,7 +71,7 @@ public: template static std::unique_ptr createNotifyResolvedFunction(NotifyResolvedImpl NotifyResolved) { - return llvm::make_unique>( + return std::make_unique>( std::move(NotifyResolved)); } @@ -159,7 +160,7 @@ public: IndirectStubsManager &ISManager, JITDylib &SourceJD, SymbolAliasMap CallableAliases, - VModuleKey K); + ImplSymbolMap *SrcJDLoc, VModuleKey K); StringRef getName() const override; @@ -174,6 +175,7 @@ private: SymbolAliasMap CallableAliases; std::shared_ptr NotifyResolved; + ImplSymbolMap *AliaseeTable; }; /// Define lazy-reexports based on the given SymbolAliasMap. 
Each lazy re-export @@ -182,9 +184,10 @@ private: inline std::unique_ptr lazyReexports(LazyCallThroughManager &LCTManager, IndirectStubsManager &ISManager, JITDylib &SourceJD, - SymbolAliasMap CallableAliases, VModuleKey K = VModuleKey()) { - return llvm::make_unique( - LCTManager, ISManager, SourceJD, std::move(CallableAliases), + SymbolAliasMap CallableAliases, ImplSymbolMap *SrcJDLoc = nullptr, + VModuleKey K = VModuleKey()) { + return std::make_unique( + LCTManager, ISManager, SourceJD, std::move(CallableAliases), SrcJDLoc, std::move(K)); } diff --git a/include/llvm/ExecutionEngine/Orc/Legacy.h b/include/llvm/ExecutionEngine/Orc/Legacy.h index f9cbbf6ff180..148e260c9569 100644 --- a/include/llvm/ExecutionEngine/Orc/Legacy.h +++ b/include/llvm/ExecutionEngine/Orc/Legacy.h @@ -84,7 +84,7 @@ createSymbolResolver(GetResponsibilitySetFn &&GetResponsibilitySet, typename std::remove_reference::type>::type, typename std::remove_cv< typename std::remove_reference::type>::type>; - return llvm::make_unique( + return std::make_unique( std::forward(GetResponsibilitySet), std::forward(Lookup)); } diff --git a/include/llvm/ExecutionEngine/Orc/ObjectLinkingLayer.h b/include/llvm/ExecutionEngine/Orc/ObjectLinkingLayer.h index c1e7d27f446e..caf8e707516d 100644 --- a/include/llvm/ExecutionEngine/Orc/ObjectLinkingLayer.h +++ b/include/llvm/ExecutionEngine/Orc/ObjectLinkingLayer.h @@ -73,6 +73,9 @@ public: virtual Error notifyRemovingAllModules() { return Error::success(); } }; + using ReturnObjectBufferFunction = + std::function)>; + /// Construct an ObjectLinkingLayer with the given NotifyLoaded, /// and NotifyEmitted functors. ObjectLinkingLayer(ExecutionSession &ES, @@ -81,6 +84,13 @@ public: /// Destruct an ObjectLinkingLayer. ~ObjectLinkingLayer(); + /// Set an object buffer return function. By default object buffers are + /// deleted once the JIT has linked them. If a return function is set then + /// it will be called to transfer ownership of the buffer instead. + void setReturnObjectBuffer(ReturnObjectBufferFunction ReturnObjectBuffer) { + this->ReturnObjectBuffer = std::move(ReturnObjectBuffer); + } + /// Add a pass-config modifier. 
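The setReturnObjectBuffer hook above lets a client reclaim object buffers after linking instead of having the layer delete them. A short sketch, assuming ObjLinkingLayer is the ObjectLinkingLayer instance; what is done with the returned buffer is entirely up to the client:

#include "llvm/ExecutionEngine/Orc/ObjectLinkingLayer.h"
#include "llvm/Support/MemoryBuffer.h"

static void keepLinkedObjects(llvm::orc::ObjectLinkingLayer &ObjLinkingLayer) {
  ObjLinkingLayer.setReturnObjectBuffer(
      [](std::unique_ptr<llvm::MemoryBuffer> Buf) {
        // Ownership of the linked object buffer comes back here; a client
        // might stash it in an object cache or write it out (illustrative).
      });
}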
ObjectLinkingLayer &addPlugin(std::unique_ptr P) { std::lock_guard Lock(LayerMutex); @@ -138,6 +148,7 @@ private: jitlink::JITLinkMemoryManager &MemMgr; bool OverrideObjectFlags = false; bool AutoClaimObjectSymbols = false; + ReturnObjectBufferFunction ReturnObjectBuffer; DenseMap TrackedAllocs; std::vector UntrackedAllocs; std::vector> Plugins; @@ -153,10 +164,16 @@ public: Error notifyRemovingAllModules() override; private: + + struct EHFrameRange { + JITTargetAddress Addr = 0; + size_t Size; + }; + jitlink::EHFrameRegistrar &Registrar; - DenseMap InProcessLinks; - DenseMap TrackedEHFrameAddrs; - std::vector UntrackedEHFrameAddrs; + DenseMap InProcessLinks; + DenseMap TrackedEHFrameRanges; + std::vector UntrackedEHFrameRanges; }; } // end namespace orc diff --git a/include/llvm/ExecutionEngine/Orc/OrcRemoteTargetClient.h b/include/llvm/ExecutionEngine/Orc/OrcRemoteTargetClient.h index 8b875b7906e1..86e8d5df3ad9 100644 --- a/include/llvm/ExecutionEngine/Orc/OrcRemoteTargetClient.h +++ b/include/llvm/ExecutionEngine/Orc/OrcRemoteTargetClient.h @@ -493,7 +493,7 @@ public: ExecutionSession &ES, JITTargetAddress ErrorHandlerAddress) : JITCompileCallbackManager( - llvm::make_unique(Client), ES, + std::make_unique(Client), ES, ErrorHandlerAddress) {} }; @@ -553,7 +553,7 @@ public: auto Id = IndirectStubOwnerIds.getNext(); if (auto Err = callB(Id)) return std::move(Err); - return llvm::make_unique(*this, Id); + return std::make_unique(*this, Id); } Expected diff --git a/include/llvm/ExecutionEngine/Orc/RPCSerialization.h b/include/llvm/ExecutionEngine/Orc/RPCSerialization.h index 07c7471afc6a..752a0a34e0a1 100644 --- a/include/llvm/ExecutionEngine/Orc/RPCSerialization.h +++ b/include/llvm/ExecutionEngine/Orc/RPCSerialization.h @@ -359,9 +359,9 @@ public: { assert(KeyName != nullptr && "No keyname pointer"); std::lock_guard Lock(SerializersMutex); - // FIXME: Move capture Serialize once we have C++14. Serializers[ErrorInfoT::classID()] = - [KeyName, Serialize](ChannelT &C, const ErrorInfoBase &EIB) -> Error { + [KeyName, Serialize = std::move(Serialize)]( + ChannelT &C, const ErrorInfoBase &EIB) -> Error { assert(EIB.dynamicClassID() == ErrorInfoT::classID() && "Serializer called for wrong error type"); if (auto Err = serializeSeq(C, *KeyName)) @@ -551,26 +551,26 @@ public: /// RPC channel serialization for std::tuple. static Error serialize(ChannelT &C, const std::tuple &V) { - return serializeTupleHelper(C, V, llvm::index_sequence_for()); + return serializeTupleHelper(C, V, std::index_sequence_for()); } /// RPC channel deserialization for std::tuple. static Error deserialize(ChannelT &C, std::tuple &V) { - return deserializeTupleHelper(C, V, llvm::index_sequence_for()); + return deserializeTupleHelper(C, V, std::index_sequence_for()); } private: // Serialization helper for std::tuple. template static Error serializeTupleHelper(ChannelT &C, const std::tuple &V, - llvm::index_sequence _) { + std::index_sequence _) { return serializeSeq(C, std::get(V)...); } // Serialization helper for std::tuple. template static Error deserializeTupleHelper(ChannelT &C, std::tuple &V, - llvm::index_sequence _) { + std::index_sequence _) { return deserializeSeq(C, std::get(V)...); } }; diff --git a/include/llvm/ExecutionEngine/Orc/RPCUtils.h b/include/llvm/ExecutionEngine/Orc/RPCUtils.h index 3b11e1b283de..ee9c2cc69c30 100644 --- a/include/llvm/ExecutionEngine/Orc/RPCUtils.h +++ b/include/llvm/ExecutionEngine/Orc/RPCUtils.h @@ -338,7 +338,9 @@ public: return Err; // Close the response message. 
- return C.endSendMessage(); + if (auto Err = C.endSendMessage()) + return Err; + return C.send(); } template @@ -350,7 +352,9 @@ public: return Err2; if (auto Err2 = serializeSeq(C, std::move(Err))) return Err2; - return C.endSendMessage(); + if (auto Err2 = C.endSendMessage()) + return Err2; + return C.send(); } }; @@ -378,8 +382,11 @@ public: C, *ResultOrErr)) return Err; - // Close the response message. - return C.endSendMessage(); + // End the response message. + if (auto Err = C.endSendMessage()) + return Err; + + return C.send(); } template @@ -389,7 +396,9 @@ public: return Err; if (auto Err2 = C.startSendMessage(ResponseId, SeqNo)) return Err2; - return C.endSendMessage(); + if (auto Err2 = C.endSendMessage()) + return Err2; + return C.send(); } }; @@ -502,7 +511,7 @@ public: static typename WrappedHandlerReturn::Type unpackAndRun(HandlerT &Handler, std::tuple &Args) { return unpackAndRunHelper(Handler, Args, - llvm::index_sequence_for()); + std::index_sequence_for()); } // Call the given handler with the given arguments. @@ -510,7 +519,7 @@ public: static Error unpackAndRunAsync(HandlerT &Handler, ResponderT &Responder, std::tuple &Args) { return unpackAndRunAsyncHelper(Handler, Responder, Args, - llvm::index_sequence_for()); + std::index_sequence_for()); } // Call the given handler with the given arguments. @@ -540,14 +549,13 @@ public: // Deserialize arguments from the channel. template static Error deserializeArgs(ChannelT &C, std::tuple &Args) { - return deserializeArgsHelper(C, Args, - llvm::index_sequence_for()); + return deserializeArgsHelper(C, Args, std::index_sequence_for()); } private: template static Error deserializeArgsHelper(ChannelT &C, std::tuple &Args, - llvm::index_sequence _) { + std::index_sequence _) { return SequenceSerialization::deserialize( C, std::get(Args)...); } @@ -556,18 +564,16 @@ private: static typename WrappedHandlerReturn< typename HandlerTraits::ReturnType>::Type unpackAndRunHelper(HandlerT &Handler, ArgTuple &Args, - llvm::index_sequence) { + std::index_sequence) { return run(Handler, std::move(std::get(Args))...); } - template static typename WrappedHandlerReturn< typename HandlerTraits::ReturnType>::Type unpackAndRunAsyncHelper(HandlerT &Handler, ResponderT &Responder, - ArgTuple &Args, - llvm::index_sequence) { + ArgTuple &Args, std::index_sequence) { return run(Handler, Responder, std::move(std::get(Args))...); } }; @@ -743,11 +749,15 @@ public: // to the user defined handler. Error handleResponse(ChannelT &C) override { Error Result = Error::success(); - if (auto Err = - SerializationTraits::deserialize(C, Result)) + if (auto Err = SerializationTraits::deserialize( + C, Result)) { + consumeError(std::move(Result)); return Err; - if (auto Err = C.endReceiveMessage()) + } + if (auto Err = C.endReceiveMessage()) { + consumeError(std::move(Result)); return Err; + } return Handler(std::move(Result)); } @@ -767,7 +777,7 @@ private: // Create a ResponseHandler from a given user handler. template std::unique_ptr> createResponseHandler(HandlerT H) { - return llvm::make_unique>( + return std::make_unique>( std::move(H)); } @@ -1403,14 +1413,12 @@ public: using ErrorReturn = typename RTraits::ErrorReturnType; using ErrorReturnPromise = typename RTraits::ReturnPromiseType; - // FIXME: Stack allocate and move this into the handler once LLVM builds - // with C++14. 
- auto Promise = std::make_shared(); - auto FutureResult = Promise->get_future(); + ErrorReturnPromise Promise; + auto FutureResult = Promise.get_future(); if (auto Err = this->template appendCallAsync( - [Promise](ErrorReturn RetOrErr) { - Promise->set_value(std::move(RetOrErr)); + [Promise = std::move(Promise)](ErrorReturn RetOrErr) mutable { + Promise.set_value(std::move(RetOrErr)); return Error::success(); }, Args...)) { @@ -1523,6 +1531,12 @@ public: return std::move(Err); } + if (auto Err = this->C.send()) { + detail::ResultTraits::consumeAbandoned( + std::move(Result)); + return std::move(Err); + } + while (!ReceivedResponse) { if (auto Err = this->handleOne()) { detail::ResultTraits::consumeAbandoned( @@ -1582,8 +1596,7 @@ public: // outstanding calls count, then poke the condition variable. using ArgType = typename detail::ResponseHandlerArg< typename detail::HandlerTraits::Type>::ArgType; - // FIXME: Move handler into wrapped handler once we have C++14. - auto WrappedHandler = [this, Handler](ArgType Arg) { + auto WrappedHandler = [this, Handler = std::move(Handler)](ArgType Arg) { auto Err = Handler(std::move(Arg)); std::unique_lock Lock(M); --NumOutstandingCalls; diff --git a/include/llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h b/include/llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h index d9535ce5f21f..c5106cf09ecc 100644 --- a/include/llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h +++ b/include/llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h @@ -216,7 +216,7 @@ private: : K(std::move(K)), Parent(Parent), MemMgr(std::move(MemMgr)), - PFC(llvm::make_unique( + PFC(std::make_unique( std::move(Obj), std::move(Resolver), ProcessAllSections)) { buildInitialSymbolTable(PFC->Obj); @@ -234,7 +234,7 @@ private: JITSymbolResolverAdapter ResolverAdapter(Parent.ES, *PFC->Resolver, nullptr); - PFC->RTDyld = llvm::make_unique(*MemMgr, ResolverAdapter); + PFC->RTDyld = std::make_unique(*MemMgr, ResolverAdapter); PFC->RTDyld->setProcessAllSections(PFC->ProcessAllSections); Finalized = true; @@ -338,7 +338,7 @@ private: std::shared_ptr Resolver, bool ProcessAllSections) { using LOS = ConcreteLinkedObject; - return llvm::make_unique(Parent, std::move(K), std::move(Obj), + return std::make_unique(Parent, std::move(K), std::move(Obj), std::move(MemMgr), std::move(Resolver), ProcessAllSections); } diff --git a/include/llvm/ExecutionEngine/Orc/RemoteObjectLayer.h b/include/llvm/ExecutionEngine/Orc/RemoteObjectLayer.h index b87cf697a81e..d7304cfcf931 100644 --- a/include/llvm/ExecutionEngine/Orc/RemoteObjectLayer.h +++ b/include/llvm/ExecutionEngine/Orc/RemoteObjectLayer.h @@ -137,17 +137,12 @@ protected: RemoteSymbolId Id) : C(C), Id(Id) {} - RemoteSymbolMaterializer(const RemoteSymbolMaterializer &Other) - : C(Other.C), Id(Other.Id) { - // FIXME: This is a horrible, auto_ptr-style, copy-as-move operation. - // It should be removed as soon as LLVM has C++14's generalized - // lambda capture (at which point the materializer can be moved - // into the lambda in remoteToJITSymbol below). - const_cast(Other).Id = 0; + RemoteSymbolMaterializer(RemoteSymbolMaterializer &&Other) + : C(Other.C), Id(Other.Id) { + Other.Id = 0; } - RemoteSymbolMaterializer& - operator=(const RemoteSymbolMaterializer&) = delete; + RemoteSymbolMaterializer &operator=(RemoteSymbolMaterializer &&) = delete; /// Release the remote symbol. ~RemoteSymbolMaterializer() { @@ -218,9 +213,9 @@ protected: return nullptr; // else... 
RemoteSymbolMaterializer RSM(*this, RemoteSym.first); - auto Sym = - JITSymbol([RSM]() mutable { return RSM.materialize(); }, - RemoteSym.second); + auto Sym = JITSymbol( + [RSM = std::move(RSM)]() mutable { return RSM.materialize(); }, + RemoteSym.second); return Sym; } else return RemoteSymOrErr.takeError(); @@ -472,7 +467,7 @@ private: } Expected addObject(std::string ObjBuffer) { - auto Buffer = llvm::make_unique(std::move(ObjBuffer)); + auto Buffer = std::make_unique(std::move(ObjBuffer)); auto Id = HandleIdMgr.getNext(); assert(!BaseLayerHandles.count(Id) && "Id already in use?"); diff --git a/include/llvm/ExecutionEngine/Orc/SpeculateAnalyses.h b/include/llvm/ExecutionEngine/Orc/SpeculateAnalyses.h new file mode 100644 index 000000000000..cf57b63b6448 --- /dev/null +++ b/include/llvm/ExecutionEngine/Orc/SpeculateAnalyses.h @@ -0,0 +1,84 @@ +//===-- SpeculateAnalyses.h --*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// \file +/// Contains the Analyses and Result Interpretation to select likely functions +/// to Speculatively compile before they are called. [Purely Experimentation] +//===----------------------------------------------------------------------===// + +#ifndef LLVM_EXECUTIONENGINE_ORC_SPECULATEANALYSES_H +#define LLVM_EXECUTIONENGINE_ORC_SPECULATEANALYSES_H + +#include "llvm/Analysis/BranchProbabilityInfo.h" +#include "llvm/ExecutionEngine/Orc/Core.h" +#include "llvm/ExecutionEngine/Orc/Speculation.h" + +#include + +namespace llvm { + +namespace orc { + +// Provides common code. +class SpeculateQuery { +protected: + void findCalles(const BasicBlock *, DenseSet &); + bool isStraightLine(const Function &F); + +public: + using ResultTy = Optional>>; +}; + +// Direct calls in high frequency basic blocks are extracted. +class BlockFreqQuery : public SpeculateQuery { + size_t numBBToGet(size_t); + +public: + // Find likely next executables based on IR Block Frequency + ResultTy operator()(Function &F); +}; + +// This Query generates a sequence of basic blocks which follows the order of +// execution. +// A handful of BB with higher block frequencies are taken, then path to entry +// and end BB are discovered by traversing up & down the CFG. 
+class SequenceBBQuery : public SpeculateQuery { + struct WalkDirection { + bool Upward = true, Downward = true; + // the block associated contain a call + bool CallerBlock = false; + }; + +public: + using VisitedBlocksInfoTy = DenseMap; + using BlockListTy = SmallVector; + using BackEdgesInfoTy = + SmallVector, 8>; + using BlockFreqInfoTy = + SmallVector, 8>; + +private: + std::size_t getHottestBlocks(std::size_t TotalBlocks); + BlockListTy rearrangeBB(const Function &, const BlockListTy &); + BlockListTy queryCFG(Function &, const BlockListTy &); + void traverseToEntryBlock(const BasicBlock *, const BlockListTy &, + const BackEdgesInfoTy &, + const BranchProbabilityInfo *, + VisitedBlocksInfoTy &); + void traverseToExitBlock(const BasicBlock *, const BlockListTy &, + const BackEdgesInfoTy &, + const BranchProbabilityInfo *, + VisitedBlocksInfoTy &); + +public: + ResultTy operator()(Function &F); +}; + +} // namespace orc +} // namespace llvm + +#endif // LLVM_EXECUTIONENGINE_ORC_SPECULATEANALYSES_H diff --git a/include/llvm/ExecutionEngine/Orc/Speculation.h b/include/llvm/ExecutionEngine/Orc/Speculation.h new file mode 100644 index 000000000000..766a6b070f12 --- /dev/null +++ b/include/llvm/ExecutionEngine/Orc/Speculation.h @@ -0,0 +1,207 @@ +//===-- Speculation.h - Speculative Compilation --*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Contains the definition to support speculative compilation when laziness is +// enabled. +//===----------------------------------------------------------------------===// + +#ifndef LLVM_EXECUTIONENGINE_ORC_SPECULATION_H +#define LLVM_EXECUTIONENGINE_ORC_SPECULATION_H + +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/Optional.h" +#include "llvm/ExecutionEngine/Orc/Core.h" +#include "llvm/ExecutionEngine/Orc/IRCompileLayer.h" +#include "llvm/IR/PassManager.h" +#include "llvm/Passes/PassBuilder.h" +#include "llvm/Support/Debug.h" + +#include +#include +#include +#include + +namespace llvm { +namespace orc { + +class Speculator; + +// Track the Impls (JITDylib,Symbols) of Symbols while lazy call through +// trampolines are created. 
Operations are guarded by locks tp ensure that Imap +// stays in consistent state after read/write + +class ImplSymbolMap { + friend class Speculator; + +public: + using AliaseeDetails = std::pair; + using Alias = SymbolStringPtr; + using ImapTy = DenseMap; + void trackImpls(SymbolAliasMap ImplMaps, JITDylib *SrcJD); + +private: + // FIX ME: find a right way to distinguish the pre-compile Symbols, and update + // the callsite + Optional getImplFor(const SymbolStringPtr &StubSymbol) { + std::lock_guard Lockit(ConcurrentAccess); + auto Position = Maps.find(StubSymbol); + if (Position != Maps.end()) + return Position->getSecond(); + else + return None; + } + + std::mutex ConcurrentAccess; + ImapTy Maps; +}; + +// Defines Speculator Concept, +class Speculator { +public: + using TargetFAddr = JITTargetAddress; + using FunctionCandidatesMap = DenseMap; + using StubAddrLikelies = DenseMap; + +private: + void registerSymbolsWithAddr(TargetFAddr ImplAddr, + SymbolNameSet likelySymbols) { + std::lock_guard Lockit(ConcurrentAccess); + GlobalSpecMap.insert({ImplAddr, std::move(likelySymbols)}); + } + + void launchCompile(JITTargetAddress FAddr) { + SymbolNameSet CandidateSet; + // Copy CandidateSet is necessary, to avoid unsynchronized access to + // the datastructure. + { + std::lock_guard Lockit(ConcurrentAccess); + auto It = GlobalSpecMap.find(FAddr); + if (It == GlobalSpecMap.end()) + return; + CandidateSet = It->getSecond(); + } + + SymbolDependenceMap SpeculativeLookUpImpls; + + for (auto &Callee : CandidateSet) { + auto ImplSymbol = AliaseeImplTable.getImplFor(Callee); + // try to distinguish already compiled & library symbols + if (!ImplSymbol.hasValue()) + continue; + const auto &ImplSymbolName = ImplSymbol.getPointer()->first; + JITDylib *ImplJD = ImplSymbol.getPointer()->second; + auto &SymbolsInJD = SpeculativeLookUpImpls[ImplJD]; + SymbolsInJD.insert(ImplSymbolName); + } + + DEBUG_WITH_TYPE("orc", for (auto &I + : SpeculativeLookUpImpls) { + llvm::dbgs() << "\n In " << I.first->getName() << " JITDylib "; + for (auto &N : I.second) + llvm::dbgs() << "\n Likely Symbol : " << N; + }); + + // for a given symbol, there may be no symbol qualified for speculatively + // compile try to fix this before jumping to this code if possible. + for (auto &LookupPair : SpeculativeLookUpImpls) + ES.lookup(JITDylibSearchList({{LookupPair.first, true}}), + LookupPair.second, SymbolState::Ready, + [this](Expected Result) { + if (auto Err = Result.takeError()) + ES.reportError(std::move(Err)); + }, + NoDependenciesToRegister); + } + +public: + Speculator(ImplSymbolMap &Impl, ExecutionSession &ref) + : AliaseeImplTable(Impl), ES(ref), GlobalSpecMap(0) {} + Speculator(const Speculator &) = delete; + Speculator(Speculator &&) = delete; + Speculator &operator=(const Speculator &) = delete; + Speculator &operator=(Speculator &&) = delete; + + /// Define symbols for this Speculator object (__orc_speculator) and the + /// speculation runtime entry point symbol (__orc_speculate_for) in the + /// given JITDylib. + Error addSpeculationRuntime(JITDylib &JD, MangleAndInterner &Mangle); + + // Speculatively compile likely functions for the given Stub Address. + // destination of __orc_speculate_for jump + void speculateFor(TargetFAddr StubAddr) { launchCompile(StubAddr); } + + // FIXME : Register with Stub Address, after JITLink Fix. 
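Putting the speculation pieces together: the ImplSymbolMap is shared between the Speculator, the lazy-reexports path, and the CompileOnDemandLayer (setImplMap, further up). A wiring sketch under the interfaces shown in this patch; ES, DL, MainJD, CODLayer, ImplMap and Spec are all assumed to exist in the surrounding ORC setup:

#include "llvm/ExecutionEngine/Orc/CompileOnDemandLayer.h"
#include "llvm/ExecutionEngine/Orc/Speculation.h"

static llvm::Error enableSpeculation(llvm::orc::ExecutionSession &ES,
                                     const llvm::DataLayout &DL,
                                     llvm::orc::JITDylib &MainJD,
                                     llvm::orc::CompileOnDemandLayer &CODLayer,
                                     llvm::orc::ImplSymbolMap &ImplMap,
                                     llvm::orc::Speculator &Spec) {
  using namespace llvm::orc;
  MangleAndInterner Mangle(ES, DL);

  // Expose __orc_speculator / __orc_speculate_for to JIT'd code.
  if (auto Err = Spec.addSpeculationRuntime(MainJD, Mangle))
    return Err;

  // Route lazily compiled symbols through the impl table so the speculator
  // can find their bodies; pass &ImplMap to lazyReexports(...) as well so
  // stub targets get tracked.
  CODLayer.setImplMap(&ImplMap);
  return llvm::Error::success();
}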
+ void registerSymbols(FunctionCandidatesMap Candidates, JITDylib *JD) { + for (auto &SymPair : Candidates) { + auto Target = SymPair.first; + auto Likely = SymPair.second; + + auto OnReadyFixUp = [Likely, Target, + this](Expected ReadySymbol) { + if (ReadySymbol) { + auto RAddr = (*ReadySymbol)[Target].getAddress(); + registerSymbolsWithAddr(RAddr, std::move(Likely)); + } else + this->getES().reportError(ReadySymbol.takeError()); + }; + // Include non-exported symbols also. + ES.lookup(JITDylibSearchList({{JD, true}}), SymbolNameSet({Target}), + SymbolState::Ready, OnReadyFixUp, NoDependenciesToRegister); + } + } + + ExecutionSession &getES() { return ES; } + +private: + static void speculateForEntryPoint(Speculator *Ptr, uint64_t StubId); + std::mutex ConcurrentAccess; + ImplSymbolMap &AliaseeImplTable; + ExecutionSession &ES; + StubAddrLikelies GlobalSpecMap; +}; + +class IRSpeculationLayer : public IRLayer { +public: + using IRlikiesStrRef = Optional>>; + using ResultEval = std::function; + using TargetAndLikelies = DenseMap; + + IRSpeculationLayer(ExecutionSession &ES, IRCompileLayer &BaseLayer, + Speculator &Spec, MangleAndInterner &Mangle, + ResultEval Interpreter) + : IRLayer(ES), NextLayer(BaseLayer), S(Spec), Mangle(Mangle), + QueryAnalysis(Interpreter) {} + + void emit(MaterializationResponsibility R, ThreadSafeModule TSM); + +private: + TargetAndLikelies + internToJITSymbols(DenseMap> IRNames) { + assert(!IRNames.empty() && "No IRNames received to Intern?"); + TargetAndLikelies InternedNames; + DenseSet TargetJITNames; + for (auto &NamePair : IRNames) { + for (auto &TargetNames : NamePair.second) + TargetJITNames.insert(Mangle(TargetNames)); + + InternedNames[Mangle(NamePair.first)] = std::move(TargetJITNames); + } + return InternedNames; + } + + IRCompileLayer &NextLayer; + Speculator &S; + MangleAndInterner &Mangle; + ResultEval QueryAnalysis; +}; + +} // namespace orc +} // namespace llvm + +#endif // LLVM_EXECUTIONENGINE_ORC_SPECULATION_H diff --git a/include/llvm/ExecutionEngine/Orc/ThreadSafeModule.h b/include/llvm/ExecutionEngine/Orc/ThreadSafeModule.h index 5787500387c4..2347faed37a2 100644 --- a/include/llvm/ExecutionEngine/Orc/ThreadSafeModule.h +++ b/include/llvm/ExecutionEngine/Orc/ThreadSafeModule.h @@ -38,17 +38,12 @@ private: public: // RAII based lock for ThreadSafeContext. class LLVM_NODISCARD Lock { - private: - using UnderlyingLock = std::lock_guard; - public: - Lock(std::shared_ptr S) - : S(std::move(S)), - L(llvm::make_unique(this->S->Mutex)) {} + Lock(std::shared_ptr S) : S(std::move(S)), L(this->S->Mutex) {} private: std::shared_ptr S; - std::unique_ptr L; + std::unique_lock L; }; /// Construct a null context. @@ -69,7 +64,7 @@ public: /// instance, or null if the instance was default constructed. const LLVMContext *getContext() const { return S ? S->Ctx.get() : nullptr; } - Lock getLock() { + Lock getLock() const { assert(S && "Can not lock an empty ThreadSafeContext"); return Lock(S); } @@ -95,7 +90,7 @@ public: // We also need to lock the context to make sure the module tear-down // does not overlap any other work on the context. if (M) { - auto L = getContextLock(); + auto L = TSCtx.getLock(); M = nullptr; } M = std::move(Other.M); @@ -117,23 +112,14 @@ public: ~ThreadSafeModule() { // We need to lock the context while we destruct the module. if (M) { - auto L = getContextLock(); + auto L = TSCtx.getLock(); M = nullptr; } } - /// Get the module wrapped by this ThreadSafeModule. 
- Module *getModule() { return M.get(); } - - /// Get the module wrapped by this ThreadSafeModule. - const Module *getModule() const { return M.get(); } - - /// Take out a lock on the ThreadSafeContext for this module. - ThreadSafeContext::Lock getContextLock() { return TSCtx.getLock(); } - /// Boolean conversion: This ThreadSafeModule will evaluate to true if it /// wraps a non-null module. - explicit operator bool() { + explicit operator bool() const { if (M) { assert(TSCtx.getContext() && "Non-null module must have non-null context"); @@ -142,6 +128,33 @@ public: return false; } + /// Locks the associated ThreadSafeContext and calls the given function + /// on the contained Module. + template + auto withModuleDo(Func &&F) -> decltype(F(std::declval())) { + assert(M && "Can not call on null module"); + auto Lock = TSCtx.getLock(); + return F(*M); + } + + /// Locks the associated ThreadSafeContext and calls the given function + /// on the contained Module. + template + auto withModuleDo(Func &&F) const + -> decltype(F(std::declval())) { + auto Lock = TSCtx.getLock(); + return F(*M); + } + + /// Get a raw pointer to the contained module without locking the context. + Module *getModuleUnlocked() { return M.get(); } + + /// Get a raw pointer to the contained module without locking the context. + const Module *getModuleUnlocked() const { return M.get(); } + + /// Returns the context for this ThreadSafeModule. + ThreadSafeContext getContext() const { return TSCtx; } + private: std::unique_ptr M; ThreadSafeContext TSCtx; diff --git a/include/llvm/ExecutionEngine/RuntimeDyld.h b/include/llvm/ExecutionEngine/RuntimeDyld.h index b2b4eba47074..ce7024a7f19b 100644 --- a/include/llvm/ExecutionEngine/RuntimeDyld.h +++ b/include/llvm/ExecutionEngine/RuntimeDyld.h @@ -13,6 +13,7 @@ #ifndef LLVM_EXECUTIONENGINE_RUNTIMEDYLD_H #define LLVM_EXECUTIONENGINE_RUNTIMEDYLD_H +#include "llvm/ADT/FunctionExtras.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/StringRef.h" #include "llvm/DebugInfo/DIContext.h" @@ -271,10 +272,10 @@ private: std::unique_ptr UnderlyingBuffer, RuntimeDyld::MemoryManager &MemMgr, JITSymbolResolver &Resolver, bool ProcessAllSections, - std::function, - std::map)> + unique_function, + std::map)> OnLoaded, - std::function OnEmitted); + unique_function OnEmitted); // RuntimeDyldImpl is the actual class. RuntimeDyld is just the public // interface. @@ -291,14 +292,14 @@ private: // but ORC's RTDyldObjectLinkingLayer2. Internally it constructs a RuntimeDyld // instance and uses continuation passing to perform the fix-up and finalize // steps asynchronously. 
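With getModule and getContextLock removed above, module access goes through withModuleDo (which takes the context lock around the callback) or getModuleUnlocked. A small usage sketch, assuming TSM is a non-null ThreadSafeModule:

#include "llvm/ExecutionEngine/Orc/ThreadSafeModule.h"
#include "llvm/IR/Module.h"

static std::string moduleName(llvm::orc::ThreadSafeModule &TSM) {
  // withModuleDo locks the ThreadSafeContext for the duration of the callback.
  return TSM.withModuleDo(
      [](llvm::Module &M) { return M.getModuleIdentifier(); });
}

static void touchUnlocked(llvm::orc::ThreadSafeModule &TSM) {
  // getModuleUnlocked skips locking; the caller takes the lock explicitly and
  // must ensure no other thread is using the underlying context.
  auto Lock = TSM.getContext().getLock();
  llvm::Module *M = TSM.getModuleUnlocked();
  (void)M;
}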
-void jitLinkForORC(object::ObjectFile &Obj, - std::unique_ptr UnderlyingBuffer, - RuntimeDyld::MemoryManager &MemMgr, - JITSymbolResolver &Resolver, bool ProcessAllSections, - std::function, - std::map)> - OnLoaded, - std::function OnEmitted); +void jitLinkForORC( + object::ObjectFile &Obj, std::unique_ptr UnderlyingBuffer, + RuntimeDyld::MemoryManager &MemMgr, JITSymbolResolver &Resolver, + bool ProcessAllSections, + unique_function, + std::map)> + OnLoaded, + unique_function OnEmitted); } // end namespace llvm diff --git a/include/llvm/IR/Attributes.h b/include/llvm/IR/Attributes.h index 06cc09e1cfc7..e6b280465f72 100644 --- a/include/llvm/IR/Attributes.h +++ b/include/llvm/IR/Attributes.h @@ -22,6 +22,7 @@ #include "llvm/ADT/StringRef.h" #include "llvm/ADT/iterator_range.h" #include "llvm/Config/llvm-config.h" +#include "llvm/Support/Alignment.h" #include "llvm/Support/PointerLikeTypeTraits.h" #include #include @@ -94,8 +95,8 @@ public: /// Return a uniquified Attribute object that has the specific /// alignment set. - static Attribute getWithAlignment(LLVMContext &Context, uint64_t Align); - static Attribute getWithStackAlignment(LLVMContext &Context, uint64_t Align); + static Attribute getWithAlignment(LLVMContext &Context, Align Alignment); + static Attribute getWithStackAlignment(LLVMContext &Context, Align Alignment); static Attribute getWithDereferenceableBytes(LLVMContext &Context, uint64_t Bytes); static Attribute getWithDereferenceableOrNullBytes(LLVMContext &Context, @@ -150,11 +151,11 @@ public: /// Returns the alignment field of an attribute as a byte alignment /// value. - unsigned getAlignment() const; + MaybeAlign getAlignment() const; /// Returns the stack alignment field of an attribute as a byte /// alignment value. - unsigned getStackAlignment() const; + MaybeAlign getStackAlignment() const; /// Returns the number of dereferenceable bytes from the /// dereferenceable attribute. @@ -284,8 +285,8 @@ public: /// Return the target-dependent attribute object. Attribute getAttribute(StringRef Kind) const; - unsigned getAlignment() const; - unsigned getStackAlignment() const; + MaybeAlign getAlignment() const; + MaybeAlign getStackAlignment() const; uint64_t getDereferenceableBytes() const; uint64_t getDereferenceableOrNullBytes() const; Type *getByValType() const; @@ -603,16 +604,16 @@ public: } /// Return the alignment of the return value. - unsigned getRetAlignment() const; + MaybeAlign getRetAlignment() const; /// Return the alignment for the specified function parameter. - unsigned getParamAlignment(unsigned ArgNo) const; + MaybeAlign getParamAlignment(unsigned ArgNo) const; /// Return the byval type for the specified function parameter. Type *getParamByValType(unsigned ArgNo) const; /// Get the stack alignment. - unsigned getStackAlignment(unsigned Index) const; + MaybeAlign getStackAlignment(unsigned Index) const; /// Get the number of dereferenceable bytes (or zero if unknown). uint64_t getDereferenceableBytes(unsigned Index) const; @@ -704,9 +705,9 @@ template <> struct DenseMapInfo { /// equality, presence of attributes, etc. class AttrBuilder { std::bitset Attrs; - std::map TargetDepAttrs; - uint64_t Alignment = 0; - uint64_t StackAlignment = 0; + std::map> TargetDepAttrs; + MaybeAlign Alignment; + MaybeAlign StackAlignment; uint64_t DerefBytes = 0; uint64_t DerefOrNullBytes = 0; uint64_t AllocSizeArgs = 0; @@ -773,10 +774,10 @@ public: bool hasAlignmentAttr() const; /// Retrieve the alignment attribute, if it exists. 
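Alignment attributes now traffic in Align/MaybeAlign rather than raw integers (getWithAlignment above, the AttrBuilder accessors below). A sketch of the updated calls, assuming Ctx is an LLVMContext; the specific alignments are illustrative:

#include "llvm/IR/Attributes.h"
#include "llvm/Support/Alignment.h"

static llvm::AttrBuilder alignedParamAttrs(llvm::LLVMContext &Ctx) {
  using namespace llvm;
  AttrBuilder B;
  B.addAlignmentAttr(Align(16));           // MaybeAlign overload; no-op if empty
  B.addStackAlignmentAttr(MaybeAlign(8));
  if (MaybeAlign A = B.getAlignment())
    (void)A->value();                      // raw byte alignment, here 16
  // The uniqued Attribute constructors now take Align directly:
  (void)Attribute::getWithAlignment(Ctx, Align(16));
  return B;
}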
- uint64_t getAlignment() const { return Alignment; } + MaybeAlign getAlignment() const { return Alignment; } /// Retrieve the stack alignment attribute, if it exists. - uint64_t getStackAlignment() const { return StackAlignment; } + MaybeAlign getStackAlignment() const { return StackAlignment; } /// Retrieve the number of dereferenceable bytes, if the /// dereferenceable attribute exists (zero is returned otherwise). @@ -793,13 +794,29 @@ public: /// doesn't exist, pair(0, 0) is returned. std::pair> getAllocSizeArgs() const; + /// This turns an alignment into the form used internally in Attribute. + /// This call has no effect if Align is not set. + AttrBuilder &addAlignmentAttr(MaybeAlign Align); + /// This turns an int alignment (which must be a power of 2) into the /// form used internally in Attribute. - AttrBuilder &addAlignmentAttr(unsigned Align); + /// This call has no effect if Align is 0. + /// Deprecated, use the version using a MaybeAlign. + inline AttrBuilder &addAlignmentAttr(unsigned Align) { + return addAlignmentAttr(MaybeAlign(Align)); + } + + /// This turns a stack alignment into the form used internally in Attribute. + /// This call has no effect if Align is not set. + AttrBuilder &addStackAlignmentAttr(MaybeAlign Align); /// This turns an int stack alignment (which must be a power of 2) into /// the form used internally in Attribute. - AttrBuilder &addStackAlignmentAttr(unsigned Align); + /// This call has no effect if Align is 0. + /// Deprecated, use the version using a MaybeAlign. + inline AttrBuilder &addStackAlignmentAttr(unsigned Align) { + return addStackAlignmentAttr(MaybeAlign(Align)); + } /// This turns the number of dereferenceable bytes into the form used /// internally in Attribute. diff --git a/include/llvm/IR/AutoUpgrade.h b/include/llvm/IR/AutoUpgrade.h index 017ad93d8a2a..66f38e5b55d1 100644 --- a/include/llvm/IR/AutoUpgrade.h +++ b/include/llvm/IR/AutoUpgrade.h @@ -54,9 +54,9 @@ namespace llvm { /// module is modified. bool UpgradeModuleFlags(Module &M); - /// This checks for objc retain release marker which should be upgraded. It - /// returns true if module is modified. - bool UpgradeRetainReleaseMarker(Module &M); + /// Convert calls to ARC runtime functions to intrinsic calls and upgrade the + /// old retain release marker to new module flag format. + void UpgradeARCRuntime(Module &M); void UpgradeSectionAttributes(Module &M); @@ -87,6 +87,10 @@ namespace llvm { /// Upgrade the loop attachment metadata node. MDNode *upgradeInstructionLoopAttachment(MDNode &N); + /// Upgrade the datalayout string by adding a section for address space + /// pointers. + std::string UpgradeDataLayoutString(StringRef DL, StringRef Triple); + } // End llvm namespace #endif diff --git a/include/llvm/IR/BasicBlock.h b/include/llvm/IR/BasicBlock.h index 69555af50e1f..d594145f8636 100644 --- a/include/llvm/IR/BasicBlock.h +++ b/include/llvm/IR/BasicBlock.h @@ -192,6 +192,11 @@ public: std::function>> instructionsWithoutDebug(); + /// Return the size of the basic block ignoring debug instructions + filter_iterator>::difference_type + sizeWithoutDebug() const; + /// Unlink 'this' from the containing function, but do not delete it. void removeFromParent(); diff --git a/include/llvm/IR/CallSite.h b/include/llvm/IR/CallSite.h index b47a96c5d5fa..13b1ae8d0e32 100644 --- a/include/llvm/IR/CallSite.h +++ b/include/llvm/IR/CallSite.h @@ -854,6 +854,15 @@ public: return CI.ParameterEncoding[0]; } + /// Return the use of the callee value in the underlying instruction. 
Only + /// valid for callback calls! + const Use &getCalleeUseForCallback() const { + int CalleeArgIdx = getCallArgOperandNoForCallee(); + assert(CalleeArgIdx >= 0 && + unsigned(CalleeArgIdx) < getInstruction()->getNumOperands()); + return getInstruction()->getOperandUse(CalleeArgIdx); + } + /// Return the pointer to function that is being called. Value *getCalledValue() const { if (isDirectCall()) diff --git a/include/llvm/IR/CallingConv.h b/include/llvm/IR/CallingConv.h index 399c6ad521fa..c1c979c2e2ab 100644 --- a/include/llvm/IR/CallingConv.h +++ b/include/llvm/IR/CallingConv.h @@ -75,6 +75,11 @@ namespace CallingConv { // CXX_FAST_TLS - Calling convention for access functions. CXX_FAST_TLS = 17, + /// Tail - This calling convention attemps to make calls as fast as + /// possible while guaranteeing that tail call optimization can always + /// be performed. + Tail = 18, + // Target - This is the start of the target-specific calling conventions, // e.g. fastcall and thiscall on X86. FirstTargetCC = 64, @@ -222,6 +227,14 @@ namespace CallingConv { // Calling convention between AArch64 Advanced SIMD functions AArch64_VectorCall = 97, + /// Calling convention between AArch64 SVE functions + AArch64_SVE_VectorCall = 98, + + /// Calling convention for emscripten __invoke_* functions. The first + /// argument is required to be the function ptr being indirectly called. + /// The remainder matches the regular calling convention. + WASM_EmscriptenInvoke = 99, + /// The highest possible calling convention ID. Must be some 2^k - 1. MaxID = 1023 }; diff --git a/include/llvm/IR/Constant.h b/include/llvm/IR/Constant.h index 931576651224..2b6a6e4141b9 100644 --- a/include/llvm/IR/Constant.h +++ b/include/llvm/IR/Constant.h @@ -86,6 +86,12 @@ public: /// floating-point constant with all NaN elements. bool isNaN() const; + /// Return true if this constant and a constant 'Y' are element-wise equal. + /// This is identical to just comparing the pointers, with the exception that + /// for vectors, if only one of the constants has an `undef` element in some + /// lane, the constants still match. + bool isElementWiseEqual(Value *Y) const; + /// Return true if this is a vector constant that includes any undefined /// elements. bool containsUndefElement() const; diff --git a/include/llvm/IR/ConstantRange.h b/include/llvm/IR/ConstantRange.h index 91f3f31abe17..964f9e8e9bc9 100644 --- a/include/llvm/IR/ConstantRange.h +++ b/include/llvm/IR/ConstantRange.h @@ -330,9 +330,13 @@ public: /// from an addition of a value in this range and a value in \p Other. ConstantRange add(const ConstantRange &Other) const; - /// Return a new range representing the possible values resulting from a - /// known NSW addition of a value in this range and \p Other constant. - ConstantRange addWithNoSignedWrap(const APInt &Other) const; + /// Return a new range representing the possible values resulting + /// from an addition with wrap type \p NoWrapKind of a value in this + /// range and a value in \p Other. + /// If the result range is disjoint, the preferred range is determined by the + /// \p PreferredRangeType. + ConstantRange addWithNoWrap(const ConstantRange &Other, unsigned NoWrapKind, + PreferredRangeType RangeType = Smallest) const; /// Return a new range representing the possible values resulting /// from a subtraction of a value in this range and a value in \p Other. 
diff --git a/include/llvm/IR/DataLayout.h b/include/llvm/IR/DataLayout.h index ac9770a15120..85093dd218f8 100644 --- a/include/llvm/IR/DataLayout.h +++ b/include/llvm/IR/DataLayout.h @@ -25,10 +25,11 @@ #include "llvm/ADT/StringRef.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Type.h" -#include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" +#include "llvm/Support/Alignment.h" +#include "llvm/Support/TypeSize.h" #include #include #include @@ -71,11 +72,11 @@ struct LayoutAlignElem { /// Alignment type from \c AlignTypeEnum unsigned AlignType : 8; unsigned TypeBitWidth : 24; - unsigned ABIAlign : 16; - unsigned PrefAlign : 16; + Align ABIAlign; + Align PrefAlign; - static LayoutAlignElem get(AlignTypeEnum align_type, unsigned abi_align, - unsigned pref_align, uint32_t bit_width); + static LayoutAlignElem get(AlignTypeEnum align_type, Align abi_align, + Align pref_align, uint32_t bit_width); bool operator==(const LayoutAlignElem &rhs) const; }; @@ -87,15 +88,15 @@ struct LayoutAlignElem { /// \note The unusual order of elements in the structure attempts to reduce /// padding and make the structure slightly more cache friendly. struct PointerAlignElem { - unsigned ABIAlign; - unsigned PrefAlign; + Align ABIAlign; + Align PrefAlign; uint32_t TypeByteWidth; uint32_t AddressSpace; uint32_t IndexWidth; /// Initializer - static PointerAlignElem get(uint32_t AddressSpace, unsigned ABIAlign, - unsigned PrefAlign, uint32_t TypeByteWidth, + static PointerAlignElem get(uint32_t AddressSpace, Align ABIAlign, + Align PrefAlign, uint32_t TypeByteWidth, uint32_t IndexWidth); bool operator==(const PointerAlignElem &rhs) const; @@ -120,10 +121,10 @@ private: bool BigEndian; unsigned AllocaAddrSpace; - unsigned StackNaturalAlign; + MaybeAlign StackNaturalAlign; unsigned ProgramAddrSpace; - unsigned FunctionPtrAlign; + MaybeAlign FunctionPtrAlign; FunctionPtrAlignType TheFunctionPtrAlignType; enum ManglingModeT { @@ -172,16 +173,15 @@ private: /// well-defined bitwise representation. SmallVector NonIntegralAddressSpaces; - void setAlignment(AlignTypeEnum align_type, unsigned abi_align, - unsigned pref_align, uint32_t bit_width); - unsigned getAlignmentInfo(AlignTypeEnum align_type, uint32_t bit_width, - bool ABIAlign, Type *Ty) const; - void setPointerAlignment(uint32_t AddrSpace, unsigned ABIAlign, - unsigned PrefAlign, uint32_t TypeByteWidth, - uint32_t IndexWidth); + void setAlignment(AlignTypeEnum align_type, Align abi_align, Align pref_align, + uint32_t bit_width); + Align getAlignmentInfo(AlignTypeEnum align_type, uint32_t bit_width, + bool ABIAlign, Type *Ty) const; + void setPointerAlignment(uint32_t AddrSpace, Align ABIAlign, Align PrefAlign, + uint32_t TypeByteWidth, uint32_t IndexWidth); /// Internal helper method that returns requested alignment for type. - unsigned getAlignment(Type *Ty, bool abi_or_pref) const; + Align getAlignment(Type *Ty, bool abi_or_pref) const; /// Parses a target data specification string. Assert if the string is /// malformed. @@ -261,17 +261,21 @@ public: bool isIllegalInteger(uint64_t Width) const { return !isLegalInteger(Width); } /// Returns true if the given alignment exceeds the natural stack alignment. 
- bool exceedsNaturalStackAlignment(unsigned Align) const { - return (StackNaturalAlign != 0) && (Align > StackNaturalAlign); + bool exceedsNaturalStackAlignment(Align Alignment) const { + return StackNaturalAlign && (Alignment > StackNaturalAlign); + } + + Align getStackAlignment() const { + assert(StackNaturalAlign && "StackNaturalAlign must be defined"); + return *StackNaturalAlign; } - unsigned getStackAlignment() const { return StackNaturalAlign; } unsigned getAllocaAddrSpace() const { return AllocaAddrSpace; } /// Returns the alignment of function pointers, which may or may not be /// related to the alignment of functions. /// \see getFunctionPtrAlignType - unsigned getFunctionPtrAlign() const { return FunctionPtrAlign; } + MaybeAlign getFunctionPtrAlign() const { return FunctionPtrAlign; } /// Return the type of function pointer alignment. /// \see getFunctionPtrAlign @@ -344,12 +348,12 @@ public: } /// Layout pointer alignment - unsigned getPointerABIAlignment(unsigned AS) const; + Align getPointerABIAlignment(unsigned AS) const; /// Return target's alignment for stack-based pointers /// FIXME: The defaults need to be removed once all of /// the backends/clients are updated. - unsigned getPointerPrefAlignment(unsigned AS = 0) const; + Align getPointerPrefAlignment(unsigned AS = 0) const; /// Layout pointer size /// FIXME: The defaults need to be removed once all of @@ -433,23 +437,33 @@ public: /// Returns the number of bits necessary to hold the specified type. /// + /// If Ty is a scalable vector type, the scalable property will be set and + /// the runtime size will be a positive integer multiple of the base size. + /// /// For example, returns 36 for i36 and 80 for x86_fp80. The type passed must /// have a size (Type::isSized() must return true). - uint64_t getTypeSizeInBits(Type *Ty) const; + TypeSize getTypeSizeInBits(Type *Ty) const; /// Returns the maximum number of bytes that may be overwritten by /// storing the specified type. /// + /// If Ty is a scalable vector type, the scalable property will be set and + /// the runtime size will be a positive integer multiple of the base size. + /// /// For example, returns 5 for i36 and 10 for x86_fp80. - uint64_t getTypeStoreSize(Type *Ty) const { - return (getTypeSizeInBits(Ty) + 7) / 8; + TypeSize getTypeStoreSize(Type *Ty) const { + TypeSize BaseSize = getTypeSizeInBits(Ty); + return { (BaseSize.getKnownMinSize() + 7) / 8, BaseSize.isScalable() }; } /// Returns the maximum number of bits that may be overwritten by /// storing the specified type; always a multiple of 8. /// + /// If Ty is a scalable vector type, the scalable property will be set and + /// the runtime size will be a positive integer multiple of the base size. + /// /// For example, returns 40 for i36 and 80 for x86_fp80. - uint64_t getTypeStoreSizeInBits(Type *Ty) const { + TypeSize getTypeStoreSizeInBits(Type *Ty) const { return 8 * getTypeStoreSize(Ty); } @@ -464,9 +478,12 @@ public: /// Returns the offset in bytes between successive objects of the /// specified type, including alignment padding. /// + /// If Ty is a scalable vector type, the scalable property will be set and + /// the runtime size will be a positive integer multiple of the base size. + /// /// This is the amount that alloca reserves for this type. For example, /// returns 12 or 16 for x86_fp80, depending on alignment. - uint64_t getTypeAllocSize(Type *Ty) const { + TypeSize getTypeAllocSize(Type *Ty) const { // Round up to the next alignment boundary. 
return alignTo(getTypeStoreSize(Ty), getABITypeAlignment(Ty)); } @@ -474,18 +491,28 @@ public: /// Returns the offset in bits between successive objects of the /// specified type, including alignment padding; always a multiple of 8. /// + /// If Ty is a scalable vector type, the scalable property will be set and + /// the runtime size will be a positive integer multiple of the base size. + /// /// This is the amount that alloca reserves for this type. For example, /// returns 96 or 128 for x86_fp80, depending on alignment. - uint64_t getTypeAllocSizeInBits(Type *Ty) const { + TypeSize getTypeAllocSizeInBits(Type *Ty) const { return 8 * getTypeAllocSize(Ty); } /// Returns the minimum ABI-required alignment for the specified type. unsigned getABITypeAlignment(Type *Ty) const; + /// Helper function to return `Alignment` if it's set or the result of + /// `getABITypeAlignment(Ty)`, in any case the result is a valid alignment. + inline Align getValueOrABITypeAlignment(MaybeAlign Alignment, + Type *Ty) const { + return Alignment ? *Alignment : Align(getABITypeAlignment(Ty)); + } + /// Returns the minimum ABI-required alignment for an integer type of /// the specified bitwidth. - unsigned getABIIntegerTypeAlignment(unsigned BitWidth) const; + Align getABIIntegerTypeAlignment(unsigned BitWidth) const; /// Returns the preferred stack/global alignment for the specified /// type. @@ -493,10 +520,6 @@ public: /// This is always at least as good as the ABI alignment. unsigned getPrefTypeAlignment(Type *Ty) const; - /// Returns the preferred alignment for the specified type, returned as - /// log2 of the value (a shift amount). - unsigned getPreferredTypeAlignmentShift(Type *Ty) const; - /// Returns an integer type with size at least as big as that of a /// pointer in the given address space. IntegerType *getIntPtrType(LLVMContext &C, unsigned AddressSpace = 0) const; @@ -561,7 +584,7 @@ inline LLVMTargetDataRef wrap(const DataLayout *P) { /// based on the DataLayout structure. class StructLayout { uint64_t StructSize; - unsigned StructAlignment; + Align StructAlignment; unsigned IsPadded : 1; unsigned NumElements : 31; uint64_t MemberOffsets[1]; // variable sized array! @@ -571,7 +594,7 @@ public: uint64_t getSizeInBits() const { return 8 * StructSize; } - unsigned getAlignment() const { return StructAlignment; } + Align getAlignment() const { return StructAlignment; } /// Returns whether the struct has padding or not between its fields. /// NB: Padding in nested element is not taken into account. @@ -598,13 +621,13 @@ private: // The implementation of this method is provided inline as it is particularly // well suited to constant folding when called on a specific Type subclass. -inline uint64_t DataLayout::getTypeSizeInBits(Type *Ty) const { +inline TypeSize DataLayout::getTypeSizeInBits(Type *Ty) const { assert(Ty->isSized() && "Cannot getTypeInfo() on a type that is unsized!"); switch (Ty->getTypeID()) { case Type::LabelTyID: - return getPointerSizeInBits(0); + return TypeSize::Fixed(getPointerSizeInBits(0)); case Type::PointerTyID: - return getPointerSizeInBits(Ty->getPointerAddressSpace()); + return TypeSize::Fixed(getPointerSizeInBits(Ty->getPointerAddressSpace())); case Type::ArrayTyID: { ArrayType *ATy = cast(Ty); return ATy->getNumElements() * @@ -612,26 +635,30 @@ inline uint64_t DataLayout::getTypeSizeInBits(Type *Ty) const { } case Type::StructTyID: // Get the layout annotation... which is lazily created on demand. 
- return getStructLayout(cast(Ty))->getSizeInBits(); + return TypeSize::Fixed( + getStructLayout(cast(Ty))->getSizeInBits()); case Type::IntegerTyID: - return Ty->getIntegerBitWidth(); + return TypeSize::Fixed(Ty->getIntegerBitWidth()); case Type::HalfTyID: - return 16; + return TypeSize::Fixed(16); case Type::FloatTyID: - return 32; + return TypeSize::Fixed(32); case Type::DoubleTyID: case Type::X86_MMXTyID: - return 64; + return TypeSize::Fixed(64); case Type::PPC_FP128TyID: case Type::FP128TyID: - return 128; + return TypeSize::Fixed(128); // In memory objects this is always aligned to a higher boundary, but // only 80 bits contain information. case Type::X86_FP80TyID: - return 80; + return TypeSize::Fixed(80); case Type::VectorTyID: { VectorType *VTy = cast(Ty); - return VTy->getNumElements() * getTypeSizeInBits(VTy->getElementType()); + auto EltCnt = VTy->getElementCount(); + uint64_t MinBits = EltCnt.Min * + getTypeSizeInBits(VTy->getElementType()).getFixedSize(); + return TypeSize(MinBits, EltCnt.Scalable); } default: llvm_unreachable("DataLayout::getTypeSizeInBits(): Unsupported type"); diff --git a/include/llvm/IR/DebugInfoFlags.def b/include/llvm/IR/DebugInfoFlags.def index 07e3d6bdc9e5..f90c580f10ef 100644 --- a/include/llvm/IR/DebugInfoFlags.def +++ b/include/llvm/IR/DebugInfoFlags.def @@ -31,7 +31,8 @@ HANDLE_DI_FLAG(2, Protected) HANDLE_DI_FLAG(3, Public) HANDLE_DI_FLAG((1 << 2), FwdDecl) HANDLE_DI_FLAG((1 << 3), AppleBlock) -HANDLE_DI_FLAG((1 << 4), BlockByrefStruct) +// Used to be BlockByRef, can be reused for anything except DICompositeType. +HANDLE_DI_FLAG((1 << 4), ReservedBit4) HANDLE_DI_FLAG((1 << 5), Virtual) HANDLE_DI_FLAG((1 << 6), Artificial) HANDLE_DI_FLAG((1 << 7), Explicit) @@ -42,8 +43,7 @@ HANDLE_DI_FLAG((1 << 11), Vector) HANDLE_DI_FLAG((1 << 12), StaticMember) HANDLE_DI_FLAG((1 << 13), LValueReference) HANDLE_DI_FLAG((1 << 14), RValueReference) -// 15 was formerly ExternalTypeRef, but this was never used. -HANDLE_DI_FLAG((1 << 15), Reserved) +HANDLE_DI_FLAG((1 << 15), ExportSymbols) HANDLE_DI_FLAG((1 << 16), SingleInheritance) HANDLE_DI_FLAG((2 << 16), MultipleInheritance) HANDLE_DI_FLAG((3 << 16), VirtualInheritance) diff --git a/include/llvm/IR/DebugInfoMetadata.h b/include/llvm/IR/DebugInfoMetadata.h index 9dc6dfbb0f68..28a59576b7c6 100644 --- a/include/llvm/IR/DebugInfoMetadata.h +++ b/include/llvm/IR/DebugInfoMetadata.h @@ -650,7 +650,6 @@ public: } bool isForwardDecl() const { return getFlags() & FlagFwdDecl; } bool isAppleBlockExtension() const { return getFlags() & FlagAppleBlock; } - bool isBlockByrefStruct() const { return getFlags() & FlagBlockByrefStruct; } bool isVirtual() const { return getFlags() & FlagVirtual; } bool isArtificial() const { return getFlags() & FlagArtificial; } bool isObjectPointer() const { return getFlags() & FlagObjectPointer; } @@ -668,6 +667,7 @@ public: } bool isBigEndian() const { return getFlags() & FlagBigEndian; } bool isLittleEndian() const { return getFlags() & FlagLittleEndian; } + bool getExportSymbols() const { return getFlags() & FlagExportSymbols; } static bool classof(const Metadata *MD) { switch (MD->getMetadataID()) { @@ -2569,7 +2569,7 @@ public: /// (This is the only configuration of entry values that is supported.) 
bool isEntryValue() const { return getNumElements() > 0 && - getElement(0) == dwarf::DW_OP_entry_value; + getElement(0) == dwarf::DW_OP_LLVM_entry_value; } }; diff --git a/include/llvm/IR/DerivedTypes.h b/include/llvm/IR/DerivedTypes.h index 3c1d4278905f..20097ef3f31a 100644 --- a/include/llvm/IR/DerivedTypes.h +++ b/include/llvm/IR/DerivedTypes.h @@ -23,7 +23,7 @@ #include "llvm/IR/Type.h" #include "llvm/Support/Casting.h" #include "llvm/Support/Compiler.h" -#include "llvm/Support/ScalableSize.h" +#include "llvm/Support/TypeSize.h" #include #include @@ -62,6 +62,11 @@ public: /// Get or create an IntegerType instance. static IntegerType *get(LLVMContext &C, unsigned NumBits); + /// Returns type twice as wide the input type. + IntegerType *getExtendedType() const { + return Type::getIntNTy(getContext(), 2 * getScalarSizeInBits()); + } + /// Get the number of bits in this IntegerType unsigned getBitWidth() const { return getSubclassData(); } @@ -470,21 +475,47 @@ public: /// This static method is like getInteger except that the element types are /// twice as wide as the elements in the input type. static VectorType *getExtendedElementVectorType(VectorType *VTy) { - unsigned EltBits = VTy->getElementType()->getPrimitiveSizeInBits(); - Type *EltTy = IntegerType::get(VTy->getContext(), EltBits * 2); - return VectorType::get(EltTy, VTy->getElementCount()); + assert(VTy->isIntOrIntVectorTy() && "VTy expected to be a vector of ints."); + auto *EltTy = cast(VTy->getElementType()); + return VectorType::get(EltTy->getExtendedType(), VTy->getElementCount()); } - /// This static method is like getInteger except that the element types are - /// half as wide as the elements in the input type. + // This static method gets a VectorType with the same number of elements as + // the input type, and the element type is an integer or float type which + // is half as wide as the elements in the input type. static VectorType *getTruncatedElementVectorType(VectorType *VTy) { - unsigned EltBits = VTy->getElementType()->getPrimitiveSizeInBits(); - assert((EltBits & 1) == 0 && - "Cannot truncate vector element with odd bit-width"); - Type *EltTy = IntegerType::get(VTy->getContext(), EltBits / 2); + Type *EltTy; + if (VTy->getElementType()->isFloatingPointTy()) { + switch(VTy->getElementType()->getTypeID()) { + case DoubleTyID: + EltTy = Type::getFloatTy(VTy->getContext()); + break; + case FloatTyID: + EltTy = Type::getHalfTy(VTy->getContext()); + break; + default: + llvm_unreachable("Cannot create narrower fp vector element type"); + } + } else { + unsigned EltBits = VTy->getElementType()->getPrimitiveSizeInBits(); + assert((EltBits & 1) == 0 && + "Cannot truncate vector element with odd bit-width"); + EltTy = IntegerType::get(VTy->getContext(), EltBits / 2); + } return VectorType::get(EltTy, VTy->getElementCount()); } + // This static method returns a VectorType with a smaller number of elements + // of a larger type than the input element type. For example, a <16 x i8> + // subdivided twice would return <4 x i32> + static VectorType *getSubdividedVectorType(VectorType *VTy, int NumSubdivs) { + for (int i = 0; i < NumSubdivs; ++i) { + VTy = VectorType::getDoubleElementsVectorType(VTy); + VTy = VectorType::getTruncatedElementVectorType(VTy); + } + return VTy; + } + /// This static method returns a VectorType with half as many elements as the /// input type and the same element type. 
static VectorType *getHalfElementsVectorType(VectorType *VTy) { @@ -540,6 +571,10 @@ bool Type::getVectorIsScalable() const { return cast(this)->isScalable(); } +ElementCount Type::getVectorElementCount() const { + return cast(this)->getElementCount(); +} + /// Class to represent pointers. class PointerType : public Type { explicit PointerType(Type *ElType, unsigned AddrSpace); @@ -577,6 +612,26 @@ public: } }; +Type *Type::getExtendedType() const { + assert( + isIntOrIntVectorTy() && + "Original type expected to be a vector of integers or a scalar integer."); + if (auto *VTy = dyn_cast(this)) + return VectorType::getExtendedElementVectorType( + const_cast(VTy)); + return cast(this)->getExtendedType(); +} + +Type *Type::getWithNewBitWidth(unsigned NewBitWidth) const { + assert( + isIntOrIntVectorTy() && + "Original type expected to be a vector of integers or a scalar integer."); + Type *NewType = getIntNTy(getContext(), NewBitWidth); + if (isVectorTy()) + NewType = VectorType::get(NewType, getVectorElementCount()); + return NewType; +} + unsigned Type::getPointerAddressSpace() const { return cast(getScalarType())->getAddressSpace(); } diff --git a/include/llvm/IR/DiagnosticInfo.h b/include/llvm/IR/DiagnosticInfo.h index 373663289dbd..ec469982d378 100644 --- a/include/llvm/IR/DiagnosticInfo.h +++ b/include/llvm/IR/DiagnosticInfo.h @@ -74,8 +74,10 @@ enum DiagnosticKind { DK_LastMachineRemark = DK_MachineOptimizationRemarkAnalysis, DK_MIRParser, DK_PGOProfile, + DK_MisExpect, DK_Unsupported, - DK_FirstPluginKind + DK_FirstPluginKind // Must be last value to work with + // getNextAvailablePluginDiagnosticKind }; /// Get the next available kind ID for a plugin diagnostic. @@ -663,7 +665,7 @@ public: private: /// The IR value (currently basic block) that the optimization operates on. /// This is currently used to provide run-time hotness information with PGO. - const Value *CodeRegion; + const Value *CodeRegion = nullptr; }; /// Diagnostic information for applied optimization remarks. @@ -1002,6 +1004,25 @@ public: void print(DiagnosticPrinter &DP) const override; }; +/// Diagnostic information for MisExpect analysis. +class DiagnosticInfoMisExpect : public DiagnosticInfoWithLocationBase { +public: + DiagnosticInfoMisExpect(const Instruction *Inst, Twine &Msg); + + /// \see DiagnosticInfo::print. + void print(DiagnosticPrinter &DP) const override; + + static bool classof(const DiagnosticInfo *DI) { + return DI->getKind() == DK_MisExpect; + } + + const Twine &getMsg() const { return Msg; } + +private: + /// Message to report. + const Twine &Msg; +}; + } // end namespace llvm #endif // LLVM_IR_DIAGNOSTICINFO_H diff --git a/include/llvm/IR/FixedMetadataKinds.def b/include/llvm/IR/FixedMetadataKinds.def new file mode 100644 index 000000000000..0e1ffef58672 --- /dev/null +++ b/include/llvm/IR/FixedMetadataKinds.def @@ -0,0 +1,43 @@ +/*===-- FixedMetadataKinds.def - Fixed metadata kind IDs -------*- C++ -*-=== *\ +|* +|* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +|* See https://llvm.org/LICENSE.txt for license information. +|* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +|* +\*===----------------------------------------------------------------------===*/ + +#ifndef LLVM_FIXED_MD_KIND +#error "LLVM_FIXED_MD_KIND(EnumID, Name, Value) is not defined." 
+#endif + +LLVM_FIXED_MD_KIND(MD_dbg, "dbg", 0) +LLVM_FIXED_MD_KIND(MD_tbaa, "tbaa", 1) +LLVM_FIXED_MD_KIND(MD_prof, "prof", 2) +LLVM_FIXED_MD_KIND(MD_fpmath, "fpmath", 3) +LLVM_FIXED_MD_KIND(MD_range, "range", 4) +LLVM_FIXED_MD_KIND(MD_tbaa_struct, "tbaa.struct", 5) +LLVM_FIXED_MD_KIND(MD_invariant_load, "invariant.load", 6) +LLVM_FIXED_MD_KIND(MD_alias_scope, "alias.scope", 7) +LLVM_FIXED_MD_KIND(MD_noalias, "noalias", 8) +LLVM_FIXED_MD_KIND(MD_nontemporal, "nontemporal", 9) +LLVM_FIXED_MD_KIND(MD_mem_parallel_loop_access, + "llvm.mem.parallel_loop_access", 10) +LLVM_FIXED_MD_KIND(MD_nonnull, "nonnull", 11) +LLVM_FIXED_MD_KIND(MD_dereferenceable, "dereferenceable", 12) +LLVM_FIXED_MD_KIND(MD_dereferenceable_or_null, "dereferenceable_or_null", 13) +LLVM_FIXED_MD_KIND(MD_make_implicit, "make.implicit", 14) +LLVM_FIXED_MD_KIND(MD_unpredictable, "unpredictable", 15) +LLVM_FIXED_MD_KIND(MD_invariant_group, "invariant.group", 16) +LLVM_FIXED_MD_KIND(MD_align, "align", 17) +LLVM_FIXED_MD_KIND(MD_loop, "llvm.loop", 18) +LLVM_FIXED_MD_KIND(MD_type, "type", 19) +LLVM_FIXED_MD_KIND(MD_section_prefix, "section_prefix", 20) +LLVM_FIXED_MD_KIND(MD_absolute_symbol, "absolute_symbol", 21) +LLVM_FIXED_MD_KIND(MD_associated, "associated", 22) +LLVM_FIXED_MD_KIND(MD_callees, "callees", 23) +LLVM_FIXED_MD_KIND(MD_irr_loop, "irr_loop", 24) +LLVM_FIXED_MD_KIND(MD_access_group, "llvm.access.group", 25) +LLVM_FIXED_MD_KIND(MD_callback, "callback", 26) +LLVM_FIXED_MD_KIND(MD_preserve_access_index, "llvm.preserve.access.index", 27) +LLVM_FIXED_MD_KIND(MD_misexpect, "misexpect", 28) +LLVM_FIXED_MD_KIND(MD_vcall_visibility, "vcall_visibility", 29) diff --git a/include/llvm/IR/Function.h b/include/llvm/IR/Function.h index 7fa61e12f431..d586a9460d2b 100644 --- a/include/llvm/IR/Function.h +++ b/include/llvm/IR/Function.h @@ -343,7 +343,10 @@ public: unsigned getFnStackAlignment() const { if (!hasFnAttribute(Attribute::StackAlignment)) return 0; - return AttributeSets.getStackAlignment(AttributeList::FunctionIndex); + if (const auto MA = + AttributeSets.getStackAlignment(AttributeList::FunctionIndex)) + return MA->value(); + return 0; } /// hasGC/getGC/setGC/clearGC - The name of the garbage collection algorithm @@ -433,7 +436,9 @@ public: /// Extract the alignment for a call or parameter (0=unknown). unsigned getParamAlignment(unsigned ArgNo) const { - return AttributeSets.getParamAlignment(ArgNo); + if (const auto MA = AttributeSets.getParamAlignment(ArgNo)) + return MA->value(); + return 0; } /// Extract the byval type for a parameter. @@ -710,6 +715,12 @@ public: return Arguments + NumArgs; } + Argument* getArg(unsigned i) const { + assert (i < NumArgs && "getArg() out of range!"); + CheckLazyArguments(); + return Arguments + i; + } + iterator_range args() { return make_range(arg_begin(), arg_end()); } diff --git a/include/llvm/IR/GlobalAlias.h b/include/llvm/IR/GlobalAlias.h index 3cd405701300..f2d9b9676ec9 100644 --- a/include/llvm/IR/GlobalAlias.h +++ b/include/llvm/IR/GlobalAlias.h @@ -58,10 +58,6 @@ public: // Linkage, Type, Parent and AddressSpace taken from the Aliasee. static GlobalAlias *create(const Twine &Name, GlobalValue *Aliasee); - void copyAttributesFrom(const GlobalValue *Src) { - GlobalValue::copyAttributesFrom(Src); - } - /// removeFromParent - This method unlinks 'this' from the containing module, /// but does not delete it. 
/// diff --git a/include/llvm/IR/GlobalIFunc.h b/include/llvm/IR/GlobalIFunc.h index bc0d3c053cce..0fdae917878a 100644 --- a/include/llvm/IR/GlobalIFunc.h +++ b/include/llvm/IR/GlobalIFunc.h @@ -46,10 +46,6 @@ public: LinkageTypes Linkage, const Twine &Name, Constant *Resolver, Module *Parent); - void copyAttributesFrom(const GlobalIFunc *Src) { - GlobalValue::copyAttributesFrom(Src); - } - /// This method unlinks 'this' from the containing module, but does not /// delete it. void removeFromParent(); diff --git a/include/llvm/IR/GlobalIndirectSymbol.h b/include/llvm/IR/GlobalIndirectSymbol.h index 8bc3f90b94aa..d996237aa3ef 100644 --- a/include/llvm/IR/GlobalIndirectSymbol.h +++ b/include/llvm/IR/GlobalIndirectSymbol.h @@ -42,6 +42,10 @@ public: /// Provide fast operand accessors DECLARE_TRANSPARENT_OPERAND_ACCESSORS(Constant); + void copyAttributesFrom(const GlobalValue *Src) { + GlobalValue::copyAttributesFrom(Src); + } + /// These methods set and retrieve indirect symbol. void setIndirectSymbol(Constant *Symbol) { setOperand(0, Symbol); @@ -54,9 +58,7 @@ public: static_cast(this)->getIndirectSymbol()); } - const GlobalObject *getBaseObject() const { - return dyn_cast(getIndirectSymbol()->stripInBoundsOffsets()); - } + const GlobalObject *getBaseObject() const; GlobalObject *getBaseObject() { return const_cast( static_cast(this)->getBaseObject()); diff --git a/include/llvm/IR/GlobalObject.h b/include/llvm/IR/GlobalObject.h index b8ab6140ebe7..ce81eb9f0719 100644 --- a/include/llvm/IR/GlobalObject.h +++ b/include/llvm/IR/GlobalObject.h @@ -17,6 +17,7 @@ #include "llvm/ADT/StringRef.h" #include "llvm/IR/GlobalValue.h" #include "llvm/IR/Value.h" +#include "llvm/Support/Alignment.h" #include #include @@ -27,6 +28,20 @@ class MDNode; class Metadata; class GlobalObject : public GlobalValue { +public: + // VCallVisibility - values for visibility metadata attached to vtables. This + // describes the scope in which a virtual call could end up being dispatched + // through this vtable. + enum VCallVisibility { + // Type is potentially visible to external code. + VCallVisibilityPublic = 0, + // Type is only visible to code which will be in the current Module after + // LTO internalization. + VCallVisibilityLinkageUnit = 1, + // Type is only visible to code in the current Module. + VCallVisibilityTranslationUnit = 2, + }; + protected: GlobalObject(Type *Ty, ValueTy VTy, Use *Ops, unsigned NumOps, LinkageTypes Linkage, const Twine &Name, @@ -58,9 +73,14 @@ public: unsigned getAlignment() const { unsigned Data = getGlobalValueSubClassData(); unsigned AlignmentData = Data & AlignmentMask; - return (1u << AlignmentData) >> 1; + MaybeAlign Align = decodeMaybeAlign(AlignmentData); + return Align ? Align->value() : 0; } - void setAlignment(unsigned Align); + + /// FIXME: Remove this setter once the migration to MaybeAlign is over. 
+ LLVM_ATTRIBUTE_DEPRECATED(void setAlignment(unsigned Align), + "Please use `void setAlignment(MaybeAlign Align)`"); + void setAlignment(MaybeAlign Align); unsigned getGlobalObjectSubClassData() const { unsigned ValueData = getGlobalValueSubClassData(); @@ -158,6 +178,8 @@ public: void copyMetadata(const GlobalObject *Src, unsigned Offset); void addTypeMetadata(unsigned Offset, Metadata *TypeID); + void addVCallVisibilityMetadata(VCallVisibility Visibility); + VCallVisibility getVCallVisibility() const; protected: void copyAttributesFrom(const GlobalObject *Src); diff --git a/include/llvm/IR/GlobalVariable.h b/include/llvm/IR/GlobalVariable.h index 2e2c8c477913..2c730bc312e4 100644 --- a/include/llvm/IR/GlobalVariable.h +++ b/include/llvm/IR/GlobalVariable.h @@ -243,6 +243,7 @@ public: bool hasImplicitSection() const { return getAttributes().hasAttribute("bss-section") || getAttributes().hasAttribute("data-section") || + getAttributes().hasAttribute("relro-section") || getAttributes().hasAttribute("rodata-section"); } diff --git a/include/llvm/IR/IRBuilder.h b/include/llvm/IR/IRBuilder.h index a74364dffb2e..d1ddb75cde9b 100644 --- a/include/llvm/IR/IRBuilder.h +++ b/include/llvm/IR/IRBuilder.h @@ -1461,7 +1461,7 @@ public: if (Value *V = foldConstant(Opc, LHS, RHS, Name)) return V; Instruction *BinOp = BinaryOperator::Create(Opc, LHS, RHS); if (isa(BinOp)) - BinOp = setFPAttrs(BinOp, FPMathTag, FMF); + setFPAttrs(BinOp, FPMathTag, FMF); return Insert(BinOp, Name); } @@ -1479,7 +1479,8 @@ public: CallInst *C = CreateIntrinsic(ID, {L->getType()}, {L, R, RoundingV, ExceptV}, nullptr, Name); - return cast(setFPAttrs(C, FPMathTag, UseFMF)); + setFPAttrs(C, FPMathTag, UseFMF); + return C; } Value *CreateNeg(Value *V, const Twine &Name = "", @@ -1504,7 +1505,7 @@ public: MDNode *FPMathTag = nullptr) { if (auto *VC = dyn_cast(V)) return Insert(Folder.CreateFNeg(VC), Name); - return Insert(setFPAttrs(BinaryOperator::CreateFNeg(V), FPMathTag, FMF), + return Insert(setFPAttrs(UnaryOperator::CreateFNeg(V), FPMathTag, FMF), Name); } @@ -1514,9 +1515,7 @@ public: const Twine &Name = "") { if (auto *VC = dyn_cast(V)) return Insert(Folder.CreateFNeg(VC), Name); - // TODO: This should return UnaryOperator::CreateFNeg(...) once we are - // confident that they are optimized sufficiently. 
- return Insert(setFPAttrs(BinaryOperator::CreateFNeg(V), nullptr, + return Insert(setFPAttrs(UnaryOperator::CreateFNeg(V), nullptr, FMFSource->getFastMathFlags()), Name); } @@ -1534,7 +1533,7 @@ public: return Insert(Folder.CreateUnOp(Opc, VC), Name); Instruction *UnOp = UnaryOperator::Create(Opc, V); if (isa(UnOp)) - UnOp = setFPAttrs(UnOp, FPMathTag, FMF); + setFPAttrs(UnOp, FPMathTag, FMF); return Insert(UnOp, Name); } @@ -1612,19 +1611,19 @@ public: LoadInst *CreateAlignedLoad(Type *Ty, Value *Ptr, unsigned Align, const char *Name) { LoadInst *LI = CreateLoad(Ty, Ptr, Name); - LI->setAlignment(Align); + LI->setAlignment(MaybeAlign(Align)); return LI; } LoadInst *CreateAlignedLoad(Type *Ty, Value *Ptr, unsigned Align, const Twine &Name = "") { LoadInst *LI = CreateLoad(Ty, Ptr, Name); - LI->setAlignment(Align); + LI->setAlignment(MaybeAlign(Align)); return LI; } LoadInst *CreateAlignedLoad(Type *Ty, Value *Ptr, unsigned Align, bool isVolatile, const Twine &Name = "") { LoadInst *LI = CreateLoad(Ty, Ptr, isVolatile, Name); - LI->setAlignment(Align); + LI->setAlignment(MaybeAlign(Align)); return LI; } @@ -1649,7 +1648,7 @@ public: StoreInst *CreateAlignedStore(Value *Val, Value *Ptr, unsigned Align, bool isVolatile = false) { StoreInst *SI = CreateStore(Val, Ptr, isVolatile); - SI->setAlignment(Align); + SI->setAlignment(MaybeAlign(Align)); return SI; } @@ -1913,11 +1912,17 @@ public: return V; } - Value *CreateFPToUI(Value *V, Type *DestTy, const Twine &Name = ""){ + Value *CreateFPToUI(Value *V, Type *DestTy, const Twine &Name = "") { + if (IsFPConstrained) + return CreateConstrainedFPCast(Intrinsic::experimental_constrained_fptoui, + V, DestTy, nullptr, Name); return CreateCast(Instruction::FPToUI, V, DestTy, Name); } - Value *CreateFPToSI(Value *V, Type *DestTy, const Twine &Name = ""){ + Value *CreateFPToSI(Value *V, Type *DestTy, const Twine &Name = "") { + if (IsFPConstrained) + return CreateConstrainedFPCast(Intrinsic::experimental_constrained_fptosi, + V, DestTy, nullptr, Name); return CreateCast(Instruction::FPToSI, V, DestTy, Name); } @@ -1931,10 +1936,17 @@ public: Value *CreateFPTrunc(Value *V, Type *DestTy, const Twine &Name = "") { + if (IsFPConstrained) + return CreateConstrainedFPCast( + Intrinsic::experimental_constrained_fptrunc, V, DestTy, nullptr, + Name); return CreateCast(Instruction::FPTrunc, V, DestTy, Name); } Value *CreateFPExt(Value *V, Type *DestTy, const Twine &Name = "") { + if (IsFPConstrained) + return CreateConstrainedFPCast(Intrinsic::experimental_constrained_fpext, + V, DestTy, nullptr, Name); return CreateCast(Instruction::FPExt, V, DestTy, Name); } @@ -2046,6 +2058,37 @@ public: return Insert(CastInst::CreateFPCast(V, DestTy), Name); } + CallInst *CreateConstrainedFPCast( + Intrinsic::ID ID, Value *V, Type *DestTy, + Instruction *FMFSource = nullptr, const Twine &Name = "", + MDNode *FPMathTag = nullptr, + Optional Rounding = None, + Optional Except = None) { + Value *ExceptV = getConstrainedFPExcept(Except); + + FastMathFlags UseFMF = FMF; + if (FMFSource) + UseFMF = FMFSource->getFastMathFlags(); + + CallInst *C; + switch (ID) { + default: { + Value *RoundingV = getConstrainedFPRounding(Rounding); + C = CreateIntrinsic(ID, {DestTy, V->getType()}, {V, RoundingV, ExceptV}, + nullptr, Name); + } break; + case Intrinsic::experimental_constrained_fpext: + case Intrinsic::experimental_constrained_fptoui: + case Intrinsic::experimental_constrained_fptosi: + C = CreateIntrinsic(ID, {DestTy, V->getType()}, {V, ExceptV}, nullptr, + Name); + break; + } + if 
(isa(C)) + setFPAttrs(C, FPMathTag, UseFMF); + return C; + } + // Provided to resolve 'CreateIntCast(Ptr, Ptr, "...")', giving a // compile time error, instead of converting the string to bool for the // isSigned parameter. @@ -2187,7 +2230,10 @@ public: PHINode *CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name = "") { - return Insert(PHINode::Create(Ty, NumReservedValues), Name); + PHINode *Phi = PHINode::Create(Ty, NumReservedValues); + if (isa(Phi)) + setFPAttrs(Phi, nullptr /* MDNode* */, FMF); + return Insert(Phi, Name); } CallInst *CreateCall(FunctionType *FTy, Value *Callee, @@ -2195,7 +2241,7 @@ public: MDNode *FPMathTag = nullptr) { CallInst *CI = CallInst::Create(FTy, Callee, Args, DefaultOperandBundles); if (isa(CI)) - CI = cast(setFPAttrs(CI, FPMathTag, FMF)); + setFPAttrs(CI, FPMathTag, FMF); return Insert(CI, Name); } @@ -2204,7 +2250,7 @@ public: const Twine &Name = "", MDNode *FPMathTag = nullptr) { CallInst *CI = CallInst::Create(FTy, Callee, Args, OpBundles); if (isa(CI)) - CI = cast(setFPAttrs(CI, FPMathTag, FMF)); + setFPAttrs(CI, FPMathTag, FMF); return Insert(CI, Name); } @@ -2252,7 +2298,7 @@ public: Sel = addBranchMetadata(Sel, Prof, Unpred); } if (isa(Sel)) - Sel = cast(setFPAttrs(Sel, nullptr /* MDNode* */, FMF)); + setFPAttrs(Sel, nullptr /* MDNode* */, FMF); return Insert(Sel, Name); } @@ -2454,7 +2500,7 @@ public: } Value *CreatePreserveArrayAccessIndex(Value *Base, unsigned Dimension, - unsigned LastIndex) { + unsigned LastIndex, MDNode *DbgInfo) { assert(isa(Base->getType()) && "Invalid Base ptr type for preserve.array.access.index."); auto *BaseType = Base->getType(); @@ -2476,6 +2522,8 @@ public: Value *DimV = getInt32(Dimension); CallInst *Fn = CreateCall(FnPreserveArrayAccessIndex, {Base, DimV, LastIndexV}); + if (DbgInfo) + Fn->setMetadata(LLVMContext::MD_preserve_access_index, DbgInfo); return Fn; } @@ -2493,7 +2541,8 @@ public: Value *DIIndex = getInt32(FieldIndex); CallInst *Fn = CreateCall(FnPreserveUnionAccessIndex, {Base, DIIndex}); - Fn->setMetadata(LLVMContext::MD_preserve_access_index, DbgInfo); + if (DbgInfo) + Fn->setMetadata(LLVMContext::MD_preserve_access_index, DbgInfo); return Fn; } @@ -2516,7 +2565,8 @@ public: Value *DIIndex = getInt32(FieldIndex); CallInst *Fn = CreateCall(FnPreserveStructAccessIndex, {Base, GEPIndex, DIIndex}); - Fn->setMetadata(LLVMContext::MD_preserve_access_index, DbgInfo); + if (DbgInfo) + Fn->setMetadata(LLVMContext::MD_preserve_access_index, DbgInfo); return Fn; } diff --git a/include/llvm/IR/InlineAsm.h b/include/llvm/IR/InlineAsm.h index 2aac807623a9..72d8ad1501ae 100644 --- a/include/llvm/IR/InlineAsm.h +++ b/include/llvm/IR/InlineAsm.h @@ -244,6 +244,7 @@ public: Constraint_m, Constraint_o, Constraint_v, + Constraint_A, Constraint_Q, Constraint_R, Constraint_S, diff --git a/include/llvm/IR/InstrTypes.h b/include/llvm/IR/InstrTypes.h index ca419b50da6b..7fb94e9d8c22 100644 --- a/include/llvm/IR/InstrTypes.h +++ b/include/llvm/IR/InstrTypes.h @@ -975,7 +975,7 @@ public: static Type* makeCmpResultType(Type* opnd_type) { if (VectorType* vt = dyn_cast(opnd_type)) { return VectorType::get(Type::getInt1Ty(opnd_type->getContext()), - vt->getNumElements()); + vt->getElementCount()); } return Type::getInt1Ty(opnd_type->getContext()); } @@ -1567,11 +1567,17 @@ public: } /// Extract the alignment of the return value. 
- unsigned getRetAlignment() const { return Attrs.getRetAlignment(); } + unsigned getRetAlignment() const { + if (const auto MA = Attrs.getRetAlignment()) + return MA->value(); + return 0; + } /// Extract the alignment for a call or parameter (0=unknown). unsigned getParamAlignment(unsigned ArgNo) const { - return Attrs.getParamAlignment(ArgNo); + if (const auto MA = Attrs.getParamAlignment(ArgNo)) + return MA->value(); + return 0; } /// Extract the byval type for a call or parameter. diff --git a/include/llvm/IR/Instruction.h b/include/llvm/IR/Instruction.h index 6a9a74bd16f0..803f6977b32c 100644 --- a/include/llvm/IR/Instruction.h +++ b/include/llvm/IR/Instruction.h @@ -229,6 +229,16 @@ public: return hasMetadataHashEntry(); } + /// Return true if this instruction has the given type of metadata attached. + bool hasMetadata(unsigned KindID) const { + return getMetadata(KindID) != nullptr; + } + + /// Return true if this instruction has the given type of metadata attached. + bool hasMetadata(StringRef Kind) const { + return getMetadata(Kind) != nullptr; + } + /// Get the metadata of given kind attached to this Instruction. /// If the metadata is not found then return null. MDNode *getMetadata(unsigned KindID) const { diff --git a/include/llvm/IR/Instructions.h b/include/llvm/IR/Instructions.h index 215ce45c7b75..fa980df03ef0 100644 --- a/include/llvm/IR/Instructions.h +++ b/include/llvm/IR/Instructions.h @@ -110,9 +110,11 @@ public: /// Return the alignment of the memory that is being allocated by the /// instruction. unsigned getAlignment() const { - return (1u << (getSubclassDataFromInstruction() & 31)) >> 1; + if (const auto MA = decodeMaybeAlign(getSubclassDataFromInstruction() & 31)) + return MA->value(); + return 0; } - void setAlignment(unsigned Align); + void setAlignment(MaybeAlign Align); /// Return true if this alloca is in the entry block of the function and is a /// constant size. 
If so, the code generator will fold it into the @@ -182,15 +184,15 @@ public: LoadInst(Type *Ty, Value *Ptr, const Twine &NameStr, bool isVolatile, BasicBlock *InsertAtEnd); LoadInst(Type *Ty, Value *Ptr, const Twine &NameStr, bool isVolatile, - unsigned Align, Instruction *InsertBefore = nullptr); + MaybeAlign Align, Instruction *InsertBefore = nullptr); LoadInst(Type *Ty, Value *Ptr, const Twine &NameStr, bool isVolatile, - unsigned Align, BasicBlock *InsertAtEnd); + MaybeAlign Align, BasicBlock *InsertAtEnd); LoadInst(Type *Ty, Value *Ptr, const Twine &NameStr, bool isVolatile, - unsigned Align, AtomicOrdering Order, + MaybeAlign Align, AtomicOrdering Order, SyncScope::ID SSID = SyncScope::System, Instruction *InsertBefore = nullptr); LoadInst(Type *Ty, Value *Ptr, const Twine &NameStr, bool isVolatile, - unsigned Align, AtomicOrdering Order, SyncScope::ID SSID, + MaybeAlign Align, AtomicOrdering Order, SyncScope::ID SSID, BasicBlock *InsertAtEnd); // Deprecated [opaque pointer types] @@ -209,20 +211,20 @@ public: BasicBlock *InsertAtEnd) : LoadInst(Ptr->getType()->getPointerElementType(), Ptr, NameStr, isVolatile, InsertAtEnd) {} - LoadInst(Value *Ptr, const Twine &NameStr, bool isVolatile, unsigned Align, + LoadInst(Value *Ptr, const Twine &NameStr, bool isVolatile, MaybeAlign Align, Instruction *InsertBefore = nullptr) : LoadInst(Ptr->getType()->getPointerElementType(), Ptr, NameStr, isVolatile, Align, InsertBefore) {} - LoadInst(Value *Ptr, const Twine &NameStr, bool isVolatile, unsigned Align, + LoadInst(Value *Ptr, const Twine &NameStr, bool isVolatile, MaybeAlign Align, BasicBlock *InsertAtEnd) : LoadInst(Ptr->getType()->getPointerElementType(), Ptr, NameStr, isVolatile, Align, InsertAtEnd) {} - LoadInst(Value *Ptr, const Twine &NameStr, bool isVolatile, unsigned Align, + LoadInst(Value *Ptr, const Twine &NameStr, bool isVolatile, MaybeAlign Align, AtomicOrdering Order, SyncScope::ID SSID = SyncScope::System, Instruction *InsertBefore = nullptr) : LoadInst(Ptr->getType()->getPointerElementType(), Ptr, NameStr, isVolatile, Align, Order, SSID, InsertBefore) {} - LoadInst(Value *Ptr, const Twine &NameStr, bool isVolatile, unsigned Align, + LoadInst(Value *Ptr, const Twine &NameStr, bool isVolatile, MaybeAlign Align, AtomicOrdering Order, SyncScope::ID SSID, BasicBlock *InsertAtEnd) : LoadInst(Ptr->getType()->getPointerElementType(), Ptr, NameStr, isVolatile, Align, Order, SSID, InsertAtEnd) {} @@ -238,10 +240,13 @@ public: /// Return the alignment of the access that is being performed. unsigned getAlignment() const { - return (1 << ((getSubclassDataFromInstruction() >> 1) & 31)) >> 1; + if (const auto MA = + decodeMaybeAlign((getSubclassDataFromInstruction() >> 1) & 31)) + return MA->value(); + return 0; } - void setAlignment(unsigned Align); + void setAlignment(MaybeAlign Align); /// Returns the ordering constraint of this load instruction. 
AtomicOrdering getOrdering() const { @@ -332,17 +337,15 @@ public: StoreInst(Value *Val, Value *Ptr, bool isVolatile = false, Instruction *InsertBefore = nullptr); StoreInst(Value *Val, Value *Ptr, bool isVolatile, BasicBlock *InsertAtEnd); - StoreInst(Value *Val, Value *Ptr, bool isVolatile, - unsigned Align, Instruction *InsertBefore = nullptr); - StoreInst(Value *Val, Value *Ptr, bool isVolatile, - unsigned Align, BasicBlock *InsertAtEnd); - StoreInst(Value *Val, Value *Ptr, bool isVolatile, - unsigned Align, AtomicOrdering Order, - SyncScope::ID SSID = SyncScope::System, + StoreInst(Value *Val, Value *Ptr, bool isVolatile, MaybeAlign Align, Instruction *InsertBefore = nullptr); - StoreInst(Value *Val, Value *Ptr, bool isVolatile, - unsigned Align, AtomicOrdering Order, SyncScope::ID SSID, + StoreInst(Value *Val, Value *Ptr, bool isVolatile, MaybeAlign Align, BasicBlock *InsertAtEnd); + StoreInst(Value *Val, Value *Ptr, bool isVolatile, MaybeAlign Align, + AtomicOrdering Order, SyncScope::ID SSID = SyncScope::System, + Instruction *InsertBefore = nullptr); + StoreInst(Value *Val, Value *Ptr, bool isVolatile, MaybeAlign Align, + AtomicOrdering Order, SyncScope::ID SSID, BasicBlock *InsertAtEnd); // allocate space for exactly two operands void *operator new(size_t s) { @@ -363,10 +366,13 @@ public: /// Return the alignment of the access that is being performed unsigned getAlignment() const { - return (1 << ((getSubclassDataFromInstruction() >> 1) & 31)) >> 1; + if (const auto MA = + decodeMaybeAlign((getSubclassDataFromInstruction() >> 1) & 31)) + return MA->value(); + return 0; } - void setAlignment(unsigned Align); + void setAlignment(MaybeAlign Align); /// Returns the ordering constraint of this store instruction. AtomicOrdering getOrdering() const { @@ -1764,6 +1770,10 @@ public: void setTrueValue(Value *V) { Op<1>() = V; } void setFalseValue(Value *V) { Op<2>() = V; } + /// Swap the true and false values of the select instruction. + /// This doesn't swap prof metadata. + void swapValues() { Op<1>().swap(Op<2>()); } + /// Return a string if the specified operands are invalid /// for a select operation, otherwise return null. static const char *areInvalidOperands(Value *Cond, Value *True, Value *False); @@ -3455,16 +3465,7 @@ public: class SwitchInstProfUpdateWrapper { SwitchInst &SI; Optional > Weights = None; - - // Sticky invalid state is needed to safely ignore operations with prof data - // in cases where SwitchInstProfUpdateWrapper is created from SwitchInst - // with inconsistent prof data. TODO: once we fix all prof data - // inconsistencies we can turn invalid state to assertions. - enum { - Invalid, - Initialized, - Changed - } State = Invalid; + bool Changed = false; protected: static MDNode *getProfBranchWeightsMD(const SwitchInst &SI); @@ -3482,7 +3483,7 @@ public: SwitchInstProfUpdateWrapper(SwitchInst &SI) : SI(SI) { init(); } ~SwitchInstProfUpdateWrapper() { - if (State == Changed) + if (Changed) SI.setMetadata(LLVMContext::MD_prof, buildProfBranchWeightsMD()); } @@ -3938,6 +3939,9 @@ class CallBrInst : public CallBase { ArrayRef IndirectDests, ArrayRef Args, ArrayRef Bundles, const Twine &NameStr); + /// Should the Indirect Destinations change, scan + update the Arg list. + void updateArgBlockAddresses(unsigned i, BasicBlock *B); + /// Compute the number of operands to allocate. 
static int ComputeNumOperands(int NumArgs, int NumIndirectDests, int NumBundleInputs = 0) { @@ -4075,7 +4079,7 @@ public: return cast(*(&Op<-1>() - getNumIndirectDests() - 1)); } BasicBlock *getIndirectDest(unsigned i) const { - return cast(*(&Op<-1>() - getNumIndirectDests() + i)); + return cast_or_null(*(&Op<-1>() - getNumIndirectDests() + i)); } SmallVector getIndirectDests() const { SmallVector IndirectDests; @@ -4087,6 +4091,7 @@ public: *(&Op<-1>() - getNumIndirectDests() - 1) = reinterpret_cast(B); } void setIndirectDest(unsigned i, BasicBlock *B) { + updateArgBlockAddresses(i, B); *(&Op<-1>() - getNumIndirectDests() + i) = reinterpret_cast(B); } @@ -4096,11 +4101,10 @@ public: return i == 0 ? getDefaultDest() : getIndirectDest(i - 1); } - void setSuccessor(unsigned idx, BasicBlock *NewSucc) { - assert(idx < getNumIndirectDests() + 1 && + void setSuccessor(unsigned i, BasicBlock *NewSucc) { + assert(i < getNumIndirectDests() + 1 && "Successor # out of range for callbr!"); - *(&Op<-1>() - getNumIndirectDests() -1 + idx) = - reinterpret_cast(NewSucc); + return i == 0 ? setDefaultDest(NewSucc) : setIndirectDest(i - 1, NewSucc); } unsigned getNumSuccessors() const { return getNumIndirectDests() + 1; } @@ -5251,31 +5255,38 @@ public: /// A helper function that returns the pointer operand of a load or store /// instruction. Returns nullptr if not load or store. -inline Value *getLoadStorePointerOperand(Value *V) { +inline const Value *getLoadStorePointerOperand(const Value *V) { if (auto *Load = dyn_cast(V)) return Load->getPointerOperand(); if (auto *Store = dyn_cast(V)) return Store->getPointerOperand(); return nullptr; } +inline Value *getLoadStorePointerOperand(Value *V) { + return const_cast( + getLoadStorePointerOperand(static_cast(V))); +} /// A helper function that returns the pointer operand of a load, store /// or GEP instruction. Returns nullptr if not load, store, or GEP. -inline Value *getPointerOperand(Value *V) { +inline const Value *getPointerOperand(const Value *V) { if (auto *Ptr = getLoadStorePointerOperand(V)) return Ptr; if (auto *Gep = dyn_cast(V)) return Gep->getPointerOperand(); return nullptr; } +inline Value *getPointerOperand(Value *V) { + return const_cast(getPointerOperand(static_cast(V))); +} /// A helper function that returns the alignment of load or store instruction. 
-inline unsigned getLoadStoreAlignment(Value *I) { +inline MaybeAlign getLoadStoreAlignment(Value *I) { assert((isa(I) || isa(I)) && "Expected Load or Store instruction"); if (auto *LI = dyn_cast(I)) - return LI->getAlignment(); - return cast(I)->getAlignment(); + return MaybeAlign(LI->getAlignment()); + return MaybeAlign(cast(I)->getAlignment()); } /// A helper function that returns the address space of the pointer operand of diff --git a/include/llvm/IR/IntrinsicInst.h b/include/llvm/IR/IntrinsicInst.h index 438bdb29b706..c989b4a2e72a 100644 --- a/include/llvm/IR/IntrinsicInst.h +++ b/include/llvm/IR/IntrinsicInst.h @@ -259,6 +259,8 @@ namespace llvm { case Intrinsic::experimental_constrained_fdiv: case Intrinsic::experimental_constrained_frem: case Intrinsic::experimental_constrained_fma: + case Intrinsic::experimental_constrained_fptosi: + case Intrinsic::experimental_constrained_fptoui: case Intrinsic::experimental_constrained_fptrunc: case Intrinsic::experimental_constrained_fpext: case Intrinsic::experimental_constrained_sqrt: @@ -271,12 +273,16 @@ namespace llvm { case Intrinsic::experimental_constrained_log: case Intrinsic::experimental_constrained_log10: case Intrinsic::experimental_constrained_log2: + case Intrinsic::experimental_constrained_lrint: + case Intrinsic::experimental_constrained_llrint: case Intrinsic::experimental_constrained_rint: case Intrinsic::experimental_constrained_nearbyint: case Intrinsic::experimental_constrained_maxnum: case Intrinsic::experimental_constrained_minnum: case Intrinsic::experimental_constrained_ceil: case Intrinsic::experimental_constrained_floor: + case Intrinsic::experimental_constrained_lround: + case Intrinsic::experimental_constrained_llround: case Intrinsic::experimental_constrained_round: case Intrinsic::experimental_constrained_trunc: return true; @@ -405,11 +411,11 @@ namespace llvm { setArgOperand(ARG_DEST, Ptr); } - void setDestAlignment(unsigned Align) { + void setDestAlignment(unsigned Alignment) { removeParamAttr(ARG_DEST, Attribute::Alignment); - if (Align > 0) - addParamAttr(ARG_DEST, - Attribute::getWithAlignment(getContext(), Align)); + if (Alignment > 0) + addParamAttr(ARG_DEST, Attribute::getWithAlignment(getContext(), + Align(Alignment))); } void setLength(Value *L) { @@ -454,11 +460,12 @@ namespace llvm { BaseCL::setArgOperand(ARG_SOURCE, Ptr); } - void setSourceAlignment(unsigned Align) { + void setSourceAlignment(unsigned Alignment) { BaseCL::removeParamAttr(ARG_SOURCE, Attribute::Alignment); - if (Align > 0) - BaseCL::addParamAttr(ARG_SOURCE, Attribute::getWithAlignment( - BaseCL::getContext(), Align)); + if (Alignment > 0) + BaseCL::addParamAttr(ARG_SOURCE, + Attribute::getWithAlignment(BaseCL::getContext(), + Align(Alignment))); } }; diff --git a/include/llvm/IR/Intrinsics.h b/include/llvm/IR/Intrinsics.h index f38f92022d21..9e4ebd915afc 100644 --- a/include/llvm/IR/Intrinsics.h +++ b/include/llvm/IR/Intrinsics.h @@ -100,7 +100,8 @@ namespace Intrinsic { Integer, Vector, Pointer, Struct, Argument, ExtendArgument, TruncArgument, HalfVecArgument, SameVecWidthArgument, PtrToArgument, PtrToElt, VecOfAnyPtrsToElt, - VecElementArgument + VecElementArgument, ScalableVecArgument, Subdivide2Argument, + Subdivide4Argument, VecOfBitcastsToInt } Kind; union { @@ -125,14 +126,17 @@ namespace Intrinsic { assert(Kind == Argument || Kind == ExtendArgument || Kind == TruncArgument || Kind == HalfVecArgument || Kind == SameVecWidthArgument || Kind == PtrToArgument || - Kind == PtrToElt || Kind == VecElementArgument); + Kind == 
PtrToElt || Kind == VecElementArgument || + Kind == Subdivide2Argument || Kind == Subdivide4Argument || + Kind == VecOfBitcastsToInt); return Argument_Info >> 3; } ArgKind getArgumentKind() const { assert(Kind == Argument || Kind == ExtendArgument || Kind == TruncArgument || Kind == HalfVecArgument || Kind == SameVecWidthArgument || Kind == PtrToArgument || - Kind == VecElementArgument); + Kind == VecElementArgument || Kind == Subdivide2Argument || + Kind == Subdivide4Argument || Kind == VecOfBitcastsToInt); return (ArgKind)(Argument_Info & 7); } diff --git a/include/llvm/IR/Intrinsics.td b/include/llvm/IR/Intrinsics.td index d660f8278437..7a0263f88c2a 100644 --- a/include/llvm/IR/Intrinsics.td +++ b/include/llvm/IR/Intrinsics.td @@ -63,6 +63,12 @@ class NoCapture : IntrinsicProperty { int ArgNo = argNo; } +// NoAlias - The specified argument pointer is not aliasing other "noalias" pointer +// arguments of the intrinsic wrt. the intrinsic scope. +class NoAlias : IntrinsicProperty { + int ArgNo = argNo; +} + // Returned - The specified argument is always the return value of the // intrinsic. class Returned : IntrinsicProperty { @@ -181,6 +187,16 @@ class LLVMVectorElementType : LLVMMatchType; // vector type, but change the element count to be half as many class LLVMHalfElementsVectorType : LLVMMatchType; +// Match the type of another intrinsic parameter that is expected to be a +// vector type (i.e. ) but with each element subdivided to +// form a vector with more elements that are smaller than the original. +class LLVMSubdivide2VectorType : LLVMMatchType; +class LLVMSubdivide4VectorType : LLVMMatchType; + +// Match the element count and bit width of another intrinsic parameter, but +// change the element type to an integer. +class LLVMVectorOfBitcastsToInt : LLVMMatchType; + def llvm_void_ty : LLVMType; let isAny = 1 in { def llvm_any_ty : LLVMType; @@ -407,9 +423,9 @@ def int_objc_arc_annotation_bottomup_bbend : Intrinsic<[], //===--------------------- Code Generator Intrinsics ----------------------===// // def int_returnaddress : Intrinsic<[llvm_ptr_ty], [llvm_i32_ty], [IntrNoMem, ImmArg<0>]>; -def int_addressofreturnaddress : Intrinsic<[llvm_ptr_ty], [], [IntrNoMem]>; -def int_frameaddress : Intrinsic<[llvm_ptr_ty], [llvm_i32_ty], [IntrNoMem, ImmArg<0>]>; -def int_sponentry : Intrinsic<[llvm_ptr_ty], [], [IntrNoMem]>; +def int_addressofreturnaddress : Intrinsic<[llvm_anyptr_ty], [], [IntrNoMem]>; +def int_frameaddress : Intrinsic<[llvm_anyptr_ty], [llvm_i32_ty], [IntrNoMem, ImmArg<0>]>; +def int_sponentry : Intrinsic<[llvm_anyptr_ty], [], [IntrNoMem]>; def int_read_register : Intrinsic<[llvm_anyint_ty], [llvm_metadata_ty], [IntrReadMem], "llvm.read_register">; def int_write_register : Intrinsic<[], [llvm_metadata_ty, llvm_anyint_ty], @@ -451,8 +467,8 @@ def int_thread_pointer : Intrinsic<[llvm_ptr_ty], [], [IntrNoMem]>, // from being reordered overly much with respect to nearby access to the same // memory while not impeding optimization. 
def int_prefetch - : Intrinsic<[], [ llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty ], - [ IntrInaccessibleMemOrArgMemOnly, ReadOnly<0>, NoCapture<0>, + : Intrinsic<[], [ llvm_anyptr_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty ], + [ IntrInaccessibleMemOrArgMemOnly, IntrWillReturn, ReadOnly<0>, NoCapture<0>, ImmArg<1>, ImmArg<2>]>; def int_pcmarker : Intrinsic<[], [llvm_i32_ty]>; @@ -460,7 +476,7 @@ def int_readcyclecounter : Intrinsic<[llvm_i64_ty]>; // The assume intrinsic is marked as arbitrarily writing so that proper // control dependencies will be maintained. -def int_assume : Intrinsic<[], [llvm_i1_ty], []>; +def int_assume : Intrinsic<[], [llvm_i1_ty], [IntrWillReturn]>; // Stack Protector Intrinsic - The stackprotector intrinsic writes the stack // guard to the correct place on the stack frame. @@ -493,23 +509,23 @@ def int_instrprof_value_profile : Intrinsic<[], def int_memcpy : Intrinsic<[], [llvm_anyptr_ty, llvm_anyptr_ty, llvm_anyint_ty, llvm_i1_ty], - [IntrArgMemOnly, NoCapture<0>, NoCapture<1>, - WriteOnly<0>, ReadOnly<1>, ImmArg<3>]>; + [IntrArgMemOnly, IntrWillReturn, NoCapture<0>, NoCapture<1>, + NoAlias<0>, NoAlias<1>, WriteOnly<0>, ReadOnly<1>, ImmArg<3>]>; def int_memmove : Intrinsic<[], [llvm_anyptr_ty, llvm_anyptr_ty, llvm_anyint_ty, llvm_i1_ty], - [IntrArgMemOnly, NoCapture<0>, NoCapture<1>, + [IntrArgMemOnly, IntrWillReturn, NoCapture<0>, NoCapture<1>, ReadOnly<1>, ImmArg<3>]>; def int_memset : Intrinsic<[], [llvm_anyptr_ty, llvm_i8_ty, llvm_anyint_ty, llvm_i1_ty], - [IntrArgMemOnly, NoCapture<0>, WriteOnly<0>, + [IntrArgMemOnly, IntrWillReturn, NoCapture<0>, WriteOnly<0>, ImmArg<3>]>; // FIXME: Add version of these floating point intrinsics which allow non-default // rounding modes and FP exception handling. -let IntrProperties = [IntrNoMem, IntrSpeculatable] in { +let IntrProperties = [IntrNoMem, IntrSpeculatable, IntrWillReturn] in { def int_fma : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>]>; @@ -551,19 +567,19 @@ let IntrProperties = [IntrNoMem, IntrSpeculatable] in { def int_minnum : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>], - [IntrNoMem, IntrSpeculatable, Commutative] + [IntrNoMem, IntrSpeculatable, IntrWillReturn, Commutative] >; def int_maxnum : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>], - [IntrNoMem, IntrSpeculatable, Commutative] + [IntrNoMem, IntrSpeculatable, IntrWillReturn, Commutative] >; def int_minimum : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>], - [IntrNoMem, IntrSpeculatable, Commutative] + [IntrNoMem, IntrSpeculatable, IntrWillReturn, Commutative] >; def int_maximum : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>], - [IntrNoMem, IntrSpeculatable, Commutative] + [IntrNoMem, IntrSpeculatable, IntrWillReturn, Commutative] >; // NOTE: these are internal interfaces. 
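On the IR side these property changes surface as attributes on the intrinsic declarations. A small sketch of how they can be observed, assuming the TableGen properties lower to the usual attributes (IntrWillReturn to willreturn, NoAlias<0>/NoAlias<1> to noalias on the destination and source pointers) and using placeholder names:

    #include "llvm/IR/Attributes.h"
    #include "llvm/IR/DerivedTypes.h"
    #include "llvm/IR/Function.h"
    #include "llvm/IR/Intrinsics.h"
    #include "llvm/IR/LLVMContext.h"
    #include "llvm/IR/Module.h"
    using namespace llvm;

    bool memcpyHasNewProperties(Module &M) {
      LLVMContext &Ctx = M.getContext();
      // llvm.memcpy is overloaded on the two pointer types and the length type,
      // e.g. llvm.memcpy.p0i8.p0i8.i64.
      Function *Memcpy = Intrinsic::getDeclaration(
          &M, Intrinsic::memcpy,
          {Type::getInt8PtrTy(Ctx), Type::getInt8PtrTy(Ctx), Type::getInt64Ty(Ctx)});
      return Memcpy->hasFnAttribute(Attribute::WillReturn) &&
             Memcpy->hasParamAttribute(0, Attribute::NoAlias) && // dst
             Memcpy->hasParamAttribute(1, Attribute::NoAlias);   // src
    }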
@@ -576,13 +592,13 @@ def int_siglongjmp : Intrinsic<[], [llvm_ptr_ty, llvm_i32_ty], [IntrNoReturn]>; def int_objectsize : Intrinsic<[llvm_anyint_ty], [llvm_anyptr_ty, llvm_i1_ty, llvm_i1_ty, llvm_i1_ty], - [IntrNoMem, IntrSpeculatable, ImmArg<1>, ImmArg<2>, ImmArg<3>]>, + [IntrNoMem, IntrSpeculatable, IntrWillReturn, ImmArg<1>, ImmArg<2>, ImmArg<3>]>, GCCBuiltin<"__builtin_object_size">; //===--------------- Constrained Floating Point Intrinsics ----------------===// // -let IntrProperties = [IntrInaccessibleMemOnly] in { +let IntrProperties = [IntrInaccessibleMemOnly, IntrWillReturn] in { def int_experimental_constrained_fadd : Intrinsic<[ llvm_anyfloat_ty ], [ LLVMMatchType<0>, LLVMMatchType<0>, @@ -616,6 +632,14 @@ let IntrProperties = [IntrInaccessibleMemOnly] in { llvm_metadata_ty, llvm_metadata_ty ]>; + def int_experimental_constrained_fptosi : Intrinsic<[ llvm_anyint_ty ], + [ llvm_anyfloat_ty, + llvm_metadata_ty ]>; + + def int_experimental_constrained_fptoui : Intrinsic<[ llvm_anyint_ty ], + [ llvm_anyfloat_ty, + llvm_metadata_ty ]>; + def int_experimental_constrained_fptrunc : Intrinsic<[ llvm_anyfloat_ty ], [ llvm_anyfloat_ty, llvm_metadata_ty, @@ -679,6 +703,14 @@ let IntrProperties = [IntrInaccessibleMemOnly] in { [ LLVMMatchType<0>, llvm_metadata_ty, llvm_metadata_ty ]>; + def int_experimental_constrained_lrint : Intrinsic<[ llvm_anyint_ty ], + [ llvm_anyfloat_ty, + llvm_metadata_ty, + llvm_metadata_ty ]>; + def int_experimental_constrained_llrint : Intrinsic<[ llvm_anyint_ty ], + [ llvm_anyfloat_ty, + llvm_metadata_ty, + llvm_metadata_ty ]>; def int_experimental_constrained_maxnum : Intrinsic<[ llvm_anyfloat_ty ], [ LLVMMatchType<0>, LLVMMatchType<0>, @@ -697,6 +729,12 @@ let IntrProperties = [IntrInaccessibleMemOnly] in { [ LLVMMatchType<0>, llvm_metadata_ty, llvm_metadata_ty ]>; + def int_experimental_constrained_lround : Intrinsic<[ llvm_anyint_ty ], + [ llvm_anyfloat_ty, + llvm_metadata_ty ]>; + def int_experimental_constrained_llround : Intrinsic<[ llvm_anyint_ty ], + [ llvm_anyfloat_ty, + llvm_metadata_ty ]>; def int_experimental_constrained_round : Intrinsic<[ llvm_anyfloat_ty ], [ LLVMMatchType<0>, llvm_metadata_ty, @@ -706,18 +744,19 @@ let IntrProperties = [IntrInaccessibleMemOnly] in { llvm_metadata_ty, llvm_metadata_ty ]>; } -// FIXME: Add intrinsics for fcmp, fptoui and fptosi. +// FIXME: Add intrinsic for fcmp. +// FIXME: Consider maybe adding intrinsics for sitofp, uitofp. //===------------------------- Expect Intrinsics --------------------------===// // def int_expect : Intrinsic<[llvm_anyint_ty], - [LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>; + [LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem, IntrWillReturn]>; //===-------------------- Bit Manipulation Intrinsics ---------------------===// // // None of these intrinsics accesses memory at all. 
-let IntrProperties = [IntrNoMem, IntrSpeculatable] in { +let IntrProperties = [IntrNoMem, IntrSpeculatable, IntrWillReturn] in { def int_bswap: Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>]>; def int_ctpop: Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>]>; def int_bitreverse : Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>]>; @@ -727,7 +766,7 @@ let IntrProperties = [IntrNoMem, IntrSpeculatable] in { [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>]>; } -let IntrProperties = [IntrNoMem, IntrSpeculatable, ImmArg<1>] in { +let IntrProperties = [IntrNoMem, IntrSpeculatable, IntrWillReturn, ImmArg<1>] in { def int_ctlz : Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, llvm_i1_ty]>; def int_cttz : Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, llvm_i1_ty]>; } @@ -739,7 +778,7 @@ let IntrProperties = [IntrNoMem, IntrSpeculatable, ImmArg<1>] in { // mean the optimizers can change them aggressively. Special handling // needed in a few places. These synthetic intrinsics have no // side-effects and just mark information about their operands. -let IntrProperties = [IntrNoMem, IntrSpeculatable] in { +let IntrProperties = [IntrNoMem, IntrSpeculatable, IntrWillReturn] in { def int_dbg_declare : Intrinsic<[], [llvm_metadata_ty, llvm_metadata_ty, @@ -796,21 +835,21 @@ def int_eh_sjlj_setup_dispatch : Intrinsic<[], []>; def int_var_annotation : Intrinsic<[], [llvm_ptr_ty, llvm_ptr_ty, llvm_ptr_ty, llvm_i32_ty], - [], "llvm.var.annotation">; + [IntrWillReturn], "llvm.var.annotation">; def int_ptr_annotation : Intrinsic<[LLVMAnyPointerType], [LLVMMatchType<0>, llvm_ptr_ty, llvm_ptr_ty, llvm_i32_ty], - [], "llvm.ptr.annotation">; + [IntrWillReturn], "llvm.ptr.annotation">; def int_annotation : Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, llvm_ptr_ty, llvm_ptr_ty, llvm_i32_ty], - [], "llvm.annotation">; + [IntrWillReturn], "llvm.annotation">; // Annotates the current program point with metadata strings which are emitted // as CodeView debug info records. This is expensive, as it disables inlining // and is modelled as having side effects. def int_codeview_annotation : Intrinsic<[], [llvm_metadata_ty], - [IntrInaccessibleMemOnly, IntrNoDuplicate], + [IntrInaccessibleMemOnly, IntrNoDuplicate, IntrWillReturn], "llvm.codeview.annotation">; //===------------------------ Trampoline Intrinsics -----------------------===// @@ -828,79 +867,77 @@ def int_adjust_trampoline : Intrinsic<[llvm_ptr_ty], [llvm_ptr_ty], // // Expose the carry flag from add operations on two integrals. 
-def int_sadd_with_overflow : Intrinsic<[llvm_anyint_ty, - LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>], - [LLVMMatchType<0>, LLVMMatchType<0>], - [IntrNoMem, IntrSpeculatable]>; -def int_uadd_with_overflow : Intrinsic<[llvm_anyint_ty, - LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>], - [LLVMMatchType<0>, LLVMMatchType<0>], - [IntrNoMem, IntrSpeculatable]>; - -def int_ssub_with_overflow : Intrinsic<[llvm_anyint_ty, - LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>], - [LLVMMatchType<0>, LLVMMatchType<0>], - [IntrNoMem, IntrSpeculatable]>; -def int_usub_with_overflow : Intrinsic<[llvm_anyint_ty, - LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>], - [LLVMMatchType<0>, LLVMMatchType<0>], - [IntrNoMem, IntrSpeculatable]>; - -def int_smul_with_overflow : Intrinsic<[llvm_anyint_ty, - LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>], - [LLVMMatchType<0>, LLVMMatchType<0>], - [IntrNoMem, IntrSpeculatable]>; -def int_umul_with_overflow : Intrinsic<[llvm_anyint_ty, - LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>], - [LLVMMatchType<0>, LLVMMatchType<0>], - [IntrNoMem, IntrSpeculatable]>; - +let IntrProperties = [IntrNoMem, IntrSpeculatable, IntrWillReturn] in { + def int_sadd_with_overflow : Intrinsic<[llvm_anyint_ty, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>], + [LLVMMatchType<0>, LLVMMatchType<0>]>; + def int_uadd_with_overflow : Intrinsic<[llvm_anyint_ty, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>], + [LLVMMatchType<0>, LLVMMatchType<0>]>; + + def int_ssub_with_overflow : Intrinsic<[llvm_anyint_ty, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>], + [LLVMMatchType<0>, LLVMMatchType<0>]>; + def int_usub_with_overflow : Intrinsic<[llvm_anyint_ty, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>], + [LLVMMatchType<0>, LLVMMatchType<0>]>; + + def int_smul_with_overflow : Intrinsic<[llvm_anyint_ty, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>], + [LLVMMatchType<0>, LLVMMatchType<0>]>; + def int_umul_with_overflow : Intrinsic<[llvm_anyint_ty, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>], + [LLVMMatchType<0>, LLVMMatchType<0>]>; +} //===------------------------- Saturation Arithmetic Intrinsics ---------------------===// // def int_sadd_sat : Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, LLVMMatchType<0>], - [IntrNoMem, IntrSpeculatable, Commutative]>; + [IntrNoMem, IntrSpeculatable, IntrWillReturn, Commutative]>; def int_uadd_sat : Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, LLVMMatchType<0>], - [IntrNoMem, IntrSpeculatable, Commutative]>; + [IntrNoMem, IntrSpeculatable, IntrWillReturn, Commutative]>; def int_ssub_sat : Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, LLVMMatchType<0>], - [IntrNoMem, IntrSpeculatable]>; + [IntrNoMem, IntrSpeculatable, IntrWillReturn]>; def int_usub_sat : Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, LLVMMatchType<0>], - [IntrNoMem, IntrSpeculatable]>; + [IntrNoMem, IntrSpeculatable, IntrWillReturn]>; //===------------------------- Fixed Point Arithmetic Intrinsics ---------------------===// // def int_smul_fix : Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty], - [IntrNoMem, IntrSpeculatable, Commutative, ImmArg<2>]>; + [IntrNoMem, IntrSpeculatable, IntrWillReturn, Commutative, ImmArg<2>]>; def int_umul_fix : Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty], - [IntrNoMem, IntrSpeculatable, Commutative, ImmArg<2>]>; + [IntrNoMem, IntrSpeculatable, IntrWillReturn, Commutative, ImmArg<2>]>; //===------------------- Fixed Point Saturation Arithmetic Intrinsics ----------------===// // def int_smul_fix_sat : Intrinsic<[llvm_anyint_ty], 
[LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty], - [IntrNoMem, IntrSpeculatable, Commutative, ImmArg<2>]>; + [IntrNoMem, IntrSpeculatable, IntrWillReturn, Commutative, ImmArg<2>]>; +def int_umul_fix_sat : Intrinsic<[llvm_anyint_ty], + [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty], + [IntrNoMem, IntrSpeculatable, IntrWillReturn, Commutative, ImmArg<2>]>; //===------------------------- Memory Use Markers -------------------------===// // def int_lifetime_start : Intrinsic<[], [llvm_i64_ty, llvm_anyptr_ty], - [IntrArgMemOnly, NoCapture<1>, ImmArg<0>]>; + [IntrArgMemOnly, IntrWillReturn, NoCapture<1>, ImmArg<0>]>; def int_lifetime_end : Intrinsic<[], [llvm_i64_ty, llvm_anyptr_ty], - [IntrArgMemOnly, NoCapture<1>, ImmArg<0>]>; + [IntrArgMemOnly, IntrWillReturn, NoCapture<1>, ImmArg<0>]>; def int_invariant_start : Intrinsic<[llvm_descriptor_ty], [llvm_i64_ty, llvm_anyptr_ty], - [IntrArgMemOnly, NoCapture<1>, ImmArg<0>]>; + [IntrArgMemOnly, IntrWillReturn, NoCapture<1>, ImmArg<0>]>; def int_invariant_end : Intrinsic<[], [llvm_descriptor_ty, llvm_i64_ty, llvm_anyptr_ty], - [IntrArgMemOnly, NoCapture<2>, ImmArg<1>]>; + [IntrArgMemOnly, IntrWillReturn, NoCapture<2>, ImmArg<1>]>; // launder.invariant.group can't be marked with 'readnone' (IntrNoMem), // because it would cause CSE of two barriers with the same argument. @@ -916,12 +953,12 @@ def int_invariant_end : Intrinsic<[], // might change in the future. def int_launder_invariant_group : Intrinsic<[llvm_anyptr_ty], [LLVMMatchType<0>], - [IntrInaccessibleMemOnly, IntrSpeculatable]>; + [IntrInaccessibleMemOnly, IntrSpeculatable, IntrWillReturn]>; def int_strip_invariant_group : Intrinsic<[llvm_anyptr_ty], [LLVMMatchType<0>], - [IntrSpeculatable, IntrNoMem]>; + [IntrSpeculatable, IntrNoMem, IntrWillReturn]>; //===------------------------ Stackmap Intrinsics -------------------------===// // @@ -964,6 +1001,14 @@ def int_coro_id : Intrinsic<[llvm_token_ty], [llvm_i32_ty, llvm_ptr_ty, llvm_ptr_ty, llvm_ptr_ty], [IntrArgMemOnly, IntrReadMem, ReadNone<1>, ReadOnly<2>, NoCapture<2>]>; +def int_coro_id_retcon : Intrinsic<[llvm_token_ty], + [llvm_i32_ty, llvm_i32_ty, llvm_ptr_ty, + llvm_ptr_ty, llvm_ptr_ty, llvm_ptr_ty], + []>; +def int_coro_id_retcon_once : Intrinsic<[llvm_token_ty], + [llvm_i32_ty, llvm_i32_ty, llvm_ptr_ty, + llvm_ptr_ty, llvm_ptr_ty, llvm_ptr_ty], + []>; def int_coro_alloc : Intrinsic<[llvm_i1_ty], [llvm_token_ty], []>; def int_coro_begin : Intrinsic<[llvm_ptr_ty], [llvm_token_ty, llvm_ptr_ty], [WriteOnly<1>]>; @@ -979,6 +1024,13 @@ def int_coro_size : Intrinsic<[llvm_anyint_ty], [], [IntrNoMem]>; def int_coro_save : Intrinsic<[llvm_token_ty], [llvm_ptr_ty], []>; def int_coro_suspend : Intrinsic<[llvm_i8_ty], [llvm_token_ty, llvm_i1_ty], []>; +def int_coro_suspend_retcon : Intrinsic<[llvm_any_ty], [llvm_vararg_ty], []>; +def int_coro_prepare_retcon : Intrinsic<[llvm_ptr_ty], [llvm_ptr_ty], + [IntrNoMem]>; +def int_coro_alloca_alloc : Intrinsic<[llvm_token_ty], + [llvm_anyint_ty, llvm_i32_ty], []>; +def int_coro_alloca_get : Intrinsic<[llvm_ptr_ty], [llvm_token_ty], []>; +def int_coro_alloca_free : Intrinsic<[], [llvm_token_ty], []>; def int_coro_param : Intrinsic<[llvm_i1_ty], [llvm_ptr_ty, llvm_ptr_ty], [IntrNoMem, ReadNone<0>, ReadNone<1>]>; @@ -1018,19 +1070,19 @@ def int_experimental_guard : Intrinsic<[], [llvm_i1_ty, llvm_vararg_ty], // Supports widenable conditions for guards represented as explicit branches. 
def int_experimental_widenable_condition : Intrinsic<[llvm_i1_ty], [], - [IntrInaccessibleMemOnly]>; + [IntrInaccessibleMemOnly, IntrWillReturn]>; // NOP: calls/invokes to this intrinsic are removed by codegen -def int_donothing : Intrinsic<[], [], [IntrNoMem]>; +def int_donothing : Intrinsic<[], [], [IntrNoMem, IntrWillReturn]>; // This instruction has no actual effect, though it is treated by the optimizer // has having opaque side effects. This may be inserted into loops to ensure // that they are not removed even if they turn out to be empty, for languages // which specify that infinite loops must be preserved. -def int_sideeffect : Intrinsic<[], [], [IntrInaccessibleMemOnly]>; +def int_sideeffect : Intrinsic<[], [], [IntrInaccessibleMemOnly, IntrWillReturn]>; -// Intrisics to support half precision floating point format -let IntrProperties = [IntrNoMem] in { +// Intrinsics to support half precision floating point format +let IntrProperties = [IntrNoMem, IntrWillReturn] in { def int_convert_to_fp16 : Intrinsic<[llvm_i16_ty], [llvm_anyfloat_ty]>; def int_convert_from_fp16 : Intrinsic<[llvm_anyfloat_ty], [llvm_i16_ty]>; } @@ -1041,7 +1093,11 @@ def int_clear_cache : Intrinsic<[], [llvm_ptr_ty, llvm_ptr_ty], [], "llvm.clear_cache">; // Intrinsic to detect whether its argument is a constant. -def int_is_constant : Intrinsic<[llvm_i1_ty], [llvm_any_ty], [IntrNoMem], "llvm.is.constant">; +def int_is_constant : Intrinsic<[llvm_i1_ty], [llvm_any_ty], [IntrNoMem, IntrWillReturn], "llvm.is.constant">; + +// Intrinsic to mask out bits of a pointer. +def int_ptrmask: Intrinsic<[llvm_anyptr_ty], [llvm_anyptr_ty, llvm_anyint_ty], + [IntrNoMem, IntrSpeculatable, IntrWillReturn]>; //===-------------------------- Masked Intrinsics -------------------------===// // @@ -1049,45 +1105,45 @@ def int_masked_store : Intrinsic<[], [llvm_anyvector_ty, LLVMAnyPointerType>, llvm_i32_ty, LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>], - [IntrArgMemOnly, ImmArg<2>]>; + [IntrArgMemOnly, IntrWillReturn, ImmArg<2>]>; def int_masked_load : Intrinsic<[llvm_anyvector_ty], [LLVMAnyPointerType>, llvm_i32_ty, LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, LLVMMatchType<0>], - [IntrReadMem, IntrArgMemOnly, ImmArg<1>]>; + [IntrReadMem, IntrArgMemOnly, IntrWillReturn, ImmArg<1>]>; def int_masked_gather: Intrinsic<[llvm_anyvector_ty], [LLVMVectorOfAnyPointersToElt<0>, llvm_i32_ty, LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, LLVMMatchType<0>], - [IntrReadMem, ImmArg<1>]>; + [IntrReadMem, IntrWillReturn, ImmArg<1>]>; def int_masked_scatter: Intrinsic<[], [llvm_anyvector_ty, LLVMVectorOfAnyPointersToElt<0>, llvm_i32_ty, LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>], - [ImmArg<2>]>; + [IntrWillReturn, ImmArg<2>]>; def int_masked_expandload: Intrinsic<[llvm_anyvector_ty], [LLVMPointerToElt<0>, LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, LLVMMatchType<0>], - [IntrReadMem]>; + [IntrReadMem, IntrWillReturn]>; def int_masked_compressstore: Intrinsic<[], [llvm_anyvector_ty, LLVMPointerToElt<0>, LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>], - [IntrArgMemOnly]>; + [IntrArgMemOnly, IntrWillReturn]>; // Test whether a pointer is associated with a type metadata identifier. def int_type_test : Intrinsic<[llvm_i1_ty], [llvm_ptr_ty, llvm_metadata_ty], - [IntrNoMem]>; + [IntrNoMem, IntrWillReturn]>; // Safely loads a function pointer from a virtual table pointer using type metadata. 
def int_type_checked_load : Intrinsic<[llvm_ptr_ty, llvm_i1_ty], [llvm_ptr_ty, llvm_i32_ty, llvm_metadata_ty], - [IntrNoMem]>; + [IntrNoMem, IntrWillReturn]>; // Create a branch funnel that implements an indirect call to a limited set of // callees. This needs to be a musttail call. @@ -1098,6 +1154,8 @@ def int_load_relative: Intrinsic<[llvm_ptr_ty], [llvm_ptr_ty, llvm_anyint_ty], def int_hwasan_check_memaccess : Intrinsic<[], [llvm_ptr_ty, llvm_ptr_ty, llvm_i32_ty], [IntrInaccessibleMemOnly, ImmArg<2>]>; +def int_hwasan_check_memaccess_shortgranules : + Intrinsic<[], [llvm_ptr_ty, llvm_ptr_ty, llvm_i32_ty], [IntrInaccessibleMemOnly, ImmArg<2>]>; // Xray intrinsics //===----------------------------------------------------------------------===// @@ -1121,7 +1179,7 @@ def int_memcpy_element_unordered_atomic llvm_anyptr_ty, llvm_anyptr_ty, llvm_anyint_ty, llvm_i32_ty ], [ - IntrArgMemOnly, NoCapture<0>, NoCapture<1>, WriteOnly<0>, + IntrArgMemOnly, IntrWillReturn, NoCapture<0>, NoCapture<1>, WriteOnly<0>, ReadOnly<1>, ImmArg<3> ]>; @@ -1132,58 +1190,47 @@ def int_memmove_element_unordered_atomic llvm_anyptr_ty, llvm_anyptr_ty, llvm_anyint_ty, llvm_i32_ty ], [ - IntrArgMemOnly, NoCapture<0>, NoCapture<1>, WriteOnly<0>, + IntrArgMemOnly, IntrWillReturn, NoCapture<0>, NoCapture<1>, WriteOnly<0>, ReadOnly<1>, ImmArg<3> ]>; // @llvm.memset.element.unordered.atomic.*(dest, value, length, elementsize) def int_memset_element_unordered_atomic : Intrinsic<[], [ llvm_anyptr_ty, llvm_i8_ty, llvm_anyint_ty, llvm_i32_ty ], - [ IntrArgMemOnly, NoCapture<0>, WriteOnly<0>, ImmArg<3> ]>; + [ IntrArgMemOnly, IntrWillReturn, NoCapture<0>, WriteOnly<0>, ImmArg<3> ]>; //===------------------------ Reduction Intrinsics ------------------------===// // -def int_experimental_vector_reduce_v2_fadd : Intrinsic<[llvm_anyfloat_ty], - [LLVMMatchType<0>, - llvm_anyvector_ty], - [IntrNoMem]>; -def int_experimental_vector_reduce_v2_fmul : Intrinsic<[llvm_anyfloat_ty], - [LLVMMatchType<0>, - llvm_anyvector_ty], - [IntrNoMem]>; -def int_experimental_vector_reduce_add : Intrinsic<[LLVMVectorElementType<0>], - [llvm_anyvector_ty], - [IntrNoMem]>; -def int_experimental_vector_reduce_mul : Intrinsic<[LLVMVectorElementType<0>], - [llvm_anyvector_ty], - [IntrNoMem]>; -def int_experimental_vector_reduce_and : Intrinsic<[LLVMVectorElementType<0>], - [llvm_anyvector_ty], - [IntrNoMem]>; -def int_experimental_vector_reduce_or : Intrinsic<[LLVMVectorElementType<0>], - [llvm_anyvector_ty], - [IntrNoMem]>; -def int_experimental_vector_reduce_xor : Intrinsic<[LLVMVectorElementType<0>], - [llvm_anyvector_ty], - [IntrNoMem]>; -def int_experimental_vector_reduce_smax : Intrinsic<[LLVMVectorElementType<0>], - [llvm_anyvector_ty], - [IntrNoMem]>; -def int_experimental_vector_reduce_smin : Intrinsic<[LLVMVectorElementType<0>], - [llvm_anyvector_ty], - [IntrNoMem]>; -def int_experimental_vector_reduce_umax : Intrinsic<[LLVMVectorElementType<0>], - [llvm_anyvector_ty], - [IntrNoMem]>; -def int_experimental_vector_reduce_umin : Intrinsic<[LLVMVectorElementType<0>], - [llvm_anyvector_ty], - [IntrNoMem]>; -def int_experimental_vector_reduce_fmax : Intrinsic<[LLVMVectorElementType<0>], - [llvm_anyvector_ty], - [IntrNoMem]>; -def int_experimental_vector_reduce_fmin : Intrinsic<[LLVMVectorElementType<0>], - [llvm_anyvector_ty], - [IntrNoMem]>; +let IntrProperties = [IntrNoMem, IntrWillReturn] in { + def int_experimental_vector_reduce_v2_fadd : Intrinsic<[llvm_anyfloat_ty], + [LLVMMatchType<0>, + llvm_anyvector_ty]>; + def 
int_experimental_vector_reduce_v2_fmul : Intrinsic<[llvm_anyfloat_ty], + [LLVMMatchType<0>, + llvm_anyvector_ty]>; + def int_experimental_vector_reduce_add : Intrinsic<[LLVMVectorElementType<0>], + [llvm_anyvector_ty]>; + def int_experimental_vector_reduce_mul : Intrinsic<[LLVMVectorElementType<0>], + [llvm_anyvector_ty]>; + def int_experimental_vector_reduce_and : Intrinsic<[LLVMVectorElementType<0>], + [llvm_anyvector_ty]>; + def int_experimental_vector_reduce_or : Intrinsic<[LLVMVectorElementType<0>], + [llvm_anyvector_ty]>; + def int_experimental_vector_reduce_xor : Intrinsic<[LLVMVectorElementType<0>], + [llvm_anyvector_ty]>; + def int_experimental_vector_reduce_smax : Intrinsic<[LLVMVectorElementType<0>], + [llvm_anyvector_ty]>; + def int_experimental_vector_reduce_smin : Intrinsic<[LLVMVectorElementType<0>], + [llvm_anyvector_ty]>; + def int_experimental_vector_reduce_umax : Intrinsic<[LLVMVectorElementType<0>], + [llvm_anyvector_ty]>; + def int_experimental_vector_reduce_umin : Intrinsic<[LLVMVectorElementType<0>], + [llvm_anyvector_ty]>; + def int_experimental_vector_reduce_fmax : Intrinsic<[LLVMVectorElementType<0>], + [llvm_anyvector_ty]>; + def int_experimental_vector_reduce_fmin : Intrinsic<[LLVMVectorElementType<0>], + [llvm_anyvector_ty]>; +} //===---------- Intrinsics to control hardware supported loops ----------===// diff --git a/include/llvm/IR/IntrinsicsAArch64.td b/include/llvm/IR/IntrinsicsAArch64.td index 832aca4fd30f..db01700f409f 100644 --- a/include/llvm/IR/IntrinsicsAArch64.td +++ b/include/llvm/IR/IntrinsicsAArch64.td @@ -691,7 +691,7 @@ def int_aarch64_crc32cx : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i64_ty], // Memory Tagging Extensions (MTE) Intrinsics let TargetPrefix = "aarch64" in { def int_aarch64_irg : Intrinsic<[llvm_ptr_ty], [llvm_ptr_ty, llvm_i64_ty], - [IntrInaccessibleMemOnly]>; + [IntrNoMem, IntrHasSideEffects]>; def int_aarch64_addg : Intrinsic<[llvm_ptr_ty], [llvm_ptr_ty, llvm_i64_ty], [IntrNoMem]>; def int_aarch64_gmi : Intrinsic<[llvm_i64_ty], [llvm_ptr_ty, llvm_i64_ty], @@ -707,7 +707,7 @@ def int_aarch64_subp : Intrinsic<[llvm_i64_ty], [llvm_ptr_ty, llvm_ptr_ty], // Generate a randomly tagged stack base pointer. def int_aarch64_irg_sp : Intrinsic<[llvm_ptr_ty], [llvm_i64_ty], - [IntrInaccessibleMemOnly]>; + [IntrNoMem, IntrHasSideEffects]>; // Transfer pointer tag with offset. 
// ptr1 = tagp(ptr0, baseptr, tag_offset) returns a pointer where @@ -733,3 +733,124 @@ def int_aarch64_settag_zero : Intrinsic<[], [llvm_ptr_ty, llvm_i64_ty], def int_aarch64_stgp : Intrinsic<[], [llvm_ptr_ty, llvm_i64_ty, llvm_i64_ty], [IntrWriteMem, IntrArgMemOnly, NoCapture<0>, WriteOnly<0>]>; } + +// Transactional Memory Extension (TME) Intrinsics +let TargetPrefix = "aarch64" in { +def int_aarch64_tstart : GCCBuiltin<"__builtin_arm_tstart">, + Intrinsic<[llvm_i64_ty]>; + +def int_aarch64_tcommit : GCCBuiltin<"__builtin_arm_tcommit">, Intrinsic<[]>; + +def int_aarch64_tcancel : GCCBuiltin<"__builtin_arm_tcancel">, + Intrinsic<[], [llvm_i64_ty], [ImmArg<0>]>; + +def int_aarch64_ttest : GCCBuiltin<"__builtin_arm_ttest">, + Intrinsic<[llvm_i64_ty], [], + [IntrNoMem, IntrHasSideEffects]>; +} + +def llvm_nxv2i1_ty : LLVMType; +def llvm_nxv4i1_ty : LLVMType; +def llvm_nxv8i1_ty : LLVMType; +def llvm_nxv16i1_ty : LLVMType; +def llvm_nxv16i8_ty : LLVMType; +def llvm_nxv4i32_ty : LLVMType; +def llvm_nxv2i64_ty : LLVMType; +def llvm_nxv8f16_ty : LLVMType; +def llvm_nxv4f32_ty : LLVMType; +def llvm_nxv2f64_ty : LLVMType; + +let TargetPrefix = "aarch64" in { // All intrinsics start with "llvm.aarch64.". + class AdvSIMD_Merged1VectorArg_Intrinsic + : Intrinsic<[llvm_anyvector_ty], + [LLVMMatchType<0>, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + LLVMMatchType<0>], + [IntrNoMem]>; + + class AdvSIMD_SVE_CNT_Intrinsic + : Intrinsic<[LLVMVectorOfBitcastsToInt<0>], + [LLVMVectorOfBitcastsToInt<0>, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_anyvector_ty], + [IntrNoMem]>; + + class AdvSIMD_SVE_Unpack_Intrinsic + : Intrinsic<[llvm_anyvector_ty], + [LLVMSubdivide2VectorType<0>], + [IntrNoMem]>; + + class AdvSIMD_SVE_PUNPKHI_Intrinsic + : Intrinsic<[LLVMHalfElementsVectorType<0>], + [llvm_anyvector_ty], + [IntrNoMem]>; + + class AdvSIMD_SVE_DOT_Intrinsic + : Intrinsic<[llvm_anyvector_ty], + [LLVMMatchType<0>, + LLVMSubdivide4VectorType<0>, + LLVMSubdivide4VectorType<0>], + [IntrNoMem]>; + + class AdvSIMD_SVE_DOT_Indexed_Intrinsic + : Intrinsic<[llvm_anyvector_ty], + [LLVMMatchType<0>, + LLVMSubdivide4VectorType<0>, + LLVMSubdivide4VectorType<0>, + llvm_i32_ty], + [IntrNoMem]>; + + // This class of intrinsics are not intended to be useful within LLVM IR but + // are instead here to support some of the more regid parts of the ACLE. + class Builtin_SVCVT + : GCCBuiltin<"__builtin_sve_" # name>, + Intrinsic<[OUT], [OUT, llvm_nxv16i1_ty, IN], [IntrNoMem]>; +} + +//===----------------------------------------------------------------------===// +// SVE + +let TargetPrefix = "aarch64" in { // All intrinsics start with "llvm.aarch64.". 
+ +// +// Integer arithmetic +// + +def int_aarch64_sve_abs : AdvSIMD_Merged1VectorArg_Intrinsic; +def int_aarch64_sve_neg : AdvSIMD_Merged1VectorArg_Intrinsic; + +def int_aarch64_sve_sdot : AdvSIMD_SVE_DOT_Intrinsic; +def int_aarch64_sve_sdot_lane : AdvSIMD_SVE_DOT_Indexed_Intrinsic; + +def int_aarch64_sve_udot : AdvSIMD_SVE_DOT_Intrinsic; +def int_aarch64_sve_udot_lane : AdvSIMD_SVE_DOT_Indexed_Intrinsic; + +// +// Counting bits +// + +def int_aarch64_sve_cnt : AdvSIMD_SVE_CNT_Intrinsic; + +// +// Permutations and selection +// + +def int_aarch64_sve_sunpkhi : AdvSIMD_SVE_Unpack_Intrinsic; +def int_aarch64_sve_sunpklo : AdvSIMD_SVE_Unpack_Intrinsic; + +def int_aarch64_sve_uunpkhi : AdvSIMD_SVE_Unpack_Intrinsic; +def int_aarch64_sve_uunpklo : AdvSIMD_SVE_Unpack_Intrinsic; + +// +// Floating-point comparisons +// + +def int_aarch64_sve_fcvtzs_i32f16 : Builtin_SVCVT<"svcvt_s32_f16_m", llvm_nxv4i32_ty, llvm_nxv8f16_ty>; + +// +// Predicate operations +// + +def int_aarch64_sve_punpkhi : AdvSIMD_SVE_PUNPKHI_Intrinsic; +def int_aarch64_sve_punpklo : AdvSIMD_SVE_PUNPKHI_Intrinsic; +} diff --git a/include/llvm/IR/IntrinsicsAMDGPU.td b/include/llvm/IR/IntrinsicsAMDGPU.td index 3982444b5401..ab6ee7f92dd1 100644 --- a/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/include/llvm/IR/IntrinsicsAMDGPU.td @@ -175,6 +175,7 @@ def int_amdgcn_implicit_buffer_ptr : // Set EXEC to the 64-bit value given. // This is always moved to the beginning of the basic block. +// FIXME: Should be mangled for wave size. def int_amdgcn_init_exec : Intrinsic<[], [llvm_i64_ty], // 64-bit literal constant [IntrConvergent, ImmArg<0>]>; @@ -185,7 +186,7 @@ def int_amdgcn_init_exec : Intrinsic<[], def int_amdgcn_init_exec_from_input : Intrinsic<[], [llvm_i32_ty, // 32-bit SGPR input llvm_i32_ty], // bit offset of the thread count - [IntrConvergent]>; + [IntrConvergent, ImmArg<1>]>; def int_amdgcn_wavefrontsize : GCCBuiltin<"__builtin_amdgcn_wavefrontsize">, @@ -199,12 +200,14 @@ def int_amdgcn_wavefrontsize : // The first parameter is s_sendmsg immediate (i16), // the second one is copied to m0 def int_amdgcn_s_sendmsg : GCCBuiltin<"__builtin_amdgcn_s_sendmsg">, - Intrinsic <[], [llvm_i32_ty, llvm_i32_ty], [ImmArg<0>, IntrInaccessibleMemOnly]>; + Intrinsic <[], [llvm_i32_ty, llvm_i32_ty], + [ImmArg<0>, IntrNoMem, IntrHasSideEffects]>; def int_amdgcn_s_sendmsghalt : GCCBuiltin<"__builtin_amdgcn_s_sendmsghalt">, - Intrinsic <[], [llvm_i32_ty, llvm_i32_ty], [ImmArg<0>, IntrInaccessibleMemOnly]>; + Intrinsic <[], [llvm_i32_ty, llvm_i32_ty], + [ImmArg<0>, IntrNoMem, IntrHasSideEffects]>; def int_amdgcn_s_barrier : GCCBuiltin<"__builtin_amdgcn_s_barrier">, - Intrinsic<[], [], [IntrConvergent]>; + Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrConvergent]>; def int_amdgcn_wave_barrier : GCCBuiltin<"__builtin_amdgcn_wave_barrier">, Intrinsic<[], [], [IntrConvergent]>; @@ -835,9 +838,6 @@ defset list AMDGPUImageDimAtomicIntrinsics = { defm int_amdgcn_image_atomic_and : AMDGPUImageDimAtomic<"ATOMIC_AND">; defm int_amdgcn_image_atomic_or : AMDGPUImageDimAtomic<"ATOMIC_OR">; defm int_amdgcn_image_atomic_xor : AMDGPUImageDimAtomic<"ATOMIC_XOR">; - - // TODO: INC/DEC are weird: they seem to have a vdata argument in hardware, - // even though it clearly shouldn't be needed defm int_amdgcn_image_atomic_inc : AMDGPUImageDimAtomic<"ATOMIC_INC">; defm int_amdgcn_image_atomic_dec : AMDGPUImageDimAtomic<"ATOMIC_DEC">; @@ -854,8 +854,8 @@ let TargetPrefix = "amdgcn" in { defset list AMDGPUBufferIntrinsics = { -class AMDGPUBufferLoad : Intrinsic < - 
[llvm_any_ty], +class AMDGPUBufferLoad : Intrinsic < + [data_ty], [llvm_v4i32_ty, // rsrc(SGPR) llvm_i32_ty, // vindex(VGPR) llvm_i32_ty, // offset(SGPR/VGPR/imm) @@ -863,7 +863,7 @@ class AMDGPUBufferLoad : Intrinsic < llvm_i1_ty], // slc(imm) [IntrReadMem, ImmArg<3>, ImmArg<4>], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<0>; -def int_amdgcn_buffer_load_format : AMDGPUBufferLoad; +def int_amdgcn_buffer_load_format : AMDGPUBufferLoad; def int_amdgcn_buffer_load : AMDGPUBufferLoad; def int_amdgcn_s_buffer_load : Intrinsic < @@ -874,9 +874,9 @@ def int_amdgcn_s_buffer_load : Intrinsic < [IntrNoMem, ImmArg<2>]>, AMDGPURsrcIntrinsic<0>; -class AMDGPUBufferStore : Intrinsic < +class AMDGPUBufferStore : Intrinsic < [], - [llvm_any_ty, // vdata(VGPR) + [data_ty, // vdata(VGPR) llvm_v4i32_ty, // rsrc(SGPR) llvm_i32_ty, // vindex(VGPR) llvm_i32_ty, // offset(SGPR/VGPR/imm) @@ -884,7 +884,7 @@ class AMDGPUBufferStore : Intrinsic < llvm_i1_ty], // slc(imm) [IntrWriteMem, ImmArg<4>, ImmArg<5>], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<1>; -def int_amdgcn_buffer_store_format : AMDGPUBufferStore; +def int_amdgcn_buffer_store_format : AMDGPUBufferStore; def int_amdgcn_buffer_store : AMDGPUBufferStore; // New buffer intrinsics with separate raw and struct variants. The raw @@ -894,56 +894,68 @@ def int_amdgcn_buffer_store : AMDGPUBufferStore; // and swizzling changes depending on whether idxen is set in the instruction. // These new instrinsics also keep the offset and soffset arguments separate as // they behave differently in bounds checking and swizzling. -class AMDGPURawBufferLoad : Intrinsic < - [llvm_any_ty], +class AMDGPURawBufferLoad : Intrinsic < + [data_ty], [llvm_v4i32_ty, // rsrc(SGPR) llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) - llvm_i32_ty], // cachepolicy(imm; bit 0 = glc, bit 1 = slc, bit 2 = dlc on gfx10+) + llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = glc, + // bit 1 = slc, + // bit 2 = dlc on gfx10+), + // swizzled buffer (bit 3 = swz)) [IntrReadMem, ImmArg<3>], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<0>; -def int_amdgcn_raw_buffer_load_format : AMDGPURawBufferLoad; +def int_amdgcn_raw_buffer_load_format : AMDGPURawBufferLoad; def int_amdgcn_raw_buffer_load : AMDGPURawBufferLoad; -class AMDGPUStructBufferLoad : Intrinsic < - [llvm_any_ty], +class AMDGPUStructBufferLoad : Intrinsic < + [data_ty], [llvm_v4i32_ty, // rsrc(SGPR) llvm_i32_ty, // vindex(VGPR) llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) - llvm_i32_ty], // cachepolicy(imm; bit 0 = glc, bit 1 = slc, bit 2 = dlc on gfx10+) + llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = glc, + // bit 1 = slc, + // bit 2 = dlc on gfx10+), + // swizzled buffer (bit 3 = swz)) [IntrReadMem, ImmArg<4>], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<0>; -def int_amdgcn_struct_buffer_load_format : AMDGPUStructBufferLoad; +def int_amdgcn_struct_buffer_load_format : AMDGPUStructBufferLoad; def int_amdgcn_struct_buffer_load : AMDGPUStructBufferLoad; -class AMDGPURawBufferStore : Intrinsic < +class AMDGPURawBufferStore : Intrinsic < [], - [llvm_any_ty, // vdata(VGPR) + [data_ty, // vdata(VGPR) llvm_v4i32_ty, // rsrc(SGPR) llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) - llvm_i32_ty], // cachepolicy(imm; 
bit 0 = glc, bit 1 = slc, bit 2 = dlc on gfx10+) + llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = glc, + // bit 1 = slc, + // bit 2 = dlc on gfx10+), + // swizzled buffer (bit 3 = swz)) [IntrWriteMem, ImmArg<4>], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<1>; -def int_amdgcn_raw_buffer_store_format : AMDGPURawBufferStore; +def int_amdgcn_raw_buffer_store_format : AMDGPURawBufferStore; def int_amdgcn_raw_buffer_store : AMDGPURawBufferStore; -class AMDGPUStructBufferStore : Intrinsic < +class AMDGPUStructBufferStore : Intrinsic < [], - [llvm_any_ty, // vdata(VGPR) + [data_ty, // vdata(VGPR) llvm_v4i32_ty, // rsrc(SGPR) llvm_i32_ty, // vindex(VGPR) llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) - llvm_i32_ty], // cachepolicy(imm; bit 0 = glc, bit 1 = slc, bit 2 = dlc on gfx10+) + llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = glc, + // bit 1 = slc, + // bit 2 = dlc on gfx10+), + // swizzled buffer (bit 3 = swz)) [IntrWriteMem, ImmArg<5>], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<1>; -def int_amdgcn_struct_buffer_store_format : AMDGPUStructBufferStore; +def int_amdgcn_struct_buffer_store_format : AMDGPUStructBufferStore; def int_amdgcn_struct_buffer_store : AMDGPUStructBufferStore; -class AMDGPURawBufferAtomic : Intrinsic < - [llvm_anyint_ty], +class AMDGPURawBufferAtomic : Intrinsic < + [data_ty], [LLVMMatchType<0>, // vdata(VGPR) llvm_v4i32_ty, // rsrc(SGPR) llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) @@ -961,6 +973,8 @@ def int_amdgcn_raw_buffer_atomic_umax : AMDGPURawBufferAtomic; def int_amdgcn_raw_buffer_atomic_and : AMDGPURawBufferAtomic; def int_amdgcn_raw_buffer_atomic_or : AMDGPURawBufferAtomic; def int_amdgcn_raw_buffer_atomic_xor : AMDGPURawBufferAtomic; +def int_amdgcn_raw_buffer_atomic_inc : AMDGPURawBufferAtomic; +def int_amdgcn_raw_buffer_atomic_dec : AMDGPURawBufferAtomic; def int_amdgcn_raw_buffer_atomic_cmpswap : Intrinsic< [llvm_anyint_ty], [LLVMMatchType<0>, // src(VGPR) @@ -972,8 +986,8 @@ def int_amdgcn_raw_buffer_atomic_cmpswap : Intrinsic< [ImmArg<5>], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<2, 0>; -class AMDGPUStructBufferAtomic : Intrinsic < - [llvm_anyint_ty], +class AMDGPUStructBufferAtomic : Intrinsic < + [data_ty], [LLVMMatchType<0>, // vdata(VGPR) llvm_v4i32_ty, // rsrc(SGPR) llvm_i32_ty, // vindex(VGPR) @@ -992,6 +1006,8 @@ def int_amdgcn_struct_buffer_atomic_umax : AMDGPUStructBufferAtomic; def int_amdgcn_struct_buffer_atomic_and : AMDGPUStructBufferAtomic; def int_amdgcn_struct_buffer_atomic_or : AMDGPUStructBufferAtomic; def int_amdgcn_struct_buffer_atomic_xor : AMDGPUStructBufferAtomic; +def int_amdgcn_struct_buffer_atomic_inc : AMDGPUStructBufferAtomic; +def int_amdgcn_struct_buffer_atomic_dec : AMDGPUStructBufferAtomic; def int_amdgcn_struct_buffer_atomic_cmpswap : Intrinsic< [llvm_anyint_ty], [LLVMMatchType<0>, // src(VGPR) @@ -1046,7 +1062,10 @@ def int_amdgcn_raw_tbuffer_load : Intrinsic < llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) llvm_i32_ty, // format(imm; bits 3..0 = dfmt, bits 6..4 = nfmt) - llvm_i32_ty], // cachepolicy(imm; bit 0 = glc, bit 1 = slc, bit 2 = dlc on gfx10+) + llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = glc, + // bit 1 = slc, + // bit 2 = dlc on gfx10+), + // swizzled buffer (bit 3 = swz)) [IntrReadMem, ImmArg<3>, ImmArg<4>], "", 
[SDNPMemOperand]>, AMDGPURsrcIntrinsic<0>; @@ -1057,7 +1076,10 @@ def int_amdgcn_raw_tbuffer_store : Intrinsic < llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) llvm_i32_ty, // format(imm; bits 3..0 = dfmt, bits 6..4 = nfmt) - llvm_i32_ty], // cachepolicy(imm; bit 0 = glc, bit 1 = slc, bit 2 = dlc on gfx10+) + llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = glc, + // bit 1 = slc, + // bit 2 = dlc on gfx10+), + // swizzled buffer (bit 3 = swz)) [IntrWriteMem, ImmArg<4>, ImmArg<5>], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<1>; @@ -1068,7 +1090,10 @@ def int_amdgcn_struct_tbuffer_load : Intrinsic < llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) llvm_i32_ty, // format(imm; bits 3..0 = dfmt, bits 6..4 = nfmt) - llvm_i32_ty], // cachepolicy(imm; bit 0 = glc, bit 1 = slc, bit 2 = dlc on gfx10+) + llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = glc, + // bit 1 = slc, + // bit 2 = dlc on gfx10+), + // swizzled buffer (bit 3 = swz)) [IntrReadMem, ImmArg<4>, ImmArg<5>], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<0>; @@ -1080,7 +1105,10 @@ def int_amdgcn_struct_tbuffer_store : Intrinsic < llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) llvm_i32_ty, // format(imm; bits 3..0 = dfmt, bits 6..4 = nfmt) - llvm_i32_ty], // cachepolicy(imm; bit 0 = glc, bit 1 = slc, bit 2 = dlc on gfx10+) + llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = glc, + // bit 1 = slc, + // bit 2 = dlc on gfx10+), + // swizzled buffer (bit 3 = swz)) [IntrWriteMem, ImmArg<5>, ImmArg<6>], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<1>; @@ -1431,6 +1459,13 @@ def int_amdgcn_wqm : Intrinsic<[llvm_any_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable] >; +// Copies the source value to the destination value, such that the source +// is computed as if the entire program were executed in WQM if any other +// program code executes in WQM. +def int_amdgcn_softwqm : Intrinsic<[llvm_any_ty], + [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable] +>; + // Return true if at least one thread within the pixel quad passes true into // the function. def int_amdgcn_wqm_vote : Intrinsic<[llvm_i1_ty], @@ -1459,6 +1494,18 @@ def int_amdgcn_set_inactive : LLVMMatchType<0>], // value for the inactive lanes to take [IntrNoMem, IntrConvergent]>; +// Return if the given flat pointer points to a local memory address. +def int_amdgcn_is_shared : GCCBuiltin<"__builtin_amdgcn_is_shared">, + Intrinsic<[llvm_i1_ty], [llvm_ptr_ty], + [IntrNoMem, IntrSpeculatable, NoCapture<0>] +>; + +// Return if the given flat pointer points to a prvate memory address. 
+def int_amdgcn_is_private : GCCBuiltin<"__builtin_amdgcn_is_private">, + Intrinsic<[llvm_i1_ty], [llvm_ptr_ty], + [IntrNoMem, IntrSpeculatable, NoCapture<0>] +>; + //===----------------------------------------------------------------------===// // CI+ Intrinsics //===----------------------------------------------------------------------===// diff --git a/include/llvm/IR/IntrinsicsARM.td b/include/llvm/IR/IntrinsicsARM.td index 4792af097d95..e13da6157e04 100644 --- a/include/llvm/IR/IntrinsicsARM.td +++ b/include/llvm/IR/IntrinsicsARM.td @@ -777,5 +777,14 @@ class Neon_Dot_Intrinsic def int_arm_neon_udot : Neon_Dot_Intrinsic; def int_arm_neon_sdot : Neon_Dot_Intrinsic; +def int_arm_vctp8 : Intrinsic<[llvm_v16i1_ty], [llvm_i32_ty], [IntrNoMem]>; +def int_arm_vctp16 : Intrinsic<[llvm_v8i1_ty], [llvm_i32_ty], [IntrNoMem]>; +def int_arm_vctp32 : Intrinsic<[llvm_v4i1_ty], [llvm_i32_ty], [IntrNoMem]>; +def int_arm_vctp64 : Intrinsic<[llvm_v2i1_ty], [llvm_i32_ty], [IntrNoMem]>; + +// GNU eabi mcount +def int_arm_gnu_eabi_mcount : Intrinsic<[], + [], + [IntrReadMem, IntrWriteMem]>; } // end TargetPrefix diff --git a/include/llvm/IR/IntrinsicsBPF.td b/include/llvm/IR/IntrinsicsBPF.td index d7595a2a7700..3618cc6a4128 100644 --- a/include/llvm/IR/IntrinsicsBPF.td +++ b/include/llvm/IR/IntrinsicsBPF.td @@ -20,4 +20,7 @@ let TargetPrefix = "bpf" in { // All intrinsics start with "llvm.bpf." Intrinsic<[llvm_i64_ty], [llvm_ptr_ty, llvm_i64_ty], [IntrReadMem]>; def int_bpf_pseudo : GCCBuiltin<"__builtin_bpf_pseudo">, Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty]>; + def int_bpf_preserve_field_info : GCCBuiltin<"__builtin_bpf_preserve_field_info">, + Intrinsic<[llvm_i32_ty], [llvm_anyptr_ty, llvm_i64_ty], + [IntrNoMem, ImmArg<1>]>; } diff --git a/include/llvm/IR/IntrinsicsMips.td b/include/llvm/IR/IntrinsicsMips.td index 6393a9ca35d5..bfcdd80a52d5 100644 --- a/include/llvm/IR/IntrinsicsMips.td +++ b/include/llvm/IR/IntrinsicsMips.td @@ -1260,16 +1260,16 @@ def int_mips_insve_d : GCCBuiltin<"__builtin_msa_insve_d">, def int_mips_ld_b : GCCBuiltin<"__builtin_msa_ld_b">, Intrinsic<[llvm_v16i8_ty], [llvm_ptr_ty, llvm_i32_ty], - [IntrReadMem, IntrArgMemOnly, ImmArg<1>]>; + [IntrReadMem, IntrArgMemOnly]>; def int_mips_ld_h : GCCBuiltin<"__builtin_msa_ld_h">, Intrinsic<[llvm_v8i16_ty], [llvm_ptr_ty, llvm_i32_ty], - [IntrReadMem, IntrArgMemOnly, ImmArg<1>]>; + [IntrReadMem, IntrArgMemOnly]>; def int_mips_ld_w : GCCBuiltin<"__builtin_msa_ld_w">, Intrinsic<[llvm_v4i32_ty], [llvm_ptr_ty, llvm_i32_ty], - [IntrReadMem, IntrArgMemOnly, ImmArg<1>]>; + [IntrReadMem, IntrArgMemOnly]>; def int_mips_ld_d : GCCBuiltin<"__builtin_msa_ld_d">, Intrinsic<[llvm_v2i64_ty], [llvm_ptr_ty, llvm_i32_ty], - [IntrReadMem, IntrArgMemOnly, ImmArg<1>]>; + [IntrReadMem, IntrArgMemOnly]>; def int_mips_ldi_b : GCCBuiltin<"__builtin_msa_ldi_b">, Intrinsic<[llvm_v16i8_ty], [llvm_i32_ty], [IntrNoMem, ImmArg<0>]>; @@ -1684,16 +1684,16 @@ def int_mips_srlri_d : GCCBuiltin<"__builtin_msa_srlri_d">, def int_mips_st_b : GCCBuiltin<"__builtin_msa_st_b">, Intrinsic<[], [llvm_v16i8_ty, llvm_ptr_ty, llvm_i32_ty], - [IntrArgMemOnly, ImmArg<2>]>; + [IntrArgMemOnly]>; def int_mips_st_h : GCCBuiltin<"__builtin_msa_st_h">, Intrinsic<[], [llvm_v8i16_ty, llvm_ptr_ty, llvm_i32_ty], - [IntrArgMemOnly, ImmArg<2>]>; + [IntrArgMemOnly]>; def int_mips_st_w : GCCBuiltin<"__builtin_msa_st_w">, Intrinsic<[], [llvm_v4i32_ty, llvm_ptr_ty, llvm_i32_ty], - [IntrArgMemOnly, ImmArg<2>]>; + [IntrArgMemOnly]>; def int_mips_st_d : GCCBuiltin<"__builtin_msa_st_d">, 
Intrinsic<[], [llvm_v2i64_ty, llvm_ptr_ty, llvm_i32_ty], - [IntrArgMemOnly, ImmArg<2>]>; + [IntrArgMemOnly]>; def int_mips_subs_s_b : GCCBuiltin<"__builtin_msa_subs_s_b">, Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>; diff --git a/include/llvm/IR/IntrinsicsNVVM.td b/include/llvm/IR/IntrinsicsNVVM.td index dba7dd76c4ff..0483d965ba64 100644 --- a/include/llvm/IR/IntrinsicsNVVM.td +++ b/include/llvm/IR/IntrinsicsNVVM.td @@ -276,6 +276,26 @@ class NVVM_MMA_SUPPORTED frags, string layout_a, string layout_b ); } +class SHFL_INFO { + string Suffix = !if(sync, "sync_", "") + # mode # "_" + # type + # !if(return_pred, "p", ""); + + string Name = "int_nvvm_shfl_" # Suffix; + string Builtin = "__nvvm_shfl_" # Suffix; + string IntrName = "llvm.nvvm.shfl." # !subst("_",".", Suffix); + list withGccBuiltin = !if(return_pred, [], [1]); + list withoutGccBuiltin = !if(return_pred, [1], []); + LLVMType OpType = !cond( + !eq(type,"i32"): llvm_i32_ty, + !eq(type,"f32"): llvm_float_ty); + list RetTy = !if(return_pred, [OpType, llvm_i1_ty], [OpType]); + list ArgsTy = !if(sync, + [llvm_i32_ty, OpType, llvm_i32_ty, llvm_i32_ty], + [OpType, llvm_i32_ty, llvm_i32_ty]); +} + let TargetPrefix = "nvvm" in { def int_nvvm_prmt : GCCBuiltin<"__nvvm_prmt">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], @@ -3955,90 +3975,27 @@ def int_nvvm_read_ptx_sreg_warpsize : PTXReadSRegIntrinsic_r32<"warpsize">; // // SHUFFLE // - -// shfl.down.b32 dest, val, offset, mask_and_clamp -def int_nvvm_shfl_down_i32 : - Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.shfl.down.i32">, - GCCBuiltin<"__nvvm_shfl_down_i32">; -def int_nvvm_shfl_down_f32 : - Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_i32_ty, llvm_i32_ty], - [IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.shfl.down.f32">, - GCCBuiltin<"__nvvm_shfl_down_f32">; - -// shfl.up.b32 dest, val, offset, mask_and_clamp -def int_nvvm_shfl_up_i32 : - Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.shfl.up.i32">, - GCCBuiltin<"__nvvm_shfl_up_i32">; -def int_nvvm_shfl_up_f32 : - Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_i32_ty, llvm_i32_ty], - [IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.shfl.up.f32">, - GCCBuiltin<"__nvvm_shfl_up_f32">; - -// shfl.bfly.b32 dest, val, offset, mask_and_clamp -def int_nvvm_shfl_bfly_i32 : - Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.shfl.bfly.i32">, - GCCBuiltin<"__nvvm_shfl_bfly_i32">; -def int_nvvm_shfl_bfly_f32 : - Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_i32_ty, llvm_i32_ty], - [IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.shfl.bfly.f32">, - GCCBuiltin<"__nvvm_shfl_bfly_f32">; - -// shfl.idx.b32 dest, val, lane, mask_and_clamp -def int_nvvm_shfl_idx_i32 : - Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.shfl.idx.i32">, - GCCBuiltin<"__nvvm_shfl_idx_i32">; -def int_nvvm_shfl_idx_f32 : - Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_i32_ty, llvm_i32_ty], - [IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.shfl.idx.f32">, - GCCBuiltin<"__nvvm_shfl_idx_f32">; - -// Synchronizing shfl variants available in CUDA-9. -// On sm_70 these don't have to be convergent, so we may eventually want to -// implement non-convergent variant of this intrinsic. 
- -// shfl.sync.down.b32 dest, threadmask, val, offset , mask_and_clamp -def int_nvvm_shfl_sync_down_i32 : - Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.shfl.sync.down.i32">, - GCCBuiltin<"__nvvm_shfl_sync_down_i32">; -def int_nvvm_shfl_sync_down_f32 : - Intrinsic<[llvm_float_ty], [llvm_i32_ty, llvm_float_ty, llvm_i32_ty, llvm_i32_ty], - [IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.shfl.sync.down.f32">, - GCCBuiltin<"__nvvm_shfl_sync_down_f32">; - -// shfl.sync.up.b32 dest, threadmask, val, offset, mask_and_clamp -def int_nvvm_shfl_sync_up_i32 : - Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.shfl.sync.up.i32">, - GCCBuiltin<"__nvvm_shfl_sync_up_i32">; -def int_nvvm_shfl_sync_up_f32 : - Intrinsic<[llvm_float_ty], [llvm_i32_ty, llvm_float_ty, llvm_i32_ty, llvm_i32_ty], - [IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.shfl.sync.up.f32">, - GCCBuiltin<"__nvvm_shfl_sync_up_f32">; - -// shfl.sync.bfly.b32 dest, threadmask, val, offset, mask_and_clamp -def int_nvvm_shfl_sync_bfly_i32 : - Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.shfl.sync.bfly.i32">, - GCCBuiltin<"__nvvm_shfl_sync_bfly_i32">; -def int_nvvm_shfl_sync_bfly_f32 : - Intrinsic<[llvm_float_ty], [llvm_i32_ty, llvm_float_ty, llvm_i32_ty, llvm_i32_ty], - [IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.shfl.sync.bfly.f32">, - GCCBuiltin<"__nvvm_shfl_sync_bfly_f32">; - -// shfl.sync.idx.b32 dest, threadmask, val, lane, mask_and_clamp -def int_nvvm_shfl_sync_idx_i32 : - Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.shfl.sync.idx.i32">, - GCCBuiltin<"__nvvm_shfl_sync_idx_i32">; -def int_nvvm_shfl_sync_idx_f32 : - Intrinsic<[llvm_float_ty], [llvm_i32_ty, llvm_float_ty, llvm_i32_ty, llvm_i32_ty], - [IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.shfl.sync.idx.f32">, - GCCBuiltin<"__nvvm_shfl_sync_idx_f32">; +// Generate intrinsics for all variants of shfl instruction. 
+foreach sync = [0, 1] in { + foreach mode = ["up", "down", "bfly", "idx"] in { + foreach type = ["i32", "f32"] in { + foreach return_pred = [0, 1] in { + foreach i = [SHFL_INFO] in { + foreach _ = i.withGccBuiltin in { + def i.Name : GCCBuiltin, + Intrinsic; + } + foreach _ = i.withoutGccBuiltin in { + def i.Name : Intrinsic; + } + } + } + } + } +} // // VOTE diff --git a/include/llvm/IR/IntrinsicsWebAssembly.td b/include/llvm/IR/IntrinsicsWebAssembly.td index 1b892727547d..810979b99934 100644 --- a/include/llvm/IR/IntrinsicsWebAssembly.td +++ b/include/llvm/IR/IntrinsicsWebAssembly.td @@ -23,6 +23,17 @@ def int_wasm_memory_grow : Intrinsic<[llvm_anyint_ty], [llvm_i32_ty, LLVMMatchType<0>], []>; +//===----------------------------------------------------------------------===// +// Trapping float-to-int conversions +//===----------------------------------------------------------------------===// + +def int_wasm_trunc_signed : Intrinsic<[llvm_anyint_ty], + [llvm_anyfloat_ty], + [IntrNoMem]>; +def int_wasm_trunc_unsigned : Intrinsic<[llvm_anyint_ty], + [llvm_anyfloat_ty], + [IntrNoMem]>; + //===----------------------------------------------------------------------===// // Saturating float-to-int conversions //===----------------------------------------------------------------------===// @@ -89,6 +100,10 @@ def int_wasm_atomic_notify: // SIMD intrinsics //===----------------------------------------------------------------------===// +def int_wasm_swizzle : + Intrinsic<[llvm_v16i8_ty], + [llvm_v16i8_ty, llvm_v16i8_ty], + [IntrNoMem, IntrSpeculatable]>; def int_wasm_sub_saturate_signed : Intrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>, LLVMMatchType<0>], @@ -109,6 +124,39 @@ def int_wasm_alltrue : Intrinsic<[llvm_i32_ty], [llvm_anyvector_ty], [IntrNoMem, IntrSpeculatable]>; +def int_wasm_qfma : + Intrinsic<[llvm_anyvector_ty], + [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], + [IntrNoMem, IntrSpeculatable]>; +def int_wasm_qfms : + Intrinsic<[llvm_anyvector_ty], + [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], + [IntrNoMem, IntrSpeculatable]>; +def int_wasm_narrow_signed : + Intrinsic<[llvm_anyvector_ty], + [llvm_anyvector_ty, LLVMMatchType<1>], + [IntrNoMem, IntrSpeculatable]>; +def int_wasm_narrow_unsigned : + Intrinsic<[llvm_anyvector_ty], + [llvm_anyvector_ty, LLVMMatchType<1>], + [IntrNoMem, IntrSpeculatable]>; +def int_wasm_widen_low_signed : + Intrinsic<[llvm_anyvector_ty], + [llvm_anyvector_ty], + [IntrNoMem, IntrSpeculatable]>; +def int_wasm_widen_high_signed : + Intrinsic<[llvm_anyvector_ty], + [llvm_anyvector_ty], + [IntrNoMem, IntrSpeculatable]>; +def int_wasm_widen_low_unsigned : + Intrinsic<[llvm_anyvector_ty], + [llvm_anyvector_ty], + [IntrNoMem, IntrSpeculatable]>; +def int_wasm_widen_high_unsigned : + Intrinsic<[llvm_anyvector_ty], + [llvm_anyvector_ty], + [IntrNoMem, IntrSpeculatable]>; + //===----------------------------------------------------------------------===// // Bulk memory intrinsics @@ -133,4 +181,14 @@ def int_wasm_tls_size : [], [IntrNoMem, IntrSpeculatable]>; +def int_wasm_tls_align : + Intrinsic<[llvm_anyint_ty], + [], + [IntrNoMem, IntrSpeculatable]>; + +def int_wasm_tls_base : + Intrinsic<[llvm_ptr_ty], + [], + [IntrReadMem]>; + } // TargetPrefix = "wasm" diff --git a/include/llvm/IR/IntrinsicsX86.td b/include/llvm/IR/IntrinsicsX86.td index 236d312d7d78..5796686dd79f 100644 --- a/include/llvm/IR/IntrinsicsX86.td +++ b/include/llvm/IR/IntrinsicsX86.td @@ -2091,16 +2091,20 @@ let TargetPrefix = "x86" in { // All intrinsics start with 
"llvm.x86.". Intrinsic<[llvm_ptr_ty], [], []>; def int_x86_lwpins32 : GCCBuiltin<"__builtin_ia32_lwpins32">, - Intrinsic<[llvm_i8_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>; + Intrinsic<[llvm_i8_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [ImmArg<2>]>; def int_x86_lwpins64 : GCCBuiltin<"__builtin_ia32_lwpins64">, - Intrinsic<[llvm_i8_ty], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], []>; + Intrinsic<[llvm_i8_ty], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], + [ImmArg<2>]>; def int_x86_lwpval32 : GCCBuiltin<"__builtin_ia32_lwpval32">, - Intrinsic<[], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>; + Intrinsic<[], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [ImmArg<2>]>; def int_x86_lwpval64 : GCCBuiltin<"__builtin_ia32_lwpval64">, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], []>; + Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], + [ImmArg<2>]>; } //===----------------------------------------------------------------------===// diff --git a/include/llvm/IR/LLVMContext.h b/include/llvm/IR/LLVMContext.h index c80504500418..91bd57dc5ac0 100644 --- a/include/llvm/IR/LLVMContext.h +++ b/include/llvm/IR/LLVMContext.h @@ -72,34 +72,9 @@ public: // Pinned metadata names, which always have the same value. This is a // compile-time performance optimization, not a correctness optimization. enum : unsigned { - MD_dbg = 0, // "dbg" - MD_tbaa = 1, // "tbaa" - MD_prof = 2, // "prof" - MD_fpmath = 3, // "fpmath" - MD_range = 4, // "range" - MD_tbaa_struct = 5, // "tbaa.struct" - MD_invariant_load = 6, // "invariant.load" - MD_alias_scope = 7, // "alias.scope" - MD_noalias = 8, // "noalias", - MD_nontemporal = 9, // "nontemporal" - MD_mem_parallel_loop_access = 10, // "llvm.mem.parallel_loop_access" - MD_nonnull = 11, // "nonnull" - MD_dereferenceable = 12, // "dereferenceable" - MD_dereferenceable_or_null = 13, // "dereferenceable_or_null" - MD_make_implicit = 14, // "make.implicit" - MD_unpredictable = 15, // "unpredictable" - MD_invariant_group = 16, // "invariant.group" - MD_align = 17, // "align" - MD_loop = 18, // "llvm.loop" - MD_type = 19, // "type" - MD_section_prefix = 20, // "section_prefix" - MD_absolute_symbol = 21, // "absolute_symbol" - MD_associated = 22, // "associated" - MD_callees = 23, // "callees" - MD_irr_loop = 24, // "irr_loop" - MD_access_group = 25, // "llvm.access.group" - MD_callback = 26, // "callback" - MD_preserve_access_index = 27, // "llvm.preserve.*.access.index" +#define LLVM_FIXED_MD_KIND(EnumID, Name, Value) EnumID = Value, +#include "llvm/IR/FixedMetadataKinds.def" +#undef LLVM_FIXED_MD_KIND }; /// Known operand bundle tag IDs, which always have the same value. All diff --git a/include/llvm/IR/MDBuilder.h b/include/llvm/IR/MDBuilder.h index 3a2b1bddf45d..11e2e2623257 100644 --- a/include/llvm/IR/MDBuilder.h +++ b/include/llvm/IR/MDBuilder.h @@ -16,6 +16,7 @@ #include "llvm/ADT/DenseSet.h" #include "llvm/ADT/StringRef.h" +#include "llvm/IR/Constants.h" #include "llvm/IR/GlobalValue.h" #include "llvm/Support/DataTypes.h" #include @@ -75,6 +76,10 @@ public: /// Return metadata containing the section prefix for a function. MDNode *createFunctionSectionPrefix(StringRef Prefix); + /// return metadata containing expected value + MDNode *createMisExpect(uint64_t Index, uint64_t LikelyWeight, + uint64_t UnlikelyWeight); + //===------------------------------------------------------------------===// // Range metadata. 
//===------------------------------------------------------------------===// diff --git a/include/llvm/IR/Metadata.h b/include/llvm/IR/Metadata.h index 7ca2540181ba..f62b1e246cca 100644 --- a/include/llvm/IR/Metadata.h +++ b/include/llvm/IR/Metadata.h @@ -601,7 +601,7 @@ dyn_extract_or_null(Y &&MD) { /// These are used to efficiently contain a byte sequence for metadata. /// MDString is always unnamed. class MDString : public Metadata { - friend class StringMapEntry; + friend class StringMapEntryStorage; StringMapEntry *Entry = nullptr; @@ -806,7 +806,7 @@ public: /// Ensure that this has RAUW support, and then return it. ReplaceableMetadataImpl *getOrCreateReplaceableUses() { if (!hasReplaceableUses()) - makeReplaceable(llvm::make_unique(getContext())); + makeReplaceable(std::make_unique(getContext())); return getReplaceableUses(); } diff --git a/include/llvm/IR/Module.h b/include/llvm/IR/Module.h index f458680cfe15..59331142766a 100644 --- a/include/llvm/IR/Module.h +++ b/include/llvm/IR/Module.h @@ -46,6 +46,7 @@ class FunctionType; class GVMaterializer; class LLVMContext; class MemoryBuffer; +class Pass; class RandomNumberGenerator; template class SmallPtrSetImpl; class StructType; diff --git a/include/llvm/IR/ModuleSummaryIndex.h b/include/llvm/IR/ModuleSummaryIndex.h index aacf8cfc089f..be60447abd87 100644 --- a/include/llvm/IR/ModuleSummaryIndex.h +++ b/include/llvm/IR/ModuleSummaryIndex.h @@ -119,7 +119,7 @@ class GlobalValueSummary; using GlobalValueSummaryList = std::vector>; -struct LLVM_ALIGNAS(8) GlobalValueSummaryInfo { +struct alignas(8) GlobalValueSummaryInfo { union NameOrGV { NameOrGV(bool HaveGVs) { if (HaveGVs) @@ -603,7 +603,7 @@ public: if (!TypeTests.empty() || !TypeTestAssumeVCalls.empty() || !TypeCheckedLoadVCalls.empty() || !TypeTestAssumeConstVCalls.empty() || !TypeCheckedLoadConstVCalls.empty()) - TIdInfo = llvm::make_unique(TypeIdInfo{ + TIdInfo = std::make_unique(TypeIdInfo{ std::move(TypeTests), std::move(TypeTestAssumeVCalls), std::move(TypeCheckedLoadVCalls), std::move(TypeTestAssumeConstVCalls), @@ -632,6 +632,8 @@ public: /// Return the list of pairs. ArrayRef calls() const { return CallGraphEdgeList; } + void addCall(EdgeTy E) { CallGraphEdgeList.push_back(E); } + /// Returns the list of type identifiers used by this function in /// llvm.type.test intrinsics other than by an llvm.assume intrinsic, /// represented as GUIDs. @@ -680,7 +682,7 @@ public: /// were unable to devirtualize a checked call. 
void addTypeTest(GlobalValue::GUID Guid) { if (!TIdInfo) - TIdInfo = llvm::make_unique(); + TIdInfo = std::make_unique(); TIdInfo->TypeTests.push_back(Guid); } @@ -780,7 +782,7 @@ public: void setVTableFuncs(VTableFuncList Funcs) { assert(!VTableFuncs); - VTableFuncs = llvm::make_unique(std::move(Funcs)); + VTableFuncs = std::make_unique(std::move(Funcs)); } ArrayRef vTableFuncs() const { @@ -1293,6 +1295,12 @@ public: return nullptr; } + TypeIdSummary *getTypeIdSummary(StringRef TypeId) { + return const_cast( + static_cast(this)->getTypeIdSummary( + TypeId)); + } + const std::map & typeIdCompatibleVtableMap() const { return TypeIdCompatibleVtableMap; @@ -1411,7 +1419,7 @@ template <> struct GraphTraits : public GraphTraits { static NodeRef getEntryNode(ModuleSummaryIndex *I) { std::unique_ptr Root = - make_unique(I->calculateCallGraphRoot()); + std::make_unique(I->calculateCallGraphRoot()); GlobalValueSummaryInfo G(I->haveGVs()); G.SummaryList.push_back(std::move(Root)); static auto P = diff --git a/include/llvm/IR/ModuleSummaryIndexYAML.h b/include/llvm/IR/ModuleSummaryIndexYAML.h index 26d9c43fabf1..4d4a67c75172 100644 --- a/include/llvm/IR/ModuleSummaryIndexYAML.h +++ b/include/llvm/IR/ModuleSummaryIndexYAML.h @@ -220,7 +220,7 @@ template <> struct CustomMappingTraits { V.emplace(RefGUID, /*IsAnalysis=*/false); Refs.push_back(ValueInfo(/*IsAnalysis=*/false, &*V.find(RefGUID))); } - Elem.SummaryList.push_back(llvm::make_unique( + Elem.SummaryList.push_back(std::make_unique( GlobalValueSummary::GVFlags( static_cast(FSum.Linkage), FSum.NotEligibleToImport, FSum.Live, FSum.IsLocal, FSum.CanAutoHide), diff --git a/include/llvm/IR/Operator.h b/include/llvm/IR/Operator.h index 8199c65ca8a0..037f5aed03ee 100644 --- a/include/llvm/IR/Operator.h +++ b/include/llvm/IR/Operator.h @@ -379,16 +379,25 @@ public: return false; switch (Opcode) { + case Instruction::FNeg: + case Instruction::FAdd: + case Instruction::FSub: + case Instruction::FMul: + case Instruction::FDiv: + case Instruction::FRem: + // FIXME: To clean up and correct the semantics of fast-math-flags, FCmp + // should not be treated as a math op, but the other opcodes should. + // This would make things consistent with Select/PHI (FP value type + // determines whether they are math ops and, therefore, capable of + // having fast-math-flags). 
case Instruction::FCmp: return true; - // non math FP Operators (no FMF) - case Instruction::ExtractElement: - case Instruction::ShuffleVector: - case Instruction::InsertElement: case Instruction::PHI: - return false; - default: + case Instruction::Select: + case Instruction::Call: return V->getType()->isFPOrFPVectorTy(); + default: + return false; } } }; diff --git a/include/llvm/IR/PassManager.h b/include/llvm/IR/PassManager.h index 37fe2a5b01ad..1e1f4a92f844 100644 --- a/include/llvm/IR/PassManager.h +++ b/include/llvm/IR/PassManager.h @@ -45,6 +45,7 @@ #include "llvm/IR/Module.h" #include "llvm/IR/PassInstrumentation.h" #include "llvm/IR/PassManagerInternal.h" +#include "llvm/Pass.h" #include "llvm/Support/Debug.h" #include "llvm/Support/TypeName.h" #include "llvm/Support/raw_ostream.h" @@ -418,7 +419,7 @@ template Args, - llvm::index_sequence) { + std::index_sequence) { (void)Args; return AM.template getResult(IR, std::get(Args)...); } @@ -435,7 +436,7 @@ getAnalysisResult(AnalysisManager &AM, IRUnitT &IR, std::tuple Args) { return (getAnalysisResultUnpackTuple< PassT, IRUnitT>)(AM, IR, Args, - llvm::index_sequence_for{}); + std::index_sequence_for{}); } } // namespace detail diff --git a/include/llvm/IR/PassManagerInternal.h b/include/llvm/IR/PassManagerInternal.h index 58198bf67b11..c602c0b5cc20 100644 --- a/include/llvm/IR/PassManagerInternal.h +++ b/include/llvm/IR/PassManagerInternal.h @@ -289,7 +289,7 @@ struct AnalysisPassModel : AnalysisPassConcept> run(IRUnitT &IR, AnalysisManager &AM, ExtraArgTs... ExtraArgs) override { - return llvm::make_unique( + return std::make_unique( Pass.run(IR, AM, std::forward(ExtraArgs)...)); } diff --git a/include/llvm/IR/PatternMatch.h b/include/llvm/IR/PatternMatch.h index 0f03d7cc56b8..2851b24c05ae 100644 --- a/include/llvm/IR/PatternMatch.h +++ b/include/llvm/IR/PatternMatch.h @@ -88,6 +88,25 @@ inline class_match m_Undef() { return class_match(); } /// Match an arbitrary Constant and ignore it. inline class_match m_Constant() { return class_match(); } +/// Match an arbitrary basic block value and ignore it. +inline class_match m_BasicBlock() { + return class_match(); +} + +/// Inverting matcher +template struct match_unless { + Ty M; + + match_unless(const Ty &Matcher) : M(Matcher) {} + + template bool match(ITy *V) { return !M.match(V); } +}; + +/// Match if the inner matcher does *NOT* match. +template inline match_unless m_Unless(const Ty &M) { + return match_unless(M); +} + /// Matching combinators template struct match_combine_or { LTy L; @@ -300,6 +319,15 @@ template struct cstfp_pred_ty : public Predicate { // /////////////////////////////////////////////////////////////////////////////// +struct is_any_apint { + bool isValue(const APInt &C) { return true; } +}; +/// Match an integer or vector with any integral constant. +/// For vectors, this includes constants with undefined elements. +inline cst_pred_ty m_AnyIntegralConstant() { + return cst_pred_ty(); +} + struct is_all_ones { bool isValue(const APInt &C) { return C.isAllOnesValue(); } }; @@ -388,6 +416,18 @@ inline api_pred_ty m_Power2(const APInt *&V) { return V; } +struct is_negated_power2 { + bool isValue(const APInt &C) { return (-C).isPowerOf2(); } +}; +/// Match a integer or vector negated power-of-2. +/// For vectors, this includes constants with undefined elements. 
+inline cst_pred_ty m_NegatedPower2() { + return cst_pred_ty(); +} +inline api_pred_ty m_NegatedPower2(const APInt *&V) { + return V; +} + struct is_power2_or_zero { bool isValue(const APInt &C) { return !C || C.isPowerOf2(); } }; @@ -528,6 +568,12 @@ inline bind_ty m_Constant(Constant *&C) { return C; } /// Match a ConstantFP, capturing the value if we match. inline bind_ty m_ConstantFP(ConstantFP *&C) { return C; } +/// Match a basic block value, capturing it if we match. +inline bind_ty m_BasicBlock(BasicBlock *&V) { return V; } +inline bind_ty m_BasicBlock(const BasicBlock *&V) { + return V; +} + /// Match a specified Value*. struct specificval_ty { const Value *Val; @@ -597,11 +643,11 @@ struct bind_const_intval_ty { }; /// Match a specified integer value or vector of all elements of that -// value. +/// value. struct specific_intval { - uint64_t Val; + APInt Val; - specific_intval(uint64_t V) : Val(V) {} + specific_intval(APInt V) : Val(std::move(V)) {} template bool match(ITy *V) { const auto *CI = dyn_cast(V); @@ -609,18 +655,50 @@ struct specific_intval { if (const auto *C = dyn_cast(V)) CI = dyn_cast_or_null(C->getSplatValue()); - return CI && CI->getValue() == Val; + return CI && APInt::isSameValue(CI->getValue(), Val); } }; /// Match a specific integer value or vector with all elements equal to /// the value. -inline specific_intval m_SpecificInt(uint64_t V) { return specific_intval(V); } +inline specific_intval m_SpecificInt(APInt V) { + return specific_intval(std::move(V)); +} + +inline specific_intval m_SpecificInt(uint64_t V) { + return m_SpecificInt(APInt(64, V)); +} /// Match a ConstantInt and bind to its value. This does not match /// ConstantInts wider than 64-bits. inline bind_const_intval_ty m_ConstantInt(uint64_t &V) { return V; } +/// Match a specified basic block value. +struct specific_bbval { + BasicBlock *Val; + + specific_bbval(BasicBlock *Val) : Val(Val) {} + + template bool match(ITy *V) { + const auto *BB = dyn_cast(V); + return BB && BB == Val; + } +}; + +/// Match a specific basic block value. +inline specific_bbval m_SpecificBB(BasicBlock *BB) { + return specific_bbval(BB); +} + +/// A commutative-friendly version of m_Specific(). +inline deferredval_ty m_Deferred(BasicBlock *const &BB) { + return BB; +} +inline deferredval_ty +m_Deferred(const BasicBlock *const &BB) { + return BB; +} + //===----------------------------------------------------------------------===// // Matcher for any binary operator. // @@ -968,6 +1046,12 @@ struct is_idiv_op { } }; +struct is_irem_op { + bool isOpType(unsigned Opcode) { + return Opcode == Instruction::SRem || Opcode == Instruction::URem; + } +}; + /// Matches shift operations. template inline BinOpPred_match m_Shift(const LHS &L, @@ -1003,6 +1087,13 @@ inline BinOpPred_match m_IDiv(const LHS &L, return BinOpPred_match(L, R); } +/// Matches integer remainder operations. +template +inline BinOpPred_match m_IRem(const LHS &L, + const RHS &R) { + return BinOpPred_match(L, R); +} + //===----------------------------------------------------------------------===// // Class that matches exact binary ops. // @@ -1210,6 +1301,12 @@ inline CastClass_match m_Trunc(const OpTy &Op) { return CastClass_match(Op); } +template +inline match_combine_or, OpTy> +m_TruncOrSelf(const OpTy &Op) { + return m_CombineOr(m_Trunc(Op), Op); +} + /// Matches SExt. 
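
With m_SpecificInt now comparing through APInt, a 64-bit literal matches a constant of any bit width with the same value, and the new m_IRem matcher folds SRem and URem into one pattern. A small usage sketch (the helper name and the constant 8 are illustrative only):

#include "llvm/IR/PatternMatch.h"

// Matches "urem X, 8" or "srem X, 8" and captures X.
static bool isRemByEight(llvm::Value *V, llvm::Value *&X) {
  using namespace llvm::PatternMatch;
  return match(V, m_IRem(m_Value(X), m_SpecificInt(8)));
}
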
template inline CastClass_match m_SExt(const OpTy &Op) { @@ -1222,6 +1319,18 @@ inline CastClass_match m_ZExt(const OpTy &Op) { return CastClass_match(Op); } +template +inline match_combine_or, OpTy> +m_ZExtOrSelf(const OpTy &Op) { + return m_CombineOr(m_ZExt(Op), Op); +} + +template +inline match_combine_or, OpTy> +m_SExtOrSelf(const OpTy &Op) { + return m_CombineOr(m_SExt(Op), Op); +} + template inline match_combine_or, CastClass_match> @@ -1229,6 +1338,15 @@ m_ZExtOrSExt(const OpTy &Op) { return m_CombineOr(m_ZExt(Op), m_SExt(Op)); } +template +inline match_combine_or< + match_combine_or, + CastClass_match>, + OpTy> +m_ZExtOrSExtOrSelf(const OpTy &Op) { + return m_CombineOr(m_ZExtOrSExt(Op), Op); +} + /// Matches UIToFP. template inline CastClass_match m_UIToFP(const OpTy &Op) { @@ -1274,27 +1392,34 @@ struct br_match { inline br_match m_UnconditionalBr(BasicBlock *&Succ) { return br_match(Succ); } -template struct brc_match { +template +struct brc_match { Cond_t Cond; - BasicBlock *&T, *&F; + TrueBlock_t T; + FalseBlock_t F; - brc_match(const Cond_t &C, BasicBlock *&t, BasicBlock *&f) + brc_match(const Cond_t &C, const TrueBlock_t &t, const FalseBlock_t &f) : Cond(C), T(t), F(f) {} template bool match(OpTy *V) { if (auto *BI = dyn_cast(V)) - if (BI->isConditional() && Cond.match(BI->getCondition())) { - T = BI->getSuccessor(0); - F = BI->getSuccessor(1); - return true; - } + if (BI->isConditional() && Cond.match(BI->getCondition())) + return T.match(BI->getSuccessor(0)) && F.match(BI->getSuccessor(1)); return false; } }; template -inline brc_match m_Br(const Cond_t &C, BasicBlock *&T, BasicBlock *&F) { - return brc_match(C, T, F); +inline brc_match, bind_ty> +m_Br(const Cond_t &C, BasicBlock *&T, BasicBlock *&F) { + return brc_match, bind_ty>( + C, m_BasicBlock(T), m_BasicBlock(F)); +} + +template +inline brc_match +m_Br(const Cond_t &C, const TrueBlock_t &T, const FalseBlock_t &F) { + return brc_match(C, T, F); } //===----------------------------------------------------------------------===// diff --git a/include/llvm/IR/RemarkStreamer.h b/include/llvm/IR/RemarkStreamer.h index f34cc660b2fb..2abf6f99cb08 100644 --- a/include/llvm/IR/RemarkStreamer.h +++ b/include/llvm/IR/RemarkStreamer.h @@ -25,12 +25,12 @@ namespace llvm { /// Streamer for remarks. class RemarkStreamer { - /// The filename that the remark diagnostics are emitted to. - const std::string Filename; /// The regex used to filter remarks based on the passes that emit them. Optional PassFilter; /// The object used to serialize the remarks to a specific format. - std::unique_ptr Serializer; + std::unique_ptr RemarkSerializer; + /// The filename that the remark diagnostics are emitted to. + const Optional Filename; /// Convert diagnostics into remark objects. /// The lifetime of the members of the result is bound to the lifetime of @@ -38,14 +38,16 @@ class RemarkStreamer { remarks::Remark toRemark(const DiagnosticInfoOptimizationBase &Diag); public: - RemarkStreamer(StringRef Filename, - std::unique_ptr Serializer); + RemarkStreamer(std::unique_ptr RemarkSerializer, + Optional Filename = None); /// Return the filename that the remark diagnostics are emitted to. - StringRef getFilename() const { return Filename; } + Optional getFilename() const { + return Filename ? Optional(*Filename) : None; + } /// Return stream that the remark diagnostics are emitted to. - raw_ostream &getStream() { return Serializer->OS; } + raw_ostream &getStream() { return RemarkSerializer->OS; } /// Return the serializer used for this stream. 
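
With the generalized brc_match above, m_Br can take arbitrary matchers for the successor blocks instead of only bind-out parameters. A sketch combining the new m_BasicBlock and m_SpecificBB matchers (helper name hypothetical; V is assumed to be a branch terminator):

#include "llvm/IR/PatternMatch.h"

// Matches a conditional branch whose false edge targets Else, capturing the
// condition and the true successor.
static bool branchesToElseOnFalse(llvm::Value *V, llvm::BasicBlock *Else,
                                  llvm::Value *&Cond,
                                  llvm::BasicBlock *&TrueBB) {
  using namespace llvm::PatternMatch;
  return match(V, m_Br(m_Value(Cond), m_BasicBlock(TrueBB), m_SpecificBB(Else)));
}
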
- remarks::Serializer &getSerializer() { return *Serializer; } + remarks::RemarkSerializer &getSerializer() { return *RemarkSerializer; } /// Set a pass filter based on a regex \p Filter. /// Returns an error if the regex is invalid. Error setFilter(StringRef Filter); @@ -84,13 +86,21 @@ struct RemarkSetupFormatError : RemarkSetupErrorInfo { using RemarkSetupErrorInfo::RemarkSetupErrorInfo; }; -/// Setup optimization remarks. +/// Setup optimization remarks that output to a file. Expected> setupOptimizationRemarks(LLVMContext &Context, StringRef RemarksFilename, StringRef RemarksPasses, StringRef RemarksFormat, bool RemarksWithHotness, unsigned RemarksHotnessThreshold = 0); +/// Setup optimization remarks that output directly to a raw_ostream. +/// \p OS is managed by the caller and should be open for writing as long as \p +/// Context is streaming remarks to it. +Error setupOptimizationRemarks(LLVMContext &Context, raw_ostream &OS, + StringRef RemarksPasses, StringRef RemarksFormat, + bool RemarksWithHotness, + unsigned RemarksHotnessThreshold = 0); + } // end namespace llvm #endif // LLVM_IR_REMARKSTREAMER_H diff --git a/include/llvm/IR/Type.h b/include/llvm/IR/Type.h index f2aa49030aaa..d0961dac833d 100644 --- a/include/llvm/IR/Type.h +++ b/include/llvm/IR/Type.h @@ -21,6 +21,7 @@ #include "llvm/Support/Casting.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/TypeSize.h" #include #include #include @@ -281,12 +282,15 @@ public: /// This will return zero if the type does not have a size or is not a /// primitive type. /// + /// If this is a scalable vector type, the scalable property will be set and + /// the runtime size will be a positive integer multiple of the base size. + /// /// Note that this may not reflect the size of memory allocated for an /// instance of the type or the number of bytes that are written when an /// instance of the type is stored to memory. The DataLayout class provides /// additional query functions to provide this information. /// - unsigned getPrimitiveSizeInBits() const LLVM_READONLY; + TypeSize getPrimitiveSizeInBits() const LLVM_READONLY; /// If this is a vector type, return the getPrimitiveSizeInBits value for the /// element type. Otherwise return the getPrimitiveSizeInBits value for this @@ -368,6 +372,7 @@ public: inline bool getVectorIsScalable() const; inline unsigned getVectorNumElements() const; + inline ElementCount getVectorElementCount() const; Type *getVectorElementType() const { assert(getTypeID() == VectorTyID); return ContainedTys[0]; @@ -378,6 +383,14 @@ public: return ContainedTys[0]; } + /// Given an integer or vector type, change the lane bitwidth to NewBitwidth, + /// whilst keeping the old number of lanes. + inline Type *getWithNewBitWidth(unsigned NewBitWidth) const; + + /// Given scalar/vector integer type, returns a type with elements twice as + /// wide as in the original type. For vectors, preserves element count. + inline Type *getExtendedType() const; + /// Get the address space of this pointer or pointer vector type. inline unsigned getPointerAddressSpace() const; diff --git a/include/llvm/IR/User.h b/include/llvm/IR/User.h index 19d87c5c621d..850ee72a0387 100644 --- a/include/llvm/IR/User.h +++ b/include/llvm/IR/User.h @@ -111,7 +111,7 @@ public: #endif } /// Placement delete - required by std, called if the ctor throws. 
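
Since getPrimitiveSizeInBits now returns a TypeSize, callers that only handle fixed-width types have to check for scalable vectors instead of silently truncating. A sketch under the assumption that TypeSize exposes isScalable() and getFixedSize() accessors (names as I recall them from the TypeSize work; the helper is hypothetical):

#include <cstdint>
#include "llvm/IR/Type.h"
#include "llvm/Support/TypeSize.h"

// Exact bit width for fixed-size types, 0 for scalable vectors whose runtime
// size is only known as a multiple of vscale.
static uint64_t fixedSizeInBitsOrZero(llvm::Type *Ty) {
  llvm::TypeSize TS = Ty->getPrimitiveSizeInBits();
  return TS.isScalable() ? 0 : TS.getFixedSize();
}
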
- void operator delete(void *Usr, unsigned, bool) { + void operator delete(void *Usr, unsigned, unsigned) { // Note: If a subclass manipulates the information which is required to calculate the // Usr memory pointer, e.g. NumUserOperands, the operator delete of that subclass has // to restore the changed information to the original value, since the dtor of that class diff --git a/include/llvm/IR/Value.h b/include/llvm/IR/Value.h index b2d8e7ac4741..f2c4b3b3f203 100644 --- a/include/llvm/IR/Value.h +++ b/include/llvm/IR/Value.h @@ -14,8 +14,10 @@ #define LLVM_IR_VALUE_H #include "llvm-c/Types.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/iterator_range.h" #include "llvm/IR/Use.h" +#include "llvm/Support/Alignment.h" #include "llvm/Support/CBindingWrapping.h" #include "llvm/Support/Casting.h" #include @@ -292,10 +294,29 @@ public: /// "V" instead of "this". This function skips metadata entries in the list. void replaceNonMetadataUsesWith(Value *V); + /// Go through the uses list for this definition and make each use point + /// to "V" if the callback ShouldReplace returns true for the given Use. + /// Unlike replaceAllUsesWith() this function does not support basic block + /// values or constant users. + void replaceUsesWithIf(Value *New, + llvm::function_ref ShouldReplace) { + assert(New && "Value::replaceUsesWithIf() is invalid!"); + assert(New->getType() == getType() && + "replaceUses of value with new value of different type!"); + + for (use_iterator UI = use_begin(), E = use_end(); UI != E;) { + Use &U = *UI; + ++UI; + if (!ShouldReplace(U)) + continue; + U.set(New); + } + } + /// replaceUsesOutsideBlock - Go through the uses list for this definition and /// make each use point to "V" instead of "this" when the use is outside the /// block. 'This's use list is expected to have at least one element. - /// Unlike replaceAllUsesWith this function does not support basic block + /// Unlike replaceAllUsesWith() this function does not support basic block /// values or constant users. void replaceUsesOutsideBlock(Value *V, BasicBlock *BB); @@ -493,17 +514,27 @@ public: /// swifterror attribute. bool isSwiftError() const; - /// Strip off pointer casts, all-zero GEPs, address space casts, and aliases. + /// Strip off pointer casts, all-zero GEPs and address space casts. /// /// Returns the original uncasted value. If this is called on a non-pointer /// value, it returns 'this'. const Value *stripPointerCasts() const; Value *stripPointerCasts() { return const_cast( - static_cast(this)->stripPointerCasts()); + static_cast(this)->stripPointerCasts()); } - /// Strip off pointer casts, all-zero GEPs, address space casts, and aliases + /// Strip off pointer casts, all-zero GEPs, address space casts, and aliases. + /// + /// Returns the original uncasted value. If this is called on a non-pointer + /// value, it returns 'this'. + const Value *stripPointerCastsAndAliases() const; + Value *stripPointerCastsAndAliases() { + return const_cast( + static_cast(this)->stripPointerCastsAndAliases()); + } + + /// Strip off pointer casts, all-zero GEPs and address space casts /// but ensures the representation of the result stays the same. /// /// Returns the original uncasted value with the same representation. If this @@ -514,26 +545,15 @@ public: ->stripPointerCastsSameRepresentation()); } - /// Strip off pointer casts, all-zero GEPs, aliases and invariant group - /// info. + /// Strip off pointer casts, all-zero GEPs and invariant group info. /// /// Returns the original uncasted value. 
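
replaceUsesWithIf above is a filtered form of replaceAllUsesWith. A minimal sketch that rewrites only the uses located in one function (helper name and parameters are illustrative):

#include "llvm/IR/Function.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Value.h"

// Point every use of Old that sits inside F at New, leaving other users alone.
static void replaceUsesInFunction(llvm::Value *Old, llvm::Value *New,
                                  llvm::Function *F) {
  Old->replaceUsesWithIf(New, [F](llvm::Use &U) {
    auto *I = llvm::dyn_cast<llvm::Instruction>(U.getUser());
    return I && I->getFunction() == F;
  });
}
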
If this is called on a non-pointer /// value, it returns 'this'. This function should be used only in /// Alias analysis. const Value *stripPointerCastsAndInvariantGroups() const; Value *stripPointerCastsAndInvariantGroups() { - return const_cast( - static_cast(this)->stripPointerCastsAndInvariantGroups()); - } - - /// Strip off pointer casts and all-zero GEPs. - /// - /// Returns the original uncasted value. If this is called on a non-pointer - /// value, it returns 'this'. - const Value *stripPointerCastsNoFollowAliases() const; - Value *stripPointerCastsNoFollowAliases() { - return const_cast( - static_cast(this)->stripPointerCastsNoFollowAliases()); + return const_cast(static_cast(this) + ->stripPointerCastsAndInvariantGroups()); } /// Strip off pointer casts and all-constant inbounds GEPs. @@ -612,7 +632,7 @@ public: /// /// Returns an alignment which is either specified explicitly, e.g. via /// align attribute of a function argument, or guaranteed by DataLayout. - unsigned getPointerAlignment(const DataLayout &DL) const; + MaybeAlign getPointerAlignment(const DataLayout &DL) const; /// Translate PHI node to its predecessor from the given basic block. /// diff --git a/include/llvm/IR/ValueMap.h b/include/llvm/IR/ValueMap.h index 6a79b1d387f3..fb5440d5efe8 100644 --- a/include/llvm/IR/ValueMap.h +++ b/include/llvm/IR/ValueMap.h @@ -33,11 +33,11 @@ #include "llvm/IR/ValueHandle.h" #include "llvm/Support/Casting.h" #include "llvm/Support/Mutex.h" -#include "llvm/Support/UniqueLock.h" #include #include #include #include +#include #include #include @@ -93,7 +93,6 @@ class ValueMap { MapT Map; Optional MDMap; ExtraData Data; - bool MayMapMetadata = true; public: using key_type = KeyT; @@ -120,10 +119,6 @@ public: } Optional &getMDMap() { return MDMap; } - bool mayMapMetadata() const { return MayMapMetadata; } - void enableMapMetadata() { MayMapMetadata = true; } - void disableMapMetadata() { MayMapMetadata = false; } - /// Get the mapped metadata, if it's in the map. Optional getMappedMD(const Metadata *MD) const { if (!MDMap) @@ -266,9 +261,9 @@ public: // Make a copy that won't get changed even when *this is destroyed. ValueMapCallbackVH Copy(*this); typename Config::mutex_type *M = Config::getMutex(Copy.Map->Data); - unique_lock Guard; + std::unique_lock Guard; if (M) - Guard = unique_lock(*M); + Guard = std::unique_lock(*M); Config::onDelete(Copy.Map->Data, Copy.Unwrap()); // May destroy *this. Copy.Map->Map.erase(Copy); // Definitely destroys *this. } @@ -279,9 +274,9 @@ public: // Make a copy that won't get changed even when *this is destroyed. 
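
Because getPointerAlignment now returns MaybeAlign, an unknown alignment is an empty Optional rather than a zero. A sketch of the caller-side handling (hypothetical helper with a conservative 1-byte fallback):

#include <cstdint>
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/Alignment.h"

// Byte alignment of Ptr if something is known, otherwise 1.
static uint64_t knownAlignmentOrOne(const llvm::Value *Ptr,
                                    const llvm::DataLayout &DL) {
  if (llvm::MaybeAlign A = Ptr->getPointerAlignment(DL))
    return A->value();
  return 1;
}
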
ValueMapCallbackVH Copy(*this); typename Config::mutex_type *M = Config::getMutex(Copy.Map->Data); - unique_lock Guard; + std::unique_lock Guard; if (M) - Guard = unique_lock(*M); + Guard = std::unique_lock(*M); KeyT typed_new_key = cast(new_key); // Can destroy *this: diff --git a/include/llvm/InitializePasses.h b/include/llvm/InitializePasses.h index 164d0be2855a..49f69340c828 100644 --- a/include/llvm/InitializePasses.h +++ b/include/llvm/InitializePasses.h @@ -132,6 +132,7 @@ void initializeDwarfEHPreparePass(PassRegistry&); void initializeEarlyCSELegacyPassPass(PassRegistry&); void initializeEarlyCSEMemSSALegacyPassPass(PassRegistry&); void initializeEarlyIfConverterPass(PassRegistry&); +void initializeEarlyIfPredicatorPass(PassRegistry &); void initializeEarlyMachineLICMPass(PassRegistry&); void initializeEarlyTailDuplicatePass(PassRegistry&); void initializeEdgeBundlesPass(PassRegistry&); @@ -202,6 +203,7 @@ void initializeLegacyLICMPassPass(PassRegistry&); void initializeLegacyLoopSinkPassPass(PassRegistry&); void initializeLegalizerPass(PassRegistry&); void initializeGISelCSEAnalysisWrapperPassPass(PassRegistry &); +void initializeGISelKnownBitsAnalysisPass(PassRegistry &); void initializeLibCallsShrinkWrapLegacyPassPass(PassRegistry&); void initializeLintPass(PassRegistry&); void initializeLiveDebugValuesPass(PassRegistry&); @@ -241,6 +243,7 @@ void initializeLoopVectorizePass(PassRegistry&); void initializeLoopVersioningLICMPass(PassRegistry&); void initializeLoopVersioningPassPass(PassRegistry&); void initializeLowerAtomicLegacyPassPass(PassRegistry&); +void initializeLowerConstantIntrinsicsPass(PassRegistry&); void initializeLowerEmuTLSPass(PassRegistry&); void initializeLowerExpectIntrinsicPass(PassRegistry&); void initializeLowerGuardIntrinsicLegacyPassPass(PassRegistry&); @@ -250,6 +253,7 @@ void initializeLowerInvokeLegacyPassPass(PassRegistry&); void initializeLowerSwitchPass(PassRegistry&); void initializeLowerTypeTestsPass(PassRegistry&); void initializeMIRCanonicalizerPass(PassRegistry &); +void initializeMIRNamerPass(PassRegistry &); void initializeMIRPrintingPassPass(PassRegistry&); void initializeMachineBlockFrequencyInfoPass(PassRegistry&); void initializeMachineBlockPlacementPass(PassRegistry&); @@ -263,7 +267,7 @@ void initializeMachineDominatorTreePass(PassRegistry&); void initializeMachineFunctionPrinterPassPass(PassRegistry&); void initializeMachineLICMPass(PassRegistry&); void initializeMachineLoopInfoPass(PassRegistry&); -void initializeMachineModuleInfoPass(PassRegistry&); +void initializeMachineModuleInfoWrapperPassPass(PassRegistry &); void initializeMachineOptimizationRemarkEmitterPassPass(PassRegistry&); void initializeMachineOutlinerPass(PassRegistry&); void initializeMachinePipelinerPass(PassRegistry&); @@ -286,7 +290,9 @@ void initializeMergedLoadStoreMotionLegacyPassPass(PassRegistry&); void initializeMetaRenamerPass(PassRegistry&); void initializeModuleDebugInfoPrinterPass(PassRegistry&); void initializeModuleSummaryIndexWrapperPassPass(PassRegistry&); +void initializeModuloScheduleTestPass(PassRegistry&); void initializeMustExecutePrinterPass(PassRegistry&); +void initializeMustBeExecutedContextPrinterPass(PassRegistry&); void initializeNameAnonGlobalLegacyPassPass(PassRegistry&); void initializeNaryReassociateLegacyPassPass(PassRegistry&); void initializeNewGVNLegacyPassPass(PassRegistry&); @@ -360,7 +366,7 @@ void initializeSROALegacyPassPass(PassRegistry&); void initializeSafeStackLegacyPassPass(PassRegistry&); void 
initializeSafepointIRVerifierPass(PassRegistry&); void initializeSampleProfileLoaderLegacyPassPass(PassRegistry&); -void initializeSanitizerCoverageModulePass(PassRegistry&); +void initializeModuleSanitizerCoverageLegacyPassPass(PassRegistry &); void initializeScalarEvolutionWrapperPassPass(PassRegistry&); void initializeScalarizeMaskedMemIntrinPass(PassRegistry&); void initializeScalarizerLegacyPassPass(PassRegistry&); diff --git a/include/llvm/LTO/Config.h b/include/llvm/LTO/Config.h index fb107e3fbe02..daa6585b1113 100644 --- a/include/llvm/LTO/Config.h +++ b/include/llvm/LTO/Config.h @@ -226,7 +226,7 @@ struct LTOLLVMContext : LLVMContext { setDiscardValueNames(C.ShouldDiscardValueNames); enableDebugTypeODRUniquing(); setDiagnosticHandler( - llvm::make_unique(&DiagHandler), true); + std::make_unique(&DiagHandler), true); } DiagnosticHandlerFunction DiagHandler; }; diff --git a/include/llvm/LTO/LTO.h b/include/llvm/LTO/LTO.h index ca0a8b64523a..0a1e3e1d0e42 100644 --- a/include/llvm/LTO/LTO.h +++ b/include/llvm/LTO/LTO.h @@ -59,7 +59,9 @@ void thinLTOResolvePrevailingInIndex( /// must apply the changes to the Module via thinLTOInternalizeModule. void thinLTOInternalizeAndPromoteInIndex( ModuleSummaryIndex &Index, - function_ref isExported); + function_ref isExported, + function_ref + isPrevailing); /// Computes a unique hash for the Module considering the current list of /// export/import and other global analysis results. @@ -296,6 +298,10 @@ public: /// Cache) for each task identifier. Error run(AddStreamFn AddStream, NativeObjectCache Cache = nullptr); + /// Static method that returns a list of libcall symbols that can be generated + /// by LTO but might not be visible from bitcode symbol table. + static ArrayRef getRuntimeLibcallSymbols(); + private: Config Conf; @@ -303,7 +309,7 @@ private: RegularLTOState(unsigned ParallelCodeGenParallelismLevel, Config &Conf); struct CommonResolution { uint64_t Size = 0; - unsigned Align = 0; + MaybeAlign Align; /// Record if at least one instance of the common was marked as prevailing bool Prevailing = false; }; diff --git a/include/llvm/LTO/legacy/LTOCodeGenerator.h b/include/llvm/LTO/legacy/LTOCodeGenerator.h index d3cb4c8b79a0..8718df4b88e6 100644 --- a/include/llvm/LTO/legacy/LTOCodeGenerator.h +++ b/include/llvm/LTO/legacy/LTOCodeGenerator.h @@ -113,7 +113,7 @@ struct LTOCodeGenerator { ShouldRestoreGlobalsLinkage = Value; } - void addMustPreserveSymbol(StringRef Sym) { MustPreserveSymbols[Sym] = 1; } + void addMustPreserveSymbol(StringRef Sym) { MustPreserveSymbols.insert(Sym); } /// Pass options to the driver and optimization passes. 
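
addMustPreserveSymbol above now inserts into a StringSet, so a legacy LTO driver can feed it symbol names directly and repeatedly. A trivial driver-side sketch (the symbol list parameter is hypothetical):

#include <string>
#include <vector>
#include "llvm/LTO/legacy/LTOCodeGenerator.h"

// Keep every symbol the linker says is referenced from regular (non-LTO)
// objects; duplicate names are harmless with a StringSet.
static void preserveSymbols(llvm::LTOCodeGenerator &CG,
                            const std::vector<std::string> &Referenced) {
  for (const std::string &Name : Referenced)
    CG.addMustPreserveSymbol(Name);
}
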
/// diff --git a/include/llvm/LinkAllPasses.h b/include/llvm/LinkAllPasses.h index 675d179eb22a..ac88165845d3 100644 --- a/include/llvm/LinkAllPasses.h +++ b/include/llvm/LinkAllPasses.h @@ -140,6 +140,7 @@ namespace { (void) llvm::createLoopVersioningLICMPass(); (void) llvm::createLoopIdiomPass(); (void) llvm::createLoopRotatePass(); + (void) llvm::createLowerConstantIntrinsicsPass(); (void) llvm::createLowerExpectIntrinsicPass(); (void) llvm::createLowerInvokePass(); (void) llvm::createLowerSwitchPass(); @@ -219,6 +220,7 @@ namespace { (void) llvm::createStraightLineStrengthReducePass(); (void) llvm::createMemDerefPrinter(); (void) llvm::createMustExecutePrinter(); + (void) llvm::createMustBeExecutedContextPrinter(); (void) llvm::createFloat2IntPass(); (void) llvm::createEliminateAvailableExternallyPass(); (void) llvm::createScalarizeMaskedMemIntrinPass(); diff --git a/include/llvm/MC/MCAsmInfo.h b/include/llvm/MC/MCAsmInfo.h index 971e9354da8c..3261c483e0d8 100644 --- a/include/llvm/MC/MCAsmInfo.h +++ b/include/llvm/MC/MCAsmInfo.h @@ -165,6 +165,10 @@ protected: /// instead. bool UseDataRegionDirectives = false; + /// True if .align is to be used for alignment. Only power-of-two + /// alignment is supported. + bool UseDotAlignForAlignment = false; + //===--- Data Emission Directives -------------------------------------===// /// This should be set to the directive used to get some number of zero bytes @@ -313,6 +317,10 @@ protected: /// Defaults to false. bool HasLinkOnceDirective = false; + /// True if we have a .lglobl directive, which is used to emit the information + /// of a static symbol into the symbol table. Defaults to false. + bool HasDotLGloblDirective = false; + /// This attribute, if not MCSA_Invalid, is used to declare a symbol as having /// hidden visibility. Defaults to MCSA_Hidden. MCSymbolAttr HiddenVisibilityAttr = MCSA_Hidden; @@ -388,6 +396,9 @@ protected: // %hi(), and similar unary operators. bool HasMipsExpressions = false; + // If true, emit function descriptor symbol on AIX. + bool NeedsFunctionDescriptors = false; + public: explicit MCAsmInfo(); virtual ~MCAsmInfo(); @@ -520,6 +531,10 @@ public: return UseDataRegionDirectives; } + bool useDotAlignForAlignment() const { + return UseDotAlignForAlignment; + } + const char *getZeroDirective() const { return ZeroDirective; } const char *getAsciiDirective() const { return AsciiDirective; } const char *getAscizDirective() const { return AscizDirective; } @@ -557,6 +572,8 @@ public: bool hasLinkOnceDirective() const { return HasLinkOnceDirective; } + bool hasDotLGloblDirective() const { return HasDotLGloblDirective; } + MCSymbolAttr getHiddenVisibilityAttr() const { return HiddenVisibilityAttr; } MCSymbolAttr getHiddenDeclarationVisibilityAttr() const { @@ -639,6 +656,7 @@ public: bool canRelaxRelocations() const { return RelaxELFRelocations; } void setRelaxELFRelocations(bool V) { RelaxELFRelocations = V; } bool hasMipsExpressions() const { return HasMipsExpressions; } + bool needsFunctionDescriptors() const { return NeedsFunctionDescriptors; } }; } // end namespace llvm diff --git a/include/llvm/MC/MCAsmInfoXCOFF.h b/include/llvm/MC/MCAsmInfoXCOFF.h index 2a72ba7398a7..4a3bacc954e0 100644 --- a/include/llvm/MC/MCAsmInfoXCOFF.h +++ b/include/llvm/MC/MCAsmInfoXCOFF.h @@ -18,6 +18,11 @@ class MCAsmInfoXCOFF : public MCAsmInfo { protected: MCAsmInfoXCOFF(); + +public: + // Return true only when the identifier Name does not need quotes to be + // syntactically correct for XCOFF. 
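
The new MCAsmInfo knobs above are plain protected booleans, so a target opts in from its MCAsmInfo subclass constructor. A hypothetical subclass for illustration (not a real target in this patch):

#include "llvm/MC/MCAsmInfo.h"

namespace {
class ExampleTargetAsmInfo : public llvm::MCAsmInfo {
public:
  ExampleTargetAsmInfo() {
    UseDotAlignForAlignment = true;  // assembler only accepts power-of-two .align
    HasDotLGloblDirective = true;    // statics reach the symbol table via .lglobl
    NeedsFunctionDescriptors = true; // AIX-style function descriptors
  }
};
} // end anonymous namespace
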
+ bool isValidUnquotedName(StringRef Name) const override; }; } // end namespace llvm diff --git a/include/llvm/MC/MCAsmMacro.h b/include/llvm/MC/MCAsmMacro.h index 364d3b5f3666..7eecce0faf64 100644 --- a/include/llvm/MC/MCAsmMacro.h +++ b/include/llvm/MC/MCAsmMacro.h @@ -124,7 +124,6 @@ public: } void dump(raw_ostream &OS) const; - void dump() const { dump(dbgs()); } }; struct MCAsmMacroParameter { @@ -133,10 +132,10 @@ struct MCAsmMacroParameter { bool Required = false; bool Vararg = false; - MCAsmMacroParameter() = default; - +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void dump() const { dump(dbgs()); } - void dump(raw_ostream &OS) const; + LLVM_DUMP_METHOD void dump(raw_ostream &OS) const; +#endif }; typedef std::vector MCAsmMacroParameters; @@ -149,8 +148,10 @@ public: MCAsmMacro(StringRef N, StringRef B, MCAsmMacroParameters P) : Name(N), Body(B), Parameters(std::move(P)) {} +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void dump() const { dump(dbgs()); } - void dump(raw_ostream &OS) const; + LLVM_DUMP_METHOD void dump(raw_ostream &OS) const; +#endif }; } // namespace llvm diff --git a/include/llvm/MC/MCContext.h b/include/llvm/MC/MCContext.h index 5c2124cc0d15..b925f3218883 100644 --- a/include/llvm/MC/MCContext.h +++ b/include/llvm/MC/MCContext.h @@ -22,6 +22,7 @@ #include "llvm/MC/MCAsmMacro.h" #include "llvm/MC/MCDwarf.h" #include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/MC/MCTargetOptions.h" #include "llvm/MC/SectionKind.h" #include "llvm/Support/Allocator.h" #include "llvm/Support/Compiler.h" @@ -112,6 +113,9 @@ namespace llvm { /// number of section symbols with the same name). StringMap UsedNames; + /// Keeps track of labels that are used in inline assembly. + SymbolTable InlineAsmUsedLabelNames; + /// The next ID to dole out to an unnamed assembler temporary symbol with /// a given prefix. StringMap NextID; @@ -275,6 +279,8 @@ namespace llvm { /// Do automatic reset in destructor bool AutoReset; + MCTargetOptions const *TargetOptions; + bool HadError = false; MCSymbol *createSymbolImpl(const StringMapEntry *Name, @@ -298,7 +304,9 @@ namespace llvm { public: explicit MCContext(const MCAsmInfo *MAI, const MCRegisterInfo *MRI, const MCObjectFileInfo *MOFI, - const SourceMgr *Mgr = nullptr, bool DoAutoReset = true); + const SourceMgr *Mgr = nullptr, + MCTargetOptions const *TargetOpts = nullptr, + bool DoAutoReset = true); MCContext(const MCContext &) = delete; MCContext &operator=(const MCContext &) = delete; ~MCContext(); @@ -377,6 +385,16 @@ namespace llvm { /// APIs. const SymbolTable &getSymbols() const { return Symbols; } + /// isInlineAsmLabel - Return true if the name is a label referenced in + /// inline assembly. + MCSymbol *getInlineAsmLabel(StringRef Name) const { + return InlineAsmUsedLabelNames.lookup(Name); + } + + /// registerInlineAsmLabel - Records that the name is a label referenced in + /// inline assembly. + void registerInlineAsmLabel(MCSymbol *Sym); + /// @} /// \name Section Management @@ -490,6 +508,8 @@ namespace llvm { MCSectionXCOFF *getXCOFFSection(StringRef Section, XCOFF::StorageMappingClass MappingClass, + XCOFF::SymbolType CSectType, + XCOFF::StorageClass StorageClass, SectionKind K, const char *BeginSymName = nullptr); @@ -659,6 +679,7 @@ namespace llvm { bool hadError() { return HadError; } void reportError(SMLoc L, const Twine &Msg); + void reportWarning(SMLoc L, const Twine &Msg); // Unrecoverable error has occurred. Display the best diagnostic we can // and bail via exit(1). For now, most MC backend errors are unrecoverable. 
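
getInlineAsmLabel and registerInlineAsmLabel above give MCContext a side table for labels defined inside inline-asm blobs. A sketch of the lookup-or-register pattern a parser might use (helper name hypothetical):

#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCSymbol.h"

// Reuse an already-registered inline-asm label, or create and register one.
static llvm::MCSymbol *getOrCreateInlineAsmLabel(llvm::MCContext &Ctx,
                                                 llvm::StringRef Name) {
  if (llvm::MCSymbol *Sym = Ctx.getInlineAsmLabel(Name))
    return Sym;
  llvm::MCSymbol *Sym = Ctx.getOrCreateSymbol(Name);
  Ctx.registerInlineAsmLabel(Sym);
  return Sym;
}
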
// FIXME: We should really do something about that. diff --git a/include/llvm/MC/MCDirectives.h b/include/llvm/MC/MCDirectives.h index 4029264c2026..ea79e68674e5 100644 --- a/include/llvm/MC/MCDirectives.h +++ b/include/llvm/MC/MCDirectives.h @@ -28,6 +28,7 @@ enum MCSymbolAttr { MCSA_ELF_TypeNoType, ///< .type _foo, STT_NOTYPE # aka @notype MCSA_ELF_TypeGnuUniqueObject, /// .type _foo, @gnu_unique_object MCSA_Global, ///< .globl + MCSA_LGlobal, ///< .lglobl (XCOFF) MCSA_Hidden, ///< .hidden (ELF) MCSA_IndirectSymbol, ///< .indirect_symbol (MachO) MCSA_Internal, ///< .internal (ELF) diff --git a/include/llvm/MC/MCDwarf.h b/include/llvm/MC/MCDwarf.h index 1a37aafd0654..a33b4b31bb06 100644 --- a/include/llvm/MC/MCDwarf.h +++ b/include/llvm/MC/MCDwarf.h @@ -629,7 +629,8 @@ public: static void Emit(MCObjectStreamer &streamer, MCAsmBackend *MAB, bool isEH); static void EmitAdvanceLoc(MCObjectStreamer &Streamer, uint64_t AddrDelta); static void EncodeAdvanceLoc(MCContext &Context, uint64_t AddrDelta, - raw_ostream &OS); + raw_ostream &OS, uint32_t *Offset = nullptr, + uint32_t *Size = nullptr); }; } // end namespace llvm diff --git a/include/llvm/MC/MCExpr.h b/include/llvm/MC/MCExpr.h index fb23c0114c76..eb2786501f84 100644 --- a/include/llvm/MC/MCExpr.h +++ b/include/llvm/MC/MCExpr.h @@ -46,10 +46,6 @@ private: ExprKind Kind; SMLoc Loc; - bool evaluateAsAbsolute(int64_t &Res, const MCAssembler *Asm, - const MCAsmLayout *Layout, - const SectionAddrMap *Addrs) const; - bool evaluateAsAbsolute(int64_t &Res, const MCAssembler *Asm, const MCAsmLayout *Layout, const SectionAddrMap *Addrs, bool InSet) const; @@ -136,7 +132,7 @@ class MCConstantExpr : public MCExpr { int64_t Value; bool PrintInHex = false; - MCConstantExpr(int64_t Value) + explicit MCConstantExpr(int64_t Value) : MCExpr(MCExpr::Constant, SMLoc()), Value(Value) {} MCConstantExpr(int64_t Value, bool PrintInHex) @@ -239,6 +235,8 @@ public: VK_PPC_TOC_LO, // symbol@toc@l VK_PPC_TOC_HI, // symbol@toc@h VK_PPC_TOC_HA, // symbol@toc@ha + VK_PPC_U, // symbol@u + VK_PPC_L, // symbol@l VK_PPC_DTPMOD, // symbol@dtpmod VK_PPC_TPREL_LO, // symbol@tprel@l VK_PPC_TPREL_HI, // symbol@tprel@h diff --git a/include/llvm/MC/MCFixup.h b/include/llvm/MC/MCFixup.h index accffb7f2247..29e321e2354c 100644 --- a/include/llvm/MC/MCFixup.h +++ b/include/llvm/MC/MCFixup.h @@ -20,35 +20,38 @@ class MCExpr; /// Extensible enumeration to represent the type of a fixup. enum MCFixupKind { - FK_NONE = 0, ///< A no-op fixup. - FK_Data_1, ///< A one-byte fixup. - FK_Data_2, ///< A two-byte fixup. - FK_Data_4, ///< A four-byte fixup. - FK_Data_8, ///< A eight-byte fixup. - FK_PCRel_1, ///< A one-byte pc relative fixup. - FK_PCRel_2, ///< A two-byte pc relative fixup. - FK_PCRel_4, ///< A four-byte pc relative fixup. - FK_PCRel_8, ///< A eight-byte pc relative fixup. - FK_GPRel_1, ///< A one-byte gp relative fixup. - FK_GPRel_2, ///< A two-byte gp relative fixup. - FK_GPRel_4, ///< A four-byte gp relative fixup. - FK_GPRel_8, ///< A eight-byte gp relative fixup. - FK_DTPRel_4, ///< A four-byte dtp relative fixup. - FK_DTPRel_8, ///< A eight-byte dtp relative fixup. - FK_TPRel_4, ///< A four-byte tp relative fixup. - FK_TPRel_8, ///< A eight-byte tp relative fixup. - FK_SecRel_1, ///< A one-byte section relative fixup. - FK_SecRel_2, ///< A two-byte section relative fixup. - FK_SecRel_4, ///< A four-byte section relative fixup. - FK_SecRel_8, ///< A eight-byte section relative fixup. - FK_Data_Add_1, ///< A one-byte add fixup. - FK_Data_Add_2, ///< A two-byte add fixup. 
- FK_Data_Add_4, ///< A four-byte add fixup. - FK_Data_Add_8, ///< A eight-byte add fixup. - FK_Data_Sub_1, ///< A one-byte sub fixup. - FK_Data_Sub_2, ///< A two-byte sub fixup. - FK_Data_Sub_4, ///< A four-byte sub fixup. - FK_Data_Sub_8, ///< A eight-byte sub fixup. + FK_NONE = 0, ///< A no-op fixup. + FK_Data_1, ///< A one-byte fixup. + FK_Data_2, ///< A two-byte fixup. + FK_Data_4, ///< A four-byte fixup. + FK_Data_8, ///< A eight-byte fixup. + FK_Data_6b, ///< A six-bits fixup. + FK_PCRel_1, ///< A one-byte pc relative fixup. + FK_PCRel_2, ///< A two-byte pc relative fixup. + FK_PCRel_4, ///< A four-byte pc relative fixup. + FK_PCRel_8, ///< A eight-byte pc relative fixup. + FK_GPRel_1, ///< A one-byte gp relative fixup. + FK_GPRel_2, ///< A two-byte gp relative fixup. + FK_GPRel_4, ///< A four-byte gp relative fixup. + FK_GPRel_8, ///< A eight-byte gp relative fixup. + FK_DTPRel_4, ///< A four-byte dtp relative fixup. + FK_DTPRel_8, ///< A eight-byte dtp relative fixup. + FK_TPRel_4, ///< A four-byte tp relative fixup. + FK_TPRel_8, ///< A eight-byte tp relative fixup. + FK_SecRel_1, ///< A one-byte section relative fixup. + FK_SecRel_2, ///< A two-byte section relative fixup. + FK_SecRel_4, ///< A four-byte section relative fixup. + FK_SecRel_8, ///< A eight-byte section relative fixup. + FK_Data_Add_1, ///< A one-byte add fixup. + FK_Data_Add_2, ///< A two-byte add fixup. + FK_Data_Add_4, ///< A four-byte add fixup. + FK_Data_Add_8, ///< A eight-byte add fixup. + FK_Data_Add_6b, ///< A six-bits add fixup. + FK_Data_Sub_1, ///< A one-byte sub fixup. + FK_Data_Sub_2, ///< A two-byte sub fixup. + FK_Data_Sub_4, ///< A four-byte sub fixup. + FK_Data_Sub_8, ///< A eight-byte sub fixup. + FK_Data_Sub_6b, ///< A six-bits sub fixup. FirstTargetFixupKind = 128, @@ -75,25 +78,25 @@ class MCFixup { /// The value to put into the fixup location. The exact interpretation of the /// expression is target dependent, usually it will be one of the operands to /// an instruction or an assembler directive. - const MCExpr *Value; + const MCExpr *Value = nullptr; /// The byte index of start of the relocation inside the MCFragment. - uint32_t Offset; + uint32_t Offset = 0; /// The target dependent kind of fixup item this is. The kind is used to /// determine how the operand value should be encoded into the instruction. - unsigned Kind; + MCFixupKind Kind = FK_NONE; /// The source location which gave rise to the fixup, if any. 
SMLoc Loc; public: static MCFixup create(uint32_t Offset, const MCExpr *Value, MCFixupKind Kind, SMLoc Loc = SMLoc()) { - assert(unsigned(Kind) < MaxTargetFixupKind && "Kind out of range!"); + assert(Kind < MaxTargetFixupKind && "Kind out of range!"); MCFixup FI; FI.Value = Value; FI.Offset = Offset; - FI.Kind = unsigned(Kind); + FI.Kind = Kind; FI.Loc = Loc; return FI; } @@ -104,7 +107,7 @@ public: MCFixup FI; FI.Value = Fixup.getValue(); FI.Offset = Fixup.getOffset(); - FI.Kind = (unsigned)getAddKindForKind(Fixup.getKind()); + FI.Kind = getAddKindForKind(Fixup.getKind()); FI.Loc = Fixup.getLoc(); return FI; } @@ -115,12 +118,14 @@ public: MCFixup FI; FI.Value = Fixup.getValue(); FI.Offset = Fixup.getOffset(); - FI.Kind = (unsigned)getSubKindForKind(Fixup.getKind()); + FI.Kind = getSubKindForKind(Fixup.getKind()); FI.Loc = Fixup.getLoc(); return FI; } - MCFixupKind getKind() const { return MCFixupKind(Kind); } + MCFixupKind getKind() const { return Kind; } + + unsigned getTargetKind() const { return Kind; } uint32_t getOffset() const { return Offset; } void setOffset(uint32_t Value) { Offset = Value; } @@ -129,37 +134,63 @@ public: /// Return the generic fixup kind for a value with the given size. It /// is an error to pass an unsupported size. - static MCFixupKind getKindForSize(unsigned Size, bool isPCRel) { + static MCFixupKind getKindForSize(unsigned Size, bool IsPCRel) { switch (Size) { default: llvm_unreachable("Invalid generic fixup size!"); - case 1: return isPCRel ? FK_PCRel_1 : FK_Data_1; - case 2: return isPCRel ? FK_PCRel_2 : FK_Data_2; - case 4: return isPCRel ? FK_PCRel_4 : FK_Data_4; - case 8: return isPCRel ? FK_PCRel_8 : FK_Data_8; + case 1: + return IsPCRel ? FK_PCRel_1 : FK_Data_1; + case 2: + return IsPCRel ? FK_PCRel_2 : FK_Data_2; + case 4: + return IsPCRel ? FK_PCRel_4 : FK_Data_4; + case 8: + return IsPCRel ? FK_PCRel_8 : FK_Data_8; + } + } + + /// Return the generic fixup kind for a value with the given size in bits. + /// It is an error to pass an unsupported size. + static MCFixupKind getKindForSizeInBits(unsigned Size, bool IsPCRel) { + switch (Size) { + default: + llvm_unreachable("Invalid generic fixup size!"); + case 6: + assert(!IsPCRel && "Invalid pc-relative fixup size!"); + return FK_Data_6b; + case 8: + return IsPCRel ? FK_PCRel_1 : FK_Data_1; + case 16: + return IsPCRel ? FK_PCRel_2 : FK_Data_2; + case 32: + return IsPCRel ? FK_PCRel_4 : FK_Data_4; + case 64: + return IsPCRel ? FK_PCRel_8 : FK_Data_8; } } /// Return the generic fixup kind for an addition with a given size. It /// is an error to pass an unsupported size. - static MCFixupKind getAddKindForKind(unsigned Kind) { + static MCFixupKind getAddKindForKind(MCFixupKind Kind) { switch (Kind) { default: llvm_unreachable("Unknown type to convert!"); case FK_Data_1: return FK_Data_Add_1; case FK_Data_2: return FK_Data_Add_2; case FK_Data_4: return FK_Data_Add_4; case FK_Data_8: return FK_Data_Add_8; + case FK_Data_6b: return FK_Data_Add_6b; } } /// Return the generic fixup kind for an subtraction with a given size. It /// is an error to pass an unsupported size. 
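
getKindForSizeInBits above complements getKindForSize for targets that describe fixup widths in bits, including the new six-bit data fixups (which cannot be pc-relative). A small sketch (helper name hypothetical):

#include <cstdint>
#include "llvm/MC/MCFixup.h"

// Build a data fixup for a field of SizeInBits bits starting at Offset.
static llvm::MCFixup makeDataFixup(uint32_t Offset, const llvm::MCExpr *Value,
                                   unsigned SizeInBits) {
  return llvm::MCFixup::create(
      Offset, Value,
      llvm::MCFixup::getKindForSizeInBits(SizeInBits, /*IsPCRel=*/false));
}
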
- static MCFixupKind getSubKindForKind(unsigned Kind) { + static MCFixupKind getSubKindForKind(MCFixupKind Kind) { switch (Kind) { default: llvm_unreachable("Unknown type to convert!"); case FK_Data_1: return FK_Data_Sub_1; case FK_Data_2: return FK_Data_Sub_2; case FK_Data_4: return FK_Data_Sub_4; case FK_Data_8: return FK_Data_Sub_8; + case FK_Data_6b: return FK_Data_Sub_6b; } } diff --git a/include/llvm/MC/MCFragment.h b/include/llvm/MC/MCFragment.h index aadf2ce725ea..b0def566c46a 100644 --- a/include/llvm/MC/MCFragment.h +++ b/include/llvm/MC/MCFragment.h @@ -149,6 +149,7 @@ public: case MCFragment::FT_CompactEncodedInst: case MCFragment::FT_Data: case MCFragment::FT_Dwarf: + case MCFragment::FT_DwarfFrame: return true; } } @@ -232,7 +233,8 @@ public: static bool classof(const MCFragment *F) { MCFragment::FragmentType Kind = F->getKind(); return Kind == MCFragment::FT_Relaxable || Kind == MCFragment::FT_Data || - Kind == MCFragment::FT_CVDefRange || Kind == MCFragment::FT_Dwarf;; + Kind == MCFragment::FT_CVDefRange || Kind == MCFragment::FT_Dwarf || + Kind == MCFragment::FT_DwarfFrame; } }; @@ -543,27 +545,21 @@ public: } }; -class MCDwarfCallFrameFragment : public MCFragment { +class MCDwarfCallFrameFragment : public MCEncodedFragmentWithFixups<8, 1> { /// AddrDelta - The expression for the difference of the two symbols that /// make up the address delta between two .cfi_* dwarf directives. const MCExpr *AddrDelta; - SmallString<8> Contents; - public: MCDwarfCallFrameFragment(const MCExpr &AddrDelta, MCSection *Sec = nullptr) - : MCFragment(FT_DwarfFrame, false, Sec), AddrDelta(&AddrDelta) { - Contents.push_back(0); - } + : MCEncodedFragmentWithFixups<8, 1>(FT_DwarfFrame, false, Sec), + AddrDelta(&AddrDelta) {} /// \name Accessors /// @{ const MCExpr &getAddrDelta() const { return *AddrDelta; } - SmallString<8> &getContents() { return Contents; } - const SmallString<8> &getContents() const { return Contents; } - /// @} static bool classof(const MCFragment *F) { diff --git a/include/llvm/MC/MCInstPrinter.h b/include/llvm/MC/MCInstPrinter.h index 6bbc4bc2903b..4501ce3084c8 100644 --- a/include/llvm/MC/MCInstPrinter.h +++ b/include/llvm/MC/MCInstPrinter.h @@ -87,12 +87,10 @@ public: /// Utility functions to make adding mark ups simpler. StringRef markup(StringRef s) const; - StringRef markup(StringRef a, StringRef b) const; bool getPrintImmHex() const { return PrintImmHex; } void setPrintImmHex(bool Value) { PrintImmHex = Value; } - HexStyle::Style getPrintHexStyle() const { return PrintHexStyle; } void setPrintHexStyle(HexStyle::Style Value) { PrintHexStyle = Value; } /// Utility function to print immediates in decimal or hex. diff --git a/include/llvm/MC/MCInstrAnalysis.h b/include/llvm/MC/MCInstrAnalysis.h index dfefd7e72777..898ca47b13b8 100644 --- a/include/llvm/MC/MCInstrAnalysis.h +++ b/include/llvm/MC/MCInstrAnalysis.h @@ -152,6 +152,12 @@ public: evaluateBranch(const MCInst &Inst, uint64_t Addr, uint64_t Size, uint64_t &Target) const; + /// Given an instruction tries to get the address of a memory operand. Returns + /// the address on success. + virtual Optional evaluateMemoryOperandAddress(const MCInst &Inst, + uint64_t Addr, + uint64_t Size) const; + /// Returns (PLT virtual address, GOT virtual address) pairs for PLT entries. 
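
The evaluateMemoryOperandAddress hook added in the MCInstrAnalysis.h hunk above lets disassembler-based tools ask the target whether an instruction's memory operand resolves to a known address. A caller sketch (auto is used since the Optional element type is elided in the hunk as shown; the helper is hypothetical):

#include <cstdint>
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCInstrAnalysis.h"

// Addr is the instruction address, Size its byte length, as in evaluateBranch.
static void noteMemoryTarget(const llvm::MCInstrAnalysis &MIA,
                             const llvm::MCInst &Inst, uint64_t Addr,
                             uint64_t Size) {
  if (auto Target = MIA.evaluateMemoryOperandAddress(Inst, Addr, Size)) {
    // e.g. hand *Target to a symbolizer to annotate the disassembly
    (void)*Target;
  }
}
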
virtual std::vector> findPltEntries(uint64_t PltSectionVA, ArrayRef PltContents, diff --git a/include/llvm/MC/MCInstrDesc.h b/include/llvm/MC/MCInstrDesc.h index 0aa586dfc901..e75a27614a22 100644 --- a/include/llvm/MC/MCInstrDesc.h +++ b/include/llvm/MC/MCInstrDesc.h @@ -56,7 +56,11 @@ enum OperandType { OPERAND_GENERIC_5 = 11, OPERAND_LAST_GENERIC = 11, - OPERAND_FIRST_TARGET = 12, + OPERAND_FIRST_GENERIC_IMM = 12, + OPERAND_GENERIC_IMM_0 = 12, + OPERAND_LAST_GENERIC_IMM = 12, + + OPERAND_FIRST_TARGET = 13, }; } @@ -103,6 +107,16 @@ public: assert(isGenericType() && "non-generic types don't have an index"); return OperandType - MCOI::OPERAND_FIRST_GENERIC; } + + bool isGenericImm() const { + return OperandType >= MCOI::OPERAND_FIRST_GENERIC_IMM && + OperandType <= MCOI::OPERAND_LAST_GENERIC_IMM; + } + + unsigned getGenericImmIndex() const { + assert(isGenericImm() && "non-generic immediates don't have an index"); + return OperandType - MCOI::OPERAND_FIRST_GENERIC_IMM; + } }; //===----------------------------------------------------------------------===// @@ -115,7 +129,8 @@ namespace MCID { /// not use these directly. These all correspond to bitfields in the /// MCInstrDesc::Flags field. enum Flag { - Variadic = 0, + PreISelOpcode = 0, + Variadic, HasOptionalDef, Pseudo, Return, @@ -228,6 +243,10 @@ public: /// Return flags of this instruction. uint64_t getFlags() const { return Flags; } + /// \returns true if this instruction is emitted before instruction selection + /// and should be legalized/regbankselected/selected. + bool isPreISelOpcode() const { return Flags & (1ULL << MCID::PreISelOpcode); } + /// Return true if this instruction can have a variable number of /// operands. In this case, the variable operands will be after the normal /// operands but before the implicit definitions and uses (if any are diff --git a/include/llvm/MC/MCLinkerOptimizationHint.h b/include/llvm/MC/MCLinkerOptimizationHint.h index f2a1364ad884..003491f32f75 100644 --- a/include/llvm/MC/MCLinkerOptimizationHint.h +++ b/include/llvm/MC/MCLinkerOptimizationHint.h @@ -61,6 +61,7 @@ static inline int MCLOHNameToId(StringRef Name) { MCLOHCaseNameToId(AdrpAdd) MCLOHCaseNameToId(AdrpLdrGot) .Default(-1); +#undef MCLOHCaseNameToId } static inline StringRef MCLOHIdToName(MCLOHType Kind) { @@ -76,6 +77,7 @@ static inline StringRef MCLOHIdToName(MCLOHType Kind) { MCLOHCaseIdToName(AdrpLdrGot); } return StringRef(); +#undef MCLOHCaseIdToName } static inline int MCLOHIdToNbArgs(MCLOHType Kind) { diff --git a/include/llvm/MC/MCRegister.h b/include/llvm/MC/MCRegister.h new file mode 100644 index 000000000000..8372947a4ba1 --- /dev/null +++ b/include/llvm/MC/MCRegister.h @@ -0,0 +1,110 @@ +//===-- llvm/MC/Register.h --------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_MC_REGISTER_H +#define LLVM_MC_REGISTER_H + +#include "llvm/ADT/DenseMapInfo.h" +#include + +namespace llvm { + +/// An unsigned integer type large enough to represent all physical registers, +/// but not necessarily virtual registers. +using MCPhysReg = uint16_t; + +/// Wrapper class representing physical registers. Should be passed by value. 
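
The MCInstrDesc additions above expose a pre-ISel flag and a generic-immediate operand kind to generic code. A query sketch (MII and Opcode are assumed to come from the usual MC setup; helper name hypothetical):

#include "llvm/MC/MCInstrDesc.h"
#include "llvm/MC/MCInstrInfo.h"

// True if Opcode is a pre-ISel (GlobalISel generic) opcode with at least one
// generic immediate operand.
static bool isGenericOpcodeWithImm(const llvm::MCInstrInfo &MII,
                                   unsigned Opcode) {
  const llvm::MCInstrDesc &Desc = MII.get(Opcode);
  if (!Desc.isPreISelOpcode())
    return false;
  for (const llvm::MCOperandInfo &OpInfo : Desc.operands())
    if (OpInfo.isGenericImm())
      return true;
  return false;
}
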
+class MCRegister { + unsigned Reg; + +public: + MCRegister(unsigned Val = 0): Reg(Val) {} + + // Register numbers can represent physical registers, virtual registers, and + // sometimes stack slots. The unsigned values are divided into these ranges: + // + // 0 Not a register, can be used as a sentinel. + // [1;2^30) Physical registers assigned by TableGen. + // [2^30;2^31) Stack slots. (Rarely used.) + // [2^31;2^32) Virtual registers assigned by MachineRegisterInfo. + // + // Further sentinels can be allocated from the small negative integers. + // DenseMapInfo uses -1u and -2u. + + /// This is the portion of the positive number space that is not a physical + /// register. StackSlot values do not exist in the MC layer, see + /// Register::isStackSlot() for the more information on them. + /// + /// Note that isVirtualRegister() and isPhysicalRegister() cannot handle stack + /// slots, so if a variable may contains a stack slot, always check + /// isStackSlot() first. + static bool isStackSlot(unsigned Reg) { + return int(Reg) >= (1 << 30); + } + + /// Return true if the specified register number is in + /// the physical register namespace. + static bool isPhysicalRegister(unsigned Reg) { + assert(!isStackSlot(Reg) && "Not a register! Check isStackSlot() first."); + return int(Reg) > 0; + } + + /// Return true if the specified register number is in the physical register + /// namespace. + bool isPhysical() const { + return isPhysicalRegister(Reg); + } + + operator unsigned() const { + return Reg; + } + + unsigned id() const { + return Reg; + } + + bool isValid() const { + return Reg != 0; + } + + /// Comparisons between register objects + bool operator==(const MCRegister &Other) const { return Reg == Other.Reg; } + bool operator!=(const MCRegister &Other) const { return Reg != Other.Reg; } + + /// Comparisons against register constants. E.g. + /// * R == AArch64::WZR + /// * R == 0 + /// * R == VirtRegMap::NO_PHYS_REG + bool operator==(unsigned Other) const { return Reg == Other; } + bool operator!=(unsigned Other) const { return Reg != Other; } + bool operator==(int Other) const { return Reg == unsigned(Other); } + bool operator!=(int Other) const { return Reg != unsigned(Other); } + // MSVC requires that we explicitly declare these two as well. + bool operator==(MCPhysReg Other) const { return Reg == unsigned(Other); } + bool operator!=(MCPhysReg Other) const { return Reg != unsigned(Other); } +}; + +// Provide DenseMapInfo for MCRegister +template<> struct DenseMapInfo { + static inline unsigned getEmptyKey() { + return DenseMapInfo::getEmptyKey(); + } + static inline unsigned getTombstoneKey() { + return DenseMapInfo::getTombstoneKey(); + } + static unsigned getHashValue(const MCRegister &Val) { + return DenseMapInfo::getHashValue(Val.id()); + } + static bool isEqual(const MCRegister &LHS, const MCRegister &RHS) { + return DenseMapInfo::isEqual(LHS.id(), RHS.id()); + } +}; + +} + +#endif // ifndef LLVM_MC_REGISTER_H diff --git a/include/llvm/MC/MCRegisterInfo.h b/include/llvm/MC/MCRegisterInfo.h index 92d39c3fcfb7..c7dc56ea588e 100644 --- a/include/llvm/MC/MCRegisterInfo.h +++ b/include/llvm/MC/MCRegisterInfo.h @@ -18,16 +18,13 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/iterator_range.h" #include "llvm/MC/LaneBitmask.h" +#include "llvm/MC/MCRegister.h" #include #include #include namespace llvm { -/// An unsigned integer type large enough to represent all physical registers, -/// but not necessarily virtual registers. 
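
MCRegister is deliberately a thin value type: implicitly convertible to unsigned, comparable against register enum constants, and (via the DenseMapInfo specialization above) usable directly as a DenseMap key. A small sketch (the counting helper is illustrative):

#include "llvm/ADT/DenseMap.h"
#include "llvm/MC/MCRegister.h"

// Count how often each physical register is seen; MCRegister() is the
// "no register" sentinel and is skipped.
static void countReg(llvm::MCRegister Reg,
                     llvm::DenseMap<llvm::MCRegister, unsigned> &Counts) {
  if (!Reg.isValid())
    return;
  ++Counts[Reg];
}
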
-using MCPhysReg = uint16_t; - /// MCRegisterClass - Base class of TargetRegisterClass. class MCRegisterClass { public: @@ -65,16 +62,17 @@ public: /// contains - Return true if the specified register is included in this /// register class. This does not include virtual registers. - bool contains(unsigned Reg) const { - unsigned InByte = Reg % 8; - unsigned Byte = Reg / 8; + bool contains(MCRegister Reg) const { + unsigned RegNo = unsigned(Reg); + unsigned InByte = RegNo % 8; + unsigned Byte = RegNo / 8; if (Byte >= RegSetSize) return false; return (RegSet[Byte] & (1 << InByte)) != 0; } /// contains - Return true if both registers are in this class. - bool contains(unsigned Reg1, unsigned Reg2) const { + bool contains(MCRegister Reg1, MCRegister Reg2) const { return contains(Reg1) && contains(Reg2); } @@ -148,8 +146,8 @@ public: private: const MCRegisterDesc *Desc; // Pointer to the descriptor array unsigned NumRegs; // Number of entries in the array - unsigned RAReg; // Return address register - unsigned PCReg; // Program counter register + MCRegister RAReg; // Return address register + MCRegister PCReg; // Program counter register const MCRegisterClass *Classes; // Pointer to the regclass array unsigned NumClasses; // Number of entries in the array unsigned NumRegUnits; // Number of regunits. @@ -175,8 +173,8 @@ private: const DwarfLLVMRegPair *EHL2DwarfRegs; // LLVM to Dwarf regs mapping EH const DwarfLLVMRegPair *Dwarf2LRegs; // Dwarf to LLVM regs mapping const DwarfLLVMRegPair *EHDwarf2LRegs; // Dwarf to LLVM regs mapping EH - DenseMap L2SEHRegs; // LLVM to SEH regs mapping - DenseMap L2CVRegs; // LLVM to CV regs mapping + DenseMap L2SEHRegs; // LLVM to SEH regs mapping + DenseMap L2CVRegs; // LLVM to CV regs mapping public: /// DiffListIterator - Base iterator class that can traverse the @@ -202,7 +200,7 @@ public: /// advance - Move to the next list position, return the applied /// differential. This function does not detect the end of the list, that /// is the caller's responsibility (by checking for a 0 return value). - unsigned advance() { + MCRegister advance() { assert(isValid() && "Cannot move off the end of the list."); MCPhysReg D = *List++; Val += D; @@ -214,7 +212,7 @@ public: bool isValid() const { return List; } /// Dereference the iterator to get the value at the current position. - unsigned operator*() const { return Val; } + MCRegister operator*() const { return Val; } /// Pre-increment to move to the next position. void operator++() { @@ -309,26 +307,26 @@ public: /// as the LLVM register number. /// FIXME: TableGen these numbers. Currently this requires target specific /// initialization code. - void mapLLVMRegToSEHReg(unsigned LLVMReg, int SEHReg) { + void mapLLVMRegToSEHReg(MCRegister LLVMReg, int SEHReg) { L2SEHRegs[LLVMReg] = SEHReg; } - void mapLLVMRegToCVReg(unsigned LLVMReg, int CVReg) { + void mapLLVMRegToCVReg(MCRegister LLVMReg, int CVReg) { L2CVRegs[LLVMReg] = CVReg; } /// This method should return the register where the return /// address can be found. - unsigned getRARegister() const { + MCRegister getRARegister() const { return RAReg; } /// Return the register which is the program counter. 
- unsigned getProgramCounter() const { + MCRegister getProgramCounter() const { return PCReg; } - const MCRegisterDesc &operator[](unsigned RegNo) const { + const MCRegisterDesc &operator[](MCRegister RegNo) const { assert(RegNo < NumRegs && "Attempting to access record for invalid register number!"); return Desc[RegNo]; @@ -336,24 +334,24 @@ public: /// Provide a get method, equivalent to [], but more useful with a /// pointer to this object. - const MCRegisterDesc &get(unsigned RegNo) const { + const MCRegisterDesc &get(MCRegister RegNo) const { return operator[](RegNo); } /// Returns the physical register number of sub-register "Index" /// for physical register RegNo. Return zero if the sub-register does not /// exist. - unsigned getSubReg(unsigned Reg, unsigned Idx) const; + MCRegister getSubReg(MCRegister Reg, unsigned Idx) const; /// Return a super-register of the specified register /// Reg so its sub-register of index SubIdx is Reg. - unsigned getMatchingSuperReg(unsigned Reg, unsigned SubIdx, - const MCRegisterClass *RC) const; + MCRegister getMatchingSuperReg(MCRegister Reg, unsigned SubIdx, + const MCRegisterClass *RC) const; /// For a given register pair, return the sub-register index /// if the second register is a sub-register of the first. Return zero /// otherwise. - unsigned getSubRegIndex(unsigned RegNo, unsigned SubRegNo) const; + unsigned getSubRegIndex(MCRegister RegNo, MCRegister SubRegNo) const; /// Get the size of the bit range covered by a sub-register index. /// If the index isn't continuous, return the sum of the sizes of its parts. @@ -367,7 +365,7 @@ public: /// Return the human-readable symbolic target-specific name for the /// specified physical register. - const char *getName(unsigned RegNo) const { + const char *getName(MCRegister RegNo) const { return RegStrings + get(RegNo).Name; } @@ -395,15 +393,11 @@ public: /// number. Returns -1 if there is no equivalent value. The second /// parameter allows targets to use different numberings for EH info and /// debugging info. - int getDwarfRegNum(unsigned RegNum, bool isEH) const; - - /// Map a dwarf register back to a target register. - int getLLVMRegNum(unsigned RegNum, bool isEH) const; + int getDwarfRegNum(MCRegister RegNum, bool isEH) const; - /// Map a DWARF EH register back to a target register (same as - /// getLLVMRegNum(RegNum, true)) but return -1 if there is no mapping, - /// rather than asserting that there must be one. - int getLLVMRegNumFromEH(unsigned RegNum) const; + /// Map a dwarf register back to a target register. Returns None is there is + /// no mapping. + Optional getLLVMRegNum(unsigned RegNum, bool isEH) const; /// Map a target EH register number to an equivalent DWARF register /// number. @@ -411,11 +405,11 @@ public: /// Map a target register to an equivalent SEH register /// number. Returns LLVM register number if there is no equivalent value. - int getSEHRegNum(unsigned RegNum) const; + int getSEHRegNum(MCRegister RegNum) const; /// Map a target register to an equivalent CodeView register /// number. 
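
getLLVMRegNum above now returns an Optional instead of asserting (and the separate EH variant is gone), so unwinder and debug-info consumers must handle unknown DWARF numbers explicitly. A sketch using auto for the elided Optional element type (helper hypothetical):

#include "llvm/MC/MCRegisterInfo.h"

// Translate a DWARF register number, falling back to the "no register" value.
static llvm::MCRegister dwarfToLLVMRegOrNone(const llvm::MCRegisterInfo &MRI,
                                             unsigned DwarfReg, bool IsEH) {
  if (auto LLVMReg = MRI.getLLVMRegNum(DwarfReg, IsEH))
    return llvm::MCRegister(*LLVMReg);
  return llvm::MCRegister();
}
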
- int getCodeViewRegNum(unsigned RegNum) const; + int getCodeViewRegNum(MCRegister RegNum) const; regclass_iterator regclass_begin() const { return Classes; } regclass_iterator regclass_end() const { return Classes+NumClasses; } @@ -439,34 +433,34 @@ public: } /// Returns the encoding for RegNo - uint16_t getEncodingValue(unsigned RegNo) const { + uint16_t getEncodingValue(MCRegister RegNo) const { assert(RegNo < NumRegs && "Attempting to get encoding for invalid register number!"); return RegEncodingTable[RegNo]; } /// Returns true if RegB is a sub-register of RegA. - bool isSubRegister(unsigned RegA, unsigned RegB) const { + bool isSubRegister(MCRegister RegA, MCRegister RegB) const { return isSuperRegister(RegB, RegA); } /// Returns true if RegB is a super-register of RegA. - bool isSuperRegister(unsigned RegA, unsigned RegB) const; + bool isSuperRegister(MCRegister RegA, MCRegister RegB) const; /// Returns true if RegB is a sub-register of RegA or if RegB == RegA. - bool isSubRegisterEq(unsigned RegA, unsigned RegB) const { + bool isSubRegisterEq(MCRegister RegA, MCRegister RegB) const { return isSuperRegisterEq(RegB, RegA); } /// Returns true if RegB is a super-register of RegA or if /// RegB == RegA. - bool isSuperRegisterEq(unsigned RegA, unsigned RegB) const { + bool isSuperRegisterEq(MCRegister RegA, MCRegister RegB) const { return RegA == RegB || isSuperRegister(RegA, RegB); } /// Returns true if RegB is a super-register or sub-register of RegA /// or if RegB == RegA. - bool isSuperOrSubRegisterEq(unsigned RegA, unsigned RegB) const { + bool isSuperOrSubRegisterEq(MCRegister RegA, MCRegister RegB) const { return isSubRegisterEq(RegA, RegB) || isSuperRegister(RegA, RegB); } }; @@ -482,8 +476,8 @@ public: /// If IncludeSelf is set, Reg itself is included in the list. class MCSubRegIterator : public MCRegisterInfo::DiffListIterator { public: - MCSubRegIterator(unsigned Reg, const MCRegisterInfo *MCRI, - bool IncludeSelf = false) { + MCSubRegIterator(MCRegister Reg, const MCRegisterInfo *MCRI, + bool IncludeSelf = false) { init(Reg, MCRI->DiffLists + MCRI->get(Reg).SubRegs); // Initially, the iterator points to Reg itself. if (!IncludeSelf) @@ -500,13 +494,13 @@ class MCSubRegIndexIterator { public: /// Constructs an iterator that traverses subregisters and their /// associated subregister indices. - MCSubRegIndexIterator(unsigned Reg, const MCRegisterInfo *MCRI) + MCSubRegIndexIterator(MCRegister Reg, const MCRegisterInfo *MCRI) : SRIter(Reg, MCRI) { SRIndex = MCRI->SubRegIndices + MCRI->get(Reg).SubRegIndices; } /// Returns current sub-register. - unsigned getSubReg() const { + MCRegister getSubReg() const { return *SRIter; } @@ -531,7 +525,7 @@ class MCSuperRegIterator : public MCRegisterInfo::DiffListIterator { public: MCSuperRegIterator() = default; - MCSuperRegIterator(unsigned Reg, const MCRegisterInfo *MCRI, + MCSuperRegIterator(MCRegister Reg, const MCRegisterInfo *MCRI, bool IncludeSelf = false) { init(Reg, MCRI->DiffLists + MCRI->get(Reg).SuperRegs); // Initially, the iterator points to Reg itself. @@ -542,7 +536,7 @@ public: // Definition for isSuperRegister. Put it down here since it needs the // iterator defined above in addition to the MCRegisterInfo class itself. -inline bool MCRegisterInfo::isSuperRegister(unsigned RegA, unsigned RegB) const{ +inline bool MCRegisterInfo::isSuperRegister(MCRegister RegA, MCRegister RegB) const{ for (MCSuperRegIterator I(RegA, this); I.isValid(); ++I) if (*I == RegB) return true; @@ -569,7 +563,7 @@ public: /// in Reg. 
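
The register iterators above now take MCRegister, but their shape is unchanged: construct with a register and an MCRegisterInfo, then walk until isValid() goes false. A sketch over sub-registers (helper hypothetical):

#include "llvm/MC/MCRegisterInfo.h"

// True if RC contains Reg or any sub-register of Reg.
static bool classContainsSubRegOf(const llvm::MCRegisterClass &RC,
                                  llvm::MCRegister Reg,
                                  const llvm::MCRegisterInfo &MRI) {
  for (llvm::MCSubRegIterator SubReg(Reg, &MRI, /*IncludeSelf=*/true);
       SubReg.isValid(); ++SubReg)
    if (RC.contains(*SubReg))
      return true;
  return false;
}
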
MCRegUnitIterator() = default; - MCRegUnitIterator(unsigned Reg, const MCRegisterInfo *MCRI) { + MCRegUnitIterator(MCRegister Reg, const MCRegisterInfo *MCRI) { assert(Reg && "Null register has no regunits"); // Decode the RegUnits MCRegisterDesc field. unsigned RU = MCRI->get(Reg).RegUnits; @@ -600,7 +594,7 @@ public: /// Constructs an iterator that traverses the register units and their /// associated LaneMasks in Reg. - MCRegUnitMaskIterator(unsigned Reg, const MCRegisterInfo *MCRI) + MCRegUnitMaskIterator(MCRegister Reg, const MCRegisterInfo *MCRI) : RUIter(Reg, MCRI) { uint16_t Idx = MCRI->get(Reg).RegUnitLaneMasks; MaskListIter = &MCRI->RegUnitMaskSequences[Idx]; @@ -667,7 +661,7 @@ public: /// any ordering or that entries are unique. class MCRegAliasIterator { private: - unsigned Reg; + MCRegister Reg; const MCRegisterInfo *MCRI; bool IncludeSelf; @@ -676,7 +670,7 @@ private: MCSuperRegIterator SI; public: - MCRegAliasIterator(unsigned Reg, const MCRegisterInfo *MCRI, + MCRegAliasIterator(MCRegister Reg, const MCRegisterInfo *MCRI, bool IncludeSelf) : Reg(Reg), MCRI(MCRI), IncludeSelf(IncludeSelf) { // Initialize the iterators. @@ -692,7 +686,7 @@ public: bool isValid() const { return RI.isValid(); } - unsigned operator*() const { + MCRegister operator*() const { assert(SI.isValid() && "Cannot dereference an invalid iterator."); return *SI; } diff --git a/include/llvm/MC/MCSection.h b/include/llvm/MC/MCSection.h index 6fad1ec2069c..d057feda87d8 100644 --- a/include/llvm/MC/MCSection.h +++ b/include/llvm/MC/MCSection.h @@ -17,6 +17,7 @@ #include "llvm/ADT/ilist.h" #include "llvm/MC/MCFragment.h" #include "llvm/MC/SectionKind.h" +#include "llvm/Support/Alignment.h" #include #include @@ -58,7 +59,7 @@ private: MCSymbol *Begin; MCSymbol *End = nullptr; /// The alignment requirement of this section. - unsigned Alignment = 1; + Align Alignment; /// The section index in the assemblers section list. unsigned Ordinal = 0; /// The index of this section in the layout order. @@ -117,8 +118,8 @@ public: MCSymbol *getEndSymbol(MCContext &Ctx); bool hasEnded() const; - unsigned getAlignment() const { return Alignment; } - void setAlignment(unsigned Value) { Alignment = Value; } + unsigned getAlignment() const { return Alignment.value(); } + void setAlignment(Align Value) { Alignment = Value; } unsigned getOrdinal() const { return Ordinal; } void setOrdinal(unsigned Value) { Ordinal = Value; } diff --git a/include/llvm/MC/MCSectionXCOFF.h b/include/llvm/MC/MCSectionXCOFF.h index 2a3f391fd3e2..ee302ed5ecec 100644 --- a/include/llvm/MC/MCSectionXCOFF.h +++ b/include/llvm/MC/MCSectionXCOFF.h @@ -23,16 +23,30 @@ class MCSymbol; // This class represents an XCOFF `Control Section`, more commonly referred to // as a csect. A csect represents the smallest possible unit of data/code which -// will be relocated as a single block. +// will be relocated as a single block. A csect can either be: +// 1) Initialized: The Type will be XTY_SD, and the symbols inside the csect +// will have a label definition representing their offset within the csect. +// 2) Uninitialized: The Type will be XTY_CM, it will contain a single symbol, +// and may not contain label definitions. +// 3) An external reference providing a symbol table entry for a symbol +// contained in another XCOFF object file. External reference csects are not +// implemented yet. 
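MCSection now stores its alignment as an Align while the getter still reports raw bytes. A minimal sketch of the updated pair; Sec is a placeholder for a section obtained from MCContext elsewhere:

  #include "llvm/MC/MCSection.h"
  #include "llvm/Support/Alignment.h"
  using namespace llvm;

  // Request 16-byte alignment; the setter takes an Align, the getter still
  // returns the alignment in bytes.
  void alignTo16(MCSection &Sec) {
    Sec.setAlignment(Align(16));
    unsigned Bytes = Sec.getAlignment(); // 16
    (void)Bytes;
  }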
class MCSectionXCOFF final : public MCSection { friend class MCContext; StringRef Name; XCOFF::StorageMappingClass MappingClass; + XCOFF::SymbolType Type; + XCOFF::StorageClass StorageClass; MCSectionXCOFF(StringRef Section, XCOFF::StorageMappingClass SMC, - SectionKind K, MCSymbol *Begin) - : MCSection(SV_XCOFF, K, Begin), Name(Section), MappingClass(SMC) {} + XCOFF::SymbolType ST, XCOFF::StorageClass SC, SectionKind K, + MCSymbol *Begin) + : MCSection(SV_XCOFF, K, Begin), Name(Section), MappingClass(SMC), + Type(ST), StorageClass(SC) { + assert((ST == XCOFF::XTY_SD || ST == XCOFF::XTY_CM) && + "Invalid or unhandled type for csect."); + } public: ~MCSectionXCOFF(); @@ -43,6 +57,8 @@ public: StringRef getSectionName() const { return Name; } XCOFF::StorageMappingClass getMappingClass() const { return MappingClass; } + XCOFF::StorageClass getStorageClass() const { return StorageClass; } + XCOFF::SymbolType getCSectType() const { return Type; } void PrintSwitchToSection(const MCAsmInfo &MAI, const Triple &T, raw_ostream &OS, diff --git a/include/llvm/MC/MCStreamer.h b/include/llvm/MC/MCStreamer.h index 731e7515448c..6b48580ae57c 100644 --- a/include/llvm/MC/MCStreamer.h +++ b/include/llvm/MC/MCStreamer.h @@ -46,6 +46,7 @@ struct MCDwarfFrameInfo; class MCExpr; class MCInst; class MCInstPrinter; +class MCRegister; class MCSection; class MCStreamer; class MCSymbolRefExpr; @@ -53,6 +54,13 @@ class MCSubtargetInfo; class raw_ostream; class Twine; +namespace codeview { +struct DefRangeRegisterRelHeader; +struct DefRangeSubfieldRegisterHeader; +struct DefRangeRegisterHeader; +struct DefRangeFramePointerRelHeader; +} + using MCSectionSubPair = std::pair; /// Target specific streamer interface. This is used so that targets can @@ -536,6 +544,15 @@ public: /// \param Symbol - Symbol the image relative relocation should point to. virtual void EmitCOFFImgRel32(MCSymbol const *Symbol, int64_t Offset); + /// Emits an lcomm directive with XCOFF csect information. + /// + /// \param Symbol - The symbol we are emiting. + /// \param Size - The size of the block of storage. + /// \param ByteAlignment - The alignment of the symbol in bytes. Must be a power + /// of 2. + virtual void EmitXCOFFLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size, + unsigned ByteAlignment); + /// Emit an ELF .size directive. /// /// This corresponds to an assembler statement such as: @@ -860,6 +877,22 @@ public: ArrayRef> Ranges, StringRef FixedSizePortion); + virtual void EmitCVDefRangeDirective( + ArrayRef> Ranges, + codeview::DefRangeRegisterRelHeader DRHdr); + + virtual void EmitCVDefRangeDirective( + ArrayRef> Ranges, + codeview::DefRangeSubfieldRegisterHeader DRHdr); + + virtual void EmitCVDefRangeDirective( + ArrayRef> Ranges, + codeview::DefRangeRegisterHeader DRHdr); + + virtual void EmitCVDefRangeDirective( + ArrayRef> Ranges, + codeview::DefRangeFramePointerRelHeader DRHdr); + /// This implements the CodeView '.cv_stringtable' assembler directive. 
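A short caller-side sketch of the EmitXCOFFLocalCommonSymbol hook declared in this hunk, assuming a streamer and symbol owned by the caller; per the doc comment, the byte alignment must be a power of 2:

  #include "llvm/MC/MCStreamer.h"
  using namespace llvm;

  // Emit 64 bytes of local common (BSS-style) storage for Sym, aligned to 8.
  void emitLocalCommon(MCStreamer &Streamer, MCSymbol *Sym) {
    Streamer.EmitXCOFFLocalCommonSymbol(Sym, /*Size=*/64, /*ByteAlignment=*/8);
  }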
virtual void EmitCVStringTableDirective() {} @@ -917,13 +950,13 @@ public: virtual void EmitWinCFIFuncletOrFuncEnd(SMLoc Loc = SMLoc()); virtual void EmitWinCFIStartChained(SMLoc Loc = SMLoc()); virtual void EmitWinCFIEndChained(SMLoc Loc = SMLoc()); - virtual void EmitWinCFIPushReg(unsigned Register, SMLoc Loc = SMLoc()); - virtual void EmitWinCFISetFrame(unsigned Register, unsigned Offset, + virtual void EmitWinCFIPushReg(MCRegister Register, SMLoc Loc = SMLoc()); + virtual void EmitWinCFISetFrame(MCRegister Register, unsigned Offset, SMLoc Loc = SMLoc()); virtual void EmitWinCFIAllocStack(unsigned Size, SMLoc Loc = SMLoc()); - virtual void EmitWinCFISaveReg(unsigned Register, unsigned Offset, + virtual void EmitWinCFISaveReg(MCRegister Register, unsigned Offset, SMLoc Loc = SMLoc()); - virtual void EmitWinCFISaveXMM(unsigned Register, unsigned Offset, + virtual void EmitWinCFISaveXMM(MCRegister Register, unsigned Offset, SMLoc Loc = SMLoc()); virtual void EmitWinCFIPushFrame(bool Code, SMLoc Loc = SMLoc()); virtual void EmitWinCFIEndProlog(SMLoc Loc = SMLoc()); diff --git a/include/llvm/MC/MCSubtargetInfo.h b/include/llvm/MC/MCSubtargetInfo.h index 9490a6ecedad..09130c4641ef 100644 --- a/include/llvm/MC/MCSubtargetInfo.h +++ b/include/llvm/MC/MCSubtargetInfo.h @@ -221,6 +221,52 @@ public: auto Found = std::lower_bound(ProcDesc.begin(), ProcDesc.end(), CPU); return Found != ProcDesc.end() && StringRef(Found->Key) == CPU; } + + virtual unsigned getHwMode() const { return 0; } + + /// Return the cache size in bytes for the given level of cache. + /// Level is zero-based, so a value of zero means the first level of + /// cache. + /// + virtual Optional getCacheSize(unsigned Level) const; + + /// Return the cache associatvity for the given level of cache. + /// Level is zero-based, so a value of zero means the first level of + /// cache. + /// + virtual Optional getCacheAssociativity(unsigned Level) const; + + /// Return the target cache line size in bytes at a given level. + /// + virtual Optional getCacheLineSize(unsigned Level) const; + + /// Return the target cache line size in bytes. By default, return + /// the line size for the bottom-most level of cache. This provides + /// a more convenient interface for the common case where all cache + /// levels have the same line size. Return zero if there is no + /// cache model. + /// + virtual unsigned getCacheLineSize() const { + Optional Size = getCacheLineSize(0); + if (Size) + return *Size; + + return 0; + } + + /// Return the preferred prefetch distance in terms of instructions. + /// + virtual unsigned getPrefetchDistance() const; + + /// Return the maximum prefetch distance in terms of loop + /// iterations. + /// + virtual unsigned getMaxPrefetchIterationsAhead() const; + + /// Return the minimum stride necessary to trigger software + /// prefetching. 
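The cache queries added to MCSubtargetInfo return Optional values (their element types are elided in this rendering; the sketch assumes unsigned), while the no-argument getCacheLineSize() falls back to 0 when the target has no cache model. STI and reportL1 are illustrative names:

  #include "llvm/ADT/Optional.h"
  #include "llvm/MC/MCSubtargetInfo.h"
  #include "llvm/Support/raw_ostream.h"
  using namespace llvm;

  // Report the first-level cache geometry when the subtarget models it.
  // Cache levels are zero-based, so level 0 is the L1 cache.
  void reportL1(const MCSubtargetInfo &STI) {
    if (Optional<unsigned> Size = STI.getCacheSize(0))
      outs() << "L1 size: " << *Size << " bytes\n";
    outs() << "line size: " << STI.getCacheLineSize() << " bytes\n"; // 0 if unmodelled
  }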
+ /// + virtual unsigned getMinPrefetchStride() const; }; } // end namespace llvm diff --git a/include/llvm/MC/MCSymbolWasm.h b/include/llvm/MC/MCSymbolWasm.h index c50cd0ee4709..95beebe3f75a 100644 --- a/include/llvm/MC/MCSymbolWasm.h +++ b/include/llvm/MC/MCSymbolWasm.h @@ -54,6 +54,13 @@ public: modifyFlags(wasm::WASM_SYMBOL_EXPORTED, wasm::WASM_SYMBOL_EXPORTED); } + bool isNoStrip() const { + return getFlags() & wasm::WASM_SYMBOL_NO_STRIP; + } + void setNoStrip() const { + modifyFlags(wasm::WASM_SYMBOL_NO_STRIP, wasm::WASM_SYMBOL_NO_STRIP); + } + bool isWeak() const { return IsWeak; } void setWeak(bool isWeak) { IsWeak = isWeak; } diff --git a/include/llvm/MC/MCSymbolXCOFF.h b/include/llvm/MC/MCSymbolXCOFF.h index 0a1fe1475138..98ecd2466926 100644 --- a/include/llvm/MC/MCSymbolXCOFF.h +++ b/include/llvm/MC/MCSymbolXCOFF.h @@ -8,17 +8,49 @@ #ifndef LLVM_MC_MCSYMBOLXCOFF_H #define LLVM_MC_MCSYMBOLXCOFF_H +#include "llvm/ADT/Optional.h" #include "llvm/BinaryFormat/XCOFF.h" #include "llvm/MC/MCSymbol.h" namespace llvm { +class MCSectionXCOFF; + class MCSymbolXCOFF : public MCSymbol { public: MCSymbolXCOFF(const StringMapEntry *Name, bool isTemporary) : MCSymbol(SymbolKindXCOFF, Name, isTemporary) {} static bool classof(const MCSymbol *S) { return S->isXCOFF(); } + + void setStorageClass(XCOFF::StorageClass SC) { + assert((!StorageClass.hasValue() || StorageClass.getValue() == SC) && + "Redefining StorageClass of XCOFF MCSymbol."); + StorageClass = SC; + }; + + XCOFF::StorageClass getStorageClass() const { + assert(StorageClass.hasValue() && + "StorageClass not set on XCOFF MCSymbol."); + return StorageClass.getValue(); + } + + void setContainingCsect(MCSectionXCOFF *C) { + assert((!ContainingCsect || ContainingCsect == C) && + "Trying to set a containing csect that doesn't match the one that" + "this symbol is already mapped to."); + ContainingCsect = C; + } + + MCSectionXCOFF *getContainingCsect() const { + assert(ContainingCsect && + "Trying to get containing csect but none was set."); + return ContainingCsect; + } + +private: + Optional StorageClass; + MCSectionXCOFF *ContainingCsect = nullptr; }; } // end namespace llvm diff --git a/include/llvm/MC/MCWasmObjectWriter.h b/include/llvm/MC/MCWasmObjectWriter.h index 4adbca28f116..fbb68549b503 100644 --- a/include/llvm/MC/MCWasmObjectWriter.h +++ b/include/llvm/MC/MCWasmObjectWriter.h @@ -20,9 +20,10 @@ class raw_pwrite_stream; class MCWasmObjectTargetWriter : public MCObjectTargetWriter { const unsigned Is64Bit : 1; + const unsigned IsEmscripten : 1; protected: - explicit MCWasmObjectTargetWriter(bool Is64Bit_); + explicit MCWasmObjectTargetWriter(bool Is64Bit_, bool IsEmscripten); public: virtual ~MCWasmObjectTargetWriter(); @@ -38,6 +39,7 @@ public: /// \name Accessors /// @{ bool is64Bit() const { return Is64Bit; } + bool isEmscripten() const { return IsEmscripten; } /// @} }; diff --git a/include/llvm/MC/MCXCOFFStreamer.h b/include/llvm/MC/MCXCOFFStreamer.h index 159ae4818749..b13b0031d18e 100644 --- a/include/llvm/MC/MCXCOFFStreamer.h +++ b/include/llvm/MC/MCXCOFFStreamer.h @@ -26,6 +26,8 @@ public: uint64_t Size = 0, unsigned ByteAlignment = 0, SMLoc Loc = SMLoc()) override; void EmitInstToData(const MCInst &Inst, const MCSubtargetInfo &) override; + void EmitXCOFFLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size, + unsigned ByteAlign) override; }; } // end namespace llvm diff --git a/include/llvm/MC/StringTableBuilder.h b/include/llvm/MC/StringTableBuilder.h index c83eca4e512d..c8d4c3bbc262 100644 --- 
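A minimal sketch of tying an XCOFF symbol to its storage class and containing csect through the accessors declared above; Sym and Csect are caller-provided placeholders, and both setters assert if later called with a conflicting value:

  #include "llvm/MC/MCSectionXCOFF.h"
  #include "llvm/MC/MCSymbolXCOFF.h"
  using namespace llvm;

  // Record the storage class and owning csect on an XCOFF symbol.
  void bindToCsect(MCSymbolXCOFF &Sym, MCSectionXCOFF *Csect) {
    Sym.setStorageClass(XCOFF::C_EXT); // asserts on a conflicting redefinition
    Sym.setContainingCsect(Csect);     // likewise asserts if remapped
  }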
a/include/llvm/MC/StringTableBuilder.h +++ b/include/llvm/MC/StringTableBuilder.h @@ -22,7 +22,7 @@ class raw_ostream; /// Utility for building string tables with deduplicated suffixes. class StringTableBuilder { public: - enum Kind { ELF, WinCOFF, MachO, RAW, DWARF }; + enum Kind { ELF, WinCOFF, MachO, RAW, DWARF, XCOFF }; private: DenseMap StringIndexMap; diff --git a/include/llvm/MC/SubtargetFeature.h b/include/llvm/MC/SubtargetFeature.h index fc9565ceafad..defbc3c64720 100644 --- a/include/llvm/MC/SubtargetFeature.h +++ b/include/llvm/MC/SubtargetFeature.h @@ -18,6 +18,7 @@ #define LLVM_MC_SUBTARGETFEATURE_H #include "llvm/ADT/StringRef.h" +#include "llvm/Support/MathExtras.h" #include #include #include @@ -33,20 +34,123 @@ const unsigned MAX_SUBTARGET_WORDS = 3; const unsigned MAX_SUBTARGET_FEATURES = MAX_SUBTARGET_WORDS * 64; /// Container class for subtarget features. -/// This is convenient because std::bitset does not have a constructor -/// with an initializer list of set bits. -class FeatureBitset : public std::bitset { -public: - // Cannot inherit constructors because it's not supported by VC++.. - FeatureBitset() = default; - - FeatureBitset(const bitset& B) : bitset(B) {} +/// This is a constexpr reimplementation of a subset of std::bitset. It would be +/// nice to use std::bitset directly, but it doesn't support constant +/// initialization. +class FeatureBitset { + static_assert((MAX_SUBTARGET_FEATURES % 64) == 0, + "Should be a multiple of 64!"); + // This cannot be a std::array, operator[] is not constexpr until C++17. + uint64_t Bits[MAX_SUBTARGET_WORDS] = {}; + +protected: + constexpr FeatureBitset(const std::array &B) { + for (unsigned I = 0; I != B.size(); ++I) + Bits[I] = B[I]; + } - FeatureBitset(std::initializer_list Init) { +public: + constexpr FeatureBitset() = default; + constexpr FeatureBitset(std::initializer_list Init) { for (auto I : Init) set(I); } + FeatureBitset &set() { + std::fill(std::begin(Bits), std::end(Bits), -1ULL); + return *this; + } + + constexpr FeatureBitset &set(unsigned I) { + // GCC <6.2 crashes if this is written in a single statement. + uint64_t NewBits = Bits[I / 64] | (uint64_t(1) << (I % 64)); + Bits[I / 64] = NewBits; + return *this; + } + + constexpr FeatureBitset &reset(unsigned I) { + // GCC <6.2 crashes if this is written in a single statement. + uint64_t NewBits = Bits[I / 64] & ~(uint64_t(1) << (I % 64)); + Bits[I / 64] = NewBits; + return *this; + } + + constexpr FeatureBitset &flip(unsigned I) { + // GCC <6.2 crashes if this is written in a single statement. 
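set(), reset() and flip() in this hunk all rely on the same word-and-bit decomposition. A standalone restatement of that arithmetic, in plain C++ and independent of FeatureBitset; the helper names are illustrative:

  #include <cstdint>

  // Feature I lives in 64-bit word I / 64, at bit position I % 64.
  constexpr unsigned featureWord(unsigned I) { return I / 64; }
  constexpr uint64_t featureMask(unsigned I) { return uint64_t(1) << (I % 64); }

  static_assert(featureWord(70) == 1 && featureMask(70) == (uint64_t(1) << 6),
                "feature 70 maps to bit 6 of word 1");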
+ uint64_t NewBits = Bits[I / 64] ^ (uint64_t(1) << (I % 64)); + Bits[I / 64] = NewBits; + return *this; + } + + constexpr bool operator[](unsigned I) const { + uint64_t Mask = uint64_t(1) << (I % 64); + return (Bits[I / 64] & Mask) != 0; + } + + constexpr bool test(unsigned I) const { return (*this)[I]; } + + constexpr size_t size() const { return MAX_SUBTARGET_FEATURES; } + + bool any() const { + return llvm::any_of(Bits, [](uint64_t I) { return I != 0; }); + } + bool none() const { return !any(); } + size_t count() const { + size_t Count = 0; + for (auto B : Bits) + Count += countPopulation(B); + return Count; + } + + constexpr FeatureBitset &operator^=(const FeatureBitset &RHS) { + for (unsigned I = 0, E = array_lengthof(Bits); I != E; ++I) { + Bits[I] ^= RHS.Bits[I]; + } + return *this; + } + constexpr FeatureBitset operator^(const FeatureBitset &RHS) const { + FeatureBitset Result = *this; + Result ^= RHS; + return Result; + } + + constexpr FeatureBitset &operator&=(const FeatureBitset &RHS) { + for (unsigned I = 0, E = array_lengthof(Bits); I != E; ++I) { + Bits[I] &= RHS.Bits[I]; + } + return *this; + } + constexpr FeatureBitset operator&(const FeatureBitset &RHS) const { + FeatureBitset Result = *this; + Result &= RHS; + return Result; + } + + constexpr FeatureBitset &operator|=(const FeatureBitset &RHS) { + for (unsigned I = 0, E = array_lengthof(Bits); I != E; ++I) { + Bits[I] |= RHS.Bits[I]; + } + return *this; + } + constexpr FeatureBitset operator|(const FeatureBitset &RHS) const { + FeatureBitset Result = *this; + Result |= RHS; + return Result; + } + + constexpr FeatureBitset operator~() const { + FeatureBitset Result = *this; + for (auto &B : Result.Bits) + B = ~B; + return Result; + } + + bool operator==(const FeatureBitset &RHS) const { + return std::equal(std::begin(Bits), std::end(Bits), std::begin(RHS.Bits)); + } + + bool operator!=(const FeatureBitset &RHS) const { return !(*this == RHS); } + bool operator < (const FeatureBitset &Other) const { for (unsigned I = 0, E = size(); I != E; ++I) { bool LHS = test(I), RHS = Other.test(I); @@ -58,23 +162,12 @@ public: }; /// Class used to store the subtarget bits in the tables created by tablegen. -/// The std::initializer_list constructor of FeatureBitset can't be done at -/// compile time and requires a static constructor to run at startup. -class FeatureBitArray { - std::array Bits; - +class FeatureBitArray : public FeatureBitset { public: constexpr FeatureBitArray(const std::array &B) - : Bits(B) {} - - FeatureBitset getAsBitset() const { - FeatureBitset Result; - - for (unsigned i = 0, e = Bits.size(); i != e; ++i) - Result |= FeatureBitset(Bits[i]) << (64 * i); + : FeatureBitset(B) {} - return Result; - } + const FeatureBitset &getAsBitset() const { return *this; } }; //===----------------------------------------------------------------------===// diff --git a/include/llvm/MCA/CodeEmitter.h b/include/llvm/MCA/CodeEmitter.h new file mode 100644 index 000000000000..c8d222bd8c2f --- /dev/null +++ b/include/llvm/MCA/CodeEmitter.h @@ -0,0 +1,72 @@ +//===--------------------- CodeEmitter.h ------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// \file +/// +/// A utility class used to compute instruction encodings. 
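A small usage sketch of the constexpr construction and bitwise operators shown in this hunk, assuming a C++14 (or newer) build so the loops in the constexpr constructors are legal; the feature indices 3 and 70 are arbitrary examples:

  #include "llvm/MC/SubtargetFeature.h"
  using namespace llvm;

  // Two single-feature sets combined at compile time.
  constexpr FeatureBitset A = {3};
  constexpr FeatureBitset B = {70};
  constexpr FeatureBitset Both = A | B;
  static_assert(Both[3] && Both[70] && !Both[5],
                "operator| keeps the bits of both operands");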
It buffers encodings +/// for later usage. It exposes a simple API to compute and get the encodings as +/// StringRef. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_MCA_CODEEMITTER_H +#define LLVM_MCA_CODEEMITTER_H + +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/MC/MCAsmBackend.h" +#include "llvm/MC/MCCodeEmitter.h" +#include "llvm/MC/MCFixup.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/MCA/Instruction.h" +#include "llvm/MCA/Support.h" +#include "llvm/Support/raw_ostream.h" + +#include + +namespace llvm { +namespace mca { + +/// A utility class used to compute instruction encodings for a code region. +/// +/// It provides a simple API to compute and return instruction encodings as +/// strings. Encodings are cached internally for later usage. +class CodeEmitter { + const MCSubtargetInfo &STI; + const MCAsmBackend &MAB; + const MCCodeEmitter &MCE; + + SmallString<256> Code; + raw_svector_ostream VecOS; + ArrayRef Sequence; + + // An EncodingInfo pair stores information. Base (i.e. first) + // is an index to the `Code`. Length (i.e. second) is the encoding size. + using EncodingInfo = std::pair; + + // A cache of encodings. + SmallVector Encodings; + + EncodingInfo getOrCreateEncodingInfo(unsigned MCID); + +public: + CodeEmitter(const MCSubtargetInfo &ST, const MCAsmBackend &AB, + const MCCodeEmitter &CE, ArrayRef S) + : STI(ST), MAB(AB), MCE(CE), VecOS(Code), Sequence(S), + Encodings(S.size()) {} + + StringRef getEncoding(unsigned MCID) { + EncodingInfo EI = getOrCreateEncodingInfo(MCID); + return StringRef(&Code[EI.first], EI.second); + } +}; + +} // namespace mca +} // namespace llvm + +#endif // LLVM_MCA_CODEEMITTER_H diff --git a/include/llvm/MCA/Context.h b/include/llvm/MCA/Context.h index 503d780d4947..af3cb8e1e837 100644 --- a/include/llvm/MCA/Context.h +++ b/include/llvm/MCA/Context.h @@ -20,7 +20,6 @@ #include "llvm/MC/MCRegisterInfo.h" #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/MCA/HardwareUnits/HardwareUnit.h" -#include "llvm/MCA/InstrBuilder.h" #include "llvm/MCA/Pipeline.h" #include "llvm/MCA/SourceMgr.h" #include @@ -58,6 +57,9 @@ public: Context(const Context &C) = delete; Context &operator=(const Context &C) = delete; + const MCRegisterInfo &getMCRegisterInfo() const { return MRI; } + const MCSubtargetInfo &getMCSubtargetInfo() const { return STI; } + void addHardwareUnit(std::unique_ptr H) { Hardware.push_back(std::move(H)); } @@ -65,7 +67,6 @@ public: /// Construct a basic pipeline for simulating an out-of-order pipeline. /// This pipeline consists of Fetch, Dispatch, Execute, and Retire stages. 
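A sketch of driving mca::CodeEmitter with the constructor and getEncoding() shown above. STI, MAB, MCE and Insts are placeholders owned by the caller, and the sketch assumes the lookup key passed to getEncoding() is the instruction's index in the sequence, consistent with the cache being sized to the sequence:

  #include "llvm/ADT/ArrayRef.h"
  #include "llvm/MCA/CodeEmitter.h"
  #include "llvm/Support/raw_ostream.h"
  using namespace llvm;

  // Print the encoded size of every instruction in a region; encodings are
  // computed lazily and cached inside the CodeEmitter.
  void printEncodingSizes(const MCSubtargetInfo &STI, const MCAsmBackend &MAB,
                          const MCCodeEmitter &MCE, ArrayRef<MCInst> Insts) {
    mca::CodeEmitter CE(STI, MAB, MCE, Insts);
    for (unsigned I = 0, E = Insts.size(); I != E; ++I)
      outs() << "inst " << I << ": " << CE.getEncoding(I).size() << " bytes\n";
  }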
std::unique_ptr createDefaultPipeline(const PipelineOptions &Opts, - InstrBuilder &IB, SourceMgr &SrcMgr); }; diff --git a/include/llvm/MCA/HardwareUnits/LSUnit.h b/include/llvm/MCA/HardwareUnits/LSUnit.h index ae9a49c64855..34903794db4a 100644 --- a/include/llvm/MCA/HardwareUnits/LSUnit.h +++ b/include/llvm/MCA/HardwareUnits/LSUnit.h @@ -209,8 +209,10 @@ public: unsigned getUsedLQEntries() const { return UsedLQEntries; } unsigned getUsedSQEntries() const { return UsedSQEntries; } - unsigned assignLQSlot() { return UsedLQEntries++; } - unsigned assignSQSlot() { return UsedSQEntries++; } + void acquireLQSlot() { ++UsedLQEntries; } + void acquireSQSlot() { ++UsedSQEntries; } + void releaseLQSlot() { --UsedLQEntries; } + void releaseSQSlot() { --UsedSQEntries; } bool assumeNoAlias() const { return NoAlias; } @@ -285,13 +287,18 @@ public: unsigned createMemoryGroup() { Groups.insert( - std::make_pair(NextGroupID, llvm::make_unique())); + std::make_pair(NextGroupID, std::make_unique())); return NextGroupID++; } - // Instruction executed event handlers. virtual void onInstructionExecuted(const InstRef &IR); + // Loads are tracked by the LDQ (load queue) from dispatch until completion. + // Stores are tracked by the STQ (store queue) from dispatch until commitment. + // By default we conservatively assume that the LDQ receives a load at + // dispatch. Loads leave the LDQ at retirement stage. + virtual void onInstructionRetired(const InstRef &IR); + virtual void onInstructionIssued(const InstRef &IR) { unsigned GroupID = IR.getInstruction()->getLSUTokenID(); Groups[GroupID]->onInstructionIssued(IR); @@ -436,9 +443,6 @@ public: /// 6. A store has to wait until an older store barrier is fully executed. unsigned dispatch(const InstRef &IR) override; - // FIXME: For simplicity, we optimistically assume a similar behavior for - // store instructions. In practice, store operations don't tend to leave the - // store queue until they reach the 'Retired' stage (See PR39830). void onInstructionExecuted(const InstRef &IR) override; }; diff --git a/include/llvm/MCA/HardwareUnits/RegisterFile.h b/include/llvm/MCA/HardwareUnits/RegisterFile.h index 36506327bd29..cd7718d98744 100644 --- a/include/llvm/MCA/HardwareUnits/RegisterFile.h +++ b/include/llvm/MCA/HardwareUnits/RegisterFile.h @@ -220,7 +220,7 @@ public: // // Current implementation can simulate up to 32 register files (including the // special register file at index #0). - unsigned isAvailable(ArrayRef Regs) const; + unsigned isAvailable(ArrayRef Regs) const; // Returns the number of PRFs implemented by this processor. unsigned getNumRegisterFiles() const { return RegisterFiles.size(); } diff --git a/include/llvm/MCA/HardwareUnits/ResourceManager.h b/include/llvm/MCA/HardwareUnits/ResourceManager.h index 2f91185516fb..917af3750044 100644 --- a/include/llvm/MCA/HardwareUnits/ResourceManager.h +++ b/include/llvm/MCA/HardwareUnits/ResourceManager.h @@ -33,8 +33,7 @@ namespace mca { /// with a buffer size of -1 is always available if it is not reserved. /// /// Values of type ResourceStateEvent are returned by method -/// ResourceState::isBufferAvailable(), which is used to query the internal -/// state of a resource. +/// ResourceManager::canBeDispatched() /// /// The naming convention for resource state events is: /// * Event names start with prefix RS_ @@ -263,16 +262,26 @@ public: /// Returns RS_BUFFER_UNAVAILABLE if there are no available slots. ResourceStateEvent isBufferAvailable() const; - /// Reserve a slot in the buffer. 
- void reserveBuffer() { - if (AvailableSlots) - AvailableSlots--; + /// Reserve a buffer slot. + /// + /// Returns true if the buffer is not full. + /// It always returns true if BufferSize is set to zero. + bool reserveBuffer() { + if (BufferSize <= 0) + return true; + + --AvailableSlots; + assert(AvailableSlots <= static_cast(BufferSize)); + return AvailableSlots; } - /// Release a slot in the buffer. + /// Releases a slot in the buffer. void releaseBuffer() { - if (BufferSize > 0) - AvailableSlots++; + // Ignore dispatch hazards or invalid buffer sizes. + if (BufferSize <= 0) + return; + + ++AvailableSlots; assert(AvailableSlots <= static_cast(BufferSize)); } @@ -351,9 +360,16 @@ class ResourceManager { // Set of processor resource units that are available during this cycle. uint64_t AvailableProcResUnits; - // Set of processor resource groups that are currently reserved. + // Set of processor resources that are currently reserved. uint64_t ReservedResourceGroups; + // Set of unavailable scheduler buffer resources. This is used internally to + // speedup `canBeDispatched()` queries. + uint64_t AvailableBuffers; + + // Set of dispatch hazard buffer resources that are currently unavailable. + uint64_t ReservedBuffers; + // Returns the actual resource unit that will be used. ResourceRef selectPipe(uint64_t ResourceID); @@ -382,17 +398,20 @@ public: // Returns RS_BUFFER_AVAILABLE if buffered resources are not reserved, and if // there are enough available slots in the buffers. - ResourceStateEvent canBeDispatched(ArrayRef Buffers) const; + ResourceStateEvent canBeDispatched(uint64_t ConsumedBuffers) const; // Return the processor resource identifier associated to this Mask. unsigned resolveResourceMask(uint64_t Mask) const; - // Consume a slot in every buffered resource from array 'Buffers'. Resource - // units that are dispatch hazards (i.e. BufferSize=0) are marked as reserved. - void reserveBuffers(ArrayRef Buffers); + // Acquires a slot from every buffered resource in mask `ConsumedBuffers`. + // Units that are dispatch hazards (i.e. BufferSize=0) are marked as reserved. + void reserveBuffers(uint64_t ConsumedBuffers); - // Release buffer entries previously allocated by method reserveBuffers. - void releaseBuffers(ArrayRef Buffers); + // Releases a slot from every buffered resource in mask `ConsumedBuffers`. + // ConsumedBuffers is a bitmask of previously acquired buffers (using method + // `reserveBuffers`). Units that are dispatch hazards (i.e. BufferSize=0) are + // not automatically unreserved by this method. + void releaseBuffers(uint64_t ConsumedBuffers); // Reserve a processor resource. A reserved resource is not available for // instruction issue until it is released. diff --git a/include/llvm/MCA/HardwareUnits/RetireControlUnit.h b/include/llvm/MCA/HardwareUnits/RetireControlUnit.h index 06290141739e..acbd4543bd4a 100644 --- a/include/llvm/MCA/HardwareUnits/RetireControlUnit.h +++ b/include/llvm/MCA/HardwareUnits/RetireControlUnit.h @@ -57,34 +57,43 @@ struct RetireControlUnit : public HardwareUnit { private: unsigned NextAvailableSlotIdx; unsigned CurrentInstructionSlotIdx; - unsigned AvailableSlots; + unsigned NumROBEntries; + unsigned AvailableEntries; unsigned MaxRetirePerCycle; // 0 means no limit. 
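reserveBuffers() and releaseBuffers() now take a bitmask of consumed buffer resources rather than a list. A generic, self-contained sketch of the loop shape a consumer of such masks would use (not the ResourceManager implementation itself; the names are illustrative):

  #include <cstdint>

  // Visit each set bit of a buffer mask, lowest first. Process is a
  // stand-in callback for the per-resource work.
  template <typename Callback>
  void forEachSetBit(uint64_t Mask, Callback Process) {
    while (Mask) {
      uint64_t Bit = Mask & (~Mask + 1); // isolate the lowest set bit
      Process(Bit);
      Mask ^= Bit;                       // clear it and continue
    }
  }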
std::vector Queue; -public: - RetireControlUnit(const MCSchedModel &SM); - - bool isEmpty() const { return AvailableSlots == Queue.size(); } - bool isAvailable(unsigned Quantity = 1) const { + unsigned normalizeQuantity(unsigned Quantity) const { // Some instructions may declare a number of uOps which exceeds the size // of the reorder buffer. To avoid problems, cap the amount of slots to // the size of the reorder buffer. - Quantity = std::min(Quantity, static_cast(Queue.size())); + Quantity = std::min(Quantity, NumROBEntries); // Further normalize the number of micro opcodes for instructions that // declare zero opcodes. This should match the behavior of method // reserveSlot(). - Quantity = std::max(Quantity, 1U); - return AvailableSlots >= Quantity; + return std::max(Quantity, 1U); + } + + unsigned computeNextSlotIdx() const; + +public: + RetireControlUnit(const MCSchedModel &SM); + + bool isEmpty() const { return AvailableEntries == NumROBEntries; } + + bool isAvailable(unsigned Quantity = 1) const { + return AvailableEntries >= normalizeQuantity(Quantity); } unsigned getMaxRetirePerCycle() const { return MaxRetirePerCycle; } - // Reserves a number of slots, and returns a new token. - unsigned reserveSlot(const InstRef &IS, unsigned NumMicroOps); + // Reserves a number of slots, and returns a new token reference. + unsigned dispatch(const InstRef &IS); // Return the current token from the RCU's circular token queue. - const RUToken &peekCurrentToken() const; + const RUToken &getCurrentToken() const; + + const RUToken &peekNextToken() const; // Advance the pointer to the next token in the circular token queue. void consumeCurrentToken(); diff --git a/include/llvm/MCA/HardwareUnits/Scheduler.h b/include/llvm/MCA/HardwareUnits/Scheduler.h index 27beb842dfd2..6c196757e571 100644 --- a/include/llvm/MCA/HardwareUnits/Scheduler.h +++ b/include/llvm/MCA/HardwareUnits/Scheduler.h @@ -68,7 +68,7 @@ public: /// instructions from the dispatch stage, until the write-back stage. /// class Scheduler : public HardwareUnit { - LSUnit &LSU; + LSUnitBase &LSU; // Instruction selection strategy for this Scheduler. std::unique_ptr Strategy; @@ -154,15 +154,15 @@ class Scheduler : public HardwareUnit { bool promoteToPendingSet(SmallVectorImpl &Pending); public: - Scheduler(const MCSchedModel &Model, LSUnit &Lsu) + Scheduler(const MCSchedModel &Model, LSUnitBase &Lsu) : Scheduler(Model, Lsu, nullptr) {} - Scheduler(const MCSchedModel &Model, LSUnit &Lsu, + Scheduler(const MCSchedModel &Model, LSUnitBase &Lsu, std::unique_ptr SelectStrategy) - : Scheduler(make_unique(Model), Lsu, + : Scheduler(std::make_unique(Model), Lsu, std::move(SelectStrategy)) {} - Scheduler(std::unique_ptr RM, LSUnit &Lsu, + Scheduler(std::unique_ptr RM, LSUnitBase &Lsu, std::unique_ptr SelectStrategy) : LSU(Lsu), Resources(std::move(RM)), BusyResourceUnits(0), NumDispatchedToThePendingSet(0), HadTokenStall(false) { @@ -228,6 +228,9 @@ public: SmallVectorImpl &Ready); /// Convert a resource mask into a valid llvm processor resource identifier. + /// + /// Only the most significant bit of the Mask is used by this method to + /// identify the processor resource. 
unsigned getResourceID(uint64_t Mask) const { return Resources->resolveResourceMask(Mask); } diff --git a/include/llvm/MCA/Instruction.h b/include/llvm/MCA/Instruction.h index d4d3f22797f7..c97cb463d0f5 100644 --- a/include/llvm/MCA/Instruction.h +++ b/include/llvm/MCA/Instruction.h @@ -18,6 +18,7 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/MC/MCRegister.h" // definition of MCPhysReg. #include "llvm/Support/MathExtras.h" #ifndef NDEBUG @@ -42,7 +43,7 @@ struct WriteDescriptor { unsigned Latency; // This field is set to a value different than zero only if this // is an implicit definition. - unsigned RegisterID; + MCPhysReg RegisterID; // Instruction itineraries would set this field to the SchedClass ID. // Otherwise, it defaults to the WriteResourceID from the MCWriteLatencyEntry // element associated to this write. @@ -70,7 +71,7 @@ struct ReadDescriptor { // uses always come first in the sequence of uses. unsigned UseIndex; // This field is only set if this is an implicit read. - unsigned RegisterID; + MCPhysReg RegisterID; // Scheduling Class Index. It is used to query the scheduling model for the // MCSchedClassDesc object. unsigned SchedClassID; @@ -85,7 +86,7 @@ class ReadState; /// Field RegID is set to the invalid register for memory dependencies. struct CriticalDependency { unsigned IID; - unsigned RegID; + MCPhysReg RegID; unsigned Cycles; }; @@ -106,7 +107,7 @@ class WriteState { // to speedup queries on the register file. // For implicit writes, this field always matches the value of // field RegisterID from WD. - unsigned RegisterID; + MCPhysReg RegisterID; // Physical register file that serves register RegisterID. unsigned PRFID; @@ -146,7 +147,7 @@ class WriteState { SmallVector, 4> Users; public: - WriteState(const WriteDescriptor &Desc, unsigned RegID, + WriteState(const WriteDescriptor &Desc, MCPhysReg RegID, bool clearsSuperRegs = false, bool writesZero = false) : WD(&Desc), CyclesLeft(UNKNOWN_CYCLES), RegisterID(RegID), PRFID(0), ClearsSuperRegs(clearsSuperRegs), WritesZero(writesZero), @@ -158,7 +159,7 @@ public: int getCyclesLeft() const { return CyclesLeft; } unsigned getWriteResourceID() const { return WD->SClassOrWriteResourceID; } - unsigned getRegisterID() const { return RegisterID; } + MCPhysReg getRegisterID() const { return RegisterID; } unsigned getRegisterFileID() const { return PRFID; } unsigned getLatency() const { return WD->Latency; } unsigned getDependentWriteCyclesLeft() const { @@ -200,7 +201,7 @@ public: } void setDependentWrite(const WriteState *Other) { DependentWrite = Other; } - void writeStartEvent(unsigned IID, unsigned RegID, unsigned Cycles); + void writeStartEvent(unsigned IID, MCPhysReg RegID, unsigned Cycles); void setWriteZero() { WritesZero = true; } void setEliminated() { assert(Users.empty() && "Write is in an inconsistent state."); @@ -226,7 +227,7 @@ public: class ReadState { const ReadDescriptor *RD; // Physical register identified associated to this read. - unsigned RegisterID; + MCPhysReg RegisterID; // Physical register file that serves register RegisterID. unsigned PRFID; // Number of writes that contribute to the definition of RegisterID. 
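The comment above notes that only the most significant bit of a resource mask identifies the processor resource. An isolated helper showing that extraction; it is not the ResourceManager implementation, just an illustration of the convention:

  #include "llvm/Support/MathExtras.h"
  #include <cstdint>

  // Return the highest set bit of Mask, i.e. the bit that names the
  // processor resource; 0 for an empty mask.
  inline uint64_t highestResourceBit(uint64_t Mask) {
    return Mask ? (uint64_t(1) << llvm::Log2_64(Mask)) : 0;
  }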
@@ -253,14 +254,14 @@ class ReadState { bool IndependentFromDef; public: - ReadState(const ReadDescriptor &Desc, unsigned RegID) + ReadState(const ReadDescriptor &Desc, MCPhysReg RegID) : RD(&Desc), RegisterID(RegID), PRFID(0), DependentWrites(0), CyclesLeft(UNKNOWN_CYCLES), TotalCycles(0), CRD(), IsReady(true), IsZero(false), IndependentFromDef(false) {} const ReadDescriptor &getDescriptor() const { return *RD; } unsigned getSchedClass() const { return RD->SchedClassID; } - unsigned getRegisterID() const { return RegisterID; } + MCPhysReg getRegisterID() const { return RegisterID; } unsigned getRegisterFileID() const { return PRFID; } const CriticalDependency &getCriticalRegDep() const { return CRD; } @@ -272,7 +273,7 @@ public: void setIndependentFromDef() { IndependentFromDef = true; } void cycleEvent(); - void writeStartEvent(unsigned IID, unsigned RegID, unsigned Cycles); + void writeStartEvent(unsigned IID, MCPhysReg RegID, unsigned Cycles); void setDependentWrites(unsigned Writes) { DependentWrites = Writes; IsReady = !Writes; @@ -352,11 +353,14 @@ struct InstrDesc { // reports the number of "consumed cycles". SmallVector, 4> Resources; - // A list of buffered resources consumed by this instruction. - SmallVector Buffers; + // A bitmask of used hardware buffers. + uint64_t UsedBuffers; - unsigned UsedProcResUnits; - unsigned UsedProcResGroups; + // A bitmask of used processor resource units. + uint64_t UsedProcResUnits; + + // A bitmask of used processor resource groups. + uint64_t UsedProcResGroups; unsigned MaxLatency; // Number of MicroOps for this instruction. @@ -414,6 +418,7 @@ public: const InstrDesc &getDesc() const { return Desc; } unsigned getLatency() const { return Desc.MaxLatency; } + unsigned getNumMicroOps() const { return Desc.NumMicroOps; } bool hasDependentUsers() const { return any_of(Defs, @@ -463,6 +468,12 @@ class Instruction : public InstructionBase { // operation. unsigned LSUTokenID; + // A resource mask which identifies buffered resources consumed by this + // instruction at dispatch stage. In the absence of macro-fusion, this value + // should always match the value of field `UsedBuffers` from the instruction + // descriptor (see field InstrBase::Desc). + uint64_t UsedBuffers; + // Critical register dependency. 
CriticalDependency CriticalRegDep; @@ -480,12 +491,18 @@ class Instruction : public InstructionBase { public: Instruction(const InstrDesc &D) : InstructionBase(D), Stage(IS_INVALID), CyclesLeft(UNKNOWN_CYCLES), - RCUTokenID(0), LSUTokenID(0), CriticalRegDep(), CriticalMemDep(), - CriticalResourceMask(0), IsEliminated(false) {} + RCUTokenID(0), LSUTokenID(0), UsedBuffers(D.UsedBuffers), + CriticalRegDep(), CriticalMemDep(), CriticalResourceMask(0), + IsEliminated(false) {} unsigned getRCUTokenID() const { return RCUTokenID; } unsigned getLSUTokenID() const { return LSUTokenID; } void setLSUTokenID(unsigned LSUTok) { LSUTokenID = LSUTok; } + + uint64_t getUsedBuffers() const { return UsedBuffers; } + void setUsedBuffers(uint64_t Mask) { UsedBuffers = Mask; } + void clearUsedBuffers() { UsedBuffers = 0ULL; } + int getCyclesLeft() const { return CyclesLeft; } // Transition to the dispatch stage, and assign a RCUToken to this diff --git a/include/llvm/MCA/SourceMgr.h b/include/llvm/MCA/SourceMgr.h index dbe31db1b1dd..e844171bdcab 100644 --- a/include/llvm/MCA/SourceMgr.h +++ b/include/llvm/MCA/SourceMgr.h @@ -16,12 +16,13 @@ #define LLVM_MCA_SOURCEMGR_H #include "llvm/ADT/ArrayRef.h" +#include "llvm/MCA/Instruction.h" namespace llvm { namespace mca { -class Instruction; - +// MSVC >= 19.15, < 19.20 need to see the definition of class Instruction to +// prevent compiler error C2139 about intrinsic type trait '__is_assignable'. typedef std::pair SourceRef; class SourceMgr { diff --git a/include/llvm/MCA/Stages/RetireStage.h b/include/llvm/MCA/Stages/RetireStage.h index 08c216ac7bf4..f4713688d25f 100644 --- a/include/llvm/MCA/Stages/RetireStage.h +++ b/include/llvm/MCA/Stages/RetireStage.h @@ -16,6 +16,7 @@ #ifndef LLVM_MCA_RETIRE_STAGE_H #define LLVM_MCA_RETIRE_STAGE_H +#include "llvm/MCA/HardwareUnits/LSUnit.h" #include "llvm/MCA/HardwareUnits/RegisterFile.h" #include "llvm/MCA/HardwareUnits/RetireControlUnit.h" #include "llvm/MCA/Stages/Stage.h" @@ -27,13 +28,14 @@ class RetireStage final : public Stage { // Owner will go away when we move listeners/eventing to the stages. RetireControlUnit &RCU; RegisterFile &PRF; + LSUnitBase &LSU; RetireStage(const RetireStage &Other) = delete; RetireStage &operator=(const RetireStage &Other) = delete; public: - RetireStage(RetireControlUnit &R, RegisterFile &F) - : Stage(), RCU(R), PRF(F) {} + RetireStage(RetireControlUnit &R, RegisterFile &F, LSUnitBase &LS) + : Stage(), RCU(R), PRF(F), LSU(LS) {} bool hasWorkToComplete() const override { return !RCU.isEmpty(); } Error cycleStart() override; diff --git a/include/llvm/Object/Archive.h b/include/llvm/Object/Archive.h index c40278a4f923..c3f36bdd9d1a 100644 --- a/include/llvm/Object/Archive.h +++ b/include/llvm/Object/Archive.h @@ -48,8 +48,7 @@ public: /// Get the name looking up long names. Expected getName(uint64_t Size) const; - /// Members are not larger than 4GB. 
- Expected getSize() const; + Expected getSize() const; Expected getAccessMode() const; Expected> getLastModified() const; @@ -136,6 +135,7 @@ public: Expected getBuffer() const; uint64_t getChildOffset() const; + uint64_t getDataOffset() const { return getChildOffset() + StartOfFile; } Expected getMemoryBufferRef() const; @@ -221,6 +221,9 @@ public: Archive(MemoryBufferRef Source, Error &Err); static Expected> create(MemoryBufferRef Source); + /// Size field is 10 decimal digits long + static const uint64_t MaxMemberSize = 9999999999; + enum Kind { K_GNU, K_GNU64, diff --git a/include/llvm/Object/Binary.h b/include/llvm/Object/Binary.h index 3c3e977baff4..aa5e718f5e9b 100644 --- a/include/llvm/Object/Binary.h +++ b/include/llvm/Object/Binary.h @@ -42,7 +42,9 @@ protected: ID_Archive, ID_MachOUniversalBinary, ID_COFFImportFile, - ID_IR, // LLVM IR + ID_IR, // LLVM IR + ID_TapiUniversal, // Text-based Dynamic Library Stub file. + ID_TapiFile, // Text-based Dynamic Library Stub file. ID_Minidump, @@ -101,16 +103,18 @@ public: return TypeID > ID_StartObjects && TypeID < ID_EndObjects; } - bool isSymbolic() const { return isIR() || isObject() || isCOFFImportFile(); } - - bool isArchive() const { - return TypeID == ID_Archive; + bool isSymbolic() const { + return isIR() || isObject() || isCOFFImportFile() || isTapiFile(); } + bool isArchive() const { return TypeID == ID_Archive; } + bool isMachOUniversalBinary() const { return TypeID == ID_MachOUniversalBinary; } + bool isTapiUniversal() const { return TypeID == ID_TapiUniversal; } + bool isELF() const { return TypeID >= ID_ELF32L && TypeID <= ID_ELF64B; } @@ -137,6 +141,8 @@ public: bool isMinidump() const { return TypeID == ID_Minidump; } + bool isTapiFile() const { return TypeID == ID_TapiFile; } + bool isLittleEndian() const { return !(TypeID == ID_ELF32B || TypeID == ID_ELF64B || TypeID == ID_MachO32B || TypeID == ID_MachO64B); diff --git a/include/llvm/Object/COFF.h b/include/llvm/Object/COFF.h index c53cbc46c747..b91ee5887fec 100644 --- a/include/llvm/Object/COFF.h +++ b/include/llvm/Object/COFF.h @@ -314,7 +314,10 @@ public: return CS16 ? CS16->Name.Offset : CS32->Name.Offset; } - uint32_t getValue() const { return CS16 ? CS16->Value : CS32->Value; } + uint32_t getValue() const { + assert(isSet() && "COFFSymbolRef points to nothing!"); + return CS16 ? 
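Child::getSize() keeps its Expected return (the wrapped integer type is elided in this rendering; the sketch assumes it is now 64-bit, matching the dropped 4GB comment just above). A small error-propagating walk over an archive's members; totalMemberSize is an illustrative name:

  #include "llvm/Object/Archive.h"
  using namespace llvm;
  using namespace llvm::object;

  // Sum the sizes of all archive members, propagating any parse error.
  Expected<uint64_t> totalMemberSize(const Archive &A) {
    uint64_t Total = 0;
    Error Err = Error::success();
    for (const Archive::Child &C : A.children(Err)) {
      Expected<uint64_t> Size = C.getSize();
      if (!Size)
        return Size.takeError();
      Total += *Size;
    }
    if (Err)
      return std::move(Err);
    return Total;
  }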
CS16->Value : CS32->Value; + } int32_t getSectionNumber() const { assert(isSet() && "COFFSymbolRef points to nothing!"); @@ -969,11 +972,14 @@ public: return nullptr; return reinterpret_cast(base()); } - std::error_code getCOFFHeader(const coff_file_header *&Res) const; - std::error_code - getCOFFBigObjHeader(const coff_bigobj_file_header *&Res) const; - std::error_code getPE32Header(const pe32_header *&Res) const; - std::error_code getPE32PlusHeader(const pe32plus_header *&Res) const; + + const coff_file_header *getCOFFHeader() const { return COFFHeader; } + const coff_bigobj_file_header *getCOFFBigObjHeader() const { + return COFFBigObjHeader; + } + const pe32_header *getPE32Header() const { return PE32Header; } + const pe32plus_header *getPE32PlusHeader() const { return PE32PlusHeader; } + std::error_code getDataDirectory(uint32_t index, const data_directory *&Res) const; std::error_code getSection(int32_t index, const coff_section *&Res) const; @@ -1201,16 +1207,34 @@ public: ResourceSectionRef() = default; explicit ResourceSectionRef(StringRef Ref) : BBS(Ref, support::little) {} + Error load(const COFFObjectFile *O); + Error load(const COFFObjectFile *O, const SectionRef &S); + Expected> getEntryNameString(const coff_resource_dir_entry &Entry); Expected getEntrySubDir(const coff_resource_dir_entry &Entry); + Expected + getEntryData(const coff_resource_dir_entry &Entry); Expected getBaseTable(); + Expected + getTableEntry(const coff_resource_dir_table &Table, uint32_t Index); + + Expected getContents(const coff_resource_data_entry &Entry); private: BinaryByteStream BBS; + SectionRef Section; + const COFFObjectFile *Obj; + + std::vector Relocs; + Expected getTableAtOffset(uint32_t Offset); + Expected + getTableEntryAtOffset(uint32_t Offset); + Expected + getDataEntryAtOffset(uint32_t Offset); Expected> getDirStringAtOffset(uint32_t Offset); }; diff --git a/include/llvm/Object/ELF.h b/include/llvm/Object/ELF.h index cf8e4529bad9..28b00c8413de 100644 --- a/include/llvm/Object/ELF.h +++ b/include/llvm/Object/ELF.h @@ -64,6 +64,10 @@ std::string getSecIndexForError(const ELFFile *Obj, return "[unknown index]"; } +static inline Error defaultWarningHandler(const Twine &Msg) { + return createError(Msg); +} + template class ELFFile { public: @@ -95,6 +99,13 @@ public: using Elf_Relr_Range = typename ELFT::RelrRange; using Elf_Phdr_Range = typename ELFT::PhdrRange; + // This is a callback that can be passed to a number of functions. + // It can be used to ignore non-critical errors (warnings), which is + // useful for dumpers, like llvm-readobj. + // It accepts a warning message string and returns a success + // when the warning should be ignored or an error otherwise. 
+ using WarningHandler = llvm::function_ref; + const uint8_t *base() const { return Buf.bytes_begin(); } size_t getBufSize() const { return Buf.size(); } @@ -114,7 +125,9 @@ public: template Expected getEntry(const Elf_Shdr *Section, uint32_t Entry) const; - Expected getStringTable(const Elf_Shdr *Section) const; + Expected + getStringTable(const Elf_Shdr *Section, + WarningHandler WarnHandler = &defaultWarningHandler) const; Expected getStringTableForSymtab(const Elf_Shdr &Section) const; Expected getStringTableForSymtab(const Elf_Shdr &Section, Elf_Shdr_Range Sections) const; @@ -137,15 +150,16 @@ public: static Expected create(StringRef Object); + bool isLE() const { + return getHeader()->getDataEncoding() == ELF::ELFDATA2LSB; + } + bool isMipsELF64() const { return getHeader()->e_machine == ELF::EM_MIPS && getHeader()->getFileClass() == ELF::ELFCLASS64; } - bool isMips64EL() const { - return isMipsELF64() && - getHeader()->getDataEncoding() == ELF::ELFDATA2LSB; - } + bool isMips64EL() const { return isMipsELF64() && isLE(); } Expected sections() const; @@ -261,7 +275,9 @@ public: return make_range(notes_begin(Shdr, Err), notes_end()); } - Expected getSectionStringTable(Elf_Shdr_Range Sections) const; + Expected getSectionStringTable( + Elf_Shdr_Range Sections, + WarningHandler WarnHandler = &defaultWarningHandler) const; Expected getSectionIndex(const Elf_Sym *Sym, Elf_Sym_Range Syms, ArrayRef ShndxTable) const; Expected getSection(const Elf_Sym *Sym, @@ -271,12 +287,13 @@ public: Elf_Sym_Range Symtab, ArrayRef ShndxTable) const; Expected getSection(uint32_t Index) const; - Expected getSection(const StringRef SectionName) const; Expected getSymbol(const Elf_Shdr *Sec, uint32_t Index) const; - Expected getSectionName(const Elf_Shdr *Section) const; + Expected + getSectionName(const Elf_Shdr *Section, + WarningHandler WarnHandler = &defaultWarningHandler) const; Expected getSectionName(const Elf_Shdr *Section, StringRef DotShstrtab) const; template @@ -459,18 +476,18 @@ ELFFile::getRelocationSymbol(const Elf_Rel *Rel, template Expected -ELFFile::getSectionStringTable(Elf_Shdr_Range Sections) const { +ELFFile::getSectionStringTable(Elf_Shdr_Range Sections, + WarningHandler WarnHandler) const { uint32_t Index = getHeader()->e_shstrndx; if (Index == ELF::SHN_XINDEX) Index = Sections[0].sh_link; if (!Index) // no section string table. return ""; - // TODO: Test a case when the sh_link of the section with index 0 is broken. if (Index >= Sections.size()) return createError("section header string table index " + Twine(Index) + " does not exist"); - return getStringTable(&Sections[Index]); + return getStringTable(&Sections[Index], WarnHandler); } template ELFFile::ELFFile(StringRef Object) : Buf(Object) {} @@ -495,7 +512,8 @@ Expected ELFFile::sections() const { Twine(getHeader()->e_shentsize)); const uint64_t FileSize = Buf.size(); - if (SectionTableOffset + sizeof(Elf_Shdr) > FileSize) + if (SectionTableOffset + sizeof(Elf_Shdr) > FileSize || + SectionTableOffset + (uintX_t)sizeof(Elf_Shdr) < SectionTableOffset) return createError( "section header table goes past the end of the file: e_shoff = 0x" + Twine::utohexstr(SectionTableOffset)); @@ -513,15 +531,22 @@ Expected ELFFile::sections() const { NumSections = First->sh_size; if (NumSections > UINT64_MAX / sizeof(Elf_Shdr)) - // TODO: this error is untested. 
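The WarningHandler callback lets a dumper downgrade non-critical problems to warnings. A sketch of passing a lenient handler to getSectionName(); the function and parameter names are illustrative, and returning Error::success() from the lambda means the warning is ignored and parsing continues, per the contract described above:

  #include "llvm/ADT/Twine.h"
  #include "llvm/Object/ELF.h"
  #include "llvm/Support/WithColor.h"
  using namespace llvm;
  using namespace llvm::object;

  // Resolve a section name, printing (rather than failing on) recoverable
  // inconsistencies such as an unexpected sh_type on the string table.
  template <class ELFT>
  Expected<StringRef> sectionNameLenient(const ELFFile<ELFT> &Obj,
                                         const typename ELFT::Shdr *Shdr) {
    auto Warn = [](const Twine &Msg) -> Error {
      WithColor::warning() << Msg << "\n";
      return Error::success();
    };
    return Obj.getSectionName(Shdr, Warn);
  }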
- return createError("section table goes past the end of file"); + return createError("invalid number of sections specified in the NULL " + "section's sh_size field (" + + Twine(NumSections) + ")"); const uint64_t SectionTableSize = NumSections * sizeof(Elf_Shdr); + if (SectionTableOffset + SectionTableSize < SectionTableOffset) + return createError( + "invalid section header table offset (e_shoff = 0x" + + Twine::utohexstr(SectionTableOffset) + + ") or invalid number of sections specified in the first section " + "header's sh_size field (0x" + + Twine::utohexstr(NumSections) + ")"); // Section table goes past end of file! if (SectionTableOffset + SectionTableSize > FileSize) return createError("section table goes past the end of file"); - return makeArrayRef(First, NumSections); } @@ -540,8 +565,9 @@ template Expected ELFFile::getEntry(const Elf_Shdr *Section, uint32_t Entry) const { if (sizeof(T) != Section->sh_entsize) - // TODO: this error is untested. - return createError("invalid sh_entsize"); + return createError("section " + getSecIndexForError(this, Section) + + " has invalid sh_entsize: expected " + Twine(sizeof(T)) + + ", but got " + Twine(Section->sh_entsize)); size_t Pos = Section->sh_offset + Entry * sizeof(T); if (Pos + sizeof(T) > Buf.size()) return createError("unable to access section " + @@ -560,43 +586,27 @@ ELFFile::getSection(uint32_t Index) const { return object::getSection(*TableOrErr, Index); } -template -Expected -ELFFile::getSection(const StringRef SectionName) const { - auto TableOrErr = sections(); - if (!TableOrErr) - return TableOrErr.takeError(); - for (auto &Sec : *TableOrErr) { - auto SecNameOrErr = getSectionName(&Sec); - if (!SecNameOrErr) - return SecNameOrErr.takeError(); - if (*SecNameOrErr == SectionName) - return &Sec; - } - // TODO: this error is untested. - return createError("invalid section name"); -} - template Expected -ELFFile::getStringTable(const Elf_Shdr *Section) const { +ELFFile::getStringTable(const Elf_Shdr *Section, + WarningHandler WarnHandler) const { if (Section->sh_type != ELF::SHT_STRTAB) - return createError("invalid sh_type for string table section " + - getSecIndexForError(this, Section) + - ": expected SHT_STRTAB, but got " + - object::getELFSectionTypeName(getHeader()->e_machine, - Section->sh_type)); + if (Error E = WarnHandler("invalid sh_type for string table section " + + getSecIndexForError(this, Section) + + ": expected SHT_STRTAB, but got " + + object::getELFSectionTypeName( + getHeader()->e_machine, Section->sh_type))) + return std::move(E); + auto V = getSectionContentsAsArray(Section); if (!V) return V.takeError(); ArrayRef Data = *V; if (Data.empty()) - // TODO: this error is untested. - return createError("empty string table"); + return createError("SHT_STRTAB string table section " + + getSecIndexForError(this, Section) + " is empty"); if (Data.back() != '\0') - return createError(object::getELFSectionTypeName(getHeader()->e_machine, - Section->sh_type) + - " string table section " + + return createError("SHT_STRTAB string table section " + getSecIndexForError(this, Section) + " is non-null terminated"); return StringRef(Data.begin(), Data.size()); @@ -626,8 +636,11 @@ ELFFile::getSHNDXTable(const Elf_Shdr &Section, const Elf_Shdr &SymTable = **SymTableOrErr; if (SymTable.sh_type != ELF::SHT_SYMTAB && SymTable.sh_type != ELF::SHT_DYNSYM) - // TODO: this error is untested. 
- return createError("invalid sh_type"); + return createError("SHT_SYMTAB_SHNDX section is linked with " + + object::getELFSectionTypeName(getHeader()->e_machine, + SymTable.sh_type) + + " section (expected SHT_SYMTAB/SHT_DYNSYM)"); + if (V.size() != (SymTable.sh_size / sizeof(Elf_Sym))) return createError("SHT_SYMTAB_SHNDX section has sh_size (" + Twine(SymTable.sh_size) + @@ -662,11 +675,12 @@ ELFFile::getStringTableForSymtab(const Elf_Shdr &Sec, template Expected -ELFFile::getSectionName(const Elf_Shdr *Section) const { +ELFFile::getSectionName(const Elf_Shdr *Section, + WarningHandler WarnHandler) const { auto SectionsOrErr = sections(); if (!SectionsOrErr) return SectionsOrErr.takeError(); - auto Table = getSectionStringTable(*SectionsOrErr); + auto Table = getSectionStringTable(*SectionsOrErr, WarnHandler); if (!Table) return Table.takeError(); return getSectionName(Section, *Table); diff --git a/include/llvm/Object/ELFObjectFile.h b/include/llvm/Object/ELFObjectFile.h index 86c015efd704..424289a9ccaa 100644 --- a/include/llvm/Object/ELFObjectFile.h +++ b/include/llvm/Object/ELFObjectFile.h @@ -41,7 +41,7 @@ namespace llvm { namespace object { -constexpr int NumElfSymbolTypes = 8; +constexpr int NumElfSymbolTypes = 16; extern const llvm::EnumEntry ElfSymbolTypes[NumElfSymbolTypes]; class elf_symbol_iterator; @@ -239,6 +239,10 @@ public: using Elf_Rela = typename ELFT::Rela; using Elf_Dyn = typename ELFT::Dyn; + SectionRef toSectionRef(const Elf_Shdr *Sec) const { + return SectionRef(toDRI(Sec), this); + } + private: ELFObjectFile(MemoryBufferRef Object, ELFFile EF, const Elf_Shdr *DotDynSymSec, const Elf_Shdr *DotSymtabSec, @@ -284,7 +288,8 @@ protected: relocation_iterator section_rel_begin(DataRefImpl Sec) const override; relocation_iterator section_rel_end(DataRefImpl Sec) const override; std::vector dynamic_relocation_sections() const override; - section_iterator getRelocatedSection(DataRefImpl Sec) const override; + Expected + getRelocatedSection(DataRefImpl Sec) const override; void moveRelocationNext(DataRefImpl &Rel) const override; uint64_t getRelocationOffset(DataRefImpl Rel) const override; @@ -461,13 +466,15 @@ Expected ELFObjectFile::getSymbolName(DataRefImpl Sym) const { if (!SymStrTabOrErr) return SymStrTabOrErr.takeError(); Expected Name = ESym->getName(*SymStrTabOrErr); + if (Name && !Name->empty()) + return Name; // If the symbol name is empty use the section name. 
- if ((!Name || Name->empty()) && ESym->getType() == ELF::STT_SECTION) { - StringRef SecName; - Expected Sec = getSymbolSection(Sym); - if (Sec && !(*Sec)->getName(SecName)) - return SecName; + if (ESym->getType() == ELF::STT_SECTION) { + if (Expected SecOrErr = getSymbolSection(Sym)) { + consumeError(Name.takeError()); + return (*SecOrErr)->getName(); + } } return Name; } @@ -835,7 +842,7 @@ ELFObjectFile::section_rel_end(DataRefImpl Sec) const { } template -section_iterator +Expected ELFObjectFile::getRelocatedSection(DataRefImpl Sec) const { if (EF.getHeader()->e_type != ELF::ET_REL) return section_end(); @@ -845,10 +852,10 @@ ELFObjectFile::getRelocatedSection(DataRefImpl Sec) const { if (Type != ELF::SHT_REL && Type != ELF::SHT_RELA) return section_end(); - auto R = EF.getSection(EShdr->sh_info); - if (!R) - report_fatal_error(errorToErrorCode(R.takeError()).message()); - return section_iterator(SectionRef(toDRI(*R), this)); + Expected SecOrErr = EF.getSection(EShdr->sh_info); + if (!SecOrErr) + return SecOrErr.takeError(); + return section_iterator(SectionRef(toDRI(*SecOrErr), this)); } // Relocations diff --git a/include/llvm/Object/ELFTypes.h b/include/llvm/Object/ELFTypes.h index 5552208b1f8a..7d1ade4d5437 100644 --- a/include/llvm/Object/ELFTypes.h +++ b/include/llvm/Object/ELFTypes.h @@ -248,7 +248,11 @@ template Expected Elf_Sym_Impl::getName(StringRef StrTab) const { uint32_t Offset = this->st_name; if (Offset >= StrTab.size()) - return errorCodeToError(object_error::parse_failed); + return createStringError(object_error::parse_failed, + "st_name (0x%" PRIx32 + ") is past the end of the string table" + " of size 0x%zx", + Offset, StrTab.size()); return StringRef(StrTab.data() + Offset); } diff --git a/include/llvm/Object/MachO.h b/include/llvm/Object/MachO.h index ca9512f21706..76be8049a7d4 100644 --- a/include/llvm/Object/MachO.h +++ b/include/llvm/Object/MachO.h @@ -297,6 +297,7 @@ public: uint64_t getSectionAddress(DataRefImpl Sec) const override; uint64_t getSectionIndex(DataRefImpl Sec) const override; uint64_t getSectionSize(DataRefImpl Sec) const override; + ArrayRef getSectionContents(uint32_t Offset, uint64_t Size) const; Expected> getSectionContents(DataRefImpl Sec) const override; uint64_t getSectionAlignment(DataRefImpl Sec) const override; diff --git a/include/llvm/Object/MachOUniversal.h b/include/llvm/Object/MachOUniversal.h index 5bf724f2c8b2..eb45aff4480b 100644 --- a/include/llvm/Object/MachOUniversal.h +++ b/include/llvm/Object/MachOUniversal.h @@ -31,6 +31,8 @@ class MachOUniversalBinary : public Binary { uint32_t Magic; uint32_t NumberOfObjects; public: + static constexpr uint32_t MaxSectionAlignment = 15; /* 2**15 or 0x8000 */ + class ObjectForArch { const MachOUniversalBinary *Parent; /// Index of object in the universal binary. 
@@ -64,13 +66,13 @@ public: else // Parent->getMagic() == MachO::FAT_MAGIC_64 return Header64.cpusubtype; } - uint32_t getOffset() const { + uint64_t getOffset() const { if (Parent->getMagic() == MachO::FAT_MAGIC) return Header.offset; else // Parent->getMagic() == MachO::FAT_MAGIC_64 return Header64.offset; } - uint32_t getSize() const { + uint64_t getSize() const { if (Parent->getMagic() == MachO::FAT_MAGIC) return Header.size; else // Parent->getMagic() == MachO::FAT_MAGIC_64 @@ -157,8 +159,14 @@ public: return V->isMachOUniversalBinary(); } - Expected> + Expected getObjectForArch(StringRef ArchName) const; + + Expected> + getMachOObjectForArch(StringRef ArchName) const; + + Expected> + getArchiveForArch(StringRef ArchName) const; }; } diff --git a/include/llvm/Object/Minidump.h b/include/llvm/Object/Minidump.h index 470008d552e7..4429493aff45 100644 --- a/include/llvm/Object/Minidump.h +++ b/include/llvm/Object/Minidump.h @@ -11,6 +11,7 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/iterator.h" #include "llvm/BinaryFormat/Minidump.h" #include "llvm/Object/Binary.h" #include "llvm/Support/Error.h" @@ -80,16 +81,65 @@ public: return getListStream(minidump::StreamType::ThreadList); } - /// Returns the list of memory ranges embedded in the MemoryList stream. An - /// error is returned if the file does not contain this stream, or if the - /// stream is not large enough to contain the number of memory descriptors - /// declared in the stream header. The consistency of the MemoryDescriptor - /// entries themselves is not checked in any way. + /// Returns the contents of the Exception stream. An error is returned if the + /// file does not contain this stream, or the stream is smaller than the size + /// of the ExceptionStream structure. The internal consistency of the stream + /// is not checked in any way. + Expected getExceptionStream() const { + return getStream( + minidump::StreamType::Exception); + } + + /// Returns the list of descriptors embedded in the MemoryList stream. The + /// descriptors provide the content of interesting regions of memory at the + /// time the minidump was taken. An error is returned if the file does not + /// contain this stream, or if the stream is not large enough to contain the + /// number of memory descriptors declared in the stream header. The + /// consistency of the MemoryDescriptor entries themselves is not checked in + /// any way. Expected> getMemoryList() const { return getListStream( minidump::StreamType::MemoryList); } + class MemoryInfoIterator + : public iterator_facade_base { + public: + MemoryInfoIterator(ArrayRef Storage, size_t Stride) + : Storage(Storage), Stride(Stride) { + assert(Storage.size() % Stride == 0); + } + + bool operator==(const MemoryInfoIterator &R) const { + return Storage.size() == R.Storage.size(); + } + + const minidump::MemoryInfo &operator*() const { + assert(Storage.size() >= sizeof(minidump::MemoryInfo)); + return *reinterpret_cast(Storage.data()); + } + + MemoryInfoIterator &operator++() { + Storage = Storage.drop_front(Stride); + return *this; + } + + private: + ArrayRef Storage; + size_t Stride; + }; + + /// Returns the list of descriptors embedded in the MemoryInfoList stream. The + /// descriptors provide properties (e.g. permissions) of interesting regions + /// of memory at the time the minidump was taken. 
An error is returned if the + /// file does not contain this stream, or if the stream is not large enough to + /// contain the number of memory descriptors declared in the stream header. + /// The consistency of the MemoryInfoList entries themselves is not checked + /// in any way. + Expected> getMemoryInfoList() const; + private: static Error createError(StringRef Str) { return make_error(Str, object_error::parse_failed); @@ -137,10 +187,10 @@ private: }; template -Expected MinidumpFile::getStream(minidump::StreamType Stream) const { - if (auto OptionalStream = getRawStream(Stream)) { - if (OptionalStream->size() >= sizeof(T)) - return *reinterpret_cast(OptionalStream->data()); +Expected MinidumpFile::getStream(minidump::StreamType Type) const { + if (Optional> Stream = getRawStream(Type)) { + if (Stream->size() >= sizeof(T)) + return *reinterpret_cast(Stream->data()); return createEOFError(); } return createError("No such stream"); @@ -153,10 +203,11 @@ Expected> MinidumpFile::getDataSliceAs(ArrayRef Data, // Check for overflow. if (Count > std::numeric_limits::max() / sizeof(T)) return createEOFError(); - auto ExpectedArray = getDataSlice(Data, Offset, sizeof(T) * Count); - if (!ExpectedArray) - return ExpectedArray.takeError(); - return ArrayRef(reinterpret_cast(ExpectedArray->data()), Count); + Expected> Slice = + getDataSlice(Data, Offset, sizeof(T) * Count); + if (!Slice) + return Slice.takeError(); + return ArrayRef(reinterpret_cast(Slice->data()), Count); } } // end namespace object diff --git a/include/llvm/Object/ObjectFile.h b/include/llvm/Object/ObjectFile.h index 483a3486bd72..adc9dbc189af 100644 --- a/include/llvm/Object/ObjectFile.h +++ b/include/llvm/Object/ObjectFile.h @@ -94,7 +94,7 @@ public: void moveNext(); - std::error_code getName(StringRef &Result) const; + Expected getName() const; uint64_t getAddress() const; uint64_t getIndex() const; uint64_t getSize() const; @@ -130,18 +130,13 @@ public: iterator_range relocations() const { return make_range(relocation_begin(), relocation_end()); } - section_iterator getRelocatedSection() const; + Expected getRelocatedSection() const; DataRefImpl getRawDataRefImpl() const; const ObjectFile *getObject() const; }; struct SectionedAddress { - // TODO: constructors could be removed when C++14 would be adopted. - SectionedAddress() {} - SectionedAddress(uint64_t Addr, uint64_t SectIdx) - : Address(Addr), SectionIndex(SectIdx) {} - const static uint64_t UndefSection = UINT64_MAX; uint64_t Address = 0; @@ -277,7 +272,7 @@ protected: virtual bool isBerkeleyData(DataRefImpl Sec) const; virtual relocation_iterator section_rel_begin(DataRefImpl Sec) const = 0; virtual relocation_iterator section_rel_end(DataRefImpl Sec) const = 0; - virtual section_iterator getRelocatedSection(DataRefImpl Sec) const; + virtual Expected getRelocatedSection(DataRefImpl Sec) const; // Same as above for RelocationRef. 
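Editorial illustration, not part of the patch: the minidump accessors above all funnel through the Expected-returning getStream/getDataSlice helpers. A minimal caller-side sketch, assuming only the interfaces shown in this hunk plus the MemoryDescriptor::StartOfMemoryRange field from llvm/BinaryFormat/Minidump.h; the function name dumpMemoryRanges is hypothetical.

#include "llvm/Object/Minidump.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;
using namespace llvm::object;

static Error dumpMemoryRanges(const MinidumpFile &File) {
  // getMemoryList() yields the MemoryDescriptor entries, or an Error if the
  // MemoryList stream is absent or too small for its declared entry count.
  auto ListOrErr = File.getMemoryList();
  if (!ListOrErr)
    return ListOrErr.takeError();
  for (const minidump::MemoryDescriptor &MD : *ListOrErr)
    outs() << format_hex(MD.StartOfMemoryRange, 18) << "\n";
  return Error::success();
}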
friend class RelocationRef; @@ -434,12 +429,8 @@ inline void SectionRef::moveNext() { return OwningObject->moveSectionNext(SectionPimpl); } -inline std::error_code SectionRef::getName(StringRef &Result) const { - Expected NameOrErr = OwningObject->getSectionName(SectionPimpl); - if (!NameOrErr) - return errorToErrorCode(NameOrErr.takeError()); - Result = *NameOrErr; - return std::error_code(); +inline Expected SectionRef::getName() const { + return OwningObject->getSectionName(SectionPimpl); } inline uint64_t SectionRef::getAddress() const { @@ -510,7 +501,7 @@ inline relocation_iterator SectionRef::relocation_end() const { return OwningObject->section_rel_end(SectionPimpl); } -inline section_iterator SectionRef::getRelocatedSection() const { +inline Expected SectionRef::getRelocatedSection() const { return OwningObject->getRelocatedSection(SectionPimpl); } diff --git a/include/llvm/Object/StackMapParser.h b/include/llvm/Object/StackMapParser.h index ed44efbf80b9..b408f4041034 100644 --- a/include/llvm/Object/StackMapParser.h +++ b/include/llvm/Object/StackMapParser.h @@ -19,7 +19,7 @@ namespace llvm { -/// A parser for the latest stackmap format. At the moment, latest=V2. +/// A parser for the latest stackmap format. At the moment, latest=V3. template class StackMapParser { public: @@ -299,7 +299,7 @@ public: const uint8_t *P; }; - /// Construct a parser for a version-2 stackmap. StackMap data will be read + /// Construct a parser for a version-3 stackmap. StackMap data will be read /// from the given array. StackMapParser(ArrayRef StackMapSection) : StackMapSection(StackMapSection) { diff --git a/include/llvm/Object/TapiFile.h b/include/llvm/Object/TapiFile.h new file mode 100644 index 000000000000..bc2e04e1cc96 --- /dev/null +++ b/include/llvm/Object/TapiFile.h @@ -0,0 +1,60 @@ +//===- TapiFile.h - Text-based Dynamic Library Stub -------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file declares the TapiFile interface. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_OBJECT_TAPI_FILE_H +#define LLVM_OBJECT_TAPI_FILE_H + +#include "llvm/ADT/StringRef.h" +#include "llvm/ADT/iterator_range.h" +#include "llvm/Object/SymbolicFile.h" +#include "llvm/Support/Error.h" +#include "llvm/Support/MemoryBuffer.h" +#include "llvm/TextAPI/MachO/InterfaceFile.h" + +namespace llvm { +namespace object { + +class TapiFile : public SymbolicFile { +public: + TapiFile(MemoryBufferRef Source, const MachO::InterfaceFile &interface, + MachO::Architecture Arch); + ~TapiFile() override; + + void moveSymbolNext(DataRefImpl &DRI) const override; + + Error printSymbolName(raw_ostream &OS, DataRefImpl DRI) const override; + + uint32_t getSymbolFlags(DataRefImpl DRI) const override; + + basic_symbol_iterator symbol_begin() const override; + + basic_symbol_iterator symbol_end() const override; + + static bool classof(const Binary *v) { return v->isTapiFile(); } + +private: + struct Symbol { + StringRef Prefix; + StringRef Name; + uint32_t Flags; + + constexpr Symbol(StringRef Prefix, StringRef Name, uint32_t Flags) + : Prefix(Prefix), Name(Name), Flags(Flags) {} + }; + + std::vector Symbols; +}; + +} // end namespace object. +} // end namespace llvm. 
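Editorial illustration, not part of the patch: SectionRef::getName() now hands back Expected<StringRef> instead of filling an out-parameter and returning std::error_code. A minimal migration sketch for a caller, using only ObjectFile::sections() and the accessor declared above; printSectionNames is a hypothetical helper.

#include "llvm/Object/ObjectFile.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;
using namespace llvm::object;

static void printSectionNames(const ObjectFile &Obj) {
  for (const SectionRef &Sec : Obj.sections()) {
    Expected<StringRef> NameOrErr = Sec.getName();
    if (!NameOrErr) {
      // Errors carried by Expected must be consumed or propagated; silently
      // dropping them, as the old error_code interface allowed, trips the
      // unchecked-Error assertion in builds with assertions enabled.
      consumeError(NameOrErr.takeError());
      continue;
    }
    outs() << *NameOrErr << "\n";
  }
}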
+ +#endif // LLVM_OBJECT_TAPI_FILE_H diff --git a/include/llvm/Object/TapiUniversal.h b/include/llvm/Object/TapiUniversal.h new file mode 100644 index 000000000000..4931183852ad --- /dev/null +++ b/include/llvm/Object/TapiUniversal.h @@ -0,0 +1,109 @@ +//===-- TapiUniversal.h - Text-based Dynamic Library Stub -------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file declares the TapiUniversal interface. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_OBJECT_TAPI_UNIVERSAL_H +#define LLVM_OBJECT_TAPI_UNIVERSAL_H + +#include "llvm/Object/Binary.h" +#include "llvm/Object/TapiFile.h" +#include "llvm/Support/Error.h" +#include "llvm/Support/MemoryBuffer.h" +#include "llvm/TextAPI/MachO/Architecture.h" +#include "llvm/TextAPI/MachO/InterfaceFile.h" + +namespace llvm { +namespace object { + +class TapiUniversal : public Binary { +public: + class ObjectForArch { + const TapiUniversal *Parent; + int Index; + + public: + ObjectForArch(const TapiUniversal *Parent, int Index) + : Parent(Parent), Index(Index) {} + + ObjectForArch getNext() const { return ObjectForArch(Parent, Index + 1); } + + bool operator==(const ObjectForArch &Other) const { + return (Parent == Other.Parent) && (Index == Other.Index); + } + + uint32_t getCPUType() const { + auto Result = + MachO::getCPUTypeFromArchitecture(Parent->Architectures[Index]); + return Result.first; + } + + uint32_t getCPUSubType() const { + auto Result = + MachO::getCPUTypeFromArchitecture(Parent->Architectures[Index]); + return Result.second; + } + + std::string getArchFlagName() const { + return MachO::getArchitectureName(Parent->Architectures[Index]); + } + + Expected> getAsObjectFile() const; + }; + + class object_iterator { + ObjectForArch Obj; + + public: + object_iterator(const ObjectForArch &Obj) : Obj(Obj) {} + const ObjectForArch *operator->() const { return &Obj; } + const ObjectForArch &operator*() const { return Obj; } + + bool operator==(const object_iterator &Other) const { + return Obj == Other.Obj; + } + bool operator!=(const object_iterator &Other) const { + return !(*this == Other); + } + + object_iterator &operator++() { // Preincrement + Obj = Obj.getNext(); + return *this; + } + }; + + TapiUniversal(MemoryBufferRef Source, Error &Err); + static Expected> + create(MemoryBufferRef Source); + ~TapiUniversal() override; + + object_iterator begin_objects() const { return ObjectForArch(this, 0); } + object_iterator end_objects() const { + return ObjectForArch(this, Architectures.size()); + } + + iterator_range objects() const { + return make_range(begin_objects(), end_objects()); + } + + uint32_t getNumberOfObjects() const { return Architectures.size(); } + + // Cast methods. + static bool classof(const Binary *v) { return v->isTapiUniversal(); } + +private: + std::unique_ptr ParsedFile; + std::vector Architectures; +}; + +} // end namespace object. +} // end namespace llvm. 
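Editorial illustration, not part of the patch: a short, hypothetical sketch of how the TapiUniversal interface above would typically be consumed, using only members declared in this header (objects(), getArchFlagName(), getCPUType(), getCPUSubType()).

#include "llvm/Object/TapiUniversal.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;
using namespace llvm::object;

// Hypothetical helper: print one line per architecture slice of a TAPI
// universal stub.
static void listSlices(const TapiUniversal &TU) {
  for (const auto &Slice : TU.objects())
    outs() << Slice.getArchFlagName() << " cputype=" << Slice.getCPUType()
           << " cpusubtype=" << Slice.getCPUSubType() << "\n";
}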
+ +#endif // LLVM_OBJECT_TAPI_UNIVERSAL_H diff --git a/include/llvm/Object/WindowsResource.h b/include/llvm/Object/WindowsResource.h index 356dcb03abba..a0d658491cb9 100644 --- a/include/llvm/Object/WindowsResource.h +++ b/include/llvm/Object/WindowsResource.h @@ -31,6 +31,7 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/BinaryFormat/COFF.h" #include "llvm/Object/Binary.h" +#include "llvm/Object/COFF.h" #include "llvm/Object/Error.h" #include "llvm/Support/BinaryByteStream.h" #include "llvm/Support/BinaryStreamReader.h" @@ -48,6 +49,7 @@ class ScopedPrinter; namespace object { class WindowsResource; +class ResourceSectionRef; const size_t WIN_RES_MAGIC_SIZE = 16; const size_t WIN_RES_NULL_ENTRY_SIZE = 16; @@ -151,8 +153,11 @@ private: class WindowsResourceParser { public: class TreeNode; - WindowsResourceParser(); + WindowsResourceParser(bool MinGW = false); Error parse(WindowsResource *WR, std::vector &Duplicates); + Error parse(ResourceSectionRef &RSR, StringRef Filename, + std::vector &Duplicates); + void cleanUpManifests(std::vector &Duplicates); void printTree(raw_ostream &OS) const; const TreeNode &getTree() const { return Root; } const ArrayRef> getData() const { return Data; } @@ -181,32 +186,38 @@ public: private: friend class WindowsResourceParser; - static uint32_t StringCount; - static uint32_t DataCount; - - static std::unique_ptr createStringNode(); + // Index is the StringTable vector index for this node's name. + static std::unique_ptr createStringNode(uint32_t Index); static std::unique_ptr createIDNode(); + // DataIndex is the Data vector index that the data node points at. static std::unique_ptr createDataNode(uint16_t MajorVersion, uint16_t MinorVersion, uint32_t Characteristics, - uint32_t Origin); + uint32_t Origin, + uint32_t DataIndex); - explicit TreeNode(bool IsStringNode); + explicit TreeNode(uint32_t StringIndex); TreeNode(uint16_t MajorVersion, uint16_t MinorVersion, - uint32_t Characteristics, uint32_t Origin); + uint32_t Characteristics, uint32_t Origin, uint32_t DataIndex); bool addEntry(const ResourceEntryRef &Entry, uint32_t Origin, - bool &IsNewTypeString, bool &IsNewNameString, + std::vector> &Data, + std::vector> &StringTable, TreeNode *&Result); - TreeNode &addTypeNode(const ResourceEntryRef &Entry, bool &IsNewTypeString); - TreeNode &addNameNode(const ResourceEntryRef &Entry, bool &IsNewNameString); + TreeNode &addTypeNode(const ResourceEntryRef &Entry, + std::vector> &StringTable); + TreeNode &addNameNode(const ResourceEntryRef &Entry, + std::vector> &StringTable); bool addLanguageNode(const ResourceEntryRef &Entry, uint32_t Origin, + std::vector> &Data, TreeNode *&Result); bool addDataChild(uint32_t ID, uint16_t MajorVersion, uint16_t MinorVersion, uint32_t Characteristics, uint32_t Origin, - TreeNode *&Result); + uint32_t DataIndex, TreeNode *&Result); TreeNode &addIDChild(uint32_t ID); - TreeNode &addNameChild(ArrayRef NameRef, bool &IsNewString); + TreeNode &addNameChild(ArrayRef NameRef, + std::vector> &StringTable); + void shiftDataIndexDown(uint32_t Index); bool IsDataNode = false; uint32_t StringIndex; @@ -222,12 +233,30 @@ public: uint32_t Origin; }; + struct StringOrID { + bool IsString; + ArrayRef String; + uint32_t ID; + + StringOrID(uint32_t ID) : IsString(false), ID(ID) {} + StringOrID(ArrayRef String) : IsString(true), String(String) {} + }; + private: + Error addChildren(TreeNode &Node, ResourceSectionRef &RSR, + const coff_resource_dir_table &Table, uint32_t Origin, + std::vector &Context, + std::vector &Duplicates); + bool 
shouldIgnoreDuplicate(const ResourceEntryRef &Entry) const; + bool shouldIgnoreDuplicate(const std::vector &Context) const; + TreeNode Root; std::vector> Data; std::vector> StringTable; std::vector InputFilenames; + + bool MinGW; }; Expected> diff --git a/include/llvm/Object/XCOFFObjectFile.h b/include/llvm/Object/XCOFFObjectFile.h index cdee7129a2ab..84073ce5f6cf 100644 --- a/include/llvm/Object/XCOFFObjectFile.h +++ b/include/llvm/Object/XCOFFObjectFile.h @@ -13,23 +13,8 @@ #ifndef LLVM_OBJECT_XCOFFOBJECTFILE_H #define LLVM_OBJECT_XCOFFOBJECTFILE_H -#include "llvm/ADT/StringRef.h" -#include "llvm/ADT/iterator_range.h" -#include "llvm/BinaryFormat/Magic.h" #include "llvm/BinaryFormat/XCOFF.h" -#include "llvm/MC/SubtargetFeature.h" -#include "llvm/Object/Binary.h" -#include "llvm/Object/Error.h" #include "llvm/Object/ObjectFile.h" -#include "llvm/Object/SymbolicFile.h" -#include "llvm/Support/Casting.h" -#include "llvm/Support/Error.h" -#include "llvm/Support/FileSystem.h" -#include "llvm/Support/MemoryBuffer.h" -#include -#include -#include -#include namespace llvm { namespace object { @@ -63,7 +48,7 @@ struct XCOFFFileHeader64 { }; struct XCOFFSectionHeader32 { - char Name[XCOFF::SectionNameSize]; + char Name[XCOFF::NameSize]; support::ubig32_t PhysicalAddress; support::ubig32_t VirtualAddress; support::ubig32_t SectionSize; @@ -78,7 +63,7 @@ struct XCOFFSectionHeader32 { }; struct XCOFFSectionHeader64 { - char Name[XCOFF::SectionNameSize]; + char Name[XCOFF::NameSize]; support::ubig64_t PhysicalAddress; support::ubig64_t VirtualAddress; support::ubig64_t SectionSize; @@ -106,7 +91,7 @@ struct XCOFFSymbolEntry { } CFileLanguageIdAndTypeIdType; union { - char SymbolName[XCOFF::SymbolNameSize]; + char SymbolName[XCOFF::NameSize]; NameInStrTblType NameInStrTbl; }; @@ -127,6 +112,75 @@ struct XCOFFStringTable { const char *Data; }; +struct XCOFFCsectAuxEnt32 { + support::ubig32_t + SectionOrLength; // If the symbol type is XTY_SD or XTY_CM, the csect + // length. + // If the symbol type is XTY_LD, the symbol table + // index of the containing csect. + // If the symbol type is XTY_ER, 0. + support::ubig32_t ParameterHashIndex; + support::ubig16_t TypeChkSectNum; + uint8_t SymbolAlignmentAndType; + XCOFF::StorageMappingClass StorageMappingClass; + support::ubig32_t StabInfoIndex; + support::ubig16_t StabSectNum; +}; + +struct XCOFFFileAuxEnt { + typedef struct { + support::big32_t Magic; // Zero indicates name in string table. + support::ubig32_t Offset; + char NamePad[XCOFF::FileNamePadSize]; + } NameInStrTblType; + union { + char Name[XCOFF::NameSize + XCOFF::FileNamePadSize]; + NameInStrTblType NameInStrTbl; + }; + XCOFF::CFileStringType Type; + uint8_t ReservedZeros[2]; + uint8_t AuxType; // 64-bit XCOFF file only. +}; + +struct XCOFFSectAuxEntForStat { + support::ubig32_t SectionLength; + support::ubig16_t NumberOfRelocEnt; + support::ubig16_t NumberOfLineNum; + uint8_t Pad[10]; +}; + +struct XCOFFRelocation32 { + // Masks for packing/unpacking the r_rsize field of relocations. + + // The msb is used to indicate if the bits being relocated are signed or + // unsigned. + static constexpr uint8_t XR_SIGN_INDICATOR_MASK = 0x80; + + // The 2nd msb is used to indicate that the binder has replaced/modified the + // original instruction. + static constexpr uint8_t XR_FIXUP_INDICATOR_MASK = 0x40; + + // The remaining bits specify the bit length of the relocatable reference + // minus one. 
+ static constexpr uint8_t XR_BIASED_LENGTH_MASK = 0x3f; + +public: + support::ubig32_t VirtualAddress; + support::ubig32_t SymbolIndex; + + // Packed field, see XR_* masks for details of packing. + uint8_t Info; + + XCOFF::RelocationType Type; + +public: + bool isRelocationSigned() const; + bool isFixupIndicated() const; + + // Returns the number of bits being relocated. + uint8_t getRelocatedLength() const; +}; + class XCOFFObjectFile : public ObjectFile { private: const void *FileHeader = nullptr; @@ -146,18 +200,18 @@ private: const XCOFFSectionHeader32 *toSection32(DataRefImpl Ref) const; const XCOFFSectionHeader64 *toSection64(DataRefImpl Ref) const; - void checkSectionAddress(uintptr_t Addr, uintptr_t TableAddr) const; uintptr_t getSectionHeaderTableAddress() const; + uintptr_t getEndOfSymbolTableAddress() const; // This returns a pointer to the start of the storage for the name field of // the 32-bit or 64-bit SectionHeader struct. This string is *not* necessarily // null-terminated. const char *getSectionNameInternal(DataRefImpl Sec) const; - int32_t getSectionFlags(DataRefImpl Sec) const; + // This function returns string table entry. + Expected getStringTableEntry(uint32_t Offset) const; static bool isReservedSectionNumber(int16_t SectionNumber); - Expected getSectionByNum(int16_t Num) const; // Constructor and "create" factory function. The constructor is only a thin // wrapper around the base constructor. The "create" function fills out the @@ -175,6 +229,8 @@ private: friend Expected> ObjectFile::createXCOFFObjectFile(MemoryBufferRef Object, unsigned FileType); + void checkSectionAddress(uintptr_t Addr, uintptr_t TableAddr) const; + public: // Interface inherited from base classes. void moveSymbolNext(DataRefImpl &Symb) const override; @@ -253,15 +309,49 @@ public: uint32_t getLogicalNumberOfSymbolTableEntries32() const; uint32_t getNumberOfSymbolTableEntries64() const; + uint32_t getSymbolIndex(uintptr_t SymEntPtr) const; + Expected getSymbolNameByIndex(uint32_t SymbolTableIndex) const; + Expected getCFileName(const XCOFFFileAuxEnt *CFileEntPtr) const; uint16_t getOptionalHeaderSize() const; uint16_t getFlags() const; // Section header table related interfaces. ArrayRef sections32() const; ArrayRef sections64() const; + + int32_t getSectionFlags(DataRefImpl Sec) const; + Expected getSectionByNum(int16_t Num) const; + + void checkSymbolEntryPointer(uintptr_t SymbolEntPtr) const; + + // Relocation-related interfaces. 
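Editorial illustration, not part of the patch: the comments above describe how XCOFFRelocation32 packs three fields into the single Info byte. The members isRelocationSigned(), isFixupIndicated() and getRelocatedLength() are only declared here and defined elsewhere; the free functions below are a hypothetical decoding of the documented bit layout, not the actual implementation.

#include <cstdint>

namespace {
constexpr uint8_t SignMask = 0x80;   // msb: relocated bits are signed.
constexpr uint8_t FixupMask = 0x40;  // 2nd msb: binder modified the instruction.
constexpr uint8_t LengthMask = 0x3f; // remaining bits: bit length minus one.

bool infoIsSigned(uint8_t Info) { return Info & SignMask; }
bool infoIsFixup(uint8_t Info) { return Info & FixupMask; }
// The stored length is biased by one, so add one to recover the bit count.
uint8_t infoRelocatedLength(uint8_t Info) { return (Info & LengthMask) + 1; }
} // namespace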
+ Expected + getLogicalNumberOfRelocationEntries(const XCOFFSectionHeader32 &Sec) const; + + Expected> + relocations(const XCOFFSectionHeader32 &) const; }; // XCOFFObjectFile +class XCOFFSymbolRef { + const DataRefImpl SymEntDataRef; + const XCOFFObjectFile *const OwningObjectPtr; + +public: + XCOFFSymbolRef(DataRefImpl SymEntDataRef, + const XCOFFObjectFile *OwningObjectPtr) + : SymEntDataRef(SymEntDataRef), OwningObjectPtr(OwningObjectPtr){}; + + XCOFF::StorageClass getStorageClass() const; + uint8_t getNumberOfAuxEntries() const; + const XCOFFCsectAuxEnt32 *getXCOFFCsectAuxEnt32() const; + uint16_t getType() const; + int16_t getSectionNumber() const; + + bool hasCsectAuxEnt() const; + bool isFunction() const; +}; + } // namespace object } // namespace llvm diff --git a/include/llvm/ObjectYAML/DWARFYAML.h b/include/llvm/ObjectYAML/DWARFYAML.h index 78d736c3ef05..525fd9a89242 100644 --- a/include/llvm/ObjectYAML/DWARFYAML.h +++ b/include/llvm/ObjectYAML/DWARFYAML.h @@ -234,7 +234,7 @@ template <> struct MappingTraits { static void mapping(IO &IO, DWARFYAML::InitialLength &DWARF); }; -#define HANDLE_DW_TAG(unused, name, unused2, unused3) \ +#define HANDLE_DW_TAG(unused, name, unused2, unused3, unused4) \ io.enumCase(value, "DW_TAG_" #name, dwarf::DW_TAG_##name); template <> struct ScalarEnumerationTraits { diff --git a/include/llvm/ObjectYAML/ELFYAML.h b/include/llvm/ObjectYAML/ELFYAML.h index f4212516f486..0898a0e7d532 100644 --- a/include/llvm/ObjectYAML/ELFYAML.h +++ b/include/llvm/ObjectYAML/ELFYAML.h @@ -25,6 +25,8 @@ namespace llvm { namespace ELFYAML { +StringRef dropUniqueSuffix(StringRef S); + // These types are invariant across 32/64-bit ELF, so for simplicity just // directly give them their exact sizes. We don't need to worry about // endianness because these are just the types in the YAMLIO structures, @@ -54,8 +56,6 @@ LLVM_YAML_STRONG_TYPEDEF(uint64_t, ELF_SHF) LLVM_YAML_STRONG_TYPEDEF(uint16_t, ELF_SHN) LLVM_YAML_STRONG_TYPEDEF(uint8_t, ELF_STB) LLVM_YAML_STRONG_TYPEDEF(uint8_t, ELF_STT) -LLVM_YAML_STRONG_TYPEDEF(uint8_t, ELF_STV) -LLVM_YAML_STRONG_TYPEDEF(uint8_t, ELF_STO) LLVM_YAML_STRONG_TYPEDEF(uint8_t, MIPS_AFL_REG) LLVM_YAML_STRONG_TYPEDEF(uint8_t, MIPS_ABI_FP) @@ -77,7 +77,7 @@ struct FileHeader { llvm::yaml::Hex64 Entry; Optional SHEntSize; - Optional SHOffset; + Optional SHOff; Optional SHNum; Optional SHStrNdx; }; @@ -107,7 +107,7 @@ struct Symbol { ELF_STB Binding; llvm::yaml::Hex64 Value; llvm::yaml::Hex64 Size; - uint8_t Other; + Optional Other; }; struct SectionOrType { @@ -119,6 +119,11 @@ struct DynamicEntry { llvm::yaml::Hex64 Val; }; +struct StackSizeEntry { + llvm::yaml::Hex64 Address; + llvm::yaml::Hex64 Size; +}; + struct Section { enum class SectionKind { Dynamic, @@ -126,10 +131,14 @@ struct Section { RawContent, Relocation, NoBits, + Hash, Verdef, Verneed, + StackSizes, + SymtabShndxSection, Symver, - MipsABIFlags + MipsABIFlags, + Addrsig }; SectionKind Kind; StringRef Name; @@ -140,16 +149,44 @@ struct Section { llvm::yaml::Hex64 AddressAlign; Optional EntSize; + // Usually sections are not created implicitly, but loaded from YAML. + // When they are, this flag is used to signal about that. + bool IsImplicit; + + Section(SectionKind Kind, bool IsImplicit = false) + : Kind(Kind), IsImplicit(IsImplicit) {} + virtual ~Section(); + + // The following members are used to override section fields which is + // useful for creating invalid objects. + + // This can be used to override the offset stored in the sh_name field. 
+ // It does not affect the name stored in the string table. + Optional ShName; + // This can be used to override the sh_offset field. It does not place the - // section data at the offset specified. Useful for creating invalid objects. + // section data at the offset specified. Optional ShOffset; // This can be used to override the sh_size field. It does not affect the // content written. Optional ShSize; +}; - Section(SectionKind Kind) : Kind(Kind) {} - virtual ~Section(); +struct StackSizesSection : Section { + Optional Content; + Optional Size; + Optional> Entries; + + StackSizesSection() : Section(SectionKind::StackSizes) {} + + static bool classof(const Section *S) { + return S->Kind == SectionKind::StackSizes; + } + + static bool nameMatches(StringRef Name) { + return Name == ".stack_sizes"; + } }; struct DynamicSection : Section { @@ -185,6 +222,17 @@ struct NoBitsSection : Section { } }; +struct HashSection : Section { + Optional Content; + Optional Size; + Optional> Bucket; + Optional> Chain; + + HashSection() : Section(SectionKind::Hash) {} + + static bool classof(const Section *S) { return S->Kind == SectionKind::Hash; } +}; + struct VernauxEntry { uint32_t Hash; uint16_t Flags; @@ -209,6 +257,26 @@ struct VerneedSection : Section { } }; +struct AddrsigSymbol { + AddrsigSymbol(StringRef N) : Name(N), Index(None) {} + AddrsigSymbol(llvm::yaml::Hex32 Ndx) : Name(None), Index(Ndx) {} + AddrsigSymbol() : Name(None), Index(None) {} + + Optional Name; + Optional Index; +}; + +struct AddrsigSection : Section { + Optional Content; + Optional Size; + Optional> Symbols; + + AddrsigSection() : Section(SectionKind::Addrsig) {} + static bool classof(const Section *S) { + return S->Kind == SectionKind::Addrsig; + } +}; + struct SymverSection : Section { std::vector Entries; @@ -269,6 +337,16 @@ struct RelocationSection : Section { } }; +struct SymtabShndxSection : Section { + std::vector Entries; + + SymtabShndxSection() : Section(SectionKind::SymtabShndxSection) {} + + static bool classof(const Section *S) { + return S->Kind == SectionKind::SymtabShndxSection; + } +}; + // Represents .MIPS.abiflags section struct MipsABIFlags : Section { llvm::yaml::Hex16 Version; @@ -298,13 +376,15 @@ struct Object { // cleaner and nicer if we read them from the YAML as a separate // top-level key, which automatically ensures that invariants like there // being a single SHT_SYMTAB section are upheld. 
- std::vector Symbols; + Optional> Symbols; std::vector DynamicSymbols; }; } // end namespace ELFYAML } // end namespace llvm +LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::ELFYAML::AddrsigSymbol) +LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::ELFYAML::StackSizeEntry) LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::ELFYAML::DynamicEntry) LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::ELFYAML::ProgramHeader) LLVM_YAML_IS_SEQUENCE_VECTOR(std::unique_ptr) @@ -380,16 +460,6 @@ struct ScalarEnumerationTraits { static void enumeration(IO &IO, ELFYAML::ELF_STT &Value); }; -template <> -struct ScalarEnumerationTraits { - static void enumeration(IO &IO, ELFYAML::ELF_STV &Value); -}; - -template <> -struct ScalarBitSetTraits { - static void bitset(IO &IO, ELFYAML::ELF_STO &Value); -}; - template <> struct ScalarEnumerationTraits { static void enumeration(IO &IO, ELFYAML::ELF_REL &Value); @@ -450,6 +520,10 @@ struct MappingTraits { static StringRef validate(IO &IO, ELFYAML::Symbol &Symbol); }; +template <> struct MappingTraits { + static void mapping(IO &IO, ELFYAML::StackSizeEntry &Rel); +}; + template <> struct MappingTraits { static void mapping(IO &IO, ELFYAML::DynamicEntry &Rel); }; @@ -466,6 +540,10 @@ template <> struct MappingTraits { static void mapping(IO &IO, ELFYAML::VernauxEntry &E); }; +template <> struct MappingTraits { + static void mapping(IO &IO, ELFYAML::AddrsigSymbol &Sym); +}; + template <> struct MappingTraits { static void mapping(IO &IO, ELFYAML::Relocation &Rel); }; diff --git a/include/llvm/ObjectYAML/MachOYAML.h b/include/llvm/ObjectYAML/MachOYAML.h index d7e1c033f43b..327c3b9f892b 100644 --- a/include/llvm/ObjectYAML/MachOYAML.h +++ b/include/llvm/ObjectYAML/MachOYAML.h @@ -18,6 +18,7 @@ #include "llvm/ADT/StringRef.h" #include "llvm/BinaryFormat/MachO.h" #include "llvm/ObjectYAML/DWARFYAML.h" +#include "llvm/ObjectYAML/YAML.h" #include "llvm/Support/YAMLTraits.h" #include #include @@ -39,6 +40,7 @@ struct Section { llvm::yaml::Hex32 reserved1; llvm::yaml::Hex32 reserved2; llvm::yaml::Hex32 reserved3; + Optional content; }; struct FileHeader { @@ -198,6 +200,7 @@ template <> struct MappingTraits { template <> struct MappingTraits { static void mapping(IO &IO, MachOYAML::Section &Section); + static StringRef validate(IO &io, MachOYAML::Section &Section); }; template <> struct MappingTraits { diff --git a/include/llvm/ObjectYAML/MinidumpYAML.h b/include/llvm/ObjectYAML/MinidumpYAML.h index 39fdd62e017b..c1711a28dd84 100644 --- a/include/llvm/ObjectYAML/MinidumpYAML.h +++ b/include/llvm/ObjectYAML/MinidumpYAML.h @@ -26,6 +26,8 @@ namespace MinidumpYAML { /// from Types to Kinds is fixed and given by the static getKind function. struct Stream { enum class StreamKind { + Exception, + MemoryInfoList, MemoryList, ModuleList, RawContent, @@ -102,6 +104,45 @@ using ModuleListStream = detail::ListStream; using ThreadListStream = detail::ListStream; using MemoryListStream = detail::ListStream; +/// ExceptionStream minidump stream. 
+struct ExceptionStream : public Stream { + minidump::ExceptionStream MDExceptionStream; + yaml::BinaryRef ThreadContext; + + ExceptionStream() + : Stream(StreamKind::Exception, minidump::StreamType::Exception), + MDExceptionStream({}) {} + + explicit ExceptionStream(const minidump::ExceptionStream &MDExceptionStream, + ArrayRef ThreadContext) + : Stream(StreamKind::Exception, minidump::StreamType::Exception), + MDExceptionStream(MDExceptionStream), ThreadContext(ThreadContext) {} + + static bool classof(const Stream *S) { + return S->Kind == StreamKind::Exception; + } +}; + +/// A structure containing the list of MemoryInfo entries comprising a +/// MemoryInfoList stream. +struct MemoryInfoListStream : public Stream { + std::vector Infos; + + MemoryInfoListStream() + : Stream(StreamKind::MemoryInfoList, + minidump::StreamType::MemoryInfoList) {} + + explicit MemoryInfoListStream( + iterator_range Range) + : Stream(StreamKind::MemoryInfoList, + minidump::StreamType::MemoryInfoList), + Infos(Range.begin(), Range.end()) {} + + static bool classof(const Stream *S) { + return S->Kind == StreamKind::MemoryInfoList; + } +}; + /// A minidump stream represented as a sequence of hex bytes. This is used as a /// fallback when no other stream kind is suitable. struct RawContentStream : public Stream { @@ -122,16 +163,16 @@ struct SystemInfoStream : public Stream { minidump::SystemInfo Info; std::string CSDVersion; - explicit SystemInfoStream(const minidump::SystemInfo &Info, - std::string CSDVersion) - : Stream(StreamKind::SystemInfo, minidump::StreamType::SystemInfo), - Info(Info), CSDVersion(std::move(CSDVersion)) {} - SystemInfoStream() : Stream(StreamKind::SystemInfo, minidump::StreamType::SystemInfo) { memset(&Info, 0, sizeof(Info)); } + explicit SystemInfoStream(const minidump::SystemInfo &Info, + std::string CSDVersion) + : Stream(StreamKind::SystemInfo, minidump::StreamType::SystemInfo), + Info(Info), CSDVersion(std::move(CSDVersion)) {} + static bool classof(const Stream *S) { return S->Kind == StreamKind::SystemInfo; } @@ -177,12 +218,6 @@ struct Object { static Expected create(const object::MinidumpFile &File); }; -/// Serialize the minidump file represented by Obj to OS in binary form. -void writeAsBinary(Object &Obj, raw_ostream &OS); - -/// Serialize the yaml string as a minidump file to OS in binary form. 
-Error writeAsBinary(StringRef Yaml, raw_ostream &OS); - } // namespace MinidumpYAML namespace yaml { @@ -213,6 +248,10 @@ template <> struct MappingContextTraits { } // namespace llvm +LLVM_YAML_DECLARE_BITSET_TRAITS(llvm::minidump::MemoryProtection) +LLVM_YAML_DECLARE_BITSET_TRAITS(llvm::minidump::MemoryState) +LLVM_YAML_DECLARE_BITSET_TRAITS(llvm::minidump::MemoryType) + LLVM_YAML_DECLARE_ENUM_TRAITS(llvm::minidump::ProcessorArchitecture) LLVM_YAML_DECLARE_ENUM_TRAITS(llvm::minidump::OSPlatform) LLVM_YAML_DECLARE_ENUM_TRAITS(llvm::minidump::StreamType) @@ -220,6 +259,8 @@ LLVM_YAML_DECLARE_ENUM_TRAITS(llvm::minidump::StreamType) LLVM_YAML_DECLARE_MAPPING_TRAITS(llvm::minidump::CPUInfo::ArmInfo) LLVM_YAML_DECLARE_MAPPING_TRAITS(llvm::minidump::CPUInfo::OtherInfo) LLVM_YAML_DECLARE_MAPPING_TRAITS(llvm::minidump::CPUInfo::X86Info) +LLVM_YAML_DECLARE_MAPPING_TRAITS(llvm::minidump::Exception) +LLVM_YAML_DECLARE_MAPPING_TRAITS(llvm::minidump::MemoryInfo) LLVM_YAML_DECLARE_MAPPING_TRAITS(llvm::minidump::VSFixedFileInfo) LLVM_YAML_DECLARE_MAPPING_TRAITS( @@ -233,6 +274,7 @@ LLVM_YAML_IS_SEQUENCE_VECTOR(std::unique_ptr) LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::MinidumpYAML::MemoryListStream::entry_type) LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::MinidumpYAML::ModuleListStream::entry_type) LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::MinidumpYAML::ThreadListStream::entry_type) +LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::minidump::MemoryInfo) LLVM_YAML_DECLARE_MAPPING_TRAITS(llvm::MinidumpYAML::Object) diff --git a/include/llvm/ObjectYAML/WasmYAML.h b/include/llvm/ObjectYAML/WasmYAML.h index 2411dc7ac17d..15a8cc215020 100644 --- a/include/llvm/ObjectYAML/WasmYAML.h +++ b/include/llvm/ObjectYAML/WasmYAML.h @@ -145,7 +145,7 @@ struct Signature { uint32_t Index; SignatureForm Form = wasm::WASM_TYPE_FUNC; std::vector ParamTypes; - ValueType ReturnType; + std::vector ReturnTypes; }; struct SymbolInfo { diff --git a/include/llvm/ObjectYAML/yaml2obj.h b/include/llvm/ObjectYAML/yaml2obj.h new file mode 100644 index 000000000000..386551337d86 --- /dev/null +++ b/include/llvm/ObjectYAML/yaml2obj.h @@ -0,0 +1,67 @@ +//===--- yaml2obj.h - -------------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// \file +/// Common declarations for yaml2obj +//===----------------------------------------------------------------------===// +#ifndef LLVM_TOOLS_YAML2OBJ_YAML2OBJ_H +#define LLVM_TOOLS_YAML2OBJ_YAML2OBJ_H + +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/Error.h" +#include + +namespace llvm { +class raw_ostream; +template class SmallVectorImpl; +template class Expected; + +namespace object { +class ObjectFile; +} + +namespace COFFYAML { +struct Object; +} + +namespace ELFYAML { +struct Object; +} + +namespace MinidumpYAML { +struct Object; +} + +namespace WasmYAML { +struct Object; +} + +namespace yaml { +class Input; +struct YamlObjectFile; + +using ErrorHandler = llvm::function_ref; + +bool yaml2coff(COFFYAML::Object &Doc, raw_ostream &Out, ErrorHandler EH); +bool yaml2elf(ELFYAML::Object &Doc, raw_ostream &Out, ErrorHandler EH); +bool yaml2macho(YamlObjectFile &Doc, raw_ostream &Out, ErrorHandler EH); +bool yaml2minidump(MinidumpYAML::Object &Doc, raw_ostream &Out, + ErrorHandler EH); +bool yaml2wasm(WasmYAML::Object &Doc, raw_ostream &Out, ErrorHandler EH); + +bool convertYAML(Input &YIn, raw_ostream &Out, ErrorHandler ErrHandler, + unsigned DocNum = 1); + +/// Convenience function for tests. +std::unique_ptr +yaml2ObjectFile(SmallVectorImpl &Storage, StringRef Yaml, + ErrorHandler ErrHandler); + +} // namespace yaml +} // namespace llvm + +#endif diff --git a/include/llvm/Pass.h b/include/llvm/Pass.h index 329f7eaba73d..1d53ae32cf37 100644 --- a/include/llvm/Pass.h +++ b/include/llvm/Pass.h @@ -306,6 +306,9 @@ protected: }; //===----------------------------------------------------------------------===// +/// Deprecated - do not create new passes as BasicBlockPasses. Use FunctionPass +/// with a loop over the BasicBlocks instead. +// /// BasicBlockPass class - This class is used to implement most local /// optimizations. Optimizations should subclass this class if they /// meet the following constraints: @@ -338,6 +341,8 @@ public: /// do any post processing needed after all passes have run. virtual bool doFinalization(Function &); + void preparePassManager(PMStack &PMS) override; + void assignPassManager(PMStack &PMS, PassManagerType T) override; /// Return what kind of Pass Manager can manage this pass. diff --git a/include/llvm/Passes/PassBuilder.h b/include/llvm/Passes/PassBuilder.h index 5e6660599f93..f73e4b42dd4b 100644 --- a/include/llvm/Passes/PassBuilder.h +++ b/include/llvm/Passes/PassBuilder.h @@ -629,6 +629,12 @@ public: TopLevelPipelineParsingCallbacks.push_back(C); } + /// Add PGOInstrumenation passes for O0 only. 
+ void addPGOInstrPassesForO0(ModulePassManager &MPM, bool DebugLogging, + bool RunProfileGen, bool IsCS, + std::string ProfileFile, + std::string ProfileRemappingFile); + private: static Optional> parsePipelineText(StringRef Text); @@ -660,7 +666,6 @@ private: OptimizationLevel Level, bool RunProfileGen, bool IsCS, std::string ProfileFile, std::string ProfileRemappingFile); - void invokePeepholeEPCallbacks(FunctionPassManager &, OptimizationLevel); // Extension Point callbacks diff --git a/include/llvm/ProfileData/Coverage/CoverageMapping.h b/include/llvm/ProfileData/Coverage/CoverageMapping.h index 11758ac4cf2f..0dd0c7ec8065 100644 --- a/include/llvm/ProfileData/Coverage/CoverageMapping.h +++ b/include/llvm/ProfileData/Coverage/CoverageMapping.h @@ -301,7 +301,12 @@ public: struct FunctionRecord { /// Raw function name. std::string Name; - /// Associated files. + /// Mapping from FileID (i.e. vector index) to filename. Used to support + /// macro expansions within a function in which the macro and function are + /// defined in separate files. + /// + /// TODO: Uniquing filenames across all function records may be a performance + /// optimization. std::vector Filenames; /// Regions in the function along with their counts. std::vector CountedRegions; @@ -508,6 +513,7 @@ public: class CoverageMapping { DenseMap> RecordProvenance; std::vector Functions; + DenseMap> FilenameHash2RecordIndices; std::vector> FuncHashMismatches; CoverageMapping() = default; @@ -516,6 +522,13 @@ class CoverageMapping { Error loadFunctionRecord(const CoverageMappingRecord &Record, IndexedInstrProfReader &ProfileReader); + /// Look up the indices for function records which are at least partially + /// defined in the specified file. This is guaranteed to return a superset of + /// such records: extra records not in the file may be included if there is + /// a hash collision on the filename. Clients must be robust to collisions. + ArrayRef + getImpreciseRecordIndicesForFilename(StringRef Filename) const; + public: CoverageMapping(const CoverageMapping &) = delete; CoverageMapping &operator=(const CoverageMapping &) = delete; @@ -527,6 +540,7 @@ public: /// Load the coverage mapping from the given object files and profile. If /// \p Arches is non-empty, it must specify an architecture for each object. + /// Ignores non-instrumented object files unless all are not instrumented. static Expected> load(ArrayRef ObjectFilenames, StringRef ProfileFilename, ArrayRef Arches = None); diff --git a/include/llvm/ProfileData/Coverage/CoverageMappingWriter.h b/include/llvm/ProfileData/Coverage/CoverageMappingWriter.h index 5f88cacdfcbb..6fcd8a09a494 100644 --- a/include/llvm/ProfileData/Coverage/CoverageMappingWriter.h +++ b/include/llvm/ProfileData/Coverage/CoverageMappingWriter.h @@ -30,8 +30,7 @@ class CoverageFilenamesSectionWriter { ArrayRef Filenames; public: - CoverageFilenamesSectionWriter(ArrayRef Filenames) - : Filenames(Filenames) {} + CoverageFilenamesSectionWriter(ArrayRef Filenames); /// Write encoded filenames to the given output stream. 
void write(raw_ostream &OS); diff --git a/include/llvm/ProfileData/InstrProf.h b/include/llvm/ProfileData/InstrProf.h index c7d764ade30d..c26f76949992 100644 --- a/include/llvm/ProfileData/InstrProf.h +++ b/include/llvm/ProfileData/InstrProf.h @@ -93,10 +93,6 @@ inline StringRef getInstrProfValuesVarPrefix() { return "__profvp_"; } /// Return the name of value profile node array variables: inline StringRef getInstrProfVNodesVarName() { return "__llvm_prf_vnodes"; } -/// Return the name prefix of the COMDAT group for instrumentation variables -/// associated with a COMDAT function. -inline StringRef getInstrProfComdatPrefix() { return "__profv_"; } - /// Return the name of the variable holding the strings (possibly compressed) /// of all function's PGO names. inline StringRef getInstrProfNamesVarName() { @@ -634,8 +630,8 @@ struct OverlapStats { FuncHash = Hash; } - Error accumuateCounts(const std::string &BaseFilename, - const std::string &TestFilename, bool IsCS); + Error accumulateCounts(const std::string &BaseFilename, + const std::string &TestFilename, bool IsCS); void addOneMismatch(const CountSumOrPercent &MismatchFunc); void addOneUnique(const CountSumOrPercent &UniqueFunc); @@ -695,7 +691,7 @@ struct InstrProfRecord { InstrProfRecord(const InstrProfRecord &RHS) : Counts(RHS.Counts), ValueData(RHS.ValueData - ? llvm::make_unique(*RHS.ValueData) + ? std::make_unique(*RHS.ValueData) : nullptr) {} InstrProfRecord &operator=(InstrProfRecord &&) = default; InstrProfRecord &operator=(const InstrProfRecord &RHS) { @@ -705,7 +701,7 @@ struct InstrProfRecord { return *this; } if (!ValueData) - ValueData = llvm::make_unique(*RHS.ValueData); + ValueData = std::make_unique(*RHS.ValueData); else *ValueData = *RHS.ValueData; return *this; @@ -772,7 +768,7 @@ struct InstrProfRecord { void clearValueData() { ValueData = nullptr; } /// Compute the sums of all counts and store in Sum. - void accumuateCounts(CountSumOrPercent &Sum) const; + void accumulateCounts(CountSumOrPercent &Sum) const; /// Compute the overlap b/w this IntrprofRecord and Other. void overlap(InstrProfRecord &Other, OverlapStats &Overlap, @@ -817,7 +813,7 @@ private: std::vector & getOrCreateValueSitesForKind(uint32_t ValueKind) { if (!ValueData) - ValueData = llvm::make_unique(); + ValueData = std::make_unique(); switch (ValueKind) { case IPVK_IndirectCallTarget: return ValueData->IndirectCallSites; @@ -897,7 +893,7 @@ InstrProfRecord::getValueForSite(uint32_t ValueKind, uint32_t Site, return std::unique_ptr(nullptr); } - auto VD = llvm::make_unique(N); + auto VD = std::make_unique(N); TotalCount = getValueForSite(VD.get(), ValueKind, Site); return VD; diff --git a/include/llvm/ProfileData/InstrProfReader.h b/include/llvm/ProfileData/InstrProfReader.h index 73751faab88e..f5f552672bf0 100644 --- a/include/llvm/ProfileData/InstrProfReader.h +++ b/include/llvm/ProfileData/InstrProfReader.h @@ -92,7 +92,7 @@ public: virtual InstrProfSymtab &getSymtab() = 0; /// Compute the sum of counts and return in Sum. - void accumuateCounts(CountSumOrPercent &Sum, bool IsCS); + void accumulateCounts(CountSumOrPercent &Sum, bool IsCS); protected: std::unique_ptr Symtab; @@ -268,8 +268,14 @@ private: return (const char *)ValueDataStart; } - const uint64_t *getCounter(IntPtrT CounterPtr) const { - ptrdiff_t Offset = (swap(CounterPtr) - CountersDelta) / sizeof(uint64_t); + /// Get the offset of \p CounterPtr from the start of the counters section of + /// the profile. The offset has units of "number of counters", i.e. 
increasing + /// the offset by 1 corresponds to an increase in the *byte offset* by 8. + ptrdiff_t getCounterOffset(IntPtrT CounterPtr) const { + return (swap(CounterPtr) - CountersDelta) / sizeof(uint64_t); + } + + const uint64_t *getCounter(ptrdiff_t Offset) const { return CountersStart + Offset; } diff --git a/include/llvm/ProfileData/SampleProf.h b/include/llvm/ProfileData/SampleProf.h index 7fbc857b7230..55418d9d0f9c 100644 --- a/include/llvm/ProfileData/SampleProf.h +++ b/include/llvm/ProfileData/SampleProf.h @@ -18,15 +18,18 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringMap.h" #include "llvm/ADT/StringRef.h" +#include "llvm/ADT/StringSet.h" #include "llvm/IR/Function.h" #include "llvm/IR/GlobalValue.h" #include "llvm/IR/Module.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorOr.h" #include "llvm/Support/MathExtras.h" +#include "llvm/Support/raw_ostream.h" #include #include #include +#include #include #include #include @@ -49,7 +52,10 @@ enum class sampleprof_error { truncated_name_table, not_implemented, counter_overflow, - ostream_seek_unsupported + ostream_seek_unsupported, + compress_failed, + uncompress_failed, + zlib_unavailable }; inline std::error_code make_error_code(sampleprof_error E) { @@ -83,6 +89,7 @@ enum SampleProfileFormat { SPF_Text = 0x1, SPF_Compact_Binary = 0x2, SPF_GCC = 0x3, + SPF_Ext_Binary = 0x4, SPF_Binary = 0xff }; @@ -105,6 +112,61 @@ static inline StringRef getRepInFormat(StringRef Name, static inline uint64_t SPVersion() { return 103; } +// Section Type used by SampleProfileExtBinaryBaseReader and +// SampleProfileExtBinaryBaseWriter. Never change the existing +// value of enum. Only append new ones. +enum SecType { + SecInValid = 0, + SecProfSummary = 1, + SecNameTable = 2, + SecProfileSymbolList = 3, + SecFuncOffsetTable = 4, + // marker for the first type of profile. + SecFuncProfileFirst = 32, + SecLBRProfile = SecFuncProfileFirst +}; + +static inline std::string getSecName(SecType Type) { + switch (Type) { + case SecInValid: + return "InvalidSection"; + case SecProfSummary: + return "ProfileSummarySection"; + case SecNameTable: + return "NameTableSection"; + case SecProfileSymbolList: + return "ProfileSymbolListSection"; + case SecFuncOffsetTable: + return "FuncOffsetTableSection"; + case SecLBRProfile: + return "LBRProfileSection"; + } + llvm_unreachable("A SecType has no name for output"); +} + +// Entry type of section header table used by SampleProfileExtBinaryBaseReader +// and SampleProfileExtBinaryBaseWriter. +struct SecHdrTableEntry { + SecType Type; + uint64_t Flags; + uint64_t Offset; + uint64_t Size; +}; + +enum SecFlags { SecFlagInValid = 0, SecFlagCompress = (1 << 0) }; + +static inline void addSecFlags(SecHdrTableEntry &Entry, uint64_t Flags) { + Entry.Flags |= Flags; +} + +static inline void removeSecFlags(SecHdrTableEntry &Entry, uint64_t Flags) { + Entry.Flags &= ~Flags; +} + +static inline bool hasSecFlag(SecHdrTableEntry &Entry, SecFlags Flag) { + return Entry.Flags & Flag; +} + /// Represents the relative location of an instruction. /// /// Instruction locations are specified by the line offset from the @@ -143,8 +205,18 @@ raw_ostream &operator<<(raw_ostream &OS, const LineLocation &Loc); /// will be a list of one or more functions. 
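Editorial illustration, not part of the patch: the extensible-binary profile machinery above keys per-section behaviour off SecHdrTableEntry::Flags. A small sketch of how the flag helpers compose; the offset and size values are invented and makeCompressedLBREntry is hypothetical.

#include "llvm/ProfileData/SampleProf.h"
#include <cassert>

using namespace llvm::sampleprof;

static SecHdrTableEntry makeCompressedLBREntry(uint64_t Offset, uint64_t Size) {
  SecHdrTableEntry Entry = {SecLBRProfile, SecFlagInValid, Offset, Size};
  // Mark the section payload as compressed; a reader would check this flag
  // before attempting to decompress the section bytes.
  addSecFlags(Entry, SecFlagCompress);
  assert(hasSecFlag(Entry, SecFlagCompress));
  return Entry;
}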
class SampleRecord { public: - using CallTargetMap = StringMap; + using CallTarget = std::pair; + struct CallTargetComparator { + bool operator()(const CallTarget &LHS, const CallTarget &RHS) const { + if (LHS.second != RHS.second) + return LHS.second > RHS.second; + + return LHS.first < RHS.first; + } + }; + using SortedCallTargetSet = std::set; + using CallTargetMap = StringMap; SampleRecord() = default; /// Increment the number of samples for this record by \p S. @@ -179,6 +251,18 @@ public: uint64_t getSamples() const { return NumSamples; } const CallTargetMap &getCallTargets() const { return CallTargets; } + const SortedCallTargetSet getSortedCallTargets() const { + return SortCallTargets(CallTargets); + } + + /// Sort call targets in descending order of call frequency. + static const SortedCallTargetSet SortCallTargets(const CallTargetMap &Targets) { + SortedCallTargetSet SortedTargets; + for (const auto &I : Targets) { + SortedTargets.emplace(I.first(), I.second); + } + return SortedTargets; + } /// Merge the samples in \p Other into this record. /// Optionally scale sample counts by \p Weight. @@ -205,7 +289,7 @@ class FunctionSamples; using BodySampleMap = std::map; // NOTE: Using a StringMap here makes parsed profiles consume around 17% more // memory, which is *very* significant for large profiles. -using FunctionSamplesMap = std::map; +using FunctionSamplesMap = std::map>; using CallsiteSampleMap = std::map; /// Representation of the samples collected for a function. @@ -447,11 +531,10 @@ public: StringRef getNameInModule(StringRef Name, const Module *M) const { if (Format != SPF_Compact_Binary) return Name; - // Expect CurrentModule to be initialized by GUIDToFuncNameMapper. - if (M != CurrentModule) - llvm_unreachable("Input Module should be the same as CurrentModule"); - auto iter = GUIDToFuncNameMap.find(std::stoull(Name.data())); - if (iter == GUIDToFuncNameMap.end()) + + assert(GUIDToFuncNameMap && "GUIDToFuncNameMap needs to be popluated first"); + auto iter = GUIDToFuncNameMap->find(std::stoull(Name.data())); + if (iter == GUIDToFuncNameMap->end()) return StringRef(); return iter->second; } @@ -472,42 +555,10 @@ public: const FunctionSamples *findFunctionSamples(const DILocation *DIL) const; static SampleProfileFormat Format; - /// GUIDToFuncNameMap saves the mapping from GUID to the symbol name, for - /// all the function symbols defined or declared in CurrentModule. - static DenseMap GUIDToFuncNameMap; - static Module *CurrentModule; - - class GUIDToFuncNameMapper { - public: - GUIDToFuncNameMapper(Module &M) { - if (Format != SPF_Compact_Binary) - return; - - for (const auto &F : M) { - StringRef OrigName = F.getName(); - GUIDToFuncNameMap.insert({Function::getGUID(OrigName), OrigName}); - /// Local to global var promotion used by optimization like thinlto - /// will rename the var and add suffix like ".llvm.xxx" to the - /// original local name. In sample profile, the suffixes of function - /// names are all stripped. Since it is possible that the mapper is - /// built in post-thin-link phase and var promotion has been done, - /// we need to add the substring of function name without the suffix - /// into the GUIDToFuncNameMap. 
- StringRef CanonName = getCanonicalFnName(F); - if (CanonName != OrigName) - GUIDToFuncNameMap.insert({Function::getGUID(CanonName), CanonName}); - } - CurrentModule = &M; - } - - ~GUIDToFuncNameMapper() { - if (Format != SPF_Compact_Binary) - return; - GUIDToFuncNameMap.clear(); - CurrentModule = nullptr; - } - }; + /// GUIDToFuncNameMap saves the mapping from GUID to the symbol name, for + /// all the function symbols defined or declared in current module. + DenseMap *GUIDToFuncNameMap = nullptr; // Assume the input \p Name is a name coming from FunctionSamples itself. // If the format is SPF_Compact_Binary, the name is already a GUID and we @@ -583,6 +634,47 @@ private: SamplesWithLocList V; }; +/// ProfileSymbolList records the list of function symbols shown up +/// in the binary used to generate the profile. It is useful to +/// to discriminate a function being so cold as not to shown up +/// in the profile and a function newly added. +class ProfileSymbolList { +public: + /// copy indicates whether we need to copy the underlying memory + /// for the input Name. + void add(StringRef Name, bool copy = false) { + if (!copy) { + Syms.insert(Name); + return; + } + Syms.insert(Name.copy(Allocator)); + } + + bool contains(StringRef Name) { return Syms.count(Name); } + + void merge(const ProfileSymbolList &List) { + for (auto Sym : List.Syms) + add(Sym, true); + } + + unsigned size() { return Syms.size(); } + + void setToCompress(bool TC) { ToCompress = TC; } + bool toCompress() { return ToCompress; } + + std::error_code read(const uint8_t *Data, uint64_t ListSize); + std::error_code write(raw_ostream &OS); + void dump(raw_ostream &OS = dbgs()) const; + +private: + // Determine whether or not to compress the symbol list when + // writing it into profile. The variable is unused when the symbol + // list is read from an existing profile. + bool ToCompress = false; + DenseSet Syms; + BumpPtrAllocator Allocator; +}; + } // end namespace sampleprof } // end namespace llvm diff --git a/include/llvm/ProfileData/SampleProfReader.h b/include/llvm/ProfileData/SampleProfReader.h index 969cdea859c9..5a5d4cfde224 100644 --- a/include/llvm/ProfileData/SampleProfReader.h +++ b/include/llvm/ProfileData/SampleProfReader.h @@ -235,6 +235,62 @@ class raw_ostream; namespace sampleprof { +class SampleProfileReader; + +/// SampleProfileReaderItaniumRemapper remaps the profile data from a +/// sample profile data reader, by applying a provided set of equivalences +/// between components of the symbol names in the profile. +class SampleProfileReaderItaniumRemapper { +public: + SampleProfileReaderItaniumRemapper(std::unique_ptr B, + std::unique_ptr SRR, + SampleProfileReader &R) + : Buffer(std::move(B)), Remappings(std::move(SRR)), Reader(R) { + assert(Remappings && "Remappings cannot be nullptr"); + } + + /// Create a remapper from the given remapping file. The remapper will + /// be used for profile read in by Reader. + static ErrorOr> + create(const std::string Filename, SampleProfileReader &Reader, + LLVMContext &C); + + /// Create a remapper from the given Buffer. The remapper will + /// be used for profile read in by Reader. + static ErrorOr> + create(std::unique_ptr &B, SampleProfileReader &Reader, + LLVMContext &C); + + /// Apply remappings to the profile read by Reader. + void applyRemapping(LLVMContext &Ctx); + + bool hasApplied() { return RemappingApplied; } + + /// Insert function name into remapper. 
+ void insert(StringRef FunctionName) { Remappings->insert(FunctionName); } + + /// Query whether there is equivalent in the remapper which has been + /// inserted. + bool exist(StringRef FunctionName) { + return Remappings->lookup(FunctionName); + } + + /// Return the samples collected for function \p F if remapper knows + /// it is present in SampleMap. + FunctionSamples *getSamplesFor(StringRef FunctionName); + +private: + // The buffer holding the content read from remapping file. + std::unique_ptr Buffer; + std::unique_ptr Remappings; + DenseMap SampleMap; + // The Reader the remapper is servicing. + SampleProfileReader &Reader; + // Indicate whether remapping has been applied to the profile read + // by Reader -- by calling applyRemapping. + bool RemappingApplied = false; +}; + /// Sample-based profile reader. /// /// Each profile contains sample counts for all the functions @@ -273,13 +329,22 @@ public: /// Read and validate the file header. virtual std::error_code readHeader() = 0; - /// Read sample profiles from the associated file. - virtual std::error_code read() = 0; + /// The interface to read sample profiles from the associated file. + std::error_code read() { + if (std::error_code EC = readImpl()) + return EC; + if (Remapper) + Remapper->applyRemapping(Ctx); + return sampleprof_error::success; + } + + /// The implementaion to read sample profiles from the associated file. + virtual std::error_code readImpl() = 0; /// Print the profile for \p FName on stream \p OS. void dumpFunctionProfile(StringRef FName, raw_ostream &OS = dbgs()); - virtual void collectFuncsToUse(const Module &M) {} + virtual void collectFuncsFrom(const Module &M) {} /// Print all the profiles on stream \p OS. void dump(raw_ostream &OS = dbgs()); @@ -295,6 +360,10 @@ public: /// Return the samples collected for function \p F. virtual FunctionSamples *getSamplesFor(StringRef Fname) { + if (Remapper) { + if (auto FS = Remapper->getSamplesFor(Fname)) + return FS; + } std::string FGUID; Fname = getRepInFormat(Fname, getFormat(), FGUID); auto It = Profiles.find(Fname); @@ -313,18 +382,33 @@ public: } /// Create a sample profile reader appropriate to the file format. + /// Create a remapper underlying if RemapFilename is not empty. static ErrorOr> - create(const Twine &Filename, LLVMContext &C); + create(const std::string Filename, LLVMContext &C, + const std::string RemapFilename = ""); /// Create a sample profile reader from the supplied memory buffer. + /// Create a remapper underlying if RemapFilename is not empty. static ErrorOr> - create(std::unique_ptr &B, LLVMContext &C); + create(std::unique_ptr &B, LLVMContext &C, + const std::string RemapFilename = ""); /// Return the profile summary. - ProfileSummary &getSummary() { return *(Summary.get()); } + ProfileSummary &getSummary() const { return *(Summary.get()); } + + MemoryBuffer *getBuffer() const { return Buffer.get(); } /// \brief Return the profile format. - SampleProfileFormat getFormat() { return Format; } + SampleProfileFormat getFormat() const { return Format; } + + virtual std::unique_ptr getProfileSymbolList() { + return nullptr; + }; + + /// It includes all the names that have samples either in outline instance + /// or inline instance. + virtual std::vector *getNameTable() { return nullptr; } + virtual bool dumpSectionInfo(raw_ostream &OS = dbgs()) { return false; }; protected: /// Map every function to its associated profile. @@ -352,6 +436,8 @@ protected: /// Compute summary for this profile. 
void computeSummary(); + std::unique_ptr Remapper; + /// \brief The format of sample. SampleProfileFormat Format = SPF_None; }; @@ -365,7 +451,7 @@ public: std::error_code readHeader() override { return sampleprof_error::success; } /// Read sample profiles from the associated file. - std::error_code read() override; + std::error_code readImpl() override; /// Return true if \p Buffer is in the format supported by this class. static bool hasFormat(const MemoryBuffer &Buffer); @@ -381,7 +467,11 @@ public: virtual std::error_code readHeader() override; /// Read sample profiles from the associated file. - std::error_code read() override; + std::error_code readImpl() override; + + /// It includes all the names that have samples either in outline instance + /// or inline instance. + virtual std::vector *getNameTable() override { return &NameTable; } protected: /// Read a numeric value of type T from the profile. @@ -411,46 +501,134 @@ protected: bool at_eof() const { return Data >= End; } /// Read the next function profile instance. - std::error_code readFuncProfile(); + std::error_code readFuncProfile(const uint8_t *Start); /// Read the contents of the given profile instance. std::error_code readProfile(FunctionSamples &FProfile); + /// Read the contents of Magic number and Version number. + std::error_code readMagicIdent(); + + /// Read profile summary. + std::error_code readSummary(); + + /// Read the whole name table. + virtual std::error_code readNameTable(); + /// Points to the current location in the buffer. const uint8_t *Data = nullptr; /// Points to the end of the buffer. const uint8_t *End = nullptr; + /// Function name table. + std::vector NameTable; + + /// Read a string indirectly via the name table. + virtual ErrorOr readStringFromTable(); + private: std::error_code readSummaryEntry(std::vector &Entries); virtual std::error_code verifySPMagic(uint64_t Magic) = 0; +}; - /// Read profile summary. - std::error_code readSummary(); +class SampleProfileReaderRawBinary : public SampleProfileReaderBinary { +private: + virtual std::error_code verifySPMagic(uint64_t Magic) override; - /// Read the whole name table. - virtual std::error_code readNameTable() = 0; +public: + SampleProfileReaderRawBinary(std::unique_ptr B, LLVMContext &C, + SampleProfileFormat Format = SPF_Binary) + : SampleProfileReaderBinary(std::move(B), C, Format) {} - /// Read a string indirectly via the name table. - virtual ErrorOr readStringFromTable() = 0; + /// \brief Return true if \p Buffer is in the format supported by this class. + static bool hasFormat(const MemoryBuffer &Buffer); }; -class SampleProfileReaderRawBinary : public SampleProfileReaderBinary { +/// SampleProfileReaderExtBinaryBase/SampleProfileWriterExtBinaryBase defines +/// the basic structure of the extensible binary format. +/// The format is organized in sections except the magic and version number +/// at the beginning. There is a section table before all the sections, and +/// each entry in the table describes the entry type, start, size and +/// attributes. The format in each section is defined by the section itself. +/// +/// It is easy to add a new section while maintaining the backward +/// compatibility of the profile. Nothing extra needs to be done. 
If we want +/// to extend an existing section, like add cache misses information in +/// addition to the sample count in the profile body, we can add a new section +/// with the extension and retire the existing section, and we could choose +/// to keep the parser of the old section if we want the reader to be able +/// to read both new and old format profile. +/// +/// SampleProfileReaderExtBinary/SampleProfileWriterExtBinary define the +/// commonly used sections of a profile in extensible binary format. It is +/// possible to define other types of profile inherited from +/// SampleProfileReaderExtBinaryBase/SampleProfileWriterExtBinaryBase. +class SampleProfileReaderExtBinaryBase : public SampleProfileReaderBinary { +private: + std::error_code decompressSection(const uint8_t *SecStart, + const uint64_t SecSize, + const uint8_t *&DecompressBuf, + uint64_t &DecompressBufSize); + + BumpPtrAllocator Allocator; + +protected: + std::vector SecHdrTable; + std::unique_ptr ProfSymList; + std::error_code readSecHdrTableEntry(); + std::error_code readSecHdrTable(); + virtual std::error_code readHeader() override; + virtual std::error_code verifySPMagic(uint64_t Magic) override = 0; + virtual std::error_code readOneSection(const uint8_t *Start, uint64_t Size, + SecType Type) = 0; + +public: + SampleProfileReaderExtBinaryBase(std::unique_ptr B, + LLVMContext &C, SampleProfileFormat Format) + : SampleProfileReaderBinary(std::move(B), C, Format) {} + + /// Read sample profiles in extensible format from the associated file. + std::error_code readImpl() override; + + /// Get the total size of all \p Type sections. + uint64_t getSectionSize(SecType Type); + /// Get the total size of header and all sections. + uint64_t getFileSize(); + virtual bool dumpSectionInfo(raw_ostream &OS = dbgs()) override; +}; + +class SampleProfileReaderExtBinary : public SampleProfileReaderExtBinaryBase { private: - /// Function name table. - std::vector NameTable; virtual std::error_code verifySPMagic(uint64_t Magic) override; - virtual std::error_code readNameTable() override; - /// Read a string indirectly via the name table. - virtual ErrorOr readStringFromTable() override; + virtual std::error_code readOneSection(const uint8_t *Start, uint64_t Size, + SecType Type) override; + std::error_code readProfileSymbolList(); + std::error_code readFuncOffsetTable(); + std::error_code readFuncProfiles(); + + /// The table mapping from function name to the offset of its FunctionSample + /// towards file start. + DenseMap FuncOffsetTable; + /// The set containing the functions to use when compiling a module. + DenseSet FuncsToUse; + /// Use all functions from the input profile. + bool UseAllFuncs = true; public: - SampleProfileReaderRawBinary(std::unique_ptr B, LLVMContext &C) - : SampleProfileReaderBinary(std::move(B), C, SPF_Binary) {} + SampleProfileReaderExtBinary(std::unique_ptr B, LLVMContext &C, + SampleProfileFormat Format = SPF_Ext_Binary) + : SampleProfileReaderExtBinaryBase(std::move(B), C, Format) {} /// \brief Return true if \p Buffer is in the format supported by this class. static bool hasFormat(const MemoryBuffer &Buffer); + + virtual std::unique_ptr getProfileSymbolList() override { + return std::move(ProfSymList); + }; + + /// Collect functions with definitions in Module \p M. 
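/// Sketch of the intended call order (assumed caller code, not part of this
/// interface):
/// \code
///   Reader->collectFuncsFrom(M); // restrict loading to functions defined in M
///   Reader->read();              // profiles for other functions can be skipped
/// \endcode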
+ void collectFuncsFrom(const Module &M) override; }; class SampleProfileReaderCompactBinary : public SampleProfileReaderBinary { @@ -462,6 +640,8 @@ private: DenseMap FuncOffsetTable; /// The set containing the functions to use when compiling a module. DenseSet FuncsToUse; + /// Use all functions from the input profile. + bool UseAllFuncs = true; virtual std::error_code verifySPMagic(uint64_t Magic) override; virtual std::error_code readNameTable() override; /// Read a string indirectly via the name table. @@ -478,10 +658,10 @@ public: static bool hasFormat(const MemoryBuffer &Buffer); /// Read samples only for functions to use. - std::error_code read() override; + std::error_code readImpl() override; /// Collect functions to be used when compiling Module \p M. - void collectFuncsToUse(const Module &M) override; + void collectFuncsFrom(const Module &M) override; }; using InlineCallStack = SmallVector; @@ -509,7 +689,7 @@ public: std::error_code readHeader() override; /// Read sample profiles from the associated file. - std::error_code read() override; + std::error_code readImpl() override; /// Return true if \p Buffer is in the format supported by this class. static bool hasFormat(const MemoryBuffer &Buffer); @@ -537,44 +717,6 @@ protected: static const uint32_t GCOVTagAFDOFunction = 0xac000000; }; -/// A profile data reader proxy that remaps the profile data from another -/// sample profile data reader, by applying a provided set of equivalences -/// between components of the symbol names in the profile. -class SampleProfileReaderItaniumRemapper : public SampleProfileReader { -public: - SampleProfileReaderItaniumRemapper( - std::unique_ptr B, LLVMContext &C, - std::unique_ptr Underlying) - : SampleProfileReader(std::move(B), C, Underlying->getFormat()) { - Profiles = std::move(Underlying->getProfiles()); - Summary = takeSummary(*Underlying); - // Keep the underlying reader alive; the profile data may contain - // StringRefs referencing names in its name table. - UnderlyingReader = std::move(Underlying); - } - - /// Create a remapped sample profile from the given remapping file and - /// underlying samples. - static ErrorOr> - create(const Twine &Filename, LLVMContext &C, - std::unique_ptr Underlying); - - /// Read and validate the file header. - std::error_code readHeader() override { return sampleprof_error::success; } - - /// Read remapping file and apply it to the sample profile. - std::error_code read() override; - - /// Return the samples collected for function \p F. - FunctionSamples *getSamplesFor(StringRef FunctionName) override; - using SampleProfileReader::getSamplesFor; - -private: - SymbolRemappingReader Remappings; - DenseMap SampleMap; - std::unique_ptr UnderlyingReader; -}; - } // end namespace sampleprof } // end namespace llvm diff --git a/include/llvm/ProfileData/SampleProfWriter.h b/include/llvm/ProfileData/SampleProfWriter.h index 81e6e3ab0b4a..cc951594c9e2 100644 --- a/include/llvm/ProfileData/SampleProfWriter.h +++ b/include/llvm/ProfileData/SampleProfWriter.h @@ -36,7 +36,7 @@ public: /// Write sample profiles in \p S. /// /// \returns status code of the file update operation. - virtual std::error_code write(const FunctionSamples &S) = 0; + virtual std::error_code writeSample(const FunctionSamples &S) = 0; /// Write all the sample profiles in the given map of samples. 
/// @@ -56,6 +56,8 @@ public: static ErrorOr> create(std::unique_ptr &OS, SampleProfileFormat Format); + virtual void setProfileSymbolList(ProfileSymbolList *PSL) {} + protected: SampleProfileWriter(std::unique_ptr &OS) : OutputStream(std::move(OS)) {} @@ -64,6 +66,10 @@ protected: virtual std::error_code writeHeader(const StringMap &ProfileMap) = 0; + // Write function profiles to the profile file. + virtual std::error_code + writeFuncProfiles(const StringMap &ProfileMap); + /// Output stream where to emit the profile to. std::unique_ptr OutputStream; @@ -72,12 +78,15 @@ protected: /// Compute summary for this profile. void computeSummary(const StringMap &ProfileMap); + + /// Profile format. + SampleProfileFormat Format; }; /// Sample-based profile writer (text format). class SampleProfileWriterText : public SampleProfileWriter { public: - std::error_code write(const FunctionSamples &S) override; + std::error_code writeSample(const FunctionSamples &S) override; protected: SampleProfileWriterText(std::unique_ptr &OS) @@ -102,13 +111,14 @@ private: /// Sample-based profile writer (binary format). class SampleProfileWriterBinary : public SampleProfileWriter { public: - virtual std::error_code write(const FunctionSamples &S) override; SampleProfileWriterBinary(std::unique_ptr &OS) : SampleProfileWriter(OS) {} + virtual std::error_code writeSample(const FunctionSamples &S) override; + protected: - virtual std::error_code writeNameTable() = 0; - virtual std::error_code writeMagicIdent() = 0; + virtual std::error_code writeMagicIdent(SampleProfileFormat Format); + virtual std::error_code writeNameTable(); virtual std::error_code writeHeader(const StringMap &ProfileMap) override; std::error_code writeSummary(); @@ -118,10 +128,10 @@ protected: MapVector NameTable; -private: void addName(StringRef FName); void addNames(const FunctionSamples &S); +private: friend ErrorOr> SampleProfileWriter::create(std::unique_ptr &OS, SampleProfileFormat Format); @@ -129,10 +139,99 @@ private: class SampleProfileWriterRawBinary : public SampleProfileWriterBinary { using SampleProfileWriterBinary::SampleProfileWriterBinary; +}; + +class SampleProfileWriterExtBinaryBase : public SampleProfileWriterBinary { + using SampleProfileWriterBinary::SampleProfileWriterBinary; +public: + virtual std::error_code + write(const StringMap &ProfileMap) override; + + void setToCompressAllSections(); + void setToCompressSection(SecType Type); protected: - virtual std::error_code writeNameTable() override; - virtual std::error_code writeMagicIdent() override; + uint64_t markSectionStart(SecType Type); + std::error_code addNewSection(SecType Sec, uint64_t SectionStart); + virtual void initSectionHdrLayout() = 0; + virtual std::error_code + writeSections(const StringMap &ProfileMap) = 0; + + // Specifiy the order of sections in section header table. Note + // the order of sections in the profile may be different that the + // order in SectionHdrLayout. sample Reader will follow the order + // in SectionHdrLayout to read each section. + SmallVector SectionHdrLayout; + +private: + void allocSecHdrTable(); + std::error_code writeSecHdrTable(); + virtual std::error_code + writeHeader(const StringMap &ProfileMap) override; + void addSectionFlags(SecType Type, SecFlags Flags); + SecHdrTableEntry &getEntryInLayout(SecType Type); + std::error_code compressAndOutput(); + + // We will swap the raw_ostream held by LocalBufStream and that + // held by OutputStream if we try to add a section which needs + // compression. 
After the swap, all the data written to output + // will be temporarily buffered into the underlying raw_string_ostream + // originally held by LocalBufStream. After the data writing for the + // section is completed, compress the data in the local buffer, + // swap the raw_ostream back and write the compressed data to the + // real output. + std::unique_ptr LocalBufStream; + // The location where the output stream starts. + uint64_t FileStart; + // The location in the output stream where the SecHdrTable should be + // written to. + uint64_t SecHdrTableOffset; + // Initial Section Flags setting. + std::vector SecHdrTable; +}; + +class SampleProfileWriterExtBinary : public SampleProfileWriterExtBinaryBase { +public: + SampleProfileWriterExtBinary(std::unique_ptr &OS) + : SampleProfileWriterExtBinaryBase(OS) { + initSectionHdrLayout(); + } + + virtual std::error_code writeSample(const FunctionSamples &S) override; + virtual void setProfileSymbolList(ProfileSymbolList *PSL) override { + ProfSymList = PSL; + }; + +private: + virtual void initSectionHdrLayout() override { + // Note that SecFuncOffsetTable section is written after SecLBRProfile + // in the profile, but is put before SecLBRProfile in SectionHdrLayout. + // + // This is because sample reader follows the order of SectionHdrLayout to + // read each section, to read function profiles on demand sample reader + // need to get the offset of each function profile first. + // + // SecFuncOffsetTable section is written after SecLBRProfile in the + // profile because FuncOffsetTable needs to be populated while section + // SecLBRProfile is written. + SectionHdrLayout = {{SecProfSummary, 0, 0, 0}, + {SecNameTable, 0, 0, 0}, + {SecFuncOffsetTable, 0, 0, 0}, + {SecLBRProfile, 0, 0, 0}, + {SecProfileSymbolList, 0, 0, 0}}; + }; + virtual std::error_code + writeSections(const StringMap &ProfileMap) override; + ProfileSymbolList *ProfSymList = nullptr; + + // Save the start of SecLBRProfile so we can compute the offset to the + // start of SecLBRProfile for each Function's Profile and will keep it + // in FuncOffsetTable. + uint64_t SecLBRProfileStart; + // FuncOffsetTable maps function name to its profile offset in SecLBRProfile + // section. It is used to load function profile on demand. + MapVector FuncOffsetTable; + std::error_code writeFuncOffsetTable(); }; // CompactBinary is a compact format of binary profile which both reduces @@ -169,7 +268,7 @@ class SampleProfileWriterCompactBinary : public SampleProfileWriterBinary { using SampleProfileWriterBinary::SampleProfileWriterBinary; public: - virtual std::error_code write(const FunctionSamples &S) override; + virtual std::error_code writeSample(const FunctionSamples &S) override; virtual std::error_code write(const StringMap &ProfileMap) override; @@ -181,7 +280,6 @@ protected: /// towards profile start. uint64_t TableOffset; virtual std::error_code writeNameTable() override; - virtual std::error_code writeMagicIdent() override; virtual std::error_code writeHeader(const StringMap &ProfileMap) override; std::error_code writeFuncOffsetTable(); diff --git a/include/llvm/Remarks/BitstreamRemarkContainer.h b/include/llvm/Remarks/BitstreamRemarkContainer.h new file mode 100644 index 000000000000..a2282fca04ab --- /dev/null +++ b/include/llvm/Remarks/BitstreamRemarkContainer.h @@ -0,0 +1,106 @@ +//===-- BitstreamRemarkContainer.h - Container for remarks --------------*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file provides declarations for things used in the various types of +// remark containers. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_REMARKS_REMARK_CONTAINER_H +#define LLVM_REMARKS_REMARK_CONTAINER_H + +#include "llvm/ADT/StringRef.h" +#include "llvm/Bitstream/BitCodes.h" +#include + +namespace llvm { +namespace remarks { + +/// The current version of the remark container. +/// Note: this is different from the version of the remark entry. +constexpr uint64_t CurrentContainerVersion = 0; +/// The magic number used for identifying remark blocks. +constexpr StringLiteral ContainerMagic("RMRK"); + +/// Type of the remark container. +/// The remark container has two modes: +/// * separate: the metadata is separate from the remarks and points to the +/// auxiliary file that contains the remarks. +/// * standalone: the metadata and the remarks are emitted together. +enum class BitstreamRemarkContainerType { + /// The metadata emitted separately. + /// This will contain the following: + /// * Container version and type + /// * String table + /// * External file + SeparateRemarksMeta, + /// The remarks emitted separately. + /// This will contain the following: + /// * Container version and type + /// * Remark version + SeparateRemarksFile, + /// Everything is emitted together. + /// This will contain the following: + /// * Container version and type + /// * Remark version + /// * String table + Standalone, + First = SeparateRemarksMeta, + Last = Standalone, +}; + +/// The possible blocks that will be encountered in a bitstream remark +/// container. +enum BlockIDs { + /// The metadata block is mandatory. It should always come after the + /// BLOCKINFO_BLOCK, and contains metadata that should be used when parsing + /// REMARK_BLOCKs. + /// There should always be only one META_BLOCK. + META_BLOCK_ID = bitc::FIRST_APPLICATION_BLOCKID, + /// One remark entry is represented using a REMARK_BLOCK. There can be + /// multiple REMARK_BLOCKs in the same file. + REMARK_BLOCK_ID +}; + +constexpr StringRef MetaBlockName = StringRef("Meta", 4); +constexpr StringRef RemarkBlockName = StringRef("Remark", 6); + +/// The possible records that can be encountered in the previously described +/// blocks. +enum RecordIDs { + // Meta block records. + RECORD_META_CONTAINER_INFO = 1, + RECORD_META_REMARK_VERSION, + RECORD_META_STRTAB, + RECORD_META_EXTERNAL_FILE, + // Remark block records. + RECORD_REMARK_HEADER, + RECORD_REMARK_DEBUG_LOC, + RECORD_REMARK_HOTNESS, + RECORD_REMARK_ARG_WITH_DEBUGLOC, + RECORD_REMARK_ARG_WITHOUT_DEBUGLOC, + // Helpers. 
+ RECORD_FIRST = RECORD_META_CONTAINER_INFO, + RECORD_LAST = RECORD_REMARK_ARG_WITHOUT_DEBUGLOC +}; + +constexpr StringRef MetaContainerInfoName = StringRef("Container info", 14); +constexpr StringRef MetaRemarkVersionName = StringRef("Remark version", 14); +constexpr StringRef MetaStrTabName = StringRef("String table", 12); +constexpr StringRef MetaExternalFileName = StringRef("External File", 13); +constexpr StringRef RemarkHeaderName = StringRef("Remark header", 13); +constexpr StringRef RemarkDebugLocName = StringRef("Remark debug location", 21); +constexpr StringRef RemarkHotnessName = StringRef("Remark hotness", 14); +constexpr StringRef RemarkArgWithDebugLocName = + StringRef("Argument with debug location", 28); +constexpr StringRef RemarkArgWithoutDebugLocName = StringRef("Argument", 8); + +} // end namespace remarks +} // end namespace llvm + +#endif /* LLVM_REMARKS_REMARK_CONTAINER_H */ diff --git a/include/llvm/Remarks/BitstreamRemarkParser.h b/include/llvm/Remarks/BitstreamRemarkParser.h new file mode 100644 index 000000000000..7ebd731693b2 --- /dev/null +++ b/include/llvm/Remarks/BitstreamRemarkParser.h @@ -0,0 +1,116 @@ +//===-- BitstreamRemarkParser.h - Bitstream parser --------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file provides an implementation of the remark parser using the LLVM +// Bitstream format. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_REMARKS_BITSTREAM_REMARK_PARSER_H +#define LLVM_REMARKS_BITSTREAM_REMARK_PARSER_H + +#include "llvm/ADT/StringRef.h" +#include "llvm/Bitstream/BitstreamReader.h" +#include "llvm/Remarks/BitstreamRemarkContainer.h" +#include "llvm/Remarks/Remark.h" +#include "llvm/Remarks/RemarkParser.h" +#include "llvm/Support/Error.h" +#include + +namespace llvm { +namespace remarks { + +/// Helper to parse a META_BLOCK for a bitstream remark container. +struct BitstreamMetaParserHelper { + /// The Bitstream reader. + BitstreamCursor &Stream; + /// Reference to the storage for the block info. + BitstreamBlockInfo &BlockInfo; + /// The parsed content: depending on the container type, some fields might be + /// empty. + Optional ContainerVersion; + Optional ContainerType; + Optional StrTabBuf; + Optional ExternalFilePath; + Optional RemarkVersion; + + /// Continue parsing with \p Stream. \p Stream is expected to contain a + /// ENTER_SUBBLOCK to the META_BLOCK at the current position. + /// \p Stream is expected to have a BLOCKINFO_BLOCK set. + BitstreamMetaParserHelper(BitstreamCursor &Stream, + BitstreamBlockInfo &BlockInfo); + + /// Parse the META_BLOCK and fill the available entries. + /// This helper does not check for the validity of the fields. + Error parse(); +}; + +/// Helper to parse a REMARK_BLOCK for a bitstream remark container. +struct BitstreamRemarkParserHelper { + /// The Bitstream reader. + BitstreamCursor &Stream; + /// The parsed content: depending on the remark, some fields might be empty. 
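/// For instance, a caller might consume these after a successful parse()
/// (a sketch; `Stream` is an assumed BitstreamCursor positioned at a
/// REMARK_BLOCK and `StrTab` an assumed ParsedStringTable from the meta block):
/// \code
///   BitstreamRemarkParserHelper Helper(Stream);
///   if (Error E = Helper.parse())
///     return std::move(E);
///   if (Helper.RemarkNameIdx)
///     Expected<StringRef> Name = StrTab[*Helper.RemarkNameIdx];
/// \endcode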
+ Optional Type; + Optional RemarkNameIdx; + Optional PassNameIdx; + Optional FunctionNameIdx; + Optional SourceFileNameIdx; + Optional SourceLine; + Optional SourceColumn; + Optional Hotness; + struct Argument { + Optional KeyIdx; + Optional ValueIdx; + Optional SourceFileNameIdx; + Optional SourceLine; + Optional SourceColumn; + }; + Optional> Args; + /// Avoid re-allocating a vector every time. + SmallVector TmpArgs; + + /// Continue parsing with \p Stream. \p Stream is expected to contain a + /// ENTER_SUBBLOCK to the REMARK_BLOCK at the current position. + /// \p Stream is expected to have a BLOCKINFO_BLOCK set and to have already + /// parsed the META_BLOCK. + BitstreamRemarkParserHelper(BitstreamCursor &Stream); + + /// Parse the REMARK_BLOCK and fill the available entries. + /// This helper does not check for the validity of the fields. + Error parse(); +}; + +/// Helper to parse any bitstream remark container. +struct BitstreamParserHelper { + /// The Bitstream reader. + BitstreamCursor Stream; + /// The block info block. + BitstreamBlockInfo BlockInfo; + /// Start parsing at \p Buffer. + BitstreamParserHelper(StringRef Buffer); + /// Parse the magic number. + Expected> parseMagic(); + /// Parse the block info block containing all the abbrevs. + /// This needs to be called before calling any other parsing function. + Error parseBlockInfoBlock(); + /// Return true if the next block is a META_BLOCK. This function does not move + /// the cursor. + Expected isMetaBlock(); + /// Return true if the next block is a REMARK_BLOCK. This function does not + /// move the cursor. + Expected isRemarkBlock(); + /// Return true if the parser reached the end of the stream. + bool atEndOfStream() { return Stream.AtEndOfStream(); } + /// Jump to the end of the stream, skipping everything. + void skipToEnd() { return Stream.skipToEnd(); } +}; + +} // end namespace remarks +} // end namespace llvm + +#endif /* LLVM_REMARKS_BITSTREAM_REMARK_PARSER_H */ diff --git a/include/llvm/Remarks/BitstreamRemarkSerializer.h b/include/llvm/Remarks/BitstreamRemarkSerializer.h new file mode 100644 index 000000000000..62a175a1db0b --- /dev/null +++ b/include/llvm/Remarks/BitstreamRemarkSerializer.h @@ -0,0 +1,196 @@ +//===-- BitstreamRemarkSerializer.h - Bitstream serializer ------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file provides an implementation of the serializer using the LLVM +// Bitstream format. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_REMARKS_BITSTREAM_REMARK_SERIALIZER_H +#define LLVM_REMARKS_BITSTREAM_REMARK_SERIALIZER_H + +#include "llvm/Bitstream/BitstreamWriter.h" +#include "llvm/Remarks/BitstreamRemarkContainer.h" +#include "llvm/Remarks/RemarkSerializer.h" +#include "llvm/Support/raw_ostream.h" + +namespace llvm { +namespace remarks { + +/// Serialize the remarks to LLVM bitstream. +/// This class provides ways to emit remarks in the LLVM bitstream format and +/// its associated metadata. +/// +/// * The separate model: +/// Separate meta: | Container info +/// | String table +/// | External file +/// +/// Separate remarks: | Container info +/// | Remark version +/// | Remark0 +/// | Remark1 +/// | Remark2 +/// | ... 
+/// +/// * The standalone model: | Container info +/// | String table +/// | Remark version +/// | Remark0 +/// | Remark1 +/// | Remark2 +/// | ... +/// +struct BitstreamRemarkSerializerHelper { + /// Buffer used for encoding the bitstream before writing it to the final + /// stream. + SmallVector Encoded; + /// Buffer used to construct records and pass to the bitstream writer. + SmallVector R; + /// The Bitstream writer. + BitstreamWriter Bitstream; + /// The type of the container we are serializing. + BitstreamRemarkContainerType ContainerType; + + /// Abbrev IDs initialized in the block info block. + /// Note: depending on the container type, some IDs might be uninitialized. + /// Warning: When adding more abbrev IDs, make sure to update the + /// BlockCodeSize (in the call to EnterSubblock). + uint64_t RecordMetaContainerInfoAbbrevID = 0; + uint64_t RecordMetaRemarkVersionAbbrevID = 0; + uint64_t RecordMetaStrTabAbbrevID = 0; + uint64_t RecordMetaExternalFileAbbrevID = 0; + uint64_t RecordRemarkHeaderAbbrevID = 0; + uint64_t RecordRemarkDebugLocAbbrevID = 0; + uint64_t RecordRemarkHotnessAbbrevID = 0; + uint64_t RecordRemarkArgWithDebugLocAbbrevID = 0; + uint64_t RecordRemarkArgWithoutDebugLocAbbrevID = 0; + + BitstreamRemarkSerializerHelper(BitstreamRemarkContainerType ContainerType); + + // Disable copy and move: Bitstream points to Encoded, which needs special + // handling during copy/move, but moving the vectors is probably useless + // anyway. + BitstreamRemarkSerializerHelper(const BitstreamRemarkSerializerHelper &) = + delete; + BitstreamRemarkSerializerHelper & + operator=(const BitstreamRemarkSerializerHelper &) = delete; + BitstreamRemarkSerializerHelper(BitstreamRemarkSerializerHelper &&) = delete; + BitstreamRemarkSerializerHelper & + operator=(BitstreamRemarkSerializerHelper &&) = delete; + + /// Set up the necessary block info entries according to the container type. + void setupBlockInfo(); + + /// Set up the block info for the metadata block. + void setupMetaBlockInfo(); + /// The remark version in the metadata block. + void setupMetaRemarkVersion(); + void emitMetaRemarkVersion(uint64_t RemarkVersion); + /// The strtab in the metadata block. + void setupMetaStrTab(); + void emitMetaStrTab(const StringTable &StrTab); + /// The external file in the metadata block. + void setupMetaExternalFile(); + void emitMetaExternalFile(StringRef Filename); + + /// The block info for the remarks block. + void setupRemarkBlockInfo(); + + /// Emit the metadata for the remarks. + void emitMetaBlock(uint64_t ContainerVersion, + Optional RemarkVersion, + Optional StrTab = None, + Optional Filename = None); + + /// Emit a remark block. The string table is required. + void emitRemarkBlock(const Remark &Remark, StringTable &StrTab); + /// Finalize the writing to \p OS. + void flushToStream(raw_ostream &OS); + /// Finalize the writing to a buffer. + /// The contents of the buffer remain valid for the lifetime of the object. + /// Any call to any other function in this class will invalidate the buffer. + StringRef getBuffer(); +}; + +/// Implementation of the remark serializer using LLVM bitstream. +struct BitstreamRemarkSerializer : public RemarkSerializer { + /// The file should contain: + /// 1) The block info block that describes how to read the blocks. + /// 2) The metadata block that contains various information about the remarks + /// in the file. + /// 3) A number of remark blocks. + + /// We need to set up 1) and 2) first, so that we can emit 3) after. 
This flag + /// is used to emit the first two blocks only once. + bool DidSetUp = false; + /// The helper to emit bitstream. + BitstreamRemarkSerializerHelper Helper; + + /// Construct a serializer that will create its own string table. + BitstreamRemarkSerializer(raw_ostream &OS, SerializerMode Mode); + /// Construct a serializer with a pre-filled string table. + BitstreamRemarkSerializer(raw_ostream &OS, SerializerMode Mode, + StringTable StrTab); + + /// Emit a remark to the stream. This also emits the metadata associated to + /// the remarks based on the SerializerMode specified at construction. + /// This writes the serialized output to the provided stream. + void emit(const Remark &Remark) override; + /// The metadata serializer associated to this remark serializer. Based on the + /// container type of the current serializer, the container type of the + /// metadata serializer will change. + std::unique_ptr + metaSerializer(raw_ostream &OS, + Optional ExternalFilename = None) override; + + static bool classof(const RemarkSerializer *S) { + return S->SerializerFormat == Format::Bitstream; + } +}; + +/// Serializer of metadata for bitstream remarks. +struct BitstreamMetaSerializer : public MetaSerializer { + /// This class can be used with [1] a pre-constructed + /// BitstreamRemarkSerializerHelper, or with [2] one that is owned by the meta + /// serializer. In case of [1], we need to be able to store a reference to the + /// object, while in case of [2] we need to store the whole object. + Optional TmpHelper; + /// The actual helper, that can point to \p TmpHelper or to an external helper + /// object. + BitstreamRemarkSerializerHelper *Helper = nullptr; + + Optional StrTab; + Optional ExternalFilename; + + /// Create a new meta serializer based on \p ContainerType. + BitstreamMetaSerializer(raw_ostream &OS, + BitstreamRemarkContainerType ContainerType, + Optional StrTab = None, + Optional ExternalFilename = None) + : MetaSerializer(OS), TmpHelper(None), Helper(nullptr), StrTab(StrTab), + ExternalFilename(ExternalFilename) { + TmpHelper.emplace(ContainerType); + Helper = &*TmpHelper; + } + + /// Create a new meta serializer based on a previously built \p Helper. + BitstreamMetaSerializer(raw_ostream &OS, + BitstreamRemarkSerializerHelper &Helper, + Optional StrTab = None, + Optional ExternalFilename = None) + : MetaSerializer(OS), TmpHelper(None), Helper(&Helper), StrTab(StrTab), + ExternalFilename(ExternalFilename) {} + + void emit() override; +}; + +} // end namespace remarks +} // end namespace llvm + +#endif /* LLVM_REMARKS_BITSTREAM_REMARK_SERIALIZER_H */ diff --git a/include/llvm/Remarks/Remark.h b/include/llvm/Remarks/Remark.h index 05d0ea60accd..1243311fb8c5 100644 --- a/include/llvm/Remarks/Remark.h +++ b/include/llvm/Remarks/Remark.h @@ -23,7 +23,8 @@ namespace llvm { namespace remarks { -constexpr uint64_t Version = 0; +/// The current version of the remark entry. +constexpr uint64_t CurrentRemarkVersion = 0; /// The debug location used to track a remark back to the source file. struct RemarkLocation { @@ -58,7 +59,8 @@ enum class Type { AnalysisFPCommute, AnalysisAliasing, Failure, - LastTypeValue = Failure + First = Unknown, + Last = Failure }; /// A remark type used for both emission and parsing. @@ -107,6 +109,36 @@ private: // Create wrappers for C Binding types (see CBindingWrapping.h). DEFINE_SIMPLE_CONVERSION_FUNCTIONS(Remark, LLVMRemarkEntryRef) +/// Comparison operators for Remark objects and dependent objects. 
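/// These are field-wise comparisons, e.g. (illustrative):
/// \code
///   RemarkLocation A{"file.c", 3, 12};
///   RemarkLocation B{"file.c", 3, 12};
///   assert(A == B && !(A != B));
/// \endcode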
+inline bool operator==(const RemarkLocation &LHS, const RemarkLocation &RHS) { + return LHS.SourceFilePath == RHS.SourceFilePath && + LHS.SourceLine == RHS.SourceLine && + LHS.SourceColumn == RHS.SourceColumn; +} + +inline bool operator!=(const RemarkLocation &LHS, const RemarkLocation &RHS) { + return !(LHS == RHS); +} + +inline bool operator==(const Argument &LHS, const Argument &RHS) { + return LHS.Key == RHS.Key && LHS.Val == RHS.Val && LHS.Loc == RHS.Loc; +} + +inline bool operator!=(const Argument &LHS, const Argument &RHS) { + return !(LHS == RHS); +} + +inline bool operator==(const Remark &LHS, const Remark &RHS) { + return LHS.RemarkType == RHS.RemarkType && LHS.PassName == RHS.PassName && + LHS.RemarkName == RHS.RemarkName && + LHS.FunctionName == RHS.FunctionName && LHS.Loc == RHS.Loc && + LHS.Hotness == RHS.Hotness && LHS.Args == RHS.Args; +} + +inline bool operator!=(const Remark &LHS, const Remark &RHS) { + return !(LHS == RHS); +} + } // end namespace remarks } // end namespace llvm diff --git a/include/llvm/Remarks/RemarkFormat.h b/include/llvm/Remarks/RemarkFormat.h index e167d99d2517..6dd32b226099 100644 --- a/include/llvm/Remarks/RemarkFormat.h +++ b/include/llvm/Remarks/RemarkFormat.h @@ -19,10 +19,10 @@ namespace llvm { namespace remarks { -constexpr StringRef Magic("REMARKS", 7); +constexpr StringLiteral Magic("REMARKS"); /// The format used for serializing/deserializing remarks. -enum class Format { Unknown, YAML }; +enum class Format { Unknown, YAML, YAMLStrTab, Bitstream }; /// Parse and validate a string for the remark format. Expected parseFormat(StringRef FormatStr); diff --git a/include/llvm/Remarks/RemarkParser.h b/include/llvm/Remarks/RemarkParser.h index 671e1abe5ec7..d6b1fddb06ff 100644 --- a/include/llvm/Remarks/RemarkParser.h +++ b/include/llvm/Remarks/RemarkParser.h @@ -23,9 +23,6 @@ namespace llvm { namespace remarks { -struct ParserImpl; -struct ParsedStringTable; - class EndOfFileError : public ErrorInfo { public: static char ID; @@ -39,11 +36,13 @@ public: }; /// Parser used to parse a raw buffer to remarks::Remark objects. -struct Parser { +struct RemarkParser { /// The format of the parser. Format ParserFormat; + /// Path to prepend when opening an external remark file. + std::string ExternalFilePrependPath; - Parser(Format ParserFormat) : ParserFormat(ParserFormat) {} + RemarkParser(Format ParserFormat) : ParserFormat(ParserFormat) {} /// If no error occurs, this returns a valid Remark object. /// If an error of type EndOfFileError occurs, it is safe to recover from it @@ -52,7 +51,7 @@ struct Parser { /// The pointer should never be null. virtual Expected> next() = 0; - virtual ~Parser() = default; + virtual ~RemarkParser() = default; }; /// In-memory representation of the string table parsed from a buffer (e.g. the @@ -60,16 +59,33 @@ struct Parser { struct ParsedStringTable { /// The buffer mapped from the section contents. StringRef Buffer; - /// Collection of offsets in the buffer for each string entry. - SmallVector Offsets; + /// This object has high changes to be std::move'd around, so don't use a + /// SmallVector for once. + std::vector Offsets; - Expected operator[](size_t Index) const; ParsedStringTable(StringRef Buffer); + /// Disable copy. + ParsedStringTable(const ParsedStringTable &) = delete; + ParsedStringTable &operator=(const ParsedStringTable &) = delete; + /// Should be movable. 
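/// Typical iteration over a parsed table, as a sketch (`Buf` is an assumed
/// StringRef over the serialized table and `consume` is a placeholder):
/// \code
///   ParsedStringTable StrTab(Buf);
///   for (size_t I = 0, E = StrTab.size(); I != E; ++I) {
///     Expected<StringRef> Str = StrTab[I];
///     if (!Str)
///       return Str.takeError();
///     consume(*Str);
///   }
/// \endcode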
+ ParsedStringTable(ParsedStringTable &&) = default; + ParsedStringTable &operator=(ParsedStringTable &&) = default; + + size_t size() const { return Offsets.size(); } + Expected operator[](size_t Index) const; }; -Expected> +Expected> createRemarkParser(Format ParserFormat, + StringRef Buf); + +Expected> createRemarkParser(Format ParserFormat, StringRef Buf, - Optional StrTab = None); + ParsedStringTable StrTab); + +Expected> +createRemarkParserFromMeta(Format ParserFormat, StringRef Buf, + Optional StrTab = None, + Optional ExternalFilePrependPath = None); } // end namespace remarks } // end namespace llvm diff --git a/include/llvm/Remarks/RemarkSerializer.h b/include/llvm/Remarks/RemarkSerializer.h index def5c2e16620..35752cd5f6fb 100644 --- a/include/llvm/Remarks/RemarkSerializer.h +++ b/include/llvm/Remarks/RemarkSerializer.h @@ -14,54 +14,74 @@ #define LLVM_REMARKS_REMARK_SERIALIZER_H #include "llvm/Remarks/Remark.h" +#include "llvm/Remarks/RemarkFormat.h" #include "llvm/Remarks/RemarkStringTable.h" -#include "llvm/Support/YAMLTraits.h" #include "llvm/Support/raw_ostream.h" namespace llvm { namespace remarks { +enum class SerializerMode { + Separate, // A mode where the metadata is serialized separately from the + // remarks. Typically, this is used when the remarks need to be + // streamed to a side file and the metadata is embedded into the + // final result of the compilation. + Standalone // A mode where everything can be retrieved in the same + // file/buffer. Typically, this is used for storing remarks for + // later use. +}; + +struct MetaSerializer; + /// This is the base class for a remark serializer. /// It includes support for using a string table while emitting. -struct Serializer { +struct RemarkSerializer { + /// The format of the serializer. + Format SerializerFormat; /// The open raw_ostream that the remark diagnostics are emitted to. raw_ostream &OS; + /// The serialization mode. + SerializerMode Mode; /// The string table containing all the unique strings used in the output. /// The table can be serialized to be consumed after the compilation. Optional StrTab; - Serializer(raw_ostream &OS) : OS(OS), StrTab() {} + RemarkSerializer(Format SerializerFormat, raw_ostream &OS, + SerializerMode Mode) + : SerializerFormat(SerializerFormat), OS(OS), Mode(Mode), StrTab() {} /// This is just an interface. - virtual ~Serializer() = default; + virtual ~RemarkSerializer() = default; + /// Emit a remark to the stream. virtual void emit(const Remark &Remark) = 0; + /// Return the corresponding metadata serializer. + virtual std::unique_ptr + metaSerializer(raw_ostream &OS, + Optional ExternalFilename = None) = 0; }; -/// Wether the serializer should use a string table while emitting. -enum class UseStringTable { No, Yes }; - -/// Serialize the remarks to YAML. One remark entry looks like this: -/// --- ! -/// Pass: -/// Name: -/// DebugLoc: { File: , Line: , -/// Column: } -/// Function: -/// Args: -/// - : -/// DebugLoc: { File: , Line: , Column: } -/// ... -struct YAMLSerializer : public Serializer { - /// The YAML streamer. - yaml::Output YAMLOutput; +/// This is the base class for a remark metadata serializer. +struct MetaSerializer { + /// The open raw_ostream that the metadata is emitted to. + raw_ostream &OS; - YAMLSerializer(raw_ostream &OS, - UseStringTable UseStringTable = remarks::UseStringTable::No); + MetaSerializer(raw_ostream &OS) : OS(OS) {} - /// Emit a remark to the stream. - void emit(const Remark &Remark) override; + /// This is just an interface. 
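/// In the separate model, a driver would typically emit all remarks first and
/// then the metadata, along these lines (a sketch; the stream names, the
/// external file name, and the `Remarks` range are assumptions):
/// \code
///   auto SerializerOrErr = createRemarkSerializer(
///       Format::YAMLStrTab, SerializerMode::Separate, RemarksOS);
///   if (!SerializerOrErr)
///     return SerializerOrErr.takeError();
///   std::unique_ptr<RemarkSerializer> S = std::move(*SerializerOrErr);
///   for (const Remark &R : Remarks)
///     S->emit(R);
///   S->metaSerializer(MetaOS, StringRef("remarks.yaml"))->emit();
/// \endcode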
+ virtual ~MetaSerializer() = default; + virtual void emit() = 0; }; +/// Create a remark serializer. +Expected> +createRemarkSerializer(Format RemarksFormat, SerializerMode Mode, + raw_ostream &OS); + +/// Create a remark serializer that uses a pre-filled string table. +Expected> +createRemarkSerializer(Format RemarksFormat, SerializerMode Mode, + raw_ostream &OS, remarks::StringTable StrTab); + } // end namespace remarks } // end namespace llvm diff --git a/include/llvm/Remarks/RemarkStringTable.h b/include/llvm/Remarks/RemarkStringTable.h index f9b4fdbbfb8d..4ce27ee884c8 100644 --- a/include/llvm/Remarks/RemarkStringTable.h +++ b/include/llvm/Remarks/RemarkStringTable.h @@ -18,7 +18,7 @@ #include "llvm/ADT/StringMap.h" #include "llvm/ADT/StringRef.h" -#include "llvm/Support/Allocator.h" +#include "llvm/Remarks/Remark.h" #include namespace llvm { @@ -27,21 +27,35 @@ class raw_ostream; namespace remarks { +struct ParsedStringTable; + /// The string table used for serializing remarks. /// This table can be for example serialized in a section to be consumed after /// the compilation. struct StringTable { - /// Allocator holding all the memory used by the map. - BumpPtrAllocator Allocator; /// The string table containing all the unique strings used in the output. /// It maps a string to an unique ID. - StringMap StrTab; + StringMap StrTab; /// Total size of the string table when serialized. size_t SerializedSize = 0; - StringTable() : Allocator(), StrTab(Allocator) {} + StringTable() = default; + + /// Disable copy. + StringTable(const StringTable &) = delete; + StringTable &operator=(const StringTable &) = delete; + /// Should be movable. + StringTable(StringTable &&) = default; + StringTable &operator=(StringTable &&) = default; + + /// Construct a string table from a ParsedStringTable. + StringTable(const ParsedStringTable &Other); + /// Add a string to the table. It returns an unique ID of the string. std::pair add(StringRef Str); + /// Modify \p R to use strings from this string table. If the string table + /// does not contain the strings, it adds them. + void internalize(Remark &R); /// Serialize the string table to a stream. It is serialized as a little /// endian uint64 (the size of the table in bytes) followed by a sequence of /// NULL-terminated strings, where the N-th string is the string with the ID N diff --git a/include/llvm/Remarks/YAMLRemarkSerializer.h b/include/llvm/Remarks/YAMLRemarkSerializer.h new file mode 100644 index 000000000000..f1213beab15d --- /dev/null +++ b/include/llvm/Remarks/YAMLRemarkSerializer.h @@ -0,0 +1,108 @@ +//===-- YAMLRemarkSerializer.h - YAML Remark serialization ---*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file provides an interface for serializing remarks to YAML. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_REMARKS_YAML_REMARK_SERIALIZER_H +#define LLVM_REMARKS_YAML_REMARK_SERIALIZER_H + +#include "llvm/Remarks/RemarkSerializer.h" +#include "llvm/Support/YAMLTraits.h" + +namespace llvm { +namespace remarks { + +/// Serialize the remarks to YAML. One remark entry looks like this: +/// --- ! 
+/// Pass: +/// Name: +/// DebugLoc: { File: , Line: , +/// Column: } +/// Function: +/// Args: +/// - : +/// DebugLoc: { File: , Line: , Column: } +/// ... +struct YAMLRemarkSerializer : public RemarkSerializer { + /// The YAML streamer. + yaml::Output YAMLOutput; + + YAMLRemarkSerializer(raw_ostream &OS, SerializerMode Mode, + Optional StrTab = None); + + void emit(const Remark &Remark) override; + std::unique_ptr + metaSerializer(raw_ostream &OS, + Optional ExternalFilename = None) override; + + static bool classof(const RemarkSerializer *S) { + return S->SerializerFormat == Format::YAML; + } + +protected: + YAMLRemarkSerializer(Format SerializerFormat, raw_ostream &OS, + SerializerMode Mode, + Optional StrTab = None); +}; + +struct YAMLMetaSerializer : public MetaSerializer { + Optional ExternalFilename; + + YAMLMetaSerializer(raw_ostream &OS, Optional ExternalFilename) + : MetaSerializer(OS), ExternalFilename(ExternalFilename) {} + + void emit() override; +}; + +/// Serialize the remarks to YAML using a string table. An remark entry looks +/// like the regular YAML remark but instead of string entries it's using +/// numbers that map to an index in the string table. +struct YAMLStrTabRemarkSerializer : public YAMLRemarkSerializer { + /// Wether we already emitted the metadata in standalone mode. + /// This should be set to true after the first invocation of `emit`. + bool DidEmitMeta = false; + + YAMLStrTabRemarkSerializer(raw_ostream &OS, SerializerMode Mode) + : YAMLRemarkSerializer(Format::YAMLStrTab, OS, Mode) { + // We always need a string table for this type of serializer. + StrTab.emplace(); + } + YAMLStrTabRemarkSerializer(raw_ostream &OS, SerializerMode Mode, + StringTable StrTab) + : YAMLRemarkSerializer(Format::YAMLStrTab, OS, Mode, std::move(StrTab)) {} + + /// Override to emit the metadata if necessary. + void emit(const Remark &Remark) override; + + std::unique_ptr + metaSerializer(raw_ostream &OS, + Optional ExternalFilename = None) override; + + static bool classof(const RemarkSerializer *S) { + return S->SerializerFormat == Format::YAMLStrTab; + } +}; + +struct YAMLStrTabMetaSerializer : public YAMLMetaSerializer { + /// The string table is part of the metadata. 
+ const StringTable &StrTab; + + YAMLStrTabMetaSerializer(raw_ostream &OS, + Optional ExternalFilename, + const StringTable &StrTab) + : YAMLMetaSerializer(OS, ExternalFilename), StrTab(StrTab) {} + + void emit() override; +}; + +} // end namespace remarks +} // end namespace llvm + +#endif /* LLVM_REMARKS_REMARK_SERIALIZER_H */ diff --git a/include/llvm/Support/AArch64TargetParser.def b/include/llvm/Support/AArch64TargetParser.def index e152f383b3ec..15737265dfc3 100644 --- a/include/llvm/Support/AArch64TargetParser.def +++ b/include/llvm/Support/AArch64TargetParser.def @@ -50,35 +50,36 @@ AARCH64_ARCH("armv8.5-a", ARMV8_5A, "8.5-A", "v8.5a", #define AARCH64_ARCH_EXT_NAME(NAME, ID, FEATURE, NEGFEATURE) #endif // FIXME: This would be nicer were it tablegen -AARCH64_ARCH_EXT_NAME("invalid", AArch64::AEK_INVALID, nullptr, nullptr) -AARCH64_ARCH_EXT_NAME("none", AArch64::AEK_NONE, nullptr, nullptr) -AARCH64_ARCH_EXT_NAME("crc", AArch64::AEK_CRC, "+crc", "-crc") -AARCH64_ARCH_EXT_NAME("lse", AArch64::AEK_LSE, "+lse", "-lse") -AARCH64_ARCH_EXT_NAME("rdm", AArch64::AEK_RDM, "+rdm", "-rdm") -AARCH64_ARCH_EXT_NAME("crypto", AArch64::AEK_CRYPTO, "+crypto","-crypto") -AARCH64_ARCH_EXT_NAME("sm4", AArch64::AEK_SM4, "+sm4", "-sm4") -AARCH64_ARCH_EXT_NAME("sha3", AArch64::AEK_SHA3, "+sha3", "-sha3") -AARCH64_ARCH_EXT_NAME("sha2", AArch64::AEK_SHA2, "+sha2", "-sha2") -AARCH64_ARCH_EXT_NAME("aes", AArch64::AEK_AES, "+aes", "-aes") -AARCH64_ARCH_EXT_NAME("dotprod", AArch64::AEK_DOTPROD, "+dotprod","-dotprod") -AARCH64_ARCH_EXT_NAME("fp", AArch64::AEK_FP, "+fp-armv8", "-fp-armv8") -AARCH64_ARCH_EXT_NAME("simd", AArch64::AEK_SIMD, "+neon", "-neon") -AARCH64_ARCH_EXT_NAME("fp16", AArch64::AEK_FP16, "+fullfp16", "-fullfp16") -AARCH64_ARCH_EXT_NAME("fp16fml", AArch64::AEK_FP16FML, "+fp16fml", "-fp16fml") -AARCH64_ARCH_EXT_NAME("profile", AArch64::AEK_PROFILE, "+spe", "-spe") -AARCH64_ARCH_EXT_NAME("ras", AArch64::AEK_RAS, "+ras", "-ras") -AARCH64_ARCH_EXT_NAME("sve", AArch64::AEK_SVE, "+sve", "-sve") -AARCH64_ARCH_EXT_NAME("sve2", AArch64::AEK_SVE2, "+sve2", "-sve2") -AARCH64_ARCH_EXT_NAME("sve2-aes", AArch64::AEK_SVE2AES, "+sve2-aes", "-sve2-aes") -AARCH64_ARCH_EXT_NAME("sve2-sm4", AArch64::AEK_SVE2SM4, "+sve2-sm4", "-sve2-sm4") -AARCH64_ARCH_EXT_NAME("sve2-sha3", AArch64::AEK_SVE2SHA3, "+sve2-sha3", "-sve2-sha3") -AARCH64_ARCH_EXT_NAME("bitperm", AArch64::AEK_BITPERM, "+bitperm", "-bitperm") -AARCH64_ARCH_EXT_NAME("rcpc", AArch64::AEK_RCPC, "+rcpc", "-rcpc") -AARCH64_ARCH_EXT_NAME("rng", AArch64::AEK_RAND, "+rand", "-rand") -AARCH64_ARCH_EXT_NAME("memtag", AArch64::AEK_MTE, "+mte", "-mte") -AARCH64_ARCH_EXT_NAME("ssbs", AArch64::AEK_SSBS, "+ssbs", "-ssbs") -AARCH64_ARCH_EXT_NAME("sb", AArch64::AEK_SB, "+sb", "-sb") -AARCH64_ARCH_EXT_NAME("predres", AArch64::AEK_PREDRES, "+predres", "-predres") +AARCH64_ARCH_EXT_NAME("invalid", AArch64::AEK_INVALID, nullptr, nullptr) +AARCH64_ARCH_EXT_NAME("none", AArch64::AEK_NONE, nullptr, nullptr) +AARCH64_ARCH_EXT_NAME("crc", AArch64::AEK_CRC, "+crc", "-crc") +AARCH64_ARCH_EXT_NAME("lse", AArch64::AEK_LSE, "+lse", "-lse") +AARCH64_ARCH_EXT_NAME("rdm", AArch64::AEK_RDM, "+rdm", "-rdm") +AARCH64_ARCH_EXT_NAME("crypto", AArch64::AEK_CRYPTO, "+crypto","-crypto") +AARCH64_ARCH_EXT_NAME("sm4", AArch64::AEK_SM4, "+sm4", "-sm4") +AARCH64_ARCH_EXT_NAME("sha3", AArch64::AEK_SHA3, "+sha3", "-sha3") +AARCH64_ARCH_EXT_NAME("sha2", AArch64::AEK_SHA2, "+sha2", "-sha2") +AARCH64_ARCH_EXT_NAME("aes", AArch64::AEK_AES, "+aes", "-aes") +AARCH64_ARCH_EXT_NAME("dotprod", AArch64::AEK_DOTPROD, 
"+dotprod","-dotprod") +AARCH64_ARCH_EXT_NAME("fp", AArch64::AEK_FP, "+fp-armv8", "-fp-armv8") +AARCH64_ARCH_EXT_NAME("simd", AArch64::AEK_SIMD, "+neon", "-neon") +AARCH64_ARCH_EXT_NAME("fp16", AArch64::AEK_FP16, "+fullfp16", "-fullfp16") +AARCH64_ARCH_EXT_NAME("fp16fml", AArch64::AEK_FP16FML, "+fp16fml", "-fp16fml") +AARCH64_ARCH_EXT_NAME("profile", AArch64::AEK_PROFILE, "+spe", "-spe") +AARCH64_ARCH_EXT_NAME("ras", AArch64::AEK_RAS, "+ras", "-ras") +AARCH64_ARCH_EXT_NAME("sve", AArch64::AEK_SVE, "+sve", "-sve") +AARCH64_ARCH_EXT_NAME("sve2", AArch64::AEK_SVE2, "+sve2", "-sve2") +AARCH64_ARCH_EXT_NAME("sve2-aes", AArch64::AEK_SVE2AES, "+sve2-aes", "-sve2-aes") +AARCH64_ARCH_EXT_NAME("sve2-sm4", AArch64::AEK_SVE2SM4, "+sve2-sm4", "-sve2-sm4") +AARCH64_ARCH_EXT_NAME("sve2-sha3", AArch64::AEK_SVE2SHA3, "+sve2-sha3", "-sve2-sha3") +AARCH64_ARCH_EXT_NAME("sve2-bitperm", AArch64::AEK_SVE2BITPERM, "+sve2-bitperm", "-sve2-bitperm") +AARCH64_ARCH_EXT_NAME("rcpc", AArch64::AEK_RCPC, "+rcpc", "-rcpc") +AARCH64_ARCH_EXT_NAME("rng", AArch64::AEK_RAND, "+rand", "-rand") +AARCH64_ARCH_EXT_NAME("memtag", AArch64::AEK_MTE, "+mte", "-mte") +AARCH64_ARCH_EXT_NAME("ssbs", AArch64::AEK_SSBS, "+ssbs", "-ssbs") +AARCH64_ARCH_EXT_NAME("sb", AArch64::AEK_SB, "+sb", "-sb") +AARCH64_ARCH_EXT_NAME("predres", AArch64::AEK_PREDRES, "+predres", "-predres") +AARCH64_ARCH_EXT_NAME("tme", AArch64::AEK_TME, "+tme", "-tme") #undef AARCH64_ARCH_EXT_NAME #ifndef AARCH64_CPU_NAME @@ -92,6 +93,12 @@ AARCH64_CPU_NAME("cortex-a55", ARMV8_2A, FK_CRYPTO_NEON_FP_ARMV8, false, (AArch64::AEK_FP16 | AArch64::AEK_DOTPROD | AArch64::AEK_RCPC)) AARCH64_CPU_NAME("cortex-a57", ARMV8A, FK_CRYPTO_NEON_FP_ARMV8, false, (AArch64::AEK_CRC)) +AARCH64_CPU_NAME("cortex-a65", ARMV8_2A, FK_CRYPTO_NEON_FP_ARMV8, false, + (AArch64::AEK_DOTPROD | AArch64::AEK_FP16 | AArch64::AEK_RAS | + AArch64::AEK_RCPC | AArch64::AEK_SSBS)) +AARCH64_CPU_NAME("cortex-a65ae", ARMV8_2A, FK_CRYPTO_NEON_FP_ARMV8, false, + (AArch64::AEK_DOTPROD | AArch64::AEK_FP16 | AArch64::AEK_RAS | + AArch64::AEK_RCPC | AArch64::AEK_SSBS)) AARCH64_CPU_NAME("cortex-a72", ARMV8A, FK_CRYPTO_NEON_FP_ARMV8, false, (AArch64::AEK_CRC)) AARCH64_CPU_NAME("cortex-a73", ARMV8A, FK_CRYPTO_NEON_FP_ARMV8, false, @@ -104,6 +111,13 @@ AARCH64_CPU_NAME("cortex-a76", ARMV8_2A, FK_CRYPTO_NEON_FP_ARMV8, false, AARCH64_CPU_NAME("cortex-a76ae", ARMV8_2A, FK_CRYPTO_NEON_FP_ARMV8, false, (AArch64::AEK_FP16 | AArch64::AEK_DOTPROD | AArch64::AEK_RCPC | AArch64::AEK_SSBS)) +AARCH64_CPU_NAME("neoverse-e1", ARMV8_2A, FK_CRYPTO_NEON_FP_ARMV8, false, + (AArch64::AEK_DOTPROD | AArch64::AEK_FP16 | AArch64::AEK_RAS | + AArch64::AEK_RCPC | AArch64::AEK_SSBS)) +AARCH64_CPU_NAME("neoverse-n1", ARMV8_2A, FK_CRYPTO_NEON_FP_ARMV8, false, + (AArch64::AEK_DOTPROD | AArch64::AEK_FP16 | + AArch64::AEK_PROFILE | AArch64::AEK_RAS | AArch64::AEK_RCPC | + AArch64::AEK_SSBS)) AARCH64_CPU_NAME("cyclone", ARMV8A, FK_CRYPTO_NEON_FP_ARMV8, false, (AArch64::AEK_NONE)) AARCH64_CPU_NAME("exynos-m1", ARMV8A, FK_CRYPTO_NEON_FP_ARMV8, false, diff --git a/include/llvm/Support/AArch64TargetParser.h b/include/llvm/Support/AArch64TargetParser.h index 965d38535e74..94f341c83260 100644 --- a/include/llvm/Support/AArch64TargetParser.h +++ b/include/llvm/Support/AArch64TargetParser.h @@ -53,7 +53,8 @@ enum ArchExtKind : unsigned { AEK_SVE2AES = 1 << 24, AEK_SVE2SM4 = 1 << 25, AEK_SVE2SHA3 = 1 << 26, - AEK_BITPERM = 1 << 27, + AEK_SVE2BITPERM = 1 << 27, + AEK_TME = 1 << 28, }; enum class ArchKind { diff --git a/include/llvm/Support/ARMTargetParser.def 
b/include/llvm/Support/ARMTargetParser.def index f466b3252748..3e77e20762c1 100644 --- a/include/llvm/Support/ARMTargetParser.def +++ b/include/llvm/Support/ARMTargetParser.def @@ -274,6 +274,8 @@ ARM_CPU_NAME("cortex-a76", ARMV8_2A, FK_CRYPTO_NEON_FP_ARMV8, false, (ARM::AEK_FP16 | ARM::AEK_DOTPROD)) ARM_CPU_NAME("cortex-a76ae", ARMV8_2A, FK_CRYPTO_NEON_FP_ARMV8, false, (ARM::AEK_FP16 | ARM::AEK_DOTPROD)) +ARM_CPU_NAME("neoverse-n1", ARMV8_2A, FK_CRYPTO_NEON_FP_ARMV8, false, + (ARM::AEK_FP16 | ARM::AEK_DOTPROD)) ARM_CPU_NAME("cyclone", ARMV8A, FK_CRYPTO_NEON_FP_ARMV8, false, ARM::AEK_CRC) ARM_CPU_NAME("exynos-m1", ARMV8A, FK_CRYPTO_NEON_FP_ARMV8, false, ARM::AEK_CRC) ARM_CPU_NAME("exynos-m2", ARMV8A, FK_CRYPTO_NEON_FP_ARMV8, false, ARM::AEK_CRC) diff --git a/include/llvm/Support/ARMTargetParser.h b/include/llvm/Support/ARMTargetParser.h index 4b9070dea596..02d4c975129f 100644 --- a/include/llvm/Support/ARMTargetParser.h +++ b/include/llvm/Support/ARMTargetParser.h @@ -39,19 +39,13 @@ enum ArchExtKind : unsigned { AEK_DSP = 1 << 10, AEK_FP16 = 1 << 11, AEK_RAS = 1 << 12, - AEK_SVE = 1 << 13, - AEK_DOTPROD = 1 << 14, - AEK_SHA2 = 1 << 15, - AEK_AES = 1 << 16, - AEK_FP16FML = 1 << 17, - AEK_SB = 1 << 18, - AEK_SVE2 = 1 << 19, - AEK_SVE2AES = 1 << 20, - AEK_SVE2SM4 = 1 << 21, - AEK_SVE2SHA3 = 1 << 22, - AEK_BITPERM = 1 << 23, - AEK_FP_DP = 1 << 24, - AEK_LOB = 1 << 25, + AEK_DOTPROD = 1 << 13, + AEK_SHA2 = 1 << 14, + AEK_AES = 1 << 15, + AEK_FP16FML = 1 << 16, + AEK_SB = 1 << 17, + AEK_FP_DP = 1 << 18, + AEK_LOB = 1 << 19, // Unsupported extensions. AEK_OS = 0x8000000, AEK_IWMMXT = 0x10000000, diff --git a/include/llvm/Support/AlignOf.h b/include/llvm/Support/AlignOf.h index d12401f0eb49..eb42542b777f 100644 --- a/include/llvm/Support/AlignOf.h +++ b/include/llvm/Support/AlignOf.h @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// // -// This file defines the AlignedCharArray and AlignedCharArrayUnion classes. +// This file defines the AlignedCharArrayUnion class. // //===----------------------------------------------------------------------===// @@ -18,128 +18,38 @@ namespace llvm { -/// \struct AlignedCharArray -/// Helper for building an aligned character array type. -/// -/// This template is used to explicitly build up a collection of aligned -/// character array types. We have to build these up using a macro and explicit -/// specialization to cope with MSVC (at least till 2015) where only an -/// integer literal can be used to specify an alignment constraint. Once built -/// up here, we can then begin to indirect between these using normal C++ -/// template parameters. - -// MSVC requires special handling here. -#ifndef _MSC_VER - -template -struct AlignedCharArray { - alignas(Alignment) char buffer[Size]; -}; - -#else // _MSC_VER - -/// Create a type with an aligned char buffer. -template -struct AlignedCharArray; - -// We provide special variations of this template for the most common -// alignments because __declspec(align(...)) doesn't actually work when it is -// a member of a by-value function argument in MSVC, even if the alignment -// request is something reasonably like 8-byte or 16-byte. Note that we can't -// even include the declspec with the union that forces the alignment because -// MSVC warns on the existence of the declspec despite the union member forcing -// proper alignment. 
- -template -struct AlignedCharArray<1, Size> { - union { - char aligned; - char buffer[Size]; - }; -}; - -template -struct AlignedCharArray<2, Size> { - union { - short aligned; - char buffer[Size]; - }; -}; - -template -struct AlignedCharArray<4, Size> { - union { - int aligned; - char buffer[Size]; - }; -}; +namespace detail { -template -struct AlignedCharArray<8, Size> { - union { - double aligned; - char buffer[Size]; - }; +template class AlignerImpl { + T t; + AlignerImpl rest; + AlignerImpl() = delete; }; - -// The rest of these are provided with a __declspec(align(...)) and we simply -// can't pass them by-value as function arguments on MSVC. - -#define LLVM_ALIGNEDCHARARRAY_TEMPLATE_ALIGNMENT(x) \ - template \ - struct AlignedCharArray { \ - __declspec(align(x)) char buffer[Size]; \ - }; - -LLVM_ALIGNEDCHARARRAY_TEMPLATE_ALIGNMENT(16) -LLVM_ALIGNEDCHARARRAY_TEMPLATE_ALIGNMENT(32) -LLVM_ALIGNEDCHARARRAY_TEMPLATE_ALIGNMENT(64) -LLVM_ALIGNEDCHARARRAY_TEMPLATE_ALIGNMENT(128) - -#undef LLVM_ALIGNEDCHARARRAY_TEMPLATE_ALIGNMENT - -#endif // _MSC_VER - -namespace detail { -template -class AlignerImpl { - T1 t1; T2 t2; T3 t3; T4 t4; T5 t5; T6 t6; T7 t7; T8 t8; T9 t9; T10 t10; - +template class AlignerImpl { + T t; AlignerImpl() = delete; }; -template -union SizerImpl { - char arr1[sizeof(T1)], arr2[sizeof(T2)], arr3[sizeof(T3)], arr4[sizeof(T4)], - arr5[sizeof(T5)], arr6[sizeof(T6)], arr7[sizeof(T7)], arr8[sizeof(T8)], - arr9[sizeof(T9)], arr10[sizeof(T10)]; +template union SizerImpl { + char arr[sizeof(T)]; + SizerImpl rest; }; + +template union SizerImpl { char arr[sizeof(T)]; }; } // end namespace detail -/// This union template exposes a suitably aligned and sized character -/// array member which can hold elements of any of up to ten types. +/// A suitably aligned and sized character array member which can hold elements +/// of any type. /// -/// These types may be arrays, structs, or any other types. The goal is to -/// expose a char array buffer member which can be used as suitable storage for -/// a placement new of any of these types. Support for more than ten types can -/// be added at the cost of more boilerplate. -template -struct AlignedCharArrayUnion : llvm::AlignedCharArray< - alignof(llvm::detail::AlignerImpl), - sizeof(::llvm::detail::SizerImpl)> { +/// These types may be arrays, structs, or any other types. This exposes a +/// `buffer` member which can be used as suitable storage for a placement new of +/// any of these types. +template struct AlignedCharArrayUnion { + alignas(::llvm::detail::AlignerImpl) char buffer[sizeof( + llvm::detail::SizerImpl)]; }; + } // end namespace llvm #endif // LLVM_SUPPORT_ALIGNOF_H diff --git a/include/llvm/Support/Alignment.h b/include/llvm/Support/Alignment.h new file mode 100644 index 000000000000..72fad87dd0d4 --- /dev/null +++ b/include/llvm/Support/Alignment.h @@ -0,0 +1,403 @@ +//===-- llvm/Support/Alignment.h - Useful alignment functions ---*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains types to represent alignments. +// They are instrumented to guarantee some invariants are preserved and prevent +// invalid manipulations. 
+// +// - Align represents an alignment in bytes, it is always set and always a valid +// power of two, its minimum value is 1 which means no alignment requirements. +// +// - MaybeAlign is an optional type, it may be undefined or set. When it's set +// you can get the underlying Align type by using the getValue() method. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_SUPPORT_ALIGNMENT_H_ +#define LLVM_SUPPORT_ALIGNMENT_H_ + +#include "llvm/ADT/Optional.h" +#include "llvm/Support/Compiler.h" +#include "llvm/Support/MathExtras.h" +#include +#include + +namespace llvm { + +#define ALIGN_CHECK_ISPOSITIVE(decl) \ + assert(decl > 0 && (#decl " should be defined")) +#define ALIGN_CHECK_ISSET(decl) \ + assert(decl.hasValue() && (#decl " should be defined")) + +/// This struct is a compact representation of a valid (non-zero power of two) +/// alignment. +/// It is suitable for use as static global constants. +struct Align { +private: + uint8_t ShiftValue = 0; /// The log2 of the required alignment. + /// ShiftValue is less than 64 by construction. + + friend struct MaybeAlign; + friend unsigned Log2(Align); + friend bool operator==(Align Lhs, Align Rhs); + friend bool operator!=(Align Lhs, Align Rhs); + friend bool operator<=(Align Lhs, Align Rhs); + friend bool operator>=(Align Lhs, Align Rhs); + friend bool operator<(Align Lhs, Align Rhs); + friend bool operator>(Align Lhs, Align Rhs); + friend unsigned encode(struct MaybeAlign A); + friend struct MaybeAlign decodeMaybeAlign(unsigned Value); + + /// A trivial type to allow construction of constexpr Align. + /// This is currently needed to workaround a bug in GCC 5.3 which prevents + /// definition of constexpr assign operators. + /// https://stackoverflow.com/questions/46756288/explicitly-defaulted-function-cannot-be-declared-as-constexpr-because-the-implic + /// FIXME: Remove this, make all assign operators constexpr and introduce user + /// defined literals when we don't have to support GCC 5.3 anymore. + /// https://llvm.org/docs/GettingStarted.html#getting-a-modern-host-c-toolchain + struct LogValue { + uint8_t Log; + }; + +public: + /// Default is byte-aligned. + constexpr Align() = default; + /// Do not perform checks in case of copy/move construct/assign, because the + /// checks have been performed when building `Other`. + constexpr Align(const Align &Other) = default; + constexpr Align(Align &&Other) = default; + Align &operator=(const Align &Other) = default; + Align &operator=(Align &&Other) = default; + + explicit Align(uint64_t Value) { + assert(Value > 0 && "Value must not be 0"); + assert(llvm::isPowerOf2_64(Value) && "Alignment is not a power of 2"); + ShiftValue = Log2_64(Value); + assert(ShiftValue < 64 && "Broken invariant"); + } + + /// This is a hole in the type system and should not be abused. + /// Needed to interact with C for instance. + uint64_t value() const { return uint64_t(1) << ShiftValue; } + + /// Returns a default constructed Align which corresponds to no alignment. + /// This is useful to test for unalignment as it conveys clear semantic. + /// `if (A != Align::None())` + /// would be better than + /// `if (A > Align(1))` + constexpr static const Align None() { return Align(); } + + /// Allow constructions of constexpr Align. + template constexpr static LogValue Constant() { + return LogValue{static_cast(CTLog2())}; + } + + /// Allow constructions of constexpr Align from types. + /// Compile time equivalent to Align(alignof(T)). 
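A minimal sketch of constructing Align values from the pieces above: the runtime constructor asserts that its argument is a non-zero power of two, while Align::Constant<N>() (and Align::Of<T>(), whose body follows below) produce constexpr constants via the LogValue workaround.

#include "llvm/Support/Alignment.h"

void alignSketch() {
  llvm::Align A(16);           // asserts that 16 is a non-zero power of two
  auto Bytes = A.value();      // 16
  (void)Bytes;

  constexpr llvm::Align B = llvm::Align::Constant<8>(); // compile-time constant
  (void)B;

  if (A != llvm::Align::None()) {
    // A carries a real alignment requirement, i.e. more than byte alignment.
  }
}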
+ template constexpr static LogValue Of() { + return Constant::value>(); + } + + /// Constexpr constructor from LogValue type. + constexpr Align(LogValue CA) : ShiftValue(CA.Log) {} +}; + +/// Treats the value 0 as a 1, so Align is always at least 1. +inline Align assumeAligned(uint64_t Value) { + return Value ? Align(Value) : Align(); +} + +/// This struct is a compact representation of a valid (power of two) or +/// undefined (0) alignment. +struct MaybeAlign : public llvm::Optional { +private: + using UP = llvm::Optional; + +public: + /// Default is undefined. + MaybeAlign() = default; + /// Do not perform checks in case of copy/move construct/assign, because the + /// checks have been performed when building `Other`. + MaybeAlign(const MaybeAlign &Other) = default; + MaybeAlign &operator=(const MaybeAlign &Other) = default; + MaybeAlign(MaybeAlign &&Other) = default; + MaybeAlign &operator=(MaybeAlign &&Other) = default; + + /// Use llvm::Optional constructor. + using UP::UP; + + explicit MaybeAlign(uint64_t Value) { + assert((Value == 0 || llvm::isPowerOf2_64(Value)) && + "Alignment is neither 0 nor a power of 2"); + if (Value) + emplace(Value); + } + + /// For convenience, returns a valid alignment or 1 if undefined. + Align valueOrOne() const { return hasValue() ? getValue() : Align(); } +}; + +/// Checks that SizeInBytes is a multiple of the alignment. +inline bool isAligned(Align Lhs, uint64_t SizeInBytes) { + return SizeInBytes % Lhs.value() == 0; +} + +/// Checks that SizeInBytes is a multiple of the alignment. +/// Returns false if the alignment is undefined. +inline bool isAligned(MaybeAlign Lhs, uint64_t SizeInBytes) { + ALIGN_CHECK_ISSET(Lhs); + return SizeInBytes % (*Lhs).value() == 0; +} + +/// Checks that Addr is a multiple of the alignment. +inline bool isAddrAligned(Align Lhs, const void *Addr) { + return isAligned(Lhs, reinterpret_cast(Addr)); +} + +/// Returns a multiple of A needed to store `Size` bytes. +inline uint64_t alignTo(uint64_t Size, Align A) { + const uint64_t value = A.value(); + // The following line is equivalent to `(Size + value - 1) / value * value`. + + // The division followed by a multiplication can be thought of as a right + // shift followed by a left shift which zeros out the extra bits produced in + // the bump; `~(value - 1)` is a mask where all those bits being zeroed out + // are just zero. + + // Most compilers can generate this code but the pattern may be missed when + // multiple functions gets inlined. + return (Size + value - 1) & ~(value - 1); +} + +/// Returns a multiple of A needed to store `Size` bytes. +/// Returns `Size` if current alignment is undefined. +inline uint64_t alignTo(uint64_t Size, MaybeAlign A) { + return A ? alignTo(Size, A.getValue()) : Size; +} + +/// Aligns `Addr` to `Alignment` bytes, rounding up. +inline uintptr_t alignAddr(const void *Addr, Align Alignment) { + uintptr_t ArithAddr = reinterpret_cast(Addr); + assert(static_cast(ArithAddr + Alignment.value() - 1) >= + ArithAddr && "Overflow"); + return alignTo(ArithAddr, Alignment); +} + +/// Returns the offset to the next integer (mod 2**64) that is greater than +/// or equal to \p Value and is a multiple of \p Align. +inline uint64_t offsetToAlignment(uint64_t Value, Align Alignment) { + return alignTo(Value, Alignment) - Value; +} + +/// Returns the necessary adjustment for aligning `Addr` to `Alignment` +/// bytes, rounding up. 
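A minimal sketch of the optional flavour and the rounding helpers defined above: MaybeAlign(0) means the alignment is unspecified, valueOrOne() falls back to byte alignment, and alignTo() rounds a size up to the next multiple of the alignment.

#include "llvm/Support/Alignment.h"
#include <cstdint>

void maybeAlignSketch() {
  llvm::MaybeAlign Unspecified(0);       // undefined alignment
  llvm::MaybeAlign Sixteen(16);          // defined, 16 bytes

  llvm::Align A = Sixteen.valueOrOne();  // Align(16); Align(1) if it were unset
  uint64_t Rounded = llvm::alignTo(100, A); // 112
  bool Ok = llvm::isAligned(A, Rounded);    // true

  (void)Unspecified; (void)Rounded; (void)Ok;
}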
+inline uint64_t offsetToAlignedAddr(const void *Addr, Align Alignment) { + return offsetToAlignment(reinterpret_cast(Addr), Alignment); +} + +/// Returns the log2 of the alignment. +inline unsigned Log2(Align A) { return A.ShiftValue; } + +/// Returns the log2 of the alignment. +/// \pre A must be defined. +inline unsigned Log2(MaybeAlign A) { + ALIGN_CHECK_ISSET(A); + return Log2(A.getValue()); +} + +/// Returns the alignment that satisfies both alignments. +/// Same semantic as MinAlign. +inline Align commonAlignment(Align A, Align B) { return std::min(A, B); } + +/// Returns the alignment that satisfies both alignments. +/// Same semantic as MinAlign. +inline Align commonAlignment(Align A, uint64_t Offset) { + return Align(MinAlign(A.value(), Offset)); +} + +/// Returns the alignment that satisfies both alignments. +/// Same semantic as MinAlign. +inline MaybeAlign commonAlignment(MaybeAlign A, MaybeAlign B) { + return A && B ? commonAlignment(*A, *B) : A ? A : B; +} + +/// Returns the alignment that satisfies both alignments. +/// Same semantic as MinAlign. +inline MaybeAlign commonAlignment(MaybeAlign A, uint64_t Offset) { + return MaybeAlign(MinAlign((*A).value(), Offset)); +} + +/// Returns a representation of the alignment that encodes undefined as 0. +inline unsigned encode(MaybeAlign A) { return A ? A->ShiftValue + 1 : 0; } + +/// Dual operation of the encode function above. +inline MaybeAlign decodeMaybeAlign(unsigned Value) { + if (Value == 0) + return MaybeAlign(); + Align Out; + Out.ShiftValue = Value - 1; + return Out; +} + +/// Returns a representation of the alignment, the encoded value is positive by +/// definition. +inline unsigned encode(Align A) { return encode(MaybeAlign(A)); } + +/// Comparisons between Align and scalars. Rhs must be positive. +inline bool operator==(Align Lhs, uint64_t Rhs) { + ALIGN_CHECK_ISPOSITIVE(Rhs); + return Lhs.value() == Rhs; +} +inline bool operator!=(Align Lhs, uint64_t Rhs) { + ALIGN_CHECK_ISPOSITIVE(Rhs); + return Lhs.value() != Rhs; +} +inline bool operator<=(Align Lhs, uint64_t Rhs) { + ALIGN_CHECK_ISPOSITIVE(Rhs); + return Lhs.value() <= Rhs; +} +inline bool operator>=(Align Lhs, uint64_t Rhs) { + ALIGN_CHECK_ISPOSITIVE(Rhs); + return Lhs.value() >= Rhs; +} +inline bool operator<(Align Lhs, uint64_t Rhs) { + ALIGN_CHECK_ISPOSITIVE(Rhs); + return Lhs.value() < Rhs; +} +inline bool operator>(Align Lhs, uint64_t Rhs) { + ALIGN_CHECK_ISPOSITIVE(Rhs); + return Lhs.value() > Rhs; +} + +/// Comparisons between MaybeAlign and scalars. +inline bool operator==(MaybeAlign Lhs, uint64_t Rhs) { + return Lhs ? (*Lhs).value() == Rhs : Rhs == 0; +} +inline bool operator!=(MaybeAlign Lhs, uint64_t Rhs) { + return Lhs ? (*Lhs).value() != Rhs : Rhs != 0; +} +inline bool operator<=(MaybeAlign Lhs, uint64_t Rhs) { + ALIGN_CHECK_ISSET(Lhs); + ALIGN_CHECK_ISPOSITIVE(Rhs); + return (*Lhs).value() <= Rhs; +} +inline bool operator>=(MaybeAlign Lhs, uint64_t Rhs) { + ALIGN_CHECK_ISSET(Lhs); + ALIGN_CHECK_ISPOSITIVE(Rhs); + return (*Lhs).value() >= Rhs; +} +inline bool operator<(MaybeAlign Lhs, uint64_t Rhs) { + ALIGN_CHECK_ISSET(Lhs); + ALIGN_CHECK_ISPOSITIVE(Rhs); + return (*Lhs).value() < Rhs; +} +inline bool operator>(MaybeAlign Lhs, uint64_t Rhs) { + ALIGN_CHECK_ISSET(Lhs); + ALIGN_CHECK_ISPOSITIVE(Rhs); + return (*Lhs).value() > Rhs; +} + +/// Comparisons operators between Align. 
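A minimal sketch of the folding and packing helpers above: commonAlignment() keeps the weaker (smaller) of two alignments, and encode()/decodeMaybeAlign() round-trip an optional alignment through a small integer in which 0 stands for undefined.

#include "llvm/Support/Alignment.h"
#include <cassert>

void encodeSketch() {
  // The strongest guarantee both sides can make, same semantics as MinAlign.
  llvm::Align C = llvm::commonAlignment(llvm::Align(8), llvm::Align(32)); // 8
  (void)C;

  unsigned Packed = llvm::encode(llvm::MaybeAlign(16)); // 0 would mean undefined
  llvm::MaybeAlign Restored = llvm::decodeMaybeAlign(Packed);
  assert(Restored.hasValue() && Restored.getValue().value() == 16);
  (void)Restored;
}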
+inline bool operator==(Align Lhs, Align Rhs) { + return Lhs.ShiftValue == Rhs.ShiftValue; +} +inline bool operator!=(Align Lhs, Align Rhs) { + return Lhs.ShiftValue != Rhs.ShiftValue; +} +inline bool operator<=(Align Lhs, Align Rhs) { + return Lhs.ShiftValue <= Rhs.ShiftValue; +} +inline bool operator>=(Align Lhs, Align Rhs) { + return Lhs.ShiftValue >= Rhs.ShiftValue; +} +inline bool operator<(Align Lhs, Align Rhs) { + return Lhs.ShiftValue < Rhs.ShiftValue; +} +inline bool operator>(Align Lhs, Align Rhs) { + return Lhs.ShiftValue > Rhs.ShiftValue; +} + +/// Comparisons operators between Align and MaybeAlign. +inline bool operator==(Align Lhs, MaybeAlign Rhs) { + ALIGN_CHECK_ISSET(Rhs); + return Lhs.value() == (*Rhs).value(); +} +inline bool operator!=(Align Lhs, MaybeAlign Rhs) { + ALIGN_CHECK_ISSET(Rhs); + return Lhs.value() != (*Rhs).value(); +} +inline bool operator<=(Align Lhs, MaybeAlign Rhs) { + ALIGN_CHECK_ISSET(Rhs); + return Lhs.value() <= (*Rhs).value(); +} +inline bool operator>=(Align Lhs, MaybeAlign Rhs) { + ALIGN_CHECK_ISSET(Rhs); + return Lhs.value() >= (*Rhs).value(); +} +inline bool operator<(Align Lhs, MaybeAlign Rhs) { + ALIGN_CHECK_ISSET(Rhs); + return Lhs.value() < (*Rhs).value(); +} +inline bool operator>(Align Lhs, MaybeAlign Rhs) { + ALIGN_CHECK_ISSET(Rhs); + return Lhs.value() > (*Rhs).value(); +} + +/// Comparisons operators between MaybeAlign and Align. +inline bool operator==(MaybeAlign Lhs, Align Rhs) { + ALIGN_CHECK_ISSET(Lhs); + return Lhs && (*Lhs).value() == Rhs.value(); +} +inline bool operator!=(MaybeAlign Lhs, Align Rhs) { + ALIGN_CHECK_ISSET(Lhs); + return Lhs && (*Lhs).value() != Rhs.value(); +} +inline bool operator<=(MaybeAlign Lhs, Align Rhs) { + ALIGN_CHECK_ISSET(Lhs); + return Lhs && (*Lhs).value() <= Rhs.value(); +} +inline bool operator>=(MaybeAlign Lhs, Align Rhs) { + ALIGN_CHECK_ISSET(Lhs); + return Lhs && (*Lhs).value() >= Rhs.value(); +} +inline bool operator<(MaybeAlign Lhs, Align Rhs) { + ALIGN_CHECK_ISSET(Lhs); + return Lhs && (*Lhs).value() < Rhs.value(); +} +inline bool operator>(MaybeAlign Lhs, Align Rhs) { + ALIGN_CHECK_ISSET(Lhs); + return Lhs && (*Lhs).value() > Rhs.value(); +} + +inline Align operator/(Align Lhs, uint64_t Divisor) { + assert(llvm::isPowerOf2_64(Divisor) && + "Divisor must be positive and a power of 2"); + assert(Lhs != 1 && "Can't halve byte alignment"); + return Align(Lhs.value() / Divisor); +} + +inline MaybeAlign operator/(MaybeAlign Lhs, uint64_t Divisor) { + assert(llvm::isPowerOf2_64(Divisor) && + "Divisor must be positive and a power of 2"); + return Lhs ? Lhs.getValue() / Divisor : MaybeAlign(); +} + +inline Align max(MaybeAlign Lhs, Align Rhs) { + return Lhs && *Lhs > Rhs ? *Lhs : Rhs; +} + +inline Align max(Align Lhs, MaybeAlign Rhs) { + return Rhs && *Rhs > Lhs ? *Rhs : Lhs; +} + +#undef ALIGN_CHECK_ISPOSITIVE +#undef ALIGN_CHECK_ISSET + +} // namespace llvm + +#endif // LLVM_SUPPORT_ALIGNMENT_H_ diff --git a/include/llvm/Support/Allocator.h b/include/llvm/Support/Allocator.h index 09e967b98abc..106b90c35bf5 100644 --- a/include/llvm/Support/Allocator.h +++ b/include/llvm/Support/Allocator.h @@ -22,6 +22,7 @@ #include "llvm/ADT/Optional.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/Support/Alignment.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" @@ -211,13 +212,11 @@ public: /// Allocate space at the specified alignment. 
LLVM_ATTRIBUTE_RETURNS_NONNULL LLVM_ATTRIBUTE_RETURNS_NOALIAS void * - Allocate(size_t Size, size_t Alignment) { - assert(Alignment > 0 && "0-byte alignnment is not allowed. Use 1 instead."); - + Allocate(size_t Size, Align Alignment) { // Keep track of how many bytes we've allocated. BytesAllocated += Size; - size_t Adjustment = alignmentAdjustment(CurPtr, Alignment); + size_t Adjustment = offsetToAlignedAddr(CurPtr, Alignment); assert(Adjustment + Size >= Size && "Adjustment + Size must not overflow"); size_t SizeToAllocate = Size; @@ -240,7 +239,7 @@ public: } // If Size is really big, allocate a separate slab for it. - size_t PaddedSize = SizeToAllocate + Alignment - 1; + size_t PaddedSize = SizeToAllocate + Alignment.value() - 1; if (PaddedSize > SizeThreshold) { void *NewSlab = Allocator.Allocate(PaddedSize, 0); // We own the new slab and don't want anyone reading anyting other than @@ -268,6 +267,12 @@ public: return AlignedPtr; } + inline LLVM_ATTRIBUTE_RETURNS_NONNULL LLVM_ATTRIBUTE_RETURNS_NOALIAS void * + Allocate(size_t Size, size_t Alignment) { + assert(Alignment > 0 && "0-byte alignnment is not allowed. Use 1 instead."); + return Allocate(Size, Align(Alignment)); + } + // Pull in base class overloads. using AllocatorBase::Allocate; @@ -461,7 +466,7 @@ public: /// all memory allocated so far. void DestroyAll() { auto DestroyElements = [](char *Begin, char *End) { - assert(Begin == (char *)alignAddr(Begin, alignof(T))); + assert(Begin == (char *)alignAddr(Begin, Align::Of())); for (char *Ptr = Begin; Ptr + sizeof(T) <= End; Ptr += sizeof(T)) reinterpret_cast(Ptr)->~T(); }; @@ -470,7 +475,7 @@ public: ++I) { size_t AllocatedSlabSize = BumpPtrAllocator::computeSlabSize( std::distance(Allocator.Slabs.begin(), I)); - char *Begin = (char *)alignAddr(*I, alignof(T)); + char *Begin = (char *)alignAddr(*I, Align::Of()); char *End = *I == Allocator.Slabs.back() ? Allocator.CurPtr : (char *)*I + AllocatedSlabSize; @@ -480,7 +485,8 @@ public: for (auto &PtrAndSize : Allocator.CustomSizedSlabs) { void *Ptr = PtrAndSize.first; size_t Size = PtrAndSize.second; - DestroyElements((char *)alignAddr(Ptr, alignof(T)), (char *)Ptr + Size); + DestroyElements((char *)alignAddr(Ptr, Align::Of()), + (char *)Ptr + Size); } Allocator.Reset(); diff --git a/include/llvm/Support/Automaton.h b/include/llvm/Support/Automaton.h new file mode 100644 index 000000000000..7c13a698e492 --- /dev/null +++ b/include/llvm/Support/Automaton.h @@ -0,0 +1,253 @@ +//===-- Automaton.h - Support for driving TableGen-produced DFAs ----------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements class that drive and introspect deterministic finite- +// state automata (DFAs) as generated by TableGen's -gen-automata backend. +// +// For a description of how to define an automaton, see +// include/llvm/TableGen/Automaton.td. +// +// One important detail is that these deterministic automata are created from +// (potentially) nondeterministic definitions. Therefore a unique sequence of +// input symbols will produce one path through the DFA but multiple paths +// through the original NFA. An automaton by default only returns "accepted" or +// "not accepted", but frequently we want to analyze what NFA path was taken. 
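Before moving on to Automaton.h, a minimal sketch of the Allocator.h change above: the typed Align overload of BumpPtrAllocator::Allocate is now the primary entry point, and the raw size_t form merely forwards to it.

#include "llvm/Support/Allocator.h"

void allocatorSketch() {
  llvm::BumpPtrAllocator Alloc;

  // Preferred form: a typed alignment.
  void *P = Alloc.Allocate(64, llvm::Align(16));

  // Still accepted: a raw size_t alignment, forwarded to the Align overload
  // (and asserted to be non-zero).
  void *Q = Alloc.Allocate(64, /*Alignment=*/8);

  (void)P; (void)Q;
}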
+// Finding a path through the NFA states that results in a DFA state can help +// answer *what* the solution to a problem was, not just that there exists a +// solution. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_SUPPORT_AUTOMATON_H +#define LLVM_SUPPORT_AUTOMATON_H + +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/Support/Allocator.h" +#include +#include +#include +#include +#include + +namespace llvm { + +using NfaPath = SmallVector; + +/// Forward define the pair type used by the automata transition info tables. +/// +/// Experimental results with large tables have shown a significant (multiple +/// orders of magnitude) parsing speedup by using a custom struct here with a +/// trivial constructor rather than std::pair. +struct NfaStatePair { + uint64_t FromDfaState, ToDfaState; + + bool operator<(const NfaStatePair &Other) const { + return std::make_tuple(FromDfaState, ToDfaState) < + std::make_tuple(Other.FromDfaState, Other.ToDfaState); + } +}; + +namespace internal { +/// The internal class that maintains all possible paths through an NFA based +/// on a path through the DFA. +class NfaTranscriber { +private: + /// Cached transition table. This is a table of NfaStatePairs that contains + /// zero-terminated sequences pointed to by DFA transitions. + ArrayRef TransitionInfo; + + /// A simple linked-list of traversed states that can have a shared tail. The + /// traversed path is stored in reverse order with the latest state as the + /// head. + struct PathSegment { + uint64_t State; + PathSegment *Tail; + }; + + /// We allocate segment objects frequently. Allocate them upfront and dispose + /// at the end of a traversal rather than hammering the system allocator. + SpecificBumpPtrAllocator Allocator; + + /// Heads of each tracked path. These are not ordered. + std::deque Heads; + + /// The returned paths. This is populated during getPaths. + SmallVector Paths; + + /// Create a new segment and return it. + PathSegment *makePathSegment(uint64_t State, PathSegment *Tail) { + PathSegment *P = Allocator.Allocate(); + *P = {State, Tail}; + return P; + } + + /// Pairs defines a sequence of possible NFA transitions for a single DFA + /// transition. + void transition(ArrayRef Pairs) { + // Iterate over all existing heads. We will mutate the Heads deque during + // iteration. + unsigned NumHeads = Heads.size(); + for (unsigned I = 0; I < NumHeads; ++I) { + PathSegment *Head = Heads[I]; + // The sequence of pairs is sorted. Select the set of pairs that + // transition from the current head state. + auto PI = lower_bound(Pairs, NfaStatePair{Head->State, 0ULL}); + auto PE = upper_bound(Pairs, NfaStatePair{Head->State, INT64_MAX}); + // For every transition from the current head state, add a new path + // segment. + for (; PI != PE; ++PI) + if (PI->FromDfaState == Head->State) + Heads.push_back(makePathSegment(PI->ToDfaState, Head)); + } + // Now we've iterated over all the initial heads and added new ones, + // dispose of the original heads. + Heads.erase(Heads.begin(), std::next(Heads.begin(), NumHeads)); + } + +public: + NfaTranscriber(ArrayRef TransitionInfo) + : TransitionInfo(TransitionInfo) { + reset(); + } + + void reset() { + Paths.clear(); + Heads.clear(); + Allocator.DestroyAll(); + // The initial NFA state is 0. 
+ Heads.push_back(makePathSegment(0ULL, nullptr)); + } + + void transition(unsigned TransitionInfoIdx) { + unsigned EndIdx = TransitionInfoIdx; + while (TransitionInfo[EndIdx].ToDfaState != 0) + ++EndIdx; + ArrayRef Pairs(&TransitionInfo[TransitionInfoIdx], + EndIdx - TransitionInfoIdx); + transition(Pairs); + } + + ArrayRef getPaths() { + Paths.clear(); + for (auto *Head : Heads) { + NfaPath P; + while (Head->State != 0) { + P.push_back(Head->State); + Head = Head->Tail; + } + std::reverse(P.begin(), P.end()); + Paths.push_back(std::move(P)); + } + return Paths; + } +}; +} // namespace internal + +/// A deterministic finite-state automaton. The automaton is defined in +/// TableGen; this object drives an automaton defined by tblgen-emitted tables. +/// +/// An automaton accepts a sequence of input tokens ("actions"). This class is +/// templated on the type of these actions. +template class Automaton { + /// Map from {State, Action} to {NewState, TransitionInfoIdx}. + /// TransitionInfoIdx is used by the DfaTranscriber to analyze the transition. + /// FIXME: This uses a std::map because ActionT can be a pair type including + /// an enum. In particular DenseMapInfo must be defined to use + /// DenseMap here. + /// This is a shared_ptr to allow very quick copy-construction of Automata; this + /// state is immutable after construction so this is safe. + using MapTy = std::map, std::pair>; + std::shared_ptr M; + /// An optional transcription object. This uses much more state than simply + /// traversing the DFA for acceptance, so is heap allocated. + std::shared_ptr Transcriber; + /// The initial DFA state is 1. + uint64_t State = 1; + /// True if we should transcribe and false if not (even if Transcriber is defined). + bool Transcribe; + +public: + /// Create an automaton. + /// \param Transitions The Transitions table as created by TableGen. Note that + /// because the action type differs per automaton, the + /// table type is templated as ArrayRef. + /// \param TranscriptionTable The TransitionInfo table as created by TableGen. + /// + /// Providing the TranscriptionTable argument as non-empty will enable the + /// use of transcription, which analyzes the possible paths in the original + /// NFA taken by the DFA. NOTE: This is substantially more work than simply + /// driving the DFA, so unless you require the getPaths() method leave this + /// empty. + template + Automaton(ArrayRef Transitions, + ArrayRef TranscriptionTable = {}) { + if (!TranscriptionTable.empty()) + Transcriber = + std::make_shared(TranscriptionTable); + Transcribe = Transcriber != nullptr; + M = std::make_shared(); + for (const auto &I : Transitions) + // Greedily read and cache the transition table. + M->emplace(std::make_pair(I.FromDfaState, I.Action), + std::make_pair(I.ToDfaState, I.InfoIdx)); + } + Automaton(const Automaton &) = default; + + /// Reset the automaton to its initial state. + void reset() { + State = 1; + if (Transcriber) + Transcriber->reset(); + } + + /// Enable or disable transcription. Transcription is only available if + /// TranscriptionTable was provided to the constructor. + void enableTranscription(bool Enable = true) { + assert(Transcriber && + "Transcription is only available if TranscriptionTable was provided " + "to the Automaton constructor"); + Transcribe = Enable; + } + + /// Transition the automaton based on input symbol A. Return true if the + /// automaton transitioned to a valid state, false if the automaton + /// transitioned to an invalid state. 
+ /// + /// If this function returns false, all methods are undefined until reset() is + /// called. + bool add(const ActionT &A) { + auto I = M->find({State, A}); + if (I == M->end()) + return false; + if (Transcriber && Transcribe) + Transcriber->transition(I->second.second); + State = I->second.first; + return true; + } + + /// Return true if the automaton can be transitioned based on input symbol A. + bool canAdd(const ActionT &A) { + auto I = M->find({State, A}); + return I != M->end(); + } + + /// Obtain a set of possible paths through the input nondeterministic + /// automaton that could be obtained from the sequence of input actions + /// presented to this deterministic automaton. + ArrayRef getNfaPaths() { + assert(Transcriber && Transcribe && + "Can only obtain NFA paths if transcribing!"); + return Transcriber->getPaths(); + } +}; + +} // namespace llvm + +#endif // LLVM_SUPPORT_AUTOMATON_H diff --git a/include/llvm/Support/BinaryStreamArray.h b/include/llvm/Support/BinaryStreamArray.h index 96d09db69ae5..67ba2e4189be 100644 --- a/include/llvm/Support/BinaryStreamArray.h +++ b/include/llvm/Support/BinaryStreamArray.h @@ -286,7 +286,7 @@ public: // an exact multiple of the element size. consumeError(std::move(EC)); } - assert(llvm::alignmentAdjustment(Data.data(), alignof(T)) == 0); + assert(isAddrAligned(Align::Of(), Data.data())); return *reinterpret_cast(Data.data()); } diff --git a/include/llvm/Support/BinaryStreamReader.h b/include/llvm/Support/BinaryStreamReader.h index d8fddde66bfa..9e16ce227ff8 100644 --- a/include/llvm/Support/BinaryStreamReader.h +++ b/include/llvm/Support/BinaryStreamReader.h @@ -198,7 +198,7 @@ public: if (auto EC = readBytes(Bytes, NumElements * sizeof(T))) return EC; - assert(alignmentAdjustment(Bytes.data(), alignof(T)) == 0 && + assert(isAddrAligned(Align::Of(), Bytes.data()) && "Reading at invalid alignment!"); Array = ArrayRef(reinterpret_cast(Bytes.data()), NumElements); diff --git a/include/llvm/Support/CRC.h b/include/llvm/Support/CRC.h index 6ea8e3edcea4..210890ae06d4 100644 --- a/include/llvm/Support/CRC.h +++ b/include/llvm/Support/CRC.h @@ -6,20 +6,55 @@ // //===----------------------------------------------------------------------===// // -// This file contains basic functions for calculating Cyclic Redundancy Check -// or CRC. +// This file contains implementations of CRC functions. // //===----------------------------------------------------------------------===// #ifndef LLVM_SUPPORT_CRC_H #define LLVM_SUPPORT_CRC_H -#include "llvm/ADT/StringRef.h" #include "llvm/Support/DataTypes.h" namespace llvm { -/// zlib independent CRC32 calculation. -uint32_t crc32(uint32_t CRC, StringRef S); +template class ArrayRef; + +// Compute the CRC-32 of Data. +uint32_t crc32(ArrayRef Data); + +// Compute the running CRC-32 of Data, with CRC being the previous value of the +// checksum. +uint32_t crc32(uint32_t CRC, ArrayRef Data); + +// Class for computing the JamCRC. +// +// We will use the "Rocksoft^tm Model CRC Algorithm" to describe the properties +// of this CRC: +// Width : 32 +// Poly : 04C11DB7 +// Init : FFFFFFFF +// RefIn : True +// RefOut : True +// XorOut : 00000000 +// Check : 340BC6D9 (result of CRC for "123456789") +// +// In other words, this is the same as CRC-32, except that XorOut is 0 instead +// of FFFFFFFF. +// +// N.B. We permit flexibility of the "Init" value. Some consumers of this need +// it to be zero. +class JamCRC { +public: + JamCRC(uint32_t Init = 0xFFFFFFFFU) : CRC(Init) {} + + // Update the CRC calculation with Data. 
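Putting the Automaton pieces above together: the transition tables are normally emitted by TableGen's -gen-automata backend, but a hand-written table of the same shape shows how the driver is used. The MyTransition record and the two-state DFA below are hypothetical.

#include "llvm/ADT/ArrayRef.h"
#include "llvm/Support/Automaton.h"
#include <cstdint>

// Hypothetical stand-in for a tblgen-emitted transition record; the field
// names are the ones the Automaton constructor reads.
struct MyTransition {
  uint64_t FromDfaState;
  uint64_t Action;       // the ActionT of the automaton
  uint64_t ToDfaState;
  unsigned InfoIdx;      // index into the transcription table (unused here)
};

// A two-state DFA over the actions {1, 2}: 1 --1--> 2 and 2 --2--> 1.
static const MyTransition Transitions[] = {
    {1, 1, 2, 0},
    {2, 2, 1, 0},
};

bool automatonSketch() {
  llvm::Automaton<uint64_t> A(llvm::makeArrayRef(Transitions));
  bool Ok = A.add(1);         // state 1 -> 2
  Ok &= A.add(2);             // state 2 -> 1
  bool Rejected = !A.add(2);  // no edge for action 2 from state 1
  A.reset();                  // back to the initial state
  return Ok && Rejected;
}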
+ void update(ArrayRef Data); + + uint32_t getCRC() const { return CRC; } + +private: + uint32_t CRC; +}; + } // end namespace llvm #endif diff --git a/include/llvm/Support/CommandLine.h b/include/llvm/Support/CommandLine.h index 3cc2c3c0121b..63784463e171 100644 --- a/include/llvm/Support/CommandLine.h +++ b/include/llvm/Support/CommandLine.h @@ -2000,6 +2000,9 @@ void ResetAllOptionOccurrences(); /// where no options are supported. void ResetCommandLineParser(); +/// Parses `Arg` into the option handler `Handler`. +bool ProvidePositionalOption(Option *Handler, StringRef Arg, int i); + } // end namespace cl } // end namespace llvm diff --git a/include/llvm/Support/Compiler.h b/include/llvm/Support/Compiler.h index 3f4f465f3960..cb7e57d4cd21 100644 --- a/include/llvm/Support/Compiler.h +++ b/include/llvm/Support/Compiler.h @@ -7,7 +7,8 @@ //===----------------------------------------------------------------------===// // // This file defines several macros, based on the current compiler. This allows -// use of compiler-specific features in a way that remains portable. +// use of compiler-specific features in a way that remains portable. This header +// can be included from either C or C++. // //===----------------------------------------------------------------------===// @@ -16,7 +17,9 @@ #include "llvm/Config/llvm-config.h" +#ifdef __cplusplus #include +#endif #include #if defined(_MSC_VER) @@ -35,14 +38,20 @@ # define __has_attribute(x) 0 #endif -#ifndef __has_cpp_attribute -# define __has_cpp_attribute(x) 0 -#endif - #ifndef __has_builtin # define __has_builtin(x) 0 #endif +// Only use __has_cpp_attribute in C++ mode. GCC defines __has_cpp_attribute in +// C mode, but the :: in __has_cpp_attribute(scoped::attribute) is invalid. +#ifndef LLVM_HAS_CPP_ATTRIBUTE +#if defined(__cplusplus) && defined(__has_cpp_attribute) +# define LLVM_HAS_CPP_ATTRIBUTE(x) __has_cpp_attribute(x) +#else +# define LLVM_HAS_CPP_ATTRIBUTE(x) 0 +#endif +#endif + /// \macro LLVM_GNUC_PREREQ /// Extend the default __GNUC_PREREQ even if glibc's features.h isn't /// available. @@ -62,13 +71,21 @@ /// \macro LLVM_MSC_PREREQ /// Is the compiler MSVC of at least the specified version? /// The common \param version values to check for are: -/// * 1900: Microsoft Visual Studio 2015 / 14.0 +/// * 1910: VS2017, version 15.1 & 15.2 +/// * 1911: VS2017, version 15.3 & 15.4 +/// * 1912: VS2017, version 15.5 +/// * 1913: VS2017, version 15.6 +/// * 1914: VS2017, version 15.7 +/// * 1915: VS2017, version 15.8 +/// * 1916: VS2017, version 15.9 +/// * 1920: VS2019, version 16.0 +/// * 1921: VS2019, version 16.1 #ifdef _MSC_VER #define LLVM_MSC_PREREQ(version) (_MSC_VER >= (version)) -// We require at least MSVC 2015. -#if !LLVM_MSC_PREREQ(1900) -#error LLVM requires at least MSVC 2015. +// We require at least MSVC 2017. +#if !LLVM_MSC_PREREQ(1910) +#error LLVM requires at least MSVC 2017. #endif #else @@ -120,14 +137,18 @@ #endif /// LLVM_NODISCARD - Warn if a type or return value is discarded. -#if __cplusplus > 201402L && __has_cpp_attribute(nodiscard) + +// Use the 'nodiscard' attribute in C++17 or newer mode. +#if __cplusplus > 201402L && LLVM_HAS_CPP_ATTRIBUTE(nodiscard) #define LLVM_NODISCARD [[nodiscard]] -#elif !__cplusplus -// Workaround for llvm.org/PR23435, since clang 3.6 and below emit a spurious -// error when __has_cpp_attribute is given a scoped attribute in C mode. 
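A minimal sketch of the reworked CRC interface above: the free crc32() functions now take a byte ArrayRef instead of a StringRef, the two-argument form threads a running checksum through successive chunks, and JamCRC accumulates the same polynomial without the final xor-out.

#include "llvm/ADT/ArrayRef.h"
#include "llvm/Support/CRC.h"
#include <cassert>
#include <cstdint>

uint32_t crcSketch() {
  const uint8_t Bytes[] = {'1', '2', '3', '4', '5', '6', '7', '8', '9'};

  // One-shot CRC-32; the check value for "123456789" is 0xCBF43926.
  uint32_t Whole = llvm::crc32(Bytes);

  // The same checksum built up chunk by chunk with the running form.
  uint32_t Running = llvm::crc32(0, llvm::makeArrayRef(Bytes, 4));
  Running = llvm::crc32(Running, llvm::makeArrayRef(Bytes + 4, 5));
  assert(Running == Whole && "chunked and one-shot results agree");
  (void)Whole; (void)Running;

  llvm::JamCRC J;     // Init defaults to 0xFFFFFFFF
  J.update(Bytes);
  return J.getCRC();  // 0x340BC6D9 for this input, per the comment above
}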
-#define LLVM_NODISCARD -#elif __has_cpp_attribute(clang::warn_unused_result) +#elif LLVM_HAS_CPP_ATTRIBUTE(clang::warn_unused_result) #define LLVM_NODISCARD [[clang::warn_unused_result]] +// Clang in C++14 mode claims that it has the 'nodiscard' attribute, but also +// warns in the pedantic mode that 'nodiscard' is a C++17 extension (PR33518). +// Use the 'nodiscard' attribute in C++14 mode only with GCC. +// TODO: remove this workaround when PR33518 is resolved. +#elif defined(__GNUC__) && LLVM_HAS_CPP_ATTRIBUTE(nodiscard) +#define LLVM_NODISCARD [[nodiscard]] #else #define LLVM_NODISCARD #endif @@ -139,7 +160,7 @@ // The clang-tidy check bugprone-use-after-move recognizes this attribute as a // marker that a moved-from object has left the indeterminate state and can be // reused. -#if __has_cpp_attribute(clang::reinitializes) +#if LLVM_HAS_CPP_ATTRIBUTE(clang::reinitializes) #define LLVM_ATTRIBUTE_REINITIALIZES [[clang::reinitializes]] #else #define LLVM_ATTRIBUTE_REINITIALIZES @@ -240,15 +261,13 @@ #endif /// LLVM_FALLTHROUGH - Mark fallthrough cases in switch statements. -#if __cplusplus > 201402L && __has_cpp_attribute(fallthrough) +#if __cplusplus > 201402L && LLVM_HAS_CPP_ATTRIBUTE(fallthrough) #define LLVM_FALLTHROUGH [[fallthrough]] -#elif __has_cpp_attribute(gnu::fallthrough) +#elif LLVM_HAS_CPP_ATTRIBUTE(gnu::fallthrough) #define LLVM_FALLTHROUGH [[gnu::fallthrough]] -#elif !__cplusplus -// Workaround for llvm.org/PR23435, since clang 3.6 and below emit a spurious -// error when __has_cpp_attribute is given a scoped attribute in C mode. -#define LLVM_FALLTHROUGH -#elif __has_cpp_attribute(clang::fallthrough) +#elif __has_attribute(fallthrough) +#define LLVM_FALLTHROUGH __attribute__((fallthrough)) +#elif LLVM_HAS_CPP_ATTRIBUTE(clang::fallthrough) #define LLVM_FALLTHROUGH [[clang::fallthrough]] #else #define LLVM_FALLTHROUGH @@ -256,7 +275,7 @@ /// LLVM_REQUIRE_CONSTANT_INITIALIZATION - Apply this to globals to ensure that /// they are constant initialized. -#if __has_cpp_attribute(clang::require_constant_initialization) +#if LLVM_HAS_CPP_ATTRIBUTE(clang::require_constant_initialization) #define LLVM_REQUIRE_CONSTANT_INITIALIZATION \ [[clang::require_constant_initialization]] #else @@ -338,14 +357,6 @@ # define LLVM_ASSUME_ALIGNED(p, a) (p) #endif -/// \macro LLVM_ALIGNAS -/// Used to specify a minimum alignment for a structure or variable. -#if __GNUC__ && !__has_feature(cxx_alignas) && !LLVM_GNUC_PREREQ(4, 8, 1) -# define LLVM_ALIGNAS(x) __attribute__((aligned(x))) -#else -# define LLVM_ALIGNAS(x) alignas(x) -#endif - /// \macro LLVM_PACKED /// Used to specify a packed structure. /// LLVM_PACKED( @@ -376,8 +387,8 @@ /// \macro LLVM_PTR_SIZE /// A constant integer equivalent to the value of sizeof(void*). -/// Generally used in combination with LLVM_ALIGNAS or when doing computation in -/// the preprocessor. +/// Generally used in combination with alignas or when doing computation in the +/// preprocessor. #ifdef __SIZEOF_POINTER__ # define LLVM_PTR_SIZE __SIZEOF_POINTER__ #elif defined(_WIN64) @@ -527,6 +538,7 @@ void AnnotateIgnoreWritesEnd(const char *file, int line); #define LLVM_ENABLE_EXCEPTIONS 1 #endif +#ifdef __cplusplus namespace llvm { /// Allocate a buffer of memory with the given size and alignment. 
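A minimal sketch of the attribute macros touched above, which now route through LLVM_HAS_CPP_ATTRIBUTE internally; with LLVM_ALIGNAS removed, the standard alignas keyword is written directly.

#include "llvm/Support/Compiler.h"

// Callers should not ignore the result.
LLVM_NODISCARD bool tryReserve(unsigned N) { return N < 1024; }

// LLVM_ALIGNAS(16) is gone; use the language keyword.
struct alignas(16) Vec4 {
  float Lanes[4];
};

int classify(int Kind, int &Count) {
  switch (Kind) {
  case 0:
    ++Count; // do some work, then deliberately fall through
    LLVM_FALLTHROUGH;
  case 1:
    return Count;
  default:
    return 0;
  }
}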
@@ -569,4 +581,5 @@ inline void deallocate_buffer(void *Ptr, size_t Size, size_t Alignment) { } // End namespace llvm +#endif // __cplusplus #endif diff --git a/include/llvm/Support/DataExtractor.h b/include/llvm/Support/DataExtractor.h index 6b08a2a2a445..f590a1e104fb 100644 --- a/include/llvm/Support/DataExtractor.h +++ b/include/llvm/Support/DataExtractor.h @@ -11,6 +11,7 @@ #include "llvm/ADT/StringRef.h" #include "llvm/Support/DataTypes.h" +#include "llvm/Support/Error.h" namespace llvm { @@ -42,6 +43,38 @@ class DataExtractor { uint8_t IsLittleEndian; uint8_t AddressSize; public: + /// A class representing a position in a DataExtractor, as well as any error + /// encountered during extraction. It enables one to extract a sequence of + /// values without error-checking and then checking for errors in bulk at the + /// end. The class holds an Error object, so failing to check the result of + /// the parse will result in a runtime error. The error flag is sticky and + /// will cause all subsequent extraction functions to fail without even + /// attempting to parse and without updating the Cursor offset. After clearing + /// the error flag, one can again use the Cursor object for parsing. + class Cursor { + uint64_t Offset; + Error Err; + + friend class DataExtractor; + + public: + /// Construct a cursor for extraction from the given offset. + explicit Cursor(uint64_t Offset) : Offset(Offset), Err(Error::success()) {} + + /// Checks whether the cursor is valid (i.e. no errors were encountered). In + /// case of errors, this does not clear the error flag -- one must call + /// takeError() instead. + explicit operator bool() { return !Err; } + + /// Return the current position of this Cursor. In the error state this is + /// the position of the Cursor before the first error was encountered. + uint64_t tell() const { return Offset; } + + /// Return error contained inside this Cursor, if any. Clears the internal + /// Cursor state. + Error takeError() { return std::move(Err); } + }; + /// Construct with a buffer that is owned by the caller. /// /// This constructor allows us to use data that is owned by the @@ -49,6 +82,11 @@ public: /// valid. DataExtractor(StringRef Data, bool IsLittleEndian, uint8_t AddressSize) : Data(Data), IsLittleEndian(IsLittleEndian), AddressSize(AddressSize) {} + DataExtractor(ArrayRef Data, bool IsLittleEndian, + uint8_t AddressSize) + : Data(StringRef(reinterpret_cast(Data.data()), + Data.size())), + IsLittleEndian(IsLittleEndian), AddressSize(AddressSize) {} /// Get the data pointed to by this extractor. StringRef getData() const { return Data; } @@ -79,17 +117,17 @@ public: /// pointed to by \a offset_ptr is out of bounds, or if the /// offset plus the length of the C string is out of bounds, /// NULL will be returned. - const char *getCStr(uint32_t *offset_ptr) const; + const char *getCStr(uint64_t *offset_ptr) const; - /// Extract a C string from \a *OffsetPtr. + /// Extract a C string from \a *offset_ptr. /// /// Returns a StringRef for the C String from the data at the offset - /// pointed to by \a OffsetPtr. A variable length NULL terminated C - /// string will be extracted and the \a OffsetPtr will be + /// pointed to by \a offset_ptr. A variable length NULL terminated C + /// string will be extracted and the \a offset_ptr will be /// updated with the offset of the byte that follows the NULL /// terminator byte. 
/// - /// \param[in,out] OffsetPtr + /// \param[in,out] offset_ptr /// A pointer to an offset within the data that will be advanced /// by the appropriate number of bytes if the value is extracted /// correctly. If the offset is out of bounds or there are not @@ -98,10 +136,10 @@ public: /// /// \return /// A StringRef for the C string value in the data. If the offset - /// pointed to by \a OffsetPtr is out of bounds, or if the + /// pointed to by \a offset_ptr is out of bounds, or if the /// offset plus the length of the C string is out of bounds, /// a default-initialized StringRef will be returned. - StringRef getCStrRef(uint32_t *OffsetPtr) const; + StringRef getCStrRef(uint64_t *offset_ptr) const; /// Extract an unsigned integer of size \a byte_size from \a /// *offset_ptr. @@ -124,10 +162,24 @@ public: /// @param[in] byte_size /// The size in byte of the integer to extract. /// + /// @param[in,out] Err + /// A pointer to an Error object. Upon return the Error object is set to + /// indicate the result (success/failure) of the function. If the Error + /// object is already set when calling this function, no extraction is + /// performed. + /// /// @return /// The unsigned integer value that was extracted, or zero on /// failure. - uint64_t getUnsigned(uint32_t *offset_ptr, uint32_t byte_size) const; + uint64_t getUnsigned(uint64_t *offset_ptr, uint32_t byte_size, + Error *Err = nullptr) const; + + /// Extract an unsigned integer of the given size from the location given by + /// the cursor. In case of an extraction error, or if the cursor is already in + /// an error state, zero is returned. + uint64_t getUnsigned(Cursor &C, uint32_t Size) const { + return getUnsigned(&C.Offset, Size, &C.Err); + } /// Extract an signed integer of size \a byte_size from \a *offset_ptr. /// @@ -152,7 +204,7 @@ public: /// @return /// The sign extended signed integer value that was extracted, /// or zero on failure. - int64_t getSigned(uint32_t *offset_ptr, uint32_t size) const; + int64_t getSigned(uint64_t *offset_ptr, uint32_t size) const; //------------------------------------------------------------------ /// Extract an pointer from \a *offset_ptr. @@ -171,10 +223,15 @@ public: /// /// @return /// The extracted pointer value as a 64 integer. - uint64_t getAddress(uint32_t *offset_ptr) const { + uint64_t getAddress(uint64_t *offset_ptr) const { return getUnsigned(offset_ptr, AddressSize); } + /// Extract a pointer-sized unsigned integer from the location given by the + /// cursor. In case of an extraction error, or if the cursor is already in + /// an error state, zero is returned. + uint64_t getAddress(Cursor &C) const { return getUnsigned(C, AddressSize); } + /// Extract a uint8_t value from \a *offset_ptr. /// /// Extract a single uint8_t from the binary data at the offset @@ -187,9 +244,20 @@ public: /// enough bytes to extract this value, the offset will be left /// unmodified. /// + /// @param[in,out] Err + /// A pointer to an Error object. Upon return the Error object is set to + /// indicate the result (success/failure) of the function. If the Error + /// object is already set when calling this function, no extraction is + /// performed. + /// /// @return /// The extracted uint8_t value. - uint8_t getU8(uint32_t *offset_ptr) const; + uint8_t getU8(uint64_t *offset_ptr, Error *Err = nullptr) const; + + /// Extract a single uint8_t value from the location given by the cursor. In + /// case of an extraction error, or if the cursor is already in an error + /// state, zero is returned. 
+ uint8_t getU8(Cursor &C) const { return getU8(&C.Offset, &C.Err); } /// Extract \a count uint8_t values from \a *offset_ptr. /// @@ -214,7 +282,27 @@ public: /// @return /// \a dst if all values were properly extracted and copied, /// NULL otherise. - uint8_t *getU8(uint32_t *offset_ptr, uint8_t *dst, uint32_t count) const; + uint8_t *getU8(uint64_t *offset_ptr, uint8_t *dst, uint32_t count) const; + + /// Extract \a Count uint8_t values from the location given by the cursor and + /// store them into the destination buffer. In case of an extraction error, or + /// if the cursor is already in an error state, a nullptr is returned and the + /// destination buffer is left unchanged. + uint8_t *getU8(Cursor &C, uint8_t *Dst, uint32_t Count) const; + + /// Extract \a Count uint8_t values from the location given by the cursor and + /// store them into the destination vector. The vector is resized to fit the + /// extracted data. In case of an extraction error, or if the cursor is + /// already in an error state, the destination vector is left unchanged and + /// cursor is placed into an error state. + void getU8(Cursor &C, SmallVectorImpl &Dst, uint32_t Count) const { + if (isValidOffsetForDataOfSize(C.Offset, Count)) + Dst.resize(Count); + + // This relies on the fact that getU8 will not attempt to write to the + // buffer if isValidOffsetForDataOfSize(C.Offset, Count) is false. + getU8(C, Dst.data(), Count); + } //------------------------------------------------------------------ /// Extract a uint16_t value from \a *offset_ptr. @@ -229,10 +317,21 @@ public: /// enough bytes to extract this value, the offset will be left /// unmodified. /// + /// @param[in,out] Err + /// A pointer to an Error object. Upon return the Error object is set to + /// indicate the result (success/failure) of the function. If the Error + /// object is already set when calling this function, no extraction is + /// performed. + /// /// @return /// The extracted uint16_t value. //------------------------------------------------------------------ - uint16_t getU16(uint32_t *offset_ptr) const; + uint16_t getU16(uint64_t *offset_ptr, Error *Err = nullptr) const; + + /// Extract a single uint16_t value from the location given by the cursor. In + /// case of an extraction error, or if the cursor is already in an error + /// state, zero is returned. + uint16_t getU16(Cursor &C) const { return getU16(&C.Offset, &C.Err); } /// Extract \a count uint16_t values from \a *offset_ptr. /// @@ -257,7 +356,7 @@ public: /// @return /// \a dst if all values were properly extracted and copied, /// NULL otherise. - uint16_t *getU16(uint32_t *offset_ptr, uint16_t *dst, uint32_t count) const; + uint16_t *getU16(uint64_t *offset_ptr, uint16_t *dst, uint32_t count) const; /// Extract a 24-bit unsigned value from \a *offset_ptr and return it /// in a uint32_t. @@ -274,7 +373,7 @@ public: /// /// @return /// The extracted 24-bit value represented in a uint32_t. - uint32_t getU24(uint32_t *offset_ptr) const; + uint32_t getU24(uint64_t *offset_ptr) const; /// Extract a uint32_t value from \a *offset_ptr. /// @@ -288,9 +387,20 @@ public: /// enough bytes to extract this value, the offset will be left /// unmodified. /// + /// @param[in,out] Err + /// A pointer to an Error object. Upon return the Error object is set to + /// indicate the result (success/failure) of the function. If the Error + /// object is already set when calling this function, no extraction is + /// performed. + /// /// @return /// The extracted uint32_t value. 
- uint32_t getU32(uint32_t *offset_ptr) const; + uint32_t getU32(uint64_t *offset_ptr, Error *Err = nullptr) const; + + /// Extract a single uint32_t value from the location given by the cursor. In + /// case of an extraction error, or if the cursor is already in an error + /// state, zero is returned. + uint32_t getU32(Cursor &C) const { return getU32(&C.Offset, &C.Err); } /// Extract \a count uint32_t values from \a *offset_ptr. /// @@ -315,7 +425,7 @@ public: /// @return /// \a dst if all values were properly extracted and copied, /// NULL otherise. - uint32_t *getU32(uint32_t *offset_ptr, uint32_t *dst, uint32_t count) const; + uint32_t *getU32(uint64_t *offset_ptr, uint32_t *dst, uint32_t count) const; /// Extract a uint64_t value from \a *offset_ptr. /// @@ -329,9 +439,20 @@ public: /// enough bytes to extract this value, the offset will be left /// unmodified. /// + /// @param[in,out] Err + /// A pointer to an Error object. Upon return the Error object is set to + /// indicate the result (success/failure) of the function. If the Error + /// object is already set when calling this function, no extraction is + /// performed. + /// /// @return /// The extracted uint64_t value. - uint64_t getU64(uint32_t *offset_ptr) const; + uint64_t getU64(uint64_t *offset_ptr, Error *Err = nullptr) const; + + /// Extract a single uint64_t value from the location given by the cursor. In + /// case of an extraction error, or if the cursor is already in an error + /// state, zero is returned. + uint64_t getU64(Cursor &C) const { return getU64(&C.Offset, &C.Err); } /// Extract \a count uint64_t values from \a *offset_ptr. /// @@ -356,7 +477,7 @@ public: /// @return /// \a dst if all values were properly extracted and copied, /// NULL otherise. - uint64_t *getU64(uint32_t *offset_ptr, uint64_t *dst, uint32_t count) const; + uint64_t *getU64(uint64_t *offset_ptr, uint64_t *dst, uint32_t count) const; /// Extract a signed LEB128 value from \a *offset_ptr. /// @@ -374,7 +495,7 @@ public: /// /// @return /// The extracted signed integer value. - int64_t getSLEB128(uint32_t *offset_ptr) const; + int64_t getSLEB128(uint64_t *offset_ptr) const; /// Extract a unsigned LEB128 value from \a *offset_ptr. /// @@ -390,23 +511,44 @@ public: /// enough bytes to extract this value, the offset will be left /// unmodified. /// + /// @param[in,out] Err + /// A pointer to an Error object. Upon return the Error object is set to + /// indicate the result (success/failure) of the function. If the Error + /// object is already set when calling this function, no extraction is + /// performed. + /// /// @return /// The extracted unsigned integer value. - uint64_t getULEB128(uint32_t *offset_ptr) const; + uint64_t getULEB128(uint64_t *offset_ptr, llvm::Error *Err = nullptr) const; + + /// Extract an unsigned ULEB128 value from the location given by the cursor. + /// In case of an extraction error, or if the cursor is already in an error + /// state, zero is returned. + uint64_t getULEB128(Cursor &C) const { return getULEB128(&C.Offset, &C.Err); } + + /// Advance the Cursor position by the given number of bytes. No-op if the + /// cursor is in an error state. + void skip(Cursor &C, uint64_t Length) const; + + /// Return true iff the cursor is at the end of the buffer, regardless of the + /// error state of the cursor. The only way both eof and error states can be + /// true is if one attempts a read while the cursor is at the very end of the + /// data buffer. 
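A minimal sketch of the widened 64-bit offsets and the new Cursor-based error handling described above: several values are extracted without intermediate checks, and the sticky error is examined once at the end. The record layout parsed here is hypothetical.

#include "llvm/ADT/StringRef.h"
#include "llvm/Support/DataExtractor.h"
#include "llvm/Support/Error.h"
#include <cstdint>

// Parses a hypothetical little-endian record: u16 tag, u32 length, u8 kind.
bool parseRecord(llvm::StringRef Bytes) {
  llvm::DataExtractor DE(Bytes, /*IsLittleEndian=*/true, /*AddressSize=*/8);

  // Offset-pointer style, now with 64-bit offsets.
  uint64_t Offset = 0;
  uint16_t Tag = DE.getU16(&Offset);
  (void)Tag;

  // Cursor style: the cursor carries both the offset and a sticky Error.
  llvm::DataExtractor::Cursor C(0);
  uint16_t Tag2 = DE.getU16(C);
  uint32_t Length = DE.getU32(C);
  uint8_t Kind = DE.getU8(C);
  (void)Tag2; (void)Length; (void)Kind;

  if (!C) {
    // The first failure was recorded in the cursor; consume or report it.
    llvm::consumeError(C.takeError());
    return false;
  }
  return true; // testing the cursor above also marks the success state checked
}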
+ bool eof(const Cursor &C) const { return Data.size() == C.Offset; } /// Test the validity of \a offset. /// /// @return /// \b true if \a offset is a valid offset into the data in this /// object, \b false otherwise. - bool isValidOffset(uint32_t offset) const { return Data.size() > offset; } + bool isValidOffset(uint64_t offset) const { return Data.size() > offset; } /// Test the availability of \a length bytes of data from \a offset. /// /// @return /// \b true if \a offset is a valid offset and there are \a /// length bytes available at that offset, \b false otherwise. - bool isValidOffsetForDataOfSize(uint32_t offset, uint32_t length) const { + bool isValidOffsetForDataOfSize(uint64_t offset, uint64_t length) const { return offset + length >= offset && isValidOffset(offset + length - 1); } @@ -417,9 +559,15 @@ public: /// \b true if \a offset is a valid offset and there are enough /// bytes for a pointer available at that offset, \b false /// otherwise. - bool isValidOffsetForAddress(uint32_t offset) const { + bool isValidOffsetForAddress(uint64_t offset) const { return isValidOffsetForDataOfSize(offset, AddressSize); } + +protected: + // Make it possible for subclasses to access these fields without making them + // public. + static uint64_t &getOffset(Cursor &C) { return C.Offset; } + static Error &getError(Cursor &C) { return C.Err; } }; } // namespace llvm diff --git a/include/llvm/Support/Endian.h b/include/llvm/Support/Endian.h index d8be94427d7e..87aecedd3a4b 100644 --- a/include/llvm/Support/Endian.h +++ b/include/llvm/Support/Endian.h @@ -203,9 +203,8 @@ inline void writeAtBitAlignment(void *memory, value_type value, namespace detail { -template +template ::value> struct packed_endian_specific_integral { using value_type = ValueType; static constexpr endianness endian = Endian; @@ -246,8 +245,9 @@ struct packed_endian_specific_integral { } private: - AlignedCharArray::value, - sizeof(value_type)> Value; + struct { + alignas(ALIGN) char buffer[sizeof(value_type)]; + } Value; public: struct ref { diff --git a/include/llvm/Support/Error.h b/include/llvm/Support/Error.h index 299fce7a1368..350877a219bf 100644 --- a/include/llvm/Support/Error.h +++ b/include/llvm/Support/Error.h @@ -328,7 +328,7 @@ inline ErrorSuccess Error::success() { return ErrorSuccess(); } /// Make a Error instance representing failure using the given error info /// type. template Error make_error(ArgTs &&... Args) { - return Error(llvm::make_unique(std::forward(Args)...)); + return Error(std::make_unique(std::forward(Args)...)); } /// Base class for user error types. Users should declare their error types @@ -548,7 +548,7 @@ public: /// Take ownership of the stored error. /// After calling this the Expected is in an indeterminate state that can /// only be safely destructed. No further calls (beside the destructor) should - /// be made on the Expected vaule. + /// be made on the Expected value. 
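The Endian.h hunk above only swaps the storage of packed_endian_specific_integral from AlignedCharArray to a plain alignas buffer; usage through the existing aliases such as llvm::support::ulittle32_t (assumed here to be the unaligned little-endian variant, as in prior releases) is unchanged. A minimal sketch:

#include "llvm/Support/Endian.h"
#include <cstdint>

uint32_t readLittle32(const uint8_t *Bytes) {
  // The packed integral reads with the declared byte order and alignment
  // regardless of the host, so {0x78, 0x56, 0x34, 0x12} yields 0x12345678.
  return *reinterpret_cast<const llvm::support::ulittle32_t *>(Bytes);
}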
Error takeError() { #if LLVM_ENABLE_ABI_BREAKING_CHECKS Unchecked = false; @@ -704,6 +704,12 @@ inline void cantFail(Error Err, const char *Msg = nullptr) { if (Err) { if (!Msg) Msg = "Failure value returned from cantFail wrapped call"; +#ifndef NDEBUG + std::string Str; + raw_string_ostream OS(Str); + OS << Msg << "\n" << Err; + Msg = OS.str().c_str(); +#endif llvm_unreachable(Msg); } } @@ -728,6 +734,13 @@ T cantFail(Expected ValOrErr, const char *Msg = nullptr) { else { if (!Msg) Msg = "Failure value returned from cantFail wrapped call"; +#ifndef NDEBUG + std::string Str; + raw_string_ostream OS(Str); + auto E = ValOrErr.takeError(); + OS << Msg << "\n" << E; + Msg = OS.str().c_str(); +#endif llvm_unreachable(Msg); } } @@ -752,6 +765,13 @@ T& cantFail(Expected ValOrErr, const char *Msg = nullptr) { else { if (!Msg) Msg = "Failure value returned from cantFail wrapped call"; +#ifndef NDEBUG + std::string Str; + raw_string_ostream OS(Str); + auto E = ValOrErr.takeError(); + OS << Msg << "\n" << E; + Msg = OS.str().c_str(); +#endif llvm_unreachable(Msg); } } @@ -982,6 +1002,20 @@ inline void consumeError(Error Err) { handleAllErrors(std::move(Err), [](const ErrorInfoBase &) {}); } +/// Convert an Expected to an Optional without doing anything. This method +/// should be used only where an error can be considered a reasonable and +/// expected return value. +/// +/// Uses of this method are potentially indicative of problems: perhaps the +/// error should be propagated further, or the error-producer should just +/// return an Optional in the first place. +template Optional expectedToOptional(Expected &&E) { + if (E) + return std::move(*E); + consumeError(E.takeError()); + return None; +} + /// Helper for converting an Error to a bool. /// /// This method returns true if Err is in an error state, or false if it is @@ -1170,6 +1204,10 @@ inline Error createStringError(std::error_code EC, char const *Fmt, Error createStringError(std::error_code EC, char const *Msg); +inline Error createStringError(std::error_code EC, const Twine &S) { + return createStringError(EC, S.str().c_str()); +} + template inline Error createStringError(std::errc EC, char const *Fmt, const Ts &... Vals) { diff --git a/include/llvm/Support/FileCheck.h b/include/llvm/Support/FileCheck.h index 0cd25a71a3b3..2547449246a8 100644 --- a/include/llvm/Support/FileCheck.h +++ b/include/llvm/Support/FileCheck.h @@ -13,12 +13,12 @@ #ifndef LLVM_SUPPORT_FILECHECK_H #define LLVM_SUPPORT_FILECHECK_H -#include "llvm/ADT/StringMap.h" +#include "llvm/ADT/StringRef.h" #include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/Regex.h" #include "llvm/Support/SourceMgr.h" +#include #include -#include namespace llvm { @@ -30,6 +30,7 @@ struct FileCheckRequest { std::vector GlobalDefines; bool AllowEmptyInput = false; bool MatchFullLines = false; + bool IgnoreCase = false; bool EnableVarScope = false; bool AllowDeprecatedDagOverlap = false; bool Verbose = false; @@ -37,217 +38,7 @@ struct FileCheckRequest { }; //===----------------------------------------------------------------------===// -// Numeric substitution handling code. -//===----------------------------------------------------------------------===// - -/// Base class representing the AST of a given expression. -class FileCheckExpressionAST { -public: - virtual ~FileCheckExpressionAST() = default; - - /// Evaluates and \returns the value of the expression represented by this - /// AST or an error if evaluation fails. 
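A minimal sketch of the two small Error.h additions above: createStringError() now accepts a Twine directly, and expectedToOptional() consumes the error when failure is an acceptable outcome. The parseLevel() helper is hypothetical.

#include "llvm/ADT/Optional.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Support/Error.h"

// Hypothetical parser used only to demonstrate the helpers.
llvm::Expected<int> parseLevel(llvm::StringRef S) {
  if (S == "debug")
    return 3;
  // The Twine overload avoids going through a format string.
  return llvm::createStringError(llvm::inconvertibleErrorCode(),
                                 "unknown level: " + S);
}

int levelOrDefault(llvm::StringRef S) {
  // Any error returned by parseLevel is consumed inside expectedToOptional.
  llvm::Optional<int> L = llvm::expectedToOptional(parseLevel(S));
  return L ? *L : 0;
}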
- virtual Expected eval() const = 0; -}; - -/// Class representing an unsigned literal in the AST of an expression. -class FileCheckExpressionLiteral : public FileCheckExpressionAST { -private: - /// Actual value of the literal. - uint64_t Value; - -public: - /// Constructs a literal with the specified value. - FileCheckExpressionLiteral(uint64_t Val) : Value(Val) {} - - /// \returns the literal's value. - Expected eval() const { return Value; } -}; - -/// Class to represent an undefined variable error, which quotes that -/// variable's name when printed. -class FileCheckUndefVarError : public ErrorInfo { -private: - StringRef VarName; - -public: - static char ID; - - FileCheckUndefVarError(StringRef VarName) : VarName(VarName) {} - - StringRef getVarName() const { return VarName; } - - std::error_code convertToErrorCode() const override { - return inconvertibleErrorCode(); - } - - /// Print name of variable associated with this error. - void log(raw_ostream &OS) const override { - OS << "\""; - OS.write_escaped(VarName) << "\""; - } -}; - -/// Class representing a numeric variable and its associated current value. -class FileCheckNumericVariable { -private: - /// Name of the numeric variable. - StringRef Name; - - /// Value of numeric variable, if defined, or None otherwise. - Optional Value; - - /// Line number where this variable is defined, or None if defined before - /// input is parsed. Used to determine whether a variable is defined on the - /// same line as a given use. - Optional DefLineNumber; - -public: - /// Constructor for a variable \p Name defined at line \p DefLineNumber or - /// defined before input is parsed if DefLineNumber is None. - FileCheckNumericVariable(StringRef Name, - Optional DefLineNumber = None) - : Name(Name), DefLineNumber(DefLineNumber) {} - - /// \returns name of this numeric variable. - StringRef getName() const { return Name; } - - /// \returns this variable's value. - Optional getValue() const { return Value; } - - /// Sets value of this numeric variable, if undefined. Triggers an assertion - /// failure if the variable is actually defined. - void setValue(uint64_t Value); - - /// Clears value of this numeric variable, regardless of whether it is - /// currently defined or not. - void clearValue(); - - /// \returns the line number where this variable is defined, if any, or None - /// if defined before input is parsed. - Optional getDefLineNumber() { return DefLineNumber; } -}; - -/// Class representing the use of a numeric variable in the AST of an -/// expression. -class FileCheckNumericVariableUse : public FileCheckExpressionAST { -private: - /// Name of the numeric variable. - StringRef Name; - - /// Pointer to the class instance for the variable this use is about. - FileCheckNumericVariable *NumericVariable; - -public: - FileCheckNumericVariableUse(StringRef Name, - FileCheckNumericVariable *NumericVariable) - : Name(Name), NumericVariable(NumericVariable) {} - - /// \returns the value of the variable referenced by this instance. - Expected eval() const; -}; - -/// Type of functions evaluating a given binary operation. -using binop_eval_t = uint64_t (*)(uint64_t, uint64_t); - -/// Class representing a single binary operation in the AST of an expression. -class FileCheckASTBinop : public FileCheckExpressionAST { -private: - /// Left operand. - std::unique_ptr LeftOperand; - - /// Right operand. - std::unique_ptr RightOperand; - - /// Pointer to function that can evaluate this binary operation. 
- binop_eval_t EvalBinop; - -public: - FileCheckASTBinop(binop_eval_t EvalBinop, - std::unique_ptr LeftOp, - std::unique_ptr RightOp) - : EvalBinop(EvalBinop) { - LeftOperand = std::move(LeftOp); - RightOperand = std::move(RightOp); - } - - /// Evaluates the value of the binary operation represented by this AST, - /// using EvalBinop on the result of recursively evaluating the operands. - /// \returns the expression value or an error if an undefined numeric - /// variable is used in one of the operands. - Expected eval() const; -}; - -class FileCheckPatternContext; - -/// Class representing a substitution to perform in the RegExStr string. -class FileCheckSubstitution { -protected: - /// Pointer to a class instance holding, among other things, the table with - /// the values of live string variables at the start of any given CHECK line. - /// Used for substituting string variables with the text they were defined - /// as. Expressions are linked to the numeric variables they use at - /// parse time and directly access the value of the numeric variable to - /// evaluate their value. - FileCheckPatternContext *Context; - - /// The string that needs to be substituted for something else. For a - /// string variable this is its name, otherwise this is the whole expression. - StringRef FromStr; - - // Index in RegExStr of where to do the substitution. - size_t InsertIdx; - -public: - FileCheckSubstitution(FileCheckPatternContext *Context, StringRef VarName, - size_t InsertIdx) - : Context(Context), FromStr(VarName), InsertIdx(InsertIdx) {} - - virtual ~FileCheckSubstitution() = default; - - /// \returns the string to be substituted for something else. - StringRef getFromString() const { return FromStr; } - - /// \returns the index where the substitution is to be performed in RegExStr. - size_t getIndex() const { return InsertIdx; } - - /// \returns a string containing the result of the substitution represented - /// by this class instance or an error if substitution failed. - virtual Expected getResult() const = 0; -}; - -class FileCheckStringSubstitution : public FileCheckSubstitution { -public: - FileCheckStringSubstitution(FileCheckPatternContext *Context, - StringRef VarName, size_t InsertIdx) - : FileCheckSubstitution(Context, VarName, InsertIdx) {} - - /// \returns the text that the string variable in this substitution matched - /// when defined, or an error if the variable is undefined. - Expected getResult() const override; -}; - -class FileCheckNumericSubstitution : public FileCheckSubstitution { -private: - /// Pointer to the class representing the expression whose value is to be - /// substituted. - std::unique_ptr ExpressionAST; - -public: - FileCheckNumericSubstitution(FileCheckPatternContext *Context, StringRef Expr, - std::unique_ptr ExprAST, - size_t InsertIdx) - : FileCheckSubstitution(Context, Expr, InsertIdx) { - ExpressionAST = std::move(ExprAST); - } - - /// \returns a string containing the result of evaluating the expression in - /// this substitution, or an error if evaluation failed. - Expected getResult() const override; -}; - -//===----------------------------------------------------------------------===// -// Pattern handling code. +// Summary of a FileCheck diagnostic. 
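The expression-AST classes removed above leave this public header, but the design they describe is easiest to see with a tiny sketch against the pre-change declarations: a numeric substitution such as [[#N+1]] is, conceptually, a binop node over a variable use and a literal. The uint64_t template arguments and the add helper are reconstructions for illustration only.

#include "llvm/Support/FileCheck.h" // pre-change header
#include <memory>
#include <utility>

static uint64_t add(uint64_t L, uint64_t R) { return L + R; }

static void sketchNumericSubstitutionAST(llvm::FileCheckNumericVariable &N) {
  auto Use =
      std::make_unique<llvm::FileCheckNumericVariableUse>(N.getName(), &N);
  auto One = std::make_unique<llvm::FileCheckExpressionLiteral>(1);
  llvm::FileCheckASTBinop Plus(add, std::move(Use), std::move(One));

  if (llvm::Expected<uint64_t> V = Plus.eval())
    (void)*V;                          // N's current value plus one
  else
    llvm::consumeError(V.takeError()); // N is undefined on this line
}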
//===----------------------------------------------------------------------===// namespace Check { @@ -291,325 +82,6 @@ public: }; } // namespace Check -struct FileCheckDiag; - -/// Class holding the FileCheckPattern global state, shared by all patterns: -/// tables holding values of variables and whether they are defined or not at -/// any given time in the matching process. -class FileCheckPatternContext { - friend class FileCheckPattern; - -private: - /// When matching a given pattern, this holds the value of all the string - /// variables defined in previous patterns. In a pattern, only the last - /// definition for a given variable is recorded in this table. - /// Back-references are used for uses after any the other definition. - StringMap GlobalVariableTable; - - /// Map of all string variables defined so far. Used at parse time to detect - /// a name conflict between a numeric variable and a string variable when - /// the former is defined on a later line than the latter. - StringMap DefinedVariableTable; - - /// When matching a given pattern, this holds the pointers to the classes - /// representing the numeric variables defined in previous patterns. When - /// matching a pattern all definitions for that pattern are recorded in the - /// NumericVariableDefs table in the FileCheckPattern instance of that - /// pattern. - StringMap GlobalNumericVariableTable; - - /// Pointer to the class instance representing the @LINE pseudo variable for - /// easily updating its value. - FileCheckNumericVariable *LineVariable = nullptr; - - /// Vector holding pointers to all parsed numeric variables. Used to - /// automatically free them once they are guaranteed to no longer be used. - std::vector> NumericVariables; - - /// Vector holding pointers to all substitutions. Used to automatically free - /// them once they are guaranteed to no longer be used. - std::vector> Substitutions; - -public: - /// \returns the value of string variable \p VarName or an error if no such - /// variable has been defined. - Expected getPatternVarValue(StringRef VarName); - - /// Defines string and numeric variables from definitions given on the - /// command line, passed as a vector of [#]VAR=VAL strings in - /// \p CmdlineDefines. \returns an error list containing diagnostics against - /// \p SM for all definition parsing failures, if any, or Success otherwise. - Error defineCmdlineVariables(std::vector &CmdlineDefines, - SourceMgr &SM); - - /// Create @LINE pseudo variable. Value is set when pattern are being - /// matched. - void createLineVariable(); - - /// Undefines local variables (variables whose name does not start with a '$' - /// sign), i.e. removes them from GlobalVariableTable and from - /// GlobalNumericVariableTable and also clears the value of numeric - /// variables. - void clearLocalVars(); - -private: - /// Makes a new numeric variable and registers it for destruction when the - /// context is destroyed. - template - FileCheckNumericVariable *makeNumericVariable(Types... args); - - /// Makes a new string substitution and registers it for destruction when the - /// context is destroyed. - FileCheckSubstitution *makeStringSubstitution(StringRef VarName, - size_t InsertIdx); - - /// Makes a new numeric substitution and registers it for destruction when - /// the context is destroyed. 
- FileCheckSubstitution * - makeNumericSubstitution(StringRef ExpressionStr, - std::unique_ptr ExpressionAST, - size_t InsertIdx); -}; - -/// Class to represent an error holding a diagnostic with location information -/// used when printing it. -class FileCheckErrorDiagnostic : public ErrorInfo { -private: - SMDiagnostic Diagnostic; - -public: - static char ID; - - FileCheckErrorDiagnostic(SMDiagnostic &&Diag) : Diagnostic(Diag) {} - - std::error_code convertToErrorCode() const override { - return inconvertibleErrorCode(); - } - - /// Print diagnostic associated with this error when printing the error. - void log(raw_ostream &OS) const override { Diagnostic.print(nullptr, OS); } - - static Error get(const SourceMgr &SM, SMLoc Loc, const Twine &ErrMsg) { - return make_error( - SM.GetMessage(Loc, SourceMgr::DK_Error, ErrMsg)); - } - - static Error get(const SourceMgr &SM, StringRef Buffer, const Twine &ErrMsg) { - return get(SM, SMLoc::getFromPointer(Buffer.data()), ErrMsg); - } -}; - -class FileCheckNotFoundError : public ErrorInfo { -public: - static char ID; - - std::error_code convertToErrorCode() const override { - return inconvertibleErrorCode(); - } - - /// Print diagnostic associated with this error when printing the error. - void log(raw_ostream &OS) const override { - OS << "String not found in input"; - } -}; - -class FileCheckPattern { - SMLoc PatternLoc; - - /// A fixed string to match as the pattern or empty if this pattern requires - /// a regex match. - StringRef FixedStr; - - /// A regex string to match as the pattern or empty if this pattern requires - /// a fixed string to match. - std::string RegExStr; - - /// Entries in this vector represent a substitution of a string variable or - /// an expression in the RegExStr regex at match time. For example, in the - /// case of a CHECK directive with the pattern "foo[[bar]]baz[[#N+1]]", - /// RegExStr will contain "foobaz" and we'll get two entries in this vector - /// that tells us to insert the value of string variable "bar" at offset 3 - /// and the value of expression "N+1" at offset 6. - std::vector Substitutions; - - /// Maps names of string variables defined in a pattern to the number of - /// their parenthesis group in RegExStr capturing their last definition. - /// - /// E.g. for the pattern "foo[[bar:.*]]baz([[bar]][[QUUX]][[bar:.*]])", - /// RegExStr will be "foo(.*)baz(\1(.*))" where is - /// the value captured for QUUX on the earlier line where it was defined, and - /// VariableDefs will map "bar" to the third parenthesis group which captures - /// the second definition of "bar". - /// - /// Note: uses std::map rather than StringMap to be able to get the key when - /// iterating over values. - std::map VariableDefs; - - /// Structure representing the definition of a numeric variable in a pattern. - /// It holds the pointer to the class representing the numeric variable whose - /// value is being defined and the number of the parenthesis group in - /// RegExStr to capture that value. - struct FileCheckNumericVariableMatch { - /// Pointer to class representing the numeric variable whose value is being - /// defined. - FileCheckNumericVariable *DefinedNumericVariable; - - /// Number of the parenthesis group in RegExStr that captures the value of - /// this numeric variable definition. - unsigned CaptureParenGroup; - }; - - /// Holds the number of the parenthesis group in RegExStr and pointer to the - /// corresponding FileCheckNumericVariable class instance of all numeric - /// variable definitions. 
Used to set the matched value of all those - /// variables. - StringMap NumericVariableDefs; - - /// Pointer to a class instance holding the global state shared by all - /// patterns: - /// - separate tables with the values of live string and numeric variables - /// respectively at the start of any given CHECK line; - /// - table holding whether a string variable has been defined at any given - /// point during the parsing phase. - FileCheckPatternContext *Context; - - Check::FileCheckType CheckTy; - - /// Line number for this CHECK pattern or None if it is an implicit pattern. - /// Used to determine whether a variable definition is made on an earlier - /// line to the one with this CHECK. - Optional LineNumber; - -public: - FileCheckPattern(Check::FileCheckType Ty, FileCheckPatternContext *Context, - Optional Line = None) - : Context(Context), CheckTy(Ty), LineNumber(Line) {} - - /// \returns the location in source code. - SMLoc getLoc() const { return PatternLoc; } - - /// \returns the pointer to the global state for all patterns in this - /// FileCheck instance. - FileCheckPatternContext *getContext() const { return Context; } - - /// \returns whether \p C is a valid first character for a variable name. - static bool isValidVarNameStart(char C); - - /// Parsing information about a variable. - struct VariableProperties { - StringRef Name; - bool IsPseudo; - }; - - /// Parses the string at the start of \p Str for a variable name. \returns - /// a VariableProperties structure holding the variable name and whether it - /// is the name of a pseudo variable, or an error holding a diagnostic - /// against \p SM if parsing fail. If parsing was successful, also strips - /// \p Str from the variable name. - static Expected parseVariable(StringRef &Str, - const SourceMgr &SM); - /// Parses \p Expr for the name of a numeric variable to be defined at line - /// \p LineNumber or before input is parsed if \p LineNumber is None. - /// \returns a pointer to the class instance representing that variable, - /// creating it if needed, or an error holding a diagnostic against \p SM - /// should defining such a variable be invalid. - static Expected parseNumericVariableDefinition( - StringRef &Expr, FileCheckPatternContext *Context, - Optional LineNumber, const SourceMgr &SM); - /// Parses \p Expr for a numeric substitution block. Parameter - /// \p IsLegacyLineExpr indicates whether \p Expr should be a legacy @LINE - /// expression. \returns a pointer to the class instance representing the AST - /// of the expression whose value must be substituted, or an error holding a - /// diagnostic against \p SM if parsing fails. If substitution was - /// successful, sets \p DefinedNumericVariable to point to the class - /// representing the numeric variable being defined in this numeric - /// substitution block, or None if this block does not define any variable. - Expected> - parseNumericSubstitutionBlock( - StringRef Expr, - Optional &DefinedNumericVariable, - bool IsLegacyLineExpr, const SourceMgr &SM) const; - /// Parses the pattern in \p PatternStr and initializes this FileCheckPattern - /// instance accordingly. - /// - /// \p Prefix provides which prefix is being matched, \p Req describes the - /// global options that influence the parsing such as whitespace - /// canonicalization, \p SM provides the SourceMgr used for error reports. - /// \returns true in case of an error, false otherwise. 
- bool parsePattern(StringRef PatternStr, StringRef Prefix, SourceMgr &SM, - const FileCheckRequest &Req); - /// Matches the pattern string against the input buffer \p Buffer - /// - /// \returns the position that is matched or an error indicating why matching - /// failed. If there is a match, updates \p MatchLen with the size of the - /// matched string. - /// - /// The GlobalVariableTable StringMap in the FileCheckPatternContext class - /// instance provides the current values of FileCheck string variables and - /// is updated if this match defines new values. Likewise, the - /// GlobalNumericVariableTable StringMap in the same class provides the - /// current values of FileCheck numeric variables and is updated if this - /// match defines new numeric values. - Expected match(StringRef Buffer, size_t &MatchLen, - const SourceMgr &SM) const; - /// Prints the value of successful substitutions or the name of the undefined - /// string or numeric variables preventing a successful substitution. - void printSubstitutions(const SourceMgr &SM, StringRef Buffer, - SMRange MatchRange = None) const; - void printFuzzyMatch(const SourceMgr &SM, StringRef Buffer, - std::vector *Diags) const; - - bool hasVariable() const { - return !(Substitutions.empty() && VariableDefs.empty()); - } - - Check::FileCheckType getCheckTy() const { return CheckTy; } - - int getCount() const { return CheckTy.getCount(); } - -private: - bool AddRegExToRegEx(StringRef RS, unsigned &CurParen, SourceMgr &SM); - void AddBackrefToRegEx(unsigned BackrefNum); - /// Computes an arbitrary estimate for the quality of matching this pattern - /// at the start of \p Buffer; a distance of zero should correspond to a - /// perfect match. - unsigned computeMatchDistance(StringRef Buffer) const; - /// Finds the closing sequence of a regex variable usage or definition. - /// - /// \p Str has to point in the beginning of the definition (right after the - /// opening sequence). \p SM holds the SourceMgr used for error repporting. - /// \returns the offset of the closing sequence within Str, or npos if it - /// was not found. - size_t FindRegexVarEnd(StringRef Str, SourceMgr &SM); - - /// Parses \p Name as a (pseudo if \p IsPseudo is true) numeric variable use. - /// \returns the pointer to the class instance representing that variable if - /// successful, or an error holding a diagnostic against \p SM otherwise. - Expected> - parseNumericVariableUse(StringRef Name, bool IsPseudo, - const SourceMgr &SM) const; - enum class AllowedOperand { LineVar, Literal, Any }; - /// Parses \p Expr for use of a numeric operand. Accepts both literal values - /// and numeric variables, depending on the value of \p AO. \returns the - /// class representing that operand in the AST of the expression or an error - /// holding a diagnostic against \p SM otherwise. - Expected> - parseNumericOperand(StringRef &Expr, AllowedOperand AO, - const SourceMgr &SM) const; - /// Parses \p Expr for a binary operation. The left operand of this binary - /// operation is given in \p LeftOp and \p IsLegacyLineExpr indicates whether - /// we are parsing a legacy @LINE expression. \returns the class representing - /// the binary operation in the AST of the expression, or an error holding a - /// diagnostic against \p SM otherwise. - Expected> - parseBinop(StringRef &Expr, std::unique_ptr LeftOp, - bool IsLegacyLineExpr, const SourceMgr &SM) const; -}; - -//===----------------------------------------------------------------------===// -/// Summary of a FileCheck diagnostic. 
-//===----------------------------------------------------------------------===// - struct FileCheckDiag { /// What is the FileCheck directive for this diagnostic? Check::FileCheckType CheckTy; @@ -659,61 +131,20 @@ struct FileCheckDiag { SMLoc CheckLoc, MatchType MatchTy, SMRange InputRange); }; -//===----------------------------------------------------------------------===// -// Check Strings. -//===----------------------------------------------------------------------===// - -/// A check that we found in the input file. -struct FileCheckString { - /// The pattern to match. - FileCheckPattern Pat; - - /// Which prefix name this check matched. - StringRef Prefix; - - /// The location in the match file that the check string was specified. - SMLoc Loc; - - /// All of the strings that are disallowed from occurring between this match - /// string and the previous one (or start of file). - std::vector DagNotStrings; - - FileCheckString(const FileCheckPattern &P, StringRef S, SMLoc L) - : Pat(P), Prefix(S), Loc(L) {} - - /// Matches check string and its "not strings" and/or "dag strings". - size_t Check(const SourceMgr &SM, StringRef Buffer, bool IsLabelScanMode, - size_t &MatchLen, FileCheckRequest &Req, - std::vector *Diags) const; - - /// Verifies that there is a single line in the given \p Buffer. Errors are - /// reported against \p SM. - bool CheckNext(const SourceMgr &SM, StringRef Buffer) const; - /// Verifies that there is no newline in the given \p Buffer. Errors are - /// reported against \p SM. - bool CheckSame(const SourceMgr &SM, StringRef Buffer) const; - /// Verifies that none of the strings in \p NotStrings are found in the given - /// \p Buffer. Errors are reported against \p SM and diagnostics recorded in - /// \p Diags according to the verbosity level set in \p Req. - bool CheckNot(const SourceMgr &SM, StringRef Buffer, - const std::vector &NotStrings, - const FileCheckRequest &Req, - std::vector *Diags) const; - /// Matches "dag strings" and their mixed "not strings". - size_t CheckDag(const SourceMgr &SM, StringRef Buffer, - std::vector &NotStrings, - const FileCheckRequest &Req, - std::vector *Diags) const; -}; +class FileCheckPatternContext; +struct FileCheckString; /// FileCheck class takes the request and exposes various methods that /// use information from the request. class FileCheck { FileCheckRequest Req; - FileCheckPatternContext PatternContext; + std::unique_ptr PatternContext; + // C++17 TODO: make this a plain std::vector. + std::unique_ptr> CheckStrings; public: - FileCheck(FileCheckRequest Req) : Req(Req) {} + explicit FileCheck(FileCheckRequest Req); + ~FileCheck(); // Combines the check prefixes into a single regex so that we can efficiently // scan for any of the set. @@ -723,13 +154,11 @@ public: Regex buildCheckPrefixRegex(); /// Reads the check file from \p Buffer and records the expected strings it - /// contains in the \p CheckStrings vector. Errors are reported against - /// \p SM. + /// contains. Errors are reported against \p SM. /// /// Only expected strings whose prefix is one of those listed in \p PrefixRE /// are recorded. \returns true in case of an error, false otherwise. 
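A minimal driver sketch for the reshaped FileCheck class declared in this hunk (readCheckFile and checkInput replace the old free-standing CheckStrings plumbing). CheckFileText and InputText are hypothetical, already-loaded buffers; a real driver, such as the FileCheck tool, also canonicalizes the buffers and validates the prefixes first.

#include "llvm/Support/FileCheck.h"

static bool runFileCheck(llvm::StringRef CheckFileText,
                         llvm::StringRef InputText) {
  llvm::FileCheckRequest Req;
  Req.CheckPrefixes.push_back("CHECK");        // field not shown in this hunk
  Req.IgnoreCase = true;                       // option added in this hunk
  Req.GlobalDefines.push_back("NAME=value");   // -D style definition

  llvm::FileCheck FC(Req);
  llvm::SourceMgr SM;
  SM.AddNewSourceBuffer(
      llvm::MemoryBuffer::getMemBuffer(CheckFileText, "checks"), llvm::SMLoc());
  SM.AddNewSourceBuffer(
      llvm::MemoryBuffer::getMemBuffer(InputText, "input"), llvm::SMLoc());

  llvm::Regex PrefixRE = FC.buildCheckPrefixRegex();
  if (FC.readCheckFile(SM, CheckFileText, PrefixRE))
    return false;                      // error while parsing the check file
  return FC.checkInput(SM, InputText); // true iff the input satisfies the checks
}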
- bool ReadCheckFile(SourceMgr &SM, StringRef Buffer, Regex &PrefixRE, - std::vector &CheckStrings); + bool readCheckFile(SourceMgr &SM, StringRef Buffer, Regex &PrefixRE); bool ValidateCheckPrefixes(); @@ -739,13 +168,14 @@ public: SmallVectorImpl &OutputBuffer); /// Checks the input to FileCheck provided in the \p Buffer against the - /// \p CheckStrings read from the check file and record diagnostics emitted + /// expected strings read from the check file and record diagnostics emitted /// in \p Diags. Errors are recorded against \p SM. /// /// \returns false if the input fails to satisfy the checks. - bool CheckInput(SourceMgr &SM, StringRef Buffer, - ArrayRef CheckStrings, + bool checkInput(SourceMgr &SM, StringRef Buffer, std::vector *Diags = nullptr); }; + } // namespace llvm + #endif diff --git a/include/llvm/Support/FileCollector.h b/include/llvm/Support/FileCollector.h new file mode 100644 index 000000000000..19429bd3e9b4 --- /dev/null +++ b/include/llvm/Support/FileCollector.h @@ -0,0 +1,79 @@ +//===-- FileCollector.h -----------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_SUPPORT_FILE_COLLECTOR_H +#define LLVM_SUPPORT_FILE_COLLECTOR_H + +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringMap.h" +#include "llvm/ADT/StringSet.h" +#include "llvm/ADT/Twine.h" +#include "llvm/Support/VirtualFileSystem.h" + +#include + +namespace llvm { + +/// Collects files into a directory and generates a mapping that can be used by +/// the VFS. +class FileCollector { +public: + FileCollector(std::string Root, std::string OverlayRoot); + + void addFile(const Twine &file); + + /// Write the yaml mapping (for the VFS) to the given file. + std::error_code writeMapping(StringRef mapping_file); + + /// Copy the files into the root directory. + /// + /// When StopOnError is true (the default) we abort as soon as one file + /// cannot be copied. This is relatively common, for example when a file was + /// removed after it was added to the mapping. + std::error_code copyFiles(bool StopOnError = true); + + /// Create a VFS that collects all the paths that might be looked at by the + /// file system accesses. + static IntrusiveRefCntPtr + createCollectorVFS(IntrusiveRefCntPtr BaseFS, + std::shared_ptr Collector); + +private: + void addFileImpl(StringRef SrcPath); + + bool markAsSeen(StringRef Path) { return Seen.insert(Path).second; } + + bool getRealPath(StringRef SrcPath, SmallVectorImpl &Result); + + void addFileToMapping(StringRef VirtualPath, StringRef RealPath) { + VFSWriter.addFileMapping(VirtualPath, RealPath); + } + +protected: + /// Synchronizes adding files. + std::mutex Mutex; + + /// The root directory where files are copied. + std::string Root; + + /// The root directory where the VFS overlay lives. + std::string OverlayRoot; + + /// Tracks already seen files so they can be skipped. + StringSet<> Seen; + + /// The yaml mapping writer. + vfs::YAMLVFSWriter VFSWriter; + + /// Caches RealPath calls when resolving symlinks. 
+ StringMap SymlinkMap; +}; + +} // end namespace llvm + +#endif // LLVM_SUPPORT_FILE_COLLECTOR_H diff --git a/include/llvm/Support/FileSystem.h b/include/llvm/Support/FileSystem.h index 1bec27bddad9..a29a9d787947 100644 --- a/include/llvm/Support/FileSystem.h +++ b/include/llvm/Support/FileSystem.h @@ -991,29 +991,27 @@ file_t getStdoutHandle(); /// Returns kInvalidFile when the stream is closed. file_t getStderrHandle(); -/// Reads \p Buf.size() bytes from \p FileHandle into \p Buf. The number of -/// bytes actually read is returned in \p BytesRead. On Unix, this is equivalent -/// to `*BytesRead = ::read(FD, Buf.data(), Buf.size())`, with error reporting. -/// BytesRead will contain zero when reaching EOF. +/// Reads \p Buf.size() bytes from \p FileHandle into \p Buf. Returns the number +/// of bytes actually read. On Unix, this is equivalent to `return ::read(FD, +/// Buf.data(), Buf.size())`, with error reporting. Returns 0 when reaching EOF. /// /// @param FileHandle File to read from. /// @param Buf Buffer to read into. -/// @param BytesRead Output parameter of the number of bytes read. -/// @returns The error, if any, or errc::success. -std::error_code readNativeFile(file_t FileHandle, MutableArrayRef Buf, - size_t *BytesRead); +/// @returns The number of bytes read, or error. +Expected readNativeFile(file_t FileHandle, MutableArrayRef Buf); /// Reads \p Buf.size() bytes from \p FileHandle at offset \p Offset into \p /// Buf. If 'pread' is available, this will use that, otherwise it will use -/// 'lseek'. Bytes requested beyond the end of the file will be zero -/// initialized. +/// 'lseek'. Returns the number of bytes actually read. Returns 0 when reaching +/// EOF. /// /// @param FileHandle File to read from. /// @param Buf Buffer to read into. /// @param Offset Offset into the file at which the read should occur. -/// @returns The error, if any, or errc::success. -std::error_code readNativeFileSlice(file_t FileHandle, - MutableArrayRef Buf, size_t Offset); +/// @returns The number of bytes read, or error. +Expected readNativeFileSlice(file_t FileHandle, + MutableArrayRef Buf, + uint64_t Offset); /// @brief Opens the file with the given name in a write-only or read-write /// mode, returning its open file descriptor. If the file does not exist, it @@ -1217,9 +1215,9 @@ class directory_entry { // that whole structure, callers end up paying for a stat(). // std::filesystem::directory_entry may be a better model. std::string Path; - file_type Type; // Most platforms can provide this. - bool FollowSymlinks; // Affects the behavior of status(). - basic_file_status Status; // If available. + file_type Type = file_type::type_unknown; // Most platforms can provide this. + bool FollowSymlinks = true; // Affects the behavior of status(). + basic_file_status Status; // If available. public: explicit directory_entry(const Twine &Path, bool FollowSymlinks = true, diff --git a/include/llvm/Support/FileUtilities.h b/include/llvm/Support/FileUtilities.h index 16b2206924c3..04efdced32a4 100644 --- a/include/llvm/Support/FileUtilities.h +++ b/include/llvm/Support/FileUtilities.h @@ -14,6 +14,9 @@ #ifndef LLVM_SUPPORT_FILEUTILITIES_H #define LLVM_SUPPORT_FILEUTILITIES_H +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/Errc.h" +#include "llvm/Support/ErrorHandling.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/Path.h" @@ -72,6 +75,41 @@ namespace llvm { /// will not be removed when the object is destroyed. 
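The FileSystem.h hunk above changes readNativeFile and readNativeFileSlice to return Expected<size_t> (the byte count, 0 at EOF) instead of an error_code plus an out-parameter. Below is a sketch of a read loop under the new convention; getStdinHandle comes from the same header, and drainStdin is a hypothetical name.

#include "llvm/Support/Error.h"
#include "llvm/Support/FileSystem.h"
#include <string>

static llvm::Expected<std::string> drainStdin() {
  std::string Data;
  char Chunk[4096];
  llvm::sys::fs::file_t In = llvm::sys::fs::getStdinHandle();
  for (;;) {
    llvm::Expected<size_t> NumRead = llvm::sys::fs::readNativeFile(
        In, llvm::MutableArrayRef<char>(Chunk, sizeof(Chunk)));
    if (!NumRead)
      return NumRead.takeError(); // propagate the read error
    if (*NumRead == 0)
      break;                      // EOF
    Data.append(Chunk, *NumRead);
  }
  return Data;
}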
void releaseFile() { DeleteIt = false; } }; + + enum class atomic_write_error { + failed_to_create_uniq_file = 0, + output_stream_error, + failed_to_rename_temp_file + }; + + class AtomicFileWriteError : public llvm::ErrorInfo { + public: + AtomicFileWriteError(atomic_write_error Error) : Error(Error) {} + + void log(raw_ostream &OS) const override; + + const atomic_write_error Error; + static char ID; + + private: + // Users are not expected to use error_code. + std::error_code convertToErrorCode() const override { + return llvm::inconvertibleErrorCode(); + } + }; + + // atomic_write_error + whatever the Writer can return + + /// Creates a unique file with name according to the given \p TempPathModel, + /// writes content of \p Buffer to the file and renames it to \p FinalPath. + /// + /// \returns \c AtomicFileWriteError in case of error. + llvm::Error writeFileAtomically(StringRef TempPathModel, StringRef FinalPath, + StringRef Buffer); + + llvm::Error + writeFileAtomically(StringRef TempPathModel, StringRef FinalPath, + std::function Writer); } // End llvm namespace #endif diff --git a/include/llvm/Support/Format.h b/include/llvm/Support/Format.h index 77dcbaebf1a3..9dd7b401b46a 100644 --- a/include/llvm/Support/Format.h +++ b/include/llvm/Support/Format.h @@ -29,6 +29,7 @@ #include #include #include +#include namespace llvm { @@ -91,7 +92,7 @@ class format_object final : public format_object_base { template int snprint_tuple(char *Buffer, unsigned BufferSize, - index_sequence) const { + std::index_sequence) const { #ifdef _MSC_VER return _snprintf(Buffer, BufferSize, Fmt, std::get(Vals)...); #else @@ -106,7 +107,7 @@ public: } int snprint(char *Buffer, unsigned BufferSize) const override { - return snprint_tuple(Buffer, BufferSize, index_sequence_for()); + return snprint_tuple(Buffer, BufferSize, std::index_sequence_for()); } }; diff --git a/include/llvm/Support/GenericDomTree.h b/include/llvm/Support/GenericDomTree.h index 99620802505b..9169379f746d 100644 --- a/include/llvm/Support/GenericDomTree.h +++ b/include/llvm/Support/GenericDomTree.h @@ -242,7 +242,7 @@ protected: using DomTreeNodeMapType = DenseMap>>; DomTreeNodeMapType DomTreeNodes; - DomTreeNodeBase *RootNode; + DomTreeNodeBase *RootNode = nullptr; ParentPtr Parent = nullptr; mutable bool DFSInfoValid = false; @@ -571,7 +571,7 @@ protected: assert(IDomNode && "Not immediate dominator specified for block!"); DFSInfoValid = false; return (DomTreeNodes[BB] = IDomNode->addChild( - llvm::make_unique>(BB, IDomNode))).get(); + std::make_unique>(BB, IDomNode))).get(); } /// Add a new node to the forward dominator tree and make it a new root. @@ -585,7 +585,7 @@ protected: "Cannot change root of post-dominator tree"); DFSInfoValid = false; DomTreeNodeBase *NewNode = (DomTreeNodes[BB] = - llvm::make_unique>(BB, nullptr)).get(); + std::make_unique>(BB, nullptr)).get(); if (Roots.empty()) { addRoot(BB); } else { diff --git a/include/llvm/Support/GenericDomTreeConstruction.h b/include/llvm/Support/GenericDomTreeConstruction.h index ccceba881718..7c0278e8770e 100644 --- a/include/llvm/Support/GenericDomTreeConstruction.h +++ b/include/llvm/Support/GenericDomTreeConstruction.h @@ -186,7 +186,7 @@ struct SemiNCAInfo { // Add a new tree node for this NodeT, and link it as a child of // IDomNode return (DT.DomTreeNodes[BB] = IDomNode->addChild( - llvm::make_unique>(BB, IDomNode))) + std::make_unique>(BB, IDomNode))) .get(); } @@ -586,7 +586,7 @@ struct SemiNCAInfo { NodePtr Root = IsPostDom ? 
nullptr : DT.Roots[0]; DT.RootNode = (DT.DomTreeNodes[Root] = - llvm::make_unique>(Root, nullptr)) + std::make_unique>(Root, nullptr)) .get(); SNCA.attachNewSubtree(DT, DT.RootNode); } @@ -611,7 +611,7 @@ struct SemiNCAInfo { // Add a new tree node for this BasicBlock, and link it as a child of // IDomNode. DT.DomTreeNodes[W] = IDomNode->addChild( - llvm::make_unique>(W, IDomNode)); + std::make_unique>(W, IDomNode)); } } @@ -663,7 +663,7 @@ struct SemiNCAInfo { TreeNodePtr VirtualRoot = DT.getNode(nullptr); FromTN = (DT.DomTreeNodes[From] = VirtualRoot->addChild( - llvm::make_unique>(From, VirtualRoot))) + std::make_unique>(From, VirtualRoot))) .get(); DT.Roots.push_back(From); } diff --git a/include/llvm/Support/GlobPattern.h b/include/llvm/Support/GlobPattern.h index 66a4cd94c12a..0098ac65fd30 100644 --- a/include/llvm/Support/GlobPattern.h +++ b/include/llvm/Support/GlobPattern.h @@ -21,7 +21,7 @@ #include // This class represents a glob pattern. Supported metacharacters -// are "*", "?", "[]" and "[^]". +// are "*", "?", "\", "[]", "[^]", and "[!]". namespace llvm { class BitVector; template class ArrayRef; diff --git a/include/llvm/Support/Host.h b/include/llvm/Support/Host.h index b37cc514c92e..44f543c363db 100644 --- a/include/llvm/Support/Host.h +++ b/include/llvm/Support/Host.h @@ -15,39 +15,11 @@ #include "llvm/ADT/StringMap.h" -#if defined(__linux__) || defined(__GNU__) || defined(__HAIKU__) -#include -#elif defined(_AIX) -#include -#elif defined(__sun) -/* Solaris provides _BIG_ENDIAN/_LITTLE_ENDIAN selector in sys/types.h */ -#include -#define BIG_ENDIAN 4321 -#define LITTLE_ENDIAN 1234 -#if defined(_BIG_ENDIAN) -#define BYTE_ORDER BIG_ENDIAN -#else -#define BYTE_ORDER LITTLE_ENDIAN -#endif -#else -#if !defined(BYTE_ORDER) && !defined(_WIN32) -#include -#endif -#endif - #include namespace llvm { namespace sys { -#if defined(BYTE_ORDER) && defined(BIG_ENDIAN) && BYTE_ORDER == BIG_ENDIAN -constexpr bool IsBigEndianHost = true; -#else -constexpr bool IsBigEndianHost = false; -#endif - - static const bool IsLittleEndianHost = !IsBigEndianHost; - /// getDefaultTargetTriple() - Return the default target triple the compiler /// has been configured to produce code for. /// diff --git a/include/llvm/Support/JamCRC.h b/include/llvm/Support/JamCRC.h deleted file mode 100644 index b6fc4e7b9b03..000000000000 --- a/include/llvm/Support/JamCRC.h +++ /dev/null @@ -1,48 +0,0 @@ -//===-- llvm/Support/JamCRC.h - Cyclic Redundancy Check ---------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file contains an implementation of JamCRC. -// -// We will use the "Rocksoft^tm Model CRC Algorithm" to describe the properties -// of this CRC: -// Width : 32 -// Poly : 04C11DB7 -// Init : FFFFFFFF -// RefIn : True -// RefOut : True -// XorOut : 00000000 -// Check : 340BC6D9 (result of CRC for "123456789") -// -// N.B. We permit flexibility of the "Init" value. Some consumers of this need -// it to be zero. 
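The GlobPattern.h comment above now also documents the "\" escape and "[!]" negation. A minimal matching sketch follows; GlobPattern::create and match are the class's pre-existing interface, assumed here rather than shown in this diff, and endsInNonHeaderExtension is a hypothetical name.

#include "llvm/Support/Error.h"
#include "llvm/Support/GlobPattern.h"

static bool endsInNonHeaderExtension(llvm::StringRef Name) {
  // "[!h]" matches any single character except 'h', so "foo.o" matches while
  // "foo.h" does not.
  llvm::Expected<llvm::GlobPattern> Pat = llvm::GlobPattern::create("*.[!h]");
  if (!Pat) {
    llvm::consumeError(Pat.takeError()); // malformed pattern
    return false;
  }
  return Pat->match(Name);
}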
-// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_SUPPORT_JAMCRC_H -#define LLVM_SUPPORT_JAMCRC_H - -#include "llvm/Support/DataTypes.h" - -namespace llvm { -template class ArrayRef; - -class JamCRC { -public: - JamCRC(uint32_t Init = 0xFFFFFFFFU) : CRC(Init) {} - - // Update the CRC calculation with Data. - void update(ArrayRef Data); - - uint32_t getCRC() const { return CRC; } - -private: - uint32_t CRC; -}; -} // End of namespace llvm - -#endif diff --git a/include/llvm/Support/MachineValueType.h b/include/llvm/Support/MachineValueType.h index b94d2c4836cc..7f9f0b85c55e 100644 --- a/include/llvm/Support/MachineValueType.h +++ b/include/llvm/Support/MachineValueType.h @@ -17,6 +17,7 @@ #include "llvm/ADT/iterator_range.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" +#include "llvm/Support/TypeSize.h" #include namespace llvm { @@ -64,152 +65,162 @@ namespace llvm { v32i1 = 19, // 32 x i1 v64i1 = 20, // 64 x i1 v128i1 = 21, // 128 x i1 - v512i1 = 22, // 512 x i1 - v1024i1 = 23, // 1024 x i1 - - v1i8 = 24, // 1 x i8 - v2i8 = 25, // 2 x i8 - v4i8 = 26, // 4 x i8 - v8i8 = 27, // 8 x i8 - v16i8 = 28, // 16 x i8 - v32i8 = 29, // 32 x i8 - v64i8 = 30, // 64 x i8 - v128i8 = 31, //128 x i8 - v256i8 = 32, //256 x i8 - - v1i16 = 33, // 1 x i16 - v2i16 = 34, // 2 x i16 - v4i16 = 35, // 4 x i16 - v8i16 = 36, // 8 x i16 - v16i16 = 37, // 16 x i16 - v32i16 = 38, // 32 x i16 - v64i16 = 39, // 64 x i16 - v128i16 = 40, //128 x i16 - - v1i32 = 41, // 1 x i32 - v2i32 = 42, // 2 x i32 - v3i32 = 43, // 3 x i32 - v4i32 = 44, // 4 x i32 - v5i32 = 45, // 5 x i32 - v8i32 = 46, // 8 x i32 - v16i32 = 47, // 16 x i32 - v32i32 = 48, // 32 x i32 - v64i32 = 49, // 64 x i32 - v128i32 = 50, // 128 x i32 - v256i32 = 51, // 256 x i32 - v512i32 = 52, // 512 x i32 - v1024i32 = 53, // 1024 x i32 - v2048i32 = 54, // 2048 x i32 - - v1i64 = 55, // 1 x i64 - v2i64 = 56, // 2 x i64 - v4i64 = 57, // 4 x i64 - v8i64 = 58, // 8 x i64 - v16i64 = 59, // 16 x i64 - v32i64 = 60, // 32 x i64 - - v1i128 = 61, // 1 x i128 - - // Scalable integer types - nxv1i1 = 62, // n x 1 x i1 - nxv2i1 = 63, // n x 2 x i1 - nxv4i1 = 64, // n x 4 x i1 - nxv8i1 = 65, // n x 8 x i1 - nxv16i1 = 66, // n x 16 x i1 - nxv32i1 = 67, // n x 32 x i1 - - nxv1i8 = 68, // n x 1 x i8 - nxv2i8 = 69, // n x 2 x i8 - nxv4i8 = 70, // n x 4 x i8 - nxv8i8 = 71, // n x 8 x i8 - nxv16i8 = 72, // n x 16 x i8 - nxv32i8 = 73, // n x 32 x i8 - - nxv1i16 = 74, // n x 1 x i16 - nxv2i16 = 75, // n x 2 x i16 - nxv4i16 = 76, // n x 4 x i16 - nxv8i16 = 77, // n x 8 x i16 - nxv16i16 = 78, // n x 16 x i16 - nxv32i16 = 79, // n x 32 x i16 - - nxv1i32 = 80, // n x 1 x i32 - nxv2i32 = 81, // n x 2 x i32 - nxv4i32 = 82, // n x 4 x i32 - nxv8i32 = 83, // n x 8 x i32 - nxv16i32 = 84, // n x 16 x i32 - nxv32i32 = 85, // n x 32 x i32 - - nxv1i64 = 86, // n x 1 x i64 - nxv2i64 = 87, // n x 2 x i64 - nxv4i64 = 88, // n x 4 x i64 - nxv8i64 = 89, // n x 8 x i64 - nxv16i64 = 90, // n x 16 x i64 - nxv32i64 = 91, // n x 32 x i64 - - FIRST_INTEGER_VECTOR_VALUETYPE = v1i1, - LAST_INTEGER_VECTOR_VALUETYPE = nxv32i64, - - FIRST_INTEGER_SCALABLE_VALUETYPE = nxv1i1, - LAST_INTEGER_SCALABLE_VALUETYPE = nxv32i64, - - v2f16 = 92, // 2 x f16 - v4f16 = 93, // 4 x f16 - v8f16 = 94, // 8 x f16 - v1f32 = 95, // 1 x f32 - v2f32 = 96, // 2 x f32 - v3f32 = 97, // 3 x f32 - v4f32 = 98, // 4 x f32 - v5f32 = 99, // 5 x f32 - v8f32 = 100, // 8 x f32 - v16f32 = 101, // 16 x f32 - v32f32 = 102, // 32 x f32 - v64f32 = 103, // 64 x f32 - 
v128f32 = 104, // 128 x f32 - v256f32 = 105, // 256 x f32 - v512f32 = 106, // 512 x f32 - v1024f32 = 107, // 1024 x f32 - v2048f32 = 108, // 2048 x f32 - v1f64 = 109, // 1 x f64 - v2f64 = 110, // 2 x f64 - v4f64 = 111, // 4 x f64 - v8f64 = 112, // 8 x f64 - - nxv2f16 = 113, // n x 2 x f16 - nxv4f16 = 114, // n x 4 x f16 - nxv8f16 = 115, // n x 8 x f16 - nxv1f32 = 116, // n x 1 x f32 - nxv2f32 = 117, // n x 2 x f32 - nxv4f32 = 118, // n x 4 x f32 - nxv8f32 = 119, // n x 8 x f32 - nxv16f32 = 120, // n x 16 x f32 - nxv1f64 = 121, // n x 1 x f64 - nxv2f64 = 122, // n x 2 x f64 - nxv4f64 = 123, // n x 4 x f64 - nxv8f64 = 124, // n x 8 x f64 - - FIRST_FP_VECTOR_VALUETYPE = v2f16, - LAST_FP_VECTOR_VALUETYPE = nxv8f64, - - FIRST_FP_SCALABLE_VALUETYPE = nxv2f16, - LAST_FP_SCALABLE_VALUETYPE = nxv8f64, + v256i1 = 22, // 256 x i1 + v512i1 = 23, // 512 x i1 + v1024i1 = 24, // 1024 x i1 + + v1i8 = 25, // 1 x i8 + v2i8 = 26, // 2 x i8 + v4i8 = 27, // 4 x i8 + v8i8 = 28, // 8 x i8 + v16i8 = 29, // 16 x i8 + v32i8 = 30, // 32 x i8 + v64i8 = 31, // 64 x i8 + v128i8 = 32, //128 x i8 + v256i8 = 33, //256 x i8 + + v1i16 = 34, // 1 x i16 + v2i16 = 35, // 2 x i16 + v3i16 = 36, // 3 x i16 + v4i16 = 37, // 4 x i16 + v8i16 = 38, // 8 x i16 + v16i16 = 39, // 16 x i16 + v32i16 = 40, // 32 x i16 + v64i16 = 41, // 64 x i16 + v128i16 = 42, //128 x i16 + + v1i32 = 43, // 1 x i32 + v2i32 = 44, // 2 x i32 + v3i32 = 45, // 3 x i32 + v4i32 = 46, // 4 x i32 + v5i32 = 47, // 5 x i32 + v8i32 = 48, // 8 x i32 + v16i32 = 49, // 16 x i32 + v32i32 = 50, // 32 x i32 + v64i32 = 51, // 64 x i32 + v128i32 = 52, // 128 x i32 + v256i32 = 53, // 256 x i32 + v512i32 = 54, // 512 x i32 + v1024i32 = 55, // 1024 x i32 + v2048i32 = 56, // 2048 x i32 + + v1i64 = 57, // 1 x i64 + v2i64 = 58, // 2 x i64 + v4i64 = 59, // 4 x i64 + v8i64 = 60, // 8 x i64 + v16i64 = 61, // 16 x i64 + v32i64 = 62, // 32 x i64 + + v1i128 = 63, // 1 x i128 + + FIRST_INTEGER_FIXEDLEN_VECTOR_VALUETYPE = v1i1, + LAST_INTEGER_FIXEDLEN_VECTOR_VALUETYPE = v1i128, + + v2f16 = 64, // 2 x f16 + v3f16 = 65, // 3 x f16 + v4f16 = 66, // 4 x f16 + v8f16 = 67, // 8 x f16 + v16f16 = 68, // 16 x f16 + v32f16 = 69, // 32 x f16 + v1f32 = 70, // 1 x f32 + v2f32 = 71, // 2 x f32 + v3f32 = 72, // 3 x f32 + v4f32 = 73, // 4 x f32 + v5f32 = 74, // 5 x f32 + v8f32 = 75, // 8 x f32 + v16f32 = 76, // 16 x f32 + v32f32 = 77, // 32 x f32 + v64f32 = 78, // 64 x f32 + v128f32 = 79, // 128 x f32 + v256f32 = 80, // 256 x f32 + v512f32 = 81, // 512 x f32 + v1024f32 = 82, // 1024 x f32 + v2048f32 = 83, // 2048 x f32 + v1f64 = 84, // 1 x f64 + v2f64 = 85, // 2 x f64 + v4f64 = 86, // 4 x f64 + v8f64 = 87, // 8 x f64 + + FIRST_FP_FIXEDLEN_VECTOR_VALUETYPE = v2f16, + LAST_FP_FIXEDLEN_VECTOR_VALUETYPE = v8f64, + + FIRST_FIXEDLEN_VECTOR_VALUETYPE = v1i1, + LAST_FIXEDLEN_VECTOR_VALUETYPE = v8f64, + + nxv1i1 = 88, // n x 1 x i1 + nxv2i1 = 89, // n x 2 x i1 + nxv4i1 = 90, // n x 4 x i1 + nxv8i1 = 91, // n x 8 x i1 + nxv16i1 = 92, // n x 16 x i1 + nxv32i1 = 93, // n x 32 x i1 + + nxv1i8 = 94, // n x 1 x i8 + nxv2i8 = 95, // n x 2 x i8 + nxv4i8 = 96, // n x 4 x i8 + nxv8i8 = 97, // n x 8 x i8 + nxv16i8 = 98, // n x 16 x i8 + nxv32i8 = 99, // n x 32 x i8 + + nxv1i16 = 100, // n x 1 x i16 + nxv2i16 = 101, // n x 2 x i16 + nxv4i16 = 102, // n x 4 x i16 + nxv8i16 = 103, // n x 8 x i16 + nxv16i16 = 104, // n x 16 x i16 + nxv32i16 = 105, // n x 32 x i16 + + nxv1i32 = 106, // n x 1 x i32 + nxv2i32 = 107, // n x 2 x i32 + nxv4i32 = 108, // n x 4 x i32 + nxv8i32 = 109, // n x 8 x i32 + nxv16i32 = 110, // n x 16 x i32 + 
nxv32i32 = 111, // n x 32 x i32 + + nxv1i64 = 112, // n x 1 x i64 + nxv2i64 = 113, // n x 2 x i64 + nxv4i64 = 114, // n x 4 x i64 + nxv8i64 = 115, // n x 8 x i64 + nxv16i64 = 116, // n x 16 x i64 + nxv32i64 = 117, // n x 32 x i64 + + FIRST_INTEGER_SCALABLE_VECTOR_VALUETYPE = nxv1i1, + LAST_INTEGER_SCALABLE_VECTOR_VALUETYPE = nxv32i64, + + nxv2f16 = 118, // n x 2 x f16 + nxv4f16 = 119, // n x 4 x f16 + nxv8f16 = 120, // n x 8 x f16 + nxv1f32 = 121, // n x 1 x f32 + nxv2f32 = 122, // n x 2 x f32 + nxv4f32 = 123, // n x 4 x f32 + nxv8f32 = 124, // n x 8 x f32 + nxv16f32 = 125, // n x 16 x f32 + nxv1f64 = 126, // n x 1 x f64 + nxv2f64 = 127, // n x 2 x f64 + nxv4f64 = 128, // n x 4 x f64 + nxv8f64 = 129, // n x 8 x f64 + + FIRST_FP_SCALABLE_VECTOR_VALUETYPE = nxv2f16, + LAST_FP_SCALABLE_VECTOR_VALUETYPE = nxv8f64, + + FIRST_SCALABLE_VECTOR_VALUETYPE = nxv1i1, + LAST_SCALABLE_VECTOR_VALUETYPE = nxv8f64, FIRST_VECTOR_VALUETYPE = v1i1, LAST_VECTOR_VALUETYPE = nxv8f64, - x86mmx = 125, // This is an X86 MMX value + x86mmx = 130, // This is an X86 MMX value - Glue = 126, // This glues nodes together during pre-RA sched + Glue = 131, // This glues nodes together during pre-RA sched - isVoid = 127, // This has no value + isVoid = 132, // This has no value - Untyped = 128, // This value takes a register, but has + Untyped = 133, // This value takes a register, but has // unspecified type. The register class // will be determined by the opcode. - exnref = 129, // WebAssembly's exnref type + exnref = 134, // WebAssembly's exnref type FIRST_VALUETYPE = 1, // This is always the beginning of the list. - LAST_VALUETYPE = 130, // This always remains at the end of the list. + LAST_VALUETYPE = 135, // This always remains at the end of the list. // This is the current maximum for LAST_VALUETYPE. // MVT::MAX_ALLOWED_VALUETYPE is used for asserts and to size bit vectors @@ -253,41 +264,6 @@ namespace llvm { SimpleValueType SimpleTy = INVALID_SIMPLE_VALUE_TYPE; - // A class to represent the number of elements in a vector - // - // For fixed-length vectors, the total number of elements is equal to 'Min' - // For scalable vectors, the total number of elements is a multiple of 'Min' - class ElementCount { - public: - unsigned Min; - bool Scalable; - - ElementCount(unsigned Min, bool Scalable) - : Min(Min), Scalable(Scalable) {} - - ElementCount operator*(unsigned RHS) { - return { Min * RHS, Scalable }; - } - - ElementCount& operator*=(unsigned RHS) { - Min *= RHS; - return *this; - } - - ElementCount operator/(unsigned RHS) { - return { Min / RHS, Scalable }; - } - - ElementCount& operator/=(unsigned RHS) { - Min /= RHS; - return *this; - } - - bool operator==(const ElementCount& RHS) { - return Min == RHS.Min && Scalable == RHS.Scalable; - } - }; - constexpr MVT() = default; constexpr MVT(SimpleValueType SVT) : SimpleTy(SVT) {} @@ -308,16 +284,20 @@ namespace llvm { bool isFloatingPoint() const { return ((SimpleTy >= MVT::FIRST_FP_VALUETYPE && SimpleTy <= MVT::LAST_FP_VALUETYPE) || - (SimpleTy >= MVT::FIRST_FP_VECTOR_VALUETYPE && - SimpleTy <= MVT::LAST_FP_VECTOR_VALUETYPE)); + (SimpleTy >= MVT::FIRST_FP_FIXEDLEN_VECTOR_VALUETYPE && + SimpleTy <= MVT::LAST_FP_FIXEDLEN_VECTOR_VALUETYPE) || + (SimpleTy >= MVT::FIRST_FP_SCALABLE_VECTOR_VALUETYPE && + SimpleTy <= MVT::LAST_FP_SCALABLE_VECTOR_VALUETYPE)); } /// Return true if this is an integer or a vector integer type. 
bool isInteger() const { return ((SimpleTy >= MVT::FIRST_INTEGER_VALUETYPE && SimpleTy <= MVT::LAST_INTEGER_VALUETYPE) || - (SimpleTy >= MVT::FIRST_INTEGER_VECTOR_VALUETYPE && - SimpleTy <= MVT::LAST_INTEGER_VECTOR_VALUETYPE)); + (SimpleTy >= MVT::FIRST_INTEGER_FIXEDLEN_VECTOR_VALUETYPE && + SimpleTy <= MVT::LAST_INTEGER_FIXEDLEN_VECTOR_VALUETYPE) || + (SimpleTy >= MVT::FIRST_INTEGER_SCALABLE_VECTOR_VALUETYPE && + SimpleTy <= MVT::LAST_INTEGER_SCALABLE_VECTOR_VALUETYPE)); } /// Return true if this is an integer, not including vectors. @@ -335,10 +315,13 @@ namespace llvm { /// Return true if this is a vector value type where the /// runtime length is machine dependent bool isScalableVector() const { - return ((SimpleTy >= MVT::FIRST_INTEGER_SCALABLE_VALUETYPE && - SimpleTy <= MVT::LAST_INTEGER_SCALABLE_VALUETYPE) || - (SimpleTy >= MVT::FIRST_FP_SCALABLE_VALUETYPE && - SimpleTy <= MVT::LAST_FP_SCALABLE_VALUETYPE)); + return (SimpleTy >= MVT::FIRST_SCALABLE_VECTOR_VALUETYPE && + SimpleTy <= MVT::LAST_SCALABLE_VECTOR_VALUETYPE); + } + + bool isFixedLengthVector() const { + return (SimpleTy >= MVT::FIRST_FIXEDLEN_VECTOR_VALUETYPE && + SimpleTy <= MVT::LAST_FIXEDLEN_VECTOR_VALUETYPE); } /// Return true if this is a 16-bit vector type. @@ -373,17 +356,18 @@ namespace llvm { /// Return true if this is a 256-bit vector type. bool is256BitVector() const { - return (SimpleTy == MVT::v8f32 || SimpleTy == MVT::v4f64 || - SimpleTy == MVT::v32i8 || SimpleTy == MVT::v16i16 || - SimpleTy == MVT::v8i32 || SimpleTy == MVT::v4i64); + return (SimpleTy == MVT::v16f16 || SimpleTy == MVT::v8f32 || + SimpleTy == MVT::v4f64 || SimpleTy == MVT::v32i8 || + SimpleTy == MVT::v16i16 || SimpleTy == MVT::v8i32 || + SimpleTy == MVT::v4i64 || SimpleTy == MVT::v256i1); } /// Return true if this is a 512-bit vector type. bool is512BitVector() const { - return (SimpleTy == MVT::v16f32 || SimpleTy == MVT::v8f64 || - SimpleTy == MVT::v512i1 || SimpleTy == MVT::v64i8 || - SimpleTy == MVT::v32i16 || SimpleTy == MVT::v16i32 || - SimpleTy == MVT::v8i64); + return (SimpleTy == MVT::v32f16 || SimpleTy == MVT::v16f32 || + SimpleTy == MVT::v8f64 || SimpleTy == MVT::v512i1 || + SimpleTy == MVT::v64i8 || SimpleTy == MVT::v32i16 || + SimpleTy == MVT::v16i32 || SimpleTy == MVT::v8i64); } /// Return true if this is a 1024-bit vector type. @@ -406,6 +390,15 @@ namespace llvm { SimpleTy==MVT::vAny || SimpleTy==MVT::iPTRAny); } + /// Return a VT for a vector type with the same element type but + /// half the number of elements. + MVT getHalfNumVectorElementsVT() const { + MVT EltVT = getVectorElementType(); + auto EltCnt = getVectorElementCount(); + assert(!(EltCnt.Min & 1) && "Splitting vector, but not in half!"); + return getVectorVT(EltVT, EltCnt / 2); + } + /// Returns true if the given vector is a power of 2. 
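A small sketch of the MVT queries touched above: the fixed-length versus scalable split, getHalfNumVectorElementsVT, and the getVectorVT overload that now takes llvm::ElementCount from TypeSize.h (whose (Min, Scalable) constructor is assumed here). mvtSketch is a hypothetical name.

#include "llvm/Support/MachineValueType.h"
#include <cassert>

static void mvtSketch() {
  llvm::MVT V4F32 = llvm::MVT::getVectorVT(
      llvm::MVT::f32, llvm::ElementCount(4, /*Scalable=*/false));
  assert(V4F32 == llvm::MVT::v4f32 && V4F32.isFixedLengthVector());

  // Halving a vector type keeps the element type and splits the count.
  llvm::MVT V2F32 = V4F32.getHalfNumVectorElementsVT();
  assert(V2F32 == llvm::MVT::v2f32);

  llvm::MVT NXV4I32 = llvm::MVT::getVectorVT(
      llvm::MVT::i32, llvm::ElementCount(4, /*Scalable=*/true));
  assert(NXV4I32.isScalableVector() && !NXV4I32.isFixedLengthVector());
  (void)V2F32;
  (void)NXV4I32;
}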
bool isPow2VectorType() const { unsigned NElts = getVectorNumElements(); @@ -440,6 +433,7 @@ namespace llvm { case v32i1: case v64i1: case v128i1: + case v256i1: case v512i1: case v1024i1: case nxv1i1: @@ -465,6 +459,7 @@ namespace llvm { case nxv32i8: return i8; case v1i16: case v2i16: + case v3i16: case v4i16: case v8i16: case v16i16: @@ -511,8 +506,11 @@ namespace llvm { case nxv32i64: return i64; case v1i128: return i128; case v2f16: + case v3f16: case v4f16: case v8f16: + case v16f16: + case v32f16: case nxv2f16: case nxv4f16: case nxv8f16: return f16; @@ -558,6 +556,7 @@ namespace llvm { case v512i1: case v512i32: case v512f32: return 512; + case v256i1: case v256i8: case v256i32: case v256f32: return 256; @@ -576,6 +575,7 @@ namespace llvm { case v32i16: case v32i32: case v32i64: + case v32f16: case v32f32: case nxv32i1: case nxv32i8: @@ -587,6 +587,7 @@ namespace llvm { case v16i16: case v16i32: case v16i64: + case v16f16: case v16f32: case nxv16i1: case nxv16i8: @@ -628,7 +629,9 @@ namespace llvm { case nxv4f16: case nxv4f32: case nxv4f64: return 4; + case v3i16: case v3i32: + case v3f16: case v3f32: return 3; case v2i1: case v2i8: @@ -664,7 +667,7 @@ namespace llvm { } } - MVT::ElementCount getVectorElementCount() const { + ElementCount getVectorElementCount() const { return { getVectorNumElements(), isScalableVector() }; } @@ -721,6 +724,8 @@ namespace llvm { case nxv1i32: case nxv2f16: case nxv1f32: return 32; + case v3i16: + case v3f16: return 48; case x86mmx: case f64 : case i64 : @@ -763,10 +768,12 @@ namespace llvm { case nxv2f64: return 128; case v5i32: case v5f32: return 160; + case v256i1: case v32i8: case v16i16: case v8i32: case v4i64: + case v16f16: case v8f32: case v4f64: case nxv32i8: @@ -780,6 +787,7 @@ namespace llvm { case v32i16: case v16i32: case v8i64: + case v32f16: case v16f32: case v8f64: case nxv32i16: @@ -900,6 +908,7 @@ namespace llvm { if (NumElements == 32) return MVT::v32i1; if (NumElements == 64) return MVT::v64i1; if (NumElements == 128) return MVT::v128i1; + if (NumElements == 256) return MVT::v256i1; if (NumElements == 512) return MVT::v512i1; if (NumElements == 1024) return MVT::v1024i1; break; @@ -917,6 +926,7 @@ namespace llvm { case MVT::i16: if (NumElements == 1) return MVT::v1i16; if (NumElements == 2) return MVT::v2i16; + if (NumElements == 3) return MVT::v3i16; if (NumElements == 4) return MVT::v4i16; if (NumElements == 8) return MVT::v8i16; if (NumElements == 16) return MVT::v16i16; @@ -953,8 +963,11 @@ namespace llvm { break; case MVT::f16: if (NumElements == 2) return MVT::v2f16; + if (NumElements == 3) return MVT::v3f16; if (NumElements == 4) return MVT::v4f16; if (NumElements == 8) return MVT::v8f16; + if (NumElements == 16) return MVT::v16f16; + if (NumElements == 32) return MVT::v32f16; break; case MVT::f32: if (NumElements == 1) return MVT::v1f32; @@ -1054,7 +1067,7 @@ namespace llvm { return getVectorVT(VT, NumElements); } - static MVT getVectorVT(MVT VT, MVT::ElementCount EC) { + static MVT getVectorVT(MVT VT, ElementCount EC) { if (EC.Scalable) return getScalableVectorVT(VT, EC.Min); return getVectorVT(VT, EC.Min); @@ -1108,26 +1121,40 @@ namespace llvm { (MVT::SimpleValueType)(MVT::LAST_VECTOR_VALUETYPE + 1)); } - static mvt_range integer_vector_valuetypes() { + static mvt_range fixedlen_vector_valuetypes() { return mvt_range( - MVT::FIRST_INTEGER_VECTOR_VALUETYPE, - (MVT::SimpleValueType)(MVT::LAST_INTEGER_VECTOR_VALUETYPE + 1)); + MVT::FIRST_FIXEDLEN_VECTOR_VALUETYPE, + 
(MVT::SimpleValueType)(MVT::LAST_FIXEDLEN_VECTOR_VALUETYPE + 1)); } - static mvt_range fp_vector_valuetypes() { + static mvt_range scalable_vector_valuetypes() { return mvt_range( - MVT::FIRST_FP_VECTOR_VALUETYPE, - (MVT::SimpleValueType)(MVT::LAST_FP_VECTOR_VALUETYPE + 1)); + MVT::FIRST_SCALABLE_VECTOR_VALUETYPE, + (MVT::SimpleValueType)(MVT::LAST_SCALABLE_VECTOR_VALUETYPE + 1)); + } + + static mvt_range integer_fixedlen_vector_valuetypes() { + return mvt_range( + MVT::FIRST_INTEGER_FIXEDLEN_VECTOR_VALUETYPE, + (MVT::SimpleValueType)(MVT::LAST_INTEGER_FIXEDLEN_VECTOR_VALUETYPE + 1)); + } + + static mvt_range fp_fixedlen_vector_valuetypes() { + return mvt_range( + MVT::FIRST_FP_FIXEDLEN_VECTOR_VALUETYPE, + (MVT::SimpleValueType)(MVT::LAST_FP_FIXEDLEN_VECTOR_VALUETYPE + 1)); } static mvt_range integer_scalable_vector_valuetypes() { - return mvt_range(MVT::FIRST_INTEGER_SCALABLE_VALUETYPE, - (MVT::SimpleValueType)(MVT::LAST_INTEGER_SCALABLE_VALUETYPE + 1)); + return mvt_range( + MVT::FIRST_INTEGER_SCALABLE_VECTOR_VALUETYPE, + (MVT::SimpleValueType)(MVT::LAST_INTEGER_SCALABLE_VECTOR_VALUETYPE + 1)); } static mvt_range fp_scalable_vector_valuetypes() { - return mvt_range(MVT::FIRST_FP_SCALABLE_VALUETYPE, - (MVT::SimpleValueType)(MVT::LAST_FP_SCALABLE_VALUETYPE + 1)); + return mvt_range( + MVT::FIRST_FP_SCALABLE_VECTOR_VALUETYPE, + (MVT::SimpleValueType)(MVT::LAST_FP_SCALABLE_VECTOR_VALUETYPE + 1)); } /// @} }; diff --git a/include/llvm/Support/MathExtras.h b/include/llvm/Support/MathExtras.h index 249139e824b5..004a6f5f6eb8 100644 --- a/include/llvm/Support/MathExtras.h +++ b/include/llvm/Support/MathExtras.h @@ -39,6 +39,7 @@ unsigned char _BitScanReverse64(unsigned long *_Index, unsigned __int64 _Mask); #endif namespace llvm { + /// The behavior an operation has on an input of 0. enum ZeroBehavior { /// The returned value is undefined. @@ -49,6 +50,42 @@ enum ZeroBehavior { ZB_Width }; +/// Mathematical constants. +namespace numbers { +// TODO: Track C++20 std::numbers. +// TODO: Favor using the hexadecimal FP constants (requires C++17). 
+constexpr double e = 2.7182818284590452354, // (0x1.5bf0a8b145749P+1) https://oeis.org/A001113 + egamma = .57721566490153286061, // (0x1.2788cfc6fb619P-1) https://oeis.org/A001620 + ln2 = .69314718055994530942, // (0x1.62e42fefa39efP-1) https://oeis.org/A002162 + ln10 = 2.3025850929940456840, // (0x1.24bb1bbb55516P+1) https://oeis.org/A002392 + log2e = 1.4426950408889634074, // (0x1.71547652b82feP+0) + log10e = .43429448190325182765, // (0x1.bcb7b1526e50eP-2) + pi = 3.1415926535897932385, // (0x1.921fb54442d18P+1) https://oeis.org/A000796 + inv_pi = .31830988618379067154, // (0x1.45f306bc9c883P-2) https://oeis.org/A049541 + sqrtpi = 1.7724538509055160273, // (0x1.c5bf891b4ef6bP+0) https://oeis.org/A002161 + inv_sqrtpi = .56418958354775628695, // (0x1.20dd750429b6dP-1) https://oeis.org/A087197 + sqrt2 = 1.4142135623730950488, // (0x1.6a09e667f3bcdP+0) https://oeis.org/A00219 + inv_sqrt2 = .70710678118654752440, // (0x1.6a09e667f3bcdP-1) + sqrt3 = 1.7320508075688772935, // (0x1.bb67ae8584caaP+0) https://oeis.org/A002194 + inv_sqrt3 = .57735026918962576451, // (0x1.279a74590331cP-1) + phi = 1.6180339887498948482; // (0x1.9e3779b97f4a8P+0) https://oeis.org/A001622 +constexpr float ef = 2.71828183F, // (0x1.5bf0a8P+1) https://oeis.org/A001113 + egammaf = .577215665F, // (0x1.2788d0P-1) https://oeis.org/A001620 + ln2f = .693147181F, // (0x1.62e430P-1) https://oeis.org/A002162 + ln10f = 2.30258509F, // (0x1.26bb1cP+1) https://oeis.org/A002392 + log2ef = 1.44269504F, // (0x1.715476P+0) + log10ef = .434294482F, // (0x1.bcb7b2P-2) + pif = 3.14159265F, // (0x1.921fb6P+1) https://oeis.org/A000796 + inv_pif = .318309886F, // (0x1.45f306P-2) https://oeis.org/A049541 + sqrtpif = 1.77245385F, // (0x1.c5bf8aP+0) https://oeis.org/A002161 + inv_sqrtpif = .564189584F, // (0x1.20dd76P-1) https://oeis.org/A087197 + sqrt2f = 1.41421356F, // (0x1.6a09e6P+0) https://oeis.org/A002193 + inv_sqrt2f = .707106781F, // (0x1.6a09e6P-1) + sqrt3f = 1.73205081F, // (0x1.bb67aeP+0) https://oeis.org/A002194 + inv_sqrt3f = .577350269F, // (0x1.279a74P-1) + phif = 1.61803399F; // (0x1.9e377aP+0) https://oeis.org/A001622 +} // namespace numbers + namespace detail { template struct TrailingZerosCounter { static unsigned count(T Val, ZeroBehavior) { @@ -73,13 +110,13 @@ template struct TrailingZerosCounter { } }; -#if __GNUC__ >= 4 || defined(_MSC_VER) +#if defined(__GNUC__) || defined(_MSC_VER) template struct TrailingZerosCounter { static unsigned count(T Val, ZeroBehavior ZB) { if (ZB != ZB_Undefined && Val == 0) return 32; -#if __has_builtin(__builtin_ctz) || LLVM_GNUC_PREREQ(4, 0, 0) +#if __has_builtin(__builtin_ctz) || defined(__GNUC__) return __builtin_ctz(Val); #elif defined(_MSC_VER) unsigned long Index; @@ -95,7 +132,7 @@ template struct TrailingZerosCounter { if (ZB != ZB_Undefined && Val == 0) return 64; -#if __has_builtin(__builtin_ctzll) || LLVM_GNUC_PREREQ(4, 0, 0) +#if __has_builtin(__builtin_ctzll) || defined(__GNUC__) return __builtin_ctzll(Val); #elif defined(_MSC_VER) unsigned long Index; @@ -142,13 +179,13 @@ template struct LeadingZerosCounter { } }; -#if __GNUC__ >= 4 || defined(_MSC_VER) +#if defined(__GNUC__) || defined(_MSC_VER) template struct LeadingZerosCounter { static unsigned count(T Val, ZeroBehavior ZB) { if (ZB != ZB_Undefined && Val == 0) return 32; -#if __has_builtin(__builtin_clz) || LLVM_GNUC_PREREQ(4, 0, 0) +#if __has_builtin(__builtin_clz) || defined(__GNUC__) return __builtin_clz(Val); #elif defined(_MSC_VER) unsigned long Index; @@ -164,7 +201,7 @@ template struct LeadingZerosCounter { 
if (ZB != ZB_Undefined && Val == 0) return 64; -#if __has_builtin(__builtin_clzll) || LLVM_GNUC_PREREQ(4, 0, 0) +#if __has_builtin(__builtin_clzll) || defined(__GNUC__) return __builtin_clzll(Val); #elif defined(_MSC_VER) unsigned long Index; @@ -486,7 +523,7 @@ template struct PopulationCounter { static unsigned count(T Value) { // Generic version, forward to 32 bits. static_assert(SizeOfT <= 4, "Not implemented!"); -#if __GNUC__ >= 4 +#if defined(__GNUC__) return __builtin_popcount(Value); #else uint32_t v = Value; @@ -499,7 +536,7 @@ template struct PopulationCounter { template struct PopulationCounter { static unsigned count(T Value) { -#if __GNUC__ >= 4 +#if defined(__GNUC__) return __builtin_popcountll(Value); #else uint64_t v = Value; @@ -523,6 +560,16 @@ inline unsigned countPopulation(T Value) { return detail::PopulationCounter::count(Value); } +/// Compile time Log2. +/// Valid only for positive powers of two. +template constexpr inline size_t CTLog2() { + static_assert(kValue > 0 && llvm::isPowerOf2_64(kValue), + "Value is not a valid power of 2"); + return 1 + CTLog2(); +} + +template <> constexpr inline size_t CTLog2<1>() { return 0; } + /// Return the log base 2 of the specified value. inline double Log2(double Value) { #if defined(__ANDROID_API__) && __ANDROID_API__ < 18 @@ -620,25 +667,6 @@ constexpr inline uint64_t MinAlign(uint64_t A, uint64_t B) { return (A | B) & (1 + ~(A | B)); } -/// Aligns \c Addr to \c Alignment bytes, rounding up. -/// -/// Alignment should be a power of two. This method rounds up, so -/// alignAddr(7, 4) == 8 and alignAddr(8, 4) == 8. -inline uintptr_t alignAddr(const void *Addr, size_t Alignment) { - assert(Alignment && isPowerOf2_64((uint64_t)Alignment) && - "Alignment is not a power of two!"); - - assert((uintptr_t)Addr + Alignment - 1 >= (uintptr_t)Addr); - - return (((uintptr_t)Addr + Alignment - 1) & ~(uintptr_t)(Alignment - 1)); -} - -/// Returns the necessary adjustment for aligning \c Ptr to \c Alignment -/// bytes, rounding up. -inline size_t alignmentAdjustment(const void *Ptr, size_t Alignment) { - return alignAddr(Ptr, Alignment) - (uintptr_t)Ptr; -} - /// Returns the next power of two (in 64-bits) that is strictly greater than A. /// Returns zero on overflow. inline uint64_t NextPowerOf2(uint64_t A) { @@ -704,19 +732,6 @@ inline uint64_t divideCeil(uint64_t Numerator, uint64_t Denominator) { return alignTo(Numerator, Denominator) / Denominator; } -/// \c alignTo for contexts where a constant expression is required. -/// \sa alignTo -/// -/// \todo FIXME: remove when \c constexpr becomes really \c constexpr -template -struct AlignTo { - static_assert(Align != 0u, "Align must be non-zero"); - template - struct from_value { - static const uint64_t value = (Value + Align - 1) / Align * Align; - }; -}; - /// Returns the largest uint64_t less than or equal to \p Value and is /// \p Skew mod \p Align. \p Align must be non-zero inline uint64_t alignDown(uint64_t Value, uint64_t Align, uint64_t Skew = 0) { @@ -725,13 +740,6 @@ inline uint64_t alignDown(uint64_t Value, uint64_t Align, uint64_t Skew = 0) { return (Value - Skew) / Align * Align + Skew; } -/// Returns the offset to the next integer (mod 2**64) that is greater than -/// or equal to \p Value and is a multiple of \p Align. \p Align must be -/// non-zero. -inline uint64_t OffsetToAlignment(uint64_t Value, uint64_t Align) { - return alignTo(Value, Align) - Value; -} - /// Sign-extend the number in the bottom B bits of X to a 32-bit integer. /// Requires 0 < B <= 32. 
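A tiny sketch of the MathExtras.h additions above: the llvm::numbers constants and the compile-time CTLog2 helper, which is valid only for powers of two. circleArea is a hypothetical name.

#include "llvm/Support/MathExtras.h"

static_assert(llvm::CTLog2<64>() == 6, "64 is 2^6");
static_assert(llvm::CTLog2<1>() == 0, "base case of the recursion");

static double circleArea(double Radius) {
  // Prefer the shared constant over a local #define of PI.
  return llvm::numbers::pi * Radius * Radius;
}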
template constexpr inline int32_t SignExtend32(uint32_t X) { @@ -853,6 +861,91 @@ SaturatingMultiplyAdd(T X, T Y, T A, bool *ResultOverflowed = nullptr) { /// Use this rather than HUGE_VALF; the latter causes warnings on MSVC. extern const float huge_valf; + + +/// Add two signed integers, computing the two's complement truncated result, +/// returning true if overflow occured. +template +typename std::enable_if::value, T>::type +AddOverflow(T X, T Y, T &Result) { +#if __has_builtin(__builtin_add_overflow) + return __builtin_add_overflow(X, Y, &Result); +#else + // Perform the unsigned addition. + using U = typename std::make_unsigned::type; + const U UX = static_cast(X); + const U UY = static_cast(Y); + const U UResult = UX + UY; + + // Convert to signed. + Result = static_cast(UResult); + + // Adding two positive numbers should result in a positive number. + if (X > 0 && Y > 0) + return Result <= 0; + // Adding two negatives should result in a negative number. + if (X < 0 && Y < 0) + return Result >= 0; + return false; +#endif +} + +/// Subtract two signed integers, computing the two's complement truncated +/// result, returning true if an overflow ocurred. +template +typename std::enable_if::value, T>::type +SubOverflow(T X, T Y, T &Result) { +#if __has_builtin(__builtin_sub_overflow) + return __builtin_sub_overflow(X, Y, &Result); +#else + // Perform the unsigned addition. + using U = typename std::make_unsigned::type; + const U UX = static_cast(X); + const U UY = static_cast(Y); + const U UResult = UX - UY; + + // Convert to signed. + Result = static_cast(UResult); + + // Subtracting a positive number from a negative results in a negative number. + if (X <= 0 && Y > 0) + return Result >= 0; + // Subtracting a negative number from a positive results in a positive number. + if (X >= 0 && Y < 0) + return Result <= 0; + return false; +#endif +} + + +/// Multiply two signed integers, computing the two's complement truncated +/// result, returning true if an overflow ocurred. +template +typename std::enable_if::value, T>::type +MulOverflow(T X, T Y, T &Result) { + // Perform the unsigned multiplication on absolute values. + using U = typename std::make_unsigned::type; + const U UX = X < 0 ? (0 - static_cast(X)) : static_cast(X); + const U UY = Y < 0 ? (0 - static_cast(Y)) : static_cast(Y); + const U UResult = UX * UY; + + // Convert to signed. + const bool IsNegative = (X < 0) ^ (Y < 0); + Result = IsNegative ? (0 - UResult) : UResult; + + // If any of the args was 0, result is 0 and no overflow occurs. + if (UX == 0 || UY == 0) + return false; + + // UX and UY are in [1, 2^n], where n is the number of digits. + // Check how the max allowed absolute value (2^n for negative, 2^(n-1) for + // positive) divided by an argument compares to the other. + if (IsNegative) + return UX > (static_cast(std::numeric_limits::max()) + U(1)) / UY; + else + return UX > (static_cast(std::numeric_limits::max())) / UY; +} + } // End llvm namespace #endif diff --git a/include/llvm/Support/Mutex.h b/include/llvm/Support/Mutex.h index c3abfc7a7806..1d8a0d3c87cb 100644 --- a/include/llvm/Support/Mutex.h +++ b/include/llvm/Support/Mutex.h @@ -13,97 +13,31 @@ #ifndef LLVM_SUPPORT_MUTEX_H #define LLVM_SUPPORT_MUTEX_H -#include "llvm/Config/llvm-config.h" -#include "llvm/Support/Compiler.h" #include "llvm/Support/Threading.h" #include +#include namespace llvm { namespace sys { - /// Platform agnostic Mutex class. 
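// Editor's note (illustrative sketch, not part of the imported patch): usage of
// the checked signed-arithmetic helpers added above. Each returns a nonzero
// value when the two's complement result overflowed; Result always receives the
// wrapped (truncated) value.
#include "llvm/Support/MathExtras.h"
#include <cstdint>
#include <limits>

inline bool overflowDemo() {
  int32_t Result;
  // INT32_MAX + 1 overflows; Result wraps around to INT32_MIN.
  bool AddOvf = llvm::AddOverflow<int32_t>(std::numeric_limits<int32_t>::max(), 1, Result);
  // 40 * 1000 fits in 32 bits, so no overflow and Result == 40000.
  bool MulOvf = llvm::MulOverflow<int32_t>(40, 1000, Result);
  return AddOvf && !MulOvf;
}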
- class MutexImpl - { - /// @name Constructors - /// @{ - public: - - /// Initializes the lock but doesn't acquire it. if \p recursive is set - /// to false, the lock will not be recursive which makes it cheaper but - /// also more likely to deadlock (same thread can't acquire more than - /// once). - /// Default Constructor. - explicit MutexImpl(bool recursive = true); - - /// Releases and removes the lock - /// Destructor - ~MutexImpl(); - - /// @} - /// @name Methods - /// @{ - public: - - /// Attempts to unconditionally acquire the lock. If the lock is held by - /// another thread, this method will wait until it can acquire the lock. - /// @returns false if any kind of error occurs, true otherwise. - /// Unconditionally acquire the lock. - bool acquire(); - - /// Attempts to release the lock. If the lock is held by the current - /// thread, the lock is released allowing other threads to acquire the - /// lock. - /// @returns false if any kind of error occurs, true otherwise. - /// Unconditionally release the lock. - bool release(); - - /// Attempts to acquire the lock without blocking. If the lock is not - /// available, this function returns false quickly (without blocking). If - /// the lock is available, it is acquired. - /// @returns false if any kind of error occurs or the lock is not - /// available, true otherwise. - /// Try to acquire the lock. - bool tryacquire(); - - //@} - /// @name Platform Dependent Data - /// @{ - private: -#if defined(LLVM_ENABLE_THREADS) && LLVM_ENABLE_THREADS != 0 - void* data_; ///< We don't know what the data will be -#endif - - /// @} - /// @name Do Not Implement - /// @{ - private: - MutexImpl(const MutexImpl &) = delete; - void operator=(const MutexImpl &) = delete; - /// @} - }; - - /// SmartMutex - A mutex with a compile time constant parameter that /// indicates whether this mutex should become a no-op when we're not /// running in multithreaded mode. template class SmartMutex { - MutexImpl impl; - unsigned acquired; - bool recursive; - public: - explicit SmartMutex(bool rec = true) : - impl(rec), acquired(0), recursive(rec) { } + std::recursive_mutex impl; + unsigned acquired = 0; + public: bool lock() { if (!mt_only || llvm_is_multithreaded()) { - return impl.acquire(); + impl.lock(); + return true; } else { // Single-threaded debugging code. This would be racy in // multithreaded mode, but provides not sanity checks in single // threaded mode. - assert((recursive || acquired == 0) && "Lock already acquired!!"); ++acquired; return true; } @@ -111,13 +45,13 @@ namespace llvm bool unlock() { if (!mt_only || llvm_is_multithreaded()) { - return impl.release(); + impl.unlock(); + return true; } else { // Single-threaded debugging code. This would be racy in // multithreaded mode, but provides not sanity checks in single // threaded mode. - assert(((recursive && acquired) || (acquired == 1)) && - "Lock not acquired before release!"); + assert(acquired && "Lock not acquired before release!"); --acquired; return true; } @@ -125,31 +59,16 @@ namespace llvm bool try_lock() { if (!mt_only || llvm_is_multithreaded()) - return impl.tryacquire(); + return impl.try_lock(); else return true; } - - private: - SmartMutex(const SmartMutex & original); - void operator=(const SmartMutex &); }; /// Mutex - A standard, always enforced mutex. 
typedef SmartMutex Mutex; - template - class SmartScopedLock { - SmartMutex& mtx; - - public: - SmartScopedLock(SmartMutex& m) : mtx(m) { - mtx.lock(); - } - - ~SmartScopedLock() { - mtx.unlock(); - } - }; + template + using SmartScopedLock = std::lock_guard>; typedef SmartScopedLock ScopedLock; } diff --git a/include/llvm/Support/MutexGuard.h b/include/llvm/Support/MutexGuard.h deleted file mode 100644 index d86ced145816..000000000000 --- a/include/llvm/Support/MutexGuard.h +++ /dev/null @@ -1,40 +0,0 @@ -//===-- Support/MutexGuard.h - Acquire/Release Mutex In Scope ---*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file defines a guard for a block of code that ensures a Mutex is locked -// upon construction and released upon destruction. -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_SUPPORT_MUTEXGUARD_H -#define LLVM_SUPPORT_MUTEXGUARD_H - -#include "llvm/Support/Mutex.h" - -namespace llvm { - /// Instances of this class acquire a given Mutex Lock when constructed and - /// hold that lock until destruction. The intention is to instantiate one of - /// these on the stack at the top of some scope to be assured that C++ - /// destruction of the object will always release the Mutex and thus avoid - /// a host of nasty multi-threading problems in the face of exceptions, etc. - /// Guard a section of code with a Mutex. - class MutexGuard { - sys::Mutex &M; - MutexGuard(const MutexGuard &) = delete; - void operator=(const MutexGuard &) = delete; - public: - MutexGuard(sys::Mutex &m) : M(m) { M.lock(); } - ~MutexGuard() { M.unlock(); } - /// holds - Returns true if this locker instance holds the specified lock. - /// This is mostly used in assertions to validate that the correct mutex - /// is held. - bool holds(const sys::Mutex& lock) const { return &M == &lock; } - }; -} - -#endif // LLVM_SUPPORT_MUTEXGUARD_H diff --git a/include/llvm/Support/OnDiskHashTable.h b/include/llvm/Support/OnDiskHashTable.h index d84da92aab9b..11dc0de0f354 100644 --- a/include/llvm/Support/OnDiskHashTable.h +++ b/include/llvm/Support/OnDiskHashTable.h @@ -13,6 +13,7 @@ #ifndef LLVM_SUPPORT_ONDISKHASHTABLE_H #define LLVM_SUPPORT_ONDISKHASHTABLE_H +#include "llvm/Support/Alignment.h" #include "llvm/Support/Allocator.h" #include "llvm/Support/DataTypes.h" #include "llvm/Support/EndianStream.h" @@ -207,7 +208,7 @@ public: // Pad with zeros so that we can start the hashtable at an aligned address. 
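// Editor's note (illustrative sketch, not part of the imported patch): with
// MutexGuard.h removed, the equivalent RAII pattern uses llvm::sys::ScopedLock,
// now an alias for std::lock_guard<llvm::sys::Mutex>. The names below are made
// up for illustration.
#include "llvm/Support/Mutex.h"

static llvm::sys::Mutex CounterLock;
static int Counter = 0;

inline void incrementCounter() {
  llvm::sys::ScopedLock Guard(CounterLock); // locks here, unlocks at scope exit
  ++Counter;
}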
offset_type TableOff = Out.tell(); - uint64_t N = llvm::OffsetToAlignment(TableOff, alignof(offset_type)); + uint64_t N = offsetToAlignment(TableOff, Align(alignof(offset_type))); TableOff += N; while (N--) LE.write(0); diff --git a/include/llvm/Support/Parallel.h b/include/llvm/Support/Parallel.h index eab9b492c4a5..3c0ed2c11127 100644 --- a/include/llvm/Support/Parallel.h +++ b/include/llvm/Support/Parallel.h @@ -18,14 +18,6 @@ #include #include -#if defined(_MSC_VER) && LLVM_ENABLE_THREADS -#pragma warning(push) -#pragma warning(disable : 4530) -#include -#include -#pragma warning(pop) -#endif - namespace llvm { namespace parallel { @@ -84,23 +76,6 @@ public: void sync() const { L.sync(); } }; -#if defined(_MSC_VER) -template -void parallel_sort(RandomAccessIterator Start, RandomAccessIterator End, - const Comparator &Comp) { - concurrency::parallel_sort(Start, End, Comp); -} -template -void parallel_for_each(IterTy Begin, IterTy End, FuncTy Fn) { - concurrency::parallel_for_each(Begin, End, Fn); -} - -template -void parallel_for_each_n(IndexTy Begin, IndexTy End, FuncTy Fn) { - concurrency::parallel_for(Begin, End, Fn); -} - -#else const ptrdiff_t MinParallelSize = 1024; /// Inclusive median. @@ -188,8 +163,6 @@ void parallel_for_each_n(IndexTy Begin, IndexTy End, FuncTy Fn) { #endif -#endif - template using DefComparator = std::less::value_type>; diff --git a/include/llvm/Support/RWMutex.h b/include/llvm/Support/RWMutex.h index 9cd57cbd65a1..150bc7dbbce1 100644 --- a/include/llvm/Support/RWMutex.h +++ b/include/llvm/Support/RWMutex.h @@ -16,161 +16,184 @@ #include "llvm/Config/llvm-config.h" #include "llvm/Support/Threading.h" #include +#include +#include + +// std::shared_timed_mutex is only availble on macOS 10.12 and later. +#if defined(__APPLE__) && defined(__ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__) +#if __ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__ < 101200 +#define LLVM_USE_RW_MUTEX_IMPL +#endif +#endif namespace llvm { namespace sys { - /// Platform agnostic RWMutex class. - class RWMutexImpl - { - /// @name Constructors - /// @{ - public: - - /// Initializes the lock but doesn't acquire it. - /// Default Constructor. - explicit RWMutexImpl(); - - /// @} - /// @name Do Not Implement - /// @{ - RWMutexImpl(const RWMutexImpl & original) = delete; - RWMutexImpl &operator=(const RWMutexImpl &) = delete; - /// @} - - /// Releases and removes the lock - /// Destructor - ~RWMutexImpl(); - - /// @} - /// @name Methods - /// @{ - public: - - /// Attempts to unconditionally acquire the lock in reader mode. If the - /// lock is held by a writer, this method will wait until it can acquire - /// the lock. - /// @returns false if any kind of error occurs, true otherwise. - /// Unconditionally acquire the lock in reader mode. - bool reader_acquire(); - - /// Attempts to release the lock in reader mode. - /// @returns false if any kind of error occurs, true otherwise. - /// Unconditionally release the lock in reader mode. - bool reader_release(); - - /// Attempts to unconditionally acquire the lock in reader mode. If the - /// lock is held by any readers, this method will wait until it can - /// acquire the lock. - /// @returns false if any kind of error occurs, true otherwise. - /// Unconditionally acquire the lock in writer mode. - bool writer_acquire(); - - /// Attempts to release the lock in writer mode. - /// @returns false if any kind of error occurs, true otherwise. - /// Unconditionally release the lock in write mode. 
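// Editor's note (illustrative sketch, not part of the imported patch): the
// OnDiskHashTable change above swaps the removed llvm::OffsetToAlignment for
// the Align-based offsetToAlignment from llvm/Support/Alignment.h; the result
// is the padding needed to reach the next aligned offset.
#include "llvm/Support/Alignment.h"
#include <cstdint>

inline uint64_t paddingToEight(uint64_t Offset) {
  return llvm::offsetToAlignment(Offset, llvm::Align(8)); // e.g. 5 -> 3, 16 -> 0
}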
- bool writer_release(); - - //@} - /// @name Platform Dependent Data - /// @{ - private: +#if defined(LLVM_USE_RW_MUTEX_IMPL) +/// Platform agnostic RWMutex class. +class RWMutexImpl { + /// @name Constructors + /// @{ +public: + /// Initializes the lock but doesn't acquire it. + /// Default Constructor. + explicit RWMutexImpl(); + + /// @} + /// @name Do Not Implement + /// @{ + RWMutexImpl(const RWMutexImpl &original) = delete; + RWMutexImpl &operator=(const RWMutexImpl &) = delete; + /// @} + + /// Releases and removes the lock + /// Destructor + ~RWMutexImpl(); + + /// @} + /// @name Methods + /// @{ +public: + /// Attempts to unconditionally acquire the lock in reader mode. If the + /// lock is held by a writer, this method will wait until it can acquire + /// the lock. + /// @returns false if any kind of error occurs, true otherwise. + /// Unconditionally acquire the lock in reader mode. + bool lock_shared(); + + /// Attempts to release the lock in reader mode. + /// @returns false if any kind of error occurs, true otherwise. + /// Unconditionally release the lock in reader mode. + bool unlock_shared(); + + /// Attempts to unconditionally acquire the lock in reader mode. If the + /// lock is held by any readers, this method will wait until it can + /// acquire the lock. + /// @returns false if any kind of error occurs, true otherwise. + /// Unconditionally acquire the lock in writer mode. + bool lock(); + + /// Attempts to release the lock in writer mode. + /// @returns false if any kind of error occurs, true otherwise. + /// Unconditionally release the lock in write mode. + bool unlock(); + + //@} + /// @name Platform Dependent Data + /// @{ +private: #if defined(LLVM_ENABLE_THREADS) && LLVM_ENABLE_THREADS != 0 - void* data_ = nullptr; ///< We don't know what the data will be + void *data_ = nullptr; ///< We don't know what the data will be +#endif +}; +#endif + +/// SmartMutex - An R/W mutex with a compile time constant parameter that +/// indicates whether this mutex should become a no-op when we're not +/// running in multithreaded mode. +template class SmartRWMutex { + // shared_mutex (C++17) is more efficient than shared_timed_mutex (C++14) + // on Windows and always available on MSVC. +#if defined(_MSC_VER) || __cplusplus > 201402L + std::shared_mutex impl; +#else +#if !defined(LLVM_USE_RW_MUTEX_IMPL) + std::shared_timed_mutex impl; +#else + RWMutexImpl impl; +#endif +#endif + unsigned readers = 0; + unsigned writers = 0; + +public: + bool lock_shared() { + if (!mt_only || llvm_is_multithreaded()) { + impl.lock_shared(); + return true; + } + + // Single-threaded debugging code. This would be racy in multithreaded + // mode, but provides not sanity checks in single threaded mode. + ++readers; + return true; + } + + bool unlock_shared() { + if (!mt_only || llvm_is_multithreaded()) { + impl.unlock_shared(); + return true; + } + + // Single-threaded debugging code. This would be racy in multithreaded + // mode, but provides not sanity checks in single threaded mode. + assert(readers > 0 && "Reader lock not acquired before release!"); + --readers; + return true; + } + + bool lock() { + if (!mt_only || llvm_is_multithreaded()) { + impl.lock(); + return true; + } + + // Single-threaded debugging code. This would be racy in multithreaded + // mode, but provides not sanity checks in single threaded mode. 
+ assert(writers == 0 && "Writer lock already acquired!"); + ++writers; + return true; + } + + bool unlock() { + if (!mt_only || llvm_is_multithreaded()) { + impl.unlock(); + return true; + } + + // Single-threaded debugging code. This would be racy in multithreaded + // mode, but provides not sanity checks in single threaded mode. + assert(writers == 1 && "Writer lock not acquired before release!"); + --writers; + return true; + } +}; + +typedef SmartRWMutex RWMutex; + +/// ScopedReader - RAII acquisition of a reader lock +#if !defined(LLVM_USE_RW_MUTEX_IMPL) +template +using SmartScopedReader = const std::shared_lock>; +#else +template struct SmartScopedReader { + SmartRWMutex &mutex; + + explicit SmartScopedReader(SmartRWMutex &m) : mutex(m) { + mutex.lock_shared(); + } + + ~SmartScopedReader() { mutex.unlock_shared(); } +}; +#endif +typedef SmartScopedReader ScopedReader; + +/// ScopedWriter - RAII acquisition of a writer lock +#if !defined(LLVM_USE_RW_MUTEX_IMPL) +template +using SmartScopedWriter = std::lock_guard>; +#else +template struct SmartScopedWriter { + SmartRWMutex &mutex; + + explicit SmartScopedWriter(SmartRWMutex &m) : mutex(m) { + mutex.lock(); + } + + ~SmartScopedWriter() { mutex.unlock(); } +}; #endif - }; - - /// SmartMutex - An R/W mutex with a compile time constant parameter that - /// indicates whether this mutex should become a no-op when we're not - /// running in multithreaded mode. - template - class SmartRWMutex { - RWMutexImpl impl; - unsigned readers = 0; - unsigned writers = 0; - - public: - explicit SmartRWMutex() = default; - SmartRWMutex(const SmartRWMutex & original) = delete; - SmartRWMutex &operator=(const SmartRWMutex &) = delete; - - bool lock_shared() { - if (!mt_only || llvm_is_multithreaded()) - return impl.reader_acquire(); - - // Single-threaded debugging code. This would be racy in multithreaded - // mode, but provides not sanity checks in single threaded mode. - ++readers; - return true; - } - - bool unlock_shared() { - if (!mt_only || llvm_is_multithreaded()) - return impl.reader_release(); - - // Single-threaded debugging code. This would be racy in multithreaded - // mode, but provides not sanity checks in single threaded mode. - assert(readers > 0 && "Reader lock not acquired before release!"); - --readers; - return true; - } - - bool lock() { - if (!mt_only || llvm_is_multithreaded()) - return impl.writer_acquire(); - - // Single-threaded debugging code. This would be racy in multithreaded - // mode, but provides not sanity checks in single threaded mode. - assert(writers == 0 && "Writer lock already acquired!"); - ++writers; - return true; - } - - bool unlock() { - if (!mt_only || llvm_is_multithreaded()) - return impl.writer_release(); - - // Single-threaded debugging code. This would be racy in multithreaded - // mode, but provides not sanity checks in single threaded mode. 
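// Editor's note (illustrative sketch, not part of the imported patch): the
// reader/writer RAII helpers declared above in use with llvm::sys::RWMutex
// (backed by std::shared_timed_mutex, or std::shared_mutex where available).
// The names below are made up for illustration.
#include "llvm/Support/RWMutex.h"

static llvm::sys::RWMutex TableLock;
static int Table[16];

inline int readEntry(unsigned I) {
  llvm::sys::ScopedReader R(TableLock); // shared (reader) lock
  return Table[I & 15];
}

inline void writeEntry(unsigned I, int V) {
  llvm::sys::ScopedWriter W(TableLock); // exclusive (writer) lock
  Table[I & 15] = V;
}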
- assert(writers == 1 && "Writer lock not acquired before release!"); - --writers; - return true; - } - }; - - typedef SmartRWMutex RWMutex; - - /// ScopedReader - RAII acquisition of a reader lock - template - struct SmartScopedReader { - SmartRWMutex& mutex; - - explicit SmartScopedReader(SmartRWMutex& m) : mutex(m) { - mutex.lock_shared(); - } - - ~SmartScopedReader() { - mutex.unlock_shared(); - } - }; - - typedef SmartScopedReader ScopedReader; - - /// ScopedWriter - RAII acquisition of a writer lock - template - struct SmartScopedWriter { - SmartRWMutex& mutex; - - explicit SmartScopedWriter(SmartRWMutex& m) : mutex(m) { - mutex.lock(); - } - - ~SmartScopedWriter() { - mutex.unlock(); - } - }; - - typedef SmartScopedWriter ScopedWriter; +typedef SmartScopedWriter ScopedWriter; } // end namespace sys } // end namespace llvm diff --git a/include/llvm/Support/Regex.h b/include/llvm/Support/Regex.h index 2d19b10fd890..b2620ab4cfc9 100644 --- a/include/llvm/Support/Regex.h +++ b/include/llvm/Support/Regex.h @@ -44,6 +44,9 @@ namespace llvm { Regex(); /// Compiles the given regular expression \p Regex. + /// + /// \param Regex - referenced string is no longer needed after this + /// constructor does finish. Only its compiled form is kept stored. Regex(StringRef Regex, unsigned Flags = NoFlags); Regex(const Regex &) = delete; Regex &operator=(Regex regex) { @@ -54,9 +57,10 @@ namespace llvm { Regex(Regex &®ex); ~Regex(); - /// isValid - returns the error encountered during regex compilation, or - /// matching, if any. + /// isValid - returns the error encountered during regex compilation, if + /// any. bool isValid(std::string &Error) const; + bool isValid() const { return !error; } /// getNumMatches - In a valid regex, return the number of parenthesized /// matches it contains. The number filled in by match will include this @@ -69,8 +73,12 @@ namespace llvm { /// with references to the matched group expressions (inside \p String), /// the first group is always the entire pattern. /// + /// \param Error - If non-null, any errors in the matching will be recorded + /// as a non-empty string. If there is no error, it will be an empty string. + /// /// This returns true on a successful match. - bool match(StringRef String, SmallVectorImpl *Matches = nullptr); + bool match(StringRef String, SmallVectorImpl *Matches = nullptr, + std::string *Error = nullptr) const; /// sub - Return the result of replacing the first match of the regex in /// \p String with the \p Repl string. Backreferences like "\0" in the @@ -81,9 +89,9 @@ namespace llvm { /// /// \param Error If non-null, any errors in the substitution (invalid /// backreferences, trailing backslashes) will be recorded as a non-empty - /// string. + /// string. If there is no error, it will be an empty string. std::string sub(StringRef Repl, StringRef String, - std::string *Error = nullptr); + std::string *Error = nullptr) const; /// If this function returns true, ^Str$ is an extended regular /// expression that matches Str and only Str. 
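// Editor's note (illustrative sketch, not part of the imported patch): the
// revised llvm::Regex API shown above -- match() is now const and can report
// matching errors through an optional std::string out-parameter. The pattern
// and helper below are made up for illustration.
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/Regex.h"
#include <string>

inline bool findVersionMajor(llvm::StringRef Line, llvm::StringRef &Major) {
  llvm::Regex RE("^version ([0-9]+)");
  std::string CompileError;
  if (!RE.isValid(CompileError))
    return false; // pattern failed to compile
  llvm::SmallVector<llvm::StringRef, 2> Matches;
  std::string MatchError;
  if (!RE.match(Line, &Matches, &MatchError) || !MatchError.empty())
    return false;
  Major = Matches[1]; // group 0 is the whole match, group 1 the paren group
  return true;
}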
diff --git a/include/llvm/Support/Registry.h b/include/llvm/Support/Registry.h index 4d8aa5f1470d..5bb6a254a47f 100644 --- a/include/llvm/Support/Registry.h +++ b/include/llvm/Support/Registry.h @@ -115,7 +115,7 @@ namespace llvm { entry Entry; node Node; - static std::unique_ptr CtorFn() { return make_unique(); } + static std::unique_ptr CtorFn() { return std::make_unique(); } public: Add(StringRef Name, StringRef Desc) diff --git a/include/llvm/Support/SHA1.h b/include/llvm/Support/SHA1.h index 87fe94bbd5cd..2cfbd2179364 100644 --- a/include/llvm/Support/SHA1.h +++ b/include/llvm/Support/SHA1.h @@ -16,13 +16,13 @@ #define LLVM_SUPPORT_SHA1_H #include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/StringRef.h" #include #include namespace llvm { template class ArrayRef; -class StringRef; /// A class that wrap the SHA1 algorithm. class SHA1 { diff --git a/include/llvm/Support/ScalableSize.h b/include/llvm/Support/ScalableSize.h deleted file mode 100644 index 96bf043773a0..000000000000 --- a/include/llvm/Support/ScalableSize.h +++ /dev/null @@ -1,43 +0,0 @@ -//===- ScalableSize.h - Scalable vector size info ---------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file provides a struct that can be used to query the size of IR types -// which may be scalable vectors. It provides convenience operators so that -// it can be used in much the same way as a single scalar value. -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_SUPPORT_SCALABLESIZE_H -#define LLVM_SUPPORT_SCALABLESIZE_H - -namespace llvm { - -class ElementCount { -public: - unsigned Min; // Minimum number of vector elements. - bool Scalable; // If true, NumElements is a multiple of 'Min' determined - // at runtime rather than compile time. - - ElementCount(unsigned Min, bool Scalable) - : Min(Min), Scalable(Scalable) {} - - ElementCount operator*(unsigned RHS) { - return { Min * RHS, Scalable }; - } - ElementCount operator/(unsigned RHS) { - return { Min / RHS, Scalable }; - } - - bool operator==(const ElementCount& RHS) const { - return Min == RHS.Min && Scalable == RHS.Scalable; - } -}; - -} // end namespace llvm - -#endif // LLVM_SUPPORT_SCALABLESIZE_H diff --git a/include/llvm/Support/Signals.h b/include/llvm/Support/Signals.h index a6b215a24311..a4f1fad22dd5 100644 --- a/include/llvm/Support/Signals.h +++ b/include/llvm/Support/Signals.h @@ -84,6 +84,17 @@ namespace sys { /// function. Note also that the handler may be executed on a different /// thread on some platforms. void SetInfoSignalFunction(void (*Handler)()); + + /// Registers a function to be called when a "pipe" signal is delivered to + /// the process. + /// + /// The "pipe" signal typically indicates a failed write to a pipe (SIGPIPE). + /// The default installed handler calls `exit(EX_IOERR)`, causing the process + /// to immediately exit with an IO error exit code. + /// + /// This function is only applicable on POSIX systems. 
+ void SetPipeSignalFunction(void (*Handler)()); + } // End sys namespace } // End llvm namespace diff --git a/include/llvm/Support/SwapByteOrder.h b/include/llvm/Support/SwapByteOrder.h index 06a447a27c2a..6cec87006c02 100644 --- a/include/llvm/Support/SwapByteOrder.h +++ b/include/llvm/Support/SwapByteOrder.h @@ -22,9 +22,37 @@ #include #endif +#if defined(__linux__) || defined(__GNU__) || defined(__HAIKU__) +#include +#elif defined(_AIX) +#include +#elif defined(__sun) +/* Solaris provides _BIG_ENDIAN/_LITTLE_ENDIAN selector in sys/types.h */ +#include +#define BIG_ENDIAN 4321 +#define LITTLE_ENDIAN 1234 +#if defined(_BIG_ENDIAN) +#define BYTE_ORDER BIG_ENDIAN +#else +#define BYTE_ORDER LITTLE_ENDIAN +#endif +#else +#if !defined(BYTE_ORDER) && !defined(_WIN32) +#include +#endif +#endif + namespace llvm { namespace sys { +#if defined(BYTE_ORDER) && defined(BIG_ENDIAN) && BYTE_ORDER == BIG_ENDIAN +constexpr bool IsBigEndianHost = true; +#else +constexpr bool IsBigEndianHost = false; +#endif + +static const bool IsLittleEndianHost = !IsBigEndianHost; + /// SwapByteOrder_16 - This function returns a byte-swapped representation of /// the 16-bit argument. inline uint16_t SwapByteOrder_16(uint16_t value) { @@ -39,10 +67,9 @@ inline uint16_t SwapByteOrder_16(uint16_t value) { #endif } -/// SwapByteOrder_32 - This function returns a byte-swapped representation of -/// the 32-bit argument. +/// This function returns a byte-swapped representation of the 32-bit argument. inline uint32_t SwapByteOrder_32(uint32_t value) { -#if defined(__llvm__) || (LLVM_GNUC_PREREQ(4, 3, 0) && !defined(__ICC)) +#if defined(__llvm__) || (defined(__GNUC__) && !defined(__ICC)) return __builtin_bswap32(value); #elif defined(_MSC_VER) && !defined(_DEBUG) return _byteswap_ulong(value); @@ -55,10 +82,9 @@ inline uint32_t SwapByteOrder_32(uint32_t value) { #endif } -/// SwapByteOrder_64 - This function returns a byte-swapped representation of -/// the 64-bit argument. +/// This function returns a byte-swapped representation of the 64-bit argument. inline uint64_t SwapByteOrder_64(uint64_t value) { -#if defined(__llvm__) || (LLVM_GNUC_PREREQ(4, 3, 0) && !defined(__ICC)) +#if defined(__llvm__) || (defined(__GNUC__) && !defined(__ICC)) return __builtin_bswap64(value); #elif defined(_MSC_VER) && !defined(_DEBUG) return _byteswap_uint64(value); diff --git a/include/llvm/Support/TargetOpcodes.def b/include/llvm/Support/TargetOpcodes.def index 598c1064efd0..11731ac35415 100644 --- a/include/llvm/Support/TargetOpcodes.def +++ b/include/llvm/Support/TargetOpcodes.def @@ -294,9 +294,21 @@ HANDLE_TARGET_OPCODE(G_SEXTLOAD) /// Generic zeroext load HANDLE_TARGET_OPCODE(G_ZEXTLOAD) +/// Generic indexed load (including anyext load) +HANDLE_TARGET_OPCODE(G_INDEXED_LOAD) + +/// Generic indexed signext load +HANDLE_TARGET_OPCODE(G_INDEXED_SEXTLOAD) + +/// Generic indexed zeroext load +HANDLE_TARGET_OPCODE(G_INDEXED_ZEXTLOAD) + /// Generic store. HANDLE_TARGET_OPCODE(G_STORE) +/// Generic indexed store. +HANDLE_TARGET_OPCODE(G_INDEXED_STORE) + /// Generic atomic cmpxchg with internal success check. 
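// Editor's note (illustrative sketch, not part of the imported patch): the host
// endianness constants and byte-swap helpers from SwapByteOrder.h above make
// explicit endianness conversions straightforward.
#include "llvm/Support/SwapByteOrder.h"
#include <cstdint>

inline uint32_t toLittleEndian32(uint32_t HostValue) {
  // Big-endian hosts must swap to produce a little-endian on-disk layout;
  // little-endian hosts already have the bytes in the right order.
  return llvm::sys::IsBigEndianHost ? llvm::sys::SwapByteOrder_32(HostValue)
                                    : HostValue;
}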
HANDLE_TARGET_OPCODE(G_ATOMIC_CMPXCHG_WITH_SUCCESS) @@ -315,6 +327,8 @@ HANDLE_TARGET_OPCODE(G_ATOMICRMW_MAX) HANDLE_TARGET_OPCODE(G_ATOMICRMW_MIN) HANDLE_TARGET_OPCODE(G_ATOMICRMW_UMAX) HANDLE_TARGET_OPCODE(G_ATOMICRMW_UMIN) +HANDLE_TARGET_OPCODE(G_ATOMICRMW_FADD) +HANDLE_TARGET_OPCODE(G_ATOMICRMW_FSUB) // Generic atomic fence HANDLE_TARGET_OPCODE(G_FENCE) @@ -354,6 +368,7 @@ HANDLE_TARGET_OPCODE(G_VAARG) // Generic sign extend HANDLE_TARGET_OPCODE(G_SEXT) +HANDLE_TARGET_OPCODE(G_SEXT_INREG) // Generic zero extend HANDLE_TARGET_OPCODE(G_ZEXT) @@ -436,6 +451,9 @@ HANDLE_TARGET_OPCODE(G_FMUL) /// Generic FMA multiplication. Behaves like llvm fma intrinsic HANDLE_TARGET_OPCODE(G_FMA) +/// Generic FP multiply and add. Behaves as separate fmul and fadd. +HANDLE_TARGET_OPCODE(G_FMAD) + /// Generic FP division. HANDLE_TARGET_OPCODE(G_FDIV) @@ -557,6 +575,9 @@ HANDLE_TARGET_OPCODE(G_CTPOP) /// Generic byte swap. HANDLE_TARGET_OPCODE(G_BSWAP) +/// Generic bit reverse. +HANDLE_TARGET_OPCODE(G_BITREVERSE) + /// Floating point ceil. HANDLE_TARGET_OPCODE(G_FCEIL) @@ -587,12 +608,15 @@ HANDLE_TARGET_OPCODE(G_BLOCK_ADDR) /// Generic jump table address HANDLE_TARGET_OPCODE(G_JUMP_TABLE) +/// Generic dynamic stack allocation. +HANDLE_TARGET_OPCODE(G_DYN_STACKALLOC) + // TODO: Add more generic opcodes as we move along. /// Marker for the end of the generic opcode. /// This is used to check if an opcode is in the range of the /// generic opcodes. -HANDLE_TARGET_OPCODE_MARKER(PRE_ISEL_GENERIC_OPCODE_END, G_JUMP_TABLE) +HANDLE_TARGET_OPCODE_MARKER(PRE_ISEL_GENERIC_OPCODE_END, G_DYN_STACKALLOC) /// BUILTIN_OP_END - This must be the last enum value in this list. /// The target-specific post-isel opcode values start here. diff --git a/include/llvm/Support/TargetRegistry.h b/include/llvm/Support/TargetRegistry.h index bf75650760d0..f4bc26b858c8 100644 --- a/include/llvm/Support/TargetRegistry.h +++ b/include/llvm/Support/TargetRegistry.h @@ -510,8 +510,8 @@ public: std::move(Emitter), RelaxAll); break; case Triple::XCOFF: - S = createXCOFFStreamer(Ctx, std::move(TAB), std::move(OW), - std::move(Emitter), RelaxAll); + S = createXCOFFStreamer(Ctx, std::move(TAB), std::move(OW), + std::move(Emitter), RelaxAll); break; } if (ObjectTargetStreamerCtorFn) diff --git a/include/llvm/Support/TimeProfiler.h b/include/llvm/Support/TimeProfiler.h index 72b6f7180bde..8cc430d0bc72 100644 --- a/include/llvm/Support/TimeProfiler.h +++ b/include/llvm/Support/TimeProfiler.h @@ -19,7 +19,7 @@ extern TimeTraceProfiler *TimeTraceProfilerInstance; /// Initialize the time trace profiler. /// This sets up the global \p TimeTraceProfilerInstance /// variable to be the profiler instance. -void timeTraceProfilerInitialize(); +void timeTraceProfilerInitialize(unsigned TimeTraceGranularity); /// Cleanup the time trace profiler, if it was initialized. void timeTraceProfilerCleanup(); diff --git a/include/llvm/Support/TrailingObjects.h b/include/llvm/Support/TrailingObjects.h index 8cf4f7aed7f8..49be89613c43 100644 --- a/include/llvm/Support/TrailingObjects.h +++ b/include/llvm/Support/TrailingObjects.h @@ -47,6 +47,7 @@ #define LLVM_SUPPORT_TRAILINGOBJECTS_H #include "llvm/Support/AlignOf.h" +#include "llvm/Support/Alignment.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/type_traits.h" @@ -87,11 +88,6 @@ protected: template struct OverloadToken {}; }; -/// This helper template works-around MSVC 2013's lack of useful -/// alignas() support. 
The argument to alignas(), in MSVC, is -/// required to be a literal integer. But, you *can* use template -/// specialization to select between a bunch of different alignas() -/// expressions... template class TrailingObjectsAligner : public TrailingObjectsBase {}; template <> @@ -172,7 +168,7 @@ protected: if (requiresRealignment()) return reinterpret_cast( - llvm::alignAddr(Ptr, alignof(NextTy))); + alignAddr(Ptr, Align::Of())); else return reinterpret_cast(Ptr); } @@ -186,7 +182,7 @@ protected: Obj, TrailingObjectsBase::OverloadToken()); if (requiresRealignment()) - return reinterpret_cast(llvm::alignAddr(Ptr, alignof(NextTy))); + return reinterpret_cast(alignAddr(Ptr, Align::Of())); else return reinterpret_cast(Ptr); } @@ -254,9 +250,7 @@ class TrailingObjects : private trailing_objects_internal::TrailingObjectsImpl< // because BaseTy isn't complete at class instantiation time, but // will be by the time this function is instantiated. static void verifyTrailingObjectsAssertions() { -#ifdef LLVM_IS_FINAL - static_assert(LLVM_IS_FINAL(BaseTy), "BaseTy must be final."); -#endif + static_assert(std::is_final(), "BaseTy must be final."); } // These two methods are the base of the recursion for this method. @@ -369,7 +363,9 @@ public: template struct FixedSizeStorage { template struct with_counts { enum { Size = totalSizeToAlloc(Counts...) }; - typedef llvm::AlignedCharArray type; + struct type { + alignas(BaseTy) char buffer[Size]; + }; }; }; diff --git a/include/llvm/Support/TypeSize.h b/include/llvm/Support/TypeSize.h new file mode 100644 index 000000000000..711679cdcacb --- /dev/null +++ b/include/llvm/Support/TypeSize.h @@ -0,0 +1,201 @@ +//===- TypeSize.h - Wrapper around type sizes -------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file provides a struct that can be used to query the size of IR types +// which may be scalable vectors. It provides convenience operators so that +// it can be used in much the same way as a single scalar value. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_SUPPORT_TYPESIZE_H +#define LLVM_SUPPORT_TYPESIZE_H + +#include +#include + +namespace llvm { + +class ElementCount { +public: + unsigned Min; // Minimum number of vector elements. + bool Scalable; // If true, NumElements is a multiple of 'Min' determined + // at runtime rather than compile time. + + ElementCount(unsigned Min, bool Scalable) + : Min(Min), Scalable(Scalable) {} + + ElementCount operator*(unsigned RHS) { + return { Min * RHS, Scalable }; + } + ElementCount operator/(unsigned RHS) { + return { Min / RHS, Scalable }; + } + + bool operator==(const ElementCount& RHS) const { + return Min == RHS.Min && Scalable == RHS.Scalable; + } + bool operator!=(const ElementCount& RHS) const { + return !(*this == RHS); + } +}; + +// This class is used to represent the size of types. If the type is of fixed +// size, it will represent the exact size. If the type is a scalable vector, +// it will represent the known minimum size. +class TypeSize { + uint64_t MinSize; // The known minimum size. + bool IsScalable; // If true, then the runtime size is an integer multiple + // of MinSize. 
+ +public: + constexpr TypeSize(uint64_t MinSize, bool Scalable) + : MinSize(MinSize), IsScalable(Scalable) {} + + static constexpr TypeSize Fixed(uint64_t Size) { + return TypeSize(Size, /*IsScalable=*/false); + } + + static constexpr TypeSize Scalable(uint64_t MinSize) { + return TypeSize(MinSize, /*IsScalable=*/true); + } + + // Scalable vector types with the same minimum size as a fixed size type are + // not guaranteed to be the same size at runtime, so they are never + // considered to be equal. + friend bool operator==(const TypeSize &LHS, const TypeSize &RHS) { + return std::tie(LHS.MinSize, LHS.IsScalable) == + std::tie(RHS.MinSize, RHS.IsScalable); + } + + friend bool operator!=(const TypeSize &LHS, const TypeSize &RHS) { + return !(LHS == RHS); + } + + // For many cases, size ordering between scalable and fixed size types cannot + // be determined at compile time, so such comparisons aren't allowed. + // + // e.g. could be bigger than <4 x i32> with a runtime + // vscale >= 5, equal sized with a vscale of 4, and smaller with + // a vscale <= 3. + // + // If the scalable flags match, just perform the requested comparison + // between the minimum sizes. + friend bool operator<(const TypeSize &LHS, const TypeSize &RHS) { + assert(LHS.IsScalable == RHS.IsScalable && + "Ordering comparison of scalable and fixed types"); + + return LHS.MinSize < RHS.MinSize; + } + + friend bool operator>(const TypeSize &LHS, const TypeSize &RHS) { + return RHS < LHS; + } + + friend bool operator<=(const TypeSize &LHS, const TypeSize &RHS) { + return !(RHS < LHS); + } + + friend bool operator>=(const TypeSize &LHS, const TypeSize& RHS) { + return !(LHS < RHS); + } + + // Convenience operators to obtain relative sizes independently of + // the scalable flag. + TypeSize operator*(unsigned RHS) const { + return { MinSize * RHS, IsScalable }; + } + + friend TypeSize operator*(const unsigned LHS, const TypeSize &RHS) { + return { LHS * RHS.MinSize, RHS.IsScalable }; + } + + TypeSize operator/(unsigned RHS) const { + return { MinSize / RHS, IsScalable }; + } + + // Return the minimum size with the assumption that the size is exact. + // Use in places where a scalable size doesn't make sense (e.g. non-vector + // types, or vectors in backends which don't support scalable vectors). + uint64_t getFixedSize() const { + assert(!IsScalable && "Request for a fixed size on a scalable object"); + return MinSize; + } + + // Return the known minimum size. Use in places where the scalable property + // doesn't matter (e.g. determining alignment) or in conjunction with the + // isScalable method below. + uint64_t getKnownMinSize() const { + return MinSize; + } + + // Return whether or not the size is scalable. + bool isScalable() const { + return IsScalable; + } + + // Casts to a uint64_t if this is a fixed-width size. + // + // NOTE: This interface is obsolete and will be removed in a future version + // of LLVM in favour of calling getFixedSize() directly. + operator uint64_t() const { + return getFixedSize(); + } + + // Additional convenience operators needed to avoid ambiguous parses. + // TODO: Make uint64_t the default operator? 
+ TypeSize operator*(uint64_t RHS) const { + return { MinSize * RHS, IsScalable }; + } + + TypeSize operator*(int RHS) const { + return { MinSize * RHS, IsScalable }; + } + + TypeSize operator*(int64_t RHS) const { + return { MinSize * RHS, IsScalable }; + } + + friend TypeSize operator*(const uint64_t LHS, const TypeSize &RHS) { + return { LHS * RHS.MinSize, RHS.IsScalable }; + } + + friend TypeSize operator*(const int LHS, const TypeSize &RHS) { + return { LHS * RHS.MinSize, RHS.IsScalable }; + } + + friend TypeSize operator*(const int64_t LHS, const TypeSize &RHS) { + return { LHS * RHS.MinSize, RHS.IsScalable }; + } + + TypeSize operator/(uint64_t RHS) const { + return { MinSize / RHS, IsScalable }; + } + + TypeSize operator/(int RHS) const { + return { MinSize / RHS, IsScalable }; + } + + TypeSize operator/(int64_t RHS) const { + return { MinSize / RHS, IsScalable }; + } +}; + +/// Returns a TypeSize with a known minimum size that is the next integer +/// (mod 2**64) that is greater than or equal to \p Value and is a multiple +/// of \p Align. \p Align must be non-zero. +/// +/// Similar to the alignTo functions in MathExtras.h +inline TypeSize alignTo(TypeSize Size, uint64_t Align) { + assert(Align != 0u && "Align must be non-zero"); + return {(Size.getKnownMinSize() + Align - 1) / Align * Align, + Size.isScalable()}; +} + +} // end namespace llvm + +#endif // LLVM_SUPPORT_TypeSize_H diff --git a/include/llvm/Support/UnicodeCharRanges.h b/include/llvm/Support/UnicodeCharRanges.h index 4b59f8a92b76..73d3603b74df 100644 --- a/include/llvm/Support/UnicodeCharRanges.h +++ b/include/llvm/Support/UnicodeCharRanges.h @@ -9,11 +9,8 @@ #define LLVM_SUPPORT_UNICODECHARRANGES_H #include "llvm/ADT/ArrayRef.h" -#include "llvm/ADT/SmallPtrSet.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" -#include "llvm/Support/Mutex.h" -#include "llvm/Support/MutexGuard.h" #include "llvm/Support/raw_ostream.h" #include diff --git a/include/llvm/Support/UniqueLock.h b/include/llvm/Support/UniqueLock.h deleted file mode 100644 index 0a887ad5965d..000000000000 --- a/include/llvm/Support/UniqueLock.h +++ /dev/null @@ -1,68 +0,0 @@ -//===- Support/UniqueLock.h - Acquire/Release Mutex In Scope ----*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file defines a guard for a block of code that ensures a Mutex is locked -// upon construction and released upon destruction. -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_SUPPORT_UNIQUE_LOCK_H -#define LLVM_SUPPORT_UNIQUE_LOCK_H - -#include - -namespace llvm { - - /// A pared-down imitation of std::unique_lock from C++11. Contrary to the - /// name, it's really more of a wrapper for a lock. It may or may not have - /// an associated mutex, which is guaranteed to be locked upon creation - /// and unlocked after destruction. unique_lock can also unlock the mutex - /// and re-lock it freely during its lifetime. - /// Guard a section of code with a mutex. 
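// Editor's note (illustrative sketch, not part of the imported patch): the new
// TypeSize keeps fixed and scalable sizes distinct; equality across different
// scalability is always false, and ordering is only defined when the
// scalability matches.
#include "llvm/Support/TypeSize.h"
#include <cstdint>

inline void typeSizeDemo() {
  llvm::TypeSize FixedV4I32 = llvm::TypeSize::Fixed(128);    // e.g. <4 x i32>
  llvm::TypeSize ScalV4I32 = llvm::TypeSize::Scalable(128);  // <vscale x 4 x i32>

  bool Same = (FixedV4I32 == ScalV4I32);          // false: scalability differs
  uint64_t Bits = FixedV4I32.getFixedSize();      // 128
  uint64_t MinBits = ScalV4I32.getKnownMinSize(); // 128, scaled at runtime
  llvm::TypeSize Doubled = ScalV4I32 * 2;         // still scalable, min 256
  (void)Same; (void)Bits; (void)MinBits; (void)Doubled;
}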
- template - class unique_lock { - MutexT *M = nullptr; - bool locked = false; - - public: - unique_lock() = default; - explicit unique_lock(MutexT &m) : M(&m), locked(true) { M->lock(); } - unique_lock(const unique_lock &) = delete; - unique_lock &operator=(const unique_lock &) = delete; - - void operator=(unique_lock &&o) { - if (owns_lock()) - M->unlock(); - M = o.M; - locked = o.locked; - o.M = nullptr; - o.locked = false; - } - - ~unique_lock() { if (owns_lock()) M->unlock(); } - - void lock() { - assert(!locked && "mutex already locked!"); - assert(M && "no associated mutex!"); - M->lock(); - locked = true; - } - - void unlock() { - assert(locked && "unlocking a mutex that isn't locked!"); - assert(M && "no associated mutex!"); - M->unlock(); - locked = false; - } - - bool owns_lock() { return locked; } - }; - -} // end namespace llvm - -#endif // LLVM_SUPPORT_UNIQUE_LOCK_H diff --git a/include/llvm/Support/VirtualFileSystem.h b/include/llvm/Support/VirtualFileSystem.h index 31c9e851daed..c844d9d194f0 100644 --- a/include/llvm/Support/VirtualFileSystem.h +++ b/include/llvm/Support/VirtualFileSystem.h @@ -647,9 +647,19 @@ private: friend class VFSFromYamlDirIterImpl; friend class RedirectingFileSystemParser; + bool shouldUseExternalFS() const { + return ExternalFSValidWD && IsFallthrough; + } + /// The root(s) of the virtual file system. std::vector> Roots; + /// The current working directory of the file system. + std::string WorkingDirectory; + + /// Whether the current working directory is valid for the external FS. + bool ExternalFSValidWD = false; + /// The file system to use for external references. IntrusiveRefCntPtr ExternalFS; @@ -689,8 +699,7 @@ private: true; #endif - RedirectingFileSystem(IntrusiveRefCntPtr ExternalFS) - : ExternalFS(std::move(ExternalFS)) {} + RedirectingFileSystem(IntrusiveRefCntPtr ExternalFS); /// Looks up the path [Start, End) in \p From, possibly /// recursing into the contents of \p From if it is a directory. @@ -730,9 +739,10 @@ public: StringRef getExternalContentsPrefixDir() const; + void dump(raw_ostream &OS) const; + void dumpEntry(raw_ostream &OS, Entry *E, int NumSpaces = 0) const; #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) LLVM_DUMP_METHOD void dump() const; - LLVM_DUMP_METHOD void dumpEntry(Entry *E, int NumSpaces = 0) const; #endif }; diff --git a/include/llvm/Support/Win64EH.h b/include/llvm/Support/Win64EH.h index bdd23b41594e..8220131e5be9 100644 --- a/include/llvm/Support/Win64EH.h +++ b/include/llvm/Support/Win64EH.h @@ -30,7 +30,9 @@ enum UnwindOpcodes { UOP_SetFPReg, UOP_SaveNonVol, UOP_SaveNonVolBig, - UOP_SaveXMM128 = 8, + UOP_Epilog, + UOP_SpareCode, + UOP_SaveXMM128, UOP_SaveXMM128Big, UOP_PushMachFrame, // The following set of unwind opcodes is for ARM64. 
They are documented at diff --git a/include/llvm/Support/X86TargetParser.def b/include/llvm/Support/X86TargetParser.def index 1749be3b3ae2..4ebf2d79cb8d 100644 --- a/include/llvm/Support/X86TargetParser.def +++ b/include/llvm/Support/X86TargetParser.def @@ -112,6 +112,7 @@ X86_CPU_SUBTYPE ("k6-2", AMDPENTIUM_K62) X86_CPU_SUBTYPE ("k6-3", AMDPENTIUM_K63) X86_CPU_SUBTYPE ("geode", AMDPENTIUM_GEODE) X86_CPU_SUBTYPE ("cooperlake", INTEL_COREI7_COOPERLAKE) +X86_CPU_SUBTYPE ("tigerlake", INTEL_COREI7_TIGERLAKE) #undef X86_CPU_SUBTYPE_COMPAT #undef X86_CPU_SUBTYPE @@ -160,12 +161,13 @@ X86_FEATURE_COMPAT(32, FEATURE_GFNI, "gfni") X86_FEATURE_COMPAT(33, FEATURE_VPCLMULQDQ, "vpclmulqdq") X86_FEATURE_COMPAT(34, FEATURE_AVX512VNNI, "avx512vnni") X86_FEATURE_COMPAT(35, FEATURE_AVX512BITALG, "avx512bitalg") +X86_FEATURE_COMPAT(36, FEATURE_AVX512BF16, "avx512bf16") // Features below here are not in libgcc/compiler-rt. X86_FEATURE (64, FEATURE_MOVBE) X86_FEATURE (65, FEATURE_ADX) X86_FEATURE (66, FEATURE_EM64T) X86_FEATURE (67, FEATURE_CLFLUSHOPT) X86_FEATURE (68, FEATURE_SHA) -X86_FEATURE (69, FEATURE_AVX512BF16) +X86_FEATURE (69, FEATURE_AVX512VP2INTERSECT) #undef X86_FEATURE_COMPAT #undef X86_FEATURE diff --git a/include/llvm/Support/YAMLTraits.h b/include/llvm/Support/YAMLTraits.h index 5181dc56d81d..a3bfa7dc4678 100644 --- a/include/llvm/Support/YAMLTraits.h +++ b/include/llvm/Support/YAMLTraits.h @@ -649,7 +649,8 @@ inline bool isBool(StringRef S) { inline QuotingType needsQuotes(StringRef S) { if (S.empty()) return QuotingType::Single; - if (isspace(S.front()) || isspace(S.back())) + if (isspace(static_cast(S.front())) || + isspace(static_cast(S.back()))) return QuotingType::Single; if (isNull(S)) return QuotingType::Single; @@ -748,7 +749,7 @@ public: IO(void *Ctxt = nullptr); virtual ~IO(); - virtual bool outputting() = 0; + virtual bool outputting() const = 0; virtual unsigned beginSequence() = 0; virtual bool preflightElement(unsigned, void *&) = 0; @@ -842,7 +843,7 @@ public: Val = Val | ConstVal; } - void *getContext(); + void *getContext() const; void setContext(void *); template void mapRequired(const char *Key, T &Val) { @@ -1402,7 +1403,7 @@ public: std::error_code error(); private: - bool outputting() override; + bool outputting() const override; bool mapTag(StringRef, bool) override; void beginMapping() override; void endMapping() override; @@ -1549,7 +1550,7 @@ public: /// anyway. void setWriteDefaultValues(bool Write) { WriteDefaultValues = Write; } - bool outputting() override; + bool outputting() const override; bool mapTag(StringRef, bool) override; void beginMapping() override; void endMapping() override; diff --git a/include/llvm/Support/circular_raw_ostream.h b/include/llvm/Support/circular_raw_ostream.h index 4ecdb17376f1..a72acd4fe002 100644 --- a/include/llvm/Support/circular_raw_ostream.h +++ b/include/llvm/Support/circular_raw_ostream.h @@ -122,6 +122,10 @@ namespace llvm { delete[] BufferArray; } + bool is_displayed() const override { + return TheStream->is_displayed(); + } + /// setStream - Tell the circular_raw_ostream to output a /// different stream. 
"Owns" tells circular_raw_ostream whether /// it should take responsibility for managing the underlying diff --git a/include/llvm/Support/raw_ostream.h b/include/llvm/Support/raw_ostream.h index 48bb623b0638..0debc5da7a68 100644 --- a/include/llvm/Support/raw_ostream.h +++ b/include/llvm/Support/raw_ostream.h @@ -72,7 +72,7 @@ private: public: // color order matches ANSI escape sequence, don't change - enum Colors { + enum class Colors { BLACK = 0, RED, GREEN, @@ -81,9 +81,21 @@ public: MAGENTA, CYAN, WHITE, - SAVEDCOLOR + SAVEDCOLOR, + RESET, }; + static const Colors BLACK = Colors::BLACK; + static const Colors RED = Colors::RED; + static const Colors GREEN = Colors::GREEN; + static const Colors YELLOW = Colors::YELLOW; + static const Colors BLUE = Colors::BLUE; + static const Colors MAGENTA = Colors::MAGENTA; + static const Colors CYAN = Colors::CYAN; + static const Colors WHITE = Colors::WHITE; + static const Colors SAVEDCOLOR = Colors::SAVEDCOLOR; + static const Colors RESET = Colors::RESET; + explicit raw_ostream(bool unbuffered = false) : BufferMode(unbuffered ? Unbuffered : InternalBuffer) { // Start out ready to flush. @@ -214,6 +226,9 @@ public: /// Output \p N in hexadecimal, without any prefix or padding. raw_ostream &write_hex(unsigned long long N); + // Change the foreground color of text. + raw_ostream &operator<<(Colors C); + /// Output a formatted UUID with dash separators. using uuid_t = uint8_t[16]; raw_ostream &write_uuid(const uuid_t UUID); @@ -277,6 +292,10 @@ public: /// This function determines if this stream is displayed and supports colors. virtual bool has_colors() const { return is_displayed(); } + // Enable or disable colors. Once disable_colors() is called, + // changeColor() has no effect until enable_colors() is called. + virtual void enable_colors(bool /*enable*/) {} + //===--------------------------------------------------------------------===// // Subclass Interface //===--------------------------------------------------------------------===// @@ -365,8 +384,8 @@ public: class raw_fd_ostream : public raw_pwrite_stream { int FD; bool ShouldClose; - bool SupportsSeeking; + bool ColorEnabled = true; #ifdef _WIN32 /// True if this fd refers to a Windows console device. Mintty and other @@ -442,6 +461,8 @@ public: bool has_colors() const override; + void enable_colors(bool enable) override { ColorEnabled = enable; } + std::error_code error() const { return EC; } /// Return the value of the flag in this raw_fd_ostream indicating whether an diff --git a/include/llvm/Support/type_traits.h b/include/llvm/Support/type_traits.h index c8c6a76a90f1..b7d48e8e1ade 100644 --- a/include/llvm/Support/type_traits.h +++ b/include/llvm/Support/type_traits.h @@ -17,11 +17,6 @@ #include #include -#ifndef __has_feature -#define LLVM_DEFINED_HAS_FEATURE -#define __has_feature(x) 0 -#endif - namespace llvm { @@ -194,17 +189,4 @@ class is_trivially_copyable : public std::true_type { } // end namespace llvm -// If the compiler supports detecting whether a class is final, define -// an LLVM_IS_FINAL macro. If it cannot be defined properly, this -// macro will be left undefined. 
-#if __cplusplus >= 201402L || defined(_MSC_VER) -#define LLVM_IS_FINAL(Ty) std::is_final() -#elif __has_feature(is_final) || LLVM_GNUC_PREREQ(4, 7, 0) -#define LLVM_IS_FINAL(Ty) __is_final(Ty) -#endif - -#ifdef LLVM_DEFINED_HAS_FEATURE -#undef __has_feature -#endif - #endif // LLVM_SUPPORT_TYPE_TRAITS_H diff --git a/include/llvm/TableGen/Automaton.td b/include/llvm/TableGen/Automaton.td new file mode 100644 index 000000000000..13ced2a0e784 --- /dev/null +++ b/include/llvm/TableGen/Automaton.td @@ -0,0 +1,95 @@ +//===- Automaton.td ----------------------------------------*- tablegen -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines the key top-level classes needed to produce a reasonably +// generic finite-state automaton. +// +//===----------------------------------------------------------------------===// + +// Define a record inheriting from GenericAutomaton to generate a reasonably +// generic finite-state automaton over a set of actions and states. +// +// This automaton is defined by: +// 1) a state space (explicit, always bits<32>). +// 2) a set of input symbols (actions, explicit) and +// 3) a transition function from state + action -> state. +// +// A theoretical automaton is defined by : +// Q: A set of possible states. +// S: (sigma) The input alphabet. +// d: (delta) The transition function f(q in Q, s in S) -> q' in Q. +// F: The set of final (accepting) states. +// +// Because generating all possible states is tedious, we instead define the +// transition function only and crawl all reachable states starting from the +// initial state with all inputs under all transitions until termination. +// +// We define F = S, that is, all valid states are accepting. +// +// To ensure the generation of the automaton terminates, the state transitions +// are defined as a lattice (meaning every transitioned-to state is more +// specific than the transitioned-from state, for some definition of specificity). +// Concretely a transition may set one or more bits in the state that were +// previously zero to one. If any bit was not zero, the transition is invalid. +// +// Instead of defining all possible states (which would be cumbersome), the user +// provides a set of possible Transitions from state A, consuming an input +// symbol A to state B. The Transition object transforms state A to state B and +// acts as a predicate. This means the state space can be discovered by crawling +// all the possible transitions until none are valid. +// +// This automaton is considered to be nondeterministic, meaning that multiple +// transitions can occur from any (state, action) pair. The generated automaton +// is determinized, meaning that is executes in O(k) time where k is the input +// sequence length. +// +// In addition to a generated automaton that determines if a sequence of inputs +// is accepted or not, a table is emitted that allows determining a plausible +// sequence of states traversed to accept that input. +class GenericAutomaton { + // Name of a class that inherits from Transition. All records inheriting from + // this class will be considered when constructing the automaton. + string TransitionClass; + + // Names of fields within TransitionClass that define the action symbol. 
This + // defines the action as an N-tuple. + // + // Each symbol field can be of class, int, string or code type. + // If the type of a field is a class, the Record's name is used verbatim + // in C++ and the class name is used as the C++ type name. + // If the type of a field is a string, code or int, that is also used + // verbatim in C++. + // + // To override the C++ type name for field F, define a field called TypeOf_F. + // This should be a string that will be used verbatim in C++. + // + // As an example, to define a 2-tuple with an enum and a string, one might: + // def MyTransition : Transition { + // MyEnum S1; + // int S2; + // } + // def MyAutomaton : GenericAutomaton }{ + // let TransitionClass = "Transition"; + // let SymbolFields = ["S1", "S2"]; + // let TypeOf_S1 = "MyEnumInCxxKind"; + // } + list SymbolFields; +} + +// All transitions inherit from Transition. +class Transition { + // A transition S' = T(S) is valid if, for every set bit in NewState, the + // corresponding bit in S is clear. That is: + // def T(S): + // S' = S | NewState + // return S' if S' != S else Failure + // + // The automaton generator uses this property to crawl the set of possible + // transitions from a starting state of 0b0. + bits<32> NewState; +} diff --git a/include/llvm/TableGen/Error.h b/include/llvm/TableGen/Error.h index 7c83b6298620..cf990427f577 100644 --- a/include/llvm/TableGen/Error.h +++ b/include/llvm/TableGen/Error.h @@ -18,6 +18,7 @@ namespace llvm { +void PrintNote(const Twine &Msg); void PrintNote(ArrayRef NoteLoc, const Twine &Msg); void PrintWarning(ArrayRef WarningLoc, const Twine &Msg); diff --git a/include/llvm/TableGen/Record.h b/include/llvm/TableGen/Record.h index bf7f02208c28..73ed342a6101 100644 --- a/include/llvm/TableGen/Record.h +++ b/include/llvm/TableGen/Record.h @@ -1263,7 +1263,14 @@ class FieldInit : public TypedInit { FieldInit(Init *R, StringInit *FN) : TypedInit(IK_FieldInit, R->getFieldType(FN)), Rec(R), FieldName(FN) { - assert(getType() && "FieldInit with non-record type!"); +#ifndef NDEBUG + if (!getType()) { + llvm::errs() << "In Record = " << Rec->getAsString() + << ", got FieldName = " << *FieldName + << " with non-record type!\n"; + llvm_unreachable("FieldInit with non-record type!"); + } +#endif } public: @@ -1323,6 +1330,7 @@ public: void Profile(FoldingSetNodeID &ID) const; Init *getOperator() const { return Val; } + Record *getOperatorAsDef(ArrayRef Loc) const; StringInit *getName() const { return ValName; } @@ -1680,10 +1688,10 @@ raw_ostream &operator<<(raw_ostream &OS, const Record &R); class RecordKeeper { friend class RecordRecTy; - using RecordMap = std::map>; + using RecordMap = std::map, std::less<>>; RecordMap Classes, Defs; FoldingSet RecordTypePool; - std::map ExtraGlobals; + std::map> ExtraGlobals; unsigned AnonCounter = 0; public: diff --git a/include/llvm/Target/GenericOpcodes.td b/include/llvm/Target/GenericOpcodes.td index 45718327b4a7..4b49dfd4dd18 100644 --- a/include/llvm/Target/GenericOpcodes.td +++ b/include/llvm/Target/GenericOpcodes.td @@ -15,7 +15,9 @@ // Unary ops. //------------------------------------------------------------------------------ -class GenericInstruction : StandardPseudoInstruction; +class GenericInstruction : StandardPseudoInstruction { + let isPreISelOpcode = 1; +} // Extend the underlying scalar type of an operation, leaving the high bits // unspecified. 
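// Editor's note (illustrative sketch, not part of the imported patch): the
// lattice rule described in the Automaton.td comments above ("S' = S | NewState,
// fail if S' == S") can be expressed directly over the 32-bit state word.
#include <cstdint>

inline bool applyTransition(uint32_t State, uint32_t NewState, uint32_t &Result) {
  uint32_t Next = State | NewState;
  if (Next == State)
    return false; // no previously-clear bit was set: transition is invalid
  Result = Next;  // strictly more specific state, so crawling terminates
  return true;
}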
@@ -33,6 +35,20 @@ def G_SEXT : GenericInstruction { let hasSideEffects = 0; } +// Sign extend the a value from an arbitrary bit position, copying the sign bit +// into all bits above it. This is equivalent to a shl + ashr pair with an +// appropriate shift amount. $sz is an immediate (MachineOperand::isImm() +// returns true) to allow targets to have some bitwidths legal and others +// lowered. This opcode is particularly useful if the target has sign-extension +// instructions that are cheaper than the constituent shifts as the optimizer is +// able to make decisions on whether it's better to hang on to the G_SEXT_INREG +// or to lower it and optimize the individual shifts. +def G_SEXT_INREG : GenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type0:$src, untyped_imm_0:$sz); + let hasSideEffects = 0; +} + // Zero extend the underlying scalar type of an operation, putting zero bits // into the newly-created space. def G_ZEXT : GenericInstruction { @@ -157,6 +173,12 @@ def G_BSWAP : GenericInstruction { let hasSideEffects = 0; } +def G_BITREVERSE : GenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type0:$src); + let hasSideEffects = 0; +} + def G_ADDRSPACE_CAST : GenericInstruction { let OutOperandList = (outs type0:$dst); let InOperandList = (ins type1:$src); @@ -175,6 +197,12 @@ def G_JUMP_TABLE : GenericInstruction { let hasSideEffects = 0; } +def G_DYN_STACKALLOC : GenericInstruction { + let OutOperandList = (outs ptype0:$dst); + let InOperandList = (ins type1:$size, i32imm:$align); + let hasSideEffects = 1; +} + //------------------------------------------------------------------------------ // Binary ops. //------------------------------------------------------------------------------ @@ -598,6 +626,15 @@ def G_FMA : GenericInstruction { let isCommutable = 0; } +/// Generic FP multiply and add. Perform a * b + c, while getting the +/// same result as the separately rounded operations, unlike G_FMA. +def G_FMAD : GenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type0:$src1, type0:$src2, type0:$src3); + let hasSideEffects = 0; + let isCommutable = 0; +} + // Generic FP division. def G_FDIV : GenericInstruction { let OutOperandList = (outs type0:$dst); @@ -725,7 +762,11 @@ def G_INTRINSIC_ROUND : GenericInstruction { // Memory ops //------------------------------------------------------------------------------ -// Generic load. Expects a MachineMemOperand in addition to explicit operands. +// Generic load. Expects a MachineMemOperand in addition to explicit +// operands. If the result size is larger than the memory size, the +// high bits are undefined. If the result is a vector type and larger +// than the memory size, the high elements are undefined (i.e. this is +// not a per-element, vector anyextload) def G_LOAD : GenericInstruction { let OutOperandList = (outs type0:$dst); let InOperandList = (ins ptype1:$addr); @@ -749,6 +790,32 @@ def G_ZEXTLOAD : GenericInstruction { let mayLoad = 1; } +// Generic indexed load. Combines a GEP with a load. $newaddr is set to $base + $offset. +// If $am is 0 (post-indexed), then the value is loaded from $base; if $am is 1 (pre-indexed) +// then the value is loaded from $newaddr. 
+def G_INDEXED_LOAD : GenericInstruction { + let OutOperandList = (outs type0:$dst, ptype1:$newaddr); + let InOperandList = (ins ptype1:$base, type2:$offset, unknown:$am); + let hasSideEffects = 0; + let mayLoad = 1; +} + +// Same as G_INDEXED_LOAD except that the load performed is sign-extending, as with G_SEXTLOAD. +def G_INDEXED_SEXTLOAD : GenericInstruction { + let OutOperandList = (outs type0:$dst, ptype1:$newaddr); + let InOperandList = (ins ptype1:$base, type2:$offset, unknown:$am); + let hasSideEffects = 0; + let mayLoad = 1; +} + +// Same as G_INDEXED_LOAD except that the load performed is zero-extending, as with G_ZEXTLOAD. +def G_INDEXED_ZEXTLOAD : GenericInstruction { + let OutOperandList = (outs type0:$dst, ptype1:$newaddr); + let InOperandList = (ins ptype1:$base, type2:$offset, unknown:$am); + let hasSideEffects = 0; + let mayLoad = 1; +} + // Generic store. Expects a MachineMemOperand in addition to explicit operands. def G_STORE : GenericInstruction { let OutOperandList = (outs); @@ -757,6 +824,15 @@ def G_STORE : GenericInstruction { let mayStore = 1; } +// Combines a store with a GEP. See description of G_INDEXED_LOAD for indexing behaviour. +def G_INDEXED_STORE : GenericInstruction { + let OutOperandList = (outs ptype0:$newaddr); + let InOperandList = (ins type1:$src, ptype0:$base, ptype2:$offset, + unknown:$am); + let hasSideEffects = 0; + let mayStore = 1; +} + // Generic atomic cmpxchg with internal success check. Expects a // MachineMemOperand in addition to explicit operands. def G_ATOMIC_CMPXCHG_WITH_SUCCESS : GenericInstruction { @@ -798,6 +874,8 @@ def G_ATOMICRMW_MAX : G_ATOMICRMW_OP; def G_ATOMICRMW_MIN : G_ATOMICRMW_OP; def G_ATOMICRMW_UMAX : G_ATOMICRMW_OP; def G_ATOMICRMW_UMIN : G_ATOMICRMW_OP; +def G_ATOMICRMW_FADD : G_ATOMICRMW_OP; +def G_ATOMICRMW_FSUB : G_ATOMICRMW_OP; def G_FENCE : GenericInstruction { let OutOperandList = (outs); @@ -947,9 +1025,12 @@ def G_EXTRACT_VECTOR_ELT : GenericInstruction { } // Generic shufflevector. +// +// The mask operand should be an IR Constant which exactly matches the +// corresponding mask for the IR shufflevector instruction. def G_SHUFFLE_VECTOR: GenericInstruction { let OutOperandList = (outs type0:$dst); - let InOperandList = (ins type1:$v1, type1:$v2, type2:$mask); + let InOperandList = (ins type1:$v1, type1:$v2, unknown:$mask); let hasSideEffects = 0; } diff --git a/include/llvm/Target/GlobalISel/Combine.td b/include/llvm/Target/GlobalISel/Combine.td new file mode 100644 index 000000000000..dcac399fd693 --- /dev/null +++ b/include/llvm/Target/GlobalISel/Combine.td @@ -0,0 +1,103 @@ +//===- Combine.td - Combine rule definitions ---------------*- tablegen -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Declare GlobalISel combine rules and provide mechanisms to opt-out. +// +//===----------------------------------------------------------------------===// + +// Common base class for GICombineRule and GICombineGroup. +class GICombine { + // See GICombineGroup. We only declare it here to make the tablegen pass + // simpler. + list Rules = ?; +} + +// A group of combine rules that can be added to a GICombiner or another group. +class GICombineGroup rules> : GICombine { + // The rules contained in this group. 
The rules in a group are flattened into + // a single list and sorted into whatever order is most efficient. However, + // they will never be re-ordered such that behaviour differs from the + // specified order. It is therefore possible to use the order of rules in this + // list to describe priorities. + let Rules = rules; +} + +// Declares a combiner helper class +class GICombinerHelper rules> + : GICombineGroup { + // The class name to use in the generated output. + string Classname = classname; + // The name of a run-time compiler option that will be generated to disable + // specific rules within this combiner. + string DisableRuleOption = ?; +} +class GICombineRule : GICombine { + /// Defines the external interface of the match rule. This includes: + /// * The names of the root nodes (requires at least one) + /// See GIDefKind for details. + dag Defs = defs; + + /// Defines the things which must be true for the pattern to match + /// See GIMatchKind for details. + dag Match = match; + + /// Defines the things which happen after the decision is made to apply a + /// combine rule. + /// See GIApplyKind for details. + dag Apply = apply; +} + +/// The operator at the root of a GICombineRule.Defs dag. +def defs; + +/// All arguments of the defs operator must be subclasses of GIDefKind or +/// sub-dags whose operator is GIDefKindWithArgs. +class GIDefKind; +class GIDefKindWithArgs; +/// Declare a root node. There must be at least one of these in every combine +/// rule. +/// TODO: The plan is to elide `root` definitions and determine it from the DAG +/// itself with an overide for situations where the usual determination +/// is incorrect. +def root : GIDefKind; + +/// The operator at the root of a GICombineRule.Match dag. +def match; +/// All arguments of the match operator must be either: +/// * A subclass of GIMatchKind +/// * A subclass of GIMatchKindWithArgs +/// * A MIR code block (deprecated) +/// The GIMatchKind and GIMatchKindWithArgs cases are described in more detail +/// in their definitions below. +/// For the Instruction case, these are collected into a DAG where operand names +/// that occur multiple times introduce edges. +class GIMatchKind; +class GIMatchKindWithArgs; + +/// The operator at the root of a GICombineRule.Apply dag. +def apply; +/// All arguments of the apply operator must be subclasses of GIApplyKind, or +/// sub-dags whose operator is GIApplyKindWithArgs, or an MIR block +/// (deprecated). +class GIApplyKind; +class GIApplyKindWithArgs; + +def copy_prop : GICombineRule< + (defs root:$d), + (match [{ return Helper.matchCombineCopy(${d}); }]), + (apply [{ Helper.applyCombineCopy(${d}); }])>; +def trivial_combines : GICombineGroup<[copy_prop]>; + +// FIXME: Is there a reason this wasn't in tryCombine? I've left it out of +// all_combines because it wasn't there. +def elide_br_by_inverting_cond : GICombineRule< + (defs root:$d), + (match [{ return Helper.matchElideBrByInvertingCond(${d}); }]), + (apply [{ Helper.applyElideBrByInvertingCond(${d}); }])>; + +def all_combines : GICombineGroup<[trivial_combines]>; diff --git a/include/llvm/Target/GlobalISel/SelectionDAGCompat.td b/include/llvm/Target/GlobalISel/SelectionDAGCompat.td index 6cc58d6521da..b846d2252b8d 100644 --- a/include/llvm/Target/GlobalISel/SelectionDAGCompat.td +++ b/include/llvm/Target/GlobalISel/SelectionDAGCompat.td @@ -27,6 +27,7 @@ class GINodeEquiv { // (ISD::LOAD, ISD::ATOMIC_LOAD, ISD::STORE, ISD::ATOMIC_STORE) but GlobalISel // stores this information in the MachineMemoryOperand. 
bit CheckMMOIsNonAtomic = 0; + bit CheckMMOIsAtomic = 0; // SelectionDAG has one node for all loads and uses predicates to // differentiate them. GlobalISel on the other hand uses separate opcodes. @@ -34,6 +35,10 @@ class GINodeEquiv { // depending on the predicates on the node. Instruction IfSignExtend = ?; Instruction IfZeroExtend = ?; + + // SelectionDAG has one setcc for all compares. This differentiates + // for G_ICMP and G_FCMP. + Instruction IfFloatingPoint = ?; } // These are defined in the same order as the G_* instructions. @@ -46,6 +51,7 @@ def : GINodeEquiv; // G_PTRTOINT - SelectionDAG has no equivalent. def : GINodeEquiv; def : GINodeEquiv; +def : GINodeEquiv; def : GINodeEquiv; def : GINodeEquiv; def : GINodeEquiv; @@ -72,6 +78,7 @@ def : GINodeEquiv; def : GINodeEquiv; def : GINodeEquiv; def : GINodeEquiv; +def : GINodeEquiv; def : GINodeEquiv; def : GINodeEquiv; def : GINodeEquiv; @@ -85,6 +92,7 @@ def : GINodeEquiv; def : GINodeEquiv; def : GINodeEquiv; def : GINodeEquiv; +def : GINodeEquiv; def : GINodeEquiv; def : GINodeEquiv; def : GINodeEquiv; @@ -100,10 +108,15 @@ def : GINodeEquiv; def : GINodeEquiv; def : GINodeEquiv; def : GINodeEquiv; +def : GINodeEquiv; def : GINodeEquiv; def : GINodeEquiv; def : GINodeEquiv; def : GINodeEquiv; +def : GINodeEquiv; +def : GINodeEquiv; +def : GINodeEquiv; +def : GINodeEquiv; // Broadly speaking G_LOAD is equivalent to ISD::LOAD but there are some // complications that tablegen must take care of. For example, Predicates such @@ -117,6 +130,11 @@ def : GINodeEquiv { let IfSignExtend = G_SEXTLOAD; let IfZeroExtend = G_ZEXTLOAD; } + +def : GINodeEquiv { + let IfFloatingPoint = G_FCMP; +} + // Broadly speaking G_STORE is equivalent to ISD::STORE but there are some // complications that tablegen must take care of. For example, predicates such // as isTruncStore require that this is not a perfect 1:1 mapping since a @@ -126,6 +144,11 @@ def : GINodeEquiv { // G_STORE with a non-atomic MachineMemOperand. def : GINodeEquiv { let CheckMMOIsNonAtomic = 1; } +def : GINodeEquiv { + let CheckMMOIsNonAtomic = 0; + let CheckMMOIsAtomic = 1; +} + def : GINodeEquiv; def : GINodeEquiv; def : GINodeEquiv; @@ -138,6 +161,8 @@ def : GINodeEquiv; def : GINodeEquiv; def : GINodeEquiv; def : GINodeEquiv; +def : GINodeEquiv; +def : GINodeEquiv; def : GINodeEquiv; // Specifies the GlobalISel equivalents for SelectionDAG's ComplexPattern. diff --git a/include/llvm/Target/Target.td b/include/llvm/Target/Target.td index d58662e128e0..dd8679661b9a 100644 --- a/include/llvm/Target/Target.td +++ b/include/llvm/Target/Target.td @@ -351,7 +351,11 @@ def interleave; // RegisterTuples instances can be used in other set operations to form // register classes and so on. This is the only way of using the generated // registers. -class RegisterTuples Indices, list Regs> { +// +// RegNames may be specified to supply asm names for the generated tuples. +// If used must have the same size as the list of produced registers. +class RegisterTuples Indices, list Regs, + list RegNames = []> { // SubRegs - N lists of registers to be zipped up. Super-registers are // synthesized from the first element of each SubRegs list, the second // element and so on. @@ -360,6 +364,9 @@ class RegisterTuples Indices, list Regs> { // SubRegIndices - N SubRegIndex instances. This provides the names of the // sub-registers in the synthesized super-registers. list SubRegIndices = Indices; + + // List of asm names for the generated tuple registers. 
+ list RegAsmNames = RegNames; } @@ -436,6 +443,15 @@ class InstructionEncoding { bit hasCompleteDecoder = 1; } +// Allows specifying an InstructionEncoding by HwMode. If an Instruction specifies +// an EncodingByHwMode, its Inst and Size members are ignored and Ts are used +// to encode and decode based on HwMode. +class EncodingByHwMode Ms = [], list Ts = []> + : HwModeSelect { + // The length of this list must be the same as the length of Ms. + list Objects = Ts; +} + //===----------------------------------------------------------------------===// // Instruction set description - These classes correspond to the C++ classes in // the Target/TargetInstrInfo.h file. @@ -447,6 +463,10 @@ class Instruction : InstructionEncoding { dag InOperandList; // An dag containing the MI use operand list. string AsmString = ""; // The .s format to print the instruction with. + // Allows specifying a canonical InstructionEncoding by HwMode. If non-empty, + // the Inst member of this Instruction is ignored. + EncodingByHwMode EncodingInfos; + // Pattern - Set to the DAG pattern for this instruction, if we know of one, // otherwise, uninitialized. list Pattern; @@ -472,6 +492,10 @@ class Instruction : InstructionEncoding { // Added complexity passed onto matching pattern. int AddedComplexity = 0; + // Indicates if this is a pre-isel opcode that should be + // legalized/regbankselected/selected. + bit isPreISelOpcode = 0; + // These bits capture information about the high-level semantics of the // instruction. bit isReturn = 0; // Is this instruction a return instruction? @@ -834,6 +858,7 @@ def f64imm : Operand; class TypedOperand : Operand { let OperandType = Ty; bit IsPointer = 0; + bit IsImmediate = 0; } def type0 : TypedOperand<"OPERAND_GENERIC_0">; @@ -852,6 +877,12 @@ let IsPointer = 1 in { def ptype5 : TypedOperand<"OPERAND_GENERIC_5">; } +// untyped_imm is for operands where isImm() will be true. It currently has no +// special behaviour and is only used for clarity. +def untyped_imm_0 : TypedOperand<"OPERAND_GENERIC_IMM_0"> { + let IsImmediate = 1; +} + /// zero_reg definition - Special node to stand for the zero register. /// def zero_reg; diff --git a/include/llvm/Target/TargetCallingConv.td b/include/llvm/Target/TargetCallingConv.td index 1bc03cf8a49d..7b1973cc3828 100644 --- a/include/llvm/Target/TargetCallingConv.td +++ b/include/llvm/Target/TargetCallingConv.td @@ -152,6 +152,12 @@ class CCBitConvertToType : CCAction { ValueType DestTy = destTy; } +/// CCTruncToType - If applied, this truncates the specified current value to +/// the specified type. +class CCTruncToType : CCAction { + ValueType DestTy = destTy; +} + /// CCPassIndirect - If applied, this stores the value to stack and passes the pointer /// as normal argument. class CCPassIndirect : CCAction { diff --git a/include/llvm/Target/TargetItinerary.td b/include/llvm/Target/TargetItinerary.td index b68ed045520c..89e5abd947d0 100644 --- a/include/llvm/Target/TargetItinerary.td +++ b/include/llvm/Target/TargetItinerary.td @@ -127,6 +127,17 @@ class ProcessorItineraries fu, list bp, list FU = fu; list BP = bp; list IID = iid; + // The packetizer automaton to use for this itinerary. By default all + // itineraries for a target are bundled up into the same automaton. This only + // works correctly when there are no conflicts in functional unit IDs between + // itineraries. 
For example, given two itineraries A<[SLOT_A]>, B<[SLOT_B]>, + // SLOT_A and SLOT_B will be assigned the same functional unit index, and + // the generated packetizer will confuse instructions referencing these slots. + // + // To avoid this, setting PacketizerNamespace to non-"" will cause this + // itinerary to be generated in a different automaton. The subtarget will need + // to declare a method "create##Namespace##DFAPacketizer()". + string PacketizerNamespace = ""; } // NoItineraries - A marker that can be used by processors without schedule diff --git a/include/llvm/Target/TargetLoweringObjectFile.h b/include/llvm/Target/TargetLoweringObjectFile.h index 3a2497bff11e..d74341b23fb1 100644 --- a/include/llvm/Target/TargetLoweringObjectFile.h +++ b/include/llvm/Target/TargetLoweringObjectFile.h @@ -191,7 +191,8 @@ public: } /// Get the target specific PC relative GOT entry relocation - virtual const MCExpr *getIndirectSymViaGOTPCRel(const MCSymbol *Sym, + virtual const MCExpr *getIndirectSymViaGOTPCRel(const GlobalValue *GV, + const MCSymbol *Sym, const MCValue &MV, int64_t Offset, MachineModuleInfo *MMI, diff --git a/include/llvm/Target/TargetMachine.h b/include/llvm/Target/TargetMachine.h index cdf9f8bfd5ea..285c0ec0fb90 100644 --- a/include/llvm/Target/TargetMachine.h +++ b/include/llvm/Target/TargetMachine.h @@ -25,7 +25,7 @@ namespace llvm { class Function; class GlobalValue; -class MachineModuleInfo; +class MachineModuleInfoWrapperPass; class Mangler; class MCAsmInfo; class MCContext; @@ -284,12 +284,13 @@ public: /// emitted. Typically this will involve several steps of code generation. /// This method should return true if emission of this file type is not /// supported, or false on success. - /// \p MMI is an optional parameter that, if set to non-nullptr, + /// \p MMIWP is an optional parameter that, if set to non-nullptr, /// will be used to set the MachineModuloInfo for this PM. - virtual bool addPassesToEmitFile(PassManagerBase &, raw_pwrite_stream &, - raw_pwrite_stream *, CodeGenFileType, - bool /*DisableVerify*/ = true, - MachineModuleInfo *MMI = nullptr) { + virtual bool + addPassesToEmitFile(PassManagerBase &, raw_pwrite_stream &, + raw_pwrite_stream *, CodeGenFileType, + bool /*DisableVerify*/ = true, + MachineModuleInfoWrapperPass *MMIWP = nullptr) { return true; } @@ -341,12 +342,13 @@ public: /// Add passes to the specified pass manager to get the specified file /// emitted. Typically this will involve several steps of code generation. - /// \p MMI is an optional parameter that, if set to non-nullptr, - /// will be used to set the MachineModuloInfofor this PM. - bool addPassesToEmitFile(PassManagerBase &PM, raw_pwrite_stream &Out, - raw_pwrite_stream *DwoOut, CodeGenFileType FileType, - bool DisableVerify = true, - MachineModuleInfo *MMI = nullptr) override; + /// \p MMIWP is an optional parameter that, if set to non-nullptr, + /// will be used to set the MachineModuloInfo for this PM. + bool + addPassesToEmitFile(PassManagerBase &PM, raw_pwrite_stream &Out, + raw_pwrite_stream *DwoOut, CodeGenFileType FileType, + bool DisableVerify = true, + MachineModuleInfoWrapperPass *MMIWP = nullptr) override; /// Add passes to the specified pass manager to get machine code emitted with /// the MCJIT. This method returns true if machine code is not supported. It @@ -365,7 +367,7 @@ public: /// Adds an AsmPrinter pass to the pipeline that prints assembly or /// machine code from the MI representation. 
bool addAsmPrinter(PassManagerBase &PM, raw_pwrite_stream &Out, - raw_pwrite_stream *DwoOut, CodeGenFileType FileTYpe, + raw_pwrite_stream *DwoOut, CodeGenFileType FileType, MCContext &Context); /// True if the target uses physical regs at Prolog/Epilog insertion diff --git a/include/llvm/Target/TargetSchedule.td b/include/llvm/Target/TargetSchedule.td index a36d259df831..24f37e94da91 100644 --- a/include/llvm/Target/TargetSchedule.td +++ b/include/llvm/Target/TargetSchedule.td @@ -563,10 +563,10 @@ class RetireControlUnit { // Base class for Load/StoreQueue. It is used to identify processor resources // which describe load/store queues in the LS unit. -class MemoryQueue { - ProcResource QueueDescriptor = PR; +class MemoryQueue { + ProcResourceKind QueueDescriptor = PR; SchedMachineModel SchedModel = ?; } -class LoadQueue : MemoryQueue; -class StoreQueue : MemoryQueue; +class LoadQueue : MemoryQueue; +class StoreQueue : MemoryQueue; diff --git a/include/llvm/Target/TargetSelectionDAG.td b/include/llvm/Target/TargetSelectionDAG.td index b913a054ac2c..441f3d7d118d 100644 --- a/include/llvm/Target/TargetSelectionDAG.td +++ b/include/llvm/Target/TargetSelectionDAG.td @@ -137,9 +137,12 @@ def SDTFPSignOp : SDTypeProfile<1, 2, [ // fcopysign. def SDTFPTernaryOp : SDTypeProfile<1, 3, [ // fmadd, fnmsub, etc. SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>, SDTCisFP<0> ]>; -def SDTIntUnaryOp : SDTypeProfile<1, 1, [ // ctlz, cttz +def SDTIntUnaryOp : SDTypeProfile<1, 1, [ // bitreverse SDTCisSameAs<0, 1>, SDTCisInt<0> ]>; +def SDTIntBitCountUnaryOp : SDTypeProfile<1, 1, [ // ctlz, cttz + SDTCisInt<0>, SDTCisInt<1> +]>; def SDTIntExtendOp : SDTypeProfile<1, 1, [ // sext, zext, anyext SDTCisInt<0>, SDTCisInt<1>, SDTCisOpSmallerThanOp<1, 0>, SDTCisSameNumEltsAs<0, 1> ]>; @@ -239,6 +242,9 @@ def SDTVecExtract : SDTypeProfile<1, 2, [ // vector extract def SDTVecInsert : SDTypeProfile<1, 3, [ // vector insert SDTCisEltOfVec<2, 1>, SDTCisSameAs<0, 1>, SDTCisPtrTy<3> ]>; +def SDTVecReduce : SDTypeProfile<1, 1, [ // vector reduction + SDTCisInt<0>, SDTCisVec<1> +]>; def SDTSubVecExtract : SDTypeProfile<1, 2, [// subvector extract SDTCisSubVecOfVec<0,1>, SDTCisInt<2> @@ -393,6 +399,7 @@ def usubsat : SDNode<"ISD::USUBSAT" , SDTIntBinOp>; def smulfix : SDNode<"ISD::SMULFIX" , SDTIntScaledBinOp, [SDNPCommutative]>; def smulfixsat : SDNode<"ISD::SMULFIXSAT", SDTIntScaledBinOp, [SDNPCommutative]>; def umulfix : SDNode<"ISD::UMULFIX" , SDTIntScaledBinOp, [SDNPCommutative]>; +def umulfixsat : SDNode<"ISD::UMULFIXSAT", SDTIntScaledBinOp, [SDNPCommutative]>; def sext_inreg : SDNode<"ISD::SIGN_EXTEND_INREG", SDTExtInreg>; def sext_invec : SDNode<"ISD::SIGN_EXTEND_VECTOR_INREG", SDTExtInvec>; @@ -401,11 +408,11 @@ def zext_invec : SDNode<"ISD::ZERO_EXTEND_VECTOR_INREG", SDTExtInvec>; def abs : SDNode<"ISD::ABS" , SDTIntUnaryOp>; def bitreverse : SDNode<"ISD::BITREVERSE" , SDTIntUnaryOp>; def bswap : SDNode<"ISD::BSWAP" , SDTIntUnaryOp>; -def ctlz : SDNode<"ISD::CTLZ" , SDTIntUnaryOp>; -def cttz : SDNode<"ISD::CTTZ" , SDTIntUnaryOp>; -def ctpop : SDNode<"ISD::CTPOP" , SDTIntUnaryOp>; -def ctlz_zero_undef : SDNode<"ISD::CTLZ_ZERO_UNDEF", SDTIntUnaryOp>; -def cttz_zero_undef : SDNode<"ISD::CTTZ_ZERO_UNDEF", SDTIntUnaryOp>; +def ctlz : SDNode<"ISD::CTLZ" , SDTIntBitCountUnaryOp>; +def cttz : SDNode<"ISD::CTTZ" , SDTIntBitCountUnaryOp>; +def ctpop : SDNode<"ISD::CTPOP" , SDTIntBitCountUnaryOp>; +def ctlz_zero_undef : SDNode<"ISD::CTLZ_ZERO_UNDEF", SDTIntBitCountUnaryOp>; +def cttz_zero_undef : 
SDNode<"ISD::CTTZ_ZERO_UNDEF", SDTIntBitCountUnaryOp>; def sext : SDNode<"ISD::SIGN_EXTEND", SDTIntExtendOp>; def zext : SDNode<"ISD::ZERO_EXTEND", SDTIntExtendOp>; def anyext : SDNode<"ISD::ANY_EXTEND" , SDTIntExtendOp>; @@ -415,6 +422,12 @@ def addrspacecast : SDNode<"ISD::ADDRSPACECAST", SDTUnaryOp>; def extractelt : SDNode<"ISD::EXTRACT_VECTOR_ELT", SDTVecExtract>; def insertelt : SDNode<"ISD::INSERT_VECTOR_ELT", SDTVecInsert>; +def vecreduce_add : SDNode<"ISD::VECREDUCE_ADD", SDTVecReduce>; +def vecreduce_smax : SDNode<"ISD::VECREDUCE_SMAX", SDTVecReduce>; +def vecreduce_umax : SDNode<"ISD::VECREDUCE_UMAX", SDTVecReduce>; +def vecreduce_smin : SDNode<"ISD::VECREDUCE_SMIN", SDTVecReduce>; +def vecreduce_umin : SDNode<"ISD::VECREDUCE_UMIN", SDTVecReduce>; + def fadd : SDNode<"ISD::FADD" , SDTFPBinOp, [SDNPCommutative]>; def fsub : SDNode<"ISD::FSUB" , SDTFPBinOp>; def fmul : SDNode<"ISD::FMUL" , SDTFPBinOp, [SDNPCommutative]>; @@ -493,12 +506,20 @@ def strict_flog2 : SDNode<"ISD::STRICT_FLOG2", SDTFPUnaryOp, [SDNPHasChain]>; def strict_frint : SDNode<"ISD::STRICT_FRINT", SDTFPUnaryOp, [SDNPHasChain]>; +def strict_lrint : SDNode<"ISD::STRICT_LRINT", + SDTFPToIntOp, [SDNPHasChain]>; +def strict_llrint : SDNode<"ISD::STRICT_LLRINT", + SDTFPToIntOp, [SDNPHasChain]>; def strict_fnearbyint : SDNode<"ISD::STRICT_FNEARBYINT", SDTFPUnaryOp, [SDNPHasChain]>; def strict_fceil : SDNode<"ISD::STRICT_FCEIL", SDTFPUnaryOp, [SDNPHasChain]>; def strict_ffloor : SDNode<"ISD::STRICT_FFLOOR", SDTFPUnaryOp, [SDNPHasChain]>; +def strict_lround : SDNode<"ISD::STRICT_LROUND", + SDTFPToIntOp, [SDNPHasChain]>; +def strict_llround : SDNode<"ISD::STRICT_LLROUND", + SDTFPToIntOp, [SDNPHasChain]>; def strict_fround : SDNode<"ISD::STRICT_FROUND", SDTFPUnaryOp, [SDNPHasChain]>; def strict_ftrunc : SDNode<"ISD::STRICT_FTRUNC", @@ -513,6 +534,10 @@ def strict_fpround : SDNode<"ISD::STRICT_FP_ROUND", SDTFPRoundOp, [SDNPHasChain]>; def strict_fpextend : SDNode<"ISD::STRICT_FP_EXTEND", SDTFPExtendOp, [SDNPHasChain]>; +def strict_fp_to_sint : SDNode<"ISD::STRICT_FP_TO_SINT", + SDTFPToIntOp, [SDNPHasChain]>; +def strict_fp_to_uint : SDNode<"ISD::STRICT_FP_TO_UINT", + SDTFPToIntOp, [SDNPHasChain]>; def setcc : SDNode<"ISD::SETCC" , SDTSetCC>; def select : SDNode<"ISD::SELECT" , SDTSelect>; @@ -638,16 +663,32 @@ def assertzext : SDNode<"ISD::AssertZext", SDT_assertext>; //===----------------------------------------------------------------------===// // Selection DAG Condition Codes -class CondCode; // ISD::CondCode enums -def SETOEQ : CondCode; def SETOGT : CondCode; -def SETOGE : CondCode; def SETOLT : CondCode; def SETOLE : CondCode; -def SETONE : CondCode; def SETO : CondCode; def SETUO : CondCode; -def SETUEQ : CondCode; def SETUGT : CondCode; def SETUGE : CondCode; -def SETULT : CondCode; def SETULE : CondCode; def SETUNE : CondCode; - -def SETEQ : CondCode; def SETGT : CondCode; def SETGE : CondCode; -def SETLT : CondCode; def SETLE : CondCode; def SETNE : CondCode; - +class CondCode { + string ICmpPredicate = icmpName; + string FCmpPredicate = fcmpName; +} + +// ISD::CondCode enums, and mapping to CmpInst::Predicate names +def SETOEQ : CondCode<"FCMP_OEQ">; +def SETOGT : CondCode<"FCMP_OGT">; +def SETOGE : CondCode<"FCMP_OGE">; +def SETOLT : CondCode<"FCMP_OLT">; +def SETOLE : CondCode<"FCMP_OLE">; +def SETONE : CondCode<"FCMP_ONE">; +def SETO : CondCode<"FCMP_ORD">; +def SETUO : CondCode<"FCMP_UNO">; +def SETUEQ : CondCode<"FCMP_UEQ">; +def SETUGT : CondCode<"FCMP_UGT", "ICMP_UGT">; +def SETUGE : CondCode<"FCMP_UGE", 
"ICMP_UGE">; +def SETULT : CondCode<"FCMP_ULT", "ICMP_ULT">; +def SETULE : CondCode<"FCMP_ULE", "ICMP_ULE">; +def SETUNE : CondCode<"FCMP_UNE">; +def SETEQ : CondCode<"", "ICMP_EQ">; +def SETGT : CondCode<"", "ICMP_SGT">; +def SETGE : CondCode<"", "ICMP_SGE">; +def SETLT : CondCode<"", "ICMP_SLT">; +def SETLE : CondCode<"", "ICMP_SLE">; +def SETNE : CondCode<"", "ICMP_NE">; //===----------------------------------------------------------------------===// // Selection DAG Node Transformation Functions. @@ -741,6 +782,10 @@ class PatFrags frags, code pred = [{}], // If this empty, accept any address space. list AddressSpaces = ?; + // cast(N)->getAlignment() >= + // If this is empty, accept any alignment. + int MinAlignment = ?; + // cast(N)->getOrdering() == AtomicOrdering::Monotonic bit IsAtomicOrderingMonotonic = ?; // cast(N)->getOrdering() == AtomicOrdering::Acquire @@ -766,8 +811,6 @@ class PatFrags frags, code pred = [{}], // cast(N)->getMemoryVT().getScalarType() == MVT::; // cast(N)->getMemoryVT().getScalarType() == MVT::; ValueType ScalarMemoryVT = ?; - - // TODO: Add alignment } // PatFrag - A version of PatFrags matching only a single fragment. @@ -813,6 +856,11 @@ class ImmLeaf : ImmLeaf; + // An ImmLeaf except that Imm is an APInt. This is useful when you need to // zero-extend the immediate instead of sign-extend it. // @@ -1111,6 +1159,16 @@ def pre_truncstf32 : PatFrag<(ops node:$val, node:$base, node:$offset), let IsStore = 1; let MemoryVT = f32; } +def pre_truncstvi8 : PatFrag<(ops node:$val, node:$base, node:$offset), + (pre_truncst node:$val, node:$base, node:$offset)> { + let IsStore = 1; + let ScalarMemoryVT = i8; +} +def pre_truncstvi16 : PatFrag<(ops node:$val, node:$base, node:$offset), + (pre_truncst node:$val, node:$base, node:$offset)> { + let IsStore = 1; + let ScalarMemoryVT = i16; +} def post_store : PatFrag<(ops node:$val, node:$ptr, node:$offset), (istore node:$val, node:$ptr, node:$offset), [{ @@ -1148,14 +1206,26 @@ def post_truncstf32 : PatFrag<(ops node:$val, node:$base, node:$offset), let IsStore = 1; let MemoryVT = f32; } +def post_truncstvi8 : PatFrag<(ops node:$val, node:$base, node:$offset), + (post_truncst node:$val, node:$base, node:$offset)> { + let IsStore = 1; + let ScalarMemoryVT = i8; +} +def post_truncstvi16 : PatFrag<(ops node:$val, node:$base, node:$offset), + (post_truncst node:$val, node:$base, node:$offset)> { + let IsStore = 1; + let ScalarMemoryVT = i16; +} -def nonvolatile_load : PatFrag<(ops node:$ptr), - (load node:$ptr), [{ - return !cast(N)->isVolatile(); +// TODO: Split these into volatile and unordered flavors to enable +// selectively legal optimizations for each. (See D66309) +def simple_load : PatFrag<(ops node:$ptr), + (load node:$ptr), [{ + return cast(N)->isSimple(); }]>; -def nonvolatile_store : PatFrag<(ops node:$val, node:$ptr), - (store node:$val, node:$ptr), [{ - return !cast(N)->isVolatile(); +def simple_store : PatFrag<(ops node:$val, node:$ptr), + (store node:$val, node:$ptr), [{ + return cast(N)->isSimple(); }]>; // nontemporal store fragments. 
@@ -1277,6 +1347,12 @@ def any_flog2 : PatFrags<(ops node:$src), def any_frint : PatFrags<(ops node:$src), [(strict_frint node:$src), (frint node:$src)]>; +def any_lrint : PatFrags<(ops node:$src), + [(strict_lrint node:$src), + (lrint node:$src)]>; +def any_llrint : PatFrags<(ops node:$src), + [(strict_llrint node:$src), + (llrint node:$src)]>; def any_fnearbyint : PatFrags<(ops node:$src), [(strict_fnearbyint node:$src), (fnearbyint node:$src)]>; @@ -1286,6 +1362,12 @@ def any_fceil : PatFrags<(ops node:$src), def any_ffloor : PatFrags<(ops node:$src), [(strict_ffloor node:$src), (ffloor node:$src)]>; +def any_lround : PatFrags<(ops node:$src), + [(strict_lround node:$src), + (lround node:$src)]>; +def any_llround : PatFrags<(ops node:$src), + [(strict_llround node:$src), + (llround node:$src)]>; def any_fround : PatFrags<(ops node:$src), [(strict_fround node:$src), (fround node:$src)]>; @@ -1310,6 +1392,12 @@ def any_extloadf32 : PatFrags<(ops node:$ptr), def any_extloadf64 : PatFrags<(ops node:$ptr), [(strict_extloadf64 node:$ptr), (extloadf64 node:$ptr)]>; +def any_fp_to_sint : PatFrags<(ops node:$src), + [(strict_fp_to_sint node:$src), + (fp_to_sint node:$src)]>; +def any_fp_to_uint : PatFrags<(ops node:$src), + [(strict_fp_to_uint node:$src), + (fp_to_uint node:$src)]>; multiclass binary_atomic_op_ord { def #NAME#_monotonic : PatFrag<(ops node:$ptr, node:$val), @@ -1367,26 +1455,26 @@ multiclass ternary_atomic_op_ord { } } -multiclass binary_atomic_op { +multiclass binary_atomic_op { def _8 : PatFrag<(ops node:$ptr, node:$val), (atomic_op node:$ptr, node:$val)> { let IsAtomic = 1; - let MemoryVT = i8; + let MemoryVT = !if(IsInt, i8, ?); } def _16 : PatFrag<(ops node:$ptr, node:$val), (atomic_op node:$ptr, node:$val)> { let IsAtomic = 1; - let MemoryVT = i16; + let MemoryVT = !if(IsInt, i16, f16); } def _32 : PatFrag<(ops node:$ptr, node:$val), (atomic_op node:$ptr, node:$val)> { let IsAtomic = 1; - let MemoryVT = i32; + let MemoryVT = !if(IsInt, i32, f32); } def _64 : PatFrag<(ops node:$ptr, node:$val), (atomic_op node:$ptr, node:$val)> { let IsAtomic = 1; - let MemoryVT = i64; + let MemoryVT = !if(IsInt, i64, f64); } defm NAME#_8 : binary_atomic_op_ord; diff --git a/include/llvm/TextAPI/MachO/Architecture.h b/include/llvm/TextAPI/MachO/Architecture.h index 055baeb0c0f0..3898cbada68f 100644 --- a/include/llvm/TextAPI/MachO/Architecture.h +++ b/include/llvm/TextAPI/MachO/Architecture.h @@ -14,6 +14,7 @@ #define LLVM_TEXTAPI_MACHO_ARCHITECTURE_H #include "llvm/ADT/StringRef.h" +#include "llvm/ADT/Triple.h" #include "llvm/Support/raw_ostream.h" namespace llvm { @@ -39,6 +40,9 @@ StringRef getArchitectureName(Architecture Arch); /// Convert an architecture slice to a CPU Type and Subtype pair. std::pair getCPUTypeFromArchitecture(Architecture Arch); +/// Convert a target to an architecture slice. +Architecture mapToArchitecture(const llvm::Triple &Target); + raw_ostream &operator<<(raw_ostream &OS, Architecture Arch); } // end namespace MachO. 
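The Architecture.h hunk above only declares the new mapToArchitecture() helper. A minimal usage sketch follows; it is not part of the patch, and the example triple string, the main() wrapper, and the use of llvm::outs() are illustrative assumptions. Only mapToArchitecture() and the pre-existing getArchitectureName() are taken from the header shown above.

    // Sketch: map a parsed llvm::Triple to a TextAPI architecture slice and
    // print its canonical name. Assumes an LLVM build of roughly this vintage.
    #include "llvm/ADT/Triple.h"
    #include "llvm/Support/raw_ostream.h"
    #include "llvm/TextAPI/MachO/Architecture.h"

    int main() {
      // Example triple (assumption, not from the diff).
      llvm::Triple T("x86_64-apple-macosx10.15");
      // New helper declared in the hunk above.
      llvm::MachO::Architecture Arch = llvm::MachO::mapToArchitecture(T);
      // getArchitectureName() already exists in Architecture.h and returns the
      // canonical slice name (e.g. "x86_64").
      llvm::outs() << llvm::MachO::getArchitectureName(Arch) << "\n";
      return 0;
    }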
diff --git a/include/llvm/TextAPI/MachO/ArchitectureSet.h b/include/llvm/TextAPI/MachO/ArchitectureSet.h index d8dfc7f1af21..6e4ede6275b4 100644 --- a/include/llvm/TextAPI/MachO/ArchitectureSet.h +++ b/include/llvm/TextAPI/MachO/ArchitectureSet.h @@ -59,6 +59,10 @@ public: ArchSetType rawValue() const { return ArchSet; } + bool hasX86() const { + return has(AK_i386) || has(AK_x86_64) || has(AK_x86_64h); + } + template class arch_iterator : public std::iterator { diff --git a/include/llvm/TextAPI/MachO/InterfaceFile.h b/include/llvm/TextAPI/MachO/InterfaceFile.h index e722449d52f1..bd434e04b693 100644 --- a/include/llvm/TextAPI/MachO/InterfaceFile.h +++ b/include/llvm/TextAPI/MachO/InterfaceFile.h @@ -26,21 +26,13 @@ #include "llvm/TextAPI/MachO/Architecture.h" #include "llvm/TextAPI/MachO/ArchitectureSet.h" #include "llvm/TextAPI/MachO/PackedVersion.h" +#include "llvm/TextAPI/MachO/Platform.h" #include "llvm/TextAPI/MachO/Symbol.h" +#include "llvm/TextAPI/MachO/Target.h" namespace llvm { namespace MachO { -/// Defines the list of MachO platforms. -enum class PlatformKind : unsigned { - unknown, - macOS = MachO::PLATFORM_MACOS, - iOS = MachO::PLATFORM_IOS, - tvOS = MachO::PLATFORM_TVOS, - watchOS = MachO::PLATFORM_WATCHOS, - bridgeOS = MachO::PLATFORM_BRIDGEOS, -}; - /// Defines a list of Objective-C constraints. enum class ObjCConstraintType : unsigned { /// No constraint. @@ -75,6 +67,9 @@ enum FileType : unsigned { /// Text-based stub file (.tbd) version 3.0 TBD_V3 = 1U << 2, + /// Text-based stub file (.tbd) version 4.0 + TBD_V4 = 1U << 3, + All = ~0U, LLVM_MARK_AS_BITMASK_ENUM(/*LargestValue=*/All), @@ -89,29 +84,42 @@ public: InterfaceFileRef(StringRef InstallName) : InstallName(InstallName) {} - InterfaceFileRef(StringRef InstallName, ArchitectureSet Archs) - : InstallName(InstallName), Architectures(Archs) {} + InterfaceFileRef(StringRef InstallName, const TargetList Targets) + : InstallName(InstallName), Targets(std::move(Targets)) {} StringRef getInstallName() const { return InstallName; }; - void addArchitectures(ArchitectureSet Archs) { Architectures |= Archs; } - ArchitectureSet getArchitectures() const { return Architectures; } - bool hasArchitecture(Architecture Arch) const { - return Architectures.has(Arch); + + void addTarget(const Target &Target); + template void addTargets(RangeT &&Targets) { + for (const auto &Target : Targets) + addTarget(Target(Target)); } + using const_target_iterator = TargetList::const_iterator; + using const_target_range = llvm::iterator_range; + const_target_range targets() const { return {Targets}; } + + ArchitectureSet getArchitectures() const { + return mapToArchitectureSet(Targets); + } + + PlatformSet getPlatforms() const { return mapToPlatformSet(Targets); } + bool operator==(const InterfaceFileRef &O) const { - return std::tie(InstallName, Architectures) == - std::tie(O.InstallName, O.Architectures); + return std::tie(InstallName, Targets) == std::tie(O.InstallName, O.Targets); + } + + bool operator!=(const InterfaceFileRef &O) const { + return std::tie(InstallName, Targets) != std::tie(O.InstallName, O.Targets); } bool operator<(const InterfaceFileRef &O) const { - return std::tie(InstallName, Architectures) < - std::tie(O.InstallName, O.Architectures); + return std::tie(InstallName, Targets) < std::tie(O.InstallName, O.Targets); } private: std::string InstallName; - ArchitectureSet Architectures; + TargetList Targets; }; } // end namespace MachO. @@ -170,27 +178,43 @@ public: /// \return The file type. 
FileType getFileType() const { return FileKind; } - /// Set the platform. - void setPlatform(PlatformKind Platform_) { Platform = Platform_; } + /// Get the architectures. + /// + /// \return The applicable architectures. + ArchitectureSet getArchitectures() const { + return mapToArchitectureSet(Targets); + } - /// Get the platform. - PlatformKind getPlatform() const { return Platform; } + /// Get the platforms. + /// + /// \return The applicable platforms. + PlatformSet getPlatforms() const { return mapToPlatformSet(Targets); } - /// Specify the set of supported architectures by this file. - void setArchitectures(ArchitectureSet Architectures_) { - Architectures = Architectures_; - } + /// Set and add target. + /// + /// \param Target the target to add into. + void addTarget(const Target &Target); - /// Add the set of supported architectures by this file. - void addArchitectures(ArchitectureSet Architectures_) { - Architectures |= Architectures_; + /// Set and add targets. + /// + /// Add the subset of llvm::triples that is supported by Tapi + /// + /// \param Targets the collection of targets. + template void addTargets(RangeT &&Targets) { + for (const auto &Target_ : Targets) + addTarget(Target(Target_)); } - /// Add supported architecture by this file.. - void addArch(Architecture Arch) { Architectures.set(Arch); } + using const_target_iterator = TargetList::const_iterator; + using const_target_range = llvm::iterator_range; + const_target_range targets() const { return {Targets}; } - /// Get the set of supported architectures. - ArchitectureSet getArchitectures() const { return Architectures; } + using const_filtered_target_iterator = + llvm::filter_iterator>; + using const_filtered_target_range = + llvm::iterator_range; + const_filtered_target_range targets(ArchitectureSet Archs) const; /// Set the install name of the library. void setInstallName(StringRef InstallName_) { InstallName = InstallName_; } @@ -244,11 +268,18 @@ public: /// Check if this file was generated during InstallAPI. bool isInstallAPI() const { return IsInstallAPI; } - /// Set the parent umbrella framework. - void setParentUmbrella(StringRef Parent) { ParentUmbrella = Parent; } + /// Set the parent umbrella frameworks. + /// \param Target_ The target applicable to Parent + /// \param Parent The name of Parent + void addParentUmbrella(const Target &Target_, StringRef Parent); + const std::vector> &umbrellas() const { + return ParentUmbrellas; + } /// Get the parent umbrella framework. - StringRef getParentUmbrella() const { return ParentUmbrella; } + const std::vector> getParentUmbrellas() const { + return ParentUmbrellas; + } /// Add an allowable client. /// @@ -257,9 +288,9 @@ public: /// that is being generated needs to match one of the allowable clients or the /// linker refuses to link this library. /// - /// \param Name The name of the client that is allowed to link this library. - /// \param Architectures The set of architecture for which this applies. - void addAllowableClient(StringRef Name, ArchitectureSet Architectures); + /// \param InstallName The name of the client that is allowed to link this library. + /// \param Target The target triple for which this applies. + void addAllowableClient(StringRef InstallName, const Target &Target); /// Get the list of allowable clients. /// @@ -271,9 +302,8 @@ public: /// Add a re-exported library. /// /// \param InstallName The name of the library to re-export. - /// \param Architectures The set of architecture for which this applies. 
- void addReexportedLibrary(StringRef InstallName, - ArchitectureSet Architectures); + /// \param Target The target triple for which this applies. + void addReexportedLibrary(StringRef InstallName, const Target &Target); /// Get the list of re-exported libraries. /// @@ -282,27 +312,27 @@ public: return ReexportedLibraries; } - /// Add an architecture/UUID pair. + /// Add an Target/UUID pair. /// - /// \param Arch The architecture for which this applies. + /// \param Target The target triple for which this applies. /// \param UUID The UUID of the library for the specified architecture. - void addUUID(Architecture Arch, StringRef UUID); + void addUUID(const Target &Target, StringRef UUID); - /// Add an architecture/UUID pair. + /// Add an Target/UUID pair. /// - /// \param Arch The architecture for which this applies. + /// \param Target The target triple for which this applies. /// \param UUID The UUID of the library for the specified architecture. - void addUUID(Architecture Arch, uint8_t UUID[16]); + void addUUID(const Target &Target, uint8_t UUID[16]); - /// Get the list of architecture/UUID pairs. + /// Get the list of Target/UUID pairs. /// - /// \return Returns a list of architecture/UUID pairs. - const std::vector> &uuids() const { + /// \return Returns a list of Target/UUID pairs. + const std::vector> &uuids() const { return UUIDs; } /// Add a symbol to the symbols list or extend an existing one. - void addSymbol(SymbolKind Kind, StringRef Name, ArchitectureSet Architectures, + void addSymbol(SymbolKind Kind, StringRef Name, const TargetList &Targets, SymbolFlags Flags = SymbolFlags::None); using SymbolMapType = DenseMap; @@ -320,84 +350,35 @@ public: reference operator*() const { return I->second; } pointer operator->() const { return I->second; } }; - using const_symbol_range = iterator_range; - - // Custom iterator to return only exported symbols. - struct const_export_iterator - : public iterator_adaptor_base< - const_export_iterator, const_symbol_iterator, - std::forward_iterator_tag, const Symbol *> { - const_symbol_iterator _end; - - void skipToNextSymbol() { - while (I != _end && I->isUndefined()) - ++I; - } - - const_export_iterator() = default; - template - const_export_iterator(U &&it, U &&end) - : iterator_adaptor_base(std::forward(it)), - _end(std::forward(end)) { - skipToNextSymbol(); - } - - const_export_iterator &operator++() { - ++I; - skipToNextSymbol(); - return *this; - } - - const_export_iterator operator++(int) { - const_export_iterator tmp(*this); - ++(*this); - return tmp; - } - }; - using const_export_range = llvm::iterator_range; - - // Custom iterator to return only undefined symbols. 
- struct const_undefined_iterator - : public iterator_adaptor_base< - const_undefined_iterator, const_symbol_iterator, - std::forward_iterator_tag, const Symbol *> { - const_symbol_iterator _end; - void skipToNextSymbol() { - while (I != _end && !I->isUndefined()) - ++I; - } + using const_symbol_range = iterator_range; - const_undefined_iterator() = default; - template - const_undefined_iterator(U &&it, U &&end) - : iterator_adaptor_base(std::forward(it)), - _end(std::forward(end)) { - skipToNextSymbol(); - } - - const_undefined_iterator &operator++() { - ++I; - skipToNextSymbol(); - return *this; - } - - const_undefined_iterator operator++(int) { - const_undefined_iterator tmp(*this); - ++(*this); - return tmp; - } - }; - using const_undefined_range = llvm::iterator_range; + using const_filtered_symbol_iterator = + filter_iterator>; + using const_filtered_symbol_range = + iterator_range; const_symbol_range symbols() const { return {Symbols.begin(), Symbols.end()}; } - const_export_range exports() const { - return {{Symbols.begin(), Symbols.end()}, {Symbols.end(), Symbols.end()}}; + + const_filtered_symbol_range exports() const { + std::function fn = [](const Symbol *Symbol) { + return !Symbol->isUndefined(); + }; + return make_filter_range( + make_range({Symbols.begin()}, {Symbols.end()}), + fn); } - const_undefined_range undefineds() const { - return {{Symbols.begin(), Symbols.end()}, {Symbols.end(), Symbols.end()}}; + + const_filtered_symbol_range undefineds() const { + std::function fn = [](const Symbol *Symbol) { + return Symbol->isUndefined(); + }; + return make_filter_range( + make_range({Symbols.begin()}, {Symbols.end()}), + fn); } private: @@ -411,10 +392,9 @@ private: return StringRef(reinterpret_cast(Ptr), String.size()); } + TargetList Targets; std::string Path; FileType FileKind; - PlatformKind Platform; - ArchitectureSet Architectures; std::string InstallName; PackedVersion CurrentVersion; PackedVersion CompatibilityVersion; @@ -423,10 +403,10 @@ private: bool IsAppExtensionSafe{false}; bool IsInstallAPI{false}; ObjCConstraintType ObjcConstraint = ObjCConstraintType::None; - std::string ParentUmbrella; + std::vector> ParentUmbrellas; std::vector AllowableClients; std::vector ReexportedLibraries; - std::vector> UUIDs; + std::vector> UUIDs; SymbolMapType Symbols; }; diff --git a/include/llvm/TextAPI/MachO/Platform.h b/include/llvm/TextAPI/MachO/Platform.h new file mode 100644 index 000000000000..a22aae9b7dce --- /dev/null +++ b/include/llvm/TextAPI/MachO/Platform.h @@ -0,0 +1,45 @@ +//===- llvm/TextAPI/MachO/Platform.h - Platform -----------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Defines the Platforms supported by Tapi and helpers. +// +//===----------------------------------------------------------------------===// +#ifndef LLVM_TEXTAPI_MACHO_PLATFORM_H +#define LLVM_TEXTAPI_MACHO_PLATFORM_H + +#include "llvm/ADT/SmallSet.h" +#include "llvm/BinaryFormat/MachO.h" + +namespace llvm { +namespace MachO { + +/// Defines the list of MachO platforms. 
+enum class PlatformKind : unsigned { + unknown, + macOS = MachO::PLATFORM_MACOS, + iOS = MachO::PLATFORM_IOS, + tvOS = MachO::PLATFORM_TVOS, + watchOS = MachO::PLATFORM_WATCHOS, + bridgeOS = MachO::PLATFORM_BRIDGEOS, + macCatalyst = MachO::PLATFORM_MACCATALYST, + iOSSimulator = MachO::PLATFORM_IOSSIMULATOR, + tvOSSimulator = MachO::PLATFORM_TVOSSIMULATOR, + watchOSSimulator = MachO::PLATFORM_WATCHOSSIMULATOR +}; + +using PlatformSet = SmallSet; + +PlatformKind mapToPlatformKind(PlatformKind Platform, bool WantSim); +PlatformKind mapToPlatformKind(const Triple &Target); +PlatformSet mapToPlatformSet(ArrayRef Targets); +StringRef getPlatformName(PlatformKind Platform); + +} // end namespace MachO. +} // end namespace llvm. + +#endif // LLVM_TEXTAPI_MACHO_PLATFORM_H \ No newline at end of file diff --git a/include/llvm/TextAPI/MachO/Symbol.h b/include/llvm/TextAPI/MachO/Symbol.h index 3c7ff5e0f4ea..1b1632c599c4 100644 --- a/include/llvm/TextAPI/MachO/Symbol.h +++ b/include/llvm/TextAPI/MachO/Symbol.h @@ -14,6 +14,7 @@ #include "llvm/Support/Error.h" #include "llvm/Support/raw_ostream.h" #include "llvm/TextAPI/MachO/ArchitectureSet.h" +#include "llvm/TextAPI/MachO/Target.h" namespace llvm { namespace MachO { @@ -37,7 +38,10 @@ enum class SymbolFlags : uint8_t { /// Undefined Undefined = 1U << 3, - LLVM_MARK_AS_BITMASK_ENUM(/*LargestValue=*/Undefined), + /// Rexported + Rexported = 1U << 4, + + LLVM_MARK_AS_BITMASK_ENUM(/*LargestValue=*/Rexported), }; // clang-format on @@ -49,16 +53,18 @@ enum class SymbolKind : uint8_t { ObjectiveCInstanceVariable, }; +using TargetList = SmallVector; class Symbol { public: - constexpr Symbol(SymbolKind Kind, StringRef Name, - ArchitectureSet Architectures, SymbolFlags Flags) - : Name(Name), Architectures(Architectures), Kind(Kind), Flags(Flags) {} + Symbol(SymbolKind Kind, StringRef Name, TargetList Targets, SymbolFlags Flags) + : Name(Name), Targets(std::move(Targets)), Kind(Kind), Flags(Flags) {} + void addTarget(Target target) { Targets.emplace_back(target); } SymbolKind getKind() const { return Kind; } StringRef getName() const { return Name; } - ArchitectureSet getArchitectures() const { return Architectures; } - void addArchitectures(ArchitectureSet Archs) { Architectures |= Archs; } + ArchitectureSet getArchitectures() const { + return mapToArchitectureSet(Targets); + } SymbolFlags getFlags() const { return Flags; } bool isWeakDefined() const { @@ -78,6 +84,21 @@ public: return (Flags & SymbolFlags::Undefined) == SymbolFlags::Undefined; } + bool isReexported() const { + return (Flags & SymbolFlags::Rexported) == SymbolFlags::Rexported; + } + + using const_target_iterator = TargetList::const_iterator; + using const_target_range = llvm::iterator_range; + const_target_range targets() const { return {Targets}; } + + using const_filtered_target_iterator = + llvm::filter_iterator>; + using const_filtered_target_range = + llvm::iterator_range; + const_filtered_target_range targets(ArchitectureSet architectures) const; + #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void dump(raw_ostream &OS) const; void dump() const { dump(llvm::errs()); } @@ -85,7 +106,7 @@ public: private: StringRef Name; - ArchitectureSet Architectures; + TargetList Targets; SymbolKind Kind; SymbolFlags Flags; }; diff --git a/include/llvm/TextAPI/MachO/Target.h b/include/llvm/TextAPI/MachO/Target.h new file mode 100644 index 000000000000..5fe44cb7d366 --- /dev/null +++ b/include/llvm/TextAPI/MachO/Target.h @@ -0,0 +1,68 @@ +//===- llvm/TextAPI/Target.h - TAPI Target 
----------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TEXTAPI_MACHO_TARGET_H +#define LLVM_TEXTAPI_MACHO_TARGET_H + +#include "llvm/ADT/Triple.h" +#include "llvm/Support/Error.h" +#include "llvm/TextAPI/MachO/Architecture.h" +#include "llvm/TextAPI/MachO/ArchitectureSet.h" +#include "llvm/TextAPI/MachO/Platform.h" + +namespace llvm { +namespace MachO { + +// This is similar to a llvm Triple, but the triple doesn't have all the +// information we need. For example there is no enum value for x86_64h. The +// only way to get that information is to parse the triple string. +class Target { +public: + Target() = default; + Target(Architecture Arch, PlatformKind Platform) + : Arch(Arch), Platform(Platform) {} + explicit Target(const llvm::Triple &Triple) + : Arch(mapToArchitecture(Triple)), Platform(mapToPlatformKind(Triple)) {} + + static llvm::Expected create(StringRef Target); + + operator std::string() const; + + Architecture Arch; + PlatformKind Platform; +}; + +inline bool operator==(const Target &LHS, const Target &RHS) { + return std::tie(LHS.Arch, LHS.Platform) == std::tie(RHS.Arch, RHS.Platform); +} + +inline bool operator!=(const Target &LHS, const Target &RHS) { + return std::tie(LHS.Arch, LHS.Platform) != std::tie(RHS.Arch, RHS.Platform); +} + +inline bool operator<(const Target &LHS, const Target &RHS) { + return std::tie(LHS.Arch, LHS.Platform) < std::tie(RHS.Arch, RHS.Platform); +} + +inline bool operator==(const Target &LHS, const Architecture &RHS) { + return LHS.Arch == RHS; +} + +inline bool operator!=(const Target &LHS, const Architecture &RHS) { + return LHS.Arch != RHS; +} + +PlatformSet mapToPlatformSet(ArrayRef Targets); +ArchitectureSet mapToArchitectureSet(ArrayRef Targets); + +raw_ostream &operator<<(raw_ostream &OS, const Target &Target); + +} // namespace MachO +} // namespace llvm + +#endif // LLVM_TEXTAPI_MACHO_TARGET_H diff --git a/include/llvm/TextAPI/MachO/TextAPIReader.h b/include/llvm/TextAPI/MachO/TextAPIReader.h index 6d9c09de5294..c551f0454e8e 100644 --- a/include/llvm/TextAPI/MachO/TextAPIReader.h +++ b/include/llvm/TextAPI/MachO/TextAPIReader.h @@ -20,10 +20,7 @@ class InterfaceFile; class TextAPIReader { public: static Expected> - get(std::unique_ptr InputBuffer); - - static Expected> - getUnmanaged(llvm::MemoryBuffer *InputBuffer); + get(MemoryBufferRef InputBuffer); TextAPIReader() = delete; }; diff --git a/include/llvm/Transforms/IPO/Attributor.h b/include/llvm/Transforms/IPO/Attributor.h index 5dbe21ac5e4e..3dbe0fcd76ea 100644 --- a/include/llvm/Transforms/IPO/Attributor.h +++ b/include/llvm/Transforms/IPO/Attributor.h @@ -60,13 +60,12 @@ // manifest their result in the IR for passes to come. // // Attribute manifestation is not mandatory. If desired, there is support to -// generate a single LLVM-IR attribute already in the AbstractAttribute base -// class. In the simplest case, a subclass overloads -// `AbstractAttribute::getManifestPosition()` and -// `AbstractAttribute::getAttrKind()` to return the appropriate values. The -// Attributor manifestation framework will then create and place a new attribute -// if it is allowed to do so (based on the abstract state). Other use cases can -// be achieved by overloading other abstract attribute methods. 
+// generate a single or multiple LLVM-IR attributes already in the helper struct +// IRAttribute. In the simplest case, a subclass inherits from IRAttribute with +// a proper Attribute::AttrKind as template parameter. The Attributor +// manifestation framework will then create and place a new attribute if it is +// allowed to do so (based on the abstract state). Other use cases can be +// achieved by overloading AbstractAttribute or IRAttribute methods. // // // The "mechanics" of adding a new "abstract attribute": @@ -97,7 +96,13 @@ #ifndef LLVM_TRANSFORMS_IPO_ATTRIBUTOR_H #define LLVM_TRANSFORMS_IPO_ATTRIBUTOR_H -#include "llvm/Analysis/LazyCallGraph.h" +#include "llvm/ADT/MapVector.h" +#include "llvm/ADT/SCCIterator.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/CallGraph.h" +#include "llvm/Analysis/MustExecute.h" +#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/IR/CallSite.h" #include "llvm/IR/PassManager.h" @@ -105,6 +110,7 @@ namespace llvm { struct AbstractAttribute; struct InformationCache; +struct AAIsDead; class Function; @@ -120,6 +126,563 @@ ChangeStatus operator|(ChangeStatus l, ChangeStatus r); ChangeStatus operator&(ChangeStatus l, ChangeStatus r); ///} +/// Helper to describe and deal with positions in the LLVM-IR. +/// +/// A position in the IR is described by an anchor value and an "offset" that +/// could be the argument number, for call sites and arguments, or an indicator +/// of the "position kind". The kinds, specified in the Kind enum below, include +/// the locations in the attribute list, i.a., function scope and return value, +/// as well as a distinction between call sites and functions. Finally, there +/// are floating values that do not have a corresponding attribute list +/// position. +struct IRPosition { + virtual ~IRPosition() {} + + /// The positions we distinguish in the IR. + /// + /// The values are chosen such that the KindOrArgNo member has a value >= 1 + /// if it is an argument or call site argument while a value < 1 indicates the + /// respective kind of that value. + enum Kind : int { + IRP_INVALID = -6, ///< An invalid position. + IRP_FLOAT = -5, ///< A position that is not associated with a spot suitable + ///< for attributes. This could be any value or instruction. + IRP_RETURNED = -4, ///< An attribute for the function return value. + IRP_CALL_SITE_RETURNED = -3, ///< An attribute for a call site return value. + IRP_FUNCTION = -2, ///< An attribute for a function (scope). + IRP_CALL_SITE = -1, ///< An attribute for a call site (function scope). + IRP_ARGUMENT = 0, ///< An attribute for a function argument. + IRP_CALL_SITE_ARGUMENT = 1, ///< An attribute for a call site argument. + }; + + /// Default constructor available to create invalid positions implicitly. All + /// other positions need to be created explicitly through the appropriate + /// static member function. + IRPosition() : AnchorVal(nullptr), KindOrArgNo(IRP_INVALID) { verify(); } + + /// Create a position describing the value of \p V. + static const IRPosition value(const Value &V) { + if (auto *Arg = dyn_cast(&V)) + return IRPosition::argument(*Arg); + if (auto *CB = dyn_cast(&V)) + return IRPosition::callsite_returned(*CB); + return IRPosition(const_cast(V), IRP_FLOAT); + } + + /// Create a position describing the function scope of \p F. + static const IRPosition function(const Function &F) { + return IRPosition(const_cast(F), IRP_FUNCTION); + } + + /// Create a position describing the returned value of \p F. 
+ static const IRPosition returned(const Function &F) { + return IRPosition(const_cast(F), IRP_RETURNED); + } + + /// Create a position describing the argument \p Arg. + static const IRPosition argument(const Argument &Arg) { + return IRPosition(const_cast(Arg), Kind(Arg.getArgNo())); + } + + /// Create a position describing the function scope of \p CB. + static const IRPosition callsite_function(const CallBase &CB) { + return IRPosition(const_cast(CB), IRP_CALL_SITE); + } + + /// Create a position describing the returned value of \p CB. + static const IRPosition callsite_returned(const CallBase &CB) { + return IRPosition(const_cast(CB), IRP_CALL_SITE_RETURNED); + } + + /// Create a position describing the argument of \p CB at position \p ArgNo. + static const IRPosition callsite_argument(const CallBase &CB, + unsigned ArgNo) { + return IRPosition(const_cast(CB), Kind(ArgNo)); + } + + /// Create a position describing the function scope of \p ICS. + static const IRPosition callsite_function(ImmutableCallSite ICS) { + return IRPosition::callsite_function(cast(*ICS.getInstruction())); + } + + /// Create a position describing the returned value of \p ICS. + static const IRPosition callsite_returned(ImmutableCallSite ICS) { + return IRPosition::callsite_returned(cast(*ICS.getInstruction())); + } + + /// Create a position describing the argument of \p ICS at position \p ArgNo. + static const IRPosition callsite_argument(ImmutableCallSite ICS, + unsigned ArgNo) { + return IRPosition::callsite_argument(cast(*ICS.getInstruction()), + ArgNo); + } + + /// Create a position describing the argument of \p ACS at position \p ArgNo. + static const IRPosition callsite_argument(AbstractCallSite ACS, + unsigned ArgNo) { + int CSArgNo = ACS.getCallArgOperandNo(ArgNo); + if (CSArgNo >= 0) + return IRPosition::callsite_argument( + cast(*ACS.getInstruction()), CSArgNo); + return IRPosition(); + } + + /// Create a position with function scope matching the "context" of \p IRP. + /// If \p IRP is a call site (see isAnyCallSitePosition()) then the result + /// will be a call site position, otherwise the function position of the + /// associated function. + static const IRPosition function_scope(const IRPosition &IRP) { + if (IRP.isAnyCallSitePosition()) { + return IRPosition::callsite_function( + cast(IRP.getAnchorValue())); + } + assert(IRP.getAssociatedFunction()); + return IRPosition::function(*IRP.getAssociatedFunction()); + } + + bool operator==(const IRPosition &RHS) const { + return (AnchorVal == RHS.AnchorVal) && (KindOrArgNo == RHS.KindOrArgNo); + } + bool operator!=(const IRPosition &RHS) const { return !(*this == RHS); } + + /// Return the value this abstract attribute is anchored with. + /// + /// The anchor value might not be the associated value if the latter is not + /// sufficient to determine where arguments will be manifested. This is, so + /// far, only the case for call site arguments as the value is not sufficient + /// to pinpoint them. Instead, we can use the call site as an anchor. + /// + ///{ + Value &getAnchorValue() { + assert(KindOrArgNo != IRP_INVALID && + "Invalid position does not have an anchor value!"); + return *AnchorVal; + } + const Value &getAnchorValue() const { + return const_cast(this)->getAnchorValue(); + } + ///} + + /// Return the associated function, if any. 
+ /// + ///{ + Function *getAssociatedFunction() { + if (auto *CB = dyn_cast(AnchorVal)) + return CB->getCalledFunction(); + assert(KindOrArgNo != IRP_INVALID && + "Invalid position does not have an anchor scope!"); + Value &V = getAnchorValue(); + if (isa(V)) + return &cast(V); + if (isa(V)) + return cast(V).getParent(); + if (isa(V)) + return cast(V).getFunction(); + return nullptr; + } + const Function *getAssociatedFunction() const { + return const_cast(this)->getAssociatedFunction(); + } + ///} + + /// Return the associated argument, if any. + /// + ///{ + Argument *getAssociatedArgument() { + if (auto *Arg = dyn_cast(&getAnchorValue())) + return Arg; + int ArgNo = getArgNo(); + if (ArgNo < 0) + return nullptr; + Function *AssociatedFn = getAssociatedFunction(); + if (!AssociatedFn || AssociatedFn->arg_size() <= unsigned(ArgNo)) + return nullptr; + return AssociatedFn->arg_begin() + ArgNo; + } + const Argument *getAssociatedArgument() const { + return const_cast(this)->getAssociatedArgument(); + } + ///} + + /// Return true if the position refers to a function interface, that is the + /// function scope, the function return, or an argumnt. + bool isFnInterfaceKind() const { + switch (getPositionKind()) { + case IRPosition::IRP_FUNCTION: + case IRPosition::IRP_RETURNED: + case IRPosition::IRP_ARGUMENT: + return true; + default: + return false; + } + } + + /// Return the Function surrounding the anchor value. + /// + ///{ + Function *getAnchorScope() { + Value &V = getAnchorValue(); + if (isa(V)) + return &cast(V); + if (isa(V)) + return cast(V).getParent(); + if (isa(V)) + return cast(V).getFunction(); + return nullptr; + } + const Function *getAnchorScope() const { + return const_cast(this)->getAnchorScope(); + } + ///} + + /// Return the context instruction, if any. + /// + ///{ + Instruction *getCtxI() { + Value &V = getAnchorValue(); + if (auto *I = dyn_cast(&V)) + return I; + if (auto *Arg = dyn_cast(&V)) + if (!Arg->getParent()->isDeclaration()) + return &Arg->getParent()->getEntryBlock().front(); + if (auto *F = dyn_cast(&V)) + if (!F->isDeclaration()) + return &(F->getEntryBlock().front()); + return nullptr; + } + const Instruction *getCtxI() const { + return const_cast(this)->getCtxI(); + } + ///} + + /// Return the value this abstract attribute is associated with. + /// + ///{ + Value &getAssociatedValue() { + assert(KindOrArgNo != IRP_INVALID && + "Invalid position does not have an associated value!"); + if (getArgNo() < 0 || isa(AnchorVal)) + return *AnchorVal; + assert(isa(AnchorVal) && "Expected a call base!"); + return *cast(AnchorVal)->getArgOperand(getArgNo()); + } + const Value &getAssociatedValue() const { + return const_cast(this)->getAssociatedValue(); + } + ///} + + /// Return the argument number of the associated value if it is an argument or + /// call site argument, otherwise a negative value. + int getArgNo() const { return KindOrArgNo; } + + /// Return the index in the attribute list for this position. 
+ unsigned getAttrIdx() const { + switch (getPositionKind()) { + case IRPosition::IRP_INVALID: + case IRPosition::IRP_FLOAT: + break; + case IRPosition::IRP_FUNCTION: + case IRPosition::IRP_CALL_SITE: + return AttributeList::FunctionIndex; + case IRPosition::IRP_RETURNED: + case IRPosition::IRP_CALL_SITE_RETURNED: + return AttributeList::ReturnIndex; + case IRPosition::IRP_ARGUMENT: + case IRPosition::IRP_CALL_SITE_ARGUMENT: + return KindOrArgNo + AttributeList::FirstArgIndex; + } + llvm_unreachable( + "There is no attribute index for a floating or invalid position!"); + } + + /// Return the associated position kind. + Kind getPositionKind() const { + if (getArgNo() >= 0) { + assert(((isa(getAnchorValue()) && + isa(getAssociatedValue())) || + isa(getAnchorValue())) && + "Expected argument or call base due to argument number!"); + if (isa(getAnchorValue())) + return IRP_CALL_SITE_ARGUMENT; + return IRP_ARGUMENT; + } + + assert(KindOrArgNo < 0 && + "Expected (call site) arguments to never reach this point!"); + return Kind(KindOrArgNo); + } + + /// TODO: Figure out if the attribute related helper functions should live + /// here or somewhere else. + + /// Return true if any kind in \p AKs existing in the IR at a position that + /// will affect this one. See also getAttrs(...). + /// \param IgnoreSubsumingPositions Flag to determine if subsuming positions, + /// e.g., the function position if this is an + /// argument position, should be ignored. + bool hasAttr(ArrayRef AKs, + bool IgnoreSubsumingPositions = false) const; + + /// Return the attributes of any kind in \p AKs existing in the IR at a + /// position that will affect this one. While each position can only have a + /// single attribute of any kind in \p AKs, there are "subsuming" positions + /// that could have an attribute as well. This method returns all attributes + /// found in \p Attrs. + void getAttrs(ArrayRef AKs, + SmallVectorImpl &Attrs) const; + + /// Return the attribute of kind \p AK existing in the IR at this position. + Attribute getAttr(Attribute::AttrKind AK) const { + if (getPositionKind() == IRP_INVALID || getPositionKind() == IRP_FLOAT) + return Attribute(); + + AttributeList AttrList; + if (ImmutableCallSite ICS = ImmutableCallSite(&getAnchorValue())) + AttrList = ICS.getAttributes(); + else + AttrList = getAssociatedFunction()->getAttributes(); + + if (AttrList.hasAttribute(getAttrIdx(), AK)) + return AttrList.getAttribute(getAttrIdx(), AK); + return Attribute(); + } + + /// Remove the attribute of kind \p AKs existing in the IR at this position. + void removeAttrs(ArrayRef AKs) { + if (getPositionKind() == IRP_INVALID || getPositionKind() == IRP_FLOAT) + return; + + AttributeList AttrList; + CallSite CS = CallSite(&getAnchorValue()); + if (CS) + AttrList = CS.getAttributes(); + else + AttrList = getAssociatedFunction()->getAttributes(); + + LLVMContext &Ctx = getAnchorValue().getContext(); + for (Attribute::AttrKind AK : AKs) + AttrList = AttrList.removeAttribute(Ctx, getAttrIdx(), AK); + + if (CS) + CS.setAttributes(AttrList); + else + getAssociatedFunction()->setAttributes(AttrList); + } + + bool isAnyCallSitePosition() const { + switch (getPositionKind()) { + case IRPosition::IRP_CALL_SITE: + case IRPosition::IRP_CALL_SITE_RETURNED: + case IRPosition::IRP_CALL_SITE_ARGUMENT: + return true; + default: + return false; + } + } + + /// Special DenseMap key values. 
+ /// + ///{ + static const IRPosition EmptyKey; + static const IRPosition TombstoneKey; + ///} + +private: + /// Private constructor for special values only! + explicit IRPosition(int KindOrArgNo) + : AnchorVal(0), KindOrArgNo(KindOrArgNo) {} + + /// IRPosition anchored at \p AnchorVal with kind/argument numbet \p PK. + explicit IRPosition(Value &AnchorVal, Kind PK) + : AnchorVal(&AnchorVal), KindOrArgNo(PK) { + verify(); + } + + /// Verify internal invariants. + void verify(); + + /// The value this position is anchored at. + Value *AnchorVal; + + /// The argument number, if non-negative, or the position "kind". + int KindOrArgNo; +}; + +/// Helper that allows IRPosition as a key in a DenseMap. +template <> struct DenseMapInfo { + static inline IRPosition getEmptyKey() { return IRPosition::EmptyKey; } + static inline IRPosition getTombstoneKey() { + return IRPosition::TombstoneKey; + } + static unsigned getHashValue(const IRPosition &IRP) { + return (DenseMapInfo::getHashValue(&IRP.getAnchorValue()) << 4) ^ + (unsigned(IRP.getArgNo())); + } + static bool isEqual(const IRPosition &LHS, const IRPosition &RHS) { + return LHS == RHS; + } +}; + +/// A visitor class for IR positions. +/// +/// Given a position P, the SubsumingPositionIterator allows to visit "subsuming +/// positions" wrt. attributes/information. Thus, if a piece of information +/// holds for a subsuming position, it also holds for the position P. +/// +/// The subsuming positions always include the initial position and then, +/// depending on the position kind, additionally the following ones: +/// - for IRP_RETURNED: +/// - the function (IRP_FUNCTION) +/// - for IRP_ARGUMENT: +/// - the function (IRP_FUNCTION) +/// - for IRP_CALL_SITE: +/// - the callee (IRP_FUNCTION), if known +/// - for IRP_CALL_SITE_RETURNED: +/// - the callee (IRP_RETURNED), if known +/// - the call site (IRP_FUNCTION) +/// - the callee (IRP_FUNCTION), if known +/// - for IRP_CALL_SITE_ARGUMENT: +/// - the argument of the callee (IRP_ARGUMENT), if known +/// - the callee (IRP_FUNCTION), if known +/// - the position the call site argument is associated with if it is not +/// anchored to the call site, e.g., if it is an arugment then the argument +/// (IRP_ARGUMENT) +class SubsumingPositionIterator { + SmallVector IRPositions; + using iterator = decltype(IRPositions)::iterator; + +public: + SubsumingPositionIterator(const IRPosition &IRP); + iterator begin() { return IRPositions.begin(); } + iterator end() { return IRPositions.end(); } +}; + +/// Wrapper for FunctoinAnalysisManager. +struct AnalysisGetter { + template + typename Analysis::Result *getAnalysis(const Function &F) { + if (!MAM || !F.getParent()) + return nullptr; + auto &FAM = MAM->getResult( + const_cast(*F.getParent())) + .getManager(); + return &FAM.getResult(const_cast(F)); + } + + template + typename Analysis::Result *getAnalysis(const Module &M) { + if (!MAM) + return nullptr; + return &MAM->getResult(const_cast(M)); + } + AnalysisGetter(ModuleAnalysisManager &MAM) : MAM(&MAM) {} + AnalysisGetter() {} + +private: + ModuleAnalysisManager *MAM = nullptr; +}; + +/// Data structure to hold cached (LLVM-IR) information. +/// +/// All attributes are given an InformationCache object at creation time to +/// avoid inspection of the IR by all of them individually. This default +/// InformationCache will hold information required by 'default' attributes, +/// thus the ones deduced when Attributor::identifyDefaultAbstractAttributes(..) +/// is called. 
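Editor's illustration, not part of the imported header: a minimal sketch of constructing IRPosition values and using them as DenseMap keys with the declarations above. The helper positionExamples is hypothetical and assumes F has at least one parameter and CB at least one call argument.

#include "llvm/ADT/DenseMap.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/Transforms/IPO/Attributor.h"
using namespace llvm;

static void positionExamples(Function &F, CallBase &CB) {
  // Positions that correspond to attribute-list slots.
  const IRPosition FnPos  = IRPosition::function(F);              // function scope
  const IRPosition RetPos = IRPosition::returned(F);              // return value
  const IRPosition ArgPos = IRPosition::argument(*F.arg_begin()); // argument #0
  const IRPosition CSArg  = IRPosition::callsite_argument(CB, /* ArgNo */ 0);

  // A generic value position; arguments and call bases are canonicalized to
  // their dedicated kinds by IRPosition::value.
  const IRPosition ValPos = IRPosition::value(CB);

  // The DenseMapInfo specialization above makes IRPosition usable as a key,
  // and SubsumingPositionIterator visits every position whose information
  // also applies to CSArg.
  DenseMap<IRPosition, unsigned> Visited;
  for (const IRPosition &Subsuming : SubsumingPositionIterator(CSArg))
    ++Visited[Subsuming];

  (void)FnPos; (void)RetPos; (void)ArgPos; (void)ValPos;
}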
+/// +/// If custom abstract attributes, registered manually through +/// Attributor::registerAA(...), need more information, especially if it is not +/// reusable, it is advised to inherit from the InformationCache and cast the +/// instance down in the abstract attributes. +struct InformationCache { + InformationCache(const Module &M, AnalysisGetter &AG) + : DL(M.getDataLayout()), Explorer(/* ExploreInterBlock */ true), AG(AG) { + + CallGraph *CG = AG.getAnalysis(M); + if (!CG) + return; + + DenseMap SccSize; + for (scc_iterator I = scc_begin(CG); !I.isAtEnd(); ++I) { + for (CallGraphNode *Node : *I) + SccSize[Node->getFunction()] = I->size(); + } + SccSizeOpt = std::move(SccSize); + } + + /// A map type from opcodes to instructions with this opcode. + using OpcodeInstMapTy = DenseMap>; + + /// Return the map that relates "interesting" opcodes with all instructions + /// with that opcode in \p F. + OpcodeInstMapTy &getOpcodeInstMapForFunction(const Function &F) { + return FuncInstOpcodeMap[&F]; + } + + /// A vector type to hold instructions. + using InstructionVectorTy = std::vector; + + /// Return the instructions in \p F that may read or write memory. + InstructionVectorTy &getReadOrWriteInstsForFunction(const Function &F) { + return FuncRWInstsMap[&F]; + } + + /// Return MustBeExecutedContextExplorer + MustBeExecutedContextExplorer &getMustBeExecutedContextExplorer() { + return Explorer; + } + + /// Return TargetLibraryInfo for function \p F. + TargetLibraryInfo *getTargetLibraryInfoForFunction(const Function &F) { + return AG.getAnalysis(F); + } + + /// Return AliasAnalysis Result for function \p F. + AAResults *getAAResultsForFunction(const Function &F) { + return AG.getAnalysis(F); + } + + /// Return SCC size on call graph for function \p F. + unsigned getSccSize(const Function &F) { + if (!SccSizeOpt.hasValue()) + return 0; + return (SccSizeOpt.getValue())[&F]; + } + + /// Return datalayout used in the module. + const DataLayout &getDL() { return DL; } + +private: + /// A map type from functions to opcode to instruction maps. + using FuncInstOpcodeMapTy = DenseMap; + + /// A map type from functions to their read or write instructions. + using FuncRWInstsMapTy = DenseMap; + + /// A nested map that remembers all instructions in a function with a certain + /// instruction opcode (Instruction::getOpcode()). + FuncInstOpcodeMapTy FuncInstOpcodeMap; + + /// A map from functions to their instructions that may read or write memory. + FuncRWInstsMapTy FuncRWInstsMap; + + /// The datalayout used in the module. + const DataLayout &DL; + + /// MustBeExecutedContextExplorer + MustBeExecutedContextExplorer Explorer; + + /// Getters for analysis. + AnalysisGetter &AG; + + /// Cache result for scc size in the call graph + Optional> SccSizeOpt; + + /// Give the Attributor access to the members so + /// Attributor::identifyDefaultAbstractAttributes(...) can initialize them. + friend struct Attributor; +}; + /// The fixpoint analysis framework that orchestrates the attribute deduction. /// /// The Attributor provides a general abstract analysis framework (guided @@ -148,6 +711,18 @@ ChangeStatus operator&(ChangeStatus l, ChangeStatus r); /// NOTE: The mechanics of adding a new "concrete" abstract attribute are /// described in the file comment. struct Attributor { + /// Constructor + /// + /// \param InfoCache Cache to hold various information accessible for + /// the abstract attributes. 
+ /// \param DepRecomputeInterval Number of iterations until the dependences + /// between abstract attributes are recomputed. + /// \param Whitelist If not null, a set limiting the attribute opportunities. + Attributor(InformationCache &InfoCache, unsigned DepRecomputeInterval, + DenseSet *Whitelist = nullptr) + : InfoCache(InfoCache), DepRecomputeInterval(DepRecomputeInterval), + Whitelist(Whitelist) {} + ~Attributor() { DeleteContainerPointers(AllAbstractAttributes); } /// Run the analyses until a fixpoint is reached or enforced (timeout). @@ -156,12 +731,13 @@ struct Attributor { /// as the Attributor is not destroyed (it owns the attributes now). /// /// \Returns CHANGED if the IR was changed, otherwise UNCHANGED. - ChangeStatus run(); + ChangeStatus run(Module &M); - /// Lookup an abstract attribute of type \p AAType anchored at value \p V and - /// argument number \p ArgNo. If no attribute is found and \p V is a call base - /// instruction, the called function is tried as a value next. Thus, the - /// returned abstract attribute might be anchored at the callee of \p V. + /// Lookup an abstract attribute of type \p AAType at position \p IRP. While + /// no abstract attribute is found equivalent positions are checked, see + /// SubsumingPositionIterator. Thus, the returned abstract attribute + /// might be anchored at a different position, e.g., the callee if \p IRP is a + /// call base. /// /// This method is the only (supported) way an abstract attribute can retrieve /// information from another abstract attribute. As an example, take an @@ -170,51 +746,29 @@ struct Attributor { /// most optimistic information for other abstract attributes in-flight, e.g. /// the one reasoning about the "captured" state for the argument or the one /// reasoning on the memory access behavior of the function as a whole. + /// + /// If the flag \p TrackDependence is set to false the dependence from + /// \p QueryingAA to the return abstract attribute is not automatically + /// recorded. This should only be used if the caller will record the + /// dependence explicitly if necessary, thus if it the returned abstract + /// attribute is used for reasoning. To record the dependences explicitly use + /// the `Attributor::recordDependence` method. template - const AAType *getAAFor(AbstractAttribute &QueryingAA, const Value &V, - int ArgNo = -1) { - static_assert(std::is_base_of::value, - "Cannot query an attribute with a type not derived from " - "'AbstractAttribute'!"); - assert(AAType::ID != Attribute::None && - "Cannot lookup generic abstract attributes!"); - - // Determine the argument number automatically for llvm::Arguments if none - // is set. Do not override a given one as it could be a use of the argument - // in a call site. - if (ArgNo == -1) - if (auto *Arg = dyn_cast(&V)) - ArgNo = Arg->getArgNo(); - - // If a function was given together with an argument number, perform the - // lookup for the actual argument instead. Don't do it for variadic - // arguments. - if (ArgNo >= 0 && isa(&V) && - cast(&V)->arg_size() > (size_t)ArgNo) - return getAAFor( - QueryingAA, *(cast(&V)->arg_begin() + ArgNo), ArgNo); - - // Lookup the abstract attribute of type AAType. If found, return it after - // registering a dependence of QueryingAA on the one returned attribute. - const auto &KindToAbstractAttributeMap = AAMap.lookup({&V, ArgNo}); - if (AAType *AA = static_cast( - KindToAbstractAttributeMap.lookup(AAType::ID))) { - // Do not return an attribute with an invalid state. 
This minimizes checks - // at the calls sites and allows the fallback below to kick in. - if (AA->getState().isValidState()) { - QueryMap[AA].insert(&QueryingAA); - return AA; - } - } - - // If no abstract attribute was found and we look for a call site argument, - // defer to the actual argument instead. - ImmutableCallSite ICS(&V); - if (ICS && ICS.getCalledValue()) - return getAAFor(QueryingAA, *ICS.getCalledValue(), ArgNo); + const AAType &getAAFor(const AbstractAttribute &QueryingAA, + const IRPosition &IRP, bool TrackDependence = true) { + return getOrCreateAAFor(IRP, &QueryingAA, TrackDependence); + } - // No matching attribute found - return nullptr; + /// Explicitly record a dependence from \p FromAA to \p ToAA, that is if + /// \p FromAA changes \p ToAA should be updated as well. + /// + /// This method should be used in conjunction with the `getAAFor` method and + /// with the TrackDependence flag passed to the method set to false. This can + /// be beneficial to avoid false dependences but it requires the users of + /// `getAAFor` to explicitly record true dependences through this method. + void recordDependence(const AbstractAttribute &FromAA, + const AbstractAttribute &ToAA) { + QueryMap[&FromAA].insert(const_cast(&ToAA)); } /// Introduce a new abstract attribute into the fixpoint analysis. @@ -222,126 +776,242 @@ struct Attributor { /// Note that ownership of the attribute is given to the Attributor. It will /// invoke delete for the Attributor on destruction of the Attributor. /// - /// Attributes are identified by - /// (1) their anchored value (see AA.getAnchoredValue()), - /// (2) their argument number (\p ArgNo, or Argument::getArgNo()), and - /// (3) their default attribute kind (see AAType::ID). - template AAType ®isterAA(AAType &AA, int ArgNo = -1) { + /// Attributes are identified by their IR position (AAType::getIRPosition()) + /// and the address of their static member (see AAType::ID). + template AAType ®isterAA(AAType &AA) { static_assert(std::is_base_of::value, "Cannot register an attribute with a type not derived from " "'AbstractAttribute'!"); - - // Determine the anchor value and the argument number which are used to - // lookup the attribute together with AAType::ID. If passed an argument, - // use its argument number but do not override a given one as it could be a - // use of the argument at a call site. - Value &AnchoredVal = AA.getAnchoredValue(); - if (ArgNo == -1) - if (auto *Arg = dyn_cast(&AnchoredVal)) - ArgNo = Arg->getArgNo(); - // Put the attribute in the lookup map structure and the container we use to // keep track of all attributes. - AAMap[{&AnchoredVal, ArgNo}][AAType::ID] = &AA; + IRPosition &IRP = AA.getIRPosition(); + auto &KindToAbstractAttributeMap = AAMap[IRP]; + assert(!KindToAbstractAttributeMap.count(&AAType::ID) && + "Attribute already in map!"); + KindToAbstractAttributeMap[&AAType::ID] = &AA; AllAbstractAttributes.push_back(&AA); return AA; } + /// Return the internal information cache. + InformationCache &getInfoCache() { return InfoCache; } + /// Determine opportunities to derive 'default' attributes in \p F and create /// abstract attribute objects for them. /// /// \param F The function that is checked for attribute opportunities. - /// \param InfoCache A cache for information queryable by the new attributes. - /// \param Whitelist If not null, a set limiting the attribute opportunities. /// /// Note that abstract attribute instances are generally created even if the /// IR already contains the information they would deduce. 
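Editor's illustration, not part of the imported header: a sketch of the two dependence-tracking modes of getAAFor, as seen from the update method of a hypothetical abstract attribute AAExample that is assumed to be built on the IRAttribute/StateWrapper helpers declared further below.

ChangeStatus AAExample::updateImpl(Attributor &A) {
  const IRPosition FnPos = IRPosition::function(*getAnchorScope());

  // Default mode: the dependence of this AA on NoUnwindAA is recorded, so a
  // later change of NoUnwindAA re-triggers this update.
  const auto &NoUnwindAA = A.getAAFor<AANoUnwind>(*this, FnPos);
  if (!NoUnwindAA.isAssumedNoUnwind())
    return indicatePessimisticFixpoint();

  // Manual mode: look up without recording, then record the dependence only
  // if the optimistic information was actually used for reasoning.
  const auto &NoSyncAA =
      A.getAAFor<AANoSync>(*this, FnPos, /* TrackDependence */ false);
  if (NoSyncAA.isAssumedNoSync())
    A.recordDependence(NoSyncAA, *this);

  return ChangeStatus::UNCHANGED;
}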
The most important /// reason for this is the single interface, the one of the abstract attribute /// instance, which can be queried without the need to look at the IR in /// various places. - void identifyDefaultAbstractAttributes( - Function &F, InformationCache &InfoCache, - DenseSet *Whitelist = nullptr); + void identifyDefaultAbstractAttributes(Function &F); + + /// Initialize the information cache for queries regarding function \p F. + /// + /// This method needs to be called for all function that might be looked at + /// through the information cache interface *prior* to looking at them. + void initializeInformationCache(Function &F); + + /// Mark the internal function \p F as live. + /// + /// This will trigger the identification and initialization of attributes for + /// \p F. + void markLiveInternalFunction(const Function &F) { + assert(F.hasLocalLinkage() && + "Only local linkage is assumed dead initially."); + + identifyDefaultAbstractAttributes(const_cast(F)); + } + + /// Record that \p I is deleted after information was manifested. + void deleteAfterManifest(Instruction &I) { ToBeDeletedInsts.insert(&I); } + + /// Record that \p BB is deleted after information was manifested. + void deleteAfterManifest(BasicBlock &BB) { ToBeDeletedBlocks.insert(&BB); } + + /// Record that \p F is deleted after information was manifested. + void deleteAfterManifest(Function &F) { ToBeDeletedFunctions.insert(&F); } + + /// Return true if \p AA (or its context instruction) is assumed dead. + /// + /// If \p LivenessAA is not provided it is queried. + bool isAssumedDead(const AbstractAttribute &AA, const AAIsDead *LivenessAA); /// Check \p Pred on all function call sites. /// /// This method will evaluate \p Pred on call sites and return /// true if \p Pred holds in every call sites. However, this is only possible /// all call sites are known, hence the function has internal linkage. - bool checkForAllCallSites(Function &F, std::function &Pred, + bool checkForAllCallSites(const function_ref &Pred, + const AbstractAttribute &QueryingAA, bool RequireAllCallSites); + /// Check \p Pred on all values potentially returned by \p F. + /// + /// This method will evaluate \p Pred on all values potentially returned by + /// the function associated with \p QueryingAA. The returned values are + /// matched with their respective return instructions. Returns true if \p Pred + /// holds on all of them. + bool checkForAllReturnedValuesAndReturnInsts( + const function_ref &)> + &Pred, + const AbstractAttribute &QueryingAA); + + /// Check \p Pred on all values potentially returned by the function + /// associated with \p QueryingAA. + /// + /// This is the context insensitive version of the method above. + bool checkForAllReturnedValues(const function_ref &Pred, + const AbstractAttribute &QueryingAA); + + /// Check \p Pred on all instructions with an opcode present in \p Opcodes. + /// + /// This method will evaluate \p Pred on all instructions with an opcode + /// present in \p Opcode and return true if \p Pred holds on all of them. + bool checkForAllInstructions(const function_ref &Pred, + const AbstractAttribute &QueryingAA, + const ArrayRef &Opcodes); + + /// Check \p Pred on all call-like instructions (=CallBased derived). + /// + /// See checkForAllCallLikeInstructions(...) for more information. 
+ bool + checkForAllCallLikeInstructions(const function_ref &Pred, + const AbstractAttribute &QueryingAA) { + return checkForAllInstructions(Pred, QueryingAA, + {(unsigned)Instruction::Invoke, + (unsigned)Instruction::CallBr, + (unsigned)Instruction::Call}); + } + + /// Check \p Pred on all Read/Write instructions. + /// + /// This method will evaluate \p Pred on all instructions that read or write + /// to memory present in the information cache and return true if \p Pred + /// holds on all of them. + bool checkForAllReadWriteInstructions( + const llvm::function_ref &Pred, + AbstractAttribute &QueryingAA); + + /// Return the data layout associated with the anchor scope. + const DataLayout &getDataLayout() const { return InfoCache.DL; } + private: + /// Check \p Pred on all call sites of \p Fn. + /// + /// This method will evaluate \p Pred on call sites and return + /// true if \p Pred holds in every call sites. However, this is only possible + /// all call sites are known, hence the function has internal linkage. + bool checkForAllCallSites(const function_ref &Pred, + const Function &Fn, bool RequireAllCallSites, + const AbstractAttribute *QueryingAA); + + /// The private version of getAAFor that allows to omit a querying abstract + /// attribute. See also the public getAAFor method. + template + const AAType &getOrCreateAAFor(const IRPosition &IRP, + const AbstractAttribute *QueryingAA = nullptr, + bool TrackDependence = false) { + if (const AAType *AAPtr = + lookupAAFor(IRP, QueryingAA, TrackDependence)) + return *AAPtr; + + // No matching attribute found, create one. + // Use the static create method. + auto &AA = AAType::createForPosition(IRP, *this); + registerAA(AA); + + // For now we ignore naked and optnone functions. + bool Invalidate = Whitelist && !Whitelist->count(&AAType::ID); + if (const Function *Fn = IRP.getAnchorScope()) + Invalidate |= Fn->hasFnAttribute(Attribute::Naked) || + Fn->hasFnAttribute(Attribute::OptimizeNone); + + // Bootstrap the new attribute with an initial update to propagate + // information, e.g., function -> call site. If it is not on a given + // whitelist we will not perform updates at all. + if (Invalidate) { + AA.getState().indicatePessimisticFixpoint(); + return AA; + } + + AA.initialize(*this); + AA.update(*this); + + if (TrackDependence && AA.getState().isValidState()) + QueryMap[&AA].insert(const_cast(QueryingAA)); + return AA; + } + + /// Return the attribute of \p AAType for \p IRP if existing. + template + const AAType *lookupAAFor(const IRPosition &IRP, + const AbstractAttribute *QueryingAA = nullptr, + bool TrackDependence = false) { + static_assert(std::is_base_of::value, + "Cannot query an attribute with a type not derived from " + "'AbstractAttribute'!"); + assert((QueryingAA || !TrackDependence) && + "Cannot track dependences without a QueryingAA!"); + + // Lookup the abstract attribute of type AAType. If found, return it after + // registering a dependence of QueryingAA on the one returned attribute. + const auto &KindToAbstractAttributeMap = AAMap.lookup(IRP); + if (AAType *AA = static_cast( + KindToAbstractAttributeMap.lookup(&AAType::ID))) { + // Do not register a dependence on an attribute with an invalid state. + if (TrackDependence && AA->getState().isValidState()) + QueryMap[AA].insert(const_cast(QueryingAA)); + return AA; + } + return nullptr; + } + /// The set of all abstract attributes. 
///{ using AAVector = SmallVector; AAVector AllAbstractAttributes; ///} - /// A nested map to lookup abstract attributes based on the anchored value and - /// an argument positions (or -1) on the outer level, and attribute kinds - /// (Attribute::AttrKind) on the inner level. + /// A nested map to lookup abstract attributes based on the argument position + /// on the outer level, and the addresses of the static member (AAType::ID) on + /// the inner level. ///{ - using KindToAbstractAttributeMap = DenseMap; - DenseMap, KindToAbstractAttributeMap> AAMap; + using KindToAbstractAttributeMap = + DenseMap; + DenseMap AAMap; ///} /// A map from abstract attributes to the ones that queried them through calls /// to the getAAFor<...>(...) method. ///{ using QueryMapTy = - DenseMap>; + MapVector>; QueryMapTy QueryMap; - ///} -}; - -/// Data structure to hold cached (LLVM-IR) information. -/// -/// All attributes are given an InformationCache object at creation time to -/// avoid inspection of the IR by all of them individually. This default -/// InformationCache will hold information required by 'default' attributes, -/// thus the ones deduced when Attributor::identifyDefaultAbstractAttributes(..) -/// is called. -/// -/// If custom abstract attributes, registered manually through -/// Attributor::registerAA(...), need more information, especially if it is not -/// reusable, it is advised to inherit from the InformationCache and cast the -/// instance down in the abstract attributes. -struct InformationCache { - /// A map type from opcodes to instructions with this opcode. - using OpcodeInstMapTy = DenseMap>; - - /// Return the map that relates "interesting" opcodes with all instructions - /// with that opcode in \p F. - OpcodeInstMapTy &getOpcodeInstMapForFunction(Function &F) { - return FuncInstOpcodeMap[&F]; - } - - /// A vector type to hold instructions. - using InstructionVectorTy = std::vector; - - /// Return the instructions in \p F that may read or write memory. - InstructionVectorTy &getReadOrWriteInstsForFunction(Function &F) { - return FuncRWInstsMap[&F]; - } + ///} -private: - /// A map type from functions to opcode to instruction maps. - using FuncInstOpcodeMapTy = DenseMap; + /// The information cache that holds pre-processed (LLVM-IR) information. + InformationCache &InfoCache; - /// A map type from functions to their read or write instructions. - using FuncRWInstsMapTy = DenseMap; + /// Number of iterations until the dependences between abstract attributes are + /// recomputed. + const unsigned DepRecomputeInterval; - /// A nested map that remembers all instructions in a function with a certain - /// instruction opcode (Instruction::getOpcode()). - FuncInstOpcodeMapTy FuncInstOpcodeMap; + /// If not null, a set limiting the attribute opportunities. + const DenseSet *Whitelist; - /// A map from functions to their instructions that may read or write memory. - FuncRWInstsMapTy FuncRWInstsMap; + /// A set to remember the functions we already assume to be live and visited. + DenseSet VisitedFunctions; - /// Give the Attributor access to the members so - /// Attributor::identifyDefaultAbstractAttributes(...) can initialize them. - friend struct Attributor; + /// Functions, blocks, and instructions we delete after manifest is done. + /// + ///{ + SmallPtrSet ToBeDeletedFunctions; + SmallPtrSet ToBeDeletedBlocks; + SmallPtrSet ToBeDeletedInsts; + ///} }; /// An interface to query the internal state of an abstract attribute. 
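Editor's illustration, not part of the imported header: a simplified driver showing how the pieces above fit together, roughly what the pass wrappers around the Attributor do. The DepRecomputeInterval value is arbitrary, and the in-tree setup additionally handles internal linkage, deletion bookkeeping, and iteration limits that are omitted here.

#include "llvm/IR/Module.h"
#include "llvm/Transforms/IPO/Attributor.h"
using namespace llvm;

static bool runAttributorOnModule(Module &M, AnalysisGetter &AG) {
  InformationCache InfoCache(M, AG);
  Attributor A(InfoCache, /* DepRecomputeInterval */ 4);

  // The information cache must be populated for every function that may be
  // inspected, before any abstract attribute looks at it.
  for (Function &F : M)
    if (!F.isDeclaration())
      A.initializeInformationCache(F);

  // Seed the default abstract attributes for every definition.
  for (Function &F : M)
    if (!F.isDeclaration())
      A.identifyDefaultAbstractAttributes(F);

  // Iterate to a fixpoint and manifest what was deduced.
  return A.run(M) == ChangeStatus::CHANGED;
}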
@@ -375,13 +1045,17 @@ struct AbstractState { /// /// This will usually make the optimistically assumed state the known to be /// true state. - virtual void indicateOptimisticFixpoint() = 0; + /// + /// \returns ChangeStatus::UNCHANGED as the assumed value should not change. + virtual ChangeStatus indicateOptimisticFixpoint() = 0; /// Indicate that the abstract state should converge to the pessimistic state. /// /// This will usually revert the optimistically assumed state to the known to /// be true state. - virtual void indicatePessimisticFixpoint() = 0; + /// + /// \returns ChangeStatus::CHANGED as the assumed value may change. + virtual ChangeStatus indicatePessimisticFixpoint() = 0; }; /// Simple state with integers encoding. @@ -412,10 +1086,16 @@ struct IntegerState : public AbstractState { bool isAtFixpoint() const override { return Assumed == Known; } /// See AbstractState::indicateOptimisticFixpoint(...) - void indicateOptimisticFixpoint() override { Known = Assumed; } + ChangeStatus indicateOptimisticFixpoint() override { + Known = Assumed; + return ChangeStatus::UNCHANGED; + } /// See AbstractState::indicatePessimisticFixpoint(...) - void indicatePessimisticFixpoint() override { Assumed = Known; } + ChangeStatus indicatePessimisticFixpoint() override { + Assumed = Known; + return ChangeStatus::CHANGED; + } /// Return the known state encoding base_t getKnown() const { return Known; } @@ -448,6 +1128,12 @@ struct IntegerState : public AbstractState { return *this; } + /// Remove the bits in \p BitsEncoding from the "known bits". + IntegerState &removeKnownBits(base_t BitsEncoding) { + Known = (Known & ~BitsEncoding); + return *this; + } + /// Keep only "assumed bits" also set in \p BitsEncoding but all known ones. IntegerState &intersectAssumedBits(base_t BitsEncoding) { // Make sure we never loose any "known bits". @@ -455,6 +1141,62 @@ struct IntegerState : public AbstractState { return *this; } + /// Take minimum of assumed and \p Value. + IntegerState &takeAssumedMinimum(base_t Value) { + // Make sure we never loose "known value". + Assumed = std::max(std::min(Assumed, Value), Known); + return *this; + } + + /// Take maximum of known and \p Value. + IntegerState &takeKnownMaximum(base_t Value) { + // Make sure we never loose "known value". + Assumed = std::max(Value, Assumed); + Known = std::max(Value, Known); + return *this; + } + + /// Equality for IntegerState. + bool operator==(const IntegerState &R) const { + return this->getAssumed() == R.getAssumed() && + this->getKnown() == R.getKnown(); + } + + /// Inequality for IntegerState. + bool operator!=(const IntegerState &R) const { return !(*this == R); } + + /// "Clamp" this state with \p R. The result is the minimum of the assumed + /// information but not less than what was known before. + /// + /// TODO: Consider replacing the operator with a call or using it only when + /// we can also take the maximum of the known information, thus when + /// \p R is not dependent on additional assumed state. + IntegerState operator^=(const IntegerState &R) { + takeAssumedMinimum(R.Assumed); + return *this; + } + + /// "Clamp" this state with \p R. The result is the maximum of the known + /// information but not more than what was assumed before. + IntegerState operator+=(const IntegerState &R) { + takeKnownMaximum(R.Known); + return *this; + } + + /// Make this the minimum, known and assumed, of this state and \p R. 
+ IntegerState operator&=(const IntegerState &R) { + Known = std::min(Known, R.Known); + Assumed = std::min(Assumed, R.Assumed); + return *this; + } + + /// Make this the maximum, known and assumed, of this state and \p R. + IntegerState operator|=(const IntegerState &R) { + Known = std::max(Known, R.Known); + Assumed = std::max(Assumed, R.Assumed); + return *this; + } + private: /// The known state encoding in an integer of type base_t. base_t Known = getWorstState(); @@ -468,6 +1210,77 @@ struct BooleanState : public IntegerState { BooleanState() : IntegerState(1){}; }; +/// Helper struct necessary as the modular build fails if the virtual method +/// IRAttribute::manifest is defined in the Attributor.cpp. +struct IRAttributeManifest { + static ChangeStatus manifestAttrs(Attributor &A, IRPosition &IRP, + const ArrayRef &DeducedAttrs); +}; + +/// Helper to tie a abstract state implementation to an abstract attribute. +template +struct StateWrapper : public StateTy, public Base { + /// Provide static access to the type of the state. + using StateType = StateTy; + + /// See AbstractAttribute::getState(...). + StateType &getState() override { return *this; } + + /// See AbstractAttribute::getState(...). + const AbstractState &getState() const override { return *this; } +}; + +/// Helper class that provides common functionality to manifest IR attributes. +template +struct IRAttribute : public IRPosition, public Base { + IRAttribute(const IRPosition &IRP) : IRPosition(IRP) {} + ~IRAttribute() {} + + /// See AbstractAttribute::initialize(...). + virtual void initialize(Attributor &A) override { + if (hasAttr(getAttrKind())) { + this->getState().indicateOptimisticFixpoint(); + return; + } + + const IRPosition &IRP = this->getIRPosition(); + bool IsFnInterface = IRP.isFnInterfaceKind(); + const Function *FnScope = IRP.getAnchorScope(); + // TODO: Not all attributes require an exact definition. Find a way to + // enable deduction for some but not all attributes in case the + // definition might be changed at runtime, see also + // http://lists.llvm.org/pipermail/llvm-dev/2018-February/121275.html. + // TODO: We could always determine abstract attributes and if sufficient + // information was found we could duplicate the functions that do not + // have an exact definition. + if (IsFnInterface && (!FnScope || !FnScope->hasExactDefinition())) + this->getState().indicatePessimisticFixpoint(); + } + + /// See AbstractAttribute::manifest(...). + ChangeStatus manifest(Attributor &A) override { + SmallVector DeducedAttrs; + getDeducedAttributes(getAnchorValue().getContext(), DeducedAttrs); + return IRAttributeManifest::manifestAttrs(A, getIRPosition(), DeducedAttrs); + } + + /// Return the kind that identifies the abstract attribute implementation. + Attribute::AttrKind getAttrKind() const { return AK; } + + /// Return the deduced attributes in \p Attrs. + virtual void getDeducedAttributes(LLVMContext &Ctx, + SmallVectorImpl &Attrs) const { + Attrs.emplace_back(Attribute::get(Ctx, getAttrKind())); + } + + /// Return an IR position, see struct IRPosition. + /// + ///{ + IRPosition &getIRPosition() override { return *this; } + const IRPosition &getIRPosition() const override { return *this; } + ///} +}; + /// Base struct for all "concrete attribute" deductions. 
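Editor's illustration, not part of the imported header: a sketch of what a concrete deduction class for one of the interfaces declared below (here AANoSync) might look like, built on the IRAttribute/StateWrapper helpers above. The real implementations live in lib/Transforms/IPO/Attributor.cpp and are considerably more involved; AANoSyncExample and its predicate are simplifications.

struct AANoSyncExample final : public AANoSync {
  AANoSyncExample(const IRPosition &IRP) : AANoSync(IRP) {}

  // Transfer function: stay optimistic only while no potentially
  // synchronizing instruction is seen, otherwise give up. The in-tree logic
  // additionally handles volatile accesses, convergent calls, intrinsics, etc.
  ChangeStatus updateImpl(Attributor &A) override {
    auto NoSyncPred = [](Instruction &I) { return !I.isAtomic(); };

    if (!A.checkForAllCallLikeInstructions(NoSyncPred, *this) ||
        !A.checkForAllReadWriteInstructions(NoSyncPred, *this))
      return indicatePessimisticFixpoint();
    return ChangeStatus::UNCHANGED;
  }

  // Required hook; statistics are omitted in this sketch.
  void trackStatistics() const override {}
};

// Manual registration hands ownership to the Attributor; this assumes no
// AANoSync has already been seeded at the same position:
//   A.registerAA(*new AANoSyncExample(IRPosition::function(F)));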
/// /// The abstract attribute is a minimal interface that allows the Attributor to @@ -512,29 +1325,7 @@ struct BooleanState : public IntegerState { /// NOTE: The mechanics of adding a new "concrete" abstract attribute are /// described in the file comment. struct AbstractAttribute { - - /// The positions attributes can be manifested in. - enum ManifestPosition { - MP_ARGUMENT, ///< An attribute for a function argument. - MP_CALL_SITE_ARGUMENT, ///< An attribute for a call site argument. - MP_FUNCTION, ///< An attribute for a function as a whole. - MP_RETURNED, ///< An attribute for the function return value. - }; - - /// An abstract attribute associated with \p AssociatedVal and anchored at - /// \p AnchoredVal. - /// - /// \param AssociatedVal The value this abstract attribute is associated with. - /// \param AnchoredVal The value this abstract attributes is anchored at. - /// \param InfoCache Cached information accessible to the abstract attribute. - AbstractAttribute(Value *AssociatedVal, Value &AnchoredVal, - InformationCache &InfoCache) - : AssociatedVal(AssociatedVal), AnchoredVal(AnchoredVal), - InfoCache(InfoCache) {} - - /// An abstract attribute associated with and anchored at \p V. - AbstractAttribute(Value &V, InformationCache &InfoCache) - : AbstractAttribute(&V, V, InfoCache) {} + using StateType = AbstractState; /// Virtual destructor. virtual ~AbstractAttribute() {} @@ -550,47 +1341,11 @@ struct AbstractAttribute { virtual void initialize(Attributor &A) {} /// Return the internal abstract state for inspection. - virtual const AbstractState &getState() const = 0; - - /// Return the value this abstract attribute is anchored with. - /// - /// The anchored value might not be the associated value if the latter is not - /// sufficient to determine where arguments will be manifested. This is mostly - /// the case for call site arguments as the value is not sufficient to - /// pinpoint them. Instead, we can use the call site as an anchor. - /// - ///{ - Value &getAnchoredValue() { return AnchoredVal; } - const Value &getAnchoredValue() const { return AnchoredVal; } - ///} - - /// Return the llvm::Function surrounding the anchored value. - /// - ///{ - Function &getAnchorScope(); - const Function &getAnchorScope() const; - ///} - - /// Return the value this abstract attribute is associated with. - /// - /// The abstract state usually represents this value. - /// - ///{ - virtual Value *getAssociatedValue() { return AssociatedVal; } - virtual const Value *getAssociatedValue() const { return AssociatedVal; } - ///} - - /// Return the position this abstract state is manifested in. - virtual ManifestPosition getManifestPosition() const = 0; - - /// Return the kind that identifies the abstract attribute implementation. - virtual Attribute::AttrKind getAttrKind() const = 0; + virtual StateType &getState() = 0; + virtual const StateType &getState() const = 0; - /// Return the deduced attributes in \p Attrs. - virtual void getDeducedAttributes(SmallVectorImpl &Attrs) const { - LLVMContext &Ctx = AnchoredVal.getContext(); - Attrs.emplace_back(Attribute::get(Ctx, getAttrKind())); - } + /// Return an IR position, see struct IRPosition. + virtual const IRPosition &getIRPosition() const = 0; /// Helper functions, for debug purposes only. ///{ @@ -617,10 +1372,19 @@ protected: /// represented by the abstract attribute in the LLVM-IR. /// /// \Return CHANGED if the IR was altered, otherwise UNCHANGED. 
- virtual ChangeStatus manifest(Attributor &A); + virtual ChangeStatus manifest(Attributor &A) { + return ChangeStatus::UNCHANGED; + } - /// Return the internal abstract state for careful modification. - virtual AbstractState &getState() = 0; + /// Hook to enable custom statistic tracking, called after manifest that + /// resulted in a change if statistics are enabled. + /// + /// We require subclasses to provide an implementation so we remember to + /// add statistics for them. + virtual void trackStatistics() const = 0; + + /// Return an IR position, see struct IRPosition. + virtual IRPosition &getIRPosition() = 0; /// The actual update/transfer function which has to be implemented by the /// derived classes. @@ -630,15 +1394,6 @@ protected: /// /// \Return CHANGED if the internal state changed, otherwise UNCHANGED. virtual ChangeStatus updateImpl(Attributor &A) = 0; - - /// The value this abstract attribute is associated with. - Value *AssociatedVal; - - /// The value this abstract attribute is anchored at. - Value &AnchoredVal; - - /// The information cache accessible to this abstract attribute. - InformationCache &InfoCache; }; /// Forward declarations of output streams for debug purposes. @@ -646,8 +1401,10 @@ protected: ///{ raw_ostream &operator<<(raw_ostream &OS, const AbstractAttribute &AA); raw_ostream &operator<<(raw_ostream &OS, ChangeStatus S); -raw_ostream &operator<<(raw_ostream &OS, AbstractAttribute::ManifestPosition); +raw_ostream &operator<<(raw_ostream &OS, IRPosition::Kind); +raw_ostream &operator<<(raw_ostream &OS, const IRPosition &); raw_ostream &operator<<(raw_ostream &OS, const AbstractState &State); +raw_ostream &operator<<(raw_ostream &OS, const IntegerState &S); ///} struct AttributorPass : public PassInfoMixin { @@ -661,129 +1418,531 @@ Pass *createAttributorLegacyPass(); /// ---------------------------------------------------------------------------- /// An abstract attribute for the returned values of a function. -struct AAReturnedValues : public AbstractAttribute { - /// See AbstractAttribute::AbstractAttribute(...). - AAReturnedValues(Function &F, InformationCache &InfoCache) - : AbstractAttribute(F, InfoCache) {} +struct AAReturnedValues + : public IRAttribute { + AAReturnedValues(const IRPosition &IRP) : IRAttribute(IRP) {} + + /// Return an assumed unique return value if a single candidate is found. If + /// there cannot be one, return a nullptr. If it is not clear yet, return the + /// Optional::NoneType. + Optional getAssumedUniqueReturnValue(Attributor &A) const; /// Check \p Pred on all returned values. /// /// This method will evaluate \p Pred on returned values and return /// true if (1) all returned values are known, and (2) \p Pred returned true /// for all returned values. - virtual bool - checkForallReturnedValues(std::function &Pred) const = 0; - - /// See AbstractAttribute::getAttrKind() - Attribute::AttrKind getAttrKind() const override { return ID; } - - /// The identifier used by the Attributor for this class of attributes. - static constexpr Attribute::AttrKind ID = Attribute::Returned; + /// + /// Note: Unlike the Attributor::checkForAllReturnedValuesAndReturnInsts + /// method, this one will not filter dead return instructions. 
+ virtual bool checkForAllReturnedValuesAndReturnInsts( + const function_ref &)> + &Pred) const = 0; + + using iterator = + MapVector>::iterator; + using const_iterator = + MapVector>::const_iterator; + virtual llvm::iterator_range returned_values() = 0; + virtual llvm::iterator_range returned_values() const = 0; + + virtual size_t getNumReturnValues() const = 0; + virtual const SmallSetVector &getUnresolvedCalls() const = 0; + + /// Create an abstract attribute view for the position \p IRP. + static AAReturnedValues &createForPosition(const IRPosition &IRP, + Attributor &A); + + /// Unique ID (due to the unique address) + static const char ID; }; -struct AANoUnwind : public AbstractAttribute { - /// An abstract interface for all nosync attributes. - AANoUnwind(Value &V, InformationCache &InfoCache) - : AbstractAttribute(V, InfoCache) {} - - /// See AbstractAttribute::getAttrKind()/ - Attribute::AttrKind getAttrKind() const override { return ID; } - - static constexpr Attribute::AttrKind ID = Attribute::NoUnwind; +struct AANoUnwind + : public IRAttribute> { + AANoUnwind(const IRPosition &IRP) : IRAttribute(IRP) {} /// Returns true if nounwind is assumed. - virtual bool isAssumedNoUnwind() const = 0; + bool isAssumedNoUnwind() const { return getAssumed(); } /// Returns true if nounwind is known. - virtual bool isKnownNoUnwind() const = 0; -}; + bool isKnownNoUnwind() const { return getKnown(); } -struct AANoSync : public AbstractAttribute { - /// An abstract interface for all nosync attributes. - AANoSync(Value &V, InformationCache &InfoCache) - : AbstractAttribute(V, InfoCache) {} + /// Create an abstract attribute view for the position \p IRP. + static AANoUnwind &createForPosition(const IRPosition &IRP, Attributor &A); - /// See AbstractAttribute::getAttrKind(). - Attribute::AttrKind getAttrKind() const override { return ID; } + /// Unique ID (due to the unique address) + static const char ID; +}; - static constexpr Attribute::AttrKind ID = - Attribute::AttrKind(Attribute::NoSync); +struct AANoSync + : public IRAttribute> { + AANoSync(const IRPosition &IRP) : IRAttribute(IRP) {} /// Returns true if "nosync" is assumed. - virtual bool isAssumedNoSync() const = 0; + bool isAssumedNoSync() const { return getAssumed(); } /// Returns true if "nosync" is known. - virtual bool isKnownNoSync() const = 0; -}; + bool isKnownNoSync() const { return getKnown(); } -/// An abstract interface for all nonnull attributes. -struct AANonNull : public AbstractAttribute { + /// Create an abstract attribute view for the position \p IRP. + static AANoSync &createForPosition(const IRPosition &IRP, Attributor &A); - /// See AbstractAttribute::AbstractAttribute(...). - AANonNull(Value &V, InformationCache &InfoCache) - : AbstractAttribute(V, InfoCache) {} + /// Unique ID (due to the unique address) + static const char ID; +}; - /// See AbstractAttribute::AbstractAttribute(...). - AANonNull(Value *AssociatedVal, Value &AnchoredValue, - InformationCache &InfoCache) - : AbstractAttribute(AssociatedVal, AnchoredValue, InfoCache) {} +/// An abstract interface for all nonnull attributes. +struct AANonNull + : public IRAttribute> { + AANonNull(const IRPosition &IRP) : IRAttribute(IRP) {} /// Return true if we assume that the underlying value is nonnull. - virtual bool isAssumedNonNull() const = 0; + bool isAssumedNonNull() const { return getAssumed(); } /// Return true if we know that underlying value is nonnull. 
- virtual bool isKnownNonNull() const = 0; + bool isKnownNonNull() const { return getKnown(); } - /// See AbastractState::getAttrKind(). - Attribute::AttrKind getAttrKind() const override { return ID; } + /// Create an abstract attribute view for the position \p IRP. + static AANonNull &createForPosition(const IRPosition &IRP, Attributor &A); - /// The identifier used by the Attributor for this class of attributes. - static constexpr Attribute::AttrKind ID = Attribute::NonNull; + /// Unique ID (due to the unique address) + static const char ID; }; /// An abstract attribute for norecurse. -struct AANoRecurse : public AbstractAttribute { +struct AANoRecurse + : public IRAttribute> { + AANoRecurse(const IRPosition &IRP) : IRAttribute(IRP) {} - /// See AbstractAttribute::AbstractAttribute(...). - AANoRecurse(Value &V, InformationCache &InfoCache) - : AbstractAttribute(V, InfoCache) {} - - /// See AbstractAttribute::getAttrKind() - virtual Attribute::AttrKind getAttrKind() const override { - return Attribute::NoRecurse; - } + /// Return true if "norecurse" is assumed. + bool isAssumedNoRecurse() const { return getAssumed(); } /// Return true if "norecurse" is known. - virtual bool isKnownNoRecurse() const = 0; + bool isKnownNoRecurse() const { return getKnown(); } - /// Return true if "norecurse" is assumed. - virtual bool isAssumedNoRecurse() const = 0; + /// Create an abstract attribute view for the position \p IRP. + static AANoRecurse &createForPosition(const IRPosition &IRP, Attributor &A); - /// The identifier used by the Attributor for this class of attributes. - static constexpr Attribute::AttrKind ID = Attribute::NoRecurse; + /// Unique ID (due to the unique address) + static const char ID; }; /// An abstract attribute for willreturn. -struct AAWillReturn : public AbstractAttribute { +struct AAWillReturn + : public IRAttribute> { + AAWillReturn(const IRPosition &IRP) : IRAttribute(IRP) {} + + /// Return true if "willreturn" is assumed. + bool isAssumedWillReturn() const { return getAssumed(); } + + /// Return true if "willreturn" is known. + bool isKnownWillReturn() const { return getKnown(); } + + /// Create an abstract attribute view for the position \p IRP. + static AAWillReturn &createForPosition(const IRPosition &IRP, Attributor &A); + + /// Unique ID (due to the unique address) + static const char ID; +}; + +/// An abstract interface for all noalias attributes. +struct AANoAlias + : public IRAttribute> { + AANoAlias(const IRPosition &IRP) : IRAttribute(IRP) {} + + /// Return true if we assume that the underlying value is alias. + bool isAssumedNoAlias() const { return getAssumed(); } + + /// Return true if we know that underlying value is noalias. + bool isKnownNoAlias() const { return getKnown(); } - /// See AbstractAttribute::AbstractAttribute(...). - AAWillReturn(Value &V, InformationCache &InfoCache) - : AbstractAttribute(V, InfoCache) {} + /// Create an abstract attribute view for the position \p IRP. + static AANoAlias &createForPosition(const IRPosition &IRP, Attributor &A); - /// See AbstractAttribute::getAttrKind() - virtual Attribute::AttrKind getAttrKind() const override { - return Attribute::WillReturn; + /// Unique ID (due to the unique address) + static const char ID; +}; + +/// An AbstractAttribute for nofree. +struct AANoFree + : public IRAttribute> { + AANoFree(const IRPosition &IRP) : IRAttribute(IRP) {} + + /// Return true if "nofree" is assumed. + bool isAssumedNoFree() const { return getAssumed(); } + + /// Return true if "nofree" is known. 
+ bool isKnownNoFree() const { return getKnown(); } + + /// Create an abstract attribute view for the position \p IRP. + static AANoFree &createForPosition(const IRPosition &IRP, Attributor &A); + + /// Unique ID (due to the unique address) + static const char ID; +}; + +/// An AbstractAttribute for noreturn. +struct AANoReturn + : public IRAttribute> { + AANoReturn(const IRPosition &IRP) : IRAttribute(IRP) {} + + /// Return true if the underlying object is assumed to never return. + bool isAssumedNoReturn() const { return getAssumed(); } + + /// Return true if the underlying object is known to never return. + bool isKnownNoReturn() const { return getKnown(); } + + /// Create an abstract attribute view for the position \p IRP. + static AANoReturn &createForPosition(const IRPosition &IRP, Attributor &A); + + /// Unique ID (due to the unique address) + static const char ID; +}; + +/// An abstract interface for liveness abstract attribute. +struct AAIsDead : public StateWrapper, + public IRPosition { + AAIsDead(const IRPosition &IRP) : IRPosition(IRP) {} + + /// Returns true if \p BB is assumed dead. + virtual bool isAssumedDead(const BasicBlock *BB) const = 0; + + /// Returns true if \p BB is known dead. + virtual bool isKnownDead(const BasicBlock *BB) const = 0; + + /// Returns true if \p I is assumed dead. + virtual bool isAssumedDead(const Instruction *I) const = 0; + + /// Returns true if \p I is known dead. + virtual bool isKnownDead(const Instruction *I) const = 0; + + /// This method is used to check if at least one instruction in a collection + /// of instructions is live. + template bool isLiveInstSet(T begin, T end) const { + for (const auto &I : llvm::make_range(begin, end)) { + assert(I->getFunction() == getIRPosition().getAssociatedFunction() && + "Instruction must be in the same anchor scope function."); + + if (!isAssumedDead(I)) + return true; + } + + return false; } - /// Return true if "willreturn" is known. - virtual bool isKnownWillReturn() const = 0; + /// Return an IR position, see struct IRPosition. + /// + ///{ + IRPosition &getIRPosition() override { return *this; } + const IRPosition &getIRPosition() const override { return *this; } + ///} - /// Return true if "willreturn" is assumed. - virtual bool isAssumedWillReturn() const = 0; + /// Create an abstract attribute view for the position \p IRP. + static AAIsDead &createForPosition(const IRPosition &IRP, Attributor &A); + + /// Unique ID (due to the unique address) + static const char ID; +}; + +/// State for dereferenceable attribute +struct DerefState : AbstractState { + + /// State representing for dereferenceable bytes. + IntegerState DerefBytesState; + + /// State representing that whether the value is globaly dereferenceable. + BooleanState GlobalState; + + /// See AbstractState::isValidState() + bool isValidState() const override { return DerefBytesState.isValidState(); } + + /// See AbstractState::isAtFixpoint() + bool isAtFixpoint() const override { + return !isValidState() || + (DerefBytesState.isAtFixpoint() && GlobalState.isAtFixpoint()); + } + + /// See AbstractState::indicateOptimisticFixpoint(...) + ChangeStatus indicateOptimisticFixpoint() override { + DerefBytesState.indicateOptimisticFixpoint(); + GlobalState.indicateOptimisticFixpoint(); + return ChangeStatus::UNCHANGED; + } + + /// See AbstractState::indicatePessimisticFixpoint(...) 
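Editor's illustration, not part of the imported header: a sketch of consulting AAIsDead from a hypothetical helper of a custom abstract attribute, so that instructions the Attributor already assumes dead do not invalidate an optimistic assumption. All instructions passed in must belong to the associated function.

bool AAExample::allAssumedDead(Attributor &A, ArrayRef<Instruction *> Insts) {
  const auto &LivenessAA = A.getAAFor<AAIsDead>(
      *this, IRPosition::function(*getAnchorScope()),
      /* TrackDependence */ false);

  // isLiveInstSet(...) is true if at least one instruction in the range is
  // still assumed live; here we want to know that none of them is.
  return !LivenessAA.isLiveInstSet(Insts.begin(), Insts.end());
}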
+ ChangeStatus indicatePessimisticFixpoint() override { + DerefBytesState.indicatePessimisticFixpoint(); + GlobalState.indicatePessimisticFixpoint(); + return ChangeStatus::CHANGED; + } + + /// Update known dereferenceable bytes. + void takeKnownDerefBytesMaximum(uint64_t Bytes) { + DerefBytesState.takeKnownMaximum(Bytes); + } + + /// Update assumed dereferenceable bytes. + void takeAssumedDerefBytesMinimum(uint64_t Bytes) { + DerefBytesState.takeAssumedMinimum(Bytes); + } + + /// Equality for DerefState. + bool operator==(const DerefState &R) { + return this->DerefBytesState == R.DerefBytesState && + this->GlobalState == R.GlobalState; + } + + /// Inequality for IntegerState. + bool operator!=(const DerefState &R) { return !(*this == R); } + + /// See IntegerState::operator^= + DerefState operator^=(const DerefState &R) { + DerefBytesState ^= R.DerefBytesState; + GlobalState ^= R.GlobalState; + return *this; + } + + /// See IntegerState::operator+= + DerefState operator+=(const DerefState &R) { + DerefBytesState += R.DerefBytesState; + GlobalState += R.GlobalState; + return *this; + } + + /// See IntegerState::operator&= + DerefState operator&=(const DerefState &R) { + DerefBytesState &= R.DerefBytesState; + GlobalState &= R.GlobalState; + return *this; + } + + /// See IntegerState::operator|= + DerefState operator|=(const DerefState &R) { + DerefBytesState |= R.DerefBytesState; + GlobalState |= R.GlobalState; + return *this; + } + +protected: + const AANonNull *NonNullAA = nullptr; +}; + +/// An abstract interface for all dereferenceable attribute. +struct AADereferenceable + : public IRAttribute> { + AADereferenceable(const IRPosition &IRP) : IRAttribute(IRP) {} + + /// Return true if we assume that the underlying value is nonnull. + bool isAssumedNonNull() const { + return NonNullAA && NonNullAA->isAssumedNonNull(); + } + + /// Return true if we know that the underlying value is nonnull. + bool isKnownNonNull() const { + return NonNullAA && NonNullAA->isKnownNonNull(); + } + + /// Return true if we assume that underlying value is + /// dereferenceable(_or_null) globally. + bool isAssumedGlobal() const { return GlobalState.getAssumed(); } + + /// Return true if we know that underlying value is + /// dereferenceable(_or_null) globally. + bool isKnownGlobal() const { return GlobalState.getKnown(); } + + /// Return assumed dereferenceable bytes. + uint32_t getAssumedDereferenceableBytes() const { + return DerefBytesState.getAssumed(); + } + + /// Return known dereferenceable bytes. + uint32_t getKnownDereferenceableBytes() const { + return DerefBytesState.getKnown(); + } + + /// Create an abstract attribute view for the position \p IRP. + static AADereferenceable &createForPosition(const IRPosition &IRP, + Attributor &A); + + /// Unique ID (due to the unique address) + static const char ID; +}; + +/// An abstract interface for all align attributes. +struct AAAlign + : public IRAttribute> { + AAAlign(const IRPosition &IRP) : IRAttribute(IRP) {} + + /// Return assumed alignment. + unsigned getAssumedAlign() const { return getAssumed(); } + + /// Return known alignemnt. + unsigned getKnownAlign() const { return getKnown(); } + + /// Create an abstract attribute view for the position \p IRP. + static AAAlign &createForPosition(const IRPosition &IRP, Attributor &A); + + /// Unique ID (due to the unique address) + static const char ID; +}; + +/// An abstract interface for all nocapture attributes. 
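DerefState pairs an integer state for the dereferenceable byte count with a boolean state for global dereferenceability, and its two update helpers move in opposite directions: known bytes only ever grow, assumed bytes only ever shrink, but never below what is known. The following is a minimal sketch of that lattice behaviour using a simplified integer state rather than the real IntegerState.

// Illustrative sketch only: a simplified integer state, not llvm::IntegerState.
#include <algorithm>
#include <cassert>
#include <cstdint>

struct SketchIntegerState {
  uint64_t Known = 0;             // proven bound, may only increase
  uint64_t Assumed = UINT64_MAX;  // optimistic bound, may only decrease
  void takeKnownMaximum(uint64_t V) {
    Known = std::max(Known, V);
    Assumed = std::max(Assumed, Known); // assumed is never worse than known
  }
  void takeAssumedMinimum(uint64_t V) {
    Assumed = std::max(Known, std::min(Assumed, V));
  }
};

int main() {
  SketchIntegerState DerefBytes;      // plays the role of DerefBytesState
  DerefBytes.takeKnownMaximum(8);     // an existing attribute guarantees 8 bytes
  DerefBytes.takeAssumedMinimum(32);  // one use site justifies assuming at most 32
  DerefBytes.takeAssumedMinimum(16);  // another use site only justifies 16
  assert(DerefBytes.Known == 8 && DerefBytes.Assumed == 16);
  DerefBytes.takeAssumedMinimum(4);   // the assumption never drops below the known bound
  assert(DerefBytes.Assumed == 8);
}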
+struct AANoCapture + : public IRAttribute> { + AANoCapture(const IRPosition &IRP) : IRAttribute(IRP) {} + + /// State encoding bits. A set bit in the state means the property holds. + /// NO_CAPTURE is the best possible state, 0 the worst possible state. + enum { + NOT_CAPTURED_IN_MEM = 1 << 0, + NOT_CAPTURED_IN_INT = 1 << 1, + NOT_CAPTURED_IN_RET = 1 << 2, + + /// If we do not capture the value in memory or through integers we can only + /// communicate it back as a derived pointer. + NO_CAPTURE_MAYBE_RETURNED = NOT_CAPTURED_IN_MEM | NOT_CAPTURED_IN_INT, + + /// If we do not capture the value in memory, through integers, or as a + /// derived pointer we know it is not captured. + NO_CAPTURE = + NOT_CAPTURED_IN_MEM | NOT_CAPTURED_IN_INT | NOT_CAPTURED_IN_RET, + }; + + /// Return true if we know that the underlying value is not captured in its + /// respective scope. + bool isKnownNoCapture() const { return isKnown(NO_CAPTURE); } + + /// Return true if we assume that the underlying value is not captured in its + /// respective scope. + bool isAssumedNoCapture() const { return isAssumed(NO_CAPTURE); } + + /// Return true if we know that the underlying value is not captured in its + /// respective scope but we allow it to escape through a "return". + bool isKnownNoCaptureMaybeReturned() const { + return isKnown(NO_CAPTURE_MAYBE_RETURNED); + } + + /// Return true if we assume that the underlying value is not captured in its + /// respective scope but we allow it to escape through a "return". + bool isAssumedNoCaptureMaybeReturned() const { + return isAssumed(NO_CAPTURE_MAYBE_RETURNED); + } + + /// Create an abstract attribute view for the position \p IRP. + static AANoCapture &createForPosition(const IRPosition &IRP, Attributor &A); + + /// Unique ID (due to the unique address) + static const char ID; +}; + +/// An abstract interface for value simplify abstract attribute. +struct AAValueSimplify : public StateWrapper, + public IRPosition { + AAValueSimplify(const IRPosition &IRP) : IRPosition(IRP) {} + + /// Return an IR position, see struct IRPosition. + /// + ///{ + IRPosition &getIRPosition() { return *this; } + const IRPosition &getIRPosition() const { return *this; } + ///} + + /// Return an assumed simplified value if a single candidate is found. If + /// there cannot be one, return original value. If it is not clear yet, return + /// the Optional::NoneType. + virtual Optional getAssumedSimplifiedValue(Attributor &A) const = 0; + + /// Create an abstract attribute view for the position \p IRP. + static AAValueSimplify &createForPosition(const IRPosition &IRP, + Attributor &A); + + /// Unique ID (due to the unique address) + static const char ID; +}; + +struct AAHeapToStack : public StateWrapper, + public IRPosition { + AAHeapToStack(const IRPosition &IRP) : IRPosition(IRP) {} + + /// Returns true if HeapToStack conversion is assumed to be possible. + bool isAssumedHeapToStack() const { return getAssumed(); } + + /// Returns true if HeapToStack conversion is known to be possible. + bool isKnownHeapToStack() const { return getKnown(); } + + /// Return an IR position, see struct IRPosition. + /// + ///{ + IRPosition &getIRPosition() { return *this; } + const IRPosition &getIRPosition() const { return *this; } + ///} + + /// Create an abstract attribute view for the position \p IRP. 
+ static AAHeapToStack &createForPosition(const IRPosition &IRP, Attributor &A); + + /// Unique ID (due to the unique address) + static const char ID; +}; + +/// An abstract interface for all memory related attributes. +struct AAMemoryBehavior + : public IRAttribute> { + AAMemoryBehavior(const IRPosition &IRP) : IRAttribute(IRP) {} + + /// State encoding bits. A set bit in the state means the property holds. + /// BEST_STATE is the best possible state, 0 the worst possible state. + enum { + NO_READS = 1 << 0, + NO_WRITES = 1 << 1, + NO_ACCESSES = NO_READS | NO_WRITES, + + BEST_STATE = NO_ACCESSES, + }; - /// The identifier used by the Attributor for this class of attributes. - static constexpr Attribute::AttrKind ID = Attribute::WillReturn; + /// Return true if we know that the underlying value is not read or accessed + /// in its respective scope. + bool isKnownReadNone() const { return isKnown(NO_ACCESSES); } + + /// Return true if we assume that the underlying value is not read or accessed + /// in its respective scope. + bool isAssumedReadNone() const { return isAssumed(NO_ACCESSES); } + + /// Return true if we know that the underlying value is not accessed + /// (=written) in its respective scope. + bool isKnownReadOnly() const { return isKnown(NO_WRITES); } + + /// Return true if we assume that the underlying value is not accessed + /// (=written) in its respective scope. + bool isAssumedReadOnly() const { return isAssumed(NO_WRITES); } + + /// Return true if we know that the underlying value is not read in its + /// respective scope. + bool isKnownWriteOnly() const { return isKnown(NO_READS); } + + /// Return true if we assume that the underlying value is not read in its + /// respective scope. + bool isAssumedWriteOnly() const { return isAssumed(NO_READS); } + + /// Create an abstract attribute view for the position \p IRP. + static AAMemoryBehavior &createForPosition(const IRPosition &IRP, + Attributor &A); + + /// Unique ID (due to the unique address) + static const char ID; }; + } // end namespace llvm #endif // LLVM_TRANSFORMS_IPO_FUNCTIONATTRS_H diff --git a/include/llvm/Transforms/IPO/GlobalDCE.h b/include/llvm/Transforms/IPO/GlobalDCE.h index c434484d1ae3..0a6851849e7e 100644 --- a/include/llvm/Transforms/IPO/GlobalDCE.h +++ b/include/llvm/Transforms/IPO/GlobalDCE.h @@ -43,11 +43,25 @@ private: /// Comdat -> Globals in that Comdat section. std::unordered_multimap ComdatMembers; + /// !type metadata -> set of (vtable, offset) pairs + DenseMap, 4>> + TypeIdMap; + + // Global variables which are vtables, and which we have enough information + // about to safely do dead virtual function elimination. + SmallPtrSet VFESafeVTables; + void UpdateGVDependencies(GlobalValue &GV); void MarkLive(GlobalValue &GV, SmallVectorImpl *Updates = nullptr); bool RemoveUnusedGlobalValue(GlobalValue &GV); + // Dead virtual function elimination. 
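The HotColdSplitting helper introduced above receives every analysis it needs through callbacks, so a pass wrapper only has to provide thin lambdas. The glue below is a rough sketch of how a new-pass-manager caller could feed it; the analysis names and the proxy lookup are assumptions based on common LLVM patterns, not taken from this patch.

// Hypothetical glue code, not part of this patch.
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/PassManager.h"
#include "llvm/Transforms/IPO/HotColdSplitting.h"
#include <functional>
using namespace llvm;

static bool runHotColdSplittingSketch(Module &M, ModuleAnalysisManager &AM) {
  auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
  ProfileSummaryInfo *PSI = &AM.getResult<ProfileSummaryAnalysis>(M);

  auto GBFI = [&FAM](Function &F) -> BlockFrequencyInfo & {
    return FAM.getResult<BlockFrequencyAnalysis>(F);
  };
  auto GTTI = [&FAM](Function &F) -> TargetTransformInfo & {
    return FAM.getResult<TargetIRAnalysis>(F);
  };
  auto LookupAC = [&FAM](Function &F) -> AssumptionCache * {
    return FAM.getCachedResult<AssumptionAnalysis>(F);
  };
  std::function<OptimizationRemarkEmitter &(Function &)> GORE =
      [&FAM](Function &F) -> OptimizationRemarkEmitter & {
    return FAM.getResult<OptimizationRemarkEmitterAnalysis>(F);
  };

  return HotColdSplitting(PSI, GBFI, GTTI, &GORE, LookupAC).run(M);
}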
+ void AddVirtualFunctionDependencies(Module &M); + void ScanVTables(Module &M); + void ScanTypeCheckedLoadIntrinsics(Module &M); + void ScanVTableLoad(Function *Caller, Metadata *TypeId, uint64_t CallOffset); + void ComputeDependencies(Value *V, SmallPtrSetImpl &U); }; diff --git a/include/llvm/Transforms/IPO/HotColdSplitting.h b/include/llvm/Transforms/IPO/HotColdSplitting.h index 73668844590d..8c3049fbaac4 100644 --- a/include/llvm/Transforms/IPO/HotColdSplitting.h +++ b/include/llvm/Transforms/IPO/HotColdSplitting.h @@ -17,6 +17,45 @@ namespace llvm { class Module; +class ProfileSummaryInfo; +class BlockFrequencyInfo; +class TargetTransformInfo; +class OptimizationRemarkEmitter; +class AssumptionCache; +class DominatorTree; +class CodeExtractorAnalysisCache; + +/// A sequence of basic blocks. +/// +/// A 0-sized SmallVector is slightly cheaper to move than a std::vector. +using BlockSequence = SmallVector; + +class HotColdSplitting { +public: + HotColdSplitting(ProfileSummaryInfo *ProfSI, + function_ref GBFI, + function_ref GTTI, + std::function *GORE, + function_ref LAC) + : PSI(ProfSI), GetBFI(GBFI), GetTTI(GTTI), GetORE(GORE), LookupAC(LAC) {} + bool run(Module &M); + +private: + bool isFunctionCold(const Function &F) const; + bool shouldOutlineFrom(const Function &F) const; + bool outlineColdRegions(Function &F, bool HasProfileSummary); + Function *extractColdRegion(const BlockSequence &Region, + const CodeExtractorAnalysisCache &CEAC, + DominatorTree &DT, BlockFrequencyInfo *BFI, + TargetTransformInfo &TTI, + OptimizationRemarkEmitter &ORE, + AssumptionCache *AC, unsigned Count); + ProfileSummaryInfo *PSI; + function_ref GetBFI; + function_ref GetTTI; + std::function *GetORE; + function_ref LookupAC; +}; /// Pass to outline cold regions. class HotColdSplittingPass : public PassInfoMixin { diff --git a/include/llvm/Transforms/IPO/LowerTypeTests.h b/include/llvm/Transforms/IPO/LowerTypeTests.h index 39b23f5957db..3c2bb65b9552 100644 --- a/include/llvm/Transforms/IPO/LowerTypeTests.h +++ b/include/llvm/Transforms/IPO/LowerTypeTests.h @@ -193,6 +193,8 @@ struct ByteArrayBuilder { uint64_t &AllocByteOffset, uint8_t &AllocMask); }; +bool isJumpTableCanonical(Function *F); + } // end namespace lowertypetests class LowerTypeTestsPass : public PassInfoMixin { diff --git a/include/llvm/Transforms/IPO/WholeProgramDevirt.h b/include/llvm/Transforms/IPO/WholeProgramDevirt.h index 509fcc867060..22435e4ed1e5 100644 --- a/include/llvm/Transforms/IPO/WholeProgramDevirt.h +++ b/include/llvm/Transforms/IPO/WholeProgramDevirt.h @@ -16,8 +16,10 @@ #include "llvm/IR/Module.h" #include "llvm/IR/PassManager.h" +#include "llvm/Transforms/IPO/FunctionImport.h" #include #include +#include #include #include @@ -28,6 +30,7 @@ template class MutableArrayRef; class Function; class GlobalVariable; class ModuleSummaryIndex; +struct ValueInfo; namespace wholeprogramdevirt { @@ -228,6 +231,29 @@ struct WholeProgramDevirtPass : public PassInfoMixin { PreservedAnalyses run(Module &M, ModuleAnalysisManager &); }; +struct VTableSlotSummary { + StringRef TypeID; + uint64_t ByteOffset; +}; + +/// Perform index-based whole program devirtualization on the \p Summary +/// index. Any devirtualized targets used by a type test in another module +/// are added to the \p ExportedGUIDs set. 
For any local devirtualized targets +/// only used within the defining module, the information necessary for +/// locating the corresponding WPD resolution is recorded for the ValueInfo +/// in case it is exported by cross module importing (in which case the +/// devirtualized target name will need adjustment). +void runWholeProgramDevirtOnIndex( + ModuleSummaryIndex &Summary, std::set &ExportedGUIDs, + std::map> &LocalWPDTargetsMap); + +/// Call after cross-module importing to update the recorded single impl +/// devirt target names for any locals that were exported. +void updateIndexWPDForExports( + ModuleSummaryIndex &Summary, + function_ref isExported, + std::map> &LocalWPDTargetsMap); + } // end namespace llvm #endif // LLVM_TRANSFORMS_IPO_WHOLEPROGRAMDEVIRT_H diff --git a/include/llvm/Transforms/Instrumentation.h b/include/llvm/Transforms/Instrumentation.h index 8b70d2926ae9..fcad1e11895f 100644 --- a/include/llvm/Transforms/Instrumentation.h +++ b/include/llvm/Transforms/Instrumentation.h @@ -181,10 +181,6 @@ struct SanitizerCoverageOptions { SanitizerCoverageOptions() = default; }; -// Insert SanitizerCoverage instrumentation. -ModulePass *createSanitizerCoverageModulePass( - const SanitizerCoverageOptions &Options = SanitizerCoverageOptions()); - /// Calculate what to divide by to scale counts. /// /// Given the maximum count, calculate a divisor that will scale all the diff --git a/include/llvm/Transforms/Instrumentation/InstrProfiling.h b/include/llvm/Transforms/Instrumentation/InstrProfiling.h index 8f76d4a1ce55..2e0fae527b15 100644 --- a/include/llvm/Transforms/Instrumentation/InstrProfiling.h +++ b/include/llvm/Transforms/Instrumentation/InstrProfiling.h @@ -39,13 +39,14 @@ public: : Options(Options), IsCS(IsCS) {} PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM); - bool run(Module &M, const TargetLibraryInfo &TLI); + bool run(Module &M, + std::function GetTLI); private: InstrProfOptions Options; Module *M; Triple TT; - const TargetLibraryInfo *TLI; + std::function GetTLI; struct PerFunctionProfileData { uint32_t NumValueSites[IPVK_Last + 1]; GlobalVariable *RegionCounters = nullptr; diff --git a/include/llvm/Transforms/Instrumentation/MemorySanitizer.h b/include/llvm/Transforms/Instrumentation/MemorySanitizer.h index 0739d9e58a61..01a86ee3f1fd 100644 --- a/include/llvm/Transforms/Instrumentation/MemorySanitizer.h +++ b/include/llvm/Transforms/Instrumentation/MemorySanitizer.h @@ -19,12 +19,11 @@ namespace llvm { struct MemorySanitizerOptions { - MemorySanitizerOptions() = default; - MemorySanitizerOptions(int TrackOrigins, bool Recover, bool Kernel) - : TrackOrigins(TrackOrigins), Recover(Recover), Kernel(Kernel) {} - int TrackOrigins = 0; - bool Recover = false; - bool Kernel = false; + MemorySanitizerOptions() : MemorySanitizerOptions(0, false, false){}; + MemorySanitizerOptions(int TrackOrigins, bool Recover, bool Kernel); + bool Kernel; + int TrackOrigins; + bool Recover; }; // Insert MemorySanitizer instrumentation (detection of uninitialized reads) @@ -41,6 +40,7 @@ struct MemorySanitizerPass : public PassInfoMixin { MemorySanitizerPass(MemorySanitizerOptions Options) : Options(Options) {} PreservedAnalyses run(Function &F, FunctionAnalysisManager &FAM); + PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM); private: MemorySanitizerOptions Options; diff --git a/include/llvm/Transforms/Instrumentation/SanitizerCoverage.h b/include/llvm/Transforms/Instrumentation/SanitizerCoverage.h new file mode 100644 index 000000000000..85a43ff86f2e --- /dev/null 
+++ b/include/llvm/Transforms/Instrumentation/SanitizerCoverage.h @@ -0,0 +1,47 @@ +//===--------- Definition of the SanitizerCoverage class --------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file declares the SanitizerCoverage class which is a port of the legacy +// SanitizerCoverage pass to use the new PassManager infrastructure. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TRANSFORMS_INSTRUMENTATION_SANITIZERCOVERAGE_H +#define LLVM_TRANSFORMS_INSTRUMENTATION_SANITIZERCOVERAGE_H + +#include "llvm/IR/Module.h" +#include "llvm/IR/PassManager.h" +#include "llvm/Transforms/Instrumentation.h" + +namespace llvm { + +/// This is the ModuleSanitizerCoverage pass used in the new pass manager. The +/// pass instruments functions for coverage, adds initialization calls to the +/// module for trace PC guards and 8bit counters if they are requested, and +/// appends globals to llvm.compiler.used. +class ModuleSanitizerCoveragePass + : public PassInfoMixin { +public: + explicit ModuleSanitizerCoveragePass( + SanitizerCoverageOptions Options = SanitizerCoverageOptions()) + : Options(Options) {} + PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM); + +private: + SanitizerCoverageOptions Options; +}; + +// Insert SanitizerCoverage instrumentation. +ModulePass *createModuleSanitizerCoverageLegacyPassPass( + const SanitizerCoverageOptions &Options = SanitizerCoverageOptions()); + +} // namespace llvm + +#endif diff --git a/include/llvm/Transforms/Instrumentation/ThreadSanitizer.h b/include/llvm/Transforms/Instrumentation/ThreadSanitizer.h index b4e7d9924ff6..ce0e46745abb 100644 --- a/include/llvm/Transforms/Instrumentation/ThreadSanitizer.h +++ b/include/llvm/Transforms/Instrumentation/ThreadSanitizer.h @@ -27,6 +27,8 @@ FunctionPass *createThreadSanitizerLegacyPassPass(); /// yet, the pass inserts the declarations. Otherwise the existing globals are struct ThreadSanitizerPass : public PassInfoMixin { PreservedAnalyses run(Function &F, FunctionAnalysisManager &FAM); + PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM); }; + } // namespace llvm #endif /* LLVM_TRANSFORMS_INSTRUMENTATION_THREADSANITIZER_H */ diff --git a/include/llvm/Transforms/Scalar.h b/include/llvm/Transforms/Scalar.h index f9360b5ee2c8..f06230b6f366 100644 --- a/include/llvm/Transforms/Scalar.h +++ b/include/llvm/Transforms/Scalar.h @@ -308,7 +308,7 @@ FunctionPass *createGVNSinkPass(); // MergedLoadStoreMotion - This pass merges loads and stores in diamonds. Loads // are hoisted into the header, while stores sink into the footer. // -FunctionPass *createMergedLoadStoreMotionPass(); +FunctionPass *createMergedLoadStoreMotionPass(bool SplitFooterBB = false); //===----------------------------------------------------------------------===// // @@ -395,6 +395,13 @@ extern char &InferAddressSpacesID; // "block_weights" metadata. FunctionPass *createLowerExpectIntrinsicPass(); +//===----------------------------------------------------------------------===// +// +// LowerConstantIntrinsicss - Expand any remaining llvm.objectsize and +// llvm.is.constant intrinsic calls, even for the unknown cases. 
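With the legacy factory moved out of Instrumentation.h, the new header above carries both the new-pass-manager pass and the legacy factory. A short scheduling sketch follows; the option field and enumerator names are assumptions about SanitizerCoverageOptions and are not shown in this patch.

// Hypothetical usage, not part of this patch.
#include "llvm/IR/PassManager.h"
#include "llvm/Transforms/Instrumentation/SanitizerCoverage.h"
using namespace llvm;

static void addSanCovSketch(ModulePassManager &MPM) {
  SanitizerCoverageOptions Opts;
  Opts.CoverageType = SanitizerCoverageOptions::SCK_Edge; // assumed field/enum names
  Opts.TracePCGuard = true;                               // assumed field name
  MPM.addPass(ModuleSanitizerCoveragePass(Opts));
}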
+// +FunctionPass *createLowerConstantIntrinsicsPass(); + //===----------------------------------------------------------------------===// // // PartiallyInlineLibCalls - Tries to inline the fast path of library diff --git a/include/llvm/Transforms/Scalar/CallSiteSplitting.h b/include/llvm/Transforms/Scalar/CallSiteSplitting.h index b6055639e8a8..74cbf84b64b2 100644 --- a/include/llvm/Transforms/Scalar/CallSiteSplitting.h +++ b/include/llvm/Transforms/Scalar/CallSiteSplitting.h @@ -9,13 +9,8 @@ #ifndef LLVM_TRANSFORMS_SCALAR_CALLSITESPLITTING__H #define LLVM_TRANSFORMS_SCALAR_CALLSITESPLITTING__H -#include "llvm/ADT/SetVector.h" -#include "llvm/Analysis/AssumptionCache.h" -#include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" #include "llvm/IR/PassManager.h" -#include "llvm/Support/Compiler.h" -#include namespace llvm { diff --git a/include/llvm/Transforms/Scalar/ConstantHoisting.h b/include/llvm/Transforms/Scalar/ConstantHoisting.h index 6b0fc9c1dd07..39039b093241 100644 --- a/include/llvm/Transforms/Scalar/ConstantHoisting.h +++ b/include/llvm/Transforms/Scalar/ConstantHoisting.h @@ -37,7 +37,9 @@ #define LLVM_TRANSFORMS_SCALAR_CONSTANTHOISTING_H #include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/MapVector.h" #include "llvm/ADT/PointerUnion.h" +#include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/IR/PassManager.h" @@ -154,21 +156,21 @@ private: /// Keeps track of constant candidates found in the function. using ConstCandVecType = std::vector; - using GVCandVecMapType = DenseMap; + using GVCandVecMapType = MapVector; ConstCandVecType ConstIntCandVec; GVCandVecMapType ConstGEPCandMap; /// These are the final constants we decided to hoist. using ConstInfoVecType = SmallVector; - using GVInfoVecMapType = DenseMap; + using GVInfoVecMapType = MapVector; ConstInfoVecType ConstIntInfoVec; GVInfoVecMapType ConstGEPInfoMap; /// Keep track of cast instructions we already cloned. - SmallDenseMap ClonedCastMap; + MapVector ClonedCastMap; Instruction *findMatInsertPt(Instruction *Inst, unsigned Idx = ~0U) const; - SmallPtrSet + SetVector findConstantInsertionPoint(const consthoist::ConstantInfo &ConstInfo) const; void collectConstantCandidates(ConstCandMapType &ConstCandMap, Instruction *Inst, unsigned Idx, diff --git a/include/llvm/Transforms/Scalar/Float2Int.h b/include/llvm/Transforms/Scalar/Float2Int.h index 06aeb8322527..f04b98a19d82 100644 --- a/include/llvm/Transforms/Scalar/Float2Int.h +++ b/include/llvm/Transforms/Scalar/Float2Int.h @@ -17,6 +17,7 @@ #include "llvm/ADT/EquivalenceClasses.h" #include "llvm/ADT/MapVector.h" #include "llvm/IR/ConstantRange.h" +#include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" #include "llvm/IR/PassManager.h" @@ -26,10 +27,11 @@ public: PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); // Glue for old PM. 
- bool runImpl(Function &F); + bool runImpl(Function &F, const DominatorTree &DT); private: - void findRoots(Function &F, SmallPtrSet &Roots); + void findRoots(Function &F, const DominatorTree &DT, + SmallPtrSet &Roots); void seen(Instruction *I, ConstantRange R); ConstantRange badRange(); ConstantRange unknownRange(); diff --git a/include/llvm/Transforms/Scalar/GVN.h b/include/llvm/Transforms/Scalar/GVN.h index 9fe00a9e7f2d..8a64768af6b5 100644 --- a/include/llvm/Transforms/Scalar/GVN.h +++ b/include/llvm/Transforms/Scalar/GVN.h @@ -120,6 +120,8 @@ public: uint32_t lookupOrAddCall(CallInst *C); uint32_t phiTranslateImpl(const BasicBlock *BB, const BasicBlock *PhiBlock, uint32_t Num, GVN &Gvn); + bool areCallValsEqual(uint32_t Num, uint32_t NewNum, const BasicBlock *Pred, + const BasicBlock *PhiBlock, GVN &Gvn); std::pair assignExpNewValueNum(Expression &exp); bool areAllValsInBB(uint32_t num, const BasicBlock *BB, GVN &Gvn); @@ -159,6 +161,7 @@ private: SetVector DeadBlocks; OptimizationRemarkEmitter *ORE; ImplicitControlFlowTracking *ICF; + LoopInfo *LI; ValueTable VN; @@ -175,7 +178,7 @@ private: // Block-local map of equivalent values to their leader, does not // propagate to any successors. Entries added mid-block are applied // to the remaining instructions in the block. - SmallMapVector ReplaceWithConstMap; + SmallMapVector ReplaceOperandsWithMap; SmallVector InstrsToErase; // Map the block to reversed postorder traversal number. It is used to @@ -280,7 +283,7 @@ private: void verifyRemoved(const Instruction *I) const; bool splitCriticalEdges(); BasicBlock *splitCriticalEdges(BasicBlock *Pred, BasicBlock *Succ); - bool replaceOperandsWithConsts(Instruction *I) const; + bool replaceOperandsForInBlockEquality(Instruction *I) const; bool propagateEquality(Value *LHS, Value *RHS, const BasicBlockEdge &Root, bool DominatesByEdge); bool processFoldableCondBr(BranchInst *BI); diff --git a/include/llvm/Transforms/Scalar/GVNExpression.h b/include/llvm/Transforms/Scalar/GVNExpression.h index 3dc4515f85a1..1600d1af3242 100644 --- a/include/llvm/Transforms/Scalar/GVNExpression.h +++ b/include/llvm/Transforms/Scalar/GVNExpression.h @@ -323,7 +323,7 @@ public: class LoadExpression final : public MemoryExpression { private: LoadInst *Load; - unsigned Alignment; + MaybeAlign Alignment; public: LoadExpression(unsigned NumOperands, LoadInst *L, @@ -333,7 +333,8 @@ public: LoadExpression(enum ExpressionType EType, unsigned NumOperands, LoadInst *L, const MemoryAccess *MemoryLeader) : MemoryExpression(NumOperands, EType, MemoryLeader), Load(L) { - Alignment = L ? 
L->getAlignment() : 0; + if (L) + Alignment = MaybeAlign(L->getAlignment()); } LoadExpression() = delete; @@ -348,8 +349,8 @@ public: LoadInst *getLoadInst() const { return Load; } void setLoadInst(LoadInst *L) { Load = L; } - unsigned getAlignment() const { return Alignment; } - void setAlignment(unsigned Align) { Alignment = Align; } + MaybeAlign getAlignment() const { return Alignment; } + void setAlignment(MaybeAlign Align) { Alignment = Align; } bool equals(const Expression &Other) const override; bool exactlyEquals(const Expression &Other) const override { diff --git a/include/llvm/Transforms/Scalar/LoopPassManager.h b/include/llvm/Transforms/Scalar/LoopPassManager.h index 61ec58585fd0..aed764855b2e 100644 --- a/include/llvm/Transforms/Scalar/LoopPassManager.h +++ b/include/llvm/Transforms/Scalar/LoopPassManager.h @@ -263,8 +263,10 @@ template class FunctionToLoopPassAdaptor : public PassInfoMixin> { public: - explicit FunctionToLoopPassAdaptor(LoopPassT Pass, bool DebugLogging = false) - : Pass(std::move(Pass)), LoopCanonicalizationFPM(DebugLogging) { + explicit FunctionToLoopPassAdaptor(LoopPassT Pass, bool UseMemorySSA = false, + bool DebugLogging = false) + : Pass(std::move(Pass)), LoopCanonicalizationFPM(DebugLogging), + UseMemorySSA(UseMemorySSA) { LoopCanonicalizationFPM.addPass(LoopSimplifyPass()); LoopCanonicalizationFPM.addPass(LCSSAPass()); } @@ -293,7 +295,7 @@ public: return PA; // Get the analysis results needed by loop passes. - MemorySSA *MSSA = EnableMSSALoopDependency + MemorySSA *MSSA = UseMemorySSA ? (&AM.getResult(F).getMSSA()) : nullptr; LoopStandardAnalysisResults LAR = {AM.getResult(F), @@ -310,8 +312,10 @@ public: // LoopStandardAnalysisResults object. The loop analyses cached in this // manager have access to those analysis results and so it must invalidate // itself when they go away. - LoopAnalysisManager &LAM = - AM.getResult(F).getManager(); + auto &LAMFP = AM.getResult(F); + if (UseMemorySSA) + LAMFP.markMSSAUsed(); + LoopAnalysisManager &LAM = LAMFP.getManager(); // A postorder worklist of loops to process. SmallPriorityWorklist Worklist; @@ -382,7 +386,7 @@ public: PA.preserve(); PA.preserve(); PA.preserve(); - if (EnableMSSALoopDependency) + if (UseMemorySSA) PA.preserve(); // FIXME: What we really want to do here is preserve an AA category, but // that concept doesn't exist yet. @@ -397,14 +401,18 @@ private: LoopPassT Pass; FunctionPassManager LoopCanonicalizationFPM; + + bool UseMemorySSA = false; }; /// A function to deduce a loop pass type and wrap it in the templated /// adaptor. template FunctionToLoopPassAdaptor -createFunctionToLoopPassAdaptor(LoopPassT Pass, bool DebugLogging = false) { - return FunctionToLoopPassAdaptor(std::move(Pass), DebugLogging); +createFunctionToLoopPassAdaptor(LoopPassT Pass, bool UseMemorySSA = false, + bool DebugLogging = false) { + return FunctionToLoopPassAdaptor(std::move(Pass), UseMemorySSA, + DebugLogging); } /// Pass for printing a loop's contents as textual IR. 
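Since MemorySSA is now requested per adaptor instead of through the global EnableMSSALoopDependency flag, a pipeline opts in explicitly when it schedules a MemorySSA-capable loop pass. A sketch is below; the choice of LICM as the loop pass is illustrative and not taken from this patch.

// Hypothetical pipeline snippet, not part of this patch.
#include "llvm/IR/PassManager.h"
#include "llvm/Transforms/Scalar/LICM.h"
#include "llvm/Transforms/Scalar/LoopPassManager.h"
using namespace llvm;

static void addLoopPassesSketch(FunctionPassManager &FPM, bool DebugLogging) {
  // Ask this adaptor (and only this adaptor) to provide MemorySSA; the adaptor
  // also marks the LoopAnalysisManager proxy so invalidation stays correct.
  FPM.addPass(createFunctionToLoopPassAdaptor(LICMPass(),
                                              /*UseMemorySSA=*/true,
                                              DebugLogging));
}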
diff --git a/include/llvm/Transforms/Scalar/LoopUnrollPass.h b/include/llvm/Transforms/Scalar/LoopUnrollPass.h index a84d889a83ad..afeb1f1da029 100644 --- a/include/llvm/Transforms/Scalar/LoopUnrollPass.h +++ b/include/llvm/Transforms/Scalar/LoopUnrollPass.h @@ -62,6 +62,8 @@ struct LoopUnrollOptions { Optional AllowPeeling; Optional AllowRuntime; Optional AllowUpperBound; + Optional AllowProfileBasedPeeling; + Optional FullUnrollMaxCount; int OptLevel; /// If false, use a cost model to determine whether unrolling of a loop is @@ -110,6 +112,18 @@ struct LoopUnrollOptions { OptLevel = O; return *this; } + + // Enables or disables loop peeling basing on profile. + LoopUnrollOptions &setProfileBasedPeeling(int O) { + AllowProfileBasedPeeling = O; + return *this; + } + + // Sets the max full unroll count. + LoopUnrollOptions &setFullUnrollMaxCount(unsigned O) { + FullUnrollMaxCount = O; + return *this; + } }; /// Loop unroll pass that will support both full and partial unrolling. diff --git a/include/llvm/Transforms/Scalar/LowerConstantIntrinsics.h b/include/llvm/Transforms/Scalar/LowerConstantIntrinsics.h new file mode 100644 index 000000000000..a5ad4a2192a0 --- /dev/null +++ b/include/llvm/Transforms/Scalar/LowerConstantIntrinsics.h @@ -0,0 +1,41 @@ +//===- LowerConstantIntrinsics.h - Lower constant int. pass -*- C++ -*-========// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// \file +/// +/// The header file for the LowerConstantIntrinsics pass as used by the new pass +/// manager. +/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TRANSFORMS_SCALAR_LOWERCONSTANTINTRINSICS_H +#define LLVM_TRANSFORMS_SCALAR_LOWERCONSTANTINTRINSICS_H + +#include "llvm/IR/Function.h" +#include "llvm/IR/PassManager.h" + +namespace llvm { + +struct LowerConstantIntrinsicsPass : + PassInfoMixin { +public: + explicit LowerConstantIntrinsicsPass() {} + + /// Run the pass over the function. + /// + /// This will lower all remaining 'objectsize' and 'is.constant'` + /// intrinsic calls in this function, even when the argument has no known + /// size or is not a constant respectively. The resulting constant is + /// propagated and conditional branches are resolved where possible. + /// This complements the Instruction Simplification and + /// Instruction Combination passes of the optimized pass chain. 
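The two new LoopUnrollOptions setters compose with the existing builder-style interface. A configuration sketch follows; constructing LoopUnrollPass from the options object is assumed to work as before and is not part of this hunk.

// Hypothetical configuration, not part of this patch.
#include "llvm/IR/PassManager.h"
#include "llvm/Transforms/Scalar/LoopUnrollPass.h"
using namespace llvm;

static void addUnrollSketch(FunctionPassManager &FPM) {
  LoopUnrollOptions Opts = LoopUnrollOptions()
                               .setOptLevel(2)
                               .setProfileBasedPeeling(true) // new in this patch
                               .setFullUnrollMaxCount(64);   // illustrative limit
  FPM.addPass(LoopUnrollPass(Opts));
}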
+ PreservedAnalyses run(Function &F, FunctionAnalysisManager &); +}; + +} + +#endif diff --git a/include/llvm/Transforms/Scalar/MergedLoadStoreMotion.h b/include/llvm/Transforms/Scalar/MergedLoadStoreMotion.h index 9071a56532f8..c5f6d6e0e8bd 100644 --- a/include/llvm/Transforms/Scalar/MergedLoadStoreMotion.h +++ b/include/llvm/Transforms/Scalar/MergedLoadStoreMotion.h @@ -27,12 +27,28 @@ #include "llvm/IR/PassManager.h" namespace llvm { +struct MergedLoadStoreMotionOptions { + bool SplitFooterBB; + MergedLoadStoreMotionOptions(bool SplitFooterBB = false) + : SplitFooterBB(SplitFooterBB) {} + + MergedLoadStoreMotionOptions &splitFooterBB(bool SFBB) { + SplitFooterBB = SFBB; + return *this; + } +}; + class MergedLoadStoreMotionPass : public PassInfoMixin { + MergedLoadStoreMotionOptions Options; + public: + MergedLoadStoreMotionPass() + : MergedLoadStoreMotionPass(MergedLoadStoreMotionOptions()) {} + MergedLoadStoreMotionPass(const MergedLoadStoreMotionOptions &PassOptions) + : Options(PassOptions) {} PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); }; - } #endif // LLVM_TRANSFORMS_SCALAR_MERGEDLOADSTOREMOTION_H diff --git a/include/llvm/Transforms/Scalar/Reassociate.h b/include/llvm/Transforms/Scalar/Reassociate.h index 2db8d8ce309c..d5b175eff0e6 100644 --- a/include/llvm/Transforms/Scalar/Reassociate.h +++ b/include/llvm/Transforms/Scalar/Reassociate.h @@ -122,7 +122,9 @@ private: void EraseInst(Instruction *I); void RecursivelyEraseDeadInsts(Instruction *I, OrderedSet &Insts); void OptimizeInst(Instruction *I); - Instruction *canonicalizeNegConstExpr(Instruction *I); + Instruction *canonicalizeNegFPConstantsForOp(Instruction *I, Instruction *Op, + Value *OtherOp); + Instruction *canonicalizeNegFPConstants(Instruction *I); void BuildPairMap(ReversePostOrderTraversal &RPOT); }; diff --git a/include/llvm/Transforms/Scalar/SCCP.h b/include/llvm/Transforms/Scalar/SCCP.h index 0ffd983eb3e0..45e674a20a16 100644 --- a/include/llvm/Transforms/Scalar/SCCP.h +++ b/include/llvm/Transforms/Scalar/SCCP.h @@ -45,7 +45,8 @@ struct AnalysisResultsForFn { PostDominatorTree *PDT; }; -bool runIPSCCP(Module &M, const DataLayout &DL, const TargetLibraryInfo *TLI, +bool runIPSCCP(Module &M, const DataLayout &DL, + std::function GetTLI, function_ref getAnalysis); } // end namespace llvm diff --git a/include/llvm/Transforms/Utils/BasicBlockUtils.h b/include/llvm/Transforms/Utils/BasicBlockUtils.h index 4d861ffe9a31..698e57fd0394 100644 --- a/include/llvm/Transforms/Utils/BasicBlockUtils.h +++ b/include/llvm/Transforms/Utils/BasicBlockUtils.h @@ -83,10 +83,16 @@ bool DeleteDeadPHIs(BasicBlock *BB, const TargetLibraryInfo *TLI = nullptr); /// Attempts to merge a block into its predecessor, if possible. The return /// value indicates success or failure. +/// By default do not merge blocks if BB's predecessor has multiple successors. +/// If PredecessorWithTwoSuccessors = true, the blocks can only be merged +/// if BB's Pred has a branch to BB and to AnotherBB, and BB has a single +/// successor Sing. In this case the branch will be updated with Sing instead of +/// BB, and BB will still be merged into its predecessor and removed. 
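MergedLoadStoreMotion now takes an options struct that mirrors the SplitFooterBB flag added to the legacy factory earlier in this patch. A minimal usage sketch:

// Hypothetical usage, not part of this patch.
#include "llvm/IR/PassManager.h"
#include "llvm/Transforms/Scalar/MergedLoadStoreMotion.h"
using namespace llvm;

static void addMLSMSketch(FunctionPassManager &FPM) {
  // Allow the pass to split the footer block when sinking stores requires it.
  MergedLoadStoreMotionOptions Opts;
  FPM.addPass(MergedLoadStoreMotionPass(Opts.splitFooterBB(true)));
}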
bool MergeBlockIntoPredecessor(BasicBlock *BB, DomTreeUpdater *DTU = nullptr, LoopInfo *LI = nullptr, MemorySSAUpdater *MSSAU = nullptr, - MemoryDependenceResults *MemDep = nullptr); + MemoryDependenceResults *MemDep = nullptr, + bool PredecessorWithTwoSuccessors = false); /// Replace all uses of an instruction (specified by BI) with a value, then /// remove and delete the original instruction. @@ -222,7 +228,8 @@ BasicBlock *SplitEdge(BasicBlock *From, BasicBlock *To, /// info is updated. BasicBlock *SplitBlock(BasicBlock *Old, Instruction *SplitPt, DominatorTree *DT = nullptr, LoopInfo *LI = nullptr, - MemorySSAUpdater *MSSAU = nullptr); + MemorySSAUpdater *MSSAU = nullptr, + const Twine &BBName = ""); /// This method introduces at least one new basic block into the function and /// moves some of the predecessors of BB to be predecessors of the new block. diff --git a/include/llvm/Transforms/Utils/BuildLibCalls.h b/include/llvm/Transforms/Utils/BuildLibCalls.h index 8421c31a36da..3d15b2a7bf2a 100644 --- a/include/llvm/Transforms/Utils/BuildLibCalls.h +++ b/include/llvm/Transforms/Utils/BuildLibCalls.h @@ -30,17 +30,16 @@ namespace llvm { bool inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI); bool inferLibFuncAttributes(Module *M, StringRef Name, const TargetLibraryInfo &TLI); - /// Check whether the overloaded unary floating point function + /// Check whether the overloaded floating point function /// corresponding to \a Ty is available. - bool hasUnaryFloatFn(const TargetLibraryInfo *TLI, Type *Ty, - LibFunc DoubleFn, LibFunc FloatFn, - LibFunc LongDoubleFn); + bool hasFloatFn(const TargetLibraryInfo *TLI, Type *Ty, + LibFunc DoubleFn, LibFunc FloatFn, LibFunc LongDoubleFn); - /// Get the name of the overloaded unary floating point function + /// Get the name of the overloaded floating point function /// corresponding to \a Ty. - StringRef getUnaryFloatFn(const TargetLibraryInfo *TLI, Type *Ty, - LibFunc DoubleFn, LibFunc FloatFn, - LibFunc LongDoubleFn); + StringRef getFloatFnName(const TargetLibraryInfo *TLI, Type *Ty, + LibFunc DoubleFn, LibFunc FloatFn, + LibFunc LongDoubleFn); /// Return V if it is an i8*, otherwise cast it to i8*. Value *castToCStr(Value *V, IRBuilder<> &B); @@ -51,6 +50,11 @@ namespace llvm { Value *emitStrLen(Value *Ptr, IRBuilder<> &B, const DataLayout &DL, const TargetLibraryInfo *TLI); + /// Emit a call to the strdup function to the builder, for the specified + /// pointer. Ptr is required to be some pointer type, and the return value has + /// 'i8*' type. + Value *emitStrDup(Value *Ptr, IRBuilder<> &B, const TargetLibraryInfo *TLI); + /// Emit a call to the strnlen function to the builder, for the specified /// pointer. Ptr is required to be some pointer type, MaxLen must be of size_t /// type, and the return value has 'intptr_t' type. @@ -164,6 +168,13 @@ namespace llvm { Value *emitBinaryFloatFnCall(Value *Op1, Value *Op2, StringRef Name, IRBuilder<> &B, const AttributeList &Attrs); + /// Emit a call to the binary function DoubleFn, FloatFn or LongDoubleFn, + /// depending of the type of Op1. + Value *emitBinaryFloatFnCall(Value *Op1, Value *Op2, + const TargetLibraryInfo *TLI, LibFunc DoubleFn, + LibFunc FloatFn, LibFunc LongDoubleFn, + IRBuilder<> &B, const AttributeList &Attrs); + /// Emit a call to the putchar function. This assumes that Char is an integer. 
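The renamed helpers drop the "unary" restriction, so binary folders can use the same availability check together with the TLI-aware emitter added above. Below is a sketch of a pow-style fold; the LibFunc_pow/powf/powl enumerators are assumed from the usual TargetLibraryInfo naming and are not spelled out in this patch.

// Hypothetical folding helper, not part of this patch.
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/Transforms/Utils/BuildLibCalls.h"
using namespace llvm;

static Value *emitPowSketch(Value *Base, Value *Exp, const TargetLibraryInfo *TLI,
                            IRBuilder<> &B, const AttributeList &Attrs) {
  Type *Ty = Base->getType();
  // Bail out unless pow, powf or powl is actually available for this type.
  if (!hasFloatFn(TLI, Ty, LibFunc_pow, LibFunc_powf, LibFunc_powl))
    return nullptr;
  // Pick the correctly-typed variant and emit the call.
  return emitBinaryFloatFnCall(Base, Exp, TLI, LibFunc_pow, LibFunc_powf,
                               LibFunc_powl, B, Attrs);
}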
Value *emitPutChar(Value *Char, IRBuilder<> &B, const TargetLibraryInfo *TLI); diff --git a/include/llvm/Transforms/Utils/BypassSlowDivision.h b/include/llvm/Transforms/Utils/BypassSlowDivision.h index 471055921fa8..bd98c902d1ab 100644 --- a/include/llvm/Transforms/Utils/BypassSlowDivision.h +++ b/include/llvm/Transforms/Utils/BypassSlowDivision.h @@ -19,6 +19,7 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DenseMapInfo.h" +#include "llvm/IR/ValueHandle.h" #include namespace llvm { @@ -28,8 +29,10 @@ class Value; struct DivRemMapKey { bool SignedOp; - Value *Dividend; - Value *Divisor; + AssertingVH Dividend; + AssertingVH Divisor; + + DivRemMapKey() = default; DivRemMapKey(bool InSignedOp, Value *InDividend, Value *InDivisor) : SignedOp(InSignedOp), Dividend(InDividend), Divisor(InDivisor) {} @@ -50,8 +53,10 @@ template <> struct DenseMapInfo { } static unsigned getHashValue(const DivRemMapKey &Val) { - return (unsigned)(reinterpret_cast(Val.Dividend) ^ - reinterpret_cast(Val.Divisor)) ^ + return (unsigned)(reinterpret_cast( + static_cast(Val.Dividend)) ^ + reinterpret_cast( + static_cast(Val.Divisor))) ^ (unsigned)Val.SignedOp; } }; diff --git a/include/llvm/Transforms/Utils/CodeExtractor.h b/include/llvm/Transforms/Utils/CodeExtractor.h index 9d79ee1633f6..8a1ab796734e 100644 --- a/include/llvm/Transforms/Utils/CodeExtractor.h +++ b/include/llvm/Transforms/Utils/CodeExtractor.h @@ -22,6 +22,7 @@ namespace llvm { +class AllocaInst; class BasicBlock; class BlockFrequency; class BlockFrequencyInfo; @@ -36,6 +37,38 @@ class Module; class Type; class Value; +/// A cache for the CodeExtractor analysis. The operation \ref +/// CodeExtractor::extractCodeRegion is guaranteed not to invalidate this +/// object. This object should conservatively be considered invalid if any +/// other mutating operations on the IR occur. +/// +/// Constructing this object is O(n) in the size of the function. +class CodeExtractorAnalysisCache { + /// The allocas in the function. + SmallVector Allocas; + + /// Base memory addresses of load/store instructions, grouped by block. + DenseMap> BaseMemAddrs; + + /// Blocks which contain instructions which may have unknown side-effects + /// on memory. + DenseSet SideEffectingBlocks; + + void findSideEffectInfoForBlock(BasicBlock &BB); + +public: + CodeExtractorAnalysisCache(Function &F); + + /// Get the allocas in the function at the time the analysis was created. + /// Note that some of these allocas may no longer be present in the function, + /// due to \ref CodeExtractor::extractCodeRegion. + ArrayRef getAllocas() const { return Allocas; } + + /// Check whether \p BB contains an instruction thought to load from, store + /// to, or otherwise clobber the alloca \p Addr. + bool doesBlockContainClobberOfAddr(BasicBlock &BB, AllocaInst *Addr) const; +}; + /// Utility class for extracting code into a new function. /// /// This utility provides a simple interface for extracting some sequence of @@ -104,13 +137,21 @@ class Value; /// /// Returns zero when called on a CodeExtractor instance where isEligible /// returns false. - Function *extractCodeRegion(); + Function *extractCodeRegion(const CodeExtractorAnalysisCache &CEAC); + + /// Verify that assumption cache isn't stale after a region is extracted. + /// Returns false when verifier finds errors. AssumptionCache is passed as + /// parameter to make this function stateless. + static bool verifyAssumptionCache(const Function& F, AssumptionCache *AC); /// Test whether this code extractor is eligible. 
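Because extractCodeRegion is documented not to invalidate the new CodeExtractorAnalysisCache, a caller can build the cache once per function and reuse it across extractions. A minimal sketch; the CodeExtractor constructor arguments are the pre-existing ones and are assumed unchanged by this patch.

// Hypothetical outlining snippet, not part of this patch.
#include "llvm/ADT/ArrayRef.h"
#include "llvm/Transforms/Utils/CodeExtractor.h"
using namespace llvm;

static Function *outlineRegionSketch(Function &F, ArrayRef<BasicBlock *> Blocks,
                                     DominatorTree *DT) {
  CodeExtractorAnalysisCache CEAC(F); // O(n) in the size of F, built once
  CodeExtractor CE(Blocks, DT);
  if (!CE.isEligible())               // now also rejects stray va_start/va_end
    return nullptr;
  return CE.extractCodeRegion(CEAC);  // guaranteed not to invalidate CEAC
}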
/// /// Based on the blocks used when constructing the code extractor, /// determine whether it is eligible for extraction. - bool isEligible() const { return !Blocks.empty(); } + /// + /// Checks that varargs handling (with vastart and vaend) is only done in + /// the outlined blocks. + bool isEligible() const; /// Compute the set of input values and output values for the code. /// @@ -127,7 +168,9 @@ class Value; /// region. /// /// Returns true if it is safe to do the code motion. - bool isLegalToShrinkwrapLifetimeMarkers(Instruction *AllocaAddr) const; + bool + isLegalToShrinkwrapLifetimeMarkers(const CodeExtractorAnalysisCache &CEAC, + Instruction *AllocaAddr) const; /// Find the set of allocas whose life ranges are contained within the /// outlined region. @@ -137,7 +180,8 @@ class Value; /// are used by the lifetime markers are also candidates for shrink- /// wrapping. The instructions that need to be sunk are collected in /// 'Allocas'. - void findAllocas(ValueSet &SinkCands, ValueSet &HoistCands, + void findAllocas(const CodeExtractorAnalysisCache &CEAC, + ValueSet &SinkCands, ValueSet &HoistCands, BasicBlock *&ExitBlock) const; /// Find or create a block within the outline region for placing hoisted @@ -158,8 +202,9 @@ class Value; Instruction *LifeEnd = nullptr; }; - LifetimeMarkerInfo getLifetimeMarkers(Instruction *Addr, - BasicBlock *ExitBlock) const; + LifetimeMarkerInfo + getLifetimeMarkers(const CodeExtractorAnalysisCache &CEAC, + Instruction *Addr, BasicBlock *ExitBlock) const; void severSplitPHINodesOfEntry(BasicBlock *&Header); void severSplitPHINodesOfExits(const SmallPtrSetImpl &Exits); diff --git a/include/llvm/Transforms/Utils/Local.h b/include/llvm/Transforms/Utils/Local.h index ff516f230979..9fcb2f64d79b 100644 --- a/include/llvm/Transforms/Utils/Local.h +++ b/include/llvm/Transforms/Utils/Local.h @@ -271,6 +271,15 @@ inline unsigned getKnownAlignment(Value *V, const DataLayout &DL, return getOrEnforceKnownAlignment(V, 0, DL, CxtI, AC, DT); } +/// Create a call that matches the invoke \p II in terms of arguments, +/// attributes, debug information, etc. The call is not placed in a block and it +/// will not have a name. The invoke instruction is not removed, nor are the +/// uses replaced by the new call. +CallInst *createCallMatchingInvoke(InvokeInst *II); + +/// This function converts the specified invoek into a normall call. +void changeToCall(InvokeInst *II, DomTreeUpdater *DTU = nullptr); + ///===---------------------------------------------------------------------===// /// Dbg Intrinsic utilities /// @@ -403,8 +412,7 @@ void removeUnwindEdge(BasicBlock *BB, DomTreeUpdater *DTU = nullptr); /// Remove all blocks that can not be reached from the function's entry. /// /// Returns true if any basic block was removed. -bool removeUnreachableBlocks(Function &F, LazyValueInfo *LVI = nullptr, - DomTreeUpdater *DTU = nullptr, +bool removeUnreachableBlocks(Function &F, DomTreeUpdater *DTU = nullptr, MemorySSAUpdater *MSSAU = nullptr); /// Combine the metadata of two instructions so that K can replace J. Some @@ -424,6 +432,10 @@ void combineMetadata(Instruction *K, const Instruction *J, void combineMetadataForCSE(Instruction *K, const Instruction *J, bool DoesKMove); +/// Copy the metadata from the source instruction to the destination (the +/// replacement for the source instruction). +void copyMetadataForLoad(LoadInst &Dest, const LoadInst &Source); + /// Patch the replacement so that it is not more restrictive than the value /// being replaced. 
It assumes that the replacement does not get moved from /// its original position. diff --git a/include/llvm/Transforms/Utils/LoopUtils.h b/include/llvm/Transforms/Utils/LoopUtils.h index 68bdded5cf93..d32f08717e9b 100644 --- a/include/llvm/Transforms/Utils/LoopUtils.h +++ b/include/llvm/Transforms/Utils/LoopUtils.h @@ -215,6 +215,9 @@ makeFollowupLoopID(MDNode *OrigLoopID, ArrayRef FollowupAttrs, /// Look for the loop attribute that disables all transformation heuristic. bool hasDisableAllTransformsHint(const Loop *L); +/// Look for the loop attribute that disables the LICM transformation heuristics. +bool hasDisableLICMTransformsHint(const Loop *L); + /// The mode sets how eager a transformation should be applied. enum TransformationMode { /// The pass can use heuristics to determine whether a transformation should @@ -252,6 +255,8 @@ TransformationMode hasLICMVersioningTransformation(Loop *L); /// @} /// Set input string into loop metadata by keeping other values intact. +/// If the string is already in loop metadata update value if it is +/// different. void addStringMetadataToLoop(Loop *TheLoop, const char *MDString, unsigned V = 0); diff --git a/include/llvm/Transforms/Utils/MisExpect.h b/include/llvm/Transforms/Utils/MisExpect.h new file mode 100644 index 000000000000..1dbe8cb95936 --- /dev/null +++ b/include/llvm/Transforms/Utils/MisExpect.h @@ -0,0 +1,43 @@ +//===--- MisExpect.h - Check the use of llvm.expect with PGO data ---------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This contains code to emit warnings for potentially incorrect usage of the +// llvm.expect intrinsic. This utility extracts the threshold values from +// metadata associated with the instrumented Branch or Switch instruction. The +// threshold values are then used to determine if a warning should be emmited. +// +//===----------------------------------------------------------------------===// + +#include "llvm/ADT/SmallVector.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/LLVMContext.h" + +namespace llvm { +namespace misexpect { + +/// verifyMisExpect - compares PGO counters to the thresholds used for +/// llvm.expect and warns if the PGO counters are outside of the expected +/// range. +/// \param I The Instruction being checked +/// \param Weights A vector of profile weights for each target block +/// \param Ctx The current LLVM context +void verifyMisExpect(llvm::Instruction *I, + const llvm::SmallVector &Weights, + llvm::LLVMContext &Ctx); + +/// checkClangInstrumentation - verify if llvm.expect matches PGO profile +/// This function checks the frontend instrumentation in the backend when +/// lowering llvm.expect intrinsics. It checks for existing metadata, and +/// then validates the use of llvm.expect against the assigned branch weights. 
+// +/// \param I the Instruction being checked +void checkFrontendInstrumentation(Instruction &I); + +} // namespace misexpect +} // namespace llvm diff --git a/include/llvm/Transforms/Utils/PredicateInfo.h b/include/llvm/Transforms/Utils/PredicateInfo.h index da4a5dcc28c0..7c7a8eb04a2c 100644 --- a/include/llvm/Transforms/Utils/PredicateInfo.h +++ b/include/llvm/Transforms/Utils/PredicateInfo.h @@ -229,10 +229,10 @@ protected: private: void buildPredicateInfo(); - void processAssume(IntrinsicInst *, BasicBlock *, SmallPtrSetImpl &); - void processBranch(BranchInst *, BasicBlock *, SmallPtrSetImpl &); - void processSwitch(SwitchInst *, BasicBlock *, SmallPtrSetImpl &); - void renameUses(SmallPtrSetImpl &); + void processAssume(IntrinsicInst *, BasicBlock *, SmallVectorImpl &); + void processBranch(BranchInst *, BasicBlock *, SmallVectorImpl &); + void processSwitch(SwitchInst *, BasicBlock *, SmallVectorImpl &); + void renameUses(SmallVectorImpl &); using ValueDFS = PredicateInfoClasses::ValueDFS; typedef SmallVectorImpl ValueDFSStack; void convertUsesToDFSOrdered(Value *, SmallVectorImpl &); @@ -240,7 +240,7 @@ private: bool stackIsInScope(const ValueDFSStack &, const ValueDFS &) const; void popStackUntilDFSScope(ValueDFSStack &, const ValueDFS &); ValueInfo &getOrCreateValueInfo(Value *); - void addInfoFor(SmallPtrSetImpl &OpsToRename, Value *Op, + void addInfoFor(SmallVectorImpl &OpsToRename, Value *Op, PredicateBase *PB); const ValueInfo &getValueInfo(Value *) const; Function &F; diff --git a/include/llvm/Transforms/Utils/SimplifyLibCalls.h b/include/llvm/Transforms/Utils/SimplifyLibCalls.h index 2572094ddac8..88c2ef787ad8 100644 --- a/include/llvm/Transforms/Utils/SimplifyLibCalls.h +++ b/include/llvm/Transforms/Utils/SimplifyLibCalls.h @@ -126,6 +126,12 @@ private: /// Erase an instruction from its parent with our eraser. void eraseFromParent(Instruction *I); + /// Replace an instruction with a value and erase it from its parent. 
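Below is a hedged sketch of a MisExpect call site when lowering llvm.expect on a two-way branch. The element type and inline size of the weight vector are assumptions, since the template arguments of the declaration are not legible in this copy of the patch.

// Hypothetical call site, not part of this patch; the vector type is assumed.
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/Instructions.h"
#include "llvm/Transforms/Utils/MisExpect.h"
using namespace llvm;

static void checkExpectSketch(BranchInst &BI, uint32_t TrueWeight,
                              uint32_t FalseWeight) {
  SmallVector<uint32_t, 4> Weights{TrueWeight, FalseWeight};
  // Warn if the PGO counters contradict the llvm.expect annotation.
  misexpect::verifyMisExpect(&BI, Weights, BI.getContext());
}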
+ void substituteInParent(Instruction *I, Value *With) { + replaceAllUsesWith(I, With); + eraseFromParent(I); + } + Value *foldMallocMemset(CallInst *Memset, IRBuilder<> &B); public: @@ -154,6 +160,7 @@ private: Value *optimizeStrRChr(CallInst *CI, IRBuilder<> &B); Value *optimizeStrCmp(CallInst *CI, IRBuilder<> &B); Value *optimizeStrNCmp(CallInst *CI, IRBuilder<> &B); + Value *optimizeStrNDup(CallInst *CI, IRBuilder<> &B); Value *optimizeStrCpy(CallInst *CI, IRBuilder<> &B); Value *optimizeStpCpy(CallInst *CI, IRBuilder<> &B); Value *optimizeStrNCpy(CallInst *CI, IRBuilder<> &B); @@ -164,14 +171,17 @@ private: Value *optimizeStrCSpn(CallInst *CI, IRBuilder<> &B); Value *optimizeStrStr(CallInst *CI, IRBuilder<> &B); Value *optimizeMemChr(CallInst *CI, IRBuilder<> &B); + Value *optimizeMemRChr(CallInst *CI, IRBuilder<> &B); Value *optimizeMemCmp(CallInst *CI, IRBuilder<> &B); Value *optimizeBCmp(CallInst *CI, IRBuilder<> &B); Value *optimizeMemCmpBCmpCommon(CallInst *CI, IRBuilder<> &B); + Value *optimizeMemPCpy(CallInst *CI, IRBuilder<> &B); Value *optimizeMemCpy(CallInst *CI, IRBuilder<> &B); Value *optimizeMemMove(CallInst *CI, IRBuilder<> &B); Value *optimizeMemSet(CallInst *CI, IRBuilder<> &B); Value *optimizeRealloc(CallInst *CI, IRBuilder<> &B); Value *optimizeWcslen(CallInst *CI, IRBuilder<> &B); + Value *optimizeBCopy(CallInst *CI, IRBuilder<> &B); // Wrapper for all String/Memory Library Call Optimizations Value *optimizeStringMemoryLibCall(CallInst *CI, IRBuilder<> &B); diff --git a/include/llvm/Transforms/Utils/UnrollLoop.h b/include/llvm/Transforms/Utils/UnrollLoop.h index 593ca26feb98..02b81b4b7ee2 100644 --- a/include/llvm/Transforms/Utils/UnrollLoop.h +++ b/include/llvm/Transforms/Utils/UnrollLoop.h @@ -114,8 +114,8 @@ bool computeUnrollCount(Loop *L, const TargetTransformInfo &TTI, DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE, const SmallPtrSetImpl &EphValues, OptimizationRemarkEmitter *ORE, unsigned &TripCount, - unsigned MaxTripCount, unsigned &TripMultiple, - unsigned LoopSize, + unsigned MaxTripCount, bool MaxOrZero, + unsigned &TripMultiple, unsigned LoopSize, TargetTransformInfo::UnrollingPreferences &UP, bool &UseUpperBound); @@ -132,7 +132,9 @@ TargetTransformInfo::UnrollingPreferences gatherUnrollingPreferences( BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, int OptLevel, Optional UserThreshold, Optional UserCount, Optional UserAllowPartial, Optional UserRuntime, - Optional UserUpperBound, Optional UserAllowPeeling); + Optional UserUpperBound, Optional UserAllowPeeling, + Optional UserAllowProfileBasedPeeling, + Optional UserFullUnrollMaxCount); unsigned ApproximateLoopSize(const Loop *L, unsigned &NumCalls, bool &NotDuplicatable, bool &Convergent, diff --git a/include/llvm/Transforms/Utils/ValueMapper.h b/include/llvm/Transforms/Utils/ValueMapper.h index 1952a210291e..ff5bfc609586 100644 --- a/include/llvm/Transforms/Utils/ValueMapper.h +++ b/include/llvm/Transforms/Utils/ValueMapper.h @@ -22,7 +22,7 @@ namespace llvm { class Constant; class Function; -class GlobalAlias; +class GlobalIndirectSymbol; class GlobalVariable; class Instruction; class MDNode; @@ -120,7 +120,7 @@ inline RemapFlags operator|(RemapFlags LHS, RemapFlags RHS) { /// instance: /// - \a scheduleMapGlobalInitializer() /// - \a scheduleMapAppendingVariable() -/// - \a scheduleMapGlobalAliasee() +/// - \a scheduleMapGlobalIndirectSymbol() /// - \a scheduleRemapFunction() /// /// Sometimes a callback needs a different mapping context. 
Such a context can @@ -180,8 +180,9 @@ public: bool IsOldCtorDtor, ArrayRef NewMembers, unsigned MappingContextID = 0); - void scheduleMapGlobalAliasee(GlobalAlias &GA, Constant &Aliasee, - unsigned MappingContextID = 0); + void scheduleMapGlobalIndirectSymbol(GlobalIndirectSymbol &GIS, + Constant &Target, + unsigned MappingContextID = 0); void scheduleRemapFunction(Function &F, unsigned MappingContextID = 0); }; diff --git a/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h b/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h index b144006e2628..d1e7acc877bf 100644 --- a/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h +++ b/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h @@ -33,18 +33,6 @@ namespace llvm { -/// Create an analysis remark that explains why vectorization failed -/// -/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p -/// RemarkName is the identifier for the remark. If \p I is passed it is an -/// instruction that prevents vectorization. Otherwise \p TheLoop is used for -/// the location of the remark. \return the remark object that can be -/// streamed to. -OptimizationRemarkAnalysis createLVMissedAnalysis(const char *PassName, - StringRef RemarkName, - Loop *TheLoop, - Instruction *I = nullptr); - /// Utility class for getting and setting loop vectorizer hints in the form /// of loop metadata. /// This class keeps a number of loop annotations locally (as member variables) @@ -55,7 +43,8 @@ OptimizationRemarkAnalysis createLVMissedAnalysis(const char *PassName, /// for example 'force', means a decision has been made. So, we need to be /// careful NOT to add them if the user hasn't specifically asked so. class LoopVectorizeHints { - enum HintKind { HK_WIDTH, HK_UNROLL, HK_FORCE, HK_ISVECTORIZED }; + enum HintKind { HK_WIDTH, HK_UNROLL, HK_FORCE, HK_ISVECTORIZED, + HK_PREDICATE }; /// Hint - associates name and validation with the hint value. struct Hint { @@ -81,6 +70,9 @@ class LoopVectorizeHints { /// Already Vectorized Hint IsVectorized; + /// Vector Predicate + Hint Predicate; + /// Return the loop metadata prefix. static StringRef Prefix() { return "llvm.loop."; } @@ -109,6 +101,7 @@ public: unsigned getWidth() const { return Width.Value; } unsigned getInterleave() const { return Interleave.Value; } unsigned getIsVectorized() const { return IsVectorized.Value; } + unsigned getPredicate() const { return Predicate.Value; } enum ForceKind getForce() const { if ((ForceKind)Force.Value == FK_Undefined && hasDisableAllTransformsHint(TheLoop)) @@ -235,8 +228,8 @@ public: bool canVectorize(bool UseVPlanNativePath); /// Return true if we can vectorize this loop while folding its tail by - /// masking. - bool canFoldTailByMasking(); + /// masking, and mark all respective loads/stores for masking. + bool prepareToFoldTailByMasking(); /// Returns the primary induction variable. PHINode *getPrimaryInduction() { return PrimaryInduction; } @@ -362,9 +355,16 @@ private: bool canVectorizeOuterLoop(); /// Return true if all of the instructions in the block can be speculatively - /// executed. \p SafePtrs is a list of addresses that are known to be legal - /// and we know that we can read from them without segfault. - bool blockCanBePredicated(BasicBlock *BB, SmallPtrSetImpl &SafePtrs); + /// executed, and record the loads/stores that require masking. If's that + /// guard loads can be ignored under "assume safety" unless \p PreserveGuards + /// is true. 
This can happen when we introduces guards for which the original + /// "unguarded-loads are safe" assumption does not hold. For example, the + /// vectorizer's fold-tail transformation changes the loop to execute beyond + /// its original trip-count, under a proper guard, which should be preserved. + /// \p SafePtrs is a list of addresses that are known to be legal and we know + /// that we can read from them without segfault. + bool blockCanBePredicated(BasicBlock *BB, SmallPtrSetImpl &SafePtrs, + bool PreserveGuards = false); /// Updates the vectorization state by adding \p Phi to the inductions list. /// This can set \p Phi as the main induction of the loop if \p Phi is a @@ -382,14 +382,6 @@ private: return LAI ? &LAI->getSymbolicStrides() : nullptr; } - /// Reports a vectorization illegality: print \p DebugMsg for debugging - /// purposes along with the corresponding optimization remark \p RemarkName. - /// If \p I is passed it is an instruction that prevents vectorization. - /// Otherwise the loop is used for the location of the remark. - void reportVectorizationFailure(const StringRef DebugMsg, - const StringRef OREMsg, const StringRef ORETag, - Instruction *I = nullptr) const; - /// The loop that we evaluate. Loop *TheLoop; @@ -452,8 +444,8 @@ private: /// Holds the widest induction type encountered. Type *WidestIndTy = nullptr; - /// Allowed outside users. This holds the induction and reduction - /// vars which can be accessed from outside the loop. + /// Allowed outside users. This holds the variables that can be accessed from + /// outside the loop. SmallPtrSet AllowedExit; /// Can we assume the absence of NaNs. diff --git a/include/llvm/Transforms/Vectorize/LoopVectorize.h b/include/llvm/Transforms/Vectorize/LoopVectorize.h index d1ec06afb02a..d824e2903ef3 100644 --- a/include/llvm/Transforms/Vectorize/LoopVectorize.h +++ b/include/llvm/Transforms/Vectorize/LoopVectorize.h @@ -155,6 +155,14 @@ struct LoopVectorizePass : public PassInfoMixin { bool processLoop(Loop *L); }; +/// Reports a vectorization failure: print \p DebugMsg for debugging +/// purposes along with the corresponding optimization remark \p RemarkName. +/// If \p I is passed, it is an instruction that prevents vectorization. +/// Otherwise, the loop \p TheLoop is used for the location of the remark. +void reportVectorizationFailure(const StringRef DebugMsg, + const StringRef OREMsg, const StringRef ORETag, + OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I = nullptr); + } // end namespace llvm #endif // LLVM_TRANSFORMS_VECTORIZE_LOOPVECTORIZE_H diff --git a/include/llvm/Transforms/Vectorize/SLPVectorizer.h b/include/llvm/Transforms/Vectorize/SLPVectorizer.h index ac6afb761d4d..32ccc8a46380 100644 --- a/include/llvm/Transforms/Vectorize/SLPVectorizer.h +++ b/include/llvm/Transforms/Vectorize/SLPVectorizer.h @@ -24,7 +24,6 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/IR/PassManager.h" -#include "llvm/IR/ValueHandle.h" namespace llvm { @@ -60,8 +59,8 @@ extern cl::opt RunSLPVectorization; struct SLPVectorizerPass : public PassInfoMixin { using StoreList = SmallVector; using StoreListMap = MapVector; - using WeakTrackingVHList = SmallVector; - using WeakTrackingVHListMap = MapVector; + using GEPList = SmallVector; + using GEPListMap = MapVector; ScalarEvolution *SE = nullptr; TargetTransformInfo *TTI = nullptr; @@ -131,7 +130,7 @@ private: /// Tries to vectorize constructs started from CmpInst, InsertValueInst or /// InsertElementInst instructions. 
- bool vectorizeSimpleInstructions(SmallVectorImpl &Instructions, + bool vectorizeSimpleInstructions(SmallVectorImpl &Instructions, BasicBlock *BB, slpvectorizer::BoUpSLP &R); /// Scan the basic block and look for patterns that are likely to start @@ -147,7 +146,7 @@ private: StoreListMap Stores; /// The getelementptr instructions in a basic block organized by base pointer. - WeakTrackingVHListMap GEPs; + GEPListMap GEPs; }; } // end namespace llvm diff --git a/include/llvm/XRay/FDRRecordProducer.h b/include/llvm/XRay/FDRRecordProducer.h index b530a85bc7e1..043d91568f4e 100644 --- a/include/llvm/XRay/FDRRecordProducer.h +++ b/include/llvm/XRay/FDRRecordProducer.h @@ -27,7 +27,7 @@ public: class FileBasedRecordProducer : public RecordProducer { const XRayFileHeader &Header; DataExtractor &E; - uint32_t &OffsetPtr; + uint64_t &OffsetPtr; uint32_t CurrentBufferBytes = 0; // Helper function which gets the next record by speculatively reading through @@ -36,7 +36,7 @@ class FileBasedRecordProducer : public RecordProducer { public: FileBasedRecordProducer(const XRayFileHeader &FH, DataExtractor &DE, - uint32_t &OP) + uint64_t &OP) : Header(FH), E(DE), OffsetPtr(OP) {} /// This producer encapsulates the logic for loading a File-backed diff --git a/include/llvm/XRay/FDRRecords.h b/include/llvm/XRay/FDRRecords.h index a8ce74bd88fb..e3e16f71e2fe 100644 --- a/include/llvm/XRay/FDRRecords.h +++ b/include/llvm/XRay/FDRRecords.h @@ -417,16 +417,16 @@ public: class RecordInitializer : public RecordVisitor { DataExtractor &E; - uint32_t &OffsetPtr; + uint64_t &OffsetPtr; uint16_t Version; public: static constexpr uint16_t DefaultVersion = 5u; - explicit RecordInitializer(DataExtractor &DE, uint32_t &OP, uint16_t V) + explicit RecordInitializer(DataExtractor &DE, uint64_t &OP, uint16_t V) : RecordVisitor(), E(DE), OffsetPtr(OP), Version(V) {} - explicit RecordInitializer(DataExtractor &DE, uint32_t &OP) + explicit RecordInitializer(DataExtractor &DE, uint64_t &OP) : RecordInitializer(DE, OP, DefaultVersion) {} Error visit(BufferExtents &) override; diff --git a/include/llvm/XRay/FileHeaderReader.h b/include/llvm/XRay/FileHeaderReader.h index 1c9681cfd9af..30878f3e99e8 100644 --- a/include/llvm/XRay/FileHeaderReader.h +++ b/include/llvm/XRay/FileHeaderReader.h @@ -24,7 +24,7 @@ namespace xray { /// Convenience function for loading the file header given a data extractor at a /// specified offset. Expected readBinaryFormatHeader(DataExtractor &HeaderExtractor, - uint32_t &OffsetPtr); + uint64_t &OffsetPtr); } // namespace xray } // namespace llvm diff --git a/include/llvm/module.modulemap b/include/llvm/module.modulemap index 9c4668e1473c..ecb3b37004fd 100644 --- a/include/llvm/module.modulemap +++ b/include/llvm/module.modulemap @@ -253,6 +253,7 @@ module LLVM_IR { textual header "IR/DebugInfoFlags.def" textual header "IR/Instruction.def" textual header "IR/Metadata.def" + textual header "IR/FixedMetadataKinds.def" textual header "IR/Value.def" textual header "IR/RuntimeLibcalls.def" } @@ -331,6 +332,7 @@ module LLVM_TableGen { module LLVM_Transforms { requires cplusplus umbrella "Transforms" + module * { export * } } diff --git a/lib/Analysis/AliasAnalysis.cpp b/lib/Analysis/AliasAnalysis.cpp index 32241e355eb8..55dd9a4cda08 100644 --- a/lib/Analysis/AliasAnalysis.cpp +++ b/lib/Analysis/AliasAnalysis.cpp @@ -784,7 +784,7 @@ bool AAResultsWrapperPass::runOnFunction(Function &F) { // previous object first, in this case replacing it with an empty one, before // registering new results. 
AAR.reset( - new AAResults(getAnalysis().getTLI())); + new AAResults(getAnalysis().getTLI(F))); // BasicAA is always available for function analyses. Also, we add it first // so that it can trump TBAA results when it proves MustAlias. @@ -840,7 +840,7 @@ void AAResultsWrapperPass::getAnalysisUsage(AnalysisUsage &AU) const { AAResults llvm::createLegacyPMAAResults(Pass &P, Function &F, BasicAAResult &BAR) { - AAResults AAR(P.getAnalysis().getTLI()); + AAResults AAR(P.getAnalysis().getTLI(F)); // Add in our explicitly constructed BasicAA results. if (!DisableBasicAA) diff --git a/lib/Analysis/AliasSetTracker.cpp b/lib/Analysis/AliasSetTracker.cpp index a6e5b9fab558..79fbcd464c1b 100644 --- a/lib/Analysis/AliasSetTracker.cpp +++ b/lib/Analysis/AliasSetTracker.cpp @@ -119,6 +119,12 @@ void AliasSetTracker::removeAliasSet(AliasSet *AS) { TotalMayAliasSetSize -= AS->size(); AliasSets.erase(AS); + // If we've removed the saturated alias set, set saturated marker back to + // nullptr and ensure this tracker is empty. + if (AS == AliasAnyAS) { + AliasAnyAS = nullptr; + assert(AliasSets.empty() && "Tracker not empty"); + } } void AliasSet::removeFromTracker(AliasSetTracker &AST) { @@ -690,8 +696,10 @@ void AliasSet::print(raw_ostream &OS) const { } void AliasSetTracker::print(raw_ostream &OS) const { - OS << "Alias Set Tracker: " << AliasSets.size() << " alias sets for " - << PointerMap.size() << " pointer values.\n"; + OS << "Alias Set Tracker: " << AliasSets.size(); + if (AliasAnyAS) + OS << " (Saturated)"; + OS << " alias sets for " << PointerMap.size() << " pointer values.\n"; for (const AliasSet &AS : *this) AS.print(OS); OS << "\n"; diff --git a/lib/Analysis/Analysis.cpp b/lib/Analysis/Analysis.cpp index d46a8d8e306c..af718526684b 100644 --- a/lib/Analysis/Analysis.cpp +++ b/lib/Analysis/Analysis.cpp @@ -65,6 +65,7 @@ void llvm::initializeAnalysis(PassRegistry &Registry) { initializeModuleDebugInfoPrinterPass(Registry); initializeModuleSummaryIndexWrapperPassPass(Registry); initializeMustExecutePrinterPass(Registry); + initializeMustBeExecutedContextPrinterPass(Registry); initializeObjCARCAAWrapperPassPass(Registry); initializeOptimizationRemarkEmitterWrapperPassPass(Registry); initializePhiValuesWrapperPassPass(Registry); diff --git a/lib/Analysis/AssumptionCache.cpp b/lib/Analysis/AssumptionCache.cpp index cf2f845dee0a..129944743c5e 100644 --- a/lib/Analysis/AssumptionCache.cpp +++ b/lib/Analysis/AssumptionCache.cpp @@ -130,7 +130,10 @@ void AssumptionCache::unregisterAssumption(CallInst *CI) { if (AVI != AffectedValues.end()) AffectedValues.erase(AVI); } - remove_if(AssumeHandles, [CI](WeakTrackingVH &VH) { return CI == VH; }); + + AssumeHandles.erase( + remove_if(AssumeHandles, [CI](WeakTrackingVH &VH) { return CI == VH; }), + AssumeHandles.end()); } void AssumptionCache::AffectedValueCallbackVH::deleted() { @@ -140,7 +143,7 @@ void AssumptionCache::AffectedValueCallbackVH::deleted() { // 'this' now dangles! 
} -void AssumptionCache::copyAffectedValuesInCache(Value *OV, Value *NV) { +void AssumptionCache::transferAffectedValuesInCache(Value *OV, Value *NV) { auto &NAVV = getOrInsertAffectedValues(NV); auto AVI = AffectedValues.find(OV); if (AVI == AffectedValues.end()) @@ -149,6 +152,7 @@ void AssumptionCache::copyAffectedValuesInCache(Value *OV, Value *NV) { for (auto &A : AVI->second) if (std::find(NAVV.begin(), NAVV.end(), A) == NAVV.end()) NAVV.push_back(A); + AffectedValues.erase(OV); } void AssumptionCache::AffectedValueCallbackVH::allUsesReplacedWith(Value *NV) { @@ -157,7 +161,7 @@ void AssumptionCache::AffectedValueCallbackVH::allUsesReplacedWith(Value *NV) { // Any assumptions that affected this value now affect the new value. - AC->copyAffectedValuesInCache(getValPtr(), NV); + AC->transferAffectedValuesInCache(getValPtr(), NV); // 'this' now might dangle! If the AffectedValues map was resized to add an // entry for NV then this object might have been destroyed in favor of some // copy in the grown map. @@ -252,7 +256,7 @@ AssumptionCache &AssumptionCacheTracker::getAssumptionCache(Function &F) { // Ok, build a new cache by scanning the function, insert it and the value // handle into our map, and return the newly populated cache. auto IP = AssumptionCaches.insert(std::make_pair( - FunctionCallbackVH(&F, this), llvm::make_unique(F))); + FunctionCallbackVH(&F, this), std::make_unique(F))); assert(IP.second && "Scanning function already in the map?"); return *IP.first->second; } diff --git a/lib/Analysis/BasicAliasAnalysis.cpp b/lib/Analysis/BasicAliasAnalysis.cpp index 3721c99883b8..f3c30c258c19 100644 --- a/lib/Analysis/BasicAliasAnalysis.cpp +++ b/lib/Analysis/BasicAliasAnalysis.cpp @@ -233,6 +233,26 @@ static bool isObjectSmallerThan(const Value *V, uint64_t Size, return ObjectSize != MemoryLocation::UnknownSize && ObjectSize < Size; } +/// Return the minimal extent from \p V to the end of the underlying object, +/// assuming the result is used in an aliasing query. E.g., we do use the query +/// location size and the fact that null pointers cannot alias here. +static uint64_t getMinimalExtentFrom(const Value &V, + const LocationSize &LocSize, + const DataLayout &DL, + bool NullIsValidLoc) { + // If we have dereferenceability information we know a lower bound for the + // extent as accesses for a lower offset would be valid. We need to exclude + // the "or null" part if null is a valid pointer. + bool CanBeNull; + uint64_t DerefBytes = V.getPointerDereferenceableBytes(DL, CanBeNull); + DerefBytes = (CanBeNull && NullIsValidLoc) ? 0 : DerefBytes; + // If queried with a precise location size, we assume that location size to be + // accessed, thus valid. + if (LocSize.isPrecise()) + DerefBytes = std::max(DerefBytes, LocSize.getValue()); + return DerefBytes; +} + /// Returns true if we can prove that the object specified by V has size Size. static bool isObjectSize(const Value *V, uint64_t Size, const DataLayout &DL, const TargetLibraryInfo &TLI, bool NullIsValidLoc) { @@ -481,7 +501,7 @@ bool BasicAAResult::DecomposeGEPExpression(const Value *V, // because it should be in sync with CaptureTracking. Not using it may // cause weird miscompilations where 2 aliasing pointers are assumed to // noalias. 
- if (auto *RP = getArgumentAliasingToReturnedPointer(Call)) { + if (auto *RP = getArgumentAliasingToReturnedPointer(Call, false)) { V = RP; continue; } @@ -1792,10 +1812,12 @@ AliasResult BasicAAResult::aliasCheck(const Value *V1, LocationSize V1Size, // If the size of one access is larger than the entire object on the other // side, then we know such behavior is undefined and can assume no alias. bool NullIsValidLocation = NullPointerIsDefined(&F); - if ((V1Size.isPrecise() && isObjectSmallerThan(O2, V1Size.getValue(), DL, TLI, - NullIsValidLocation)) || - (V2Size.isPrecise() && isObjectSmallerThan(O1, V2Size.getValue(), DL, TLI, - NullIsValidLocation))) + if ((isObjectSmallerThan( + O2, getMinimalExtentFrom(*V1, V1Size, DL, NullIsValidLocation), DL, + TLI, NullIsValidLocation)) || + (isObjectSmallerThan( + O1, getMinimalExtentFrom(*V2, V2Size, DL, NullIsValidLocation), DL, + TLI, NullIsValidLocation))) return NoAlias; // Check the cache before climbing up use-def chains. This also terminates @@ -2053,8 +2075,9 @@ bool BasicAAWrapperPass::runOnFunction(Function &F) { auto *LIWP = getAnalysisIfAvailable(); auto *PVWP = getAnalysisIfAvailable(); - Result.reset(new BasicAAResult(F.getParent()->getDataLayout(), F, TLIWP.getTLI(), - ACT.getAssumptionCache(F), &DTWP.getDomTree(), + Result.reset(new BasicAAResult(F.getParent()->getDataLayout(), F, + TLIWP.getTLI(F), ACT.getAssumptionCache(F), + &DTWP.getDomTree(), LIWP ? &LIWP->getLoopInfo() : nullptr, PVWP ? &PVWP->getResult() : nullptr)); @@ -2071,8 +2094,7 @@ void BasicAAWrapperPass::getAnalysisUsage(AnalysisUsage &AU) const { BasicAAResult llvm::createLegacyPMBasicAAResult(Pass &P, Function &F) { return BasicAAResult( - F.getParent()->getDataLayout(), - F, - P.getAnalysis().getTLI(), + F.getParent()->getDataLayout(), F, + P.getAnalysis().getTLI(F), P.getAnalysis().getAssumptionCache(F)); } diff --git a/lib/Analysis/BranchProbabilityInfo.cpp b/lib/Analysis/BranchProbabilityInfo.cpp index 5eb95003f5d8..a06ee096d54c 100644 --- a/lib/Analysis/BranchProbabilityInfo.cpp +++ b/lib/Analysis/BranchProbabilityInfo.cpp @@ -118,6 +118,13 @@ static const uint32_t ZH_NONTAKEN_WEIGHT = 12; static const uint32_t FPH_TAKEN_WEIGHT = 20; static const uint32_t FPH_NONTAKEN_WEIGHT = 12; +/// This is the probability for an ordered floating point comparison. +static const uint32_t FPH_ORD_WEIGHT = 1024 * 1024 - 1; +/// This is the probability for an unordered floating point comparison, it means +/// one or two of the operands are NaN. Usually it is used to test for an +/// exceptional case, so the result is unlikely. 
+static const uint32_t FPH_UNO_WEIGHT = 1; + /// Invoke-terminating normal branch taken weight /// /// This is the weight for branching to the normal destination of an invoke @@ -778,6 +785,8 @@ bool BranchProbabilityInfo::calcFloatingPointHeuristics(const BasicBlock *BB) { if (!FCmp) return false; + uint32_t TakenWeight = FPH_TAKEN_WEIGHT; + uint32_t NontakenWeight = FPH_NONTAKEN_WEIGHT; bool isProb; if (FCmp->isEquality()) { // f1 == f2 -> Unlikely @@ -786,9 +795,13 @@ bool BranchProbabilityInfo::calcFloatingPointHeuristics(const BasicBlock *BB) { } else if (FCmp->getPredicate() == FCmpInst::FCMP_ORD) { // !isnan -> Likely isProb = true; + TakenWeight = FPH_ORD_WEIGHT; + NontakenWeight = FPH_UNO_WEIGHT; } else if (FCmp->getPredicate() == FCmpInst::FCMP_UNO) { // isnan -> Unlikely isProb = false; + TakenWeight = FPH_ORD_WEIGHT; + NontakenWeight = FPH_UNO_WEIGHT; } else { return false; } @@ -798,8 +811,7 @@ bool BranchProbabilityInfo::calcFloatingPointHeuristics(const BasicBlock *BB) { if (!isProb) std::swap(TakenIdx, NonTakenIdx); - BranchProbability TakenProb(FPH_TAKEN_WEIGHT, - FPH_TAKEN_WEIGHT + FPH_NONTAKEN_WEIGHT); + BranchProbability TakenProb(TakenWeight, TakenWeight + NontakenWeight); setEdgeProbability(BB, TakenIdx, TakenProb); setEdgeProbability(BB, NonTakenIdx, TakenProb.getCompl()); return true; @@ -1014,7 +1026,8 @@ void BranchProbabilityInfoWrapperPass::getAnalysisUsage( bool BranchProbabilityInfoWrapperPass::runOnFunction(Function &F) { const LoopInfo &LI = getAnalysis().getLoopInfo(); - const TargetLibraryInfo &TLI = getAnalysis().getTLI(); + const TargetLibraryInfo &TLI = + getAnalysis().getTLI(F); BPI.calculate(F, LI, &TLI); return false; } diff --git a/lib/Analysis/CFG.cpp b/lib/Analysis/CFG.cpp index 18b83d6838cc..8215b4ecbb03 100644 --- a/lib/Analysis/CFG.cpp +++ b/lib/Analysis/CFG.cpp @@ -87,11 +87,18 @@ unsigned llvm::GetSuccessorNumber(const BasicBlock *BB, /// with multiple predecessors. bool llvm::isCriticalEdge(const Instruction *TI, unsigned SuccNum, bool AllowIdenticalEdges) { - assert(TI->isTerminator() && "Must be a terminator to have successors!"); assert(SuccNum < TI->getNumSuccessors() && "Illegal edge specification!"); + return isCriticalEdge(TI, TI->getSuccessor(SuccNum), AllowIdenticalEdges); +} + +bool llvm::isCriticalEdge(const Instruction *TI, const BasicBlock *Dest, + bool AllowIdenticalEdges) { + assert(TI->isTerminator() && "Must be a terminator to have successors!"); if (TI->getNumSuccessors() == 1) return false; - const BasicBlock *Dest = TI->getSuccessor(SuccNum); + assert(find(predecessors(Dest), TI->getParent()) != pred_end(Dest) && + "No edge between TI's block and Dest."); + const_pred_iterator I = pred_begin(Dest), E = pred_end(Dest); // If there is more than one predecessor, this is a critical edge... 
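The lib/Analysis/CFG.cpp hunk above splits isCriticalEdge() so callers can name the destination block directly instead of carrying a successor index. Below is a minimal sketch (not part of the patch) of how a caller might drive the new overload; it assumes the usual LLVM headers are available, and the helper findCriticalEdges and its output vector are purely illustrative, not LLVM API.

// Sketch only: walk one block's terminator and record the critical edges
// leaving it. Both overloads are exercised; per the hunk above, the
// index-based form now simply looks up the successor and forwards to the
// destination-based form.
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Instruction.h"
#include <cassert>
#include <utility>

using namespace llvm;

static void findCriticalEdges(
    const BasicBlock &BB,
    SmallVectorImpl<std::pair<const Instruction *, const BasicBlock *>> &Out) {
  const Instruction *TI = BB.getTerminator();
  for (unsigned SuccNum = 0, E = TI->getNumSuccessors(); SuccNum != E;
       ++SuccNum) {
    const BasicBlock *Dest = TI->getSuccessor(SuccNum);
    // The two queries should agree, since Dest was taken from SuccNum.
    assert(isCriticalEdge(TI, SuccNum, /*AllowIdenticalEdges=*/false) ==
           isCriticalEdge(TI, Dest, /*AllowIdenticalEdges=*/false));
    if (isCriticalEdge(TI, Dest, /*AllowIdenticalEdges=*/false))
      Out.push_back({TI, Dest});
  }
}

The one new precondition is that Dest must actually be a successor of TI's parent block, which the hunk above enforces with an assertion over predecessors(Dest).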
diff --git a/lib/Analysis/CFGPrinter.cpp b/lib/Analysis/CFGPrinter.cpp index 619b675b58d8..4f4103fefa25 100644 --- a/lib/Analysis/CFGPrinter.cpp +++ b/lib/Analysis/CFGPrinter.cpp @@ -99,7 +99,7 @@ static void writeCFGToDotFile(Function &F, bool CFGOnly = false) { errs() << "Writing '" << Filename << "'..."; std::error_code EC; - raw_fd_ostream File(Filename, EC, sys::fs::F_Text); + raw_fd_ostream File(Filename, EC, sys::fs::OF_Text); if (!EC) WriteGraph(File, (const Function*)&F, CFGOnly); diff --git a/lib/Analysis/CFLAndersAliasAnalysis.cpp b/lib/Analysis/CFLAndersAliasAnalysis.cpp index 690e514d4f5c..fd90bd1521d6 100644 --- a/lib/Analysis/CFLAndersAliasAnalysis.cpp +++ b/lib/Analysis/CFLAndersAliasAnalysis.cpp @@ -88,9 +88,11 @@ using namespace llvm::cflaa; #define DEBUG_TYPE "cfl-anders-aa" -CFLAndersAAResult::CFLAndersAAResult(const TargetLibraryInfo &TLI) : TLI(TLI) {} +CFLAndersAAResult::CFLAndersAAResult( + std::function GetTLI) + : GetTLI(std::move(GetTLI)) {} CFLAndersAAResult::CFLAndersAAResult(CFLAndersAAResult &&RHS) - : AAResultBase(std::move(RHS)), TLI(RHS.TLI) {} + : AAResultBase(std::move(RHS)), GetTLI(std::move(RHS.GetTLI)) {} CFLAndersAAResult::~CFLAndersAAResult() = default; namespace { @@ -779,7 +781,7 @@ static AliasAttrMap buildAttrMap(const CFLGraph &Graph, CFLAndersAAResult::FunctionInfo CFLAndersAAResult::buildInfoFrom(const Function &Fn) { CFLGraphBuilder GraphBuilder( - *this, TLI, + *this, GetTLI(const_cast(Fn)), // Cast away the constness here due to GraphBuilder's API requirement const_cast(Fn)); auto &Graph = GraphBuilder.getCFLGraph(); @@ -898,7 +900,10 @@ AliasResult CFLAndersAAResult::alias(const MemoryLocation &LocA, AnalysisKey CFLAndersAA::Key; CFLAndersAAResult CFLAndersAA::run(Function &F, FunctionAnalysisManager &AM) { - return CFLAndersAAResult(AM.getResult(F)); + auto GetTLI = [&AM](Function &F) -> TargetLibraryInfo & { + return AM.getResult(F); + }; + return CFLAndersAAResult(GetTLI); } char CFLAndersAAWrapperPass::ID = 0; @@ -914,8 +919,10 @@ CFLAndersAAWrapperPass::CFLAndersAAWrapperPass() : ImmutablePass(ID) { } void CFLAndersAAWrapperPass::initializePass() { - auto &TLIWP = getAnalysis(); - Result.reset(new CFLAndersAAResult(TLIWP.getTLI())); + auto GetTLI = [this](Function &F) -> TargetLibraryInfo & { + return this->getAnalysis().getTLI(F); + }; + Result.reset(new CFLAndersAAResult(GetTLI)); } void CFLAndersAAWrapperPass::getAnalysisUsage(AnalysisUsage &AU) const { diff --git a/lib/Analysis/CFLSteensAliasAnalysis.cpp b/lib/Analysis/CFLSteensAliasAnalysis.cpp index 44b1834f70bf..b87aa4065392 100644 --- a/lib/Analysis/CFLSteensAliasAnalysis.cpp +++ b/lib/Analysis/CFLSteensAliasAnalysis.cpp @@ -60,10 +60,11 @@ using namespace llvm::cflaa; #define DEBUG_TYPE "cfl-steens-aa" -CFLSteensAAResult::CFLSteensAAResult(const TargetLibraryInfo &TLI) - : AAResultBase(), TLI(TLI) {} +CFLSteensAAResult::CFLSteensAAResult( + std::function GetTLI) + : AAResultBase(), GetTLI(std::move(GetTLI)) {} CFLSteensAAResult::CFLSteensAAResult(CFLSteensAAResult &&Arg) - : AAResultBase(std::move(Arg)), TLI(Arg.TLI) {} + : AAResultBase(std::move(Arg)), GetTLI(std::move(Arg.GetTLI)) {} CFLSteensAAResult::~CFLSteensAAResult() = default; /// Information we have about a function and would like to keep around. @@ -181,7 +182,7 @@ CFLSteensAAResult::FunctionInfo::FunctionInfo( // Builds the graph + StratifiedSets for a function. 
CFLSteensAAResult::FunctionInfo CFLSteensAAResult::buildSetsFrom(Function *Fn) { - CFLGraphBuilder GraphBuilder(*this, TLI, *Fn); + CFLGraphBuilder GraphBuilder(*this, GetTLI(*Fn), *Fn); StratifiedSetsBuilder SetBuilder; // Add all CFLGraph nodes and all Dereference edges to StratifiedSets @@ -331,7 +332,10 @@ AliasResult CFLSteensAAResult::query(const MemoryLocation &LocA, AnalysisKey CFLSteensAA::Key; CFLSteensAAResult CFLSteensAA::run(Function &F, FunctionAnalysisManager &AM) { - return CFLSteensAAResult(AM.getResult(F)); + auto GetTLI = [&AM](Function &F) -> const TargetLibraryInfo & { + return AM.getResult(F); + }; + return CFLSteensAAResult(GetTLI); } char CFLSteensAAWrapperPass::ID = 0; @@ -347,8 +351,10 @@ CFLSteensAAWrapperPass::CFLSteensAAWrapperPass() : ImmutablePass(ID) { } void CFLSteensAAWrapperPass::initializePass() { - auto &TLIWP = getAnalysis(); - Result.reset(new CFLSteensAAResult(TLIWP.getTLI())); + auto GetTLI = [this](Function &F) -> const TargetLibraryInfo & { + return this->getAnalysis().getTLI(F); + }; + Result.reset(new CFLSteensAAResult(GetTLI)); } void CFLSteensAAWrapperPass::getAnalysisUsage(AnalysisUsage &AU) const { diff --git a/lib/Analysis/CallGraph.cpp b/lib/Analysis/CallGraph.cpp index ec5e94d499be..70aeb1a688ee 100644 --- a/lib/Analysis/CallGraph.cpp +++ b/lib/Analysis/CallGraph.cpp @@ -29,7 +29,7 @@ using namespace llvm; CallGraph::CallGraph(Module &M) : M(M), ExternalCallingNode(getOrInsertFunction(nullptr)), - CallsExternalNode(llvm::make_unique(nullptr)) { + CallsExternalNode(std::make_unique(nullptr)) { // Add every function to the call graph. for (Function &F : M) addToCallGraph(&F); @@ -150,7 +150,7 @@ CallGraphNode *CallGraph::getOrInsertFunction(const Function *F) { return CGN.get(); assert((!F || F->getParent() == &M) && "Function not in current module!"); - CGN = llvm::make_unique(const_cast(F)); + CGN = std::make_unique(const_cast(F)); return CGN.get(); } diff --git a/lib/Analysis/CaptureTracking.cpp b/lib/Analysis/CaptureTracking.cpp index adaa83a6c443..20e2f06540a3 100644 --- a/lib/Analysis/CaptureTracking.cpp +++ b/lib/Analysis/CaptureTracking.cpp @@ -33,6 +33,22 @@ CaptureTracker::~CaptureTracker() {} bool CaptureTracker::shouldExplore(const Use *U) { return true; } +bool CaptureTracker::isDereferenceableOrNull(Value *O, const DataLayout &DL) { + // An inbounds GEP can either be a valid pointer (pointing into + // or to the end of an allocation), or be null in the default + // address space. So for an inbounds GEP there is no way to let + // the pointer escape using clever GEP hacking because doing so + // would make the pointer point outside of the allocated object + // and thus make the GEP result a poison value. Similarly, other + // dereferenceable pointers cannot be manipulated without producing + // poison. + if (auto *GEP = dyn_cast(O)) + if (GEP->isInBounds()) + return true; + bool CanBeNull; + return O->getPointerDereferenceableBytes(DL, CanBeNull); +} + namespace { struct SimpleCaptureTracker : public CaptureTracker { explicit SimpleCaptureTracker(bool ReturnCaptures) @@ -251,7 +267,8 @@ void llvm::PointerMayBeCaptured(const Value *V, CaptureTracker *Tracker, // marked with nocapture do not capture. This means that places like // GetUnderlyingObject in ValueTracking or DecomposeGEPExpression // in BasicAA also need to know about this property. 
- if (isIntrinsicReturningPointerAliasingArgumentWithoutCapturing(Call)) { + if (isIntrinsicReturningPointerAliasingArgumentWithoutCapturing(Call, + true)) { AddUses(Call); break; } @@ -330,7 +347,9 @@ void llvm::PointerMayBeCaptured(const Value *V, CaptureTracker *Tracker, AddUses(I); break; case Instruction::ICmp: { - if (auto *CPN = dyn_cast(I->getOperand(1))) { + unsigned Idx = (I->getOperand(0) == V) ? 0 : 1; + unsigned OtherIdx = 1 - Idx; + if (auto *CPN = dyn_cast(I->getOperand(OtherIdx))) { // Don't count comparisons of a no-alias return value against null as // captures. This allows us to ignore comparisons of malloc results // with null, for example. @@ -338,29 +357,18 @@ void llvm::PointerMayBeCaptured(const Value *V, CaptureTracker *Tracker, if (isNoAliasCall(V->stripPointerCasts())) break; if (!I->getFunction()->nullPointerIsDefined()) { - auto *O = I->getOperand(0)->stripPointerCastsSameRepresentation(); - // An inbounds GEP can either be a valid pointer (pointing into - // or to the end of an allocation), or be null in the default - // address space. So for an inbounds GEPs there is no way to let - // the pointer escape using clever GEP hacking because doing so - // would make the pointer point outside of the allocated object - // and thus make the GEP result a poison value. - if (auto *GEP = dyn_cast(O)) - if (GEP->isInBounds()) - break; - // Comparing a dereferenceable_or_null argument against null - // cannot lead to pointer escapes, because if it is not null it - // must be a valid (in-bounds) pointer. - bool CanBeNull; - if (O->getPointerDereferenceableBytes(I->getModule()->getDataLayout(), CanBeNull)) + auto *O = I->getOperand(Idx)->stripPointerCastsSameRepresentation(); + // Comparing a dereferenceable_or_null pointer against null cannot + // lead to pointer escapes, because if it is not null it must be a + // valid (in-bounds) pointer. + if (Tracker->isDereferenceableOrNull(O, I->getModule()->getDataLayout())) break; } } // Comparison against value stored in global variable. Given the pointer // does not escape, its value cannot be guessed and stored separately in a // global variable. - unsigned OtherIndex = (I->getOperand(0) == V) ? 1 : 0; - auto *LI = dyn_cast(I->getOperand(OtherIndex)); + auto *LI = dyn_cast(I->getOperand(OtherIdx)); if (LI && isa(LI->getPointerOperand())) break; // Otherwise, be conservative. There are crazy ways to capture pointers diff --git a/lib/Analysis/ConstantFolding.cpp b/lib/Analysis/ConstantFolding.cpp index 20231ca78b45..8dbcf7034fda 100644 --- a/lib/Analysis/ConstantFolding.cpp +++ b/lib/Analysis/ConstantFolding.cpp @@ -93,6 +93,9 @@ static Constant *foldConstVectorToAPInt(APInt &Result, Type *DestTy, /// This always returns a non-null constant, but it may be a /// ConstantExpr if unfoldable. Constant *FoldBitCast(Constant *C, Type *DestTy, const DataLayout &DL) { + assert(CastInst::castIsValid(Instruction::BitCast, C, DestTy) && + "Invalid constantexpr bitcast!"); + // Catch the obvious splat cases. 
if (C->isNullValue() && !DestTy->isX86_MMXTy()) return Constant::getNullValue(DestTy); @@ -521,8 +524,23 @@ Constant *FoldReinterpretLoadFromConstPtr(Constant *C, Type *LoadTy, return nullptr; C = FoldBitCast(C, MapTy->getPointerTo(AS), DL); - if (Constant *Res = FoldReinterpretLoadFromConstPtr(C, MapTy, DL)) - return FoldBitCast(Res, LoadTy, DL); + if (Constant *Res = FoldReinterpretLoadFromConstPtr(C, MapTy, DL)) { + if (Res->isNullValue() && !LoadTy->isX86_MMXTy()) + // Materializing a zero can be done trivially without a bitcast + return Constant::getNullValue(LoadTy); + Type *CastTy = LoadTy->isPtrOrPtrVectorTy() ? DL.getIntPtrType(LoadTy) : LoadTy; + Res = FoldBitCast(Res, CastTy, DL); + if (LoadTy->isPtrOrPtrVectorTy()) { + // For vector of pointer, we needed to first convert to a vector of integer, then do vector inttoptr + if (Res->isNullValue() && !LoadTy->isX86_MMXTy()) + return Constant::getNullValue(LoadTy); + if (DL.isNonIntegralPointerType(LoadTy->getScalarType())) + // Be careful not to replace a load of an addrspace value with an inttoptr here + return nullptr; + Res = ConstantExpr::getCast(Instruction::IntToPtr, Res, LoadTy); + } + return Res; + } return nullptr; } @@ -544,7 +562,7 @@ Constant *FoldReinterpretLoadFromConstPtr(Constant *C, Type *LoadTy, int64_t InitializerSize = DL.getTypeAllocSize(GV->getInitializer()->getType()); // If we're not accessing anything in this constant, the result is undefined. - if (Offset + BytesLoaded <= 0) + if (Offset <= -1 * static_cast(BytesLoaded)) return UndefValue::get(IntType); // If we're not accessing anything in this constant, the result is undefined. @@ -781,10 +799,10 @@ Constant *CastGEPIndices(Type *SrcElemTy, ArrayRef Ops, } /// Strip the pointer casts, but preserve the address space information. -Constant* StripPtrCastKeepAS(Constant* Ptr, Type *&ElemTy) { +Constant *StripPtrCastKeepAS(Constant *Ptr, Type *&ElemTy) { assert(Ptr->getType()->isPointerTy() && "Not a pointer type"); auto *OldPtrTy = cast(Ptr->getType()); - Ptr = Ptr->stripPointerCasts(); + Ptr = cast(Ptr->stripPointerCasts()); auto *NewPtrTy = cast(Ptr->getType()); ElemTy = NewPtrTy->getPointerElementType(); @@ -1038,7 +1056,7 @@ Constant *ConstantFoldInstOperandsImpl(const Value *InstOrCE, unsigned Opcode, return ConstantExpr::getExtractElement(Ops[0], Ops[1]); case Instruction::ExtractValue: return ConstantExpr::getExtractValue( - Ops[0], dyn_cast(InstOrCE)->getIndices()); + Ops[0], cast(InstOrCE)->getIndices()); case Instruction::InsertElement: return ConstantExpr::getInsertElement(Ops[0], Ops[1], Ops[2]); case Instruction::ShuffleVector: @@ -1464,40 +1482,50 @@ bool llvm::canConstantFoldCallTo(const CallBase *Call, const Function *F) { if (!F->hasName()) return false; - StringRef Name = F->getName(); // In these cases, the check of the length is required. We don't want to // return true for a name like "cos\0blah" which strcmp would return equal to // "cos", but has length 8. 
+ StringRef Name = F->getName(); switch (Name[0]) { default: return false; case 'a': - return Name == "acos" || Name == "asin" || Name == "atan" || - Name == "atan2" || Name == "acosf" || Name == "asinf" || - Name == "atanf" || Name == "atan2f"; + return Name == "acos" || Name == "acosf" || + Name == "asin" || Name == "asinf" || + Name == "atan" || Name == "atanf" || + Name == "atan2" || Name == "atan2f"; case 'c': - return Name == "ceil" || Name == "cos" || Name == "cosh" || - Name == "ceilf" || Name == "cosf" || Name == "coshf"; + return Name == "ceil" || Name == "ceilf" || + Name == "cos" || Name == "cosf" || + Name == "cosh" || Name == "coshf"; case 'e': - return Name == "exp" || Name == "exp2" || Name == "expf" || Name == "exp2f"; + return Name == "exp" || Name == "expf" || + Name == "exp2" || Name == "exp2f"; case 'f': - return Name == "fabs" || Name == "floor" || Name == "fmod" || - Name == "fabsf" || Name == "floorf" || Name == "fmodf"; + return Name == "fabs" || Name == "fabsf" || + Name == "floor" || Name == "floorf" || + Name == "fmod" || Name == "fmodf"; case 'l': - return Name == "log" || Name == "log10" || Name == "logf" || - Name == "log10f"; + return Name == "log" || Name == "logf" || + Name == "log2" || Name == "log2f" || + Name == "log10" || Name == "log10f"; + case 'n': + return Name == "nearbyint" || Name == "nearbyintf"; case 'p': return Name == "pow" || Name == "powf"; case 'r': - return Name == "round" || Name == "roundf"; + return Name == "rint" || Name == "rintf" || + Name == "round" || Name == "roundf"; case 's': - return Name == "sin" || Name == "sinh" || Name == "sqrt" || - Name == "sinf" || Name == "sinhf" || Name == "sqrtf"; + return Name == "sin" || Name == "sinf" || + Name == "sinh" || Name == "sinhf" || + Name == "sqrt" || Name == "sqrtf"; case 't': - return Name == "tan" || Name == "tanh" || Name == "tanf" || Name == "tanhf"; + return Name == "tan" || Name == "tanf" || + Name == "tanh" || Name == "tanhf" || + Name == "trunc" || Name == "truncf"; case '_': - // Check for various function names that get used for the math functions // when the header files are preprocessed with the macro // __FINITE_MATH_ONLY__ enabled. @@ -1713,40 +1741,37 @@ static Constant *ConstantFoldScalarCall1(StringRef Name, if (!Ty->isHalfTy() && !Ty->isFloatTy() && !Ty->isDoubleTy()) return nullptr; - if (IntrinsicID == Intrinsic::round) { - APFloat V = Op->getValueAPF(); - V.roundToIntegral(APFloat::rmNearestTiesToAway); - return ConstantFP::get(Ty->getContext(), V); + // Use internal versions of these intrinsics. 
+ APFloat U = Op->getValueAPF(); + + if (IntrinsicID == Intrinsic::nearbyint || IntrinsicID == Intrinsic::rint) { + U.roundToIntegral(APFloat::rmNearestTiesToEven); + return ConstantFP::get(Ty->getContext(), U); } - if (IntrinsicID == Intrinsic::floor) { - APFloat V = Op->getValueAPF(); - V.roundToIntegral(APFloat::rmTowardNegative); - return ConstantFP::get(Ty->getContext(), V); + if (IntrinsicID == Intrinsic::round) { + U.roundToIntegral(APFloat::rmNearestTiesToAway); + return ConstantFP::get(Ty->getContext(), U); } if (IntrinsicID == Intrinsic::ceil) { - APFloat V = Op->getValueAPF(); - V.roundToIntegral(APFloat::rmTowardPositive); - return ConstantFP::get(Ty->getContext(), V); + U.roundToIntegral(APFloat::rmTowardPositive); + return ConstantFP::get(Ty->getContext(), U); } - if (IntrinsicID == Intrinsic::trunc) { - APFloat V = Op->getValueAPF(); - V.roundToIntegral(APFloat::rmTowardZero); - return ConstantFP::get(Ty->getContext(), V); + if (IntrinsicID == Intrinsic::floor) { + U.roundToIntegral(APFloat::rmTowardNegative); + return ConstantFP::get(Ty->getContext(), U); } - if (IntrinsicID == Intrinsic::rint) { - APFloat V = Op->getValueAPF(); - V.roundToIntegral(APFloat::rmNearestTiesToEven); - return ConstantFP::get(Ty->getContext(), V); + if (IntrinsicID == Intrinsic::trunc) { + U.roundToIntegral(APFloat::rmTowardZero); + return ConstantFP::get(Ty->getContext(), U); } - if (IntrinsicID == Intrinsic::nearbyint) { - APFloat V = Op->getValueAPF(); - V.roundToIntegral(APFloat::rmNearestTiesToEven); - return ConstantFP::get(Ty->getContext(), V); + if (IntrinsicID == Intrinsic::fabs) { + U.clearSign(); + return ConstantFP::get(Ty->getContext(), U); } /// We only fold functions with finite arguments. Folding NaN and inf is @@ -1763,18 +1788,19 @@ static Constant *ConstantFoldScalarCall1(StringRef Name, switch (IntrinsicID) { default: break; - case Intrinsic::fabs: - return ConstantFoldFP(fabs, V, Ty); - case Intrinsic::log2: - return ConstantFoldFP(Log2, V, Ty); case Intrinsic::log: return ConstantFoldFP(log, V, Ty); + case Intrinsic::log2: + // TODO: What about hosts that lack a C99 library? + return ConstantFoldFP(Log2, V, Ty); case Intrinsic::log10: + // TODO: What about hosts that lack a C99 library? return ConstantFoldFP(log10, V, Ty); case Intrinsic::exp: return ConstantFoldFP(exp, V, Ty); case Intrinsic::exp2: - return ConstantFoldFP(exp2, V, Ty); + // Fold exp2(x) as pow(2, x), in case the host lacks a C99 library. 
+ return ConstantFoldBinaryFP(pow, 2.0, V, Ty); case Intrinsic::sin: return ConstantFoldFP(sin, V, Ty); case Intrinsic::cos: @@ -1786,104 +1812,150 @@ static Constant *ConstantFoldScalarCall1(StringRef Name, if (!TLI) return nullptr; - char NameKeyChar = Name[0]; - if (Name[0] == '_' && Name.size() > 2 && Name[1] == '_') - NameKeyChar = Name[2]; - - switch (NameKeyChar) { - case 'a': - if ((Name == "acos" && TLI->has(LibFunc_acos)) || - (Name == "acosf" && TLI->has(LibFunc_acosf)) || - (Name == "__acos_finite" && TLI->has(LibFunc_acos_finite)) || - (Name == "__acosf_finite" && TLI->has(LibFunc_acosf_finite))) + LibFunc Func = NotLibFunc; + TLI->getLibFunc(Name, Func); + switch (Func) { + default: + break; + case LibFunc_acos: + case LibFunc_acosf: + case LibFunc_acos_finite: + case LibFunc_acosf_finite: + if (TLI->has(Func)) return ConstantFoldFP(acos, V, Ty); - else if ((Name == "asin" && TLI->has(LibFunc_asin)) || - (Name == "asinf" && TLI->has(LibFunc_asinf)) || - (Name == "__asin_finite" && TLI->has(LibFunc_asin_finite)) || - (Name == "__asinf_finite" && TLI->has(LibFunc_asinf_finite))) + break; + case LibFunc_asin: + case LibFunc_asinf: + case LibFunc_asin_finite: + case LibFunc_asinf_finite: + if (TLI->has(Func)) return ConstantFoldFP(asin, V, Ty); - else if ((Name == "atan" && TLI->has(LibFunc_atan)) || - (Name == "atanf" && TLI->has(LibFunc_atanf))) + break; + case LibFunc_atan: + case LibFunc_atanf: + if (TLI->has(Func)) return ConstantFoldFP(atan, V, Ty); break; - case 'c': - if ((Name == "ceil" && TLI->has(LibFunc_ceil)) || - (Name == "ceilf" && TLI->has(LibFunc_ceilf))) - return ConstantFoldFP(ceil, V, Ty); - else if ((Name == "cos" && TLI->has(LibFunc_cos)) || - (Name == "cosf" && TLI->has(LibFunc_cosf))) + case LibFunc_ceil: + case LibFunc_ceilf: + if (TLI->has(Func)) { + U.roundToIntegral(APFloat::rmTowardPositive); + return ConstantFP::get(Ty->getContext(), U); + } + break; + case LibFunc_cos: + case LibFunc_cosf: + if (TLI->has(Func)) return ConstantFoldFP(cos, V, Ty); - else if ((Name == "cosh" && TLI->has(LibFunc_cosh)) || - (Name == "coshf" && TLI->has(LibFunc_coshf)) || - (Name == "__cosh_finite" && TLI->has(LibFunc_cosh_finite)) || - (Name == "__coshf_finite" && TLI->has(LibFunc_coshf_finite))) + break; + case LibFunc_cosh: + case LibFunc_coshf: + case LibFunc_cosh_finite: + case LibFunc_coshf_finite: + if (TLI->has(Func)) return ConstantFoldFP(cosh, V, Ty); break; - case 'e': - if ((Name == "exp" && TLI->has(LibFunc_exp)) || - (Name == "expf" && TLI->has(LibFunc_expf)) || - (Name == "__exp_finite" && TLI->has(LibFunc_exp_finite)) || - (Name == "__expf_finite" && TLI->has(LibFunc_expf_finite))) + case LibFunc_exp: + case LibFunc_expf: + case LibFunc_exp_finite: + case LibFunc_expf_finite: + if (TLI->has(Func)) return ConstantFoldFP(exp, V, Ty); - if ((Name == "exp2" && TLI->has(LibFunc_exp2)) || - (Name == "exp2f" && TLI->has(LibFunc_exp2f)) || - (Name == "__exp2_finite" && TLI->has(LibFunc_exp2_finite)) || - (Name == "__exp2f_finite" && TLI->has(LibFunc_exp2f_finite))) - // Constant fold exp2(x) as pow(2,x) in case the host doesn't have a - // C99 library. + break; + case LibFunc_exp2: + case LibFunc_exp2f: + case LibFunc_exp2_finite: + case LibFunc_exp2f_finite: + if (TLI->has(Func)) + // Fold exp2(x) as pow(2, x), in case the host lacks a C99 library. 
return ConstantFoldBinaryFP(pow, 2.0, V, Ty); break; - case 'f': - if ((Name == "fabs" && TLI->has(LibFunc_fabs)) || - (Name == "fabsf" && TLI->has(LibFunc_fabsf))) - return ConstantFoldFP(fabs, V, Ty); - else if ((Name == "floor" && TLI->has(LibFunc_floor)) || - (Name == "floorf" && TLI->has(LibFunc_floorf))) - return ConstantFoldFP(floor, V, Ty); + case LibFunc_fabs: + case LibFunc_fabsf: + if (TLI->has(Func)) { + U.clearSign(); + return ConstantFP::get(Ty->getContext(), U); + } break; - case 'l': - if ((Name == "log" && V > 0 && TLI->has(LibFunc_log)) || - (Name == "logf" && V > 0 && TLI->has(LibFunc_logf)) || - (Name == "__log_finite" && V > 0 && - TLI->has(LibFunc_log_finite)) || - (Name == "__logf_finite" && V > 0 && - TLI->has(LibFunc_logf_finite))) + case LibFunc_floor: + case LibFunc_floorf: + if (TLI->has(Func)) { + U.roundToIntegral(APFloat::rmTowardNegative); + return ConstantFP::get(Ty->getContext(), U); + } + break; + case LibFunc_log: + case LibFunc_logf: + case LibFunc_log_finite: + case LibFunc_logf_finite: + if (V > 0.0 && TLI->has(Func)) return ConstantFoldFP(log, V, Ty); - else if ((Name == "log10" && V > 0 && TLI->has(LibFunc_log10)) || - (Name == "log10f" && V > 0 && TLI->has(LibFunc_log10f)) || - (Name == "__log10_finite" && V > 0 && - TLI->has(LibFunc_log10_finite)) || - (Name == "__log10f_finite" && V > 0 && - TLI->has(LibFunc_log10f_finite))) + break; + case LibFunc_log2: + case LibFunc_log2f: + case LibFunc_log2_finite: + case LibFunc_log2f_finite: + if (V > 0.0 && TLI->has(Func)) + // TODO: What about hosts that lack a C99 library? + return ConstantFoldFP(Log2, V, Ty); + break; + case LibFunc_log10: + case LibFunc_log10f: + case LibFunc_log10_finite: + case LibFunc_log10f_finite: + if (V > 0.0 && TLI->has(Func)) + // TODO: What about hosts that lack a C99 library? 
return ConstantFoldFP(log10, V, Ty); break; - case 'r': - if ((Name == "round" && TLI->has(LibFunc_round)) || - (Name == "roundf" && TLI->has(LibFunc_roundf))) - return ConstantFoldFP(round, V, Ty); + case LibFunc_nearbyint: + case LibFunc_nearbyintf: + case LibFunc_rint: + case LibFunc_rintf: + if (TLI->has(Func)) { + U.roundToIntegral(APFloat::rmNearestTiesToEven); + return ConstantFP::get(Ty->getContext(), U); + } break; - case 's': - if ((Name == "sin" && TLI->has(LibFunc_sin)) || - (Name == "sinf" && TLI->has(LibFunc_sinf))) + case LibFunc_round: + case LibFunc_roundf: + if (TLI->has(Func)) { + U.roundToIntegral(APFloat::rmNearestTiesToAway); + return ConstantFP::get(Ty->getContext(), U); + } + break; + case LibFunc_sin: + case LibFunc_sinf: + if (TLI->has(Func)) return ConstantFoldFP(sin, V, Ty); - else if ((Name == "sinh" && TLI->has(LibFunc_sinh)) || - (Name == "sinhf" && TLI->has(LibFunc_sinhf)) || - (Name == "__sinh_finite" && TLI->has(LibFunc_sinh_finite)) || - (Name == "__sinhf_finite" && TLI->has(LibFunc_sinhf_finite))) + break; + case LibFunc_sinh: + case LibFunc_sinhf: + case LibFunc_sinh_finite: + case LibFunc_sinhf_finite: + if (TLI->has(Func)) return ConstantFoldFP(sinh, V, Ty); - else if ((Name == "sqrt" && V >= 0 && TLI->has(LibFunc_sqrt)) || - (Name == "sqrtf" && V >= 0 && TLI->has(LibFunc_sqrtf))) + break; + case LibFunc_sqrt: + case LibFunc_sqrtf: + if (V >= 0.0 && TLI->has(Func)) return ConstantFoldFP(sqrt, V, Ty); break; - case 't': - if ((Name == "tan" && TLI->has(LibFunc_tan)) || - (Name == "tanf" && TLI->has(LibFunc_tanf))) + case LibFunc_tan: + case LibFunc_tanf: + if (TLI->has(Func)) return ConstantFoldFP(tan, V, Ty); - else if ((Name == "tanh" && TLI->has(LibFunc_tanh)) || - (Name == "tanhf" && TLI->has(LibFunc_tanhf))) + break; + case LibFunc_tanh: + case LibFunc_tanhf: + if (TLI->has(Func)) return ConstantFoldFP(tanh, V, Ty); break; - default: + case LibFunc_trunc: + case LibFunc_truncf: + if (TLI->has(Func)) { + U.roundToIntegral(APFloat::rmTowardZero); + return ConstantFP::get(Ty->getContext(), U); + } break; } return nullptr; @@ -2002,19 +2074,35 @@ static Constant *ConstantFoldScalarCall2(StringRef Name, if (!TLI) return nullptr; - if ((Name == "pow" && TLI->has(LibFunc_pow)) || - (Name == "powf" && TLI->has(LibFunc_powf)) || - (Name == "__pow_finite" && TLI->has(LibFunc_pow_finite)) || - (Name == "__powf_finite" && TLI->has(LibFunc_powf_finite))) - return ConstantFoldBinaryFP(pow, Op1V, Op2V, Ty); - if ((Name == "fmod" && TLI->has(LibFunc_fmod)) || - (Name == "fmodf" && TLI->has(LibFunc_fmodf))) - return ConstantFoldBinaryFP(fmod, Op1V, Op2V, Ty); - if ((Name == "atan2" && TLI->has(LibFunc_atan2)) || - (Name == "atan2f" && TLI->has(LibFunc_atan2f)) || - (Name == "__atan2_finite" && TLI->has(LibFunc_atan2_finite)) || - (Name == "__atan2f_finite" && TLI->has(LibFunc_atan2f_finite))) - return ConstantFoldBinaryFP(atan2, Op1V, Op2V, Ty); + + LibFunc Func = NotLibFunc; + TLI->getLibFunc(Name, Func); + switch (Func) { + default: + break; + case LibFunc_pow: + case LibFunc_powf: + case LibFunc_pow_finite: + case LibFunc_powf_finite: + if (TLI->has(Func)) + return ConstantFoldBinaryFP(pow, Op1V, Op2V, Ty); + break; + case LibFunc_fmod: + case LibFunc_fmodf: + if (TLI->has(Func)) { + APFloat V = Op1->getValueAPF(); + if (APFloat::opStatus::opOK == V.mod(Op2->getValueAPF())) + return ConstantFP::get(Ty->getContext(), V); + } + break; + case LibFunc_atan2: + case LibFunc_atan2f: + case LibFunc_atan2_finite: + case LibFunc_atan2f_finite: + if (TLI->has(Func)) + 
return ConstantFoldBinaryFP(atan2, Op1V, Op2V, Ty); + break; + } } else if (auto *Op2C = dyn_cast(Operands[1])) { if (IntrinsicID == Intrinsic::powi && Ty->isHalfTy()) return ConstantFP::get(Ty->getContext(), @@ -2041,20 +2129,27 @@ static Constant *ConstantFoldScalarCall2(StringRef Name, switch (IntrinsicID) { default: break; + case Intrinsic::usub_with_overflow: + case Intrinsic::ssub_with_overflow: + case Intrinsic::uadd_with_overflow: + case Intrinsic::sadd_with_overflow: + // X - undef -> { undef, false } + // undef - X -> { undef, false } + // X + undef -> { undef, false } + // undef + x -> { undef, false } + if (!C0 || !C1) { + return ConstantStruct::get( + cast(Ty), + {UndefValue::get(Ty->getStructElementType(0)), + Constant::getNullValue(Ty->getStructElementType(1))}); + } + LLVM_FALLTHROUGH; case Intrinsic::smul_with_overflow: - case Intrinsic::umul_with_overflow: - // Even if both operands are undef, we cannot fold muls to undef - // in the general case. For example, on i2 there are no inputs - // that would produce { i2 -1, i1 true } as the result. + case Intrinsic::umul_with_overflow: { + // undef * X -> { 0, false } + // X * undef -> { 0, false } if (!C0 || !C1) return Constant::getNullValue(Ty); - LLVM_FALLTHROUGH; - case Intrinsic::sadd_with_overflow: - case Intrinsic::uadd_with_overflow: - case Intrinsic::ssub_with_overflow: - case Intrinsic::usub_with_overflow: { - if (!C0 || !C1) - return UndefValue::get(Ty); APInt Res; bool Overflow; @@ -2194,13 +2289,9 @@ static Constant *ConstantFoldScalarCall3(StringRef Name, case Intrinsic::fma: case Intrinsic::fmuladd: { APFloat V = Op1->getValueAPF(); - APFloat::opStatus s = V.fusedMultiplyAdd(Op2->getValueAPF(), - Op3->getValueAPF(), - APFloat::rmNearestTiesToEven); - if (s != APFloat::opInvalidOp) - return ConstantFP::get(Ty->getContext(), V); - - return nullptr; + V.fusedMultiplyAdd(Op2->getValueAPF(), Op3->getValueAPF(), + APFloat::rmNearestTiesToEven); + return ConstantFP::get(Ty->getContext(), V); } } } diff --git a/lib/Analysis/DDG.cpp b/lib/Analysis/DDG.cpp new file mode 100644 index 000000000000..b5c3c761ad98 --- /dev/null +++ b/lib/Analysis/DDG.cpp @@ -0,0 +1,203 @@ +//===- DDG.cpp - Data Dependence Graph -------------------------------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// The implementation for the data dependence graph. 
+//===----------------------------------------------------------------------===// +#include "llvm/Analysis/DDG.h" +#include "llvm/Analysis/LoopInfo.h" + +using namespace llvm; + +#define DEBUG_TYPE "ddg" + +template class llvm::DGEdge; +template class llvm::DGNode; +template class llvm::DirectedGraph; + +//===--------------------------------------------------------------------===// +// DDGNode implementation +//===--------------------------------------------------------------------===// +DDGNode::~DDGNode() {} + +bool DDGNode::collectInstructions( + llvm::function_ref const &Pred, + InstructionListType &IList) const { + assert(IList.empty() && "Expected the IList to be empty on entry."); + if (isa(this)) { + for (auto *I : cast(this)->getInstructions()) + if (Pred(I)) + IList.push_back(I); + } else + llvm_unreachable("unimplemented type of node"); + return !IList.empty(); +} + +raw_ostream &llvm::operator<<(raw_ostream &OS, const DDGNode::NodeKind K) { + const char *Out; + switch (K) { + case DDGNode::NodeKind::SingleInstruction: + Out = "single-instruction"; + break; + case DDGNode::NodeKind::MultiInstruction: + Out = "multi-instruction"; + break; + case DDGNode::NodeKind::Root: + Out = "root"; + break; + case DDGNode::NodeKind::Unknown: + Out = "??"; + break; + } + OS << Out; + return OS; +} + +raw_ostream &llvm::operator<<(raw_ostream &OS, const DDGNode &N) { + OS << "Node Address:" << &N << ":" << N.getKind() << "\n"; + if (isa(N)) { + OS << " Instructions:\n"; + for (auto *I : cast(N).getInstructions()) + OS.indent(2) << *I << "\n"; + } else if (!isa(N)) + llvm_unreachable("unimplemented type of node"); + + OS << (N.getEdges().empty() ? " Edges:none!\n" : " Edges:\n"); + for (auto &E : N.getEdges()) + OS.indent(2) << *E; + return OS; +} + +//===--------------------------------------------------------------------===// +// SimpleDDGNode implementation +//===--------------------------------------------------------------------===// + +SimpleDDGNode::SimpleDDGNode(Instruction &I) + : DDGNode(NodeKind::SingleInstruction), InstList() { + assert(InstList.empty() && "Expected empty list."); + InstList.push_back(&I); +} + +SimpleDDGNode::SimpleDDGNode(const SimpleDDGNode &N) + : DDGNode(N), InstList(N.InstList) { + assert(((getKind() == NodeKind::SingleInstruction && InstList.size() == 1) || + (getKind() == NodeKind::MultiInstruction && InstList.size() > 1)) && + "constructing from invalid simple node."); +} + +SimpleDDGNode::SimpleDDGNode(SimpleDDGNode &&N) + : DDGNode(std::move(N)), InstList(std::move(N.InstList)) { + assert(((getKind() == NodeKind::SingleInstruction && InstList.size() == 1) || + (getKind() == NodeKind::MultiInstruction && InstList.size() > 1)) && + "constructing from invalid simple node."); +} + +SimpleDDGNode::~SimpleDDGNode() { InstList.clear(); } + +//===--------------------------------------------------------------------===// +// DDGEdge implementation +//===--------------------------------------------------------------------===// + +raw_ostream &llvm::operator<<(raw_ostream &OS, const DDGEdge::EdgeKind K) { + const char *Out; + switch (K) { + case DDGEdge::EdgeKind::RegisterDefUse: + Out = "def-use"; + break; + case DDGEdge::EdgeKind::MemoryDependence: + Out = "memory"; + break; + case DDGEdge::EdgeKind::Rooted: + Out = "rooted"; + break; + case DDGEdge::EdgeKind::Unknown: + Out = "??"; + break; + } + OS << Out; + return OS; +} + +raw_ostream &llvm::operator<<(raw_ostream &OS, const DDGEdge &E) { + OS << "[" << E.getKind() << "] to " << &E.getTargetNode() << "\n"; + 
return OS; +} + +//===--------------------------------------------------------------------===// +// DataDependenceGraph implementation +//===--------------------------------------------------------------------===// +using BasicBlockListType = SmallVector; + +DataDependenceGraph::DataDependenceGraph(Function &F, DependenceInfo &D) + : DependenceGraphInfo(F.getName().str(), D) { + BasicBlockListType BBList; + for (auto &BB : F.getBasicBlockList()) + BBList.push_back(&BB); + DDGBuilder(*this, D, BBList).populate(); +} + +DataDependenceGraph::DataDependenceGraph(const Loop &L, DependenceInfo &D) + : DependenceGraphInfo(Twine(L.getHeader()->getParent()->getName() + "." + + L.getHeader()->getName()) + .str(), + D) { + BasicBlockListType BBList; + for (BasicBlock *BB : L.blocks()) + BBList.push_back(BB); + DDGBuilder(*this, D, BBList).populate(); +} + +DataDependenceGraph::~DataDependenceGraph() { + for (auto *N : Nodes) { + for (auto *E : *N) + delete E; + delete N; + } +} + +bool DataDependenceGraph::addNode(DDGNode &N) { + if (!DDGBase::addNode(N)) + return false; + + // In general, if the root node is already created and linked, it is not safe + // to add new nodes since they may be unreachable by the root. + // TODO: Allow adding Pi-block nodes after root is created. Pi-blocks are an + // exception because they represent components that are already reachable by + // root. + assert(!Root && "Root node is already added. No more nodes can be added."); + if (isa(N)) + Root = &N; + + return true; +} + +raw_ostream &llvm::operator<<(raw_ostream &OS, const DataDependenceGraph &G) { + for (auto *Node : G) + OS << *Node << "\n"; + return OS; +} + +//===--------------------------------------------------------------------===// +// DDG Analysis Passes +//===--------------------------------------------------------------------===// + +/// DDG as a loop pass. +DDGAnalysis::Result DDGAnalysis::run(Loop &L, LoopAnalysisManager &AM, + LoopStandardAnalysisResults &AR) { + Function *F = L.getHeader()->getParent(); + DependenceInfo DI(F, &AR.AA, &AR.SE, &AR.LI); + return std::make_unique(L, DI); +} +AnalysisKey DDGAnalysis::Key; + +PreservedAnalyses DDGAnalysisPrinterPass::run(Loop &L, LoopAnalysisManager &AM, + LoopStandardAnalysisResults &AR, + LPMUpdater &U) { + OS << "'DDG' for loop '" << L.getHeader()->getName() << "':\n"; + OS << *AM.getResult(L, AR); + return PreservedAnalyses::all(); +} diff --git a/lib/Analysis/DependenceAnalysis.cpp b/lib/Analysis/DependenceAnalysis.cpp index 75f269e84f9d..0038c9fb9ce4 100644 --- a/lib/Analysis/DependenceAnalysis.cpp +++ b/lib/Analysis/DependenceAnalysis.cpp @@ -254,7 +254,7 @@ FullDependence::FullDependence(Instruction *Source, Instruction *Destination, LoopIndependent(PossiblyLoopIndependent) { Consistent = true; if (CommonLevels) - DV = make_unique(CommonLevels); + DV = std::make_unique(CommonLevels); } // The rest are simple getters that hide the implementation. @@ -3415,7 +3415,7 @@ DependenceInfo::depends(Instruction *Src, Instruction *Dst, if (!isLoadOrStore(Src) || !isLoadOrStore(Dst)) { // can only analyze simple loads and stores, i.e., no calls, invokes, etc. LLVM_DEBUG(dbgs() << "can only handle simple loads and stores\n"); - return make_unique(Src, Dst); + return std::make_unique(Src, Dst); } assert(isLoadOrStore(Src) && "instruction is not load or store"); @@ -3430,7 +3430,7 @@ DependenceInfo::depends(Instruction *Src, Instruction *Dst, case PartialAlias: // cannot analyse objects if we don't understand their aliasing. 
LLVM_DEBUG(dbgs() << "can't analyze may or partial alias\n"); - return make_unique(Src, Dst); + return std::make_unique(Src, Dst); case NoAlias: // If the objects noalias, they are distinct, accesses are independent. LLVM_DEBUG(dbgs() << "no alias\n"); @@ -3777,7 +3777,7 @@ DependenceInfo::depends(Instruction *Src, Instruction *Dst, return nullptr; } - return make_unique(std::move(Result)); + return std::make_unique(std::move(Result)); } diff --git a/lib/Analysis/DependenceGraphBuilder.cpp b/lib/Analysis/DependenceGraphBuilder.cpp new file mode 100644 index 000000000000..ed1d8351b2f0 --- /dev/null +++ b/lib/Analysis/DependenceGraphBuilder.cpp @@ -0,0 +1,228 @@ +//===- DependenceGraphBuilder.cpp ------------------------------------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// This file implements common steps of the build algorithm for construction +// of dependence graphs such as DDG and PDG. +//===----------------------------------------------------------------------===// + +#include "llvm/Analysis/DependenceGraphBuilder.h" +#include "llvm/ADT/SCCIterator.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/DDG.h" + +using namespace llvm; + +#define DEBUG_TYPE "dgb" + +STATISTIC(TotalGraphs, "Number of dependence graphs created."); +STATISTIC(TotalDefUseEdges, "Number of def-use edges created."); +STATISTIC(TotalMemoryEdges, "Number of memory dependence edges created."); +STATISTIC(TotalFineGrainedNodes, "Number of fine-grained nodes created."); +STATISTIC(TotalConfusedEdges, + "Number of confused memory dependencies between two nodes."); +STATISTIC(TotalEdgeReversals, + "Number of times the source and sink of dependence was reversed to " + "expose cycles in the graph."); + +using InstructionListType = SmallVector; + +//===--------------------------------------------------------------------===// +// AbstractDependenceGraphBuilder implementation +//===--------------------------------------------------------------------===// + +template +void AbstractDependenceGraphBuilder::createFineGrainedNodes() { + ++TotalGraphs; + assert(IMap.empty() && "Expected empty instruction map at start"); + for (BasicBlock *BB : BBList) + for (Instruction &I : *BB) { + auto &NewNode = createFineGrainedNode(I); + IMap.insert(std::make_pair(&I, &NewNode)); + ++TotalFineGrainedNodes; + } +} + +template +void AbstractDependenceGraphBuilder::createAndConnectRootNode() { + // Create a root node that connects to every connected component of the graph. + // This is done to allow graph iterators to visit all the disjoint components + // of the graph, in a single walk. + // + // This algorithm works by going through each node of the graph and for each + // node N, do a DFS starting from N. A rooted edge is established between the + // root node and N (if N is not yet visited). All the nodes reachable from N + // are marked as visited and are skipped in the DFS of subsequent nodes. + // + // Note: This algorithm tries to limit the number of edges out of the root + // node to some extent, but there may be redundant edges created depending on + // the iteration order. For example for a graph {A -> B}, an edge from the + // root node is added to both nodes if B is visited before A. 
While it does + // not result in minimal number of edges, this approach saves compile-time + // while keeping the number of edges in check. + auto &RootNode = createRootNode(); + df_iterator_default_set Visited; + for (auto *N : Graph) { + if (*N == RootNode) + continue; + for (auto I : depth_first_ext(N, Visited)) + if (I == N) + createRootedEdge(RootNode, *N); + } +} + +template void AbstractDependenceGraphBuilder::createDefUseEdges() { + for (NodeType *N : Graph) { + InstructionListType SrcIList; + N->collectInstructions([](const Instruction *I) { return true; }, SrcIList); + + // Use a set to mark the targets that we link to N, so we don't add + // duplicate def-use edges when more than one instruction in a target node + // use results of instructions that are contained in N. + SmallPtrSet VisitedTargets; + + for (Instruction *II : SrcIList) { + for (User *U : II->users()) { + Instruction *UI = dyn_cast(U); + if (!UI) + continue; + NodeType *DstNode = nullptr; + if (IMap.find(UI) != IMap.end()) + DstNode = IMap.find(UI)->second; + + // In the case of loops, the scope of the subgraph is all the + // basic blocks (and instructions within them) belonging to the loop. We + // simply ignore all the edges coming from (or going into) instructions + // or basic blocks outside of this range. + if (!DstNode) { + LLVM_DEBUG( + dbgs() + << "skipped def-use edge since the sink" << *UI + << " is outside the range of instructions being considered.\n"); + continue; + } + + // Self dependencies are ignored because they are redundant and + // uninteresting. + if (DstNode == N) { + LLVM_DEBUG(dbgs() + << "skipped def-use edge since the sink and the source (" + << N << ") are the same.\n"); + continue; + } + + if (VisitedTargets.insert(DstNode).second) { + createDefUseEdge(*N, *DstNode); + ++TotalDefUseEdges; + } + } + } + } +} + +template +void AbstractDependenceGraphBuilder::createMemoryDependencyEdges() { + using DGIterator = typename G::iterator; + auto isMemoryAccess = [](const Instruction *I) { + return I->mayReadOrWriteMemory(); + }; + for (DGIterator SrcIt = Graph.begin(), E = Graph.end(); SrcIt != E; ++SrcIt) { + InstructionListType SrcIList; + (*SrcIt)->collectInstructions(isMemoryAccess, SrcIList); + if (SrcIList.empty()) + continue; + + for (DGIterator DstIt = SrcIt; DstIt != E; ++DstIt) { + if (**SrcIt == **DstIt) + continue; + InstructionListType DstIList; + (*DstIt)->collectInstructions(isMemoryAccess, DstIList); + if (DstIList.empty()) + continue; + bool ForwardEdgeCreated = false; + bool BackwardEdgeCreated = false; + for (Instruction *ISrc : SrcIList) { + for (Instruction *IDst : DstIList) { + auto D = DI.depends(ISrc, IDst, true); + if (!D) + continue; + + // If we have a dependence with its left-most non-'=' direction + // being '>' we need to reverse the direction of the edge, because + // the source of the dependence cannot occur after the sink. For + // confused dependencies, we will create edges in both directions to + // represent the possibility of a cycle. 
+ + auto createConfusedEdges = [&](NodeType &Src, NodeType &Dst) { + if (!ForwardEdgeCreated) { + createMemoryEdge(Src, Dst); + ++TotalMemoryEdges; + } + if (!BackwardEdgeCreated) { + createMemoryEdge(Dst, Src); + ++TotalMemoryEdges; + } + ForwardEdgeCreated = BackwardEdgeCreated = true; + ++TotalConfusedEdges; + }; + + auto createForwardEdge = [&](NodeType &Src, NodeType &Dst) { + if (!ForwardEdgeCreated) { + createMemoryEdge(Src, Dst); + ++TotalMemoryEdges; + } + ForwardEdgeCreated = true; + }; + + auto createBackwardEdge = [&](NodeType &Src, NodeType &Dst) { + if (!BackwardEdgeCreated) { + createMemoryEdge(Dst, Src); + ++TotalMemoryEdges; + } + BackwardEdgeCreated = true; + }; + + if (D->isConfused()) + createConfusedEdges(**SrcIt, **DstIt); + else if (D->isOrdered() && !D->isLoopIndependent()) { + bool ReversedEdge = false; + for (unsigned Level = 1; Level <= D->getLevels(); ++Level) { + if (D->getDirection(Level) == Dependence::DVEntry::EQ) + continue; + else if (D->getDirection(Level) == Dependence::DVEntry::GT) { + createBackwardEdge(**SrcIt, **DstIt); + ReversedEdge = true; + ++TotalEdgeReversals; + break; + } else if (D->getDirection(Level) == Dependence::DVEntry::LT) + break; + else { + createConfusedEdges(**SrcIt, **DstIt); + break; + } + } + if (!ReversedEdge) + createForwardEdge(**SrcIt, **DstIt); + } else + createForwardEdge(**SrcIt, **DstIt); + + // Avoid creating duplicate edges. + if (ForwardEdgeCreated && BackwardEdgeCreated) + break; + } + + // If we've created edges in both directions, there is no more + // unique edge that we can create between these two nodes, so we + // can exit early. + if (ForwardEdgeCreated && BackwardEdgeCreated) + break; + } + } + } +} + +template class llvm::AbstractDependenceGraphBuilder; +template class llvm::DependenceGraphInfo; diff --git a/lib/Analysis/DivergenceAnalysis.cpp b/lib/Analysis/DivergenceAnalysis.cpp index 0ccd59ef2bfd..3d1be1e1cce0 100644 --- a/lib/Analysis/DivergenceAnalysis.cpp +++ b/lib/Analysis/DivergenceAnalysis.cpp @@ -412,6 +412,12 @@ bool DivergenceAnalysis::isDivergent(const Value &V) const { return DivergentValues.find(&V) != DivergentValues.end(); } +bool DivergenceAnalysis::isDivergentUse(const Use &U) const { + Value &V = *U.get(); + Instruction &I = *cast(U.getUser()); + return isDivergent(V) || isTemporalDivergent(*I.getParent(), V); +} + void DivergenceAnalysis::print(raw_ostream &OS, const Module *) const { if (DivergentValues.empty()) return; @@ -449,6 +455,10 @@ bool GPUDivergenceAnalysis::isDivergent(const Value &val) const { return DA.isDivergent(val); } +bool GPUDivergenceAnalysis::isDivergentUse(const Use &use) const { + return DA.isDivergentUse(use); +} + void GPUDivergenceAnalysis::print(raw_ostream &OS, const Module *mod) const { OS << "Divergence of kernel " << DA.getFunction().getName() << " {\n"; DA.print(OS, mod); diff --git a/lib/Analysis/GlobalsModRef.cpp b/lib/Analysis/GlobalsModRef.cpp index 0d6c0ffb18a8..efdf9706ba3c 100644 --- a/lib/Analysis/GlobalsModRef.cpp +++ b/lib/Analysis/GlobalsModRef.cpp @@ -370,7 +370,8 @@ bool GlobalsAAResult::AnalyzeUsesOfPointer(Value *V, // passing into the function. if (Call->isDataOperand(&U)) { // Detect calls to free. 
- if (Call->isArgOperand(&U) && isFreeCall(I, &TLI)) { + if (Call->isArgOperand(&U) && + isFreeCall(I, &GetTLI(*Call->getFunction()))) { if (Writers) Writers->insert(Call->getParent()->getParent()); } else { @@ -432,7 +433,7 @@ bool GlobalsAAResult::AnalyzeIndirectGlobalMemory(GlobalVariable *GV) { Value *Ptr = GetUnderlyingObject(SI->getOperand(0), GV->getParent()->getDataLayout()); - if (!isAllocLikeFn(Ptr, &TLI)) + if (!isAllocLikeFn(Ptr, &GetTLI(*SI->getFunction()))) return false; // Too hard to analyze. // Analyze all uses of the allocation. If any of them are used in a @@ -576,6 +577,7 @@ void GlobalsAAResult::AnalyzeCallGraph(CallGraph &CG, Module &M) { // We handle calls specially because the graph-relevant aspects are // handled above. if (auto *Call = dyn_cast(&I)) { + auto &TLI = GetTLI(*Node->getFunction()); if (isAllocationFn(Call, &TLI) || isFreeCall(Call, &TLI)) { // FIXME: It is completely unclear why this is necessary and not // handled by the above graph code. @@ -937,12 +939,13 @@ ModRefInfo GlobalsAAResult::getModRefInfo(const CallBase *Call, return intersectModRef(Known, AAResultBase::getModRefInfo(Call, Loc, AAQI)); } -GlobalsAAResult::GlobalsAAResult(const DataLayout &DL, - const TargetLibraryInfo &TLI) - : AAResultBase(), DL(DL), TLI(TLI) {} +GlobalsAAResult::GlobalsAAResult( + const DataLayout &DL, + std::function GetTLI) + : AAResultBase(), DL(DL), GetTLI(std::move(GetTLI)) {} GlobalsAAResult::GlobalsAAResult(GlobalsAAResult &&Arg) - : AAResultBase(std::move(Arg)), DL(Arg.DL), TLI(Arg.TLI), + : AAResultBase(std::move(Arg)), DL(Arg.DL), GetTLI(std::move(Arg.GetTLI)), NonAddressTakenGlobals(std::move(Arg.NonAddressTakenGlobals)), IndirectGlobals(std::move(Arg.IndirectGlobals)), AllocsForIndirectGlobals(std::move(Arg.AllocsForIndirectGlobals)), @@ -957,10 +960,10 @@ GlobalsAAResult::GlobalsAAResult(GlobalsAAResult &&Arg) GlobalsAAResult::~GlobalsAAResult() {} -/*static*/ GlobalsAAResult -GlobalsAAResult::analyzeModule(Module &M, const TargetLibraryInfo &TLI, - CallGraph &CG) { - GlobalsAAResult Result(M.getDataLayout(), TLI); +/*static*/ GlobalsAAResult GlobalsAAResult::analyzeModule( + Module &M, std::function GetTLI, + CallGraph &CG) { + GlobalsAAResult Result(M.getDataLayout(), GetTLI); // Discover which functions aren't recursive, to feed into AnalyzeGlobals. 
Result.CollectSCCMembership(CG); @@ -977,8 +980,12 @@ GlobalsAAResult::analyzeModule(Module &M, const TargetLibraryInfo &TLI, AnalysisKey GlobalsAA::Key; GlobalsAAResult GlobalsAA::run(Module &M, ModuleAnalysisManager &AM) { - return GlobalsAAResult::analyzeModule(M, - AM.getResult(M), + FunctionAnalysisManager &FAM = + AM.getResult(M).getManager(); + auto GetTLI = [&FAM](Function &F) -> TargetLibraryInfo & { + return FAM.getResult(F); + }; + return GlobalsAAResult::analyzeModule(M, GetTLI, AM.getResult(M)); } @@ -999,9 +1006,11 @@ GlobalsAAWrapperPass::GlobalsAAWrapperPass() : ModulePass(ID) { } bool GlobalsAAWrapperPass::runOnModule(Module &M) { + auto GetTLI = [this](Function &F) -> TargetLibraryInfo & { + return this->getAnalysis().getTLI(F); + }; Result.reset(new GlobalsAAResult(GlobalsAAResult::analyzeModule( - M, getAnalysis().getTLI(), - getAnalysis().getCallGraph()))); + M, GetTLI, getAnalysis().getCallGraph()))); return false; } diff --git a/lib/Analysis/IVDescriptors.cpp b/lib/Analysis/IVDescriptors.cpp index ce285f82f720..6fb600114bc6 100644 --- a/lib/Analysis/IVDescriptors.cpp +++ b/lib/Analysis/IVDescriptors.cpp @@ -300,7 +300,8 @@ bool RecurrenceDescriptor::AddReductionVar(PHINode *Phi, RecurrenceKind Kind, ReduxDesc = isRecurrenceInstr(Cur, Kind, ReduxDesc, HasFunNoNaNAttr); if (!ReduxDesc.isRecurrence()) return false; - if (isa(ReduxDesc.getPatternInst())) + // FIXME: FMF is allowed on phi, but propagation is not handled correctly. + if (isa(ReduxDesc.getPatternInst()) && !IsAPhi) FMF &= ReduxDesc.getPatternInst()->getFastMathFlags(); } diff --git a/lib/Analysis/IndirectCallPromotionAnalysis.cpp b/lib/Analysis/IndirectCallPromotionAnalysis.cpp index 6ff840efcb64..68153de8219f 100644 --- a/lib/Analysis/IndirectCallPromotionAnalysis.cpp +++ b/lib/Analysis/IndirectCallPromotionAnalysis.cpp @@ -53,7 +53,7 @@ static cl::opt "call callsite")); ICallPromotionAnalysis::ICallPromotionAnalysis() { - ValueDataArray = llvm::make_unique(MaxNumPromotions); + ValueDataArray = std::make_unique(MaxNumPromotions); } bool ICallPromotionAnalysis::isPromotionProfitable(uint64_t Count, diff --git a/lib/Analysis/InlineCost.cpp b/lib/Analysis/InlineCost.cpp index 0dec146e0465..89811ec0e377 100644 --- a/lib/Analysis/InlineCost.cpp +++ b/lib/Analysis/InlineCost.cpp @@ -436,7 +436,8 @@ bool CallAnalyzer::visitAlloca(AllocaInst &I) { if (auto *AllocSize = dyn_cast_or_null(Size)) { Type *Ty = I.getAllocatedType(); AllocatedSize = SaturatingMultiplyAdd( - AllocSize->getLimitedValue(), DL.getTypeAllocSize(Ty), AllocatedSize); + AllocSize->getLimitedValue(), DL.getTypeAllocSize(Ty).getFixedSize(), + AllocatedSize); return Base::visitAlloca(I); } } @@ -444,7 +445,8 @@ bool CallAnalyzer::visitAlloca(AllocaInst &I) { // Accumulate the allocated size. if (I.isStaticAlloca()) { Type *Ty = I.getAllocatedType(); - AllocatedSize = SaturatingAdd(DL.getTypeAllocSize(Ty), AllocatedSize); + AllocatedSize = SaturatingAdd(DL.getTypeAllocSize(Ty).getFixedSize(), + AllocatedSize); } // We will happily inline static alloca instructions. @@ -1070,8 +1072,8 @@ bool CallAnalyzer::visitBinaryOperator(BinaryOperator &I) { Value *SimpleV = nullptr; if (auto FI = dyn_cast(&I)) - SimpleV = SimplifyFPBinOp(I.getOpcode(), CLHS ? CLHS : LHS, - CRHS ? CRHS : RHS, FI->getFastMathFlags(), DL); + SimpleV = SimplifyBinOp(I.getOpcode(), CLHS ? CLHS : LHS, + CRHS ? CRHS : RHS, FI->getFastMathFlags(), DL); else SimpleV = SimplifyBinOp(I.getOpcode(), CLHS ? CLHS : LHS, CRHS ? 
CRHS : RHS, DL); @@ -1453,19 +1455,6 @@ bool CallAnalyzer::visitSwitchInst(SwitchInst &SI) { // Maximum valid cost increased in this function. int CostUpperBound = INT_MAX - InlineConstants::InstrCost - 1; - // Exit early for a large switch, assuming one case needs at least one - // instruction. - // FIXME: This is not true for a bit test, but ignore such case for now to - // save compile-time. - int64_t CostLowerBound = - std::min((int64_t)CostUpperBound, - (int64_t)SI.getNumCases() * InlineConstants::InstrCost + Cost); - - if (CostLowerBound > Threshold && !ComputeFullInlineCost) { - addCost((int64_t)SI.getNumCases() * InlineConstants::InstrCost); - return false; - } - unsigned JumpTableSize = 0; unsigned NumCaseCluster = TTI.getEstimatedNumberOfCaseClusters(SI, JumpTableSize); diff --git a/lib/Analysis/InstructionSimplify.cpp b/lib/Analysis/InstructionSimplify.cpp index e34bf6f4e43f..cb8987721700 100644 --- a/lib/Analysis/InstructionSimplify.cpp +++ b/lib/Analysis/InstructionSimplify.cpp @@ -56,8 +56,8 @@ static Value *simplifyFPUnOp(unsigned, Value *, const FastMathFlags &, const SimplifyQuery &, unsigned); static Value *SimplifyBinOp(unsigned, Value *, Value *, const SimplifyQuery &, unsigned); -static Value *SimplifyFPBinOp(unsigned, Value *, Value *, const FastMathFlags &, - const SimplifyQuery &, unsigned); +static Value *SimplifyBinOp(unsigned, Value *, Value *, const FastMathFlags &, + const SimplifyQuery &, unsigned); static Value *SimplifyCmpInst(unsigned, Value *, Value *, const SimplifyQuery &, unsigned); static Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS, @@ -1371,7 +1371,8 @@ Value *llvm::SimplifyAShrInst(Value *Op0, Value *Op1, bool isExact, /// Commuted variants are assumed to be handled by calling this function again /// with the parameters swapped. static Value *simplifyUnsignedRangeCheck(ICmpInst *ZeroICmp, - ICmpInst *UnsignedICmp, bool IsAnd) { + ICmpInst *UnsignedICmp, bool IsAnd, + const SimplifyQuery &Q) { Value *X, *Y; ICmpInst::Predicate EqPred; @@ -1380,6 +1381,59 @@ static Value *simplifyUnsignedRangeCheck(ICmpInst *ZeroICmp, return nullptr; ICmpInst::Predicate UnsignedPred; + + Value *A, *B; + // Y = (A - B); + if (match(Y, m_Sub(m_Value(A), m_Value(B)))) { + if (match(UnsignedICmp, + m_c_ICmp(UnsignedPred, m_Specific(A), m_Specific(B))) && + ICmpInst::isUnsigned(UnsignedPred)) { + if (UnsignedICmp->getOperand(0) != A) + UnsignedPred = ICmpInst::getSwappedPredicate(UnsignedPred); + + // A >=/<= B || (A - B) != 0 <--> true + if ((UnsignedPred == ICmpInst::ICMP_UGE || + UnsignedPred == ICmpInst::ICMP_ULE) && + EqPred == ICmpInst::ICMP_NE && !IsAnd) + return ConstantInt::getTrue(UnsignedICmp->getType()); + // A B && (A - B) == 0 <--> false + if ((UnsignedPred == ICmpInst::ICMP_ULT || + UnsignedPred == ICmpInst::ICMP_UGT) && + EqPred == ICmpInst::ICMP_EQ && IsAnd) + return ConstantInt::getFalse(UnsignedICmp->getType()); + + // A B && (A - B) != 0 <--> A B + // A B || (A - B) != 0 <--> (A - B) != 0 + if (EqPred == ICmpInst::ICMP_NE && (UnsignedPred == ICmpInst::ICMP_ULT || + UnsignedPred == ICmpInst::ICMP_UGT)) + return IsAnd ? UnsignedICmp : ZeroICmp; + + // A <=/>= B && (A - B) == 0 <--> (A - B) == 0 + // A <=/>= B || (A - B) == 0 <--> A <=/>= B + if (EqPred == ICmpInst::ICMP_EQ && (UnsignedPred == ICmpInst::ICMP_ULE || + UnsignedPred == ICmpInst::ICMP_UGE)) + return IsAnd ? 
ZeroICmp : UnsignedICmp; + } + + // Given Y = (A - B) + // Y >= A && Y != 0 --> Y >= A iff B != 0 + // Y < A || Y == 0 --> Y < A iff B != 0 + if (match(UnsignedICmp, + m_c_ICmp(UnsignedPred, m_Specific(Y), m_Specific(A)))) { + if (UnsignedICmp->getOperand(0) != Y) + UnsignedPred = ICmpInst::getSwappedPredicate(UnsignedPred); + + if (UnsignedPred == ICmpInst::ICMP_UGE && IsAnd && + EqPred == ICmpInst::ICMP_NE && + isKnownNonZero(B, Q.DL, /*Depth=*/0, Q.AC, Q.CxtI, Q.DT)) + return UnsignedICmp; + if (UnsignedPred == ICmpInst::ICMP_ULT && !IsAnd && + EqPred == ICmpInst::ICMP_EQ && + isKnownNonZero(B, Q.DL, /*Depth=*/0, Q.AC, Q.CxtI, Q.DT)) + return UnsignedICmp; + } + } + if (match(UnsignedICmp, m_ICmp(UnsignedPred, m_Value(X), m_Specific(Y))) && ICmpInst::isUnsigned(UnsignedPred)) ; @@ -1395,19 +1449,33 @@ static Value *simplifyUnsignedRangeCheck(ICmpInst *ZeroICmp, if (UnsignedPred == ICmpInst::ICMP_ULT && EqPred == ICmpInst::ICMP_NE) return IsAnd ? UnsignedICmp : ZeroICmp; - // X >= Y || Y != 0 --> true + // X <= Y && Y != 0 --> X <= Y iff X != 0 + // X <= Y || Y != 0 --> Y != 0 iff X != 0 + if (UnsignedPred == ICmpInst::ICMP_ULE && EqPred == ICmpInst::ICMP_NE && + isKnownNonZero(X, Q.DL, /*Depth=*/0, Q.AC, Q.CxtI, Q.DT)) + return IsAnd ? UnsignedICmp : ZeroICmp; + + // X >= Y && Y == 0 --> Y == 0 // X >= Y || Y == 0 --> X >= Y - if (UnsignedPred == ICmpInst::ICMP_UGE && !IsAnd) { - if (EqPred == ICmpInst::ICMP_NE) - return getTrue(UnsignedICmp->getType()); - return UnsignedICmp; - } + if (UnsignedPred == ICmpInst::ICMP_UGE && EqPred == ICmpInst::ICMP_EQ) + return IsAnd ? ZeroICmp : UnsignedICmp; + + // X > Y && Y == 0 --> Y == 0 iff X != 0 + // X > Y || Y == 0 --> X > Y iff X != 0 + if (UnsignedPred == ICmpInst::ICMP_UGT && EqPred == ICmpInst::ICMP_EQ && + isKnownNonZero(X, Q.DL, /*Depth=*/0, Q.AC, Q.CxtI, Q.DT)) + return IsAnd ? 
ZeroICmp : UnsignedICmp; // X < Y && Y == 0 --> false if (UnsignedPred == ICmpInst::ICMP_ULT && EqPred == ICmpInst::ICMP_EQ && IsAnd) return getFalse(UnsignedICmp->getType()); + // X >= Y || Y != 0 --> true + if (UnsignedPred == ICmpInst::ICMP_UGE && EqPred == ICmpInst::ICMP_NE && + !IsAnd) + return getTrue(UnsignedICmp->getType()); + return nullptr; } @@ -1587,10 +1655,10 @@ static Value *simplifyAndOfICmpsWithAdd(ICmpInst *Op0, ICmpInst *Op1, } static Value *simplifyAndOfICmps(ICmpInst *Op0, ICmpInst *Op1, - const InstrInfoQuery &IIQ) { - if (Value *X = simplifyUnsignedRangeCheck(Op0, Op1, /*IsAnd=*/true)) + const SimplifyQuery &Q) { + if (Value *X = simplifyUnsignedRangeCheck(Op0, Op1, /*IsAnd=*/true, Q)) return X; - if (Value *X = simplifyUnsignedRangeCheck(Op1, Op0, /*IsAnd=*/true)) + if (Value *X = simplifyUnsignedRangeCheck(Op1, Op0, /*IsAnd=*/true, Q)) return X; if (Value *X = simplifyAndOfICmpsWithSameOperands(Op0, Op1)) @@ -1604,9 +1672,9 @@ static Value *simplifyAndOfICmps(ICmpInst *Op0, ICmpInst *Op1, if (Value *X = simplifyAndOrOfICmpsWithZero(Op0, Op1, true)) return X; - if (Value *X = simplifyAndOfICmpsWithAdd(Op0, Op1, IIQ)) + if (Value *X = simplifyAndOfICmpsWithAdd(Op0, Op1, Q.IIQ)) return X; - if (Value *X = simplifyAndOfICmpsWithAdd(Op1, Op0, IIQ)) + if (Value *X = simplifyAndOfICmpsWithAdd(Op1, Op0, Q.IIQ)) return X; return nullptr; @@ -1660,10 +1728,10 @@ static Value *simplifyOrOfICmpsWithAdd(ICmpInst *Op0, ICmpInst *Op1, } static Value *simplifyOrOfICmps(ICmpInst *Op0, ICmpInst *Op1, - const InstrInfoQuery &IIQ) { - if (Value *X = simplifyUnsignedRangeCheck(Op0, Op1, /*IsAnd=*/false)) + const SimplifyQuery &Q) { + if (Value *X = simplifyUnsignedRangeCheck(Op0, Op1, /*IsAnd=*/false, Q)) return X; - if (Value *X = simplifyUnsignedRangeCheck(Op1, Op0, /*IsAnd=*/false)) + if (Value *X = simplifyUnsignedRangeCheck(Op1, Op0, /*IsAnd=*/false, Q)) return X; if (Value *X = simplifyOrOfICmpsWithSameOperands(Op0, Op1)) @@ -1677,9 +1745,9 @@ static Value *simplifyOrOfICmps(ICmpInst *Op0, ICmpInst *Op1, if (Value *X = simplifyAndOrOfICmpsWithZero(Op0, Op1, false)) return X; - if (Value *X = simplifyOrOfICmpsWithAdd(Op0, Op1, IIQ)) + if (Value *X = simplifyOrOfICmpsWithAdd(Op0, Op1, Q.IIQ)) return X; - if (Value *X = simplifyOrOfICmpsWithAdd(Op1, Op0, IIQ)) + if (Value *X = simplifyOrOfICmpsWithAdd(Op1, Op0, Q.IIQ)) return X; return nullptr; @@ -1738,8 +1806,8 @@ static Value *simplifyAndOrOfCmps(const SimplifyQuery &Q, auto *ICmp0 = dyn_cast(Op0); auto *ICmp1 = dyn_cast(Op1); if (ICmp0 && ICmp1) - V = IsAnd ? simplifyAndOfICmps(ICmp0, ICmp1, Q.IIQ) - : simplifyOrOfICmps(ICmp0, ICmp1, Q.IIQ); + V = IsAnd ? simplifyAndOfICmps(ICmp0, ICmp1, Q) + : simplifyOrOfICmps(ICmp0, ICmp1, Q); auto *FCmp0 = dyn_cast(Op0); auto *FCmp1 = dyn_cast(Op1); @@ -1759,6 +1827,77 @@ static Value *simplifyAndOrOfCmps(const SimplifyQuery &Q, return nullptr; } +/// Check that the Op1 is in expected form, i.e.: +/// %Agg = tail call { i4, i1 } @llvm.[us]mul.with.overflow.i4(i4 %X, i4 %???) +/// %Op1 = extractvalue { i4, i1 } %Agg, 1 +static bool omitCheckForZeroBeforeMulWithOverflowInternal(Value *Op1, + Value *X) { + auto *Extract = dyn_cast(Op1); + // We should only be extracting the overflow bit. + if (!Extract || !Extract->getIndices().equals(1)) + return false; + Value *Agg = Extract->getAggregateOperand(); + // This should be a multiplication-with-overflow intrinsic. 
+ if (!match(Agg, m_CombineOr(m_Intrinsic(), + m_Intrinsic()))) + return false; + // One of its multipliers should be the value we checked for zero before. + if (!match(Agg, m_CombineOr(m_Argument<0>(m_Specific(X)), + m_Argument<1>(m_Specific(X))))) + return false; + return true; +} + +/// The @llvm.[us]mul.with.overflow intrinsic could have been folded from some +/// other form of check, e.g. one that was using division; it may have been +/// guarded against division-by-zero. We can drop that check now. +/// Look for: +/// %Op0 = icmp ne i4 %X, 0 +/// %Agg = tail call { i4, i1 } @llvm.[us]mul.with.overflow.i4(i4 %X, i4 %???) +/// %Op1 = extractvalue { i4, i1 } %Agg, 1 +/// %??? = and i1 %Op0, %Op1 +/// We can just return %Op1 +static Value *omitCheckForZeroBeforeMulWithOverflow(Value *Op0, Value *Op1) { + ICmpInst::Predicate Pred; + Value *X; + if (!match(Op0, m_ICmp(Pred, m_Value(X), m_Zero())) || + Pred != ICmpInst::Predicate::ICMP_NE) + return nullptr; + // Is Op1 in expected form? + if (!omitCheckForZeroBeforeMulWithOverflowInternal(Op1, X)) + return nullptr; + // Can omit 'and', and just return the overflow bit. + return Op1; +} + +/// The @llvm.[us]mul.with.overflow intrinsic could have been folded from some +/// other form of check, e.g. one that was using division; it may have been +/// guarded against division-by-zero. We can drop that check now. +/// Look for: +/// %Op0 = icmp eq i4 %X, 0 +/// %Agg = tail call { i4, i1 } @llvm.[us]mul.with.overflow.i4(i4 %X, i4 %???) +/// %Op1 = extractvalue { i4, i1 } %Agg, 1 +/// %NotOp1 = xor i1 %Op1, true +/// %or = or i1 %Op0, %NotOp1 +/// We can just return %NotOp1 +static Value *omitCheckForZeroBeforeInvertedMulWithOverflow(Value *Op0, + Value *NotOp1) { + ICmpInst::Predicate Pred; + Value *X; + if (!match(Op0, m_ICmp(Pred, m_Value(X), m_Zero())) || + Pred != ICmpInst::Predicate::ICMP_EQ) + return nullptr; + // We expect the other hand of an 'or' to be a 'not'. + Value *Op1; + if (!match(NotOp1, m_Not(m_Value(Op1)))) + return nullptr; + // Is Op1 in expected form? + if (!omitCheckForZeroBeforeMulWithOverflowInternal(Op1, X)) + return nullptr; + // Can omit 'and', and just return the inverted overflow bit. + return NotOp1; +} + /// Given operands for an And, see if we can fold the result. /// If not, this returns null. static Value *SimplifyAndInst(Value *Op0, Value *Op1, const SimplifyQuery &Q, @@ -1813,6 +1952,14 @@ static Value *SimplifyAndInst(Value *Op0, Value *Op1, const SimplifyQuery &Q, return Op0; } + // If we have a multiplication overflow check that is being 'and'ed with a + // check that one of the multipliers is not zero, we can omit the 'and', and + // only keep the overflow check. + if (Value *V = omitCheckForZeroBeforeMulWithOverflow(Op0, Op1)) + return V; + if (Value *V = omitCheckForZeroBeforeMulWithOverflow(Op1, Op0)) + return V; + // A & (-A) = A if A is a power of two or zero. if (match(Op0, m_Neg(m_Specific(Op1))) || match(Op1, m_Neg(m_Specific(Op0)))) { @@ -1987,6 +2134,14 @@ static Value *SimplifyOrInst(Value *Op0, Value *Op1, const SimplifyQuery &Q, if (Value *V = simplifyAndOrOfCmps(Q, Op0, Op1, false)) return V; + // If we have a multiplication overflow check that is being 'and'ed with a + // check that one of the multipliers is not zero, we can omit the 'and', and + // only keep the overflow check. 
+ if (Value *V = omitCheckForZeroBeforeInvertedMulWithOverflow(Op0, Op1)) + return V; + if (Value *V = omitCheckForZeroBeforeInvertedMulWithOverflow(Op1, Op0)) + return V; + // Try some generic simplifications for associative operations. if (Value *V = SimplifyAssociativeBinOp(Instruction::Or, Op0, Op1, Q, MaxRecurse)) @@ -3529,6 +3684,9 @@ static const Value *SimplifyWithOpReplaced(Value *V, Value *Op, Value *RepOp, // %sel = select i1 %cmp, i32 -2147483648, i32 %add // // We can't replace %sel with %add unless we strip away the flags. + // TODO: This is an unusual limitation because better analysis results in + // worse simplification. InstCombine can do this fold more generally + // by dropping the flags. Remove this fold to save compile-time? if (isa(B)) if (Q.IIQ.hasNoSignedWrap(B) || Q.IIQ.hasNoUnsignedWrap(B)) return nullptr; @@ -4324,14 +4482,16 @@ static Constant *propagateNaN(Constant *In) { return In; } -static Constant *simplifyFPBinop(Value *Op0, Value *Op1) { - if (isa(Op0) || isa(Op1)) - return ConstantFP::getNaN(Op0->getType()); +/// Perform folds that are common to any floating-point operation. This implies +/// transforms based on undef/NaN because the operation itself makes no +/// difference to the result. +static Constant *simplifyFPOp(ArrayRef Ops) { + if (any_of(Ops, [](Value *V) { return isa(V); })) + return ConstantFP::getNaN(Ops[0]->getType()); - if (match(Op0, m_NaN())) - return propagateNaN(cast(Op0)); - if (match(Op1, m_NaN())) - return propagateNaN(cast(Op1)); + for (Value *V : Ops) + if (match(V, m_NaN())) + return propagateNaN(cast(V)); return nullptr; } @@ -4343,7 +4503,7 @@ static Value *SimplifyFAddInst(Value *Op0, Value *Op1, FastMathFlags FMF, if (Constant *C = foldOrCommuteConstant(Instruction::FAdd, Op0, Op1, Q)) return C; - if (Constant *C = simplifyFPBinop(Op0, Op1)) + if (Constant *C = simplifyFPOp({Op0, Op1})) return C; // fadd X, -0 ==> X @@ -4390,7 +4550,7 @@ static Value *SimplifyFSubInst(Value *Op0, Value *Op1, FastMathFlags FMF, if (Constant *C = foldOrCommuteConstant(Instruction::FSub, Op0, Op1, Q)) return C; - if (Constant *C = simplifyFPBinop(Op0, Op1)) + if (Constant *C = simplifyFPOp({Op0, Op1})) return C; // fsub X, +0 ==> X @@ -4430,23 +4590,27 @@ static Value *SimplifyFSubInst(Value *Op0, Value *Op1, FastMathFlags FMF, return nullptr; } -/// Given the operands for an FMul, see if we can fold the result -static Value *SimplifyFMulInst(Value *Op0, Value *Op1, FastMathFlags FMF, - const SimplifyQuery &Q, unsigned MaxRecurse) { - if (Constant *C = foldOrCommuteConstant(Instruction::FMul, Op0, Op1, Q)) - return C; - - if (Constant *C = simplifyFPBinop(Op0, Op1)) +static Value *SimplifyFMAFMul(Value *Op0, Value *Op1, FastMathFlags FMF, + const SimplifyQuery &Q, unsigned MaxRecurse) { + if (Constant *C = simplifyFPOp({Op0, Op1})) return C; // fmul X, 1.0 ==> X if (match(Op1, m_FPOne())) return Op0; + // fmul 1.0, X ==> X + if (match(Op0, m_FPOne())) + return Op1; + // fmul nnan nsz X, 0 ==> 0 if (FMF.noNaNs() && FMF.noSignedZeros() && match(Op1, m_AnyZeroFP())) return ConstantFP::getNullValue(Op0->getType()); + // fmul nnan nsz 0, X ==> 0 + if (FMF.noNaNs() && FMF.noSignedZeros() && match(Op0, m_AnyZeroFP())) + return ConstantFP::getNullValue(Op1->getType()); + // sqrt(X) * sqrt(X) --> X, if we can: // 1. Remove the intermediate rounding (reassociate). // 2. Ignore non-zero negative numbers because sqrt would produce NAN. 
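// Illustrative sketch, not from the patch: the arithmetic fact that the
// omitCheckForZeroBeforeMulWithOverflow folds above rely on. A 'X != 0' guard
// next to a multiplication-overflow check is redundant, because a zero
// multiplier can never overflow. __builtin_mul_overflow is used here only as a
// stand-in for @llvm.umul.with.overflow.
#include <cstdint>

bool guardedOverflowCheck(uint32_t X, uint32_t Y) {
  uint32_t Product;
  bool Overflow = __builtin_mul_overflow(X, Y, &Product);
  return X != 0 && Overflow; // If X == 0 the product cannot overflow...
}

bool simplifiedOverflowCheck(uint32_t X, uint32_t Y) {
  uint32_t Product;
  return __builtin_mul_overflow(X, Y, &Product); // ...so only this remains.
}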
@@ -4459,6 +4623,16 @@ static Value *SimplifyFMulInst(Value *Op0, Value *Op1, FastMathFlags FMF, return nullptr; } +/// Given the operands for an FMul, see if we can fold the result +static Value *SimplifyFMulInst(Value *Op0, Value *Op1, FastMathFlags FMF, + const SimplifyQuery &Q, unsigned MaxRecurse) { + if (Constant *C = foldOrCommuteConstant(Instruction::FMul, Op0, Op1, Q)) + return C; + + // Now apply simplifications that do not require rounding. + return SimplifyFMAFMul(Op0, Op1, FMF, Q, MaxRecurse); +} + Value *llvm::SimplifyFAddInst(Value *Op0, Value *Op1, FastMathFlags FMF, const SimplifyQuery &Q) { return ::SimplifyFAddInst(Op0, Op1, FMF, Q, RecursionLimit); @@ -4475,12 +4649,17 @@ Value *llvm::SimplifyFMulInst(Value *Op0, Value *Op1, FastMathFlags FMF, return ::SimplifyFMulInst(Op0, Op1, FMF, Q, RecursionLimit); } +Value *llvm::SimplifyFMAFMul(Value *Op0, Value *Op1, FastMathFlags FMF, + const SimplifyQuery &Q) { + return ::SimplifyFMAFMul(Op0, Op1, FMF, Q, RecursionLimit); +} + static Value *SimplifyFDivInst(Value *Op0, Value *Op1, FastMathFlags FMF, const SimplifyQuery &Q, unsigned) { if (Constant *C = foldOrCommuteConstant(Instruction::FDiv, Op0, Op1, Q)) return C; - if (Constant *C = simplifyFPBinop(Op0, Op1)) + if (Constant *C = simplifyFPOp({Op0, Op1})) return C; // X / 1.0 -> X @@ -4525,7 +4704,7 @@ static Value *SimplifyFRemInst(Value *Op0, Value *Op1, FastMathFlags FMF, if (Constant *C = foldOrCommuteConstant(Instruction::FRem, Op0, Op1, Q)) return C; - if (Constant *C = simplifyFPBinop(Op0, Op1)) + if (Constant *C = simplifyFPOp({Op0, Op1})) return C; // Unlike fdiv, the result of frem always matches the sign of the dividend. @@ -4564,8 +4743,7 @@ static Value *simplifyUnOp(unsigned Opcode, Value *Op, const SimplifyQuery &Q, /// Given the operand for a UnaryOperator, see if we can fold the result. /// If not, this returns null. -/// In contrast to SimplifyUnOp, try to use FastMathFlag when folding the -/// result. In case we don't need FastMathFlags, simply fall to SimplifyUnOp. +/// Try to use FastMathFlags when folding the result. static Value *simplifyFPUnOp(unsigned Opcode, Value *Op, const FastMathFlags &FMF, const SimplifyQuery &Q, unsigned MaxRecurse) { @@ -4581,8 +4759,8 @@ Value *llvm::SimplifyUnOp(unsigned Opcode, Value *Op, const SimplifyQuery &Q) { return ::simplifyUnOp(Opcode, Op, Q, RecursionLimit); } -Value *llvm::SimplifyFPUnOp(unsigned Opcode, Value *Op, FastMathFlags FMF, - const SimplifyQuery &Q) { +Value *llvm::SimplifyUnOp(unsigned Opcode, Value *Op, FastMathFlags FMF, + const SimplifyQuery &Q) { return ::simplifyFPUnOp(Opcode, Op, FMF, Q, RecursionLimit); } @@ -4634,11 +4812,10 @@ static Value *SimplifyBinOp(unsigned Opcode, Value *LHS, Value *RHS, /// Given operands for a BinaryOperator, see if we can fold the result. /// If not, this returns null. -/// In contrast to SimplifyBinOp, try to use FastMathFlag when folding the -/// result. In case we don't need FastMathFlags, simply fall to SimplifyBinOp. -static Value *SimplifyFPBinOp(unsigned Opcode, Value *LHS, Value *RHS, - const FastMathFlags &FMF, const SimplifyQuery &Q, - unsigned MaxRecurse) { +/// Try to use FastMathFlags when folding the result. 
+static Value *SimplifyBinOp(unsigned Opcode, Value *LHS, Value *RHS, + const FastMathFlags &FMF, const SimplifyQuery &Q, + unsigned MaxRecurse) { switch (Opcode) { case Instruction::FAdd: return SimplifyFAddInst(LHS, RHS, FMF, Q, MaxRecurse); @@ -4658,9 +4835,9 @@ Value *llvm::SimplifyBinOp(unsigned Opcode, Value *LHS, Value *RHS, return ::SimplifyBinOp(Opcode, LHS, RHS, Q, RecursionLimit); } -Value *llvm::SimplifyFPBinOp(unsigned Opcode, Value *LHS, Value *RHS, - FastMathFlags FMF, const SimplifyQuery &Q) { - return ::SimplifyFPBinOp(Opcode, LHS, RHS, FMF, Q, RecursionLimit); +Value *llvm::SimplifyBinOp(unsigned Opcode, Value *LHS, Value *RHS, + FastMathFlags FMF, const SimplifyQuery &Q) { + return ::SimplifyBinOp(Opcode, LHS, RHS, FMF, Q, RecursionLimit); } /// Given operands for a CmpInst, see if we can fold the result. @@ -5009,6 +5186,15 @@ static Value *simplifyIntrinsic(CallBase *Call, const SimplifyQuery &Q) { } return nullptr; } + case Intrinsic::fma: + case Intrinsic::fmuladd: { + Value *Op0 = Call->getArgOperand(0); + Value *Op1 = Call->getArgOperand(1); + Value *Op2 = Call->getArgOperand(2); + if (Value *V = simplifyFPOp({ Op0, Op1, Op2 })) + return V; + return nullptr; + } default: return nullptr; } @@ -5221,14 +5407,16 @@ Value *llvm::SimplifyInstruction(Instruction *I, const SimplifyQuery &SQ, /// If we have a pre-simplified value in 'SimpleV', that is forcibly used to /// replace the instruction 'I'. Otherwise, we simply add 'I' to the list of /// instructions to process and attempt to simplify it using -/// InstructionSimplify. +/// InstructionSimplify. Recursively visited users which could not be +/// simplified themselves are to the optional UnsimplifiedUsers set for +/// further processing by the caller. /// /// This routine returns 'true' only when *it* simplifies something. The passed /// in simplified value does not count toward this. -static bool replaceAndRecursivelySimplifyImpl(Instruction *I, Value *SimpleV, - const TargetLibraryInfo *TLI, - const DominatorTree *DT, - AssumptionCache *AC) { +static bool replaceAndRecursivelySimplifyImpl( + Instruction *I, Value *SimpleV, const TargetLibraryInfo *TLI, + const DominatorTree *DT, AssumptionCache *AC, + SmallSetVector *UnsimplifiedUsers = nullptr) { bool Simplified = false; SmallSetVector Worklist; const DataLayout &DL = I->getModule()->getDataLayout(); @@ -5258,8 +5446,11 @@ static bool replaceAndRecursivelySimplifyImpl(Instruction *I, Value *SimpleV, // See if this instruction simplifies. 
SimpleV = SimplifyInstruction(I, {DL, TLI, DT, AC}); - if (!SimpleV) + if (!SimpleV) { + if (UnsimplifiedUsers) + UnsimplifiedUsers->insert(I); continue; + } Simplified = true; @@ -5285,16 +5476,17 @@ bool llvm::recursivelySimplifyInstruction(Instruction *I, const TargetLibraryInfo *TLI, const DominatorTree *DT, AssumptionCache *AC) { - return replaceAndRecursivelySimplifyImpl(I, nullptr, TLI, DT, AC); + return replaceAndRecursivelySimplifyImpl(I, nullptr, TLI, DT, AC, nullptr); } -bool llvm::replaceAndRecursivelySimplify(Instruction *I, Value *SimpleV, - const TargetLibraryInfo *TLI, - const DominatorTree *DT, - AssumptionCache *AC) { +bool llvm::replaceAndRecursivelySimplify( + Instruction *I, Value *SimpleV, const TargetLibraryInfo *TLI, + const DominatorTree *DT, AssumptionCache *AC, + SmallSetVector *UnsimplifiedUsers) { assert(I != SimpleV && "replaceAndRecursivelySimplify(X,X) is not valid!"); assert(SimpleV && "Must provide a simplified value."); - return replaceAndRecursivelySimplifyImpl(I, SimpleV, TLI, DT, AC); + return replaceAndRecursivelySimplifyImpl(I, SimpleV, TLI, DT, AC, + UnsimplifiedUsers); } namespace llvm { @@ -5302,7 +5494,7 @@ const SimplifyQuery getBestSimplifyQuery(Pass &P, Function &F) { auto *DTWP = P.getAnalysisIfAvailable(); auto *DT = DTWP ? &DTWP->getDomTree() : nullptr; auto *TLIWP = P.getAnalysisIfAvailable(); - auto *TLI = TLIWP ? &TLIWP->getTLI() : nullptr; + auto *TLI = TLIWP ? &TLIWP->getTLI(F) : nullptr; auto *ACWP = P.getAnalysisIfAvailable(); auto *AC = ACWP ? &ACWP->getAssumptionCache(F) : nullptr; return {F.getParent()->getDataLayout(), TLI, DT, AC}; diff --git a/lib/Analysis/LazyBranchProbabilityInfo.cpp b/lib/Analysis/LazyBranchProbabilityInfo.cpp index f2592c26b373..e727de468a0d 100644 --- a/lib/Analysis/LazyBranchProbabilityInfo.cpp +++ b/lib/Analysis/LazyBranchProbabilityInfo.cpp @@ -55,8 +55,9 @@ void LazyBranchProbabilityInfoPass::releaseMemory() { LBPI.reset(); } bool LazyBranchProbabilityInfoPass::runOnFunction(Function &F) { LoopInfo &LI = getAnalysis().getLoopInfo(); - TargetLibraryInfo &TLI = getAnalysis().getTLI(); - LBPI = llvm::make_unique(&F, &LI, &TLI); + TargetLibraryInfo &TLI = + getAnalysis().getTLI(F); + LBPI = std::make_unique(&F, &LI, &TLI); return false; } diff --git a/lib/Analysis/LazyCallGraph.cpp b/lib/Analysis/LazyCallGraph.cpp index 797fcf516429..ef31c1e0ba8c 100644 --- a/lib/Analysis/LazyCallGraph.cpp +++ b/lib/Analysis/LazyCallGraph.cpp @@ -150,7 +150,8 @@ static bool isKnownLibFunction(Function &F, TargetLibraryInfo &TLI) { return TLI.getLibFunc(F, LF) || TLI.isFunctionVectorizable(F.getName()); } -LazyCallGraph::LazyCallGraph(Module &M, TargetLibraryInfo &TLI) { +LazyCallGraph::LazyCallGraph( + Module &M, function_ref GetTLI) { LLVM_DEBUG(dbgs() << "Building CG for module: " << M.getModuleIdentifier() << "\n"); for (Function &F : M) { @@ -159,7 +160,7 @@ LazyCallGraph::LazyCallGraph(Module &M, TargetLibraryInfo &TLI) { // If this function is a known lib function to LLVM then we want to // synthesize reference edges to it to model the fact that LLVM can turn // arbitrary code into a library function call. - if (isKnownLibFunction(F, TLI)) + if (isKnownLibFunction(F, GetTLI(F))) LibFunctions.insert(&F); if (F.hasLocalLinkage()) @@ -631,7 +632,7 @@ LazyCallGraph::RefSCC::switchInternalEdgeToCall( // If the merge range is empty, then adding the edge didn't actually form any // new cycles. We're done. 
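// Illustrative sketch of the recurring API change in the GlobalsAA,
// LazyCallGraph and LazyValueInfo hunks above: analyses now take a
// per-function TargetLibraryInfo callback instead of a single module-wide TLI
// reference. The types below are simplified stand-ins, not the real LLVM
// classes.
#include <functional>

namespace sketch {
struct Function {};
struct TargetLibraryInfo {};

class AnalysisResult {
  std::function<TargetLibraryInfo &(Function &)> GetTLI;

public:
  explicit AnalysisResult(std::function<TargetLibraryInfo &(Function &)> GetTLI)
      : GetTLI(std::move(GetTLI)) {}

  void visit(Function &F) {
    // The TLI is looked up lazily, per function, at the point of use.
    TargetLibraryInfo &TLI = GetTLI(F);
    (void)TLI;
  }
};
} // namespace sketch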
- if (empty(MergeRange)) { + if (MergeRange.empty()) { // Now that the SCC structure is finalized, flip the kind to call. SourceN->setEdgeKind(TargetN, Edge::Call); return false; // No new cycle. @@ -1751,16 +1752,14 @@ static void printNode(raw_ostream &OS, LazyCallGraph::Node &N) { } static void printSCC(raw_ostream &OS, LazyCallGraph::SCC &C) { - ptrdiff_t Size = size(C); - OS << " SCC with " << Size << " functions:\n"; + OS << " SCC with " << C.size() << " functions:\n"; for (LazyCallGraph::Node &N : C) OS << " " << N.getFunction().getName() << "\n"; } static void printRefSCC(raw_ostream &OS, LazyCallGraph::RefSCC &C) { - ptrdiff_t Size = size(C); - OS << " RefSCC with " << Size << " call SCCs:\n"; + OS << " RefSCC with " << C.size() << " call SCCs:\n"; for (LazyCallGraph::SCC &InnerC : C) printSCC(OS, InnerC); diff --git a/lib/Analysis/LazyValueInfo.cpp b/lib/Analysis/LazyValueInfo.cpp index 542ff709d475..96722f32e355 100644 --- a/lib/Analysis/LazyValueInfo.cpp +++ b/lib/Analysis/LazyValueInfo.cpp @@ -188,7 +188,7 @@ namespace { else { auto It = ValueCache.find_as(Val); if (It == ValueCache.end()) { - ValueCache[Val] = make_unique(Val, this); + ValueCache[Val] = std::make_unique(Val, this); It = ValueCache.find_as(Val); assert(It != ValueCache.end() && "Val was just added to the map!"); } @@ -434,6 +434,8 @@ namespace { ValueLatticeElement &BBLV, WithOverflowInst *WO, BasicBlock *BB); bool solveBlockValueIntrinsic(ValueLatticeElement &BBLV, IntrinsicInst *II, BasicBlock *BB); + bool solveBlockValueExtractValue(ValueLatticeElement &BBLV, + ExtractValueInst *EVI, BasicBlock *BB); void intersectAssumeOrGuardBlockValueConstantRange(Value *Val, ValueLatticeElement &BBLV, Instruction *BBI); @@ -648,9 +650,7 @@ bool LazyValueInfoImpl::solveBlockValueImpl(ValueLatticeElement &Res, return solveBlockValueBinaryOp(Res, BO, BB); if (auto *EVI = dyn_cast(BBI)) - if (auto *WO = dyn_cast(EVI->getAggregateOperand())) - if (EVI->getNumIndices() == 1 && *EVI->idx_begin() == 0) - return solveBlockValueOverflowIntrinsic(Res, WO, BB); + return solveBlockValueExtractValue(Res, EVI, BB); if (auto *II = dyn_cast(BBI)) return solveBlockValueIntrinsic(Res, II, BB); @@ -1135,6 +1135,33 @@ bool LazyValueInfoImpl::solveBlockValueIntrinsic( } } +bool LazyValueInfoImpl::solveBlockValueExtractValue( + ValueLatticeElement &BBLV, ExtractValueInst *EVI, BasicBlock *BB) { + if (auto *WO = dyn_cast(EVI->getAggregateOperand())) + if (EVI->getNumIndices() == 1 && *EVI->idx_begin() == 0) + return solveBlockValueOverflowIntrinsic(BBLV, WO, BB); + + // Handle extractvalue of insertvalue to allow further simplification + // based on replaced with.overflow intrinsics. + if (Value *V = SimplifyExtractValueInst( + EVI->getAggregateOperand(), EVI->getIndices(), + EVI->getModule()->getDataLayout())) { + if (!hasBlockValue(V, BB)) { + if (pushBlockValue({ BB, V })) + return false; + BBLV = ValueLatticeElement::getOverdefined(); + return true; + } + BBLV = getBlockValue(V, BB); + return true; + } + + LLVM_DEBUG(dbgs() << " compute BB '" << BB->getName() + << "' - overdefined (unknown extractvalue).\n"); + BBLV = ValueLatticeElement::getOverdefined(); + return true; +} + static ValueLatticeElement getValueFromICmpCondition(Value *Val, ICmpInst *ICI, bool isTrueDest) { Value *LHS = ICI->getOperand(0); @@ -1575,7 +1602,7 @@ bool LazyValueInfoWrapperPass::runOnFunction(Function &F) { DominatorTreeWrapperPass *DTWP = getAnalysisIfAvailable(); Info.DT = DTWP ? 
&DTWP->getDomTree() : nullptr; - Info.TLI = &getAnalysis().getTLI(); + Info.TLI = &getAnalysis().getTLI(F); if (Info.PImpl) getImpl(Info.PImpl, Info.AC, &DL, Info.DT).clear(); diff --git a/lib/Analysis/LegacyDivergenceAnalysis.cpp b/lib/Analysis/LegacyDivergenceAnalysis.cpp index 52212e1c42aa..7de9d2cbfddb 100644 --- a/lib/Analysis/LegacyDivergenceAnalysis.cpp +++ b/lib/Analysis/LegacyDivergenceAnalysis.cpp @@ -93,8 +93,9 @@ namespace { class DivergencePropagator { public: DivergencePropagator(Function &F, TargetTransformInfo &TTI, DominatorTree &DT, - PostDominatorTree &PDT, DenseSet &DV) - : F(F), TTI(TTI), DT(DT), PDT(PDT), DV(DV) {} + PostDominatorTree &PDT, DenseSet &DV, + DenseSet &DU) + : F(F), TTI(TTI), DT(DT), PDT(PDT), DV(DV), DU(DU) {} void populateWithSourcesOfDivergence(); void propagate(); @@ -118,11 +119,14 @@ private: PostDominatorTree &PDT; std::vector Worklist; // Stack for DFS. DenseSet &DV; // Stores all divergent values. + DenseSet &DU; // Stores divergent uses of possibly uniform + // values. }; void DivergencePropagator::populateWithSourcesOfDivergence() { Worklist.clear(); DV.clear(); + DU.clear(); for (auto &I : instructions(F)) { if (TTI.isSourceOfDivergence(&I)) { Worklist.push_back(&I); @@ -197,8 +201,10 @@ void DivergencePropagator::exploreSyncDependency(Instruction *TI) { // dominators of TI until it is outside the influence region. BasicBlock *InfluencedBB = ThisBB; while (InfluenceRegion.count(InfluencedBB)) { - for (auto &I : *InfluencedBB) - findUsersOutsideInfluenceRegion(I, InfluenceRegion); + for (auto &I : *InfluencedBB) { + if (!DV.count(&I)) + findUsersOutsideInfluenceRegion(I, InfluenceRegion); + } DomTreeNode *IDomNode = DT.getNode(InfluencedBB)->getIDom(); if (IDomNode == nullptr) break; @@ -208,9 +214,10 @@ void DivergencePropagator::exploreSyncDependency(Instruction *TI) { void DivergencePropagator::findUsersOutsideInfluenceRegion( Instruction &I, const DenseSet &InfluenceRegion) { - for (User *U : I.users()) { - Instruction *UserInst = cast(U); + for (Use &Use : I.uses()) { + Instruction *UserInst = cast(Use.getUser()); if (!InfluenceRegion.count(UserInst->getParent())) { + DU.insert(&Use); if (DV.insert(UserInst).second) Worklist.push_back(UserInst); } @@ -250,9 +257,8 @@ void DivergencePropagator::computeInfluenceRegion( void DivergencePropagator::exploreDataDependency(Value *V) { // Follow def-use chains of V. 
for (User *U : V->users()) { - Instruction *UserInst = cast(U); - if (!TTI.isAlwaysUniform(U) && DV.insert(UserInst).second) - Worklist.push_back(UserInst); + if (!TTI.isAlwaysUniform(U) && DV.insert(U).second) + Worklist.push_back(U); } } @@ -320,6 +326,7 @@ bool LegacyDivergenceAnalysis::runOnFunction(Function &F) { return false; DivergentValues.clear(); + DivergentUses.clear(); gpuDA = nullptr; auto &DT = getAnalysis().getDomTree(); @@ -328,11 +335,11 @@ bool LegacyDivergenceAnalysis::runOnFunction(Function &F) { if (shouldUseGPUDivergenceAnalysis(F)) { // run the new GPU divergence analysis auto &LI = getAnalysis().getLoopInfo(); - gpuDA = llvm::make_unique(F, DT, PDT, LI, TTI); + gpuDA = std::make_unique(F, DT, PDT, LI, TTI); } else { // run LLVM's existing DivergenceAnalysis - DivergencePropagator DP(F, TTI, DT, PDT, DivergentValues); + DivergencePropagator DP(F, TTI, DT, PDT, DivergentValues, DivergentUses); DP.populateWithSourcesOfDivergence(); DP.propagate(); } @@ -351,6 +358,13 @@ bool LegacyDivergenceAnalysis::isDivergent(const Value *V) const { return DivergentValues.count(V); } +bool LegacyDivergenceAnalysis::isDivergentUse(const Use *U) const { + if (gpuDA) { + return gpuDA->isDivergentUse(*U); + } + return DivergentValues.count(U->get()) || DivergentUses.count(U); +} + void LegacyDivergenceAnalysis::print(raw_ostream &OS, const Module *) const { if ((!gpuDA || !gpuDA->hasDivergence()) && DivergentValues.empty()) return; diff --git a/lib/Analysis/Lint.cpp b/lib/Analysis/Lint.cpp index d28b8a189d4b..db18716c64cf 100644 --- a/lib/Analysis/Lint.cpp +++ b/lib/Analysis/Lint.cpp @@ -205,7 +205,7 @@ bool Lint::runOnFunction(Function &F) { AA = &getAnalysis().getAAResults(); AC = &getAnalysis().getAssumptionCache(F); DT = &getAnalysis().getDomTree(); - TLI = &getAnalysis().getTLI(); + TLI = &getAnalysis().getTLI(F); visit(F); dbgs() << MessagesStr.str(); Messages.clear(); diff --git a/lib/Analysis/Loads.cpp b/lib/Analysis/Loads.cpp index 31da4e9ec783..641e92eac781 100644 --- a/lib/Analysis/Loads.cpp +++ b/lib/Analysis/Loads.cpp @@ -12,6 +12,9 @@ #include "llvm/Analysis/Loads.h" #include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/ScalarEvolutionExpressions.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/GlobalAlias.h" @@ -24,34 +27,30 @@ using namespace llvm; -static bool isAligned(const Value *Base, const APInt &Offset, unsigned Align, - const DataLayout &DL) { - APInt BaseAlign(Offset.getBitWidth(), Base->getPointerAlignment(DL)); - - if (!BaseAlign) { - Type *Ty = Base->getType()->getPointerElementType(); - if (!Ty->isSized()) - return false; - BaseAlign = DL.getABITypeAlignment(Ty); - } - - APInt Alignment(Offset.getBitWidth(), Align); - - assert(Alignment.isPowerOf2() && "must be a power of 2!"); - return BaseAlign.uge(Alignment) && !(Offset & (Alignment-1)); +static MaybeAlign getBaseAlign(const Value *Base, const DataLayout &DL) { + if (const MaybeAlign PA = Base->getPointerAlignment(DL)) + return *PA; + Type *const Ty = Base->getType()->getPointerElementType(); + if (!Ty->isSized()) + return None; + return Align(DL.getABITypeAlignment(Ty)); } -static bool isAligned(const Value *Base, unsigned Align, const DataLayout &DL) { - Type *Ty = Base->getType(); - assert(Ty->isSized() && "must be sized"); - APInt Offset(DL.getTypeStoreSizeInBits(Ty), 0); - return isAligned(Base, Offset, Align, DL); +static bool isAligned(const Value *Base, const 
APInt &Offset, Align Alignment, + const DataLayout &DL) { + if (MaybeAlign BA = getBaseAlign(Base, DL)) { + const APInt APBaseAlign(Offset.getBitWidth(), BA->value()); + const APInt APAlign(Offset.getBitWidth(), Alignment.value()); + assert(APAlign.isPowerOf2() && "must be a power of 2!"); + return APBaseAlign.uge(APAlign) && !(Offset & (APAlign - 1)); + } + return false; } /// Test if V is always a pointer to allocated and suitably aligned memory for /// a simple load or store. static bool isDereferenceableAndAlignedPointer( - const Value *V, unsigned Align, const APInt &Size, const DataLayout &DL, + const Value *V, Align Alignment, const APInt &Size, const DataLayout &DL, const Instruction *CtxI, const DominatorTree *DT, SmallPtrSetImpl &Visited) { // Already visited? Bail out, we've likely hit unreachable code. @@ -63,17 +62,22 @@ static bool isDereferenceableAndAlignedPointer( // bitcast instructions are no-ops as far as dereferenceability is concerned. if (const BitCastOperator *BC = dyn_cast(V)) - return isDereferenceableAndAlignedPointer(BC->getOperand(0), Align, Size, - DL, CtxI, DT, Visited); + return isDereferenceableAndAlignedPointer(BC->getOperand(0), Alignment, + Size, DL, CtxI, DT, Visited); bool CheckForNonNull = false; APInt KnownDerefBytes(Size.getBitWidth(), V->getPointerDereferenceableBytes(DL, CheckForNonNull)); - if (KnownDerefBytes.getBoolValue()) { - if (KnownDerefBytes.uge(Size)) - if (!CheckForNonNull || isKnownNonZero(V, DL, 0, nullptr, CtxI, DT)) - return isAligned(V, Align, DL); - } + if (KnownDerefBytes.getBoolValue() && KnownDerefBytes.uge(Size)) + if (!CheckForNonNull || isKnownNonZero(V, DL, 0, nullptr, CtxI, DT)) { + // As we recursed through GEPs to get here, we've incrementally checked + // that each step advanced by a multiple of the alignment. If our base is + // properly aligned, then the original offset accessed must also be. + Type *Ty = V->getType(); + assert(Ty->isSized() && "must be sized"); + APInt Offset(DL.getTypeStoreSizeInBits(Ty), 0); + return isAligned(V, Offset, Alignment, DL); + } // For GEPs, determine if the indexing lands within the allocated object. if (const GEPOperator *GEP = dyn_cast(V)) { @@ -81,7 +85,8 @@ static bool isDereferenceableAndAlignedPointer( APInt Offset(DL.getIndexTypeSizeInBits(GEP->getType()), 0); if (!GEP->accumulateConstantOffset(DL, Offset) || Offset.isNegative() || - !Offset.urem(APInt(Offset.getBitWidth(), Align)).isMinValue()) + !Offset.urem(APInt(Offset.getBitWidth(), Alignment.value())) + .isMinValue()) return false; // If the base pointer is dereferenceable for Offset+Size bytes, then the @@ -93,67 +98,69 @@ static bool isDereferenceableAndAlignedPointer( // Offset and Size may have different bit widths if we have visited an // addrspacecast, so we can't do arithmetic directly on the APInt values. 
return isDereferenceableAndAlignedPointer( - Base, Align, Offset + Size.sextOrTrunc(Offset.getBitWidth()), - DL, CtxI, DT, Visited); + Base, Alignment, Offset + Size.sextOrTrunc(Offset.getBitWidth()), DL, + CtxI, DT, Visited); } // For gc.relocate, look through relocations if (const GCRelocateInst *RelocateInst = dyn_cast(V)) return isDereferenceableAndAlignedPointer( - RelocateInst->getDerivedPtr(), Align, Size, DL, CtxI, DT, Visited); + RelocateInst->getDerivedPtr(), Alignment, Size, DL, CtxI, DT, Visited); if (const AddrSpaceCastInst *ASC = dyn_cast(V)) - return isDereferenceableAndAlignedPointer(ASC->getOperand(0), Align, Size, - DL, CtxI, DT, Visited); + return isDereferenceableAndAlignedPointer(ASC->getOperand(0), Alignment, + Size, DL, CtxI, DT, Visited); if (const auto *Call = dyn_cast(V)) - if (auto *RP = getArgumentAliasingToReturnedPointer(Call)) - return isDereferenceableAndAlignedPointer(RP, Align, Size, DL, CtxI, DT, - Visited); + if (auto *RP = getArgumentAliasingToReturnedPointer(Call, true)) + return isDereferenceableAndAlignedPointer(RP, Alignment, Size, DL, CtxI, + DT, Visited); // If we don't know, assume the worst. return false; } -bool llvm::isDereferenceableAndAlignedPointer(const Value *V, unsigned Align, +bool llvm::isDereferenceableAndAlignedPointer(const Value *V, Align Alignment, const APInt &Size, const DataLayout &DL, const Instruction *CtxI, const DominatorTree *DT) { + // Note: At the moment, Size can be zero. This ends up being interpreted as + // a query of whether [Base, V] is dereferenceable and V is aligned (since + // that's what the implementation happened to do). It's unclear if this is + // the desired semantic, but at least SelectionDAG does exercise this case. + SmallPtrSet Visited; - return ::isDereferenceableAndAlignedPointer(V, Align, Size, DL, CtxI, DT, + return ::isDereferenceableAndAlignedPointer(V, Alignment, Size, DL, CtxI, DT, Visited); } bool llvm::isDereferenceableAndAlignedPointer(const Value *V, Type *Ty, - unsigned Align, + MaybeAlign MA, const DataLayout &DL, const Instruction *CtxI, const DominatorTree *DT) { + if (!Ty->isSized()) + return false; + // When dereferenceability information is provided by a dereferenceable // attribute, we know exactly how many bytes are dereferenceable. If we can // determine the exact offset to the attributed variable, we can use that // information here. // Require ABI alignment for loads without alignment specification - if (Align == 0) - Align = DL.getABITypeAlignment(Ty); - - if (!Ty->isSized()) - return false; - - SmallPtrSet Visited; - return ::isDereferenceableAndAlignedPointer( - V, Align, - APInt(DL.getIndexTypeSizeInBits(V->getType()), DL.getTypeStoreSize(Ty)), - DL, CtxI, DT, Visited); + const Align Alignment = DL.getValueOrABITypeAlignment(MA, Ty); + APInt AccessSize(DL.getIndexTypeSizeInBits(V->getType()), + DL.getTypeStoreSize(Ty)); + return isDereferenceableAndAlignedPointer(V, Alignment, AccessSize, DL, CtxI, + DT); } bool llvm::isDereferenceablePointer(const Value *V, Type *Ty, const DataLayout &DL, const Instruction *CtxI, const DominatorTree *DT) { - return isDereferenceableAndAlignedPointer(V, Ty, 1, DL, CtxI, DT); + return isDereferenceableAndAlignedPointer(V, Ty, Align::None(), DL, CtxI, DT); } /// Test if A and B will obviously have the same value. 
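// Illustrative sketch (simplified stand-ins, not llvm::Align/llvm::MaybeAlign)
// of the convention the Loads.cpp changes above move to: "no alignment given"
// is an empty optional rather than the magic value 0, and it resolves to the
// type's ABI alignment at the point of use.
#include <cstdint>
#include <optional>

using AlignSketch = uint64_t;                        // Always a power of two.
using MaybeAlignSketch = std::optional<AlignSketch>;

AlignSketch valueOrABITypeAlignment(MaybeAlignSketch MA,
                                    AlignSketch ABITypeAlign) {
  // An explicit alignment wins; otherwise fall back to the ABI alignment.
  return MA ? *MA : ABITypeAlign;
}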
@@ -187,6 +194,60 @@ static bool AreEquivalentAddressValues(const Value *A, const Value *B) { return false; } +bool llvm::isDereferenceableAndAlignedInLoop(LoadInst *LI, Loop *L, + ScalarEvolution &SE, + DominatorTree &DT) { + auto &DL = LI->getModule()->getDataLayout(); + Value *Ptr = LI->getPointerOperand(); + + APInt EltSize(DL.getIndexTypeSizeInBits(Ptr->getType()), + DL.getTypeStoreSize(LI->getType())); + const Align Alignment = DL.getValueOrABITypeAlignment( + MaybeAlign(LI->getAlignment()), LI->getType()); + + Instruction *HeaderFirstNonPHI = L->getHeader()->getFirstNonPHI(); + + // If given a uniform (i.e. non-varying) address, see if we can prove the + // access is safe within the loop w/o needing predication. + if (L->isLoopInvariant(Ptr)) + return isDereferenceableAndAlignedPointer(Ptr, Alignment, EltSize, DL, + HeaderFirstNonPHI, &DT); + + // Otherwise, check to see if we have a repeating access pattern where we can + // prove that all accesses are well aligned and dereferenceable. + auto *AddRec = dyn_cast(SE.getSCEV(Ptr)); + if (!AddRec || AddRec->getLoop() != L || !AddRec->isAffine()) + return false; + auto* Step = dyn_cast(AddRec->getStepRecurrence(SE)); + if (!Step) + return false; + // TODO: generalize to access patterns which have gaps + if (Step->getAPInt() != EltSize) + return false; + + // TODO: If the symbolic trip count has a small bound (max count), we might + // be able to prove safety. + auto TC = SE.getSmallConstantTripCount(L); + if (!TC) + return false; + + const APInt AccessSize = TC * EltSize; + + auto *StartS = dyn_cast(AddRec->getStart()); + if (!StartS) + return false; + assert(SE.isLoopInvariant(StartS, L) && "implied by addrec definition"); + Value *Base = StartS->getValue(); + + // For the moment, restrict ourselves to the case where the access size is a + // multiple of the requested alignment and the base is aligned. + // TODO: generalize if a case found which warrants + if (EltSize.urem(Alignment.value()) != 0) + return false; + return isDereferenceableAndAlignedPointer(Base, Alignment, AccessSize, DL, + HeaderFirstNonPHI, &DT); +} + /// Check if executing a load of this pointer value cannot trap. /// /// If DT and ScanFrom are specified this method performs context-sensitive @@ -198,64 +259,25 @@ static bool AreEquivalentAddressValues(const Value *A, const Value *B) { /// /// This uses the pointee type to determine how many bytes need to be safe to /// load from the pointer. -bool llvm::isSafeToLoadUnconditionally(Value *V, unsigned Align, APInt &Size, +bool llvm::isSafeToLoadUnconditionally(Value *V, MaybeAlign MA, APInt &Size, const DataLayout &DL, Instruction *ScanFrom, const DominatorTree *DT) { // Zero alignment means that the load has the ABI alignment for the target - if (Align == 0) - Align = DL.getABITypeAlignment(V->getType()->getPointerElementType()); - assert(isPowerOf2_32(Align)); + const Align Alignment = + DL.getValueOrABITypeAlignment(MA, V->getType()->getPointerElementType()); // If DT is not specified we can't make context-sensitive query const Instruction* CtxI = DT ? 
ScanFrom : nullptr; - if (isDereferenceableAndAlignedPointer(V, Align, Size, DL, CtxI, DT)) + if (isDereferenceableAndAlignedPointer(V, Alignment, Size, DL, CtxI, DT)) return true; - int64_t ByteOffset = 0; - Value *Base = V; - Base = GetPointerBaseWithConstantOffset(V, ByteOffset, DL); - - if (ByteOffset < 0) // out of bounds + if (!ScanFrom) return false; - Type *BaseType = nullptr; - unsigned BaseAlign = 0; - if (const AllocaInst *AI = dyn_cast(Base)) { - // An alloca is safe to load from as load as it is suitably aligned. - BaseType = AI->getAllocatedType(); - BaseAlign = AI->getAlignment(); - } else if (const GlobalVariable *GV = dyn_cast(Base)) { - // Global variables are not necessarily safe to load from if they are - // interposed arbitrarily. Their size may change or they may be weak and - // require a test to determine if they were in fact provided. - if (!GV->isInterposable()) { - BaseType = GV->getType()->getElementType(); - BaseAlign = GV->getAlignment(); - } - } - - PointerType *AddrTy = cast(V->getType()); - uint64_t LoadSize = DL.getTypeStoreSize(AddrTy->getElementType()); - - // If we found a base allocated type from either an alloca or global variable, - // try to see if we are definitively within the allocated region. We need to - // know the size of the base type and the loaded type to do anything in this - // case. - if (BaseType && BaseType->isSized()) { - if (BaseAlign == 0) - BaseAlign = DL.getPrefTypeAlignment(BaseType); - - if (Align <= BaseAlign) { - // Check if the load is within the bounds of the underlying object. - if (ByteOffset + LoadSize <= DL.getTypeAllocSize(BaseType) && - ((ByteOffset % Align) == 0)) - return true; - } - } - - if (!ScanFrom) + if (Size.getBitWidth() > 64) return false; + const uint64_t LoadSize = Size.getZExtValue(); // Otherwise, be a little bit aggressive by scanning the local block where we // want to check to see if the pointer is already being loaded or stored @@ -279,7 +301,7 @@ bool llvm::isSafeToLoadUnconditionally(Value *V, unsigned Align, APInt &Size, return false; Value *AccessedPtr; - unsigned AccessedAlign; + MaybeAlign MaybeAccessedAlign; if (LoadInst *LI = dyn_cast(BBI)) { // Ignore volatile loads. The execution of a volatile load cannot // be used to prove an address is backed by regular memory; it can, @@ -287,24 +309,26 @@ bool llvm::isSafeToLoadUnconditionally(Value *V, unsigned Align, APInt &Size, if (LI->isVolatile()) continue; AccessedPtr = LI->getPointerOperand(); - AccessedAlign = LI->getAlignment(); + MaybeAccessedAlign = MaybeAlign(LI->getAlignment()); } else if (StoreInst *SI = dyn_cast(BBI)) { // Ignore volatile stores (see comment for loads). if (SI->isVolatile()) continue; AccessedPtr = SI->getPointerOperand(); - AccessedAlign = SI->getAlignment(); + MaybeAccessedAlign = MaybeAlign(SI->getAlignment()); } else continue; Type *AccessedTy = AccessedPtr->getType()->getPointerElementType(); - if (AccessedAlign == 0) - AccessedAlign = DL.getABITypeAlignment(AccessedTy); - if (AccessedAlign < Align) + + const Align AccessedAlign = + DL.getValueOrABITypeAlignment(MaybeAccessedAlign, AccessedTy); + if (AccessedAlign < Alignment) continue; // Handle trivial cases. 
- if (AccessedPtr == V) + if (AccessedPtr == V && + LoadSize <= DL.getTypeStoreSize(AccessedTy)) return true; if (AreEquivalentAddressValues(AccessedPtr->stripPointerCasts(), V) && @@ -314,12 +338,12 @@ bool llvm::isSafeToLoadUnconditionally(Value *V, unsigned Align, APInt &Size, return false; } -bool llvm::isSafeToLoadUnconditionally(Value *V, Type *Ty, unsigned Align, +bool llvm::isSafeToLoadUnconditionally(Value *V, Type *Ty, MaybeAlign Alignment, const DataLayout &DL, Instruction *ScanFrom, const DominatorTree *DT) { APInt Size(DL.getIndexTypeSizeInBits(V->getType()), DL.getTypeStoreSize(Ty)); - return isSafeToLoadUnconditionally(V, Align, Size, DL, ScanFrom, DT); + return isSafeToLoadUnconditionally(V, Alignment, Size, DL, ScanFrom, DT); } /// DefMaxInstsToScan - the default number of maximum instructions diff --git a/lib/Analysis/LoopAccessAnalysis.cpp b/lib/Analysis/LoopAccessAnalysis.cpp index 36bd9a8b7ea7..3d8f77675f3a 100644 --- a/lib/Analysis/LoopAccessAnalysis.cpp +++ b/lib/Analysis/LoopAccessAnalysis.cpp @@ -1189,18 +1189,31 @@ bool llvm::isConsecutiveAccess(Value *A, Value *B, const DataLayout &DL, unsigned IdxWidth = DL.getIndexSizeInBits(ASA); Type *Ty = cast(PtrA->getType())->getElementType(); - APInt Size(IdxWidth, DL.getTypeStoreSize(Ty)); APInt OffsetA(IdxWidth, 0), OffsetB(IdxWidth, 0); PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA); PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetB); + // Retrieve the address space again as pointer stripping now tracks through + // `addrspacecast`. + ASA = cast(PtrA->getType())->getAddressSpace(); + ASB = cast(PtrB->getType())->getAddressSpace(); + // Check that the address spaces match and that the pointers are valid. + if (ASA != ASB) + return false; + + IdxWidth = DL.getIndexSizeInBits(ASA); + OffsetA = OffsetA.sextOrTrunc(IdxWidth); + OffsetB = OffsetB.sextOrTrunc(IdxWidth); + + APInt Size(IdxWidth, DL.getTypeStoreSize(Ty)); + // OffsetDelta = OffsetB - OffsetA; const SCEV *OffsetSCEVA = SE.getConstant(OffsetA); const SCEV *OffsetSCEVB = SE.getConstant(OffsetB); const SCEV *OffsetDeltaSCEV = SE.getMinusSCEV(OffsetSCEVB, OffsetSCEVA); - const SCEVConstant *OffsetDeltaC = dyn_cast(OffsetDeltaSCEV); - const APInt &OffsetDelta = OffsetDeltaC->getAPInt(); + const APInt &OffsetDelta = cast(OffsetDeltaSCEV)->getAPInt(); + // Check if they are based on the same pointer. That makes the offsets // sufficient. if (PtrA == PtrB) @@ -1641,13 +1654,21 @@ bool MemoryDepChecker::areDepsSafe(DepCandidates &AccessSets, // Check every access pair. while (AI != AE) { Visited.insert(*AI); - EquivalenceClasses::member_iterator OI = std::next(AI); + bool AIIsWrite = AI->getInt(); + // Check loads only against next equivalent class, but stores also against + // other stores in the same equivalence class - to the same address. + EquivalenceClasses::member_iterator OI = + (AIIsWrite ? AI : std::next(AI)); while (OI != AE) { // Check every accessing instruction pair in program order. for (std::vector::iterator I1 = Accesses[*AI].begin(), I1E = Accesses[*AI].end(); I1 != I1E; ++I1) - for (std::vector::iterator I2 = Accesses[*OI].begin(), - I2E = Accesses[*OI].end(); I2 != I2E; ++I2) { + // Scan all accesses of another equivalence class, but only the next + // accesses of the same equivalent class. + for (std::vector::iterator + I2 = (OI == AI ? std::next(I1) : Accesses[*OI].begin()), + I2E = (OI == AI ? 
I1E : Accesses[*OI].end()); + I2 != I2E; ++I2) { auto A = std::make_pair(&*AI, *I1); auto B = std::make_pair(&*OI, *I2); @@ -2078,7 +2099,7 @@ OptimizationRemarkAnalysis &LoopAccessInfo::recordAnalysis(StringRef RemarkName, DL = I->getDebugLoc(); } - Report = make_unique(DEBUG_TYPE, RemarkName, DL, + Report = std::make_unique(DEBUG_TYPE, RemarkName, DL, CodeRegion); return *Report; } @@ -2323,9 +2344,9 @@ void LoopAccessInfo::collectStridedAccess(Value *MemAccess) { LoopAccessInfo::LoopAccessInfo(Loop *L, ScalarEvolution *SE, const TargetLibraryInfo *TLI, AliasAnalysis *AA, DominatorTree *DT, LoopInfo *LI) - : PSE(llvm::make_unique(*SE, *L)), - PtrRtChecking(llvm::make_unique(SE)), - DepChecker(llvm::make_unique(*PSE, L)), TheLoop(L), + : PSE(std::make_unique(*SE, *L)), + PtrRtChecking(std::make_unique(SE)), + DepChecker(std::make_unique(*PSE, L)), TheLoop(L), NumLoads(0), NumStores(0), MaxSafeDepDistBytes(-1), CanVecMem(false), HasConvergentOp(false), HasDependenceInvolvingLoopInvariantAddress(false) { @@ -2380,7 +2401,7 @@ const LoopAccessInfo &LoopAccessLegacyAnalysis::getInfo(Loop *L) { auto &LAI = LoopAccessInfoMap[L]; if (!LAI) - LAI = llvm::make_unique(L, SE, TLI, AA, DT, LI); + LAI = std::make_unique(L, SE, TLI, AA, DT, LI); return *LAI.get(); } @@ -2399,7 +2420,7 @@ void LoopAccessLegacyAnalysis::print(raw_ostream &OS, const Module *M) const { bool LoopAccessLegacyAnalysis::runOnFunction(Function &F) { SE = &getAnalysis().getSE(); auto *TLIP = getAnalysisIfAvailable(); - TLI = TLIP ? &TLIP->getTLI() : nullptr; + TLI = TLIP ? &TLIP->getTLI(F) : nullptr; AA = &getAnalysis().getAAResults(); DT = &getAnalysis().getDomTree(); LI = &getAnalysis().getLoopInfo(); diff --git a/lib/Analysis/LoopAnalysisManager.cpp b/lib/Analysis/LoopAnalysisManager.cpp index a10a87ce113b..02d40fb8d72a 100644 --- a/lib/Analysis/LoopAnalysisManager.cpp +++ b/lib/Analysis/LoopAnalysisManager.cpp @@ -46,7 +46,7 @@ bool LoopAnalysisManagerFunctionProxy::Result::invalidate( // invalidation logic below to act on that. auto PAC = PA.getChecker(); bool invalidateMemorySSAAnalysis = false; - if (EnableMSSALoopDependency) + if (MSSAUsed) invalidateMemorySSAAnalysis = Inv.invalidate(F, PA); if (!(PAC.preserved() || PAC.preservedSet>()) || Inv.invalidate(F, PA) || diff --git a/lib/Analysis/LoopCacheAnalysis.cpp b/lib/Analysis/LoopCacheAnalysis.cpp new file mode 100644 index 000000000000..10d2fe07884a --- /dev/null +++ b/lib/Analysis/LoopCacheAnalysis.cpp @@ -0,0 +1,625 @@ +//===- LoopCacheAnalysis.cpp - Loop Cache Analysis -------------------------==// +// +// The LLVM Compiler Infrastructure +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines the implementation for the loop cache analysis. +/// The implementation is largely based on the following paper: +/// +/// Compiler Optimizations for Improving Data Locality +/// By: Steve Carr, Katherine S. McKinley, Chau-Wen Tseng +/// http://www.cs.utexas.edu/users/mckinley/papers/asplos-1994.pdf +/// +/// The general approach taken to estimate the number of cache lines used by the +/// memory references in an inner loop is: +/// 1. Partition memory references that exhibit temporal or spacial reuse +/// into reference groups. +/// 2. For each loop L in the a loop nest LN: +/// a. 
Compute the cost of the reference group +/// b. Compute the loop cost by summing up the reference groups costs +//===----------------------------------------------------------------------===// + +#include "llvm/Analysis/LoopCacheAnalysis.h" +#include "llvm/ADT/BreadthFirstIterator.h" +#include "llvm/ADT/Sequence.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/Support/Debug.h" + +using namespace llvm; + +#define DEBUG_TYPE "loop-cache-cost" + +static cl::opt DefaultTripCount( + "default-trip-count", cl::init(100), cl::Hidden, + cl::desc("Use this to specify the default trip count of a loop")); + +// In this analysis two array references are considered to exhibit temporal +// reuse if they access either the same memory location, or a memory location +// with distance smaller than a configurable threshold. +static cl::opt TemporalReuseThreshold( + "temporal-reuse-threshold", cl::init(2), cl::Hidden, + cl::desc("Use this to specify the max. distance between array elements " + "accessed in a loop so that the elements are classified to have " + "temporal reuse")); + +/// Retrieve the innermost loop in the given loop nest \p Loops. It returns a +/// nullptr if any loops in the loop vector supplied has more than one sibling. +/// The loop vector is expected to contain loops collected in breadth-first +/// order. +static Loop *getInnerMostLoop(const LoopVectorTy &Loops) { + assert(!Loops.empty() && "Expecting a non-empy loop vector"); + + Loop *LastLoop = Loops.back(); + Loop *ParentLoop = LastLoop->getParentLoop(); + + if (ParentLoop == nullptr) { + assert(Loops.size() == 1 && "Expecting a single loop"); + return LastLoop; + } + + return (std::is_sorted(Loops.begin(), Loops.end(), + [](const Loop *L1, const Loop *L2) { + return L1->getLoopDepth() < L2->getLoopDepth(); + })) + ? LastLoop + : nullptr; +} + +static bool isOneDimensionalArray(const SCEV &AccessFn, const SCEV &ElemSize, + const Loop &L, ScalarEvolution &SE) { + const SCEVAddRecExpr *AR = dyn_cast(&AccessFn); + if (!AR || !AR->isAffine()) + return false; + + assert(AR->getLoop() && "AR should have a loop"); + + // Check that start and increment are not add recurrences. + const SCEV *Start = AR->getStart(); + const SCEV *Step = AR->getStepRecurrence(SE); + if (isa(Start) || isa(Step)) + return false; + + // Check that start and increment are both invariant in the loop. + if (!SE.isLoopInvariant(Start, &L) || !SE.isLoopInvariant(Step, &L)) + return false; + + return AR->getStepRecurrence(SE) == &ElemSize; +} + +/// Compute the trip count for the given loop \p L. Return the SCEV expression +/// for the trip count or nullptr if it cannot be computed. 
+static const SCEV *computeTripCount(const Loop &L, ScalarEvolution &SE) { + const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(&L); + if (isa(BackedgeTakenCount) || + !isa(BackedgeTakenCount)) + return nullptr; + + return SE.getAddExpr(BackedgeTakenCount, + SE.getOne(BackedgeTakenCount->getType())); +} + +//===----------------------------------------------------------------------===// +// IndexedReference implementation +// +raw_ostream &llvm::operator<<(raw_ostream &OS, const IndexedReference &R) { + if (!R.IsValid) { + OS << R.StoreOrLoadInst; + OS << ", IsValid=false."; + return OS; + } + + OS << *R.BasePointer; + for (const SCEV *Subscript : R.Subscripts) + OS << "[" << *Subscript << "]"; + + OS << ", Sizes: "; + for (const SCEV *Size : R.Sizes) + OS << "[" << *Size << "]"; + + return OS; +} + +IndexedReference::IndexedReference(Instruction &StoreOrLoadInst, + const LoopInfo &LI, ScalarEvolution &SE) + : StoreOrLoadInst(StoreOrLoadInst), SE(SE) { + assert((isa(StoreOrLoadInst) || isa(StoreOrLoadInst)) && + "Expecting a load or store instruction"); + + IsValid = delinearize(LI); + if (IsValid) + LLVM_DEBUG(dbgs().indent(2) << "Succesfully delinearized: " << *this + << "\n"); +} + +Optional IndexedReference::hasSpacialReuse(const IndexedReference &Other, + unsigned CLS, + AliasAnalysis &AA) const { + assert(IsValid && "Expecting a valid reference"); + + if (BasePointer != Other.getBasePointer() && !isAliased(Other, AA)) { + LLVM_DEBUG(dbgs().indent(2) + << "No spacial reuse: different base pointers\n"); + return false; + } + + unsigned NumSubscripts = getNumSubscripts(); + if (NumSubscripts != Other.getNumSubscripts()) { + LLVM_DEBUG(dbgs().indent(2) + << "No spacial reuse: different number of subscripts\n"); + return false; + } + + // all subscripts must be equal, except the leftmost one (the last one). + for (auto SubNum : seq(0, NumSubscripts - 1)) { + if (getSubscript(SubNum) != Other.getSubscript(SubNum)) { + LLVM_DEBUG(dbgs().indent(2) << "No spacial reuse, different subscripts: " + << "\n\t" << *getSubscript(SubNum) << "\n\t" + << *Other.getSubscript(SubNum) << "\n"); + return false; + } + } + + // the difference between the last subscripts must be less than the cache line + // size. 
+ const SCEV *LastSubscript = getLastSubscript(); + const SCEV *OtherLastSubscript = Other.getLastSubscript(); + const SCEVConstant *Diff = dyn_cast( + SE.getMinusSCEV(LastSubscript, OtherLastSubscript)); + + if (Diff == nullptr) { + LLVM_DEBUG(dbgs().indent(2) + << "No spacial reuse, difference between subscript:\n\t" + << *LastSubscript << "\n\t" << OtherLastSubscript + << "\nis not constant.\n"); + return None; + } + + bool InSameCacheLine = (Diff->getValue()->getSExtValue() < CLS); + + LLVM_DEBUG({ + if (InSameCacheLine) + dbgs().indent(2) << "Found spacial reuse.\n"; + else + dbgs().indent(2) << "No spacial reuse.\n"; + }); + + return InSameCacheLine; +} + +Optional IndexedReference::hasTemporalReuse(const IndexedReference &Other, + unsigned MaxDistance, + const Loop &L, + DependenceInfo &DI, + AliasAnalysis &AA) const { + assert(IsValid && "Expecting a valid reference"); + + if (BasePointer != Other.getBasePointer() && !isAliased(Other, AA)) { + LLVM_DEBUG(dbgs().indent(2) + << "No temporal reuse: different base pointer\n"); + return false; + } + + std::unique_ptr D = + DI.depends(&StoreOrLoadInst, &Other.StoreOrLoadInst, true); + + if (D == nullptr) { + LLVM_DEBUG(dbgs().indent(2) << "No temporal reuse: no dependence\n"); + return false; + } + + if (D->isLoopIndependent()) { + LLVM_DEBUG(dbgs().indent(2) << "Found temporal reuse\n"); + return true; + } + + // Check the dependence distance at every loop level. There is temporal reuse + // if the distance at the given loop's depth is small (|d| <= MaxDistance) and + // it is zero at every other loop level. + int LoopDepth = L.getLoopDepth(); + int Levels = D->getLevels(); + for (int Level = 1; Level <= Levels; ++Level) { + const SCEV *Distance = D->getDistance(Level); + const SCEVConstant *SCEVConst = dyn_cast_or_null(Distance); + + if (SCEVConst == nullptr) { + LLVM_DEBUG(dbgs().indent(2) << "No temporal reuse: distance unknown\n"); + return None; + } + + const ConstantInt &CI = *SCEVConst->getValue(); + if (Level != LoopDepth && !CI.isZero()) { + LLVM_DEBUG(dbgs().indent(2) + << "No temporal reuse: distance is not zero at depth=" << Level + << "\n"); + return false; + } else if (Level == LoopDepth && CI.getSExtValue() > MaxDistance) { + LLVM_DEBUG( + dbgs().indent(2) + << "No temporal reuse: distance is greater than MaxDistance at depth=" + << Level << "\n"); + return false; + } + } + + LLVM_DEBUG(dbgs().indent(2) << "Found temporal reuse\n"); + return true; +} + +CacheCostTy IndexedReference::computeRefCost(const Loop &L, + unsigned CLS) const { + assert(IsValid && "Expecting a valid reference"); + LLVM_DEBUG({ + dbgs().indent(2) << "Computing cache cost for:\n"; + dbgs().indent(4) << *this << "\n"; + }); + + // If the indexed reference is loop invariant the cost is one. + if (isLoopInvariant(L)) { + LLVM_DEBUG(dbgs().indent(4) << "Reference is loop invariant: RefCost=1\n"); + return 1; + } + + const SCEV *TripCount = computeTripCount(L, SE); + if (!TripCount) { + LLVM_DEBUG(dbgs() << "Trip count of loop " << L.getName() + << " could not be computed, using DefaultTripCount\n"); + const SCEV *ElemSize = Sizes.back(); + TripCount = SE.getConstant(ElemSize->getType(), DefaultTripCount); + } + LLVM_DEBUG(dbgs() << "TripCount=" << *TripCount << "\n"); + + // If the indexed reference is 'consecutive' the cost is + // (TripCount*Stride)/CLS, otherwise the cost is TripCount. 
+ const SCEV *RefCost = TripCount; + + if (isConsecutive(L, CLS)) { + const SCEV *Coeff = getLastCoefficient(); + const SCEV *ElemSize = Sizes.back(); + const SCEV *Stride = SE.getMulExpr(Coeff, ElemSize); + const SCEV *CacheLineSize = SE.getConstant(Stride->getType(), CLS); + const SCEV *Numerator = SE.getMulExpr(Stride, TripCount); + RefCost = SE.getUDivExpr(Numerator, CacheLineSize); + LLVM_DEBUG(dbgs().indent(4) + << "Access is consecutive: RefCost=(TripCount*Stride)/CLS=" + << *RefCost << "\n"); + } else + LLVM_DEBUG(dbgs().indent(4) + << "Access is not consecutive: RefCost=TripCount=" << *RefCost + << "\n"); + + // Attempt to fold RefCost into a constant. + if (auto ConstantCost = dyn_cast(RefCost)) + return ConstantCost->getValue()->getSExtValue(); + + LLVM_DEBUG(dbgs().indent(4) + << "RefCost is not a constant! Setting to RefCost=InvalidCost " + "(invalid value).\n"); + + return CacheCost::InvalidCost; +} + +bool IndexedReference::delinearize(const LoopInfo &LI) { + assert(Subscripts.empty() && "Subscripts should be empty"); + assert(Sizes.empty() && "Sizes should be empty"); + assert(!IsValid && "Should be called once from the constructor"); + LLVM_DEBUG(dbgs() << "Delinearizing: " << StoreOrLoadInst << "\n"); + + const SCEV *ElemSize = SE.getElementSize(&StoreOrLoadInst); + const BasicBlock *BB = StoreOrLoadInst.getParent(); + + for (Loop *L = LI.getLoopFor(BB); L != nullptr; L = L->getParentLoop()) { + const SCEV *AccessFn = + SE.getSCEVAtScope(getPointerOperand(&StoreOrLoadInst), L); + + BasePointer = dyn_cast(SE.getPointerBase(AccessFn)); + if (BasePointer == nullptr) { + LLVM_DEBUG( + dbgs().indent(2) + << "ERROR: failed to delinearize, can't identify base pointer\n"); + return false; + } + + AccessFn = SE.getMinusSCEV(AccessFn, BasePointer); + + LLVM_DEBUG(dbgs().indent(2) << "In Loop '" << L->getName() + << "', AccessFn: " << *AccessFn << "\n"); + + SE.delinearize(AccessFn, Subscripts, Sizes, + SE.getElementSize(&StoreOrLoadInst)); + + if (Subscripts.empty() || Sizes.empty() || + Subscripts.size() != Sizes.size()) { + // Attempt to determine whether we have a single dimensional array access. + // before giving up. + if (!isOneDimensionalArray(*AccessFn, *ElemSize, *L, SE)) { + LLVM_DEBUG(dbgs().indent(2) + << "ERROR: failed to delinearize reference\n"); + Subscripts.clear(); + Sizes.clear(); + break; + } + + const SCEV *Div = SE.getUDivExactExpr(AccessFn, ElemSize); + Subscripts.push_back(Div); + Sizes.push_back(ElemSize); + } + + return all_of(Subscripts, [&](const SCEV *Subscript) { + return isSimpleAddRecurrence(*Subscript, *L); + }); + } + + return false; +} + +bool IndexedReference::isLoopInvariant(const Loop &L) const { + Value *Addr = getPointerOperand(&StoreOrLoadInst); + assert(Addr != nullptr && "Expecting either a load or a store instruction"); + assert(SE.isSCEVable(Addr->getType()) && "Addr should be SCEVable"); + + if (SE.isLoopInvariant(SE.getSCEV(Addr), &L)) + return true; + + // The indexed reference is loop invariant if none of the coefficients use + // the loop induction variable. + bool allCoeffForLoopAreZero = all_of(Subscripts, [&](const SCEV *Subscript) { + return isCoeffForLoopZeroOrInvariant(*Subscript, L); + }); + + return allCoeffForLoopAreZero; +} + +bool IndexedReference::isConsecutive(const Loop &L, unsigned CLS) const { + // The indexed reference is 'consecutive' if the only coefficient that uses + // the loop induction variable is the last one... 
+ const SCEV *LastSubscript = Subscripts.back(); + for (const SCEV *Subscript : Subscripts) { + if (Subscript == LastSubscript) + continue; + if (!isCoeffForLoopZeroOrInvariant(*Subscript, L)) + return false; + } + + // ...and the access stride is less than the cache line size. + const SCEV *Coeff = getLastCoefficient(); + const SCEV *ElemSize = Sizes.back(); + const SCEV *Stride = SE.getMulExpr(Coeff, ElemSize); + const SCEV *CacheLineSize = SE.getConstant(Stride->getType(), CLS); + + return SE.isKnownPredicate(ICmpInst::ICMP_ULT, Stride, CacheLineSize); +} + +const SCEV *IndexedReference::getLastCoefficient() const { + const SCEV *LastSubscript = getLastSubscript(); + assert(isa(LastSubscript) && + "Expecting a SCEV add recurrence expression"); + const SCEVAddRecExpr *AR = dyn_cast(LastSubscript); + return AR->getStepRecurrence(SE); +} + +bool IndexedReference::isCoeffForLoopZeroOrInvariant(const SCEV &Subscript, + const Loop &L) const { + const SCEVAddRecExpr *AR = dyn_cast(&Subscript); + return (AR != nullptr) ? AR->getLoop() != &L + : SE.isLoopInvariant(&Subscript, &L); +} + +bool IndexedReference::isSimpleAddRecurrence(const SCEV &Subscript, + const Loop &L) const { + if (!isa(Subscript)) + return false; + + const SCEVAddRecExpr *AR = cast(&Subscript); + assert(AR->getLoop() && "AR should have a loop"); + + if (!AR->isAffine()) + return false; + + const SCEV *Start = AR->getStart(); + const SCEV *Step = AR->getStepRecurrence(SE); + + if (!SE.isLoopInvariant(Start, &L) || !SE.isLoopInvariant(Step, &L)) + return false; + + return true; +} + +bool IndexedReference::isAliased(const IndexedReference &Other, + AliasAnalysis &AA) const { + const auto &Loc1 = MemoryLocation::get(&StoreOrLoadInst); + const auto &Loc2 = MemoryLocation::get(&Other.StoreOrLoadInst); + return AA.isMustAlias(Loc1, Loc2); +} + +//===----------------------------------------------------------------------===// +// CacheCost implementation +// +raw_ostream &llvm::operator<<(raw_ostream &OS, const CacheCost &CC) { + for (const auto &LC : CC.LoopCosts) { + const Loop *L = LC.first; + OS << "Loop '" << L->getName() << "' has cost = " << LC.second << "\n"; + } + return OS; +} + +CacheCost::CacheCost(const LoopVectorTy &Loops, const LoopInfo &LI, + ScalarEvolution &SE, TargetTransformInfo &TTI, + AliasAnalysis &AA, DependenceInfo &DI, + Optional TRT) + : Loops(Loops), TripCounts(), LoopCosts(), + TRT(TRT == None ? Optional(TemporalReuseThreshold) : TRT), + LI(LI), SE(SE), TTI(TTI), AA(AA), DI(DI) { + assert(!Loops.empty() && "Expecting a non-empty loop vector."); + + for (const Loop *L : Loops) { + unsigned TripCount = SE.getSmallConstantTripCount(L); + TripCount = (TripCount == 0) ? 
DefaultTripCount : TripCount; + TripCounts.push_back({L, TripCount}); + } + + calculateCacheFootprint(); +} + +std::unique_ptr +CacheCost::getCacheCost(Loop &Root, LoopStandardAnalysisResults &AR, + DependenceInfo &DI, Optional TRT) { + if (Root.getParentLoop()) { + LLVM_DEBUG(dbgs() << "Expecting the outermost loop in a loop nest\n"); + return nullptr; + } + + LoopVectorTy Loops; + for (Loop *L : breadth_first(&Root)) + Loops.push_back(L); + + if (!getInnerMostLoop(Loops)) { + LLVM_DEBUG(dbgs() << "Cannot compute cache cost of loop nest with more " + "than one innermost loop\n"); + return nullptr; + } + + return std::make_unique(Loops, AR.LI, AR.SE, AR.TTI, AR.AA, DI, TRT); +} + +void CacheCost::calculateCacheFootprint() { + LLVM_DEBUG(dbgs() << "POPULATING REFERENCE GROUPS\n"); + ReferenceGroupsTy RefGroups; + if (!populateReferenceGroups(RefGroups)) + return; + + LLVM_DEBUG(dbgs() << "COMPUTING LOOP CACHE COSTS\n"); + for (const Loop *L : Loops) { + assert((std::find_if(LoopCosts.begin(), LoopCosts.end(), + [L](const LoopCacheCostTy &LCC) { + return LCC.first == L; + }) == LoopCosts.end()) && + "Should not add duplicate element"); + CacheCostTy LoopCost = computeLoopCacheCost(*L, RefGroups); + LoopCosts.push_back(std::make_pair(L, LoopCost)); + } + + sortLoopCosts(); + RefGroups.clear(); +} + +bool CacheCost::populateReferenceGroups(ReferenceGroupsTy &RefGroups) const { + assert(RefGroups.empty() && "Reference groups should be empty"); + + unsigned CLS = TTI.getCacheLineSize(); + Loop *InnerMostLoop = getInnerMostLoop(Loops); + assert(InnerMostLoop != nullptr && "Expecting a valid innermost loop"); + + for (BasicBlock *BB : InnerMostLoop->getBlocks()) { + for (Instruction &I : *BB) { + if (!isa(I) && !isa(I)) + continue; + + std::unique_ptr R(new IndexedReference(I, LI, SE)); + if (!R->isValid()) + continue; + + bool Added = false; + for (ReferenceGroupTy &RefGroup : RefGroups) { + const IndexedReference &Representative = *RefGroup.front().get(); + LLVM_DEBUG({ + dbgs() << "References:\n"; + dbgs().indent(2) << *R << "\n"; + dbgs().indent(2) << Representative << "\n"; + }); + + Optional HasTemporalReuse = + R->hasTemporalReuse(Representative, *TRT, *InnerMostLoop, DI, AA); + Optional HasSpacialReuse = + R->hasSpacialReuse(Representative, CLS, AA); + + if ((HasTemporalReuse.hasValue() && *HasTemporalReuse) || + (HasSpacialReuse.hasValue() && *HasSpacialReuse)) { + RefGroup.push_back(std::move(R)); + Added = true; + break; + } + } + + if (!Added) { + ReferenceGroupTy RG; + RG.push_back(std::move(R)); + RefGroups.push_back(std::move(RG)); + } + } + } + + if (RefGroups.empty()) + return false; + + LLVM_DEBUG({ + dbgs() << "\nIDENTIFIED REFERENCE GROUPS:\n"; + int n = 1; + for (const ReferenceGroupTy &RG : RefGroups) { + dbgs().indent(2) << "RefGroup " << n << ":\n"; + for (const auto &IR : RG) + dbgs().indent(4) << *IR << "\n"; + n++; + } + dbgs() << "\n"; + }); + + return true; +} + +CacheCostTy +CacheCost::computeLoopCacheCost(const Loop &L, + const ReferenceGroupsTy &RefGroups) const { + if (!L.isLoopSimplifyForm()) + return InvalidCost; + + LLVM_DEBUG(dbgs() << "Considering loop '" << L.getName() + << "' as innermost loop.\n"); + + // Compute the product of the trip counts of each other loop in the nest. 
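Editorial note: before the hunk continues with the code that computes this product, here is a compact standalone sketch of the whole aggregation performed by computeLoopCacheCost: each reference group's cost is scaled by the iterations of every other loop in the nest, and the results are summed. loopCacheCost and LoopTC are illustrative stand-ins, not the analysis types.

    #include <cstdint>
    #include <vector>

    struct LoopTC { const void *Loop; uint64_t TripCount; };

    // Cost of treating loop L as the innermost loop of the nest.
    uint64_t loopCacheCost(const void *L, const std::vector<LoopTC> &TripCounts,
                           const std::vector<uint64_t> &RefGroupCosts) {
      uint64_t OtherTripCountsProduct = 1;
      for (const LoopTC &TC : TripCounts)
        if (TC.Loop != L)
          OtherTripCountsProduct *= TC.TripCount;

      uint64_t LoopCost = 0;
      for (uint64_t RefGroupCost : RefGroupCosts)
        LoopCost += RefGroupCost * OtherTripCountsProduct;
      return LoopCost;
    }
    // Example: in a 2-deep nest with trip counts {100, 100}, a group costing 12
    // lines per full inner-loop execution contributes 12 * 100 when that loop is
    // the innermost candidate; comparing candidates ranks loop orders by cost.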
+ CacheCostTy TripCountsProduct = 1; + for (const auto &TC : TripCounts) { + if (TC.first == &L) + continue; + TripCountsProduct *= TC.second; + } + + CacheCostTy LoopCost = 0; + for (const ReferenceGroupTy &RG : RefGroups) { + CacheCostTy RefGroupCost = computeRefGroupCacheCost(RG, L); + LoopCost += RefGroupCost * TripCountsProduct; + } + + LLVM_DEBUG(dbgs().indent(2) << "Loop '" << L.getName() + << "' has cost=" << LoopCost << "\n"); + + return LoopCost; +} + +CacheCostTy CacheCost::computeRefGroupCacheCost(const ReferenceGroupTy &RG, + const Loop &L) const { + assert(!RG.empty() && "Reference group should have at least one member."); + + const IndexedReference *Representative = RG.front().get(); + return Representative->computeRefCost(L, TTI.getCacheLineSize()); +} + +//===----------------------------------------------------------------------===// +// LoopCachePrinterPass implementation +// +PreservedAnalyses LoopCachePrinterPass::run(Loop &L, LoopAnalysisManager &AM, + LoopStandardAnalysisResults &AR, + LPMUpdater &U) { + Function *F = L.getHeader()->getParent(); + DependenceInfo DI(F, &AR.AA, &AR.SE, &AR.LI); + + if (auto CC = CacheCost::getCacheCost(L, AR, DI)) + OS << *CC; + + return PreservedAnalyses::all(); +} diff --git a/lib/Analysis/LoopInfo.cpp b/lib/Analysis/LoopInfo.cpp index aa5da0859805..dbab5db7dbc2 100644 --- a/lib/Analysis/LoopInfo.cpp +++ b/lib/Analysis/LoopInfo.cpp @@ -359,6 +359,45 @@ bool Loop::isAuxiliaryInductionVariable(PHINode &AuxIndVar, return SE.isLoopInvariant(IndDesc.getStep(), this); } +BranchInst *Loop::getLoopGuardBranch() const { + if (!isLoopSimplifyForm()) + return nullptr; + + BasicBlock *Preheader = getLoopPreheader(); + BasicBlock *Latch = getLoopLatch(); + assert(Preheader && Latch && + "Expecting a loop with valid preheader and latch"); + + // Loop should be in rotate form. + if (!isLoopExiting(Latch)) + return nullptr; + + // Disallow loops with more than one unique exit block, as we do not verify + // that GuardOtherSucc post dominates all exit blocks. + BasicBlock *ExitFromLatch = getUniqueExitBlock(); + if (!ExitFromLatch) + return nullptr; + + BasicBlock *ExitFromLatchSucc = ExitFromLatch->getUniqueSuccessor(); + if (!ExitFromLatchSucc) + return nullptr; + + BasicBlock *GuardBB = Preheader->getUniquePredecessor(); + if (!GuardBB) + return nullptr; + + assert(GuardBB->getTerminator() && "Expecting valid guard terminator"); + + BranchInst *GuardBI = dyn_cast(GuardBB->getTerminator()); + if (!GuardBI || GuardBI->isUnconditional()) + return nullptr; + + BasicBlock *GuardOtherSucc = (GuardBI->getSuccessor(0) == Preheader) + ? GuardBI->getSuccessor(1) + : GuardBI->getSuccessor(0); + return (GuardOtherSucc == ExitFromLatchSucc) ? 
GuardBI : nullptr; +} + bool Loop::isCanonical(ScalarEvolution &SE) const { InductionDescriptor IndDesc; if (!getInductionDescriptor(SE, IndDesc)) diff --git a/lib/Analysis/LoopUnrollAnalyzer.cpp b/lib/Analysis/LoopUnrollAnalyzer.cpp index 1728b5e9f6d2..762623de41e9 100644 --- a/lib/Analysis/LoopUnrollAnalyzer.cpp +++ b/lib/Analysis/LoopUnrollAnalyzer.cpp @@ -78,7 +78,7 @@ bool UnrolledInstAnalyzer::visitBinaryOperator(BinaryOperator &I) { const DataLayout &DL = I.getModule()->getDataLayout(); if (auto FI = dyn_cast(&I)) SimpleV = - SimplifyFPBinOp(I.getOpcode(), LHS, RHS, FI->getFastMathFlags(), DL); + SimplifyBinOp(I.getOpcode(), LHS, RHS, FI->getFastMathFlags(), DL); else SimpleV = SimplifyBinOp(I.getOpcode(), LHS, RHS, DL); diff --git a/lib/Analysis/MemDerefPrinter.cpp b/lib/Analysis/MemDerefPrinter.cpp index 77ebf89d9a08..5cf516a538b5 100644 --- a/lib/Analysis/MemDerefPrinter.cpp +++ b/lib/Analysis/MemDerefPrinter.cpp @@ -55,8 +55,8 @@ bool MemDerefPrinter::runOnFunction(Function &F) { Value *PO = LI->getPointerOperand(); if (isDereferenceablePointer(PO, LI->getType(), DL)) Deref.push_back(PO); - if (isDereferenceableAndAlignedPointer(PO, LI->getType(), - LI->getAlignment(), DL)) + if (isDereferenceableAndAlignedPointer( + PO, LI->getType(), MaybeAlign(LI->getAlignment()), DL)) DerefAndAligned.insert(PO); } } diff --git a/lib/Analysis/MemoryBuiltins.cpp b/lib/Analysis/MemoryBuiltins.cpp index 729dad463657..172c86eb4646 100644 --- a/lib/Analysis/MemoryBuiltins.cpp +++ b/lib/Analysis/MemoryBuiltins.cpp @@ -180,6 +180,19 @@ static Optional getAllocationData(const Value *V, AllocType AllocTy, return None; } +static Optional +getAllocationData(const Value *V, AllocType AllocTy, + function_ref GetTLI, + bool LookThroughBitCast = false) { + bool IsNoBuiltinCall; + if (const Function *Callee = + getCalledFunction(V, LookThroughBitCast, IsNoBuiltinCall)) + if (!IsNoBuiltinCall) + return getAllocationDataForFunction( + Callee, AllocTy, &GetTLI(const_cast(*Callee))); + return None; +} + static Optional getAllocationSize(const Value *V, const TargetLibraryInfo *TLI) { bool IsNoBuiltinCall; @@ -223,6 +236,11 @@ bool llvm::isAllocationFn(const Value *V, const TargetLibraryInfo *TLI, bool LookThroughBitCast) { return getAllocationData(V, AnyAlloc, TLI, LookThroughBitCast).hasValue(); } +bool llvm::isAllocationFn( + const Value *V, function_ref GetTLI, + bool LookThroughBitCast) { + return getAllocationData(V, AnyAlloc, GetTLI, LookThroughBitCast).hasValue(); +} /// Tests if a value is a call or invoke to a function that returns a /// NoAlias pointer (including malloc/calloc/realloc/strdup-like functions). @@ -240,6 +258,12 @@ bool llvm::isMallocLikeFn(const Value *V, const TargetLibraryInfo *TLI, bool LookThroughBitCast) { return getAllocationData(V, MallocLike, TLI, LookThroughBitCast).hasValue(); } +bool llvm::isMallocLikeFn( + const Value *V, function_ref GetTLI, + bool LookThroughBitCast) { + return getAllocationData(V, MallocLike, GetTLI, LookThroughBitCast) + .hasValue(); +} /// Tests if a value is a call or invoke to a library function that /// allocates zero-filled memory (such as calloc). @@ -276,12 +300,27 @@ bool llvm::isReallocLikeFn(const Function *F, const TargetLibraryInfo *TLI) { return getAllocationDataForFunction(F, ReallocLike, TLI).hasValue(); } +/// Tests if a value is a call or invoke to a library function that +/// allocates memory and throws if an allocation failed (e.g., new). 
+bool llvm::isOpNewLikeFn(const Value *V, const TargetLibraryInfo *TLI, + bool LookThroughBitCast) { + return getAllocationData(V, OpNewLike, TLI, LookThroughBitCast).hasValue(); +} + +/// Tests if a value is a call or invoke to a library function that +/// allocates memory (strdup, strndup). +bool llvm::isStrdupLikeFn(const Value *V, const TargetLibraryInfo *TLI, + bool LookThroughBitCast) { + return getAllocationData(V, StrDupLike, TLI, LookThroughBitCast).hasValue(); +} + /// extractMallocCall - Returns the corresponding CallInst if the instruction /// is a malloc call. Since CallInst::CreateMalloc() only creates calls, we /// ignore InvokeInst here. -const CallInst *llvm::extractMallocCall(const Value *I, - const TargetLibraryInfo *TLI) { - return isMallocLikeFn(I, TLI) ? dyn_cast(I) : nullptr; +const CallInst *llvm::extractMallocCall( + const Value *I, + function_ref GetTLI) { + return isMallocLikeFn(I, GetTLI) ? dyn_cast(I) : nullptr; } static Value *computeArraySize(const CallInst *CI, const DataLayout &DL, @@ -521,9 +560,9 @@ STATISTIC(ObjectVisitorArgument, STATISTIC(ObjectVisitorLoad, "Number of load instructions with unsolved size and offset"); -APInt ObjectSizeOffsetVisitor::align(APInt Size, uint64_t Align) { - if (Options.RoundToAlign && Align) - return APInt(IntTyBits, alignTo(Size.getZExtValue(), Align)); +APInt ObjectSizeOffsetVisitor::align(APInt Size, uint64_t Alignment) { + if (Options.RoundToAlign && Alignment) + return APInt(IntTyBits, alignTo(Size.getZExtValue(), Align(Alignment))); return Size; } diff --git a/lib/Analysis/MemoryDependenceAnalysis.cpp b/lib/Analysis/MemoryDependenceAnalysis.cpp index b25b655165d7..884587e020bb 100644 --- a/lib/Analysis/MemoryDependenceAnalysis.cpp +++ b/lib/Analysis/MemoryDependenceAnalysis.cpp @@ -183,7 +183,7 @@ static ModRefInfo GetLocation(const Instruction *Inst, MemoryLocation &Loc, MemDepResult MemoryDependenceResults::getCallDependencyFrom( CallBase *Call, bool isReadOnlyCall, BasicBlock::iterator ScanIt, BasicBlock *BB) { - unsigned Limit = BlockScanLimit; + unsigned Limit = getDefaultBlockScanLimit(); // Walk backwards through the block, looking for dependencies. while (ScanIt != BB->begin()) { @@ -356,7 +356,7 @@ MemDepResult MemoryDependenceResults::getInvariantGroupPointerDependency(LoadInst *LI, BasicBlock *BB) { - if (!LI->getMetadata(LLVMContext::MD_invariant_group)) + if (!LI->hasMetadata(LLVMContext::MD_invariant_group)) return MemDepResult::getUnknown(); // Take the ptr operand after all casts and geps 0. This way we can search @@ -417,7 +417,7 @@ MemoryDependenceResults::getInvariantGroupPointerDependency(LoadInst *LI, // same pointer operand) we can assume that value pointed by pointer // operand didn't change. if ((isa(U) || isa(U)) && - U->getMetadata(LLVMContext::MD_invariant_group) != nullptr) + U->hasMetadata(LLVMContext::MD_invariant_group)) ClosestDependency = GetClosestDependency(ClosestDependency, U); } } @@ -443,7 +443,7 @@ MemDepResult MemoryDependenceResults::getSimplePointerDependencyFrom( OrderedBasicBlock *OBB) { bool isInvariantLoad = false; - unsigned DefaultLimit = BlockScanLimit; + unsigned DefaultLimit = getDefaultBlockScanLimit(); if (!Limit) Limit = &DefaultLimit; @@ -481,7 +481,7 @@ MemDepResult MemoryDependenceResults::getSimplePointerDependencyFrom( // Arguably, this logic should be pushed inside AliasAnalysis itself. 
if (isLoad && QueryInst) { LoadInst *LI = dyn_cast(QueryInst); - if (LI && LI->getMetadata(LLVMContext::MD_invariant_load) != nullptr) + if (LI && LI->hasMetadata(LLVMContext::MD_invariant_load)) isInvariantLoad = true; } @@ -1746,6 +1746,9 @@ void MemoryDependenceResults::verifyRemoved(Instruction *D) const { AnalysisKey MemoryDependenceAnalysis::Key; +MemoryDependenceAnalysis::MemoryDependenceAnalysis() + : DefaultBlockScanLimit(BlockScanLimit) {} + MemoryDependenceResults MemoryDependenceAnalysis::run(Function &F, FunctionAnalysisManager &AM) { auto &AA = AM.getResult(F); @@ -1753,7 +1756,7 @@ MemoryDependenceAnalysis::run(Function &F, FunctionAnalysisManager &AM) { auto &TLI = AM.getResult(F); auto &DT = AM.getResult(F); auto &PV = AM.getResult(F); - return MemoryDependenceResults(AA, AC, TLI, DT, PV); + return MemoryDependenceResults(AA, AC, TLI, DT, PV, DefaultBlockScanLimit); } char MemoryDependenceWrapperPass::ID = 0; @@ -1807,15 +1810,15 @@ bool MemoryDependenceResults::invalidate(Function &F, const PreservedAnalyses &P } unsigned MemoryDependenceResults::getDefaultBlockScanLimit() const { - return BlockScanLimit; + return DefaultBlockScanLimit; } bool MemoryDependenceWrapperPass::runOnFunction(Function &F) { auto &AA = getAnalysis().getAAResults(); auto &AC = getAnalysis().getAssumptionCache(F); - auto &TLI = getAnalysis().getTLI(); + auto &TLI = getAnalysis().getTLI(F); auto &DT = getAnalysis().getDomTree(); auto &PV = getAnalysis().getResult(); - MemDep.emplace(AA, AC, TLI, DT, PV); + MemDep.emplace(AA, AC, TLI, DT, PV, BlockScanLimit); return false; } diff --git a/lib/Analysis/MemorySSA.cpp b/lib/Analysis/MemorySSA.cpp index 17f5d9b9f0ad..cfb8b7e7dcb5 100644 --- a/lib/Analysis/MemorySSA.cpp +++ b/lib/Analysis/MemorySSA.cpp @@ -49,6 +49,7 @@ #include "llvm/Support/raw_ostream.h" #include #include +#include #include #include #include @@ -83,7 +84,7 @@ bool llvm::VerifyMemorySSA = false; #endif /// Enables memory ssa as a dependency for loop passes in legacy pass manager. cl::opt llvm::EnableMSSALoopDependency( - "enable-mssa-loop-dependency", cl::Hidden, cl::init(false), + "enable-mssa-loop-dependency", cl::Hidden, cl::init(true), cl::desc("Enable MemorySSA dependency for loop pass manager")); static cl::opt @@ -284,6 +285,11 @@ instructionClobbersQuery(const MemoryDef *MD, const MemoryLocation &UseLoc, case Intrinsic::invariant_end: case Intrinsic::assume: return {false, NoAlias}; + case Intrinsic::dbg_addr: + case Intrinsic::dbg_declare: + case Intrinsic::dbg_label: + case Intrinsic::dbg_value: + llvm_unreachable("debuginfo shouldn't have associated defs!"); default: break; } @@ -369,7 +375,7 @@ static bool isUseTriviallyOptimizableToLiveOnEntry(AliasAnalysisType &AA, const Instruction *I) { // If the memory can't be changed, then loads of the memory can't be // clobbered. - return isa(I) && (I->getMetadata(LLVMContext::MD_invariant_load) || + return isa(I) && (I->hasMetadata(LLVMContext::MD_invariant_load) || AA.pointsToConstantMemory(MemoryLocation( cast(I)->getPointerOperand()))); } @@ -867,6 +873,7 @@ template class ClobberWalker { if (!DefChainEnd) for (auto *MA : def_chain(const_cast(Target))) DefChainEnd = MA; + assert(DefChainEnd && "Failed to find dominating phi/liveOnEntry"); // If any of the terminated paths don't dominate the phi we'll try to // optimize, we need to figure out what they are and quit. 
@@ -1087,9 +1094,14 @@ void MemorySSA::renameSuccessorPhis(BasicBlock *BB, MemoryAccess *IncomingVal, AccessList *Accesses = It->second.get(); auto *Phi = cast(&Accesses->front()); if (RenameAllUses) { - int PhiIndex = Phi->getBasicBlockIndex(BB); - assert(PhiIndex != -1 && "Incomplete phi during partial rename"); - Phi->setIncomingValue(PhiIndex, IncomingVal); + bool ReplacementDone = false; + for (unsigned I = 0, E = Phi->getNumIncomingValues(); I != E; ++I) + if (Phi->getIncomingBlock(I) == BB) { + Phi->setIncomingValue(I, IncomingVal); + ReplacementDone = true; + } + (void) ReplacementDone; + assert(ReplacementDone && "Incomplete phi during partial rename"); } else Phi->addIncoming(IncomingVal, BB); } @@ -1237,7 +1249,7 @@ MemorySSA::AccessList *MemorySSA::getOrCreateAccessList(const BasicBlock *BB) { auto Res = PerBlockAccesses.insert(std::make_pair(BB, nullptr)); if (Res.second) - Res.first->second = llvm::make_unique(); + Res.first->second = std::make_unique(); return Res.first->second.get(); } @@ -1245,7 +1257,7 @@ MemorySSA::DefsList *MemorySSA::getOrCreateDefsList(const BasicBlock *BB) { auto Res = PerBlockDefs.insert(std::make_pair(BB, nullptr)); if (Res.second) - Res.first->second = llvm::make_unique(); + Res.first->second = std::make_unique(); return Res.first->second.get(); } @@ -1554,10 +1566,10 @@ MemorySSA::CachingWalker *MemorySSA::getWalkerImpl() { if (!WalkerBase) WalkerBase = - llvm::make_unique>(this, AA, DT); + std::make_unique>(this, AA, DT); Walker = - llvm::make_unique>(this, WalkerBase.get()); + std::make_unique>(this, WalkerBase.get()); return Walker.get(); } @@ -1567,10 +1579,10 @@ MemorySSAWalker *MemorySSA::getSkipSelfWalker() { if (!WalkerBase) WalkerBase = - llvm::make_unique>(this, AA, DT); + std::make_unique>(this, AA, DT); SkipWalker = - llvm::make_unique>(this, WalkerBase.get()); + std::make_unique>(this, WalkerBase.get()); return SkipWalker.get(); } @@ -1687,13 +1699,15 @@ MemoryPhi *MemorySSA::createMemoryPhi(BasicBlock *BB) { MemoryUseOrDef *MemorySSA::createDefinedAccess(Instruction *I, MemoryAccess *Definition, - const MemoryUseOrDef *Template) { + const MemoryUseOrDef *Template, + bool CreationMustSucceed) { assert(!isa(I) && "Cannot create a defined access for a PHI"); MemoryUseOrDef *NewAccess = createNewAccess(I, AA, Template); - assert( - NewAccess != nullptr && - "Tried to create a memory access for a non-memory touching instruction"); - NewAccess->setDefiningAccess(Definition); + if (CreationMustSucceed) + assert(NewAccess != nullptr && "Tried to create a memory access for a " + "non-memory touching instruction"); + if (NewAccess) + NewAccess->setDefiningAccess(Definition); return NewAccess; } @@ -1717,13 +1731,21 @@ MemoryUseOrDef *MemorySSA::createNewAccess(Instruction *I, AliasAnalysisType *AAP, const MemoryUseOrDef *Template) { // The assume intrinsic has a control dependency which we model by claiming - // that it writes arbitrarily. Ignore that fake memory dependency here. + // that it writes arbitrarily. Debuginfo intrinsics may be considered + // clobbers when we have a nonstandard AA pipeline. Ignore these fake memory + // dependencies here. // FIXME: Replace this special casing with a more accurate modelling of // assume's control dependency. if (IntrinsicInst *II = dyn_cast(I)) if (II->getIntrinsicID() == Intrinsic::assume) return nullptr; + // Using a nonstandard AA pipelines might leave us with unexpected modref + // results for I, so add a check to not model instructions that may not read + // from or write to memory. 
This is necessary for correctness. + if (!I->mayReadFromMemory() && !I->mayWriteToMemory()) + return nullptr; + bool Def, Use; if (Template) { Def = dyn_cast_or_null(Template) != nullptr; @@ -1850,6 +1872,7 @@ void MemorySSA::verifyMemorySSA() const { verifyDomination(F); verifyOrdering(F); verifyDominationNumbers(F); + verifyPrevDefInPhis(F); // Previously, the verification used to also verify that the clobberingAccess // cached by MemorySSA is the same as the clobberingAccess found at a later // query to AA. This does not hold true in general due to the current fragility @@ -1862,6 +1885,40 @@ void MemorySSA::verifyMemorySSA() const { // example, see test4 added in D51960. } +void MemorySSA::verifyPrevDefInPhis(Function &F) const { +#if !defined(NDEBUG) && defined(EXPENSIVE_CHECKS) + for (const BasicBlock &BB : F) { + if (MemoryPhi *Phi = getMemoryAccess(&BB)) { + for (unsigned I = 0, E = Phi->getNumIncomingValues(); I != E; ++I) { + auto *Pred = Phi->getIncomingBlock(I); + auto *IncAcc = Phi->getIncomingValue(I); + // If Pred has no unreachable predecessors, get last def looking at + // IDoms. If, while walkings IDoms, any of these has an unreachable + // predecessor, then the incoming def can be any access. + if (auto *DTNode = DT->getNode(Pred)) { + while (DTNode) { + if (auto *DefList = getBlockDefs(DTNode->getBlock())) { + auto *LastAcc = &*(--DefList->end()); + assert(LastAcc == IncAcc && + "Incorrect incoming access into phi."); + break; + } + DTNode = DTNode->getIDom(); + } + } else { + // If Pred has unreachable predecessors, but has at least a Def, the + // incoming access can be the last Def in Pred, or it could have been + // optimized to LoE. After an update, though, the LoE may have been + // replaced by another access, so IncAcc may be any access. + // If Pred has unreachable predecessors and no Defs, incoming access + // should be LoE; However, after an update, it may be any access. + } + } + } + } +#endif +} + /// Verify that all of the blocks we believe to have valid domination numbers /// actually have valid domination numbers. void MemorySSA::verifyDominationNumbers(const Function &F) const { @@ -2005,7 +2062,7 @@ void MemorySSA::verifyUseInDefs(MemoryAccess *Def, MemoryAccess *Use) const { /// accesses and verifying that, for each use, it appears in the /// appropriate def's use list void MemorySSA::verifyDefUses(Function &F) const { -#ifndef NDEBUG +#if !defined(NDEBUG) && defined(EXPENSIVE_CHECKS) for (BasicBlock &B : F) { // Phi nodes are attached to basic blocks if (MemoryPhi *Phi = getMemoryAccess(&B)) { @@ -2212,7 +2269,7 @@ MemorySSAAnalysis::Result MemorySSAAnalysis::run(Function &F, FunctionAnalysisManager &AM) { auto &DT = AM.getResult(F); auto &AA = AM.getResult(F); - return MemorySSAAnalysis::Result(llvm::make_unique(F, &AA, &DT)); + return MemorySSAAnalysis::Result(std::make_unique(F, &AA, &DT)); } bool MemorySSAAnalysis::Result::invalidate( diff --git a/lib/Analysis/MemorySSAUpdater.cpp b/lib/Analysis/MemorySSAUpdater.cpp index 4c1feee7fd9a..f2d56b05d968 100644 --- a/lib/Analysis/MemorySSAUpdater.cpp +++ b/lib/Analysis/MemorySSAUpdater.cpp @@ -44,11 +44,15 @@ MemoryAccess *MemorySSAUpdater::getPreviousDefRecursive( // First, do a cache lookup. Without this cache, certain CFG structures // (like a series of if statements) take exponential time to visit. 
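Editorial note: the cache consulted in the lines that follow is what keeps this walk linear; without memoizing a per-block answer, a ladder of N if-then-else diamonds would be revisited on the order of 2^N times. A toy standalone illustration of the same memoization pattern follows; Block and previousDefBlock are invented for the sketch, and the real routine additionally inserts phis when predecessors disagree and copes with cycles.

    #include <unordered_map>
    #include <vector>

    struct Block {
      std::vector<const Block *> Preds;
      bool HasDef = false; // stand-in for "this block contains a MemoryDef"
    };

    // Walk backwards (over an acyclic toy CFG) to the nearest block with a
    // definition, caching the answer per block so repeated join points are
    // visited once instead of once per path.
    const Block *previousDefBlock(
        const Block *BB,
        std::unordered_map<const Block *, const Block *> &Cache) {
      auto It = Cache.find(BB);
      if (It != Cache.end())
        return It->second;
      const Block *Result = nullptr;
      if (BB->HasDef)
        Result = BB;
      else
        for (const Block *Pred : BB->Preds)
          if ((Result = previousDefBlock(Pred, Cache)))
            break; // toy rule: take the first predecessor that reaches a def
      Cache.emplace(BB, Result);
      return Result;
    }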
auto Cached = CachedPreviousDef.find(BB); - if (Cached != CachedPreviousDef.end()) { + if (Cached != CachedPreviousDef.end()) return Cached->second; - } - if (BasicBlock *Pred = BB->getSinglePredecessor()) { + // If this method is called from an unreachable block, return LoE. + if (!MSSA->DT->isReachableFromEntry(BB)) + return MSSA->getLiveOnEntryDef(); + + if (BasicBlock *Pred = BB->getUniquePredecessor()) { + VisitedBlocks.insert(BB); // Single predecessor case, just recurse, we can only have one definition. MemoryAccess *Result = getPreviousDefFromEnd(Pred, CachedPreviousDef); CachedPreviousDef.insert({BB, Result}); @@ -71,11 +75,19 @@ MemoryAccess *MemorySSAUpdater::getPreviousDefRecursive( // Recurse to get the values in our predecessors for placement of a // potential phi node. This will insert phi nodes if we cycle in order to // break the cycle and have an operand. - for (auto *Pred : predecessors(BB)) - if (MSSA->DT->isReachableFromEntry(Pred)) - PhiOps.push_back(getPreviousDefFromEnd(Pred, CachedPreviousDef)); - else + bool UniqueIncomingAccess = true; + MemoryAccess *SingleAccess = nullptr; + for (auto *Pred : predecessors(BB)) { + if (MSSA->DT->isReachableFromEntry(Pred)) { + auto *IncomingAccess = getPreviousDefFromEnd(Pred, CachedPreviousDef); + if (!SingleAccess) + SingleAccess = IncomingAccess; + else if (IncomingAccess != SingleAccess) + UniqueIncomingAccess = false; + PhiOps.push_back(IncomingAccess); + } else PhiOps.push_back(MSSA->getLiveOnEntryDef()); + } // Now try to simplify the ops to avoid placing a phi. // This may return null if we never created a phi yet, that's okay @@ -84,7 +96,15 @@ MemoryAccess *MemorySSAUpdater::getPreviousDefRecursive( // See if we can avoid the phi by simplifying it. auto *Result = tryRemoveTrivialPhi(Phi, PhiOps); // If we couldn't simplify, we may have to create a phi - if (Result == Phi) { + if (Result == Phi && UniqueIncomingAccess && SingleAccess) { + // A concrete Phi only exists if we created an empty one to break a cycle. + if (Phi) { + assert(Phi->operands().empty() && "Expected empty Phi"); + Phi->replaceAllUsesWith(SingleAccess); + removeMemoryAccess(Phi); + } + Result = SingleAccess; + } else if (Result == Phi && !(UniqueIncomingAccess && SingleAccess)) { if (!Phi) Phi = MSSA->createMemoryPhi(BB); @@ -173,12 +193,9 @@ MemoryAccess *MemorySSAUpdater::recursePhi(MemoryAccess *Phi) { TrackingVH Res(Phi); SmallVector, 8> Uses; std::copy(Phi->user_begin(), Phi->user_end(), std::back_inserter(Uses)); - for (auto &U : Uses) { - if (MemoryPhi *UsePhi = dyn_cast(&*U)) { - auto OperRange = UsePhi->operands(); - tryRemoveTrivialPhi(UsePhi, OperRange); - } - } + for (auto &U : Uses) + if (MemoryPhi *UsePhi = dyn_cast(&*U)) + tryRemoveTrivialPhi(UsePhi); return Res; } @@ -187,6 +204,11 @@ MemoryAccess *MemorySSAUpdater::recursePhi(MemoryAccess *Phi) { // argument. // IE phi(a, a) or b = phi(a, b) or c = phi(a, a, c) // We recursively try to remove them. 
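Editorial note: the new single-argument tryRemoveTrivialPhi overload added below forwards to the existing template; the underlying rule is the usual one from on-the-fly SSA construction: a phi whose operands are all either the phi itself or one single value is redundant and can be replaced by that value. A minimal standalone sketch of just that test; Node and trivialPhiReplacement are illustrative, and the recursive cleanup of the phi's users is omitted.

    #include <vector>

    struct Node {
      std::vector<Node *> Operands; // only phis have operands in this toy model
    };

    // Returns the unique non-self operand if the phi is trivial, i.e. of the
    // form phi(a, a), phi(a, phi), phi(a, a, phi), ...; returns nullptr when at
    // least two distinct values flow in and the phi is genuinely needed.
    Node *trivialPhiReplacement(Node *Phi) {
      Node *Same = nullptr;
      for (Node *Op : Phi->Operands) {
        if (Op == Phi || Op == Same)
          continue;       // self-reference or a repeat of the candidate value
        if (Same)
          return nullptr; // two different incoming values
        Same = Op;
      }
      return Same;        // may be nullptr for an operand-less placeholder phi
    }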
+MemoryAccess *MemorySSAUpdater::tryRemoveTrivialPhi(MemoryPhi *Phi) { + assert(Phi && "Can only remove concrete Phi."); + auto OperRange = Phi->operands(); + return tryRemoveTrivialPhi(Phi, OperRange); +} template MemoryAccess *MemorySSAUpdater::tryRemoveTrivialPhi(MemoryPhi *Phi, RangeType &Operands) { @@ -218,17 +240,49 @@ MemoryAccess *MemorySSAUpdater::tryRemoveTrivialPhi(MemoryPhi *Phi, return recursePhi(Same); } -void MemorySSAUpdater::insertUse(MemoryUse *MU) { +void MemorySSAUpdater::insertUse(MemoryUse *MU, bool RenameUses) { InsertedPHIs.clear(); MU->setDefiningAccess(getPreviousDef(MU)); - // Unlike for defs, there is no extra work to do. Because uses do not create - // new may-defs, there are only two cases: - // + + // In cases without unreachable blocks, because uses do not create new + // may-defs, there are only two cases: // 1. There was a def already below us, and therefore, we should not have // created a phi node because it was already needed for the def. // // 2. There is no def below us, and therefore, there is no extra renaming work // to do. + + // In cases with unreachable blocks, where the unnecessary Phis were + // optimized out, adding the Use may re-insert those Phis. Hence, when + // inserting Uses outside of the MSSA creation process, and new Phis were + // added, rename all uses if we are asked. + + if (!RenameUses && !InsertedPHIs.empty()) { + auto *Defs = MSSA->getBlockDefs(MU->getBlock()); + (void)Defs; + assert((!Defs || (++Defs->begin() == Defs->end())) && + "Block may have only a Phi or no defs"); + } + + if (RenameUses && InsertedPHIs.size()) { + SmallPtrSet Visited; + BasicBlock *StartBlock = MU->getBlock(); + + if (auto *Defs = MSSA->getWritableBlockDefs(StartBlock)) { + MemoryAccess *FirstDef = &*Defs->begin(); + // Convert to incoming value if it's a memorydef. A phi *is* already an + // incoming value. + if (auto *MD = dyn_cast(FirstDef)) + FirstDef = MD->getDefiningAccess(); + + MSSA->renamePass(MU->getBlock(), FirstDef, Visited); + } + // We just inserted a phi into this block, so the incoming value will + // become the phi anyway, so it does not matter what we pass. + for (auto &MP : InsertedPHIs) + if (MemoryPhi *Phi = cast_or_null(MP)) + MSSA->renamePass(Phi->getBlock(), nullptr, Visited); + } } // Set every incoming edge {BB, MP->getBlock()} of MemoryPhi MP to NewDef. @@ -260,33 +314,35 @@ void MemorySSAUpdater::insertDef(MemoryDef *MD, bool RenameUses) { // See if we had a local def, and if not, go hunting. MemoryAccess *DefBefore = getPreviousDef(MD); - bool DefBeforeSameBlock = DefBefore->getBlock() == MD->getBlock(); + bool DefBeforeSameBlock = false; + if (DefBefore->getBlock() == MD->getBlock() && + !(isa(DefBefore) && + std::find(InsertedPHIs.begin(), InsertedPHIs.end(), DefBefore) != + InsertedPHIs.end())) + DefBeforeSameBlock = true; // There is a def before us, which means we can replace any store/phi uses // of that thing with us, since we are in the way of whatever was there // before. // We now define that def's memorydefs and memoryphis if (DefBeforeSameBlock) { - for (auto UI = DefBefore->use_begin(), UE = DefBefore->use_end(); - UI != UE;) { - Use &U = *UI++; + DefBefore->replaceUsesWithIf(MD, [MD](Use &U) { // Leave the MemoryUses alone. // Also make sure we skip ourselves to avoid self references. 
- if (isa(U.getUser()) || U.getUser() == MD) - continue; + User *Usr = U.getUser(); + return !isa(Usr) && Usr != MD; // Defs are automatically unoptimized when the user is set to MD below, // because the isOptimized() call will fail to find the same ID. - U.set(MD); - } + }); } // and that def is now our defining access. MD->setDefiningAccess(DefBefore); - // Remember the index where we may insert new phis below. - unsigned NewPhiIndex = InsertedPHIs.size(); - SmallVector FixupList(InsertedPHIs.begin(), InsertedPHIs.end()); + + // Remember the index where we may insert new phis. + unsigned NewPhiIndex = InsertedPHIs.size(); if (!DefBeforeSameBlock) { // If there was a local def before us, we must have the same effect it // did. Because every may-def is the same, any phis/etc we would create, it @@ -302,46 +358,54 @@ void MemorySSAUpdater::insertDef(MemoryDef *MD, bool RenameUses) { // If this is the first def in the block and this insert is in an arbitrary // place, compute IDF and place phis. + SmallPtrSet DefiningBlocks; + + // If this is the last Def in the block, also compute IDF based on MD, since + // this may a new Def added, and we may need additional Phis. auto Iter = MD->getDefsIterator(); ++Iter; auto IterEnd = MSSA->getBlockDefs(MD->getBlock())->end(); - if (Iter == IterEnd) { - ForwardIDFCalculator IDFs(*MSSA->DT); - SmallVector IDFBlocks; - SmallPtrSet DefiningBlocks; + if (Iter == IterEnd) DefiningBlocks.insert(MD->getBlock()); - IDFs.setDefiningBlocks(DefiningBlocks); - IDFs.calculate(IDFBlocks); - SmallVector, 4> NewInsertedPHIs; - for (auto *BBIDF : IDFBlocks) - if (!MSSA->getMemoryAccess(BBIDF)) { - auto *MPhi = MSSA->createMemoryPhi(BBIDF); - NewInsertedPHIs.push_back(MPhi); - // Add the phis created into the IDF blocks to NonOptPhis, so they are - // not optimized out as trivial by the call to getPreviousDefFromEnd - // below. Once they are complete, all these Phis are added to the - // FixupList, and removed from NonOptPhis inside fixupDefs(). - NonOptPhis.insert(MPhi); - } - for (auto &MPhi : NewInsertedPHIs) { - auto *BBIDF = MPhi->getBlock(); - for (auto *Pred : predecessors(BBIDF)) { - DenseMap> CachedPreviousDef; - MPhi->addIncoming(getPreviousDefFromEnd(Pred, CachedPreviousDef), - Pred); - } + for (const auto &VH : InsertedPHIs) + if (const auto *RealPHI = cast_or_null(VH)) + DefiningBlocks.insert(RealPHI->getBlock()); + ForwardIDFCalculator IDFs(*MSSA->DT); + SmallVector IDFBlocks; + IDFs.setDefiningBlocks(DefiningBlocks); + IDFs.calculate(IDFBlocks); + SmallVector, 4> NewInsertedPHIs; + for (auto *BBIDF : IDFBlocks) { + auto *MPhi = MSSA->getMemoryAccess(BBIDF); + if (!MPhi) { + MPhi = MSSA->createMemoryPhi(BBIDF); + NewInsertedPHIs.push_back(MPhi); } - - // Re-take the index where we're adding the new phis, because the above - // call to getPreviousDefFromEnd, may have inserted into InsertedPHIs. - NewPhiIndex = InsertedPHIs.size(); - for (auto &MPhi : NewInsertedPHIs) { - InsertedPHIs.push_back(&*MPhi); - FixupList.push_back(&*MPhi); + // Add the phis created into the IDF blocks to NonOptPhis, so they are not + // optimized out as trivial by the call to getPreviousDefFromEnd below. + // Once they are complete, all these Phis are added to the FixupList, and + // removed from NonOptPhis inside fixupDefs(). Existing Phis in IDF may + // need fixing as well, and potentially be trivial before this insertion, + // hence add all IDF Phis. See PR43044. 
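Editorial note: the phis added in this hunk are placed using the iterated dominance frontier of the defining blocks, as computed by ForwardIDFCalculator. As a standalone reminder of what that computes, the textbook worklist formulation is sketched below, taking a precomputed dominance-frontier map as input; BlockName, DomFrontier, and iteratedDominanceFrontier are hypothetical helpers, not the LLVM calculator.

    #include <map>
    #include <set>
    #include <string>
    #include <vector>

    using BlockName = std::string;
    using DomFrontier = std::map<BlockName, std::set<BlockName>>;

    // Iterated dominance frontier: keep adding the frontier of newly placed phi
    // blocks until the set stabilizes; those blocks are where phis are required.
    std::set<BlockName> iteratedDominanceFrontier(const std::set<BlockName> &Defs,
                                                  const DomFrontier &DF) {
      std::set<BlockName> PhiBlocks;
      std::vector<BlockName> Worklist(Defs.begin(), Defs.end());
      while (!Worklist.empty()) {
        BlockName B = Worklist.back();
        Worklist.pop_back();
        auto It = DF.find(B);
        if (It == DF.end())
          continue;
        for (const BlockName &F : It->second)
          if (PhiBlocks.insert(F).second) // a newly placed phi is itself a def
            Worklist.push_back(F);
      }
      return PhiBlocks;
    }
    // For a diamond entry -> {then, else} -> join with a def in "then",
    // DF["then"] = {"join"}, so the phi lands in "join".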
+ NonOptPhis.insert(MPhi); + } + for (auto &MPhi : NewInsertedPHIs) { + auto *BBIDF = MPhi->getBlock(); + for (auto *Pred : predecessors(BBIDF)) { + DenseMap> CachedPreviousDef; + MPhi->addIncoming(getPreviousDefFromEnd(Pred, CachedPreviousDef), Pred); } } + // Re-take the index where we're adding the new phis, because the above call + // to getPreviousDefFromEnd, may have inserted into InsertedPHIs. + NewPhiIndex = InsertedPHIs.size(); + for (auto &MPhi : NewInsertedPHIs) { + InsertedPHIs.push_back(&*MPhi); + FixupList.push_back(&*MPhi); + } + FixupList.push_back(MD); } @@ -458,8 +522,7 @@ void MemorySSAUpdater::fixupDefs(const SmallVectorImpl &Vars) { void MemorySSAUpdater::removeEdge(BasicBlock *From, BasicBlock *To) { if (MemoryPhi *MPhi = MSSA->getMemoryAccess(To)) { MPhi->unorderedDeleteIncomingBlock(From); - if (MPhi->getNumIncomingValues() == 1) - removeMemoryAccess(MPhi); + tryRemoveTrivialPhi(MPhi); } } @@ -475,34 +538,51 @@ void MemorySSAUpdater::removeDuplicatePhiEdgesBetween(const BasicBlock *From, Found = true; return false; }); - if (MPhi->getNumIncomingValues() == 1) - removeMemoryAccess(MPhi); + tryRemoveTrivialPhi(MPhi); + } +} + +static MemoryAccess *getNewDefiningAccessForClone(MemoryAccess *MA, + const ValueToValueMapTy &VMap, + PhiToDefMap &MPhiMap, + bool CloneWasSimplified, + MemorySSA *MSSA) { + MemoryAccess *InsnDefining = MA; + if (MemoryDef *DefMUD = dyn_cast(InsnDefining)) { + if (!MSSA->isLiveOnEntryDef(DefMUD)) { + Instruction *DefMUDI = DefMUD->getMemoryInst(); + assert(DefMUDI && "Found MemoryUseOrDef with no Instruction."); + if (Instruction *NewDefMUDI = + cast_or_null(VMap.lookup(DefMUDI))) { + InsnDefining = MSSA->getMemoryAccess(NewDefMUDI); + if (!CloneWasSimplified) + assert(InsnDefining && "Defining instruction cannot be nullptr."); + else if (!InsnDefining || isa(InsnDefining)) { + // The clone was simplified, it's no longer a MemoryDef, look up. + auto DefIt = DefMUD->getDefsIterator(); + // Since simplified clones only occur in single block cloning, a + // previous definition must exist, otherwise NewDefMUDI would not + // have been found in VMap. 
+ assert(DefIt != MSSA->getBlockDefs(DefMUD->getBlock())->begin() && + "Previous def must exist"); + InsnDefining = getNewDefiningAccessForClone( + &*(--DefIt), VMap, MPhiMap, CloneWasSimplified, MSSA); + } + } + } + } else { + MemoryPhi *DefPhi = cast(InsnDefining); + if (MemoryAccess *NewDefPhi = MPhiMap.lookup(DefPhi)) + InsnDefining = NewDefPhi; } + assert(InsnDefining && "Defining instruction cannot be nullptr."); + return InsnDefining; } void MemorySSAUpdater::cloneUsesAndDefs(BasicBlock *BB, BasicBlock *NewBB, const ValueToValueMapTy &VMap, PhiToDefMap &MPhiMap, bool CloneWasSimplified) { - auto GetNewDefiningAccess = [&](MemoryAccess *MA) -> MemoryAccess * { - MemoryAccess *InsnDefining = MA; - if (MemoryUseOrDef *DefMUD = dyn_cast(InsnDefining)) { - if (!MSSA->isLiveOnEntryDef(DefMUD)) { - Instruction *DefMUDI = DefMUD->getMemoryInst(); - assert(DefMUDI && "Found MemoryUseOrDef with no Instruction."); - if (Instruction *NewDefMUDI = - cast_or_null(VMap.lookup(DefMUDI))) - InsnDefining = MSSA->getMemoryAccess(NewDefMUDI); - } - } else { - MemoryPhi *DefPhi = cast(InsnDefining); - if (MemoryAccess *NewDefPhi = MPhiMap.lookup(DefPhi)) - InsnDefining = NewDefPhi; - } - assert(InsnDefining && "Defining instruction cannot be nullptr."); - return InsnDefining; - }; - const MemorySSA::AccessList *Acc = MSSA->getBlockAccesses(BB); if (!Acc) return; @@ -519,9 +599,13 @@ void MemorySSAUpdater::cloneUsesAndDefs(BasicBlock *BB, BasicBlock *NewBB, if (Instruction *NewInsn = dyn_cast_or_null(VMap.lookup(Insn))) { MemoryAccess *NewUseOrDef = MSSA->createDefinedAccess( - NewInsn, GetNewDefiningAccess(MUD->getDefiningAccess()), - CloneWasSimplified ? nullptr : MUD); - MSSA->insertIntoListsForBlock(NewUseOrDef, NewBB, MemorySSA::End); + NewInsn, + getNewDefiningAccessForClone(MUD->getDefiningAccess(), VMap, + MPhiMap, CloneWasSimplified, MSSA), + /*Template=*/CloneWasSimplified ? nullptr : MUD, + /*CreationMustSucceed=*/CloneWasSimplified ? false : true); + if (NewUseOrDef) + MSSA->insertIntoListsForBlock(NewUseOrDef, NewBB, MemorySSA::End); } } } @@ -563,8 +647,7 @@ void MemorySSAUpdater::updatePhisWhenInsertingUniqueBackedgeBlock( // If NewMPhi is a trivial phi, remove it. Its use in the header MPhi will be // replaced with the unique value. - if (HasUniqueIncomingValue) - removeMemoryAccess(NewMPhi); + tryRemoveTrivialPhi(NewMPhi); } void MemorySSAUpdater::updateForClonedLoop(const LoopBlocksRPO &LoopBlocks, @@ -770,6 +853,9 @@ void MemorySSAUpdater::applyInsertUpdates(ArrayRef Updates, } else { // Single predecessor, BB cannot be dead. GetLastDef of Pred. assert(Count == 1 && Pred && "Single predecessor expected."); + // BB can be unreachable though, return LoE if that is the case. + if (!DT.getNode(BB)) + return MSSA->getLiveOnEntryDef(); BB = Pred; } }; @@ -1010,7 +1096,7 @@ void MemorySSAUpdater::applyInsertUpdates(ArrayRef Updates, for (; UI != E;) { Use &U = *UI; ++UI; - MemoryAccess *Usr = dyn_cast(U.getUser()); + MemoryAccess *Usr = cast(U.getUser()); if (MemoryPhi *UsrPhi = dyn_cast(Usr)) { BasicBlock *DominatedBlock = UsrPhi->getIncomingBlock(U); if (!DT.dominates(DominatingBlock, DominatedBlock)) @@ -1052,9 +1138,9 @@ void MemorySSAUpdater::moveTo(MemoryUseOrDef *What, BasicBlock *BB, // Now reinsert it into the IR and do whatever fixups needed. if (auto *MD = dyn_cast(What)) - insertDef(MD); + insertDef(MD, /*RenameUses=*/true); else - insertUse(cast(What)); + insertUse(cast(What), /*RenameUses=*/true); // Clear dangling pointers. 
We added all MemoryPhi users, but not all // of them are removed by fixupDefs(). @@ -1084,25 +1170,32 @@ void MemorySSAUpdater::moveAllAccesses(BasicBlock *From, BasicBlock *To, if (!Accs) return; + assert(Start->getParent() == To && "Incorrect Start instruction"); MemoryAccess *FirstInNew = nullptr; for (Instruction &I : make_range(Start->getIterator(), To->end())) if ((FirstInNew = MSSA->getMemoryAccess(&I))) break; - if (!FirstInNew) - return; + if (FirstInNew) { + auto *MUD = cast(FirstInNew); + do { + auto NextIt = ++MUD->getIterator(); + MemoryUseOrDef *NextMUD = (!Accs || NextIt == Accs->end()) + ? nullptr + : cast(&*NextIt); + MSSA->moveTo(MUD, To, MemorySSA::End); + // Moving MUD from Accs in the moveTo above, may delete Accs, so we need + // to retrieve it again. + Accs = MSSA->getWritableBlockAccesses(From); + MUD = NextMUD; + } while (MUD); + } - auto *MUD = cast(FirstInNew); - do { - auto NextIt = ++MUD->getIterator(); - MemoryUseOrDef *NextMUD = (!Accs || NextIt == Accs->end()) - ? nullptr - : cast(&*NextIt); - MSSA->moveTo(MUD, To, MemorySSA::End); - // Moving MUD from Accs in the moveTo above, may delete Accs, so we need to - // retrieve it again. - Accs = MSSA->getWritableBlockAccesses(From); - MUD = NextMUD; - } while (MUD); + // If all accesses were moved and only a trivial Phi remains, we try to remove + // that Phi. This is needed when From is going to be deleted. + auto *Defs = MSSA->getWritableBlockDefs(From); + if (Defs && !Defs->empty()) + if (auto *Phi = dyn_cast(&*Defs->begin())) + tryRemoveTrivialPhi(Phi); } void MemorySSAUpdater::moveAllAfterSpliceBlocks(BasicBlock *From, @@ -1118,7 +1211,7 @@ void MemorySSAUpdater::moveAllAfterSpliceBlocks(BasicBlock *From, void MemorySSAUpdater::moveAllAfterMergeBlocks(BasicBlock *From, BasicBlock *To, Instruction *Start) { - assert(From->getSinglePredecessor() == To && + assert(From->getUniquePredecessor() == To && "From block is expected to have a single predecessor (To)."); moveAllAccesses(From, To, Start); for (BasicBlock *Succ : successors(From)) @@ -1173,8 +1266,7 @@ void MemorySSAUpdater::wireOldPredecessorsToNewImmediatePredecessor( return false; }); Phi->addIncoming(NewPhi, New); - if (onlySingleValue(NewPhi)) - removeMemoryAccess(NewPhi); + tryRemoveTrivialPhi(NewPhi); } } @@ -1239,10 +1331,8 @@ void MemorySSAUpdater::removeMemoryAccess(MemoryAccess *MA, bool OptimizePhis) { unsigned PhisSize = PhisToOptimize.size(); while (PhisSize-- > 0) if (MemoryPhi *MP = - cast_or_null(PhisToOptimize.pop_back_val())) { - auto OperRange = MP->operands(); - tryRemoveTrivialPhi(MP, OperRange); - } + cast_or_null(PhisToOptimize.pop_back_val())) + tryRemoveTrivialPhi(MP); } } @@ -1256,8 +1346,7 @@ void MemorySSAUpdater::removeBlocks( if (!DeadBlocks.count(Succ)) if (MemoryPhi *MP = MSSA->getMemoryAccess(Succ)) { MP->unorderedDeleteIncomingBlock(BB); - if (MP->getNumIncomingValues() == 1) - removeMemoryAccess(MP); + tryRemoveTrivialPhi(MP); } // Drop all references of all accesses in BB if (MemorySSA::AccessList *Acc = MSSA->getWritableBlockAccesses(BB)) @@ -1281,10 +1370,8 @@ void MemorySSAUpdater::removeBlocks( void MemorySSAUpdater::tryRemoveTrivialPhis(ArrayRef UpdatedPHIs) { for (auto &VH : UpdatedPHIs) - if (auto *MPhi = cast_or_null(VH)) { - auto OperRange = MPhi->operands(); - tryRemoveTrivialPhi(MPhi, OperRange); - } + if (auto *MPhi = cast_or_null(VH)) + tryRemoveTrivialPhi(MPhi); } void MemorySSAUpdater::changeToUnreachable(const Instruction *I) { diff --git a/lib/Analysis/ModuleSummaryAnalysis.cpp 
b/lib/Analysis/ModuleSummaryAnalysis.cpp index e25eb290a665..8232bf07cafc 100644 --- a/lib/Analysis/ModuleSummaryAnalysis.cpp +++ b/lib/Analysis/ModuleSummaryAnalysis.cpp @@ -319,7 +319,7 @@ static void computeFunctionSummary(ModuleSummaryIndex &Index, const Module &M, auto *CalledValue = CS.getCalledValue(); auto *CalledFunction = CS.getCalledFunction(); if (CalledValue && !CalledFunction) { - CalledValue = CalledValue->stripPointerCastsNoFollowAliases(); + CalledValue = CalledValue->stripPointerCasts(); // Stripping pointer casts can reveal a called function. CalledFunction = dyn_cast(CalledValue); } @@ -467,7 +467,7 @@ static void computeFunctionSummary(ModuleSummaryIndex &Index, const Module &M, // FIXME: refactor this to use the same code that inliner is using. // Don't try to import functions with noinline attribute. F.getAttributes().hasFnAttribute(Attribute::NoInline)}; - auto FuncSummary = llvm::make_unique( + auto FuncSummary = std::make_unique( Flags, NumInsts, FunFlags, /*EntryCount=*/0, std::move(Refs), CallGraphEdges.takeVector(), TypeTests.takeVector(), TypeTestAssumeVCalls.takeVector(), TypeCheckedLoadVCalls.takeVector(), @@ -598,7 +598,7 @@ static void computeVariableSummary(ModuleSummaryIndex &Index, !V.hasComdat() && !V.hasAppendingLinkage() && !V.isInterposable() && !V.hasAvailableExternallyLinkage() && !V.hasDLLExportStorageClass(); GlobalVarSummary::GVarFlags VarFlags(CanBeInternalized, CanBeInternalized); - auto GVarSummary = llvm::make_unique(Flags, VarFlags, + auto GVarSummary = std::make_unique(Flags, VarFlags, RefEdges.takeVector()); if (NonRenamableLocal) CantBePromoted.insert(V.getGUID()); @@ -616,7 +616,7 @@ computeAliasSummary(ModuleSummaryIndex &Index, const GlobalAlias &A, GlobalValueSummary::GVFlags Flags(A.getLinkage(), NonRenamableLocal, /* Live = */ false, A.isDSOLocal(), A.hasLinkOnceODRLinkage() && A.hasGlobalUnnamedAddr()); - auto AS = llvm::make_unique(Flags); + auto AS = std::make_unique(Flags); auto *Aliasee = A.getBaseObject(); auto AliaseeVI = Index.getValueInfo(Aliasee->getGUID()); assert(AliaseeVI && "Alias expects aliasee summary to be available"); @@ -696,7 +696,7 @@ ModuleSummaryIndex llvm::buildModuleSummaryIndex( // Create the appropriate summary type. 
if (Function *F = dyn_cast(GV)) { std::unique_ptr Summary = - llvm::make_unique( + std::make_unique( GVFlags, /*InstCount=*/0, FunctionSummary::FFlags{ F->hasFnAttribute(Attribute::ReadNone), @@ -714,7 +714,7 @@ ModuleSummaryIndex llvm::buildModuleSummaryIndex( Index.addGlobalValueSummary(*GV, std::move(Summary)); } else { std::unique_ptr Summary = - llvm::make_unique( + std::make_unique( GVFlags, GlobalVarSummary::GVarFlags(false, false), ArrayRef{}); Index.addGlobalValueSummary(*GV, std::move(Summary)); @@ -741,7 +741,7 @@ ModuleSummaryIndex llvm::buildModuleSummaryIndex( else if (F.hasProfileData()) { LoopInfo LI{DT}; BranchProbabilityInfo BPI{F, LI}; - BFIPtr = llvm::make_unique(F, BPI, LI); + BFIPtr = std::make_unique(F, BPI, LI); BFI = BFIPtr.get(); } @@ -813,7 +813,7 @@ ModuleSummaryIndex llvm::buildModuleSummaryIndex( if (!ModuleSummaryDotFile.empty()) { std::error_code EC; - raw_fd_ostream OSDot(ModuleSummaryDotFile, EC, sys::fs::OpenFlags::F_None); + raw_fd_ostream OSDot(ModuleSummaryDotFile, EC, sys::fs::OpenFlags::OF_None); if (EC) report_fatal_error(Twine("Failed to open dot file ") + ModuleSummaryDotFile + ": " + EC.message() + "\n"); diff --git a/lib/Analysis/MustExecute.cpp b/lib/Analysis/MustExecute.cpp index b616cd6f762b..44527773115d 100644 --- a/lib/Analysis/MustExecute.cpp +++ b/lib/Analysis/MustExecute.cpp @@ -7,6 +7,8 @@ //===----------------------------------------------------------------------===// #include "llvm/Analysis/MustExecute.h" +#include "llvm/ADT/PostOrderIterator.h" +#include "llvm/Analysis/CFG.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/Passes.h" @@ -19,8 +21,11 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/FormattedStream.h" #include "llvm/Support/raw_ostream.h" + using namespace llvm; +#define DEBUG_TYPE "must-execute" + const DenseMap & LoopSafetyInfo::getBlockColors() const { return BlockColors; @@ -306,6 +311,17 @@ namespace { } bool runOnFunction(Function &F) override; }; + struct MustBeExecutedContextPrinter : public ModulePass { + static char ID; + + MustBeExecutedContextPrinter() : ModulePass(ID) { + initializeMustBeExecutedContextPrinterPass(*PassRegistry::getPassRegistry()); + } + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesAll(); + } + bool runOnModule(Module &M) override; + }; } char MustExecutePrinter::ID = 0; @@ -320,6 +336,36 @@ FunctionPass *llvm::createMustExecutePrinter() { return new MustExecutePrinter(); } +char MustBeExecutedContextPrinter::ID = 0; +INITIALIZE_PASS_BEGIN( + MustBeExecutedContextPrinter, "print-must-be-executed-contexts", + "print the must-be-executed-contexed for all instructions", false, true) +INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) +INITIALIZE_PASS_END(MustBeExecutedContextPrinter, + "print-must-be-executed-contexts", + "print the must-be-executed-contexed for all instructions", + false, true) + +ModulePass *llvm::createMustBeExecutedContextPrinter() { + return new MustBeExecutedContextPrinter(); +} + +bool MustBeExecutedContextPrinter::runOnModule(Module &M) { + MustBeExecutedContextExplorer Explorer(true); + for (Function &F : M) { + for (Instruction &I : instructions(F)) { + dbgs() << "-- Explore context of: " << I << "\n"; + for (const Instruction *CI : Explorer.range(&I)) + dbgs() << " [F: " << CI->getFunction()->getName() << "] " << *CI + << "\n"; + } + } + + return false; 
+} + static bool isMustExecuteIn(const Instruction &I, Loop *L, DominatorTree *DT) { // TODO: merge these two routines. For the moment, we display the best // result obtained by *either* implementation. This is a bit unfair since no @@ -396,3 +442,75 @@ bool MustExecutePrinter::runOnFunction(Function &F) { return false; } + +const Instruction * +MustBeExecutedContextExplorer::getMustBeExecutedNextInstruction( + MustBeExecutedIterator &It, const Instruction *PP) { + if (!PP) + return PP; + LLVM_DEBUG(dbgs() << "Find next instruction for " << *PP << "\n"); + + // If we explore only inside a given basic block we stop at terminators. + if (!ExploreInterBlock && PP->isTerminator()) { + LLVM_DEBUG(dbgs() << "\tReached terminator in intra-block mode, done\n"); + return nullptr; + } + + // If we do not traverse the call graph we check if we can make progress in + // the current function. First, check if the instruction is guaranteed to + // transfer execution to the successor. + bool TransfersExecution = isGuaranteedToTransferExecutionToSuccessor(PP); + if (!TransfersExecution) + return nullptr; + + // If this is not a terminator we know that there is a single instruction + // after this one that is executed next if control is transfered. If not, + // we can try to go back to a call site we entered earlier. If none exists, we + // do not know any instruction that has to be executd next. + if (!PP->isTerminator()) { + const Instruction *NextPP = PP->getNextNode(); + LLVM_DEBUG(dbgs() << "\tIntermediate instruction does transfer control\n"); + return NextPP; + } + + // Finally, we have to handle terminators, trivial ones first. + assert(PP->isTerminator() && "Expected a terminator!"); + + // A terminator without a successor is not handled yet. + if (PP->getNumSuccessors() == 0) { + LLVM_DEBUG(dbgs() << "\tUnhandled terminator\n"); + return nullptr; + } + + // A terminator with a single successor, we will continue at the beginning of + // that one. + if (PP->getNumSuccessors() == 1) { + LLVM_DEBUG( + dbgs() << "\tUnconditional terminator, continue with successor\n"); + return &PP->getSuccessor(0)->front(); + } + + LLVM_DEBUG(dbgs() << "\tNo join point found\n"); + return nullptr; +} + +MustBeExecutedIterator::MustBeExecutedIterator( + MustBeExecutedContextExplorer &Explorer, const Instruction *I) + : Explorer(Explorer), CurInst(I) { + reset(I); +} + +void MustBeExecutedIterator::reset(const Instruction *I) { + CurInst = I; + Visited.clear(); + Visited.insert(I); +} + +const Instruction *MustBeExecutedIterator::advance() { + assert(CurInst && "Cannot advance an end iterator!"); + const Instruction *Next = + Explorer.getMustBeExecutedNextInstruction(*this, CurInst); + if (Next && !Visited.insert(Next).second) + Next = nullptr; + return Next; +} diff --git a/lib/Analysis/OptimizationRemarkEmitter.cpp b/lib/Analysis/OptimizationRemarkEmitter.cpp index 72c40a0be232..07a5619a35b9 100644 --- a/lib/Analysis/OptimizationRemarkEmitter.cpp +++ b/lib/Analysis/OptimizationRemarkEmitter.cpp @@ -39,7 +39,7 @@ OptimizationRemarkEmitter::OptimizationRemarkEmitter(const Function *F) BPI.calculate(*F, LI); // Finally compute BFI. 
- OwnedBFI = llvm::make_unique(*F, BPI, LI); + OwnedBFI = std::make_unique(*F, BPI, LI); BFI = OwnedBFI.get(); } @@ -97,7 +97,7 @@ bool OptimizationRemarkEmitterWrapperPass::runOnFunction(Function &Fn) { else BFI = nullptr; - ORE = llvm::make_unique(&Fn, BFI); + ORE = std::make_unique(&Fn, BFI); return false; } diff --git a/lib/Analysis/OrderedInstructions.cpp b/lib/Analysis/OrderedInstructions.cpp index 458c0a7de6c2..e947e5e388a8 100644 --- a/lib/Analysis/OrderedInstructions.cpp +++ b/lib/Analysis/OrderedInstructions.cpp @@ -21,7 +21,7 @@ bool OrderedInstructions::localDominates(const Instruction *InstA, const BasicBlock *IBB = InstA->getParent(); auto OBB = OBBMap.find(IBB); if (OBB == OBBMap.end()) - OBB = OBBMap.insert({IBB, make_unique(IBB)}).first; + OBB = OBBMap.insert({IBB, std::make_unique(IBB)}).first; return OBB->second->dominates(InstA, InstB); } diff --git a/lib/Analysis/ProfileSummaryInfo.cpp b/lib/Analysis/ProfileSummaryInfo.cpp index dce19d6d546e..b99b75715025 100644 --- a/lib/Analysis/ProfileSummaryInfo.cpp +++ b/lib/Analysis/ProfileSummaryInfo.cpp @@ -45,6 +45,13 @@ static cl::opt ProfileSummaryHugeWorkingSetSizeThreshold( " blocks required to reach the -profile-summary-cutoff-hot" " percentile exceeds this count.")); +static cl::opt ProfileSummaryLargeWorkingSetSizeThreshold( + "profile-summary-large-working-set-size-threshold", cl::Hidden, + cl::init(12500), cl::ZeroOrMore, + cl::desc("The code working set size is considered large if the number of" + " blocks required to reach the -profile-summary-cutoff-hot" + " percentile exceeds this count.")); + // The next two options override the counts derived from summary computation and // are useful for debugging purposes. static cl::opt ProfileSummaryHotCount( @@ -186,6 +193,31 @@ bool ProfileSummaryInfo::isFunctionColdInCallGraph(const Function *F, return true; } +// Like isFunctionHotInCallGraph but for a given cutoff. +bool ProfileSummaryInfo::isFunctionHotInCallGraphNthPercentile( + int PercentileCutoff, const Function *F, BlockFrequencyInfo &BFI) { + if (!F || !computeSummary()) + return false; + if (auto FunctionCount = F->getEntryCount()) + if (isHotCountNthPercentile(PercentileCutoff, FunctionCount.getCount())) + return true; + + if (hasSampleProfile()) { + uint64_t TotalCallCount = 0; + for (const auto &BB : *F) + for (const auto &I : BB) + if (isa(I) || isa(I)) + if (auto CallCount = getProfileCount(&I, nullptr)) + TotalCallCount += CallCount.getValue(); + if (isHotCountNthPercentile(PercentileCutoff, TotalCallCount)) + return true; + } + for (const auto &BB : *F) + if (isHotBlockNthPercentile(PercentileCutoff, &BB, &BFI)) + return true; + return false; +} + /// Returns true if the function's entry is a cold. If it returns false, it /// either means it is not cold or it is unknown whether it is cold or not (for /// example, no profile data is available). 
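The percentile-based queries added in this file (isFunctionHotInCallGraphNthPercentile, isHotCountNthPercentile, isHotBlockNthPercentile) take a caller-supplied cutoff instead of the fixed -profile-summary-cutoff-hot value. A minimal usage sketch follows; it is not part of this change, and the 990000 cutoff (99%, in the parts-per-million scale used by the profile summary) and the helper name are illustrative only.

#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/IR/Function.h"

// Returns true when F is hot at the chosen percentile of the profile summary.
// Per the implementation above, this returns false when no summary exists.
static bool isHotAt99thPercentile(llvm::ProfileSummaryInfo &PSI,
                                  llvm::BlockFrequencyInfo &BFI,
                                  const llvm::Function &F) {
  return PSI.isFunctionHotInCallGraphNthPercentile(/*PercentileCutoff=*/990000,
                                                   &F, BFI);
}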
@@ -222,6 +254,23 @@ void ProfileSummaryInfo::computeThresholds() { "Cold count threshold cannot exceed hot count threshold!"); HasHugeWorkingSetSize = HotEntry.NumCounts > ProfileSummaryHugeWorkingSetSizeThreshold; + HasLargeWorkingSetSize = + HotEntry.NumCounts > ProfileSummaryLargeWorkingSetSizeThreshold; +} + +Optional ProfileSummaryInfo::computeThreshold(int PercentileCutoff) { + if (!computeSummary()) + return None; + auto iter = ThresholdCache.find(PercentileCutoff); + if (iter != ThresholdCache.end()) { + return iter->second; + } + auto &DetailedSummary = Summary->getDetailedSummary(); + auto &Entry = + getEntryForPercentile(DetailedSummary, PercentileCutoff); + uint64_t CountThreshold = Entry.MinCount; + ThresholdCache[PercentileCutoff] = CountThreshold; + return CountThreshold; } bool ProfileSummaryInfo::hasHugeWorkingSetSize() { @@ -230,6 +279,12 @@ bool ProfileSummaryInfo::hasHugeWorkingSetSize() { return HasHugeWorkingSetSize && HasHugeWorkingSetSize.getValue(); } +bool ProfileSummaryInfo::hasLargeWorkingSetSize() { + if (!HasLargeWorkingSetSize) + computeThresholds(); + return HasLargeWorkingSetSize && HasLargeWorkingSetSize.getValue(); +} + bool ProfileSummaryInfo::isHotCount(uint64_t C) { if (!HotCountThreshold) computeThresholds(); @@ -242,6 +297,11 @@ bool ProfileSummaryInfo::isColdCount(uint64_t C) { return ColdCountThreshold && C <= ColdCountThreshold.getValue(); } +bool ProfileSummaryInfo::isHotCountNthPercentile(int PercentileCutoff, uint64_t C) { + auto CountThreshold = computeThreshold(PercentileCutoff); + return CountThreshold && C >= CountThreshold.getValue(); +} + uint64_t ProfileSummaryInfo::getOrCompHotCountThreshold() { if (!HotCountThreshold) computeThresholds(); @@ -265,6 +325,13 @@ bool ProfileSummaryInfo::isColdBlock(const BasicBlock *BB, return Count && isColdCount(*Count); } +bool ProfileSummaryInfo::isHotBlockNthPercentile(int PercentileCutoff, + const BasicBlock *BB, + BlockFrequencyInfo *BFI) { + auto Count = BFI->getBlockProfileCount(BB); + return Count && isHotCountNthPercentile(PercentileCutoff, *Count); +} + bool ProfileSummaryInfo::isHotCallSite(const CallSite &CS, BlockFrequencyInfo *BFI) { auto C = getProfileCount(CS.getInstruction(), BFI); diff --git a/lib/Analysis/ScalarEvolution.cpp b/lib/Analysis/ScalarEvolution.cpp index bc2cfd6fcc42..5ce0a1adeaa0 100644 --- a/lib/Analysis/ScalarEvolution.cpp +++ b/lib/Analysis/ScalarEvolution.cpp @@ -148,6 +148,7 @@ STATISTIC(NumBruteForceTripCountsComputed, static cl::opt MaxBruteForceIterations("scalar-evolution-max-iterations", cl::ReallyHidden, + cl::ZeroOrMore, cl::desc("Maximum number of iterations SCEV will " "symbolically execute a constant " "derived loop"), @@ -157,6 +158,9 @@ MaxBruteForceIterations("scalar-evolution-max-iterations", cl::ReallyHidden, static cl::opt VerifySCEV( "verify-scev", cl::Hidden, cl::desc("Verify ScalarEvolution's backedge taken counts (slow)")); +static cl::opt VerifySCEVStrict( + "verify-scev-strict", cl::Hidden, + cl::desc("Enable stricter verification with -verify-scev is passed")); static cl::opt VerifySCEVMap("verify-scev-maps", cl::Hidden, cl::desc("Verify no dangling value in ScalarEvolution's " @@ -1707,7 +1711,7 @@ ScalarEvolution::getZeroExtendExpr(const SCEV *Op, Type *Ty, unsigned Depth) { // in infinite recursion. In the later case, the analysis code will // cope with a conservative value, and it will take care to purge // that value once it has finished. 
- const SCEV *MaxBECount = getMaxBackedgeTakenCount(L); + const SCEV *MaxBECount = getConstantMaxBackedgeTakenCount(L); if (!isa(MaxBECount)) { // Manually compute the final value for AR, checking for // overflow. @@ -2051,7 +2055,7 @@ ScalarEvolution::getSignExtendExpr(const SCEV *Op, Type *Ty, unsigned Depth) { // in infinite recursion. In the later case, the analysis code will // cope with a conservative value, and it will take care to purge // that value once it has finished. - const SCEV *MaxBECount = getMaxBackedgeTakenCount(L); + const SCEV *MaxBECount = getConstantMaxBackedgeTakenCount(L); if (!isa(MaxBECount)) { // Manually compute the final value for AR, checking for // overflow. @@ -3421,7 +3425,7 @@ ScalarEvolution::getAddRecExpr(SmallVectorImpl &Operands, return getAddRecExpr(Operands, L, SCEV::FlagAnyWrap); // {X,+,0} --> X } - // It's tempting to want to call getMaxBackedgeTakenCount count here and + // It's tempting to want to call getConstantMaxBackedgeTakenCount count here and // use that information to infer NUW and NSW flags. However, computing a // BE count requires calling getAddRecExpr, so we may not yet have a // meaningful BE count at this point (and if we don't, we'd be stuck @@ -4991,7 +4995,7 @@ const SCEV *ScalarEvolution::createSimpleAffineAddRec(PHINode *PN, // overflow. if (auto *BEInst = dyn_cast(BEValueV)) if (isLoopInvariant(Accum, L) && isAddRecNeverPoison(BEInst, L)) - (void)getAddRecExpr(getAddExpr(StartVal, Accum, Flags), Accum, L, Flags); + (void)getAddRecExpr(getAddExpr(StartVal, Accum), Accum, L, Flags); return PHISCEV; } @@ -5596,6 +5600,22 @@ ScalarEvolution::getRangeRef(const SCEV *S, ConservativeResult.intersectWith(X, RangeType)); } + if (const SCEVSMinExpr *SMin = dyn_cast(S)) { + ConstantRange X = getRangeRef(SMin->getOperand(0), SignHint); + for (unsigned i = 1, e = SMin->getNumOperands(); i != e; ++i) + X = X.smin(getRangeRef(SMin->getOperand(i), SignHint)); + return setRange(SMin, SignHint, + ConservativeResult.intersectWith(X, RangeType)); + } + + if (const SCEVUMinExpr *UMin = dyn_cast(S)) { + ConstantRange X = getRangeRef(UMin->getOperand(0), SignHint); + for (unsigned i = 1, e = UMin->getNumOperands(); i != e; ++i) + X = X.umin(getRangeRef(UMin->getOperand(i), SignHint)); + return setRange(UMin, SignHint, + ConservativeResult.intersectWith(X, RangeType)); + } + if (const SCEVUDivExpr *UDiv = dyn_cast(S)) { ConstantRange X = getRangeRef(UDiv->getLHS(), SignHint); ConstantRange Y = getRangeRef(UDiv->getRHS(), SignHint); @@ -5654,7 +5674,7 @@ ScalarEvolution::getRangeRef(const SCEV *S, // TODO: non-affine addrec if (AddRec->isAffine()) { - const SCEV *MaxBECount = getMaxBackedgeTakenCount(AddRec->getLoop()); + const SCEV *MaxBECount = getConstantMaxBackedgeTakenCount(AddRec->getLoop()); if (!isa(MaxBECount) && getTypeSizeInBits(MaxBECount->getType()) <= BitWidth) { auto RangeFromAffine = getRangeForAffineAR( @@ -6523,7 +6543,7 @@ unsigned ScalarEvolution::getSmallConstantTripCount(const Loop *L, unsigned ScalarEvolution::getSmallConstantMaxTripCount(const Loop *L) { const auto *MaxExitCount = - dyn_cast(getMaxBackedgeTakenCount(L)); + dyn_cast(getConstantMaxBackedgeTakenCount(L)); return getConstantTripCount(MaxExitCount); } @@ -6599,7 +6619,7 @@ const SCEV *ScalarEvolution::getBackedgeTakenCount(const Loop *L) { /// Similar to getBackedgeTakenCount, except return the least SCEV value that is /// known never to be less than the actual backedge taken count. 
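The accessor documented just above is renamed to getConstantMaxBackedgeTakenCount in the hunk that follows, spelling out that it returns a constant upper bound on the backedge-taken count (or SCEVCouldNotCompute). A hedged usage sketch, not taken from this patch, with an illustrative helper name:

#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include <cstdint>

// True when SE can prove the loop's backedge executes fewer than Limit times;
// non-constant or unknown bounds conservatively return false.
static bool backedgeTakenLessThan(llvm::ScalarEvolution &SE,
                                  const llvm::Loop *L, uint64_t Limit) {
  const llvm::SCEV *MaxBE = SE.getConstantMaxBackedgeTakenCount(L);
  if (const auto *C = llvm::dyn_cast<llvm::SCEVConstant>(MaxBE))
    return C->getAPInt().ult(Limit);
  return false; // SCEVCouldNotCompute or otherwise non-constant
}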
-const SCEV *ScalarEvolution::getMaxBackedgeTakenCount(const Loop *L) { +const SCEV *ScalarEvolution::getConstantMaxBackedgeTakenCount(const Loop *L) { return getBackedgeTakenInfo(L).getMax(this); } @@ -9833,6 +9853,10 @@ Optional ScalarEvolution::computeConstantDifference(const SCEV *More, // We avoid subtracting expressions here because this function is usually // fairly deep in the call stack (i.e. is called many times). + // X - X = 0. + if (More == Less) + return APInt(getTypeSizeInBits(More->getType()), 0); + if (isa(Less) && isa(More)) { const auto *LAR = cast(Less); const auto *MAR = cast(More); @@ -10314,10 +10338,43 @@ bool ScalarEvolution::isImpliedViaOperations(ICmpInst::Predicate Pred, return false; } +static bool isKnownPredicateExtendIdiom(ICmpInst::Predicate Pred, + const SCEV *LHS, const SCEV *RHS) { + // zext x u<= sext x, sext x s<= zext x + switch (Pred) { + case ICmpInst::ICMP_SGE: + std::swap(LHS, RHS); + LLVM_FALLTHROUGH; + case ICmpInst::ICMP_SLE: { + // If operand >=s 0 then ZExt == SExt. If operand (LHS); + const SCEVZeroExtendExpr *ZExt = dyn_cast(RHS); + if (SExt && ZExt && SExt->getOperand() == ZExt->getOperand()) + return true; + break; + } + case ICmpInst::ICMP_UGE: + std::swap(LHS, RHS); + LLVM_FALLTHROUGH; + case ICmpInst::ICMP_ULE: { + // If operand >=s 0 then ZExt == SExt. If operand (LHS); + const SCEVSignExtendExpr *SExt = dyn_cast(RHS); + if (SExt && ZExt && SExt->getOperand() == ZExt->getOperand()) + return true; + break; + } + default: + break; + }; + return false; +} + bool ScalarEvolution::isKnownViaNonRecursiveReasoning(ICmpInst::Predicate Pred, const SCEV *LHS, const SCEV *RHS) { - return isKnownPredicateViaConstantRanges(Pred, LHS, RHS) || + return isKnownPredicateExtendIdiom(Pred, LHS, RHS) || + isKnownPredicateViaConstantRanges(Pred, LHS, RHS) || IsKnownPredicateViaMinOrMax(*this, Pred, LHS, RHS) || IsKnownPredicateViaAddRecStart(*this, Pred, LHS, RHS) || isKnownPredicateViaNoOverflow(Pred, LHS, RHS); @@ -11434,8 +11491,8 @@ static void PrintLoopInfo(raw_ostream &OS, ScalarEvolution *SE, L->getHeader()->printAsOperand(OS, /*PrintType=*/false); OS << ": "; - if (!isa(SE->getMaxBackedgeTakenCount(L))) { - OS << "max backedge-taken count is " << *SE->getMaxBackedgeTakenCount(L); + if (!isa(SE->getConstantMaxBackedgeTakenCount(L))) { + OS << "max backedge-taken count is " << *SE->getConstantMaxBackedgeTakenCount(L); if (SE->isBackedgeTakenCountMaxOrZero(L)) OS << ", actual taken count either this or zero."; } else { @@ -11901,14 +11958,14 @@ void ScalarEvolution::verify() const { SE.getTypeSizeInBits(NewBECount->getType())) CurBECount = SE2.getZeroExtendExpr(CurBECount, NewBECount->getType()); - auto *ConstantDelta = - dyn_cast(SE2.getMinusSCEV(CurBECount, NewBECount)); + const SCEV *Delta = SE2.getMinusSCEV(CurBECount, NewBECount); - if (ConstantDelta && ConstantDelta->getAPInt() != 0) { - dbgs() << "Trip Count Changed!\n"; + // Unless VerifySCEVStrict is set, we only compare constant deltas. 
+ if ((VerifySCEVStrict || isa(Delta)) && !Delta->isZero()) { + dbgs() << "Trip Count for " << *L << " Changed!\n"; dbgs() << "Old: " << *CurBECount << "\n"; dbgs() << "New: " << *NewBECount << "\n"; - dbgs() << "Delta: " << *ConstantDelta << "\n"; + dbgs() << "Delta: " << *Delta << "\n"; std::abort(); } } @@ -11959,7 +12016,7 @@ ScalarEvolutionWrapperPass::ScalarEvolutionWrapperPass() : FunctionPass(ID) { bool ScalarEvolutionWrapperPass::runOnFunction(Function &F) { SE.reset(new ScalarEvolution( - F, getAnalysis().getTLI(), + F, getAnalysis().getTLI(F), getAnalysis().getAssumptionCache(F), getAnalysis().getDomTree(), getAnalysis().getLoopInfo())); diff --git a/lib/Analysis/ScalarEvolutionExpander.cpp b/lib/Analysis/ScalarEvolutionExpander.cpp index e8a95d35482c..bceec921188e 100644 --- a/lib/Analysis/ScalarEvolutionExpander.cpp +++ b/lib/Analysis/ScalarEvolutionExpander.cpp @@ -240,9 +240,6 @@ Value *SCEVExpander::InsertBinop(Instruction::BinaryOps Opcode, /// division. If so, update S with Factor divided out and return true. /// S need not be evenly divisible if a reasonable remainder can be /// computed. -/// TODO: When ScalarEvolution gets a SCEVSDivExpr, this can be made -/// unnecessary; in its place, just signed-divide Ops[i] by the scale and -/// check to see if the divide was folded. static bool FactorOutConstant(const SCEV *&S, const SCEV *&Remainder, const SCEV *Factor, ScalarEvolution &SE, const DataLayout &DL) { @@ -1486,7 +1483,18 @@ Value *SCEVExpander::expandAddRecExprLiterally(const SCEVAddRecExpr *S) { } Value *SCEVExpander::visitAddRecExpr(const SCEVAddRecExpr *S) { - if (!CanonicalMode) return expandAddRecExprLiterally(S); + // In canonical mode we compute the addrec as an expression of a canonical IV + // using evaluateAtIteration and expand the resulting SCEV expression. This + // way we avoid introducing new IVs to carry on the comutation of the addrec + // throughout the loop. + // + // For nested addrecs evaluateAtIteration might need a canonical IV of a + // type wider than the addrec itself. Emitting a canonical IV of the + // proper type might produce non-legal types, for example expanding an i64 + // {0,+,2,+,1} addrec would need an i65 canonical IV. To avoid this just fall + // back to non-canonical mode for nested addrecs. + if (!CanonicalMode || (S->getNumOperands() > 2)) + return expandAddRecExprLiterally(S); Type *Ty = SE.getEffectiveSCEVType(S->getType()); const Loop *L = S->getLoop(); @@ -2094,11 +2102,10 @@ SCEVExpander::getRelatedExistingExpansion(const SCEV *S, const Instruction *At, for (BasicBlock *BB : ExitingBlocks) { ICmpInst::Predicate Pred; Instruction *LHS, *RHS; - BasicBlock *TrueBB, *FalseBB; if (!match(BB->getTerminator(), m_Br(m_ICmp(Pred, m_Instruction(LHS), m_Instruction(RHS)), - TrueBB, FalseBB))) + m_BasicBlock(), m_BasicBlock()))) continue; if (SE.getSCEV(LHS) == S && SE.DT.dominates(LHS, At)) diff --git a/lib/Analysis/StackSafetyAnalysis.cpp b/lib/Analysis/StackSafetyAnalysis.cpp index 4cf235db86eb..1b3638698950 100644 --- a/lib/Analysis/StackSafetyAnalysis.cpp +++ b/lib/Analysis/StackSafetyAnalysis.cpp @@ -333,8 +333,8 @@ bool StackSafetyLocalAnalysis::analyzeAllUses(const Value *Ptr, UseInfo &US) { // FIXME: consult devirt? // Do not follow aliases, otherwise we could inadvertently follow // dso_preemptable aliases or aliases with interposable linkage. 
- const GlobalValue *Callee = dyn_cast( - CS.getCalledValue()->stripPointerCastsNoFollowAliases()); + const GlobalValue *Callee = + dyn_cast(CS.getCalledValue()->stripPointerCasts()); if (!Callee) { US.updateRange(UnknownRange); return false; diff --git a/lib/Analysis/SyncDependenceAnalysis.cpp b/lib/Analysis/SyncDependenceAnalysis.cpp index 3cf248a31142..8447dc87069d 100644 --- a/lib/Analysis/SyncDependenceAnalysis.cpp +++ b/lib/Analysis/SyncDependenceAnalysis.cpp @@ -218,9 +218,11 @@ struct DivergencePropagator { template std::unique_ptr computeJoinPoints(const BasicBlock &RootBlock, - SuccessorIterable NodeSuccessors, const Loop *ParentLoop, const BasicBlock * PdBoundBlock) { + SuccessorIterable NodeSuccessors, const Loop *ParentLoop) { assert(JoinBlocks); + LLVM_DEBUG(dbgs() << "SDA:computeJoinPoints. Parent loop: " << (ParentLoop ? ParentLoop->getName() : "") << "\n" ); + // bootstrap with branch targets for (const auto *SuccBlock : NodeSuccessors) { DefMap.emplace(SuccBlock, SuccBlock); @@ -228,13 +230,19 @@ struct DivergencePropagator { if (ParentLoop && !ParentLoop->contains(SuccBlock)) { // immediate loop exit from node. ReachedLoopExits.insert(SuccBlock); - continue; } else { // regular successor PendingUpdates.insert(SuccBlock); } } + LLVM_DEBUG( + dbgs() << "SDA: rpo order:\n"; + for (const auto * RpoBlock : FuncRPOT) { + dbgs() << "- " << RpoBlock->getName() << "\n"; + } + ); + auto ItBeginRPO = FuncRPOT.begin(); // skip until term (TODO RPOT won't let us start at @term directly) @@ -245,16 +253,18 @@ struct DivergencePropagator { // propagate definitions at the immediate successors of the node in RPO auto ItBlockRPO = ItBeginRPO; - while (++ItBlockRPO != ItEndRPO && *ItBlockRPO != PdBoundBlock) { + while ((++ItBlockRPO != ItEndRPO) && + !PendingUpdates.empty()) { const auto *Block = *ItBlockRPO; + LLVM_DEBUG(dbgs() << "SDA::joins. visiting " << Block->getName() << "\n"); - // skip @block if not pending update + // skip Block if not pending update auto ItPending = PendingUpdates.find(Block); if (ItPending == PendingUpdates.end()) continue; PendingUpdates.erase(ItPending); - // propagate definition at @block to its successors + // propagate definition at Block to its successors auto ItDef = DefMap.find(Block); const auto *DefBlock = ItDef->second; assert(DefBlock); @@ -278,6 +288,8 @@ struct DivergencePropagator { } } + LLVM_DEBUG(dbgs() << "SDA::joins. After propagation:\n"; printDefs(dbgs())); + // We need to know the definition at the parent loop header to decide // whether the definition at the header is different from the definition at // the loop exits, which would indicate a divergent loop exits. @@ -292,24 +304,17 @@ struct DivergencePropagator { // | // proper exit from both loops // - // D post-dominates B as it is the only proper exit from the "A loop". - // If C has a divergent branch, propagation will therefore stop at D. - // That implies that B will never receive a definition. - // But that definition can only be the same as at D (D itself in thise case) - // because all paths to anywhere have to pass through D. - // - const BasicBlock *ParentLoopHeader = - ParentLoop ? ParentLoop->getHeader() : nullptr; - if (ParentLoop && ParentLoop->contains(PdBoundBlock)) { - DefMap[ParentLoopHeader] = DefMap[PdBoundBlock]; - } - // analyze reached loop exits if (!ReachedLoopExits.empty()) { + const BasicBlock *ParentLoopHeader = + ParentLoop ? 
ParentLoop->getHeader() : nullptr; + assert(ParentLoop); - const auto *HeaderDefBlock = DefMap[ParentLoopHeader]; + auto ItHeaderDef = DefMap.find(ParentLoopHeader); + const auto *HeaderDefBlock = (ItHeaderDef == DefMap.end()) ? nullptr : ItHeaderDef->second; + LLVM_DEBUG(printDefs(dbgs())); - assert(HeaderDefBlock && "no definition in header of carrying loop"); + assert(HeaderDefBlock && "no definition at header of carrying loop"); for (const auto *ExitBlock : ReachedLoopExits) { auto ItExitDef = DefMap.find(ExitBlock); @@ -339,19 +344,10 @@ const ConstBlockSet &SyncDependenceAnalysis::join_blocks(const Loop &Loop) { return *ItCached->second; } - // dont propagte beyond the immediate post dom of the loop - const auto *PdNode = PDT.getNode(const_cast(Loop.getHeader())); - const auto *IpdNode = PdNode->getIDom(); - const auto *PdBoundBlock = IpdNode ? IpdNode->getBlock() : nullptr; - while (PdBoundBlock && Loop.contains(PdBoundBlock)) { - IpdNode = IpdNode->getIDom(); - PdBoundBlock = IpdNode ? IpdNode->getBlock() : nullptr; - } - // compute all join points DivergencePropagator Propagator{FuncRPOT, DT, PDT, LI}; auto JoinBlocks = Propagator.computeJoinPoints( - *Loop.getHeader(), LoopExits, Loop.getParentLoop(), PdBoundBlock); + *Loop.getHeader(), LoopExits, Loop.getParentLoop()); auto ItInserted = CachedLoopExitJoins.emplace(&Loop, std::move(JoinBlocks)); assert(ItInserted.second); @@ -370,16 +366,11 @@ SyncDependenceAnalysis::join_blocks(const Instruction &Term) { if (ItCached != CachedBranchJoins.end()) return *ItCached->second; - // dont propagate beyond the immediate post dominator of the branch - const auto *PdNode = PDT.getNode(const_cast(Term.getParent())); - const auto *IpdNode = PdNode->getIDom(); - const auto *PdBoundBlock = IpdNode ? IpdNode->getBlock() : nullptr; - // compute all join points DivergencePropagator Propagator{FuncRPOT, DT, PDT, LI}; const auto &TermBlock = *Term.getParent(); auto JoinBlocks = Propagator.computeJoinPoints( - TermBlock, successors(Term.getParent()), LI.getLoopFor(&TermBlock), PdBoundBlock); + TermBlock, successors(Term.getParent()), LI.getLoopFor(&TermBlock)); auto ItInserted = CachedBranchJoins.emplace(&Term, std::move(JoinBlocks)); assert(ItInserted.second); diff --git a/lib/Analysis/TargetLibraryInfo.cpp b/lib/Analysis/TargetLibraryInfo.cpp index ef139d3257d2..230969698054 100644 --- a/lib/Analysis/TargetLibraryInfo.cpp +++ b/lib/Analysis/TargetLibraryInfo.cpp @@ -28,7 +28,8 @@ static cl::opt ClVectorLibrary( clEnumValN(TargetLibraryInfoImpl::SVML, "SVML", "Intel SVML library"))); -StringRef const TargetLibraryInfoImpl::StandardNames[LibFunc::NumLibFuncs] = { +StringLiteral const TargetLibraryInfoImpl::StandardNames[LibFunc::NumLibFuncs] = + { #define TLI_DEFINE_STRING #include "llvm/Analysis/TargetLibraryInfo.def" }; @@ -58,14 +59,14 @@ static bool hasBcmp(const Triple &TT) { return TT.isGNUEnvironment() || TT.isMusl(); // Both NetBSD and OpenBSD are planning to remove the function. Windows does // not have it. - return TT.isOSFreeBSD() || TT.isOSSolaris() || TT.isOSDarwin(); + return TT.isOSFreeBSD() || TT.isOSSolaris(); } /// Initialize the set of available library functions based on the specified /// target triple. This should be carefully written so that a missing target /// triple gets a sane set of defaults. static void initialize(TargetLibraryInfoImpl &TLI, const Triple &T, - ArrayRef StandardNames) { + ArrayRef StandardNames) { // Verify that the StandardNames array is in alphabetical order. 
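The alphabetical order checked by the assert just below is what lets the rewritten getLibFunc() further down in this file drop its hand-written comparator and use a plain std::lower_bound over the StringLiteral table. Here is a standalone sketch of that lookup pattern; the three names and the helper are placeholders, not the real TLI table:

#include "llvm/ADT/StringRef.h"
#include <algorithm>
#include <iterator>

static const llvm::StringLiteral DemoNames[] = {"cos", "memcpy", "sin"};

// Binary-search a sorted name table; in the real getLibFunc() the resulting
// index is cast to the corresponding LibFunc enumerator.
static bool demoLookup(llvm::StringRef Name, unsigned &Index) {
  const auto *Start = std::begin(DemoNames);
  const auto *End = std::end(DemoNames);
  const auto *I = std::lower_bound(Start, End, Name);
  if (I != End && *I == Name) {
    Index = unsigned(I - Start);
    return true;
  }
  return false;
}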
assert(std::is_sorted(StandardNames.begin(), StandardNames.end(), [](StringRef LHS, StringRef RHS) { @@ -104,19 +105,10 @@ static void initialize(TargetLibraryInfoImpl &TLI, const Triple &T, TLI.setShouldSignExtI32Param(ShouldSignExtI32Param); if (T.getArch() == Triple::r600 || - T.getArch() == Triple::amdgcn) { - TLI.setUnavailable(LibFunc_ldexp); - TLI.setUnavailable(LibFunc_ldexpf); - TLI.setUnavailable(LibFunc_ldexpl); - TLI.setUnavailable(LibFunc_exp10); - TLI.setUnavailable(LibFunc_exp10f); - TLI.setUnavailable(LibFunc_exp10l); - TLI.setUnavailable(LibFunc_log10); - TLI.setUnavailable(LibFunc_log10f); - TLI.setUnavailable(LibFunc_log10l); - } + T.getArch() == Triple::amdgcn) + TLI.disableAllFunctions(); - // There are no library implementations of mempcy and memset for AMD gpus and + // There are no library implementations of memcpy and memset for AMD gpus and // these can be difficult to lower in the backend. if (T.getArch() == Triple::r600 || T.getArch() == Triple::amdgcn) { @@ -623,19 +615,14 @@ static StringRef sanitizeFunctionName(StringRef funcName) { return GlobalValue::dropLLVMManglingEscape(funcName); } -bool TargetLibraryInfoImpl::getLibFunc(StringRef funcName, - LibFunc &F) const { - StringRef const *Start = &StandardNames[0]; - StringRef const *End = &StandardNames[NumLibFuncs]; - +bool TargetLibraryInfoImpl::getLibFunc(StringRef funcName, LibFunc &F) const { funcName = sanitizeFunctionName(funcName); if (funcName.empty()) return false; - StringRef const *I = std::lower_bound( - Start, End, funcName, [](StringRef LHS, StringRef RHS) { - return LHS < RHS; - }); + const auto *Start = std::begin(StandardNames); + const auto *End = std::end(StandardNames); + const auto *I = std::lower_bound(Start, End, funcName); if (I != End && *I == funcName) { F = (LibFunc)(I - Start); return true; @@ -1481,6 +1468,7 @@ bool TargetLibraryInfoImpl::isValidProtoForLibFunc(const FunctionType &FTy, return false; } case LibFunc::NumLibFuncs: + case LibFunc::NotLibFunc: break; } @@ -1599,14 +1587,6 @@ StringRef TargetLibraryInfoImpl::getScalarizedFunction(StringRef F, return I->ScalarFnName; } -TargetLibraryInfo TargetLibraryAnalysis::run(Module &M, - ModuleAnalysisManager &) { - if (PresetInfoImpl) - return TargetLibraryInfo(*PresetInfoImpl); - - return TargetLibraryInfo(lookupInfoImpl(Triple(M.getTargetTriple()))); -} - TargetLibraryInfo TargetLibraryAnalysis::run(Function &F, FunctionAnalysisManager &) { if (PresetInfoImpl) diff --git a/lib/Analysis/TargetTransformInfo.cpp b/lib/Analysis/TargetTransformInfo.cpp index eb04c34453fb..c9c294873ea6 100644 --- a/lib/Analysis/TargetTransformInfo.cpp +++ b/lib/Analysis/TargetTransformInfo.cpp @@ -9,6 +9,7 @@ #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/TargetTransformInfoImpl.h" #include "llvm/IR/CallSite.h" +#include "llvm/IR/CFG.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" @@ -59,11 +60,7 @@ bool HardwareLoopInfo::isHardwareLoopCandidate(ScalarEvolution &SE, SmallVector ExitingBlocks; L->getExitingBlocks(ExitingBlocks); - for (SmallVectorImpl::iterator I = ExitingBlocks.begin(), - IE = ExitingBlocks.end(); - I != IE; ++I) { - BasicBlock *BB = *I; - + for (BasicBlock *BB : ExitingBlocks) { // If we pass the updated counter back through a phi, we need to know // which latch the updated value will be coming from. 
if (!L->isLoopLatch(BB)) { @@ -97,13 +94,11 @@ bool HardwareLoopInfo::isHardwareLoopCandidate(ScalarEvolution &SE, // For this to be true, we must dominate all blocks with backedges. Such // blocks are in-loop predecessors to the header block. bool NotAlways = false; - for (pred_iterator PI = pred_begin(L->getHeader()), - PIE = pred_end(L->getHeader()); - PI != PIE; ++PI) { - if (!L->contains(*PI)) + for (BasicBlock *Pred : predecessors(L->getHeader())) { + if (!L->contains(Pred)) continue; - if (!DT.dominates(*I, *PI)) { + if (!DT.dominates(BB, Pred)) { NotAlways = true; break; } @@ -127,7 +122,7 @@ bool HardwareLoopInfo::isHardwareLoopCandidate(ScalarEvolution &SE, // Note that this block may not be the loop latch block, even if the loop // has a latch block. - ExitBlock = *I; + ExitBlock = BB; ExitCount = EC; break; } @@ -227,6 +222,16 @@ unsigned TargetTransformInfo::getFlatAddressSpace() const { return TTIImpl->getFlatAddressSpace(); } +bool TargetTransformInfo::collectFlatAddressOperands( + SmallVectorImpl &OpIndexes, Intrinsic::ID IID) const { + return TTIImpl->collectFlatAddressOperands(OpIndexes, IID); +} + +bool TargetTransformInfo::rewriteIntrinsicWithAddressSpace( + IntrinsicInst *II, Value *OldV, Value *NewV) const { + return TTIImpl->rewriteIntrinsicWithAddressSpace(II, OldV, NewV); +} + bool TargetTransformInfo::isLoweredToCall(const Function *F) const { return TTIImpl->isLoweredToCall(F); } @@ -283,21 +288,22 @@ bool TargetTransformInfo::shouldFavorBackedgeIndex(const Loop *L) const { return TTIImpl->shouldFavorBackedgeIndex(L); } -bool TargetTransformInfo::isLegalMaskedStore(Type *DataType) const { - return TTIImpl->isLegalMaskedStore(DataType); +bool TargetTransformInfo::isLegalMaskedStore(Type *DataType, + MaybeAlign Alignment) const { + return TTIImpl->isLegalMaskedStore(DataType, Alignment); } -bool TargetTransformInfo::isLegalMaskedLoad(Type *DataType) const { - return TTIImpl->isLegalMaskedLoad(DataType); +bool TargetTransformInfo::isLegalMaskedLoad(Type *DataType, + MaybeAlign Alignment) const { + return TTIImpl->isLegalMaskedLoad(DataType, Alignment); } bool TargetTransformInfo::isLegalNTStore(Type *DataType, - unsigned Alignment) const { + Align Alignment) const { return TTIImpl->isLegalNTStore(DataType, Alignment); } -bool TargetTransformInfo::isLegalNTLoad(Type *DataType, - unsigned Alignment) const { +bool TargetTransformInfo::isLegalNTLoad(Type *DataType, Align Alignment) const { return TTIImpl->isLegalNTLoad(DataType, Alignment); } @@ -359,14 +365,6 @@ bool TargetTransformInfo::isTypeLegal(Type *Ty) const { return TTIImpl->isTypeLegal(Ty); } -unsigned TargetTransformInfo::getJumpBufAlignment() const { - return TTIImpl->getJumpBufAlignment(); -} - -unsigned TargetTransformInfo::getJumpBufSize() const { - return TTIImpl->getJumpBufSize(); -} - bool TargetTransformInfo::shouldBuildLookupTables() const { return TTIImpl->shouldBuildLookupTables(); } @@ -470,8 +468,16 @@ int TargetTransformInfo::getIntImmCost(Intrinsic::ID IID, unsigned Idx, return Cost; } -unsigned TargetTransformInfo::getNumberOfRegisters(bool Vector) const { - return TTIImpl->getNumberOfRegisters(Vector); +unsigned TargetTransformInfo::getNumberOfRegisters(unsigned ClassID) const { + return TTIImpl->getNumberOfRegisters(ClassID); +} + +unsigned TargetTransformInfo::getRegisterClassForType(bool Vector, Type *Ty) const { + return TTIImpl->getRegisterClassForType(Vector, Ty); +} + +const char* TargetTransformInfo::getRegisterClassName(unsigned ClassID) const { + return 
TTIImpl->getRegisterClassName(ClassID); } unsigned TargetTransformInfo::getRegisterBitWidth(bool Vector) const { @@ -1276,6 +1282,8 @@ int TargetTransformInfo::getInstructionThroughput(const Instruction *I) const { return getVectorInstrCost(I->getOpcode(), IE->getType(), Idx); } + case Instruction::ExtractValue: + return 0; // Model all ExtractValue nodes as free. case Instruction::ShuffleVector: { const ShuffleVectorInst *Shuffle = cast(I); Type *Ty = Shuffle->getType(); diff --git a/lib/Analysis/TypeMetadataUtils.cpp b/lib/Analysis/TypeMetadataUtils.cpp index 9311dfbc6eba..072d291f3f93 100644 --- a/lib/Analysis/TypeMetadataUtils.cpp +++ b/lib/Analysis/TypeMetadataUtils.cpp @@ -127,3 +127,35 @@ void llvm::findDevirtualizableCallsForTypeCheckedLoad( findCallsAtConstantOffset(DevirtCalls, &HasNonCallUses, LoadedPtr, Offset->getZExtValue(), CI, DT); } + +Constant *llvm::getPointerAtOffset(Constant *I, uint64_t Offset, Module &M) { + if (I->getType()->isPointerTy()) { + if (Offset == 0) + return I; + return nullptr; + } + + const DataLayout &DL = M.getDataLayout(); + + if (auto *C = dyn_cast(I)) { + const StructLayout *SL = DL.getStructLayout(C->getType()); + if (Offset >= SL->getSizeInBytes()) + return nullptr; + + unsigned Op = SL->getElementContainingOffset(Offset); + return getPointerAtOffset(cast(I->getOperand(Op)), + Offset - SL->getElementOffset(Op), M); + } + if (auto *C = dyn_cast(I)) { + ArrayType *VTableTy = C->getType(); + uint64_t ElemSize = DL.getTypeAllocSize(VTableTy->getElementType()); + + unsigned Op = Offset / ElemSize; + if (Op >= C->getNumOperands()) + return nullptr; + + return getPointerAtOffset(cast(I->getOperand(Op)), + Offset % ElemSize, M); + } + return nullptr; +} diff --git a/lib/Analysis/VFABIDemangling.cpp b/lib/Analysis/VFABIDemangling.cpp new file mode 100644 index 000000000000..6fd8ae63f5f0 --- /dev/null +++ b/lib/Analysis/VFABIDemangling.cpp @@ -0,0 +1,418 @@ +//===- VFABIDemangling.cpp - Vector Function ABI demangling utilities. ---===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/Analysis/VectorUtils.h" + +using namespace llvm; + +namespace { +/// Utilities for the Vector Function ABI name parser. + +/// Return types for the parser functions. +enum class ParseRet { + OK, // Found. + None, // Not found. + Error // Syntax error. +}; + +/// Extracts the `` information from the mangled string, and +/// sets the `ISA` accordingly. +ParseRet tryParseISA(StringRef &MangledName, VFISAKind &ISA) { + if (MangledName.empty()) + return ParseRet::Error; + + ISA = StringSwitch(MangledName.take_front(1)) + .Case("n", VFISAKind::AdvancedSIMD) + .Case("s", VFISAKind::SVE) + .Case("b", VFISAKind::SSE) + .Case("c", VFISAKind::AVX) + .Case("d", VFISAKind::AVX2) + .Case("e", VFISAKind::AVX512) + .Default(VFISAKind::Unknown); + + MangledName = MangledName.drop_front(1); + + return ParseRet::OK; +} + +/// Extracts the `` information from the mangled string, and +/// sets `IsMasked` accordingly. The input string `MangledName` is +/// left unmodified. 
+ParseRet tryParseMask(StringRef &MangledName, bool &IsMasked) { + if (MangledName.consume_front("M")) { + IsMasked = true; + return ParseRet::OK; + } + + if (MangledName.consume_front("N")) { + IsMasked = false; + return ParseRet::OK; + } + + return ParseRet::Error; +} + +/// Extract the `` information from the mangled string, and +/// sets `VF` accordingly. A ` == "x"` token is interpreted as a scalable +/// vector length. On success, the `` token is removed from +/// the input string `ParseString`. +/// +ParseRet tryParseVLEN(StringRef &ParseString, unsigned &VF, bool &IsScalable) { + if (ParseString.consume_front("x")) { + VF = 0; + IsScalable = true; + return ParseRet::OK; + } + + if (ParseString.consumeInteger(10, VF)) + return ParseRet::Error; + + IsScalable = false; + return ParseRet::OK; +} + +/// The function looks for the following strings at the beginning of +/// the input string `ParseString`: +/// +/// +/// +/// On success, it removes the parsed parameter from `ParseString`, +/// sets `PKind` to the correspondent enum value, sets `Pos` to +/// , and return success. On a syntax error, it return a +/// parsing error. If nothing is parsed, it returns None. +/// +/// The function expects to be one of "ls", "Rs", "Us" or +/// "Ls". +ParseRet tryParseLinearTokenWithRuntimeStep(StringRef &ParseString, + VFParamKind &PKind, int &Pos, + const StringRef Token) { + if (ParseString.consume_front(Token)) { + PKind = VFABI::getVFParamKindFromString(Token); + if (ParseString.consumeInteger(10, Pos)) + return ParseRet::Error; + return ParseRet::OK; + } + + return ParseRet::None; +} + +/// The function looks for the following stringt at the beginning of +/// the input string `ParseString`: +/// +/// +/// +/// is one of "ls", "Rs", "Us" or "Ls". +/// +/// On success, it removes the parsed parameter from `ParseString`, +/// sets `PKind` to the correspondent enum value, sets `StepOrPos` to +/// , and return success. On a syntax error, it return a +/// parsing error. If nothing is parsed, it returns None. +ParseRet tryParseLinearWithRuntimeStep(StringRef &ParseString, + VFParamKind &PKind, int &StepOrPos) { + ParseRet Ret; + + // "ls" + Ret = tryParseLinearTokenWithRuntimeStep(ParseString, PKind, StepOrPos, "ls"); + if (Ret != ParseRet::None) + return Ret; + + // "Rs" + Ret = tryParseLinearTokenWithRuntimeStep(ParseString, PKind, StepOrPos, "Rs"); + if (Ret != ParseRet::None) + return Ret; + + // "Ls" + Ret = tryParseLinearTokenWithRuntimeStep(ParseString, PKind, StepOrPos, "Ls"); + if (Ret != ParseRet::None) + return Ret; + + // "Us" + Ret = tryParseLinearTokenWithRuntimeStep(ParseString, PKind, StepOrPos, "Us"); + if (Ret != ParseRet::None) + return Ret; + + return ParseRet::None; +} + +/// The function looks for the following strings at the beginning of +/// the input string `ParseString`: +/// +/// {"n"} +/// +/// On success, it removes the parsed parameter from `ParseString`, +/// sets `PKind` to the correspondent enum value, sets `LinearStep` to +/// , and return success. On a syntax error, it return a +/// parsing error. If nothing is parsed, it returns None. +/// +/// The function expects to be one of "l", "R", "U" or +/// "L". 
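These tryParse* token helpers feed VFABI::tryDemangleForVFABI, defined near the end of this new file. As a hedged illustration of the overall scheme (not part of the patch): the mangled name "_ZGVnN2v_foo" encodes the AdvancedSIMD ISA ('n'), an unmasked variant ('N'), two lanes ('2'), one vector parameter ('v'), and the scalar function "foo". The field names below (ScalarName, VectorName, Shape.VF) are assumed to follow the VFInfo/VFShape structs this import adds to VectorUtils.h.

#include "llvm/Analysis/VectorUtils.h"
#include "llvm/Support/raw_ostream.h"

// Decode a sample Vector Function ABI name and print what was recovered.
static void demoDemangle() {
  llvm::Optional<llvm::VFInfo> Info =
      llvm::VFABI::tryDemangleForVFABI("_ZGVnN2v_foo");
  if (!Info)
    return;
  llvm::errs() << "scalar: " << Info->ScalarName        // "foo"
               << " vector: " << Info->VectorName       // "_ZGVnN2v_foo"
               << " lanes: " << Info->Shape.VF << "\n"; // 2
}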
+ParseRet tryParseCompileTimeLinearToken(StringRef &ParseString, + VFParamKind &PKind, int &LinearStep, + const StringRef Token) { + if (ParseString.consume_front(Token)) { + PKind = VFABI::getVFParamKindFromString(Token); + const bool Negate = ParseString.consume_front("n"); + if (ParseString.consumeInteger(10, LinearStep)) + LinearStep = 1; + if (Negate) + LinearStep *= -1; + return ParseRet::OK; + } + + return ParseRet::None; +} + +/// The function looks for the following strings at the beginning of +/// the input string `ParseString`: +/// +/// ["l" | "R" | "U" | "L"] {"n"} +/// +/// On success, it removes the parsed parameter from `ParseString`, +/// sets `PKind` to the correspondent enum value, sets `LinearStep` to +/// , and return success. On a syntax error, it return a +/// parsing error. If nothing is parsed, it returns None. +ParseRet tryParseLinearWithCompileTimeStep(StringRef &ParseString, + VFParamKind &PKind, int &StepOrPos) { + // "l" {"n"} + if (tryParseCompileTimeLinearToken(ParseString, PKind, StepOrPos, "l") == + ParseRet::OK) + return ParseRet::OK; + + // "R" {"n"} + if (tryParseCompileTimeLinearToken(ParseString, PKind, StepOrPos, "R") == + ParseRet::OK) + return ParseRet::OK; + + // "L" {"n"} + if (tryParseCompileTimeLinearToken(ParseString, PKind, StepOrPos, "L") == + ParseRet::OK) + return ParseRet::OK; + + // "U" {"n"} + if (tryParseCompileTimeLinearToken(ParseString, PKind, StepOrPos, "U") == + ParseRet::OK) + return ParseRet::OK; + + return ParseRet::None; +} + +/// The function looks for the following strings at the beginning of +/// the input string `ParseString`: +/// +/// "u" +/// +/// On success, it removes the parsed parameter from `ParseString`, +/// sets `PKind` to the correspondent enum value, sets `Pos` to +/// , and return success. On a syntax error, it return a +/// parsing error. If nothing is parsed, it returns None. +ParseRet tryParseUniform(StringRef &ParseString, VFParamKind &PKind, int &Pos) { + // "u" + const char *UniformToken = "u"; + if (ParseString.consume_front(UniformToken)) { + PKind = VFABI::getVFParamKindFromString(UniformToken); + if (ParseString.consumeInteger(10, Pos)) + return ParseRet::Error; + + return ParseRet::OK; + } + return ParseRet::None; +} + +/// Looks into the part of the mangled name in search +/// for valid paramaters at the beginning of the string +/// `ParseString`. +/// +/// On success, it removes the parsed parameter from `ParseString`, +/// sets `PKind` to the correspondent enum value, sets `StepOrPos` +/// accordingly, and return success. On a syntax error, it return a +/// parsing error. If nothing is parsed, it returns None. +ParseRet tryParseParameter(StringRef &ParseString, VFParamKind &PKind, + int &StepOrPos) { + if (ParseString.consume_front("v")) { + PKind = VFParamKind::Vector; + StepOrPos = 0; + return ParseRet::OK; + } + + const ParseRet HasLinearRuntime = + tryParseLinearWithRuntimeStep(ParseString, PKind, StepOrPos); + if (HasLinearRuntime != ParseRet::None) + return HasLinearRuntime; + + const ParseRet HasLinearCompileTime = + tryParseLinearWithCompileTimeStep(ParseString, PKind, StepOrPos); + if (HasLinearCompileTime != ParseRet::None) + return HasLinearCompileTime; + + const ParseRet HasUniform = tryParseUniform(ParseString, PKind, StepOrPos); + if (HasUniform != ParseRet::None) + return HasUniform; + + return ParseRet::None; +} + +/// Looks into the part of the mangled name in search +/// of a valid 'aligned' clause. 
The function should be invoked +/// after parsing a parameter via `tryParseParameter`. +/// +/// On success, it removes the parsed parameter from `ParseString`, +/// sets `PKind` to the correspondent enum value, sets `StepOrPos` +/// accordingly, and return success. On a syntax error, it return a +/// parsing error. If nothing is parsed, it returns None. +ParseRet tryParseAlign(StringRef &ParseString, Align &Alignment) { + uint64_t Val; + // "a" + if (ParseString.consume_front("a")) { + if (ParseString.consumeInteger(10, Val)) + return ParseRet::Error; + + if (!isPowerOf2_64(Val)) + return ParseRet::Error; + + Alignment = Align(Val); + + return ParseRet::OK; + } + + return ParseRet::None; +} +} // namespace + +// Format of the ABI name: +// _ZGV_[()] +Optional VFABI::tryDemangleForVFABI(StringRef MangledName) { + // Assume there is no custom name , and therefore the + // vector name consists of + // _ZGV_. + StringRef VectorName = MangledName; + + // Parse the fixed size part of the manled name + if (!MangledName.consume_front("_ZGV")) + return None; + + // Extract ISA. An unknow ISA is also supported, so we accept all + // values. + VFISAKind ISA; + if (tryParseISA(MangledName, ISA) != ParseRet::OK) + return None; + + // Extract . + bool IsMasked; + if (tryParseMask(MangledName, IsMasked) != ParseRet::OK) + return None; + + // Parse the variable size, starting from . + unsigned VF; + bool IsScalable; + if (tryParseVLEN(MangledName, VF, IsScalable) != ParseRet::OK) + return None; + + // Parse the . + ParseRet ParamFound; + SmallVector Parameters; + do { + const unsigned ParameterPos = Parameters.size(); + VFParamKind PKind; + int StepOrPos; + ParamFound = tryParseParameter(MangledName, PKind, StepOrPos); + + // Bail off if there is a parsing error in the parsing of the parameter. + if (ParamFound == ParseRet::Error) + return None; + + if (ParamFound == ParseRet::OK) { + Align Alignment; + // Look for the alignment token "a ". + const ParseRet AlignFound = tryParseAlign(MangledName, Alignment); + // Bail off if there is a syntax error in the align token. + if (AlignFound == ParseRet::Error) + return None; + + // Add the parameter. + Parameters.push_back({ParameterPos, PKind, StepOrPos, Alignment}); + } + } while (ParamFound == ParseRet::OK); + + // A valid MangledName mus have at least one valid entry in the + // . + if (Parameters.empty()) + return None; + + // Check for the and the optional , which + // are separated from the prefix with "_" + if (!MangledName.consume_front("_")) + return None; + + // The rest of the string must be in the format: + // [()] + const StringRef ScalarName = + MangledName.take_while([](char In) { return In != '('; }); + + if (ScalarName.empty()) + return None; + + // Reduce MangledName to [()]. + MangledName = MangledName.ltrim(ScalarName); + // Find the optional custom name redirection. + if (MangledName.consume_front("(")) { + if (!MangledName.consume_back(")")) + return None; + // Update the vector variant with the one specified by the user. + VectorName = MangledName; + // If the vector name is missing, bail out. + if (VectorName.empty()) + return None; + } + + // When is "M", we need to add a parameter that is used as + // global predicate for the function. + if (IsMasked) { + const unsigned Pos = Parameters.size(); + Parameters.push_back({Pos, VFParamKind::GlobalPredicate}); + } + + // Asserts for parameters of type `VFParamKind::GlobalPredicate`, as + // prescribed by the Vector Function ABI specifications supported by + // this parser: + // 1. 
Uniqueness. + // 2. Must be the last in the parameter list. + const auto NGlobalPreds = std::count_if( + Parameters.begin(), Parameters.end(), [](const VFParameter PK) { + return PK.ParamKind == VFParamKind::GlobalPredicate; + }); + assert(NGlobalPreds < 2 && "Cannot have more than one global predicate."); + if (NGlobalPreds) + assert(Parameters.back().ParamKind == VFParamKind::GlobalPredicate && + "The global predicate must be the last parameter"); + + const VFShape Shape({VF, IsScalable, ISA, Parameters}); + return VFInfo({Shape, ScalarName, VectorName}); +} + +VFParamKind VFABI::getVFParamKindFromString(const StringRef Token) { + const VFParamKind ParamKind = StringSwitch(Token) + .Case("v", VFParamKind::Vector) + .Case("l", VFParamKind::OMP_Linear) + .Case("R", VFParamKind::OMP_LinearRef) + .Case("L", VFParamKind::OMP_LinearVal) + .Case("U", VFParamKind::OMP_LinearUVal) + .Case("ls", VFParamKind::OMP_LinearPos) + .Case("Ls", VFParamKind::OMP_LinearValPos) + .Case("Rs", VFParamKind::OMP_LinearRefPos) + .Case("Us", VFParamKind::OMP_LinearUValPos) + .Case("u", VFParamKind::OMP_Uniform) + .Default(VFParamKind::Unknown); + + if (ParamKind != VFParamKind::Unknown) + return ParamKind; + + // This function should never be invoked with an invalid input. + llvm_unreachable("This fuction should be invoken only on parameters" + " that have a textual representation in the mangled name" + " of the Vector Function ABI"); +} diff --git a/lib/Analysis/ValueTracking.cpp b/lib/Analysis/ValueTracking.cpp index c70906dcc629..bbf389991836 100644 --- a/lib/Analysis/ValueTracking.cpp +++ b/lib/Analysis/ValueTracking.cpp @@ -558,12 +558,18 @@ bool llvm::isValidAssumeForContext(const Instruction *Inv, return true; } + // Don't let an assume affect itself - this would cause the problems + // `isEphemeralValueOf` is trying to prevent, and it would also make + // the loop below go out of bounds. + if (Inv == CxtI) + return false; + // The context comes first, but they're both in the same block. Make sure // there is nothing in between that might interrupt the control flow. for (BasicBlock::const_iterator I = std::next(BasicBlock::const_iterator(CxtI)), IE(Inv); I != IE; ++I) - if (!isSafeToSpeculativelyExecute(&*I) && !isAssumeLikeIntrinsic(&*I)) + if (!isGuaranteedToTransferExecutionToSuccessor(&*I)) return false; return !isEphemeralValueOf(Inv, CxtI); @@ -1049,7 +1055,7 @@ static void computeKnownBitsFromOperator(const Operator *I, KnownBits &Known, break; } case Instruction::Select: { - const Value *LHS, *RHS; + const Value *LHS = nullptr, *RHS = nullptr; SelectPatternFlavor SPF = matchSelectPattern(I, LHS, RHS).Flavor; if (SelectPatternResult::isMinOrMax(SPF)) { computeKnownBits(RHS, Known, Depth + 1, Q); @@ -1095,7 +1101,8 @@ static void computeKnownBitsFromOperator(const Operator *I, KnownBits &Known, // RHS from matchSelectPattern returns the negation part of abs pattern. // If the negate has an NSW flag we can assume the sign bit of the result // will be 0 because that makes abs(INT_MIN) undefined. - if (Q.IIQ.hasNoSignedWrap(cast(RHS))) + if (match(RHS, m_Neg(m_Specific(LHS))) && + Q.IIQ.hasNoSignedWrap(cast(RHS))) MaxHighZeros = 1; } @@ -1366,7 +1373,7 @@ static void computeKnownBitsFromOperator(const Operator *I, KnownBits &Known, else if (LR == I) L = LL; else - break; + continue; // Check for recurrence with L and R flipped. // Ok, we have a PHI of the form L op= R. Check for low // zero bits. 
computeKnownBits(R, Known2, Depth + 1, Q); @@ -1714,9 +1721,9 @@ void computeKnownBits(const Value *V, KnownBits &Known, unsigned Depth, // Aligned pointers have trailing zeros - refine Known.Zero set if (V->getType()->isPointerTy()) { - unsigned Align = V->getPointerAlignment(Q.DL); + const MaybeAlign Align = V->getPointerAlignment(Q.DL); if (Align) - Known.Zero.setLowBits(countTrailingZeros(Align)); + Known.Zero.setLowBits(countTrailingZeros(Align->value())); } // computeKnownBitsFromAssume strictly refines Known. @@ -2066,7 +2073,7 @@ bool isKnownNonZero(const Value *V, unsigned Depth, const Query &Q) { if (const auto *Call = dyn_cast(V)) { if (Call->isReturnNonNull()) return true; - if (const auto *RP = getArgumentAliasingToReturnedPointer(Call)) + if (const auto *RP = getArgumentAliasingToReturnedPointer(Call, true)) return isKnownNonZero(RP, Depth, Q); } } @@ -2300,7 +2307,7 @@ static bool isSignedMinMaxClamp(const Value *Select, const Value *&In, cast(Select)->getOpcode() == Instruction::Select && "Input should be a Select!"); - const Value *LHS, *RHS, *LHS2, *RHS2; + const Value *LHS = nullptr, *RHS = nullptr; SelectPatternFlavor SPF = matchSelectPattern(Select, LHS, RHS).Flavor; if (SPF != SPF_SMAX && SPF != SPF_SMIN) return false; @@ -2308,6 +2315,7 @@ static bool isSignedMinMaxClamp(const Value *Select, const Value *&In, if (!match(RHS, m_APInt(CLow))) return false; + const Value *LHS2 = nullptr, *RHS2 = nullptr; SelectPatternFlavor SPF2 = matchSelectPattern(LHS, LHS2, RHS2).Flavor; if (getInverseMinMaxFlavor(SPF) != SPF2) return false; @@ -2384,253 +2392,256 @@ static unsigned ComputeNumSignBitsImpl(const Value *V, unsigned Depth, if (Depth == MaxDepth) return 1; // Limit search depth. - const Operator *U = dyn_cast(V); - switch (Operator::getOpcode(V)) { - default: break; - case Instruction::SExt: - Tmp = TyBits - U->getOperand(0)->getType()->getScalarSizeInBits(); - return ComputeNumSignBits(U->getOperand(0), Depth + 1, Q) + Tmp; + if (auto *U = dyn_cast(V)) { + switch (Operator::getOpcode(V)) { + default: break; + case Instruction::SExt: + Tmp = TyBits - U->getOperand(0)->getType()->getScalarSizeInBits(); + return ComputeNumSignBits(U->getOperand(0), Depth + 1, Q) + Tmp; - case Instruction::SDiv: { - const APInt *Denominator; - // sdiv X, C -> adds log(C) sign bits. - if (match(U->getOperand(1), m_APInt(Denominator))) { + case Instruction::SDiv: { + const APInt *Denominator; + // sdiv X, C -> adds log(C) sign bits. + if (match(U->getOperand(1), m_APInt(Denominator))) { - // Ignore non-positive denominator. - if (!Denominator->isStrictlyPositive()) - break; + // Ignore non-positive denominator. + if (!Denominator->isStrictlyPositive()) + break; - // Calculate the incoming numerator bits. - unsigned NumBits = ComputeNumSignBits(U->getOperand(0), Depth + 1, Q); + // Calculate the incoming numerator bits. + unsigned NumBits = ComputeNumSignBits(U->getOperand(0), Depth + 1, Q); - // Add floor(log(C)) bits to the numerator bits. - return std::min(TyBits, NumBits + Denominator->logBase2()); + // Add floor(log(C)) bits to the numerator bits. + return std::min(TyBits, NumBits + Denominator->logBase2()); + } + break; } - break; - } - - case Instruction::SRem: { - const APInt *Denominator; - // srem X, C -> we know that the result is within [-C+1,C) when C is a - // positive constant. This let us put a lower bound on the number of sign - // bits. - if (match(U->getOperand(1), m_APInt(Denominator))) { - - // Ignore non-positive denominator. 
- if (!Denominator->isStrictlyPositive()) - break; - // Calculate the incoming numerator bits. SRem by a positive constant - // can't lower the number of sign bits. - unsigned NumrBits = - ComputeNumSignBits(U->getOperand(0), Depth + 1, Q); + case Instruction::SRem: { + const APInt *Denominator; + // srem X, C -> we know that the result is within [-C+1,C) when C is a + // positive constant. This let us put a lower bound on the number of sign + // bits. + if (match(U->getOperand(1), m_APInt(Denominator))) { - // Calculate the leading sign bit constraints by examining the - // denominator. Given that the denominator is positive, there are two - // cases: - // - // 1. the numerator is positive. The result range is [0,C) and [0,C) u< - // (1 << ceilLogBase2(C)). - // - // 2. the numerator is negative. Then the result range is (-C,0] and - // integers in (-C,0] are either 0 or >u (-1 << ceilLogBase2(C)). - // - // Thus a lower bound on the number of sign bits is `TyBits - - // ceilLogBase2(C)`. + // Ignore non-positive denominator. + if (!Denominator->isStrictlyPositive()) + break; - unsigned ResBits = TyBits - Denominator->ceilLogBase2(); - return std::max(NumrBits, ResBits); + // Calculate the incoming numerator bits. SRem by a positive constant + // can't lower the number of sign bits. + unsigned NumrBits = ComputeNumSignBits(U->getOperand(0), Depth + 1, Q); + + // Calculate the leading sign bit constraints by examining the + // denominator. Given that the denominator is positive, there are two + // cases: + // + // 1. the numerator is positive. The result range is [0,C) and [0,C) u< + // (1 << ceilLogBase2(C)). + // + // 2. the numerator is negative. Then the result range is (-C,0] and + // integers in (-C,0] are either 0 or >u (-1 << ceilLogBase2(C)). + // + // Thus a lower bound on the number of sign bits is `TyBits - + // ceilLogBase2(C)`. + + unsigned ResBits = TyBits - Denominator->ceilLogBase2(); + return std::max(NumrBits, ResBits); + } + break; } - break; - } - case Instruction::AShr: { - Tmp = ComputeNumSignBits(U->getOperand(0), Depth + 1, Q); - // ashr X, C -> adds C sign bits. Vectors too. - const APInt *ShAmt; - if (match(U->getOperand(1), m_APInt(ShAmt))) { - if (ShAmt->uge(TyBits)) - break; // Bad shift. - unsigned ShAmtLimited = ShAmt->getZExtValue(); - Tmp += ShAmtLimited; - if (Tmp > TyBits) Tmp = TyBits; - } - return Tmp; - } - case Instruction::Shl: { - const APInt *ShAmt; - if (match(U->getOperand(1), m_APInt(ShAmt))) { - // shl destroys sign bits. + case Instruction::AShr: { Tmp = ComputeNumSignBits(U->getOperand(0), Depth + 1, Q); - if (ShAmt->uge(TyBits) || // Bad shift. - ShAmt->uge(Tmp)) break; // Shifted all sign bits out. - Tmp2 = ShAmt->getZExtValue(); - return Tmp - Tmp2; + // ashr X, C -> adds C sign bits. Vectors too. + const APInt *ShAmt; + if (match(U->getOperand(1), m_APInt(ShAmt))) { + if (ShAmt->uge(TyBits)) + break; // Bad shift. + unsigned ShAmtLimited = ShAmt->getZExtValue(); + Tmp += ShAmtLimited; + if (Tmp > TyBits) Tmp = TyBits; + } + return Tmp; } - break; - } - case Instruction::And: - case Instruction::Or: - case Instruction::Xor: // NOT is handled here. - // Logical binary ops preserve the number of sign bits at the worst. - Tmp = ComputeNumSignBits(U->getOperand(0), Depth + 1, Q); - if (Tmp != 1) { - Tmp2 = ComputeNumSignBits(U->getOperand(1), Depth + 1, Q); - FirstAnswer = std::min(Tmp, Tmp2); - // We computed what we know about the sign bits as our first - // answer. 
Now proceed to the generic code that uses - // computeKnownBits, and pick whichever answer is better. + case Instruction::Shl: { + const APInt *ShAmt; + if (match(U->getOperand(1), m_APInt(ShAmt))) { + // shl destroys sign bits. + Tmp = ComputeNumSignBits(U->getOperand(0), Depth + 1, Q); + if (ShAmt->uge(TyBits) || // Bad shift. + ShAmt->uge(Tmp)) break; // Shifted all sign bits out. + Tmp2 = ShAmt->getZExtValue(); + return Tmp - Tmp2; + } + break; } - break; - - case Instruction::Select: { - // If we have a clamp pattern, we know that the number of sign bits will be - // the minimum of the clamp min/max range. - const Value *X; - const APInt *CLow, *CHigh; - if (isSignedMinMaxClamp(U, X, CLow, CHigh)) - return std::min(CLow->getNumSignBits(), CHigh->getNumSignBits()); - - Tmp = ComputeNumSignBits(U->getOperand(1), Depth + 1, Q); - if (Tmp == 1) break; - Tmp2 = ComputeNumSignBits(U->getOperand(2), Depth + 1, Q); - return std::min(Tmp, Tmp2); - } - - case Instruction::Add: - // Add can have at most one carry bit. Thus we know that the output - // is, at worst, one more bit than the inputs. - Tmp = ComputeNumSignBits(U->getOperand(0), Depth + 1, Q); - if (Tmp == 1) break; - - // Special case decrementing a value (ADD X, -1): - if (const auto *CRHS = dyn_cast(U->getOperand(1))) - if (CRHS->isAllOnesValue()) { - KnownBits Known(TyBits); - computeKnownBits(U->getOperand(0), Known, Depth + 1, Q); - - // If the input is known to be 0 or 1, the output is 0/-1, which is all - // sign bits set. - if ((Known.Zero | 1).isAllOnesValue()) - return TyBits; - - // If we are subtracting one from a positive number, there is no carry - // out of the result. - if (Known.isNonNegative()) - return Tmp; + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: // NOT is handled here. + // Logical binary ops preserve the number of sign bits at the worst. + Tmp = ComputeNumSignBits(U->getOperand(0), Depth + 1, Q); + if (Tmp != 1) { + Tmp2 = ComputeNumSignBits(U->getOperand(1), Depth + 1, Q); + FirstAnswer = std::min(Tmp, Tmp2); + // We computed what we know about the sign bits as our first + // answer. Now proceed to the generic code that uses + // computeKnownBits, and pick whichever answer is better. } + break; - Tmp2 = ComputeNumSignBits(U->getOperand(1), Depth + 1, Q); - if (Tmp2 == 1) break; - return std::min(Tmp, Tmp2)-1; + case Instruction::Select: { + // If we have a clamp pattern, we know that the number of sign bits will + // be the minimum of the clamp min/max range. + const Value *X; + const APInt *CLow, *CHigh; + if (isSignedMinMaxClamp(U, X, CLow, CHigh)) + return std::min(CLow->getNumSignBits(), CHigh->getNumSignBits()); + + Tmp = ComputeNumSignBits(U->getOperand(1), Depth + 1, Q); + if (Tmp == 1) break; + Tmp2 = ComputeNumSignBits(U->getOperand(2), Depth + 1, Q); + return std::min(Tmp, Tmp2); + } - case Instruction::Sub: - Tmp2 = ComputeNumSignBits(U->getOperand(1), Depth + 1, Q); - if (Tmp2 == 1) break; - - // Handle NEG. - if (const auto *CLHS = dyn_cast(U->getOperand(0))) - if (CLHS->isNullValue()) { - KnownBits Known(TyBits); - computeKnownBits(U->getOperand(1), Known, Depth + 1, Q); - // If the input is known to be 0 or 1, the output is 0/-1, which is all - // sign bits set. - if ((Known.Zero | 1).isAllOnesValue()) - return TyBits; - - // If the input is known to be positive (the sign bit is known clear), - // the output of the NEG has the same number of sign bits as the input. - if (Known.isNonNegative()) - return Tmp2; - - // Otherwise, we treat this like a SUB. 
- } + case Instruction::Add: + // Add can have at most one carry bit. Thus we know that the output + // is, at worst, one more bit than the inputs. + Tmp = ComputeNumSignBits(U->getOperand(0), Depth + 1, Q); + if (Tmp == 1) break; + + // Special case decrementing a value (ADD X, -1): + if (const auto *CRHS = dyn_cast(U->getOperand(1))) + if (CRHS->isAllOnesValue()) { + KnownBits Known(TyBits); + computeKnownBits(U->getOperand(0), Known, Depth + 1, Q); + + // If the input is known to be 0 or 1, the output is 0/-1, which is + // all sign bits set. + if ((Known.Zero | 1).isAllOnesValue()) + return TyBits; + + // If we are subtracting one from a positive number, there is no carry + // out of the result. + if (Known.isNonNegative()) + return Tmp; + } - // Sub can have at most one carry bit. Thus we know that the output - // is, at worst, one more bit than the inputs. - Tmp = ComputeNumSignBits(U->getOperand(0), Depth + 1, Q); - if (Tmp == 1) break; - return std::min(Tmp, Tmp2)-1; + Tmp2 = ComputeNumSignBits(U->getOperand(1), Depth + 1, Q); + if (Tmp2 == 1) break; + return std::min(Tmp, Tmp2) - 1; - case Instruction::Mul: { - // The output of the Mul can be at most twice the valid bits in the inputs. - unsigned SignBitsOp0 = ComputeNumSignBits(U->getOperand(0), Depth + 1, Q); - if (SignBitsOp0 == 1) break; - unsigned SignBitsOp1 = ComputeNumSignBits(U->getOperand(1), Depth + 1, Q); - if (SignBitsOp1 == 1) break; - unsigned OutValidBits = - (TyBits - SignBitsOp0 + 1) + (TyBits - SignBitsOp1 + 1); - return OutValidBits > TyBits ? 1 : TyBits - OutValidBits + 1; - } + case Instruction::Sub: + Tmp2 = ComputeNumSignBits(U->getOperand(1), Depth + 1, Q); + if (Tmp2 == 1) break; + + // Handle NEG. + if (const auto *CLHS = dyn_cast(U->getOperand(0))) + if (CLHS->isNullValue()) { + KnownBits Known(TyBits); + computeKnownBits(U->getOperand(1), Known, Depth + 1, Q); + // If the input is known to be 0 or 1, the output is 0/-1, which is + // all sign bits set. + if ((Known.Zero | 1).isAllOnesValue()) + return TyBits; + + // If the input is known to be positive (the sign bit is known clear), + // the output of the NEG has the same number of sign bits as the + // input. + if (Known.isNonNegative()) + return Tmp2; + + // Otherwise, we treat this like a SUB. + } - case Instruction::PHI: { - const PHINode *PN = cast(U); - unsigned NumIncomingValues = PN->getNumIncomingValues(); - // Don't analyze large in-degree PHIs. - if (NumIncomingValues > 4) break; - // Unreachable blocks may have zero-operand PHI nodes. - if (NumIncomingValues == 0) break; - - // Take the minimum of all incoming values. This can't infinitely loop - // because of our depth threshold. - Tmp = ComputeNumSignBits(PN->getIncomingValue(0), Depth + 1, Q); - for (unsigned i = 1, e = NumIncomingValues; i != e; ++i) { - if (Tmp == 1) return Tmp; - Tmp = std::min( - Tmp, ComputeNumSignBits(PN->getIncomingValue(i), Depth + 1, Q)); + // Sub can have at most one carry bit. Thus we know that the output + // is, at worst, one more bit than the inputs. + Tmp = ComputeNumSignBits(U->getOperand(0), Depth + 1, Q); + if (Tmp == 1) break; + return std::min(Tmp, Tmp2) - 1; + + case Instruction::Mul: { + // The output of the Mul can be at most twice the valid bits in the + // inputs. 
+ unsigned SignBitsOp0 = ComputeNumSignBits(U->getOperand(0), Depth + 1, Q); + if (SignBitsOp0 == 1) break; + unsigned SignBitsOp1 = ComputeNumSignBits(U->getOperand(1), Depth + 1, Q); + if (SignBitsOp1 == 1) break; + unsigned OutValidBits = + (TyBits - SignBitsOp0 + 1) + (TyBits - SignBitsOp1 + 1); + return OutValidBits > TyBits ? 1 : TyBits - OutValidBits + 1; } - return Tmp; - } - case Instruction::Trunc: - // FIXME: it's tricky to do anything useful for this, but it is an important - // case for targets like X86. - break; + case Instruction::PHI: { + const PHINode *PN = cast(U); + unsigned NumIncomingValues = PN->getNumIncomingValues(); + // Don't analyze large in-degree PHIs. + if (NumIncomingValues > 4) break; + // Unreachable blocks may have zero-operand PHI nodes. + if (NumIncomingValues == 0) break; + + // Take the minimum of all incoming values. This can't infinitely loop + // because of our depth threshold. + Tmp = ComputeNumSignBits(PN->getIncomingValue(0), Depth + 1, Q); + for (unsigned i = 1, e = NumIncomingValues; i != e; ++i) { + if (Tmp == 1) return Tmp; + Tmp = std::min( + Tmp, ComputeNumSignBits(PN->getIncomingValue(i), Depth + 1, Q)); + } + return Tmp; + } - case Instruction::ExtractElement: - // Look through extract element. At the moment we keep this simple and skip - // tracking the specific element. But at least we might find information - // valid for all elements of the vector (for example if vector is sign - // extended, shifted, etc). - return ComputeNumSignBits(U->getOperand(0), Depth + 1, Q); - - case Instruction::ShuffleVector: { - // TODO: This is copied almost directly from the SelectionDAG version of - // ComputeNumSignBits. It would be better if we could share common - // code. If not, make sure that changes are translated to the DAG. - - // Collect the minimum number of sign bits that are shared by every vector - // element referenced by the shuffle. - auto *Shuf = cast(U); - int NumElts = Shuf->getOperand(0)->getType()->getVectorNumElements(); - int NumMaskElts = Shuf->getMask()->getType()->getVectorNumElements(); - APInt DemandedLHS(NumElts, 0), DemandedRHS(NumElts, 0); - for (int i = 0; i != NumMaskElts; ++i) { - int M = Shuf->getMaskValue(i); - assert(M < NumElts * 2 && "Invalid shuffle mask constant"); - // For undef elements, we don't know anything about the common state of - // the shuffle result. - if (M == -1) - return 1; - if (M < NumElts) - DemandedLHS.setBit(M % NumElts); - else - DemandedRHS.setBit(M % NumElts); + case Instruction::Trunc: + // FIXME: it's tricky to do anything useful for this, but it is an + // important case for targets like X86. + break; + + case Instruction::ExtractElement: + // Look through extract element. At the moment we keep this simple and + // skip tracking the specific element. But at least we might find + // information valid for all elements of the vector (for example if vector + // is sign extended, shifted, etc). + return ComputeNumSignBits(U->getOperand(0), Depth + 1, Q); + + case Instruction::ShuffleVector: { + // TODO: This is copied almost directly from the SelectionDAG version of + // ComputeNumSignBits. It would be better if we could share common + // code. If not, make sure that changes are translated to the DAG. + + // Collect the minimum number of sign bits that are shared by every vector + // element referenced by the shuffle. 
+ auto *Shuf = cast(U); + int NumElts = Shuf->getOperand(0)->getType()->getVectorNumElements(); + int NumMaskElts = Shuf->getMask()->getType()->getVectorNumElements(); + APInt DemandedLHS(NumElts, 0), DemandedRHS(NumElts, 0); + for (int i = 0; i != NumMaskElts; ++i) { + int M = Shuf->getMaskValue(i); + assert(M < NumElts * 2 && "Invalid shuffle mask constant"); + // For undef elements, we don't know anything about the common state of + // the shuffle result. + if (M == -1) + return 1; + if (M < NumElts) + DemandedLHS.setBit(M % NumElts); + else + DemandedRHS.setBit(M % NumElts); + } + Tmp = std::numeric_limits::max(); + if (!!DemandedLHS) + Tmp = ComputeNumSignBits(Shuf->getOperand(0), Depth + 1, Q); + if (!!DemandedRHS) { + Tmp2 = ComputeNumSignBits(Shuf->getOperand(1), Depth + 1, Q); + Tmp = std::min(Tmp, Tmp2); + } + // If we don't know anything, early out and try computeKnownBits + // fall-back. + if (Tmp == 1) + break; + assert(Tmp <= V->getType()->getScalarSizeInBits() && + "Failed to determine minimum sign bits"); + return Tmp; } - Tmp = std::numeric_limits::max(); - if (!!DemandedLHS) - Tmp = ComputeNumSignBits(Shuf->getOperand(0), Depth + 1, Q); - if (!!DemandedRHS) { - Tmp2 = ComputeNumSignBits(Shuf->getOperand(1), Depth + 1, Q); - Tmp = std::min(Tmp, Tmp2); } - // If we don't know anything, early out and try computeKnownBits fall-back. - if (Tmp == 1) - break; - assert(Tmp <= V->getType()->getScalarSizeInBits() && - "Failed to determine minimum sign bits"); - return Tmp; - } } // Finally, if we can prove that the top bits of the result are 0's or 1's, @@ -2655,8 +2666,6 @@ static unsigned ComputeNumSignBitsImpl(const Value *V, unsigned Depth, /// through SExt instructions only if LookThroughSExt is true. bool llvm::ComputeMultiple(Value *V, unsigned Base, Value *&Multiple, bool LookThroughSExt, unsigned Depth) { - const unsigned MaxDepth = 6; - assert(V && "No Value?"); assert(Depth <= MaxDepth && "Limit Search Depth"); assert(V->getType()->isIntegerTy() && "Not integer or pointer type!"); @@ -3651,23 +3660,28 @@ uint64_t llvm::GetStringLength(const Value *V, unsigned CharSize) { return Len == ~0ULL ? 1 : Len; } -const Value *llvm::getArgumentAliasingToReturnedPointer(const CallBase *Call) { +const Value * +llvm::getArgumentAliasingToReturnedPointer(const CallBase *Call, + bool MustPreserveNullness) { assert(Call && "getArgumentAliasingToReturnedPointer only works on nonnull calls"); if (const Value *RV = Call->getReturnedArgOperand()) return RV; // This can be used only as a aliasing property. - if (isIntrinsicReturningPointerAliasingArgumentWithoutCapturing(Call)) + if (isIntrinsicReturningPointerAliasingArgumentWithoutCapturing( + Call, MustPreserveNullness)) return Call->getArgOperand(0); return nullptr; } bool llvm::isIntrinsicReturningPointerAliasingArgumentWithoutCapturing( - const CallBase *Call) { + const CallBase *Call, bool MustPreserveNullness) { return Call->getIntrinsicID() == Intrinsic::launder_invariant_group || Call->getIntrinsicID() == Intrinsic::strip_invariant_group || Call->getIntrinsicID() == Intrinsic::aarch64_irg || - Call->getIntrinsicID() == Intrinsic::aarch64_tagp; + Call->getIntrinsicID() == Intrinsic::aarch64_tagp || + (!MustPreserveNullness && + Call->getIntrinsicID() == Intrinsic::ptrmask); } /// \p PN defines a loop-variant pointer to an object. Check if the @@ -3725,7 +3739,7 @@ Value *llvm::GetUnderlyingObject(Value *V, const DataLayout &DL, // because it should be in sync with CaptureTracking. 
Not using it may // cause weird miscompilations where 2 aliasing pointers are assumed to // noalias. - if (auto *RP = getArgumentAliasingToReturnedPointer(Call)) { + if (auto *RP = getArgumentAliasingToReturnedPointer(Call, false)) { V = RP; continue; } @@ -3865,6 +3879,18 @@ bool llvm::onlyUsedByLifetimeMarkers(const Value *V) { return true; } +bool llvm::mustSuppressSpeculation(const LoadInst &LI) { + if (!LI.isUnordered()) + return true; + const Function &F = *LI.getFunction(); + // Speculative load may create a race that did not exist in the source. + return F.hasFnAttribute(Attribute::SanitizeThread) || + // Speculative load may load data from dirty regions. + F.hasFnAttribute(Attribute::SanitizeAddress) || + F.hasFnAttribute(Attribute::SanitizeHWAddress); +} + + bool llvm::isSafeToSpeculativelyExecute(const Value *V, const Instruction *CtxI, const DominatorTree *DT) { @@ -3909,17 +3935,12 @@ bool llvm::isSafeToSpeculativelyExecute(const Value *V, } case Instruction::Load: { const LoadInst *LI = cast(Inst); - if (!LI->isUnordered() || - // Speculative load may create a race that did not exist in the source. - LI->getFunction()->hasFnAttribute(Attribute::SanitizeThread) || - // Speculative load may load data from dirty regions. - LI->getFunction()->hasFnAttribute(Attribute::SanitizeAddress) || - LI->getFunction()->hasFnAttribute(Attribute::SanitizeHWAddress)) + if (mustSuppressSpeculation(*LI)) return false; const DataLayout &DL = LI->getModule()->getDataLayout(); - return isDereferenceableAndAlignedPointer(LI->getPointerOperand(), - LI->getType(), LI->getAlignment(), - DL, CtxI, DT); + return isDereferenceableAndAlignedPointer( + LI->getPointerOperand(), LI->getType(), MaybeAlign(LI->getAlignment()), + DL, CtxI, DT); } case Instruction::Call: { auto *CI = cast(Inst); @@ -4221,22 +4242,9 @@ OverflowResult llvm::computeOverflowForSignedAdd(const Value *LHS, } bool llvm::isGuaranteedToTransferExecutionToSuccessor(const Instruction *I) { - // A memory operation returns normally if it isn't volatile. A volatile - // operation is allowed to trap. - // - // An atomic operation isn't guaranteed to return in a reasonable amount of - // time because it's possible for another thread to interfere with it for an + // Note: An atomic operation isn't guaranteed to return in a reasonable amount + // of time because it's possible for another thread to interfere with it for an // arbitrary length of time, but programs aren't allowed to rely on that. - if (const LoadInst *LI = dyn_cast(I)) - return !LI->isVolatile(); - if (const StoreInst *SI = dyn_cast(I)) - return !SI->isVolatile(); - if (const AtomicCmpXchgInst *CXI = dyn_cast(I)) - return !CXI->isVolatile(); - if (const AtomicRMWInst *RMWI = dyn_cast(I)) - return !RMWI->isVolatile(); - if (const MemIntrinsic *MII = dyn_cast(I)) - return !MII->isVolatile(); // If there is no successor, then execution can't transfer to it. if (const auto *CRI = dyn_cast(I)) @@ -4277,10 +4285,7 @@ bool llvm::isGuaranteedToTransferExecutionToSuccessor(const Instruction *I) { // FIXME: This isn't aggressive enough; a call which only writes to a global // is guaranteed to return. - return CS.onlyReadsMemory() || CS.onlyAccessesArgMemory() || - match(I, m_Intrinsic()) || - match(I, m_Intrinsic()) || - match(I, m_Intrinsic()); + return CS.onlyReadsMemory() || CS.onlyAccessesArgMemory(); } // Other instructions return normally. @@ -4572,12 +4577,12 @@ static SelectPatternResult matchMinMaxOfMinMax(CmpInst::Predicate Pred, // TODO: Allow FP min/max with nnan/nsz. 
assert(CmpInst::isIntPredicate(Pred) && "Expected integer comparison"); - Value *A, *B; + Value *A = nullptr, *B = nullptr; SelectPatternResult L = matchSelectPattern(TVal, A, B, nullptr, Depth + 1); if (!SelectPatternResult::isMinOrMax(L.Flavor)) return {SPF_UNKNOWN, SPNB_NA, false}; - Value *C, *D; + Value *C = nullptr, *D = nullptr; SelectPatternResult R = matchSelectPattern(FVal, C, D, nullptr, Depth + 1); if (L.Flavor != R.Flavor) return {SPF_UNKNOWN, SPNB_NA, false}; @@ -5627,8 +5632,8 @@ static void setLimitsForIntrinsic(const IntrinsicInst &II, APInt &Lower, } static void setLimitsForSelectPattern(const SelectInst &SI, APInt &Lower, - APInt &Upper) { - const Value *LHS, *RHS; + APInt &Upper, const InstrInfoQuery &IIQ) { + const Value *LHS = nullptr, *RHS = nullptr; SelectPatternResult R = matchSelectPattern(&SI, LHS, RHS); if (R.Flavor == SPF_UNKNOWN) return; @@ -5640,7 +5645,8 @@ static void setLimitsForSelectPattern(const SelectInst &SI, APInt &Lower, // then the result of abs(X) is [0..SIGNED_MAX], // otherwise it is [0..SIGNED_MIN], as -SIGNED_MIN == SIGNED_MIN. Lower = APInt::getNullValue(BitWidth); - if (cast(RHS)->hasNoSignedWrap()) + if (match(RHS, m_Neg(m_Specific(LHS))) && + IIQ.hasNoSignedWrap(cast(RHS))) Upper = APInt::getSignedMaxValue(BitWidth) + 1; else Upper = APInt::getSignedMinValue(BitWidth) + 1; @@ -5694,7 +5700,7 @@ ConstantRange llvm::computeConstantRange(const Value *V, bool UseInstrInfo) { else if (auto *II = dyn_cast(V)) setLimitsForIntrinsic(*II, Lower, Upper); else if (auto *SI = dyn_cast(V)) - setLimitsForSelectPattern(*SI, Lower, Upper); + setLimitsForSelectPattern(*SI, Lower, Upper, IIQ); ConstantRange CR = ConstantRange::getNonEmpty(Lower, Upper); @@ -5704,3 +5710,111 @@ ConstantRange llvm::computeConstantRange(const Value *V, bool UseInstrInfo) { return CR; } + +static Optional +getOffsetFromIndex(const GEPOperator *GEP, unsigned Idx, const DataLayout &DL) { + // Skip over the first indices. + gep_type_iterator GTI = gep_type_begin(GEP); + for (unsigned i = 1; i != Idx; ++i, ++GTI) + /*skip along*/; + + // Compute the offset implied by the rest of the indices. + int64_t Offset = 0; + for (unsigned i = Idx, e = GEP->getNumOperands(); i != e; ++i, ++GTI) { + ConstantInt *OpC = dyn_cast(GEP->getOperand(i)); + if (!OpC) + return None; + if (OpC->isZero()) + continue; // No offset. + + // Handle struct indices, which add their field offset to the pointer. + if (StructType *STy = GTI.getStructTypeOrNull()) { + Offset += DL.getStructLayout(STy)->getElementOffset(OpC->getZExtValue()); + continue; + } + + // Otherwise, we have a sequential type like an array or vector. Multiply + // the index by the ElementSize. + uint64_t Size = DL.getTypeAllocSize(GTI.getIndexedType()); + Offset += Size * OpC->getSExtValue(); + } + + return Offset; +} + +Optional llvm::isPointerOffset(const Value *Ptr1, const Value *Ptr2, + const DataLayout &DL) { + Ptr1 = Ptr1->stripPointerCasts(); + Ptr2 = Ptr2->stripPointerCasts(); + + // Handle the trivial case first. + if (Ptr1 == Ptr2) { + return 0; + } + + const GEPOperator *GEP1 = dyn_cast(Ptr1); + const GEPOperator *GEP2 = dyn_cast(Ptr2); + + // If one pointer is a GEP see if the GEP is a constant offset from the base, + // as in "P" and "gep P, 1". + // Also do this iteratively to handle the the following case: + // Ptr_t1 = GEP Ptr1, c1 + // Ptr_t2 = GEP Ptr_t1, c2 + // Ptr2 = GEP Ptr_t2, c3 + // where we will return c1+c2+c3. 
+ // TODO: Handle the case when both Ptr1 and Ptr2 are GEPs of some common base + // -- replace getOffsetFromBase with getOffsetAndBase, check that the bases + // are the same, and return the difference between offsets. + auto getOffsetFromBase = [&DL](const GEPOperator *GEP, + const Value *Ptr) -> Optional { + const GEPOperator *GEP_T = GEP; + int64_t OffsetVal = 0; + bool HasSameBase = false; + while (GEP_T) { + auto Offset = getOffsetFromIndex(GEP_T, 1, DL); + if (!Offset) + return None; + OffsetVal += *Offset; + auto Op0 = GEP_T->getOperand(0)->stripPointerCasts(); + if (Op0 == Ptr) { + HasSameBase = true; + break; + } + GEP_T = dyn_cast(Op0); + } + if (!HasSameBase) + return None; + return OffsetVal; + }; + + if (GEP1) { + auto Offset = getOffsetFromBase(GEP1, Ptr2); + if (Offset) + return -*Offset; + } + if (GEP2) { + auto Offset = getOffsetFromBase(GEP2, Ptr1); + if (Offset) + return Offset; + } + + // Right now we handle the case when Ptr1/Ptr2 are both GEPs with an identical + // base. After that base, they may have some number of common (and + // potentially variable) indices. After that they handle some constant + // offset, which determines their offset from each other. At this point, we + // handle no other case. + if (!GEP1 || !GEP2 || GEP1->getOperand(0) != GEP2->getOperand(0)) + return None; + + // Skip any common indices and track the GEP types. + unsigned Idx = 1; + for (; Idx != GEP1->getNumOperands() && Idx != GEP2->getNumOperands(); ++Idx) + if (GEP1->getOperand(Idx) != GEP2->getOperand(Idx)) + break; + + auto Offset1 = getOffsetFromIndex(GEP1, Idx, DL); + auto Offset2 = getOffsetFromIndex(GEP2, Idx, DL); + if (!Offset1 || !Offset2) + return None; + return *Offset2 - *Offset1; +} diff --git a/lib/Analysis/VectorUtils.cpp b/lib/Analysis/VectorUtils.cpp index 986756eb2627..600f57ab9d71 100644 --- a/lib/Analysis/VectorUtils.cpp +++ b/lib/Analysis/VectorUtils.cpp @@ -56,6 +56,7 @@ bool llvm::isTriviallyVectorizable(Intrinsic::ID ID) { case Intrinsic::smul_fix: case Intrinsic::smul_fix_sat: case Intrinsic::umul_fix: + case Intrinsic::umul_fix_sat: case Intrinsic::sqrt: // Begin floating-point. case Intrinsic::sin: case Intrinsic::cos: @@ -98,6 +99,7 @@ bool llvm::hasVectorInstrinsicScalarOpd(Intrinsic::ID ID, case Intrinsic::smul_fix: case Intrinsic::smul_fix_sat: case Intrinsic::umul_fix: + case Intrinsic::umul_fix_sat: return (ScalarOpdIdx == 2); default: return false; @@ -830,15 +832,15 @@ void InterleavedAccessInfo::collectConstStrideAccesses( /*Assume=*/true, /*ShouldCheckWrap=*/false); const SCEV *Scev = replaceSymbolicStrideSCEV(PSE, Strides, Ptr); - PointerType *PtrTy = dyn_cast(Ptr->getType()); + PointerType *PtrTy = cast(Ptr->getType()); uint64_t Size = DL.getTypeAllocSize(PtrTy->getElementType()); // An alignment of 0 means target ABI alignment. 
- unsigned Align = getLoadStoreAlignment(&I); - if (!Align) - Align = DL.getABITypeAlignment(PtrTy->getElementType()); + MaybeAlign Alignment = MaybeAlign(getLoadStoreAlignment(&I)); + if (!Alignment) + Alignment = Align(DL.getABITypeAlignment(PtrTy->getElementType())); - AccessStrideInfo[&I] = StrideDescriptor(Stride, Scev, Size, Align); + AccessStrideInfo[&I] = StrideDescriptor(Stride, Scev, Size, *Alignment); } } @@ -925,7 +927,7 @@ void InterleavedAccessInfo::analyzeInterleaving( if (!Group) { LLVM_DEBUG(dbgs() << "LV: Creating an interleave group with:" << *B << '\n'); - Group = createInterleaveGroup(B, DesB.Stride, DesB.Align); + Group = createInterleaveGroup(B, DesB.Stride, DesB.Alignment); } if (B->mayWriteToMemory()) StoreGroups.insert(Group); @@ -964,6 +966,10 @@ void InterleavedAccessInfo::analyzeInterleaving( // instructions that precede it. if (isInterleaved(A)) { InterleaveGroup *StoreGroup = getInterleaveGroup(A); + + LLVM_DEBUG(dbgs() << "LV: Invalidated store group due to " + "dependence between " << *A << " and "<< *B << '\n'); + StoreGroups.remove(StoreGroup); releaseGroup(StoreGroup); } @@ -1028,7 +1034,7 @@ void InterleavedAccessInfo::analyzeInterleaving( Group->getIndex(B) + DistanceToB / static_cast(DesB.Size); // Try to insert A into B's group. - if (Group->insertMember(A, IndexA, DesA.Align)) { + if (Group->insertMember(A, IndexA, DesA.Alignment)) { LLVM_DEBUG(dbgs() << "LV: Inserted:" << *A << '\n' << " into the interleave group with" << *B << '\n'); diff --git a/lib/AsmParser/LLLexer.cpp b/lib/AsmParser/LLLexer.cpp index 72d2357c2933..5292b0e62744 100644 --- a/lib/AsmParser/LLLexer.cpp +++ b/lib/AsmParser/LLLexer.cpp @@ -622,6 +622,7 @@ lltok::Kind LLLexer::LexIdentifier() { KEYWORD(amdgpu_ps); KEYWORD(amdgpu_cs); KEYWORD(amdgpu_kernel); + KEYWORD(tailcc); KEYWORD(cc); KEYWORD(c); diff --git a/lib/AsmParser/LLParser.cpp b/lib/AsmParser/LLParser.cpp index 87dff6468f2d..594537307d00 100644 --- a/lib/AsmParser/LLParser.cpp +++ b/lib/AsmParser/LLParser.cpp @@ -1122,7 +1122,7 @@ bool LLParser::ParseGlobal(const std::string &Name, LocTy NameLoc, if (ParseToken(lltok::StringConstant, "expected partition string")) return true; } else if (Lex.getKind() == lltok::kw_align) { - unsigned Alignment; + MaybeAlign Alignment; if (ParseOptionalAlignment(Alignment)) return true; GV->setAlignment(Alignment); } else if (Lex.getKind() == lltok::MetadataVar) { @@ -1229,12 +1229,13 @@ bool LLParser::ParseFnAttributeValuePairs(AttrBuilder &B, // As a hack, we allow function alignment to be initially parsed as an // attribute on a function declaration/definition or added to an attribute // group and later moved to the alignment field. 
- unsigned Alignment; + MaybeAlign Alignment; if (inAttrGrp) { Lex.Lex(); - if (ParseToken(lltok::equal, "expected '=' here") || - ParseUInt32(Alignment)) + uint32_t Value = 0; + if (ParseToken(lltok::equal, "expected '=' here") || ParseUInt32(Value)) return true; + Alignment = Align(Value); } else { if (ParseOptionalAlignment(Alignment)) return true; @@ -1603,7 +1604,7 @@ bool LLParser::ParseOptionalParamAttrs(AttrBuilder &B) { continue; } case lltok::kw_align: { - unsigned Alignment; + MaybeAlign Alignment; if (ParseOptionalAlignment(Alignment)) return true; B.addAlignmentAttr(Alignment); @@ -1720,7 +1721,7 @@ bool LLParser::ParseOptionalReturnAttrs(AttrBuilder &B) { continue; } case lltok::kw_align: { - unsigned Alignment; + MaybeAlign Alignment; if (ParseOptionalAlignment(Alignment)) return true; B.addAlignmentAttr(Alignment); @@ -1955,6 +1956,7 @@ void LLParser::ParseOptionalDLLStorageClass(unsigned &Res) { /// ::= 'amdgpu_ps' /// ::= 'amdgpu_cs' /// ::= 'amdgpu_kernel' +/// ::= 'tailcc' /// ::= 'cc' UINT /// bool LLParser::ParseOptionalCallingConv(unsigned &CC) { @@ -2000,6 +2002,7 @@ bool LLParser::ParseOptionalCallingConv(unsigned &CC) { case lltok::kw_amdgpu_ps: CC = CallingConv::AMDGPU_PS; break; case lltok::kw_amdgpu_cs: CC = CallingConv::AMDGPU_CS; break; case lltok::kw_amdgpu_kernel: CC = CallingConv::AMDGPU_KERNEL; break; + case lltok::kw_tailcc: CC = CallingConv::Tail; break; case lltok::kw_cc: { Lex.Lex(); return ParseUInt32(CC); @@ -2067,16 +2070,19 @@ bool LLParser::ParseOptionalFunctionMetadata(Function &F) { /// ParseOptionalAlignment /// ::= /* empty */ /// ::= 'align' 4 -bool LLParser::ParseOptionalAlignment(unsigned &Alignment) { - Alignment = 0; +bool LLParser::ParseOptionalAlignment(MaybeAlign &Alignment) { + Alignment = None; if (!EatIfPresent(lltok::kw_align)) return false; LocTy AlignLoc = Lex.getLoc(); - if (ParseUInt32(Alignment)) return true; - if (!isPowerOf2_32(Alignment)) + uint32_t Value = 0; + if (ParseUInt32(Value)) + return true; + if (!isPowerOf2_32(Value)) return Error(AlignLoc, "alignment is not a power of two"); - if (Alignment > Value::MaximumAlignment) + if (Value > Value::MaximumAlignment) return Error(AlignLoc, "huge alignments are not supported yet"); + Alignment = Align(Value); return false; } @@ -2113,7 +2119,7 @@ bool LLParser::ParseOptionalDerefAttrBytes(lltok::Kind AttrKind, /// /// This returns with AteExtraComma set to true if it ate an excess comma at the /// end. -bool LLParser::ParseOptionalCommaAlign(unsigned &Alignment, +bool LLParser::ParseOptionalCommaAlign(MaybeAlign &Alignment, bool &AteExtraComma) { AteExtraComma = false; while (EatIfPresent(lltok::comma)) { @@ -2551,6 +2557,7 @@ bool LLParser::ParseOptionalOperandBundles( /// bool LLParser::ParseArgumentList(SmallVectorImpl &ArgList, bool &isVarArg){ + unsigned CurValID = 0; isVarArg = false; assert(Lex.getKind() == lltok::lparen); Lex.Lex(); // eat the (. 
@@ -2575,6 +2582,12 @@ bool LLParser::ParseArgumentList(SmallVectorImpl &ArgList, if (Lex.getKind() == lltok::LocalVar) { Name = Lex.getStrVal(); Lex.Lex(); + } else if (Lex.getKind() == lltok::LocalVarID) { + if (Lex.getUIntVal() != CurValID) + return Error(TypeLoc, "argument expected to be numbered '%" + + Twine(CurValID) + "'"); + ++CurValID; + Lex.Lex(); } if (!FunctionType::isValidArgumentType(ArgTy)) @@ -2602,6 +2615,13 @@ bool LLParser::ParseArgumentList(SmallVectorImpl &ArgList, Name = Lex.getStrVal(); Lex.Lex(); } else { + if (Lex.getKind() == lltok::LocalVarID) { + if (Lex.getUIntVal() != CurValID) + return Error(TypeLoc, "argument expected to be numbered '%" + + Twine(CurValID) + "'"); + Lex.Lex(); + } + ++CurValID; Name = ""; } @@ -3093,7 +3113,7 @@ bool LLParser::ParseValID(ValID &ID, PerFunctionState *PFS) { ParseToken(lltok::rbrace, "expected end of struct constant")) return true; - ID.ConstantStructElts = make_unique(Elts.size()); + ID.ConstantStructElts = std::make_unique(Elts.size()); ID.UIntVal = Elts.size(); memcpy(ID.ConstantStructElts.get(), Elts.data(), Elts.size() * sizeof(Elts[0])); @@ -3115,7 +3135,7 @@ bool LLParser::ParseValID(ValID &ID, PerFunctionState *PFS) { return true; if (isPackedStruct) { - ID.ConstantStructElts = make_unique(Elts.size()); + ID.ConstantStructElts = std::make_unique(Elts.size()); memcpy(ID.ConstantStructElts.get(), Elts.data(), Elts.size() * sizeof(Elts[0])); ID.UIntVal = Elts.size(); @@ -5354,7 +5374,7 @@ bool LLParser::ParseFunctionHeader(Function *&Fn, bool isDefine) { LocTy BuiltinLoc; std::string Section; std::string Partition; - unsigned Alignment; + MaybeAlign Alignment; std::string GC; GlobalValue::UnnamedAddr UnnamedAddr = GlobalValue::UnnamedAddr::None; unsigned AddrSpace = 0; @@ -5471,7 +5491,7 @@ bool LLParser::ParseFunctionHeader(Function *&Fn, bool isDefine) { Fn->setCallingConv(CC); Fn->setAttributes(PAL); Fn->setUnnamedAddr(UnnamedAddr); - Fn->setAlignment(Alignment); + Fn->setAlignment(MaybeAlign(Alignment)); Fn->setSection(Section); Fn->setPartition(Partition); Fn->setComdat(C); @@ -5788,7 +5808,19 @@ int LLParser::ParseInstruction(Instruction *&Inst, BasicBlock *BB, case lltok::kw_extractelement: return ParseExtractElement(Inst, PFS); case lltok::kw_insertelement: return ParseInsertElement(Inst, PFS); case lltok::kw_shufflevector: return ParseShuffleVector(Inst, PFS); - case lltok::kw_phi: return ParsePHI(Inst, PFS); + case lltok::kw_phi: { + FastMathFlags FMF = EatFastMathFlagsIfPresent(); + int Res = ParsePHI(Inst, PFS); + if (Res != 0) + return Res; + if (FMF.any()) { + if (!Inst->getType()->isFPOrFPVectorTy()) + return Error(Loc, "fast-math-flags specified for phi without " + "floating-point scalar or vector return type"); + Inst->setFastMathFlags(FMF); + } + return 0; + } case lltok::kw_landingpad: return ParseLandingPad(Inst, PFS); // Call. 
case lltok::kw_call: return ParseCall(Inst, PFS, CallInst::TCK_None); @@ -6837,7 +6869,7 @@ bool LLParser::ParseCall(Instruction *&Inst, PerFunctionState &PFS, int LLParser::ParseAlloc(Instruction *&Inst, PerFunctionState &PFS) { Value *Size = nullptr; LocTy SizeLoc, TyLoc, ASLoc; - unsigned Alignment = 0; + MaybeAlign Alignment; unsigned AddrSpace = 0; Type *Ty = nullptr; @@ -6885,7 +6917,8 @@ int LLParser::ParseAlloc(Instruction *&Inst, PerFunctionState &PFS) { if (Size && !Size->getType()->isIntegerTy()) return Error(SizeLoc, "element count must have integer type"); - AllocaInst *AI = new AllocaInst(Ty, AddrSpace, Size, Alignment); + AllocaInst *AI = + new AllocaInst(Ty, AddrSpace, Size, Alignment ? Alignment->value() : 0); AI->setUsedWithInAlloca(IsInAlloca); AI->setSwiftError(IsSwiftError); Inst = AI; @@ -6898,7 +6931,7 @@ int LLParser::ParseAlloc(Instruction *&Inst, PerFunctionState &PFS) { /// 'singlethread'? AtomicOrdering (',' 'align' i32)? int LLParser::ParseLoad(Instruction *&Inst, PerFunctionState &PFS) { Value *Val; LocTy Loc; - unsigned Alignment = 0; + MaybeAlign Alignment; bool AteExtraComma = false; bool isAtomic = false; AtomicOrdering Ordering = AtomicOrdering::NotAtomic; @@ -6947,7 +6980,7 @@ int LLParser::ParseLoad(Instruction *&Inst, PerFunctionState &PFS) { /// 'singlethread'? AtomicOrdering (',' 'align' i32)? int LLParser::ParseStore(Instruction *&Inst, PerFunctionState &PFS) { Value *Val, *Ptr; LocTy Loc, PtrLoc; - unsigned Alignment = 0; + MaybeAlign Alignment; bool AteExtraComma = false; bool isAtomic = false; AtomicOrdering Ordering = AtomicOrdering::NotAtomic; @@ -8074,7 +8107,7 @@ bool LLParser::ParseFunctionSummary(std::string Name, GlobalValue::GUID GUID, if (ParseToken(lltok::rparen, "expected ')' here")) return true; - auto FS = llvm::make_unique( + auto FS = std::make_unique( GVFlags, InstCount, FFlags, /*EntryCount=*/0, std::move(Refs), std::move(Calls), std::move(TypeIdInfo.TypeTests), std::move(TypeIdInfo.TypeTestAssumeVCalls), @@ -8134,7 +8167,7 @@ bool LLParser::ParseVariableSummary(std::string Name, GlobalValue::GUID GUID, return true; auto GS = - llvm::make_unique(GVFlags, GVarFlags, std::move(Refs)); + std::make_unique(GVFlags, GVarFlags, std::move(Refs)); GS->setModulePath(ModulePath); GS->setVTableFuncs(std::move(VTableFuncs)); @@ -8175,7 +8208,7 @@ bool LLParser::ParseAliasSummary(std::string Name, GlobalValue::GUID GUID, if (ParseToken(lltok::rparen, "expected ')' here")) return true; - auto AS = llvm::make_unique(GVFlags); + auto AS = std::make_unique(GVFlags); AS->setModulePath(ModulePath); diff --git a/lib/AsmParser/LLParser.h b/lib/AsmParser/LLParser.h index 610e2e262008..abc423b4e3cd 100644 --- a/lib/AsmParser/LLParser.h +++ b/lib/AsmParser/LLParser.h @@ -281,14 +281,14 @@ namespace llvm { void ParseOptionalVisibility(unsigned &Res); void ParseOptionalDLLStorageClass(unsigned &Res); bool ParseOptionalCallingConv(unsigned &CC); - bool ParseOptionalAlignment(unsigned &Alignment); + bool ParseOptionalAlignment(MaybeAlign &Alignment); bool ParseOptionalDerefAttrBytes(lltok::Kind AttrKind, uint64_t &Bytes); bool ParseScopeAndOrdering(bool isAtomic, SyncScope::ID &SSID, AtomicOrdering &Ordering); bool ParseScope(SyncScope::ID &SSID); bool ParseOrdering(AtomicOrdering &Ordering); bool ParseOptionalStackAlignment(unsigned &Alignment); - bool ParseOptionalCommaAlign(unsigned &Alignment, bool &AteExtraComma); + bool ParseOptionalCommaAlign(MaybeAlign &Alignment, bool &AteExtraComma); bool ParseOptionalCommaAddrSpace(unsigned &AddrSpace, LocTy &Loc, 
bool &AteExtraComma); bool ParseOptionalCommaInAlloca(bool &IsInAlloca); diff --git a/lib/AsmParser/LLToken.h b/lib/AsmParser/LLToken.h index 0e9ba4db4742..f49feb2dc14d 100644 --- a/lib/AsmParser/LLToken.h +++ b/lib/AsmParser/LLToken.h @@ -168,6 +168,7 @@ enum Kind { kw_amdgpu_ps, kw_amdgpu_cs, kw_amdgpu_kernel, + kw_tailcc, // Attributes: kw_attributes, diff --git a/lib/AsmParser/Parser.cpp b/lib/AsmParser/Parser.cpp index b13c6237f411..b7f552a6fccb 100644 --- a/lib/AsmParser/Parser.cpp +++ b/lib/AsmParser/Parser.cpp @@ -42,7 +42,7 @@ llvm::parseAssembly(MemoryBufferRef F, SMDiagnostic &Err, LLVMContext &Context, SlotMapping *Slots, bool UpgradeDebugInfo, StringRef DataLayoutString) { std::unique_ptr M = - make_unique(F.getBufferIdentifier(), Context); + std::make_unique(F.getBufferIdentifier(), Context); if (parseAssemblyInto(F, M.get(), nullptr, Err, Slots, UpgradeDebugInfo, DataLayoutString)) @@ -71,9 +71,9 @@ ParsedModuleAndIndex llvm::parseAssemblyWithIndex( MemoryBufferRef F, SMDiagnostic &Err, LLVMContext &Context, SlotMapping *Slots, bool UpgradeDebugInfo, StringRef DataLayoutString) { std::unique_ptr M = - make_unique(F.getBufferIdentifier(), Context); + std::make_unique(F.getBufferIdentifier(), Context); std::unique_ptr Index = - make_unique(/*HaveGVs=*/true); + std::make_unique(/*HaveGVs=*/true); if (parseAssemblyInto(F, M.get(), Index.get(), Err, Slots, UpgradeDebugInfo, DataLayoutString)) @@ -123,7 +123,7 @@ static bool parseSummaryIndexAssemblyInto(MemoryBufferRef F, std::unique_ptr llvm::parseSummaryIndexAssembly(MemoryBufferRef F, SMDiagnostic &Err) { std::unique_ptr Index = - make_unique(/*HaveGVs=*/false); + std::make_unique(/*HaveGVs=*/false); if (parseSummaryIndexAssemblyInto(F, *Index, Err)) return nullptr; diff --git a/lib/BinaryFormat/Dwarf.cpp b/lib/BinaryFormat/Dwarf.cpp index eb6bd33ce583..d06cccdf0dfd 100644 --- a/lib/BinaryFormat/Dwarf.cpp +++ b/lib/BinaryFormat/Dwarf.cpp @@ -22,7 +22,7 @@ StringRef llvm::dwarf::TagString(unsigned Tag) { switch (Tag) { default: return StringRef(); -#define HANDLE_DW_TAG(ID, NAME, VERSION, VENDOR) \ +#define HANDLE_DW_TAG(ID, NAME, VERSION, VENDOR, KIND) \ case DW_TAG_##NAME: \ return "DW_TAG_" #NAME; #include "llvm/BinaryFormat/Dwarf.def" @@ -31,7 +31,7 @@ StringRef llvm::dwarf::TagString(unsigned Tag) { unsigned llvm::dwarf::getTag(StringRef TagString) { return StringSwitch(TagString) -#define HANDLE_DW_TAG(ID, NAME, VERSION, VENDOR) \ +#define HANDLE_DW_TAG(ID, NAME, VERSION, VENDOR, KIND) \ .Case("DW_TAG_" #NAME, DW_TAG_##NAME) #include "llvm/BinaryFormat/Dwarf.def" .Default(DW_TAG_invalid); @@ -41,7 +41,7 @@ unsigned llvm::dwarf::TagVersion(dwarf::Tag Tag) { switch (Tag) { default: return 0; -#define HANDLE_DW_TAG(ID, NAME, VERSION, VENDOR) \ +#define HANDLE_DW_TAG(ID, NAME, VERSION, VENDOR, KIND) \ case DW_TAG_##NAME: \ return VERSION; #include "llvm/BinaryFormat/Dwarf.def" @@ -52,7 +52,7 @@ unsigned llvm::dwarf::TagVendor(dwarf::Tag Tag) { switch (Tag) { default: return 0; -#define HANDLE_DW_TAG(ID, NAME, VERSION, VENDOR) \ +#define HANDLE_DW_TAG(ID, NAME, VERSION, VENDOR, KIND) \ case DW_TAG_##NAME: \ return DWARF_VENDOR_##VENDOR; #include "llvm/BinaryFormat/Dwarf.def" @@ -149,6 +149,8 @@ StringRef llvm::dwarf::OperationEncodingString(unsigned Encoding) { return "DW_OP_LLVM_fragment"; case DW_OP_LLVM_tag_offset: return "DW_OP_LLVM_tag_offset"; + case DW_OP_LLVM_entry_value: + return "DW_OP_LLVM_entry_value"; } } @@ -160,6 +162,7 @@ unsigned llvm::dwarf::getOperationEncoding(StringRef OperationEncodingString) { 
.Case("DW_OP_LLVM_convert", DW_OP_LLVM_convert) .Case("DW_OP_LLVM_fragment", DW_OP_LLVM_fragment) .Case("DW_OP_LLVM_tag_offset", DW_OP_LLVM_tag_offset) + .Case("DW_OP_LLVM_entry_value", DW_OP_LLVM_entry_value) .Default(0); } @@ -472,6 +475,17 @@ StringRef llvm::dwarf::RangeListEncodingString(unsigned Encoding) { } } +StringRef llvm::dwarf::LocListEncodingString(unsigned Encoding) { + switch (Encoding) { + default: + return StringRef(); +#define HANDLE_DW_LLE(ID, NAME) \ + case DW_LLE_##NAME: \ + return "DW_LLE_" #NAME; +#include "llvm/BinaryFormat/Dwarf.def" + } +} + StringRef llvm::dwarf::CallFrameString(unsigned Encoding, Triple::ArchType Arch) { assert(Arch != llvm::Triple::ArchType::UnknownArch); diff --git a/lib/BinaryFormat/Magic.cpp b/lib/BinaryFormat/Magic.cpp index 7dfe23690a50..bbcbbabededb 100644 --- a/lib/BinaryFormat/Magic.cpp +++ b/lib/BinaryFormat/Magic.cpp @@ -210,6 +210,11 @@ file_magic llvm::identify_magic(StringRef Magic) { return file_magic::coff_object; break; + case 0x2d: // YAML '-' + if (startswith(Magic, "--- !tapi") || startswith(Magic, "---\narchs:")) + return file_magic::tapi_file; + break; + default: break; } diff --git a/lib/Bitcode/Reader/BitcodeAnalyzer.cpp b/lib/Bitcode/Reader/BitcodeAnalyzer.cpp index 9c30d563a314..e70caa83c8c1 100644 --- a/lib/Bitcode/Reader/BitcodeAnalyzer.cpp +++ b/lib/Bitcode/Reader/BitcodeAnalyzer.cpp @@ -434,6 +434,13 @@ static Expected ReadSignature(BitstreamCursor &Stream) { return std::move(Err); if (Signature[2] == 'A' && Signature[3] == 'G') return ClangSerializedDiagnosticsBitstream; + } else if (Signature[0] == 'R' && Signature[1] == 'M') { + if (Error Err = tryRead(Signature[2], 8)) + return std::move(Err); + if (Error Err = tryRead(Signature[3], 8)) + return std::move(Err); + if (Signature[2] == 'R' && Signature[3] == 'K') + return LLVMBitstreamRemarks; } else { if (Error Err = tryRead(Signature[2], 4)) return std::move(Err); @@ -627,6 +634,9 @@ void BitcodeAnalyzer::printStats(BCDumpOptions O, case ClangSerializedDiagnosticsBitstream: O.OS << "Clang Serialized Diagnostics\n"; break; + case LLVMBitstreamRemarks: + O.OS << "LLVM Remarks\n"; + break; } O.OS << " # Toplevel Blocks: " << NumTopBlocks << "\n"; O.OS << "\n"; diff --git a/lib/Bitcode/Reader/BitcodeReader.cpp b/lib/Bitcode/Reader/BitcodeReader.cpp index 29dc7f616392..15eead1de31a 100644 --- a/lib/Bitcode/Reader/BitcodeReader.cpp +++ b/lib/Bitcode/Reader/BitcodeReader.cpp @@ -722,7 +722,7 @@ private: /// Converts alignment exponent (i.e. power of two (or zero)) to the /// corresponding alignment to use. If alignment is too large, returns /// a corresponding error code. - Error parseAlignmentValue(uint64_t Exponent, unsigned &Alignment); + Error parseAlignmentValue(uint64_t Exponent, MaybeAlign &Alignment); Error parseAttrKind(uint64_t Code, Attribute::AttrKind *Kind); Error parseModule(uint64_t ResumeBit, bool ShouldLazyLoadMetadata = false); @@ -1063,7 +1063,7 @@ static int getDecodedUnaryOpcode(unsigned Val, Type *Ty) { switch (Val) { default: return -1; - case bitc::UNOP_NEG: + case bitc::UNOP_FNEG: return IsFP ? Instruction::FNeg : -1; } } @@ -1544,12 +1544,12 @@ static Attribute::AttrKind getAttrFromCode(uint64_t Code) { } Error BitcodeReader::parseAlignmentValue(uint64_t Exponent, - unsigned &Alignment) { + MaybeAlign &Alignment) { // Note: Alignment in bitcode files is incremented by 1, so that zero // can be used for default alignment. 
if (Exponent > Value::MaxAlignmentExponent + 1) return error("Invalid alignment value"); - Alignment = (1 << static_cast(Exponent)) >> 1; + Alignment = decodeMaybeAlign(Exponent); return Error::success(); } @@ -2377,6 +2377,8 @@ Error BitcodeReader::parseConstants() { CurTy = flattenPointerTypes(CurFullTy); continue; // Skip the ValueList manipulation. case bitc::CST_CODE_NULL: // NULL + if (CurTy->isVoidTy() || CurTy->isFunctionTy() || CurTy->isLabelTy()) + return error("Invalid type for a constant null value"); V = Constant::getNullValue(CurTy); break; case bitc::CST_CODE_INTEGER: // INTEGER: [intval] @@ -3110,7 +3112,7 @@ Error BitcodeReader::parseGlobalVarRecord(ArrayRef Record) { uint64_t RawLinkage = Record[3]; GlobalValue::LinkageTypes Linkage = getDecodedLinkage(RawLinkage); - unsigned Alignment; + MaybeAlign Alignment; if (Error Err = parseAlignmentValue(Record[4], Alignment)) return Err; std::string Section; @@ -3241,7 +3243,7 @@ Error BitcodeReader::parseFunctionRecord(ArrayRef Record) { Context, getPointerElementFlatType(PTy))); } - unsigned Alignment; + MaybeAlign Alignment; if (Error Err = parseAlignmentValue(Record[5], Alignment)) return Err; Func->setAlignment(Alignment); @@ -3646,6 +3648,11 @@ Error BitcodeReader::parseModule(uint64_t ResumeBit, break; } Record.clear(); + + // Upgrade data layout string. + std::string DL = llvm::UpgradeDataLayoutString( + TheModule->getDataLayoutStr(), TheModule->getTargetTriple()); + TheModule->setDataLayout(DL); } } @@ -4622,31 +4629,48 @@ Error BitcodeReader::parseFunctionBody(Function *F) { InstructionList.push_back(I); break; case bitc::FUNC_CODE_INST_PHI: { // PHI: [ty, val0,bb0, ...] - if (Record.size() < 1 || ((Record.size()-1)&1)) + if (Record.size() < 1) return error("Invalid record"); + // The first record specifies the type. FullTy = getFullyStructuredTypeByID(Record[0]); Type *Ty = flattenPointerTypes(FullTy); if (!Ty) return error("Invalid record"); - PHINode *PN = PHINode::Create(Ty, (Record.size()-1)/2); + // Phi arguments are pairs of records of [value, basic block]. + // There is an optional final record for fast-math-flags if this phi has a + // floating-point type. + size_t NumArgs = (Record.size() - 1) / 2; + if ((Record.size() - 1) % 2 == 1 && !Ty->isFPOrFPVectorTy()) + return error("Invalid record"); + + PHINode *PN = PHINode::Create(Ty, NumArgs); InstructionList.push_back(PN); - for (unsigned i = 0, e = Record.size()-1; i != e; i += 2) { + for (unsigned i = 0; i != NumArgs; i++) { Value *V; // With the new function encoding, it is possible that operands have // negative IDs (for forward references). Use a signed VBR // representation to keep the encoding small. if (UseRelativeIDs) - V = getValueSigned(Record, 1+i, NextValueNo, Ty); + V = getValueSigned(Record, i * 2 + 1, NextValueNo, Ty); else - V = getValue(Record, 1+i, NextValueNo, Ty); - BasicBlock *BB = getBasicBlock(Record[2+i]); + V = getValue(Record, i * 2 + 1, NextValueNo, Ty); + BasicBlock *BB = getBasicBlock(Record[i * 2 + 2]); if (!V || !BB) return error("Invalid record"); PN->addIncoming(V, BB); } I = PN; + + // If there are an even number of records, the final record must be FMF. 
+ if (Record.size() % 2 == 0) { + assert(isa(I) && "Unexpected phi type"); + FastMathFlags FMF = getDecodedFastMathFlags(Record[Record.size() - 1]); + if (FMF.any()) + I->setFastMathFlags(FMF); + } + break; } @@ -4726,7 +4750,7 @@ Error BitcodeReader::parseFunctionBody(Function *F) { } Type *OpTy = getTypeByID(Record[1]); Value *Size = getFnValueByID(Record[2], OpTy); - unsigned Align; + MaybeAlign Align; if (Error Err = parseAlignmentValue(AlignRecord & ~FlagMask, Align)) { return Err; } @@ -4737,7 +4761,7 @@ Error BitcodeReader::parseFunctionBody(Function *F) { const DataLayout &DL = TheModule->getDataLayout(); unsigned AS = DL.getAllocaAddrSpace(); - AllocaInst *AI = new AllocaInst(Ty, AS, Size, Align); + AllocaInst *AI = new AllocaInst(Ty, AS, Size, Align ? Align->value() : 0); AI->setUsedWithInAlloca(InAlloca); AI->setSwiftError(SwiftError); I = AI; @@ -4765,7 +4789,7 @@ Error BitcodeReader::parseFunctionBody(Function *F) { if (Error Err = typeCheckLoadStoreInst(Ty, Op->getType())) return Err; - unsigned Align; + MaybeAlign Align; if (Error Err = parseAlignmentValue(Record[OpNum], Align)) return Err; I = new LoadInst(Ty, Op, "", Record[OpNum + 1], Align); @@ -4802,7 +4826,7 @@ Error BitcodeReader::parseFunctionBody(Function *F) { return error("Invalid record"); SyncScope::ID SSID = getDecodedSyncScopeID(Record[OpNum + 3]); - unsigned Align; + MaybeAlign Align; if (Error Err = parseAlignmentValue(Record[OpNum], Align)) return Err; I = new LoadInst(Ty, Op, "", Record[OpNum + 1], Align, Ordering, SSID); @@ -4824,10 +4848,10 @@ Error BitcodeReader::parseFunctionBody(Function *F) { if (Error Err = typeCheckLoadStoreInst(Val->getType(), Ptr->getType())) return Err; - unsigned Align; + MaybeAlign Align; if (Error Err = parseAlignmentValue(Record[OpNum], Align)) return Err; - I = new StoreInst(Val, Ptr, Record[OpNum+1], Align); + I = new StoreInst(Val, Ptr, Record[OpNum + 1], Align); InstructionList.push_back(I); break; } @@ -4857,10 +4881,10 @@ Error BitcodeReader::parseFunctionBody(Function *F) { if (Ordering != AtomicOrdering::NotAtomic && Record[OpNum] == 0) return error("Invalid record"); - unsigned Align; + MaybeAlign Align; if (Error Err = parseAlignmentValue(Record[OpNum], Align)) return Err; - I = new StoreInst(Val, Ptr, Record[OpNum+1], Align, Ordering, SSID); + I = new StoreInst(Val, Ptr, Record[OpNum + 1], Align, Ordering, SSID); InstructionList.push_back(I); break; } @@ -5312,7 +5336,7 @@ Error BitcodeReader::materializeModule() { UpgradeModuleFlags(*TheModule); - UpgradeRetainReleaseMarker(*TheModule); + UpgradeARCRuntime(*TheModule); return Error::success(); } @@ -5874,7 +5898,7 @@ Error ModuleSummaryIndexBitcodeReader::parseEntireSummary(unsigned ID) { ArrayRef(Record).slice(CallGraphEdgeStartIndex), IsOldProfileFormat, HasProfile, HasRelBF); setSpecialRefs(Refs, NumRORefs, NumWORefs); - auto FS = llvm::make_unique( + auto FS = std::make_unique( Flags, InstCount, getDecodedFFlags(RawFunFlags), /*EntryCount=*/0, std::move(Refs), std::move(Calls), std::move(PendingTypeTests), std::move(PendingTypeTestAssumeVCalls), @@ -5900,7 +5924,7 @@ Error ModuleSummaryIndexBitcodeReader::parseEntireSummary(unsigned ID) { uint64_t RawFlags = Record[1]; unsigned AliaseeID = Record[2]; auto Flags = getDecodedGVSummaryFlags(RawFlags, Version); - auto AS = llvm::make_unique(Flags); + auto AS = std::make_unique(Flags); // The module path string ref set in the summary must be owned by the // index's module string table. 
Since we don't have a module path // string table section in the per-module index, we create a single @@ -5934,7 +5958,7 @@ Error ModuleSummaryIndexBitcodeReader::parseEntireSummary(unsigned ID) { std::vector Refs = makeRefList(ArrayRef(Record).slice(RefArrayStart)); auto FS = - llvm::make_unique(Flags, GVF, std::move(Refs)); + std::make_unique(Flags, GVF, std::move(Refs)); FS->setModulePath(getThisModule()->first()); auto GUID = getValueInfoFromValueId(ValueID); FS->setOriginalName(GUID.second); @@ -5961,7 +5985,7 @@ Error ModuleSummaryIndexBitcodeReader::parseEntireSummary(unsigned ID) { VTableFuncs.push_back({Callee, Offset}); } auto VS = - llvm::make_unique(Flags, GVF, std::move(Refs)); + std::make_unique(Flags, GVF, std::move(Refs)); VS->setModulePath(getThisModule()->first()); VS->setVTableFuncs(VTableFuncs); auto GUID = getValueInfoFromValueId(ValueID); @@ -6019,7 +6043,7 @@ Error ModuleSummaryIndexBitcodeReader::parseEntireSummary(unsigned ID) { IsOldProfileFormat, HasProfile, false); ValueInfo VI = getValueInfoFromValueId(ValueID).first; setSpecialRefs(Refs, NumRORefs, NumWORefs); - auto FS = llvm::make_unique( + auto FS = std::make_unique( Flags, InstCount, getDecodedFFlags(RawFunFlags), EntryCount, std::move(Refs), std::move(Edges), std::move(PendingTypeTests), std::move(PendingTypeTestAssumeVCalls), @@ -6046,7 +6070,7 @@ Error ModuleSummaryIndexBitcodeReader::parseEntireSummary(unsigned ID) { uint64_t RawFlags = Record[2]; unsigned AliaseeValueId = Record[3]; auto Flags = getDecodedGVSummaryFlags(RawFlags, Version); - auto AS = llvm::make_unique(Flags); + auto AS = std::make_unique(Flags); LastSeenSummary = AS.get(); AS->setModulePath(ModuleIdMap[ModuleId]); @@ -6075,7 +6099,7 @@ Error ModuleSummaryIndexBitcodeReader::parseEntireSummary(unsigned ID) { std::vector Refs = makeRefList(ArrayRef(Record).slice(RefArrayStart)); auto FS = - llvm::make_unique(Flags, GVF, std::move(Refs)); + std::make_unique(Flags, GVF, std::move(Refs)); LastSeenSummary = FS.get(); FS->setModulePath(ModuleIdMap[ModuleId]); ValueInfo VI = getValueInfoFromValueId(ValueID).first; @@ -6438,7 +6462,7 @@ BitcodeModule::getModuleImpl(LLVMContext &Context, bool MaterializeAll, Context); std::unique_ptr M = - llvm::make_unique(ModuleIdentifier, Context); + std::make_unique(ModuleIdentifier, Context); M->setMaterializer(R); // Delay parsing Metadata if ShouldLazyLoadMetadata is true. 
@@ -6485,7 +6509,7 @@ Expected> BitcodeModule::getSummary() { if (Error JumpFailed = Stream.JumpToBit(ModuleBit)) return std::move(JumpFailed); - auto Index = llvm::make_unique(/*HaveGVs=*/false); + auto Index = std::make_unique(/*HaveGVs=*/false); ModuleSummaryIndexBitcodeReader R(std::move(Stream), Strtab, *Index, ModuleIdentifier, 0); diff --git a/lib/Bitcode/Reader/MetadataLoader.cpp b/lib/Bitcode/Reader/MetadataLoader.cpp index 108f71189585..4da51dda8b74 100644 --- a/lib/Bitcode/Reader/MetadataLoader.cpp +++ b/lib/Bitcode/Reader/MetadataLoader.cpp @@ -515,7 +515,7 @@ class MetadataLoader::MetadataLoaderImpl { GV.getMetadata(LLVMContext::MD_dbg, MDs); GV.eraseMetadata(LLVMContext::MD_dbg); for (auto *MD : MDs) - if (auto *DGV = dyn_cast_or_null(MD)) { + if (auto *DGV = dyn_cast(MD)) { auto *DGVE = DIGlobalVariableExpression::getDistinct( Context, DGV, DIExpression::get(Context, {})); GV.addMetadata(LLVMContext::MD_dbg, *DGVE); @@ -987,7 +987,7 @@ void MetadataLoader::MetadataLoaderImpl::lazyLoadOneMetadata( assert(ID >= MDStringRef.size() && "Unexpected lazy-loading of MDString"); // Lookup first if the metadata hasn't already been loaded. if (auto *MD = MetadataList.lookup(ID)) { - auto *N = dyn_cast_or_null(MD); + auto *N = cast(MD); if (!N->isTemporary()) return; } @@ -2133,7 +2133,7 @@ MetadataLoader::MetadataLoader(BitstreamCursor &Stream, Module &TheModule, BitcodeReaderValueList &ValueList, bool IsImporting, std::function getTypeByID) - : Pimpl(llvm::make_unique( + : Pimpl(std::make_unique( Stream, TheModule, ValueList, std::move(getTypeByID), IsImporting)) {} Error MetadataLoader::parseMetadata(bool ModuleLevel) { diff --git a/lib/Bitcode/Writer/BitWriter.cpp b/lib/Bitcode/Writer/BitWriter.cpp index 76ca89147e52..be59c1f92836 100644 --- a/lib/Bitcode/Writer/BitWriter.cpp +++ b/lib/Bitcode/Writer/BitWriter.cpp @@ -19,7 +19,7 @@ using namespace llvm; int LLVMWriteBitcodeToFile(LLVMModuleRef M, const char *Path) { std::error_code EC; - raw_fd_ostream OS(Path, EC, sys::fs::F_None); + raw_fd_ostream OS(Path, EC, sys::fs::OF_None); if (EC) return -1; diff --git a/lib/Bitcode/Writer/BitcodeWriter.cpp b/lib/Bitcode/Writer/BitcodeWriter.cpp index 5c7b970a3a75..deb4019ea8ba 100644 --- a/lib/Bitcode/Writer/BitcodeWriter.cpp +++ b/lib/Bitcode/Writer/BitcodeWriter.cpp @@ -86,7 +86,7 @@ static cl::opt cl::desc("Number of metadatas above which we emit an index " "to enable lazy-loading")); -cl::opt WriteRelBFToSummary( +static cl::opt WriteRelBFToSummary( "write-relbf-to-summary", cl::Hidden, cl::init(false), cl::desc("Write relative block frequency to function summary ")); @@ -520,7 +520,7 @@ static unsigned getEncodedCastOpcode(unsigned Opcode) { static unsigned getEncodedUnaryOpcode(unsigned Opcode) { switch (Opcode) { default: llvm_unreachable("Unknown binary instruction!"); - case Instruction::FNeg: return bitc::UNOP_NEG; + case Instruction::FNeg: return bitc::UNOP_FNEG; } } @@ -2880,6 +2880,11 @@ void ModuleBitcodeWriter::writeInstruction(const Instruction &I, pushValueSigned(PN.getIncomingValue(i), InstID, Vals64); Vals64.push_back(VE.getValueID(PN.getIncomingBlock(i))); } + + uint64_t Flags = getOptimizationFlags(&I); + if (Flags != 0) + Vals64.push_back(Flags); + // Emit a Vals64 vector and exit. 
Stream.EmitRecord(Code, Vals64, AbbrevToUse); Vals64.clear(); diff --git a/lib/CodeGen/AggressiveAntiDepBreaker.cpp b/lib/CodeGen/AggressiveAntiDepBreaker.cpp index 444f618d8b8c..f64b775a8b77 100644 --- a/lib/CodeGen/AggressiveAntiDepBreaker.cpp +++ b/lib/CodeGen/AggressiveAntiDepBreaker.cpp @@ -232,7 +232,7 @@ bool AggressiveAntiDepBreaker::IsImplicitDefUse(MachineInstr &MI, if (!MO.isReg() || !MO.isImplicit()) return false; - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); if (Reg == 0) return false; @@ -252,7 +252,7 @@ void AggressiveAntiDepBreaker::GetPassthruRegs( if (!MO.isReg()) continue; if ((MO.isDef() && MI.isRegTiedToUseOperand(i)) || IsImplicitDefUse(MI, MO)) { - const unsigned Reg = MO.getReg(); + const Register Reg = MO.getReg(); for (MCSubRegIterator SubRegs(Reg, TRI, /*IncludeSelf=*/true); SubRegs.isValid(); ++SubRegs) PassthruRegs.insert(*SubRegs); @@ -365,7 +365,7 @@ void AggressiveAntiDepBreaker::PrescanInstruction( for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { MachineOperand &MO = MI.getOperand(i); if (!MO.isReg() || !MO.isDef()) continue; - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); if (Reg == 0) continue; HandleLastUse(Reg, Count + 1, "", "\tDead Def: ", "\n"); @@ -375,7 +375,7 @@ void AggressiveAntiDepBreaker::PrescanInstruction( for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { MachineOperand &MO = MI.getOperand(i); if (!MO.isReg() || !MO.isDef()) continue; - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); if (Reg == 0) continue; LLVM_DEBUG(dbgs() << " " << printReg(Reg, TRI) << "=g" @@ -418,7 +418,7 @@ void AggressiveAntiDepBreaker::PrescanInstruction( for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { MachineOperand &MO = MI.getOperand(i); if (!MO.isReg() || !MO.isDef()) continue; - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); if (Reg == 0) continue; // Ignore KILLs and passthru registers for liveness... 
if (MI.isKill() || (PassthruRegs.count(Reg) != 0)) @@ -471,7 +471,7 @@ void AggressiveAntiDepBreaker::ScanInstruction(MachineInstr &MI, for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { MachineOperand &MO = MI.getOperand(i); if (!MO.isReg() || !MO.isUse()) continue; - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); if (Reg == 0) continue; LLVM_DEBUG(dbgs() << " " << printReg(Reg, TRI) << "=g" @@ -506,7 +506,7 @@ void AggressiveAntiDepBreaker::ScanInstruction(MachineInstr &MI, for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { MachineOperand &MO = MI.getOperand(i); if (!MO.isReg()) continue; - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); if (Reg == 0) continue; if (FirstReg != 0) { @@ -790,7 +790,7 @@ unsigned AggressiveAntiDepBreaker::BreakAntiDependencies( CriticalPathSU = SU; } } - + assert(CriticalPathSU && "Failed to find SUnit critical path"); CriticalPathMI = CriticalPathSU->getInstr(); } diff --git a/lib/CodeGen/Analysis.cpp b/lib/CodeGen/Analysis.cpp index d158e70b86ac..4f24f077d120 100644 --- a/lib/CodeGen/Analysis.cpp +++ b/lib/CodeGen/Analysis.cpp @@ -309,7 +309,8 @@ static const Value *getNoopInput(const Value *V, NoopInput = Op; } else if (isa(I) && TLI.allowTruncateForTailCall(Op->getType(), I->getType())) { - DataBits = std::min(DataBits, I->getType()->getPrimitiveSizeInBits()); + DataBits = std::min((uint64_t)DataBits, + I->getType()->getPrimitiveSizeInBits().getFixedSize()); NoopInput = Op; } else if (auto CS = ImmutableCallSite(I)) { const Value *ReturnedOp = CS.getReturnedArgOperand(); @@ -523,7 +524,8 @@ bool llvm::isInTailCallPosition(ImmutableCallSite CS, const TargetMachine &TM) { // longjmp on x86), it can end up causing miscompilation that has not // been fully understood. if (!Ret && - (!TM.Options.GuaranteedTailCallOpt || !isa(Term))) + ((!TM.Options.GuaranteedTailCallOpt && + CS.getCallingConv() != CallingConv::Tail) || !isa(Term))) return false; // If I will have a chain, make sure no other instruction that will have a @@ -536,9 +538,11 @@ bool llvm::isInTailCallPosition(ImmutableCallSite CS, const TargetMachine &TM) { // Debug info intrinsics do not get in the way of tail call optimization. if (isa(BBI)) continue; - // A lifetime end intrinsic should not stop tail call optimization. + // A lifetime end or assume intrinsic should not stop tail call + // optimization. if (const IntrinsicInst *II = dyn_cast(BBI)) - if (II->getIntrinsicID() == Intrinsic::lifetime_end) + if (II->getIntrinsicID() == Intrinsic::lifetime_end || + II->getIntrinsicID() == Intrinsic::assume) continue; if (BBI->mayHaveSideEffects() || BBI->mayReadFromMemory() || !isSafeToSpeculativelyExecute(&*BBI)) diff --git a/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/lib/CodeGen/AsmPrinter/AsmPrinter.cpp index 54f6cc2d5571..73c53d6c4af5 100644 --- a/lib/CodeGen/AsmPrinter/AsmPrinter.cpp +++ b/lib/CodeGen/AsmPrinter/AsmPrinter.cpp @@ -91,10 +91,12 @@ #include "llvm/MC/MCSectionCOFF.h" #include "llvm/MC/MCSectionELF.h" #include "llvm/MC/MCSectionMachO.h" +#include "llvm/MC/MCSectionXCOFF.h" #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/MC/MCSymbol.h" #include "llvm/MC/MCSymbolELF.h" +#include "llvm/MC/MCSymbolXCOFF.h" #include "llvm/MC/MCTargetOptions.h" #include "llvm/MC/MCValue.h" #include "llvm/MC/SectionKind.h" @@ -159,30 +161,30 @@ static gcp_map_type &getGCMap(void *&P) { return *(gcp_map_type*)P; } -/// getGVAlignmentLog2 - Return the alignment to use for the specified global -/// value in log2 form. 
This rounds up to the preferred alignment if possible -/// and legal. -static unsigned getGVAlignmentLog2(const GlobalValue *GV, const DataLayout &DL, - unsigned InBits = 0) { - unsigned NumBits = 0; +/// getGVAlignment - Return the alignment to use for the specified global +/// value. This rounds up to the preferred alignment if possible and legal. +Align AsmPrinter::getGVAlignment(const GlobalValue *GV, const DataLayout &DL, + Align InAlign) { + Align Alignment; if (const GlobalVariable *GVar = dyn_cast(GV)) - NumBits = DL.getPreferredAlignmentLog(GVar); + Alignment = Align(DL.getPreferredAlignment(GVar)); - // If InBits is specified, round it to it. - if (InBits > NumBits) - NumBits = InBits; + // If InAlign is specified, round it to it. + if (InAlign > Alignment) + Alignment = InAlign; // If the GV has a specified alignment, take it into account. - if (GV->getAlignment() == 0) - return NumBits; + const MaybeAlign GVAlign(GV->getAlignment()); + if (!GVAlign) + return Alignment; - unsigned GVAlign = Log2_32(GV->getAlignment()); + assert(GVAlign && "GVAlign must be set"); // If the GVAlign is larger than NumBits, or if we are required to obey // NumBits because the GV has an assigned section, obey it. - if (GVAlign > NumBits || GV->hasSection()) - NumBits = GVAlign; - return NumBits; + if (*GVAlign > Alignment || GV->hasSection()) + Alignment = *GVAlign; + return Alignment; } AsmPrinter::AsmPrinter(TargetMachine &tm, std::unique_ptr Streamer) @@ -248,13 +250,14 @@ const MCSection *AsmPrinter::getCurrentSection() const { void AsmPrinter::getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesAll(); MachineFunctionPass::getAnalysisUsage(AU); - AU.addRequired(); + AU.addRequired(); AU.addRequired(); AU.addRequired(); } bool AsmPrinter::doInitialization(Module &M) { - MMI = getAnalysisIfAvailable(); + auto *MMIWP = getAnalysisIfAvailable(); + MMI = MMIWP ? &MMIWP->getMMI() : nullptr; // Initialize TargetLoweringObjectFile. const_cast(getObjFileLowering()) @@ -311,7 +314,7 @@ bool AsmPrinter::doInitialization(Module &M) { if (MAI->doesSupportDebugInformation()) { bool EmitCodeView = MMI->getModule()->getCodeViewFlag(); if (EmitCodeView && TM.getTargetTriple().isOSWindows()) { - Handlers.emplace_back(llvm::make_unique(this), + Handlers.emplace_back(std::make_unique(this), DbgTimerName, DbgTimerDescription, CodeViewLineTablesGroupName, CodeViewLineTablesGroupDescription); @@ -380,7 +383,7 @@ bool AsmPrinter::doInitialization(Module &M) { if (mdconst::extract_or_null( MMI->getModule()->getModuleFlag("cfguardtable"))) - Handlers.emplace_back(llvm::make_unique(this), CFGuardName, + Handlers.emplace_back(std::make_unique(this), CFGuardName, CFGuardDescription, DWARFGroupName, DWARFGroupDescription); @@ -425,7 +428,10 @@ void AsmPrinter::EmitLinkage(const GlobalValue *GV, MCSymbol *GVSym) const { OutStreamer->EmitSymbolAttribute(GVSym, MCSA_Global); return; case GlobalValue::PrivateLinkage: + return; case GlobalValue::InternalLinkage: + if (MAI->hasDotLGloblDirective()) + OutStreamer->EmitSymbolAttribute(GVSym, MCSA_LGlobal); return; case GlobalValue::AppendingLinkage: case GlobalValue::AvailableExternallyLinkage: @@ -501,7 +507,7 @@ void AsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) { // If the alignment is specified, we *must* obey it. Overaligning a global // with a specified alignment is a prompt way to break globals emitted to // sections and expected to be contiguous (e.g. ObjC metadata). 
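The rewrite above replaces log2-of-alignment bookkeeping (getGVAlignmentLog2 returning a bit count) with a dedicated alignment value type, so callers can no longer confuse an exponent with a byte count. A minimal sketch of that idea follows, assuming nothing beyond the standard library; ByteAlign and pickAlignment are illustrative stand-ins, not LLVM's llvm::Align or the real helper.

// Sketch: a tiny alignment wrapper that stores the byte value and
// enforces the power-of-two invariant, instead of passing raw log2
// exponents around. Illustrative only; not the real llvm::Align.
#include <cassert>
#include <cstdint>

class ByteAlign {
  uint64_t Value; // always a power of two, in bytes
public:
  explicit ByteAlign(uint64_t V) : Value(V) {
    assert(V != 0 && (V & (V - 1)) == 0 && "alignment must be a power of two");
  }
  uint64_t value() const { return Value; }
  bool operator>(ByteAlign RHS) const { return Value > RHS.Value; }
  bool operator==(ByteAlign RHS) const { return Value == RHS.Value; }
};

// Mirrors the shape of the rewritten helper: start from a preferred
// alignment, round up to a requested minimum, honour an explicit override.
ByteAlign pickAlignment(ByteAlign Preferred, ByteAlign Minimum,
                        uint64_t ExplicitOrZero) {
  ByteAlign A = Preferred;
  if (Minimum > A)
    A = Minimum;
  if (ExplicitOrZero != 0 && ByteAlign(ExplicitOrZero) > A)
    A = ByteAlign(ExplicitOrZero);
  return A;
}

int main() {
  return pickAlignment(ByteAlign(4), ByteAlign(8), 16) == ByteAlign(16) ? 0 : 1;
}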
- unsigned AlignLog = getGVAlignmentLog2(GV, DL); + const Align Alignment = getGVAlignment(GV, DL); for (const HandlerInfo &HI : Handlers) { NamedRegionTimer T(HI.TimerName, HI.TimerDescription, @@ -513,12 +519,11 @@ void AsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) { // Handle common symbols if (GVKind.isCommon()) { if (Size == 0) Size = 1; // .comm Foo, 0 is undefined, avoid it. - unsigned Align = 1 << AlignLog; - if (!getObjFileLowering().getCommDirectiveSupportsAlignment()) - Align = 0; - // .comm _foo, 42, 4 - OutStreamer->EmitCommonSymbol(GVSym, Size, Align); + const bool SupportsAlignment = + getObjFileLowering().getCommDirectiveSupportsAlignment(); + OutStreamer->EmitCommonSymbol(GVSym, Size, + SupportsAlignment ? Alignment.value() : 0); return; } @@ -531,10 +536,9 @@ void AsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) { TheSection->isVirtualSection()) { if (Size == 0) Size = 1; // zerofill of 0 bytes is undefined. - unsigned Align = 1 << AlignLog; EmitLinkage(GV, GVSym); // .zerofill __DATA, __bss, _foo, 400, 5 - OutStreamer->EmitZerofill(TheSection, GVSym, Size, Align); + OutStreamer->EmitZerofill(TheSection, GVSym, Size, Alignment.value()); return; } @@ -544,7 +548,6 @@ void AsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) { getObjFileLowering().getBSSSection() == TheSection) { if (Size == 0) Size = 1; // .comm Foo, 0 is undefined, avoid it. - unsigned Align = 1 << AlignLog; // Use .lcomm only if it supports user-specified alignment. // Otherwise, while it would still be correct to use .lcomm in some @@ -554,17 +557,17 @@ void AsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) { // Prefer to simply fall back to .local / .comm in this case. if (MAI->getLCOMMDirectiveAlignmentType() != LCOMM::NoAlignment) { // .lcomm _foo, 42 - OutStreamer->EmitLocalCommonSymbol(GVSym, Size, Align); + OutStreamer->EmitLocalCommonSymbol(GVSym, Size, Alignment.value()); return; } - if (!getObjFileLowering().getCommDirectiveSupportsAlignment()) - Align = 0; - // .local _foo OutStreamer->EmitSymbolAttribute(GVSym, MCSA_Local); // .comm _foo, 42, 4 - OutStreamer->EmitCommonSymbol(GVSym, Size, Align); + const bool SupportsAlignment = + getObjFileLowering().getCommDirectiveSupportsAlignment(); + OutStreamer->EmitCommonSymbol(GVSym, Size, + SupportsAlignment ? 
Alignment.value() : 0); return; } @@ -585,11 +588,11 @@ void AsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) { if (GVKind.isThreadBSS()) { TheSection = getObjFileLowering().getTLSBSSSection(); - OutStreamer->EmitTBSSSymbol(TheSection, MangSym, Size, 1 << AlignLog); + OutStreamer->EmitTBSSSymbol(TheSection, MangSym, Size, Alignment.value()); } else if (GVKind.isThreadData()) { OutStreamer->SwitchSection(TheSection); - EmitAlignment(AlignLog, GV); + EmitAlignment(Alignment, GV); OutStreamer->EmitLabel(MangSym); EmitGlobalConstant(GV->getParent()->getDataLayout(), @@ -625,7 +628,7 @@ void AsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) { OutStreamer->SwitchSection(TheSection); EmitLinkage(GV, EmittedInitSym); - EmitAlignment(AlignLog, GV); + EmitAlignment(Alignment, GV); OutStreamer->EmitLabel(EmittedInitSym); @@ -664,6 +667,10 @@ void AsmPrinter::EmitFunctionHeader() { OutStreamer->SwitchSection(getObjFileLowering().SectionForGlobal(&F, TM)); EmitVisibility(CurrentFnSym, F.getVisibility()); + if (MAI->needsFunctionDescriptors() && + F.getLinkage() != GlobalValue::InternalLinkage) + EmitLinkage(&F, CurrentFnDescSym); + EmitLinkage(&F, CurrentFnSym); if (MAI->hasFunctionAlignment()) EmitAlignment(MF->getAlignment(), &F); @@ -699,8 +706,13 @@ void AsmPrinter::EmitFunctionHeader() { } } - // Emit the CurrentFnSym. This is a virtual function to allow targets to - // do their wild and crazy things as required. + // Emit the function descriptor. This is a virtual function to allow targets + // to emit their specific function descriptor. + if (MAI->needsFunctionDescriptors()) + EmitFunctionDescriptor(); + + // Emit the CurrentFnSym. This is a virtual function to allow targets to do + // their wild and crazy things as required. EmitFunctionEntryLabel(); // If the function had address-taken blocks that got deleted, then we have @@ -783,7 +795,7 @@ static void emitComments(const MachineInstr &MI, raw_ostream &CommentOS) { /// emitImplicitDef - This method emits the specified machine instruction /// that is an implicit def. void AsmPrinter::emitImplicitDef(const MachineInstr *MI) const { - unsigned RegNo = MI->getOperand(0).getReg(); + Register RegNo = MI->getOperand(0).getReg(); SmallString<128> Str; raw_svector_ostream OS(Str); @@ -910,7 +922,8 @@ static bool emitDebugLabelComment(const MachineInstr *MI, AsmPrinter &AP) { OS << "DEBUG_LABEL: "; const DILabel *V = MI->getDebugLabel(); - if (auto *SP = dyn_cast(V->getScope())) { + if (auto *SP = dyn_cast( + V->getScope()->getNonLexicalBlockFileScope())) { StringRef Name = SP->getName(); if (!Name.empty()) OS << Name << ":"; @@ -1024,7 +1037,7 @@ void AsmPrinter::EmitFunctionBody() { // Get MachineDominatorTree or compute it on the fly if it's unavailable MDT = getAnalysisIfAvailable(); if (!MDT) { - OwnedMDT = make_unique(); + OwnedMDT = std::make_unique(); OwnedMDT->getBase().recalculate(*MF); MDT = OwnedMDT.get(); } @@ -1032,7 +1045,7 @@ void AsmPrinter::EmitFunctionBody() { // Get MachineLoopInfo or compute it on the fly if it's unavailable MLI = getAnalysisIfAvailable(); if (!MLI) { - OwnedMLI = make_unique(); + OwnedMLI = std::make_unique(); OwnedMLI->getBase().analyze(MDT->getBase()); MLI = OwnedMLI.get(); } @@ -1052,9 +1065,13 @@ void AsmPrinter::EmitFunctionBody() { ++NumInstsInFunction; } - // If there is a pre-instruction symbol, emit a label for it here. + // If there is a pre-instruction symbol, emit a label for it here. 
If the + // instruction was duplicated and the label has already been emitted, + // don't re-emit the same label. + // FIXME: Consider strengthening that to an assertion. if (MCSymbol *S = MI.getPreInstrSymbol()) - OutStreamer->EmitLabel(S); + if (S->isUndefined()) + OutStreamer->EmitLabel(S); if (ShouldPrintDebugScopes) { for (const HandlerInfo &HI : Handlers) { @@ -1107,9 +1124,13 @@ void AsmPrinter::EmitFunctionBody() { break; } - // If there is a post-instruction symbol, emit a label for it here. + // If there is a post-instruction symbol, emit a label for it here. If + // the instruction was duplicated and the label has already been emitted, + // don't re-emit the same label. + // FIXME: Consider strengthening that to an assertion. if (MCSymbol *S = MI.getPostInstrSymbol()) - OutStreamer->EmitLabel(S); + if (S->isUndefined()) + OutStreamer->EmitLabel(S); if (ShouldPrintDebugScopes) { for (const HandlerInfo &HI : Handlers) { @@ -1313,11 +1334,10 @@ void AsmPrinter::emitGlobalIndirectSymbol(Module &M, // Set the symbol type to function if the alias has a function type. // This affects codegen when the aliasee is not a function. - if (IsFunction) { - OutStreamer->EmitSymbolAttribute(Name, MCSA_ELF_TypeFunction); - if (isa(GIS)) - OutStreamer->EmitSymbolAttribute(Name, MCSA_ELF_TypeIndFunction); - } + if (IsFunction) + OutStreamer->EmitSymbolAttribute(Name, isa(GIS) + ? MCSA_ELF_TypeIndFunction + : MCSA_ELF_TypeFunction); EmitVisibility(Name, GIS.getVisibility()); @@ -1349,60 +1369,28 @@ void AsmPrinter::emitRemarksSection(Module &M) { RemarkStreamer *RS = M.getContext().getRemarkStreamer(); if (!RS) return; - const remarks::Serializer &Serializer = RS->getSerializer(); + remarks::RemarkSerializer &RemarkSerializer = RS->getSerializer(); + + Optional> Filename; + if (Optional FilenameRef = RS->getFilename()) { + Filename = *FilenameRef; + sys::fs::make_absolute(*Filename); + assert(!Filename->empty() && "The filename can't be empty."); + } + + std::string Buf; + raw_string_ostream OS(Buf); + std::unique_ptr MetaSerializer = + Filename ? RemarkSerializer.metaSerializer(OS, StringRef(*Filename)) + : RemarkSerializer.metaSerializer(OS); + MetaSerializer->emit(); // Switch to the right section: .remarks/__remarks. MCSection *RemarksSection = OutContext.getObjectFileInfo()->getRemarksSection(); OutStreamer->SwitchSection(RemarksSection); - // Emit the magic number. - OutStreamer->EmitBytes(remarks::Magic); - // Explicitly emit a '\0'. - OutStreamer->EmitIntValue(/*Value=*/0, /*Size=*/1); - - // Emit the version number: little-endian uint64_t. - // The version number is located at the offset 0x0 in the section. - std::array Version; - support::endian::write64le(Version.data(), remarks::Version); - OutStreamer->EmitBinaryData(StringRef(Version.data(), Version.size())); - - // Emit the string table in the section. - // Note: we need to use the streamer here to emit it in the section. We can't - // just use the serialize function with a raw_ostream because of the way - // MCStreamers work. - uint64_t StrTabSize = - Serializer.StrTab ? Serializer.StrTab->SerializedSize : 0; - // Emit the total size of the string table (the size itself excluded): - // little-endian uint64_t. - // The total size is located after the version number. - // Note: even if no string table is used, emit 0. 
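The remarks change above drops the hand-rolled field-by-field emission in favour of serializing the metadata into an in-memory buffer and handing the finished bytes to the streamer in a single EmitBinaryData call. The sketch below illustrates that buffer-then-emit pattern along with the little-endian size prefix and null-terminated strings the removed code produced by hand; writeLE64, Streamer, and serializeMeta are mock-ups under those assumptions, not the real remarks serializer.

// Sketch: build a section payload in memory, then emit it as one blob.
// writeLE64 mimics the little-endian size prefix the old code emitted
// field by field. The Streamer type is a stand-in, not MCStreamer.
#include <cstdint>
#include <iostream>
#include <string>
#include <vector>

static void writeLE64(std::string &Out, uint64_t V) {
  for (int I = 0; I < 8; ++I)
    Out.push_back(static_cast<char>((V >> (8 * I)) & 0xff));
}

struct Streamer {
  std::string Section;
  void emitBinaryData(const std::string &Bytes) { Section += Bytes; }
};

std::string serializeMeta(const std::vector<std::string> &Strings,
                          const std::string &AbsPath) {
  std::string Buf;
  std::string Table;
  for (const std::string &S : Strings) {
    Table += S;
    Table.push_back('\0');          // null-terminated entries
  }
  writeLE64(Buf, Table.size());      // size prefix, 0 if the table is empty
  Buf += Table;
  Buf += AbsPath;                    // null-terminated file name
  Buf.push_back('\0');
  return Buf;
}

int main() {
  Streamer S;
  S.emitBinaryData(serializeMeta({"inline", "loop-vectorize"}, "/tmp/foo.yaml"));
  std::cout << S.Section.size() << " bytes emitted\n";
  return 0;
}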
- std::array StrTabSizeBuf; - support::endian::write64le(StrTabSizeBuf.data(), StrTabSize); - OutStreamer->EmitBinaryData( - StringRef(StrTabSizeBuf.data(), StrTabSizeBuf.size())); - - if (const Optional &StrTab = Serializer.StrTab) { - std::vector StrTabStrings = StrTab->serialize(); - // Emit a list of null-terminated strings. - // Note: the order is important here: the ID used in the remarks corresponds - // to the position of the string in the section. - for (StringRef Str : StrTabStrings) { - OutStreamer->EmitBytes(Str); - // Explicitly emit a '\0'. - OutStreamer->EmitIntValue(/*Value=*/0, /*Size=*/1); - } - } - - // Emit the null-terminated absolute path to the remark file. - // The path is located at the offset 0x4 in the section. - StringRef FilenameRef = RS->getFilename(); - SmallString<128> Filename = FilenameRef; - sys::fs::make_absolute(Filename); - assert(!Filename.empty() && "The filename can't be empty."); - OutStreamer->EmitBytes(Filename); - // Explicitly emit a '\0'. - OutStreamer->EmitIntValue(/*Value=*/0, /*Size=*/1); + OutStreamer->EmitBinaryData(OS.str()); } bool AsmPrinter::doFinalization(Module &M) { @@ -1455,7 +1443,7 @@ bool AsmPrinter::doFinalization(Module &M) { OutStreamer->SwitchSection(TLOF.getDataSection()); const DataLayout &DL = M.getDataLayout(); - EmitAlignment(Log2_32(DL.getPointerSize())); + EmitAlignment(Align(DL.getPointerSize())); for (const auto &Stub : Stubs) { OutStreamer->EmitLabel(Stub.first); OutStreamer->EmitSymbolValue(Stub.second.getPointer(), @@ -1482,7 +1470,7 @@ bool AsmPrinter::doFinalization(Module &M) { COFF::IMAGE_SCN_LNK_COMDAT, SectionKind::getReadOnly(), Stub.first->getName(), COFF::IMAGE_COMDAT_SELECT_ANY)); - EmitAlignment(Log2_32(DL.getPointerSize())); + EmitAlignment(Align(DL.getPointerSize())); OutStreamer->EmitSymbolAttribute(Stub.first, MCSA_Global); OutStreamer->EmitLabel(Stub.first); OutStreamer->EmitSymbolValue(Stub.second.getPointer(), @@ -1607,8 +1595,7 @@ bool AsmPrinter::doFinalization(Module &M) { "expected llvm.used to be an array type"); if (const auto *A = cast(LU->getInitializer())) { for (const Value *Op : A->operands()) { - const auto *GV = - cast(Op->stripPointerCastsNoFollowAliases()); + const auto *GV = cast(Op->stripPointerCasts()); // Global symbols with internal or private linkage are not visible to // the linker, and thus would cause an error when the linker tried to // preserve the symbol due to the `/include:` directive. @@ -1679,8 +1666,27 @@ MCSymbol *AsmPrinter::getCurExceptionSym() { void AsmPrinter::SetupMachineFunction(MachineFunction &MF) { this->MF = &MF; + // Get the function symbol. - CurrentFnSym = getSymbol(&MF.getFunction()); + if (MAI->needsFunctionDescriptors()) { + assert(TM.getTargetTriple().isOSAIX() && "Function descriptor is only" + " supported on AIX."); + assert(CurrentFnDescSym && "The function descriptor symbol needs to be" + " initalized first."); + + // Get the function entry point symbol. + CurrentFnSym = + OutContext.getOrCreateSymbol("." + CurrentFnDescSym->getName()); + + const Function &F = MF.getFunction(); + MCSectionXCOFF *FnEntryPointSec = + cast(getObjFileLowering().SectionForGlobal(&F, TM)); + // Set the containing csect. 
+ cast(CurrentFnSym)->setContainingCsect(FnEntryPointSec); + } else { + CurrentFnSym = getSymbol(&MF.getFunction()); + } + CurrentFnSymForSize = CurrentFnSym; CurrentFnBegin = nullptr; CurExceptionSym = nullptr; @@ -1765,7 +1771,7 @@ void AsmPrinter::EmitConstantPool() { if (CurSection != CPSections[i].S) { OutStreamer->SwitchSection(CPSections[i].S); - EmitAlignment(Log2_32(CPSections[i].Alignment)); + EmitAlignment(Align(CPSections[i].Alignment)); CurSection = CPSections[i].S; Offset = 0; } @@ -1812,7 +1818,7 @@ void AsmPrinter::EmitJumpTableInfo() { OutStreamer->SwitchSection(ReadOnlySection); } - EmitAlignment(Log2_32(MJTI->getEntryAlignment(DL))); + EmitAlignment(Align(MJTI->getEntryAlignment(DL))); // Jump tables in code sections are marked with a data_region directive // where that's supported. @@ -2025,10 +2031,10 @@ void AsmPrinter::EmitXXStructorList(const DataLayout &DL, const Constant *List, } // Emit the function pointers in the target-specific order - unsigned Align = Log2_32(DL.getPointerPrefAlignment()); llvm::stable_sort(Structors, [](const Structor &L, const Structor &R) { return L.Priority < R.Priority; }); + const Align Align = DL.getPointerPrefAlignment(); for (Structor &S : Structors) { const TargetLoweringObjectFile &Obj = getObjFileLowering(); const MCSymbol *KeySym = nullptr; @@ -2149,23 +2155,20 @@ void AsmPrinter::EmitLabelPlusOffset(const MCSymbol *Label, uint64_t Offset, //===----------------------------------------------------------------------===// // EmitAlignment - Emit an alignment directive to the specified power of -// two boundary. For example, if you pass in 3 here, you will get an 8 -// byte alignment. If a global value is specified, and if that global has +// two boundary. If a global value is specified, and if that global has // an explicit alignment requested, it will override the alignment request // if required for correctness. -void AsmPrinter::EmitAlignment(unsigned NumBits, const GlobalObject *GV) const { +void AsmPrinter::EmitAlignment(Align Alignment, const GlobalObject *GV) const { if (GV) - NumBits = getGVAlignmentLog2(GV, GV->getParent()->getDataLayout(), NumBits); + Alignment = getGVAlignment(GV, GV->getParent()->getDataLayout(), Alignment); - if (NumBits == 0) return; // 1-byte aligned: no need to emit alignment. + if (Alignment == Align::None()) + return; // 1-byte aligned: no need to emit alignment. 
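EmitAlignment now takes an alignment value directly and returns early for 1-byte alignment, since padding to a 1-byte boundary never does anything. For reference, the arithmetic an alignment directive ultimately boils down to is shown below as a small self-contained example; alignTo here is a generic helper, not the streamer API.

// Sketch: rounding an offset up to an alignment boundary, the arithmetic
// behind an alignment directive. A 1-byte alignment never adds padding,
// so emitting a directive for it would be a no-op.
#include <cassert>
#include <cstdint>
#include <iostream>

uint64_t alignTo(uint64_t Offset, uint64_t Alignment) {
  assert(Alignment && (Alignment & (Alignment - 1)) == 0 &&
         "alignment must be a power of two");
  return (Offset + Alignment - 1) & ~(Alignment - 1);
}

int main() {
  std::cout << alignTo(13, 1) << "\n";  // 13: no padding needed
  std::cout << alignTo(13, 8) << "\n";  // 16: three bytes of padding
  std::cout << alignTo(16, 8) << "\n";  // 16: already aligned
  return 0;
}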
- assert(NumBits < - static_cast(std::numeric_limits::digits) && - "undefined behavior"); if (getCurrentSection()->getKind().isText()) - OutStreamer->EmitCodeAlignment(1u << NumBits); + OutStreamer->EmitCodeAlignment(Alignment.value()); else - OutStreamer->EmitValueToAlignment(1u << NumBits); + OutStreamer->EmitValueToAlignment(Alignment.value()); } //===----------------------------------------------------------------------===// @@ -2481,6 +2484,7 @@ static void emitGlobalConstantStruct(const DataLayout &DL, } static void emitGlobalConstantFP(APFloat APF, Type *ET, AsmPrinter &AP) { + assert(ET && "Unknown float type"); APInt API = APF.bitcastToAPInt(); // First print a comment with what we think the original floating-point value @@ -2488,11 +2492,7 @@ static void emitGlobalConstantFP(APFloat APF, Type *ET, AsmPrinter &AP) { if (AP.isVerbose()) { SmallString<8> StrVal; APF.toString(StrVal); - - if (ET) - ET->print(AP.OutStreamer->GetCommentOS()); - else - AP.OutStreamer->GetCommentOS() << "Printing Type"; + ET->print(AP.OutStreamer->GetCommentOS()); AP.OutStreamer->GetCommentOS() << ' ' << StrVal << '\n'; } @@ -2670,7 +2670,7 @@ static void handleIndirectSymViaGOTPCRel(AsmPrinter &AP, const MCExpr **ME, const GlobalValue *FinalGV = dyn_cast(GV->getOperand(0)); const MCSymbol *FinalSym = AP.getSymbol(FinalGV); *ME = AP.getObjFileLowering().getIndirectSymViaGOTPCRel( - FinalSym, MV, Offset, AP.MMI, *AP.OutStreamer); + FinalGV, FinalSym, MV, Offset, AP.MMI, *AP.OutStreamer); // Update GOT equivalent usage information --NumUses; @@ -2930,7 +2930,7 @@ void AsmPrinter::setupCodePaddingContext(const MachineBasicBlock &MBB, /// EmitBasicBlockStart - This method prints the label for the specified /// MachineBasicBlock, an alignment (if present) and a comment describing /// it if appropriate. -void AsmPrinter::EmitBasicBlockStart(const MachineBasicBlock &MBB) const { +void AsmPrinter::EmitBasicBlockStart(const MachineBasicBlock &MBB) { // End the previous funclet and start a new one. if (MBB.isEHFuncletEntry()) { for (const HandlerInfo &HI : Handlers) { @@ -2940,8 +2940,9 @@ void AsmPrinter::EmitBasicBlockStart(const MachineBasicBlock &MBB) const { } // Emit an alignment directive for this block, if needed. 
- if (unsigned Align = MBB.getAlignment()) - EmitAlignment(Align); + const Align Alignment = MBB.getAlignment(); + if (Alignment != Align::None()) + EmitAlignment(Alignment); MCCodePaddingContext Context; setupCodePaddingContext(MBB, Context); OutStreamer->EmitCodePaddingBasicBlockStart(Context); diff --git a/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp b/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp index 7721e996aca5..420df26a2b8b 100644 --- a/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp +++ b/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp @@ -72,7 +72,7 @@ static void srcMgrDiagHandler(const SMDiagnostic &Diag, void *diagInfo) { unsigned AsmPrinter::addInlineAsmDiagBuffer(StringRef AsmStr, const MDNode *LocMDNode) const { if (!DiagInfo) { - DiagInfo = make_unique(); + DiagInfo = std::make_unique(); MCContext &Context = MMI->getContext(); Context.setInlineSourceManager(&DiagInfo->SrcMgr); @@ -432,6 +432,7 @@ static void EmitGCCInlineAsmStr(const char *AsmStr, const MachineInstr *MI, const BlockAddress *BA = MI->getOperand(OpNo).getBlockAddress(); MCSymbol *Sym = AP->GetBlockAddressSymbol(BA); Sym->print(OS, AP->MAI); + MMI->getContext().registerInlineAsmLabel(Sym); } else if (MI->getOperand(OpNo).isMBB()) { const MCSymbol *Sym = MI->getOperand(OpNo).getMBB()->getSymbol(); Sym->print(OS, AP->MAI); diff --git a/lib/CodeGen/AsmPrinter/ByteStreamer.h b/lib/CodeGen/AsmPrinter/ByteStreamer.h index db2ff458eb2e..09f7496cd4ef 100644 --- a/lib/CodeGen/AsmPrinter/ByteStreamer.h +++ b/lib/CodeGen/AsmPrinter/ByteStreamer.h @@ -73,18 +73,18 @@ class HashingByteStreamer final : public ByteStreamer { class BufferByteStreamer final : public ByteStreamer { private: SmallVectorImpl &Buffer; - SmallVectorImpl &Comments; + std::vector &Comments; +public: /// Only verbose textual output needs comments. This will be set to /// true for that case, and false otherwise. If false, comments passed in to /// the emit methods will be ignored. 
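The BufferByteStreamer change that follows keeps the emitted bytes in one container and, only when verbose output is requested, a parallel container of comments to print next to them later. A minimal sketch of that shape, assuming illustrative types rather than the real ByteStreamer hierarchy:

// Sketch: a byte streamer that records bytes and, only in verbose mode,
// a parallel vector of comments to print alongside them later.
#include <cstdint>
#include <iostream>
#include <string>
#include <vector>

class BufferingStreamer {
  std::vector<uint8_t> Bytes;
  std::vector<std::string> Comments;
  const bool GenerateComments; // collect comments for verbose output only

public:
  explicit BufferingStreamer(bool Verbose) : GenerateComments(Verbose) {}

  void emitInt8(uint8_t Byte, const std::string &Comment) {
    Bytes.push_back(Byte);
    if (GenerateComments)
      Comments.push_back(Comment);
  }

  void dump() const {
    for (std::size_t I = 0; I != Bytes.size(); ++I) {
      std::cout << std::hex << unsigned(Bytes[I]);
      if (I < Comments.size())
        std::cout << "\t# " << Comments[I];
      std::cout << "\n";
    }
  }
};

int main() {
  BufferingStreamer S(/*Verbose=*/true);
  S.emitInt8(0x2a, "first payload byte");
  S.emitInt8(0x07, "second payload byte");
  S.dump();
  return 0;
}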
- bool GenerateComments; + const bool GenerateComments; -public: BufferByteStreamer(SmallVectorImpl &Buffer, - SmallVectorImpl &Comments, - bool GenerateComments) - : Buffer(Buffer), Comments(Comments), GenerateComments(GenerateComments) {} + std::vector &Comments, bool GenerateComments) + : Buffer(Buffer), Comments(Comments), GenerateComments(GenerateComments) { + } void EmitInt8(uint8_t Byte, const Twine &Comment) override { Buffer.push_back(Byte); if (GenerateComments) diff --git a/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp b/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp index 932959c311fa..c6457f3626d1 100644 --- a/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp +++ b/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp @@ -98,7 +98,8 @@ using namespace llvm::codeview; namespace { class CVMCAdapter : public CodeViewRecordStreamer { public: - CVMCAdapter(MCStreamer &OS) : OS(&OS) {} + CVMCAdapter(MCStreamer &OS, TypeCollection &TypeTable) + : OS(&OS), TypeTable(TypeTable) {} void EmitBytes(StringRef Data) { OS->EmitBytes(Data); } @@ -110,8 +111,24 @@ public: void AddComment(const Twine &T) { OS->AddComment(T); } + void AddRawComment(const Twine &T) { OS->emitRawComment(T); } + + bool isVerboseAsm() { return OS->isVerboseAsm(); } + + std::string getTypeName(TypeIndex TI) { + std::string TypeName; + if (!TI.isNoneType()) { + if (TI.isSimple()) + TypeName = TypeIndex::simpleTypeName(TI); + else + TypeName = TypeTable.getTypeName(TI); + } + return TypeName; + } + private: MCStreamer *OS = nullptr; + TypeCollection &TypeTable; }; } // namespace @@ -617,13 +634,6 @@ emitNullTerminatedSymbolName(MCStreamer &OS, StringRef S, OS.EmitBytes(NullTerminatedString); } -static StringRef getTypeLeafName(TypeLeafKind TypeKind) { - for (const EnumEntry &EE : getTypeLeafNames()) - if (EE.Value == TypeKind) - return EE.Name; - return ""; -} - void CodeViewDebug::emitTypeInformation() { if (TypeTable.empty()) return; @@ -632,30 +642,11 @@ void CodeViewDebug::emitTypeInformation() { OS.SwitchSection(Asm->getObjFileLowering().getCOFFDebugTypesSection()); emitCodeViewMagicVersion(); - SmallString<8> CommentPrefix; - if (OS.isVerboseAsm()) { - CommentPrefix += '\t'; - CommentPrefix += Asm->MAI->getCommentString(); - CommentPrefix += ' '; - } - TypeTableCollection Table(TypeTable.records()); - SmallString<512> CommentBlock; - raw_svector_ostream CommentOS(CommentBlock); - std::unique_ptr SP; - std::unique_ptr TDV; TypeVisitorCallbackPipeline Pipeline; - if (OS.isVerboseAsm()) { - // To construct block comment describing the type record for readability. - SP = llvm::make_unique(CommentOS); - SP->setPrefix(CommentPrefix); - TDV = llvm::make_unique(Table, SP.get(), false); - Pipeline.addCallbackToPipeline(*TDV); - } - // To emit type record using Codeview MCStreamer adapter - CVMCAdapter CVMCOS(OS); + CVMCAdapter CVMCOS(OS, Table); TypeRecordMapping typeMapping(CVMCOS); Pipeline.addCallbackToPipeline(typeMapping); @@ -664,17 +655,6 @@ void CodeViewDebug::emitTypeInformation() { // This will fail if the record data is invalid. 
CVType Record = Table.getType(*B); - CommentBlock.clear(); - - auto RecordLen = Record.length(); - auto RecordKind = Record.kind(); - if (OS.isVerboseAsm()) - CVMCOS.AddComment("Record length"); - CVMCOS.EmitIntValue(RecordLen - 2, 2); - if (OS.isVerboseAsm()) - CVMCOS.AddComment("Record kind: " + getTypeLeafName(RecordKind)); - CVMCOS.EmitIntValue(RecordKind, sizeof(RecordKind)); - Error E = codeview::visitTypeRecord(Record, *B, Pipeline); if (E) { @@ -682,13 +662,6 @@ void CodeViewDebug::emitTypeInformation() { llvm_unreachable("produced malformed type record"); } - if (OS.isVerboseAsm()) { - // emitRawComment will insert its own tab and comment string before - // the first line, so strip off our first one. It also prints its own - // newline. - OS.emitRawComment( - CommentOS.str().drop_front(CommentPrefix.size() - 1).rtrim()); - } B = Table.getNext(*B); } } @@ -1135,7 +1108,7 @@ void CodeViewDebug::emitDebugInfoForFunction(const Function *GV, if (!BeginLabel->isDefined() || !EndLabel->isDefined()) continue; - DIType *DITy = std::get<2>(HeapAllocSite); + const DIType *DITy = std::get<2>(HeapAllocSite); MCSymbol *HeapAllocEnd = beginSymbolRecord(SymbolKind::S_HEAPALLOCSITE); OS.AddComment("Call site offset"); OS.EmitCOFFSecRel32(BeginLabel, /*Offset=*/0); @@ -1363,7 +1336,7 @@ void CodeViewDebug::beginFunctionImpl(const MachineFunction *MF) { const TargetRegisterInfo *TRI = TSI.getRegisterInfo(); const MachineFrameInfo &MFI = MF->getFrameInfo(); const Function &GV = MF->getFunction(); - auto Insertion = FnDebugInfo.insert({&GV, llvm::make_unique()}); + auto Insertion = FnDebugInfo.insert({&GV, std::make_unique()}); assert(Insertion.second && "function already has info"); CurFn = Insertion.first->second.get(); CurFn->FuncId = NextFuncId++; @@ -2633,17 +2606,6 @@ void CodeViewDebug::emitLocalVariableList(const FunctionInfo &FI, emitLocalVariable(FI, L); } -/// Only call this on endian-specific types like ulittle16_t and little32_t, or -/// structs composed of them. -template -static void copyBytesForDefRange(SmallString<20> &BytePrefix, - SymbolKind SymKind, const T &DefRangeHeader) { - BytePrefix.resize(2 + sizeof(T)); - ulittle16_t SymKindLE = ulittle16_t(SymKind); - memcpy(&BytePrefix[0], &SymKindLE, 2); - memcpy(&BytePrefix[2], &DefRangeHeader, sizeof(T)); -} - void CodeViewDebug::emitLocalVariable(const FunctionInfo &FI, const LocalVariable &Var) { // LocalSym record, see SymbolRecord.h for more info. @@ -2692,8 +2654,9 @@ void CodeViewDebug::emitLocalVariable(const FunctionInfo &FI, (bool(Flags & LocalSymFlags::IsParameter) ? 
(EncFP == FI.EncodedParamFramePtrReg) : (EncFP == FI.EncodedLocalFramePtrReg))) { - little32_t FPOffset = little32_t(Offset); - copyBytesForDefRange(BytePrefix, S_DEFRANGE_FRAMEPOINTER_REL, FPOffset); + DefRangeFramePointerRelHeader DRHdr; + DRHdr.Offset = Offset; + OS.EmitCVDefRangeDirective(DefRange.Ranges, DRHdr); } else { uint16_t RegRelFlags = 0; if (DefRange.IsSubfield) { @@ -2701,28 +2664,27 @@ void CodeViewDebug::emitLocalVariable(const FunctionInfo &FI, (DefRange.StructOffset << DefRangeRegisterRelSym::OffsetInParentShift); } - DefRangeRegisterRelSym::Header DRHdr; + DefRangeRegisterRelHeader DRHdr; DRHdr.Register = Reg; DRHdr.Flags = RegRelFlags; DRHdr.BasePointerOffset = Offset; - copyBytesForDefRange(BytePrefix, S_DEFRANGE_REGISTER_REL, DRHdr); + OS.EmitCVDefRangeDirective(DefRange.Ranges, DRHdr); } } else { assert(DefRange.DataOffset == 0 && "unexpected offset into register"); if (DefRange.IsSubfield) { - DefRangeSubfieldRegisterSym::Header DRHdr; + DefRangeSubfieldRegisterHeader DRHdr; DRHdr.Register = DefRange.CVRegister; DRHdr.MayHaveNoName = 0; DRHdr.OffsetInParent = DefRange.StructOffset; - copyBytesForDefRange(BytePrefix, S_DEFRANGE_SUBFIELD_REGISTER, DRHdr); + OS.EmitCVDefRangeDirective(DefRange.Ranges, DRHdr); } else { - DefRangeRegisterSym::Header DRHdr; + DefRangeRegisterHeader DRHdr; DRHdr.Register = DefRange.CVRegister; DRHdr.MayHaveNoName = 0; - copyBytesForDefRange(BytePrefix, S_DEFRANGE_REGISTER, DRHdr); + OS.EmitCVDefRangeDirective(DefRange.Ranges, DRHdr); } } - OS.EmitCVDefRangeDirective(DefRange.Ranges, BytePrefix); } } @@ -2896,6 +2858,14 @@ void CodeViewDebug::endFunctionImpl(const MachineFunction *MF) { CurFn = nullptr; } +// Usable locations are valid with non-zero line numbers. A line number of zero +// corresponds to optimized code that doesn't have a distinct source location. +// In this case, we try to use the previous or next source location depending on +// the context. +static bool isUsableDebugLoc(DebugLoc DL) { + return DL && DL.getLine() != 0; +} + void CodeViewDebug::beginInstruction(const MachineInstr *MI) { DebugHandlerBase::beginInstruction(MI); @@ -2907,19 +2877,21 @@ void CodeViewDebug::beginInstruction(const MachineInstr *MI) { // If the first instruction of a new MBB has no location, find the first // instruction with a location and use that. DebugLoc DL = MI->getDebugLoc(); - if (!DL && MI->getParent() != PrevInstBB) { + if (!isUsableDebugLoc(DL) && MI->getParent() != PrevInstBB) { for (const auto &NextMI : *MI->getParent()) { if (NextMI.isDebugInstr()) continue; DL = NextMI.getDebugLoc(); - if (DL) + if (isUsableDebugLoc(DL)) break; } + // FIXME: Handle the case where the BB has no valid locations. This would + // probably require doing a real dataflow analysis. } PrevInstBB = MI->getParent(); // If we still don't have a debug location, don't record a location. - if (!DL) + if (!isUsableDebugLoc(DL)) return; maybeRecordLocation(DL, Asm->MF); @@ -3026,7 +2998,7 @@ void CodeViewDebug::collectGlobalVariableInfo() { auto Insertion = ScopeGlobals.insert( {Scope, std::unique_ptr()}); if (Insertion.second) - Insertion.first->second = llvm::make_unique(); + Insertion.first->second = std::make_unique(); VariableList = Insertion.first->second.get(); } else if (GV->hasComdat()) // Emit this global variable into a COMDAT section. 
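The isUsableDebugLoc change above treats line 0 as "no distinct source location", and the caller falls forward to the first usable location in the block when the current instruction has none. A small standalone sketch of that lookup policy, with a mock instruction list instead of MachineInstrs:

// Sketch: "usable" source locations have a non-zero line number; line 0
// marks compiler-generated code with no distinct location. When the
// current slot has no usable location, fall forward to the next one.
#include <iostream>
#include <vector>

struct Loc {
  unsigned Line = 0;      // 0 means "no distinct source location"
  bool usable() const { return Line != 0; }
};

Loc findLocation(const std::vector<Loc> &Insts, std::size_t Idx) {
  if (Insts[Idx].usable())
    return Insts[Idx];
  for (std::size_t I = Idx + 1; I < Insts.size(); ++I)
    if (Insts[I].usable())
      return Insts[I];
  return Loc{};           // nothing usable: record no location at all
}

int main() {
  std::vector<Loc> Block = {{0}, {0}, {42}, {43}};
  std::cout << findLocation(Block, 0).Line << "\n"; // 42
  std::cout << findLocation(Block, 3).Line << "\n"; // 43
  return 0;
}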
diff --git a/lib/CodeGen/AsmPrinter/CodeViewDebug.h b/lib/CodeGen/AsmPrinter/CodeViewDebug.h index ce57b789d7fa..7ffd77926cf7 100644 --- a/lib/CodeGen/AsmPrinter/CodeViewDebug.h +++ b/lib/CodeGen/AsmPrinter/CodeViewDebug.h @@ -148,7 +148,8 @@ class LLVM_LIBRARY_VISIBILITY CodeViewDebug : public DebugHandlerBase { SmallVector ChildBlocks; std::vector> Annotations; - std::vector> HeapAllocSites; + std::vector> + HeapAllocSites; const MCSymbol *Begin = nullptr; const MCSymbol *End = nullptr; diff --git a/lib/CodeGen/AsmPrinter/DbgEntityHistoryCalculator.cpp b/lib/CodeGen/AsmPrinter/DbgEntityHistoryCalculator.cpp index ddd60575b6c0..7f9d6c618ad3 100644 --- a/lib/CodeGen/AsmPrinter/DbgEntityHistoryCalculator.cpp +++ b/lib/CodeGen/AsmPrinter/DbgEntityHistoryCalculator.cpp @@ -41,7 +41,7 @@ using EntryIndex = DbgValueHistoryMap::EntryIndex; static Register isDescribedByReg(const MachineInstr &MI) { assert(MI.isDebugValue()); assert(MI.getNumOperands() == 4); - // If the location of variable is an entry value (DW_OP_entry_value) + // If the location of variable is an entry value (DW_OP_LLVM_entry_value) // do not consider it as a register location. if (MI.getDebugExpression()->isEntryValue()) return 0; @@ -177,13 +177,13 @@ static void handleNewDebugValue(InlinedEntity Var, const MachineInstr &DV, IndicesToErase.push_back(Index); Entry.endEntry(NewIndex); } - if (unsigned Reg = isDescribedByReg(DV)) + if (Register Reg = isDescribedByReg(DV)) TrackedRegs[Reg] |= !Overlaps; } // If the new debug value is described by a register, add tracking of // that register if it is not already tracked. - if (unsigned NewReg = isDescribedByReg(DV)) { + if (Register NewReg = isDescribedByReg(DV)) { if (!TrackedRegs.count(NewReg)) addRegDescribedVar(RegVars, NewReg, Var); LiveEntries[Var].insert(NewIndex); @@ -234,7 +234,7 @@ void llvm::calculateDbgEntityHistory(const MachineFunction *MF, DbgLabelInstrMap &DbgLabels) { const TargetLowering *TLI = MF->getSubtarget().getTargetLowering(); unsigned SP = TLI->getStackPointerRegisterToSaveRestore(); - unsigned FrameReg = TRI->getFrameRegister(*MF); + Register FrameReg = TRI->getFrameRegister(*MF); RegDescribedVarsMap RegVars; DbgValueEntriesMap LiveEntries; for (const auto &MBB : *MF) { @@ -275,7 +275,7 @@ void llvm::calculateDbgEntityHistory(const MachineFunction *MF, continue; // If this is a virtual register, only clobber it since it doesn't // have aliases. - if (TRI->isVirtualRegister(MO.getReg())) + if (Register::isVirtualRegister(MO.getReg())) clobberRegisterUses(RegVars, MO.getReg(), DbgValues, LiveEntries, MI); // If this is a register def operand, it may end a debug value @@ -296,7 +296,7 @@ void llvm::calculateDbgEntityHistory(const MachineFunction *MF, // Don't consider SP to be clobbered by register masks. 
for (auto It : RegVars) { unsigned int Reg = It.first; - if (Reg != SP && TRI->isPhysicalRegister(Reg) && + if (Reg != SP && Register::isPhysicalRegister(Reg) && MO.clobbersPhysReg(Reg)) RegsToClobber.push_back(Reg); } diff --git a/lib/CodeGen/AsmPrinter/DebugLocStream.h b/lib/CodeGen/AsmPrinter/DebugLocStream.h index 789291771b5a..0db86b09d19a 100644 --- a/lib/CodeGen/AsmPrinter/DebugLocStream.h +++ b/lib/CodeGen/AsmPrinter/DebugLocStream.h @@ -38,21 +38,18 @@ public: : CU(CU), EntryOffset(EntryOffset) {} }; struct Entry { - const MCSymbol *BeginSym; - const MCSymbol *EndSym; + const MCSymbol *Begin; + const MCSymbol *End; size_t ByteOffset; size_t CommentOffset; - Entry(const MCSymbol *BeginSym, const MCSymbol *EndSym, size_t ByteOffset, - size_t CommentOffset) - : BeginSym(BeginSym), EndSym(EndSym), ByteOffset(ByteOffset), - CommentOffset(CommentOffset) {} }; private: SmallVector Lists; SmallVector Entries; SmallString<256> DWARFBytes; - SmallVector Comments; + std::vector Comments; + MCSymbol *Sym; /// Only verbose textual output needs comments. This will be set to /// true for that case, and false otherwise. @@ -63,6 +60,12 @@ public: size_t getNumLists() const { return Lists.size(); } const List &getList(size_t LI) const { return Lists[LI]; } ArrayRef getLists() const { return Lists; } + MCSymbol *getSym() const { + return Sym; + } + void setSym(MCSymbol *Sym) { + this->Sym = Sym; + } class ListBuilder; class EntryBuilder; @@ -93,7 +96,7 @@ private: /// Until the next call, bytes added to the stream will be added to this /// entry. void startEntry(const MCSymbol *BeginSym, const MCSymbol *EndSym) { - Entries.emplace_back(BeginSym, EndSym, DWARFBytes.size(), Comments.size()); + Entries.push_back({BeginSym, EndSym, DWARFBytes.size(), Comments.size()}); } /// Finalize a .debug_loc entry, deleting if it's empty. diff --git a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp index 9548ad9918c1..a61c98ec1c18 100644 --- a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp +++ b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp @@ -208,7 +208,7 @@ void DwarfCompileUnit::addLocationAttribute( if (!Loc) { addToAccelTable = true; Loc = new (DIEValueAllocator) DIELoc; - DwarfExpr = llvm::make_unique(*Asm, *this, *Loc); + DwarfExpr = std::make_unique(*Asm, *this, *Loc); } if (Expr) { @@ -326,14 +326,13 @@ void DwarfCompileUnit::addRange(RangeSpan Range) { // emitted into and the subprogram was contained within. If these are the // same then extend our current range, otherwise add this as a new range. 
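These hunks continue the switch from TRI->isVirtualRegister/isPhysicalRegister to the static Register::isVirtualRegister/isPhysicalRegister helpers: the classification is a property of how the register number is encoded, so no target info is required. The sketch below illustrates the idea with a top-bit tag for virtual registers; it mirrors the concept only and is not necessarily LLVM's exact encoding.

// Sketch: classifying registers purely from the number's encoding, so no
// TargetRegisterInfo is needed. Here the top bit tags virtual registers;
// 0 stays reserved for "no register". Simplified illustration only.
#include <cstdint>
#include <iostream>

struct Reg {
  uint32_t Id = 0;
  static constexpr uint32_t VirtualBit = 1u << 31;

  static Reg makeVirtual(uint32_t Index) { return {Index | VirtualBit}; }
  static Reg makePhysical(uint32_t Num)  { return {Num}; }

  bool isValid() const    { return Id != 0; }
  bool isVirtual() const  { return (Id & VirtualBit) != 0; }
  bool isPhysical() const { return isValid() && !isVirtual(); }
};

int main() {
  Reg R0 = Reg::makePhysical(3);
  Reg V0 = Reg::makeVirtual(0);
  std::cout << R0.isPhysical() << V0.isVirtual() << Reg{}.isValid() << "\n"; // 110
  return 0;
}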
if (CURanges.empty() || !SameAsPrevCU || - (&CURanges.back().getEnd()->getSection() != - &Range.getEnd()->getSection())) { + (&CURanges.back().End->getSection() != + &Range.End->getSection())) { CURanges.push_back(Range); - DD->addSectionLabel(Range.getStart()); return; } - CURanges.back().setEnd(Range.getEnd()); + CURanges.back().End = Range.End; } void DwarfCompileUnit::initStmtList() { @@ -399,7 +398,7 @@ DIE &DwarfCompileUnit::updateSubprogramScopeDIE(const DISubprogram *SP) { } else { const TargetRegisterInfo *RI = Asm->MF->getSubtarget().getRegisterInfo(); MachineLocation Location(RI->getFrameRegister(*Asm->MF)); - if (RI->isPhysicalRegister(Location.getReg())) + if (Register::isPhysicalRegister(Location.getReg())) addAddress(*SPDie, dwarf::DW_AT_frame_base, Location); } } @@ -468,14 +467,6 @@ void DwarfCompileUnit::constructScopeDIE( void DwarfCompileUnit::addScopeRangeList(DIE &ScopeDIE, SmallVector Range) { - const TargetLoweringObjectFile &TLOF = Asm->getObjFileLowering(); - - // Emit the offset into .debug_ranges or .debug_rnglists as a relocatable - // label. emitDIE() will handle emitting it appropriately. - const MCSymbol *RangeSectionSym = - DD->getDwarfVersion() >= 5 - ? TLOF.getDwarfRnglistsSection()->getBeginSymbol() - : TLOF.getDwarfRangesSection()->getBeginSymbol(); HasRangeLists = true; @@ -494,12 +485,17 @@ void DwarfCompileUnit::addScopeRangeList(DIE &ScopeDIE, // (DW_RLE_startx_endx etc.). if (DD->getDwarfVersion() >= 5) addUInt(ScopeDIE, dwarf::DW_AT_ranges, dwarf::DW_FORM_rnglistx, Index); - else if (isDwoUnit()) - addSectionDelta(ScopeDIE, dwarf::DW_AT_ranges, List.getSym(), - RangeSectionSym); - else - addSectionLabel(ScopeDIE, dwarf::DW_AT_ranges, List.getSym(), - RangeSectionSym); + else { + const TargetLoweringObjectFile &TLOF = Asm->getObjFileLowering(); + const MCSymbol *RangeSectionSym = + TLOF.getDwarfRangesSection()->getBeginSymbol(); + if (isDwoUnit()) + addSectionDelta(ScopeDIE, dwarf::DW_AT_ranges, List.getSym(), + RangeSectionSym); + else + addSectionLabel(ScopeDIE, dwarf::DW_AT_ranges, List.getSym(), + RangeSectionSym); + } } void DwarfCompileUnit::attachRangesOrLowHighPC( @@ -507,7 +503,7 @@ void DwarfCompileUnit::attachRangesOrLowHighPC( if (Ranges.size() == 1 || !DD->useRangesSection()) { const RangeSpan &Front = Ranges.front(); const RangeSpan &Back = Ranges.back(); - attachLowHighPC(Die, Front.getStart(), Back.getEnd()); + attachLowHighPC(Die, Front.Begin, Back.End); } else addScopeRangeList(Die, std::move(Ranges)); } @@ -517,8 +513,8 @@ void DwarfCompileUnit::attachRangesOrLowHighPC( SmallVector List; List.reserve(Ranges.size()); for (const InsnRange &R : Ranges) - List.push_back(RangeSpan(DD->getLabelBeforeInsn(R.first), - DD->getLabelAfterInsn(R.second))); + List.push_back( + {DD->getLabelBeforeInsn(R.first), DD->getLabelAfterInsn(R.second)}); attachRangesOrLowHighPC(Die, std::move(List)); } @@ -647,8 +643,7 @@ DIE *DwarfCompileUnit::constructVariableDIEImpl(const DbgVariable &DV, int Offset = TFI->getFrameIndexReference(*Asm->MF, Fragment.FI, FrameReg); DwarfExpr.addFragmentOffset(Expr); SmallVector Ops; - Ops.push_back(dwarf::DW_OP_plus_uconst); - Ops.push_back(Offset); + DIExpression::appendOffset(Ops, Offset); // According to // https://docs.nvidia.com/cuda/archive/10.0/ptx-writers-guide-to-interoperability/index.html#cuda-specific-dwarf // cuda-gdb requires DW_AT_address_class for all variables to be able to @@ -892,32 +887,117 @@ void DwarfCompileUnit::constructAbstractSubprogramScopeDIE( ContextCU->addDIEEntry(*AbsDef, 
dwarf::DW_AT_object_pointer, *ObjectPointer); } -DIE &DwarfCompileUnit::constructCallSiteEntryDIE(DIE &ScopeDIE, - const DISubprogram &CalleeSP, - bool IsTail, - const MCExpr *PCOffset) { +/// Whether to use the GNU analog for a DWARF5 tag, attribute, or location atom. +static bool useGNUAnalogForDwarf5Feature(DwarfDebug *DD) { + return DD->getDwarfVersion() == 4 && DD->tuneForGDB(); +} + +dwarf::Tag DwarfCompileUnit::getDwarf5OrGNUTag(dwarf::Tag Tag) const { + if (!useGNUAnalogForDwarf5Feature(DD)) + return Tag; + switch (Tag) { + case dwarf::DW_TAG_call_site: + return dwarf::DW_TAG_GNU_call_site; + case dwarf::DW_TAG_call_site_parameter: + return dwarf::DW_TAG_GNU_call_site_parameter; + default: + llvm_unreachable("DWARF5 tag with no GNU analog"); + } +} + +dwarf::Attribute +DwarfCompileUnit::getDwarf5OrGNUAttr(dwarf::Attribute Attr) const { + if (!useGNUAnalogForDwarf5Feature(DD)) + return Attr; + switch (Attr) { + case dwarf::DW_AT_call_all_calls: + return dwarf::DW_AT_GNU_all_call_sites; + case dwarf::DW_AT_call_target: + return dwarf::DW_AT_GNU_call_site_target; + case dwarf::DW_AT_call_origin: + return dwarf::DW_AT_abstract_origin; + case dwarf::DW_AT_call_pc: + return dwarf::DW_AT_low_pc; + case dwarf::DW_AT_call_value: + return dwarf::DW_AT_GNU_call_site_value; + case dwarf::DW_AT_call_tail_call: + return dwarf::DW_AT_GNU_tail_call; + default: + llvm_unreachable("DWARF5 attribute with no GNU analog"); + } +} + +dwarf::LocationAtom +DwarfCompileUnit::getDwarf5OrGNULocationAtom(dwarf::LocationAtom Loc) const { + if (!useGNUAnalogForDwarf5Feature(DD)) + return Loc; + switch (Loc) { + case dwarf::DW_OP_entry_value: + return dwarf::DW_OP_GNU_entry_value; + default: + llvm_unreachable("DWARF5 location atom with no GNU analog"); + } +} + +DIE &DwarfCompileUnit::constructCallSiteEntryDIE( + DIE &ScopeDIE, const DISubprogram *CalleeSP, bool IsTail, + const MCSymbol *PCAddr, const MCExpr *PCOffset, unsigned CallReg) { // Insert a call site entry DIE within ScopeDIE. - DIE &CallSiteDIE = - createAndAddDIE(dwarf::DW_TAG_call_site, ScopeDIE, nullptr); + DIE &CallSiteDIE = createAndAddDIE(getDwarf5OrGNUTag(dwarf::DW_TAG_call_site), + ScopeDIE, nullptr); - // For the purposes of showing tail call frames in backtraces, a key piece of - // information is DW_AT_call_origin, a pointer to the callee DIE. - DIE *CalleeDIE = getOrCreateSubprogramDIE(&CalleeSP); - assert(CalleeDIE && "Could not create DIE for call site entry origin"); - addDIEEntry(CallSiteDIE, dwarf::DW_AT_call_origin, *CalleeDIE); + if (CallReg) { + // Indirect call. + addAddress(CallSiteDIE, getDwarf5OrGNUAttr(dwarf::DW_AT_call_target), + MachineLocation(CallReg)); + } else { + DIE *CalleeDIE = getOrCreateSubprogramDIE(CalleeSP); + assert(CalleeDIE && "Could not create DIE for call site entry origin"); + addDIEEntry(CallSiteDIE, getDwarf5OrGNUAttr(dwarf::DW_AT_call_origin), + *CalleeDIE); + } - if (IsTail) { + if (IsTail) // Attach DW_AT_call_tail_call to tail calls for standards compliance. - addFlag(CallSiteDIE, dwarf::DW_AT_call_tail_call); - } else { - // Attach the return PC to allow the debugger to disambiguate call paths - // from one function to another. + addFlag(CallSiteDIE, getDwarf5OrGNUAttr(dwarf::DW_AT_call_tail_call)); + + // Attach the return PC to allow the debugger to disambiguate call paths + // from one function to another. 
+ if (DD->getDwarfVersion() == 4 && DD->tuneForGDB()) { + assert(PCAddr && "Missing PC information for a call"); + addLabelAddress(CallSiteDIE, dwarf::DW_AT_low_pc, PCAddr); + } else if (!IsTail || DD->tuneForGDB()) { assert(PCOffset && "Missing return PC information for a call"); addAddressExpr(CallSiteDIE, dwarf::DW_AT_call_return_pc, PCOffset); } + return CallSiteDIE; } +void DwarfCompileUnit::constructCallSiteParmEntryDIEs( + DIE &CallSiteDIE, SmallVector &Params) { + for (const auto &Param : Params) { + unsigned Register = Param.getRegister(); + auto CallSiteDieParam = + DIE::get(DIEValueAllocator, + getDwarf5OrGNUTag(dwarf::DW_TAG_call_site_parameter)); + insertDIE(CallSiteDieParam); + addAddress(*CallSiteDieParam, dwarf::DW_AT_location, + MachineLocation(Register)); + + DIELoc *Loc = new (DIEValueAllocator) DIELoc; + DIEDwarfExpression DwarfExpr(*Asm, *this, *Loc); + DwarfExpr.setCallSiteParamValueFlag(); + + DwarfDebug::emitDebugLocValue(*Asm, nullptr, Param.getValue(), DwarfExpr); + + addBlock(*CallSiteDieParam, getDwarf5OrGNUAttr(dwarf::DW_AT_call_value), + DwarfExpr.finalize()); + + CallSiteDIE.addChild(CallSiteDieParam); + } +} + DIE *DwarfCompileUnit::constructImportedEntityDIE( const DIImportedEntity *Module) { DIE *IMDie = DIE::get(DIEValueAllocator, (dwarf::Tag)Module->getTag()); @@ -997,11 +1077,11 @@ void DwarfCompileUnit::createAbstractEntity(const DINode *Node, assert(Scope && Scope->isAbstractScope()); auto &Entity = getAbstractEntities()[Node]; if (isa(Node)) { - Entity = llvm::make_unique( + Entity = std::make_unique( cast(Node), nullptr /* IA */);; DU->addScopeVariable(Scope, cast(Entity.get())); } else if (isa(Node)) { - Entity = llvm::make_unique( + Entity = std::make_unique( cast(Node), nullptr /* IA */); DU->addScopeLabel(Scope, cast(Entity.get())); } @@ -1081,16 +1161,8 @@ void DwarfCompileUnit::addGlobalTypeUnitType(const DIType *Ty, GlobalTypes.insert(std::make_pair(std::move(FullName), &getUnitDie())); } -/// addVariableAddress - Add DW_AT_location attribute for a -/// DbgVariable based on provided MachineLocation. void DwarfCompileUnit::addVariableAddress(const DbgVariable &DV, DIE &Die, MachineLocation Location) { - // addBlockByrefAddress is obsolete and will be removed soon. - // The clang frontend always generates block byref variables with a - // complex expression that encodes exactly what addBlockByrefAddress - // would do. - assert((!DV.isBlockByrefVariable() || DV.hasComplexAddress()) && - "block byref variable without a complex expression"); if (DV.hasComplexAddress()) addComplexAddress(DV, Die, dwarf::DW_AT_location, Location); else @@ -1133,7 +1205,7 @@ void DwarfCompileUnit::addComplexAddress(const DbgVariable &DV, DIE &Die, if (DIExpr->isEntryValue()) { DwarfExpr.setEntryValueFlag(); - DwarfExpr.addEntryValueExpression(Cursor); + DwarfExpr.beginEntryValueExpression(Cursor); } const TargetRegisterInfo &TRI = *Asm->MF->getSubtarget().getRegisterInfo(); diff --git a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h index ea980dfda17e..1b7ea2673ac0 100644 --- a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h +++ b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h @@ -227,12 +227,35 @@ public: void constructAbstractSubprogramScopeDIE(LexicalScope *Scope); + /// This takes a DWARF 5 tag and returns it or a GNU analog. + dwarf::Tag getDwarf5OrGNUTag(dwarf::Tag Tag) const; + + /// This takes a DWARF 5 attribute and returns it or a GNU analog. 
+ dwarf::Attribute getDwarf5OrGNUAttr(dwarf::Attribute Attr) const; + + /// This takes a DWARF 5 location atom and either returns it or a GNU analog. + dwarf::LocationAtom getDwarf5OrGNULocationAtom(dwarf::LocationAtom Loc) const; + /// Construct a call site entry DIE describing a call within \p Scope to a - /// callee described by \p CalleeSP. \p IsTail specifies whether the call is - /// a tail call. \p PCOffset must be non-zero for non-tail calls or be the + /// callee described by \p CalleeSP. + /// \p IsTail specifies whether the call is a tail call. + /// \p PCAddr (used for GDB + DWARF 4 tuning) points to the PC value after + /// the call instruction. + /// \p PCOffset (used for cases other than GDB + DWARF 4 tuning) must be + /// non-zero for non-tail calls (in the case of non-gdb tuning, since for + /// GDB + DWARF 5 tuning we still generate PC info for tail calls) or be the /// function-local offset to PC value after the call instruction. - DIE &constructCallSiteEntryDIE(DIE &ScopeDIE, const DISubprogram &CalleeSP, - bool IsTail, const MCExpr *PCOffset); + /// \p CallReg is a register location for an indirect call. For direct calls + /// the \p CallReg is set to 0. + DIE &constructCallSiteEntryDIE(DIE &ScopeDIE, const DISubprogram *CalleeSP, + bool IsTail, const MCSymbol *PCAddr, + const MCExpr *PCOffset, unsigned CallReg); + /// Construct call site parameter DIEs for the \p CallSiteDIE. The \p Params + /// were collected by the \ref collectCallSiteParameters. + /// Note: The order of parameters does not matter, since debuggers recognize + /// call site parameters by the DW_AT_location attribute. + void constructCallSiteParmEntryDIEs(DIE &CallSiteDIE, + SmallVector &Params); /// Construct import_module DIE. DIE *constructImportedEntityDIE(const DIImportedEntity *Module); diff --git a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp index 71bb2b0858cc..c505e77e5acd 100644 --- a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp +++ b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp @@ -26,6 +26,7 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" +#include "llvm/ADT/Statistic.h" #include "llvm/ADT/Triple.h" #include "llvm/ADT/Twine.h" #include "llvm/BinaryFormat/Dwarf.h" @@ -39,6 +40,7 @@ #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/TargetInstrInfo.h" +#include "llvm/CodeGen/TargetLowering.h" #include "llvm/CodeGen/TargetRegisterInfo.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/DebugInfo/DWARF/DWARFExpression.h" @@ -83,6 +85,8 @@ using namespace llvm; #define DEBUG_TYPE "dwarfdebug" +STATISTIC(NumCSParams, "Number of dbg call site params created"); + static cl::opt DisableDebugInfoPrinting("disable-debug-info-print", cl::Hidden, cl::desc("Disable debug info printing")); @@ -166,26 +170,26 @@ static const char *const DbgTimerDescription = "DWARF Debug Writer"; static constexpr unsigned ULEB128PadSize = 4; void DebugLocDwarfExpression::emitOp(uint8_t Op, const char *Comment) { - BS.EmitInt8( + getActiveStreamer().EmitInt8( Op, Comment ? 
Twine(Comment) + " " + dwarf::OperationEncodingString(Op) : dwarf::OperationEncodingString(Op)); } void DebugLocDwarfExpression::emitSigned(int64_t Value) { - BS.EmitSLEB128(Value, Twine(Value)); + getActiveStreamer().EmitSLEB128(Value, Twine(Value)); } void DebugLocDwarfExpression::emitUnsigned(uint64_t Value) { - BS.EmitULEB128(Value, Twine(Value)); + getActiveStreamer().EmitULEB128(Value, Twine(Value)); } void DebugLocDwarfExpression::emitData1(uint8_t Value) { - BS.EmitInt8(Value, Twine(Value)); + getActiveStreamer().EmitInt8(Value, Twine(Value)); } void DebugLocDwarfExpression::emitBaseTypeRef(uint64_t Idx) { assert(Idx < (1ULL << (ULEB128PadSize * 7)) && "Idx wont fit"); - BS.EmitULEB128(Idx, Twine(Idx), ULEB128PadSize); + getActiveStreamer().EmitULEB128(Idx, Twine(Idx), ULEB128PadSize); } bool DebugLocDwarfExpression::isFrameRegister(const TargetRegisterInfo &TRI, @@ -194,54 +198,34 @@ bool DebugLocDwarfExpression::isFrameRegister(const TargetRegisterInfo &TRI, return false; } -bool DbgVariable::isBlockByrefVariable() const { - assert(getVariable() && "Invalid complex DbgVariable!"); - return getVariable()->getType()->isBlockByrefStruct(); +void DebugLocDwarfExpression::enableTemporaryBuffer() { + assert(!IsBuffering && "Already buffering?"); + if (!TmpBuf) + TmpBuf = std::make_unique(OutBS.GenerateComments); + IsBuffering = true; } -const DIType *DbgVariable::getType() const { - DIType *Ty = getVariable()->getType(); - // FIXME: isBlockByrefVariable should be reformulated in terms of complex - // addresses instead. - if (Ty->isBlockByrefStruct()) { - /* Byref variables, in Blocks, are declared by the programmer as - "SomeType VarName;", but the compiler creates a - __Block_byref_x_VarName struct, and gives the variable VarName - either the struct, or a pointer to the struct, as its type. This - is necessary for various behind-the-scenes things the compiler - needs to do with by-reference variables in blocks. - - However, as far as the original *programmer* is concerned, the - variable should still have type 'SomeType', as originally declared. - - The following function dives into the __Block_byref_x_VarName - struct to find the original type of the variable. This will be - passed back to the code generating the type for the Debug - Information Entry for the variable 'VarName'. 'VarName' will then - have the original type 'SomeType' in its debug information. - - The original type 'SomeType' will be the type of the field named - 'VarName' inside the __Block_byref_x_VarName struct. - - NOTE: In order for this to not completely fail on the debugger - side, the Debug Information Entry for the variable VarName needs to - have a DW_AT_location that tells the debugger how to unwind through - the pointers and __Block_byref_x_VarName struct to find the actual - value of the variable. The function addBlockByrefType does this. */ - DIType *subType = Ty; - uint16_t tag = Ty->getTag(); - - if (tag == dwarf::DW_TAG_pointer_type) - subType = cast(Ty)->getBaseType(); - - auto Elements = cast(subType)->getElements(); - for (unsigned i = 0, N = Elements.size(); i < N; ++i) { - auto *DT = cast(Elements[i]); - if (getName() == DT->getName()) - return DT->getBaseType(); - } +void DebugLocDwarfExpression::disableTemporaryBuffer() { IsBuffering = false; } + +unsigned DebugLocDwarfExpression::getTemporaryBufferSize() { + return TmpBuf ? 
TmpBuf->Bytes.size() : 0;
+}
+
+void DebugLocDwarfExpression::commitTemporaryBuffer() {
+  if (!TmpBuf)
+    return;
+  for (auto Byte : enumerate(TmpBuf->Bytes)) {
+    const char *Comment = (Byte.index() < TmpBuf->Comments.size())
+                              ? TmpBuf->Comments[Byte.index()].c_str()
+                              : "";
+    OutBS.EmitInt8(Byte.value(), Comment);
   }
-  return Ty;
+  TmpBuf->Bytes.clear();
+  TmpBuf->Comments.clear();
+}
+
+const DIType *DbgVariable::getType() const {
+  return getVariable()->getType();
 }
 
 /// Get .debug_loc entry for the instruction range starting at MI.
@@ -275,7 +259,7 @@ void DbgVariable::initializeDbgValue(const MachineInstr *DbgValue) {
   assert(getInlinedAt() == DbgValue->getDebugLoc()->getInlinedAt() &&
          "Wrong inlined-at");
 
-  ValueLoc = llvm::make_unique<DbgValueLoc>(getDebugLocValue(DbgValue));
+  ValueLoc = std::make_unique<DbgValueLoc>(getDebugLocValue(DbgValue));
   if (auto *E = DbgValue->getDebugExpression())
     if (E->getNumElements())
       FrameIndexExprs.push_back({0, E});
@@ -551,6 +535,157 @@ void DwarfDebug::constructAbstractSubprogramScopeDIE(DwarfCompileUnit &SrcCU,
   }
 }
 
+/// Try to interpret values loaded into registers that forward parameters
+/// for \p CallMI. Store parameters with interpreted values into \p Params.
+static void collectCallSiteParameters(const MachineInstr *CallMI,
+                                      ParamSet &Params) {
+  auto *MF = CallMI->getMF();
+  auto CalleesMap = MF->getCallSitesInfo();
+  auto CallFwdRegsInfo = CalleesMap.find(CallMI);
+
+  // There is no call site information for this call instruction.
+  if (CallFwdRegsInfo == CalleesMap.end())
+    return;
+
+  auto *MBB = CallMI->getParent();
+  const auto &TRI = MF->getSubtarget().getRegisterInfo();
+  const auto &TII = MF->getSubtarget().getInstrInfo();
+  const auto &TLI = MF->getSubtarget().getTargetLowering();
+
+  // Skip the call instruction.
+  auto I = std::next(CallMI->getReverseIterator());
+
+  DenseSet<unsigned> ForwardedRegWorklist;
+  // Add all the forwarding registers into the ForwardedRegWorklist.
+  for (auto ArgReg : CallFwdRegsInfo->second) {
+    bool InsertedReg = ForwardedRegWorklist.insert(ArgReg.Reg).second;
+    assert(InsertedReg && "Single register used to forward two arguments?");
+    (void)InsertedReg;
+  }
+
+  // We erase a forwarding register from the ForwardedRegWorklist once we have
+  // successfully described a loaded value for it (using describeLoadedValue()).
+  // For the arguments that remain in the worklist, i.e. those whose loaded
+  // value we could not describe, we try to generate an entry value expression
+  // for their call site value description, provided the call is within the
+  // entry MBB. RegsForEntryValues maps a forwarding register to the register
+  // holding its entry value.
+  // TODO: Handle situations where the call site parameter value can be
+  // described as the entry value within basic blocks other than the first one.
+  bool ShouldTryEmitEntryVals = MBB->getIterator() == MF->begin();
+  DenseMap<unsigned, unsigned> RegsForEntryValues;
+
+  // If the MI is an instruction defining one or more parameters' forwarding
+  // registers, add those defines. We can currently only describe forwarded
+  // registers that are explicitly defined, but keep track of implicit defines
+  // also to remove those registers from the work list.
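
Before the lambda that the preceding comment introduces, here is a rough, standalone sketch of the overall worklist pattern this function follows: walk backwards from the call, and drop a forwarding register from the worklist as soon as an instruction defining it is found. The types below (Instr, LoadedValue) are simplified placeholders, not the LLVM API; the real implementation continues in the patch below.

#include <cstdint>
#include <optional>
#include <set>
#include <utility>
#include <vector>

struct Instr {
  bool IsCall = false;
  std::vector<unsigned> Defs;          // registers written by this instruction
  std::optional<int64_t> LoadedValue;  // value it loads, if describable
};

// Reverse scan over the instructions preceding the call. Once a forwarding
// register is defined it leaves the worklist, whether or not its value could
// be described.
std::vector<std::pair<unsigned, int64_t>>
collectParams(const std::vector<Instr> &BlockBeforeCall,
              std::set<unsigned> Worklist) {
  std::vector<std::pair<unsigned, int64_t>> Params;
  for (auto It = BlockBeforeCall.rbegin(); It != BlockBeforeCall.rend(); ++It) {
    if (It->IsCall || Worklist.empty())
      break; // cannot interpret past another call / nothing left to describe
    for (unsigned Def : It->Defs) {
      if (!Worklist.erase(Def))
        continue; // not a forwarding register
      if (It->LoadedValue)
        Params.push_back({Def, *It->LoadedValue});
    }
  }
  return Params;
}
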
+  auto getForwardingRegsDefinedByMI = [&](const MachineInstr &MI,
+                                          SmallVectorImpl<unsigned> &Explicit,
+                                          SmallVectorImpl<unsigned> &Implicit) {
+    if (MI.isDebugInstr())
+      return;
+
+    for (const MachineOperand &MO : MI.operands()) {
+      if (MO.isReg() && MO.isDef() &&
+          Register::isPhysicalRegister(MO.getReg())) {
+        for (auto FwdReg : ForwardedRegWorklist) {
+          if (TRI->regsOverlap(FwdReg, MO.getReg())) {
+            if (MO.isImplicit())
+              Implicit.push_back(FwdReg);
+            else
+              Explicit.push_back(FwdReg);
+            break;
+          }
+        }
+      }
+    }
+  };
+
+  auto finishCallSiteParam = [&](DbgValueLoc DbgLocVal, unsigned Reg) {
+    unsigned FwdReg = Reg;
+    if (ShouldTryEmitEntryVals) {
+      auto EntryValReg = RegsForEntryValues.find(Reg);
+      if (EntryValReg != RegsForEntryValues.end())
+        FwdReg = EntryValReg->second;
+    }
+
+    DbgCallSiteParam CSParm(FwdReg, DbgLocVal);
+    Params.push_back(CSParm);
+    ++NumCSParams;
+  };
+
+  // Search for values loaded into the parameters' forwarding registers.
+  for (; I != MBB->rend(); ++I) {
+    // Stop if we reach another call; we cannot interpret the parameters'
+    // forwarding registers across it. Also stop once every parameter has
+    // been interpreted.
+    if (I->isCall())
+      return;
+
+    if (ForwardedRegWorklist.empty())
+      return;
+
+    SmallVector<unsigned, 4> ExplicitFwdRegDefs;
+    SmallVector<unsigned, 4> ImplicitFwdRegDefs;
+    getForwardingRegsDefinedByMI(*I, ExplicitFwdRegDefs, ImplicitFwdRegDefs);
+    if (ExplicitFwdRegDefs.empty() && ImplicitFwdRegDefs.empty())
+      continue;
+
+    // If the MI clobbers more than one forwarding register, we must remove
+    // all of them from the work list.
+    for (auto Reg : concat<unsigned>(ExplicitFwdRegDefs, ImplicitFwdRegDefs))
+      ForwardedRegWorklist.erase(Reg);
+
+    // The describeLoadedValue() hook currently does not have any information
+    // about which register it should describe in case of multiple defines, so
+    // for now we only handle instructions where a forwarded register is (at
+    // least partially) defined by the instruction's single explicit define.
+    if (I->getNumExplicitDefs() != 1 || ExplicitFwdRegDefs.empty())
+      continue;
+    unsigned Reg = ExplicitFwdRegDefs[0];
+
+    if (auto ParamValue = TII->describeLoadedValue(*I)) {
+      if (ParamValue->first.isImm()) {
+        int64_t Val = ParamValue->first.getImm();
+        DbgValueLoc DbgLocVal(ParamValue->second, Val);
+        finishCallSiteParam(DbgLocVal, Reg);
+      } else if (ParamValue->first.isReg()) {
+        Register RegLoc = ParamValue->first.getReg();
+        unsigned SP = TLI->getStackPointerRegisterToSaveRestore();
+        Register FP = TRI->getFrameRegister(*MF);
+        bool IsSPorFP = (RegLoc == SP) || (RegLoc == FP);
+        if (TRI->isCalleeSavedPhysReg(RegLoc, *MF) || IsSPorFP) {
+          DbgValueLoc DbgLocVal(ParamValue->second,
+                                MachineLocation(RegLoc,
+                                                /*IsIndirect=*/IsSPorFP));
+          finishCallSiteParam(DbgLocVal, Reg);
+        } else if (ShouldTryEmitEntryVals) {
+          ForwardedRegWorklist.insert(RegLoc);
+          RegsForEntryValues[RegLoc] = Reg;
+        }
+      }
+    }
+  }
+
+  // Emit the remaining call site parameters' values as entry values.
+  if (ShouldTryEmitEntryVals) {
+    // Create an expression where the register's entry value is used.
+ DIExpression *EntryExpr = DIExpression::get( + MF->getFunction().getContext(), {dwarf::DW_OP_LLVM_entry_value, 1}); + for (auto RegEntry : ForwardedRegWorklist) { + unsigned FwdReg = RegEntry; + auto EntryValReg = RegsForEntryValues.find(RegEntry); + if (EntryValReg != RegsForEntryValues.end()) + FwdReg = EntryValReg->second; + + DbgValueLoc DbgLocVal(EntryExpr, MachineLocation(RegEntry)); + DbgCallSiteParam CSParm(FwdReg, DbgLocVal); + Params.push_back(CSParm); + ++NumCSParams; + } + } +} + void DwarfDebug::constructCallSiteEntryDIEs(const DISubprogram &SP, DwarfCompileUnit &CU, DIE &ScopeDIE, const MachineFunction &MF) { @@ -563,10 +698,11 @@ void DwarfDebug::constructCallSiteEntryDIEs(const DISubprogram &SP, // for both tail and non-tail calls. Don't use DW_AT_call_all_source_calls // because one of its requirements is not met: call site entries for // optimized-out calls are elided. - CU.addFlag(ScopeDIE, dwarf::DW_AT_call_all_calls); + CU.addFlag(ScopeDIE, CU.getDwarf5OrGNUAttr(dwarf::DW_AT_call_all_calls)); const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo(); assert(TII && "TargetInstrInfo not found: cannot label tail calls"); + bool ApplyGNUExtensions = getDwarfVersion() == 4 && tuneForGDB(); // Emit call site entries for each call or tail call in the function. for (const MachineBasicBlock &MBB : MF) { @@ -581,30 +717,66 @@ void DwarfDebug::constructCallSiteEntryDIEs(const DISubprogram &SP, return; // If this is a direct call, find the callee's subprogram. + // In the case of an indirect call find the register that holds + // the callee. const MachineOperand &CalleeOp = MI.getOperand(0); - if (!CalleeOp.isGlobal()) - continue; - const Function *CalleeDecl = dyn_cast(CalleeOp.getGlobal()); - if (!CalleeDecl || !CalleeDecl->getSubprogram()) + if (!CalleeOp.isGlobal() && !CalleeOp.isReg()) continue; + unsigned CallReg = 0; + const DISubprogram *CalleeSP = nullptr; + const Function *CalleeDecl = nullptr; + if (CalleeOp.isReg()) { + CallReg = CalleeOp.getReg(); + if (!CallReg) + continue; + } else { + CalleeDecl = dyn_cast(CalleeOp.getGlobal()); + if (!CalleeDecl || !CalleeDecl->getSubprogram()) + continue; + CalleeSP = CalleeDecl->getSubprogram(); + } + // TODO: Omit call site entries for runtime calls (objc_msgSend, etc). - // TODO: Add support for indirect calls. bool IsTail = TII->isTailCall(MI); - // For tail calls, no return PC information is needed. For regular calls, - // the return PC is needed to disambiguate paths in the call graph which - // could lead to some target function. + // For tail calls, for non-gdb tuning, no return PC information is needed. + // For regular calls (and tail calls in GDB tuning), the return PC + // is needed to disambiguate paths in the call graph which could lead to + // some target function. const MCExpr *PCOffset = - IsTail ? nullptr : getFunctionLocalOffsetAfterInsn(&MI); + (IsTail && !tuneForGDB()) ? nullptr + : getFunctionLocalOffsetAfterInsn(&MI); + + // Address of a call-like instruction for a normal call or a jump-like + // instruction for a tail call. This is needed for GDB + DWARF 4 tuning. + const MCSymbol *PCAddr = + ApplyGNUExtensions ? const_cast(getLabelAfterInsn(&MI)) + : nullptr; + + assert((IsTail || PCOffset || PCAddr) && + "Call without return PC information"); - assert((IsTail || PCOffset) && "Call without return PC information"); LLVM_DEBUG(dbgs() << "CallSiteEntry: " << MF.getName() << " -> " - << CalleeDecl->getName() << (IsTail ? 
" [tail]" : "") - << "\n"); - CU.constructCallSiteEntryDIE(ScopeDIE, *CalleeDecl->getSubprogram(), - IsTail, PCOffset); + << (CalleeDecl ? CalleeDecl->getName() + : StringRef(MF.getSubtarget() + .getRegisterInfo() + ->getName(CallReg))) + << (IsTail ? " [IsTail]" : "") << "\n"); + + DIE &CallSiteDIE = + CU.constructCallSiteEntryDIE(ScopeDIE, CalleeSP, IsTail, PCAddr, + PCOffset, CallReg); + + // GDB and LLDB support call site parameter debug info. + if (Asm->TM.Options.EnableDebugEntryValues && + (tuneForGDB() || tuneForLLDB())) { + ParamSet Params; + // Try to interpret values of call site parameters. + collectCallSiteParameters(&MI, Params); + CU.constructCallSiteParmEntryDIEs(CallSiteDIE, Params); + } } } } @@ -680,7 +852,7 @@ DwarfDebug::getOrCreateDwarfCompileUnit(const DICompileUnit *DIUnit) { CompilationDir = DIUnit->getDirectory(); - auto OwnedUnit = llvm::make_unique( + auto OwnedUnit = std::make_unique( InfoHolder.getUnits().size(), DIUnit, Asm, this, &InfoHolder); DwarfCompileUnit &NewCU = *OwnedUnit; InfoHolder.addUnit(std::move(OwnedUnit)); @@ -793,8 +965,6 @@ void DwarfDebug::beginModule() { DwarfFile &Holder = useSplitDwarf() ? SkeletonHolder : InfoHolder; Holder.setRnglistsTableBaseSym( Asm->createTempSymbol("rnglists_table_base")); - Holder.setLoclistsTableBaseSym( - Asm->createTempSymbol("loclists_table_base")); if (useSplitDwarf()) InfoHolder.setRnglistsTableBaseSym( @@ -907,7 +1077,7 @@ void DwarfDebug::finalizeModuleInfo() { // If we're splitting the dwarf out now that we've got the entire // CU then add the dwo id to it. auto *SkCU = TheCU.getSkeleton(); - if (useSplitDwarf() && !empty(TheCU.getUnitDie().children())) { + if (useSplitDwarf() && !TheCU.getUnitDie().children().empty()) { finishUnitAttributes(TheCU.getCUNode(), TheCU); TheCU.addString(TheCU.getUnitDie(), dwarf::DW_AT_GNU_dwo_name, Asm->TM.Options.MCOptions.SplitDwarfFile); @@ -951,7 +1121,7 @@ void DwarfDebug::finalizeModuleInfo() { // 2.17.3). U.addUInt(U.getUnitDie(), dwarf::DW_AT_low_pc, dwarf::DW_FORM_addr, 0); else - U.setBaseAddress(TheCU.getRanges().front().getStart()); + U.setBaseAddress(TheCU.getRanges().front().Begin); U.attachRangesOrLowHighPC(U.getUnitDie(), TheCU.takeRanges()); } @@ -959,15 +1129,19 @@ void DwarfDebug::finalizeModuleInfo() { // is a bit pessimistic under LTO. 
if (!AddrPool.isEmpty() && (getDwarfVersion() >= 5 || - (SkCU && !empty(TheCU.getUnitDie().children())))) + (SkCU && !TheCU.getUnitDie().children().empty()))) U.addAddrTableBase(); if (getDwarfVersion() >= 5) { if (U.hasRangeLists()) U.addRnglistsBase(); - if (!DebugLocs.getLists().empty() && !useSplitDwarf()) - U.addLoclistsBase(); + if (!DebugLocs.getLists().empty() && !useSplitDwarf()) { + DebugLocs.setSym(Asm->createTempSymbol("loclists_table_base")); + U.addSectionLabel(U.getUnitDie(), dwarf::DW_AT_loclists_base, + DebugLocs.getSym(), + TLOF.getDwarfLoclistsSection()->getBeginSymbol()); + } } auto *CUNode = cast(P.first); @@ -1105,7 +1279,7 @@ void DwarfDebug::collectVariableInfoFromMFTable( continue; ensureAbstractEntityIsCreatedIfScoped(TheCU, Var.first, Scope->getScopeNode()); - auto RegVar = llvm::make_unique( + auto RegVar = std::make_unique( cast(Var.first), Var.second); RegVar->initializeMMI(VI.Expr, VI.Slot); if (DbgVariable *DbgVar = MFVars.lookup(Var)) @@ -1316,13 +1490,13 @@ DbgEntity *DwarfDebug::createConcreteEntity(DwarfCompileUnit &TheCU, ensureAbstractEntityIsCreatedIfScoped(TheCU, Node, Scope.getScopeNode()); if (isa(Node)) { ConcreteEntities.push_back( - llvm::make_unique(cast(Node), + std::make_unique(cast(Node), Location)); InfoHolder.addScopeVariable(&Scope, cast(ConcreteEntities.back().get())); } else if (isa(Node)) { ConcreteEntities.push_back( - llvm::make_unique(cast(Node), + std::make_unique(cast(Node), Location, Sym)); InfoHolder.addScopeLabel(&Scope, cast(ConcreteEntities.back().get())); @@ -1419,11 +1593,14 @@ void DwarfDebug::collectEntityInfo(DwarfCompileUnit &TheCU, LexicalScope *Scope = nullptr; const DILabel *Label = cast(IL.first); + // The scope could have an extra lexical block file. + const DILocalScope *LocalScope = + Label->getScope()->getNonLexicalBlockFileScope(); // Get inlined DILocation if it is inlined label. if (const DILocation *IA = IL.second) - Scope = LScopes.findInlinedScope(Label->getScope(), IA); + Scope = LScopes.findInlinedScope(LocalScope, IA); else - Scope = LScopes.findLexicalScope(Label->getScope()); + Scope = LScopes.findLexicalScope(LocalScope); // If label scope is not found then skip this label. if (!Scope) continue; @@ -1607,6 +1784,9 @@ void DwarfDebug::beginFunctionImpl(const MachineFunction *MF) { if (SP->getUnit()->getEmissionKind() == DICompileUnit::NoDebug) return; + SectionLabels.insert(std::make_pair(&Asm->getFunctionBegin()->getSection(), + Asm->getFunctionBegin())); + DwarfCompileUnit &CU = getOrCreateDwarfCompileUnit(SP->getUnit()); // Set DwarfDwarfCompileUnitID in MCContext to the Compile Unit this function @@ -1654,7 +1834,7 @@ void DwarfDebug::endFunctionImpl(const MachineFunction *MF) { collectEntityInfo(TheCU, SP, Processed); // Add the range of this function to the list of ranges for the CU. - TheCU.addRange(RangeSpan(Asm->getFunctionBegin(), Asm->getFunctionEnd())); + TheCU.addRange({Asm->getFunctionBegin(), Asm->getFunctionEnd()}); // Under -gmlt, skip building the subprogram if there are no inlined // subroutines inside it. But with -fdebug-info-for-profiling, the subprogram @@ -1836,9 +2016,10 @@ static dwarf::PubIndexEntryDescriptor computeIndexValue(DwarfUnit *CU, case dwarf::DW_TAG_union_type: case dwarf::DW_TAG_enumeration_type: return dwarf::PubIndexEntryDescriptor( - dwarf::GIEK_TYPE, CU->getLanguage() != dwarf::DW_LANG_C_plus_plus - ? dwarf::GIEL_STATIC - : dwarf::GIEL_EXTERNAL); + dwarf::GIEK_TYPE, + dwarf::isCPlusPlus((dwarf::SourceLanguage)CU->getLanguage()) + ? 
dwarf::GIEL_EXTERNAL + : dwarf::GIEL_STATIC); case dwarf::DW_TAG_typedef: case dwarf::DW_TAG_base_type: case dwarf::DW_TAG_subrange_type: @@ -1967,7 +2148,7 @@ void DwarfDebug::emitDebugLocEntry(ByteStreamer &Streamer, DWARFExpression Expr(Data, getDwarfVersion(), PtrSize); using Encoding = DWARFExpression::Operation::Encoding; - uint32_t Offset = 0; + uint64_t Offset = 0; for (auto &Op : Expr) { assert(Op.getCode() != dwarf::DW_OP_const_type && "3 operand ops not yet supported"); @@ -1990,7 +2171,7 @@ void DwarfDebug::emitDebugLocEntry(ByteStreamer &Streamer, if (Comment != End) Comment++; } else { - for (uint32_t J = Offset; J < Op.getOperandEndOffset(I); ++J) + for (uint64_t J = Offset; J < Op.getOperandEndOffset(I); ++J) Streamer.EmitInt8(Data.getData()[J], Comment != End ? *(Comment++) : ""); } Offset = Op.getOperandEndOffset(I); @@ -2020,7 +2201,7 @@ void DwarfDebug::emitDebugLocValue(const AsmPrinter &AP, const DIBasicType *BT, if (DIExpr->isEntryValue()) { DwarfExpr.setEntryValueFlag(); - DwarfExpr.addEntryValueExpression(Cursor); + DwarfExpr.beginEntryValueExpression(Cursor); } const TargetRegisterInfo &TRI = *AP.MF->getSubtarget().getRegisterInfo(); @@ -2083,7 +2264,7 @@ void DwarfDebug::emitDebugLocEntryLocation(const DebugLocStream::Entry &Entry, } // Emit the common part of the DWARF 5 range/locations list tables header. -static void emitListsTableHeaderStart(AsmPrinter *Asm, const DwarfFile &Holder, +static void emitListsTableHeaderStart(AsmPrinter *Asm, MCSymbol *TableStart, MCSymbol *TableEnd) { // Build the table header, which starts with the length field. @@ -2108,7 +2289,7 @@ static MCSymbol *emitRnglistsTableHeader(AsmPrinter *Asm, const DwarfFile &Holder) { MCSymbol *TableStart = Asm->createTempSymbol("debug_rnglist_table_start"); MCSymbol *TableEnd = Asm->createTempSymbol("debug_rnglist_table_end"); - emitListsTableHeaderStart(Asm, Holder, TableStart, TableEnd); + emitListsTableHeaderStart(Asm, TableStart, TableEnd); Asm->OutStreamer->AddComment("Offset entry count"); Asm->emitInt32(Holder.getRangeLists().size()); @@ -2125,94 +2306,147 @@ static MCSymbol *emitRnglistsTableHeader(AsmPrinter *Asm, // designates the end of the table for the caller to emit when the table is // complete. static MCSymbol *emitLoclistsTableHeader(AsmPrinter *Asm, - const DwarfFile &Holder) { + const DwarfDebug &DD) { MCSymbol *TableStart = Asm->createTempSymbol("debug_loclist_table_start"); MCSymbol *TableEnd = Asm->createTempSymbol("debug_loclist_table_end"); - emitListsTableHeaderStart(Asm, Holder, TableStart, TableEnd); + emitListsTableHeaderStart(Asm, TableStart, TableEnd); + + const auto &DebugLocs = DD.getDebugLocs(); // FIXME: Generate the offsets table and use DW_FORM_loclistx with the // DW_AT_loclists_base attribute. Until then set the number of offsets to 0. Asm->OutStreamer->AddComment("Offset entry count"); Asm->emitInt32(0); - Asm->OutStreamer->EmitLabel(Holder.getLoclistsTableBaseSym()); + Asm->OutStreamer->EmitLabel(DebugLocs.getSym()); return TableEnd; } -// Emit locations into the .debug_loc/.debug_rnglists section. 
-void DwarfDebug::emitDebugLoc() { - if (DebugLocs.getLists().empty()) - return; +template +static void emitRangeList( + DwarfDebug &DD, AsmPrinter *Asm, MCSymbol *Sym, const Ranges &R, + const DwarfCompileUnit &CU, unsigned BaseAddressx, unsigned OffsetPair, + unsigned StartxLength, unsigned EndOfList, + StringRef (*StringifyEnum)(unsigned), + bool ShouldUseBaseAddress, + PayloadEmitter EmitPayload) { - bool IsLocLists = getDwarfVersion() >= 5; - MCSymbol *TableEnd = nullptr; - if (IsLocLists) { - Asm->OutStreamer->SwitchSection( - Asm->getObjFileLowering().getDwarfLoclistsSection()); - TableEnd = emitLoclistsTableHeader(Asm, useSplitDwarf() ? SkeletonHolder - : InfoHolder); - } else { - Asm->OutStreamer->SwitchSection( - Asm->getObjFileLowering().getDwarfLocSection()); - } + auto Size = Asm->MAI->getCodePointerSize(); + bool UseDwarf5 = DD.getDwarfVersion() >= 5; - unsigned char Size = Asm->MAI->getCodePointerSize(); - for (const auto &List : DebugLocs.getLists()) { - Asm->OutStreamer->EmitLabel(List.Label); + // Emit our symbol so we can find the beginning of the range. + Asm->OutStreamer->EmitLabel(Sym); - const DwarfCompileUnit *CU = List.CU; - const MCSymbol *Base = CU->getBaseAddress(); - for (const auto &Entry : DebugLocs.getEntries(List)) { + // Gather all the ranges that apply to the same section so they can share + // a base address entry. + MapVector> SectionRanges; + + for (const auto &Range : R) + SectionRanges[&Range.Begin->getSection()].push_back(&Range); + + const MCSymbol *CUBase = CU.getBaseAddress(); + bool BaseIsSet = false; + for (const auto &P : SectionRanges) { + auto *Base = CUBase; + if (!Base && ShouldUseBaseAddress) { + const MCSymbol *Begin = P.second.front()->Begin; + const MCSymbol *NewBase = DD.getSectionLabel(&Begin->getSection()); + if (!UseDwarf5) { + Base = NewBase; + BaseIsSet = true; + Asm->OutStreamer->EmitIntValue(-1, Size); + Asm->OutStreamer->AddComment(" base address"); + Asm->OutStreamer->EmitSymbolValue(Base, Size); + } else if (NewBase != Begin || P.second.size() > 1) { + // Only use a base address if + // * the existing pool address doesn't match (NewBase != Begin) + // * or, there's more than one entry to share the base address + Base = NewBase; + BaseIsSet = true; + Asm->OutStreamer->AddComment(StringifyEnum(BaseAddressx)); + Asm->emitInt8(BaseAddressx); + Asm->OutStreamer->AddComment(" base address index"); + Asm->EmitULEB128(DD.getAddressPool().getIndex(Base)); + } + } else if (BaseIsSet && !UseDwarf5) { + BaseIsSet = false; + assert(!Base); + Asm->OutStreamer->EmitIntValue(-1, Size); + Asm->OutStreamer->EmitIntValue(0, Size); + } + + for (const auto *RS : P.second) { + const MCSymbol *Begin = RS->Begin; + const MCSymbol *End = RS->End; + assert(Begin && "Range without a begin symbol?"); + assert(End && "Range without an end symbol?"); if (Base) { - // Set up the range. This range is relative to the entry point of the - // compile unit. This is a hard coded 0 for low_pc when we're emitting - // ranges, or the DW_AT_low_pc on the compile unit otherwise. - if (IsLocLists) { - Asm->OutStreamer->AddComment("DW_LLE_offset_pair"); - Asm->OutStreamer->EmitIntValue(dwarf::DW_LLE_offset_pair, 1); + if (UseDwarf5) { + // Emit offset_pair when we have a base. 
+ Asm->OutStreamer->AddComment(StringifyEnum(OffsetPair)); + Asm->emitInt8(OffsetPair); Asm->OutStreamer->AddComment(" starting offset"); - Asm->EmitLabelDifferenceAsULEB128(Entry.BeginSym, Base); + Asm->EmitLabelDifferenceAsULEB128(Begin, Base); Asm->OutStreamer->AddComment(" ending offset"); - Asm->EmitLabelDifferenceAsULEB128(Entry.EndSym, Base); + Asm->EmitLabelDifferenceAsULEB128(End, Base); } else { - Asm->EmitLabelDifference(Entry.BeginSym, Base, Size); - Asm->EmitLabelDifference(Entry.EndSym, Base, Size); + Asm->EmitLabelDifference(Begin, Base, Size); + Asm->EmitLabelDifference(End, Base, Size); } - - emitDebugLocEntryLocation(Entry, CU); - continue; - } - - // We have no base address. - if (IsLocLists) { - // TODO: Use DW_LLE_base_addressx + DW_LLE_offset_pair, or - // DW_LLE_startx_length in case if there is only a single range. - // That should reduce the size of the debug data emited. - // For now just use the DW_LLE_startx_length for all cases. - Asm->OutStreamer->AddComment("DW_LLE_startx_length"); - Asm->emitInt8(dwarf::DW_LLE_startx_length); - Asm->OutStreamer->AddComment(" start idx"); - Asm->EmitULEB128(AddrPool.getIndex(Entry.BeginSym)); + } else if (UseDwarf5) { + Asm->OutStreamer->AddComment(StringifyEnum(StartxLength)); + Asm->emitInt8(StartxLength); + Asm->OutStreamer->AddComment(" start index"); + Asm->EmitULEB128(DD.getAddressPool().getIndex(Begin)); Asm->OutStreamer->AddComment(" length"); - Asm->EmitLabelDifferenceAsULEB128(Entry.EndSym, Entry.BeginSym); + Asm->EmitLabelDifferenceAsULEB128(End, Begin); } else { - Asm->OutStreamer->EmitSymbolValue(Entry.BeginSym, Size); - Asm->OutStreamer->EmitSymbolValue(Entry.EndSym, Size); + Asm->OutStreamer->EmitSymbolValue(Begin, Size); + Asm->OutStreamer->EmitSymbolValue(End, Size); } - - emitDebugLocEntryLocation(Entry, CU); + EmitPayload(*RS); } + } - if (IsLocLists) { - // .debug_loclists section ends with DW_LLE_end_of_list. - Asm->OutStreamer->AddComment("DW_LLE_end_of_list"); - Asm->OutStreamer->EmitIntValue(dwarf::DW_LLE_end_of_list, 1); - } else { - // Terminate the .debug_loc list with two 0 values. - Asm->OutStreamer->EmitIntValue(0, Size); - Asm->OutStreamer->EmitIntValue(0, Size); - } + if (UseDwarf5) { + Asm->OutStreamer->AddComment(StringifyEnum(EndOfList)); + Asm->emitInt8(EndOfList); + } else { + // Terminate the list with two 0 values. + Asm->OutStreamer->EmitIntValue(0, Size); + Asm->OutStreamer->EmitIntValue(0, Size); } +} + +static void emitLocList(DwarfDebug &DD, AsmPrinter *Asm, const DebugLocStream::List &List) { + emitRangeList( + DD, Asm, List.Label, DD.getDebugLocs().getEntries(List), *List.CU, + dwarf::DW_LLE_base_addressx, dwarf::DW_LLE_offset_pair, + dwarf::DW_LLE_startx_length, dwarf::DW_LLE_end_of_list, + llvm::dwarf::LocListEncodingString, + /* ShouldUseBaseAddress */ true, + [&](const DebugLocStream::Entry &E) { + DD.emitDebugLocEntryLocation(E, List.CU); + }); +} + +// Emit locations into the .debug_loc/.debug_rnglists section. 
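
The unified emitRangeList above only pays for a base-address entry when it can actually be shared. A conceptual sketch of that decision, with illustrative names and plain addresses instead of MCSymbols (not the LLVM API), assuming the per-section range vector is non-empty:

#include <cstdint>
#include <vector>

struct RangeEntry { uint64_t Begin, End; };

enum class EntryForm { OffsetPair, StartxLength };

// DWARF v5 case: offset_pair is used whenever a base address is in effect
// (either the CU base or a freshly emitted base_addressx); otherwise each
// range is emitted on its own as startx_length.
EntryForm chooseForm(bool HaveCUBase, uint64_t PoolBase,
                     const std::vector<RangeEntry> &SectionRanges) {
  if (HaveCUBase)
    return EntryForm::OffsetPair; // offsets relative to the CU base
  bool BaseWorthwhile =
      PoolBase != SectionRanges.front().Begin || SectionRanges.size() > 1;
  return BaseWorthwhile ? EntryForm::OffsetPair    // base_addressx + offset_pair
                        : EntryForm::StartxLength; // single startx_length entry
}
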
+void DwarfDebug::emitDebugLoc() { + if (DebugLocs.getLists().empty()) + return; + + MCSymbol *TableEnd = nullptr; + if (getDwarfVersion() >= 5) { + Asm->OutStreamer->SwitchSection( + Asm->getObjFileLowering().getDwarfLoclistsSection()); + TableEnd = emitLoclistsTableHeader(Asm, *this); + } else { + Asm->OutStreamer->SwitchSection( + Asm->getObjFileLowering().getDwarfLocSection()); + } + + for (const auto &List : DebugLocs.getLists()) + emitLocList(*this, Asm, List); if (TableEnd) Asm->OutStreamer->EmitLabel(TableEnd); @@ -2232,9 +2466,9 @@ void DwarfDebug::emitDebugLocDWO() { // Ideally/in v5, this could use SectionLabels to reuse existing addresses // in the address pool to minimize object size/relocations. Asm->emitInt8(dwarf::DW_LLE_startx_length); - unsigned idx = AddrPool.getIndex(Entry.BeginSym); + unsigned idx = AddrPool.getIndex(Entry.Begin); Asm->EmitULEB128(idx); - Asm->EmitLabelDifference(Entry.EndSym, Entry.BeginSym, 4); + Asm->EmitLabelDifference(Entry.End, Entry.Begin, 4); emitDebugLocEntryLocation(Entry, List.CU); } @@ -2360,7 +2594,7 @@ void DwarfDebug::emitDebugARanges() { // 7.20 in the Dwarf specs requires the table to be aligned to a tuple. unsigned Padding = - OffsetToAlignment(sizeof(int32_t) + ContentSize, TupleSize); + offsetToAlignment(sizeof(int32_t) + ContentSize, Align(TupleSize)); ContentSize += Padding; ContentSize += (List.size() + 1) * TupleSize; @@ -2405,93 +2639,13 @@ void DwarfDebug::emitDebugARanges() { /// Emit a single range list. We handle both DWARF v5 and earlier. static void emitRangeList(DwarfDebug &DD, AsmPrinter *Asm, const RangeSpanList &List) { - - auto DwarfVersion = DD.getDwarfVersion(); - // Emit our symbol so we can find the beginning of the range. - Asm->OutStreamer->EmitLabel(List.getSym()); - // Gather all the ranges that apply to the same section so they can share - // a base address entry. - MapVector> SectionRanges; - // Size for our labels. - auto Size = Asm->MAI->getCodePointerSize(); - - for (const RangeSpan &Range : List.getRanges()) - SectionRanges[&Range.getStart()->getSection()].push_back(&Range); - - const DwarfCompileUnit &CU = List.getCU(); - const MCSymbol *CUBase = CU.getBaseAddress(); - bool BaseIsSet = false; - for (const auto &P : SectionRanges) { - // Don't bother with a base address entry if there's only one range in - // this section in this range list - for example ranges for a CU will - // usually consist of single regions from each of many sections - // (-ffunction-sections, or just C++ inline functions) except under LTO - // or optnone where there may be holes in a single CU's section - // contributions. - auto *Base = CUBase; - if (!Base && (P.second.size() > 1 || DwarfVersion < 5) && - (CU.getCUNode()->getRangesBaseAddress() || DwarfVersion >= 5)) { - BaseIsSet = true; - // FIXME/use care: This may not be a useful base address if it's not - // the lowest address/range in this object. 
- Base = P.second.front()->getStart(); - if (DwarfVersion >= 5) { - Base = DD.getSectionLabel(&Base->getSection()); - Asm->OutStreamer->AddComment("DW_RLE_base_addressx"); - Asm->OutStreamer->EmitIntValue(dwarf::DW_RLE_base_addressx, 1); - Asm->OutStreamer->AddComment(" base address index"); - Asm->EmitULEB128(DD.getAddressPool().getIndex(Base)); - } else { - Asm->OutStreamer->EmitIntValue(-1, Size); - Asm->OutStreamer->AddComment(" base address"); - Asm->OutStreamer->EmitSymbolValue(Base, Size); - } - } else if (BaseIsSet && DwarfVersion < 5) { - BaseIsSet = false; - assert(!Base); - Asm->OutStreamer->EmitIntValue(-1, Size); - Asm->OutStreamer->EmitIntValue(0, Size); - } - - for (const auto *RS : P.second) { - const MCSymbol *Begin = RS->getStart(); - const MCSymbol *End = RS->getEnd(); - assert(Begin && "Range without a begin symbol?"); - assert(End && "Range without an end symbol?"); - if (Base) { - if (DwarfVersion >= 5) { - // Emit DW_RLE_offset_pair when we have a base. - Asm->OutStreamer->AddComment("DW_RLE_offset_pair"); - Asm->OutStreamer->EmitIntValue(dwarf::DW_RLE_offset_pair, 1); - Asm->OutStreamer->AddComment(" starting offset"); - Asm->EmitLabelDifferenceAsULEB128(Begin, Base); - Asm->OutStreamer->AddComment(" ending offset"); - Asm->EmitLabelDifferenceAsULEB128(End, Base); - } else { - Asm->EmitLabelDifference(Begin, Base, Size); - Asm->EmitLabelDifference(End, Base, Size); - } - } else if (DwarfVersion >= 5) { - Asm->OutStreamer->AddComment("DW_RLE_startx_length"); - Asm->OutStreamer->EmitIntValue(dwarf::DW_RLE_startx_length, 1); - Asm->OutStreamer->AddComment(" start index"); - Asm->EmitULEB128(DD.getAddressPool().getIndex(Begin)); - Asm->OutStreamer->AddComment(" length"); - Asm->EmitLabelDifferenceAsULEB128(End, Begin); - } else { - Asm->OutStreamer->EmitSymbolValue(Begin, Size); - Asm->OutStreamer->EmitSymbolValue(End, Size); - } - } - } - if (DwarfVersion >= 5) { - Asm->OutStreamer->AddComment("DW_RLE_end_of_list"); - Asm->OutStreamer->EmitIntValue(dwarf::DW_RLE_end_of_list, 1); - } else { - // Terminate the list with two 0 values. 
- Asm->OutStreamer->EmitIntValue(0, Size); - Asm->OutStreamer->EmitIntValue(0, Size); - } + emitRangeList(DD, Asm, List.getSym(), List.getRanges(), List.getCU(), + dwarf::DW_RLE_base_addressx, dwarf::DW_RLE_offset_pair, + dwarf::DW_RLE_startx_length, dwarf::DW_RLE_end_of_list, + llvm::dwarf::RangeListEncodingString, + List.getCU().getCUNode()->getRangesBaseAddress() || + DD.getDwarfVersion() >= 5, + [](auto) {}); } static void emitDebugRangesImpl(DwarfDebug &DD, AsmPrinter *Asm, @@ -2637,7 +2791,7 @@ void DwarfDebug::initSkeletonUnit(const DwarfUnit &U, DIE &Die, DwarfCompileUnit &DwarfDebug::constructSkeletonCU(const DwarfCompileUnit &CU) { - auto OwnedUnit = llvm::make_unique( + auto OwnedUnit = std::make_unique( CU.getUniqueID(), CU.getCUNode(), Asm, this, &SkeletonHolder); DwarfCompileUnit &NewCU = *OwnedUnit; NewCU.setSection(Asm->getObjFileLowering().getDwarfInfoSection()); @@ -2737,7 +2891,7 @@ void DwarfDebug::addDwarfTypeUnitType(DwarfCompileUnit &CU, bool TopLevelType = TypeUnitsUnderConstruction.empty(); AddrPool.resetUsedFlag(); - auto OwnedUnit = llvm::make_unique(CU, Asm, this, &InfoHolder, + auto OwnedUnit = std::make_unique(CU, Asm, this, &InfoHolder, getDwoLineTable(CU)); DwarfTypeUnit &NewTU = *OwnedUnit; DIE &UnitDie = NewTU.getUnitDie(); @@ -2879,10 +3033,6 @@ uint16_t DwarfDebug::getDwarfVersion() const { return Asm->OutStreamer->getContext().getDwarfVersion(); } -void DwarfDebug::addSectionLabel(const MCSymbol *Sym) { - SectionLabels.insert(std::make_pair(&Sym->getSection(), Sym)); -} - const MCSymbol *DwarfDebug::getSectionLabel(const MCSection *S) { return SectionLabels.find(S)->second; } diff --git a/lib/CodeGen/AsmPrinter/DwarfDebug.h b/lib/CodeGen/AsmPrinter/DwarfDebug.h index 3ac474e2bdda..c8c511f67c2a 100644 --- a/lib/CodeGen/AsmPrinter/DwarfDebug.h +++ b/lib/CodeGen/AsmPrinter/DwarfDebug.h @@ -153,7 +153,7 @@ public: assert(!ValueLoc && "Already initialized?"); assert(!Value.getExpression()->isFragment() && "Fragments not supported."); - ValueLoc = llvm::make_unique(Value); + ValueLoc = std::make_unique(Value); if (auto *E = ValueLoc->getExpression()) if (E->getNumElements()) FrameIndexExprs.push_back({0, E}); @@ -216,7 +216,6 @@ public: return !FrameIndexExprs.empty(); } - bool isBlockByrefVariable() const; const DIType *getType() const; static bool classof(const DbgEntity *N) { @@ -254,6 +253,25 @@ public: } }; +/// Used for tracking debug info about call site parameters. +class DbgCallSiteParam { +private: + unsigned Register; ///< Parameter register at the callee entry point. + DbgValueLoc Value; ///< Corresponding location for the parameter value at + ///< the call site. +public: + DbgCallSiteParam(unsigned Reg, DbgValueLoc Val) + : Register(Reg), Value(Val) { + assert(Reg && "Parameter register cannot be undef"); + } + + unsigned getRegister() const { return Register; } + DbgValueLoc getValue() const { return Value; } +}; + +/// Collection used for storing debug call site parameters. +using ParamSet = SmallVector; + /// Helper used to pair up a symbol and its DWARF compile unit. 
struct SymbolCU { SymbolCU(DwarfCompileUnit *CU, const MCSymbol *Sym) : Sym(Sym), CU(CU) {} diff --git a/lib/CodeGen/AsmPrinter/DwarfExpression.cpp b/lib/CodeGen/AsmPrinter/DwarfExpression.cpp index 2858afaa1cf1..1c5a244d7c5d 100644 --- a/lib/CodeGen/AsmPrinter/DwarfExpression.cpp +++ b/lib/CodeGen/AsmPrinter/DwarfExpression.cpp @@ -15,6 +15,7 @@ #include "llvm/ADT/APInt.h" #include "llvm/ADT/SmallBitVector.h" #include "llvm/BinaryFormat/Dwarf.h" +#include "llvm/CodeGen/Register.h" #include "llvm/CodeGen/TargetRegisterInfo.h" #include "llvm/IR/DebugInfoMetadata.h" #include "llvm/Support/ErrorHandling.h" @@ -97,7 +98,7 @@ void DwarfExpression::addAnd(unsigned Mask) { bool DwarfExpression::addMachineReg(const TargetRegisterInfo &TRI, unsigned MachineReg, unsigned MaxSize) { - if (!TRI.isPhysicalRegister(MachineReg)) { + if (!llvm::Register::isPhysicalRegister(MachineReg)) { if (isFrameRegister(TRI, MachineReg)) { DwarfRegs.push_back({-1, 0, nullptr}); return true; @@ -241,15 +242,22 @@ bool DwarfExpression::addMachineRegExpression(const TargetRegisterInfo &TRI, return false; } - // Handle simple register locations. - if (!isMemoryLocation() && !HasComplexExpression) { + // Handle simple register locations. If we are supposed to emit + // a call site parameter expression and if that expression is just a register + // location, emit it with addBReg and offset 0, because we should emit a DWARF + // expression representing a value, rather than a location. + if (!isMemoryLocation() && !HasComplexExpression && + (!isParameterValue() || isEntryValue())) { for (auto &Reg : DwarfRegs) { if (Reg.DwarfRegNo >= 0) addReg(Reg.DwarfRegNo, Reg.Comment); addOpPiece(Reg.Size); } - if (isEntryValue() && DwarfVersion >= 4) + if (isEntryValue()) + finalizeEntryValue(); + + if (isEntryValue() && !isParameterValue() && DwarfVersion >= 4) emitOp(dwarf::DW_OP_stack_value); DwarfRegs.clear(); @@ -275,19 +283,27 @@ bool DwarfExpression::addMachineRegExpression(const TargetRegisterInfo &TRI, // Pattern-match combinations for which more efficient representations exist. // [Reg, DW_OP_plus_uconst, Offset] --> [DW_OP_breg, Offset]. if (Op && (Op->getOp() == dwarf::DW_OP_plus_uconst)) { - SignedOffset = Op->getArg(0); - ExprCursor.take(); + uint64_t Offset = Op->getArg(0); + uint64_t IntMax = static_cast(std::numeric_limits::max()); + if (Offset <= IntMax) { + SignedOffset = Offset; + ExprCursor.take(); + } } // [Reg, DW_OP_constu, Offset, DW_OP_plus] --> [DW_OP_breg, Offset] // [Reg, DW_OP_constu, Offset, DW_OP_minus] --> [DW_OP_breg,-Offset] // If Reg is a subregister we need to mask it out before subtracting. if (Op && Op->getOp() == dwarf::DW_OP_constu) { + uint64_t Offset = Op->getArg(0); + uint64_t IntMax = static_cast(std::numeric_limits::max()); auto N = ExprCursor.peekNext(); - if (N && (N->getOp() == dwarf::DW_OP_plus || - (N->getOp() == dwarf::DW_OP_minus && !SubRegisterSizeInBits))) { - int Offset = Op->getArg(0); - SignedOffset = (N->getOp() == dwarf::DW_OP_minus) ? 
-Offset : Offset; + if (N && N->getOp() == dwarf::DW_OP_plus && Offset <= IntMax) { + SignedOffset = Offset; + ExprCursor.consume(2); + } else if (N && N->getOp() == dwarf::DW_OP_minus && + !SubRegisterSizeInBits && Offset <= IntMax + 1) { + SignedOffset = -static_cast(Offset); ExprCursor.consume(2); } } @@ -300,17 +316,34 @@ bool DwarfExpression::addMachineRegExpression(const TargetRegisterInfo &TRI, return true; } -void DwarfExpression::addEntryValueExpression(DIExpressionCursor &ExprCursor) { +void DwarfExpression::beginEntryValueExpression( + DIExpressionCursor &ExprCursor) { auto Op = ExprCursor.take(); - assert(Op && Op->getOp() == dwarf::DW_OP_entry_value); + (void)Op; + assert(Op && Op->getOp() == dwarf::DW_OP_LLVM_entry_value); assert(!isMemoryLocation() && "We don't support entry values of memory locations yet"); + assert(!IsEmittingEntryValue && "Already emitting entry value?"); + assert(Op->getArg(0) == 1 && + "Can currently only emit entry values covering a single operation"); - if (DwarfVersion >= 5) - emitOp(dwarf::DW_OP_entry_value); - else - emitOp(dwarf::DW_OP_GNU_entry_value); - emitUnsigned(Op->getArg(0)); + emitOp(CU.getDwarf5OrGNULocationAtom(dwarf::DW_OP_entry_value)); + IsEmittingEntryValue = true; + enableTemporaryBuffer(); +} + +void DwarfExpression::finalizeEntryValue() { + assert(IsEmittingEntryValue && "Entry value not open?"); + disableTemporaryBuffer(); + + // Emit the entry value's size operand. + unsigned Size = getTemporaryBufferSize(); + emitUnsigned(Size); + + // Emit the entry value's DWARF block operand. + commitTemporaryBuffer(); + + IsEmittingEntryValue = false; } /// Assuming a well-formed expression, match "DW_OP_deref* DW_OP_LLVM_fragment?". @@ -340,7 +373,17 @@ void DwarfExpression::addExpression(DIExpressionCursor &&ExprCursor, while (ExprCursor) { auto Op = ExprCursor.take(); - switch (Op->getOp()) { + uint64_t OpNum = Op->getOp(); + + if (OpNum >= dwarf::DW_OP_reg0 && OpNum <= dwarf::DW_OP_reg31) { + emitOp(OpNum); + continue; + } else if (OpNum >= dwarf::DW_OP_breg0 && OpNum <= dwarf::DW_OP_breg31) { + addBReg(OpNum - dwarf::DW_OP_breg0, Op->getArg(0)); + continue; + } + + switch (OpNum) { case dwarf::DW_OP_LLVM_fragment: { unsigned SizeInBits = Op->getArg(1); unsigned FragmentOffset = Op->getArg(0); @@ -389,10 +432,13 @@ void DwarfExpression::addExpression(DIExpressionCursor &&ExprCursor, case dwarf::DW_OP_lit0: case dwarf::DW_OP_not: case dwarf::DW_OP_dup: - emitOp(Op->getOp()); + emitOp(OpNum); break; case dwarf::DW_OP_deref: assert(!isRegisterLocation()); + // For more detailed explanation see llvm.org/PR43343. + assert(!isParameterValue() && "Parameter entry values should not be " + "dereferenced due to safety reasons."); if (!isMemoryLocation() && ::isMemoryLocation(ExprCursor)) // Turning this into a memory location description makes the deref // implicit. @@ -458,12 +504,21 @@ void DwarfExpression::addExpression(DIExpressionCursor &&ExprCursor, case dwarf::DW_OP_LLVM_tag_offset: TagOffset = Op->getArg(0); break; + case dwarf::DW_OP_regx: + emitOp(dwarf::DW_OP_regx); + emitUnsigned(Op->getArg(0)); + break; + case dwarf::DW_OP_bregx: + emitOp(dwarf::DW_OP_bregx); + emitUnsigned(Op->getArg(0)); + emitSigned(Op->getArg(1)); + break; default: llvm_unreachable("unhandled opcode found in expression"); } } - if (isImplicitLocation()) + if (isImplicitLocation() && !isParameterValue()) // Turn this into an implicit location description. 
addStackValue(); } diff --git a/lib/CodeGen/AsmPrinter/DwarfExpression.h b/lib/CodeGen/AsmPrinter/DwarfExpression.h index ec2ef6e575f7..1ad46669f9b2 100644 --- a/lib/CodeGen/AsmPrinter/DwarfExpression.h +++ b/lib/CodeGen/AsmPrinter/DwarfExpression.h @@ -13,6 +13,7 @@ #ifndef LLVM_LIB_CODEGEN_ASMPRINTER_DWARFEXPRESSION_H #define LLVM_LIB_CODEGEN_ASMPRINTER_DWARFEXPRESSION_H +#include "ByteStreamer.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/None.h" #include "llvm/ADT/Optional.h" @@ -26,7 +27,6 @@ namespace llvm { class AsmPrinter; class APInt; -class ByteStreamer; class DwarfCompileUnit; class DIELoc; class TargetRegisterInfo; @@ -95,6 +95,13 @@ public: /// Base class containing the logic for constructing DWARF expressions /// independently of whether they are emitted into a DIE or into a .debug_loc /// entry. +/// +/// Some DWARF operations, e.g. DW_OP_entry_value, need to calculate the size +/// of a succeeding DWARF block before the latter is emitted to the output. +/// To handle such cases, data can conditionally be emitted to a temporary +/// buffer, which can later on be committed to the main output. The size of the +/// temporary buffer is queryable, allowing for the size of the data to be +/// emitted before the data is committed. class DwarfExpression { protected: /// Holds information about all subregisters comprising a register location. @@ -104,6 +111,9 @@ protected: const char *Comment; }; + /// Whether we are currently emitting an entry value operation. + bool IsEmittingEntryValue = false; + DwarfCompileUnit &CU; /// The register location, if any. @@ -120,7 +130,7 @@ protected: enum { Unknown = 0, Register, Memory, Implicit }; /// The flags of location description being produced. - enum { EntryValue = 1 }; + enum { EntryValue = 1, CallSiteParamValue }; unsigned LocationKind : 3; unsigned LocationFlags : 2; @@ -147,6 +157,10 @@ public: return LocationFlags & EntryValue; } + bool isParameterValue() { + return LocationFlags & CallSiteParamValue; + } + Optional TagOffset; protected: @@ -174,6 +188,22 @@ protected: virtual void emitBaseTypeRef(uint64_t Idx) = 0; + /// Start emitting data to the temporary buffer. The data stored in the + /// temporary buffer can be committed to the main output using + /// commitTemporaryBuffer(). + virtual void enableTemporaryBuffer() = 0; + + /// Disable emission to the temporary buffer. This does not commit data + /// in the temporary buffer to the main output. + virtual void disableTemporaryBuffer() = 0; + + /// Return the emitted size, in number of bytes, for the data stored in the + /// temporary buffer. + virtual unsigned getTemporaryBufferSize() = 0; + + /// Commit the data stored in the temporary buffer to the main output. + virtual void commitTemporaryBuffer() = 0; + /// Emit a normalized unsigned constant. void emitConstu(uint64_t Value); @@ -233,6 +263,10 @@ protected: /// expression. See PR21176 for more details. void addStackValue(); + /// Finalize an entry value by emitting its size operand, and committing the + /// DWARF block which has been emitted to the temporary buffer. + void finalizeEntryValue(); + ~DwarfExpression() = default; public: @@ -264,6 +298,11 @@ public: LocationFlags |= EntryValue; } + /// Lock this down to become a call site parameter location. + void setCallSiteParamValueFlag() { + LocationFlags |= CallSiteParamValue; + } + /// Emit a machine register location. 
As an optimization this may also consume /// the prefix of a DwarfExpression if a more efficient representation for /// combining the register location and the first operation exists. @@ -278,8 +317,11 @@ public: DIExpressionCursor &Expr, unsigned MachineReg, unsigned FragmentOffsetInBits = 0); - /// Emit entry value dwarf operation. - void addEntryValueExpression(DIExpressionCursor &ExprCursor); + /// Begin emission of an entry value dwarf operation. The entry value's + /// first operand is the size of the DWARF block (its second operand), + /// which needs to be calculated at time of emission, so we don't emit + /// any operands here. + void beginEntryValueExpression(DIExpressionCursor &ExprCursor); /// Emit all remaining operations in the DIExpressionCursor. /// @@ -299,31 +341,62 @@ public: /// DwarfExpression implementation for .debug_loc entries. class DebugLocDwarfExpression final : public DwarfExpression { - ByteStreamer &BS; + + struct TempBuffer { + SmallString<32> Bytes; + std::vector Comments; + BufferByteStreamer BS; + + TempBuffer(bool GenerateComments) : BS(Bytes, Comments, GenerateComments) {} + }; + + std::unique_ptr TmpBuf; + BufferByteStreamer &OutBS; + bool IsBuffering = false; + + /// Return the byte streamer that currently is being emitted to. + ByteStreamer &getActiveStreamer() { return IsBuffering ? TmpBuf->BS : OutBS; } void emitOp(uint8_t Op, const char *Comment = nullptr) override; void emitSigned(int64_t Value) override; void emitUnsigned(uint64_t Value) override; void emitData1(uint8_t Value) override; void emitBaseTypeRef(uint64_t Idx) override; + + void enableTemporaryBuffer() override; + void disableTemporaryBuffer() override; + unsigned getTemporaryBufferSize() override; + void commitTemporaryBuffer() override; + bool isFrameRegister(const TargetRegisterInfo &TRI, unsigned MachineReg) override; - public: - DebugLocDwarfExpression(unsigned DwarfVersion, ByteStreamer &BS, DwarfCompileUnit &CU) - : DwarfExpression(DwarfVersion, CU), BS(BS) {} + DebugLocDwarfExpression(unsigned DwarfVersion, BufferByteStreamer &BS, + DwarfCompileUnit &CU) + : DwarfExpression(DwarfVersion, CU), OutBS(BS) {} }; /// DwarfExpression implementation for singular DW_AT_location. class DIEDwarfExpression final : public DwarfExpression { -const AsmPrinter &AP; - DIELoc &DIE; + const AsmPrinter &AP; + DIELoc &OutDIE; + DIELoc TmpDIE; + bool IsBuffering = false; + + /// Return the DIE that currently is being emitted to. + DIELoc &getActiveDIE() { return IsBuffering ? TmpDIE : OutDIE; } void emitOp(uint8_t Op, const char *Comment = nullptr) override; void emitSigned(int64_t Value) override; void emitUnsigned(uint64_t Value) override; void emitData1(uint8_t Value) override; void emitBaseTypeRef(uint64_t Idx) override; + + void enableTemporaryBuffer() override; + void disableTemporaryBuffer() override; + unsigned getTemporaryBufferSize() override; + void commitTemporaryBuffer() override; + bool isFrameRegister(const TargetRegisterInfo &TRI, unsigned MachineReg) override; public: @@ -331,7 +404,7 @@ public: DIELoc *finalize() { DwarfExpression::finalize(); - return &DIE; + return &OutDIE; } }; diff --git a/lib/CodeGen/AsmPrinter/DwarfFile.h b/lib/CodeGen/AsmPrinter/DwarfFile.h index 244678ce9dc1..35fa51fb24c4 100644 --- a/lib/CodeGen/AsmPrinter/DwarfFile.h +++ b/lib/CodeGen/AsmPrinter/DwarfFile.h @@ -32,15 +32,9 @@ class LexicalScope; class MCSection; // Data structure to hold a range for range lists. 
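
The DwarfExpression changes above revolve around one detail of the encoding: DW_OP_entry_value is followed by the size of its operand block, so that block has to be staged in a side buffer and only committed once its length is known. A minimal, self-contained sketch of that buffer-then-commit pattern, using a hypothetical emitter rather than the LLVM classes:

#include <cstdint>
#include <vector>

struct TinyEmitter {
  std::vector<uint8_t> Out;  // main output
  std::vector<uint8_t> Tmp;  // temporary buffer
  bool Buffering = false;

  void emitByte(uint8_t B) { (Buffering ? Tmp : Out).push_back(B); }

  // Standard ULEB128 encoding.
  void emitULEB(uint64_t V) {
    do {
      uint8_t B = V & 0x7f;
      V >>= 7;
      emitByte(V ? (B | 0x80) : B);
    } while (V);
  }

  void emitEntryValue(const std::vector<uint8_t> &BlockOps,
                      uint8_t EntryValueOp) {
    Buffering = true;
    for (uint8_t Op : BlockOps)  // stage the block to learn its size
      emitByte(Op);
    Buffering = false;
    emitByte(EntryValueOp);      // e.g. DW_OP_entry_value or the GNU analog
    emitULEB(Tmp.size());        // the size operand must precede the block
    Out.insert(Out.end(), Tmp.begin(), Tmp.end());
    Tmp.clear();
  }
};
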
-class RangeSpan { -public: - RangeSpan(MCSymbol *S, MCSymbol *E) : Start(S), End(E) {} - const MCSymbol *getStart() const { return Start; } - const MCSymbol *getEnd() const { return End; } - void setEnd(const MCSymbol *E) { End = E; } - -private: - const MCSymbol *Start, *End; +struct RangeSpan { + const MCSymbol *Begin; + const MCSymbol *End; }; class RangeSpanList { @@ -86,10 +80,6 @@ class DwarfFile { /// The table is shared by all units. MCSymbol *RnglistsTableBaseSym = nullptr; - /// DWARF v5: The symbol that designates the base of the locations list table. - /// The table is shared by all units. - MCSymbol *LoclistsTableBaseSym = nullptr; - /// The variables of a lexical scope. struct ScopeVars { /// We need to sort Args by ArgNo and check for duplicates. This could also @@ -167,9 +157,6 @@ public: MCSymbol *getRnglistsTableBaseSym() const { return RnglistsTableBaseSym; } void setRnglistsTableBaseSym(MCSymbol *Sym) { RnglistsTableBaseSym = Sym; } - MCSymbol *getLoclistsTableBaseSym() const { return LoclistsTableBaseSym; } - void setLoclistsTableBaseSym(MCSymbol *Sym) { LoclistsTableBaseSym = Sym; } - /// \returns false if the variable was merged with a previous one. bool addScopeVariable(LexicalScope *LS, DbgVariable *Var); diff --git a/lib/CodeGen/AsmPrinter/DwarfUnit.cpp b/lib/CodeGen/AsmPrinter/DwarfUnit.cpp index 991ab94b50ab..37c68c085792 100644 --- a/lib/CodeGen/AsmPrinter/DwarfUnit.cpp +++ b/lib/CodeGen/AsmPrinter/DwarfUnit.cpp @@ -47,31 +47,42 @@ using namespace llvm; #define DEBUG_TYPE "dwarfdebug" DIEDwarfExpression::DIEDwarfExpression(const AsmPrinter &AP, - DwarfCompileUnit &CU, - DIELoc &DIE) - : DwarfExpression(AP.getDwarfVersion(), CU), AP(AP), - DIE(DIE) {} + DwarfCompileUnit &CU, DIELoc &DIE) + : DwarfExpression(AP.getDwarfVersion(), CU), AP(AP), OutDIE(DIE) {} void DIEDwarfExpression::emitOp(uint8_t Op, const char* Comment) { - CU.addUInt(DIE, dwarf::DW_FORM_data1, Op); + CU.addUInt(getActiveDIE(), dwarf::DW_FORM_data1, Op); } void DIEDwarfExpression::emitSigned(int64_t Value) { - CU.addSInt(DIE, dwarf::DW_FORM_sdata, Value); + CU.addSInt(getActiveDIE(), dwarf::DW_FORM_sdata, Value); } void DIEDwarfExpression::emitUnsigned(uint64_t Value) { - CU.addUInt(DIE, dwarf::DW_FORM_udata, Value); + CU.addUInt(getActiveDIE(), dwarf::DW_FORM_udata, Value); } void DIEDwarfExpression::emitData1(uint8_t Value) { - CU.addUInt(DIE, dwarf::DW_FORM_data1, Value); + CU.addUInt(getActiveDIE(), dwarf::DW_FORM_data1, Value); } void DIEDwarfExpression::emitBaseTypeRef(uint64_t Idx) { - CU.addBaseTypeRef(DIE, Idx); + CU.addBaseTypeRef(getActiveDIE(), Idx); } +void DIEDwarfExpression::enableTemporaryBuffer() { + assert(!IsBuffering && "Already buffering?"); + IsBuffering = true; +} + +void DIEDwarfExpression::disableTemporaryBuffer() { IsBuffering = false; } + +unsigned DIEDwarfExpression::getTemporaryBufferSize() { + return TmpDIE.ComputeSize(&AP); +} + +void DIEDwarfExpression::commitTemporaryBuffer() { OutDIE.takeValues(TmpDIE); } + bool DIEDwarfExpression::isFrameRegister(const TargetRegisterInfo &TRI, unsigned MachineReg) { return MachineReg == TRI.getFrameRegister(*AP.MF); @@ -205,6 +216,10 @@ void DwarfUnit::insertDIE(const DINode *Desc, DIE *D) { MDNodeToDieMap.insert(std::make_pair(Desc, D)); } +void DwarfUnit::insertDIE(DIE *D) { + MDNodeToDieMap.insert(std::make_pair(nullptr, D)); +} + void DwarfUnit::addFlag(DIE &Die, dwarf::Attribute Attribute) { if (DD->getDwarfVersion() >= 4) Die.addValue(DIEValueAllocator, Attribute, dwarf::DW_FORM_flag_present, @@ -718,7 +733,7 @@ 
std::string DwarfUnit::getParentContextString(const DIScope *Context) const { return ""; // FIXME: Decide whether to implement this for non-C++ languages. - if (getLanguage() != dwarf::DW_LANG_C_plus_plus) + if (!dwarf::isCPlusPlus((dwarf::SourceLanguage)getLanguage())) return ""; std::string CS; @@ -942,6 +957,9 @@ void DwarfUnit::constructTypeDIE(DIE &Buffer, const DICompositeType *CTy) { if (CTy->isAppleBlockExtension()) addFlag(Buffer, dwarf::DW_AT_APPLE_block); + if (CTy->getExportSymbols()) + addFlag(Buffer, dwarf::DW_AT_export_symbols); + // This is outside the DWARF spec, but GDB expects a DW_AT_containing_type // inside C++ composite types to point to the base class with the vtable. // Rust uses DW_AT_containing_type to link a vtable to the type @@ -1696,15 +1714,6 @@ void DwarfUnit::addRnglistsBase() { TLOF.getDwarfRnglistsSection()->getBeginSymbol()); } -void DwarfUnit::addLoclistsBase() { - assert(DD->getDwarfVersion() >= 5 && - "DW_AT_loclists_base requires DWARF version 5 or later"); - const TargetLoweringObjectFile &TLOF = Asm->getObjFileLowering(); - addSectionLabel(getUnitDie(), dwarf::DW_AT_loclists_base, - DU->getLoclistsTableBaseSym(), - TLOF.getDwarfLoclistsSection()->getBeginSymbol()); -} - void DwarfTypeUnit::finishNonUnitTypeDIE(DIE& D, const DICompositeType *CTy) { addFlag(D, dwarf::DW_AT_declaration); StringRef Name = CTy->getName(); diff --git a/lib/CodeGen/AsmPrinter/DwarfUnit.h b/lib/CodeGen/AsmPrinter/DwarfUnit.h index 56c934a35ae8..46c52a1faf4b 100644 --- a/lib/CodeGen/AsmPrinter/DwarfUnit.h +++ b/lib/CodeGen/AsmPrinter/DwarfUnit.h @@ -127,6 +127,8 @@ public: /// the mappings are kept in DwarfDebug. void insertDIE(const DINode *Desc, DIE *D); + void insertDIE(DIE *D); + /// Add a flag that is true to the DIE. void addFlag(DIE &Die, dwarf::Attribute Attribute); @@ -214,15 +216,6 @@ public: /// Add thrown types. void addThrownTypes(DIE &Die, DINodeArray ThrownTypes); - // FIXME: Should be reformulated in terms of addComplexAddress. - /// Start with the address based on the location provided, and generate the - /// DWARF information necessary to find the actual Block variable (navigating - /// the Block struct) based on the starting location. Add the DWARF - /// information to the die. Obsolete, please use addComplexAddress instead. - void addBlockByrefAddress(const DbgVariable &DV, DIE &Die, - dwarf::Attribute Attribute, - const MachineLocation &Location); - /// Add a new type attribute to the specified entity. /// /// This takes and attribute parameter because DW_AT_friend attributes are @@ -279,9 +272,6 @@ public: /// Add the DW_AT_rnglists_base attribute to the unit DIE. void addRnglistsBase(); - /// Add the DW_AT_loclists_base attribute to the unit DIE. - void addLoclistsBase(); - virtual DwarfCompileUnit &getCU() = 0; void constructTypeDIE(DIE &Buffer, const DICompositeType *CTy); diff --git a/lib/CodeGen/AsmPrinter/EHStreamer.cpp b/lib/CodeGen/AsmPrinter/EHStreamer.cpp index 99e3687b36b8..31dfaaac836e 100644 --- a/lib/CodeGen/AsmPrinter/EHStreamer.cpp +++ b/lib/CodeGen/AsmPrinter/EHStreamer.cpp @@ -426,7 +426,7 @@ MCSymbol *EHStreamer::emitExceptionTable() { // EHABI). In this case LSDASection will be NULL. if (LSDASection) Asm->OutStreamer->SwitchSection(LSDASection); - Asm->EmitAlignment(2); + Asm->EmitAlignment(Align(4)); // Emit the LSDA. 
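
The EmitAlignment hunks in this and the following GC-printer files are mechanical: the old argument was a log2 exponent, while the new llvm::Align carries the byte value itself, so 2 becomes Align(4) and 3 becomes Align(8). A trivial sanity check of that mapping (plain C++, not the LLVM helpers):

#include <cassert>

int main() {
  unsigned OldLog2Small = 2, OldLog2Large = 3;
  assert((1u << OldLog2Small) == 4); // EmitAlignment(2) -> EmitAlignment(Align(4))
  assert((1u << OldLog2Large) == 8); // EmitAlignment(3) -> EmitAlignment(Align(8))
  return 0;
}
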
MCSymbol *GCCETSym = @@ -602,11 +602,11 @@ MCSymbol *EHStreamer::emitExceptionTable() { } if (HaveTTData) { - Asm->EmitAlignment(2); + Asm->EmitAlignment(Align(4)); emitTypeInfos(TTypeEncoding, TTBaseLabel); } - Asm->EmitAlignment(2); + Asm->EmitAlignment(Align(4)); return GCCETSym; } diff --git a/lib/CodeGen/AsmPrinter/ErlangGCPrinter.cpp b/lib/CodeGen/AsmPrinter/ErlangGCPrinter.cpp index 39392b79e960..3849644d1584 100644 --- a/lib/CodeGen/AsmPrinter/ErlangGCPrinter.cpp +++ b/lib/CodeGen/AsmPrinter/ErlangGCPrinter.cpp @@ -72,7 +72,7 @@ void ErlangGCPrinter::finishAssembly(Module &M, GCModuleInfo &Info, **/ // Align to address width. - AP.EmitAlignment(IntPtrSize == 4 ? 2 : 3); + AP.EmitAlignment(IntPtrSize == 4 ? Align(4) : Align(8)); // Emit PointCount. OS.AddComment("safe point count"); diff --git a/lib/CodeGen/AsmPrinter/OcamlGCPrinter.cpp b/lib/CodeGen/AsmPrinter/OcamlGCPrinter.cpp index 3145cc90dc73..b4eda5fa8c58 100644 --- a/lib/CodeGen/AsmPrinter/OcamlGCPrinter.cpp +++ b/lib/CodeGen/AsmPrinter/OcamlGCPrinter.cpp @@ -129,7 +129,7 @@ void OcamlGCMetadataPrinter::finishAssembly(Module &M, GCModuleInfo &Info, report_fatal_error(" Too much descriptor for ocaml GC"); } AP.emitInt16(NumDescriptors); - AP.EmitAlignment(IntPtrSize == 4 ? 2 : 3); + AP.EmitAlignment(IntPtrSize == 4 ? Align(4) : Align(8)); for (GCModuleInfo::FuncInfoVec::iterator I = Info.funcinfo_begin(), IE = Info.funcinfo_end(); @@ -180,7 +180,7 @@ void OcamlGCMetadataPrinter::finishAssembly(Module &M, GCModuleInfo &Info, AP.emitInt16(K->StackOffset); } - AP.EmitAlignment(IntPtrSize == 4 ? 2 : 3); + AP.EmitAlignment(IntPtrSize == 4 ? Align(4) : Align(8)); } } } diff --git a/lib/CodeGen/AsmPrinter/WinException.cpp b/lib/CodeGen/AsmPrinter/WinException.cpp index 155e91ce61a1..0398675577cd 100644 --- a/lib/CodeGen/AsmPrinter/WinException.cpp +++ b/lib/CodeGen/AsmPrinter/WinException.cpp @@ -982,8 +982,7 @@ void WinException::emitExceptHandlerTable(const MachineFunction *MF) { OS.EmitValueToAlignment(4); OS.EmitLabel(LSDALabel); - const Function *Per = - dyn_cast(F.getPersonalityFn()->stripPointerCasts()); + const auto *Per = cast(F.getPersonalityFn()->stripPointerCasts()); StringRef PerName = Per->getName(); int BaseState = -1; if (PerName == "_except_handler4") { diff --git a/lib/CodeGen/AtomicExpandPass.cpp b/lib/CodeGen/AtomicExpandPass.cpp index dc7eaf6a5fe7..27b298dcf6af 100644 --- a/lib/CodeGen/AtomicExpandPass.cpp +++ b/lib/CodeGen/AtomicExpandPass.cpp @@ -382,7 +382,7 @@ LoadInst *AtomicExpand::convertAtomicLoadToIntegerType(LoadInst *LI) { Value *NewAddr = Builder.CreateBitCast(Addr, PT); auto *NewLI = Builder.CreateLoad(NewTy, NewAddr); - NewLI->setAlignment(LI->getAlignment()); + NewLI->setAlignment(MaybeAlign(LI->getAlignment())); NewLI->setVolatile(LI->isVolatile()); NewLI->setAtomic(LI->getOrdering(), LI->getSyncScopeID()); LLVM_DEBUG(dbgs() << "Replaced " << *LI << " with " << *NewLI << "\n"); @@ -469,7 +469,7 @@ StoreInst *AtomicExpand::convertAtomicStoreToIntegerType(StoreInst *SI) { Value *NewAddr = Builder.CreateBitCast(Addr, PT); StoreInst *NewSI = Builder.CreateStore(NewVal, NewAddr); - NewSI->setAlignment(SI->getAlignment()); + NewSI->setAlignment(MaybeAlign(SI->getAlignment())); NewSI->setVolatile(SI->isVolatile()); NewSI->setAtomic(SI->getOrdering(), SI->getSyncScopeID()); LLVM_DEBUG(dbgs() << "Replaced " << *SI << " with " << *NewSI << "\n"); @@ -1376,7 +1376,7 @@ Value *AtomicExpand::insertRMWCmpXchgLoop( Builder.SetInsertPoint(BB); LoadInst *InitLoaded = Builder.CreateLoad(ResultTy, Addr); // Atomics 
require at least natural alignment. - InitLoaded->setAlignment(ResultTy->getPrimitiveSizeInBits() / 8); + InitLoaded->setAlignment(MaybeAlign(ResultTy->getPrimitiveSizeInBits() / 8)); Builder.CreateBr(LoopBB); // Start the main loop block now that we've taken care of the preliminaries. @@ -1711,7 +1711,7 @@ bool AtomicExpand::expandAtomicOpToLibcall( // 'expected' argument, if present. if (CASExpected) { AllocaCASExpected = AllocaBuilder.CreateAlloca(CASExpected->getType()); - AllocaCASExpected->setAlignment(AllocaAlignment); + AllocaCASExpected->setAlignment(MaybeAlign(AllocaAlignment)); unsigned AllocaAS = AllocaCASExpected->getType()->getPointerAddressSpace(); AllocaCASExpected_i8 = @@ -1730,7 +1730,7 @@ bool AtomicExpand::expandAtomicOpToLibcall( Args.push_back(IntValue); } else { AllocaValue = AllocaBuilder.CreateAlloca(ValueOperand->getType()); - AllocaValue->setAlignment(AllocaAlignment); + AllocaValue->setAlignment(MaybeAlign(AllocaAlignment)); AllocaValue_i8 = Builder.CreateBitCast(AllocaValue, Type::getInt8PtrTy(Ctx)); Builder.CreateLifetimeStart(AllocaValue_i8, SizeVal64); @@ -1742,7 +1742,7 @@ bool AtomicExpand::expandAtomicOpToLibcall( // 'ret' argument. if (!CASExpected && HasResult && !UseSizedLibcall) { AllocaResult = AllocaBuilder.CreateAlloca(I->getType()); - AllocaResult->setAlignment(AllocaAlignment); + AllocaResult->setAlignment(MaybeAlign(AllocaAlignment)); unsigned AllocaAS = AllocaResult->getType()->getPointerAddressSpace(); AllocaResult_i8 = Builder.CreateBitCast(AllocaResult, Type::getInt8PtrTy(Ctx, AllocaAS)); diff --git a/lib/CodeGen/BranchFolding.cpp b/lib/CodeGen/BranchFolding.cpp index fb54b5d6c8d8..455916eeb82f 100644 --- a/lib/CodeGen/BranchFolding.cpp +++ b/lib/CodeGen/BranchFolding.cpp @@ -129,9 +129,10 @@ bool BranchFolderPass::runOnMachineFunction(MachineFunction &MF) { getAnalysis()); BranchFolder Folder(EnableTailMerge, /*CommonHoist=*/true, MBBFreqInfo, getAnalysis()); - return Folder.OptimizeFunction(MF, MF.getSubtarget().getInstrInfo(), - MF.getSubtarget().getRegisterInfo(), - getAnalysisIfAvailable()); + auto *MMIWP = getAnalysisIfAvailable(); + return Folder.OptimizeFunction( + MF, MF.getSubtarget().getInstrInfo(), MF.getSubtarget().getRegisterInfo(), + MMIWP ? &MMIWP->getMMI() : nullptr); } BranchFolder::BranchFolder(bool defaultEnableTailMerge, bool CommonHoist, @@ -161,6 +162,11 @@ void BranchFolder::RemoveDeadBlock(MachineBasicBlock *MBB) { // Avoid matching if this pointer gets reused. TriedMerging.erase(MBB); + // Update call site info. + std::for_each(MBB->begin(), MBB->end(), [MF](const MachineInstr &MI) { + if (MI.isCall(MachineInstr::IgnoreBundle)) + MF->eraseCallSiteInfo(&MI); + }); // Remove the block. MF->erase(MBB); EHScopeMembership.erase(MBB); @@ -1306,6 +1312,8 @@ static bool IsBranchOnlyBlock(MachineBasicBlock *MBB) { /// result in infinite loops. static bool IsBetterFallthrough(MachineBasicBlock *MBB1, MachineBasicBlock *MBB2) { + assert(MBB1 && MBB2 && "Unknown MachineBasicBlock"); + // Right now, we use a simple heuristic. If MBB2 ends with a call, and // MBB1 doesn't, we prefer to fall through into MBB1. 
This allows us to // optimize branches that branch to either a return block or an assert block @@ -1843,7 +1851,7 @@ static MachineBasicBlock *findFalseBlock(MachineBasicBlock *BB, template static void addRegAndItsAliases(unsigned Reg, const TargetRegisterInfo *TRI, Container &Set) { - if (TargetRegisterInfo::isPhysicalRegister(Reg)) { + if (Register::isPhysicalRegister(Reg)) { for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) Set.insert(*AI); } else { @@ -1871,7 +1879,7 @@ MachineBasicBlock::iterator findHoistingInsertPosAndDeps(MachineBasicBlock *MBB, for (const MachineOperand &MO : Loc->operands()) { if (!MO.isReg()) continue; - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); if (!Reg) continue; if (MO.isUse()) { @@ -1909,7 +1917,7 @@ MachineBasicBlock::iterator findHoistingInsertPosAndDeps(MachineBasicBlock *MBB, return Loc; if (!MO.isReg() || MO.isUse()) continue; - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); if (!Reg) continue; if (Uses.count(Reg)) { @@ -1937,14 +1945,14 @@ MachineBasicBlock::iterator findHoistingInsertPosAndDeps(MachineBasicBlock *MBB, for (const MachineOperand &MO : PI->operands()) { if (!MO.isReg()) continue; - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); if (!Reg) continue; if (MO.isUse()) { addRegAndItsAliases(Reg, TRI, Uses); } else { if (Uses.erase(Reg)) { - if (TargetRegisterInfo::isPhysicalRegister(Reg)) { + if (Register::isPhysicalRegister(Reg)) { for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs) Uses.erase(*SubRegs); // Use sub-registers to be conservative } @@ -2010,7 +2018,7 @@ bool BranchFolder::HoistCommonCodeInSuccs(MachineBasicBlock *MBB) { } if (!MO.isReg()) continue; - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); if (!Reg) continue; if (MO.isDef()) { @@ -2060,13 +2068,13 @@ bool BranchFolder::HoistCommonCodeInSuccs(MachineBasicBlock *MBB) { for (const MachineOperand &MO : TIB->operands()) { if (!MO.isReg() || !MO.isUse() || !MO.isKill()) continue; - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); if (!Reg) continue; if (!AllDefsSet.count(Reg)) { continue; } - if (TargetRegisterInfo::isPhysicalRegister(Reg)) { + if (Register::isPhysicalRegister(Reg)) { for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) ActiveDefsSet.erase(*AI); } else { @@ -2078,8 +2086,8 @@ bool BranchFolder::HoistCommonCodeInSuccs(MachineBasicBlock *MBB) { for (const MachineOperand &MO : TIB->operands()) { if (!MO.isReg() || !MO.isDef() || MO.isDead()) continue; - unsigned Reg = MO.getReg(); - if (!Reg || TargetRegisterInfo::isVirtualRegister(Reg)) + Register Reg = MO.getReg(); + if (!Reg || Register::isVirtualRegister(Reg)) continue; addRegAndItsAliases(Reg, TRI, ActiveDefsSet); addRegAndItsAliases(Reg, TRI, AllDefsSet); diff --git a/lib/CodeGen/BranchRelaxation.cpp b/lib/CodeGen/BranchRelaxation.cpp index 3ad6266d4f35..6efdc9efa968 100644 --- a/lib/CodeGen/BranchRelaxation.cpp +++ b/lib/CodeGen/BranchRelaxation.cpp @@ -64,19 +64,18 @@ class BranchRelaxation : public MachineFunctionPass { /// Compute the offset immediately following this block. \p MBB is the next /// block. 
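Note on the alignment changes in this and the surrounding hunks: the patch moves from log2-encoded alignments (EmitAlignment(2) meaning 1 << 2 bytes) to the explicit llvm::Align byte type (EmitAlignment(Align(4))), and postOffset() below is rewritten in the same style. As a rough illustration of the padding arithmetic involved, here is a minimal standalone sketch; the helper name is an assumption for illustration, not part of this patch:

#include <cstdint>

// paddingTo: bytes of padding needed to raise Offset to the next multiple of
// Alignment (a power of two given in bytes, not as a log2 exponent).
static uint64_t paddingTo(uint64_t Offset, uint64_t Alignment) {
  return (Alignment - (Offset % Alignment)) % Alignment;
}
// e.g. paddingTo(10, 4) == 2, which is what offsetToAlignment(PO, Align(4))
// is expected to contribute in the rewritten postOffset() below.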
unsigned postOffset(const MachineBasicBlock &MBB) const { - unsigned PO = Offset + Size; - unsigned Align = MBB.getAlignment(); - if (Align == 0) + const unsigned PO = Offset + Size; + const Align Alignment = MBB.getAlignment(); + if (Alignment == 1) return PO; - unsigned AlignAmt = 1 << Align; - unsigned ParentAlign = MBB.getParent()->getAlignment(); - if (Align <= ParentAlign) - return PO + OffsetToAlignment(PO, AlignAmt); + const Align ParentAlign = MBB.getParent()->getAlignment(); + if (Alignment <= ParentAlign) + return PO + offsetToAlignment(PO, Alignment); // The alignment of this MBB is larger than the function's alignment, so we // can't tell whether or not it will insert nops. Assume that it will. - return PO + AlignAmt + OffsetToAlignment(PO, AlignAmt); + return PO + Alignment.value() + offsetToAlignment(PO, Alignment); } }; @@ -128,9 +127,8 @@ void BranchRelaxation::verify() { #ifndef NDEBUG unsigned PrevNum = MF->begin()->getNumber(); for (MachineBasicBlock &MBB : *MF) { - unsigned Align = MBB.getAlignment(); - unsigned Num = MBB.getNumber(); - assert(BlockInfo[Num].Offset % (1u << Align) == 0); + const unsigned Num = MBB.getNumber(); + assert(isAligned(MBB.getAlignment(), BlockInfo[Num].Offset)); assert(!Num || BlockInfo[PrevNum].postOffset(MBB) <= BlockInfo[Num].Offset); assert(BlockInfo[Num].Size == computeBlockSize(MBB)); PrevNum = Num; @@ -143,7 +141,7 @@ void BranchRelaxation::verify() { LLVM_DUMP_METHOD void BranchRelaxation::dumpBBs() { for (auto &MBB : *MF) { const BasicBlockInfo &BBI = BlockInfo[MBB.getNumber()]; - dbgs() << format("%bb.%u\toffset=%08x\t", MBB.getNumber(), BBI.Offset) + dbgs() << format("%%bb.%u\toffset=%08x\t", MBB.getNumber(), BBI.Offset) << format("size=%#x\n", BBI.Size); } } diff --git a/lib/CodeGen/BreakFalseDeps.cpp b/lib/CodeGen/BreakFalseDeps.cpp index cc4b2caa9bed..709164e5f178 100644 --- a/lib/CodeGen/BreakFalseDeps.cpp +++ b/lib/CodeGen/BreakFalseDeps.cpp @@ -9,12 +9,11 @@ /// \file Break False Dependency pass. /// /// Some instructions have false dependencies which cause unnecessary stalls. -/// For exmaple, instructions that only write part of a register, and implicitly -/// need to read the other parts of the register. This may cause unwanted +/// For example, instructions may write part of a register and implicitly +/// need to read the other parts of the register. This may cause unwanted /// stalls preventing otherwise unrelated instructions from executing in /// parallel in an out-of-order CPU. -/// This pass is aimed at identifying and avoiding these depepndencies when -/// possible. +/// This pass is aimed at identifying and avoiding these dependencies. // //===----------------------------------------------------------------------===// @@ -24,6 +23,7 @@ #include "llvm/CodeGen/RegisterClassInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/TargetInstrInfo.h" +#include "llvm/Support/Debug.h" using namespace llvm; @@ -109,7 +109,7 @@ bool BreakFalseDeps::pickBestRegisterForUndef(MachineInstr *MI, unsigned OpIdx, MachineOperand &MO = MI->getOperand(OpIdx); assert(MO.isUndef() && "Expected undef machine operand"); - unsigned OriginalReg = MO.getReg(); + Register OriginalReg = MO.getReg(); // Update only undef operands that have reg units that are mapped to one root. 
for (MCRegUnitIterator Unit(OriginalReg, TRI); Unit.isValid(); ++Unit) { @@ -162,7 +162,7 @@ bool BreakFalseDeps::pickBestRegisterForUndef(MachineInstr *MI, unsigned OpIdx, bool BreakFalseDeps::shouldBreakDependence(MachineInstr *MI, unsigned OpIdx, unsigned Pref) { - unsigned reg = MI->getOperand(OpIdx).getReg(); + Register reg = MI->getOperand(OpIdx).getReg(); unsigned Clearance = RDA->getClearance(MI, reg); LLVM_DEBUG(dbgs() << "Clearance: " << Clearance << ", want " << Pref); @@ -178,6 +178,7 @@ void BreakFalseDeps::processDefs(MachineInstr *MI) { assert(!MI->isDebugInstr() && "Won't process debug values"); // Break dependence on undef uses. Do this before updating LiveRegs below. + // This can remove a false dependence with no additional instructions. unsigned OpNum; unsigned Pref = TII->getUndefRegClearance(*MI, OpNum, TRI); if (Pref) { @@ -189,6 +190,11 @@ void BreakFalseDeps::processDefs(MachineInstr *MI) { UndefReads.push_back(std::make_pair(MI, OpNum)); } + // The code below allows the target to create a new instruction to break the + // dependence. That opposes the goal of minimizing size, so bail out now. + if (MF->getFunction().hasMinSize()) + return; + const MCInstrDesc &MCID = MI->getDesc(); for (unsigned i = 0, e = MI->isVariadic() ? MI->getNumOperands() : MCID.getNumDefs(); @@ -209,6 +215,11 @@ void BreakFalseDeps::processUndefReads(MachineBasicBlock *MBB) { if (UndefReads.empty()) return; + // The code below allows the target to create a new instruction to break the + // dependence. That opposes the goal of minimizing size, so bail out now. + if (MF->getFunction().hasMinSize()) + return; + // Collect this block's live out register units. LiveRegSet.init(*TRI); // We do not need to care about pristine registers as they are just preserved diff --git a/lib/CodeGen/CalcSpillWeights.cpp b/lib/CodeGen/CalcSpillWeights.cpp index 7164fdfb7886..bf97aaee3665 100644 --- a/lib/CodeGen/CalcSpillWeights.cpp +++ b/lib/CodeGen/CalcSpillWeights.cpp @@ -40,7 +40,7 @@ void llvm::calculateSpillWeightsAndHints(LiveIntervals &LIS, MachineRegisterInfo &MRI = MF.getRegInfo(); VirtRegAuxInfo VRAI(MF, LIS, VRM, MLI, MBFI, norm); for (unsigned i = 0, e = MRI.getNumVirtRegs(); i != e; ++i) { - unsigned Reg = TargetRegisterInfo::index2VirtReg(i); + unsigned Reg = Register::index2VirtReg(i); if (MRI.reg_nodbg_empty(Reg)) continue; VRAI.calculateSpillWeightAndHint(LIS.getInterval(Reg)); @@ -48,10 +48,11 @@ void llvm::calculateSpillWeightsAndHints(LiveIntervals &LIS, } // Return the preferred allocation register for reg, given a COPY instruction. -static unsigned copyHint(const MachineInstr *mi, unsigned reg, +static Register copyHint(const MachineInstr *mi, unsigned reg, const TargetRegisterInfo &tri, const MachineRegisterInfo &mri) { - unsigned sub, hreg, hsub; + unsigned sub, hsub; + Register hreg; if (mi->getOperand(0).getReg() == reg) { sub = mi->getOperand(0).getSubReg(); hreg = mi->getOperand(1).getReg(); @@ -65,11 +66,11 @@ static unsigned copyHint(const MachineInstr *mi, unsigned reg, if (!hreg) return 0; - if (TargetRegisterInfo::isVirtualRegister(hreg)) - return sub == hsub ? hreg : 0; + if (Register::isVirtualRegister(hreg)) + return sub == hsub ? hreg : Register(); const TargetRegisterClass *rc = mri.getRegClass(reg); - unsigned CopiedPReg = (hsub ? tri.getSubReg(hreg, hsub) : hreg); + Register CopiedPReg = (hsub ? 
tri.getSubReg(hreg, hsub) : hreg); if (rc->contains(CopiedPReg)) return CopiedPReg; @@ -112,7 +113,7 @@ static bool isRematerializable(const LiveInterval &LI, // If the original (pre-splitting) registers match this // copy came from a split. - if (!TargetRegisterInfo::isVirtualRegister(Reg) || + if (!Register::isVirtualRegister(Reg) || VRM->getOriginal(Reg) != Original) return false; @@ -243,7 +244,7 @@ float VirtRegAuxInfo::weightCalcHelper(LiveInterval &li, SlotIndex *start, // Get allocation hints from copies. if (!mi->isCopy()) continue; - unsigned hint = copyHint(mi, li.reg, tri, mri); + Register hint = copyHint(mi, li.reg, tri, mri); if (!hint) continue; // Force hweight onto the stack so that x86 doesn't add hidden precision, @@ -251,8 +252,9 @@ float VirtRegAuxInfo::weightCalcHelper(LiveInterval &li, SlotIndex *start, // // FIXME: we probably shouldn't use floats at all. volatile float hweight = Hint[hint] += weight; - if (TargetRegisterInfo::isVirtualRegister(hint) || mri.isAllocatable(hint)) - CopyHints.insert(CopyHint(hint, hweight, tri.isPhysicalRegister(hint))); + if (Register::isVirtualRegister(hint) || mri.isAllocatable(hint)) + CopyHints.insert( + CopyHint(hint, hweight, Register::isPhysicalRegister(hint))); } Hint.clear(); diff --git a/lib/CodeGen/CallingConvLower.cpp b/lib/CodeGen/CallingConvLower.cpp index 497fcb147849..a397039180a4 100644 --- a/lib/CodeGen/CallingConvLower.cpp +++ b/lib/CodeGen/CallingConvLower.cpp @@ -32,7 +32,6 @@ CCState::CCState(CallingConv::ID CC, bool isVarArg, MachineFunction &mf, TRI(*MF.getSubtarget().getRegisterInfo()), Locs(locs), Context(C) { // No stack is used. StackOffset = 0; - MaxStackArgAlign = 1; clearByValRegsInfo(); UsedRegs.resize((TRI.getNumRegs()+31)/32); @@ -41,20 +40,21 @@ CCState::CCState(CallingConv::ID CC, bool isVarArg, MachineFunction &mf, /// Allocate space on the stack large enough to pass an argument by value. /// The size and alignment information of the argument is encoded in /// its parameter attribute. 
-void CCState::HandleByVal(unsigned ValNo, MVT ValVT, - MVT LocVT, CCValAssign::LocInfo LocInfo, - int MinSize, int MinAlign, - ISD::ArgFlagsTy ArgFlags) { - unsigned Align = ArgFlags.getByValAlign(); +void CCState::HandleByVal(unsigned ValNo, MVT ValVT, MVT LocVT, + CCValAssign::LocInfo LocInfo, int MinSize, + int MinAlignment, ISD::ArgFlagsTy ArgFlags) { + Align MinAlign(MinAlignment); + Align Alignment(ArgFlags.getByValAlign()); unsigned Size = ArgFlags.getByValSize(); if (MinSize > (int)Size) Size = MinSize; - if (MinAlign > (int)Align) - Align = MinAlign; - ensureMaxAlignment(Align); - MF.getSubtarget().getTargetLowering()->HandleByVal(this, Size, Align); + if (MinAlign > Alignment) + Alignment = MinAlign; + ensureMaxAlignment(Alignment); + MF.getSubtarget().getTargetLowering()->HandleByVal(this, Size, + Alignment.value()); Size = unsigned(alignTo(Size, MinAlign)); - unsigned Offset = AllocateStack(Size, Align); + unsigned Offset = AllocateStack(Size, Alignment.value()); addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo)); } @@ -90,13 +90,8 @@ CCState::AnalyzeFormalArguments(const SmallVectorImpl &Ins, for (unsigned i = 0; i != NumArgs; ++i) { MVT ArgVT = Ins[i].VT; ISD::ArgFlagsTy ArgFlags = Ins[i].Flags; - if (Fn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, *this)) { -#ifndef NDEBUG - dbgs() << "Formal argument #" << i << " has unhandled type " - << EVT(ArgVT).getEVTString() << '\n'; -#endif - llvm_unreachable(nullptr); - } + if (Fn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, *this)) + report_fatal_error("unable to allocate function argument #" + Twine(i)); } } @@ -122,13 +117,8 @@ void CCState::AnalyzeReturn(const SmallVectorImpl &Outs, for (unsigned i = 0, e = Outs.size(); i != e; ++i) { MVT VT = Outs[i].VT; ISD::ArgFlagsTy ArgFlags = Outs[i].Flags; - if (Fn(i, VT, VT, CCValAssign::Full, ArgFlags, *this)) { -#ifndef NDEBUG - dbgs() << "Return operand #" << i << " has unhandled type " - << EVT(VT).getEVTString() << '\n'; -#endif - llvm_unreachable(nullptr); - } + if (Fn(i, VT, VT, CCValAssign::Full, ArgFlags, *this)) + report_fatal_error("unable to allocate function return #" + Twine(i)); } } @@ -209,7 +199,7 @@ static bool isValueTypeInRegForCC(CallingConv::ID CC, MVT VT) { void CCState::getRemainingRegParmsForType(SmallVectorImpl &Regs, MVT VT, CCAssignFn Fn) { unsigned SavedStackOffset = StackOffset; - unsigned SavedMaxStackArgAlign = MaxStackArgAlign; + Align SavedMaxStackArgAlign = MaxStackArgAlign; unsigned NumLocs = Locs.size(); // Set the 'inreg' flag if it is used for this calling convention. 
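For orientation, a rough model of the byval slot computation in the rewritten HandleByVal above; the names are illustrative assumptions and MinAlign is assumed to be nonzero:

#include <algorithm>
#include <cstdint>

// Honour both the byval attribute's size/alignment and the calling
// convention's minimums, then round the size up so following slots stay
// aligned, as AllocateStack(Size, Alignment) expects.
struct ByValSlot { uint64_t Size; uint64_t Alignment; };

static ByValSlot computeByValSlot(uint64_t ByValSize, uint64_t ByValAlign,
                                  uint64_t MinSize, uint64_t MinAlign) {
  uint64_t Size = std::max(ByValSize, MinSize);
  uint64_t Alignment = std::max(ByValAlign, MinAlign);
  Size = (Size + MinAlign - 1) / MinAlign * MinAlign; // alignTo(Size, MinAlign)
  return {Size, Alignment};
}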
diff --git a/lib/CodeGen/CodeGen.cpp b/lib/CodeGen/CodeGen.cpp index c37ed57781d4..ad9525f927e8 100644 --- a/lib/CodeGen/CodeGen.cpp +++ b/lib/CodeGen/CodeGen.cpp @@ -28,6 +28,7 @@ void llvm::initializeCodeGen(PassRegistry &Registry) { initializeDetectDeadLanesPass(Registry); initializeDwarfEHPreparePass(Registry); initializeEarlyIfConverterPass(Registry); + initializeEarlyIfPredicatorPass(Registry); initializeEarlyMachineLICMPass(Registry); initializeEarlyTailDuplicatePass(Registry); initializeExpandMemCmpPassPass(Registry); @@ -53,6 +54,7 @@ void llvm::initializeCodeGen(PassRegistry &Registry) { initializeLocalStackSlotPassPass(Registry); initializeLowerIntrinsicsPass(Registry); initializeMIRCanonicalizerPass(Registry); + initializeMIRNamerPass(Registry); initializeMachineBlockFrequencyInfoPass(Registry); initializeMachineBlockPlacementPass(Registry); initializeMachineBlockPlacementStatsPass(Registry); @@ -63,10 +65,11 @@ void llvm::initializeCodeGen(PassRegistry &Registry) { initializeMachineFunctionPrinterPassPass(Registry); initializeMachineLICMPass(Registry); initializeMachineLoopInfoPass(Registry); - initializeMachineModuleInfoPass(Registry); + initializeMachineModuleInfoWrapperPassPass(Registry); initializeMachineOptimizationRemarkEmitterPassPass(Registry); initializeMachineOutlinerPass(Registry); initializeMachinePipelinerPass(Registry); + initializeModuloScheduleTestPass(Registry); initializeMachinePostDominatorTreePass(Registry); initializeMachineRegionInfoPassPass(Registry); initializeMachineSchedulerPass(Registry); diff --git a/lib/CodeGen/CodeGenPrepare.cpp b/lib/CodeGen/CodeGenPrepare.cpp index 52b4bbea012b..fa4432ea23ec 100644 --- a/lib/CodeGen/CodeGenPrepare.cpp +++ b/lib/CodeGen/CodeGenPrepare.cpp @@ -344,7 +344,7 @@ class TypePromotionTransaction; // Get the DominatorTree, building if necessary. DominatorTree &getDT(Function &F) { if (!DT) - DT = llvm::make_unique(F); + DT = std::make_unique(F); return *DT; } @@ -424,7 +424,7 @@ bool CodeGenPrepare::runOnFunction(Function &F) { TLI = SubtargetInfo->getTargetLowering(); TRI = SubtargetInfo->getRegisterInfo(); } - TLInfo = &getAnalysis().getTLI(); + TLInfo = &getAnalysis().getTLI(F); TTI = &getAnalysis().getTTI(F); LI = &getAnalysis().getLoopInfo(); BPI.reset(new BranchProbabilityInfo(F, *LI)); @@ -1524,7 +1524,7 @@ SinkShiftAndTruncate(BinaryOperator *ShiftI, Instruction *User, ConstantInt *CI, const TargetLowering &TLI, const DataLayout &DL) { BasicBlock *UserBB = User->getParent(); DenseMap InsertedTruncs; - TruncInst *TruncI = dyn_cast(User); + auto *TruncI = cast(User); bool MadeChange = false; for (Value::user_iterator TruncUI = TruncI->user_begin(), @@ -1682,10 +1682,11 @@ static bool OptimizeExtractBits(BinaryOperator *ShiftI, ConstantInt *CI, TheUse = InsertedShift; } - // If we removed all uses, nuke the shift. + // If we removed all uses, or there are none, nuke the shift. if (ShiftI->use_empty()) { salvageDebugInfo(*ShiftI); ShiftI->eraseFromParent(); + MadeChange = true; } return MadeChange; @@ -1811,7 +1812,7 @@ bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool &ModifiedDT) { AllocaInst *AI; if ((AI = dyn_cast(Val)) && AI->getAlignment() < PrefAlign && DL->getTypeAllocSize(AI->getAllocatedType()) >= MinSize + Offset2) - AI->setAlignment(PrefAlign); + AI->setAlignment(MaybeAlign(PrefAlign)); // Global variables can only be aligned if they are defined in this // object (i.e. 
they are uniquely initialized in this object), and // over-aligning global variables that have an explicit section is @@ -1821,7 +1822,7 @@ bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool &ModifiedDT) { GV->getPointerAlignment(*DL) < PrefAlign && DL->getTypeAllocSize(GV->getValueType()) >= MinSize + Offset2) - GV->setAlignment(PrefAlign); + GV->setAlignment(MaybeAlign(PrefAlign)); } // If this is a memcpy (or similar) then we may be able to improve the // alignment @@ -1867,24 +1868,10 @@ bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool &ModifiedDT) { }); return true; } - case Intrinsic::objectsize: { - // Lower all uses of llvm.objectsize.* - Value *RetVal = - lowerObjectSizeCall(II, *DL, TLInfo, /*MustSucceed=*/true); - - resetIteratorIfInvalidatedWhileCalling(BB, [&]() { - replaceAndRecursivelySimplify(CI, RetVal, TLInfo, nullptr); - }); - return true; - } - case Intrinsic::is_constant: { - // If is_constant hasn't folded away yet, lower it to false now. - Constant *RetVal = ConstantInt::get(II->getType(), 0); - resetIteratorIfInvalidatedWhileCalling(BB, [&]() { - replaceAndRecursivelySimplify(CI, RetVal, TLInfo, nullptr); - }); - return true; - } + case Intrinsic::objectsize: + llvm_unreachable("llvm.objectsize.* should have been lowered already"); + case Intrinsic::is_constant: + llvm_unreachable("llvm.is.constant.* should have been lowered already"); case Intrinsic::aarch64_stlxr: case Intrinsic::aarch64_stxr: { ZExtInst *ExtVal = dyn_cast(CI->getArgOperand(0)); @@ -2024,17 +2011,18 @@ bool CodeGenPrepare::dupRetToEnableTailCallOpts(BasicBlock *BB, bool &ModifiedDT /// Only dup the ReturnInst if the CallInst is likely to be emitted as a tail /// call. const Function *F = BB->getParent(); - SmallVector TailCalls; + SmallVector TailCallBBs; if (PN) { for (unsigned I = 0, E = PN->getNumIncomingValues(); I != E; ++I) { // Look through bitcasts. Value *IncomingVal = PN->getIncomingValue(I)->stripPointerCasts(); CallInst *CI = dyn_cast(IncomingVal); + BasicBlock *PredBB = PN->getIncomingBlock(I); // Make sure the phi value is indeed produced by the tail call. - if (CI && CI->hasOneUse() && CI->getParent() == PN->getIncomingBlock(I) && + if (CI && CI->hasOneUse() && CI->getParent() == PredBB && TLI->mayBeEmittedAsTailCall(CI) && attributesPermitTailCall(F, CI, RetI, *TLI)) - TailCalls.push_back(CI); + TailCallBBs.push_back(PredBB); } } else { SmallPtrSet VisitedBBs; @@ -2052,24 +2040,20 @@ bool CodeGenPrepare::dupRetToEnableTailCallOpts(BasicBlock *BB, bool &ModifiedDT CallInst *CI = dyn_cast(&*RI); if (CI && CI->use_empty() && TLI->mayBeEmittedAsTailCall(CI) && attributesPermitTailCall(F, CI, RetI, *TLI)) - TailCalls.push_back(CI); + TailCallBBs.push_back(*PI); } } bool Changed = false; - for (unsigned i = 0, e = TailCalls.size(); i != e; ++i) { - CallInst *CI = TailCalls[i]; - CallSite CS(CI); - + for (auto const &TailCallBB : TailCallBBs) { // Make sure the call instruction is followed by an unconditional branch to // the return block. - BasicBlock *CallBB = CI->getParent(); - BranchInst *BI = dyn_cast(CallBB->getTerminator()); + BranchInst *BI = dyn_cast(TailCallBB->getTerminator()); if (!BI || !BI->isUnconditional() || BI->getSuccessor(0) != BB) continue; - // Duplicate the return into CallBB. - (void)FoldReturnIntoUncondBranch(RetI, BB, CallBB); + // Duplicate the return into TailCallBB. 
+ (void)FoldReturnIntoUncondBranch(RetI, BB, TailCallBB); ModifiedDT = Changed = true; ++NumRetsDup; } @@ -2683,26 +2667,26 @@ private: void TypePromotionTransaction::setOperand(Instruction *Inst, unsigned Idx, Value *NewVal) { - Actions.push_back(llvm::make_unique( + Actions.push_back(std::make_unique( Inst, Idx, NewVal)); } void TypePromotionTransaction::eraseInstruction(Instruction *Inst, Value *NewVal) { Actions.push_back( - llvm::make_unique( + std::make_unique( Inst, RemovedInsts, NewVal)); } void TypePromotionTransaction::replaceAllUsesWith(Instruction *Inst, Value *New) { Actions.push_back( - llvm::make_unique(Inst, New)); + std::make_unique(Inst, New)); } void TypePromotionTransaction::mutateType(Instruction *Inst, Type *NewTy) { Actions.push_back( - llvm::make_unique(Inst, NewTy)); + std::make_unique(Inst, NewTy)); } Value *TypePromotionTransaction::createTrunc(Instruction *Opnd, @@ -2732,7 +2716,7 @@ Value *TypePromotionTransaction::createZExt(Instruction *Inst, void TypePromotionTransaction::moveBefore(Instruction *Inst, Instruction *Before) { Actions.push_back( - llvm::make_unique( + std::make_unique( Inst, Before)); } @@ -3048,7 +3032,7 @@ public: To = dyn_cast(OldReplacement); OldReplacement = Get(From); } - assert(Get(To) == To && "Replacement PHI node is already replaced."); + assert(To && Get(To) == To && "Replacement PHI node is already replaced."); Put(From, To); From->replaceAllUsesWith(To); AllPhiNodes.erase(From); @@ -3334,7 +3318,7 @@ private: // So the values are different and does not match. So we need them to // match. (But we register no more than one match per PHI node, so that // we won't later try to replace them twice.) - if (!MatchedPHIs.insert(FirstPhi).second) + if (MatchedPHIs.insert(FirstPhi).second) Matcher.insert({ FirstPhi, SecondPhi }); // But me must check it. WorkList.push_back({ FirstPhi, SecondPhi }); @@ -3412,11 +3396,10 @@ private: Select->setFalseValue(ST.Get(Map[FalseValue])); } else { // Must be a Phi node then. - PHINode *PHI = cast(V); - auto *CurrentPhi = dyn_cast(Current); + auto *PHI = cast(V); // Fill the Phi node with values from predecessors. for (auto B : predecessors(PHI->getParent())) { - Value *PV = CurrentPhi->getIncomingValueForBlock(B); + Value *PV = cast(Current)->getIncomingValueForBlock(B); assert(Map.find(PV) != Map.end() && "No predecessor Value!"); PHI->addIncoming(ST.Get(Map[PV]), B); } @@ -3785,13 +3768,11 @@ bool TypePromotionHelper::canGetThrough(const Instruction *Inst, // poisoned value regular value // It should be OK since undef covers valid value. 
if (Inst->getOpcode() == Instruction::Shl && Inst->hasOneUse()) { - const Instruction *ExtInst = - dyn_cast(*Inst->user_begin()); + const auto *ExtInst = cast(*Inst->user_begin()); if (ExtInst->hasOneUse()) { - const Instruction *AndInst = - dyn_cast(*ExtInst->user_begin()); + const auto *AndInst = dyn_cast(*ExtInst->user_begin()); if (AndInst && AndInst->getOpcode() == Instruction::And) { - const ConstantInt *Cst = dyn_cast(AndInst->getOperand(1)); + const auto *Cst = dyn_cast(AndInst->getOperand(1)); if (Cst && Cst->getValue().isIntN(Inst->getType()->getIntegerBitWidth())) return true; @@ -4793,8 +4774,8 @@ bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr, << " for " << *MemoryInst << "\n"); if (SunkAddr->getType() != Addr->getType()) SunkAddr = Builder.CreatePointerCast(SunkAddr, Addr->getType()); - } else if (AddrSinkUsingGEPs || - (!AddrSinkUsingGEPs.getNumOccurrences() && TM && TTI->useAA())) { + } else if (AddrSinkUsingGEPs || (!AddrSinkUsingGEPs.getNumOccurrences() && + TM && SubtargetInfo->addrSinkUsingGEPs())) { // By default, we use the GEP-based method when AA is used later. This // prevents new inttoptr/ptrtoint pairs from degrading AA capabilities. LLVM_DEBUG(dbgs() << "CGP: SINKING nonlocal addrmode: " << AddrMode @@ -5816,7 +5797,7 @@ bool CodeGenPrepare::optimizeLoadExt(LoadInst *Load) { return false; IRBuilder<> Builder(Load->getNextNode()); - auto *NewAnd = dyn_cast( + auto *NewAnd = cast( Builder.CreateAnd(Load, ConstantInt::get(Ctx, DemandBits))); // Mark this instruction as "inserted by CGP", so that other // optimizations don't touch it. @@ -6193,35 +6174,49 @@ bool CodeGenPrepare::tryToSinkFreeOperands(Instruction *I) { // OpsToSink can contain multiple uses in a use chain (e.g. // (%u1 with %u1 = shufflevector), (%u2 with %u2 = zext %u1)). The dominating - // uses must come first, which means they are sunk first, temporarily creating - // invalid IR. This will be fixed once their dominated users are sunk and - // updated. + // uses must come first, so we process the ops in reverse order so as to not + // create invalid IR. BasicBlock *TargetBB = I->getParent(); bool Changed = false; SmallVector ToReplace; - for (Use *U : OpsToSink) { + for (Use *U : reverse(OpsToSink)) { auto *UI = cast(U->get()); if (UI->getParent() == TargetBB || isa(UI)) continue; ToReplace.push_back(U); } - SmallPtrSet MaybeDead; + SetVector MaybeDead; + DenseMap NewInstructions; + Instruction *InsertPoint = I; for (Use *U : ToReplace) { auto *UI = cast(U->get()); Instruction *NI = UI->clone(); + NewInstructions[UI] = NI; MaybeDead.insert(UI); LLVM_DEBUG(dbgs() << "Sinking " << *UI << " to user " << *I << "\n"); - NI->insertBefore(I); + NI->insertBefore(InsertPoint); + InsertPoint = NI; InsertedInsts.insert(NI); - U->set(NI); + + // Update the use for the new instruction, making sure that we update the + // sunk instruction uses, if it is part of a chain that has already been + // sunk. + Instruction *OldI = cast(U->getUser()); + if (NewInstructions.count(OldI)) + NewInstructions[OldI]->setOperand(U->getOperandNo(), NI); + else + U->set(NI); Changed = true; } // Remove instructions that are dead after sinking. 
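The tryToSinkFreeOperands change above keeps a NewInstructions map so that, when a whole use chain is sunk, each clone reads from the clone of its operand rather than from the original. A toy sketch of that bookkeeping, not LLVM API and with ownership kept raw for brevity:

#include <map>
#include <vector>

struct ToyInst { std::vector<ToyInst *> Ops; };

// Chain is given oldest-first (defs before their users); the caller owns the
// returned clones.
static ToyInst *cloneChain(const std::vector<ToyInst *> &Chain) {
  std::map<ToyInst *, ToyInst *> CloneOf;
  ToyInst *Last = nullptr;
  for (ToyInst *Old : Chain) {
    ToyInst *New = new ToyInst(*Old);
    for (ToyInst *&Op : New->Ops) {
      auto It = CloneOf.find(Op);
      if (It != CloneOf.end())
        Op = It->second; // rewire to the already-cloned operand
    }
    CloneOf[Old] = New;
    Last = New;
  }
  return Last;
}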
- for (auto *I : MaybeDead) - if (!I->hasNUsesOrMore(1)) + for (auto *I : MaybeDead) { + if (!I->hasNUsesOrMore(1)) { + LLVM_DEBUG(dbgs() << "Removing dead instruction: " << *I << "\n"); I->eraseFromParent(); + } + } return Changed; } @@ -7106,7 +7101,6 @@ bool CodeGenPrepare::optimizeBlock(BasicBlock &BB, bool &ModifiedDT) { for (auto &I : reverse(BB)) { if (makeBitReverse(I, *DL, *TLI)) { MadeBitReverse = MadeChange = true; - ModifiedDT = true; break; } } diff --git a/lib/CodeGen/CriticalAntiDepBreaker.cpp b/lib/CodeGen/CriticalAntiDepBreaker.cpp index 4144c243a341..702e7e244bce 100644 --- a/lib/CodeGen/CriticalAntiDepBreaker.cpp +++ b/lib/CodeGen/CriticalAntiDepBreaker.cpp @@ -187,7 +187,7 @@ void CriticalAntiDepBreaker::PrescanInstruction(MachineInstr &MI) { for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { MachineOperand &MO = MI.getOperand(i); if (!MO.isReg()) continue; - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); if (Reg == 0) continue; const TargetRegisterClass *NewRC = nullptr; @@ -272,7 +272,7 @@ void CriticalAntiDepBreaker::ScanInstruction(MachineInstr &MI, unsigned Count) { } if (!MO.isReg()) continue; - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); if (Reg == 0) continue; if (!MO.isDef()) continue; @@ -303,7 +303,7 @@ void CriticalAntiDepBreaker::ScanInstruction(MachineInstr &MI, unsigned Count) { for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { MachineOperand &MO = MI.getOperand(i); if (!MO.isReg()) continue; - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); if (Reg == 0) continue; if (!MO.isUse()) continue; @@ -457,6 +457,7 @@ BreakAntiDependencies(const std::vector &SUnits, if (!Max || SU->getDepth() + SU->Latency > Max->getDepth() + Max->Latency) Max = SU; } + assert(Max && "Failed to find bottom of the critical path"); #ifndef NDEBUG { @@ -612,7 +613,7 @@ BreakAntiDependencies(const std::vector &SUnits, for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { MachineOperand &MO = MI.getOperand(i); if (!MO.isReg()) continue; - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); if (Reg == 0) continue; if (MO.isUse() && TRI->regsOverlap(AntiDepReg, Reg)) { AntiDepReg = 0; diff --git a/lib/CodeGen/DFAPacketizer.cpp b/lib/CodeGen/DFAPacketizer.cpp index b99be5d7a87c..a169c3cb16b2 100644 --- a/lib/CodeGen/DFAPacketizer.cpp +++ b/lib/CodeGen/DFAPacketizer.cpp @@ -23,6 +23,8 @@ //===----------------------------------------------------------------------===// #include "llvm/CodeGen/DFAPacketizer.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/Analysis/AliasAnalysis.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBundle.h" @@ -71,39 +73,13 @@ static DFAInput getDFAInsnInput(const std::vector &InsnClass) { // -------------------------------------------------------------------- -DFAPacketizer::DFAPacketizer(const InstrItineraryData *I, - const DFAStateInput (*SIT)[2], - const unsigned *SET): - InstrItins(I), DFAStateInputTable(SIT), DFAStateEntryTable(SET) { - // Make sure DFA types are large enough for the number of terms & resources. - static_assert((DFA_MAX_RESTERMS * DFA_MAX_RESOURCES) <= - (8 * sizeof(DFAInput)), - "(DFA_MAX_RESTERMS * DFA_MAX_RESOURCES) too big for DFAInput"); - static_assert( - (DFA_MAX_RESTERMS * DFA_MAX_RESOURCES) <= (8 * sizeof(DFAStateInput)), - "(DFA_MAX_RESTERMS * DFA_MAX_RESOURCES) too big for DFAStateInput"); -} - -// Read the DFA transition table and update CachedTable. 
-// -// Format of the transition tables: -// DFAStateInputTable[][2] = pairs of for all valid -// transitions -// DFAStateEntryTable[i] = Index of the first entry in DFAStateInputTable -// for the ith state -// -void DFAPacketizer::ReadTable(unsigned int state) { - unsigned ThisState = DFAStateEntryTable[state]; - unsigned NextStateInTable = DFAStateEntryTable[state+1]; - // Early exit in case CachedTable has already contains this - // state's transitions. - if (CachedTable.count(UnsignPair(state, DFAStateInputTable[ThisState][0]))) - return; - - for (unsigned i = ThisState; i < NextStateInTable; i++) - CachedTable[UnsignPair(state, DFAStateInputTable[i][0])] = - DFAStateInputTable[i][1]; -} +// Make sure DFA types are large enough for the number of terms & resources. +static_assert((DFA_MAX_RESTERMS * DFA_MAX_RESOURCES) <= + (8 * sizeof(DFAInput)), + "(DFA_MAX_RESTERMS * DFA_MAX_RESOURCES) too big for DFAInput"); +static_assert( + (DFA_MAX_RESTERMS * DFA_MAX_RESOURCES) <= (8 * sizeof(DFAStateInput)), + "(DFA_MAX_RESTERMS * DFA_MAX_RESOURCES) too big for DFAStateInput"); // Return the DFAInput for an instruction class. DFAInput DFAPacketizer::getInsnInput(unsigned InsnClass) { @@ -129,9 +105,7 @@ DFAInput DFAPacketizer::getInsnInput(const std::vector &InsnClass) { bool DFAPacketizer::canReserveResources(const MCInstrDesc *MID) { unsigned InsnClass = MID->getSchedClass(); DFAInput InsnInput = getInsnInput(InsnClass); - UnsignPair StateTrans = UnsignPair(CurrentState, InsnInput); - ReadTable(CurrentState); - return CachedTable.count(StateTrans) != 0; + return A.canAdd(InsnInput); } // Reserve the resources occupied by a MCInstrDesc and change the current @@ -139,10 +113,7 @@ bool DFAPacketizer::canReserveResources(const MCInstrDesc *MID) { void DFAPacketizer::reserveResources(const MCInstrDesc *MID) { unsigned InsnClass = MID->getSchedClass(); DFAInput InsnInput = getInsnInput(InsnClass); - UnsignPair StateTrans = UnsignPair(CurrentState, InsnInput); - ReadTable(CurrentState); - assert(CachedTable.count(StateTrans) != 0); - CurrentState = CachedTable[StateTrans]; + A.add(InsnInput); } // Check if the resources occupied by a machine instruction are available @@ -159,19 +130,33 @@ void DFAPacketizer::reserveResources(MachineInstr &MI) { reserveResources(&MID); } +unsigned DFAPacketizer::getUsedResources(unsigned InstIdx) { + ArrayRef NfaPaths = A.getNfaPaths(); + assert(!NfaPaths.empty() && "Invalid bundle!"); + const NfaPath &RS = NfaPaths.front(); + + // RS stores the cumulative resources used up to and including the I'th + // instruction. The 0th instruction is the base case. + if (InstIdx == 0) + return RS[0]; + // Return the difference between the cumulative resources used by InstIdx and + // its predecessor. + return RS[InstIdx] ^ RS[InstIdx - 1]; +} + namespace llvm { // This class extends ScheduleDAGInstrs and overrides the schedule method // to build the dependence graph. class DefaultVLIWScheduler : public ScheduleDAGInstrs { private: - AliasAnalysis *AA; + AAResults *AA; /// Ordered list of DAG postprocessing steps. std::vector> Mutations; public: DefaultVLIWScheduler(MachineFunction &MF, MachineLoopInfo &MLI, - AliasAnalysis *AA); + AAResults *AA); // Actual scheduling work. 
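The new getUsedResources above relies on each NFA path storing cumulative resource masks. A minimal sketch of that trick, with the path modelled as plain 64-bit masks (an illustrative simplification):

#include <cstdint>
#include <vector>

// Path[i] is the bitmask of all resources consumed by instructions 0..i; since
// each step only adds bits, XOR with the previous entry recovers the resources
// used by instruction i alone.
static uint64_t resourcesUsedBy(const std::vector<uint64_t> &Path, unsigned I) {
  return I == 0 ? Path[0] : Path[I] ^ Path[I - 1];
}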
void schedule() override; @@ -189,7 +174,7 @@ protected: DefaultVLIWScheduler::DefaultVLIWScheduler(MachineFunction &MF, MachineLoopInfo &MLI, - AliasAnalysis *AA) + AAResults *AA) : ScheduleDAGInstrs(MF, &MLI), AA(AA) { CanHandleTerminators = true; } @@ -207,9 +192,10 @@ void DefaultVLIWScheduler::schedule() { } VLIWPacketizerList::VLIWPacketizerList(MachineFunction &mf, - MachineLoopInfo &mli, AliasAnalysis *aa) + MachineLoopInfo &mli, AAResults *aa) : MF(mf), TII(mf.getSubtarget().getInstrInfo()), AA(aa) { ResourceTracker = TII->CreateTargetScheduleState(MF.getSubtarget()); + ResourceTracker->setTrackResources(true); VLIWScheduler = new DefaultVLIWScheduler(MF, mli, AA); } @@ -224,8 +210,11 @@ void VLIWPacketizerList::endPacket(MachineBasicBlock *MBB, LLVM_DEBUG({ if (!CurrentPacketMIs.empty()) { dbgs() << "Finalizing packet:\n"; - for (MachineInstr *MI : CurrentPacketMIs) - dbgs() << " * " << *MI; + unsigned Idx = 0; + for (MachineInstr *MI : CurrentPacketMIs) { + unsigned R = ResourceTracker->getUsedResources(Idx++); + dbgs() << " * [res:0x" << utohexstr(R) << "] " << *MI; + } } }); if (CurrentPacketMIs.size() > 1) { diff --git a/lib/CodeGen/DeadMachineInstructionElim.cpp b/lib/CodeGen/DeadMachineInstructionElim.cpp index 049ce7063307..9a537c859a67 100644 --- a/lib/CodeGen/DeadMachineInstructionElim.cpp +++ b/lib/CodeGen/DeadMachineInstructionElim.cpp @@ -75,8 +75,8 @@ bool DeadMachineInstructionElim::isDead(const MachineInstr *MI) const { for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { const MachineOperand &MO = MI->getOperand(i); if (MO.isReg() && MO.isDef()) { - unsigned Reg = MO.getReg(); - if (TargetRegisterInfo::isPhysicalRegister(Reg)) { + Register Reg = MO.getReg(); + if (Register::isPhysicalRegister(Reg)) { // Don't delete live physreg defs, or any reserved register defs. if (LivePhysRegs.test(Reg) || MRI->isReserved(Reg)) return false; @@ -140,8 +140,8 @@ bool DeadMachineInstructionElim::runOnMachineFunction(MachineFunction &MF) { for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { const MachineOperand &MO = MI->getOperand(i); if (MO.isReg() && MO.isDef()) { - unsigned Reg = MO.getReg(); - if (TargetRegisterInfo::isPhysicalRegister(Reg)) { + Register Reg = MO.getReg(); + if (Register::isPhysicalRegister(Reg)) { // Check the subreg set, not the alias set, because a def // of a super-register may still be partially live after // this def. 
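The hunks in this file, like many others in the patch, replace bare unsigned register numbers and the old TargetRegisterInfo static helpers with the llvm::Register wrapper. A simplified model of that wrapper, for illustration only (the tag bit and details below are assumptions, not the real implementation):

#include <cassert>

class RegisterModel {
  unsigned Reg;
  static const unsigned VirtualBit = 1u << 31;

public:
  RegisterModel(unsigned R = 0) : Reg(R) {}
  operator unsigned() const { return Reg; } // keeps 'unsigned Reg = MO.getReg()' call sites working

  static bool isVirtualRegister(unsigned R) { return (R & VirtualBit) != 0; }
  static bool isPhysicalRegister(unsigned R) { return R != 0 && !isVirtualRegister(R); }
  static unsigned virtReg2Index(unsigned R) {
    assert(isVirtualRegister(R) && "not a virtual register");
    return R & ~VirtualBit;
  }
  static unsigned index2VirtReg(unsigned Index) { return Index | VirtualBit; }
};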
@@ -159,8 +159,8 @@ bool DeadMachineInstructionElim::runOnMachineFunction(MachineFunction &MF) { for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { const MachineOperand &MO = MI->getOperand(i); if (MO.isReg() && MO.isUse()) { - unsigned Reg = MO.getReg(); - if (TargetRegisterInfo::isPhysicalRegister(Reg)) { + Register Reg = MO.getReg(); + if (Register::isPhysicalRegister(Reg)) { for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) LivePhysRegs.set(*AI); } diff --git a/lib/CodeGen/DetectDeadLanes.cpp b/lib/CodeGen/DetectDeadLanes.cpp index fe78acf4d80a..6d5306c1dc0c 100644 --- a/lib/CodeGen/DetectDeadLanes.cpp +++ b/lib/CodeGen/DetectDeadLanes.cpp @@ -154,7 +154,7 @@ static bool isCrossCopy(const MachineRegisterInfo &MRI, const TargetRegisterClass *DstRC, const MachineOperand &MO) { assert(lowersToCopies(MI)); - unsigned SrcReg = MO.getReg(); + Register SrcReg = MO.getReg(); const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg); if (DstRC == SrcRC) return false; @@ -194,8 +194,8 @@ void DetectDeadLanes::addUsedLanesOnOperand(const MachineOperand &MO, LaneBitmask UsedLanes) { if (!MO.readsReg()) return; - unsigned MOReg = MO.getReg(); - if (!TargetRegisterInfo::isVirtualRegister(MOReg)) + Register MOReg = MO.getReg(); + if (!Register::isVirtualRegister(MOReg)) return; unsigned MOSubReg = MO.getSubReg(); @@ -203,7 +203,7 @@ void DetectDeadLanes::addUsedLanesOnOperand(const MachineOperand &MO, UsedLanes = TRI->composeSubRegIndexLaneMask(MOSubReg, UsedLanes); UsedLanes &= MRI->getMaxLaneMaskForVReg(MOReg); - unsigned MORegIdx = TargetRegisterInfo::virtReg2Index(MOReg); + unsigned MORegIdx = Register::virtReg2Index(MOReg); VRegInfo &MORegInfo = VRegInfos[MORegIdx]; LaneBitmask PrevUsedLanes = MORegInfo.UsedLanes; // Any change at all? 
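For context on the DetectDeadLanes hunks that follow, a toy version of the lane-mask reasoning, with LaneBitmask modelled as a plain 32-bit mask (a simplification of the real predicates):

#include <cstdint>

using ToyLaneMask = uint32_t;

// A (sub-register) def is dead when none of the lanes it writes are ever read.
static bool isDeadLaneDef(ToyLaneMask DefinedLanes, ToyLaneMask UsedLanes) {
  return (DefinedLanes & UsedLanes) == 0;
}
// e.g. a def of lanes 0b0011 is dead when UsedLanes == 0b1100.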
@@ -219,7 +219,7 @@ void DetectDeadLanes::addUsedLanesOnOperand(const MachineOperand &MO, void DetectDeadLanes::transferUsedLanesStep(const MachineInstr &MI, LaneBitmask UsedLanes) { for (const MachineOperand &MO : MI.uses()) { - if (!MO.isReg() || !TargetRegisterInfo::isVirtualRegister(MO.getReg())) + if (!MO.isReg() || !Register::isVirtualRegister(MO.getReg())) continue; LaneBitmask UsedOnMO = transferUsedLanes(MI, UsedLanes, MO); addUsedLanesOnOperand(MO, UsedOnMO); @@ -230,8 +230,8 @@ LaneBitmask DetectDeadLanes::transferUsedLanes(const MachineInstr &MI, LaneBitmask UsedLanes, const MachineOperand &MO) const { unsigned OpNum = MI.getOperandNo(&MO); - assert(lowersToCopies(MI) && DefinedByCopy[ - TargetRegisterInfo::virtReg2Index(MI.getOperand(0).getReg())]); + assert(lowersToCopies(MI) && + DefinedByCopy[Register::virtReg2Index(MI.getOperand(0).getReg())]); switch (MI.getOpcode()) { case TargetOpcode::COPY: @@ -250,7 +250,7 @@ LaneBitmask DetectDeadLanes::transferUsedLanes(const MachineInstr &MI, return MO2UsedLanes; const MachineOperand &Def = MI.getOperand(0); - unsigned DefReg = Def.getReg(); + Register DefReg = Def.getReg(); const TargetRegisterClass *RC = MRI->getRegClass(DefReg); LaneBitmask MO1UsedLanes; if (RC->CoveredBySubRegs) @@ -285,10 +285,10 @@ void DetectDeadLanes::transferDefinedLanesStep(const MachineOperand &Use, if (MI.getOpcode() == TargetOpcode::PATCHPOINT) return; const MachineOperand &Def = *MI.defs().begin(); - unsigned DefReg = Def.getReg(); - if (!TargetRegisterInfo::isVirtualRegister(DefReg)) + Register DefReg = Def.getReg(); + if (!Register::isVirtualRegister(DefReg)) return; - unsigned DefRegIdx = TargetRegisterInfo::virtReg2Index(DefReg); + unsigned DefRegIdx = Register::virtReg2Index(DefReg); if (!DefinedByCopy.test(DefRegIdx)) return; @@ -360,7 +360,7 @@ LaneBitmask DetectDeadLanes::determineInitialDefinedLanes(unsigned Reg) { if (lowersToCopies(DefMI)) { // Start optimisatically with no used or defined lanes for copy // instructions. The following dataflow analysis will add more bits. - unsigned RegIdx = TargetRegisterInfo::virtReg2Index(Reg); + unsigned RegIdx = Register::virtReg2Index(Reg); DefinedByCopy.set(RegIdx); PutInWorklist(RegIdx); @@ -377,17 +377,17 @@ LaneBitmask DetectDeadLanes::determineInitialDefinedLanes(unsigned Reg) { for (const MachineOperand &MO : DefMI.uses()) { if (!MO.isReg() || !MO.readsReg()) continue; - unsigned MOReg = MO.getReg(); + Register MOReg = MO.getReg(); if (!MOReg) continue; LaneBitmask MODefinedLanes; - if (TargetRegisterInfo::isPhysicalRegister(MOReg)) { + if (Register::isPhysicalRegister(MOReg)) { MODefinedLanes = LaneBitmask::getAll(); } else if (isCrossCopy(*MRI, DefMI, DefRC, MO)) { MODefinedLanes = LaneBitmask::getAll(); } else { - assert(TargetRegisterInfo::isVirtualRegister(MOReg)); + assert(Register::isVirtualRegister(MOReg)); if (MRI->hasOneDef(MOReg)) { const MachineOperand &MODef = *MRI->def_begin(MOReg); const MachineInstr &MODefMI = *MODef.getParent(); @@ -428,10 +428,10 @@ LaneBitmask DetectDeadLanes::determineInitialUsedLanes(unsigned Reg) { if (lowersToCopies(UseMI)) { assert(UseMI.getDesc().getNumDefs() == 1); const MachineOperand &Def = *UseMI.defs().begin(); - unsigned DefReg = Def.getReg(); + Register DefReg = Def.getReg(); // The used lanes of COPY-like instruction operands are determined by the // following dataflow analysis. - if (TargetRegisterInfo::isVirtualRegister(DefReg)) { + if (Register::isVirtualRegister(DefReg)) { // But ignore copies across incompatible register classes. 
bool CrossCopy = false; if (lowersToCopies(UseMI)) { @@ -470,10 +470,10 @@ bool DetectDeadLanes::isUndefInput(const MachineOperand &MO, if (!lowersToCopies(MI)) return false; const MachineOperand &Def = MI.getOperand(0); - unsigned DefReg = Def.getReg(); - if (!TargetRegisterInfo::isVirtualRegister(DefReg)) + Register DefReg = Def.getReg(); + if (!Register::isVirtualRegister(DefReg)) return false; - unsigned DefRegIdx = TargetRegisterInfo::virtReg2Index(DefReg); + unsigned DefRegIdx = Register::virtReg2Index(DefReg); if (!DefinedByCopy.test(DefRegIdx)) return false; @@ -482,8 +482,8 @@ bool DetectDeadLanes::isUndefInput(const MachineOperand &MO, if (UsedLanes.any()) return false; - unsigned MOReg = MO.getReg(); - if (TargetRegisterInfo::isVirtualRegister(MOReg)) { + Register MOReg = MO.getReg(); + if (Register::isVirtualRegister(MOReg)) { const TargetRegisterClass *DstRC = MRI->getRegClass(DefReg); *CrossCopy = isCrossCopy(*MRI, MI, DstRC, MO); } @@ -494,7 +494,7 @@ bool DetectDeadLanes::runOnce(MachineFunction &MF) { // First pass: Populate defs/uses of vregs with initial values unsigned NumVirtRegs = MRI->getNumVirtRegs(); for (unsigned RegIdx = 0; RegIdx < NumVirtRegs; ++RegIdx) { - unsigned Reg = TargetRegisterInfo::index2VirtReg(RegIdx); + unsigned Reg = Register::index2VirtReg(RegIdx); // Determine used/defined lanes and add copy instructions to worklist. VRegInfo &Info = VRegInfos[RegIdx]; @@ -508,7 +508,7 @@ bool DetectDeadLanes::runOnce(MachineFunction &MF) { Worklist.pop_front(); WorklistMembers.reset(RegIdx); VRegInfo &Info = VRegInfos[RegIdx]; - unsigned Reg = TargetRegisterInfo::index2VirtReg(RegIdx); + unsigned Reg = Register::index2VirtReg(RegIdx); // Transfer UsedLanes to operands of DefMI (backwards dataflow). MachineOperand &Def = *MRI->def_begin(Reg); @@ -522,7 +522,7 @@ bool DetectDeadLanes::runOnce(MachineFunction &MF) { LLVM_DEBUG(dbgs() << "Defined/Used lanes:\n"; for (unsigned RegIdx = 0; RegIdx < NumVirtRegs; ++RegIdx) { - unsigned Reg = TargetRegisterInfo::index2VirtReg(RegIdx); + unsigned Reg = Register::index2VirtReg(RegIdx); const VRegInfo &Info = VRegInfos[RegIdx]; dbgs() << printReg(Reg, nullptr) << " Used: " << PrintLaneMask(Info.UsedLanes) @@ -536,10 +536,10 @@ bool DetectDeadLanes::runOnce(MachineFunction &MF) { for (MachineOperand &MO : MI.operands()) { if (!MO.isReg()) continue; - unsigned Reg = MO.getReg(); - if (!TargetRegisterInfo::isVirtualRegister(Reg)) + Register Reg = MO.getReg(); + if (!Register::isVirtualRegister(Reg)) continue; - unsigned RegIdx = TargetRegisterInfo::virtReg2Index(Reg); + unsigned RegIdx = Register::virtReg2Index(Reg); const VRegInfo &RegInfo = VRegInfos[RegIdx]; if (MO.isDef() && !MO.isDead() && RegInfo.UsedLanes.none()) { LLVM_DEBUG(dbgs() diff --git a/lib/CodeGen/EarlyIfConversion.cpp b/lib/CodeGen/EarlyIfConversion.cpp index 0a83760befaa..e5694218b5c3 100644 --- a/lib/CodeGen/EarlyIfConversion.cpp +++ b/lib/CodeGen/EarlyIfConversion.cpp @@ -25,6 +25,7 @@ #include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/MachineTraceMetrics.h" @@ -140,6 +141,18 @@ private: /// speculated. bool canSpeculateInstrs(MachineBasicBlock *MBB); + /// Return true if all non-terminator instructions in MBB can be safely + /// predicated. 
+ bool canPredicateInstrs(MachineBasicBlock *MBB); + + /// Scan through instruction dependencies and update InsertAfter array. + /// Return false if any dependency is incompatible with if conversion. + bool InstrDependenciesAllowIfConv(MachineInstr *I); + + /// Predicate all instructions of the basic block with current condition + /// except for terminators. Reverse the condition if ReversePredicate is set. + void PredicateBlock(MachineBasicBlock *MBB, bool ReversePredicate); + /// Find a valid insertion point in Head. bool findInsertionPoint(); @@ -163,11 +176,14 @@ public: /// canConvertIf - If the sub-CFG headed by MBB can be if-converted, /// initialize the internal state, and return true. - bool canConvertIf(MachineBasicBlock *MBB); + /// If predicate is set try to predicate the block otherwise try to + /// speculatively execute it. + bool canConvertIf(MachineBasicBlock *MBB, bool Predicate = false); /// convertIf - If-convert the last block passed to canConvertIf(), assuming /// it is possible. Add any erased blocks to RemovedBlocks. - void convertIf(SmallVectorImpl &RemovedBlocks); + void convertIf(SmallVectorImpl &RemovedBlocks, + bool Predicate = false); }; } // end anonymous namespace @@ -225,37 +241,112 @@ bool SSAIfConv::canSpeculateInstrs(MachineBasicBlock *MBB) { } // Check for any dependencies on Head instructions. - for (const MachineOperand &MO : I->operands()) { - if (MO.isRegMask()) { - LLVM_DEBUG(dbgs() << "Won't speculate regmask: " << *I); - return false; - } - if (!MO.isReg()) - continue; - unsigned Reg = MO.getReg(); + if (!InstrDependenciesAllowIfConv(&(*I))) + return false; + } + return true; +} - // Remember clobbered regunits. - if (MO.isDef() && TargetRegisterInfo::isPhysicalRegister(Reg)) - for (MCRegUnitIterator Units(Reg, TRI); Units.isValid(); ++Units) - ClobberedRegUnits.set(*Units); +/// Check that there is no dependencies preventing if conversion. +/// +/// If instruction uses any values that are defined in the head basic block, +/// the defining instructions are added to InsertAfter. +bool SSAIfConv::InstrDependenciesAllowIfConv(MachineInstr *I) { + for (const MachineOperand &MO : I->operands()) { + if (MO.isRegMask()) { + LLVM_DEBUG(dbgs() << "Won't speculate regmask: " << *I); + return false; + } + if (!MO.isReg()) + continue; + Register Reg = MO.getReg(); - if (!MO.readsReg() || !TargetRegisterInfo::isVirtualRegister(Reg)) - continue; - MachineInstr *DefMI = MRI->getVRegDef(Reg); - if (!DefMI || DefMI->getParent() != Head) - continue; - if (InsertAfter.insert(DefMI).second) - LLVM_DEBUG(dbgs() << printMBBReference(*MBB) << " depends on " - << *DefMI); - if (DefMI->isTerminator()) { - LLVM_DEBUG(dbgs() << "Can't insert instructions below terminator.\n"); - return false; - } + // Remember clobbered regunits. + if (MO.isDef() && Register::isPhysicalRegister(Reg)) + for (MCRegUnitIterator Units(Reg, TRI); Units.isValid(); ++Units) + ClobberedRegUnits.set(*Units); + + if (!MO.readsReg() || !Register::isVirtualRegister(Reg)) + continue; + MachineInstr *DefMI = MRI->getVRegDef(Reg); + if (!DefMI || DefMI->getParent() != Head) + continue; + if (InsertAfter.insert(DefMI).second) + LLVM_DEBUG(dbgs() << printMBBReference(*I->getParent()) << " depends on " + << *DefMI); + if (DefMI->isTerminator()) { + LLVM_DEBUG(dbgs() << "Can't insert instructions below terminator.\n"); + return false; } } return true; } +/// canPredicateInstrs - Returns true if all the instructions in MBB can safely +/// be predicates. The terminators are not considered. 
+/// +/// If instructions use any values that are defined in the head basic block, +/// the defining instructions are added to InsertAfter. +/// +/// Any clobbered regunits are added to ClobberedRegUnits. +/// +bool SSAIfConv::canPredicateInstrs(MachineBasicBlock *MBB) { + // Reject any live-in physregs. It's probably CPSR/EFLAGS, and very hard to + // get right. + if (!MBB->livein_empty()) { + LLVM_DEBUG(dbgs() << printMBBReference(*MBB) << " has live-ins.\n"); + return false; + } + + unsigned InstrCount = 0; + + // Check all instructions, except the terminators. It is assumed that + // terminators never have side effects or define any used register values. + for (MachineBasicBlock::iterator I = MBB->begin(), + E = MBB->getFirstTerminator(); + I != E; ++I) { + if (I->isDebugInstr()) + continue; + + if (++InstrCount > BlockInstrLimit && !Stress) { + LLVM_DEBUG(dbgs() << printMBBReference(*MBB) << " has more than " + << BlockInstrLimit << " instructions.\n"); + return false; + } + + // There shouldn't normally be any phis in a single-predecessor block. + if (I->isPHI()) { + LLVM_DEBUG(dbgs() << "Can't predicate: " << *I); + return false; + } + + // Check that instruction is predicable and that it is not already + // predicated. + if (!TII->isPredicable(*I) || TII->isPredicated(*I)) { + return false; + } + + // Check for any dependencies on Head instructions. + if (!InstrDependenciesAllowIfConv(&(*I))) + return false; + } + return true; +} + +// Apply predicate to all instructions in the machine block. +void SSAIfConv::PredicateBlock(MachineBasicBlock *MBB, bool ReversePredicate) { + auto Condition = Cond; + if (ReversePredicate) + TII->reverseBranchCondition(Condition); + // Terminators don't need to be predicated as they will be removed. + for (MachineBasicBlock::iterator I = MBB->begin(), + E = MBB->getFirstTerminator(); + I != E; ++I) { + if (I->isDebugInstr()) + continue; + TII->PredicateInstruction(*I, Condition); + } +} /// Find an insertion point in Head for the speculated instructions. The /// insertion point must be: @@ -288,8 +379,8 @@ bool SSAIfConv::findInsertionPoint() { // We're ignoring regmask operands. That is conservatively correct. if (!MO.isReg()) continue; - unsigned Reg = MO.getReg(); - if (!TargetRegisterInfo::isPhysicalRegister(Reg)) + Register Reg = MO.getReg(); + if (!Register::isPhysicalRegister(Reg)) continue; // I clobbers Reg, so it isn't live before I. if (MO.isDef()) @@ -337,7 +428,7 @@ bool SSAIfConv::findInsertionPoint() { /// canConvertIf - analyze the sub-cfg rooted in MBB, and return true if it is /// a potential candidate for if-conversion. Fill out the internal state. /// -bool SSAIfConv::canConvertIf(MachineBasicBlock *MBB) { +bool SSAIfConv::canConvertIf(MachineBasicBlock *MBB, bool Predicate) { Head = MBB; TBB = FBB = Tail = nullptr; @@ -378,8 +469,9 @@ bool SSAIfConv::canConvertIf(MachineBasicBlock *MBB) { } // This is a triangle or a diamond. - // If Tail doesn't have any phis, there must be side effects. - if (Tail->empty() || !Tail->front().isPHI()) { + // Skip if we cannot predicate and there are no phis skip as there must be + // side effects that can only be handled with predication. 
+ if (!Predicate && (Tail->empty() || !Tail->front().isPHI())) { LLVM_DEBUG(dbgs() << "No phis in tail.\n"); return false; } @@ -423,8 +515,8 @@ bool SSAIfConv::canConvertIf(MachineBasicBlock *MBB) { if (PI.PHI->getOperand(i+1).getMBB() == FPred) PI.FReg = PI.PHI->getOperand(i).getReg(); } - assert(TargetRegisterInfo::isVirtualRegister(PI.TReg) && "Bad PHI"); - assert(TargetRegisterInfo::isVirtualRegister(PI.FReg) && "Bad PHI"); + assert(Register::isVirtualRegister(PI.TReg) && "Bad PHI"); + assert(Register::isVirtualRegister(PI.FReg) && "Bad PHI"); // Get target information. if (!TII->canInsertSelect(*Head, Cond, PI.TReg, PI.FReg, @@ -437,10 +529,17 @@ bool SSAIfConv::canConvertIf(MachineBasicBlock *MBB) { // Check that the conditional instructions can be speculated. InsertAfter.clear(); ClobberedRegUnits.reset(); - if (TBB != Tail && !canSpeculateInstrs(TBB)) - return false; - if (FBB != Tail && !canSpeculateInstrs(FBB)) - return false; + if (Predicate) { + if (TBB != Tail && !canPredicateInstrs(TBB)) + return false; + if (FBB != Tail && !canPredicateInstrs(FBB)) + return false; + } else { + if (TBB != Tail && !canSpeculateInstrs(TBB)) + return false; + if (FBB != Tail && !canSpeculateInstrs(FBB)) + return false; + } // Try to find a valid insertion point for the speculated instructions in the // head basic block. @@ -467,7 +566,7 @@ void SSAIfConv::replacePHIInstrs() { for (unsigned i = 0, e = PHIs.size(); i != e; ++i) { PHIInfo &PI = PHIs[i]; LLVM_DEBUG(dbgs() << "If-converting " << *PI.PHI); - unsigned DstReg = PI.PHI->getOperand(0).getReg(); + Register DstReg = PI.PHI->getOperand(0).getReg(); TII->insertSelect(*Head, FirstTerm, HeadDL, DstReg, Cond, PI.TReg, PI.FReg); LLVM_DEBUG(dbgs() << " --> " << *std::prev(FirstTerm)); PI.PHI->eraseFromParent(); @@ -494,7 +593,7 @@ void SSAIfConv::rewritePHIOperands() { // equal. DstReg = PI.TReg; } else { - unsigned PHIDst = PI.PHI->getOperand(0).getReg(); + Register PHIDst = PI.PHI->getOperand(0).getReg(); DstReg = MRI->createVirtualRegister(MRI->getRegClass(PHIDst)); TII->insertSelect(*Head, FirstTerm, HeadDL, DstReg, Cond, PI.TReg, PI.FReg); @@ -521,7 +620,8 @@ void SSAIfConv::rewritePHIOperands() { /// /// Any basic blocks erased will be added to RemovedBlocks. /// -void SSAIfConv::convertIf(SmallVectorImpl &RemovedBlocks) { +void SSAIfConv::convertIf(SmallVectorImpl &RemovedBlocks, + bool Predicate) { assert(Head && Tail && TBB && FBB && "Call canConvertIf first."); // Update statistics. @@ -531,11 +631,16 @@ void SSAIfConv::convertIf(SmallVectorImpl &RemovedBlocks) { ++NumDiamondsConv; // Move all instructions into Head, except for the terminators. - if (TBB != Tail) + if (TBB != Tail) { + if (Predicate) + PredicateBlock(TBB, /*ReversePredicate=*/false); Head->splice(InsertionPoint, TBB, TBB->begin(), TBB->getFirstTerminator()); - if (FBB != Tail) + } + if (FBB != Tail) { + if (Predicate) + PredicateBlock(FBB, /*ReversePredicate=*/true); Head->splice(InsertionPoint, FBB, FBB->begin(), FBB->getFirstTerminator()); - + } // Are there extra Tail predecessors? 
bool ExtraPreds = Tail->pred_size() != 2; if (ExtraPreds) @@ -587,7 +692,6 @@ void SSAIfConv::convertIf(SmallVectorImpl &RemovedBlocks) { LLVM_DEBUG(dbgs() << *Head); } - //===----------------------------------------------------------------------===// // EarlyIfConverter Pass //===----------------------------------------------------------------------===// @@ -613,8 +717,6 @@ public: private: bool tryConvertIf(MachineBasicBlock*); - void updateDomTree(ArrayRef Removed); - void updateLoops(ArrayRef Removed); void invalidateTraces(); bool shouldConvertIf(); }; @@ -642,32 +744,36 @@ void EarlyIfConverter::getAnalysisUsage(AnalysisUsage &AU) const { MachineFunctionPass::getAnalysisUsage(AU); } +namespace { /// Update the dominator tree after if-conversion erased some blocks. -void EarlyIfConverter::updateDomTree(ArrayRef Removed) { +void updateDomTree(MachineDominatorTree *DomTree, const SSAIfConv &IfConv, + ArrayRef Removed) { // convertIf can remove TBB, FBB, and Tail can be merged into Head. // TBB and FBB should not dominate any blocks. // Tail children should be transferred to Head. MachineDomTreeNode *HeadNode = DomTree->getNode(IfConv.Head); - for (unsigned i = 0, e = Removed.size(); i != e; ++i) { - MachineDomTreeNode *Node = DomTree->getNode(Removed[i]); + for (auto B : Removed) { + MachineDomTreeNode *Node = DomTree->getNode(B); assert(Node != HeadNode && "Cannot erase the head node"); while (Node->getNumChildren()) { assert(Node->getBlock() == IfConv.Tail && "Unexpected children"); DomTree->changeImmediateDominator(Node->getChildren().back(), HeadNode); } - DomTree->eraseNode(Removed[i]); + DomTree->eraseNode(B); } } /// Update LoopInfo after if-conversion. -void EarlyIfConverter::updateLoops(ArrayRef Removed) { +void updateLoops(MachineLoopInfo *Loops, + ArrayRef Removed) { if (!Loops) return; // If-conversion doesn't change loop structure, and it doesn't mess with back // edges, so updating LoopInfo is simply removing the dead blocks. - for (unsigned i = 0, e = Removed.size(); i != e; ++i) - Loops->removeBlock(Removed[i]); + for (auto B : Removed) + Loops->removeBlock(B); } +} // namespace /// Invalidate MachineTraceMetrics before if-conversion. 
void EarlyIfConverter::invalidateTraces() { @@ -783,8 +889,8 @@ bool EarlyIfConverter::tryConvertIf(MachineBasicBlock *MBB) { SmallVector RemovedBlocks; IfConv.convertIf(RemovedBlocks); Changed = true; - updateDomTree(RemovedBlocks); - updateLoops(RemovedBlocks); + updateDomTree(DomTree, IfConv, RemovedBlocks); + updateLoops(Loops, RemovedBlocks); } return Changed; } @@ -822,3 +928,132 @@ bool EarlyIfConverter::runOnMachineFunction(MachineFunction &MF) { return Changed; } + +//===----------------------------------------------------------------------===// +// EarlyIfPredicator Pass +//===----------------------------------------------------------------------===// + +namespace { +class EarlyIfPredicator : public MachineFunctionPass { + const TargetInstrInfo *TII; + const TargetRegisterInfo *TRI; + TargetSchedModel SchedModel; + MachineRegisterInfo *MRI; + MachineDominatorTree *DomTree; + MachineLoopInfo *Loops; + SSAIfConv IfConv; + +public: + static char ID; + EarlyIfPredicator() : MachineFunctionPass(ID) {} + void getAnalysisUsage(AnalysisUsage &AU) const override; + bool runOnMachineFunction(MachineFunction &MF) override; + StringRef getPassName() const override { return "Early If-predicator"; } + +protected: + bool tryConvertIf(MachineBasicBlock *); + bool shouldConvertIf(); +}; +} // end anonymous namespace + +#undef DEBUG_TYPE +#define DEBUG_TYPE "early-if-predicator" + +char EarlyIfPredicator::ID = 0; +char &llvm::EarlyIfPredicatorID = EarlyIfPredicator::ID; + +INITIALIZE_PASS_BEGIN(EarlyIfPredicator, DEBUG_TYPE, "Early If Predicator", + false, false) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_END(EarlyIfPredicator, DEBUG_TYPE, "Early If Predicator", false, + false) + +void EarlyIfPredicator::getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired(); + AU.addPreserved(); + AU.addRequired(); + AU.addPreserved(); + MachineFunctionPass::getAnalysisUsage(AU); +} + +/// Apply the target heuristic to decide if the transformation is profitable. +bool EarlyIfPredicator::shouldConvertIf() { + if (IfConv.isTriangle()) { + MachineBasicBlock &IfBlock = + (IfConv.TBB == IfConv.Tail) ? *IfConv.FBB : *IfConv.TBB; + + unsigned ExtraPredCost = 0; + unsigned Cycles = 0; + for (MachineInstr &I : IfBlock) { + unsigned NumCycles = SchedModel.computeInstrLatency(&I, false); + if (NumCycles > 1) + Cycles += NumCycles - 1; + ExtraPredCost += TII->getPredicationCost(I); + } + + return TII->isProfitableToIfCvt(IfBlock, Cycles, ExtraPredCost, + BranchProbability::getUnknown()); + } + unsigned TExtra = 0; + unsigned FExtra = 0; + unsigned TCycle = 0; + unsigned FCycle = 0; + for (MachineInstr &I : *IfConv.TBB) { + unsigned NumCycles = SchedModel.computeInstrLatency(&I, false); + if (NumCycles > 1) + TCycle += NumCycles - 1; + TExtra += TII->getPredicationCost(I); + } + for (MachineInstr &I : *IfConv.FBB) { + unsigned NumCycles = SchedModel.computeInstrLatency(&I, false); + if (NumCycles > 1) + FCycle += NumCycles - 1; + FExtra += TII->getPredicationCost(I); + } + return TII->isProfitableToIfCvt(*IfConv.TBB, TCycle, TExtra, *IfConv.FBB, + FCycle, FExtra, + BranchProbability::getUnknown()); +} + +/// Attempt repeated if-conversion on MBB, return true if successful. +/// +bool EarlyIfPredicator::tryConvertIf(MachineBasicBlock *MBB) { + bool Changed = false; + while (IfConv.canConvertIf(MBB, /*Predicate*/ true) && shouldConvertIf()) { + // If-convert MBB and update analyses. 
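// --- Illustrative sketch (not part of the upstream change) ------------------
// The per-block cost accumulation used by the profitability check above,
// reduced to plain integers; latency and predication-cost values are invented.
#include <vector>
struct InstrCost { unsigned Latency; unsigned PredCost; };
static void sumBlockCost(const std::vector<InstrCost> &Block, unsigned &Cycles,
                         unsigned &ExtraPred) {
  Cycles = ExtraPred = 0;
  for (const InstrCost &I : Block) {
    if (I.Latency > 1)
      Cycles += I.Latency - 1; // only cycles beyond a single-cycle op count
    ExtraPred += I.PredCost;   // cost of carrying the predicate
  }
}
// Two single-cycle ops plus one 3-cycle op, each with PredCost 1, give
// Cycles == 2 and ExtraPred == 3, which isProfitableToIfCvt then weighs
// against the cost of the branch being removed.
// --- end of sketch ----------------------------------------------------------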
+ SmallVector RemovedBlocks; + IfConv.convertIf(RemovedBlocks, /*Predicate*/ true); + Changed = true; + updateDomTree(DomTree, IfConv, RemovedBlocks); + updateLoops(Loops, RemovedBlocks); + } + return Changed; +} + +bool EarlyIfPredicator::runOnMachineFunction(MachineFunction &MF) { + LLVM_DEBUG(dbgs() << "********** EARLY IF-PREDICATOR **********\n" + << "********** Function: " << MF.getName() << '\n'); + if (skipFunction(MF.getFunction())) + return false; + + const TargetSubtargetInfo &STI = MF.getSubtarget(); + TII = STI.getInstrInfo(); + TRI = STI.getRegisterInfo(); + MRI = &MF.getRegInfo(); + SchedModel.init(&STI); + DomTree = &getAnalysis(); + Loops = getAnalysisIfAvailable(); + + bool Changed = false; + IfConv.runOnMachineFunction(MF); + + // Visit blocks in dominator tree post-order. The post-order enables nested + // if-conversion in a single pass. The tryConvertIf() function may erase + // blocks, but only blocks dominated by the head block. This makes it safe to + // update the dominator tree while the post-order iterator is still active. + for (auto DomNode : post_order(DomTree)) + if (tryConvertIf(DomNode->getBlock())) + Changed = true; + + return Changed; +} diff --git a/lib/CodeGen/ExecutionDomainFix.cpp b/lib/CodeGen/ExecutionDomainFix.cpp index a2dd5eee33b7..2cca05ea6f55 100644 --- a/lib/CodeGen/ExecutionDomainFix.cpp +++ b/lib/CodeGen/ExecutionDomainFix.cpp @@ -9,6 +9,7 @@ #include "llvm/CodeGen/ExecutionDomainFix.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/TargetInstrInfo.h" +#include "llvm/Support/Debug.h" using namespace llvm; diff --git a/lib/CodeGen/ExpandMemCmp.cpp b/lib/CodeGen/ExpandMemCmp.cpp index b425482e6adf..9916f2de0414 100644 --- a/lib/CodeGen/ExpandMemCmp.cpp +++ b/lib/CodeGen/ExpandMemCmp.cpp @@ -795,7 +795,7 @@ public: TPC->getTM().getSubtargetImpl(F)->getTargetLowering(); const TargetLibraryInfo *TLI = - &getAnalysis().getTLI(); + &getAnalysis().getTLI(F); const TargetTransformInfo *TTI = &getAnalysis().getTTI(F); auto PA = runImpl(F, TLI, TTI, TL); diff --git a/lib/CodeGen/ExpandPostRAPseudos.cpp b/lib/CodeGen/ExpandPostRAPseudos.cpp index 0ab70aff7dc4..1fc57fac1489 100644 --- a/lib/CodeGen/ExpandPostRAPseudos.cpp +++ b/lib/CodeGen/ExpandPostRAPseudos.cpp @@ -79,17 +79,17 @@ bool ExpandPostRA::LowerSubregToReg(MachineInstr *MI) { (MI->getOperand(2).isReg() && MI->getOperand(2).isUse()) && MI->getOperand(3).isImm() && "Invalid subreg_to_reg"); - unsigned DstReg = MI->getOperand(0).getReg(); - unsigned InsReg = MI->getOperand(2).getReg(); + Register DstReg = MI->getOperand(0).getReg(); + Register InsReg = MI->getOperand(2).getReg(); assert(!MI->getOperand(2).getSubReg() && "SubIdx on physreg?"); unsigned SubIdx = MI->getOperand(3).getImm(); assert(SubIdx != 0 && "Invalid index for insert_subreg"); - unsigned DstSubReg = TRI->getSubReg(DstReg, SubIdx); + Register DstSubReg = TRI->getSubReg(DstReg, SubIdx); - assert(TargetRegisterInfo::isPhysicalRegister(DstReg) && + assert(Register::isPhysicalRegister(DstReg) && "Insert destination must be in a physical register"); - assert(TargetRegisterInfo::isPhysicalRegister(InsReg) && + assert(Register::isPhysicalRegister(InsReg) && "Inserted value must be in a physical register"); LLVM_DEBUG(dbgs() << "subreg: CONVERTING: " << *MI); diff --git a/lib/CodeGen/GCMetadata.cpp b/lib/CodeGen/GCMetadata.cpp index 9c53550eaa9d..c1d22ef89195 100644 --- a/lib/CodeGen/GCMetadata.cpp +++ b/lib/CodeGen/GCMetadata.cpp @@ -72,7 +72,7 @@ GCFunctionInfo &GCModuleInfo::getFunctionInfo(const Function &F) { 
return *I->second; GCStrategy *S = getGCStrategy(F.getGC()); - Functions.push_back(llvm::make_unique(F, *S)); + Functions.push_back(std::make_unique(F, *S)); GCFunctionInfo *GFI = Functions.back().get(); FInfoMap[&F] = GFI; return *GFI; diff --git a/lib/CodeGen/GCRootLowering.cpp b/lib/CodeGen/GCRootLowering.cpp index 90571d090bfb..0dc0a5bce747 100644 --- a/lib/CodeGen/GCRootLowering.cpp +++ b/lib/CodeGen/GCRootLowering.cpp @@ -249,7 +249,7 @@ GCMachineCodeAnalysis::GCMachineCodeAnalysis() : MachineFunctionPass(ID) {} void GCMachineCodeAnalysis::getAnalysisUsage(AnalysisUsage &AU) const { MachineFunctionPass::getAnalysisUsage(AU); AU.setPreservesAll(); - AU.addRequired(); + AU.addRequired(); AU.addRequired(); } @@ -310,7 +310,7 @@ bool GCMachineCodeAnalysis::runOnMachineFunction(MachineFunction &MF) { return false; FI = &getAnalysis().getFunctionInfo(MF.getFunction()); - MMI = &getAnalysis(); + MMI = &getAnalysis().getMMI(); TII = MF.getSubtarget().getInstrInfo(); // Find the size of the stack frame. There may be no correct static frame diff --git a/lib/CodeGen/GlobalISel/CSEInfo.cpp b/lib/CodeGen/GlobalISel/CSEInfo.cpp index 4518dbee1a9f..7d9d812d34bc 100644 --- a/lib/CodeGen/GlobalISel/CSEInfo.cpp +++ b/lib/CodeGen/GlobalISel/CSEInfo.cpp @@ -52,6 +52,7 @@ bool CSEConfigFull::shouldCSEOpc(unsigned Opc) { case TargetOpcode::G_ANYEXT: case TargetOpcode::G_UNMERGE_VALUES: case TargetOpcode::G_TRUNC: + case TargetOpcode::G_GEP: return true; } return false; @@ -65,9 +66,9 @@ std::unique_ptr llvm::getStandardCSEConfigForOpt(CodeGenOpt::Level Level) { std::unique_ptr Config; if (Level == CodeGenOpt::None) - Config = make_unique(); + Config = std::make_unique(); else - Config = make_unique(); + Config = std::make_unique(); return Config; } @@ -332,7 +333,7 @@ GISelInstProfileBuilder::addNodeIDFlag(unsigned Flag) const { const GISelInstProfileBuilder &GISelInstProfileBuilder::addNodeIDMachineOperand( const MachineOperand &MO) const { if (MO.isReg()) { - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); if (!MO.isDef()) addNodeIDRegNum(Reg); LLT Ty = MRI.getType(Reg); diff --git a/lib/CodeGen/GlobalISel/CSEMIRBuilder.cpp b/lib/CodeGen/GlobalISel/CSEMIRBuilder.cpp index 461bc6038c2c..51a74793f029 100644 --- a/lib/CodeGen/GlobalISel/CSEMIRBuilder.cpp +++ b/lib/CodeGen/GlobalISel/CSEMIRBuilder.cpp @@ -162,6 +162,17 @@ MachineInstrBuilder CSEMIRBuilder::buildInstr(unsigned Opc, return buildConstant(DstOps[0], Cst->getSExtValue()); break; } + case TargetOpcode::G_SEXT_INREG: { + assert(DstOps.size() == 1 && "Invalid dst ops"); + assert(SrcOps.size() == 2 && "Invalid src ops"); + const DstOp &Dst = DstOps[0]; + const SrcOp &Src0 = SrcOps[0]; + const SrcOp &Src1 = SrcOps[1]; + if (auto MaybeCst = + ConstantFoldExtOp(Opc, Src0.getReg(), Src1.getImm(), *getMRI())) + return buildConstant(Dst, MaybeCst->getSExtValue()); + break; + } } bool CanCopy = checkCopyToDefsPossible(DstOps); if (!canPerformCSEForOpc(Opc)) diff --git a/lib/CodeGen/GlobalISel/CallLowering.cpp b/lib/CodeGen/GlobalISel/CallLowering.cpp index a5d8205a34a8..cdad92f7db4f 100644 --- a/lib/CodeGen/GlobalISel/CallLowering.cpp +++ b/lib/CodeGen/GlobalISel/CallLowering.cpp @@ -11,14 +11,16 @@ /// //===----------------------------------------------------------------------===// -#include "llvm/CodeGen/GlobalISel/CallLowering.h" #include "llvm/CodeGen/Analysis.h" +#include "llvm/CodeGen/GlobalISel/CallLowering.h" +#include "llvm/CodeGen/GlobalISel/Utils.h" #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" #include 
"llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/TargetLowering.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/Instructions.h" +#include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" #define DEBUG_TYPE "call-lowering" @@ -32,66 +34,70 @@ bool CallLowering::lowerCall(MachineIRBuilder &MIRBuilder, ImmutableCallSite CS, ArrayRef> ArgRegs, Register SwiftErrorVReg, std::function GetCalleeReg) const { + CallLoweringInfo Info; auto &DL = CS.getParent()->getParent()->getParent()->getDataLayout(); // First step is to marshall all the function's parameters into the correct // physregs and memory locations. Gather the sequence of argument types that // we'll pass to the assigner function. - SmallVector OrigArgs; unsigned i = 0; unsigned NumFixedArgs = CS.getFunctionType()->getNumParams(); for (auto &Arg : CS.args()) { ArgInfo OrigArg{ArgRegs[i], Arg->getType(), ISD::ArgFlagsTy{}, i < NumFixedArgs}; setArgFlags(OrigArg, i + AttributeList::FirstArgIndex, DL, CS); - // We don't currently support swiftself args. - if (OrigArg.Flags.isSwiftSelf()) - return false; - OrigArgs.push_back(OrigArg); + Info.OrigArgs.push_back(OrigArg); ++i; } - MachineOperand Callee = MachineOperand::CreateImm(0); if (const Function *F = CS.getCalledFunction()) - Callee = MachineOperand::CreateGA(F, 0); + Info.Callee = MachineOperand::CreateGA(F, 0); else - Callee = MachineOperand::CreateReg(GetCalleeReg(), false); - - ArgInfo OrigRet{ResRegs, CS.getType(), ISD::ArgFlagsTy{}}; - if (!OrigRet.Ty->isVoidTy()) - setArgFlags(OrigRet, AttributeList::ReturnIndex, DL, CS); - - return lowerCall(MIRBuilder, CS.getCallingConv(), Callee, OrigRet, OrigArgs, - SwiftErrorVReg); + Info.Callee = MachineOperand::CreateReg(GetCalleeReg(), false); + + Info.OrigRet = ArgInfo{ResRegs, CS.getType(), ISD::ArgFlagsTy{}}; + if (!Info.OrigRet.Ty->isVoidTy()) + setArgFlags(Info.OrigRet, AttributeList::ReturnIndex, DL, CS); + + Info.KnownCallees = + CS.getInstruction()->getMetadata(LLVMContext::MD_callees); + Info.CallConv = CS.getCallingConv(); + Info.SwiftErrorVReg = SwiftErrorVReg; + Info.IsMustTailCall = CS.isMustTailCall(); + Info.IsTailCall = CS.isTailCall() && + isInTailCallPosition(CS, MIRBuilder.getMF().getTarget()); + Info.IsVarArg = CS.getFunctionType()->isVarArg(); + return lowerCall(MIRBuilder, Info); } template void CallLowering::setArgFlags(CallLowering::ArgInfo &Arg, unsigned OpIdx, const DataLayout &DL, const FuncInfoTy &FuncInfo) const { + auto &Flags = Arg.Flags[0]; const AttributeList &Attrs = FuncInfo.getAttributes(); if (Attrs.hasAttribute(OpIdx, Attribute::ZExt)) - Arg.Flags.setZExt(); + Flags.setZExt(); if (Attrs.hasAttribute(OpIdx, Attribute::SExt)) - Arg.Flags.setSExt(); + Flags.setSExt(); if (Attrs.hasAttribute(OpIdx, Attribute::InReg)) - Arg.Flags.setInReg(); + Flags.setInReg(); if (Attrs.hasAttribute(OpIdx, Attribute::StructRet)) - Arg.Flags.setSRet(); + Flags.setSRet(); if (Attrs.hasAttribute(OpIdx, Attribute::SwiftSelf)) - Arg.Flags.setSwiftSelf(); + Flags.setSwiftSelf(); if (Attrs.hasAttribute(OpIdx, Attribute::SwiftError)) - Arg.Flags.setSwiftError(); + Flags.setSwiftError(); if (Attrs.hasAttribute(OpIdx, Attribute::ByVal)) - Arg.Flags.setByVal(); + Flags.setByVal(); if (Attrs.hasAttribute(OpIdx, Attribute::InAlloca)) - Arg.Flags.setInAlloca(); + Flags.setInAlloca(); - if (Arg.Flags.isByVal() || Arg.Flags.isInAlloca()) { + if (Flags.isByVal() || Flags.isInAlloca()) { Type *ElementTy = cast(Arg.Ty)->getElementType(); auto Ty = Attrs.getAttribute(OpIdx, 
Attribute::ByVal).getValueAsType(); - Arg.Flags.setByValSize(DL.getTypeAllocSize(Ty ? Ty : ElementTy)); + Flags.setByValSize(DL.getTypeAllocSize(Ty ? Ty : ElementTy)); // For ByVal, alignment should be passed from FE. BE will guess if // this info is not there but there are cases it cannot get right. @@ -100,11 +106,11 @@ void CallLowering::setArgFlags(CallLowering::ArgInfo &Arg, unsigned OpIdx, FrameAlign = FuncInfo.getParamAlignment(OpIdx - 2); else FrameAlign = getTLI()->getByValTypeAlignment(ElementTy, DL); - Arg.Flags.setByValAlign(FrameAlign); + Flags.setByValAlign(Align(FrameAlign)); } if (Attrs.hasAttribute(OpIdx, Attribute::Nest)) - Arg.Flags.setNest(); - Arg.Flags.setOrigAlign(DL.getABITypeAlignment(Arg.Ty)); + Flags.setNest(); + Flags.setOrigAlign(Align(DL.getABITypeAlignment(Arg.Ty))); } template void @@ -159,7 +165,7 @@ void CallLowering::unpackRegs(ArrayRef DstRegs, Register SrcReg, } bool CallLowering::handleAssignments(MachineIRBuilder &MIRBuilder, - ArrayRef Args, + SmallVectorImpl &Args, ValueHandler &Handler) const { MachineFunction &MF = MIRBuilder.getMF(); const Function &F = MF.getFunction(); @@ -171,7 +177,7 @@ bool CallLowering::handleAssignments(MachineIRBuilder &MIRBuilder, bool CallLowering::handleAssignments(CCState &CCInfo, SmallVectorImpl &ArgLocs, MachineIRBuilder &MIRBuilder, - ArrayRef Args, + SmallVectorImpl &Args, ValueHandler &Handler) const { MachineFunction &MF = MIRBuilder.getMF(); const Function &F = MF.getFunction(); @@ -180,14 +186,99 @@ bool CallLowering::handleAssignments(CCState &CCInfo, unsigned NumArgs = Args.size(); for (unsigned i = 0; i != NumArgs; ++i) { MVT CurVT = MVT::getVT(Args[i].Ty); - if (Handler.assignArg(i, CurVT, CurVT, CCValAssign::Full, Args[i], CCInfo)) { - // Try to use the register type if we couldn't assign the VT. - if (!Handler.isArgumentHandler() || !CurVT.isValid()) + if (Handler.assignArg(i, CurVT, CurVT, CCValAssign::Full, Args[i], + Args[i].Flags[0], CCInfo)) { + if (!CurVT.isValid()) return false; - CurVT = TLI->getRegisterTypeForCallingConv( + MVT NewVT = TLI->getRegisterTypeForCallingConv( F.getContext(), F.getCallingConv(), EVT(CurVT)); - if (Handler.assignArg(i, CurVT, CurVT, CCValAssign::Full, Args[i], CCInfo)) - return false; + + // If we need to split the type over multiple regs, check it's a scenario + // we currently support. + unsigned NumParts = TLI->getNumRegistersForCallingConv( + F.getContext(), F.getCallingConv(), CurVT); + if (NumParts > 1) { + // For now only handle exact splits. + if (NewVT.getSizeInBits() * NumParts != CurVT.getSizeInBits()) + return false; + } + + // For incoming arguments (physregs to vregs), we could have values in + // physregs (or memlocs) which we want to extract and copy to vregs. + // During this, we might have to deal with the LLT being split across + // multiple regs, so we have to record this information for later. + // + // If we have outgoing args, then we have the opposite case. We have a + // vreg with an LLT which we want to assign to a physical location, and + // we might have to record that the value has to be split later. + if (Handler.isIncomingArgumentHandler()) { + if (NumParts == 1) { + // Try to use the register type if we couldn't assign the VT. + if (Handler.assignArg(i, NewVT, NewVT, CCValAssign::Full, Args[i], + Args[i].Flags[0], CCInfo)) + return false; + } else { + // We're handling an incoming arg which is split over multiple regs. + // E.g. passing an s128 on AArch64. 
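// --- Illustrative sketch (not part of the upstream change) ------------------
// What the incoming-argument split above amounts to for an s128 passed in two
// 64-bit registers: the parts arrive in separate vregs and are merged back
// into the wide value (the buildMerge call further down). The Clang/GCC
// __int128 extension is used purely for illustration.
#include <cstdint>
static unsigned __int128 mergeTwoParts(uint64_t Lo, uint64_t Hi) {
  return (static_cast<unsigned __int128>(Hi) << 64) | Lo; // low part first
}
// --- end of sketch ----------------------------------------------------------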
+ ISD::ArgFlagsTy OrigFlags = Args[i].Flags[0]; + Args[i].OrigRegs.push_back(Args[i].Regs[0]); + Args[i].Regs.clear(); + Args[i].Flags.clear(); + LLT NewLLT = getLLTForMVT(NewVT); + // For each split register, create and assign a vreg that will store + // the incoming component of the larger value. These will later be + // merged to form the final vreg. + for (unsigned Part = 0; Part < NumParts; ++Part) { + Register Reg = + MIRBuilder.getMRI()->createGenericVirtualRegister(NewLLT); + ISD::ArgFlagsTy Flags = OrigFlags; + if (Part == 0) { + Flags.setSplit(); + } else { + Flags.setOrigAlign(Align::None()); + if (Part == NumParts - 1) + Flags.setSplitEnd(); + } + Args[i].Regs.push_back(Reg); + Args[i].Flags.push_back(Flags); + if (Handler.assignArg(i + Part, NewVT, NewVT, CCValAssign::Full, + Args[i], Args[i].Flags[Part], CCInfo)) { + // Still couldn't assign this smaller part type for some reason. + return false; + } + } + } + } else { + // Handling an outgoing arg that might need to be split. + if (NumParts < 2) + return false; // Don't know how to deal with this type combination. + + // This type is passed via multiple registers in the calling convention. + // We need to extract the individual parts. + Register LargeReg = Args[i].Regs[0]; + LLT SmallTy = LLT::scalar(NewVT.getSizeInBits()); + auto Unmerge = MIRBuilder.buildUnmerge(SmallTy, LargeReg); + assert(Unmerge->getNumOperands() == NumParts + 1); + ISD::ArgFlagsTy OrigFlags = Args[i].Flags[0]; + // We're going to replace the regs and flags with the split ones. + Args[i].Regs.clear(); + Args[i].Flags.clear(); + for (unsigned PartIdx = 0; PartIdx < NumParts; ++PartIdx) { + ISD::ArgFlagsTy Flags = OrigFlags; + if (PartIdx == 0) { + Flags.setSplit(); + } else { + Flags.setOrigAlign(Align::None()); + if (PartIdx == NumParts - 1) + Flags.setSplitEnd(); + } + Args[i].Regs.push_back(Unmerge.getReg(PartIdx)); + Args[i].Flags.push_back(Flags); + if (Handler.assignArg(i + PartIdx, NewVT, NewVT, CCValAssign::Full, + Args[i], Args[i].Flags[PartIdx], CCInfo)) + return false; + } + } } } @@ -202,18 +293,32 @@ bool CallLowering::handleAssignments(CCState &CCInfo, continue; } - assert(Args[i].Regs.size() == 1 && - "Can't handle multiple virtual regs yet"); - // FIXME: Pack registers if we have more than one. Register ArgReg = Args[i].Regs[0]; + MVT OrigVT = MVT::getVT(Args[i].Ty); + MVT VAVT = VA.getValVT(); if (VA.isRegLoc()) { - MVT OrigVT = MVT::getVT(Args[i].Ty); - MVT VAVT = VA.getValVT(); - if (Handler.isArgumentHandler() && VAVT != OrigVT) { - if (VAVT.getSizeInBits() < OrigVT.getSizeInBits()) - return false; // Can't handle this type of arg yet. + if (Handler.isIncomingArgumentHandler() && VAVT != OrigVT) { + if (VAVT.getSizeInBits() < OrigVT.getSizeInBits()) { + // Expected to be multiple regs for a single incoming arg. + unsigned NumArgRegs = Args[i].Regs.size(); + if (NumArgRegs < 2) + return false; + + assert((j + (NumArgRegs - 1)) < ArgLocs.size() && + "Too many regs for number of args"); + for (unsigned Part = 0; Part < NumArgRegs; ++Part) { + // There should be Regs.size() ArgLocs per argument. + VA = ArgLocs[j + Part]; + Handler.assignValueToReg(Args[i].Regs[Part], VA.getLocReg(), VA); + } + j += NumArgRegs - 1; + // Merge the split registers into the expected larger result vreg + // of the original call. 
+ MIRBuilder.buildMerge(Args[i].OrigRegs[0], Args[i].Regs); + continue; + } const LLT VATy(VAVT); Register NewReg = MIRBuilder.getMRI()->createGenericVirtualRegister(VATy); @@ -234,10 +339,28 @@ bool CallLowering::handleAssignments(CCState &CCInfo, } else { MIRBuilder.buildTrunc(ArgReg, {NewReg}).getReg(0); } + } else if (!Handler.isIncomingArgumentHandler()) { + assert((j + (Args[i].Regs.size() - 1)) < ArgLocs.size() && + "Too many regs for number of args"); + // This is an outgoing argument that might have been split. + for (unsigned Part = 0; Part < Args[i].Regs.size(); ++Part) { + // There should be Regs.size() ArgLocs per argument. + VA = ArgLocs[j + Part]; + Handler.assignValueToReg(Args[i].Regs[Part], VA.getLocReg(), VA); + } + j += Args[i].Regs.size() - 1; } else { Handler.assignValueToReg(ArgReg, VA.getLocReg(), VA); } } else if (VA.isMemLoc()) { + // Don't currently support loading/storing a type that needs to be split + // to the stack. Should be easy, just not implemented yet. + if (Args[i].Regs.size() > 1) { + LLVM_DEBUG( + dbgs() + << "Load/store a split arg to/from the stack not implemented yet"); + return false; + } MVT VT = MVT::getVT(Args[i].Ty); unsigned Size = VT == MVT::iPTR ? DL.getPointerSize() : alignTo(VT.getSizeInBits(), 8) / 8; @@ -253,6 +376,81 @@ bool CallLowering::handleAssignments(CCState &CCInfo, return true; } +bool CallLowering::analyzeArgInfo(CCState &CCState, + SmallVectorImpl &Args, + CCAssignFn &AssignFnFixed, + CCAssignFn &AssignFnVarArg) const { + for (unsigned i = 0, e = Args.size(); i < e; ++i) { + MVT VT = MVT::getVT(Args[i].Ty); + CCAssignFn &Fn = Args[i].IsFixed ? AssignFnFixed : AssignFnVarArg; + if (Fn(i, VT, VT, CCValAssign::Full, Args[i].Flags[0], CCState)) { + // Bail out on anything we can't handle. + LLVM_DEBUG(dbgs() << "Cannot analyze " << EVT(VT).getEVTString() + << " (arg number = " << i << "\n"); + return false; + } + } + return true; +} + +bool CallLowering::resultsCompatible(CallLoweringInfo &Info, + MachineFunction &MF, + SmallVectorImpl &InArgs, + CCAssignFn &CalleeAssignFnFixed, + CCAssignFn &CalleeAssignFnVarArg, + CCAssignFn &CallerAssignFnFixed, + CCAssignFn &CallerAssignFnVarArg) const { + const Function &F = MF.getFunction(); + CallingConv::ID CalleeCC = Info.CallConv; + CallingConv::ID CallerCC = F.getCallingConv(); + + if (CallerCC == CalleeCC) + return true; + + SmallVector ArgLocs1; + CCState CCInfo1(CalleeCC, false, MF, ArgLocs1, F.getContext()); + if (!analyzeArgInfo(CCInfo1, InArgs, CalleeAssignFnFixed, + CalleeAssignFnVarArg)) + return false; + + SmallVector ArgLocs2; + CCState CCInfo2(CallerCC, false, MF, ArgLocs2, F.getContext()); + if (!analyzeArgInfo(CCInfo2, InArgs, CallerAssignFnFixed, + CalleeAssignFnVarArg)) + return false; + + // We need the argument locations to match up exactly. If there's more in + // one than the other, then we are done. + if (ArgLocs1.size() != ArgLocs2.size()) + return false; + + // Make sure that each location is passed in exactly the same way. + for (unsigned i = 0, e = ArgLocs1.size(); i < e; ++i) { + const CCValAssign &Loc1 = ArgLocs1[i]; + const CCValAssign &Loc2 = ArgLocs2[i]; + + // We need both of them to be the same. So if one is a register and one + // isn't, we're done. + if (Loc1.isRegLoc() != Loc2.isRegLoc()) + return false; + + if (Loc1.isRegLoc()) { + // If they don't have the same register location, we're done. + if (Loc1.getLocReg() != Loc2.getLocReg()) + return false; + + // They matched, so we can move to the next ArgLoc. 
+ continue; + } + + // Loc1 wasn't a RegLoc, so they both must be MemLocs. Check if they match. + if (Loc1.getLocMemOffset() != Loc2.getLocMemOffset()) + return false; + } + + return true; +} + Register CallLowering::ValueHandler::extendRegister(Register ValReg, CCValAssign &VA) { LLT LocTy{VA.getLocVT()}; diff --git a/lib/CodeGen/GlobalISel/Combiner.cpp b/lib/CodeGen/GlobalISel/Combiner.cpp index 31cb1dbbc9b5..b4562a5c6601 100644 --- a/lib/CodeGen/GlobalISel/Combiner.cpp +++ b/lib/CodeGen/GlobalISel/Combiner.cpp @@ -27,6 +27,18 @@ using namespace llvm; +namespace llvm { +cl::OptionCategory GICombinerOptionCategory( + "GlobalISel Combiner", + "Control the rules which are enabled. These options all take a comma " + "separated list of rules to disable and may be specified by number " + "or number range (e.g. 1-10)." +#ifndef NDEBUG + " They may also be specified by name." +#endif +); +} // end namespace llvm + namespace { /// This class acts as the glue the joins the CombinerHelper to the overall /// Combine algorithm. The CombinerHelper is intended to report the @@ -92,7 +104,7 @@ bool Combiner::combineMachineInstrs(MachineFunction &MF, return false; Builder = - CSEInfo ? make_unique() : make_unique(); + CSEInfo ? std::make_unique() : std::make_unique(); MRI = &MF.getRegInfo(); Builder->setMF(MF); if (CSEInfo) diff --git a/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/lib/CodeGen/GlobalISel/CombinerHelper.cpp index 9cbf3dd83ff1..854769d283f7 100644 --- a/lib/CodeGen/GlobalISel/CombinerHelper.cpp +++ b/lib/CodeGen/GlobalISel/CombinerHelper.cpp @@ -8,19 +8,36 @@ #include "llvm/CodeGen/GlobalISel/CombinerHelper.h" #include "llvm/CodeGen/GlobalISel/Combiner.h" #include "llvm/CodeGen/GlobalISel/GISelChangeObserver.h" +#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h" #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" #include "llvm/CodeGen/GlobalISel/Utils.h" +#include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/TargetInstrInfo.h" +#include "llvm/CodeGen/TargetLowering.h" +#include "llvm/Target/TargetMachine.h" #define DEBUG_TYPE "gi-combiner" using namespace llvm; +// Option to allow testing of the combiner while no targets know about indexed +// addressing. +static cl::opt + ForceLegalIndexing("force-legal-indexing", cl::Hidden, cl::init(false), + cl::desc("Force all indexed operations to be " + "legal for the GlobalISel combiner")); + + CombinerHelper::CombinerHelper(GISelChangeObserver &Observer, - MachineIRBuilder &B) - : Builder(B), MRI(Builder.getMF().getRegInfo()), Observer(Observer) {} + MachineIRBuilder &B, GISelKnownBits *KB, + MachineDominatorTree *MDT) + : Builder(B), MRI(Builder.getMF().getRegInfo()), Observer(Observer), + KB(KB), MDT(MDT) { + (void)this->KB; +} void CombinerHelper::replaceRegWith(MachineRegisterInfo &MRI, Register FromReg, Register ToReg) const { @@ -55,8 +72,8 @@ bool CombinerHelper::tryCombineCopy(MachineInstr &MI) { bool CombinerHelper::matchCombineCopy(MachineInstr &MI) { if (MI.getOpcode() != TargetOpcode::COPY) return false; - unsigned DstReg = MI.getOperand(0).getReg(); - unsigned SrcReg = MI.getOperand(1).getReg(); + Register DstReg = MI.getOperand(0).getReg(); + Register SrcReg = MI.getOperand(1).getReg(); LLT DstTy = MRI.getType(DstReg); LLT SrcTy = MRI.getType(SrcReg); // Simple Copy Propagation. 
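// --- Illustrative sketch (not part of the upstream change) ------------------
// The arithmetic behind folding G_SEXT_INREG of a constant, as in the
// CSEMIRBuilder hunk above: sign-extend the low B bits of a 64-bit value.
// Assumes 1 <= B <= 64 and two's-complement shift semantics.
#include <cstdint>
static int64_t signExtendInReg(int64_t Val, unsigned B) {
  unsigned Shift = 64 - B;
  return static_cast<int64_t>(static_cast<uint64_t>(Val) << Shift) >> Shift;
}
// signExtendInReg(0xFF, 8) == -1 and signExtendInReg(0x7F, 8) == 127.
// --- end of sketch ----------------------------------------------------------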
@@ -66,12 +83,183 @@ bool CombinerHelper::matchCombineCopy(MachineInstr &MI) { return false; } void CombinerHelper::applyCombineCopy(MachineInstr &MI) { - unsigned DstReg = MI.getOperand(0).getReg(); - unsigned SrcReg = MI.getOperand(1).getReg(); + Register DstReg = MI.getOperand(0).getReg(); + Register SrcReg = MI.getOperand(1).getReg(); MI.eraseFromParent(); replaceRegWith(MRI, DstReg, SrcReg); } +bool CombinerHelper::tryCombineConcatVectors(MachineInstr &MI) { + bool IsUndef = false; + SmallVector Ops; + if (matchCombineConcatVectors(MI, IsUndef, Ops)) { + applyCombineConcatVectors(MI, IsUndef, Ops); + return true; + } + return false; +} + +bool CombinerHelper::matchCombineConcatVectors(MachineInstr &MI, bool &IsUndef, + SmallVectorImpl &Ops) { + assert(MI.getOpcode() == TargetOpcode::G_CONCAT_VECTORS && + "Invalid instruction"); + IsUndef = true; + MachineInstr *Undef = nullptr; + + // Walk over all the operands of concat vectors and check if they are + // build_vector themselves or undef. + // Then collect their operands in Ops. + for (const MachineOperand &MO : MI.operands()) { + // Skip the instruction definition. + if (MO.isDef()) + continue; + Register Reg = MO.getReg(); + MachineInstr *Def = MRI.getVRegDef(Reg); + assert(Def && "Operand not defined"); + switch (Def->getOpcode()) { + case TargetOpcode::G_BUILD_VECTOR: + IsUndef = false; + // Remember the operands of the build_vector to fold + // them into the yet-to-build flattened concat vectors. + for (const MachineOperand &BuildVecMO : Def->operands()) { + // Skip the definition. + if (BuildVecMO.isDef()) + continue; + Ops.push_back(BuildVecMO.getReg()); + } + break; + case TargetOpcode::G_IMPLICIT_DEF: { + LLT OpType = MRI.getType(Reg); + // Keep one undef value for all the undef operands. + if (!Undef) { + Builder.setInsertPt(*MI.getParent(), MI); + Undef = Builder.buildUndef(OpType.getScalarType()); + } + assert(MRI.getType(Undef->getOperand(0).getReg()) == + OpType.getScalarType() && + "All undefs should have the same type"); + // Break the undef vector in as many scalar elements as needed + // for the flattening. + for (unsigned EltIdx = 0, EltEnd = OpType.getNumElements(); + EltIdx != EltEnd; ++EltIdx) + Ops.push_back(Undef->getOperand(0).getReg()); + break; + } + default: + return false; + } + } + return true; +} +void CombinerHelper::applyCombineConcatVectors( + MachineInstr &MI, bool IsUndef, const ArrayRef Ops) { + // We determined that the concat_vectors can be flatten. + // Generate the flattened build_vector. + Register DstReg = MI.getOperand(0).getReg(); + Builder.setInsertPt(*MI.getParent(), MI); + Register NewDstReg = MRI.cloneVirtualRegister(DstReg); + + // Note: IsUndef is sort of redundant. We could have determine it by + // checking that at all Ops are undef. Alternatively, we could have + // generate a build_vector of undefs and rely on another combine to + // clean that up. For now, given we already gather this information + // in tryCombineConcatVectors, just save compile time and issue the + // right thing. 
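// --- Illustrative sketch (not part of the upstream change) ------------------
// The flattening performed by the concat_vectors combine above, modelled on
// plain containers: the operands of each inner build_vector become the
// operands of a single wider build_vector.
#include <vector>
static std::vector<int> flattenConcat(const std::vector<std::vector<int>> &Parts) {
  std::vector<int> Flat;
  for (const auto &Part : Parts)
    Flat.insert(Flat.end(), Part.begin(), Part.end());
  return Flat;
}
// flattenConcat({{1, 2}, {3, 4}}) == {1, 2, 3, 4}
// --- end of sketch ----------------------------------------------------------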
+ if (IsUndef) + Builder.buildUndef(NewDstReg); + else + Builder.buildBuildVector(NewDstReg, Ops); + MI.eraseFromParent(); + replaceRegWith(MRI, DstReg, NewDstReg); +} + +bool CombinerHelper::tryCombineShuffleVector(MachineInstr &MI) { + SmallVector Ops; + if (matchCombineShuffleVector(MI, Ops)) { + applyCombineShuffleVector(MI, Ops); + return true; + } + return false; +} + +bool CombinerHelper::matchCombineShuffleVector(MachineInstr &MI, + SmallVectorImpl &Ops) { + assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR && + "Invalid instruction kind"); + LLT DstType = MRI.getType(MI.getOperand(0).getReg()); + Register Src1 = MI.getOperand(1).getReg(); + LLT SrcType = MRI.getType(Src1); + unsigned DstNumElts = DstType.getNumElements(); + unsigned SrcNumElts = SrcType.getNumElements(); + + // If the resulting vector is smaller than the size of the source + // vectors being concatenated, we won't be able to replace the + // shuffle vector into a concat_vectors. + // + // Note: We may still be able to produce a concat_vectors fed by + // extract_vector_elt and so on. It is less clear that would + // be better though, so don't bother for now. + if (DstNumElts < 2 * SrcNumElts) + return false; + + // Check that the shuffle mask can be broken evenly between the + // different sources. + if (DstNumElts % SrcNumElts != 0) + return false; + + // Mask length is a multiple of the source vector length. + // Check if the shuffle is some kind of concatenation of the input + // vectors. + unsigned NumConcat = DstNumElts / SrcNumElts; + SmallVector ConcatSrcs(NumConcat, -1); + SmallVector Mask; + ShuffleVectorInst::getShuffleMask(MI.getOperand(3).getShuffleMask(), Mask); + for (unsigned i = 0; i != DstNumElts; ++i) { + int Idx = Mask[i]; + // Undef value. + if (Idx < 0) + continue; + // Ensure the indices in each SrcType sized piece are sequential and that + // the same source is used for the whole piece. + if ((Idx % SrcNumElts != (i % SrcNumElts)) || + (ConcatSrcs[i / SrcNumElts] >= 0 && + ConcatSrcs[i / SrcNumElts] != (int)(Idx / SrcNumElts))) + return false; + // Remember which source this index came from. + ConcatSrcs[i / SrcNumElts] = Idx / SrcNumElts; + } + + // The shuffle is concatenating multiple vectors together. + // Collect the different operands for that. + Register UndefReg; + Register Src2 = MI.getOperand(2).getReg(); + for (auto Src : ConcatSrcs) { + if (Src < 0) { + if (!UndefReg) { + Builder.setInsertPt(*MI.getParent(), MI); + UndefReg = Builder.buildUndef(SrcType).getReg(0); + } + Ops.push_back(UndefReg); + } else if (Src == 0) + Ops.push_back(Src1); + else + Ops.push_back(Src2); + } + return true; +} + +void CombinerHelper::applyCombineShuffleVector(MachineInstr &MI, + const ArrayRef Ops) { + Register DstReg = MI.getOperand(0).getReg(); + Builder.setInsertPt(*MI.getParent(), MI); + Register NewDstReg = MRI.cloneVirtualRegister(DstReg); + + Builder.buildConcatVectors(NewDstReg, Ops); + + MI.eraseFromParent(); + replaceRegWith(MRI, DstReg, NewDstReg); +} + namespace { /// Select a preference between two uses. CurrentUse is the current preference @@ -279,7 +467,7 @@ void CombinerHelper::applyCombineExtendingLoads(MachineInstr &MI, // up the type and extend so that it uses the preferred use. 
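// --- Illustrative sketch (not part of the upstream change) ------------------
// The mask test performed in matchCombineShuffleVector above, on a plain
// std::vector: a shuffle is a concatenation when every SrcNumElts-wide piece
// of the mask is sequential and drawn from a single source (negative entries
// are undef lanes).
#include <vector>
static bool isConcatMask(const std::vector<int> &Mask, int SrcNumElts) {
  if ((int)Mask.size() < 2 * SrcNumElts || (int)Mask.size() % SrcNumElts != 0)
    return false;
  std::vector<int> ConcatSrcs(Mask.size() / SrcNumElts, -1);
  for (int i = 0, e = (int)Mask.size(); i != e; ++i) {
    int Idx = Mask[i];
    if (Idx < 0)
      continue;                  // undef lane, matches anything
    if (Idx % SrcNumElts != i % SrcNumElts)
      return false;              // lanes within a piece must stay in order
    int &Src = ConcatSrcs[i / SrcNumElts];
    if (Src >= 0 && Src != Idx / SrcNumElts)
      return false;              // a piece may not mix two sources
    Src = Idx / SrcNumElts;
  }
  return true;
}
// isConcatMask({0, 1, 2, 3, 4, 5, 6, 7}, 4) == true (src1 followed by src2).
// --- end of sketch ----------------------------------------------------------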
if (UseMI->getOpcode() == Preferred.ExtendOpcode || UseMI->getOpcode() == TargetOpcode::G_ANYEXT) { - unsigned UseDstReg = UseMI->getOperand(0).getReg(); + Register UseDstReg = UseMI->getOperand(0).getReg(); MachineOperand &UseSrcMO = UseMI->getOperand(1); const LLT &UseDstTy = MRI.getType(UseDstReg); if (UseDstReg != ChosenDstReg) { @@ -342,8 +530,212 @@ void CombinerHelper::applyCombineExtendingLoads(MachineInstr &MI, Observer.changedInstr(MI); } -bool CombinerHelper::matchCombineBr(MachineInstr &MI) { - assert(MI.getOpcode() == TargetOpcode::G_BR && "Expected a G_BR"); +bool CombinerHelper::isPredecessor(MachineInstr &DefMI, MachineInstr &UseMI) { + assert(DefMI.getParent() == UseMI.getParent()); + if (&DefMI == &UseMI) + return false; + + // Loop through the basic block until we find one of the instructions. + MachineBasicBlock::const_iterator I = DefMI.getParent()->begin(); + for (; &*I != &DefMI && &*I != &UseMI; ++I) + return &*I == &DefMI; + + llvm_unreachable("Block must contain instructions"); +} + +bool CombinerHelper::dominates(MachineInstr &DefMI, MachineInstr &UseMI) { + if (MDT) + return MDT->dominates(&DefMI, &UseMI); + else if (DefMI.getParent() != UseMI.getParent()) + return false; + + return isPredecessor(DefMI, UseMI); +} + +bool CombinerHelper::findPostIndexCandidate(MachineInstr &MI, Register &Addr, + Register &Base, Register &Offset) { + auto &MF = *MI.getParent()->getParent(); + const auto &TLI = *MF.getSubtarget().getTargetLowering(); + +#ifndef NDEBUG + unsigned Opcode = MI.getOpcode(); + assert(Opcode == TargetOpcode::G_LOAD || Opcode == TargetOpcode::G_SEXTLOAD || + Opcode == TargetOpcode::G_ZEXTLOAD || Opcode == TargetOpcode::G_STORE); +#endif + + Base = MI.getOperand(1).getReg(); + MachineInstr *BaseDef = MRI.getUniqueVRegDef(Base); + if (BaseDef && BaseDef->getOpcode() == TargetOpcode::G_FRAME_INDEX) + return false; + + LLVM_DEBUG(dbgs() << "Searching for post-indexing opportunity for: " << MI); + + for (auto &Use : MRI.use_instructions(Base)) { + if (Use.getOpcode() != TargetOpcode::G_GEP) + continue; + + Offset = Use.getOperand(2).getReg(); + if (!ForceLegalIndexing && + !TLI.isIndexingLegal(MI, Base, Offset, /*IsPre*/ false, MRI)) { + LLVM_DEBUG(dbgs() << " Ignoring candidate with illegal addrmode: " + << Use); + continue; + } + + // Make sure the offset calculation is before the potentially indexed op. + // FIXME: we really care about dependency here. The offset calculation might + // be movable. + MachineInstr *OffsetDef = MRI.getUniqueVRegDef(Offset); + if (!OffsetDef || !dominates(*OffsetDef, MI)) { + LLVM_DEBUG(dbgs() << " Ignoring candidate with offset after mem-op: " + << Use); + continue; + } + + // FIXME: check whether all uses of Base are load/store with foldable + // addressing modes. If so, using the normal addr-modes is better than + // forming an indexed one. 
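// --- Illustrative sketch (not part of the upstream change) ------------------
// The source-level shape findPostIndexCandidate looks for: a load or store
// followed by a G_GEP that advances the same base pointer. Targets such as
// AArch64 can fold the pair into a single post-indexed access, e.g.
// "ldr x0, [x1], #8".
static long loadThenAdvance(long *&Ptr) {
  long Val = *Ptr; // the memory operation
  Ptr += 1;        // the G_GEP that becomes the post-increment
  return Val;
}
// --- end of sketch ----------------------------------------------------------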
+ + bool MemOpDominatesAddrUses = true; + for (auto &GEPUse : MRI.use_instructions(Use.getOperand(0).getReg())) { + if (!dominates(MI, GEPUse)) { + MemOpDominatesAddrUses = false; + break; + } + } + + if (!MemOpDominatesAddrUses) { + LLVM_DEBUG( + dbgs() << " Ignoring candidate as memop does not dominate uses: " + << Use); + continue; + } + + LLVM_DEBUG(dbgs() << " Found match: " << Use); + Addr = Use.getOperand(0).getReg(); + return true; + } + + return false; +} + +bool CombinerHelper::findPreIndexCandidate(MachineInstr &MI, Register &Addr, + Register &Base, Register &Offset) { + auto &MF = *MI.getParent()->getParent(); + const auto &TLI = *MF.getSubtarget().getTargetLowering(); + +#ifndef NDEBUG + unsigned Opcode = MI.getOpcode(); + assert(Opcode == TargetOpcode::G_LOAD || Opcode == TargetOpcode::G_SEXTLOAD || + Opcode == TargetOpcode::G_ZEXTLOAD || Opcode == TargetOpcode::G_STORE); +#endif + + Addr = MI.getOperand(1).getReg(); + MachineInstr *AddrDef = getOpcodeDef(TargetOpcode::G_GEP, Addr, MRI); + if (!AddrDef || MRI.hasOneUse(Addr)) + return false; + + Base = AddrDef->getOperand(1).getReg(); + Offset = AddrDef->getOperand(2).getReg(); + + LLVM_DEBUG(dbgs() << "Found potential pre-indexed load_store: " << MI); + + if (!ForceLegalIndexing && + !TLI.isIndexingLegal(MI, Base, Offset, /*IsPre*/ true, MRI)) { + LLVM_DEBUG(dbgs() << " Skipping, not legal for target"); + return false; + } + + MachineInstr *BaseDef = getDefIgnoringCopies(Base, MRI); + if (BaseDef->getOpcode() == TargetOpcode::G_FRAME_INDEX) { + LLVM_DEBUG(dbgs() << " Skipping, frame index would need copy anyway."); + return false; + } + + if (MI.getOpcode() == TargetOpcode::G_STORE) { + // Would require a copy. + if (Base == MI.getOperand(0).getReg()) { + LLVM_DEBUG(dbgs() << " Skipping, storing base so need copy anyway."); + return false; + } + + // We're expecting one use of Addr in MI, but it could also be the + // value stored, which isn't actually dominated by the instruction. + if (MI.getOperand(0).getReg() == Addr) { + LLVM_DEBUG(dbgs() << " Skipping, does not dominate all addr uses"); + return false; + } + } + + // FIXME: check whether all uses of the base pointer are constant GEPs. That + // might allow us to end base's liveness here by adjusting the constant. 
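// --- Illustrative sketch (not part of the upstream change) ------------------
// The pre-indexed counterpart handled by findPreIndexCandidate: the address is
// advanced first and the memory operation uses the updated pointer, which maps
// to a single writeback access such as "ldr x0, [x1, #8]!" on AArch64.
static long advanceThenLoad(long *&Ptr) {
  Ptr += 1;    // the G_GEP producing the new address
  return *Ptr; // the load from the updated pointer
}
// --- end of sketch ----------------------------------------------------------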
+ + for (auto &UseMI : MRI.use_instructions(Addr)) { + if (!dominates(MI, UseMI)) { + LLVM_DEBUG(dbgs() << " Skipping, does not dominate all addr uses."); + return false; + } + } + + return true; +} + +bool CombinerHelper::tryCombineIndexedLoadStore(MachineInstr &MI) { + unsigned Opcode = MI.getOpcode(); + if (Opcode != TargetOpcode::G_LOAD && Opcode != TargetOpcode::G_SEXTLOAD && + Opcode != TargetOpcode::G_ZEXTLOAD && Opcode != TargetOpcode::G_STORE) + return false; + + bool IsStore = Opcode == TargetOpcode::G_STORE; + Register Addr, Base, Offset; + bool IsPre = findPreIndexCandidate(MI, Addr, Base, Offset); + if (!IsPre && !findPostIndexCandidate(MI, Addr, Base, Offset)) + return false; + + + unsigned NewOpcode; + switch (Opcode) { + case TargetOpcode::G_LOAD: + NewOpcode = TargetOpcode::G_INDEXED_LOAD; + break; + case TargetOpcode::G_SEXTLOAD: + NewOpcode = TargetOpcode::G_INDEXED_SEXTLOAD; + break; + case TargetOpcode::G_ZEXTLOAD: + NewOpcode = TargetOpcode::G_INDEXED_ZEXTLOAD; + break; + case TargetOpcode::G_STORE: + NewOpcode = TargetOpcode::G_INDEXED_STORE; + break; + default: + llvm_unreachable("Unknown load/store opcode"); + } + + MachineInstr &AddrDef = *MRI.getUniqueVRegDef(Addr); + MachineIRBuilder MIRBuilder(MI); + auto MIB = MIRBuilder.buildInstr(NewOpcode); + if (IsStore) { + MIB.addDef(Addr); + MIB.addUse(MI.getOperand(0).getReg()); + } else { + MIB.addDef(MI.getOperand(0).getReg()); + MIB.addDef(Addr); + } + + MIB.addUse(Base); + MIB.addUse(Offset); + MIB.addImm(IsPre); + MI.eraseFromParent(); + AddrDef.eraseFromParent(); + + LLVM_DEBUG(dbgs() << " Combinined to indexed operation"); + return true; +} + +bool CombinerHelper::matchElideBrByInvertingCond(MachineInstr &MI) { + if (MI.getOpcode() != TargetOpcode::G_BR) + return false; + // Try to match the following: // bb1: // %c(s32) = G_ICMP pred, %a, %b @@ -380,9 +772,14 @@ bool CombinerHelper::matchCombineBr(MachineInstr &MI) { return true; } -bool CombinerHelper::tryCombineBr(MachineInstr &MI) { - if (!matchCombineBr(MI)) +bool CombinerHelper::tryElideBrByInvertingCond(MachineInstr &MI) { + if (!matchElideBrByInvertingCond(MI)) return false; + applyElideBrByInvertingCond(MI); + return true; +} + +void CombinerHelper::applyElideBrByInvertingCond(MachineInstr &MI) { MachineBasicBlock *BrTarget = MI.getOperand(0).getMBB(); MachineBasicBlock::iterator BrIt(MI); MachineInstr *BrCond = &*std::prev(BrIt); @@ -401,11 +798,509 @@ bool CombinerHelper::tryCombineBr(MachineInstr &MI) { BrCond->getOperand(1).setMBB(BrTarget); Observer.changedInstr(*BrCond); MI.eraseFromParent(); +} + +static bool shouldLowerMemFuncForSize(const MachineFunction &MF) { + // On Darwin, -Os means optimize for size without hurting performance, so + // only really optimize for size when -Oz (MinSize) is used. + if (MF.getTarget().getTargetTriple().isOSDarwin()) + return MF.getFunction().hasMinSize(); + return MF.getFunction().hasOptSize(); +} + +// Returns a list of types to use for memory op lowering in MemOps. A partial +// port of findOptimalMemOpLowering in TargetLowering. +static bool findGISelOptimalMemOpLowering( + std::vector &MemOps, unsigned Limit, uint64_t Size, unsigned DstAlign, + unsigned SrcAlign, bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc, + bool AllowOverlap, unsigned DstAS, unsigned SrcAS, + const AttributeList &FuncAttributes, const TargetLowering &TLI) { + // If 'SrcAlign' is zero, that means the memory operation does not need to + // load the value, i.e. memset or memcpy from constant string. 
Otherwise, + // it's the inferred alignment of the source. 'DstAlign', on the other hand, + // is the specified alignment of the memory operation. If it is zero, that + // means it's possible to change the alignment of the destination. + // 'MemcpyStrSrc' indicates whether the memcpy source is constant so it does + // not need to be loaded. + if (SrcAlign != 0 && SrcAlign < DstAlign) + return false; + + LLT Ty = TLI.getOptimalMemOpLLT(Size, DstAlign, SrcAlign, IsMemset, + ZeroMemset, MemcpyStrSrc, FuncAttributes); + + if (Ty == LLT()) { + // Use the largest scalar type whose alignment constraints are satisfied. + // We only need to check DstAlign here as SrcAlign is always greater or + // equal to DstAlign (or zero). + Ty = LLT::scalar(64); + while (DstAlign && DstAlign < Ty.getSizeInBytes() && + !TLI.allowsMisalignedMemoryAccesses(Ty, DstAS, DstAlign)) + Ty = LLT::scalar(Ty.getSizeInBytes()); + assert(Ty.getSizeInBits() > 0 && "Could not find valid type"); + // FIXME: check for the largest legal type we can load/store to. + } + + unsigned NumMemOps = 0; + while (Size != 0) { + unsigned TySize = Ty.getSizeInBytes(); + while (TySize > Size) { + // For now, only use non-vector load / store's for the left-over pieces. + LLT NewTy = Ty; + // FIXME: check for mem op safety and legality of the types. Not all of + // SDAGisms map cleanly to GISel concepts. + if (NewTy.isVector()) + NewTy = NewTy.getSizeInBits() > 64 ? LLT::scalar(64) : LLT::scalar(32); + NewTy = LLT::scalar(PowerOf2Floor(NewTy.getSizeInBits() - 1)); + unsigned NewTySize = NewTy.getSizeInBytes(); + assert(NewTySize > 0 && "Could not find appropriate type"); + + // If the new LLT cannot cover all of the remaining bits, then consider + // issuing a (or a pair of) unaligned and overlapping load / store. + bool Fast; + // Need to get a VT equivalent for allowMisalignedMemoryAccesses(). + MVT VT = getMVTForLLT(Ty); + if (NumMemOps && AllowOverlap && NewTySize < Size && + TLI.allowsMisalignedMemoryAccesses( + VT, DstAS, DstAlign, MachineMemOperand::MONone, &Fast) && + Fast) + TySize = Size; + else { + Ty = NewTy; + TySize = NewTySize; + } + } + + if (++NumMemOps > Limit) + return false; + + MemOps.push_back(Ty); + Size -= TySize; + } + + return true; +} + +static Type *getTypeForLLT(LLT Ty, LLVMContext &C) { + if (Ty.isVector()) + return VectorType::get(IntegerType::get(C, Ty.getScalarSizeInBits()), + Ty.getNumElements()); + return IntegerType::get(C, Ty.getSizeInBits()); +} + +// Get a vectorized representation of the memset value operand, GISel edition. +static Register getMemsetValue(Register Val, LLT Ty, MachineIRBuilder &MIB) { + MachineRegisterInfo &MRI = *MIB.getMRI(); + unsigned NumBits = Ty.getScalarSizeInBits(); + auto ValVRegAndVal = getConstantVRegValWithLookThrough(Val, MRI); + if (!Ty.isVector() && ValVRegAndVal) { + unsigned KnownVal = ValVRegAndVal->Value; + APInt Scalar = APInt(8, KnownVal); + APInt SplatVal = APInt::getSplat(NumBits, Scalar); + return MIB.buildConstant(Ty, SplatVal).getReg(0); + } + // FIXME: for vector types create a G_BUILD_VECTOR. + if (Ty.isVector()) + return Register(); + + // Extend the byte value to the larger type, and then multiply by a magic + // value 0x010101... in order to replicate it across every byte. 
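// --- Illustrative sketch (not part of the upstream change) ------------------
// The "magic multiply" described in the comment above, shown for an 8-bit fill
// value widened to a 32-bit store type.
#include <cstdint>
static uint32_t splatByteTo32(uint8_t Byte) {
  return static_cast<uint32_t>(Byte) * 0x01010101u; // copies Byte into every byte
}
// splatByteTo32(0xAB) == 0xABABABABu
// --- end of sketch ----------------------------------------------------------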
+ LLT ExtType = Ty.getScalarType(); + auto ZExt = MIB.buildZExtOrTrunc(ExtType, Val); + if (NumBits > 8) { + APInt Magic = APInt::getSplat(NumBits, APInt(8, 0x01)); + auto MagicMI = MIB.buildConstant(ExtType, Magic); + Val = MIB.buildMul(ExtType, ZExt, MagicMI).getReg(0); + } + + assert(ExtType == Ty && "Vector memset value type not supported yet"); + return Val; +} + +bool CombinerHelper::optimizeMemset(MachineInstr &MI, Register Dst, Register Val, + unsigned KnownLen, unsigned Align, + bool IsVolatile) { + auto &MF = *MI.getParent()->getParent(); + const auto &TLI = *MF.getSubtarget().getTargetLowering(); + auto &DL = MF.getDataLayout(); + LLVMContext &C = MF.getFunction().getContext(); + + assert(KnownLen != 0 && "Have a zero length memset length!"); + + bool DstAlignCanChange = false; + MachineFrameInfo &MFI = MF.getFrameInfo(); + bool OptSize = shouldLowerMemFuncForSize(MF); + + MachineInstr *FIDef = getOpcodeDef(TargetOpcode::G_FRAME_INDEX, Dst, MRI); + if (FIDef && !MFI.isFixedObjectIndex(FIDef->getOperand(1).getIndex())) + DstAlignCanChange = true; + + unsigned Limit = TLI.getMaxStoresPerMemset(OptSize); + std::vector MemOps; + + const auto &DstMMO = **MI.memoperands_begin(); + MachinePointerInfo DstPtrInfo = DstMMO.getPointerInfo(); + + auto ValVRegAndVal = getConstantVRegValWithLookThrough(Val, MRI); + bool IsZeroVal = ValVRegAndVal && ValVRegAndVal->Value == 0; + + if (!findGISelOptimalMemOpLowering( + MemOps, Limit, KnownLen, (DstAlignCanChange ? 0 : Align), 0, + /*IsMemset=*/true, + /*ZeroMemset=*/IsZeroVal, /*MemcpyStrSrc=*/false, + /*AllowOverlap=*/!IsVolatile, DstPtrInfo.getAddrSpace(), ~0u, + MF.getFunction().getAttributes(), TLI)) + return false; + + if (DstAlignCanChange) { + // Get an estimate of the type from the LLT. + Type *IRTy = getTypeForLLT(MemOps[0], C); + unsigned NewAlign = (unsigned)DL.getABITypeAlignment(IRTy); + if (NewAlign > Align) { + Align = NewAlign; + unsigned FI = FIDef->getOperand(1).getIndex(); + // Give the stack frame object a larger alignment if needed. + if (MFI.getObjectAlignment(FI) < Align) + MFI.setObjectAlignment(FI, Align); + } + } + + MachineIRBuilder MIB(MI); + // Find the largest store and generate the bit pattern for it. + LLT LargestTy = MemOps[0]; + for (unsigned i = 1; i < MemOps.size(); i++) + if (MemOps[i].getSizeInBits() > LargestTy.getSizeInBits()) + LargestTy = MemOps[i]; + + // The memset stored value is always defined as an s8, so in order to make it + // work with larger store types we need to repeat the bit pattern across the + // wider type. + Register MemSetValue = getMemsetValue(Val, LargestTy, MIB); + + if (!MemSetValue) + return false; + + // Generate the stores. For each store type in the list, we generate the + // matching store of that type to the destination address. + LLT PtrTy = MRI.getType(Dst); + unsigned DstOff = 0; + unsigned Size = KnownLen; + for (unsigned I = 0; I < MemOps.size(); I++) { + LLT Ty = MemOps[I]; + unsigned TySize = Ty.getSizeInBytes(); + if (TySize > Size) { + // Issuing an unaligned load / store pair that overlaps with the previous + // pair. Adjust the offset accordingly. + assert(I == MemOps.size() - 1 && I != 0); + DstOff -= TySize - Size; + } + + // If this store is smaller than the largest store see whether we can get + // the smaller value for free with a truncate. 
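// --- Illustrative sketch (not part of the upstream change) ------------------
// A simplified model of how a known length is broken into store sizes, in the
// spirit of the MemOps list walked above. Real lowering also considers
// alignment, type legality and overlapping accesses, which are ignored here;
// MaxBytes is assumed to be a power of two >= 1.
#include <cstdint>
#include <vector>
static std::vector<unsigned> greedyStoreSizes(uint64_t Size, unsigned MaxBytes) {
  std::vector<unsigned> Sizes;
  for (unsigned Bytes = MaxBytes; Size != 0;) {
    if (Bytes <= Size) {
      Sizes.push_back(Bytes);
      Size -= Bytes;
    } else {
      Bytes /= 2; // fall back to the next smaller power of two
    }
  }
  return Sizes;
}
// greedyStoreSizes(13, 8) == {8, 4, 1}: one 8-byte, one 4-byte and one 1-byte store.
// --- end of sketch ----------------------------------------------------------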
+ Register Value = MemSetValue; + if (Ty.getSizeInBits() < LargestTy.getSizeInBits()) { + MVT VT = getMVTForLLT(Ty); + MVT LargestVT = getMVTForLLT(LargestTy); + if (!LargestTy.isVector() && !Ty.isVector() && + TLI.isTruncateFree(LargestVT, VT)) + Value = MIB.buildTrunc(Ty, MemSetValue).getReg(0); + else + Value = getMemsetValue(Val, Ty, MIB); + if (!Value) + return false; + } + + auto *StoreMMO = + MF.getMachineMemOperand(&DstMMO, DstOff, Ty.getSizeInBytes()); + + Register Ptr = Dst; + if (DstOff != 0) { + auto Offset = + MIB.buildConstant(LLT::scalar(PtrTy.getSizeInBits()), DstOff); + Ptr = MIB.buildGEP(PtrTy, Dst, Offset).getReg(0); + } + + MIB.buildStore(Value, Ptr, *StoreMMO); + DstOff += Ty.getSizeInBytes(); + Size -= TySize; + } + + MI.eraseFromParent(); + return true; +} + + +bool CombinerHelper::optimizeMemcpy(MachineInstr &MI, Register Dst, + Register Src, unsigned KnownLen, + unsigned DstAlign, unsigned SrcAlign, + bool IsVolatile) { + auto &MF = *MI.getParent()->getParent(); + const auto &TLI = *MF.getSubtarget().getTargetLowering(); + auto &DL = MF.getDataLayout(); + LLVMContext &C = MF.getFunction().getContext(); + + assert(KnownLen != 0 && "Have a zero length memcpy length!"); + + bool DstAlignCanChange = false; + MachineFrameInfo &MFI = MF.getFrameInfo(); + bool OptSize = shouldLowerMemFuncForSize(MF); + unsigned Alignment = MinAlign(DstAlign, SrcAlign); + + MachineInstr *FIDef = getOpcodeDef(TargetOpcode::G_FRAME_INDEX, Dst, MRI); + if (FIDef && !MFI.isFixedObjectIndex(FIDef->getOperand(1).getIndex())) + DstAlignCanChange = true; + + // FIXME: infer better src pointer alignment like SelectionDAG does here. + // FIXME: also use the equivalent of isMemSrcFromConstant and alwaysinlining + // if the memcpy is in a tail call position. + + unsigned Limit = TLI.getMaxStoresPerMemcpy(OptSize); + std::vector MemOps; + + const auto &DstMMO = **MI.memoperands_begin(); + const auto &SrcMMO = **std::next(MI.memoperands_begin()); + MachinePointerInfo DstPtrInfo = DstMMO.getPointerInfo(); + MachinePointerInfo SrcPtrInfo = SrcMMO.getPointerInfo(); + + if (!findGISelOptimalMemOpLowering( + MemOps, Limit, KnownLen, (DstAlignCanChange ? 0 : Alignment), + SrcAlign, + /*IsMemset=*/false, + /*ZeroMemset=*/false, /*MemcpyStrSrc=*/false, + /*AllowOverlap=*/!IsVolatile, DstPtrInfo.getAddrSpace(), + SrcPtrInfo.getAddrSpace(), MF.getFunction().getAttributes(), TLI)) + return false; + + if (DstAlignCanChange) { + // Get an estimate of the type from the LLT. + Type *IRTy = getTypeForLLT(MemOps[0], C); + unsigned NewAlign = (unsigned)DL.getABITypeAlignment(IRTy); + + // Don't promote to an alignment that would require dynamic stack + // realignment. + const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); + if (!TRI->needsStackRealignment(MF)) + while (NewAlign > Alignment && + DL.exceedsNaturalStackAlignment(Align(NewAlign))) + NewAlign /= 2; + + if (NewAlign > Alignment) { + Alignment = NewAlign; + unsigned FI = FIDef->getOperand(1).getIndex(); + // Give the stack frame object a larger alignment if needed. + if (MFI.getObjectAlignment(FI) < Alignment) + MFI.setObjectAlignment(FI, Alignment); + } + } + + LLVM_DEBUG(dbgs() << "Inlining memcpy: " << MI << " into loads & stores\n"); + + MachineIRBuilder MIB(MI); + // Now we need to emit a pair of load and stores for each of the types we've + // collected. I.e. for each type, generate a load from the source pointer of + // that type width, and then generate a corresponding store to the dest buffer + // of that value loaded. 
This can result in a sequence of loads and stores + // mixed types, depending on what the target specifies as good types to use. + unsigned CurrOffset = 0; + LLT PtrTy = MRI.getType(Src); + unsigned Size = KnownLen; + for (auto CopyTy : MemOps) { + // Issuing an unaligned load / store pair that overlaps with the previous + // pair. Adjust the offset accordingly. + if (CopyTy.getSizeInBytes() > Size) + CurrOffset -= CopyTy.getSizeInBytes() - Size; + + // Construct MMOs for the accesses. + auto *LoadMMO = + MF.getMachineMemOperand(&SrcMMO, CurrOffset, CopyTy.getSizeInBytes()); + auto *StoreMMO = + MF.getMachineMemOperand(&DstMMO, CurrOffset, CopyTy.getSizeInBytes()); + + // Create the load. + Register LoadPtr = Src; + Register Offset; + if (CurrOffset != 0) { + Offset = MIB.buildConstant(LLT::scalar(PtrTy.getSizeInBits()), CurrOffset) + .getReg(0); + LoadPtr = MIB.buildGEP(PtrTy, Src, Offset).getReg(0); + } + auto LdVal = MIB.buildLoad(CopyTy, LoadPtr, *LoadMMO); + + // Create the store. + Register StorePtr = + CurrOffset == 0 ? Dst : MIB.buildGEP(PtrTy, Dst, Offset).getReg(0); + MIB.buildStore(LdVal, StorePtr, *StoreMMO); + CurrOffset += CopyTy.getSizeInBytes(); + Size -= CopyTy.getSizeInBytes(); + } + + MI.eraseFromParent(); return true; } +bool CombinerHelper::optimizeMemmove(MachineInstr &MI, Register Dst, + Register Src, unsigned KnownLen, + unsigned DstAlign, unsigned SrcAlign, + bool IsVolatile) { + auto &MF = *MI.getParent()->getParent(); + const auto &TLI = *MF.getSubtarget().getTargetLowering(); + auto &DL = MF.getDataLayout(); + LLVMContext &C = MF.getFunction().getContext(); + + assert(KnownLen != 0 && "Have a zero length memmove length!"); + + bool DstAlignCanChange = false; + MachineFrameInfo &MFI = MF.getFrameInfo(); + bool OptSize = shouldLowerMemFuncForSize(MF); + unsigned Alignment = MinAlign(DstAlign, SrcAlign); + + MachineInstr *FIDef = getOpcodeDef(TargetOpcode::G_FRAME_INDEX, Dst, MRI); + if (FIDef && !MFI.isFixedObjectIndex(FIDef->getOperand(1).getIndex())) + DstAlignCanChange = true; + + unsigned Limit = TLI.getMaxStoresPerMemmove(OptSize); + std::vector MemOps; + + const auto &DstMMO = **MI.memoperands_begin(); + const auto &SrcMMO = **std::next(MI.memoperands_begin()); + MachinePointerInfo DstPtrInfo = DstMMO.getPointerInfo(); + MachinePointerInfo SrcPtrInfo = SrcMMO.getPointerInfo(); + + // FIXME: SelectionDAG always passes false for 'AllowOverlap', apparently due + // to a bug in it's findOptimalMemOpLowering implementation. For now do the + // same thing here. + if (!findGISelOptimalMemOpLowering( + MemOps, Limit, KnownLen, (DstAlignCanChange ? 0 : Alignment), + SrcAlign, + /*IsMemset=*/false, + /*ZeroMemset=*/false, /*MemcpyStrSrc=*/false, + /*AllowOverlap=*/false, DstPtrInfo.getAddrSpace(), + SrcPtrInfo.getAddrSpace(), MF.getFunction().getAttributes(), TLI)) + return false; + + if (DstAlignCanChange) { + // Get an estimate of the type from the LLT. + Type *IRTy = getTypeForLLT(MemOps[0], C); + unsigned NewAlign = (unsigned)DL.getABITypeAlignment(IRTy); + + // Don't promote to an alignment that would require dynamic stack + // realignment. + const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); + if (!TRI->needsStackRealignment(MF)) + while (NewAlign > Alignment && + DL.exceedsNaturalStackAlignment(Align(NewAlign))) + NewAlign /= 2; + + if (NewAlign > Alignment) { + Alignment = NewAlign; + unsigned FI = FIDef->getOperand(1).getIndex(); + // Give the stack frame object a larger alignment if needed. 
+ if (MFI.getObjectAlignment(FI) < Alignment) + MFI.setObjectAlignment(FI, Alignment); + } + } + + LLVM_DEBUG(dbgs() << "Inlining memmove: " << MI << " into loads & stores\n"); + + MachineIRBuilder MIB(MI); + // Memmove requires that we perform the loads first before issuing the stores. + // Apart from that, this loop is pretty much doing the same thing as the + // memcpy codegen function. + unsigned CurrOffset = 0; + LLT PtrTy = MRI.getType(Src); + SmallVector LoadVals; + for (auto CopyTy : MemOps) { + // Construct MMO for the load. + auto *LoadMMO = + MF.getMachineMemOperand(&SrcMMO, CurrOffset, CopyTy.getSizeInBytes()); + + // Create the load. + Register LoadPtr = Src; + if (CurrOffset != 0) { + auto Offset = + MIB.buildConstant(LLT::scalar(PtrTy.getSizeInBits()), CurrOffset); + LoadPtr = MIB.buildGEP(PtrTy, Src, Offset).getReg(0); + } + LoadVals.push_back(MIB.buildLoad(CopyTy, LoadPtr, *LoadMMO).getReg(0)); + CurrOffset += CopyTy.getSizeInBytes(); + } + + CurrOffset = 0; + for (unsigned I = 0; I < MemOps.size(); ++I) { + LLT CopyTy = MemOps[I]; + // Now store the values loaded. + auto *StoreMMO = + MF.getMachineMemOperand(&DstMMO, CurrOffset, CopyTy.getSizeInBytes()); + + Register StorePtr = Dst; + if (CurrOffset != 0) { + auto Offset = + MIB.buildConstant(LLT::scalar(PtrTy.getSizeInBits()), CurrOffset); + StorePtr = MIB.buildGEP(PtrTy, Dst, Offset).getReg(0); + } + MIB.buildStore(LoadVals[I], StorePtr, *StoreMMO); + CurrOffset += CopyTy.getSizeInBytes(); + } + MI.eraseFromParent(); + return true; +} + +bool CombinerHelper::tryCombineMemCpyFamily(MachineInstr &MI, unsigned MaxLen) { + // This combine is fairly complex so it's not written with a separate + // matcher function. + assert(MI.getOpcode() == TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS); + Intrinsic::ID ID = (Intrinsic::ID)MI.getIntrinsicID(); + assert((ID == Intrinsic::memcpy || ID == Intrinsic::memmove || + ID == Intrinsic::memset) && + "Expected a memcpy like intrinsic"); + + auto MMOIt = MI.memoperands_begin(); + const MachineMemOperand *MemOp = *MMOIt; + bool IsVolatile = MemOp->isVolatile(); + // Don't try to optimize volatile. + if (IsVolatile) + return false; + + unsigned DstAlign = MemOp->getBaseAlignment(); + unsigned SrcAlign = 0; + Register Dst = MI.getOperand(1).getReg(); + Register Src = MI.getOperand(2).getReg(); + Register Len = MI.getOperand(3).getReg(); + + if (ID != Intrinsic::memset) { + assert(MMOIt != MI.memoperands_end() && "Expected a second MMO on MI"); + MemOp = *(++MMOIt); + SrcAlign = MemOp->getBaseAlignment(); + } + + // See if this is a constant length copy + auto LenVRegAndVal = getConstantVRegValWithLookThrough(Len, MRI); + if (!LenVRegAndVal) + return false; // Leave it to the legalizer to lower it to a libcall. 
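// --- Illustrative sketch (not part of the upstream change) ------------------
// The length gating applied above, on plain values: only a known, non-zero
// length no larger than the optional cap is inlined; a zero length lets the
// intrinsic be erased outright, and anything else is left for a libcall.
#include <cstdint>
#include <optional>
enum class MemOpAction { Erase, Inline, Libcall };
static MemOpAction classifyMemOp(std::optional<uint64_t> KnownLen,
                                 uint64_t MaxLen) {
  if (!KnownLen)
    return MemOpAction::Libcall; // length is not a compile-time constant
  if (*KnownLen == 0)
    return MemOpAction::Erase;   // nothing to copy or set
  if (MaxLen && *KnownLen > MaxLen)
    return MemOpAction::Libcall; // over the inlining threshold
  return MemOpAction::Inline;
}
// --- end of sketch ----------------------------------------------------------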
+ unsigned KnownLen = LenVRegAndVal->Value; + + if (KnownLen == 0) { + MI.eraseFromParent(); + return true; + } + + if (MaxLen && KnownLen > MaxLen) + return false; + + if (ID == Intrinsic::memcpy) + return optimizeMemcpy(MI, Dst, Src, KnownLen, DstAlign, SrcAlign, IsVolatile); + if (ID == Intrinsic::memmove) + return optimizeMemmove(MI, Dst, Src, KnownLen, DstAlign, SrcAlign, IsVolatile); + if (ID == Intrinsic::memset) + return optimizeMemset(MI, Dst, Src, KnownLen, DstAlign, IsVolatile); + return false; +} + bool CombinerHelper::tryCombine(MachineInstr &MI) { if (tryCombineCopy(MI)) return true; - return tryCombineExtendingLoads(MI); + if (tryCombineExtendingLoads(MI)) + return true; + if (tryCombineIndexedLoadStore(MI)) + return true; + return false; } diff --git a/lib/CodeGen/GlobalISel/GISelKnownBits.cpp b/lib/CodeGen/GlobalISel/GISelKnownBits.cpp new file mode 100644 index 000000000000..be8efa8795f3 --- /dev/null +++ b/lib/CodeGen/GlobalISel/GISelKnownBits.cpp @@ -0,0 +1,383 @@ +//===- lib/CodeGen/GlobalISel/GISelKnownBits.cpp --------------*- C++ *-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// Provides analysis for querying information about KnownBits during GISel +/// passes. +// +//===------------------ +#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/CodeGen/GlobalISel/Utils.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/TargetLowering.h" +#include "llvm/CodeGen/TargetOpcodes.h" + +#define DEBUG_TYPE "gisel-known-bits" + +using namespace llvm; + +char llvm::GISelKnownBitsAnalysis::ID = 0; + +INITIALIZE_PASS_BEGIN(GISelKnownBitsAnalysis, DEBUG_TYPE, + "Analysis for ComputingKnownBits", false, true) +INITIALIZE_PASS_END(GISelKnownBitsAnalysis, DEBUG_TYPE, + "Analysis for ComputingKnownBits", false, true) + +GISelKnownBits::GISelKnownBits(MachineFunction &MF) + : MF(MF), MRI(MF.getRegInfo()), TL(*MF.getSubtarget().getTargetLowering()), + DL(MF.getFunction().getParent()->getDataLayout()) {} + +Align GISelKnownBits::inferAlignmentForFrameIdx(int FrameIdx, int Offset, + const MachineFunction &MF) { + const MachineFrameInfo &MFI = MF.getFrameInfo(); + return commonAlignment(Align(MFI.getObjectAlignment(FrameIdx)), Offset); + // TODO: How to handle cases with Base + Offset? +} + +MaybeAlign GISelKnownBits::inferPtrAlignment(const MachineInstr &MI) { + if (MI.getOpcode() == TargetOpcode::G_FRAME_INDEX) { + int FrameIdx = MI.getOperand(1).getIndex(); + return inferAlignmentForFrameIdx(FrameIdx, 0, *MI.getMF()); + } + return None; +} + +void GISelKnownBits::computeKnownBitsForFrameIndex(Register R, KnownBits &Known, + const APInt &DemandedElts, + unsigned Depth) { + const MachineInstr &MI = *MRI.getVRegDef(R); + computeKnownBitsForAlignment(Known, inferPtrAlignment(MI)); +} + +void GISelKnownBits::computeKnownBitsForAlignment(KnownBits &Known, + MaybeAlign Alignment) { + if (Alignment) + // The low bits are known zero if the pointer is aligned. 
+ Known.Zero.setLowBits(Log2(Alignment)); +} + +KnownBits GISelKnownBits::getKnownBits(MachineInstr &MI) { + return getKnownBits(MI.getOperand(0).getReg()); +} + +KnownBits GISelKnownBits::getKnownBits(Register R) { + KnownBits Known; + LLT Ty = MRI.getType(R); + APInt DemandedElts = + Ty.isVector() ? APInt::getAllOnesValue(Ty.getNumElements()) : APInt(1, 1); + computeKnownBitsImpl(R, Known, DemandedElts); + return Known; +} + +bool GISelKnownBits::signBitIsZero(Register R) { + LLT Ty = MRI.getType(R); + unsigned BitWidth = Ty.getScalarSizeInBits(); + return maskedValueIsZero(R, APInt::getSignMask(BitWidth)); +} + +APInt GISelKnownBits::getKnownZeroes(Register R) { + return getKnownBits(R).Zero; +} + +APInt GISelKnownBits::getKnownOnes(Register R) { return getKnownBits(R).One; } + +void GISelKnownBits::computeKnownBitsImpl(Register R, KnownBits &Known, + const APInt &DemandedElts, + unsigned Depth) { + MachineInstr &MI = *MRI.getVRegDef(R); + unsigned Opcode = MI.getOpcode(); + LLT DstTy = MRI.getType(R); + + // Handle the case where this is called on a register that does not have a + // type constraint (i.e. it has a register class constraint instead). This is + // unlikely to occur except by looking through copies but it is possible for + // the initial register being queried to be in this state. + if (!DstTy.isValid()) { + Known = KnownBits(); + return; + } + + unsigned BitWidth = DstTy.getSizeInBits(); + Known = KnownBits(BitWidth); // Don't know anything + + if (DstTy.isVector()) + return; // TODO: Handle vectors. + + if (Depth == getMaxDepth()) + return; + + if (!DemandedElts) + return; // No demanded elts, better to assume we don't know anything. + + KnownBits Known2; + + switch (Opcode) { + default: + TL.computeKnownBitsForTargetInstr(*this, R, Known, DemandedElts, MRI, + Depth); + break; + case TargetOpcode::COPY: { + MachineOperand Dst = MI.getOperand(0); + MachineOperand Src = MI.getOperand(1); + // Look through trivial copies but don't look through trivial copies of the + // form `%1:(s32) = OP %0:gpr32` known-bits analysis is currently unable to + // determine the bit width of a register class. + // + // We can't use NoSubRegister by name as it's defined by each target but + // it's always defined to be 0 by tablegen. + if (Dst.getSubReg() == 0 /*NoSubRegister*/ && Src.getReg().isVirtual() && + Src.getSubReg() == 0 /*NoSubRegister*/ && + MRI.getType(Src.getReg()).isValid()) { + // Don't increment Depth for this one since we didn't do any work. + computeKnownBitsImpl(Src.getReg(), Known, DemandedElts, Depth); + } + break; + } + case TargetOpcode::G_CONSTANT: { + auto CstVal = getConstantVRegVal(R, MRI); + if (!CstVal) + break; + Known.One = *CstVal; + Known.Zero = ~Known.One; + break; + } + case TargetOpcode::G_FRAME_INDEX: { + computeKnownBitsForFrameIndex(R, Known, DemandedElts); + break; + } + case TargetOpcode::G_SUB: { + // If low bits are known to be zero in both operands, then we know they are + // going to be 0 in the result. Both addition and complement operations + // preserve the low zero bits. 
+ computeKnownBitsImpl(MI.getOperand(1).getReg(), Known2, DemandedElts, + Depth + 1); + unsigned KnownZeroLow = Known2.countMinTrailingZeros(); + if (KnownZeroLow == 0) + break; + computeKnownBitsImpl(MI.getOperand(2).getReg(), Known2, DemandedElts, + Depth + 1); + KnownZeroLow = std::min(KnownZeroLow, Known2.countMinTrailingZeros()); + Known.Zero.setLowBits(KnownZeroLow); + break; + } + case TargetOpcode::G_XOR: { + computeKnownBitsImpl(MI.getOperand(2).getReg(), Known, DemandedElts, + Depth + 1); + computeKnownBitsImpl(MI.getOperand(1).getReg(), Known2, DemandedElts, + Depth + 1); + + // Output known-0 bits are known if clear or set in both the LHS & RHS. + APInt KnownZeroOut = (Known.Zero & Known2.Zero) | (Known.One & Known2.One); + // Output known-1 are known to be set if set in only one of the LHS, RHS. + Known.One = (Known.Zero & Known2.One) | (Known.One & Known2.Zero); + Known.Zero = KnownZeroOut; + break; + } + case TargetOpcode::G_GEP: { + // G_GEP is like G_ADD. FIXME: Is this true for all targets? + LLT Ty = MRI.getType(MI.getOperand(1).getReg()); + if (DL.isNonIntegralAddressSpace(Ty.getAddressSpace())) + break; + LLVM_FALLTHROUGH; + } + case TargetOpcode::G_ADD: { + // Output known-0 bits are known if clear or set in both the low clear bits + // common to both LHS & RHS. For example, 8+(X<<3) is known to have the + // low 3 bits clear. + // Output known-0 bits are also known if the top bits of each input are + // known to be clear. For example, if one input has the top 10 bits clear + // and the other has the top 8 bits clear, we know the top 7 bits of the + // output must be clear. + computeKnownBitsImpl(MI.getOperand(1).getReg(), Known2, DemandedElts, + Depth + 1); + unsigned KnownZeroHigh = Known2.countMinLeadingZeros(); + unsigned KnownZeroLow = Known2.countMinTrailingZeros(); + computeKnownBitsImpl(MI.getOperand(2).getReg(), Known2, DemandedElts, + Depth + 1); + KnownZeroHigh = std::min(KnownZeroHigh, Known2.countMinLeadingZeros()); + KnownZeroLow = std::min(KnownZeroLow, Known2.countMinTrailingZeros()); + Known.Zero.setLowBits(KnownZeroLow); + if (KnownZeroHigh > 1) + Known.Zero.setHighBits(KnownZeroHigh - 1); + break; + } + case TargetOpcode::G_AND: { + // If either the LHS or the RHS are Zero, the result is zero. + computeKnownBitsImpl(MI.getOperand(2).getReg(), Known, DemandedElts, + Depth + 1); + computeKnownBitsImpl(MI.getOperand(1).getReg(), Known2, DemandedElts, + Depth + 1); + + // Output known-1 bits are only known if set in both the LHS & RHS. + Known.One &= Known2.One; + // Output known-0 are known to be clear if zero in either the LHS | RHS. + Known.Zero |= Known2.Zero; + break; + } + case TargetOpcode::G_OR: { + // If either the LHS or the RHS are Zero, the result is zero. + computeKnownBitsImpl(MI.getOperand(2).getReg(), Known, DemandedElts, + Depth + 1); + computeKnownBitsImpl(MI.getOperand(1).getReg(), Known2, DemandedElts, + Depth + 1); + + // Output known-0 bits are only known if clear in both the LHS & RHS. + Known.Zero &= Known2.Zero; + // Output known-1 are known to be set if set in either the LHS | RHS. + Known.One |= Known2.One; + break; + } + case TargetOpcode::G_MUL: { + computeKnownBitsImpl(MI.getOperand(2).getReg(), Known, DemandedElts, + Depth + 1); + computeKnownBitsImpl(MI.getOperand(1).getReg(), Known2, DemandedElts, + Depth + 1); + // If low bits are zero in either operand, output low known-0 bits. + // Also compute a conservative estimate for high known-0 bits. 
+ // More trickiness is possible, but this is sufficient for the + // interesting case of alignment computation. + unsigned TrailZ = + Known.countMinTrailingZeros() + Known2.countMinTrailingZeros(); + unsigned LeadZ = + std::max(Known.countMinLeadingZeros() + Known2.countMinLeadingZeros(), + BitWidth) - + BitWidth; + + Known.resetAll(); + Known.Zero.setLowBits(std::min(TrailZ, BitWidth)); + Known.Zero.setHighBits(std::min(LeadZ, BitWidth)); + break; + } + case TargetOpcode::G_SELECT: { + computeKnownBitsImpl(MI.getOperand(3).getReg(), Known, DemandedElts, + Depth + 1); + // If we don't know any bits, early out. + if (Known.isUnknown()) + break; + computeKnownBitsImpl(MI.getOperand(2).getReg(), Known2, DemandedElts, + Depth + 1); + // Only known if known in both the LHS and RHS. + Known.One &= Known2.One; + Known.Zero &= Known2.Zero; + break; + } + case TargetOpcode::G_FCMP: + case TargetOpcode::G_ICMP: { + if (TL.getBooleanContents(DstTy.isVector(), + Opcode == TargetOpcode::G_FCMP) == + TargetLowering::ZeroOrOneBooleanContent && + BitWidth > 1) + Known.Zero.setBitsFrom(1); + break; + } + case TargetOpcode::G_SEXT: { + computeKnownBitsImpl(MI.getOperand(1).getReg(), Known, DemandedElts, + Depth + 1); + // If the sign bit is known to be zero or one, then sext will extend + // it to the top bits, else it will just zext. + Known = Known.sext(BitWidth); + break; + } + case TargetOpcode::G_ANYEXT: { + computeKnownBitsImpl(MI.getOperand(1).getReg(), Known, DemandedElts, + Depth + 1); + Known = Known.zext(BitWidth, true /* ExtendedBitsAreKnownZero */); + break; + } + case TargetOpcode::G_LOAD: { + if (MI.hasOneMemOperand()) { + const MachineMemOperand *MMO = *MI.memoperands_begin(); + if (const MDNode *Ranges = MMO->getRanges()) { + computeKnownBitsFromRangeMetadata(*Ranges, Known); + } + } + break; + } + case TargetOpcode::G_ZEXTLOAD: { + // Everything above the retrieved bits is zero + if (MI.hasOneMemOperand()) + Known.Zero.setBitsFrom((*MI.memoperands_begin())->getSizeInBits()); + break; + } + case TargetOpcode::G_ASHR: + case TargetOpcode::G_LSHR: + case TargetOpcode::G_SHL: { + KnownBits RHSKnown; + computeKnownBitsImpl(MI.getOperand(2).getReg(), RHSKnown, DemandedElts, + Depth + 1); + if (!RHSKnown.isConstant()) { + LLVM_DEBUG( + MachineInstr *RHSMI = MRI.getVRegDef(MI.getOperand(2).getReg()); + dbgs() << '[' << Depth << "] Shift not known constant: " << *RHSMI); + break; + } + uint64_t Shift = RHSKnown.getConstant().getZExtValue(); + LLVM_DEBUG(dbgs() << '[' << Depth << "] Shift is " << Shift << '\n'); + + computeKnownBitsImpl(MI.getOperand(1).getReg(), Known, DemandedElts, + Depth + 1); + + switch (Opcode) { + case TargetOpcode::G_ASHR: + Known.Zero = Known.Zero.ashr(Shift); + Known.One = Known.One.ashr(Shift); + break; + case TargetOpcode::G_LSHR: + Known.Zero = Known.Zero.lshr(Shift); + Known.One = Known.One.lshr(Shift); + Known.Zero.setBitsFrom(Known.Zero.getBitWidth() - Shift); + break; + case TargetOpcode::G_SHL: + Known.Zero = Known.Zero.shl(Shift); + Known.One = Known.One.shl(Shift); + Known.Zero.setBits(0, Shift); + break; + } + break; + } + case TargetOpcode::G_INTTOPTR: + case TargetOpcode::G_PTRTOINT: + // Fall through and handle them the same as zext/trunc. + LLVM_FALLTHROUGH; + case TargetOpcode::G_ZEXT: + case TargetOpcode::G_TRUNC: { + Register SrcReg = MI.getOperand(1).getReg(); + LLT SrcTy = MRI.getType(SrcReg); + unsigned SrcBitWidth = SrcTy.isPointer() + ? 
DL.getIndexSizeInBits(SrcTy.getAddressSpace()) + : SrcTy.getSizeInBits(); + assert(SrcBitWidth && "SrcBitWidth can't be zero"); + Known = Known.zextOrTrunc(SrcBitWidth, true); + computeKnownBitsImpl(SrcReg, Known, DemandedElts, Depth + 1); + Known = Known.zextOrTrunc(BitWidth, true); + if (BitWidth > SrcBitWidth) + Known.Zero.setBitsFrom(SrcBitWidth); + break; + } + } + + assert(!Known.hasConflict() && "Bits known to be one AND zero?"); + LLVM_DEBUG(dbgs() << "[" << Depth << "] Compute known bits: " << MI << "[" + << Depth << "] Computed for: " << MI << "[" << Depth + << "] Known: 0x" + << (Known.Zero | Known.One).toString(16, false) << "\n" + << "[" << Depth << "] Zero: 0x" + << Known.Zero.toString(16, false) << "\n" + << "[" << Depth << "] One: 0x" + << Known.One.toString(16, false) << "\n"); +} + +void GISelKnownBitsAnalysis::getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesAll(); + MachineFunctionPass::getAnalysisUsage(AU); +} + +bool GISelKnownBitsAnalysis::runOnMachineFunction(MachineFunction &MF) { + return false; +} diff --git a/lib/CodeGen/GlobalISel/IRTranslator.cpp b/lib/CodeGen/GlobalISel/IRTranslator.cpp index 6e99bdbd8264..45cef4aca888 100644 --- a/lib/CodeGen/GlobalISel/IRTranslator.cpp +++ b/lib/CodeGen/GlobalISel/IRTranslator.cpp @@ -32,6 +32,7 @@ #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/StackProtector.h" #include "llvm/CodeGen/TargetFrameLowering.h" +#include "llvm/CodeGen/TargetInstrInfo.h" #include "llvm/CodeGen/TargetLowering.h" #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/CodeGen/TargetRegisterInfo.h" @@ -334,7 +335,7 @@ bool IRTranslator::translateFNeg(const User &U, MachineIRBuilder &MIRBuilder) { bool IRTranslator::translateCompare(const User &U, MachineIRBuilder &MIRBuilder) { - const CmpInst *CI = dyn_cast(&U); + auto *CI = dyn_cast(&U); Register Op0 = getOrCreateVReg(*U.getOperand(0)); Register Op1 = getOrCreateVReg(*U.getOperand(1)); Register Res = getOrCreateVReg(U); @@ -345,11 +346,12 @@ bool IRTranslator::translateCompare(const User &U, MIRBuilder.buildICmp(Pred, Res, Op0, Op1); else if (Pred == CmpInst::FCMP_FALSE) MIRBuilder.buildCopy( - Res, getOrCreateVReg(*Constant::getNullValue(CI->getType()))); + Res, getOrCreateVReg(*Constant::getNullValue(U.getType()))); else if (Pred == CmpInst::FCMP_TRUE) MIRBuilder.buildCopy( - Res, getOrCreateVReg(*Constant::getAllOnesValue(CI->getType()))); + Res, getOrCreateVReg(*Constant::getAllOnesValue(U.getType()))); else { + assert(CI && "Instruction should be CmpInst"); MIRBuilder.buildInstr(TargetOpcode::G_FCMP, {Res}, {Pred, Op0, Op1}, MachineInstr::copyFlagsFromInstruction(*CI)); } @@ -588,8 +590,8 @@ void IRTranslator::emitSwitchCase(SwitchCG::CaseBlock &CB, Register CondRHS = getOrCreateVReg(*CB.CmpRHS); Cond = MIB.buildICmp(CB.PredInfo.Pred, i1Ty, CondLHS, CondRHS).getReg(0); } else { - assert(CB.PredInfo.Pred == CmpInst::ICMP_ULE && - "Can only handle ULE ranges"); + assert(CB.PredInfo.Pred == CmpInst::ICMP_SLE && + "Can only handle SLE ranges"); const APInt& Low = cast(CB.CmpLHS)->getValue(); const APInt& High = cast(CB.CmpRHS)->getValue(); @@ -598,7 +600,7 @@ void IRTranslator::emitSwitchCase(SwitchCG::CaseBlock &CB, if (cast(CB.CmpLHS)->isMinValue(true)) { Register CondRHS = getOrCreateVReg(*CB.CmpRHS); Cond = - MIB.buildICmp(CmpInst::ICMP_ULE, i1Ty, CmpOpReg, CondRHS).getReg(0); + MIB.buildICmp(CmpInst::ICMP_SLE, i1Ty, CmpOpReg, CondRHS).getReg(0); } else { const LLT &CmpTy = MRI->getType(CmpOpReg); auto Sub = MIB.buildSub({CmpTy}, CmpOpReg, CondLHS); @@ 
-728,7 +730,7 @@ bool IRTranslator::lowerSwitchRangeWorkItem(SwitchCG::CaseClusterIt I, MHS = nullptr; } else { // Check I->Low <= Cond <= I->High. - Pred = CmpInst::ICMP_ULE; + Pred = CmpInst::ICMP_SLE; LHS = I->Low; MHS = Cond; RHS = I->High; @@ -879,7 +881,8 @@ bool IRTranslator::translateLoad(const User &U, MachineIRBuilder &MIRBuilder) { return true; } - + const MDNode *Ranges = + Regs.size() == 1 ? LI.getMetadata(LLVMContext::MD_range) : nullptr; for (unsigned i = 0; i < Regs.size(); ++i) { Register Addr; MIRBuilder.materializeGEP(Addr, Base, OffsetTy, Offsets[i] / 8); @@ -888,7 +891,7 @@ bool IRTranslator::translateLoad(const User &U, MachineIRBuilder &MIRBuilder) { unsigned BaseAlign = getMemOpAlignment(LI); auto MMO = MF->getMachineMemOperand( Ptr, Flags, (MRI->getType(Regs[i]).getSizeInBits() + 7) / 8, - MinAlign(BaseAlign, Offsets[i] / 8), AAMDNodes(), nullptr, + MinAlign(BaseAlign, Offsets[i] / 8), AAMDNodes(), Ranges, LI.getSyncScopeID(), LI.getOrdering()); MIRBuilder.buildLoad(Regs[i], Addr, *MMO); } @@ -1075,36 +1078,29 @@ bool IRTranslator::translateGetElementPtr(const User &U, } if (Offset != 0) { - Register NewBaseReg = MRI->createGenericVirtualRegister(PtrTy); LLT OffsetTy = getLLTForType(*OffsetIRTy, *DL); auto OffsetMIB = MIRBuilder.buildConstant({OffsetTy}, Offset); - MIRBuilder.buildGEP(NewBaseReg, BaseReg, OffsetMIB.getReg(0)); - - BaseReg = NewBaseReg; + BaseReg = + MIRBuilder.buildGEP(PtrTy, BaseReg, OffsetMIB.getReg(0)).getReg(0); Offset = 0; } Register IdxReg = getOrCreateVReg(*Idx); - if (MRI->getType(IdxReg) != OffsetTy) { - Register NewIdxReg = MRI->createGenericVirtualRegister(OffsetTy); - MIRBuilder.buildSExtOrTrunc(NewIdxReg, IdxReg); - IdxReg = NewIdxReg; - } + if (MRI->getType(IdxReg) != OffsetTy) + IdxReg = MIRBuilder.buildSExtOrTrunc(OffsetTy, IdxReg).getReg(0); // N = N + Idx * ElementSize; // Avoid doing it for ElementSize of 1. Register GepOffsetReg; if (ElementSize != 1) { - GepOffsetReg = MRI->createGenericVirtualRegister(OffsetTy); auto ElementSizeMIB = MIRBuilder.buildConstant( getLLTForType(*OffsetIRTy, *DL), ElementSize); - MIRBuilder.buildMul(GepOffsetReg, ElementSizeMIB.getReg(0), IdxReg); + GepOffsetReg = + MIRBuilder.buildMul(OffsetTy, ElementSizeMIB, IdxReg).getReg(0); } else GepOffsetReg = IdxReg; - Register NewBaseReg = MRI->createGenericVirtualRegister(PtrTy); - MIRBuilder.buildGEP(NewBaseReg, BaseReg, GepOffsetReg); - BaseReg = NewBaseReg; + BaseReg = MIRBuilder.buildGEP(PtrTy, BaseReg, GepOffsetReg).getReg(0); } } @@ -1119,54 +1115,51 @@ bool IRTranslator::translateGetElementPtr(const User &U, return true; } -bool IRTranslator::translateMemfunc(const CallInst &CI, +bool IRTranslator::translateMemFunc(const CallInst &CI, MachineIRBuilder &MIRBuilder, - unsigned ID) { + Intrinsic::ID ID) { // If the source is undef, then just emit a nop. 
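// Standalone sketch of the address arithmetic that the getelementptr
// translation above emits as G_CONSTANT / G_MUL / G_GEP: the result is the
// base pointer plus the accumulated constant offset plus index * element
// size. All values below are made-up illustration numbers.
#include <cstdint>
#include <cstdio>

int main() {
  const uint64_t Base = 0x1000;        // incoming pointer value
  const uint64_t ConstOffset = 8;      // e.g. offset of a struct field
  const int64_t Idx = 3;               // dynamic index, sign-extended
  const int64_t ElementSize = 16;      // bytes per indexed element
  const uint64_t Addr = Base + ConstOffset + (uint64_t)(Idx * ElementSize);
  std::printf("gep result: 0x%llx\n", (unsigned long long)Addr);
  return 0;
}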
- if (isa(CI.getArgOperand(1))) { - switch (ID) { - case Intrinsic::memmove: - case Intrinsic::memcpy: - case Intrinsic::memset: - return true; - default: - break; - } - } - - LLT SizeTy = getLLTForType(*CI.getArgOperand(2)->getType(), *DL); - Type *DstTy = CI.getArgOperand(0)->getType(); - if (cast(DstTy)->getAddressSpace() != 0 || - SizeTy.getSizeInBits() != DL->getPointerSizeInBits(0)) - return false; + if (isa(CI.getArgOperand(1))) + return true; - SmallVector Args; - for (int i = 0; i < 3; ++i) { - const auto &Arg = CI.getArgOperand(i); - Args.emplace_back(getOrCreateVReg(*Arg), Arg->getType()); + ArrayRef Res; + auto ICall = MIRBuilder.buildIntrinsic(ID, Res, true); + for (auto AI = CI.arg_begin(), AE = CI.arg_end(); std::next(AI) != AE; ++AI) + ICall.addUse(getOrCreateVReg(**AI)); + + unsigned DstAlign = 0, SrcAlign = 0; + unsigned IsVol = + cast(CI.getArgOperand(CI.getNumArgOperands() - 1)) + ->getZExtValue(); + + if (auto *MCI = dyn_cast(&CI)) { + DstAlign = std::max(MCI->getDestAlignment(), 1); + SrcAlign = std::max(MCI->getSourceAlignment(), 1); + } else if (auto *MMI = dyn_cast(&CI)) { + DstAlign = std::max(MMI->getDestAlignment(), 1); + SrcAlign = std::max(MMI->getSourceAlignment(), 1); + } else { + auto *MSI = cast(&CI); + DstAlign = std::max(MSI->getDestAlignment(), 1); } - const char *Callee; - switch (ID) { - case Intrinsic::memmove: - case Intrinsic::memcpy: { - Type *SrcTy = CI.getArgOperand(1)->getType(); - if(cast(SrcTy)->getAddressSpace() != 0) - return false; - Callee = ID == Intrinsic::memcpy ? "memcpy" : "memmove"; - break; - } - case Intrinsic::memset: - Callee = "memset"; - break; - default: - return false; - } + // We need to propagate the tail call flag from the IR inst as an argument. + // Otherwise, we have to pessimize and assume later that we cannot tail call + // any memory intrinsics. + ICall.addImm(CI.isTailCall() ? 1 : 0); - return CLI->lowerCall(MIRBuilder, CI.getCallingConv(), - MachineOperand::CreateES(Callee), - CallLowering::ArgInfo({0}, CI.getType()), Args); + // Create mem operands to store the alignment and volatile info. + auto VolFlag = IsVol ? MachineMemOperand::MOVolatile : MachineMemOperand::MONone; + ICall.addMemOperand(MF->getMachineMemOperand( + MachinePointerInfo(CI.getArgOperand(0)), + MachineMemOperand::MOStore | VolFlag, 1, DstAlign)); + if (ID != Intrinsic::memset) + ICall.addMemOperand(MF->getMachineMemOperand( + MachinePointerInfo(CI.getArgOperand(1)), + MachineMemOperand::MOLoad | VolFlag, 1, SrcAlign)); + + return true; } void IRTranslator::getStackGuard(Register DstReg, @@ -1186,7 +1179,7 @@ void IRTranslator::getStackGuard(Register DstReg, MachineMemOperand::MODereferenceable; MachineMemOperand *MemRef = MF->getMachineMemOperand(MPInfo, Flags, DL->getPointerSizeInBits() / 8, - DL->getPointerABIAlignment(0)); + DL->getPointerABIAlignment(0).value()); MIB.setMemRefs({MemRef}); } @@ -1208,6 +1201,8 @@ unsigned IRTranslator::getSimpleIntrinsicOpcode(Intrinsic::ID ID) { break; case Intrinsic::bswap: return TargetOpcode::G_BSWAP; + case Intrinsic::bitreverse: + return TargetOpcode::G_BITREVERSE; case Intrinsic::ceil: return TargetOpcode::G_FCEIL; case Intrinsic::cos: @@ -1383,16 +1378,17 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID, if (!V) { // Currently the optimizer can produce this; insert an undef to // help debugging. Probably the optimizer should not do this. 
- MIRBuilder.buildIndirectDbgValue(0, DI.getVariable(), DI.getExpression()); + MIRBuilder.buildDirectDbgValue(0, DI.getVariable(), DI.getExpression()); } else if (const auto *CI = dyn_cast(V)) { MIRBuilder.buildConstDbgValue(*CI, DI.getVariable(), DI.getExpression()); } else { - Register Reg = getOrCreateVReg(*V); - // FIXME: This does not handle register-indirect values at offset 0. The - // direct/indirect thing shouldn't really be handled by something as - // implicit as reg+noreg vs reg+imm in the first palce, but it seems - // pretty baked in right now. - MIRBuilder.buildDirectDbgValue(Reg, DI.getVariable(), DI.getExpression()); + for (Register Reg : getOrCreateVRegs(*V)) { + // FIXME: This does not handle register-indirect values at offset 0. The + // direct/indirect thing shouldn't really be handled by something as + // implicit as reg+noreg vs reg+imm in the first place, but it seems + // pretty baked in right now. + MIRBuilder.buildDirectDbgValue(Reg, DI.getVariable(), DI.getExpression()); + } } return true; } @@ -1433,7 +1429,7 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID, case Intrinsic::memcpy: case Intrinsic::memmove: case Intrinsic::memset: - return translateMemfunc(CI, MIRBuilder, ID); + return translateMemFunc(CI, MIRBuilder, ID); case Intrinsic::eh_typeid_for: { GlobalValue *GV = ExtractTypeInfo(CI.getArgOperand(0)); Register Reg = getOrCreateVReg(CI); @@ -1441,18 +1437,12 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID, MIRBuilder.buildConstant(Reg, TypeID); return true; } - case Intrinsic::objectsize: { - // If we don't know by now, we're never going to know. - const ConstantInt *Min = cast(CI.getArgOperand(1)); + case Intrinsic::objectsize: + llvm_unreachable("llvm.objectsize.* should have been lowered already"); - MIRBuilder.buildConstant(getOrCreateVReg(CI), Min->isZero() ? -1ULL : 0); - return true; - } case Intrinsic::is_constant: - // If this wasn't constant-folded away by now, then it's not a - // constant. - MIRBuilder.buildConstant(getOrCreateVReg(CI), 0); - return true; + llvm_unreachable("llvm.is.constant.* should have been lowered already"); + case Intrinsic::stackguard: getStackGuard(getOrCreateVReg(CI), MIRBuilder); return true; @@ -1551,6 +1541,46 @@ bool IRTranslator::translateInlineAsm(const CallInst &CI, return true; } +bool IRTranslator::translateCallSite(const ImmutableCallSite &CS, + MachineIRBuilder &MIRBuilder) { + const Instruction &I = *CS.getInstruction(); + ArrayRef Res = getOrCreateVRegs(I); + + SmallVector, 8> Args; + Register SwiftInVReg = 0; + Register SwiftErrorVReg = 0; + for (auto &Arg : CS.args()) { + if (CLI->supportSwiftError() && isSwiftError(Arg)) { + assert(SwiftInVReg == 0 && "Expected only one swift error argument"); + LLT Ty = getLLTForType(*Arg->getType(), *DL); + SwiftInVReg = MRI->createGenericVirtualRegister(Ty); + MIRBuilder.buildCopy(SwiftInVReg, SwiftError.getOrCreateVRegUseAt( + &I, &MIRBuilder.getMBB(), Arg)); + Args.emplace_back(makeArrayRef(SwiftInVReg)); + SwiftErrorVReg = + SwiftError.getOrCreateVRegDefAt(&I, &MIRBuilder.getMBB(), Arg); + continue; + } + Args.push_back(getOrCreateVRegs(*Arg)); + } + + // We don't set HasCalls on MFI here yet because call lowering may decide to + // optimize into tail calls. Instead, we defer that to selection where a final + // scan is done to check if any instructions are calls. 
+ bool Success = + CLI->lowerCall(MIRBuilder, CS, Res, Args, SwiftErrorVReg, + [&]() { return getOrCreateVReg(*CS.getCalledValue()); }); + + // Check if we just inserted a tail call. + if (Success) { + assert(!HasTailCall && "Can't tail call return twice from block?"); + const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo(); + HasTailCall = TII->isTailCall(*std::prev(MIRBuilder.getInsertPt())); + } + + return Success; +} + bool IRTranslator::translateCall(const User &U, MachineIRBuilder &MIRBuilder) { const CallInst &CI = cast(U); auto TII = MF->getTarget().getIntrinsicInfo(); @@ -1570,34 +1600,8 @@ bool IRTranslator::translateCall(const User &U, MachineIRBuilder &MIRBuilder) { ID = static_cast(TII->getIntrinsicID(F)); } - if (!F || !F->isIntrinsic() || ID == Intrinsic::not_intrinsic) { - ArrayRef Res = getOrCreateVRegs(CI); - - SmallVector, 8> Args; - Register SwiftInVReg = 0; - Register SwiftErrorVReg = 0; - for (auto &Arg: CI.arg_operands()) { - if (CLI->supportSwiftError() && isSwiftError(Arg)) { - assert(SwiftInVReg == 0 && "Expected only one swift error argument"); - LLT Ty = getLLTForType(*Arg->getType(), *DL); - SwiftInVReg = MRI->createGenericVirtualRegister(Ty); - MIRBuilder.buildCopy(SwiftInVReg, SwiftError.getOrCreateVRegUseAt( - &CI, &MIRBuilder.getMBB(), Arg)); - Args.emplace_back(makeArrayRef(SwiftInVReg)); - SwiftErrorVReg = - SwiftError.getOrCreateVRegDefAt(&CI, &MIRBuilder.getMBB(), Arg); - continue; - } - Args.push_back(getOrCreateVRegs(*Arg)); - } - - MF->getFrameInfo().setHasCalls(true); - bool Success = - CLI->lowerCall(MIRBuilder, &CI, Res, Args, SwiftErrorVReg, - [&]() { return getOrCreateVReg(*CI.getCalledValue()); }); - - return Success; - } + if (!F || !F->isIntrinsic() || ID == Intrinsic::not_intrinsic) + return translateCallSite(&CI, MIRBuilder); assert(ID != Intrinsic::not_intrinsic && "unknown intrinsic"); @@ -1615,14 +1619,29 @@ bool IRTranslator::translateCall(const User &U, MachineIRBuilder &MIRBuilder) { if (isa(CI)) MIB->copyIRFlags(CI); - for (auto &Arg : CI.arg_operands()) { + for (auto &Arg : enumerate(CI.arg_operands())) { // Some intrinsics take metadata parameters. Reject them. - if (isa(Arg)) - return false; - ArrayRef VRegs = getOrCreateVRegs(*Arg); - if (VRegs.size() > 1) + if (isa(Arg.value())) return false; - MIB.addUse(VRegs[0]); + + // If this is required to be an immediate, don't materialize it in a + // register. + if (CI.paramHasAttr(Arg.index(), Attribute::ImmArg)) { + if (ConstantInt *CI = dyn_cast(Arg.value())) { + // imm arguments are more convenient than cimm (and realistically + // probably sufficient), so use them. + assert(CI->getBitWidth() <= 64 && + "large intrinsic immediates not handled"); + MIB.addImm(CI->getSExtValue()); + } else { + MIB.addFPImm(cast(Arg.value())); + } + } else { + ArrayRef VRegs = getOrCreateVRegs(*Arg.value()); + if (VRegs.size() > 1) + return false; + MIB.addUse(VRegs[0]); + } } // Add a MachineMemOperand if it is a target mem intrinsic. @@ -1630,13 +1649,14 @@ bool IRTranslator::translateCall(const User &U, MachineIRBuilder &MIRBuilder) { TargetLowering::IntrinsicInfo Info; // TODO: Add a GlobalISel version of getTgtMemIntrinsic. 
if (TLI.getTgtMemIntrinsic(Info, CI, *MF, ID)) { - unsigned Align = Info.align; - if (Align == 0) - Align = DL->getABITypeAlignment(Info.memVT.getTypeForEVT(F->getContext())); + MaybeAlign Align = Info.align; + if (!Align) + Align = MaybeAlign( + DL->getABITypeAlignment(Info.memVT.getTypeForEVT(F->getContext()))); uint64_t Size = Info.memVT.getStoreSize(); - MIB.addMemOperand(MF->getMachineMemOperand(MachinePointerInfo(Info.ptrVal), - Info.flags, Size, Align)); + MIB.addMemOperand(MF->getMachineMemOperand( + MachinePointerInfo(Info.ptrVal), Info.flags, Size, Align->value())); } return true; @@ -1672,30 +1692,7 @@ bool IRTranslator::translateInvoke(const User &U, MCSymbol *BeginSymbol = Context.createTempSymbol(); MIRBuilder.buildInstr(TargetOpcode::EH_LABEL).addSym(BeginSymbol); - ArrayRef Res; - if (!I.getType()->isVoidTy()) - Res = getOrCreateVRegs(I); - SmallVector, 8> Args; - Register SwiftErrorVReg = 0; - Register SwiftInVReg = 0; - for (auto &Arg : I.arg_operands()) { - if (CLI->supportSwiftError() && isSwiftError(Arg)) { - assert(SwiftInVReg == 0 && "Expected only one swift error argument"); - LLT Ty = getLLTForType(*Arg->getType(), *DL); - SwiftInVReg = MRI->createGenericVirtualRegister(Ty); - MIRBuilder.buildCopy(SwiftInVReg, SwiftError.getOrCreateVRegUseAt( - &I, &MIRBuilder.getMBB(), Arg)); - Args.push_back(makeArrayRef(SwiftInVReg)); - SwiftErrorVReg = - SwiftError.getOrCreateVRegDefAt(&I, &MIRBuilder.getMBB(), Arg); - continue; - } - - Args.push_back(getOrCreateVRegs(*Arg)); - } - - if (!CLI->lowerCall(MIRBuilder, &I, Res, Args, SwiftErrorVReg, - [&]() { return getOrCreateVReg(*I.getCalledValue()); })) + if (!translateCallSite(&I, MIRBuilder)) return false; MCSymbol *EndSymbol = Context.createTempSymbol(); @@ -1811,36 +1808,25 @@ bool IRTranslator::translateAlloca(const User &U, Register AllocSize = MRI->createGenericVirtualRegister(IntPtrTy); Register TySize = - getOrCreateVReg(*ConstantInt::get(IntPtrIRTy, -DL->getTypeAllocSize(Ty))); + getOrCreateVReg(*ConstantInt::get(IntPtrIRTy, DL->getTypeAllocSize(Ty))); MIRBuilder.buildMul(AllocSize, NumElts, TySize); - LLT PtrTy = getLLTForType(*AI.getType(), *DL); - auto &TLI = *MF->getSubtarget().getTargetLowering(); - Register SPReg = TLI.getStackPointerRegisterToSaveRestore(); - - Register SPTmp = MRI->createGenericVirtualRegister(PtrTy); - MIRBuilder.buildCopy(SPTmp, SPReg); - - Register AllocTmp = MRI->createGenericVirtualRegister(PtrTy); - MIRBuilder.buildGEP(AllocTmp, SPTmp, AllocSize); - - // Handle alignment. We have to realign if the allocation granule was smaller - // than stack alignment, or the specific alloca requires more than stack - // alignment. unsigned StackAlign = MF->getSubtarget().getFrameLowering()->getStackAlignment(); - Align = std::max(Align, StackAlign); - if (Align > StackAlign || DL->getTypeAllocSize(Ty) % StackAlign != 0) { - // Round the size of the allocation up to the stack alignment size - // by add SA-1 to the size. This doesn't overflow because we're computing - // an address inside an alloca. - Register AlignedAlloc = MRI->createGenericVirtualRegister(PtrTy); - MIRBuilder.buildPtrMask(AlignedAlloc, AllocTmp, Log2_32(Align)); - AllocTmp = AlignedAlloc; - } + if (Align <= StackAlign) + Align = 0; + + // Round the size of the allocation up to the stack alignment size + // by add SA-1 to the size. This doesn't overflow because we're computing + // an address inside an alloca. 
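// Standalone sketch of the rounding performed by the add/and pair that
// follows: adding StackAlign - 1 and masking with ~(StackAlign - 1) rounds a
// dynamic alloca size up to the next multiple of the stack alignment. The
// alignment and sizes are example values.
#include <cstdint>
#include <cstdio>

int main() {
  const uint64_t StackAlign = 16;
  const uint64_t Sizes[] = {1, 16, 23, 32};
  for (uint64_t Size : Sizes) {
    const uint64_t Rounded = (Size + StackAlign - 1) & ~(StackAlign - 1);
    std::printf("size %llu -> %llu\n", (unsigned long long)Size,
                (unsigned long long)Rounded);
  }
  return 0;
}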
+ auto SAMinusOne = MIRBuilder.buildConstant(IntPtrTy, StackAlign - 1); + auto AllocAdd = MIRBuilder.buildAdd(IntPtrTy, AllocSize, SAMinusOne, + MachineInstr::NoUWrap); + auto AlignCst = + MIRBuilder.buildConstant(IntPtrTy, ~(uint64_t)(StackAlign - 1)); + auto AlignedAlloc = MIRBuilder.buildAnd(IntPtrTy, AllocAdd, AlignCst); - MIRBuilder.buildCopy(SPReg, AllocTmp); - MIRBuilder.buildCopy(getOrCreateVReg(AI), AllocTmp); + MIRBuilder.buildDynStackAlloc(getOrCreateVReg(AI), AlignedAlloc, Align); MF->getFrameInfo().CreateVariableSizedObject(Align ? Align : 1, &AI); assert(MF->getFrameInfo().hasVarSizedObjects()); @@ -1926,7 +1912,7 @@ bool IRTranslator::translateShuffleVector(const User &U, .addDef(getOrCreateVReg(U)) .addUse(getOrCreateVReg(*U.getOperand(0))) .addUse(getOrCreateVReg(*U.getOperand(1))) - .addUse(getOrCreateVReg(*U.getOperand(2))); + .addShuffleMask(cast(U.getOperand(2))); return true; } @@ -1991,7 +1977,6 @@ bool IRTranslator::translateAtomicRMW(const User &U, unsigned Opcode = 0; switch (I.getOperation()) { default: - llvm_unreachable("Unknown atomicrmw op"); return false; case AtomicRMWInst::Xchg: Opcode = TargetOpcode::G_ATOMICRMW_XCHG; @@ -2026,6 +2011,12 @@ bool IRTranslator::translateAtomicRMW(const User &U, case AtomicRMWInst::UMin: Opcode = TargetOpcode::G_ATOMICRMW_UMIN; break; + case AtomicRMWInst::FAdd: + Opcode = TargetOpcode::G_ATOMICRMW_FADD; + break; + case AtomicRMWInst::FSub: + Opcode = TargetOpcode::G_ATOMICRMW_FSUB; + break; } MIRBuilder.buildAtomicRMW( @@ -2197,6 +2188,20 @@ void IRTranslator::finalizeFunction() { FuncInfo.clear(); } +/// Returns true if a BasicBlock \p BB within a variadic function contains a +/// variadic musttail call. +static bool checkForMustTailInVarArgFn(bool IsVarArg, const BasicBlock &BB) { + if (!IsVarArg) + return false; + + // Walk the block backwards, because tail calls usually only appear at the end + // of a block. 
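// Simplified model of the reverse block scan implemented just below: walk the
// instructions backwards and stop at the first musttail call. Plain structs
// stand in for IR instructions here.
#include <algorithm>
#include <cstdio>
#include <vector>

int main() {
  struct Inst { bool IsMustTailCall; };
  const std::vector<Inst> Block = {{false}, {false}, {true}, {false}};
  const bool Found =
      std::any_of(Block.rbegin(), Block.rend(),
                  [](const Inst &I) { return I.IsMustTailCall; });
  std::printf("block %s a musttail call\n",
              Found ? "contains" : "does not contain");
  return 0;
}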
+ return std::any_of(BB.rbegin(), BB.rend(), [](const Instruction &I) { + const auto *CI = dyn_cast(&I); + return CI && CI->isMustTailCall(); + }); +} + bool IRTranslator::runOnMachineFunction(MachineFunction &CurMF) { MF = &CurMF; const Function &F = MF->getFunction(); @@ -2212,26 +2217,26 @@ bool IRTranslator::runOnMachineFunction(MachineFunction &CurMF) { : TPC->isGISelCSEEnabled(); if (EnableCSE) { - EntryBuilder = make_unique(CurMF); + EntryBuilder = std::make_unique(CurMF); CSEInfo = &Wrapper.get(TPC->getCSEConfig()); EntryBuilder->setCSEInfo(CSEInfo); - CurBuilder = make_unique(CurMF); + CurBuilder = std::make_unique(CurMF); CurBuilder->setCSEInfo(CSEInfo); } else { - EntryBuilder = make_unique(); - CurBuilder = make_unique(); + EntryBuilder = std::make_unique(); + CurBuilder = std::make_unique(); } CLI = MF->getSubtarget().getCallLowering(); CurBuilder->setMF(*MF); EntryBuilder->setMF(*MF); MRI = &MF->getRegInfo(); DL = &F.getParent()->getDataLayout(); - ORE = llvm::make_unique(&F); + ORE = std::make_unique(&F); FuncInfo.MF = MF; FuncInfo.BPI = nullptr; const auto &TLI = *MF->getSubtarget().getTargetLowering(); const TargetMachine &TM = MF->getTarget(); - SL = make_unique(this, FuncInfo); + SL = std::make_unique(this, FuncInfo); SL->init(TLI, TM, *DL); EnableOpts = TM.getOptLevel() != CodeGenOpt::None && !skipFunction(F); @@ -2258,6 +2263,9 @@ bool IRTranslator::runOnMachineFunction(MachineFunction &CurMF) { SwiftError.setFunction(CurMF); SwiftError.createEntriesInEntryBlock(DbgLoc); + bool IsVarArg = F.isVarArg(); + bool HasMustTailInVarArgFn = false; + // Create all blocks, in IR order, to preserve the layout. for (const BasicBlock &BB: F) { auto *&MBB = BBToMBB[&BB]; @@ -2267,8 +2275,13 @@ bool IRTranslator::runOnMachineFunction(MachineFunction &CurMF) { if (BB.hasAddressTaken()) MBB->setHasAddressTaken(); + + if (!HasMustTailInVarArgFn) + HasMustTailInVarArgFn = checkForMustTailInVarArgFn(IsVarArg, BB); } + MF->getFrameInfo().setHasMustTailInVarArgFunc(HasMustTailInVarArgFn); + // Make our arguments/constants entry block fallthrough to the IR entry block. EntryBB->addSuccessor(&getMBB(F.front())); @@ -2286,18 +2299,6 @@ bool IRTranslator::runOnMachineFunction(MachineFunction &CurMF) { } } - // We don't currently support translating swifterror or swiftself functions. - for (auto &Arg : F.args()) { - if (Arg.hasSwiftSelfAttr()) { - OptimizationRemarkMissed R("gisel-irtranslator", "GISelFailure", - F.getSubprogram(), &F.getEntryBlock()); - R << "unable to lower arguments due to swiftself: " - << ore::NV("Prototype", F.getType()); - reportTranslationError(*MF, *TPC, *ORE, R); - return false; - } - } - if (!CLI->lowerFormalArguments(*EntryBuilder.get(), F, VRegArgs)) { OptimizationRemarkMissed R("gisel-irtranslator", "GISelFailure", F.getSubprogram(), &F.getEntryBlock()); @@ -2322,8 +2323,15 @@ bool IRTranslator::runOnMachineFunction(MachineFunction &CurMF) { // Set the insertion point of all the following translations to // the end of this basic block. CurBuilder->setMBB(MBB); - + HasTailCall = false; for (const Instruction &Inst : *BB) { + // If we translated a tail call in the last step, then we know + // everything after the call is either a return, or something that is + // handled by the call itself. (E.g. a lifetime marker or assume + // intrinsic.) In this case, we should stop translating the block and + // move on. 
+ if (HasTailCall) + break; #ifndef NDEBUG Verifier.setCurrentInst(&Inst); #endif // ifndef NDEBUG diff --git a/lib/CodeGen/GlobalISel/InstructionSelect.cpp b/lib/CodeGen/GlobalISel/InstructionSelect.cpp index 70694fe6b6c8..7c4fd2d140d3 100644 --- a/lib/CodeGen/GlobalISel/InstructionSelect.cpp +++ b/lib/CodeGen/GlobalISel/InstructionSelect.cpp @@ -12,11 +12,14 @@ #include "llvm/CodeGen/GlobalISel/InstructionSelect.h" #include "llvm/ADT/PostOrderIterator.h" #include "llvm/ADT/Twine.h" +#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h" #include "llvm/CodeGen/GlobalISel/InstructionSelector.h" #include "llvm/CodeGen/GlobalISel/LegalizerInfo.h" #include "llvm/CodeGen/GlobalISel/Utils.h" #include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h" +#include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/TargetInstrInfo.h" #include "llvm/CodeGen/TargetLowering.h" #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" @@ -45,6 +48,7 @@ INITIALIZE_PASS_BEGIN(InstructionSelect, DEBUG_TYPE, "Select target instructions out of generic instructions", false, false) INITIALIZE_PASS_DEPENDENCY(TargetPassConfig) +INITIALIZE_PASS_DEPENDENCY(GISelKnownBitsAnalysis) INITIALIZE_PASS_END(InstructionSelect, DEBUG_TYPE, "Select target instructions out of generic instructions", false, false) @@ -53,6 +57,8 @@ InstructionSelect::InstructionSelect() : MachineFunctionPass(ID) { } void InstructionSelect::getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequired(); + AU.addRequired(); + AU.addPreserved(); getSelectionDAGFallbackAnalysisUsage(AU); MachineFunctionPass::getAnalysisUsage(AU); } @@ -64,11 +70,13 @@ bool InstructionSelect::runOnMachineFunction(MachineFunction &MF) { return false; LLVM_DEBUG(dbgs() << "Selecting function: " << MF.getName() << '\n'); + GISelKnownBits &KB = getAnalysis().get(MF); const TargetPassConfig &TPC = getAnalysis(); - const InstructionSelector *ISel = MF.getSubtarget().getInstructionSelector(); + InstructionSelector *ISel = MF.getSubtarget().getInstructionSelector(); CodeGenCoverage CoverageInfo; assert(ISel && "Cannot work without InstructionSelector"); + ISel->setupMF(MF, KB, CoverageInfo); // An optimization remark emitter. Used to report failures. MachineOptimizationRemarkEmitter MORE(MF, /*MBFI=*/nullptr); @@ -124,7 +132,7 @@ bool InstructionSelect::runOnMachineFunction(MachineFunction &MF) { continue; } - if (!ISel->select(MI, CoverageInfo)) { + if (!ISel->select(MI)) { // FIXME: It would be nice to dump all inserted instructions. It's // not obvious how, esp. considering select() can insert after MI. reportGISelFailure(MF, TPC, MORE, "gisel-select", "cannot select", MI); @@ -159,10 +167,10 @@ bool InstructionSelect::runOnMachineFunction(MachineFunction &MF) { --MII; if (MI.getOpcode() != TargetOpcode::COPY) continue; - unsigned SrcReg = MI.getOperand(1).getReg(); - unsigned DstReg = MI.getOperand(0).getReg(); - if (TargetRegisterInfo::isVirtualRegister(SrcReg) && - TargetRegisterInfo::isVirtualRegister(DstReg)) { + Register SrcReg = MI.getOperand(1).getReg(); + Register DstReg = MI.getOperand(0).getReg(); + if (Register::isVirtualRegister(SrcReg) && + Register::isVirtualRegister(DstReg)) { auto SrcRC = MRI.getRegClass(SrcReg); auto DstRC = MRI.getRegClass(DstReg); if (SrcRC == DstRC) { @@ -179,7 +187,7 @@ bool InstructionSelect::runOnMachineFunction(MachineFunction &MF) { // that the size of the now-constrained vreg is unchanged and that it has a // register class. 
for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) { - unsigned VReg = TargetRegisterInfo::index2VirtReg(I); + unsigned VReg = Register::index2VirtReg(I); MachineInstr *MI = nullptr; if (!MRI.def_empty(VReg)) @@ -217,6 +225,22 @@ bool InstructionSelect::runOnMachineFunction(MachineFunction &MF) { auto &TLI = *MF.getSubtarget().getTargetLowering(); TLI.finalizeLowering(MF); + // Determine if there are any calls in this machine function. Ported from + // SelectionDAG. + MachineFrameInfo &MFI = MF.getFrameInfo(); + for (const auto &MBB : MF) { + if (MFI.hasCalls() && MF.hasInlineAsm()) + break; + + for (const auto &MI : MBB) { + if ((MI.isCall() && !MI.isReturn()) || MI.isStackAligningInlineAsm()) + MFI.setHasCalls(true); + if (MI.isInlineAsm()) + MF.setHasInlineAsm(true); + } + } + + LLVM_DEBUG({ dbgs() << "Rules covered by selecting function: " << MF.getName() << ":"; for (auto RuleID : CoverageInfo.covered()) diff --git a/lib/CodeGen/GlobalISel/InstructionSelector.cpp b/lib/CodeGen/GlobalISel/InstructionSelector.cpp index 2ad35b3a72c9..28143b30d4e8 100644 --- a/lib/CodeGen/GlobalISel/InstructionSelector.cpp +++ b/lib/CodeGen/GlobalISel/InstructionSelector.cpp @@ -79,5 +79,5 @@ bool InstructionSelector::isObviouslySafeToFold(MachineInstr &MI, return true; return !MI.mayLoadOrStore() && !MI.mayRaiseFPException() && - !MI.hasUnmodeledSideEffects() && empty(MI.implicit_operands()); + !MI.hasUnmodeledSideEffects() && MI.implicit_operands().empty(); } diff --git a/lib/CodeGen/GlobalISel/Legalizer.cpp b/lib/CodeGen/GlobalISel/Legalizer.cpp index b5b26bff34bb..1593e21fe07e 100644 --- a/lib/CodeGen/GlobalISel/Legalizer.cpp +++ b/lib/CodeGen/GlobalISel/Legalizer.cpp @@ -184,11 +184,11 @@ bool Legalizer::runOnMachineFunction(MachineFunction &MF) { : TPC.isGISelCSEEnabled(); if (EnableCSE) { - MIRBuilder = make_unique(); + MIRBuilder = std::make_unique(); CSEInfo = &Wrapper.get(TPC.getCSEConfig()); MIRBuilder->setCSEInfo(CSEInfo); } else - MIRBuilder = make_unique(); + MIRBuilder = std::make_unique(); // This observer keeps the worklist updated. LegalizerWorkListManager WorkListObserver(InstList, ArtifactList); // We want both WorkListObserver as well as CSEInfo to observe all changes. @@ -206,8 +206,16 @@ bool Legalizer::runOnMachineFunction(MachineFunction &MF) { auto RemoveDeadInstFromLists = [&WrapperObserver](MachineInstr *DeadMI) { WrapperObserver.erasingInstr(*DeadMI); }; + auto stopLegalizing = [&](MachineInstr &MI) { + Helper.MIRBuilder.stopObservingChanges(); + reportGISelFailure(MF, TPC, MORE, "gisel-legalize", + "unable to legalize instruction", MI); + }; bool Changed = false; + SmallVector RetryList; do { + assert(RetryList.empty() && "Expected no instructions in RetryList"); + unsigned NumArtifacts = ArtifactList.size(); while (!InstList.empty()) { MachineInstr &MI = *InstList.pop_back_val(); assert(isPreISelGenericOpcode(MI.getOpcode()) && "Expecting generic opcode"); @@ -222,14 +230,31 @@ bool Legalizer::runOnMachineFunction(MachineFunction &MF) { // Error out if we couldn't legalize this instruction. We may want to // fall back to DAG ISel instead in the future. if (Res == LegalizerHelper::UnableToLegalize) { - Helper.MIRBuilder.stopObservingChanges(); - reportGISelFailure(MF, TPC, MORE, "gisel-legalize", - "unable to legalize instruction", MI); + // Move illegal artifacts to RetryList instead of aborting because + // legalizing InstList may generate artifacts that allow + // ArtifactCombiner to combine away them. 
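// Simplified control-flow model (plain C++, not MIR) of the retry policy
// introduced above: artifacts that cannot be legalized yet are parked on a
// retry list; if the main loop produced new artifacts they are re-queued for
// another combine attempt, otherwise legalization gives up on the first
// parked instruction. Ids and flags are illustrative.
#include <cstdio>
#include <vector>

struct Item { int Id; bool LegalizableNow; };

int main() {
  const std::vector<Item> InstList = {{0, true}, {1, false}};
  std::vector<Item> ArtifactList, RetryList;
  const unsigned NumArtifacts = ArtifactList.size();
  for (const Item &I : InstList) {
    if (!I.LegalizableNow) { RetryList.push_back(I); continue; }
    ArtifactList.push_back({100 + I.Id, true}); // legalizing may emit artifacts
  }
  if (!RetryList.empty()) {
    if (ArtifactList.size() > NumArtifacts) {
      for (const Item &I : RetryList)
        ArtifactList.push_back(I);              // new artifacts: try again
    } else {
      std::printf("unable to legalize instruction %d\n", RetryList.front().Id);
      return 1;
    }
  }
  std::printf("%zu instructions queued for artifact combining\n",
              ArtifactList.size());
  return 0;
}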
+ if (isArtifact(MI)) { + RetryList.push_back(&MI); + continue; + } + stopLegalizing(MI); return false; } WorkListObserver.printNewInstrs(); Changed |= Res == LegalizerHelper::Legalized; } + // Try to combine the instructions in RetryList again if there + // are new artifacts. If not, stop legalizing. + if (!RetryList.empty()) { + if (ArtifactList.size() > NumArtifacts) { + while (!RetryList.empty()) + ArtifactList.insert(RetryList.pop_back_val()); + } else { + MachineInstr *MI = *RetryList.begin(); + stopLegalizing(*MI); + return false; + } + } while (!ArtifactList.empty()) { MachineInstr &MI = *ArtifactList.pop_back_val(); assert(isPreISelGenericOpcode(MI.getOpcode()) && "Expecting generic opcode"); diff --git a/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/lib/CodeGen/GlobalISel/LegalizerHelper.cpp index f5cf7fc9bd9b..21512e543878 100644 --- a/lib/CodeGen/GlobalISel/LegalizerHelper.cpp +++ b/lib/CodeGen/GlobalISel/LegalizerHelper.cpp @@ -17,6 +17,7 @@ #include "llvm/CodeGen/GlobalISel/GISelChangeObserver.h" #include "llvm/CodeGen/GlobalISel/LegalizerInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/TargetFrameLowering.h" #include "llvm/CodeGen/TargetInstrInfo.h" #include "llvm/CodeGen/TargetLowering.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" @@ -171,6 +172,26 @@ bool LegalizerHelper::extractParts(Register Reg, LLT RegTy, return true; } +static LLT getGCDType(LLT OrigTy, LLT TargetTy) { + if (OrigTy.isVector() && TargetTy.isVector()) { + assert(OrigTy.getElementType() == TargetTy.getElementType()); + int GCD = greatestCommonDivisor(OrigTy.getNumElements(), + TargetTy.getNumElements()); + return LLT::scalarOrVector(GCD, OrigTy.getElementType()); + } + + if (OrigTy.isVector() && !TargetTy.isVector()) { + assert(OrigTy.getElementType() == TargetTy); + return TargetTy; + } + + assert(!OrigTy.isVector() && !TargetTy.isVector()); + + int GCD = greatestCommonDivisor(OrigTy.getSizeInBits(), + TargetTy.getSizeInBits()); + return LLT::scalar(GCD); +} + void LegalizerHelper::insertParts(Register DstReg, LLT ResultTy, LLT PartTy, ArrayRef PartRegs, @@ -219,11 +240,29 @@ void LegalizerHelper::insertParts(Register DstReg, static RTLIB::Libcall getRTLibDesc(unsigned Opcode, unsigned Size) { switch (Opcode) { case TargetOpcode::G_SDIV: - assert((Size == 32 || Size == 64) && "Unsupported size"); - return Size == 64 ? RTLIB::SDIV_I64 : RTLIB::SDIV_I32; + assert((Size == 32 || Size == 64 || Size == 128) && "Unsupported size"); + switch (Size) { + case 32: + return RTLIB::SDIV_I32; + case 64: + return RTLIB::SDIV_I64; + case 128: + return RTLIB::SDIV_I128; + default: + llvm_unreachable("unexpected size"); + } case TargetOpcode::G_UDIV: - assert((Size == 32 || Size == 64) && "Unsupported size"); - return Size == 64 ? RTLIB::UDIV_I64 : RTLIB::UDIV_I32; + assert((Size == 32 || Size == 64 || Size == 128) && "Unsupported size"); + switch (Size) { + case 32: + return RTLIB::UDIV_I32; + case 64: + return RTLIB::UDIV_I64; + case 128: + return RTLIB::UDIV_I128; + default: + llvm_unreachable("unexpected size"); + } case TargetOpcode::G_SREM: assert((Size == 32 || Size == 64) && "Unsupported size"); return Size == 64 ? RTLIB::SREM_I64 : RTLIB::SREM_I32; @@ -288,6 +327,35 @@ static RTLIB::Libcall getRTLibDesc(unsigned Opcode, unsigned Size) { llvm_unreachable("Unknown libcall function"); } +/// True if an instruction is in tail position in its caller. Intended for +/// legalizing libcalls as tail calls when possible. 
+static bool isLibCallInTailPosition(MachineInstr &MI) { + const Function &F = MI.getParent()->getParent()->getFunction(); + + // Conservatively require the attributes of the call to match those of + // the return. Ignore NoAlias and NonNull because they don't affect the + // call sequence. + AttributeList CallerAttrs = F.getAttributes(); + if (AttrBuilder(CallerAttrs, AttributeList::ReturnIndex) + .removeAttribute(Attribute::NoAlias) + .removeAttribute(Attribute::NonNull) + .hasAttributes()) + return false; + + // It's not safe to eliminate the sign / zero extension of the return value. + if (CallerAttrs.hasAttribute(AttributeList::ReturnIndex, Attribute::ZExt) || + CallerAttrs.hasAttribute(AttributeList::ReturnIndex, Attribute::SExt)) + return false; + + // Only tail call if the following instruction is a standard return. + auto &TII = *MI.getMF()->getSubtarget().getInstrInfo(); + MachineInstr *Next = MI.getNextNode(); + if (!Next || TII.isTailCall(*Next) || !Next->isReturn()) + return false; + + return true; +} + LegalizerHelper::LegalizeResult llvm::createLibcall(MachineIRBuilder &MIRBuilder, RTLIB::Libcall Libcall, const CallLowering::ArgInfo &Result, @@ -296,9 +364,12 @@ llvm::createLibcall(MachineIRBuilder &MIRBuilder, RTLIB::Libcall Libcall, auto &TLI = *MIRBuilder.getMF().getSubtarget().getTargetLowering(); const char *Name = TLI.getLibcallName(Libcall); - MIRBuilder.getMF().getFrameInfo().setHasCalls(true); - if (!CLI.lowerCall(MIRBuilder, TLI.getLibcallCallingConv(Libcall), - MachineOperand::CreateES(Name), Result, Args)) + CallLowering::CallLoweringInfo Info; + Info.CallConv = TLI.getLibcallCallingConv(Libcall); + Info.Callee = MachineOperand::CreateES(Name); + Info.OrigRet = Result; + std::copy(Args.begin(), Args.end(), std::back_inserter(Info.OrigArgs)); + if (!CLI.lowerCall(MIRBuilder, Info)) return LegalizerHelper::UnableToLegalize; return LegalizerHelper::Legalized; @@ -317,6 +388,74 @@ simpleLibcall(MachineInstr &MI, MachineIRBuilder &MIRBuilder, unsigned Size, Args); } +LegalizerHelper::LegalizeResult +llvm::createMemLibcall(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI, + MachineInstr &MI) { + assert(MI.getOpcode() == TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS); + auto &Ctx = MIRBuilder.getMF().getFunction().getContext(); + + SmallVector Args; + // Add all the args, except for the last which is an imm denoting 'tail'. + for (unsigned i = 1; i < MI.getNumOperands() - 1; i++) { + Register Reg = MI.getOperand(i).getReg(); + + // Need to derive an IR type for call lowering.
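// Simplified model of the type mapping described above and performed by the
// code that follows: pointer-typed operands become i8* in their address
// space, anything else becomes an integer type of the same bit width. The
// tiny struct below only models that decision; it is not LLVM's LLT or Type.
#include <cstdio>
#include <string>

struct SimpleLLT { bool IsPointer; unsigned SizeInBits; unsigned AddrSpace; };

static std::string irTypeFor(const SimpleLLT &T) {
  if (T.IsPointer)
    return "i8 addrspace(" + std::to_string(T.AddrSpace) + ")*";
  return "i" + std::to_string(T.SizeInBits);
}

int main() {
  std::printf("%s\n", irTypeFor({true, 64, 0}).c_str());  // a pointer operand
  std::printf("%s\n", irTypeFor({false, 64, 0}).c_str()); // the length operand
  return 0;
}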
+ LLT OpLLT = MRI.getType(Reg); + Type *OpTy = nullptr; + if (OpLLT.isPointer()) + OpTy = Type::getInt8PtrTy(Ctx, OpLLT.getAddressSpace()); + else + OpTy = IntegerType::get(Ctx, OpLLT.getSizeInBits()); + Args.push_back({Reg, OpTy}); + } + + auto &CLI = *MIRBuilder.getMF().getSubtarget().getCallLowering(); + auto &TLI = *MIRBuilder.getMF().getSubtarget().getTargetLowering(); + Intrinsic::ID ID = MI.getOperand(0).getIntrinsicID(); + RTLIB::Libcall RTLibcall; + switch (ID) { + case Intrinsic::memcpy: + RTLibcall = RTLIB::MEMCPY; + break; + case Intrinsic::memset: + RTLibcall = RTLIB::MEMSET; + break; + case Intrinsic::memmove: + RTLibcall = RTLIB::MEMMOVE; + break; + default: + return LegalizerHelper::UnableToLegalize; + } + const char *Name = TLI.getLibcallName(RTLibcall); + + MIRBuilder.setInstr(MI); + + CallLowering::CallLoweringInfo Info; + Info.CallConv = TLI.getLibcallCallingConv(RTLibcall); + Info.Callee = MachineOperand::CreateES(Name); + Info.OrigRet = CallLowering::ArgInfo({0}, Type::getVoidTy(Ctx)); + Info.IsTailCall = MI.getOperand(MI.getNumOperands() - 1).getImm() == 1 && + isLibCallInTailPosition(MI); + + std::copy(Args.begin(), Args.end(), std::back_inserter(Info.OrigArgs)); + if (!CLI.lowerCall(MIRBuilder, Info)) + return LegalizerHelper::UnableToLegalize; + + if (Info.LoweredTailCall) { + assert(Info.IsTailCall && "Lowered tail call when it wasn't a tail call?"); + // We must have a return following the call to get past + // isLibCallInTailPosition. + assert(MI.getNextNode() && MI.getNextNode()->isReturn() && + "Expected instr following MI to be a return?"); + + // We lowered a tail call, so the call is now the return from the block. + // Delete the old return. + MI.getNextNode()->eraseFromParent(); + } + + return LegalizerHelper::Legalized; +} + static RTLIB::Libcall getConvRTLibDesc(unsigned Opcode, Type *ToType, Type *FromType) { auto ToMVT = MVT::getVT(ToType); @@ -518,6 +657,65 @@ LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI, MI.eraseFromParent(); return Legalized; } + case TargetOpcode::G_SEXT: { + if (TypeIdx != 0) + return UnableToLegalize; + + Register SrcReg = MI.getOperand(1).getReg(); + LLT SrcTy = MRI.getType(SrcReg); + + // FIXME: support the general case where the requested NarrowTy may not be + // the same as the source type. E.g. s128 = sext(s32) + if ((SrcTy.getSizeInBits() != SizeOp0 / 2) || + SrcTy.getSizeInBits() != NarrowTy.getSizeInBits()) { + LLVM_DEBUG(dbgs() << "Can't narrow sext to type " << NarrowTy << "\n"); + return UnableToLegalize; + } + + // Shift the sign bit of the low register through the high register. + auto ShiftAmt = + MIRBuilder.buildConstant(LLT::scalar(64), NarrowTy.getSizeInBits() - 1); + auto Shift = MIRBuilder.buildAShr(NarrowTy, SrcReg, ShiftAmt); + MIRBuilder.buildMerge(MI.getOperand(0).getReg(), {SrcReg, Shift.getReg(0)}); + MI.eraseFromParent(); + return Legalized; + } + case TargetOpcode::G_ZEXT: { + if (TypeIdx != 0) + return UnableToLegalize; + + LLT SrcTy = MRI.getType(MI.getOperand(1).getReg()); + uint64_t SizeOp1 = SrcTy.getSizeInBits(); + if (SizeOp0 % SizeOp1 != 0) + return UnableToLegalize; + + // Generate a merge where the bottom bits are taken from the source, and + // zero everything else. 
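// Standalone model of the merge described above and built by the buildMerge
// call that follows: a zero-extension whose destination width is a multiple
// of the source width becomes the source part followed by all-zero parts
// (modelled here with 32-bit words and an example value).
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  const uint32_t Src = 0xDEADBEEF;
  const unsigned NumParts = 4;            // e.g. s32 source, s128 destination
  std::vector<uint32_t> Parts = {Src};    // bottom bits come from the source
  for (unsigned P = 1; P < NumParts; ++P)
    Parts.push_back(0);                   // everything else is zero
  for (unsigned P = NumParts; P-- > 0;)   // print as one wide value, MSB first
    std::printf("%08x", Parts[P]);
  std::printf("\n");
  return 0;
}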
+ Register ZeroReg = MIRBuilder.buildConstant(SrcTy, 0).getReg(0); + unsigned NumParts = SizeOp0 / SizeOp1; + SmallVector Srcs = {MI.getOperand(1).getReg()}; + for (unsigned Part = 1; Part < NumParts; ++Part) + Srcs.push_back(ZeroReg); + MIRBuilder.buildMerge(MI.getOperand(0).getReg(), Srcs); + MI.eraseFromParent(); + return Legalized; + } + case TargetOpcode::G_TRUNC: { + if (TypeIdx != 1) + return UnableToLegalize; + + uint64_t SizeOp1 = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits(); + if (NarrowTy.getSizeInBits() * 2 != SizeOp1) { + LLVM_DEBUG(dbgs() << "Can't narrow trunc to type " << NarrowTy << "\n"); + return UnableToLegalize; + } + + auto Unmerge = MIRBuilder.buildUnmerge(NarrowTy, MI.getOperand(1).getReg()); + MIRBuilder.buildCopy(MI.getOperand(0).getReg(), Unmerge.getReg(0)); + MI.eraseFromParent(); + return Legalized; + } + case TargetOpcode::G_ADD: { // FIXME: add support for when SizeOp0 isn't an exact multiple of // NarrowSize. @@ -530,15 +728,17 @@ LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI, extractParts(MI.getOperand(1).getReg(), NarrowTy, NumParts, Src1Regs); extractParts(MI.getOperand(2).getReg(), NarrowTy, NumParts, Src2Regs); - Register CarryIn = MRI.createGenericVirtualRegister(LLT::scalar(1)); - MIRBuilder.buildConstant(CarryIn, 0); - + Register CarryIn; for (int i = 0; i < NumParts; ++i) { Register DstReg = MRI.createGenericVirtualRegister(NarrowTy); Register CarryOut = MRI.createGenericVirtualRegister(LLT::scalar(1)); - MIRBuilder.buildUAdde(DstReg, CarryOut, Src1Regs[i], - Src2Regs[i], CarryIn); + if (i == 0) + MIRBuilder.buildUAddo(DstReg, CarryOut, Src1Regs[i], Src2Regs[i]); + else { + MIRBuilder.buildUAdde(DstReg, CarryOut, Src1Regs[i], + Src2Regs[i], CarryIn); + } DstRegs.push_back(DstReg); CarryIn = CarryOut; @@ -730,7 +930,7 @@ LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI, for (unsigned j = 1; j < MI.getNumOperands(); j += 2) MIB.addUse(SrcRegs[j / 2][i]).add(MI.getOperand(j + 1)); } - MIRBuilder.setInsertPt(MBB, --MBB.getFirstNonPHI()); + MIRBuilder.setInsertPt(MBB, MBB.getFirstNonPHI()); MIRBuilder.buildMerge(MI.getOperand(0).getReg(), DstRegs); Observer.changedInstr(MI); MI.eraseFromParent(); @@ -763,6 +963,7 @@ LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI, CmpInst::Predicate Pred = static_cast(MI.getOperand(1).getPredicate()); + LLT ResTy = MRI.getType(MI.getOperand(0).getReg()); if (Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE) { MachineInstrBuilder XorL = MIRBuilder.buildXor(NarrowTy, LHSL, RHSL); @@ -771,18 +972,109 @@ LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI, MachineInstrBuilder Zero = MIRBuilder.buildConstant(NarrowTy, 0); MIRBuilder.buildICmp(Pred, MI.getOperand(0).getReg(), Or, Zero); } else { - const LLT s1 = LLT::scalar(1); - MachineInstrBuilder CmpH = MIRBuilder.buildICmp(Pred, s1, LHSH, RHSH); + MachineInstrBuilder CmpH = MIRBuilder.buildICmp(Pred, ResTy, LHSH, RHSH); MachineInstrBuilder CmpHEQ = - MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_EQ, s1, LHSH, RHSH); + MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_EQ, ResTy, LHSH, RHSH); MachineInstrBuilder CmpLU = MIRBuilder.buildICmp( - ICmpInst::getUnsignedPredicate(Pred), s1, LHSL, RHSL); + ICmpInst::getUnsignedPredicate(Pred), ResTy, LHSL, RHSL); MIRBuilder.buildSelect(MI.getOperand(0).getReg(), CmpHEQ, CmpLU, CmpH); } Observer.changedInstr(MI); MI.eraseFromParent(); return Legalized; } + case TargetOpcode::G_SEXT_INREG: { + if (TypeIdx 
!= 0) + return UnableToLegalize; + + if (!MI.getOperand(2).isImm()) + return UnableToLegalize; + int64_t SizeInBits = MI.getOperand(2).getImm(); + + // So long as the new type has more bits than the bits we're extending we + // don't need to break it apart. + if (NarrowTy.getScalarSizeInBits() >= SizeInBits) { + Observer.changingInstr(MI); + // We don't lose any non-extension bits by truncating the src and + // sign-extending the dst. + MachineOperand &MO1 = MI.getOperand(1); + auto TruncMIB = MIRBuilder.buildTrunc(NarrowTy, MO1.getReg()); + MO1.setReg(TruncMIB->getOperand(0).getReg()); + + MachineOperand &MO2 = MI.getOperand(0); + Register DstExt = MRI.createGenericVirtualRegister(NarrowTy); + MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt()); + MIRBuilder.buildInstr(TargetOpcode::G_SEXT, {MO2.getReg()}, {DstExt}); + MO2.setReg(DstExt); + Observer.changedInstr(MI); + return Legalized; + } + + // Break it apart. Components below the extension point are unmodified. The + // component containing the extension point becomes a narrower SEXT_INREG. + // Components above it are ashr'd from the component containing the + // extension point. + if (SizeOp0 % NarrowSize != 0) + return UnableToLegalize; + int NumParts = SizeOp0 / NarrowSize; + + // List the registers where the destination will be scattered. + SmallVector DstRegs; + // List the registers where the source will be split. + SmallVector SrcRegs; + + // Create all the temporary registers. + for (int i = 0; i < NumParts; ++i) { + Register SrcReg = MRI.createGenericVirtualRegister(NarrowTy); + + SrcRegs.push_back(SrcReg); + } + + // Explode the big arguments into smaller chunks. + MIRBuilder.buildUnmerge(SrcRegs, MI.getOperand(1).getReg()); + + Register AshrCstReg = + MIRBuilder.buildConstant(NarrowTy, NarrowTy.getScalarSizeInBits() - 1) + ->getOperand(0) + .getReg(); + Register FullExtensionReg = 0; + Register PartialExtensionReg = 0; + + // Do the operation on each small part. + for (int i = 0; i < NumParts; ++i) { + if ((i + 1) * NarrowTy.getScalarSizeInBits() < SizeInBits) + DstRegs.push_back(SrcRegs[i]); + else if (i * NarrowTy.getScalarSizeInBits() > SizeInBits) { + assert(PartialExtensionReg && + "Expected to visit partial extension before full"); + if (FullExtensionReg) { + DstRegs.push_back(FullExtensionReg); + continue; + } + DstRegs.push_back(MIRBuilder + .buildInstr(TargetOpcode::G_ASHR, {NarrowTy}, + {PartialExtensionReg, AshrCstReg}) + ->getOperand(0) + .getReg()); + FullExtensionReg = DstRegs.back(); + } else { + DstRegs.push_back( + MIRBuilder + .buildInstr( + TargetOpcode::G_SEXT_INREG, {NarrowTy}, + {SrcRegs[i], SizeInBits % NarrowTy.getScalarSizeInBits()}) + ->getOperand(0) + .getReg()); + PartialExtensionReg = DstRegs.back(); + } + } + + // Gather the destination registers into the final destination. + Register DstReg = MI.getOperand(0).getReg(); + MIRBuilder.buildMerge(DstReg, DstRegs); + MI.eraseFromParent(); + return Legalized; + } } } @@ -892,7 +1184,7 @@ LegalizerHelper::widenScalarMergeValues(MachineInstr &MI, unsigned TypeIdx, auto ZextInput = MIRBuilder.buildZExt(WideTy, SrcReg); - Register NextResult = I + 1 == NumOps && WideSize == DstSize ? DstReg : + Register NextResult = I + 1 == NumOps && WideTy == DstTy ? 
DstReg : MRI.createGenericVirtualRegister(WideTy); auto ShiftAmt = MIRBuilder.buildConstant(WideTy, Offset); @@ -903,6 +1195,8 @@ LegalizerHelper::widenScalarMergeValues(MachineInstr &MI, unsigned TypeIdx, if (WideSize > DstSize) MIRBuilder.buildTrunc(DstReg, ResultReg); + else if (DstTy.isPointer()) + MIRBuilder.buildIntToPtr(DstReg, ResultReg); MI.eraseFromParent(); return Legalized; @@ -1218,6 +1512,24 @@ LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) { Observer.changedInstr(MI); return Legalized; } + case TargetOpcode::G_BITREVERSE: { + Observer.changingInstr(MI); + + Register DstReg = MI.getOperand(0).getReg(); + LLT Ty = MRI.getType(DstReg); + unsigned DiffBits = WideTy.getScalarSizeInBits() - Ty.getScalarSizeInBits(); + + Register DstExt = MRI.createGenericVirtualRegister(WideTy); + widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT); + MI.getOperand(0).setReg(DstExt); + MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt()); + + auto ShiftAmt = MIRBuilder.buildConstant(WideTy, DiffBits); + auto Shift = MIRBuilder.buildLShr(WideTy, DstExt, ShiftAmt); + MIRBuilder.buildTrunc(DstReg, Shift); + Observer.changedInstr(MI); + return Legalized; + } case TargetOpcode::G_ADD: case TargetOpcode::G_AND: case TargetOpcode::G_MUL: @@ -1310,13 +1622,15 @@ LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) { case TargetOpcode::G_FPTOSI: case TargetOpcode::G_FPTOUI: - if (TypeIdx != 0) - return UnableToLegalize; Observer.changingInstr(MI); - widenScalarDst(MI, WideTy); + + if (TypeIdx == 0) + widenScalarDst(MI, WideTy); + else + widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_FPEXT); + Observer.changedInstr(MI); return Legalized; - case TargetOpcode::G_SITOFP: if (TypeIdx != 1) return UnableToLegalize; @@ -1483,6 +1797,7 @@ LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) { case TargetOpcode::G_FMUL: case TargetOpcode::G_FSUB: case TargetOpcode::G_FMA: + case TargetOpcode::G_FMAD: case TargetOpcode::G_FNEG: case TargetOpcode::G_FABS: case TargetOpcode::G_FCANONICALIZE: @@ -1553,6 +1868,15 @@ LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) { Observer.changedInstr(MI); return Legalized; } + case TargetOpcode::G_SEXT_INREG: + if (TypeIdx != 0) + return UnableToLegalize; + + Observer.changingInstr(MI); + widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT); + widenScalarDst(MI, WideTy, 0, TargetOpcode::G_TRUNC); + Observer.changedInstr(MI); + return Legalized; } } @@ -1579,6 +1903,9 @@ LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT Ty) { MI.eraseFromParent(); return Legalized; } + case TargetOpcode::G_SADDO: + case TargetOpcode::G_SSUBO: + return lowerSADDO_SSUBO(MI); case TargetOpcode::G_SMULO: case TargetOpcode::G_UMULO: { // Generate G_UMULH/G_SMULH to check for overflow and a normal G_MUL for the @@ -1669,6 +1996,8 @@ LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT Ty) { MI.eraseFromParent(); return Legalized; } + case TargetOpcode::G_FMAD: + return lowerFMad(MI); case TargetOpcode::G_ATOMIC_CMPXCHG_WITH_SUCCESS: { Register OldValRes = MI.getOperand(0).getReg(); Register SuccessRes = MI.getOperand(1).getReg(); @@ -1690,11 +2019,57 @@ LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT Ty) { LLT DstTy = MRI.getType(DstReg); auto &MMO = **MI.memoperands_begin(); - if (DstTy.getSizeInBits() == MMO.getSize() /* in bytes */ * 8) { - // In the case of G_LOAD, this was a non-extending load already and we're - // about to lower to the 
same instruction. - if (MI.getOpcode() == TargetOpcode::G_LOAD) + if (DstTy.getSizeInBits() == MMO.getSizeInBits()) { + if (MI.getOpcode() == TargetOpcode::G_LOAD) { + // This load needs splitting into power of 2 sized loads. + if (DstTy.isVector()) return UnableToLegalize; + if (isPowerOf2_32(DstTy.getSizeInBits())) + return UnableToLegalize; // Don't know what we're being asked to do. + + // Our strategy here is to generate anyextending loads for the smaller + // types up to next power-2 result type, and then combine the two larger + // result values together, before truncating back down to the non-pow-2 + // type. + // E.g. v1 = i24 load => + // v2 = i32 load (2 byte) + // v3 = i32 load (1 byte) + // v4 = i32 shl v3, 16 + // v5 = i32 or v4, v2 + // v1 = i24 trunc v5 + // By doing this we generate the correct truncate which should get + // combined away as an artifact with a matching extend. + uint64_t LargeSplitSize = PowerOf2Floor(DstTy.getSizeInBits()); + uint64_t SmallSplitSize = DstTy.getSizeInBits() - LargeSplitSize; + + MachineFunction &MF = MIRBuilder.getMF(); + MachineMemOperand *LargeMMO = + MF.getMachineMemOperand(&MMO, 0, LargeSplitSize / 8); + MachineMemOperand *SmallMMO = MF.getMachineMemOperand( + &MMO, LargeSplitSize / 8, SmallSplitSize / 8); + + LLT PtrTy = MRI.getType(PtrReg); + unsigned AnyExtSize = NextPowerOf2(DstTy.getSizeInBits()); + LLT AnyExtTy = LLT::scalar(AnyExtSize); + Register LargeLdReg = MRI.createGenericVirtualRegister(AnyExtTy); + Register SmallLdReg = MRI.createGenericVirtualRegister(AnyExtTy); + auto LargeLoad = + MIRBuilder.buildLoad(LargeLdReg, PtrReg, *LargeMMO); + + auto OffsetCst = + MIRBuilder.buildConstant(LLT::scalar(64), LargeSplitSize / 8); + Register GEPReg = MRI.createGenericVirtualRegister(PtrTy); + auto SmallPtr = MIRBuilder.buildGEP(GEPReg, PtrReg, OffsetCst.getReg(0)); + auto SmallLoad = MIRBuilder.buildLoad(SmallLdReg, SmallPtr.getReg(0), + *SmallMMO); + + auto ShiftAmt = MIRBuilder.buildConstant(AnyExtTy, LargeSplitSize); + auto Shift = MIRBuilder.buildShl(AnyExtTy, SmallLoad, ShiftAmt); + auto Or = MIRBuilder.buildOr(AnyExtTy, Shift, LargeLoad); + MIRBuilder.buildTrunc(DstReg, {Or.getReg(0)}); + MI.eraseFromParent(); + return Legalized; + } MIRBuilder.buildLoad(DstReg, PtrReg, MMO); MI.eraseFromParent(); return Legalized; @@ -1723,6 +2098,51 @@ LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT Ty) { return UnableToLegalize; } + case TargetOpcode::G_STORE: { + // Lower a non-power of 2 store into multiple pow-2 stores. + // E.g. split an i24 store into an i16 store + i8 store. + // We do this by first extending the stored value to the next largest power + // of 2 type, and then using truncating stores to store the components. + // By doing this, likewise with G_LOAD, generate an extend that can be + // artifact-combined away instead of leaving behind extracts. + Register SrcReg = MI.getOperand(0).getReg(); + Register PtrReg = MI.getOperand(1).getReg(); + LLT SrcTy = MRI.getType(SrcReg); + MachineMemOperand &MMO = **MI.memoperands_begin(); + if (SrcTy.getSizeInBits() != MMO.getSizeInBits()) + return UnableToLegalize; + if (SrcTy.isVector()) + return UnableToLegalize; + if (isPowerOf2_32(SrcTy.getSizeInBits())) + return UnableToLegalize; // Don't know what we're being asked to do. + + // Extend to the next pow-2. + const LLT ExtendTy = LLT::scalar(NextPowerOf2(SrcTy.getSizeInBits())); + auto ExtVal = MIRBuilder.buildAnyExt(ExtendTy, SrcReg); + + // Obtain the smaller value by shifting away the larger value. 
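
The splitting strategy spelled out in the comments above (an i24 load becomes a 2-byte and a 1-byte anyextending load that are shifted and OR'd together, and an i24 store becomes a truncating 2-byte store plus a 1-byte store of the value shifted right) can be modelled in plain C++. A minimal standalone sketch, assuming a little-endian host; the helper names are illustrative and nothing here is LLVM API:

#include <cstdint>
#include <cstring>
#include <cassert>

// Load a 24-bit little-endian value as one 16-bit load plus one 8-bit load,
// widened to 32 bits, shifted and OR'd together, then masked back to 24 bits
// (the mask plays the role of the final G_TRUNC).
uint32_t load_i24(const uint8_t *p) {
  uint16_t large;                        // "LargeSplitSize" = 16 bits
  uint8_t small;                         // "SmallSplitSize" = 8 bits
  std::memcpy(&large, p, 2);             // load at offset 0
  std::memcpy(&small, p + 2, 1);         // load at offset LargeSplitSize/8
  uint32_t merged = (uint32_t)small << 16 | large;
  return merged & 0xFFFFFFu;
}

// Store a 24-bit value as a truncating 16-bit store plus an 8-bit store of
// the value shifted right by 16 ("shift away the larger part").
void store_i24(uint8_t *p, uint32_t v) {
  uint16_t large = (uint16_t)v;          // low 16 bits
  uint8_t small = (uint8_t)(v >> 16);    // remaining 8 bits
  std::memcpy(p, &large, 2);
  std::memcpy(p + 2, &small, 1);
}

int main() {
  uint8_t buf[3];
  store_i24(buf, 0xABCDEF);
  assert(load_i24(buf) == 0xABCDEF);
}
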
+ uint64_t LargeSplitSize = PowerOf2Floor(SrcTy.getSizeInBits()); + uint64_t SmallSplitSize = SrcTy.getSizeInBits() - LargeSplitSize; + auto ShiftAmt = MIRBuilder.buildConstant(ExtendTy, LargeSplitSize); + auto SmallVal = MIRBuilder.buildLShr(ExtendTy, ExtVal, ShiftAmt); + + // Generate the GEP and truncating stores. + LLT PtrTy = MRI.getType(PtrReg); + auto OffsetCst = + MIRBuilder.buildConstant(LLT::scalar(64), LargeSplitSize / 8); + Register GEPReg = MRI.createGenericVirtualRegister(PtrTy); + auto SmallPtr = MIRBuilder.buildGEP(GEPReg, PtrReg, OffsetCst.getReg(0)); + + MachineFunction &MF = MIRBuilder.getMF(); + MachineMemOperand *LargeMMO = + MF.getMachineMemOperand(&MMO, 0, LargeSplitSize / 8); + MachineMemOperand *SmallMMO = + MF.getMachineMemOperand(&MMO, LargeSplitSize / 8, SmallSplitSize / 8); + MIRBuilder.buildStore(ExtVal.getReg(0), PtrReg, *LargeMMO); + MIRBuilder.buildStore(SmallVal.getReg(0), SmallPtr.getReg(0), *SmallMMO); + MI.eraseFromParent(); + return Legalized; + } case TargetOpcode::G_CTLZ_ZERO_UNDEF: case TargetOpcode::G_CTTZ_ZERO_UNDEF: case TargetOpcode::G_CTLZ: @@ -1797,6 +2217,8 @@ LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT Ty) { return lowerUITOFP(MI, TypeIdx, Ty); case G_SITOFP: return lowerSITOFP(MI, TypeIdx, Ty); + case G_FPTOUI: + return lowerFPTOUI(MI, TypeIdx, Ty); case G_SMIN: case G_SMAX: case G_UMIN: @@ -1807,6 +2229,31 @@ LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT Ty) { case G_FMINNUM: case G_FMAXNUM: return lowerFMinNumMaxNum(MI); + case G_UNMERGE_VALUES: + return lowerUnmergeValues(MI); + case TargetOpcode::G_SEXT_INREG: { + assert(MI.getOperand(2).isImm() && "Expected immediate"); + int64_t SizeInBits = MI.getOperand(2).getImm(); + + Register DstReg = MI.getOperand(0).getReg(); + Register SrcReg = MI.getOperand(1).getReg(); + LLT DstTy = MRI.getType(DstReg); + Register TmpRes = MRI.createGenericVirtualRegister(DstTy); + + auto MIBSz = MIRBuilder.buildConstant(DstTy, DstTy.getScalarSizeInBits() - SizeInBits); + MIRBuilder.buildInstr(TargetOpcode::G_SHL, {TmpRes}, {SrcReg, MIBSz->getOperand(0).getReg()}); + MIRBuilder.buildInstr(TargetOpcode::G_ASHR, {DstReg}, {TmpRes, MIBSz->getOperand(0).getReg()}); + MI.eraseFromParent(); + return Legalized; + } + case G_SHUFFLE_VECTOR: + return lowerShuffleVector(MI); + case G_DYN_STACKALLOC: + return lowerDynStackAlloc(MI); + case G_EXTRACT: + return lowerExtract(MI); + case G_INSERT: + return lowerInsert(MI); } } @@ -2282,6 +2729,105 @@ LegalizerHelper::fewerElementsVectorPhi(MachineInstr &MI, unsigned TypeIdx, return Legalized; } +LegalizerHelper::LegalizeResult +LegalizerHelper::fewerElementsVectorUnmergeValues(MachineInstr &MI, + unsigned TypeIdx, + LLT NarrowTy) { + if (TypeIdx != 1) + return UnableToLegalize; + + const int NumDst = MI.getNumOperands() - 1; + const Register SrcReg = MI.getOperand(NumDst).getReg(); + LLT SrcTy = MRI.getType(SrcReg); + + LLT DstTy = MRI.getType(MI.getOperand(0).getReg()); + + // TODO: Create sequence of extracts. + if (DstTy == NarrowTy) + return UnableToLegalize; + + LLT GCDTy = getGCDType(SrcTy, NarrowTy); + if (DstTy == GCDTy) { + // This would just be a copy of the same unmerge. + // TODO: Create extracts, pad with undef and create intermediate merges. 
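
The G_SEXT_INREG lowering above reduces to a shift pair: move the field to the top of the register with G_SHL, then arithmetic-shift it back down so its sign bit fills the upper bits. A minimal standalone model, assuming a 64-bit register and two's-complement arithmetic right shifts (guaranteed from C++20); the function name is illustrative:

#include <cstdint>
#include <cassert>

// Model of the shift-pair lowering: shift the field to the top of the
// register (G_SHL), then bring it back down (G_ASHR) so the field's sign bit
// is replicated into the upper bits.
int64_t sext_inreg64(uint64_t x, unsigned size_in_bits) {
  const unsigned shift = 64 - size_in_bits;   // DstSize - SizeInBits
  return (int64_t)(x << shift) >> shift;      // G_SHL, then G_ASHR
}

int main() {
  assert(sext_inreg64(0xFFu, 8) == -1);       // 8-bit 0xFF is -1
  assert(sext_inreg64(0x7Fu, 8) == 0x7F);     // sign bit clear: unchanged
  assert(sext_inreg64(0x8000u, 16) == -32768);
}
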
+ return UnableToLegalize; + } + + auto Unmerge = MIRBuilder.buildUnmerge(GCDTy, SrcReg); + const int NumUnmerge = Unmerge->getNumOperands() - 1; + const int PartsPerUnmerge = NumDst / NumUnmerge; + + for (int I = 0; I != NumUnmerge; ++I) { + auto MIB = MIRBuilder.buildInstr(TargetOpcode::G_UNMERGE_VALUES); + + for (int J = 0; J != PartsPerUnmerge; ++J) + MIB.addDef(MI.getOperand(I * PartsPerUnmerge + J).getReg()); + MIB.addUse(Unmerge.getReg(I)); + } + + MI.eraseFromParent(); + return Legalized; +} + +LegalizerHelper::LegalizeResult +LegalizerHelper::fewerElementsVectorBuildVector(MachineInstr &MI, + unsigned TypeIdx, + LLT NarrowTy) { + assert(TypeIdx == 0 && "not a vector type index"); + Register DstReg = MI.getOperand(0).getReg(); + LLT DstTy = MRI.getType(DstReg); + LLT SrcTy = DstTy.getElementType(); + + int DstNumElts = DstTy.getNumElements(); + int NarrowNumElts = NarrowTy.getNumElements(); + int NumConcat = (DstNumElts + NarrowNumElts - 1) / NarrowNumElts; + LLT WidenedDstTy = LLT::vector(NarrowNumElts * NumConcat, SrcTy); + + SmallVector ConcatOps; + SmallVector SubBuildVector; + + Register UndefReg; + if (WidenedDstTy != DstTy) + UndefReg = MIRBuilder.buildUndef(SrcTy).getReg(0); + + // Create a G_CONCAT_VECTORS of NarrowTy pieces, padding with undef as + // necessary. + // + // %3:_(<3 x s16>) = G_BUILD_VECTOR %0, %1, %2 + // -> <2 x s16> + // + // %4:_(s16) = G_IMPLICIT_DEF + // %5:_(<2 x s16>) = G_BUILD_VECTOR %0, %1 + // %6:_(<2 x s16>) = G_BUILD_VECTOR %2, %4 + // %7:_(<4 x s16>) = G_CONCAT_VECTORS %5, %6 + // %3:_(<3 x s16>) = G_EXTRACT %7, 0 + for (int I = 0; I != NumConcat; ++I) { + for (int J = 0; J != NarrowNumElts; ++J) { + int SrcIdx = NarrowNumElts * I + J; + + if (SrcIdx < DstNumElts) { + Register SrcReg = MI.getOperand(SrcIdx + 1).getReg(); + SubBuildVector.push_back(SrcReg); + } else + SubBuildVector.push_back(UndefReg); + } + + auto BuildVec = MIRBuilder.buildBuildVector(NarrowTy, SubBuildVector); + ConcatOps.push_back(BuildVec.getReg(0)); + SubBuildVector.clear(); + } + + if (DstTy == WidenedDstTy) + MIRBuilder.buildConcatVectors(DstReg, ConcatOps); + else { + auto Concat = MIRBuilder.buildConcatVectors(WidenedDstTy, ConcatOps); + MIRBuilder.buildExtract(DstReg, Concat, 0); + } + + MI.eraseFromParent(); + return Legalized; +} + LegalizerHelper::LegalizeResult LegalizerHelper::reduceLoadStoreWidth(MachineInstr &MI, unsigned TypeIdx, LLT NarrowTy) { @@ -2395,6 +2941,7 @@ LegalizerHelper::fewerElementsVector(MachineInstr &MI, unsigned TypeIdx, case G_FDIV: case G_FREM: case G_FMA: + case G_FMAD: case G_FPOW: case G_FEXP: case G_FEXP2: @@ -2411,6 +2958,7 @@ LegalizerHelper::fewerElementsVector(MachineInstr &MI, unsigned TypeIdx, case G_FSIN: case G_FSQRT: case G_BSWAP: + case G_BITREVERSE: case G_SDIV: case G_SMIN: case G_SMAX: @@ -2453,6 +3001,10 @@ LegalizerHelper::fewerElementsVector(MachineInstr &MI, unsigned TypeIdx, return fewerElementsVectorSelect(MI, TypeIdx, NarrowTy); case G_PHI: return fewerElementsVectorPhi(MI, TypeIdx, NarrowTy); + case G_UNMERGE_VALUES: + return fewerElementsVectorUnmergeValues(MI, TypeIdx, NarrowTy); + case G_BUILD_VECTOR: + return fewerElementsVectorBuildVector(MI, TypeIdx, NarrowTy); case G_LOAD: case G_STORE: return reduceLoadStoreWidth(MI, TypeIdx, NarrowTy); @@ -2604,11 +3156,11 @@ LegalizerHelper::narrowScalarShift(MachineInstr &MI, unsigned TypeIdx, switch (MI.getOpcode()) { case TargetOpcode::G_SHL: { // Short: ShAmt < NewBitSize - auto LoS = MIRBuilder.buildShl(HalfTy, InH, Amt); + auto LoS = MIRBuilder.buildShl(HalfTy, InL, 
Amt); - auto OrLHS = MIRBuilder.buildShl(HalfTy, InH, Amt); - auto OrRHS = MIRBuilder.buildLShr(HalfTy, InL, AmtLack); - auto HiS = MIRBuilder.buildOr(HalfTy, OrLHS, OrRHS); + auto LoOr = MIRBuilder.buildLShr(HalfTy, InL, AmtLack); + auto HiOr = MIRBuilder.buildShl(HalfTy, InH, Amt); + auto HiS = MIRBuilder.buildOr(HalfTy, LoOr, HiOr); // Long: ShAmt >= NewBitSize auto LoL = MIRBuilder.buildConstant(HalfTy, 0); // Lo part is zero. @@ -2622,41 +3174,25 @@ LegalizerHelper::narrowScalarShift(MachineInstr &MI, unsigned TypeIdx, ResultRegs[1] = Hi.getReg(0); break; } - case TargetOpcode::G_LSHR: { - // Short: ShAmt < NewBitSize - auto HiS = MIRBuilder.buildLShr(HalfTy, InH, Amt); - - auto OrLHS = MIRBuilder.buildLShr(HalfTy, InL, Amt); - auto OrRHS = MIRBuilder.buildShl(HalfTy, InH, AmtLack); - auto LoS = MIRBuilder.buildOr(HalfTy, OrLHS, OrRHS); - - // Long: ShAmt >= NewBitSize - auto HiL = MIRBuilder.buildConstant(HalfTy, 0); // Hi part is zero. - auto LoL = MIRBuilder.buildLShr(HalfTy, InH, AmtExcess); // Lo from Hi part. - - auto Lo = MIRBuilder.buildSelect( - HalfTy, IsZero, InL, MIRBuilder.buildSelect(HalfTy, IsShort, LoS, LoL)); - auto Hi = MIRBuilder.buildSelect(HalfTy, IsShort, HiS, HiL); - - ResultRegs[0] = Lo.getReg(0); - ResultRegs[1] = Hi.getReg(0); - break; - } + case TargetOpcode::G_LSHR: case TargetOpcode::G_ASHR: { // Short: ShAmt < NewBitSize - auto HiS = MIRBuilder.buildAShr(HalfTy, InH, Amt); + auto HiS = MIRBuilder.buildInstr(MI.getOpcode(), {HalfTy}, {InH, Amt}); - auto OrLHS = MIRBuilder.buildLShr(HalfTy, InL, Amt); - auto OrRHS = MIRBuilder.buildLShr(HalfTy, InH, AmtLack); - auto LoS = MIRBuilder.buildOr(HalfTy, OrLHS, OrRHS); + auto LoOr = MIRBuilder.buildLShr(HalfTy, InL, Amt); + auto HiOr = MIRBuilder.buildShl(HalfTy, InH, AmtLack); + auto LoS = MIRBuilder.buildOr(HalfTy, LoOr, HiOr); // Long: ShAmt >= NewBitSize - - // Sign of Hi part. - auto HiL = MIRBuilder.buildAShr( - HalfTy, InH, MIRBuilder.buildConstant(ShiftAmtTy, NewBitSize - 1)); - - auto LoL = MIRBuilder.buildAShr(HalfTy, InH, AmtExcess); // Lo from Hi part. + MachineInstrBuilder HiL; + if (MI.getOpcode() == TargetOpcode::G_LSHR) { + HiL = MIRBuilder.buildConstant(HalfTy, 0); // Hi part is zero. + } else { + auto ShiftAmt = MIRBuilder.buildConstant(ShiftAmtTy, NewBitSize - 1); + HiL = MIRBuilder.buildAShr(HalfTy, InH, ShiftAmt); // Sign of Hi part. + } + auto LoL = MIRBuilder.buildInstr(MI.getOpcode(), {HalfTy}, + {InH, AmtExcess}); // Lo from Hi part. 
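
The short/long split used by narrowScalarShift above can be checked in isolation. A standalone sketch for a 64-bit G_SHL narrowed to 32-bit halves, mirroring the IsZero/IsShort selects; it models the emitted MIR rather than calling any LLVM API:

#include <cstdint>
#include <cassert>

// "Short" shifts (< 32) combine both halves; "long" shifts (>= 32) move the
// low half into the high half; a zero amount passes the input through.
uint64_t shl64_via_32(uint32_t lo, uint32_t hi, unsigned amt) {
  uint32_t new_lo, new_hi;
  if (amt == 0) {                               // IsZero: pass through
    new_lo = lo;
    new_hi = hi;
  } else if (amt < 32) {                        // IsShort: ShAmt < NewBitSize
    new_lo = lo << amt;
    new_hi = (hi << amt) | (lo >> (32 - amt));  // HiS = (InH << Amt) | (InL >> AmtLack)
  } else {                                      // long: ShAmt >= NewBitSize
    new_lo = 0;                                 // Lo part is zero
    new_hi = lo << (amt - 32);                  // Hi from Lo, shifted by AmtExcess
  }
  return ((uint64_t)new_hi << 32) | new_lo;
}

int main() {
  uint64_t x = 0x0123456789ABCDEFULL;
  for (unsigned amt = 0; amt < 64; ++amt)
    assert(shl64_via_32((uint32_t)x, (uint32_t)(x >> 32), amt) == (x << amt));
}
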
auto Lo = MIRBuilder.buildSelect( HalfTy, IsZero, InL, MIRBuilder.buildSelect(HalfTy, IsShort, LoS, LoL)); @@ -2701,12 +3237,22 @@ LegalizerHelper::moreElementsVector(MachineInstr &MI, unsigned TypeIdx, MIRBuilder.setInstr(MI); unsigned Opc = MI.getOpcode(); switch (Opc) { - case TargetOpcode::G_IMPLICIT_DEF: { + case TargetOpcode::G_IMPLICIT_DEF: + case TargetOpcode::G_LOAD: { + if (TypeIdx != 0) + return UnableToLegalize; Observer.changingInstr(MI); moreElementsVectorDst(MI, MoreTy, 0); Observer.changedInstr(MI); return Legalized; } + case TargetOpcode::G_STORE: + if (TypeIdx != 0) + return UnableToLegalize; + Observer.changingInstr(MI); + moreElementsVectorSrc(MI, MoreTy, 0); + Observer.changedInstr(MI); + return Legalized; case TargetOpcode::G_AND: case TargetOpcode::G_OR: case TargetOpcode::G_XOR: @@ -2748,6 +3294,26 @@ LegalizerHelper::moreElementsVector(MachineInstr &MI, unsigned TypeIdx, moreElementsVectorDst(MI, MoreTy, 0); Observer.changedInstr(MI); return Legalized; + case TargetOpcode::G_UNMERGE_VALUES: { + if (TypeIdx != 1) + return UnableToLegalize; + + LLT DstTy = MRI.getType(MI.getOperand(0).getReg()); + int NumDst = MI.getNumOperands() - 1; + moreElementsVectorSrc(MI, MoreTy, NumDst); + + auto MIB = MIRBuilder.buildInstr(TargetOpcode::G_UNMERGE_VALUES); + for (int I = 0; I != NumDst; ++I) + MIB.addDef(MI.getOperand(I).getReg()); + + int NewNumDst = MoreTy.getSizeInBits() / DstTy.getSizeInBits(); + for (int I = NumDst; I != NewNumDst; ++I) + MIB.addDef(MRI.createGenericVirtualRegister(DstTy)); + + MIB.addUse(MI.getOperand(NumDst).getReg()); + MI.eraseFromParent(); + return Legalized; + } case TargetOpcode::G_PHI: return moreElementsVectorPhi(MI, TypeIdx, MoreTy); default: @@ -3310,6 +3876,48 @@ LegalizerHelper::lowerSITOFP(MachineInstr &MI, unsigned TypeIdx, LLT Ty) { return UnableToLegalize; } +LegalizerHelper::LegalizeResult +LegalizerHelper::lowerFPTOUI(MachineInstr &MI, unsigned TypeIdx, LLT Ty) { + Register Dst = MI.getOperand(0).getReg(); + Register Src = MI.getOperand(1).getReg(); + LLT DstTy = MRI.getType(Dst); + LLT SrcTy = MRI.getType(Src); + const LLT S64 = LLT::scalar(64); + const LLT S32 = LLT::scalar(32); + + if (SrcTy != S64 && SrcTy != S32) + return UnableToLegalize; + if (DstTy != S32 && DstTy != S64) + return UnableToLegalize; + + // FPTOSI gives same result as FPTOUI for positive signed integers. + // FPTOUI needs to deal with fp values that convert to unsigned integers + // greater or equal to 2^31 for float or 2^63 for double. For brevity 2^Exp. + + APInt TwoPExpInt = APInt::getSignMask(DstTy.getSizeInBits()); + APFloat TwoPExpFP(SrcTy.getSizeInBits() == 32 ? APFloat::IEEEsingle() + : APFloat::IEEEdouble(), + APInt::getNullValue(SrcTy.getSizeInBits())); + TwoPExpFP.convertFromAPInt(TwoPExpInt, false, APFloat::rmNearestTiesToEven); + + MachineInstrBuilder FPTOSI = MIRBuilder.buildFPTOSI(DstTy, Src); + + MachineInstrBuilder Threshold = MIRBuilder.buildFConstant(SrcTy, TwoPExpFP); + // For fp Value greater or equal to Threshold(2^Exp), we use FPTOSI on + // (Value - 2^Exp) and add 2^Exp by setting highest bit in result to 1. 
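
A standalone sketch of the threshold trick described in the comment above, for float to u32 and assuming IEEE single precision: values below 2^31 go through a plain signed conversion, larger values are rebased by 2^31, converted, and get the top bit XOR'd back in, matching the FSub/FPTOSI/XOR/select sequence built next:

#include <cstdint>
#include <cassert>

// Model of the FPTOUI lowering for f32 -> u32.
uint32_t fptoui32(float x) {
  const float threshold = 2147483648.0f;        // 2^31, exactly representable
  if (x < threshold)                            // FCMP_ULT Src, Threshold
    return (uint32_t)(int32_t)x;                // plain FPTOSI
  int32_t rebased = (int32_t)(x - threshold);   // FPTOSI(Src - 2^31)
  return (uint32_t)rebased ^ 0x80000000u;       // XOR the sign-mask bit back in
}

int main() {
  assert(fptoui32(7.0f) == 7u);
  assert(fptoui32(3000000000.0f) == 3000000000u);  // above 2^31
}
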
+ MachineInstrBuilder FSub = MIRBuilder.buildFSub(SrcTy, Src, Threshold); + MachineInstrBuilder ResLowBits = MIRBuilder.buildFPTOSI(DstTy, FSub); + MachineInstrBuilder ResHighBit = MIRBuilder.buildConstant(DstTy, TwoPExpInt); + MachineInstrBuilder Res = MIRBuilder.buildXor(DstTy, ResLowBits, ResHighBit); + + MachineInstrBuilder FCMP = + MIRBuilder.buildFCmp(CmpInst::FCMP_ULT, DstTy, Src, Threshold); + MIRBuilder.buildSelect(Dst, FCMP, FPTOSI, Res); + + MI.eraseFromParent(); + return Legalized; +} + static CmpInst::Predicate minMaxToCompare(unsigned Opc) { switch (Opc) { case TargetOpcode::G_SMIN: @@ -3419,3 +4027,251 @@ LegalizerHelper::lowerFMinNumMaxNum(MachineInstr &MI) { MI.eraseFromParent(); return Legalized; } + +LegalizerHelper::LegalizeResult LegalizerHelper::lowerFMad(MachineInstr &MI) { + // Expand G_FMAD a, b, c -> G_FADD (G_FMUL a, b), c + Register DstReg = MI.getOperand(0).getReg(); + LLT Ty = MRI.getType(DstReg); + unsigned Flags = MI.getFlags(); + + auto Mul = MIRBuilder.buildFMul(Ty, MI.getOperand(1), MI.getOperand(2), + Flags); + MIRBuilder.buildFAdd(DstReg, Mul, MI.getOperand(3), Flags); + MI.eraseFromParent(); + return Legalized; +} + +LegalizerHelper::LegalizeResult +LegalizerHelper::lowerUnmergeValues(MachineInstr &MI) { + const unsigned NumDst = MI.getNumOperands() - 1; + const Register SrcReg = MI.getOperand(NumDst).getReg(); + LLT SrcTy = MRI.getType(SrcReg); + + Register Dst0Reg = MI.getOperand(0).getReg(); + LLT DstTy = MRI.getType(Dst0Reg); + + + // Expand scalarizing unmerge as bitcast to integer and shift. + if (!DstTy.isVector() && SrcTy.isVector() && + SrcTy.getElementType() == DstTy) { + LLT IntTy = LLT::scalar(SrcTy.getSizeInBits()); + Register Cast = MIRBuilder.buildBitcast(IntTy, SrcReg).getReg(0); + + MIRBuilder.buildTrunc(Dst0Reg, Cast); + + const unsigned DstSize = DstTy.getSizeInBits(); + unsigned Offset = DstSize; + for (unsigned I = 1; I != NumDst; ++I, Offset += DstSize) { + auto ShiftAmt = MIRBuilder.buildConstant(IntTy, Offset); + auto Shift = MIRBuilder.buildLShr(IntTy, Cast, ShiftAmt); + MIRBuilder.buildTrunc(MI.getOperand(I), Shift); + } + + MI.eraseFromParent(); + return Legalized; + } + + return UnableToLegalize; +} + +LegalizerHelper::LegalizeResult +LegalizerHelper::lowerShuffleVector(MachineInstr &MI) { + Register DstReg = MI.getOperand(0).getReg(); + Register Src0Reg = MI.getOperand(1).getReg(); + Register Src1Reg = MI.getOperand(2).getReg(); + LLT Src0Ty = MRI.getType(Src0Reg); + LLT DstTy = MRI.getType(DstReg); + LLT IdxTy = LLT::scalar(32); + + const Constant *ShufMask = MI.getOperand(3).getShuffleMask(); + + SmallVector Mask; + ShuffleVectorInst::getShuffleMask(ShufMask, Mask); + + if (DstTy.isScalar()) { + if (Src0Ty.isVector()) + return UnableToLegalize; + + // This is just a SELECT. + assert(Mask.size() == 1 && "Expected a single mask element"); + Register Val; + if (Mask[0] < 0 || Mask[0] > 1) + Val = MIRBuilder.buildUndef(DstTy).getReg(0); + else + Val = Mask[0] == 0 ? Src0Reg : Src1Reg; + MIRBuilder.buildCopy(DstReg, Val); + MI.eraseFromParent(); + return Legalized; + } + + Register Undef; + SmallVector BuildVec; + LLT EltTy = DstTy.getElementType(); + + for (int Idx : Mask) { + if (Idx < 0) { + if (!Undef.isValid()) + Undef = MIRBuilder.buildUndef(EltTy).getReg(0); + BuildVec.push_back(Undef); + continue; + } + + if (Src0Ty.isScalar()) { + BuildVec.push_back(Idx == 0 ? Src0Reg : Src1Reg); + } else { + int NumElts = Src0Ty.getNumElements(); + Register SrcVec = Idx < NumElts ? 
Src0Reg : Src1Reg; + int ExtractIdx = Idx < NumElts ? Idx : Idx - NumElts; + auto IdxK = MIRBuilder.buildConstant(IdxTy, ExtractIdx); + auto Extract = MIRBuilder.buildExtractVectorElement(EltTy, SrcVec, IdxK); + BuildVec.push_back(Extract.getReg(0)); + } + } + + MIRBuilder.buildBuildVector(DstReg, BuildVec); + MI.eraseFromParent(); + return Legalized; +} + +LegalizerHelper::LegalizeResult +LegalizerHelper::lowerDynStackAlloc(MachineInstr &MI) { + Register Dst = MI.getOperand(0).getReg(); + Register AllocSize = MI.getOperand(1).getReg(); + unsigned Align = MI.getOperand(2).getImm(); + + const auto &MF = *MI.getMF(); + const auto &TLI = *MF.getSubtarget().getTargetLowering(); + + LLT PtrTy = MRI.getType(Dst); + LLT IntPtrTy = LLT::scalar(PtrTy.getSizeInBits()); + + Register SPReg = TLI.getStackPointerRegisterToSaveRestore(); + auto SPTmp = MIRBuilder.buildCopy(PtrTy, SPReg); + SPTmp = MIRBuilder.buildCast(IntPtrTy, SPTmp); + + // Subtract the final alloc from the SP. We use G_PTRTOINT here so we don't + // have to generate an extra instruction to negate the alloc and then use + // G_GEP to add the negative offset. + auto Alloc = MIRBuilder.buildSub(IntPtrTy, SPTmp, AllocSize); + if (Align) { + APInt AlignMask(IntPtrTy.getSizeInBits(), Align, true); + AlignMask.negate(); + auto AlignCst = MIRBuilder.buildConstant(IntPtrTy, AlignMask); + Alloc = MIRBuilder.buildAnd(IntPtrTy, Alloc, AlignCst); + } + + SPTmp = MIRBuilder.buildCast(PtrTy, Alloc); + MIRBuilder.buildCopy(SPReg, SPTmp); + MIRBuilder.buildCopy(Dst, SPTmp); + + MI.eraseFromParent(); + return Legalized; +} + +LegalizerHelper::LegalizeResult +LegalizerHelper::lowerExtract(MachineInstr &MI) { + Register Dst = MI.getOperand(0).getReg(); + Register Src = MI.getOperand(1).getReg(); + unsigned Offset = MI.getOperand(2).getImm(); + + LLT DstTy = MRI.getType(Dst); + LLT SrcTy = MRI.getType(Src); + + if (DstTy.isScalar() && + (SrcTy.isScalar() || + (SrcTy.isVector() && DstTy == SrcTy.getElementType()))) { + LLT SrcIntTy = SrcTy; + if (!SrcTy.isScalar()) { + SrcIntTy = LLT::scalar(SrcTy.getSizeInBits()); + Src = MIRBuilder.buildBitcast(SrcIntTy, Src).getReg(0); + } + + if (Offset == 0) + MIRBuilder.buildTrunc(Dst, Src); + else { + auto ShiftAmt = MIRBuilder.buildConstant(SrcIntTy, Offset); + auto Shr = MIRBuilder.buildLShr(SrcIntTy, Src, ShiftAmt); + MIRBuilder.buildTrunc(Dst, Shr); + } + + MI.eraseFromParent(); + return Legalized; + } + + return UnableToLegalize; +} + +LegalizerHelper::LegalizeResult LegalizerHelper::lowerInsert(MachineInstr &MI) { + Register Dst = MI.getOperand(0).getReg(); + Register Src = MI.getOperand(1).getReg(); + Register InsertSrc = MI.getOperand(2).getReg(); + uint64_t Offset = MI.getOperand(3).getImm(); + + LLT DstTy = MRI.getType(Src); + LLT InsertTy = MRI.getType(InsertSrc); + + if (InsertTy.isScalar() && + (DstTy.isScalar() || + (DstTy.isVector() && DstTy.getElementType() == InsertTy))) { + LLT IntDstTy = DstTy; + if (!DstTy.isScalar()) { + IntDstTy = LLT::scalar(DstTy.getSizeInBits()); + Src = MIRBuilder.buildBitcast(IntDstTy, Src).getReg(0); + } + + Register ExtInsSrc = MIRBuilder.buildZExt(IntDstTy, InsertSrc).getReg(0); + if (Offset != 0) { + auto ShiftAmt = MIRBuilder.buildConstant(IntDstTy, Offset); + ExtInsSrc = MIRBuilder.buildShl(IntDstTy, ExtInsSrc, ShiftAmt).getReg(0); + } + + APInt MaskVal = ~APInt::getBitsSet(DstTy.getSizeInBits(), Offset, + InsertTy.getSizeInBits()); + + auto Mask = MIRBuilder.buildConstant(IntDstTy, MaskVal); + auto MaskedSrc = MIRBuilder.buildAnd(IntDstTy, Src, Mask); + auto Or = 
MIRBuilder.buildOr(IntDstTy, MaskedSrc, ExtInsSrc); + + MIRBuilder.buildBitcast(Dst, Or); + MI.eraseFromParent(); + return Legalized; + } + + return UnableToLegalize; +} + +LegalizerHelper::LegalizeResult +LegalizerHelper::lowerSADDO_SSUBO(MachineInstr &MI) { + Register Dst0 = MI.getOperand(0).getReg(); + Register Dst1 = MI.getOperand(1).getReg(); + Register LHS = MI.getOperand(2).getReg(); + Register RHS = MI.getOperand(3).getReg(); + const bool IsAdd = MI.getOpcode() == TargetOpcode::G_SADDO; + + LLT Ty = MRI.getType(Dst0); + LLT BoolTy = MRI.getType(Dst1); + + if (IsAdd) + MIRBuilder.buildAdd(Dst0, LHS, RHS); + else + MIRBuilder.buildSub(Dst0, LHS, RHS); + + // TODO: If SADDSAT/SSUBSAT is legal, compare results to detect overflow. + + auto Zero = MIRBuilder.buildConstant(Ty, 0); + + // For an addition, the result should be less than one of the operands (LHS) + // if and only if the other operand (RHS) is negative, otherwise there will + // be overflow. + // For a subtraction, the result should be less than one of the operands + // (LHS) if and only if the other operand (RHS) is (non-zero) positive, + // otherwise there will be overflow. + auto ResultLowerThanLHS = + MIRBuilder.buildICmp(CmpInst::ICMP_SLT, BoolTy, Dst0, LHS); + auto ConditionRHS = MIRBuilder.buildICmp( + IsAdd ? CmpInst::ICMP_SLT : CmpInst::ICMP_SGT, BoolTy, RHS, Zero); + + MIRBuilder.buildXor(Dst1, ConditionRHS, ResultLowerThanLHS); + MI.eraseFromParent(); + return Legalized; +} diff --git a/lib/CodeGen/GlobalISel/LegalizerInfo.cpp b/lib/CodeGen/GlobalISel/LegalizerInfo.cpp index 6e1de95b3277..70045512fae5 100644 --- a/lib/CodeGen/GlobalISel/LegalizerInfo.cpp +++ b/lib/CodeGen/GlobalISel/LegalizerInfo.cpp @@ -215,7 +215,30 @@ bool LegalizeRuleSet::verifyTypeIdxsCoverage(unsigned NumTypeIdxs) const { return true; } const bool AllCovered = (FirstUncovered >= NumTypeIdxs); - LLVM_DEBUG(dbgs() << ".. the first uncovered type index: " << FirstUncovered + if (NumTypeIdxs > 0) + LLVM_DEBUG(dbgs() << ".. the first uncovered type index: " << FirstUncovered + << ", " << (AllCovered ? "OK" : "FAIL") << "\n"); + return AllCovered; +#else + return true; +#endif +} + +bool LegalizeRuleSet::verifyImmIdxsCoverage(unsigned NumImmIdxs) const { +#ifndef NDEBUG + if (Rules.empty()) { + LLVM_DEBUG( + dbgs() << ".. imm index coverage check SKIPPED: no rules defined\n"); + return true; + } + const int64_t FirstUncovered = ImmIdxsCovered.find_first_unset(); + if (FirstUncovered < 0) { + LLVM_DEBUG(dbgs() << ".. imm index coverage check SKIPPED:" + " user-defined predicate detected\n"); + return true; + } + const bool AllCovered = (FirstUncovered >= NumImmIdxs); + LLVM_DEBUG(dbgs() << ".. the first uncovered imm index: " << FirstUncovered << ", " << (AllCovered ? "OK" : "FAIL") << "\n"); return AllCovered; #else @@ -387,8 +410,6 @@ unsigned LegalizerInfo::getActionDefinitionsIdx(unsigned Opcode) const { LLVM_DEBUG(dbgs() << ".. opcode " << Opcode << " is aliased to " << Alias << "\n"); OpcodeIdx = getOpcodeIdxForOpcode(Alias); - LLVM_DEBUG(dbgs() << ".. 
opcode " << Alias << " is aliased to " - << RulesForOpcode[OpcodeIdx].getAlias() << "\n"); assert(RulesForOpcode[OpcodeIdx].getAlias() == 0 && "Cannot chain aliases"); } @@ -412,7 +433,7 @@ LegalizeRuleSet &LegalizerInfo::getActionDefinitionsBuilder( std::initializer_list Opcodes) { unsigned Representative = *Opcodes.begin(); - assert(!empty(Opcodes) && Opcodes.begin() + 1 != Opcodes.end() && + assert(!llvm::empty(Opcodes) && Opcodes.begin() + 1 != Opcodes.end() && "Initializer list must have at least two opcodes"); for (auto I = Opcodes.begin() + 1, E = Opcodes.end(); I != E; ++I) @@ -677,12 +698,23 @@ void LegalizerInfo::verify(const MCInstrInfo &MII) const { ? std::max(OpInfo.getGenericTypeIndex() + 1U, Acc) : Acc; }); + const unsigned NumImmIdxs = std::accumulate( + MCID.opInfo_begin(), MCID.opInfo_end(), 0U, + [](unsigned Acc, const MCOperandInfo &OpInfo) { + return OpInfo.isGenericImm() + ? std::max(OpInfo.getGenericImmIndex() + 1U, Acc) + : Acc; + }); LLVM_DEBUG(dbgs() << MII.getName(Opcode) << " (opcode " << Opcode << "): " << NumTypeIdxs << " type ind" - << (NumTypeIdxs == 1 ? "ex" : "ices") << "\n"); + << (NumTypeIdxs == 1 ? "ex" : "ices") << ", " + << NumImmIdxs << " imm ind" + << (NumImmIdxs == 1 ? "ex" : "ices") << "\n"); const LegalizeRuleSet &RuleSet = getActionDefinitions(Opcode); if (!RuleSet.verifyTypeIdxsCoverage(NumTypeIdxs)) FailedOpcodes.push_back(Opcode); + else if (!RuleSet.verifyImmIdxsCoverage(NumImmIdxs)) + FailedOpcodes.push_back(Opcode); } if (!FailedOpcodes.empty()) { errs() << "The following opcodes have ill-defined legalization rules:"; diff --git a/lib/CodeGen/GlobalISel/Localizer.cpp b/lib/CodeGen/GlobalISel/Localizer.cpp index 3592409710a7..f882ecbf5db3 100644 --- a/lib/CodeGen/GlobalISel/Localizer.cpp +++ b/lib/CodeGen/GlobalISel/Localizer.cpp @@ -79,7 +79,7 @@ bool Localizer::shouldLocalize(const MachineInstr &MI) { return true; case TargetOpcode::G_GLOBAL_VALUE: { unsigned RematCost = TTI->getGISelRematGlobalCost(); - unsigned Reg = MI.getOperand(0).getReg(); + Register Reg = MI.getOperand(0).getReg(); unsigned MaxUses = maxUses(RematCost); if (MaxUses == UINT_MAX) return true; // Remats are "free" so always localize. @@ -121,7 +121,7 @@ bool Localizer::localizeInterBlock(MachineFunction &MF, LLVM_DEBUG(dbgs() << "Should localize: " << MI); assert(MI.getDesc().getNumDefs() == 1 && "More than one definition not supported yet"); - unsigned Reg = MI.getOperand(0).getReg(); + Register Reg = MI.getOperand(0).getReg(); // Check if all the users of MI are local. // We are going to invalidation the list of use operands, so we // can't use range iterator. @@ -151,7 +151,7 @@ bool Localizer::localizeInterBlock(MachineFunction &MF, LocalizedMI); // Set a new register for the definition. - unsigned NewReg = MRI->createGenericVirtualRegister(MRI->getType(Reg)); + Register NewReg = MRI->createGenericVirtualRegister(MRI->getType(Reg)); MRI->setRegClassOrRegBank(NewReg, MRI->getRegClassOrRegBank(Reg)); LocalizedMI->getOperand(0).setReg(NewReg); NewVRegIt = @@ -177,7 +177,7 @@ bool Localizer::localizeIntraBlock(LocalizedSetVecT &LocalizedInstrs) { // many users, but this case may be better served by regalloc improvements. for (MachineInstr *MI : LocalizedInstrs) { - unsigned Reg = MI->getOperand(0).getReg(); + Register Reg = MI->getOperand(0).getReg(); MachineBasicBlock &MBB = *MI->getParent(); // All of the user MIs of this reg. 
SmallPtrSet Users; @@ -220,5 +220,6 @@ bool Localizer::runOnMachineFunction(MachineFunction &MF) { LocalizedSetVecT LocalizedInstrs; bool Changed = localizeInterBlock(MF, LocalizedInstrs); - return Changed |= localizeIntraBlock(LocalizedInstrs); + Changed |= localizeIntraBlock(LocalizedInstrs); + return Changed; } diff --git a/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp b/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp index b7a73326b85c..df770f6664ca 100644 --- a/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp +++ b/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp @@ -107,9 +107,13 @@ MachineIRBuilder::buildIndirectDbgValue(Register Reg, const MDNode *Variable, assert( cast(Variable)->isValidLocationForIntrinsic(getDL()) && "Expected inlined-at fields to agree"); + // DBG_VALUE insts now carry IR-level indirection in their DIExpression + // rather than encoding it in the instruction itself. + const DIExpression *DIExpr = cast(Expr); + DIExpr = DIExpression::append(DIExpr, {dwarf::DW_OP_deref}); return insertInstr(BuildMI(getMF(), getDL(), getTII().get(TargetOpcode::DBG_VALUE), - /*IsIndirect*/ true, Reg, Variable, Expr)); + /*IsIndirect*/ false, Reg, Variable, DIExpr)); } MachineInstrBuilder MachineIRBuilder::buildFIDbgValue(int FI, @@ -120,11 +124,15 @@ MachineInstrBuilder MachineIRBuilder::buildFIDbgValue(int FI, assert( cast(Variable)->isValidLocationForIntrinsic(getDL()) && "Expected inlined-at fields to agree"); + // DBG_VALUE insts now carry IR-level indirection in their DIExpression + // rather than encoding it in the instruction itself. + const DIExpression *DIExpr = cast(Expr); + DIExpr = DIExpression::append(DIExpr, {dwarf::DW_OP_deref}); return buildInstr(TargetOpcode::DBG_VALUE) .addFrameIndex(FI) - .addImm(0) + .addReg(0) .addMetadata(Variable) - .addMetadata(Expr); + .addMetadata(DIExpr); } MachineInstrBuilder MachineIRBuilder::buildConstDbgValue(const Constant &C, @@ -148,7 +156,7 @@ MachineInstrBuilder MachineIRBuilder::buildConstDbgValue(const Constant &C, MIB.addReg(0U); } - return MIB.addImm(0).addMetadata(Variable).addMetadata(Expr); + return MIB.addReg(0).addMetadata(Variable).addMetadata(Expr); } MachineInstrBuilder MachineIRBuilder::buildDbgLabel(const MDNode *Label) { @@ -160,6 +168,17 @@ MachineInstrBuilder MachineIRBuilder::buildDbgLabel(const MDNode *Label) { return MIB.addMetadata(Label); } +MachineInstrBuilder MachineIRBuilder::buildDynStackAlloc(const DstOp &Res, + const SrcOp &Size, + unsigned Align) { + assert(Res.getLLTTy(*getMRI()).isPointer() && "expected ptr dst type"); + auto MIB = buildInstr(TargetOpcode::G_DYN_STACKALLOC); + Res.addDefToMIB(*getMRI(), MIB); + Size.addSrcToMIB(MIB); + MIB.addImm(Align); + return MIB; +} + MachineInstrBuilder MachineIRBuilder::buildFrameIndex(const DstOp &Res, int Idx) { assert(Res.getLLTTy(*getMRI()).isPointer() && "invalid operand type"); @@ -207,11 +226,7 @@ MachineInstrBuilder MachineIRBuilder::buildGEP(const DstOp &Res, Res.getLLTTy(*getMRI()) == Op0.getLLTTy(*getMRI()) && "type mismatch"); assert(Op1.getLLTTy(*getMRI()).isScalar() && "invalid offset type"); - auto MIB = buildInstr(TargetOpcode::G_GEP); - Res.addDefToMIB(*getMRI(), MIB); - Op0.addSrcToMIB(MIB); - Op1.addSrcToMIB(MIB); - return MIB; + return buildInstr(TargetOpcode::G_GEP, {Res}, {Op0, Op1}); } Optional @@ -697,17 +712,19 @@ MachineInstrBuilder MachineIRBuilder::buildICmp(CmpInst::Predicate Pred, MachineInstrBuilder MachineIRBuilder::buildFCmp(CmpInst::Predicate Pred, const DstOp &Res, const SrcOp &Op0, - const SrcOp &Op1) { + const SrcOp &Op1, + Optional 
Flags) { - return buildInstr(TargetOpcode::G_FCMP, Res, {Pred, Op0, Op1}); + return buildInstr(TargetOpcode::G_FCMP, Res, {Pred, Op0, Op1}, Flags); } MachineInstrBuilder MachineIRBuilder::buildSelect(const DstOp &Res, const SrcOp &Tst, const SrcOp &Op0, - const SrcOp &Op1) { + const SrcOp &Op1, + Optional Flags) { - return buildInstr(TargetOpcode::G_SELECT, {Res}, {Tst, Op0, Op1}); + return buildInstr(TargetOpcode::G_SELECT, {Res}, {Tst, Op0, Op1}, Flags); } MachineInstrBuilder @@ -774,26 +791,28 @@ MachineIRBuilder::buildAtomicCmpXchg(Register OldValRes, Register Addr, .addMemOperand(&MMO); } -MachineInstrBuilder MachineIRBuilder::buildAtomicRMW(unsigned Opcode, - Register OldValRes, - Register Addr, - Register Val, - MachineMemOperand &MMO) { +MachineInstrBuilder MachineIRBuilder::buildAtomicRMW( + unsigned Opcode, const DstOp &OldValRes, + const SrcOp &Addr, const SrcOp &Val, + MachineMemOperand &MMO) { + #ifndef NDEBUG - LLT OldValResTy = getMRI()->getType(OldValRes); - LLT AddrTy = getMRI()->getType(Addr); - LLT ValTy = getMRI()->getType(Val); + LLT OldValResTy = OldValRes.getLLTTy(*getMRI()); + LLT AddrTy = Addr.getLLTTy(*getMRI()); + LLT ValTy = Val.getLLTTy(*getMRI()); assert(OldValResTy.isScalar() && "invalid operand type"); assert(AddrTy.isPointer() && "invalid operand type"); assert(ValTy.isValid() && "invalid operand type"); assert(OldValResTy == ValTy && "type mismatch"); + assert(MMO.isAtomic() && "not atomic mem operand"); #endif - return buildInstr(Opcode) - .addDef(OldValRes) - .addUse(Addr) - .addUse(Val) - .addMemOperand(&MMO); + auto MIB = buildInstr(Opcode); + OldValRes.addDefToMIB(*getMRI(), MIB); + Addr.addSrcToMIB(MIB); + Val.addSrcToMIB(MIB); + MIB.addMemOperand(&MMO); + return MIB; } MachineInstrBuilder @@ -864,6 +883,21 @@ MachineIRBuilder::buildAtomicRMWUmin(Register OldValRes, Register Addr, MMO); } +MachineInstrBuilder +MachineIRBuilder::buildAtomicRMWFAdd( + const DstOp &OldValRes, const SrcOp &Addr, const SrcOp &Val, + MachineMemOperand &MMO) { + return buildAtomicRMW(TargetOpcode::G_ATOMICRMW_FADD, OldValRes, Addr, Val, + MMO); +} + +MachineInstrBuilder +MachineIRBuilder::buildAtomicRMWFSub(const DstOp &OldValRes, const SrcOp &Addr, const SrcOp &Val, + MachineMemOperand &MMO) { + return buildAtomicRMW(TargetOpcode::G_ATOMICRMW_FSUB, OldValRes, Addr, Val, + MMO); +} + MachineInstrBuilder MachineIRBuilder::buildFence(unsigned Ordering, unsigned Scope) { return buildInstr(TargetOpcode::G_FENCE) @@ -1037,8 +1071,11 @@ MachineInstrBuilder MachineIRBuilder::buildInstr(unsigned Opc, "input operands do not cover output register"); if (SrcOps.size() == 1) return buildCast(DstOps[0], SrcOps[0]); - if (DstOps[0].getLLTTy(*getMRI()).isVector()) - return buildInstr(TargetOpcode::G_CONCAT_VECTORS, DstOps, SrcOps); + if (DstOps[0].getLLTTy(*getMRI()).isVector()) { + if (SrcOps[0].getLLTTy(*getMRI()).isVector()) + return buildInstr(TargetOpcode::G_CONCAT_VECTORS, DstOps, SrcOps); + return buildInstr(TargetOpcode::G_BUILD_VECTOR, DstOps, SrcOps); + } break; } case TargetOpcode::G_EXTRACT_VECTOR_ELT: { diff --git a/lib/CodeGen/GlobalISel/RegBankSelect.cpp b/lib/CodeGen/GlobalISel/RegBankSelect.cpp index 42be88fcf947..f0e35c65c53b 100644 --- a/lib/CodeGen/GlobalISel/RegBankSelect.cpp +++ b/lib/CodeGen/GlobalISel/RegBankSelect.cpp @@ -92,7 +92,7 @@ void RegBankSelect::init(MachineFunction &MF) { MBPI = nullptr; } MIRBuilder.setMF(MF); - MORE = llvm::make_unique(MF, MBFI); + MORE = std::make_unique(MF, MBFI); } void RegBankSelect::getAnalysisUsage(AnalysisUsage &AU) const { @@ 
-139,7 +139,7 @@ bool RegBankSelect::repairReg( "need new vreg for each breakdown"); // An empty range of new register means no repairing. - assert(!empty(NewVRegs) && "We should not have to repair"); + assert(!NewVRegs.empty() && "We should not have to repair"); MachineInstr *MI; if (ValMapping.NumBreakDowns == 1) { @@ -154,7 +154,7 @@ bool RegBankSelect::repairReg( std::swap(Src, Dst); assert((RepairPt.getNumInsertPoints() == 1 || - TargetRegisterInfo::isPhysicalRegister(Dst)) && + Register::isPhysicalRegister(Dst)) && "We are about to create several defs for Dst"); // Build the instruction used to repair, then clone it at the right @@ -398,7 +398,7 @@ void RegBankSelect::tryAvoidingSplit( // Check if this is a physical or virtual register. Register Reg = MO.getReg(); - if (TargetRegisterInfo::isPhysicalRegister(Reg)) { + if (Register::isPhysicalRegister(Reg)) { // We are going to split every outgoing edges. // Check that this is possible. // FIXME: The machine representation is currently broken @@ -687,8 +687,9 @@ bool RegBankSelect::runOnMachineFunction(MachineFunction &MF) { // iterator before hand. MachineInstr &MI = *MII++; - // Ignore target-specific instructions: they should use proper regclasses. - if (isTargetSpecificOpcode(MI.getOpcode())) + // Ignore target-specific post-isel instructions: they should use proper + // regclasses. + if (isTargetSpecificOpcode(MI.getOpcode()) && !MI.isPreISelOpcode()) continue; if (!assignInstr(MI)) { diff --git a/lib/CodeGen/GlobalISel/RegisterBank.cpp b/lib/CodeGen/GlobalISel/RegisterBank.cpp index 4e41f338934d..fc9c802693ab 100644 --- a/lib/CodeGen/GlobalISel/RegisterBank.cpp +++ b/lib/CodeGen/GlobalISel/RegisterBank.cpp @@ -12,6 +12,7 @@ #include "llvm/CodeGen/GlobalISel/RegisterBank.h" #include "llvm/CodeGen/TargetRegisterInfo.h" #include "llvm/Config/llvm-config.h" +#include "llvm/Support/Debug.h" #define DEBUG_TYPE "registerbank" diff --git a/lib/CodeGen/GlobalISel/RegisterBankInfo.cpp b/lib/CodeGen/GlobalISel/RegisterBankInfo.cpp index 159422e38878..3fcc55286beb 100644 --- a/lib/CodeGen/GlobalISel/RegisterBankInfo.cpp +++ b/lib/CodeGen/GlobalISel/RegisterBankInfo.cpp @@ -82,7 +82,7 @@ bool RegisterBankInfo::verify(const TargetRegisterInfo &TRI) const { const RegisterBank * RegisterBankInfo::getRegBank(Register Reg, const MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI) const { - if (TargetRegisterInfo::isPhysicalRegister(Reg)) + if (Register::isPhysicalRegister(Reg)) return &getRegBankFromRegClass(getMinimalPhysRegClass(Reg, TRI)); assert(Reg && "NoRegister does not have a register bank"); @@ -97,8 +97,7 @@ RegisterBankInfo::getRegBank(Register Reg, const MachineRegisterInfo &MRI, const TargetRegisterClass & RegisterBankInfo::getMinimalPhysRegClass(Register Reg, const TargetRegisterInfo &TRI) const { - assert(TargetRegisterInfo::isPhysicalRegister(Reg) && - "Reg must be a physreg"); + assert(Register::isPhysicalRegister(Reg) && "Reg must be a physreg"); const auto &RegRCIt = PhysRegMinimalRCs.find(Reg); if (RegRCIt != PhysRegMinimalRCs.end()) return *RegRCIt->second; @@ -284,7 +283,7 @@ RegisterBankInfo::getPartialMapping(unsigned StartIdx, unsigned Length, ++NumPartialMappingsCreated; auto &PartMapping = MapOfPartialMappings[Hash]; - PartMapping = llvm::make_unique(StartIdx, Length, RegBank); + PartMapping = std::make_unique(StartIdx, Length, RegBank); return *PartMapping; } @@ -318,7 +317,7 @@ RegisterBankInfo::getValueMapping(const PartialMapping *BreakDown, ++NumValueMappingsCreated; auto &ValMapping = 
MapOfValueMappings[Hash]; - ValMapping = llvm::make_unique(BreakDown, NumBreakDowns); + ValMapping = std::make_unique(BreakDown, NumBreakDowns); return *ValMapping; } @@ -342,7 +341,7 @@ RegisterBankInfo::getOperandsMapping(Iterator Begin, Iterator End) const { // mapping, because we use the pointer of the ValueMapping // to hash and we expect them to uniquely identify an instance // of value mapping. - Res = llvm::make_unique(std::distance(Begin, End)); + Res = std::make_unique(std::distance(Begin, End)); unsigned Idx = 0; for (Iterator It = Begin; It != End; ++It, ++Idx) { const ValueMapping *ValMap = *It; @@ -392,7 +391,7 @@ RegisterBankInfo::getInstructionMappingImpl( ++NumInstructionMappingsCreated; auto &InstrMapping = MapOfInstructionMappings[Hash]; - InstrMapping = llvm::make_unique( + InstrMapping = std::make_unique( ID, Cost, OperandsMapping, NumOperands); return *InstrMapping; } @@ -456,7 +455,7 @@ void RegisterBankInfo::applyDefaultMapping(const OperandsMapper &OpdMapper) { "This mapping is too complex for this function"); iterator_range::const_iterator> NewRegs = OpdMapper.getVRegs(OpIdx); - if (empty(NewRegs)) { + if (NewRegs.empty()) { LLVM_DEBUG(dbgs() << " has not been repaired, nothing to be done\n"); continue; } @@ -489,7 +488,7 @@ void RegisterBankInfo::applyDefaultMapping(const OperandsMapper &OpdMapper) { unsigned RegisterBankInfo::getSizeInBits(Register Reg, const MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI) const { - if (TargetRegisterInfo::isPhysicalRegister(Reg)) { + if (Register::isPhysicalRegister(Reg)) { // The size is not directly available for physical registers. // Instead, we need to access a register class that contains Reg and // get the size of that register class. diff --git a/lib/CodeGen/GlobalISel/Utils.cpp b/lib/CodeGen/GlobalISel/Utils.cpp index 766ea1d60bac..45618d7992ad 100644 --- a/lib/CodeGen/GlobalISel/Utils.cpp +++ b/lib/CodeGen/GlobalISel/Utils.cpp @@ -43,10 +43,9 @@ unsigned llvm::constrainOperandRegClass( const RegisterBankInfo &RBI, MachineInstr &InsertPt, const TargetRegisterClass &RegClass, const MachineOperand &RegMO, unsigned OpIdx) { - unsigned Reg = RegMO.getReg(); + Register Reg = RegMO.getReg(); // Assume physical registers are properly constrained. - assert(TargetRegisterInfo::isVirtualRegister(Reg) && - "PhysReg not implemented"); + assert(Register::isVirtualRegister(Reg) && "PhysReg not implemented"); unsigned ConstrainedReg = constrainRegToClass(MRI, TII, RBI, Reg, RegClass); // If we created a new virtual register because the class is not compatible @@ -73,10 +72,9 @@ unsigned llvm::constrainOperandRegClass( MachineRegisterInfo &MRI, const TargetInstrInfo &TII, const RegisterBankInfo &RBI, MachineInstr &InsertPt, const MCInstrDesc &II, const MachineOperand &RegMO, unsigned OpIdx) { - unsigned Reg = RegMO.getReg(); + Register Reg = RegMO.getReg(); // Assume physical registers are properly constrained. - assert(TargetRegisterInfo::isVirtualRegister(Reg) && - "PhysReg not implemented"); + assert(Register::isVirtualRegister(Reg) && "PhysReg not implemented"); const TargetRegisterClass *RegClass = TII.getRegClass(II, OpIdx, &TRI, MF); // Some of the target independent instructions, like COPY, may not impose any @@ -130,9 +128,9 @@ bool llvm::constrainSelectedInstRegOperands(MachineInstr &I, LLVM_DEBUG(dbgs() << "Converting operand: " << MO << '\n'); assert(MO.isReg() && "Unsupported non-reg operand"); - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); // Physical registers don't need to be constrained. 
- if (TRI.isPhysicalRegister(Reg)) + if (Register::isPhysicalRegister(Reg)) continue; // Register operands with a value of 0 (e.g. predicate operands) don't need @@ -170,9 +168,8 @@ bool llvm::isTriviallyDead(const MachineInstr &MI, if (!MO.isReg() || !MO.isDef()) continue; - unsigned Reg = MO.getReg(); - if (TargetRegisterInfo::isPhysicalRegister(Reg) || - !MRI.use_nodbg_empty(Reg)) + Register Reg = MO.getReg(); + if (Register::isPhysicalRegister(Reg) || !MRI.use_nodbg_empty(Reg)) return false; } return true; @@ -219,11 +216,33 @@ Optional llvm::getConstantVRegVal(unsigned VReg, } Optional llvm::getConstantVRegValWithLookThrough( - unsigned VReg, const MachineRegisterInfo &MRI, bool LookThroughInstrs) { + unsigned VReg, const MachineRegisterInfo &MRI, bool LookThroughInstrs, + bool HandleFConstant) { SmallVector, 4> SeenOpcodes; MachineInstr *MI; - while ((MI = MRI.getVRegDef(VReg)) && - MI->getOpcode() != TargetOpcode::G_CONSTANT && LookThroughInstrs) { + auto IsConstantOpcode = [HandleFConstant](unsigned Opcode) { + return Opcode == TargetOpcode::G_CONSTANT || + (HandleFConstant && Opcode == TargetOpcode::G_FCONSTANT); + }; + auto GetImmediateValue = [HandleFConstant, + &MRI](const MachineInstr &MI) -> Optional { + const MachineOperand &CstVal = MI.getOperand(1); + if (!CstVal.isImm() && !CstVal.isCImm() && + (!HandleFConstant || !CstVal.isFPImm())) + return None; + if (!CstVal.isFPImm()) { + unsigned BitWidth = + MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); + APInt Val = CstVal.isImm() ? APInt(BitWidth, CstVal.getImm()) + : CstVal.getCImm()->getValue(); + assert(Val.getBitWidth() == BitWidth && + "Value bitwidth doesn't match definition type"); + return Val; + } + return CstVal.getFPImm()->getValueAPF().bitcastToAPInt(); + }; + while ((MI = MRI.getVRegDef(VReg)) && !IsConstantOpcode(MI->getOpcode()) && + LookThroughInstrs) { switch (MI->getOpcode()) { case TargetOpcode::G_TRUNC: case TargetOpcode::G_SEXT: @@ -235,7 +254,7 @@ Optional llvm::getConstantVRegValWithLookThrough( break; case TargetOpcode::COPY: VReg = MI->getOperand(1).getReg(); - if (TargetRegisterInfo::isPhysicalRegister(VReg)) + if (Register::isPhysicalRegister(VReg)) return None; break; case TargetOpcode::G_INTTOPTR: @@ -245,16 +264,13 @@ Optional llvm::getConstantVRegValWithLookThrough( return None; } } - if (!MI || MI->getOpcode() != TargetOpcode::G_CONSTANT || - (!MI->getOperand(1).isImm() && !MI->getOperand(1).isCImm())) + if (!MI || !IsConstantOpcode(MI->getOpcode())) return None; - const MachineOperand &CstVal = MI->getOperand(1); - unsigned BitWidth = MRI.getType(MI->getOperand(0).getReg()).getSizeInBits(); - APInt Val = CstVal.isImm() ? 
APInt(BitWidth, CstVal.getImm()) - : CstVal.getCImm()->getValue(); - assert(Val.getBitWidth() == BitWidth && - "Value bitwidth doesn't match definition type"); + Optional MaybeVal = GetImmediateValue(*MI); + if (!MaybeVal) + return None; + APInt &Val = *MaybeVal; while (!SeenOpcodes.empty()) { std::pair OpcodeAndSize = SeenOpcodes.pop_back_val(); switch (OpcodeAndSize.first) { @@ -291,7 +307,7 @@ llvm::MachineInstr *llvm::getDefIgnoringCopies(Register Reg, if (!DstTy.isValid()) return nullptr; while (DefMI->getOpcode() == TargetOpcode::COPY) { - unsigned SrcReg = DefMI->getOperand(1).getReg(); + Register SrcReg = DefMI->getOperand(1).getReg(); auto SrcTy = MRI.getType(SrcReg); if (!SrcTy.isValid() || SrcTy != DstTy) break; @@ -395,6 +411,40 @@ bool llvm::isKnownNeverNaN(Register Val, const MachineRegisterInfo &MRI, return false; } +Optional llvm::ConstantFoldExtOp(unsigned Opcode, const unsigned Op1, + uint64_t Imm, + const MachineRegisterInfo &MRI) { + auto MaybeOp1Cst = getConstantVRegVal(Op1, MRI); + if (MaybeOp1Cst) { + LLT Ty = MRI.getType(Op1); + APInt C1(Ty.getSizeInBits(), *MaybeOp1Cst, true); + switch (Opcode) { + default: + break; + case TargetOpcode::G_SEXT_INREG: + return C1.trunc(Imm).sext(C1.getBitWidth()); + } + } + return None; +} + void llvm::getSelectionDAGFallbackAnalysisUsage(AnalysisUsage &AU) { AU.addPreserved(); } + +MVT llvm::getMVTForLLT(LLT Ty) { + if (!Ty.isVector()) + return MVT::getIntegerVT(Ty.getSizeInBits()); + + return MVT::getVectorVT( + MVT::getIntegerVT(Ty.getElementType().getSizeInBits()), + Ty.getNumElements()); +} + +LLT llvm::getLLTForMVT(MVT Ty) { + if (!Ty.isVector()) + return LLT::scalar(Ty.getSizeInBits()); + + return LLT::vector(Ty.getVectorNumElements(), + Ty.getVectorElementType().getSizeInBits()); +} diff --git a/lib/CodeGen/GlobalMerge.cpp b/lib/CodeGen/GlobalMerge.cpp index 09201c2e7bae..d4fa45fcb405 100644 --- a/lib/CodeGen/GlobalMerge.cpp +++ b/lib/CodeGen/GlobalMerge.cpp @@ -456,14 +456,14 @@ bool GlobalMerge::doMerge(const SmallVectorImpl &Globals, bool HasExternal = false; StringRef FirstExternalName; - unsigned MaxAlign = 1; + Align MaxAlign; unsigned CurIdx = 0; for (j = i; j != -1; j = GlobalSet.find_next(j)) { Type *Ty = Globals[j]->getValueType(); // Make sure we use the same alignment AsmPrinter would use. - unsigned Align = DL.getPreferredAlignment(Globals[j]); - unsigned Padding = alignTo(MergedSize, Align) - MergedSize; + Align Alignment(DL.getPreferredAlignment(Globals[j])); + unsigned Padding = alignTo(MergedSize, Alignment) - MergedSize; MergedSize += Padding; MergedSize += DL.getTypeAllocSize(Ty); if (MergedSize > MaxOffset) { @@ -478,7 +478,7 @@ bool GlobalMerge::doMerge(const SmallVectorImpl &Globals, Inits.push_back(Globals[j]->getInitializer()); StructIdxs.push_back(CurIdx++); - MaxAlign = std::max(MaxAlign, Align); + MaxAlign = std::max(MaxAlign, Alignment); if (Globals[j]->hasExternalLinkage() && !HasExternal) { HasExternal = true; diff --git a/lib/CodeGen/HardwareLoops.cpp b/lib/CodeGen/HardwareLoops.cpp index 5f57cabbe865..6a0f98d2e2b4 100644 --- a/lib/CodeGen/HardwareLoops.cpp +++ b/lib/CodeGen/HardwareLoops.cpp @@ -183,7 +183,7 @@ bool HardwareLoops::runOnFunction(Function &F) { TTI = &getAnalysis().getTTI(F); DL = &F.getParent()->getDataLayout(); auto *TLIP = getAnalysisIfAvailable(); - LibInfo = TLIP ? &TLIP->getTLI() : nullptr; + LibInfo = TLIP ? 
&TLIP->getTLI(F) : nullptr; PreserveLCSSA = mustPreserveAnalysisID(LCSSAID); AC = &getAnalysis().getAssumptionCache(F); M = F.getParent(); diff --git a/lib/CodeGen/IfConversion.cpp b/lib/CodeGen/IfConversion.cpp index b17a253fe23f..d9caa5660695 100644 --- a/lib/CodeGen/IfConversion.cpp +++ b/lib/CodeGen/IfConversion.cpp @@ -285,14 +285,113 @@ namespace { Prediction); } - bool MeetIfcvtSizeLimit(MachineBasicBlock &TBB, - unsigned TCycle, unsigned TExtra, - MachineBasicBlock &FBB, - unsigned FCycle, unsigned FExtra, - BranchProbability Prediction) const { - return TCycle > 0 && FCycle > 0 && - TII->isProfitableToIfCvt(TBB, TCycle, TExtra, FBB, FCycle, FExtra, - Prediction); + bool MeetIfcvtSizeLimit(BBInfo &TBBInfo, BBInfo &FBBInfo, + MachineBasicBlock &CommBB, unsigned Dups, + BranchProbability Prediction, bool Forked) const { + const MachineFunction &MF = *TBBInfo.BB->getParent(); + if (MF.getFunction().hasMinSize()) { + MachineBasicBlock::iterator TIB = TBBInfo.BB->begin(); + MachineBasicBlock::iterator FIB = FBBInfo.BB->begin(); + MachineBasicBlock::iterator TIE = TBBInfo.BB->end(); + MachineBasicBlock::iterator FIE = FBBInfo.BB->end(); + + unsigned Dups1, Dups2; + if (!CountDuplicatedInstructions(TIB, FIB, TIE, FIE, Dups1, Dups2, + *TBBInfo.BB, *FBBInfo.BB, + /*SkipUnconditionalBranches*/ true)) + llvm_unreachable("should already have been checked by ValidDiamond"); + + unsigned BranchBytes = 0; + unsigned CommonBytes = 0; + + // Count common instructions at the start of the true and false blocks. + for (auto &I : make_range(TBBInfo.BB->begin(), TIB)) { + LLVM_DEBUG(dbgs() << "Common inst: " << I); + CommonBytes += TII->getInstSizeInBytes(I); + } + for (auto &I : make_range(FBBInfo.BB->begin(), FIB)) { + LLVM_DEBUG(dbgs() << "Common inst: " << I); + CommonBytes += TII->getInstSizeInBytes(I); + } + + // Count instructions at the end of the true and false blocks, after + // the ones we plan to predicate. Analyzable branches will be removed + // (unless this is a forked diamond), and all other instructions are + // common between the two blocks. + for (auto &I : make_range(TIE, TBBInfo.BB->end())) { + if (I.isBranch() && TBBInfo.IsBrAnalyzable && !Forked) { + LLVM_DEBUG(dbgs() << "Saving branch: " << I); + BranchBytes += TII->predictBranchSizeForIfCvt(I); + } else { + LLVM_DEBUG(dbgs() << "Common inst: " << I); + CommonBytes += TII->getInstSizeInBytes(I); + } + } + for (auto &I : make_range(FIE, FBBInfo.BB->end())) { + if (I.isBranch() && FBBInfo.IsBrAnalyzable && !Forked) { + LLVM_DEBUG(dbgs() << "Saving branch: " << I); + BranchBytes += TII->predictBranchSizeForIfCvt(I); + } else { + LLVM_DEBUG(dbgs() << "Common inst: " << I); + CommonBytes += TII->getInstSizeInBytes(I); + } + } + for (auto &I : CommBB.terminators()) { + if (I.isBranch()) { + LLVM_DEBUG(dbgs() << "Saving branch: " << I); + BranchBytes += TII->predictBranchSizeForIfCvt(I); + } + } + + // The common instructions in one branch will be eliminated, halving + // their code size. + CommonBytes /= 2; + + // Count the instructions which we need to predicate. + unsigned NumPredicatedInstructions = 0; + for (auto &I : make_range(TIB, TIE)) { + if (!I.isDebugInstr()) { + LLVM_DEBUG(dbgs() << "Predicating: " << I); + NumPredicatedInstructions++; + } + } + for (auto &I : make_range(FIB, FIE)) { + if (!I.isDebugInstr()) { + LLVM_DEBUG(dbgs() << "Predicating: " << I); + NumPredicatedInstructions++; + } + } + + // Even though we're optimising for size at the expense of performance, + // avoid creating really long predicated blocks. 
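
A rough standalone model of the minsize profitability test above: if-conversion deletes the analyzable branches and one copy of the instructions common to both sides, and pays for whatever it costs to predicate the remaining instructions (e.g. Thumb2 IT instructions). The struct and the byte counts in main are illustrative, not measured values:

#include <cassert>

struct DiamondSizes {
  unsigned branch_bytes;          // branches removed by the conversion
  unsigned common_bytes;          // shared instructions, counted in both blocks
  unsigned extra_predicate_bytes; // cost of making the blocks predicated
};

// Convert only when the bytes saved exceed the bytes added.
bool profitable_at_minsize(const DiamondSizes &d) {
  unsigned saved = d.branch_bytes + d.common_bytes / 2;  // one shared copy goes away
  return saved > d.extra_predicate_bytes;
}

int main() {
  assert(profitable_at_minsize({4, 8, 2}));    // save 4 + 4 bytes, pay 2
  assert(!profitable_at_minsize({2, 0, 4}));   // predication costs more than it saves
}
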
+ if (NumPredicatedInstructions > 15) + return false; + + // Some targets (e.g. Thumb2) need to insert extra instructions to + // start predicated blocks. + unsigned ExtraPredicateBytes = TII->extraSizeToPredicateInstructions( + MF, NumPredicatedInstructions); + + LLVM_DEBUG(dbgs() << "MeetIfcvtSizeLimit(BranchBytes=" << BranchBytes + << ", CommonBytes=" << CommonBytes + << ", NumPredicatedInstructions=" + << NumPredicatedInstructions + << ", ExtraPredicateBytes=" << ExtraPredicateBytes + << ")\n"); + return (BranchBytes + CommonBytes) > ExtraPredicateBytes; + } else { + unsigned TCycle = TBBInfo.NonPredSize + TBBInfo.ExtraCost - Dups; + unsigned FCycle = FBBInfo.NonPredSize + FBBInfo.ExtraCost - Dups; + bool Res = TCycle > 0 && FCycle > 0 && + TII->isProfitableToIfCvt( + *TBBInfo.BB, TCycle, TBBInfo.ExtraCost2, *FBBInfo.BB, + FCycle, FBBInfo.ExtraCost2, Prediction); + LLVM_DEBUG(dbgs() << "MeetIfcvtSizeLimit(TCycle=" << TCycle + << ", FCycle=" << FCycle + << ", TExtra=" << TBBInfo.ExtraCost2 << ", FExtra=" + << FBBInfo.ExtraCost2 << ") = " << Res << "\n"); + return Res; + } } /// Returns true if Block ends without a terminator. @@ -356,8 +455,10 @@ bool IfConverter::runOnMachineFunction(MachineFunction &MF) { if (!PreRegAlloc) { // Tail merge tend to expose more if-conversion opportunities. BranchFolder BF(true, false, MBFI, *MBPI); - BFChange = BF.OptimizeFunction(MF, TII, ST.getRegisterInfo(), - getAnalysisIfAvailable()); + auto *MMIWP = getAnalysisIfAvailable(); + BFChange = BF.OptimizeFunction( + MF, TII, ST.getRegisterInfo(), + MMIWP ? &MMIWP->getMMI() : nullptr); } LLVM_DEBUG(dbgs() << "\nIfcvt: function (" << ++FnNum << ") \'" @@ -496,8 +597,10 @@ bool IfConverter::runOnMachineFunction(MachineFunction &MF) { if (MadeChange && IfCvtBranchFold) { BranchFolder BF(false, false, MBFI, *MBPI); - BF.OptimizeFunction(MF, TII, MF.getSubtarget().getRegisterInfo(), - getAnalysisIfAvailable()); + auto *MMIWP = getAnalysisIfAvailable(); + BF.OptimizeFunction( + MF, TII, MF.getSubtarget().getRegisterInfo(), + MMIWP ? 
&MMIWP->getMMI() : nullptr); } MadeChange |= BFChange; @@ -569,6 +672,9 @@ bool IfConverter::ValidTriangle(BBInfo &TrueBBI, BBInfo &FalseBBI, bool FalseBranch, unsigned &Dups, BranchProbability Prediction) const { Dups = 0; + if (TrueBBI.BB == FalseBBI.BB) + return false; + if (TrueBBI.IsBeingAnalyzed || TrueBBI.IsDone) return false; @@ -835,6 +941,8 @@ bool IfConverter::ValidForkedDiamond( TrueBBICalc.BB = TrueBBI.BB; FalseBBICalc.BB = FalseBBI.BB; + TrueBBICalc.IsBrAnalyzable = TrueBBI.IsBrAnalyzable; + FalseBBICalc.IsBrAnalyzable = FalseBBI.IsBrAnalyzable; if (!RescanInstructions(TIB, FIB, TIE, FIE, TrueBBICalc, FalseBBICalc)) return false; @@ -892,6 +1000,8 @@ bool IfConverter::ValidDiamond( TrueBBICalc.BB = TrueBBI.BB; FalseBBICalc.BB = FalseBBI.BB; + TrueBBICalc.IsBrAnalyzable = TrueBBI.IsBrAnalyzable; + FalseBBICalc.IsBrAnalyzable = FalseBBI.IsBrAnalyzable; if (!RescanInstructions(TIB, FIB, TIE, FIE, TrueBBICalc, FalseBBICalc)) return false; // The size is used to decide whether to if-convert, and the shared portions @@ -912,6 +1022,12 @@ void IfConverter::AnalyzeBranches(BBInfo &BBI) { BBI.BrCond.clear(); BBI.IsBrAnalyzable = !TII->analyzeBranch(*BBI.BB, BBI.TrueBB, BBI.FalseBB, BBI.BrCond); + if (!BBI.IsBrAnalyzable) { + BBI.TrueBB = nullptr; + BBI.FalseBB = nullptr; + BBI.BrCond.clear(); + } + SmallVector RevCond(BBI.BrCond.begin(), BBI.BrCond.end()); BBI.IsBrReversible = (RevCond.size() == 0) || !TII->reverseBranchCondition(RevCond); @@ -1173,13 +1289,9 @@ void IfConverter::AnalyzeBlock( if (CanRevCond) { BBInfo TrueBBICalc, FalseBBICalc; - auto feasibleDiamond = [&]() { - bool MeetsSize = MeetIfcvtSizeLimit( - *TrueBBI.BB, (TrueBBICalc.NonPredSize - (Dups + Dups2) + - TrueBBICalc.ExtraCost), TrueBBICalc.ExtraCost2, - *FalseBBI.BB, (FalseBBICalc.NonPredSize - (Dups + Dups2) + - FalseBBICalc.ExtraCost), FalseBBICalc.ExtraCost2, - Prediction); + auto feasibleDiamond = [&](bool Forked) { + bool MeetsSize = MeetIfcvtSizeLimit(TrueBBICalc, FalseBBICalc, *BB, + Dups + Dups2, Prediction, Forked); bool TrueFeasible = FeasibilityAnalysis(TrueBBI, BBI.BrCond, /* IsTriangle */ false, /* RevCond */ false, /* hasCommonTail */ true); @@ -1191,7 +1303,7 @@ void IfConverter::AnalyzeBlock( if (ValidDiamond(TrueBBI, FalseBBI, Dups, Dups2, TrueBBICalc, FalseBBICalc)) { - if (feasibleDiamond()) { + if (feasibleDiamond(false)) { // Diamond: // EBB // / \_ @@ -1200,14 +1312,14 @@ void IfConverter::AnalyzeBlock( // \ / // TailBB // Note TailBB can be empty. - Tokens.push_back(llvm::make_unique( + Tokens.push_back(std::make_unique( BBI, ICDiamond, TNeedSub | FNeedSub, Dups, Dups2, (bool) TrueBBICalc.ClobbersPred, (bool) FalseBBICalc.ClobbersPred)); Enqueued = true; } } else if (ValidForkedDiamond(TrueBBI, FalseBBI, Dups, Dups2, TrueBBICalc, FalseBBICalc)) { - if (feasibleDiamond()) { + if (feasibleDiamond(true)) { // ForkedDiamond: // if TBB and FBB have a common tail that includes their conditional // branch instructions, then we can If Convert this pattern. 
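The new MeetIfcvtSizeLimit overload in the hunks above swaps the cycle-count test for a pure code-size test when the function carries minsize: the bytes freed by deleting branches and by keeping only one copy of the instructions common to both sides must outweigh the bytes the target needs to set up predication, and anything with more than 15 instructions left to predicate is rejected outright. A small stand-alone model of that decision, deliberately free of LLVM types (the struct, field and constant names below are illustrative assumptions, not part of the patch):

  #include <cstdio>

  // Toy model of the minsize if-conversion size check described above.
  // Nothing here uses LLVM; the numbers are illustrative only.
  struct IfcvtSizeModel {
    unsigned BranchBytes;         // bytes of branches that predication removes
    unsigned CommonBytes;         // bytes duplicated on both sides (one copy survives)
    unsigned PredicatedInsts;     // number of instructions that must be predicated
    unsigned ExtraPredicateBytes; // target setup cost, e.g. IT blocks on Thumb2
  };

  static bool profitableAtMinSize(const IfcvtSizeModel &M) {
    // Mirror the cap in the patch: never build very long predicated blocks,
    // even when optimising purely for size.
    if (M.PredicatedInsts > 15)
      return false;
    // The patch halves CommonBytes before comparing, since only one of the
    // two duplicated copies is actually eliminated.
    return (M.BranchBytes + M.CommonBytes / 2) > M.ExtraPredicateBytes;
  }

  int main() {
    IfcvtSizeModel M{/*BranchBytes=*/4, /*CommonBytes=*/8,
                     /*PredicatedInsts=*/3, /*ExtraPredicateBytes=*/2};
    std::printf("if-convert at minsize: %s\n",
                profitableAtMinSize(M) ? "yes" : "no");
    return 0;
  }

The non-minsize path in the same function keeps the previous behaviour, only now computing TCycle/FCycle from the BBInfo pair instead of taking them as parameters.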
@@ -1218,7 +1330,7 @@ void IfConverter::AnalyzeBlock( // / \ / \ // FalseBB TrueBB FalseBB // - Tokens.push_back(llvm::make_unique( + Tokens.push_back(std::make_unique( BBI, ICForkedDiamond, TNeedSub | FNeedSub, Dups, Dups2, (bool) TrueBBICalc.ClobbersPred, (bool) FalseBBICalc.ClobbersPred)); Enqueued = true; @@ -1238,7 +1350,7 @@ void IfConverter::AnalyzeBlock( // | / // FBB Tokens.push_back( - llvm::make_unique(BBI, ICTriangle, TNeedSub, Dups)); + std::make_unique(BBI, ICTriangle, TNeedSub, Dups)); Enqueued = true; } @@ -1247,7 +1359,7 @@ void IfConverter::AnalyzeBlock( TrueBBI.ExtraCost2, Prediction) && FeasibilityAnalysis(TrueBBI, BBI.BrCond, true, true)) { Tokens.push_back( - llvm::make_unique(BBI, ICTriangleRev, TNeedSub, Dups)); + std::make_unique(BBI, ICTriangleRev, TNeedSub, Dups)); Enqueued = true; } @@ -1263,7 +1375,7 @@ void IfConverter::AnalyzeBlock( // | // FBB Tokens.push_back( - llvm::make_unique(BBI, ICSimple, TNeedSub, Dups)); + std::make_unique(BBI, ICSimple, TNeedSub, Dups)); Enqueued = true; } @@ -1275,7 +1387,7 @@ void IfConverter::AnalyzeBlock( FalseBBI.NonPredSize + FalseBBI.ExtraCost, FalseBBI.ExtraCost2, Prediction.getCompl()) && FeasibilityAnalysis(FalseBBI, RevCond, true)) { - Tokens.push_back(llvm::make_unique(BBI, ICTriangleFalse, + Tokens.push_back(std::make_unique(BBI, ICTriangleFalse, FNeedSub, Dups)); Enqueued = true; } @@ -1287,7 +1399,7 @@ void IfConverter::AnalyzeBlock( FalseBBI.ExtraCost2, Prediction.getCompl()) && FeasibilityAnalysis(FalseBBI, RevCond, true, true)) { Tokens.push_back( - llvm::make_unique(BBI, ICTriangleFRev, FNeedSub, Dups)); + std::make_unique(BBI, ICTriangleFRev, FNeedSub, Dups)); Enqueued = true; } @@ -1297,7 +1409,7 @@ void IfConverter::AnalyzeBlock( FalseBBI.ExtraCost2, Prediction.getCompl()) && FeasibilityAnalysis(FalseBBI, RevCond)) { Tokens.push_back( - llvm::make_unique(BBI, ICSimpleFalse, FNeedSub, Dups)); + std::make_unique(BBI, ICSimpleFalse, FNeedSub, Dups)); Enqueued = true; } } @@ -1730,6 +1842,11 @@ bool IfConverter::IfConvertDiamondCommon( ++i; } while (NumDups1 != 0) { + // Since this instruction is going to be deleted, update call + // site info state if the instruction is call instruction. + if (DI2->isCall(MachineInstr::IgnoreBundle)) + MBB2.getParent()->eraseCallSiteInfo(&*DI2); + ++DI2; if (DI2 == MBB2.end()) break; @@ -1758,14 +1875,27 @@ bool IfConverter::IfConvertDiamondCommon( if (!BBI1->IsBrAnalyzable) verifySameBranchInstructions(&MBB1, &MBB2); #endif - BBI1->NonPredSize -= TII->removeBranch(*BBI1->BB); - // Remove duplicated instructions. + // Remove duplicated instructions from the tail of MBB1: any branch + // instructions, and the common instructions counted by NumDups2. DI1 = MBB1.end(); + while (DI1 != MBB1.begin()) { + MachineBasicBlock::iterator Prev = std::prev(DI1); + if (!Prev->isBranch() && !Prev->isDebugInstr()) + break; + DI1 = Prev; + } for (unsigned i = 0; i != NumDups2; ) { // NumDups2 only counted non-dbg_value instructions, so this won't // run off the head of the list. assert(DI1 != MBB1.begin()); + --DI1; + + // Since this instruction is going to be deleted, update call + // site info state if the instruction is call instruction. 
+ if (DI1->isCall(MachineInstr::IgnoreBundle)) + MBB1.getParent()->eraseCallSiteInfo(&*DI1); + // skip dbg_value instructions if (!DI1->isDebugInstr()) ++i; @@ -1815,7 +1945,7 @@ bool IfConverter::IfConvertDiamondCommon( for (const MachineOperand &MO : FI.operands()) { if (!MO.isReg()) continue; - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); if (!Reg) continue; if (MO.isDef()) { @@ -1983,7 +2113,7 @@ static bool MaySpeculate(const MachineInstr &MI, for (const MachineOperand &MO : MI.operands()) { if (!MO.isReg()) continue; - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); if (!Reg) continue; if (MO.isDef() && !LaterRedefs.count(Reg)) @@ -2050,6 +2180,10 @@ void IfConverter::CopyAndPredicateBlock(BBInfo &ToBBI, BBInfo &FromBBI, break; MachineInstr *MI = MF.CloneMachineInstr(&I); + // Make a copy of the call site info. + if (MI->isCall(MachineInstr::IgnoreBundle)) + MF.copyCallSiteInfo(&I,MI); + ToBBI.BB->insert(ToBBI.BB->end(), MI); ToBBI.NonPredSize++; unsigned ExtraPredCost = TII->getPredicationCost(I); diff --git a/lib/CodeGen/ImplicitNullChecks.cpp b/lib/CodeGen/ImplicitNullChecks.cpp index 1e82ea659617..b7dcaec90106 100644 --- a/lib/CodeGen/ImplicitNullChecks.cpp +++ b/lib/CodeGen/ImplicitNullChecks.cpp @@ -278,12 +278,12 @@ bool ImplicitNullChecks::canReorder(const MachineInstr *A, if (!(MOA.isReg() && MOA.getReg())) continue; - unsigned RegA = MOA.getReg(); + Register RegA = MOA.getReg(); for (auto MOB : B->operands()) { if (!(MOB.isReg() && MOB.getReg())) continue; - unsigned RegB = MOB.getReg(); + Register RegB = MOB.getReg(); if (TRI->regsOverlap(RegA, RegB) && (MOA.isDef() || MOB.isDef())) return false; @@ -517,7 +517,7 @@ bool ImplicitNullChecks::analyzeBlockForNullChecks( // // we must ensure that there are no instructions between the 'test' and // conditional jump that modify %rax. 
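The ImplicitNullChecks changes in this file are mechanical (unsigned becomes Register), but the invariant the surrounding comments describe is worth spelling out: two instructions may only be reordered, and a load may only be hoisted to the faulting position, if they do not touch a common register with at least one of the accesses being a write. A tiny stand-alone model of that check (plain C++; Operand and Instr are illustrative stand-ins, and plain register equality stands in for TRI->regsOverlap, which also handles aliasing sub- and super-registers):

  #include <vector>

  // Illustrative stand-in for a machine operand: a register number
  // (0 means no register) and whether the operand writes it.
  struct Operand { unsigned Reg; bool IsDef; };
  using Instr = std::vector<Operand>;

  // Reordering A and B is unsafe as soon as they share a register and either
  // access is a definition.
  static bool canReorder(const Instr &A, const Instr &B) {
    for (const Operand &OA : A) {
      if (!OA.Reg)
        continue;
      for (const Operand &OB : B) {
        if (!OB.Reg)
          continue;
        if (OA.Reg == OB.Reg && (OA.IsDef || OB.IsDef))
          return false;
      }
    }
    return true;
  }

  int main() {
    Instr DefR1 = {{1, true}};   // writes r1
    Instr UseR1 = {{1, false}};  // reads r1
    return canReorder(DefR1, UseR1) ? 1 : 0; // exits 0: may not be reordered
  }

Under this rule, the load of the tested pointer can be moved to the faulting slot only when nothing between the compare and the conditional branch redefines the pointer register, which is exactly the %rax constraint the comment above spells out.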
- const unsigned PointerReg = MBP.LHS.getReg(); + const Register PointerReg = MBP.LHS.getReg(); assert(MBP.ConditionDef->getParent() == &MBB && "Should be in basic block"); @@ -689,7 +689,7 @@ void ImplicitNullChecks::rewriteNullChecks( for (const MachineOperand &MO : FaultingInstr->operands()) { if (!MO.isReg() || !MO.isDef()) continue; - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); if (!Reg || MBB->isLiveIn(Reg)) continue; MBB->addLiveIn(Reg); diff --git a/lib/CodeGen/InlineSpiller.cpp b/lib/CodeGen/InlineSpiller.cpp index 41ae8061a917..2408f18678e4 100644 --- a/lib/CodeGen/InlineSpiller.cpp +++ b/lib/CodeGen/InlineSpiller.cpp @@ -11,7 +11,6 @@ // //===----------------------------------------------------------------------===// -#include "LiveRangeCalc.h" #include "Spiller.h" #include "SplitKit.h" #include "llvm/ADT/ArrayRef.h" @@ -26,6 +25,7 @@ #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/CodeGen/LiveInterval.h" #include "llvm/CodeGen/LiveIntervals.h" +#include "llvm/CodeGen/LiveRangeCalc.h" #include "llvm/CodeGen/LiveRangeEdit.h" #include "llvm/CodeGen/LiveStacks.h" #include "llvm/CodeGen/MachineBasicBlock.h" @@ -346,8 +346,7 @@ void InlineSpiller::collectRegsToSpill() { } bool InlineSpiller::isSibling(unsigned Reg) { - return TargetRegisterInfo::isVirtualRegister(Reg) && - VRM.getOriginal(Reg) == Original; + return Register::isVirtualRegister(Reg) && VRM.getOriginal(Reg) == Original; } /// It is beneficial to spill to earlier place in the same BB in case @@ -377,7 +376,7 @@ bool InlineSpiller::hoistSpillInsideBB(LiveInterval &SpillLI, assert(VNI && VNI->def == Idx.getRegSlot() && "Not defined by copy"); #endif - unsigned SrcReg = CopyMI.getOperand(1).getReg(); + Register SrcReg = CopyMI.getOperand(1).getReg(); LiveInterval &SrcLI = LIS.getInterval(SrcReg); VNInfo *SrcVNI = SrcLI.getVNInfoAt(Idx); LiveQueryResult SrcQ = SrcLI.Query(Idx); @@ -845,9 +844,8 @@ foldMemoryOperand(ArrayRef> Ops, for (MIBundleOperands MO(*MI); MO.isValid(); ++MO) { if (!MO->isReg()) continue; - unsigned Reg = MO->getReg(); - if (!Reg || TargetRegisterInfo::isVirtualRegister(Reg) || - MRI.isReserved(Reg)) { + Register Reg = MO->getReg(); + if (!Reg || Register::isVirtualRegister(Reg) || MRI.isReserved(Reg)) { continue; } // Skip non-Defs, including undef uses and internal reads. @@ -869,7 +867,7 @@ foldMemoryOperand(ArrayRef> Ops, --NumSpills; LIS.ReplaceMachineInstrInMaps(*MI, *FoldMI); if (MI->isCall()) - MI->getMF()->updateCallSiteInfo(MI, FoldMI); + MI->getMF()->moveCallSiteInfo(MI, FoldMI); MI->eraseFromParent(); // Insert any new instructions other than FoldMI into the LIS maps. @@ -1111,8 +1109,8 @@ void InlineSpiller::spillAll() { void InlineSpiller::spill(LiveRangeEdit &edit) { ++NumSpilledRanges; Edit = &edit; - assert(!TargetRegisterInfo::isStackSlot(edit.getReg()) - && "Trying to spill a stack slot."); + assert(!Register::isStackSlot(edit.getReg()) && + "Trying to spill a stack slot."); // Share a stack slot among all descendants of Original. Original = VRM.getOriginal(edit.getReg()); StackSlot = VRM.getStackSlot(Original); @@ -1147,7 +1145,7 @@ void HoistSpillHelper::addToMergeableSpills(MachineInstr &Spill, int StackSlot, // save a copy of LiveInterval in StackSlotToOrigLI because the original // LiveInterval may be cleared after all its references are spilled. 
if (StackSlotToOrigLI.find(StackSlot) == StackSlotToOrigLI.end()) { - auto LI = llvm::make_unique(OrigLI.reg, OrigLI.weight); + auto LI = std::make_unique(OrigLI.reg, OrigLI.weight); LI->assign(OrigLI, Allocator); StackSlotToOrigLI[StackSlot] = std::move(LI); } @@ -1459,7 +1457,7 @@ void HoistSpillHelper::hoistAllSpills() { LiveRangeEdit Edit(nullptr, NewVRegs, MF, LIS, &VRM, this); for (unsigned i = 0, e = MRI.getNumVirtRegs(); i != e; ++i) { - unsigned Reg = TargetRegisterInfo::index2VirtReg(i); + unsigned Reg = Register::index2VirtReg(i); unsigned Original = VRM.getPreSplitReg(Reg); if (!MRI.def_empty(Reg)) Virt2SiblingsMap[Original].insert(Reg); diff --git a/lib/CodeGen/InterleavedLoadCombinePass.cpp b/lib/CodeGen/InterleavedLoadCombinePass.cpp index 9525da849e2a..770c4952d169 100644 --- a/lib/CodeGen/InterleavedLoadCombinePass.cpp +++ b/lib/CodeGen/InterleavedLoadCombinePass.cpp @@ -940,8 +940,8 @@ public: /// \param V input value /// \param Result result polynomial static void computePolynomial(Value &V, Polynomial &Result) { - if (isa(&V)) - computePolynomialBinOp(*dyn_cast(&V), Result); + if (auto *BO = dyn_cast(&V)) + computePolynomialBinOp(*BO, Result); else Result = Polynomial(&V); } diff --git a/lib/CodeGen/LLVMTargetMachine.cpp b/lib/CodeGen/LLVMTargetMachine.cpp index 886ae7e94adb..1c362aec6e67 100644 --- a/lib/CodeGen/LLVMTargetMachine.cpp +++ b/lib/CodeGen/LLVMTargetMachine.cpp @@ -96,14 +96,15 @@ LLVMTargetMachine::getTargetTransformInfo(const Function &F) { /// addPassesToX helper drives creation and initialization of TargetPassConfig. static TargetPassConfig * addPassesToGenerateCode(LLVMTargetMachine &TM, PassManagerBase &PM, - bool DisableVerify, MachineModuleInfo &MMI) { + bool DisableVerify, + MachineModuleInfoWrapperPass &MMIWP) { // Targets may override createPassConfig to provide a target-specific // subclass. TargetPassConfig *PassConfig = TM.createPassConfig(PM); // Set PassConfig options provided by TargetMachine. PassConfig->setDisableVerify(DisableVerify); PM.add(PassConfig); - PM.add(&MMI); + PM.add(&MMIWP); if (PassConfig->addISelPasses()) return nullptr; @@ -139,7 +140,7 @@ bool LLVMTargetMachine::addAsmPrinter(PassManagerBase &PM, std::unique_ptr MAB( getTarget().createMCAsmBackend(STI, MRI, Options.MCOptions)); - auto FOut = llvm::make_unique(Out); + auto FOut = std::make_unique(Out); MCStreamer *S = getTarget().createAsmStreamer( Context, std::move(FOut), Options.MCOptions.AsmVerbose, Options.MCOptions.MCUseDwarfDirectory, InstPrinter, std::move(MCE), @@ -186,17 +187,15 @@ bool LLVMTargetMachine::addAsmPrinter(PassManagerBase &PM, return false; } -bool LLVMTargetMachine::addPassesToEmitFile(PassManagerBase &PM, - raw_pwrite_stream &Out, - raw_pwrite_stream *DwoOut, - CodeGenFileType FileType, - bool DisableVerify, - MachineModuleInfo *MMI) { +bool LLVMTargetMachine::addPassesToEmitFile( + PassManagerBase &PM, raw_pwrite_stream &Out, raw_pwrite_stream *DwoOut, + CodeGenFileType FileType, bool DisableVerify, + MachineModuleInfoWrapperPass *MMIWP) { // Add common CodeGen passes. 
- if (!MMI) - MMI = new MachineModuleInfo(this); + if (!MMIWP) + MMIWP = new MachineModuleInfoWrapperPass(this); TargetPassConfig *PassConfig = - addPassesToGenerateCode(*this, PM, DisableVerify, *MMI); + addPassesToGenerateCode(*this, PM, DisableVerify, *MMIWP); if (!PassConfig) return true; @@ -206,12 +205,13 @@ bool LLVMTargetMachine::addPassesToEmitFile(PassManagerBase &PM, // testing to be meaningful, we need to ensure that the symbols created // are MCSymbolXCOFF variants, which requires that // the TargetLoweringObjectFile instance has been initialized. - MCContext &Ctx = MMI->getContext(); + MCContext &Ctx = MMIWP->getMMI().getContext(); const_cast(*this->getObjFileLowering()) .Initialize(Ctx, *this); } PM.add(createPrintMIRPass(Out)); - } else if (addAsmPrinter(PM, Out, DwoOut, FileType, MMI->getContext())) + } else if (addAsmPrinter(PM, Out, DwoOut, FileType, + MMIWP->getMMI().getContext())) return true; PM.add(createFreeMachineFunctionPass()); @@ -227,15 +227,15 @@ bool LLVMTargetMachine::addPassesToEmitMC(PassManagerBase &PM, MCContext *&Ctx, raw_pwrite_stream &Out, bool DisableVerify) { // Add common CodeGen passes. - MachineModuleInfo *MMI = new MachineModuleInfo(this); + MachineModuleInfoWrapperPass *MMIWP = new MachineModuleInfoWrapperPass(this); TargetPassConfig *PassConfig = - addPassesToGenerateCode(*this, PM, DisableVerify, *MMI); + addPassesToGenerateCode(*this, PM, DisableVerify, *MMIWP); if (!PassConfig) return true; assert(TargetPassConfig::willCompleteCodeGenPipeline() && "Cannot emit MC with limited codegen pipeline"); - Ctx = &MMI->getContext(); + Ctx = &MMIWP->getMMI().getContext(); if (Options.MCOptions.MCSaveTempLabels) Ctx->setAllowTemporaryLabels(false); diff --git a/lib/CodeGen/LazyMachineBlockFrequencyInfo.cpp b/lib/CodeGen/LazyMachineBlockFrequencyInfo.cpp index 200ac0ba15bf..cef5085ae079 100644 --- a/lib/CodeGen/LazyMachineBlockFrequencyInfo.cpp +++ b/lib/CodeGen/LazyMachineBlockFrequencyInfo.cpp @@ -73,18 +73,18 @@ LazyMachineBlockFrequencyInfoPass::calculateIfNotAvailable() const { if (!MDT) { LLVM_DEBUG(dbgs() << "Building DominatorTree on the fly\n"); - OwnedMDT = make_unique(); + OwnedMDT = std::make_unique(); OwnedMDT->getBase().recalculate(*MF); MDT = OwnedMDT.get(); } // Generate LoopInfo from it. - OwnedMLI = make_unique(); + OwnedMLI = std::make_unique(); OwnedMLI->getBase().analyze(MDT->getBase()); MLI = OwnedMLI.get(); } - OwnedMBFI = make_unique(); + OwnedMBFI = std::make_unique(); OwnedMBFI->calculate(*MF, MBPI, *MLI); return *OwnedMBFI.get(); } diff --git a/lib/CodeGen/LexicalScopes.cpp b/lib/CodeGen/LexicalScopes.cpp index 503821537ed9..ac3ef0e709f3 100644 --- a/lib/CodeGen/LexicalScopes.cpp +++ b/lib/CodeGen/LexicalScopes.cpp @@ -21,6 +21,7 @@ #include "llvm/CodeGen/MachineInstr.h" #include "llvm/Config/llvm-config.h" #include "llvm/IR/DebugInfoMetadata.h" +#include "llvm/IR/Function.h" #include "llvm/IR/Metadata.h" #include "llvm/Support/Casting.h" #include "llvm/Support/Compiler.h" diff --git a/lib/CodeGen/LiveDebugValues.cpp b/lib/CodeGen/LiveDebugValues.cpp index a669e64692b9..f1b237d83e8c 100644 --- a/lib/CodeGen/LiveDebugValues.cpp +++ b/lib/CodeGen/LiveDebugValues.cpp @@ -7,14 +7,23 @@ //===----------------------------------------------------------------------===// /// /// This pass implements a data flow analysis that propagates debug location -/// information by inserting additional DBG_VALUE instructions into the machine -/// instruction stream. 
The pass internally builds debug location liveness -/// ranges to determine the points where additional DBG_VALUEs need to be -/// inserted. +/// information by inserting additional DBG_VALUE insts into the machine +/// instruction stream. Before running, each DBG_VALUE inst corresponds to a +/// source assignment of a variable. Afterwards, a DBG_VALUE inst specifies a +/// variable location for the current basic block (see SourceLevelDebugging.rst). /// /// This is a separate pass from DbgValueHistoryCalculator to facilitate /// testing and improve modularity. /// +/// Each variable location is represented by a VarLoc object that identifies the +/// source variable, its current machine-location, and the DBG_VALUE inst that +/// specifies the location. Each VarLoc is indexed in the (function-scope) +/// VarLocMap, giving each VarLoc a unique index. Rather than operate directly +/// on machine locations, the dataflow analysis in this pass identifies +/// locations by their index in the VarLocMap, meaning all the variable +/// locations in a block can be described by a sparse vector of VarLocMap +/// indexes. +/// //===----------------------------------------------------------------------===// #include "llvm/ADT/DenseMap.h" @@ -68,6 +77,7 @@ using namespace llvm; #define DEBUG_TYPE "livedebugvalues" STATISTIC(NumInserted, "Number of DBG_VALUE instructions inserted"); +STATISTIC(NumRemoved, "Number of DBG_VALUE instructions removed"); // If @MI is a DBG_VALUE with debug value described by a defined // register, returns the number of this register. In the other case, returns 0. @@ -179,8 +189,16 @@ private: } }; + /// Identity of the variable at this location. const DebugVariable Var; - const MachineInstr &MI; ///< Only used for cloning a new DBG_VALUE. + + /// The expression applied to this location. + const DIExpression *Expr; + + /// DBG_VALUE to clone var/expr information from if this location + /// is moved. + const MachineInstr &MI; + mutable UserValueScopes UVS; enum VarLocKind { InvalidKind = 0, @@ -201,9 +219,9 @@ private: const ConstantInt *CImm; } Loc; - VarLoc(const MachineInstr &MI, LexicalScopes &LS, - VarLocKind K = InvalidKind) - : Var(MI), MI(MI), UVS(MI.getDebugLoc(), LS){ + VarLoc(const MachineInstr &MI, LexicalScopes &LS) + : Var(MI), Expr(MI.getDebugExpression()), MI(MI), + UVS(MI.getDebugLoc(), LS) { static_assert((sizeof(Loc) == sizeof(uint64_t)), "hash does not cover all members of Loc"); assert(MI.isDebugValue() && "not a DBG_VALUE"); @@ -225,17 +243,78 @@ private: "entry values must be register locations"); } - /// The constructor for spill locations. - VarLoc(const MachineInstr &MI, unsigned SpillBase, int SpillOffset, - LexicalScopes &LS) - : Var(MI), MI(MI), UVS(MI.getDebugLoc(), LS) { - assert(MI.isDebugValue() && "not a DBG_VALUE"); - assert(MI.getNumOperands() == 4 && "malformed DBG_VALUE"); - Kind = SpillLocKind; - Loc.SpillLocation = {SpillBase, SpillOffset}; + /// Take the variable and machine-location in DBG_VALUE MI, and build an + /// entry location using the given expression. + static VarLoc CreateEntryLoc(const MachineInstr &MI, LexicalScopes &LS, + const DIExpression *EntryExpr) { + VarLoc VL(MI, LS); + VL.Kind = EntryValueKind; + VL.Expr = EntryExpr; + return VL; + } + + /// Copy the register location in DBG_VALUE MI, updating the register to + /// be NewReg. 
+ static VarLoc CreateCopyLoc(const MachineInstr &MI, LexicalScopes &LS, + unsigned NewReg) { + VarLoc VL(MI, LS); + assert(VL.Kind == RegisterKind); + VL.Loc.RegNo = NewReg; + return VL; + } + + /// Take the variable described by DBG_VALUE MI, and create a VarLoc + /// locating it in the specified spill location. + static VarLoc CreateSpillLoc(const MachineInstr &MI, unsigned SpillBase, + int SpillOffset, LexicalScopes &LS) { + VarLoc VL(MI, LS); + assert(VL.Kind == RegisterKind); + VL.Kind = SpillLocKind; + VL.Loc.SpillLocation = {SpillBase, SpillOffset}; + return VL; } - // Is the Loc field a constant or constant object? + /// Create a DBG_VALUE representing this VarLoc in the given function. + /// Copies variable-specific information such as DILocalVariable and + /// inlining information from the original DBG_VALUE instruction, which may + /// have been several transfers ago. + MachineInstr *BuildDbgValue(MachineFunction &MF) const { + const DebugLoc &DbgLoc = MI.getDebugLoc(); + bool Indirect = MI.isIndirectDebugValue(); + const auto &IID = MI.getDesc(); + const DILocalVariable *Var = MI.getDebugVariable(); + const DIExpression *DIExpr = MI.getDebugExpression(); + + switch (Kind) { + case EntryValueKind: + // An entry value is a register location -- but with an updated + // expression. + return BuildMI(MF, DbgLoc, IID, Indirect, Loc.RegNo, Var, Expr); + case RegisterKind: + // Register locations are like the source DBG_VALUE, but with the + // register number from this VarLoc. + return BuildMI(MF, DbgLoc, IID, Indirect, Loc.RegNo, Var, DIExpr); + case SpillLocKind: { + // Spills are indirect DBG_VALUEs, with a base register and offset. + // Use the original DBG_VALUEs expression to build the spilt location + // on top of. FIXME: spill locations created before this pass runs + // are not recognized, and not handled here. + auto *SpillExpr = DIExpression::prepend( + DIExpr, DIExpression::ApplyOffset, Loc.SpillLocation.SpillOffset); + unsigned Base = Loc.SpillLocation.SpillBase; + return BuildMI(MF, DbgLoc, IID, true, Base, Var, SpillExpr); + } + case ImmediateKind: { + MachineOperand MO = MI.getOperand(0); + return BuildMI(MF, DbgLoc, IID, Indirect, MO, Var, DIExpr); + } + case InvalidKind: + llvm_unreachable("Tried to produce DBG_VALUE for invalid VarLoc"); + } + llvm_unreachable("Unrecognized LiveDebugValues.VarLoc.Kind enum"); + } + + /// Is the Loc field a constant or constant object? bool isConstant() const { return Kind == ImmediateKind; } /// If this variable is described by a register, return it, @@ -251,18 +330,42 @@ private: bool dominates(MachineBasicBlock &MBB) const { return UVS.dominates(&MBB); } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) - LLVM_DUMP_METHOD void dump() const { MI.dump(); } + // TRI can be null. + void dump(const TargetRegisterInfo *TRI, raw_ostream &Out = dbgs()) const { + dbgs() << "VarLoc("; + switch (Kind) { + case RegisterKind: + case EntryValueKind: + dbgs() << printReg(Loc.RegNo, TRI); + break; + case SpillLocKind: + dbgs() << printReg(Loc.SpillLocation.SpillBase, TRI); + dbgs() << "[" << Loc.SpillLocation.SpillOffset << "]"; + break; + case ImmediateKind: + dbgs() << Loc.Immediate; + break; + case InvalidKind: + llvm_unreachable("Invalid VarLoc in dump method"); + } + + dbgs() << ", \"" << Var.getVar()->getName() << "\", " << *Expr << ", "; + if (Var.getInlinedAt()) + dbgs() << "!" 
<< Var.getInlinedAt()->getMetadataID() << ")\n"; + else + dbgs() << "(null))\n"; + } #endif bool operator==(const VarLoc &Other) const { return Kind == Other.Kind && Var == Other.Var && - Loc.Hash == Other.Loc.Hash; + Loc.Hash == Other.Loc.Hash && Expr == Other.Expr; } /// This operator guarantees that VarLocs are sorted by Variable first. bool operator<(const VarLoc &Other) const { - return std::tie(Var, Kind, Loc.Hash) < - std::tie(Other.Var, Other.Kind, Other.Loc.Hash); + return std::tie(Var, Kind, Loc.Hash, Expr) < + std::tie(Other.Var, Other.Kind, Other.Loc.Hash, Other.Expr); } }; @@ -271,8 +374,8 @@ private: using VarLocSet = SparseBitVector<>; using VarLocInMBB = SmallDenseMap; struct TransferDebugPair { - MachineInstr *TransferInst; - MachineInstr *DebugInst; + MachineInstr *TransferInst; /// Instruction where this transfer occurs. + unsigned LocationID; /// Location number for the transfer dest. }; using TransferMap = SmallVector; @@ -320,6 +423,14 @@ private: Vars.insert({Var, VarLocID}); } + /// Insert a set of ranges. + void insertFromLocSet(const VarLocSet &ToLoad, const VarLocMap &Map) { + for (unsigned Id : ToLoad) { + const VarLoc &Var = Map[Id]; + insert(Id, Var.Var); + } + } + /// Empty the set. void clear() { VarLocs.clear(); @@ -333,8 +444,18 @@ private: } }; - bool isSpillInstruction(const MachineInstr &MI, MachineFunction *MF, - unsigned &Reg); + /// Tests whether this instruction is a spill to a stack location. + bool isSpillInstruction(const MachineInstr &MI, MachineFunction *MF); + + /// Decide if @MI is a spill instruction and return true if it is. We use 2 + /// criteria to make this decision: + /// - Is this instruction a store to a spill slot? + /// - Is there a register operand that is both used and killed? + /// TODO: Store optimization can fold spills into other stores (including + /// other spills). We do not handle this yet (more than one memory operand). + bool isLocationSpill(const MachineInstr &MI, MachineFunction *MF, + unsigned &Reg); + /// If a given instruction is identified as a spill, return the spill location /// and set \p Reg to the spilled register. Optional isRestoreInstruction(const MachineInstr &MI, @@ -361,13 +482,13 @@ private: void transferRegisterDef(MachineInstr &MI, OpenRangesSet &OpenRanges, VarLocMap &VarLocIDs, TransferMap &Transfers, DebugParamMap &DebugEntryVals); - bool transferTerminatorInst(MachineInstr &MI, OpenRangesSet &OpenRanges, - VarLocInMBB &OutLocs, const VarLocMap &VarLocIDs); + bool transferTerminator(MachineBasicBlock *MBB, OpenRangesSet &OpenRanges, + VarLocInMBB &OutLocs, const VarLocMap &VarLocIDs); - bool process(MachineInstr &MI, OpenRangesSet &OpenRanges, + void process(MachineInstr &MI, OpenRangesSet &OpenRanges, VarLocInMBB &OutLocs, VarLocMap &VarLocIDs, TransferMap &Transfers, DebugParamMap &DebugEntryVals, - bool transferChanges, OverlapMap &OverlapFragments, + OverlapMap &OverlapFragments, VarToFragments &SeenFragments); void accumulateFragmentMap(MachineInstr &MI, VarToFragments &SeenFragments, @@ -376,7 +497,12 @@ private: bool join(MachineBasicBlock &MBB, VarLocInMBB &OutLocs, VarLocInMBB &InLocs, const VarLocMap &VarLocIDs, SmallPtrSet &Visited, - SmallPtrSetImpl &ArtificialBlocks); + SmallPtrSetImpl &ArtificialBlocks, + VarLocInMBB &PendingInLocs); + + /// Create DBG_VALUE insts for inlocs that have been propagated but + /// had their instruction creation deferred. 
+ void flushPendingLocs(VarLocInMBB &PendingInLocs, VarLocMap &VarLocIDs); bool ExtendRanges(MachineFunction &MF); @@ -518,7 +644,7 @@ void LiveDebugValues::printVarLocInMBB(const MachineFunction &MF, const VarLoc &VL = VarLocIDs[VLL]; Out << " Var: " << VL.Var.getVar()->getName(); Out << " MI: "; - VL.dump(); + VL.dump(TRI, Out); } } Out << "\n"; @@ -567,11 +693,7 @@ void LiveDebugValues::transferDebugValue(const MachineInstr &MI, ID = VarLocIDs.insert(VL); OpenRanges.insert(ID, VL.Var); } else if (MI.hasOneMemOperand()) { - // It's a stack spill -- fetch spill base and offset. - VarLoc::SpillLoc SpillLocation = extractSpillBaseRegAndOffset(MI); - VarLoc VL(MI, SpillLocation.SpillBase, SpillLocation.SpillOffset, LS); - ID = VarLocIDs.insert(VL); - OpenRanges.insert(ID, VL.Var); + llvm_unreachable("DBG_VALUE with mem operand encountered after regalloc?"); } else { // This must be an undefined location. We should leave OpenRanges closed. assert(MI.getOperand(0).isReg() && MI.getOperand(0).getReg() == 0 && @@ -585,7 +707,6 @@ void LiveDebugValues::emitEntryValues(MachineInstr &MI, TransferMap &Transfers, DebugParamMap &DebugEntryVals, SparseBitVector<> &KillSet) { - MachineFunction *MF = MI.getParent()->getParent(); for (unsigned ID : KillSet) { if (!VarLocIDs[ID].Var.getVar()->isParameter()) continue; @@ -600,20 +721,12 @@ void LiveDebugValues::emitEntryValues(MachineInstr &MI, auto ParamDebugInstr = DebugEntryVals[CurrDebugInstr->getDebugVariable()]; DIExpression *NewExpr = DIExpression::prepend( ParamDebugInstr->getDebugExpression(), DIExpression::EntryValue); - MachineInstr *EntryValDbgMI = - BuildMI(*MF, ParamDebugInstr->getDebugLoc(), ParamDebugInstr->getDesc(), - ParamDebugInstr->isIndirectDebugValue(), - ParamDebugInstr->getOperand(0).getReg(), - ParamDebugInstr->getDebugVariable(), NewExpr); - - if (ParamDebugInstr->isIndirectDebugValue()) - EntryValDbgMI->getOperand(1).setImm( - ParamDebugInstr->getOperand(1).getImm()); - - Transfers.push_back({&MI, EntryValDbgMI}); - VarLoc VL(*EntryValDbgMI, LS); - unsigned EntryValLocID = VarLocIDs.insert(VL); - OpenRanges.insert(EntryValLocID, VL.Var); + + VarLoc EntryLoc = VarLoc::CreateEntryLoc(*ParamDebugInstr, LS, NewExpr); + + unsigned EntryValLocID = VarLocIDs.insert(EntryLoc); + Transfers.push_back({&MI, EntryValLocID}); + OpenRanges.insert(EntryValLocID, EntryLoc.Var); } } @@ -627,21 +740,19 @@ void LiveDebugValues::insertTransferDebugPair( VarLocMap &VarLocIDs, unsigned OldVarID, TransferKind Kind, unsigned NewReg) { const MachineInstr *DebugInstr = &VarLocIDs[OldVarID].MI; - MachineFunction *MF = MI.getParent()->getParent(); - MachineInstr *NewDebugInstr; auto ProcessVarLoc = [&MI, &OpenRanges, &Transfers, &DebugInstr, - &VarLocIDs](VarLoc &VL, MachineInstr *NewDebugInstr) { + &VarLocIDs](VarLoc &VL) { unsigned LocId = VarLocIDs.insert(VL); // Close this variable's previous location range. DebugVariable V(*DebugInstr); OpenRanges.erase(V); + // Record the new location as an open range, and a postponed transfer + // inserting a DBG_VALUE for this location. OpenRanges.insert(LocId, VL.Var); - // The newly created DBG_VALUE instruction NewDebugInstr must be inserted - // after MI. Keep track of the pairing. - TransferDebugPair MIP = {&MI, NewDebugInstr}; + TransferDebugPair MIP = {&MI, LocId}; Transfers.push_back(MIP); }; @@ -653,37 +764,25 @@ void LiveDebugValues::insertTransferDebugPair( "No register supplied when handling a copy of a debug value"); // Create a DBG_VALUE instruction to describe the Var in its new // register location. 
- NewDebugInstr = BuildMI( - *MF, DebugInstr->getDebugLoc(), DebugInstr->getDesc(), - DebugInstr->isIndirectDebugValue(), NewReg, - DebugInstr->getDebugVariable(), DebugInstr->getDebugExpression()); - if (DebugInstr->isIndirectDebugValue()) - NewDebugInstr->getOperand(1).setImm(DebugInstr->getOperand(1).getImm()); - VarLoc VL(*NewDebugInstr, LS); - ProcessVarLoc(VL, NewDebugInstr); - LLVM_DEBUG(dbgs() << "Creating DBG_VALUE inst for register copy: "; - NewDebugInstr->print(dbgs(), /*IsStandalone*/false, - /*SkipOpers*/false, /*SkipDebugLoc*/false, - /*AddNewLine*/true, TII)); + VarLoc VL = VarLoc::CreateCopyLoc(*DebugInstr, LS, NewReg); + ProcessVarLoc(VL); + LLVM_DEBUG({ + dbgs() << "Creating VarLoc for register copy:"; + VL.dump(TRI); + }); return; } case TransferKind::TransferSpill: { // Create a DBG_VALUE instruction to describe the Var in its spilled // location. VarLoc::SpillLoc SpillLocation = extractSpillBaseRegAndOffset(MI); - auto *SpillExpr = DIExpression::prepend(DebugInstr->getDebugExpression(), - DIExpression::ApplyOffset, - SpillLocation.SpillOffset); - NewDebugInstr = BuildMI( - *MF, DebugInstr->getDebugLoc(), DebugInstr->getDesc(), true, - SpillLocation.SpillBase, DebugInstr->getDebugVariable(), SpillExpr); - VarLoc VL(*NewDebugInstr, SpillLocation.SpillBase, - SpillLocation.SpillOffset, LS); - ProcessVarLoc(VL, NewDebugInstr); - LLVM_DEBUG(dbgs() << "Creating DBG_VALUE inst for spill: "; - NewDebugInstr->print(dbgs(), /*IsStandalone*/false, - /*SkipOpers*/false, /*SkipDebugLoc*/false, - /*AddNewLine*/true, TII)); + VarLoc VL = VarLoc::CreateSpillLoc(*DebugInstr, SpillLocation.SpillBase, + SpillLocation.SpillOffset, LS); + ProcessVarLoc(VL); + LLVM_DEBUG({ + dbgs() << "Creating VarLoc for spill:"; + VL.dump(TRI); + }); return; } case TransferKind::TransferRestore: { @@ -691,15 +790,14 @@ void LiveDebugValues::insertTransferDebugPair( "No register supplied when handling a restore of a debug value"); MachineFunction *MF = MI.getMF(); DIBuilder DIB(*const_cast(MF->getFunction()).getParent()); - NewDebugInstr = - BuildMI(*MF, DebugInstr->getDebugLoc(), DebugInstr->getDesc(), false, - NewReg, DebugInstr->getDebugVariable(), DIB.createExpression()); - VarLoc VL(*NewDebugInstr, LS); - ProcessVarLoc(VL, NewDebugInstr); - LLVM_DEBUG(dbgs() << "Creating DBG_VALUE inst for register restore: "; - NewDebugInstr->print(dbgs(), /*IsStandalone*/false, - /*SkipOpers*/false, /*SkipDebugLoc*/false, - /*AddNewLine*/true, TII)); + // DebugInstr refers to the pre-spill location, therefore we can reuse + // its expression. + VarLoc VL = VarLoc::CreateCopyLoc(*DebugInstr, LS, NewReg); + ProcessVarLoc(VL); + LLVM_DEBUG({ + dbgs() << "Creating VarLoc for restore:"; + VL.dump(TRI); + }); return; } } @@ -719,7 +817,7 @@ void LiveDebugValues::transferRegisterDef( // instructions never clobber SP, because some backends (e.g., AArch64) // never list SP in the regmask. if (MO.isReg() && MO.isDef() && MO.getReg() && - TRI->isPhysicalRegister(MO.getReg()) && + Register::isPhysicalRegister(MO.getReg()) && !(MI.isCall() && MO.getReg() == SP)) { // Remove ranges of all aliased registers. for (MCRegAliasIterator RAI(MO.getReg(), TRI, true); RAI.isValid(); ++RAI) @@ -748,16 +846,8 @@ void LiveDebugValues::transferRegisterDef( } } -/// Decide if @MI is a spill instruction and return true if it is. We use 2 -/// criteria to make this decision: -/// - Is this instruction a store to a spill slot? -/// - Is there a register operand that is both used and killed? 
-/// TODO: Store optimization can fold spills into other stores (including -/// other spills). We do not handle this yet (more than one memory operand). bool LiveDebugValues::isSpillInstruction(const MachineInstr &MI, - MachineFunction *MF, unsigned &Reg) { - SmallVector Accesses; - + MachineFunction *MF) { // TODO: Handle multiple stores folded into one. if (!MI.hasOneMemOperand()) return false; @@ -766,6 +856,14 @@ bool LiveDebugValues::isSpillInstruction(const MachineInstr &MI, return false; // This is not a spill instruction, since no valid size was // returned from either function. + return true; +} + +bool LiveDebugValues::isLocationSpill(const MachineInstr &MI, + MachineFunction *MF, unsigned &Reg) { + if (!isSpillInstruction(MI, MF)) + return false; + auto isKilledReg = [&](const MachineOperand MO, unsigned &Reg) { if (!MO.isReg() || !MO.isUse()) { Reg = 0; @@ -834,7 +932,37 @@ void LiveDebugValues::transferSpillOrRestoreInst(MachineInstr &MI, LLVM_DEBUG(dbgs() << "Examining instruction: "; MI.dump();); - if (isSpillInstruction(MI, MF, Reg)) { + // First, if there are any DBG_VALUEs pointing at a spill slot that is + // written to, then close the variable location. The value in memory + // will have changed. + VarLocSet KillSet; + if (isSpillInstruction(MI, MF)) { + Loc = extractSpillBaseRegAndOffset(MI); + for (unsigned ID : OpenRanges.getVarLocs()) { + const VarLoc &VL = VarLocIDs[ID]; + if (VL.Kind == VarLoc::SpillLocKind && VL.Loc.SpillLocation == *Loc) { + // This location is overwritten by the current instruction -- terminate + // the open range, and insert an explicit DBG_VALUE $noreg. + // + // Doing this at a later stage would require re-interpreting all + // DBG_VALUes and DIExpressions to identify whether they point at + // memory, and then analysing all memory writes to see if they + // overwrite that memory, which is expensive. + // + // At this stage, we already know which DBG_VALUEs are for spills and + // where they are located; it's best to fix handle overwrites now. + KillSet.set(ID); + VarLoc UndefVL = VarLoc::CreateCopyLoc(VL.MI, LS, 0); + unsigned UndefLocID = VarLocIDs.insert(UndefVL); + Transfers.push_back({&MI, UndefLocID}); + } + } + OpenRanges.erase(KillSet, VarLocIDs); + } + + // Try to recognise spill and restore instructions that may create a new + // variable location. + if (isLocationSpill(MI, MF, Reg)) { TKind = TransferKind::TransferSpill; LLVM_DEBUG(dbgs() << "Recognized as spill: "; MI.dump();); LLVM_DEBUG(dbgs() << "Register: " << Reg << " " << printReg(Reg, TRI) @@ -854,6 +982,7 @@ void LiveDebugValues::transferSpillOrRestoreInst(MachineInstr &MI, LLVM_DEBUG(dbgs() << "Spilling Register " << printReg(Reg, TRI) << '(' << VarLocIDs[ID].Var.getVar()->getName() << ")\n"); } else if (TKind == TransferKind::TransferRestore && + VarLocIDs[ID].Kind == VarLoc::SpillLocKind && VarLocIDs[ID].Loc.SpillLocation == *Loc) { LLVM_DEBUG(dbgs() << "Restoring Register " << printReg(Reg, TRI) << '(' << VarLocIDs[ID].Var.getVar()->getName() << ")\n"); @@ -885,8 +1014,8 @@ void LiveDebugValues::transferRegisterCopy(MachineInstr &MI, return false; }; - unsigned SrcReg = SrcRegOp->getReg(); - unsigned DestReg = DestRegOp->getReg(); + Register SrcReg = SrcRegOp->getReg(); + Register DestReg = DestRegOp->getReg(); // We want to recognize instructions where destination register is callee // saved register. 
If register that could be clobbered by the call is @@ -906,26 +1035,20 @@ void LiveDebugValues::transferRegisterCopy(MachineInstr &MI, } /// Terminate all open ranges at the end of the current basic block. -bool LiveDebugValues::transferTerminatorInst(MachineInstr &MI, - OpenRangesSet &OpenRanges, - VarLocInMBB &OutLocs, - const VarLocMap &VarLocIDs) { +bool LiveDebugValues::transferTerminator(MachineBasicBlock *CurMBB, + OpenRangesSet &OpenRanges, + VarLocInMBB &OutLocs, + const VarLocMap &VarLocIDs) { bool Changed = false; - const MachineBasicBlock *CurMBB = MI.getParent(); - if (!(MI.isTerminator() || (&MI == &CurMBB->back()))) - return false; - - if (OpenRanges.empty()) - return false; LLVM_DEBUG(for (unsigned ID : OpenRanges.getVarLocs()) { // Copy OpenRanges to OutLocs, if not already present. dbgs() << "Add to OutLocs in MBB #" << CurMBB->getNumber() << ": "; - VarLocIDs[ID].dump(); + VarLocIDs[ID].dump(TRI); }); VarLocSet &VLS = OutLocs[CurMBB]; - Changed = VLS |= OpenRanges.getVarLocs(); + Changed = VLS != OpenRanges.getVarLocs(); // New OutLocs set may be different due to spill, restore or register // copy instruction processing. if (Changed) @@ -995,26 +1118,17 @@ void LiveDebugValues::accumulateFragmentMap(MachineInstr &MI, } /// This routine creates OpenRanges and OutLocs. -bool LiveDebugValues::process(MachineInstr &MI, OpenRangesSet &OpenRanges, +void LiveDebugValues::process(MachineInstr &MI, OpenRangesSet &OpenRanges, VarLocInMBB &OutLocs, VarLocMap &VarLocIDs, - TransferMap &Transfers, DebugParamMap &DebugEntryVals, - bool transferChanges, + TransferMap &Transfers, + DebugParamMap &DebugEntryVals, OverlapMap &OverlapFragments, VarToFragments &SeenFragments) { - bool Changed = false; transferDebugValue(MI, OpenRanges, VarLocIDs); transferRegisterDef(MI, OpenRanges, VarLocIDs, Transfers, DebugEntryVals); - if (transferChanges) { - transferRegisterCopy(MI, OpenRanges, VarLocIDs, Transfers); - transferSpillOrRestoreInst(MI, OpenRanges, VarLocIDs, Transfers); - } else { - // Build up a map of overlapping fragments on the first run through. - if (MI.isDebugValue()) - accumulateFragmentMap(MI, SeenFragments, OverlapFragments); - } - Changed = transferTerminatorInst(MI, OpenRanges, OutLocs, VarLocIDs); - return Changed; + transferRegisterCopy(MI, OpenRanges, VarLocIDs, Transfers); + transferSpillOrRestoreInst(MI, OpenRanges, VarLocIDs, Transfers); } /// This routine joins the analysis results of all incoming edges in @MBB by @@ -1024,7 +1138,8 @@ bool LiveDebugValues::join( MachineBasicBlock &MBB, VarLocInMBB &OutLocs, VarLocInMBB &InLocs, const VarLocMap &VarLocIDs, SmallPtrSet &Visited, - SmallPtrSetImpl &ArtificialBlocks) { + SmallPtrSetImpl &ArtificialBlocks, + VarLocInMBB &PendingInLocs) { LLVM_DEBUG(dbgs() << "join MBB: " << MBB.getNumber() << "\n"); bool Changed = false; @@ -1034,9 +1149,11 @@ bool LiveDebugValues::join( // can be joined. int NumVisited = 0; for (auto p : MBB.predecessors()) { - // Ignore unvisited predecessor blocks. As we are processing - // the blocks in reverse post-order any unvisited block can - // be considered to not remove any incoming values. + // Ignore backedges if we have not visited the predecessor yet. As the + // predecessor hasn't yet had locations propagated into it, most locations + // will not yet be valid, so treat them as all being uninitialized and + // potentially valid. If a location guessed to be correct here is + // invalidated later, we will remove it when we revisit this block. 
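The comment above captures the optimistic join this version of LiveDebugValues performs: a block's live-in variable locations are the intersection of the live-out sets of the predecessors visited so far, back edges from not-yet-visited blocks are skipped rather than treated as killing everything, and locations a later revisit proves wrong are pulled back out of both the in-set and the pending set. A compact stand-alone model of the join itself, using std::set where the pass uses SparseBitVector (the function name is an assumption):

  #include <cstddef>
  #include <set>
  #include <vector>

  using VarLocSet = std::set<unsigned>; // stand-in for a SparseBitVector of VarLoc IDs

  // Optimistic join: intersect the out-sets of the predecessors that have
  // already been visited; unvisited predecessors (back edges) place no
  // constraint yet and are accounted for when the block is revisited.
  static VarLocSet joinInLocs(const std::vector<const VarLocSet *> &VisitedPredOuts) {
    if (VisitedPredOuts.empty())
      return {};
    VarLocSet Result = *VisitedPredOuts.front();
    for (std::size_t I = 1; I < VisitedPredOuts.size(); ++I) {
      VarLocSet Keep;
      for (unsigned ID : Result)
        if (VisitedPredOuts[I]->count(ID))
          Keep.insert(ID);
      Result = std::move(Keep);
    }
    return Result;
  }

  int main() {
    VarLocSet A = {1, 2, 3}, B = {2, 3, 4};
    VarLocSet In = joinInLocs({&A, &B}); // {2, 3}
    return In.size() == 2 ? 0 : 1;
  }

IDs that drop out of the result on a revisit correspond to the new Removed/NumRemoved handling in the hunk above; the IDs that survive are what flushPendingLocs later materialises as DBG_VALUEs at the block entry.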
if (!Visited.count(p)) { LLVM_DEBUG(dbgs() << " ignoring unvisited pred MBB: " << p->getNumber() << "\n"); @@ -1086,44 +1203,59 @@ bool LiveDebugValues::join( // is the entry block which has no predecessor. assert((NumVisited || MBB.pred_empty()) && "Should have processed at least one predecessor"); - if (InLocsT.empty()) - return false; VarLocSet &ILS = InLocs[&MBB]; + VarLocSet &Pending = PendingInLocs[&MBB]; - // Insert DBG_VALUE instructions, if not already inserted. + // New locations will have DBG_VALUE insts inserted at the start of the + // block, after location propagation has finished. Record the insertions + // that we need to perform in the Pending set. VarLocSet Diff = InLocsT; Diff.intersectWithComplement(ILS); for (auto ID : Diff) { - // This VarLoc is not found in InLocs i.e. it is not yet inserted. So, a - // new range is started for the var from the mbb's beginning by inserting - // a new DBG_VALUE. process() will end this range however appropriate. - const VarLoc &DiffIt = VarLocIDs[ID]; - const MachineInstr *DebugInstr = &DiffIt.MI; - MachineInstr *MI = nullptr; - if (DiffIt.isConstant()) { - MachineOperand MO(DebugInstr->getOperand(0)); - MI = BuildMI(MBB, MBB.instr_begin(), DebugInstr->getDebugLoc(), - DebugInstr->getDesc(), false, MO, - DebugInstr->getDebugVariable(), - DebugInstr->getDebugExpression()); - } else { - MI = BuildMI(MBB, MBB.instr_begin(), DebugInstr->getDebugLoc(), - DebugInstr->getDesc(), DebugInstr->isIndirectDebugValue(), - DebugInstr->getOperand(0).getReg(), - DebugInstr->getDebugVariable(), - DebugInstr->getDebugExpression()); - if (DebugInstr->isIndirectDebugValue()) - MI->getOperand(1).setImm(DebugInstr->getOperand(1).getImm()); - } - LLVM_DEBUG(dbgs() << "Inserted: "; MI->dump();); + Pending.set(ID); ILS.set(ID); ++NumInserted; Changed = true; } + + // We may have lost locations by learning about a predecessor that either + // loses or moves a variable. Find any locations in ILS that are not in the + // new in-locations, and delete those. + VarLocSet Removed = ILS; + Removed.intersectWithComplement(InLocsT); + for (auto ID : Removed) { + Pending.reset(ID); + ILS.reset(ID); + ++NumRemoved; + Changed = true; + } + return Changed; } +void LiveDebugValues::flushPendingLocs(VarLocInMBB &PendingInLocs, + VarLocMap &VarLocIDs) { + // PendingInLocs records all locations propagated into blocks, which have + // not had DBG_VALUE insts created. Go through and create those insts now. + for (auto &Iter : PendingInLocs) { + // Map is keyed on a constant pointer, unwrap it so we can insert insts. + auto &MBB = const_cast(*Iter.first); + VarLocSet &Pending = Iter.second; + + for (unsigned ID : Pending) { + // The ID location is live-in to MBB -- work out what kind of machine + // location it is and create a DBG_VALUE. + const VarLoc &DiffIt = VarLocIDs[ID]; + MachineInstr *MI = DiffIt.BuildDbgValue(*MBB.getParent()); + MBB.insert(MBB.instr_begin(), MI); + + (void)MI; + LLVM_DEBUG(dbgs() << "Inserted: "; MI->dump();); + } + } +} + /// Calculate the liveness information for the given machine function and /// extend ranges across basic blocks. bool LiveDebugValues::ExtendRanges(MachineFunction &MF) { @@ -1140,6 +1272,9 @@ bool LiveDebugValues::ExtendRanges(MachineFunction &MF) { VarLocInMBB OutLocs; // Ranges that exist beyond bb. VarLocInMBB InLocs; // Ranges that are incoming after joining. TransferMap Transfers; // DBG_VALUEs associated with spills. 
+ VarLocInMBB PendingInLocs; // Ranges that are incoming after joining, but + // that we have deferred creating DBG_VALUE insts + // for immediately. VarToFragments SeenFragments; @@ -1156,8 +1291,6 @@ bool LiveDebugValues::ExtendRanges(MachineFunction &MF) { std::greater> Pending; - enum : bool { dontTransferChanges = false, transferChanges = true }; - // Besides parameter's modification, check whether a DBG_VALUE is inlined // in order to deduce whether the variable that it tracks comes from // a different function. If that is the case we can't track its entry value. @@ -1169,7 +1302,7 @@ bool LiveDebugValues::ExtendRanges(MachineFunction &MF) { const TargetLowering *TLI = MF.getSubtarget().getTargetLowering(); unsigned SP = TLI->getStackPointerRegisterToSaveRestore(); - unsigned FP = TRI->getFrameRegister(MF); + Register FP = TRI->getFrameRegister(MF); auto IsRegOtherThanSPAndFP = [&](const MachineOperand &Op) -> bool { return Op.isReg() && Op.getReg() != SP && Op.getReg() != FP; }; @@ -1195,23 +1328,14 @@ bool LiveDebugValues::ExtendRanges(MachineFunction &MF) { !MI.getDebugExpression()->isFragment()) DebugEntryVals[MI.getDebugVariable()] = &MI; - // Initialize every mbb with OutLocs. - // We are not looking at any spill instructions during the initial pass - // over the BBs. The LiveDebugVariables pass has already created DBG_VALUE - // instructions for spills of registers that are known to be user variables - // within the BB in which the spill occurs. + // Initialize per-block structures and scan for fragment overlaps. for (auto &MBB : MF) { + PendingInLocs[&MBB] = VarLocSet(); + for (auto &MI : MBB) { - process(MI, OpenRanges, OutLocs, VarLocIDs, Transfers, DebugEntryVals, - dontTransferChanges, OverlapFragments, SeenFragments); - } - // Add any entry DBG_VALUE instructions necessitated by parameter - // clobbering. - for (auto &TR : Transfers) { - MBB.insertAfter(MachineBasicBlock::iterator(*TR.TransferInst), - TR.DebugInst); + if (MI.isDebugValue()) + accumulateFragmentMap(MI, SeenFragments, OverlapFragments); } - Transfers.clear(); } auto hasNonArtificialLocation = [](const MachineInstr &MI) -> bool { @@ -1248,26 +1372,21 @@ bool LiveDebugValues::ExtendRanges(MachineFunction &MF) { while (!Worklist.empty()) { MachineBasicBlock *MBB = OrderToBB[Worklist.top()]; Worklist.pop(); - MBBJoined = - join(*MBB, OutLocs, InLocs, VarLocIDs, Visited, ArtificialBlocks); - Visited.insert(MBB); + MBBJoined = join(*MBB, OutLocs, InLocs, VarLocIDs, Visited, + ArtificialBlocks, PendingInLocs); + MBBJoined |= Visited.insert(MBB).second; if (MBBJoined) { MBBJoined = false; Changed = true; // Now that we have started to extend ranges across BBs we need to // examine spill instructions to see whether they spill registers that // correspond to user variables. + // First load any pending inlocs. + OpenRanges.insertFromLocSet(PendingInLocs[MBB], VarLocIDs); for (auto &MI : *MBB) - OLChanged |= process(MI, OpenRanges, OutLocs, VarLocIDs, Transfers, - DebugEntryVals, transferChanges, OverlapFragments, - SeenFragments); - - // Add any DBG_VALUE instructions necessitated by spills. 
- for (auto &TR : Transfers) - MBB->insertAfter(MachineBasicBlock::iterator(*TR.TransferInst), - TR.DebugInst); - Transfers.clear(); + DebugEntryVals, OverlapFragments, SeenFragments); + OLChanged |= transferTerminator(MBB, OpenRanges, OutLocs, VarLocIDs); LLVM_DEBUG(printVarLocInMBB(MF, OutLocs, VarLocIDs, "OutLocs after propagating", dbgs())); @@ -1289,6 +1408,19 @@ bool LiveDebugValues::ExtendRanges(MachineFunction &MF) { assert(Pending.empty() && "Pending should be empty"); } + // Add any DBG_VALUE instructions created by location transfers. + for (auto &TR : Transfers) { + MachineBasicBlock *MBB = TR.TransferInst->getParent(); + const VarLoc &VL = VarLocIDs[TR.LocationID]; + MachineInstr *MI = VL.BuildDbgValue(MF); + MBB->insertAfterBundle(TR.TransferInst->getIterator(), MI); + } + Transfers.clear(); + + // Deferred inlocs will not have had any DBG_VALUE insts created; do + // that now. + flushPendingLocs(PendingInLocs, VarLocIDs); + LLVM_DEBUG(printVarLocInMBB(MF, OutLocs, VarLocIDs, "Final OutLocs", dbgs())); LLVM_DEBUG(printVarLocInMBB(MF, InLocs, VarLocIDs, "Final InLocs", dbgs())); return Changed; @@ -1308,7 +1440,7 @@ bool LiveDebugValues::runOnMachineFunction(MachineFunction &MF) { TII = MF.getSubtarget().getInstrInfo(); TFI = MF.getSubtarget().getFrameLowering(); TFI->determineCalleeSaves(MF, CalleeSavedRegs, - make_unique().get()); + std::make_unique().get()); LS.initialize(MF); bool Changed = ExtendRanges(MF); diff --git a/lib/CodeGen/LiveDebugVariables.cpp b/lib/CodeGen/LiveDebugVariables.cpp index 656ec7d4bdfd..2dd462fc72b3 100644 --- a/lib/CodeGen/LiveDebugVariables.cpp +++ b/lib/CodeGen/LiveDebugVariables.cpp @@ -99,28 +99,27 @@ enum : unsigned { UndefLocNo = ~0U }; /// usage of the location. class DbgValueLocation { public: - DbgValueLocation(unsigned LocNo, bool WasIndirect) - : LocNo(LocNo), WasIndirect(WasIndirect) { + DbgValueLocation(unsigned LocNo) + : LocNo(LocNo) { static_assert(sizeof(*this) == sizeof(unsigned), "bad bitfield packing"); assert(locNo() == LocNo && "location truncation"); } - DbgValueLocation() : LocNo(0), WasIndirect(0) {} + DbgValueLocation() : LocNo(0) {} unsigned locNo() const { // Fix up the undef location number, which gets truncated. return LocNo == INT_MAX ? UndefLocNo : LocNo; } - bool wasIndirect() const { return WasIndirect; } bool isUndef() const { return locNo() == UndefLocNo; } DbgValueLocation changeLocNo(unsigned NewLocNo) const { - return DbgValueLocation(NewLocNo, WasIndirect); + return DbgValueLocation(NewLocNo); } friend inline bool operator==(const DbgValueLocation &LHS, const DbgValueLocation &RHS) { - return LHS.LocNo == RHS.LocNo && LHS.WasIndirect == RHS.WasIndirect; + return LHS.LocNo == RHS.LocNo; } friend inline bool operator!=(const DbgValueLocation &LHS, @@ -129,8 +128,7 @@ public: } private: - unsigned LocNo : 31; - unsigned WasIndirect : 1; + unsigned LocNo; }; /// Map of where a user value is live, and its location. @@ -144,22 +142,51 @@ namespace { class LDVImpl; +/// A UserValue is uniquely identified by the source variable it refers to +/// (Variable), the expression describing how to get the value (Expression) and +/// the specific usage (InlinedAt). InlinedAt differentiates both between +/// inline and non-inline functions, and multiple inlined instances in the same +/// scope. FIXME: The only part of the Expression which matters for UserValue +/// identification is the fragment part. +class UserValueIdentity { +private: + /// The debug info variable we are part of. 
+ const DILocalVariable *Variable; + /// Any complex address expression. + const DIExpression *Expression; + /// Function usage identification. + const DILocation *InlinedAt; + +public: + UserValueIdentity(const DILocalVariable *Var, const DIExpression *Expr, + const DILocation *IA) + : Variable(Var), Expression(Expr), InlinedAt(IA) {} + + bool match(const DILocalVariable *Var, const DIExpression *Expr, + const DILocation *IA) const { + // FIXME: The fragment should be part of the identity, but not + // other things in the expression like stack values. + return Var == Variable && Expr == Expression && IA == InlinedAt; + } + + bool match(const UserValueIdentity &Other) const { + return match(Other.Variable, Other.Expression, Other.InlinedAt); + } + + unsigned hash_value() const { + return hash_combine(Variable, Expression, InlinedAt); + } +}; + /// A user value is a part of a debug info user variable. /// /// A DBG_VALUE instruction notes that (a sub-register of) a virtual register /// holds part of a user variable. The part is identified by a byte offset. -/// -/// UserValues are grouped into equivalence classes for easier searching. Two -/// user values are related if they refer to the same variable, or if they are -/// held by the same virtual register. The equivalence class is the transitive -/// closure of that relation. class UserValue { const DILocalVariable *Variable; ///< The debug info variable we are part of. const DIExpression *Expression; ///< Any complex address expression. DebugLoc dl; ///< The debug location for the variable. This is ///< used by dwarf writer to find lexical scope. - UserValue *leader; ///< Equivalence class leader. - UserValue *next = nullptr; ///< Next value in equivalence class, or null. /// Numbered locations referenced by locmap. SmallVector locations; @@ -180,49 +207,15 @@ class UserValue { LiveIntervals &LIS); public: + UserValue(const UserValue &) = delete; + /// Create a new UserValue. UserValue(const DILocalVariable *var, const DIExpression *expr, DebugLoc L, LocMap::Allocator &alloc) - : Variable(var), Expression(expr), dl(std::move(L)), leader(this), - locInts(alloc) {} - - /// Get the leader of this value's equivalence class. - UserValue *getLeader() { - UserValue *l = leader; - while (l != l->leader) - l = l->leader; - return leader = l; - } + : Variable(var), Expression(expr), dl(std::move(L)), locInts(alloc) {} - /// Return the next UserValue in the equivalence class. - UserValue *getNext() const { return next; } - - /// Does this UserValue match the parameters? - bool match(const DILocalVariable *Var, const DIExpression *Expr, - const DILocation *IA) const { - // FIXME: The fragment should be part of the equivalence class, but not - // other things in the expression like stack values. - return Var == Variable && Expr == Expression && dl->getInlinedAt() == IA; - } - - /// Merge equivalence classes. - static UserValue *merge(UserValue *L1, UserValue *L2) { - L2 = L2->getLeader(); - if (!L1) - return L2; - L1 = L1->getLeader(); - if (L1 == L2) - return L1; - // Splice L2 before L1's members. - UserValue *End = L2; - while (End->next) { - End->leader = L1; - End = End->next; - } - End->leader = L1; - End->next = L1->next; - L1->next = L2; - return L1; + UserValueIdentity getId() { + return UserValueIdentity(Variable, Expression, dl->getInlinedAt()); } /// Return the location number that matches Loc. @@ -261,8 +254,8 @@ public: void mapVirtRegs(LDVImpl *LDV); /// Add a definition point to this value. 
- void addDef(SlotIndex Idx, const MachineOperand &LocMO, bool IsIndirect) { - DbgValueLocation Loc(getLocationNo(LocMO), IsIndirect); + void addDef(SlotIndex Idx, const MachineOperand &LocMO) { + DbgValueLocation Loc(getLocationNo(LocMO)); // Add a singular (Idx,Idx) -> Loc mapping. LocMap::iterator I = locInts.find(Idx); if (!I.valid() || I.start() != Idx) @@ -297,11 +290,10 @@ public: /// /// \param LI Scan for copies of the value in LI->reg. /// \param LocNo Location number of LI->reg. - /// \param WasIndirect Indicates if the original use of LI->reg was indirect /// \param Kills Points where the range of LocNo could be extended. /// \param [in,out] NewDefs Append (Idx, LocNo) of inserted defs here. void addDefsFromCopies( - LiveInterval *LI, unsigned LocNo, bool WasIndirect, + LiveInterval *LI, unsigned LocNo, const SmallVectorImpl &Kills, SmallVectorImpl> &NewDefs, MachineRegisterInfo &MRI, LiveIntervals &LIS); @@ -335,7 +327,29 @@ public: void print(raw_ostream &, const TargetRegisterInfo *); }; +} // namespace +namespace llvm { +template <> struct DenseMapInfo { + static UserValueIdentity getEmptyKey() { + auto Key = DenseMapInfo::getEmptyKey(); + return UserValueIdentity(Key, nullptr, nullptr); + } + static UserValueIdentity getTombstoneKey() { + auto Key = DenseMapInfo::getTombstoneKey(); + return UserValueIdentity(Key, nullptr, nullptr); + } + static unsigned getHashValue(const UserValueIdentity &Val) { + return Val.hash_value(); + } + static bool isEqual(const UserValueIdentity &LHS, + const UserValueIdentity &RHS) { + return LHS.match(RHS); + } +}; +} // namespace llvm + +namespace { /// A user label is a part of a debug info user label. class UserLabel { const DILabel *Label; ///< The debug info label we are part of. @@ -387,20 +401,20 @@ class LDVImpl { /// All allocated UserLabel instances. SmallVector, 2> userLabels; - /// Map virtual register to eq class leader. - using VRMap = DenseMap; - VRMap virtRegToEqClass; + /// Map virtual register to UserValues which use it. + using VRMap = DenseMap>; + VRMap VirtRegToUserVals; - /// Map user variable to eq class leader. - using UVMap = DenseMap; - UVMap userVarMap; + /// Map unique UserValue identity to UserValue. + using UVMap = DenseMap; + UVMap UserVarMap; /// Find or create a UserValue. UserValue *getUserValue(const DILocalVariable *Var, const DIExpression *Expr, const DebugLoc &DL); - /// Find the EC leader for VirtReg or null. - UserValue *lookupVirtReg(unsigned VirtReg); + /// Find the UserValues for VirtReg or null. + SmallVectorImpl *lookupVirtReg(unsigned VirtReg); /// Add DBG_VALUE instruction to our maps. /// @@ -440,8 +454,8 @@ public: MF = nullptr; userValues.clear(); userLabels.clear(); - virtRegToEqClass.clear(); - userVarMap.clear(); + VirtRegToUserVals.clear(); + UserVarMap.clear(); // Make sure we call emitDebugValues if the machine function was modified. assert((!ModifiedMF || EmitDone) && "Dbg values are not emitted in LDV"); @@ -449,8 +463,8 @@ public: ModifiedMF = false; } - /// Map virtual register to an equivalence class. - void mapVirtReg(unsigned VirtReg, UserValue *EC); + /// Map virtual register to a UserValue. + void mapVirtReg(unsigned VirtReg, UserValue *UV); /// Replace all references to OldReg with NewRegs. 
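// Illustrative sketch (not part of the patch): the LiveDebugVariables hunks
// here replace the old leader/next equivalence-class bookkeeping with a plain
// map keyed by a (Variable, Expression, InlinedAt) identity. The find-or-create
// shape that LDVImpl::getUserValue takes further down looks like the following
// self-contained example; VarKey, Value, findOrCreate and std::unordered_map
// are placeholders, while the real code uses UserValueIdentity, UserValue and
// llvm::DenseMap with the DenseMapInfo specialization added in this hunk.
#include <memory>
#include <unordered_map>
#include <vector>

struct VarKey {
  const void *Var;
  const void *Expr;
  const void *InlinedAt;
  bool operator==(const VarKey &O) const {
    return Var == O.Var && Expr == O.Expr && InlinedAt == O.InlinedAt;
  }
};

struct VarKeyHash {
  std::size_t operator()(const VarKey &K) const {
    // Combine the three pointer hashes; the real code uses llvm::hash_combine.
    std::hash<const void *> H;
    return H(K.Var) ^ (H(K.Expr) * 31) ^ (H(K.InlinedAt) * 131);
  }
};

struct Value {}; // stands in for UserValue

// One hash lookup, then create on a miss -- the same shape as looking up
// UserVarMap[Ident] and allocating a new UserValue when the slot is empty.
Value *findOrCreate(std::unordered_map<VarKey, Value *, VarKeyHash> &Map,
                    std::vector<std::unique_ptr<Value>> &Pool,
                    const VarKey &K) {
  Value *&Slot = Map[K];
  if (Slot)
    return Slot;
  Pool.push_back(std::make_unique<Value>());
  return Slot = Pool.back().get();
}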
void splitRegister(unsigned OldReg, ArrayRef NewRegs); @@ -521,8 +535,6 @@ void UserValue::print(raw_ostream &OS, const TargetRegisterInfo *TRI) { OS << "undef"; else { OS << I.value().locNo(); - if (I.value().wasIndirect()) - OS << " ind"; } } for (unsigned i = 0, e = locations.size(); i != e; ++i) { @@ -554,37 +566,33 @@ void LDVImpl::print(raw_ostream &OS) { void UserValue::mapVirtRegs(LDVImpl *LDV) { for (unsigned i = 0, e = locations.size(); i != e; ++i) if (locations[i].isReg() && - TargetRegisterInfo::isVirtualRegister(locations[i].getReg())) + Register::isVirtualRegister(locations[i].getReg())) LDV->mapVirtReg(locations[i].getReg(), this); } UserValue *LDVImpl::getUserValue(const DILocalVariable *Var, const DIExpression *Expr, const DebugLoc &DL) { - UserValue *&Leader = userVarMap[Var]; - if (Leader) { - UserValue *UV = Leader->getLeader(); - Leader = UV; - for (; UV; UV = UV->getNext()) - if (UV->match(Var, Expr, DL->getInlinedAt())) - return UV; - } + auto Ident = UserValueIdentity(Var, Expr, DL->getInlinedAt()); + UserValue *&UVEntry = UserVarMap[Ident]; - userValues.push_back( - llvm::make_unique(Var, Expr, DL, allocator)); - UserValue *UV = userValues.back().get(); - Leader = UserValue::merge(Leader, UV); - return UV; + if (UVEntry) + return UVEntry; + + userValues.push_back(std::make_unique(Var, Expr, DL, allocator)); + return UVEntry = userValues.back().get(); } -void LDVImpl::mapVirtReg(unsigned VirtReg, UserValue *EC) { - assert(TargetRegisterInfo::isVirtualRegister(VirtReg) && "Only map VirtRegs"); - UserValue *&Leader = virtRegToEqClass[VirtReg]; - Leader = UserValue::merge(Leader, EC); +void LDVImpl::mapVirtReg(unsigned VirtReg, UserValue *UV) { + assert(Register::isVirtualRegister(VirtReg) && "Only map VirtRegs"); + assert(UserVarMap.find(UV->getId()) != UserVarMap.end() && + "UserValue should exist in UserVarMap"); + VirtRegToUserVals[VirtReg].push_back(UV); } -UserValue *LDVImpl::lookupVirtReg(unsigned VirtReg) { - if (UserValue *UV = virtRegToEqClass.lookup(VirtReg)) - return UV->getLeader(); +SmallVectorImpl *LDVImpl::lookupVirtReg(unsigned VirtReg) { + VRMap::iterator Itr = VirtRegToUserVals.find(VirtReg); + if (Itr != VirtRegToUserVals.end()) + return &Itr->getSecond(); return nullptr; } @@ -606,8 +614,8 @@ bool LDVImpl::handleDebugValue(MachineInstr &MI, SlotIndex Idx) { // could be removed or replaced by asserts. bool Discard = false; if (MI.getOperand(0).isReg() && - TargetRegisterInfo::isVirtualRegister(MI.getOperand(0).getReg())) { - const unsigned Reg = MI.getOperand(0).getReg(); + Register::isVirtualRegister(MI.getOperand(0).getReg())) { + const Register Reg = MI.getOperand(0).getReg(); if (!LIS->hasInterval(Reg)) { // The DBG_VALUE is described by a virtual register that does not have a // live interval. Discard the DBG_VALUE. @@ -631,19 +639,18 @@ bool LDVImpl::handleDebugValue(MachineInstr &MI, SlotIndex Idx) { } // Get or create the UserValue for (variable,offset) here. 
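// Illustrative sketch (not part of the patch): with equivalence classes gone,
// one virtual register can be referenced by several independent UserValues, so
// the old virtRegToEqClass leader map becomes the register -> vector-of-users
// map (VirtRegToUserVals) declared above. A simplified stand-in with standard
// containers; Value again plays the role of UserValue, and mapReg/lookupReg
// are made-up names for illustration only.
#include <unordered_map>
#include <vector>

struct Value {};
using UserMap = std::unordered_map<unsigned, std::vector<Value *>>;

void mapReg(UserMap &M, unsigned VirtReg, Value *UV) {
  M[VirtReg].push_back(UV); // append; no leader merging any more
}

const std::vector<Value *> *lookupReg(const UserMap &M, unsigned VirtReg) {
  auto It = M.find(VirtReg);
  return It == M.end() ? nullptr : &It->second;
}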
- bool IsIndirect = MI.getOperand(1).isImm(); - if (IsIndirect) - assert(MI.getOperand(1).getImm() == 0 && "DBG_VALUE with nonzero offset"); + assert(!MI.getOperand(1).isImm() && "DBG_VALUE with indirect flag before " + "LiveDebugVariables"); const DILocalVariable *Var = MI.getDebugVariable(); const DIExpression *Expr = MI.getDebugExpression(); UserValue *UV = getUserValue(Var, Expr, MI.getDebugLoc()); if (!Discard) - UV->addDef(Idx, MI.getOperand(0), IsIndirect); + UV->addDef(Idx, MI.getOperand(0)); else { MachineOperand MO = MachineOperand::CreateReg(0U, false); MO.setIsDebug(); - UV->addDef(Idx, MO, false); + UV->addDef(Idx, MO); } return true; } @@ -666,7 +673,7 @@ bool LDVImpl::handleDebugLabel(MachineInstr &MI, SlotIndex Idx) { } } if (!Found) - userLabels.push_back(llvm::make_unique(Label, DL, Idx)); + userLabels.push_back(std::make_unique(Label, DL, Idx)); return true; } @@ -751,14 +758,14 @@ void UserValue::extendDef(SlotIndex Idx, DbgValueLocation Loc, LiveRange *LR, } void UserValue::addDefsFromCopies( - LiveInterval *LI, unsigned LocNo, bool WasIndirect, + LiveInterval *LI, unsigned LocNo, const SmallVectorImpl &Kills, SmallVectorImpl> &NewDefs, MachineRegisterInfo &MRI, LiveIntervals &LIS) { if (Kills.empty()) return; // Don't track copies from physregs, there are too many uses. - if (!TargetRegisterInfo::isVirtualRegister(LI->reg)) + if (!Register::isVirtualRegister(LI->reg)) return; // Collect all the (vreg, valno) pairs that are copies of LI. @@ -768,13 +775,13 @@ void UserValue::addDefsFromCopies( // Copies of the full value. if (MO.getSubReg() || !MI->isCopy()) continue; - unsigned DstReg = MI->getOperand(0).getReg(); + Register DstReg = MI->getOperand(0).getReg(); // Don't follow copies to physregs. These are usually setting up call // arguments, and the argument registers are always call clobbered. We are // better off in the source register which could be a callee-saved register, // or it could be spilled. - if (!TargetRegisterInfo::isVirtualRegister(DstReg)) + if (!Register::isVirtualRegister(DstReg)) continue; // Is LocNo extended to reach this copy? If not, another def may be blocking @@ -815,7 +822,7 @@ void UserValue::addDefsFromCopies( MachineInstr *CopyMI = LIS.getInstructionFromIndex(DstVNI->def); assert(CopyMI && CopyMI->isCopy() && "Bad copy value"); unsigned LocNo = getLocationNo(CopyMI->getOperand(0)); - DbgValueLocation NewLoc(LocNo, WasIndirect); + DbgValueLocation NewLoc(LocNo); I.insert(Idx, Idx.getNextSlot(), NewLoc); NewDefs.push_back(std::make_pair(Idx, NewLoc)); break; @@ -845,7 +852,7 @@ void UserValue::computeIntervals(MachineRegisterInfo &MRI, } // Register locations are constrained to where the register value is live. - if (TargetRegisterInfo::isVirtualRegister(LocMO.getReg())) { + if (Register::isVirtualRegister(LocMO.getReg())) { LiveInterval *LI = nullptr; const VNInfo *VNI = nullptr; if (LIS.hasInterval(LocMO.getReg())) { @@ -863,8 +870,7 @@ void UserValue::computeIntervals(MachineRegisterInfo &MRI, // sub-register in that regclass). For now, simply skip handling copies if // a sub-register is involved. 
if (LI && !LocMO.getSubReg()) - addDefsFromCopies(LI, Loc.locNo(), Loc.wasIndirect(), Kills, Defs, MRI, - LIS); + addDefsFromCopies(LI, Loc.locNo(), Kills, Defs, MRI, LIS); continue; } @@ -1123,16 +1129,18 @@ UserValue::splitRegister(unsigned OldReg, ArrayRef NewRegs, void LDVImpl::splitRegister(unsigned OldReg, ArrayRef NewRegs) { bool DidChange = false; - for (UserValue *UV = lookupVirtReg(OldReg); UV; UV = UV->getNext()) - DidChange |= UV->splitRegister(OldReg, NewRegs, *LIS); + if (auto *UserVals = lookupVirtReg(OldReg)) + for (auto *UV : *UserVals) + DidChange |= UV->splitRegister(OldReg, NewRegs, *LIS); if (!DidChange) return; // Map all of the new virtual registers. - UserValue *UV = lookupVirtReg(OldReg); - for (unsigned i = 0; i != NewRegs.size(); ++i) - mapVirtReg(NewRegs[i], UV); + if (auto *UserVals = lookupVirtReg(OldReg)) + for (auto *UV : *UserVals) + for (unsigned i = 0; i != NewRegs.size(); ++i) + mapVirtReg(NewRegs[i], UV); } void LiveDebugVariables:: @@ -1161,10 +1169,10 @@ void UserValue::rewriteLocations(VirtRegMap &VRM, const MachineFunction &MF, MachineOperand Loc = locations[I]; // Only virtual registers are rewritten. if (Loc.isReg() && Loc.getReg() && - TargetRegisterInfo::isVirtualRegister(Loc.getReg())) { - unsigned VirtReg = Loc.getReg(); + Register::isVirtualRegister(Loc.getReg())) { + Register VirtReg = Loc.getReg(); if (VRM.isAssignedReg(VirtReg) && - TargetRegisterInfo::isPhysicalRegister(VRM.getPhys(VirtReg))) { + Register::isPhysicalRegister(VRM.getPhys(VirtReg))) { // This can create a %noreg operand in rare cases when the sub-register // index is no longer available. That means the user value is in a // non-existent sub-register, and %noreg is exactly what we want. @@ -1258,7 +1266,7 @@ findNextInsertLocation(MachineBasicBlock *MBB, const TargetRegisterInfo &TRI) { if (!LocMO.isReg()) return MBB->instr_end(); - unsigned Reg = LocMO.getReg(); + Register Reg = LocMO.getReg(); // Find the next instruction in the MBB that define the register Reg. while (I != MBB->end() && !I->isTerminator()) { @@ -1302,21 +1310,14 @@ void UserValue::insertDebugValue(MachineBasicBlock *MBB, SlotIndex StartIdx, // that the original virtual register was a pointer. Also, add the stack slot // offset for the spilled register to the expression. const DIExpression *Expr = Expression; - uint8_t DIExprFlags = DIExpression::ApplyOffset; - bool IsIndirect = Loc.wasIndirect(); - if (Spilled) { - if (IsIndirect) - DIExprFlags |= DIExpression::DerefAfter; - Expr = - DIExpression::prepend(Expr, DIExprFlags, SpillOffset); - IsIndirect = true; - } + if (Spilled) + Expr = DIExpression::prepend(Expr, DIExpression::ApplyOffset, SpillOffset); assert((!Spilled || MO.isFI()) && "a spilled location must be a frame index"); do { BuildMI(*MBB, I, getDebugLoc(), TII.get(TargetOpcode::DBG_VALUE), - IsIndirect, MO, Variable, Expr); + Spilled, MO, Variable, Expr); // Continue and insert DBG_VALUES after every redefinition of register // associated with the debug value within the range diff --git a/lib/CodeGen/LiveInterval.cpp b/lib/CodeGen/LiveInterval.cpp index 70b2a77fe800..54ac46f2e7ce 100644 --- a/lib/CodeGen/LiveInterval.cpp +++ b/lib/CodeGen/LiveInterval.cpp @@ -886,7 +886,7 @@ static void stripValuesNotDefiningMask(unsigned Reg, LiveInterval::SubRange &SR, const TargetRegisterInfo &TRI) { // Phys reg should not be tracked at subreg level. // Same for noreg (Reg == 0). 
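// Illustrative sketch (not part of the patch): the mechanical change repeated
// throughout these hunks moves the static register-number predicates from
// TargetRegisterInfo to the Register value type and retypes "unsigned Reg"
// locals as Register. A minimal example, assuming the llvm/CodeGen/Register.h
// interface imported in this revision; classify() is a made-up name, not an
// LLVM API.
#include "llvm/CodeGen/Register.h"

enum class RegKind { None, Virtual, Physical, Other };

static RegKind classify(llvm::Register Reg) {
  if (Reg == 0) // %noreg
    return RegKind::None;
  if (llvm::Register::isVirtualRegister(Reg)) // was TargetRegisterInfo::...
    return RegKind::Virtual;
  if (llvm::Register::isPhysicalRegister(Reg)) // was TargetRegisterInfo::...
    return RegKind::Physical;
  return RegKind::Other; // e.g. stack slots
}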
- if (!TargetRegisterInfo::isVirtualRegister(Reg) || !Reg) + if (!Register::isVirtualRegister(Reg) || !Reg) return; // Remove the values that don't define those lanes. SmallVector ToBeRemoved; @@ -917,7 +917,8 @@ static void stripValuesNotDefiningMask(unsigned Reg, LiveInterval::SubRange &SR, for (VNInfo *VNI : ToBeRemoved) SR.removeValNo(VNI); - assert(!SR.empty() && "At least one value should be defined by this mask"); + // If the subrange is empty at this point, the MIR is invalid. Do not assert + // and let the verifier catch this case. } void LiveInterval::refineSubRanges( @@ -967,7 +968,7 @@ void LiveInterval::computeSubRangeUndefs(SmallVectorImpl &Undefs, LaneBitmask LaneMask, const MachineRegisterInfo &MRI, const SlotIndexes &Indexes) const { - assert(TargetRegisterInfo::isVirtualRegister(reg)); + assert(Register::isVirtualRegister(reg)); LaneBitmask VRegMask = MRI.getMaxLaneMaskForVReg(reg); assert((VRegMask & LaneMask).any()); const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo(); diff --git a/lib/CodeGen/LiveIntervals.cpp b/lib/CodeGen/LiveIntervals.cpp index aa85569063b3..2989930ad093 100644 --- a/lib/CodeGen/LiveIntervals.cpp +++ b/lib/CodeGen/LiveIntervals.cpp @@ -14,7 +14,6 @@ //===----------------------------------------------------------------------===// #include "llvm/CodeGen/LiveIntervals.h" -#include "LiveRangeCalc.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/SmallPtrSet.h" @@ -22,6 +21,7 @@ #include "llvm/ADT/iterator_range.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/CodeGen/LiveInterval.h" +#include "llvm/CodeGen/LiveRangeCalc.h" #include "llvm/CodeGen/LiveVariables.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineBlockFrequencyInfo.h" @@ -108,7 +108,7 @@ LiveIntervals::~LiveIntervals() { void LiveIntervals::releaseMemory() { // Free the live intervals themselves. for (unsigned i = 0, e = VirtRegIntervals.size(); i != e; ++i) - delete VirtRegIntervals[TargetRegisterInfo::index2VirtReg(i)]; + delete VirtRegIntervals[Register::index2VirtReg(i)]; VirtRegIntervals.clear(); RegMaskSlots.clear(); RegMaskBits.clear(); @@ -161,7 +161,7 @@ void LiveIntervals::print(raw_ostream &OS, const Module* ) const { // Dump the virtregs. for (unsigned i = 0, e = MRI->getNumVirtRegs(); i != e; ++i) { - unsigned Reg = TargetRegisterInfo::index2VirtReg(i); + unsigned Reg = Register::index2VirtReg(i); if (hasInterval(Reg)) OS << getInterval(Reg) << '\n'; } @@ -186,7 +186,7 @@ LLVM_DUMP_METHOD void LiveIntervals::dumpInstrs() const { #endif LiveInterval* LiveIntervals::createInterval(unsigned reg) { - float Weight = TargetRegisterInfo::isPhysicalRegister(reg) ? huge_valf : 0.0F; + float Weight = Register::isPhysicalRegister(reg) ? 
huge_valf : 0.0F; return new LiveInterval(reg, Weight); } @@ -201,7 +201,7 @@ void LiveIntervals::computeVirtRegInterval(LiveInterval &LI) { void LiveIntervals::computeVirtRegs() { for (unsigned i = 0, e = MRI->getNumVirtRegs(); i != e; ++i) { - unsigned Reg = TargetRegisterInfo::index2VirtReg(i); + unsigned Reg = Register::index2VirtReg(i); if (MRI->reg_nodbg_empty(Reg)) continue; createAndComputeVirtRegInterval(Reg); @@ -441,8 +441,8 @@ void LiveIntervals::extendSegmentsToUses(LiveRange &Segments, bool LiveIntervals::shrinkToUses(LiveInterval *li, SmallVectorImpl *dead) { LLVM_DEBUG(dbgs() << "Shrink: " << *li << '\n'); - assert(TargetRegisterInfo::isVirtualRegister(li->reg) - && "Can only shrink virtual registers"); + assert(Register::isVirtualRegister(li->reg) && + "Can only shrink virtual registers"); // Shrink subregister live ranges. bool NeedsCleanup = false; @@ -541,8 +541,8 @@ bool LiveIntervals::computeDeadValues(LiveInterval &LI, void LiveIntervals::shrinkToUses(LiveInterval::SubRange &SR, unsigned Reg) { LLVM_DEBUG(dbgs() << "Shrink: " << SR << '\n'); - assert(TargetRegisterInfo::isVirtualRegister(Reg) - && "Can only shrink virtual registers"); + assert(Register::isVirtualRegister(Reg) && + "Can only shrink virtual registers"); // Find all the values used, including PHI kills. ShrinkToUsesWorkList WorkList; @@ -688,7 +688,7 @@ void LiveIntervals::addKillFlags(const VirtRegMap *VRM) { LiveRange::const_iterator>, 4> SRs; for (unsigned i = 0, e = MRI->getNumVirtRegs(); i != e; ++i) { - unsigned Reg = TargetRegisterInfo::index2VirtReg(i); + unsigned Reg = Register::index2VirtReg(i); if (MRI->reg_nodbg_empty(Reg)) continue; const LiveInterval &LI = getInterval(Reg); @@ -986,10 +986,10 @@ public: MO.setIsKill(false); } - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); if (!Reg) continue; - if (TargetRegisterInfo::isVirtualRegister(Reg)) { + if (Register::isVirtualRegister(Reg)) { LiveInterval &LI = LIS.getInterval(Reg); if (LI.hasSubRanges()) { unsigned SubReg = MO.getSubReg(); @@ -1023,7 +1023,7 @@ private: return; LLVM_DEBUG({ dbgs() << " "; - if (TargetRegisterInfo::isVirtualRegister(Reg)) { + if (Register::isVirtualRegister(Reg)) { dbgs() << printReg(Reg); if (LaneMask.any()) dbgs() << " L" << PrintLaneMask(LaneMask); @@ -1288,6 +1288,20 @@ private: const SlotIndex SplitPos = NewIdxDef; OldIdxVNI = OldIdxIn->valno; + SlotIndex NewDefEndPoint = std::next(NewIdxIn)->end; + LiveRange::iterator Prev = std::prev(OldIdxIn); + if (OldIdxIn != LR.begin() && + SlotIndex::isEarlierInstr(NewIdx, Prev->end)) { + // If the segment before OldIdx read a value defined earlier than + // NewIdx, the moved instruction also reads and forwards that + // value. Extend the lifetime of the new def point. + + // Extend to where the previous range started, unless there is + // another redef first. + NewDefEndPoint = std::min(OldIdxIn->start, + std::next(NewIdxOut)->start); + } + // Merge the OldIdxIn and OldIdxOut segments into OldIdxOut. OldIdxOut->valno->def = OldIdxIn->start; *OldIdxOut = LiveRange::Segment(OldIdxIn->start, OldIdxOut->end, @@ -1305,7 +1319,8 @@ private: // There is no gap between NewSegment and its predecessor. 
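// Illustrative sketch (not part of the patch): several LiveIntervals loops
// above walk every virtual register by converting a dense index back to a
// register number; that helper also moves from TargetRegisterInfo::index2VirtReg
// to Register::index2VirtReg. Helper shown for illustration only --
// visitVirtRegs is not an LLVM API, and the MachineRegisterInfo calls are the
// ones used in the hunks above.
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/Register.h"

template <typename Fn>
static void visitVirtRegs(const llvm::MachineRegisterInfo &MRI, Fn Visit) {
  for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
    unsigned Reg = llvm::Register::index2VirtReg(I);
    if (MRI.reg_nodbg_empty(Reg)) // skip vregs with no non-debug uses
      continue;
    Visit(Reg);
  }
}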
*NewSegment = LiveRange::Segment(Next->start, SplitPos, Next->valno); - *Next = LiveRange::Segment(SplitPos, Next->end, OldIdxVNI); + + *Next = LiveRange::Segment(SplitPos, NewDefEndPoint, OldIdxVNI); Next->valno->def = SplitPos; } else { // There is a gap between NewSegment and its predecessor @@ -1384,7 +1399,7 @@ private: // Return the last use of reg between NewIdx and OldIdx. SlotIndex findLastUseBefore(SlotIndex Before, unsigned Reg, LaneBitmask LaneMask) { - if (TargetRegisterInfo::isVirtualRegister(Reg)) { + if (Register::isVirtualRegister(Reg)) { SlotIndex LastUse = Before; for (MachineOperand &MO : MRI.use_nodbg_operands(Reg)) { if (MO.isUndef()) @@ -1429,7 +1444,7 @@ private: // Check if MII uses Reg. for (MIBundleOperands MO(*MII); MO.isValid(); ++MO) if (MO->isReg() && !MO->isUndef() && - TargetRegisterInfo::isPhysicalRegister(MO->getReg()) && + Register::isPhysicalRegister(MO->getReg()) && TRI.hasRegUnit(MO->getReg(), Reg)) return Idx.getRegSlot(); } @@ -1439,7 +1454,10 @@ private: }; void LiveIntervals::handleMove(MachineInstr &MI, bool UpdateFlags) { - assert(!MI.isBundled() && "Can't handle bundled instructions yet."); + // It is fine to move a bundle as a whole, but not an individual instruction + // inside it. + assert((!MI.isBundled() || MI.getOpcode() == TargetOpcode::BUNDLE) && + "Cannot move instruction in bundle"); SlotIndex OldIndex = Indexes->getInstructionIndex(MI); Indexes->removeMachineInstrFromMaps(MI); SlotIndex NewIndex = Indexes->insertMachineInstrInMaps(MI); @@ -1582,8 +1600,7 @@ LiveIntervals::repairIntervalsInRange(MachineBasicBlock *MBB, for (MachineInstr::const_mop_iterator MOI = MI.operands_begin(), MOE = MI.operands_end(); MOI != MOE; ++MOI) { - if (MOI->isReg() && - TargetRegisterInfo::isVirtualRegister(MOI->getReg()) && + if (MOI->isReg() && Register::isVirtualRegister(MOI->getReg()) && !hasInterval(MOI->getReg())) { createAndComputeVirtRegInterval(MOI->getReg()); } @@ -1591,7 +1608,7 @@ LiveIntervals::repairIntervalsInRange(MachineBasicBlock *MBB, } for (unsigned Reg : OrigRegs) { - if (!TargetRegisterInfo::isVirtualRegister(Reg)) + if (!Register::isVirtualRegister(Reg)) continue; LiveInterval &LI = getInterval(Reg); @@ -1642,7 +1659,7 @@ void LiveIntervals::splitSeparateComponents(LiveInterval &LI, unsigned Reg = LI.reg; const TargetRegisterClass *RegClass = MRI->getRegClass(Reg); for (unsigned I = 1; I < NumComp; ++I) { - unsigned NewVReg = MRI->createVirtualRegister(RegClass); + Register NewVReg = MRI->createVirtualRegister(RegClass); LiveInterval &NewLI = createEmptyInterval(NewVReg); SplitLIs.push_back(&NewLI); } diff --git a/lib/CodeGen/LivePhysRegs.cpp b/lib/CodeGen/LivePhysRegs.cpp index cd3d248ac878..c2a1cc7c6490 100644 --- a/lib/CodeGen/LivePhysRegs.cpp +++ b/lib/CodeGen/LivePhysRegs.cpp @@ -46,8 +46,8 @@ void LivePhysRegs::removeDefs(const MachineInstr &MI) { if (O->isReg()) { if (!O->isDef() || O->isDebug()) continue; - unsigned Reg = O->getReg(); - if (!TargetRegisterInfo::isPhysicalRegister(Reg)) + Register Reg = O->getReg(); + if (!Register::isPhysicalRegister(Reg)) continue; removeReg(Reg); } else if (O->isRegMask()) @@ -60,8 +60,8 @@ void LivePhysRegs::addUses(const MachineInstr &MI) { for (ConstMIBundleOperands O(MI); O.isValid(); ++O) { if (!O->isReg() || !O->readsReg() || O->isDebug()) continue; - unsigned Reg = O->getReg(); - if (!TargetRegisterInfo::isPhysicalRegister(Reg)) + Register Reg = O->getReg(); + if (!Register::isPhysicalRegister(Reg)) continue; addReg(Reg); } @@ -86,8 +86,8 @@ void LivePhysRegs::stepForward(const 
MachineInstr &MI, // Remove killed registers from the set. for (ConstMIBundleOperands O(MI); O.isValid(); ++O) { if (O->isReg() && !O->isDebug()) { - unsigned Reg = O->getReg(); - if (!TargetRegisterInfo::isPhysicalRegister(Reg)) + Register Reg = O->getReg(); + if (!Register::isPhysicalRegister(Reg)) continue; if (O->isDef()) { // Note, dead defs are still recorded. The caller should decide how to @@ -292,10 +292,10 @@ void llvm::recomputeLivenessFlags(MachineBasicBlock &MBB) { if (!MO->isReg() || !MO->isDef() || MO->isDebug()) continue; - unsigned Reg = MO->getReg(); + Register Reg = MO->getReg(); if (Reg == 0) continue; - assert(TargetRegisterInfo::isPhysicalRegister(Reg)); + assert(Register::isPhysicalRegister(Reg)); bool IsNotLive = LiveRegs.available(MRI, Reg); MO->setIsDead(IsNotLive); @@ -309,10 +309,10 @@ void llvm::recomputeLivenessFlags(MachineBasicBlock &MBB) { if (!MO->isReg() || !MO->readsReg() || MO->isDebug()) continue; - unsigned Reg = MO->getReg(); + Register Reg = MO->getReg(); if (Reg == 0) continue; - assert(TargetRegisterInfo::isPhysicalRegister(Reg)); + assert(Register::isPhysicalRegister(Reg)); bool IsNotLive = LiveRegs.available(MRI, Reg); MO->setIsKill(IsNotLive); diff --git a/lib/CodeGen/LiveRangeCalc.cpp b/lib/CodeGen/LiveRangeCalc.cpp index d670f28df6ba..24b57be0da00 100644 --- a/lib/CodeGen/LiveRangeCalc.cpp +++ b/lib/CodeGen/LiveRangeCalc.cpp @@ -10,7 +10,7 @@ // //===----------------------------------------------------------------------===// -#include "LiveRangeCalc.h" +#include "llvm/CodeGen/LiveRangeCalc.h" #include "llvm/ADT/BitVector.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SetVector.h" @@ -372,8 +372,7 @@ bool LiveRangeCalc::findReachingDefs(LiveRange &LR, MachineBasicBlock &UseMBB, report_fatal_error("Use not jointly dominated by defs."); } - if (TargetRegisterInfo::isPhysicalRegister(PhysReg) && - !MBB->isLiveIn(PhysReg)) { + if (Register::isPhysicalRegister(PhysReg) && !MBB->isLiveIn(PhysReg)) { MBB->getParent()->verify(); const TargetRegisterInfo *TRI = MRI->getTargetRegisterInfo(); errs() << "The register " << printReg(PhysReg, TRI) diff --git a/lib/CodeGen/LiveRangeCalc.h b/lib/CodeGen/LiveRangeCalc.h deleted file mode 100644 index 11aea5a3b016..000000000000 --- a/lib/CodeGen/LiveRangeCalc.h +++ /dev/null @@ -1,297 +0,0 @@ -//===- LiveRangeCalc.h - Calculate live ranges ------------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// The LiveRangeCalc class can be used to compute live ranges from scratch. It -// caches information about values in the CFG to speed up repeated operations -// on the same live range. The cache can be shared by non-overlapping live -// ranges. SplitKit uses that when computing the live range of split products. -// -// A low-level interface is available to clients that know where a variable is -// live, but don't know which value it has as every point. LiveRangeCalc will -// propagate values down the dominator tree, and even insert PHI-defs where -// needed. SplitKit uses this faster interface when possible. 
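// Illustrative sketch (not part of the patch): this deletion appears to be a
// move rather than a removal -- the header becomes the public
// llvm/CodeGen/LiveRangeCalc.h, as the include changes in LiveIntervals.cpp
// and LiveRangeCalc.cpp above show. In-tree users only switch include paths:
//
//   #include "LiveRangeCalc.h"              // before: private to lib/CodeGen
//   #include "llvm/CodeGen/LiveRangeCalc.h" // after: public CodeGen header
//
// A minimal usage shape, assuming the interface reproduced in the removed copy
// below (reset() once per family of non-overlapping live ranges, then
// calculate()); recomputeFromScratch is a made-up name.
#include "llvm/CodeGen/LiveRangeCalc.h"

void recomputeFromScratch(llvm::LiveInterval &LI,
                          const llvm::MachineFunction *MF,
                          llvm::SlotIndexes *SI,
                          llvm::MachineDominatorTree *MDT,
                          llvm::VNInfo::Allocator *Alloc) {
  llvm::LiveRangeCalc LRC;
  LRC.reset(MF, SI, MDT, Alloc);             // prepare caches
  LRC.calculate(LI, /*TrackSubRegs=*/false); // compute LI's ranges from scratch
}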
-// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIB_CODEGEN_LIVERANGECALC_H -#define LLVM_LIB_CODEGEN_LIVERANGECALC_H - -#include "llvm/ADT/ArrayRef.h" -#include "llvm/ADT/BitVector.h" -#include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/IndexedMap.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/CodeGen/LiveInterval.h" -#include "llvm/CodeGen/MachineBasicBlock.h" -#include "llvm/CodeGen/SlotIndexes.h" -#include "llvm/MC/LaneBitmask.h" -#include - -namespace llvm { - -template class DomTreeNodeBase; -class MachineDominatorTree; -class MachineFunction; -class MachineRegisterInfo; - -using MachineDomTreeNode = DomTreeNodeBase; - -class LiveRangeCalc { - const MachineFunction *MF = nullptr; - const MachineRegisterInfo *MRI = nullptr; - SlotIndexes *Indexes = nullptr; - MachineDominatorTree *DomTree = nullptr; - VNInfo::Allocator *Alloc = nullptr; - - /// LiveOutPair - A value and the block that defined it. The domtree node is - /// redundant, it can be computed as: MDT[Indexes.getMBBFromIndex(VNI->def)]. - using LiveOutPair = std::pair; - - /// LiveOutMap - Map basic blocks to the value leaving the block. - using LiveOutMap = IndexedMap; - - /// Bit vector of active entries in LiveOut, also used as a visited set by - /// findReachingDefs. One entry per basic block, indexed by block number. - /// This is kept as a separate bit vector because it can be cleared quickly - /// when switching live ranges. - BitVector Seen; - - /// Map LiveRange to sets of blocks (represented by bit vectors) that - /// in the live range are defined on entry and undefined on entry. - /// A block is defined on entry if there is a path from at least one of - /// the defs in the live range to the entry of the block, and conversely, - /// a block is undefined on entry, if there is no such path (i.e. no - /// definition reaches the entry of the block). A single LiveRangeCalc - /// object is used to track live-out information for multiple registers - /// in live range splitting (which is ok, since the live ranges of these - /// registers do not overlap), but the defined/undefined information must - /// be kept separate for each individual range. - /// By convention, EntryInfoMap[&LR] = { Defined, Undefined }. - using EntryInfoMap = DenseMap>; - EntryInfoMap EntryInfos; - - /// Map each basic block where a live range is live out to the live-out value - /// and its defining block. - /// - /// For every basic block, MBB, one of these conditions shall be true: - /// - /// 1. !Seen.count(MBB->getNumber()) - /// Blocks without a Seen bit are ignored. - /// 2. LiveOut[MBB].second.getNode() == MBB - /// The live-out value is defined in MBB. - /// 3. forall P in preds(MBB): LiveOut[P] == LiveOut[MBB] - /// The live-out value passses through MBB. All predecessors must carry - /// the same value. - /// - /// The domtree node may be null, it can be computed. - /// - /// The map can be shared by multiple live ranges as long as no two are - /// live-out of the same block. - LiveOutMap Map; - - /// LiveInBlock - Information about a basic block where a live range is known - /// to be live-in, but the value has not yet been determined. - struct LiveInBlock { - // The live range set that is live-in to this block. The algorithms can - // handle multiple non-overlapping live ranges simultaneously. - LiveRange &LR; - - // DomNode - Dominator tree node for the block. - // Cleared when the final value has been determined and LI has been updated. 
- MachineDomTreeNode *DomNode; - - // Position in block where the live-in range ends, or SlotIndex() if the - // range passes through the block. When the final value has been - // determined, the range from the block start to Kill will be added to LI. - SlotIndex Kill; - - // Live-in value filled in by updateSSA once it is known. - VNInfo *Value = nullptr; - - LiveInBlock(LiveRange &LR, MachineDomTreeNode *node, SlotIndex kill) - : LR(LR), DomNode(node), Kill(kill) {} - }; - - /// LiveIn - Work list of blocks where the live-in value has yet to be - /// determined. This list is typically computed by findReachingDefs() and - /// used as a work list by updateSSA(). The low-level interface may also be - /// used to add entries directly. - SmallVector LiveIn; - - /// Check if the entry to block @p MBB can be reached by any of the defs - /// in @p LR. Return true if none of the defs reach the entry to @p MBB. - bool isDefOnEntry(LiveRange &LR, ArrayRef Undefs, - MachineBasicBlock &MBB, BitVector &DefOnEntry, - BitVector &UndefOnEntry); - - /// Find the set of defs that can reach @p Kill. @p Kill must belong to - /// @p UseMBB. - /// - /// If exactly one def can reach @p UseMBB, and the def dominates @p Kill, - /// all paths from the def to @p UseMBB are added to @p LR, and the function - /// returns true. - /// - /// If multiple values can reach @p UseMBB, the blocks that need @p LR to be - /// live in are added to the LiveIn array, and the function returns false. - /// - /// The array @p Undef provides the locations where the range @p LR becomes - /// undefined by operands on other subranges. If @p Undef - /// is non-empty and @p Kill is jointly dominated only by the entries of - /// @p Undef, the function returns false. - /// - /// PhysReg, when set, is used to verify live-in lists on basic blocks. - bool findReachingDefs(LiveRange &LR, MachineBasicBlock &UseMBB, - SlotIndex Use, unsigned PhysReg, - ArrayRef Undefs); - - /// updateSSA - Compute the values that will be live in to all requested - /// blocks in LiveIn. Create PHI-def values as required to preserve SSA form. - /// - /// Every live-in block must be jointly dominated by the added live-out - /// blocks. No values are read from the live ranges. - void updateSSA(); - - /// Transfer information from the LiveIn vector to the live ranges and update - /// the given @p LiveOuts. - void updateFromLiveIns(); - - /// Extend the live range of @p LR to reach all uses of Reg. - /// - /// If @p LR is a main range, or if @p LI is null, then all uses must be - /// jointly dominated by the definitions from @p LR. If @p LR is a subrange - /// of the live interval @p LI, corresponding to lane mask @p LaneMask, - /// all uses must be jointly dominated by the definitions from @p LR - /// together with definitions of other lanes where @p LR becomes undefined - /// (via operands). - /// If @p LR is a main range, the @p LaneMask should be set to ~0, i.e. - /// LaneBitmask::getAll(). - void extendToUses(LiveRange &LR, unsigned Reg, LaneBitmask LaneMask, - LiveInterval *LI = nullptr); - - /// Reset Map and Seen fields. - void resetLiveOutMap(); - -public: - LiveRangeCalc() = default; - - //===--------------------------------------------------------------------===// - // High-level interface. - //===--------------------------------------------------------------------===// - // - // Calculate live ranges from scratch. - // - - /// reset - Prepare caches for a new set of non-overlapping live ranges. 
The - /// caches must be reset before attempting calculations with a live range - /// that may overlap a previously computed live range, and before the first - /// live range in a function. If live ranges are not known to be - /// non-overlapping, call reset before each. - void reset(const MachineFunction *mf, SlotIndexes *SI, - MachineDominatorTree *MDT, VNInfo::Allocator *VNIA); - - //===--------------------------------------------------------------------===// - // Mid-level interface. - //===--------------------------------------------------------------------===// - // - // Modify existing live ranges. - // - - /// Extend the live range of @p LR to reach @p Use. - /// - /// The existing values in @p LR must be live so they jointly dominate @p Use. - /// If @p Use is not dominated by a single existing value, PHI-defs are - /// inserted as required to preserve SSA form. - /// - /// PhysReg, when set, is used to verify live-in lists on basic blocks. - void extend(LiveRange &LR, SlotIndex Use, unsigned PhysReg, - ArrayRef Undefs); - - /// createDeadDefs - Create a dead def in LI for every def operand of Reg. - /// Each instruction defining Reg gets a new VNInfo with a corresponding - /// minimal live range. - void createDeadDefs(LiveRange &LR, unsigned Reg); - - /// Extend the live range of @p LR to reach all uses of Reg. - /// - /// All uses must be jointly dominated by existing liveness. PHI-defs are - /// inserted as needed to preserve SSA form. - void extendToUses(LiveRange &LR, unsigned PhysReg) { - extendToUses(LR, PhysReg, LaneBitmask::getAll()); - } - - /// Calculates liveness for the register specified in live interval @p LI. - /// Creates subregister live ranges as needed if subreg liveness tracking is - /// enabled. - void calculate(LiveInterval &LI, bool TrackSubRegs); - - /// For live interval \p LI with correct SubRanges construct matching - /// information for the main live range. Expects the main live range to not - /// have any segments or value numbers. - void constructMainRangeFromSubranges(LiveInterval &LI); - - //===--------------------------------------------------------------------===// - // Low-level interface. - //===--------------------------------------------------------------------===// - // - // These functions can be used to compute live ranges where the live-in and - // live-out blocks are already known, but the SSA value in each block is - // unknown. - // - // After calling reset(), add known live-out values and known live-in blocks. - // Then call calculateValues() to compute the actual value that is - // live-in to each block, and add liveness to the live ranges. - // - - /// setLiveOutValue - Indicate that VNI is live out from MBB. The - /// calculateValues() function will not add liveness for MBB, the caller - /// should take care of that. - /// - /// VNI may be null only if MBB is a live-through block also passed to - /// addLiveInBlock(). - void setLiveOutValue(MachineBasicBlock *MBB, VNInfo *VNI) { - Seen.set(MBB->getNumber()); - Map[MBB] = LiveOutPair(VNI, nullptr); - } - - /// addLiveInBlock - Add a block with an unknown live-in value. This - /// function can only be called once per basic block. Once the live-in value - /// has been determined, calculateValues() will add liveness to LI. - /// - /// @param LR The live range that is live-in to the block. - /// @param DomNode The domtree node for the block. - /// @param Kill Index in block where LI is killed. 
If the value is - /// live-through, set Kill = SLotIndex() and also call - /// setLiveOutValue(MBB, 0). - void addLiveInBlock(LiveRange &LR, - MachineDomTreeNode *DomNode, - SlotIndex Kill = SlotIndex()) { - LiveIn.push_back(LiveInBlock(LR, DomNode, Kill)); - } - - /// calculateValues - Calculate the value that will be live-in to each block - /// added with addLiveInBlock. Add PHI-def values as needed to preserve SSA - /// form. Add liveness to all live-in blocks up to the Kill point, or the - /// whole block for live-through blocks. - /// - /// Every predecessor of a live-in block must have been given a value with - /// setLiveOutValue, the value may be null for live-trough blocks. - void calculateValues(); - - /// A diagnostic function to check if the end of the block @p MBB is - /// jointly dominated by the blocks corresponding to the slot indices - /// in @p Defs. This function is mainly for use in self-verification - /// checks. - LLVM_ATTRIBUTE_UNUSED - static bool isJointlyDominated(const MachineBasicBlock *MBB, - ArrayRef Defs, - const SlotIndexes &Indexes); -}; - -} // end namespace llvm - -#endif // LLVM_LIB_CODEGEN_LIVERANGECALC_H diff --git a/lib/CodeGen/LiveRangeEdit.cpp b/lib/CodeGen/LiveRangeEdit.cpp index 882e562ba95c..34bac082bcd7 100644 --- a/lib/CodeGen/LiveRangeEdit.cpp +++ b/lib/CodeGen/LiveRangeEdit.cpp @@ -32,7 +32,7 @@ void LiveRangeEdit::Delegate::anchor() { } LiveInterval &LiveRangeEdit::createEmptyIntervalFrom(unsigned OldReg, bool createSubRanges) { - unsigned VReg = MRI.createVirtualRegister(MRI.getRegClass(OldReg)); + Register VReg = MRI.createVirtualRegister(MRI.getRegClass(OldReg)); if (VRM) VRM->setIsSplitFromReg(VReg, VRM->getOriginal(OldReg)); @@ -52,7 +52,7 @@ LiveInterval &LiveRangeEdit::createEmptyIntervalFrom(unsigned OldReg, } unsigned LiveRangeEdit::createFrom(unsigned OldReg) { - unsigned VReg = MRI.createVirtualRegister(MRI.getRegClass(OldReg)); + Register VReg = MRI.createVirtualRegister(MRI.getRegClass(OldReg)); if (VRM) { VRM->setIsSplitFromReg(VReg, VRM->getOriginal(OldReg)); } @@ -114,7 +114,7 @@ bool LiveRangeEdit::allUsesAvailableAt(const MachineInstr *OrigMI, continue; // We can't remat physreg uses, unless it is a constant. - if (TargetRegisterInfo::isPhysicalRegister(MO.getReg())) { + if (Register::isPhysicalRegister(MO.getReg())) { if (MRI.isConstantPhysReg(MO.getReg())) continue; return false; @@ -232,7 +232,7 @@ bool LiveRangeEdit::foldAsLoad(LiveInterval *LI, LLVM_DEBUG(dbgs() << " folded: " << *FoldMI); LIS.ReplaceMachineInstrInMaps(*UseMI, *FoldMI); if (UseMI->isCall()) - UseMI->getMF()->updateCallSiteInfo(UseMI, FoldMI); + UseMI->getMF()->moveCallSiteInfo(UseMI, FoldMI); UseMI->eraseFromParent(); DefMI->addRegisterDead(LI->reg, nullptr); Dead.push_back(DefMI); @@ -308,8 +308,8 @@ void LiveRangeEdit::eliminateDeadDef(MachineInstr *MI, ToShrinkSet &ToShrink, MOE = MI->operands_end(); MOI != MOE; ++MOI) { if (!MOI->isReg()) continue; - unsigned Reg = MOI->getReg(); - if (!TargetRegisterInfo::isVirtualRegister(Reg)) { + Register Reg = MOI->getReg(); + if (!Register::isVirtualRegister(Reg)) { // Check if MI reads any unreserved physregs. if (Reg && MOI->readsReg() && !MRI.isReserved(Reg)) ReadsPhysRegs = true; @@ -349,7 +349,7 @@ void LiveRangeEdit::eliminateDeadDef(MachineInstr *MI, ToShrinkSet &ToShrink, // Remove all operands that aren't physregs. 
for (unsigned i = MI->getNumOperands(); i; --i) { const MachineOperand &MO = MI->getOperand(i-1); - if (MO.isReg() && TargetRegisterInfo::isPhysicalRegister(MO.getReg())) + if (MO.isReg() && Register::isPhysicalRegister(MO.getReg())) continue; MI->RemoveOperand(i-1); } diff --git a/lib/CodeGen/LiveRangeShrink.cpp b/lib/CodeGen/LiveRangeShrink.cpp index 8818f1ce0ad9..cbf112ee2bd5 100644 --- a/lib/CodeGen/LiveRangeShrink.cpp +++ b/lib/CodeGen/LiveRangeShrink.cpp @@ -172,10 +172,10 @@ bool LiveRangeShrink::runOnMachineFunction(MachineFunction &MF) { for (const MachineOperand &MO : MI.operands()) { if (!MO.isReg() || MO.isDead() || MO.isDebug()) continue; - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); // Do not move the instruction if it def/uses a physical register, // unless it is a constant physical register or a noreg. - if (!TargetRegisterInfo::isVirtualRegister(Reg)) { + if (!Register::isVirtualRegister(Reg)) { if (!Reg || MRI.isConstantPhysReg(Reg)) continue; Insert = nullptr; diff --git a/lib/CodeGen/LiveRegMatrix.cpp b/lib/CodeGen/LiveRegMatrix.cpp index ce99e5535c25..72c79e5f8a75 100644 --- a/lib/CodeGen/LiveRegMatrix.cpp +++ b/lib/CodeGen/LiveRegMatrix.cpp @@ -118,7 +118,7 @@ void LiveRegMatrix::assign(LiveInterval &VirtReg, unsigned PhysReg) { } void LiveRegMatrix::unassign(LiveInterval &VirtReg) { - unsigned PhysReg = VRM->getPhys(VirtReg.reg); + Register PhysReg = VRM->getPhys(VirtReg.reg); LLVM_DEBUG(dbgs() << "unassigning " << printReg(VirtReg.reg, TRI) << " from " << printReg(PhysReg, TRI) << ':'); VRM->clearVirt(VirtReg.reg); diff --git a/lib/CodeGen/LiveRegUnits.cpp b/lib/CodeGen/LiveRegUnits.cpp index 6afb7fb7aa11..97763def1f40 100644 --- a/lib/CodeGen/LiveRegUnits.cpp +++ b/lib/CodeGen/LiveRegUnits.cpp @@ -47,8 +47,8 @@ void LiveRegUnits::stepBackward(const MachineInstr &MI) { if (O->isReg()) { if (!O->isDef() || O->isDebug()) continue; - unsigned Reg = O->getReg(); - if (!TargetRegisterInfo::isPhysicalRegister(Reg)) + Register Reg = O->getReg(); + if (!Register::isPhysicalRegister(Reg)) continue; removeReg(Reg); } else if (O->isRegMask()) @@ -59,8 +59,8 @@ void LiveRegUnits::stepBackward(const MachineInstr &MI) { for (ConstMIBundleOperands O(MI); O.isValid(); ++O) { if (!O->isReg() || !O->readsReg() || O->isDebug()) continue; - unsigned Reg = O->getReg(); - if (!TargetRegisterInfo::isPhysicalRegister(Reg)) + Register Reg = O->getReg(); + if (!Register::isPhysicalRegister(Reg)) continue; addReg(Reg); } @@ -70,8 +70,8 @@ void LiveRegUnits::accumulate(const MachineInstr &MI) { // Add defs, uses and regmask clobbers to the set. 
for (ConstMIBundleOperands O(MI); O.isValid(); ++O) { if (O->isReg()) { - unsigned Reg = O->getReg(); - if (!TargetRegisterInfo::isPhysicalRegister(Reg)) + Register Reg = O->getReg(); + if (!Register::isPhysicalRegister(Reg)) continue; if (!O->isDef() && !O->readsReg()) continue; diff --git a/lib/CodeGen/LiveStacks.cpp b/lib/CodeGen/LiveStacks.cpp index f55977d72723..8df84ebf4f06 100644 --- a/lib/CodeGen/LiveStacks.cpp +++ b/lib/CodeGen/LiveStacks.cpp @@ -58,9 +58,10 @@ LiveStacks::getOrCreateInterval(int Slot, const TargetRegisterClass *RC) { assert(Slot >= 0 && "Spill slot indice must be >= 0"); SS2IntervalMap::iterator I = S2IMap.find(Slot); if (I == S2IMap.end()) { - I = S2IMap.emplace(std::piecewise_construct, std::forward_as_tuple(Slot), - std::forward_as_tuple( - TargetRegisterInfo::index2StackSlot(Slot), 0.0F)) + I = S2IMap + .emplace( + std::piecewise_construct, std::forward_as_tuple(Slot), + std::forward_as_tuple(Register::index2StackSlot(Slot), 0.0F)) .first; S2RCMap.insert(std::make_pair(Slot, RC)); } else { diff --git a/lib/CodeGen/LiveVariables.cpp b/lib/CodeGen/LiveVariables.cpp index aaff982ef1b0..9bd55c6f750f 100644 --- a/lib/CodeGen/LiveVariables.cpp +++ b/lib/CodeGen/LiveVariables.cpp @@ -26,6 +26,7 @@ //===----------------------------------------------------------------------===// #include "llvm/CodeGen/LiveVariables.h" +#include "llvm/ADT/DenseSet.h" #include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallPtrSet.h" @@ -82,7 +83,7 @@ LLVM_DUMP_METHOD void LiveVariables::VarInfo::dump() const { /// getVarInfo - Get (possibly creating) a VarInfo object for the given vreg. LiveVariables::VarInfo &LiveVariables::getVarInfo(unsigned RegIdx) { - assert(TargetRegisterInfo::isVirtualRegister(RegIdx) && + assert(Register::isVirtualRegister(RegIdx) && "getVarInfo: not a virtual register!"); VirtRegInfo.grow(RegIdx); return VirtRegInfo[RegIdx]; @@ -214,7 +215,7 @@ MachineInstr *LiveVariables::FindLastPartialDef(unsigned Reg, MachineOperand &MO = LastDef->getOperand(i); if (!MO.isReg() || !MO.isDef() || MO.getReg() == 0) continue; - unsigned DefReg = MO.getReg(); + Register DefReg = MO.getReg(); if (TRI->isSubRegister(Reg, DefReg)) { for (MCSubRegIterator SubRegs(DefReg, TRI, /*IncludeSelf=*/true); SubRegs.isValid(); ++SubRegs) @@ -519,10 +520,9 @@ void LiveVariables::runOnInstr(MachineInstr &MI, } if (!MO.isReg() || MO.getReg() == 0) continue; - unsigned MOReg = MO.getReg(); + Register MOReg = MO.getReg(); if (MO.isUse()) { - if (!(TargetRegisterInfo::isPhysicalRegister(MOReg) && - MRI->isReserved(MOReg))) + if (!(Register::isPhysicalRegister(MOReg) && MRI->isReserved(MOReg))) MO.setIsKill(false); if (MO.readsReg()) UseRegs.push_back(MOReg); @@ -530,8 +530,7 @@ void LiveVariables::runOnInstr(MachineInstr &MI, assert(MO.isDef()); // FIXME: We should not remove any dead flags. However the MIPS RDDSP // instruction needs it at the moment: http://llvm.org/PR27116. - if (TargetRegisterInfo::isPhysicalRegister(MOReg) && - !MRI->isReserved(MOReg)) + if (Register::isPhysicalRegister(MOReg) && !MRI->isReserved(MOReg)) MO.setIsDead(false); DefRegs.push_back(MOReg); } @@ -541,7 +540,7 @@ void LiveVariables::runOnInstr(MachineInstr &MI, // Process all uses. 
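// Illustrative sketch (not part of the patch): the LiveStacks::getOrCreateInterval
// hunk above keeps the piecewise-construct emplace idiom -- the mapped
// LiveInterval is built in place from (register, weight) -- and only swaps
// index2StackSlot over to Register. A generic, self-contained illustration of
// that idiom with a placeholder value type; Interval and getOrCreate are
// made-up names.
#include <map>
#include <tuple>
#include <utility>

struct Interval {
  Interval(unsigned Reg, float Weight) : Reg(Reg), Weight(Weight) {}
  unsigned Reg;
  float Weight;
};

Interval &getOrCreate(std::map<int, Interval> &M, int Slot, unsigned Reg) {
  auto It = M.find(Slot);
  if (It == M.end())
    It = M.emplace(std::piecewise_construct,
                   std::forward_as_tuple(Slot),      // key ctor args
                   std::forward_as_tuple(Reg, 0.0F)) // value ctor args
             .first;
  return It->second;
}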
for (unsigned i = 0, e = UseRegs.size(); i != e; ++i) { unsigned MOReg = UseRegs[i]; - if (TargetRegisterInfo::isVirtualRegister(MOReg)) + if (Register::isVirtualRegister(MOReg)) HandleVirtRegUse(MOReg, MBB, MI); else if (!MRI->isReserved(MOReg)) HandlePhysRegUse(MOReg, MI); @@ -554,7 +553,7 @@ void LiveVariables::runOnInstr(MachineInstr &MI, // Process all defs. for (unsigned i = 0, e = DefRegs.size(); i != e; ++i) { unsigned MOReg = DefRegs[i]; - if (TargetRegisterInfo::isVirtualRegister(MOReg)) + if (Register::isVirtualRegister(MOReg)) HandleVirtRegDef(MOReg, MI); else if (!MRI->isReserved(MOReg)) HandlePhysRegDef(MOReg, &MI, Defs); @@ -566,7 +565,7 @@ void LiveVariables::runOnBlock(MachineBasicBlock *MBB, const unsigned NumRegs) { // Mark live-in registers as live-in. SmallVector Defs; for (const auto &LI : MBB->liveins()) { - assert(TargetRegisterInfo::isPhysicalRegister(LI.PhysReg) && + assert(Register::isPhysicalRegister(LI.PhysReg) && "Cannot have a live-in virtual register!"); HandlePhysRegDef(LI.PhysReg, nullptr, Defs); } @@ -654,7 +653,7 @@ bool LiveVariables::runOnMachineFunction(MachineFunction &mf) { // Convert and transfer the dead / killed information we have gathered into // VirtRegInfo onto MI's. for (unsigned i = 0, e1 = VirtRegInfo.size(); i != e1; ++i) { - const unsigned Reg = TargetRegisterInfo::index2VirtReg(i); + const unsigned Reg = Register::index2VirtReg(i); for (unsigned j = 0, e2 = VirtRegInfo[Reg].Kills.size(); j != e2; ++j) if (VirtRegInfo[Reg].Kills[j] == MRI->getVRegDef(Reg)) VirtRegInfo[Reg].Kills[j]->addRegisterDead(Reg, TRI); @@ -692,8 +691,8 @@ void LiveVariables::removeVirtualRegistersKilled(MachineInstr &MI) { MachineOperand &MO = MI.getOperand(i); if (MO.isReg() && MO.isKill()) { MO.setIsKill(false); - unsigned Reg = MO.getReg(); - if (TargetRegisterInfo::isVirtualRegister(Reg)) { + Register Reg = MO.getReg(); + if (Register::isVirtualRegister(Reg)) { bool removed = getVarInfo(Reg).removeKill(MI); assert(removed && "kill not in register's VarInfo?"); (void)removed; @@ -783,7 +782,7 @@ void LiveVariables::addNewBlock(MachineBasicBlock *BB, for (; BBI != BBE; ++BBI) { for (MachineInstr::mop_iterator I = BBI->operands_begin(), E = BBI->operands_end(); I != E; ++I) { - if (I->isReg() && TargetRegisterInfo::isVirtualRegister(I->getReg())) { + if (I->isReg() && Register::isVirtualRegister(I->getReg())) { if (I->isDef()) Defs.insert(I->getReg()); else if (I->isKill()) @@ -794,7 +793,7 @@ void LiveVariables::addNewBlock(MachineBasicBlock *BB, // Update info for all live variables for (unsigned i = 0, e = MRI->getNumVirtRegs(); i != e; ++i) { - unsigned Reg = TargetRegisterInfo::index2VirtReg(i); + unsigned Reg = Register::index2VirtReg(i); // If the Defs is defined in the successor it can't be live in BB. 
if (Defs.count(Reg)) diff --git a/lib/CodeGen/LocalStackSlotAllocation.cpp b/lib/CodeGen/LocalStackSlotAllocation.cpp index b14d76a585f7..2392d4d00b56 100644 --- a/lib/CodeGen/LocalStackSlotAllocation.cpp +++ b/lib/CodeGen/LocalStackSlotAllocation.cpp @@ -261,7 +261,7 @@ void LocalStackSlotPass::calculateFrameObjectOffsets(MachineFunction &Fn) { // Remember how big this blob of stack space is MFI.setLocalFrameSize(Offset); - MFI.setLocalFrameMaxAlign(MaxAlign); + MFI.setLocalFrameMaxAlign(assumeAligned(MaxAlign)); } static inline bool @@ -351,6 +351,14 @@ bool LocalStackSlotPass::insertFrameReferenceRegisters(MachineFunction &Fn) { assert(MFI.isObjectPreAllocated(FrameIdx) && "Only pre-allocated locals expected!"); + // We need to keep the references to the stack protector slot through frame + // index operands so that it gets resolved by PEI rather than this pass. + // This avoids accesses to the stack protector though virtual base + // registers, and forces PEI to address it using fp/sp/bp. + if (MFI.hasStackProtectorIndex() && + FrameIdx == MFI.getStackProtectorIndex()) + continue; + LLVM_DEBUG(dbgs() << "Considering: " << MI); unsigned idx = 0; diff --git a/lib/CodeGen/LowerEmuTLS.cpp b/lib/CodeGen/LowerEmuTLS.cpp index c8cf6abda4fc..ed48365b0102 100644 --- a/lib/CodeGen/LowerEmuTLS.cpp +++ b/lib/CodeGen/LowerEmuTLS.cpp @@ -142,7 +142,7 @@ bool LowerEmuTLS::addEmuTlsVar(Module &M, const GlobalVariable *GV) { assert(EmuTlsTmplVar && "Failed to create emualted TLS initializer"); EmuTlsTmplVar->setConstant(true); EmuTlsTmplVar->setInitializer(const_cast(InitValue)); - EmuTlsTmplVar->setAlignment(GVAlignment); + EmuTlsTmplVar->setAlignment(Align(GVAlignment)); copyLinkageVisibility(M, GV, EmuTlsTmplVar); } @@ -155,9 +155,8 @@ bool LowerEmuTLS::addEmuTlsVar(Module &M, const GlobalVariable *GV) { ArrayRef ElementValueArray(ElementValues, 4); EmuTlsVar->setInitializer( ConstantStruct::get(EmuTlsVarType, ElementValueArray)); - unsigned MaxAlignment = std::max( - DL.getABITypeAlignment(WordType), - DL.getABITypeAlignment(VoidPtrType)); + Align MaxAlignment(std::max(DL.getABITypeAlignment(WordType), + DL.getABITypeAlignment(VoidPtrType))); EmuTlsVar->setAlignment(MaxAlignment); return true; } diff --git a/lib/CodeGen/MIRCanonicalizerPass.cpp b/lib/CodeGen/MIRCanonicalizerPass.cpp index f49bc854e23f..c9bb5461aa3c 100644 --- a/lib/CodeGen/MIRCanonicalizerPass.cpp +++ b/lib/CodeGen/MIRCanonicalizerPass.cpp @@ -23,12 +23,14 @@ // //===----------------------------------------------------------------------===// +#include "MIRVRegNamerUtils.h" #include "llvm/ADT/PostOrderIterator.h" #include "llvm/ADT/STLExtras.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/Passes.h" +#include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include @@ -71,28 +73,6 @@ public: } // end anonymous namespace -enum VRType { RSE_Reg = 0, RSE_FrameIndex, RSE_NewCandidate }; -class TypedVReg { - VRType type; - unsigned reg; - -public: - TypedVReg(unsigned reg) : type(RSE_Reg), reg(reg) {} - TypedVReg(VRType type) : type(type), reg(~0U) { - assert(type != RSE_Reg && "Expected a non-register type."); - } - - bool isReg() const { return type == RSE_Reg; } - bool isFrameIndex() const { return type == RSE_FrameIndex; } - bool isCandidate() const { return type == RSE_NewCandidate; } - - VRType getType() const { return type; } - unsigned getReg() const { - assert(this->isReg() && "Expected a virtual or 
physical register."); - return reg; - } -}; - char MIRCanonicalizer::ID; char &llvm::MIRCanonicalizerID = MIRCanonicalizer::ID; @@ -190,7 +170,7 @@ static bool rescheduleCanonically(unsigned &PseudoIdempotentInstCount, if (!MO.isReg()) continue; - if (TargetRegisterInfo::isVirtualRegister(MO.getReg())) + if (Register::isVirtualRegister(MO.getReg())) continue; if (!MO.isDef()) @@ -207,7 +187,7 @@ static bool rescheduleCanonically(unsigned &PseudoIdempotentInstCount, continue; MachineOperand &MO = II->getOperand(0); - if (!MO.isReg() || !TargetRegisterInfo::isVirtualRegister(MO.getReg())) + if (!MO.isReg() || !Register::isVirtualRegister(MO.getReg())) continue; if (!MO.isDef()) continue; @@ -220,7 +200,7 @@ static bool rescheduleCanonically(unsigned &PseudoIdempotentInstCount, } if (II->getOperand(i).isReg()) { - if (!TargetRegisterInfo::isVirtualRegister(II->getOperand(i).getReg())) + if (!Register::isVirtualRegister(II->getOperand(i).getReg())) if (llvm::find(PhysRegDefs, II->getOperand(i).getReg()) == PhysRegDefs.end()) { continue; @@ -340,12 +320,12 @@ static bool propagateLocalCopies(MachineBasicBlock *MBB) { if (!MI->getOperand(1).isReg()) continue; - const unsigned Dst = MI->getOperand(0).getReg(); - const unsigned Src = MI->getOperand(1).getReg(); + const Register Dst = MI->getOperand(0).getReg(); + const Register Src = MI->getOperand(1).getReg(); - if (!TargetRegisterInfo::isVirtualRegister(Dst)) + if (!Register::isVirtualRegister(Dst)) continue; - if (!TargetRegisterInfo::isVirtualRegister(Src)) + if (!Register::isVirtualRegister(Src)) continue; // Not folding COPY instructions if regbankselect has not set the RCs. // Why are we only considering Register Classes? Because the verifier @@ -370,258 +350,6 @@ static bool propagateLocalCopies(MachineBasicBlock *MBB) { return Changed; } -/// Here we find our candidates. What makes an interesting candidate? -/// An candidate for a canonicalization tree root is normally any kind of -/// instruction that causes side effects such as a store to memory or a copy to -/// a physical register or a return instruction. We use these as an expression -/// tree root that we walk inorder to build a canonical walk which should result -/// in canoncal vreg renaming. 
-static std::vector populateCandidates(MachineBasicBlock *MBB) { - std::vector Candidates; - MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); - - for (auto II = MBB->begin(), IE = MBB->end(); II != IE; ++II) { - MachineInstr *MI = &*II; - - bool DoesMISideEffect = false; - - if (MI->getNumOperands() > 0 && MI->getOperand(0).isReg()) { - const unsigned Dst = MI->getOperand(0).getReg(); - DoesMISideEffect |= !TargetRegisterInfo::isVirtualRegister(Dst); - - for (auto UI = MRI.use_begin(Dst); UI != MRI.use_end(); ++UI) { - if (DoesMISideEffect) - break; - DoesMISideEffect |= (UI->getParent()->getParent() != MI->getParent()); - } - } - - if (!MI->mayStore() && !MI->isBranch() && !DoesMISideEffect) - continue; - - LLVM_DEBUG(dbgs() << "Found Candidate: "; MI->dump();); - Candidates.push_back(MI); - } - - return Candidates; -} - -static void doCandidateWalk(std::vector &VRegs, - std::queue &RegQueue, - std::vector &VisitedMIs, - const MachineBasicBlock *MBB) { - - const MachineFunction &MF = *MBB->getParent(); - const MachineRegisterInfo &MRI = MF.getRegInfo(); - - while (!RegQueue.empty()) { - - auto TReg = RegQueue.front(); - RegQueue.pop(); - - if (TReg.isFrameIndex()) { - LLVM_DEBUG(dbgs() << "Popping frame index.\n";); - VRegs.push_back(TypedVReg(RSE_FrameIndex)); - continue; - } - - assert(TReg.isReg() && "Expected vreg or physreg."); - unsigned Reg = TReg.getReg(); - - if (TargetRegisterInfo::isVirtualRegister(Reg)) { - LLVM_DEBUG({ - dbgs() << "Popping vreg "; - MRI.def_begin(Reg)->dump(); - dbgs() << "\n"; - }); - - if (!llvm::any_of(VRegs, [&](const TypedVReg &TR) { - return TR.isReg() && TR.getReg() == Reg; - })) { - VRegs.push_back(TypedVReg(Reg)); - } - } else { - LLVM_DEBUG(dbgs() << "Popping physreg.\n";); - VRegs.push_back(TypedVReg(Reg)); - continue; - } - - for (auto RI = MRI.def_begin(Reg), RE = MRI.def_end(); RI != RE; ++RI) { - MachineInstr *Def = RI->getParent(); - - if (Def->getParent() != MBB) - continue; - - if (llvm::any_of(VisitedMIs, - [&](const MachineInstr *VMI) { return Def == VMI; })) { - break; - } - - LLVM_DEBUG({ - dbgs() << "\n========================\n"; - dbgs() << "Visited MI: "; - Def->dump(); - dbgs() << "BB Name: " << Def->getParent()->getName() << "\n"; - dbgs() << "\n========================\n"; - }); - VisitedMIs.push_back(Def); - for (unsigned I = 1, E = Def->getNumOperands(); I != E; ++I) { - - MachineOperand &MO = Def->getOperand(I); - if (MO.isFI()) { - LLVM_DEBUG(dbgs() << "Pushing frame index.\n";); - RegQueue.push(TypedVReg(RSE_FrameIndex)); - } - - if (!MO.isReg()) - continue; - RegQueue.push(TypedVReg(MO.getReg())); - } - } - } -} - -namespace { -class NamedVRegCursor { - MachineRegisterInfo &MRI; - unsigned virtualVRegNumber; - -public: - NamedVRegCursor(MachineRegisterInfo &MRI) : MRI(MRI), virtualVRegNumber(0) {} - - void SkipVRegs() { - unsigned VRegGapIndex = 1; - if (!virtualVRegNumber) { - VRegGapIndex = 0; - virtualVRegNumber = MRI.createIncompleteVirtualRegister(); - } - const unsigned VR_GAP = (++VRegGapIndex * 1000); - - unsigned I = virtualVRegNumber; - const unsigned E = (((I + VR_GAP) / VR_GAP) + 1) * VR_GAP; - - virtualVRegNumber = E; - } - - unsigned getVirtualVReg() const { return virtualVRegNumber; } - - unsigned incrementVirtualVReg(unsigned incr = 1) { - virtualVRegNumber += incr; - return virtualVRegNumber; - } - - unsigned createVirtualRegister(unsigned VReg) { - if (!virtualVRegNumber) - SkipVRegs(); - std::string S; - raw_string_ostream OS(S); - OS << "namedVReg" << (virtualVRegNumber & ~0x80000000); - 
OS.flush(); - virtualVRegNumber++; - if (auto RC = MRI.getRegClassOrNull(VReg)) - return MRI.createVirtualRegister(RC, OS.str()); - return MRI.createGenericVirtualRegister(MRI.getType(VReg), OS.str()); - } -}; -} // namespace - -static std::map -GetVRegRenameMap(const std::vector &VRegs, - const std::vector &renamedInOtherBB, - MachineRegisterInfo &MRI, NamedVRegCursor &NVC) { - std::map VRegRenameMap; - bool FirstCandidate = true; - - for (auto &vreg : VRegs) { - if (vreg.isFrameIndex()) { - // We skip one vreg for any frame index because there is a good chance - // (especially when comparing SelectionDAG to GlobalISel generated MIR) - // that in the other file we are just getting an incoming vreg that comes - // from a copy from a frame index. So it's safe to skip by one. - unsigned LastRenameReg = NVC.incrementVirtualVReg(); - (void)LastRenameReg; - LLVM_DEBUG(dbgs() << "Skipping rename for FI " << LastRenameReg << "\n";); - continue; - } else if (vreg.isCandidate()) { - - // After the first candidate, for every subsequent candidate, we skip mod - // 10 registers so that the candidates are more likely to start at the - // same vreg number making it more likely that the canonical walk from the - // candidate insruction. We don't need to skip from the first candidate of - // the BasicBlock because we already skip ahead several vregs for each BB. - unsigned LastRenameReg = NVC.getVirtualVReg(); - if (FirstCandidate) - NVC.incrementVirtualVReg(LastRenameReg % 10); - FirstCandidate = false; - continue; - } else if (!TargetRegisterInfo::isVirtualRegister(vreg.getReg())) { - unsigned LastRenameReg = NVC.incrementVirtualVReg(); - (void)LastRenameReg; - LLVM_DEBUG({ - dbgs() << "Skipping rename for Phys Reg " << LastRenameReg << "\n"; - }); - continue; - } - - auto Reg = vreg.getReg(); - if (llvm::find(renamedInOtherBB, Reg) != renamedInOtherBB.end()) { - LLVM_DEBUG(dbgs() << "Vreg " << Reg - << " already renamed in other BB.\n";); - continue; - } - - auto Rename = NVC.createVirtualRegister(Reg); - - if (VRegRenameMap.find(Reg) == VRegRenameMap.end()) { - LLVM_DEBUG(dbgs() << "Mapping vreg ";); - if (MRI.reg_begin(Reg) != MRI.reg_end()) { - LLVM_DEBUG(auto foo = &*MRI.reg_begin(Reg); foo->dump();); - } else { - LLVM_DEBUG(dbgs() << Reg;); - } - LLVM_DEBUG(dbgs() << " to ";); - if (MRI.reg_begin(Rename) != MRI.reg_end()) { - LLVM_DEBUG(auto foo = &*MRI.reg_begin(Rename); foo->dump();); - } else { - LLVM_DEBUG(dbgs() << Rename;); - } - LLVM_DEBUG(dbgs() << "\n";); - - VRegRenameMap.insert(std::pair(Reg, Rename)); - } - } - - return VRegRenameMap; -} - -static bool doVRegRenaming(std::vector &RenamedInOtherBB, - const std::map &VRegRenameMap, - MachineRegisterInfo &MRI) { - bool Changed = false; - for (auto I = VRegRenameMap.begin(), E = VRegRenameMap.end(); I != E; ++I) { - - auto VReg = I->first; - auto Rename = I->second; - - RenamedInOtherBB.push_back(Rename); - - std::vector RenameMOs; - for (auto &MO : MRI.reg_operands(VReg)) { - RenameMOs.push_back(&MO); - } - - for (auto *MO : RenameMOs) { - Changed = true; - MO->setReg(Rename); - - if (!MO->isDef()) - MO->setIsKill(false); - } - } - - return Changed; -} - static bool doDefKillClear(MachineBasicBlock *MBB) { bool Changed = false; @@ -646,9 +374,7 @@ static bool doDefKillClear(MachineBasicBlock *MBB) { static bool runOnBasicBlock(MachineBasicBlock *MBB, std::vector &bbNames, - std::vector &renamedInOtherBB, - unsigned &basicBlockNum, unsigned &VRegGapIndex, - NamedVRegCursor &NVC) { + unsigned &basicBlockNum, NamedVRegCursor &NVC) { if 
(CanonicalizeBasicBlockNumber != ~0U) { if (CanonicalizeBasicBlockNumber != basicBlockNum++) @@ -687,74 +413,20 @@ static bool runOnBasicBlock(MachineBasicBlock *MBB, Changed |= rescheduleCanonically(IdempotentInstCount, MBB); LLVM_DEBUG(dbgs() << "MBB After Scheduling:\n"; MBB->dump();); - std::vector Candidates = populateCandidates(MBB); - std::vector VisitedMIs; - llvm::copy(Candidates, std::back_inserter(VisitedMIs)); - - std::vector VRegs; - for (auto candidate : Candidates) { - VRegs.push_back(TypedVReg(RSE_NewCandidate)); - - std::queue RegQueue; - - // Here we walk the vreg operands of a non-root node along our walk. - // The root nodes are the original candidates (stores normally). - // These are normally not the root nodes (except for the case of copies to - // physical registers). - for (unsigned i = 1; i < candidate->getNumOperands(); i++) { - if (candidate->mayStore() || candidate->isBranch()) - break; - - MachineOperand &MO = candidate->getOperand(i); - if (!(MO.isReg() && TargetRegisterInfo::isVirtualRegister(MO.getReg()))) - continue; - - LLVM_DEBUG(dbgs() << "Enqueue register"; MO.dump(); dbgs() << "\n";); - RegQueue.push(TypedVReg(MO.getReg())); - } - - // Here we walk the root candidates. We start from the 0th operand because - // the root is normally a store to a vreg. - for (unsigned i = 0; i < candidate->getNumOperands(); i++) { - - if (!candidate->mayStore() && !candidate->isBranch()) - break; - - MachineOperand &MO = candidate->getOperand(i); - - // TODO: Do we want to only add vregs here? - if (!MO.isReg() && !MO.isFI()) - continue; - - LLVM_DEBUG(dbgs() << "Enqueue Reg/FI"; MO.dump(); dbgs() << "\n";); - - RegQueue.push(MO.isReg() ? TypedVReg(MO.getReg()) - : TypedVReg(RSE_FrameIndex)); - } - - doCandidateWalk(VRegs, RegQueue, VisitedMIs, MBB); - } - - // If we have populated no vregs to rename then bail. - // The rest of this function does the vreg remaping. - if (VRegs.size() == 0) - return Changed; - - auto VRegRenameMap = GetVRegRenameMap(VRegs, renamedInOtherBB, MRI, NVC); - Changed |= doVRegRenaming(renamedInOtherBB, VRegRenameMap, MRI); + Changed |= NVC.renameVRegs(MBB); // Here we renumber the def vregs for the idempotent instructions from the top // of the MachineBasicBlock so that they are named in the order that we sorted // them alphabetically. Eventually we wont need SkipVRegs because we will use // named vregs instead. 
if (IdempotentInstCount) - NVC.SkipVRegs(); + NVC.skipVRegs(); auto MII = MBB->begin(); for (unsigned i = 0; i < IdempotentInstCount && MII != MBB->end(); ++i) { MachineInstr &MI = *MII++; Changed = true; - unsigned vRegToRename = MI.getOperand(0).getReg(); + Register vRegToRename = MI.getOperand(0).getReg(); auto Rename = NVC.createVirtualRegister(vRegToRename); std::vector RenameMOs; @@ -799,9 +471,7 @@ bool MIRCanonicalizer::runOnMachineFunction(MachineFunction &MF) { << "\n\n================================================\n\n";); std::vector BBNames; - std::vector RenamedInOtherBB; - unsigned GapIdx = 0; unsigned BBNum = 0; bool Changed = false; @@ -809,8 +479,7 @@ bool MIRCanonicalizer::runOnMachineFunction(MachineFunction &MF) { MachineRegisterInfo &MRI = MF.getRegInfo(); NamedVRegCursor NVC(MRI); for (auto MBB : RPOList) - Changed |= - runOnBasicBlock(MBB, BBNames, RenamedInOtherBB, BBNum, GapIdx, NVC); + Changed |= runOnBasicBlock(MBB, BBNames, BBNum, NVC); return Changed; } diff --git a/lib/CodeGen/MIRNamerPass.cpp b/lib/CodeGen/MIRNamerPass.cpp new file mode 100644 index 000000000000..9d719f3917ce --- /dev/null +++ b/lib/CodeGen/MIRNamerPass.cpp @@ -0,0 +1,77 @@ +//===----------------------- MIRNamer.cpp - MIR Namer ---------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// The purpose of this pass is to rename virtual register operands with the goal +// of making it easier to author easier to read tests for MIR. This pass reuses +// the vreg renamer used by MIRCanonicalizerPass. 
+// +// Basic Usage: +// +// llc -o - -run-pass mir-namer example.mir +// +//===----------------------------------------------------------------------===// + +#include "MIRVRegNamerUtils.h" +#include "llvm/ADT/PostOrderIterator.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/Passes.h" + +using namespace llvm; + +namespace llvm { +extern char &MIRNamerID; +} // namespace llvm + +#define DEBUG_TYPE "mir-namer" + +namespace { + +class MIRNamer : public MachineFunctionPass { +public: + static char ID; + MIRNamer() : MachineFunctionPass(ID) {} + + StringRef getPassName() const override { + return "Rename virtual register operands"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); + } + + bool runOnMachineFunction(MachineFunction &MF) override { + bool Changed = false; + + if (MF.empty()) + return Changed; + + NamedVRegCursor NVC(MF.getRegInfo()); + + ReversePostOrderTraversal RPOT(&*MF.begin()); + for (auto &MBB : RPOT) + Changed |= NVC.renameVRegs(MBB); + + return Changed; + } +}; + +} // end anonymous namespace + +char MIRNamer::ID; + +char &llvm::MIRNamerID = MIRNamer::ID; + +INITIALIZE_PASS_BEGIN(MIRNamer, "mir-namer", "Rename Register Operands", false, + false) + +INITIALIZE_PASS_END(MIRNamer, "mir-namer", "Rename Register Operands", false, + false) diff --git a/lib/CodeGen/MIRParser/MILexer.cpp b/lib/CodeGen/MIRParser/MILexer.cpp index 4899bd3f5811..ad5c617623f2 100644 --- a/lib/CodeGen/MIRParser/MILexer.cpp +++ b/lib/CodeGen/MIRParser/MILexer.cpp @@ -249,6 +249,7 @@ static MIToken::TokenKind getIdentifierKind(StringRef Identifier) { .Case("successors", MIToken::kw_successors) .Case("floatpred", MIToken::kw_floatpred) .Case("intpred", MIToken::kw_intpred) + .Case("shufflemask", MIToken::kw_shufflemask) .Case("pre-instr-symbol", MIToken::kw_pre_instr_symbol) .Case("post-instr-symbol", MIToken::kw_post_instr_symbol) .Case("unknown-size", MIToken::kw_unknown_size) diff --git a/lib/CodeGen/MIRParser/MILexer.h b/lib/CodeGen/MIRParser/MILexer.h index 0fe3f9f706db..200f9d026cc8 100644 --- a/lib/CodeGen/MIRParser/MILexer.h +++ b/lib/CodeGen/MIRParser/MILexer.h @@ -117,6 +117,7 @@ struct MIToken { kw_successors, kw_floatpred, kw_intpred, + kw_shufflemask, kw_pre_instr_symbol, kw_post_instr_symbol, kw_unknown_size, @@ -146,6 +147,7 @@ struct MIToken { IntegerLiteral, FloatingPointLiteral, HexLiteral, + VectorLiteral, VirtualRegister, ConstantPoolItem, JumpTableIndex, diff --git a/lib/CodeGen/MIRParser/MIParser.cpp b/lib/CodeGen/MIRParser/MIParser.cpp index c0b800a0b870..6498acc9fa51 100644 --- a/lib/CodeGen/MIRParser/MIParser.cpp +++ b/lib/CodeGen/MIRParser/MIParser.cpp @@ -451,6 +451,7 @@ public: bool parseBlockAddressOperand(MachineOperand &Dest); bool parseIntrinsicOperand(MachineOperand &Dest); bool parsePredicateOperand(MachineOperand &Dest); + bool parseShuffleMaskOperand(MachineOperand &Dest); bool parseTargetIndexOperand(MachineOperand &Dest); bool parseCustomRegisterMaskOperand(MachineOperand &Dest); bool parseLiveoutRegisterMaskOperand(MachineOperand &Dest); @@ -640,7 +641,7 @@ bool MIParser::parseBasicBlockDefinition( return error(Loc, Twine("redefinition of machine basic block with id #") + Twine(ID)); if (Alignment) - MBB->setAlignment(Alignment); + MBB->setAlignment(Align(Alignment)); if (HasAddressTaken) MBB->setHasAddressTaken(); 
MBB->setIsEHPad(IsLandingPad); @@ -1078,7 +1079,7 @@ static const char *printImplicitRegisterFlag(const MachineOperand &MO) { static std::string getRegisterName(const TargetRegisterInfo *TRI, unsigned Reg) { - assert(TargetRegisterInfo::isPhysicalRegister(Reg) && "expected phys reg"); + assert(Register::isPhysicalRegister(Reg) && "expected phys reg"); return StringRef(TRI->getName(Reg)).lower(); } @@ -1408,11 +1409,11 @@ bool MIParser::parseRegisterOperand(MachineOperand &Dest, if (Token.is(MIToken::dot)) { if (parseSubRegisterIndex(SubReg)) return true; - if (!TargetRegisterInfo::isVirtualRegister(Reg)) + if (!Register::isVirtualRegister(Reg)) return error("subregister index expects a virtual register"); } if (Token.is(MIToken::colon)) { - if (!TargetRegisterInfo::isVirtualRegister(Reg)) + if (!Register::isVirtualRegister(Reg)) return error("register class specification expects a virtual register"); lex(); if (parseRegisterClassOrBank(*RegInfo)) @@ -1436,12 +1437,13 @@ bool MIParser::parseRegisterOperand(MachineOperand &Dest, if (MRI.getType(Reg).isValid() && MRI.getType(Reg) != Ty) return error("inconsistent type for generic virtual register"); + MRI.setRegClassOrRegBank(Reg, static_cast(nullptr)); MRI.setType(Reg, Ty); } } } else if (consumeIfPresent(MIToken::lparen)) { // Virtual registers may have a tpe with GlobalISel. - if (!TargetRegisterInfo::isVirtualRegister(Reg)) + if (!Register::isVirtualRegister(Reg)) return error("unexpected type on physical register"); LLT Ty; @@ -1454,8 +1456,9 @@ bool MIParser::parseRegisterOperand(MachineOperand &Dest, if (MRI.getType(Reg).isValid() && MRI.getType(Reg) != Ty) return error("inconsistent type for generic virtual register"); + MRI.setRegClassOrRegBank(Reg, static_cast(nullptr)); MRI.setType(Reg, Ty); - } else if (TargetRegisterInfo::isVirtualRegister(Reg)) { + } else if (Register::isVirtualRegister(Reg)) { // Generic virtual registers must have a type. // If we end up here this means the type hasn't been specified and // this is bad! @@ -2285,6 +2288,49 @@ bool MIParser::parsePredicateOperand(MachineOperand &Dest) { return false; } +bool MIParser::parseShuffleMaskOperand(MachineOperand &Dest) { + assert(Token.is(MIToken::kw_shufflemask)); + + lex(); + if (expectAndConsume(MIToken::lparen)) + return error("expected syntax shufflemask(, ...)"); + + SmallVector ShufMask; + LLVMContext &Ctx = MF.getFunction().getContext(); + Type *I32Ty = Type::getInt32Ty(Ctx); + + bool AllZero = true; + bool AllUndef = true; + + do { + if (Token.is(MIToken::kw_undef)) { + ShufMask.push_back(UndefValue::get(I32Ty)); + AllZero = false; + } else if (Token.is(MIToken::IntegerLiteral)) { + AllUndef = false; + const APSInt &Int = Token.integerValue(); + if (!Int.isNullValue()) + AllZero = false; + ShufMask.push_back(ConstantInt::get(I32Ty, Int.getExtValue())); + } else + return error("expected integer constant"); + + lex(); + } while (consumeIfPresent(MIToken::comma)); + + if (expectAndConsume(MIToken::rparen)) + return error("shufflemask should be terminated by ')'."); + + if (AllZero || AllUndef) { + VectorType *VT = VectorType::get(I32Ty, ShufMask.size()); + Constant *C = AllZero ? 
Constant::getNullValue(VT) : UndefValue::get(VT); + Dest = MachineOperand::CreateShuffleMask(C); + } else + Dest = MachineOperand::CreateShuffleMask(ConstantVector::get(ShufMask)); + + return false; +} + bool MIParser::parseTargetIndexOperand(MachineOperand &Dest) { assert(Token.is(MIToken::kw_target_index)); lex(); @@ -2432,6 +2478,8 @@ bool MIParser::parseMachineOperand(MachineOperand &Dest, case MIToken::kw_floatpred: case MIToken::kw_intpred: return parsePredicateOperand(Dest); + case MIToken::kw_shufflemask: + return parseShuffleMaskOperand(Dest); case MIToken::Error: return true; case MIToken::Identifier: diff --git a/lib/CodeGen/MIRParser/MIRParser.cpp b/lib/CodeGen/MIRParser/MIRParser.cpp index b242934def80..55fac93d8991 100644 --- a/lib/CodeGen/MIRParser/MIRParser.cpp +++ b/lib/CodeGen/MIRParser/MIRParser.cpp @@ -216,7 +216,7 @@ std::unique_ptr MIRParserImpl::parseIRModule() { return nullptr; // Create an empty module when the MIR file is empty. NoMIRDocuments = true; - return llvm::make_unique(Filename, Context); + return std::make_unique(Filename, Context); } std::unique_ptr M; @@ -236,7 +236,7 @@ std::unique_ptr MIRParserImpl::parseIRModule() { NoMIRDocuments = true; } else { // Create an new, empty module. - M = llvm::make_unique(Filename, Context); + M = std::make_unique(Filename, Context); NoLLVMIR = true; } return M; @@ -306,7 +306,7 @@ bool MIRParserImpl::parseMachineFunction(Module &M, MachineModuleInfo &MMI) { static bool isSSA(const MachineFunction &MF) { const MachineRegisterInfo &MRI = MF.getRegInfo(); for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) { - unsigned Reg = TargetRegisterInfo::index2VirtReg(I); + unsigned Reg = Register::index2VirtReg(I); if (!MRI.hasOneDef(Reg) && !MRI.def_empty(Reg)) return false; } @@ -355,10 +355,10 @@ bool MIRParserImpl::initializeCallSiteInfo( if (MILoc.Offset >= CallB->size()) return error(Twine(MF.getName()) + Twine(" call instruction offset out of range.") + - "Unable to reference instruction at bb: " + + " Unable to reference instruction at bb: " + Twine(MILoc.BlockNum) + " at offset:" + Twine(MILoc.Offset)); - auto CallI = std::next(CallB->begin(), MILoc.Offset); - if (!CallI->isCall()) + auto CallI = std::next(CallB->instr_begin(), MILoc.Offset); + if (!CallI->isCall(MachineInstr::IgnoreBundle)) return error(Twine(MF.getName()) + Twine(" call site info should reference call " "instruction. 
Instruction at bb:") + @@ -393,7 +393,7 @@ MIRParserImpl::initializeMachineFunction(const yaml::MachineFunction &YamlMF, } if (YamlMF.Alignment) - MF.setAlignment(YamlMF.Alignment); + MF.setAlignment(Align(YamlMF.Alignment)); MF.setExposesReturnsTwice(YamlMF.ExposesReturnsTwice); MF.setHasWinCFI(YamlMF.HasWinCFI); @@ -949,6 +949,6 @@ llvm::createMIRParser(std::unique_ptr Contents, "Can't read MIR with a Context that discards named Values"))); return nullptr; } - return llvm::make_unique( - llvm::make_unique(std::move(Contents), Filename, Context)); + return std::make_unique( + std::make_unique(std::move(Contents), Filename, Context)); } diff --git a/lib/CodeGen/MIRPrinter.cpp b/lib/CodeGen/MIRPrinter.cpp index 0a95a0ced0f5..1a4e21ac06a9 100644 --- a/lib/CodeGen/MIRPrinter.cpp +++ b/lib/CodeGen/MIRPrinter.cpp @@ -197,7 +197,7 @@ void MIRPrinter::print(const MachineFunction &MF) { yaml::MachineFunction YamlMF; YamlMF.Name = MF.getName(); - YamlMF.Alignment = MF.getAlignment(); + YamlMF.Alignment = MF.getAlignment().value(); YamlMF.ExposesReturnsTwice = MF.exposesReturnsTwice(); YamlMF.HasWinCFI = MF.hasWinCFI(); @@ -290,7 +290,7 @@ void MIRPrinter::convert(yaml::MachineFunction &MF, // Print the virtual register definitions. for (unsigned I = 0, E = RegInfo.getNumVirtRegs(); I < E; ++I) { - unsigned Reg = TargetRegisterInfo::index2VirtReg(I); + unsigned Reg = Register::index2VirtReg(I); yaml::VirtualRegisterDefinition VReg; VReg.ID = I; if (RegInfo.getVRegName(Reg) != "") @@ -473,10 +473,11 @@ void MIRPrinter::convertCallSiteObjects(yaml::MachineFunction &YMF, yaml::CallSiteInfo::MachineInstrLoc CallLocation; // Prepare instruction position. - MachineBasicBlock::const_iterator CallI = CSInfo.first->getIterator(); + MachineBasicBlock::const_instr_iterator CallI = CSInfo.first->getIterator(); CallLocation.BlockNum = CallI->getParent()->getNumber(); // Get call instruction offset from the beginning of block. - CallLocation.Offset = std::distance(CallI->getParent()->begin(), CallI); + CallLocation.Offset = + std::distance(CallI->getParent()->instr_begin(), CallI); YmlCS.CallLocation = CallLocation; // Construct call arguments and theirs forwarding register info. for (auto ArgReg : CSInfo.second) { @@ -628,9 +629,9 @@ void MIPrinter::print(const MachineBasicBlock &MBB) { OS << "landing-pad"; HasAttributes = true; } - if (MBB.getAlignment()) { + if (MBB.getAlignment() != Align::None()) { OS << (HasAttributes ? ", " : " ("); - OS << "align " << MBB.getAlignment(); + OS << "align " << MBB.getAlignment().value(); HasAttributes = true; } if (HasAttributes) @@ -842,7 +843,8 @@ void MIPrinter::print(const MachineInstr &MI, unsigned OpIdx, case MachineOperand::MO_CFIIndex: case MachineOperand::MO_IntrinsicID: case MachineOperand::MO_Predicate: - case MachineOperand::MO_BlockAddress: { + case MachineOperand::MO_BlockAddress: + case MachineOperand::MO_ShuffleMask: { unsigned TiedOperandIdx = 0; if (ShouldPrintRegisterTies && Op.isReg() && Op.isTied() && !Op.isDef()) TiedOperandIdx = Op.getParent()->findTiedOperandIdx(OpIdx); diff --git a/lib/CodeGen/MIRVRegNamerUtils.cpp b/lib/CodeGen/MIRVRegNamerUtils.cpp new file mode 100644 index 000000000000..6629000f468f --- /dev/null +++ b/lib/CodeGen/MIRVRegNamerUtils.cpp @@ -0,0 +1,348 @@ +//===---------- MIRVRegNamerUtils.cpp - MIR VReg Renaming Utilities -------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "MIRVRegNamerUtils.h" +#include "llvm/Support/Debug.h" + +using namespace llvm; + +#define DEBUG_TYPE "mir-vregnamer-utils" + +namespace { + +// TypedVReg and VRType are used to tell the renamer what to do at points in a +// sequence of values to be renamed. A TypedVReg can either contain +// an actual VReg, a FrameIndex, or it could just be a barrier for the next +// candidate (side-effecting instruction). This tells the renamer to increment +// to the next vreg name, or to skip modulo some skip-gap value. +enum VRType { RSE_Reg = 0, RSE_FrameIndex, RSE_NewCandidate }; +class TypedVReg { + VRType Type; + Register Reg; + +public: + TypedVReg(Register Reg) : Type(RSE_Reg), Reg(Reg) {} + TypedVReg(VRType Type) : Type(Type), Reg(~0U) { + assert(Type != RSE_Reg && "Expected a non-Register Type."); + } + + bool isReg() const { return Type == RSE_Reg; } + bool isFrameIndex() const { return Type == RSE_FrameIndex; } + bool isCandidate() const { return Type == RSE_NewCandidate; } + + VRType getType() const { return Type; } + Register getReg() const { + assert(this->isReg() && "Expected a virtual or physical Register."); + return Reg; + } +}; + +/// Here we find our candidates. What makes an interesting candidate? +/// A candidate for a canonicalization tree root is normally any kind of +/// instruction that causes side effects such as a store to memory or a copy to +/// a physical register or a return instruction. We use these as an expression +/// tree root that we walk in order to build a canonical walk which should +/// result in canonical vreg renaming. +std::vector populateCandidates(MachineBasicBlock *MBB) { + std::vector Candidates; + MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); + + for (auto II = MBB->begin(), IE = MBB->end(); II != IE; ++II) { + MachineInstr *MI = &*II; + + bool DoesMISideEffect = false; + + if (MI->getNumOperands() > 0 && MI->getOperand(0).isReg()) { + const Register Dst = MI->getOperand(0).getReg(); + DoesMISideEffect |= !Register::isVirtualRegister(Dst); + + for (auto UI = MRI.use_begin(Dst); UI != MRI.use_end(); ++UI) { + if (DoesMISideEffect) + break; + DoesMISideEffect |= (UI->getParent()->getParent() != MI->getParent()); + } + } + + if (!MI->mayStore() && !MI->isBranch() && !DoesMISideEffect) + continue; + + LLVM_DEBUG(dbgs() << "Found Candidate: "; MI->dump();); + Candidates.push_back(MI); + } + + return Candidates; +} + +void doCandidateWalk(std::vector &VRegs, + std::queue &RegQueue, + std::vector &VisitedMIs, + const MachineBasicBlock *MBB) { + + const MachineFunction &MF = *MBB->getParent(); + const MachineRegisterInfo &MRI = MF.getRegInfo(); + + while (!RegQueue.empty()) { + + auto TReg = RegQueue.front(); + RegQueue.pop(); + + if (TReg.isFrameIndex()) { + LLVM_DEBUG(dbgs() << "Popping frame index.\n";); + VRegs.push_back(TypedVReg(RSE_FrameIndex)); + continue; + } + + assert(TReg.isReg() && "Expected vreg or physreg."); + Register Reg = TReg.getReg(); + + if (Register::isVirtualRegister(Reg)) { + LLVM_DEBUG({ + dbgs() << "Popping vreg "; + MRI.def_begin(Reg)->dump(); + dbgs() << "\n"; + }); + + if (!llvm::any_of(VRegs, [&](const TypedVReg &TR) { + return TR.isReg() && TR.getReg() == Reg; + })) { + VRegs.push_back(TypedVReg(Reg)); + } + } else { + LLVM_DEBUG(dbgs() << "Popping physreg.\n";); + VRegs.push_back(TypedVReg(Reg)); + continue; + } + + for (auto RI = MRI.def_begin(Reg), RE = 
MRI.def_end(); RI != RE; ++RI) { + MachineInstr *Def = RI->getParent(); + + if (Def->getParent() != MBB) + continue; + + if (llvm::any_of(VisitedMIs, + [&](const MachineInstr *VMI) { return Def == VMI; })) { + break; + } + + LLVM_DEBUG({ + dbgs() << "\n========================\n"; + dbgs() << "Visited MI: "; + Def->dump(); + dbgs() << "BB Name: " << Def->getParent()->getName() << "\n"; + dbgs() << "\n========================\n"; + }); + VisitedMIs.push_back(Def); + for (unsigned I = 1, E = Def->getNumOperands(); I != E; ++I) { + + MachineOperand &MO = Def->getOperand(I); + if (MO.isFI()) { + LLVM_DEBUG(dbgs() << "Pushing frame index.\n";); + RegQueue.push(TypedVReg(RSE_FrameIndex)); + } + + if (!MO.isReg()) + continue; + RegQueue.push(TypedVReg(MO.getReg())); + } + } + } +} + +std::map +getVRegRenameMap(const std::vector &VRegs, + const std::vector &renamedInOtherBB, + MachineRegisterInfo &MRI, NamedVRegCursor &NVC) { + std::map VRegRenameMap; + bool FirstCandidate = true; + + for (auto &vreg : VRegs) { + if (vreg.isFrameIndex()) { + // We skip one vreg for any frame index because there is a good chance + // (especially when comparing SelectionDAG to GlobalISel generated MIR) + // that in the other file we are just getting an incoming vreg that comes + // from a copy from a frame index. So it's safe to skip by one. + unsigned LastRenameReg = NVC.incrementVirtualVReg(); + (void)LastRenameReg; + LLVM_DEBUG(dbgs() << "Skipping rename for FI " << LastRenameReg << "\n";); + continue; + } else if (vreg.isCandidate()) { + + // After the first candidate, for every subsequent candidate, we skip mod + // 10 registers so that the candidates are more likely to start at the + // same vreg number making it more likely that the canonical walk from the + // candidate insruction. We don't need to skip from the first candidate of + // the BasicBlock because we already skip ahead several vregs for each BB. 
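[Toy illustration, not part of the patch; the helper name simulateCursorSkip is invented. Per the comments and code in getVRegRenameMap(), the cursor bookkeeping amounts to: burn one name for every frame index or physical register, and pad the cursor by cursor % 10 when the first candidate of a block is reached, so two compilations of the same program are more likely to land on matching names.]

// Models only the "skip" cases of getVRegRenameMap(); actual virtual registers
// instead consume one name via NamedVRegCursor::createVirtualRegister().
static unsigned simulateCursorSkip(unsigned Cursor, bool IsFrameIndex,
                                   bool IsPhysReg, bool IsFirstCandidate) {
  if (IsFrameIndex || IsPhysReg)
    return Cursor + 1;             // reserve one name, rename nothing
  if (IsFirstCandidate)
    return Cursor + (Cursor % 10); // pad: incrementVirtualVReg(Cursor % 10)
  return Cursor;                   // subsequent candidates leave the cursor alone
}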
+ unsigned LastRenameReg = NVC.getVirtualVReg(); + if (FirstCandidate) + NVC.incrementVirtualVReg(LastRenameReg % 10); + FirstCandidate = false; + continue; + } else if (!Register::isVirtualRegister(vreg.getReg())) { + unsigned LastRenameReg = NVC.incrementVirtualVReg(); + (void)LastRenameReg; + LLVM_DEBUG({ + dbgs() << "Skipping rename for Phys Reg " << LastRenameReg << "\n"; + }); + continue; + } + + auto Reg = vreg.getReg(); + if (llvm::find(renamedInOtherBB, Reg) != renamedInOtherBB.end()) { + LLVM_DEBUG(dbgs() << "Vreg " << Reg + << " already renamed in other BB.\n";); + continue; + } + + auto Rename = NVC.createVirtualRegister(Reg); + + if (VRegRenameMap.find(Reg) == VRegRenameMap.end()) { + LLVM_DEBUG(dbgs() << "Mapping vreg ";); + if (MRI.reg_begin(Reg) != MRI.reg_end()) { + LLVM_DEBUG(auto foo = &*MRI.reg_begin(Reg); foo->dump();); + } else { + LLVM_DEBUG(dbgs() << Reg;); + } + LLVM_DEBUG(dbgs() << " to ";); + if (MRI.reg_begin(Rename) != MRI.reg_end()) { + LLVM_DEBUG(auto foo = &*MRI.reg_begin(Rename); foo->dump();); + } else { + LLVM_DEBUG(dbgs() << Rename;); + } + LLVM_DEBUG(dbgs() << "\n";); + + VRegRenameMap.insert(std::pair(Reg, Rename)); + } + } + + return VRegRenameMap; +} + +bool doVRegRenaming(std::vector &renamedInOtherBB, + const std::map &VRegRenameMap, + MachineRegisterInfo &MRI) { + bool Changed = false; + for (auto I = VRegRenameMap.begin(), E = VRegRenameMap.end(); I != E; ++I) { + + auto VReg = I->first; + auto Rename = I->second; + + renamedInOtherBB.push_back(Rename); + + std::vector RenameMOs; + for (auto &MO : MRI.reg_operands(VReg)) { + RenameMOs.push_back(&MO); + } + + for (auto *MO : RenameMOs) { + Changed = true; + MO->setReg(Rename); + + if (!MO->isDef()) + MO->setIsKill(false); + } + } + + return Changed; +} + +bool renameVRegs(MachineBasicBlock *MBB, + std::vector &renamedInOtherBB, + NamedVRegCursor &NVC) { + bool Changed = false; + MachineFunction &MF = *MBB->getParent(); + MachineRegisterInfo &MRI = MF.getRegInfo(); + + std::vector Candidates = populateCandidates(MBB); + std::vector VisitedMIs; + llvm::copy(Candidates, std::back_inserter(VisitedMIs)); + + std::vector VRegs; + for (auto candidate : Candidates) { + VRegs.push_back(TypedVReg(RSE_NewCandidate)); + + std::queue RegQueue; + + // Here we walk the vreg operands of a non-root node along our walk. + // The root nodes are the original candidates (stores normally). + // These are normally not the root nodes (except for the case of copies to + // physical registers). + for (unsigned i = 1; i < candidate->getNumOperands(); i++) { + if (candidate->mayStore() || candidate->isBranch()) + break; + + MachineOperand &MO = candidate->getOperand(i); + if (!(MO.isReg() && Register::isVirtualRegister(MO.getReg()))) + continue; + + LLVM_DEBUG(dbgs() << "Enqueue register"; MO.dump(); dbgs() << "\n";); + RegQueue.push(TypedVReg(MO.getReg())); + } + + // Here we walk the root candidates. We start from the 0th operand because + // the root is normally a store to a vreg. + for (unsigned i = 0; i < candidate->getNumOperands(); i++) { + + if (!candidate->mayStore() && !candidate->isBranch()) + break; + + MachineOperand &MO = candidate->getOperand(i); + + // TODO: Do we want to only add vregs here? + if (!MO.isReg() && !MO.isFI()) + continue; + + LLVM_DEBUG(dbgs() << "Enqueue Reg/FI"; MO.dump(); dbgs() << "\n";); + + RegQueue.push(MO.isReg() ? 
TypedVReg(MO.getReg()) + : TypedVReg(RSE_FrameIndex)); + } + + doCandidateWalk(VRegs, RegQueue, VisitedMIs, MBB); + } + + // If we have populated no vregs to rename then bail. + // The rest of this function does the vreg remaping. + if (VRegs.size() == 0) + return Changed; + + auto VRegRenameMap = getVRegRenameMap(VRegs, renamedInOtherBB, MRI, NVC); + Changed |= doVRegRenaming(renamedInOtherBB, VRegRenameMap, MRI); + return Changed; +} +} // anonymous namespace + +void NamedVRegCursor::skipVRegs() { + unsigned VRegGapIndex = 1; + if (!virtualVRegNumber) { + VRegGapIndex = 0; + virtualVRegNumber = MRI.createIncompleteVirtualRegister(); + } + const unsigned VR_GAP = (++VRegGapIndex * SkipGapSize); + + unsigned I = virtualVRegNumber; + const unsigned E = (((I + VR_GAP) / VR_GAP) + 1) * VR_GAP; + + virtualVRegNumber = E; +} + +unsigned NamedVRegCursor::createVirtualRegister(unsigned VReg) { + if (!virtualVRegNumber) + skipVRegs(); + std::string S; + raw_string_ostream OS(S); + OS << "namedVReg" << (virtualVRegNumber & ~0x80000000); + OS.flush(); + virtualVRegNumber++; + if (auto RC = MRI.getRegClassOrNull(VReg)) + return MRI.createVirtualRegister(RC, OS.str()); + return MRI.createGenericVirtualRegister(MRI.getType(VReg), OS.str()); +} + +bool NamedVRegCursor::renameVRegs(MachineBasicBlock *MBB) { + return ::renameVRegs(MBB, RenamedInOtherBB, *this); +} diff --git a/lib/CodeGen/MIRVRegNamerUtils.h b/lib/CodeGen/MIRVRegNamerUtils.h new file mode 100644 index 000000000000..c5b52a968538 --- /dev/null +++ b/lib/CodeGen/MIRVRegNamerUtils.h @@ -0,0 +1,91 @@ + +//===------------ MIRVRegNamerUtils.h - MIR VReg Renaming Utilities -------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// The purpose of these utilities is to abstract out parts of the MIRCanon pass +// that are responsible for renaming virtual registers with the purpose of +// sharing code with a MIRVRegNamer pass that could be the analog of the +// opt -instnamer pass. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_CODEGEN_MIRVREGNAMERUTILS_H +#define LLVM_LIB_CODEGEN_MIRVREGNAMERUTILS_H + +#include "llvm/ADT/PostOrderIterator.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/Support/raw_ostream.h" + +#include + +namespace llvm { + +/// NamedVRegCursor - The cursor is an object that keeps track of what the next +/// vreg name should be. It does book keeping to determine when to skip the +/// index value and by how much, or if the next vreg name should be an increment +/// from the previous. +class NamedVRegCursor { + MachineRegisterInfo &MRI; + + /// virtualVRegNumber - Book keeping of the last vreg position. + unsigned virtualVRegNumber; + + /// SkipGapSize - Used to calculate a modulo amount to skip by after every + /// sequence of instructions starting from a given side-effecting + /// MachineInstruction for a given MachineBasicBlock. 
The general idea is that + /// for a given program compiled with two different opt pipelines, there + /// shouldn't be greater than SkipGapSize difference in how many vregs are in + /// play between the two and for every def-use graph of vregs we rename we + /// will round up to the next SkipGapSize'th number so that we have a high + /// change of landing on the same name for two given matching side-effects + /// for the two compilation outcomes. + const unsigned SkipGapSize; + + /// RenamedInOtherBB - VRegs that we already renamed: ie breadcrumbs. + std::vector RenamedInOtherBB; + +public: + NamedVRegCursor() = delete; + /// 1000 for the SkipGapSize was a good heuristic at the time of the writing + /// of the MIRCanonicalizerPass. Adjust as needed. + NamedVRegCursor(MachineRegisterInfo &MRI, unsigned SkipGapSize = 1000) + : MRI(MRI), virtualVRegNumber(0), SkipGapSize(SkipGapSize) {} + + /// SkipGapSize - Skips modulo a gap value of indices. Indices are used to + /// produce the next vreg name. + void skipVRegs(); + + unsigned getVirtualVReg() const { return virtualVRegNumber; } + + /// incrementVirtualVReg - This increments an index value that us used to + /// create a new vreg name. This is not a Register. + unsigned incrementVirtualVReg(unsigned incr = 1) { + virtualVRegNumber += incr; + return virtualVRegNumber; + } + + /// createVirtualRegister - Given an existing vreg, create a named vreg to + /// take its place. + unsigned createVirtualRegister(unsigned VReg); + + /// renameVRegs - For a given MachineBasicBlock, scan for side-effecting + /// instructions, walk the def-use from each side-effecting root (in sorted + /// root order) and rename the encountered vregs in the def-use graph in a + /// canonical ordering. This method maintains book keeping for which vregs + /// were already renamed in RenamedInOtherBB. + // @return changed + bool renameVRegs(MachineBasicBlock *MBB); +}; + +} // namespace llvm + +#endif diff --git a/lib/CodeGen/MachineBasicBlock.cpp b/lib/CodeGen/MachineBasicBlock.cpp index 4d29e883d879..854bef3aab05 100644 --- a/lib/CodeGen/MachineBasicBlock.cpp +++ b/lib/CodeGen/MachineBasicBlock.cpp @@ -39,6 +39,12 @@ using namespace llvm; #define DEBUG_TYPE "codegen" +static cl::opt PrintSlotIndexes( + "print-slotindexes", + cl::desc("When printing machine IR, annotate instructions and blocks with " + "SlotIndexes when available"), + cl::init(true), cl::Hidden); + MachineBasicBlock::MachineBasicBlock(MachineFunction &MF, const BasicBlock *B) : BB(B), Number(-1), xParent(&MF) { Insts.Parent = this; @@ -291,7 +297,7 @@ void MachineBasicBlock::print(raw_ostream &OS, ModuleSlotTracker &MST, return; } - if (Indexes) + if (Indexes && PrintSlotIndexes) OS << Indexes->getMBBStartIdx(this) << '\t'; OS << "bb." << getNumber(); @@ -320,9 +326,9 @@ void MachineBasicBlock::print(raw_ostream &OS, ModuleSlotTracker &MST, OS << "landing-pad"; HasAttributes = true; } - if (getAlignment()) { + if (getAlignment() != Align::None()) { OS << (HasAttributes ? 
", " : " ("); - OS << "align " << getAlignment(); + OS << "align " << Log2(getAlignment()); HasAttributes = true; } if (HasAttributes) @@ -402,7 +408,7 @@ void MachineBasicBlock::print(raw_ostream &OS, ModuleSlotTracker &MST, bool IsInBundle = false; for (const MachineInstr &MI : instrs()) { - if (Indexes) { + if (Indexes && PrintSlotIndexes) { if (Indexes->hasIndex(MI)) OS << Indexes->getInstructionIndex(MI); OS << '\t'; @@ -484,9 +490,9 @@ void MachineBasicBlock::sortUniqueLiveIns() { } unsigned -MachineBasicBlock::addLiveIn(MCPhysReg PhysReg, const TargetRegisterClass *RC) { +MachineBasicBlock::addLiveIn(MCRegister PhysReg, const TargetRegisterClass *RC) { assert(getParent() && "MBB must be inserted in function"); - assert(TargetRegisterInfo::isPhysicalRegister(PhysReg) && "Expected physreg"); + assert(PhysReg.isPhysical() && "Expected physreg"); assert(RC && "Register class is required"); assert((isEHPad() || this == &getParent()->front()) && "Only the entry block and landing pads can have physreg live ins"); @@ -500,14 +506,14 @@ MachineBasicBlock::addLiveIn(MCPhysReg PhysReg, const TargetRegisterClass *RC) { if (LiveIn) for (;I != E && I->isCopy(); ++I) if (I->getOperand(1).getReg() == PhysReg) { - unsigned VirtReg = I->getOperand(0).getReg(); + Register VirtReg = I->getOperand(0).getReg(); if (!MRI.constrainRegClass(VirtReg, RC)) llvm_unreachable("Incompatible live-in register class."); return VirtReg; } // No luck, create a virtual register. - unsigned VirtReg = MRI.createVirtualRegister(RC); + Register VirtReg = MRI.createVirtualRegister(RC); BuildMI(*this, I, DebugLoc(), TII.get(TargetOpcode::COPY), VirtReg) .addReg(PhysReg, RegState::Kill); if (!LiveIn) @@ -772,7 +778,8 @@ void MachineBasicBlock::transferSuccessors(MachineBasicBlock *FromMBB) { while (!FromMBB->succ_empty()) { MachineBasicBlock *Succ = *FromMBB->succ_begin(); - // If probability list is empty it means we don't use it (disabled optimization). + // If probability list is empty it means we don't use it (disabled + // optimization). if (!FromMBB->Probs.empty()) { auto Prob = *FromMBB->Probs.begin(); addSuccessor(Succ, Prob); @@ -798,13 +805,7 @@ MachineBasicBlock::transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB) { FromMBB->removeSuccessor(Succ); // Fix up any PHI nodes in the successor. 
- for (MachineBasicBlock::instr_iterator MI = Succ->instr_begin(), - ME = Succ->instr_end(); MI != ME && MI->isPHI(); ++MI) - for (unsigned i = 2, e = MI->getNumOperands()+1; i != e; i += 2) { - MachineOperand &MO = MI->getOperand(i); - if (MO.getMBB() == FromMBB) - MO.setMBB(this); - } + Succ->replacePhiUsesWith(FromMBB, this); } normalizeSuccProbs(); } @@ -907,8 +908,8 @@ MachineBasicBlock *MachineBasicBlock::SplitCriticalEdge(MachineBasicBlock *Succ, if (!OI->isReg() || OI->getReg() == 0 || !OI->isUse() || !OI->isKill() || OI->isUndef()) continue; - unsigned Reg = OI->getReg(); - if (TargetRegisterInfo::isPhysicalRegister(Reg) || + Register Reg = OI->getReg(); + if (Register::isPhysicalRegister(Reg) || LV->getVarInfo(Reg).removeKill(*MI)) { KilledRegs.push_back(Reg); LLVM_DEBUG(dbgs() << "Removing terminator kill: " << *MI); @@ -928,7 +929,7 @@ MachineBasicBlock *MachineBasicBlock::SplitCriticalEdge(MachineBasicBlock *Succ, if (!OI->isReg() || OI->getReg() == 0) continue; - unsigned Reg = OI->getReg(); + Register Reg = OI->getReg(); if (!is_contained(UsedRegs, Reg)) UsedRegs.push_back(Reg); } @@ -979,13 +980,8 @@ MachineBasicBlock *MachineBasicBlock::SplitCriticalEdge(MachineBasicBlock *Succ, } } - // Fix PHI nodes in Succ so they refer to NMBB instead of this - for (MachineBasicBlock::instr_iterator - i = Succ->instr_begin(),e = Succ->instr_end(); - i != e && i->isPHI(); ++i) - for (unsigned ni = 1, ne = i->getNumOperands(); ni != ne; ni += 2) - if (i->getOperand(ni+1).getMBB() == this) - i->getOperand(ni+1).setMBB(NMBB); + // Fix PHI nodes in Succ so they refer to NMBB instead of this. + Succ->replacePhiUsesWith(this, NMBB); // Inherit live-ins from the successor for (const auto &LI : Succ->liveins()) @@ -1000,7 +996,7 @@ MachineBasicBlock *MachineBasicBlock::SplitCriticalEdge(MachineBasicBlock *Succ, for (instr_iterator I = instr_end(), E = instr_begin(); I != E;) { if (!(--I)->addRegisterKilled(Reg, TRI, /* AddIfNotFound= */ false)) continue; - if (TargetRegisterInfo::isVirtualRegister(Reg)) + if (Register::isVirtualRegister(Reg)) LV->getVarInfo(Reg).Kills.push_back(&*I); LLVM_DEBUG(dbgs() << "Restored terminator kill: " << *I); break; @@ -1033,7 +1029,7 @@ MachineBasicBlock *MachineBasicBlock::SplitCriticalEdge(MachineBasicBlock *Succ, for (unsigned ni = 1, ne = I->getNumOperands(); ni != ne; ni += 2) { if (I->getOperand(ni+1).getMBB() == NMBB) { MachineOperand &MO = I->getOperand(ni); - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); PHISrcRegs.insert(Reg); if (MO.isUndef()) continue; @@ -1049,7 +1045,7 @@ MachineBasicBlock *MachineBasicBlock::SplitCriticalEdge(MachineBasicBlock *Succ, MachineRegisterInfo *MRI = &getParent()->getRegInfo(); for (unsigned i = 0, e = MRI->getNumVirtRegs(); i != e; ++i) { - unsigned Reg = TargetRegisterInfo::index2VirtReg(i); + unsigned Reg = Register::index2VirtReg(i); if (PHISrcRegs.count(Reg) || !LIS->hasInterval(Reg)) continue; @@ -1217,6 +1213,16 @@ void MachineBasicBlock::ReplaceUsesOfBlockWith(MachineBasicBlock *Old, replaceSuccessor(Old, New); } +void MachineBasicBlock::replacePhiUsesWith(MachineBasicBlock *Old, + MachineBasicBlock *New) { + for (MachineInstr &MI : phis()) + for (unsigned i = 2, e = MI.getNumOperands() + 1; i != e; i += 2) { + MachineOperand &MO = MI.getOperand(i); + if (MO.getMBB() == Old) + MO.setMBB(New); + } +} + /// Various pieces of code can cause excess edges in the CFG to be inserted. If /// we have proven that MBB can only branch to DestA and DestB, remove any other /// MBB successors from the CFG. 
DestA and DestB can be null. diff --git a/lib/CodeGen/MachineBlockPlacement.cpp b/lib/CodeGen/MachineBlockPlacement.cpp index 639b588766a1..ac19bc0bd8ea 100644 --- a/lib/CodeGen/MachineBlockPlacement.cpp +++ b/lib/CodeGen/MachineBlockPlacement.cpp @@ -79,16 +79,17 @@ STATISTIC(CondBranchTakenFreq, STATISTIC(UncondBranchTakenFreq, "Potential frequency of taking unconditional branches"); -static cl::opt AlignAllBlock("align-all-blocks", - cl::desc("Force the alignment of all " - "blocks in the function."), - cl::init(0), cl::Hidden); +static cl::opt AlignAllBlock( + "align-all-blocks", + cl::desc("Force the alignment of all blocks in the function in log2 format " + "(e.g 4 means align on 16B boundaries)."), + cl::init(0), cl::Hidden); static cl::opt AlignAllNonFallThruBlocks( "align-all-nofallthru-blocks", - cl::desc("Force the alignment of all " - "blocks that have no fall-through predecessors (i.e. don't add " - "nops that are executed)."), + cl::desc("Force the alignment of all blocks that have no fall-through " + "predecessors (i.e. don't add nops that are executed). In log2 " + "format (e.g 4 means align on 16B boundaries)."), cl::init(0), cl::Hidden); // FIXME: Find a good default for this flag and remove the flag. @@ -2763,8 +2764,8 @@ void MachineBlockPlacement::alignBlocks() { if (!L) continue; - unsigned Align = TLI->getPrefLoopAlignment(L); - if (!Align) + const Align Align = TLI->getPrefLoopAlignment(L); + if (Align == 1) continue; // Don't care about loop alignment. // If the block is cold relative to the function entry don't waste space @@ -2981,7 +2982,7 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) { F = &MF; MBPI = &getAnalysis(); - MBFI = llvm::make_unique( + MBFI = std::make_unique( getAnalysis()); MLI = &getAnalysis(); TII = MF.getSubtarget().getInstrInfo(); @@ -3038,8 +3039,9 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) { BranchFolder BF(/*EnableTailMerge=*/true, /*CommonHoist=*/false, *MBFI, *MBPI, TailMergeSize); + auto *MMIWP = getAnalysisIfAvailable(); if (BF.OptimizeFunction(MF, TII, MF.getSubtarget().getRegisterInfo(), - getAnalysisIfAvailable(), MLI, + MMIWP ? &MMIWP->getMMI() : nullptr, MLI, /*AfterPlacement=*/true)) { // Redo the layout if tail merging creates/removes/moves blocks. BlockToChain.clear(); @@ -3062,14 +3064,14 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) { if (AlignAllBlock) // Align all of the blocks in the function to a specific alignment. for (MachineBasicBlock &MBB : MF) - MBB.setAlignment(AlignAllBlock); + MBB.setAlignment(Align(1ULL << AlignAllBlock)); else if (AlignAllNonFallThruBlocks) { // Align all of the blocks that have no fall-through predecessors to a // specific alignment. 
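[A small sketch to make the new log2 interpretation of these flags concrete; illustrative only, not part of the patch, and the helper name is invented. The same conversion is applied in the loop that continues right after this note.]

#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/Support/Alignment.h"
using namespace llvm;

// -align-all-blocks now takes a log2 value, so 4 requests 16-byte alignment.
static void forceBlockAlignment(MachineFunction &MF, unsigned Log2Value) {
  const Align BlockAlign(1ULL << Log2Value); // Align(16) when Log2Value == 4
  for (MachineBasicBlock &MBB : MF)
    MBB.setAlignment(BlockAlign);            // BlockAlign.value() == 16, Log2(BlockAlign) == 4
}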
for (auto MBI = std::next(MF.begin()), MBE = MF.end(); MBI != MBE; ++MBI) { auto LayoutPred = std::prev(MBI); if (!LayoutPred->isSuccessor(&*MBI)) - MBI->setAlignment(AlignAllNonFallThruBlocks); + MBI->setAlignment(Align(1ULL << AlignAllNonFallThruBlocks)); } } if (ViewBlockLayoutWithBFI != GVDT_None && diff --git a/lib/CodeGen/MachineCSE.cpp b/lib/CodeGen/MachineCSE.cpp index 2df6d40d9293..d9bd32b2fbab 100644 --- a/lib/CodeGen/MachineCSE.cpp +++ b/lib/CodeGen/MachineCSE.cpp @@ -21,6 +21,7 @@ #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/CFG.h" #include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineBlockFrequencyInfo.h" #include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" @@ -66,6 +67,7 @@ namespace { AliasAnalysis *AA; MachineDominatorTree *DT; MachineRegisterInfo *MRI; + MachineBlockFrequencyInfo *MBFI; public: static char ID; // Pass identification @@ -83,6 +85,8 @@ namespace { AU.addPreservedID(MachineLoopInfoID); AU.addRequired(); AU.addPreserved(); + AU.addRequired(); + AU.addPreserved(); } void releaseMemory() override { @@ -133,6 +137,11 @@ namespace { bool isPRECandidate(MachineInstr *MI); bool ProcessBlockPRE(MachineDominatorTree *MDT, MachineBasicBlock *MBB); bool PerformSimplePRE(MachineDominatorTree *DT); + /// Heuristics to see if it's profitable to move common computations of MBB + /// and MBB1 to CandidateBB. + bool isProfitableToHoistInto(MachineBasicBlock *CandidateBB, + MachineBasicBlock *MBB, + MachineBasicBlock *MBB1); }; } // end anonymous namespace @@ -158,15 +167,15 @@ bool MachineCSE::PerformTrivialCopyPropagation(MachineInstr *MI, for (MachineOperand &MO : MI->operands()) { if (!MO.isReg() || !MO.isUse()) continue; - unsigned Reg = MO.getReg(); - if (!TargetRegisterInfo::isVirtualRegister(Reg)) + Register Reg = MO.getReg(); + if (!Register::isVirtualRegister(Reg)) continue; bool OnlyOneUse = MRI->hasOneNonDBGUse(Reg); MachineInstr *DefMI = MRI->getVRegDef(Reg); if (!DefMI->isCopy()) continue; - unsigned SrcReg = DefMI->getOperand(1).getReg(); - if (!TargetRegisterInfo::isVirtualRegister(SrcReg)) + Register SrcReg = DefMI->getOperand(1).getReg(); + if (!Register::isVirtualRegister(SrcReg)) continue; if (DefMI->getOperand(0).getSubReg()) continue; @@ -189,14 +198,16 @@ bool MachineCSE::PerformTrivialCopyPropagation(MachineInstr *MI, LLVM_DEBUG(dbgs() << "Coalescing: " << *DefMI); LLVM_DEBUG(dbgs() << "*** to: " << *MI); - // Update matching debug values. - DefMI->changeDebugValuesDefReg(SrcReg); - // Propagate SrcReg of copies to MI. MO.setReg(SrcReg); MRI->clearKillFlags(SrcReg); // Coalesce single use copies. if (OnlyOneUse) { + // If (and only if) we've eliminated all uses of the copy, also + // copy-propagate to any debug-users of MI, or they'll be left using + // an undefined value. + DefMI->changeDebugValuesDefReg(SrcReg); + DefMI->eraseFromParent(); ++NumCoalesces; } @@ -271,10 +282,10 @@ bool MachineCSE::hasLivePhysRegDefUses(const MachineInstr *MI, for (const MachineOperand &MO : MI->operands()) { if (!MO.isReg() || MO.isDef()) continue; - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); if (!Reg) continue; - if (TargetRegisterInfo::isVirtualRegister(Reg)) + if (Register::isVirtualRegister(Reg)) continue; // Reading either caller preserved or constant physregs is ok. 
if (!isCallerPreservedOrConstPhysReg(Reg, *MI->getMF(), *TRI)) @@ -290,10 +301,10 @@ bool MachineCSE::hasLivePhysRegDefUses(const MachineInstr *MI, const MachineOperand &MO = MOP.value(); if (!MO.isReg() || !MO.isDef()) continue; - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); if (!Reg) continue; - if (TargetRegisterInfo::isVirtualRegister(Reg)) + if (Register::isVirtualRegister(Reg)) continue; // Check against PhysRefs even if the def is "dead". if (PhysRefs.count(Reg)) @@ -367,8 +378,8 @@ bool MachineCSE::PhysRegDefsReach(MachineInstr *CSMI, MachineInstr *MI, return false; if (!MO.isReg() || !MO.isDef()) continue; - unsigned MOReg = MO.getReg(); - if (TargetRegisterInfo::isVirtualRegister(MOReg)) + Register MOReg = MO.getReg(); + if (Register::isVirtualRegister(MOReg)) continue; if (PhysRefs.count(MOReg)) return false; @@ -424,8 +435,7 @@ bool MachineCSE::isProfitableToCSE(unsigned CSReg, unsigned Reg, // If CSReg is used at all uses of Reg, CSE should not increase register // pressure of CSReg. bool MayIncreasePressure = true; - if (TargetRegisterInfo::isVirtualRegister(CSReg) && - TargetRegisterInfo::isVirtualRegister(Reg)) { + if (Register::isVirtualRegister(CSReg) && Register::isVirtualRegister(Reg)) { MayIncreasePressure = false; SmallPtrSet CSUses; for (MachineInstr &MI : MRI->use_nodbg_instructions(CSReg)) { @@ -453,8 +463,7 @@ bool MachineCSE::isProfitableToCSE(unsigned CSReg, unsigned Reg, // of the redundant computation are copies, do not cse. bool HasVRegUse = false; for (const MachineOperand &MO : MI->operands()) { - if (MO.isReg() && MO.isUse() && - TargetRegisterInfo::isVirtualRegister(MO.getReg())) { + if (MO.isReg() && MO.isUse() && Register::isVirtualRegister(MO.getReg())) { HasVRegUse = true; break; } @@ -586,8 +595,8 @@ bool MachineCSE::ProcessBlockCSE(MachineBasicBlock *MBB) { MachineOperand &MO = MI->getOperand(i); if (!MO.isReg() || !MO.isDef()) continue; - unsigned OldReg = MO.getReg(); - unsigned NewReg = CSMI->getOperand(i).getReg(); + Register OldReg = MO.getReg(); + Register NewReg = CSMI->getOperand(i).getReg(); // Go through implicit defs of CSMI and MI, if a def is not dead at MI, // we should make sure it is not dead at CSMI. @@ -604,8 +613,8 @@ bool MachineCSE::ProcessBlockCSE(MachineBasicBlock *MBB) { continue; } - assert(TargetRegisterInfo::isVirtualRegister(OldReg) && - TargetRegisterInfo::isVirtualRegister(NewReg) && + assert(Register::isVirtualRegister(OldReg) && + Register::isVirtualRegister(NewReg) && "Do not CSE physical register defs!"); if (!isProfitableToCSE(NewReg, OldReg, CSMI->getParent(), MI)) { @@ -769,11 +778,11 @@ bool MachineCSE::isPRECandidate(MachineInstr *MI) { return false; for (auto def : MI->defs()) - if (!TRI->isVirtualRegister(def.getReg())) + if (!Register::isVirtualRegister(def.getReg())) return false; for (auto use : MI->uses()) - if (use.isReg() && !TRI->isVirtualRegister(use.getReg())) + if (use.isReg() && !Register::isVirtualRegister(use.getReg())) return false; return true; @@ -802,6 +811,9 @@ bool MachineCSE::ProcessBlockPRE(MachineDominatorTree *DT, if (!CMBB->isLegalToHoistInto()) continue; + if (!isProfitableToHoistInto(CMBB, MBB, MBB1)) + continue; + // Two instrs are partial redundant if their basic blocks are reachable // from one to another but one doesn't dominate another. 
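[The profitability gate added just above (implemented as MachineCSE::isProfitableToHoistInto() later in this hunk) reduces to a block-frequency comparison. Below is a stand-alone sketch with invented names and plain integers standing in for BlockFrequency; the partial-redundancy check that the comment above introduces continues right after this note.]

#include <cstdint>

// Shape of the heuristic: always hoist when optimizing for minimum size,
// otherwise only hoist when the candidate block is no hotter than the two
// original placements combined.
static bool worthHoistingInto(uint64_t CandidateFreq, uint64_t FreqA,
                              uint64_t FreqB, bool HasMinSize) {
  if (HasMinSize)
    return true;
  return CandidateFreq <= FreqA + FreqB;
}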
if (CMBB != MBB1) { @@ -812,8 +824,8 @@ bool MachineCSE::ProcessBlockPRE(MachineDominatorTree *DT, assert(MI->getOperand(0).isDef() && "First operand of instr with one explicit def must be this def"); - unsigned VReg = MI->getOperand(0).getReg(); - unsigned NewReg = MRI->cloneVirtualRegister(VReg); + Register VReg = MI->getOperand(0).getReg(); + Register NewReg = MRI->cloneVirtualRegister(VReg); if (!isProfitableToCSE(NewReg, VReg, CMBB, MI)) continue; MachineInstr &NewMI = @@ -854,6 +866,18 @@ bool MachineCSE::PerformSimplePRE(MachineDominatorTree *DT) { return Changed; } +bool MachineCSE::isProfitableToHoistInto(MachineBasicBlock *CandidateBB, + MachineBasicBlock *MBB, + MachineBasicBlock *MBB1) { + if (CandidateBB->getParent()->getFunction().hasMinSize()) + return true; + assert(DT->dominates(CandidateBB, MBB) && "CandidateBB should dominate MBB"); + assert(DT->dominates(CandidateBB, MBB1) && + "CandidateBB should dominate MBB1"); + return MBFI->getBlockFreq(CandidateBB) <= + MBFI->getBlockFreq(MBB) + MBFI->getBlockFreq(MBB1); +} + bool MachineCSE::runOnMachineFunction(MachineFunction &MF) { if (skipFunction(MF.getFunction())) return false; @@ -863,6 +887,7 @@ bool MachineCSE::runOnMachineFunction(MachineFunction &MF) { MRI = &MF.getRegInfo(); AA = &getAnalysis().getAAResults(); DT = &getAnalysis(); + MBFI = &getAnalysis(); LookAheadLimit = TII->getMachineCSELookAheadLimit(); bool ChangedPRE, ChangedCSE; ChangedPRE = PerformSimplePRE(DT); diff --git a/lib/CodeGen/MachineCombiner.cpp b/lib/CodeGen/MachineCombiner.cpp index 0584ec0bd2b3..e9f462fd1b37 100644 --- a/lib/CodeGen/MachineCombiner.cpp +++ b/lib/CodeGen/MachineCombiner.cpp @@ -137,7 +137,7 @@ void MachineCombiner::getAnalysisUsage(AnalysisUsage &AU) const { MachineInstr *MachineCombiner::getOperandDef(const MachineOperand &MO) { MachineInstr *DefInstr = nullptr; // We need a virtual register definition. - if (MO.isReg() && TargetRegisterInfo::isVirtualRegister(MO.getReg())) + if (MO.isReg() && Register::isVirtualRegister(MO.getReg())) DefInstr = MRI->getUniqueVRegDef(MO.getReg()); // PHI's have no depth etc. if (DefInstr && DefInstr->isPHI()) @@ -168,7 +168,7 @@ MachineCombiner::getDepth(SmallVectorImpl &InsInstrs, unsigned IDepth = 0; for (const MachineOperand &MO : InstrPtr->operands()) { // Check for virtual register operand. - if (!(MO.isReg() && TargetRegisterInfo::isVirtualRegister(MO.getReg()))) + if (!(MO.isReg() && Register::isVirtualRegister(MO.getReg()))) continue; if (!MO.isUse()) continue; @@ -223,7 +223,7 @@ unsigned MachineCombiner::getLatency(MachineInstr *Root, MachineInstr *NewRoot, for (const MachineOperand &MO : NewRoot->operands()) { // Check for virtual register operand. - if (!(MO.isReg() && TargetRegisterInfo::isVirtualRegister(MO.getReg()))) + if (!(MO.isReg() && Register::isVirtualRegister(MO.getReg()))) continue; if (!MO.isDef()) continue; diff --git a/lib/CodeGen/MachineCopyPropagation.cpp b/lib/CodeGen/MachineCopyPropagation.cpp index 9fc12ac89e12..ebe76e31dca9 100644 --- a/lib/CodeGen/MachineCopyPropagation.cpp +++ b/lib/CodeGen/MachineCopyPropagation.cpp @@ -119,8 +119,8 @@ public: void trackCopy(MachineInstr *MI, const TargetRegisterInfo &TRI) { assert(MI->isCopy() && "Tracking non-copy?"); - unsigned Def = MI->getOperand(0).getReg(); - unsigned Src = MI->getOperand(1).getReg(); + Register Def = MI->getOperand(0).getReg(); + Register Src = MI->getOperand(1).getReg(); // Remember Def is defined by the copy. 
for (MCRegUnitIterator RUI(Def, &TRI); RUI.isValid(); ++RUI) @@ -163,8 +163,8 @@ public: // Check that the available copy isn't clobbered by any regmasks between // itself and the destination. - unsigned AvailSrc = AvailCopy->getOperand(1).getReg(); - unsigned AvailDef = AvailCopy->getOperand(0).getReg(); + Register AvailSrc = AvailCopy->getOperand(1).getReg(); + Register AvailDef = AvailCopy->getOperand(0).getReg(); for (const MachineInstr &MI : make_range(AvailCopy->getIterator(), DestCopy.getIterator())) for (const MachineOperand &MO : MI.operands()) @@ -205,8 +205,11 @@ public: } private: + typedef enum { DebugUse = false, RegularUse = true } DebugType; + void ClobberRegister(unsigned Reg); - void ReadRegister(unsigned Reg); + void ReadRegister(unsigned Reg, MachineInstr &Reader, + DebugType DT); void CopyPropagateBlock(MachineBasicBlock &MBB); bool eraseIfRedundant(MachineInstr &Copy, unsigned Src, unsigned Def); void forwardUses(MachineInstr &MI); @@ -217,6 +220,9 @@ private: /// Candidates for deletion. SmallSetVector MaybeDeadCopies; + /// Multimap tracking debug users in current BB + DenseMap> CopyDbgUsers; + CopyTracker Tracker; bool Changed; @@ -231,13 +237,19 @@ char &llvm::MachineCopyPropagationID = MachineCopyPropagation::ID; INITIALIZE_PASS(MachineCopyPropagation, DEBUG_TYPE, "Machine Copy Propagation Pass", false, false) -void MachineCopyPropagation::ReadRegister(unsigned Reg) { +void MachineCopyPropagation::ReadRegister(unsigned Reg, MachineInstr &Reader, + DebugType DT) { // If 'Reg' is defined by a copy, the copy is no longer a candidate - // for elimination. + // for elimination. If a copy is "read" by a debug user, record the user + // for propagation. for (MCRegUnitIterator RUI(Reg, TRI); RUI.isValid(); ++RUI) { if (MachineInstr *Copy = Tracker.findCopyForUnit(*RUI, *TRI)) { - LLVM_DEBUG(dbgs() << "MCP: Copy is used - not dead: "; Copy->dump()); - MaybeDeadCopies.remove(Copy); + if (DT == RegularUse) { + LLVM_DEBUG(dbgs() << "MCP: Copy is used - not dead: "; Copy->dump()); + MaybeDeadCopies.remove(Copy); + } else { + CopyDbgUsers[Copy].push_back(&Reader); + } } } } @@ -250,8 +262,8 @@ void MachineCopyPropagation::ReadRegister(unsigned Reg) { /// isNopCopy("ecx = COPY eax", AH, CL) == false static bool isNopCopy(const MachineInstr &PreviousCopy, unsigned Src, unsigned Def, const TargetRegisterInfo *TRI) { - unsigned PreviousSrc = PreviousCopy.getOperand(1).getReg(); - unsigned PreviousDef = PreviousCopy.getOperand(0).getReg(); + Register PreviousSrc = PreviousCopy.getOperand(1).getReg(); + Register PreviousDef = PreviousCopy.getOperand(0).getReg(); if (Src == PreviousSrc) { assert(Def == PreviousDef); return true; @@ -288,7 +300,7 @@ bool MachineCopyPropagation::eraseIfRedundant(MachineInstr &Copy, unsigned Src, // Copy was redundantly redefining either Src or Def. Remove earlier kill // flags between Copy and PrevCopy because the value will be reused now. assert(Copy.isCopy()); - unsigned CopyDef = Copy.getOperand(0).getReg(); + Register CopyDef = Copy.getOperand(0).getReg(); assert(CopyDef == Src || CopyDef == Def); for (MachineInstr &MI : make_range(PrevCopy->getIterator(), Copy.getIterator())) @@ -307,7 +319,7 @@ bool MachineCopyPropagation::isForwardableRegClassCopy(const MachineInstr &Copy, const MachineInstr &UseI, unsigned UseIdx) { - unsigned CopySrcReg = Copy.getOperand(1).getReg(); + Register CopySrcReg = Copy.getOperand(1).getReg(); // If the new register meets the opcode register constraints, then allow // forwarding. 
@@ -398,9 +410,9 @@ void MachineCopyPropagation::forwardUses(MachineInstr &MI) { if (!Copy) continue; - unsigned CopyDstReg = Copy->getOperand(0).getReg(); + Register CopyDstReg = Copy->getOperand(0).getReg(); const MachineOperand &CopySrc = Copy->getOperand(1); - unsigned CopySrcReg = CopySrc.getReg(); + Register CopySrcReg = CopySrc.getReg(); // FIXME: Don't handle partial uses of wider COPYs yet. if (MOUse.getReg() != CopyDstReg) { @@ -456,11 +468,11 @@ void MachineCopyPropagation::CopyPropagateBlock(MachineBasicBlock &MBB) { // Analyze copies (which don't overlap themselves). if (MI->isCopy() && !TRI->regsOverlap(MI->getOperand(0).getReg(), MI->getOperand(1).getReg())) { - unsigned Def = MI->getOperand(0).getReg(); - unsigned Src = MI->getOperand(1).getReg(); + Register Def = MI->getOperand(0).getReg(); + Register Src = MI->getOperand(1).getReg(); - assert(!TargetRegisterInfo::isVirtualRegister(Def) && - !TargetRegisterInfo::isVirtualRegister(Src) && + assert(!Register::isVirtualRegister(Def) && + !Register::isVirtualRegister(Src) && "MachineCopyPropagation should be run after register allocation!"); // The two copies cancel out and the source of the first copy @@ -488,14 +500,14 @@ void MachineCopyPropagation::CopyPropagateBlock(MachineBasicBlock &MBB) { // If Src is defined by a previous copy, the previous copy cannot be // eliminated. - ReadRegister(Src); + ReadRegister(Src, *MI, RegularUse); for (const MachineOperand &MO : MI->implicit_operands()) { if (!MO.isReg() || !MO.readsReg()) continue; - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); if (!Reg) continue; - ReadRegister(Reg); + ReadRegister(Reg, *MI, RegularUse); } LLVM_DEBUG(dbgs() << "MCP: Copy is a deletion candidate: "; MI->dump()); @@ -515,7 +527,7 @@ void MachineCopyPropagation::CopyPropagateBlock(MachineBasicBlock &MBB) { for (const MachineOperand &MO : MI->implicit_operands()) { if (!MO.isReg() || !MO.isDef()) continue; - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); if (!Reg) continue; Tracker.clobberRegister(Reg, *TRI); @@ -529,12 +541,12 @@ void MachineCopyPropagation::CopyPropagateBlock(MachineBasicBlock &MBB) { // Clobber any earlyclobber regs first. for (const MachineOperand &MO : MI->operands()) if (MO.isReg() && MO.isEarlyClobber()) { - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); // If we have a tied earlyclobber, that means it is also read by this // instruction, so we need to make sure we don't remove it as dead // later. if (MO.isTied()) - ReadRegister(Reg); + ReadRegister(Reg, *MI, RegularUse); Tracker.clobberRegister(Reg, *TRI); } @@ -548,18 +560,18 @@ void MachineCopyPropagation::CopyPropagateBlock(MachineBasicBlock &MBB) { RegMask = &MO; if (!MO.isReg()) continue; - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); if (!Reg) continue; - assert(!TargetRegisterInfo::isVirtualRegister(Reg) && + assert(!Register::isVirtualRegister(Reg) && "MachineCopyPropagation should be run after register allocation!"); if (MO.isDef() && !MO.isEarlyClobber()) { Defs.push_back(Reg); continue; - } else if (!MO.isDebug() && MO.readsReg()) - ReadRegister(Reg); + } else if (MO.readsReg()) + ReadRegister(Reg, *MI, MO.isDebug() ? 
DebugUse : RegularUse); } // The instruction has a register mask operand which means that it clobbers @@ -571,7 +583,7 @@ void MachineCopyPropagation::CopyPropagateBlock(MachineBasicBlock &MBB) { MaybeDeadCopies.begin(); DI != MaybeDeadCopies.end();) { MachineInstr *MaybeDead = *DI; - unsigned Reg = MaybeDead->getOperand(0).getReg(); + Register Reg = MaybeDead->getOperand(0).getReg(); assert(!MRI->isReserved(Reg)); if (!RegMask->clobbersPhysReg(Reg)) { @@ -609,9 +621,10 @@ void MachineCopyPropagation::CopyPropagateBlock(MachineBasicBlock &MBB) { MaybeDead->dump()); assert(!MRI->isReserved(MaybeDead->getOperand(0).getReg())); - // Update matching debug values. + // Update matching debug values, if any. assert(MaybeDead->isCopy()); - MaybeDead->changeDebugValuesDefReg(MaybeDead->getOperand(1).getReg()); + unsigned SrcReg = MaybeDead->getOperand(1).getReg(); + MRI->updateDbgUsersToReg(SrcReg, CopyDbgUsers[MaybeDead]); MaybeDead->eraseFromParent(); Changed = true; @@ -620,6 +633,7 @@ void MachineCopyPropagation::CopyPropagateBlock(MachineBasicBlock &MBB) { } MaybeDeadCopies.clear(); + CopyDbgUsers.clear(); Tracker.clear(); } diff --git a/lib/CodeGen/MachineDominators.cpp b/lib/CodeGen/MachineDominators.cpp index 1dfba8638c22..706c706d7527 100644 --- a/lib/CodeGen/MachineDominators.cpp +++ b/lib/CodeGen/MachineDominators.cpp @@ -18,12 +18,15 @@ using namespace llvm; +namespace llvm { // Always verify dominfo if expensive checking is enabled. #ifdef EXPENSIVE_CHECKS -static bool VerifyMachineDomInfo = true; +bool VerifyMachineDomInfo = true; #else -static bool VerifyMachineDomInfo = false; +bool VerifyMachineDomInfo = false; #endif +} // namespace llvm + static cl::opt VerifyMachineDomInfoX( "verify-machine-dom-info", cl::location(VerifyMachineDomInfo), cl::Hidden, cl::desc("Verify machine dominator info (time consuming)")); @@ -64,21 +67,11 @@ void MachineDominatorTree::releaseMemory() { } void MachineDominatorTree::verifyAnalysis() const { - if (DT && VerifyMachineDomInfo) { - MachineFunction &F = *getRoot()->getParent(); - - DomTreeBase OtherDT; - OtherDT.recalculate(F); - if (getRootNode()->getBlock() != OtherDT.getRootNode()->getBlock() || - DT->compare(OtherDT)) { - errs() << "MachineDominatorTree for function " << F.getName() - << " is not up to date!\nComputed:\n"; - DT->print(errs()); - errs() << "\nActual:\n"; - OtherDT.print(errs()); + if (DT && VerifyMachineDomInfo) + if (!DT->verify(DomTreeT::VerificationLevel::Basic)) { + errs() << "MachineDominatorTree verification failed\n"; abort(); } - } } void MachineDominatorTree::print(raw_ostream &OS, const Module*) const { diff --git a/lib/CodeGen/MachineFrameInfo.cpp b/lib/CodeGen/MachineFrameInfo.cpp index bae3a4333bda..604f5145b1a0 100644 --- a/lib/CodeGen/MachineFrameInfo.cpp +++ b/lib/CodeGen/MachineFrameInfo.cpp @@ -28,25 +28,26 @@ using namespace llvm; -void MachineFrameInfo::ensureMaxAlignment(unsigned Align) { +void MachineFrameInfo::ensureMaxAlignment(Align Alignment) { if (!StackRealignable) - assert(Align <= StackAlignment && - "For targets without stack realignment, Align is out of limit!"); - if (MaxAlignment < Align) MaxAlignment = Align; + assert(Alignment <= StackAlignment && + "For targets without stack realignment, Alignment is out of limit!"); + if (MaxAlignment < Alignment) + MaxAlignment = Alignment; } /// Clamp the alignment if requested and emit a warning. 
-static inline unsigned clampStackAlignment(bool ShouldClamp, unsigned Align, - unsigned StackAlign) { - if (!ShouldClamp || Align <= StackAlign) - return Align; - LLVM_DEBUG(dbgs() << "Warning: requested alignment " << Align - << " exceeds the stack alignment " << StackAlign +static inline Align clampStackAlignment(bool ShouldClamp, Align Alignment, + Align StackAlignment) { + if (!ShouldClamp || Alignment <= StackAlignment) + return Alignment; + LLVM_DEBUG(dbgs() << "Warning: requested alignment " << Alignment.value() + << " exceeds the stack alignment " << StackAlignment.value() << " when stack realignment is off" << '\n'); - return StackAlign; + return StackAlignment; } -int MachineFrameInfo::CreateStackObject(uint64_t Size, unsigned Alignment, +int MachineFrameInfo::CreateStackObject(uint64_t Size, Align Alignment, bool IsSpillSlot, const AllocaInst *Alloca, uint8_t StackID) { @@ -61,8 +62,7 @@ int MachineFrameInfo::CreateStackObject(uint64_t Size, unsigned Alignment, return Index; } -int MachineFrameInfo::CreateSpillStackObject(uint64_t Size, - unsigned Alignment) { +int MachineFrameInfo::CreateSpillStackObject(uint64_t Size, Align Alignment) { Alignment = clampStackAlignment(!StackRealignable, Alignment, StackAlignment); CreateStackObject(Size, Alignment, true); int Index = (int)Objects.size() - NumFixedObjects - 1; @@ -70,7 +70,7 @@ int MachineFrameInfo::CreateSpillStackObject(uint64_t Size, return Index; } -int MachineFrameInfo::CreateVariableSizedObject(unsigned Alignment, +int MachineFrameInfo::CreateVariableSizedObject(Align Alignment, const AllocaInst *Alloca) { HasVarSizedObjects = true; Alignment = clampStackAlignment(!StackRealignable, Alignment, StackAlignment); @@ -88,7 +88,8 @@ int MachineFrameInfo::CreateFixedObject(uint64_t Size, int64_t SPOffset, // object is 16-byte aligned. Note that unlike the non-fixed case, if the // stack needs realignment, we can't assume that the stack will in fact be // aligned. - unsigned Alignment = MinAlign(SPOffset, ForcedRealign ? 1 : StackAlignment); + Align Alignment = + commonAlignment(ForcedRealign ? Align::None() : StackAlignment, SPOffset); Alignment = clampStackAlignment(!StackRealignable, Alignment, StackAlignment); Objects.insert(Objects.begin(), StackObject(Size, Alignment, SPOffset, IsImmutable, @@ -100,7 +101,8 @@ int MachineFrameInfo::CreateFixedObject(uint64_t Size, int64_t SPOffset, int MachineFrameInfo::CreateFixedSpillStackObject(uint64_t Size, int64_t SPOffset, bool IsImmutable) { - unsigned Alignment = MinAlign(SPOffset, ForcedRealign ? 1 : StackAlignment); + Align Alignment = + commonAlignment(ForcedRealign ? 
Align::None() : StackAlignment, SPOffset); Alignment = clampStackAlignment(!StackRealignable, Alignment, StackAlignment); Objects.insert(Objects.begin(), StackObject(Size, Alignment, SPOffset, IsImmutable, @@ -232,7 +234,7 @@ void MachineFrameInfo::print(const MachineFunction &MF, raw_ostream &OS) const{ OS << "variable sized"; else OS << "size=" << SO.Size; - OS << ", align=" << SO.Alignment; + OS << ", align=" << SO.Alignment.value(); if (i < NumFixedObjects) OS << ", fixed"; diff --git a/lib/CodeGen/MachineFunction.cpp b/lib/CodeGen/MachineFunction.cpp index 4df5ce2dcedc..7d2ee230ca9f 100644 --- a/lib/CodeGen/MachineFunction.cpp +++ b/lib/CodeGen/MachineFunction.cpp @@ -78,10 +78,11 @@ using namespace llvm; #define DEBUG_TYPE "codegen" -static cl::opt -AlignAllFunctions("align-all-functions", - cl::desc("Force the alignment of all functions."), - cl::init(0), cl::Hidden); +static cl::opt AlignAllFunctions( + "align-all-functions", + cl::desc("Force the alignment of all functions in log2 format (e.g. 4 " + "means align on 16B boundaries)."), + cl::init(0), cl::Hidden); static const char *getPropertyName(MachineFunctionProperties::Property Prop) { using P = MachineFunctionProperties::Property; @@ -181,7 +182,7 @@ void MachineFunction::init() { STI->getTargetLowering()->getPrefFunctionAlignment()); if (AlignAllFunctions) - Alignment = AlignAllFunctions; + Alignment = Align(1ULL << AlignAllFunctions); JumpTableInfo = nullptr; @@ -200,7 +201,7 @@ void MachineFunction::init() { "Target-incompatible DataLayout attached\n"); PSVManager = - llvm::make_unique(*(getSubtarget(). + std::make_unique(*(getSubtarget(). getInstrInfo())); } @@ -823,30 +824,47 @@ try_next:; return FilterID; } -void MachineFunction::addCodeViewHeapAllocSite(MachineInstr *I, MDNode *MD) { +void MachineFunction::addCodeViewHeapAllocSite(MachineInstr *I, + const MDNode *MD) { MCSymbol *BeginLabel = Ctx.createTempSymbol("heapallocsite", true); MCSymbol *EndLabel = Ctx.createTempSymbol("heapallocsite", true); I->setPreInstrSymbol(*this, BeginLabel); I->setPostInstrSymbol(*this, EndLabel); - DIType *DI = dyn_cast(MD); + const DIType *DI = dyn_cast(MD); CodeViewHeapAllocSites.push_back(std::make_tuple(BeginLabel, EndLabel, DI)); } -void MachineFunction::updateCallSiteInfo(const MachineInstr *Old, - const MachineInstr *New) { - if (!Target.Options.EnableDebugEntryValues || Old == New) - return; +void MachineFunction::moveCallSiteInfo(const MachineInstr *Old, + const MachineInstr *New) { + assert(New->isCall() && "Call site info refers only to call instructions!"); - assert(Old->isCall() && (!New || New->isCall()) && - "Call site info referes only to call instructions!"); - CallSiteInfoMap::iterator CSIt = CallSitesInfo.find(Old); + CallSiteInfoMap::iterator CSIt = getCallSiteInfo(Old); if (CSIt == CallSitesInfo.end()) return; + CallSiteInfo CSInfo = std::move(CSIt->second); CallSitesInfo.erase(CSIt); - if (New) - CallSitesInfo[New] = CSInfo; + CallSitesInfo[New] = CSInfo; +} + +void MachineFunction::eraseCallSiteInfo(const MachineInstr *MI) { + CallSiteInfoMap::iterator CSIt = getCallSiteInfo(MI); + if (CSIt == CallSitesInfo.end()) + return; + CallSitesInfo.erase(CSIt); +} + +void MachineFunction::copyCallSiteInfo(const MachineInstr *Old, + const MachineInstr *New) { + assert(New->isCall() && "Call site info refers only to call instructions!"); + + CallSiteInfoMap::iterator CSIt = getCallSiteInfo(Old); + if (CSIt == CallSitesInfo.end()) + return; + + CallSiteInfo CSInfo = CSIt->second; + CallSitesInfo[New] = CSInfo; } /// \} @@ 
-881,13 +899,13 @@ unsigned MachineJumpTableInfo::getEntryAlignment(const DataLayout &TD) const { // alignment. switch (getEntryKind()) { case MachineJumpTableInfo::EK_BlockAddress: - return TD.getPointerABIAlignment(0); + return TD.getPointerABIAlignment(0).value(); case MachineJumpTableInfo::EK_GPRel64BlockAddress: - return TD.getABIIntegerTypeAlignment(64); + return TD.getABIIntegerTypeAlignment(64).value(); case MachineJumpTableInfo::EK_GPRel32BlockAddress: case MachineJumpTableInfo::EK_LabelDifference32: case MachineJumpTableInfo::EK_Custom32: - return TD.getABIIntegerTypeAlignment(32); + return TD.getABIIntegerTypeAlignment(32).value(); case MachineJumpTableInfo::EK_Inline: return 1; } diff --git a/lib/CodeGen/MachineFunctionPass.cpp b/lib/CodeGen/MachineFunctionPass.cpp index 0da4cf3fc90c..03149aa7db4a 100644 --- a/lib/CodeGen/MachineFunctionPass.cpp +++ b/lib/CodeGen/MachineFunctionPass.cpp @@ -41,7 +41,7 @@ bool MachineFunctionPass::runOnFunction(Function &F) { if (F.hasAvailableExternallyLinkage()) return false; - MachineModuleInfo &MMI = getAnalysis(); + MachineModuleInfo &MMI = getAnalysis().getMMI(); MachineFunction &MF = MMI.getOrCreateMachineFunction(F); MachineFunctionProperties &MFProps = MF.getProperties(); @@ -101,8 +101,8 @@ bool MachineFunctionPass::runOnFunction(Function &F) { } void MachineFunctionPass::getAnalysisUsage(AnalysisUsage &AU) const { - AU.addRequired(); - AU.addPreserved(); + AU.addRequired(); + AU.addPreserved(); // MachineFunctionPass preserves all LLVM IR passes, but there's no // high-level way to express this. Instead, just list a bunch of diff --git a/lib/CodeGen/MachineInstr.cpp b/lib/CodeGen/MachineInstr.cpp index e5c398a2d10c..fec20b2b1a05 100644 --- a/lib/CodeGen/MachineInstr.cpp +++ b/lib/CodeGen/MachineInstr.cpp @@ -636,8 +636,8 @@ bool MachineInstr::isIdenticalTo(const MachineInstr &Other, if (Check == IgnoreDefs) continue; else if (Check == IgnoreVRegDefs) { - if (!TargetRegisterInfo::isVirtualRegister(MO.getReg()) || - !TargetRegisterInfo::isVirtualRegister(OMO.getReg())) + if (!Register::isVirtualRegister(MO.getReg()) || + !Register::isVirtualRegister(OMO.getReg())) if (!MO.isIdenticalTo(OMO)) return false; } else { @@ -692,8 +692,8 @@ void MachineInstr::eraseFromParentAndMarkDBGValuesForRemoval() { for (const MachineOperand &MO : MI->operands()) { if (!MO.isReg() || !MO.isDef()) continue; - unsigned Reg = MO.getReg(); - if (!TargetRegisterInfo::isVirtualRegister(Reg)) + Register Reg = MO.getReg(); + if (!Reg.isVirtual()) continue; MRI.markUsesInDebugValueAsUndef(Reg); } @@ -832,6 +832,10 @@ const DIExpression *MachineInstr::getDebugExpression() const { return cast(getOperand(3).getMetadata()); } +bool MachineInstr::isDebugEntryValue() const { + return isDebugValue() && getDebugExpression()->isEntryValue(); +} + const TargetRegisterClass* MachineInstr::getRegClassConstraint(unsigned OpIdx, const TargetInstrInfo *TII, @@ -873,7 +877,7 @@ MachineInstr::getRegClassConstraint(unsigned OpIdx, } const TargetRegisterClass *MachineInstr::getRegClassConstraintEffectForVReg( - unsigned Reg, const TargetRegisterClass *CurRC, const TargetInstrInfo *TII, + Register Reg, const TargetRegisterClass *CurRC, const TargetInstrInfo *TII, const TargetRegisterInfo *TRI, bool ExploreBundle) const { // Check every operands inside the bundle if we have // been asked to. 
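// Illustrative sketch, not part of the upstream change: the MachineFrameInfo and
// jump-table hunks above migrate raw 'unsigned' alignments to the Align type, and
// MinAlign(SPOffset, StackAlignment) becomes commonAlignment(StackAlignment, SPOffset).
// Both expressions compute the largest power of two guaranteed for an object placed at
// a given offset from a base with a known alignment. A standalone illustration of that
// arithmetic, using plain integers instead of the Align class:
#include <cassert>
#include <cstdint>

// Largest power of two guaranteed by an Alignment (itself a power of two) at Offset.
uint64_t commonAlignmentSketch(uint64_t Alignment, uint64_t Offset) {
  assert(Alignment != 0 && (Alignment & (Alignment - 1)) == 0 &&
         "alignment must be a power of two");
  uint64_t Bits = Alignment | Offset;
  return Bits & (~Bits + 1); // isolate the lowest set bit
}

int main() {
  // A fixed object at SP+12 on a 16-byte-aligned stack is only 4-byte aligned.
  assert(commonAlignmentSketch(16, 12) == 4);
  // At SP+0 the full stack alignment survives.
  assert(commonAlignmentSketch(16, 0) == 16);
  // With forced realignment the code above starts from Align::None(), i.e. 1.
  assert(commonAlignmentSketch(1, 12) == 1);
  return 0;
}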
@@ -890,7 +894,7 @@ const TargetRegisterClass *MachineInstr::getRegClassConstraintEffectForVReg( } const TargetRegisterClass *MachineInstr::getRegClassConstraintEffectForVRegImpl( - unsigned OpIdx, unsigned Reg, const TargetRegisterClass *CurRC, + unsigned OpIdx, Register Reg, const TargetRegisterClass *CurRC, const TargetInstrInfo *TII, const TargetRegisterInfo *TRI) const { assert(CurRC && "Invalid initial register class"); // Check if Reg is constrained by some of its use/def from MI. @@ -933,7 +937,7 @@ unsigned MachineInstr::getBundleSize() const { /// Returns true if the MachineInstr has an implicit-use operand of exactly /// the given register (not considering sub/super-registers). -bool MachineInstr::hasRegisterImplicitUseOperand(unsigned Reg) const { +bool MachineInstr::hasRegisterImplicitUseOperand(Register Reg) const { for (unsigned i = 0, e = getNumOperands(); i != e; ++i) { const MachineOperand &MO = getOperand(i); if (MO.isReg() && MO.isUse() && MO.isImplicit() && MO.getReg() == Reg) @@ -946,12 +950,12 @@ bool MachineInstr::hasRegisterImplicitUseOperand(unsigned Reg) const { /// the specific register or -1 if it is not found. It further tightens /// the search criteria to a use that kills the register if isKill is true. int MachineInstr::findRegisterUseOperandIdx( - unsigned Reg, bool isKill, const TargetRegisterInfo *TRI) const { + Register Reg, bool isKill, const TargetRegisterInfo *TRI) const { for (unsigned i = 0, e = getNumOperands(); i != e; ++i) { const MachineOperand &MO = getOperand(i); if (!MO.isReg() || !MO.isUse()) continue; - unsigned MOReg = MO.getReg(); + Register MOReg = MO.getReg(); if (!MOReg) continue; if (MOReg == Reg || (TRI && Reg && MOReg && TRI->regsOverlap(MOReg, Reg))) @@ -965,7 +969,7 @@ int MachineInstr::findRegisterUseOperandIdx( /// indicating if this instruction reads or writes Reg. This also considers /// partial defines. std::pair -MachineInstr::readsWritesVirtualRegister(unsigned Reg, +MachineInstr::readsWritesVirtualRegister(Register Reg, SmallVectorImpl *Ops) const { bool PartDef = false; // Partial redefine. bool FullDef = false; // Full define. @@ -994,9 +998,9 @@ MachineInstr::readsWritesVirtualRegister(unsigned Reg, /// that are not dead are skipped. If TargetRegisterInfo is non-null, then it /// also checks if there is a def of a super-register. int -MachineInstr::findRegisterDefOperandIdx(unsigned Reg, bool isDead, bool Overlap, +MachineInstr::findRegisterDefOperandIdx(Register Reg, bool isDead, bool Overlap, const TargetRegisterInfo *TRI) const { - bool isPhys = TargetRegisterInfo::isPhysicalRegister(Reg); + bool isPhys = Register::isPhysicalRegister(Reg); for (unsigned i = 0, e = getNumOperands(); i != e; ++i) { const MachineOperand &MO = getOperand(i); // Accept regmask operands when Overlap is set. 
@@ -1005,10 +1009,9 @@ MachineInstr::findRegisterDefOperandIdx(unsigned Reg, bool isDead, bool Overlap, return i; if (!MO.isReg() || !MO.isDef()) continue; - unsigned MOReg = MO.getReg(); + Register MOReg = MO.getReg(); bool Found = (MOReg == Reg); - if (!Found && TRI && isPhys && - TargetRegisterInfo::isPhysicalRegister(MOReg)) { + if (!Found && TRI && isPhys && Register::isPhysicalRegister(MOReg)) { if (Overlap) Found = TRI->regsOverlap(MOReg, Reg); else @@ -1142,10 +1145,10 @@ void MachineInstr::clearKillInfo() { } } -void MachineInstr::substituteRegister(unsigned FromReg, unsigned ToReg, +void MachineInstr::substituteRegister(Register FromReg, Register ToReg, unsigned SubIdx, const TargetRegisterInfo &RegInfo) { - if (TargetRegisterInfo::isPhysicalRegister(ToReg)) { + if (Register::isPhysicalRegister(ToReg)) { if (SubIdx) ToReg = RegInfo.getSubReg(ToReg, SubIdx); for (MachineOperand &MO : operands()) { @@ -1165,7 +1168,7 @@ void MachineInstr::substituteRegister(unsigned FromReg, unsigned ToReg, /// isSafeToMove - Return true if it is safe to move this instruction. If /// SawStore is set to true, it means that there is a store (or call) between /// the instruction's location and its intended destination. -bool MachineInstr::isSafeToMove(AliasAnalysis *AA, bool &SawStore) const { +bool MachineInstr::isSafeToMove(AAResults *AA, bool &SawStore) const { // Ignore stuff that we obviously can't move. // // Treat volatile loads as stores. This is not strictly necessary for @@ -1194,7 +1197,7 @@ bool MachineInstr::isSafeToMove(AliasAnalysis *AA, bool &SawStore) const { return true; } -bool MachineInstr::mayAlias(AliasAnalysis *AA, const MachineInstr &Other, +bool MachineInstr::mayAlias(AAResults *AA, const MachineInstr &Other, bool UseTBAA) const { const MachineFunction *MF = getMF(); const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo(); @@ -1206,7 +1209,7 @@ bool MachineInstr::mayAlias(AliasAnalysis *AA, const MachineInstr &Other, return false; // Let the target decide if memory accesses cannot possibly overlap. - if (TII->areMemAccessesTriviallyDisjoint(*this, Other, AA)) + if (TII->areMemAccessesTriviallyDisjoint(*this, Other)) return false; // FIXME: Need to handle multiple memory operands to support all targets. @@ -1312,7 +1315,7 @@ bool MachineInstr::hasOrderedMemoryRef() const { /// isDereferenceableInvariantLoad - Return true if this instruction will never /// trap and is loading from a location whose value is invariant across a run of /// this function. -bool MachineInstr::isDereferenceableInvariantLoad(AliasAnalysis *AA) const { +bool MachineInstr::isDereferenceableInvariantLoad(AAResults *AA) const { // If the instruction doesn't load at all, it isn't an invariant load. 
if (!mayLoad()) return false; @@ -1364,7 +1367,7 @@ unsigned MachineInstr::isConstantValuePHI() const { assert(getNumOperands() >= 3 && "It's illegal to have a PHI without source operands"); - unsigned Reg = getOperand(1).getReg(); + Register Reg = getOperand(1).getReg(); for (unsigned i = 3, e = getNumOperands(); i < e; i += 2) if (getOperand(i).getReg() != Reg) return 0; @@ -1726,7 +1729,7 @@ void MachineInstr::print(raw_ostream &OS, ModuleSlotTracker &MST, MFI = &MF->getFrameInfo(); Context = &MF->getFunction().getContext(); } else { - CtxPtr = llvm::make_unique(); + CtxPtr = std::make_unique(); Context = CtxPtr.get(); } @@ -1780,10 +1783,10 @@ void MachineInstr::print(raw_ostream &OS, ModuleSlotTracker &MST, OS << '\n'; } -bool MachineInstr::addRegisterKilled(unsigned IncomingReg, +bool MachineInstr::addRegisterKilled(Register IncomingReg, const TargetRegisterInfo *RegInfo, bool AddIfNotFound) { - bool isPhysReg = TargetRegisterInfo::isPhysicalRegister(IncomingReg); + bool isPhysReg = Register::isPhysicalRegister(IncomingReg); bool hasAliases = isPhysReg && MCRegAliasIterator(IncomingReg, RegInfo, false).isValid(); bool Found = false; @@ -1799,7 +1802,7 @@ bool MachineInstr::addRegisterKilled(unsigned IncomingReg, if (MO.isDebug()) continue; - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); if (!Reg) continue; @@ -1814,8 +1817,7 @@ bool MachineInstr::addRegisterKilled(unsigned IncomingReg, MO.setIsKill(); Found = true; } - } else if (hasAliases && MO.isKill() && - TargetRegisterInfo::isPhysicalRegister(Reg)) { + } else if (hasAliases && MO.isKill() && Register::isPhysicalRegister(Reg)) { // A super-register kill already exists. if (RegInfo->isSuperRegister(IncomingReg, Reg)) return true; @@ -1847,23 +1849,23 @@ bool MachineInstr::addRegisterKilled(unsigned IncomingReg, return Found; } -void MachineInstr::clearRegisterKills(unsigned Reg, +void MachineInstr::clearRegisterKills(Register Reg, const TargetRegisterInfo *RegInfo) { - if (!TargetRegisterInfo::isPhysicalRegister(Reg)) + if (!Register::isPhysicalRegister(Reg)) RegInfo = nullptr; for (MachineOperand &MO : operands()) { if (!MO.isReg() || !MO.isUse() || !MO.isKill()) continue; - unsigned OpReg = MO.getReg(); + Register OpReg = MO.getReg(); if ((RegInfo && RegInfo->regsOverlap(Reg, OpReg)) || Reg == OpReg) MO.setIsKill(false); } } -bool MachineInstr::addRegisterDead(unsigned Reg, +bool MachineInstr::addRegisterDead(Register Reg, const TargetRegisterInfo *RegInfo, bool AddIfNotFound) { - bool isPhysReg = TargetRegisterInfo::isPhysicalRegister(Reg); + bool isPhysReg = Register::isPhysicalRegister(Reg); bool hasAliases = isPhysReg && MCRegAliasIterator(Reg, RegInfo, false).isValid(); bool Found = false; @@ -1872,7 +1874,7 @@ bool MachineInstr::addRegisterDead(unsigned Reg, MachineOperand &MO = getOperand(i); if (!MO.isReg() || !MO.isDef()) continue; - unsigned MOReg = MO.getReg(); + Register MOReg = MO.getReg(); if (!MOReg) continue; @@ -1880,7 +1882,7 @@ bool MachineInstr::addRegisterDead(unsigned Reg, MO.setIsDead(); Found = true; } else if (hasAliases && MO.isDead() && - TargetRegisterInfo::isPhysicalRegister(MOReg)) { + Register::isPhysicalRegister(MOReg)) { // There exists a super-register that's marked dead. 
if (RegInfo->isSuperRegister(Reg, MOReg)) return true; @@ -1913,7 +1915,7 @@ bool MachineInstr::addRegisterDead(unsigned Reg, return true; } -void MachineInstr::clearRegisterDeads(unsigned Reg) { +void MachineInstr::clearRegisterDeads(Register Reg) { for (MachineOperand &MO : operands()) { if (!MO.isReg() || !MO.isDef() || MO.getReg() != Reg) continue; @@ -1921,7 +1923,7 @@ void MachineInstr::clearRegisterDeads(unsigned Reg) { } } -void MachineInstr::setRegisterDefReadUndef(unsigned Reg, bool IsUndef) { +void MachineInstr::setRegisterDefReadUndef(Register Reg, bool IsUndef) { for (MachineOperand &MO : operands()) { if (!MO.isReg() || !MO.isDef() || MO.getReg() != Reg || MO.getSubReg() == 0) continue; @@ -1929,9 +1931,9 @@ void MachineInstr::setRegisterDefReadUndef(unsigned Reg, bool IsUndef) { } } -void MachineInstr::addRegisterDefined(unsigned Reg, +void MachineInstr::addRegisterDefined(Register Reg, const TargetRegisterInfo *RegInfo) { - if (TargetRegisterInfo::isPhysicalRegister(Reg)) { + if (Register::isPhysicalRegister(Reg)) { MachineOperand *MO = findRegisterDefOperand(Reg, false, false, RegInfo); if (MO) return; @@ -1947,7 +1949,7 @@ void MachineInstr::addRegisterDefined(unsigned Reg, true /*IsImp*/)); } -void MachineInstr::setPhysRegsDeadExcept(ArrayRef UsedRegs, +void MachineInstr::setPhysRegsDeadExcept(ArrayRef UsedRegs, const TargetRegisterInfo &TRI) { bool HasRegMask = false; for (MachineOperand &MO : operands()) { @@ -1956,18 +1958,19 @@ void MachineInstr::setPhysRegsDeadExcept(ArrayRef UsedRegs, continue; } if (!MO.isReg() || !MO.isDef()) continue; - unsigned Reg = MO.getReg(); - if (!TargetRegisterInfo::isPhysicalRegister(Reg)) continue; + Register Reg = MO.getReg(); + if (!Reg.isPhysical()) + continue; // If there are no uses, including partial uses, the def is dead. if (llvm::none_of(UsedRegs, - [&](unsigned Use) { return TRI.regsOverlap(Use, Reg); })) + [&](MCRegister Use) { return TRI.regsOverlap(Use, Reg); })) MO.setIsDead(); } // This is a call with a register mask operand. // Mask clobbers are always dead, so add defs for the non-dead defines. if (HasRegMask) - for (ArrayRef::iterator I = UsedRegs.begin(), E = UsedRegs.end(); + for (ArrayRef::iterator I = UsedRegs.begin(), E = UsedRegs.end(); I != E; ++I) addRegisterDefined(*I, &TRI); } @@ -1979,8 +1982,7 @@ MachineInstrExpressionTrait::getHashValue(const MachineInstr* const &MI) { HashComponents.reserve(MI->getNumOperands() + 1); HashComponents.push_back(MI->getOpcode()); for (const MachineOperand &MO : MI->operands()) { - if (MO.isReg() && MO.isDef() && - TargetRegisterInfo::isVirtualRegister(MO.getReg())) + if (MO.isReg() && MO.isDef() && Register::isVirtualRegister(MO.getReg())) continue; // Skip virtual register defs. 
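// Illustrative sketch, not part of the upstream change: the mechanical rewrite running
// through MachineInstr above replaces 'unsigned Reg' with the Register value type, so
// the virtual/physical distinction is queried through Register (Reg.isVirtual(),
// Reg.isPhysical(), Register::isVirtualRegister(...)) instead of static helpers on
// TargetRegisterInfo. A heavily simplified stand-in for such a wrapper, assuming the
// usual convention that 0 means "no register" and virtual registers occupy the upper
// numbering range; this is not LLVM's actual Register class.
#include <cassert>
#include <cstdint>

class RegisterSketch {
  uint32_t Reg;
  static constexpr uint32_t VirtualBit = 1u << 31;

public:
  explicit RegisterSketch(uint32_t R = 0) : Reg(R) {}
  operator uint32_t() const { return Reg; } // still usable where a number is expected
  bool isValid() const { return Reg != 0; }
  bool isVirtual() const { return (Reg & VirtualBit) != 0; }
  bool isPhysical() const { return Reg != 0 && !isVirtual(); }
  static bool isVirtualRegister(uint32_t R) { return (R & VirtualBit) != 0; }
  static bool isPhysicalRegister(uint32_t R) { return R != 0 && !(R & VirtualBit); }
};

int main() {
  RegisterSketch Phys(5), Virt((1u << 31) + 7), None;
  assert(Phys.isPhysical() && !Phys.isVirtual());
  assert(Virt.isVirtual() && !Virt.isPhysical());
  assert(!None.isValid());
  return 0;
}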
HashComponents.push_back(hash_value(MO)); @@ -2012,7 +2014,7 @@ void MachineInstr::emitError(StringRef Msg) const { MachineInstrBuilder llvm::BuildMI(MachineFunction &MF, const DebugLoc &DL, const MCInstrDesc &MCID, bool IsIndirect, - unsigned Reg, const MDNode *Variable, + Register Reg, const MDNode *Variable, const MDNode *Expr) { assert(isa(Variable) && "not a variable"); assert(cast(Expr)->isValid() && "not an expression"); @@ -2048,7 +2050,7 @@ MachineInstrBuilder llvm::BuildMI(MachineFunction &MF, const DebugLoc &DL, MachineInstrBuilder llvm::BuildMI(MachineBasicBlock &BB, MachineBasicBlock::iterator I, const DebugLoc &DL, const MCInstrDesc &MCID, - bool IsIndirect, unsigned Reg, + bool IsIndirect, Register Reg, const MDNode *Variable, const MDNode *Expr) { MachineFunction &MF = *BB.getParent(); MachineInstr *MI = BuildMI(MF, DL, MCID, IsIndirect, Reg, Variable, Expr); @@ -2118,10 +2120,24 @@ void MachineInstr::collectDebugValues( } } -void MachineInstr::changeDebugValuesDefReg(unsigned Reg) { +void MachineInstr::changeDebugValuesDefReg(Register Reg) { // Collect matching debug values. SmallVector DbgValues; - collectDebugValues(DbgValues); + + if (!getOperand(0).isReg()) + return; + + unsigned DefReg = getOperand(0).getReg(); + auto *MRI = getRegInfo(); + for (auto &MO : MRI->use_operands(DefReg)) { + auto *DI = MO.getParent(); + if (!DI->isDebugValue()) + continue; + if (DI->getOperand(0).isReg() && + DI->getOperand(0).getReg() == DefReg){ + DbgValues.push_back(DI); + } + } // Propagate Reg to debug value instructions. for (auto *DBI : DbgValues) diff --git a/lib/CodeGen/MachineInstrBundle.cpp b/lib/CodeGen/MachineInstrBundle.cpp index 32e266e9401e..feb849ced353 100644 --- a/lib/CodeGen/MachineInstrBundle.cpp +++ b/lib/CodeGen/MachineInstrBundle.cpp @@ -154,10 +154,10 @@ void llvm::finalizeBundle(MachineBasicBlock &MBB, continue; } - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); if (!Reg) continue; - assert(TargetRegisterInfo::isPhysicalRegister(Reg)); + if (LocalDefSet.count(Reg)) { MO.setIsInternalRead(); if (MO.isKill()) @@ -177,7 +177,7 @@ void llvm::finalizeBundle(MachineBasicBlock &MBB, for (unsigned i = 0, e = Defs.size(); i != e; ++i) { MachineOperand &MO = *Defs[i]; - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); if (!Reg) continue; @@ -194,7 +194,7 @@ void llvm::finalizeBundle(MachineBasicBlock &MBB, DeadDefSet.erase(Reg); } - if (!MO.isDead()) { + if (!MO.isDead() && Register::isPhysicalRegister(Reg)) { for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs) { unsigned SubReg = *SubRegs; if (LocalDefSet.insert(SubReg).second) @@ -316,7 +316,7 @@ MachineOperandIteratorBase::analyzePhysReg(unsigned Reg, bool AllDefsDead = true; PhysRegInfo PRI = {false, false, false, false, false, false, false, false}; - assert(TargetRegisterInfo::isPhysicalRegister(Reg) && + assert(Register::isPhysicalRegister(Reg) && "analyzePhysReg not given a physical register!"); for (; isValid(); ++*this) { MachineOperand &MO = deref(); @@ -329,8 +329,8 @@ MachineOperandIteratorBase::analyzePhysReg(unsigned Reg, if (!MO.isReg()) continue; - unsigned MOReg = MO.getReg(); - if (!MOReg || !TargetRegisterInfo::isPhysicalRegister(MOReg)) + Register MOReg = MO.getReg(); + if (!MOReg || !Register::isPhysicalRegister(MOReg)) continue; if (!TRI->regsOverlap(MOReg, Reg)) diff --git a/lib/CodeGen/MachineLICM.cpp b/lib/CodeGen/MachineLICM.cpp index 1107e609c258..6a898ff6ef88 100644 --- a/lib/CodeGen/MachineLICM.cpp +++ b/lib/CodeGen/MachineLICM.cpp @@ -153,7 +153,6 @@ 
namespace { AU.addRequired(); AU.addRequired(); AU.addPreserved(); - AU.addPreserved(); MachineFunctionPass::getAnalysisUsage(AU); } @@ -424,10 +423,10 @@ void MachineLICMBase::ProcessMI(MachineInstr *MI, if (!MO.isReg()) continue; - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); if (!Reg) continue; - assert(TargetRegisterInfo::isPhysicalRegister(Reg) && + assert(Register::isPhysicalRegister(Reg) && "Not expecting virtual register!"); if (!MO.isDef()) { @@ -526,7 +525,7 @@ void MachineLICMBase::HoistRegionPostRA() { for (const MachineOperand &MO : TI->operands()) { if (!MO.isReg()) continue; - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); if (!Reg) continue; for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) @@ -554,7 +553,7 @@ void MachineLICMBase::HoistRegionPostRA() { for (const MachineOperand &MO : MI->operands()) { if (!MO.isReg() || MO.isDef() || !MO.getReg()) continue; - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); if (PhysRegDefs.test(Reg) || PhysRegClobbers.test(Reg)) { // If it's using a non-loop-invariant register, then it's obviously @@ -852,8 +851,8 @@ MachineLICMBase::calcRegisterCost(const MachineInstr *MI, bool ConsiderSeen, const MachineOperand &MO = MI->getOperand(i); if (!MO.isReg() || MO.isImplicit()) continue; - unsigned Reg = MO.getReg(); - if (!TargetRegisterInfo::isVirtualRegister(Reg)) + Register Reg = MO.getReg(); + if (!Register::isVirtualRegister(Reg)) continue; // FIXME: It seems bad to use RegSeen only for some of these calculations. @@ -922,12 +921,12 @@ static bool isInvariantStore(const MachineInstr &MI, // Check that all register operands are caller-preserved physical registers. for (const MachineOperand &MO : MI.operands()) { if (MO.isReg()) { - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); // If operand is a virtual register, check if it comes from a copy of a // physical register. - if (TargetRegisterInfo::isVirtualRegister(Reg)) + if (Register::isVirtualRegister(Reg)) Reg = TRI->lookThruCopyLike(MO.getReg(), MRI); - if (TargetRegisterInfo::isVirtualRegister(Reg)) + if (Register::isVirtualRegister(Reg)) return false; if (!TRI->isCallerPreservedPhysReg(Reg, *MI.getMF())) return false; @@ -955,17 +954,17 @@ static bool isCopyFeedingInvariantStore(const MachineInstr &MI, const MachineFunction *MF = MI.getMF(); // Check that we are copying a constant physical register. - unsigned CopySrcReg = MI.getOperand(1).getReg(); - if (TargetRegisterInfo::isVirtualRegister(CopySrcReg)) + Register CopySrcReg = MI.getOperand(1).getReg(); + if (Register::isVirtualRegister(CopySrcReg)) return false; if (!TRI->isCallerPreservedPhysReg(CopySrcReg, *MF)) return false; - unsigned CopyDstReg = MI.getOperand(0).getReg(); + Register CopyDstReg = MI.getOperand(0).getReg(); // Check if any of the uses of the copy are invariant stores. - assert (TargetRegisterInfo::isVirtualRegister(CopyDstReg) && - "copy dst is not a virtual reg"); + assert(Register::isVirtualRegister(CopyDstReg) && + "copy dst is not a virtual reg"); for (MachineInstr &UseMI : MRI->use_instructions(CopyDstReg)) { if (UseMI.mayStore() && isInvariantStore(UseMI, TRI, MRI)) @@ -1010,11 +1009,11 @@ bool MachineLICMBase::IsLoopInvariantInst(MachineInstr &I) { if (!MO.isReg()) continue; - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); if (Reg == 0) continue; // Don't hoist an instruction that uses or defines a physical register. 
- if (TargetRegisterInfo::isPhysicalRegister(Reg)) { + if (Register::isPhysicalRegister(Reg)) { if (MO.isUse()) { // If the physreg has no defs anywhere, it's just an ambient register // and we can freely move its uses. Alternatively, if it's allocatable, @@ -1061,8 +1060,8 @@ bool MachineLICMBase::HasLoopPHIUse(const MachineInstr *MI) const { for (const MachineOperand &MO : MI->operands()) { if (!MO.isReg() || !MO.isDef()) continue; - unsigned Reg = MO.getReg(); - if (!TargetRegisterInfo::isVirtualRegister(Reg)) + Register Reg = MO.getReg(); + if (!Register::isVirtualRegister(Reg)) continue; for (MachineInstr &UseMI : MRI->use_instructions(Reg)) { // A PHI may cause a copy to be inserted. @@ -1104,7 +1103,7 @@ bool MachineLICMBase::HasHighOperandLatency(MachineInstr &MI, const MachineOperand &MO = UseMI.getOperand(i); if (!MO.isReg() || !MO.isUse()) continue; - unsigned MOReg = MO.getReg(); + Register MOReg = MO.getReg(); if (MOReg != Reg) continue; @@ -1132,8 +1131,8 @@ bool MachineLICMBase::IsCheapInstruction(MachineInstr &MI) const { if (!DefMO.isReg() || !DefMO.isDef()) continue; --NumDefs; - unsigned Reg = DefMO.getReg(); - if (TargetRegisterInfo::isPhysicalRegister(Reg)) + Register Reg = DefMO.getReg(); + if (Register::isPhysicalRegister(Reg)) continue; if (!TII->hasLowDefLatency(SchedModel, MI, i)) @@ -1225,8 +1224,8 @@ bool MachineLICMBase::IsProfitableToHoist(MachineInstr &MI) { const MachineOperand &MO = MI.getOperand(i); if (!MO.isReg() || MO.isImplicit()) continue; - unsigned Reg = MO.getReg(); - if (!TargetRegisterInfo::isVirtualRegister(Reg)) + Register Reg = MO.getReg(); + if (!Register::isVirtualRegister(Reg)) continue; if (MO.isDef() && HasHighOperandLatency(MI, i, Reg)) { LLVM_DEBUG(dbgs() << "Hoist High Latency: " << MI); @@ -1304,7 +1303,7 @@ MachineInstr *MachineLICMBase::ExtractHoistableLoad(MachineInstr *MI) { MachineFunction &MF = *MI->getMF(); const TargetRegisterClass *RC = TII->getRegClass(MID, LoadRegIndex, TRI, MF); // Ok, we're unfolding. Create a temporary register and do the unfold. - unsigned Reg = MRI->createVirtualRegister(RC); + Register Reg = MRI->createVirtualRegister(RC); SmallVector NewMIs; bool Success = TII->unfoldMemoryOperand(MF, *MI, Reg, @@ -1378,20 +1377,20 @@ bool MachineLICMBase::EliminateCSE(MachineInstr *MI, // Physical registers may not differ here. 
assert((!MO.isReg() || MO.getReg() == 0 || - !TargetRegisterInfo::isPhysicalRegister(MO.getReg()) || + !Register::isPhysicalRegister(MO.getReg()) || MO.getReg() == Dup->getOperand(i).getReg()) && "Instructions with different phys regs are not identical!"); if (MO.isReg() && MO.isDef() && - !TargetRegisterInfo::isPhysicalRegister(MO.getReg())) + !Register::isPhysicalRegister(MO.getReg())) Defs.push_back(i); } SmallVector OrigRCs; for (unsigned i = 0, e = Defs.size(); i != e; ++i) { unsigned Idx = Defs[i]; - unsigned Reg = MI->getOperand(Idx).getReg(); - unsigned DupReg = Dup->getOperand(Idx).getReg(); + Register Reg = MI->getOperand(Idx).getReg(); + Register DupReg = Dup->getOperand(Idx).getReg(); OrigRCs.push_back(MRI->getRegClass(DupReg)); if (!MRI->constrainRegClass(DupReg, MRI->getRegClass(Reg))) { @@ -1403,8 +1402,8 @@ bool MachineLICMBase::EliminateCSE(MachineInstr *MI, } for (unsigned Idx : Defs) { - unsigned Reg = MI->getOperand(Idx).getReg(); - unsigned DupReg = Dup->getOperand(Idx).getReg(); + Register Reg = MI->getOperand(Idx).getReg(); + Register DupReg = Dup->getOperand(Idx).getReg(); MRI->replaceRegWith(Reg, DupReg); MRI->clearKillFlags(DupReg); } diff --git a/lib/CodeGen/MachineLoopUtils.cpp b/lib/CodeGen/MachineLoopUtils.cpp new file mode 100644 index 000000000000..e074b76082f0 --- /dev/null +++ b/lib/CodeGen/MachineLoopUtils.cpp @@ -0,0 +1,132 @@ +//=- MachineLoopUtils.cpp - Functions for manipulating loops ----------------=// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/CodeGen/MachineLoopUtils.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/TargetInstrInfo.h" +using namespace llvm; + +namespace { +// MI's parent and BB are clones of each other. Find the equivalent copy of MI +// in BB. +MachineInstr &findEquivalentInstruction(MachineInstr &MI, + MachineBasicBlock *BB) { + MachineBasicBlock *PB = MI.getParent(); + unsigned Offset = std::distance(PB->instr_begin(), MachineBasicBlock::instr_iterator(MI)); + return *std::next(BB->instr_begin(), Offset); +} +} // namespace + +MachineBasicBlock *llvm::PeelSingleBlockLoop(LoopPeelDirection Direction, + MachineBasicBlock *Loop, + MachineRegisterInfo &MRI, + const TargetInstrInfo *TII) { + MachineFunction &MF = *Loop->getParent(); + MachineBasicBlock *Preheader = *Loop->pred_begin(); + if (Preheader == Loop) + Preheader = *std::next(Loop->pred_begin()); + MachineBasicBlock *Exit = *Loop->succ_begin(); + if (Exit == Loop) + Exit = *std::next(Loop->succ_begin()); + + MachineBasicBlock *NewBB = MF.CreateMachineBasicBlock(Loop->getBasicBlock()); + if (Direction == LPD_Front) + MF.insert(Loop->getIterator(), NewBB); + else + MF.insert(std::next(Loop->getIterator()), NewBB); + + // FIXME: Add DenseMapInfo trait for Register so we can use it as a key. 
+ DenseMap Remaps; + auto InsertPt = NewBB->end(); + for (MachineInstr &MI : *Loop) { + MachineInstr *NewMI = MF.CloneMachineInstr(&MI); + NewBB->insert(InsertPt, NewMI); + for (MachineOperand &MO : NewMI->defs()) { + Register OrigR = MO.getReg(); + if (OrigR.isPhysical()) + continue; + Register &R = Remaps[OrigR]; + R = MRI.createVirtualRegister(MRI.getRegClass(OrigR)); + MO.setReg(R); + + if (Direction == LPD_Back) { + // Replace all uses outside the original loop with the new register. + // FIXME: is the use_iterator stable enough to mutate register uses + // while iterating? + SmallVector Uses; + for (auto &Use : MRI.use_operands(OrigR)) + if (Use.getParent()->getParent() != Loop) + Uses.push_back(&Use); + for (auto *Use : Uses) { + MRI.constrainRegClass(R, MRI.getRegClass(Use->getReg())); + Use->setReg(R); + } + } + } + } + + for (auto I = NewBB->getFirstNonPHI(); I != NewBB->end(); ++I) + for (MachineOperand &MO : I->uses()) + if (MO.isReg() && Remaps.count(MO.getReg())) + MO.setReg(Remaps[MO.getReg()]); + + for (auto I = NewBB->begin(); I->isPHI(); ++I) { + MachineInstr &MI = *I; + unsigned LoopRegIdx = 3, InitRegIdx = 1; + if (MI.getOperand(2).getMBB() != Preheader) + std::swap(LoopRegIdx, InitRegIdx); + MachineInstr &OrigPhi = findEquivalentInstruction(MI, Loop); + assert(OrigPhi.isPHI()); + if (Direction == LPD_Front) { + // When peeling front, we are only left with the initial value from the + // preheader. + Register R = MI.getOperand(LoopRegIdx).getReg(); + if (Remaps.count(R)) + R = Remaps[R]; + OrigPhi.getOperand(InitRegIdx).setReg(R); + MI.RemoveOperand(LoopRegIdx + 1); + MI.RemoveOperand(LoopRegIdx + 0); + } else { + // When peeling back, the initial value is the loop-carried value from + // the original loop. + Register LoopReg = OrigPhi.getOperand(LoopRegIdx).getReg(); + MI.getOperand(LoopRegIdx).setReg(LoopReg); + MI.RemoveOperand(InitRegIdx + 1); + MI.RemoveOperand(InitRegIdx + 0); + } + } + + DebugLoc DL; + if (Direction == LPD_Front) { + Preheader->replaceSuccessor(Loop, NewBB); + NewBB->addSuccessor(Loop); + Loop->replacePhiUsesWith(Preheader, NewBB); + if (TII->removeBranch(*Preheader) > 0) + TII->insertBranch(*Preheader, NewBB, nullptr, {}, DL); + TII->removeBranch(*NewBB); + TII->insertBranch(*NewBB, Loop, nullptr, {}, DL); + } else { + Loop->replaceSuccessor(Exit, NewBB); + Exit->replacePhiUsesWith(Loop, NewBB); + NewBB->addSuccessor(Exit); + + MachineBasicBlock *TBB = nullptr, *FBB = nullptr; + SmallVector Cond; + bool CanAnalyzeBr = !TII->analyzeBranch(*Loop, TBB, FBB, Cond); + (void)CanAnalyzeBr; + assert(CanAnalyzeBr && "Must be able to analyze the loop branch!"); + TII->removeBranch(*Loop); + TII->insertBranch(*Loop, TBB == Exit ? NewBB : TBB, + FBB == Exit ? NewBB : FBB, Cond, DL); + if (TII->removeBranch(*NewBB) > 0) + TII->insertBranch(*NewBB, Exit, nullptr, {}, DL); + } + + return NewBB; +} diff --git a/lib/CodeGen/MachineModuleInfo.cpp b/lib/CodeGen/MachineModuleInfo.cpp index aadcd7319799..e0b4e9cac229 100644 --- a/lib/CodeGen/MachineModuleInfo.cpp +++ b/lib/CodeGen/MachineModuleInfo.cpp @@ -36,11 +36,6 @@ using namespace llvm; using namespace llvm::dwarf; -// Handle the Pass registration stuff necessary to use DataLayout's. -INITIALIZE_PASS(MachineModuleInfo, "machinemoduleinfo", - "Machine Module Information", false, false) -char MachineModuleInfo::ID = 0; - // Out of line virtual method. 
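// Illustrative sketch, not part of the upstream change: PeelSingleBlockLoop above
// clones the single loop block either in front of the loop (LPD_Front, so one
// iteration runs before the loop is entered) or behind it (LPD_Back, so the final
// iteration runs after the loop exits), remapping the cloned virtual registers and
// rewiring PHIs and branches. At the source level the two peeling directions
// correspond roughly to the rewrites below; this is only an analogy, since the pass
// itself works on MachineIR rather than on C++ loops.
#include <cstddef>
#include <vector>

int sumFrontPeeled(const std::vector<int> &V) {
  int Sum = 0;
  std::size_t I = 0;
  if (I != V.size()) { // peeled first iteration (LPD_Front)
    Sum += V[I];
    ++I;
  }
  for (; I != V.size(); ++I) // remaining iterations stay in the loop
    Sum += V[I];
  return Sum;
}

int sumBackPeeled(const std::vector<int> &V) {
  int Sum = 0;
  std::size_t I = 0;
  for (; I + 1 < V.size(); ++I) // the loop runs all but the last iteration
    Sum += V[I];
  if (I != V.size()) // peeled last iteration (LPD_Back)
    Sum += V[I];
  return Sum;
}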
MachineModuleInfoImpl::~MachineModuleInfoImpl() = default; @@ -121,7 +116,7 @@ ArrayRef MMIAddrLabelMap::getAddrLabelSymbolToEmit(BasicBlock *BB) { BBCallbacks.back().setMap(this); Entry.Index = BBCallbacks.size() - 1; Entry.Fn = BB->getParent(); - Entry.Symbols.push_back(Context.createTempSymbol()); + Entry.Symbols.push_back(Context.createTempSymbol(!BB->hasAddressTaken())); return Entry.Symbols; } @@ -193,27 +188,15 @@ void MMIAddrLabelMapCallbackPtr::allUsesReplacedWith(Value *V2) { Map->UpdateForRAUWBlock(cast(getValPtr()), cast(V2)); } -MachineModuleInfo::MachineModuleInfo(const LLVMTargetMachine *TM) - : ImmutablePass(ID), TM(*TM), - Context(TM->getMCAsmInfo(), TM->getMCRegisterInfo(), - TM->getObjFileLowering(), nullptr, false) { - initializeMachineModuleInfoPass(*PassRegistry::getPassRegistry()); -} - -MachineModuleInfo::~MachineModuleInfo() = default; - -bool MachineModuleInfo::doInitialization(Module &M) { +void MachineModuleInfo::initialize() { ObjFileMMI = nullptr; CurCallSite = 0; UsesMSVCFloatingPoint = UsesMorestackAddr = false; HasSplitStack = HasNosplitStack = false; AddrLabelSymbols = nullptr; - TheModule = &M; - DbgInfoAvailable = !llvm::empty(M.debug_compile_units()); - return false; } -bool MachineModuleInfo::doFinalization(Module &M) { +void MachineModuleInfo::finalize() { Personalities.clear(); delete AddrLabelSymbols; @@ -223,10 +206,30 @@ bool MachineModuleInfo::doFinalization(Module &M) { delete ObjFileMMI; ObjFileMMI = nullptr; +} - return false; +MachineModuleInfo::MachineModuleInfo(MachineModuleInfo &&MMI) + : TM(std::move(MMI.TM)), + Context(MMI.TM.getMCAsmInfo(), MMI.TM.getMCRegisterInfo(), + MMI.TM.getObjFileLowering(), nullptr, nullptr, false) { + ObjFileMMI = MMI.ObjFileMMI; + CurCallSite = MMI.CurCallSite; + UsesMSVCFloatingPoint = MMI.UsesMSVCFloatingPoint; + UsesMorestackAddr = MMI.UsesMorestackAddr; + HasSplitStack = MMI.HasSplitStack; + HasNosplitStack = MMI.HasNosplitStack; + AddrLabelSymbols = MMI.AddrLabelSymbols; + TheModule = MMI.TheModule; } +MachineModuleInfo::MachineModuleInfo(const LLVMTargetMachine *TM) + : TM(*TM), Context(TM->getMCAsmInfo(), TM->getMCRegisterInfo(), + TM->getObjFileLowering(), nullptr, nullptr, false) { + initialize(); +} + +MachineModuleInfo::~MachineModuleInfo() { finalize(); } + //===- Address of Block Management ----------------------------------------===// ArrayRef @@ -305,12 +308,13 @@ public: FreeMachineFunction() : FunctionPass(ID) {} void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired(); - AU.addPreserved(); + AU.addRequired(); + AU.addPreserved(); } bool runOnFunction(Function &F) override { - MachineModuleInfo &MMI = getAnalysis(); + MachineModuleInfo &MMI = + getAnalysis().getMMI(); MMI.deleteMachineFunctionFor(F); return true; } @@ -327,3 +331,36 @@ char FreeMachineFunction::ID; FunctionPass *llvm::createFreeMachineFunctionPass() { return new FreeMachineFunction(); } + +MachineModuleInfoWrapperPass::MachineModuleInfoWrapperPass( + const LLVMTargetMachine *TM) + : ImmutablePass(ID), MMI(TM) { + initializeMachineModuleInfoWrapperPassPass(*PassRegistry::getPassRegistry()); +} + +// Handle the Pass registration stuff necessary to use DataLayout's. 
+INITIALIZE_PASS(MachineModuleInfoWrapperPass, "machinemoduleinfo", + "Machine Module Information", false, false) +char MachineModuleInfoWrapperPass::ID = 0; + +bool MachineModuleInfoWrapperPass::doInitialization(Module &M) { + MMI.initialize(); + MMI.TheModule = &M; + MMI.DbgInfoAvailable = !M.debug_compile_units().empty(); + return false; +} + +bool MachineModuleInfoWrapperPass::doFinalization(Module &M) { + MMI.finalize(); + return false; +} + +AnalysisKey MachineModuleAnalysis::Key; + +MachineModuleInfo MachineModuleAnalysis::run(Module &M, + ModuleAnalysisManager &) { + MachineModuleInfo MMI(TM); + MMI.TheModule = &M; + MMI.DbgInfoAvailable = !M.debug_compile_units().empty(); + return MMI; +} diff --git a/lib/CodeGen/MachineOperand.cpp b/lib/CodeGen/MachineOperand.cpp index 4fa4ea7f6cf5..8b19501ec3cf 100644 --- a/lib/CodeGen/MachineOperand.cpp +++ b/lib/CodeGen/MachineOperand.cpp @@ -49,7 +49,7 @@ static MachineFunction *getMFIfAvailable(MachineOperand &MO) { getMFIfAvailable(const_cast(MO))); } -void MachineOperand::setReg(unsigned Reg) { +void MachineOperand::setReg(Register Reg) { if (getReg() == Reg) return; // No change. @@ -71,9 +71,9 @@ void MachineOperand::setReg(unsigned Reg) { SmallContents.RegNo = Reg; } -void MachineOperand::substVirtReg(unsigned Reg, unsigned SubIdx, +void MachineOperand::substVirtReg(Register Reg, unsigned SubIdx, const TargetRegisterInfo &TRI) { - assert(TargetRegisterInfo::isVirtualRegister(Reg)); + assert(Reg.isVirtual()); if (SubIdx && getSubReg()) SubIdx = TRI.composeSubRegIndices(SubIdx, getSubReg()); setReg(Reg); @@ -81,8 +81,8 @@ void MachineOperand::substVirtReg(unsigned Reg, unsigned SubIdx, setSubReg(SubIdx); } -void MachineOperand::substPhysReg(unsigned Reg, const TargetRegisterInfo &TRI) { - assert(TargetRegisterInfo::isPhysicalRegister(Reg)); +void MachineOperand::substPhysReg(MCRegister Reg, const TargetRegisterInfo &TRI) { + assert(Reg.isPhysical()); if (getSubReg()) { Reg = TRI.getSubReg(Reg, getSubReg()); // Note that getSubReg() may return 0 if the sub-register doesn't exist. 
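// Illustrative sketch, not part of the upstream change: the MachineModuleInfo rework
// above turns MMI into a plain data object. Legacy pass-manager clients now reach it
// through MachineModuleInfoWrapperPass (hence the getAnalysis<...>().getMMI() call
// sites elsewhere in this patch), and the new pass manager builds it with
// MachineModuleAnalysis. A stripped-down version of that "wrapper pass owns the
// result" pattern; all names below are invented for the example and none of the real
// pass-manager plumbing is shown.
#include <string>

struct ModuleInfoResultSketch {     // plays the role of MachineModuleInfo
  std::string ModuleName;
  bool DbgInfoAvailable = false;
};

class ModuleInfoWrapperPassSketch { // plays the role of the legacy wrapper pass
  ModuleInfoResultSketch MMI;

public:
  bool doInitialization(const std::string &ModuleName) {
    MMI.ModuleName = ModuleName;    // set up the owned result once per module
    return false;                   // "did not modify the module"
  }
  ModuleInfoResultSketch &getMMI() { return MMI; }
};

// A machine-function pass body would then be written against the wrapper:
bool runOnFunctionSketch(ModuleInfoWrapperPassSketch &WP) {
  ModuleInfoResultSketch &MMI = WP.getMMI(); // analogous to getAnalysis<...>().getMMI()
  return !MMI.ModuleName.empty();
}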
@@ -114,7 +114,7 @@ void MachineOperand::setIsDef(bool Val) { bool MachineOperand::isRenamable() const { assert(isReg() && "Wrong MachineOperand accessor"); - assert(TargetRegisterInfo::isPhysicalRegister(getReg()) && + assert(Register::isPhysicalRegister(getReg()) && "isRenamable should only be checked on physical registers"); if (!IsRenamable) return false; @@ -132,7 +132,7 @@ bool MachineOperand::isRenamable() const { void MachineOperand::setIsRenamable(bool Val) { assert(isReg() && "Wrong MachineOperand accessor"); - assert(TargetRegisterInfo::isPhysicalRegister(getReg()) && + assert(Register::isPhysicalRegister(getReg()) && "setIsRenamable should only be called on physical registers"); IsRenamable = Val; } @@ -169,7 +169,7 @@ void MachineOperand::ChangeToFPImmediate(const ConstantFP *FPImm) { } void MachineOperand::ChangeToES(const char *SymName, - unsigned char TargetFlags) { + unsigned TargetFlags) { assert((!isReg() || !isTied()) && "Cannot change a tied operand into an external symbol"); @@ -182,7 +182,7 @@ void MachineOperand::ChangeToES(const char *SymName, } void MachineOperand::ChangeToGA(const GlobalValue *GV, int64_t Offset, - unsigned char TargetFlags) { + unsigned TargetFlags) { assert((!isReg() || !isTied()) && "Cannot change a tied operand into a global address"); @@ -215,7 +215,7 @@ void MachineOperand::ChangeToFrameIndex(int Idx) { } void MachineOperand::ChangeToTargetIndex(unsigned Idx, int64_t Offset, - unsigned char TargetFlags) { + unsigned TargetFlags) { assert((!isReg() || !isTied()) && "Cannot change a tied operand into a FrameIndex"); @@ -230,7 +230,7 @@ void MachineOperand::ChangeToTargetIndex(unsigned Idx, int64_t Offset, /// ChangeToRegister - Replace this operand with a new register operand of /// the specified value. If an operand is known to be an register already, /// the setReg method should be used. 
-void MachineOperand::ChangeToRegister(unsigned Reg, bool isDef, bool isImp, +void MachineOperand::ChangeToRegister(Register Reg, bool isDef, bool isImp, bool isKill, bool isDead, bool isUndef, bool isDebug) { MachineRegisterInfo *RegInfo = nullptr; @@ -333,6 +333,8 @@ bool MachineOperand::isIdenticalTo(const MachineOperand &Other) const { return getIntrinsicID() == Other.getIntrinsicID(); case MachineOperand::MO_Predicate: return getPredicate() == Other.getPredicate(); + case MachineOperand::MO_ShuffleMask: + return getShuffleMask() == Other.getShuffleMask(); } llvm_unreachable("Invalid machine operand type"); } @@ -381,6 +383,8 @@ hash_code llvm::hash_value(const MachineOperand &MO) { return hash_combine(MO.getType(), MO.getTargetFlags(), MO.getIntrinsicID()); case MachineOperand::MO_Predicate: return hash_combine(MO.getType(), MO.getTargetFlags(), MO.getPredicate()); + case MachineOperand::MO_ShuffleMask: + return hash_combine(MO.getType(), MO.getTargetFlags(), MO.getShuffleMask()); } llvm_unreachable("Invalid machine operand type"); } @@ -425,12 +429,10 @@ static void printCFIRegister(unsigned DwarfReg, raw_ostream &OS, return; } - int Reg = TRI->getLLVMRegNum(DwarfReg, true); - if (Reg == -1) { + if (Optional Reg = TRI->getLLVMRegNum(DwarfReg, true)) + OS << printReg(*Reg, TRI); + else OS << ""; - return; - } - OS << printReg(Reg, TRI); } static void printIRBlockReference(raw_ostream &OS, const BasicBlock &BB, @@ -746,7 +748,7 @@ void MachineOperand::print(raw_ostream &OS, ModuleSlotTracker &MST, printTargetFlags(OS, *this); switch (getType()) { case MachineOperand::MO_Register: { - unsigned Reg = getReg(); + Register Reg = getReg(); if (isImplicit()) OS << (isDef() ? "implicit-def " : "implicit "); else if (PrintDef && isDef()) @@ -762,13 +764,13 @@ void MachineOperand::print(raw_ostream &OS, ModuleSlotTracker &MST, OS << "undef "; if (isEarlyClobber()) OS << "early-clobber "; - if (TargetRegisterInfo::isPhysicalRegister(getReg()) && isRenamable()) + if (Register::isPhysicalRegister(getReg()) && isRenamable()) OS << "renamable "; // isDebug() is exactly true for register operands of a DBG_VALUE. So we // simply infer it when parsing and do not need to print it. const MachineRegisterInfo *MRI = nullptr; - if (TargetRegisterInfo::isVirtualRegister(Reg)) { + if (Register::isVirtualRegister(Reg)) { if (const MachineFunction *MF = getMFIfAvailable(*this)) { MRI = &MF->getRegInfo(); } @@ -783,7 +785,7 @@ void MachineOperand::print(raw_ostream &OS, ModuleSlotTracker &MST, OS << ".subreg" << SubReg; } // Print the register class / bank. 
- if (TargetRegisterInfo::isVirtualRegister(Reg)) { + if (Register::isVirtualRegister(Reg)) { if (const MachineFunction *MF = getMFIfAvailable(*this)) { const MachineRegisterInfo &MRI = MF->getRegInfo(); if (IsStandalone || !PrintDef || MRI.def_empty(Reg)) { @@ -936,6 +938,20 @@ void MachineOperand::print(raw_ostream &OS, ModuleSlotTracker &MST, << CmpInst::getPredicateName(Pred) << ')'; break; } + case MachineOperand::MO_ShuffleMask: + OS << "shufflemask("; + const Constant* C = getShuffleMask(); + const int NumElts = C->getType()->getVectorNumElements(); + + StringRef Separator; + for (int I = 0; I != NumElts; ++I) { + OS << Separator; + C->getAggregateElement(I)->printAsOperand(OS, false, MST); + Separator = ", "; + } + + OS << ')'; + break; } } @@ -963,7 +979,8 @@ bool MachinePointerInfo::isDereferenceable(unsigned Size, LLVMContext &C, return false; return isDereferenceableAndAlignedPointer( - BasePtr, 1, APInt(DL.getPointerSizeInBits(), Offset + Size), DL); + BasePtr, Align::None(), APInt(DL.getPointerSizeInBits(), Offset + Size), + DL); } /// getConstantPool - Return a MachinePointerInfo record that refers to the @@ -1049,17 +1066,6 @@ uint64_t MachineMemOperand::getAlignment() const { return MinAlign(getBaseAlignment(), getOffset()); } -void MachineMemOperand::print(raw_ostream &OS) const { - ModuleSlotTracker DummyMST(nullptr); - print(OS, DummyMST); -} - -void MachineMemOperand::print(raw_ostream &OS, ModuleSlotTracker &MST) const { - SmallVector SSNs; - LLVMContext Ctx; - print(OS, MST, SSNs, Ctx, nullptr, nullptr); -} - void MachineMemOperand::print(raw_ostream &OS, ModuleSlotTracker &MST, SmallVectorImpl &SSNs, const LLVMContext &Context, diff --git a/lib/CodeGen/MachineOptimizationRemarkEmitter.cpp b/lib/CodeGen/MachineOptimizationRemarkEmitter.cpp index 27db9106b337..b82403ae1b85 100644 --- a/lib/CodeGen/MachineOptimizationRemarkEmitter.cpp +++ b/lib/CodeGen/MachineOptimizationRemarkEmitter.cpp @@ -76,7 +76,7 @@ bool MachineOptimizationRemarkEmitterPass::runOnMachineFunction( else MBFI = nullptr; - ORE = llvm::make_unique(MF, MBFI); + ORE = std::make_unique(MF, MBFI); return false; } diff --git a/lib/CodeGen/MachineOutliner.cpp b/lib/CodeGen/MachineOutliner.cpp index 80a235aeaa5c..8cd66825a58a 100644 --- a/lib/CodeGen/MachineOutliner.cpp +++ b/lib/CodeGen/MachineOutliner.cpp @@ -846,8 +846,8 @@ struct MachineOutliner : public ModulePass { StringRef getPassName() const override { return "Machine Outliner"; } void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired(); - AU.addPreserved(); + AU.addRequired(); + AU.addPreserved(); AU.setPreservesAll(); ModulePass::getAnalysisUsage(AU); } @@ -1128,7 +1128,7 @@ MachineOutliner::createOutlinedFunction(Module &M, OutlinedFunction &OF, IRBuilder<> Builder(EntryBB); Builder.CreateRetVoid(); - MachineModuleInfo &MMI = getAnalysis(); + MachineModuleInfo &MMI = getAnalysis().getMMI(); MachineFunction &MF = MMI.getOrCreateMachineFunction(*F); MachineBasicBlock &MBB = *MF.CreateMachineBasicBlock(); const TargetSubtargetInfo &STI = MF.getSubtarget(); @@ -1260,7 +1260,7 @@ bool MachineOutliner::outline(Module &M, true /* isImp = true */)); } if (MI.isCall()) - MI.getMF()->updateCallSiteInfo(&MI); + MI.getMF()->eraseCallSiteInfo(&MI); }; // Copy over the defs in the outlined range. 
// First inst in outlined range <-- Anything that's defined in this @@ -1303,6 +1303,12 @@ void MachineOutliner::populateMapper(InstructionMapper &Mapper, Module &M, if (F.empty()) continue; + // Disable outlining from noreturn functions right now. Noreturn requires + // special handling for the case where what we are outlining could be a + // tail call. + if (F.hasFnAttribute(Attribute::NoReturn)) + continue; + // There's something in F. Check if it has a MachineFunction associated with // it. MachineFunction *MF = MMI.getMachineFunction(F); @@ -1421,7 +1427,7 @@ bool MachineOutliner::runOnModule(Module &M) { if (M.empty()) return false; - MachineModuleInfo &MMI = getAnalysis(); + MachineModuleInfo &MMI = getAnalysis().getMMI(); // If the user passed -enable-machine-outliner=always or // -enable-machine-outliner, the pass will run on all functions in the module. diff --git a/lib/CodeGen/MachinePipeliner.cpp b/lib/CodeGen/MachinePipeliner.cpp index 54df522d371a..89c9f6093a97 100644 --- a/lib/CodeGen/MachinePipeliner.cpp +++ b/lib/CodeGen/MachinePipeliner.cpp @@ -56,6 +56,7 @@ #include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachinePipeliner.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/ModuloSchedule.h" #include "llvm/CodeGen/RegisterPressure.h" #include "llvm/CodeGen/ScheduleDAG.h" #include "llvm/CodeGen/ScheduleDAGMutation.h" @@ -153,6 +154,17 @@ static cl::opt SwpShowResMask("pipeliner-show-mask", cl::Hidden, static cl::opt SwpDebugResource("pipeliner-dbg-res", cl::Hidden, cl::init(false)); +static cl::opt EmitTestAnnotations( + "pipeliner-annotate-for-testing", cl::Hidden, cl::init(false), + cl::desc("Instead of emitting the pipelined code, annotate instructions " + "with the generated schedule for feeding into the " + "-modulo-schedule-test pass")); + +static cl::opt ExperimentalCodeGen( + "pipeliner-experimental-cg", cl::Hidden, cl::init(false), + cl::desc( + "Use the experimental peeling code generator for software pipelining")); + namespace llvm { // A command line option to enable the CopyToPhi DAG mutation. @@ -314,7 +326,7 @@ bool MachinePipeliner::canPipelineLoop(MachineLoop &L) { LI.LoopInductionVar = nullptr; LI.LoopCompare = nullptr; - if (TII->analyzeLoop(L, LI.LoopInductionVar, LI.LoopCompare)) { + if (!TII->analyzeLoopForPipelining(L.getTopBlock())) { LLVM_DEBUG( dbgs() << "Unable to analyzeLoop, can NOT pipeline current Loop\n"); NumFailLoop++; @@ -349,7 +361,7 @@ void MachinePipeliner::preprocessPhiNodes(MachineBasicBlock &B) { // If the operand uses a subregister, replace it with a new register // without subregisters, and generate a copy to the new register. - unsigned NewReg = MRI.createVirtualRegister(RC); + Register NewReg = MRI.createVirtualRegister(RC); MachineBasicBlock &PredB = *PI.getOperand(i+1).getMBB(); MachineBasicBlock::iterator At = PredB.getFirstTerminator(); const DebugLoc &DL = PredB.findDebugLoc(At); @@ -515,14 +527,49 @@ void SwingSchedulerDAG::schedule() { return; } - generatePipelinedLoop(Schedule); + // Generate the schedule as a ModuloSchedule. 
+ DenseMap Cycles, Stages; + std::vector OrderedInsts; + for (int Cycle = Schedule.getFirstCycle(); Cycle <= Schedule.getFinalCycle(); + ++Cycle) { + for (SUnit *SU : Schedule.getInstructions(Cycle)) { + OrderedInsts.push_back(SU->getInstr()); + Cycles[SU->getInstr()] = Cycle; + Stages[SU->getInstr()] = Schedule.stageScheduled(SU); + } + } + DenseMap> NewInstrChanges; + for (auto &KV : NewMIs) { + Cycles[KV.first] = Cycles[KV.second]; + Stages[KV.first] = Stages[KV.second]; + NewInstrChanges[KV.first] = InstrChanges[getSUnit(KV.first)]; + } + + ModuloSchedule MS(MF, &Loop, std::move(OrderedInsts), std::move(Cycles), + std::move(Stages)); + if (EmitTestAnnotations) { + assert(NewInstrChanges.empty() && + "Cannot serialize a schedule with InstrChanges!"); + ModuloScheduleTestAnnotater MSTI(MF, MS); + MSTI.annotate(); + return; + } + // The experimental code generator can't work if there are InstChanges. + if (ExperimentalCodeGen && NewInstrChanges.empty()) { + PeelingModuloScheduleExpander MSE(MF, MS, &LIS); + MSE.expand(); + } else { + ModuloScheduleExpander MSE(MF, MS, LIS, std::move(NewInstrChanges)); + MSE.expand(); + MSE.cleanup(); + } ++NumPipelined; } /// Clean up after the software pipeliner runs. void SwingSchedulerDAG::finishBlock() { - for (MachineInstr *I : NewMIs) - MF.DeleteMachineInstr(I); + for (auto &KV : NewMIs) + MF.DeleteMachineInstr(KV.second); NewMIs.clear(); // Call the superclass. @@ -546,14 +593,6 @@ static void getPhiRegs(MachineInstr &Phi, MachineBasicBlock *Loop, assert(InitVal != 0 && LoopVal != 0 && "Unexpected Phi structure."); } -/// Return the Phi register value that comes from the incoming block. -static unsigned getInitPhiReg(MachineInstr &Phi, MachineBasicBlock *LoopBB) { - for (unsigned i = 1, e = Phi.getNumOperands(); i != e; i += 2) - if (Phi.getOperand(i + 1).getMBB() != LoopBB) - return Phi.getOperand(i).getReg(); - return 0; -} - /// Return the Phi register value that comes the loop block. static unsigned getLoopPhiReg(MachineInstr &Phi, MachineBasicBlock *LoopBB) { for (unsigned i = 1, e = Phi.getNumOperands(); i != e; i += 2) @@ -658,7 +697,7 @@ void SwingSchedulerDAG::addLoopCarriedDependences(AliasAnalysis *AA) { TII->getMemOperandWithOffset(MI, BaseOp2, Offset2, TRI)) { if (BaseOp1->isIdenticalTo(*BaseOp2) && (int)Offset1 < (int)Offset2) { - assert(TII->areMemAccessesTriviallyDisjoint(LdMI, MI, AA) && + assert(TII->areMemAccessesTriviallyDisjoint(LdMI, MI) && "What happened to the chain edge?"); SDep Dep(Load, SDep::Barrier); Dep.setLatency(1); @@ -730,7 +769,7 @@ void SwingSchedulerDAG::updatePhiDependences() { MOI != MOE; ++MOI) { if (!MOI->isReg()) continue; - unsigned Reg = MOI->getReg(); + Register Reg = MOI->getReg(); if (MOI->isDef()) { // If the register is used by a Phi, then create an anti dependence. for (MachineRegisterInfo::use_instr_iterator @@ -809,7 +848,7 @@ void SwingSchedulerDAG::changeDependences() { continue; // Get the MI and SUnit for the instruction that defines the original base. 
- unsigned OrigBase = I.getInstr()->getOperand(BasePos).getReg(); + Register OrigBase = I.getInstr()->getOperand(BasePos).getReg(); MachineInstr *DefMI = MRI.getUniqueVRegDef(OrigBase); if (!DefMI) continue; @@ -958,7 +997,7 @@ struct FuncUnitSorter { unsigned F1 = 0, F2 = 0; unsigned MFUs1 = minFuncUnits(IS1, F1); unsigned MFUs2 = minFuncUnits(IS2, F2); - if (MFUs1 == 1 && MFUs2 == 1) + if (MFUs1 == MFUs2) return Resources.lookup(F1) < Resources.lookup(F2); return MFUs1 > MFUs2; } @@ -1514,8 +1553,8 @@ static void computeLiveOuts(MachineFunction &MF, RegPressureTracker &RPTracker, continue; for (const MachineOperand &MO : MI->operands()) if (MO.isReg() && MO.isUse()) { - unsigned Reg = MO.getReg(); - if (TargetRegisterInfo::isVirtualRegister(Reg)) + Register Reg = MO.getReg(); + if (Register::isVirtualRegister(Reg)) Uses.insert(Reg); else if (MRI.isAllocatable(Reg)) for (MCRegUnitIterator Units(Reg, TRI); Units.isValid(); ++Units) @@ -1525,8 +1564,8 @@ static void computeLiveOuts(MachineFunction &MF, RegPressureTracker &RPTracker, for (SUnit *SU : NS) for (const MachineOperand &MO : SU->getInstr()->operands()) if (MO.isReg() && MO.isDef() && !MO.isDead()) { - unsigned Reg = MO.getReg(); - if (TargetRegisterInfo::isVirtualRegister(Reg)) { + Register Reg = MO.getReg(); + if (Register::isVirtualRegister(Reg)) { if (!Uses.count(Reg)) LiveOutRegs.push_back(RegisterMaskPair(Reg, LaneBitmask::getNone())); @@ -2012,836 +2051,6 @@ bool SwingSchedulerDAG::schedulePipeline(SMSchedule &Schedule) { return scheduleFound && Schedule.getMaxStageCount() > 0; } -/// Given a schedule for the loop, generate a new version of the loop, -/// and replace the old version. This function generates a prolog -/// that contains the initial iterations in the pipeline, and kernel -/// loop, and the epilogue that contains the code for the final -/// iterations. -void SwingSchedulerDAG::generatePipelinedLoop(SMSchedule &Schedule) { - // Create a new basic block for the kernel and add it to the CFG. - MachineBasicBlock *KernelBB = MF.CreateMachineBasicBlock(BB->getBasicBlock()); - - unsigned MaxStageCount = Schedule.getMaxStageCount(); - - // Remember the registers that are used in different stages. The index is - // the iteration, or stage, that the instruction is scheduled in. This is - // a map between register names in the original block and the names created - // in each stage of the pipelined loop. - ValueMapTy *VRMap = new ValueMapTy[(MaxStageCount + 1) * 2]; - InstrMapTy InstrMap; - - SmallVector PrologBBs; - - MachineBasicBlock *PreheaderBB = MLI->getLoopFor(BB)->getLoopPreheader(); - assert(PreheaderBB != nullptr && - "Need to add code to handle loops w/o preheader"); - // Generate the prolog instructions that set up the pipeline. - generateProlog(Schedule, MaxStageCount, KernelBB, VRMap, PrologBBs); - MF.insert(BB->getIterator(), KernelBB); - - // Rearrange the instructions to generate the new, pipelined loop, - // and update register names as needed. - for (int Cycle = Schedule.getFirstCycle(), - LastCycle = Schedule.getFinalCycle(); - Cycle <= LastCycle; ++Cycle) { - std::deque &CycleInstrs = Schedule.getInstructions(Cycle); - // This inner loop schedules each instruction in the cycle. 
- for (SUnit *CI : CycleInstrs) { - if (CI->getInstr()->isPHI()) - continue; - unsigned StageNum = Schedule.stageScheduled(getSUnit(CI->getInstr())); - MachineInstr *NewMI = cloneInstr(CI->getInstr(), MaxStageCount, StageNum); - updateInstruction(NewMI, false, MaxStageCount, StageNum, Schedule, VRMap); - KernelBB->push_back(NewMI); - InstrMap[NewMI] = CI->getInstr(); - } - } - - // Copy any terminator instructions to the new kernel, and update - // names as needed. - for (MachineBasicBlock::iterator I = BB->getFirstTerminator(), - E = BB->instr_end(); - I != E; ++I) { - MachineInstr *NewMI = MF.CloneMachineInstr(&*I); - updateInstruction(NewMI, false, MaxStageCount, 0, Schedule, VRMap); - KernelBB->push_back(NewMI); - InstrMap[NewMI] = &*I; - } - - KernelBB->transferSuccessors(BB); - KernelBB->replaceSuccessor(BB, KernelBB); - - generateExistingPhis(KernelBB, PrologBBs.back(), KernelBB, KernelBB, Schedule, - VRMap, InstrMap, MaxStageCount, MaxStageCount, false); - generatePhis(KernelBB, PrologBBs.back(), KernelBB, KernelBB, Schedule, VRMap, - InstrMap, MaxStageCount, MaxStageCount, false); - - LLVM_DEBUG(dbgs() << "New block\n"; KernelBB->dump();); - - SmallVector EpilogBBs; - // Generate the epilog instructions to complete the pipeline. - generateEpilog(Schedule, MaxStageCount, KernelBB, VRMap, EpilogBBs, - PrologBBs); - - // We need this step because the register allocation doesn't handle some - // situations well, so we insert copies to help out. - splitLifetimes(KernelBB, EpilogBBs, Schedule); - - // Remove dead instructions due to loop induction variables. - removeDeadInstructions(KernelBB, EpilogBBs); - - // Add branches between prolog and epilog blocks. - addBranches(*PreheaderBB, PrologBBs, KernelBB, EpilogBBs, Schedule, VRMap); - - // Remove the original loop since it's no longer referenced. - for (auto &I : *BB) - LIS.RemoveMachineInstrFromMaps(I); - BB->clear(); - BB->eraseFromParent(); - - delete[] VRMap; -} - -/// Generate the pipeline prolog code. -void SwingSchedulerDAG::generateProlog(SMSchedule &Schedule, unsigned LastStage, - MachineBasicBlock *KernelBB, - ValueMapTy *VRMap, - MBBVectorTy &PrologBBs) { - MachineBasicBlock *PreheaderBB = MLI->getLoopFor(BB)->getLoopPreheader(); - assert(PreheaderBB != nullptr && - "Need to add code to handle loops w/o preheader"); - MachineBasicBlock *PredBB = PreheaderBB; - InstrMapTy InstrMap; - - // Generate a basic block for each stage, not including the last stage, - // which will be generated in the kernel. Each basic block may contain - // instructions from multiple stages/iterations. - for (unsigned i = 0; i < LastStage; ++i) { - // Create and insert the prolog basic block prior to the original loop - // basic block. The original loop is removed later. - MachineBasicBlock *NewBB = MF.CreateMachineBasicBlock(BB->getBasicBlock()); - PrologBBs.push_back(NewBB); - MF.insert(BB->getIterator(), NewBB); - NewBB->transferSuccessors(PredBB); - PredBB->addSuccessor(NewBB); - PredBB = NewBB; - - // Generate instructions for each appropriate stage. Process instructions - // in original program order. 
- for (int StageNum = i; StageNum >= 0; --StageNum) { - for (MachineBasicBlock::iterator BBI = BB->instr_begin(), - BBE = BB->getFirstTerminator(); - BBI != BBE; ++BBI) { - if (Schedule.isScheduledAtStage(getSUnit(&*BBI), (unsigned)StageNum)) { - if (BBI->isPHI()) - continue; - MachineInstr *NewMI = - cloneAndChangeInstr(&*BBI, i, (unsigned)StageNum, Schedule); - updateInstruction(NewMI, false, i, (unsigned)StageNum, Schedule, - VRMap); - NewBB->push_back(NewMI); - InstrMap[NewMI] = &*BBI; - } - } - } - rewritePhiValues(NewBB, i, Schedule, VRMap, InstrMap); - LLVM_DEBUG({ - dbgs() << "prolog:\n"; - NewBB->dump(); - }); - } - - PredBB->replaceSuccessor(BB, KernelBB); - - // Check if we need to remove the branch from the preheader to the original - // loop, and replace it with a branch to the new loop. - unsigned numBranches = TII->removeBranch(*PreheaderBB); - if (numBranches) { - SmallVector Cond; - TII->insertBranch(*PreheaderBB, PrologBBs[0], nullptr, Cond, DebugLoc()); - } -} - -/// Generate the pipeline epilog code. The epilog code finishes the iterations -/// that were started in either the prolog or the kernel. We create a basic -/// block for each stage that needs to complete. -void SwingSchedulerDAG::generateEpilog(SMSchedule &Schedule, unsigned LastStage, - MachineBasicBlock *KernelBB, - ValueMapTy *VRMap, - MBBVectorTy &EpilogBBs, - MBBVectorTy &PrologBBs) { - // We need to change the branch from the kernel to the first epilog block, so - // this call to analyze branch uses the kernel rather than the original BB. - MachineBasicBlock *TBB = nullptr, *FBB = nullptr; - SmallVector Cond; - bool checkBranch = TII->analyzeBranch(*KernelBB, TBB, FBB, Cond); - assert(!checkBranch && "generateEpilog must be able to analyze the branch"); - if (checkBranch) - return; - - MachineBasicBlock::succ_iterator LoopExitI = KernelBB->succ_begin(); - if (*LoopExitI == KernelBB) - ++LoopExitI; - assert(LoopExitI != KernelBB->succ_end() && "Expecting a successor"); - MachineBasicBlock *LoopExitBB = *LoopExitI; - - MachineBasicBlock *PredBB = KernelBB; - MachineBasicBlock *EpilogStart = LoopExitBB; - InstrMapTy InstrMap; - - // Generate a basic block for each stage, not including the last stage, - // which was generated for the kernel. Each basic block may contain - // instructions from multiple stages/iterations. - int EpilogStage = LastStage + 1; - for (unsigned i = LastStage; i >= 1; --i, ++EpilogStage) { - MachineBasicBlock *NewBB = MF.CreateMachineBasicBlock(); - EpilogBBs.push_back(NewBB); - MF.insert(BB->getIterator(), NewBB); - - PredBB->replaceSuccessor(LoopExitBB, NewBB); - NewBB->addSuccessor(LoopExitBB); - - if (EpilogStart == LoopExitBB) - EpilogStart = NewBB; - - // Add instructions to the epilog depending on the current block. - // Process instructions in original program order. - for (unsigned StageNum = i; StageNum <= LastStage; ++StageNum) { - for (auto &BBI : *BB) { - if (BBI.isPHI()) - continue; - MachineInstr *In = &BBI; - if (Schedule.isScheduledAtStage(getSUnit(In), StageNum)) { - // Instructions with memoperands in the epilog are updated with - // conservative values. 
- MachineInstr *NewMI = cloneInstr(In, UINT_MAX, 0); - updateInstruction(NewMI, i == 1, EpilogStage, 0, Schedule, VRMap); - NewBB->push_back(NewMI); - InstrMap[NewMI] = In; - } - } - } - generateExistingPhis(NewBB, PrologBBs[i - 1], PredBB, KernelBB, Schedule, - VRMap, InstrMap, LastStage, EpilogStage, i == 1); - generatePhis(NewBB, PrologBBs[i - 1], PredBB, KernelBB, Schedule, VRMap, - InstrMap, LastStage, EpilogStage, i == 1); - PredBB = NewBB; - - LLVM_DEBUG({ - dbgs() << "epilog:\n"; - NewBB->dump(); - }); - } - - // Fix any Phi nodes in the loop exit block. - for (MachineInstr &MI : *LoopExitBB) { - if (!MI.isPHI()) - break; - for (unsigned i = 2, e = MI.getNumOperands() + 1; i != e; i += 2) { - MachineOperand &MO = MI.getOperand(i); - if (MO.getMBB() == BB) - MO.setMBB(PredBB); - } - } - - // Create a branch to the new epilog from the kernel. - // Remove the original branch and add a new branch to the epilog. - TII->removeBranch(*KernelBB); - TII->insertBranch(*KernelBB, KernelBB, EpilogStart, Cond, DebugLoc()); - // Add a branch to the loop exit. - if (EpilogBBs.size() > 0) { - MachineBasicBlock *LastEpilogBB = EpilogBBs.back(); - SmallVector Cond1; - TII->insertBranch(*LastEpilogBB, LoopExitBB, nullptr, Cond1, DebugLoc()); - } -} - -/// Replace all uses of FromReg that appear outside the specified -/// basic block with ToReg. -static void replaceRegUsesAfterLoop(unsigned FromReg, unsigned ToReg, - MachineBasicBlock *MBB, - MachineRegisterInfo &MRI, - LiveIntervals &LIS) { - for (MachineRegisterInfo::use_iterator I = MRI.use_begin(FromReg), - E = MRI.use_end(); - I != E;) { - MachineOperand &O = *I; - ++I; - if (O.getParent()->getParent() != MBB) - O.setReg(ToReg); - } - if (!LIS.hasInterval(ToReg)) - LIS.createEmptyInterval(ToReg); -} - -/// Return true if the register has a use that occurs outside the -/// specified loop. -static bool hasUseAfterLoop(unsigned Reg, MachineBasicBlock *BB, - MachineRegisterInfo &MRI) { - for (MachineRegisterInfo::use_iterator I = MRI.use_begin(Reg), - E = MRI.use_end(); - I != E; ++I) - if (I->getParent()->getParent() != BB) - return true; - return false; -} - -/// Generate Phis for the specific block in the generated pipelined code. -/// This function looks at the Phis from the original code to guide the -/// creation of new Phis. -void SwingSchedulerDAG::generateExistingPhis( - MachineBasicBlock *NewBB, MachineBasicBlock *BB1, MachineBasicBlock *BB2, - MachineBasicBlock *KernelBB, SMSchedule &Schedule, ValueMapTy *VRMap, - InstrMapTy &InstrMap, unsigned LastStageNum, unsigned CurStageNum, - bool IsLast) { - // Compute the stage number for the initial value of the Phi, which - // comes from the prolog. The prolog to use depends on to which kernel/ - // epilog that we're adding the Phi. - unsigned PrologStage = 0; - unsigned PrevStage = 0; - bool InKernel = (LastStageNum == CurStageNum); - if (InKernel) { - PrologStage = LastStageNum - 1; - PrevStage = CurStageNum; - } else { - PrologStage = LastStageNum - (CurStageNum - LastStageNum); - PrevStage = LastStageNum + (CurStageNum - LastStageNum) - 1; - } - - for (MachineBasicBlock::iterator BBI = BB->instr_begin(), - BBE = BB->getFirstNonPHI(); - BBI != BBE; ++BBI) { - unsigned Def = BBI->getOperand(0).getReg(); - - unsigned InitVal = 0; - unsigned LoopVal = 0; - getPhiRegs(*BBI, BB, InitVal, LoopVal); - - unsigned PhiOp1 = 0; - // The Phi value from the loop body typically is defined in the loop, but - // not always. So, we need to check if the value is defined in the loop. 
- unsigned PhiOp2 = LoopVal; - if (VRMap[LastStageNum].count(LoopVal)) - PhiOp2 = VRMap[LastStageNum][LoopVal]; - - int StageScheduled = Schedule.stageScheduled(getSUnit(&*BBI)); - int LoopValStage = - Schedule.stageScheduled(getSUnit(MRI.getVRegDef(LoopVal))); - unsigned NumStages = Schedule.getStagesForReg(Def, CurStageNum); - if (NumStages == 0) { - // We don't need to generate a Phi anymore, but we need to rename any uses - // of the Phi value. - unsigned NewReg = VRMap[PrevStage][LoopVal]; - rewriteScheduledInstr(NewBB, Schedule, InstrMap, CurStageNum, 0, &*BBI, - Def, InitVal, NewReg); - if (VRMap[CurStageNum].count(LoopVal)) - VRMap[CurStageNum][Def] = VRMap[CurStageNum][LoopVal]; - } - // Adjust the number of Phis needed depending on the number of prologs left, - // and the distance from where the Phi is first scheduled. The number of - // Phis cannot exceed the number of prolog stages. Each stage can - // potentially define two values. - unsigned MaxPhis = PrologStage + 2; - if (!InKernel && (int)PrologStage <= LoopValStage) - MaxPhis = std::max((int)MaxPhis - (int)LoopValStage, 1); - unsigned NumPhis = std::min(NumStages, MaxPhis); - - unsigned NewReg = 0; - unsigned AccessStage = (LoopValStage != -1) ? LoopValStage : StageScheduled; - // In the epilog, we may need to look back one stage to get the correct - // Phi name because the epilog and prolog blocks execute the same stage. - // The correct name is from the previous block only when the Phi has - // been completely scheduled prior to the epilog, and Phi value is not - // needed in multiple stages. - int StageDiff = 0; - if (!InKernel && StageScheduled >= LoopValStage && AccessStage == 0 && - NumPhis == 1) - StageDiff = 1; - // Adjust the computations below when the phi and the loop definition - // are scheduled in different stages. - if (InKernel && LoopValStage != -1 && StageScheduled > LoopValStage) - StageDiff = StageScheduled - LoopValStage; - for (unsigned np = 0; np < NumPhis; ++np) { - // If the Phi hasn't been scheduled, then use the initial Phi operand - // value. Otherwise, use the scheduled version of the instruction. This - // is a little complicated when a Phi references another Phi. - if (np > PrologStage || StageScheduled >= (int)LastStageNum) - PhiOp1 = InitVal; - // Check if the Phi has already been scheduled in a prolog stage. - else if (PrologStage >= AccessStage + StageDiff + np && - VRMap[PrologStage - StageDiff - np].count(LoopVal) != 0) - PhiOp1 = VRMap[PrologStage - StageDiff - np][LoopVal]; - // Check if the Phi has already been scheduled, but the loop instruction - // is either another Phi, or doesn't occur in the loop. - else if (PrologStage >= AccessStage + StageDiff + np) { - // If the Phi references another Phi, we need to examine the other - // Phi to get the correct value. - PhiOp1 = LoopVal; - MachineInstr *InstOp1 = MRI.getVRegDef(PhiOp1); - int Indirects = 1; - while (InstOp1 && InstOp1->isPHI() && InstOp1->getParent() == BB) { - int PhiStage = Schedule.stageScheduled(getSUnit(InstOp1)); - if ((int)(PrologStage - StageDiff - np) < PhiStage + Indirects) - PhiOp1 = getInitPhiReg(*InstOp1, BB); - else - PhiOp1 = getLoopPhiReg(*InstOp1, BB); - InstOp1 = MRI.getVRegDef(PhiOp1); - int PhiOpStage = Schedule.stageScheduled(getSUnit(InstOp1)); - int StageAdj = (PhiOpStage != -1 ? 
PhiStage - PhiOpStage : 0); - if (PhiOpStage != -1 && PrologStage - StageAdj >= Indirects + np && - VRMap[PrologStage - StageAdj - Indirects - np].count(PhiOp1)) { - PhiOp1 = VRMap[PrologStage - StageAdj - Indirects - np][PhiOp1]; - break; - } - ++Indirects; - } - } else - PhiOp1 = InitVal; - // If this references a generated Phi in the kernel, get the Phi operand - // from the incoming block. - if (MachineInstr *InstOp1 = MRI.getVRegDef(PhiOp1)) - if (InstOp1->isPHI() && InstOp1->getParent() == KernelBB) - PhiOp1 = getInitPhiReg(*InstOp1, KernelBB); - - MachineInstr *PhiInst = MRI.getVRegDef(LoopVal); - bool LoopDefIsPhi = PhiInst && PhiInst->isPHI(); - // In the epilog, a map lookup is needed to get the value from the kernel, - // or previous epilog block. How is does this depends on if the - // instruction is scheduled in the previous block. - if (!InKernel) { - int StageDiffAdj = 0; - if (LoopValStage != -1 && StageScheduled > LoopValStage) - StageDiffAdj = StageScheduled - LoopValStage; - // Use the loop value defined in the kernel, unless the kernel - // contains the last definition of the Phi. - if (np == 0 && PrevStage == LastStageNum && - (StageScheduled != 0 || LoopValStage != 0) && - VRMap[PrevStage - StageDiffAdj].count(LoopVal)) - PhiOp2 = VRMap[PrevStage - StageDiffAdj][LoopVal]; - // Use the value defined by the Phi. We add one because we switch - // from looking at the loop value to the Phi definition. - else if (np > 0 && PrevStage == LastStageNum && - VRMap[PrevStage - np + 1].count(Def)) - PhiOp2 = VRMap[PrevStage - np + 1][Def]; - // Use the loop value defined in the kernel. - else if (static_cast(LoopValStage) > PrologStage + 1 && - VRMap[PrevStage - StageDiffAdj - np].count(LoopVal)) - PhiOp2 = VRMap[PrevStage - StageDiffAdj - np][LoopVal]; - // Use the value defined by the Phi, unless we're generating the first - // epilog and the Phi refers to a Phi in a different stage. - else if (VRMap[PrevStage - np].count(Def) && - (!LoopDefIsPhi || (PrevStage != LastStageNum) || (LoopValStage == StageScheduled))) - PhiOp2 = VRMap[PrevStage - np][Def]; - } - - // Check if we can reuse an existing Phi. This occurs when a Phi - // references another Phi, and the other Phi is scheduled in an - // earlier stage. We can try to reuse an existing Phi up until the last - // stage of the current Phi. - if (LoopDefIsPhi) { - if (static_cast(PrologStage - np) >= StageScheduled) { - int LVNumStages = Schedule.getStagesForPhi(LoopVal); - int StageDiff = (StageScheduled - LoopValStage); - LVNumStages -= StageDiff; - // Make sure the loop value Phi has been processed already. - if (LVNumStages > (int)np && VRMap[CurStageNum].count(LoopVal)) { - NewReg = PhiOp2; - unsigned ReuseStage = CurStageNum; - if (Schedule.isLoopCarried(this, *PhiInst)) - ReuseStage -= LVNumStages; - // Check if the Phi to reuse has been generated yet. If not, then - // there is nothing to reuse. - if (VRMap[ReuseStage - np].count(LoopVal)) { - NewReg = VRMap[ReuseStage - np][LoopVal]; - - rewriteScheduledInstr(NewBB, Schedule, InstrMap, CurStageNum, np, - &*BBI, Def, NewReg); - // Update the map with the new Phi name. 
- VRMap[CurStageNum - np][Def] = NewReg; - PhiOp2 = NewReg; - if (VRMap[LastStageNum - np - 1].count(LoopVal)) - PhiOp2 = VRMap[LastStageNum - np - 1][LoopVal]; - - if (IsLast && np == NumPhis - 1) - replaceRegUsesAfterLoop(Def, NewReg, BB, MRI, LIS); - continue; - } - } - } - if (InKernel && StageDiff > 0 && - VRMap[CurStageNum - StageDiff - np].count(LoopVal)) - PhiOp2 = VRMap[CurStageNum - StageDiff - np][LoopVal]; - } - - const TargetRegisterClass *RC = MRI.getRegClass(Def); - NewReg = MRI.createVirtualRegister(RC); - - MachineInstrBuilder NewPhi = - BuildMI(*NewBB, NewBB->getFirstNonPHI(), DebugLoc(), - TII->get(TargetOpcode::PHI), NewReg); - NewPhi.addReg(PhiOp1).addMBB(BB1); - NewPhi.addReg(PhiOp2).addMBB(BB2); - if (np == 0) - InstrMap[NewPhi] = &*BBI; - - // We define the Phis after creating the new pipelined code, so - // we need to rename the Phi values in scheduled instructions. - - unsigned PrevReg = 0; - if (InKernel && VRMap[PrevStage - np].count(LoopVal)) - PrevReg = VRMap[PrevStage - np][LoopVal]; - rewriteScheduledInstr(NewBB, Schedule, InstrMap, CurStageNum, np, &*BBI, - Def, NewReg, PrevReg); - // If the Phi has been scheduled, use the new name for rewriting. - if (VRMap[CurStageNum - np].count(Def)) { - unsigned R = VRMap[CurStageNum - np][Def]; - rewriteScheduledInstr(NewBB, Schedule, InstrMap, CurStageNum, np, &*BBI, - R, NewReg); - } - - // Check if we need to rename any uses that occurs after the loop. The - // register to replace depends on whether the Phi is scheduled in the - // epilog. - if (IsLast && np == NumPhis - 1) - replaceRegUsesAfterLoop(Def, NewReg, BB, MRI, LIS); - - // In the kernel, a dependent Phi uses the value from this Phi. - if (InKernel) - PhiOp2 = NewReg; - - // Update the map with the new Phi name. - VRMap[CurStageNum - np][Def] = NewReg; - } - - while (NumPhis++ < NumStages) { - rewriteScheduledInstr(NewBB, Schedule, InstrMap, CurStageNum, NumPhis, - &*BBI, Def, NewReg, 0); - } - - // Check if we need to rename a Phi that has been eliminated due to - // scheduling. - if (NumStages == 0 && IsLast && VRMap[CurStageNum].count(LoopVal)) - replaceRegUsesAfterLoop(Def, VRMap[CurStageNum][LoopVal], BB, MRI, LIS); - } -} - -/// Generate Phis for the specified block in the generated pipelined code. -/// These are new Phis needed because the definition is scheduled after the -/// use in the pipelined sequence. -void SwingSchedulerDAG::generatePhis( - MachineBasicBlock *NewBB, MachineBasicBlock *BB1, MachineBasicBlock *BB2, - MachineBasicBlock *KernelBB, SMSchedule &Schedule, ValueMapTy *VRMap, - InstrMapTy &InstrMap, unsigned LastStageNum, unsigned CurStageNum, - bool IsLast) { - // Compute the stage number that contains the initial Phi value, and - // the Phi from the previous stage. 
- unsigned PrologStage = 0; - unsigned PrevStage = 0; - unsigned StageDiff = CurStageNum - LastStageNum; - bool InKernel = (StageDiff == 0); - if (InKernel) { - PrologStage = LastStageNum - 1; - PrevStage = CurStageNum; - } else { - PrologStage = LastStageNum - StageDiff; - PrevStage = LastStageNum + StageDiff - 1; - } - - for (MachineBasicBlock::iterator BBI = BB->getFirstNonPHI(), - BBE = BB->instr_end(); - BBI != BBE; ++BBI) { - for (unsigned i = 0, e = BBI->getNumOperands(); i != e; ++i) { - MachineOperand &MO = BBI->getOperand(i); - if (!MO.isReg() || !MO.isDef() || - !TargetRegisterInfo::isVirtualRegister(MO.getReg())) - continue; - - int StageScheduled = Schedule.stageScheduled(getSUnit(&*BBI)); - assert(StageScheduled != -1 && "Expecting scheduled instruction."); - unsigned Def = MO.getReg(); - unsigned NumPhis = Schedule.getStagesForReg(Def, CurStageNum); - // An instruction scheduled in stage 0 and is used after the loop - // requires a phi in the epilog for the last definition from either - // the kernel or prolog. - if (!InKernel && NumPhis == 0 && StageScheduled == 0 && - hasUseAfterLoop(Def, BB, MRI)) - NumPhis = 1; - if (!InKernel && (unsigned)StageScheduled > PrologStage) - continue; - - unsigned PhiOp2 = VRMap[PrevStage][Def]; - if (MachineInstr *InstOp2 = MRI.getVRegDef(PhiOp2)) - if (InstOp2->isPHI() && InstOp2->getParent() == NewBB) - PhiOp2 = getLoopPhiReg(*InstOp2, BB2); - // The number of Phis can't exceed the number of prolog stages. The - // prolog stage number is zero based. - if (NumPhis > PrologStage + 1 - StageScheduled) - NumPhis = PrologStage + 1 - StageScheduled; - for (unsigned np = 0; np < NumPhis; ++np) { - unsigned PhiOp1 = VRMap[PrologStage][Def]; - if (np <= PrologStage) - PhiOp1 = VRMap[PrologStage - np][Def]; - if (MachineInstr *InstOp1 = MRI.getVRegDef(PhiOp1)) { - if (InstOp1->isPHI() && InstOp1->getParent() == KernelBB) - PhiOp1 = getInitPhiReg(*InstOp1, KernelBB); - if (InstOp1->isPHI() && InstOp1->getParent() == NewBB) - PhiOp1 = getInitPhiReg(*InstOp1, NewBB); - } - if (!InKernel) - PhiOp2 = VRMap[PrevStage - np][Def]; - - const TargetRegisterClass *RC = MRI.getRegClass(Def); - unsigned NewReg = MRI.createVirtualRegister(RC); - - MachineInstrBuilder NewPhi = - BuildMI(*NewBB, NewBB->getFirstNonPHI(), DebugLoc(), - TII->get(TargetOpcode::PHI), NewReg); - NewPhi.addReg(PhiOp1).addMBB(BB1); - NewPhi.addReg(PhiOp2).addMBB(BB2); - if (np == 0) - InstrMap[NewPhi] = &*BBI; - - // Rewrite uses and update the map. The actions depend upon whether - // we generating code for the kernel or epilog blocks. - if (InKernel) { - rewriteScheduledInstr(NewBB, Schedule, InstrMap, CurStageNum, np, - &*BBI, PhiOp1, NewReg); - rewriteScheduledInstr(NewBB, Schedule, InstrMap, CurStageNum, np, - &*BBI, PhiOp2, NewReg); - - PhiOp2 = NewReg; - VRMap[PrevStage - np - 1][Def] = NewReg; - } else { - VRMap[CurStageNum - np][Def] = NewReg; - if (np == NumPhis - 1) - rewriteScheduledInstr(NewBB, Schedule, InstrMap, CurStageNum, np, - &*BBI, Def, NewReg); - } - if (IsLast && np == NumPhis - 1) - replaceRegUsesAfterLoop(Def, NewReg, BB, MRI, LIS); - } - } - } -} - -/// Remove instructions that generate values with no uses. -/// Typically, these are induction variable operations that generate values -/// used in the loop itself. A dead instruction has a definition with -/// no uses, or uses that occur in the original loop only. 
-void SwingSchedulerDAG::removeDeadInstructions(MachineBasicBlock *KernelBB, - MBBVectorTy &EpilogBBs) { - // For each epilog block, check that the value defined by each instruction - // is used. If not, delete it. - for (MBBVectorTy::reverse_iterator MBB = EpilogBBs.rbegin(), - MBE = EpilogBBs.rend(); - MBB != MBE; ++MBB) - for (MachineBasicBlock::reverse_instr_iterator MI = (*MBB)->instr_rbegin(), - ME = (*MBB)->instr_rend(); - MI != ME;) { - // From DeadMachineInstructionElem. Don't delete inline assembly. - if (MI->isInlineAsm()) { - ++MI; - continue; - } - bool SawStore = false; - // Check if it's safe to remove the instruction due to side effects. - // We can, and want to, remove Phis here. - if (!MI->isSafeToMove(nullptr, SawStore) && !MI->isPHI()) { - ++MI; - continue; - } - bool used = true; - for (MachineInstr::mop_iterator MOI = MI->operands_begin(), - MOE = MI->operands_end(); - MOI != MOE; ++MOI) { - if (!MOI->isReg() || !MOI->isDef()) - continue; - unsigned reg = MOI->getReg(); - // Assume physical registers are used, unless they are marked dead. - if (TargetRegisterInfo::isPhysicalRegister(reg)) { - used = !MOI->isDead(); - if (used) - break; - continue; - } - unsigned realUses = 0; - for (MachineRegisterInfo::use_iterator UI = MRI.use_begin(reg), - EI = MRI.use_end(); - UI != EI; ++UI) { - // Check if there are any uses that occur only in the original - // loop. If so, that's not a real use. - if (UI->getParent()->getParent() != BB) { - realUses++; - used = true; - break; - } - } - if (realUses > 0) - break; - used = false; - } - if (!used) { - LIS.RemoveMachineInstrFromMaps(*MI); - MI++->eraseFromParent(); - continue; - } - ++MI; - } - // In the kernel block, check if we can remove a Phi that generates a value - // used in an instruction removed in the epilog block. - for (MachineBasicBlock::iterator BBI = KernelBB->instr_begin(), - BBE = KernelBB->getFirstNonPHI(); - BBI != BBE;) { - MachineInstr *MI = &*BBI; - ++BBI; - unsigned reg = MI->getOperand(0).getReg(); - if (MRI.use_begin(reg) == MRI.use_end()) { - LIS.RemoveMachineInstrFromMaps(*MI); - MI->eraseFromParent(); - } - } -} - -/// For loop carried definitions, we split the lifetime of a virtual register -/// that has uses past the definition in the next iteration. A copy with a new -/// virtual register is inserted before the definition, which helps with -/// generating a better register assignment. -/// -/// v1 = phi(a, v2) v1 = phi(a, v2) -/// v2 = phi(b, v3) v2 = phi(b, v3) -/// v3 = .. v4 = copy v1 -/// .. = V1 v3 = .. -/// .. = v4 -void SwingSchedulerDAG::splitLifetimes(MachineBasicBlock *KernelBB, - MBBVectorTy &EpilogBBs, - SMSchedule &Schedule) { - const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); - for (auto &PHI : KernelBB->phis()) { - unsigned Def = PHI.getOperand(0).getReg(); - // Check for any Phi definition that used as an operand of another Phi - // in the same block. - for (MachineRegisterInfo::use_instr_iterator I = MRI.use_instr_begin(Def), - E = MRI.use_instr_end(); - I != E; ++I) { - if (I->isPHI() && I->getParent() == KernelBB) { - // Get the loop carried definition. - unsigned LCDef = getLoopPhiReg(PHI, KernelBB); - if (!LCDef) - continue; - MachineInstr *MI = MRI.getVRegDef(LCDef); - if (!MI || MI->getParent() != KernelBB || MI->isPHI()) - continue; - // Search through the rest of the block looking for uses of the Phi - // definition. If one occurs, then split the lifetime. 
- unsigned SplitReg = 0; - for (auto &BBJ : make_range(MachineBasicBlock::instr_iterator(MI), - KernelBB->instr_end())) - if (BBJ.readsRegister(Def)) { - // We split the lifetime when we find the first use. - if (SplitReg == 0) { - SplitReg = MRI.createVirtualRegister(MRI.getRegClass(Def)); - BuildMI(*KernelBB, MI, MI->getDebugLoc(), - TII->get(TargetOpcode::COPY), SplitReg) - .addReg(Def); - } - BBJ.substituteRegister(Def, SplitReg, 0, *TRI); - } - if (!SplitReg) - continue; - // Search through each of the epilog blocks for any uses to be renamed. - for (auto &Epilog : EpilogBBs) - for (auto &I : *Epilog) - if (I.readsRegister(Def)) - I.substituteRegister(Def, SplitReg, 0, *TRI); - break; - } - } - } -} - -/// Remove the incoming block from the Phis in a basic block. -static void removePhis(MachineBasicBlock *BB, MachineBasicBlock *Incoming) { - for (MachineInstr &MI : *BB) { - if (!MI.isPHI()) - break; - for (unsigned i = 1, e = MI.getNumOperands(); i != e; i += 2) - if (MI.getOperand(i + 1).getMBB() == Incoming) { - MI.RemoveOperand(i + 1); - MI.RemoveOperand(i); - break; - } - } -} - -/// Create branches from each prolog basic block to the appropriate epilog -/// block. These edges are needed if the loop ends before reaching the -/// kernel. -void SwingSchedulerDAG::addBranches(MachineBasicBlock &PreheaderBB, - MBBVectorTy &PrologBBs, - MachineBasicBlock *KernelBB, - MBBVectorTy &EpilogBBs, - SMSchedule &Schedule, ValueMapTy *VRMap) { - assert(PrologBBs.size() == EpilogBBs.size() && "Prolog/Epilog mismatch"); - MachineInstr *IndVar = Pass.LI.LoopInductionVar; - MachineInstr *Cmp = Pass.LI.LoopCompare; - MachineBasicBlock *LastPro = KernelBB; - MachineBasicBlock *LastEpi = KernelBB; - - // Start from the blocks connected to the kernel and work "out" - // to the first prolog and the last epilog blocks. - SmallVector PrevInsts; - unsigned MaxIter = PrologBBs.size() - 1; - unsigned LC = UINT_MAX; - unsigned LCMin = UINT_MAX; - for (unsigned i = 0, j = MaxIter; i <= MaxIter; ++i, --j) { - // Add branches to the prolog that go to the corresponding - // epilog, and the fall-thru prolog/kernel block. - MachineBasicBlock *Prolog = PrologBBs[j]; - MachineBasicBlock *Epilog = EpilogBBs[i]; - // We've executed one iteration, so decrement the loop count and check for - // the loop end. - SmallVector Cond; - // Check if the LOOP0 has already been removed. If so, then there is no need - // to reduce the trip count. - if (LC != 0) - LC = TII->reduceLoopCount(*Prolog, PreheaderBB, IndVar, *Cmp, Cond, - PrevInsts, j, MaxIter); - - // Record the value of the first trip count, which is used to determine if - // branches and blocks can be removed for constant trip counts. - if (LCMin == UINT_MAX) - LCMin = LC; - - unsigned numAdded = 0; - if (TargetRegisterInfo::isVirtualRegister(LC)) { - Prolog->addSuccessor(Epilog); - numAdded = TII->insertBranch(*Prolog, Epilog, LastPro, Cond, DebugLoc()); - } else if (j >= LCMin) { - Prolog->addSuccessor(Epilog); - Prolog->removeSuccessor(LastPro); - LastEpi->removeSuccessor(Epilog); - numAdded = TII->insertBranch(*Prolog, Epilog, nullptr, Cond, DebugLoc()); - removePhis(Epilog, LastEpi); - // Remove the blocks that are no longer referenced. 
- if (LastPro != LastEpi) { - LastEpi->clear(); - LastEpi->eraseFromParent(); - } - LastPro->clear(); - LastPro->eraseFromParent(); - } else { - numAdded = TII->insertBranch(*Prolog, LastPro, nullptr, Cond, DebugLoc()); - removePhis(Epilog, Prolog); - } - LastPro = Prolog; - LastEpi = Epilog; - for (MachineBasicBlock::reverse_instr_iterator I = Prolog->instr_rbegin(), - E = Prolog->instr_rend(); - I != E && numAdded > 0; ++I, --numAdded) - updateInstruction(&*I, false, j, 0, Schedule, VRMap); - } -} - /// Return true if we can compute the amount the instruction changes /// during each iteration. Set Delta to the amount of the change. bool SwingSchedulerDAG::computeDelta(MachineInstr &MI, unsigned &Delta) { @@ -2854,7 +2063,7 @@ bool SwingSchedulerDAG::computeDelta(MachineInstr &MI, unsigned &Delta) { if (!BaseOp->isReg()) return false; - unsigned BaseReg = BaseOp->getReg(); + Register BaseReg = BaseOp->getReg(); MachineRegisterInfo &MRI = MF.getRegInfo(); // Check if there is a Phi. If so, get the definition in the loop. @@ -2874,261 +2083,6 @@ bool SwingSchedulerDAG::computeDelta(MachineInstr &MI, unsigned &Delta) { return true; } -/// Update the memory operand with a new offset when the pipeliner -/// generates a new copy of the instruction that refers to a -/// different memory location. -void SwingSchedulerDAG::updateMemOperands(MachineInstr &NewMI, - MachineInstr &OldMI, unsigned Num) { - if (Num == 0) - return; - // If the instruction has memory operands, then adjust the offset - // when the instruction appears in different stages. - if (NewMI.memoperands_empty()) - return; - SmallVector NewMMOs; - for (MachineMemOperand *MMO : NewMI.memoperands()) { - // TODO: Figure out whether isAtomic is really necessary (see D57601). - if (MMO->isVolatile() || MMO->isAtomic() || - (MMO->isInvariant() && MMO->isDereferenceable()) || - (!MMO->getValue())) { - NewMMOs.push_back(MMO); - continue; - } - unsigned Delta; - if (Num != UINT_MAX && computeDelta(OldMI, Delta)) { - int64_t AdjOffset = Delta * Num; - NewMMOs.push_back( - MF.getMachineMemOperand(MMO, AdjOffset, MMO->getSize())); - } else { - NewMMOs.push_back( - MF.getMachineMemOperand(MMO, 0, MemoryLocation::UnknownSize)); - } - } - NewMI.setMemRefs(MF, NewMMOs); -} - -/// Clone the instruction for the new pipelined loop and update the -/// memory operands, if needed. -MachineInstr *SwingSchedulerDAG::cloneInstr(MachineInstr *OldMI, - unsigned CurStageNum, - unsigned InstStageNum) { - MachineInstr *NewMI = MF.CloneMachineInstr(OldMI); - // Check for tied operands in inline asm instructions. This should be handled - // elsewhere, but I'm not sure of the best solution. - if (OldMI->isInlineAsm()) - for (unsigned i = 0, e = OldMI->getNumOperands(); i != e; ++i) { - const auto &MO = OldMI->getOperand(i); - if (MO.isReg() && MO.isUse()) - break; - unsigned UseIdx; - if (OldMI->isRegTiedToUseOperand(i, &UseIdx)) - NewMI->tieOperands(i, UseIdx); - } - updateMemOperands(*NewMI, *OldMI, CurStageNum - InstStageNum); - return NewMI; -} - -/// Clone the instruction for the new pipelined loop. If needed, this -/// function updates the instruction using the values saved in the -/// InstrChanges structure. 
-MachineInstr *SwingSchedulerDAG::cloneAndChangeInstr(MachineInstr *OldMI, - unsigned CurStageNum, - unsigned InstStageNum, - SMSchedule &Schedule) { - MachineInstr *NewMI = MF.CloneMachineInstr(OldMI); - DenseMap>::iterator It = - InstrChanges.find(getSUnit(OldMI)); - if (It != InstrChanges.end()) { - std::pair RegAndOffset = It->second; - unsigned BasePos, OffsetPos; - if (!TII->getBaseAndOffsetPosition(*OldMI, BasePos, OffsetPos)) - return nullptr; - int64_t NewOffset = OldMI->getOperand(OffsetPos).getImm(); - MachineInstr *LoopDef = findDefInLoop(RegAndOffset.first); - if (Schedule.stageScheduled(getSUnit(LoopDef)) > (signed)InstStageNum) - NewOffset += RegAndOffset.second * (CurStageNum - InstStageNum); - NewMI->getOperand(OffsetPos).setImm(NewOffset); - } - updateMemOperands(*NewMI, *OldMI, CurStageNum - InstStageNum); - return NewMI; -} - -/// Update the machine instruction with new virtual registers. This -/// function may change the defintions and/or uses. -void SwingSchedulerDAG::updateInstruction(MachineInstr *NewMI, bool LastDef, - unsigned CurStageNum, - unsigned InstrStageNum, - SMSchedule &Schedule, - ValueMapTy *VRMap) { - for (unsigned i = 0, e = NewMI->getNumOperands(); i != e; ++i) { - MachineOperand &MO = NewMI->getOperand(i); - if (!MO.isReg() || !TargetRegisterInfo::isVirtualRegister(MO.getReg())) - continue; - unsigned reg = MO.getReg(); - if (MO.isDef()) { - // Create a new virtual register for the definition. - const TargetRegisterClass *RC = MRI.getRegClass(reg); - unsigned NewReg = MRI.createVirtualRegister(RC); - MO.setReg(NewReg); - VRMap[CurStageNum][reg] = NewReg; - if (LastDef) - replaceRegUsesAfterLoop(reg, NewReg, BB, MRI, LIS); - } else if (MO.isUse()) { - MachineInstr *Def = MRI.getVRegDef(reg); - // Compute the stage that contains the last definition for instruction. - int DefStageNum = Schedule.stageScheduled(getSUnit(Def)); - unsigned StageNum = CurStageNum; - if (DefStageNum != -1 && (int)InstrStageNum > DefStageNum) { - // Compute the difference in stages between the defintion and the use. - unsigned StageDiff = (InstrStageNum - DefStageNum); - // Make an adjustment to get the last definition. - StageNum -= StageDiff; - } - if (VRMap[StageNum].count(reg)) - MO.setReg(VRMap[StageNum][reg]); - } - } -} - -/// Return the instruction in the loop that defines the register. -/// If the definition is a Phi, then follow the Phi operand to -/// the instruction in the loop. -MachineInstr *SwingSchedulerDAG::findDefInLoop(unsigned Reg) { - SmallPtrSet Visited; - MachineInstr *Def = MRI.getVRegDef(Reg); - while (Def->isPHI()) { - if (!Visited.insert(Def).second) - break; - for (unsigned i = 1, e = Def->getNumOperands(); i < e; i += 2) - if (Def->getOperand(i + 1).getMBB() == BB) { - Def = MRI.getVRegDef(Def->getOperand(i).getReg()); - break; - } - } - return Def; -} - -/// Return the new name for the value from the previous stage. -unsigned SwingSchedulerDAG::getPrevMapVal(unsigned StageNum, unsigned PhiStage, - unsigned LoopVal, unsigned LoopStage, - ValueMapTy *VRMap, - MachineBasicBlock *BB) { - unsigned PrevVal = 0; - if (StageNum > PhiStage) { - MachineInstr *LoopInst = MRI.getVRegDef(LoopVal); - if (PhiStage == LoopStage && VRMap[StageNum - 1].count(LoopVal)) - // The name is defined in the previous stage. - PrevVal = VRMap[StageNum - 1][LoopVal]; - else if (VRMap[StageNum].count(LoopVal)) - // The previous name is defined in the current stage when the instruction - // order is swapped. 
- PrevVal = VRMap[StageNum][LoopVal]; - else if (!LoopInst->isPHI() || LoopInst->getParent() != BB) - // The loop value hasn't yet been scheduled. - PrevVal = LoopVal; - else if (StageNum == PhiStage + 1) - // The loop value is another phi, which has not been scheduled. - PrevVal = getInitPhiReg(*LoopInst, BB); - else if (StageNum > PhiStage + 1 && LoopInst->getParent() == BB) - // The loop value is another phi, which has been scheduled. - PrevVal = - getPrevMapVal(StageNum - 1, PhiStage, getLoopPhiReg(*LoopInst, BB), - LoopStage, VRMap, BB); - } - return PrevVal; -} - -/// Rewrite the Phi values in the specified block to use the mappings -/// from the initial operand. Once the Phi is scheduled, we switch -/// to using the loop value instead of the Phi value, so those names -/// do not need to be rewritten. -void SwingSchedulerDAG::rewritePhiValues(MachineBasicBlock *NewBB, - unsigned StageNum, - SMSchedule &Schedule, - ValueMapTy *VRMap, - InstrMapTy &InstrMap) { - for (auto &PHI : BB->phis()) { - unsigned InitVal = 0; - unsigned LoopVal = 0; - getPhiRegs(PHI, BB, InitVal, LoopVal); - unsigned PhiDef = PHI.getOperand(0).getReg(); - - unsigned PhiStage = - (unsigned)Schedule.stageScheduled(getSUnit(MRI.getVRegDef(PhiDef))); - unsigned LoopStage = - (unsigned)Schedule.stageScheduled(getSUnit(MRI.getVRegDef(LoopVal))); - unsigned NumPhis = Schedule.getStagesForPhi(PhiDef); - if (NumPhis > StageNum) - NumPhis = StageNum; - for (unsigned np = 0; np <= NumPhis; ++np) { - unsigned NewVal = - getPrevMapVal(StageNum - np, PhiStage, LoopVal, LoopStage, VRMap, BB); - if (!NewVal) - NewVal = InitVal; - rewriteScheduledInstr(NewBB, Schedule, InstrMap, StageNum - np, np, &PHI, - PhiDef, NewVal); - } - } -} - -/// Rewrite a previously scheduled instruction to use the register value -/// from the new instruction. Make sure the instruction occurs in the -/// basic block, and we don't change the uses in the new instruction. -void SwingSchedulerDAG::rewriteScheduledInstr( - MachineBasicBlock *BB, SMSchedule &Schedule, InstrMapTy &InstrMap, - unsigned CurStageNum, unsigned PhiNum, MachineInstr *Phi, unsigned OldReg, - unsigned NewReg, unsigned PrevReg) { - bool InProlog = (CurStageNum < Schedule.getMaxStageCount()); - int StagePhi = Schedule.stageScheduled(getSUnit(Phi)) + PhiNum; - // Rewrite uses that have been scheduled already to use the new - // Phi register. - for (MachineRegisterInfo::use_iterator UI = MRI.use_begin(OldReg), - EI = MRI.use_end(); - UI != EI;) { - MachineOperand &UseOp = *UI; - MachineInstr *UseMI = UseOp.getParent(); - ++UI; - if (UseMI->getParent() != BB) - continue; - if (UseMI->isPHI()) { - if (!Phi->isPHI() && UseMI->getOperand(0).getReg() == NewReg) - continue; - if (getLoopPhiReg(*UseMI, BB) != OldReg) - continue; - } - InstrMapTy::iterator OrigInstr = InstrMap.find(UseMI); - assert(OrigInstr != InstrMap.end() && "Instruction not scheduled."); - SUnit *OrigMISU = getSUnit(OrigInstr->second); - int StageSched = Schedule.stageScheduled(OrigMISU); - int CycleSched = Schedule.cycleScheduled(OrigMISU); - unsigned ReplaceReg = 0; - // This is the stage for the scheduled instruction. 
- if (StagePhi == StageSched && Phi->isPHI()) { - int CyclePhi = Schedule.cycleScheduled(getSUnit(Phi)); - if (PrevReg && InProlog) - ReplaceReg = PrevReg; - else if (PrevReg && !Schedule.isLoopCarried(this, *Phi) && - (CyclePhi <= CycleSched || OrigMISU->getInstr()->isPHI())) - ReplaceReg = PrevReg; - else - ReplaceReg = NewReg; - } - // The scheduled instruction occurs before the scheduled Phi, and the - // Phi is not loop carried. - if (!InProlog && StagePhi + 1 == StageSched && - !Schedule.isLoopCarried(this, *Phi)) - ReplaceReg = NewReg; - if (StagePhi > StageSched && Phi->isPHI()) - ReplaceReg = NewReg; - if (!InProlog && !Phi->isPHI() && StagePhi < StageSched) - ReplaceReg = NewReg; - if (ReplaceReg) { - MRI.constrainRegClass(ReplaceReg, MRI.getRegClass(OldReg)); - UseOp.setReg(ReplaceReg); - } - } -} - /// Check if we can change the instruction to use an offset value from the /// previous iteration. If so, return true and set the base and offset values /// so that we can rewrite the load, if necessary. @@ -3147,7 +2101,7 @@ bool SwingSchedulerDAG::canUseLastOffsetValue(MachineInstr *MI, unsigned BasePosLd, OffsetPosLd; if (!TII->getBaseAndOffsetPosition(*MI, BasePosLd, OffsetPosLd)) return false; - unsigned BaseReg = MI->getOperand(BasePosLd).getReg(); + Register BaseReg = MI->getOperand(BasePosLd).getReg(); // Look for the Phi instruction. MachineRegisterInfo &MRI = MI->getMF()->getRegInfo(); @@ -3202,7 +2156,7 @@ void SwingSchedulerDAG::applyInstrChange(MachineInstr *MI, unsigned BasePos, OffsetPos; if (!TII->getBaseAndOffsetPosition(*MI, BasePos, OffsetPos)) return; - unsigned BaseReg = MI->getOperand(BasePos).getReg(); + Register BaseReg = MI->getOperand(BasePos).getReg(); MachineInstr *LoopDef = findDefInLoop(BaseReg); int DefStageNum = Schedule.stageScheduled(getSUnit(LoopDef)); int DefCycleNum = Schedule.cycleScheduled(getSUnit(LoopDef)); @@ -3221,11 +2175,29 @@ void SwingSchedulerDAG::applyInstrChange(MachineInstr *MI, NewMI->getOperand(OffsetPos).setImm(NewOffset); SU->setInstr(NewMI); MISUnitMap[NewMI] = SU; - NewMIs.insert(NewMI); + NewMIs[MI] = NewMI; } } } +/// Return the instruction in the loop that defines the register. +/// If the definition is a Phi, then follow the Phi operand to +/// the instruction in the loop. +MachineInstr *SwingSchedulerDAG::findDefInLoop(unsigned Reg) { + SmallPtrSet Visited; + MachineInstr *Def = MRI.getVRegDef(Reg); + while (Def->isPHI()) { + if (!Visited.insert(Def).second) + break; + for (unsigned i = 1, e = Def->getNumOperands(); i < e; i += 2) + if (Def->getOperand(i + 1).getMBB() == BB) { + Def = MRI.getVRegDef(Def->getOperand(i).getReg()); + break; + } + } + return Def; +} + /// Return true for an order or output dependence that is loop carried /// potentially. A dependence is loop carried if the destination defines a valu /// that may be used or defined by the source in a subsequent iteration. 
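As a concrete illustration of the loop-carried dependence described in the comment above (a sketch written for this note, not code from the patch): each iteration below reads the value the previous iteration wrote, so the accesses cannot be reordered across iterations once the pipeliner overlaps them.

  #include <cstddef>

  // A[I] depends on the A[I - 1] produced one iteration earlier, i.e. the
  // dependence is carried by the loop rather than contained in one iteration.
  void accumulate(int *A, std::size_t N) {
    for (std::size_t I = 1; I < N; ++I)
      A[I] = A[I - 1] + 1;
  }
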
@@ -3499,10 +2471,10 @@ void SMSchedule::orderDependence(SwingSchedulerDAG *SSD, SUnit *SU, ++I, ++Pos) { for (unsigned i = 0, e = MI->getNumOperands(); i < e; ++i) { MachineOperand &MO = MI->getOperand(i); - if (!MO.isReg() || !TargetRegisterInfo::isVirtualRegister(MO.getReg())) + if (!MO.isReg() || !Register::isVirtualRegister(MO.getReg())) continue; - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); unsigned BasePos, OffsetPos; if (ST.getInstrInfo()->getBaseAndOffsetPosition(*MI, BasePos, OffsetPos)) if (MI->getOperand(BasePos).getReg() == Reg) @@ -3676,7 +2648,7 @@ bool SMSchedule::isValidSchedule(SwingSchedulerDAG *SSD) { assert(StageDef != -1 && "Instruction should have been scheduled."); for (auto &SI : SU.Succs) if (SI.isAssignedRegDep()) - if (ST.getRegisterInfo()->isPhysicalRegister(SI.getReg())) + if (Register::isPhysicalRegister(SI.getReg())) if (stageScheduled(SI.getSUnit()) != StageDef) return false; } @@ -3810,7 +2782,7 @@ void SwingSchedulerDAG::fixupRegisterOverlaps(std::deque &Instrs) { NewMI->getOperand(OffsetPos).setImm(NewOffset); SU->setInstr(NewMI); MISUnitMap[NewMI] = SU; - NewMIs.insert(NewMI); + NewMIs[MI] = NewMI; } } OverlapReg = 0; @@ -3847,40 +2819,6 @@ void SMSchedule::finalizeSchedule(SwingSchedulerDAG *SSD) { ScheduledInstrs[cycle].push_front(*I); } } - // Iterate over the definitions in each instruction, and compute the - // stage difference for each use. Keep the maximum value. - for (auto &I : InstrToCycle) { - int DefStage = stageScheduled(I.first); - MachineInstr *MI = I.first->getInstr(); - for (unsigned i = 0, e = MI->getNumOperands(); i < e; ++i) { - MachineOperand &Op = MI->getOperand(i); - if (!Op.isReg() || !Op.isDef()) - continue; - - unsigned Reg = Op.getReg(); - unsigned MaxDiff = 0; - bool PhiIsSwapped = false; - for (MachineRegisterInfo::use_iterator UI = MRI.use_begin(Reg), - EI = MRI.use_end(); - UI != EI; ++UI) { - MachineOperand &UseOp = *UI; - MachineInstr *UseMI = UseOp.getParent(); - SUnit *SUnitUse = SSD->getSUnit(UseMI); - int UseStage = stageScheduled(SUnitUse); - unsigned Diff = 0; - if (UseStage != -1 && UseStage >= DefStage) - Diff = UseStage - DefStage; - if (MI->isPHI()) { - if (isLoopCarried(SSD, *MI)) - ++Diff; - else - PhiIsSwapped = true; - } - MaxDiff = std::max(Diff, MaxDiff); - } - RegToStageDiff[Reg] = std::make_pair(MaxDiff, PhiIsSwapped); - } - } // Erase all the elements in the later stages. Only one iteration should // remain in the scheduled list, and it contains all the instructions. 
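The hunks above show the register-class migration that recurs throughout this import: raw unsigned register numbers and the static predicates on TargetRegisterInfo give way to the Register wrapper and its own predicates. A minimal sketch of the resulting usage (illustrative only; classifyReg is a made-up helper, not part of the patch):

  #include "llvm/CodeGen/Register.h"

  using namespace llvm;

  // Register converts implicitly from the old unsigned encoding, and the
  // virtual/physical checks now live on Register itself.
  static const char *classifyReg(Register Reg) {
    if (Register::isVirtualRegister(Reg))
      return "virtual";
    if (Register::isPhysicalRegister(Reg))
      return "physical";
    return "neither (e.g. the null register)";
  }
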
@@ -4085,4 +3023,3 @@ void ResourceManager::clearResources() {
     return DFAResources->clearResources();
   std::fill(ProcResourceCount.begin(), ProcResourceCount.end(), 0);
 }
-
diff --git a/lib/CodeGen/MachinePostDominators.cpp b/lib/CodeGen/MachinePostDominators.cpp
index 7f220ed1fd8f..f4daff667e86 100644
--- a/lib/CodeGen/MachinePostDominators.cpp
+++ b/lib/CodeGen/MachinePostDominators.cpp
@@ -17,7 +17,9 @@ using namespace llvm;
 
 namespace llvm {
 template class DominatorTreeBase<MachineBasicBlock, true>; // PostDomTreeBase
-}
+
+extern bool VerifyMachineDomInfo;
+} // namespace llvm
 
 char MachinePostDominatorTree::ID = 0;
 
@@ -25,33 +27,52 @@ char MachinePostDominatorTree::ID = 0;
 INITIALIZE_PASS(MachinePostDominatorTree, "machinepostdomtree",
                 "MachinePostDominator Tree Construction", true, true)
 
-MachinePostDominatorTree::MachinePostDominatorTree() : MachineFunctionPass(ID) {
+MachinePostDominatorTree::MachinePostDominatorTree()
+    : MachineFunctionPass(ID), PDT(nullptr) {
   initializeMachinePostDominatorTreePass(*PassRegistry::getPassRegistry());
-  DT = new PostDomTreeBase<MachineBasicBlock>();
 }
 
-FunctionPass *
-MachinePostDominatorTree::createMachinePostDominatorTreePass() {
+FunctionPass *MachinePostDominatorTree::createMachinePostDominatorTreePass() {
   return new MachinePostDominatorTree();
 }
 
-bool
-MachinePostDominatorTree::runOnMachineFunction(MachineFunction &F) {
-  DT->recalculate(F);
+bool MachinePostDominatorTree::runOnMachineFunction(MachineFunction &F) {
+  PDT = std::make_unique<PostDomTreeT>();
+  PDT->recalculate(F);
   return false;
 }
 
-MachinePostDominatorTree::~MachinePostDominatorTree() {
-  delete DT;
-}
-
-void
-MachinePostDominatorTree::getAnalysisUsage(AnalysisUsage &AU) const {
+void MachinePostDominatorTree::getAnalysisUsage(AnalysisUsage &AU) const {
   AU.setPreservesAll();
   MachineFunctionPass::getAnalysisUsage(AU);
 }
 
-void
-MachinePostDominatorTree::print(llvm::raw_ostream &OS, const Module *M) const {
-  DT->print(OS);
+MachineBasicBlock *MachinePostDominatorTree::findNearestCommonDominator(
+    ArrayRef<MachineBasicBlock *> Blocks) const {
+  assert(!Blocks.empty());
+
+  MachineBasicBlock *NCD = Blocks.front();
+  for (MachineBasicBlock *BB : Blocks.drop_front()) {
+    NCD = PDT->findNearestCommonDominator(NCD, BB);
+
+    // Stop when the root is reached.
+ if (PDT->isVirtualRoot(PDT->getNode(NCD))) + return nullptr; + } + + return NCD; +} + +void MachinePostDominatorTree::verifyAnalysis() const { + if (PDT && VerifyMachineDomInfo) + if (!PDT->verify(PostDomTreeT::VerificationLevel::Basic)) { + errs() << "MachinePostDominatorTree verification failed\n"; + + abort(); + } +} + +void MachinePostDominatorTree::print(llvm::raw_ostream &OS, + const Module *M) const { + PDT->print(OS); } diff --git a/lib/CodeGen/MachineRegisterInfo.cpp b/lib/CodeGen/MachineRegisterInfo.cpp index f0fd0405d69d..b88d4ea462ef 100644 --- a/lib/CodeGen/MachineRegisterInfo.cpp +++ b/lib/CodeGen/MachineRegisterInfo.cpp @@ -144,7 +144,7 @@ MachineRegisterInfo::recomputeRegClass(unsigned Reg) { } unsigned MachineRegisterInfo::createIncompleteVirtualRegister(StringRef Name) { - unsigned Reg = TargetRegisterInfo::index2VirtReg(getNumVirtRegs()); + unsigned Reg = Register::index2VirtReg(getNumVirtRegs()); VRegInfo.grow(Reg); RegAllocHints.grow(Reg); insertVRegByName(Name, Reg); @@ -202,7 +202,7 @@ void MachineRegisterInfo::clearVirtRegTypes() { VRegToType.clear(); } void MachineRegisterInfo::clearVirtRegs() { #ifndef NDEBUG for (unsigned i = 0, e = getNumVirtRegs(); i != e; ++i) { - unsigned Reg = TargetRegisterInfo::index2VirtReg(i); + unsigned Reg = Register::index2VirtReg(i); if (!VRegInfo[Reg].second) continue; verifyUseList(Reg); @@ -255,7 +255,7 @@ void MachineRegisterInfo::verifyUseList(unsigned Reg) const { void MachineRegisterInfo::verifyUseLists() const { #ifndef NDEBUG for (unsigned i = 0, e = getNumVirtRegs(); i != e; ++i) - verifyUseList(TargetRegisterInfo::index2VirtReg(i)); + verifyUseList(Register::index2VirtReg(i)); for (unsigned i = 1, e = getTargetRegisterInfo()->getNumRegs(); i != e; ++i) verifyUseList(i); #endif @@ -386,7 +386,7 @@ void MachineRegisterInfo::replaceRegWith(unsigned FromReg, unsigned ToReg) { for (reg_iterator I = reg_begin(FromReg), E = reg_end(); I != E; ) { MachineOperand &O = *I; ++I; - if (TargetRegisterInfo::isPhysicalRegister(ToReg)) { + if (Register::isPhysicalRegister(ToReg)) { O.substPhysReg(ToReg, *TRI); } else { O.setReg(ToReg); @@ -498,7 +498,7 @@ MachineRegisterInfo::EmitLiveInCopies(MachineBasicBlock *EntryMBB, LaneBitmask MachineRegisterInfo::getMaxLaneMaskForVReg(unsigned Reg) const { // Lane masks are only defined for vregs. 
- assert(TargetRegisterInfo::isVirtualRegister(Reg)); + assert(Register::isVirtualRegister(Reg)); const TargetRegisterClass &TRC = *getRegClass(Reg); return TRC.getLaneMask(); } @@ -517,7 +517,7 @@ void MachineRegisterInfo::freezeReservedRegs(const MachineFunction &MF) { } bool MachineRegisterInfo::isConstantPhysReg(unsigned PhysReg) const { - assert(TargetRegisterInfo::isPhysicalRegister(PhysReg)); + assert(Register::isPhysicalRegister(PhysReg)); const TargetRegisterInfo *TRI = getTargetRegisterInfo(); if (TRI->isConstantPhysReg(PhysReg)) diff --git a/lib/CodeGen/MachineSSAUpdater.cpp b/lib/CodeGen/MachineSSAUpdater.cpp index e8b42047b49f..258a5f9e0482 100644 --- a/lib/CodeGen/MachineSSAUpdater.cpp +++ b/lib/CodeGen/MachineSSAUpdater.cpp @@ -95,7 +95,7 @@ unsigned LookForIdenticalPHI(MachineBasicBlock *BB, while (I != BB->end() && I->isPHI()) { bool Same = true; for (unsigned i = 1, e = I->getNumOperands(); i != e; i += 2) { - unsigned SrcReg = I->getOperand(i).getReg(); + Register SrcReg = I->getOperand(i).getReg(); MachineBasicBlock *SrcBB = I->getOperand(i+1).getMBB(); if (AVals[SrcBB] != SrcReg) { Same = false; @@ -118,7 +118,7 @@ MachineInstrBuilder InsertNewDef(unsigned Opcode, const TargetRegisterClass *RC, MachineRegisterInfo *MRI, const TargetInstrInfo *TII) { - unsigned NewVR = MRI->createVirtualRegister(RC); + Register NewVR = MRI->createVirtualRegister(RC); return BuildMI(*BB, I, DebugLoc(), TII->get(Opcode), NewVR); } @@ -292,7 +292,7 @@ public: MachineSSAUpdater *Updater) { // Insert an implicit_def to represent an undef value. MachineInstr *NewDef = InsertNewDef(TargetOpcode::IMPLICIT_DEF, - BB, BB->getFirstTerminator(), + BB, BB->getFirstNonPHI(), Updater->VRC, Updater->MRI, Updater->TII); return NewDef->getOperand(0).getReg(); diff --git a/lib/CodeGen/MachineScheduler.cpp b/lib/CodeGen/MachineScheduler.cpp index ae1170ad1be6..f0721ea3b76d 100644 --- a/lib/CodeGen/MachineScheduler.cpp +++ b/lib/CodeGen/MachineScheduler.cpp @@ -82,6 +82,10 @@ cl::opt DumpCriticalPathLength("misched-dcpl", cl::Hidden, cl::desc("Print critical path length to stdout")); +cl::opt VerifyScheduling( + "verify-misched", cl::Hidden, + cl::desc("Verify machine instrs before and after machine scheduling")); + } // end namespace llvm #ifndef NDEBUG @@ -122,9 +126,6 @@ static cl::opt EnableMemOpCluster("misched-cluster", cl::Hidden, cl::desc("Enable memop clustering."), cl::init(true)); -static cl::opt VerifyScheduling("verify-misched", cl::Hidden, - cl::desc("Verify machine instrs before and after machine scheduling")); - // DAG subtrees must have at least this many nodes. 
static const unsigned MinSubtreeSize = 8; @@ -198,6 +199,7 @@ char &llvm::MachineSchedulerID = MachineScheduler::ID; INITIALIZE_PASS_BEGIN(MachineScheduler, DEBUG_TYPE, "Machine Instruction Scheduler", false, false) INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) INITIALIZE_PASS_DEPENDENCY(SlotIndexes) INITIALIZE_PASS_DEPENDENCY(LiveIntervals) @@ -210,7 +212,7 @@ MachineScheduler::MachineScheduler() : MachineSchedulerBase(ID) { void MachineScheduler::getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesCFG(); - AU.addRequiredID(MachineDominatorsID); + AU.addRequired(); AU.addRequired(); AU.addRequired(); AU.addRequired(); @@ -234,7 +236,7 @@ PostMachineScheduler::PostMachineScheduler() : MachineSchedulerBase(ID) { void PostMachineScheduler::getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesCFG(); - AU.addRequiredID(MachineDominatorsID); + AU.addRequired(); AU.addRequired(); AU.addRequired(); MachineFunctionPass::getAnalysisUsage(AU); @@ -933,8 +935,8 @@ void ScheduleDAGMILive::collectVRegUses(SUnit &SU) { if (TrackLaneMasks && !MO.isUse()) continue; - unsigned Reg = MO.getReg(); - if (!TargetRegisterInfo::isVirtualRegister(Reg)) + Register Reg = MO.getReg(); + if (!Register::isVirtualRegister(Reg)) continue; // Ignore re-defs. @@ -985,7 +987,7 @@ void ScheduleDAGMILive::enterRegion(MachineBasicBlock *bb, "ShouldTrackLaneMasks requires ShouldTrackPressure"); } -// Setup the register pressure trackers for the top scheduled top and bottom +// Setup the register pressure trackers for the top scheduled and bottom // scheduled regions. void ScheduleDAGMILive::initRegPressure() { VRegUses.clear(); @@ -1095,7 +1097,7 @@ void ScheduleDAGMILive::updatePressureDiffs( for (const RegisterMaskPair &P : LiveUses) { unsigned Reg = P.RegUnit; /// FIXME: Currently assuming single-use physregs. - if (!TRI->isVirtualRegister(Reg)) + if (!Register::isVirtualRegister(Reg)) continue; if (ShouldTrackLaneMasks) { @@ -1319,8 +1321,8 @@ unsigned ScheduleDAGMILive::computeCyclicCriticalPath() { // Visit each live out vreg def to find def/use pairs that cross iterations. for (const RegisterMaskPair &P : RPTracker.getPressure().LiveOutRegs) { unsigned Reg = P.RegUnit; - if (!TRI->isVirtualRegister(Reg)) - continue; + if (!Register::isVirtualRegister(Reg)) + continue; const LiveInterval &LI = LIS->getInterval(Reg); const VNInfo *DefVNI = LI.getVNInfoBefore(LIS->getMBBEndIdx(BB)); if (!DefVNI) @@ -1538,14 +1540,14 @@ namespace llvm { std::unique_ptr createLoadClusterDAGMutation(const TargetInstrInfo *TII, const TargetRegisterInfo *TRI) { - return EnableMemOpCluster ? llvm::make_unique(TII, TRI) + return EnableMemOpCluster ? std::make_unique(TII, TRI) : nullptr; } std::unique_ptr createStoreClusterDAGMutation(const TargetInstrInfo *TII, const TargetRegisterInfo *TRI) { - return EnableMemOpCluster ? llvm::make_unique(TII, TRI) + return EnableMemOpCluster ? std::make_unique(TII, TRI) : nullptr; } @@ -1657,7 +1659,7 @@ namespace llvm { std::unique_ptr createCopyConstrainDAGMutation(const TargetInstrInfo *TII, const TargetRegisterInfo *TRI) { - return llvm::make_unique(TII, TRI); + return std::make_unique(TII, TRI); } } // end namespace llvm @@ -1687,13 +1689,13 @@ void CopyConstrain::constrainLocalCopy(SUnit *CopySU, ScheduleDAGMILive *DAG) { // Check for pure vreg copies. 
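The MachineScheduler hunk above adds INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) and switches getAnalysisUsage from the legacy addRequiredID(MachineDominatorsID) form to the typed addRequired<> form, so the pass can later fetch the tree with getAnalysis<>. Below is a hedged sketch of that pattern; the required analysis is assumed to be MachineDominatorTree based on the new pass dependency, and ExamplePass is a hypothetical pass written only to show the shape, not code from this patch.

// Sketch of declaring a typed analysis requirement in a MachineFunctionPass.
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
using namespace llvm;

namespace {
class ExamplePass : public MachineFunctionPass {
public:
  static char ID;
  ExamplePass() : MachineFunctionPass(ID) {}

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    AU.addRequired<MachineDominatorTree>(); // declare the dependency by type
    MachineFunctionPass::getAnalysisUsage(AU);
  }

  bool runOnMachineFunction(MachineFunction &MF) override {
    // The declared requirement is what makes this lookup legal.
    MachineDominatorTree &MDT = getAnalysis<MachineDominatorTree>();
    (void)MDT;
    return false;
  }
};
} // end anonymous namespace

char ExamplePass::ID = 0;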
const MachineOperand &SrcOp = Copy->getOperand(1); - unsigned SrcReg = SrcOp.getReg(); - if (!TargetRegisterInfo::isVirtualRegister(SrcReg) || !SrcOp.readsReg()) + Register SrcReg = SrcOp.getReg(); + if (!Register::isVirtualRegister(SrcReg) || !SrcOp.readsReg()) return; const MachineOperand &DstOp = Copy->getOperand(0); - unsigned DstReg = DstOp.getReg(); - if (!TargetRegisterInfo::isVirtualRegister(DstReg) || DstOp.isDead()) + Register DstReg = DstOp.getReg(); + if (!Register::isVirtualRegister(DstReg) || DstOp.isDead()) return; // Check if either the dest or source is local. If it's live across a back @@ -2914,14 +2916,12 @@ int biasPhysReg(const SUnit *SU, bool isTop) { unsigned UnscheduledOper = isTop ? 0 : 1; // If we have already scheduled the physreg produce/consumer, immediately // schedule the copy. - if (TargetRegisterInfo::isPhysicalRegister( - MI->getOperand(ScheduledOper).getReg())) + if (Register::isPhysicalRegister(MI->getOperand(ScheduledOper).getReg())) return 1; // If the physreg is at the boundary, defer it. Otherwise schedule it // immediately to free the dependent. We can hoist the copy later. bool AtBoundary = isTop ? !SU->NumSuccsLeft : !SU->NumPredsLeft; - if (TargetRegisterInfo::isPhysicalRegister( - MI->getOperand(UnscheduledOper).getReg())) + if (Register::isPhysicalRegister(MI->getOperand(UnscheduledOper).getReg())) return AtBoundary ? -1 : 1; } @@ -2931,7 +2931,7 @@ int biasPhysReg(const SUnit *SU, bool isTop) { // physical registers. bool DoBias = true; for (const MachineOperand &Op : MI->defs()) { - if (Op.isReg() && !TargetRegisterInfo::isPhysicalRegister(Op.getReg())) { + if (Op.isReg() && !Register::isPhysicalRegister(Op.getReg())) { DoBias = false; break; } @@ -3259,7 +3259,8 @@ void GenericScheduler::reschedulePhysReg(SUnit *SU, bool isTop) { // Find already scheduled copies with a single physreg dependence and move // them just above the scheduled instruction. for (SDep &Dep : Deps) { - if (Dep.getKind() != SDep::Data || !TRI->isPhysicalRegister(Dep.getReg())) + if (Dep.getKind() != SDep::Data || + !Register::isPhysicalRegister(Dep.getReg())) continue; SUnit *DepSU = Dep.getSUnit(); if (isTop ? DepSU->Succs.size() > 1 : DepSU->Preds.size() > 1) @@ -3298,7 +3299,7 @@ void GenericScheduler::schedNode(SUnit *SU, bool IsTopNode) { /// default scheduler if the target does not set a default. ScheduleDAGMILive *llvm::createGenericSchedLive(MachineSchedContext *C) { ScheduleDAGMILive *DAG = - new ScheduleDAGMILive(C, llvm::make_unique(C)); + new ScheduleDAGMILive(C, std::make_unique(C)); // Register DAG post-processors. 
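Several hunks in this file replace llvm::make_unique with std::make_unique, reflecting the move to a C++14 baseline where the standard helper is available. A minimal standalone illustration of the same pattern, with Mutation as a stand-in type invented for the example:

// Before: return Enabled ? llvm::make_unique<Mutation>("cluster") : nullptr;
// After the C++14 switch, the standard helper is used directly.
#include <memory>
#include <string>

struct Mutation {
  std::string Name;
  explicit Mutation(std::string N) : Name(std::move(N)) {}
};

std::unique_ptr<Mutation> makeMutation(bool Enabled) {
  return Enabled ? std::make_unique<Mutation>("cluster") : nullptr;
}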
// // FIXME: extend the mutation API to allow earlier mutations to instantiate @@ -3450,7 +3451,7 @@ void PostGenericScheduler::schedNode(SUnit *SU, bool IsTopNode) { } ScheduleDAGMI *llvm::createGenericSchedPostRA(MachineSchedContext *C) { - return new ScheduleDAGMI(C, llvm::make_unique(C), + return new ScheduleDAGMI(C, std::make_unique(C), /*RemoveKillFlags=*/true); } @@ -3561,10 +3562,10 @@ public: } // end anonymous namespace static ScheduleDAGInstrs *createILPMaxScheduler(MachineSchedContext *C) { - return new ScheduleDAGMILive(C, llvm::make_unique(true)); + return new ScheduleDAGMILive(C, std::make_unique(true)); } static ScheduleDAGInstrs *createILPMinScheduler(MachineSchedContext *C) { - return new ScheduleDAGMILive(C, llvm::make_unique(false)); + return new ScheduleDAGMILive(C, std::make_unique(false)); } static MachineSchedRegistry ILPMaxRegistry( @@ -3658,7 +3659,7 @@ static ScheduleDAGInstrs *createInstructionShuffler(MachineSchedContext *C) { assert((TopDown || !ForceTopDown) && "-misched-topdown incompatible with -misched-bottomup"); return new ScheduleDAGMILive( - C, llvm::make_unique(Alternate, TopDown)); + C, std::make_unique(Alternate, TopDown)); } static MachineSchedRegistry ShufflerRegistry( diff --git a/lib/CodeGen/MachineSink.cpp b/lib/CodeGen/MachineSink.cpp index 41db2c88ce50..27a2e7023f22 100644 --- a/lib/CodeGen/MachineSink.cpp +++ b/lib/CodeGen/MachineSink.cpp @@ -36,8 +36,9 @@ #include "llvm/CodeGen/TargetRegisterInfo.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/IR/BasicBlock.h" -#include "llvm/IR/LLVMContext.h" #include "llvm/IR/DebugInfoMetadata.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/MC/MCRegisterInfo.h" #include "llvm/Pass.h" #include "llvm/Support/BranchProbability.h" #include "llvm/Support/CommandLine.h" @@ -114,15 +115,12 @@ namespace { bool runOnMachineFunction(MachineFunction &MF) override; void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.setPreservesCFG(); MachineFunctionPass::getAnalysisUsage(AU); AU.addRequired(); AU.addRequired(); AU.addRequired(); AU.addRequired(); AU.addRequired(); - AU.addPreserved(); - AU.addPreserved(); AU.addPreserved(); if (UseBlockFreqInfo) AU.addRequired(); @@ -195,11 +193,10 @@ bool MachineSinking::PerformTrivialForwardCoalescing(MachineInstr &MI, if (!MI.isCopy()) return false; - unsigned SrcReg = MI.getOperand(1).getReg(); - unsigned DstReg = MI.getOperand(0).getReg(); - if (!TargetRegisterInfo::isVirtualRegister(SrcReg) || - !TargetRegisterInfo::isVirtualRegister(DstReg) || - !MRI->hasOneNonDBGUse(SrcReg)) + Register SrcReg = MI.getOperand(1).getReg(); + Register DstReg = MI.getOperand(0).getReg(); + if (!Register::isVirtualRegister(SrcReg) || + !Register::isVirtualRegister(DstReg) || !MRI->hasOneNonDBGUse(SrcReg)) return false; const TargetRegisterClass *SRC = MRI->getRegClass(SrcReg); @@ -233,8 +230,7 @@ MachineSinking::AllUsesDominatedByBlock(unsigned Reg, MachineBasicBlock *DefMBB, bool &BreakPHIEdge, bool &LocalUse) const { - assert(TargetRegisterInfo::isVirtualRegister(Reg) && - "Only makes sense for vregs"); + assert(Register::isVirtualRegister(Reg) && "Only makes sense for vregs"); // Ignore debug uses because debug info doesn't affect the code. 
if (MRI->use_nodbg_empty(Reg)) @@ -416,13 +412,13 @@ bool MachineSinking::isWorthBreakingCriticalEdge(MachineInstr &MI, const MachineOperand &MO = MI.getOperand(i); if (!MO.isReg() || !MO.isUse()) continue; - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); if (Reg == 0) continue; // We don't move live definitions of physical registers, // so sinking their uses won't enable any opportunities. - if (TargetRegisterInfo::isPhysicalRegister(Reg)) + if (Register::isPhysicalRegister(Reg)) continue; // If this instruction is the only user of a virtual register, @@ -615,10 +611,10 @@ MachineSinking::FindSuccToSinkTo(MachineInstr &MI, MachineBasicBlock *MBB, const MachineOperand &MO = MI.getOperand(i); if (!MO.isReg()) continue; // Ignore non-register operands. - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); if (Reg == 0) continue; - if (TargetRegisterInfo::isPhysicalRegister(Reg)) { + if (Register::isPhysicalRegister(Reg)) { if (MO.isUse()) { // If the physreg has no defs anywhere, it's just an ambient register // and we can freely move its uses. Alternatively, if it's allocatable, @@ -817,8 +813,9 @@ bool MachineSinking::SinkInstruction(MachineInstr &MI, bool &SawStore, for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) { const MachineOperand &MO = MI.getOperand(I); if (!MO.isReg()) continue; - unsigned Reg = MO.getReg(); - if (Reg == 0 || !TargetRegisterInfo::isPhysicalRegister(Reg)) continue; + Register Reg = MO.getReg(); + if (Reg == 0 || !Register::isPhysicalRegister(Reg)) + continue; if (SuccToSinkTo->isLiveIn(Reg)) return false; } @@ -958,8 +955,9 @@ private: /// Track which register units have been modified and used. LiveRegUnits ModifiedRegUnits, UsedRegUnits; - /// Track DBG_VALUEs of (unmodified) register units. - DenseMap> SeenDbgInstrs; + /// Track DBG_VALUEs of (unmodified) register units. Each DBG_VALUE has an + /// entry in this map for each unit it touches. + DenseMap> SeenDbgInstrs; /// Sink Copy instructions unused in the same block close to their uses in /// successors. 
@@ -1030,7 +1028,7 @@ static void clearKillFlags(MachineInstr *MI, MachineBasicBlock &CurBB, const TargetRegisterInfo *TRI) { for (auto U : UsedOpsInCopy) { MachineOperand &MO = MI->getOperand(U); - unsigned SrcReg = MO.getReg(); + Register SrcReg = MO.getReg(); if (!UsedRegUnits.available(SrcReg)) { MachineBasicBlock::iterator NI = std::next(MI->getIterator()); for (MachineInstr &UI : make_range(NI, CurBB.end())) { @@ -1053,7 +1051,7 @@ static void updateLiveIn(MachineInstr *MI, MachineBasicBlock *SuccBB, for (MCSubRegIterator S(DefReg, TRI, true); S.isValid(); ++S) SuccBB->removeLiveIn(*S); for (auto U : UsedOpsInCopy) { - unsigned Reg = MI->getOperand(U).getReg(); + Register Reg = MI->getOperand(U).getReg(); if (!SuccBB->isLiveIn(Reg)) SuccBB->addLiveIn(Reg); } @@ -1069,7 +1067,7 @@ static bool hasRegisterDependency(MachineInstr *MI, MachineOperand &MO = MI->getOperand(i); if (!MO.isReg()) continue; - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); if (!Reg) continue; if (MO.isDef()) { @@ -1094,6 +1092,14 @@ static bool hasRegisterDependency(MachineInstr *MI, return HasRegDependency; } +static SmallSet getRegUnits(unsigned Reg, + const TargetRegisterInfo *TRI) { + SmallSet RegUnits; + for (auto RI = MCRegUnitIterator(Reg, TRI); RI.isValid(); ++RI) + RegUnits.insert(*RI); + return RegUnits; +} + bool PostRAMachineSinking::tryToSinkCopy(MachineBasicBlock &CurBB, MachineFunction &MF, const TargetRegisterInfo *TRI, @@ -1130,15 +1136,17 @@ bool PostRAMachineSinking::tryToSinkCopy(MachineBasicBlock &CurBB, // for DBG_VALUEs later, record them when they're encountered. if (MI->isDebugValue()) { auto &MO = MI->getOperand(0); - if (MO.isReg() && TRI->isPhysicalRegister(MO.getReg())) { + if (MO.isReg() && Register::isPhysicalRegister(MO.getReg())) { // Bail if we can already tell the sink would be rejected, rather // than needlessly accumulating lots of DBG_VALUEs. if (hasRegisterDependency(MI, UsedOpsInCopy, DefedRegsInCopy, ModifiedRegUnits, UsedRegUnits)) continue; - // Record debug use of this register. - SeenDbgInstrs[MO.getReg()].push_back(MI); + // Record debug use of each reg unit. + SmallSet Units = getRegUnits(MO.getReg(), TRI); + for (unsigned Reg : Units) + SeenDbgInstrs[Reg].push_back(MI); } continue; } @@ -1177,15 +1185,22 @@ bool PostRAMachineSinking::tryToSinkCopy(MachineBasicBlock &CurBB, assert((SuccBB->pred_size() == 1 && *SuccBB->pred_begin() == &CurBB) && "Unexpected predecessor"); - // Collect DBG_VALUEs that must sink with this copy. + // Collect DBG_VALUEs that must sink with this copy. We've previously + // recorded which reg units that DBG_VALUEs read, if this instruction + // writes any of those units then the corresponding DBG_VALUEs must sink. + SetVector DbgValsToSinkSet; SmallVector DbgValsToSink; for (auto &MO : MI->operands()) { if (!MO.isReg() || !MO.isDef()) continue; - unsigned reg = MO.getReg(); - for (auto *MI : SeenDbgInstrs.lookup(reg)) - DbgValsToSink.push_back(MI); + + SmallSet Units = getRegUnits(MO.getReg(), TRI); + for (unsigned Reg : Units) + for (auto *MI : SeenDbgInstrs.lookup(Reg)) + DbgValsToSinkSet.insert(MI); } + DbgValsToSink.insert(DbgValsToSink.begin(), DbgValsToSinkSet.begin(), + DbgValsToSinkSet.end()); // Clear the kill flag if SrcReg is killed between MI and the end of the // block. 
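The PostRAMachineSinking change above keys SeenDbgInstrs by register unit instead of by register and deduplicates the collected DBG_VALUEs through a set, so a copy that clobbers any aliasing register still drags the right debug values along when it sinks. A standalone sketch of that bookkeeping under an invented aliasing table; real targets enumerate units through MCRegUnitIterator and MCRegisterInfo.

// Sketch: record debug values per register unit, then gather them for any
// later def that overlaps one of those units. The unit table is invented
// purely for illustration.
#include <iostream>
#include <map>
#include <set>
#include <string>
#include <vector>

using RegUnit = int;

// Hypothetical aliasing: W0 and X0 share unit 0; D0 covers units 10 and 11.
std::vector<RegUnit> getRegUnits(const std::string &Reg) {
  static const std::map<std::string, std::vector<RegUnit>> Table = {
      {"W0", {0}}, {"X0", {0}}, {"D0", {10, 11}}, {"S0", {10}}};
  auto It = Table.find(Reg);
  return It == Table.end() ? std::vector<RegUnit>{} : It->second;
}

int main() {
  // Record a debug value that reads W0 under every unit it touches.
  std::map<RegUnit, std::vector<std::string>> SeenDbgInstrs;
  for (RegUnit U : getRegUnits("W0"))
    SeenDbgInstrs[U].push_back("DBG_VALUE $w0, !\"x\"");

  // A copy defining X0 overlaps unit 0, so that DBG_VALUE must sink too.
  std::set<std::string> DbgValsToSink;
  for (RegUnit U : getRegUnits("X0"))
    for (const std::string &DV : SeenDbgInstrs[U])
      DbgValsToSink.insert(DV);

  for (const std::string &DV : DbgValsToSink)
    std::cout << DV << '\n';
}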
diff --git a/lib/CodeGen/MachineTraceMetrics.cpp b/lib/CodeGen/MachineTraceMetrics.cpp index f9505df4e7f4..66a3bc2f8cc4 100644 --- a/lib/CodeGen/MachineTraceMetrics.cpp +++ b/lib/CodeGen/MachineTraceMetrics.cpp @@ -634,7 +634,7 @@ struct DataDep { /// Create a DataDep from an SSA form virtual register. DataDep(const MachineRegisterInfo *MRI, unsigned VirtReg, unsigned UseOp) : UseOp(UseOp) { - assert(TargetRegisterInfo::isVirtualRegister(VirtReg)); + assert(Register::isVirtualRegister(VirtReg)); MachineRegisterInfo::def_iterator DefI = MRI->def_begin(VirtReg); assert(!DefI.atEnd() && "Register has no defs"); DefMI = DefI->getParent(); @@ -660,10 +660,10 @@ static bool getDataDeps(const MachineInstr &UseMI, const MachineOperand &MO = *I; if (!MO.isReg()) continue; - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); if (!Reg) continue; - if (TargetRegisterInfo::isPhysicalRegister(Reg)) { + if (Register::isPhysicalRegister(Reg)) { HasPhysRegs = true; continue; } @@ -687,7 +687,7 @@ static void getPHIDeps(const MachineInstr &UseMI, assert(UseMI.isPHI() && UseMI.getNumOperands() % 2 && "Bad PHI"); for (unsigned i = 1; i != UseMI.getNumOperands(); i += 2) { if (UseMI.getOperand(i + 1).getMBB() == Pred) { - unsigned Reg = UseMI.getOperand(i).getReg(); + Register Reg = UseMI.getOperand(i).getReg(); Deps.push_back(DataDep(MRI, Reg, i)); return; } @@ -708,8 +708,8 @@ static void updatePhysDepsDownwards(const MachineInstr *UseMI, const MachineOperand &MO = *MI; if (!MO.isReg()) continue; - unsigned Reg = MO.getReg(); - if (!TargetRegisterInfo::isPhysicalRegister(Reg)) + Register Reg = MO.getReg(); + if (!Register::isPhysicalRegister(Reg)) continue; // Track live defs and kills for updating RegUnits. if (MO.isDef()) { @@ -765,7 +765,7 @@ computeCrossBlockCriticalPath(const TraceBlockInfo &TBI) { assert(TBI.HasValidInstrHeights && "Missing height info"); unsigned MaxLen = 0; for (const LiveInReg &LIR : TBI.LiveIns) { - if (!TargetRegisterInfo::isVirtualRegister(LIR.Reg)) + if (!Register::isVirtualRegister(LIR.Reg)) continue; const MachineInstr *DefMI = MTM.MRI->getVRegDef(LIR.Reg); // Ignore dependencies outside the current trace. @@ -902,8 +902,8 @@ static unsigned updatePhysDepsUpwards(const MachineInstr &MI, unsigned Height, const MachineOperand &MO = *MOI; if (!MO.isReg()) continue; - unsigned Reg = MO.getReg(); - if (!TargetRegisterInfo::isPhysicalRegister(Reg)) + Register Reg = MO.getReg(); + if (!Register::isPhysicalRegister(Reg)) continue; if (MO.readsReg()) ReadOps.push_back(MI.getOperandNo(MOI)); @@ -930,7 +930,7 @@ static unsigned updatePhysDepsUpwards(const MachineInstr &MI, unsigned Height, // Now we know the height of MI. Update any regunits read. for (unsigned i = 0, e = ReadOps.size(); i != e; ++i) { - unsigned Reg = MI.getOperand(ReadOps[i]).getReg(); + Register Reg = MI.getOperand(ReadOps[i]).getReg(); for (MCRegUnitIterator Units(Reg, TRI); Units.isValid(); ++Units) { LiveRegUnit &LRU = RegUnits[*Units]; // Set the height to the highest reader of the unit. @@ -979,7 +979,7 @@ addLiveIns(const MachineInstr *DefMI, unsigned DefOp, ArrayRef Trace) { assert(!Trace.empty() && "Trace should contain at least one block"); unsigned Reg = DefMI->getOperand(DefOp).getReg(); - assert(TargetRegisterInfo::isVirtualRegister(Reg)); + assert(Register::isVirtualRegister(Reg)); const MachineBasicBlock *DefMBB = DefMI->getParent(); // Reg is live-in to all blocks in Trace that follow DefMBB. 
@@ -1026,7 +1026,7 @@ computeInstrHeights(const MachineBasicBlock *MBB) { if (MBB) { TraceBlockInfo &TBI = BlockInfo[MBB->getNumber()]; for (LiveInReg &LI : TBI.LiveIns) { - if (TargetRegisterInfo::isVirtualRegister(LI.Reg)) { + if (Register::isVirtualRegister(LI.Reg)) { // For virtual registers, the def latency is included. unsigned &Height = Heights[MTM.MRI->getVRegDef(LI.Reg)]; if (Height < LI.Height) diff --git a/lib/CodeGen/MachineVerifier.cpp b/lib/CodeGen/MachineVerifier.cpp index 0ad792ac62cf..969743edca52 100644 --- a/lib/CodeGen/MachineVerifier.cpp +++ b/lib/CodeGen/MachineVerifier.cpp @@ -22,7 +22,6 @@ // the verifier errors. //===----------------------------------------------------------------------===// -#include "LiveRangeCalc.h" #include "llvm/ADT/BitVector.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DenseSet.h" @@ -37,6 +36,7 @@ #include "llvm/CodeGen/GlobalISel/RegisterBank.h" #include "llvm/CodeGen/LiveInterval.h" #include "llvm/CodeGen/LiveIntervals.h" +#include "llvm/CodeGen/LiveRangeCalc.h" #include "llvm/CodeGen/LiveStacks.h" #include "llvm/CodeGen/LiveVariables.h" #include "llvm/CodeGen/MachineBasicBlock.h" @@ -122,7 +122,7 @@ namespace { // Add Reg and any sub-registers to RV void addRegWithSubRegs(RegVector &RV, unsigned Reg) { RV.push_back(Reg); - if (TargetRegisterInfo::isPhysicalRegister(Reg)) + if (Register::isPhysicalRegister(Reg)) for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs) RV.push_back(*SubRegs); } @@ -159,7 +159,7 @@ namespace { // Add register to vregsPassed if it belongs there. Return true if // anything changed. bool addPassed(unsigned Reg) { - if (!TargetRegisterInfo::isVirtualRegister(Reg)) + if (!Register::isVirtualRegister(Reg)) return false; if (regsKilled.count(Reg) || regsLiveOut.count(Reg)) return false; @@ -178,7 +178,7 @@ namespace { // Add register to vregsRequired if it belongs there. Return true if // anything changed. bool addRequired(unsigned Reg) { - if (!TargetRegisterInfo::isVirtualRegister(Reg)) + if (!Register::isVirtualRegister(Reg)) return false; if (regsLiveOut.count(Reg)) return false; @@ -552,7 +552,7 @@ void MachineVerifier::report_context_vreg(unsigned VReg) const { } void MachineVerifier::report_context_vreg_regunit(unsigned VRegOrUnit) const { - if (TargetRegisterInfo::isVirtualRegister(VRegOrUnit)) { + if (Register::isVirtualRegister(VRegOrUnit)) { report_context_vreg(VRegOrUnit); } else { errs() << "- regunit: " << printRegUnit(VRegOrUnit, TRI) << '\n'; @@ -797,7 +797,7 @@ MachineVerifier::visitMachineBasicBlockBefore(const MachineBasicBlock *MBB) { regsLive.clear(); if (MRI->tracksLiveness()) { for (const auto &LI : MBB->liveins()) { - if (!TargetRegisterInfo::isPhysicalRegister(LI.PhysReg)) { + if (!Register::isPhysicalRegister(LI.PhysReg)) { report("MBB live-in list contains non-physical register", MBB); continue; } @@ -957,7 +957,7 @@ void MachineVerifier::verifyPreISelGenericInstruction(const MachineInstr *MI) { // Generic opcodes must not have physical register operands. 
for (unsigned I = 0; I < MI->getNumOperands(); ++I) { const MachineOperand *MO = &MI->getOperand(I); - if (MO->isReg() && TargetRegisterInfo::isPhysicalRegister(MO->getReg())) + if (MO->isReg() && Register::isPhysicalRegister(MO->getReg())) report("Generic instruction cannot have physical register", MO, I); } @@ -1368,7 +1368,108 @@ void MachineVerifier::verifyPreISelGenericInstruction(const MachineInstr *MI) { break; } } + switch (IntrID) { + case Intrinsic::memcpy: + if (MI->getNumOperands() != 5) + report("Expected memcpy intrinsic to have 5 operands", MI); + break; + case Intrinsic::memmove: + if (MI->getNumOperands() != 5) + report("Expected memmove intrinsic to have 5 operands", MI); + break; + case Intrinsic::memset: + if (MI->getNumOperands() != 5) + report("Expected memset intrinsic to have 5 operands", MI); + break; + } + break; + } + case TargetOpcode::G_SEXT_INREG: { + if (!MI->getOperand(2).isImm()) { + report("G_SEXT_INREG expects an immediate operand #2", MI); + break; + } + + LLT DstTy = MRI->getType(MI->getOperand(0).getReg()); + LLT SrcTy = MRI->getType(MI->getOperand(1).getReg()); + verifyVectorElementMatch(DstTy, SrcTy, MI); + + int64_t Imm = MI->getOperand(2).getImm(); + if (Imm <= 0) + report("G_SEXT_INREG size must be >= 1", MI); + if (Imm >= SrcTy.getScalarSizeInBits()) + report("G_SEXT_INREG size must be less than source bit width", MI); + break; + } + case TargetOpcode::G_SHUFFLE_VECTOR: { + const MachineOperand &MaskOp = MI->getOperand(3); + if (!MaskOp.isShuffleMask()) { + report("Incorrect mask operand type for G_SHUFFLE_VECTOR", MI); + break; + } + + const Constant *Mask = MaskOp.getShuffleMask(); + auto *MaskVT = dyn_cast(Mask->getType()); + if (!MaskVT || !MaskVT->getElementType()->isIntegerTy(32)) { + report("Invalid shufflemask constant type", MI); + break; + } + + if (!Mask->getAggregateElement(0u)) { + report("Invalid shufflemask constant type", MI); + break; + } + LLT DstTy = MRI->getType(MI->getOperand(0).getReg()); + LLT Src0Ty = MRI->getType(MI->getOperand(1).getReg()); + LLT Src1Ty = MRI->getType(MI->getOperand(2).getReg()); + + if (Src0Ty != Src1Ty) + report("Source operands must be the same type", MI); + + if (Src0Ty.getScalarType() != DstTy.getScalarType()) + report("G_SHUFFLE_VECTOR cannot change element type", MI); + + // Don't check that all operands are vector because scalars are used in + // place of 1 element vectors. + int SrcNumElts = Src0Ty.isVector() ? Src0Ty.getNumElements() : 1; + int DstNumElts = DstTy.isVector() ? 
DstTy.getNumElements() : 1; + + SmallVector MaskIdxes; + ShuffleVectorInst::getShuffleMask(Mask, MaskIdxes); + + if (static_cast(MaskIdxes.size()) != DstNumElts) + report("Wrong result type for shufflemask", MI); + + for (int Idx : MaskIdxes) { + if (Idx < 0) + continue; + + if (Idx >= 2 * SrcNumElts) + report("Out of bounds shuffle index", MI); + } + + break; + } + case TargetOpcode::G_DYN_STACKALLOC: { + const MachineOperand &DstOp = MI->getOperand(0); + const MachineOperand &AllocOp = MI->getOperand(1); + const MachineOperand &AlignOp = MI->getOperand(2); + + if (!DstOp.isReg() || !MRI->getType(DstOp.getReg()).isPointer()) { + report("dst operand 0 must be a pointer type", MI); + break; + } + + if (!AllocOp.isReg() || !MRI->getType(AllocOp.getReg()).isScalar()) { + report("src operand 1 must be a scalar reg type", MI); + break; + } + + if (!AlignOp.isImm()) { + report("src operand 2 must be an immediate type", MI); + break; + } break; } default: @@ -1525,11 +1626,11 @@ MachineVerifier::visitMachineOperand(const MachineOperand *MO, unsigned MONum) { report("Operand should be tied", MO, MONum); else if (unsigned(TiedTo) != MI->findTiedOperandIdx(MONum)) report("Tied def doesn't match MCInstrDesc", MO, MONum); - else if (TargetRegisterInfo::isPhysicalRegister(MO->getReg())) { + else if (Register::isPhysicalRegister(MO->getReg())) { const MachineOperand &MOTied = MI->getOperand(TiedTo); if (!MOTied.isReg()) report("Tied counterpart must be a register", &MOTied, TiedTo); - else if (TargetRegisterInfo::isPhysicalRegister(MOTied.getReg()) && + else if (Register::isPhysicalRegister(MOTied.getReg()) && MO->getReg() != MOTied.getReg()) report("Tied physical registers must match.", &MOTied, TiedTo); } @@ -1543,7 +1644,7 @@ MachineVerifier::visitMachineOperand(const MachineOperand *MO, unsigned MONum) { switch (MO->getType()) { case MachineOperand::MO_Register: { - const unsigned Reg = MO->getReg(); + const Register Reg = MO->getReg(); if (!Reg) return; if (MRI->tracksLiveness() && !MI->isDebugValue()) @@ -1581,7 +1682,7 @@ MachineVerifier::visitMachineOperand(const MachineOperand *MO, unsigned MONum) { // Check register classes. unsigned SubIdx = MO->getSubReg(); - if (TargetRegisterInfo::isPhysicalRegister(Reg)) { + if (Register::isPhysicalRegister(Reg)) { if (SubIdx) { report("Illegal subregister index for physical register", MO, MONum); return; @@ -1817,7 +1918,7 @@ void MachineVerifier::checkLivenessAtDef(const MachineOperand *MO, if (MO->isDead()) { LiveQueryResult LRQ = LR.Query(DefIdx); if (!LRQ.isDeadDef()) { - assert(TargetRegisterInfo::isVirtualRegister(VRegOrUnit) && + assert(Register::isVirtualRegister(VRegOrUnit) && "Expecting a virtual register."); // A dead subreg def only tells us that the specific subreg is dead. There // could be other non-dead defs of other subregs, or we could have other @@ -1845,8 +1946,7 @@ void MachineVerifier::checkLiveness(const MachineOperand *MO, unsigned MONum) { addRegWithSubRegs(regsKilled, Reg); // Check that LiveVars knows this kill. 
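The verifier hunks above add structural checks for two generic opcodes: G_SEXT_INREG (the immediate width must be at least 1 and strictly less than the source scalar width) and G_SHUFFLE_VECTOR (one mask index per destination element, each index either negative/undef or below twice the source element count). A standalone sketch of the semantics those checks protect, using plain C++ stand-ins for the LLT and operand machinery:

// Sketch of sign-extend-in-register and shuffle-mask validation.
#include <cassert>
#include <cstdint>
#include <iostream>
#include <vector>

// G_SEXT_INREG %dst, %src, Bits: only meaningful for 1 <= Bits < width(src).
int64_t sextInReg(int64_t Src, unsigned Bits, unsigned SrcWidth = 64) {
  assert(Bits >= 1 && Bits < SrcWidth && "the verifier would reject this");
  unsigned Shift = SrcWidth - Bits;
  // Shift the chosen low bits up, then arithmetic-shift back to replicate
  // the sign bit (implementation-typical behavior for signed right shift).
  return (int64_t)((uint64_t)Src << Shift) >> Shift;
}

// G_SHUFFLE_VECTOR mask: one index per destination element; each index is
// either undef (-1) or selects a lane from the two concatenated sources.
bool isValidShuffleMask(const std::vector<int> &Mask, int SrcNumElts,
                        int DstNumElts) {
  if ((int)Mask.size() != DstNumElts)
    return false;
  for (int Idx : Mask)
    if (Idx != -1 && (Idx < 0 || Idx >= 2 * SrcNumElts))
      return false;
  return true;
}

int main() {
  std::cout << sextInReg(0xFF, 8) << '\n';                     // -1
  std::cout << isValidShuffleMask({0, 4, 1, 5}, 4, 4) << '\n'; // 1
  std::cout << isValidShuffleMask({0, 9}, 4, 2) << '\n';       // 0
}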
- if (LiveVars && TargetRegisterInfo::isVirtualRegister(Reg) && - MO->isKill()) { + if (LiveVars && Register::isVirtualRegister(Reg) && MO->isKill()) { LiveVariables::VarInfo &VI = LiveVars->getVarInfo(Reg); if (!is_contained(VI.Kills, MI)) report("Kill missing from LiveVariables", MO, MONum); @@ -1856,7 +1956,7 @@ void MachineVerifier::checkLiveness(const MachineOperand *MO, unsigned MONum) { if (LiveInts && !LiveInts->isNotInMIMap(*MI)) { SlotIndex UseIdx = LiveInts->getInstructionIndex(*MI); // Check the cached regunit intervals. - if (TargetRegisterInfo::isPhysicalRegister(Reg) && !isReserved(Reg)) { + if (Register::isPhysicalRegister(Reg) && !isReserved(Reg)) { for (MCRegUnitIterator Units(Reg, TRI); Units.isValid(); ++Units) { if (MRI->isReservedRegUnit(*Units)) continue; @@ -1865,7 +1965,7 @@ void MachineVerifier::checkLiveness(const MachineOperand *MO, unsigned MONum) { } } - if (TargetRegisterInfo::isVirtualRegister(Reg)) { + if (Register::isVirtualRegister(Reg)) { if (LiveInts->hasInterval(Reg)) { // This is a virtual register interval. const LiveInterval &LI = LiveInts->getInterval(Reg); @@ -1900,7 +2000,7 @@ void MachineVerifier::checkLiveness(const MachineOperand *MO, unsigned MONum) { // Use of a dead register. if (!regsLive.count(Reg)) { - if (TargetRegisterInfo::isPhysicalRegister(Reg)) { + if (Register::isPhysicalRegister(Reg)) { // Reserved registers may be used even when 'dead'. bool Bad = !isReserved(Reg); // We are fine if just any subregister has a defined value. @@ -1922,7 +2022,7 @@ void MachineVerifier::checkLiveness(const MachineOperand *MO, unsigned MONum) { if (!MOP.isReg() || !MOP.isImplicit()) continue; - if (!TargetRegisterInfo::isPhysicalRegister(MOP.getReg())) + if (!Register::isPhysicalRegister(MOP.getReg())) continue; for (MCSubRegIterator SubRegs(MOP.getReg(), TRI); SubRegs.isValid(); @@ -1960,7 +2060,7 @@ void MachineVerifier::checkLiveness(const MachineOperand *MO, unsigned MONum) { addRegWithSubRegs(regsDefined, Reg); // Verify SSA form. 
- if (MRI->isSSA() && TargetRegisterInfo::isVirtualRegister(Reg) && + if (MRI->isSSA() && Register::isVirtualRegister(Reg) && std::next(MRI->def_begin(Reg)) != MRI->def_end()) report("Multiple virtual register defs in SSA form", MO, MONum); @@ -1969,7 +2069,7 @@ void MachineVerifier::checkLiveness(const MachineOperand *MO, unsigned MONum) { SlotIndex DefIdx = LiveInts->getInstructionIndex(*MI); DefIdx = DefIdx.getRegSlot(MO->isEarlyClobber()); - if (TargetRegisterInfo::isVirtualRegister(Reg)) { + if (Register::isVirtualRegister(Reg)) { if (LiveInts->hasInterval(Reg)) { const LiveInterval &LI = LiveInts->getInterval(Reg); checkLivenessAtDef(MO, MONum, DefIdx, LI, Reg); @@ -2007,7 +2107,7 @@ void MachineVerifier::visitMachineBundleAfter(const MachineInstr *MI) { while (!regMasks.empty()) { const uint32_t *Mask = regMasks.pop_back_val(); for (RegSet::iterator I = regsLive.begin(), E = regsLive.end(); I != E; ++I) - if (TargetRegisterInfo::isPhysicalRegister(*I) && + if (Register::isPhysicalRegister(*I) && MachineOperand::clobbersPhysReg(Mask, *I)) regsDead.push_back(*I); } @@ -2119,8 +2219,8 @@ void MachineVerifier::checkPHIOps(const MachineBasicBlock &MBB) { if (MODef.isTied() || MODef.isImplicit() || MODef.isInternalRead() || MODef.isEarlyClobber() || MODef.isDebug()) report("Unexpected flag on PHI operand", &MODef, 0); - unsigned DefReg = MODef.getReg(); - if (!TargetRegisterInfo::isVirtualRegister(DefReg)) + Register DefReg = MODef.getReg(); + if (!Register::isVirtualRegister(DefReg)) report("Expected first PHI operand to be a virtual register", &MODef, 0); for (unsigned I = 1, E = Phi.getNumOperands(); I != E; I += 2) { @@ -2212,7 +2312,7 @@ void MachineVerifier::visitMachineFunctionAfter() { void MachineVerifier::verifyLiveVariables() { assert(LiveVars && "Don't call verifyLiveVariables without LiveVars"); for (unsigned i = 0, e = MRI->getNumVirtRegs(); i != e; ++i) { - unsigned Reg = TargetRegisterInfo::index2VirtReg(i); + unsigned Reg = Register::index2VirtReg(i); LiveVariables::VarInfo &VI = LiveVars->getVarInfo(Reg); for (const auto &MBB : *MF) { BBInfo &MInfo = MBBInfoMap[&MBB]; @@ -2238,7 +2338,7 @@ void MachineVerifier::verifyLiveVariables() { void MachineVerifier::verifyLiveIntervals() { assert(LiveInts && "Don't call verifyLiveIntervals without LiveInts"); for (unsigned i = 0, e = MRI->getNumVirtRegs(); i != e; ++i) { - unsigned Reg = TargetRegisterInfo::index2VirtReg(i); + unsigned Reg = Register::index2VirtReg(i); // Spilling and splitting may leave unused registers around. Skip them. if (MRI->reg_nodbg_empty(Reg)) @@ -2315,11 +2415,11 @@ void MachineVerifier::verifyLiveRangeValue(const LiveRange &LR, for (ConstMIBundleOperands MOI(*MI); MOI.isValid(); ++MOI) { if (!MOI->isReg() || !MOI->isDef()) continue; - if (TargetRegisterInfo::isVirtualRegister(Reg)) { + if (Register::isVirtualRegister(Reg)) { if (MOI->getReg() != Reg) continue; } else { - if (!TargetRegisterInfo::isPhysicalRegister(MOI->getReg()) || + if (!Register::isPhysicalRegister(MOI->getReg()) || !TRI->hasRegUnit(MOI->getReg(), Reg)) continue; } @@ -2402,7 +2502,7 @@ void MachineVerifier::verifyLiveRangeSegment(const LiveRange &LR, return; // RegUnit intervals are allowed dead phis. 
- if (!TargetRegisterInfo::isVirtualRegister(Reg) && VNI->isPHIDef() && + if (!Register::isVirtualRegister(Reg) && VNI->isPHIDef() && S.start == VNI->def && S.end == VNI->def.getDeadSlot()) return; @@ -2446,7 +2546,7 @@ void MachineVerifier::verifyLiveRangeSegment(const LiveRange &LR, // The following checks only apply to virtual registers. Physreg liveness // is too weird to check. - if (TargetRegisterInfo::isVirtualRegister(Reg)) { + if (Register::isVirtualRegister(Reg)) { // A live segment can end with either a redefinition, a kill flag on a // use, or a dead flag on a def. bool hasRead = false; @@ -2519,8 +2619,7 @@ void MachineVerifier::verifyLiveRangeSegment(const LiveRange &LR, while (true) { assert(LiveInts->isLiveInToMBB(LR, &*MFI)); // We don't know how to track physregs into a landing pad. - if (!TargetRegisterInfo::isVirtualRegister(Reg) && - MFI->isEHPad()) { + if (!Register::isVirtualRegister(Reg) && MFI->isEHPad()) { if (&*MFI == EndMBB) break; ++MFI; @@ -2580,7 +2679,7 @@ void MachineVerifier::verifyLiveRange(const LiveRange &LR, unsigned Reg, void MachineVerifier::verifyLiveInterval(const LiveInterval &LI) { unsigned Reg = LI.reg; - assert(TargetRegisterInfo::isVirtualRegister(Reg)); + assert(Register::isVirtualRegister(Reg)); verifyLiveRange(LI, Reg); LaneBitmask Mask; diff --git a/lib/CodeGen/MacroFusion.cpp b/lib/CodeGen/MacroFusion.cpp index 2db1e86905a4..d21eae222af0 100644 --- a/lib/CodeGen/MacroFusion.cpp +++ b/lib/CodeGen/MacroFusion.cpp @@ -176,7 +176,7 @@ std::unique_ptr llvm::createMacroFusionDAGMutation( ShouldSchedulePredTy shouldScheduleAdjacent) { if(EnableMacroFusion) - return llvm::make_unique(shouldScheduleAdjacent, true); + return std::make_unique(shouldScheduleAdjacent, true); return nullptr; } @@ -184,6 +184,6 @@ std::unique_ptr llvm::createBranchMacroFusionDAGMutation( ShouldSchedulePredTy shouldScheduleAdjacent) { if(EnableMacroFusion) - return llvm::make_unique(shouldScheduleAdjacent, false); + return std::make_unique(shouldScheduleAdjacent, false); return nullptr; } diff --git a/lib/CodeGen/ModuloSchedule.cpp b/lib/CodeGen/ModuloSchedule.cpp new file mode 100644 index 000000000000..7ce3c5861801 --- /dev/null +++ b/lib/CodeGen/ModuloSchedule.cpp @@ -0,0 +1,2022 @@ +//===- ModuloSchedule.cpp - Software pipeline schedule expansion ----------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/CodeGen/ModuloSchedule.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/CodeGen/LiveIntervals.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineLoopUtils.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/TargetInstrInfo.h" +#include "llvm/MC/MCContext.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/raw_ostream.h" + +#define DEBUG_TYPE "pipeliner" +using namespace llvm; + +void ModuloSchedule::print(raw_ostream &OS) { + for (MachineInstr *MI : ScheduledInstrs) + OS << "[stage " << getStage(MI) << " @" << getCycle(MI) << "c] " << *MI; +} + +//===----------------------------------------------------------------------===// +// ModuloScheduleExpander implementation +//===----------------------------------------------------------------------===// + +/// Return the register values for the operands of a Phi instruction. +/// This function assume the instruction is a Phi. +static void getPhiRegs(MachineInstr &Phi, MachineBasicBlock *Loop, + unsigned &InitVal, unsigned &LoopVal) { + assert(Phi.isPHI() && "Expecting a Phi."); + + InitVal = 0; + LoopVal = 0; + for (unsigned i = 1, e = Phi.getNumOperands(); i != e; i += 2) + if (Phi.getOperand(i + 1).getMBB() != Loop) + InitVal = Phi.getOperand(i).getReg(); + else + LoopVal = Phi.getOperand(i).getReg(); + + assert(InitVal != 0 && LoopVal != 0 && "Unexpected Phi structure."); +} + +/// Return the Phi register value that comes from the incoming block. +static unsigned getInitPhiReg(MachineInstr &Phi, MachineBasicBlock *LoopBB) { + for (unsigned i = 1, e = Phi.getNumOperands(); i != e; i += 2) + if (Phi.getOperand(i + 1).getMBB() != LoopBB) + return Phi.getOperand(i).getReg(); + return 0; +} + +/// Return the Phi register value that comes the loop block. +static unsigned getLoopPhiReg(MachineInstr &Phi, MachineBasicBlock *LoopBB) { + for (unsigned i = 1, e = Phi.getNumOperands(); i != e; i += 2) + if (Phi.getOperand(i + 1).getMBB() == LoopBB) + return Phi.getOperand(i).getReg(); + return 0; +} + +void ModuloScheduleExpander::expand() { + BB = Schedule.getLoop()->getTopBlock(); + Preheader = *BB->pred_begin(); + if (Preheader == BB) + Preheader = *std::next(BB->pred_begin()); + + // Iterate over the definitions in each instruction, and compute the + // stage difference for each use. Keep the maximum value. 
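getPhiRegs, getInitPhiReg and getLoopPhiReg above all rely on the MIR PHI operand layout: operand 0 is the def, followed by (incoming value, predecessor block) pairs. A standalone sketch with simplified stand-in types, purely for illustration:

// Sketch of picking the preheader-side and back-edge-side values of a PHI.
#include <iostream>
#include <string>
#include <utility>
#include <vector>

struct PhiSketch {
  unsigned DefReg;
  // Each incoming value is a (register, predecessor block name) pair.
  std::vector<std::pair<unsigned, std::string>> Incoming;
};

// Value flowing in from outside the loop (the preheader side).
unsigned getInitPhiReg(const PhiSketch &Phi, const std::string &LoopBB) {
  for (const auto &In : Phi.Incoming)
    if (In.second != LoopBB)
      return In.first;
  return 0;
}

// Value produced by the previous loop iteration (the back-edge side).
unsigned getLoopPhiReg(const PhiSketch &Phi, const std::string &LoopBB) {
  for (const auto &In : Phi.Incoming)
    if (In.second == LoopBB)
      return In.first;
  return 0;
}

int main() {
  // %5 = PHI %2, %bb.preheader, %7, %bb.loop
  PhiSketch Phi{5, {{2, "bb.preheader"}, {7, "bb.loop"}}};
  std::cout << getInitPhiReg(Phi, "bb.loop") << ' '
            << getLoopPhiReg(Phi, "bb.loop") << '\n'; // prints: 2 7
}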
+ for (MachineInstr *MI : Schedule.getInstructions()) { + int DefStage = Schedule.getStage(MI); + for (unsigned i = 0, e = MI->getNumOperands(); i < e; ++i) { + MachineOperand &Op = MI->getOperand(i); + if (!Op.isReg() || !Op.isDef()) + continue; + + Register Reg = Op.getReg(); + unsigned MaxDiff = 0; + bool PhiIsSwapped = false; + for (MachineRegisterInfo::use_iterator UI = MRI.use_begin(Reg), + EI = MRI.use_end(); + UI != EI; ++UI) { + MachineOperand &UseOp = *UI; + MachineInstr *UseMI = UseOp.getParent(); + int UseStage = Schedule.getStage(UseMI); + unsigned Diff = 0; + if (UseStage != -1 && UseStage >= DefStage) + Diff = UseStage - DefStage; + if (MI->isPHI()) { + if (isLoopCarried(*MI)) + ++Diff; + else + PhiIsSwapped = true; + } + MaxDiff = std::max(Diff, MaxDiff); + } + RegToStageDiff[Reg] = std::make_pair(MaxDiff, PhiIsSwapped); + } + } + + generatePipelinedLoop(); +} + +void ModuloScheduleExpander::generatePipelinedLoop() { + LoopInfo = TII->analyzeLoopForPipelining(BB); + assert(LoopInfo && "Must be able to analyze loop!"); + + // Create a new basic block for the kernel and add it to the CFG. + MachineBasicBlock *KernelBB = MF.CreateMachineBasicBlock(BB->getBasicBlock()); + + unsigned MaxStageCount = Schedule.getNumStages() - 1; + + // Remember the registers that are used in different stages. The index is + // the iteration, or stage, that the instruction is scheduled in. This is + // a map between register names in the original block and the names created + // in each stage of the pipelined loop. + ValueMapTy *VRMap = new ValueMapTy[(MaxStageCount + 1) * 2]; + InstrMapTy InstrMap; + + SmallVector PrologBBs; + + // Generate the prolog instructions that set up the pipeline. + generateProlog(MaxStageCount, KernelBB, VRMap, PrologBBs); + MF.insert(BB->getIterator(), KernelBB); + + // Rearrange the instructions to generate the new, pipelined loop, + // and update register names as needed. + for (MachineInstr *CI : Schedule.getInstructions()) { + if (CI->isPHI()) + continue; + unsigned StageNum = Schedule.getStage(CI); + MachineInstr *NewMI = cloneInstr(CI, MaxStageCount, StageNum); + updateInstruction(NewMI, false, MaxStageCount, StageNum, VRMap); + KernelBB->push_back(NewMI); + InstrMap[NewMI] = CI; + } + + // Copy any terminator instructions to the new kernel, and update + // names as needed. + for (MachineBasicBlock::iterator I = BB->getFirstTerminator(), + E = BB->instr_end(); + I != E; ++I) { + MachineInstr *NewMI = MF.CloneMachineInstr(&*I); + updateInstruction(NewMI, false, MaxStageCount, 0, VRMap); + KernelBB->push_back(NewMI); + InstrMap[NewMI] = &*I; + } + + NewKernel = KernelBB; + KernelBB->transferSuccessors(BB); + KernelBB->replaceSuccessor(BB, KernelBB); + + generateExistingPhis(KernelBB, PrologBBs.back(), KernelBB, KernelBB, VRMap, + InstrMap, MaxStageCount, MaxStageCount, false); + generatePhis(KernelBB, PrologBBs.back(), KernelBB, KernelBB, VRMap, InstrMap, + MaxStageCount, MaxStageCount, false); + + LLVM_DEBUG(dbgs() << "New block\n"; KernelBB->dump();); + + SmallVector EpilogBBs; + // Generate the epilog instructions to complete the pipeline. + generateEpilog(MaxStageCount, KernelBB, VRMap, EpilogBBs, PrologBBs); + + // We need this step because the register allocation doesn't handle some + // situations well, so we insert copies to help out. + splitLifetimes(KernelBB, EpilogBBs); + + // Remove dead instructions due to loop induction variables. + removeDeadInstructions(KernelBB, EpilogBBs); + + // Add branches between prolog and epilog blocks. 
+ addBranches(*Preheader, PrologBBs, KernelBB, EpilogBBs, VRMap); + + delete[] VRMap; +} + +void ModuloScheduleExpander::cleanup() { + // Remove the original loop since it's no longer referenced. + for (auto &I : *BB) + LIS.RemoveMachineInstrFromMaps(I); + BB->clear(); + BB->eraseFromParent(); +} + +/// Generate the pipeline prolog code. +void ModuloScheduleExpander::generateProlog(unsigned LastStage, + MachineBasicBlock *KernelBB, + ValueMapTy *VRMap, + MBBVectorTy &PrologBBs) { + MachineBasicBlock *PredBB = Preheader; + InstrMapTy InstrMap; + + // Generate a basic block for each stage, not including the last stage, + // which will be generated in the kernel. Each basic block may contain + // instructions from multiple stages/iterations. + for (unsigned i = 0; i < LastStage; ++i) { + // Create and insert the prolog basic block prior to the original loop + // basic block. The original loop is removed later. + MachineBasicBlock *NewBB = MF.CreateMachineBasicBlock(BB->getBasicBlock()); + PrologBBs.push_back(NewBB); + MF.insert(BB->getIterator(), NewBB); + NewBB->transferSuccessors(PredBB); + PredBB->addSuccessor(NewBB); + PredBB = NewBB; + + // Generate instructions for each appropriate stage. Process instructions + // in original program order. + for (int StageNum = i; StageNum >= 0; --StageNum) { + for (MachineBasicBlock::iterator BBI = BB->instr_begin(), + BBE = BB->getFirstTerminator(); + BBI != BBE; ++BBI) { + if (Schedule.getStage(&*BBI) == StageNum) { + if (BBI->isPHI()) + continue; + MachineInstr *NewMI = + cloneAndChangeInstr(&*BBI, i, (unsigned)StageNum); + updateInstruction(NewMI, false, i, (unsigned)StageNum, VRMap); + NewBB->push_back(NewMI); + InstrMap[NewMI] = &*BBI; + } + } + } + rewritePhiValues(NewBB, i, VRMap, InstrMap); + LLVM_DEBUG({ + dbgs() << "prolog:\n"; + NewBB->dump(); + }); + } + + PredBB->replaceSuccessor(BB, KernelBB); + + // Check if we need to remove the branch from the preheader to the original + // loop, and replace it with a branch to the new loop. + unsigned numBranches = TII->removeBranch(*Preheader); + if (numBranches) { + SmallVector Cond; + TII->insertBranch(*Preheader, PrologBBs[0], nullptr, Cond, DebugLoc()); + } +} + +/// Generate the pipeline epilog code. The epilog code finishes the iterations +/// that were started in either the prolog or the kernel. We create a basic +/// block for each stage that needs to complete. +void ModuloScheduleExpander::generateEpilog(unsigned LastStage, + MachineBasicBlock *KernelBB, + ValueMapTy *VRMap, + MBBVectorTy &EpilogBBs, + MBBVectorTy &PrologBBs) { + // We need to change the branch from the kernel to the first epilog block, so + // this call to analyze branch uses the kernel rather than the original BB. + MachineBasicBlock *TBB = nullptr, *FBB = nullptr; + SmallVector Cond; + bool checkBranch = TII->analyzeBranch(*KernelBB, TBB, FBB, Cond); + assert(!checkBranch && "generateEpilog must be able to analyze the branch"); + if (checkBranch) + return; + + MachineBasicBlock::succ_iterator LoopExitI = KernelBB->succ_begin(); + if (*LoopExitI == KernelBB) + ++LoopExitI; + assert(LoopExitI != KernelBB->succ_end() && "Expecting a successor"); + MachineBasicBlock *LoopExitBB = *LoopExitI; + + MachineBasicBlock *PredBB = KernelBB; + MachineBasicBlock *EpilogStart = LoopExitBB; + InstrMapTy InstrMap; + + // Generate a basic block for each stage, not including the last stage, + // which was generated for the kernel. Each basic block may contain + // instructions from multiple stages/iterations. 
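generateProlog and generateEpilog above build the surrounding structure of the expanded pipeline: with S stages there are S-1 prolog blocks that fill the pipeline, one kernel block that runs a full set of overlapped stages, and S-1 epilog blocks that drain it. The sketch below simulates that layout for an invented 3-stage schedule over 5 iterations; the time model (stage s of iteration i runs at step i + s) is a simplification chosen for illustration, not a statement about the expander's exact schedule.

// Which (iteration, stage) pairs land in each prolog/kernel/epilog block.
#include <iostream>

int main() {
  const int Stages = 3;     // stages per iteration (S)
  const int Iterations = 5; // trip count of the original loop (N)

  // Time step T runs stage S of iteration T - S for every valid S.
  for (int T = 0; T < Iterations + Stages - 1; ++T) {
    if (T < Stages - 1)
      std::cout << "prolog " << T << ":";
    else if (T < Iterations)
      std::cout << "kernel  :";
    else
      std::cout << "epilog " << (T - Iterations) << ":";
    for (int S = 0; S < Stages; ++S) {
      int Iter = T - S;
      if (Iter >= 0 && Iter < Iterations)
        std::cout << "  (iter " << Iter << ", stage " << S << ")";
    }
    std::cout << '\n';
  }
}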
+ int EpilogStage = LastStage + 1; + for (unsigned i = LastStage; i >= 1; --i, ++EpilogStage) { + MachineBasicBlock *NewBB = MF.CreateMachineBasicBlock(); + EpilogBBs.push_back(NewBB); + MF.insert(BB->getIterator(), NewBB); + + PredBB->replaceSuccessor(LoopExitBB, NewBB); + NewBB->addSuccessor(LoopExitBB); + + if (EpilogStart == LoopExitBB) + EpilogStart = NewBB; + + // Add instructions to the epilog depending on the current block. + // Process instructions in original program order. + for (unsigned StageNum = i; StageNum <= LastStage; ++StageNum) { + for (auto &BBI : *BB) { + if (BBI.isPHI()) + continue; + MachineInstr *In = &BBI; + if ((unsigned)Schedule.getStage(In) == StageNum) { + // Instructions with memoperands in the epilog are updated with + // conservative values. + MachineInstr *NewMI = cloneInstr(In, UINT_MAX, 0); + updateInstruction(NewMI, i == 1, EpilogStage, 0, VRMap); + NewBB->push_back(NewMI); + InstrMap[NewMI] = In; + } + } + } + generateExistingPhis(NewBB, PrologBBs[i - 1], PredBB, KernelBB, VRMap, + InstrMap, LastStage, EpilogStage, i == 1); + generatePhis(NewBB, PrologBBs[i - 1], PredBB, KernelBB, VRMap, InstrMap, + LastStage, EpilogStage, i == 1); + PredBB = NewBB; + + LLVM_DEBUG({ + dbgs() << "epilog:\n"; + NewBB->dump(); + }); + } + + // Fix any Phi nodes in the loop exit block. + LoopExitBB->replacePhiUsesWith(BB, PredBB); + + // Create a branch to the new epilog from the kernel. + // Remove the original branch and add a new branch to the epilog. + TII->removeBranch(*KernelBB); + TII->insertBranch(*KernelBB, KernelBB, EpilogStart, Cond, DebugLoc()); + // Add a branch to the loop exit. + if (EpilogBBs.size() > 0) { + MachineBasicBlock *LastEpilogBB = EpilogBBs.back(); + SmallVector Cond1; + TII->insertBranch(*LastEpilogBB, LoopExitBB, nullptr, Cond1, DebugLoc()); + } +} + +/// Replace all uses of FromReg that appear outside the specified +/// basic block with ToReg. +static void replaceRegUsesAfterLoop(unsigned FromReg, unsigned ToReg, + MachineBasicBlock *MBB, + MachineRegisterInfo &MRI, + LiveIntervals &LIS) { + for (MachineRegisterInfo::use_iterator I = MRI.use_begin(FromReg), + E = MRI.use_end(); + I != E;) { + MachineOperand &O = *I; + ++I; + if (O.getParent()->getParent() != MBB) + O.setReg(ToReg); + } + if (!LIS.hasInterval(ToReg)) + LIS.createEmptyInterval(ToReg); +} + +/// Return true if the register has a use that occurs outside the +/// specified loop. +static bool hasUseAfterLoop(unsigned Reg, MachineBasicBlock *BB, + MachineRegisterInfo &MRI) { + for (MachineRegisterInfo::use_iterator I = MRI.use_begin(Reg), + E = MRI.use_end(); + I != E; ++I) + if (I->getParent()->getParent() != BB) + return true; + return false; +} + +/// Generate Phis for the specific block in the generated pipelined code. +/// This function looks at the Phis from the original code to guide the +/// creation of new Phis. +void ModuloScheduleExpander::generateExistingPhis( + MachineBasicBlock *NewBB, MachineBasicBlock *BB1, MachineBasicBlock *BB2, + MachineBasicBlock *KernelBB, ValueMapTy *VRMap, InstrMapTy &InstrMap, + unsigned LastStageNum, unsigned CurStageNum, bool IsLast) { + // Compute the stage number for the initial value of the Phi, which + // comes from the prolog. The prolog to use depends on to which kernel/ + // epilog that we're adding the Phi. 
+ unsigned PrologStage = 0; + unsigned PrevStage = 0; + bool InKernel = (LastStageNum == CurStageNum); + if (InKernel) { + PrologStage = LastStageNum - 1; + PrevStage = CurStageNum; + } else { + PrologStage = LastStageNum - (CurStageNum - LastStageNum); + PrevStage = LastStageNum + (CurStageNum - LastStageNum) - 1; + } + + for (MachineBasicBlock::iterator BBI = BB->instr_begin(), + BBE = BB->getFirstNonPHI(); + BBI != BBE; ++BBI) { + Register Def = BBI->getOperand(0).getReg(); + + unsigned InitVal = 0; + unsigned LoopVal = 0; + getPhiRegs(*BBI, BB, InitVal, LoopVal); + + unsigned PhiOp1 = 0; + // The Phi value from the loop body typically is defined in the loop, but + // not always. So, we need to check if the value is defined in the loop. + unsigned PhiOp2 = LoopVal; + if (VRMap[LastStageNum].count(LoopVal)) + PhiOp2 = VRMap[LastStageNum][LoopVal]; + + int StageScheduled = Schedule.getStage(&*BBI); + int LoopValStage = Schedule.getStage(MRI.getVRegDef(LoopVal)); + unsigned NumStages = getStagesForReg(Def, CurStageNum); + if (NumStages == 0) { + // We don't need to generate a Phi anymore, but we need to rename any uses + // of the Phi value. + unsigned NewReg = VRMap[PrevStage][LoopVal]; + rewriteScheduledInstr(NewBB, InstrMap, CurStageNum, 0, &*BBI, Def, + InitVal, NewReg); + if (VRMap[CurStageNum].count(LoopVal)) + VRMap[CurStageNum][Def] = VRMap[CurStageNum][LoopVal]; + } + // Adjust the number of Phis needed depending on the number of prologs left, + // and the distance from where the Phi is first scheduled. The number of + // Phis cannot exceed the number of prolog stages. Each stage can + // potentially define two values. + unsigned MaxPhis = PrologStage + 2; + if (!InKernel && (int)PrologStage <= LoopValStage) + MaxPhis = std::max((int)MaxPhis - (int)LoopValStage, 1); + unsigned NumPhis = std::min(NumStages, MaxPhis); + + unsigned NewReg = 0; + unsigned AccessStage = (LoopValStage != -1) ? LoopValStage : StageScheduled; + // In the epilog, we may need to look back one stage to get the correct + // Phi name because the epilog and prolog blocks execute the same stage. + // The correct name is from the previous block only when the Phi has + // been completely scheduled prior to the epilog, and Phi value is not + // needed in multiple stages. + int StageDiff = 0; + if (!InKernel && StageScheduled >= LoopValStage && AccessStage == 0 && + NumPhis == 1) + StageDiff = 1; + // Adjust the computations below when the phi and the loop definition + // are scheduled in different stages. + if (InKernel && LoopValStage != -1 && StageScheduled > LoopValStage) + StageDiff = StageScheduled - LoopValStage; + for (unsigned np = 0; np < NumPhis; ++np) { + // If the Phi hasn't been scheduled, then use the initial Phi operand + // value. Otherwise, use the scheduled version of the instruction. This + // is a little complicated when a Phi references another Phi. + if (np > PrologStage || StageScheduled >= (int)LastStageNum) + PhiOp1 = InitVal; + // Check if the Phi has already been scheduled in a prolog stage. + else if (PrologStage >= AccessStage + StageDiff + np && + VRMap[PrologStage - StageDiff - np].count(LoopVal) != 0) + PhiOp1 = VRMap[PrologStage - StageDiff - np][LoopVal]; + // Check if the Phi has already been scheduled, but the loop instruction + // is either another Phi, or doesn't occur in the loop. + else if (PrologStage >= AccessStage + StageDiff + np) { + // If the Phi references another Phi, we need to examine the other + // Phi to get the correct value. 
+ PhiOp1 = LoopVal; + MachineInstr *InstOp1 = MRI.getVRegDef(PhiOp1); + int Indirects = 1; + while (InstOp1 && InstOp1->isPHI() && InstOp1->getParent() == BB) { + int PhiStage = Schedule.getStage(InstOp1); + if ((int)(PrologStage - StageDiff - np) < PhiStage + Indirects) + PhiOp1 = getInitPhiReg(*InstOp1, BB); + else + PhiOp1 = getLoopPhiReg(*InstOp1, BB); + InstOp1 = MRI.getVRegDef(PhiOp1); + int PhiOpStage = Schedule.getStage(InstOp1); + int StageAdj = (PhiOpStage != -1 ? PhiStage - PhiOpStage : 0); + if (PhiOpStage != -1 && PrologStage - StageAdj >= Indirects + np && + VRMap[PrologStage - StageAdj - Indirects - np].count(PhiOp1)) { + PhiOp1 = VRMap[PrologStage - StageAdj - Indirects - np][PhiOp1]; + break; + } + ++Indirects; + } + } else + PhiOp1 = InitVal; + // If this references a generated Phi in the kernel, get the Phi operand + // from the incoming block. + if (MachineInstr *InstOp1 = MRI.getVRegDef(PhiOp1)) + if (InstOp1->isPHI() && InstOp1->getParent() == KernelBB) + PhiOp1 = getInitPhiReg(*InstOp1, KernelBB); + + MachineInstr *PhiInst = MRI.getVRegDef(LoopVal); + bool LoopDefIsPhi = PhiInst && PhiInst->isPHI(); + // In the epilog, a map lookup is needed to get the value from the kernel, + // or previous epilog block. How is does this depends on if the + // instruction is scheduled in the previous block. + if (!InKernel) { + int StageDiffAdj = 0; + if (LoopValStage != -1 && StageScheduled > LoopValStage) + StageDiffAdj = StageScheduled - LoopValStage; + // Use the loop value defined in the kernel, unless the kernel + // contains the last definition of the Phi. + if (np == 0 && PrevStage == LastStageNum && + (StageScheduled != 0 || LoopValStage != 0) && + VRMap[PrevStage - StageDiffAdj].count(LoopVal)) + PhiOp2 = VRMap[PrevStage - StageDiffAdj][LoopVal]; + // Use the value defined by the Phi. We add one because we switch + // from looking at the loop value to the Phi definition. + else if (np > 0 && PrevStage == LastStageNum && + VRMap[PrevStage - np + 1].count(Def)) + PhiOp2 = VRMap[PrevStage - np + 1][Def]; + // Use the loop value defined in the kernel. + else if (static_cast(LoopValStage) > PrologStage + 1 && + VRMap[PrevStage - StageDiffAdj - np].count(LoopVal)) + PhiOp2 = VRMap[PrevStage - StageDiffAdj - np][LoopVal]; + // Use the value defined by the Phi, unless we're generating the first + // epilog and the Phi refers to a Phi in a different stage. + else if (VRMap[PrevStage - np].count(Def) && + (!LoopDefIsPhi || (PrevStage != LastStageNum) || + (LoopValStage == StageScheduled))) + PhiOp2 = VRMap[PrevStage - np][Def]; + } + + // Check if we can reuse an existing Phi. This occurs when a Phi + // references another Phi, and the other Phi is scheduled in an + // earlier stage. We can try to reuse an existing Phi up until the last + // stage of the current Phi. + if (LoopDefIsPhi) { + if (static_cast(PrologStage - np) >= StageScheduled) { + int LVNumStages = getStagesForPhi(LoopVal); + int StageDiff = (StageScheduled - LoopValStage); + LVNumStages -= StageDiff; + // Make sure the loop value Phi has been processed already. + if (LVNumStages > (int)np && VRMap[CurStageNum].count(LoopVal)) { + NewReg = PhiOp2; + unsigned ReuseStage = CurStageNum; + if (isLoopCarried(*PhiInst)) + ReuseStage -= LVNumStages; + // Check if the Phi to reuse has been generated yet. If not, then + // there is nothing to reuse. 
+ if (VRMap[ReuseStage - np].count(LoopVal)) { + NewReg = VRMap[ReuseStage - np][LoopVal]; + + rewriteScheduledInstr(NewBB, InstrMap, CurStageNum, np, &*BBI, + Def, NewReg); + // Update the map with the new Phi name. + VRMap[CurStageNum - np][Def] = NewReg; + PhiOp2 = NewReg; + if (VRMap[LastStageNum - np - 1].count(LoopVal)) + PhiOp2 = VRMap[LastStageNum - np - 1][LoopVal]; + + if (IsLast && np == NumPhis - 1) + replaceRegUsesAfterLoop(Def, NewReg, BB, MRI, LIS); + continue; + } + } + } + if (InKernel && StageDiff > 0 && + VRMap[CurStageNum - StageDiff - np].count(LoopVal)) + PhiOp2 = VRMap[CurStageNum - StageDiff - np][LoopVal]; + } + + const TargetRegisterClass *RC = MRI.getRegClass(Def); + NewReg = MRI.createVirtualRegister(RC); + + MachineInstrBuilder NewPhi = + BuildMI(*NewBB, NewBB->getFirstNonPHI(), DebugLoc(), + TII->get(TargetOpcode::PHI), NewReg); + NewPhi.addReg(PhiOp1).addMBB(BB1); + NewPhi.addReg(PhiOp2).addMBB(BB2); + if (np == 0) + InstrMap[NewPhi] = &*BBI; + + // We define the Phis after creating the new pipelined code, so + // we need to rename the Phi values in scheduled instructions. + + unsigned PrevReg = 0; + if (InKernel && VRMap[PrevStage - np].count(LoopVal)) + PrevReg = VRMap[PrevStage - np][LoopVal]; + rewriteScheduledInstr(NewBB, InstrMap, CurStageNum, np, &*BBI, Def, + NewReg, PrevReg); + // If the Phi has been scheduled, use the new name for rewriting. + if (VRMap[CurStageNum - np].count(Def)) { + unsigned R = VRMap[CurStageNum - np][Def]; + rewriteScheduledInstr(NewBB, InstrMap, CurStageNum, np, &*BBI, R, + NewReg); + } + + // Check if we need to rename any uses that occurs after the loop. The + // register to replace depends on whether the Phi is scheduled in the + // epilog. + if (IsLast && np == NumPhis - 1) + replaceRegUsesAfterLoop(Def, NewReg, BB, MRI, LIS); + + // In the kernel, a dependent Phi uses the value from this Phi. + if (InKernel) + PhiOp2 = NewReg; + + // Update the map with the new Phi name. + VRMap[CurStageNum - np][Def] = NewReg; + } + + while (NumPhis++ < NumStages) { + rewriteScheduledInstr(NewBB, InstrMap, CurStageNum, NumPhis, &*BBI, Def, + NewReg, 0); + } + + // Check if we need to rename a Phi that has been eliminated due to + // scheduling. + if (NumStages == 0 && IsLast && VRMap[CurStageNum].count(LoopVal)) + replaceRegUsesAfterLoop(Def, VRMap[CurStageNum][LoopVal], BB, MRI, LIS); + } +} + +/// Generate Phis for the specified block in the generated pipelined code. +/// These are new Phis needed because the definition is scheduled after the +/// use in the pipelined sequence. +void ModuloScheduleExpander::generatePhis( + MachineBasicBlock *NewBB, MachineBasicBlock *BB1, MachineBasicBlock *BB2, + MachineBasicBlock *KernelBB, ValueMapTy *VRMap, InstrMapTy &InstrMap, + unsigned LastStageNum, unsigned CurStageNum, bool IsLast) { + // Compute the stage number that contains the initial Phi value, and + // the Phi from the previous stage. 
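The VRMap array threaded through generateExistingPhis and generatePhis records, per emitted stage, the new virtual register that stands for each register defined in the original loop body; the rewriting code then looks values up as VRMap[Stage][OrigReg]. A standalone sketch of that bookkeeping with invented register numbers, purely for illustration:

// One map per emitted stage: original vreg -> the copy created for that stage.
#include <iostream>
#include <map>
#include <vector>

using ValueMap = std::map<unsigned, unsigned>;

int main() {
  const unsigned NumStages = 3;
  std::vector<ValueMap> VRMap(NumStages);

  unsigned NextVReg = 100;    // pretend virtual-register allocator
  const unsigned OrigDef = 7; // a def in the original loop body

  // Cloning the defining instruction once per stage creates a new name each
  // time; the map lets later uses in that stage find the right one.
  for (unsigned Stage = 0; Stage < NumStages; ++Stage)
    VRMap[Stage][OrigDef] = NextVReg++;

  // A use scheduled one stage after its def reads the previous stage's copy.
  unsigned UseStage = 2;
  unsigned Rewritten = VRMap[UseStage - 1][OrigDef];
  std::cout << "use of %" << OrigDef << " in stage " << UseStage
            << " is rewritten to %" << Rewritten << '\n'; // %101
}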
+ unsigned PrologStage = 0; + unsigned PrevStage = 0; + unsigned StageDiff = CurStageNum - LastStageNum; + bool InKernel = (StageDiff == 0); + if (InKernel) { + PrologStage = LastStageNum - 1; + PrevStage = CurStageNum; + } else { + PrologStage = LastStageNum - StageDiff; + PrevStage = LastStageNum + StageDiff - 1; + } + + for (MachineBasicBlock::iterator BBI = BB->getFirstNonPHI(), + BBE = BB->instr_end(); + BBI != BBE; ++BBI) { + for (unsigned i = 0, e = BBI->getNumOperands(); i != e; ++i) { + MachineOperand &MO = BBI->getOperand(i); + if (!MO.isReg() || !MO.isDef() || + !Register::isVirtualRegister(MO.getReg())) + continue; + + int StageScheduled = Schedule.getStage(&*BBI); + assert(StageScheduled != -1 && "Expecting scheduled instruction."); + Register Def = MO.getReg(); + unsigned NumPhis = getStagesForReg(Def, CurStageNum); + // An instruction scheduled in stage 0 and is used after the loop + // requires a phi in the epilog for the last definition from either + // the kernel or prolog. + if (!InKernel && NumPhis == 0 && StageScheduled == 0 && + hasUseAfterLoop(Def, BB, MRI)) + NumPhis = 1; + if (!InKernel && (unsigned)StageScheduled > PrologStage) + continue; + + unsigned PhiOp2 = VRMap[PrevStage][Def]; + if (MachineInstr *InstOp2 = MRI.getVRegDef(PhiOp2)) + if (InstOp2->isPHI() && InstOp2->getParent() == NewBB) + PhiOp2 = getLoopPhiReg(*InstOp2, BB2); + // The number of Phis can't exceed the number of prolog stages. The + // prolog stage number is zero based. + if (NumPhis > PrologStage + 1 - StageScheduled) + NumPhis = PrologStage + 1 - StageScheduled; + for (unsigned np = 0; np < NumPhis; ++np) { + unsigned PhiOp1 = VRMap[PrologStage][Def]; + if (np <= PrologStage) + PhiOp1 = VRMap[PrologStage - np][Def]; + if (MachineInstr *InstOp1 = MRI.getVRegDef(PhiOp1)) { + if (InstOp1->isPHI() && InstOp1->getParent() == KernelBB) + PhiOp1 = getInitPhiReg(*InstOp1, KernelBB); + if (InstOp1->isPHI() && InstOp1->getParent() == NewBB) + PhiOp1 = getInitPhiReg(*InstOp1, NewBB); + } + if (!InKernel) + PhiOp2 = VRMap[PrevStage - np][Def]; + + const TargetRegisterClass *RC = MRI.getRegClass(Def); + Register NewReg = MRI.createVirtualRegister(RC); + + MachineInstrBuilder NewPhi = + BuildMI(*NewBB, NewBB->getFirstNonPHI(), DebugLoc(), + TII->get(TargetOpcode::PHI), NewReg); + NewPhi.addReg(PhiOp1).addMBB(BB1); + NewPhi.addReg(PhiOp2).addMBB(BB2); + if (np == 0) + InstrMap[NewPhi] = &*BBI; + + // Rewrite uses and update the map. The actions depend upon whether + // we generating code for the kernel or epilog blocks. + if (InKernel) { + rewriteScheduledInstr(NewBB, InstrMap, CurStageNum, np, &*BBI, PhiOp1, + NewReg); + rewriteScheduledInstr(NewBB, InstrMap, CurStageNum, np, &*BBI, PhiOp2, + NewReg); + + PhiOp2 = NewReg; + VRMap[PrevStage - np - 1][Def] = NewReg; + } else { + VRMap[CurStageNum - np][Def] = NewReg; + if (np == NumPhis - 1) + rewriteScheduledInstr(NewBB, InstrMap, CurStageNum, np, &*BBI, Def, + NewReg); + } + if (IsLast && np == NumPhis - 1) + replaceRegUsesAfterLoop(Def, NewReg, BB, MRI, LIS); + } + } + } +} + +/// Remove instructions that generate values with no uses. +/// Typically, these are induction variable operations that generate values +/// used in the loop itself. A dead instruction has a definition with +/// no uses, or uses that occur in the original loop only. +void ModuloScheduleExpander::removeDeadInstructions(MachineBasicBlock *KernelBB, + MBBVectorTy &EpilogBBs) { + // For each epilog block, check that the value defined by each instruction + // is used. If not, delete it. 
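// Illustrative note (not from the original sources; opcode and register names
// are hypothetical): the typical dead value removed below is an induction
// update that was cloned into an epilog, e.g.
//   %iv.next.epilog = ADD %iv, 8
// whose only readers sit in the original loop block BB; since BB is discarded
// once expansion finishes, such a clone has no real uses.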
+ for (MBBVectorTy::reverse_iterator MBB = EpilogBBs.rbegin(), + MBE = EpilogBBs.rend(); + MBB != MBE; ++MBB) + for (MachineBasicBlock::reverse_instr_iterator MI = (*MBB)->instr_rbegin(), + ME = (*MBB)->instr_rend(); + MI != ME;) { + // From DeadMachineInstructionElem. Don't delete inline assembly. + if (MI->isInlineAsm()) { + ++MI; + continue; + } + bool SawStore = false; + // Check if it's safe to remove the instruction due to side effects. + // We can, and want to, remove Phis here. + if (!MI->isSafeToMove(nullptr, SawStore) && !MI->isPHI()) { + ++MI; + continue; + } + bool used = true; + for (MachineInstr::mop_iterator MOI = MI->operands_begin(), + MOE = MI->operands_end(); + MOI != MOE; ++MOI) { + if (!MOI->isReg() || !MOI->isDef()) + continue; + Register reg = MOI->getReg(); + // Assume physical registers are used, unless they are marked dead. + if (Register::isPhysicalRegister(reg)) { + used = !MOI->isDead(); + if (used) + break; + continue; + } + unsigned realUses = 0; + for (MachineRegisterInfo::use_iterator UI = MRI.use_begin(reg), + EI = MRI.use_end(); + UI != EI; ++UI) { + // Check if there are any uses that occur only in the original + // loop. If so, that's not a real use. + if (UI->getParent()->getParent() != BB) { + realUses++; + used = true; + break; + } + } + if (realUses > 0) + break; + used = false; + } + if (!used) { + LIS.RemoveMachineInstrFromMaps(*MI); + MI++->eraseFromParent(); + continue; + } + ++MI; + } + // In the kernel block, check if we can remove a Phi that generates a value + // used in an instruction removed in the epilog block. + for (MachineBasicBlock::iterator BBI = KernelBB->instr_begin(), + BBE = KernelBB->getFirstNonPHI(); + BBI != BBE;) { + MachineInstr *MI = &*BBI; + ++BBI; + Register reg = MI->getOperand(0).getReg(); + if (MRI.use_begin(reg) == MRI.use_end()) { + LIS.RemoveMachineInstrFromMaps(*MI); + MI->eraseFromParent(); + } + } +} + +/// For loop carried definitions, we split the lifetime of a virtual register +/// that has uses past the definition in the next iteration. A copy with a new +/// virtual register is inserted before the definition, which helps with +/// generating a better register assignment. +/// +/// v1 = phi(a, v2) v1 = phi(a, v2) +/// v2 = phi(b, v3) v2 = phi(b, v3) +/// v3 = .. v4 = copy v1 +/// .. = V1 v3 = .. +/// .. = v4 +void ModuloScheduleExpander::splitLifetimes(MachineBasicBlock *KernelBB, + MBBVectorTy &EpilogBBs) { + const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); + for (auto &PHI : KernelBB->phis()) { + Register Def = PHI.getOperand(0).getReg(); + // Check for any Phi definition that used as an operand of another Phi + // in the same block. + for (MachineRegisterInfo::use_instr_iterator I = MRI.use_instr_begin(Def), + E = MRI.use_instr_end(); + I != E; ++I) { + if (I->isPHI() && I->getParent() == KernelBB) { + // Get the loop carried definition. + unsigned LCDef = getLoopPhiReg(PHI, KernelBB); + if (!LCDef) + continue; + MachineInstr *MI = MRI.getVRegDef(LCDef); + if (!MI || MI->getParent() != KernelBB || MI->isPHI()) + continue; + // Search through the rest of the block looking for uses of the Phi + // definition. If one occurs, then split the lifetime. + unsigned SplitReg = 0; + for (auto &BBJ : make_range(MachineBasicBlock::instr_iterator(MI), + KernelBB->instr_end())) + if (BBJ.readsRegister(Def)) { + // We split the lifetime when we find the first use. 
+ if (SplitReg == 0) { + SplitReg = MRI.createVirtualRegister(MRI.getRegClass(Def)); + BuildMI(*KernelBB, MI, MI->getDebugLoc(), + TII->get(TargetOpcode::COPY), SplitReg) + .addReg(Def); + } + BBJ.substituteRegister(Def, SplitReg, 0, *TRI); + } + if (!SplitReg) + continue; + // Search through each of the epilog blocks for any uses to be renamed. + for (auto &Epilog : EpilogBBs) + for (auto &I : *Epilog) + if (I.readsRegister(Def)) + I.substituteRegister(Def, SplitReg, 0, *TRI); + break; + } + } + } +} + +/// Remove the incoming block from the Phis in a basic block. +static void removePhis(MachineBasicBlock *BB, MachineBasicBlock *Incoming) { + for (MachineInstr &MI : *BB) { + if (!MI.isPHI()) + break; + for (unsigned i = 1, e = MI.getNumOperands(); i != e; i += 2) + if (MI.getOperand(i + 1).getMBB() == Incoming) { + MI.RemoveOperand(i + 1); + MI.RemoveOperand(i); + break; + } + } +} + +/// Create branches from each prolog basic block to the appropriate epilog +/// block. These edges are needed if the loop ends before reaching the +/// kernel. +void ModuloScheduleExpander::addBranches(MachineBasicBlock &PreheaderBB, + MBBVectorTy &PrologBBs, + MachineBasicBlock *KernelBB, + MBBVectorTy &EpilogBBs, + ValueMapTy *VRMap) { + assert(PrologBBs.size() == EpilogBBs.size() && "Prolog/Epilog mismatch"); + MachineBasicBlock *LastPro = KernelBB; + MachineBasicBlock *LastEpi = KernelBB; + + // Start from the blocks connected to the kernel and work "out" + // to the first prolog and the last epilog blocks. + SmallVector PrevInsts; + unsigned MaxIter = PrologBBs.size() - 1; + for (unsigned i = 0, j = MaxIter; i <= MaxIter; ++i, --j) { + // Add branches to the prolog that go to the corresponding + // epilog, and the fall-thru prolog/kernel block. + MachineBasicBlock *Prolog = PrologBBs[j]; + MachineBasicBlock *Epilog = EpilogBBs[i]; + + SmallVector Cond; + Optional StaticallyGreater = + LoopInfo->createTripCountGreaterCondition(j + 1, *Prolog, Cond); + unsigned numAdded = 0; + if (!StaticallyGreater.hasValue()) { + Prolog->addSuccessor(Epilog); + numAdded = TII->insertBranch(*Prolog, Epilog, LastPro, Cond, DebugLoc()); + } else if (*StaticallyGreater == false) { + Prolog->addSuccessor(Epilog); + Prolog->removeSuccessor(LastPro); + LastEpi->removeSuccessor(Epilog); + numAdded = TII->insertBranch(*Prolog, Epilog, nullptr, Cond, DebugLoc()); + removePhis(Epilog, LastEpi); + // Remove the blocks that are no longer referenced. + if (LastPro != LastEpi) { + LastEpi->clear(); + LastEpi->eraseFromParent(); + } + if (LastPro == KernelBB) { + LoopInfo->disposed(); + NewKernel = nullptr; + } + LastPro->clear(); + LastPro->eraseFromParent(); + } else { + numAdded = TII->insertBranch(*Prolog, LastPro, nullptr, Cond, DebugLoc()); + removePhis(Epilog, Prolog); + } + LastPro = Prolog; + LastEpi = Epilog; + for (MachineBasicBlock::reverse_instr_iterator I = Prolog->instr_rbegin(), + E = Prolog->instr_rend(); + I != E && numAdded > 0; ++I, --numAdded) + updateInstruction(&*I, false, j, 0, VRMap); + } + + if (NewKernel) { + LoopInfo->setPreheader(PrologBBs[MaxIter]); + LoopInfo->adjustTripCount(-(MaxIter + 1)); + } +} + +/// Return true if we can compute the amount the instruction changes +/// during each iteration. Set Delta to the amount of the change. 
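// Worked example (hedged; the opcode shown is hypothetical): if the memory
// access below uses %base and the loop advances it once per iteration with
//   %base.next = ADD %base, 8
// then following the phi for %base to that update yields Delta = 8, i.e. the
// accessed address moves by 8 bytes each iteration.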
+bool ModuloScheduleExpander::computeDelta(MachineInstr &MI, unsigned &Delta) { + const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); + const MachineOperand *BaseOp; + int64_t Offset; + if (!TII->getMemOperandWithOffset(MI, BaseOp, Offset, TRI)) + return false; + + if (!BaseOp->isReg()) + return false; + + Register BaseReg = BaseOp->getReg(); + + MachineRegisterInfo &MRI = MF.getRegInfo(); + // Check if there is a Phi. If so, get the definition in the loop. + MachineInstr *BaseDef = MRI.getVRegDef(BaseReg); + if (BaseDef && BaseDef->isPHI()) { + BaseReg = getLoopPhiReg(*BaseDef, MI.getParent()); + BaseDef = MRI.getVRegDef(BaseReg); + } + if (!BaseDef) + return false; + + int D = 0; + if (!TII->getIncrementValue(*BaseDef, D) && D >= 0) + return false; + + Delta = D; + return true; +} + +/// Update the memory operand with a new offset when the pipeliner +/// generates a new copy of the instruction that refers to a +/// different memory location. +void ModuloScheduleExpander::updateMemOperands(MachineInstr &NewMI, + MachineInstr &OldMI, + unsigned Num) { + if (Num == 0) + return; + // If the instruction has memory operands, then adjust the offset + // when the instruction appears in different stages. + if (NewMI.memoperands_empty()) + return; + SmallVector NewMMOs; + for (MachineMemOperand *MMO : NewMI.memoperands()) { + // TODO: Figure out whether isAtomic is really necessary (see D57601). + if (MMO->isVolatile() || MMO->isAtomic() || + (MMO->isInvariant() && MMO->isDereferenceable()) || + (!MMO->getValue())) { + NewMMOs.push_back(MMO); + continue; + } + unsigned Delta; + if (Num != UINT_MAX && computeDelta(OldMI, Delta)) { + int64_t AdjOffset = Delta * Num; + NewMMOs.push_back( + MF.getMachineMemOperand(MMO, AdjOffset, MMO->getSize())); + } else { + NewMMOs.push_back( + MF.getMachineMemOperand(MMO, 0, MemoryLocation::UnknownSize)); + } + } + NewMI.setMemRefs(MF, NewMMOs); +} + +/// Clone the instruction for the new pipelined loop and update the +/// memory operands, if needed. +MachineInstr *ModuloScheduleExpander::cloneInstr(MachineInstr *OldMI, + unsigned CurStageNum, + unsigned InstStageNum) { + MachineInstr *NewMI = MF.CloneMachineInstr(OldMI); + // Check for tied operands in inline asm instructions. This should be handled + // elsewhere, but I'm not sure of the best solution. + if (OldMI->isInlineAsm()) + for (unsigned i = 0, e = OldMI->getNumOperands(); i != e; ++i) { + const auto &MO = OldMI->getOperand(i); + if (MO.isReg() && MO.isUse()) + break; + unsigned UseIdx; + if (OldMI->isRegTiedToUseOperand(i, &UseIdx)) + NewMI->tieOperands(i, UseIdx); + } + updateMemOperands(*NewMI, *OldMI, CurStageNum - InstStageNum); + return NewMI; +} + +/// Clone the instruction for the new pipelined loop. If needed, this +/// function updates the instruction using the values saved in the +/// InstrChanges structure. 
+MachineInstr *ModuloScheduleExpander::cloneAndChangeInstr(
+    MachineInstr *OldMI, unsigned CurStageNum, unsigned InstStageNum) {
+  MachineInstr *NewMI = MF.CloneMachineInstr(OldMI);
+  auto It = InstrChanges.find(OldMI);
+  if (It != InstrChanges.end()) {
+    std::pair<unsigned, int64_t> RegAndOffset = It->second;
+    unsigned BasePos, OffsetPos;
+    if (!TII->getBaseAndOffsetPosition(*OldMI, BasePos, OffsetPos))
+      return nullptr;
+    int64_t NewOffset = OldMI->getOperand(OffsetPos).getImm();
+    MachineInstr *LoopDef = findDefInLoop(RegAndOffset.first);
+    if (Schedule.getStage(LoopDef) > (signed)InstStageNum)
+      NewOffset += RegAndOffset.second * (CurStageNum - InstStageNum);
+    NewMI->getOperand(OffsetPos).setImm(NewOffset);
+  }
+  updateMemOperands(*NewMI, *OldMI, CurStageNum - InstStageNum);
+  return NewMI;
+}
+
+/// Update the machine instruction with new virtual registers. This
+/// function may change the definitions and/or uses.
+void ModuloScheduleExpander::updateInstruction(MachineInstr *NewMI,
+                                               bool LastDef,
+                                               unsigned CurStageNum,
+                                               unsigned InstrStageNum,
+                                               ValueMapTy *VRMap) {
+  for (unsigned i = 0, e = NewMI->getNumOperands(); i != e; ++i) {
+    MachineOperand &MO = NewMI->getOperand(i);
+    if (!MO.isReg() || !Register::isVirtualRegister(MO.getReg()))
+      continue;
+    Register reg = MO.getReg();
+    if (MO.isDef()) {
+      // Create a new virtual register for the definition.
+      const TargetRegisterClass *RC = MRI.getRegClass(reg);
+      Register NewReg = MRI.createVirtualRegister(RC);
+      MO.setReg(NewReg);
+      VRMap[CurStageNum][reg] = NewReg;
+      if (LastDef)
+        replaceRegUsesAfterLoop(reg, NewReg, BB, MRI, LIS);
+    } else if (MO.isUse()) {
+      MachineInstr *Def = MRI.getVRegDef(reg);
+      // Compute the stage that contains the last definition for this
+      // instruction.
+      int DefStageNum = Schedule.getStage(Def);
+      unsigned StageNum = CurStageNum;
+      if (DefStageNum != -1 && (int)InstrStageNum > DefStageNum) {
+        // Compute the difference in stages between the definition and the use.
+        unsigned StageDiff = (InstrStageNum - DefStageNum);
+        // Make an adjustment to get the last definition.
+        StageNum -= StageDiff;
+      }
+      if (VRMap[StageNum].count(reg))
+        MO.setReg(VRMap[StageNum][reg]);
+    }
+  }
+}
+
+/// Return the instruction in the loop that defines the register.
+/// If the definition is a Phi, then follow the Phi operand to
+/// the instruction in the loop.
+MachineInstr *ModuloScheduleExpander::findDefInLoop(unsigned Reg) {
+  SmallPtrSet<MachineInstr *, 8> Visited;
+  MachineInstr *Def = MRI.getVRegDef(Reg);
+  while (Def->isPHI()) {
+    if (!Visited.insert(Def).second)
+      break;
+    for (unsigned i = 1, e = Def->getNumOperands(); i < e; i += 2)
+      if (Def->getOperand(i + 1).getMBB() == BB) {
+        Def = MRI.getVRegDef(Def->getOperand(i).getReg());
+        break;
+      }
+  }
+  return Def;
+}
+
+/// Return the new name for the value from the previous stage.
+unsigned ModuloScheduleExpander::getPrevMapVal(
+    unsigned StageNum, unsigned PhiStage, unsigned LoopVal, unsigned LoopStage,
+    ValueMapTy *VRMap, MachineBasicBlock *BB) {
+  unsigned PrevVal = 0;
+  if (StageNum > PhiStage) {
+    MachineInstr *LoopInst = MRI.getVRegDef(LoopVal);
+    if (PhiStage == LoopStage && VRMap[StageNum - 1].count(LoopVal))
+      // The name is defined in the previous stage.
+      PrevVal = VRMap[StageNum - 1][LoopVal];
+    else if (VRMap[StageNum].count(LoopVal))
+      // The previous name is defined in the current stage when the
+      // instruction order is swapped.
+      PrevVal = VRMap[StageNum][LoopVal];
+    else if (!LoopInst->isPHI() || LoopInst->getParent() != BB)
+      // The loop value hasn't yet been scheduled.
+ PrevVal = LoopVal; + else if (StageNum == PhiStage + 1) + // The loop value is another phi, which has not been scheduled. + PrevVal = getInitPhiReg(*LoopInst, BB); + else if (StageNum > PhiStage + 1 && LoopInst->getParent() == BB) + // The loop value is another phi, which has been scheduled. + PrevVal = + getPrevMapVal(StageNum - 1, PhiStage, getLoopPhiReg(*LoopInst, BB), + LoopStage, VRMap, BB); + } + return PrevVal; +} + +/// Rewrite the Phi values in the specified block to use the mappings +/// from the initial operand. Once the Phi is scheduled, we switch +/// to using the loop value instead of the Phi value, so those names +/// do not need to be rewritten. +void ModuloScheduleExpander::rewritePhiValues(MachineBasicBlock *NewBB, + unsigned StageNum, + ValueMapTy *VRMap, + InstrMapTy &InstrMap) { + for (auto &PHI : BB->phis()) { + unsigned InitVal = 0; + unsigned LoopVal = 0; + getPhiRegs(PHI, BB, InitVal, LoopVal); + Register PhiDef = PHI.getOperand(0).getReg(); + + unsigned PhiStage = (unsigned)Schedule.getStage(MRI.getVRegDef(PhiDef)); + unsigned LoopStage = (unsigned)Schedule.getStage(MRI.getVRegDef(LoopVal)); + unsigned NumPhis = getStagesForPhi(PhiDef); + if (NumPhis > StageNum) + NumPhis = StageNum; + for (unsigned np = 0; np <= NumPhis; ++np) { + unsigned NewVal = + getPrevMapVal(StageNum - np, PhiStage, LoopVal, LoopStage, VRMap, BB); + if (!NewVal) + NewVal = InitVal; + rewriteScheduledInstr(NewBB, InstrMap, StageNum - np, np, &PHI, PhiDef, + NewVal); + } + } +} + +/// Rewrite a previously scheduled instruction to use the register value +/// from the new instruction. Make sure the instruction occurs in the +/// basic block, and we don't change the uses in the new instruction. +void ModuloScheduleExpander::rewriteScheduledInstr( + MachineBasicBlock *BB, InstrMapTy &InstrMap, unsigned CurStageNum, + unsigned PhiNum, MachineInstr *Phi, unsigned OldReg, unsigned NewReg, + unsigned PrevReg) { + bool InProlog = (CurStageNum < (unsigned)Schedule.getNumStages() - 1); + int StagePhi = Schedule.getStage(Phi) + PhiNum; + // Rewrite uses that have been scheduled already to use the new + // Phi register. + for (MachineRegisterInfo::use_iterator UI = MRI.use_begin(OldReg), + EI = MRI.use_end(); + UI != EI;) { + MachineOperand &UseOp = *UI; + MachineInstr *UseMI = UseOp.getParent(); + ++UI; + if (UseMI->getParent() != BB) + continue; + if (UseMI->isPHI()) { + if (!Phi->isPHI() && UseMI->getOperand(0).getReg() == NewReg) + continue; + if (getLoopPhiReg(*UseMI, BB) != OldReg) + continue; + } + InstrMapTy::iterator OrigInstr = InstrMap.find(UseMI); + assert(OrigInstr != InstrMap.end() && "Instruction not scheduled."); + MachineInstr *OrigMI = OrigInstr->second; + int StageSched = Schedule.getStage(OrigMI); + int CycleSched = Schedule.getCycle(OrigMI); + unsigned ReplaceReg = 0; + // This is the stage for the scheduled instruction. + if (StagePhi == StageSched && Phi->isPHI()) { + int CyclePhi = Schedule.getCycle(Phi); + if (PrevReg && InProlog) + ReplaceReg = PrevReg; + else if (PrevReg && !isLoopCarried(*Phi) && + (CyclePhi <= CycleSched || OrigMI->isPHI())) + ReplaceReg = PrevReg; + else + ReplaceReg = NewReg; + } + // The scheduled instruction occurs before the scheduled Phi, and the + // Phi is not loop carried. 
+    if (!InProlog && StagePhi + 1 == StageSched && !isLoopCarried(*Phi))
+      ReplaceReg = NewReg;
+    if (StagePhi > StageSched && Phi->isPHI())
+      ReplaceReg = NewReg;
+    if (!InProlog && !Phi->isPHI() && StagePhi < StageSched)
+      ReplaceReg = NewReg;
+    if (ReplaceReg) {
+      MRI.constrainRegClass(ReplaceReg, MRI.getRegClass(OldReg));
+      UseOp.setReg(ReplaceReg);
+    }
+  }
+}
+
+bool ModuloScheduleExpander::isLoopCarried(MachineInstr &Phi) {
+  if (!Phi.isPHI())
+    return false;
+  unsigned DefCycle = Schedule.getCycle(&Phi);
+  int DefStage = Schedule.getStage(&Phi);
+
+  unsigned InitVal = 0;
+  unsigned LoopVal = 0;
+  getPhiRegs(Phi, Phi.getParent(), InitVal, LoopVal);
+  MachineInstr *Use = MRI.getVRegDef(LoopVal);
+  if (!Use || Use->isPHI())
+    return true;
+  unsigned LoopCycle = Schedule.getCycle(Use);
+  int LoopStage = Schedule.getStage(Use);
+  return (LoopCycle > DefCycle) || (LoopStage <= DefStage);
+}
+
+//===----------------------------------------------------------------------===//
+// PeelingModuloScheduleExpander implementation
+//===----------------------------------------------------------------------===//
+// This is a reimplementation of ModuloScheduleExpander that works by creating
+// a fully correct steady-state kernel and peeling off the prolog and epilogs.
+//===----------------------------------------------------------------------===//
+
+namespace {
+// Remove any dead phis in MBB. Dead phis either have only one block as input
+// (in which case they are the identity) or have no uses.
+void EliminateDeadPhis(MachineBasicBlock *MBB, MachineRegisterInfo &MRI,
+                       LiveIntervals *LIS) {
+  bool Changed = true;
+  while (Changed) {
+    Changed = false;
+    for (auto I = MBB->begin(); I != MBB->getFirstNonPHI();) {
+      MachineInstr &MI = *I++;
+      assert(MI.isPHI());
+      if (MRI.use_empty(MI.getOperand(0).getReg())) {
+        if (LIS)
+          LIS->RemoveMachineInstrFromMaps(MI);
+        MI.eraseFromParent();
+        Changed = true;
+      } else if (MI.getNumExplicitOperands() == 3) {
+        MRI.constrainRegClass(MI.getOperand(1).getReg(),
+                              MRI.getRegClass(MI.getOperand(0).getReg()));
+        MRI.replaceRegWith(MI.getOperand(0).getReg(),
+                           MI.getOperand(1).getReg());
+        if (LIS)
+          LIS->RemoveMachineInstrFromMaps(MI);
+        MI.eraseFromParent();
+        Changed = true;
+      }
+    }
+  }
+}
+
+/// Rewrites the kernel block in-place to adhere to the given schedule.
+/// KernelRewriter holds all of the state required to perform the rewriting.
+class KernelRewriter {
+  ModuloSchedule &S;
+  MachineBasicBlock *BB;
+  MachineBasicBlock *PreheaderBB, *ExitBB;
+  MachineRegisterInfo &MRI;
+  const TargetInstrInfo *TII;
+  LiveIntervals *LIS;
+
+  // Map from register class to canonical undef register for that class.
+  DenseMap<const TargetRegisterClass *, Register> Undefs;
+  // Map from <LoopReg, InitReg> to phi register for all created phis. Note
+  // that this map is only used when InitReg is non-undef.
+  DenseMap<std::pair<unsigned, unsigned>, Register> Phis;
+  // Map from LoopReg to phi register where the InitReg is undef.
+  DenseMap<unsigned, Register> UndefPhis;
+
+  // Reg is used by MI. Return the new register MI should use to adhere to the
+  // schedule. Insert phis as necessary.
+  Register remapUse(Register Reg, MachineInstr &MI);
+  // Insert a phi that carries LoopReg from the loop body and InitReg
+  // otherwise. If InitReg is not given it is chosen arbitrarily. It will
+  // either be undef or will be chosen so as to share another phi.
+  Register phi(Register LoopReg, Optional<Register> InitReg = {},
+               const TargetRegisterClass *RC = nullptr);
+  // Create an undef register of the given register class.
+ Register undef(const TargetRegisterClass *RC); + +public: + KernelRewriter(MachineLoop &L, ModuloSchedule &S, + LiveIntervals *LIS = nullptr); + void rewrite(); +}; +} // namespace + +KernelRewriter::KernelRewriter(MachineLoop &L, ModuloSchedule &S, + LiveIntervals *LIS) + : S(S), BB(L.getTopBlock()), PreheaderBB(L.getLoopPreheader()), + ExitBB(L.getExitBlock()), MRI(BB->getParent()->getRegInfo()), + TII(BB->getParent()->getSubtarget().getInstrInfo()), LIS(LIS) { + PreheaderBB = *BB->pred_begin(); + if (PreheaderBB == BB) + PreheaderBB = *std::next(BB->pred_begin()); +} + +void KernelRewriter::rewrite() { + // Rearrange the loop to be in schedule order. Note that the schedule may + // contain instructions that are not owned by the loop block (InstrChanges and + // friends), so we gracefully handle unowned instructions and delete any + // instructions that weren't in the schedule. + auto InsertPt = BB->getFirstTerminator(); + MachineInstr *FirstMI = nullptr; + for (MachineInstr *MI : S.getInstructions()) { + if (MI->isPHI()) + continue; + if (MI->getParent()) + MI->removeFromParent(); + BB->insert(InsertPt, MI); + if (!FirstMI) + FirstMI = MI; + } + assert(FirstMI && "Failed to find first MI in schedule"); + + // At this point all of the scheduled instructions are between FirstMI + // and the end of the block. Kill from the first non-phi to FirstMI. + for (auto I = BB->getFirstNonPHI(); I != FirstMI->getIterator();) { + if (LIS) + LIS->RemoveMachineInstrFromMaps(*I); + (I++)->eraseFromParent(); + } + + // Now remap every instruction in the loop. + for (MachineInstr &MI : *BB) { + if (MI.isPHI() || MI.isTerminator()) + continue; + for (MachineOperand &MO : MI.uses()) { + if (!MO.isReg() || MO.getReg().isPhysical() || MO.isImplicit()) + continue; + Register Reg = remapUse(MO.getReg(), MI); + MO.setReg(Reg); + } + } + EliminateDeadPhis(BB, MRI, LIS); + + // Ensure a phi exists for all instructions that are either referenced by + // an illegal phi or by an instruction outside the loop. This allows us to + // treat remaps of these values the same as "normal" values that come from + // loop-carried phis. + for (auto MI = BB->getFirstNonPHI(); MI != BB->end(); ++MI) { + if (MI->isPHI()) { + Register R = MI->getOperand(0).getReg(); + phi(R); + continue; + } + + for (MachineOperand &Def : MI->defs()) { + for (MachineInstr &MI : MRI.use_instructions(Def.getReg())) { + if (MI.getParent() != BB) { + phi(Def.getReg()); + break; + } + } + } + } +} + +Register KernelRewriter::remapUse(Register Reg, MachineInstr &MI) { + MachineInstr *Producer = MRI.getUniqueVRegDef(Reg); + if (!Producer) + return Reg; + + int ConsumerStage = S.getStage(&MI); + if (!Producer->isPHI()) { + // Non-phi producers are simple to remap. Insert as many phis as the + // difference between the consumer and producer stages. + if (Producer->getParent() != BB) + // Producer was not inside the loop. Use the register as-is. + return Reg; + int ProducerStage = S.getStage(Producer); + assert(ConsumerStage != -1 && + "In-loop consumer should always be scheduled!"); + assert(ConsumerStage >= ProducerStage); + unsigned StageDiff = ConsumerStage - ProducerStage; + + for (unsigned I = 0; I < StageDiff; ++I) + Reg = phi(Reg); + return Reg; + } + + // First, dive through the phi chain to find the defaults for the generated + // phis. 
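// Worked example (virtual register names are hypothetical): with kernel phis
//   %p1 = PHI %init1, %preheader, %p2, %bb
//   %p2 = PHI %init2, %preheader, %v,  %bb
// a use of %p1 dives down to LoopReg = %v and records Defaults = [%init1,
// %init2]; the code below then rebuilds an equivalent chain, creating (or
// reusing) one phi per remaining default.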
+  SmallVector<Optional<Register>, 4> Defaults;
+  Register LoopReg = Reg;
+  auto LoopProducer = Producer;
+  while (LoopProducer->isPHI() && LoopProducer->getParent() == BB) {
+    LoopReg = getLoopPhiReg(*LoopProducer, BB);
+    Defaults.emplace_back(getInitPhiReg(*LoopProducer, BB));
+    LoopProducer = MRI.getUniqueVRegDef(LoopReg);
+    assert(LoopProducer);
+  }
+  int LoopProducerStage = S.getStage(LoopProducer);
+
+  Optional<Register> IllegalPhiDefault;
+
+  if (LoopProducerStage == -1) {
+    // Do nothing.
+  } else if (LoopProducerStage > ConsumerStage) {
+    // This schedule is only representable if ProducerStage == ConsumerStage+1.
+    // In addition, Consumer's cycle must be scheduled after Producer in the
+    // rescheduled loop. This is enforced by the pipeliner's ASAP and ALAP
+    // functions.
+#ifndef NDEBUG // Silence unused variables in non-asserts mode.
+    int LoopProducerCycle = S.getCycle(LoopProducer);
+    int ConsumerCycle = S.getCycle(&MI);
+#endif
+    assert(LoopProducerCycle <= ConsumerCycle);
+    assert(LoopProducerStage == ConsumerStage + 1);
+    // Peel off the first phi from Defaults and insert a phi between producer
+    // and consumer. This phi will not be at the front of the block so we
+    // consider it illegal. It will only exist during the rewrite process; it
+    // needs to exist while we peel off prologs because these could take the
+    // default value. After that we can replace all uses with the loop
+    // producer value.
+    IllegalPhiDefault = Defaults.front();
+    Defaults.erase(Defaults.begin());
+  } else {
+    assert(ConsumerStage >= LoopProducerStage);
+    int StageDiff = ConsumerStage - LoopProducerStage;
+    if (StageDiff > 0) {
+      LLVM_DEBUG(dbgs() << " -- padding defaults array from "
+                        << Defaults.size() << " to "
+                        << (Defaults.size() + StageDiff) << "\n");
+      // If we need more phis than we have defaults for, pad out with undefs
+      // for the earliest phis, which are at the end of the defaults chain
+      // (the chain is in reverse order).
+      Defaults.resize(Defaults.size() + StageDiff,
+                      Defaults.empty() ? Optional<Register>()
+                                       : Defaults.back());
+    }
+  }
+
+  // Now we know the number of stages to jump back, insert the phi chain.
+  auto DefaultI = Defaults.rbegin();
+  while (DefaultI != Defaults.rend())
+    LoopReg = phi(LoopReg, *DefaultI++, MRI.getRegClass(Reg));
+
+  if (IllegalPhiDefault.hasValue()) {
+    // The consumer optionally consumes LoopProducer in the same iteration
+    // (because the producer is scheduled at an earlier cycle than the
+    // consumer) or the initial value. To facilitate this we create an illegal
+    // block here by embedding a phi in the middle of the block. We will fix
+    // this up immediately prior to pruning.
+    auto RC = MRI.getRegClass(Reg);
+    Register R = MRI.createVirtualRegister(RC);
+    BuildMI(*BB, MI, DebugLoc(), TII->get(TargetOpcode::PHI), R)
+        .addReg(IllegalPhiDefault.getValue())
+        .addMBB(PreheaderBB) // Block choice is arbitrary and has no effect.
+        .addReg(LoopReg)
+        .addMBB(BB); // Block choice is arbitrary and has no effect.
+    return R;
+  }
+
+  return LoopReg;
+}
+
+Register KernelRewriter::phi(Register LoopReg, Optional<Register> InitReg,
+                             const TargetRegisterClass *RC) {
+  // If the init register is not undef, try and find an existing phi.
+  if (InitReg.hasValue()) {
+    auto I = Phis.find({LoopReg, InitReg.getValue()});
+    if (I != Phis.end())
+      return I->second;
+  } else {
+    for (auto &KV : Phis) {
+      if (KV.first.first == LoopReg)
+        return KV.second;
+    }
+  }
+
+  // InitReg is either undef or no existing phi takes InitReg as input. Try
+  // and find a phi that takes undef as input.
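// Illustrative case (register names hypothetical): if an earlier call already
// created
//   %r = PHI undef, %preheader, %loop, %bb
// for this LoopReg, the lookup below reuses %r and simply rewrites its first
// register operand to InitReg rather than emitting a second, parallel phi.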
+ auto I = UndefPhis.find(LoopReg); + if (I != UndefPhis.end()) { + Register R = I->second; + if (!InitReg.hasValue()) + // Found a phi taking undef as input, and this input is undef so return + // without any more changes. + return R; + // Found a phi taking undef as input, so rewrite it to take InitReg. + MachineInstr *MI = MRI.getVRegDef(R); + MI->getOperand(1).setReg(InitReg.getValue()); + Phis.insert({{LoopReg, InitReg.getValue()}, R}); + MRI.constrainRegClass(R, MRI.getRegClass(InitReg.getValue())); + UndefPhis.erase(I); + return R; + } + + // Failed to find any existing phi to reuse, so create a new one. + if (!RC) + RC = MRI.getRegClass(LoopReg); + Register R = MRI.createVirtualRegister(RC); + if (InitReg.hasValue()) + MRI.constrainRegClass(R, MRI.getRegClass(*InitReg)); + BuildMI(*BB, BB->getFirstNonPHI(), DebugLoc(), TII->get(TargetOpcode::PHI), R) + .addReg(InitReg.hasValue() ? *InitReg : undef(RC)) + .addMBB(PreheaderBB) + .addReg(LoopReg) + .addMBB(BB); + if (!InitReg.hasValue()) + UndefPhis[LoopReg] = R; + else + Phis[{LoopReg, *InitReg}] = R; + return R; +} + +Register KernelRewriter::undef(const TargetRegisterClass *RC) { + Register &R = Undefs[RC]; + if (R == 0) { + // Create an IMPLICIT_DEF that defines this register if we need it. + // All uses of this should be removed by the time we have finished unrolling + // prologs and epilogs. + R = MRI.createVirtualRegister(RC); + auto *InsertBB = &PreheaderBB->getParent()->front(); + BuildMI(*InsertBB, InsertBB->getFirstTerminator(), DebugLoc(), + TII->get(TargetOpcode::IMPLICIT_DEF), R); + } + return R; +} + +namespace { +/// Describes an operand in the kernel of a pipelined loop. Characteristics of +/// the operand are discovered, such as how many in-loop PHIs it has to jump +/// through and defaults for these phis. +class KernelOperandInfo { + MachineBasicBlock *BB; + MachineRegisterInfo &MRI; + SmallVector PhiDefaults; + MachineOperand *Source; + MachineOperand *Target; + +public: + KernelOperandInfo(MachineOperand *MO, MachineRegisterInfo &MRI, + const SmallPtrSetImpl &IllegalPhis) + : MRI(MRI) { + Source = MO; + BB = MO->getParent()->getParent(); + while (isRegInLoop(MO)) { + MachineInstr *MI = MRI.getVRegDef(MO->getReg()); + if (MI->isFullCopy()) { + MO = &MI->getOperand(1); + continue; + } + if (!MI->isPHI()) + break; + // If this is an illegal phi, don't count it in distance. + if (IllegalPhis.count(MI)) { + MO = &MI->getOperand(3); + continue; + } + + Register Default = getInitPhiReg(*MI, BB); + MO = MI->getOperand(2).getMBB() == BB ? 
&MI->getOperand(1) + : &MI->getOperand(3); + PhiDefaults.push_back(Default); + } + Target = MO; + } + + bool operator==(const KernelOperandInfo &Other) const { + return PhiDefaults.size() == Other.PhiDefaults.size(); + } + + void print(raw_ostream &OS) const { + OS << "use of " << *Source << ": distance(" << PhiDefaults.size() << ") in " + << *Source->getParent(); + } + +private: + bool isRegInLoop(MachineOperand *MO) { + return MO->isReg() && MO->getReg().isVirtual() && + MRI.getVRegDef(MO->getReg())->getParent() == BB; + } +}; +} // namespace + +MachineBasicBlock * +PeelingModuloScheduleExpander::peelKernel(LoopPeelDirection LPD) { + MachineBasicBlock *NewBB = PeelSingleBlockLoop(LPD, BB, MRI, TII); + if (LPD == LPD_Front) + PeeledFront.push_back(NewBB); + else + PeeledBack.push_front(NewBB); + for (auto I = BB->begin(), NI = NewBB->begin(); !I->isTerminator(); + ++I, ++NI) { + CanonicalMIs[&*I] = &*I; + CanonicalMIs[&*NI] = &*I; + BlockMIs[{NewBB, &*I}] = &*NI; + BlockMIs[{BB, &*I}] = &*I; + } + return NewBB; +} + +void PeelingModuloScheduleExpander::peelPrologAndEpilogs() { + BitVector LS(Schedule.getNumStages(), true); + BitVector AS(Schedule.getNumStages(), true); + LiveStages[BB] = LS; + AvailableStages[BB] = AS; + + // Peel out the prologs. + LS.reset(); + for (int I = 0; I < Schedule.getNumStages() - 1; ++I) { + LS[I] = 1; + Prologs.push_back(peelKernel(LPD_Front)); + LiveStages[Prologs.back()] = LS; + AvailableStages[Prologs.back()] = LS; + } + + // Create a block that will end up as the new loop exiting block (dominated by + // all prologs and epilogs). It will only contain PHIs, in the same order as + // BB's PHIs. This gives us a poor-man's LCSSA with the inductive property + // that the exiting block is a (sub) clone of BB. This in turn gives us the + // property that any value deffed in BB but used outside of BB is used by a + // PHI in the exiting block. + MachineBasicBlock *ExitingBB = CreateLCSSAExitingBlock(); + + // Push out the epilogs, again in reverse order. + // We can't assume anything about the minumum loop trip count at this point, + // so emit a fairly complex epilog: + // K[0, 1, 2] // Kernel runs stages 0, 1, 2 + // E0[2] <- P1 // Epilog runs stage 2 only, so the state after is [0]. + // E1[1, 2] <- P0 // Epilog 1 moves the last item from stage 0 to stage 2. + // + // This creates a single-successor single-predecessor sequence of blocks for + // each epilog, which are kept this way for simplicity at this stage and + // cleaned up by the optimizer later. + for (int I = 1; I <= Schedule.getNumStages() - 1; ++I) { + Epilogs.push_back(nullptr); + for (int J = Schedule.getNumStages() - 1; J >= I; --J) { + LS.reset(); + LS[J] = 1; + Epilogs.back() = peelKernel(LPD_Back); + LiveStages[Epilogs.back()] = LS; + AvailableStages[Epilogs.back()] = AS; + } + } + + // Now we've defined all the prolog and epilog blocks as a fallthrough + // sequence, add the edges that will be followed if the loop trip count is + // lower than the number of stages (connecting prologs directly with epilogs). 
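// Sketch for a 3-stage schedule (block names are illustrative only):
//   P0 -> P1 -> K -> epilogs -> exit        (fall-through chain built above)
// The loop below adds one extra edge from each prolog directly to its paired
// epilog, taken when the trip count is too small to ever reach the kernel,
// and gives each epilog phi an incoming value for that new predecessor.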
+ auto PI = Prologs.begin(); + auto EI = Epilogs.begin(); + assert(Prologs.size() == Epilogs.size()); + for (; PI != Prologs.end(); ++PI, ++EI) { + MachineBasicBlock *Pred = *(*EI)->pred_begin(); + (*PI)->addSuccessor(*EI); + for (MachineInstr &MI : (*EI)->phis()) { + Register Reg = MI.getOperand(1).getReg(); + MachineInstr *Use = MRI.getUniqueVRegDef(Reg); + if (Use && Use->getParent() == Pred) + Reg = getEquivalentRegisterIn(Reg, *PI); + MI.addOperand(MachineOperand::CreateReg(Reg, /*isDef=*/false)); + MI.addOperand(MachineOperand::CreateMBB(*PI)); + } + } + + // Create a list of all blocks in order. + SmallVector Blocks; + llvm::copy(PeeledFront, std::back_inserter(Blocks)); + Blocks.push_back(BB); + llvm::copy(PeeledBack, std::back_inserter(Blocks)); + + // Iterate in reverse order over all instructions, remapping as we go. + for (MachineBasicBlock *B : reverse(Blocks)) { + for (auto I = B->getFirstInstrTerminator()->getReverseIterator(); + I != std::next(B->getFirstNonPHI()->getReverseIterator());) { + MachineInstr *MI = &*I++; + rewriteUsesOf(MI); + } + } + // Now all remapping has been done, we're free to optimize the generated code. + for (MachineBasicBlock *B : reverse(Blocks)) + EliminateDeadPhis(B, MRI, LIS); + EliminateDeadPhis(ExitingBB, MRI, LIS); +} + +MachineBasicBlock *PeelingModuloScheduleExpander::CreateLCSSAExitingBlock() { + MachineFunction &MF = *BB->getParent(); + MachineBasicBlock *Exit = *BB->succ_begin(); + if (Exit == BB) + Exit = *std::next(BB->succ_begin()); + + MachineBasicBlock *NewBB = MF.CreateMachineBasicBlock(BB->getBasicBlock()); + MF.insert(std::next(BB->getIterator()), NewBB); + + // Clone all phis in BB into NewBB and rewrite. + for (MachineInstr &MI : BB->phis()) { + auto RC = MRI.getRegClass(MI.getOperand(0).getReg()); + Register OldR = MI.getOperand(3).getReg(); + Register R = MRI.createVirtualRegister(RC); + SmallVector Uses; + for (MachineInstr &Use : MRI.use_instructions(OldR)) + if (Use.getParent() != BB) + Uses.push_back(&Use); + for (MachineInstr *Use : Uses) + Use->substituteRegister(OldR, R, /*SubIdx=*/0, + *MRI.getTargetRegisterInfo()); + MachineInstr *NI = BuildMI(NewBB, DebugLoc(), TII->get(TargetOpcode::PHI), R) + .addReg(OldR) + .addMBB(BB); + BlockMIs[{NewBB, &MI}] = NI; + CanonicalMIs[NI] = &MI; + } + BB->replaceSuccessor(Exit, NewBB); + Exit->replacePhiUsesWith(BB, NewBB); + NewBB->addSuccessor(Exit); + + MachineBasicBlock *TBB = nullptr, *FBB = nullptr; + SmallVector Cond; + bool CanAnalyzeBr = !TII->analyzeBranch(*BB, TBB, FBB, Cond); + (void)CanAnalyzeBr; + assert(CanAnalyzeBr && "Must be able to analyze the loop branch!"); + TII->removeBranch(*BB); + TII->insertBranch(*BB, TBB == Exit ? NewBB : TBB, FBB == Exit ? NewBB : FBB, + Cond, DebugLoc()); + TII->insertUnconditionalBranch(*NewBB, Exit, DebugLoc()); + return NewBB; +} + +Register +PeelingModuloScheduleExpander::getEquivalentRegisterIn(Register Reg, + MachineBasicBlock *BB) { + MachineInstr *MI = MRI.getUniqueVRegDef(Reg); + unsigned OpIdx = MI->findRegisterDefOperandIdx(Reg); + return BlockMIs[{BB, CanonicalMIs[MI]}]->getOperand(OpIdx).getReg(); +} + +void PeelingModuloScheduleExpander::rewriteUsesOf(MachineInstr *MI) { + if (MI->isPHI()) { + // This is an illegal PHI. The loop-carried (desired) value is operand 3, + // and it is produced by this block. 
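// Operand layout of such an illegal phi, as built by KernelRewriter::remapUse
// (register names illustrative):
//   %r = PHI %default, %preheader, %loopval, %bb
// so operand 1 holds the default/initial value and operand 3 the in-block,
// loop-carried value that normally replaces all uses of %r here.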
+    Register PhiR = MI->getOperand(0).getReg();
+    Register R = MI->getOperand(3).getReg();
+    int RMIStage = getStage(MRI.getUniqueVRegDef(R));
+    if (RMIStage != -1 && !AvailableStages[MI->getParent()].test(RMIStage))
+      R = MI->getOperand(1).getReg();
+    MRI.setRegClass(R, MRI.getRegClass(PhiR));
+    MRI.replaceRegWith(PhiR, R);
+    if (LIS)
+      LIS->RemoveMachineInstrFromMaps(*MI);
+    MI->eraseFromParent();
+    return;
+  }
+
+  int Stage = getStage(MI);
+  if (Stage == -1 || LiveStages.count(MI->getParent()) == 0 ||
+      LiveStages[MI->getParent()].test(Stage))
+    // Instruction is live, no rewriting to do.
+    return;
+
+  for (MachineOperand &DefMO : MI->defs()) {
+    SmallVector<std::pair<MachineInstr *, Register>, 4> Subs;
+    for (MachineInstr &UseMI : MRI.use_instructions(DefMO.getReg())) {
+      // Only PHIs can use values from this block by construction.
+      // Match with the equivalent PHI in B.
+      assert(UseMI.isPHI());
+      Register Reg = getEquivalentRegisterIn(UseMI.getOperand(0).getReg(),
+                                             MI->getParent());
+      Subs.emplace_back(&UseMI, Reg);
+    }
+    for (auto &Sub : Subs)
+      Sub.first->substituteRegister(DefMO.getReg(), Sub.second, /*SubIdx=*/0,
+                                    *MRI.getTargetRegisterInfo());
+  }
+  if (LIS)
+    LIS->RemoveMachineInstrFromMaps(*MI);
+  MI->eraseFromParent();
+}
+
+void PeelingModuloScheduleExpander::fixupBranches() {
+  std::unique_ptr<TargetInstrInfo::PipelinerLoopInfo> Info =
+      TII->analyzeLoopForPipelining(BB);
+  assert(Info);
+
+  // Work outwards from the kernel.
+  bool KernelDisposed = false;
+  int TC = Schedule.getNumStages() - 1;
+  for (auto PI = Prologs.rbegin(), EI = Epilogs.rbegin(); PI != Prologs.rend();
+       ++PI, ++EI, --TC) {
+    MachineBasicBlock *Prolog = *PI;
+    MachineBasicBlock *Fallthrough = *Prolog->succ_begin();
+    MachineBasicBlock *Epilog = *EI;
+    SmallVector<MachineOperand, 4> Cond;
+    TII->removeBranch(*Prolog);
+    Optional<bool> StaticallyGreater =
+        Info->createTripCountGreaterCondition(TC, *Prolog, Cond);
+    if (!StaticallyGreater.hasValue()) {
+      LLVM_DEBUG(dbgs() << "Dynamic: TC > " << TC << "\n");
+      // Dynamically branch based on Cond.
+      TII->insertBranch(*Prolog, Epilog, Fallthrough, Cond, DebugLoc());
+    } else if (*StaticallyGreater == false) {
+      LLVM_DEBUG(dbgs() << "Static-false: TC > " << TC << "\n");
+      // Prolog never falls through; branch to epilog and orphan interior
+      // blocks. Leave it to unreachable-block-elim to clean up.
+      Prolog->removeSuccessor(Fallthrough);
+      for (MachineInstr &P : Fallthrough->phis()) {
+        P.RemoveOperand(2);
+        P.RemoveOperand(1);
+      }
+      TII->insertUnconditionalBranch(*Prolog, Epilog, DebugLoc());
+      KernelDisposed = true;
+    } else {
+      LLVM_DEBUG(dbgs() << "Static-true: TC > " << TC << "\n");
+      // Prolog always falls through; remove incoming values in epilog.
+ Prolog->removeSuccessor(Epilog); + for (MachineInstr &P : Epilog->phis()) { + P.RemoveOperand(4); + P.RemoveOperand(3); + } + } + } + + if (!KernelDisposed) { + Info->adjustTripCount(-(Schedule.getNumStages() - 1)); + Info->setPreheader(Prologs.back()); + } else { + Info->disposed(); + } +} + +void PeelingModuloScheduleExpander::rewriteKernel() { + KernelRewriter KR(*Schedule.getLoop(), Schedule); + KR.rewrite(); +} + +void PeelingModuloScheduleExpander::expand() { + BB = Schedule.getLoop()->getTopBlock(); + Preheader = Schedule.getLoop()->getLoopPreheader(); + LLVM_DEBUG(Schedule.dump()); + + rewriteKernel(); + peelPrologAndEpilogs(); + fixupBranches(); +} + +void PeelingModuloScheduleExpander::validateAgainstModuloScheduleExpander() { + BB = Schedule.getLoop()->getTopBlock(); + Preheader = Schedule.getLoop()->getLoopPreheader(); + + // Dump the schedule before we invalidate and remap all its instructions. + // Stash it in a string so we can print it if we found an error. + std::string ScheduleDump; + raw_string_ostream OS(ScheduleDump); + Schedule.print(OS); + OS.flush(); + + // First, run the normal ModuleScheduleExpander. We don't support any + // InstrChanges. + assert(LIS && "Requires LiveIntervals!"); + ModuloScheduleExpander MSE(MF, Schedule, *LIS, + ModuloScheduleExpander::InstrChangesTy()); + MSE.expand(); + MachineBasicBlock *ExpandedKernel = MSE.getRewrittenKernel(); + if (!ExpandedKernel) { + // The expander optimized away the kernel. We can't do any useful checking. + MSE.cleanup(); + return; + } + // Before running the KernelRewriter, re-add BB into the CFG. + Preheader->addSuccessor(BB); + + // Now run the new expansion algorithm. + KernelRewriter KR(*Schedule.getLoop(), Schedule); + KR.rewrite(); + peelPrologAndEpilogs(); + + // Collect all illegal phis that the new algorithm created. We'll give these + // to KernelOperandInfo. + SmallPtrSet IllegalPhis; + for (auto NI = BB->getFirstNonPHI(); NI != BB->end(); ++NI) { + if (NI->isPHI()) + IllegalPhis.insert(&*NI); + } + + // Co-iterate across both kernels. We expect them to be identical apart from + // phis and full COPYs (we look through both). + SmallVector, 8> KOIs; + auto OI = ExpandedKernel->begin(); + auto NI = BB->begin(); + for (; !OI->isTerminator() && !NI->isTerminator(); ++OI, ++NI) { + while (OI->isPHI() || OI->isFullCopy()) + ++OI; + while (NI->isPHI() || NI->isFullCopy()) + ++NI; + assert(OI->getOpcode() == NI->getOpcode() && "Opcodes don't match?!"); + // Analyze every operand separately. + for (auto OOpI = OI->operands_begin(), NOpI = NI->operands_begin(); + OOpI != OI->operands_end(); ++OOpI, ++NOpI) + KOIs.emplace_back(KernelOperandInfo(&*OOpI, MRI, IllegalPhis), + KernelOperandInfo(&*NOpI, MRI, IllegalPhis)); + } + + bool Failed = false; + for (auto &OldAndNew : KOIs) { + if (OldAndNew.first == OldAndNew.second) + continue; + Failed = true; + errs() << "Modulo kernel validation error: [\n"; + errs() << " [golden] "; + OldAndNew.first.print(errs()); + errs() << " "; + OldAndNew.second.print(errs()); + errs() << "]\n"; + } + + if (Failed) { + errs() << "Golden reference kernel:\n"; + ExpandedKernel->print(errs()); + errs() << "New kernel:\n"; + BB->print(errs()); + errs() << ScheduleDump; + report_fatal_error( + "Modulo kernel validation (-pipeliner-experimental-cg) failed"); + } + + // Cleanup by removing BB from the CFG again as the original + // ModuloScheduleExpander intended. 
+  Preheader->removeSuccessor(BB);
+  MSE.cleanup();
+}
+
+//===----------------------------------------------------------------------===//
+// ModuloScheduleTestPass implementation
+//===----------------------------------------------------------------------===//
+// This pass constructs a ModuloSchedule from its module and runs
+// ModuloScheduleExpander.
+//
+// The module is expected to contain a single-block analyzable loop.
+// The total order of instructions is taken from the loop as-is.
+// Instructions are expected to be annotated with a PostInstrSymbol.
+// This PostInstrSymbol must have the following format:
+// "Stage-%d_Cycle-%d".
+//===----------------------------------------------------------------------===//
+
+namespace {
+class ModuloScheduleTest : public MachineFunctionPass {
+public:
+  static char ID;
+
+  ModuloScheduleTest() : MachineFunctionPass(ID) {
+    initializeModuloScheduleTestPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+  void runOnLoop(MachineFunction &MF, MachineLoop &L);
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<MachineLoopInfo>();
+    AU.addRequired<LiveIntervals>();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+};
+} // namespace
+
+char ModuloScheduleTest::ID = 0;
+
+INITIALIZE_PASS_BEGIN(ModuloScheduleTest, "modulo-schedule-test",
+                      "Modulo Schedule test pass", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
+INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
+INITIALIZE_PASS_END(ModuloScheduleTest, "modulo-schedule-test",
+                    "Modulo Schedule test pass", false, false)
+
+bool ModuloScheduleTest::runOnMachineFunction(MachineFunction &MF) {
+  MachineLoopInfo &MLI = getAnalysis<MachineLoopInfo>();
+  for (auto *L : MLI) {
+    if (L->getTopBlock() != L->getBottomBlock())
+      continue;
+    runOnLoop(MF, *L);
+    return false;
+  }
+  return false;
+}
+
+static void parseSymbolString(StringRef S, int &Cycle, int &Stage) {
+  std::pair<StringRef, StringRef> StageAndCycle = getToken(S, "_");
+  std::pair<StringRef, StringRef> StageTokenAndValue =
+      getToken(StageAndCycle.first, "-");
+  std::pair<StringRef, StringRef> CycleTokenAndValue =
+      getToken(StageAndCycle.second, "-");
+  if (StageTokenAndValue.first != "Stage" ||
+      CycleTokenAndValue.first != "_Cycle") {
+    llvm_unreachable(
+        "Bad post-instr symbol syntax: see comment in ModuloScheduleTest");
+    return;
+  }
+
+  StageTokenAndValue.second.drop_front().getAsInteger(10, Stage);
+  CycleTokenAndValue.second.drop_front().getAsInteger(10, Cycle);
+
+  dbgs() << "  Stage=" << Stage << ", Cycle=" << Cycle << "\n";
+}
+
+void ModuloScheduleTest::runOnLoop(MachineFunction &MF, MachineLoop &L) {
+  LiveIntervals &LIS = getAnalysis<LiveIntervals>();
+  MachineBasicBlock *BB = L.getTopBlock();
+  dbgs() << "--- ModuloScheduleTest running on BB#" << BB->getNumber() << "\n";
+
+  DenseMap<MachineInstr *, int> Cycle, Stage;
+  std::vector<MachineInstr *> Instrs;
+  for (MachineInstr &MI : *BB) {
+    if (MI.isTerminator())
+      continue;
+    Instrs.push_back(&MI);
+    if (MCSymbol *Sym = MI.getPostInstrSymbol()) {
+      dbgs() << "Parsing post-instr symbol for " << MI;
+      parseSymbolString(Sym->getName(), Cycle[&MI], Stage[&MI]);
+    }
+  }
+
+  ModuloSchedule MS(MF, &L, std::move(Instrs), std::move(Cycle),
+                    std::move(Stage));
+  ModuloScheduleExpander MSE(
+      MF, MS, LIS, /*InstrChanges=*/ModuloScheduleExpander::InstrChangesTy());
+  MSE.expand();
+  MSE.cleanup();
+}
+
+//===----------------------------------------------------------------------===//
+// ModuloScheduleTestAnnotater implementation
+//===----------------------------------------------------------------------===//
+
+void ModuloScheduleTestAnnotater::annotate() {
+  for
(MachineInstr *MI : S.getInstructions()) { + SmallVector SV; + raw_svector_ostream OS(SV); + OS << "Stage-" << S.getStage(MI) << "_Cycle-" << S.getCycle(MI); + MCSymbol *Sym = MF.getContext().getOrCreateSymbol(OS.str()); + MI->setPostInstrSymbol(MF, Sym); + } +} diff --git a/lib/CodeGen/OptimizePHIs.cpp b/lib/CodeGen/OptimizePHIs.cpp index c70b62252139..1a493964e678 100644 --- a/lib/CodeGen/OptimizePHIs.cpp +++ b/lib/CodeGen/OptimizePHIs.cpp @@ -97,7 +97,7 @@ bool OptimizePHIs::IsSingleValuePHICycle(MachineInstr *MI, unsigned &SingleValReg, InstrSet &PHIsInCycle) { assert(MI->isPHI() && "IsSingleValuePHICycle expects a PHI instruction"); - unsigned DstReg = MI->getOperand(0).getReg(); + Register DstReg = MI->getOperand(0).getReg(); // See if we already saw this register. if (!PHIsInCycle.insert(MI).second) @@ -109,16 +109,15 @@ bool OptimizePHIs::IsSingleValuePHICycle(MachineInstr *MI, // Scan the PHI operands. for (unsigned i = 1; i != MI->getNumOperands(); i += 2) { - unsigned SrcReg = MI->getOperand(i).getReg(); + Register SrcReg = MI->getOperand(i).getReg(); if (SrcReg == DstReg) continue; MachineInstr *SrcMI = MRI->getVRegDef(SrcReg); // Skip over register-to-register moves. - if (SrcMI && SrcMI->isCopy() && - !SrcMI->getOperand(0).getSubReg() && + if (SrcMI && SrcMI->isCopy() && !SrcMI->getOperand(0).getSubReg() && !SrcMI->getOperand(1).getSubReg() && - TargetRegisterInfo::isVirtualRegister(SrcMI->getOperand(1).getReg())) { + Register::isVirtualRegister(SrcMI->getOperand(1).getReg())) { SrcReg = SrcMI->getOperand(1).getReg(); SrcMI = MRI->getVRegDef(SrcReg); } @@ -142,8 +141,8 @@ bool OptimizePHIs::IsSingleValuePHICycle(MachineInstr *MI, /// other PHIs in a cycle. bool OptimizePHIs::IsDeadPHICycle(MachineInstr *MI, InstrSet &PHIsInCycle) { assert(MI->isPHI() && "IsDeadPHICycle expects a PHI instruction"); - unsigned DstReg = MI->getOperand(0).getReg(); - assert(TargetRegisterInfo::isVirtualRegister(DstReg) && + Register DstReg = MI->getOperand(0).getReg(); + assert(Register::isVirtualRegister(DstReg) && "PHI destination is not a virtual register"); // See if we already saw this register. @@ -177,7 +176,7 @@ bool OptimizePHIs::OptimizeBB(MachineBasicBlock &MBB) { InstrSet PHIsInCycle; if (IsSingleValuePHICycle(MI, SingleValReg, PHIsInCycle) && SingleValReg != 0) { - unsigned OldReg = MI->getOperand(0).getReg(); + Register OldReg = MI->getOperand(0).getReg(); if (!MRI->constrainRegClass(SingleValReg, MRI->getRegClass(OldReg))) continue; diff --git a/lib/CodeGen/PHIElimination.cpp b/lib/CodeGen/PHIElimination.cpp index 948a5835438c..4dd4c4b1084e 100644 --- a/lib/CodeGen/PHIElimination.cpp +++ b/lib/CodeGen/PHIElimination.cpp @@ -31,7 +31,9 @@ #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/SlotIndexes.h" #include "llvm/CodeGen/TargetInstrInfo.h" +#include "llvm/CodeGen/TargetLowering.h" #include "llvm/CodeGen/TargetOpcodes.h" +#include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/CodeGen/TargetRegisterInfo.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/Pass.h" @@ -168,7 +170,7 @@ bool PHIElimination::runOnMachineFunction(MachineFunction &MF) { // Remove dead IMPLICIT_DEF instructions. 
for (MachineInstr *DefMI : ImpDefs) { - unsigned DefReg = DefMI->getOperand(0).getReg(); + Register DefReg = DefMI->getOperand(0).getReg(); if (MRI->use_nodbg_empty(DefReg)) { if (LIS) LIS->RemoveMachineInstrFromMaps(*DefMI); @@ -183,6 +185,11 @@ bool PHIElimination::runOnMachineFunction(MachineFunction &MF) { MF.DeleteMachineInstr(I.first); } + // TODO: we should use the incremental DomTree updater here. + if (Changed) + if (auto *MDT = getAnalysisIfAvailable()) + MDT->getBase().recalculate(MF); + LoweredPHIs.clear(); ImpDefs.clear(); VRegPHIUseCount.clear(); @@ -240,7 +247,7 @@ void PHIElimination::LowerPHINode(MachineBasicBlock &MBB, MachineInstr *MPhi = MBB.remove(&*MBB.begin()); unsigned NumSrcs = (MPhi->getNumOperands() - 1) / 2; - unsigned DestReg = MPhi->getOperand(0).getReg(); + Register DestReg = MPhi->getOperand(0).getReg(); assert(MPhi->getOperand(0).getSubReg() == 0 && "Can't handle sub-reg PHIs"); bool isDead = MPhi->getOperand(0).isDead(); @@ -252,11 +259,12 @@ void PHIElimination::LowerPHINode(MachineBasicBlock &MBB, // Insert a register to register copy at the top of the current block (but // after any remaining phi nodes) which copies the new incoming register // into the phi node destination. + MachineInstr *PHICopy = nullptr; const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo(); if (allPhiOperandsUndefined(*MPhi, *MRI)) // If all sources of a PHI node are implicit_def or undef uses, just emit an // implicit_def instead of a copy. - BuildMI(MBB, AfterPHIsIt, MPhi->getDebugLoc(), + PHICopy = BuildMI(MBB, AfterPHIsIt, MPhi->getDebugLoc(), TII->get(TargetOpcode::IMPLICIT_DEF), DestReg); else { // Can we reuse an earlier PHI node? This only happens for critical edges, @@ -273,15 +281,13 @@ void PHIElimination::LowerPHINode(MachineBasicBlock &MBB, const TargetRegisterClass *RC = MF.getRegInfo().getRegClass(DestReg); entry = IncomingReg = MF.getRegInfo().createVirtualRegister(RC); } - BuildMI(MBB, AfterPHIsIt, MPhi->getDebugLoc(), - TII->get(TargetOpcode::COPY), DestReg) - .addReg(IncomingReg); + // Give the target possiblity to handle special cases fallthrough otherwise + PHICopy = TII->createPHIDestinationCopy(MBB, AfterPHIsIt, MPhi->getDebugLoc(), + IncomingReg, DestReg); } // Update live variable information if there is any. if (LV) { - MachineInstr &PHICopy = *std::prev(AfterPHIsIt); - if (IncomingReg) { LiveVariables::VarInfo &VI = LV->getVarInfo(IncomingReg); @@ -302,7 +308,7 @@ void PHIElimination::LowerPHINode(MachineBasicBlock &MBB, // killed. Note that because the value is defined in several places (once // each for each incoming block), the "def" block and instruction fields // for the VarInfo is not filled in. - LV->addVirtualRegisterKilled(IncomingReg, PHICopy); + LV->addVirtualRegisterKilled(IncomingReg, *PHICopy); } // Since we are going to be deleting the PHI node, if it is the last use of @@ -312,15 +318,14 @@ void PHIElimination::LowerPHINode(MachineBasicBlock &MBB, // If the result is dead, update LV. if (isDead) { - LV->addVirtualRegisterDead(DestReg, PHICopy); + LV->addVirtualRegisterDead(DestReg, *PHICopy); LV->removeVirtualRegisterDead(DestReg, *MPhi); } } // Update LiveIntervals for the new copy or implicit def. 
if (LIS) { - SlotIndex DestCopyIndex = - LIS->InsertMachineInstrInMaps(*std::prev(AfterPHIsIt)); + SlotIndex DestCopyIndex = LIS->InsertMachineInstrInMaps(*PHICopy); SlotIndex MBBStartIndex = LIS->getMBBStartIdx(&MBB); if (IncomingReg) { @@ -368,11 +373,11 @@ void PHIElimination::LowerPHINode(MachineBasicBlock &MBB, // IncomingReg register in the corresponding predecessor basic block. SmallPtrSet MBBsInsertedInto; for (int i = NumSrcs - 1; i >= 0; --i) { - unsigned SrcReg = MPhi->getOperand(i*2+1).getReg(); + Register SrcReg = MPhi->getOperand(i * 2 + 1).getReg(); unsigned SrcSubReg = MPhi->getOperand(i*2+1).getSubReg(); bool SrcUndef = MPhi->getOperand(i*2+1).isUndef() || isImplicitlyDefined(SrcReg, *MRI); - assert(TargetRegisterInfo::isVirtualRegister(SrcReg) && + assert(Register::isVirtualRegister(SrcReg) && "Machine PHI Operands must all be virtual registers!"); // Get the MachineBasicBlock equivalent of the BasicBlock that is the source @@ -406,9 +411,9 @@ void PHIElimination::LowerPHINode(MachineBasicBlock &MBB, if (DefMI->isImplicitDef()) ImpDefs.insert(DefMI); } else { - NewSrcInstr = BuildMI(opBlock, InsertPos, MPhi->getDebugLoc(), - TII->get(TargetOpcode::COPY), IncomingReg) - .addReg(SrcReg, 0, SrcSubReg); + NewSrcInstr = + TII->createPHISourceCopy(opBlock, InsertPos, MPhi->getDebugLoc(), + SrcReg, SrcSubReg, IncomingReg); } } @@ -457,7 +462,7 @@ void PHIElimination::LowerPHINode(MachineBasicBlock &MBB, } } else { // We just inserted this copy. - KillInst = std::prev(InsertPos); + KillInst = NewSrcInstr; } } assert(KillInst->readsRegister(SrcReg) && "Cannot find kill instruction"); @@ -567,7 +572,7 @@ bool PHIElimination::SplitPHIEdges(MachineFunction &MF, for (MachineBasicBlock::iterator BBI = MBB.begin(), BBE = MBB.end(); BBI != BBE && BBI->isPHI(); ++BBI) { for (unsigned i = 1, e = BBI->getNumOperands(); i != e; i += 2) { - unsigned Reg = BBI->getOperand(i).getReg(); + Register Reg = BBI->getOperand(i).getReg(); MachineBasicBlock *PreMBB = BBI->getOperand(i+1).getMBB(); // Is there a critical edge from PreMBB to MBB? 
if (PreMBB->succ_size() == 1) diff --git a/lib/CodeGen/PatchableFunction.cpp b/lib/CodeGen/PatchableFunction.cpp index a3fa1b0ad8ed..529fde84e39a 100644 --- a/lib/CodeGen/PatchableFunction.cpp +++ b/lib/CodeGen/PatchableFunction.cpp @@ -78,7 +78,7 @@ bool PatchableFunction::runOnMachineFunction(MachineFunction &MF) { MIB.add(MO); FirstActualI->eraseFromParent(); - MF.ensureAlignment(4); + MF.ensureAlignment(Align(16)); return true; } diff --git a/lib/CodeGen/PeepholeOptimizer.cpp b/lib/CodeGen/PeepholeOptimizer.cpp index b918396aa8c5..54f1d38ed106 100644 --- a/lib/CodeGen/PeepholeOptimizer.cpp +++ b/lib/CodeGen/PeepholeOptimizer.cpp @@ -418,7 +418,7 @@ namespace { const MachineRegisterInfo &MRI, const TargetInstrInfo *TII = nullptr) : DefSubReg(DefSubReg), Reg(Reg), MRI(MRI), TII(TII) { - if (!TargetRegisterInfo::isPhysicalRegister(Reg)) { + if (!Register::isPhysicalRegister(Reg)) { Def = MRI.getVRegDef(Reg); DefIdx = MRI.def_begin(Reg).getOperandNo(); } @@ -460,8 +460,8 @@ optimizeExtInstr(MachineInstr &MI, MachineBasicBlock &MBB, if (!TII->isCoalescableExtInstr(MI, SrcReg, DstReg, SubIdx)) return false; - if (TargetRegisterInfo::isPhysicalRegister(DstReg) || - TargetRegisterInfo::isPhysicalRegister(SrcReg)) + if (Register::isPhysicalRegister(DstReg) || + Register::isPhysicalRegister(SrcReg)) return false; if (MRI->hasOneNonDBGUse(SrcReg)) @@ -581,7 +581,7 @@ optimizeExtInstr(MachineInstr &MI, MachineBasicBlock &MBB, MRI->constrainRegClass(DstReg, DstRC); } - unsigned NewVR = MRI->createVirtualRegister(RC); + Register NewVR = MRI->createVirtualRegister(RC); MachineInstr *Copy = BuildMI(*UseMBB, UseMI, UseMI->getDebugLoc(), TII->get(TargetOpcode::COPY), NewVR) .addReg(DstReg, 0, SubIdx); @@ -609,8 +609,8 @@ bool PeepholeOptimizer::optimizeCmpInstr(MachineInstr &MI) { unsigned SrcReg, SrcReg2; int CmpMask, CmpValue; if (!TII->analyzeCompare(MI, SrcReg, SrcReg2, CmpMask, CmpValue) || - TargetRegisterInfo::isPhysicalRegister(SrcReg) || - (SrcReg2 != 0 && TargetRegisterInfo::isPhysicalRegister(SrcReg2))) + Register::isPhysicalRegister(SrcReg) || + (SrcReg2 != 0 && Register::isPhysicalRegister(SrcReg2))) return false; // Attempt to optimize the comparison instruction. @@ -663,7 +663,7 @@ bool PeepholeOptimizer::findNextSource(RegSubRegPair RegSubReg, // Thus, instead of maintaining untested code, we will revisit that if // that changes at some point. unsigned Reg = RegSubReg.Reg; - if (TargetRegisterInfo::isPhysicalRegister(Reg)) + if (Register::isPhysicalRegister(Reg)) return false; const TargetRegisterClass *DefRC = MRI->getRegClass(Reg); @@ -675,7 +675,7 @@ bool PeepholeOptimizer::findNextSource(RegSubRegPair RegSubReg, do { CurSrcPair = SrcToLook.pop_back_val(); // As explained above, do not handle physical registers - if (TargetRegisterInfo::isPhysicalRegister(CurSrcPair.Reg)) + if (Register::isPhysicalRegister(CurSrcPair.Reg)) return false; ValueTracker ValTracker(CurSrcPair.Reg, CurSrcPair.SubReg, *MRI, TII); @@ -723,7 +723,7 @@ bool PeepholeOptimizer::findNextSource(RegSubRegPair RegSubReg, // constraints to the register allocator. Moreover, if we want to extend // the live-range of a physical register, unlike SSA virtual register, // we will have to check that they aren't redefine before the related use. - if (TargetRegisterInfo::isPhysicalRegister(CurSrcPair.Reg)) + if (Register::isPhysicalRegister(CurSrcPair.Reg)) return false; // Keep following the chain if the value isn't any better yet. 
@@ -761,7 +761,7 @@ insertPHI(MachineRegisterInfo &MRI, const TargetInstrInfo &TII, // NewRC is only correct if no subregisters are involved. findNextSource() // should have rejected those cases already. assert(SrcRegs[0].SubReg == 0 && "should not have subreg operand"); - unsigned NewVR = MRI.createVirtualRegister(NewRC); + Register NewVR = MRI.createVirtualRegister(NewRC); MachineBasicBlock *MBB = OrigPHI.getParent(); MachineInstrBuilder MIB = BuildMI(*MBB, &OrigPHI, OrigPHI.getDebugLoc(), TII.get(TargetOpcode::PHI), NewVR); @@ -1170,7 +1170,7 @@ bool PeepholeOptimizer::optimizeCoalescableCopy(MachineInstr &MI) { "Coalescer can understand multiple defs?!"); const MachineOperand &MODef = MI.getOperand(0); // Do not rewrite physical definitions. - if (TargetRegisterInfo::isPhysicalRegister(MODef.getReg())) + if (Register::isPhysicalRegister(MODef.getReg())) return false; bool Changed = false; @@ -1221,7 +1221,7 @@ bool PeepholeOptimizer::optimizeCoalescableCopy(MachineInstr &MI) { MachineInstr & PeepholeOptimizer::rewriteSource(MachineInstr &CopyLike, RegSubRegPair Def, RewriteMapTy &RewriteMap) { - assert(!TargetRegisterInfo::isPhysicalRegister(Def.Reg) && + assert(!Register::isPhysicalRegister(Def.Reg) && "We do not rewrite physical registers"); // Find the new source to use in the COPY rewrite. @@ -1229,7 +1229,7 @@ PeepholeOptimizer::rewriteSource(MachineInstr &CopyLike, // Insert the COPY. const TargetRegisterClass *DefRC = MRI->getRegClass(Def.Reg); - unsigned NewVReg = MRI->createVirtualRegister(DefRC); + Register NewVReg = MRI->createVirtualRegister(DefRC); MachineInstr *NewCopy = BuildMI(*CopyLike.getParent(), &CopyLike, CopyLike.getDebugLoc(), @@ -1280,7 +1280,7 @@ bool PeepholeOptimizer::optimizeUncoalescableCopy( while (CpyRewriter.getNextRewritableSource(Src, Def)) { // If a physical register is here, this is probably for a good reason. // Do not rewrite that. - if (TargetRegisterInfo::isPhysicalRegister(Def.Reg)) + if (Register::isPhysicalRegister(Def.Reg)) return false; // If we do not know how to rewrite this definition, there is no point @@ -1315,12 +1315,11 @@ bool PeepholeOptimizer::isLoadFoldable( if (MCID.getNumDefs() != 1) return false; - unsigned Reg = MI.getOperand(0).getReg(); + Register Reg = MI.getOperand(0).getReg(); // To reduce compilation time, we check MRI->hasOneNonDBGUser when inserting // loads. It should be checked when processing uses of the load, since // uses can be removed during peephole. - if (!MI.getOperand(0).getSubReg() && - TargetRegisterInfo::isVirtualRegister(Reg) && + if (!MI.getOperand(0).getSubReg() && Register::isVirtualRegister(Reg) && MRI->hasOneNonDBGUser(Reg)) { FoldAsLoadDefCandidates.insert(Reg); return true; @@ -1336,8 +1335,8 @@ bool PeepholeOptimizer::isMoveImmediate( return false; if (MCID.getNumDefs() != 1) return false; - unsigned Reg = MI.getOperand(0).getReg(); - if (TargetRegisterInfo::isVirtualRegister(Reg)) { + Register Reg = MI.getOperand(0).getReg(); + if (Register::isVirtualRegister(Reg)) { ImmDefMIs.insert(std::make_pair(Reg, &MI)); ImmDefRegs.insert(Reg); return true; @@ -1359,8 +1358,8 @@ bool PeepholeOptimizer::foldImmediate(MachineInstr &MI, // Ignore dead implicit defs. 
if (MO.isImplicit() && MO.isDead()) continue; - unsigned Reg = MO.getReg(); - if (!TargetRegisterInfo::isVirtualRegister(Reg)) + Register Reg = MO.getReg(); + if (!Register::isVirtualRegister(Reg)) continue; if (ImmDefRegs.count(Reg) == 0) continue; @@ -1393,12 +1392,12 @@ bool PeepholeOptimizer::foldRedundantCopy(MachineInstr &MI, DenseMap &CopyMIs) { assert(MI.isCopy() && "expected a COPY machine instruction"); - unsigned SrcReg = MI.getOperand(1).getReg(); - if (!TargetRegisterInfo::isVirtualRegister(SrcReg)) + Register SrcReg = MI.getOperand(1).getReg(); + if (!Register::isVirtualRegister(SrcReg)) return false; - unsigned DstReg = MI.getOperand(0).getReg(); - if (!TargetRegisterInfo::isVirtualRegister(DstReg)) + Register DstReg = MI.getOperand(0).getReg(); + if (!Register::isVirtualRegister(DstReg)) return false; if (CopySrcRegs.insert(SrcReg).second) { @@ -1416,7 +1415,7 @@ bool PeepholeOptimizer::foldRedundantCopy(MachineInstr &MI, if (SrcSubReg != PrevSrcSubReg) return false; - unsigned PrevDstReg = PrevCopy->getOperand(0).getReg(); + Register PrevDstReg = PrevCopy->getOperand(0).getReg(); // Only replace if the copy register class is the same. // @@ -1433,8 +1432,7 @@ bool PeepholeOptimizer::foldRedundantCopy(MachineInstr &MI, } bool PeepholeOptimizer::isNAPhysCopy(unsigned Reg) { - return TargetRegisterInfo::isPhysicalRegister(Reg) && - !MRI->isAllocatable(Reg); + return Register::isPhysicalRegister(Reg) && !MRI->isAllocatable(Reg); } bool PeepholeOptimizer::foldRedundantNAPhysCopy( @@ -1444,9 +1442,9 @@ bool PeepholeOptimizer::foldRedundantNAPhysCopy( if (DisableNAPhysCopyOpt) return false; - unsigned DstReg = MI.getOperand(0).getReg(); - unsigned SrcReg = MI.getOperand(1).getReg(); - if (isNAPhysCopy(SrcReg) && TargetRegisterInfo::isVirtualRegister(DstReg)) { + Register DstReg = MI.getOperand(0).getReg(); + Register SrcReg = MI.getOperand(1).getReg(); + if (isNAPhysCopy(SrcReg) && Register::isVirtualRegister(DstReg)) { // %vreg = COPY %physreg // Avoid using a datastructure which can track multiple live non-allocatable // phys->virt copies since LLVM doesn't seem to do this. @@ -1454,7 +1452,7 @@ bool PeepholeOptimizer::foldRedundantNAPhysCopy( return false; } - if (!(TargetRegisterInfo::isVirtualRegister(SrcReg) && isNAPhysCopy(DstReg))) + if (!(Register::isVirtualRegister(SrcReg) && isNAPhysCopy(DstReg))) return false; // %physreg = COPY %vreg @@ -1467,7 +1465,7 @@ bool PeepholeOptimizer::foldRedundantNAPhysCopy( return false; } - unsigned PrevDstReg = PrevCopy->second->getOperand(0).getReg(); + Register PrevDstReg = PrevCopy->second->getOperand(0).getReg(); if (PrevDstReg == SrcReg) { // Remove the virt->phys copy: we saw the virtual register definition, and // the non-allocatable physical register's state hasn't changed since then. @@ -1489,7 +1487,7 @@ bool PeepholeOptimizer::foldRedundantNAPhysCopy( static bool isVirtualRegisterOperand(MachineOperand &MO) { if (!MO.isReg()) return false; - return TargetRegisterInfo::isVirtualRegister(MO.getReg()); + return Register::isVirtualRegister(MO.getReg()); } bool PeepholeOptimizer::findTargetRecurrence( @@ -1662,7 +1660,7 @@ bool PeepholeOptimizer::runOnMachineFunction(MachineFunction &MF) { for (const MachineOperand &MO : MI->operands()) { // Visit all operands: definitions can be implicit or explicit. 
if (MO.isReg()) { - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); if (MO.isDef() && isNAPhysCopy(Reg)) { const auto &Def = NAPhysToVirtMIs.find(Reg); if (Def != NAPhysToVirtMIs.end()) { @@ -1778,7 +1776,7 @@ bool PeepholeOptimizer::runOnMachineFunction(MachineFunction &MF) { LocalMIs.erase(DefMI); LocalMIs.insert(FoldMI); if (MI->isCall()) - MI->getMF()->updateCallSiteInfo(MI, FoldMI); + MI->getMF()->moveCallSiteInfo(MI, FoldMI); MI->eraseFromParent(); DefMI->eraseFromParent(); MRI->markUsesInDebugValueAsUndef(FoldedReg); @@ -1810,7 +1808,11 @@ ValueTrackerResult ValueTracker::getNextSourceFromCopy() { assert(Def->isCopy() && "Invalid definition"); // Copy instruction are supposed to be: Def = Src. // If someone breaks this assumption, bad things will happen everywhere. - assert(Def->getNumOperands() == 2 && "Invalid number of operands"); + // There may be implicit uses preventing the copy to be moved across + // some target specific register definitions + assert(Def->getNumOperands() - Def->getNumImplicitOperands() == 2 && + "Invalid number of operands"); + assert(!Def->hasImplicitDef() && "Only implicit uses are allowed"); if (Def->getOperand(DefIdx).getSubReg() != DefSubReg) // If we look for a different subreg, it means we want a subreg of src. @@ -1855,6 +1857,11 @@ ValueTrackerResult ValueTracker::getNextSourceFromBitcast() { SrcIdx = OpIdx; } + // In some rare case, Def has no input, SrcIdx is out of bound, + // getOperand(SrcIdx) will fail below. + if (SrcIdx >= Def->getNumOperands()) + return ValueTrackerResult(); + // Stop when any user of the bitcast is a SUBREG_TO_REG, replacing with a COPY // will break the assumed guarantees for the upper bits. for (const MachineInstr &UseMI : MRI.use_nodbg_instructions(DefOp.getReg())) { @@ -2087,7 +2094,7 @@ ValueTrackerResult ValueTracker::getNextSource() { // If we can still move up in the use-def chain, move to the next // definition. - if (!TargetRegisterInfo::isPhysicalRegister(Reg) && OneRegSrc) { + if (!Register::isPhysicalRegister(Reg) && OneRegSrc) { MachineRegisterInfo::def_iterator DI = MRI.def_begin(Reg); if (DI != MRI.def_end()) { Def = DI->getParent(); diff --git a/lib/CodeGen/PreISelIntrinsicLowering.cpp b/lib/CodeGen/PreISelIntrinsicLowering.cpp index 2752e186875c..0d2f6f99ca96 100644 --- a/lib/CodeGen/PreISelIntrinsicLowering.cpp +++ b/lib/CodeGen/PreISelIntrinsicLowering.cpp @@ -76,7 +76,7 @@ static bool lowerObjCCall(Function &F, const char *NewFn, } for (auto I = F.use_begin(), E = F.use_end(); I != E;) { - auto *CI = dyn_cast(I->getUser()); + auto *CI = cast(I->getUser()); assert(CI->getCalledFunction() && "Cannot lower an indirect call!"); ++I; diff --git a/lib/CodeGen/ProcessImplicitDefs.cpp b/lib/CodeGen/ProcessImplicitDefs.cpp index b38987ad1c90..11bff45f9ad5 100644 --- a/lib/CodeGen/ProcessImplicitDefs.cpp +++ b/lib/CodeGen/ProcessImplicitDefs.cpp @@ -73,9 +73,9 @@ bool ProcessImplicitDefs::canTurnIntoImplicitDef(MachineInstr *MI) { void ProcessImplicitDefs::processImplicitDef(MachineInstr *MI) { LLVM_DEBUG(dbgs() << "Processing " << *MI); - unsigned Reg = MI->getOperand(0).getReg(); + Register Reg = MI->getOperand(0).getReg(); - if (TargetRegisterInfo::isVirtualRegister(Reg)) { + if (Register::isVirtualRegister(Reg)) { // For virtual registers, mark all uses as , and convert users to // implicit-def when possible. 
for (MachineOperand &MO : MRI->use_nodbg_operands(Reg)) { @@ -100,8 +100,8 @@ void ProcessImplicitDefs::processImplicitDef(MachineInstr *MI) { for (MachineOperand &MO : UserMI->operands()) { if (!MO.isReg()) continue; - unsigned UserReg = MO.getReg(); - if (!TargetRegisterInfo::isPhysicalRegister(UserReg) || + Register UserReg = MO.getReg(); + if (!Register::isPhysicalRegister(UserReg) || !TRI->regsOverlap(Reg, UserReg)) continue; // UserMI uses or redefines Reg. Set flags on all uses. diff --git a/lib/CodeGen/PrologEpilogInserter.cpp b/lib/CodeGen/PrologEpilogInserter.cpp index d463bee67595..729f06dda62b 100644 --- a/lib/CodeGen/PrologEpilogInserter.cpp +++ b/lib/CodeGen/PrologEpilogInserter.cpp @@ -898,7 +898,7 @@ void PEI::calculateFrameObjectOffsets(MachineFunction &MF) { // frame index registers. Functions which don't want/need this optimization // will continue to use the existing code path. if (MFI.getUseLocalStackAllocationBlock()) { - unsigned Align = MFI.getLocalFrameMaxAlign(); + unsigned Align = MFI.getLocalFrameMaxAlign().value(); // Adjust to alignment boundary. Offset = alignTo(Offset, Align, Skew); diff --git a/lib/CodeGen/PseudoSourceValue.cpp b/lib/CodeGen/PseudoSourceValue.cpp index da3ef4b771f3..74e721dbd138 100644 --- a/lib/CodeGen/PseudoSourceValue.cpp +++ b/lib/CodeGen/PseudoSourceValue.cpp @@ -129,7 +129,7 @@ const PseudoSourceValue * PseudoSourceValueManager::getFixedStack(int FI) { std::unique_ptr &V = FSValues[FI]; if (!V) - V = llvm::make_unique(FI, TII); + V = std::make_unique(FI, TII); return V.get(); } @@ -138,7 +138,7 @@ PseudoSourceValueManager::getGlobalValueCallEntry(const GlobalValue *GV) { std::unique_ptr &E = GlobalCallEntries[GV]; if (!E) - E = llvm::make_unique(GV, TII); + E = std::make_unique(GV, TII); return E.get(); } @@ -147,6 +147,6 @@ PseudoSourceValueManager::getExternalSymbolCallEntry(const char *ES) { std::unique_ptr &E = ExternalCallEntries[ES]; if (!E) - E = llvm::make_unique(ES, TII); + E = std::make_unique(ES, TII); return E.get(); } diff --git a/lib/CodeGen/ReachingDefAnalysis.cpp b/lib/CodeGen/ReachingDefAnalysis.cpp index f05c97ad621e..2850033e6419 100644 --- a/lib/CodeGen/ReachingDefAnalysis.cpp +++ b/lib/CodeGen/ReachingDefAnalysis.cpp @@ -9,6 +9,7 @@ #include "llvm/CodeGen/ReachingDefAnalysis.h" #include "llvm/CodeGen/TargetRegisterInfo.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" +#include "llvm/Support/Debug.h" using namespace llvm; diff --git a/lib/CodeGen/RegAllocBase.cpp b/lib/CodeGen/RegAllocBase.cpp index 1cbe75c27d13..156daaa03bb5 100644 --- a/lib/CodeGen/RegAllocBase.cpp +++ b/lib/CodeGen/RegAllocBase.cpp @@ -73,7 +73,7 @@ void RegAllocBase::seedLiveRegs() { NamedRegionTimer T("seed", "Seed Live Regs", TimerGroupName, TimerGroupDescription, TimePassesIsEnabled); for (unsigned i = 0, e = MRI->getNumVirtRegs(); i != e; ++i) { - unsigned Reg = TargetRegisterInfo::index2VirtReg(i); + unsigned Reg = Register::index2VirtReg(i); if (MRI->reg_nodbg_empty(Reg)) continue; enqueue(&LIS->getInterval(Reg)); @@ -154,7 +154,7 @@ void RegAllocBase::allocatePhysRegs() { continue; } LLVM_DEBUG(dbgs() << "queuing new interval: " << *SplitVirtReg << "\n"); - assert(TargetRegisterInfo::isVirtualRegister(SplitVirtReg->reg) && + assert(Register::isVirtualRegister(SplitVirtReg->reg) && "expect split value in virtual register"); enqueue(SplitVirtReg); ++NumNewQueued; diff --git a/lib/CodeGen/RegAllocFast.cpp b/lib/CodeGen/RegAllocFast.cpp index 2ffa5e389f89..44d0233604e7 100644 --- a/lib/CodeGen/RegAllocFast.cpp +++ 
b/lib/CodeGen/RegAllocFast.cpp @@ -90,7 +90,7 @@ namespace { explicit LiveReg(unsigned VirtReg) : VirtReg(VirtReg) {} unsigned getSparseSetIndex() const { - return TargetRegisterInfo::virtReg2Index(VirtReg); + return Register::virtReg2Index(VirtReg); } }; @@ -200,11 +200,11 @@ namespace { void assignVirtToPhysReg(LiveReg &, MCPhysReg PhysReg); LiveRegMap::iterator findLiveVirtReg(unsigned VirtReg) { - return LiveVirtRegs.find(TargetRegisterInfo::virtReg2Index(VirtReg)); + return LiveVirtRegs.find(Register::virtReg2Index(VirtReg)); } LiveRegMap::const_iterator findLiveVirtReg(unsigned VirtReg) const { - return LiveVirtRegs.find(TargetRegisterInfo::virtReg2Index(VirtReg)); + return LiveVirtRegs.find(Register::virtReg2Index(VirtReg)); } void allocVirtReg(MachineInstr &MI, LiveReg &LR, unsigned Hint); @@ -264,7 +264,7 @@ int RegAllocFast::getStackSpaceFor(unsigned VirtReg) { /// Returns false if \p VirtReg is known to not live out of the current block. bool RegAllocFast::mayLiveOut(unsigned VirtReg) { - if (MayLiveAcrossBlocks.test(TargetRegisterInfo::virtReg2Index(VirtReg))) { + if (MayLiveAcrossBlocks.test(Register::virtReg2Index(VirtReg))) { // Cannot be live-out if there are no successors. return !MBB->succ_empty(); } @@ -272,7 +272,7 @@ bool RegAllocFast::mayLiveOut(unsigned VirtReg) { // If this block loops back to itself, it would be necessary to check whether // the use comes after the def. if (MBB->isSuccessor(MBB)) { - MayLiveAcrossBlocks.set(TargetRegisterInfo::virtReg2Index(VirtReg)); + MayLiveAcrossBlocks.set(Register::virtReg2Index(VirtReg)); return true; } @@ -282,7 +282,7 @@ bool RegAllocFast::mayLiveOut(unsigned VirtReg) { unsigned C = 0; for (const MachineInstr &UseInst : MRI->reg_nodbg_instructions(VirtReg)) { if (UseInst.getParent() != MBB || ++C >= Limit) { - MayLiveAcrossBlocks.set(TargetRegisterInfo::virtReg2Index(VirtReg)); + MayLiveAcrossBlocks.set(Register::virtReg2Index(VirtReg)); // Cannot be live-out if there are no successors. return !MBB->succ_empty(); } @@ -293,7 +293,7 @@ bool RegAllocFast::mayLiveOut(unsigned VirtReg) { /// Returns false if \p VirtReg is known to not be live into the current block. bool RegAllocFast::mayLiveIn(unsigned VirtReg) { - if (MayLiveAcrossBlocks.test(TargetRegisterInfo::virtReg2Index(VirtReg))) + if (MayLiveAcrossBlocks.test(Register::virtReg2Index(VirtReg))) return !MBB->pred_empty(); // See if the first \p Limit def of the register are all in the current block. @@ -301,7 +301,7 @@ bool RegAllocFast::mayLiveIn(unsigned VirtReg) { unsigned C = 0; for (const MachineInstr &DefInst : MRI->def_instructions(VirtReg)) { if (DefInst.getParent() != MBB || ++C >= Limit) { - MayLiveAcrossBlocks.set(TargetRegisterInfo::virtReg2Index(VirtReg)); + MayLiveAcrossBlocks.set(Register::virtReg2Index(VirtReg)); return !MBB->pred_empty(); } } @@ -394,7 +394,7 @@ void RegAllocFast::killVirtReg(LiveReg &LR) { /// Mark virtreg as no longer available. void RegAllocFast::killVirtReg(unsigned VirtReg) { - assert(TargetRegisterInfo::isVirtualRegister(VirtReg) && + assert(Register::isVirtualRegister(VirtReg) && "killVirtReg needs a virtual register"); LiveRegMap::iterator LRI = findLiveVirtReg(VirtReg); if (LRI != LiveVirtRegs.end() && LRI->PhysReg) @@ -405,7 +405,7 @@ void RegAllocFast::killVirtReg(unsigned VirtReg) { /// stack slot if needed. 
void RegAllocFast::spillVirtReg(MachineBasicBlock::iterator MI, unsigned VirtReg) { - assert(TargetRegisterInfo::isVirtualRegister(VirtReg) && + assert(Register::isVirtualRegister(VirtReg) && "Spilling a physical register is illegal!"); LiveRegMap::iterator LRI = findLiveVirtReg(VirtReg); assert(LRI != LiveVirtRegs.end() && LRI->PhysReg && @@ -455,9 +455,8 @@ void RegAllocFast::usePhysReg(MachineOperand &MO) { if (MO.isUndef()) return; - unsigned PhysReg = MO.getReg(); - assert(TargetRegisterInfo::isPhysicalRegister(PhysReg) && - "Bad usePhysReg operand"); + Register PhysReg = MO.getReg(); + assert(Register::isPhysicalRegister(PhysReg) && "Bad usePhysReg operand"); markRegUsedInInstr(PhysReg); switch (PhysRegState[PhysReg]) { @@ -626,9 +625,9 @@ unsigned RegAllocFast::traceCopyChain(unsigned Reg) const { static const unsigned ChainLengthLimit = 3; unsigned C = 0; do { - if (TargetRegisterInfo::isPhysicalRegister(Reg)) + if (Register::isPhysicalRegister(Reg)) return Reg; - assert(TargetRegisterInfo::isVirtualRegister(Reg)); + assert(Register::isVirtualRegister(Reg)); MachineInstr *VRegDef = MRI->getUniqueVRegDef(Reg); if (!VRegDef || !isCoalescable(*VRegDef)) @@ -646,7 +645,7 @@ unsigned RegAllocFast::traceCopies(unsigned VirtReg) const { unsigned C = 0; for (const MachineInstr &MI : MRI->def_instructions(VirtReg)) { if (isCoalescable(MI)) { - unsigned Reg = MI.getOperand(1).getReg(); + Register Reg = MI.getOperand(1).getReg(); Reg = traceCopyChain(Reg); if (Reg != 0) return Reg; @@ -662,7 +661,7 @@ unsigned RegAllocFast::traceCopies(unsigned VirtReg) const { void RegAllocFast::allocVirtReg(MachineInstr &MI, LiveReg &LR, unsigned Hint0) { const unsigned VirtReg = LR.VirtReg; - assert(TargetRegisterInfo::isVirtualRegister(VirtReg) && + assert(Register::isVirtualRegister(VirtReg) && "Can only allocate virtual registers"); const TargetRegisterClass &RC = *MRI->getRegClass(VirtReg); @@ -671,8 +670,8 @@ void RegAllocFast::allocVirtReg(MachineInstr &MI, LiveReg &LR, unsigned Hint0) { << " with hint " << printReg(Hint0, TRI) << '\n'); // Take hint when possible. - if (TargetRegisterInfo::isPhysicalRegister(Hint0) && - MRI->isAllocatable(Hint0) && RC.contains(Hint0)) { + if (Register::isPhysicalRegister(Hint0) && MRI->isAllocatable(Hint0) && + RC.contains(Hint0)) { // Ignore the hint if we would have to spill a dirty register. unsigned Cost = calcSpillCost(Hint0); if (Cost < spillDirty) { @@ -692,9 +691,8 @@ void RegAllocFast::allocVirtReg(MachineInstr &MI, LiveReg &LR, unsigned Hint0) { // Try other hint. unsigned Hint1 = traceCopies(VirtReg); - if (TargetRegisterInfo::isPhysicalRegister(Hint1) && - MRI->isAllocatable(Hint1) && RC.contains(Hint1) && - !isRegUsedInInstr(Hint1)) { + if (Register::isPhysicalRegister(Hint1) && MRI->isAllocatable(Hint1) && + RC.contains(Hint1) && !isRegUsedInInstr(Hint1)) { // Ignore the hint if we would have to spill a dirty register. 
unsigned Cost = calcSpillCost(Hint1); if (Cost < spillDirty) { @@ -752,8 +750,8 @@ void RegAllocFast::allocVirtReg(MachineInstr &MI, LiveReg &LR, unsigned Hint0) { void RegAllocFast::allocVirtRegUndef(MachineOperand &MO) { assert(MO.isUndef() && "expected undef use"); - unsigned VirtReg = MO.getReg(); - assert(TargetRegisterInfo::isVirtualRegister(VirtReg) && "Expected virtreg"); + Register VirtReg = MO.getReg(); + assert(Register::isVirtualRegister(VirtReg) && "Expected virtreg"); LiveRegMap::const_iterator LRI = findLiveVirtReg(VirtReg); MCPhysReg PhysReg; @@ -778,14 +776,13 @@ void RegAllocFast::allocVirtRegUndef(MachineOperand &MO) { /// Allocates a register for VirtReg and mark it as dirty. MCPhysReg RegAllocFast::defineVirtReg(MachineInstr &MI, unsigned OpNum, unsigned VirtReg, unsigned Hint) { - assert(TargetRegisterInfo::isVirtualRegister(VirtReg) && - "Not a virtual register"); + assert(Register::isVirtualRegister(VirtReg) && "Not a virtual register"); LiveRegMap::iterator LRI; bool New; std::tie(LRI, New) = LiveVirtRegs.insert(LiveReg(VirtReg)); if (!LRI->PhysReg) { // If there is no hint, peek at the only use of this register. - if ((!Hint || !TargetRegisterInfo::isPhysicalRegister(Hint)) && + if ((!Hint || !Register::isPhysicalRegister(Hint)) && MRI->hasOneNonDBGUse(VirtReg)) { const MachineInstr &UseMI = *MRI->use_instr_nodbg_begin(VirtReg); // It's a copy, use the destination register as a hint. @@ -812,8 +809,7 @@ RegAllocFast::LiveReg &RegAllocFast::reloadVirtReg(MachineInstr &MI, unsigned OpNum, unsigned VirtReg, unsigned Hint) { - assert(TargetRegisterInfo::isVirtualRegister(VirtReg) && - "Not a virtual register"); + assert(Register::isVirtualRegister(VirtReg) && "Not a virtual register"); LiveRegMap::iterator LRI; bool New; std::tie(LRI, New) = LiveVirtRegs.insert(LiveReg(VirtReg)); @@ -866,7 +862,7 @@ bool RegAllocFast::setPhysReg(MachineInstr &MI, MachineOperand &MO, } // Handle subregister index. - MO.setReg(PhysReg ? TRI->getSubReg(PhysReg, MO.getSubReg()) : 0); + MO.setReg(PhysReg ? 
TRI->getSubReg(PhysReg, MO.getSubReg()) : Register()); MO.setIsRenamable(true); MO.setSubReg(0); @@ -893,8 +889,8 @@ void RegAllocFast::handleThroughOperands(MachineInstr &MI, SmallSet ThroughRegs; for (const MachineOperand &MO : MI.operands()) { if (!MO.isReg()) continue; - unsigned Reg = MO.getReg(); - if (!TargetRegisterInfo::isVirtualRegister(Reg)) + Register Reg = MO.getReg(); + if (!Register::isVirtualRegister(Reg)) continue; if (MO.isEarlyClobber() || (MO.isUse() && MO.isTied()) || (MO.getSubReg() && MI.readsVirtualRegister(Reg))) { @@ -908,8 +904,9 @@ void RegAllocFast::handleThroughOperands(MachineInstr &MI, LLVM_DEBUG(dbgs() << "\nChecking for physdef collisions.\n"); for (const MachineOperand &MO : MI.operands()) { if (!MO.isReg() || !MO.isDef()) continue; - unsigned Reg = MO.getReg(); - if (!Reg || !TargetRegisterInfo::isPhysicalRegister(Reg)) continue; + Register Reg = MO.getReg(); + if (!Reg || !Register::isPhysicalRegister(Reg)) + continue; markRegUsedInInstr(Reg); for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) { if (ThroughRegs.count(PhysRegState[*AI])) @@ -922,8 +919,9 @@ void RegAllocFast::handleThroughOperands(MachineInstr &MI, for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) { MachineOperand &MO = MI.getOperand(I); if (!MO.isReg()) continue; - unsigned Reg = MO.getReg(); - if (!TargetRegisterInfo::isVirtualRegister(Reg)) continue; + Register Reg = MO.getReg(); + if (!Register::isVirtualRegister(Reg)) + continue; if (MO.isUse()) { if (!MO.isTied()) continue; LLVM_DEBUG(dbgs() << "Operand " << I << "(" << MO @@ -947,8 +945,9 @@ void RegAllocFast::handleThroughOperands(MachineInstr &MI, for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) { const MachineOperand &MO = MI.getOperand(I); if (!MO.isReg()) continue; - unsigned Reg = MO.getReg(); - if (!TargetRegisterInfo::isVirtualRegister(Reg)) continue; + Register Reg = MO.getReg(); + if (!Register::isVirtualRegister(Reg)) + continue; if (!MO.isEarlyClobber()) continue; // Note: defineVirtReg may invalidate MO. 
@@ -961,8 +960,9 @@ void RegAllocFast::handleThroughOperands(MachineInstr &MI, UsedInInstr.clear(); for (const MachineOperand &MO : MI.operands()) { if (!MO.isReg() || (MO.isDef() && !MO.isEarlyClobber())) continue; - unsigned Reg = MO.getReg(); - if (!Reg || !TargetRegisterInfo::isPhysicalRegister(Reg)) continue; + Register Reg = MO.getReg(); + if (!Reg || !Register::isPhysicalRegister(Reg)) + continue; LLVM_DEBUG(dbgs() << "\tSetting " << printReg(Reg, TRI) << " as used in instr\n"); markRegUsedInInstr(Reg); @@ -1002,10 +1002,8 @@ void RegAllocFast::dumpState() { e = LiveVirtRegs.end(); i != e; ++i) { if (!i->PhysReg) continue; - assert(TargetRegisterInfo::isVirtualRegister(i->VirtReg) && - "Bad map key"); - assert(TargetRegisterInfo::isPhysicalRegister(i->PhysReg) && - "Bad map value"); + assert(Register::isVirtualRegister(i->VirtReg) && "Bad map key"); + assert(Register::isPhysicalRegister(i->PhysReg) && "Bad map value"); assert(PhysRegState[i->PhysReg] == i->VirtReg && "Bad inverse map"); } } @@ -1045,9 +1043,9 @@ void RegAllocFast::allocateInstruction(MachineInstr &MI) { continue; } if (!MO.isReg()) continue; - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); if (!Reg) continue; - if (TargetRegisterInfo::isVirtualRegister(Reg)) { + if (Register::isVirtualRegister(Reg)) { VirtOpEnd = i+1; if (MO.isUse()) { hasTiedOps = hasTiedOps || @@ -1096,8 +1094,9 @@ void RegAllocFast::allocateInstruction(MachineInstr &MI) { for (unsigned I = 0; I != VirtOpEnd; ++I) { MachineOperand &MO = MI.getOperand(I); if (!MO.isReg()) continue; - unsigned Reg = MO.getReg(); - if (!TargetRegisterInfo::isVirtualRegister(Reg)) continue; + Register Reg = MO.getReg(); + if (!Register::isVirtualRegister(Reg)) + continue; if (MO.isUse()) { if (MO.isUndef()) { HasUndefUse = true; @@ -1124,8 +1123,8 @@ void RegAllocFast::allocateInstruction(MachineInstr &MI) { for (MachineOperand &MO : MI.uses()) { if (!MO.isReg() || !MO.isUse()) continue; - unsigned Reg = MO.getReg(); - if (!TargetRegisterInfo::isVirtualRegister(Reg)) + Register Reg = MO.getReg(); + if (!Register::isVirtualRegister(Reg)) continue; assert(MO.isUndef() && "Should only have undef virtreg uses left"); @@ -1139,8 +1138,9 @@ void RegAllocFast::allocateInstruction(MachineInstr &MI) { if (hasEarlyClobbers) { for (const MachineOperand &MO : MI.operands()) { if (!MO.isReg()) continue; - unsigned Reg = MO.getReg(); - if (!Reg || !TargetRegisterInfo::isPhysicalRegister(Reg)) continue; + Register Reg = MO.getReg(); + if (!Reg || !Register::isPhysicalRegister(Reg)) + continue; // Look for physreg defs and tied uses. if (!MO.isDef() && !MO.isTied()) continue; markRegUsedInInstr(Reg); @@ -1166,10 +1166,9 @@ void RegAllocFast::allocateInstruction(MachineInstr &MI) { const MachineOperand &MO = MI.getOperand(I); if (!MO.isReg() || !MO.isDef() || !MO.getReg() || MO.isEarlyClobber()) continue; - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); - if (!Reg || !TargetRegisterInfo::isPhysicalRegister(Reg) || - !MRI->isAllocatable(Reg)) + if (!Reg || !Register::isPhysicalRegister(Reg) || !MRI->isAllocatable(Reg)) continue; definePhysReg(MI, Reg, MO.isDead() ? regFree : regReserved); } @@ -1180,10 +1179,10 @@ void RegAllocFast::allocateInstruction(MachineInstr &MI) { const MachineOperand &MO = MI.getOperand(I); if (!MO.isReg() || !MO.isDef() || !MO.getReg() || MO.isEarlyClobber()) continue; - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); // We have already dealt with phys regs in the previous scan. 
- if (TargetRegisterInfo::isPhysicalRegister(Reg)) + if (Register::isPhysicalRegister(Reg)) continue; MCPhysReg PhysReg = defineVirtReg(MI, I, Reg, CopySrcReg); if (setPhysReg(MI, MI.getOperand(I), PhysReg)) { @@ -1215,8 +1214,8 @@ void RegAllocFast::handleDebugValue(MachineInstr &MI) { // mostly constants and frame indices. if (!MO.isReg()) return; - unsigned Reg = MO.getReg(); - if (!TargetRegisterInfo::isVirtualRegister(Reg)) + Register Reg = MO.getReg(); + if (!Register::isVirtualRegister(Reg)) return; // See if this virtual register has already been allocated to a physical diff --git a/lib/CodeGen/RegAllocGreedy.cpp b/lib/CodeGen/RegAllocGreedy.cpp index 771fc46415db..d27db678f02a 100644 --- a/lib/CodeGen/RegAllocGreedy.cpp +++ b/lib/CodeGen/RegAllocGreedy.cpp @@ -685,7 +685,7 @@ void RAGreedy::enqueue(PQueue &CurQueue, LiveInterval *LI) { // The queue holds (size, reg) pairs. const unsigned Size = LI->getSize(); const unsigned Reg = LI->reg; - assert(TargetRegisterInfo::isVirtualRegister(Reg) && + assert(Register::isVirtualRegister(Reg) && "Can only enqueue virtual registers"); unsigned Prio; @@ -899,7 +899,7 @@ bool RAGreedy::canEvictInterference(LiveInterval &VirtReg, unsigned PhysReg, // Check if any interfering live range is heavier than MaxWeight. for (unsigned i = Q.interferingVRegs().size(); i; --i) { LiveInterval *Intf = Q.interferingVRegs()[i - 1]; - assert(TargetRegisterInfo::isVirtualRegister(Intf->reg) && + assert(Register::isVirtualRegister(Intf->reg) && "Only expecting virtual register interference from query"); // Do not allow eviction of a virtual register if we are in the middle @@ -984,7 +984,7 @@ bool RAGreedy::canEvictInterferenceInRange(LiveInterval &VirtReg, continue; // Cannot evict non virtual reg interference. - if (!TargetRegisterInfo::isVirtualRegister(Intf->reg)) + if (!Register::isVirtualRegister(Intf->reg)) return false; // Never evict spill products. They cannot split or spill. if (getStage(*Intf) == RS_Done) @@ -2881,7 +2881,7 @@ void RAGreedy::collectHintInfo(unsigned Reg, HintsInfo &Out) { continue; } // Get the current assignment. - Register OtherPhysReg = TargetRegisterInfo::isPhysicalRegister(OtherReg) + Register OtherPhysReg = Register::isPhysicalRegister(OtherReg) ? OtherReg : VRM->getPhys(OtherReg); // Push the collected information. @@ -2919,7 +2919,7 @@ void RAGreedy::tryHintRecoloring(LiveInterval &VirtReg) { SmallVector RecoloringCandidates; HintsInfo Info; unsigned Reg = VirtReg.reg; - unsigned PhysReg = VRM->getPhys(Reg); + Register PhysReg = VRM->getPhys(Reg); // Start the recoloring algorithm from the input live-interval, then // it will propagate to the ones that are copy-related with it. Visited.insert(Reg); @@ -2932,7 +2932,7 @@ void RAGreedy::tryHintRecoloring(LiveInterval &VirtReg) { Reg = RecoloringCandidates.pop_back_val(); // We cannot recolor physical register. - if (TargetRegisterInfo::isPhysicalRegister(Reg)) + if (Register::isPhysicalRegister(Reg)) continue; assert(VRM->hasPhys(Reg) && "We have unallocated variable!!"); @@ -2940,7 +2940,7 @@ void RAGreedy::tryHintRecoloring(LiveInterval &VirtReg) { // Get the live interval mapped with this virtual register to be able // to check for the interference with the new color. LiveInterval &LI = LIS->getInterval(Reg); - unsigned CurrPhys = VRM->getPhys(Reg); + Register CurrPhys = VRM->getPhys(Reg); // Check that the new color matches the register class constraints and // that it is free for this live range. 
if (CurrPhys != PhysReg && (!MRI->getRegClass(Reg)->contains(PhysReg) || @@ -3021,7 +3021,7 @@ void RAGreedy::tryHintRecoloring(LiveInterval &VirtReg) { /// getting rid of 2 copies. void RAGreedy::tryHintsRecoloring() { for (LiveInterval *LI : SetOfBrokenHints) { - assert(TargetRegisterInfo::isVirtualRegister(LI->reg) && + assert(Register::isVirtualRegister(LI->reg) && "Recoloring is possible only for virtual registers"); // Some dead defs may be around (e.g., because of debug uses). // Ignore those. diff --git a/lib/CodeGen/RegAllocPBQP.cpp b/lib/CodeGen/RegAllocPBQP.cpp index 7a5a6c148ed4..3c4a46b12f99 100644 --- a/lib/CodeGen/RegAllocPBQP.cpp +++ b/lib/CodeGen/RegAllocPBQP.cpp @@ -558,7 +558,7 @@ void RegAllocPBQP::findVRegIntervalsToAlloc(const MachineFunction &MF, // Iterate over all live ranges. for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) { - unsigned Reg = TargetRegisterInfo::index2VirtReg(I); + unsigned Reg = Register::index2VirtReg(I); if (MRI.reg_nodbg_empty(Reg)) continue; VRegsToAlloc.insert(Reg); @@ -824,11 +824,11 @@ bool RegAllocPBQP::runOnMachineFunction(MachineFunction &MF) { if (!VRegsToAlloc.empty()) { const TargetSubtargetInfo &Subtarget = MF.getSubtarget(); std::unique_ptr ConstraintsRoot = - llvm::make_unique(); - ConstraintsRoot->addConstraint(llvm::make_unique()); - ConstraintsRoot->addConstraint(llvm::make_unique()); + std::make_unique(); + ConstraintsRoot->addConstraint(std::make_unique()); + ConstraintsRoot->addConstraint(std::make_unique()); if (PBQPCoalescing) - ConstraintsRoot->addConstraint(llvm::make_unique()); + ConstraintsRoot->addConstraint(std::make_unique()); ConstraintsRoot->addConstraint(Subtarget.getCustomPBQPConstraints()); bool PBQPAllocComplete = false; @@ -848,7 +848,7 @@ bool RegAllocPBQP::runOnMachineFunction(MachineFunction &MF) { std::string GraphFileName = FullyQualifiedName + "." + RS.str() + ".pbqpgraph"; std::error_code EC; - raw_fd_ostream OS(GraphFileName, EC, sys::fs::F_Text); + raw_fd_ostream OS(GraphFileName, EC, sys::fs::OF_Text); LLVM_DEBUG(dbgs() << "Dumping graph for round " << Round << " to \"" << GraphFileName << "\"\n"); G.dump(OS); diff --git a/lib/CodeGen/RegUsageInfoCollector.cpp b/lib/CodeGen/RegUsageInfoCollector.cpp index b37dfada7101..757ff0e44953 100644 --- a/lib/CodeGen/RegUsageInfoCollector.cpp +++ b/lib/CodeGen/RegUsageInfoCollector.cpp @@ -142,6 +142,13 @@ bool RegUsageInfoCollector::runOnMachineFunction(MachineFunction &MF) { auto SetRegAsDefined = [&RegMask] (unsigned Reg) { RegMask[Reg / 32] &= ~(1u << Reg % 32); }; + + // Some targets can clobber registers "inside" a call, typically in + // linker-generated code. + for (const MCPhysReg Reg : TRI->getIntraCallClobberedRegs(&MF)) + for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) + SetRegAsDefined(*AI); + // Scan all the physical registers. When a register is defined in the current // function set it and all the aliasing registers as defined in the regmask. // FIXME: Rewrite to use regunits. 
@@ -164,7 +171,8 @@ bool RegUsageInfoCollector::runOnMachineFunction(MachineFunction &MF) { SetRegAsDefined(PReg); } - if (TargetFrameLowering::isSafeForNoCSROpt(F)) { + if (TargetFrameLowering::isSafeForNoCSROpt(F) && + MF.getSubtarget().getFrameLowering()->isProfitableForNoCSROpt(F)) { ++NumCSROpt; LLVM_DEBUG(dbgs() << MF.getName() << " function optimized for not having CSR.\n"); diff --git a/lib/CodeGen/RegUsageInfoPropagate.cpp b/lib/CodeGen/RegUsageInfoPropagate.cpp index fc4be82d215e..0205e6193741 100644 --- a/lib/CodeGen/RegUsageInfoPropagate.cpp +++ b/lib/CodeGen/RegUsageInfoPropagate.cpp @@ -130,7 +130,11 @@ bool RegUsageInfoPropagation::runOnMachineFunction(MachineFunction &MF) { }; if (const Function *F = findCalledFunction(M, MI)) { - UpdateRegMask(*F); + if (F->isDefinitionExact()) { + UpdateRegMask(*F); + } else { + LLVM_DEBUG(dbgs() << "Function definition is not exact\n"); + } } else { LLVM_DEBUG(dbgs() << "Failed to find call target function\n"); } diff --git a/lib/CodeGen/RegisterCoalescer.cpp b/lib/CodeGen/RegisterCoalescer.cpp index 2db6ab454cea..6ff5ddbc023d 100644 --- a/lib/CodeGen/RegisterCoalescer.cpp +++ b/lib/CodeGen/RegisterCoalescer.cpp @@ -406,8 +406,8 @@ bool CoalescerPair::setRegisters(const MachineInstr *MI) { Partial = SrcSub || DstSub; // If one register is a physreg, it must be Dst. - if (TargetRegisterInfo::isPhysicalRegister(Src)) { - if (TargetRegisterInfo::isPhysicalRegister(Dst)) + if (Register::isPhysicalRegister(Src)) { + if (Register::isPhysicalRegister(Dst)) return false; std::swap(Src, Dst); std::swap(SrcSub, DstSub); @@ -416,7 +416,7 @@ bool CoalescerPair::setRegisters(const MachineInstr *MI) { const MachineRegisterInfo &MRI = MI->getMF()->getRegInfo(); - if (TargetRegisterInfo::isPhysicalRegister(Dst)) { + if (Register::isPhysicalRegister(Dst)) { // Eliminate DstSub on a physreg. if (DstSub) { Dst = TRI.getSubReg(Dst, DstSub); @@ -474,8 +474,8 @@ bool CoalescerPair::setRegisters(const MachineInstr *MI) { CrossClass = NewRC != DstRC || NewRC != SrcRC; } // Check our invariants - assert(TargetRegisterInfo::isVirtualRegister(Src) && "Src must be virtual"); - assert(!(TargetRegisterInfo::isPhysicalRegister(Dst) && DstSub) && + assert(Register::isVirtualRegister(Src) && "Src must be virtual"); + assert(!(Register::isPhysicalRegister(Dst) && DstSub) && "Cannot have a physical SubIdx"); SrcReg = Src; DstReg = Dst; @@ -483,7 +483,7 @@ bool CoalescerPair::setRegisters(const MachineInstr *MI) { } bool CoalescerPair::flip() { - if (TargetRegisterInfo::isPhysicalRegister(DstReg)) + if (Register::isPhysicalRegister(DstReg)) return false; std::swap(SrcReg, DstReg); std::swap(SrcIdx, DstIdx); @@ -507,8 +507,8 @@ bool CoalescerPair::isCoalescable(const MachineInstr *MI) const { } // Now check that Dst matches DstReg. - if (TargetRegisterInfo::isPhysicalRegister(DstReg)) { - if (!TargetRegisterInfo::isPhysicalRegister(Dst)) + if (Register::isPhysicalRegister(DstReg)) { + if (!Register::isPhysicalRegister(Dst)) return false; assert(!DstIdx && !SrcIdx && "Inconsistent CoalescerPair state."); // DstSub could be set for a physreg from INSERT_SUBREG. 
@@ -802,7 +802,7 @@ RegisterCoalescer::removeCopyByCommutingDef(const CoalescerPair &CP, return { false, false }; MachineOperand &NewDstMO = DefMI->getOperand(NewDstIdx); - unsigned NewReg = NewDstMO.getReg(); + Register NewReg = NewDstMO.getReg(); if (NewReg != IntB.reg || !IntB.Query(AValNo->def).isKill()) return { false, false }; @@ -835,8 +835,8 @@ RegisterCoalescer::removeCopyByCommutingDef(const CoalescerPair &CP, TII->commuteInstruction(*DefMI, false, UseOpIdx, NewDstIdx); if (!NewMI) return { false, false }; - if (TargetRegisterInfo::isVirtualRegister(IntA.reg) && - TargetRegisterInfo::isVirtualRegister(IntB.reg) && + if (Register::isVirtualRegister(IntA.reg) && + Register::isVirtualRegister(IntB.reg) && !MRI->constrainRegClass(IntB.reg, MRI->getRegClass(IntA.reg))) return { false, false }; if (NewMI != DefMI) { @@ -877,7 +877,7 @@ RegisterCoalescer::removeCopyByCommutingDef(const CoalescerPair &CP, continue; // Kill flags are no longer accurate. They are recomputed after RA. UseMO.setIsKill(false); - if (TargetRegisterInfo::isPhysicalRegister(NewReg)) + if (Register::isPhysicalRegister(NewReg)) UseMO.substPhysReg(NewReg, *TRI); else UseMO.setReg(NewReg); @@ -1188,7 +1188,7 @@ bool RegisterCoalescer::removePartialRedundancy(const CoalescerPair &CP, /// Returns true if @p MI defines the full vreg @p Reg, as opposed to just /// defining a subregister. static bool definesFullReg(const MachineInstr &MI, unsigned Reg) { - assert(!TargetRegisterInfo::isPhysicalRegister(Reg) && + assert(!Register::isPhysicalRegister(Reg) && "This code cannot handle physreg aliasing"); for (const MachineOperand &Op : MI.operands()) { if (!Op.isReg() || !Op.isDef() || Op.getReg() != Reg) @@ -1209,7 +1209,7 @@ bool RegisterCoalescer::reMaterializeTrivialDef(const CoalescerPair &CP, unsigned SrcIdx = CP.isFlipped() ? CP.getDstIdx() : CP.getSrcIdx(); unsigned DstReg = CP.isFlipped() ? CP.getSrcReg() : CP.getDstReg(); unsigned DstIdx = CP.isFlipped() ? CP.getSrcIdx() : CP.getDstIdx(); - if (TargetRegisterInfo::isPhysicalRegister(SrcReg)) + if (Register::isPhysicalRegister(SrcReg)) return false; LiveInterval &SrcInt = LIS->getInterval(SrcReg); @@ -1240,7 +1240,7 @@ bool RegisterCoalescer::reMaterializeTrivialDef(const CoalescerPair &CP, return false; // Only support subregister destinations when the def is read-undef. MachineOperand &DstOperand = CopyMI->getOperand(0); - unsigned CopyDstReg = DstOperand.getReg(); + Register CopyDstReg = DstOperand.getReg(); if (DstOperand.getSubReg() && !DstOperand.isUndef()) return false; @@ -1254,7 +1254,7 @@ bool RegisterCoalescer::reMaterializeTrivialDef(const CoalescerPair &CP, const TargetRegisterClass *DefRC = TII->getRegClass(MCID, 0, TRI, *MF); if (!DefMI->isImplicitDef()) { - if (TargetRegisterInfo::isPhysicalRegister(DstReg)) { + if (Register::isPhysicalRegister(DstReg)) { unsigned NewDstReg = DstReg; unsigned NewDstIdx = TRI->composeSubRegIndices(CP.getSrcIdx(), @@ -1269,7 +1269,7 @@ bool RegisterCoalescer::reMaterializeTrivialDef(const CoalescerPair &CP, } else { // Theoretically, some stack frame reference could exist. Just make sure // it hasn't actually happened. - assert(TargetRegisterInfo::isVirtualRegister(DstReg) && + assert(Register::isVirtualRegister(DstReg) && "Only expect to deal with virtual or physical registers"); } } @@ -1317,7 +1317,7 @@ bool RegisterCoalescer::reMaterializeTrivialDef(const CoalescerPair &CP, if (MO.isReg()) { assert(MO.isImplicit() && "No explicit operands after implicit operands."); // Discard VReg implicit defs. 
- if (TargetRegisterInfo::isPhysicalRegister(MO.getReg())) + if (Register::isPhysicalRegister(MO.getReg())) ImplicitOps.push_back(MO); } } @@ -1336,12 +1336,12 @@ bool RegisterCoalescer::reMaterializeTrivialDef(const CoalescerPair &CP, MachineOperand &MO = NewMI.getOperand(i); if (MO.isReg() && MO.isDef()) { assert(MO.isImplicit() && MO.isDead() && - TargetRegisterInfo::isPhysicalRegister(MO.getReg())); + Register::isPhysicalRegister(MO.getReg())); NewMIImplDefs.push_back(MO.getReg()); } } - if (TargetRegisterInfo::isVirtualRegister(DstReg)) { + if (Register::isVirtualRegister(DstReg)) { unsigned NewIdx = NewMI.getOperand(0).getSubReg(); if (DefRC != nullptr) { @@ -1428,7 +1428,7 @@ bool RegisterCoalescer::reMaterializeTrivialDef(const CoalescerPair &CP, } else if (NewMI.getOperand(0).getReg() != CopyDstReg) { // The New instruction may be defining a sub-register of what's actually // been asked for. If so it must implicitly define the whole thing. - assert(TargetRegisterInfo::isPhysicalRegister(DstReg) && + assert(Register::isPhysicalRegister(DstReg) && "Only expect virtual or physical registers in remat"); NewMI.getOperand(0).setIsDead(true); NewMI.addOperand(MachineOperand::CreateReg( @@ -1480,7 +1480,7 @@ bool RegisterCoalescer::reMaterializeTrivialDef(const CoalescerPair &CP, for (MachineOperand &UseMO : MRI->use_operands(SrcReg)) { MachineInstr *UseMI = UseMO.getParent(); if (UseMI->isDebugValue()) { - if (TargetRegisterInfo::isPhysicalRegister(DstReg)) + if (Register::isPhysicalRegister(DstReg)) UseMO.substPhysReg(DstReg, *TRI); else UseMO.setReg(DstReg); @@ -1651,7 +1651,7 @@ void RegisterCoalescer::addUndefFlag(const LiveInterval &Int, SlotIndex UseIdx, void RegisterCoalescer::updateRegDefsUses(unsigned SrcReg, unsigned DstReg, unsigned SubIdx) { - bool DstIsPhys = TargetRegisterInfo::isPhysicalRegister(DstReg); + bool DstIsPhys = Register::isPhysicalRegister(DstReg); LiveInterval *DstInt = DstIsPhys ? 
nullptr : &LIS->getInterval(DstReg); if (DstInt && DstInt->hasSubRanges() && DstReg != SrcReg) { @@ -2411,8 +2411,8 @@ std::pair JoinVals::followCopyChain( assert(MI && "No defining instruction"); if (!MI->isFullCopy()) return std::make_pair(VNI, TrackReg); - unsigned SrcReg = MI->getOperand(1).getReg(); - if (!TargetRegisterInfo::isVirtualRegister(SrcReg)) + Register SrcReg = MI->getOperand(1).getReg(); + if (!Register::isVirtualRegister(SrcReg)) return std::make_pair(VNI, TrackReg); const LiveInterval &LI = LIS->getInterval(SrcReg); @@ -3189,9 +3189,9 @@ void JoinVals::eraseInstrs(SmallPtrSetImpl &ErasedInstrs, MachineInstr *MI = Indexes->getInstructionFromIndex(Def); assert(MI && "No instruction to erase"); if (MI->isCopy()) { - unsigned Reg = MI->getOperand(1).getReg(); - if (TargetRegisterInfo::isVirtualRegister(Reg) && - Reg != CP.getSrcReg() && Reg != CP.getDstReg()) + Register Reg = MI->getOperand(1).getReg(); + if (Register::isVirtualRegister(Reg) && Reg != CP.getSrcReg() && + Reg != CP.getDstReg()) ShrinkRegs.push_back(Reg); } ErasedInstrs.insert(MI); @@ -3463,10 +3463,10 @@ static bool isLocalCopy(MachineInstr *Copy, const LiveIntervals *LIS) { if (Copy->getOperand(1).isUndef()) return false; - unsigned SrcReg = Copy->getOperand(1).getReg(); - unsigned DstReg = Copy->getOperand(0).getReg(); - if (TargetRegisterInfo::isPhysicalRegister(SrcReg) - || TargetRegisterInfo::isPhysicalRegister(DstReg)) + Register SrcReg = Copy->getOperand(1).getReg(); + Register DstReg = Copy->getOperand(0).getReg(); + if (Register::isPhysicalRegister(SrcReg) || + Register::isPhysicalRegister(DstReg)) return false; return LIS->intervalIsInOneMBB(LIS->getInterval(SrcReg)) @@ -3526,12 +3526,11 @@ bool RegisterCoalescer::applyTerminalRule(const MachineInstr &Copy) const { if (!isMoveInstr(*TRI, &Copy, SrcReg, DstReg, SrcSubReg, DstSubReg)) return false; // Check if the destination of this copy has any other affinity. - if (TargetRegisterInfo::isPhysicalRegister(DstReg) || + if (Register::isPhysicalRegister(DstReg) || // If SrcReg is a physical register, the copy won't be coalesced. // Ignoring it may have other side effect (like missing // rematerialization). So keep it. - TargetRegisterInfo::isPhysicalRegister(SrcReg) || - !isTerminalReg(DstReg, Copy, MRI)) + Register::isPhysicalRegister(SrcReg) || !isTerminalReg(DstReg, Copy, MRI)) return false; // DstReg is a terminal node. Check if it interferes with any other @@ -3554,7 +3553,7 @@ bool RegisterCoalescer::applyTerminalRule(const MachineInstr &Copy) const { if (OtherReg == SrcReg) OtherReg = OtherSrcReg; // Check if OtherReg is a non-terminal. - if (TargetRegisterInfo::isPhysicalRegister(OtherReg) || + if (Register::isPhysicalRegister(OtherReg) || isTerminalReg(OtherReg, MI, MRI)) continue; // Check that OtherReg interfere with DstReg. 
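The bulk of the churn in the register-allocator and coalescer hunks above is mechanical: unsigned register locals become Register, and the static TargetRegisterInfo::isVirtualRegister / isPhysicalRegister / index2VirtReg / virtReg2Index helpers are replaced by the equivalents on the Register class. The stand-alone sketch below is only an illustrative model of that encoding, assuming the usual scheme in which virtual register numbers occupy the top half of the 32-bit space; it is not the actual llvm/CodeGen/Register.h header.

// register_model.cpp -- minimal stand-in for llvm::Register, for illustration only.
#include <cassert>
#include <iostream>

class Register {
  unsigned Reg = 0;
  // Assumed encoding: virtual register numbers have the top bit set,
  // physical register numbers do not.
  static const unsigned FirstVirtual = 1u << 31;

public:
  Register(unsigned R = 0) : Reg(R) {}

  static bool isVirtualRegister(unsigned R) { return (R & FirstVirtual) != 0; }
  static bool isPhysicalRegister(unsigned R) { return R != 0 && (R & FirstVirtual) == 0; }
  static unsigned index2VirtReg(unsigned Index) { return Index | FirstVirtual; }
  static unsigned virtReg2Index(unsigned R) {
    assert(isVirtualRegister(R) && "not a virtual register");
    return R & ~FirstVirtual;
  }

  bool isVirtual() const { return isVirtualRegister(Reg); }
  bool isPhysical() const { return isPhysicalRegister(Reg); }

  // Implicit conversion keeps old call sites that expect a raw unsigned working.
  operator unsigned() const { return Reg; }
};

int main() {
  Register V = Register::index2VirtReg(5); // the sixth virtual register
  Register P = 42;                         // some physical register number
  std::cout << V.isVirtual() << ' ' << Register::virtReg2Index(V) << '\n'; // 1 5
  std::cout << P.isPhysical() << '\n';                                     // 1
}

Because the class converts implicitly to and from unsigned, a variable's declared type can change without touching the expressions that use it, which is why most of these hunks are one-line substitutions.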
diff --git a/lib/CodeGen/RegisterPressure.cpp b/lib/CodeGen/RegisterPressure.cpp index 7d9b3aa9b2d7..bf192d1c530d 100644 --- a/lib/CodeGen/RegisterPressure.cpp +++ b/lib/CodeGen/RegisterPressure.cpp @@ -134,6 +134,22 @@ void PressureDiff::dump(const TargetRegisterInfo &TRI) const { } dbgs() << '\n'; } + +LLVM_DUMP_METHOD +void PressureChange::dump() const { + dbgs() << "[" << getPSetOrMax() << ", " << getUnitInc() << "]\n"; +} + +void RegPressureDelta::dump() const { + dbgs() << "[Excess="; + Excess.dump(); + dbgs() << ", CriticalMax="; + CriticalMax.dump(); + dbgs() << ", CurrentMax="; + CurrentMax.dump(); + dbgs() << "]\n"; +} + #endif void RegPressureTracker::increaseRegPressure(unsigned RegUnit, @@ -219,7 +235,7 @@ void LiveRegSet::clear() { } static const LiveRange *getLiveRange(const LiveIntervals &LIS, unsigned Reg) { - if (TargetRegisterInfo::isVirtualRegister(Reg)) + if (Register::isVirtualRegister(Reg)) return &LIS.getInterval(Reg); return LIS.getCachedRegUnit(Reg); } @@ -345,7 +361,7 @@ void RegPressureTracker::initLiveThru(const RegPressureTracker &RPTracker) { assert(isBottomClosed() && "need bottom-up tracking to intialize."); for (const RegisterMaskPair &Pair : P.LiveOutRegs) { unsigned RegUnit = Pair.RegUnit; - if (TargetRegisterInfo::isVirtualRegister(RegUnit) + if (Register::isVirtualRegister(RegUnit) && !RPTracker.hasUntiedDef(RegUnit)) increaseSetPressure(LiveThruPressure, *MRI, RegUnit, LaneBitmask::getNone(), Pair.LaneMask); @@ -406,7 +422,7 @@ static LaneBitmask getLanesWithProperty(const LiveIntervals &LIS, const MachineRegisterInfo &MRI, bool TrackLaneMasks, unsigned RegUnit, SlotIndex Pos, LaneBitmask SafeDefault, bool(*Property)(const LiveRange &LR, SlotIndex Pos)) { - if (TargetRegisterInfo::isVirtualRegister(RegUnit)) { + if (Register::isVirtualRegister(RegUnit)) { const LiveInterval &LI = LIS.getInterval(RegUnit); LaneBitmask Result; if (TrackLaneMasks && LI.hasSubRanges()) { @@ -483,7 +499,7 @@ class RegisterOperandsCollector { void collectOperand(const MachineOperand &MO) const { if (!MO.isReg() || !MO.getReg()) return; - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); if (MO.isUse()) { if (!MO.isUndef() && !MO.isInternalRead()) pushReg(Reg, RegOpers.Uses); @@ -503,7 +519,7 @@ class RegisterOperandsCollector { void pushReg(unsigned Reg, SmallVectorImpl &RegUnits) const { - if (TargetRegisterInfo::isVirtualRegister(Reg)) { + if (Register::isVirtualRegister(Reg)) { addRegLanes(RegUnits, RegisterMaskPair(Reg, LaneBitmask::getAll())); } else if (MRI.isAllocatable(Reg)) { for (MCRegUnitIterator Units(Reg, &TRI); Units.isValid(); ++Units) @@ -514,7 +530,7 @@ class RegisterOperandsCollector { void collectOperandLanes(const MachineOperand &MO) const { if (!MO.isReg() || !MO.getReg()) return; - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); unsigned SubRegIdx = MO.getSubReg(); if (MO.isUse()) { if (!MO.isUndef() && !MO.isInternalRead()) @@ -535,7 +551,7 @@ class RegisterOperandsCollector { void pushRegLanes(unsigned Reg, unsigned SubRegIdx, SmallVectorImpl &RegUnits) const { - if (TargetRegisterInfo::isVirtualRegister(Reg)) { + if (Register::isVirtualRegister(Reg)) { LaneBitmask LaneMask = SubRegIdx != 0 ? TRI.getSubRegIndexLaneMask(SubRegIdx) : MRI.getMaxLaneMaskForVReg(Reg); @@ -590,7 +606,7 @@ void RegisterOperands::adjustLaneLiveness(const LiveIntervals &LIS, // If the def is all that is live after the instruction, then in case // of a subregister def we need a read-undef flag. 
unsigned RegUnit = I->RegUnit; - if (TargetRegisterInfo::isVirtualRegister(RegUnit) && + if (Register::isVirtualRegister(RegUnit) && AddFlagsMI != nullptr && (LiveAfter & ~I->LaneMask).none()) AddFlagsMI->setRegisterDefReadUndef(RegUnit); @@ -616,7 +632,7 @@ void RegisterOperands::adjustLaneLiveness(const LiveIntervals &LIS, if (AddFlagsMI != nullptr) { for (const RegisterMaskPair &P : DeadDefs) { unsigned RegUnit = P.RegUnit; - if (!TargetRegisterInfo::isVirtualRegister(RegUnit)) + if (!Register::isVirtualRegister(RegUnit)) continue; LaneBitmask LiveAfter = getLiveLanesAt(LIS, MRI, true, RegUnit, Pos.getDeadSlot()); @@ -825,7 +841,7 @@ void RegPressureTracker::recede(const RegisterOperands &RegOpers, if (TrackUntiedDefs) { for (const RegisterMaskPair &Def : RegOpers.Defs) { unsigned RegUnit = Def.RegUnit; - if (TargetRegisterInfo::isVirtualRegister(RegUnit) && + if (Register::isVirtualRegister(RegUnit) && (LiveRegs.contains(RegUnit) & Def.LaneMask).none()) UntiedDefs.insert(RegUnit); } diff --git a/lib/CodeGen/RegisterScavenging.cpp b/lib/CodeGen/RegisterScavenging.cpp index bb19110e6d70..ec0868acab38 100644 --- a/lib/CodeGen/RegisterScavenging.cpp +++ b/lib/CodeGen/RegisterScavenging.cpp @@ -49,7 +49,7 @@ using namespace llvm; STATISTIC(NumScavengedRegs, "Number of frame index regs scavenged"); -void RegScavenger::setRegUsed(unsigned Reg, LaneBitmask LaneMask) { +void RegScavenger::setRegUsed(Register Reg, LaneBitmask LaneMask) { LiveUnits.addRegMasked(Reg, LaneMask); } @@ -96,12 +96,12 @@ void RegScavenger::enterBasicBlockEnd(MachineBasicBlock &MBB) { } } -void RegScavenger::addRegUnits(BitVector &BV, unsigned Reg) { +void RegScavenger::addRegUnits(BitVector &BV, Register Reg) { for (MCRegUnitIterator RUI(Reg, TRI); RUI.isValid(); ++RUI) BV.set(*RUI); } -void RegScavenger::removeRegUnits(BitVector &BV, unsigned Reg) { +void RegScavenger::removeRegUnits(BitVector &BV, Register Reg) { for (MCRegUnitIterator RUI(Reg, TRI); RUI.isValid(); ++RUI) BV.reset(*RUI); } @@ -133,8 +133,8 @@ void RegScavenger::determineKillsAndDefs() { } if (!MO.isReg()) continue; - unsigned Reg = MO.getReg(); - if (!TargetRegisterInfo::isPhysicalRegister(Reg) || isReserved(Reg)) + Register Reg = MO.getReg(); + if (!Register::isPhysicalRegister(Reg) || isReserved(Reg)) continue; if (MO.isUse()) { @@ -204,8 +204,8 @@ void RegScavenger::forward() { for (const MachineOperand &MO : MI.operands()) { if (!MO.isReg()) continue; - unsigned Reg = MO.getReg(); - if (!TargetRegisterInfo::isPhysicalRegister(Reg) || isReserved(Reg)) + Register Reg = MO.getReg(); + if (!Register::isPhysicalRegister(Reg) || isReserved(Reg)) continue; if (MO.isUse()) { if (MO.isUndef()) @@ -278,14 +278,14 @@ void RegScavenger::backward() { --MBBI; } -bool RegScavenger::isRegUsed(unsigned Reg, bool includeReserved) const { +bool RegScavenger::isRegUsed(Register Reg, bool includeReserved) const { if (isReserved(Reg)) return includeReserved; return !LiveUnits.available(Reg); } -unsigned RegScavenger::FindUnusedReg(const TargetRegisterClass *RC) const { - for (unsigned Reg : *RC) { +Register RegScavenger::FindUnusedReg(const TargetRegisterClass *RC) const { + for (Register Reg : *RC) { if (!isRegUsed(Reg)) { LLVM_DEBUG(dbgs() << "Scavenger found unused reg: " << printReg(Reg, TRI) << "\n"); @@ -297,13 +297,13 @@ unsigned RegScavenger::FindUnusedReg(const TargetRegisterClass *RC) const { BitVector RegScavenger::getRegsAvailable(const TargetRegisterClass *RC) { BitVector Mask(TRI->getNumRegs()); - for (unsigned Reg : *RC) + for (Register Reg : *RC) if 
(!isRegUsed(Reg)) Mask.set(Reg); return Mask; } -unsigned RegScavenger::findSurvivorReg(MachineBasicBlock::iterator StartMI, +Register RegScavenger::findSurvivorReg(MachineBasicBlock::iterator StartMI, BitVector &Candidates, unsigned InstrLimit, MachineBasicBlock::iterator &UseMI) { @@ -329,7 +329,7 @@ unsigned RegScavenger::findSurvivorReg(MachineBasicBlock::iterator StartMI, Candidates.clearBitsNotInMask(MO.getRegMask()); if (!MO.isReg() || MO.isUndef() || !MO.getReg()) continue; - if (TargetRegisterInfo::isVirtualRegister(MO.getReg())) { + if (Register::isVirtualRegister(MO.getReg())) { if (MO.isDef()) isVirtDefInsn = true; else if (MO.isKill()) @@ -430,7 +430,7 @@ findSurvivorBackwards(const MachineRegisterInfo &MRI, // be usefull for this other vreg as well later. bool FoundVReg = false; for (const MachineOperand &MO : MI.operands()) { - if (MO.isReg() && TargetRegisterInfo::isVirtualRegister(MO.getReg())) { + if (MO.isReg() && Register::isVirtualRegister(MO.getReg())) { FoundVReg = true; break; } @@ -457,7 +457,7 @@ static unsigned getFrameIndexOperandNum(MachineInstr &MI) { } RegScavenger::ScavengedInfo & -RegScavenger::spill(unsigned Reg, const TargetRegisterClass &RC, int SPAdj, +RegScavenger::spill(Register Reg, const TargetRegisterClass &RC, int SPAdj, MachineBasicBlock::iterator Before, MachineBasicBlock::iterator &UseMI) { // Find an available scavenging slot with size and alignment matching @@ -531,7 +531,7 @@ RegScavenger::spill(unsigned Reg, const TargetRegisterClass &RC, int SPAdj, return Scavenged[SI]; } -unsigned RegScavenger::scavengeRegister(const TargetRegisterClass *RC, +Register RegScavenger::scavengeRegister(const TargetRegisterClass *RC, MachineBasicBlock::iterator I, int SPAdj, bool AllowSpill) { MachineInstr &MI = *I; @@ -542,7 +542,7 @@ unsigned RegScavenger::scavengeRegister(const TargetRegisterClass *RC, // Exclude all the registers being used by the instruction. for (const MachineOperand &MO : MI.operands()) { if (MO.isReg() && MO.getReg() != 0 && !(MO.isUse() && MO.isUndef()) && - !TargetRegisterInfo::isVirtualRegister(MO.getReg())) + !Register::isVirtualRegister(MO.getReg())) for (MCRegAliasIterator AI(MO.getReg(), TRI, true); AI.isValid(); ++AI) Candidates.reset(*AI); } @@ -556,7 +556,7 @@ unsigned RegScavenger::scavengeRegister(const TargetRegisterClass *RC, // Find the register whose use is furthest away. MachineBasicBlock::iterator UseMI; - unsigned SReg = findSurvivorReg(I, Candidates, 25, UseMI); + Register SReg = findSurvivorReg(I, Candidates, 25, UseMI); // If we found an unused register there is no reason to spill it. if (!isRegUsed(SReg)) { @@ -576,7 +576,7 @@ unsigned RegScavenger::scavengeRegister(const TargetRegisterClass *RC, return SReg; } -unsigned RegScavenger::scavengeRegisterBackwards(const TargetRegisterClass &RC, +Register RegScavenger::scavengeRegisterBackwards(const TargetRegisterClass &RC, MachineBasicBlock::iterator To, bool RestoreAfter, int SPAdj, bool AllowSpill) { @@ -620,8 +620,8 @@ unsigned RegScavenger::scavengeRegisterBackwards(const TargetRegisterClass &RC, /// \p ReserveAfter controls whether the scavenged register needs to be reserved /// after the current instruction, otherwise it will only be reserved before the /// current instruction. 
-static unsigned scavengeVReg(MachineRegisterInfo &MRI, RegScavenger &RS, - unsigned VReg, bool ReserveAfter) { +static Register scavengeVReg(MachineRegisterInfo &MRI, RegScavenger &RS, + Register VReg, bool ReserveAfter) { const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo(); #ifndef NDEBUG // Verify that all definitions and uses are in the same basic block. @@ -664,7 +664,7 @@ static unsigned scavengeVReg(MachineRegisterInfo &MRI, RegScavenger &RS, // spill/reload if necessary. int SPAdj = 0; const TargetRegisterClass &RC = *MRI.getRegClass(VReg); - unsigned SReg = RS.scavengeRegisterBackwards(RC, DefMI.getIterator(), + Register SReg = RS.scavengeRegisterBackwards(RC, DefMI.getIterator(), ReserveAfter, SPAdj); MRI.replaceRegWith(VReg, SReg); ++NumScavengedRegs; @@ -694,17 +694,17 @@ static bool scavengeFrameVirtualRegsInBlock(MachineRegisterInfo &MRI, for (const MachineOperand &MO : NMI.operands()) { if (!MO.isReg()) continue; - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); // We only care about virtual registers and ignore virtual registers // created by the target callbacks in the process (those will be handled // in a scavenging round). - if (!TargetRegisterInfo::isVirtualRegister(Reg) || - TargetRegisterInfo::virtReg2Index(Reg) >= InitialNumVirtRegs) + if (!Register::isVirtualRegister(Reg) || + Register::virtReg2Index(Reg) >= InitialNumVirtRegs) continue; if (!MO.readsReg()) continue; - unsigned SReg = scavengeVReg(MRI, RS, Reg, true); + Register SReg = scavengeVReg(MRI, RS, Reg, true); N->addRegisterKilled(SReg, &TRI, false); RS.setRegUsed(SReg); } @@ -716,10 +716,10 @@ static bool scavengeFrameVirtualRegsInBlock(MachineRegisterInfo &MRI, for (const MachineOperand &MO : MI.operands()) { if (!MO.isReg()) continue; - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); // Only vregs, no newly created vregs (see above). - if (!TargetRegisterInfo::isVirtualRegister(Reg) || - TargetRegisterInfo::virtReg2Index(Reg) >= InitialNumVirtRegs) + if (!Register::isVirtualRegister(Reg) || + Register::virtReg2Index(Reg) >= InitialNumVirtRegs) continue; // We have to look at all operands anyway so we can precalculate here // whether there is a reading operand. 
This allows us to skip the use @@ -730,14 +730,14 @@ static bool scavengeFrameVirtualRegsInBlock(MachineRegisterInfo &MRI, NextInstructionReadsVReg = true; } if (MO.isDef()) { - unsigned SReg = scavengeVReg(MRI, RS, Reg, false); + Register SReg = scavengeVReg(MRI, RS, Reg, false); I->addRegisterDead(SReg, &TRI, false); } } } #ifndef NDEBUG for (const MachineOperand &MO : MBB.front().operands()) { - if (!MO.isReg() || !TargetRegisterInfo::isVirtualRegister(MO.getReg())) + if (!MO.isReg() || !Register::isVirtualRegister(MO.getReg())) continue; assert(!MO.isInternalRead() && "Cannot assign inside bundles"); assert((!MO.isUndef() || MO.isDef()) && "Cannot handle undef uses"); diff --git a/lib/CodeGen/RenameIndependentSubregs.cpp b/lib/CodeGen/RenameIndependentSubregs.cpp index 22cff48c3051..e3f5abb6301f 100644 --- a/lib/CodeGen/RenameIndependentSubregs.cpp +++ b/lib/CodeGen/RenameIndependentSubregs.cpp @@ -138,7 +138,7 @@ bool RenameIndependentSubregs::renameComponents(LiveInterval &LI) const { LLVM_DEBUG(dbgs() << printReg(Reg) << ": Splitting into newly created:"); for (unsigned I = 1, NumClasses = Classes.getNumClasses(); I < NumClasses; ++I) { - unsigned NewVReg = MRI->createVirtualRegister(RegClass); + Register NewVReg = MRI->createVirtualRegister(RegClass); LiveInterval &NewLI = LIS->createEmptyInterval(NewVReg); Intervals.push_back(&NewLI); LLVM_DEBUG(dbgs() << ' ' << printReg(NewVReg)); @@ -390,7 +390,7 @@ bool RenameIndependentSubregs::runOnMachineFunction(MachineFunction &MF) { // there can't be any further splitting. bool Changed = false; for (size_t I = 0, E = MRI->getNumVirtRegs(); I < E; ++I) { - unsigned Reg = TargetRegisterInfo::index2VirtReg(I); + unsigned Reg = Register::index2VirtReg(I); if (!LIS->hasInterval(Reg)) continue; LiveInterval &LI = LIS->getInterval(Reg); diff --git a/lib/CodeGen/SafeStack.cpp b/lib/CodeGen/SafeStack.cpp index a6bc7330e2cc..ddbbd0f8d6e9 100644 --- a/lib/CodeGen/SafeStack.cpp +++ b/lib/CodeGen/SafeStack.cpp @@ -871,7 +871,7 @@ public: report_fatal_error("TargetLowering instance is required"); auto *DL = &F.getParent()->getDataLayout(); - auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); + auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F); auto &ACT = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); // Compute DT and LI only for functions that have the attribute. diff --git a/lib/CodeGen/ScalarizeMaskedMemIntrin.cpp b/lib/CodeGen/ScalarizeMaskedMemIntrin.cpp index 7776dffb4e9c..b4037499d7d1 100644 --- a/lib/CodeGen/ScalarizeMaskedMemIntrin.cpp +++ b/lib/CodeGen/ScalarizeMaskedMemIntrin.cpp @@ -173,15 +173,30 @@ static void scalarizeMaskedLoad(CallInst *CI, bool &ModifiedDT) { return; } + // If the mask is not v1i1, use scalar bit test operations. This generates + // better results on X86 at least. 
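+ // For example, an illustrative sketch of the loop below: with a <8 x i1>
+ // mask, the mask is first bitcast to an i8 %scalar_mask, and lane Idx is
+ // then tested with
+ //   %bit  = and i8 %scalar_mask, (1 << Idx)
+ //   %cond = icmp ne i8 %bit, 0
+ // instead of extracting each i1 lane with extractelement.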
+ Value *SclrMask; + if (VectorWidth != 1) { + Type *SclrMaskTy = Builder.getIntNTy(VectorWidth); + SclrMask = Builder.CreateBitCast(Mask, SclrMaskTy, "scalar_mask"); + } + for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) { // Fill the "else" block, created in the previous iteration // // %res.phi.else3 = phi <16 x i32> [ %11, %cond.load1 ], [ %res.phi.else, %else ] - // %mask_1 = extractelement <16 x i1> %mask, i32 Idx + // %mask_1 = and i16 %scalar_mask, i32 1 << Idx + // %cond = icmp ne i16 %mask_1, 0 // br i1 %mask_1, label %cond.load, label %else // - - Value *Predicate = Builder.CreateExtractElement(Mask, Idx); + Value *Predicate; + if (VectorWidth != 1) { + Value *Mask = Builder.getInt(APInt::getOneBitSet(VectorWidth, Idx)); + Predicate = Builder.CreateICmpNE(Builder.CreateAnd(SclrMask, Mask), + Builder.getIntN(VectorWidth, 0)); + } else { + Predicate = Builder.CreateExtractElement(Mask, Idx); + } // Create "cond" block // @@ -290,13 +305,29 @@ static void scalarizeMaskedStore(CallInst *CI, bool &ModifiedDT) { return; } + // If the mask is not v1i1, use scalar bit test operations. This generates + // better results on X86 at least. + Value *SclrMask; + if (VectorWidth != 1) { + Type *SclrMaskTy = Builder.getIntNTy(VectorWidth); + SclrMask = Builder.CreateBitCast(Mask, SclrMaskTy, "scalar_mask"); + } + for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) { // Fill the "else" block, created in the previous iteration // - // %mask_1 = extractelement <16 x i1> %mask, i32 Idx + // %mask_1 = and i16 %scalar_mask, i32 1 << Idx + // %cond = icmp ne i16 %mask_1, 0 // br i1 %mask_1, label %cond.store, label %else // - Value *Predicate = Builder.CreateExtractElement(Mask, Idx); + Value *Predicate; + if (VectorWidth != 1) { + Value *Mask = Builder.getInt(APInt::getOneBitSet(VectorWidth, Idx)); + Predicate = Builder.CreateICmpNE(Builder.CreateAnd(SclrMask, Mask), + Builder.getIntN(VectorWidth, 0)); + } else { + Predicate = Builder.CreateExtractElement(Mask, Idx); + } // Create "cond" block // @@ -392,15 +423,30 @@ static void scalarizeMaskedGather(CallInst *CI, bool &ModifiedDT) { return; } + // If the mask is not v1i1, use scalar bit test operations. This generates + // better results on X86 at least. + Value *SclrMask; + if (VectorWidth != 1) { + Type *SclrMaskTy = Builder.getIntNTy(VectorWidth); + SclrMask = Builder.CreateBitCast(Mask, SclrMaskTy, "scalar_mask"); + } + for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) { // Fill the "else" block, created in the previous iteration // - // %Mask1 = extractelement <16 x i1> %Mask, i32 1 + // %Mask1 = and i16 %scalar_mask, i32 1 << Idx + // %cond = icmp ne i16 %mask_1, 0 // br i1 %Mask1, label %cond.load, label %else // - Value *Predicate = - Builder.CreateExtractElement(Mask, Idx, "Mask" + Twine(Idx)); + Value *Predicate; + if (VectorWidth != 1) { + Value *Mask = Builder.getInt(APInt::getOneBitSet(VectorWidth, Idx)); + Predicate = Builder.CreateICmpNE(Builder.CreateAnd(SclrMask, Mask), + Builder.getIntN(VectorWidth, 0)); + } else { + Predicate = Builder.CreateExtractElement(Mask, Idx, "Mask" + Twine(Idx)); + } // Create "cond" block // @@ -499,14 +545,29 @@ static void scalarizeMaskedScatter(CallInst *CI, bool &ModifiedDT) { return; } + // If the mask is not v1i1, use scalar bit test operations. This generates + // better results on X86 at least. 
+ Value *SclrMask; + if (VectorWidth != 1) { + Type *SclrMaskTy = Builder.getIntNTy(VectorWidth); + SclrMask = Builder.CreateBitCast(Mask, SclrMaskTy, "scalar_mask"); + } + for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) { // Fill the "else" block, created in the previous iteration // - // %Mask1 = extractelement <16 x i1> %Mask, i32 Idx + // %Mask1 = and i16 %scalar_mask, i32 1 << Idx + // %cond = icmp ne i16 %mask_1, 0 // br i1 %Mask1, label %cond.store, label %else // - Value *Predicate = - Builder.CreateExtractElement(Mask, Idx, "Mask" + Twine(Idx)); + Value *Predicate; + if (VectorWidth != 1) { + Value *Mask = Builder.getInt(APInt::getOneBitSet(VectorWidth, Idx)); + Predicate = Builder.CreateICmpNE(Builder.CreateAnd(SclrMask, Mask), + Builder.getIntN(VectorWidth, 0)); + } else { + Predicate = Builder.CreateExtractElement(Mask, Idx, "Mask" + Twine(Idx)); + } // Create "cond" block // @@ -555,6 +616,32 @@ static void scalarizeMaskedExpandLoad(CallInst *CI, bool &ModifiedDT) { // The result vector Value *VResult = PassThru; + // Shorten the way if the mask is a vector of constants. + if (isConstantIntVector(Mask)) { + unsigned MemIndex = 0; + for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) { + if (cast(Mask)->getAggregateElement(Idx)->isNullValue()) + continue; + Value *NewPtr = Builder.CreateConstInBoundsGEP1_32(EltTy, Ptr, MemIndex); + LoadInst *Load = + Builder.CreateAlignedLoad(EltTy, NewPtr, 1, "Load" + Twine(Idx)); + VResult = + Builder.CreateInsertElement(VResult, Load, Idx, "Res" + Twine(Idx)); + ++MemIndex; + } + CI->replaceAllUsesWith(VResult); + CI->eraseFromParent(); + return; + } + + // If the mask is not v1i1, use scalar bit test operations. This generates + // better results on X86 at least. + Value *SclrMask; + if (VectorWidth != 1) { + Type *SclrMaskTy = Builder.getIntNTy(VectorWidth); + SclrMask = Builder.CreateBitCast(Mask, SclrMaskTy, "scalar_mask"); + } + for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) { // Fill the "else" block, created in the previous iteration // @@ -563,8 +650,14 @@ static void scalarizeMaskedExpandLoad(CallInst *CI, bool &ModifiedDT) { // br i1 %mask_1, label %cond.load, label %else // - Value *Predicate = - Builder.CreateExtractElement(Mask, Idx); + Value *Predicate; + if (VectorWidth != 1) { + Value *Mask = Builder.getInt(APInt::getOneBitSet(VectorWidth, Idx)); + Predicate = Builder.CreateICmpNE(Builder.CreateAnd(SclrMask, Mask), + Builder.getIntN(VectorWidth, 0)); + } else { + Predicate = Builder.CreateExtractElement(Mask, Idx, "Mask" + Twine(Idx)); + } // Create "cond" block // @@ -633,13 +726,44 @@ static void scalarizeMaskedCompressStore(CallInst *CI, bool &ModifiedDT) { unsigned VectorWidth = VecType->getNumElements(); + // Shorten the way if the mask is a vector of constants. + if (isConstantIntVector(Mask)) { + unsigned MemIndex = 0; + for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) { + if (cast(Mask)->getAggregateElement(Idx)->isNullValue()) + continue; + Value *OneElt = + Builder.CreateExtractElement(Src, Idx, "Elt" + Twine(Idx)); + Value *NewPtr = Builder.CreateConstInBoundsGEP1_32(EltTy, Ptr, MemIndex); + Builder.CreateAlignedStore(OneElt, NewPtr, 1); + ++MemIndex; + } + CI->eraseFromParent(); + return; + } + + // If the mask is not v1i1, use scalar bit test operations. This generates + // better results on X86 at least. 
+ Value *SclrMask; + if (VectorWidth != 1) { + Type *SclrMaskTy = Builder.getIntNTy(VectorWidth); + SclrMask = Builder.CreateBitCast(Mask, SclrMaskTy, "scalar_mask"); + } + for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) { // Fill the "else" block, created in the previous iteration // // %mask_1 = extractelement <16 x i1> %mask, i32 Idx // br i1 %mask_1, label %cond.store, label %else // - Value *Predicate = Builder.CreateExtractElement(Mask, Idx); + Value *Predicate; + if (VectorWidth != 1) { + Value *Mask = Builder.getInt(APInt::getOneBitSet(VectorWidth, Idx)); + Predicate = Builder.CreateICmpNE(Builder.CreateAnd(SclrMask, Mask), + Builder.getIntN(VectorWidth, 0)); + } else { + Predicate = Builder.CreateExtractElement(Mask, Idx, "Mask" + Twine(Idx)); + } // Create "cond" block // @@ -727,17 +851,24 @@ bool ScalarizeMaskedMemIntrin::optimizeCallInst(CallInst *CI, switch (II->getIntrinsicID()) { default: break; - case Intrinsic::masked_load: + case Intrinsic::masked_load: { // Scalarize unsupported vector masked load - if (TTI->isLegalMaskedLoad(CI->getType())) + unsigned Alignment = + cast(CI->getArgOperand(1))->getZExtValue(); + if (TTI->isLegalMaskedLoad(CI->getType(), MaybeAlign(Alignment))) return false; scalarizeMaskedLoad(CI, ModifiedDT); return true; - case Intrinsic::masked_store: - if (TTI->isLegalMaskedStore(CI->getArgOperand(0)->getType())) + } + case Intrinsic::masked_store: { + unsigned Alignment = + cast(CI->getArgOperand(2))->getZExtValue(); + if (TTI->isLegalMaskedStore(CI->getArgOperand(0)->getType(), + MaybeAlign(Alignment))) return false; scalarizeMaskedStore(CI, ModifiedDT); return true; + } case Intrinsic::masked_gather: if (TTI->isLegalMaskedGather(CI->getType())) return false; diff --git a/lib/CodeGen/ScheduleDAGInstrs.cpp b/lib/CodeGen/ScheduleDAGInstrs.cpp index d5ad7e92299d..96a1f86c3e04 100644 --- a/lib/CodeGen/ScheduleDAGInstrs.cpp +++ b/lib/CodeGen/ScheduleDAGInstrs.cpp @@ -18,7 +18,6 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/SparseSet.h" #include "llvm/ADT/iterator_range.h" -#include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/CodeGen/LiveIntervals.h" #include "llvm/CodeGen/LivePhysRegs.h" @@ -205,10 +204,10 @@ void ScheduleDAGInstrs::addSchedBarrierDeps() { if (ExitMI) { for (const MachineOperand &MO : ExitMI->operands()) { if (!MO.isReg() || MO.isDef()) continue; - unsigned Reg = MO.getReg(); - if (TargetRegisterInfo::isPhysicalRegister(Reg)) { + Register Reg = MO.getReg(); + if (Register::isPhysicalRegister(Reg)) { Uses.insert(PhysRegSUOper(&ExitSU, -1, Reg)); - } else if (TargetRegisterInfo::isVirtualRegister(Reg) && MO.readsReg()) { + } else if (Register::isVirtualRegister(Reg) && MO.readsReg()) { addVRegUseDeps(&ExitSU, ExitMI->getOperandNo(&MO)); } } @@ -285,7 +284,7 @@ void ScheduleDAGInstrs::addPhysRegDataDeps(SUnit *SU, unsigned OperIdx) { void ScheduleDAGInstrs::addPhysRegDeps(SUnit *SU, unsigned OperIdx) { MachineInstr *MI = SU->getInstr(); MachineOperand &MO = MI->getOperand(OperIdx); - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); // We do not need to track any dependencies for constant registers. if (MRI.isConstantPhysReg(Reg)) return; @@ -361,7 +360,7 @@ void ScheduleDAGInstrs::addPhysRegDeps(SUnit *SU, unsigned OperIdx) { LaneBitmask ScheduleDAGInstrs::getLaneMaskForMO(const MachineOperand &MO) const { - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); // No point in tracking lanemasks if we don't have interesting subregisters. 
const TargetRegisterClass &RC = *MRI.getRegClass(Reg); if (!RC.HasDisjunctSubRegs) @@ -373,6 +372,13 @@ LaneBitmask ScheduleDAGInstrs::getLaneMaskForMO(const MachineOperand &MO) const return TRI->getSubRegIndexLaneMask(SubReg); } +bool ScheduleDAGInstrs::deadDefHasNoUse(const MachineOperand &MO) { + auto RegUse = CurrentVRegUses.find(MO.getReg()); + if (RegUse == CurrentVRegUses.end()) + return true; + return (RegUse->LaneMask & getLaneMaskForMO(MO)).none(); +} + /// Adds register output and data dependencies from this SUnit to instructions /// that occur later in the same scheduling region if they read from or write to /// the virtual register defined at OperIdx. @@ -382,7 +388,7 @@ LaneBitmask ScheduleDAGInstrs::getLaneMaskForMO(const MachineOperand &MO) const void ScheduleDAGInstrs::addVRegDefDeps(SUnit *SU, unsigned OperIdx) { MachineInstr *MI = SU->getInstr(); MachineOperand &MO = MI->getOperand(OperIdx); - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); LaneBitmask DefLaneMask; LaneBitmask KillLaneMask; @@ -393,6 +399,18 @@ void ScheduleDAGInstrs::addVRegDefDeps(SUnit *SU, unsigned OperIdx) { // earlier instruction. KillLaneMask = IsKill ? LaneBitmask::getAll() : DefLaneMask; + if (MO.getSubReg() != 0 && MO.isUndef()) { + // There may be other subregister defs on the same instruction of the same + // register in later operands. The lanes of other defs will now be live + // after this instruction, so these should not be treated as killed by the + // instruction even though they appear to be killed in this one operand. + for (int I = OperIdx + 1, E = MI->getNumOperands(); I != E; ++I) { + const MachineOperand &OtherMO = MI->getOperand(I); + if (OtherMO.isReg() && OtherMO.isDef() && OtherMO.getReg() == Reg) + KillLaneMask &= ~getLaneMaskForMO(OtherMO); + } + } + // Clear undef flag, we'll re-add it later once we know which subregister // Def is first. MO.setIsUndef(false); @@ -402,8 +420,7 @@ void ScheduleDAGInstrs::addVRegDefDeps(SUnit *SU, unsigned OperIdx) { } if (MO.isDead()) { - assert(CurrentVRegUses.find(Reg) == CurrentVRegUses.end() && - "Dead defs should have no uses"); + assert(deadDefHasNoUse(MO) && "Dead defs should have no uses"); } else { // Add data dependence to all uses we found so far. const TargetSubtargetInfo &ST = MF.getSubtarget(); @@ -491,7 +508,7 @@ void ScheduleDAGInstrs::addVRegDefDeps(SUnit *SU, unsigned OperIdx) { void ScheduleDAGInstrs::addVRegUseDeps(SUnit *SU, unsigned OperIdx) { const MachineInstr *MI = SU->getInstr(); const MachineOperand &MO = MI->getOperand(OperIdx); - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); // Remember the use. Data dependencies will be added when we find the def. LaneBitmask LaneMask = TrackLaneMasks ? getLaneMaskForMO(MO) @@ -514,7 +531,7 @@ void ScheduleDAGInstrs::addVRegUseDeps(SUnit *SU, unsigned OperIdx) { /// Returns true if MI is an instruction we are unable to reason about /// (like a call or something with unmodeled side effects). 
-static inline bool isGlobalMemoryObject(AliasAnalysis *AA, MachineInstr *MI) { +static inline bool isGlobalMemoryObject(AAResults *AA, MachineInstr *MI) { return MI->isCall() || MI->hasUnmodeledSideEffects() || (MI->hasOrderedMemoryRef() && !MI->isDereferenceableInvariantLoad(AA)); } @@ -701,7 +718,7 @@ void ScheduleDAGInstrs::insertBarrierChain(Value2SUsMap &map) { map.reComputeSize(); } -void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA, +void ScheduleDAGInstrs::buildSchedGraph(AAResults *AA, RegPressureTracker *RPTracker, PressureDiffs *PDiffs, LiveIntervals *LIS, @@ -821,10 +838,10 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA, const MachineOperand &MO = MI.getOperand(j); if (!MO.isReg() || !MO.isDef()) continue; - unsigned Reg = MO.getReg(); - if (TargetRegisterInfo::isPhysicalRegister(Reg)) { + Register Reg = MO.getReg(); + if (Register::isPhysicalRegister(Reg)) { addPhysRegDeps(SU, j); - } else if (TargetRegisterInfo::isVirtualRegister(Reg)) { + } else if (Register::isVirtualRegister(Reg)) { HasVRegDef = true; addVRegDefDeps(SU, j); } @@ -838,10 +855,10 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA, // additional use dependencies. if (!MO.isReg() || !MO.isUse()) continue; - unsigned Reg = MO.getReg(); - if (TargetRegisterInfo::isPhysicalRegister(Reg)) { + Register Reg = MO.getReg(); + if (Register::isPhysicalRegister(Reg)) { addPhysRegDeps(SU, j); - } else if (TargetRegisterInfo::isVirtualRegister(Reg) && MO.readsReg()) { + } else if (Register::isVirtualRegister(Reg) && MO.readsReg()) { addVRegUseDeps(SU, j); } } @@ -1071,7 +1088,7 @@ static void toggleKills(const MachineRegisterInfo &MRI, LivePhysRegs &LiveRegs, for (MachineOperand &MO : MI.operands()) { if (!MO.isReg() || !MO.readsReg()) continue; - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); if (!Reg) continue; @@ -1102,7 +1119,7 @@ void ScheduleDAGInstrs::fixupKills(MachineBasicBlock &MBB) { if (MO.isReg()) { if (!MO.isDef()) continue; - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); if (!Reg) continue; LiveRegs.removeReg(Reg); diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 49c922f560fa..e8950b58d42d 100644 --- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -24,7 +24,6 @@ #include "llvm/ADT/Optional.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SetVector.h" -#include "llvm/ADT/SmallBitVector.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" @@ -111,10 +110,20 @@ static cl::opt MaySplitLoadIndex("combiner-split-load-index", cl::Hidden, cl::init(true), cl::desc("DAG combiner may split indexing from loads")); +static cl::opt + EnableStoreMerging("combiner-store-merging", cl::Hidden, cl::init(true), + cl::desc("DAG combiner enable merging multiple stores " + "into a wider store")); + static cl::opt TokenFactorInlineLimit( "combiner-tokenfactor-inline-limit", cl::Hidden, cl::init(2048), cl::desc("Limit the number of operands to inline for Token Factors")); +static cl::opt StoreMergeDependenceLimit( + "combiner-store-merge-dependence-limit", cl::Hidden, cl::init(10), + cl::desc("Limit the number of times for the same StoreNode and RootNode " + "to bail out in store merging dependence check")); + namespace { class DAGCombiner { @@ -152,6 +161,14 @@ namespace { /// which have not yet been combined to the worklist. SmallPtrSet CombinedNodes; + /// Map from candidate StoreNode to the pair of RootNode and count. 
+ /// The count is used to track how many times we have seen the StoreNode + /// with the same RootNode bail out in dependence check. If we have seen + /// the bail out for the same pair many times over a limit, we won't + /// consider the StoreNode with the same RootNode as store merging + /// candidate again. + DenseMap> StoreRootCountMap; + // AA - Used for DAG load/store alias analysis. AliasAnalysis *AA; @@ -236,6 +253,7 @@ namespace { void removeFromWorklist(SDNode *N) { CombinedNodes.erase(N); PruningList.remove(N); + StoreRootCountMap.erase(N); auto It = WorklistMap.find(N); if (It == WorklistMap.end()) @@ -361,6 +379,7 @@ namespace { SDValue visitSUBE(SDNode *N); SDValue visitSUBCARRY(SDNode *N); SDValue visitMUL(SDNode *N); + SDValue visitMULFIX(SDNode *N); SDValue useDivRem(SDNode *N); SDValue visitSDIV(SDNode *N); SDValue visitSDIVLike(SDValue N0, SDValue N1, SDNode *N); @@ -421,7 +440,6 @@ namespace { SDValue visitFP_TO_SINT(SDNode *N); SDValue visitFP_TO_UINT(SDNode *N); SDValue visitFP_ROUND(SDNode *N); - SDValue visitFP_ROUND_INREG(SDNode *N); SDValue visitFP_EXTEND(SDNode *N); SDValue visitFNEG(SDNode *N); SDValue visitFABS(SDNode *N); @@ -470,7 +488,7 @@ namespace { SDValue reassociateOps(unsigned Opc, const SDLoc &DL, SDValue N0, SDValue N1, SDNodeFlags Flags); - SDValue visitShiftByConstant(SDNode *N, ConstantSDNode *Amt); + SDValue visitShiftByConstant(SDNode *N); SDValue foldSelectOfConstants(SDNode *N); SDValue foldVSelectOfConstants(SDNode *N); @@ -497,6 +515,7 @@ namespace { bool isSetCCEquivalent(SDValue N, SDValue &LHS, SDValue &RHS, SDValue &CC) const; bool isOneUseSetCC(SDValue N) const; + bool isCheaperToUseNegatedFPOps(SDValue X, SDValue Y); SDValue SimplifyNodeWithTwoResults(SDNode *N, unsigned LoOp, unsigned HiOp); @@ -510,7 +529,7 @@ namespace { SDValue BuildSDIVPow2(SDNode *N); SDValue BuildUDIV(SDNode *N); SDValue BuildLogBase2(SDValue V, const SDLoc &DL); - SDValue BuildReciprocalEstimate(SDValue Op, SDNodeFlags Flags); + SDValue BuildDivEstimate(SDValue N, SDValue Op, SDNodeFlags Flags); SDValue buildRsqrtEstimate(SDValue Op, SDNodeFlags Flags); SDValue buildSqrtEstimate(SDValue Op, SDNodeFlags Flags); SDValue buildSqrtEstimateImpl(SDValue Op, SDNodeFlags Flags, bool Recip); @@ -521,11 +540,11 @@ namespace { SDValue MatchBSwapHWordLow(SDNode *N, SDValue N0, SDValue N1, bool DemandHighBits = true); SDValue MatchBSwapHWord(SDNode *N, SDValue N0, SDValue N1); - SDNode *MatchRotatePosNeg(SDValue Shifted, SDValue Pos, SDValue Neg, + SDValue MatchRotatePosNeg(SDValue Shifted, SDValue Pos, SDValue Neg, SDValue InnerPos, SDValue InnerNeg, unsigned PosOpcode, unsigned NegOpcode, const SDLoc &DL); - SDNode *MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL); + SDValue MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL); SDValue MatchLoadCombine(SDNode *N); SDValue MatchStoreCombine(StoreSDNode *N); SDValue ReduceLoadWidth(SDNode *N); @@ -742,6 +761,11 @@ CombineTo(SDNode *N, SDValue Res0, SDValue Res1, bool AddTo) { return ((DAGCombiner*)DC)->CombineTo(N, Res0, Res1, AddTo); } +bool TargetLowering::DAGCombinerInfo:: +recursivelyDeleteUnusedNodes(SDNode *N) { + return ((DAGCombiner*)DC)->recursivelyDeleteUnusedNodes(N); +} + void TargetLowering::DAGCombinerInfo:: CommitTargetLoweringOpt(const TargetLowering::TargetLoweringOpt &TLO) { return ((DAGCombiner*)DC)->CommitTargetLoweringOpt(TLO); @@ -766,195 +790,6 @@ void DAGCombiner::deleteAndRecombine(SDNode *N) { DAG.DeleteNode(N); } -/// Return 1 if we can compute the negated form of the specified expression 
for -/// the same cost as the expression itself, or 2 if we can compute the negated -/// form more cheaply than the expression itself. -static char isNegatibleForFree(SDValue Op, bool LegalOperations, - const TargetLowering &TLI, - const TargetOptions *Options, - bool ForCodeSize, - unsigned Depth = 0) { - // fneg is removable even if it has multiple uses. - if (Op.getOpcode() == ISD::FNEG) - return 2; - - // Don't allow anything with multiple uses unless we know it is free. - EVT VT = Op.getValueType(); - const SDNodeFlags Flags = Op->getFlags(); - if (!Op.hasOneUse() && - !(Op.getOpcode() == ISD::FP_EXTEND && - TLI.isFPExtFree(VT, Op.getOperand(0).getValueType()))) - return 0; - - // Don't recurse exponentially. - if (Depth > 6) - return 0; - - switch (Op.getOpcode()) { - default: return false; - case ISD::ConstantFP: { - if (!LegalOperations) - return 1; - - // Don't invert constant FP values after legalization unless the target says - // the negated constant is legal. - return TLI.isOperationLegal(ISD::ConstantFP, VT) || - TLI.isFPImmLegal(neg(cast(Op)->getValueAPF()), VT, - ForCodeSize); - } - case ISD::BUILD_VECTOR: { - // Only permit BUILD_VECTOR of constants. - if (llvm::any_of(Op->op_values(), [&](SDValue N) { - return !N.isUndef() && !isa(N); - })) - return 0; - if (!LegalOperations) - return 1; - if (TLI.isOperationLegal(ISD::ConstantFP, VT) && - TLI.isOperationLegal(ISD::BUILD_VECTOR, VT)) - return 1; - return llvm::all_of(Op->op_values(), [&](SDValue N) { - return N.isUndef() || - TLI.isFPImmLegal(neg(cast(N)->getValueAPF()), VT, - ForCodeSize); - }); - } - case ISD::FADD: - if (!Options->UnsafeFPMath && !Flags.hasNoSignedZeros()) - return 0; - - // After operation legalization, it might not be legal to create new FSUBs. - if (LegalOperations && !TLI.isOperationLegalOrCustom(ISD::FSUB, VT)) - return 0; - - // fold (fneg (fadd A, B)) -> (fsub (fneg A), B) - if (char V = isNegatibleForFree(Op.getOperand(0), LegalOperations, TLI, - Options, ForCodeSize, Depth + 1)) - return V; - // fold (fneg (fadd A, B)) -> (fsub (fneg B), A) - return isNegatibleForFree(Op.getOperand(1), LegalOperations, TLI, Options, - ForCodeSize, Depth + 1); - case ISD::FSUB: - // We can't turn -(A-B) into B-A when we honor signed zeros. - if (!Options->NoSignedZerosFPMath && !Flags.hasNoSignedZeros()) - return 0; - - // fold (fneg (fsub A, B)) -> (fsub B, A) - return 1; - - case ISD::FMUL: - case ISD::FDIV: - // fold (fneg (fmul X, Y)) -> (fmul (fneg X), Y) or (fmul X, (fneg Y)) - if (char V = isNegatibleForFree(Op.getOperand(0), LegalOperations, TLI, - Options, ForCodeSize, Depth + 1)) - return V; - - return isNegatibleForFree(Op.getOperand(1), LegalOperations, TLI, Options, - ForCodeSize, Depth + 1); - - case ISD::FP_EXTEND: - case ISD::FP_ROUND: - case ISD::FSIN: - return isNegatibleForFree(Op.getOperand(0), LegalOperations, TLI, Options, - ForCodeSize, Depth + 1); - } -} - -/// If isNegatibleForFree returns true, return the newly negated expression. -static SDValue GetNegatedExpression(SDValue Op, SelectionDAG &DAG, - bool LegalOperations, bool ForCodeSize, - unsigned Depth = 0) { - // fneg is removable even if it has multiple uses. 
- if (Op.getOpcode() == ISD::FNEG) - return Op.getOperand(0); - - assert(Depth <= 6 && "GetNegatedExpression doesn't match isNegatibleForFree"); - const TargetOptions &Options = DAG.getTarget().Options; - const SDNodeFlags Flags = Op->getFlags(); - - switch (Op.getOpcode()) { - default: llvm_unreachable("Unknown code"); - case ISD::ConstantFP: { - APFloat V = cast(Op)->getValueAPF(); - V.changeSign(); - return DAG.getConstantFP(V, SDLoc(Op), Op.getValueType()); - } - case ISD::BUILD_VECTOR: { - SmallVector Ops; - for (SDValue C : Op->op_values()) { - if (C.isUndef()) { - Ops.push_back(C); - continue; - } - APFloat V = cast(C)->getValueAPF(); - V.changeSign(); - Ops.push_back(DAG.getConstantFP(V, SDLoc(Op), C.getValueType())); - } - return DAG.getBuildVector(Op.getValueType(), SDLoc(Op), Ops); - } - case ISD::FADD: - assert(Options.UnsafeFPMath || Flags.hasNoSignedZeros()); - - // fold (fneg (fadd A, B)) -> (fsub (fneg A), B) - if (isNegatibleForFree(Op.getOperand(0), LegalOperations, - DAG.getTargetLoweringInfo(), &Options, ForCodeSize, - Depth + 1)) - return DAG.getNode(ISD::FSUB, SDLoc(Op), Op.getValueType(), - GetNegatedExpression(Op.getOperand(0), DAG, - LegalOperations, ForCodeSize, - Depth + 1), - Op.getOperand(1), Flags); - // fold (fneg (fadd A, B)) -> (fsub (fneg B), A) - return DAG.getNode(ISD::FSUB, SDLoc(Op), Op.getValueType(), - GetNegatedExpression(Op.getOperand(1), DAG, - LegalOperations, ForCodeSize, - Depth + 1), - Op.getOperand(0), Flags); - case ISD::FSUB: - // fold (fneg (fsub 0, B)) -> B - if (ConstantFPSDNode *N0CFP = - isConstOrConstSplatFP(Op.getOperand(0), /*AllowUndefs*/ true)) - if (N0CFP->isZero()) - return Op.getOperand(1); - - // fold (fneg (fsub A, B)) -> (fsub B, A) - return DAG.getNode(ISD::FSUB, SDLoc(Op), Op.getValueType(), - Op.getOperand(1), Op.getOperand(0), Flags); - - case ISD::FMUL: - case ISD::FDIV: - // fold (fneg (fmul X, Y)) -> (fmul (fneg X), Y) - if (isNegatibleForFree(Op.getOperand(0), LegalOperations, - DAG.getTargetLoweringInfo(), &Options, ForCodeSize, - Depth + 1)) - return DAG.getNode(Op.getOpcode(), SDLoc(Op), Op.getValueType(), - GetNegatedExpression(Op.getOperand(0), DAG, - LegalOperations, ForCodeSize, - Depth + 1), - Op.getOperand(1), Flags); - - // fold (fneg (fmul X, Y)) -> (fmul X, (fneg Y)) - return DAG.getNode(Op.getOpcode(), SDLoc(Op), Op.getValueType(), - Op.getOperand(0), - GetNegatedExpression(Op.getOperand(1), DAG, - LegalOperations, ForCodeSize, - Depth + 1), Flags); - - case ISD::FP_EXTEND: - case ISD::FSIN: - return DAG.getNode(Op.getOpcode(), SDLoc(Op), Op.getValueType(), - GetNegatedExpression(Op.getOperand(0), DAG, - LegalOperations, ForCodeSize, - Depth + 1)); - case ISD::FP_ROUND: - return DAG.getNode(ISD::FP_ROUND, SDLoc(Op), Op.getValueType(), - GetNegatedExpression(Op.getOperand(0), DAG, - LegalOperations, ForCodeSize, - Depth + 1), - Op.getOperand(1)); - } -} - // APInts must be the same size for most operations, this helper // function zero extends the shorter of the pair so that they match. // We provide an Offset so that we can create bitwidths that won't overflow. 
@@ -1124,7 +959,6 @@ SDValue DAGCombiner::reassociateOpsCommutative(unsigned Opc, const SDLoc &DL, SDValue OpNode = DAG.getNode(Opc, SDLoc(N0), VT, N0.getOperand(0), N1); if (!OpNode.getNode()) return SDValue(); - AddToWorklist(OpNode.getNode()); return DAG.getNode(Opc, DL, VT, OpNode, N0.getOperand(1)); } } @@ -1438,7 +1272,6 @@ SDValue DAGCombiner::PromoteIntShiftOp(SDValue Op) { SDValue RV = DAG.getNode(ISD::TRUNCATE, DL, VT, DAG.getNode(Opc, DL, PVT, N0, N1)); - AddToWorklist(N0.getNode()); if (Replace) ReplaceLoadWithPromotedLoad(Op.getOperand(0).getNode(), N0.getNode()); @@ -1591,8 +1424,8 @@ void DAGCombiner::Run(CombineLevel AtLevel) { bool NIsValid = DAG.LegalizeOp(N, UpdatedNodes); for (SDNode *LN : UpdatedNodes) { - AddToWorklist(LN); AddUsersToWorklist(LN); + AddToWorklist(LN); } if (!NIsValid) continue; @@ -1673,6 +1506,10 @@ SDValue DAGCombiner::visit(SDNode *N) { case ISD::ADDCARRY: return visitADDCARRY(N); case ISD::SUBE: return visitSUBE(N); case ISD::SUBCARRY: return visitSUBCARRY(N); + case ISD::SMULFIX: + case ISD::SMULFIXSAT: + case ISD::UMULFIX: + case ISD::UMULFIXSAT: return visitMULFIX(N); case ISD::MUL: return visitMUL(N); case ISD::SDIV: return visitSDIV(N); case ISD::UDIV: return visitUDIV(N); @@ -1736,7 +1573,6 @@ SDValue DAGCombiner::visit(SDNode *N) { case ISD::FP_TO_SINT: return visitFP_TO_SINT(N); case ISD::FP_TO_UINT: return visitFP_TO_UINT(N); case ISD::FP_ROUND: return visitFP_ROUND(N); - case ISD::FP_ROUND_INREG: return visitFP_ROUND_INREG(N); case ISD::FP_EXTEND: return visitFP_EXTEND(N); case ISD::FNEG: return visitFNEG(N); case ISD::FABS: return visitFABS(N); @@ -3308,6 +3144,18 @@ SDValue DAGCombiner::visitSUB(SDNode *N) { } } + if (TLI.isOperationLegalOrCustom(ISD::ADDCARRY, VT)) { + // (sub Carry, X) -> (addcarry (sub 0, X), 0, Carry) + if (SDValue Carry = getAsCarry(TLI, N0)) { + SDValue X = N1; + SDValue Zero = DAG.getConstant(0, DL, VT); + SDValue NegX = DAG.getNode(ISD::SUB, DL, VT, Zero, X); + return DAG.getNode(ISD::ADDCARRY, DL, + DAG.getVTList(VT, Carry.getValueType()), NegX, Zero, + Carry); + } + } + return SDValue(); } @@ -3442,6 +3290,30 @@ SDValue DAGCombiner::visitSUBCARRY(SDNode *N) { return SDValue(); } +// Notice that "mulfix" can be any of SMULFIX, SMULFIXSAT, UMULFIX and +// UMULFIXSAT here. 
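+// Roughly speaking, these nodes compute (x * y) >> Scale on fixed-point
+// operands (saturating for the *SAT variants), so the simple constant folds
+// below mirror the ordinary ISD::MUL case.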
+SDValue DAGCombiner::visitMULFIX(SDNode *N) { + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + SDValue Scale = N->getOperand(2); + EVT VT = N0.getValueType(); + + // fold (mulfix x, undef, scale) -> 0 + if (N0.isUndef() || N1.isUndef()) + return DAG.getConstant(0, SDLoc(N), VT); + + // Canonicalize constant to RHS (vector doesn't have to splat) + if (DAG.isConstantIntBuildVectorOrConstantInt(N0) && + !DAG.isConstantIntBuildVectorOrConstantInt(N1)) + return DAG.getNode(N->getOpcode(), SDLoc(N), VT, N1, N0, Scale); + + // fold (mulfix x, 0, scale) -> 0 + if (isNullConstant(N1)) + return DAG.getConstant(0, SDLoc(N), VT); + + return SDValue(); +} + SDValue DAGCombiner::visitMUL(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); @@ -3537,7 +3409,7 @@ SDValue DAGCombiner::visitMUL(SDNode *N) { // x * 15 --> (x << 4) - x // x * -33 --> -((x << 5) + x) // x * -15 --> -((x << 4) - x) ; this reduces --> x - (x << 4) - if (N1IsConst && TLI.decomposeMulByConstant(VT, N1)) { + if (N1IsConst && TLI.decomposeMulByConstant(*DAG.getContext(), VT, N1)) { // TODO: We could handle more general decomposition of any constant by // having the target set a limit on number of ops and making a // callback to determine that sequence (similar to sqrt expansion). @@ -4083,10 +3955,10 @@ SDValue DAGCombiner::visitMULHS(SDNode *N) { if (VT.isVector()) { // fold (mulhs x, 0) -> 0 - if (ISD::isBuildVectorAllZeros(N1.getNode())) - return N1; - if (ISD::isBuildVectorAllZeros(N0.getNode())) - return N0; + // do not return N0/N1, because undef node may exist. + if (ISD::isBuildVectorAllZeros(N0.getNode()) || + ISD::isBuildVectorAllZeros(N1.getNode())) + return DAG.getConstant(0, DL, VT); } // fold (mulhs x, 0) -> 0 @@ -4095,7 +3967,7 @@ SDValue DAGCombiner::visitMULHS(SDNode *N) { // fold (mulhs x, 1) -> (sra x, size(x)-1) if (isOneConstant(N1)) return DAG.getNode(ISD::SRA, DL, N0.getValueType(), N0, - DAG.getConstant(N0.getValueSizeInBits() - 1, DL, + DAG.getConstant(N0.getScalarValueSizeInBits() - 1, DL, getShiftAmountTy(N0.getValueType()))); // fold (mulhs x, undef) -> 0 @@ -4130,10 +4002,10 @@ SDValue DAGCombiner::visitMULHU(SDNode *N) { if (VT.isVector()) { // fold (mulhu x, 0) -> 0 - if (ISD::isBuildVectorAllZeros(N1.getNode())) - return N1; - if (ISD::isBuildVectorAllZeros(N0.getNode())) - return N0; + // do not return N0/N1, because undef node may exist. + if (ISD::isBuildVectorAllZeros(N0.getNode()) || + ISD::isBuildVectorAllZeros(N1.getNode())) + return DAG.getConstant(0, DL, VT); } // fold (mulhu x, 0) -> 0 @@ -4265,6 +4137,18 @@ SDValue DAGCombiner::visitUMUL_LOHI(SDNode *N) { EVT VT = N->getValueType(0); SDLoc DL(N); + // (umul_lohi N0, 0) -> (0, 0) + if (isNullConstant(N->getOperand(1))) { + SDValue Zero = DAG.getConstant(0, DL, VT); + return CombineTo(N, Zero, Zero); + } + + // (umul_lohi N0, 1) -> (N0, 0) + if (isOneConstant(N->getOperand(1))) { + SDValue Zero = DAG.getConstant(0, DL, VT); + return CombineTo(N, N->getOperand(0), Zero); + } + // If the type is twice as wide is legal, transform the mulhu to a wider // multiply plus a shift. if (VT.isSimple() && !VT.isVector()) { @@ -4290,13 +4174,29 @@ SDValue DAGCombiner::visitUMUL_LOHI(SDNode *N) { } SDValue DAGCombiner::visitMULO(SDNode *N) { + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + EVT VT = N0.getValueType(); bool IsSigned = (ISD::SMULO == N->getOpcode()); + EVT CarryVT = N->getValueType(1); + SDLoc DL(N); + + // canonicalize constant to RHS. 
+ if (DAG.isConstantIntBuildVectorOrConstantInt(N0) && + !DAG.isConstantIntBuildVectorOrConstantInt(N1)) + return DAG.getNode(N->getOpcode(), DL, N->getVTList(), N1, N0); + + // fold (mulo x, 0) -> 0 + no carry out + if (isNullOrNullSplat(N1)) + return CombineTo(N, DAG.getConstant(0, DL, VT), + DAG.getConstant(0, DL, CarryVT)); + + // (mulo x, 2) -> (addo x, x) - if (ConstantSDNode *C2 = isConstOrConstSplat(N->getOperand(1))) + if (ConstantSDNode *C2 = isConstOrConstSplat(N1)) if (C2->getAPIntValue() == 2) - return DAG.getNode(IsSigned ? ISD::SADDO : ISD::UADDO, SDLoc(N), - N->getVTList(), N->getOperand(0), N->getOperand(0)); + return DAG.getNode(IsSigned ? ISD::SADDO : ISD::UADDO, DL, + N->getVTList(), N0, N0); return SDValue(); } @@ -4444,7 +4344,9 @@ SDValue DAGCombiner::hoistLogicOpWithSameOpcodeHands(SDNode *N) { if ((HandOpcode == ISD::BITCAST || HandOpcode == ISD::SCALAR_TO_VECTOR) && Level <= AfterLegalizeTypes) { // Input types must be integer and the same. - if (XVT.isInteger() && XVT == Y.getValueType()) { + if (XVT.isInteger() && XVT == Y.getValueType() && + !(VT.isVector() && TLI.isTypeLegal(VT) && + !XVT.isVector() && !TLI.isTypeLegal(XVT))) { SDValue Logic = DAG.getNode(LogicOpcode, DL, XVT, X, Y); return DAG.getNode(HandOpcode, DL, VT, Logic); } @@ -4770,8 +4672,8 @@ bool DAGCombiner::isAndLoadExtLoad(ConstantSDNode *AndC, LoadSDNode *LoadN, return true; } - // Do not change the width of a volatile load. - if (LoadN->isVolatile()) + // Do not change the width of a volatile or atomic load. + if (!LoadN->isSimple()) return false; // Do not generate loads of non-round integer types since these can @@ -4803,15 +4705,15 @@ bool DAGCombiner::isLegalNarrowLdSt(LSBaseSDNode *LDST, if (!MemVT.isRound()) return false; - // Don't change the width of a volatile load. - if (LDST->isVolatile()) + // Don't change the width of a volatile or atomic load. + if (!LDST->isSimple()) return false; // Verify that we are actually reducing a load width here. if (LDST->getMemoryVT().getSizeInBits() < MemVT.getSizeInBits()) return false; - // Ensure that this isn't going to produce an unsupported unaligned access. + // Ensure that this isn't going to produce an unsupported memory access. if (ShAmt && !TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), MemVT, LDST->getAddressSpace(), ShAmt / 8, @@ -5076,6 +4978,59 @@ SDValue DAGCombiner::unfoldExtremeBitClearingToShifts(SDNode *N) { return T1; } +/// Try to replace shift/logic that tests if a bit is clear with mask + setcc. +/// For a target with a bit test, this is expected to become test + set and save +/// at least 1 instruction. +static SDValue combineShiftAnd1ToBitTest(SDNode *And, SelectionDAG &DAG) { + assert(And->getOpcode() == ISD::AND && "Expected an 'and' op"); + + // This is probably not worthwhile without a supported type. + EVT VT = And->getValueType(0); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + if (!TLI.isTypeLegal(VT)) + return SDValue(); + + // Look through an optional extension and find a 'not'. + // TODO: Should we favor test+set even without the 'not' op? + SDValue Not = And->getOperand(0), And1 = And->getOperand(1); + if (Not.getOpcode() == ISD::ANY_EXTEND) + Not = Not.getOperand(0); + if (!isBitwiseNot(Not) || !Not.hasOneUse() || !isOneConstant(And1)) + return SDValue(); + + // Look through an optional truncation. The source operand may not be the same + // type as the original 'and', but that is ok because we are masking off + // everything but the low bit. 
+ SDValue Srl = Not.getOperand(0); + if (Srl.getOpcode() == ISD::TRUNCATE) + Srl = Srl.getOperand(0); + + // Match a shift-right by constant. + if (Srl.getOpcode() != ISD::SRL || !Srl.hasOneUse() || + !isa(Srl.getOperand(1))) + return SDValue(); + + // We might have looked through casts that make this transform invalid. + // TODO: If the source type is wider than the result type, do the mask and + // compare in the source type. + const APInt &ShiftAmt = Srl.getConstantOperandAPInt(1); + unsigned VTBitWidth = VT.getSizeInBits(); + if (ShiftAmt.uge(VTBitWidth)) + return SDValue(); + + // Turn this into a bit-test pattern using mask op + setcc: + // and (not (srl X, C)), 1 --> (and X, 1<getOperand(0); SDValue N1 = N->getOperand(1); @@ -5163,6 +5118,7 @@ SDValue DAGCombiner::visitAND(SDNode *N) { return SDValue(N, 0); // Return N so it doesn't get rechecked! } } + // similarly fold (and (X (load ([non_ext|any_ext|zero_ext] V))), c) -> // (X (load ([non_ext|zero_ext] V))) if 'and' only clears top bits which must // already be zero by virtue of the width of the base type of the load. @@ -5337,7 +5293,7 @@ SDValue DAGCombiner::visitAND(SDNode *N) { unsigned MemBitSize = MemVT.getScalarSizeInBits(); APInt ExtBits = APInt::getHighBitsSet(ExtBitSize, ExtBitSize - MemBitSize); if (DAG.MaskedValueIsZero(N1, ExtBits) && - ((!LegalOperations && !LN0->isVolatile()) || + ((!LegalOperations && LN0->isSimple()) || TLI.isLoadExtLegal(ISD::ZEXTLOAD, VT, MemVT))) { SDValue ExtLoad = DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(N0), VT, LN0->getChain(), @@ -5358,6 +5314,10 @@ SDValue DAGCombiner::visitAND(SDNode *N) { if (SDValue Shifts = unfoldExtremeBitClearingToShifts(N)) return Shifts; + if (TLI.hasBitTest(N0, N1)) + if (SDValue V = combineShiftAnd1ToBitTest(N, DAG)) + return V; + return SDValue(); } @@ -5564,6 +5524,23 @@ static bool isBSwapHWordElement(SDValue N, MutableArrayRef Parts) { return true; } +// Match 2 elements of a packed halfword bswap. +static bool isBSwapHWordPair(SDValue N, MutableArrayRef Parts) { + if (N.getOpcode() == ISD::OR) + return isBSwapHWordElement(N.getOperand(0), Parts) && + isBSwapHWordElement(N.getOperand(1), Parts); + + if (N.getOpcode() == ISD::SRL && N.getOperand(0).getOpcode() == ISD::BSWAP) { + ConstantSDNode *C = isConstOrConstSplat(N.getOperand(1)); + if (!C || C->getAPIntValue() != 16) + return false; + Parts[0] = Parts[1] = N.getOperand(0).getOperand(0).getNode(); + return true; + } + + return false; +} + /// Match a 32-bit packed halfword bswap. 
That is /// ((x & 0x000000ff) << 8) | /// ((x & 0x0000ff00) >> 8) | @@ -5581,43 +5558,26 @@ SDValue DAGCombiner::MatchBSwapHWord(SDNode *N, SDValue N0, SDValue N1) { return SDValue(); // Look for either - // (or (or (and), (and)), (or (and), (and))) - // (or (or (or (and), (and)), (and)), (and)) - if (N0.getOpcode() != ISD::OR) - return SDValue(); - SDValue N00 = N0.getOperand(0); - SDValue N01 = N0.getOperand(1); + // (or (bswaphpair), (bswaphpair)) + // (or (or (bswaphpair), (and)), (and)) + // (or (or (and), (bswaphpair)), (and)) SDNode *Parts[4] = {}; - if (N1.getOpcode() == ISD::OR && - N00.getNumOperands() == 2 && N01.getNumOperands() == 2) { + if (isBSwapHWordPair(N0, Parts)) { // (or (or (and), (and)), (or (and), (and))) - if (!isBSwapHWordElement(N00, Parts)) + if (!isBSwapHWordPair(N1, Parts)) return SDValue(); - - if (!isBSwapHWordElement(N01, Parts)) - return SDValue(); - SDValue N10 = N1.getOperand(0); - if (!isBSwapHWordElement(N10, Parts)) - return SDValue(); - SDValue N11 = N1.getOperand(1); - if (!isBSwapHWordElement(N11, Parts)) - return SDValue(); - } else { + } else if (N0.getOpcode() == ISD::OR) { // (or (or (or (and), (and)), (and)), (and)) if (!isBSwapHWordElement(N1, Parts)) return SDValue(); - if (!isBSwapHWordElement(N01, Parts)) - return SDValue(); - if (N00.getOpcode() != ISD::OR) - return SDValue(); - SDValue N000 = N00.getOperand(0); - if (!isBSwapHWordElement(N000, Parts)) - return SDValue(); - SDValue N001 = N00.getOperand(1); - if (!isBSwapHWordElement(N001, Parts)) + SDValue N00 = N0.getOperand(0); + SDValue N01 = N0.getOperand(1); + if (!(isBSwapHWordElement(N01, Parts) && isBSwapHWordPair(N00, Parts)) && + !(isBSwapHWordElement(N00, Parts) && isBSwapHWordPair(N01, Parts))) return SDValue(); - } + } else + return SDValue(); // Make sure the parts are all coming from the same node. if (Parts[0] != Parts[1] || Parts[0] != Parts[2] || Parts[0] != Parts[3]) @@ -5791,15 +5751,11 @@ SDValue DAGCombiner::visitOR(SDNode *N) { SDValue NewLHS = ZeroN00 ? N0.getOperand(1) : N0.getOperand(0); SDValue NewRHS = ZeroN10 ? N1.getOperand(1) : N1.getOperand(0); - bool LegalMask = TLI.isShuffleMaskLegal(Mask, VT); - if (!LegalMask) { - std::swap(NewLHS, NewRHS); - ShuffleVectorSDNode::commuteMask(Mask); - LegalMask = TLI.isShuffleMaskLegal(Mask, VT); - } - - if (LegalMask) - return DAG.getVectorShuffle(VT, SDLoc(N), NewLHS, NewRHS, Mask); + SDValue LegalShuffle = + TLI.buildLegalVectorShuffle(VT, SDLoc(N), NewLHS, NewRHS, + Mask, DAG); + if (LegalShuffle) + return LegalShuffle; } } } @@ -5867,8 +5823,8 @@ SDValue DAGCombiner::visitOR(SDNode *N) { return V; // See if this is some rotate idiom. - if (SDNode *Rot = MatchRotate(N0, N1, SDLoc(N))) - return SDValue(Rot, 0); + if (SDValue Rot = MatchRotate(N0, N1, SDLoc(N))) + return Rot; if (SDValue Load = MatchLoadCombine(N)) return Load; @@ -5914,6 +5870,9 @@ static bool matchRotateHalf(SelectionDAG &DAG, SDValue Op, SDValue &Shift, /// Otherwise, returns an expansion of \p ExtractFrom based on the following /// patterns: /// +/// (or (add v v) (shrl v bitwidth-1)): +/// expands (add v v) -> (shl v 1) +/// /// (or (mul v c0) (shrl (mul v c1) c2)): /// expands (mul v c0) -> (shl (mul v c1) c3) /// @@ -5936,6 +5895,23 @@ static SDValue extractShiftForRotate(SelectionDAG &DAG, SDValue OppShift, "Existing shift must be valid as a rotate half"); ExtractFrom = stripConstantMask(DAG, ExtractFrom, Mask); + + // Value and Type of the shift. 
+ SDValue OppShiftLHS = OppShift.getOperand(0); + EVT ShiftedVT = OppShiftLHS.getValueType(); + + // Amount of the existing shift. + ConstantSDNode *OppShiftCst = isConstOrConstSplat(OppShift.getOperand(1)); + + // (add v v) -> (shl v 1) + if (OppShift.getOpcode() == ISD::SRL && OppShiftCst && + ExtractFrom.getOpcode() == ISD::ADD && + ExtractFrom.getOperand(0) == ExtractFrom.getOperand(1) && + ExtractFrom.getOperand(0) == OppShiftLHS && + OppShiftCst->getAPIntValue() == ShiftedVT.getScalarSizeInBits() - 1) + return DAG.getNode(ISD::SHL, DL, ShiftedVT, OppShiftLHS, + DAG.getShiftAmountConstant(1, ShiftedVT, DL)); + // Preconditions: // (or (op0 v c0) (shiftl/r (op0 v c1) c2)) // @@ -5959,15 +5935,11 @@ static SDValue extractShiftForRotate(SelectionDAG &DAG, SDValue OppShift, // op0 must be the same opcode on both sides, have the same LHS argument, // and produce the same value type. - SDValue OppShiftLHS = OppShift.getOperand(0); - EVT ShiftedVT = OppShiftLHS.getValueType(); if (OppShiftLHS.getOpcode() != ExtractFrom.getOpcode() || OppShiftLHS.getOperand(0) != ExtractFrom.getOperand(0) || ShiftedVT != ExtractFrom.getValueType()) return SDValue(); - // Amount of the existing shift. - ConstantSDNode *OppShiftCst = isConstOrConstSplat(OppShift.getOperand(1)); // Constant mul/udiv/shift amount from the RHS of the shift's LHS op. ConstantSDNode *OppLHSCst = isConstOrConstSplat(OppShiftLHS.getOperand(1)); // Constant mul/udiv/shift amount from the RHS of the ExtractFrom op. @@ -6137,7 +6109,7 @@ static bool matchRotateSub(SDValue Pos, SDValue Neg, unsigned EltSize, // to both (PosOpcode Shifted, Pos) and (NegOpcode Shifted, Neg), with the // former being preferred if supported. InnerPos and InnerNeg are Pos and // Neg with outer conversions stripped away. -SDNode *DAGCombiner::MatchRotatePosNeg(SDValue Shifted, SDValue Pos, +SDValue DAGCombiner::MatchRotatePosNeg(SDValue Shifted, SDValue Pos, SDValue Neg, SDValue InnerPos, SDValue InnerNeg, unsigned PosOpcode, unsigned NegOpcode, const SDLoc &DL) { @@ -6152,32 +6124,33 @@ SDNode *DAGCombiner::MatchRotatePosNeg(SDValue Shifted, SDValue Pos, if (matchRotateSub(InnerPos, InnerNeg, VT.getScalarSizeInBits(), DAG)) { bool HasPos = TLI.isOperationLegalOrCustom(PosOpcode, VT); return DAG.getNode(HasPos ? PosOpcode : NegOpcode, DL, VT, Shifted, - HasPos ? Pos : Neg).getNode(); + HasPos ? Pos : Neg); } - return nullptr; + return SDValue(); } // MatchRotate - Handle an 'or' of two operands. If this is one of the many // idioms for rotate, and if the target supports rotation instructions, generate // a rot[lr]. -SDNode *DAGCombiner::MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL) { +SDValue DAGCombiner::MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL) { // Must be a legal type. Expanded 'n promoted things won't work with rotates. EVT VT = LHS.getValueType(); - if (!TLI.isTypeLegal(VT)) return nullptr; + if (!TLI.isTypeLegal(VT)) + return SDValue(); // The target must have at least one rotate flavor. bool HasROTL = hasOperation(ISD::ROTL, VT); bool HasROTR = hasOperation(ISD::ROTR, VT); - if (!HasROTL && !HasROTR) return nullptr; + if (!HasROTL && !HasROTR) + return SDValue(); // Check for truncated rotate. 
if (LHS.getOpcode() == ISD::TRUNCATE && RHS.getOpcode() == ISD::TRUNCATE && LHS.getOperand(0).getValueType() == RHS.getOperand(0).getValueType()) { assert(LHS.getValueType() == RHS.getValueType()); - if (SDNode *Rot = MatchRotate(LHS.getOperand(0), RHS.getOperand(0), DL)) { - return DAG.getNode(ISD::TRUNCATE, SDLoc(LHS), LHS.getValueType(), - SDValue(Rot, 0)).getNode(); + if (SDValue Rot = MatchRotate(LHS.getOperand(0), RHS.getOperand(0), DL)) { + return DAG.getNode(ISD::TRUNCATE, SDLoc(LHS), LHS.getValueType(), Rot); } } @@ -6192,7 +6165,7 @@ SDNode *DAGCombiner::MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL) { // If neither side matched a rotate half, bail if (!LHSShift && !RHSShift) - return nullptr; + return SDValue(); // InstCombine may have combined a constant shl, srl, mul, or udiv with one // side of the rotate, so try to handle that here. In all cases we need to @@ -6215,15 +6188,15 @@ SDNode *DAGCombiner::MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL) { // If a side is still missing, nothing else we can do. if (!RHSShift || !LHSShift) - return nullptr; + return SDValue(); // At this point we've matched or extracted a shift op on each side. if (LHSShift.getOperand(0) != RHSShift.getOperand(0)) - return nullptr; // Not shifting the same value. + return SDValue(); // Not shifting the same value. if (LHSShift.getOpcode() == RHSShift.getOpcode()) - return nullptr; // Shifts must disagree. + return SDValue(); // Shifts must disagree. // Canonicalize shl to left side in a shl/srl pair. if (RHSShift.getOpcode() == ISD::SHL) { @@ -6267,13 +6240,13 @@ SDNode *DAGCombiner::MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL) { Rot = DAG.getNode(ISD::AND, DL, VT, Rot, Mask); } - return Rot.getNode(); + return Rot; } // If there is a mask here, and we have a variable shift, we can't be sure // that we're masking out the right stuff. if (LHSMask.getNode() || RHSMask.getNode()) - return nullptr; + return SDValue(); // If the shift amount is sign/zext/any-extended just peel it off. 
SDValue LExtOp0 = LHSShiftAmt; @@ -6290,17 +6263,17 @@ SDNode *DAGCombiner::MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL) { RExtOp0 = RHSShiftAmt.getOperand(0); } - SDNode *TryL = MatchRotatePosNeg(LHSShiftArg, LHSShiftAmt, RHSShiftAmt, + SDValue TryL = MatchRotatePosNeg(LHSShiftArg, LHSShiftAmt, RHSShiftAmt, LExtOp0, RExtOp0, ISD::ROTL, ISD::ROTR, DL); if (TryL) return TryL; - SDNode *TryR = MatchRotatePosNeg(RHSShiftArg, RHSShiftAmt, LHSShiftAmt, + SDValue TryR = MatchRotatePosNeg(RHSShiftArg, RHSShiftAmt, LHSShiftAmt, RExtOp0, LExtOp0, ISD::ROTR, ISD::ROTL, DL); if (TryR) return TryR; - return nullptr; + return SDValue(); } namespace { @@ -6415,7 +6388,7 @@ calculateByteProvider(SDValue Op, unsigned Index, unsigned Depth, Depth + 1); case ISD::LOAD: { auto L = cast(Op.getNode()); - if (L->isVolatile() || L->isIndexed()) + if (!L->isSimple() || L->isIndexed()) return None; unsigned NarrowBitWidth = L->getMemoryVT().getSizeInBits(); @@ -6504,8 +6477,9 @@ SDValue DAGCombiner::MatchStoreCombine(StoreSDNode *N) { SDValue Chain; SmallVector Stores; for (StoreSDNode *Store = N; Store; Store = dyn_cast(Chain)) { + // TODO: Allow unordered atomics when wider type is legal (see D66309) if (Store->getMemoryVT() != MVT::i8 || - Store->isVolatile() || Store->isIndexed()) + !Store->isSimple() || Store->isIndexed()) return SDValue(); Stores.push_back(Store); Chain = Store->getChain(); @@ -6716,7 +6690,8 @@ SDValue DAGCombiner::MatchLoadCombine(SDNode *N) { return SDValue(); LoadSDNode *L = P->Load; - assert(L->hasNUsesOfValue(1, 0) && !L->isVolatile() && !L->isIndexed() && + assert(L->hasNUsesOfValue(1, 0) && L->isSimple() && + !L->isIndexed() && "Must be enforced by calculateByteProvider"); assert(L->getOffset().isUndef() && "Unindexed load must have undef offset"); @@ -6958,25 +6933,25 @@ SDValue DAGCombiner::visitXOR(SDNode *N) { // fold (not (or x, y)) -> (and (not x), (not y)) iff x or y are setcc if (isOneConstant(N1) && VT == MVT::i1 && N0.hasOneUse() && (N0Opcode == ISD::OR || N0Opcode == ISD::AND)) { - SDValue LHS = N0.getOperand(0), RHS = N0.getOperand(1); - if (isOneUseSetCC(RHS) || isOneUseSetCC(LHS)) { + SDValue N00 = N0.getOperand(0), N01 = N0.getOperand(1); + if (isOneUseSetCC(N01) || isOneUseSetCC(N00)) { unsigned NewOpcode = N0Opcode == ISD::AND ? ISD::OR : ISD::AND; - LHS = DAG.getNode(ISD::XOR, SDLoc(LHS), VT, LHS, N1); // LHS = ~LHS - RHS = DAG.getNode(ISD::XOR, SDLoc(RHS), VT, RHS, N1); // RHS = ~RHS - AddToWorklist(LHS.getNode()); AddToWorklist(RHS.getNode()); - return DAG.getNode(NewOpcode, DL, VT, LHS, RHS); + N00 = DAG.getNode(ISD::XOR, SDLoc(N00), VT, N00, N1); // N00 = ~N00 + N01 = DAG.getNode(ISD::XOR, SDLoc(N01), VT, N01, N1); // N01 = ~N01 + AddToWorklist(N00.getNode()); AddToWorklist(N01.getNode()); + return DAG.getNode(NewOpcode, DL, VT, N00, N01); } } // fold (not (or x, y)) -> (and (not x), (not y)) iff x or y are constants if (isAllOnesConstant(N1) && N0.hasOneUse() && (N0Opcode == ISD::OR || N0Opcode == ISD::AND)) { - SDValue LHS = N0.getOperand(0), RHS = N0.getOperand(1); - if (isa(RHS) || isa(LHS)) { + SDValue N00 = N0.getOperand(0), N01 = N0.getOperand(1); + if (isa(N01) || isa(N00)) { unsigned NewOpcode = N0Opcode == ISD::AND ? 
ISD::OR : ISD::AND; - LHS = DAG.getNode(ISD::XOR, SDLoc(LHS), VT, LHS, N1); // LHS = ~LHS - RHS = DAG.getNode(ISD::XOR, SDLoc(RHS), VT, RHS, N1); // RHS = ~RHS - AddToWorklist(LHS.getNode()); AddToWorklist(RHS.getNode()); - return DAG.getNode(NewOpcode, DL, VT, LHS, RHS); + N00 = DAG.getNode(ISD::XOR, SDLoc(N00), VT, N00, N1); // N00 = ~N00 + N01 = DAG.getNode(ISD::XOR, SDLoc(N01), VT, N01, N1); // N01 = ~N01 + AddToWorklist(N00.getNode()); AddToWorklist(N01.getNode()); + return DAG.getNode(NewOpcode, DL, VT, N00, N01); } } @@ -7079,26 +7054,103 @@ SDValue DAGCombiner::visitXOR(SDNode *N) { return SDValue(); } +/// If we have a shift-by-constant of a bitwise logic op that itself has a +/// shift-by-constant operand with identical opcode, we may be able to convert +/// that into 2 independent shifts followed by the logic op. This is a +/// throughput improvement. +static SDValue combineShiftOfShiftedLogic(SDNode *Shift, SelectionDAG &DAG) { + // Match a one-use bitwise logic op. + SDValue LogicOp = Shift->getOperand(0); + if (!LogicOp.hasOneUse()) + return SDValue(); + + unsigned LogicOpcode = LogicOp.getOpcode(); + if (LogicOpcode != ISD::AND && LogicOpcode != ISD::OR && + LogicOpcode != ISD::XOR) + return SDValue(); + + // Find a matching one-use shift by constant. + unsigned ShiftOpcode = Shift->getOpcode(); + SDValue C1 = Shift->getOperand(1); + ConstantSDNode *C1Node = isConstOrConstSplat(C1); + assert(C1Node && "Expected a shift with constant operand"); + const APInt &C1Val = C1Node->getAPIntValue(); + auto matchFirstShift = [&](SDValue V, SDValue &ShiftOp, + const APInt *&ShiftAmtVal) { + if (V.getOpcode() != ShiftOpcode || !V.hasOneUse()) + return false; + + ConstantSDNode *ShiftCNode = isConstOrConstSplat(V.getOperand(1)); + if (!ShiftCNode) + return false; + + // Capture the shifted operand and shift amount value. + ShiftOp = V.getOperand(0); + ShiftAmtVal = &ShiftCNode->getAPIntValue(); + + // Shift amount types do not have to match their operand type, so check that + // the constants are the same width. + if (ShiftAmtVal->getBitWidth() != C1Val.getBitWidth()) + return false; + + // The fold is not valid if the sum of the shift values exceeds bitwidth. + if ((*ShiftAmtVal + C1Val).uge(V.getScalarValueSizeInBits())) + return false; + + return true; + }; + + // Logic ops are commutative, so check each operand for a match. + SDValue X, Y; + const APInt *C0Val; + if (matchFirstShift(LogicOp.getOperand(0), X, C0Val)) + Y = LogicOp.getOperand(1); + else if (matchFirstShift(LogicOp.getOperand(1), X, C0Val)) + Y = LogicOp.getOperand(0); + else + return SDValue(); + + // shift (logic (shift X, C0), Y), C1 -> logic (shift X, C0+C1), (shift Y, C1) + SDLoc DL(Shift); + EVT VT = Shift->getValueType(0); + EVT ShiftAmtVT = Shift->getOperand(1).getValueType(); + SDValue ShiftSumC = DAG.getConstant(*C0Val + C1Val, DL, ShiftAmtVT); + SDValue NewShift1 = DAG.getNode(ShiftOpcode, DL, VT, X, ShiftSumC); + SDValue NewShift2 = DAG.getNode(ShiftOpcode, DL, VT, Y, C1); + return DAG.getNode(LogicOpcode, DL, VT, NewShift1, NewShift2); +} + /// Handle transforms common to the three shifts, when the shift amount is a /// constant. 
/// We are looking for: (shift being one of shl/sra/srl) /// shift (binop X, C0), C1 /// And want to transform into: /// binop (shift X, C1), (shift C0, C1) -SDValue DAGCombiner::visitShiftByConstant(SDNode *N, ConstantSDNode *Amt) { +SDValue DAGCombiner::visitShiftByConstant(SDNode *N) { + assert(isConstOrConstSplat(N->getOperand(1)) && "Expected constant operand"); + // Do not turn a 'not' into a regular xor. if (isBitwiseNot(N->getOperand(0))) return SDValue(); // The inner binop must be one-use, since we want to replace it. - SDNode *LHS = N->getOperand(0).getNode(); - if (!LHS->hasOneUse()) return SDValue(); + SDValue LHS = N->getOperand(0); + if (!LHS.hasOneUse() || !TLI.isDesirableToCommuteWithShift(N, Level)) + return SDValue(); + + // TODO: This is limited to early combining because it may reveal regressions + // otherwise. But since we just checked a target hook to see if this is + // desirable, that should have filtered out cases where this interferes + // with some other pattern matching. + if (!LegalTypes) + if (SDValue R = combineShiftOfShiftedLogic(N, DAG)) + return R; // We want to pull some binops through shifts, so that we have (and (shift)) // instead of (shift (and)), likewise for add, or, xor, etc. This sort of // thing happens with address calculations, so it's important to canonicalize // it. - switch (LHS->getOpcode()) { + switch (LHS.getOpcode()) { default: return SDValue(); case ISD::OR: @@ -7112,14 +7164,14 @@ SDValue DAGCombiner::visitShiftByConstant(SDNode *N, ConstantSDNode *Amt) { } // We require the RHS of the binop to be a constant and not opaque as well. - ConstantSDNode *BinOpCst = getAsNonOpaqueConstant(LHS->getOperand(1)); + ConstantSDNode *BinOpCst = getAsNonOpaqueConstant(LHS.getOperand(1)); if (!BinOpCst) return SDValue(); // FIXME: disable this unless the input to the binop is a shift by a constant // or is copy/select. Enable this in other cases when figure out it's exactly // profitable. - SDValue BinOpLHSVal = LHS->getOperand(0); + SDValue BinOpLHSVal = LHS.getOperand(0); bool IsShiftByConstant = (BinOpLHSVal.getOpcode() == ISD::SHL || BinOpLHSVal.getOpcode() == ISD::SRA || BinOpLHSVal.getOpcode() == ISD::SRL) && @@ -7133,24 +7185,16 @@ SDValue DAGCombiner::visitShiftByConstant(SDNode *N, ConstantSDNode *Amt) { if (IsCopyOrSelect && N->hasOneUse()) return SDValue(); - EVT VT = N->getValueType(0); - - if (!TLI.isDesirableToCommuteWithShift(N, Level)) - return SDValue(); - // Fold the constants, shifting the binop RHS by the shift amount. - SDValue NewRHS = DAG.getNode(N->getOpcode(), SDLoc(LHS->getOperand(1)), - N->getValueType(0), - LHS->getOperand(1), N->getOperand(1)); + SDLoc DL(N); + EVT VT = N->getValueType(0); + SDValue NewRHS = DAG.getNode(N->getOpcode(), DL, VT, LHS.getOperand(1), + N->getOperand(1)); assert(isa(NewRHS) && "Folding was not successful!"); - // Create the new shift. - SDValue NewShift = DAG.getNode(N->getOpcode(), - SDLoc(LHS->getOperand(0)), - VT, LHS->getOperand(0), N->getOperand(1)); - - // Create the new binop. 
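// [Illustrative aside, not part of the patch] A standalone sketch of the plain
// integer identities behind combineShiftOfShiftedLogic() and
// visitShiftByConstant() above, checked on uint32_t values. The helper names
// below are hypothetical and exist only for this sketch.
#include <cassert>
#include <cstdint>

// shift (logic (shift X, C0), Y), C1 --> logic (shift X, C0+C1), (shift Y, C1)
// (valid while C0 + C1 stays below the bit width).
static bool shiftOfShiftedLogicHolds(uint32_t X, uint32_t Y, unsigned C0,
                                     unsigned C1) {
  uint32_t Fused = ((X << C0) ^ Y) << C1;
  uint32_t Split = (X << (C0 + C1)) ^ (Y << C1);
  return Fused == Split;
}

// shift (binop X, C0), C1 --> binop (shift X, C1), (shift C0, C1)
// shown here for the 'or' and 'add' cases with a left shift.
static bool shiftByConstantHolds(uint32_t X, uint32_t C0, unsigned C1) {
  bool OrOk  = ((X | C0) << C1) == ((X << C1) | (C0 << C1));
  bool AddOk = ((X + C0) << C1) == ((X << C1) + (C0 << C1)); // wraps mod 2^32
  return OrOk && AddOk;
}

int main() {
  assert(shiftOfShiftedLogicHolds(0x12345678u, 0x0F0F0F0Fu, 3, 5));
  assert(shiftByConstantHolds(0xDEADBEEFu, 0xFFu, 4));
  return 0;
}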
- return DAG.getNode(LHS->getOpcode(), SDLoc(N), VT, NewShift, NewRHS); + SDValue NewShift = DAG.getNode(N->getOpcode(), DL, VT, LHS.getOperand(0), + N->getOperand(1)); + return DAG.getNode(LHS.getOpcode(), DL, VT, NewShift, NewRHS); } SDValue DAGCombiner::distributeTruncateThroughAnd(SDNode *N) { @@ -7478,7 +7522,7 @@ SDValue DAGCombiner::visitSHL(SDNode *N) { } if (N1C && !N1C->isOpaque()) - if (SDValue NewSHL = visitShiftByConstant(N, N1C)) + if (SDValue NewSHL = visitShiftByConstant(N)) return NewSHL; return SDValue(); @@ -7597,6 +7641,37 @@ SDValue DAGCombiner::visitSRA(SDNode *N) { } } + // We convert trunc/ext to opposing shifts in IR, but casts may be cheaper. + // sra (add (shl X, N1C), AddC), N1C --> + // sext (add (trunc X to (width - N1C)), AddC') + if (!LegalTypes && N0.getOpcode() == ISD::ADD && N0.hasOneUse() && N1C && + N0.getOperand(0).getOpcode() == ISD::SHL && + N0.getOperand(0).getOperand(1) == N1 && N0.getOperand(0).hasOneUse()) { + if (ConstantSDNode *AddC = isConstOrConstSplat(N0.getOperand(1))) { + SDValue Shl = N0.getOperand(0); + // Determine what the truncate's type would be and ask the target if that + // is a free operation. + LLVMContext &Ctx = *DAG.getContext(); + unsigned ShiftAmt = N1C->getZExtValue(); + EVT TruncVT = EVT::getIntegerVT(Ctx, OpSizeInBits - ShiftAmt); + if (VT.isVector()) + TruncVT = EVT::getVectorVT(Ctx, TruncVT, VT.getVectorNumElements()); + + // TODO: The simple type check probably belongs in the default hook + // implementation and/or target-specific overrides (because + // non-simple types likely require masking when legalized), but that + // restriction may conflict with other transforms. + if (TruncVT.isSimple() && TLI.isTruncateFree(VT, TruncVT)) { + SDLoc DL(N); + SDValue Trunc = DAG.getZExtOrTrunc(Shl.getOperand(0), DL, TruncVT); + SDValue ShiftC = DAG.getConstant(AddC->getAPIntValue().lshr(ShiftAmt). + trunc(TruncVT.getScalarSizeInBits()), DL, TruncVT); + SDValue Add = DAG.getNode(ISD::ADD, DL, TruncVT, Trunc, ShiftC); + return DAG.getSExtOrTrunc(Add, DL, VT); + } + } + } + // fold (sra x, (trunc (and y, c))) -> (sra x, (and (trunc y), (trunc c))). if (N1.getOpcode() == ISD::TRUNCATE && N1.getOperand(0).getOpcode() == ISD::AND) { @@ -7638,7 +7713,7 @@ SDValue DAGCombiner::visitSRA(SDNode *N) { return DAG.getNode(ISD::SRL, SDLoc(N), VT, N0, N1); if (N1C && !N1C->isOpaque()) - if (SDValue NewSRA = visitShiftByConstant(N, N1C)) + if (SDValue NewSRA = visitShiftByConstant(N)) return NewSRA; return SDValue(); @@ -7819,7 +7894,7 @@ SDValue DAGCombiner::visitSRL(SDNode *N) { return SDValue(N, 0); if (N1C && !N1C->isOpaque()) - if (SDValue NewSRL = visitShiftByConstant(N, N1C)) + if (SDValue NewSRL = visitShiftByConstant(N)) return NewSRL; // Attempt to convert a srl of a load into a narrower zero-extending load. @@ -8100,6 +8175,43 @@ static SDValue combineMinNumMaxNum(const SDLoc &DL, EVT VT, SDValue LHS, } } +/// If a (v)select has a condition value that is a sign-bit test, try to smear +/// the condition operand sign-bit across the value width and use it as a mask. 
+static SDValue foldSelectOfConstantsUsingSra(SDNode *N, SelectionDAG &DAG) { + SDValue Cond = N->getOperand(0); + SDValue C1 = N->getOperand(1); + SDValue C2 = N->getOperand(2); + assert(isConstantOrConstantVector(C1) && isConstantOrConstantVector(C2) && + "Expected select-of-constants"); + + EVT VT = N->getValueType(0); + if (Cond.getOpcode() != ISD::SETCC || !Cond.hasOneUse() || + VT != Cond.getOperand(0).getValueType()) + return SDValue(); + + // The inverted-condition + commuted-select variants of these patterns are + // canonicalized to these forms in IR. + SDValue X = Cond.getOperand(0); + SDValue CondC = Cond.getOperand(1); + ISD::CondCode CC = cast(Cond.getOperand(2))->get(); + if (CC == ISD::SETGT && isAllOnesOrAllOnesSplat(CondC) && + isAllOnesOrAllOnesSplat(C2)) { + // i32 X > -1 ? C1 : -1 --> (X >>s 31) | C1 + SDLoc DL(N); + SDValue ShAmtC = DAG.getConstant(X.getScalarValueSizeInBits() - 1, DL, VT); + SDValue Sra = DAG.getNode(ISD::SRA, DL, VT, X, ShAmtC); + return DAG.getNode(ISD::OR, DL, VT, Sra, C1); + } + if (CC == ISD::SETLT && isNullOrNullSplat(CondC) && isNullOrNullSplat(C2)) { + // i8 X < 0 ? C1 : 0 --> (X >>s 7) & C1 + SDLoc DL(N); + SDValue ShAmtC = DAG.getConstant(X.getScalarValueSizeInBits() - 1, DL, VT); + SDValue Sra = DAG.getNode(ISD::SRA, DL, VT, X, ShAmtC); + return DAG.getNode(ISD::AND, DL, VT, Sra, C1); + } + return SDValue(); +} + SDValue DAGCombiner::foldSelectOfConstants(SDNode *N) { SDValue Cond = N->getOperand(0); SDValue N1 = N->getOperand(1); @@ -8148,22 +8260,36 @@ SDValue DAGCombiner::foldSelectOfConstants(SDNode *N) { return Cond; } - // For any constants that differ by 1, we can transform the select into an - // extend and add. Use a target hook because some targets may prefer to - // transform in the other direction. + // Use a target hook because some targets may prefer to transform in the + // other direction. if (TLI.convertSelectOfConstantsToMath(VT)) { - if (C1->getAPIntValue() - 1 == C2->getAPIntValue()) { + // For any constants that differ by 1, we can transform the select into an + // extend and add. + const APInt &C1Val = C1->getAPIntValue(); + const APInt &C2Val = C2->getAPIntValue(); + if (C1Val - 1 == C2Val) { // select Cond, C1, C1-1 --> add (zext Cond), C1-1 if (VT != MVT::i1) Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Cond); return DAG.getNode(ISD::ADD, DL, VT, Cond, N2); } - if (C1->getAPIntValue() + 1 == C2->getAPIntValue()) { + if (C1Val + 1 == C2Val) { // select Cond, C1, C1+1 --> add (sext Cond), C1+1 if (VT != MVT::i1) Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Cond); return DAG.getNode(ISD::ADD, DL, VT, Cond, N2); } + + // select Cond, Pow2, 0 --> (zext Cond) << log2(Pow2) + if (C1Val.isPowerOf2() && C2Val.isNullValue()) { + if (VT != MVT::i1) + Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Cond); + SDValue ShAmtC = DAG.getConstant(C1Val.exactLogBase2(), DL, VT); + return DAG.getNode(ISD::SHL, DL, VT, Cond, ShAmtC); + } + + if (SDValue V = foldSelectOfConstantsUsingSra(N, DAG)) + return V; } return SDValue(); @@ -8381,23 +8507,6 @@ SDValue DAGCombiner::visitSELECT(SDNode *N) { return SDValue(); } -static -std::pair SplitVSETCC(const SDNode *N, SelectionDAG &DAG) { - SDLoc DL(N); - EVT LoVT, HiVT; - std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0)); - - // Split the inputs. 
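// [Illustrative aside, not part of the patch] Standalone sketch of the
// select-of-constants identities used by foldSelectOfConstantsUsingSra() and
// the new power-of-two case in foldSelectOfConstants() above. It assumes the
// usual arithmetic right shift for signed values; the helper names are
// hypothetical.
#include <cassert>
#include <cstdint>

static int32_t sraSmearGT(int32_t X, int32_t C1) {
  // X > -1 ? C1 : -1  -->  (X >>s 31) | C1
  return (X >> 31) | C1;
}

static int32_t sraSmearLT(int32_t X, int32_t C1) {
  // X < 0 ? C1 : 0  -->  (X >>s 31) & C1
  return (X >> 31) & C1;
}

static uint32_t selectPow2(bool Cond, unsigned Log2Pow2) {
  // select Cond, Pow2, 0  -->  (zext Cond) << log2(Pow2)
  return static_cast<uint32_t>(Cond) << Log2Pow2;
}

int main() {
  for (int32_t X : {-1000, -1, 0, 1, 1000}) {
    assert(sraSmearGT(X, 42) == (X > -1 ? 42 : -1));
    assert(sraSmearLT(X, 42) == (X < 0 ? 42 : 0));
  }
  for (bool Cond : {false, true})
    assert(selectPow2(Cond, 4) == (Cond ? 16u : 0u));
  return 0;
}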
- SDValue Lo, Hi, LL, LH, RL, RH; - std::tie(LL, LH) = DAG.SplitVectorOperand(N, 0); - std::tie(RL, RH) = DAG.SplitVectorOperand(N, 1); - - Lo = DAG.getNode(N->getOpcode(), DL, LoVT, LL, RL, N->getOperand(2)); - Hi = DAG.getNode(N->getOpcode(), DL, HiVT, LH, RH, N->getOperand(2)); - - return std::make_pair(Lo, Hi); -} - // This function assumes all the vselect's arguments are CONCAT_VECTOR // nodes and that the condition is a BV of ConstantSDNodes (or undefs). static SDValue ConvertSelectToConcatVector(SDNode *N, SelectionDAG &DAG) { @@ -8456,7 +8565,6 @@ static SDValue ConvertSelectToConcatVector(SDNode *N, SelectionDAG &DAG) { SDValue DAGCombiner::visitMSCATTER(SDNode *N) { MaskedScatterSDNode *MSC = cast(N); SDValue Mask = MSC->getMask(); - SDValue Data = MSC->getValue(); SDValue Chain = MSC->getChain(); SDLoc DL(N); @@ -8464,123 +8572,19 @@ SDValue DAGCombiner::visitMSCATTER(SDNode *N) { if (ISD::isBuildVectorAllZeros(Mask.getNode())) return Chain; - if (Level >= AfterLegalizeTypes) - return SDValue(); - - // If the MSCATTER data type requires splitting and the mask is provided by a - // SETCC, then split both nodes and its operands before legalization. This - // prevents the type legalizer from unrolling SETCC into scalar comparisons - // and enables future optimizations (e.g. min/max pattern matching on X86). - if (Mask.getOpcode() != ISD::SETCC) - return SDValue(); - - // Check if any splitting is required. - if (TLI.getTypeAction(*DAG.getContext(), Data.getValueType()) != - TargetLowering::TypeSplitVector) - return SDValue(); - SDValue MaskLo, MaskHi; - std::tie(MaskLo, MaskHi) = SplitVSETCC(Mask.getNode(), DAG); - - EVT LoVT, HiVT; - std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(MSC->getValueType(0)); - - EVT MemoryVT = MSC->getMemoryVT(); - unsigned Alignment = MSC->getOriginalAlignment(); - - EVT LoMemVT, HiMemVT; - std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT); - - SDValue DataLo, DataHi; - std::tie(DataLo, DataHi) = DAG.SplitVector(Data, DL); - - SDValue Scale = MSC->getScale(); - SDValue BasePtr = MSC->getBasePtr(); - SDValue IndexLo, IndexHi; - std::tie(IndexLo, IndexHi) = DAG.SplitVector(MSC->getIndex(), DL); - - MachineMemOperand *MMO = DAG.getMachineFunction(). - getMachineMemOperand(MSC->getPointerInfo(), - MachineMemOperand::MOStore, LoMemVT.getStoreSize(), - Alignment, MSC->getAAInfo(), MSC->getRanges()); - - SDValue OpsLo[] = { Chain, DataLo, MaskLo, BasePtr, IndexLo, Scale }; - SDValue Lo = DAG.getMaskedScatter(DAG.getVTList(MVT::Other), - DataLo.getValueType(), DL, OpsLo, MMO); - - // The order of the Scatter operation after split is well defined. The "Hi" - // part comes after the "Lo". So these two operations should be chained one - // after another. - SDValue OpsHi[] = { Lo, DataHi, MaskHi, BasePtr, IndexHi, Scale }; - return DAG.getMaskedScatter(DAG.getVTList(MVT::Other), DataHi.getValueType(), - DL, OpsHi, MMO); + return SDValue(); } SDValue DAGCombiner::visitMSTORE(SDNode *N) { MaskedStoreSDNode *MST = cast(N); SDValue Mask = MST->getMask(); - SDValue Data = MST->getValue(); SDValue Chain = MST->getChain(); - EVT VT = Data.getValueType(); SDLoc DL(N); // Zap masked stores with a zero mask. if (ISD::isBuildVectorAllZeros(Mask.getNode())) return Chain; - if (Level >= AfterLegalizeTypes) - return SDValue(); - - // If the MSTORE data type requires splitting and the mask is provided by a - // SETCC, then split both nodes and its operands before legalization. 
This - // prevents the type legalizer from unrolling SETCC into scalar comparisons - // and enables future optimizations (e.g. min/max pattern matching on X86). - if (Mask.getOpcode() == ISD::SETCC) { - // Check if any splitting is required. - if (TLI.getTypeAction(*DAG.getContext(), VT) != - TargetLowering::TypeSplitVector) - return SDValue(); - - SDValue MaskLo, MaskHi, Lo, Hi; - std::tie(MaskLo, MaskHi) = SplitVSETCC(Mask.getNode(), DAG); - - SDValue Ptr = MST->getBasePtr(); - - EVT MemoryVT = MST->getMemoryVT(); - unsigned Alignment = MST->getOriginalAlignment(); - - EVT LoMemVT, HiMemVT; - std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT); - - SDValue DataLo, DataHi; - std::tie(DataLo, DataHi) = DAG.SplitVector(Data, DL); - - MachineMemOperand *MMO = DAG.getMachineFunction(). - getMachineMemOperand(MST->getPointerInfo(), - MachineMemOperand::MOStore, LoMemVT.getStoreSize(), - Alignment, MST->getAAInfo(), MST->getRanges()); - - Lo = DAG.getMaskedStore(Chain, DL, DataLo, Ptr, MaskLo, LoMemVT, MMO, - MST->isTruncatingStore(), - MST->isCompressingStore()); - - Ptr = TLI.IncrementMemoryAddress(Ptr, MaskLo, DL, LoMemVT, DAG, - MST->isCompressingStore()); - unsigned HiOffset = LoMemVT.getStoreSize(); - - MMO = DAG.getMachineFunction().getMachineMemOperand( - MST->getPointerInfo().getWithOffset(HiOffset), - MachineMemOperand::MOStore, HiMemVT.getStoreSize(), Alignment, - MST->getAAInfo(), MST->getRanges()); - - Hi = DAG.getMaskedStore(Chain, DL, DataHi, Ptr, MaskHi, HiMemVT, MMO, - MST->isTruncatingStore(), - MST->isCompressingStore()); - - AddToWorklist(Lo.getNode()); - AddToWorklist(Hi.getNode()); - - return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Lo, Hi); - } return SDValue(); } @@ -8593,76 +8597,7 @@ SDValue DAGCombiner::visitMGATHER(SDNode *N) { if (ISD::isBuildVectorAllZeros(Mask.getNode())) return CombineTo(N, MGT->getPassThru(), MGT->getChain()); - if (Level >= AfterLegalizeTypes) - return SDValue(); - - // If the MGATHER result requires splitting and the mask is provided by a - // SETCC, then split both nodes and its operands before legalization. This - // prevents the type legalizer from unrolling SETCC into scalar comparisons - // and enables future optimizations (e.g. min/max pattern matching on X86). - - if (Mask.getOpcode() != ISD::SETCC) - return SDValue(); - - EVT VT = N->getValueType(0); - - // Check if any splitting is required. - if (TLI.getTypeAction(*DAG.getContext(), VT) != - TargetLowering::TypeSplitVector) - return SDValue(); - - SDValue MaskLo, MaskHi, Lo, Hi; - std::tie(MaskLo, MaskHi) = SplitVSETCC(Mask.getNode(), DAG); - - SDValue PassThru = MGT->getPassThru(); - SDValue PassThruLo, PassThruHi; - std::tie(PassThruLo, PassThruHi) = DAG.SplitVector(PassThru, DL); - - EVT LoVT, HiVT; - std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT); - - SDValue Chain = MGT->getChain(); - EVT MemoryVT = MGT->getMemoryVT(); - unsigned Alignment = MGT->getOriginalAlignment(); - - EVT LoMemVT, HiMemVT; - std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT); - - SDValue Scale = MGT->getScale(); - SDValue BasePtr = MGT->getBasePtr(); - SDValue Index = MGT->getIndex(); - SDValue IndexLo, IndexHi; - std::tie(IndexLo, IndexHi) = DAG.SplitVector(Index, DL); - - MachineMemOperand *MMO = DAG.getMachineFunction(). 
- getMachineMemOperand(MGT->getPointerInfo(), - MachineMemOperand::MOLoad, LoMemVT.getStoreSize(), - Alignment, MGT->getAAInfo(), MGT->getRanges()); - - SDValue OpsLo[] = { Chain, PassThruLo, MaskLo, BasePtr, IndexLo, Scale }; - Lo = DAG.getMaskedGather(DAG.getVTList(LoVT, MVT::Other), LoVT, DL, OpsLo, - MMO); - - SDValue OpsHi[] = { Chain, PassThruHi, MaskHi, BasePtr, IndexHi, Scale }; - Hi = DAG.getMaskedGather(DAG.getVTList(HiVT, MVT::Other), HiVT, DL, OpsHi, - MMO); - - AddToWorklist(Lo.getNode()); - AddToWorklist(Hi.getNode()); - - // Build a factor node to remember that this load is independent of the - // other one. - Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Lo.getValue(1), - Hi.getValue(1)); - - // Legalized the chain result - switch anything that used the old chain to - // use the new one. - DAG.ReplaceAllUsesOfValueWith(SDValue(MGT, 1), Chain); - - SDValue GatherRes = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi); - - SDValue RetOps[] = { GatherRes, Chain }; - return DAG.getMergeValues(RetOps, DL); + return SDValue(); } SDValue DAGCombiner::visitMLOAD(SDNode *N) { @@ -8674,76 +8609,6 @@ SDValue DAGCombiner::visitMLOAD(SDNode *N) { if (ISD::isBuildVectorAllZeros(Mask.getNode())) return CombineTo(N, MLD->getPassThru(), MLD->getChain()); - if (Level >= AfterLegalizeTypes) - return SDValue(); - - // If the MLOAD result requires splitting and the mask is provided by a - // SETCC, then split both nodes and its operands before legalization. This - // prevents the type legalizer from unrolling SETCC into scalar comparisons - // and enables future optimizations (e.g. min/max pattern matching on X86). - if (Mask.getOpcode() == ISD::SETCC) { - EVT VT = N->getValueType(0); - - // Check if any splitting is required. - if (TLI.getTypeAction(*DAG.getContext(), VT) != - TargetLowering::TypeSplitVector) - return SDValue(); - - SDValue MaskLo, MaskHi, Lo, Hi; - std::tie(MaskLo, MaskHi) = SplitVSETCC(Mask.getNode(), DAG); - - SDValue PassThru = MLD->getPassThru(); - SDValue PassThruLo, PassThruHi; - std::tie(PassThruLo, PassThruHi) = DAG.SplitVector(PassThru, DL); - - EVT LoVT, HiVT; - std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(MLD->getValueType(0)); - - SDValue Chain = MLD->getChain(); - SDValue Ptr = MLD->getBasePtr(); - EVT MemoryVT = MLD->getMemoryVT(); - unsigned Alignment = MLD->getOriginalAlignment(); - - EVT LoMemVT, HiMemVT; - std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT); - - MachineMemOperand *MMO = DAG.getMachineFunction(). - getMachineMemOperand(MLD->getPointerInfo(), - MachineMemOperand::MOLoad, LoMemVT.getStoreSize(), - Alignment, MLD->getAAInfo(), MLD->getRanges()); - - Lo = DAG.getMaskedLoad(LoVT, DL, Chain, Ptr, MaskLo, PassThruLo, LoMemVT, - MMO, ISD::NON_EXTLOAD, MLD->isExpandingLoad()); - - Ptr = TLI.IncrementMemoryAddress(Ptr, MaskLo, DL, LoMemVT, DAG, - MLD->isExpandingLoad()); - unsigned HiOffset = LoMemVT.getStoreSize(); - - MMO = DAG.getMachineFunction().getMachineMemOperand( - MLD->getPointerInfo().getWithOffset(HiOffset), - MachineMemOperand::MOLoad, HiMemVT.getStoreSize(), Alignment, - MLD->getAAInfo(), MLD->getRanges()); - - Hi = DAG.getMaskedLoad(HiVT, DL, Chain, Ptr, MaskHi, PassThruHi, HiMemVT, - MMO, ISD::NON_EXTLOAD, MLD->isExpandingLoad()); - - AddToWorklist(Lo.getNode()); - AddToWorklist(Hi.getNode()); - - // Build a factor node to remember that this load is independent of the - // other one. 
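// [Illustrative aside, not part of the patch] The zero-mask folds that remain
// in visitMSTORE/visitMLOAD above rely on the per-element semantics of masked
// memory ops: a lane is only touched when its mask bit is set. A tiny scalar
// emulation (hypothetical helpers, fixed 4-wide vectors) shows why an
// all-false mask makes the store a no-op and the load just its pass-through.
#include <array>
#include <cassert>

using Vec4 = std::array<int, 4>;
using Mask4 = std::array<bool, 4>;

static void maskedStore(int *Mem, const Vec4 &Data, const Mask4 &M) {
  for (int i = 0; i < 4; ++i)
    if (M[i])
      Mem[i] = Data[i];
}

static Vec4 maskedLoad(const int *Mem, const Vec4 &PassThru, const Mask4 &M) {
  Vec4 R = PassThru;
  for (int i = 0; i < 4; ++i)
    if (M[i])
      R[i] = Mem[i];
  return R;
}

int main() {
  int Mem[4] = {1, 2, 3, 4};
  const Mask4 AllFalse = {false, false, false, false};
  maskedStore(Mem, {9, 9, 9, 9}, AllFalse);                 // no lane written
  assert(Mem[0] == 1 && Mem[3] == 4);
  Vec4 PassThru = {7, 7, 7, 7};
  assert(maskedLoad(Mem, PassThru, AllFalse) == PassThru);  // just PassThru
  return 0;
}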
- Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Lo.getValue(1), - Hi.getValue(1)); - - // Legalized the chain result - switch anything that used the old chain to - // use the new one. - DAG.ReplaceAllUsesOfValueWith(SDValue(MLD, 1), Chain); - - SDValue LoadRes = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi); - - SDValue RetOps[] = { LoadRes, Chain }; - return DAG.getMergeValues(RetOps, DL); - } return SDValue(); } @@ -8791,6 +8656,18 @@ SDValue DAGCombiner::foldVSelectOfConstants(SDNode *N) { return DAG.getNode(ISD::ADD, DL, VT, ExtendedCond, N2); } + // select Cond, Pow2C, 0 --> (zext Cond) << log2(Pow2C) + APInt Pow2C; + if (ISD::isConstantSplatVector(N1.getNode(), Pow2C) && Pow2C.isPowerOf2() && + isNullOrNullSplat(N2)) { + SDValue ZextCond = DAG.getZExtOrTrunc(Cond, DL, VT); + SDValue ShAmtC = DAG.getConstant(Pow2C.exactLogBase2(), DL, VT); + return DAG.getNode(ISD::SHL, DL, VT, ZextCond, ShAmtC); + } + + if (SDValue V = foldSelectOfConstantsUsingSra(N, DAG)) + return V; + // The general case for select-of-constants: // vselect Cond, C1, C2 --> xor (and (sext Cond), (C1^C2)), C2 // ...but that only makes sense if a vselect is slower than 2 logic ops, so @@ -8832,13 +8709,12 @@ SDValue DAGCombiner::visitVSELECT(SDNode *N) { isAbs = ISD::isBuildVectorAllZeros(N1.getOperand(0).getNode()); if (isAbs) { - EVT VT = LHS.getValueType(); if (TLI.isOperationLegalOrCustom(ISD::ABS, VT)) return DAG.getNode(ISD::ABS, DL, VT, LHS); - SDValue Shift = DAG.getNode( - ISD::SRA, DL, VT, LHS, - DAG.getConstant(VT.getScalarSizeInBits() - 1, DL, VT)); + SDValue Shift = DAG.getNode(ISD::SRA, DL, VT, LHS, + DAG.getConstant(VT.getScalarSizeInBits() - 1, + DL, getShiftAmountTy(VT))); SDValue Add = DAG.getNode(ISD::ADD, DL, VT, LHS, Shift); AddToWorklist(Shift.getNode()); AddToWorklist(Add.getNode()); @@ -8851,10 +8727,9 @@ SDValue DAGCombiner::visitVSELECT(SDNode *N) { // This is OK if we don't care about what happens if either operand is a // NaN. 
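// [Illustrative aside, not part of the patch] When ISD::ABS is not legal, the
// vselect-based abs pattern above is expanded with an arithmetic shift and an
// add; together with a final xor this is the classic branchless absolute-value
// identity (assuming arithmetic right shift for signed values):
//   abs(x) == (x + (x >> 31)) ^ (x >> 31)   for 32-bit x other than INT_MIN
#include <cassert>
#include <cstdint>
#include <cstdlib>

static int32_t branchlessAbs(int32_t X) {
  int32_t Sign = X >> 31;      // 0 if X >= 0, all-ones if X < 0
  return (X + Sign) ^ Sign;    // negates X exactly when Sign is all-ones
}

int main() {
  for (int32_t X : {-123456, -1, 0, 1, 123456})
    assert(branchlessAbs(X) == std::abs(X));
  return 0;
}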
// - if (N0.hasOneUse() && isLegalToCombineMinNumMaxNum(DAG, N0.getOperand(0), - N0.getOperand(1), TLI)) { - if (SDValue FMinMax = combineMinNumMaxNum( - DL, VT, N0.getOperand(0), N0.getOperand(1), N1, N2, CC, TLI, DAG)) + if (N0.hasOneUse() && isLegalToCombineMinNumMaxNum(DAG, LHS, RHS, TLI)) { + if (SDValue FMinMax = + combineMinNumMaxNum(DL, VT, LHS, RHS, N1, N2, CC, TLI, DAG)) return FMinMax; } @@ -9209,8 +9084,9 @@ SDValue DAGCombiner::CombineExtLoad(SDNode *N) { LoadSDNode *LN0 = cast(N0); if (!ISD::isNON_EXTLoad(LN0) || !ISD::isUNINDEXEDLoad(LN0) || - !N0.hasOneUse() || LN0->isVolatile() || !DstVT.isVector() || - !DstVT.isPow2VectorType() || !TLI.isVectorLoadExtDesirable(SDValue(N, 0))) + !N0.hasOneUse() || !LN0->isSimple() || + !DstVT.isVector() || !DstVT.isPow2VectorType() || + !TLI.isVectorLoadExtDesirable(SDValue(N, 0))) return SDValue(); SmallVector SetCCs; @@ -9411,7 +9287,8 @@ static SDValue tryToFoldExtOfExtload(SelectionDAG &DAG, DAGCombiner &Combiner, LoadSDNode *LN0 = cast(N0); EVT MemVT = LN0->getMemoryVT(); - if ((LegalOperations || LN0->isVolatile() || VT.isVector()) && + if ((LegalOperations || !LN0->isSimple() || + VT.isVector()) && !TLI.isLoadExtLegal(ExtLoadType, VT, MemVT)) return SDValue(); @@ -9436,7 +9313,7 @@ static SDValue tryToFoldExtOfLoad(SelectionDAG &DAG, DAGCombiner &Combiner, if (!ISD::isNON_EXTLoad(N0.getNode()) || !ISD::isUNINDEXEDLoad(N0.getNode()) || ((LegalOperations || VT.isVector() || - cast(N0)->isVolatile()) && + !cast(N0)->isSimple()) && !TLI.isLoadExtLegal(ExtLoadType, VT, N0.getValueType()))) return {}; @@ -9468,6 +9345,35 @@ static SDValue tryToFoldExtOfLoad(SelectionDAG &DAG, DAGCombiner &Combiner, return SDValue(N, 0); // Return N so it doesn't get rechecked! } +static SDValue tryToFoldExtOfMaskedLoad(SelectionDAG &DAG, + const TargetLowering &TLI, EVT VT, + SDNode *N, SDValue N0, + ISD::LoadExtType ExtLoadType, + ISD::NodeType ExtOpc) { + if (!N0.hasOneUse()) + return SDValue(); + + MaskedLoadSDNode *Ld = dyn_cast(N0); + if (!Ld || Ld->getExtensionType() != ISD::NON_EXTLOAD) + return SDValue(); + + if (!TLI.isLoadExtLegal(ExtLoadType, VT, Ld->getValueType(0))) + return SDValue(); + + if (!TLI.isVectorLoadExtDesirable(SDValue(N, 0))) + return SDValue(); + + SDLoc dl(Ld); + SDValue PassThru = DAG.getNode(ExtOpc, dl, VT, Ld->getPassThru()); + SDValue NewLoad = DAG.getMaskedLoad(VT, dl, Ld->getChain(), + Ld->getBasePtr(), Ld->getMask(), + PassThru, Ld->getMemoryVT(), + Ld->getMemOperand(), ExtLoadType, + Ld->isExpandingLoad()); + DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), SDValue(NewLoad.getNode(), 1)); + return NewLoad; +} + static SDValue foldExtendedSignBitTest(SDNode *N, SelectionDAG &DAG, bool LegalOperations) { assert((N->getOpcode() == ISD::SIGN_EXTEND || @@ -9568,6 +9474,11 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) { ISD::SEXTLOAD, ISD::SIGN_EXTEND)) return foldedExt; + if (SDValue foldedExt = + tryToFoldExtOfMaskedLoad(DAG, TLI, VT, N, N0, ISD::SEXTLOAD, + ISD::SIGN_EXTEND)) + return foldedExt; + // fold (sext (load x)) to multiple smaller sextloads. // Only on illegal but splittable vectors. if (SDValue ExtLoad = CombineExtLoad(N)) @@ -9856,6 +9767,11 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) { ISD::ZEXTLOAD, ISD::ZERO_EXTEND)) return foldedExt; + if (SDValue foldedExt = + tryToFoldExtOfMaskedLoad(DAG, TLI, VT, N, N0, ISD::ZEXTLOAD, + ISD::ZERO_EXTEND)) + return foldedExt; + // fold (zext (load x)) to multiple smaller zextloads. // Only on illegal but splittable vectors. 
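// [Illustrative aside, not part of the patch] Why the min/max combine above is
// gated on "don't care about NaNs": the compare+select form and an
// fminnum-style operation disagree when the second operand is a NaN.
// std::fmin follows the IEEE-754 minNum rule (return the non-NaN operand),
// while the select form returns whichever operand the false branch names.
#include <cassert>
#include <cmath>

int main() {
  double A = 1.0, NaN = std::nan("");
  double SelectForm = (A < NaN) ? A : NaN;   // compare is false -> NaN
  double MinNumForm = std::fmin(A, NaN);     // minNum rule      -> 1.0
  assert(std::isnan(SelectForm));
  assert(MinNumForm == 1.0);
  return 0;
}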
if (SDValue ExtLoad = CombineExtLoad(N)) @@ -10340,7 +10256,10 @@ SDValue DAGCombiner::ReduceLoadWidth(SDNode *N) { return SDValue(); LoadSDNode *LN0 = cast(N0); - if (!isLegalNarrowLdSt(LN0, ExtType, ExtVT, ShAmt)) + // Reducing the width of a volatile load is illegal. For atomics, we may be + // able to reduce the width provided we never widen again. (see D66309) + if (!LN0->isSimple() || + !isLegalNarrowLdSt(LN0, ExtType, ExtVT, ShAmt)) return SDValue(); auto AdjustBigEndianShift = [&](unsigned ShAmt) { @@ -10369,11 +10288,11 @@ SDValue DAGCombiner::ReduceLoadWidth(SDNode *N) { SDValue Load; if (ExtType == ISD::NON_EXTLOAD) - Load = DAG.getLoad(VT, SDLoc(N0), LN0->getChain(), NewPtr, + Load = DAG.getLoad(VT, DL, LN0->getChain(), NewPtr, LN0->getPointerInfo().getWithOffset(PtrOff), NewAlign, LN0->getMemOperand()->getFlags(), LN0->getAAInfo()); else - Load = DAG.getExtLoad(ExtType, SDLoc(N0), VT, LN0->getChain(), NewPtr, + Load = DAG.getExtLoad(ExtType, DL, VT, LN0->getChain(), NewPtr, LN0->getPointerInfo().getWithOffset(PtrOff), ExtVT, NewAlign, LN0->getMemOperand()->getFlags(), LN0->getAAInfo()); @@ -10392,7 +10311,6 @@ SDValue DAGCombiner::ReduceLoadWidth(SDNode *N) { // no larger than the source) then the useful bits of the result are // zero; we can't simply return the shortened shift, because the result // of that operation is undefined. - SDLoc DL(N0); if (ShLeftAmt >= VT.getSizeInBits()) Result = DAG.getConstant(0, DL, VT); else @@ -10513,7 +10431,7 @@ SDValue DAGCombiner::visitSIGN_EXTEND_INREG(SDNode *N) { if (ISD::isEXTLoad(N0.getNode()) && ISD::isUNINDEXEDLoad(N0.getNode()) && EVT == cast(N0)->getMemoryVT() && - ((!LegalOperations && !cast(N0)->isVolatile() && + ((!LegalOperations && cast(N0)->isSimple() && N0.hasOneUse()) || TLI.isLoadExtLegal(ISD::SEXTLOAD, VT, EVT))) { LoadSDNode *LN0 = cast(N0); @@ -10530,7 +10448,7 @@ SDValue DAGCombiner::visitSIGN_EXTEND_INREG(SDNode *N) { if (ISD::isZEXTLoad(N0.getNode()) && ISD::isUNINDEXEDLoad(N0.getNode()) && N0.hasOneUse() && EVT == cast(N0)->getMemoryVT() && - ((!LegalOperations && !cast(N0)->isVolatile()) || + ((!LegalOperations && cast(N0)->isSimple()) && TLI.isLoadExtLegal(ISD::SEXTLOAD, VT, EVT))) { LoadSDNode *LN0 = cast(N0); SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, SDLoc(N), VT, @@ -10757,7 +10675,7 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) { // after truncation. if (N0.hasOneUse() && ISD::isUNINDEXEDLoad(N0.getNode())) { LoadSDNode *LN0 = cast(N0); - if (!LN0->isVolatile() && + if (LN0->isSimple() && LN0->getMemoryVT().getStoreSizeInBits() < VT.getSizeInBits()) { SDValue NewLoad = DAG.getExtLoad(LN0->getExtensionType(), SDLoc(LN0), VT, LN0->getChain(), LN0->getBasePtr(), @@ -11051,7 +10969,7 @@ SDValue DAGCombiner::visitBITCAST(SDNode *N) { // memory accesses. We don't care if the original type was legal or not // as we assume software couldn't rely on the number of accesses of an // illegal type. - ((!LegalOperations && !cast(N0)->isVolatile()) || + ((!LegalOperations && cast(N0)->isSimple()) || TLI.isOperationLegal(ISD::LOAD, VT))) { LoadSDNode *LN0 = cast(N0); @@ -11237,15 +11155,10 @@ SDValue DAGCombiner::visitBITCAST(SDNode *N) { for (int i = 0; i != MaskScale; ++i) NewMask.push_back(M < 0 ? 
-1 : M * MaskScale + i); - bool LegalMask = TLI.isShuffleMaskLegal(NewMask, VT); - if (!LegalMask) { - std::swap(SV0, SV1); - ShuffleVectorSDNode::commuteMask(NewMask); - LegalMask = TLI.isShuffleMaskLegal(NewMask, VT); - } - - if (LegalMask) - return DAG.getVectorShuffle(VT, SDLoc(N), SV0, SV1, NewMask); + SDValue LegalShuffle = + TLI.buildLegalVectorShuffle(VT, SDLoc(N), SV0, SV1, NewMask, DAG); + if (LegalShuffle) + return LegalShuffle; } return SDValue(); @@ -11998,7 +11911,7 @@ SDValue DAGCombiner::visitFADD(SDNode *N) { // N0 + -0.0 --> N0 (also allowed with +0.0 and fast-math) ConstantFPSDNode *N1C = isConstOrConstSplatFP(N1, true); if (N1C && N1C->isZero()) - if (N1C->isNegative() || Options.UnsafeFPMath || Flags.hasNoSignedZeros()) + if (N1C->isNegative() || Options.NoSignedZerosFPMath || Flags.hasNoSignedZeros()) return N0; if (SDValue NewSel = foldBinOpIntoSelect(N)) @@ -12006,17 +11919,17 @@ SDValue DAGCombiner::visitFADD(SDNode *N) { // fold (fadd A, (fneg B)) -> (fsub A, B) if ((!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FSUB, VT)) && - isNegatibleForFree(N1, LegalOperations, TLI, &Options, ForCodeSize) == 2) - return DAG.getNode(ISD::FSUB, DL, VT, N0, - GetNegatedExpression(N1, DAG, LegalOperations, - ForCodeSize), Flags); + TLI.isNegatibleForFree(N1, DAG, LegalOperations, ForCodeSize) == 2) + return DAG.getNode( + ISD::FSUB, DL, VT, N0, + TLI.getNegatedExpression(N1, DAG, LegalOperations, ForCodeSize), Flags); // fold (fadd (fneg A), B) -> (fsub B, A) if ((!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FSUB, VT)) && - isNegatibleForFree(N0, LegalOperations, TLI, &Options, ForCodeSize) == 2) - return DAG.getNode(ISD::FSUB, DL, VT, N1, - GetNegatedExpression(N0, DAG, LegalOperations, - ForCodeSize), Flags); + TLI.isNegatibleForFree(N0, DAG, LegalOperations, ForCodeSize) == 2) + return DAG.getNode( + ISD::FSUB, DL, VT, N1, + TLI.getNegatedExpression(N0, DAG, LegalOperations, ForCodeSize), Flags); auto isFMulNegTwo = [](SDValue FMul) { if (!FMul.hasOneUse() || FMul.getOpcode() != ISD::FMUL) @@ -12056,7 +11969,7 @@ SDValue DAGCombiner::visitFADD(SDNode *N) { // If 'unsafe math' or reassoc and nsz, fold lots of things. 
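// [Illustrative aside, not part of the patch] Two facts the fadd combines
// above rely on, checked on ordinary doubles. fneg is an exact sign-bit flip
// and IEEE-754 defines a - b as a + (-b), so fadd(A, fneg(B)) and fsub(A, B)
// always agree. Adding -0.0 is always an identity, whereas adding +0.0 is not
// (it loses the sign of -0.0), which is why that direction needs nsz.
#include <cassert>
#include <cmath>

int main() {
  for (double A : {-2.5, 0.0, 3.25})
    for (double B : {-1.5, 0.0, 7.0})
      assert(A + (-B) == A - B);           // fadd (fneg B) == fsub B

  assert(std::signbit(-0.0 + (-0.0)));     // -0.0 + -0.0 keeps the sign
  assert(!std::signbit(-0.0 + 0.0));       // ...but -0.0 + +0.0 is +0.0
  return 0;
}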
// TODO: break out portions of the transformations below for which Unsafe is // considered and which do not require both nsz and reassoc - if ((Options.UnsafeFPMath || + if (((Options.UnsafeFPMath && Options.NoSignedZerosFPMath) || (Flags.hasAllowReassociation() && Flags.hasNoSignedZeros())) && AllowNewConst) { // fadd (fadd x, c1), c2 -> fadd x, c1 + c2 @@ -12175,7 +12088,7 @@ SDValue DAGCombiner::visitFSUB(SDNode *N) { // (fsub A, 0) -> A if (N1CFP && N1CFP->isZero()) { - if (!N1CFP->isNegative() || Options.UnsafeFPMath || + if (!N1CFP->isNegative() || Options.NoSignedZerosFPMath || Flags.hasNoSignedZeros()) { return N0; } @@ -12195,16 +12108,16 @@ SDValue DAGCombiner::visitFSUB(SDNode *N) { if (N0CFP && N0CFP->isZero()) { if (N0CFP->isNegative() || (Options.NoSignedZerosFPMath || Flags.hasNoSignedZeros())) { - if (isNegatibleForFree(N1, LegalOperations, TLI, &Options, ForCodeSize)) - return GetNegatedExpression(N1, DAG, LegalOperations, ForCodeSize); + if (TLI.isNegatibleForFree(N1, DAG, LegalOperations, ForCodeSize)) + return TLI.getNegatedExpression(N1, DAG, LegalOperations, ForCodeSize); if (!LegalOperations || TLI.isOperationLegal(ISD::FNEG, VT)) return DAG.getNode(ISD::FNEG, DL, VT, N1, Flags); } } - if ((Options.UnsafeFPMath || - (Flags.hasAllowReassociation() && Flags.hasNoSignedZeros())) - && N1.getOpcode() == ISD::FADD) { + if (((Options.UnsafeFPMath && Options.NoSignedZerosFPMath) || + (Flags.hasAllowReassociation() && Flags.hasNoSignedZeros())) && + N1.getOpcode() == ISD::FADD) { // X - (X + Y) -> -Y if (N0 == N1->getOperand(0)) return DAG.getNode(ISD::FNEG, DL, VT, N1->getOperand(1), Flags); @@ -12214,10 +12127,10 @@ SDValue DAGCombiner::visitFSUB(SDNode *N) { } // fold (fsub A, (fneg B)) -> (fadd A, B) - if (isNegatibleForFree(N1, LegalOperations, TLI, &Options, ForCodeSize)) - return DAG.getNode(ISD::FADD, DL, VT, N0, - GetNegatedExpression(N1, DAG, LegalOperations, - ForCodeSize), Flags); + if (TLI.isNegatibleForFree(N1, DAG, LegalOperations, ForCodeSize)) + return DAG.getNode( + ISD::FADD, DL, VT, N0, + TLI.getNegatedExpression(N1, DAG, LegalOperations, ForCodeSize), Flags); // FSUB -> FMA combines: if (SDValue Fused = visitFSUBForFMACombine(N)) { @@ -12228,6 +12141,21 @@ SDValue DAGCombiner::visitFSUB(SDNode *N) { return SDValue(); } +/// Return true if both inputs are at least as cheap in negated form and at +/// least one input is strictly cheaper in negated form. +bool DAGCombiner::isCheaperToUseNegatedFPOps(SDValue X, SDValue Y) { + if (char LHSNeg = + TLI.isNegatibleForFree(X, DAG, LegalOperations, ForCodeSize)) + if (char RHSNeg = + TLI.isNegatibleForFree(Y, DAG, LegalOperations, ForCodeSize)) + // Both negated operands are at least as cheap as their counterparts. + // Check to see if at least one is cheaper negated. 
+ if (LHSNeg == 2 || RHSNeg == 2) + return true; + + return false; +} + SDValue DAGCombiner::visitFMUL(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); @@ -12254,10 +12182,6 @@ SDValue DAGCombiner::visitFMUL(SDNode *N) { !isConstantFPBuildVectorOrConstantFP(N1)) return DAG.getNode(ISD::FMUL, DL, VT, N1, N0, Flags); - // fold (fmul A, 1.0) -> A - if (N1CFP && N1CFP->isExactlyValue(1.0)) - return N0; - if (SDValue NewSel = foldBinOpIntoSelect(N)) return NewSel; @@ -12302,21 +12226,13 @@ SDValue DAGCombiner::visitFMUL(SDNode *N) { if (!LegalOperations || TLI.isOperationLegal(ISD::FNEG, VT)) return DAG.getNode(ISD::FNEG, DL, VT, N0); - // fold (fmul (fneg X), (fneg Y)) -> (fmul X, Y) - if (char LHSNeg = isNegatibleForFree(N0, LegalOperations, TLI, &Options, - ForCodeSize)) { - if (char RHSNeg = isNegatibleForFree(N1, LegalOperations, TLI, &Options, - ForCodeSize)) { - // Both can be negated for free, check to see if at least one is cheaper - // negated. - if (LHSNeg == 2 || RHSNeg == 2) - return DAG.getNode(ISD::FMUL, DL, VT, - GetNegatedExpression(N0, DAG, LegalOperations, - ForCodeSize), - GetNegatedExpression(N1, DAG, LegalOperations, - ForCodeSize), - Flags); - } + // -N0 * -N1 --> N0 * N1 + if (isCheaperToUseNegatedFPOps(N0, N1)) { + SDValue NegN0 = + TLI.getNegatedExpression(N0, DAG, LegalOperations, ForCodeSize); + SDValue NegN1 = + TLI.getNegatedExpression(N1, DAG, LegalOperations, ForCodeSize); + return DAG.getNode(ISD::FMUL, DL, VT, NegN0, NegN1, Flags); } // fold (fmul X, (select (fcmp X > 0.0), -1.0, 1.0)) -> (fneg (fabs X)) @@ -12395,6 +12311,15 @@ SDValue DAGCombiner::visitFMA(SDNode *N) { return DAG.getNode(ISD::FMA, DL, VT, N0, N1, N2); } + // (-N0 * -N1) + N2 --> (N0 * N1) + N2 + if (isCheaperToUseNegatedFPOps(N0, N1)) { + SDValue NegN0 = + TLI.getNegatedExpression(N0, DAG, LegalOperations, ForCodeSize); + SDValue NegN1 = + TLI.getNegatedExpression(N1, DAG, LegalOperations, ForCodeSize); + return DAG.getNode(ISD::FMA, DL, VT, NegN0, NegN1, N2, Flags); + } + if (UnsafeFPMath) { if (N0CFP && N0CFP->isZero()) return N2; @@ -12602,9 +12527,8 @@ SDValue DAGCombiner::visitFDIV(SDNode *N) { // If this FDIV is part of a reciprocal square root, it may be folded // into a target-specific square root estimate instruction. if (N1.getOpcode() == ISD::FSQRT) { - if (SDValue RV = buildRsqrtEstimate(N1.getOperand(0), Flags)) { + if (SDValue RV = buildRsqrtEstimate(N1.getOperand(0), Flags)) return DAG.getNode(ISD::FMUL, DL, VT, N0, RV, Flags); - } } else if (N1.getOpcode() == ISD::FP_EXTEND && N1.getOperand(0).getOpcode() == ISD::FSQRT) { if (SDValue RV = buildRsqrtEstimate(N1.getOperand(0).getOperand(0), @@ -12645,28 +12569,16 @@ SDValue DAGCombiner::visitFDIV(SDNode *N) { } // Fold into a reciprocal estimate and multiply instead of a real divide. - if (SDValue RV = BuildReciprocalEstimate(N1, Flags)) { - AddToWorklist(RV.getNode()); - return DAG.getNode(ISD::FMUL, DL, VT, N0, RV, Flags); - } + if (SDValue RV = BuildDivEstimate(N0, N1, Flags)) + return RV; } // (fdiv (fneg X), (fneg Y)) -> (fdiv X, Y) - if (char LHSNeg = isNegatibleForFree(N0, LegalOperations, TLI, &Options, - ForCodeSize)) { - if (char RHSNeg = isNegatibleForFree(N1, LegalOperations, TLI, &Options, - ForCodeSize)) { - // Both can be negated for free, check to see if at least one is cheaper - // negated. 
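// [Illustrative aside, not part of the patch] FP negation only flips the sign
// bit, so (-X) * (-Y) == X * Y holds exactly; the combine above simply drops
// both negations when each is free. The fdiv path instead multiplies by a
// refined reciprocal estimate; the generic Newton-Raphson refinement step
// below is one possible scheme, shown for intuition only, not the exact
// sequence the target hooks emit.
#include <cassert>
#include <cmath>

static double refineRecip(double D, double R0) {
  // One Newton-Raphson step for f(r) = 1/r - D:  r1 = r0 * (2 - D * r0).
  return R0 * (2.0 - D * R0);
}

int main() {
  for (double X : {-3.5, 0.25, 7.0})
    for (double Y : {-2.0, 1.5})
      assert((-X) * (-Y) == X * Y);

  double D = 3.0, R = 0.3;                 // rough estimate of 1/3
  for (int i = 0; i < 4; ++i)
    R = refineRecip(D, R);                 // error roughly squares each step
  assert(std::fabs(R - 1.0 / 3.0) < 1e-12);
  return 0;
}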
- if (LHSNeg == 2 || RHSNeg == 2) - return DAG.getNode(ISD::FDIV, SDLoc(N), VT, - GetNegatedExpression(N0, DAG, LegalOperations, - ForCodeSize), - GetNegatedExpression(N1, DAG, LegalOperations, - ForCodeSize), - Flags); - } - } + if (isCheaperToUseNegatedFPOps(N0, N1)) + return DAG.getNode( + ISD::FDIV, SDLoc(N), VT, + TLI.getNegatedExpression(N0, DAG, LegalOperations, ForCodeSize), + TLI.getNegatedExpression(N1, DAG, LegalOperations, ForCodeSize), Flags); return SDValue(); } @@ -13112,22 +13024,6 @@ SDValue DAGCombiner::visitFP_ROUND(SDNode *N) { return SDValue(); } -SDValue DAGCombiner::visitFP_ROUND_INREG(SDNode *N) { - SDValue N0 = N->getOperand(0); - EVT VT = N->getValueType(0); - EVT EVT = cast(N->getOperand(1))->getVT(); - ConstantFPSDNode *N0CFP = dyn_cast(N0); - - // fold (fp_round_inreg c1fp) -> c1fp - if (N0CFP && isTypeLegal(EVT)) { - SDLoc DL(N); - SDValue Round = DAG.getConstantFP(*N0CFP->getConstantFPValue(), DL, EVT); - return DAG.getNode(ISD::FP_EXTEND, DL, VT, Round); - } - - return SDValue(); -} - SDValue DAGCombiner::visitFP_EXTEND(SDNode *N) { SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); @@ -13236,9 +13132,8 @@ SDValue DAGCombiner::visitFNEG(SDNode *N) { if (isConstantFPBuildVectorOrConstantFP(N0)) return DAG.getNode(ISD::FNEG, SDLoc(N), VT, N0); - if (isNegatibleForFree(N0, LegalOperations, DAG.getTargetLoweringInfo(), - &DAG.getTarget().Options, ForCodeSize)) - return GetNegatedExpression(N0, DAG, LegalOperations, ForCodeSize); + if (TLI.isNegatibleForFree(N0, DAG, LegalOperations, ForCodeSize)) + return TLI.getNegatedExpression(N0, DAG, LegalOperations, ForCodeSize); // Transform fneg(bitconvert(x)) -> bitconvert(x ^ sign) to avoid loading // constant pool values. @@ -14004,11 +13899,12 @@ bool DAGCombiner::extendLoadedValueToExtension(LoadSDNode *LD, SDValue &Val) { } SDValue DAGCombiner::ForwardStoreValueToDirectLoad(LoadSDNode *LD) { - if (OptLevel == CodeGenOpt::None || LD->isVolatile()) + if (OptLevel == CodeGenOpt::None || !LD->isSimple()) return SDValue(); SDValue Chain = LD->getOperand(0); StoreSDNode *ST = dyn_cast(Chain.getNode()); - if (!ST || ST->isVolatile()) + // TODO: Relax this restriction for unordered atomics (see D66309) + if (!ST || !ST->isSimple()) return SDValue(); EVT LDType = LD->getValueType(0); @@ -14107,7 +14003,8 @@ SDValue DAGCombiner::visitLOAD(SDNode *N) { // If load is not volatile and there are no uses of the loaded value (and // the updated indexed value in case of indexed loads), change uses of the // chain value into uses of the chain input (i.e. delete the dead load). - if (!LD->isVolatile()) { + // TODO: Allow this for unordered atomics (see D66309) + if (LD->isSimple()) { if (N->getValueType(1) == MVT::Other) { // Unindexed loads. if (!N->hasAnyUseOfValue(0)) { @@ -14241,7 +14138,7 @@ struct LoadedSlice { /// Helper structure used to compute the cost of a slice. struct Cost { /// Are we optimizing for code size. - bool ForCodeSize; + bool ForCodeSize = false; /// Various cost. unsigned Loads = 0; @@ -14250,10 +14147,10 @@ struct LoadedSlice { unsigned ZExts = 0; unsigned Shift = 0; - Cost(bool ForCodeSize = false) : ForCodeSize(ForCodeSize) {} + explicit Cost(bool ForCodeSize) : ForCodeSize(ForCodeSize) {} /// Get the cost of one isolated slice. 
- Cost(const LoadedSlice &LS, bool ForCodeSize = false) + Cost(const LoadedSlice &LS, bool ForCodeSize) : ForCodeSize(ForCodeSize), Loads(1) { EVT TruncType = LS.Inst->getValueType(0); EVT LoadedType = LS.getLoadedType(); @@ -14678,7 +14575,7 @@ bool DAGCombiner::SliceUpLoad(SDNode *N) { return false; LoadSDNode *LD = cast(N); - if (LD->isVolatile() || !ISD::isNormalLoad(LD) || + if (!LD->isSimple() || !ISD::isNormalLoad(LD) || !LD->getValueType(0).isInteger()) return false; @@ -14829,13 +14726,7 @@ CheckForMaskedLoad(SDValue V, SDValue Ptr, SDValue Chain) { else if (Chain->getOpcode() == ISD::TokenFactor && SDValue(LD, 1).hasOneUse()) { // LD has only 1 chain use so they are no indirect dependencies. - bool isOk = false; - for (const SDValue &ChainOp : Chain->op_values()) - if (ChainOp.getNode() == LD) { - isOk = true; - break; - } - if (!isOk) + if (!LD->isOperandOf(Chain.getNode())) return Result; } else return Result; // Fail. @@ -14848,7 +14739,7 @@ CheckForMaskedLoad(SDValue V, SDValue Ptr, SDValue Chain) { /// Check to see if IVal is something that provides a value as specified by /// MaskInfo. If so, replace the specified store with a narrower store of /// truncated IVal. -static SDNode * +static SDValue ShrinkLoadReplaceStoreWithStore(const std::pair &MaskInfo, SDValue IVal, StoreSDNode *St, DAGCombiner *DC) { @@ -14860,14 +14751,19 @@ ShrinkLoadReplaceStoreWithStore(const std::pair &MaskInfo, // that uses this. If not, this is not a replacement. APInt Mask = ~APInt::getBitsSet(IVal.getValueSizeInBits(), ByteShift*8, (ByteShift+NumBytes)*8); - if (!DAG.MaskedValueIsZero(IVal, Mask)) return nullptr; + if (!DAG.MaskedValueIsZero(IVal, Mask)) return SDValue(); // Check that it is legal on the target to do this. It is legal if the new // VT we're shrinking to (i8/i16/i32) is legal or we're still before type - // legalization. - MVT VT = MVT::getIntegerVT(NumBytes*8); + // legalization (and the target doesn't explicitly think this is a bad idea). + MVT VT = MVT::getIntegerVT(NumBytes * 8); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (!DC->isTypeLegal(VT)) - return nullptr; + return SDValue(); + if (St->getMemOperand() && + !TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT, + *St->getMemOperand())) + return SDValue(); // Okay, we can do this! Replace the 'St' store with a store of IVal that is // shifted by ByteShift and truncated down to NumBytes. @@ -14901,8 +14797,7 @@ ShrinkLoadReplaceStoreWithStore(const std::pair &MaskInfo, ++OpsNarrowed; return DAG .getStore(St->getChain(), SDLoc(St), IVal, Ptr, - St->getPointerInfo().getWithOffset(StOffset), NewAlign) - .getNode(); + St->getPointerInfo().getWithOffset(StOffset), NewAlign); } /// Look for sequence of load / op / store where op is one of 'or', 'xor', and @@ -14911,7 +14806,7 @@ ShrinkLoadReplaceStoreWithStore(const std::pair &MaskInfo, /// or code size. SDValue DAGCombiner::ReduceLoadOpStoreWidth(SDNode *N) { StoreSDNode *ST = cast(N); - if (ST->isVolatile()) + if (!ST->isSimple()) return SDValue(); SDValue Chain = ST->getChain(); @@ -14933,16 +14828,16 @@ SDValue DAGCombiner::ReduceLoadOpStoreWidth(SDNode *N) { std::pair MaskedLoad; MaskedLoad = CheckForMaskedLoad(Value.getOperand(0), Ptr, Chain); if (MaskedLoad.first) - if (SDNode *NewST = ShrinkLoadReplaceStoreWithStore(MaskedLoad, + if (SDValue NewST = ShrinkLoadReplaceStoreWithStore(MaskedLoad, Value.getOperand(1), ST,this)) - return SDValue(NewST, 0); + return NewST; // Or is commutative, so try swapping X and Y. 
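// [Illustrative aside, not part of the patch] The narrowing done by
// ShrinkLoadReplaceStoreWithStore() above: when a wide store writes back
// "(load p & mask) | val" and every set bit of val lies inside the bytes
// cleared by the mask (the MaskedValueIsZero check), only those bytes can
// change, so a single narrow store of val suffices. A sketch for the low-byte
// case over an explicitly little-endian byte array (helper names hypothetical).
#include <cassert>
#include <cstdint>

static uint32_t load32le(const unsigned char *P) {
  return P[0] | (uint32_t)P[1] << 8 | (uint32_t)P[2] << 16 | (uint32_t)P[3] << 24;
}
static void store32le(unsigned char *P, uint32_t V) {
  P[0] = (unsigned char)V;         P[1] = (unsigned char)(V >> 8);
  P[2] = (unsigned char)(V >> 16); P[3] = (unsigned char)(V >> 24);
}

int main() {
  unsigned char MemWide[4]   = {0xDD, 0xCC, 0xBB, 0xAA};
  unsigned char MemNarrow[4] = {0xDD, 0xCC, 0xBB, 0xAA};
  uint32_t Val = 0x42;
  assert((Val & ~0xFFu) == 0);     // val only touches the masked-off low byte

  // Wide form: load, clear the low byte, OR in Val, store all 4 bytes back.
  store32le(MemWide, (load32le(MemWide) & ~0xFFu) | Val);

  // Narrow form: store only the one byte that can change (byte 0 for this
  // little-endian layout; the DAG code flips the offset for big-endian).
  MemNarrow[0] = (unsigned char)Val;

  for (int i = 0; i < 4; ++i)
    assert(MemWide[i] == MemNarrow[i]);
  return 0;
}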
MaskedLoad = CheckForMaskedLoad(Value.getOperand(1), Ptr, Chain); if (MaskedLoad.first) - if (SDNode *NewST = ShrinkLoadReplaceStoreWithStore(MaskedLoad, + if (SDValue NewST = ShrinkLoadReplaceStoreWithStore(MaskedLoad, Value.getOperand(0), ST,this)) - return SDValue(NewST, 0); + return NewST; } if ((Opc != ISD::OR && Opc != ISD::XOR && Opc != ISD::AND) || @@ -15367,14 +15262,16 @@ void DAGCombiner::getStoreMergeCandidates( // Loads must only have one use. if (!Ld->hasNUsesOfValue(1, 0)) return; - // The memory operands must not be volatile/indexed. - if (Ld->isVolatile() || Ld->isIndexed()) + // The memory operands must not be volatile/indexed/atomic. + // TODO: May be able to relax for unordered atomics (see D66309) + if (!Ld->isSimple() || Ld->isIndexed()) return; } auto CandidateMatch = [&](StoreSDNode *Other, BaseIndexOffset &Ptr, int64_t &Offset) -> bool { - // The memory operands must not be volatile/indexed. - if (Other->isVolatile() || Other->isIndexed()) + // The memory operands must not be volatile/indexed/atomic. + // TODO: May be able to relax for unordered atomics (see D66309) + if (!Other->isSimple() || Other->isIndexed()) return false; // Don't mix temporal stores with non-temporal stores. if (St->isNonTemporal() != Other->isNonTemporal()) @@ -15394,8 +15291,10 @@ void DAGCombiner::getStoreMergeCandidates( // Loads must only have one use. if (!OtherLd->hasNUsesOfValue(1, 0)) return false; - // The memory operands must not be volatile/indexed. - if (OtherLd->isVolatile() || OtherLd->isIndexed()) + // The memory operands must not be volatile/indexed/atomic. + // TODO: May be able to relax for unordered atomics (see D66309) + if (!OtherLd->isSimple() || + OtherLd->isIndexed()) return false; // Don't mix temporal loads with non-temporal loads. if (cast(Val)->isNonTemporal() != OtherLd->isNonTemporal()) @@ -15425,6 +15324,18 @@ void DAGCombiner::getStoreMergeCandidates( return (BasePtr.equalBaseIndex(Ptr, DAG, Offset)); }; + // Check if the pair of StoreNode and the RootNode already bail out many + // times which is over the limit in dependence check. + auto OverLimitInDependenceCheck = [&](SDNode *StoreNode, + SDNode *RootNode) -> bool { + auto RootCount = StoreRootCountMap.find(StoreNode); + if (RootCount != StoreRootCountMap.end() && + RootCount->second.first == RootNode && + RootCount->second.second > StoreMergeDependenceLimit) + return true; + return false; + }; + // We looking for a root node which is an ancestor to all mergable // stores. We search up through a load, to our root and then down // through all children. For instance we will find Store{1,2,3} if @@ -15454,7 +15365,8 @@ void DAGCombiner::getStoreMergeCandidates( if (StoreSDNode *OtherST = dyn_cast(*I2)) { BaseIndexOffset Ptr; int64_t PtrDiff; - if (CandidateMatch(OtherST, Ptr, PtrDiff)) + if (CandidateMatch(OtherST, Ptr, PtrDiff) && + !OverLimitInDependenceCheck(OtherST, RootNode)) StoreNodes.push_back(MemOpLink(OtherST, PtrDiff)); } } else @@ -15464,7 +15376,8 @@ void DAGCombiner::getStoreMergeCandidates( if (StoreSDNode *OtherST = dyn_cast(*I)) { BaseIndexOffset Ptr; int64_t PtrDiff; - if (CandidateMatch(OtherST, Ptr, PtrDiff)) + if (CandidateMatch(OtherST, Ptr, PtrDiff) && + !OverLimitInDependenceCheck(OtherST, RootNode)) StoreNodes.push_back(MemOpLink(OtherST, PtrDiff)); } } @@ -15522,13 +15435,24 @@ bool DAGCombiner::checkMergeStoreCandidatesForDependencies( // Search through DAG. We can stop early if we find a store node. 
for (unsigned i = 0; i < NumStores; ++i) if (SDNode::hasPredecessorHelper(StoreNodes[i].MemNode, Visited, Worklist, - Max)) + Max)) { + // If the searching bail out, record the StoreNode and RootNode in the + // StoreRootCountMap. If we have seen the pair many times over a limit, + // we won't add the StoreNode into StoreNodes set again. + if (Visited.size() >= Max) { + auto &RootCount = StoreRootCountMap[StoreNodes[i].MemNode]; + if (RootCount.first == RootNode) + RootCount.second++; + else + RootCount = {RootNode, 1}; + } return false; + } return true; } bool DAGCombiner::MergeConsecutiveStores(StoreSDNode *St) { - if (OptLevel == CodeGenOpt::None) + if (OptLevel == CodeGenOpt::None || !EnableStoreMerging) return false; EVT MemVT = St->getMemoryVT(); @@ -15588,7 +15512,7 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode *St) { bool RV = false; while (StoreNodes.size() > 1) { - unsigned StartIdx = 0; + size_t StartIdx = 0; while ((StartIdx + 1 < StoreNodes.size()) && StoreNodes[StartIdx].OffsetFromBase + ElementSizeBytes != StoreNodes[StartIdx + 1].OffsetFromBase) @@ -16113,7 +16037,7 @@ SDValue DAGCombiner::replaceStoreOfFPConstant(StoreSDNode *ST) { case MVT::ppcf128: return SDValue(); case MVT::f32: - if ((isTypeLegal(MVT::i32) && !LegalOperations && !ST->isVolatile()) || + if ((isTypeLegal(MVT::i32) && !LegalOperations && ST->isSimple()) || TLI.isOperationLegalOrCustom(ISD::STORE, MVT::i32)) { ; Tmp = DAG.getConstant((uint32_t)CFP->getValueAPF(). @@ -16125,7 +16049,7 @@ SDValue DAGCombiner::replaceStoreOfFPConstant(StoreSDNode *ST) { return SDValue(); case MVT::f64: if ((TLI.isTypeLegal(MVT::i64) && !LegalOperations && - !ST->isVolatile()) || + ST->isSimple()) || TLI.isOperationLegalOrCustom(ISD::STORE, MVT::i64)) { ; Tmp = DAG.getConstant(CFP->getValueAPF().bitcastToAPInt(). @@ -16134,7 +16058,7 @@ SDValue DAGCombiner::replaceStoreOfFPConstant(StoreSDNode *ST) { Ptr, ST->getMemOperand()); } - if (!ST->isVolatile() && + if (ST->isSimple() && TLI.isOperationLegalOrCustom(ISD::STORE, MVT::i32)) { // Many FP stores are not made apparent until after legalize, e.g. for // argument passing. Since this is so common, custom legalize the @@ -16181,7 +16105,8 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) { // memory accesses. We don't care if the original type was legal or not // as we assume software couldn't rely on the number of accesses of an // illegal type. - if (((!LegalOperations && !ST->isVolatile()) || + // TODO: May be able to relax for unordered atomics (see D66309) + if (((!LegalOperations && ST->isSimple()) || TLI.isOperationLegal(ISD::STORE, SVT)) && TLI.isStoreBitCastBeneficial(Value.getValueType(), SVT, DAG, *ST->getMemOperand())) { @@ -16242,9 +16167,8 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) { // See if we can simplify the input to this truncstore with knowledge that // only the low bits are being used. For example: // "truncstore (or (shl x, 8), y), i8" -> "truncstore y, i8" - SDValue Shorter = DAG.GetDemandedBits(Value, TruncDemandedBits); AddToWorklist(Value.getNode()); - if (Shorter) + if (SDValue Shorter = DAG.GetDemandedBits(Value, TruncDemandedBits)) return DAG.getTruncStore(Chain, SDLoc(N), Shorter, Ptr, ST->getMemoryVT(), ST->getMemOperand()); @@ -16263,9 +16187,10 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) { // If this is a load followed by a store to the same location, then the store // is dead/noop. 
+ // TODO: Can relax for unordered atomics (see D66309) if (LoadSDNode *Ld = dyn_cast(Value)) { if (Ld->getBasePtr() == Ptr && ST->getMemoryVT() == Ld->getMemoryVT() && - ST->isUnindexed() && !ST->isVolatile() && + ST->isUnindexed() && ST->isSimple() && // There can't be any side effects between the load and store, such as // a call or store. Chain.reachesChainWithoutSideEffects(SDValue(Ld, 1))) { @@ -16274,9 +16199,10 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) { } } + // TODO: Can relax for unordered atomics (see D66309) if (StoreSDNode *ST1 = dyn_cast(Chain)) { - if (ST->isUnindexed() && !ST->isVolatile() && ST1->isUnindexed() && - !ST1->isVolatile()) { + if (ST->isUnindexed() && ST->isSimple() && + ST1->isUnindexed() && ST1->isSimple()) { if (ST1->getBasePtr() == Ptr && ST1->getValue() == Value && ST->getMemoryVT() == ST1->getMemoryVT()) { // If this is a store followed by a store with the same value to the @@ -16405,7 +16331,8 @@ SDValue DAGCombiner::visitLIFETIME_END(SDNode *N) { break; case ISD::STORE: { StoreSDNode *ST = dyn_cast(Chain); - if (ST->isVolatile() || ST->isIndexed()) + // TODO: Can relax for unordered atomics (see D66309) + if (!ST->isSimple() || ST->isIndexed()) continue; const BaseIndexOffset StoreBase = BaseIndexOffset::match(ST, DAG); // If we store purely within object bounds just before its lifetime ends, @@ -16456,6 +16383,11 @@ SDValue DAGCombiner::splitMergedValStore(StoreSDNode *ST) { if (OptLevel == CodeGenOpt::None) return SDValue(); + // Can't change the number of memory accesses for a volatile store or break + // atomicity for an atomic one. + if (!ST->isSimple()) + return SDValue(); + SDValue Val = ST->getValue(); SDLoc DL(ST); @@ -16531,12 +16463,52 @@ SDValue DAGCombiner::splitMergedValStore(StoreSDNode *ST) { } /// Convert a disguised subvector insertion into a shuffle: -/// insert_vector_elt V, (bitcast X from vector type), IdxC --> -/// bitcast(shuffle (bitcast V), (extended X), Mask) -/// Note: We do not use an insert_subvector node because that requires a legal -/// subvector type. SDValue DAGCombiner::combineInsertEltToShuffle(SDNode *N, unsigned InsIndex) { SDValue InsertVal = N->getOperand(1); + SDValue Vec = N->getOperand(0); + + // (insert_vector_elt (vector_shuffle X, Y), (extract_vector_elt X, N), InsIndex) + // --> (vector_shuffle X, Y) + if (Vec.getOpcode() == ISD::VECTOR_SHUFFLE && Vec.hasOneUse() && + InsertVal.getOpcode() == ISD::EXTRACT_VECTOR_ELT && + isa(InsertVal.getOperand(1))) { + ShuffleVectorSDNode *SVN = cast(Vec.getNode()); + ArrayRef Mask = SVN->getMask(); + + SDValue X = Vec.getOperand(0); + SDValue Y = Vec.getOperand(1); + + // Vec's operand 0 is using indices from 0 to N-1 and + // operand 1 from N to 2N - 1, where N is the number of + // elements in the vectors. 
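// [Illustrative aside, not part of the patch] What the new
// combineInsertEltToShuffle() case above computes: inserting an element that
// was extracted from one of the shuffle's own sources is just a one-entry
// mask update (indices 0..N-1 select from X, N..2N-1 from Y). A 4-wide scalar
// model with hypothetical helpers:
#include <array>
#include <cassert>

using Vec4 = std::array<int, 4>;

static Vec4 shuffle(const Vec4 &X, const Vec4 &Y, const std::array<int, 4> &M) {
  Vec4 R{};
  for (int i = 0; i < 4; ++i)
    R[i] = M[i] < 4 ? X[M[i]] : Y[M[i] - 4];
  return R;
}

int main() {
  Vec4 X = {10, 11, 12, 13}, Y = {20, 21, 22, 23};
  std::array<int, 4> Mask = {0, 5, 2, 7};

  // insert_vector_elt (shuffle X, Y, Mask), (extract_vector_elt X, 3), 1 ...
  Vec4 InsertForm = shuffle(X, Y, Mask);
  InsertForm[1] = X[3];

  // ...equals shuffling with Mask[1] rewritten to point at X's element 3.
  std::array<int, 4> NewMask = Mask;
  NewMask[1] = 0 /*XOffset for X*/ + 3 /*extract index*/;
  assert(shuffle(X, Y, NewMask) == InsertForm);
  return 0;
}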
+ int XOffset = -1; + if (InsertVal.getOperand(0) == X) { + XOffset = 0; + } else if (InsertVal.getOperand(0) == Y) { + XOffset = X.getValueType().getVectorNumElements(); + } + + if (XOffset != -1) { + SmallVector NewMask(Mask.begin(), Mask.end()); + + auto *ExtrIndex = cast(InsertVal.getOperand(1)); + NewMask[InsIndex] = XOffset + ExtrIndex->getZExtValue(); + assert(NewMask[InsIndex] < + (int)(2 * Vec.getValueType().getVectorNumElements()) && + NewMask[InsIndex] >= 0 && "NewMask[InsIndex] is out of bound"); + + SDValue LegalShuffle = + TLI.buildLegalVectorShuffle(Vec.getValueType(), SDLoc(N), X, + Y, NewMask, DAG); + if (LegalShuffle) + return LegalShuffle; + } + } + + // insert_vector_elt V, (bitcast X from vector type), IdxC --> + // bitcast(shuffle (bitcast V), (extended X), Mask) + // Note: We do not use an insert_subvector node because that requires a + // legal subvector type. if (InsertVal.getOpcode() != ISD::BITCAST || !InsertVal.hasOneUse() || !InsertVal.getOperand(0).getValueType().isVector()) return SDValue(); @@ -16674,7 +16646,7 @@ SDValue DAGCombiner::visitINSERT_VECTOR_ELT(SDNode *N) { SDValue DAGCombiner::scalarizeExtractedVectorLoad(SDNode *EVE, EVT InVecVT, SDValue EltNo, LoadSDNode *OriginalLoad) { - assert(!OriginalLoad->isVolatile()); + assert(OriginalLoad->isSimple()); EVT ResultVT = EVE->getValueType(0); EVT VecEltVT = InVecVT.getVectorElementType(); @@ -16747,12 +16719,12 @@ SDValue DAGCombiner::scalarizeExtractedVectorLoad(SDNode *EVE, EVT InVecVT, SDValue From[] = { SDValue(EVE, 0), SDValue(OriginalLoad, 1) }; SDValue To[] = { Load, Chain }; DAG.ReplaceAllUsesOfValuesWith(From, To, 2); + // Make sure to revisit this node to clean it up; it will usually be dead. + AddToWorklist(EVE); // Since we're explicitly calling ReplaceAllUses, add the new node to the // worklist explicitly as well. - AddToWorklist(Load.getNode()); AddUsersToWorklist(Load.getNode()); // Add users too - // Make sure to revisit this node to clean it up; it will usually be dead. - AddToWorklist(EVE); + AddToWorklist(Load.getNode()); ++OpsNarrowed; return SDValue(EVE, 0); } @@ -16982,7 +16954,7 @@ SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) { ISD::isNormalLoad(VecOp.getNode()) && !Index->hasPredecessor(VecOp.getNode())) { auto *VecLoad = dyn_cast(VecOp); - if (VecLoad && !VecLoad->isVolatile()) + if (VecLoad && VecLoad->isSimple()) return scalarizeExtractedVectorLoad(N, VecVT, Index, VecLoad); } @@ -17041,7 +17013,7 @@ SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) { // Make sure we found a non-volatile load and the extractelement is // the only use. - if (!LN0 || !LN0->hasNUsesOfValue(1,0) || LN0->isVolatile()) + if (!LN0 || !LN0->hasNUsesOfValue(1,0) || !LN0->isSimple()) return SDValue(); // If Idx was -1 above, Elt is going to be -1, so just return undef. @@ -17344,17 +17316,16 @@ static SDValue reduceBuildVecToShuffleWithZero(SDNode *BV, SelectionDAG &DAG) { // the shuffle mask with -1. } - // Turn this into a shuffle with zero if that's legal. - EVT VecVT = Extract.getOperand(0).getValueType(); - if (!DAG.getTargetLoweringInfo().isShuffleMaskLegal(ShufMask, VecVT)) - return SDValue(); - // buildvec undef, ..., (zext (extractelt V, IndexC)), undef... 
--> // bitcast (shuffle V, ZeroVec, VectorMask) SDLoc DL(BV); + EVT VecVT = Extract.getOperand(0).getValueType(); SDValue ZeroVec = DAG.getConstant(0, DL, VecVT); - SDValue Shuf = DAG.getVectorShuffle(VecVT, DL, Extract.getOperand(0), ZeroVec, - ShufMask); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + SDValue Shuf = TLI.buildLegalVectorShuffle(VecVT, DL, Extract.getOperand(0), + ZeroVec, ShufMask, DAG); + if (!Shuf) + return SDValue(); return DAG.getBitcast(VT, Shuf); } @@ -17656,6 +17627,13 @@ SDValue DAGCombiner::visitBUILD_VECTOR(SDNode *N) { } } + // A splat of a single element is a SPLAT_VECTOR if supported on the target. + if (TLI.getOperationAction(ISD::SPLAT_VECTOR, VT) != TargetLowering::Expand) + if (SDValue V = cast(N)->getSplatValue()) { + assert(!V.isUndef() && "Splat of undef should have been handled earlier"); + return DAG.getNode(ISD::SPLAT_VECTOR, SDLoc(N), VT, V); + } + // Check if we can express BUILD VECTOR via subvector extract. if (!LegalTypes && (N->getNumOperands() > 1)) { SDValue Op0 = N->getOperand(0); @@ -17829,11 +17807,9 @@ static SDValue combineConcatVectorOfExtracts(SDNode *N, SelectionDAG &DAG) { } } - if (!DAG.getTargetLoweringInfo().isShuffleMaskLegal(Mask, VT)) - return SDValue(); - - return DAG.getVectorShuffle(VT, SDLoc(N), DAG.getBitcast(VT, SV0), - DAG.getBitcast(VT, SV1), Mask); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + return TLI.buildLegalVectorShuffle(VT, SDLoc(N), DAG.getBitcast(VT, SV0), + DAG.getBitcast(VT, SV1), Mask, DAG); } SDValue DAGCombiner::visitCONCAT_VECTORS(SDNode *N) { @@ -17853,6 +17829,15 @@ SDValue DAGCombiner::visitCONCAT_VECTORS(SDNode *N) { SDValue In = N->getOperand(0); assert(In.getValueType().isVector() && "Must concat vectors"); + // If the input is a concat_vectors, just make a larger concat by padding + // with smaller undefs. + if (In.getOpcode() == ISD::CONCAT_VECTORS && In.hasOneUse()) { + unsigned NumOps = N->getNumOperands() * In.getNumOperands(); + SmallVector Ops(In->op_begin(), In->op_end()); + Ops.resize(NumOps, DAG.getUNDEF(Ops[0].getValueType())); + return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, Ops); + } + SDValue Scalar = peekThroughOneUseBitcasts(In); // concat_vectors(scalar_to_vector(scalar), undef) -> @@ -18002,6 +17987,23 @@ SDValue DAGCombiner::visitCONCAT_VECTORS(SDNode *N) { return SDValue(); } +// Helper that peeks through INSERT_SUBVECTOR/CONCAT_VECTORS to find +// if the subvector can be sourced for free. 
+static SDValue getSubVectorSrc(SDValue V, SDValue Index, EVT SubVT) { + if (V.getOpcode() == ISD::INSERT_SUBVECTOR && + V.getOperand(1).getValueType() == SubVT && V.getOperand(2) == Index) { + return V.getOperand(1); + } + auto *IndexC = dyn_cast(Index); + if (IndexC && V.getOpcode() == ISD::CONCAT_VECTORS && + V.getOperand(0).getValueType() == SubVT && + (IndexC->getZExtValue() % SubVT.getVectorNumElements()) == 0) { + uint64_t SubIdx = IndexC->getZExtValue() / SubVT.getVectorNumElements(); + return V.getOperand(SubIdx); + } + return SDValue(); +} + static SDValue narrowInsertExtractVectorBinOp(SDNode *Extract, SelectionDAG &DAG) { const TargetLowering &TLI = DAG.getTargetLoweringInfo(); @@ -18010,39 +18012,29 @@ static SDValue narrowInsertExtractVectorBinOp(SDNode *Extract, if (!TLI.isBinOp(BinOpcode) || BinOp.getNode()->getNumValues() != 1) return SDValue(); + EVT VecVT = BinOp.getValueType(); SDValue Bop0 = BinOp.getOperand(0), Bop1 = BinOp.getOperand(1); - SDValue Index = Extract->getOperand(1); - EVT VT = Extract->getValueType(0); + if (VecVT != Bop0.getValueType() || VecVT != Bop1.getValueType()) + return SDValue(); - // Helper that peeks through INSERT_SUBVECTOR/CONCAT_VECTORS to find - // if the source subvector is the same type as the one being extracted. - auto GetSubVector = [VT, Index](SDValue V) -> SDValue { - if (V.getOpcode() == ISD::INSERT_SUBVECTOR && - V.getOperand(1).getValueType() == VT && V.getOperand(2) == Index) { - return V.getOperand(1); - } - auto *IndexC = dyn_cast(Index); - if (IndexC && V.getOpcode() == ISD::CONCAT_VECTORS && - V.getOperand(0).getValueType() == VT && - (IndexC->getZExtValue() % VT.getVectorNumElements()) == 0) { - uint64_t SubIdx = IndexC->getZExtValue() / VT.getVectorNumElements(); - return V.getOperand(SubIdx); - } + SDValue Index = Extract->getOperand(1); + EVT SubVT = Extract->getValueType(0); + if (!TLI.isOperationLegalOrCustom(BinOpcode, SubVT)) return SDValue(); - }; - SDValue Sub0 = GetSubVector(Bop0); - SDValue Sub1 = GetSubVector(Bop1); + + SDValue Sub0 = getSubVectorSrc(Bop0, Index, SubVT); + SDValue Sub1 = getSubVectorSrc(Bop1, Index, SubVT); // TODO: We could handle the case where only 1 operand is being inserted by // creating an extract of the other operand, but that requires checking // number of uses and/or costs. - if (!Sub0 || !Sub1 || !TLI.isOperationLegalOrCustom(BinOpcode, VT)) + if (!Sub0 || !Sub1) return SDValue(); // We are inserting both operands of the wide binop only to extract back // to the narrow vector size. Eliminate all of the insert/extract: // ext (binop (ins ?, X, Index), (ins ?, Y, Index)), Index --> binop X, Y - return DAG.getNode(BinOpcode, SDLoc(Extract), VT, Sub0, Sub1, + return DAG.getNode(BinOpcode, SDLoc(Extract), SubVT, Sub0, Sub1, BinOp->getFlags()); } @@ -18174,7 +18166,8 @@ static SDValue narrowExtractedVectorLoad(SDNode *Extract, SelectionDAG &DAG) { auto *Ld = dyn_cast(Extract->getOperand(0)); auto *ExtIdx = dyn_cast(Extract->getOperand(1)); - if (!Ld || Ld->getExtensionType() || Ld->isVolatile() || !ExtIdx) + if (!Ld || Ld->getExtensionType() || !Ld->isSimple() || + !ExtIdx) return SDValue(); // Allow targets to opt-out. @@ -18878,7 +18871,7 @@ SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) { // build_vector. 
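// A minimal standalone sketch of the "free subvector" idea used above, on
// plain C++ containers rather than SelectionDAG nodes. The names below are
// invented for illustration and are not LLVM APIs: if a wide value is a
// concatenation of equally sized chunks, extracting a chunk at an aligned
// offset is just selecting one of the original operands.
#include <cstddef>
#include <vector>

static const std::vector<int> *
getChunkSrc(const std::vector<std::vector<int>> &Chunks, std::size_t Index,
            std::size_t NumSubElts) {
  // Only aligned offsets can be sourced "for free".
  if (NumSubElts == 0 || Index % NumSubElts != 0)
    return nullptr;
  std::size_t SubIdx = Index / NumSubElts;
  return SubIdx < Chunks.size() ? &Chunks[SubIdx] : nullptr;
}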
   if (SVN->isSplat() && SVN->getSplatIndex() < (int)NumElts) {
     int SplatIndex = SVN->getSplatIndex();
-    if (TLI.isExtractVecEltCheap(VT, SplatIndex) &&
+    if (N0.hasOneUse() && TLI.isExtractVecEltCheap(VT, SplatIndex) &&
         TLI.isBinOp(N0.getOpcode()) && N0.getNode()->getNumValues() == 1) {
       // splat (vector_bo L, R), Index -->
       // splat (scalar_bo (extelt L, Index), (extelt R, Index))
@@ -19153,22 +19146,13 @@ SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) {
       SV1 = DAG.getUNDEF(VT);

     // Avoid introducing shuffles with illegal mask.
-    if (!TLI.isShuffleMaskLegal(Mask, VT)) {
-      ShuffleVectorSDNode::commuteMask(Mask);
-
-      if (!TLI.isShuffleMaskLegal(Mask, VT))
-        return SDValue();
-
-      // shuffle(shuffle(A, B, M0), C, M1) -> shuffle(B, A, M2)
-      // shuffle(shuffle(A, B, M0), C, M1) -> shuffle(C, A, M2)
-      // shuffle(shuffle(A, B, M0), C, M1) -> shuffle(C, B, M2)
-      std::swap(SV0, SV1);
-    }
-
     // shuffle(shuffle(A, B, M0), C, M1) -> shuffle(A, B, M2)
     // shuffle(shuffle(A, B, M0), C, M1) -> shuffle(A, C, M2)
     // shuffle(shuffle(A, B, M0), C, M1) -> shuffle(B, C, M2)
-    return DAG.getVectorShuffle(VT, SDLoc(N), SV0, SV1, Mask);
+    // shuffle(shuffle(A, B, M0), C, M1) -> shuffle(B, A, M2)
+    // shuffle(shuffle(A, B, M0), C, M1) -> shuffle(C, A, M2)
+    // shuffle(shuffle(A, B, M0), C, M1) -> shuffle(C, B, M2)
+    return TLI.buildLegalVectorShuffle(VT, SDLoc(N), SV0, SV1, Mask, DAG);
   }

   if (SDValue V = foldShuffleOfConcatUndefs(SVN, DAG))
@@ -19191,35 +19175,35 @@ SDValue DAGCombiner::visitSCALAR_TO_VECTOR(SDNode *N) {
       SmallVector NewMask(InVecT.getVectorNumElements(), -1);
       int Elt = C0->getZExtValue();
       NewMask[0] = Elt;
-      SDValue Val;
       // If we have an implicit truncate do truncate here as long as it's legal.
       // if it's not legal, this should
       if (VT.getScalarType() != InVal.getValueType() &&
           InVal.getValueType().isScalarInteger() &&
           isTypeLegal(VT.getScalarType())) {
-        Val =
+        SDValue Val =
             DAG.getNode(ISD::TRUNCATE, SDLoc(InVal), VT.getScalarType(), InVal);
         return DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), VT, Val);
       }
       if (VT.getScalarType() == InVecT.getScalarType() &&
-          VT.getVectorNumElements() <= InVecT.getVectorNumElements() &&
-          TLI.isShuffleMaskLegal(NewMask, VT)) {
-        Val = DAG.getVectorShuffle(InVecT, SDLoc(N), InVec,
-                                   DAG.getUNDEF(InVecT), NewMask);
-        // If the initial vector is the correct size this shuffle is a
-        // valid result.
-        if (VT == InVecT)
-          return Val;
-        // If not we must truncate the vector.
-        if (VT.getVectorNumElements() != InVecT.getVectorNumElements()) {
-          MVT IdxTy = TLI.getVectorIdxTy(DAG.getDataLayout());
-          SDValue ZeroIdx = DAG.getConstant(0, SDLoc(N), IdxTy);
-          EVT SubVT =
-              EVT::getVectorVT(*DAG.getContext(), InVecT.getVectorElementType(),
-                               VT.getVectorNumElements());
-          Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), SubVT, Val,
-                            ZeroIdx);
-          return Val;
+          VT.getVectorNumElements() <= InVecT.getVectorNumElements()) {
+        SDValue LegalShuffle =
+            TLI.buildLegalVectorShuffle(InVecT, SDLoc(N), InVec,
+                                        DAG.getUNDEF(InVecT), NewMask, DAG);
+        if (LegalShuffle) {
+          // If the initial vector is the correct size this shuffle is a
+          // valid result.
+          if (VT == InVecT)
+            return LegalShuffle;
+          // If not we must truncate the vector.
+ if (VT.getVectorNumElements() != InVecT.getVectorNumElements()) { + MVT IdxTy = TLI.getVectorIdxTy(DAG.getDataLayout()); + SDValue ZeroIdx = DAG.getConstant(0, SDLoc(N), IdxTy); + EVT SubVT = + EVT::getVectorVT(*DAG.getContext(), InVecT.getVectorElementType(), + VT.getVectorNumElements()); + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), SubVT, + LegalShuffle, ZeroIdx); + } } } } @@ -19627,6 +19611,39 @@ SDValue DAGCombiner::SimplifyVBinOp(SDNode *N) { } } + // Make sure all but the first op are undef or constant. + auto ConcatWithConstantOrUndef = [](SDValue Concat) { + return Concat.getOpcode() == ISD::CONCAT_VECTORS && + std::all_of(std::next(Concat->op_begin()), Concat->op_end(), + [](const SDValue &Op) { + return Op.isUndef() || + ISD::isBuildVectorOfConstantSDNodes(Op.getNode()); + }); + }; + + // The following pattern is likely to emerge with vector reduction ops. Moving + // the binary operation ahead of the concat may allow using a narrower vector + // instruction that has better performance than the wide version of the op: + // VBinOp (concat X, undef/constant), (concat Y, undef/constant) --> + // concat (VBinOp X, Y), VecC + if (ConcatWithConstantOrUndef(LHS) && ConcatWithConstantOrUndef(RHS) && + (LHS.hasOneUse() || RHS.hasOneUse())) { + EVT NarrowVT = LHS.getOperand(0).getValueType(); + if (NarrowVT == RHS.getOperand(0).getValueType() && + TLI.isOperationLegalOrCustomOrPromote(Opcode, NarrowVT)) { + SDLoc DL(N); + unsigned NumOperands = LHS.getNumOperands(); + SmallVector ConcatOps; + for (unsigned i = 0; i != NumOperands; ++i) { + // This constant fold for operands 1 and up. + ConcatOps.push_back(DAG.getNode(Opcode, DL, NarrowVT, LHS.getOperand(i), + RHS.getOperand(i))); + } + + return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ConcatOps); + } + } + if (SDValue V = scalarizeBinOpOfSplats(N, DAG)) return V; @@ -19723,7 +19740,9 @@ bool DAGCombiner::SimplifySelectOps(SDNode *TheSelect, SDValue LHS, // Token chains must be identical. if (LHS.getOperand(0) != RHS.getOperand(0) || // Do not let this transformation reduce the number of volatile loads. - LLD->isVolatile() || RLD->isVolatile() || + // Be conservative for atomics for the moment + // TODO: This does appear to be legal for unordered atomics (see D66309) + !LLD->isSimple() || !RLD->isSimple() || // FIXME: If either is a pre/post inc/dec load, // we'd need to split out the address adjustment. LLD->isIndexed() || RLD->isIndexed() || @@ -19928,7 +19947,7 @@ SDValue DAGCombiner::foldSelectCCToShiftAnd(const SDLoc &DL, SDValue N0, SDValue DAGCombiner::convertSelectOfFPConstantsToLoadOffset( const SDLoc &DL, SDValue N0, SDValue N1, SDValue N2, SDValue N3, ISD::CondCode CC) { - if (!TLI.reduceSelectOfFPConstantLoads(N0.getValueType().isFloatingPoint())) + if (!TLI.reduceSelectOfFPConstantLoads(N0.getValueType())) return SDValue(); // If we are before legalize types, we want the other legalization to happen @@ -20016,8 +20035,13 @@ SDValue DAGCombiner::SimplifySelectCC(const SDLoc &DL, SDValue N0, SDValue N1, // when the condition can be materialized as an all-ones register. Any // single bit-test can be materialized as an all-ones register with // shift-left and shift-right-arith. + // TODO: The operation legality checks could be loosened to include "custom", + // but that may cause regressions for targets that do not have shift + // instructions. 
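// A minimal standalone sketch of the concat narrowing shown above, on plain
// data. The function and parameter names are invented for illustration, not
// LLVM APIs: when both operands are "real half + constant/undef padding", the
// binary op only needs to run at the narrow width, and the padded lanes fold
// to a constant.
#include <cstddef>
#include <vector>

static std::vector<int> addNarrowThenConcat(const std::vector<int> &LoX,
                                            const std::vector<int> &LoY,
                                            int PadX, int PadY,
                                            std::size_t PadLanes) {
  std::vector<int> Result;
  for (std::size_t i = 0; i < LoX.size(); ++i)
    Result.push_back(LoX[i] + LoY[i]);          // narrow "VBinOp X, Y"
  Result.insert(Result.end(), PadLanes, PadX + PadY); // padding constant-folds
  return Result;
}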
if (CC == ISD::SETEQ && N0->getOpcode() == ISD::AND && - N0->getValueType(0) == VT && isNullConstant(N1) && isNullConstant(N2)) { + N0->getValueType(0) == VT && isNullConstant(N1) && isNullConstant(N2) && + TLI.isOperationLegal(ISD::SHL, VT) && + TLI.isOperationLegal(ISD::SRA, VT)) { SDValue AndLHS = N0->getOperand(0); auto *ConstAndRHS = dyn_cast(N0->getOperand(1)); if (ConstAndRHS && ConstAndRHS->getAPIntValue().countPopulation() == 1) { @@ -20209,7 +20233,10 @@ SDValue DAGCombiner::BuildLogBase2(SDValue V, const SDLoc &DL) { /// => /// X_{i+1} = X_i (2 - A X_i) = X_i + X_i (1 - A X_i) [this second form /// does not require additional intermediate precision] -SDValue DAGCombiner::BuildReciprocalEstimate(SDValue Op, SDNodeFlags Flags) { +/// For the last iteration, put numerator N into it to gain more precision: +/// Result = N X_i + X_i (N - N A X_i) +SDValue DAGCombiner::BuildDivEstimate(SDValue N, SDValue Op, + SDNodeFlags Flags) { if (Level >= AfterLegalizeDAG) return SDValue(); @@ -20230,25 +20257,39 @@ SDValue DAGCombiner::BuildReciprocalEstimate(SDValue Op, SDNodeFlags Flags) { if (SDValue Est = TLI.getRecipEstimate(Op, DAG, Enabled, Iterations)) { AddToWorklist(Est.getNode()); + SDLoc DL(Op); if (Iterations) { - SDLoc DL(Op); SDValue FPOne = DAG.getConstantFP(1.0, DL, VT); - // Newton iterations: Est = Est + Est (1 - Arg * Est) + // Newton iterations: Est = Est + Est (N - Arg * Est) + // If this is the last iteration, also multiply by the numerator. for (int i = 0; i < Iterations; ++i) { - SDValue NewEst = DAG.getNode(ISD::FMUL, DL, VT, Op, Est, Flags); + SDValue MulEst = Est; + + if (i == Iterations - 1) { + MulEst = DAG.getNode(ISD::FMUL, DL, VT, N, Est, Flags); + AddToWorklist(MulEst.getNode()); + } + + SDValue NewEst = DAG.getNode(ISD::FMUL, DL, VT, Op, MulEst, Flags); AddToWorklist(NewEst.getNode()); - NewEst = DAG.getNode(ISD::FSUB, DL, VT, FPOne, NewEst, Flags); + NewEst = DAG.getNode(ISD::FSUB, DL, VT, + (i == Iterations - 1 ? N : FPOne), NewEst, Flags); AddToWorklist(NewEst.getNode()); NewEst = DAG.getNode(ISD::FMUL, DL, VT, Est, NewEst, Flags); AddToWorklist(NewEst.getNode()); - Est = DAG.getNode(ISD::FADD, DL, VT, Est, NewEst, Flags); + Est = DAG.getNode(ISD::FADD, DL, VT, MulEst, NewEst, Flags); AddToWorklist(Est.getNode()); } + } else { + // If no iterations are available, multiply with N. + Est = DAG.getNode(ISD::FMUL, DL, VT, Est, N, Flags); + AddToWorklist(Est.getNode()); } + return Est; } @@ -20271,31 +20312,19 @@ SDValue DAGCombiner::buildSqrtNROneConst(SDValue Arg, SDValue Est, // We now need 0.5 * Arg which we can write as (1.5 * Arg - Arg) so that // this entire sequence requires only one FP constant. SDValue HalfArg = DAG.getNode(ISD::FMUL, DL, VT, ThreeHalves, Arg, Flags); - AddToWorklist(HalfArg.getNode()); - HalfArg = DAG.getNode(ISD::FSUB, DL, VT, HalfArg, Arg, Flags); - AddToWorklist(HalfArg.getNode()); // Newton iterations: Est = Est * (1.5 - HalfArg * Est * Est) for (unsigned i = 0; i < Iterations; ++i) { SDValue NewEst = DAG.getNode(ISD::FMUL, DL, VT, Est, Est, Flags); - AddToWorklist(NewEst.getNode()); - NewEst = DAG.getNode(ISD::FMUL, DL, VT, HalfArg, NewEst, Flags); - AddToWorklist(NewEst.getNode()); - NewEst = DAG.getNode(ISD::FSUB, DL, VT, ThreeHalves, NewEst, Flags); - AddToWorklist(NewEst.getNode()); - Est = DAG.getNode(ISD::FMUL, DL, VT, Est, NewEst, Flags); - AddToWorklist(Est.getNode()); } // If non-reciprocal square root is requested, multiply the result by Arg. 
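// A minimal scalar sketch of the refinement BuildDivEstimate performs above,
// written for a plain float; the function name and parameters are invented
// for illustration. Ordinary Newton-Raphson steps refine est ~= 1/a, and on
// the last step the numerator n is folded in, so the result is
// n*est + est*(n - a*(n*est)) instead of a separate multiply by n.
static float divEstimate(float n, float a, float est, int iterations) {
  if (iterations == 0)
    return est * n;                         // no refinement: just scale by n
  for (int i = 0; i < iterations; ++i) {
    if (i != iterations - 1) {
      est = est + est * (1.0f - a * est);   // Est = Est + Est*(1 - A*Est)
    } else {
      float mulEst = n * est;               // fold the numerator into the last step
      est = mulEst + est * (n - a * mulEst);
    }
  }
  return est;
}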
- if (!Reciprocal) { + if (!Reciprocal) Est = DAG.getNode(ISD::FMUL, DL, VT, Est, Arg, Flags); - AddToWorklist(Est.getNode()); - } return Est; } @@ -20321,13 +20350,8 @@ SDValue DAGCombiner::buildSqrtNRTwoConst(SDValue Arg, SDValue Est, // E = (E * -0.5) * ((A * E) * E + -3.0) for (unsigned i = 0; i < Iterations; ++i) { SDValue AE = DAG.getNode(ISD::FMUL, DL, VT, Arg, Est, Flags); - AddToWorklist(AE.getNode()); - SDValue AEE = DAG.getNode(ISD::FMUL, DL, VT, AE, Est, Flags); - AddToWorklist(AEE.getNode()); - SDValue RHS = DAG.getNode(ISD::FADD, DL, VT, AEE, MinusThree, Flags); - AddToWorklist(RHS.getNode()); // When calculating a square root at the last iteration build: // S = ((A * E) * -0.5) * ((A * E) * E + -3.0) @@ -20340,10 +20364,8 @@ SDValue DAGCombiner::buildSqrtNRTwoConst(SDValue Arg, SDValue Est, // SQRT: LHS = (A * E) * -0.5 LHS = DAG.getNode(ISD::FMUL, DL, VT, AE, MinusHalf, Flags); } - AddToWorklist(LHS.getNode()); Est = DAG.getNode(ISD::FMUL, DL, VT, LHS, RHS, Flags); - AddToWorklist(Est.getNode()); } return Est; @@ -20400,16 +20422,11 @@ SDValue DAGCombiner::buildSqrtEstimateImpl(SDValue Op, SDNodeFlags Flags, SDValue Fabs = DAG.getNode(ISD::FABS, DL, VT, Op); SDValue IsDenorm = DAG.getSetCC(DL, CCVT, Fabs, NormC, ISD::SETLT); Est = DAG.getNode(SelOpcode, DL, VT, IsDenorm, FPZero, Est); - AddToWorklist(Fabs.getNode()); - AddToWorklist(IsDenorm.getNode()); - AddToWorklist(Est.getNode()); } else { // X == 0.0 ? 0.0 : Est SDValue FPZero = DAG.getConstantFP(0.0, DL, VT); SDValue IsZero = DAG.getSetCC(DL, CCVT, Op, FPZero, ISD::SETEQ); Est = DAG.getNode(SelOpcode, DL, VT, IsZero, FPZero, Est); - AddToWorklist(IsZero.getNode()); - AddToWorklist(Est.getNode()); } } } @@ -20432,6 +20449,7 @@ bool DAGCombiner::isAlias(SDNode *Op0, SDNode *Op1) const { struct MemUseCharacteristics { bool IsVolatile; + bool IsAtomic; SDValue BasePtr; int64_t Offset; Optional NumBytes; @@ -20447,18 +20465,20 @@ bool DAGCombiner::isAlias(SDNode *Op0, SDNode *Op1) const { : (LSN->getAddressingMode() == ISD::PRE_DEC) ? -1 * C->getSExtValue() : 0; - return {LSN->isVolatile(), LSN->getBasePtr(), Offset /*base offset*/, + return {LSN->isVolatile(), LSN->isAtomic(), LSN->getBasePtr(), + Offset /*base offset*/, Optional(LSN->getMemoryVT().getStoreSize()), LSN->getMemOperand()}; } if (const auto *LN = cast(N)) - return {false /*isVolatile*/, LN->getOperand(1), + return {false /*isVolatile*/, /*isAtomic*/ false, LN->getOperand(1), (LN->hasOffset()) ? LN->getOffset() : 0, (LN->hasOffset()) ? Optional(LN->getSize()) : Optional(), (MachineMemOperand *)nullptr}; // Default. - return {false /*isvolatile*/, SDValue(), (int64_t)0 /*offset*/, + return {false /*isvolatile*/, /*isAtomic*/ false, SDValue(), + (int64_t)0 /*offset*/, Optional() /*size*/, (MachineMemOperand *)nullptr}; }; @@ -20474,6 +20494,11 @@ bool DAGCombiner::isAlias(SDNode *Op0, SDNode *Op1) const { if (MUC0.IsVolatile && MUC1.IsVolatile) return true; + // Be conservative about atomics for the moment + // TODO: This is way overconservative for unordered atomics (see D66309) + if (MUC0.IsAtomic && MUC1.IsAtomic) + return true; + if (MUC0.MMO && MUC1.MMO) { if ((MUC0.MMO->isInvariant() && MUC1.MMO->isStore()) || (MUC1.MMO->isInvariant() && MUC0.MMO->isStore())) @@ -20555,7 +20580,8 @@ void DAGCombiner::GatherAllAliases(SDNode *N, SDValue OriginalChain, SmallPtrSet Visited; // Visited node set. // Get alias information for node. 
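// A minimal standalone sketch of the conservative disambiguation rule being
// introduced above; the type and function names are invented for
// illustration. Two memory operations are assumed to alias if both are
// volatile or (for now) both are atomic, and a "simple" access, as used
// throughout this patch, is one that is neither.
struct MemUse {
  bool IsVolatile;
  bool IsAtomic;
};

static bool conservativelyAlias(const MemUse &A, const MemUse &B) {
  if (A.IsVolatile && B.IsVolatile)
    return true;  // never reorder two volatile accesses
  if (A.IsAtomic && B.IsAtomic)
    return true;  // the patch's TODO: unordered atomics could be relaxed
  return false;   // fall through to the usual base/offset/size checks
}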
- const bool IsLoad = isa(N) && !cast(N)->isVolatile(); + // TODO: relax aliasing for unordered atomics (see D66309) + const bool IsLoad = isa(N) && cast(N)->isSimple(); // Starting off. Chains.push_back(OriginalChain); @@ -20571,8 +20597,9 @@ void DAGCombiner::GatherAllAliases(SDNode *N, SDValue OriginalChain, case ISD::LOAD: case ISD::STORE: { // Get alias information for C. + // TODO: Relax aliasing for unordered atomics (see D66309) bool IsOpLoad = isa(C.getNode()) && - !cast(C.getNode())->isVolatile(); + cast(C.getNode())->isSimple(); if ((IsLoad && IsOpLoad) || !isAlias(N, C.getNode())) { // Look further up the chain. C = C.getOperand(0); @@ -20727,7 +20754,8 @@ bool DAGCombiner::parallelizeChainedStores(StoreSDNode *St) { // If the chain has more than one use, then we can't reorder the mem ops. if (!SDValue(Chain, 0)->hasOneUse()) break; - if (Chain->isVolatile() || Chain->isIndexed()) + // TODO: Relax for unordered atomics (see D66309) + if (!Chain->isSimple() || Chain->isIndexed()) break; // Find the base pointer and offset for this memory node. @@ -20795,11 +20823,11 @@ bool DAGCombiner::parallelizeChainedStores(StoreSDNode *St) { SDValue TF = DAG.getTokenFactor(SDLoc(STChain), TFOps); CombineTo(St, TF); - AddToWorklist(STChain); - // Add TF operands worklist in reverse order. - for (auto I = TF->getNumOperands(); I;) - AddToWorklist(TF->getOperand(--I).getNode()); + // Add TF and its operands to the worklist. AddToWorklist(TF.getNode()); + for (const SDValue &Op : TF->ops()) + AddToWorklist(Op.getNode()); + AddToWorklist(STChain); return true; } diff --git a/lib/CodeGen/SelectionDAG/FastISel.cpp b/lib/CodeGen/SelectionDAG/FastISel.cpp index 22c23ba877e8..6d7260d7aee5 100644 --- a/lib/CodeGen/SelectionDAG/FastISel.cpp +++ b/lib/CodeGen/SelectionDAG/FastISel.cpp @@ -174,7 +174,7 @@ static unsigned findSinkableLocalRegDef(MachineInstr &MI) { if (RegDef) return 0; RegDef = MO.getReg(); - } else if (TargetRegisterInfo::isVirtualRegister(MO.getReg())) { + } else if (Register::isVirtualRegister(MO.getReg())) { // This is another use of a vreg. Don't try to sink it. return 0; } @@ -1213,14 +1213,13 @@ bool FastISel::lowerCallTo(CallLoweringInfo &CLI) { if (!FrameAlign) FrameAlign = TLI.getByValTypeAlignment(ElementTy, DL); Flags.setByValSize(FrameSize); - Flags.setByValAlign(FrameAlign); + Flags.setByValAlign(Align(FrameAlign)); } if (Arg.IsNest) Flags.setNest(); if (NeedsRegBlock) Flags.setInConsecutiveRegs(); - unsigned OriginalAlignment = DL.getABITypeAlignment(Arg.Ty); - Flags.setOrigAlign(OriginalAlignment); + Flags.setOrigAlign(Align(DL.getABITypeAlignment(Arg.Ty))); CLI.OutVals.push_back(Arg.Val); CLI.OutFlags.push_back(Flags); @@ -1237,8 +1236,8 @@ bool FastISel::lowerCallTo(CallLoweringInfo &CLI) { updateValueMap(CLI.CS->getInstruction(), CLI.ResultReg, CLI.NumResultRegs); // Set labels for heapallocsite call. 
- if (CLI.CS && CLI.CS->getInstruction()->getMetadata("heapallocsite")) { - MDNode *MD = CLI.CS->getInstruction()->getMetadata("heapallocsite"); + if (CLI.CS && CLI.CS->getInstruction()->hasMetadata("heapallocsite")) { + const MDNode *MD = CLI.CS->getInstruction()->getMetadata("heapallocsite"); MF->addCodeViewHeapAllocSite(CLI.Call, MD); } @@ -1303,6 +1302,7 @@ bool FastISel::selectCall(const User *I) { ExtraInfo |= InlineAsm::Extra_HasSideEffects; if (IA->isAlignStack()) ExtraInfo |= InlineAsm::Extra_IsAlignStack; + ExtraInfo |= IA->getDialect() * InlineAsm::Extra_AsmDialect; BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::INLINEASM)) @@ -1388,9 +1388,11 @@ bool FastISel::selectIntrinsicCall(const IntrinsicInst *II) { "Expected inlined-at fields to agree"); // A dbg.declare describes the address of a source variable, so lower it // into an indirect DBG_VALUE. + auto *Expr = DI->getExpression(); + Expr = DIExpression::append(Expr, {dwarf::DW_OP_deref}); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, - TII.get(TargetOpcode::DBG_VALUE), /*IsIndirect*/ true, - *Op, DI->getVariable(), DI->getExpression()); + TII.get(TargetOpcode::DBG_VALUE), /*IsIndirect*/ false, + *Op, DI->getVariable(), Expr); } else { // We can't yet handle anything else here because it would require // generating code, thus altering codegen because of debug info. @@ -1414,19 +1416,19 @@ bool FastISel::selectIntrinsicCall(const IntrinsicInst *II) { if (CI->getBitWidth() > 64) BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II) .addCImm(CI) - .addImm(0U) + .addReg(0U) .addMetadata(DI->getVariable()) .addMetadata(DI->getExpression()); else BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II) .addImm(CI->getZExtValue()) - .addImm(0U) + .addReg(0U) .addMetadata(DI->getVariable()) .addMetadata(DI->getExpression()); } else if (const auto *CF = dyn_cast(V)) { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II) .addFPImm(CF) - .addImm(0U) + .addReg(0U) .addMetadata(DI->getVariable()) .addMetadata(DI->getExpression()); } else if (unsigned Reg = lookUpRegForValue(V)) { @@ -1453,24 +1455,12 @@ bool FastISel::selectIntrinsicCall(const IntrinsicInst *II) { TII.get(TargetOpcode::DBG_LABEL)).addMetadata(DI->getLabel()); return true; } - case Intrinsic::objectsize: { - ConstantInt *CI = cast(II->getArgOperand(1)); - unsigned long long Res = CI->isZero() ? -1ULL : 0; - Constant *ResCI = ConstantInt::get(II->getType(), Res); - unsigned ResultReg = getRegForValue(ResCI); - if (!ResultReg) - return false; - updateValueMap(II, ResultReg); - return true; - } - case Intrinsic::is_constant: { - Constant *ResCI = ConstantInt::get(II->getType(), 0); - unsigned ResultReg = getRegForValue(ResCI); - if (!ResultReg) - return false; - updateValueMap(II, ResultReg); - return true; - } + case Intrinsic::objectsize: + llvm_unreachable("llvm.objectsize.* should have been lowered already"); + + case Intrinsic::is_constant: + llvm_unreachable("llvm.is.constant.* should have been lowered already"); + case Intrinsic::launder_invariant_group: case Intrinsic::strip_invariant_group: case Intrinsic::expect: { @@ -1677,11 +1667,11 @@ bool FastISel::selectInstruction(const Instruction *I) { /// (fall-through) successor, and update the CFG. 
void FastISel::fastEmitBranch(MachineBasicBlock *MSucc, const DebugLoc &DbgLoc) { - if (FuncInfo.MBB->getBasicBlock()->size() > 1 && + if (FuncInfo.MBB->getBasicBlock()->sizeWithoutDebug() > 1 && FuncInfo.MBB->isLayoutSuccessor(MSucc)) { - // For more accurate line information if this is the only instruction - // in the block then emit it, otherwise we have the unconditional - // fall-through case, which needs no instructions. + // For more accurate line information if this is the only non-debug + // instruction in the block then emit it, otherwise we have the + // unconditional fall-through case, which needs no instructions. } else { // The unconditional branch case. TII.insertBranch(*FuncInfo.MBB, MSucc, nullptr, @@ -2028,7 +2018,7 @@ unsigned FastISel::createResultReg(const TargetRegisterClass *RC) { unsigned FastISel::constrainOperandRegClass(const MCInstrDesc &II, unsigned Op, unsigned OpNum) { - if (TargetRegisterInfo::isVirtualRegister(Op)) { + if (Register::isVirtualRegister(Op)) { const TargetRegisterClass *RegClass = TII.getRegClass(II, OpNum, &TRI, *FuncInfo.MF); if (!MRI.constrainRegClass(Op, RegClass)) { @@ -2236,7 +2226,7 @@ unsigned FastISel::fastEmitInst_i(unsigned MachineInstOpcode, unsigned FastISel::fastEmitInst_extractsubreg(MVT RetVT, unsigned Op0, bool Op0IsKill, uint32_t Idx) { unsigned ResultReg = createResultReg(TLI.getRegClassFor(RetVT)); - assert(TargetRegisterInfo::isVirtualRegister(Op0) && + assert(Register::isVirtualRegister(Op0) && "Cannot yet extract from physregs"); const TargetRegisterClass *RC = MRI.getRegClass(Op0); MRI.constrainRegClass(Op0, TRI.getSubClassWithSubReg(RC, Idx)); @@ -2417,10 +2407,9 @@ FastISel::createMachineMemOperandFor(const Instruction *I) const { } else return nullptr; - bool IsNonTemporal = I->getMetadata(LLVMContext::MD_nontemporal) != nullptr; - bool IsInvariant = I->getMetadata(LLVMContext::MD_invariant_load) != nullptr; - bool IsDereferenceable = - I->getMetadata(LLVMContext::MD_dereferenceable) != nullptr; + bool IsNonTemporal = I->hasMetadata(LLVMContext::MD_nontemporal); + bool IsInvariant = I->hasMetadata(LLVMContext::MD_invariant_load); + bool IsDereferenceable = I->hasMetadata(LLVMContext::MD_dereferenceable); const MDNode *Ranges = I->getMetadata(LLVMContext::MD_range); AAMDNodes AAInfo; diff --git a/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp b/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp index 8b1759246b76..cf6711adad48 100644 --- a/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp +++ b/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp @@ -12,6 +12,7 @@ //===----------------------------------------------------------------------===// #include "llvm/CodeGen/FunctionLoweringInfo.h" +#include "llvm/Analysis/LegacyDivergenceAnalysis.h" #include "llvm/CodeGen/Analysis.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" @@ -424,7 +425,7 @@ void FunctionLoweringInfo::ComputePHILiveOutRegInfo(const PHINode *PN) { unsigned BitWidth = IntVT.getSizeInBits(); unsigned DestReg = ValueMap[PN]; - if (!TargetRegisterInfo::isVirtualRegister(DestReg)) + if (!Register::isVirtualRegister(DestReg)) return; LiveOutRegInfo.grow(DestReg); LiveOutInfo &DestLOI = LiveOutRegInfo[DestReg]; @@ -445,7 +446,7 @@ void FunctionLoweringInfo::ComputePHILiveOutRegInfo(const PHINode *PN) { assert(ValueMap.count(V) && "V should have been placed in ValueMap when its" "CopyToReg node was created."); unsigned SrcReg = ValueMap[V]; - if (!TargetRegisterInfo::isVirtualRegister(SrcReg)) { + if 
(!Register::isVirtualRegister(SrcReg)) { DestLOI.IsValid = false; return; } @@ -480,7 +481,7 @@ void FunctionLoweringInfo::ComputePHILiveOutRegInfo(const PHINode *PN) { assert(ValueMap.count(V) && "V should have been placed in ValueMap when " "its CopyToReg node was created."); unsigned SrcReg = ValueMap[V]; - if (!TargetRegisterInfo::isVirtualRegister(SrcReg)) { + if (!Register::isVirtualRegister(SrcReg)) { DestLOI.IsValid = false; return; } diff --git a/lib/CodeGen/SelectionDAG/InstrEmitter.cpp b/lib/CodeGen/SelectionDAG/InstrEmitter.cpp index 9bc07d35dfc5..c5095995ec2e 100644 --- a/lib/CodeGen/SelectionDAG/InstrEmitter.cpp +++ b/lib/CodeGen/SelectionDAG/InstrEmitter.cpp @@ -71,7 +71,7 @@ static unsigned countOperands(SDNode *Node, unsigned NumExpUses, if (isa(Node->getOperand(I - 1))) continue; if (RegisterSDNode *RN = dyn_cast(Node->getOperand(I - 1))) - if (TargetRegisterInfo::isPhysicalRegister(RN->getReg())) + if (Register::isPhysicalRegister(RN->getReg())) continue; NumImpUses = N - I; break; @@ -86,7 +86,7 @@ void InstrEmitter:: EmitCopyFromReg(SDNode *Node, unsigned ResNo, bool IsClone, bool IsCloned, unsigned SrcReg, DenseMap &VRBaseMap) { unsigned VRBase = 0; - if (TargetRegisterInfo::isVirtualRegister(SrcReg)) { + if (Register::isVirtualRegister(SrcReg)) { // Just use the input register directly! SDValue Op(Node, ResNo); if (IsClone) @@ -114,7 +114,7 @@ EmitCopyFromReg(SDNode *Node, unsigned ResNo, bool IsClone, bool IsCloned, User->getOperand(2).getNode() == Node && User->getOperand(2).getResNo() == ResNo) { unsigned DestReg = cast(User->getOperand(1))->getReg(); - if (TargetRegisterInfo::isVirtualRegister(DestReg)) { + if (Register::isVirtualRegister(DestReg)) { VRBase = DestReg; Match = false; } else if (DestReg != SrcReg) @@ -139,7 +139,7 @@ EmitCopyFromReg(SDNode *Node, unsigned ResNo, bool IsClone, bool IsCloned, UseRC = RC; else if (RC) { const TargetRegisterClass *ComRC = - TRI->getCommonSubClass(UseRC, RC, VT.SimpleTy); + TRI->getCommonSubClass(UseRC, RC); // If multiple uses expect disjoint register classes, we emit // copies in AddRegisterOperand. if (ComRC) @@ -219,7 +219,7 @@ void InstrEmitter::CreateVirtualRegisters(SDNode *Node, if (II.OpInfo[i].isOptionalDef()) { // Optional def must be a physical register. VRBase = cast(Node->getOperand(i-NumResults))->getReg(); - assert(TargetRegisterInfo::isPhysicalRegister(VRBase)); + assert(Register::isPhysicalRegister(VRBase)); MIB.addReg(VRBase, RegState::Define); } @@ -229,7 +229,7 @@ void InstrEmitter::CreateVirtualRegisters(SDNode *Node, User->getOperand(2).getNode() == Node && User->getOperand(2).getResNo() == i) { unsigned Reg = cast(User->getOperand(1))->getReg(); - if (TargetRegisterInfo::isVirtualRegister(Reg)) { + if (Register::isVirtualRegister(Reg)) { const TargetRegisterClass *RegRC = MRI->getRegClass(Reg); if (RegRC == RC) { VRBase = Reg; @@ -272,7 +272,7 @@ unsigned InstrEmitter::getVR(SDValue Op, // does not include operand register class info. 
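// An illustrative sketch of the distinction the new Register helpers test
// above, assuming the encoding of that era: 0 means "no register", small
// positive values are physical registers, and values with the top bit set are
// virtual registers. The toy functions below are stand-ins, not LLVM APIs.
#include <cstdint>

static bool toyIsVirtual(uint32_t Reg) { return (Reg & 0x80000000u) != 0; }
static bool toyIsPhysical(uint32_t Reg) { return Reg != 0 && !toyIsVirtual(Reg); }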
const TargetRegisterClass *RC = TLI->getRegClassFor( Op.getSimpleValueType(), Op.getNode()->isDivergent()); - unsigned VReg = MRI->createVirtualRegister(RC); + Register VReg = MRI->createVirtualRegister(RC); BuildMI(*MBB, InsertPos, Op.getDebugLoc(), TII->get(TargetOpcode::IMPLICIT_DEF), VReg); return VReg; @@ -319,7 +319,7 @@ InstrEmitter::AddRegisterOperand(MachineInstrBuilder &MIB, if (!ConstrainedRC) { OpRC = TRI->getAllocatableClass(OpRC); assert(OpRC && "Constraints cannot be fulfilled for allocation"); - unsigned NewVReg = MRI->createVirtualRegister(OpRC); + Register NewVReg = MRI->createVirtualRegister(OpRC); BuildMI(*MBB, InsertPos, Op.getNode()->getDebugLoc(), TII->get(TargetOpcode::COPY), NewVReg).addReg(VReg); VReg = NewVReg; @@ -385,9 +385,8 @@ void InstrEmitter::AddOperand(MachineInstrBuilder &MIB, (IIRC && TRI->isDivergentRegClass(IIRC))) : nullptr; - if (OpRC && IIRC && OpRC != IIRC && - TargetRegisterInfo::isVirtualRegister(VReg)) { - unsigned NewVReg = MRI->createVirtualRegister(IIRC); + if (OpRC && IIRC && OpRC != IIRC && Register::isVirtualRegister(VReg)) { + Register NewVReg = MRI->createVirtualRegister(IIRC); BuildMI(*MBB, InsertPos, Op.getNode()->getDebugLoc(), TII->get(TargetOpcode::COPY), NewVReg).addReg(VReg); VReg = NewVReg; @@ -465,7 +464,7 @@ unsigned InstrEmitter::ConstrainForSubReg(unsigned VReg, unsigned SubIdx, // register instead. RC = TRI->getSubClassWithSubReg(TLI->getRegClassFor(VT, isDivergent), SubIdx); assert(RC && "No legal register class for VT supports that SubIdx"); - unsigned NewReg = MRI->createVirtualRegister(RC); + Register NewReg = MRI->createVirtualRegister(RC); BuildMI(*MBB, InsertPos, DL, TII->get(TargetOpcode::COPY), NewReg) .addReg(VReg); return NewReg; @@ -485,7 +484,7 @@ void InstrEmitter::EmitSubregNode(SDNode *Node, if (User->getOpcode() == ISD::CopyToReg && User->getOperand(2).getNode() == Node) { unsigned DestReg = cast(User->getOperand(1))->getReg(); - if (TargetRegisterInfo::isVirtualRegister(DestReg)) { + if (Register::isVirtualRegister(DestReg)) { VRBase = DestReg; break; } @@ -503,7 +502,7 @@ void InstrEmitter::EmitSubregNode(SDNode *Node, unsigned Reg; MachineInstr *DefMI; RegisterSDNode *R = dyn_cast(Node->getOperand(0)); - if (R && TargetRegisterInfo::isPhysicalRegister(R->getReg())) { + if (R && Register::isPhysicalRegister(R->getReg())) { Reg = R->getReg(); DefMI = nullptr; } else { @@ -529,7 +528,7 @@ void InstrEmitter::EmitSubregNode(SDNode *Node, // Reg may not support a SubIdx sub-register, and we may need to // constrain its register class or issue a COPY to a compatible register // class. 
- if (TargetRegisterInfo::isVirtualRegister(Reg)) + if (Register::isVirtualRegister(Reg)) Reg = ConstrainForSubReg(Reg, SubIdx, Node->getOperand(0).getSimpleValueType(), Node->isDivergent(), Node->getDebugLoc()); @@ -541,7 +540,7 @@ void InstrEmitter::EmitSubregNode(SDNode *Node, MachineInstrBuilder CopyMI = BuildMI(*MBB, InsertPos, Node->getDebugLoc(), TII->get(TargetOpcode::COPY), VRBase); - if (TargetRegisterInfo::isVirtualRegister(Reg)) + if (Register::isVirtualRegister(Reg)) CopyMI.addReg(Reg, 0, SubIdx); else CopyMI.addReg(TRI->getSubReg(Reg, SubIdx)); @@ -614,7 +613,7 @@ InstrEmitter::EmitCopyToRegClassNode(SDNode *Node, unsigned DstRCIdx = cast(Node->getOperand(1))->getZExtValue(); const TargetRegisterClass *DstRC = TRI->getAllocatableClass(TRI->getRegClass(DstRCIdx)); - unsigned NewVReg = MRI->createVirtualRegister(DstRC); + Register NewVReg = MRI->createVirtualRegister(DstRC); BuildMI(*MBB, InsertPos, Node->getDebugLoc(), TII->get(TargetOpcode::COPY), NewVReg).addReg(VReg); @@ -631,7 +630,7 @@ void InstrEmitter::EmitRegSequence(SDNode *Node, bool IsClone, bool IsCloned) { unsigned DstRCIdx = cast(Node->getOperand(0))->getZExtValue(); const TargetRegisterClass *RC = TRI->getRegClass(DstRCIdx); - unsigned NewVReg = MRI->createVirtualRegister(TRI->getAllocatableClass(RC)); + Register NewVReg = MRI->createVirtualRegister(TRI->getAllocatableClass(RC)); const MCInstrDesc &II = TII->get(TargetOpcode::REG_SEQUENCE); MachineInstrBuilder MIB = BuildMI(*MF, Node->getDebugLoc(), II, NewVReg); unsigned NumOps = Node->getNumOperands(); @@ -649,7 +648,7 @@ void InstrEmitter::EmitRegSequence(SDNode *Node, RegisterSDNode *R = dyn_cast(Node->getOperand(i-1)); // Skip physical registers as they don't have a vreg to get and we'll // insert copies for them in TwoAddressInstructionPass anyway. - if (!R || !TargetRegisterInfo::isPhysicalRegister(R->getReg())) { + if (!R || !Register::isPhysicalRegister(R->getReg())) { unsigned SubIdx = cast(Op)->getZExtValue(); unsigned SubReg = getVR(Node->getOperand(i-1), VRBaseMap); const TargetRegisterClass *TRC = MRI->getRegClass(SubReg); @@ -678,7 +677,7 @@ MachineInstr * InstrEmitter::EmitDbgValue(SDDbgValue *SD, DenseMap &VRBaseMap) { MDNode *Var = SD->getVariable(); - MDNode *Expr = SD->getExpression(); + const DIExpression *Expr = SD->getExpression(); DebugLoc DL = SD->getDebugLoc(); assert(cast(Var)->isValidLocationForIntrinsic(DL) && "Expected inlined-at fields to agree"); @@ -702,12 +701,11 @@ InstrEmitter::EmitDbgValue(SDDbgValue *SD, // EmitTargetCodeForFrameDebugValue is responsible for allocation. auto FrameMI = BuildMI(*MF, DL, TII->get(TargetOpcode::DBG_VALUE)) .addFrameIndex(SD->getFrameIx()); + if (SD->isIndirect()) - // Push [fi + 0] onto the DIExpression stack. - FrameMI.addImm(0); - else - // Push fi onto the DIExpression stack. - FrameMI.addReg(0); + Expr = DIExpression::append(Expr, {dwarf::DW_OP_deref}); + + FrameMI.addReg(0); return FrameMI.addMetadata(Var).addMetadata(Expr); } // Otherwise, we're going to create an instruction here. @@ -753,9 +751,9 @@ InstrEmitter::EmitDbgValue(SDDbgValue *SD, // Indirect addressing is indicated by an Imm as the second parameter. 
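// A minimal sketch of the rewrite above: instead of flagging a DBG_VALUE as
// "indirect" out of band, the indirection is folded into the debug expression
// by appending a dereference operation. The toy below models an expression as
// a flat list of opcodes; the names and the opcode constant are stand-ins for
// illustration, not the DWARF/LLVM definitions.
#include <cstdint>
#include <vector>

static const uint64_t ToyOpDeref = 0x06;  // stand-in for dwarf::DW_OP_deref

static std::vector<uint64_t> appendDeref(std::vector<uint64_t> Expr) {
  Expr.push_back(ToyOpDeref);  // "load through the location" becomes explicit
  return Expr;
}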
if (SD->isIndirect()) - MIB.addImm(0U); - else - MIB.addReg(0U, RegState::Debug); + Expr = DIExpression::append(Expr, {dwarf::DW_OP_deref}); + + MIB.addReg(0U, RegState::Debug); MIB.addMetadata(Var); MIB.addMetadata(Expr); @@ -928,12 +926,12 @@ EmitMachineNode(SDNode *Node, bool IsClone, bool IsCloned, // // Collect all the used physreg defs, and make sure that any unused physreg // defs are marked as dead. - SmallVector UsedRegs; + SmallVector UsedRegs; // Additional results must be physical register defs. if (HasPhysRegOuts) { for (unsigned i = NumDefs; i < NumResults; ++i) { - unsigned Reg = II.getImplicitDefs()[i - NumDefs]; + Register Reg = II.getImplicitDefs()[i - NumDefs]; if (!Node->hasAnyUseOfValue(i)) continue; // This implicitly defined physreg has a use. @@ -960,8 +958,8 @@ EmitMachineNode(SDNode *Node, bool IsClone, bool IsCloned, // direct RegisterSDNode operands. for (unsigned i = 0, e = F->getNumOperands(); i != e; ++i) if (RegisterSDNode *R = dyn_cast(F->getOperand(i))) { - unsigned Reg = R->getReg(); - if (TargetRegisterInfo::isPhysicalRegister(Reg)) + Register Reg = R->getReg(); + if (Reg.isPhysical()) UsedRegs.push_back(Reg); } } @@ -995,8 +993,7 @@ EmitSpecialNode(SDNode *Node, bool IsClone, bool IsCloned, case ISD::CopyToReg: { unsigned DestReg = cast(Node->getOperand(1))->getReg(); SDValue SrcVal = Node->getOperand(2); - if (TargetRegisterInfo::isVirtualRegister(DestReg) && - SrcVal.isMachineOpcode() && + if (Register::isVirtualRegister(DestReg) && SrcVal.isMachineOpcode() && SrcVal.getMachineOpcode() == TargetOpcode::IMPLICIT_DEF) { // Instead building a COPY to that vreg destination, build an // IMPLICIT_DEF instruction instead. @@ -1093,16 +1090,18 @@ EmitSpecialNode(SDNode *Node, bool IsClone, bool IsCloned, // FIXME: Add dead flags for physical and virtual registers defined. // For now, mark physical register defs as implicit to help fast // regalloc. This makes inline asm look a lot like calls. - MIB.addReg(Reg, RegState::Define | - getImplRegState(TargetRegisterInfo::isPhysicalRegister(Reg))); + MIB.addReg(Reg, + RegState::Define | + getImplRegState(Register::isPhysicalRegister(Reg))); } break; case InlineAsm::Kind_RegDefEarlyClobber: case InlineAsm::Kind_Clobber: for (unsigned j = 0; j != NumVals; ++j, ++i) { unsigned Reg = cast(Node->getOperand(i))->getReg(); - MIB.addReg(Reg, RegState::Define | RegState::EarlyClobber | - getImplRegState(TargetRegisterInfo::isPhysicalRegister(Reg))); + MIB.addReg(Reg, + RegState::Define | RegState::EarlyClobber | + getImplRegState(Register::isPhysicalRegister(Reg))); ECRegs.push_back(Reg); } break; @@ -1136,7 +1135,7 @@ EmitSpecialNode(SDNode *Node, bool IsClone, bool IsCloned, // then remove the early-clobber flag. 
for (unsigned Reg : ECRegs) { if (MIB->readsRegister(Reg, TRI)) { - MachineOperand *MO = + MachineOperand *MO = MIB->findRegisterDefOperand(Reg, false, false, TRI); assert(MO && "No def operand for clobbered register?"); MO->setIsEarlyClobber(false); diff --git a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp index bf817f00f83d..f9fdf525240f 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -17,6 +17,7 @@ #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/CodeGen/ISDOpcodes.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineJumpTableInfo.h" @@ -161,6 +162,7 @@ private: SDValue EmitStackConvert(SDValue SrcOp, EVT SlotVT, EVT DestVT, const SDLoc &dl, SDValue ChainIn); SDValue ExpandBUILD_VECTOR(SDNode *Node); + SDValue ExpandSPLAT_VECTOR(SDNode *Node); SDValue ExpandSCALAR_TO_VECTOR(SDNode *Node); void ExpandDYNAMIC_STACKALLOC(SDNode *Node, SmallVectorImpl &Results); @@ -236,6 +238,16 @@ public: } ReplacedNode(Old); } + + void ReplaceNodeWithValue(SDValue Old, SDValue New) { + LLVM_DEBUG(dbgs() << " ... replacing: "; Old->dump(&DAG); + dbgs() << " with: "; New->dump(&DAG)); + + DAG.ReplaceAllUsesOfValueWith(Old, New); + if (UpdatedNodes) + UpdatedNodes->insert(New.getNode()); + ReplacedNode(Old.getNode()); + } }; } // end anonymous namespace @@ -493,8 +505,8 @@ void SelectionDAGLegalize::LegalizeStoreOps(SDNode *Node) { // expand it. EVT MemVT = ST->getMemoryVT(); const DataLayout &DL = DAG.getDataLayout(); - if (!TLI.allowsMemoryAccess(*DAG.getContext(), DL, MemVT, - *ST->getMemOperand())) { + if (!TLI.allowsMemoryAccessForAlignment(*DAG.getContext(), DL, MemVT, + *ST->getMemOperand())) { LLVM_DEBUG(dbgs() << "Expanding unsupported unaligned store\n"); SDValue Result = TLI.expandUnalignedStore(ST, DAG); ReplaceNode(SDValue(ST, 0), Result); @@ -608,8 +620,8 @@ void SelectionDAGLegalize::LegalizeStoreOps(SDNode *Node) { EVT MemVT = ST->getMemoryVT(); // If this is an unaligned store and the target doesn't support it, // expand it. - if (!TLI.allowsMemoryAccess(*DAG.getContext(), DL, MemVT, - *ST->getMemOperand())) { + if (!TLI.allowsMemoryAccessForAlignment(*DAG.getContext(), DL, MemVT, + *ST->getMemOperand())) { SDValue Result = TLI.expandUnalignedStore(ST, DAG); ReplaceNode(SDValue(ST, 0), Result); } @@ -669,8 +681,8 @@ void SelectionDAGLegalize::LegalizeLoadOps(SDNode *Node) { const DataLayout &DL = DAG.getDataLayout(); // If this is an unaligned load and the target doesn't support it, // expand it. 
- if (!TLI.allowsMemoryAccess(*DAG.getContext(), DL, MemVT, - *LD->getMemOperand())) { + if (!TLI.allowsMemoryAccessForAlignment(*DAG.getContext(), DL, MemVT, + *LD->getMemOperand())) { std::tie(RVal, RChain) = TLI.expandUnalignedLoad(LD, DAG); } break; @@ -894,11 +906,10 @@ void SelectionDAGLegalize::LegalizeLoadOps(SDNode *Node) { if (SrcVT.getScalarType() == MVT::f16) { EVT ISrcVT = SrcVT.changeTypeToInteger(); EVT IDestVT = DestVT.changeTypeToInteger(); - EVT LoadVT = TLI.getRegisterType(IDestVT.getSimpleVT()); + EVT ILoadVT = TLI.getRegisterType(IDestVT.getSimpleVT()); - SDValue Result = DAG.getExtLoad(ISD::ZEXTLOAD, dl, LoadVT, - Chain, Ptr, ISrcVT, - LD->getMemOperand()); + SDValue Result = DAG.getExtLoad(ISD::ZEXTLOAD, dl, ILoadVT, Chain, + Ptr, ISrcVT, LD->getMemOperand()); Value = DAG.getNode(ISD::FP16_TO_FP, dl, DestVT, Result); Chain = Result.getValue(1); break; @@ -959,15 +970,13 @@ void SelectionDAGLegalize::LegalizeOp(SDNode *Node) { #ifndef NDEBUG for (unsigned i = 0, e = Node->getNumValues(); i != e; ++i) - assert((TLI.getTypeAction(*DAG.getContext(), Node->getValueType(i)) == - TargetLowering::TypeLegal || - TLI.isTypeLegal(Node->getValueType(i))) && + assert(TLI.getTypeAction(*DAG.getContext(), Node->getValueType(i)) == + TargetLowering::TypeLegal && "Unexpected illegal type!"); for (const SDValue &Op : Node->op_values()) assert((TLI.getTypeAction(*DAG.getContext(), Op.getValueType()) == TargetLowering::TypeLegal || - TLI.isTypeLegal(Op.getValueType()) || Op.getOpcode() == ISD::TargetConstant || Op.getOpcode() == ISD::Register) && "Unexpected illegal type!"); @@ -1004,7 +1013,6 @@ void SelectionDAGLegalize::LegalizeOp(SDNode *Node) { Action = TLI.getOperationAction(Node->getOpcode(), Node->getOperand(0).getValueType()); break; - case ISD::FP_ROUND_INREG: case ISD::SIGN_EXTEND_INREG: { EVT InnerType = cast(Node->getOperand(1))->getVT(); Action = TLI.getOperationAction(Node->getOpcode(), InnerType); @@ -1097,38 +1105,15 @@ void SelectionDAGLegalize::LegalizeOp(SDNode *Node) { return; } break; - case ISD::STRICT_FADD: - case ISD::STRICT_FSUB: - case ISD::STRICT_FMUL: - case ISD::STRICT_FDIV: - case ISD::STRICT_FREM: - case ISD::STRICT_FSQRT: - case ISD::STRICT_FMA: - case ISD::STRICT_FPOW: - case ISD::STRICT_FPOWI: - case ISD::STRICT_FSIN: - case ISD::STRICT_FCOS: - case ISD::STRICT_FEXP: - case ISD::STRICT_FEXP2: - case ISD::STRICT_FLOG: - case ISD::STRICT_FLOG10: - case ISD::STRICT_FLOG2: - case ISD::STRICT_FRINT: - case ISD::STRICT_FNEARBYINT: - case ISD::STRICT_FMAXNUM: - case ISD::STRICT_FMINNUM: - case ISD::STRICT_FCEIL: - case ISD::STRICT_FFLOOR: - case ISD::STRICT_FROUND: - case ISD::STRICT_FTRUNC: - case ISD::STRICT_FP_ROUND: - case ISD::STRICT_FP_EXTEND: - // These pseudo-ops get legalized as if they were their non-strict - // equivalent. For instance, if ISD::FSQRT is legal then ISD::STRICT_FSQRT - // is also legal, but if ISD::FSQRT requires expansion then so does - // ISD::STRICT_FSQRT. + case ISD::STRICT_LRINT: + case ISD::STRICT_LLRINT: + case ISD::STRICT_LROUND: + case ISD::STRICT_LLROUND: + // These pseudo-ops are the same as the other STRICT_ ops except + // they are registered with setOperationAction() using the input type + // instead of the output type. 
Action = TLI.getStrictFPOperationAction(Node->getOpcode(), - Node->getValueType(0)); + Node->getOperand(1).getValueType()); break; case ISD::SADDSAT: case ISD::UADDSAT: @@ -1139,7 +1124,8 @@ void SelectionDAGLegalize::LegalizeOp(SDNode *Node) { } case ISD::SMULFIX: case ISD::SMULFIXSAT: - case ISD::UMULFIX: { + case ISD::UMULFIX: + case ISD::UMULFIXSAT: { unsigned Scale = Node->getConstantOperandVal(2); Action = TLI.getFixedPointOperationAction(Node->getOpcode(), Node->getValueType(0), Scale); @@ -1650,7 +1636,6 @@ bool SelectionDAGLegalize::LegalizeSetCCCondCode(EVT VT, SDValue &LHS, MVT OpVT = LHS.getSimpleValueType(); ISD::CondCode CCCode = cast(CC)->get(); NeedInvert = false; - bool NeedSwap = false; switch (TLI.getCondCodeAction(CCCode, OpVT)) { default: llvm_unreachable("Unknown condition code action!"); case TargetLowering::Legal: @@ -1664,6 +1649,7 @@ bool SelectionDAGLegalize::LegalizeSetCCCondCode(EVT VT, SDValue &LHS, return true; } // Swapping operands didn't work. Try inverting the condition. + bool NeedSwap = false; InvCC = getSetCCInverse(CCCode, OpVT.isInteger()); if (!TLI.isCondCodeLegalOrCustom(InvCC, OpVT)) { // If inverting the condition is not enough, try swapping operands @@ -2021,6 +2007,14 @@ SDValue SelectionDAGLegalize::ExpandBUILD_VECTOR(SDNode *Node) { return ExpandVectorBuildThroughStack(Node); } +SDValue SelectionDAGLegalize::ExpandSPLAT_VECTOR(SDNode *Node) { + SDLoc DL(Node); + EVT VT = Node->getValueType(0); + SDValue SplatVal = Node->getOperand(0); + + return DAG.getSplatBuildVector(VT, DL, SplatVal); +} + // Expand a node into a call to a libcall. If the result value // does not fit into a register, return the lo part and set the hi part to the // by-reg argument. If it does fit into a single register, return the result @@ -2074,12 +2068,12 @@ SDValue SelectionDAGLegalize::ExpandLibCall(RTLIB::Libcall LC, SDNode *Node, std::pair CallInfo = TLI.LowerCallTo(CLI); if (!CallInfo.second.getNode()) { - LLVM_DEBUG(dbgs() << "Created tailcall: "; DAG.getRoot().dump()); + LLVM_DEBUG(dbgs() << "Created tailcall: "; DAG.getRoot().dump(&DAG)); // It's a tailcall, return the chain (which is the DAG root). return DAG.getRoot(); } - LLVM_DEBUG(dbgs() << "Created libcall: "; CallInfo.first.dump()); + LLVM_DEBUG(dbgs() << "Created libcall: "; CallInfo.first.dump(&DAG)); return CallInfo.first; } @@ -2167,6 +2161,9 @@ SDValue SelectionDAGLegalize::ExpandArgFPLibCall(SDNode* Node, RTLIB::Libcall Call_F80, RTLIB::Libcall Call_F128, RTLIB::Libcall Call_PPCF128) { + if (Node->isStrictFPOpcode()) + Node = DAG.mutateStrictFPToFP(Node); + RTLIB::Libcall LC; switch (Node->getOperand(0).getValueType().getSimpleVT().SimpleTy) { default: llvm_unreachable("Unexpected request for libcall!"); @@ -2815,6 +2812,12 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) { break; } case ISD::STRICT_FP_ROUND: + // This expansion does not honor the "strict" properties anyway, + // so prefer falling back to the non-strict operation if legal. + if (TLI.getStrictFPOperationAction(Node->getOpcode(), + Node->getValueType(0)) + == TargetLowering::Legal) + break; Tmp1 = EmitStackConvert(Node->getOperand(1), Node->getValueType(0), Node->getValueType(0), dl, Node->getOperand(0)); @@ -2829,6 +2832,12 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) { Results.push_back(Tmp1); break; case ISD::STRICT_FP_EXTEND: + // This expansion does not honor the "strict" properties anyway, + // so prefer falling back to the non-strict operation if legal. 
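// A minimal sketch of the splat round trip handled in this file: the new
// ExpandSPLAT_VECTOR rebuilds a splat as a build_vector of N copies of the
// scalar, while the earlier BUILD_VECTOR combine goes the other way when
// every lane holds the same value. Plain-container stand-ins, invented for
// illustration:
#include <cstddef>
#include <vector>

static std::vector<int> expandSplat(int SplatVal, std::size_t NumElts) {
  return std::vector<int>(NumElts, SplatVal);  // splat -> N identical lanes
}

static bool getSplatLane(const std::vector<int> &Lanes, int &SplatVal) {
  if (Lanes.empty())
    return false;
  for (int V : Lanes)
    if (V != Lanes.front())
      return false;                            // not a splat
  SplatVal = Lanes.front();
  return true;
}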
+ if (TLI.getStrictFPOperationAction(Node->getOpcode(), + Node->getValueType(0)) + == TargetLowering::Legal) + break; Tmp1 = EmitStackConvert(Node->getOperand(1), Node->getOperand(1).getValueType(), Node->getValueType(0), dl, Node->getOperand(0)); @@ -2873,19 +2882,6 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) { Results.push_back(Tmp1); break; } - case ISD::FP_ROUND_INREG: { - // The only way we can lower this is to turn it into a TRUNCSTORE, - // EXTLOAD pair, targeting a temporary location (a stack slot). - - // NOTE: there is a choice here between constantly creating new stack - // slots and always reusing the same one. We currently always create - // new ones, as reuse may inhibit scheduling. - EVT ExtraVT = cast(Node->getOperand(1))->getVT(); - Tmp1 = EmitStackConvert(Node->getOperand(0), ExtraVT, - Node->getValueType(0), dl); - Results.push_back(Tmp1); - break; - } case ISD::UINT_TO_FP: if (TLI.expandUINT_TO_FP(Node, Tmp1, DAG)) { Results.push_back(Tmp1); @@ -2901,33 +2897,26 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) { if (TLI.expandFP_TO_SINT(Node, Tmp1, DAG)) Results.push_back(Tmp1); break; + case ISD::STRICT_FP_TO_SINT: + if (TLI.expandFP_TO_SINT(Node, Tmp1, DAG)) { + ReplaceNode(Node, Tmp1.getNode()); + LLVM_DEBUG(dbgs() << "Successfully expanded STRICT_FP_TO_SINT node\n"); + return true; + } + break; case ISD::FP_TO_UINT: - if (TLI.expandFP_TO_UINT(Node, Tmp1, DAG)) + if (TLI.expandFP_TO_UINT(Node, Tmp1, Tmp2, DAG)) Results.push_back(Tmp1); break; - case ISD::LROUND: - Results.push_back(ExpandArgFPLibCall(Node, RTLIB::LROUND_F32, - RTLIB::LROUND_F64, RTLIB::LROUND_F80, - RTLIB::LROUND_F128, - RTLIB::LROUND_PPCF128)); - break; - case ISD::LLROUND: - Results.push_back(ExpandArgFPLibCall(Node, RTLIB::LLROUND_F32, - RTLIB::LLROUND_F64, RTLIB::LLROUND_F80, - RTLIB::LLROUND_F128, - RTLIB::LLROUND_PPCF128)); - break; - case ISD::LRINT: - Results.push_back(ExpandArgFPLibCall(Node, RTLIB::LRINT_F32, - RTLIB::LRINT_F64, RTLIB::LRINT_F80, - RTLIB::LRINT_F128, - RTLIB::LRINT_PPCF128)); - break; - case ISD::LLRINT: - Results.push_back(ExpandArgFPLibCall(Node, RTLIB::LLRINT_F32, - RTLIB::LLRINT_F64, RTLIB::LLRINT_F80, - RTLIB::LLRINT_F128, - RTLIB::LLRINT_PPCF128)); + case ISD::STRICT_FP_TO_UINT: + if (TLI.expandFP_TO_UINT(Node, Tmp1, Tmp2, DAG)) { + // Relink the chain. + DAG.ReplaceAllUsesOfValueWith(SDValue(Node,1), Tmp2); + // Replace the new UINT result. + ReplaceNodeWithValue(SDValue(Node, 0), Tmp1); + LLVM_DEBUG(dbgs() << "Successfully expanded STRICT_FP_TO_UINT node\n"); + return true; + } break; case ISD::VAARG: Results.push_back(DAG.expandVAArg(Node)); @@ -3348,6 +3337,7 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) { case ISD::SMULFIX: case ISD::SMULFIXSAT: case ISD::UMULFIX: + case ISD::UMULFIXSAT: Results.push_back(TLI.expandFixedPointMul(Node, DAG)); break; case ISD::ADDCARRY: @@ -3662,6 +3652,9 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) { case ISD::BUILD_VECTOR: Results.push_back(ExpandBUILD_VECTOR(Node)); break; + case ISD::SPLAT_VECTOR: + Results.push_back(ExpandSPLAT_VECTOR(Node)); + break; case ISD::SRA: case ISD::SRL: case ISD::SHL: { @@ -3715,6 +3708,33 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) { break; } + if (Results.empty() && Node->isStrictFPOpcode()) { + // FIXME: We were asked to expand a strict floating-point operation, + // but there is currently no expansion implemented that would preserve + // the "strict" properties. 
For now, we just fall back to the non-strict + // version if that is legal on the target. The actual mutation of the + // operation will happen in SelectionDAGISel::DoInstructionSelection. + switch (Node->getOpcode()) { + default: + if (TLI.getStrictFPOperationAction(Node->getOpcode(), + Node->getValueType(0)) + == TargetLowering::Legal) + return true; + break; + case ISD::STRICT_LRINT: + case ISD::STRICT_LLRINT: + case ISD::STRICT_LROUND: + case ISD::STRICT_LLROUND: + // These are registered by the operand type instead of the value + // type. Reflect that here. + if (TLI.getStrictFPOperationAction(Node->getOpcode(), + Node->getOperand(1).getValueType()) + == TargetLowering::Legal) + return true; + break; + } + } + // Replace the original node with the legalized result. if (Results.empty()) { LLVM_DEBUG(dbgs() << "Cannot expand node\n"); @@ -3956,6 +3976,34 @@ void SelectionDAGLegalize::ConvertNodeToLibcall(SDNode *Node) { RTLIB::POW_F80, RTLIB::POW_F128, RTLIB::POW_PPCF128)); break; + case ISD::LROUND: + case ISD::STRICT_LROUND: + Results.push_back(ExpandArgFPLibCall(Node, RTLIB::LROUND_F32, + RTLIB::LROUND_F64, RTLIB::LROUND_F80, + RTLIB::LROUND_F128, + RTLIB::LROUND_PPCF128)); + break; + case ISD::LLROUND: + case ISD::STRICT_LLROUND: + Results.push_back(ExpandArgFPLibCall(Node, RTLIB::LLROUND_F32, + RTLIB::LLROUND_F64, RTLIB::LLROUND_F80, + RTLIB::LLROUND_F128, + RTLIB::LLROUND_PPCF128)); + break; + case ISD::LRINT: + case ISD::STRICT_LRINT: + Results.push_back(ExpandArgFPLibCall(Node, RTLIB::LRINT_F32, + RTLIB::LRINT_F64, RTLIB::LRINT_F80, + RTLIB::LRINT_F128, + RTLIB::LRINT_PPCF128)); + break; + case ISD::LLRINT: + case ISD::STRICT_LLRINT: + Results.push_back(ExpandArgFPLibCall(Node, RTLIB::LLRINT_F32, + RTLIB::LLRINT_F64, RTLIB::LLRINT_F80, + RTLIB::LLRINT_F128, + RTLIB::LLRINT_PPCF128)); + break; case ISD::FDIV: Results.push_back(ExpandFPLibCall(Node, RTLIB::DIV_F32, RTLIB::DIV_F64, RTLIB::DIV_F80, RTLIB::DIV_F128, diff --git a/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp index b4849b2881e6..72d052473f11 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp @@ -42,10 +42,10 @@ static RTLIB::Libcall GetFPLibCall(EVT VT, } //===----------------------------------------------------------------------===// -// Convert Float Results to Integer for Non-HW-supported Operations. +// Convert Float Results to Integer //===----------------------------------------------------------------------===// -bool DAGTypeLegalizer::SoftenFloatResult(SDNode *N, unsigned ResNo) { +void DAGTypeLegalizer::SoftenFloatResult(SDNode *N, unsigned ResNo) { LLVM_DEBUG(dbgs() << "Soften float result " << ResNo << ": "; N->dump(&DAG); dbgs() << "\n"); SDValue R = SDValue(); @@ -58,26 +58,18 @@ bool DAGTypeLegalizer::SoftenFloatResult(SDNode *N, unsigned ResNo) { #endif llvm_unreachable("Do not know how to soften the result of this operator!"); - case ISD::Register: - case ISD::CopyFromReg: - case ISD::CopyToReg: - assert(isLegalInHWReg(N->getValueType(ResNo)) && - "Unsupported SoftenFloatRes opcode!"); - // Only when isLegalInHWReg, we can skip check of the operands. 
- R = SDValue(N, ResNo); - break; case ISD::MERGE_VALUES:R = SoftenFloatRes_MERGE_VALUES(N, ResNo); break; - case ISD::BITCAST: R = SoftenFloatRes_BITCAST(N, ResNo); break; + case ISD::BITCAST: R = SoftenFloatRes_BITCAST(N); break; case ISD::BUILD_PAIR: R = SoftenFloatRes_BUILD_PAIR(N); break; - case ISD::ConstantFP: R = SoftenFloatRes_ConstantFP(N, ResNo); break; + case ISD::ConstantFP: R = SoftenFloatRes_ConstantFP(N); break; case ISD::EXTRACT_VECTOR_ELT: R = SoftenFloatRes_EXTRACT_VECTOR_ELT(N, ResNo); break; - case ISD::FABS: R = SoftenFloatRes_FABS(N, ResNo); break; + case ISD::FABS: R = SoftenFloatRes_FABS(N); break; case ISD::FMINNUM: R = SoftenFloatRes_FMINNUM(N); break; case ISD::FMAXNUM: R = SoftenFloatRes_FMAXNUM(N); break; case ISD::FADD: R = SoftenFloatRes_FADD(N); break; case ISD::FCEIL: R = SoftenFloatRes_FCEIL(N); break; - case ISD::FCOPYSIGN: R = SoftenFloatRes_FCOPYSIGN(N, ResNo); break; + case ISD::FCOPYSIGN: R = SoftenFloatRes_FCOPYSIGN(N); break; case ISD::FCOS: R = SoftenFloatRes_FCOS(N); break; case ISD::FDIV: R = SoftenFloatRes_FDIV(N); break; case ISD::FEXP: R = SoftenFloatRes_FEXP(N); break; @@ -89,7 +81,7 @@ bool DAGTypeLegalizer::SoftenFloatResult(SDNode *N, unsigned ResNo) { case ISD::FMA: R = SoftenFloatRes_FMA(N); break; case ISD::FMUL: R = SoftenFloatRes_FMUL(N); break; case ISD::FNEARBYINT: R = SoftenFloatRes_FNEARBYINT(N); break; - case ISD::FNEG: R = SoftenFloatRes_FNEG(N, ResNo); break; + case ISD::FNEG: R = SoftenFloatRes_FNEG(N); break; case ISD::FP_EXTEND: R = SoftenFloatRes_FP_EXTEND(N); break; case ISD::FP_ROUND: R = SoftenFloatRes_FP_ROUND(N); break; case ISD::FP16_TO_FP: R = SoftenFloatRes_FP16_TO_FP(N); break; @@ -102,30 +94,24 @@ bool DAGTypeLegalizer::SoftenFloatResult(SDNode *N, unsigned ResNo) { case ISD::FSQRT: R = SoftenFloatRes_FSQRT(N); break; case ISD::FSUB: R = SoftenFloatRes_FSUB(N); break; case ISD::FTRUNC: R = SoftenFloatRes_FTRUNC(N); break; - case ISD::LOAD: R = SoftenFloatRes_LOAD(N, ResNo); break; + case ISD::LOAD: R = SoftenFloatRes_LOAD(N); break; case ISD::ATOMIC_SWAP: R = BitcastToInt_ATOMIC_SWAP(N); break; - case ISD::SELECT: R = SoftenFloatRes_SELECT(N, ResNo); break; - case ISD::SELECT_CC: R = SoftenFloatRes_SELECT_CC(N, ResNo); break; + case ISD::SELECT: R = SoftenFloatRes_SELECT(N); break; + case ISD::SELECT_CC: R = SoftenFloatRes_SELECT_CC(N); break; case ISD::SINT_TO_FP: case ISD::UINT_TO_FP: R = SoftenFloatRes_XINT_TO_FP(N); break; case ISD::UNDEF: R = SoftenFloatRes_UNDEF(N); break; case ISD::VAARG: R = SoftenFloatRes_VAARG(N); break; } - if (R.getNode() && R.getNode() != N) { + // If R is null, the sub-method took care of registering the result. + if (R.getNode()) { + assert(R.getNode() != N); SetSoftenedFloat(SDValue(N, ResNo), R); - // Return true only if the node is changed, assuming that the operands - // are also converted when necessary. - return true; } - - // Otherwise, return false to tell caller to scan operands. - return false; } -SDValue DAGTypeLegalizer::SoftenFloatRes_BITCAST(SDNode *N, unsigned ResNo) { - if (isLegalInHWReg(N->getValueType(ResNo))) - return SDValue(N, ResNo); +SDValue DAGTypeLegalizer::SoftenFloatRes_BITCAST(SDNode *N) { return BitConvertToInteger(N->getOperand(0)); } @@ -144,10 +130,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_BUILD_PAIR(SDNode *N) { BitConvertToInteger(N->getOperand(1))); } -SDValue DAGTypeLegalizer::SoftenFloatRes_ConstantFP(SDNode *N, unsigned ResNo) { - // When LegalInHWReg, we can load better from the constant pool. 
- if (isLegalInHWReg(N->getValueType(ResNo))) - return SDValue(N, ResNo); +SDValue DAGTypeLegalizer::SoftenFloatRes_ConstantFP(SDNode *N) { ConstantFPSDNode *CN = cast(N); // In ppcf128, the high 64 bits are always first in memory regardless // of Endianness. LLVM's APFloat representation is not Endian sensitive, @@ -172,19 +155,13 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_ConstantFP(SDNode *N, unsigned ResNo) { } SDValue DAGTypeLegalizer::SoftenFloatRes_EXTRACT_VECTOR_ELT(SDNode *N, unsigned ResNo) { - // When LegalInHWReg, keep the extracted value in register. - if (isLegalInHWReg(N->getValueType(ResNo))) - return SDValue(N, ResNo); SDValue NewOp = BitConvertVectorToIntegerVector(N->getOperand(0)); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N), NewOp.getValueType().getVectorElementType(), NewOp, N->getOperand(1)); } -SDValue DAGTypeLegalizer::SoftenFloatRes_FABS(SDNode *N, unsigned ResNo) { - // When LegalInHWReg, FABS can be implemented as native bitwise operations. - if (isLegalInHWReg(N->getValueType(ResNo))) - return SDValue(N, ResNo); +SDValue DAGTypeLegalizer::SoftenFloatRes_FABS(SDNode *N) { EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); unsigned Size = NVT.getSizeInBits(); @@ -200,57 +177,69 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FMINNUM(SDNode *N) { EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); SDValue Ops[2] = { GetSoftenedFloat(N->getOperand(0)), GetSoftenedFloat(N->getOperand(1)) }; + TargetLowering::MakeLibCallOptions CallOptions; + EVT OpsVT[2] = { N->getOperand(0).getValueType(), + N->getOperand(1).getValueType() }; + CallOptions.setTypeListBeforeSoften(OpsVT, N->getValueType(0), true); return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0), RTLIB::FMIN_F32, RTLIB::FMIN_F64, RTLIB::FMIN_F80, RTLIB::FMIN_F128, RTLIB::FMIN_PPCF128), - NVT, Ops, false, SDLoc(N)).first; + NVT, Ops, CallOptions, SDLoc(N)).first; } SDValue DAGTypeLegalizer::SoftenFloatRes_FMAXNUM(SDNode *N) { EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); SDValue Ops[2] = { GetSoftenedFloat(N->getOperand(0)), GetSoftenedFloat(N->getOperand(1)) }; + TargetLowering::MakeLibCallOptions CallOptions; + EVT OpsVT[2] = { N->getOperand(0).getValueType(), + N->getOperand(1).getValueType() }; + CallOptions.setTypeListBeforeSoften(OpsVT, N->getValueType(0), true); return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0), RTLIB::FMAX_F32, RTLIB::FMAX_F64, RTLIB::FMAX_F80, RTLIB::FMAX_F128, RTLIB::FMAX_PPCF128), - NVT, Ops, false, SDLoc(N)).first; + NVT, Ops, CallOptions, SDLoc(N)).first; } SDValue DAGTypeLegalizer::SoftenFloatRes_FADD(SDNode *N) { EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); SDValue Ops[2] = { GetSoftenedFloat(N->getOperand(0)), GetSoftenedFloat(N->getOperand(1)) }; + TargetLowering::MakeLibCallOptions CallOptions; + EVT OpsVT[2] = { N->getOperand(0).getValueType(), + N->getOperand(1).getValueType() }; + CallOptions.setTypeListBeforeSoften(OpsVT, N->getValueType(0), true); return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0), RTLIB::ADD_F32, RTLIB::ADD_F64, RTLIB::ADD_F80, RTLIB::ADD_F128, RTLIB::ADD_PPCF128), - NVT, Ops, false, SDLoc(N)).first; + NVT, Ops, CallOptions, SDLoc(N)).first; } SDValue DAGTypeLegalizer::SoftenFloatRes_FCEIL(SDNode *N) { EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); SDValue Op = GetSoftenedFloat(N->getOperand(0)); + TargetLowering::MakeLibCallOptions CallOptions; + EVT OpsVT[1] = { 
N->getOperand(0).getValueType() }; + CallOptions.setTypeListBeforeSoften(OpsVT, N->getValueType(0), true); return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0), RTLIB::CEIL_F32, RTLIB::CEIL_F64, RTLIB::CEIL_F80, RTLIB::CEIL_F128, RTLIB::CEIL_PPCF128), - NVT, Op, false, SDLoc(N)).first; + NVT, Op, CallOptions, SDLoc(N)).first; } -SDValue DAGTypeLegalizer::SoftenFloatRes_FCOPYSIGN(SDNode *N, unsigned ResNo) { - // When LegalInHWReg, FCOPYSIGN can be implemented as native bitwise operations. - if (isLegalInHWReg(N->getValueType(ResNo))) - return SDValue(N, ResNo); +SDValue DAGTypeLegalizer::SoftenFloatRes_FCOPYSIGN(SDNode *N) { SDValue LHS = GetSoftenedFloat(N->getOperand(0)); SDValue RHS = BitConvertToInteger(N->getOperand(1)); SDLoc dl(N); @@ -301,98 +290,123 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FCOPYSIGN(SDNode *N, unsigned ResNo) { SDValue DAGTypeLegalizer::SoftenFloatRes_FCOS(SDNode *N) { EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); SDValue Op = GetSoftenedFloat(N->getOperand(0)); + TargetLowering::MakeLibCallOptions CallOptions; + EVT OpsVT[1] = { N->getOperand(0).getValueType() }; + CallOptions.setTypeListBeforeSoften(OpsVT, N->getValueType(0), true); return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0), RTLIB::COS_F32, RTLIB::COS_F64, RTLIB::COS_F80, RTLIB::COS_F128, RTLIB::COS_PPCF128), - NVT, Op, false, SDLoc(N)).first; + NVT, Op, CallOptions, SDLoc(N)).first; } SDValue DAGTypeLegalizer::SoftenFloatRes_FDIV(SDNode *N) { EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); SDValue Ops[2] = { GetSoftenedFloat(N->getOperand(0)), GetSoftenedFloat(N->getOperand(1)) }; + TargetLowering::MakeLibCallOptions CallOptions; + EVT OpsVT[2] = { N->getOperand(0).getValueType(), + N->getOperand(1).getValueType() }; + CallOptions.setTypeListBeforeSoften(OpsVT, N->getValueType(0), true); return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0), RTLIB::DIV_F32, RTLIB::DIV_F64, RTLIB::DIV_F80, RTLIB::DIV_F128, RTLIB::DIV_PPCF128), - NVT, Ops, false, SDLoc(N)).first; + NVT, Ops, CallOptions, SDLoc(N)).first; } SDValue DAGTypeLegalizer::SoftenFloatRes_FEXP(SDNode *N) { EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); SDValue Op = GetSoftenedFloat(N->getOperand(0)); + TargetLowering::MakeLibCallOptions CallOptions; + EVT OpsVT[1] = { N->getOperand(0).getValueType() }; + CallOptions.setTypeListBeforeSoften(OpsVT, N->getValueType(0), true); return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0), RTLIB::EXP_F32, RTLIB::EXP_F64, RTLIB::EXP_F80, RTLIB::EXP_F128, RTLIB::EXP_PPCF128), - NVT, Op, false, SDLoc(N)).first; + NVT, Op, CallOptions, SDLoc(N)).first; } SDValue DAGTypeLegalizer::SoftenFloatRes_FEXP2(SDNode *N) { EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); SDValue Op = GetSoftenedFloat(N->getOperand(0)); + TargetLowering::MakeLibCallOptions CallOptions; + EVT OpsVT[1] = { N->getOperand(0).getValueType() }; + CallOptions.setTypeListBeforeSoften(OpsVT, N->getValueType(0), true); return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0), RTLIB::EXP2_F32, RTLIB::EXP2_F64, RTLIB::EXP2_F80, RTLIB::EXP2_F128, RTLIB::EXP2_PPCF128), - NVT, Op, false, SDLoc(N)).first; + NVT, Op, CallOptions, SDLoc(N)).first; } SDValue DAGTypeLegalizer::SoftenFloatRes_FFLOOR(SDNode *N) { EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); SDValue Op = GetSoftenedFloat(N->getOperand(0)); + TargetLowering::MakeLibCallOptions CallOptions; + EVT OpsVT[1] = { 
N->getOperand(0).getValueType() }; + CallOptions.setTypeListBeforeSoften(OpsVT, N->getValueType(0), true); return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0), RTLIB::FLOOR_F32, RTLIB::FLOOR_F64, RTLIB::FLOOR_F80, RTLIB::FLOOR_F128, RTLIB::FLOOR_PPCF128), - NVT, Op, false, SDLoc(N)).first; + NVT, Op, CallOptions, SDLoc(N)).first; } SDValue DAGTypeLegalizer::SoftenFloatRes_FLOG(SDNode *N) { EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); SDValue Op = GetSoftenedFloat(N->getOperand(0)); + TargetLowering::MakeLibCallOptions CallOptions; + EVT OpsVT[1] = { N->getOperand(0).getValueType() }; + CallOptions.setTypeListBeforeSoften(OpsVT, N->getValueType(0), true); return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0), RTLIB::LOG_F32, RTLIB::LOG_F64, RTLIB::LOG_F80, RTLIB::LOG_F128, RTLIB::LOG_PPCF128), - NVT, Op, false, SDLoc(N)).first; + NVT, Op, CallOptions, SDLoc(N)).first; } SDValue DAGTypeLegalizer::SoftenFloatRes_FLOG2(SDNode *N) { EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); SDValue Op = GetSoftenedFloat(N->getOperand(0)); + TargetLowering::MakeLibCallOptions CallOptions; + EVT OpsVT[1] = { N->getOperand(0).getValueType() }; + CallOptions.setTypeListBeforeSoften(OpsVT, N->getValueType(0), true); return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0), RTLIB::LOG2_F32, RTLIB::LOG2_F64, RTLIB::LOG2_F80, RTLIB::LOG2_F128, RTLIB::LOG2_PPCF128), - NVT, Op, false, SDLoc(N)).first; + NVT, Op, CallOptions, SDLoc(N)).first; } SDValue DAGTypeLegalizer::SoftenFloatRes_FLOG10(SDNode *N) { EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); SDValue Op = GetSoftenedFloat(N->getOperand(0)); + TargetLowering::MakeLibCallOptions CallOptions; + EVT OpsVT[1] = { N->getOperand(0).getValueType() }; + CallOptions.setTypeListBeforeSoften(OpsVT, N->getValueType(0), true); return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0), RTLIB::LOG10_F32, RTLIB::LOG10_F64, RTLIB::LOG10_F80, RTLIB::LOG10_F128, RTLIB::LOG10_PPCF128), - NVT, Op, false, SDLoc(N)).first; + NVT, Op, CallOptions, SDLoc(N)).first; } SDValue DAGTypeLegalizer::SoftenFloatRes_FMA(SDNode *N) { @@ -400,48 +414,57 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FMA(SDNode *N) { SDValue Ops[3] = { GetSoftenedFloat(N->getOperand(0)), GetSoftenedFloat(N->getOperand(1)), GetSoftenedFloat(N->getOperand(2)) }; + TargetLowering::MakeLibCallOptions CallOptions; + EVT OpsVT[3] = { N->getOperand(0).getValueType(), + N->getOperand(1).getValueType(), + N->getOperand(2).getValueType() }; + CallOptions.setTypeListBeforeSoften(OpsVT, N->getValueType(0), true); return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0), RTLIB::FMA_F32, RTLIB::FMA_F64, RTLIB::FMA_F80, RTLIB::FMA_F128, RTLIB::FMA_PPCF128), - NVT, Ops, false, SDLoc(N)).first; + NVT, Ops, CallOptions, SDLoc(N)).first; } SDValue DAGTypeLegalizer::SoftenFloatRes_FMUL(SDNode *N) { EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); SDValue Ops[2] = { GetSoftenedFloat(N->getOperand(0)), GetSoftenedFloat(N->getOperand(1)) }; + TargetLowering::MakeLibCallOptions CallOptions; + EVT OpsVT[2] = { N->getOperand(0).getValueType(), + N->getOperand(1).getValueType() }; + CallOptions.setTypeListBeforeSoften(OpsVT, N->getValueType(0), true); return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0), RTLIB::MUL_F32, RTLIB::MUL_F64, RTLIB::MUL_F80, RTLIB::MUL_F128, RTLIB::MUL_PPCF128), - NVT, Ops, false, SDLoc(N)).first; + NVT, Ops, CallOptions, SDLoc(N)).first; } SDValue 
DAGTypeLegalizer::SoftenFloatRes_FNEARBYINT(SDNode *N) { EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); SDValue Op = GetSoftenedFloat(N->getOperand(0)); + TargetLowering::MakeLibCallOptions CallOptions; + EVT OpsVT[1] = { N->getOperand(0).getValueType() }; + CallOptions.setTypeListBeforeSoften(OpsVT, N->getValueType(0), true); return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0), RTLIB::NEARBYINT_F32, RTLIB::NEARBYINT_F64, RTLIB::NEARBYINT_F80, RTLIB::NEARBYINT_F128, RTLIB::NEARBYINT_PPCF128), - NVT, Op, false, SDLoc(N)).first; + NVT, Op, CallOptions, SDLoc(N)).first; } -SDValue DAGTypeLegalizer::SoftenFloatRes_FNEG(SDNode *N, unsigned ResNo) { - // When LegalInHWReg, FNEG can be implemented as native bitwise operations. - if (isLegalInHWReg(N->getValueType(ResNo))) - return SDValue(N, ResNo); +SDValue DAGTypeLegalizer::SoftenFloatRes_FNEG(SDNode *N) { EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); SDLoc dl(N); - EVT FloatVT = N->getValueType(ResNo); + EVT FloatVT = N->getValueType(0); if (FloatVT == MVT::f32 || FloatVT == MVT::f64 || FloatVT == MVT::f128) { // Expand Y = FNEG(X) -> Y = X ^ sign mask APInt SignMask = APInt::getSignMask(NVT.getSizeInBits()); @@ -452,13 +475,14 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FNEG(SDNode *N, unsigned ResNo) { // Expand Y = FNEG(X) -> Y = SUB -0.0, X SDValue Ops[2] = { DAG.getConstantFP(-0.0, dl, N->getValueType(0)), GetSoftenedFloat(N->getOperand(0)) }; + TargetLowering::MakeLibCallOptions CallOptions; return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0), RTLIB::SUB_F32, RTLIB::SUB_F64, RTLIB::SUB_F80, RTLIB::SUB_F128, RTLIB::SUB_PPCF128), - NVT, Ops, false, dl).first; + NVT, Ops, CallOptions, dl).first; } SDValue DAGTypeLegalizer::SoftenFloatRes_FP_EXTEND(SDNode *N) { @@ -485,7 +509,10 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FP_EXTEND(SDNode *N) { RTLIB::Libcall LC = RTLIB::getFPEXT(Op.getValueType(), N->getValueType(0)); assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported FP_EXTEND!"); - return TLI.makeLibCall(DAG, LC, NVT, Op, false, SDLoc(N)).first; + TargetLowering::MakeLibCallOptions CallOptions; + EVT OpsVT[1] = { N->getOperand(0).getValueType() }; + CallOptions.setTypeListBeforeSoften(OpsVT, N->getValueType(0), true); + return TLI.makeLibCall(DAG, LC, NVT, Op, CallOptions, SDLoc(N)).first; } // FIXME: Should we just use 'normal' FP_EXTEND / FP_TRUNC instead of special @@ -493,15 +520,18 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FP_EXTEND(SDNode *N) { SDValue DAGTypeLegalizer::SoftenFloatRes_FP16_TO_FP(SDNode *N) { EVT MidVT = TLI.getTypeToTransformTo(*DAG.getContext(), MVT::f32); SDValue Op = N->getOperand(0); + TargetLowering::MakeLibCallOptions CallOptions; + EVT OpsVT[1] = { N->getOperand(0).getValueType() }; + CallOptions.setTypeListBeforeSoften(OpsVT, N->getValueType(0), true); SDValue Res32 = TLI.makeLibCall(DAG, RTLIB::FPEXT_F16_F32, MidVT, Op, - false, SDLoc(N)).first; + CallOptions, SDLoc(N)).first; if (N->getValueType(0) == MVT::f32) return Res32; EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); RTLIB::Libcall LC = RTLIB::getFPEXT(MVT::f32, N->getValueType(0)); assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported FP_EXTEND!"); - return TLI.makeLibCall(DAG, LC, NVT, Res32, false, SDLoc(N)).first; + return TLI.makeLibCall(DAG, LC, NVT, Res32, CallOptions, SDLoc(N)).first; } SDValue DAGTypeLegalizer::SoftenFloatRes_FP_ROUND(SDNode *N) { @@ -515,20 +545,27 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FP_ROUND(SDNode *N) { 
RTLIB::Libcall LC = RTLIB::getFPROUND(Op.getValueType(), N->getValueType(0)); assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported FP_ROUND!"); - return TLI.makeLibCall(DAG, LC, NVT, Op, false, SDLoc(N)).first; + TargetLowering::MakeLibCallOptions CallOptions; + EVT OpsVT[1] = { N->getOperand(0).getValueType() }; + CallOptions.setTypeListBeforeSoften(OpsVT, N->getValueType(0), true); + return TLI.makeLibCall(DAG, LC, NVT, Op, CallOptions, SDLoc(N)).first; } SDValue DAGTypeLegalizer::SoftenFloatRes_FPOW(SDNode *N) { EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); SDValue Ops[2] = { GetSoftenedFloat(N->getOperand(0)), GetSoftenedFloat(N->getOperand(1)) }; + TargetLowering::MakeLibCallOptions CallOptions; + EVT OpsVT[2] = { N->getOperand(0).getValueType(), + N->getOperand(1).getValueType() }; + CallOptions.setTypeListBeforeSoften(OpsVT, N->getValueType(0), true); return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0), RTLIB::POW_F32, RTLIB::POW_F64, RTLIB::POW_F80, RTLIB::POW_F128, RTLIB::POW_PPCF128), - NVT, Ops, false, SDLoc(N)).first; + NVT, Ops, CallOptions, SDLoc(N)).first; } SDValue DAGTypeLegalizer::SoftenFloatRes_FPOWI(SDNode *N) { @@ -536,87 +573,111 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FPOWI(SDNode *N) { "Unsupported power type!"); EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); SDValue Ops[2] = { GetSoftenedFloat(N->getOperand(0)), N->getOperand(1) }; + TargetLowering::MakeLibCallOptions CallOptions; + EVT OpsVT[2] = { N->getOperand(0).getValueType(), + N->getOperand(1).getValueType() }; + CallOptions.setTypeListBeforeSoften(OpsVT, N->getValueType(0), true); return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0), RTLIB::POWI_F32, RTLIB::POWI_F64, RTLIB::POWI_F80, RTLIB::POWI_F128, RTLIB::POWI_PPCF128), - NVT, Ops, false, SDLoc(N)).first; + NVT, Ops, CallOptions, SDLoc(N)).first; } SDValue DAGTypeLegalizer::SoftenFloatRes_FREM(SDNode *N) { EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); SDValue Ops[2] = { GetSoftenedFloat(N->getOperand(0)), GetSoftenedFloat(N->getOperand(1)) }; + TargetLowering::MakeLibCallOptions CallOptions; + EVT OpsVT[2] = { N->getOperand(0).getValueType(), + N->getOperand(1).getValueType() }; + CallOptions.setTypeListBeforeSoften(OpsVT, N->getValueType(0), true); return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0), RTLIB::REM_F32, RTLIB::REM_F64, RTLIB::REM_F80, RTLIB::REM_F128, RTLIB::REM_PPCF128), - NVT, Ops, false, SDLoc(N)).first; + NVT, Ops, CallOptions, SDLoc(N)).first; } SDValue DAGTypeLegalizer::SoftenFloatRes_FRINT(SDNode *N) { EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); SDValue Op = GetSoftenedFloat(N->getOperand(0)); + TargetLowering::MakeLibCallOptions CallOptions; + EVT OpsVT[1] = { N->getOperand(0).getValueType() }; + CallOptions.setTypeListBeforeSoften(OpsVT, N->getValueType(0), true); return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0), RTLIB::RINT_F32, RTLIB::RINT_F64, RTLIB::RINT_F80, RTLIB::RINT_F128, RTLIB::RINT_PPCF128), - NVT, Op, false, SDLoc(N)).first; + NVT, Op, CallOptions, SDLoc(N)).first; } SDValue DAGTypeLegalizer::SoftenFloatRes_FROUND(SDNode *N) { EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); SDValue Op = GetSoftenedFloat(N->getOperand(0)); + TargetLowering::MakeLibCallOptions CallOptions; + EVT OpsVT[1] = { N->getOperand(0).getValueType() }; + CallOptions.setTypeListBeforeSoften(OpsVT, N->getValueType(0), true); return TLI.makeLibCall(DAG, 
GetFPLibCall(N->getValueType(0), RTLIB::ROUND_F32, RTLIB::ROUND_F64, RTLIB::ROUND_F80, RTLIB::ROUND_F128, RTLIB::ROUND_PPCF128), - NVT, Op, false, SDLoc(N)).first; + NVT, Op, CallOptions, SDLoc(N)).first; } SDValue DAGTypeLegalizer::SoftenFloatRes_FSIN(SDNode *N) { EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); SDValue Op = GetSoftenedFloat(N->getOperand(0)); + TargetLowering::MakeLibCallOptions CallOptions; + EVT OpsVT[1] = { N->getOperand(0).getValueType() }; + CallOptions.setTypeListBeforeSoften(OpsVT, N->getValueType(0), true); return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0), RTLIB::SIN_F32, RTLIB::SIN_F64, RTLIB::SIN_F80, RTLIB::SIN_F128, RTLIB::SIN_PPCF128), - NVT, Op, false, SDLoc(N)).first; + NVT, Op, CallOptions, SDLoc(N)).first; } SDValue DAGTypeLegalizer::SoftenFloatRes_FSQRT(SDNode *N) { EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); SDValue Op = GetSoftenedFloat(N->getOperand(0)); + TargetLowering::MakeLibCallOptions CallOptions; + EVT OpsVT[1] = { N->getOperand(0).getValueType() }; + CallOptions.setTypeListBeforeSoften(OpsVT, N->getValueType(0), true); return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0), RTLIB::SQRT_F32, RTLIB::SQRT_F64, RTLIB::SQRT_F80, RTLIB::SQRT_F128, RTLIB::SQRT_PPCF128), - NVT, Op, false, SDLoc(N)).first; + NVT, Op, CallOptions, SDLoc(N)).first; } SDValue DAGTypeLegalizer::SoftenFloatRes_FSUB(SDNode *N) { EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); SDValue Ops[2] = { GetSoftenedFloat(N->getOperand(0)), GetSoftenedFloat(N->getOperand(1)) }; + TargetLowering::MakeLibCallOptions CallOptions; + EVT OpsVT[2] = { N->getOperand(0).getValueType(), + N->getOperand(1).getValueType() }; + CallOptions.setTypeListBeforeSoften(OpsVT, N->getValueType(0), true); return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0), RTLIB::SUB_F32, RTLIB::SUB_F64, RTLIB::SUB_F80, RTLIB::SUB_F128, RTLIB::SUB_PPCF128), - NVT, Ops, false, SDLoc(N)).first; + NVT, Ops, CallOptions, SDLoc(N)).first; } SDValue DAGTypeLegalizer::SoftenFloatRes_FTRUNC(SDNode *N) { @@ -625,17 +686,19 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FTRUNC(SDNode *N) { return DAG.getNode(ISD::FP_TO_FP16, SDLoc(N), NVT, N->getOperand(0)); SDValue Op = GetSoftenedFloat(N->getOperand(0)); + TargetLowering::MakeLibCallOptions CallOptions; + EVT OpsVT[1] = { N->getOperand(0).getValueType() }; + CallOptions.setTypeListBeforeSoften(OpsVT, N->getValueType(0), true); return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0), RTLIB::TRUNC_F32, RTLIB::TRUNC_F64, RTLIB::TRUNC_F80, RTLIB::TRUNC_F128, RTLIB::TRUNC_PPCF128), - NVT, Op, false, SDLoc(N)).first; + NVT, Op, CallOptions, SDLoc(N)).first; } -SDValue DAGTypeLegalizer::SoftenFloatRes_LOAD(SDNode *N, unsigned ResNo) { - bool LegalInHWReg = isLegalInHWReg(N->getValueType(ResNo)); +SDValue DAGTypeLegalizer::SoftenFloatRes_LOAD(SDNode *N) { LoadSDNode *L = cast(N); EVT VT = N->getValueType(0); EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT); @@ -666,23 +729,17 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_LOAD(SDNode *N, unsigned ResNo) { // use the new one. 
ReplaceValueWith(SDValue(N, 1), NewL.getValue(1)); auto ExtendNode = DAG.getNode(ISD::FP_EXTEND, dl, VT, NewL); - if (LegalInHWReg) - return ExtendNode; return BitConvertToInteger(ExtendNode); } -SDValue DAGTypeLegalizer::SoftenFloatRes_SELECT(SDNode *N, unsigned ResNo) { - if (isLegalInHWReg(N->getValueType(ResNo))) - return SDValue(N, ResNo); +SDValue DAGTypeLegalizer::SoftenFloatRes_SELECT(SDNode *N) { SDValue LHS = GetSoftenedFloat(N->getOperand(1)); SDValue RHS = GetSoftenedFloat(N->getOperand(2)); return DAG.getSelect(SDLoc(N), LHS.getValueType(), N->getOperand(0), LHS, RHS); } -SDValue DAGTypeLegalizer::SoftenFloatRes_SELECT_CC(SDNode *N, unsigned ResNo) { - if (isLegalInHWReg(N->getValueType(ResNo))) - return SDValue(N, ResNo); +SDValue DAGTypeLegalizer::SoftenFloatRes_SELECT_CC(SDNode *N) { SDValue LHS = GetSoftenedFloat(N->getOperand(2)); SDValue RHS = GetSoftenedFloat(N->getOperand(3)); return DAG.getNode(ISD::SELECT_CC, SDLoc(N), @@ -736,14 +793,18 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_XINT_TO_FP(SDNode *N) { // Sign/zero extend the argument if the libcall takes a larger type. SDValue Op = DAG.getNode(Signed ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, dl, NVT, N->getOperand(0)); + TargetLowering::MakeLibCallOptions CallOptions; + CallOptions.setSExt(Signed); + EVT OpsVT[1] = { N->getOperand(0).getValueType() }; + CallOptions.setTypeListBeforeSoften(OpsVT, N->getValueType(0), true); return TLI.makeLibCall(DAG, LC, TLI.getTypeToTransformTo(*DAG.getContext(), RVT), - Op, Signed, dl).first; + Op, CallOptions, dl).first; } //===----------------------------------------------------------------------===// -// Convert Float Operand to Integer for Non-HW-supported Operations. +// Convert Float Operand to Integer //===----------------------------------------------------------------------===// bool DAGTypeLegalizer::SoftenFloatOperand(SDNode *N, unsigned OpNo) { @@ -753,8 +814,6 @@ bool DAGTypeLegalizer::SoftenFloatOperand(SDNode *N, unsigned OpNo) { switch (N->getOpcode()) { default: - if (CanSkipSoftenFloatOperand(N, OpNo)) - return false; #ifndef NDEBUG dbgs() << "SoftenFloatOperand Op #" << OpNo << ": "; N->dump(&DAG); dbgs() << "\n"; @@ -762,11 +821,7 @@ bool DAGTypeLegalizer::SoftenFloatOperand(SDNode *N, unsigned OpNo) { llvm_unreachable("Do not know how to soften this operator's operand!"); case ISD::BITCAST: Res = SoftenFloatOp_BITCAST(N); break; - case ISD::CopyToReg: Res = SoftenFloatOp_COPY_TO_REG(N); break; case ISD::BR_CC: Res = SoftenFloatOp_BR_CC(N); break; - case ISD::FABS: Res = SoftenFloatOp_FABS(N); break; - case ISD::FCOPYSIGN: Res = SoftenFloatOp_FCOPYSIGN(N); break; - case ISD::FNEG: Res = SoftenFloatOp_FNEG(N); break; case ISD::FP_EXTEND: Res = SoftenFloatOp_FP_EXTEND(N); break; case ISD::FP_TO_FP16: // Same as FP_ROUND for softening purposes case ISD::FP_ROUND: Res = SoftenFloatOp_FP_ROUND(N); break; @@ -776,19 +831,9 @@ bool DAGTypeLegalizer::SoftenFloatOperand(SDNode *N, unsigned OpNo) { case ISD::LLROUND: Res = SoftenFloatOp_LLROUND(N); break; case ISD::LRINT: Res = SoftenFloatOp_LRINT(N); break; case ISD::LLRINT: Res = SoftenFloatOp_LLRINT(N); break; - case ISD::SELECT: Res = SoftenFloatOp_SELECT(N); break; case ISD::SELECT_CC: Res = SoftenFloatOp_SELECT_CC(N); break; case ISD::SETCC: Res = SoftenFloatOp_SETCC(N); break; - case ISD::STORE: - Res = SoftenFloatOp_STORE(N, OpNo); - // Do not try to analyze or soften this node again if the value is - // or can be held in a register. In that case, Res.getNode() should - // be equal to N. 
- if (Res.getNode() == N && - isLegalInHWReg(N->getOperand(OpNo).getValueType())) - return false; - // Otherwise, we need to reanalyze and lower the new Res nodes. - break; + case ISD::STORE: Res = SoftenFloatOp_STORE(N, OpNo); break; } // If the result is null, the sub-method took care of registering results etc. @@ -800,60 +845,16 @@ bool DAGTypeLegalizer::SoftenFloatOperand(SDNode *N, unsigned OpNo) { return true; assert(Res.getValueType() == N->getValueType(0) && N->getNumValues() == 1 && - "Invalid operand expansion"); + "Invalid operand promotion"); ReplaceValueWith(SDValue(N, 0), Res); return false; } -bool DAGTypeLegalizer::CanSkipSoftenFloatOperand(SDNode *N, unsigned OpNo) { - if (!isLegalInHWReg(N->getOperand(OpNo).getValueType())) - return false; - - // When the operand type can be kept in registers there is nothing to do for - // the following opcodes. - switch (N->getOperand(OpNo).getOpcode()) { - case ISD::BITCAST: - case ISD::ConstantFP: - case ISD::CopyFromReg: - case ISD::CopyToReg: - case ISD::FABS: - case ISD::FCOPYSIGN: - case ISD::FNEG: - case ISD::Register: - case ISD::SELECT: - case ISD::SELECT_CC: - return true; - } - - switch (N->getOpcode()) { - case ISD::ConstantFP: // Leaf node. - case ISD::CopyFromReg: // Operand is a register that we know to be left - // unchanged by SoftenFloatResult(). - case ISD::Register: // Leaf node. - return true; - } - return false; -} - SDValue DAGTypeLegalizer::SoftenFloatOp_BITCAST(SDNode *N) { - return DAG.getNode(ISD::BITCAST, SDLoc(N), N->getValueType(0), - GetSoftenedFloat(N->getOperand(0))); -} - -SDValue DAGTypeLegalizer::SoftenFloatOp_COPY_TO_REG(SDNode *N) { - SDValue Op1 = GetSoftenedFloat(N->getOperand(1)); - SDValue Op2 = GetSoftenedFloat(N->getOperand(2)); - - if (Op1 == N->getOperand(1) && Op2 == N->getOperand(2)) - return SDValue(); - - if (N->getNumOperands() == 3) - return SDValue(DAG.UpdateNodeOperands(N, N->getOperand(0), Op1, Op2), 0); + SDValue Op0 = GetSoftenedFloat(N->getOperand(0)); - return SDValue(DAG.UpdateNodeOperands(N, N->getOperand(0), Op1, Op2, - N->getOperand(3)), - 0); + return DAG.getNode(ISD::BITCAST, SDLoc(N), N->getValueType(0), Op0); } SDValue DAGTypeLegalizer::SoftenFloatOp_FP_EXTEND(SDNode *N) { @@ -868,7 +869,10 @@ SDValue DAGTypeLegalizer::SoftenFloatOp_FP_EXTEND(SDNode *N) { RTLIB::Libcall LC = RTLIB::getFPEXT(SVT, RVT); assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported FP_EXTEND libcall"); - return TLI.makeLibCall(DAG, LC, RVT, Op, false, SDLoc(N)).first; + TargetLowering::MakeLibCallOptions CallOptions; + EVT OpsVT[1] = { N->getOperand(0).getValueType() }; + CallOptions.setTypeListBeforeSoften(OpsVT, N->getValueType(0), true); + return TLI.makeLibCall(DAG, LC, RVT, Op, CallOptions, SDLoc(N)).first; } @@ -885,7 +889,10 @@ SDValue DAGTypeLegalizer::SoftenFloatOp_FP_ROUND(SDNode *N) { assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported FP_ROUND libcall"); SDValue Op = GetSoftenedFloat(N->getOperand(0)); - return TLI.makeLibCall(DAG, LC, RVT, Op, false, SDLoc(N)).first; + TargetLowering::MakeLibCallOptions CallOptions; + EVT OpsVT[1] = { N->getOperand(0).getValueType() }; + CallOptions.setTypeListBeforeSoften(OpsVT, N->getValueType(0), true); + return TLI.makeLibCall(DAG, LC, RVT, Op, CallOptions, SDLoc(N)).first; } SDValue DAGTypeLegalizer::SoftenFloatOp_BR_CC(SDNode *N) { @@ -895,7 +902,8 @@ SDValue DAGTypeLegalizer::SoftenFloatOp_BR_CC(SDNode *N) { EVT VT = NewLHS.getValueType(); NewLHS = GetSoftenedFloat(NewLHS); NewRHS = GetSoftenedFloat(NewRHS); - TLI.softenSetCCOperands(DAG, VT, 
NewLHS, NewRHS, CCCode, SDLoc(N)); + TLI.softenSetCCOperands(DAG, VT, NewLHS, NewRHS, CCCode, SDLoc(N), + N->getOperand(2), N->getOperand(3)); // If softenSetCCOperands returned a scalar, we need to compare the result // against zero to select between true and false values. @@ -911,34 +919,6 @@ SDValue DAGTypeLegalizer::SoftenFloatOp_BR_CC(SDNode *N) { 0); } -SDValue DAGTypeLegalizer::SoftenFloatOp_FABS(SDNode *N) { - SDValue Op = GetSoftenedFloat(N->getOperand(0)); - - if (Op == N->getOperand(0)) - return SDValue(); - - return SDValue(DAG.UpdateNodeOperands(N, Op), 0); -} - -SDValue DAGTypeLegalizer::SoftenFloatOp_FCOPYSIGN(SDNode *N) { - SDValue Op0 = GetSoftenedFloat(N->getOperand(0)); - SDValue Op1 = GetSoftenedFloat(N->getOperand(1)); - - if (Op0 == N->getOperand(0) && Op1 == N->getOperand(1)) - return SDValue(); - - return SDValue(DAG.UpdateNodeOperands(N, Op0, Op1), 0); -} - -SDValue DAGTypeLegalizer::SoftenFloatOp_FNEG(SDNode *N) { - SDValue Op = GetSoftenedFloat(N->getOperand(0)); - - if (Op == N->getOperand(0)) - return SDValue(); - - return SDValue(DAG.UpdateNodeOperands(N, Op), 0); -} - SDValue DAGTypeLegalizer::SoftenFloatOp_FP_TO_XINT(SDNode *N) { bool Signed = N->getOpcode() == ISD::FP_TO_SINT; EVT SVT = N->getOperand(0).getValueType(); @@ -962,23 +942,15 @@ SDValue DAGTypeLegalizer::SoftenFloatOp_FP_TO_XINT(SDNode *N) { assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported FP_TO_XINT!"); SDValue Op = GetSoftenedFloat(N->getOperand(0)); - SDValue Res = TLI.makeLibCall(DAG, LC, NVT, Op, false, dl).first; + TargetLowering::MakeLibCallOptions CallOptions; + EVT OpsVT[1] = { N->getOperand(0).getValueType() }; + CallOptions.setTypeListBeforeSoften(OpsVT, N->getValueType(0), true); + SDValue Res = TLI.makeLibCall(DAG, LC, NVT, Op, CallOptions, dl).first; // Truncate the result if the libcall returns a larger type. return DAG.getNode(ISD::TRUNCATE, dl, RVT, Res); } -SDValue DAGTypeLegalizer::SoftenFloatOp_SELECT(SDNode *N) { - SDValue Op1 = GetSoftenedFloat(N->getOperand(1)); - SDValue Op2 = GetSoftenedFloat(N->getOperand(2)); - - if (Op1 == N->getOperand(1) && Op2 == N->getOperand(2)) - return SDValue(); - - return SDValue(DAG.UpdateNodeOperands(N, N->getOperand(0), Op1, Op2), - 0); -} - SDValue DAGTypeLegalizer::SoftenFloatOp_SELECT_CC(SDNode *N) { SDValue NewLHS = N->getOperand(0), NewRHS = N->getOperand(1); ISD::CondCode CCCode = cast(N->getOperand(4))->get(); @@ -986,7 +958,8 @@ SDValue DAGTypeLegalizer::SoftenFloatOp_SELECT_CC(SDNode *N) { EVT VT = NewLHS.getValueType(); NewLHS = GetSoftenedFloat(NewLHS); NewRHS = GetSoftenedFloat(NewRHS); - TLI.softenSetCCOperands(DAG, VT, NewLHS, NewRHS, CCCode, SDLoc(N)); + TLI.softenSetCCOperands(DAG, VT, NewLHS, NewRHS, CCCode, SDLoc(N), + N->getOperand(0), N->getOperand(1)); // If softenSetCCOperands returned a scalar, we need to compare the result // against zero to select between true and false values. @@ -1009,7 +982,8 @@ SDValue DAGTypeLegalizer::SoftenFloatOp_SETCC(SDNode *N) { EVT VT = NewLHS.getValueType(); NewLHS = GetSoftenedFloat(NewLHS); NewRHS = GetSoftenedFloat(NewRHS); - TLI.softenSetCCOperands(DAG, VT, NewLHS, NewRHS, CCCode, SDLoc(N)); + TLI.softenSetCCOperands(DAG, VT, NewLHS, NewRHS, CCCode, SDLoc(N), + N->getOperand(0), N->getOperand(1)); // If softenSetCCOperands returned a scalar, use it. 
if (!NewRHS.getNode()) { @@ -1047,13 +1021,16 @@ SDValue DAGTypeLegalizer::SoftenFloatOp_LROUND(SDNode *N) { SDValue Op = GetSoftenedFloat(N->getOperand(0)); EVT RetVT = N->getOperand(0).getValueType().getSimpleVT().SimpleTy; + TargetLowering::MakeLibCallOptions CallOptions; + EVT OpsVT[1] = { N->getOperand(0).getValueType() }; + CallOptions.setTypeListBeforeSoften(OpsVT, N->getValueType(0), true); return TLI.makeLibCall(DAG, GetFPLibCall(RetVT, RTLIB::LROUND_F32, RTLIB::LROUND_F64, RTLIB::LROUND_F80, RTLIB::LROUND_F128, RTLIB::LROUND_PPCF128), - NVT, Op, false, SDLoc(N)).first; + NVT, Op, CallOptions, SDLoc(N)).first; } SDValue DAGTypeLegalizer::SoftenFloatOp_LLROUND(SDNode *N) { @@ -1061,13 +1038,16 @@ SDValue DAGTypeLegalizer::SoftenFloatOp_LLROUND(SDNode *N) { SDValue Op = GetSoftenedFloat(N->getOperand(0)); EVT RetVT = N->getOperand(0).getValueType().getSimpleVT().SimpleTy; + TargetLowering::MakeLibCallOptions CallOptions; + EVT OpsVT[1] = { N->getOperand(0).getValueType() }; + CallOptions.setTypeListBeforeSoften(OpsVT, N->getValueType(0), true); return TLI.makeLibCall(DAG, GetFPLibCall(RetVT, RTLIB::LLROUND_F32, RTLIB::LLROUND_F64, RTLIB::LLROUND_F80, RTLIB::LLROUND_F128, RTLIB::LLROUND_PPCF128), - NVT, Op, false, SDLoc(N)).first; + NVT, Op, CallOptions, SDLoc(N)).first; } SDValue DAGTypeLegalizer::SoftenFloatOp_LRINT(SDNode *N) { @@ -1075,13 +1055,16 @@ SDValue DAGTypeLegalizer::SoftenFloatOp_LRINT(SDNode *N) { SDValue Op = GetSoftenedFloat(N->getOperand(0)); EVT RetVT = N->getOperand(0).getValueType().getSimpleVT().SimpleTy; + TargetLowering::MakeLibCallOptions CallOptions; + EVT OpsVT[1] = { N->getOperand(0).getValueType() }; + CallOptions.setTypeListBeforeSoften(OpsVT, N->getValueType(0), true); return TLI.makeLibCall(DAG, GetFPLibCall(RetVT, RTLIB::LRINT_F32, RTLIB::LRINT_F64, RTLIB::LRINT_F80, RTLIB::LRINT_F128, RTLIB::LRINT_PPCF128), - NVT, Op, false, SDLoc(N)).first; + NVT, Op, CallOptions, SDLoc(N)).first; } SDValue DAGTypeLegalizer::SoftenFloatOp_LLRINT(SDNode *N) { @@ -1089,13 +1072,16 @@ SDValue DAGTypeLegalizer::SoftenFloatOp_LLRINT(SDNode *N) { SDValue Op = GetSoftenedFloat(N->getOperand(0)); EVT RetVT = N->getOperand(0).getValueType().getSimpleVT().SimpleTy; + TargetLowering::MakeLibCallOptions CallOptions; + EVT OpsVT[1] = { N->getOperand(0).getValueType() }; + CallOptions.setTypeListBeforeSoften(OpsVT, N->getValueType(0), true); return TLI.makeLibCall(DAG, GetFPLibCall(RetVT, RTLIB::LLRINT_F32, RTLIB::LLRINT_F64, RTLIB::LLRINT_F80, RTLIB::LLRINT_F128, RTLIB::LLRINT_PPCF128), - NVT, Op, false, SDLoc(N)).first; + NVT, Op, CallOptions, SDLoc(N)).first; } //===----------------------------------------------------------------------===// @@ -1267,13 +1253,14 @@ void DAGTypeLegalizer::ExpandFloatRes_FCOS(SDNode *N, void DAGTypeLegalizer::ExpandFloatRes_FDIV(SDNode *N, SDValue &Lo, SDValue &Hi) { SDValue Ops[2] = { N->getOperand(0), N->getOperand(1) }; + TargetLowering::MakeLibCallOptions CallOptions; SDValue Call = TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0), RTLIB::DIV_F32, RTLIB::DIV_F64, RTLIB::DIV_F80, RTLIB::DIV_F128, RTLIB::DIV_PPCF128), - N->getValueType(0), Ops, false, + N->getValueType(0), Ops, CallOptions, SDLoc(N)).first; GetPairElements(Call, Lo, Hi); } @@ -1341,13 +1328,14 @@ void DAGTypeLegalizer::ExpandFloatRes_FLOG10(SDNode *N, void DAGTypeLegalizer::ExpandFloatRes_FMA(SDNode *N, SDValue &Lo, SDValue &Hi) { SDValue Ops[3] = { N->getOperand(0), N->getOperand(1), N->getOperand(2) }; + TargetLowering::MakeLibCallOptions CallOptions; SDValue Call = 
TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0), RTLIB::FMA_F32, RTLIB::FMA_F64, RTLIB::FMA_F80, RTLIB::FMA_F128, RTLIB::FMA_PPCF128), - N->getValueType(0), Ops, false, + N->getValueType(0), Ops, CallOptions, SDLoc(N)).first; GetPairElements(Call, Lo, Hi); } @@ -1355,13 +1343,14 @@ void DAGTypeLegalizer::ExpandFloatRes_FMA(SDNode *N, SDValue &Lo, void DAGTypeLegalizer::ExpandFloatRes_FMUL(SDNode *N, SDValue &Lo, SDValue &Hi) { SDValue Ops[2] = { N->getOperand(0), N->getOperand(1) }; + TargetLowering::MakeLibCallOptions CallOptions; SDValue Call = TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0), RTLIB::MUL_F32, RTLIB::MUL_F64, RTLIB::MUL_F80, RTLIB::MUL_F128, RTLIB::MUL_PPCF128), - N->getValueType(0), Ops, false, + N->getValueType(0), Ops, CallOptions, SDLoc(N)).first; GetPairElements(Call, Lo, Hi); } @@ -1470,13 +1459,14 @@ void DAGTypeLegalizer::ExpandFloatRes_FSQRT(SDNode *N, void DAGTypeLegalizer::ExpandFloatRes_FSUB(SDNode *N, SDValue &Lo, SDValue &Hi) { SDValue Ops[2] = { N->getOperand(0), N->getOperand(1) }; + TargetLowering::MakeLibCallOptions CallOptions; SDValue Call = TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0), RTLIB::SUB_F32, RTLIB::SUB_F64, RTLIB::SUB_F80, RTLIB::SUB_F128, RTLIB::SUB_PPCF128), - N->getValueType(0), Ops, false, + N->getValueType(0), Ops, CallOptions, SDLoc(N)).first; GetPairElements(Call, Lo, Hi); } @@ -1555,7 +1545,9 @@ void DAGTypeLegalizer::ExpandFloatRes_XINT_TO_FP(SDNode *N, SDValue &Lo, } assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported XINT_TO_FP!"); - Hi = TLI.makeLibCall(DAG, LC, VT, Src, true, dl).first; + TargetLowering::MakeLibCallOptions CallOptions; + CallOptions.setSExt(true); + Hi = TLI.makeLibCall(DAG, LC, VT, Src, CallOptions, dl).first; GetPairElements(Hi, Lo, Hi); } @@ -1732,7 +1724,8 @@ SDValue DAGTypeLegalizer::ExpandFloatOp_FP_TO_SINT(SDNode *N) { RTLIB::Libcall LC = RTLIB::getFPTOSINT(N->getOperand(0).getValueType(), RVT); assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported FP_TO_SINT!"); - return TLI.makeLibCall(DAG, LC, RVT, N->getOperand(0), false, dl).first; + TargetLowering::MakeLibCallOptions CallOptions; + return TLI.makeLibCall(DAG, LC, RVT, N->getOperand(0), CallOptions, dl).first; } SDValue DAGTypeLegalizer::ExpandFloatOp_FP_TO_UINT(SDNode *N) { @@ -1741,8 +1734,9 @@ SDValue DAGTypeLegalizer::ExpandFloatOp_FP_TO_UINT(SDNode *N) { RTLIB::Libcall LC = RTLIB::getFPTOUINT(N->getOperand(0).getValueType(), RVT); assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported FP_TO_UINT!"); + TargetLowering::MakeLibCallOptions CallOptions; return TLI.makeLibCall(DAG, LC, N->getValueType(0), N->getOperand(0), - false, dl).first; + CallOptions, dl).first; } SDValue DAGTypeLegalizer::ExpandFloatOp_SELECT_CC(SDNode *N) { @@ -1807,49 +1801,53 @@ SDValue DAGTypeLegalizer::ExpandFloatOp_STORE(SDNode *N, unsigned OpNo) { SDValue DAGTypeLegalizer::ExpandFloatOp_LROUND(SDNode *N) { EVT RVT = N->getValueType(0); EVT RetVT = N->getOperand(0).getValueType().getSimpleVT().SimpleTy; + TargetLowering::MakeLibCallOptions CallOptions; return TLI.makeLibCall(DAG, GetFPLibCall(RetVT, RTLIB::LROUND_F32, RTLIB::LROUND_F64, RTLIB::LROUND_F80, RTLIB::LROUND_F128, RTLIB::LROUND_PPCF128), - RVT, N->getOperand(0), false, SDLoc(N)).first; + RVT, N->getOperand(0), CallOptions, SDLoc(N)).first; } SDValue DAGTypeLegalizer::ExpandFloatOp_LLROUND(SDNode *N) { EVT RVT = N->getValueType(0); EVT RetVT = N->getOperand(0).getValueType().getSimpleVT().SimpleTy; + TargetLowering::MakeLibCallOptions CallOptions; return TLI.makeLibCall(DAG, GetFPLibCall(RetVT, 
RTLIB::LLROUND_F32, RTLIB::LLROUND_F64, RTLIB::LLROUND_F80, RTLIB::LLROUND_F128, RTLIB::LLROUND_PPCF128), - RVT, N->getOperand(0), false, SDLoc(N)).first; + RVT, N->getOperand(0), CallOptions, SDLoc(N)).first; } SDValue DAGTypeLegalizer::ExpandFloatOp_LRINT(SDNode *N) { EVT RVT = N->getValueType(0); EVT RetVT = N->getOperand(0).getValueType().getSimpleVT().SimpleTy; + TargetLowering::MakeLibCallOptions CallOptions; return TLI.makeLibCall(DAG, GetFPLibCall(RetVT, RTLIB::LRINT_F32, RTLIB::LRINT_F64, RTLIB::LRINT_F80, RTLIB::LRINT_F128, RTLIB::LRINT_PPCF128), - RVT, N->getOperand(0), false, SDLoc(N)).first; + RVT, N->getOperand(0), CallOptions, SDLoc(N)).first; } SDValue DAGTypeLegalizer::ExpandFloatOp_LLRINT(SDNode *N) { EVT RVT = N->getValueType(0); EVT RetVT = N->getOperand(0).getValueType().getSimpleVT().SimpleTy; + TargetLowering::MakeLibCallOptions CallOptions; return TLI.makeLibCall(DAG, GetFPLibCall(RetVT, RTLIB::LLRINT_F32, RTLIB::LLRINT_F64, RTLIB::LLRINT_F80, RTLIB::LLRINT_F128, RTLIB::LLRINT_PPCF128), - RVT, N->getOperand(0), false, SDLoc(N)).first; + RVT, N->getOperand(0), CallOptions, SDLoc(N)).first; } //===----------------------------------------------------------------------===// @@ -2002,6 +2000,12 @@ void DAGTypeLegalizer::PromoteFloatResult(SDNode *N, unsigned ResNo) { dbgs() << "\n"); SDValue R = SDValue(); + // See if the target wants to custom expand this node. + if (CustomLowerNode(N, N->getValueType(ResNo), true)) { + LLVM_DEBUG(dbgs() << "Node has been custom expanded, done\n"); + return; + } + switch (N->getOpcode()) { // These opcodes cannot appear if promotion of FP16 is done in the backend // instead of Clang diff --git a/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp index 15ac45c37c66..d5c1b539adbd 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp @@ -100,6 +100,8 @@ void DAGTypeLegalizer::PromoteIntegerResult(SDNode *N, unsigned ResNo) { Res = PromoteIntRes_BUILD_VECTOR(N); break; case ISD::SCALAR_TO_VECTOR: Res = PromoteIntRes_SCALAR_TO_VECTOR(N); break; + case ISD::SPLAT_VECTOR: + Res = PromoteIntRes_SPLAT_VECTOR(N); break; case ISD::CONCAT_VECTORS: Res = PromoteIntRes_CONCAT_VECTORS(N); break; @@ -112,6 +114,8 @@ void DAGTypeLegalizer::PromoteIntegerResult(SDNode *N, unsigned ResNo) { case ISD::ZERO_EXTEND: case ISD::ANY_EXTEND: Res = PromoteIntRes_INT_EXTEND(N); break; + case ISD::STRICT_FP_TO_SINT: + case ISD::STRICT_FP_TO_UINT: case ISD::FP_TO_SINT: case ISD::FP_TO_UINT: Res = PromoteIntRes_FP_TO_XINT(N); break; @@ -148,9 +152,12 @@ void DAGTypeLegalizer::PromoteIntegerResult(SDNode *N, unsigned ResNo) { case ISD::UADDSAT: case ISD::SSUBSAT: case ISD::USUBSAT: Res = PromoteIntRes_ADDSUBSAT(N); break; + case ISD::SMULFIX: case ISD::SMULFIXSAT: - case ISD::UMULFIX: Res = PromoteIntRes_MULFIX(N); break; + case ISD::UMULFIX: + case ISD::UMULFIXSAT: Res = PromoteIntRes_MULFIX(N); break; + case ISD::ABS: Res = PromoteIntRes_ABS(N); break; case ISD::ATOMIC_LOAD: @@ -494,7 +501,20 @@ SDValue DAGTypeLegalizer::PromoteIntRes_FP_TO_XINT(SDNode *N) { TLI.isOperationLegalOrCustom(ISD::FP_TO_SINT, NVT)) NewOpc = ISD::FP_TO_SINT; - SDValue Res = DAG.getNode(NewOpc, dl, NVT, N->getOperand(0)); + if (N->getOpcode() == ISD::STRICT_FP_TO_UINT && + !TLI.isOperationLegal(ISD::STRICT_FP_TO_UINT, NVT) && + TLI.isOperationLegalOrCustom(ISD::STRICT_FP_TO_SINT, NVT)) + NewOpc = ISD::STRICT_FP_TO_SINT; + + SDValue Res; + if (N->isStrictFPOpcode()) { + Res = 
DAG.getNode(NewOpc, dl, { NVT, MVT::Other }, + { N->getOperand(0), N->getOperand(1) }); + // Legalize the chain result - switch anything that used the old chain to + // use the new one. + ReplaceValueWith(SDValue(N, 1), Res.getValue(1)); + } else + Res = DAG.getNode(NewOpc, dl, NVT, N->getOperand(0)); // Assert that the converted value fits in the original type. If it doesn't // (eg: because the value being converted is too big), then the result of the @@ -503,7 +523,8 @@ SDValue DAGTypeLegalizer::PromoteIntRes_FP_TO_XINT(SDNode *N) { // NOTE: fp-to-uint to fp-to-sint promotion guarantees zero extend. For example: // before legalization: fp-to-uint16, 65534. -> 0xfffe // after legalization: fp-to-sint32, 65534. -> 0x0000fffe - return DAG.getNode(N->getOpcode() == ISD::FP_TO_UINT ? + return DAG.getNode((N->getOpcode() == ISD::FP_TO_UINT || + N->getOpcode() == ISD::STRICT_FP_TO_UINT) ? ISD::AssertZext : ISD::AssertSext, dl, NVT, Res, DAG.getValueType(N->getValueType(0).getScalarType())); } @@ -590,7 +611,7 @@ SDValue DAGTypeLegalizer::PromoteIntRes_MGATHER(MaskedGatherSDNode *N) { N->getIndex(), N->getScale() }; SDValue Res = DAG.getMaskedGather(DAG.getVTList(NVT, MVT::Other), N->getMemoryVT(), dl, Ops, - N->getMemOperand()); + N->getMemOperand(), N->getIndexType()); // Legalize the chain result - switch anything that used the old chain to // use the new one. ReplaceValueWith(SDValue(N, 1), Res.getValue(1)); @@ -623,48 +644,84 @@ SDValue DAGTypeLegalizer::PromoteIntRes_Overflow(SDNode *N) { } SDValue DAGTypeLegalizer::PromoteIntRes_ADDSUBSAT(SDNode *N) { - // For promoting iN -> iM, this can be expanded by - // 1. ANY_EXTEND iN to iM - // 2. SHL by M-N - // 3. [US][ADD|SUB]SAT - // 4. L/ASHR by M-N + // If the promoted type is legal, we can convert this to: + // 1. ANY_EXTEND iN to iM + // 2. SHL by M-N + // 3. [US][ADD|SUB]SAT + // 4. L/ASHR by M-N + // Else it is more efficient to convert this to a min and a max + // operation in the higher precision arithmetic. 
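// ---------------------------------------------------------------------------
// [Editor's illustration, not part of the patch.] A minimal standalone sketch
// of the min/max clamping identity described in the comment above: when the
// saturating operation is not legal at the promoted width, iN saturating
// add/sub can be done in the wider iM type with a plain add/sub followed by
// min/max against the iN limits. Plain C++ integers stand in for the promoted
// SDValues; the helper names below are illustrative only.
#include <algorithm>
#include <cstdint>

// i8 saddsat promoted to i32: add in the wide type, then clamp to [-128, 127]
// (SMIN with SatMax, then SMAX with SatMin, as in the expansion above).
static int8_t saddsat_i8_via_i32(int8_t A, int8_t B) {
  int32_t Wide = int32_t(A) + int32_t(B);               // SExt operands + ADD
  Wide = std::min(Wide, int32_t(INT8_MAX));             // clamp to SatMax
  Wide = std::max(Wide, int32_t(INT8_MIN));             // clamp to SatMin
  return int8_t(Wide);                                  // truncate back to i8
}

// i8 uaddsat promoted to i32: only the upper clamp (UMIN with SatMax) is
// needed, since the zero-extended sum cannot go below zero.
static uint8_t uaddsat_i8_via_i32(uint8_t A, uint8_t B) {
  uint32_t Wide = uint32_t(A) + uint32_t(B);            // ZExt operands + ADD
  return uint8_t(std::min(Wide, uint32_t(UINT8_MAX)));  // clamp to SatMax
}
// ---------------------------------------------------------------------------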
SDLoc dl(N); SDValue Op1 = N->getOperand(0); SDValue Op2 = N->getOperand(1); unsigned OldBits = Op1.getScalarValueSizeInBits(); unsigned Opcode = N->getOpcode(); - unsigned ShiftOp; - switch (Opcode) { - case ISD::SADDSAT: - case ISD::SSUBSAT: - ShiftOp = ISD::SRA; - break; - case ISD::UADDSAT: - case ISD::USUBSAT: - ShiftOp = ISD::SRL; - break; - default: - llvm_unreachable("Expected opcode to be signed or unsigned saturation " - "addition or subtraction"); - } - - SDValue Op1Promoted = GetPromotedInteger(Op1); - SDValue Op2Promoted = GetPromotedInteger(Op2); + SDValue Op1Promoted, Op2Promoted; + if (Opcode == ISD::UADDSAT || Opcode == ISD::USUBSAT) { + Op1Promoted = ZExtPromotedInteger(Op1); + Op2Promoted = ZExtPromotedInteger(Op2); + } else { + Op1Promoted = SExtPromotedInteger(Op1); + Op2Promoted = SExtPromotedInteger(Op2); + } EVT PromotedType = Op1Promoted.getValueType(); unsigned NewBits = PromotedType.getScalarSizeInBits(); - unsigned SHLAmount = NewBits - OldBits; - EVT SHVT = TLI.getShiftAmountTy(PromotedType, DAG.getDataLayout()); - SDValue ShiftAmount = DAG.getConstant(SHLAmount, dl, SHVT); - Op1Promoted = - DAG.getNode(ISD::SHL, dl, PromotedType, Op1Promoted, ShiftAmount); - Op2Promoted = - DAG.getNode(ISD::SHL, dl, PromotedType, Op2Promoted, ShiftAmount); - SDValue Result = - DAG.getNode(Opcode, dl, PromotedType, Op1Promoted, Op2Promoted); - return DAG.getNode(ShiftOp, dl, PromotedType, Result, ShiftAmount); + if (TLI.isOperationLegalOrCustom(Opcode, PromotedType)) { + unsigned ShiftOp; + switch (Opcode) { + case ISD::SADDSAT: + case ISD::SSUBSAT: + ShiftOp = ISD::SRA; + break; + case ISD::UADDSAT: + case ISD::USUBSAT: + ShiftOp = ISD::SRL; + break; + default: + llvm_unreachable("Expected opcode to be signed or unsigned saturation " + "addition or subtraction"); + } + + unsigned SHLAmount = NewBits - OldBits; + EVT SHVT = TLI.getShiftAmountTy(PromotedType, DAG.getDataLayout()); + SDValue ShiftAmount = DAG.getConstant(SHLAmount, dl, SHVT); + Op1Promoted = + DAG.getNode(ISD::SHL, dl, PromotedType, Op1Promoted, ShiftAmount); + Op2Promoted = + DAG.getNode(ISD::SHL, dl, PromotedType, Op2Promoted, ShiftAmount); + + SDValue Result = + DAG.getNode(Opcode, dl, PromotedType, Op1Promoted, Op2Promoted); + return DAG.getNode(ShiftOp, dl, PromotedType, Result, ShiftAmount); + } else { + if (Opcode == ISD::USUBSAT) { + SDValue Max = + DAG.getNode(ISD::UMAX, dl, PromotedType, Op1Promoted, Op2Promoted); + return DAG.getNode(ISD::SUB, dl, PromotedType, Max, Op2Promoted); + } + + if (Opcode == ISD::UADDSAT) { + APInt MaxVal = APInt::getAllOnesValue(OldBits).zext(NewBits); + SDValue SatMax = DAG.getConstant(MaxVal, dl, PromotedType); + SDValue Add = + DAG.getNode(ISD::ADD, dl, PromotedType, Op1Promoted, Op2Promoted); + return DAG.getNode(ISD::UMIN, dl, PromotedType, Add, SatMax); + } + + unsigned AddOp = Opcode == ISD::SADDSAT ? 
ISD::ADD : ISD::SUB; + APInt MinVal = APInt::getSignedMinValue(OldBits).sext(NewBits); + APInt MaxVal = APInt::getSignedMaxValue(OldBits).sext(NewBits); + SDValue SatMin = DAG.getConstant(MinVal, dl, PromotedType); + SDValue SatMax = DAG.getConstant(MaxVal, dl, PromotedType); + SDValue Result = + DAG.getNode(AddOp, dl, PromotedType, Op1Promoted, Op2Promoted); + Result = DAG.getNode(ISD::SMIN, dl, PromotedType, Result, SatMax); + Result = DAG.getNode(ISD::SMAX, dl, PromotedType, Result, SatMin); + return Result; + } } SDValue DAGTypeLegalizer::PromoteIntRes_MULFIX(SDNode *N) { @@ -673,6 +730,8 @@ SDValue DAGTypeLegalizer::PromoteIntRes_MULFIX(SDNode *N) { SDValue Op1Promoted, Op2Promoted; bool Signed = N->getOpcode() == ISD::SMULFIX || N->getOpcode() == ISD::SMULFIXSAT; + bool Saturating = + N->getOpcode() == ISD::SMULFIXSAT || N->getOpcode() == ISD::UMULFIXSAT; if (Signed) { Op1Promoted = SExtPromotedInteger(N->getOperand(0)); Op2Promoted = SExtPromotedInteger(N->getOperand(1)); @@ -685,7 +744,6 @@ SDValue DAGTypeLegalizer::PromoteIntRes_MULFIX(SDNode *N) { unsigned DiffSize = PromotedType.getScalarSizeInBits() - OldType.getScalarSizeInBits(); - bool Saturating = N->getOpcode() == ISD::SMULFIXSAT; if (Saturating) { // Promoting the operand and result values changes the saturation width, // which is extends the values that we clamp to on saturation. This could be @@ -1110,6 +1168,8 @@ bool DAGTypeLegalizer::PromoteIntegerOperand(SDNode *N, unsigned OpNo) { Res = PromoteIntOp_INSERT_VECTOR_ELT(N, OpNo);break; case ISD::SCALAR_TO_VECTOR: Res = PromoteIntOp_SCALAR_TO_VECTOR(N); break; + case ISD::SPLAT_VECTOR: + Res = PromoteIntOp_SPLAT_VECTOR(N); break; case ISD::VSELECT: case ISD::SELECT: Res = PromoteIntOp_SELECT(N, OpNo); break; case ISD::SELECT_CC: Res = PromoteIntOp_SELECT_CC(N, OpNo); break; @@ -1148,7 +1208,8 @@ bool DAGTypeLegalizer::PromoteIntegerOperand(SDNode *N, unsigned OpNo) { case ISD::SMULFIX: case ISD::SMULFIXSAT: - case ISD::UMULFIX: Res = PromoteIntOp_MULFIX(N); break; + case ISD::UMULFIX: + case ISD::UMULFIXSAT: Res = PromoteIntOp_MULFIX(N); break; case ISD::FPOWI: Res = PromoteIntOp_FPOWI(N); break; @@ -1339,6 +1400,13 @@ SDValue DAGTypeLegalizer::PromoteIntOp_SCALAR_TO_VECTOR(SDNode *N) { GetPromotedInteger(N->getOperand(0))), 0); } +SDValue DAGTypeLegalizer::PromoteIntOp_SPLAT_VECTOR(SDNode *N) { + // Integer SPLAT_VECTOR operands are implicitly truncated, so just promote the + // operand in place. + return SDValue( + DAG.UpdateNodeOperands(N, GetPromotedInteger(N->getOperand(0))), 0); +} + SDValue DAGTypeLegalizer::PromoteIntOp_SELECT(SDNode *N, unsigned OpNo) { assert(OpNo == 0 && "Only know how to promote the condition!"); SDValue Cond = N->getOperand(0); @@ -1454,8 +1522,12 @@ SDValue DAGTypeLegalizer::PromoteIntOp_MGATHER(MaskedGatherSDNode *N, EVT DataVT = N->getValueType(0); NewOps[OpNo] = PromoteTargetBoolean(N->getOperand(OpNo), DataVT); } else if (OpNo == 4) { - // Need to sign extend the index since the bits will likely be used. - NewOps[OpNo] = SExtPromotedInteger(N->getOperand(OpNo)); + // The Index + if (N->isIndexSigned()) + // Need to sign extend the index since the bits will likely be used. 
+ NewOps[OpNo] = SExtPromotedInteger(N->getOperand(OpNo)); + else + NewOps[OpNo] = ZExtPromotedInteger(N->getOperand(OpNo)); } else NewOps[OpNo] = GetPromotedInteger(N->getOperand(OpNo)); @@ -1470,8 +1542,12 @@ SDValue DAGTypeLegalizer::PromoteIntOp_MSCATTER(MaskedScatterSDNode *N, EVT DataVT = N->getValue().getValueType(); NewOps[OpNo] = PromoteTargetBoolean(N->getOperand(OpNo), DataVT); } else if (OpNo == 4) { - // Need to sign extend the index since the bits will likely be used. - NewOps[OpNo] = SExtPromotedInteger(N->getOperand(OpNo)); + // The Index + if (N->isIndexSigned()) + // Need to sign extend the index since the bits will likely be used. + NewOps[OpNo] = SExtPromotedInteger(N->getOperand(OpNo)); + else + NewOps[OpNo] = ZExtPromotedInteger(N->getOperand(OpNo)); } else NewOps[OpNo] = GetPromotedInteger(N->getOperand(OpNo)); return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0); @@ -1715,7 +1791,8 @@ void DAGTypeLegalizer::ExpandIntegerResult(SDNode *N, unsigned ResNo) { case ISD::SMULFIX: case ISD::SMULFIXSAT: - case ISD::UMULFIX: ExpandIntRes_MULFIX(N, Lo, Hi); break; + case ISD::UMULFIX: + case ISD::UMULFIXSAT: ExpandIntRes_MULFIX(N, Lo, Hi); break; case ISD::VECREDUCE_ADD: case ISD::VECREDUCE_MUL: @@ -2473,7 +2550,9 @@ void DAGTypeLegalizer::ExpandIntRes_FP_TO_SINT(SDNode *N, SDValue &Lo, RTLIB::Libcall LC = RTLIB::getFPTOSINT(Op.getValueType(), VT); assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected fp-to-sint conversion!"); - SplitInteger(TLI.makeLibCall(DAG, LC, VT, Op, true/*irrelevant*/, dl).first, + TargetLowering::MakeLibCallOptions CallOptions; + CallOptions.setSExt(true); + SplitInteger(TLI.makeLibCall(DAG, LC, VT, Op, CallOptions, dl).first, Lo, Hi); } @@ -2488,7 +2567,8 @@ void DAGTypeLegalizer::ExpandIntRes_FP_TO_UINT(SDNode *N, SDValue &Lo, RTLIB::Libcall LC = RTLIB::getFPTOUINT(Op.getValueType(), VT); assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected fp-to-uint conversion!"); - SplitInteger(TLI.makeLibCall(DAG, LC, VT, Op, false/*irrelevant*/, dl).first, + TargetLowering::MakeLibCallOptions CallOptions; + SplitInteger(TLI.makeLibCall(DAG, LC, VT, Op, CallOptions, dl).first, Lo, Hi); } @@ -2514,7 +2594,9 @@ void DAGTypeLegalizer::ExpandIntRes_LLROUND(SDNode *N, SDValue &Lo, SDLoc dl(N); EVT RetVT = N->getValueType(0); - SplitInteger(TLI.makeLibCall(DAG, LC, RetVT, Op, true/*irrelevant*/, dl).first, + TargetLowering::MakeLibCallOptions CallOptions; + CallOptions.setSExt(true); + SplitInteger(TLI.makeLibCall(DAG, LC, RetVT, Op, CallOptions, dl).first, Lo, Hi); } @@ -2540,7 +2622,9 @@ void DAGTypeLegalizer::ExpandIntRes_LLRINT(SDNode *N, SDValue &Lo, SDLoc dl(N); EVT RetVT = N->getValueType(0); - SplitInteger(TLI.makeLibCall(DAG, LC, RetVT, Op, true/*irrelevant*/, dl).first, + TargetLowering::MakeLibCallOptions CallOptions; + CallOptions.setSExt(true); + SplitInteger(TLI.makeLibCall(DAG, LC, RetVT, Op, CallOptions, dl).first, Lo, Hi); } @@ -2743,7 +2827,9 @@ void DAGTypeLegalizer::ExpandIntRes_MUL(SDNode *N, } SDValue Ops[2] = { N->getOperand(0), N->getOperand(1) }; - SplitInteger(TLI.makeLibCall(DAG, LC, VT, Ops, true/*irrelevant*/, dl).first, + TargetLowering::MakeLibCallOptions CallOptions; + CallOptions.setSExt(true); + SplitInteger(TLI.makeLibCall(DAG, LC, VT, Ops, CallOptions, dl).first, Lo, Hi); } @@ -2777,38 +2863,53 @@ void DAGTypeLegalizer::ExpandIntRes_MULFIX(SDNode *N, SDValue &Lo, SDValue LHS = N->getOperand(0); SDValue RHS = N->getOperand(1); uint64_t Scale = N->getConstantOperandVal(2); - bool Saturating = N->getOpcode() == ISD::SMULFIXSAT; - EVT 
BoolVT = getSetCCResultType(VT); - SDValue Zero = DAG.getConstant(0, dl, VT); + bool Saturating = (N->getOpcode() == ISD::SMULFIXSAT || + N->getOpcode() == ISD::UMULFIXSAT); + bool Signed = (N->getOpcode() == ISD::SMULFIX || + N->getOpcode() == ISD::SMULFIXSAT); + + // Handle special case when scale is equal to zero. if (!Scale) { SDValue Result; if (!Saturating) { Result = DAG.getNode(ISD::MUL, dl, VT, LHS, RHS); } else { - Result = DAG.getNode(ISD::SMULO, dl, DAG.getVTList(VT, BoolVT), LHS, RHS); + EVT BoolVT = getSetCCResultType(VT); + unsigned MulOp = Signed ? ISD::SMULO : ISD::UMULO; + Result = DAG.getNode(MulOp, dl, DAG.getVTList(VT, BoolVT), LHS, RHS); SDValue Product = Result.getValue(0); SDValue Overflow = Result.getValue(1); - - APInt MinVal = APInt::getSignedMinValue(VTSize); - APInt MaxVal = APInt::getSignedMaxValue(VTSize); - SDValue SatMin = DAG.getConstant(MinVal, dl, VT); - SDValue SatMax = DAG.getConstant(MaxVal, dl, VT); - SDValue ProdNeg = DAG.getSetCC(dl, BoolVT, Product, Zero, ISD::SETLT); - Result = DAG.getSelect(dl, VT, ProdNeg, SatMax, SatMin); - Result = DAG.getSelect(dl, VT, Overflow, Result, Product); + if (Signed) { + APInt MinVal = APInt::getSignedMinValue(VTSize); + APInt MaxVal = APInt::getSignedMaxValue(VTSize); + SDValue SatMin = DAG.getConstant(MinVal, dl, VT); + SDValue SatMax = DAG.getConstant(MaxVal, dl, VT); + SDValue Zero = DAG.getConstant(0, dl, VT); + SDValue ProdNeg = DAG.getSetCC(dl, BoolVT, Product, Zero, ISD::SETLT); + Result = DAG.getSelect(dl, VT, ProdNeg, SatMax, SatMin); + Result = DAG.getSelect(dl, VT, Overflow, Result, Product); + } else { + // For unsigned multiplication, we only need to check the max since we + // can't really overflow towards zero. + APInt MaxVal = APInt::getMaxValue(VTSize); + SDValue SatMax = DAG.getConstant(MaxVal, dl, VT); + Result = DAG.getSelect(dl, VT, Overflow, SatMax, Product); + } } SplitInteger(Result, Lo, Hi); return; } + // For SMULFIX[SAT] we only expect to find Scale Result; - bool Signed = (N->getOpcode() == ISD::SMULFIX || - N->getOpcode() == ISD::SMULFIXSAT); unsigned LoHiOp = Signed ? ISD::SMUL_LOHI : ISD::UMUL_LOHI; if (!TLI.expandMUL_LOHI(LoHiOp, VT, dl, LHS, RHS, Result, NVT, DAG, TargetLowering::MulExpansionKind::OnlyLegalOrCustom, @@ -2822,19 +2923,9 @@ void DAGTypeLegalizer::ExpandIntRes_MULFIX(SDNode *N, SDValue &Lo, "the size of the current value type"); EVT ShiftTy = TLI.getShiftAmountTy(NVT, DAG.getDataLayout()); - // Shift whole amount by scale. - SDValue ResultLL = Result[0]; - SDValue ResultLH = Result[1]; - SDValue ResultHL = Result[2]; - SDValue ResultHH = Result[3]; - - SDValue SatMax, SatMin; - SDValue NVTZero = DAG.getConstant(0, dl, NVT); - SDValue NVTNeg1 = DAG.getConstant(-1, dl, NVT); - EVT BoolNVT = getSetCCResultType(NVT); - - // After getting the multplication result in 4 parts, we need to perform a + // After getting the multiplication result in 4 parts, we need to perform a // shift right by the amount of the scale to get the result in that scale. + // // Let's say we multiply 2 64 bit numbers. The resulting value can be held in // 128 bits that are cut into 4 32-bit parts: // @@ -2846,123 +2937,135 @@ void DAGTypeLegalizer::ExpandIntRes_MULFIX(SDNode *N, SDValue &Lo, // // |NVTSize-| // - // The resulting Lo and Hi will only need to be one of these 32-bit parts - // after shifting. + // The resulting Lo and Hi would normally be in LL and LH after the shift. 
But + // to avoid unneccessary shifting of all 4 parts, we can adjust the shift + // amount and get Lo and Hi using two funnel shifts. Or for the special case + // when Scale is a multiple of NVTSize we can just pick the result without + // shifting. + uint64_t Part0 = Scale / NVTSize; // Part holding lowest bit needed. + if (Scale % NVTSize) { + SDValue ShiftAmount = DAG.getConstant(Scale % NVTSize, dl, ShiftTy); + Lo = DAG.getNode(ISD::FSHR, dl, NVT, Result[Part0 + 1], Result[Part0], + ShiftAmount); + Hi = DAG.getNode(ISD::FSHR, dl, NVT, Result[Part0 + 2], Result[Part0 + 1], + ShiftAmount); + } else { + Lo = Result[Part0]; + Hi = Result[Part0 + 1]; + } + + // Unless saturation is requested we are done. The result is in . + if (!Saturating) + return; + + // Can not overflow when there is no integer part. + if (Scale == VTSize) + return; + + // To handle saturation we must check for overflow in the multiplication. + // + // Unsigned overflow happened if the upper (VTSize - Scale) bits (of Result) + // aren't all zeroes. + // + // Signed overflow happened if the upper (VTSize - Scale + 1) bits (of Result) + // aren't all ones or all zeroes. + // + // We cannot overflow past HH when multiplying 2 ints of size VTSize, so the + // highest bit of HH determines saturation direction in the event of signed + // saturation. + + SDValue ResultHL = Result[2]; + SDValue ResultHH = Result[3]; + + SDValue SatMax, SatMin; + SDValue NVTZero = DAG.getConstant(0, dl, NVT); + SDValue NVTNeg1 = DAG.getConstant(-1, dl, NVT); + EVT BoolNVT = getSetCCResultType(NVT); + + if (!Signed) { + if (Scale < NVTSize) { + // Overflow happened if ((HH | (HL >> Scale)) != 0). + SDValue HLAdjusted = DAG.getNode(ISD::SRL, dl, NVT, ResultHL, + DAG.getConstant(Scale, dl, ShiftTy)); + SDValue Tmp = DAG.getNode(ISD::OR, dl, NVT, HLAdjusted, ResultHH); + SatMax = DAG.getSetCC(dl, BoolNVT, Tmp, NVTZero, ISD::SETNE); + } else if (Scale == NVTSize) { + // Overflow happened if (HH != 0). + SatMax = DAG.getSetCC(dl, BoolNVT, ResultHH, NVTZero, ISD::SETNE); + } else if (Scale < VTSize) { + // Overflow happened if ((HH >> (Scale - NVTSize)) != 0). + SDValue HLAdjusted = DAG.getNode(ISD::SRL, dl, NVT, ResultHL, + DAG.getConstant(Scale - NVTSize, dl, + ShiftTy)); + SatMax = DAG.getSetCC(dl, BoolNVT, HLAdjusted, NVTZero, ISD::SETNE); + } else + llvm_unreachable("Scale must be less or equal to VTSize for UMULFIXSAT" + "(and saturation can't happen with Scale==VTSize)."); + + Hi = DAG.getSelect(dl, NVT, SatMax, NVTNeg1, Hi); + Lo = DAG.getSelect(dl, NVT, SatMax, NVTNeg1, Lo); + return; + } + if (Scale < NVTSize) { - // If the scale is less than the size of the VT we expand to, the Hi and - // Lo of the result will be in the first 2 parts of the result after - // shifting right. This only requires shifting by the scale as far as the - // third part in the result (ResultHL). - SDValue SRLAmnt = DAG.getConstant(Scale, dl, ShiftTy); - SDValue SHLAmnt = DAG.getConstant(NVTSize - Scale, dl, ShiftTy); - Lo = DAG.getNode(ISD::SRL, dl, NVT, ResultLL, SRLAmnt); - Lo = DAG.getNode(ISD::OR, dl, NVT, Lo, - DAG.getNode(ISD::SHL, dl, NVT, ResultLH, SHLAmnt)); - Hi = DAG.getNode(ISD::SRL, dl, NVT, ResultLH, SRLAmnt); - Hi = DAG.getNode(ISD::OR, dl, NVT, Hi, - DAG.getNode(ISD::SHL, dl, NVT, ResultHL, SHLAmnt)); - - // We cannot overflow past HH when multiplying 2 ints of size VTSize, so the - // highest bit of HH determines saturation direction in the event of - // saturation. 
// The number of overflow bits we can check are VTSize - Scale + 1 (we // include the sign bit). If these top bits are > 0, then we overflowed past // the max value. If these top bits are < -1, then we overflowed past the // min value. Otherwise, we did not overflow. - if (Saturating) { - unsigned OverflowBits = VTSize - Scale + 1; - assert(OverflowBits <= VTSize && OverflowBits > NVTSize && - "Extent of overflow bits must start within HL"); - SDValue HLHiMask = DAG.getConstant( - APInt::getHighBitsSet(NVTSize, OverflowBits - NVTSize), dl, NVT); - SDValue HLLoMask = DAG.getConstant( - APInt::getLowBitsSet(NVTSize, VTSize - OverflowBits), dl, NVT); - - // HH > 0 or HH == 0 && HL > HLLoMask - SDValue HHPos = DAG.getSetCC(dl, BoolNVT, ResultHH, NVTZero, ISD::SETGT); - SDValue HHZero = DAG.getSetCC(dl, BoolNVT, ResultHH, NVTZero, ISD::SETEQ); - SDValue HLPos = - DAG.getSetCC(dl, BoolNVT, ResultHL, HLLoMask, ISD::SETUGT); - SatMax = DAG.getNode(ISD::OR, dl, BoolNVT, HHPos, - DAG.getNode(ISD::AND, dl, BoolNVT, HHZero, HLPos)); - - // HH < -1 or HH == -1 && HL < HLHiMask - SDValue HHNeg = DAG.getSetCC(dl, BoolNVT, ResultHH, NVTNeg1, ISD::SETLT); - SDValue HHNeg1 = DAG.getSetCC(dl, BoolNVT, ResultHH, NVTNeg1, ISD::SETEQ); - SDValue HLNeg = - DAG.getSetCC(dl, BoolNVT, ResultHL, HLHiMask, ISD::SETULT); - SatMin = DAG.getNode(ISD::OR, dl, BoolNVT, HHNeg, - DAG.getNode(ISD::AND, dl, BoolNVT, HHNeg1, HLNeg)); - } + unsigned OverflowBits = VTSize - Scale + 1; + assert(OverflowBits <= VTSize && OverflowBits > NVTSize && + "Extent of overflow bits must start within HL"); + SDValue HLHiMask = DAG.getConstant( + APInt::getHighBitsSet(NVTSize, OverflowBits - NVTSize), dl, NVT); + SDValue HLLoMask = DAG.getConstant( + APInt::getLowBitsSet(NVTSize, VTSize - OverflowBits), dl, NVT); + // We overflow max if HH > 0 or (HH == 0 && HL > HLLoMask). + SDValue HHGT0 = DAG.getSetCC(dl, BoolNVT, ResultHH, NVTZero, ISD::SETGT); + SDValue HHEQ0 = DAG.getSetCC(dl, BoolNVT, ResultHH, NVTZero, ISD::SETEQ); + SDValue HLUGT = DAG.getSetCC(dl, BoolNVT, ResultHL, HLLoMask, ISD::SETUGT); + SatMax = DAG.getNode(ISD::OR, dl, BoolNVT, HHGT0, + DAG.getNode(ISD::AND, dl, BoolNVT, HHEQ0, HLUGT)); + // We overflow min if HH < -1 or (HH == -1 && HL < HLHiMask). + SDValue HHLT = DAG.getSetCC(dl, BoolNVT, ResultHH, NVTNeg1, ISD::SETLT); + SDValue HHEQ = DAG.getSetCC(dl, BoolNVT, ResultHH, NVTNeg1, ISD::SETEQ); + SDValue HLULT = DAG.getSetCC(dl, BoolNVT, ResultHL, HLHiMask, ISD::SETULT); + SatMin = DAG.getNode(ISD::OR, dl, BoolNVT, HHLT, + DAG.getNode(ISD::AND, dl, BoolNVT, HHEQ, HLULT)); } else if (Scale == NVTSize) { - // If the scales are equal, Lo and Hi are ResultLH and Result HL, - // respectively. Avoid shifting to prevent undefined behavior. - Lo = ResultLH; - Hi = ResultHL; - - // We overflow max if HH > 0 or HH == 0 && HL sign bit is 1. - // We overflow min if HH < -1 or HH == -1 && HL sign bit is 0. 
- if (Saturating) { - SDValue HHPos = DAG.getSetCC(dl, BoolNVT, ResultHH, NVTZero, ISD::SETGT); - SDValue HHZero = DAG.getSetCC(dl, BoolNVT, ResultHH, NVTZero, ISD::SETEQ); - SDValue HLNeg = DAG.getSetCC(dl, BoolNVT, ResultHL, NVTZero, ISD::SETLT); - SatMax = DAG.getNode(ISD::OR, dl, BoolNVT, HHPos, - DAG.getNode(ISD::AND, dl, BoolNVT, HHZero, HLNeg)); - - SDValue HHNeg = DAG.getSetCC(dl, BoolNVT, ResultHH, NVTNeg1, ISD::SETLT); - SDValue HHNeg1 = DAG.getSetCC(dl, BoolNVT, ResultHH, NVTNeg1, ISD::SETEQ); - SDValue HLPos = DAG.getSetCC(dl, BoolNVT, ResultHL, NVTZero, ISD::SETGE); - SatMin = DAG.getNode(ISD::OR, dl, BoolNVT, HHNeg, - DAG.getNode(ISD::AND, dl, BoolNVT, HHNeg1, HLPos)); - } + // We overflow max if HH > 0 or (HH == 0 && HL sign bit is 1). + SDValue HHGT0 = DAG.getSetCC(dl, BoolNVT, ResultHH, NVTZero, ISD::SETGT); + SDValue HHEQ0 = DAG.getSetCC(dl, BoolNVT, ResultHH, NVTZero, ISD::SETEQ); + SDValue HLNeg = DAG.getSetCC(dl, BoolNVT, ResultHL, NVTZero, ISD::SETLT); + SatMax = DAG.getNode(ISD::OR, dl, BoolNVT, HHGT0, + DAG.getNode(ISD::AND, dl, BoolNVT, HHEQ0, HLNeg)); + // We overflow min if HH < -1 or (HH == -1 && HL sign bit is 0). + SDValue HHLT = DAG.getSetCC(dl, BoolNVT, ResultHH, NVTNeg1, ISD::SETLT); + SDValue HHEQ = DAG.getSetCC(dl, BoolNVT, ResultHH, NVTNeg1, ISD::SETEQ); + SDValue HLPos = DAG.getSetCC(dl, BoolNVT, ResultHL, NVTZero, ISD::SETGE); + SatMin = DAG.getNode(ISD::OR, dl, BoolNVT, HHLT, + DAG.getNode(ISD::AND, dl, BoolNVT, HHEQ, HLPos)); } else if (Scale < VTSize) { - // If the scale is instead less than the old VT size, but greater than or - // equal to the expanded VT size, the first part of the result (ResultLL) is - // no longer a part of Lo because it would be scaled out anyway. Instead we - // can start shifting right from the fourth part (ResultHH) to the second - // part (ResultLH), and Result LH will be the new Lo. - SDValue SRLAmnt = DAG.getConstant(Scale - NVTSize, dl, ShiftTy); - SDValue SHLAmnt = DAG.getConstant(VTSize - Scale, dl, ShiftTy); - Lo = DAG.getNode(ISD::SRL, dl, NVT, ResultLH, SRLAmnt); - Lo = DAG.getNode(ISD::OR, dl, NVT, Lo, - DAG.getNode(ISD::SHL, dl, NVT, ResultHL, SHLAmnt)); - Hi = DAG.getNode(ISD::SRL, dl, NVT, ResultHL, SRLAmnt); - Hi = DAG.getNode(ISD::OR, dl, NVT, Hi, - DAG.getNode(ISD::SHL, dl, NVT, ResultHH, SHLAmnt)); - // This is similar to the case when we saturate if Scale < NVTSize, but we - // only need to chech HH. - if (Saturating) { - unsigned OverflowBits = VTSize - Scale + 1; - SDValue HHHiMask = DAG.getConstant( - APInt::getHighBitsSet(NVTSize, OverflowBits), dl, NVT); - SDValue HHLoMask = DAG.getConstant( - APInt::getLowBitsSet(NVTSize, NVTSize - OverflowBits), dl, NVT); - - SatMax = DAG.getSetCC(dl, BoolNVT, ResultHH, HHLoMask, ISD::SETGT); - SatMin = DAG.getSetCC(dl, BoolNVT, ResultHH, HHHiMask, ISD::SETLT); - } - } else if (Scale == VTSize) { - assert( - !Signed && - "Only unsigned types can have a scale equal to the operand bit width"); - - Lo = ResultHL; - Hi = ResultHH; - } else { - llvm_unreachable("Expected the scale to be less than or equal to the width " - "of the operands"); - } + // only need to check HH. 
+ unsigned OverflowBits = VTSize - Scale + 1; + SDValue HHHiMask = DAG.getConstant( + APInt::getHighBitsSet(NVTSize, OverflowBits), dl, NVT); + SDValue HHLoMask = DAG.getConstant( + APInt::getLowBitsSet(NVTSize, NVTSize - OverflowBits), dl, NVT); + SatMax = DAG.getSetCC(dl, BoolNVT, ResultHH, HHLoMask, ISD::SETGT); + SatMin = DAG.getSetCC(dl, BoolNVT, ResultHH, HHHiMask, ISD::SETLT); + } else + llvm_unreachable("Illegal scale for signed fixed point mul."); - if (Saturating) { - APInt LHMax = APInt::getSignedMaxValue(NVTSize); - APInt LLMax = APInt::getAllOnesValue(NVTSize); - APInt LHMin = APInt::getSignedMinValue(NVTSize); - Hi = DAG.getSelect(dl, NVT, SatMax, DAG.getConstant(LHMax, dl, NVT), Hi); - Hi = DAG.getSelect(dl, NVT, SatMin, DAG.getConstant(LHMin, dl, NVT), Hi); - Lo = DAG.getSelect(dl, NVT, SatMax, DAG.getConstant(LLMax, dl, NVT), Lo); - Lo = DAG.getSelect(dl, NVT, SatMin, NVTZero, Lo); - } + // Saturate to signed maximum. + APInt MaxHi = APInt::getSignedMaxValue(NVTSize); + APInt MaxLo = APInt::getAllOnesValue(NVTSize); + Hi = DAG.getSelect(dl, NVT, SatMax, DAG.getConstant(MaxHi, dl, NVT), Hi); + Lo = DAG.getSelect(dl, NVT, SatMax, DAG.getConstant(MaxLo, dl, NVT), Lo); + // Saturate to signed minimum. + APInt MinHi = APInt::getSignedMinValue(NVTSize); + Hi = DAG.getSelect(dl, NVT, SatMin, DAG.getConstant(MinHi, dl, NVT), Hi); + Lo = DAG.getSelect(dl, NVT, SatMin, NVTZero, Lo); } void DAGTypeLegalizer::ExpandIntRes_SADDSUBO(SDNode *Node, @@ -3030,7 +3133,9 @@ void DAGTypeLegalizer::ExpandIntRes_SDIV(SDNode *N, LC = RTLIB::SDIV_I128; assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported SDIV!"); - SplitInteger(TLI.makeLibCall(DAG, LC, VT, Ops, true, dl).first, Lo, Hi); + TargetLowering::MakeLibCallOptions CallOptions; + CallOptions.setSExt(true); + SplitInteger(TLI.makeLibCall(DAG, LC, VT, Ops, CallOptions, dl).first, Lo, Hi); } void DAGTypeLegalizer::ExpandIntRes_Shift(SDNode *N, @@ -3129,7 +3234,9 @@ void DAGTypeLegalizer::ExpandIntRes_Shift(SDNode *N, if (LC != RTLIB::UNKNOWN_LIBCALL && TLI.getLibcallName(LC)) { SDValue Ops[2] = { N->getOperand(0), N->getOperand(1) }; - SplitInteger(TLI.makeLibCall(DAG, LC, VT, Ops, isSigned, dl).first, Lo, Hi); + TargetLowering::MakeLibCallOptions CallOptions; + CallOptions.setSExt(isSigned); + SplitInteger(TLI.makeLibCall(DAG, LC, VT, Ops, CallOptions, dl).first, Lo, Hi); return; } @@ -3217,7 +3324,9 @@ void DAGTypeLegalizer::ExpandIntRes_SREM(SDNode *N, LC = RTLIB::SREM_I128; assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported SREM!"); - SplitInteger(TLI.makeLibCall(DAG, LC, VT, Ops, true, dl).first, Lo, Hi); + TargetLowering::MakeLibCallOptions CallOptions; + CallOptions.setSExt(true); + SplitInteger(TLI.makeLibCall(DAG, LC, VT, Ops, CallOptions, dl).first, Lo, Hi); } void DAGTypeLegalizer::ExpandIntRes_TRUNCATE(SDNode *N, @@ -3373,7 +3482,8 @@ void DAGTypeLegalizer::ExpandIntRes_UDIV(SDNode *N, LC = RTLIB::UDIV_I128; assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported UDIV!"); - SplitInteger(TLI.makeLibCall(DAG, LC, VT, Ops, false, dl).first, Lo, Hi); + TargetLowering::MakeLibCallOptions CallOptions; + SplitInteger(TLI.makeLibCall(DAG, LC, VT, Ops, CallOptions, dl).first, Lo, Hi); } void DAGTypeLegalizer::ExpandIntRes_UREM(SDNode *N, @@ -3399,7 +3509,8 @@ void DAGTypeLegalizer::ExpandIntRes_UREM(SDNode *N, LC = RTLIB::UREM_I128; assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported UREM!"); - SplitInteger(TLI.makeLibCall(DAG, LC, VT, Ops, false, dl).first, Lo, Hi); + TargetLowering::MakeLibCallOptions CallOptions; + 
SplitInteger(TLI.makeLibCall(DAG, LC, VT, Ops, CallOptions, dl).first, Lo, Hi); } void DAGTypeLegalizer::ExpandIntRes_ZERO_EXTEND(SDNode *N, @@ -3759,7 +3870,9 @@ SDValue DAGTypeLegalizer::ExpandIntOp_SINT_TO_FP(SDNode *N) { RTLIB::Libcall LC = RTLIB::getSINTTOFP(Op.getValueType(), DstVT); assert(LC != RTLIB::UNKNOWN_LIBCALL && "Don't know how to expand this SINT_TO_FP!"); - return TLI.makeLibCall(DAG, LC, DstVT, Op, true, SDLoc(N)).first; + TargetLowering::MakeLibCallOptions CallOptions; + CallOptions.setSExt(true); + return TLI.makeLibCall(DAG, LC, DstVT, Op, CallOptions, SDLoc(N)).first; } SDValue DAGTypeLegalizer::ExpandIntOp_STORE(StoreSDNode *N, unsigned OpNo) { @@ -3924,7 +4037,9 @@ SDValue DAGTypeLegalizer::ExpandIntOp_UINT_TO_FP(SDNode *N) { RTLIB::Libcall LC = RTLIB::getUINTTOFP(SrcVT, DstVT); assert(LC != RTLIB::UNKNOWN_LIBCALL && "Don't know how to expand this UINT_TO_FP!"); - return TLI.makeLibCall(DAG, LC, DstVT, Op, true, dl).first; + TargetLowering::MakeLibCallOptions CallOptions; + CallOptions.setSExt(true); + return TLI.makeLibCall(DAG, LC, DstVT, Op, CallOptions, dl).first; } SDValue DAGTypeLegalizer::ExpandIntOp_ATOMIC_STORE(SDNode *N) { @@ -4033,6 +4148,23 @@ SDValue DAGTypeLegalizer::PromoteIntRes_SCALAR_TO_VECTOR(SDNode *N) { return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, NOutVT, Op); } +SDValue DAGTypeLegalizer::PromoteIntRes_SPLAT_VECTOR(SDNode *N) { + SDLoc dl(N); + + SDValue SplatVal = N->getOperand(0); + + assert(!SplatVal.getValueType().isVector() && "Input must be a scalar"); + + EVT OutVT = N->getValueType(0); + EVT NOutVT = TLI.getTypeToTransformTo(*DAG.getContext(), OutVT); + assert(NOutVT.isVector() && "Type must be promoted to a vector type"); + EVT NOutElemVT = NOutVT.getVectorElementType(); + + SDValue Op = DAG.getNode(ISD::ANY_EXTEND, dl, NOutElemVT, SplatVal); + + return DAG.getNode(ISD::SPLAT_VECTOR, dl, NOutVT, Op); +} + SDValue DAGTypeLegalizer::PromoteIntRes_CONCAT_VECTORS(SDNode *N) { SDLoc dl(N); diff --git a/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp index 14fd5be23ccb..b596c174a287 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp @@ -81,7 +81,6 @@ void DAGTypeLegalizer::PerformExpensiveChecks() { for (unsigned i = 0, e = Node.getNumValues(); i != e; ++i) { SDValue Res(&Node, i); - EVT VT = Res.getValueType(); bool Failed = false; // Don't create a value in map. auto ResId = (ValueToIdMap.count(Res)) ? ValueToIdMap[Res] : 0; @@ -135,17 +134,13 @@ void DAGTypeLegalizer::PerformExpensiveChecks() { dbgs() << "Unprocessed value in a map!"; Failed = true; } - } else if (isTypeLegal(VT) || IgnoreNodeResults(&Node)) { + } else if (isTypeLegal(Res.getValueType()) || IgnoreNodeResults(&Node)) { if (Mapped > 1) { dbgs() << "Value with legal type was transformed!"; Failed = true; } } else { - // If the value can be kept in HW registers, softening machinery can - // leave it unchanged and don't put it to any map. - if (Mapped == 0 && - !(getTypeAction(VT) == TargetLowering::TypeSoftenFloat && - isLegalInHWReg(VT))) { + if (Mapped == 0) { dbgs() << "Processed value not in any map!"; Failed = true; } else if (Mapped & (Mapped - 1)) { @@ -257,13 +252,9 @@ bool DAGTypeLegalizer::run() { Changed = true; goto NodeDone; case TargetLowering::TypeSoftenFloat: - Changed = SoftenFloatResult(N, i); - if (Changed) - goto NodeDone; - // If not changed, the result type should be legally in register. 
- assert(isLegalInHWReg(ResultVT) && - "Unchanged SoftenFloatResult should be legal in register!"); - goto ScanOperands; + SoftenFloatResult(N, i); + Changed = true; + goto NodeDone; case TargetLowering::TypeExpandFloat: ExpandFloatResult(N, i); Changed = true; @@ -439,15 +430,9 @@ NodeDone: bool Failed = false; // Check that all result types are legal. - // A value type is illegal if its TypeAction is not TypeLegal, - // and TLI.RegClassForVT does not have a register class for this type. - // For example, the x86_64 target has f128 that is not TypeLegal, - // to have softened operators, but it also has FR128 register class to - // pass and return f128 values. Hence a legalized node can have f128 type. if (!IgnoreNodeResults(&Node)) for (unsigned i = 0, NumVals = Node.getNumValues(); i < NumVals; ++i) - if (!isTypeLegal(Node.getValueType(i)) && - !TLI.isTypeLegal(Node.getValueType(i))) { + if (!isTypeLegal(Node.getValueType(i))) { dbgs() << "Result type " << i << " illegal: "; Node.dump(&DAG); Failed = true; @@ -456,8 +441,7 @@ NodeDone: // Check that all operand types are legal. for (unsigned i = 0, NumOps = Node.getNumOperands(); i < NumOps; ++i) if (!IgnoreNodeResults(Node.getOperand(i).getNode()) && - !isTypeLegal(Node.getOperand(i).getValueType()) && - !TLI.isTypeLegal(Node.getOperand(i).getValueType())) { + !isTypeLegal(Node.getOperand(i).getValueType())) { dbgs() << "Operand type " << i << " illegal: "; Node.getOperand(i).dump(&DAG); Failed = true; @@ -713,23 +697,13 @@ void DAGTypeLegalizer::SetPromotedInteger(SDValue Op, SDValue Result) { } void DAGTypeLegalizer::SetSoftenedFloat(SDValue Op, SDValue Result) { - // f128 of x86_64 could be kept in SSE registers, - // but sometimes softened to i128. - assert((Result.getValueType() == - TLI.getTypeToTransformTo(*DAG.getContext(), Op.getValueType()) || - Op.getValueType() == - TLI.getTypeToTransformTo(*DAG.getContext(), Op.getValueType())) && + assert(Result.getValueType() == + TLI.getTypeToTransformTo(*DAG.getContext(), Op.getValueType()) && "Invalid type for softened float"); AnalyzeNewValue(Result); auto &OpIdEntry = SoftenedFloats[getTableId(Op)]; - // Allow repeated calls to save f128 type nodes - // or any node with type that transforms to itself. - // Many operations on these types are not softened. - assert(((OpIdEntry == 0) || - Op.getValueType() == - TLI.getTypeToTransformTo(*DAG.getContext(), Op.getValueType())) && - "Node is already converted to integer!"); + assert((OpIdEntry == 0) && "Node is already converted to integer!"); OpIdEntry = getTableId(Result); } @@ -1003,25 +977,27 @@ SDValue DAGTypeLegalizer::JoinIntegers(SDValue Lo, SDValue Hi) { /// Convert the node into a libcall with the same prototype. 
SDValue DAGTypeLegalizer::LibCallify(RTLIB::Libcall LC, SDNode *N, bool isSigned) { + TargetLowering::MakeLibCallOptions CallOptions; + CallOptions.setSExt(isSigned); unsigned NumOps = N->getNumOperands(); SDLoc dl(N); if (NumOps == 0) { - return TLI.makeLibCall(DAG, LC, N->getValueType(0), None, isSigned, + return TLI.makeLibCall(DAG, LC, N->getValueType(0), None, CallOptions, dl).first; } else if (NumOps == 1) { SDValue Op = N->getOperand(0); - return TLI.makeLibCall(DAG, LC, N->getValueType(0), Op, isSigned, + return TLI.makeLibCall(DAG, LC, N->getValueType(0), Op, CallOptions, dl).first; } else if (NumOps == 2) { SDValue Ops[2] = { N->getOperand(0), N->getOperand(1) }; - return TLI.makeLibCall(DAG, LC, N->getValueType(0), Ops, isSigned, + return TLI.makeLibCall(DAG, LC, N->getValueType(0), Ops, CallOptions, dl).first; } SmallVector Ops(NumOps); for (unsigned i = 0; i < NumOps; ++i) Ops[i] = N->getOperand(i); - return TLI.makeLibCall(DAG, LC, N->getValueType(0), Ops, isSigned, dl).first; + return TLI.makeLibCall(DAG, LC, N->getValueType(0), Ops, CallOptions, dl).first; } /// Expand a node into a call to a libcall. Similar to ExpandLibCall except that diff --git a/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/lib/CodeGen/SelectionDAG/LegalizeTypes.h index 1d489b1b3a33..4afbae69128a 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeTypes.h +++ b/lib/CodeGen/SelectionDAG/LegalizeTypes.h @@ -73,15 +73,6 @@ private: return VT.isSimple() && TLI.isTypeLegal(VT); } - /// Return true if this type can be passed in registers. - /// For example, x86_64's f128, should to be legally in registers - /// and only some operations converted to library calls or integer - /// bitwise operations. - bool isLegalInHWReg(EVT VT) const { - EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT); - return VT == NVT && isSimpleLegalType(VT); - } - EVT getSetCCResultType(EVT VT) const { return TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT); } @@ -306,6 +297,7 @@ private: SDValue PromoteIntRes_VECTOR_SHUFFLE(SDNode *N); SDValue PromoteIntRes_BUILD_VECTOR(SDNode *N); SDValue PromoteIntRes_SCALAR_TO_VECTOR(SDNode *N); + SDValue PromoteIntRes_SPLAT_VECTOR(SDNode *N); SDValue PromoteIntRes_EXTEND_VECTOR_INREG(SDNode *N); SDValue PromoteIntRes_INSERT_VECTOR_ELT(SDNode *N); SDValue PromoteIntRes_CONCAT_VECTORS(SDNode *N); @@ -363,6 +355,7 @@ private: SDValue PromoteIntOp_EXTRACT_SUBVECTOR(SDNode *N); SDValue PromoteIntOp_CONCAT_VECTORS(SDNode *N); SDValue PromoteIntOp_SCALAR_TO_VECTOR(SDNode *N); + SDValue PromoteIntOp_SPLAT_VECTOR(SDNode *N); SDValue PromoteIntOp_SELECT(SDNode *N, unsigned OpNo); SDValue PromoteIntOp_SELECT_CC(SDNode *N, unsigned OpNo); SDValue PromoteIntOp_SETCC(SDNode *N, unsigned OpNo); @@ -472,14 +465,11 @@ private: // Float to Integer Conversion Support: LegalizeFloatTypes.cpp //===--------------------------------------------------------------------===// - /// Given an operand Op of Float type, returns the integer if the Op is not - /// supported in target HW and converted to the integer. - /// The integer contains exactly the same bits as Op - only the type changed. - /// For example, if Op is an f32 which was softened to an i32, then this - /// method returns an i32, the bits of which coincide with those of Op. - /// If the Op can be efficiently supported in target HW or the operand must - /// stay in a register, the Op is not converted to an integer. - /// In that case, the given op is returned. 
+ /// GetSoftenedFloat - Given a processed operand Op which was converted to an + /// integer of the same size, this returns the integer. The integer contains + /// exactly the same bits as Op - only the type changed. For example, if Op + /// is an f32 which was softened to an i32, then this method returns an i32, + /// the bits of which coincide with those of Op SDValue GetSoftenedFloat(SDValue Op) { TableId Id = getTableId(Op); auto Iter = SoftenedFloats.find(Id); @@ -494,19 +484,19 @@ private: } void SetSoftenedFloat(SDValue Op, SDValue Result); - // Convert Float Results to Integer for Non-HW-supported Operations. - bool SoftenFloatResult(SDNode *N, unsigned ResNo); + // Convert Float Results to Integer. + void SoftenFloatResult(SDNode *N, unsigned ResNo); SDValue SoftenFloatRes_MERGE_VALUES(SDNode *N, unsigned ResNo); - SDValue SoftenFloatRes_BITCAST(SDNode *N, unsigned ResNo); + SDValue SoftenFloatRes_BITCAST(SDNode *N); SDValue SoftenFloatRes_BUILD_PAIR(SDNode *N); - SDValue SoftenFloatRes_ConstantFP(SDNode *N, unsigned ResNo); + SDValue SoftenFloatRes_ConstantFP(SDNode *N); SDValue SoftenFloatRes_EXTRACT_VECTOR_ELT(SDNode *N, unsigned ResNo); - SDValue SoftenFloatRes_FABS(SDNode *N, unsigned ResNo); + SDValue SoftenFloatRes_FABS(SDNode *N); SDValue SoftenFloatRes_FMINNUM(SDNode *N); SDValue SoftenFloatRes_FMAXNUM(SDNode *N); SDValue SoftenFloatRes_FADD(SDNode *N); SDValue SoftenFloatRes_FCEIL(SDNode *N); - SDValue SoftenFloatRes_FCOPYSIGN(SDNode *N, unsigned ResNo); + SDValue SoftenFloatRes_FCOPYSIGN(SDNode *N); SDValue SoftenFloatRes_FCOS(SDNode *N); SDValue SoftenFloatRes_FDIV(SDNode *N); SDValue SoftenFloatRes_FEXP(SDNode *N); @@ -518,7 +508,7 @@ private: SDValue SoftenFloatRes_FMA(SDNode *N); SDValue SoftenFloatRes_FMUL(SDNode *N); SDValue SoftenFloatRes_FNEARBYINT(SDNode *N); - SDValue SoftenFloatRes_FNEG(SDNode *N, unsigned ResNo); + SDValue SoftenFloatRes_FNEG(SDNode *N); SDValue SoftenFloatRes_FP_EXTEND(SDNode *N); SDValue SoftenFloatRes_FP16_TO_FP(SDNode *N); SDValue SoftenFloatRes_FP_ROUND(SDNode *N); @@ -531,27 +521,17 @@ private: SDValue SoftenFloatRes_FSQRT(SDNode *N); SDValue SoftenFloatRes_FSUB(SDNode *N); SDValue SoftenFloatRes_FTRUNC(SDNode *N); - SDValue SoftenFloatRes_LOAD(SDNode *N, unsigned ResNo); - SDValue SoftenFloatRes_SELECT(SDNode *N, unsigned ResNo); - SDValue SoftenFloatRes_SELECT_CC(SDNode *N, unsigned ResNo); + SDValue SoftenFloatRes_LOAD(SDNode *N); + SDValue SoftenFloatRes_SELECT(SDNode *N); + SDValue SoftenFloatRes_SELECT_CC(SDNode *N); SDValue SoftenFloatRes_UNDEF(SDNode *N); SDValue SoftenFloatRes_VAARG(SDNode *N); SDValue SoftenFloatRes_XINT_TO_FP(SDNode *N); - // Return true if we can skip softening the given operand or SDNode because - // either it was soften before by SoftenFloatResult and references to the - // operand were replaced by ReplaceValueWith or it's value type is legal in HW - // registers and the operand can be left unchanged. - bool CanSkipSoftenFloatOperand(SDNode *N, unsigned OpNo); - - // Convert Float Operand to Integer for Non-HW-supported Operations. + // Convert Float Operand to Integer. 
bool SoftenFloatOperand(SDNode *N, unsigned OpNo); SDValue SoftenFloatOp_BITCAST(SDNode *N); - SDValue SoftenFloatOp_COPY_TO_REG(SDNode *N); SDValue SoftenFloatOp_BR_CC(SDNode *N); - SDValue SoftenFloatOp_FABS(SDNode *N); - SDValue SoftenFloatOp_FCOPYSIGN(SDNode *N); - SDValue SoftenFloatOp_FNEG(SDNode *N); SDValue SoftenFloatOp_FP_EXTEND(SDNode *N); SDValue SoftenFloatOp_FP_ROUND(SDNode *N); SDValue SoftenFloatOp_FP_TO_XINT(SDNode *N); @@ -559,7 +539,6 @@ private: SDValue SoftenFloatOp_LLROUND(SDNode *N); SDValue SoftenFloatOp_LRINT(SDNode *N); SDValue SoftenFloatOp_LLRINT(SDNode *N); - SDValue SoftenFloatOp_SELECT(SDNode *N); SDValue SoftenFloatOp_SELECT_CC(SDNode *N); SDValue SoftenFloatOp_SETCC(SDNode *N); SDValue SoftenFloatOp_STORE(SDNode *N, unsigned OpNo); @@ -715,6 +694,7 @@ private: bool ScalarizeVectorOperand(SDNode *N, unsigned OpNo); SDValue ScalarizeVecOp_BITCAST(SDNode *N); SDValue ScalarizeVecOp_UnaryOp(SDNode *N); + SDValue ScalarizeVecOp_UnaryOp_StrictFP(SDNode *N); SDValue ScalarizeVecOp_CONCAT_VECTORS(SDNode *N); SDValue ScalarizeVecOp_EXTRACT_VECTOR_ELT(SDNode *N); SDValue ScalarizeVecOp_VSELECT(SDNode *N); @@ -830,6 +810,7 @@ private: SDValue WidenVecRes_Ternary(SDNode *N); SDValue WidenVecRes_Binary(SDNode *N); SDValue WidenVecRes_BinaryCanTrap(SDNode *N); + SDValue WidenVecRes_BinaryWithExtraScalarOp(SDNode *N); SDValue WidenVecRes_StrictFP(SDNode *N); SDValue WidenVecRes_OverflowOp(SDNode *N, unsigned ResNo); SDValue WidenVecRes_Convert(SDNode *N); @@ -933,6 +914,8 @@ private: void SplitRes_SELECT_CC (SDNode *N, SDValue &Lo, SDValue &Hi); void SplitRes_UNDEF (SDNode *N, SDValue &Lo, SDValue &Hi); + void SplitVSETCC(const SDNode *N); + //===--------------------------------------------------------------------===// // Generic Expansion: LegalizeTypesGeneric.cpp //===--------------------------------------------------------------------===// diff --git a/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp b/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp index 943f63f46c47..5562f400b6e1 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp @@ -52,17 +52,11 @@ void DAGTypeLegalizer::ExpandRes_BITCAST(SDNode *N, SDValue &Lo, SDValue &Hi) { case TargetLowering::TypePromoteFloat: llvm_unreachable("Bitcast of a promotion-needing float should never need" "expansion"); - case TargetLowering::TypeSoftenFloat: { - // Expand the floating point operand only if it was converted to integers. - // Otherwise, it is a legal type like f128 that can be saved in a register. - auto SoftenedOp = GetSoftenedFloat(InOp); - if (isLegalInHWReg(SoftenedOp.getValueType())) - break; - SplitInteger(SoftenedOp, Lo, Hi); + case TargetLowering::TypeSoftenFloat: + SplitInteger(GetSoftenedFloat(InOp), Lo, Hi); Lo = DAG.getNode(ISD::BITCAST, dl, NOutVT, Lo); Hi = DAG.getNode(ISD::BITCAST, dl, NOutVT, Hi); return; - } case TargetLowering::TypeExpandInteger: case TargetLowering::TypeExpandFloat: { auto &DL = DAG.getDataLayout(); @@ -509,23 +503,6 @@ void DAGTypeLegalizer::SplitRes_MERGE_VALUES(SDNode *N, unsigned ResNo, GetSplitOp(Op, Lo, Hi); } -static std::pair SplitVSETCC(const SDNode *N, - SelectionDAG &DAG) { - SDLoc DL(N); - EVT LoVT, HiVT; - std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0)); - - // Split the inputs. 
- SDValue Lo, Hi, LL, LH, RL, RH; - std::tie(LL, LH) = DAG.SplitVectorOperand(N, 0); - std::tie(RL, RH) = DAG.SplitVectorOperand(N, 1); - - Lo = DAG.getNode(N->getOpcode(), DL, LoVT, LL, RL, N->getOperand(2)); - Hi = DAG.getNode(N->getOpcode(), DL, HiVT, LH, RH, N->getOperand(2)); - - return std::make_pair(Lo, Hi); -} - void DAGTypeLegalizer::SplitRes_SELECT(SDNode *N, SDValue &Lo, SDValue &Hi) { SDValue LL, LH, RL, RH, CL, CH; SDLoc dl(N); @@ -537,16 +514,25 @@ void DAGTypeLegalizer::SplitRes_SELECT(SDNode *N, SDValue &Lo, SDValue &Hi) { if (Cond.getValueType().isVector()) { if (SDValue Res = WidenVSELECTAndMask(N)) std::tie(CL, CH) = DAG.SplitVector(Res->getOperand(0), dl); - // It seems to improve code to generate two narrow SETCCs as opposed to - // splitting a wide result vector. - else if (Cond.getOpcode() == ISD::SETCC) - std::tie(CL, CH) = SplitVSETCC(Cond.getNode(), DAG); // Check if there are already splitted versions of the vector available and // use those instead of splitting the mask operand again. else if (getTypeAction(Cond.getValueType()) == TargetLowering::TypeSplitVector) GetSplitVector(Cond, CL, CH); - else + // It seems to improve code to generate two narrow SETCCs as opposed to + // splitting a wide result vector. + else if (Cond.getOpcode() == ISD::SETCC) { + // If the condition is a vXi1 vector, and the LHS of the setcc is a legal + // type and the setcc result type is the same vXi1, then leave the setcc + // alone. + EVT CondLHSVT = Cond.getOperand(0).getValueType(); + if (Cond.getValueType().getVectorElementType() == MVT::i1 && + isTypeLegal(CondLHSVT) && + getSetCCResultType(CondLHSVT) == Cond.getValueType()) + std::tie(CL, CH) = DAG.SplitVector(Cond, dl); + else + SplitVecRes_SETCC(Cond.getNode(), CL, CH); + } else std::tie(CL, CH) = DAG.SplitVector(Cond, dl); } diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp index 10b8b705869e..15c3a0b6cfad 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp @@ -38,6 +38,7 @@ #include "llvm/IR/DataLayout.h" #include "llvm/Support/Casting.h" #include "llvm/Support/Compiler.h" +#include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MachineValueType.h" #include "llvm/Support/MathExtras.h" @@ -333,14 +334,27 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) { case ISD::STRICT_FFLOOR: case ISD::STRICT_FROUND: case ISD::STRICT_FTRUNC: + case ISD::STRICT_FP_TO_SINT: + case ISD::STRICT_FP_TO_UINT: case ISD::STRICT_FP_ROUND: case ISD::STRICT_FP_EXTEND: - // These pseudo-ops get legalized as if they were their non-strict - // equivalent. For instance, if ISD::FSQRT is legal then ISD::STRICT_FSQRT - // is also legal, but if ISD::FSQRT requires expansion then so does - // ISD::STRICT_FSQRT. - Action = TLI.getStrictFPOperationAction(Node->getOpcode(), - Node->getValueType(0)); + Action = TLI.getOperationAction(Node->getOpcode(), Node->getValueType(0)); + // If we're asked to expand a strict vector floating-point operation, + // by default we're going to simply unroll it. That is usually the + // best approach, except in the case where the resulting strict (scalar) + // operations would themselves use the fallback mutation to non-strict. + // In that specific case, just do the fallback on the vector op. 
+ if (Action == TargetLowering::Expand && + TLI.getStrictFPOperationAction(Node->getOpcode(), + Node->getValueType(0)) + == TargetLowering::Legal) { + EVT EltVT = Node->getValueType(0).getVectorElementType(); + if (TLI.getOperationAction(Node->getOpcode(), EltVT) + == TargetLowering::Expand && + TLI.getStrictFPOperationAction(Node->getOpcode(), EltVT) + == TargetLowering::Legal) + Action = TargetLowering::Legal; + } break; case ISD::ADD: case ISD::SUB: @@ -439,16 +453,13 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) { break; case ISD::SMULFIX: case ISD::SMULFIXSAT: - case ISD::UMULFIX: { + case ISD::UMULFIX: + case ISD::UMULFIXSAT: { unsigned Scale = Node->getConstantOperandVal(2); Action = TLI.getFixedPointOperationAction(Node->getOpcode(), Node->getValueType(0), Scale); break; } - case ISD::FP_ROUND_INREG: - Action = TLI.getOperationAction(Node->getOpcode(), - cast(Node->getOperand(1))->getVT()); - break; case ISD::SINT_TO_FP: case ISD::UINT_TO_FP: case ISD::VECREDUCE_ADD: @@ -820,6 +831,13 @@ SDValue VectorLegalizer::Expand(SDValue Op) { case ISD::SMULFIX: case ISD::UMULFIX: return ExpandFixedPointMul(Op); + case ISD::SMULFIXSAT: + case ISD::UMULFIXSAT: + // FIXME: We do not expand SMULFIXSAT/UMULFIXSAT here yet, not sure exactly + // why. Maybe it results in worse codegen compared to the unroll for some + // targets? This should probably be investigated. And if we still prefer to + // unroll an explanation could be helpful. + return DAG.UnrollVectorOp(Op.getNode()); case ISD::STRICT_FADD: case ISD::STRICT_FSUB: case ISD::STRICT_FMUL: @@ -844,6 +862,8 @@ SDValue VectorLegalizer::Expand(SDValue Op) { case ISD::STRICT_FFLOOR: case ISD::STRICT_FROUND: case ISD::STRICT_FTRUNC: + case ISD::STRICT_FP_TO_SINT: + case ISD::STRICT_FP_TO_UINT: return ExpandStrictFPOp(Op); case ISD::VECREDUCE_ADD: case ISD::VECREDUCE_MUL: @@ -1168,9 +1188,13 @@ SDValue VectorLegalizer::ExpandABS(SDValue Op) { SDValue VectorLegalizer::ExpandFP_TO_UINT(SDValue Op) { // Attempt to expand using TargetLowering. - SDValue Result; - if (TLI.expandFP_TO_UINT(Op.getNode(), Result, DAG)) + SDValue Result, Chain; + if (TLI.expandFP_TO_UINT(Op.getNode(), Result, Chain, DAG)) { + if (Op.getNode()->isStrictFPOpcode()) + // Relink the chain + DAG.ReplaceAllUsesOfValueWith(Op.getValue(1), Chain); return Result; + } // Otherwise go ahead and unroll. 
return DAG.UnrollVectorOp(Op.getNode()); diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp index 7e4d52617977..3763e886cef2 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -52,7 +52,6 @@ void DAGTypeLegalizer::ScalarizeVectorResult(SDNode *N, unsigned ResNo) { case ISD::EXTRACT_SUBVECTOR: R = ScalarizeVecRes_EXTRACT_SUBVECTOR(N); break; case ISD::STRICT_FP_ROUND: R = ScalarizeVecRes_STRICT_FP_ROUND(N); break; case ISD::FP_ROUND: R = ScalarizeVecRes_FP_ROUND(N); break; - case ISD::FP_ROUND_INREG: R = ScalarizeVecRes_InregOp(N); break; case ISD::FPOWI: R = ScalarizeVecRes_FPOWI(N); break; case ISD::INSERT_VECTOR_ELT: R = ScalarizeVecRes_INSERT_VECTOR_ELT(N); break; case ISD::LOAD: R = ScalarizeVecRes_LOAD(cast(N));break; @@ -171,6 +170,8 @@ void DAGTypeLegalizer::ScalarizeVectorResult(SDNode *N, unsigned ResNo) { case ISD::STRICT_FFLOOR: case ISD::STRICT_FROUND: case ISD::STRICT_FTRUNC: + case ISD::STRICT_FP_TO_SINT: + case ISD::STRICT_FP_TO_UINT: case ISD::STRICT_FP_EXTEND: R = ScalarizeVecRes_StrictFPOp(N); break; @@ -185,6 +186,7 @@ void DAGTypeLegalizer::ScalarizeVectorResult(SDNode *N, unsigned ResNo) { case ISD::SMULFIX: case ISD::SMULFIXSAT: case ISD::UMULFIX: + case ISD::UMULFIXSAT: R = ScalarizeVecRes_MULFIX(N); break; } @@ -604,6 +606,10 @@ bool DAGTypeLegalizer::ScalarizeVectorOperand(SDNode *N, unsigned OpNo) { case ISD::UINT_TO_FP: Res = ScalarizeVecOp_UnaryOp(N); break; + case ISD::STRICT_FP_TO_SINT: + case ISD::STRICT_FP_TO_UINT: + Res = ScalarizeVecOp_UnaryOp_StrictFP(N); + break; case ISD::CONCAT_VECTORS: Res = ScalarizeVecOp_CONCAT_VECTORS(N); break; @@ -679,6 +685,23 @@ SDValue DAGTypeLegalizer::ScalarizeVecOp_UnaryOp(SDNode *N) { return DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), N->getValueType(0), Op); } +/// If the input is a vector that needs to be scalarized, it must be <1 x ty>. +/// Do the strict FP operation on the element instead. +SDValue DAGTypeLegalizer::ScalarizeVecOp_UnaryOp_StrictFP(SDNode *N) { + assert(N->getValueType(0).getVectorNumElements() == 1 && + "Unexpected vector type!"); + SDValue Elt = GetScalarizedVector(N->getOperand(1)); + SDValue Res = DAG.getNode(N->getOpcode(), SDLoc(N), + { N->getValueType(0).getScalarType(), MVT::Other }, + { N->getOperand(0), Elt }); + // Legalize the chain result - switch anything that used the old chain to + // use the new one. + ReplaceValueWith(SDValue(N, 1), Res.getValue(1)); + // Revectorize the result so the types line up with what the uses of this + // expression expect. + return DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), N->getValueType(0), Res); +} + /// The vectors to concatenate have length one - use a BUILD_VECTOR instead. 
SDValue DAGTypeLegalizer::ScalarizeVecOp_CONCAT_VECTORS(SDNode *N) { SmallVector Ops(N->getNumOperands()); @@ -828,7 +851,6 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) { case ISD::CONCAT_VECTORS: SplitVecRes_CONCAT_VECTORS(N, Lo, Hi); break; case ISD::EXTRACT_SUBVECTOR: SplitVecRes_EXTRACT_SUBVECTOR(N, Lo, Hi); break; case ISD::INSERT_SUBVECTOR: SplitVecRes_INSERT_SUBVECTOR(N, Lo, Hi); break; - case ISD::FP_ROUND_INREG: SplitVecRes_InregOp(N, Lo, Hi); break; case ISD::FPOWI: SplitVecRes_FPOWI(N, Lo, Hi); break; case ISD::FCOPYSIGN: SplitVecRes_FCOPYSIGN(N, Lo, Hi); break; case ISD::INSERT_VECTOR_ELT: SplitVecRes_INSERT_VECTOR_ELT(N, Lo, Hi); break; @@ -883,7 +905,9 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) { case ISD::FP_ROUND: case ISD::STRICT_FP_ROUND: case ISD::FP_TO_SINT: + case ISD::STRICT_FP_TO_SINT: case ISD::FP_TO_UINT: + case ISD::STRICT_FP_TO_UINT: case ISD::FRINT: case ISD::FROUND: case ISD::FSIN: @@ -977,6 +1001,7 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) { case ISD::SMULFIX: case ISD::SMULFIXSAT: case ISD::UMULFIX: + case ISD::UMULFIXSAT: SplitVecRes_MULFIX(N, Lo, Hi); break; } @@ -1560,10 +1585,14 @@ void DAGTypeLegalizer::SplitVecRes_MLOAD(MaskedLoadSDNode *MLD, // Split Mask operand SDValue MaskLo, MaskHi; - if (getTypeAction(Mask.getValueType()) == TargetLowering::TypeSplitVector) - GetSplitVector(Mask, MaskLo, MaskHi); - else - std::tie(MaskLo, MaskHi) = DAG.SplitVector(Mask, dl); + if (Mask.getOpcode() == ISD::SETCC) { + SplitVecRes_SETCC(Mask.getNode(), MaskLo, MaskHi); + } else { + if (getTypeAction(Mask.getValueType()) == TargetLowering::TypeSplitVector) + GetSplitVector(Mask, MaskLo, MaskHi); + else + std::tie(MaskLo, MaskHi) = DAG.SplitVector(Mask, dl); + } EVT MemoryVT = MLD->getMemoryVT(); EVT LoMemVT, HiMemVT; @@ -1622,10 +1651,14 @@ void DAGTypeLegalizer::SplitVecRes_MGATHER(MaskedGatherSDNode *MGT, // Split Mask operand SDValue MaskLo, MaskHi; - if (getTypeAction(Mask.getValueType()) == TargetLowering::TypeSplitVector) - GetSplitVector(Mask, MaskLo, MaskHi); - else - std::tie(MaskLo, MaskHi) = DAG.SplitVector(Mask, dl); + if (Mask.getOpcode() == ISD::SETCC) { + SplitVecRes_SETCC(Mask.getNode(), MaskLo, MaskHi); + } else { + if (getTypeAction(Mask.getValueType()) == TargetLowering::TypeSplitVector) + GetSplitVector(Mask, MaskLo, MaskHi); + else + std::tie(MaskLo, MaskHi) = DAG.SplitVector(Mask, dl); + } EVT MemoryVT = MGT->getMemoryVT(); EVT LoMemVT, HiMemVT; @@ -1651,11 +1684,11 @@ void DAGTypeLegalizer::SplitVecRes_MGATHER(MaskedGatherSDNode *MGT, SDValue OpsLo[] = {Ch, PassThruLo, MaskLo, Ptr, IndexLo, Scale}; Lo = DAG.getMaskedGather(DAG.getVTList(LoVT, MVT::Other), LoVT, dl, OpsLo, - MMO); + MMO, MGT->getIndexType()); SDValue OpsHi[] = {Ch, PassThruHi, MaskHi, Ptr, IndexHi, Scale}; Hi = DAG.getMaskedGather(DAG.getVTList(HiVT, MVT::Other), HiVT, dl, OpsHi, - MMO); + MMO, MGT->getIndexType()); // Build a factor node to remember that this load is independent of the // other one. 
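A standalone sketch of the idea behind the SplitVecRes_MLOAD/MGATHER changes above (not part of the patch, and deliberately independent of the SelectionDAG API): a wide masked gather is legalized by splitting its mask, index and pass-through vectors in half and issuing two half-width gathers whose results are concatenated. The helper names maskedGather and splitMaskedGather are hypothetical stand-ins for the Lo/Hi nodes the legalizer builds.

#include <cstddef>
#include <cstdint>
#include <initializer_list>
#include <vector>

// Reference semantics of a masked gather: for each lane I with Mask[I] set,
// load Base[Index[I]]; otherwise keep the pass-through value for that lane.
static std::vector<int32_t> maskedGather(const std::vector<int32_t> &Base,
                                         const std::vector<std::size_t> &Index,
                                         const std::vector<bool> &Mask,
                                         const std::vector<int32_t> &PassThru) {
  std::vector<int32_t> Res(PassThru);
  for (std::size_t I = 0; I < Index.size(); ++I)
    if (Mask[I])
      Res[I] = Base[Index[I]];
  return Res;
}

// Splitting: run the same operation on the low half and then the high half of
// the per-lane operands and concatenate the results, which is the shape of
// the Lo/Hi legalization performed in the hunks above.
static std::vector<int32_t> splitMaskedGather(const std::vector<int32_t> &Base,
                                              const std::vector<std::size_t> &Index,
                                              const std::vector<bool> &Mask,
                                              const std::vector<int32_t> &PassThru) {
  std::size_t Half = Index.size() / 2;
  std::vector<int32_t> Res;
  for (std::size_t Begin : {std::size_t(0), Half}) {
    std::size_t End = (Begin == 0) ? Half : Index.size();
    std::vector<std::size_t> Idx(Index.begin() + Begin, Index.begin() + End);
    std::vector<bool> M(Mask.begin() + Begin, Mask.begin() + End);
    std::vector<int32_t> PT(PassThru.begin() + Begin, PassThru.begin() + End);
    std::vector<int32_t> Part = maskedGather(Base, Idx, M, PT);
    Res.insert(Res.end(), Part.begin(), Part.end());
  }
  return Res;
}

When the mask comes straight from a SETCC, the hunks above now split it by re-legalizing the SETCC itself (SplitVecRes_SETCC) instead of splitting its already computed result; the overall Lo/Hi structure stays the same as in this sketch.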
@@ -1979,6 +2012,8 @@ bool DAGTypeLegalizer::SplitVectorOperand(SDNode *N, unsigned OpNo) { break; case ISD::FP_TO_SINT: case ISD::FP_TO_UINT: + case ISD::STRICT_FP_TO_SINT: + case ISD::STRICT_FP_TO_UINT: case ISD::CTTZ: case ISD::CTLZ: case ISD::CTPOP: @@ -2293,7 +2328,7 @@ SDValue DAGTypeLegalizer::SplitVecOp_MGATHER(MaskedGatherSDNode *MGT, SDValue OpsLo[] = {Ch, PassThruLo, MaskLo, Ptr, IndexLo, Scale}; SDValue Lo = DAG.getMaskedGather(DAG.getVTList(LoVT, MVT::Other), LoVT, dl, - OpsLo, MMO); + OpsLo, MMO, MGT->getIndexType()); MMO = DAG.getMachineFunction(). getMachineMemOperand(MGT->getPointerInfo(), @@ -2303,7 +2338,7 @@ SDValue DAGTypeLegalizer::SplitVecOp_MGATHER(MaskedGatherSDNode *MGT, SDValue OpsHi[] = {Ch, PassThruHi, MaskHi, Ptr, IndexHi, Scale}; SDValue Hi = DAG.getMaskedGather(DAG.getVTList(HiVT, MVT::Other), HiVT, dl, - OpsHi, MMO); + OpsHi, MMO, MGT->getIndexType()); // Build a factor node to remember that this load is independent of the // other one. @@ -2340,12 +2375,16 @@ SDValue DAGTypeLegalizer::SplitVecOp_MSTORE(MaskedStoreSDNode *N, else std::tie(DataLo, DataHi) = DAG.SplitVector(Data, DL); + // Split Mask operand SDValue MaskLo, MaskHi; - if (getTypeAction(Mask.getValueType()) == TargetLowering::TypeSplitVector) - // Split Mask operand - GetSplitVector(Mask, MaskLo, MaskHi); - else - std::tie(MaskLo, MaskHi) = DAG.SplitVector(Mask, DL); + if (OpNo == 1 && Mask.getOpcode() == ISD::SETCC) { + SplitVecRes_SETCC(Mask.getNode(), MaskLo, MaskHi); + } else { + if (getTypeAction(Mask.getValueType()) == TargetLowering::TypeSplitVector) + GetSplitVector(Mask, MaskLo, MaskHi); + else + std::tie(MaskLo, MaskHi) = DAG.SplitVector(Mask, DL); + } SDValue Lo, Hi; MachineMemOperand *MMO = DAG.getMachineFunction(). @@ -2397,12 +2436,16 @@ SDValue DAGTypeLegalizer::SplitVecOp_MSCATTER(MaskedScatterSDNode *N, else std::tie(DataLo, DataHi) = DAG.SplitVector(Data, DL); + // Split Mask operand SDValue MaskLo, MaskHi; - if (getTypeAction(Mask.getValueType()) == TargetLowering::TypeSplitVector) - // Split Mask operand - GetSplitVector(Mask, MaskLo, MaskHi); - else - std::tie(MaskLo, MaskHi) = DAG.SplitVector(Mask, DL); + if (OpNo == 1 && Mask.getOpcode() == ISD::SETCC) { + SplitVecRes_SETCC(Mask.getNode(), MaskLo, MaskHi); + } else { + if (getTypeAction(Mask.getValueType()) == TargetLowering::TypeSplitVector) + GetSplitVector(Mask, MaskLo, MaskHi); + else + std::tie(MaskLo, MaskHi) = DAG.SplitVector(Mask, DL); + } SDValue IndexHi, IndexLo; if (getTypeAction(Index.getValueType()) == TargetLowering::TypeSplitVector) @@ -2418,7 +2461,7 @@ SDValue DAGTypeLegalizer::SplitVecOp_MSCATTER(MaskedScatterSDNode *N, SDValue OpsLo[] = {Ch, DataLo, MaskLo, Ptr, IndexLo, Scale}; Lo = DAG.getMaskedScatter(DAG.getVTList(MVT::Other), DataLo.getValueType(), - DL, OpsLo, MMO); + DL, OpsLo, MMO, N->getIndexType()); MMO = DAG.getMachineFunction(). getMachineMemOperand(N->getPointerInfo(), @@ -2430,7 +2473,7 @@ SDValue DAGTypeLegalizer::SplitVecOp_MSCATTER(MaskedScatterSDNode *N, // after another. 
SDValue OpsHi[] = {Lo, DataHi, MaskHi, Ptr, IndexHi, Scale}; return DAG.getMaskedScatter(DAG.getVTList(MVT::Other), DataHi.getValueType(), - DL, OpsHi, MMO); + DL, OpsHi, MMO, N->getIndexType()); } SDValue DAGTypeLegalizer::SplitVecOp_STORE(StoreSDNode *N, unsigned OpNo) { @@ -2596,7 +2639,11 @@ SDValue DAGTypeLegalizer::SplitVecOp_VSETCC(SDNode *N) { LoRes = DAG.getNode(ISD::SETCC, DL, PartResVT, Lo0, Lo1, N->getOperand(2)); HiRes = DAG.getNode(ISD::SETCC, DL, PartResVT, Hi0, Hi1, N->getOperand(2)); SDValue Con = DAG.getNode(ISD::CONCAT_VECTORS, DL, WideResVT, LoRes, HiRes); - return PromoteTargetBoolean(Con, N->getValueType(0)); + + EVT OpVT = N->getOperand(0).getValueType(); + ISD::NodeType ExtendCode = + TargetLowering::getExtendForContent(TLI.getBooleanContents(OpVT)); + return DAG.getNode(ExtendCode, DL, N->getValueType(0), Con); } @@ -2663,7 +2710,6 @@ void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) { case ISD::BUILD_VECTOR: Res = WidenVecRes_BUILD_VECTOR(N); break; case ISD::CONCAT_VECTORS: Res = WidenVecRes_CONCAT_VECTORS(N); break; case ISD::EXTRACT_SUBVECTOR: Res = WidenVecRes_EXTRACT_SUBVECTOR(N); break; - case ISD::FP_ROUND_INREG: Res = WidenVecRes_InregOp(N); break; case ISD::INSERT_VECTOR_ELT: Res = WidenVecRes_INSERT_VECTOR_ELT(N); break; case ISD::LOAD: Res = WidenVecRes_LOAD(N); break; case ISD::SCALAR_TO_VECTOR: Res = WidenVecRes_SCALAR_TO_VECTOR(N); break; @@ -2719,6 +2765,15 @@ void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) { Res = WidenVecRes_BinaryCanTrap(N); break; + case ISD::SMULFIX: + case ISD::SMULFIXSAT: + case ISD::UMULFIX: + case ISD::UMULFIXSAT: + // These are binary operations, but with an extra operand that shouldn't + // be widened (the scale). + Res = WidenVecRes_BinaryWithExtraScalarOp(N); + break; + case ISD::STRICT_FADD: case ISD::STRICT_FSUB: case ISD::STRICT_FMUL: @@ -2790,6 +2845,8 @@ void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) { case ISD::STRICT_FP_EXTEND: case ISD::STRICT_FP_ROUND: + case ISD::STRICT_FP_TO_SINT: + case ISD::STRICT_FP_TO_UINT: Res = WidenVecRes_Convert_StrictFP(N); break; @@ -2866,6 +2923,17 @@ SDValue DAGTypeLegalizer::WidenVecRes_Binary(SDNode *N) { return DAG.getNode(N->getOpcode(), dl, WidenVT, InOp1, InOp2, N->getFlags()); } +SDValue DAGTypeLegalizer::WidenVecRes_BinaryWithExtraScalarOp(SDNode *N) { + // Binary op widening, but with an extra operand that shouldn't be widened. + SDLoc dl(N); + EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); + SDValue InOp1 = GetWidenedVector(N->getOperand(0)); + SDValue InOp2 = GetWidenedVector(N->getOperand(1)); + SDValue InOp3 = N->getOperand(2); + return DAG.getNode(N->getOpcode(), dl, WidenVT, InOp1, InOp2, InOp3, + N->getFlags()); +} + // Given a vector of operations that have been broken up to widen, see // if we can collect them together into the next widest legal VT. This // implementation is trap-safe. @@ -3716,7 +3784,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_MGATHER(MaskedGatherSDNode *N) { Scale }; SDValue Res = DAG.getMaskedGather(DAG.getVTList(WideVT, MVT::Other), N->getMemoryVT(), dl, Ops, - N->getMemOperand()); + N->getMemOperand(), N->getIndexType()); // Legalize the chain result - switch anything that used the old chain to // use the new one. 
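Several hunks in this file, and in the integer-expansion code earlier in the patch, add ISD::UMULFIXSAT alongside the existing SMULFIX/SMULFIXSAT/UMULFIX handling. As a reminder of the semantics those nodes implement, here is a minimal standalone sketch (not LLVM code; the function name umulfixsat16 is made up) of an unsigned saturating fixed-point multiply on a 16-bit type: multiply in double width, shift right by the scale, then clamp to the unsigned maximum instead of wrapping.

#include <cstdint>
#include <limits>

static uint16_t umulfixsat16(uint16_t A, uint16_t B, unsigned Scale) {
  // Scale may be 0..16 for a 16-bit type. With Scale == 16 both operands are
  // pure fractions, so the product can never exceed the representable range,
  // matching the "saturation can't happen with Scale==VTSize" note above.
  uint32_t Prod = uint32_t(A) * uint32_t(B);      // full-width product
  uint32_t Shifted = Prod >> Scale;               // drop the fractional bits
  uint32_t Max = std::numeric_limits<uint16_t>::max();
  return uint16_t(Shifted > Max ? Max : Shifted); // saturate, don't wrap
}

// With Scale == 8 (8 integer bits, 8 fraction bits), 3.5 * 2.0 == 7.0:
//   umulfixsat16(0x0380, 0x0200, 8) == 0x0700
// whereas 200.0 * 200.0 overflows the 8.8 format and saturates to 0xFFFF.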
@@ -4094,7 +4162,9 @@ bool DAGTypeLegalizer::WidenVectorOperand(SDNode *N, unsigned OpNo) { case ISD::FP_EXTEND: case ISD::STRICT_FP_EXTEND: case ISD::FP_TO_SINT: + case ISD::STRICT_FP_TO_SINT: case ISD::FP_TO_UINT: + case ISD::STRICT_FP_TO_UINT: case ISD::SINT_TO_FP: case ISD::UINT_TO_FP: case ISD::TRUNCATE: @@ -4434,7 +4504,7 @@ SDValue DAGTypeLegalizer::WidenVecOp_MGATHER(SDNode *N, unsigned OpNo) { SDValue Ops[] = {MG->getChain(), DataOp, Mask, MG->getBasePtr(), Index, Scale}; SDValue Res = DAG.getMaskedGather(MG->getVTList(), MG->getMemoryVT(), dl, Ops, - MG->getMemOperand()); + MG->getMemOperand(), MG->getIndexType()); ReplaceValueWith(SDValue(N, 1), Res.getValue(1)); ReplaceValueWith(SDValue(N, 0), Res.getValue(0)); return SDValue(); @@ -4472,7 +4542,7 @@ SDValue DAGTypeLegalizer::WidenVecOp_MSCATTER(SDNode *N, unsigned OpNo) { Scale}; return DAG.getMaskedScatter(DAG.getVTList(MVT::Other), MSC->getMemoryVT(), SDLoc(N), Ops, - MSC->getMemOperand()); + MSC->getMemOperand(), MSC->getIndexType()); } SDValue DAGTypeLegalizer::WidenVecOp_SETCC(SDNode *N) { @@ -4504,7 +4574,10 @@ SDValue DAGTypeLegalizer::WidenVecOp_SETCC(SDNode *N) { ISD::EXTRACT_SUBVECTOR, dl, ResVT, WideSETCC, DAG.getConstant(0, dl, TLI.getVectorIdxTy(DAG.getDataLayout()))); - return PromoteTargetBoolean(CC, VT); + EVT OpVT = N->getOperand(0).getValueType(); + ISD::NodeType ExtendCode = + TargetLowering::getExtendForContent(TLI.getBooleanContents(OpVT)); + return DAG.getNode(ExtendCode, dl, VT, CC); } SDValue DAGTypeLegalizer::WidenVecOp_VECREDUCE(SDNode *N) { @@ -4706,7 +4779,7 @@ SDValue DAGTypeLegalizer::GenWidenVectorLoads(SmallVectorImpl &LdChain, int LdWidth = LdVT.getSizeInBits(); int WidthDiff = WidenWidth - LdWidth; - unsigned LdAlign = LD->isVolatile() ? 0 : Align; // Allow wider loads. + unsigned LdAlign = (!LD->isSimple()) ? 0 : Align; // Allow wider loads. // Find the vector type that can load from. EVT NewVT = FindMemType(DAG, TLI, LdWidth, WidenVT, LdAlign, WidthDiff); diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp b/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp index 2cb850fa1a3d..7ee44c808fcb 100644 --- a/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp +++ b/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp @@ -498,7 +498,7 @@ bool ScheduleDAGFast::DelayForLiveRegsBottomUp(SUnit *SU, // Check for def of register or earlyclobber register. for (; NumVals; --NumVals, ++i) { unsigned Reg = cast(Node->getOperand(i))->getReg(); - if (TargetRegisterInfo::isPhysicalRegister(Reg)) + if (Register::isPhysicalRegister(Reg)) CheckForLiveRegDef(SU, Reg, LiveRegDefs, RegAdded, LRegs, TRI); } } else diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp b/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp index 34b4c8502353..ff806bdb822c 100644 --- a/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp +++ b/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp @@ -1188,6 +1188,10 @@ SUnit *ScheduleDAGRRList::CopyAndMoveSuccessors(SUnit *SU) { if (!Pred.isArtificial()) AddPredQueued(NewSU, Pred); + // Make sure the clone comes after the original. (InstrEmitter assumes + // this ordering.) + AddPredQueued(NewSU, SDep(SU, SDep::Artificial)); + // Only copy scheduled successors. Cut them from old node's successor // list and move them over. SmallVector, 4> DelDeps; @@ -1374,7 +1378,7 @@ DelayForLiveRegsBottomUp(SUnit *SU, SmallVectorImpl &LRegs) { // Check for def of register or earlyclobber register. 
for (; NumVals; --NumVals, ++i) { unsigned Reg = cast(Node->getOperand(i))->getReg(); - if (TargetRegisterInfo::isPhysicalRegister(Reg)) + if (Register::isPhysicalRegister(Reg)) CheckForLiveRegDef(SU, Reg, LiveRegDefs.get(), RegAdded, LRegs, TRI); } } else @@ -2358,7 +2362,7 @@ static bool hasOnlyLiveInOpers(const SUnit *SU) { PredSU->getNode()->getOpcode() == ISD::CopyFromReg) { unsigned Reg = cast(PredSU->getNode()->getOperand(1))->getReg(); - if (TargetRegisterInfo::isVirtualRegister(Reg)) { + if (Register::isVirtualRegister(Reg)) { RetVal = true; continue; } @@ -2379,7 +2383,7 @@ static bool hasOnlyLiveOutUses(const SUnit *SU) { if (SuccSU->getNode() && SuccSU->getNode()->getOpcode() == ISD::CopyToReg) { unsigned Reg = cast(SuccSU->getNode()->getOperand(1))->getReg(); - if (TargetRegisterInfo::isVirtualRegister(Reg)) { + if (Register::isVirtualRegister(Reg)) { RetVal = true; continue; } @@ -2948,8 +2952,8 @@ void RegReductionPQBase::PrescheduleNodesWithMultipleUses() { // like other nodes from the perspective of scheduling heuristics. if (SDNode *N = SU.getNode()) if (N->getOpcode() == ISD::CopyToReg && - TargetRegisterInfo::isVirtualRegister - (cast(N->getOperand(1))->getReg())) + Register::isVirtualRegister( + cast(N->getOperand(1))->getReg())) continue; SDNode *PredFrameSetup = nullptr; @@ -2995,8 +2999,8 @@ void RegReductionPQBase::PrescheduleNodesWithMultipleUses() { // like other nodes from the perspective of scheduling heuristics. if (SDNode *N = SU.getNode()) if (N->getOpcode() == ISD::CopyFromReg && - TargetRegisterInfo::isVirtualRegister - (cast(N->getOperand(1))->getReg())) + Register::isVirtualRegister( + cast(N->getOperand(1))->getReg())) continue; // Perform checks on the successors of PredSU. diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp b/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp index 568c6191e512..d4c1fb36475e 100644 --- a/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp +++ b/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp @@ -115,7 +115,7 @@ static void CheckForPhysRegDependency(SDNode *Def, SDNode *User, unsigned Op, return; unsigned Reg = cast(User->getOperand(1))->getReg(); - if (TargetRegisterInfo::isVirtualRegister(Reg)) + if (Register::isVirtualRegister(Reg)) return; unsigned ResNo = User->getOperand(2).getResNo(); @@ -528,7 +528,7 @@ void ScheduleDAGSDNodes::AddSchedEdges() { /// are input. This SUnit graph is similar to the SelectionDAG, but /// excludes nodes that aren't interesting to scheduling, and represents /// glued together nodes with a single SUnit. -void ScheduleDAGSDNodes::BuildSchedGraph(AliasAnalysis *AA) { +void ScheduleDAGSDNodes::BuildSchedGraph(AAResults *AA) { // Cluster certain nodes which should be scheduled together. ClusterNodes(); // Populate the SUnits array. @@ -656,7 +656,7 @@ void ScheduleDAGSDNodes::computeOperandLatency(SDNode *Def, SDNode *Use, if (Latency > 1 && Use->getOpcode() == ISD::CopyToReg && !BB->succ_empty()) { unsigned Reg = cast(Use->getOperand(1))->getReg(); - if (TargetRegisterInfo::isVirtualRegister(Reg)) + if (Register::isVirtualRegister(Reg)) // This copy is a liveout value. It is likely coalesced, so reduce the // latency so not to penalize the def. // FIXME: need target specific adjustment here? @@ -808,7 +808,7 @@ EmitPhysRegCopy(SUnit *SU, DenseMap &VRBaseMap, } else { // Copy from physical register. 
assert(I->getReg() && "Unknown physical register!"); - unsigned VRBase = MRI.createVirtualRegister(SU->CopyDstRC); + Register VRBase = MRI.createVirtualRegister(SU->CopyDstRC); bool isNew = VRBaseMap.insert(std::make_pair(SU, VRBase)).second; (void)isNew; // Silence compiler warning. assert(isNew && "Node emitted out of order - early"); @@ -909,6 +909,12 @@ EmitSchedule(MachineBasicBlock::iterator &InsertPos) { // Remember the source order of the inserted instruction. if (HasDbg) ProcessSourceNode(N, DAG, Emitter, VRBaseMap, Orders, Seen, NewInsn); + + if (MDNode *MD = DAG->getHeapAllocSite(N)) { + if (NewInsn && NewInsn->isCall()) + MF.addCodeViewHeapAllocSite(NewInsn, MD); + } + GluedNodes.pop_back(); } auto NewInsn = @@ -917,6 +923,10 @@ EmitSchedule(MachineBasicBlock::iterator &InsertPos) { if (HasDbg) ProcessSourceNode(SU->getNode(), DAG, Emitter, VRBaseMap, Orders, Seen, NewInsn); + if (MDNode *MD = DAG->getHeapAllocSite(SU->getNode())) { + if (NewInsn && NewInsn->isCall()) + MF.addCodeViewHeapAllocSite(NewInsn, MD); + } } // Insert all the dbg_values which have not already been inserted in source diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.h b/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.h index 5163b4fa4fd3..183ce4b0652d 100644 --- a/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.h +++ b/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.h @@ -26,6 +26,7 @@ namespace llvm { +class AAResults; class InstrItineraryData; /// ScheduleDAGSDNodes - A ScheduleDAG for scheduling SDNode-based DAGs. @@ -93,7 +94,7 @@ class InstrItineraryData; /// are input. This SUnit graph is similar to the SelectionDAG, but /// excludes nodes that aren't interesting to scheduling, and represents /// flagged together nodes with a single SUnit. - void BuildSchedGraph(AliasAnalysis *AA); + void BuildSchedGraph(AAResults *AA); /// InitNumRegDefsLeft - Determine the # of regs defined by this node. /// diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGVLIW.cpp b/lib/CodeGen/SelectionDAG/ScheduleDAGVLIW.cpp index ab06b55b49fd..e7bac73678a7 100644 --- a/lib/CodeGen/SelectionDAG/ScheduleDAGVLIW.cpp +++ b/lib/CodeGen/SelectionDAG/ScheduleDAGVLIW.cpp @@ -63,14 +63,13 @@ private: /// HazardRec - The hazard recognizer to use. ScheduleHazardRecognizer *HazardRec; - /// AA - AliasAnalysis for making memory reference queries. - AliasAnalysis *AA; + /// AA - AAResults for making memory reference queries. 
+ AAResults *AA; public: - ScheduleDAGVLIW(MachineFunction &mf, - AliasAnalysis *aa, + ScheduleDAGVLIW(MachineFunction &mf, AAResults *aa, SchedulingPriorityQueue *availqueue) - : ScheduleDAGSDNodes(mf), AvailableQueue(availqueue), AA(aa) { + : ScheduleDAGSDNodes(mf), AvailableQueue(availqueue), AA(aa) { const TargetSubtargetInfo &STI = mf.getSubtarget(); HazardRec = STI.getInstrInfo()->CreateTargetHazardRecognizer(&STI, this); } diff --git a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 5852e693fa9f..52a71b91d93f 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -859,9 +859,8 @@ bool SelectionDAG::RemoveNodeFromCSEMaps(SDNode *N) { break; case ISD::TargetExternalSymbol: { ExternalSymbolSDNode *ESN = cast(N); - Erased = TargetExternalSymbols.erase( - std::pair(ESN->getSymbol(), - ESN->getTargetFlags())); + Erased = TargetExternalSymbols.erase(std::pair( + ESN->getSymbol(), ESN->getTargetFlags())); break; } case ISD::MCSymbol: { @@ -1084,6 +1083,7 @@ void SelectionDAG::clear() { ExternalSymbols.clear(); TargetExternalSymbols.clear(); MCSymbols.clear(); + SDCallSiteDbgInfo.clear(); std::fill(CondCodeNodes.begin(), CondCodeNodes.end(), static_cast(nullptr)); std::fill(ValueTypeNodes.begin(), ValueTypeNodes.end(), @@ -1353,7 +1353,7 @@ SDValue SelectionDAG::getConstantFP(double Val, const SDLoc &DL, EVT VT, SDValue SelectionDAG::getGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t Offset, bool isTargetGA, - unsigned char TargetFlags) { + unsigned TargetFlags) { assert((TargetFlags == 0 || isTargetGA) && "Cannot set target flags on target-independent globals"); @@ -1400,7 +1400,7 @@ SDValue SelectionDAG::getFrameIndex(int FI, EVT VT, bool isTarget) { } SDValue SelectionDAG::getJumpTable(int JTI, EVT VT, bool isTarget, - unsigned char TargetFlags) { + unsigned TargetFlags) { assert((TargetFlags == 0 || isTarget) && "Cannot set target flags on target-independent jump tables"); unsigned Opc = isTarget ? 
ISD::TargetJumpTable : ISD::JumpTable; @@ -1421,7 +1421,7 @@ SDValue SelectionDAG::getJumpTable(int JTI, EVT VT, bool isTarget, SDValue SelectionDAG::getConstantPool(const Constant *C, EVT VT, unsigned Alignment, int Offset, bool isTarget, - unsigned char TargetFlags) { + unsigned TargetFlags) { assert((TargetFlags == 0 || isTarget) && "Cannot set target flags on target-independent globals"); if (Alignment == 0) @@ -1449,7 +1449,7 @@ SDValue SelectionDAG::getConstantPool(const Constant *C, EVT VT, SDValue SelectionDAG::getConstantPool(MachineConstantPoolValue *C, EVT VT, unsigned Alignment, int Offset, bool isTarget, - unsigned char TargetFlags) { + unsigned TargetFlags) { assert((TargetFlags == 0 || isTarget) && "Cannot set target flags on target-independent globals"); if (Alignment == 0) @@ -1473,7 +1473,7 @@ SDValue SelectionDAG::getConstantPool(MachineConstantPoolValue *C, EVT VT, } SDValue SelectionDAG::getTargetIndex(int Index, EVT VT, int64_t Offset, - unsigned char TargetFlags) { + unsigned TargetFlags) { FoldingSetNodeID ID; AddNodeIDNode(ID, ISD::TargetIndex, getVTList(VT), None); ID.AddInteger(Index); @@ -1535,10 +1535,9 @@ SDValue SelectionDAG::getMCSymbol(MCSymbol *Sym, EVT VT) { } SDValue SelectionDAG::getTargetExternalSymbol(const char *Sym, EVT VT, - unsigned char TargetFlags) { + unsigned TargetFlags) { SDNode *&N = - TargetExternalSymbols[std::pair(Sym, - TargetFlags)]; + TargetExternalSymbols[std::pair(Sym, TargetFlags)]; if (N) return SDValue(N, 0); N = newSDNode(true, Sym, TargetFlags, VT); InsertNode(N); @@ -1802,9 +1801,8 @@ SDValue SelectionDAG::getLabelNode(unsigned Opcode, const SDLoc &dl, } SDValue SelectionDAG::getBlockAddress(const BlockAddress *BA, EVT VT, - int64_t Offset, - bool isTarget, - unsigned char TargetFlags) { + int64_t Offset, bool isTarget, + unsigned TargetFlags) { unsigned Opc = isTarget ? ISD::TargetBlockAddress : ISD::BlockAddress; FoldingSetNodeID ID; @@ -1900,20 +1898,19 @@ SDValue SelectionDAG::expandVAArg(SDNode *Node) { EVT VT = Node->getValueType(0); SDValue Tmp1 = Node->getOperand(0); SDValue Tmp2 = Node->getOperand(1); - unsigned Align = Node->getConstantOperandVal(3); + const MaybeAlign MA(Node->getConstantOperandVal(3)); SDValue VAListLoad = getLoad(TLI.getPointerTy(getDataLayout()), dl, Tmp1, Tmp2, MachinePointerInfo(V)); SDValue VAList = VAListLoad; - if (Align > TLI.getMinStackArgumentAlignment()) { - assert(((Align & (Align-1)) == 0) && "Expected Align to be a power of 2"); - + if (MA && *MA > TLI.getMinStackArgumentAlignment()) { VAList = getNode(ISD::ADD, dl, VAList.getValueType(), VAList, - getConstant(Align - 1, dl, VAList.getValueType())); + getConstant(MA->value() - 1, dl, VAList.getValueType())); - VAList = getNode(ISD::AND, dl, VAList.getValueType(), VAList, - getConstant(-(int64_t)Align, dl, VAList.getValueType())); + VAList = + getNode(ISD::AND, dl, VAList.getValueType(), VAList, + getConstant(-(int64_t)MA->value(), dl, VAList.getValueType())); } // Increment the pointer, VAList, to the next vaarg @@ -2154,12 +2151,9 @@ SDValue SelectionDAG::GetDemandedBits(SDValue V, const APInt &DemandedBits, } case ISD::OR: case ISD::XOR: - // If the LHS or RHS don't contribute bits to the or, drop them. 
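[Editor's aside, not part of the patch] The expandVAArg hunk above switches the raw alignment integer to MaybeAlign but keeps the same rounding trick: add align-1, then mask with -align to round the va_list pointer up to the next aligned boundary. A small standalone sketch of that arithmetic on plain integers (not SelectionDAG nodes), assuming a power-of-two alignment as MaybeAlign guarantees:

  #include <cassert>
  #include <cstdint>
  #include <cstdio>

  // Round Addr up to the next multiple of Align: (Addr + Align - 1) & -Align.
  // For a power of two, ~(Align - 1) has the same bit pattern as -(int64_t)Align.
  static uint64_t alignUp(uint64_t Addr, uint64_t Align) {
    assert(Align && (Align & (Align - 1)) == 0 && "alignment must be a power of 2");
    return (Addr + Align - 1) & ~(Align - 1);
  }

  int main() {
    std::printf("%llu\n", (unsigned long long)alignUp(13, 8));  // 16
    std::printf("%llu\n", (unsigned long long)alignUp(16, 8));  // 16, already aligned
    return 0;
  }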
- if (MaskedValueIsZero(V.getOperand(0), DemandedBits)) - return V.getOperand(1); - if (MaskedValueIsZero(V.getOperand(1), DemandedBits)) - return V.getOperand(0); - break; + case ISD::SIGN_EXTEND_INREG: + return TLI->SimplifyMultipleUseDemandedBits(V, DemandedBits, DemandedElts, + *this, 0); case ISD::SRL: // Only look at single-use SRLs. if (!V.getNode()->hasOneUse()) @@ -2203,15 +2197,6 @@ SDValue SelectionDAG::GetDemandedBits(SDValue V, const APInt &DemandedBits, return getNode(ISD::ANY_EXTEND, SDLoc(V), V.getValueType(), DemandedSrc); break; } - case ISD::SIGN_EXTEND_INREG: - EVT ExVT = cast(V.getOperand(1))->getVT(); - unsigned ExVTBits = ExVT.getScalarSizeInBits(); - - // If none of the extended bits are demanded, eliminate the sextinreg. - if (DemandedBits.getActiveBits() <= ExVTBits) - return V.getOperand(0); - - break; } return SDValue(); } @@ -2395,15 +2380,39 @@ SDValue SelectionDAG::getSplatValue(SDValue V) { /// If a SHL/SRA/SRL node has a constant or splat constant shift amount that /// is less than the element bit-width of the shift node, return it. static const APInt *getValidShiftAmountConstant(SDValue V) { + unsigned BitWidth = V.getScalarValueSizeInBits(); if (ConstantSDNode *SA = isConstOrConstSplat(V.getOperand(1))) { // Shifting more than the bitwidth is not valid. const APInt &ShAmt = SA->getAPIntValue(); - if (ShAmt.ult(V.getScalarValueSizeInBits())) + if (ShAmt.ult(BitWidth)) return &ShAmt; } return nullptr; } +/// If a SHL/SRA/SRL node has constant vector shift amounts that are all less +/// than the element bit-width of the shift node, return the minimum value. +static const APInt *getValidMinimumShiftAmountConstant(SDValue V) { + unsigned BitWidth = V.getScalarValueSizeInBits(); + auto *BV = dyn_cast(V.getOperand(1)); + if (!BV) + return nullptr; + const APInt *MinShAmt = nullptr; + for (unsigned i = 0, e = BV->getNumOperands(); i != e; ++i) { + auto *SA = dyn_cast(BV->getOperand(i)); + if (!SA) + return nullptr; + // Shifting more than the bitwidth is not valid. + const APInt &ShAmt = SA->getAPIntValue(); + if (ShAmt.uge(BitWidth)) + return nullptr; + if (MinShAmt && MinShAmt->ule(ShAmt)) + continue; + MinShAmt = &ShAmt; + } + return MinShAmt; +} + /// Determine which bits of Op are known to be either zero or one and return /// them in Known. For vectors, the known bits are those that are shared by /// every vector element. @@ -2437,7 +2446,7 @@ KnownBits SelectionDAG::computeKnownBits(SDValue Op, const APInt &DemandedElts, return Known; } - if (Depth == 6) + if (Depth >= MaxRecursionDepth) return Known; // Limit search depth. KnownBits Known2; @@ -2582,14 +2591,13 @@ KnownBits SelectionDAG::computeKnownBits(SDValue Op, const APInt &DemandedElts, SDValue Src = Op.getOperand(0); ConstantSDNode *SubIdx = dyn_cast(Op.getOperand(1)); unsigned NumSrcElts = Src.getValueType().getVectorNumElements(); + APInt DemandedSrc = APInt::getAllOnesValue(NumSrcElts); if (SubIdx && SubIdx->getAPIntValue().ule(NumSrcElts - NumElts)) { // Offset the demanded elts by the subvector index. 
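[Editor's aside, not part of the patch] The new getValidMinimumShiftAmountConstant above walks a constant vector of per-lane shift amounts, gives up on any non-constant or out-of-range lane, and returns the smallest amount, so a caller such as the SRL known-bits case can mark that many high result bits as zero. A standalone sketch of the same scan over a plain array, with std::optional standing in for the "nothing is known" result:

  #include <cstdio>
  #include <optional>
  #include <vector>

  // Returns the minimum per-lane shift amount if every lane is a known constant
  // strictly smaller than BitWidth; otherwise returns nullopt.
  static std::optional<unsigned>
  minValidShiftAmount(const std::vector<std::optional<unsigned>> &Lanes,
                      unsigned BitWidth) {
    std::optional<unsigned> Min;
    for (const auto &L : Lanes) {
      if (!L || *L >= BitWidth)        // unknown lane or oversized shift: give up
        return std::nullopt;
      if (!Min || *L < *Min)
        Min = *L;
    }
    return Min;
  }

  int main() {
    // For a v4i32 logical right shift by <4, 7, 4, 12>, at least the top 4 bits
    // of every result lane are known zero.
    std::vector<std::optional<unsigned>> Amts = {4, 7, 4, 12};
    if (auto Min = minValidShiftAmount(Amts, 32))
      std::printf("known zero high bits: %u\n", *Min);   // prints 4
    return 0;
  }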
uint64_t Idx = SubIdx->getZExtValue(); - APInt DemandedSrc = DemandedElts.zextOrSelf(NumSrcElts).shl(Idx); - Known = computeKnownBits(Src, DemandedSrc, Depth + 1); - } else { - Known = computeKnownBits(Src, Depth + 1); + DemandedSrc = DemandedElts.zextOrSelf(NumSrcElts).shl(Idx); } + Known = computeKnownBits(Src, DemandedSrc, Depth + 1); break; } case ISD::SCALAR_TO_VECTOR: { @@ -2800,25 +2808,9 @@ KnownBits SelectionDAG::computeKnownBits(SDValue Op, const APInt &DemandedElts, Known.One.lshrInPlace(Shift); // High bits are known zero. Known.Zero.setHighBits(Shift); - } else if (auto *BV = dyn_cast(Op.getOperand(1))) { - // If the shift amount is a vector of constants see if we can bound - // the number of upper zero bits. - unsigned ShiftAmountMin = BitWidth; - for (unsigned i = 0; i != BV->getNumOperands(); ++i) { - if (auto *C = dyn_cast(BV->getOperand(i))) { - const APInt &ShAmt = C->getAPIntValue(); - if (ShAmt.ult(BitWidth)) { - ShiftAmountMin = std::min(ShiftAmountMin, - ShAmt.getZExtValue()); - continue; - } - } - // Don't know anything. - ShiftAmountMin = 0; - break; - } - - Known.Zero.setHighBits(ShiftAmountMin); + } else if (const APInt *ShMinAmt = getValidMinimumShiftAmountConstant(Op)) { + // Minimum shift high bits are known zero. + Known.Zero.setHighBits(ShMinAmt->getZExtValue()); } break; case ISD::SRA: @@ -3105,12 +3097,12 @@ KnownBits SelectionDAG::computeKnownBits(SDValue Op, const APInt &DemandedElts, // If the first operand is non-negative or has all low bits zero, then // the upper bits are all zero. - if (Known2.Zero[BitWidth-1] || ((Known2.Zero & LowBits) == LowBits)) + if (Known2.isNonNegative() || LowBits.isSubsetOf(Known2.Zero)) Known.Zero |= ~LowBits; // If the first operand is negative and not all low bits are zero, then // the upper bits are all one. - if (Known2.One[BitWidth-1] && ((Known2.One & LowBits) != 0)) + if (Known2.isNegative() && LowBits.intersects(Known2.One)) Known.One |= ~LowBits; assert((Known.Zero & Known.One) == 0&&"Bits known to be one AND zero?"); } @@ -3427,7 +3419,7 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, const APInt &DemandedElts, return Val.getNumSignBits(); } - if (Depth == 6) + if (Depth >= MaxRecursionDepth) return 1; // Limit search depth. if (!DemandedElts) @@ -3729,6 +3721,18 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, const APInt &DemandedElts, Tmp = ComputeNumSignBits(Op.getOperand(0), Depth+1); if (Tmp == 1) return 1; // Early out. return std::min(Tmp, Tmp2)-1; + case ISD::MUL: { + // The output of the Mul can be at most twice the valid bits in the inputs. + unsigned SignBitsOp0 = ComputeNumSignBits(Op.getOperand(0), Depth + 1); + if (SignBitsOp0 == 1) + break; + unsigned SignBitsOp1 = ComputeNumSignBits(Op.getOperand(1), Depth + 1); + if (SignBitsOp1 == 1) + break; + unsigned OutValidBits = + (VTBits - SignBitsOp0 + 1) + (VTBits - SignBitsOp1 + 1); + return OutValidBits > VTBits ? 1 : VTBits - OutValidBits + 1; + } case ISD::TRUNCATE: { // Check if the sign bits of source go down as far as the truncated value. unsigned NumSrcBits = Op.getOperand(0).getScalarValueSizeInBits(); @@ -3817,13 +3821,13 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, const APInt &DemandedElts, SDValue Src = Op.getOperand(0); ConstantSDNode *SubIdx = dyn_cast(Op.getOperand(1)); unsigned NumSrcElts = Src.getValueType().getVectorNumElements(); + APInt DemandedSrc = APInt::getAllOnesValue(NumSrcElts); if (SubIdx && SubIdx->getAPIntValue().ule(NumSrcElts - NumElts)) { // Offset the demanded elts by the subvector index. 
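[Editor's aside, not part of the patch] The new ISD::MUL case in ComputeNumSignBits above relies on the fact that a signed product needs at most the sum of its operands' "valid" (non-sign-duplicate) bits: with VTBits total bits and S known sign bits, an operand contributes VTBits - S + 1 valid bits, and whatever width is left over must be sign bits of the result. A standalone sketch of that arithmetic:

  #include <cstdio>

  // Conservative sign-bit count for a product, given the operands' sign-bit
  // counts at width VTBits (same arithmetic as the ISD::MUL case above).
  static unsigned signBitsOfMul(unsigned VTBits, unsigned SignBits0,
                                unsigned SignBits1) {
    if (SignBits0 == 1 || SignBits1 == 1)
      return 1;                                 // nothing useful known
    // An operand with S sign bits occupies at most VTBits - S + 1 valid bits,
    // and the product needs at most the sum of its operands' valid bits.
    unsigned OutValidBits = (VTBits - SignBits0 + 1) + (VTBits - SignBits1 + 1);
    return OutValidBits > VTBits ? 1 : VTBits - OutValidBits + 1;
  }

  int main() {
    // Two i32 values that are really sign-extended i8s (25 sign bits each):
    // their product fits in 16 bits, so at least 17 sign bits survive.
    std::printf("%u\n", signBitsOfMul(32, 25, 25));  // prints 17
    return 0;
  }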
uint64_t Idx = SubIdx->getZExtValue(); - APInt DemandedSrc = DemandedElts.zextOrSelf(NumSrcElts).shl(Idx); - return ComputeNumSignBits(Src, DemandedSrc, Depth + 1); + DemandedSrc = DemandedElts.zextOrSelf(NumSrcElts).shl(Idx); } - return ComputeNumSignBits(Src, Depth + 1); + return ComputeNumSignBits(Src, DemandedSrc, Depth + 1); } case ISD::CONCAT_VECTORS: { // Determine the minimum number of sign bits across all demanded @@ -3976,7 +3980,7 @@ bool SelectionDAG::isKnownNeverNaN(SDValue Op, bool SNaN, unsigned Depth) const if (getTarget().Options.NoNaNsFPMath || Op->getFlags().hasNoNaNs()) return true; - if (Depth == 6) + if (Depth >= MaxRecursionDepth) return false; // Limit search depth. // TODO: Handle vectors. @@ -4645,7 +4649,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, return getUNDEF(VT); // -(X-Y) -> (Y-X) is unsafe because when X==Y, -0.0 != +0.0 - if ((getTarget().Options.UnsafeFPMath || Flags.hasNoSignedZeros()) && + if ((getTarget().Options.NoSignedZerosFPMath || Flags.hasNoSignedZeros()) && OpOpcode == ISD::FSUB) return getNode(ISD::FSUB, DL, VT, Operand.getOperand(1), Operand.getOperand(0), Flags); @@ -5156,22 +5160,6 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, if (N2C && N2C->isNullValue()) return N1; break; - case ISD::FP_ROUND_INREG: { - EVT EVT = cast(N2)->getVT(); - assert(VT == N1.getValueType() && "Not an inreg round!"); - assert(VT.isFloatingPoint() && EVT.isFloatingPoint() && - "Cannot FP_ROUND_INREG integer types"); - assert(EVT.isVector() == VT.isVector() && - "FP_ROUND_INREG type should be vector iff the operand " - "type is vector!"); - assert((!EVT.isVector() || - EVT.getVectorNumElements() == VT.getVectorNumElements()) && - "Vector element counts must match in FP_ROUND_INREG"); - assert(EVT.bitsLE(VT) && "Not rounding down!"); - (void)EVT; - if (cast(N2)->getVT() == VT) return N1; // Not actually rounding. - break; - } case ISD::FP_ROUND: assert(VT.isFloatingPoint() && N1.getValueType().isFloatingPoint() && @@ -5382,7 +5370,6 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, std::swap(N1, N2); } else { switch (Opcode) { - case ISD::FP_ROUND_INREG: case ISD::SIGN_EXTEND_INREG: case ISD::SUB: return getUNDEF(VT); // fold op(undef, arg2) -> undef @@ -5770,7 +5757,7 @@ static void chainLoadsAndStoresForMemcpy(SelectionDAG &DAG, const SDLoc &dl, static SDValue getMemcpyLoadsAndStores(SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src, - uint64_t Size, unsigned Align, + uint64_t Size, unsigned Alignment, bool isVol, bool AlwaysInline, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) { @@ -5795,15 +5782,15 @@ static SDValue getMemcpyLoadsAndStores(SelectionDAG &DAG, const SDLoc &dl, if (FI && !MFI.isFixedObjectIndex(FI->getIndex())) DstAlignCanChange = true; unsigned SrcAlign = DAG.InferPtrAlignment(Src); - if (Align > SrcAlign) - SrcAlign = Align; + if (Alignment > SrcAlign) + SrcAlign = Alignment; ConstantDataArraySlice Slice; bool CopyFromConstant = isMemSrcFromConstant(Src, Slice); bool isZeroConstant = CopyFromConstant && Slice.Array == nullptr; unsigned Limit = AlwaysInline ? ~0U : TLI.getMaxStoresPerMemcpy(OptSize); if (!TLI.findOptimalMemOpLowering( - MemOps, Limit, Size, (DstAlignCanChange ? 0 : Align), + MemOps, Limit, Size, (DstAlignCanChange ? 0 : Alignment), (isZeroConstant ? 
0 : SrcAlign), /*IsMemset=*/false, /*ZeroMemset=*/false, /*MemcpyStrSrc=*/CopyFromConstant, /*AllowOverlap=*/!isVol, DstPtrInfo.getAddrSpace(), @@ -5818,15 +5805,15 @@ static SDValue getMemcpyLoadsAndStores(SelectionDAG &DAG, const SDLoc &dl, // realignment. const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); if (!TRI->needsStackRealignment(MF)) - while (NewAlign > Align && - DL.exceedsNaturalStackAlignment(NewAlign)) - NewAlign /= 2; + while (NewAlign > Alignment && + DL.exceedsNaturalStackAlignment(Align(NewAlign))) + NewAlign /= 2; - if (NewAlign > Align) { + if (NewAlign > Alignment) { // Give the stack frame object a larger alignment if needed. if (MFI.getObjectAlignment(FI->getIndex()) < NewAlign) MFI.setObjectAlignment(FI->getIndex(), NewAlign); - Align = NewAlign; + Alignment = NewAlign; } } @@ -5869,10 +5856,9 @@ static SDValue getMemcpyLoadsAndStores(SelectionDAG &DAG, const SDLoc &dl, } Value = getMemsetStringVal(VT, dl, DAG, TLI, SubSlice); if (Value.getNode()) { - Store = DAG.getStore(Chain, dl, Value, - DAG.getMemBasePlusOffset(Dst, DstOff, dl), - DstPtrInfo.getWithOffset(DstOff), Align, - MMOFlags); + Store = DAG.getStore( + Chain, dl, Value, DAG.getMemBasePlusOffset(Dst, DstOff, dl), + DstPtrInfo.getWithOffset(DstOff), Alignment, MMOFlags); OutChains.push_back(Store); } } @@ -5900,7 +5886,7 @@ static SDValue getMemcpyLoadsAndStores(SelectionDAG &DAG, const SDLoc &dl, Store = DAG.getTruncStore( Chain, dl, Value, DAG.getMemBasePlusOffset(Dst, DstOff, dl), - DstPtrInfo.getWithOffset(DstOff), VT, Align, MMOFlags); + DstPtrInfo.getWithOffset(DstOff), VT, Alignment, MMOFlags); OutStoreChains.push_back(Store); } SrcOff += VTSize; @@ -6567,7 +6553,7 @@ SDValue SelectionDAG::getMergeValues(ArrayRef Ops, const SDLoc &dl) { SDValue SelectionDAG::getMemIntrinsicNode( unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef Ops, EVT MemVT, MachinePointerInfo PtrInfo, unsigned Align, - MachineMemOperand::Flags Flags, unsigned Size, const AAMDNodes &AAInfo) { + MachineMemOperand::Flags Flags, uint64_t Size, const AAMDNodes &AAInfo) { if (Align == 0) // Ensure that codegen never sees alignment 0 Align = getEVTAlignment(MemVT); @@ -6619,7 +6605,9 @@ SDValue SelectionDAG::getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, createOperands(N, Ops); } InsertNode(N); - return SDValue(N, 0); + SDValue V(N, 0); + NewSDValueDbgMsg(V, "Creating new node: ", this); + return V; } SDValue SelectionDAG::getLifetimeNode(bool IsStart, const SDLoc &dl, @@ -7022,14 +7010,15 @@ SDValue SelectionDAG::getMaskedStore(SDValue Chain, const SDLoc &dl, SDValue SelectionDAG::getMaskedGather(SDVTList VTs, EVT VT, const SDLoc &dl, ArrayRef Ops, - MachineMemOperand *MMO) { + MachineMemOperand *MMO, + ISD::MemIndexType IndexType) { assert(Ops.size() == 6 && "Incompatible number of operands"); FoldingSetNodeID ID; AddNodeIDNode(ID, ISD::MGATHER, VTs, Ops); ID.AddInteger(VT.getRawBits()); ID.AddInteger(getSyntheticNodeSubclassData( - dl.getIROrder(), VTs, VT, MMO)); + dl.getIROrder(), VTs, VT, MMO, IndexType)); ID.AddInteger(MMO->getPointerInfo().getAddrSpace()); void *IP = nullptr; if (SDNode *E = FindNodeOrInsertPos(ID, dl, IP)) { @@ -7038,7 +7027,7 @@ SDValue SelectionDAG::getMaskedGather(SDVTList VTs, EVT VT, const SDLoc &dl, } auto *N = newSDNode(dl.getIROrder(), dl.getDebugLoc(), - VTs, VT, MMO); + VTs, VT, MMO, IndexType); createOperands(N, Ops); assert(N->getPassThru().getValueType() == N->getValueType(0) && @@ -7062,14 +7051,15 @@ SDValue SelectionDAG::getMaskedGather(SDVTList VTs, EVT VT, 
const SDLoc &dl, SDValue SelectionDAG::getMaskedScatter(SDVTList VTs, EVT VT, const SDLoc &dl, ArrayRef Ops, - MachineMemOperand *MMO) { + MachineMemOperand *MMO, + ISD::MemIndexType IndexType) { assert(Ops.size() == 6 && "Incompatible number of operands"); FoldingSetNodeID ID; AddNodeIDNode(ID, ISD::MSCATTER, VTs, Ops); ID.AddInteger(VT.getRawBits()); ID.AddInteger(getSyntheticNodeSubclassData( - dl.getIROrder(), VTs, VT, MMO)); + dl.getIROrder(), VTs, VT, MMO, IndexType)); ID.AddInteger(MMO->getPointerInfo().getAddrSpace()); void *IP = nullptr; if (SDNode *E = FindNodeOrInsertPos(ID, dl, IP)) { @@ -7077,7 +7067,7 @@ SDValue SelectionDAG::getMaskedScatter(SDVTList VTs, EVT VT, const SDLoc &dl, return SDValue(E, 0); } auto *N = newSDNode(dl.getIROrder(), dl.getDebugLoc(), - VTs, VT, MMO); + VTs, VT, MMO, IndexType); createOperands(N, Ops); assert(N->getMask().getValueType().getVectorNumElements() == @@ -7766,16 +7756,22 @@ SDNode* SelectionDAG::mutateStrictFPToFP(SDNode *Node) { case ISD::STRICT_FLOG: NewOpc = ISD::FLOG; break; case ISD::STRICT_FLOG10: NewOpc = ISD::FLOG10; break; case ISD::STRICT_FLOG2: NewOpc = ISD::FLOG2; break; + case ISD::STRICT_LRINT: NewOpc = ISD::LRINT; break; + case ISD::STRICT_LLRINT: NewOpc = ISD::LLRINT; break; case ISD::STRICT_FRINT: NewOpc = ISD::FRINT; break; case ISD::STRICT_FNEARBYINT: NewOpc = ISD::FNEARBYINT; break; case ISD::STRICT_FMAXNUM: NewOpc = ISD::FMAXNUM; break; case ISD::STRICT_FMINNUM: NewOpc = ISD::FMINNUM; break; case ISD::STRICT_FCEIL: NewOpc = ISD::FCEIL; break; case ISD::STRICT_FFLOOR: NewOpc = ISD::FFLOOR; break; + case ISD::STRICT_LROUND: NewOpc = ISD::LROUND; break; + case ISD::STRICT_LLROUND: NewOpc = ISD::LLROUND; break; case ISD::STRICT_FROUND: NewOpc = ISD::FROUND; break; case ISD::STRICT_FTRUNC: NewOpc = ISD::FTRUNC; break; case ISD::STRICT_FP_ROUND: NewOpc = ISD::FP_ROUND; break; case ISD::STRICT_FP_EXTEND: NewOpc = ISD::FP_EXTEND; break; + case ISD::STRICT_FP_TO_SINT: NewOpc = ISD::FP_TO_SINT; break; + case ISD::STRICT_FP_TO_UINT: NewOpc = ISD::FP_TO_UINT; break; } assert(Node->getNumValues() == 2 && "Unexpected number of results!"); @@ -7925,6 +7921,7 @@ MachineSDNode *SelectionDAG::getMachineNode(unsigned Opcode, const SDLoc &DL, CSEMap.InsertNode(N, IP); InsertNode(N); + NewSDValueDbgMsg(SDValue(N, 0), "Creating new machine node: ", this); return N; } @@ -8619,7 +8616,7 @@ SDValue SelectionDAG::makeEquivalentMemoryOrdering(LoadSDNode *OldLoad, // TokenFactor. SDValue OldChain = SDValue(OldLoad, 1); SDValue NewChain = SDValue(NewMemOp.getNode(), 1); - if (!OldLoad->hasAnyUseOfValue(1)) + if (OldChain == NewChain || !OldLoad->hasAnyUseOfValue(1)) return NewChain; SDValue TokenFactor = @@ -8812,7 +8809,7 @@ HandleSDNode::~HandleSDNode() { GlobalAddressSDNode::GlobalAddressSDNode(unsigned Opc, unsigned Order, const DebugLoc &DL, const GlobalValue *GA, EVT VT, - int64_t o, unsigned char TF) + int64_t o, unsigned TF) : SDNode(Opc, Order, DL, getSDVTList(VT)), Offset(o), TargetFlags(TF) { TheGlobal = GA; } @@ -8986,7 +8983,7 @@ bool SDValue::reachesChainWithoutSideEffects(SDValue Dest, // Loads don't have side effects, look through them. 
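[Editor's aside, not part of the patch] The getMaskedGather/getMaskedScatter hunks above thread an ISD::MemIndexType through node creation so the node records how its index operand is interpreted (signed or unsigned, scaled or unscaled). As a reference point, a scalar model of what a signed, scaled gather computes per lane, written as plain loops rather than DAG nodes:

  #include <cstdint>
  #include <cstdio>

  // Scalar model of a masked gather with a SIGNED_SCALED index: each active
  // lane loads from Base + Index[i] * Scale; inactive lanes keep PassThru.
  static void maskedGather(int32_t *Dst, const int32_t *PassThru,
                           const uint8_t *Base, const int64_t *Index,
                           int64_t Scale, const bool *Mask, unsigned N) {
    for (unsigned i = 0; i != N; ++i) {
      if (Mask[i])
        Dst[i] = *reinterpret_cast<const int32_t *>(Base + Index[i] * Scale);
      else
        Dst[i] = PassThru[i];
    }
  }

  int main() {
    int32_t Data[8] = {10, 11, 12, 13, 14, 15, 16, 17};
    int64_t Index[4] = {7, 0, 3, 5};
    bool Mask[4] = {true, true, false, true};
    int32_t Pass[4] = {-1, -1, -1, -1}, Dst[4];
    maskedGather(Dst, Pass, reinterpret_cast<const uint8_t *>(Data), Index,
                 sizeof(int32_t), Mask, 4);
    for (int V : Dst) std::printf("%d ", V);       // 17 10 -1 15
    std::printf("\n");
    return 0;
  }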
if (LoadSDNode *Ld = dyn_cast(*this)) { - if (!Ld->isVolatile()) + if (Ld->isUnordered()) return Ld->getChain().reachesChainWithoutSideEffects(Dest, Depth-1); } return false; @@ -9005,21 +9002,51 @@ void SDNode::intersectFlagsWith(const SDNodeFlags Flags) { SDValue SelectionDAG::matchBinOpReduction(SDNode *Extract, ISD::NodeType &BinOp, - ArrayRef CandidateBinOps) { + ArrayRef CandidateBinOps, + bool AllowPartials) { // The pattern must end in an extract from index 0. if (Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT || !isNullConstant(Extract->getOperand(1))) return SDValue(); - SDValue Op = Extract->getOperand(0); - unsigned Stages = Log2_32(Op.getValueType().getVectorNumElements()); - // Match against one of the candidate binary ops. + SDValue Op = Extract->getOperand(0); if (llvm::none_of(CandidateBinOps, [Op](ISD::NodeType BinOp) { return Op.getOpcode() == unsigned(BinOp); })) return SDValue(); + // Floating-point reductions may require relaxed constraints on the final step + // of the reduction because they may reorder intermediate operations. + unsigned CandidateBinOp = Op.getOpcode(); + if (Op.getValueType().isFloatingPoint()) { + SDNodeFlags Flags = Op->getFlags(); + switch (CandidateBinOp) { + case ISD::FADD: + if (!Flags.hasNoSignedZeros() || !Flags.hasAllowReassociation()) + return SDValue(); + break; + default: + llvm_unreachable("Unhandled FP opcode for binop reduction"); + } + } + + // Matching failed - attempt to see if we did enough stages that a partial + // reduction from a subvector is possible. + auto PartialReduction = [&](SDValue Op, unsigned NumSubElts) { + if (!AllowPartials || !Op) + return SDValue(); + EVT OpVT = Op.getValueType(); + EVT OpSVT = OpVT.getScalarType(); + EVT SubVT = EVT::getVectorVT(*getContext(), OpSVT, NumSubElts); + if (!TLI->isExtractSubvectorCheap(SubVT, OpVT, 0)) + return SDValue(); + BinOp = (ISD::NodeType)CandidateBinOp; + return getNode( + ISD::EXTRACT_SUBVECTOR, SDLoc(Op), SubVT, Op, + getConstant(0, SDLoc(Op), TLI->getVectorIdxTy(getDataLayout()))); + }; + // At each stage, we're looking for something that looks like: // %s = shufflevector <8 x i32> %op, <8 x i32> undef, // <8 x i32> // <2,3,u,u,u,u,u,u> // <1,u,u,u,u,u,u,u> - unsigned CandidateBinOp = Op.getOpcode(); + // While a partial reduction match would be: + // <2,3,u,u,u,u,u,u> + // <1,u,u,u,u,u,u,u> + unsigned Stages = Log2_32(Op.getValueType().getVectorNumElements()); + SDValue PrevOp; for (unsigned i = 0; i < Stages; ++i) { + unsigned MaskEnd = (1 << i); + if (Op.getOpcode() != CandidateBinOp) - return SDValue(); + return PartialReduction(PrevOp, MaskEnd); SDValue Op0 = Op.getOperand(0); SDValue Op1 = Op.getOperand(1); @@ -9049,12 +9082,14 @@ SelectionDAG::matchBinOpReduction(SDNode *Extract, ISD::NodeType &BinOp, // The first operand of the shuffle should be the same as the other operand // of the binop. if (!Shuffle || Shuffle->getOperand(0) != Op) - return SDValue(); + return PartialReduction(PrevOp, MaskEnd); // Verify the shuffle has the expected (at this stage of the pyramid) mask. 
- for (int Index = 0, MaskEnd = 1 << i; Index < MaskEnd; ++Index) - if (Shuffle->getMaskElt(Index) != MaskEnd + Index) - return SDValue(); + for (int Index = 0; Index < (int)MaskEnd; ++Index) + if (Shuffle->getMaskElt(Index) != (int)(MaskEnd + Index)) + return PartialReduction(PrevOp, MaskEnd); + + PrevOp = Op; } BinOp = (ISD::NodeType)CandidateBinOp; @@ -9114,8 +9149,7 @@ SDValue SelectionDAG::UnrollVectorOp(SDNode *N, unsigned ResNE) { getShiftAmountOperand(Operands[0].getValueType(), Operands[1]))); break; - case ISD::SIGN_EXTEND_INREG: - case ISD::FP_ROUND_INREG: { + case ISD::SIGN_EXTEND_INREG: { EVT ExtVT = cast(Operands[1])->getVT().getVectorElementType(); Scalars.push_back(getNode(N->getOpcode(), dl, EltVT, Operands[0], @@ -9187,6 +9221,9 @@ bool SelectionDAG::areNonVolatileConsecutiveLoads(LoadSDNode *LD, int Dist) const { if (LD->isVolatile() || Base->isVolatile()) return false; + // TODO: probably too restrictive for atomics, revisit + if (!LD->isSimple()) + return false; if (LD->isIndexed() || Base->isIndexed()) return false; if (LD->getChain() != Base->getChain()) diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp index 9592bc30a4e1..3a53ab9717a4 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp @@ -14,6 +14,7 @@ #include "llvm/CodeGen/SelectionDAGNodes.h" #include "llvm/CodeGen/TargetLowering.h" #include "llvm/Support/Casting.h" +#include "llvm/Support/Debug.h" #include using namespace llvm; diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index e818dd27c05e..8c15563fcd23 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -833,7 +833,7 @@ SDValue RegsForValue::getCopyFromRegs(SelectionDAG &DAG, // If the source register was virtual and if we know something about it, // add an assert node. - if (!TargetRegisterInfo::isVirtualRegister(Regs[Part+i]) || + if (!Register::isVirtualRegister(Regs[Part + i]) || !RegisterVT.isInteger()) continue; @@ -948,8 +948,7 @@ void RegsForValue::AddInlineAsmOperands(unsigned Code, bool HasMatching, unsigned Flag = InlineAsm::getFlagWord(Code, Regs.size()); if (HasMatching) Flag = InlineAsm::getFlagWordForMatchingOp(Flag, MatchingIdx); - else if (!Regs.empty() && - TargetRegisterInfo::isVirtualRegister(Regs.front())) { + else if (!Regs.empty() && Register::isVirtualRegister(Regs.front())) { // Put the register class of the virtual registers in the flag word. That // way, later passes can recompute register class constraints for inline // assembly as well as normal instructions. @@ -1810,7 +1809,7 @@ void SelectionDAGBuilder::visitRet(const ReturnInst &I) { // offsets to its parts don't wrap either. 
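[Editor's aside, not part of the patch] The matchBinOpReduction changes above track the previously matched stage (PrevOp) so that when the shuffle pyramid stops matching early, an EXTRACT_SUBVECTOR of the last matched input can still be returned as a partial reduction. The shape being matched is a log2 tree: each stage combines lane j with lane j + 2^i. A standalone sketch of that pyramid for a plain array add-reduction, with the partial case shown as reducing only the low subvector:

  #include <cstdio>
  #include <vector>

  // One pyramid stage: combine lane j with lane j + Half, halving the live
  // lanes (the same shape the matcher recognizes as shuffle+binop pairs).
  static std::vector<int> stage(const std::vector<int> &V, unsigned Half) {
    std::vector<int> R(V.begin(), V.begin() + Half);
    for (unsigned j = 0; j != Half; ++j)
      R[j] += V[j + Half];
    return R;
  }

  int main() {
    std::vector<int> V = {1, 2, 3, 4, 5, 6, 7, 8};

    // Full reduction: stages with offsets 4, 2, 1; lane 0 ends up as the sum.
    std::vector<int> A = stage(stage(stage(V, 4), 2), 1);
    std::printf("full: %d\n", A[0]);                     // 36

    // Partial case: if only the offset-2 and offset-1 stages are present, lane 0
    // is the reduction of just the low 4 input lanes - exactly the subvector the
    // new PartialReduction path extracts and hands back to the caller.
    std::vector<int> Low(V.begin(), V.begin() + 4);
    std::vector<int> B = stage(stage(Low, 2), 1);
    std::printf("partial: %d  (== 1+2+3+4)\n", B[0]);    // 10
    return 0;
  }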
SDValue Ptr = DAG.getObjectPtrOffset(getCurSDLoc(), RetPtr, Offsets[i]); - SDValue Val = RetOp.getValue(i); + SDValue Val = RetOp.getValue(RetOp.getResNo() + i); if (MemVTs[i] != ValueVTs[i]) Val = DAG.getPtrExtOrTrunc(Val, getCurSDLoc(), MemVTs[i]); Chains[i] = DAG.getStore(Chain, getCurSDLoc(), Val, @@ -2263,7 +2262,7 @@ void SelectionDAGBuilder::visitBr(const BranchInst &I) { if (const BinaryOperator *BOp = dyn_cast(CondVal)) { Instruction::BinaryOps Opcode = BOp->getOpcode(); if (!DAG.getTargetLoweringInfo().isJumpExpensive() && BOp->hasOneUse() && - !I.getMetadata(LLVMContext::MD_unpredictable) && + !I.hasMetadata(LLVMContext::MD_unpredictable) && (Opcode == Instruction::And || Opcode == Instruction::Or)) { FindMergedConditions(BOp, Succ0MBB, Succ1MBB, BrMBB, BrMBB, Opcode, @@ -2600,9 +2599,11 @@ void SelectionDAGBuilder::visitSPDescriptorParent(StackProtectorDescriptor &SPD, void SelectionDAGBuilder::visitSPDescriptorFailure(StackProtectorDescriptor &SPD) { const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + TargetLowering::MakeLibCallOptions CallOptions; + CallOptions.setDiscardResult(true); SDValue Chain = TLI.makeLibCall(DAG, RTLIB::STACKPROTECTOR_CHECK_FAIL, MVT::isVoid, - None, false, getCurSDLoc(), false, false).second; + None, CallOptions, getCurSDLoc()).second; // On PS4, the "return address" must still be within the calling function, // even if it's at the very end, so emit an explicit TRAP here. // Passing 'true' for doesNotReturn above won't generate the trap for us. @@ -2618,24 +2619,18 @@ void SelectionDAGBuilder::visitBitTestHeader(BitTestBlock &B, MachineBasicBlock *SwitchBB) { SDLoc dl = getCurSDLoc(); - // Subtract the minimum value + // Subtract the minimum value. SDValue SwitchOp = getValue(B.SValue); EVT VT = SwitchOp.getValueType(); - SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, SwitchOp, - DAG.getConstant(B.First, dl, VT)); - - // Check range - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - SDValue RangeCmp = DAG.getSetCC( - dl, TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), - Sub.getValueType()), - Sub, DAG.getConstant(B.Range, dl, VT), ISD::SETUGT); + SDValue RangeSub = + DAG.getNode(ISD::SUB, dl, VT, SwitchOp, DAG.getConstant(B.First, dl, VT)); // Determine the type of the test operands. + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); bool UsePtrType = false; - if (!TLI.isTypeLegal(VT)) + if (!TLI.isTypeLegal(VT)) { UsePtrType = true; - else { + } else { for (unsigned i = 0, e = B.Cases.size(); i != e; ++i) if (!isUIntN(VT.getSizeInBits(), B.Cases[i].Mask)) { // Switch table case range are encoded into series of masks. @@ -2644,6 +2639,7 @@ void SelectionDAGBuilder::visitBitTestHeader(BitTestBlock &B, break; } } + SDValue Sub = RangeSub; if (UsePtrType) { VT = TLI.getPointerTy(DAG.getDataLayout()); Sub = DAG.getZExtOrTrunc(Sub, dl, VT); @@ -2655,20 +2651,29 @@ void SelectionDAGBuilder::visitBitTestHeader(BitTestBlock &B, MachineBasicBlock* MBB = B.Cases[0].ThisBB; - addSuccessorWithProb(SwitchBB, B.Default, B.DefaultProb); + if (!B.OmitRangeCheck) + addSuccessorWithProb(SwitchBB, B.Default, B.DefaultProb); addSuccessorWithProb(SwitchBB, MBB, B.Prob); SwitchBB->normalizeSuccProbs(); - SDValue BrRange = DAG.getNode(ISD::BRCOND, dl, - MVT::Other, CopyTo, RangeCmp, - DAG.getBasicBlock(B.Default)); + SDValue Root = CopyTo; + if (!B.OmitRangeCheck) { + // Conditional branch to the default block. 
+ SDValue RangeCmp = DAG.getSetCC(dl, + TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), + RangeSub.getValueType()), + RangeSub, DAG.getConstant(B.Range, dl, RangeSub.getValueType()), + ISD::SETUGT); + + Root = DAG.getNode(ISD::BRCOND, dl, MVT::Other, Root, RangeCmp, + DAG.getBasicBlock(B.Default)); + } // Avoid emitting unnecessary branches to the next block. if (MBB != NextBlock(SwitchBB)) - BrRange = DAG.getNode(ISD::BR, dl, MVT::Other, BrRange, - DAG.getBasicBlock(MBB)); + Root = DAG.getNode(ISD::BR, dl, MVT::Other, Root, DAG.getBasicBlock(MBB)); - DAG.setRoot(BrRange); + DAG.setRoot(Root); } /// visitBitTestCase - this function produces one "bit test" @@ -3266,8 +3271,7 @@ void SelectionDAGBuilder::visitSelect(const User &I) { // We care about the legality of the operation after it has been type // legalized. - while (TLI.getTypeAction(Ctx, VT) != TargetLoweringBase::TypeLegal && - VT != TLI.getTypeToTransformTo(Ctx, VT)) + while (TLI.getTypeAction(Ctx, VT) != TargetLoweringBase::TypeLegal) VT = TLI.getTypeToTransformTo(Ctx, VT); // If the vselect is legal, assume we want to leave this as a vector setcc + @@ -3534,17 +3538,32 @@ void SelectionDAGBuilder::visitExtractElement(const User &I) { void SelectionDAGBuilder::visitShuffleVector(const User &I) { SDValue Src1 = getValue(I.getOperand(0)); SDValue Src2 = getValue(I.getOperand(1)); + Constant *MaskV = cast(I.getOperand(2)); SDLoc DL = getCurSDLoc(); - - SmallVector Mask; - ShuffleVectorInst::getShuffleMask(cast(I.getOperand(2)), Mask); - unsigned MaskNumElts = Mask.size(); - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); EVT VT = TLI.getValueType(DAG.getDataLayout(), I.getType()); EVT SrcVT = Src1.getValueType(); unsigned SrcNumElts = SrcVT.getVectorNumElements(); + if (MaskV->isNullValue() && VT.isScalableVector()) { + // Canonical splat form of first element of first input vector. + SDValue FirstElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, + SrcVT.getScalarType(), Src1, + DAG.getConstant(0, DL, + TLI.getVectorIdxTy(DAG.getDataLayout()))); + setValue(&I, DAG.getNode(ISD::SPLAT_VECTOR, DL, VT, FirstElt)); + return; + } + + // For now, we only handle splats for scalable vectors. + // The DAGCombiner will perform a BUILD_VECTOR -> SPLAT_VECTOR transformation + // for targets that support a SPLAT_VECTOR for non-scalable vector types. + assert(!VT.isScalableVector() && "Unsupported scalable vector shuffle"); + + SmallVector Mask; + ShuffleVectorInst::getShuffleMask(MaskV, Mask); + unsigned MaskNumElts = Mask.size(); + if (SrcNumElts == MaskNumElts) { setValue(&I, DAG.getVectorShuffle(VT, DL, Src1, Src2, Mask)); return; @@ -3825,7 +3844,7 @@ void SelectionDAGBuilder::visitGetElementPtr(const User &I) { // Normalize Vector GEP - all scalar operands should be converted to the // splat vector. unsigned VectorWidth = I.getType()->isVectorTy() ? - cast(I.getType())->getVectorNumElements() : 0; + I.getType()->getVectorNumElements() : 0; if (VectorWidth && !N.getValueType().isVector()) { LLVMContext &Context = *DAG.getContext(); @@ -3858,12 +3877,11 @@ void SelectionDAGBuilder::visitGetElementPtr(const User &I) { // If this is a scalar constant or a splat vector of constants, // handle it quickly. 
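[Editor's aside, not part of the patch] The visitBitTestHeader hunk above builds the range subtraction once and emits the unsigned range check only when the cluster's OmitRangeCheck flag is clear (a later hunk in this patch sets the flag when the fallthrough block is unreachable). A scalar sketch of what a bit-test-lowered switch computes, with the optional range check made explicit; the case values and masks are made up for the example:

  #include <cstdint>
  #include <cstdio>

  // Scalar model of a bit-test lowered switch over cases {10, 12, 13, 17}:
  // subtract the smallest case value, optionally range-check against the span,
  // then test one precomputed mask per destination block.
  static const char *dispatch(uint64_t X, bool OmitRangeCheck) {
    const uint64_t First = 10, Range = 7;               // cases span [10, 17]
    const uint64_t MaskA = 0b0000'0001;                 // case 10        -> "A"
    const uint64_t MaskB = 0b1000'1100;                 // cases 12,13,17 -> "B"
    uint64_t Sub = X - First;
    if (!OmitRangeCheck && Sub > Range)                 // conditional branch to default
      return "default";
    uint64_t Bit = uint64_t(1) << Sub;
    if (Bit & MaskA) return "A";
    if (Bit & MaskB) return "B";
    return "default";
  }

  int main() {
    std::printf("%s %s %s\n", dispatch(13, false), dispatch(10, false),
                dispatch(42, false));                   // B A default
    return 0;
  }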
- const auto *CI = dyn_cast(Idx); - if (!CI && isa(Idx) && - cast(Idx)->getSplatValue()) - CI = cast(cast(Idx)->getSplatValue()); + const auto *C = dyn_cast(Idx); + if (C && isa(C->getType())) + C = C->getSplatValue(); - if (CI) { + if (const auto *CI = dyn_cast_or_null(C)) { if (CI->isZero()) continue; APInt Offs = ElementSize * CI->getValue().sextOrTrunc(IdxSize); @@ -3872,7 +3890,7 @@ void SelectionDAGBuilder::visitGetElementPtr(const User &I) { DAG.getConstant(Offs, dl, EVT::getVectorVT(Context, IdxTy, VectorWidth)) : DAG.getConstant(Offs, dl, IdxTy); - // In an inbouds GEP with an offset that is nonnegative even when + // In an inbounds GEP with an offset that is nonnegative even when // interpreted as signed, assume there is no unsigned overflow. SDNodeFlags Flags; if (Offs.isNonNegative() && cast(I).isInBounds()) @@ -4002,8 +4020,8 @@ void SelectionDAGBuilder::visitLoad(const LoadInst &I) { Type *Ty = I.getType(); bool isVolatile = I.isVolatile(); - bool isNonTemporal = I.getMetadata(LLVMContext::MD_nontemporal) != nullptr; - bool isInvariant = I.getMetadata(LLVMContext::MD_invariant_load) != nullptr; + bool isNonTemporal = I.hasMetadata(LLVMContext::MD_nontemporal); + bool isInvariant = I.hasMetadata(LLVMContext::MD_invariant_load); bool isDereferenceable = isDereferenceablePointer(SV, I.getType(), DAG.getDataLayout()); unsigned Alignment = I.getAlignment(); @@ -4118,7 +4136,7 @@ void SelectionDAGBuilder::visitStoreToSwiftError(const StoreInst &I) { SDValue Src = getValue(SrcV); // Create a virtual register, then update the virtual register. - unsigned VReg = + Register VReg = SwiftError.getOrCreateVRegDefAt(&I, FuncInfo.MBB, I.getPointerOperand()); // Chain, DL, Reg, N or Chain, DL, Reg, N, Glue // Chain can be getRoot or getControlRoot. @@ -4132,8 +4150,8 @@ void SelectionDAGBuilder::visitLoadFromSwiftError(const LoadInst &I) { "call visitLoadFromSwiftError when backend supports swifterror"); assert(!I.isVolatile() && - I.getMetadata(LLVMContext::MD_nontemporal) == nullptr && - I.getMetadata(LLVMContext::MD_invariant_load) == nullptr && + !I.hasMetadata(LLVMContext::MD_nontemporal) && + !I.hasMetadata(LLVMContext::MD_invariant_load) && "Support volatile, non temporal, invariant for load_from_swift_error"); const Value *SV = I.getOperand(0); @@ -4209,7 +4227,7 @@ void SelectionDAGBuilder::visitStore(const StoreInst &I) { auto MMOFlags = MachineMemOperand::MONone; if (I.isVolatile()) MMOFlags |= MachineMemOperand::MOVolatile; - if (I.getMetadata(LLVMContext::MD_nontemporal) != nullptr) + if (I.hasMetadata(LLVMContext::MD_nontemporal)) MMOFlags |= MachineMemOperand::MONonTemporal; MMOFlags |= TLI.getMMOFlags(I); @@ -4309,8 +4327,9 @@ void SelectionDAGBuilder::visitMaskedStore(const CallInst &I, // are looking for. If first operand of the GEP is a splat vector - we // extract the splat value and use it as a uniform base. // In all other cases the function returns 'false'. -static bool getUniformBase(const Value* &Ptr, SDValue& Base, SDValue& Index, - SDValue &Scale, SelectionDAGBuilder* SDB) { +static bool getUniformBase(const Value *&Ptr, SDValue &Base, SDValue &Index, + ISD::MemIndexType &IndexType, SDValue &Scale, + SelectionDAGBuilder *SDB) { SelectionDAG& DAG = SDB->DAG; LLVMContext &Context = *DAG.getContext(); @@ -4330,8 +4349,13 @@ static bool getUniformBase(const Value* &Ptr, SDValue& Base, SDValue& Index, // Ensure all the other indices are 0. 
for (unsigned i = 1; i < FinalIndex; ++i) { - auto *C = dyn_cast(GEP->getOperand(i)); - if (!C || !C->isZero()) + auto *C = dyn_cast(GEP->getOperand(i)); + if (!C) + return false; + if (isa(C->getType())) + C = C->getSplatValue(); + auto *CI = dyn_cast_or_null(C); + if (!CI || !CI->isZero()) return false; } @@ -4346,6 +4370,7 @@ static bool getUniformBase(const Value* &Ptr, SDValue& Base, SDValue& Index, SDB->getCurSDLoc(), TLI.getPointerTy(DL)); Base = SDB->getValue(Ptr); Index = SDB->getValue(IndexVal); + IndexType = ISD::SIGNED_SCALED; if (!Index.getValueType().isVector()) { unsigned GEPWidth = GEP->getType()->getVectorNumElements(); @@ -4373,9 +4398,11 @@ void SelectionDAGBuilder::visitMaskedScatter(const CallInst &I) { SDValue Base; SDValue Index; + ISD::MemIndexType IndexType; SDValue Scale; const Value *BasePtr = Ptr; - bool UniformBase = getUniformBase(BasePtr, Base, Index, Scale, this); + bool UniformBase = getUniformBase(BasePtr, Base, Index, IndexType, Scale, + this); const Value *MemOpBasePtr = UniformBase ? BasePtr : nullptr; MachineMemOperand *MMO = DAG.getMachineFunction(). @@ -4385,11 +4412,12 @@ void SelectionDAGBuilder::visitMaskedScatter(const CallInst &I) { if (!UniformBase) { Base = DAG.getConstant(0, sdl, TLI.getPointerTy(DAG.getDataLayout())); Index = getValue(Ptr); + IndexType = ISD::SIGNED_SCALED; Scale = DAG.getTargetConstant(1, sdl, TLI.getPointerTy(DAG.getDataLayout())); } SDValue Ops[] = { getRoot(), Src0, Mask, Base, Index, Scale }; SDValue Scatter = DAG.getMaskedScatter(DAG.getVTList(MVT::Other), VT, sdl, - Ops, MMO); + Ops, MMO, IndexType); DAG.setRoot(Scatter); setValue(&I, Scatter); } @@ -4476,9 +4504,11 @@ void SelectionDAGBuilder::visitMaskedGather(const CallInst &I) { SDValue Root = DAG.getRoot(); SDValue Base; SDValue Index; + ISD::MemIndexType IndexType; SDValue Scale; const Value *BasePtr = Ptr; - bool UniformBase = getUniformBase(BasePtr, Base, Index, Scale, this); + bool UniformBase = getUniformBase(BasePtr, Base, Index, IndexType, Scale, + this); bool ConstantMemory = false; if (UniformBase && AA && AA->pointsToConstantMemory( @@ -4500,11 +4530,12 @@ void SelectionDAGBuilder::visitMaskedGather(const CallInst &I) { if (!UniformBase) { Base = DAG.getConstant(0, sdl, TLI.getPointerTy(DAG.getDataLayout())); Index = getValue(Ptr); + IndexType = ISD::SIGNED_SCALED; Scale = DAG.getTargetConstant(1, sdl, TLI.getPointerTy(DAG.getDataLayout())); } SDValue Ops[] = { Root, Src0, Mask, Base, Index, Scale }; SDValue Gather = DAG.getMaskedGather(DAG.getVTList(VT, MVT::Other), VT, sdl, - Ops, MMO); + Ops, MMO, IndexType); SDValue OutChain = Gather.getValue(1); if (!ConstantMemory) @@ -4628,7 +4659,7 @@ void SelectionDAGBuilder::visitAtomicLoad(const LoadInst &I) { auto Flags = MachineMemOperand::MOLoad; if (I.isVolatile()) Flags |= MachineMemOperand::MOVolatile; - if (I.getMetadata(LLVMContext::MD_invariant_load) != nullptr) + if (I.hasMetadata(LLVMContext::MD_invariant_load)) Flags |= MachineMemOperand::MOInvariant; if (isDereferenceablePointer(I.getPointerOperand(), I.getType(), DAG.getDataLayout())) @@ -4645,9 +4676,27 @@ void SelectionDAGBuilder::visitAtomicLoad(const LoadInst &I) { AAMDNodes(), nullptr, SSID, Order); InChain = TLI.prepareVolatileOrAtomicLoad(InChain, dl, DAG); - SDValue L = - DAG.getAtomic(ISD::ATOMIC_LOAD, dl, MemVT, MemVT, InChain, - getValue(I.getPointerOperand()), MMO); + + SDValue Ptr = getValue(I.getPointerOperand()); + + if (TLI.lowerAtomicLoadAsLoadSDNode(I)) { + // TODO: Once this is better exercised by tests, it should be merged 
with + // the normal path for loads to prevent future divergence. + SDValue L = DAG.getLoad(MemVT, dl, InChain, Ptr, MMO); + if (MemVT != VT) + L = DAG.getPtrExtOrTrunc(L, dl, VT); + + setValue(&I, L); + SDValue OutChain = L.getValue(1); + if (!I.isUnordered()) + DAG.setRoot(OutChain); + else + PendingLoads.push_back(OutChain); + return; + } + + SDValue L = DAG.getAtomic(ISD::ATOMIC_LOAD, dl, MemVT, MemVT, InChain, + Ptr, MMO); SDValue OutChain = L.getValue(1); if (MemVT != VT) @@ -4686,9 +4735,17 @@ void SelectionDAGBuilder::visitAtomicStore(const StoreInst &I) { SDValue Val = getValue(I.getValueOperand()); if (Val.getValueType() != MemVT) Val = DAG.getPtrExtOrTrunc(Val, dl, MemVT); + SDValue Ptr = getValue(I.getPointerOperand()); + if (TLI.lowerAtomicStoreAsStoreSDNode(I)) { + // TODO: Once this is better exercised by tests, it should be merged with + // the normal path for stores to prevent future divergence. + SDValue S = DAG.getStore(InChain, dl, Val, Ptr, MMO); + DAG.setRoot(S); + return; + } SDValue OutChain = DAG.getAtomic(ISD::ATOMIC_STORE, dl, MemVT, InChain, - getValue(I.getPointerOperand()), Val, MMO); + Ptr, Val, MMO); DAG.setRoot(OutChain); @@ -4731,8 +4788,22 @@ void SelectionDAGBuilder::visitTargetIntrinsic(const CallInst &I, // Add all operands of the call to the operand list. for (unsigned i = 0, e = I.getNumArgOperands(); i != e; ++i) { - SDValue Op = getValue(I.getArgOperand(i)); - Ops.push_back(Op); + const Value *Arg = I.getArgOperand(i); + if (!I.paramHasAttr(i, Attribute::ImmArg)) { + Ops.push_back(getValue(Arg)); + continue; + } + + // Use TargetConstant instead of a regular constant for immarg. + EVT VT = TLI.getValueType(*DL, Arg->getType(), true); + if (const ConstantInt *CI = dyn_cast(Arg)) { + assert(CI->getBitWidth() <= 64 && + "large intrinsic immediates not handled"); + Ops.push_back(DAG.getTargetConstant(*CI, SDLoc(), VT)); + } else { + Ops.push_back( + DAG.getTargetConstantFP(*cast(Arg), SDLoc(), VT)); + } } SmallVector ValueVTs; @@ -4749,10 +4820,10 @@ void SelectionDAGBuilder::visitTargetIntrinsic(const CallInst &I, // This is target intrinsic that touches memory AAMDNodes AAInfo; I.getAAMetadata(AAInfo); - Result = - DAG.getMemIntrinsicNode(Info.opc, getCurSDLoc(), VTs, Ops, Info.memVT, - MachinePointerInfo(Info.ptrVal, Info.offset), - Info.align, Info.flags, Info.size, AAInfo); + Result = DAG.getMemIntrinsicNode( + Info.opc, getCurSDLoc(), VTs, Ops, Info.memVT, + MachinePointerInfo(Info.ptrVal, Info.offset), + Info.align ? Info.align->value() : 0, Info.flags, Info.size, AAInfo); } else if (!HasChain) { Result = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, getCurSDLoc(), VTs, Ops); } else if (!I.getType()->isVoidTy()) { @@ -4918,12 +4989,11 @@ static SDValue expandExp(const SDLoc &dl, SDValue Op, SelectionDAG &DAG, // Put the exponent in the right bit position for later addition to the // final result: // - // #define LOG2OFe 1.4426950f - // t0 = Op * LOG2OFe + // t0 = Op * log2(e) // TODO: What fast-math-flags should be set here? SDValue t0 = DAG.getNode(ISD::FMUL, dl, MVT::f32, Op, - getF32Constant(DAG, 0x3fb8aa3b, dl)); + DAG.getConstantFP(numbers::log2ef, dl, MVT::f32)); return getLimitedPrecisionExp2(t0, dl, DAG); } @@ -4941,10 +5011,11 @@ static SDValue expandLog(const SDLoc &dl, SDValue Op, SelectionDAG &DAG, LimitFloatPrecision > 0 && LimitFloatPrecision <= 18) { SDValue Op1 = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Op); - // Scale the exponent by log(2) [0.69314718f]. + // Scale the exponent by log(2). 
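[Editor's aside, not part of the patch] The limited-precision exp/log expansion hunks here replace hard-coded hex float literals (0x3fb8aa3b, 0x3f317218) with the named constants log2(e) and ln(2); the identities in play are exp(x) = 2^(x * log2(e)) and log(2^k * m) = k * ln(2) + log(m). A quick numeric check of the first identity in plain C++:

  #include <cmath>
  #include <cstdio>

  int main() {
    // exp(x) == exp2(x * log2(e)); the patch swaps the float literal for the
    // named constant (numbers::log2ef) in the same formula.
    const float Log2e = 1.44269504088896340736f;   // log2(e)
    for (float X : {0.5f, 1.0f, 3.75f}) {
      float ViaExp2 = std::exp2(X * Log2e);
      std::printf("x=%.2f  exp=%.6f  exp2(x*log2e)=%.6f\n", X, std::exp(X), ViaExp2);
    }
    return 0;
  }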
SDValue Exp = GetExponent(DAG, Op1, TLI, dl); - SDValue LogOfExponent = DAG.getNode(ISD::FMUL, dl, MVT::f32, Exp, - getF32Constant(DAG, 0x3f317218, dl)); + SDValue LogOfExponent = + DAG.getNode(ISD::FMUL, dl, MVT::f32, Exp, + DAG.getConstantFP(numbers::ln2f, dl, MVT::f32)); // Get the significand and build it into a floating-point number with // exponent of 1. @@ -5311,19 +5382,32 @@ static SDValue ExpandPowI(const SDLoc &DL, SDValue LHS, SDValue RHS, return DAG.getNode(ISD::FPOWI, DL, LHS.getValueType(), LHS, RHS); } -// getUnderlyingArgReg - Find underlying register used for a truncated or -// bitcasted argument. -static unsigned getUnderlyingArgReg(const SDValue &N) { +// getUnderlyingArgRegs - Find underlying registers used for a truncated, +// bitcasted, or split argument. Returns a list of +static void +getUnderlyingArgRegs(SmallVectorImpl> &Regs, + const SDValue &N) { switch (N.getOpcode()) { - case ISD::CopyFromReg: - return cast(N.getOperand(1))->getReg(); + case ISD::CopyFromReg: { + SDValue Op = N.getOperand(1); + Regs.emplace_back(cast(Op)->getReg(), + Op.getValueType().getSizeInBits()); + return; + } case ISD::BITCAST: case ISD::AssertZext: case ISD::AssertSext: case ISD::TRUNCATE: - return getUnderlyingArgReg(N.getOperand(0)); + getUnderlyingArgRegs(Regs, N.getOperand(0)); + return; + case ISD::BUILD_PAIR: + case ISD::BUILD_VECTOR: + case ISD::CONCAT_VECTORS: + for (SDValue Op : N->op_values()) + getUnderlyingArgRegs(Regs, Op); + return; default: - return 0; + return; } } @@ -5412,11 +5496,16 @@ bool SelectionDAGBuilder::EmitFuncArgumentDbgValue( if (FI != std::numeric_limits::max()) Op = MachineOperand::CreateFI(FI); + SmallVector, 8> ArgRegsAndSizes; if (!Op && N.getNode()) { - unsigned Reg = getUnderlyingArgReg(N); - if (Reg && TargetRegisterInfo::isVirtualRegister(Reg)) { + getUnderlyingArgRegs(ArgRegsAndSizes, N); + Register Reg; + if (ArgRegsAndSizes.size() == 1) + Reg = ArgRegsAndSizes.front().first; + + if (Reg && Reg.isVirtual()) { MachineRegisterInfo &RegInfo = MF.getRegInfo(); - unsigned PR = RegInfo.getLiveInPhysReg(Reg); + Register PR = RegInfo.getLiveInPhysReg(Reg); if (PR) Reg = PR; } @@ -5436,29 +5525,42 @@ bool SelectionDAGBuilder::EmitFuncArgumentDbgValue( } if (!Op) { + // Create a DBG_VALUE for each decomposed value in ArgRegs to cover Reg + auto splitMultiRegDbgValue + = [&](ArrayRef> SplitRegs) { + unsigned Offset = 0; + for (auto RegAndSize : SplitRegs) { + auto FragmentExpr = DIExpression::createFragmentExpression( + Expr, Offset, RegAndSize.second); + if (!FragmentExpr) + continue; + assert(!IsDbgDeclare && "DbgDeclare operand is not in memory?"); + FuncInfo.ArgDbgValues.push_back( + BuildMI(MF, DL, TII->get(TargetOpcode::DBG_VALUE), false, + RegAndSize.first, Variable, *FragmentExpr)); + Offset += RegAndSize.second; + } + }; + // Check if ValueMap has reg number. 
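[Editor's aside, not part of the patch] When an argument is split across several registers, the new splitMultiRegDbgValue lambda above walks the (register, size-in-bits) pairs and emits one DBG_VALUE per piece, each with a fragment expression at a running bit offset. A standalone sketch of that offset bookkeeping with plain structs; DIExpression and DBG_VALUE appear only in comments, and the register numbers are hypothetical:

  #include <cstdio>
  #include <utility>
  #include <vector>

  struct Fragment { unsigned Reg; unsigned OffsetInBits; unsigned SizeInBits; };

  // Mirror of the offset accumulation in splitMultiRegDbgValue: each register
  // piece describes SizeInBits bits of the variable starting at a running
  // offset (what DIExpression::createFragmentExpression encodes in the patch).
  static std::vector<Fragment>
  describeSplitValue(const std::vector<std::pair<unsigned, unsigned>> &RegsAndSizes) {
    std::vector<Fragment> Out;
    unsigned Offset = 0;
    for (auto &RS : RegsAndSizes) {
      Out.push_back({RS.first, Offset, RS.second});
      Offset += RS.second;
    }
    return Out;
  }

  int main() {
    // An i128 argument passed in two hypothetical 64-bit virtual registers.
    for (const Fragment &F : describeSplitValue({{1, 64}, {2, 64}}))
      std::printf("DBG_VALUE reg%u, fragment(offset=%u, size=%u)\n",
                  F.Reg, F.OffsetInBits, F.SizeInBits);
    return 0;
  }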
- DenseMap::iterator VMI = FuncInfo.ValueMap.find(V); + DenseMap::const_iterator + VMI = FuncInfo.ValueMap.find(V); if (VMI != FuncInfo.ValueMap.end()) { const auto &TLI = DAG.getTargetLoweringInfo(); RegsForValue RFV(V->getContext(), TLI, DAG.getDataLayout(), VMI->second, V->getType(), getABIRegCopyCC(V)); if (RFV.occupiesMultipleRegs()) { - unsigned Offset = 0; - for (auto RegAndSize : RFV.getRegsAndSizes()) { - Op = MachineOperand::CreateReg(RegAndSize.first, false); - auto FragmentExpr = DIExpression::createFragmentExpression( - Expr, Offset, RegAndSize.second); - if (!FragmentExpr) - continue; - FuncInfo.ArgDbgValues.push_back( - BuildMI(MF, DL, TII->get(TargetOpcode::DBG_VALUE), IsDbgDeclare, - Op->getReg(), Variable, *FragmentExpr)); - Offset += RegAndSize.second; - } + splitMultiRegDbgValue(RFV.getRegsAndSizes()); return true; } + Op = MachineOperand::CreateReg(VMI->second, false); IsIndirect = IsDbgDeclare; + } else if (ArgRegsAndSizes.size() > 1) { + // This was split due to the calling convention, and no virtual register + // mapping exists for the value. + splitMultiRegDbgValue(ArgRegsAndSizes); + return true; } } @@ -5468,8 +5570,10 @@ bool SelectionDAGBuilder::EmitFuncArgumentDbgValue( assert(Variable->isValidLocationForIntrinsic(DL) && "Expected inlined-at fields to agree"); IsIndirect = (Op->isReg()) ? IsIndirect : true; + if (IsIndirect) + Expr = DIExpression::append(Expr, {dwarf::DW_OP_deref}); FuncInfo.ArgDbgValues.push_back( - BuildMI(MF, DL, TII->get(TargetOpcode::DBG_VALUE), IsIndirect, + BuildMI(MF, DL, TII->get(TargetOpcode::DBG_VALUE), false, *Op, Variable, Expr)); return true; @@ -5554,11 +5658,11 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, return; case Intrinsic::sponentry: setValue(&I, DAG.getNode(ISD::SPONENTRY, sdl, - TLI.getPointerTy(DAG.getDataLayout()))); + TLI.getFrameIndexTy(DAG.getDataLayout()))); return; case Intrinsic::frameaddress: setValue(&I, DAG.getNode(ISD::FRAMEADDR, sdl, - TLI.getPointerTy(DAG.getDataLayout()), + TLI.getFrameIndexTy(DAG.getDataLayout()), getValue(I.getArgOperand(0)))); return; case Intrinsic::read_register: { @@ -5888,65 +5992,6 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, case Intrinsic::masked_compressstore: visitMaskedStore(I, true /* IsCompressing */); return; - case Intrinsic::x86_mmx_pslli_w: - case Intrinsic::x86_mmx_pslli_d: - case Intrinsic::x86_mmx_pslli_q: - case Intrinsic::x86_mmx_psrli_w: - case Intrinsic::x86_mmx_psrli_d: - case Intrinsic::x86_mmx_psrli_q: - case Intrinsic::x86_mmx_psrai_w: - case Intrinsic::x86_mmx_psrai_d: { - SDValue ShAmt = getValue(I.getArgOperand(1)); - if (isa(ShAmt)) { - visitTargetIntrinsic(I, Intrinsic); - return; - } - unsigned NewIntrinsic = 0; - EVT ShAmtVT = MVT::v2i32; - switch (Intrinsic) { - case Intrinsic::x86_mmx_pslli_w: - NewIntrinsic = Intrinsic::x86_mmx_psll_w; - break; - case Intrinsic::x86_mmx_pslli_d: - NewIntrinsic = Intrinsic::x86_mmx_psll_d; - break; - case Intrinsic::x86_mmx_pslli_q: - NewIntrinsic = Intrinsic::x86_mmx_psll_q; - break; - case Intrinsic::x86_mmx_psrli_w: - NewIntrinsic = Intrinsic::x86_mmx_psrl_w; - break; - case Intrinsic::x86_mmx_psrli_d: - NewIntrinsic = Intrinsic::x86_mmx_psrl_d; - break; - case Intrinsic::x86_mmx_psrli_q: - NewIntrinsic = Intrinsic::x86_mmx_psrl_q; - break; - case Intrinsic::x86_mmx_psrai_w: - NewIntrinsic = Intrinsic::x86_mmx_psra_w; - break; - case Intrinsic::x86_mmx_psrai_d: - NewIntrinsic = Intrinsic::x86_mmx_psra_d; - break; - default: llvm_unreachable("Impossible intrinsic"); // 
Can't reach here. - } - - // The vector shift intrinsics with scalars uses 32b shift amounts but - // the sse2/mmx shift instructions reads 64 bits. Set the upper 32 bits - // to be zero. - // We must do this early because v2i32 is not a legal type. - SDValue ShOps[2]; - ShOps[0] = ShAmt; - ShOps[1] = DAG.getConstant(0, sdl, MVT::i32); - ShAmt = DAG.getBuildVector(ShAmtVT, sdl, ShOps); - EVT DestVT = TLI.getValueType(DAG.getDataLayout(), I.getType()); - ShAmt = DAG.getNode(ISD::BITCAST, sdl, DestVT, ShAmt); - Res = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, sdl, DestVT, - DAG.getConstant(NewIntrinsic, sdl, MVT::i32), - getValue(I.getArgOperand(0)), ShAmt); - setValue(&I, Res); - return; - } case Intrinsic::powi: setValue(&I, ExpandPowI(sdl, getValue(I.getArgOperand(0)), getValue(I.getArgOperand(1)), DAG)); @@ -6063,6 +6108,8 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, case Intrinsic::experimental_constrained_fdiv: case Intrinsic::experimental_constrained_frem: case Intrinsic::experimental_constrained_fma: + case Intrinsic::experimental_constrained_fptosi: + case Intrinsic::experimental_constrained_fptoui: case Intrinsic::experimental_constrained_fptrunc: case Intrinsic::experimental_constrained_fpext: case Intrinsic::experimental_constrained_sqrt: @@ -6075,12 +6122,16 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, case Intrinsic::experimental_constrained_log: case Intrinsic::experimental_constrained_log10: case Intrinsic::experimental_constrained_log2: + case Intrinsic::experimental_constrained_lrint: + case Intrinsic::experimental_constrained_llrint: case Intrinsic::experimental_constrained_rint: case Intrinsic::experimental_constrained_nearbyint: case Intrinsic::experimental_constrained_maxnum: case Intrinsic::experimental_constrained_minnum: case Intrinsic::experimental_constrained_ceil: case Intrinsic::experimental_constrained_floor: + case Intrinsic::experimental_constrained_lround: + case Intrinsic::experimental_constrained_llround: case Intrinsic::experimental_constrained_round: case Intrinsic::experimental_constrained_trunc: visitConstrainedFPIntrinsic(cast(I)); @@ -6272,6 +6323,14 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, Op3)); return; } + case Intrinsic::umul_fix_sat: { + SDValue Op1 = getValue(I.getArgOperand(0)); + SDValue Op2 = getValue(I.getArgOperand(1)); + SDValue Op3 = getValue(I.getArgOperand(2)); + setValue(&I, DAG.getNode(ISD::UMULFIXSAT, sdl, Op1.getValueType(), Op1, Op2, + Op3)); + return; + } case Intrinsic::stacksave: { SDValue Op = getRoot(); Res = DAG.getNode( @@ -6347,29 +6406,11 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, DAG.setRoot(Res); return; } - case Intrinsic::objectsize: { - // If we don't know by now, we're never going to know. - ConstantInt *CI = dyn_cast(I.getArgOperand(1)); - - assert(CI && "Non-constant type in __builtin_object_size?"); - - SDValue Arg = getValue(I.getCalledValue()); - EVT Ty = Arg.getValueType(); - - if (CI->isZero()) - Res = DAG.getConstant(-1ULL, sdl, Ty); - else - Res = DAG.getConstant(0, sdl, Ty); - - setValue(&I, Res); - return; - } + case Intrinsic::objectsize: + llvm_unreachable("llvm.objectsize.* should have been lowered already"); case Intrinsic::is_constant: - // If this wasn't constant-folded away by now, then it's not a - // constant. 
- setValue(&I, DAG.getConstant(0, sdl, MVT::i1)); - return; + llvm_unreachable("llvm.is.constant.* should have been lowered already"); case Intrinsic::annotation: case Intrinsic::ptr_annotation: @@ -6818,6 +6859,17 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, setValue(&I, Val); return; } + case Intrinsic::ptrmask: { + SDValue Ptr = getValue(I.getOperand(0)); + SDValue Const = getValue(I.getOperand(1)); + + EVT DestVT = + EVT(DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout())); + + setValue(&I, DAG.getNode(ISD::AND, getCurSDLoc(), DestVT, Ptr, + DAG.getZExtOrTrunc(Const, getCurSDLoc(), DestVT))); + return; + } } } @@ -6845,6 +6897,12 @@ void SelectionDAGBuilder::visitConstrainedFPIntrinsic( case Intrinsic::experimental_constrained_fma: Opcode = ISD::STRICT_FMA; break; + case Intrinsic::experimental_constrained_fptosi: + Opcode = ISD::STRICT_FP_TO_SINT; + break; + case Intrinsic::experimental_constrained_fptoui: + Opcode = ISD::STRICT_FP_TO_UINT; + break; case Intrinsic::experimental_constrained_fptrunc: Opcode = ISD::STRICT_FP_ROUND; break; @@ -6881,6 +6939,12 @@ void SelectionDAGBuilder::visitConstrainedFPIntrinsic( case Intrinsic::experimental_constrained_log2: Opcode = ISD::STRICT_FLOG2; break; + case Intrinsic::experimental_constrained_lrint: + Opcode = ISD::STRICT_LRINT; + break; + case Intrinsic::experimental_constrained_llrint: + Opcode = ISD::STRICT_LLRINT; + break; case Intrinsic::experimental_constrained_rint: Opcode = ISD::STRICT_FRINT; break; @@ -6899,6 +6963,12 @@ void SelectionDAGBuilder::visitConstrainedFPIntrinsic( case Intrinsic::experimental_constrained_floor: Opcode = ISD::STRICT_FFLOOR; break; + case Intrinsic::experimental_constrained_lround: + Opcode = ISD::STRICT_LROUND; + break; + case Intrinsic::experimental_constrained_llround: + Opcode = ISD::STRICT_LLROUND; + break; case Intrinsic::experimental_constrained_round: Opcode = ISD::STRICT_FROUND; break; @@ -7102,7 +7172,7 @@ void SelectionDAGBuilder::LowerCallTo(ImmutableCallSite CS, SDValue Callee, if (SwiftErrorVal && TLI.supportSwiftError()) { // Get the last element of InVals. SDValue Src = CLI.InVals.back(); - unsigned VReg = SwiftError.getOrCreateVRegDefAt( + Register VReg = SwiftError.getOrCreateVRegDefAt( CS.getInstruction(), FuncInfo.MBB, SwiftErrorVal); SDValue CopyNode = CLI.DAG.getCopyToReg(Result.second, CLI.DL, VReg, Src); DAG.setRoot(CopyNode); @@ -8021,6 +8091,14 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) { // Compute the constraint code and ConstraintType to use. TLI.ComputeConstraintToUse(T, SDValue()); + if (T.ConstraintType == TargetLowering::C_Immediate && + OpInfo.CallOperand && !isa(OpInfo.CallOperand)) + // We've delayed emitting a diagnostic like the "n" constraint because + // inlining could cause an integer showing up. 
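[Editor's aside, not part of the patch] The llvm.ptrmask hunk above lowers the intrinsic to a plain AND of the pointer with the mask, after resizing the mask to pointer width. The same operation in plain C++ looks like the sketch below; rounding a pointer down to a 64-byte block is a typical use, and the buffer/constants are made up for the example:

  #include <cstdint>
  #include <cstdio>

  // Equivalent of the new lowering: bitwise-AND the pointer value with a mask
  // of pointer width (the DAG node zero-extends/truncates the mask first).
  template <typename T> static T *ptrmask(T *P, uintptr_t Mask) {
    return reinterpret_cast<T *>(reinterpret_cast<uintptr_t>(P) & Mask);
  }

  int main() {
    alignas(64) static char Buf[256];
    char *Unaligned = Buf + 100;
    // Clear the low 6 bits: round down to the enclosing 64-byte block.
    char *Block = ptrmask(Unaligned, ~uintptr_t(63));
    std::printf("delta into block: %td\n", Unaligned - Block);  // 100 % 64 == 36
    return 0;
  }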
+ return emitInlineAsmError( + CS, "constraint '" + Twine(T.ConstraintCode) + "' expects an " + "integer constant expression"); + ExtraInfo.update(T); } @@ -8105,7 +8183,8 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) { switch (OpInfo.Type) { case InlineAsm::isOutput: if (OpInfo.ConstraintType == TargetLowering::C_Memory || - (OpInfo.ConstraintType == TargetLowering::C_Other && + ((OpInfo.ConstraintType == TargetLowering::C_Immediate || + OpInfo.ConstraintType == TargetLowering::C_Other) && OpInfo.isIndirect)) { unsigned ConstraintID = TLI.getInlineAsmMemConstraint(OpInfo.ConstraintCode); @@ -8119,13 +8198,14 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) { MVT::i32)); AsmNodeOperands.push_back(OpInfo.CallOperand); break; - } else if ((OpInfo.ConstraintType == TargetLowering::C_Other && + } else if (((OpInfo.ConstraintType == TargetLowering::C_Immediate || + OpInfo.ConstraintType == TargetLowering::C_Other) && !OpInfo.isIndirect) || OpInfo.ConstraintType == TargetLowering::C_Register || OpInfo.ConstraintType == TargetLowering::C_RegisterClass) { // Otherwise, this outputs to a register (directly for C_Register / - // C_RegisterClass, and a target-defined fashion for C_Other). Find a - // register that we can use. + // C_RegisterClass, and a target-defined fashion for + // C_Immediate/C_Other). Find a register that we can use. if (OpInfo.AssignedRegs.Regs.empty()) { emitInlineAsmError( CS, "couldn't allocate output register for constraint '" + @@ -8205,15 +8285,24 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) { } // Treat indirect 'X' constraint as memory. - if (OpInfo.ConstraintType == TargetLowering::C_Other && + if ((OpInfo.ConstraintType == TargetLowering::C_Immediate || + OpInfo.ConstraintType == TargetLowering::C_Other) && OpInfo.isIndirect) OpInfo.ConstraintType = TargetLowering::C_Memory; - if (OpInfo.ConstraintType == TargetLowering::C_Other) { + if (OpInfo.ConstraintType == TargetLowering::C_Immediate || + OpInfo.ConstraintType == TargetLowering::C_Other) { std::vector Ops; TLI.LowerAsmOperandForConstraint(InOperandVal, OpInfo.ConstraintCode, Ops, DAG); if (Ops.empty()) { + if (OpInfo.ConstraintType == TargetLowering::C_Immediate) + if (isa(InOperandVal)) { + emitInlineAsmError(CS, "value out of range for constraint '" + + Twine(OpInfo.ConstraintCode) + "'"); + return; + } + emitInlineAsmError(CS, "invalid operand for inline asm constraint '" + Twine(OpInfo.ConstraintCode) + "'"); return; @@ -8250,7 +8339,8 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) { } assert((OpInfo.ConstraintType == TargetLowering::C_RegisterClass || - OpInfo.ConstraintType == TargetLowering::C_Register) && + OpInfo.ConstraintType == TargetLowering::C_Register || + OpInfo.ConstraintType == TargetLowering::C_Immediate) && "Unknown constraint type!"); // TODO: Support this. @@ -8356,6 +8446,7 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) { Val = OpInfo.AssignedRegs.getCopyFromRegs( DAG, FuncInfo, getCurSDLoc(), Chain, &Flag, CS.getInstruction()); break; + case TargetLowering::C_Immediate: case TargetLowering::C_Other: Val = TLI.LowerAsmOutputForConstraint(Chain, Flag, getCurSDLoc(), OpInfo, DAG); @@ -9018,7 +9109,7 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const { // Certain targets (such as MIPS), may have a different ABI alignment // for a type depending on the context. Give the target a chance to // specify the alignment it wants. 
- unsigned OriginalAlignment = getABIAlignmentForCallingConv(ArgTy, DL); + const Align OriginalAlignment(getABIAlignmentForCallingConv(ArgTy, DL)); if (Args[i].Ty->isPointerTy()) { Flags.setPointer(); @@ -9073,7 +9164,7 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const { FrameAlign = Args[i].Alignment; else FrameAlign = getByValTypeAlignment(ElementTy, DL); - Flags.setByValAlign(FrameAlign); + Flags.setByValAlign(Align(FrameAlign)); } if (Args[i].IsNest) Flags.setNest(); @@ -9129,7 +9220,7 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const { if (NumParts > 1 && j == 0) MyFlags.Flags.setSplit(); else if (j != 0) { - MyFlags.Flags.setOrigAlign(1); + MyFlags.Flags.setOrigAlign(Align::None()); if (j == NumParts - 1) MyFlags.Flags.setSplitEnd(); } @@ -9259,7 +9350,7 @@ SelectionDAGBuilder::CopyValueToVirtualRegister(const Value *V, unsigned Reg) { assert((Op.getOpcode() != ISD::CopyFromReg || cast(Op.getOperand(1))->getReg() != Reg) && "Copy from a reg to the same reg!"); - assert(!TargetRegisterInfo::isPhysicalRegister(Reg) && "Is a physreg"); + assert(!Register::isPhysicalRegister(Reg) && "Is a physreg"); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); // If this is an InlineAsm we have to match the registers required, not the @@ -9516,8 +9607,8 @@ void SelectionDAGISel::LowerArguments(const Function &F) { // Certain targets (such as MIPS), may have a different ABI alignment // for a type depending on the context. Give the target a chance to // specify the alignment it wants. - unsigned OriginalAlignment = - TLI->getABIAlignmentForCallingConv(ArgTy, DL); + const Align OriginalAlignment( + TLI->getABIAlignmentForCallingConv(ArgTy, DL)); if (Arg.getType()->isPointerTy()) { Flags.setPointer(); @@ -9577,7 +9668,7 @@ void SelectionDAGISel::LowerArguments(const Function &F) { FrameAlign = Arg.getParamAlignment(); else FrameAlign = TLI->getByValTypeAlignment(ElementTy, DL); - Flags.setByValAlign(FrameAlign); + Flags.setByValAlign(Align(FrameAlign)); } if (Arg.hasAttribute(Attribute::Nest)) Flags.setNest(); @@ -9586,6 +9677,8 @@ void SelectionDAGISel::LowerArguments(const Function &F) { Flags.setOrigAlign(OriginalAlignment); if (ArgCopyElisionCandidates.count(&Arg)) Flags.setCopyElisionCandidate(); + if (Arg.hasAttribute(Attribute::Returned)) + Flags.setReturned(); MVT RegisterVT = TLI->getRegisterTypeForCallingConv( *CurDAG->getContext(), F.getCallingConv(), VT); @@ -9598,7 +9691,7 @@ void SelectionDAGISel::LowerArguments(const Function &F) { MyFlags.Flags.setSplit(); // if it isn't first piece, alignment must be 1 else if (i > 0) { - MyFlags.Flags.setOrigAlign(1); + MyFlags.Flags.setOrigAlign(Align::None()); if (i == NumRegs - 1) MyFlags.Flags.setSplitEnd(); } @@ -9650,7 +9743,8 @@ void SelectionDAGISel::LowerArguments(const Function &F) { MachineFunction& MF = SDB->DAG.getMachineFunction(); MachineRegisterInfo& RegInfo = MF.getRegInfo(); - unsigned SRetReg = RegInfo.createVirtualRegister(TLI->getRegClassFor(RegVT)); + Register SRetReg = + RegInfo.createVirtualRegister(TLI->getRegClassFor(RegVT)); FuncInfo->DemoteRegister = SRetReg; NewRoot = SDB->DAG.getCopyToReg(NewRoot, SDB->getCurSDLoc(), SRetReg, ArgValue); @@ -9748,10 +9842,14 @@ void SelectionDAGISel::LowerArguments(const Function &F) { FuncInfo->setArgumentFrameIndex(&Arg, FI->getIndex()); } + // Analyses past this point are naive and don't expect an assertion. + if (Res.getOpcode() == ISD::AssertZext) + Res = Res.getOperand(0); + // Update the SwiftErrorVRegDefMap. 
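Editor's note: throughout this patch, TargetRegisterInfo::isVirtualRegister / isPhysicalRegister / virtReg2Index calls migrate to the Register class. A rough standalone sketch of the underlying convention (a tag bit in the register number distinguishes virtual from physical); the exact encoding is an assumption for illustration, not taken from this diff.

#include <cassert>

// Illustrative only: the top bit marks a virtual register, small positive
// numbers are physical registers, and zero means "no register".
struct RegisterSketch {
  static constexpr unsigned VirtualFlag = 1u << 31;

  static bool isVirtual(unsigned R) { return (R & VirtualFlag) != 0; }
  static bool isPhysical(unsigned R) { return R != 0 && !isVirtual(R); }
  static unsigned virtRegToIndex(unsigned R) {
    assert(isVirtual(R) && "not a virtual register");
    return R & ~VirtualFlag;
  }
};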
if (Res.getOpcode() == ISD::CopyFromReg && isSwiftErrorArg) { unsigned Reg = cast(Res.getOperand(1))->getReg(); - if (TargetRegisterInfo::isVirtualRegister(Reg)) + if (Register::isVirtualRegister(Reg)) SwiftError->setCurrentVReg(FuncInfo->MBB, SwiftError->getFunctionArg(), Reg); } @@ -9763,7 +9861,7 @@ void SelectionDAGISel::LowerArguments(const Function &F) { // FIXME: This isn't very clean... it would be nice to make this more // general. unsigned Reg = cast(Res.getOperand(1))->getReg(); - if (TargetRegisterInfo::isVirtualRegister(Reg)) { + if (Register::isVirtualRegister(Reg)) { FuncInfo->ValueMap[&Arg] = Reg; continue; } @@ -10087,8 +10185,6 @@ void SelectionDAGBuilder::lowerWorkItem(SwitchWorkListItem W, Value *Cond, break; } case CC_BitTests: { - // FIXME: If Fallthrough is unreachable, skip the range check. - // FIXME: Optimize away range check based on pivot comparisons. BitTestBlock *BTB = &SL->BitTestCases[I->BTCasesIndex]; @@ -10109,6 +10205,11 @@ void SelectionDAGBuilder::lowerWorkItem(SwitchWorkListItem W, Value *Cond, BTB->DefaultProb -= DefaultProb / 2; } + if (FallthroughUnreachable) { + // Skip the range check if the fallthrough block is unreachable. + BTB->OmitRangeCheck = true; + } + // If we're in the right place, emit the bit test header right now. if (CurMBB == SwitchMBB) { visitBitTestHeader(*BTB, SwitchMBB); diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h index 0072e33f23b7..bfcf30b430b6 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h +++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h @@ -426,7 +426,7 @@ public: SelectionDAGBuilder(SelectionDAG &dag, FunctionLoweringInfo &funcinfo, SwiftErrorValueTracking &swifterror, CodeGenOpt::Level ol) : SDNodeOrder(LowestSDNodeOrder), TM(dag.getTarget()), DAG(dag), - SL(make_unique(this, funcinfo)), FuncInfo(funcinfo), + SL(std::make_unique(this, funcinfo)), FuncInfo(funcinfo), SwiftError(swifterror) {} void init(GCFunctionInfo *gfi, AliasAnalysis *AA, diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp index da3049881d31..bc10f7621239 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp @@ -280,6 +280,7 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const { case ISD::EXTRACT_SUBVECTOR: return "extract_subvector"; case ISD::SCALAR_TO_VECTOR: return "scalar_to_vector"; case ISD::VECTOR_SHUFFLE: return "vector_shuffle"; + case ISD::SPLAT_VECTOR: return "splat_vector"; case ISD::CARRY_FALSE: return "carry_false"; case ISD::ADDC: return "addc"; case ISD::ADDE: return "adde"; @@ -305,6 +306,7 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const { case ISD::SMULFIX: return "smulfix"; case ISD::SMULFIXSAT: return "smulfixsat"; case ISD::UMULFIX: return "umulfix"; + case ISD::UMULFIXSAT: return "umulfixsat"; // Conversion operators. 
case ISD::SIGN_EXTEND: return "sign_extend"; @@ -318,22 +320,27 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const { case ISD::FP_ROUND: return "fp_round"; case ISD::STRICT_FP_ROUND: return "strict_fp_round"; case ISD::FLT_ROUNDS_: return "flt_rounds"; - case ISD::FP_ROUND_INREG: return "fp_round_inreg"; case ISD::FP_EXTEND: return "fp_extend"; case ISD::STRICT_FP_EXTEND: return "strict_fp_extend"; case ISD::SINT_TO_FP: return "sint_to_fp"; case ISD::UINT_TO_FP: return "uint_to_fp"; case ISD::FP_TO_SINT: return "fp_to_sint"; + case ISD::STRICT_FP_TO_SINT: return "strict_fp_to_sint"; case ISD::FP_TO_UINT: return "fp_to_uint"; + case ISD::STRICT_FP_TO_UINT: return "strict_fp_to_uint"; case ISD::BITCAST: return "bitcast"; case ISD::ADDRSPACECAST: return "addrspacecast"; case ISD::FP16_TO_FP: return "fp16_to_fp"; case ISD::FP_TO_FP16: return "fp_to_fp16"; case ISD::LROUND: return "lround"; + case ISD::STRICT_LROUND: return "strict_lround"; case ISD::LLROUND: return "llround"; + case ISD::STRICT_LLROUND: return "strict_llround"; case ISD::LRINT: return "lrint"; + case ISD::STRICT_LRINT: return "strict_lrint"; case ISD::LLRINT: return "llrint"; + case ISD::STRICT_LLRINT: return "strict_llrint"; // Control flow instructions case ISD::BR: return "br"; diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp index bdf9f2c166e1..1f07a241a824 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp @@ -27,6 +27,7 @@ #include "llvm/Analysis/BranchProbabilityInfo.h" #include "llvm/Analysis/CFG.h" #include "llvm/Analysis/EHPersonalities.h" +#include "llvm/Analysis/LegacyDivergenceAnalysis.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" @@ -434,9 +435,9 @@ bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) { TII = MF->getSubtarget().getInstrInfo(); TLI = MF->getSubtarget().getTargetLowering(); RegInfo = &MF->getRegInfo(); - LibInfo = &getAnalysis().getTLI(); + LibInfo = &getAnalysis().getTLI(Fn); GFI = Fn.hasGC() ? &getAnalysis().getFunctionInfo(Fn) : nullptr; - ORE = make_unique(&Fn); + ORE = std::make_unique(&Fn); auto *DTWP = getAnalysisIfAvailable(); DominatorTree *DT = DTWP ? &DTWP->getDomTree() : nullptr; auto *LIWP = getAnalysisIfAvailable(); @@ -524,8 +525,7 @@ bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) { To = J->second; } // Make sure the new register has a sufficiently constrained register class. - if (TargetRegisterInfo::isVirtualRegister(From) && - TargetRegisterInfo::isVirtualRegister(To)) + if (Register::isVirtualRegister(From) && Register::isVirtualRegister(To)) MRI.constrainRegClass(To, MRI.getRegClass(From)); // Replace it. @@ -572,7 +572,7 @@ bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) { bool hasFI = MI->getOperand(0).isFI(); Register Reg = hasFI ? 
TRI.getFrameRegister(*MF) : MI->getOperand(0).getReg(); - if (TargetRegisterInfo::isPhysicalRegister(Reg)) + if (Register::isPhysicalRegister(Reg)) EntryMBB->insert(EntryMBB->begin(), MI); else { MachineInstr *Def = RegInfo->getVRegDef(Reg); @@ -582,7 +582,7 @@ bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) { Def->getParent()->insert(std::next(InsertPos), MI); } else LLVM_DEBUG(dbgs() << "Dropping debug info for dead vreg" - << TargetRegisterInfo::virtReg2Index(Reg) << "\n"); + << Register::virtReg2Index(Reg) << "\n"); } // If Reg is live-in then update debug info to track its copy in a vreg. @@ -671,8 +671,7 @@ bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) { To = J->second; } // Make sure the new register has a sufficiently constrained register class. - if (TargetRegisterInfo::isVirtualRegister(From) && - TargetRegisterInfo::isVirtualRegister(To)) + if (Register::isVirtualRegister(From) && Register::isVirtualRegister(To)) MRI.constrainRegClass(To, MRI.getRegClass(From)); // Replace it. @@ -760,7 +759,7 @@ void SelectionDAGISel::ComputeLiveOutVRegInfo() { continue; unsigned DestReg = cast(N->getOperand(1))->getReg(); - if (!TargetRegisterInfo::isVirtualRegister(DestReg)) + if (!Register::isVirtualRegister(DestReg)) continue; // Ignore non-integer values. @@ -1652,9 +1651,8 @@ static bool MIIsInTerminatorSequence(const MachineInstr &MI) { // Make sure that the copy dest is not a vreg when the copy source is a // physical register. - if (!OPI2->isReg() || - (!TargetRegisterInfo::isPhysicalRegister(OPI->getReg()) && - TargetRegisterInfo::isPhysicalRegister(OPI2->getReg()))) + if (!OPI2->isReg() || (!Register::isPhysicalRegister(OPI->getReg()) && + Register::isPhysicalRegister(OPI2->getReg()))) return false; return true; @@ -2234,9 +2232,9 @@ void SelectionDAGISel::Select_READ_REGISTER(SDNode *Op) { SDLoc dl(Op); MDNodeSDNode *MD = dyn_cast(Op->getOperand(1)); const MDString *RegStr = dyn_cast(MD->getMD()->getOperand(0)); - unsigned Reg = + Register Reg = TLI->getRegisterByName(RegStr->getString().data(), Op->getValueType(0), - *CurDAG); + CurDAG->getMachineFunction()); SDValue New = CurDAG->getCopyFromReg( Op->getOperand(0), dl, Reg, Op->getValueType(0)); New->setNodeId(-1); @@ -2248,9 +2246,9 @@ void SelectionDAGISel::Select_WRITE_REGISTER(SDNode *Op) { SDLoc dl(Op); MDNodeSDNode *MD = dyn_cast(Op->getOperand(1)); const MDString *RegStr = dyn_cast(MD->getMD()->getOperand(0)); - unsigned Reg = TLI->getRegisterByName(RegStr->getString().data(), + Register Reg = TLI->getRegisterByName(RegStr->getString().data(), Op->getOperand(2).getValueType(), - *CurDAG); + CurDAG->getMachineFunction()); SDValue New = CurDAG->getCopyToReg( Op->getOperand(0), dl, Reg, Op->getOperand(2)); New->setNodeId(-1); @@ -3323,10 +3321,13 @@ void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch, continue; } - case OPC_EmitCopyToReg: { + case OPC_EmitCopyToReg: + case OPC_EmitCopyToReg2: { unsigned RecNo = MatcherTable[MatcherIndex++]; assert(RecNo < RecordedNodes.size() && "Invalid EmitCopyToReg"); unsigned DestPhysReg = MatcherTable[MatcherIndex++]; + if (Opcode == OPC_EmitCopyToReg2) + DestPhysReg |= MatcherTable[MatcherIndex++] << 8; if (!InputChain.getNode()) InputChain = CurDAG->getEntryNode(); diff --git a/lib/CodeGen/SelectionDAG/StatepointLowering.cpp b/lib/CodeGen/SelectionDAG/StatepointLowering.cpp index 395e9a8a4fc5..fad98b6f50dc 100644 --- a/lib/CodeGen/SelectionDAG/StatepointLowering.cpp +++ b/lib/CodeGen/SelectionDAG/StatepointLowering.cpp @@ -378,7 +378,6 @@ 
spillIncomingStatepointValue(SDValue Incoming, SDValue Chain, // We use TargetFrameIndex so that isel will not select it into LEA Loc = Builder.DAG.getTargetFrameIndex(Index, Builder.getFrameIndexTy()); -#ifndef NDEBUG // Right now we always allocate spill slots that are of the same // size as the value we're about to spill (the size of spillee can // vary since we spill vectors of pointers too). At some point we @@ -387,12 +386,18 @@ spillIncomingStatepointValue(SDValue Incoming, SDValue Chain, MachineFrameInfo &MFI = Builder.DAG.getMachineFunction().getFrameInfo(); assert((MFI.getObjectSize(Index) * 8) == Incoming.getValueSizeInBits() && "Bad spill: stack slot does not match!"); -#endif + // Note: Using the alignment of the spill slot (rather than the abi or + // preferred alignment) is required for correctness when dealing with spill + // slots with preferred alignments larger than frame alignment.. auto &MF = Builder.DAG.getMachineFunction(); auto PtrInfo = MachinePointerInfo::getFixedStack(MF, Index); + auto *StoreMMO = + MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore, + MFI.getObjectSize(Index), + MFI.getObjectAlignment(Index)); Chain = Builder.DAG.getStore(Chain, Builder.getCurSDLoc(), Incoming, Loc, - PtrInfo); + StoreMMO); MMO = getMachineMemOperand(MF, *cast(Loc)); @@ -1011,20 +1016,27 @@ void SelectionDAGBuilder::visitGCRelocate(const GCRelocateInst &Relocate) { return; } - SDValue SpillSlot = - DAG.getTargetFrameIndex(*DerivedPtrLocation, getFrameIndexTy()); + unsigned Index = *DerivedPtrLocation; + SDValue SpillSlot = DAG.getTargetFrameIndex(Index, getFrameIndexTy()); // Note: We know all of these reloads are independent, but don't bother to // exploit that chain wise. DAGCombine will happily do so as needed, so // doing it here would be a small compile time win at most. SDValue Chain = getRoot(); - SDValue SpillLoad = - DAG.getLoad(DAG.getTargetLoweringInfo().getValueType(DAG.getDataLayout(), - Relocate.getType()), - getCurSDLoc(), Chain, SpillSlot, - MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), - *DerivedPtrLocation)); + auto &MF = DAG.getMachineFunction(); + auto &MFI = MF.getFrameInfo(); + auto PtrInfo = MachinePointerInfo::getFixedStack(MF, Index); + auto *LoadMMO = + MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOLoad, + MFI.getObjectSize(Index), + MFI.getObjectAlignment(Index)); + + auto LoadVT = DAG.getTargetLoweringInfo().getValueType(DAG.getDataLayout(), + Relocate.getType()); + + SDValue SpillLoad = DAG.getLoad(LoadVT, getCurSDLoc(), Chain, + SpillSlot, LoadMMO); DAG.setRoot(SpillLoad.getValue(1)); diff --git a/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/lib/CodeGen/SelectionDAG/TargetLowering.cpp index b260cd91d468..9ab1324533f1 100644 --- a/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -11,7 +11,6 @@ //===----------------------------------------------------------------------===// #include "llvm/CodeGen/TargetLowering.h" -#include "llvm/ADT/BitVector.h" #include "llvm/ADT/STLExtras.h" #include "llvm/CodeGen/CallingConvLower.h" #include "llvm/CodeGen/MachineFrameInfo.h" @@ -37,7 +36,7 @@ using namespace llvm; /// NOTE: The TargetMachine owns TLOF. 
TargetLowering::TargetLowering(const TargetMachine &tm) - : TargetLoweringBase(tm) {} + : TargetLoweringBase(tm) {} const char *TargetLowering::getTargetNodeName(unsigned Opcode) const { return nullptr; @@ -80,7 +79,7 @@ bool TargetLowering::parametersInCSRMatch(const MachineRegisterInfo &MRI, const CCValAssign &ArgLoc = ArgLocs[I]; if (!ArgLoc.isRegLoc()) continue; - unsigned Reg = ArgLoc.getLocReg(); + Register Reg = ArgLoc.getLocReg(); // Only look at callee saved registers. if (MachineOperand::clobbersPhysReg(CallerPreservedMask, Reg)) continue; @@ -121,19 +120,25 @@ void TargetLoweringBase::ArgListEntry::setAttributes(const CallBase *Call, /// result of type RetVT. std::pair TargetLowering::makeLibCall(SelectionDAG &DAG, RTLIB::Libcall LC, EVT RetVT, - ArrayRef Ops, bool isSigned, - const SDLoc &dl, bool doesNotReturn, - bool isReturnValueUsed, - bool isPostTypeLegalization) const { + ArrayRef Ops, + MakeLibCallOptions CallOptions, + const SDLoc &dl) const { TargetLowering::ArgListTy Args; Args.reserve(Ops.size()); TargetLowering::ArgListEntry Entry; - for (SDValue Op : Ops) { - Entry.Node = Op; + for (unsigned i = 0; i < Ops.size(); ++i) { + SDValue NewOp = Ops[i]; + Entry.Node = NewOp; Entry.Ty = Entry.Node.getValueType().getTypeForEVT(*DAG.getContext()); - Entry.IsSExt = shouldSignExtendTypeInLibCall(Op.getValueType(), isSigned); - Entry.IsZExt = !shouldSignExtendTypeInLibCall(Op.getValueType(), isSigned); + Entry.IsSExt = shouldSignExtendTypeInLibCall(NewOp.getValueType(), + CallOptions.IsSExt); + Entry.IsZExt = !Entry.IsSExt; + + if (CallOptions.IsSoften && + !shouldExtendTypeInLibCall(CallOptions.OpsVTBeforeSoften[i])) { + Entry.IsSExt = Entry.IsZExt = false; + } Args.push_back(Entry); } @@ -144,15 +149,22 @@ TargetLowering::makeLibCall(SelectionDAG &DAG, RTLIB::Libcall LC, EVT RetVT, Type *RetTy = RetVT.getTypeForEVT(*DAG.getContext()); TargetLowering::CallLoweringInfo CLI(DAG); - bool signExtend = shouldSignExtendTypeInLibCall(RetVT, isSigned); + bool signExtend = shouldSignExtendTypeInLibCall(RetVT, CallOptions.IsSExt); + bool zeroExtend = !signExtend; + + if (CallOptions.IsSoften && + !shouldExtendTypeInLibCall(CallOptions.RetVTBeforeSoften)) { + signExtend = zeroExtend = false; + } + CLI.setDebugLoc(dl) .setChain(DAG.getEntryNode()) .setLibCallee(getLibcallCallingConv(LC), RetTy, Callee, std::move(Args)) - .setNoReturn(doesNotReturn) - .setDiscardResult(!isReturnValueUsed) - .setIsPostTypeLegalization(isPostTypeLegalization) + .setNoReturn(CallOptions.DoesNotReturn) + .setDiscardResult(!CallOptions.IsReturnValueUsed) + .setIsPostTypeLegalization(CallOptions.IsPostTypeLegalization) .setSExtResult(signExtend) - .setZExtResult(!signExtend); + .setZExtResult(zeroExtend); return LowerCallTo(CLI); } @@ -263,7 +275,8 @@ TargetLowering::findOptimalMemOpLowering(std::vector &MemOps, void TargetLowering::softenSetCCOperands(SelectionDAG &DAG, EVT VT, SDValue &NewLHS, SDValue &NewRHS, ISD::CondCode &CCCode, - const SDLoc &dl) const { + const SDLoc &dl, const SDValue OldLHS, + const SDValue OldRHS) const { assert((VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f128 || VT == MVT::ppcf128) && "Unsupported setcc type!"); @@ -365,8 +378,11 @@ void TargetLowering::softenSetCCOperands(SelectionDAG &DAG, EVT VT, // Use the target specific return value for comparions lib calls. 
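Editor's note: this hunk folds makeLibCall's string of boolean parameters into a MakeLibCallOptions argument. A generic sketch of that refactoring pattern (named, chainable setters instead of positional bools); the member names below are invented for illustration and are not the patch's exact fields.

// Before: easy to pass bools in the wrong order at the call site.
//   makeCall(DAG, LC, VT, Ops, /*signed*/false, DL, /*noreturn*/false, true);
// After: each option is named where it is set, and defaults live in one place.
struct CallOptionsSketch {
  bool IsSigned = false;
  bool DoesNotReturn = false;
  bool IsReturnValueUsed = true;
  bool IsPostTypeLegalization = false;

  CallOptionsSketch &setIsSigned(bool V) { IsSigned = V; return *this; }
  CallOptionsSketch &setNoReturn(bool V) { DoesNotReturn = V; return *this; }
  CallOptionsSketch &setDiscardResult(bool V) {
    IsReturnValueUsed = !V;
    return *this;
  }
};

// Usage sketch:
//   CallOptionsSketch Opts;
//   Opts.setIsSigned(true).setNoReturn(false);
//   makeCall(DAG, LC, VT, Ops, Opts, DL);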
EVT RetVT = getCmpLibcallReturnType(); SDValue Ops[2] = {NewLHS, NewRHS}; - NewLHS = makeLibCall(DAG, LC1, RetVT, Ops, false /*sign irrelevant*/, - dl).first; + TargetLowering::MakeLibCallOptions CallOptions; + EVT OpsVT[2] = { OldLHS.getValueType(), + OldRHS.getValueType() }; + CallOptions.setTypeListBeforeSoften(OpsVT, RetVT, true); + NewLHS = makeLibCall(DAG, LC1, RetVT, Ops, CallOptions, dl).first; NewRHS = DAG.getConstant(0, dl, RetVT); CCCode = getCmpLibcallCC(LC1); @@ -378,8 +394,7 @@ void TargetLowering::softenSetCCOperands(SelectionDAG &DAG, EVT VT, ISD::SETCC, dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), RetVT), NewLHS, NewRHS, DAG.getCondCode(CCCode)); - NewLHS = makeLibCall(DAG, LC2, RetVT, Ops, false/*sign irrelevant*/, - dl).first; + NewLHS = makeLibCall(DAG, LC2, RetVT, Ops, CallOptions, dl).first; NewLHS = DAG.getNode( ISD::SETCC, dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), RetVT), @@ -564,6 +579,170 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, AssumeSingleUse); } +// TODO: Can we merge SelectionDAG::GetDemandedBits into this? +// TODO: Under what circumstances can we create nodes? Constant folding? +SDValue TargetLowering::SimplifyMultipleUseDemandedBits( + SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, + SelectionDAG &DAG, unsigned Depth) const { + // Limit search depth. + if (Depth >= SelectionDAG::MaxRecursionDepth) + return SDValue(); + + // Ignore UNDEFs. + if (Op.isUndef()) + return SDValue(); + + // Not demanding any bits/elts from Op. + if (DemandedBits == 0 || DemandedElts == 0) + return DAG.getUNDEF(Op.getValueType()); + + unsigned NumElts = DemandedElts.getBitWidth(); + KnownBits LHSKnown, RHSKnown; + switch (Op.getOpcode()) { + case ISD::BITCAST: { + SDValue Src = peekThroughBitcasts(Op.getOperand(0)); + EVT SrcVT = Src.getValueType(); + EVT DstVT = Op.getValueType(); + unsigned NumSrcEltBits = SrcVT.getScalarSizeInBits(); + unsigned NumDstEltBits = DstVT.getScalarSizeInBits(); + + if (NumSrcEltBits == NumDstEltBits) + if (SDValue V = SimplifyMultipleUseDemandedBits( + Src, DemandedBits, DemandedElts, DAG, Depth + 1)) + return DAG.getBitcast(DstVT, V); + + // TODO - bigendian once we have test coverage. + if (SrcVT.isVector() && (NumDstEltBits % NumSrcEltBits) == 0 && + DAG.getDataLayout().isLittleEndian()) { + unsigned Scale = NumDstEltBits / NumSrcEltBits; + unsigned NumSrcElts = SrcVT.getVectorNumElements(); + APInt DemandedSrcBits = APInt::getNullValue(NumSrcEltBits); + APInt DemandedSrcElts = APInt::getNullValue(NumSrcElts); + for (unsigned i = 0; i != Scale; ++i) { + unsigned Offset = i * NumSrcEltBits; + APInt Sub = DemandedBits.extractBits(NumSrcEltBits, Offset); + if (!Sub.isNullValue()) { + DemandedSrcBits |= Sub; + for (unsigned j = 0; j != NumElts; ++j) + if (DemandedElts[j]) + DemandedSrcElts.setBit((j * Scale) + i); + } + } + + if (SDValue V = SimplifyMultipleUseDemandedBits( + Src, DemandedSrcBits, DemandedSrcElts, DAG, Depth + 1)) + return DAG.getBitcast(DstVT, V); + } + + // TODO - bigendian once we have test coverage. + if ((NumSrcEltBits % NumDstEltBits) == 0 && + DAG.getDataLayout().isLittleEndian()) { + unsigned Scale = NumSrcEltBits / NumDstEltBits; + unsigned NumSrcElts = SrcVT.isVector() ? 
SrcVT.getVectorNumElements() : 1; + APInt DemandedSrcBits = APInt::getNullValue(NumSrcEltBits); + APInt DemandedSrcElts = APInt::getNullValue(NumSrcElts); + for (unsigned i = 0; i != NumElts; ++i) + if (DemandedElts[i]) { + unsigned Offset = (i % Scale) * NumDstEltBits; + DemandedSrcBits.insertBits(DemandedBits, Offset); + DemandedSrcElts.setBit(i / Scale); + } + + if (SDValue V = SimplifyMultipleUseDemandedBits( + Src, DemandedSrcBits, DemandedSrcElts, DAG, Depth + 1)) + return DAG.getBitcast(DstVT, V); + } + + break; + } + case ISD::AND: { + LHSKnown = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1); + RHSKnown = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1); + + // If all of the demanded bits are known 1 on one side, return the other. + // These bits cannot contribute to the result of the 'and' in this + // context. + if (DemandedBits.isSubsetOf(LHSKnown.Zero | RHSKnown.One)) + return Op.getOperand(0); + if (DemandedBits.isSubsetOf(RHSKnown.Zero | LHSKnown.One)) + return Op.getOperand(1); + break; + } + case ISD::OR: { + LHSKnown = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1); + RHSKnown = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1); + + // If all of the demanded bits are known zero on one side, return the + // other. These bits cannot contribute to the result of the 'or' in this + // context. + if (DemandedBits.isSubsetOf(LHSKnown.One | RHSKnown.Zero)) + return Op.getOperand(0); + if (DemandedBits.isSubsetOf(RHSKnown.One | LHSKnown.Zero)) + return Op.getOperand(1); + break; + } + case ISD::XOR: { + LHSKnown = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1); + RHSKnown = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1); + + // If all of the demanded bits are known zero on one side, return the + // other. + if (DemandedBits.isSubsetOf(RHSKnown.Zero)) + return Op.getOperand(0); + if (DemandedBits.isSubsetOf(LHSKnown.Zero)) + return Op.getOperand(1); + break; + } + case ISD::SIGN_EXTEND_INREG: { + // If none of the extended bits are demanded, eliminate the sextinreg. + EVT ExVT = cast(Op.getOperand(1))->getVT(); + if (DemandedBits.getActiveBits() <= ExVT.getScalarSizeInBits()) + return Op.getOperand(0); + break; + } + case ISD::INSERT_VECTOR_ELT: { + // If we don't demand the inserted element, return the base vector. + SDValue Vec = Op.getOperand(0); + auto *CIdx = dyn_cast(Op.getOperand(2)); + EVT VecVT = Vec.getValueType(); + if (CIdx && CIdx->getAPIntValue().ult(VecVT.getVectorNumElements()) && + !DemandedElts[CIdx->getZExtValue()]) + return Vec; + break; + } + case ISD::VECTOR_SHUFFLE: { + ArrayRef ShuffleMask = cast(Op)->getMask(); + + // If all the demanded elts are from one operand and are inline, + // then we can use the operand directly. + bool AllUndef = true, IdentityLHS = true, IdentityRHS = true; + for (unsigned i = 0; i != NumElts; ++i) { + int M = ShuffleMask[i]; + if (M < 0 || !DemandedElts[i]) + continue; + AllUndef = false; + IdentityLHS &= (M == (int)i); + IdentityRHS &= ((M - NumElts) == i); + } + + if (AllUndef) + return DAG.getUNDEF(Op.getValueType()); + if (IdentityLHS) + return Op.getOperand(0); + if (IdentityRHS) + return Op.getOperand(1); + break; + } + default: + if (Op.getOpcode() >= ISD::BUILTIN_OP_END) + if (SDValue V = SimplifyMultipleUseDemandedBitsForTargetNode( + Op, DemandedBits, DemandedElts, DAG, Depth)) + return V; + break; + } + return SDValue(); +} + /// Look at Op. 
At this point, we know that only the OriginalDemandedBits of the /// result of Op are ever used downstream. If we can use this information to /// simplify Op, create a new simplified DAG node and return true, returning the @@ -619,12 +798,15 @@ bool TargetLowering::SimplifyDemandedBits( } else if (OriginalDemandedBits == 0 || OriginalDemandedElts == 0) { // Not demanding any bits/elts from Op. return TLO.CombineTo(Op, TLO.DAG.getUNDEF(VT)); - } else if (Depth == 6) { // Limit search depth. + } else if (Depth >= SelectionDAG::MaxRecursionDepth) { + // Limit search depth. return false; } KnownBits Known2, KnownOut; switch (Op.getOpcode()) { + case ISD::TargetConstant: + llvm_unreachable("Can't simplify this node"); case ISD::SCALAR_TO_VECTOR: { if (!DemandedElts[0]) return TLO.CombineTo(Op, TLO.DAG.getUNDEF(VT)); @@ -728,6 +910,21 @@ bool TargetLowering::SimplifyDemandedBits( } break; } + case ISD::EXTRACT_SUBVECTOR: { + // If index isn't constant, assume we need all the source vector elements. + SDValue Src = Op.getOperand(0); + ConstantSDNode *SubIdx = dyn_cast(Op.getOperand(1)); + unsigned NumSrcElts = Src.getValueType().getVectorNumElements(); + APInt SrcElts = APInt::getAllOnesValue(NumSrcElts); + if (SubIdx && SubIdx->getAPIntValue().ule(NumSrcElts - NumElts)) { + // Offset the demanded elts by the subvector index. + uint64_t Idx = SubIdx->getZExtValue(); + SrcElts = DemandedElts.zextOrSelf(NumSrcElts).shl(Idx); + } + if (SimplifyDemandedBits(Src, DemandedBits, SrcElts, Known, TLO, Depth + 1)) + return true; + break; + } case ISD::CONCAT_VECTORS: { Known.Zero.setAllBits(); Known.One.setAllBits(); @@ -773,22 +970,37 @@ bool TargetLowering::SimplifyDemandedBits( } if (!!DemandedLHS || !!DemandedRHS) { + SDValue Op0 = Op.getOperand(0); + SDValue Op1 = Op.getOperand(1); + Known.Zero.setAllBits(); Known.One.setAllBits(); if (!!DemandedLHS) { - if (SimplifyDemandedBits(Op.getOperand(0), DemandedBits, DemandedLHS, - Known2, TLO, Depth + 1)) + if (SimplifyDemandedBits(Op0, DemandedBits, DemandedLHS, Known2, TLO, + Depth + 1)) return true; Known.One &= Known2.One; Known.Zero &= Known2.Zero; } if (!!DemandedRHS) { - if (SimplifyDemandedBits(Op.getOperand(1), DemandedBits, DemandedRHS, - Known2, TLO, Depth + 1)) + if (SimplifyDemandedBits(Op1, DemandedBits, DemandedRHS, Known2, TLO, + Depth + 1)) return true; Known.One &= Known2.One; Known.Zero &= Known2.Zero; } + + // Attempt to avoid multi-use ops if we don't need anything from them. + SDValue DemandedOp0 = SimplifyMultipleUseDemandedBits( + Op0, DemandedBits, DemandedLHS, TLO.DAG, Depth + 1); + SDValue DemandedOp1 = SimplifyMultipleUseDemandedBits( + Op1, DemandedBits, DemandedRHS, TLO.DAG, Depth + 1); + if (DemandedOp0 || DemandedOp1) { + Op0 = DemandedOp0 ? DemandedOp0 : Op0; + Op1 = DemandedOp1 ? DemandedOp1 : Op1; + SDValue NewOp = TLO.DAG.getVectorShuffle(VT, dl, Op0, Op1, ShuffleMask); + return TLO.CombineTo(Op, NewOp); + } } break; } @@ -834,6 +1046,20 @@ bool TargetLowering::SimplifyDemandedBits( return true; assert(!Known2.hasConflict() && "Bits known to be one AND zero?"); + // Attempt to avoid multi-use ops if we don't need anything from them. + if (!DemandedBits.isAllOnesValue() || !DemandedElts.isAllOnesValue()) { + SDValue DemandedOp0 = SimplifyMultipleUseDemandedBits( + Op0, DemandedBits, DemandedElts, TLO.DAG, Depth + 1); + SDValue DemandedOp1 = SimplifyMultipleUseDemandedBits( + Op1, DemandedBits, DemandedElts, TLO.DAG, Depth + 1); + if (DemandedOp0 || DemandedOp1) { + Op0 = DemandedOp0 ? 
DemandedOp0 : Op0; + Op1 = DemandedOp1 ? DemandedOp1 : Op1; + SDValue NewOp = TLO.DAG.getNode(Op.getOpcode(), dl, VT, Op0, Op1); + return TLO.CombineTo(Op, NewOp); + } + } + // If all of the demanded bits are known one on one side, return the other. // These bits cannot contribute to the result of the 'and'. if (DemandedBits.isSubsetOf(Known2.Zero | Known.One)) @@ -869,6 +1095,20 @@ bool TargetLowering::SimplifyDemandedBits( return true; assert(!Known2.hasConflict() && "Bits known to be one AND zero?"); + // Attempt to avoid multi-use ops if we don't need anything from them. + if (!DemandedBits.isAllOnesValue() || !DemandedElts.isAllOnesValue()) { + SDValue DemandedOp0 = SimplifyMultipleUseDemandedBits( + Op0, DemandedBits, DemandedElts, TLO.DAG, Depth + 1); + SDValue DemandedOp1 = SimplifyMultipleUseDemandedBits( + Op1, DemandedBits, DemandedElts, TLO.DAG, Depth + 1); + if (DemandedOp0 || DemandedOp1) { + Op0 = DemandedOp0 ? DemandedOp0 : Op0; + Op1 = DemandedOp1 ? DemandedOp1 : Op1; + SDValue NewOp = TLO.DAG.getNode(Op.getOpcode(), dl, VT, Op0, Op1); + return TLO.CombineTo(Op, NewOp); + } + } + // If all of the demanded bits are known zero on one side, return the other. // These bits cannot contribute to the result of the 'or'. if (DemandedBits.isSubsetOf(Known2.One | Known.Zero)) @@ -901,6 +1141,20 @@ bool TargetLowering::SimplifyDemandedBits( return true; assert(!Known2.hasConflict() && "Bits known to be one AND zero?"); + // Attempt to avoid multi-use ops if we don't need anything from them. + if (!DemandedBits.isAllOnesValue() || !DemandedElts.isAllOnesValue()) { + SDValue DemandedOp0 = SimplifyMultipleUseDemandedBits( + Op0, DemandedBits, DemandedElts, TLO.DAG, Depth + 1); + SDValue DemandedOp1 = SimplifyMultipleUseDemandedBits( + Op1, DemandedBits, DemandedElts, TLO.DAG, Depth + 1); + if (DemandedOp0 || DemandedOp1) { + Op0 = DemandedOp0 ? DemandedOp0 : Op0; + Op1 = DemandedOp1 ? DemandedOp1 : Op1; + SDValue NewOp = TLO.DAG.getNode(Op.getOpcode(), dl, VT, Op0, Op1); + return TLO.CombineTo(Op, NewOp); + } + } + // If all of the demanded bits are known zero on one side, return the other. // These bits cannot contribute to the result of the 'xor'. if (DemandedBits.isSubsetOf(Known.Zero)) @@ -1034,7 +1288,7 @@ bool TargetLowering::SimplifyDemandedBits( // out) are never demanded. // TODO - support non-uniform vector amounts. if (Op0.getOpcode() == ISD::SRL) { - if ((DemandedBits & APInt::getLowBitsSet(BitWidth, ShAmt)) == 0) { + if (!DemandedBits.intersects(APInt::getLowBitsSet(BitWidth, ShAmt))) { if (ConstantSDNode *SA2 = isConstOrConstSplat(Op0.getOperand(1), DemandedElts)) { if (SA2->getAPIntValue().ult(BitWidth)) { @@ -1141,7 +1395,8 @@ bool TargetLowering::SimplifyDemandedBits( if (Op0.getOpcode() == ISD::SHL) { if (ConstantSDNode *SA2 = isConstOrConstSplat(Op0.getOperand(1), DemandedElts)) { - if ((DemandedBits & APInt::getHighBitsSet(BitWidth, ShAmt)) == 0) { + if (!DemandedBits.intersects( + APInt::getHighBitsSet(BitWidth, ShAmt))) { if (SA2->getAPIntValue().ult(BitWidth)) { unsigned C1 = SA2->getZExtValue(); unsigned Opc = ISD::SRL; @@ -1479,6 +1734,11 @@ bool TargetLowering::SimplifyDemandedBits( return true; Known = Known.trunc(BitWidth); + // Attempt to avoid multi-use ops if we don't need anything from them. 
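Editor's note: a small arithmetic sanity check of the known-bits reasoning used by SimplifyMultipleUseDemandedBits above: if every bit demanded from (X | Y) is already known to be zero in Y, the OR contributes nothing and X can be used directly, and symmetrically (X & Y) can be replaced by X where Y is known-one. A standalone demonstration with concrete masks, assuming nothing beyond ordinary unsigned arithmetic.

#include <cassert>
#include <cstdint>

int main() {
  const uint32_t Demanded = 0x000000FFu; // only the low byte is used downstream
  for (uint32_t X = 0; X < (1u << 16); X += 97) {
    // Y is known to be zero in every demanded bit (its low 16 bits are clear).
    uint32_t Y = (X * 0x01010000u) & 0xFFFF0000u;
    assert((Y & Demanded) == 0);
    // Therefore (X | Y) and X agree on every demanded bit.
    assert(((X | Y) & Demanded) == (X & Demanded));
    // Likewise, where Y is known-one in all demanded bits, (X & Y) matches X.
    uint32_t KnownOneLow = Y | Demanded;
    assert(((X & KnownOneLow) & Demanded) == (X & Demanded));
  }
  return 0;
}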
+ if (SDValue NewSrc = SimplifyMultipleUseDemandedBits( + Src, TruncMask, DemandedElts, TLO.DAG, Depth + 1)) + return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::TRUNCATE, dl, VT, NewSrc)); + // If the input is only used by this truncate, see if we can shrink it based // on the known demanded bits. if (Src.getNode()->hasOneUse()) { @@ -1595,9 +1855,7 @@ bool TargetLowering::SimplifyDemandedBits( // Bitcast from a vector using SimplifyDemanded Bits/VectorElts. // Demand the elt/bit if any of the original elts/bits are demanded. // TODO - bigendian once we have test coverage. - // TODO - bool vectors once SimplifyDemandedVectorElts has SETCC support. - if (SrcVT.isVector() && NumSrcEltBits > 1 && - (BitWidth % NumSrcEltBits) == 0 && + if (SrcVT.isVector() && (BitWidth % NumSrcEltBits) == 0 && TLO.DAG.getDataLayout().isLittleEndian()) { unsigned Scale = BitWidth / NumSrcEltBits; unsigned NumSrcElts = SrcVT.getVectorNumElements(); @@ -1663,6 +1921,7 @@ bool TargetLowering::SimplifyDemandedBits( // Add, Sub, and Mul don't demand any bits in positions beyond that // of the highest bit demanded of them. SDValue Op0 = Op.getOperand(0), Op1 = Op.getOperand(1); + SDNodeFlags Flags = Op.getNode()->getFlags(); unsigned DemandedBitsLZ = DemandedBits.countLeadingZeros(); APInt LoMask = APInt::getLowBitsSet(BitWidth, BitWidth - DemandedBitsLZ); if (SimplifyDemandedBits(Op0, LoMask, DemandedElts, Known2, TLO, @@ -1671,7 +1930,6 @@ bool TargetLowering::SimplifyDemandedBits( Depth + 1) || // See if the operation should be performed at a smaller bit width. ShrinkDemandedOp(Op, BitWidth, DemandedBits, TLO)) { - SDNodeFlags Flags = Op.getNode()->getFlags(); if (Flags.hasNoSignedWrap() || Flags.hasNoUnsignedWrap()) { // Disable the nsw and nuw flags. We can no longer guarantee that we // won't wrap after simplification. @@ -1684,6 +1942,23 @@ bool TargetLowering::SimplifyDemandedBits( return true; } + // Attempt to avoid multi-use ops if we don't need anything from them. + if (!LoMask.isAllOnesValue() || !DemandedElts.isAllOnesValue()) { + SDValue DemandedOp0 = SimplifyMultipleUseDemandedBits( + Op0, LoMask, DemandedElts, TLO.DAG, Depth + 1); + SDValue DemandedOp1 = SimplifyMultipleUseDemandedBits( + Op1, LoMask, DemandedElts, TLO.DAG, Depth + 1); + if (DemandedOp0 || DemandedOp1) { + Flags.setNoSignedWrap(false); + Flags.setNoUnsignedWrap(false); + Op0 = DemandedOp0 ? DemandedOp0 : Op0; + Op1 = DemandedOp1 ? DemandedOp1 : Op1; + SDValue NewOp = + TLO.DAG.getNode(Op.getOpcode(), dl, VT, Op0, Op1, Flags); + return TLO.CombineTo(Op, NewOp); + } + } + // If we have a constant operand, we may be able to turn it into -1 if we // do not demand the high bits. This can make the constant smaller to // encode, allow more general folding, or match specialized instruction @@ -1694,10 +1969,8 @@ bool TargetLowering::SimplifyDemandedBits( if (C && !C->isAllOnesValue() && !C->isOne() && (C->getAPIntValue() | HighMask).isAllOnesValue()) { SDValue Neg1 = TLO.DAG.getAllOnesConstant(dl, VT); - // We can't guarantee that the new math op doesn't wrap, so explicitly - // clear those flags to prevent folding with a potential existing node - // that has those flags set. - SDNodeFlags Flags; + // Disable the nsw and nuw flags. We can no longer guarantee that we + // won't wrap after simplification. Flags.setNoSignedWrap(false); Flags.setNoUnsignedWrap(false); SDValue NewOp = TLO.DAG.getNode(Op.getOpcode(), dl, VT, Op0, Neg1, Flags); @@ -1837,7 +2110,7 @@ bool TargetLowering::SimplifyDemandedVectorElts( } // Limit search depth. 
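Editor's note: the ADD/SUB/MUL handling earlier in this hunk relies on the fact that bit k of a sum, difference, or product never depends on operand bits above k, which is why only a LoMask of each operand is demanded. A quick standalone check of that property modulo 2^8; the junk constants are arbitrary.

#include <cassert>
#include <cstdint>

int main() {
  // Changing operand bits above bit 7 never changes the low 8 bits of
  // a + b, a - b, or a * b.
  for (uint32_t A = 0; A < 256; ++A)
    for (uint32_t B = 0; B < 256; ++B) {
      uint32_t JunkA = A | 0xABCD0000u; // same low byte, different high bits
      uint32_t JunkB = B | 0x12340000u;
      assert(((A + B) & 0xFF) == ((JunkA + JunkB) & 0xFF));
      assert(((A - B) & 0xFF) == ((JunkA - JunkB) & 0xFF));
      assert(((A * B) & 0xFF) == ((JunkA * JunkB) & 0xFF));
    }
  return 0;
}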
- if (Depth >= 6) + if (Depth >= SelectionDAG::MaxRecursionDepth) return false; SDLoc DL(Op); @@ -2001,6 +2274,15 @@ bool TargetLowering::SimplifyDemandedVectorElts( return true; APInt BaseElts = DemandedElts; BaseElts.insertBits(APInt::getNullValue(NumSubElts), SubIdx); + + // If none of the base operand elements are demanded, replace it with undef. + if (!BaseElts && !Base.isUndef()) + return TLO.CombineTo(Op, + TLO.DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, + TLO.DAG.getUNDEF(VT), + Op.getOperand(1), + Op.getOperand(2))); + if (SimplifyDemandedVectorElts(Base, BaseElts, KnownUndef, KnownZero, TLO, Depth + 1)) return true; @@ -2134,11 +2416,13 @@ bool TargetLowering::SimplifyDemandedVectorElts( // Update legal shuffle masks based on demanded elements if it won't reduce // to Identity which can cause premature removal of the shuffle mask. - if (Updated && !IdentityLHS && !IdentityRHS && !TLO.LegalOps && - isShuffleMaskLegal(NewMask, VT)) - return TLO.CombineTo(Op, - TLO.DAG.getVectorShuffle(VT, DL, Op.getOperand(0), - Op.getOperand(1), NewMask)); + if (Updated && !IdentityLHS && !IdentityRHS && !TLO.LegalOps) { + SDValue LegalShuffle = + buildLegalVectorShuffle(VT, DL, Op.getOperand(0), Op.getOperand(1), + NewMask, TLO.DAG); + if (LegalShuffle) + return TLO.CombineTo(Op, LegalShuffle); + } // Propagate undef/zero elements from LHS/RHS. for (unsigned i = 0; i != NumElts; ++i) { @@ -2304,6 +2588,13 @@ void TargetLowering::computeKnownBitsForTargetNode(const SDValue Op, Known.resetAll(); } +void TargetLowering::computeKnownBitsForTargetInstr( + GISelKnownBits &Analysis, Register R, KnownBits &Known, + const APInt &DemandedElts, const MachineRegisterInfo &MRI, + unsigned Depth) const { + Known.resetAll(); +} + void TargetLowering::computeKnownBitsForFrameIndex(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, @@ -2357,6 +2648,36 @@ bool TargetLowering::SimplifyDemandedBitsForTargetNode( return false; } +SDValue TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode( + SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, + SelectionDAG &DAG, unsigned Depth) const { + assert( + (Op.getOpcode() >= ISD::BUILTIN_OP_END || + Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN || + Op.getOpcode() == ISD::INTRINSIC_W_CHAIN || + Op.getOpcode() == ISD::INTRINSIC_VOID) && + "Should use SimplifyMultipleUseDemandedBits if you don't know whether Op" + " is a target node!"); + return SDValue(); +} + +SDValue +TargetLowering::buildLegalVectorShuffle(EVT VT, const SDLoc &DL, SDValue N0, + SDValue N1, MutableArrayRef Mask, + SelectionDAG &DAG) const { + bool LegalMask = isShuffleMaskLegal(Mask, VT); + if (!LegalMask) { + std::swap(N0, N1); + ShuffleVectorSDNode::commuteMask(Mask); + LegalMask = isShuffleMaskLegal(Mask, VT); + } + + if (!LegalMask) + return SDValue(); + + return DAG.getVectorShuffle(VT, DL, N0, N1, Mask); +} + const Constant *TargetLowering::getTargetConstantFromLoad(LoadSDNode*) const { return nullptr; } @@ -2610,6 +2931,77 @@ SDValue TargetLowering::optimizeSetCCOfSignedTruncationCheck( return T2; } +// (X & (C l>>/<< Y)) ==/!= 0 --> ((X <> Y) & C) ==/!= 0 +SDValue TargetLowering::optimizeSetCCByHoistingAndByConstFromLogicalShift( + EVT SCCVT, SDValue N0, SDValue N1C, ISD::CondCode Cond, + DAGCombinerInfo &DCI, const SDLoc &DL) const { + assert(isConstOrConstSplat(N1C) && + isConstOrConstSplat(N1C)->getAPIntValue().isNullValue() && + "Should be a comparison with 0."); + assert((Cond == ISD::SETEQ || Cond == ISD::SETNE) && + "Valid only for [in]equality comparisons."); + 
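Editor's note: the transform implemented by this new helper rests on the identity that testing X against a shifted constant mask equals shifting X the opposite way and testing against the unshifted constant: (X & (C << Y)) == 0 iff ((X >> Y) & C) == 0 for logical shifts, and the mirror image for a right-shifted constant. A brute-force standalone check over 8-bit values.

#include <cassert>
#include <cstdint>

int main() {
  for (uint32_t X = 0; X < 256; ++X)
    for (uint32_t C = 0; C < 256; ++C)
      for (uint32_t Y = 0; Y < 8; ++Y) {
        // Work in 8 bits to mirror a fixed-width integer type.
        bool ShlForm = (X & ((C << Y) & 0xFF)) == 0;
        bool SrlForm = (((X >> Y) & C) & 0xFF) == 0;
        assert(ShlForm == SrlForm);

        bool SrlConstForm = (X & (C >> Y)) == 0;
        bool ShlValForm = (((X << Y) & 0xFF) & C) == 0;
        assert(SrlConstForm == ShlValForm);
      }
  return 0;
}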
+ unsigned NewShiftOpcode; + SDValue X, C, Y; + + SelectionDAG &DAG = DCI.DAG; + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + + // Look for '(C l>>/<< Y)'. + auto Match = [&NewShiftOpcode, &X, &C, &Y, &TLI, &DAG](SDValue V) { + // The shift should be one-use. + if (!V.hasOneUse()) + return false; + unsigned OldShiftOpcode = V.getOpcode(); + switch (OldShiftOpcode) { + case ISD::SHL: + NewShiftOpcode = ISD::SRL; + break; + case ISD::SRL: + NewShiftOpcode = ISD::SHL; + break; + default: + return false; // must be a logical shift. + } + // We should be shifting a constant. + // FIXME: best to use isConstantOrConstantVector(). + C = V.getOperand(0); + ConstantSDNode *CC = + isConstOrConstSplat(C, /*AllowUndefs=*/true, /*AllowTruncation=*/true); + if (!CC) + return false; + Y = V.getOperand(1); + + ConstantSDNode *XC = + isConstOrConstSplat(X, /*AllowUndefs=*/true, /*AllowTruncation=*/true); + return TLI.shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd( + X, XC, CC, Y, OldShiftOpcode, NewShiftOpcode, DAG); + }; + + // LHS of comparison should be an one-use 'and'. + if (N0.getOpcode() != ISD::AND || !N0.hasOneUse()) + return SDValue(); + + X = N0.getOperand(0); + SDValue Mask = N0.getOperand(1); + + // 'and' is commutative! + if (!Match(Mask)) { + std::swap(X, Mask); + if (!Match(Mask)) + return SDValue(); + } + + EVT VT = X.getValueType(); + + // Produce: + // ((X 'OppositeShiftOpcode' Y) & C) Cond 0 + SDValue T0 = DAG.getNode(NewShiftOpcode, DL, VT, X, Y); + SDValue T1 = DAG.getNode(ISD::AND, DL, VT, T0, C); + SDValue T2 = DAG.getSetCC(DL, SCCVT, T1, N1C, Cond); + return T2; +} + /// Try to fold an equality comparison with a {add/sub/xor} binary operation as /// the 1st operand (N0). Callers are expected to swap the N0/N1 parameters to /// handle the commuted versions of these patterns. @@ -2726,9 +3118,9 @@ SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1, // (ctpop x) u< 2 -> (x & x-1) == 0 // (ctpop x) u> 1 -> (x & x-1) != 0 if ((Cond == ISD::SETULT && C1 == 2) || (Cond == ISD::SETUGT && C1 == 1)){ - SDValue Sub = DAG.getNode(ISD::SUB, dl, CTVT, CTOp, - DAG.getConstant(1, dl, CTVT)); - SDValue And = DAG.getNode(ISD::AND, dl, CTVT, CTOp, Sub); + SDValue NegOne = DAG.getAllOnesConstant(dl, CTVT); + SDValue Add = DAG.getNode(ISD::ADD, dl, CTVT, CTOp, NegOne); + SDValue And = DAG.getNode(ISD::AND, dl, CTVT, CTOp, Add); ISD::CondCode CC = Cond == ISD::SETULT ? ISD::SETEQ : ISD::SETNE; return DAG.getSetCC(dl, VT, And, DAG.getConstant(0, dl, CTVT), CC); } @@ -2852,7 +3244,7 @@ SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1, LoadSDNode *Lod = cast(N0.getOperand(0)); APInt bestMask; unsigned bestWidth = 0, bestOffset = 0; - if (!Lod->isVolatile() && Lod->isUnindexed()) { + if (Lod->isSimple() && Lod->isUnindexed()) { unsigned origWidth = N0.getValueSizeInBits(); unsigned maskWidth = origWidth; // We can narrow (e.g.) 16-bit extending loads on 32-bit target to @@ -3178,6 +3570,14 @@ SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1, } } + if (Cond == ISD::SETEQ || Cond == ISD::SETNE) { + // (X & (C l>>/<< Y)) ==/!= 0 --> ((X <> Y) & C) ==/!= 0 + if (C1.isNullValue()) + if (SDValue CC = optimizeSetCCByHoistingAndByConstFromLogicalShift( + VT, N0, N1, Cond, DCI, dl)) + return CC; + } + // If we have "setcc X, C0", check to see if we can shrink the immediate // by changing cc. // TODO: Support this for vectors after legalize ops. 
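Editor's note: for the (ctpop x) u< 2 rewrite above, the underlying identity is that x has at most one bit set exactly when x & (x - 1) is zero (the subtraction clears the lowest set bit); the hunk merely expresses the subtraction as an add of the all-ones constant. A standalone check:

#include <cassert>
#include <cstdint>

static unsigned popcount32(uint32_t X) {
  unsigned N = 0;
  for (; X; X &= X - 1) // clear the lowest set bit each iteration
    ++N;
  return N;
}

int main() {
  for (uint32_t X = 0; X <= 0xFFFF; ++X) {
    bool AtMostOneBit = popcount32(X) < 2;          // (ctpop x) u< 2
    bool MaskForm = (X & (X + 0xFFFFFFFFu)) == 0;   // x & (x + (-1)) == x & (x - 1)
    assert(AtMostOneBit == MaskForm);
  }
  return 0;
}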
@@ -3203,33 +3603,35 @@ SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1, // Back to non-vector simplifications. // TODO: Can we do these for vector splats? if (auto *N1C = dyn_cast(N1.getNode())) { + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); const APInt &C1 = N1C->getAPIntValue(); + EVT ShValTy = N0.getValueType(); // Fold bit comparisons when we can. if ((Cond == ISD::SETEQ || Cond == ISD::SETNE) && - (VT == N0.getValueType() || - (isTypeLegal(VT) && VT.bitsLE(N0.getValueType()))) && + (VT == ShValTy || (isTypeLegal(VT) && VT.bitsLE(ShValTy))) && N0.getOpcode() == ISD::AND) { auto &DL = DAG.getDataLayout(); if (auto *AndRHS = dyn_cast(N0.getOperand(1))) { - EVT ShiftTy = getShiftAmountTy(N0.getValueType(), DL, - !DCI.isBeforeLegalize()); + EVT ShiftTy = getShiftAmountTy(ShValTy, DL, !DCI.isBeforeLegalize()); if (Cond == ISD::SETNE && C1 == 0) {// (X & 8) != 0 --> (X & 8) >> 3 // Perform the xform if the AND RHS is a single bit. - if (AndRHS->getAPIntValue().isPowerOf2()) { + unsigned ShCt = AndRHS->getAPIntValue().logBase2(); + if (AndRHS->getAPIntValue().isPowerOf2() && + ShCt <= TLI.getShiftAmountThreshold(ShValTy)) { return DAG.getNode(ISD::TRUNCATE, dl, VT, - DAG.getNode(ISD::SRL, dl, N0.getValueType(), N0, - DAG.getConstant(AndRHS->getAPIntValue().logBase2(), dl, - ShiftTy))); + DAG.getNode(ISD::SRL, dl, ShValTy, N0, + DAG.getConstant(ShCt, dl, ShiftTy))); } } else if (Cond == ISD::SETEQ && C1 == AndRHS->getAPIntValue()) { // (X & 8) == 8 --> (X & 8) >> 3 // Perform the xform if C1 is a single bit. - if (C1.isPowerOf2()) { + unsigned ShCt = C1.logBase2(); + if (C1.isPowerOf2() && + ShCt <= TLI.getShiftAmountThreshold(ShValTy)) { return DAG.getNode(ISD::TRUNCATE, dl, VT, - DAG.getNode(ISD::SRL, dl, N0.getValueType(), N0, - DAG.getConstant(C1.logBase2(), dl, - ShiftTy))); + DAG.getNode(ISD::SRL, dl, ShValTy, N0, + DAG.getConstant(ShCt, dl, ShiftTy))); } } } @@ -3452,15 +3854,21 @@ SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1, } // Fold remainder of division by a constant. - if (N0.getOpcode() == ISD::UREM && N0.hasOneUse() && - (Cond == ISD::SETEQ || Cond == ISD::SETNE)) { + if ((N0.getOpcode() == ISD::UREM || N0.getOpcode() == ISD::SREM) && + N0.hasOneUse() && (Cond == ISD::SETEQ || Cond == ISD::SETNE)) { AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes(); // When division is cheap or optimizing for minimum size, // fall through to DIVREM creation by skipping this fold. - if (!isIntDivCheap(VT, Attr) && !Attr.hasFnAttribute(Attribute::MinSize)) - if (SDValue Folded = buildUREMEqFold(VT, N0, N1, Cond, DCI, dl)) - return Folded; + if (!isIntDivCheap(VT, Attr) && !Attr.hasFnAttribute(Attribute::MinSize)) { + if (N0.getOpcode() == ISD::UREM) { + if (SDValue Folded = buildUREMEqFold(VT, N0, N1, Cond, DCI, dl)) + return Folded; + } else if (N0.getOpcode() == ISD::SREM) { + if (SDValue Folded = buildSREMEqFold(VT, N0, N1, Cond, DCI, dl)) + return Folded; + } + } } // Fold away ALL boolean setcc's. 
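Editor's note: the urem-by-constant equality fold dispatched just above (and generalized to non-uniform vectors further down) follows Hacker's Delight 10-17: for D = D0 * 2^K with D0 odd, N % D == 0 exactly when rotr(N * P, K) <= Q, where P is the multiplicative inverse of D0 modulo 2^W and Q = floor((2^W - 1) / D). A standalone check for W = 32 and D = 6, so K = 1, D0 = 3, P = 0xAAAAAAAB, Q = 0x2AAAAAAA; the divisor and loop bound are illustrative.

#include <cassert>
#include <cstdint>

static uint32_t rotr32(uint32_t V, unsigned K) {
  return K == 0 ? V : (V >> K) | (V << (32 - K));
}

int main() {
  const uint32_t D = 6;               // D0 = 3, K = 1
  const unsigned K = 1;
  const uint32_t P = 0xAAAAAAABu;     // inverse of 3 modulo 2^32: 3 * P wraps to 1
  const uint32_t Q = 0xFFFFFFFFu / D; // floor((2^32 - 1) / 6) == 0x2AAAAAAA

  assert(3u * P == 1u);

  for (uint32_t N = 0; N <= 3000000u; ++N) {
    bool RemIsZero = (N % D) == 0;            // (seteq (urem N, 6), 0)
    bool FoldedForm = rotr32(N * P, K) <= Q;  // (setule (rotr (mul N, P), K), Q)
    assert(RemIsZero == FoldedForm);
  }
  return 0;
}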
@@ -3567,15 +3975,17 @@ TargetLowering::getConstraintType(StringRef Constraint) const { if (S == 1) { switch (Constraint[0]) { default: break; - case 'r': return C_RegisterClass; + case 'r': + return C_RegisterClass; case 'm': // memory case 'o': // offsetable case 'V': // not offsetable return C_Memory; - case 'i': // Simple Integer or Relocatable Constant case 'n': // Simple Integer case 'E': // Floating Point Constant case 'F': // Floating Point Constant + return C_Immediate; + case 'i': // Simple Integer or Relocatable Constant case 's': // Relocatable Constant case 'p': // Address. case 'X': // Allow ANY value. @@ -3950,6 +4360,7 @@ TargetLowering::ParseConstraints(const DataLayout &DL, /// Return an integer indicating how general CT is. static unsigned getConstraintGenerality(TargetLowering::ConstraintType CT) { switch (CT) { + case TargetLowering::C_Immediate: case TargetLowering::C_Other: case TargetLowering::C_Unknown: return 0; @@ -4069,11 +4480,12 @@ static void ChooseConstraint(TargetLowering::AsmOperandInfo &OpInfo, TargetLowering::ConstraintType CType = TLI.getConstraintType(OpInfo.Codes[i]); - // If this is an 'other' constraint, see if the operand is valid for it. - // For example, on X86 we might have an 'rI' constraint. If the operand - // is an integer in the range [0..31] we want to use I (saving a load - // of a register), otherwise we must use 'r'. - if (CType == TargetLowering::C_Other && Op.getNode()) { + // If this is an 'other' or 'immediate' constraint, see if the operand is + // valid for it. For example, on X86 we might have an 'rI' constraint. If + // the operand is an integer in the range [0..31] we want to use I (saving a + // load of a register), otherwise we must use 'r'. + if ((CType == TargetLowering::C_Other || + CType == TargetLowering::C_Immediate) && Op.getNode()) { assert(OpInfo.Codes[i].size() == 1 && "Unhandled multi-letter 'other' constraint"); std::vector ResultOps; @@ -4455,6 +4867,34 @@ SDValue TargetLowering::BuildUDIV(SDNode *N, SelectionDAG &DAG, return DAG.getSelect(dl, VT, IsOne, N0, Q); } +/// If all values in Values that *don't* match the predicate are same 'splat' +/// value, then replace all values with that splat value. +/// Else, if AlternativeReplacement was provided, then replace all values that +/// do match predicate with AlternativeReplacement value. +static void +turnVectorIntoSplatVector(MutableArrayRef Values, + std::function Predicate, + SDValue AlternativeReplacement = SDValue()) { + SDValue Replacement; + // Is there a value for which the Predicate does *NOT* match? What is it? + auto SplatValue = llvm::find_if_not(Values, Predicate); + if (SplatValue != Values.end()) { + // Does Values consist only of SplatValue's and values matching Predicate? + if (llvm::all_of(Values, [Predicate, SplatValue](SDValue Value) { + return Value == *SplatValue || Predicate(Value); + })) // Then we shall replace values matching predicate with SplatValue. + Replacement = *SplatValue; + } + if (!Replacement) { + // Oops, we did not find the "baseline" splat value. + if (!AlternativeReplacement) + return; // Nothing to do. + // Let's replace with provided value then. 
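Editor's note: a plain-STL analogue of the splat-normalization helper being introduced in this hunk, to make the find_if_not / all_of / replace_if flow concrete. It operates on ints instead of SDValues and is purely illustrative.

#include <algorithm>
#include <cassert>
#include <vector>

// If every element that does NOT satisfy Pred is the same "splat" value,
// overwrite the Pred-matching elements with that splat value; otherwise, if a
// fallback was supplied, overwrite them with the fallback.
static void splatDontCares(std::vector<int> &Values, bool (*Pred)(int),
                           int Fallback, bool HasFallback) {
  int Replacement = 0;
  bool HaveSplat = false;
  auto It = std::find_if_not(Values.begin(), Values.end(), Pred);
  if (It != Values.end() &&
      std::all_of(Values.begin(), Values.end(),
                  [&](int V) { return V == *It || Pred(V); })) {
    Replacement = *It;
    HaveSplat = true;
  }
  if (!HaveSplat) {
    if (!HasFallback)
      return; // no consistent splat value and nothing to fall back on
    Replacement = Fallback;
  }
  std::replace_if(Values.begin(), Values.end(), Pred, Replacement);
}

int main() {
  std::vector<int> V = {7, 0, 7, 0}; // 0 marks a "don't care" lane
  splatDontCares(V, [](int X) { return X == 0; }, /*Fallback=*/-1,
                 /*HasFallback=*/false);
  assert((V == std::vector<int>{7, 7, 7, 7})); // don't-care lanes splatted to 7
  return 0;
}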
+ Replacement = AlternativeReplacement; + } + std::replace_if(Values.begin(), Values.end(), Predicate, Replacement); +} + /// Given an ISD::UREM used only by an ISD::SETEQ or ISD::SETNE /// where the divisor is constant and the comparison target is zero, /// return a DAG expression that will generate the same comparison result @@ -4482,77 +4922,409 @@ TargetLowering::prepareUREMEqFold(EVT SETCCVT, SDValue REMNode, DAGCombinerInfo &DCI, const SDLoc &DL, SmallVectorImpl &Created) const { // fold (seteq/ne (urem N, D), 0) -> (setule/ugt (rotr (mul N, P), K), Q) - // - D must be constant with D = D0 * 2^K where D0 is odd and D0 != 1 + // - D must be constant, with D = D0 * 2^K where D0 is odd // - P is the multiplicative inverse of D0 modulo 2^W - // - Q = floor((2^W - 1) / D0) + // - Q = floor(((2^W) - 1) / D) // where W is the width of the common type of N and D. assert((Cond == ISD::SETEQ || Cond == ISD::SETNE) && "Only applicable for (in)equality comparisons."); + SelectionDAG &DAG = DCI.DAG; + EVT VT = REMNode.getValueType(); + EVT SVT = VT.getScalarType(); + EVT ShVT = getShiftAmountTy(VT, DAG.getDataLayout()); + EVT ShSVT = ShVT.getScalarType(); // If MUL is unavailable, we cannot proceed in any case. if (!isOperationLegalOrCustom(ISD::MUL, VT)) return SDValue(); - // TODO: Add non-uniform constant support. - ConstantSDNode *Divisor = isConstOrConstSplat(REMNode->getOperand(1)); + // TODO: Could support comparing with non-zero too. ConstantSDNode *CompTarget = isConstOrConstSplat(CompTargetNode); - if (!Divisor || !CompTarget || Divisor->isNullValue() || - !CompTarget->isNullValue()) + if (!CompTarget || !CompTarget->isNullValue()) return SDValue(); - const APInt &D = Divisor->getAPIntValue(); + bool HadOneDivisor = false; + bool AllDivisorsAreOnes = true; + bool HadEvenDivisor = false; + bool AllDivisorsArePowerOfTwo = true; + SmallVector PAmts, KAmts, QAmts; + + auto BuildUREMPattern = [&](ConstantSDNode *C) { + // Division by 0 is UB. Leave it to be constant-folded elsewhere. + if (C->isNullValue()) + return false; + + const APInt &D = C->getAPIntValue(); + // If all divisors are ones, we will prefer to avoid the fold. + HadOneDivisor |= D.isOneValue(); + AllDivisorsAreOnes &= D.isOneValue(); + + // Decompose D into D0 * 2^K + unsigned K = D.countTrailingZeros(); + assert((!D.isOneValue() || (K == 0)) && "For divisor '1' we won't rotate."); + APInt D0 = D.lshr(K); + + // D is even if it has trailing zeros. + HadEvenDivisor |= (K != 0); + // D is a power-of-two if D0 is one. + // If all divisors are power-of-two, we will prefer to avoid the fold. + AllDivisorsArePowerOfTwo &= D0.isOneValue(); + + // P = inv(D0, 2^W) + // 2^W requires W + 1 bits, so we have to extend and then truncate. + unsigned W = D.getBitWidth(); + APInt P = D0.zext(W + 1) + .multiplicativeInverse(APInt::getSignedMinValue(W + 1)) + .trunc(W); + assert(!P.isNullValue() && "No multiplicative inverse!"); // unreachable + assert((D0 * P).isOneValue() && "Multiplicative inverse sanity check."); + + // Q = floor((2^W - 1) / D) + APInt Q = APInt::getAllOnesValue(W).udiv(D); + + assert(APInt::getAllOnesValue(ShSVT.getSizeInBits()).ugt(K) && + "We are expecting that K is always less than all-ones for ShSVT"); + + // If the divisor is 1 the result can be constant-folded. + if (D.isOneValue()) { + // Set P and K amount to a bogus values so we can try to splat them. 
+ P = 0; + K = -1; + assert(Q.isAllOnesValue() && + "Expecting all-ones comparison for one divisor"); + } + + PAmts.push_back(DAG.getConstant(P, DL, SVT)); + KAmts.push_back( + DAG.getConstant(APInt(ShSVT.getSizeInBits(), K), DL, ShSVT)); + QAmts.push_back(DAG.getConstant(Q, DL, SVT)); + return true; + }; + + SDValue N = REMNode.getOperand(0); + SDValue D = REMNode.getOperand(1); - // Decompose D into D0 * 2^K - unsigned K = D.countTrailingZeros(); - bool DivisorIsEven = (K != 0); - APInt D0 = D.lshr(K); + // Collect the values from each element. + if (!ISD::matchUnaryPredicate(D, BuildUREMPattern)) + return SDValue(); - // The fold is invalid when D0 == 1. - // This is reachable because visitSetCC happens before visitREM. - if (D0.isOneValue()) + // If this is a urem by a one, avoid the fold since it can be constant-folded. + if (AllDivisorsAreOnes) return SDValue(); - // P = inv(D0, 2^W) - // 2^W requires W + 1 bits, so we have to extend and then truncate. - unsigned W = D.getBitWidth(); - APInt P = D0.zext(W + 1) - .multiplicativeInverse(APInt::getSignedMinValue(W + 1)) - .trunc(W); - assert(!P.isNullValue() && "No multiplicative inverse!"); // unreachable - assert((D0 * P).isOneValue() && "Multiplicative inverse sanity check."); + // If this is a urem by a powers-of-two, avoid the fold since it can be + // best implemented as a bit test. + if (AllDivisorsArePowerOfTwo) + return SDValue(); - // Q = floor((2^W - 1) / D) - APInt Q = APInt::getAllOnesValue(W).udiv(D); + SDValue PVal, KVal, QVal; + if (VT.isVector()) { + if (HadOneDivisor) { + // Try to turn PAmts into a splat, since we don't care about the values + // that are currently '0'. If we can't, just keep '0'`s. + turnVectorIntoSplatVector(PAmts, isNullConstant); + // Try to turn KAmts into a splat, since we don't care about the values + // that are currently '-1'. If we can't, change them to '0'`s. + turnVectorIntoSplatVector(KAmts, isAllOnesConstant, + DAG.getConstant(0, DL, ShSVT)); + } - SelectionDAG &DAG = DCI.DAG; + PVal = DAG.getBuildVector(VT, DL, PAmts); + KVal = DAG.getBuildVector(ShVT, DL, KAmts); + QVal = DAG.getBuildVector(VT, DL, QAmts); + } else { + PVal = PAmts[0]; + KVal = KAmts[0]; + QVal = QAmts[0]; + } - SDValue PVal = DAG.getConstant(P, DL, VT); - SDValue QVal = DAG.getConstant(Q, DL, VT); // (mul N, P) - SDValue Op1 = DAG.getNode(ISD::MUL, DL, VT, REMNode->getOperand(0), PVal); - Created.push_back(Op1.getNode()); + SDValue Op0 = DAG.getNode(ISD::MUL, DL, VT, N, PVal); + Created.push_back(Op0.getNode()); - // Rotate right only if D was even. - if (DivisorIsEven) { + // Rotate right only if any divisor was even. We avoid rotates for all-odd + // divisors as a performance improvement, since rotating by 0 is a no-op. + if (HadEvenDivisor) { // We need ROTR to do this. if (!isOperationLegalOrCustom(ISD::ROTR, VT)) return SDValue(); - SDValue ShAmt = - DAG.getConstant(K, DL, getShiftAmountTy(VT, DAG.getDataLayout())); SDNodeFlags Flags; Flags.setExact(true); // UREM: (rotr (mul N, P), K) - Op1 = DAG.getNode(ISD::ROTR, DL, VT, Op1, ShAmt, Flags); - Created.push_back(Op1.getNode()); + Op0 = DAG.getNode(ISD::ROTR, DL, VT, Op0, KVal, Flags); + Created.push_back(Op0.getNode()); } // UREM: (setule/setugt (rotr (mul N, P), K), Q) - return DAG.getSetCC(DL, SETCCVT, Op1, QVal, + return DAG.getSetCC(DL, SETCCVT, Op0, QVal, ((Cond == ISD::SETEQ) ? 
ISD::SETULE : ISD::SETUGT)); } +/// Given an ISD::SREM used only by an ISD::SETEQ or ISD::SETNE +/// where the divisor is constant and the comparison target is zero, +/// return a DAG expression that will generate the same comparison result +/// using only multiplications, additions and shifts/rotations. +/// Ref: "Hacker's Delight" 10-17. +SDValue TargetLowering::buildSREMEqFold(EVT SETCCVT, SDValue REMNode, + SDValue CompTargetNode, + ISD::CondCode Cond, + DAGCombinerInfo &DCI, + const SDLoc &DL) const { + SmallVector Built; + if (SDValue Folded = prepareSREMEqFold(SETCCVT, REMNode, CompTargetNode, Cond, + DCI, DL, Built)) { + assert(Built.size() <= 7 && "Max size prediction failed."); + for (SDNode *N : Built) + DCI.AddToWorklist(N); + return Folded; + } + + return SDValue(); +} + +SDValue +TargetLowering::prepareSREMEqFold(EVT SETCCVT, SDValue REMNode, + SDValue CompTargetNode, ISD::CondCode Cond, + DAGCombinerInfo &DCI, const SDLoc &DL, + SmallVectorImpl &Created) const { + // Fold: + // (seteq/ne (srem N, D), 0) + // To: + // (setule/ugt (rotr (add (mul N, P), A), K), Q) + // + // - D must be constant, with D = D0 * 2^K where D0 is odd + // - P is the multiplicative inverse of D0 modulo 2^W + // - A = bitwiseand(floor((2^(W - 1) - 1) / D0), (-(2^k))) + // - Q = floor((2 * A) / (2^K)) + // where W is the width of the common type of N and D. + assert((Cond == ISD::SETEQ || Cond == ISD::SETNE) && + "Only applicable for (in)equality comparisons."); + + SelectionDAG &DAG = DCI.DAG; + + EVT VT = REMNode.getValueType(); + EVT SVT = VT.getScalarType(); + EVT ShVT = getShiftAmountTy(VT, DAG.getDataLayout()); + EVT ShSVT = ShVT.getScalarType(); + + // If MUL is unavailable, we cannot proceed in any case. + if (!isOperationLegalOrCustom(ISD::MUL, VT)) + return SDValue(); + + // TODO: Could support comparing with non-zero too. + ConstantSDNode *CompTarget = isConstOrConstSplat(CompTargetNode); + if (!CompTarget || !CompTarget->isNullValue()) + return SDValue(); + + bool HadIntMinDivisor = false; + bool HadOneDivisor = false; + bool AllDivisorsAreOnes = true; + bool HadEvenDivisor = false; + bool NeedToApplyOffset = false; + bool AllDivisorsArePowerOfTwo = true; + SmallVector PAmts, AAmts, KAmts, QAmts; + + auto BuildSREMPattern = [&](ConstantSDNode *C) { + // Division by 0 is UB. Leave it to be constant-folded elsewhere. + if (C->isNullValue()) + return false; + + // FIXME: we don't fold `rem %X, -C` to `rem %X, C` in DAGCombine. + + // WARNING: this fold is only valid for positive divisors! + APInt D = C->getAPIntValue(); + if (D.isNegative()) + D.negate(); // `rem %X, -C` is equivalent to `rem %X, C` + + HadIntMinDivisor |= D.isMinSignedValue(); + + // If all divisors are ones, we will prefer to avoid the fold. + HadOneDivisor |= D.isOneValue(); + AllDivisorsAreOnes &= D.isOneValue(); + + // Decompose D into D0 * 2^K + unsigned K = D.countTrailingZeros(); + assert((!D.isOneValue() || (K == 0)) && "For divisor '1' we won't rotate."); + APInt D0 = D.lshr(K); + + if (!D.isMinSignedValue()) { + // D is even if it has trailing zeros; unless it's INT_MIN, in which case + // we don't care about this lane in this fold, we'll special-handle it. + HadEvenDivisor |= (K != 0); + } + + // D is a power-of-two if D0 is one. This includes INT_MIN. + // If all divisors are power-of-two, we will prefer to avoid the fold. + AllDivisorsArePowerOfTwo &= D0.isOneValue(); + + // P = inv(D0, 2^W) + // 2^W requires W + 1 bits, so we have to extend and then truncate. 
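Editor's note: the inverse P computed at this step can also be obtained without the W+1-bit extend-and-truncate, using Newton-Raphson on the 2-adic inverse (each iteration doubles the number of correct low bits). A standalone sketch for odd 32-bit divisors, offered only to illustrate why inv(D0, 2^W) exists and is cheap to compute; it is not the routine used by APInt.

#include <cassert>
#include <cstdint>

// Multiplicative inverse of an odd D0 modulo 2^32.
static uint32_t inverseMod2_32(uint32_t D0) {
  assert((D0 & 1u) && "only odd values are invertible modulo a power of two");
  uint32_t X = D0;            // correct to 3 bits: D0 * D0 == 1 (mod 8)
  for (int I = 0; I < 5; ++I) // correct bits: 3 -> 6 -> 12 -> 24 -> 48
    X *= 2u - D0 * X;
  return X;
}

int main() {
  for (uint32_t D0 = 1; D0 < 100000; D0 += 2)
    assert(D0 * inverseMod2_32(D0) == 1u);
  assert(inverseMod2_32(3u) == 0xAAAAAAABu); // e.g. the inverse of 3
  return 0;
}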
+ unsigned W = D.getBitWidth(); + APInt P = D0.zext(W + 1) + .multiplicativeInverse(APInt::getSignedMinValue(W + 1)) + .trunc(W); + assert(!P.isNullValue() && "No multiplicative inverse!"); // unreachable + assert((D0 * P).isOneValue() && "Multiplicative inverse sanity check."); + + // A = floor((2^(W - 1) - 1) / D0) & -2^K + APInt A = APInt::getSignedMaxValue(W).udiv(D0); + A.clearLowBits(K); + + if (!D.isMinSignedValue()) { + // If divisor INT_MIN, then we don't care about this lane in this fold, + // we'll special-handle it. + NeedToApplyOffset |= A != 0; + } + + // Q = floor((2 * A) / (2^K)) + APInt Q = (2 * A).udiv(APInt::getOneBitSet(W, K)); + + assert(APInt::getAllOnesValue(SVT.getSizeInBits()).ugt(A) && + "We are expecting that A is always less than all-ones for SVT"); + assert(APInt::getAllOnesValue(ShSVT.getSizeInBits()).ugt(K) && + "We are expecting that K is always less than all-ones for ShSVT"); + + // If the divisor is 1 the result can be constant-folded. Likewise, we + // don't care about INT_MIN lanes, those can be set to undef if appropriate. + if (D.isOneValue()) { + // Set P, A and K to a bogus values so we can try to splat them. + P = 0; + A = -1; + K = -1; + + // x ?% 1 == 0 <--> true <--> x u<= -1 + Q = -1; + } + + PAmts.push_back(DAG.getConstant(P, DL, SVT)); + AAmts.push_back(DAG.getConstant(A, DL, SVT)); + KAmts.push_back( + DAG.getConstant(APInt(ShSVT.getSizeInBits(), K), DL, ShSVT)); + QAmts.push_back(DAG.getConstant(Q, DL, SVT)); + return true; + }; + + SDValue N = REMNode.getOperand(0); + SDValue D = REMNode.getOperand(1); + + // Collect the values from each element. + if (!ISD::matchUnaryPredicate(D, BuildSREMPattern)) + return SDValue(); + + // If this is a srem by a one, avoid the fold since it can be constant-folded. + if (AllDivisorsAreOnes) + return SDValue(); + + // If this is a srem by a powers-of-two (including INT_MIN), avoid the fold + // since it can be best implemented as a bit test. + if (AllDivisorsArePowerOfTwo) + return SDValue(); + + SDValue PVal, AVal, KVal, QVal; + if (VT.isVector()) { + if (HadOneDivisor) { + // Try to turn PAmts into a splat, since we don't care about the values + // that are currently '0'. If we can't, just keep '0'`s. + turnVectorIntoSplatVector(PAmts, isNullConstant); + // Try to turn AAmts into a splat, since we don't care about the + // values that are currently '-1'. If we can't, change them to '0'`s. + turnVectorIntoSplatVector(AAmts, isAllOnesConstant, + DAG.getConstant(0, DL, SVT)); + // Try to turn KAmts into a splat, since we don't care about the values + // that are currently '-1'. If we can't, change them to '0'`s. + turnVectorIntoSplatVector(KAmts, isAllOnesConstant, + DAG.getConstant(0, DL, ShSVT)); + } + + PVal = DAG.getBuildVector(VT, DL, PAmts); + AVal = DAG.getBuildVector(VT, DL, AAmts); + KVal = DAG.getBuildVector(ShVT, DL, KAmts); + QVal = DAG.getBuildVector(VT, DL, QAmts); + } else { + PVal = PAmts[0]; + AVal = AAmts[0]; + KVal = KAmts[0]; + QVal = QAmts[0]; + } + + // (mul N, P) + SDValue Op0 = DAG.getNode(ISD::MUL, DL, VT, N, PVal); + Created.push_back(Op0.getNode()); + + if (NeedToApplyOffset) { + // We need ADD to do this. + if (!isOperationLegalOrCustom(ISD::ADD, VT)) + return SDValue(); + + // (add (mul N, P), A) + Op0 = DAG.getNode(ISD::ADD, DL, VT, Op0, AVal); + Created.push_back(Op0.getNode()); + } + + // Rotate right only if any divisor was even. We avoid rotates for all-odd + // divisors as a performance improvement, since rotating by 0 is a no-op. 
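// ---------------------------------------------------------------------------
// Illustrative aside (editorial sketch, not part of the LLVM patch): for a
// positive divisor D = D0 * 2^K (D0 odd, D != INT_MIN) the SREM fold above
// relies on
//   (N s% D == 0)  <=>  (rotr(N * P + A, K) u<= Q)
// with P = inv(D0, 2^W), A = floor((2^(W-1) - 1) / D0) & -2^K and
// Q = floor(2 * A / 2^K), all in W-bit arithmetic, plus the separate identity
//   (N s% INT_MIN == 0)  <=>  ((N & INT_MAX) == 0)
// used for the INT_MIN lanes that the VSELECT blend further below fixes up.
// The standalone check below uses W = 8 and the sample divisor D = 6; the
// constants (P = 171, A = 42, K = 1, Q = 42) are hand-derived for this
// example, not taken from the patch.
#include <cassert>
#include <cstdint>

static uint8_t rotr8(uint8_t V, unsigned K) {
  return K ? (uint8_t)((V >> K) | (V << (8 - K))) : V;
}

int main() {
  const int D = 6;        // D0 = 3, K = 1
  const uint8_t P = 171;  // inv(3, 2^8)
  const uint8_t A = 42;   // floor(127 / 3) & ~1
  const unsigned K = 1;
  const uint8_t Q = 42;   // floor(2 * A / 2^K)
  for (int N = -128; N <= 127; ++N) {
    bool Rem = (N % D) == 0;
    uint8_t MulAdd = (uint8_t)((uint8_t)N * P + A);
    bool Fold = rotr8(MulAdd, K) <= Q;
    assert(Rem == Fold && "SREM-by-constant equality identity");
    // INT_MIN divisors bypass the rotate-based fold entirely:
    assert(((N % -128) == 0) == ((N & 127) == 0));
  }
  return 0;
}
// ---------------------------------------------------------------------------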
+ if (HadEvenDivisor) { + // We need ROTR to do this. + if (!isOperationLegalOrCustom(ISD::ROTR, VT)) + return SDValue(); + SDNodeFlags Flags; + Flags.setExact(true); + // SREM: (rotr (add (mul N, P), A), K) + Op0 = DAG.getNode(ISD::ROTR, DL, VT, Op0, KVal, Flags); + Created.push_back(Op0.getNode()); + } + + // SREM: (setule/setugt (rotr (add (mul N, P), A), K), Q) + SDValue Fold = + DAG.getSetCC(DL, SETCCVT, Op0, QVal, + ((Cond == ISD::SETEQ) ? ISD::SETULE : ISD::SETUGT)); + + // If we didn't have lanes with INT_MIN divisor, then we're done. + if (!HadIntMinDivisor) + return Fold; + + // That fold is only valid for positive divisors. Which effectively means, + // it is invalid for INT_MIN divisors. So if we have such a lane, + // we must fix-up results for said lanes. + assert(VT.isVector() && "Can/should only get here for vectors."); + + if (!isOperationLegalOrCustom(ISD::SETEQ, VT) || + !isOperationLegalOrCustom(ISD::AND, VT) || + !isOperationLegalOrCustom(Cond, VT) || + !isOperationLegalOrCustom(ISD::VSELECT, VT)) + return SDValue(); + + Created.push_back(Fold.getNode()); + + SDValue IntMin = DAG.getConstant( + APInt::getSignedMinValue(SVT.getScalarSizeInBits()), DL, VT); + SDValue IntMax = DAG.getConstant( + APInt::getSignedMaxValue(SVT.getScalarSizeInBits()), DL, VT); + SDValue Zero = + DAG.getConstant(APInt::getNullValue(SVT.getScalarSizeInBits()), DL, VT); + + // Which lanes had INT_MIN divisors? Divisor is constant, so const-folded. + SDValue DivisorIsIntMin = DAG.getSetCC(DL, SETCCVT, D, IntMin, ISD::SETEQ); + Created.push_back(DivisorIsIntMin.getNode()); + + // (N s% INT_MIN) ==/!= 0 <--> (N & INT_MAX) ==/!= 0 + SDValue Masked = DAG.getNode(ISD::AND, DL, VT, N, IntMax); + Created.push_back(Masked.getNode()); + SDValue MaskedIsZero = DAG.getSetCC(DL, SETCCVT, Masked, Zero, Cond); + Created.push_back(MaskedIsZero.getNode()); + + // To produce final result we need to blend 2 vectors: 'SetCC' and + // 'MaskedIsZero'. If the divisor for channel was *NOT* INT_MIN, we pick + // from 'Fold', else pick from 'MaskedIsZero'. Since 'DivisorIsIntMin' is + // constant-folded, select can get lowered to a shuffle with constant mask. + SDValue Blended = + DAG.getNode(ISD::VSELECT, DL, VT, DivisorIsIntMin, MaskedIsZero, Fold); + + return Blended; +} + bool TargetLowering:: verifyReturnAddressArgumentIsConstant(SDValue Op, SelectionDAG &DAG) const { if (!isa(Op.getOperand(0))) { @@ -4564,6 +5336,246 @@ verifyReturnAddressArgumentIsConstant(SDValue Op, SelectionDAG &DAG) const { return false; } +char TargetLowering::isNegatibleForFree(SDValue Op, SelectionDAG &DAG, + bool LegalOperations, bool ForCodeSize, + unsigned Depth) const { + // fneg is removable even if it has multiple uses. + if (Op.getOpcode() == ISD::FNEG) + return 2; + + // Don't allow anything with multiple uses unless we know it is free. + EVT VT = Op.getValueType(); + const SDNodeFlags Flags = Op->getFlags(); + const TargetOptions &Options = DAG.getTarget().Options; + if (!Op.hasOneUse() && !(Op.getOpcode() == ISD::FP_EXTEND && + isFPExtFree(VT, Op.getOperand(0).getValueType()))) + return 0; + + // Don't recurse exponentially. + if (Depth > SelectionDAG::MaxRecursionDepth) + return 0; + + switch (Op.getOpcode()) { + case ISD::ConstantFP: { + if (!LegalOperations) + return 1; + + // Don't invert constant FP values after legalization unless the target says + // the negated constant is legal. 
+ return isOperationLegal(ISD::ConstantFP, VT) || + isFPImmLegal(neg(cast(Op)->getValueAPF()), VT, + ForCodeSize); + } + case ISD::BUILD_VECTOR: { + // Only permit BUILD_VECTOR of constants. + if (llvm::any_of(Op->op_values(), [&](SDValue N) { + return !N.isUndef() && !isa(N); + })) + return 0; + if (!LegalOperations) + return 1; + if (isOperationLegal(ISD::ConstantFP, VT) && + isOperationLegal(ISD::BUILD_VECTOR, VT)) + return 1; + return llvm::all_of(Op->op_values(), [&](SDValue N) { + return N.isUndef() || + isFPImmLegal(neg(cast(N)->getValueAPF()), VT, + ForCodeSize); + }); + } + case ISD::FADD: + if (!Options.NoSignedZerosFPMath && !Flags.hasNoSignedZeros()) + return 0; + + // After operation legalization, it might not be legal to create new FSUBs. + if (LegalOperations && !isOperationLegalOrCustom(ISD::FSUB, VT)) + return 0; + + // fold (fneg (fadd A, B)) -> (fsub (fneg A), B) + if (char V = isNegatibleForFree(Op.getOperand(0), DAG, LegalOperations, + ForCodeSize, Depth + 1)) + return V; + // fold (fneg (fadd A, B)) -> (fsub (fneg B), A) + return isNegatibleForFree(Op.getOperand(1), DAG, LegalOperations, + ForCodeSize, Depth + 1); + case ISD::FSUB: + // We can't turn -(A-B) into B-A when we honor signed zeros. + if (!Options.NoSignedZerosFPMath && !Flags.hasNoSignedZeros()) + return 0; + + // fold (fneg (fsub A, B)) -> (fsub B, A) + return 1; + + case ISD::FMUL: + case ISD::FDIV: + // fold (fneg (fmul X, Y)) -> (fmul (fneg X), Y) or (fmul X, (fneg Y)) + if (char V = isNegatibleForFree(Op.getOperand(0), DAG, LegalOperations, + ForCodeSize, Depth + 1)) + return V; + + // Ignore X * 2.0 because that is expected to be canonicalized to X + X. + if (auto *C = isConstOrConstSplatFP(Op.getOperand(1))) + if (C->isExactlyValue(2.0) && Op.getOpcode() == ISD::FMUL) + return 0; + + return isNegatibleForFree(Op.getOperand(1), DAG, LegalOperations, + ForCodeSize, Depth + 1); + + case ISD::FMA: + case ISD::FMAD: { + if (!Options.NoSignedZerosFPMath && !Flags.hasNoSignedZeros()) + return 0; + + // fold (fneg (fma X, Y, Z)) -> (fma (fneg X), Y, (fneg Z)) + // fold (fneg (fma X, Y, Z)) -> (fma X, (fneg Y), (fneg Z)) + char V2 = isNegatibleForFree(Op.getOperand(2), DAG, LegalOperations, + ForCodeSize, Depth + 1); + if (!V2) + return 0; + + // One of Op0/Op1 must be cheaply negatible, then select the cheapest. + char V0 = isNegatibleForFree(Op.getOperand(0), DAG, LegalOperations, + ForCodeSize, Depth + 1); + char V1 = isNegatibleForFree(Op.getOperand(1), DAG, LegalOperations, + ForCodeSize, Depth + 1); + char V01 = std::max(V0, V1); + return V01 ? std::max(V01, V2) : 0; + } + + case ISD::FP_EXTEND: + case ISD::FP_ROUND: + case ISD::FSIN: + return isNegatibleForFree(Op.getOperand(0), DAG, LegalOperations, + ForCodeSize, Depth + 1); + } + + return 0; +} + +SDValue TargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG, + bool LegalOperations, + bool ForCodeSize, + unsigned Depth) const { + // fneg is removable even if it has multiple uses. 
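// ---------------------------------------------------------------------------
// Illustrative aside (editorial sketch, not part of the LLVM patch):
// isNegatibleForFree above and getNegatedExpression, whose body starts just
// below, push an fneg into an expression using algebraic rewrites such as
//   -(a + b) -> (-a) - b,   -(a - b) -> b - a,
//   -(a * b) -> (-a) * b  or  a * (-b),
//   -fma(x, y, z) -> fma(-x, y, -z).
// For finite inputs these are exact in IEEE arithmetic; the add/sub forms can
// flip the sign of a zero result, which is why the code guards them with the
// no-signed-zeros flag. A minimal standalone sanity check with doubles
// (sample values chosen arbitrarily for the sketch):
#include <cassert>
#include <cmath>

int main() {
  const double a = 1.25, b = -3.5, x = 0.75, y = 2.0, z = -0.125;
  assert(-(a + b) == (-a) - b);
  assert(-(a - b) == b - a);
  assert(-(a * b) == (-a) * b && -(a * b) == a * (-b));
  assert(-std::fma(x, y, z) == std::fma(-x, y, -z));
  // Signed-zero caveat: -(0.0 + -0.0) is -0.0 while (-0.0) - (-0.0) is +0.0,
  // hence the NSZ requirement on the fadd/fsub rewrites.
  assert(std::signbit(-(0.0 + -0.0)) != std::signbit((-0.0) - (-0.0)));
  return 0;
}
// ---------------------------------------------------------------------------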
+ if (Op.getOpcode() == ISD::FNEG) + return Op.getOperand(0); + + assert(Depth <= SelectionDAG::MaxRecursionDepth && + "getNegatedExpression doesn't match isNegatibleForFree"); + const SDNodeFlags Flags = Op->getFlags(); + + switch (Op.getOpcode()) { + case ISD::ConstantFP: { + APFloat V = cast(Op)->getValueAPF(); + V.changeSign(); + return DAG.getConstantFP(V, SDLoc(Op), Op.getValueType()); + } + case ISD::BUILD_VECTOR: { + SmallVector Ops; + for (SDValue C : Op->op_values()) { + if (C.isUndef()) { + Ops.push_back(C); + continue; + } + APFloat V = cast(C)->getValueAPF(); + V.changeSign(); + Ops.push_back(DAG.getConstantFP(V, SDLoc(Op), C.getValueType())); + } + return DAG.getBuildVector(Op.getValueType(), SDLoc(Op), Ops); + } + case ISD::FADD: + assert((DAG.getTarget().Options.NoSignedZerosFPMath || + Flags.hasNoSignedZeros()) && + "Expected NSZ fp-flag"); + + // fold (fneg (fadd A, B)) -> (fsub (fneg A), B) + if (isNegatibleForFree(Op.getOperand(0), DAG, LegalOperations, ForCodeSize, + Depth + 1)) + return DAG.getNode(ISD::FSUB, SDLoc(Op), Op.getValueType(), + getNegatedExpression(Op.getOperand(0), DAG, + LegalOperations, ForCodeSize, + Depth + 1), + Op.getOperand(1), Flags); + // fold (fneg (fadd A, B)) -> (fsub (fneg B), A) + return DAG.getNode(ISD::FSUB, SDLoc(Op), Op.getValueType(), + getNegatedExpression(Op.getOperand(1), DAG, + LegalOperations, ForCodeSize, + Depth + 1), + Op.getOperand(0), Flags); + case ISD::FSUB: + // fold (fneg (fsub 0, B)) -> B + if (ConstantFPSDNode *N0CFP = + isConstOrConstSplatFP(Op.getOperand(0), /*AllowUndefs*/ true)) + if (N0CFP->isZero()) + return Op.getOperand(1); + + // fold (fneg (fsub A, B)) -> (fsub B, A) + return DAG.getNode(ISD::FSUB, SDLoc(Op), Op.getValueType(), + Op.getOperand(1), Op.getOperand(0), Flags); + + case ISD::FMUL: + case ISD::FDIV: + // fold (fneg (fmul X, Y)) -> (fmul (fneg X), Y) + if (isNegatibleForFree(Op.getOperand(0), DAG, LegalOperations, ForCodeSize, + Depth + 1)) + return DAG.getNode(Op.getOpcode(), SDLoc(Op), Op.getValueType(), + getNegatedExpression(Op.getOperand(0), DAG, + LegalOperations, ForCodeSize, + Depth + 1), + Op.getOperand(1), Flags); + + // fold (fneg (fmul X, Y)) -> (fmul X, (fneg Y)) + return DAG.getNode( + Op.getOpcode(), SDLoc(Op), Op.getValueType(), Op.getOperand(0), + getNegatedExpression(Op.getOperand(1), DAG, LegalOperations, + ForCodeSize, Depth + 1), + Flags); + + case ISD::FMA: + case ISD::FMAD: { + assert((DAG.getTarget().Options.NoSignedZerosFPMath || + Flags.hasNoSignedZeros()) && + "Expected NSZ fp-flag"); + + SDValue Neg2 = getNegatedExpression(Op.getOperand(2), DAG, LegalOperations, + ForCodeSize, Depth + 1); + + char V0 = isNegatibleForFree(Op.getOperand(0), DAG, LegalOperations, + ForCodeSize, Depth + 1); + char V1 = isNegatibleForFree(Op.getOperand(1), DAG, LegalOperations, + ForCodeSize, Depth + 1); + if (V0 >= V1) { + // fold (fneg (fma X, Y, Z)) -> (fma (fneg X), Y, (fneg Z)) + SDValue Neg0 = getNegatedExpression( + Op.getOperand(0), DAG, LegalOperations, ForCodeSize, Depth + 1); + return DAG.getNode(Op.getOpcode(), SDLoc(Op), Op.getValueType(), Neg0, + Op.getOperand(1), Neg2, Flags); + } + + // fold (fneg (fma X, Y, Z)) -> (fma X, (fneg Y), (fneg Z)) + SDValue Neg1 = getNegatedExpression(Op.getOperand(1), DAG, LegalOperations, + ForCodeSize, Depth + 1); + return DAG.getNode(Op.getOpcode(), SDLoc(Op), Op.getValueType(), + Op.getOperand(0), Neg1, Neg2, Flags); + } + + case ISD::FP_EXTEND: + case ISD::FSIN: + return DAG.getNode(Op.getOpcode(), SDLoc(Op), Op.getValueType(), + 
getNegatedExpression(Op.getOperand(0), DAG, + LegalOperations, ForCodeSize, + Depth + 1)); + case ISD::FP_ROUND: + return DAG.getNode(ISD::FP_ROUND, SDLoc(Op), Op.getValueType(), + getNegatedExpression(Op.getOperand(0), DAG, + LegalOperations, ForCodeSize, + Depth + 1), + Op.getOperand(1)); + } + + llvm_unreachable("Unknown code"); +} + //===----------------------------------------------------------------------===// // Legalization Utilities //===----------------------------------------------------------------------===// @@ -4862,7 +5874,8 @@ bool TargetLowering::expandROT(SDNode *Node, SDValue &Result, bool TargetLowering::expandFP_TO_SINT(SDNode *Node, SDValue &Result, SelectionDAG &DAG) const { - SDValue Src = Node->getOperand(0); + unsigned OpNo = Node->isStrictFPOpcode() ? 1 : 0; + SDValue Src = Node->getOperand(OpNo); EVT SrcVT = Src.getValueType(); EVT DstVT = Node->getValueType(0); SDLoc dl(SDValue(Node, 0)); @@ -4871,6 +5884,13 @@ bool TargetLowering::expandFP_TO_SINT(SDNode *Node, SDValue &Result, if (SrcVT != MVT::f32 || DstVT != MVT::i64) return false; + if (Node->isStrictFPOpcode()) + // When a NaN is converted to an integer a trap is allowed. We can't + // use this expansion here because it would eliminate that trap. Other + // traps are also allowed and cannot be eliminated. See + // IEEE 754-2008 sec 5.8. + return false; + // Expand f32 -> i64 conversion // This algorithm comes from compiler-rt's implementation of fixsfdi: // https://github.com/llvm/llvm-project/blob/master/compiler-rt/lib/builtins/fixsfdi.c @@ -4924,9 +5944,11 @@ bool TargetLowering::expandFP_TO_SINT(SDNode *Node, SDValue &Result, } bool TargetLowering::expandFP_TO_UINT(SDNode *Node, SDValue &Result, + SDValue &Chain, SelectionDAG &DAG) const { SDLoc dl(SDValue(Node, 0)); - SDValue Src = Node->getOperand(0); + unsigned OpNo = Node->isStrictFPOpcode() ? 1 : 0; + SDValue Src = Node->getOperand(OpNo); EVT SrcVT = Src.getValueType(); EVT DstVT = Node->getValueType(0); @@ -4934,7 +5956,9 @@ bool TargetLowering::expandFP_TO_UINT(SDNode *Node, SDValue &Result, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), SrcVT); // Only expand vector types if we have the appropriate vector bit operations. - if (DstVT.isVector() && (!isOperationLegalOrCustom(ISD::FP_TO_SINT, DstVT) || + unsigned SIntOpcode = Node->isStrictFPOpcode() ? 
ISD::STRICT_FP_TO_SINT : + ISD::FP_TO_SINT; + if (DstVT.isVector() && (!isOperationLegalOrCustom(SIntOpcode, DstVT) || !isOperationLegalOrCustomOrPromote(ISD::XOR, SrcVT))) return false; @@ -4946,14 +5970,21 @@ bool TargetLowering::expandFP_TO_UINT(SDNode *Node, SDValue &Result, APInt SignMask = APInt::getSignMask(DstVT.getScalarSizeInBits()); if (APFloat::opOverflow & APF.convertFromAPInt(SignMask, false, APFloat::rmNearestTiesToEven)) { - Result = DAG.getNode(ISD::FP_TO_SINT, dl, DstVT, Src); + if (Node->isStrictFPOpcode()) { + Result = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, { DstVT, MVT::Other }, + { Node->getOperand(0), Src }); + Chain = Result.getValue(1); + } else + Result = DAG.getNode(ISD::FP_TO_SINT, dl, DstVT, Src); return true; } SDValue Cst = DAG.getConstantFP(APF, dl, SrcVT); SDValue Sel = DAG.getSetCC(dl, SetCCVT, Src, Cst, ISD::SETLT); - bool Strict = shouldUseStrictFP_TO_INT(SrcVT, DstVT, /*IsSigned*/ false); + bool Strict = Node->isStrictFPOpcode() || + shouldUseStrictFP_TO_INT(SrcVT, DstVT, /*IsSigned*/ false); + if (Strict) { // Expand based on maximum range of FP_TO_SINT, if the value exceeds the // signmask then offset (the result of which should be fully representable). @@ -4963,12 +5994,23 @@ bool TargetLowering::expandFP_TO_UINT(SDNode *Node, SDValue &Result, // Result = fp_to_sint(Val) ^ Ofs // TODO: Should any fast-math-flags be set for the FSUB? - SDValue Val = DAG.getSelect(dl, SrcVT, Sel, Src, - DAG.getNode(ISD::FSUB, dl, SrcVT, Src, Cst)); + SDValue SrcBiased; + if (Node->isStrictFPOpcode()) + SrcBiased = DAG.getNode(ISD::STRICT_FSUB, dl, { SrcVT, MVT::Other }, + { Node->getOperand(0), Src, Cst }); + else + SrcBiased = DAG.getNode(ISD::FSUB, dl, SrcVT, Src, Cst); + SDValue Val = DAG.getSelect(dl, SrcVT, Sel, Src, SrcBiased); SDValue Ofs = DAG.getSelect(dl, DstVT, Sel, DAG.getConstant(0, dl, DstVT), DAG.getConstant(SignMask, dl, DstVT)); - Result = DAG.getNode(ISD::XOR, dl, DstVT, - DAG.getNode(ISD::FP_TO_SINT, dl, DstVT, Val), Ofs); + SDValue SInt; + if (Node->isStrictFPOpcode()) { + SInt = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, { DstVT, MVT::Other }, + { SrcBiased.getValue(1), Val }); + Chain = SInt.getValue(1); + } else + SInt = DAG.getNode(ISD::FP_TO_SINT, dl, DstVT, Val); + Result = DAG.getNode(ISD::XOR, dl, DstVT, SInt, Ofs); } else { // Expand based on maximum range of FP_TO_SINT: // True = fp_to_sint(Src) @@ -5918,7 +6960,8 @@ SDValue TargetLowering::expandFixedPointMul(SDNode *Node, SelectionDAG &DAG) const { assert((Node->getOpcode() == ISD::SMULFIX || Node->getOpcode() == ISD::UMULFIX || - Node->getOpcode() == ISD::SMULFIXSAT) && + Node->getOpcode() == ISD::SMULFIXSAT || + Node->getOpcode() == ISD::UMULFIXSAT) && "Expected a fixed point multiplication opcode"); SDLoc dl(Node); @@ -5926,15 +6969,19 @@ TargetLowering::expandFixedPointMul(SDNode *Node, SelectionDAG &DAG) const { SDValue RHS = Node->getOperand(1); EVT VT = LHS.getValueType(); unsigned Scale = Node->getConstantOperandVal(2); - bool Saturating = Node->getOpcode() == ISD::SMULFIXSAT; + bool Saturating = (Node->getOpcode() == ISD::SMULFIXSAT || + Node->getOpcode() == ISD::UMULFIXSAT); + bool Signed = (Node->getOpcode() == ISD::SMULFIX || + Node->getOpcode() == ISD::SMULFIXSAT); EVT BoolVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT); unsigned VTSize = VT.getScalarSizeInBits(); if (!Scale) { // [us]mul.fix(a, b, 0) -> mul(a, b) - if (!Saturating && isOperationLegalOrCustom(ISD::MUL, VT)) { - return DAG.getNode(ISD::MUL, dl, VT, LHS, RHS); - } else if (Saturating && 
isOperationLegalOrCustom(ISD::SMULO, VT)) { + if (!Saturating) { + if (isOperationLegalOrCustom(ISD::MUL, VT)) + return DAG.getNode(ISD::MUL, dl, VT, LHS, RHS); + } else if (Signed && isOperationLegalOrCustom(ISD::SMULO, VT)) { SDValue Result = DAG.getNode(ISD::SMULO, dl, DAG.getVTList(VT, BoolVT), LHS, RHS); SDValue Product = Result.getValue(0); @@ -5948,11 +6995,18 @@ TargetLowering::expandFixedPointMul(SDNode *Node, SelectionDAG &DAG) const { SDValue ProdNeg = DAG.getSetCC(dl, BoolVT, Product, Zero, ISD::SETLT); Result = DAG.getSelect(dl, VT, ProdNeg, SatMax, SatMin); return DAG.getSelect(dl, VT, Overflow, Result, Product); + } else if (!Signed && isOperationLegalOrCustom(ISD::UMULO, VT)) { + SDValue Result = + DAG.getNode(ISD::UMULO, dl, DAG.getVTList(VT, BoolVT), LHS, RHS); + SDValue Product = Result.getValue(0); + SDValue Overflow = Result.getValue(1); + + APInt MaxVal = APInt::getMaxValue(VTSize); + SDValue SatMax = DAG.getConstant(MaxVal, dl, VT); + return DAG.getSelect(dl, VT, Overflow, SatMax, Product); } } - bool Signed = - Node->getOpcode() == ISD::SMULFIX || Node->getOpcode() == ISD::SMULFIXSAT; assert(((Signed && Scale < VTSize) || (!Signed && Scale <= VTSize)) && "Expected scale to be less than the number of bits if signed or at " "most the number of bits if unsigned."); @@ -5978,7 +7032,8 @@ TargetLowering::expandFixedPointMul(SDNode *Node, SelectionDAG &DAG) const { if (Scale == VTSize) // Result is just the top half since we'd be shifting by the width of the - // operand. + // operand. Overflow impossible so this works for both UMULFIX and + // UMULFIXSAT. return Hi; // The result will need to be shifted right by the scale since both operands @@ -5990,20 +7045,55 @@ TargetLowering::expandFixedPointMul(SDNode *Node, SelectionDAG &DAG) const { if (!Saturating) return Result; - unsigned OverflowBits = VTSize - Scale + 1; // +1 for the sign - SDValue HiMask = - DAG.getConstant(APInt::getHighBitsSet(VTSize, OverflowBits), dl, VT); - SDValue LoMask = DAG.getConstant( - APInt::getLowBitsSet(VTSize, VTSize - OverflowBits), dl, VT); - APInt MaxVal = APInt::getSignedMaxValue(VTSize); - APInt MinVal = APInt::getSignedMinValue(VTSize); - - Result = DAG.getSelectCC(dl, Hi, LoMask, - DAG.getConstant(MaxVal, dl, VT), Result, - ISD::SETGT); - return DAG.getSelectCC(dl, Hi, HiMask, - DAG.getConstant(MinVal, dl, VT), Result, - ISD::SETLT); + if (!Signed) { + // Unsigned overflow happened if the upper (VTSize - Scale) bits (of the + // widened multiplication) aren't all zeroes. + + // Saturate to max if ((Hi >> Scale) != 0), + // which is the same as if (Hi > ((1 << Scale) - 1)) + APInt MaxVal = APInt::getMaxValue(VTSize); + SDValue LowMask = DAG.getConstant(APInt::getLowBitsSet(VTSize, Scale), + dl, VT); + Result = DAG.getSelectCC(dl, Hi, LowMask, + DAG.getConstant(MaxVal, dl, VT), Result, + ISD::SETUGT); + + return Result; + } + + // Signed overflow happened if the upper (VTSize - Scale + 1) bits (of the + // widened multiplication) aren't all ones or all zeroes. + + SDValue SatMin = DAG.getConstant(APInt::getSignedMinValue(VTSize), dl, VT); + SDValue SatMax = DAG.getConstant(APInt::getSignedMaxValue(VTSize), dl, VT); + + if (Scale == 0) { + SDValue Sign = DAG.getNode(ISD::SRA, dl, VT, Lo, + DAG.getConstant(VTSize - 1, dl, ShiftTy)); + SDValue Overflow = DAG.getSetCC(dl, BoolVT, Hi, Sign, ISD::SETNE); + // Saturated to SatMin if wide product is negative, and SatMax if wide + // product is positive ... 
+ SDValue Zero = DAG.getConstant(0, dl, VT); + SDValue ResultIfOverflow = DAG.getSelectCC(dl, Hi, Zero, SatMin, SatMax, + ISD::SETLT); + // ... but only if we overflowed. + return DAG.getSelect(dl, VT, Overflow, ResultIfOverflow, Result); + } + + // We handled Scale==0 above so all the bits to examine is in Hi. + + // Saturate to max if ((Hi >> (Scale - 1)) > 0), + // which is the same as if (Hi > (1 << (Scale - 1)) - 1) + SDValue LowMask = DAG.getConstant(APInt::getLowBitsSet(VTSize, Scale - 1), + dl, VT); + Result = DAG.getSelectCC(dl, Hi, LowMask, SatMax, Result, ISD::SETGT); + // Saturate to min if (Hi >> (Scale - 1)) < -1), + // which is the same as if (HI < (-1 << (Scale - 1)) + SDValue HighMask = + DAG.getConstant(APInt::getHighBitsSet(VTSize, VTSize - Scale + 1), + dl, VT); + Result = DAG.getSelectCC(dl, Hi, HighMask, SatMin, Result, ISD::SETLT); + return Result; } void TargetLowering::expandUADDSUBO( @@ -6060,24 +7150,19 @@ void TargetLowering::expandSADDSUBO( SDValue Zero = DAG.getConstant(0, dl, LHS.getValueType()); - // LHSSign -> LHS >= 0 - // RHSSign -> RHS >= 0 - // SumSign -> Result >= 0 - // - // Add: - // Overflow -> (LHSSign == RHSSign) && (LHSSign != SumSign) - // Sub: - // Overflow -> (LHSSign != RHSSign) && (LHSSign != SumSign) - SDValue LHSSign = DAG.getSetCC(dl, OType, LHS, Zero, ISD::SETGE); - SDValue RHSSign = DAG.getSetCC(dl, OType, RHS, Zero, ISD::SETGE); - SDValue SignsMatch = DAG.getSetCC(dl, OType, LHSSign, RHSSign, - IsAdd ? ISD::SETEQ : ISD::SETNE); - - SDValue SumSign = DAG.getSetCC(dl, OType, Result, Zero, ISD::SETGE); - SDValue SumSignNE = DAG.getSetCC(dl, OType, LHSSign, SumSign, ISD::SETNE); - - SDValue Cmp = DAG.getNode(ISD::AND, dl, OType, SignsMatch, SumSignNE); - Overflow = DAG.getBoolExtOrTrunc(Cmp, dl, ResultType, ResultType); + // For an addition, the result should be less than one of the operands (LHS) + // if and only if the other operand (RHS) is negative, otherwise there will + // be overflow. + // For a subtraction, the result should be less than one of the operands + // (LHS) if and only if the other operand (RHS) is (non-zero) positive, + // otherwise there will be overflow. + SDValue ResultLowerThanLHS = DAG.getSetCC(dl, OType, Result, LHS, ISD::SETLT); + SDValue ConditionRHS = + DAG.getSetCC(dl, OType, RHS, Zero, IsAdd ? ISD::SETLT : ISD::SETGT); + + Overflow = DAG.getBoolExtOrTrunc( + DAG.getNode(ISD::XOR, dl, OType, ConditionRHS, ResultLowerThanLHS), dl, + ResultType, ResultType); } bool TargetLowering::expandMULO(SDNode *Node, SDValue &Result, @@ -6176,20 +7261,19 @@ bool TargetLowering::expandMULO(SDNode *Node, SDValue &Result, // being a legal type for the architecture and thus has to be split to // two arguments. SDValue Ret; + TargetLowering::MakeLibCallOptions CallOptions; + CallOptions.setSExt(isSigned); + CallOptions.setIsPostTypeLegalization(true); if (shouldSplitFunctionArgumentsAsLittleEndian(DAG.getDataLayout())) { // Halves of WideVT are packed into registers in different order // depending on platform endianness. This is usually handled by // the C calling convention, but we can't defer to it in // the legalizer. 
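// ---------------------------------------------------------------------------
// Illustrative aside (editorial sketch, not part of the LLVM patch): the
// UMULFIXSAT expansion above forms the 2W-bit product, shifts it right by the
// scale, and saturates to the unsigned maximum when the upper (W - Scale)
// bits of the widened product are not all zero, i.e. when
// Hi > ((1 << Scale) - 1). A standalone model of that check for W = 8,
// compared exhaustively against a widened reference (not LLVM code):
#include <cassert>
#include <cstdint>

// Reference: widen, shift, clamp.
static uint8_t umulfixsat_ref(uint8_t A, uint8_t B, unsigned Scale) {
  unsigned Wide = (unsigned)A * B;
  unsigned Shifted = Wide >> Scale;
  return Shifted > 0xFF ? 0xFF : (uint8_t)Shifted;
}

// Shape of the expansion: split the wide product into Hi/Lo halves, build the
// shifted result from both halves, and test Hi against the low-bits mask.
static uint8_t umulfixsat_expanded(uint8_t A, uint8_t B, unsigned Scale) {
  unsigned Wide = (unsigned)A * B;
  uint8_t Hi = (uint8_t)(Wide >> 8), Lo = (uint8_t)Wide;
  uint8_t Result = (uint8_t)((Lo >> Scale) | (Hi << (8 - Scale)));
  return Hi > ((1u << Scale) - 1) ? 0xFF : Result;
}

int main() {
  // Scale == 0 and Scale == W are special-cased in the patch, so skip them.
  for (unsigned Scale = 1; Scale < 8; ++Scale)
    for (unsigned A = 0; A < 256; ++A)
      for (unsigned B = 0; B < 256; ++B)
        assert(umulfixsat_ref(A, B, Scale) == umulfixsat_expanded(A, B, Scale));
  return 0;
}
// ---------------------------------------------------------------------------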
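// ---------------------------------------------------------------------------
// Illustrative aside (editorial sketch, not part of the LLVM patch): the
// rewritten expandSADDSUBO above detects signed overflow with one comparison
// pair instead of three sign tests:
//   add: overflow  <=>  (Result < LHS) xor (RHS < 0)
//   sub: overflow  <=>  (Result < LHS) xor (RHS > 0)
// A standalone exhaustive check for 8-bit values against a widened reference
// (not LLVM code):
#include <cassert>
#include <cstdint>

int main() {
  for (int L = -128; L <= 127; ++L)
    for (int R = -128; R <= 127; ++R) {
      // Wrapped 8-bit results, computed via unsigned arithmetic.
      int8_t AddRes = (int8_t)(uint8_t)((uint8_t)L + (uint8_t)R);
      int8_t SubRes = (int8_t)(uint8_t)((uint8_t)L - (uint8_t)R);
      // Widened reference for the overflow bit.
      bool AddOvf = (L + R) < -128 || (L + R) > 127;
      bool SubOvf = (L - R) < -128 || (L - R) > 127;
      assert(AddOvf == ((AddRes < L) != (R < 0)));
      assert(SubOvf == ((SubRes < L) != (R > 0)));
    }
  return 0;
}
// ---------------------------------------------------------------------------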
SDValue Args[] = { LHS, HiLHS, RHS, HiRHS }; - Ret = makeLibCall(DAG, LC, WideVT, Args, isSigned, dl, - /* doesNotReturn */ false, /* isReturnValueUsed */ true, - /* isPostTypeLegalization */ true).first; + Ret = makeLibCall(DAG, LC, WideVT, Args, CallOptions, dl).first; } else { SDValue Args[] = { HiLHS, LHS, HiRHS, RHS }; - Ret = makeLibCall(DAG, LC, WideVT, Args, isSigned, dl, - /* doesNotReturn */ false, /* isReturnValueUsed */ true, - /* isPostTypeLegalization */ true).first; + Ret = makeLibCall(DAG, LC, WideVT, Args, CallOptions, dl).first; } assert(Ret.getOpcode() == ISD::MERGE_VALUES && "Ret value is a collection of constituent nodes holding result."); diff --git a/lib/CodeGen/ShrinkWrap.cpp b/lib/CodeGen/ShrinkWrap.cpp index 2db0ea570598..412a00095b9b 100644 --- a/lib/CodeGen/ShrinkWrap.cpp +++ b/lib/CodeGen/ShrinkWrap.cpp @@ -278,11 +278,10 @@ bool ShrinkWrap::useOrDefCSROrFI(const MachineInstr &MI, // Ignore instructions like DBG_VALUE which don't read/def the register. if (!MO.isDef() && !MO.readsReg()) continue; - unsigned PhysReg = MO.getReg(); + Register PhysReg = MO.getReg(); if (!PhysReg) continue; - assert(TargetRegisterInfo::isPhysicalRegister(PhysReg) && - "Unallocated register?!"); + assert(Register::isPhysicalRegister(PhysReg) && "Unallocated register?!"); // The stack pointer is not normally described as a callee-saved register // in calling convention definitions, so we need to watch for it // separately. An SP mentioned by a call instruction, we can ignore, diff --git a/lib/CodeGen/SjLjEHPrepare.cpp b/lib/CodeGen/SjLjEHPrepare.cpp index 23e5ce0acae8..db520d4e6403 100644 --- a/lib/CodeGen/SjLjEHPrepare.cpp +++ b/lib/CodeGen/SjLjEHPrepare.cpp @@ -477,7 +477,10 @@ bool SjLjEHPrepare::runOnFunction(Function &F) { UnregisterFn = M.getOrInsertFunction( "_Unwind_SjLj_Unregister", Type::getVoidTy(M.getContext()), PointerType::getUnqual(FunctionContextTy)); - FrameAddrFn = Intrinsic::getDeclaration(&M, Intrinsic::frameaddress); + FrameAddrFn = Intrinsic::getDeclaration( + &M, Intrinsic::frameaddress, + {Type::getInt8PtrTy(M.getContext(), + M.getDataLayout().getAllocaAddrSpace())}); StackAddrFn = Intrinsic::getDeclaration(&M, Intrinsic::stacksave); StackRestoreFn = Intrinsic::getDeclaration(&M, Intrinsic::stackrestore); BuiltinSetupDispatchFn = diff --git a/lib/CodeGen/SplitKit.cpp b/lib/CodeGen/SplitKit.cpp index 5c944fe3f6b3..0c1f1220c421 100644 --- a/lib/CodeGen/SplitKit.cpp +++ b/lib/CodeGen/SplitKit.cpp @@ -12,7 +12,6 @@ //===----------------------------------------------------------------------===// #include "SplitKit.h" -#include "LiveRangeCalc.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseSet.h" #include "llvm/ADT/None.h" @@ -22,6 +21,7 @@ #include "llvm/ADT/Statistic.h" #include "llvm/CodeGen/LiveInterval.h" #include "llvm/CodeGen/LiveIntervals.h" +#include "llvm/CodeGen/LiveRangeCalc.h" #include "llvm/CodeGen/LiveRangeEdit.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineBlockFrequencyInfo.h" @@ -437,7 +437,7 @@ void SplitEditor::addDeadDef(LiveInterval &LI, VNInfo *VNI, bool Original) { assert(DefMI != nullptr); LaneBitmask LM; for (const MachineOperand &DefOp : DefMI->defs()) { - unsigned R = DefOp.getReg(); + Register R = DefOp.getReg(); if (R != LI.reg) continue; if (unsigned SR = DefOp.getSubReg()) @@ -1373,7 +1373,7 @@ void SplitEditor::rewriteAssigned(bool ExtendRanges) { assert(LI.hasSubRanges()); LiveRangeCalc SubLRC; - unsigned Reg = EP.MO.getReg(), Sub = EP.MO.getSubReg(); + Register Reg = EP.MO.getReg(), Sub = 
EP.MO.getSubReg(); LaneBitmask LM = Sub != 0 ? TRI.getSubRegIndexLaneMask(Sub) : MRI.getMaxLaneMaskForVReg(Reg); for (LiveInterval::SubRange &S : LI.subranges()) { diff --git a/lib/CodeGen/SplitKit.h b/lib/CodeGen/SplitKit.h index 86ad3811e3ad..78f0bbd24db5 100644 --- a/lib/CodeGen/SplitKit.h +++ b/lib/CodeGen/SplitKit.h @@ -14,7 +14,6 @@ #ifndef LLVM_LIB_CODEGEN_SPLITKIT_H #define LLVM_LIB_CODEGEN_SPLITKIT_H -#include "LiveRangeCalc.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/BitVector.h" #include "llvm/ADT/DenseMap.h" @@ -25,6 +24,7 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/CodeGen/LiveInterval.h" #include "llvm/CodeGen/LiveIntervals.h" +#include "llvm/CodeGen/LiveRangeCalc.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/SlotIndexes.h" diff --git a/lib/CodeGen/StackMaps.cpp b/lib/CodeGen/StackMaps.cpp index ae9401b89700..383c91259ffc 100644 --- a/lib/CodeGen/StackMaps.cpp +++ b/lib/CodeGen/StackMaps.cpp @@ -113,7 +113,7 @@ StackMaps::parseOperand(MachineInstr::const_mop_iterator MOI, unsigned Size = DL.getPointerSizeInBits(); assert((Size % 8) == 0 && "Need pointer size in bytes."); Size /= 8; - unsigned Reg = (++MOI)->getReg(); + Register Reg = (++MOI)->getReg(); int64_t Imm = (++MOI)->getImm(); Locs.emplace_back(StackMaps::Location::Direct, Size, getDwarfRegNum(Reg, TRI), Imm); @@ -122,7 +122,7 @@ StackMaps::parseOperand(MachineInstr::const_mop_iterator MOI, case StackMaps::IndirectMemRefOp: { int64_t Size = (++MOI)->getImm(); assert(Size > 0 && "Need a valid size for indirect memory locations."); - unsigned Reg = (++MOI)->getReg(); + Register Reg = (++MOI)->getReg(); int64_t Imm = (++MOI)->getImm(); Locs.emplace_back(StackMaps::Location::Indirect, Size, getDwarfRegNum(Reg, TRI), Imm); @@ -148,14 +148,14 @@ StackMaps::parseOperand(MachineInstr::const_mop_iterator MOI, if (MOI->isImplicit()) return ++MOI; - assert(TargetRegisterInfo::isPhysicalRegister(MOI->getReg()) && + assert(Register::isPhysicalRegister(MOI->getReg()) && "Virtreg operands should have been rewritten before now."); const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(MOI->getReg()); assert(!MOI->getSubReg() && "Physical subreg still around."); unsigned Offset = 0; unsigned DwarfRegNum = getDwarfRegNum(MOI->getReg(), TRI); - unsigned LLVMRegNum = TRI->getLLVMRegNum(DwarfRegNum, false); + unsigned LLVMRegNum = *TRI->getLLVMRegNum(DwarfRegNum, false); unsigned SubRegIdx = TRI->getSubRegIndex(LLVMRegNum, MOI->getReg()); if (SubRegIdx) Offset = TRI->getSubRegIdxOffset(SubRegIdx); diff --git a/lib/CodeGen/StackProtector.cpp b/lib/CodeGen/StackProtector.cpp index 809960c7fdf9..5683d1db473c 100644 --- a/lib/CodeGen/StackProtector.cpp +++ b/lib/CodeGen/StackProtector.cpp @@ -17,7 +17,6 @@ #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/BranchProbabilityInfo.h" -#include "llvm/Analysis/CaptureTracking.h" #include "llvm/Analysis/EHPersonalities.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/CodeGen/Passes.h" @@ -157,6 +156,68 @@ bool StackProtector::ContainsProtectableArray(Type *Ty, bool &IsLarge, return NeedsProtector; } +bool StackProtector::HasAddressTaken(const Instruction *AI) { + for (const User *U : AI->users()) { + const auto *I = cast(U); + switch (I->getOpcode()) { + case Instruction::Store: + if (AI == cast(I)->getValueOperand()) + return true; + break; + case Instruction::AtomicCmpXchg: + // cmpxchg conceptually includes both a load and store from the same + // 
location. So, like store, the value being stored is what matters. + if (AI == cast(I)->getNewValOperand()) + return true; + break; + case Instruction::PtrToInt: + if (AI == cast(I)->getOperand(0)) + return true; + break; + case Instruction::Call: { + // Ignore intrinsics that do not become real instructions. + // TODO: Narrow this to intrinsics that have store-like effects. + const auto *CI = cast(I); + if (!isa(CI) && !CI->isLifetimeStartOrEnd()) + return true; + break; + } + case Instruction::Invoke: + return true; + case Instruction::BitCast: + case Instruction::GetElementPtr: + case Instruction::Select: + case Instruction::AddrSpaceCast: + if (HasAddressTaken(I)) + return true; + break; + case Instruction::PHI: { + // Keep track of what PHI nodes we have already visited to ensure + // they are only visited once. + const auto *PN = cast(I); + if (VisitedPHIs.insert(PN).second) + if (HasAddressTaken(PN)) + return true; + break; + } + case Instruction::Load: + case Instruction::AtomicRMW: + case Instruction::Ret: + // These instructions take an address operand, but have load-like or + // other innocuous behavior that should not trigger a stack protector. + // atomicrmw conceptually has both load and store semantics, but the + // value being stored must be integer; so if a pointer is being stored, + // we'll catch it in the PtrToInt case above. + break; + default: + // Conservatively return true for any instruction that takes an address + // operand, but is not handled above. + return true; + } + } + return false; +} + /// Search for the first call to the llvm.stackprotector intrinsic and return it /// if present. static const CallInst *findStackProtectorIntrinsic(Function &F) { @@ -264,9 +325,7 @@ bool StackProtector::RequiresStackProtector() { continue; } - if (Strong && PointerMayBeCaptured(AI, - /* ReturnCaptures */ false, - /* StoreCaptures */ true)) { + if (Strong && HasAddressTaken(AI)) { ++NumAddrTaken; Layout.insert(std::make_pair(AI, MachineFrameInfo::SSPLK_AddrOf)); ORE.emit([&]() { diff --git a/lib/CodeGen/StackSlotColoring.cpp b/lib/CodeGen/StackSlotColoring.cpp index 99b533e10b87..9c8143c55dc2 100644 --- a/lib/CodeGen/StackSlotColoring.cpp +++ b/lib/CodeGen/StackSlotColoring.cpp @@ -221,7 +221,7 @@ void StackSlotColoring::InitializeSlots() { for (auto *I : Intervals) { LiveInterval &li = I->second; LLVM_DEBUG(li.dump()); - int FI = TargetRegisterInfo::stackSlot2Index(li.reg); + int FI = Register::stackSlot2Index(li.reg); if (MFI->isDeadObjectIndex(FI)) continue; @@ -268,7 +268,7 @@ StackSlotColoring::OverlapWithAssignments(LiveInterval *li, int Color) const { int StackSlotColoring::ColorSlot(LiveInterval *li) { int Color = -1; bool Share = false; - int FI = TargetRegisterInfo::stackSlot2Index(li->reg); + int FI = Register::stackSlot2Index(li->reg); uint8_t StackID = MFI->getStackID(FI); if (!DisableSharing) { @@ -330,7 +330,7 @@ bool StackSlotColoring::ColorSlots(MachineFunction &MF) { bool Changed = false; for (unsigned i = 0, e = SSIntervals.size(); i != e; ++i) { LiveInterval *li = SSIntervals[i]; - int SS = TargetRegisterInfo::stackSlot2Index(li->reg); + int SS = Register::stackSlot2Index(li->reg); int NewSS = ColorSlot(li); assert(NewSS >= 0 && "Stack coloring failed?"); SlotMapping[SS] = NewSS; @@ -343,7 +343,7 @@ bool StackSlotColoring::ColorSlots(MachineFunction &MF) { LLVM_DEBUG(dbgs() << "\nSpill slots after coloring:\n"); for (unsigned i = 0, e = SSIntervals.size(); i != e; ++i) { LiveInterval *li = SSIntervals[i]; - int SS = 
TargetRegisterInfo::stackSlot2Index(li->reg); + int SS = Register::stackSlot2Index(li->reg); li->weight = SlotWeights[SS]; } // Sort them by new weight. diff --git a/lib/CodeGen/SwiftErrorValueTracking.cpp b/lib/CodeGen/SwiftErrorValueTracking.cpp index 96821cadb1b6..c72a04276a4f 100644 --- a/lib/CodeGen/SwiftErrorValueTracking.cpp +++ b/lib/CodeGen/SwiftErrorValueTracking.cpp @@ -13,9 +13,10 @@ //===----------------------------------------------------------------------===// #include "llvm/CodeGen/SwiftErrorValueTracking.h" +#include "llvm/ADT/PostOrderIterator.h" #include "llvm/ADT/SmallSet.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/TargetInstrInfo.h" #include "llvm/CodeGen/TargetLowering.h" #include "llvm/IR/Value.h" diff --git a/lib/CodeGen/TailDuplicator.cpp b/lib/CodeGen/TailDuplicator.cpp index a0590a8a6cc6..03c68a37e459 100644 --- a/lib/CodeGen/TailDuplicator.cpp +++ b/lib/CodeGen/TailDuplicator.cpp @@ -235,8 +235,8 @@ bool TailDuplicator::tailDuplicateAndUpdate( MachineInstr *Copy = Copies[i]; if (!Copy->isCopy()) continue; - unsigned Dst = Copy->getOperand(0).getReg(); - unsigned Src = Copy->getOperand(1).getReg(); + Register Dst = Copy->getOperand(0).getReg(); + Register Src = Copy->getOperand(1).getReg(); if (MRI->hasOneNonDBGUse(Src) && MRI->constrainRegClass(Src, MRI->getRegClass(Dst))) { // Copy is the only use. Do trivial copy propagation here. @@ -312,7 +312,7 @@ static void getRegsUsedByPHIs(const MachineBasicBlock &BB, if (!MI.isPHI()) break; for (unsigned i = 1, e = MI.getNumOperands(); i != e; i += 2) { - unsigned SrcReg = MI.getOperand(i).getReg(); + Register SrcReg = MI.getOperand(i).getReg(); UsedByPhi->insert(SrcReg); } } @@ -340,17 +340,17 @@ void TailDuplicator::processPHI( DenseMap &LocalVRMap, SmallVectorImpl> &Copies, const DenseSet &RegsUsedByPhi, bool Remove) { - unsigned DefReg = MI->getOperand(0).getReg(); + Register DefReg = MI->getOperand(0).getReg(); unsigned SrcOpIdx = getPHISrcRegOpIdx(MI, PredBB); assert(SrcOpIdx && "Unable to find matching PHI source?"); - unsigned SrcReg = MI->getOperand(SrcOpIdx).getReg(); + Register SrcReg = MI->getOperand(SrcOpIdx).getReg(); unsigned SrcSubReg = MI->getOperand(SrcOpIdx).getSubReg(); const TargetRegisterClass *RC = MRI->getRegClass(DefReg); LocalVRMap.insert(std::make_pair(DefReg, RegSubRegPair(SrcReg, SrcSubReg))); // Insert a copy from source to the end of the block. The def register is the // available value liveout of the block. 
- unsigned NewDef = MRI->createVirtualRegister(RC); + Register NewDef = MRI->createVirtualRegister(RC); Copies.push_back(std::make_pair(NewDef, RegSubRegPair(SrcReg, SrcSubReg))); if (isDefLiveOut(DefReg, TailBB, MRI) || RegsUsedByPhi.count(DefReg)) addSSAUpdateEntry(DefReg, NewDef, PredBB); @@ -384,12 +384,12 @@ void TailDuplicator::duplicateInstruction( MachineOperand &MO = NewMI.getOperand(i); if (!MO.isReg()) continue; - unsigned Reg = MO.getReg(); - if (!TargetRegisterInfo::isVirtualRegister(Reg)) + Register Reg = MO.getReg(); + if (!Register::isVirtualRegister(Reg)) continue; if (MO.isDef()) { const TargetRegisterClass *RC = MRI->getRegClass(Reg); - unsigned NewReg = MRI->createVirtualRegister(RC); + Register NewReg = MRI->createVirtualRegister(RC); MO.setReg(NewReg); LocalVRMap.insert(std::make_pair(Reg, RegSubRegPair(NewReg, 0))); if (isDefLiveOut(Reg, TailBB, MRI) || UsedByPhi.count(Reg)) @@ -433,7 +433,7 @@ void TailDuplicator::duplicateInstruction( auto *NewRC = MI->getRegClassConstraint(i, TII, TRI); if (NewRC == nullptr) NewRC = OrigRC; - unsigned NewReg = MRI->createVirtualRegister(NewRC); + Register NewReg = MRI->createVirtualRegister(NewRC); BuildMI(*PredBB, NewMI, NewMI.getDebugLoc(), TII->get(TargetOpcode::COPY), NewReg) .addReg(VI->second.Reg, 0, VI->second.SubReg); @@ -477,7 +477,7 @@ void TailDuplicator::updateSuccessorsPHIs( assert(Idx != 0); MachineOperand &MO0 = MI.getOperand(Idx); - unsigned Reg = MO0.getReg(); + Register Reg = MO0.getReg(); if (isDead) { // Folded into the previous BB. // There could be duplicate phi source entries. FIXME: Should sdisel diff --git a/lib/CodeGen/TargetFrameLoweringImpl.cpp b/lib/CodeGen/TargetFrameLoweringImpl.cpp index 9c4483cb240d..9eeacc2584cb 100644 --- a/lib/CodeGen/TargetFrameLoweringImpl.cpp +++ b/lib/CodeGen/TargetFrameLoweringImpl.cpp @@ -18,6 +18,7 @@ #include "llvm/CodeGen/TargetRegisterInfo.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/IR/Attributes.h" +#include "llvm/IR/CallSite.h" #include "llvm/IR/CallingConv.h" #include "llvm/IR/Function.h" #include "llvm/MC/MCRegisterInfo.h" @@ -71,7 +72,9 @@ void TargetFrameLowering::determineCalleeSaves(MachineFunction &MF, // When interprocedural register allocation is enabled caller saved registers // are preferred over callee saved registers. - if (MF.getTarget().Options.EnableIPRA && isSafeForNoCSROpt(MF.getFunction())) + if (MF.getTarget().Options.EnableIPRA && + isSafeForNoCSROpt(MF.getFunction()) && + isProfitableForNoCSROpt(MF.getFunction())) return; // Get the callee saved register list... @@ -118,6 +121,18 @@ unsigned TargetFrameLowering::getStackAlignmentSkew( return 0; } +bool TargetFrameLowering::isSafeForNoCSROpt(const Function &F) { + if (!F.hasLocalLinkage() || F.hasAddressTaken() || + !F.hasFnAttribute(Attribute::NoRecurse)) + return false; + // Function should not be optimized as tail call. 
+ for (const User *U : F.users()) + if (auto CS = ImmutableCallSite(U)) + if (CS.isTailCall()) + return false; + return true; +} + int TargetFrameLowering::getInitialCFAOffset(const MachineFunction &MF) const { llvm_unreachable("getInitialCFAOffset() not implemented!"); } @@ -125,4 +140,4 @@ int TargetFrameLowering::getInitialCFAOffset(const MachineFunction &MF) const { unsigned TargetFrameLowering::getInitialCFARegister(const MachineFunction &MF) const { llvm_unreachable("getInitialCFARegister() not implemented!"); -} \ No newline at end of file +} diff --git a/lib/CodeGen/TargetInstrInfo.cpp b/lib/CodeGen/TargetInstrInfo.cpp index 868617ffe14d..6cae3b869501 100644 --- a/lib/CodeGen/TargetInstrInfo.cpp +++ b/lib/CodeGen/TargetInstrInfo.cpp @@ -23,6 +23,7 @@ #include "llvm/CodeGen/TargetRegisterInfo.h" #include "llvm/CodeGen/TargetSchedule.h" #include "llvm/IR/DataLayout.h" +#include "llvm/IR/DebugInfoMetadata.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCInstrItineraries.h" #include "llvm/Support/CommandLine.h" @@ -142,7 +143,7 @@ TargetInstrInfo::ReplaceTailWithBranchTo(MachineBasicBlock::iterator Tail, while (Tail != MBB->end()) { auto MI = Tail++; if (MI->isCall()) - MBB->getParent()->updateCallSiteInfo(&*MI); + MBB->getParent()->eraseCallSiteInfo(&*MI); MBB->erase(MI); } @@ -183,10 +184,10 @@ MachineInstr *TargetInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool Reg2IsInternal = MI.getOperand(Idx2).isInternalRead(); // Avoid calling isRenamable for virtual registers since we assert that // renamable property is only queried/set for physical registers. - bool Reg1IsRenamable = TargetRegisterInfo::isPhysicalRegister(Reg1) + bool Reg1IsRenamable = Register::isPhysicalRegister(Reg1) ? MI.getOperand(Idx1).isRenamable() : false; - bool Reg2IsRenamable = TargetRegisterInfo::isPhysicalRegister(Reg2) + bool Reg2IsRenamable = Register::isPhysicalRegister(Reg2) ? MI.getOperand(Idx2).isRenamable() : false; // If destination is tied to either of the commuted source register, then @@ -228,9 +229,9 @@ MachineInstr *TargetInstrInfo::commuteInstructionImpl(MachineInstr &MI, CommutedMI->getOperand(Idx1).setIsInternalRead(Reg2IsInternal); // Avoid calling setIsRenamable for virtual registers since we assert that // renamable property is only queried/set for physical registers. 
- if (TargetRegisterInfo::isPhysicalRegister(Reg1)) + if (Register::isPhysicalRegister(Reg1)) CommutedMI->getOperand(Idx2).setIsRenamable(Reg1IsRenamable); - if (TargetRegisterInfo::isPhysicalRegister(Reg2)) + if (Register::isPhysicalRegister(Reg2)) CommutedMI->getOperand(Idx1).setIsRenamable(Reg2IsRenamable); return CommutedMI; } @@ -281,7 +282,7 @@ bool TargetInstrInfo::fixCommutedOpIndices(unsigned &ResultIdx1, return true; } -bool TargetInstrInfo::findCommutedOpIndices(MachineInstr &MI, +bool TargetInstrInfo::findCommutedOpIndices(const MachineInstr &MI, unsigned &SrcOpIdx1, unsigned &SrcOpIdx2) const { assert(!MI.isBundle() && @@ -393,7 +394,7 @@ bool TargetInstrInfo::getStackSlotRange(const TargetRegisterClass *RC, if (BitOffset < 0 || BitOffset % 8) return false; - Size = BitSize /= 8; + Size = BitSize / 8; Offset = (unsigned)BitOffset / 8; assert(TRI->getSpillSize(*RC) >= (Offset + Size) && "bad subregister range"); @@ -442,16 +443,15 @@ static const TargetRegisterClass *canFoldCopy(const MachineInstr &MI, if (FoldOp.getSubReg() || LiveOp.getSubReg()) return nullptr; - unsigned FoldReg = FoldOp.getReg(); - unsigned LiveReg = LiveOp.getReg(); + Register FoldReg = FoldOp.getReg(); + Register LiveReg = LiveOp.getReg(); - assert(TargetRegisterInfo::isVirtualRegister(FoldReg) && - "Cannot fold physregs"); + assert(Register::isVirtualRegister(FoldReg) && "Cannot fold physregs"); const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo(); const TargetRegisterClass *RC = MRI.getRegClass(FoldReg); - if (TargetRegisterInfo::isPhysicalRegister(LiveOp.getReg())) + if (Register::isPhysicalRegister(LiveOp.getReg())) return RC->contains(LiveOp.getReg()) ? RC : nullptr; if (RC->hasSubClassEq(MRI.getRegClass(LiveReg))) @@ -674,9 +674,9 @@ bool TargetInstrInfo::hasReassociableOperands( // reassociate. MachineInstr *MI1 = nullptr; MachineInstr *MI2 = nullptr; - if (Op1.isReg() && TargetRegisterInfo::isVirtualRegister(Op1.getReg())) + if (Op1.isReg() && Register::isVirtualRegister(Op1.getReg())) MI1 = MRI.getUniqueVRegDef(Op1.getReg()); - if (Op2.isReg() && TargetRegisterInfo::isVirtualRegister(Op2.getReg())) + if (Op2.isReg() && Register::isVirtualRegister(Op2.getReg())) MI2 = MRI.getUniqueVRegDef(Op2.getReg()); // And they need to be in the trace (otherwise, they won't have a depth). 
@@ -805,27 +805,27 @@ void TargetInstrInfo::reassociateOps( MachineOperand &OpY = Root.getOperand(OpIdx[Row][3]); MachineOperand &OpC = Root.getOperand(0); - unsigned RegA = OpA.getReg(); - unsigned RegB = OpB.getReg(); - unsigned RegX = OpX.getReg(); - unsigned RegY = OpY.getReg(); - unsigned RegC = OpC.getReg(); + Register RegA = OpA.getReg(); + Register RegB = OpB.getReg(); + Register RegX = OpX.getReg(); + Register RegY = OpY.getReg(); + Register RegC = OpC.getReg(); - if (TargetRegisterInfo::isVirtualRegister(RegA)) + if (Register::isVirtualRegister(RegA)) MRI.constrainRegClass(RegA, RC); - if (TargetRegisterInfo::isVirtualRegister(RegB)) + if (Register::isVirtualRegister(RegB)) MRI.constrainRegClass(RegB, RC); - if (TargetRegisterInfo::isVirtualRegister(RegX)) + if (Register::isVirtualRegister(RegX)) MRI.constrainRegClass(RegX, RC); - if (TargetRegisterInfo::isVirtualRegister(RegY)) + if (Register::isVirtualRegister(RegY)) MRI.constrainRegClass(RegY, RC); - if (TargetRegisterInfo::isVirtualRegister(RegC)) + if (Register::isVirtualRegister(RegC)) MRI.constrainRegClass(RegC, RC); // Create a new virtual register for the result of (X op Y) instead of // recycling RegB because the MachineCombiner's computation of the critical // path requires a new register definition rather than an existing one. - unsigned NewVR = MRI.createVirtualRegister(RC); + Register NewVR = MRI.createVirtualRegister(RC); InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0)); unsigned Opcode = Root.getOpcode(); @@ -880,21 +880,21 @@ void TargetInstrInfo::genAlternativeCodeSequence( } bool TargetInstrInfo::isReallyTriviallyReMaterializableGeneric( - const MachineInstr &MI, AliasAnalysis *AA) const { + const MachineInstr &MI, AAResults *AA) const { const MachineFunction &MF = *MI.getMF(); const MachineRegisterInfo &MRI = MF.getRegInfo(); // Remat clients assume operand 0 is the defined register. if (!MI.getNumOperands() || !MI.getOperand(0).isReg()) return false; - unsigned DefReg = MI.getOperand(0).getReg(); + Register DefReg = MI.getOperand(0).getReg(); // A sub-register definition can only be rematerialized if the instruction // doesn't read the other parts of the register. Otherwise it is really a // read-modify-write operation on the full virtual register which cannot be // moved safely. - if (TargetRegisterInfo::isVirtualRegister(DefReg) && - MI.getOperand(0).getSubReg() && MI.readsVirtualRegister(DefReg)) + if (Register::isVirtualRegister(DefReg) && MI.getOperand(0).getSubReg() && + MI.readsVirtualRegister(DefReg)) return false; // A load from a fixed stack slot can be rematerialized. This may be @@ -924,12 +924,12 @@ bool TargetInstrInfo::isReallyTriviallyReMaterializableGeneric( for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { const MachineOperand &MO = MI.getOperand(i); if (!MO.isReg()) continue; - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); if (Reg == 0) continue; // Check for a well-behaved physical register. - if (TargetRegisterInfo::isPhysicalRegister(Reg)) { + if (Register::isPhysicalRegister(Reg)) { if (MO.isUse()) { // If the physreg has no defs anywhere, it's just an ambient register // and we can freely move its uses. 
Alternatively, if it's allocatable, @@ -1120,6 +1120,24 @@ bool TargetInstrInfo::hasLowDefLatency(const TargetSchedModel &SchedModel, return (DefCycle != -1 && DefCycle <= 1); } +Optional +TargetInstrInfo::describeLoadedValue(const MachineInstr &MI) const { + const MachineFunction *MF = MI.getMF(); + const MachineOperand *Op = nullptr; + DIExpression *Expr = DIExpression::get(MF->getFunction().getContext(), {});; + const MachineOperand *SrcRegOp, *DestRegOp; + + if (isCopyInstr(MI, SrcRegOp, DestRegOp)) { + Op = SrcRegOp; + return ParamLoadedValue(*Op, Expr); + } else if (MI.isMoveImmediate()) { + Op = &MI.getOperand(1); + return ParamLoadedValue(*Op, Expr); + } + + return None; +} + /// Both DefMI and UseMI must be valid. By default, call directly to the /// itinerary. This may be overriden by the target. int TargetInstrInfo::getOperandLatency(const InstrItineraryData *ItinData, @@ -1227,3 +1245,5 @@ bool TargetInstrInfo::getInsertSubregInputs( InsertedReg.SubIdx = (unsigned)MOSubIdx.getImm(); return true; } + +TargetInstrInfo::PipelinerLoopInfo::~PipelinerLoopInfo() {} diff --git a/lib/CodeGen/TargetLoweringBase.cpp b/lib/CodeGen/TargetLoweringBase.cpp index 9b28c1a6c450..9b23012f47e3 100644 --- a/lib/CodeGen/TargetLoweringBase.cpp +++ b/lib/CodeGen/TargetLoweringBase.cpp @@ -167,6 +167,7 @@ void TargetLoweringBase::InitLibcalls(const Triple &TT) { setLibcallName(RTLIB::BZERO, "__bzero"); break; case Triple::aarch64: + case Triple::aarch64_32: setLibcallName(RTLIB::BZERO, "bzero"); break; default: @@ -197,6 +198,11 @@ void TargetLoweringBase::InitLibcalls(const Triple &TT) { setLibcallName(RTLIB::SINCOS_PPCF128, "sincosl"); } + if (TT.isPS4CPU()) { + setLibcallName(RTLIB::SINCOS_F32, "sincosf"); + setLibcallName(RTLIB::SINCOS_F64, "sincos"); + } + if (TT.isOSOpenBSD()) { setLibcallName(RTLIB::STACKPROTECTOR_CHECK_FAIL, nullptr); } @@ -578,13 +584,7 @@ TargetLoweringBase::TargetLoweringBase(const TargetMachine &tm) : TM(tm) { BooleanFloatContents = UndefinedBooleanContent; BooleanVectorContents = UndefinedBooleanContent; SchedPreferenceInfo = Sched::ILP; - JumpBufSize = 0; - JumpBufAlignment = 0; - MinFunctionAlignment = 0; - PrefFunctionAlignment = 0; - PrefLoopAlignment = 0; GatherAllAliasesMaxDepth = 18; - MinStackArgumentAlignment = 1; // TODO: the default will be switched to 0 in the next commit, along // with the Target-specific changes necessary. MaxAtomicSizeInBitsSupported = 1024; @@ -653,6 +653,7 @@ void TargetLoweringBase::initActions() { setOperationAction(ISD::SMULFIX, VT, Expand); setOperationAction(ISD::SMULFIXSAT, VT, Expand); setOperationAction(ISD::UMULFIX, VT, Expand); + setOperationAction(ISD::UMULFIXSAT, VT, Expand); // Overflow operations default to expand setOperationAction(ISD::SADDO, VT, Expand); @@ -689,6 +690,7 @@ void TargetLoweringBase::initActions() { setOperationAction(ISD::ANY_EXTEND_VECTOR_INREG, VT, Expand); setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Expand); setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Expand); + setOperationAction(ISD::SPLAT_VECTOR, VT, Expand); } // Constrained floating-point operations default to expand. 
@@ -708,16 +710,22 @@ void TargetLoweringBase::initActions() { setOperationAction(ISD::STRICT_FLOG, VT, Expand); setOperationAction(ISD::STRICT_FLOG10, VT, Expand); setOperationAction(ISD::STRICT_FLOG2, VT, Expand); + setOperationAction(ISD::STRICT_LRINT, VT, Expand); + setOperationAction(ISD::STRICT_LLRINT, VT, Expand); setOperationAction(ISD::STRICT_FRINT, VT, Expand); setOperationAction(ISD::STRICT_FNEARBYINT, VT, Expand); setOperationAction(ISD::STRICT_FCEIL, VT, Expand); setOperationAction(ISD::STRICT_FFLOOR, VT, Expand); + setOperationAction(ISD::STRICT_LROUND, VT, Expand); + setOperationAction(ISD::STRICT_LLROUND, VT, Expand); setOperationAction(ISD::STRICT_FROUND, VT, Expand); setOperationAction(ISD::STRICT_FTRUNC, VT, Expand); setOperationAction(ISD::STRICT_FMAXNUM, VT, Expand); setOperationAction(ISD::STRICT_FMINNUM, VT, Expand); setOperationAction(ISD::STRICT_FP_ROUND, VT, Expand); setOperationAction(ISD::STRICT_FP_EXTEND, VT, Expand); + setOperationAction(ISD::STRICT_FP_TO_SINT, VT, Expand); + setOperationAction(ISD::STRICT_FP_TO_UINT, VT, Expand); // For most targets @llvm.get.dynamic.area.offset just returns 0. setOperationAction(ISD::GET_DYNAMIC_AREA_OFFSET, VT, Expand); @@ -824,7 +832,8 @@ TargetLoweringBase::getTypeConversion(LLVMContext &Context, EVT VT) const { LegalizeTypeAction LA = ValueTypeActions.getTypeAction(SVT); assert((LA == TypeLegal || LA == TypeSoftenFloat || - ValueTypeActions.getTypeAction(NVT) != TypePromoteInteger) && + (NVT.isVector() || + ValueTypeActions.getTypeAction(NVT) != TypePromoteInteger)) && "Promote may not follow Expand or Promote"); if (LA == TypeSplitVector) @@ -1257,17 +1266,23 @@ void TargetLoweringBase::computeRegisterProperties( MVT EltVT = VT.getVectorElementType(); unsigned NElts = VT.getVectorNumElements(); bool IsLegalWiderType = false; + bool IsScalable = VT.isScalableVector(); LegalizeTypeAction PreferredAction = getPreferredVectorAction(VT); switch (PreferredAction) { - case TypePromoteInteger: + case TypePromoteInteger: { + MVT::SimpleValueType EndVT = IsScalable ? + MVT::LAST_INTEGER_SCALABLE_VECTOR_VALUETYPE : + MVT::LAST_INTEGER_FIXEDLEN_VECTOR_VALUETYPE; // Try to promote the elements of integer vectors. If no legal // promotion was found, fall through to the widen-vector method. - for (unsigned nVT = i + 1; nVT <= MVT::LAST_INTEGER_VECTOR_VALUETYPE; ++nVT) { + for (unsigned nVT = i + 1; + (MVT::SimpleValueType)nVT <= EndVT; ++nVT) { MVT SVT = (MVT::SimpleValueType) nVT; // Promote vectors of integers to vectors with the same number // of elements, with a wider element type. if (SVT.getScalarSizeInBits() > EltVT.getSizeInBits() && - SVT.getVectorNumElements() == NElts && isTypeLegal(SVT)) { + SVT.getVectorNumElements() == NElts && + SVT.isScalableVector() == IsScalable && isTypeLegal(SVT)) { TransformToType[i] = SVT; RegisterTypeForVT[i] = SVT; NumRegistersForVT[i] = 1; @@ -1279,23 +1294,37 @@ void TargetLoweringBase::computeRegisterProperties( if (IsLegalWiderType) break; LLVM_FALLTHROUGH; + } case TypeWidenVector: - // Try to widen the vector. - for (unsigned nVT = i + 1; nVT <= MVT::LAST_VECTOR_VALUETYPE; ++nVT) { - MVT SVT = (MVT::SimpleValueType) nVT; - if (SVT.getVectorElementType() == EltVT - && SVT.getVectorNumElements() > NElts && isTypeLegal(SVT)) { - TransformToType[i] = SVT; - RegisterTypeForVT[i] = SVT; - NumRegistersForVT[i] = 1; + if (isPowerOf2_32(NElts)) { + // Try to widen the vector. 
+ for (unsigned nVT = i + 1; nVT <= MVT::LAST_VECTOR_VALUETYPE; ++nVT) { + MVT SVT = (MVT::SimpleValueType) nVT; + if (SVT.getVectorElementType() == EltVT + && SVT.getVectorNumElements() > NElts + && SVT.isScalableVector() == IsScalable && isTypeLegal(SVT)) { + TransformToType[i] = SVT; + RegisterTypeForVT[i] = SVT; + NumRegistersForVT[i] = 1; + ValueTypeActions.setTypeAction(VT, TypeWidenVector); + IsLegalWiderType = true; + break; + } + } + if (IsLegalWiderType) + break; + } else { + // Only widen to the next power of 2 to keep consistency with EVT. + MVT NVT = VT.getPow2VectorType(); + if (isTypeLegal(NVT)) { + TransformToType[i] = NVT; ValueTypeActions.setTypeAction(VT, TypeWidenVector); - IsLegalWiderType = true; + RegisterTypeForVT[i] = NVT; + NumRegistersForVT[i] = 1; break; } } - if (IsLegalWiderType) - break; LLVM_FALLTHROUGH; case TypeSplitVector: @@ -1488,12 +1517,9 @@ unsigned TargetLoweringBase::getByValTypeAlignment(Type *Ty, return DL.getABITypeAlignment(Ty); } -bool TargetLoweringBase::allowsMemoryAccess(LLVMContext &Context, - const DataLayout &DL, EVT VT, - unsigned AddrSpace, - unsigned Alignment, - MachineMemOperand::Flags Flags, - bool *Fast) const { +bool TargetLoweringBase::allowsMemoryAccessForAlignment( + LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace, + unsigned Alignment, MachineMemOperand::Flags Flags, bool *Fast) const { // Check if the specified alignment is sufficient based on the data layout. // TODO: While using the data layout works in practice, a better solution // would be to implement this check directly (make this a virtual function). @@ -1511,6 +1537,21 @@ bool TargetLoweringBase::allowsMemoryAccess(LLVMContext &Context, return allowsMisalignedMemoryAccesses(VT, AddrSpace, Alignment, Flags, Fast); } +bool TargetLoweringBase::allowsMemoryAccessForAlignment( + LLVMContext &Context, const DataLayout &DL, EVT VT, + const MachineMemOperand &MMO, bool *Fast) const { + return allowsMemoryAccessForAlignment(Context, DL, VT, MMO.getAddrSpace(), + MMO.getAlignment(), MMO.getFlags(), + Fast); +} + +bool TargetLoweringBase::allowsMemoryAccess( + LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace, + unsigned Alignment, MachineMemOperand::Flags Flags, bool *Fast) const { + return allowsMemoryAccessForAlignment(Context, DL, VT, AddrSpace, Alignment, + Flags, Fast); +} + bool TargetLoweringBase::allowsMemoryAccess(LLVMContext &Context, const DataLayout &DL, EVT VT, const MachineMemOperand &MMO, diff --git a/lib/CodeGen/TargetLoweringObjectFileImpl.cpp b/lib/CodeGen/TargetLoweringObjectFileImpl.cpp index 4c8f75b237aa..4978f4b9500b 100644 --- a/lib/CodeGen/TargetLoweringObjectFileImpl.cpp +++ b/lib/CodeGen/TargetLoweringObjectFileImpl.cpp @@ -43,6 +43,7 @@ #include "llvm/MC/MCSectionELF.h" #include "llvm/MC/MCSectionMachO.h" #include "llvm/MC/MCSectionWasm.h" +#include "llvm/MC/MCSectionXCOFF.h" #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSymbol.h" #include "llvm/MC/MCSymbolELF.h" @@ -154,6 +155,7 @@ void TargetLoweringObjectFileELF::Initialize(MCContext &Ctx, break; case Triple::aarch64: case Triple::aarch64_be: + case Triple::aarch64_32: // The small model guarantees static code/data size < 4GB, but not where it // will be in memory. Most of these could end up >2GB away so even a signed // pc-relative 32-bit address is insufficient, theoretically. 
@@ -375,7 +377,7 @@ void TargetLoweringObjectFileELF::emitPersonalityValue( ELF::SHT_PROGBITS, Flags, 0); unsigned Size = DL.getPointerSize(); Streamer.SwitchSection(Sec); - Streamer.EmitValueToAlignment(DL.getPointerABIAlignment(0)); + Streamer.EmitValueToAlignment(DL.getPointerABIAlignment(0).value()); Streamer.EmitSymbolAttribute(Label, MCSA_ELF_TypeObject); const MCExpr *E = MCConstantExpr::create(Size, getContext()); Streamer.emitELFSize(Label, E); @@ -524,8 +526,8 @@ static const MCSymbolELF *getAssociatedSymbol(const GlobalObject *GO, if (!VM) report_fatal_error("MD_associated operand is not ValueAsMetadata"); - GlobalObject *OtherGO = dyn_cast(VM->getValue()); - return OtherGO ? dyn_cast(TM.getSymbol(OtherGO)) : nullptr; + auto *OtherGV = dyn_cast(VM->getValue()); + return OtherGV ? dyn_cast(TM.getSymbol(OtherGV)) : nullptr; } static unsigned getEntrySizeForKind(SectionKind Kind) { @@ -566,6 +568,8 @@ MCSection *TargetLoweringObjectFileELF::getExplicitSectionGlobal( SectionName = Attrs.getAttribute("bss-section").getValueAsString(); } else if (Attrs.hasAttribute("rodata-section") && Kind.isReadOnly()) { SectionName = Attrs.getAttribute("rodata-section").getValueAsString(); + } else if (Attrs.hasAttribute("relro-section") && Kind.isReadOnlyWithRel()) { + SectionName = Attrs.getAttribute("relro-section").getValueAsString(); } else if (Attrs.hasAttribute("data-section") && Kind.isData()) { SectionName = Attrs.getAttribute("data-section").getValueAsString(); } @@ -1107,8 +1111,8 @@ MCSymbol *TargetLoweringObjectFileMachO::getCFIPersonalitySymbol( } const MCExpr *TargetLoweringObjectFileMachO::getIndirectSymViaGOTPCRel( - const MCSymbol *Sym, const MCValue &MV, int64_t Offset, - MachineModuleInfo *MMI, MCStreamer &Streamer) const { + const GlobalValue *GV, const MCSymbol *Sym, const MCValue &MV, + int64_t Offset, MachineModuleInfo *MMI, MCStreamer &Streamer) const { // Although MachO 32-bit targets do not explicitly have a GOTPCREL relocation // as 64-bit do, we replace the GOT equivalent by accessing the final symbol // through a non_lazy_ptr stub instead. One advantage is that it allows the @@ -1165,12 +1169,10 @@ const MCExpr *TargetLoweringObjectFileMachO::getIndirectSymViaGOTPCRel( MCSymbol *Stub = Ctx.getOrCreateSymbol(Name); MachineModuleInfoImpl::StubValueTy &StubSym = MachOMMI.getGVStubEntry(Stub); - if (!StubSym.getPointer()) { - bool IsIndirectLocal = Sym->isDefined() && !Sym->isExternal(); - // With the assumption that IsIndirectLocal == GV->hasLocalLinkage(). + + if (!StubSym.getPointer()) StubSym = MachineModuleInfoImpl::StubValueTy(const_cast(Sym), - !IsIndirectLocal); - } + !GV->hasLocalLinkage()); const MCExpr *BSymExpr = MCSymbolRefExpr::create(BaseSym, MCSymbolRefExpr::VK_None, Ctx); @@ -1519,7 +1521,8 @@ static MCSectionCOFF *getCOFFStaticStructorSection(MCContext &Ctx, // internally, so we use ".CRT$XCA00001" for them. SmallString<24> Name; raw_svector_ostream OS(Name); - OS << ".CRT$XC" << (Priority < 200 ? 'A' : 'T') << format("%05u", Priority); + OS << ".CRT$X" << (IsCtor ? "C" : "T") << + (Priority < 200 ? 
'A' : 'T') << format("%05u", Priority); MCSectionCOFF *Sec = Ctx.getCOFFSection( Name, COFF::IMAGE_SCN_CNT_INITIALIZED_DATA | COFF::IMAGE_SCN_MEM_READ, SectionKind::getReadOnly()); @@ -1595,7 +1598,8 @@ const MCExpr *TargetLoweringObjectFileCOFF::lowerRelativeReference( static std::string APIntToHexString(const APInt &AI) { unsigned Width = (AI.getBitWidth() / 8) * 2; - std::string HexString = utohexstr(AI.getLimitedValue(), /*LowerCase=*/true); + std::string HexString = AI.toString(16, /*Signed=*/false); + transform(HexString.begin(), HexString.end(), HexString.begin(), tolower); unsigned Size = HexString.size(); assert(Width >= Size && "hex string is too large!"); HexString.insert(HexString.begin(), Width - Size, '0'); @@ -1819,3 +1823,82 @@ MCSection *TargetLoweringObjectFileWasm::getStaticDtorSection( llvm_unreachable("@llvm.global_dtors should have been lowered already"); return nullptr; } + +//===----------------------------------------------------------------------===// +// XCOFF +//===----------------------------------------------------------------------===// +MCSection *TargetLoweringObjectFileXCOFF::getExplicitSectionGlobal( + const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const { + report_fatal_error("XCOFF explicit sections not yet implemented."); +} + +MCSection *TargetLoweringObjectFileXCOFF::SelectSectionForGlobal( + const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const { + assert(!TM.getFunctionSections() && !TM.getDataSections() && + "XCOFF unique sections not yet implemented."); + + // Common symbols go into a csect with matching name which will get mapped + // into the .bss section. + if (Kind.isBSSLocal() || Kind.isCommon()) { + SmallString<128> Name; + getNameWithPrefix(Name, GO, TM); + XCOFF::StorageClass SC = + TargetLoweringObjectFileXCOFF::getStorageClassForGlobal(GO); + return getContext().getXCOFFSection( + Name, Kind.isBSSLocal() ? 
XCOFF::XMC_BS : XCOFF::XMC_RW, XCOFF::XTY_CM, + SC, Kind, /* BeginSymbolName */ nullptr); + } + + if (Kind.isText()) + return TextSection; + + if (Kind.isData()) + return DataSection; + + report_fatal_error("XCOFF other section types not yet implemented."); +} + +bool TargetLoweringObjectFileXCOFF::shouldPutJumpTableInFunctionSection( + bool UsesLabelDifference, const Function &F) const { + report_fatal_error("TLOF XCOFF not yet implemented."); +} + +void TargetLoweringObjectFileXCOFF::Initialize(MCContext &Ctx, + const TargetMachine &TgtM) { + TargetLoweringObjectFile::Initialize(Ctx, TgtM); + TTypeEncoding = 0; + PersonalityEncoding = 0; + LSDAEncoding = 0; +} + +MCSection *TargetLoweringObjectFileXCOFF::getStaticCtorSection( + unsigned Priority, const MCSymbol *KeySym) const { + report_fatal_error("XCOFF ctor section not yet implemented."); +} + +MCSection *TargetLoweringObjectFileXCOFF::getStaticDtorSection( + unsigned Priority, const MCSymbol *KeySym) const { + report_fatal_error("XCOFF dtor section not yet implemented."); +} + +const MCExpr *TargetLoweringObjectFileXCOFF::lowerRelativeReference( + const GlobalValue *LHS, const GlobalValue *RHS, + const TargetMachine &TM) const { + report_fatal_error("XCOFF not yet implemented."); +} + +XCOFF::StorageClass TargetLoweringObjectFileXCOFF::getStorageClassForGlobal( + const GlobalObject *GO) { + switch (GO->getLinkage()) { + case GlobalValue::InternalLinkage: + return XCOFF::C_HIDEXT; + case GlobalValue::ExternalLinkage: + case GlobalValue::CommonLinkage: + return XCOFF::C_EXT; + case GlobalValue::ExternalWeakLinkage: + return XCOFF::C_WEAKEXT; + default: + report_fatal_error( + "Unhandled linkage when mapping linkage to StorageClass."); + } +} diff --git a/lib/CodeGen/TargetPassConfig.cpp b/lib/CodeGen/TargetPassConfig.cpp index 36df02692f86..f1f4f65adf7c 100644 --- a/lib/CodeGen/TargetPassConfig.cpp +++ b/lib/CodeGen/TargetPassConfig.cpp @@ -49,9 +49,10 @@ using namespace llvm; -cl::opt EnableIPRA("enable-ipra", cl::init(false), cl::Hidden, - cl::desc("Enable interprocedural register allocation " - "to reduce load/store at procedure calls.")); +static cl::opt + EnableIPRA("enable-ipra", cl::init(false), cl::Hidden, + cl::desc("Enable interprocedural register allocation " + "to reduce load/store at procedure calls.")); static cl::opt DisablePostRASched("disable-post-ra", cl::Hidden, cl::desc("Disable Post Regalloc Scheduler")); static cl::opt DisableBranchFold("disable-branch-fold", cl::Hidden, @@ -152,8 +153,10 @@ static cl::opt EnableGlobalISelAbort( // substitutePass(&PostRASchedulerID, &PostMachineSchedulerID). // Targets can return true in targetSchedulesPostRAScheduling() and // insert a PostRA scheduling pass wherever it wants. -cl::opt MISchedPostRA("misched-postra", cl::Hidden, - cl::desc("Run MachineScheduler post regalloc (independent of preRA sched)")); +static cl::opt MISchedPostRA( + "misched-postra", cl::Hidden, + cl::desc( + "Run MachineScheduler post regalloc (independent of preRA sched)")); // Experimental option to run live interval analysis early. static cl::opt EarlyLiveIntervals("early-live-intervals", cl::Hidden, @@ -175,10 +178,10 @@ static cl::opt UseCFLAA( /// Option names for limiting the codegen pipeline. /// Those are used in error reporting and we didn't want /// to duplicate their names all over the place. 
-const char *StartAfterOptName = "start-after"; -const char *StartBeforeOptName = "start-before"; -const char *StopAfterOptName = "stop-after"; -const char *StopBeforeOptName = "stop-before"; +static const char *StartAfterOptName = "start-after"; +static const char *StartBeforeOptName = "start-before"; +static const char *StopAfterOptName = "stop-after"; +static const char *StopBeforeOptName = "stop-before"; static cl::opt StartAfterOpt(StringRef(StartAfterOptName), @@ -654,6 +657,7 @@ void TargetPassConfig::addIRPasses() { // TODO: add a pass insertion point here addPass(createGCLoweringPass()); addPass(createShadowStackGCLoweringPass()); + addPass(createLowerConstantIntrinsicsPass()); // Make sure that no unreachable blocks are instruction selected. addPass(createUnreachableBlockEliminationPass()); @@ -1231,5 +1235,5 @@ bool TargetPassConfig::isGISelCSEEnabled() const { } std::unique_ptr TargetPassConfig::getCSEConfig() const { - return make_unique(); + return std::make_unique(); } diff --git a/lib/CodeGen/TargetRegisterInfo.cpp b/lib/CodeGen/TargetRegisterInfo.cpp index f1b2ecf3243b..e5592c31098a 100644 --- a/lib/CodeGen/TargetRegisterInfo.cpp +++ b/lib/CodeGen/TargetRegisterInfo.cpp @@ -86,22 +86,21 @@ bool TargetRegisterInfo::checkAllSuperRegsMarked(const BitVector &RegisterSet, namespace llvm { -Printable printReg(unsigned Reg, const TargetRegisterInfo *TRI, +Printable printReg(Register Reg, const TargetRegisterInfo *TRI, unsigned SubIdx, const MachineRegisterInfo *MRI) { return Printable([Reg, TRI, SubIdx, MRI](raw_ostream &OS) { if (!Reg) OS << "$noreg"; - else if (TargetRegisterInfo::isStackSlot(Reg)) - OS << "SS#" << TargetRegisterInfo::stackSlot2Index(Reg); - else if (TargetRegisterInfo::isVirtualRegister(Reg)) { + else if (Register::isStackSlot(Reg)) + OS << "SS#" << Register::stackSlot2Index(Reg); + else if (Register::isVirtualRegister(Reg)) { StringRef Name = MRI ? MRI->getVRegName(Reg) : ""; if (Name != "") { OS << '%' << Name; } else { - OS << '%' << TargetRegisterInfo::virtReg2Index(Reg); + OS << '%' << Register::virtReg2Index(Reg); } - } - else if (!TRI) + } else if (!TRI) OS << '$' << "physreg" << Reg; else if (Reg < TRI->getNumRegs()) { OS << '$'; @@ -143,8 +142,8 @@ Printable printRegUnit(unsigned Unit, const TargetRegisterInfo *TRI) { Printable printVRegOrUnit(unsigned Unit, const TargetRegisterInfo *TRI) { return Printable([Unit, TRI](raw_ostream &OS) { - if (TRI && TRI->isVirtualRegister(Unit)) { - OS << '%' << TargetRegisterInfo::virtReg2Index(Unit); + if (Register::isVirtualRegister(Unit)) { + OS << '%' << Register::virtReg2Index(Unit); } else { OS << printRegUnit(Unit, TRI); } @@ -189,7 +188,8 @@ TargetRegisterInfo::getAllocatableClass(const TargetRegisterClass *RC) const { /// the right type that contains this physreg. const TargetRegisterClass * TargetRegisterInfo::getMinimalPhysRegClass(unsigned reg, MVT VT) const { - assert(isPhysicalRegister(reg) && "reg must be a physical register"); + assert(Register::isPhysicalRegister(reg) && + "reg must be a physical register"); // Pick the most sub register class of the right type that contains // this physreg. 
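[Editorial sketch, not part of the patch] The printReg change above classifies a register purely from its numeric value through Register's static predicates (stack slot, virtual, physical). As a rough mental model only, assuming a tag-bit encoding that this patch does not itself define, such a classification could look like:

// Hypothetical sketch; the flag value and names are illustrative assumptions.
namespace sketch {
constexpr unsigned VirtualRegFlag = 1u << 31;             // assumed tag bit
inline bool isVirtualReg(unsigned Reg) { return (Reg & VirtualRegFlag) != 0; }
inline unsigned virtRegIndex(unsigned Reg) { return Reg & ~VirtualRegFlag; }
} // namespace sketch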
@@ -238,24 +238,16 @@ BitVector TargetRegisterInfo::getAllocatableSet(const MachineFunction &MF, static inline const TargetRegisterClass *firstCommonClass(const uint32_t *A, const uint32_t *B, - const TargetRegisterInfo *TRI, - const MVT::SimpleValueType SVT = - MVT::SimpleValueType::Any) { - const MVT VT(SVT); + const TargetRegisterInfo *TRI) { for (unsigned I = 0, E = TRI->getNumRegClasses(); I < E; I += 32) - if (unsigned Common = *A++ & *B++) { - const TargetRegisterClass *RC = - TRI->getRegClass(I + countTrailingZeros(Common)); - if (SVT == MVT::SimpleValueType::Any || TRI->isTypeLegalForClass(*RC, VT)) - return RC; - } + if (unsigned Common = *A++ & *B++) + return TRI->getRegClass(I + countTrailingZeros(Common)); return nullptr; } const TargetRegisterClass * TargetRegisterInfo::getCommonSubClass(const TargetRegisterClass *A, - const TargetRegisterClass *B, - const MVT::SimpleValueType SVT) const { + const TargetRegisterClass *B) const { // First take care of the trivial cases. if (A == B) return A; @@ -264,7 +256,7 @@ TargetRegisterInfo::getCommonSubClass(const TargetRegisterClass *A, // Register classes are ordered topologically, so the largest common // sub-class it the common sub-class with the smallest ID. - return firstCommonClass(A->getSubClassMask(), B->getSubClassMask(), this, SVT); + return firstCommonClass(A->getSubClassMask(), B->getSubClassMask(), this); } const TargetRegisterClass * @@ -409,7 +401,7 @@ TargetRegisterInfo::getRegAllocationHints(unsigned VirtReg, // Target-independent hints are either a physical or a virtual register. unsigned Phys = Reg; - if (VRM && isVirtualRegister(Phys)) + if (VRM && Register::isVirtualRegister(Phys)) Phys = VRM->getPhys(Phys); // Don't add the same reg twice (Hints_MRI may contain multiple virtual @@ -417,7 +409,7 @@ TargetRegisterInfo::getRegAllocationHints(unsigned VirtReg, if (!HintedRegs.insert(Phys).second) continue; // Check that Phys is a valid hint in VirtReg's register class. - if (!isPhysicalRegister(Phys)) + if (!Register::isPhysicalRegister(Phys)) continue; if (MRI.isReserved(Phys)) continue; @@ -433,6 +425,20 @@ TargetRegisterInfo::getRegAllocationHints(unsigned VirtReg, return false; } +bool TargetRegisterInfo::isCalleeSavedPhysReg( + unsigned PhysReg, const MachineFunction &MF) const { + if (PhysReg == 0) + return false; + const uint32_t *callerPreservedRegs = + getCallPreservedMask(MF, MF.getFunction().getCallingConv()); + if (callerPreservedRegs) { + assert(Register::isPhysicalRegister(PhysReg) && + "Expected physical register"); + return (callerPreservedRegs[PhysReg / 32] >> PhysReg % 32) & 1; + } + return false; +} + bool TargetRegisterInfo::canRealignStack(const MachineFunction &MF) const { return !MF.getFunction().hasFnAttribute("no-realign-stack"); } @@ -466,7 +472,7 @@ bool TargetRegisterInfo::regmaskSubsetEqual(const uint32_t *mask0, unsigned TargetRegisterInfo::getRegSizeInBits(unsigned Reg, const MachineRegisterInfo &MRI) const { const TargetRegisterClass *RC{}; - if (isPhysicalRegister(Reg)) { + if (Register::isPhysicalRegister(Reg)) { // The size is not directly available for physical registers. // Instead, we need to access a register class that contains Reg and // get the size of that register class. 
@@ -501,7 +507,7 @@ TargetRegisterInfo::lookThruCopyLike(unsigned SrcReg, CopySrcReg = MI->getOperand(2).getReg(); } - if (!isVirtualRegister(CopySrcReg)) + if (!Register::isVirtualRegister(CopySrcReg)) return CopySrcReg; SrcReg = CopySrcReg; diff --git a/lib/CodeGen/TargetSchedule.cpp b/lib/CodeGen/TargetSchedule.cpp index 195279719ad4..ce59452fd1b8 100644 --- a/lib/CodeGen/TargetSchedule.cpp +++ b/lib/CodeGen/TargetSchedule.cpp @@ -300,7 +300,7 @@ computeOutputLatency(const MachineInstr *DefMI, unsigned DefOperIdx, // TODO: The following hack exists because predication passes do not // correctly append imp-use operands, and readsReg() strangely returns false // for predicated defs. - unsigned Reg = DefMI->getOperand(DefOperIdx).getReg(); + Register Reg = DefMI->getOperand(DefOperIdx).getReg(); const MachineFunction &MF = *DefMI->getMF(); const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); if (!DepMI->readsRegister(Reg, TRI) && TII->isPredicated(*DepMI)) diff --git a/lib/CodeGen/TwoAddressInstructionPass.cpp b/lib/CodeGen/TwoAddressInstructionPass.cpp index 43d876646967..ea971809d4e4 100644 --- a/lib/CodeGen/TwoAddressInstructionPass.cpp +++ b/lib/CodeGen/TwoAddressInstructionPass.cpp @@ -230,7 +230,7 @@ sink3AddrInstruction(MachineInstr *MI, unsigned SavedReg, for (const MachineOperand &MO : MI->operands()) { if (!MO.isReg()) continue; - unsigned MOReg = MO.getReg(); + Register MOReg = MO.getReg(); if (!MOReg) continue; if (MO.isUse() && MOReg != SavedReg) @@ -299,7 +299,7 @@ sink3AddrInstruction(MachineInstr *MI, unsigned SavedReg, MachineOperand &MO = OtherMI.getOperand(i); if (!MO.isReg()) continue; - unsigned MOReg = MO.getReg(); + Register MOReg = MO.getReg(); if (!MOReg) continue; if (DefReg == MOReg) @@ -418,8 +418,8 @@ static bool isCopyToReg(MachineInstr &MI, const TargetInstrInfo *TII, } else return false; - IsSrcPhys = TargetRegisterInfo::isPhysicalRegister(SrcReg); - IsDstPhys = TargetRegisterInfo::isPhysicalRegister(DstReg); + IsSrcPhys = Register::isPhysicalRegister(SrcReg); + IsDstPhys = Register::isPhysicalRegister(DstReg); return true; } @@ -427,8 +427,7 @@ static bool isCopyToReg(MachineInstr &MI, const TargetInstrInfo *TII, /// given instruction, is killed by the given instruction. static bool isPlainlyKilled(MachineInstr *MI, unsigned Reg, LiveIntervals *LIS) { - if (LIS && TargetRegisterInfo::isVirtualRegister(Reg) && - !LIS->isNotInMIMap(*MI)) { + if (LIS && Register::isVirtualRegister(Reg) && !LIS->isNotInMIMap(*MI)) { // FIXME: Sometimes tryInstructionTransform() will add instructions and // test whether they can be folded before keeping them. In this case it // sets a kill before recursively calling tryInstructionTransform() again. @@ -475,12 +474,12 @@ static bool isKilled(MachineInstr &MI, unsigned Reg, MachineInstr *DefMI = &MI; while (true) { // All uses of physical registers are likely to be kills. 
- if (TargetRegisterInfo::isPhysicalRegister(Reg) && + if (Register::isPhysicalRegister(Reg) && (allowFalsePositives || MRI->hasOneUse(Reg))) return true; if (!isPlainlyKilled(DefMI, Reg, LIS)) return false; - if (TargetRegisterInfo::isPhysicalRegister(Reg)) + if (Register::isPhysicalRegister(Reg)) return true; MachineRegisterInfo::def_iterator Begin = MRI->def_begin(Reg); // If there are multiple defs, we can't do a simple analysis, so just @@ -536,7 +535,7 @@ MachineInstr *findOnlyInterestingUse(unsigned Reg, MachineBasicBlock *MBB, } IsDstPhys = false; if (isTwoAddrUse(UseMI, Reg, DstReg)) { - IsDstPhys = TargetRegisterInfo::isPhysicalRegister(DstReg); + IsDstPhys = Register::isPhysicalRegister(DstReg); return &UseMI; } return nullptr; @@ -546,13 +545,13 @@ MachineInstr *findOnlyInterestingUse(unsigned Reg, MachineBasicBlock *MBB, /// to. static unsigned getMappedReg(unsigned Reg, DenseMap &RegMap) { - while (TargetRegisterInfo::isVirtualRegister(Reg)) { + while (Register::isVirtualRegister(Reg)) { DenseMap::iterator SI = RegMap.find(Reg); if (SI == RegMap.end()) return 0; Reg = SI->second; } - if (TargetRegisterInfo::isPhysicalRegister(Reg)) + if (Register::isPhysicalRegister(Reg)) return Reg; return 0; } @@ -683,7 +682,7 @@ bool TwoAddressInstructionPass::commuteInstruction(MachineInstr *MI, unsigned RegBIdx, unsigned RegCIdx, unsigned Dist) { - unsigned RegC = MI->getOperand(RegCIdx).getReg(); + Register RegC = MI->getOperand(RegCIdx).getReg(); LLVM_DEBUG(dbgs() << "2addr: COMMUTING : " << *MI); MachineInstr *NewMI = TII->commuteInstruction(*MI, false, RegBIdx, RegCIdx); @@ -700,7 +699,7 @@ bool TwoAddressInstructionPass::commuteInstruction(MachineInstr *MI, // Update source register map. unsigned FromRegC = getMappedReg(RegC, SrcRegMap); if (FromRegC) { - unsigned RegA = MI->getOperand(DstIdx).getReg(); + Register RegA = MI->getOperand(DstIdx).getReg(); SrcRegMap[RegA] = FromRegC; } @@ -911,7 +910,7 @@ rescheduleMIBelowKill(MachineBasicBlock::iterator &mi, for (const MachineOperand &MO : MI->operands()) { if (!MO.isReg()) continue; - unsigned MOReg = MO.getReg(); + Register MOReg = MO.getReg(); if (!MOReg) continue; if (MO.isDef()) @@ -955,7 +954,7 @@ rescheduleMIBelowKill(MachineBasicBlock::iterator &mi, for (const MachineOperand &MO : OtherMI.operands()) { if (!MO.isReg()) continue; - unsigned MOReg = MO.getReg(); + Register MOReg = MO.getReg(); if (!MOReg) continue; if (MO.isDef()) { @@ -1093,7 +1092,7 @@ rescheduleKillAboveMI(MachineBasicBlock::iterator &mi, for (const MachineOperand &MO : KillMI->operands()) { if (!MO.isReg()) continue; - unsigned MOReg = MO.getReg(); + Register MOReg = MO.getReg(); if (MO.isUse()) { if (!MOReg) continue; @@ -1105,7 +1104,7 @@ rescheduleKillAboveMI(MachineBasicBlock::iterator &mi, Uses.insert(MOReg); if (isKill && MOReg != Reg) Kills.insert(MOReg); - } else if (TargetRegisterInfo::isPhysicalRegister(MOReg)) { + } else if (Register::isPhysicalRegister(MOReg)) { Defs.insert(MOReg); if (!MO.isDead()) LiveDefs.insert(MOReg); @@ -1130,7 +1129,7 @@ rescheduleKillAboveMI(MachineBasicBlock::iterator &mi, for (const MachineOperand &MO : OtherMI.operands()) { if (!MO.isReg()) continue; - unsigned MOReg = MO.getReg(); + Register MOReg = MO.getReg(); if (!MOReg) continue; if (MO.isUse()) { @@ -1154,8 +1153,7 @@ rescheduleKillAboveMI(MachineBasicBlock::iterator &mi, unsigned MOReg = OtherDefs[i]; if (Uses.count(MOReg)) return false; - if (TargetRegisterInfo::isPhysicalRegister(MOReg) && - LiveDefs.count(MOReg)) + if (Register::isPhysicalRegister(MOReg) && 
LiveDefs.count(MOReg)) return false; // Physical register def is seen. Defs.erase(MOReg); @@ -1208,8 +1206,8 @@ bool TwoAddressInstructionPass::tryInstructionCommute(MachineInstr *MI, return false; bool MadeChange = false; - unsigned DstOpReg = MI->getOperand(DstOpIdx).getReg(); - unsigned BaseOpReg = MI->getOperand(BaseOpIdx).getReg(); + Register DstOpReg = MI->getOperand(DstOpIdx).getReg(); + Register BaseOpReg = MI->getOperand(BaseOpIdx).getReg(); unsigned OpsNum = MI->getDesc().getNumOperands(); unsigned OtherOpIdx = MI->getDesc().getNumDefs(); for (; OtherOpIdx < OpsNum; OtherOpIdx++) { @@ -1221,7 +1219,7 @@ bool TwoAddressInstructionPass::tryInstructionCommute(MachineInstr *MI, !TII->findCommutedOpIndices(*MI, BaseOpIdx, OtherOpIdx)) continue; - unsigned OtherOpReg = MI->getOperand(OtherOpIdx).getReg(); + Register OtherOpReg = MI->getOperand(OtherOpIdx).getReg(); bool AggressiveCommute = false; // If OtherOp dies but BaseOp does not, swap the OtherOp and BaseOp @@ -1276,14 +1274,14 @@ tryInstructionTransform(MachineBasicBlock::iterator &mi, return false; MachineInstr &MI = *mi; - unsigned regA = MI.getOperand(DstIdx).getReg(); - unsigned regB = MI.getOperand(SrcIdx).getReg(); + Register regA = MI.getOperand(DstIdx).getReg(); + Register regB = MI.getOperand(SrcIdx).getReg(); - assert(TargetRegisterInfo::isVirtualRegister(regB) && + assert(Register::isVirtualRegister(regB) && "cannot make instruction into two-address form"); bool regBKilled = isKilled(MI, regB, MRI, TII, LIS, true); - if (TargetRegisterInfo::isVirtualRegister(regA)) + if (Register::isVirtualRegister(regA)) scanUses(regA); bool Commuted = tryInstructionCommute(&MI, DstIdx, SrcIdx, regBKilled, Dist); @@ -1363,7 +1361,7 @@ tryInstructionTransform(MachineBasicBlock::iterator &mi, const TargetRegisterClass *RC = TRI->getAllocatableClass( TII->getRegClass(UnfoldMCID, LoadRegIndex, TRI, *MF)); - unsigned Reg = MRI->createVirtualRegister(RC); + Register Reg = MRI->createVirtualRegister(RC); SmallVector NewMIs; if (!TII->unfoldMemoryOperand(*MF, MI, Reg, /*UnfoldLoad=*/true, @@ -1399,8 +1397,7 @@ tryInstructionTransform(MachineBasicBlock::iterator &mi, if (LV) { for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { MachineOperand &MO = MI.getOperand(i); - if (MO.isReg() && - TargetRegisterInfo::isVirtualRegister(MO.getReg())) { + if (MO.isReg() && Register::isVirtualRegister(MO.getReg())) { if (MO.isUse()) { if (MO.isKill()) { if (NewMIs[0]->killsRegister(MO.getReg())) @@ -1474,8 +1471,8 @@ collectTiedOperands(MachineInstr *MI, TiedOperandMap &TiedOperands) { AnyOps = true; MachineOperand &SrcMO = MI->getOperand(SrcIdx); MachineOperand &DstMO = MI->getOperand(DstIdx); - unsigned SrcReg = SrcMO.getReg(); - unsigned DstReg = DstMO.getReg(); + Register SrcReg = SrcMO.getReg(); + Register DstReg = DstMO.getReg(); // Tied constraint already satisfied? if (SrcReg == DstReg) continue; @@ -1485,7 +1482,7 @@ collectTiedOperands(MachineInstr *MI, TiedOperandMap &TiedOperands) { // Deal with undef uses immediately - simply rewrite the src operand. if (SrcMO.isUndef() && !DstMO.getSubReg()) { // Constrain the DstReg register class if required. 
- if (TargetRegisterInfo::isVirtualRegister(DstReg)) + if (Register::isVirtualRegister(DstReg)) if (const TargetRegisterClass *RC = TII->getRegClass(MCID, SrcIdx, TRI, *MF)) MRI->constrainRegClass(DstReg, RC); @@ -1522,7 +1519,7 @@ TwoAddressInstructionPass::processTiedPairs(MachineInstr *MI, unsigned DstIdx = TiedPairs[tpi].second; const MachineOperand &DstMO = MI->getOperand(DstIdx); - unsigned RegA = DstMO.getReg(); + Register RegA = DstMO.getReg(); // Grab RegB from the instruction because it may have changed if the // instruction was commuted. @@ -1538,7 +1535,7 @@ TwoAddressInstructionPass::processTiedPairs(MachineInstr *MI, } LastCopiedReg = RegA; - assert(TargetRegisterInfo::isVirtualRegister(RegB) && + assert(Register::isVirtualRegister(RegB) && "cannot make instruction into two-address form"); #ifndef NDEBUG @@ -1559,14 +1556,13 @@ TwoAddressInstructionPass::processTiedPairs(MachineInstr *MI, MIB.addReg(RegB, 0, SubRegB); const TargetRegisterClass *RC = MRI->getRegClass(RegB); if (SubRegB) { - if (TargetRegisterInfo::isVirtualRegister(RegA)) { + if (Register::isVirtualRegister(RegA)) { assert(TRI->getMatchingSuperRegClass(RC, MRI->getRegClass(RegA), SubRegB) && "tied subregister must be a truncation"); // The superreg class will not be used to constrain the subreg class. RC = nullptr; - } - else { + } else { assert(TRI->getMatchingSuperReg(RegA, SubRegB, MRI->getRegClass(RegB)) && "tied subregister must be a truncation"); } @@ -1581,7 +1577,7 @@ TwoAddressInstructionPass::processTiedPairs(MachineInstr *MI, if (LIS) { LastCopyIdx = LIS->InsertMachineInstrInMaps(*PrevMI).getRegSlot(); - if (TargetRegisterInfo::isVirtualRegister(RegA)) { + if (Register::isVirtualRegister(RegA)) { LiveInterval &LI = LIS->getInterval(RegA); VNInfo *VNI = LI.getNextValue(LastCopyIdx, LIS->getVNInfoAllocator()); SlotIndex endIdx = @@ -1601,8 +1597,7 @@ TwoAddressInstructionPass::processTiedPairs(MachineInstr *MI, } // Make sure regA is a legal regclass for the SrcIdx operand. 
- if (TargetRegisterInfo::isVirtualRegister(RegA) && - TargetRegisterInfo::isVirtualRegister(RegB)) + if (Register::isVirtualRegister(RegA) && Register::isVirtualRegister(RegB)) MRI->constrainRegClass(RegA, RC); MO.setReg(RegA); // The getMatchingSuper asserts guarantee that the register class projected @@ -1744,8 +1739,8 @@ bool TwoAddressInstructionPass::runOnMachineFunction(MachineFunction &Func) { if (TiedPairs.size() == 1) { unsigned SrcIdx = TiedPairs[0].first; unsigned DstIdx = TiedPairs[0].second; - unsigned SrcReg = mi->getOperand(SrcIdx).getReg(); - unsigned DstReg = mi->getOperand(DstIdx).getReg(); + Register SrcReg = mi->getOperand(SrcIdx).getReg(); + Register DstReg = mi->getOperand(DstIdx).getReg(); if (SrcReg != DstReg && tryInstructionTransform(mi, nmi, SrcIdx, DstIdx, Dist, false)) { // The tied operands have been eliminated or shifted further down @@ -1803,9 +1798,8 @@ bool TwoAddressInstructionPass::runOnMachineFunction(MachineFunction &Func) { void TwoAddressInstructionPass:: eliminateRegSequence(MachineBasicBlock::iterator &MBBI) { MachineInstr &MI = *MBBI; - unsigned DstReg = MI.getOperand(0).getReg(); - if (MI.getOperand(0).getSubReg() || - TargetRegisterInfo::isPhysicalRegister(DstReg) || + Register DstReg = MI.getOperand(0).getReg(); + if (MI.getOperand(0).getSubReg() || Register::isPhysicalRegister(DstReg) || !(MI.getNumOperands() & 1)) { LLVM_DEBUG(dbgs() << "Illegal REG_SEQUENCE instruction:" << MI); llvm_unreachable(nullptr); @@ -1821,7 +1815,7 @@ eliminateRegSequence(MachineBasicBlock::iterator &MBBI) { bool DefEmitted = false; for (unsigned i = 1, e = MI.getNumOperands(); i < e; i += 2) { MachineOperand &UseMO = MI.getOperand(i); - unsigned SrcReg = UseMO.getReg(); + Register SrcReg = UseMO.getReg(); unsigned SubIdx = MI.getOperand(i+1).getImm(); // Nothing needs to be inserted for undef operands. if (UseMO.isUndef()) @@ -1855,7 +1849,7 @@ eliminateRegSequence(MachineBasicBlock::iterator &MBBI) { DefEmitted = true; // Update LiveVariables' kill info. - if (LV && isKill && !TargetRegisterInfo::isPhysicalRegister(SrcReg)) + if (LV && isKill && !Register::isPhysicalRegister(SrcReg)) LV->replaceKillInstruction(SrcReg, MI, *CopyMI); LLVM_DEBUG(dbgs() << "Inserted: " << *CopyMI); diff --git a/lib/CodeGen/UnreachableBlockElim.cpp b/lib/CodeGen/UnreachableBlockElim.cpp index 177bab32bccc..3289eff71336 100644 --- a/lib/CodeGen/UnreachableBlockElim.cpp +++ b/lib/CodeGen/UnreachableBlockElim.cpp @@ -103,7 +103,8 @@ bool UnreachableMachineBlockElim::runOnMachineFunction(MachineFunction &F) { df_iterator_default_set Reachable; bool ModifiedPHI = false; - MMI = getAnalysisIfAvailable(); + auto *MMIWP = getAnalysisIfAvailable(); + MMI = MMIWP ? &MMIWP->getMMI() : nullptr; MachineDominatorTree *MDT = getAnalysisIfAvailable(); MachineLoopInfo *MLI = getAnalysisIfAvailable(); @@ -146,8 +147,14 @@ bool UnreachableMachineBlockElim::runOnMachineFunction(MachineFunction &F) { } // Actually remove the blocks now. - for (unsigned i = 0, e = DeadBlocks.size(); i != e; ++i) + for (unsigned i = 0, e = DeadBlocks.size(); i != e; ++i) { + // Remove any call site information for calls in the block. + for (auto &I : DeadBlocks[i]->instrs()) + if (I.isCall(MachineInstr::IgnoreBundle)) + DeadBlocks[i]->getParent()->eraseCallSiteInfo(&I); + DeadBlocks[i]->eraseFromParent(); + } // Cleanup PHI nodes. 
for (MachineFunction::iterator I = F.begin(), E = F.end(); I != E; ++I) { @@ -167,8 +174,8 @@ bool UnreachableMachineBlockElim::runOnMachineFunction(MachineFunction &F) { if (phi->getNumOperands() == 3) { const MachineOperand &Input = phi->getOperand(1); const MachineOperand &Output = phi->getOperand(0); - unsigned InputReg = Input.getReg(); - unsigned OutputReg = Output.getReg(); + Register InputReg = Input.getReg(); + Register OutputReg = Output.getReg(); assert(Output.getSubReg() == 0 && "Cannot have output subregister"); ModifiedPHI = true; diff --git a/lib/CodeGen/ValueTypes.cpp b/lib/CodeGen/ValueTypes.cpp index a911cdcbec9d..73b862d51c0f 100644 --- a/lib/CodeGen/ValueTypes.cpp +++ b/lib/CodeGen/ValueTypes.cpp @@ -115,8 +115,8 @@ std::string EVT::getEVTString() const { switch (V.SimpleTy) { default: if (isVector()) - return "v" + utostr(getVectorNumElements()) + - getVectorElementType().getEVTString(); + return (isScalableVector() ? "nxv" : "v") + utostr(getVectorNumElements()) + + getVectorElementType().getEVTString(); if (isInteger()) return "i" + utostr(getSizeInBits()); llvm_unreachable("Invalid EVT!"); @@ -144,6 +144,7 @@ std::string EVT::getEVTString() const { case MVT::v32i1: return "v32i1"; case MVT::v64i1: return "v64i1"; case MVT::v128i1: return "v128i1"; + case MVT::v256i1: return "v256i1"; case MVT::v512i1: return "v512i1"; case MVT::v1024i1: return "v1024i1"; case MVT::v1i8: return "v1i8"; @@ -157,6 +158,7 @@ std::string EVT::getEVTString() const { case MVT::v256i8: return "v256i8"; case MVT::v1i16: return "v1i16"; case MVT::v2i16: return "v2i16"; + case MVT::v3i16: return "v3i16"; case MVT::v4i16: return "v4i16"; case MVT::v8i16: return "v8i16"; case MVT::v16i16: return "v16i16"; @@ -187,8 +189,11 @@ std::string EVT::getEVTString() const { case MVT::v1f32: return "v1f32"; case MVT::v2f32: return "v2f32"; case MVT::v2f16: return "v2f16"; + case MVT::v3f16: return "v3f16"; case MVT::v4f16: return "v4f16"; case MVT::v8f16: return "v8f16"; + case MVT::v16f16: return "v16f16"; + case MVT::v32f16: return "v32f16"; case MVT::v3f32: return "v3f32"; case MVT::v4f32: return "v4f32"; case MVT::v5f32: return "v5f32"; @@ -205,6 +210,48 @@ std::string EVT::getEVTString() const { case MVT::v2f64: return "v2f64"; case MVT::v4f64: return "v4f64"; case MVT::v8f64: return "v8f64"; + case MVT::nxv1i1: return "nxv1i1"; + case MVT::nxv2i1: return "nxv2i1"; + case MVT::nxv4i1: return "nxv4i1"; + case MVT::nxv8i1: return "nxv8i1"; + case MVT::nxv16i1: return "nxv16i1"; + case MVT::nxv32i1: return "nxv32i1"; + case MVT::nxv1i8: return "nxv1i8"; + case MVT::nxv2i8: return "nxv2i8"; + case MVT::nxv4i8: return "nxv4i8"; + case MVT::nxv8i8: return "nxv8i8"; + case MVT::nxv16i8: return "nxv16i8"; + case MVT::nxv32i8: return "nxv32i8"; + case MVT::nxv1i16: return "nxv1i16"; + case MVT::nxv2i16: return "nxv2i16"; + case MVT::nxv4i16: return "nxv4i16"; + case MVT::nxv8i16: return "nxv8i16"; + case MVT::nxv16i16:return "nxv16i16"; + case MVT::nxv32i16:return "nxv32i16"; + case MVT::nxv1i32: return "nxv1i32"; + case MVT::nxv2i32: return "nxv2i32"; + case MVT::nxv4i32: return "nxv4i32"; + case MVT::nxv8i32: return "nxv8i32"; + case MVT::nxv16i32:return "nxv16i32"; + case MVT::nxv32i32:return "nxv32i32"; + case MVT::nxv1i64: return "nxv1i64"; + case MVT::nxv2i64: return "nxv2i64"; + case MVT::nxv4i64: return "nxv4i64"; + case MVT::nxv8i64: return "nxv8i64"; + case MVT::nxv16i64:return "nxv16i64"; + case MVT::nxv32i64:return "nxv32i64"; + case MVT::nxv2f16: return "nxv2f16"; + case MVT::nxv4f16: return 
"nxv4f16"; + case MVT::nxv8f16: return "nxv8f16"; + case MVT::nxv1f32: return "nxv1f32"; + case MVT::nxv2f32: return "nxv2f32"; + case MVT::nxv4f32: return "nxv4f32"; + case MVT::nxv8f32: return "nxv8f32"; + case MVT::nxv16f32:return "nxv16f32"; + case MVT::nxv1f64: return "nxv1f64"; + case MVT::nxv2f64: return "nxv2f64"; + case MVT::nxv4f64: return "nxv4f64"; + case MVT::nxv8f64: return "nxv8f64"; case MVT::Metadata:return "Metadata"; case MVT::Untyped: return "Untyped"; case MVT::exnref : return "exnref"; @@ -241,6 +288,7 @@ Type *EVT::getTypeForEVT(LLVMContext &Context) const { case MVT::v32i1: return VectorType::get(Type::getInt1Ty(Context), 32); case MVT::v64i1: return VectorType::get(Type::getInt1Ty(Context), 64); case MVT::v128i1: return VectorType::get(Type::getInt1Ty(Context), 128); + case MVT::v256i1: return VectorType::get(Type::getInt1Ty(Context), 256); case MVT::v512i1: return VectorType::get(Type::getInt1Ty(Context), 512); case MVT::v1024i1: return VectorType::get(Type::getInt1Ty(Context), 1024); case MVT::v1i8: return VectorType::get(Type::getInt8Ty(Context), 1); @@ -254,6 +302,7 @@ Type *EVT::getTypeForEVT(LLVMContext &Context) const { case MVT::v256i8: return VectorType::get(Type::getInt8Ty(Context), 256); case MVT::v1i16: return VectorType::get(Type::getInt16Ty(Context), 1); case MVT::v2i16: return VectorType::get(Type::getInt16Ty(Context), 2); + case MVT::v3i16: return VectorType::get(Type::getInt16Ty(Context), 3); case MVT::v4i16: return VectorType::get(Type::getInt16Ty(Context), 4); case MVT::v8i16: return VectorType::get(Type::getInt16Ty(Context), 8); case MVT::v16i16: return VectorType::get(Type::getInt16Ty(Context), 16); @@ -282,8 +331,11 @@ Type *EVT::getTypeForEVT(LLVMContext &Context) const { case MVT::v32i64: return VectorType::get(Type::getInt64Ty(Context), 32); case MVT::v1i128: return VectorType::get(Type::getInt128Ty(Context), 1); case MVT::v2f16: return VectorType::get(Type::getHalfTy(Context), 2); + case MVT::v3f16: return VectorType::get(Type::getHalfTy(Context), 3); case MVT::v4f16: return VectorType::get(Type::getHalfTy(Context), 4); case MVT::v8f16: return VectorType::get(Type::getHalfTy(Context), 8); + case MVT::v16f16: return VectorType::get(Type::getHalfTy(Context), 16); + case MVT::v32f16: return VectorType::get(Type::getHalfTy(Context), 32); case MVT::v1f32: return VectorType::get(Type::getFloatTy(Context), 1); case MVT::v2f32: return VectorType::get(Type::getFloatTy(Context), 2); case MVT::v3f32: return VectorType::get(Type::getFloatTy(Context), 3); @@ -302,8 +354,92 @@ Type *EVT::getTypeForEVT(LLVMContext &Context) const { case MVT::v2f64: return VectorType::get(Type::getDoubleTy(Context), 2); case MVT::v4f64: return VectorType::get(Type::getDoubleTy(Context), 4); case MVT::v8f64: return VectorType::get(Type::getDoubleTy(Context), 8); + case MVT::nxv1i1: + return VectorType::get(Type::getInt1Ty(Context), 1, /*Scalable=*/ true); + case MVT::nxv2i1: + return VectorType::get(Type::getInt1Ty(Context), 2, /*Scalable=*/ true); + case MVT::nxv4i1: + return VectorType::get(Type::getInt1Ty(Context), 4, /*Scalable=*/ true); + case MVT::nxv8i1: + return VectorType::get(Type::getInt1Ty(Context), 8, /*Scalable=*/ true); + case MVT::nxv16i1: + return VectorType::get(Type::getInt1Ty(Context), 16, /*Scalable=*/ true); + case MVT::nxv32i1: + return VectorType::get(Type::getInt1Ty(Context), 32, /*Scalable=*/ true); + case MVT::nxv1i8: + return VectorType::get(Type::getInt8Ty(Context), 1, /*Scalable=*/ true); + case MVT::nxv2i8: + return 
VectorType::get(Type::getInt8Ty(Context), 2, /*Scalable=*/ true); + case MVT::nxv4i8: + return VectorType::get(Type::getInt8Ty(Context), 4, /*Scalable=*/ true); + case MVT::nxv8i8: + return VectorType::get(Type::getInt8Ty(Context), 8, /*Scalable=*/ true); + case MVT::nxv16i8: + return VectorType::get(Type::getInt8Ty(Context), 16, /*Scalable=*/ true); + case MVT::nxv32i8: + return VectorType::get(Type::getInt8Ty(Context), 32, /*Scalable=*/ true); + case MVT::nxv1i16: + return VectorType::get(Type::getInt16Ty(Context), 1, /*Scalable=*/ true); + case MVT::nxv2i16: + return VectorType::get(Type::getInt16Ty(Context), 2, /*Scalable=*/ true); + case MVT::nxv4i16: + return VectorType::get(Type::getInt16Ty(Context), 4, /*Scalable=*/ true); + case MVT::nxv8i16: + return VectorType::get(Type::getInt16Ty(Context), 8, /*Scalable=*/ true); + case MVT::nxv16i16: + return VectorType::get(Type::getInt16Ty(Context), 16, /*Scalable=*/ true); + case MVT::nxv32i16: + return VectorType::get(Type::getInt16Ty(Context), 32, /*Scalable=*/ true); + case MVT::nxv1i32: + return VectorType::get(Type::getInt32Ty(Context), 1, /*Scalable=*/ true); + case MVT::nxv2i32: + return VectorType::get(Type::getInt32Ty(Context), 2, /*Scalable=*/ true); + case MVT::nxv4i32: + return VectorType::get(Type::getInt32Ty(Context), 4, /*Scalable=*/ true); + case MVT::nxv8i32: + return VectorType::get(Type::getInt32Ty(Context), 8, /*Scalable=*/ true); + case MVT::nxv16i32: + return VectorType::get(Type::getInt32Ty(Context), 16,/*Scalable=*/ true); + case MVT::nxv32i32: + return VectorType::get(Type::getInt32Ty(Context), 32,/*Scalable=*/ true); + case MVT::nxv1i64: + return VectorType::get(Type::getInt64Ty(Context), 1, /*Scalable=*/ true); + case MVT::nxv2i64: + return VectorType::get(Type::getInt64Ty(Context), 2, /*Scalable=*/ true); + case MVT::nxv4i64: + return VectorType::get(Type::getInt64Ty(Context), 4, /*Scalable=*/ true); + case MVT::nxv8i64: + return VectorType::get(Type::getInt64Ty(Context), 8, /*Scalable=*/ true); + case MVT::nxv16i64: + return VectorType::get(Type::getInt64Ty(Context), 16, /*Scalable=*/ true); + case MVT::nxv32i64: + return VectorType::get(Type::getInt64Ty(Context), 32, /*Scalable=*/ true); + case MVT::nxv2f16: + return VectorType::get(Type::getHalfTy(Context), 2, /*Scalable=*/ true); + case MVT::nxv4f16: + return VectorType::get(Type::getHalfTy(Context), 4, /*Scalable=*/ true); + case MVT::nxv8f16: + return VectorType::get(Type::getHalfTy(Context), 8, /*Scalable=*/ true); + case MVT::nxv1f32: + return VectorType::get(Type::getFloatTy(Context), 1, /*Scalable=*/ true); + case MVT::nxv2f32: + return VectorType::get(Type::getFloatTy(Context), 2, /*Scalable=*/ true); + case MVT::nxv4f32: + return VectorType::get(Type::getFloatTy(Context), 4, /*Scalable=*/ true); + case MVT::nxv8f32: + return VectorType::get(Type::getFloatTy(Context), 8, /*Scalable=*/ true); + case MVT::nxv16f32: + return VectorType::get(Type::getFloatTy(Context), 16, /*Scalable=*/ true); + case MVT::nxv1f64: + return VectorType::get(Type::getDoubleTy(Context), 1, /*Scalable=*/ true); + case MVT::nxv2f64: + return VectorType::get(Type::getDoubleTy(Context), 2, /*Scalable=*/ true); + case MVT::nxv4f64: + return VectorType::get(Type::getDoubleTy(Context), 4, /*Scalable=*/ true); + case MVT::nxv8f64: + return VectorType::get(Type::getDoubleTy(Context), 8, /*Scalable=*/ true); case MVT::Metadata: return Type::getMetadataTy(Context); - } + } } /// Return the value type corresponding to the specified type. 
This returns all @@ -329,7 +465,8 @@ MVT MVT::getVT(Type *Ty, bool HandleUnknown){ case Type::VectorTyID: { VectorType *VTy = cast(Ty); return getVectorVT( - getVT(VTy->getElementType(), false), VTy->getNumElements()); + getVT(VTy->getElementType(), /*HandleUnknown=*/ false), + VTy->getElementCount()); } } } @@ -345,8 +482,9 @@ EVT EVT::getEVT(Type *Ty, bool HandleUnknown){ return getIntegerVT(Ty->getContext(), cast(Ty)->getBitWidth()); case Type::VectorTyID: { VectorType *VTy = cast(Ty); - return getVectorVT(Ty->getContext(), getEVT(VTy->getElementType(), false), - VTy->getNumElements()); + return getVectorVT(Ty->getContext(), + getEVT(VTy->getElementType(), /*HandleUnknown=*/ false), + VTy->getElementCount()); } } } diff --git a/lib/CodeGen/VirtRegMap.cpp b/lib/CodeGen/VirtRegMap.cpp index 4a06704a8876..5312e2eea96b 100644 --- a/lib/CodeGen/VirtRegMap.cpp +++ b/lib/CodeGen/VirtRegMap.cpp @@ -80,15 +80,14 @@ void VirtRegMap::grow() { Virt2SplitMap.resize(NumRegs); } -void VirtRegMap::assignVirt2Phys(unsigned virtReg, MCPhysReg physReg) { - assert(TargetRegisterInfo::isVirtualRegister(virtReg) && - TargetRegisterInfo::isPhysicalRegister(physReg)); - assert(Virt2PhysMap[virtReg] == NO_PHYS_REG && +void VirtRegMap::assignVirt2Phys(Register virtReg, MCPhysReg physReg) { + assert(virtReg.isVirtual() && Register::isPhysicalRegister(physReg)); + assert(Virt2PhysMap[virtReg.id()] == NO_PHYS_REG && "attempt to assign physical register to already mapped " "virtual register"); assert(!getRegInfo().isReserved(physReg) && "Attempt to map virtReg to a reserved physReg"); - Virt2PhysMap[virtReg] = physReg; + Virt2PhysMap[virtReg.id()] = physReg; } unsigned VirtRegMap::createSpillSlot(const TargetRegisterClass *RC) { @@ -99,46 +98,46 @@ unsigned VirtRegMap::createSpillSlot(const TargetRegisterClass *RC) { return SS; } -bool VirtRegMap::hasPreferredPhys(unsigned VirtReg) { - unsigned Hint = MRI->getSimpleHint(VirtReg); - if (!Hint) +bool VirtRegMap::hasPreferredPhys(Register VirtReg) { + Register Hint = MRI->getSimpleHint(VirtReg); + if (!Hint.isValid()) return false; - if (TargetRegisterInfo::isVirtualRegister(Hint)) + if (Hint.isVirtual()) Hint = getPhys(Hint); return getPhys(VirtReg) == Hint; } -bool VirtRegMap::hasKnownPreference(unsigned VirtReg) { +bool VirtRegMap::hasKnownPreference(Register VirtReg) { std::pair Hint = MRI->getRegAllocationHint(VirtReg); - if (TargetRegisterInfo::isPhysicalRegister(Hint.second)) + if (Register::isPhysicalRegister(Hint.second)) return true; - if (TargetRegisterInfo::isVirtualRegister(Hint.second)) + if (Register::isVirtualRegister(Hint.second)) return hasPhys(Hint.second); return false; } -int VirtRegMap::assignVirt2StackSlot(unsigned virtReg) { - assert(TargetRegisterInfo::isVirtualRegister(virtReg)); - assert(Virt2StackSlotMap[virtReg] == NO_STACK_SLOT && +int VirtRegMap::assignVirt2StackSlot(Register virtReg) { + assert(virtReg.isVirtual()); + assert(Virt2StackSlotMap[virtReg.id()] == NO_STACK_SLOT && "attempt to assign stack slot to already spilled register"); const TargetRegisterClass* RC = MF->getRegInfo().getRegClass(virtReg); - return Virt2StackSlotMap[virtReg] = createSpillSlot(RC); + return Virt2StackSlotMap[virtReg.id()] = createSpillSlot(RC); } -void VirtRegMap::assignVirt2StackSlot(unsigned virtReg, int SS) { - assert(TargetRegisterInfo::isVirtualRegister(virtReg)); - assert(Virt2StackSlotMap[virtReg] == NO_STACK_SLOT && +void VirtRegMap::assignVirt2StackSlot(Register virtReg, int SS) { + assert(virtReg.isVirtual()); + 
assert(Virt2StackSlotMap[virtReg.id()] == NO_STACK_SLOT && "attempt to assign stack slot to already spilled register"); assert((SS >= 0 || (SS >= MF->getFrameInfo().getObjectIndexBegin())) && "illegal fixed frame index"); - Virt2StackSlotMap[virtReg] = SS; + Virt2StackSlotMap[virtReg.id()] = SS; } void VirtRegMap::print(raw_ostream &OS, const Module*) const { OS << "********** REGISTER MAP **********\n"; for (unsigned i = 0, e = MRI->getNumVirtRegs(); i != e; ++i) { - unsigned Reg = TargetRegisterInfo::index2VirtReg(i); + unsigned Reg = Register::index2VirtReg(i); if (Virt2PhysMap[Reg] != (unsigned)VirtRegMap::NO_PHYS_REG) { OS << '[' << printReg(Reg, TRI) << " -> " << printReg(Virt2PhysMap[Reg], TRI) << "] " @@ -147,7 +146,7 @@ void VirtRegMap::print(raw_ostream &OS, const Module*) const { } for (unsigned i = 0, e = MRI->getNumVirtRegs(); i != e; ++i) { - unsigned Reg = TargetRegisterInfo::index2VirtReg(i); + unsigned Reg = Register::index2VirtReg(i); if (Virt2StackSlotMap[Reg] != VirtRegMap::NO_STACK_SLOT) { OS << '[' << printReg(Reg, TRI) << " -> fi#" << Virt2StackSlotMap[Reg] << "] " << TRI->getRegClassName(MRI->getRegClass(Reg)) << "\n"; @@ -185,10 +184,10 @@ class VirtRegRewriter : public MachineFunctionPass { void rewrite(); void addMBBLiveIns(); bool readsUndefSubreg(const MachineOperand &MO) const; - void addLiveInsForSubRanges(const LiveInterval &LI, unsigned PhysReg) const; + void addLiveInsForSubRanges(const LiveInterval &LI, Register PhysReg) const; void handleIdentityCopy(MachineInstr &MI) const; void expandCopyBundle(MachineInstr &MI) const; - bool subRegLiveThrough(const MachineInstr &MI, unsigned SuperPhysReg) const; + bool subRegLiveThrough(const MachineInstr &MI, Register SuperPhysReg) const; public: static char ID; @@ -265,7 +264,7 @@ bool VirtRegRewriter::runOnMachineFunction(MachineFunction &fn) { } void VirtRegRewriter::addLiveInsForSubRanges(const LiveInterval &LI, - unsigned PhysReg) const { + Register PhysReg) const { assert(!LI.empty()); assert(LI.hasSubRanges()); @@ -312,7 +311,7 @@ void VirtRegRewriter::addLiveInsForSubRanges(const LiveInterval &LI, // assignments. void VirtRegRewriter::addMBBLiveIns() { for (unsigned Idx = 0, IdxE = MRI->getNumVirtRegs(); Idx != IdxE; ++Idx) { - unsigned VirtReg = TargetRegisterInfo::index2VirtReg(Idx); + Register VirtReg = Register::index2VirtReg(Idx); if (MRI->reg_nodbg_empty(VirtReg)) continue; LiveInterval &LI = LIS->getInterval(VirtReg); @@ -320,7 +319,7 @@ void VirtRegRewriter::addMBBLiveIns() { continue; // This is a virtual register that is live across basic blocks. Its // assigned PhysReg must be marked as live-in to those blocks. - unsigned PhysReg = VRM->getPhys(VirtReg); + Register PhysReg = VRM->getPhys(VirtReg); assert(PhysReg != VirtRegMap::NO_PHYS_REG && "Unmapped virtual register."); if (LI.hasSubRanges()) { @@ -353,7 +352,7 @@ bool VirtRegRewriter::readsUndefSubreg(const MachineOperand &MO) const { if (MO.isUndef()) return true; - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); const LiveInterval &LI = LIS->getInterval(Reg); const MachineInstr &MI = *MO.getParent(); SlotIndex BaseIndex = LIS->getInstructionIndex(MI); @@ -469,7 +468,7 @@ void VirtRegRewriter::expandCopyBundle(MachineInstr &MI) const { /// \pre \p MI defines a subregister of a virtual register that /// has been assigned to \p SuperPhysReg. 
bool VirtRegRewriter::subRegLiveThrough(const MachineInstr &MI, - unsigned SuperPhysReg) const { + Register SuperPhysReg) const { SlotIndex MIIndex = LIS->getInstructionIndex(MI); SlotIndex BeforeMIUses = MIIndex.getBaseIndex(); SlotIndex AfterMIDefs = MIIndex.getBoundaryIndex(); @@ -493,9 +492,9 @@ bool VirtRegRewriter::subRegLiveThrough(const MachineInstr &MI, void VirtRegRewriter::rewrite() { bool NoSubRegLiveness = !MRI->subRegLivenessEnabled(); - SmallVector SuperDeads; - SmallVector SuperDefs; - SmallVector SuperKills; + SmallVector SuperDeads; + SmallVector SuperDefs; + SmallVector SuperKills; for (MachineFunction::iterator MBBI = MF->begin(), MBBE = MF->end(); MBBI != MBBE; ++MBBI) { @@ -513,10 +512,10 @@ void VirtRegRewriter::rewrite() { if (MO.isRegMask()) MRI->addPhysRegsUsedFromRegMask(MO.getRegMask()); - if (!MO.isReg() || !TargetRegisterInfo::isVirtualRegister(MO.getReg())) + if (!MO.isReg() || !MO.getReg().isVirtual()) continue; - unsigned VirtReg = MO.getReg(); - unsigned PhysReg = VRM->getPhys(VirtReg); + Register VirtReg = MO.getReg(); + Register PhysReg = VRM->getPhys(VirtReg); assert(PhysReg != VirtRegMap::NO_PHYS_REG && "Instruction uses unmapped VirtReg"); assert(!MRI->isReserved(PhysReg) && "Reserved register assignment"); @@ -562,7 +561,7 @@ void VirtRegRewriter::rewrite() { // PhysReg operands cannot have subregister indexes. PhysReg = TRI->getSubReg(PhysReg, SubReg); - assert(PhysReg && "Invalid SubReg for physical register"); + assert(PhysReg.isValid() && "Invalid SubReg for physical register"); MO.setSubReg(0); } // Rewrite. Note we could have used MachineOperand::substPhysReg(), but diff --git a/lib/CodeGen/XRayInstrumentation.cpp b/lib/CodeGen/XRayInstrumentation.cpp index 19c59e9542b4..119c3fd1ec7f 100644 --- a/lib/CodeGen/XRayInstrumentation.cpp +++ b/lib/CodeGen/XRayInstrumentation.cpp @@ -111,7 +111,7 @@ void XRayInstrumentation::replaceRetWithPatchableRet( MIB.add(MO); Terminators.push_back(&T); if (T.isCall()) - MF.updateCallSiteInfo(&T); + MF.eraseCallSiteInfo(&T); } } } diff --git a/lib/DebugInfo/CodeView/CVTypeVisitor.cpp b/lib/DebugInfo/CodeView/CVTypeVisitor.cpp index ec4773d571c8..dd6f75f97a4a 100644 --- a/lib/DebugInfo/CodeView/CVTypeVisitor.cpp +++ b/lib/DebugInfo/CodeView/CVTypeVisitor.cpp @@ -209,14 +209,6 @@ struct VisitHelper { } } - VisitHelper(TypeVisitorCallbackPipeline &Callbacks, VisitorDataSource Source) - : Visitor((Source == VDS_BytesPresent) ? 
Pipeline : Callbacks) { - if (Source == VDS_BytesPresent) { - Pipeline = Callbacks; - Pipeline.addCallbackToPipelineFront(Deserializer); - } - } - TypeDeserializer Deserializer; TypeVisitorCallbackPipeline Pipeline; CVTypeVisitor Visitor; @@ -230,13 +222,6 @@ Error llvm::codeview::visitTypeRecord(CVType &Record, TypeIndex Index, return V.Visitor.visitTypeRecord(Record, Index); } -Error llvm::codeview::visitTypeRecord(CVType &Record, TypeIndex Index, - TypeVisitorCallbackPipeline &Callbacks, - VisitorDataSource Source) { - VisitHelper V(Callbacks, Source); - return V.Visitor.visitTypeRecord(Record, Index); -} - Error llvm::codeview::visitTypeRecord(CVType &Record, TypeVisitorCallbacks &Callbacks, VisitorDataSource Source) { diff --git a/lib/DebugInfo/CodeView/CodeViewRecordIO.cpp b/lib/DebugInfo/CodeView/CodeViewRecordIO.cpp index 2f49474115a1..36a384baa13d 100644 --- a/lib/DebugInfo/CodeView/CodeViewRecordIO.cpp +++ b/lib/DebugInfo/CodeView/CodeViewRecordIO.cpp @@ -20,7 +20,6 @@ Error CodeViewRecordIO::beginRecord(Optional MaxLength) { Limit.MaxLength = MaxLength; Limit.BeginOffset = getCurrentOffset(); Limits.push_back(Limit); - resetStreamedLen(); return Error::success(); } @@ -50,6 +49,7 @@ Error CodeViewRecordIO::endRecord() { Streamer->EmitBytes(BytesSR); --PaddingBytes; } + resetStreamedLen(); } return Error::success(); } @@ -126,7 +126,11 @@ Error CodeViewRecordIO::mapByteVectorTail(std::vector &Bytes, Error CodeViewRecordIO::mapInteger(TypeIndex &TypeInd, const Twine &Comment) { if (isStreaming()) { - emitComment(Comment); + std::string TypeNameStr = Streamer->getTypeName(TypeInd); + if (!TypeNameStr.empty()) + emitComment(Comment + ": " + TypeNameStr); + else + emitComment(Comment); Streamer->EmitIntValue(TypeInd.getIndex(), sizeof(TypeInd.getIndex())); incrStreamedLen(sizeof(TypeInd.getIndex())); } else if (isWriting()) { diff --git a/lib/DebugInfo/CodeView/EnumTables.cpp b/lib/DebugInfo/CodeView/EnumTables.cpp index 54e68ae4ea9f..82f6713a88f5 100644 --- a/lib/DebugInfo/CodeView/EnumTables.cpp +++ b/lib/DebugInfo/CodeView/EnumTables.cpp @@ -300,6 +300,128 @@ static const EnumEntry CV_ENUM_ENT(COFF, IMAGE_SCN_MEM_READ), CV_ENUM_ENT(COFF, IMAGE_SCN_MEM_WRITE)}; +static const EnumEntry ClassOptionNames[] = { + CV_ENUM_CLASS_ENT(ClassOptions, Packed), + CV_ENUM_CLASS_ENT(ClassOptions, HasConstructorOrDestructor), + CV_ENUM_CLASS_ENT(ClassOptions, HasOverloadedOperator), + CV_ENUM_CLASS_ENT(ClassOptions, Nested), + CV_ENUM_CLASS_ENT(ClassOptions, ContainsNestedClass), + CV_ENUM_CLASS_ENT(ClassOptions, HasOverloadedAssignmentOperator), + CV_ENUM_CLASS_ENT(ClassOptions, HasConversionOperator), + CV_ENUM_CLASS_ENT(ClassOptions, ForwardReference), + CV_ENUM_CLASS_ENT(ClassOptions, Scoped), + CV_ENUM_CLASS_ENT(ClassOptions, HasUniqueName), + CV_ENUM_CLASS_ENT(ClassOptions, Sealed), + CV_ENUM_CLASS_ENT(ClassOptions, Intrinsic), +}; + +static const EnumEntry MemberAccessNames[] = { + CV_ENUM_CLASS_ENT(MemberAccess, None), + CV_ENUM_CLASS_ENT(MemberAccess, Private), + CV_ENUM_CLASS_ENT(MemberAccess, Protected), + CV_ENUM_CLASS_ENT(MemberAccess, Public), +}; + +static const EnumEntry MethodOptionNames[] = { + CV_ENUM_CLASS_ENT(MethodOptions, Pseudo), + CV_ENUM_CLASS_ENT(MethodOptions, NoInherit), + CV_ENUM_CLASS_ENT(MethodOptions, NoConstruct), + CV_ENUM_CLASS_ENT(MethodOptions, CompilerGenerated), + CV_ENUM_CLASS_ENT(MethodOptions, Sealed), +}; + +static const EnumEntry MemberKindNames[] = { + CV_ENUM_CLASS_ENT(MethodKind, Vanilla), + CV_ENUM_CLASS_ENT(MethodKind, Virtual), + 
CV_ENUM_CLASS_ENT(MethodKind, Static), + CV_ENUM_CLASS_ENT(MethodKind, Friend), + CV_ENUM_CLASS_ENT(MethodKind, IntroducingVirtual), + CV_ENUM_CLASS_ENT(MethodKind, PureVirtual), + CV_ENUM_CLASS_ENT(MethodKind, PureIntroducingVirtual), +}; + +static const EnumEntry PtrKindNames[] = { + CV_ENUM_CLASS_ENT(PointerKind, Near16), + CV_ENUM_CLASS_ENT(PointerKind, Far16), + CV_ENUM_CLASS_ENT(PointerKind, Huge16), + CV_ENUM_CLASS_ENT(PointerKind, BasedOnSegment), + CV_ENUM_CLASS_ENT(PointerKind, BasedOnValue), + CV_ENUM_CLASS_ENT(PointerKind, BasedOnSegmentValue), + CV_ENUM_CLASS_ENT(PointerKind, BasedOnAddress), + CV_ENUM_CLASS_ENT(PointerKind, BasedOnSegmentAddress), + CV_ENUM_CLASS_ENT(PointerKind, BasedOnType), + CV_ENUM_CLASS_ENT(PointerKind, BasedOnSelf), + CV_ENUM_CLASS_ENT(PointerKind, Near32), + CV_ENUM_CLASS_ENT(PointerKind, Far32), + CV_ENUM_CLASS_ENT(PointerKind, Near64), +}; + +static const EnumEntry PtrModeNames[] = { + CV_ENUM_CLASS_ENT(PointerMode, Pointer), + CV_ENUM_CLASS_ENT(PointerMode, LValueReference), + CV_ENUM_CLASS_ENT(PointerMode, PointerToDataMember), + CV_ENUM_CLASS_ENT(PointerMode, PointerToMemberFunction), + CV_ENUM_CLASS_ENT(PointerMode, RValueReference), +}; + +static const EnumEntry PtrMemberRepNames[] = { + CV_ENUM_CLASS_ENT(PointerToMemberRepresentation, Unknown), + CV_ENUM_CLASS_ENT(PointerToMemberRepresentation, SingleInheritanceData), + CV_ENUM_CLASS_ENT(PointerToMemberRepresentation, MultipleInheritanceData), + CV_ENUM_CLASS_ENT(PointerToMemberRepresentation, VirtualInheritanceData), + CV_ENUM_CLASS_ENT(PointerToMemberRepresentation, GeneralData), + CV_ENUM_CLASS_ENT(PointerToMemberRepresentation, SingleInheritanceFunction), + CV_ENUM_CLASS_ENT(PointerToMemberRepresentation, + MultipleInheritanceFunction), + CV_ENUM_CLASS_ENT(PointerToMemberRepresentation, + VirtualInheritanceFunction), + CV_ENUM_CLASS_ENT(PointerToMemberRepresentation, GeneralFunction), +}; + +static const EnumEntry TypeModifierNames[] = { + CV_ENUM_CLASS_ENT(ModifierOptions, Const), + CV_ENUM_CLASS_ENT(ModifierOptions, Volatile), + CV_ENUM_CLASS_ENT(ModifierOptions, Unaligned), +}; + +static const EnumEntry CallingConventions[] = { + CV_ENUM_CLASS_ENT(CallingConvention, NearC), + CV_ENUM_CLASS_ENT(CallingConvention, FarC), + CV_ENUM_CLASS_ENT(CallingConvention, NearPascal), + CV_ENUM_CLASS_ENT(CallingConvention, FarPascal), + CV_ENUM_CLASS_ENT(CallingConvention, NearFast), + CV_ENUM_CLASS_ENT(CallingConvention, FarFast), + CV_ENUM_CLASS_ENT(CallingConvention, NearStdCall), + CV_ENUM_CLASS_ENT(CallingConvention, FarStdCall), + CV_ENUM_CLASS_ENT(CallingConvention, NearSysCall), + CV_ENUM_CLASS_ENT(CallingConvention, FarSysCall), + CV_ENUM_CLASS_ENT(CallingConvention, ThisCall), + CV_ENUM_CLASS_ENT(CallingConvention, MipsCall), + CV_ENUM_CLASS_ENT(CallingConvention, Generic), + CV_ENUM_CLASS_ENT(CallingConvention, AlphaCall), + CV_ENUM_CLASS_ENT(CallingConvention, PpcCall), + CV_ENUM_CLASS_ENT(CallingConvention, SHCall), + CV_ENUM_CLASS_ENT(CallingConvention, ArmCall), + CV_ENUM_CLASS_ENT(CallingConvention, AM33Call), + CV_ENUM_CLASS_ENT(CallingConvention, TriCall), + CV_ENUM_CLASS_ENT(CallingConvention, SH5Call), + CV_ENUM_CLASS_ENT(CallingConvention, M32RCall), + CV_ENUM_CLASS_ENT(CallingConvention, ClrCall), + CV_ENUM_CLASS_ENT(CallingConvention, Inline), + CV_ENUM_CLASS_ENT(CallingConvention, NearVector), +}; + +static const EnumEntry FunctionOptionEnum[] = { + CV_ENUM_CLASS_ENT(FunctionOptions, CxxReturnUdt), + CV_ENUM_CLASS_ENT(FunctionOptions, Constructor), + 
CV_ENUM_CLASS_ENT(FunctionOptions, ConstructorWithVirtualBases), +}; + +static const EnumEntry LabelTypeEnum[] = { + CV_ENUM_CLASS_ENT(LabelType, Near), + CV_ENUM_CLASS_ENT(LabelType, Far), +}; + namespace llvm { namespace codeview { @@ -379,5 +501,49 @@ getImageSectionCharacteristicNames() { return makeArrayRef(ImageSectionCharacteristicNames); } +ArrayRef> getClassOptionNames() { + return makeArrayRef(ClassOptionNames); +} + +ArrayRef> getMemberAccessNames() { + return makeArrayRef(MemberAccessNames); +} + +ArrayRef> getMethodOptionNames() { + return makeArrayRef(MethodOptionNames); +} + +ArrayRef> getMemberKindNames() { + return makeArrayRef(MemberKindNames); +} + +ArrayRef> getPtrKindNames() { + return makeArrayRef(PtrKindNames); +} + +ArrayRef> getPtrModeNames() { + return makeArrayRef(PtrModeNames); +} + +ArrayRef> getPtrMemberRepNames() { + return makeArrayRef(PtrMemberRepNames); +} + +ArrayRef> getTypeModifierNames() { + return makeArrayRef(TypeModifierNames); +} + +ArrayRef> getCallingConventions() { + return makeArrayRef(CallingConventions); +} + +ArrayRef> getFunctionOptionEnum() { + return makeArrayRef(FunctionOptionEnum); +} + +ArrayRef> getLabelTypeEnum() { + return makeArrayRef(LabelTypeEnum); +} + } // end namespace codeview } // end namespace llvm diff --git a/lib/DebugInfo/CodeView/SymbolDumper.cpp b/lib/DebugInfo/CodeView/SymbolDumper.cpp index 27cb7e35234b..45b63983beb4 100644 --- a/lib/DebugInfo/CodeView/SymbolDumper.cpp +++ b/lib/DebugInfo/CodeView/SymbolDumper.cpp @@ -315,7 +315,7 @@ Error CVSymbolDumperImpl::visitKnownRecord( Error CVSymbolDumperImpl::visitKnownRecord( CVSymbol &CVR, DefRangeFramePointerRelSym &DefRangeFramePointerRel) { - W.printNumber("Offset", DefRangeFramePointerRel.Offset); + W.printNumber("Offset", DefRangeFramePointerRel.Hdr.Offset); printLocalVariableAddrRange(DefRangeFramePointerRel.Range, DefRangeFramePointerRel.getRelocationOffset()); printLocalVariableAddrGap(DefRangeFramePointerRel.Gaps); diff --git a/lib/DebugInfo/CodeView/SymbolRecordMapping.cpp b/lib/DebugInfo/CodeView/SymbolRecordMapping.cpp index 70889839ef48..3b627930e271 100644 --- a/lib/DebugInfo/CodeView/SymbolRecordMapping.cpp +++ b/lib/DebugInfo/CodeView/SymbolRecordMapping.cpp @@ -229,7 +229,7 @@ Error SymbolRecordMapping::visitKnownRecord(CVSymbol &CVR, DataSym &Data) { Error SymbolRecordMapping::visitKnownRecord( CVSymbol &CVR, DefRangeFramePointerRelSym &DefRangeFramePointerRel) { - error(IO.mapInteger(DefRangeFramePointerRel.Offset)); + error(IO.mapObject(DefRangeFramePointerRel.Hdr.Offset)); error(mapLocalVariableAddrRange(IO, DefRangeFramePointerRel.Range)); error(IO.mapVectorTail(DefRangeFramePointerRel.Gaps, MapGap())); diff --git a/lib/DebugInfo/CodeView/TypeRecordMapping.cpp b/lib/DebugInfo/CodeView/TypeRecordMapping.cpp index 47928c2eef64..1aded589e565 100644 --- a/lib/DebugInfo/CodeView/TypeRecordMapping.cpp +++ b/lib/DebugInfo/CodeView/TypeRecordMapping.cpp @@ -7,24 +7,125 @@ //===----------------------------------------------------------------------===// #include "llvm/DebugInfo/CodeView/TypeRecordMapping.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/DebugInfo/CodeView/EnumTables.h" using namespace llvm; using namespace llvm::codeview; +namespace { + #define error(X) \ if (auto EC = X) \ return EC; -namespace { +static const EnumEntry LeafTypeNames[] = { +#define CV_TYPE(enum, val) {#enum, enum}, +#include "llvm/DebugInfo/CodeView/CodeViewTypes.def" +}; + +static StringRef getLeafTypeName(TypeLeafKind LT) { + switch (LT) { +#define TYPE_RECORD(ename, 
value, name) \ + case ename: \ + return #name; +#include "llvm/DebugInfo/CodeView/CodeViewTypes.def" + default: + break; + } + return "UnknownLeaf"; +} + +template +static bool compEnumNames(const EnumEntry &lhs, const EnumEntry &rhs) { + return lhs.Name < rhs.Name; +} + +template +static std::string getFlagNames(CodeViewRecordIO &IO, T Value, + ArrayRef> Flags) { + if (!IO.isStreaming()) + return std::string(""); + typedef EnumEntry FlagEntry; + typedef SmallVector FlagVector; + FlagVector SetFlags; + for (const auto &Flag : Flags) { + if (Flag.Value == 0) + continue; + if ((Value & Flag.Value) == Flag.Value) { + SetFlags.push_back(Flag); + } + } + + llvm::sort(SetFlags, &compEnumNames); + + std::string FlagLabel; + bool FirstOcc = true; + for (const auto &Flag : SetFlags) { + if (FirstOcc) + FirstOcc = false; + else + FlagLabel += (" | "); + + FlagLabel += (Flag.Name.str() + " (0x" + utohexstr(Flag.Value) + ")"); + } + + if (!FlagLabel.empty()) { + std::string LabelWithBraces(" ( "); + LabelWithBraces += FlagLabel + " )"; + return LabelWithBraces; + } else + return FlagLabel; +} + +template +static StringRef getEnumName(CodeViewRecordIO &IO, T Value, + ArrayRef> EnumValues) { + if (!IO.isStreaming()) + return ""; + StringRef Name; + for (const auto &EnumItem : EnumValues) { + if (EnumItem.Value == Value) { + Name = EnumItem.Name; + break; + } + } + + return Name; +} + +static std::string getMemberAttributes(CodeViewRecordIO &IO, + MemberAccess Access, MethodKind Kind, + MethodOptions Options) { + if (!IO.isStreaming()) + return ""; + std::string AccessSpecifier = + getEnumName(IO, uint8_t(Access), makeArrayRef(getMemberAccessNames())); + std::string MemberAttrs(AccessSpecifier); + if (Kind != MethodKind::Vanilla) { + std::string MethodKind = + getEnumName(IO, unsigned(Kind), makeArrayRef(getMemberKindNames())); + MemberAttrs += ", " + MethodKind; + } + if (Options != MethodOptions::None) { + std::string MethodOptions = getFlagNames( + IO, unsigned(Options), makeArrayRef(getMethodOptionNames())); + MemberAttrs += ", " + MethodOptions; + } + return MemberAttrs; +} + struct MapOneMethodRecord { explicit MapOneMethodRecord(bool IsFromOverloadList) : IsFromOverloadList(IsFromOverloadList) {} Error operator()(CodeViewRecordIO &IO, OneMethodRecord &Method) const { - error(IO.mapInteger(Method.Attrs.Attrs, "AccessSpecifier")); + std::string Attrs = getMemberAttributes( + IO, Method.getAccess(), Method.getMethodKind(), Method.getOptions()); + error(IO.mapInteger(Method.Attrs.Attrs, "Attrs: " + Attrs)); if (IsFromOverloadList) { uint16_t Padding = 0; - error(IO.mapInteger(Padding, "Padding")); + error(IO.mapInteger(Padding)); } error(IO.mapInteger(Method.Type, "Type")); if (Method.isIntroducingVirtual()) { @@ -41,7 +142,7 @@ struct MapOneMethodRecord { private: bool IsFromOverloadList; }; -} +} // namespace static Error mapNameAndUniqueName(CodeViewRecordIO &IO, StringRef &Name, StringRef &UniqueName, bool HasUniqueName) { @@ -96,10 +197,22 @@ Error TypeRecordMapping::visitTypeBegin(CVType &CVR) { MaxLen = MaxRecordLength - sizeof(RecordPrefix); error(IO.beginRecord(MaxLen)); TypeKind = CVR.kind(); + + if (IO.isStreaming()) { + auto RecordKind = CVR.kind(); + uint16_t RecordLen = CVR.length() - 2; + std::string RecordKindName = + getEnumName(IO, unsigned(RecordKind), makeArrayRef(LeafTypeNames)); + error(IO.mapInteger(RecordLen, "Record length")); + error(IO.mapEnum(RecordKind, "Record kind: " + RecordKindName)); + } return Error::success(); } Error TypeRecordMapping::visitTypeBegin(CVType &CVR, 
TypeIndex Index) { + if (IO.isStreaming()) + IO.emitRawComment(" " + getLeafTypeName(CVR.kind()) + " (0x" + + utohexstr(Index.getIndex()) + ")"); return visitTypeBegin(CVR); } @@ -121,11 +234,21 @@ Error TypeRecordMapping::visitMemberBegin(CVMemberRecord &Record) { // followed by the subrecord, followed by a continuation, and that entire // sequence spaws `MaxRecordLength` bytes. So the record's length is // calculated as follows. + constexpr uint32_t ContinuationLength = 8; error(IO.beginRecord(MaxRecordLength - sizeof(RecordPrefix) - ContinuationLength)); MemberKind = Record.Kind; + if (IO.isStreaming()) { + std::string MemberKindName = getLeafTypeName(Record.Kind); + MemberKindName += + " ( " + + (getEnumName(IO, unsigned(Record.Kind), makeArrayRef(LeafTypeNames))) + .str() + + " )"; + error(IO.mapEnum(Record.Kind, "Member kind: " + MemberKindName)); + } return Error::success(); } @@ -144,16 +267,24 @@ Error TypeRecordMapping::visitMemberEnd(CVMemberRecord &Record) { } Error TypeRecordMapping::visitKnownRecord(CVType &CVR, ModifierRecord &Record) { + std::string ModifierNames = + getFlagNames(IO, static_cast(Record.Modifiers), + makeArrayRef(getTypeModifierNames())); error(IO.mapInteger(Record.ModifiedType, "ModifiedType")); - error(IO.mapEnum(Record.Modifiers, "Modifiers")); + error(IO.mapEnum(Record.Modifiers, "Modifiers" + ModifierNames)); return Error::success(); } Error TypeRecordMapping::visitKnownRecord(CVType &CVR, ProcedureRecord &Record) { + std::string CallingConvName = getEnumName( + IO, uint8_t(Record.CallConv), makeArrayRef(getCallingConventions())); + std::string FuncOptionNames = + getFlagNames(IO, static_cast(Record.Options), + makeArrayRef(getFunctionOptionEnum())); error(IO.mapInteger(Record.ReturnType, "ReturnType")); - error(IO.mapEnum(Record.CallConv, "CallingConvention")); - error(IO.mapEnum(Record.Options, "FunctionOptions")); + error(IO.mapEnum(Record.CallConv, "CallingConvention: " + CallingConvName)); + error(IO.mapEnum(Record.Options, "FunctionOptions" + FuncOptionNames)); error(IO.mapInteger(Record.ParameterCount, "NumParameters")); error(IO.mapInteger(Record.ArgumentList, "ArgListType")); @@ -162,11 +293,16 @@ Error TypeRecordMapping::visitKnownRecord(CVType &CVR, Error TypeRecordMapping::visitKnownRecord(CVType &CVR, MemberFunctionRecord &Record) { + std::string CallingConvName = getEnumName( + IO, uint8_t(Record.CallConv), makeArrayRef(getCallingConventions())); + std::string FuncOptionNames = + getFlagNames(IO, static_cast(Record.Options), + makeArrayRef(getFunctionOptionEnum())); error(IO.mapInteger(Record.ReturnType, "ReturnType")); error(IO.mapInteger(Record.ClassType, "ClassType")); error(IO.mapInteger(Record.ThisType, "ThisType")); - error(IO.mapEnum(Record.CallConv, "CallingConvention")); - error(IO.mapEnum(Record.Options, "FunctionOptions")); + error(IO.mapEnum(Record.CallConv, "CallingConvention: " + CallingConvName)); + error(IO.mapEnum(Record.Options, "FunctionOptions" + FuncOptionNames)); error(IO.mapInteger(Record.ParameterCount, "NumParameters")); error(IO.mapInteger(Record.ArgumentList, "ArgListType")); error(IO.mapInteger(Record.ThisPointerAdjustment, "ThisAdjustment")); @@ -197,8 +333,40 @@ Error TypeRecordMapping::visitKnownRecord(CVType &CVR, } Error TypeRecordMapping::visitKnownRecord(CVType &CVR, PointerRecord &Record) { + + SmallString<128> Attr("Attrs: "); + + if (IO.isStreaming()) { + std::string PtrType = getEnumName(IO, unsigned(Record.getPointerKind()), + makeArrayRef(getPtrKindNames())); + Attr += "[ Type: " + PtrType; + + 
std::string PtrMode = getEnumName(IO, unsigned(Record.getMode()), + makeArrayRef(getPtrModeNames())); + Attr += ", Mode: " + PtrMode; + + auto PtrSizeOf = Record.getSize(); + Attr += ", SizeOf: " + itostr(PtrSizeOf); + + if (Record.isFlat()) + Attr += ", isFlat"; + if (Record.isConst()) + Attr += ", isConst"; + if (Record.isVolatile()) + Attr += ", isVolatile"; + if (Record.isUnaligned()) + Attr += ", isUnaligned"; + if (Record.isRestrict()) + Attr += ", isRestricted"; + if (Record.isLValueReferenceThisPtr()) + Attr += ", isThisPtr&"; + if (Record.isRValueReferenceThisPtr()) + Attr += ", isThisPtr&&"; + Attr += " ]"; + } + error(IO.mapInteger(Record.ReferentType, "PointeeType")); - error(IO.mapInteger(Record.Attrs, "Attributes")); + error(IO.mapInteger(Record.Attrs, Attr)); if (Record.isPointerToMember()) { if (IO.isReading()) @@ -206,7 +374,10 @@ Error TypeRecordMapping::visitKnownRecord(CVType &CVR, PointerRecord &Record) { MemberPointerInfo &M = *Record.MemberInfo; error(IO.mapInteger(M.ContainingType, "ClassType")); - error(IO.mapEnum(M.Representation, "Representation")); + std::string PtrMemberGetRepresentation = getEnumName( + IO, uint16_t(M.Representation), makeArrayRef(getPtrMemberRepNames())); + error(IO.mapEnum(M.Representation, + "Representation: " + PtrMemberGetRepresentation)); } return Error::success(); @@ -226,8 +397,11 @@ Error TypeRecordMapping::visitKnownRecord(CVType &CVR, ClassRecord &Record) { (CVR.kind() == TypeLeafKind::LF_CLASS) || (CVR.kind() == TypeLeafKind::LF_INTERFACE)); + std::string PropertiesNames = + getFlagNames(IO, static_cast(Record.Options), + makeArrayRef(getClassOptionNames())); error(IO.mapInteger(Record.MemberCount, "MemberCount")); - error(IO.mapEnum(Record.Options, "Properties")); + error(IO.mapEnum(Record.Options, "Properties" + PropertiesNames)); error(IO.mapInteger(Record.FieldList, "FieldList")); error(IO.mapInteger(Record.DerivationList, "DerivedFrom")); error(IO.mapInteger(Record.VTableShape, "VShape")); @@ -239,8 +413,11 @@ Error TypeRecordMapping::visitKnownRecord(CVType &CVR, ClassRecord &Record) { } Error TypeRecordMapping::visitKnownRecord(CVType &CVR, UnionRecord &Record) { + std::string PropertiesNames = + getFlagNames(IO, static_cast(Record.Options), + makeArrayRef(getClassOptionNames())); error(IO.mapInteger(Record.MemberCount, "MemberCount")); - error(IO.mapEnum(Record.Options, "Properties")); + error(IO.mapEnum(Record.Options, "Properties" + PropertiesNames)); error(IO.mapInteger(Record.FieldList, "FieldList")); error(IO.mapEncodedInteger(Record.Size, "SizeOf")); error(mapNameAndUniqueName(IO, Record.Name, Record.UniqueName, @@ -250,8 +427,11 @@ Error TypeRecordMapping::visitKnownRecord(CVType &CVR, UnionRecord &Record) { } Error TypeRecordMapping::visitKnownRecord(CVType &CVR, EnumRecord &Record) { + std::string PropertiesNames = + getFlagNames(IO, static_cast(Record.Options), + makeArrayRef(getClassOptionNames())); error(IO.mapInteger(Record.MemberCount, "NumEnumerators")); - error(IO.mapEnum(Record.Options, "Properties")); + error(IO.mapEnum(Record.Options, "Properties" + PropertiesNames)); error(IO.mapInteger(Record.UnderlyingType, "UnderlyingType")); error(IO.mapInteger(Record.FieldList, "FieldListType")); error(mapNameAndUniqueName(IO, Record.Name, Record.UniqueName, @@ -383,7 +563,11 @@ Error TypeRecordMapping::visitKnownRecord(CVType &CVR, Error TypeRecordMapping::visitKnownRecord(CVType &CVR, FieldListRecord &Record) { - error(IO.mapByteVectorTail(Record.Data)); + if (IO.isStreaming()) { + if (auto EC = 
codeview::visitMemberRecordStream(Record.Data, *this)) + return EC; + } else + error(IO.mapByteVectorTail(Record.Data)); return Error::success(); } @@ -397,13 +581,17 @@ Error TypeRecordMapping::visitKnownRecord(CVType &CVR, } Error TypeRecordMapping::visitKnownRecord(CVType &CVR, LabelRecord &Record) { - error(IO.mapEnum(Record.Mode, "Mode")); + std::string ModeName = + getEnumName(IO, uint16_t(Record.Mode), makeArrayRef(getLabelTypeEnum())); + error(IO.mapEnum(Record.Mode, "Mode: " + ModeName)); return Error::success(); } Error TypeRecordMapping::visitKnownMember(CVMemberRecord &CVR, BaseClassRecord &Record) { - error(IO.mapInteger(Record.Attrs.Attrs, "AccessSpecifier")); + std::string Attrs = getMemberAttributes( + IO, Record.getAccess(), MethodKind::Vanilla, MethodOptions::None); + error(IO.mapInteger(Record.Attrs.Attrs, "Attrs: " + Attrs)); error(IO.mapInteger(Record.Type, "BaseType")); error(IO.mapEncodedInteger(Record.Offset, "BaseOffset")); @@ -412,7 +600,9 @@ Error TypeRecordMapping::visitKnownMember(CVMemberRecord &CVR, Error TypeRecordMapping::visitKnownMember(CVMemberRecord &CVR, EnumeratorRecord &Record) { - error(IO.mapInteger(Record.Attrs.Attrs)); + std::string Attrs = getMemberAttributes( + IO, Record.getAccess(), MethodKind::Vanilla, MethodOptions::None); + error(IO.mapInteger(Record.Attrs.Attrs, "Attrs: " + Attrs)); // FIXME: Handle full APInt such as __int128. error(IO.mapEncodedInteger(Record.Value, "EnumValue")); @@ -423,7 +613,9 @@ Error TypeRecordMapping::visitKnownMember(CVMemberRecord &CVR, Error TypeRecordMapping::visitKnownMember(CVMemberRecord &CVR, DataMemberRecord &Record) { - error(IO.mapInteger(Record.Attrs.Attrs, "AccessSpecifier")); + std::string Attrs = getMemberAttributes( + IO, Record.getAccess(), MethodKind::Vanilla, MethodOptions::None); + error(IO.mapInteger(Record.Attrs.Attrs, "Attrs: " + Attrs)); error(IO.mapInteger(Record.Type, "Type")); error(IO.mapEncodedInteger(Record.FieldOffset, "FieldOffset")); error(IO.mapStringZ(Record.Name, "Name")); @@ -460,7 +652,9 @@ Error TypeRecordMapping::visitKnownMember(CVMemberRecord &CVR, Error TypeRecordMapping::visitKnownMember(CVMemberRecord &CVR, StaticDataMemberRecord &Record) { - error(IO.mapInteger(Record.Attrs.Attrs, "AccessSpecifier")); + std::string Attrs = getMemberAttributes( + IO, Record.getAccess(), MethodKind::Vanilla, MethodOptions::None); + error(IO.mapInteger(Record.Attrs.Attrs, "Attrs: " + Attrs)); error(IO.mapInteger(Record.Type, "Type")); error(IO.mapStringZ(Record.Name, "Name")); @@ -470,7 +664,9 @@ Error TypeRecordMapping::visitKnownMember(CVMemberRecord &CVR, Error TypeRecordMapping::visitKnownMember(CVMemberRecord &CVR, VirtualBaseClassRecord &Record) { - error(IO.mapInteger(Record.Attrs.Attrs, "AccessSpecifier")); + std::string Attrs = getMemberAttributes( + IO, Record.getAccess(), MethodKind::Vanilla, MethodOptions::None); + error(IO.mapInteger(Record.Attrs.Attrs, "Attrs: " + Attrs)); error(IO.mapInteger(Record.BaseType, "BaseType")); error(IO.mapInteger(Record.VBPtrType, "VBPtrType")); error(IO.mapEncodedInteger(Record.VBPtrOffset, "VBPtrOffset")); diff --git a/lib/DebugInfo/DWARF/DWARFAbbreviationDeclaration.cpp b/lib/DebugInfo/DWARF/DWARFAbbreviationDeclaration.cpp index f4dd79937608..abbea3a868c8 100644 --- a/lib/DebugInfo/DWARF/DWARFAbbreviationDeclaration.cpp +++ b/lib/DebugInfo/DWARF/DWARFAbbreviationDeclaration.cpp @@ -38,9 +38,9 @@ DWARFAbbreviationDeclaration::DWARFAbbreviationDeclaration() { bool DWARFAbbreviationDeclaration::extract(DataExtractor Data, - uint32_t* 
OffsetPtr) { + uint64_t* OffsetPtr) { clear(); - const uint32_t Offset = *OffsetPtr; + const uint64_t Offset = *OffsetPtr; Code = Data.getULEB128(OffsetPtr); if (Code == 0) { return false; @@ -148,7 +148,7 @@ DWARFAbbreviationDeclaration::findAttributeIndex(dwarf::Attribute Attr) const { } Optional DWARFAbbreviationDeclaration::getAttributeValue( - const uint32_t DIEOffset, const dwarf::Attribute Attr, + const uint64_t DIEOffset, const dwarf::Attribute Attr, const DWARFUnit &U) const { Optional MatchAttrIndex = findAttributeIndex(Attr); if (!MatchAttrIndex) @@ -158,7 +158,7 @@ Optional DWARFAbbreviationDeclaration::getAttributeValue( // Add the byte size of ULEB that for the abbrev Code so we can start // skipping the attribute data. - uint32_t Offset = DIEOffset + CodeByteSize; + uint64_t Offset = DIEOffset + CodeByteSize; uint32_t AttrIndex = 0; for (const auto &Spec : AttributeSpecs) { if (*MatchAttrIndex == AttrIndex) { diff --git a/lib/DebugInfo/DWARF/DWARFAcceleratorTable.cpp b/lib/DebugInfo/DWARF/DWARFAcceleratorTable.cpp index 0721efb40f6a..875f5e9989a0 100644 --- a/lib/DebugInfo/DWARF/DWARFAcceleratorTable.cpp +++ b/lib/DebugInfo/DWARF/DWARFAcceleratorTable.cpp @@ -42,7 +42,7 @@ static Atom formatAtom(unsigned Atom) { return {Atom}; } DWARFAcceleratorTable::~DWARFAcceleratorTable() = default; Error AppleAcceleratorTable::extract() { - uint32_t Offset = 0; + uint64_t Offset = 0; // Check that we can at least read the header. if (!AccelSection.isValidOffset(offsetof(Header, HeaderDataLength) + 4)) @@ -111,15 +111,15 @@ bool AppleAcceleratorTable::validateForms() { return true; } -std::pair -AppleAcceleratorTable::readAtoms(uint32_t &HashDataOffset) { - uint32_t DieOffset = dwarf::DW_INVALID_OFFSET; +std::pair +AppleAcceleratorTable::readAtoms(uint64_t *HashDataOffset) { + uint64_t DieOffset = dwarf::DW_INVALID_OFFSET; dwarf::Tag DieTag = dwarf::DW_TAG_null; dwarf::FormParams FormParams = {Hdr.Version, 0, dwarf::DwarfFormat::DWARF32}; for (auto Atom : getAtomsDesc()) { DWARFFormValue FormValue(Atom.second); - FormValue.extractValue(AccelSection, &HashDataOffset, FormParams); + FormValue.extractValue(AccelSection, HashDataOffset, FormParams); switch (Atom.first) { case dwarf::DW_ATOM_die_offset: DieOffset = *FormValue.getAsUnsignedConstant(); @@ -163,19 +163,19 @@ Optional AppleAcceleratorTable::HeaderData::extractOffset( bool AppleAcceleratorTable::dumpName(ScopedPrinter &W, SmallVectorImpl &AtomForms, - uint32_t *DataOffset) const { + uint64_t *DataOffset) const { dwarf::FormParams FormParams = {Hdr.Version, 0, dwarf::DwarfFormat::DWARF32}; - uint32_t NameOffset = *DataOffset; + uint64_t NameOffset = *DataOffset; if (!AccelSection.isValidOffsetForDataOfSize(*DataOffset, 4)) { W.printString("Incorrectly terminated list."); return false; } - unsigned StringOffset = AccelSection.getRelocatedValue(4, DataOffset); + uint64_t StringOffset = AccelSection.getRelocatedValue(4, DataOffset); if (!StringOffset) return false; // End of list DictScope NameScope(W, ("Name@0x" + Twine::utohexstr(NameOffset)).str()); - W.startLine() << format("String: 0x%08x", StringOffset); + W.startLine() << format("String: 0x%08" PRIx64, StringOffset); W.getOStream() << " \"" << StringSection.getCStr(&StringOffset) << "\"\n"; unsigned NumData = AccelSection.getU32(DataOffset); @@ -223,9 +223,9 @@ LLVM_DUMP_METHOD void AppleAcceleratorTable::dump(raw_ostream &OS) const { } // Now go through the actual tables and dump them. 
- uint32_t Offset = sizeof(Hdr) + Hdr.HeaderDataLength; - unsigned HashesBase = Offset + Hdr.BucketCount * 4; - unsigned OffsetsBase = HashesBase + Hdr.HashCount * 4; + uint64_t Offset = sizeof(Hdr) + Hdr.HeaderDataLength; + uint64_t HashesBase = Offset + Hdr.BucketCount * 4; + uint64_t OffsetsBase = HashesBase + Hdr.HashCount * 4; for (unsigned Bucket = 0; Bucket < Hdr.BucketCount; ++Bucket) { unsigned Index = AccelSection.getU32(&Offset); @@ -237,14 +237,14 @@ LLVM_DUMP_METHOD void AppleAcceleratorTable::dump(raw_ostream &OS) const { } for (unsigned HashIdx = Index; HashIdx < Hdr.HashCount; ++HashIdx) { - unsigned HashOffset = HashesBase + HashIdx*4; - unsigned OffsetsOffset = OffsetsBase + HashIdx*4; + uint64_t HashOffset = HashesBase + HashIdx*4; + uint64_t OffsetsOffset = OffsetsBase + HashIdx*4; uint32_t Hash = AccelSection.getU32(&HashOffset); if (Hash % Hdr.BucketCount != Bucket) break; - unsigned DataOffset = AccelSection.getU32(&OffsetsOffset); + uint64_t DataOffset = AccelSection.getU32(&OffsetsOffset); ListScope HashScope(W, ("Hash 0x" + Twine::utohexstr(Hash)).str()); if (!AccelSection.isValidOffset(DataOffset)) { W.printString("Invalid section offset"); @@ -265,7 +265,7 @@ AppleAcceleratorTable::Entry::Entry( } void AppleAcceleratorTable::Entry::extract( - const AppleAcceleratorTable &AccelTable, uint32_t *Offset) { + const AppleAcceleratorTable &AccelTable, uint64_t *Offset) { dwarf::FormParams FormParams = {AccelTable.Hdr.Version, 0, dwarf::DwarfFormat::DWARF32}; @@ -302,7 +302,7 @@ Optional AppleAcceleratorTable::Entry::getTag() const { } AppleAcceleratorTable::ValueIterator::ValueIterator( - const AppleAcceleratorTable &AccelTable, unsigned Offset) + const AppleAcceleratorTable &AccelTable, uint64_t Offset) : AccelTable(&AccelTable), Current(AccelTable.HdrData), DataOffset(Offset) { if (!AccelTable.AccelSection.isValidOffsetForDataOfSize(DataOffset, 4)) return; @@ -333,25 +333,25 @@ AppleAcceleratorTable::equal_range(StringRef Key) const { // Find the bucket. unsigned HashValue = djbHash(Key); unsigned Bucket = HashValue % Hdr.BucketCount; - unsigned BucketBase = sizeof(Hdr) + Hdr.HeaderDataLength; - unsigned HashesBase = BucketBase + Hdr.BucketCount * 4; - unsigned OffsetsBase = HashesBase + Hdr.HashCount * 4; + uint64_t BucketBase = sizeof(Hdr) + Hdr.HeaderDataLength; + uint64_t HashesBase = BucketBase + Hdr.BucketCount * 4; + uint64_t OffsetsBase = HashesBase + Hdr.HashCount * 4; - unsigned BucketOffset = BucketBase + Bucket * 4; + uint64_t BucketOffset = BucketBase + Bucket * 4; unsigned Index = AccelSection.getU32(&BucketOffset); // Search through all hashes in the bucket. for (unsigned HashIdx = Index; HashIdx < Hdr.HashCount; ++HashIdx) { - unsigned HashOffset = HashesBase + HashIdx * 4; - unsigned OffsetsOffset = OffsetsBase + HashIdx * 4; + uint64_t HashOffset = HashesBase + HashIdx * 4; + uint64_t OffsetsOffset = OffsetsBase + HashIdx * 4; uint32_t Hash = AccelSection.getU32(&HashOffset); if (Hash % Hdr.BucketCount != Bucket) // We are already in the next bucket. 
break; - unsigned DataOffset = AccelSection.getU32(&OffsetsOffset); - unsigned StringOffset = AccelSection.getRelocatedValue(4, &DataOffset); + uint64_t DataOffset = AccelSection.getU32(&OffsetsOffset); + uint64_t StringOffset = AccelSection.getRelocatedValue(4, &DataOffset); if (!StringOffset) break; @@ -377,7 +377,7 @@ void DWARFDebugNames::Header::dump(ScopedPrinter &W) const { } Error DWARFDebugNames::Header::extract(const DWARFDataExtractor &AS, - uint32_t *Offset) { + uint64_t *Offset) { // Check that we can read the fixed-size part. if (!AS.isValidOffset(*Offset + sizeof(HeaderPOD) - 1)) return createStringError(errc::illegal_byte_sequence, @@ -437,7 +437,7 @@ DWARFDebugNames::Abbrev DWARFDebugNames::AbbrevMapInfo::getTombstoneKey() { } Expected -DWARFDebugNames::NameIndex::extractAttributeEncoding(uint32_t *Offset) { +DWARFDebugNames::NameIndex::extractAttributeEncoding(uint64_t *Offset) { if (*Offset >= EntriesBase) { return createStringError(errc::illegal_byte_sequence, "Incorrectly terminated abbreviation table."); @@ -449,7 +449,7 @@ DWARFDebugNames::NameIndex::extractAttributeEncoding(uint32_t *Offset) { } Expected> -DWARFDebugNames::NameIndex::extractAttributeEncodings(uint32_t *Offset) { +DWARFDebugNames::NameIndex::extractAttributeEncodings(uint64_t *Offset) { std::vector Result; for (;;) { auto AttrEncOr = extractAttributeEncoding(Offset); @@ -463,7 +463,7 @@ DWARFDebugNames::NameIndex::extractAttributeEncodings(uint32_t *Offset) { } Expected -DWARFDebugNames::NameIndex::extractAbbrev(uint32_t *Offset) { +DWARFDebugNames::NameIndex::extractAbbrev(uint64_t *Offset) { if (*Offset >= EntriesBase) { return createStringError(errc::illegal_byte_sequence, "Incorrectly terminated abbreviation table."); @@ -482,7 +482,7 @@ DWARFDebugNames::NameIndex::extractAbbrev(uint32_t *Offset) { Error DWARFDebugNames::NameIndex::extract() { const DWARFDataExtractor &AS = Section.AccelSection; - uint32_t Offset = Base; + uint64_t Offset = Base; if (Error E = Hdr.extract(AS, &Offset)) return E; @@ -577,27 +577,27 @@ std::error_code DWARFDebugNames::SentinelError::convertToErrorCode() const { return inconvertibleErrorCode(); } -uint32_t DWARFDebugNames::NameIndex::getCUOffset(uint32_t CU) const { +uint64_t DWARFDebugNames::NameIndex::getCUOffset(uint32_t CU) const { assert(CU < Hdr.CompUnitCount); - uint32_t Offset = CUsBase + 4 * CU; + uint64_t Offset = CUsBase + 4 * CU; return Section.AccelSection.getRelocatedValue(4, &Offset); } -uint32_t DWARFDebugNames::NameIndex::getLocalTUOffset(uint32_t TU) const { +uint64_t DWARFDebugNames::NameIndex::getLocalTUOffset(uint32_t TU) const { assert(TU < Hdr.LocalTypeUnitCount); - uint32_t Offset = CUsBase + 4 * (Hdr.CompUnitCount + TU); + uint64_t Offset = CUsBase + 4 * (Hdr.CompUnitCount + TU); return Section.AccelSection.getRelocatedValue(4, &Offset); } uint64_t DWARFDebugNames::NameIndex::getForeignTUSignature(uint32_t TU) const { assert(TU < Hdr.ForeignTypeUnitCount); - uint32_t Offset = + uint64_t Offset = CUsBase + 4 * (Hdr.CompUnitCount + Hdr.LocalTypeUnitCount) + 8 * TU; return Section.AccelSection.getU64(&Offset); } Expected -DWARFDebugNames::NameIndex::getEntry(uint32_t *Offset) const { +DWARFDebugNames::NameIndex::getEntry(uint64_t *Offset) const { const DWARFDataExtractor &AS = Section.AccelSection; if (!AS.isValidOffset(*Offset)) return createStringError(errc::illegal_byte_sequence, @@ -625,12 +625,12 @@ DWARFDebugNames::NameIndex::getEntry(uint32_t *Offset) const { DWARFDebugNames::NameTableEntry 
DWARFDebugNames::NameIndex::getNameTableEntry(uint32_t Index) const { assert(0 < Index && Index <= Hdr.NameCount); - uint32_t StringOffsetOffset = StringOffsetsBase + 4 * (Index - 1); - uint32_t EntryOffsetOffset = EntryOffsetsBase + 4 * (Index - 1); + uint64_t StringOffsetOffset = StringOffsetsBase + 4 * (Index - 1); + uint64_t EntryOffsetOffset = EntryOffsetsBase + 4 * (Index - 1); const DWARFDataExtractor &AS = Section.AccelSection; - uint32_t StringOffset = AS.getRelocatedValue(4, &StringOffsetOffset); - uint32_t EntryOffset = AS.getU32(&EntryOffsetOffset); + uint64_t StringOffset = AS.getRelocatedValue(4, &StringOffsetOffset); + uint64_t EntryOffset = AS.getU32(&EntryOffsetOffset); EntryOffset += EntriesBase; return {Section.StringSection, Index, StringOffset, EntryOffset}; } @@ -638,13 +638,13 @@ DWARFDebugNames::NameIndex::getNameTableEntry(uint32_t Index) const { uint32_t DWARFDebugNames::NameIndex::getBucketArrayEntry(uint32_t Bucket) const { assert(Bucket < Hdr.BucketCount); - uint32_t BucketOffset = BucketsBase + 4 * Bucket; + uint64_t BucketOffset = BucketsBase + 4 * Bucket; return Section.AccelSection.getU32(&BucketOffset); } uint32_t DWARFDebugNames::NameIndex::getHashArrayEntry(uint32_t Index) const { assert(0 < Index && Index <= Hdr.NameCount); - uint32_t HashOffset = HashesBase + 4 * (Index - 1); + uint64_t HashOffset = HashesBase + 4 * (Index - 1); return Section.AccelSection.getU32(&HashOffset); } @@ -653,8 +653,8 @@ uint32_t DWARFDebugNames::NameIndex::getHashArrayEntry(uint32_t Index) const { // it's not possible to recover this entry list (but the other lists may still // parse OK). bool DWARFDebugNames::NameIndex::dumpEntry(ScopedPrinter &W, - uint32_t *Offset) const { - uint32_t EntryId = *Offset; + uint64_t *Offset) const { + uint64_t EntryId = *Offset; auto EntryOr = getEntry(Offset); if (!EntryOr) { handleAllErrors(EntryOr.takeError(), [](const SentinelError &) {}, @@ -674,10 +674,10 @@ void DWARFDebugNames::NameIndex::dumpName(ScopedPrinter &W, if (Hash) W.printHex("Hash", *Hash); - W.startLine() << format("String: 0x%08x", NTE.getStringOffset()); + W.startLine() << format("String: 0x%08" PRIx64, NTE.getStringOffset()); W.getOStream() << " \"" << NTE.getString() << "\"\n"; - uint32_t EntryOffset = NTE.getEntryOffset(); + uint64_t EntryOffset = NTE.getEntryOffset(); while (dumpEntry(W, &EntryOffset)) /*empty*/; } @@ -685,7 +685,7 @@ void DWARFDebugNames::NameIndex::dumpName(ScopedPrinter &W, void DWARFDebugNames::NameIndex::dumpCUs(ScopedPrinter &W) const { ListScope CUScope(W, "Compilation Unit offsets"); for (uint32_t CU = 0; CU < Hdr.CompUnitCount; ++CU) - W.startLine() << format("CU[%u]: 0x%08x\n", CU, getCUOffset(CU)); + W.startLine() << format("CU[%u]: 0x%08" PRIx64 "\n", CU, getCUOffset(CU)); } void DWARFDebugNames::NameIndex::dumpLocalTUs(ScopedPrinter &W) const { @@ -694,7 +694,8 @@ void DWARFDebugNames::NameIndex::dumpLocalTUs(ScopedPrinter &W) const { ListScope TUScope(W, "Local Type Unit offsets"); for (uint32_t TU = 0; TU < Hdr.LocalTypeUnitCount; ++TU) - W.startLine() << format("LocalTU[%u]: 0x%08x\n", TU, getLocalTUOffset(TU)); + W.startLine() << format("LocalTU[%u]: 0x%08" PRIx64 "\n", TU, + getLocalTUOffset(TU)); } void DWARFDebugNames::NameIndex::dumpForeignTUs(ScopedPrinter &W) const { @@ -756,7 +757,7 @@ LLVM_DUMP_METHOD void DWARFDebugNames::NameIndex::dump(ScopedPrinter &W) const { } Error DWARFDebugNames::extract() { - uint32_t Offset = 0; + uint64_t Offset = 0; while (AccelSection.isValidOffset(Offset)) { NameIndex Next(*this, Offset); if 
(Error E = Next.extract()) @@ -778,7 +779,7 @@ LLVM_DUMP_METHOD void DWARFDebugNames::dump(raw_ostream &OS) const { NI.dump(W); } -Optional +Optional DWARFDebugNames::ValueIterator::findEntryOffsetInCurrentIndex() { const Header &Hdr = CurrentIndex->Hdr; if (Hdr.BucketCount == 0) { @@ -822,7 +823,7 @@ bool DWARFDebugNames::ValueIterator::getEntryAtCurrentOffset() { } bool DWARFDebugNames::ValueIterator::findInCurrentIndex() { - Optional Offset = findEntryOffsetInCurrentIndex(); + Optional Offset = findEntryOffsetInCurrentIndex(); if (!Offset) return false; DataOffset = *Offset; @@ -877,7 +878,7 @@ DWARFDebugNames::equal_range(StringRef Key) const { } const DWARFDebugNames::NameIndex * -DWARFDebugNames::getCUNameIndex(uint32_t CUOffset) { +DWARFDebugNames::getCUNameIndex(uint64_t CUOffset) { if (CUToNameIndex.size() == 0 && NameIndices.size() > 0) { for (const auto &NI : *this) { for (uint32_t CU = 0; CU < NI.getCUCount(); ++CU) diff --git a/lib/DebugInfo/DWARF/DWARFCompileUnit.cpp b/lib/DebugInfo/DWARF/DWARFCompileUnit.cpp index 74cce42466dd..f59e49268288 100644 --- a/lib/DebugInfo/DWARF/DWARFCompileUnit.cpp +++ b/lib/DebugInfo/DWARF/DWARFCompileUnit.cpp @@ -15,16 +15,18 @@ using namespace llvm; void DWARFCompileUnit::dump(raw_ostream &OS, DIDumpOptions DumpOpts) { - OS << format("0x%08x", getOffset()) << ": Compile Unit:" - << " length = " << format("0x%08x", getLength()) + OS << format("0x%08" PRIx64, getOffset()) << ": Compile Unit:" + << " length = " << format("0x%08" PRIx64, getLength()) << " version = " << format("0x%04x", getVersion()); if (getVersion() >= 5) OS << " unit_type = " << dwarf::UnitTypeString(getUnitType()); - OS << " abbr_offset = " << format("0x%04x", getAbbreviations()->getOffset()) + OS << " abbr_offset = " + << format("0x%04" PRIx64, getAbbreviations()->getOffset()) << " addr_size = " << format("0x%02x", getAddressByteSize()); if (getVersion() >= 5 && getUnitType() != dwarf::DW_UT_compile) OS << " DWO_id = " << format("0x%016" PRIx64, *getDWOId()); - OS << " (next unit at " << format("0x%08x", getNextUnitOffset()) << ")\n"; + OS << " (next unit at " << format("0x%08" PRIx64, getNextUnitOffset()) + << ")\n"; if (DWARFDie CUDie = getUnitDIE(false)) CUDie.dump(OS, 0, DumpOpts); diff --git a/lib/DebugInfo/DWARF/DWARFContext.cpp b/lib/DebugInfo/DWARF/DWARFContext.cpp index 5ede9bf59619..c06d85d50609 100644 --- a/lib/DebugInfo/DWARF/DWARFContext.cpp +++ b/lib/DebugInfo/DWARF/DWARFContext.cpp @@ -138,7 +138,7 @@ static void dumpDWARFv5StringOffsetsSection( DWARFDataExtractor StrOffsetExt(Obj, StringOffsetsSection, LittleEndian, 0); DataExtractor StrData(StringSection, LittleEndian, 0); uint64_t SectionSize = StringOffsetsSection.Data.size(); - uint32_t Offset = 0; + uint64_t Offset = 0; for (auto &Contribution : Contributions) { // Report an ill-formed contribution. if (!Contribution) { @@ -166,10 +166,10 @@ static void dumpDWARFv5StringOffsetsSection( } // Report a gap in the table. if (Offset < ContributionHeader) { - OS << format("0x%8.8x: Gap, length = ", Offset); + OS << format("0x%8.8" PRIx64 ": Gap, length = ", Offset); OS << (ContributionHeader - Offset) << "\n"; } - OS << format("0x%8.8x: ", (uint32_t)ContributionHeader); + OS << format("0x%8.8" PRIx64 ": ", ContributionHeader); // In DWARF v5 the contribution size in the descriptor does not equal // the originally encoded length (it does not contain the length of the // version field and the padding, a total of 4 bytes). 
Add them back in @@ -181,26 +181,19 @@ static void dumpDWARFv5StringOffsetsSection( Offset = Contribution->Base; unsigned EntrySize = Contribution->getDwarfOffsetByteSize(); while (Offset - Contribution->Base < Contribution->Size) { - OS << format("0x%8.8x: ", Offset); - // FIXME: We can only extract strings if the offset fits in 32 bits. + OS << format("0x%8.8" PRIx64 ": ", Offset); uint64_t StringOffset = StrOffsetExt.getRelocatedValue(EntrySize, &Offset); - // Extract the string if we can and display it. Otherwise just report - // the offset. - if (StringOffset <= std::numeric_limits::max()) { - uint32_t StringOffset32 = (uint32_t)StringOffset; - OS << format("%8.8x ", StringOffset32); - const char *S = StrData.getCStr(&StringOffset32); - if (S) - OS << format("\"%s\"", S); - } else - OS << format("%16.16" PRIx64 " ", StringOffset); + OS << format("%8.8" PRIx64 " ", StringOffset); + const char *S = StrData.getCStr(&StringOffset); + if (S) + OS << format("\"%s\"", S); OS << "\n"; } } // Report a gap at the end of the table. if (Offset < SectionSize) { - OS << format("0x%8.8x: Gap, length = ", Offset); + OS << format("0x%8.8" PRIx64 ": Gap, length = ", Offset); OS << (SectionSize - Offset) << "\n"; } } @@ -225,7 +218,7 @@ static void dumpStringOffsetsSection(raw_ostream &OS, StringRef SectionName, StringSection, Units, LittleEndian); else { DataExtractor strOffsetExt(StringOffsetsSection.Data, LittleEndian, 0); - uint32_t offset = 0; + uint64_t offset = 0; uint64_t size = StringOffsetsSection.Data.size(); // Ensure that size is a multiple of the size of an entry. if (size & ((uint64_t)(sizeof(uint32_t) - 1))) { @@ -235,9 +228,9 @@ static void dumpStringOffsetsSection(raw_ostream &OS, StringRef SectionName, } DataExtractor StrData(StringSection, LittleEndian, 0); while (offset < size) { - OS << format("0x%8.8x: ", offset); - uint32_t StringOffset = strOffsetExt.getU32(&offset); - OS << format("%8.8x ", StringOffset); + OS << format("0x%8.8" PRIx64 ": ", offset); + uint64_t StringOffset = strOffsetExt.getU32(&offset); + OS << format("%8.8" PRIx64 " ", StringOffset); const char *S = StrData.getCStr(&StringOffset); if (S) OS << format("\"%s\"", S); @@ -250,10 +243,10 @@ static void dumpStringOffsetsSection(raw_ostream &OS, StringRef SectionName, static void dumpAddrSection(raw_ostream &OS, DWARFDataExtractor &AddrData, DIDumpOptions DumpOpts, uint16_t Version, uint8_t AddrSize) { - uint32_t Offset = 0; + uint64_t Offset = 0; while (AddrData.isValidOffset(Offset)) { DWARFDebugAddrTable AddrTable; - uint32_t TableOffset = Offset; + uint64_t TableOffset = Offset; if (Error Err = AddrTable.extract(AddrData, &Offset, Version, AddrSize, DWARFContext::dumpWarning)) { WithColor::error() << toString(std::move(Err)) << '\n'; @@ -261,8 +254,7 @@ static void dumpAddrSection(raw_ostream &OS, DWARFDataExtractor &AddrData, // could be read. If it couldn't, stop reading the section. 
if (!AddrTable.hasValidLength()) break; - uint64_t Length = AddrTable.getLength(); - Offset = TableOffset + Length; + Offset = TableOffset + AddrTable.getLength(); } else { AddrTable.dump(OS, DumpOpts); } @@ -275,10 +267,10 @@ static void dumpRnglistsSection( llvm::function_ref(uint32_t)> LookupPooledAddress, DIDumpOptions DumpOpts) { - uint32_t Offset = 0; + uint64_t Offset = 0; while (rnglistData.isValidOffset(Offset)) { llvm::DWARFDebugRnglistTable Rnglists; - uint32_t TableOffset = Offset; + uint64_t TableOffset = Offset; if (Error Err = Rnglists.extract(rnglistData, &Offset)) { WithColor::error() << toString(std::move(Err)) << '\n'; uint64_t Length = Rnglists.length(); @@ -297,21 +289,25 @@ static void dumpLoclistsSection(raw_ostream &OS, DIDumpOptions DumpOpts, DWARFDataExtractor Data, const MCRegisterInfo *MRI, Optional DumpOffset) { - uint32_t Offset = 0; - DWARFDebugLoclists Loclists; + uint64_t Offset = 0; - DWARFListTableHeader Header(".debug_loclists", "locations"); - if (Error E = Header.extract(Data, &Offset)) { - WithColor::error() << toString(std::move(E)) << '\n'; - return; - } + while (Data.isValidOffset(Offset)) { + DWARFListTableHeader Header(".debug_loclists", "locations"); + if (Error E = Header.extract(Data, &Offset)) { + WithColor::error() << toString(std::move(E)) << '\n'; + return; + } - Header.dump(OS, DumpOpts); - DataExtractor LocData(Data.getData().drop_front(Offset), - Data.isLittleEndian(), Header.getAddrSize()); + Header.dump(OS, DumpOpts); + DataExtractor LocData(Data.getData(), + Data.isLittleEndian(), Header.getAddrSize()); - Loclists.parse(LocData, Header.getVersion()); - Loclists.dump(OS, 0, MRI, DumpOffset); + DWARFDebugLoclists Loclists; + uint64_t EndOffset = Header.length() + Header.getHeaderOffset(); + Loclists.parse(LocData, Offset, EndOffset, Header.getVersion()); + Loclists.dump(OS, 0, MRI, DumpOpts, DumpOffset); + Offset = EndOffset; + } } void DWARFContext::dump( @@ -386,7 +382,7 @@ void DWARFContext::dump( if (const auto *Off = shouldDump(Explicit, ".debug_loc", DIDT_ID_DebugLoc, DObj->getLocSection().Data)) { - getDebugLoc()->dump(OS, getRegisterInfo(), *Off); + getDebugLoc()->dump(OS, getRegisterInfo(), DumpOpts, *Off); } if (const auto *Off = shouldDump(Explicit, ".debug_loclists", DIDT_ID_DebugLoclists, @@ -398,15 +394,15 @@ void DWARFContext::dump( if (const auto *Off = shouldDump(ExplicitDWO, ".debug_loc.dwo", DIDT_ID_DebugLoc, DObj->getLocDWOSection().Data)) { - getDebugLocDWO()->dump(OS, 0, getRegisterInfo(), *Off); + getDebugLocDWO()->dump(OS, 0, getRegisterInfo(), DumpOpts, *Off); } if (const auto *Off = shouldDump(Explicit, ".debug_frame", DIDT_ID_DebugFrame, - DObj->getDebugFrameSection())) + DObj->getFrameSection().Data)) getDebugFrame()->dump(OS, getRegisterInfo(), *Off); if (const auto *Off = shouldDump(Explicit, ".eh_frame", DIDT_ID_DebugFrame, - DObj->getEHFrameSection())) + DObj->getEHFrameSection().Data)) getEHFrame()->dump(OS, getRegisterInfo(), *Off); if (DumpType & DIDT_DebugMacro) { @@ -417,9 +413,9 @@ void DWARFContext::dump( } if (shouldDump(Explicit, ".debug_aranges", DIDT_ID_DebugAranges, - DObj->getARangeSection())) { - uint32_t offset = 0; - DataExtractor arangesData(DObj->getARangeSection(), isLittleEndian(), 0); + DObj->getArangesSection())) { + uint64_t offset = 0; + DataExtractor arangesData(DObj->getArangesSection(), isLittleEndian(), 0); DWARFDebugArangeSet set; while (set.extract(arangesData, &offset)) set.dump(OS); @@ -433,7 +429,8 @@ void DWARFContext::dump( Parser.skip(dumpWarning); continue; } - OS << 
"debug_line[" << format("0x%8.8x", Parser.getOffset()) << "]\n"; + OS << "debug_line[" << format("0x%8.8" PRIx64, Parser.getOffset()) + << "]\n"; if (DumpOpts.Verbose) { Parser.parseNext(dumpWarning, dumpWarning, &OS); } else { @@ -474,32 +471,32 @@ void DWARFContext::dump( } if (shouldDump(Explicit, ".debug_str", DIDT_ID_DebugStr, - DObj->getStringSection())) { - DataExtractor strData(DObj->getStringSection(), isLittleEndian(), 0); - uint32_t offset = 0; - uint32_t strOffset = 0; + DObj->getStrSection())) { + DataExtractor strData(DObj->getStrSection(), isLittleEndian(), 0); + uint64_t offset = 0; + uint64_t strOffset = 0; while (const char *s = strData.getCStr(&offset)) { - OS << format("0x%8.8x: \"%s\"\n", strOffset, s); + OS << format("0x%8.8" PRIx64 ": \"%s\"\n", strOffset, s); strOffset = offset; } } if (shouldDump(ExplicitDWO, ".debug_str.dwo", DIDT_ID_DebugStr, - DObj->getStringDWOSection())) { - DataExtractor strDWOData(DObj->getStringDWOSection(), isLittleEndian(), 0); - uint32_t offset = 0; - uint32_t strDWOOffset = 0; + DObj->getStrDWOSection())) { + DataExtractor strDWOData(DObj->getStrDWOSection(), isLittleEndian(), 0); + uint64_t offset = 0; + uint64_t strDWOOffset = 0; while (const char *s = strDWOData.getCStr(&offset)) { - OS << format("0x%8.8x: \"%s\"\n", strDWOOffset, s); + OS << format("0x%8.8" PRIx64 ": \"%s\"\n", strDWOOffset, s); strDWOOffset = offset; } } if (shouldDump(Explicit, ".debug_line_str", DIDT_ID_DebugLineStr, - DObj->getLineStringSection())) { - DataExtractor strData(DObj->getLineStringSection(), isLittleEndian(), 0); - uint32_t offset = 0; - uint32_t strOffset = 0; + DObj->getLineStrSection())) { + DataExtractor strData(DObj->getLineStrSection(), isLittleEndian(), 0); + uint64_t offset = 0; + uint64_t strOffset = 0; while (const char *s = strData.getCStr(&offset)) { - OS << format("0x%8.8x: \"", strOffset); + OS << format("0x%8.8" PRIx64 ": \"", strOffset); OS.write_escaped(s); OS << "\"\n"; strOffset = offset; @@ -514,11 +511,11 @@ void DWARFContext::dump( } if (shouldDump(Explicit, ".debug_ranges", DIDT_ID_DebugRanges, - DObj->getRangeSection().Data)) { + DObj->getRangesSection().Data)) { uint8_t savedAddressByteSize = getCUAddrSize(); - DWARFDataExtractor rangesData(*DObj, DObj->getRangeSection(), + DWARFDataExtractor rangesData(*DObj, DObj->getRangesSection(), isLittleEndian(), savedAddressByteSize); - uint32_t offset = 0; + uint64_t offset = 0; DWARFDebugRangeList rangeList; while (rangesData.isValidOffset(offset)) { if (Error E = rangeList.extract(rangesData, &offset)) { @@ -552,38 +549,38 @@ void DWARFContext::dump( } if (shouldDump(Explicit, ".debug_pubnames", DIDT_ID_DebugPubnames, - DObj->getPubNamesSection().Data)) - DWARFDebugPubTable(*DObj, DObj->getPubNamesSection(), isLittleEndian(), false) + DObj->getPubnamesSection().Data)) + DWARFDebugPubTable(*DObj, DObj->getPubnamesSection(), isLittleEndian(), false) .dump(OS); if (shouldDump(Explicit, ".debug_pubtypes", DIDT_ID_DebugPubtypes, - DObj->getPubTypesSection().Data)) - DWARFDebugPubTable(*DObj, DObj->getPubTypesSection(), isLittleEndian(), false) + DObj->getPubtypesSection().Data)) + DWARFDebugPubTable(*DObj, DObj->getPubtypesSection(), isLittleEndian(), false) .dump(OS); if (shouldDump(Explicit, ".debug_gnu_pubnames", DIDT_ID_DebugGnuPubnames, - DObj->getGnuPubNamesSection().Data)) - DWARFDebugPubTable(*DObj, DObj->getGnuPubNamesSection(), isLittleEndian(), + DObj->getGnuPubnamesSection().Data)) + DWARFDebugPubTable(*DObj, DObj->getGnuPubnamesSection(), isLittleEndian(), true /* GnuStyle 
*/) .dump(OS); if (shouldDump(Explicit, ".debug_gnu_pubtypes", DIDT_ID_DebugGnuPubtypes, - DObj->getGnuPubTypesSection().Data)) - DWARFDebugPubTable(*DObj, DObj->getGnuPubTypesSection(), isLittleEndian(), + DObj->getGnuPubtypesSection().Data)) + DWARFDebugPubTable(*DObj, DObj->getGnuPubtypesSection(), isLittleEndian(), true /* GnuStyle */) .dump(OS); if (shouldDump(Explicit, ".debug_str_offsets", DIDT_ID_DebugStrOffsets, - DObj->getStringOffsetSection().Data)) + DObj->getStrOffsetsSection().Data)) dumpStringOffsetsSection(OS, "debug_str_offsets", *DObj, - DObj->getStringOffsetSection(), - DObj->getStringSection(), normal_units(), + DObj->getStrOffsetsSection(), + DObj->getStrSection(), normal_units(), isLittleEndian(), getMaxVersion()); if (shouldDump(ExplicitDWO, ".debug_str_offsets.dwo", DIDT_ID_DebugStrOffsets, - DObj->getStringOffsetDWOSection().Data)) + DObj->getStrOffsetsDWOSection().Data)) dumpStringOffsetsSection(OS, "debug_str_offsets.dwo", *DObj, - DObj->getStringOffsetDWOSection(), - DObj->getStringDWOSection(), dwo_units(), + DObj->getStrOffsetsDWOSection(), + DObj->getStrDWOSection(), dwo_units(), isLittleEndian(), getMaxDWOVersion()); if (shouldDump(Explicit, ".gdb_index", DIDT_ID_GdbIndex, @@ -607,7 +604,7 @@ void DWARFContext::dump( DObj->getAppleObjCSection().Data)) getAppleObjC().dump(OS); if (shouldDump(Explicit, ".debug_names", DIDT_ID_DebugNames, - DObj->getDebugNamesSection().Data)) + DObj->getNamesSection().Data)) getDebugNames().dump(OS); } @@ -641,7 +638,7 @@ DWARFCompileUnit *DWARFContext::getDWOCompileUnitForHash(uint64_t Hash) { return nullptr; } -DWARFDie DWARFContext::getDIEForOffset(uint32_t Offset) { +DWARFDie DWARFContext::getDIEForOffset(uint64_t Offset) { parseNormalUnits(); if (auto *CU = NormalUnits.getUnitForOffset(Offset)) return CU->getDIEForOffset(Offset); @@ -667,7 +664,7 @@ const DWARFUnitIndex &DWARFContext::getCUIndex() { DataExtractor CUIndexData(DObj->getCUIndexSection(), isLittleEndian(), 0); - CUIndex = llvm::make_unique(DW_SECT_INFO); + CUIndex = std::make_unique(DW_SECT_INFO); CUIndex->parse(CUIndexData); return *CUIndex; } @@ -678,7 +675,7 @@ const DWARFUnitIndex &DWARFContext::getTUIndex() { DataExtractor TUIndexData(DObj->getTUIndexSection(), isLittleEndian(), 0); - TUIndex = llvm::make_unique(DW_SECT_TYPES); + TUIndex = std::make_unique(DW_SECT_TYPES); TUIndex->parse(TUIndexData); return *TUIndex; } @@ -688,7 +685,7 @@ DWARFGdbIndex &DWARFContext::getGdbIndex() { return *GdbIndex; DataExtractor GdbIndexData(DObj->getGdbIndexSection(), true /*LE*/, 0); - GdbIndex = llvm::make_unique(); + GdbIndex = std::make_unique(); GdbIndex->parse(GdbIndexData); return *GdbIndex; } @@ -740,7 +737,7 @@ const DWARFDebugLoclists *DWARFContext::getDebugLocDWO() { // Use version 4. DWO does not support the DWARF v5 .debug_loclists yet and // that means we are parsing the new style .debug_loc (pre-standatized version // of the .debug_loclists). - LocDWO->parse(LocData, 4 /* Version */); + LocDWO->parse(LocData, 0, LocData.getData().size(), 4 /* Version */); return LocDWO.get(); } @@ -766,7 +763,7 @@ const DWARFDebugFrame *DWARFContext::getDebugFrame() { // provides this information). 
This problem is fixed in DWARFv4 // See this dwarf-discuss discussion for more details: // http://lists.dwarfstd.org/htdig.cgi/dwarf-discuss-dwarfstd.org/2011-December/001173.html - DWARFDataExtractor debugFrameData(DObj->getDebugFrameSection(), + DWARFDataExtractor debugFrameData(*DObj, DObj->getFrameSection(), isLittleEndian(), DObj->getAddressSize()); DebugFrame.reset(new DWARFDebugFrame(getArch(), false /* IsEH */)); DebugFrame->parse(debugFrameData); @@ -777,8 +774,8 @@ const DWARFDebugFrame *DWARFContext::getEHFrame() { if (EHFrame) return EHFrame.get(); - DWARFDataExtractor debugFrameData(DObj->getEHFrameSection(), isLittleEndian(), - DObj->getAddressSize()); + DWARFDataExtractor debugFrameData(*DObj, DObj->getEHFrameSection(), + isLittleEndian(), DObj->getAddressSize()); DebugFrame.reset(new DWARFDebugFrame(getArch(), true /* IsEH */)); DebugFrame->parse(debugFrameData); return DebugFrame.get(); @@ -809,29 +806,29 @@ static T &getAccelTable(std::unique_ptr &Cache, const DWARFObject &Obj, } const DWARFDebugNames &DWARFContext::getDebugNames() { - return getAccelTable(Names, *DObj, DObj->getDebugNamesSection(), - DObj->getStringSection(), isLittleEndian()); + return getAccelTable(Names, *DObj, DObj->getNamesSection(), + DObj->getStrSection(), isLittleEndian()); } const AppleAcceleratorTable &DWARFContext::getAppleNames() { return getAccelTable(AppleNames, *DObj, DObj->getAppleNamesSection(), - DObj->getStringSection(), isLittleEndian()); + DObj->getStrSection(), isLittleEndian()); } const AppleAcceleratorTable &DWARFContext::getAppleTypes() { return getAccelTable(AppleTypes, *DObj, DObj->getAppleTypesSection(), - DObj->getStringSection(), isLittleEndian()); + DObj->getStrSection(), isLittleEndian()); } const AppleAcceleratorTable &DWARFContext::getAppleNamespaces() { return getAccelTable(AppleNamespaces, *DObj, DObj->getAppleNamespacesSection(), - DObj->getStringSection(), isLittleEndian()); + DObj->getStrSection(), isLittleEndian()); } const AppleAcceleratorTable &DWARFContext::getAppleObjC() { return getAccelTable(AppleObjC, *DObj, DObj->getAppleObjCSection(), - DObj->getStringSection(), isLittleEndian()); + DObj->getStrSection(), isLittleEndian()); } const DWARFDebugLine::LineTable * @@ -858,7 +855,7 @@ Expected DWARFContext::getLineTableForUnit( if (!Offset) return nullptr; // No line table for this compile unit. - uint32_t stmtOffset = *Offset + U->getLineTableOffset(); + uint64_t stmtOffset = *Offset + U->getLineTableOffset(); // See if the line table is cached. if (const DWARFLineTable *lt = Line->getLineTable(stmtOffset)) return lt; @@ -898,7 +895,7 @@ void DWARFContext::parseDWOUnits(bool Lazy) { }); } -DWARFCompileUnit *DWARFContext::getCompileUnitForOffset(uint32_t Offset) { +DWARFCompileUnit *DWARFContext::getCompileUnitForOffset(uint64_t Offset) { parseNormalUnits(); return dyn_cast_or_null( NormalUnits.getUnitForOffset(Offset)); @@ -906,7 +903,7 @@ DWARFCompileUnit *DWARFContext::getCompileUnitForOffset(uint32_t Offset) { DWARFCompileUnit *DWARFContext::getCompileUnitForAddress(uint64_t Address) { // First, get the offset of the compile unit. - uint32_t CUOffset = getDebugAranges()->findAddress(Address); + uint64_t CUOffset = getDebugAranges()->findAddress(Address); // Retrieve the compile unit. 
return getCompileUnitForOffset(CUOffset); } @@ -1118,8 +1115,8 @@ DILineInfoTable DWARFContext::getLineInfoForAddressRange( if (!CU) return Lines; - std::string FunctionName = ""; uint32_t StartLine = 0; + std::string FunctionName(DILineInfo::BadString); getFunctionNameAndStartLineForAddress(CU, Address.Address, Spec.FNKind, FunctionName, StartLine); @@ -1379,46 +1376,50 @@ class DWARFObjInMemory final : public DWARFObject { InfoSectionMap TypesDWOSections; DWARFSectionMap LocSection; - DWARFSectionMap LocListsSection; + DWARFSectionMap LoclistsSection; DWARFSectionMap LineSection; - DWARFSectionMap RangeSection; + DWARFSectionMap RangesSection; DWARFSectionMap RnglistsSection; - DWARFSectionMap StringOffsetSection; + DWARFSectionMap StrOffsetsSection; DWARFSectionMap LineDWOSection; + DWARFSectionMap FrameSection; + DWARFSectionMap EHFrameSection; DWARFSectionMap LocDWOSection; - DWARFSectionMap StringOffsetDWOSection; - DWARFSectionMap RangeDWOSection; + DWARFSectionMap StrOffsetsDWOSection; + DWARFSectionMap RangesDWOSection; DWARFSectionMap RnglistsDWOSection; DWARFSectionMap AddrSection; DWARFSectionMap AppleNamesSection; DWARFSectionMap AppleTypesSection; DWARFSectionMap AppleNamespacesSection; DWARFSectionMap AppleObjCSection; - DWARFSectionMap DebugNamesSection; - DWARFSectionMap PubNamesSection; - DWARFSectionMap PubTypesSection; - DWARFSectionMap GnuPubNamesSection; - DWARFSectionMap GnuPubTypesSection; + DWARFSectionMap NamesSection; + DWARFSectionMap PubnamesSection; + DWARFSectionMap PubtypesSection; + DWARFSectionMap GnuPubnamesSection; + DWARFSectionMap GnuPubtypesSection; DWARFSectionMap *mapNameToDWARFSection(StringRef Name) { return StringSwitch(Name) .Case("debug_loc", &LocSection) - .Case("debug_loclists", &LocListsSection) + .Case("debug_loclists", &LoclistsSection) .Case("debug_line", &LineSection) - .Case("debug_str_offsets", &StringOffsetSection) - .Case("debug_ranges", &RangeSection) + .Case("debug_frame", &FrameSection) + .Case("eh_frame", &EHFrameSection) + .Case("debug_str_offsets", &StrOffsetsSection) + .Case("debug_ranges", &RangesSection) .Case("debug_rnglists", &RnglistsSection) .Case("debug_loc.dwo", &LocDWOSection) .Case("debug_line.dwo", &LineDWOSection) - .Case("debug_names", &DebugNamesSection) + .Case("debug_names", &NamesSection) .Case("debug_rnglists.dwo", &RnglistsDWOSection) - .Case("debug_str_offsets.dwo", &StringOffsetDWOSection) + .Case("debug_str_offsets.dwo", &StrOffsetsDWOSection) .Case("debug_addr", &AddrSection) .Case("apple_names", &AppleNamesSection) - .Case("debug_pubnames", &PubNamesSection) - .Case("debug_pubtypes", &PubTypesSection) - .Case("debug_gnu_pubnames", &GnuPubNamesSection) - .Case("debug_gnu_pubtypes", &GnuPubTypesSection) + .Case("debug_pubnames", &PubnamesSection) + .Case("debug_pubtypes", &PubtypesSection) + .Case("debug_gnu_pubnames", &GnuPubnamesSection) + .Case("debug_gnu_pubtypes", &GnuPubtypesSection) .Case("apple_types", &AppleTypesSection) .Case("apple_namespaces", &AppleNamespacesSection) .Case("apple_namespac", &AppleNamespacesSection) @@ -1427,17 +1428,15 @@ class DWARFObjInMemory final : public DWARFObject { } StringRef AbbrevSection; - StringRef ARangeSection; - StringRef DebugFrameSection; - StringRef EHFrameSection; - StringRef StringSection; + StringRef ArangesSection; + StringRef StrSection; StringRef MacinfoSection; StringRef AbbrevDWOSection; - StringRef StringDWOSection; + StringRef StrDWOSection; StringRef CUIndexSection; StringRef GdbIndexSection; StringRef TUIndexSection; - StringRef 
LineStringSection; + StringRef LineStrSection; // A deque holding section data whose iterators are not invalidated when // new decompressed sections are inserted at the end. @@ -1448,17 +1447,15 @@ class DWARFObjInMemory final : public DWARFObject { return &Sec->Data; return StringSwitch(Name) .Case("debug_abbrev", &AbbrevSection) - .Case("debug_aranges", &ARangeSection) - .Case("debug_frame", &DebugFrameSection) - .Case("eh_frame", &EHFrameSection) - .Case("debug_str", &StringSection) + .Case("debug_aranges", &ArangesSection) + .Case("debug_str", &StrSection) .Case("debug_macinfo", &MacinfoSection) .Case("debug_abbrev.dwo", &AbbrevDWOSection) - .Case("debug_str.dwo", &StringDWOSection) + .Case("debug_str.dwo", &StrDWOSection) .Case("debug_cu_index", &CUIndexSection) .Case("debug_tu_index", &TUIndexSection) .Case("gdb_index", &GdbIndexSection) - .Case("debug_line_str", &LineStringSection) + .Case("debug_line_str", &LineStrSection) // Any more debug info sections go here. .Default(nullptr); } @@ -1513,7 +1510,11 @@ public: StringMap SectionAmountMap; for (const SectionRef &Section : Obj.sections()) { StringRef Name; - Section.getName(Name); + if (auto NameOrErr = Section.getName()) + Name = *NameOrErr; + else + consumeError(NameOrErr.takeError()); + ++SectionAmountMap[Name]; SectionNames.push_back({ Name, true }); @@ -1526,10 +1527,19 @@ public: continue; StringRef Data; - section_iterator RelocatedSection = Section.getRelocatedSection(); + Expected SecOrErr = Section.getRelocatedSection(); + if (!SecOrErr) { + ErrorPolicy EP = HandleError(createError( + "failed to get relocated section: ", SecOrErr.takeError())); + if (EP == ErrorPolicy::Halt) + return; + continue; + } + // Try to obtain an already relocated version of this section. // Else use the unrelocated section from the object file. We'll have to // apply relocations ourselves later. + section_iterator RelocatedSection = *SecOrErr; if (!L || !L->getLoadedSectionContents(*RelocatedSection, Data)) { Expected E = Section.getContents(); if (E) @@ -1560,7 +1570,7 @@ public: *SectionData = Data; if (Name == "debug_ranges") { // FIXME: Use the other dwo range section when we emit it. - RangeDWOSection.Data = Data; + RangesDWOSection.Data = Data; } } else if (Name == "debug_info") { // Find debug_info and debug_types data by section rather than name as @@ -1578,12 +1588,15 @@ public: continue; StringRef RelSecName; - StringRef RelSecData; - RelocatedSection->getName(RelSecName); + if (auto NameOrErr = RelocatedSection->getName()) + RelSecName = *NameOrErr; + else + consumeError(NameOrErr.takeError()); // If the section we're relocating was relocated already by the JIT, // then we used the relocated version above, so we do not need to process // relocations for it now. 
+ StringRef RelSecData; if (L && L->getLoadedSectionContents(*RelocatedSection, RelSecData)) continue; @@ -1710,12 +1723,12 @@ public: const DWARFSection &getLocDWOSection() const override { return LocDWOSection; } - StringRef getStringDWOSection() const override { return StringDWOSection; } - const DWARFSection &getStringOffsetDWOSection() const override { - return StringOffsetDWOSection; + StringRef getStrDWOSection() const override { return StrDWOSection; } + const DWARFSection &getStrOffsetsDWOSection() const override { + return StrOffsetsDWOSection; } - const DWARFSection &getRangeDWOSection() const override { - return RangeDWOSection; + const DWARFSection &getRangesDWOSection() const override { + return RangesDWOSection; } const DWARFSection &getRnglistsDWOSection() const override { return RnglistsDWOSection; @@ -1726,10 +1739,10 @@ public: StringRef getTUIndexSection() const override { return TUIndexSection; } // DWARF v5 - const DWARFSection &getStringOffsetSection() const override { - return StringOffsetSection; + const DWARFSection &getStrOffsetsSection() const override { + return StrOffsetsSection; } - StringRef getLineStringSection() const override { return LineStringSection; } + StringRef getLineStrSection() const override { return LineStrSection; } // Sections for DWARF5 split dwarf proposal. void forEachInfoDWOSections( @@ -1745,24 +1758,28 @@ public: StringRef getAbbrevSection() const override { return AbbrevSection; } const DWARFSection &getLocSection() const override { return LocSection; } - const DWARFSection &getLoclistsSection() const override { return LocListsSection; } - StringRef getARangeSection() const override { return ARangeSection; } - StringRef getDebugFrameSection() const override { return DebugFrameSection; } - StringRef getEHFrameSection() const override { return EHFrameSection; } + const DWARFSection &getLoclistsSection() const override { return LoclistsSection; } + StringRef getArangesSection() const override { return ArangesSection; } + const DWARFSection &getFrameSection() const override { + return FrameSection; + } + const DWARFSection &getEHFrameSection() const override { + return EHFrameSection; + } const DWARFSection &getLineSection() const override { return LineSection; } - StringRef getStringSection() const override { return StringSection; } - const DWARFSection &getRangeSection() const override { return RangeSection; } + StringRef getStrSection() const override { return StrSection; } + const DWARFSection &getRangesSection() const override { return RangesSection; } const DWARFSection &getRnglistsSection() const override { return RnglistsSection; } StringRef getMacinfoSection() const override { return MacinfoSection; } - const DWARFSection &getPubNamesSection() const override { return PubNamesSection; } - const DWARFSection &getPubTypesSection() const override { return PubTypesSection; } - const DWARFSection &getGnuPubNamesSection() const override { - return GnuPubNamesSection; + const DWARFSection &getPubnamesSection() const override { return PubnamesSection; } + const DWARFSection &getPubtypesSection() const override { return PubtypesSection; } + const DWARFSection &getGnuPubnamesSection() const override { + return GnuPubnamesSection; } - const DWARFSection &getGnuPubTypesSection() const override { - return GnuPubTypesSection; + const DWARFSection &getGnuPubtypesSection() const override { + return GnuPubtypesSection; } const DWARFSection &getAppleNamesSection() const override { return AppleNamesSection; @@ -1776,8 +1793,8 @@ public: const 
DWARFSection &getAppleObjCSection() const override { return AppleObjCSection; } - const DWARFSection &getDebugNamesSection() const override { - return DebugNamesSection; + const DWARFSection &getNamesSection() const override { + return NamesSection; } StringRef getFileName() const override { return FileName; } @@ -1799,16 +1816,16 @@ std::unique_ptr DWARFContext::create(const object::ObjectFile &Obj, const LoadedObjectInfo *L, function_ref HandleError, std::string DWPName) { - auto DObj = llvm::make_unique(Obj, L, HandleError); - return llvm::make_unique(std::move(DObj), std::move(DWPName)); + auto DObj = std::make_unique(Obj, L, HandleError); + return std::make_unique(std::move(DObj), std::move(DWPName)); } std::unique_ptr DWARFContext::create(const StringMap> &Sections, uint8_t AddrSize, bool isLittleEndian) { auto DObj = - llvm::make_unique(Sections, AddrSize, isLittleEndian); - return llvm::make_unique(std::move(DObj), ""); + std::make_unique(Sections, AddrSize, isLittleEndian); + return std::make_unique(std::move(DObj), ""); } Error DWARFContext::loadRegisterInfo(const object::ObjectFile &Obj) { diff --git a/lib/DebugInfo/DWARF/DWARFDataExtractor.cpp b/lib/DebugInfo/DWARF/DWARFDataExtractor.cpp index b9adf8cb1d99..53e676bc7031 100644 --- a/lib/DebugInfo/DWARF/DWARFDataExtractor.cpp +++ b/lib/DebugInfo/DWARF/DWARFDataExtractor.cpp @@ -12,14 +12,15 @@ using namespace llvm; -uint64_t DWARFDataExtractor::getRelocatedValue(uint32_t Size, uint32_t *Off, - uint64_t *SecNdx) const { +uint64_t DWARFDataExtractor::getRelocatedValue(uint32_t Size, uint64_t *Off, + uint64_t *SecNdx, + Error *Err) const { if (SecNdx) *SecNdx = object::SectionedAddress::UndefSection; if (!Section) - return getUnsigned(Off, Size); + return getUnsigned(Off, Size, Err); Optional E = Obj->find(*Section, *Off); - uint64_t A = getUnsigned(Off, Size); + uint64_t A = getUnsigned(Off, Size, Err); if (!E) return A; if (SecNdx) @@ -31,13 +32,13 @@ uint64_t DWARFDataExtractor::getRelocatedValue(uint32_t Size, uint32_t *Off, } Optional -DWARFDataExtractor::getEncodedPointer(uint32_t *Offset, uint8_t Encoding, +DWARFDataExtractor::getEncodedPointer(uint64_t *Offset, uint8_t Encoding, uint64_t PCRelOffset) const { if (Encoding == dwarf::DW_EH_PE_omit) return None; uint64_t Result = 0; - uint32_t OldOffset = *Offset; + uint64_t OldOffset = *Offset; // First get value switch (Encoding & 0x0F) { case dwarf::DW_EH_PE_absptr: diff --git a/lib/DebugInfo/DWARF/DWARFDebugAbbrev.cpp b/lib/DebugInfo/DWARF/DWARFDebugAbbrev.cpp index 31b324e5eb27..4afac2f99503 100644 --- a/lib/DebugInfo/DWARF/DWARFDebugAbbrev.cpp +++ b/lib/DebugInfo/DWARF/DWARFDebugAbbrev.cpp @@ -26,9 +26,9 @@ void DWARFAbbreviationDeclarationSet::clear() { } bool DWARFAbbreviationDeclarationSet::extract(DataExtractor Data, - uint32_t *OffsetPtr) { + uint64_t *OffsetPtr) { clear(); - const uint32_t BeginOffset = *OffsetPtr; + const uint64_t BeginOffset = *OffsetPtr; Offset = BeginOffset; DWARFAbbreviationDeclaration AbbrDecl; uint32_t PrevAbbrCode = 0; @@ -82,12 +82,12 @@ void DWARFDebugAbbrev::extract(DataExtractor Data) { void DWARFDebugAbbrev::parse() const { if (!Data) return; - uint32_t Offset = 0; + uint64_t Offset = 0; auto I = AbbrDeclSets.begin(); while (Data->isValidOffset(Offset)) { while (I != AbbrDeclSets.end() && I->first < Offset) ++I; - uint32_t CUAbbrOffset = Offset; + uint64_t CUAbbrOffset = Offset; DWARFAbbreviationDeclarationSet AbbrDecls; if (!AbbrDecls.extract(*Data, &Offset)) break; @@ -124,7 +124,7 @@ 
DWARFDebugAbbrev::getAbbreviationDeclarationSet(uint64_t CUAbbrOffset) const { } if (Data && CUAbbrOffset < Data->getData().size()) { - uint32_t Offset = CUAbbrOffset; + uint64_t Offset = CUAbbrOffset; DWARFAbbreviationDeclarationSet AbbrDecls; if (!AbbrDecls.extract(*Data, &Offset)) return nullptr; diff --git a/lib/DebugInfo/DWARF/DWARFDebugAddr.cpp b/lib/DebugInfo/DWARF/DWARFDebugAddr.cpp index 58626539bba4..f71543799e28 100644 --- a/lib/DebugInfo/DWARF/DWARFDebugAddr.cpp +++ b/lib/DebugInfo/DWARF/DWARFDebugAddr.cpp @@ -19,7 +19,7 @@ void DWARFDebugAddrTable::clear() { } Error DWARFDebugAddrTable::extract(DWARFDataExtractor Data, - uint32_t *OffsetPtr, + uint64_t *OffsetPtr, uint16_t Version, uint8_t AddrSize, std::function WarnCallback) { @@ -30,7 +30,7 @@ Error DWARFDebugAddrTable::extract(DWARFDataExtractor Data, return createStringError(errc::invalid_argument, "section is not large enough to contain a " ".debug_addr table length at offset 0x%" - PRIx32, *OffsetPtr); + PRIx64, *OffsetPtr); uint16_t UnitVersion; if (Version == 0) { WarnCallback(createStringError(errc::invalid_argument, @@ -44,28 +44,28 @@ Error DWARFDebugAddrTable::extract(DWARFDataExtractor Data, Format = dwarf::DwarfFormat::DWARF32; if (UnitVersion >= 5) { HeaderData.Length = Data.getU32(OffsetPtr); - if (HeaderData.Length == 0xffffffffu) { + if (HeaderData.Length == dwarf::DW_LENGTH_DWARF64) { invalidateLength(); return createStringError(errc::not_supported, - "DWARF64 is not supported in .debug_addr at offset 0x%" PRIx32, + "DWARF64 is not supported in .debug_addr at offset 0x%" PRIx64, HeaderOffset); } if (HeaderData.Length + sizeof(uint32_t) < sizeof(Header)) { uint32_t TmpLength = getLength(); invalidateLength(); return createStringError(errc::invalid_argument, - ".debug_addr table at offset 0x%" PRIx32 + ".debug_addr table at offset 0x%" PRIx64 " has too small length (0x%" PRIx32 ") to contain a complete header", HeaderOffset, TmpLength); } - uint32_t End = HeaderOffset + getLength(); + uint64_t End = HeaderOffset + getLength(); if (!Data.isValidOffsetForDataOfSize(HeaderOffset, End - HeaderOffset)) { uint32_t TmpLength = getLength(); invalidateLength(); return createStringError(errc::invalid_argument, "section is not large enough to contain a .debug_addr table " - "of length 0x%" PRIx32 " at offset 0x%" PRIx32, + "of length 0x%" PRIx32 " at offset 0x%" PRIx64, TmpLength, HeaderOffset); } @@ -88,7 +88,7 @@ Error DWARFDebugAddrTable::extract(DWARFDataExtractor Data, // and consists only of a series of addresses. if (HeaderData.Version > 5) { return createStringError(errc::not_supported, "version %" PRIu16 - " of .debug_addr section at offset 0x%" PRIx32 " is not supported", + " of .debug_addr section at offset 0x%" PRIx64 " is not supported", HeaderData.Version, HeaderOffset); } // FIXME: For now we just treat version mismatch as an error, @@ -97,19 +97,19 @@ Error DWARFDebugAddrTable::extract(DWARFDataExtractor Data, // attribute in the info table. 
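// ---[ Illustrative sketch, not part of the upstream patch ]------------------
// With section offsets widened to uint64_t, the diagnostics above switch from
// PRIx32 to PRIx64. A minimal error helper in the same style; the table name
// and message are assumed examples, not the exact upstream wording.
#include "llvm/Support/Errc.h"
#include "llvm/Support/Error.h"
#include <cinttypes>
#include <cstdint>

static llvm::Error makeOffsetError(uint64_t HeaderOffset) {
  return llvm::createStringError(
      llvm::errc::invalid_argument,
      ".debug_addr table at offset 0x%" PRIx64 " could not be parsed",
      HeaderOffset);
}
// ---[ end sketch ]------------------------------------------------------------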
if (HeaderData.Version != UnitVersion) return createStringError(errc::invalid_argument, - ".debug_addr table at offset 0x%" PRIx32 + ".debug_addr table at offset 0x%" PRIx64 " has version %" PRIu16 " which is different from the version suggested" " by the DWARF unit header: %" PRIu16, HeaderOffset, HeaderData.Version, UnitVersion); if (HeaderData.AddrSize != 4 && HeaderData.AddrSize != 8) return createStringError(errc::not_supported, - ".debug_addr table at offset 0x%" PRIx32 + ".debug_addr table at offset 0x%" PRIx64 " has unsupported address size %" PRIu8, HeaderOffset, HeaderData.AddrSize); if (HeaderData.AddrSize != AddrSize && AddrSize != 0) return createStringError(errc::invalid_argument, - ".debug_addr table at offset 0x%" PRIx32 + ".debug_addr table at offset 0x%" PRIx64 " has address size %" PRIu8 " which is different from CU address size %" PRIu8, HeaderOffset, HeaderData.AddrSize, AddrSize); @@ -117,13 +117,13 @@ Error DWARFDebugAddrTable::extract(DWARFDataExtractor Data, // TODO: add support for non-zero segment selector size. if (HeaderData.SegSize != 0) return createStringError(errc::not_supported, - ".debug_addr table at offset 0x%" PRIx32 + ".debug_addr table at offset 0x%" PRIx64 " has unsupported segment selector size %" PRIu8, HeaderOffset, HeaderData.SegSize); if (DataSize % HeaderData.AddrSize != 0) { invalidateLength(); return createStringError(errc::invalid_argument, - ".debug_addr table at offset 0x%" PRIx32 + ".debug_addr table at offset 0x%" PRIx64 " contains data of size %" PRIu32 " which is not a multiple of addr size %" PRIu8, HeaderOffset, DataSize, HeaderData.AddrSize); @@ -162,7 +162,7 @@ Expected DWARFDebugAddrTable::getAddrEntry(uint32_t Index) const { return Addrs[Index]; return createStringError(errc::invalid_argument, "Index %" PRIu32 " is out of range of the " - ".debug_addr table at offset 0x%" PRIx32, + ".debug_addr table at offset 0x%" PRIx64, Index, HeaderOffset); } diff --git a/lib/DebugInfo/DWARF/DWARFDebugArangeSet.cpp b/lib/DebugInfo/DWARF/DWARFDebugArangeSet.cpp index 6551b61accb8..200b2d52a02b 100644 --- a/lib/DebugInfo/DWARF/DWARFDebugArangeSet.cpp +++ b/lib/DebugInfo/DWARF/DWARFDebugArangeSet.cpp @@ -24,13 +24,13 @@ void DWARFDebugArangeSet::Descriptor::dump(raw_ostream &OS, } void DWARFDebugArangeSet::clear() { - Offset = -1U; + Offset = -1ULL; std::memset(&HeaderData, 0, sizeof(Header)); ArangeDescriptors.clear(); } bool -DWARFDebugArangeSet::extract(DataExtractor data, uint32_t *offset_ptr) { +DWARFDebugArangeSet::extract(DataExtractor data, uint64_t *offset_ptr) { if (data.isValidOffset(*offset_ptr)) { ArangeDescriptors.clear(); Offset = *offset_ptr; diff --git a/lib/DebugInfo/DWARF/DWARFDebugAranges.cpp b/lib/DebugInfo/DWARF/DWARFDebugAranges.cpp index 6460c9feeab8..ca6043109cdb 100644 --- a/lib/DebugInfo/DWARF/DWARFDebugAranges.cpp +++ b/lib/DebugInfo/DWARF/DWARFDebugAranges.cpp @@ -23,11 +23,11 @@ using namespace llvm; void DWARFDebugAranges::extract(DataExtractor DebugArangesData) { if (!DebugArangesData.isValidOffset(0)) return; - uint32_t Offset = 0; + uint64_t Offset = 0; DWARFDebugArangeSet Set; while (Set.extract(DebugArangesData, &Offset)) { - uint32_t CUOffset = Set.getCompileUnitDIEOffset(); + uint64_t CUOffset = Set.getCompileUnitDIEOffset(); for (const auto &Desc : Set.descriptors()) { uint64_t LowPC = Desc.Address; uint64_t HighPC = Desc.getEndAddress(); @@ -43,7 +43,7 @@ void DWARFDebugAranges::generate(DWARFContext *CTX) { return; // Extract aranges from .debug_aranges section. 
- DataExtractor ArangesData(CTX->getDWARFObj().getARangeSection(), + DataExtractor ArangesData(CTX->getDWARFObj().getArangesSection(), CTX->isLittleEndian(), 0); extract(ArangesData); @@ -51,7 +51,7 @@ void DWARFDebugAranges::generate(DWARFContext *CTX) { // it may describe only a small subset of compilation units, so we need to // manually build aranges for the rest of them. for (const auto &CU : CTX->compile_units()) { - uint32_t CUOffset = CU->getOffset(); + uint64_t CUOffset = CU->getOffset(); if (ParsedCUOffsets.insert(CUOffset).second) { Expected CURanges = CU->collectAddressRanges(); if (!CURanges) @@ -71,7 +71,7 @@ void DWARFDebugAranges::clear() { ParsedCUOffsets.clear(); } -void DWARFDebugAranges::appendRange(uint32_t CUOffset, uint64_t LowPC, +void DWARFDebugAranges::appendRange(uint64_t CUOffset, uint64_t LowPC, uint64_t HighPC) { if (LowPC >= HighPC) return; @@ -80,7 +80,7 @@ void DWARFDebugAranges::appendRange(uint32_t CUOffset, uint64_t LowPC, } void DWARFDebugAranges::construct() { - std::multiset ValidCUs; // Maintain the set of CUs describing + std::multiset ValidCUs; // Maintain the set of CUs describing // a current address range. llvm::sort(Endpoints); uint64_t PrevAddress = -1ULL; diff --git a/lib/DebugInfo/DWARF/DWARFDebugFrame.cpp b/lib/DebugInfo/DWARF/DWARFDebugFrame.cpp index b3f23366f2a2..81b00f65741b 100644 --- a/lib/DebugInfo/DWARF/DWARFDebugFrame.cpp +++ b/lib/DebugInfo/DWARF/DWARFDebugFrame.cpp @@ -34,10 +34,10 @@ using namespace dwarf; const uint8_t DWARF_CFI_PRIMARY_OPCODE_MASK = 0xc0; const uint8_t DWARF_CFI_PRIMARY_OPERAND_MASK = 0x3f; -Error CFIProgram::parse(DataExtractor Data, uint32_t *Offset, - uint32_t EndOffset) { +Error CFIProgram::parse(DWARFDataExtractor Data, uint64_t *Offset, + uint64_t EndOffset) { while (*Offset < EndOffset) { - uint8_t Opcode = Data.getU8(Offset); + uint8_t Opcode = Data.getRelocatedValue(1, Offset); // Some instructions have a primary opcode encoded in the top bits. uint8_t Primary = Opcode & DWARF_CFI_PRIMARY_OPCODE_MASK; @@ -74,19 +74,19 @@ Error CFIProgram::parse(DataExtractor Data, uint32_t *Offset, break; case DW_CFA_set_loc: // Operands: Address - addInstruction(Opcode, Data.getAddress(Offset)); + addInstruction(Opcode, Data.getRelocatedAddress(Offset)); break; case DW_CFA_advance_loc1: // Operands: 1-byte delta - addInstruction(Opcode, Data.getU8(Offset)); + addInstruction(Opcode, Data.getRelocatedValue(1, Offset)); break; case DW_CFA_advance_loc2: // Operands: 2-byte delta - addInstruction(Opcode, Data.getU16(Offset)); + addInstruction(Opcode, Data.getRelocatedValue(2, Offset)); break; case DW_CFA_advance_loc4: // Operands: 4-byte delta - addInstruction(Opcode, Data.getU32(Offset)); + addInstruction(Opcode, Data.getRelocatedValue(4, Offset)); break; case DW_CFA_restore_extended: case DW_CFA_undefined: @@ -331,7 +331,7 @@ DWARFDebugFrame::DWARFDebugFrame(Triple::ArchType Arch, DWARFDebugFrame::~DWARFDebugFrame() = default; static void LLVM_ATTRIBUTE_UNUSED dumpDataAux(DataExtractor Data, - uint32_t Offset, int Length) { + uint64_t Offset, int Length) { errs() << "DUMP: "; for (int i = 0; i < Length; ++i) { uint8_t c = Data.getU8(&Offset); @@ -344,7 +344,7 @@ static void LLVM_ATTRIBUTE_UNUSED dumpDataAux(DataExtractor Data, // noreturn attribute usage in lambdas. Once the support for those // compilers are phased out, we can remove this and return back to // a ReportError lambda: [StartOffset](const char *ErrorMsg). 
-static void LLVM_ATTRIBUTE_NORETURN ReportError(uint32_t StartOffset, +static void LLVM_ATTRIBUTE_NORETURN ReportError(uint64_t StartOffset, const char *ErrorMsg) { std::string Str; raw_string_ostream OS(Str); @@ -354,32 +354,30 @@ static void LLVM_ATTRIBUTE_NORETURN ReportError(uint32_t StartOffset, } void DWARFDebugFrame::parse(DWARFDataExtractor Data) { - uint32_t Offset = 0; - DenseMap CIEs; + uint64_t Offset = 0; + DenseMap CIEs; while (Data.isValidOffset(Offset)) { - uint32_t StartOffset = Offset; + uint64_t StartOffset = Offset; bool IsDWARF64 = false; - uint64_t Length = Data.getU32(&Offset); + uint64_t Length = Data.getRelocatedValue(4, &Offset); uint64_t Id; - if (Length == UINT32_MAX) { + if (Length == dwarf::DW_LENGTH_DWARF64) { // DWARF-64 is distinguished by the first 32 bits of the initial length // field being 0xffffffff. Then, the next 64 bits are the actual entry // length. IsDWARF64 = true; - Length = Data.getU64(&Offset); + Length = Data.getRelocatedValue(8, &Offset); } // At this point, Offset points to the next field after Length. // Length is the structure size excluding itself. Compute an offset one // past the end of the structure (needed to know how many instructions to // read). - // TODO: For honest DWARF64 support, DataExtractor will have to treat - // offset_ptr as uint64_t* - uint32_t StartStructureOffset = Offset; - uint32_t EndStructureOffset = Offset + static_cast(Length); + uint64_t StartStructureOffset = Offset; + uint64_t EndStructureOffset = Offset + Length; // The Id field's size depends on the DWARF format Id = Data.getUnsigned(&Offset, (IsDWARF64 && !IsEH) ? 8 : 4); @@ -407,22 +405,23 @@ void DWARFDebugFrame::parse(DWARFDataExtractor Data) { Optional PersonalityEncoding; if (IsEH) { Optional AugmentationLength; - uint32_t StartAugmentationOffset; - uint32_t EndAugmentationOffset; + uint64_t StartAugmentationOffset; + uint64_t EndAugmentationOffset; // Walk the augmentation string to get all the augmentation data. for (unsigned i = 0, e = AugmentationString.size(); i != e; ++i) { switch (AugmentationString[i]) { default: - ReportError(StartOffset, - "Unknown augmentation character in entry at %lx"); + ReportError( + StartOffset, + "Unknown augmentation character in entry at %" PRIx64); case 'L': LSDAPointerEncoding = Data.getU8(&Offset); break; case 'P': { if (Personality) ReportError(StartOffset, - "Duplicate personality in entry at %lx"); + "Duplicate personality in entry at %" PRIx64); PersonalityEncoding = Data.getU8(&Offset); Personality = Data.getEncodedPointer( &Offset, *PersonalityEncoding, @@ -438,13 +437,12 @@ void DWARFDebugFrame::parse(DWARFDataExtractor Data) { case 'z': if (i) ReportError(StartOffset, - "'z' must be the first character at %lx"); + "'z' must be the first character at %" PRIx64); // Parse the augmentation length first. We only parse it if // the string contains a 'z'. 
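// ---[ Illustrative sketch, not part of the upstream patch ]------------------
// The frame parser above now reads the initial length through the relocating
// extractor and tests it against dwarf::DW_LENGTH_DWARF64 rather than a bare
// 0xffffffff. A minimal initial-length reader in that style; `Data` and
// `Offset` are assumed inputs.
#include "llvm/BinaryFormat/Dwarf.h"
#include "llvm/DebugInfo/DWARF/DWARFDataExtractor.h"
#include <cstdint>

static uint64_t readInitialLength(const llvm::DWARFDataExtractor &Data,
                                  uint64_t *Offset, bool &IsDWARF64) {
  uint64_t Length = Data.getRelocatedValue(4, Offset);
  IsDWARF64 = (Length == llvm::dwarf::DW_LENGTH_DWARF64);
  if (IsDWARF64)
    // The 32-bit 0xffffffff escape marks DWARF64; the real length follows.
    Length = Data.getRelocatedValue(8, Offset);
  return Length;
}
// ---[ end sketch ]------------------------------------------------------------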
AugmentationLength = Data.getULEB128(&Offset); StartAugmentationOffset = Offset; - EndAugmentationOffset = Offset + - static_cast(*AugmentationLength); + EndAugmentationOffset = Offset + *AugmentationLength; break; case 'B': // B-Key is used for signing functions associated with this @@ -455,14 +453,15 @@ void DWARFDebugFrame::parse(DWARFDataExtractor Data) { if (AugmentationLength.hasValue()) { if (Offset != EndAugmentationOffset) - ReportError(StartOffset, "Parsing augmentation data at %lx failed"); + ReportError(StartOffset, + "Parsing augmentation data at %" PRIx64 " failed"); AugmentationData = Data.getData().slice(StartAugmentationOffset, EndAugmentationOffset); } } - auto Cie = llvm::make_unique( + auto Cie = std::make_unique( StartOffset, Length, Version, AugmentationString, AddressSize, SegmentDescriptorSize, CodeAlignmentFactor, DataAlignmentFactor, ReturnAddressRegister, AugmentationData, FDEPointerEncoding, @@ -480,8 +479,8 @@ void DWARFDebugFrame::parse(DWARFDataExtractor Data) { if (IsEH) { // The address size is encoded in the CIE we reference. if (!Cie) - ReportError(StartOffset, - "Parsing FDE data at %lx failed due to missing CIE"); + ReportError(StartOffset, "Parsing FDE data at %" PRIx64 + " failed due to missing CIE"); if (auto Val = Data.getEncodedPointer( &Offset, Cie->getFDEPointerEncoding(), @@ -498,8 +497,7 @@ void DWARFDebugFrame::parse(DWARFDataExtractor Data) { // Parse the augmentation length and data for this FDE. uint64_t AugmentationLength = Data.getULEB128(&Offset); - uint32_t EndAugmentationOffset = - Offset + static_cast(AugmentationLength); + uint64_t EndAugmentationOffset = Offset + AugmentationLength; // Decode the LSDA if the CIE augmentation string said we should. if (Cie->getLSDAPointerEncoding() != DW_EH_PE_omit) { @@ -509,11 +507,12 @@ void DWARFDebugFrame::parse(DWARFDataExtractor Data) { } if (Offset != EndAugmentationOffset) - ReportError(StartOffset, "Parsing augmentation data at %lx failed"); + ReportError(StartOffset, + "Parsing augmentation data at %" PRIx64 " failed"); } } else { - InitialLocation = Data.getAddress(&Offset); - AddressRange = Data.getAddress(&Offset); + InitialLocation = Data.getRelocatedAddress(&Offset); + AddressRange = Data.getRelocatedAddress(&Offset); } Entries.emplace_back(new FDE(StartOffset, Length, CIEPointer, @@ -527,7 +526,8 @@ void DWARFDebugFrame::parse(DWARFDataExtractor Data) { } if (Offset != EndStructureOffset) - ReportError(StartOffset, "Parsing entry instructions at %lx failed"); + ReportError(StartOffset, + "Parsing entry instructions at %" PRIx64 " failed"); } } diff --git a/lib/DebugInfo/DWARF/DWARFDebugInfoEntry.cpp b/lib/DebugInfo/DWARF/DWARFDebugInfoEntry.cpp index d8a755e90df4..87eab34d58ee 100644 --- a/lib/DebugInfo/DWARF/DWARFDebugInfoEntry.cpp +++ b/lib/DebugInfo/DWARF/DWARFDebugInfoEntry.cpp @@ -19,15 +19,15 @@ using namespace llvm; using namespace dwarf; bool DWARFDebugInfoEntry::extractFast(const DWARFUnit &U, - uint32_t *OffsetPtr) { + uint64_t *OffsetPtr) { DWARFDataExtractor DebugInfoData = U.getDebugInfoExtractor(); - const uint32_t UEndOffset = U.getNextUnitOffset(); + const uint64_t UEndOffset = U.getNextUnitOffset(); return extractFast(U, OffsetPtr, DebugInfoData, UEndOffset, 0); } -bool DWARFDebugInfoEntry::extractFast(const DWARFUnit &U, uint32_t *OffsetPtr, +bool DWARFDebugInfoEntry::extractFast(const DWARFUnit &U, uint64_t *OffsetPtr, const DWARFDataExtractor &DebugInfoData, - uint32_t UEndOffset, uint32_t D) { + uint64_t UEndOffset, uint32_t D) { Offset = *OffsetPtr; Depth = D; 
if (Offset >= UEndOffset || !DebugInfoData.isValidOffset(Offset)) diff --git a/lib/DebugInfo/DWARF/DWARFDebugLine.cpp b/lib/DebugInfo/DWARF/DWARFDebugLine.cpp index a1cb1e8582ed..dbee28ff5ab1 100644 --- a/lib/DebugInfo/DWARF/DWARFDebugLine.cpp +++ b/lib/DebugInfo/DWARF/DWARFDebugLine.cpp @@ -16,7 +16,6 @@ #include "llvm/DebugInfo/DWARF/DWARFRelocMap.h" #include "llvm/Support/Errc.h" #include "llvm/Support/Format.h" -#include "llvm/Support/Path.h" #include "llvm/Support/WithColor.h" #include "llvm/Support/raw_ostream.h" #include @@ -156,7 +155,7 @@ void DWARFDebugLine::Prologue::dump(raw_ostream &OS, // Parse v2-v4 directory and file tables. static void parseV2DirFileTables(const DWARFDataExtractor &DebugLineData, - uint32_t *OffsetPtr, uint64_t EndPrologueOffset, + uint64_t *OffsetPtr, uint64_t EndPrologueOffset, DWARFDebugLine::ContentTypeTracker &ContentTypes, std::vector &IncludeDirectories, std::vector &FileNames) { @@ -187,18 +186,24 @@ parseV2DirFileTables(const DWARFDataExtractor &DebugLineData, } // Parse v5 directory/file entry content descriptions. -// Returns the descriptors, or an empty vector if we did not find a path or -// ran off the end of the prologue. -static ContentDescriptors -parseV5EntryFormat(const DWARFDataExtractor &DebugLineData, uint32_t - *OffsetPtr, uint64_t EndPrologueOffset, DWARFDebugLine::ContentTypeTracker - *ContentTypes) { +// Returns the descriptors, or an error if we did not find a path or ran off +// the end of the prologue. +static llvm::Expected +parseV5EntryFormat(const DWARFDataExtractor &DebugLineData, uint64_t *OffsetPtr, + uint64_t EndPrologueOffset, + DWARFDebugLine::ContentTypeTracker *ContentTypes) { ContentDescriptors Descriptors; int FormatCount = DebugLineData.getU8(OffsetPtr); bool HasPath = false; for (int I = 0; I != FormatCount; ++I) { if (*OffsetPtr >= EndPrologueOffset) - return ContentDescriptors(); + return createStringError( + errc::invalid_argument, + "failed to parse entry content descriptions at offset " + "0x%8.8" PRIx64 + " because offset extends beyond the prologue end at offset " + "0x%8.8" PRIx64, + *OffsetPtr, EndPrologueOffset); ContentDescriptor Descriptor; Descriptor.Type = dwarf::LineNumberEntryFormat(DebugLineData.getULEB128(OffsetPtr)); @@ -209,60 +214,82 @@ parseV5EntryFormat(const DWARFDataExtractor &DebugLineData, uint32_t ContentTypes->trackContentType(Descriptor.Type); Descriptors.push_back(Descriptor); } - return HasPath ? Descriptors : ContentDescriptors(); + + if (!HasPath) + return createStringError(errc::invalid_argument, + "failed to parse entry content descriptions" + " because no path was found"); + return Descriptors; } -static bool +static Error parseV5DirFileTables(const DWARFDataExtractor &DebugLineData, - uint32_t *OffsetPtr, uint64_t EndPrologueOffset, + uint64_t *OffsetPtr, uint64_t EndPrologueOffset, const dwarf::FormParams &FormParams, const DWARFContext &Ctx, const DWARFUnit *U, DWARFDebugLine::ContentTypeTracker &ContentTypes, std::vector &IncludeDirectories, std::vector &FileNames) { // Get the directory entry description. - ContentDescriptors DirDescriptors = + llvm::Expected DirDescriptors = parseV5EntryFormat(DebugLineData, OffsetPtr, EndPrologueOffset, nullptr); - if (DirDescriptors.empty()) - return false; + if (!DirDescriptors) + return DirDescriptors.takeError(); // Get the directory entries, according to the format described above. 
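// ---[ Illustrative sketch, not part of the upstream patch ]------------------
// parseV5EntryFormat() now returns llvm::Expected<ContentDescriptors> instead
// of signalling failure with an empty vector: callers test the Expected,
// forward failures with takeError(), and dereference it on success. The same
// shape with assumed parser and element types:
#include "llvm/Support/Error.h"
#include <cstdint>
#include <vector>

static llvm::Expected<std::vector<uint64_t>> parseEntries() {
  // Assumed stand-in for a real parser such as parseV5EntryFormat().
  return std::vector<uint64_t>{1, 2, 3};
}

static llvm::Error parseTables() {
  llvm::Expected<std::vector<uint64_t>> Entries = parseEntries();
  if (!Entries)
    return Entries.takeError();   // propagate the cause instead of dropping it
  for (uint64_t E : *Entries)
    (void)E;                      // consume the parsed values
  return llvm::Error::success();
}
// ---[ end sketch ]------------------------------------------------------------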
int DirEntryCount = DebugLineData.getU8(OffsetPtr); for (int I = 0; I != DirEntryCount; ++I) { if (*OffsetPtr >= EndPrologueOffset) - return false; - for (auto Descriptor : DirDescriptors) { + return createStringError( + errc::invalid_argument, + "failed to parse directory entry at offset " + "0x%8.8" PRIx64 + " because offset extends beyond the prologue end at offset " + "0x%8.8" PRIx64, + *OffsetPtr, EndPrologueOffset); + for (auto Descriptor : *DirDescriptors) { DWARFFormValue Value(Descriptor.Form); switch (Descriptor.Type) { case DW_LNCT_path: if (!Value.extractValue(DebugLineData, OffsetPtr, FormParams, &Ctx, U)) - return false; + return createStringError(errc::invalid_argument, + "failed to parse directory entry because " + "extracting the form value failed."); IncludeDirectories.push_back(Value); break; default: if (!Value.skipValue(DebugLineData, OffsetPtr, FormParams)) - return false; + return createStringError(errc::invalid_argument, + "failed to parse directory entry because " + "skipping the form value failed."); } } } // Get the file entry description. - ContentDescriptors FileDescriptors = - parseV5EntryFormat(DebugLineData, OffsetPtr, EndPrologueOffset, - &ContentTypes); - if (FileDescriptors.empty()) - return false; + llvm::Expected FileDescriptors = parseV5EntryFormat( + DebugLineData, OffsetPtr, EndPrologueOffset, &ContentTypes); + if (!FileDescriptors) + return FileDescriptors.takeError(); // Get the file entries, according to the format described above. int FileEntryCount = DebugLineData.getU8(OffsetPtr); for (int I = 0; I != FileEntryCount; ++I) { if (*OffsetPtr >= EndPrologueOffset) - return false; + return createStringError( + errc::invalid_argument, + "failed to parse file entry at offset " + "0x%8.8" PRIx64 + " because offset extends beyond the prologue end at offset " + "0x%8.8" PRIx64, + *OffsetPtr, EndPrologueOffset); DWARFDebugLine::FileNameEntry FileEntry; - for (auto Descriptor : FileDescriptors) { + for (auto Descriptor : *FileDescriptors) { DWARFFormValue Value(Descriptor.Form); if (!Value.extractValue(DebugLineData, OffsetPtr, FormParams, &Ctx, U)) - return false; + return createStringError(errc::invalid_argument, + "failed to parse file entry because " + "extracting the form value failed."); switch (Descriptor.Type) { case DW_LNCT_path: FileEntry.Name = Value; @@ -280,7 +307,10 @@ parseV5DirFileTables(const DWARFDataExtractor &DebugLineData, FileEntry.Length = Value.getAsUnsignedConstant().getValue(); break; case DW_LNCT_MD5: - assert(Value.getAsBlock().getValue().size() == 16); + if (!Value.getAsBlock() || Value.getAsBlock().getValue().size() != 16) + return createStringError( + errc::invalid_argument, + "failed to parse file entry because the MD5 hash is invalid"); std::uninitialized_copy_n(Value.getAsBlock().getValue().begin(), 16, FileEntry.Checksum.Bytes.begin()); break; @@ -290,21 +320,21 @@ parseV5DirFileTables(const DWARFDataExtractor &DebugLineData, } FileNames.push_back(FileEntry); } - return true; + return Error::success(); } Error DWARFDebugLine::Prologue::parse(const DWARFDataExtractor &DebugLineData, - uint32_t *OffsetPtr, + uint64_t *OffsetPtr, const DWARFContext &Ctx, const DWARFUnit *U) { const uint64_t PrologueOffset = *OffsetPtr; clear(); TotalLength = DebugLineData.getRelocatedValue(4, OffsetPtr); - if (TotalLength == UINT32_MAX) { + if (TotalLength == dwarf::DW_LENGTH_DWARF64) { FormParams.Format = dwarf::DWARF64; TotalLength = DebugLineData.getU64(OffsetPtr); - } else if (TotalLength >= 0xfffffff0) { + } else if (TotalLength >= 
dwarf::DW_LENGTH_lo_reserved) { return createStringError(errc::invalid_argument, "parsing line table prologue at offset 0x%8.8" PRIx64 " unsupported reserved unit length found of value 0x%8.8" PRIx64, @@ -343,14 +373,17 @@ Error DWARFDebugLine::Prologue::parse(const DWARFDataExtractor &DebugLineData, } if (getVersion() >= 5) { - if (!parseV5DirFileTables(DebugLineData, OffsetPtr, EndPrologueOffset, - FormParams, Ctx, U, ContentTypes, - IncludeDirectories, FileNames)) { - return createStringError(errc::invalid_argument, - "parsing line table prologue at 0x%8.8" PRIx64 - " found an invalid directory or file table description at" - " 0x%8.8" PRIx64, - PrologueOffset, (uint64_t)*OffsetPtr); + if (Error e = parseV5DirFileTables( + DebugLineData, OffsetPtr, EndPrologueOffset, FormParams, Ctx, U, + ContentTypes, IncludeDirectories, FileNames)) { + return joinErrors( + createStringError( + errc::invalid_argument, + "parsing line table prologue at 0x%8.8" PRIx64 + " found an invalid directory or file table description at" + " 0x%8.8" PRIx64, + PrologueOffset, *OffsetPtr), + std::move(e)); } } else parseV2DirFileTables(DebugLineData, OffsetPtr, EndPrologueOffset, @@ -361,7 +394,7 @@ Error DWARFDebugLine::Prologue::parse(const DWARFDataExtractor &DebugLineData, "parsing line table prologue at 0x%8.8" PRIx64 " should have ended at 0x%8.8" PRIx64 " but it ended at 0x%8.8" PRIx64, - PrologueOffset, EndPrologueOffset, (uint64_t)*OffsetPtr); + PrologueOffset, EndPrologueOffset, *OffsetPtr); return Error::success(); } @@ -468,7 +501,7 @@ void DWARFDebugLine::ParsingState::appendRowToMatrix() { } const DWARFDebugLine::LineTable * -DWARFDebugLine::getLineTable(uint32_t Offset) const { +DWARFDebugLine::getLineTable(uint64_t Offset) const { LineTableConstIter Pos = LineTableMap.find(Offset); if (Pos != LineTableMap.end()) return &Pos->second; @@ -476,10 +509,10 @@ DWARFDebugLine::getLineTable(uint32_t Offset) const { } Expected DWARFDebugLine::getOrParseLineTable( - DWARFDataExtractor &DebugLineData, uint32_t Offset, const DWARFContext &Ctx, + DWARFDataExtractor &DebugLineData, uint64_t Offset, const DWARFContext &Ctx, const DWARFUnit *U, std::function RecoverableErrorCallback) { if (!DebugLineData.isValidOffset(Offset)) - return createStringError(errc::invalid_argument, "offset 0x%8.8" PRIx32 + return createStringError(errc::invalid_argument, "offset 0x%8.8" PRIx64 " is not a valid debug line section offset", Offset); @@ -496,10 +529,10 @@ Expected DWARFDebugLine::getOrParseLineTable( } Error DWARFDebugLine::LineTable::parse( - DWARFDataExtractor &DebugLineData, uint32_t *OffsetPtr, + DWARFDataExtractor &DebugLineData, uint64_t *OffsetPtr, const DWARFContext &Ctx, const DWARFUnit *U, std::function RecoverableErrorCallback, raw_ostream *OS) { - const uint32_t DebugLineOffset = *OffsetPtr; + const uint64_t DebugLineOffset = *OffsetPtr; clear(); @@ -515,7 +548,7 @@ Error DWARFDebugLine::LineTable::parse( if (PrologueErr) return PrologueErr; - const uint32_t EndOffset = + const uint64_t EndOffset = DebugLineOffset + Prologue.TotalLength + Prologue.sizeofTotalLength(); // See if we should tell the data extractor the address size. 
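// ---[ Illustrative sketch, not part of the upstream patch ]------------------
// The prologue parser above now keeps the inner directory/file-table failure
// attached to its own positional message via joinErrors() instead of
// replacing it. Minimal shape of that pattern; the message and offset are
// assumed examples.
#include "llvm/Support/Errc.h"
#include "llvm/Support/Error.h"
#include <cinttypes>
#include <cstdint>

static llvm::Error addPrologueContext(llvm::Error Inner,
                                      uint64_t PrologueOffset) {
  return llvm::joinErrors(
      llvm::createStringError(llvm::errc::invalid_argument,
                              "parsing line table prologue at 0x%8.8" PRIx64
                              " failed",
                              PrologueOffset),
      std::move(Inner));          // keep the underlying cause in the chain
}
// ---[ end sketch ]------------------------------------------------------------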
@@ -529,7 +562,7 @@ Error DWARFDebugLine::LineTable::parse( while (*OffsetPtr < EndOffset) { if (OS) - *OS << format("0x%08.08" PRIx32 ": ", *OffsetPtr); + *OS << format("0x%08.08" PRIx64 ": ", *OffsetPtr); uint8_t Opcode = DebugLineData.getU8(OffsetPtr); @@ -540,7 +573,7 @@ Error DWARFDebugLine::LineTable::parse( // Extended Opcodes always start with a zero opcode followed by // a uleb128 length so you can skip ones you don't know about uint64_t Len = DebugLineData.getULEB128(OffsetPtr); - uint32_t ExtOffset = *OffsetPtr; + uint64_t ExtOffset = *OffsetPtr; // Tolerate zero-length; assume length is correct and soldier on. if (Len == 0) { @@ -585,7 +618,7 @@ Error DWARFDebugLine::LineTable::parse( DebugLineData.setAddressSize(Len - 1); else if (DebugLineData.getAddressSize() != Len - 1) { return createStringError(errc::invalid_argument, - "mismatching address size at offset 0x%8.8" PRIx32 + "mismatching address size at offset 0x%8.8" PRIx64 " expected 0x%2.2" PRIx8 " found 0x%2.2" PRIx64, ExtOffset, DebugLineData.getAddressSize(), Len - 1); @@ -652,8 +685,8 @@ Error DWARFDebugLine::LineTable::parse( // Otherwise we have an unparseable line-number program. if (*OffsetPtr - ExtOffset != Len) return createStringError(errc::illegal_byte_sequence, - "unexpected line op length at offset 0x%8.8" PRIx32 - " expected 0x%2.2" PRIx64 " found 0x%2.2" PRIx32, + "unexpected line op length at offset 0x%8.8" PRIx64 + " expected 0x%2.2" PRIx64 " found 0x%2.2" PRIx64, ExtOffset, Len, *OffsetPtr - ExtOffset); } else if (Opcode < Prologue.OpcodeBase) { if (OS) @@ -1007,10 +1040,9 @@ static bool isPathAbsoluteOnWindowsOrPosix(const Twine &Path) { sys::path::is_absolute(Path, sys::path::Style::windows); } -bool DWARFDebugLine::Prologue::getFileNameByIndex(uint64_t FileIndex, - StringRef CompDir, - FileLineInfoKind Kind, - std::string &Result) const { +bool DWARFDebugLine::Prologue::getFileNameByIndex( + uint64_t FileIndex, StringRef CompDir, FileLineInfoKind Kind, + std::string &Result, sys::path::Style Style) const { if (Kind == FileLineInfoKind::None || !hasFileAtIndex(FileIndex)) return false; const FileNameEntry &Entry = getFileNameEntry(FileIndex); @@ -1036,11 +1068,11 @@ bool DWARFDebugLine::Prologue::getFileNameByIndex(uint64_t FileIndex, // We know that FileName is not absolute, the only way to have an // absolute path at this point would be if IncludeDir is absolute. if (!CompDir.empty() && !isPathAbsoluteOnWindowsOrPosix(IncludeDir)) - sys::path::append(FilePath, CompDir); + sys::path::append(FilePath, Style, CompDir); } // sys::path::append skips empty strings. 
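// ---[ Illustrative sketch, not part of the upstream patch ]------------------
// getFileNameByIndex() gains a sys::path::Style parameter, and the appends
// above pass it through so paths from, say, a Windows binary are joined with
// the right separators on any host. A simplified assumed example (the real
// code also checks whether the components are already absolute):
#include "llvm/ADT/SmallString.h"
#include "llvm/Support/Path.h"
#include <string>

static std::string joinSourcePath(llvm::StringRef CompDir,
                                  llvm::StringRef IncludeDir,
                                  llvm::StringRef FileName,
                                  llvm::sys::path::Style Style) {
  llvm::SmallString<128> FilePath = CompDir;
  // sys::path::append skips empty components, matching the code above.
  llvm::sys::path::append(FilePath, Style, IncludeDir, FileName);
  return FilePath.str().str();
}
// e.g. joinSourcePath("C:\\src", "inc", "a.h", llvm::sys::path::Style::windows)
// ---[ end sketch ]------------------------------------------------------------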
- sys::path::append(FilePath, IncludeDir, FileName); + sys::path::append(FilePath, Style, IncludeDir, FileName); Result = FilePath.str(); return true; } @@ -1092,7 +1124,8 @@ DWARFDebugLine::SectionParser::SectionParser(DWARFDataExtractor &Data, } bool DWARFDebugLine::Prologue::totalLengthIsValid() const { - return TotalLength == 0xffffffff || TotalLength < 0xfffffff0; + return TotalLength == dwarf::DW_LENGTH_DWARF64 || + TotalLength < dwarf::DW_LENGTH_lo_reserved; } DWARFDebugLine::LineTable DWARFDebugLine::SectionParser::parseNext( @@ -1101,7 +1134,7 @@ DWARFDebugLine::LineTable DWARFDebugLine::SectionParser::parseNext( assert(DebugLineData.isValidOffset(Offset) && "parsing should have terminated"); DWARFUnit *U = prepareToParse(Offset); - uint32_t OldOffset = Offset; + uint64_t OldOffset = Offset; LineTable LT; if (Error Err = LT.parse(DebugLineData, &Offset, Context, U, RecoverableErrorCallback, OS)) @@ -1115,14 +1148,14 @@ void DWARFDebugLine::SectionParser::skip( assert(DebugLineData.isValidOffset(Offset) && "parsing should have terminated"); DWARFUnit *U = prepareToParse(Offset); - uint32_t OldOffset = Offset; + uint64_t OldOffset = Offset; LineTable LT; if (Error Err = LT.Prologue.parse(DebugLineData, &Offset, Context, U)) ErrorCallback(std::move(Err)); moveToNextTable(OldOffset, LT.Prologue); } -DWARFUnit *DWARFDebugLine::SectionParser::prepareToParse(uint32_t Offset) { +DWARFUnit *DWARFDebugLine::SectionParser::prepareToParse(uint64_t Offset) { DWARFUnit *U = nullptr; auto It = LineToUnit.find(Offset); if (It != LineToUnit.end()) @@ -1131,7 +1164,7 @@ DWARFUnit *DWARFDebugLine::SectionParser::prepareToParse(uint32_t Offset) { return U; } -void DWARFDebugLine::SectionParser::moveToNextTable(uint32_t OldOffset, +void DWARFDebugLine::SectionParser::moveToNextTable(uint64_t OldOffset, const Prologue &P) { // If the length field is not valid, we don't know where the next table is, so // cannot continue to parse. Mark the parser as done, and leave the Offset diff --git a/lib/DebugInfo/DWARF/DWARFDebugLoc.cpp b/lib/DebugInfo/DWARF/DWARFDebugLoc.cpp index 6d8f4bee77c4..4f7b01130a47 100644 --- a/lib/DebugInfo/DWARF/DWARFDebugLoc.cpp +++ b/lib/DebugInfo/DWARF/DWARFDebugLoc.cpp @@ -28,19 +28,18 @@ using namespace llvm; // expression that LLVM doesn't produce. Guessing the wrong version means we // won't be able to pretty print expressions in DWARF2 binaries produced by // non-LLVM tools. 
-static void dumpExpression(raw_ostream &OS, ArrayRef Data, +static void dumpExpression(raw_ostream &OS, ArrayRef Data, bool IsLittleEndian, unsigned AddressSize, const MCRegisterInfo *MRI, DWARFUnit *U) { - DWARFDataExtractor Extractor(StringRef(Data.data(), Data.size()), - IsLittleEndian, AddressSize); + DWARFDataExtractor Extractor(toStringRef(Data), IsLittleEndian, AddressSize); DWARFExpression(Extractor, dwarf::DWARF_VERSION, AddressSize).print(OS, MRI, U); } -void DWARFDebugLoc::LocationList::dump(raw_ostream &OS, bool IsLittleEndian, +void DWARFDebugLoc::LocationList::dump(raw_ostream &OS, uint64_t BaseAddress, + bool IsLittleEndian, unsigned AddressSize, - const MCRegisterInfo *MRI, - DWARFUnit *U, - uint64_t BaseAddress, + const MCRegisterInfo *MRI, DWARFUnit *U, + DIDumpOptions DumpOpts, unsigned Indent) const { for (const Entry &E : Entries) { OS << '\n'; @@ -64,12 +63,12 @@ DWARFDebugLoc::getLocationListAtOffset(uint64_t Offset) const { return nullptr; } -void DWARFDebugLoc::dump(raw_ostream &OS, const MCRegisterInfo *MRI, +void DWARFDebugLoc::dump(raw_ostream &OS, const MCRegisterInfo *MRI, DIDumpOptions DumpOpts, Optional Offset) const { auto DumpLocationList = [&](const LocationList &L) { - OS << format("0x%8.8x: ", L.Offset); - L.dump(OS, IsLittleEndian, AddressSize, MRI, nullptr, 0, 12); - OS << "\n\n"; + OS << format("0x%8.8" PRIx64 ": ", L.Offset); + L.dump(OS, 0, IsLittleEndian, AddressSize, MRI, nullptr, DumpOpts, 12); + OS << "\n"; }; if (Offset) { @@ -80,50 +79,47 @@ void DWARFDebugLoc::dump(raw_ostream &OS, const MCRegisterInfo *MRI, for (const LocationList &L : Locations) { DumpLocationList(L); + if (&L != &Locations.back()) + OS << '\n'; } } -Optional -DWARFDebugLoc::parseOneLocationList(DWARFDataExtractor Data, unsigned *Offset) { +Expected +DWARFDebugLoc::parseOneLocationList(const DWARFDataExtractor &Data, + uint64_t *Offset) { LocationList LL; LL.Offset = *Offset; + AddressSize = Data.getAddressSize(); + DataExtractor::Cursor C(*Offset); // 2.6.2 Location Lists // A location list entry consists of: while (true) { Entry E; - if (!Data.isValidOffsetForDataOfSize(*Offset, 2 * Data.getAddressSize())) { - WithColor::error() << "location list overflows the debug_loc section.\n"; - return None; - } // 1. A beginning address offset. ... - E.Begin = Data.getRelocatedAddress(Offset); + E.Begin = Data.getRelocatedAddress(C); // 2. An ending address offset. ... - E.End = Data.getRelocatedAddress(Offset); + E.End = Data.getRelocatedAddress(C); + + if (Error Err = C.takeError()) + return std::move(Err); // The end of any given location list is marked by an end of list entry, // which consists of a 0 for the beginning address offset and a 0 for the // ending address offset. - if (E.Begin == 0 && E.End == 0) + if (E.Begin == 0 && E.End == 0) { + *Offset = C.tell(); return LL; - - if (!Data.isValidOffsetForDataOfSize(*Offset, 2)) { - WithColor::error() << "location list overflows the debug_loc section.\n"; - return None; } - unsigned Bytes = Data.getU16(Offset); - if (!Data.isValidOffsetForDataOfSize(*Offset, Bytes)) { - WithColor::error() << "location list overflows the debug_loc section.\n"; - return None; + if (E.Begin != (AddressSize == 4 ? -1U : -1ULL)) { + unsigned Bytes = Data.getU16(C); + // A single location description describing the location of the object... + Data.getU8(C, E.Loc, Bytes); } - // A single location description describing the location of the object... 
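// ---[ Illustrative sketch, not part of the upstream patch ]------------------
// The location-list parsers above move to DataExtractor::Cursor: reads are
// chained without per-read bounds checks, the accumulated error is taken once
// via Cursor::takeError(), and the offset is committed with tell() only on
// success. A minimal cursor-based reader over assumed length-prefixed data.
#include "llvm/Support/DataExtractor.h"
#include "llvm/Support/Error.h"
#include <cstdint>
#include <vector>

static llvm::Expected<std::vector<uint64_t>>
readULEBList(const llvm::DataExtractor &Data, uint64_t *Offset) {
  std::vector<uint64_t> Values;
  llvm::DataExtractor::Cursor C(*Offset);
  uint16_t Count = Data.getU16(C);       // reads become no-ops once C has erred
  for (uint16_t I = 0; I != Count; ++I)
    Values.push_back(Data.getULEB128(C));
  if (llvm::Error Err = C.takeError())   // one check covers the whole run
    return std::move(Err);
  *Offset = C.tell();                    // commit the new offset on success
  return Values;
}
// ---[ end sketch ]------------------------------------------------------------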
- StringRef str = Data.getData().substr(*Offset, Bytes); - *Offset += Bytes; - E.Loc.reserve(str.size()); - llvm::copy(str, std::back_inserter(E.Loc)); + LL.Entries.push_back(std::move(E)); } } @@ -132,81 +128,89 @@ void DWARFDebugLoc::parse(const DWARFDataExtractor &data) { IsLittleEndian = data.isLittleEndian(); AddressSize = data.getAddressSize(); - uint32_t Offset = 0; - while (data.isValidOffset(Offset + data.getAddressSize() - 1)) { + uint64_t Offset = 0; + while (Offset < data.getData().size()) { if (auto LL = parseOneLocationList(data, &Offset)) Locations.push_back(std::move(*LL)); - else + else { + logAllUnhandledErrors(LL.takeError(), WithColor::error()); break; + } } - if (data.isValidOffset(Offset)) - WithColor::error() << "failed to consume entire .debug_loc section\n"; } -Optional -DWARFDebugLoclists::parseOneLocationList(DataExtractor Data, unsigned *Offset, - unsigned Version) { +Expected +DWARFDebugLoclists::parseOneLocationList(const DataExtractor &Data, + uint64_t *Offset, unsigned Version) { LocationList LL; LL.Offset = *Offset; + DataExtractor::Cursor C(*Offset); // dwarf::DW_LLE_end_of_list_entry is 0 and indicates the end of the list. - while (auto Kind = - static_cast(Data.getU8(Offset))) { - + while (auto Kind = Data.getU8(C)) { Entry E; E.Kind = Kind; + E.Offset = C.tell() - 1; switch (Kind) { + case dwarf::DW_LLE_base_addressx: + E.Value0 = Data.getULEB128(C); + break; case dwarf::DW_LLE_startx_length: - E.Value0 = Data.getULEB128(Offset); + E.Value0 = Data.getULEB128(C); // Pre-DWARF 5 has different interpretation of the length field. We have // to support both pre- and standartized styles for the compatibility. if (Version < 5) - E.Value1 = Data.getU32(Offset); + E.Value1 = Data.getU32(C); else - E.Value1 = Data.getULEB128(Offset); + E.Value1 = Data.getULEB128(C); break; case dwarf::DW_LLE_start_length: - E.Value0 = Data.getAddress(Offset); - E.Value1 = Data.getULEB128(Offset); + E.Value0 = Data.getAddress(C); + E.Value1 = Data.getULEB128(C); break; case dwarf::DW_LLE_offset_pair: - E.Value0 = Data.getULEB128(Offset); - E.Value1 = Data.getULEB128(Offset); + E.Value0 = Data.getULEB128(C); + E.Value1 = Data.getULEB128(C); break; case dwarf::DW_LLE_base_address: - E.Value0 = Data.getAddress(Offset); + E.Value0 = Data.getAddress(C); break; default: - WithColor::error() << "dumping support for LLE of kind " << (int)Kind - << " not implemented\n"; - return None; + cantFail(C.takeError()); + return createStringError(errc::illegal_byte_sequence, + "LLE of kind %x not supported", (int)Kind); } - if (Kind != dwarf::DW_LLE_base_address) { - unsigned Bytes = - Version >= 5 ? Data.getULEB128(Offset) : Data.getU16(Offset); + if (Kind != dwarf::DW_LLE_base_address && + Kind != dwarf::DW_LLE_base_addressx) { + unsigned Bytes = Version >= 5 ? Data.getULEB128(C) : Data.getU16(C); // A single location description describing the location of the object... 
- StringRef str = Data.getData().substr(*Offset, Bytes); - *Offset += Bytes; - E.Loc.resize(str.size()); - llvm::copy(str, E.Loc.begin()); + Data.getU8(C, E.Loc, Bytes); } LL.Entries.push_back(std::move(E)); } + if (Error Err = C.takeError()) + return std::move(Err); + Entry E; + E.Kind = dwarf::DW_LLE_end_of_list; + E.Offset = C.tell() - 1; + LL.Entries.push_back(E); + *Offset = C.tell(); return LL; } -void DWARFDebugLoclists::parse(DataExtractor data, unsigned Version) { +void DWARFDebugLoclists::parse(DataExtractor data, uint64_t Offset, uint64_t EndOffset, uint16_t Version) { IsLittleEndian = data.isLittleEndian(); AddressSize = data.getAddressSize(); - uint32_t Offset = 0; - while (data.isValidOffset(Offset)) { + while (Offset < EndOffset) { if (auto LL = parseOneLocationList(data, &Offset, Version)) Locations.push_back(std::move(*LL)); - else + else { + logAllUnhandledErrors(LL.takeError(), WithColor::error()); return; + } } } @@ -219,51 +223,106 @@ DWARFDebugLoclists::getLocationListAtOffset(uint64_t Offset) const { return nullptr; } -void DWARFDebugLoclists::LocationList::dump(raw_ostream &OS, uint64_t BaseAddr, - bool IsLittleEndian, - unsigned AddressSize, - const MCRegisterInfo *MRI, - DWARFUnit *U, - unsigned Indent) const { - for (const Entry &E : Entries) { - switch (E.Kind) { +void DWARFDebugLoclists::Entry::dump(raw_ostream &OS, uint64_t &BaseAddr, + bool IsLittleEndian, unsigned AddressSize, + const MCRegisterInfo *MRI, DWARFUnit *U, + DIDumpOptions DumpOpts, unsigned Indent, + size_t MaxEncodingStringLength) const { + if (DumpOpts.Verbose) { + OS << "\n"; + OS.indent(Indent); + auto EncodingString = dwarf::LocListEncodingString(Kind); + // Unsupported encodings should have been reported during parsing. + assert(!EncodingString.empty() && "Unknown loclist entry encoding"); + OS << format("%s%*c", EncodingString.data(), + MaxEncodingStringLength - EncodingString.size() + 1, '('); + switch (Kind) { case dwarf::DW_LLE_startx_length: - OS << '\n'; - OS.indent(Indent); - OS << "Addr idx " << E.Value0 << " (w/ length " << E.Value1 << "): "; - break; case dwarf::DW_LLE_start_length: - OS << '\n'; - OS.indent(Indent); - OS << format("[0x%*.*" PRIx64 ", 0x%*.*" PRIx64 "): ", AddressSize * 2, - AddressSize * 2, E.Value0, AddressSize * 2, AddressSize * 2, - E.Value0 + E.Value1); - break; case dwarf::DW_LLE_offset_pair: - OS << '\n'; - OS.indent(Indent); - OS << format("[0x%*.*" PRIx64 ", 0x%*.*" PRIx64 "): ", AddressSize * 2, - AddressSize * 2, BaseAddr + E.Value0, AddressSize * 2, - AddressSize * 2, BaseAddr + E.Value1); + OS << format("0x%*.*" PRIx64 ", 0x%*.*" PRIx64, AddressSize * 2, + AddressSize * 2, Value0, AddressSize * 2, AddressSize * 2, + Value1); break; + case dwarf::DW_LLE_base_addressx: case dwarf::DW_LLE_base_address: - BaseAddr = E.Value0; + OS << format("0x%*.*" PRIx64, AddressSize * 2, AddressSize * 2, + Value0); + break; + case dwarf::DW_LLE_end_of_list: break; - default: - llvm_unreachable("unreachable locations list kind"); } - - dumpExpression(OS, E.Loc, IsLittleEndian, AddressSize, MRI, U); + OS << ')'; } + auto PrintPrefix = [&] { + OS << "\n"; + OS.indent(Indent); + if (DumpOpts.Verbose) + OS << format("%*s", MaxEncodingStringLength, (const char *)"=> "); + }; + switch (Kind) { + case dwarf::DW_LLE_startx_length: + PrintPrefix(); + OS << "Addr idx " << Value0 << " (w/ length " << Value1 << "): "; + break; + case dwarf::DW_LLE_start_length: + PrintPrefix(); + DWARFAddressRange(Value0, Value0 + Value1) + .dump(OS, AddressSize, DumpOpts); + OS << ": "; + break; 
+ case dwarf::DW_LLE_offset_pair: + PrintPrefix(); + DWARFAddressRange(BaseAddr + Value0, BaseAddr + Value1) + .dump(OS, AddressSize, DumpOpts); + OS << ": "; + break; + case dwarf::DW_LLE_base_addressx: + if (!DumpOpts.Verbose) + return; + break; + case dwarf::DW_LLE_end_of_list: + if (!DumpOpts.Verbose) + return; + break; + case dwarf::DW_LLE_base_address: + BaseAddr = Value0; + if (!DumpOpts.Verbose) + return; + break; + default: + llvm_unreachable("unreachable locations list kind"); + } + + dumpExpression(OS, Loc, IsLittleEndian, AddressSize, MRI, U); +} +void DWARFDebugLoclists::LocationList::dump(raw_ostream &OS, uint64_t BaseAddr, + bool IsLittleEndian, + unsigned AddressSize, + const MCRegisterInfo *MRI, + DWARFUnit *U, + DIDumpOptions DumpOpts, + unsigned Indent) const { + size_t MaxEncodingStringLength = 0; + if (DumpOpts.Verbose) + for (const auto &Entry : Entries) + MaxEncodingStringLength = + std::max(MaxEncodingStringLength, + dwarf::LocListEncodingString(Entry.Kind).size()); + + for (const Entry &E : Entries) + E.dump(OS, BaseAddr, IsLittleEndian, AddressSize, MRI, U, DumpOpts, Indent, + MaxEncodingStringLength); } void DWARFDebugLoclists::dump(raw_ostream &OS, uint64_t BaseAddr, - const MCRegisterInfo *MRI, + const MCRegisterInfo *MRI, DIDumpOptions DumpOpts, Optional Offset) const { auto DumpLocationList = [&](const LocationList &L) { - OS << format("0x%8.8x: ", L.Offset); - L.dump(OS, BaseAddr, IsLittleEndian, AddressSize, MRI, nullptr, /*Indent=*/12); - OS << "\n\n"; + OS << format("0x%8.8" PRIx64 ": ", L.Offset); + L.dump(OS, BaseAddr, IsLittleEndian, AddressSize, MRI, nullptr, DumpOpts, + /*Indent=*/12); + OS << "\n"; }; if (Offset) { @@ -274,5 +333,7 @@ void DWARFDebugLoclists::dump(raw_ostream &OS, uint64_t BaseAddr, for (const LocationList &L : Locations) { DumpLocationList(L); + if (&L != &Locations.back()) + OS << '\n'; } } diff --git a/lib/DebugInfo/DWARF/DWARFDebugMacro.cpp b/lib/DebugInfo/DWARF/DWARFDebugMacro.cpp index 3317a778cc70..9a0e770aed3d 100644 --- a/lib/DebugInfo/DWARF/DWARFDebugMacro.cpp +++ b/lib/DebugInfo/DWARF/DWARFDebugMacro.cpp @@ -53,7 +53,7 @@ void DWARFDebugMacro::dump(raw_ostream &OS) const { } void DWARFDebugMacro::parse(DataExtractor data) { - uint32_t Offset = 0; + uint64_t Offset = 0; while (data.isValidOffset(Offset)) { // A macro list entry consists of: Entry E; diff --git a/lib/DebugInfo/DWARF/DWARFDebugPubTable.cpp b/lib/DebugInfo/DWARF/DWARFDebugPubTable.cpp index 963ec64f5e91..ab71b239cb67 100644 --- a/lib/DebugInfo/DWARF/DWARFDebugPubTable.cpp +++ b/lib/DebugInfo/DWARF/DWARFDebugPubTable.cpp @@ -23,7 +23,7 @@ DWARFDebugPubTable::DWARFDebugPubTable(const DWARFObject &Obj, bool LittleEndian, bool GnuStyle) : GnuStyle(GnuStyle) { DWARFDataExtractor PubNames(Obj, Sec, LittleEndian, 0); - uint32_t Offset = 0; + uint64_t Offset = 0; while (PubNames.isValidOffset(Offset)) { Sets.push_back({}); Set &SetData = Sets.back(); @@ -49,13 +49,13 @@ void DWARFDebugPubTable::dump(raw_ostream &OS) const { for (const Set &S : Sets) { OS << "length = " << format("0x%08x", S.Length); OS << " version = " << format("0x%04x", S.Version); - OS << " unit_offset = " << format("0x%08x", S.Offset); + OS << " unit_offset = " << format("0x%08" PRIx64, S.Offset); OS << " unit_size = " << format("0x%08x", S.Size) << '\n'; OS << (GnuStyle ? 
"Offset Linkage Kind Name\n" : "Offset Name\n"); for (const Entry &E : S.Entries) { - OS << format("0x%8.8x ", E.SecOffset); + OS << format("0x%8.8" PRIx64 " ", E.SecOffset); if (GnuStyle) { StringRef EntryLinkage = GDBIndexEntryLinkageString(E.Descriptor.Linkage); diff --git a/lib/DebugInfo/DWARF/DWARFDebugRangeList.cpp b/lib/DebugInfo/DWARF/DWARFDebugRangeList.cpp index d8df81a0aa0b..1a1857d8cd79 100644 --- a/lib/DebugInfo/DWARF/DWARFDebugRangeList.cpp +++ b/lib/DebugInfo/DWARF/DWARFDebugRangeList.cpp @@ -17,17 +17,17 @@ using namespace llvm; void DWARFDebugRangeList::clear() { - Offset = -1U; + Offset = -1ULL; AddressSize = 0; Entries.clear(); } Error DWARFDebugRangeList::extract(const DWARFDataExtractor &data, - uint32_t *offset_ptr) { + uint64_t *offset_ptr) { clear(); if (!data.isValidOffset(*offset_ptr)) return createStringError(errc::invalid_argument, - "invalid range list offset 0x%" PRIx32, *offset_ptr); + "invalid range list offset 0x%" PRIx64, *offset_ptr); AddressSize = data.getAddressSize(); if (AddressSize != 4 && AddressSize != 8) @@ -38,7 +38,7 @@ Error DWARFDebugRangeList::extract(const DWARFDataExtractor &data, RangeListEntry Entry; Entry.SectionIndex = -1ULL; - uint32_t prev_offset = *offset_ptr; + uint64_t prev_offset = *offset_ptr; Entry.StartAddress = data.getRelocatedAddress(offset_ptr); Entry.EndAddress = data.getRelocatedAddress(offset_ptr, &Entry.SectionIndex); @@ -47,7 +47,7 @@ Error DWARFDebugRangeList::extract(const DWARFDataExtractor &data, if (*offset_ptr != prev_offset + 2 * AddressSize) { clear(); return createStringError(errc::invalid_argument, - "invalid range list entry at offset 0x%" PRIx32, + "invalid range list entry at offset 0x%" PRIx64, prev_offset); } if (Entry.isEndOfListEntry()) @@ -59,12 +59,12 @@ Error DWARFDebugRangeList::extract(const DWARFDataExtractor &data, void DWARFDebugRangeList::dump(raw_ostream &OS) const { for (const RangeListEntry &RLE : Entries) { - const char *format_str = (AddressSize == 4 - ? "%08x %08" PRIx64 " %08" PRIx64 "\n" - : "%08x %016" PRIx64 " %016" PRIx64 "\n"); + const char *format_str = + (AddressSize == 4 ? "%08" PRIx64 " %08" PRIx64 " %08" PRIx64 "\n" + : "%08" PRIx64 " %016" PRIx64 " %016" PRIx64 "\n"); OS << format(format_str, Offset, RLE.StartAddress, RLE.EndAddress); } - OS << format("%08x \n", Offset); + OS << format("%08" PRIx64 " \n", Offset); } DWARFAddressRangesVector DWARFDebugRangeList::getAbsoluteRanges( diff --git a/lib/DebugInfo/DWARF/DWARFDebugRnglists.cpp b/lib/DebugInfo/DWARF/DWARFDebugRnglists.cpp index 5ac3326f6681..f6785b89e86d 100644 --- a/lib/DebugInfo/DWARF/DWARFDebugRnglists.cpp +++ b/lib/DebugInfo/DWARF/DWARFDebugRnglists.cpp @@ -16,8 +16,8 @@ using namespace llvm; -Error RangeListEntry::extract(DWARFDataExtractor Data, uint32_t End, - uint32_t *OffsetPtr) { +Error RangeListEntry::extract(DWARFDataExtractor Data, uint64_t End, + uint64_t *OffsetPtr) { Offset = *OffsetPtr; SectionIndex = -1ULL; // The caller should guarantee that we have at least 1 byte available, so @@ -32,41 +32,41 @@ Error RangeListEntry::extract(DWARFDataExtractor Data, uint32_t End, break; // TODO: Support other encodings. 
case dwarf::DW_RLE_base_addressx: { - uint32_t PreviousOffset = *OffsetPtr - 1; + uint64_t PreviousOffset = *OffsetPtr - 1; Value0 = Data.getULEB128(OffsetPtr); if (End < *OffsetPtr) return createStringError( errc::invalid_argument, "read past end of table when reading " - "DW_RLE_base_addressx encoding at offset 0x%" PRIx32, + "DW_RLE_base_addressx encoding at offset 0x%" PRIx64, PreviousOffset); break; } case dwarf::DW_RLE_startx_endx: return createStringError(errc::not_supported, "unsupported rnglists encoding DW_RLE_startx_endx at " - "offset 0x%" PRIx32, + "offset 0x%" PRIx64, *OffsetPtr - 1); case dwarf::DW_RLE_startx_length: { - uint32_t PreviousOffset = *OffsetPtr - 1; + uint64_t PreviousOffset = *OffsetPtr - 1; Value0 = Data.getULEB128(OffsetPtr); Value1 = Data.getULEB128(OffsetPtr); if (End < *OffsetPtr) return createStringError( errc::invalid_argument, "read past end of table when reading " - "DW_RLE_startx_length encoding at offset 0x%" PRIx32, + "DW_RLE_startx_length encoding at offset 0x%" PRIx64, PreviousOffset); break; } case dwarf::DW_RLE_offset_pair: { - uint32_t PreviousOffset = *OffsetPtr - 1; + uint64_t PreviousOffset = *OffsetPtr - 1; Value0 = Data.getULEB128(OffsetPtr); Value1 = Data.getULEB128(OffsetPtr); if (End < *OffsetPtr) return createStringError(errc::invalid_argument, "read past end of table when reading " - "DW_RLE_offset_pair encoding at offset 0x%" PRIx32, + "DW_RLE_offset_pair encoding at offset 0x%" PRIx64, PreviousOffset); break; } @@ -74,7 +74,7 @@ Error RangeListEntry::extract(DWARFDataExtractor Data, uint32_t End, if ((End - *OffsetPtr) < Data.getAddressSize()) return createStringError(errc::invalid_argument, "insufficient space remaining in table for " - "DW_RLE_base_address encoding at offset 0x%" PRIx32, + "DW_RLE_base_address encoding at offset 0x%" PRIx64, *OffsetPtr - 1); Value0 = Data.getRelocatedAddress(OffsetPtr, &SectionIndex); break; @@ -84,27 +84,27 @@ Error RangeListEntry::extract(DWARFDataExtractor Data, uint32_t End, return createStringError(errc::invalid_argument, "insufficient space remaining in table for " "DW_RLE_start_end encoding " - "at offset 0x%" PRIx32, + "at offset 0x%" PRIx64, *OffsetPtr - 1); Value0 = Data.getRelocatedAddress(OffsetPtr, &SectionIndex); Value1 = Data.getRelocatedAddress(OffsetPtr); break; } case dwarf::DW_RLE_start_length: { - uint32_t PreviousOffset = *OffsetPtr - 1; + uint64_t PreviousOffset = *OffsetPtr - 1; Value0 = Data.getRelocatedAddress(OffsetPtr, &SectionIndex); Value1 = Data.getULEB128(OffsetPtr); if (End < *OffsetPtr) return createStringError(errc::invalid_argument, "read past end of table when reading " - "DW_RLE_start_length encoding at offset 0x%" PRIx32, + "DW_RLE_start_length encoding at offset 0x%" PRIx64, PreviousOffset); break; } default: return createStringError(errc::not_supported, "unknown rnglists encoding 0x%" PRIx32 - " at offset 0x%" PRIx32, + " at offset 0x%" PRIx64, uint32_t(Encoding), *OffsetPtr - 1); } @@ -187,7 +187,7 @@ void RangeListEntry::dump( if (DumpOpts.Verbose) { // Print the section offset in verbose mode. - OS << format("0x%8.8" PRIx32 ":", Offset); + OS << format("0x%8.8" PRIx64 ":", Offset); auto EncodingString = dwarf::RangeListEncodingString(EntryKind); // Unsupported encodings should have been reported during parsing. 
assert(!EncodingString.empty() && "Unknown range entry encoding"); diff --git a/lib/DebugInfo/DWARF/DWARFDie.cpp b/lib/DebugInfo/DWARF/DWARFDie.cpp index d638dc4239f4..cec194e8b6b3 100644 --- a/lib/DebugInfo/DWARF/DWARFDie.cpp +++ b/lib/DebugInfo/DWARF/DWARFDie.cpp @@ -21,6 +21,7 @@ #include "llvm/Object/ObjectFile.h" #include "llvm/Support/DataExtractor.h" #include "llvm/Support/Format.h" +#include "llvm/Support/FormatAdapters.h" #include "llvm/Support/FormatVariadic.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/WithColor.h" @@ -91,21 +92,29 @@ static void dumpLocation(raw_ostream &OS, DWARFFormValue &FormValue, } FormValue.dump(OS, DumpOpts); + const auto &DumpLL = [&](auto ExpectedLL) { + if (ExpectedLL) { + uint64_t BaseAddr = 0; + if (Optional BA = U->getBaseAddress()) + BaseAddr = BA->Address; + auto LLDumpOpts = DumpOpts; + LLDumpOpts.Verbose = false; + ExpectedLL->dump(OS, BaseAddr, Ctx.isLittleEndian(), Obj.getAddressSize(), + MRI, U, LLDumpOpts, Indent); + } else { + OS << '\n'; + OS.indent(Indent); + OS << formatv("error extracting location list: {0}", + fmt_consume(ExpectedLL.takeError())); + } + }; if (FormValue.isFormClass(DWARFFormValue::FC_SectionOffset)) { - uint32_t Offset = *FormValue.getAsSectionOffset(); + uint64_t Offset = *FormValue.getAsSectionOffset(); if (!U->isDWOUnit() && !U->getLocSection()->Data.empty()) { DWARFDebugLoc DebugLoc; DWARFDataExtractor Data(Obj, *U->getLocSection(), Ctx.isLittleEndian(), Obj.getAddressSize()); - auto LL = DebugLoc.parseOneLocationList(Data, &Offset); - if (LL) { - uint64_t BaseAddr = 0; - if (Optional BA = U->getBaseAddress()) - BaseAddr = BA->Address; - LL->dump(OS, Ctx.isLittleEndian(), Obj.getAddressSize(), MRI, U, - BaseAddr, Indent); - } else - OS << "error extracting location list."; + DumpLL(DebugLoc.parseOneLocationList(Data, &Offset)); return; } @@ -121,18 +130,8 @@ static void dumpLocation(raw_ostream &OS, DWARFFormValue &FormValue, // Modern locations list (.debug_loclists) are used starting from v5. // Ideally we should take the version from the .debug_loclists section // header, but using CU's version for simplicity. - auto LL = DWARFDebugLoclists::parseOneLocationList( - Data, &Offset, UseLocLists ? U->getVersion() : 4); - - uint64_t BaseAddr = 0; - if (Optional BA = U->getBaseAddress()) - BaseAddr = BA->Address; - - if (LL) - LL->dump(OS, BaseAddr, Ctx.isLittleEndian(), Obj.getAddressSize(), MRI, - U, Indent); - else - OS << "error extracting location list."; + DumpLL(DWARFDebugLoclists::parseOneLocationList( + Data, &Offset, UseLocLists ? 
U->getVersion() : 4)); } } } @@ -264,7 +263,7 @@ static void dumpTypeName(raw_ostream &OS, const DWARFDie &D) { } static void dumpAttribute(raw_ostream &OS, const DWARFDie &Die, - uint32_t *OffsetPtr, dwarf::Attribute Attr, + uint64_t *OffsetPtr, dwarf::Attribute Attr, dwarf::Form Form, unsigned Indent, DIDumpOptions DumpOpts) { if (!Die.isValid()) @@ -568,8 +567,8 @@ void DWARFDie::dump(raw_ostream &OS, unsigned Indent, if (!isValid()) return; DWARFDataExtractor debug_info_data = U->getDebugInfoExtractor(); - const uint32_t Offset = getOffset(); - uint32_t offset = Offset; + const uint64_t Offset = getOffset(); + uint64_t offset = Offset; if (DumpOpts.ShowParents) { DIDumpOptions ParentDumpOpts = DumpOpts; ParentDumpOpts.ShowParents = false; @@ -581,7 +580,7 @@ void DWARFDie::dump(raw_ostream &OS, unsigned Indent, uint32_t abbrCode = debug_info_data.getULEB128(&offset); if (DumpOpts.ShowAddresses) WithColor(OS, HighlightColor::Address).get() - << format("\n0x%8.8x: ", Offset); + << format("\n0x%8.8" PRIx64 ": ", Offset); if (abbrCode) { auto AbbrevDecl = getAbbreviationDeclarationPtr(); @@ -685,7 +684,7 @@ void DWARFDie::attribute_iterator::updateForIndex( AttrValue.Attr = AbbrDecl.getAttrByIndex(Index); // Add the previous byte size of any previous attribute value. AttrValue.Offset += AttrValue.ByteSize; - uint32_t ParseOffset = AttrValue.Offset; + uint64_t ParseOffset = AttrValue.Offset; auto U = Die.getDwarfUnit(); assert(U && "Die must have valid DWARF unit"); AttrValue.Value = DWARFFormValue::createFromUnit( @@ -733,6 +732,7 @@ bool DWARFAttribute::mayHaveLocationDescription(dwarf::Attribute Attr) { case DW_AT_call_data_value: // Extensions. case DW_AT_GNU_call_site_value: + case DW_AT_GNU_call_site_target: return true; default: return false; diff --git a/lib/DebugInfo/DWARF/DWARFExpression.cpp b/lib/DebugInfo/DWARF/DWARFExpression.cpp index 470d4b5364b4..5009b1b7b412 100644 --- a/lib/DebugInfo/DWARF/DWARFExpression.cpp +++ b/lib/DebugInfo/DWARF/DWARFExpression.cpp @@ -119,7 +119,7 @@ static uint8_t getRefAddrSize(uint8_t AddrSize, uint16_t Version) { } bool DWARFExpression::Operation::extract(DataExtractor Data, uint16_t Version, - uint8_t AddressSize, uint32_t Offset) { + uint8_t AddressSize, uint64_t Offset) { Opcode = Data.getU8(&Offset); Desc = getOpDesc(Opcode); @@ -218,9 +218,8 @@ static bool prettyPrintRegisterOp(raw_ostream &OS, uint8_t Opcode, else DwarfRegNum = Opcode - DW_OP_reg0; - int LLVMRegNum = MRI->getLLVMRegNum(DwarfRegNum, isEH); - if (LLVMRegNum >= 0) { - if (const char *RegName = MRI->getName(LLVMRegNum)) { + if (Optional LLVMRegNum = MRI->getLLVMRegNum(DwarfRegNum, isEH)) { + if (const char *RegName = MRI->getName(*LLVMRegNum)) { if ((Opcode >= DW_OP_breg0 && Opcode <= DW_OP_breg31) || Opcode == DW_OP_bregx) OS << format(" %s%+" PRId64, RegName, Operands[OpNum]); @@ -263,7 +262,7 @@ bool DWARFExpression::Operation::print(raw_ostream &OS, if (Size == Operation::BaseTypeRef && U) { auto Die = U->getDIEForOffset(U->getOffset() + Operands[Operand]); if (Die && Die.getTag() == dwarf::DW_TAG_base_type) { - OS << format(" (0x%08x)", U->getOffset() + Operands[Operand]); + OS << format(" (0x%08" PRIx64 ")", U->getOffset() + Operands[Operand]); if (auto Name = Die.find(dwarf::DW_AT_name)) OS << " \"" << Name->getAsCString() << "\""; } else { @@ -271,7 +270,7 @@ bool DWARFExpression::Operation::print(raw_ostream &OS, Operands[Operand]); } } else if (Size == Operation::SizeBlock) { - uint32_t Offset = Operands[Operand]; + uint64_t Offset = Operands[Operand]; for (unsigned 
i = 0; i < Operands[Operand - 1]; ++i) OS << format(" 0x%02x", Expr->Data.getU8(&Offset)); } else { @@ -290,7 +289,7 @@ void DWARFExpression::print(raw_ostream &OS, const MCRegisterInfo *RegInfo, uint32_t EntryValExprSize = 0; for (auto &Op : *this) { if (!Op.print(OS, this, RegInfo, U, IsEH)) { - uint32_t FailOffset = Op.getEndOffset(); + uint64_t FailOffset = Op.getEndOffset(); while (FailOffset < Data.getData().size()) OS << format(" %02x", Data.getU8(&FailOffset)); return; diff --git a/lib/DebugInfo/DWARF/DWARFFormValue.cpp b/lib/DebugInfo/DWARF/DWARFFormValue.cpp index 290d35511cdb..26090638b34c 100644 --- a/lib/DebugInfo/DWARF/DWARFFormValue.cpp +++ b/lib/DebugInfo/DWARF/DWARFFormValue.cpp @@ -98,7 +98,7 @@ DWARFFormValue DWARFFormValue::createFromBlockValue(dwarf::Form F, } DWARFFormValue DWARFFormValue::createFromUnit(dwarf::Form F, const DWARFUnit *U, - uint32_t *OffsetPtr) { + uint64_t *OffsetPtr) { DWARFFormValue FormValue(F); FormValue.extractValue(U->getDebugInfoExtractor(), OffsetPtr, U->getFormParams(), U); @@ -106,7 +106,7 @@ DWARFFormValue DWARFFormValue::createFromUnit(dwarf::Form F, const DWARFUnit *U, } bool DWARFFormValue::skipValue(dwarf::Form Form, DataExtractor DebugInfoData, - uint32_t *OffsetPtr, + uint64_t *OffsetPtr, const dwarf::FormParams Params) { bool Indirect = false; do { @@ -234,7 +234,7 @@ bool DWARFFormValue::isFormClass(DWARFFormValue::FormClass FC) const { } bool DWARFFormValue::extractValue(const DWARFDataExtractor &Data, - uint32_t *OffsetPtr, dwarf::FormParams FP, + uint64_t *OffsetPtr, dwarf::FormParams FP, const DWARFContext *Ctx, const DWARFUnit *CU) { if (!Ctx && CU) @@ -590,7 +590,7 @@ Optional DWARFFormValue::getAsCString() const { // FIXME: Add support for DW_FORM_GNU_strp_alt if (Form == DW_FORM_GNU_strp_alt || C == nullptr) return None; - uint32_t Offset = Value.uval; + uint64_t Offset = Value.uval; if (Form == DW_FORM_line_strp) { // .debug_line_str is tracked in the Context. if (const char *Str = C->getLineStringExtractor().getCStr(&Offset)) @@ -624,6 +624,7 @@ Optional DWARFFormValue::getAsAddress() const { return SA->Address; return None; } + Optional DWARFFormValue::getAsSectionedAddress() const { if (!isFormClass(FC_Address)) diff --git a/lib/DebugInfo/DWARF/DWARFGdbIndex.cpp b/lib/DebugInfo/DWARF/DWARFGdbIndex.cpp index f5f975578082..252b58e5a591 100644 --- a/lib/DebugInfo/DWARF/DWARFGdbIndex.cpp +++ b/lib/DebugInfo/DWARF/DWARFGdbIndex.cpp @@ -112,7 +112,7 @@ void DWARFGdbIndex::dump(raw_ostream &OS) { } bool DWARFGdbIndex::parseImpl(DataExtractor Data) { - uint32_t Offset = 0; + uint64_t Offset = 0; // Only version 7 is supported at this moment. Version = Data.getU32(&Offset); diff --git a/lib/DebugInfo/DWARF/DWARFListTable.cpp b/lib/DebugInfo/DWARF/DWARFListTable.cpp index e38e706227da..269ea9f79a6e 100644 --- a/lib/DebugInfo/DWARF/DWARFListTable.cpp +++ b/lib/DebugInfo/DWARF/DWARFListTable.cpp @@ -16,33 +16,42 @@ using namespace llvm; Error DWARFListTableHeader::extract(DWARFDataExtractor Data, - uint32_t *OffsetPtr) { + uint64_t *OffsetPtr) { HeaderOffset = *OffsetPtr; // Read and verify the length field. if (!Data.isValidOffsetForDataOfSize(*OffsetPtr, sizeof(uint32_t))) return createStringError(errc::invalid_argument, "section is not large enough to contain a " - "%s table length at offset 0x%" PRIx32, + "%s table length at offset 0x%" PRIx64, SectionName.data(), *OffsetPtr); - // TODO: Add support for DWARF64. 
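// [Illustrative sketch, not part of the vendored patch.] Most of the
// signatures changed above follow the same cursor idiom: the reader takes a
// uint64_t *OffsetPtr, reads at that position, and advances it. That is why
// the patch widens the cursor type itself (DWARFExpression, DWARFFormValue,
// DWARFGdbIndex, ...) instead of casting at call sites. TinyExtractor below is
// a made-up stand-in that assumes host byte order and does no bounds checking.
#include <cstdint>
#include <cstring>
#include <vector>

struct TinyExtractor {
  std::vector<uint8_t> Bytes;

  uint32_t getU32(uint64_t *OffsetPtr) const {
    uint32_t V = 0;
    std::memcpy(&V, Bytes.data() + *OffsetPtr, sizeof(V));
    *OffsetPtr += sizeof(V); // the cursor moves past what was just read
    return V;
  }
};

// Usage: one cursor is threaded through successive reads.
//   uint64_t Offset = 0;
//   uint32_t A = Ex.getU32(&Offset); // Offset is now 4
//   uint32_t B = Ex.getU32(&Offset); // Offset is now 8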
- HeaderData.Length = Data.getRelocatedValue(4, OffsetPtr); - if (HeaderData.Length == 0xffffffffu) - return createStringError(errc::not_supported, - "DWARF64 is not supported in %s at offset 0x%" PRIx32, - SectionName.data(), HeaderOffset); Format = dwarf::DwarfFormat::DWARF32; - if (HeaderData.Length + sizeof(uint32_t) < sizeof(Header)) + uint8_t OffsetByteSize = 4; + HeaderData.Length = Data.getRelocatedValue(4, OffsetPtr); + if (HeaderData.Length == dwarf::DW_LENGTH_DWARF64) { + Format = dwarf::DwarfFormat::DWARF64; + OffsetByteSize = 8; + HeaderData.Length = Data.getU64(OffsetPtr); + } else if (HeaderData.Length >= dwarf::DW_LENGTH_lo_reserved) { + return createStringError(errc::invalid_argument, + "%s table at offset 0x%" PRIx64 + " has unsupported reserved unit length of value 0x%8.8" PRIx64, + SectionName.data(), HeaderOffset, HeaderData.Length); + } + uint64_t FullLength = + HeaderData.Length + dwarf::getUnitLengthFieldByteSize(Format); + assert(FullLength == length()); + if (FullLength < getHeaderSize(Format)) return createStringError(errc::invalid_argument, - "%s table at offset 0x%" PRIx32 - " has too small length (0x%" PRIx32 + "%s table at offset 0x%" PRIx64 + " has too small length (0x%" PRIx64 ") to contain a complete header", - SectionName.data(), HeaderOffset, length()); - uint32_t End = HeaderOffset + length(); - if (!Data.isValidOffsetForDataOfSize(HeaderOffset, End - HeaderOffset)) + SectionName.data(), HeaderOffset, FullLength); + uint64_t End = HeaderOffset + FullLength; + if (!Data.isValidOffsetForDataOfSize(HeaderOffset, FullLength)) return createStringError(errc::invalid_argument, "section is not large enough to contain a %s table " - "of length 0x%" PRIx32 " at offset 0x%" PRIx32, - SectionName.data(), length(), HeaderOffset); + "of length 0x%" PRIx64 " at offset 0x%" PRIx64, + SectionName.data(), FullLength, HeaderOffset); HeaderData.Version = Data.getU16(OffsetPtr); HeaderData.AddrSize = Data.getU8(OffsetPtr); @@ -53,35 +62,35 @@ Error DWARFListTableHeader::extract(DWARFDataExtractor Data, if (HeaderData.Version != 5) return createStringError(errc::invalid_argument, "unrecognised %s table version %" PRIu16 - " in table at offset 0x%" PRIx32, + " in table at offset 0x%" PRIx64, SectionName.data(), HeaderData.Version, HeaderOffset); if (HeaderData.AddrSize != 4 && HeaderData.AddrSize != 8) return createStringError(errc::not_supported, - "%s table at offset 0x%" PRIx32 + "%s table at offset 0x%" PRIx64 " has unsupported address size %" PRIu8, SectionName.data(), HeaderOffset, HeaderData.AddrSize); if (HeaderData.SegSize != 0) return createStringError(errc::not_supported, - "%s table at offset 0x%" PRIx32 + "%s table at offset 0x%" PRIx64 " has unsupported segment selector size %" PRIu8, SectionName.data(), HeaderOffset, HeaderData.SegSize); - if (End < HeaderOffset + sizeof(HeaderData) + - HeaderData.OffsetEntryCount * sizeof(uint32_t)) + if (End < HeaderOffset + getHeaderSize(Format) + + HeaderData.OffsetEntryCount * OffsetByteSize) return createStringError(errc::invalid_argument, - "%s table at offset 0x%" PRIx32 " has more offset entries (%" PRIu32 + "%s table at offset 0x%" PRIx64 " has more offset entries (%" PRIu32 ") than there is space for", SectionName.data(), HeaderOffset, HeaderData.OffsetEntryCount); Data.setAddressSize(HeaderData.AddrSize); for (uint32_t I = 0; I < HeaderData.OffsetEntryCount; ++I) - Offsets.push_back(Data.getRelocatedValue(4, OffsetPtr)); + Offsets.push_back(Data.getRelocatedValue(OffsetByteSize, OffsetPtr)); return Error::success(); 
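// [Illustrative sketch, not part of the vendored patch.] The new code above is
// an instance of the DWARF "initial length" convention: a 4-byte length where
// 0xffffffff (DW_LENGTH_DWARF64) announces that an 8-byte DWARF64 length
// follows, and 0xfffffff0 through 0xfffffffe (DW_LENGTH_lo_reserved and up)
// are reserved. The Extractor parameter is assumed to provide
// DataExtractor-style getU32/getU64(uint64_t *) readers.
#include <cstdint>
#include <optional>

enum class DwarfFormat { DWARF32, DWARF64 };

struct InitialLength {
  DwarfFormat Format;
  uint64_t Length;
};

template <typename Extractor>
std::optional<InitialLength> readInitialLength(Extractor &Data,
                                               uint64_t *OffsetPtr) {
  uint64_t Len = Data.getU32(OffsetPtr);
  if (Len == 0xffffffffULL) // DW_LENGTH_DWARF64 escape value
    return InitialLength{DwarfFormat::DWARF64, Data.getU64(OffsetPtr)};
  if (Len >= 0xfffffff0ULL) // reserved range: reject
    return std::nullopt;
  return InitialLength{DwarfFormat::DWARF32, Len};
}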
} void DWARFListTableHeader::dump(raw_ostream &OS, DIDumpOptions DumpOpts) const { if (DumpOpts.Verbose) - OS << format("0x%8.8" PRIx32 ": ", HeaderOffset); + OS << format("0x%8.8" PRIx64 ": ", HeaderOffset); OS << format( - "%s list header: length = 0x%8.8" PRIx32 ", version = 0x%4.4" PRIx16 ", " + "%s list header: length = 0x%8.8" PRIx64 ", version = 0x%4.4" PRIx16 ", " "addr_size = 0x%2.2" PRIx8 ", seg_size = 0x%2.2" PRIx8 ", offset_entry_count = " "0x%8.8" PRIx32 "\n", @@ -91,18 +100,17 @@ void DWARFListTableHeader::dump(raw_ostream &OS, DIDumpOptions DumpOpts) const { if (HeaderData.OffsetEntryCount > 0) { OS << "offsets: ["; for (const auto &Off : Offsets) { - OS << format("\n0x%8.8" PRIx32, Off); + OS << format("\n0x%8.8" PRIx64, Off); if (DumpOpts.Verbose) - OS << format(" => 0x%8.8" PRIx32, - Off + HeaderOffset + sizeof(HeaderData)); + OS << format(" => 0x%8.8" PRIx64, + Off + HeaderOffset + getHeaderSize(Format)); } OS << "\n]\n"; } } -uint32_t DWARFListTableHeader::length() const { +uint64_t DWARFListTableHeader::length() const { if (HeaderData.Length == 0) return 0; - // TODO: DWARF64 support. - return HeaderData.Length + sizeof(uint32_t); + return HeaderData.Length + dwarf::getUnitLengthFieldByteSize(Format); } diff --git a/lib/DebugInfo/DWARF/DWARFTypeUnit.cpp b/lib/DebugInfo/DWARF/DWARFTypeUnit.cpp index 844920ba5b11..bb81090ba25c 100644 --- a/lib/DebugInfo/DWARF/DWARFTypeUnit.cpp +++ b/lib/DebugInfo/DWARF/DWARFTypeUnit.cpp @@ -24,21 +24,23 @@ void DWARFTypeUnit::dump(raw_ostream &OS, DIDumpOptions DumpOpts) { if (DumpOpts.SummarizeTypes) { OS << "name = '" << Name << "'" << " type_signature = " << format("0x%016" PRIx64, getTypeHash()) - << " length = " << format("0x%08x", getLength()) << '\n'; + << " length = " << format("0x%08" PRIx64, getLength()) << '\n'; return; } - OS << format("0x%08x", getOffset()) << ": Type Unit:" - << " length = " << format("0x%08x", getLength()) + OS << format("0x%08" PRIx64, getOffset()) << ": Type Unit:" + << " length = " << format("0x%08" PRIx64, getLength()) << " version = " << format("0x%04x", getVersion()); if (getVersion() >= 5) OS << " unit_type = " << dwarf::UnitTypeString(getUnitType()); - OS << " abbr_offset = " << format("0x%04x", getAbbreviations()->getOffset()) + OS << " abbr_offset = " + << format("0x%04" PRIx64, getAbbreviations()->getOffset()) << " addr_size = " << format("0x%02x", getAddressByteSize()) << " name = '" << Name << "'" << " type_signature = " << format("0x%016" PRIx64, getTypeHash()) - << " type_offset = " << format("0x%04x", getTypeOffset()) - << " (next unit at " << format("0x%08x", getNextUnitOffset()) << ")\n"; + << " type_offset = " << format("0x%04" PRIx64, getTypeOffset()) + << " (next unit at " << format("0x%08" PRIx64, getNextUnitOffset()) + << ")\n"; if (DWARFDie TU = getUnitDIE(false)) TU.dump(OS, 0, DumpOpts); diff --git a/lib/DebugInfo/DWARF/DWARFUnit.cpp b/lib/DebugInfo/DWARF/DWARFUnit.cpp index b74acf60c747..a56402a707ad 100644 --- a/lib/DebugInfo/DWARF/DWARFUnit.cpp +++ b/lib/DebugInfo/DWARF/DWARFUnit.cpp @@ -37,9 +37,9 @@ void DWARFUnitVector::addUnitsForSection(DWARFContext &C, const DWARFSection &Section, DWARFSectionKind SectionKind) { const DWARFObject &D = C.getDWARFObj(); - addUnitsImpl(C, D, Section, C.getDebugAbbrev(), &D.getRangeSection(), - &D.getLocSection(), D.getStringSection(), - D.getStringOffsetSection(), &D.getAddrSection(), + addUnitsImpl(C, D, Section, C.getDebugAbbrev(), &D.getRangesSection(), + &D.getLocSection(), D.getStrSection(), + D.getStrOffsetsSection(), 
&D.getAddrSection(), D.getLineSection(), D.isLittleEndian(), false, false, SectionKind); } @@ -49,9 +49,9 @@ void DWARFUnitVector::addUnitsForDWOSection(DWARFContext &C, DWARFSectionKind SectionKind, bool Lazy) { const DWARFObject &D = C.getDWARFObj(); - addUnitsImpl(C, D, DWOSection, C.getDebugAbbrevDWO(), &D.getRangeDWOSection(), - &D.getLocDWOSection(), D.getStringDWOSection(), - D.getStringOffsetDWOSection(), &D.getAddrSection(), + addUnitsImpl(C, D, DWOSection, C.getDebugAbbrevDWO(), &D.getRangesDWOSection(), + &D.getLocDWOSection(), D.getStrDWOSection(), + D.getStrOffsetsDWOSection(), &D.getAddrSection(), D.getLineDWOSection(), C.isLittleEndian(), true, Lazy, SectionKind); } @@ -66,7 +66,7 @@ void DWARFUnitVector::addUnitsImpl( // Lazy initialization of Parser, now that we have all section info. if (!Parser) { Parser = [=, &Context, &Obj, &Section, &SOS, - &LS](uint32_t Offset, DWARFSectionKind SectionKind, + &LS](uint64_t Offset, DWARFSectionKind SectionKind, const DWARFSection *CurSection, const DWARFUnitIndex::Entry *IndexEntry) -> std::unique_ptr { @@ -83,11 +83,11 @@ void DWARFUnitVector::addUnitsImpl( return nullptr; std::unique_ptr U; if (Header.isTypeUnit()) - U = llvm::make_unique(Context, InfoSection, Header, DA, + U = std::make_unique(Context, InfoSection, Header, DA, RS, LocSection, SS, SOS, AOS, LS, LE, IsDWO, *this); else - U = llvm::make_unique(Context, InfoSection, Header, + U = std::make_unique(Context, InfoSection, Header, DA, RS, LocSection, SS, SOS, AOS, LS, LE, IsDWO, *this); return U; @@ -101,7 +101,7 @@ void DWARFUnitVector::addUnitsImpl( // within a section, although not necessarily within the object file, // even if we do lazy parsing. auto I = this->begin(); - uint32_t Offset = 0; + uint64_t Offset = 0; while (Data.isValidOffset(Offset)) { if (I != this->end() && (&(*I)->getInfoSection() != &Section || (*I)->getOffset() == Offset)) { @@ -126,11 +126,11 @@ DWARFUnit *DWARFUnitVector::addUnit(std::unique_ptr Unit) { return this->insert(I, std::move(Unit))->get(); } -DWARFUnit *DWARFUnitVector::getUnitForOffset(uint32_t Offset) const { +DWARFUnit *DWARFUnitVector::getUnitForOffset(uint64_t Offset) const { auto end = begin() + getNumInfoUnits(); auto *CU = std::upper_bound(begin(), end, Offset, - [](uint32_t LHS, const std::unique_ptr &RHS) { + [](uint64_t LHS, const std::unique_ptr &RHS) { return LHS < RHS->getNextUnitOffset(); }); if (CU != end && (*CU)->getOffset() <= Offset) @@ -149,7 +149,7 @@ DWARFUnitVector::getUnitForIndexEntry(const DWARFUnitIndex::Entry &E) { auto *CU = std::upper_bound(begin(), end, CUOff->Offset, - [](uint32_t LHS, const std::unique_ptr &RHS) { + [](uint64_t LHS, const std::unique_ptr &RHS) { return LHS < RHS->getNextUnitOffset(); }); if (CU != end && (*CU)->getOffset() <= Offset) @@ -209,7 +209,7 @@ DWARFUnit::getAddrOffsetSectionItem(uint32_t Index) const { if (I != R.end() && std::next(I) == R.end()) return (*I)->getAddrOffsetSectionItem(Index); } - uint32_t Offset = AddrOffsetSectionBase + Index * getAddressByteSize(); + uint64_t Offset = AddrOffsetSectionBase + Index * getAddressByteSize(); if (AddrOffsetSection->Data.size() < Offset + getAddressByteSize()) return None; DWARFDataExtractor DA(Context.getDWARFObj(), *AddrOffsetSection, @@ -223,7 +223,7 @@ Optional DWARFUnit::getStringOffsetSectionItem(uint32_t Index) const { if (!StringOffsetsTableContribution) return None; unsigned ItemSize = getDwarfStringOffsetsByteSize(); - uint32_t Offset = getStringOffsetsBase() + Index * ItemSize; + uint64_t Offset = getStringOffsetsBase() 
+ Index * ItemSize; if (StringOffsetSection.Data.size() < Offset + ItemSize) return None; DWARFDataExtractor DA(Context.getDWARFObj(), StringOffsetSection, @@ -233,7 +233,7 @@ Optional DWARFUnit::getStringOffsetSectionItem(uint32_t Index) const { bool DWARFUnitHeader::extract(DWARFContext &Context, const DWARFDataExtractor &debug_info, - uint32_t *offset_ptr, + uint64_t *offset_ptr, DWARFSectionKind SectionKind, const DWARFUnitIndex *Index, const DWARFUnitIndex::Entry *Entry) { @@ -243,11 +243,9 @@ bool DWARFUnitHeader::extract(DWARFContext &Context, IndexEntry = Index->getFromOffset(*offset_ptr); Length = debug_info.getRelocatedValue(4, offset_ptr); FormParams.Format = DWARF32; - unsigned SizeOfLength = 4; - if (Length == 0xffffffff) { + if (Length == dwarf::DW_LENGTH_DWARF64) { Length = debug_info.getU64(offset_ptr); FormParams.Format = DWARF64; - SizeOfLength = 8; } FormParams.Version = debug_info.getU16(offset_ptr); if (FormParams.Version >= 5) { @@ -277,7 +275,8 @@ bool DWARFUnitHeader::extract(DWARFContext &Context, } if (isTypeUnit()) { TypeHash = debug_info.getU64(offset_ptr); - TypeOffset = debug_info.getU32(offset_ptr); + TypeOffset = + debug_info.getUnsigned(offset_ptr, FormParams.getDwarfOffsetByteSize()); } else if (UnitType == DW_UT_split_compile || UnitType == DW_UT_skeleton) DWOId = debug_info.getU64(offset_ptr); @@ -290,7 +289,8 @@ bool DWARFUnitHeader::extract(DWARFContext &Context, bool TypeOffsetOK = !isTypeUnit() ? true - : TypeOffset >= Size && TypeOffset < getLength() + SizeOfLength; + : TypeOffset >= Size && + TypeOffset < getLength() + getUnitLengthFieldByteSize(); bool LengthOK = debug_info.isValidOffset(getNextUnitOffset() - 1); bool VersionOK = DWARFContext::isSupportedVersion(getVersion()); bool AddrSizeOK = getAddressByteSize() == 4 || getAddressByteSize() == 8; @@ -306,16 +306,18 @@ bool DWARFUnitHeader::extract(DWARFContext &Context, // Parse the rangelist table header, including the optional array of offsets // following it (DWARF v5 and later). static Expected -parseRngListTableHeader(DWARFDataExtractor &DA, uint32_t Offset) { - // TODO: Support DWARF64 +parseRngListTableHeader(DWARFDataExtractor &DA, uint64_t Offset, + DwarfFormat Format) { // We are expected to be called with Offset 0 or pointing just past the table - // header, which is 12 bytes long for DWARF32. + // header. Correct Offset in the latter case so that it points to the start + // of the header. if (Offset > 0) { - if (Offset < 12U) + uint64_t HeaderSize = DWARFListTableHeader::getHeaderSize(Format); + if (Offset < HeaderSize) return createStringError(errc::invalid_argument, "Did not detect a valid" - " range list table with base = 0x%" PRIu32, + " range list table with base = 0x%" PRIx64 "\n", Offset); - Offset -= 12U; + Offset -= HeaderSize; } llvm::DWARFDebugRnglistTable Table; if (Error E = Table.extractHeaderAndOffsets(DA, &Offset)) @@ -323,13 +325,13 @@ parseRngListTableHeader(DWARFDataExtractor &DA, uint32_t Offset) { return Table; } -Error DWARFUnit::extractRangeList(uint32_t RangeListOffset, +Error DWARFUnit::extractRangeList(uint64_t RangeListOffset, DWARFDebugRangeList &RangeList) const { // Require that compile unit is extracted. 
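// [Illustrative sketch, not part of the vendored patch.] The rewind in
// parseRngListTableHeader() above works because DW_AT_rnglists_base points
// just past the .debug_rnglists table header, whose size now depends on the
// unit length format. Recomputing it from the field layout:
#include <cstdint>

enum class DwarfFormat { DWARF32, DWARF64 };

constexpr uint64_t rnglistHeaderSize(DwarfFormat F) {
  // unit_length (4, or 4-byte escape + 8) + version (2) + address_size (1)
  // + segment_selector_size (1) + offset_entry_count (4)
  return (F == DwarfFormat::DWARF64 ? 12 : 4) + 2 + 1 + 1 + 4;
}

static_assert(rnglistHeaderSize(DwarfFormat::DWARF32) == 12, "");
static_assert(rnglistHeaderSize(DwarfFormat::DWARF64) == 20, "");

// A base of 0 means "start of the section"; any other base is rewound by
// rnglistHeaderSize(Format) before the header is re-read.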
assert(!DieArray.empty()); DWARFDataExtractor RangesData(Context.getDWARFObj(), *RangeSection, isLittleEndian, getAddressByteSize()); - uint32_t ActualRangeListOffset = RangeSectionBase + RangeListOffset; + uint64_t ActualRangeListOffset = RangeSectionBase + RangeListOffset; return RangeList.extract(RangesData, &ActualRangeListOffset); } @@ -354,8 +356,8 @@ void DWARFUnit::extractDIEsToVector( // Set the offset to that of the first DIE and calculate the start of the // next compilation unit header. - uint32_t DIEOffset = getOffset() + getHeaderSize(); - uint32_t NextCUOffset = getNextUnitOffset(); + uint64_t DIEOffset = getOffset() + getHeaderSize(); + uint64_t NextCUOffset = getNextUnitOffset(); DWARFDebugInfoEntry DIE; DWARFDataExtractor DebugInfoData = getDebugInfoExtractor(); uint32_t Depth = 0; @@ -396,90 +398,98 @@ void DWARFUnit::extractDIEsToVector( // unit header). if (DIEOffset > NextCUOffset) WithColor::warning() << format("DWARF compile unit extends beyond its " - "bounds cu 0x%8.8x at 0x%8.8x\n", + "bounds cu 0x%8.8" PRIx64 " " + "at 0x%8.8" PRIx64 "\n", getOffset(), DIEOffset); } -size_t DWARFUnit::extractDIEsIfNeeded(bool CUDieOnly) { +void DWARFUnit::extractDIEsIfNeeded(bool CUDieOnly) { + if (Error e = tryExtractDIEsIfNeeded(CUDieOnly)) + WithColor::error() << toString(std::move(e)); +} + +Error DWARFUnit::tryExtractDIEsIfNeeded(bool CUDieOnly) { if ((CUDieOnly && !DieArray.empty()) || DieArray.size() > 1) - return 0; // Already parsed. + return Error::success(); // Already parsed. bool HasCUDie = !DieArray.empty(); extractDIEsToVector(!HasCUDie, !CUDieOnly, DieArray); if (DieArray.empty()) - return 0; + return Error::success(); // If CU DIE was just parsed, copy several attribute values from it. - if (!HasCUDie) { - DWARFDie UnitDie = getUnitDIE(); - if (Optional DWOId = toUnsigned(UnitDie.find(DW_AT_GNU_dwo_id))) - Header.setDWOId(*DWOId); - if (!IsDWO) { - assert(AddrOffsetSectionBase == 0); - assert(RangeSectionBase == 0); - AddrOffsetSectionBase = toSectionOffset(UnitDie.find(DW_AT_addr_base), 0); - if (!AddrOffsetSectionBase) - AddrOffsetSectionBase = - toSectionOffset(UnitDie.find(DW_AT_GNU_addr_base), 0); - RangeSectionBase = toSectionOffset(UnitDie.find(DW_AT_rnglists_base), 0); - } - - // In general, in DWARF v5 and beyond we derive the start of the unit's - // contribution to the string offsets table from the unit DIE's - // DW_AT_str_offsets_base attribute. Split DWARF units do not use this - // attribute, so we assume that there is a contribution to the string - // offsets table starting at offset 0 of the debug_str_offsets.dwo section. - // In both cases we need to determine the format of the contribution, - // which may differ from the unit's format. - DWARFDataExtractor DA(Context.getDWARFObj(), StringOffsetSection, - isLittleEndian, 0); - if (IsDWO || getVersion() >= 5) { - auto StringOffsetOrError = - IsDWO ? 
determineStringOffsetsTableContributionDWO(DA) - : determineStringOffsetsTableContribution(DA); - if (!StringOffsetOrError) { - WithColor::error() << "invalid contribution to string offsets table in section .debug_str_offsets[.dwo]: " - << toString(StringOffsetOrError.takeError()) << '\n'; - } else { - StringOffsetsTableContribution = *StringOffsetOrError; - } - } + if (HasCUDie) + return Error::success(); + + DWARFDie UnitDie(this, &DieArray[0]); + if (Optional DWOId = toUnsigned(UnitDie.find(DW_AT_GNU_dwo_id))) + Header.setDWOId(*DWOId); + if (!IsDWO) { + assert(AddrOffsetSectionBase == 0); + assert(RangeSectionBase == 0); + AddrOffsetSectionBase = toSectionOffset(UnitDie.find(DW_AT_addr_base), 0); + if (!AddrOffsetSectionBase) + AddrOffsetSectionBase = + toSectionOffset(UnitDie.find(DW_AT_GNU_addr_base), 0); + RangeSectionBase = toSectionOffset(UnitDie.find(DW_AT_rnglists_base), 0); + } - // DWARF v5 uses the .debug_rnglists and .debug_rnglists.dwo sections to - // describe address ranges. - if (getVersion() >= 5) { - if (IsDWO) - setRangesSection(&Context.getDWARFObj().getRnglistsDWOSection(), 0); - else - setRangesSection(&Context.getDWARFObj().getRnglistsSection(), - toSectionOffset(UnitDie.find(DW_AT_rnglists_base), 0)); - if (RangeSection->Data.size()) { - // Parse the range list table header. Individual range lists are - // extracted lazily. - DWARFDataExtractor RangesDA(Context.getDWARFObj(), *RangeSection, - isLittleEndian, 0); - if (auto TableOrError = - parseRngListTableHeader(RangesDA, RangeSectionBase)) - RngListTable = TableOrError.get(); - else - WithColor::error() << "parsing a range list table: " - << toString(TableOrError.takeError()) - << '\n'; - - // In a split dwarf unit, there is no DW_AT_rnglists_base attribute. - // Adjust RangeSectionBase to point past the table header. - if (IsDWO && RngListTable) - RangeSectionBase = RngListTable->getHeaderSize(); - } - } + // In general, in DWARF v5 and beyond we derive the start of the unit's + // contribution to the string offsets table from the unit DIE's + // DW_AT_str_offsets_base attribute. Split DWARF units do not use this + // attribute, so we assume that there is a contribution to the string + // offsets table starting at offset 0 of the debug_str_offsets.dwo section. + // In both cases we need to determine the format of the contribution, + // which may differ from the unit's format. + DWARFDataExtractor DA(Context.getDWARFObj(), StringOffsetSection, + isLittleEndian, 0); + if (IsDWO || getVersion() >= 5) { + auto StringOffsetOrError = + IsDWO ? determineStringOffsetsTableContributionDWO(DA) + : determineStringOffsetsTableContribution(DA); + if (!StringOffsetOrError) + return createStringError(errc::invalid_argument, + "invalid reference to or invalid content in " + ".debug_str_offsets[.dwo]: " + + toString(StringOffsetOrError.takeError())); + + StringOffsetsTableContribution = *StringOffsetOrError; + } - // Don't fall back to DW_AT_GNU_ranges_base: it should be ignored for - // skeleton CU DIE, so that DWARF users not aware of it are not broken. + // DWARF v5 uses the .debug_rnglists and .debug_rnglists.dwo sections to + // describe address ranges. + if (getVersion() >= 5) { + if (IsDWO) + setRangesSection(&Context.getDWARFObj().getRnglistsDWOSection(), 0); + else + setRangesSection(&Context.getDWARFObj().getRnglistsSection(), + toSectionOffset(UnitDie.find(DW_AT_rnglists_base), 0)); + if (RangeSection->Data.size()) { + // Parse the range list table header. Individual range lists are + // extracted lazily. 
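// [Illustrative sketch, not part of the vendored patch.] The refactor above
// splits DIE extraction into an llvm::Error-returning worker plus a thin
// reporting wrapper, so new callers can propagate the failure instead of
// having it printed for them. The same shape on a made-up type (Widget and
// tryLoad are illustrative names only):
#include "llvm/Support/Error.h"
#include "llvm/Support/WithColor.h"

namespace {
struct Widget {
  bool Corrupt = false;

  llvm::Error tryLoad() { // new-style: the caller decides how to handle failure
    if (Corrupt)
      return llvm::createStringError(std::errc::invalid_argument,
                                     "widget data is corrupt");
    return llvm::Error::success();
  }

  void load() { // legacy entry point, kept as a reporting wrapper
    if (llvm::Error E = tryLoad())
      llvm::WithColor::error() << llvm::toString(std::move(E)) << '\n';
  }
};
} // namespace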
+ DWARFDataExtractor RangesDA(Context.getDWARFObj(), *RangeSection, + isLittleEndian, 0); + auto TableOrError = parseRngListTableHeader(RangesDA, RangeSectionBase, + Header.getFormat()); + if (!TableOrError) + return createStringError(errc::invalid_argument, + "parsing a range list table: " + + toString(TableOrError.takeError())); + + RngListTable = TableOrError.get(); + + // In a split dwarf unit, there is no DW_AT_rnglists_base attribute. + // Adjust RangeSectionBase to point past the table header. + if (IsDWO && RngListTable) + RangeSectionBase = RngListTable->getHeaderSize(); } + } - return DieArray.size(); + // Don't fall back to DW_AT_GNU_ranges_base: it should be ignored for + // skeleton CU DIE, so that DWARF users not aware of it are not broken. + return Error::success(); } bool DWARFUnit::parseDWO() { @@ -517,7 +527,8 @@ bool DWARFUnit::parseDWO() { DWO->setRangesSection(&Context.getDWARFObj().getRnglistsDWOSection(), 0); DWARFDataExtractor RangesDA(Context.getDWARFObj(), *RangeSection, isLittleEndian, 0); - if (auto TableOrError = parseRngListTableHeader(RangesDA, RangeSectionBase)) + if (auto TableOrError = parseRngListTableHeader(RangesDA, RangeSectionBase, + Header.getFormat())) DWO->RngListTable = TableOrError.get(); else WithColor::error() << "parsing a range list table: " @@ -541,7 +552,7 @@ void DWARFUnit::clearDIEs(bool KeepCUDie) { } Expected -DWARFUnit::findRnglistFromOffset(uint32_t Offset) { +DWARFUnit::findRnglistFromOffset(uint64_t Offset) { if (getVersion() <= 4) { DWARFDebugRangeList RangeList; if (Error E = extractRangeList(Offset, RangeList)) @@ -569,9 +580,9 @@ DWARFUnit::findRnglistFromIndex(uint32_t Index) { if (RngListTable) return createStringError(errc::invalid_argument, "invalid range list table index %d", Index); - else - return createStringError(errc::invalid_argument, - "missing or invalid range list table"); + + return createStringError(errc::invalid_argument, + "missing or invalid range list table"); } Expected DWARFUnit::collectAddressRanges() { @@ -780,11 +791,11 @@ StrOffsetsContributionDescriptor::validateContributionSize( // Look for a DWARF64-formatted contribution to the string offsets table // starting at a given offset and record it in a descriptor. static Expected -parseDWARF64StringOffsetsTableHeader(DWARFDataExtractor &DA, uint32_t Offset) { +parseDWARF64StringOffsetsTableHeader(DWARFDataExtractor &DA, uint64_t Offset) { if (!DA.isValidOffsetForDataOfSize(Offset, 16)) return createStringError(errc::invalid_argument, "section offset exceeds section size"); - if (DA.getU32(&Offset) != 0xffffffff) + if (DA.getU32(&Offset) != dwarf::DW_LENGTH_DWARF64) return createStringError(errc::invalid_argument, "32 bit contribution referenced from a 64 bit unit"); uint64_t Size = DA.getU64(&Offset); @@ -798,12 +809,12 @@ parseDWARF64StringOffsetsTableHeader(DWARFDataExtractor &DA, uint32_t Offset) { // Look for a DWARF32-formatted contribution to the string offsets table // starting at a given offset and record it in a descriptor. 
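// [Illustrative sketch, not part of the vendored patch.] The two parsers above
// and below expect the .debug_str_offsets contribution headers laid out as
// follows; DW_AT_str_offsets_base points at the first offset entry, i.e. just
// past the header, which is why the callers rewind by 16 or 8 bytes.
//
//   DWARF32 header (8 bytes)      DWARF64 header (16 bytes)
//     uint32_t length;              uint32_t marker;   // 0xffffffff
//     uint16_t version;  // 5       uint64_t length;
//     uint16_t padding;             uint16_t version;  // 5
//                                   uint16_t padding;
#include <cstdint>

constexpr uint64_t strOffsetsHeaderSize(bool IsDWARF64) {
  return IsDWARF64 ? 4 + 8 + 2 + 2 : 4 + 2 + 2;
}

static_assert(strOffsetsHeaderSize(false) == 8, "");
static_assert(strOffsetsHeaderSize(true) == 16, "");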
static Expected -parseDWARF32StringOffsetsTableHeader(DWARFDataExtractor &DA, uint32_t Offset) { +parseDWARF32StringOffsetsTableHeader(DWARFDataExtractor &DA, uint64_t Offset) { if (!DA.isValidOffsetForDataOfSize(Offset, 8)) return createStringError(errc::invalid_argument, "section offset exceeds section size"); uint32_t ContributionSize = DA.getU32(&Offset); - if (ContributionSize >= 0xfffffff0) + if (ContributionSize >= dwarf::DW_LENGTH_lo_reserved) return createStringError(errc::invalid_argument, "invalid length"); uint8_t Version = DA.getU16(&Offset); @@ -823,7 +834,7 @@ parseDWARFStringOffsetsTableHeader(DWARFDataExtractor &DA, case dwarf::DwarfFormat::DWARF64: { if (Offset < 16) return createStringError(errc::invalid_argument, "insufficient space for 64 bit header prefix"); - auto DescOrError = parseDWARF64StringOffsetsTableHeader(DA, (uint32_t)Offset - 16); + auto DescOrError = parseDWARF64StringOffsetsTableHeader(DA, Offset - 16); if (!DescOrError) return DescOrError.takeError(); Desc = *DescOrError; @@ -832,7 +843,7 @@ parseDWARFStringOffsetsTableHeader(DWARFDataExtractor &DA, case dwarf::DwarfFormat::DWARF32: { if (Offset < 8) return createStringError(errc::invalid_argument, "insufficient space for 32 bit header prefix"); - auto DescOrError = parseDWARF32StringOffsetsTableHeader(DA, (uint32_t)Offset - 8); + auto DescOrError = parseDWARF32StringOffsetsTableHeader(DA, Offset - 8); if (!DescOrError) return DescOrError.takeError(); Desc = *DescOrError; diff --git a/lib/DebugInfo/DWARF/DWARFUnitIndex.cpp b/lib/DebugInfo/DWARF/DWARFUnitIndex.cpp index 047c63461ccf..f29c1e6cc5c7 100644 --- a/lib/DebugInfo/DWARF/DWARFUnitIndex.cpp +++ b/lib/DebugInfo/DWARF/DWARFUnitIndex.cpp @@ -18,7 +18,7 @@ using namespace llvm; bool DWARFUnitIndex::Header::parse(DataExtractor IndexData, - uint32_t *OffsetPtr) { + uint64_t *OffsetPtr) { if (!IndexData.isValidOffsetForDataOfSize(*OffsetPtr, 16)) return false; Version = IndexData.getU32(OffsetPtr); @@ -45,7 +45,7 @@ bool DWARFUnitIndex::parse(DataExtractor IndexData) { } bool DWARFUnitIndex::parseImpl(DataExtractor IndexData) { - uint32_t Offset = 0; + uint64_t Offset = 0; if (!Header.parse(IndexData, &Offset)) return false; @@ -54,10 +54,10 @@ bool DWARFUnitIndex::parseImpl(DataExtractor IndexData) { (2 * Header.NumUnits + 1) * 4 * Header.NumColumns)) return false; - Rows = llvm::make_unique(Header.NumBuckets); + Rows = std::make_unique(Header.NumBuckets); auto Contribs = - llvm::make_unique(Header.NumUnits); - ColumnKinds = llvm::make_unique(Header.NumColumns); + std::make_unique(Header.NumUnits); + ColumnKinds = std::make_unique(Header.NumColumns); // Read Hash Table of Signatures for (unsigned i = 0; i != Header.NumBuckets; ++i) @@ -70,7 +70,7 @@ bool DWARFUnitIndex::parseImpl(DataExtractor IndexData) { continue; Rows[i].Index = this; Rows[i].Contributions = - llvm::make_unique(Header.NumColumns); + std::make_unique(Header.NumColumns); Contribs[Index - 1] = Rows[i].Contributions.get(); } diff --git a/lib/DebugInfo/DWARF/DWARFVerifier.cpp b/lib/DebugInfo/DWARF/DWARFVerifier.cpp index c2b3189514a8..bf499b6ee092 100644 --- a/lib/DebugInfo/DWARF/DWARFVerifier.cpp +++ b/lib/DebugInfo/DWARF/DWARFVerifier.cpp @@ -34,11 +34,11 @@ DWARFVerifier::DieRangeInfo::insert(const DWARFAddressRange &R) { if (Pos != End) { if (Pos->intersects(R)) - return Pos; + return std::move(Pos); if (Pos != Begin) { auto Iter = Pos - 1; if (Iter->intersects(R)) - return Iter; + return std::move(Iter); } } @@ -98,7 +98,7 @@ bool DWARFVerifier::DieRangeInfo::intersects(const 
DieRangeInfo &RHS) const { } bool DWARFVerifier::verifyUnitHeader(const DWARFDataExtractor DebugInfoData, - uint32_t *Offset, unsigned UnitIndex, + uint64_t *Offset, unsigned UnitIndex, uint8_t &UnitType, bool &isUnitDWARF64) { uint64_t AbbrOffset, Length; uint8_t AddrSize = 0; @@ -111,9 +111,9 @@ bool DWARFVerifier::verifyUnitHeader(const DWARFDataExtractor DebugInfoData, bool ValidType = true; bool ValidAbbrevOffset = true; - uint32_t OffsetStart = *Offset; + uint64_t OffsetStart = *Offset; Length = DebugInfoData.getU32(Offset); - if (Length == UINT32_MAX) { + if (Length == dwarf::DW_LENGTH_DWARF64) { Length = DebugInfoData.getU64(Offset); isUnitDWARF64 = true; } @@ -139,7 +139,7 @@ bool DWARFVerifier::verifyUnitHeader(const DWARFDataExtractor DebugInfoData, if (!ValidLength || !ValidVersion || !ValidAddrSize || !ValidAbbrevOffset || !ValidType) { Success = false; - error() << format("Units[%d] - start offset: 0x%08x \n", UnitIndex, + error() << format("Units[%d] - start offset: 0x%08" PRIx64 " \n", UnitIndex, OffsetStart); if (!ValidLength) note() << "The length for this unit is too " @@ -203,7 +203,7 @@ unsigned DWARFVerifier::verifyUnitContents(DWARFUnit &Unit) { } unsigned DWARFVerifier::verifyDebugInfoCallSite(const DWARFDie &Die) { - if (Die.getTag() != DW_TAG_call_site) + if (Die.getTag() != DW_TAG_call_site && Die.getTag() != DW_TAG_GNU_call_site) return 0; DWARFDie Curr = Die.getParent(); @@ -223,7 +223,9 @@ unsigned DWARFVerifier::verifyDebugInfoCallSite(const DWARFDie &Die) { Optional CallAttr = Curr.find({DW_AT_call_all_calls, DW_AT_call_all_source_calls, - DW_AT_call_all_tail_calls}); + DW_AT_call_all_tail_calls, DW_AT_GNU_all_call_sites, + DW_AT_GNU_all_source_call_sites, + DW_AT_GNU_all_tail_call_sites}); if (!CallAttr) { error() << "Subprogram with call site entry has no DW_AT_call attribute:"; Curr.dump(OS); @@ -273,7 +275,7 @@ unsigned DWARFVerifier::verifyUnitSection(const DWARFSection &S, const DWARFObject &DObj = DCtx.getDWARFObj(); DWARFDataExtractor DebugInfoData(DObj, S, DCtx.isLittleEndian(), 0); unsigned NumDebugInfoErrors = 0; - uint32_t OffsetStart = 0, Offset = 0, UnitIdx = 0; + uint64_t OffsetStart = 0, Offset = 0, UnitIdx = 0; uint8_t UnitType = 0; bool isUnitDWARF64 = false; bool isHeaderChainValid = true; @@ -294,10 +296,10 @@ unsigned DWARFVerifier::verifyUnitSection(const DWARFSection &S, switch (UnitType) { case dwarf::DW_UT_type: case dwarf::DW_UT_split_type: { - Unit = TypeUnitVector.addUnit(llvm::make_unique( - DCtx, S, Header, DCtx.getDebugAbbrev(), &DObj.getRangeSection(), - &DObj.getLocSection(), DObj.getStringSection(), - DObj.getStringOffsetSection(), &DObj.getAppleObjCSection(), + Unit = TypeUnitVector.addUnit(std::make_unique( + DCtx, S, Header, DCtx.getDebugAbbrev(), &DObj.getRangesSection(), + &DObj.getLocSection(), DObj.getStrSection(), + DObj.getStrOffsetsSection(), &DObj.getAppleObjCSection(), DObj.getLineSection(), DCtx.isLittleEndian(), false, TypeUnitVector)); break; @@ -308,10 +310,10 @@ unsigned DWARFVerifier::verifyUnitSection(const DWARFSection &S, case dwarf::DW_UT_partial: // UnitType = 0 means that we are verifying a compile unit in DWARF v4. 
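// [Illustrative sketch, not part of the vendored patch.] The call-site check
// above boils down to: a DW_TAG_call_site / DW_TAG_GNU_call_site DIE must have
// an ancestor subprogram that advertises call-site information through one of
// the DW_AT_call_all_* or DW_AT_GNU_all_*_call_sites attributes. A simplified
// rendering of that walk (hasCallSiteCapableParent is an illustrative name):
#include "llvm/BinaryFormat/Dwarf.h"
#include "llvm/DebugInfo/DWARF/DWARFDie.h"

static bool hasCallSiteCapableParent(llvm::DWARFDie Die) {
  using namespace llvm::dwarf;
  for (llvm::DWARFDie Cur = Die.getParent(); Cur.isValid();
       Cur = Cur.getParent()) {
    if (Cur.getTag() != DW_TAG_subprogram)
      continue; // keep walking toward the unit DIE
    return Cur.find({DW_AT_call_all_calls, DW_AT_call_all_source_calls,
                     DW_AT_call_all_tail_calls, DW_AT_GNU_all_call_sites,
                     DW_AT_GNU_all_source_call_sites,
                     DW_AT_GNU_all_tail_call_sites})
        .hasValue();
  }
  return false; // no enclosing subprogram at all
}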
case 0: { - Unit = CompileUnitVector.addUnit(llvm::make_unique( - DCtx, S, Header, DCtx.getDebugAbbrev(), &DObj.getRangeSection(), - &DObj.getLocSection(), DObj.getStringSection(), - DObj.getStringOffsetSection(), &DObj.getAppleObjCSection(), + Unit = CompileUnitVector.addUnit(std::make_unique( + DCtx, S, Header, DCtx.getDebugAbbrev(), &DObj.getRangesSection(), + &DObj.getLocSection(), DObj.getStrSection(), + DObj.getStrOffsetsSection(), &DObj.getAppleObjCSection(), DObj.getLineSection(), DCtx.isLittleEndian(), false, CompileUnitVector)); break; @@ -449,7 +451,7 @@ unsigned DWARFVerifier::verifyDebugInfoAttribute(const DWARFDie &Die, case DW_AT_ranges: // Make sure the offset in the DW_AT_ranges attribute is valid. if (auto SectionOffset = AttrValue.Value.getAsSectionOffset()) { - if (*SectionOffset >= DObj.getRangeSection().Data.size()) + if (*SectionOffset >= DObj.getRangesSection().Data.size()) ReportError("DW_AT_ranges offset is beyond .debug_ranges bounds:"); break; } @@ -466,9 +468,9 @@ unsigned DWARFVerifier::verifyDebugInfoAttribute(const DWARFDie &Die, ReportError("DIE has invalid DW_AT_stmt_list encoding:"); break; case DW_AT_location: { - auto VerifyLocationExpr = [&](StringRef D) { + auto VerifyLocationExpr = [&](ArrayRef D) { DWARFUnit *U = Die.getDwarfUnit(); - DataExtractor Data(D, DCtx.isLittleEndian(), 0); + DataExtractor Data(toStringRef(D), DCtx.isLittleEndian(), 0); DWARFExpression Expression(Data, U->getVersion(), U->getAddressByteSize()); bool Error = llvm::any_of(Expression, [](DWARFExpression::Operation &Op) { @@ -479,13 +481,13 @@ unsigned DWARFVerifier::verifyDebugInfoAttribute(const DWARFDie &Die, }; if (Optional> Expr = AttrValue.Value.getAsBlock()) { // Verify inlined location. - VerifyLocationExpr(llvm::toStringRef(*Expr)); + VerifyLocationExpr(*Expr); } else if (auto LocOffset = AttrValue.Value.getAsSectionOffset()) { // Verify location list. if (auto DebugLoc = DCtx.getDebugLoc()) if (auto LocList = DebugLoc->getLocationListAtOffset(*LocOffset)) for (const auto &Entry : LocList->Entries) - VerifyLocationExpr({Entry.Loc.data(), Entry.Loc.size()}); + VerifyLocationExpr(Entry.Loc); } break; } @@ -500,6 +502,9 @@ unsigned DWARFVerifier::verifyDebugInfoAttribute(const DWARFDie &Die, break; if (DieTag == DW_TAG_variable && RefTag == DW_TAG_member) break; + // This might be reference to a function declaration. + if (DieTag == DW_TAG_GNU_call_site && RefTag == DW_TAG_subprogram) + break; ReportError("DIE with tag " + TagString(DieTag) + " has " + AttributeString(Attr) + " that points to DIE with " @@ -545,7 +550,7 @@ unsigned DWARFVerifier::verifyDebugInfoForm(const DWARFDie &Die, error() << FormEncodingString(Form) << " CU offset " << format("0x%08" PRIx64, CUOffset) << " is invalid (must be less than CU size of " - << format("0x%08" PRIx32, CUSize) << "):\n"; + << format("0x%08" PRIx64, CUSize) << "):\n"; Die.dump(OS, 0, DumpOpts); dump(Die) << '\n'; } else { @@ -578,7 +583,7 @@ unsigned DWARFVerifier::verifyDebugInfoForm(const DWARFDie &Die, case DW_FORM_strp: { auto SecOffset = AttrValue.Value.getAsSectionOffset(); assert(SecOffset); // DW_FORM_strp is a section offset. - if (SecOffset && *SecOffset >= DObj.getStringSection().size()) { + if (SecOffset && *SecOffset >= DObj.getStrSection().size()) { ++NumErrors; error() << "DW_FORM_strp offset beyond .debug_str bounds:\n"; dump(Die) << '\n'; @@ -605,7 +610,7 @@ unsigned DWARFVerifier::verifyDebugInfoForm(const DWARFDie &Die, // Use a 64-bit type to calculate the offset to guard against overflow. 
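// [Illustrative sketch, not part of the vendored patch.] The wraparound that
// the comment above guards against: formed in 32 bits, a large base plus
// index * size silently wraps and can then sneak past a later bounds check.
#include <cstdint>

int main() {
  uint32_t Base = 0xfffffff0u;
  uint32_t Index = 8, ItemSize = 4;
  uint32_t Narrow = Base + Index * ItemSize;                   // wraps to 0x10
  uint64_t Wide = (uint64_t)Base + (uint64_t)Index * ItemSize; // 0x100000010
  return Narrow < Wide ? 0 : 1; // the narrow result is (incorrectly) tiny
}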
uint64_t Offset = (uint64_t)DieCU->getStringOffsetsBase() + Index * ItemSize; - if (DObj.getStringOffsetSection().Data.size() < Offset + ItemSize) { + if (DObj.getStrOffsetsSection().Data.size() < Offset + ItemSize) { ++NumErrors; error() << FormEncodingString(Form) << " uses index " << format("%" PRIu64, Index) << ", which is too large:\n"; @@ -614,7 +619,7 @@ unsigned DWARFVerifier::verifyDebugInfoForm(const DWARFDie &Die, } // Check that the string offset is valid. uint64_t StringOffset = *DieCU->getStringOffsetSectionItem(Index); - if (StringOffset >= DObj.getStringSection().size()) { + if (StringOffset >= DObj.getStrSection().size()) { ++NumErrors; error() << FormEncodingString(Form) << " uses index " << format("%" PRIu64, Index) @@ -635,7 +640,7 @@ unsigned DWARFVerifier::verifyDebugInfoReferences() { // getting the DIE by offset and emitting an error OS << "Verifying .debug_info references...\n"; unsigned NumErrors = 0; - for (const std::pair> &Pair : + for (const std::pair> &Pair : ReferenceToDIEOffsets) { if (DCtx.getDIEForOffset(Pair.first)) continue; @@ -659,12 +664,12 @@ void DWARFVerifier::verifyDebugLineStmtOffsets() { auto StmtSectionOffset = toSectionOffset(Die.find(DW_AT_stmt_list)); if (!StmtSectionOffset) continue; - const uint32_t LineTableOffset = *StmtSectionOffset; + const uint64_t LineTableOffset = *StmtSectionOffset; auto LineTable = DCtx.getLineTableForUnit(CU.get()); if (LineTableOffset < DCtx.getDWARFObj().getLineSection().Data.size()) { if (!LineTable) { ++NumDebugLineErrors; - error() << ".debug_line[" << format("0x%08" PRIx32, LineTableOffset) + error() << ".debug_line[" << format("0x%08" PRIx64, LineTableOffset) << "] was not able to be parsed for CU:\n"; dump(Die) << '\n'; continue; @@ -680,8 +685,8 @@ void DWARFVerifier::verifyDebugLineStmtOffsets() { if (Iter != StmtListToDie.end()) { ++NumDebugLineErrors; error() << "two compile unit DIEs, " - << format("0x%08" PRIx32, Iter->second.getOffset()) << " and " - << format("0x%08" PRIx32, Die.getOffset()) + << format("0x%08" PRIx64, Iter->second.getOffset()) << " and " + << format("0x%08" PRIx64, Die.getOffset()) << ", have the same DW_AT_stmt_list section offset:\n"; dump(Iter->second); dump(Die) << '\n'; @@ -826,10 +831,10 @@ unsigned DWARFVerifier::verifyAppleAccelTable(const DWARFSection *AccelSection, uint32_t NumBuckets = AccelTable.getNumBuckets(); uint32_t NumHashes = AccelTable.getNumHashes(); - uint32_t BucketsOffset = + uint64_t BucketsOffset = AccelTable.getSizeHdr() + AccelTable.getHeaderDataLength(); - uint32_t HashesBase = BucketsOffset + NumBuckets * 4; - uint32_t OffsetsBase = HashesBase + NumHashes * 4; + uint64_t HashesBase = BucketsOffset + NumBuckets * 4; + uint64_t OffsetsBase = HashesBase + NumHashes * 4; for (uint32_t BucketIdx = 0; BucketIdx < NumBuckets; ++BucketIdx) { uint32_t HashIdx = AccelSectionData.getU32(&BucketsOffset); if (HashIdx >= NumHashes && HashIdx != UINT32_MAX) { @@ -849,28 +854,29 @@ unsigned DWARFVerifier::verifyAppleAccelTable(const DWARFSection *AccelSection, } for (uint32_t HashIdx = 0; HashIdx < NumHashes; ++HashIdx) { - uint32_t HashOffset = HashesBase + 4 * HashIdx; - uint32_t DataOffset = OffsetsBase + 4 * HashIdx; + uint64_t HashOffset = HashesBase + 4 * HashIdx; + uint64_t DataOffset = OffsetsBase + 4 * HashIdx; uint32_t Hash = AccelSectionData.getU32(&HashOffset); - uint32_t HashDataOffset = AccelSectionData.getU32(&DataOffset); + uint64_t HashDataOffset = AccelSectionData.getU32(&DataOffset); if 
(!AccelSectionData.isValidOffsetForDataOfSize(HashDataOffset, sizeof(uint64_t))) { - error() << format("Hash[%d] has invalid HashData offset: 0x%08x.\n", + error() << format("Hash[%d] has invalid HashData offset: " + "0x%08" PRIx64 ".\n", HashIdx, HashDataOffset); ++NumErrors; } - uint32_t StrpOffset; - uint32_t StringOffset; + uint64_t StrpOffset; + uint64_t StringOffset; uint32_t StringCount = 0; - unsigned Offset; + uint64_t Offset; unsigned Tag; while ((StrpOffset = AccelSectionData.getU32(&HashDataOffset)) != 0) { const uint32_t NumHashDataObjects = AccelSectionData.getU32(&HashDataOffset); for (uint32_t HashDataIdx = 0; HashDataIdx < NumHashDataObjects; ++HashDataIdx) { - std::tie(Offset, Tag) = AccelTable.readAtoms(HashDataOffset); + std::tie(Offset, Tag) = AccelTable.readAtoms(&HashDataOffset); auto Die = DCtx.getDIEForOffset(Offset); if (!Die) { const uint32_t BucketIdx = @@ -882,8 +888,8 @@ unsigned DWARFVerifier::verifyAppleAccelTable(const DWARFSection *AccelSection, error() << format( "%s Bucket[%d] Hash[%d] = 0x%08x " - "Str[%u] = 0x%08x " - "DIE[%d] = 0x%08x is not a valid DIE offset for \"%s\".\n", + "Str[%u] = 0x%08" PRIx64 " DIE[%d] = 0x%08" PRIx64 " " + "is not a valid DIE offset for \"%s\".\n", SectionName, BucketIdx, HashIdx, Hash, StringCount, StrpOffset, HashDataIdx, Offset, Name); @@ -908,8 +914,8 @@ unsigned DWARFVerifier::verifyDebugNamesCULists(const DWARFDebugNames &AccelTable) { // A map from CU offset to the (first) Name Index offset which claims to index // this CU. - DenseMap CUMap; - const uint32_t NotIndexed = std::numeric_limits::max(); + DenseMap CUMap; + const uint64_t NotIndexed = std::numeric_limits::max(); CUMap.reserve(DCtx.getNumCompileUnits()); for (const auto &CU : DCtx.compile_units()) @@ -924,7 +930,7 @@ DWARFVerifier::verifyDebugNamesCULists(const DWARFDebugNames &AccelTable) { continue; } for (uint32_t CU = 0, End = NI.getCUCount(); CU < End; ++CU) { - uint32_t Offset = NI.getCUOffset(CU); + uint64_t Offset = NI.getCUOffset(CU); auto Iter = CUMap.find(Offset); if (Iter == CUMap.end()) { @@ -1205,8 +1211,8 @@ unsigned DWARFVerifier::verifyNameIndexEntries( unsigned NumErrors = 0; unsigned NumEntries = 0; - uint32_t EntryID = NTE.getEntryOffset(); - uint32_t NextEntryID = EntryID; + uint64_t EntryID = NTE.getEntryOffset(); + uint64_t NextEntryID = EntryID; Expected EntryOr = NI.getEntry(&NextEntryID); for (; EntryOr; ++NumEntries, EntryID = NextEntryID, EntryOr = NI.getEntry(&NextEntryID)) { @@ -1218,7 +1224,7 @@ unsigned DWARFVerifier::verifyNameIndexEntries( ++NumErrors; continue; } - uint32_t CUOffset = NI.getCUOffset(CUIndex); + uint64_t CUOffset = NI.getCUOffset(CUIndex); uint64_t DIEOffset = CUOffset + *EntryOr->getDIEUnitOffset(); DWARFDie DIE = DCtx.getDIEForOffset(DIEOffset); if (!DIE) { @@ -1276,9 +1282,9 @@ static bool isVariableIndexable(const DWARFDie &Die, DWARFContext &DCtx) { if (!Location) return false; - auto ContainsInterestingOperators = [&](StringRef D) { + auto ContainsInterestingOperators = [&](ArrayRef D) { DWARFUnit *U = Die.getDwarfUnit(); - DataExtractor Data(D, DCtx.isLittleEndian(), U->getAddressByteSize()); + DataExtractor Data(toStringRef(D), DCtx.isLittleEndian(), U->getAddressByteSize()); DWARFExpression Expression(Data, U->getVersion(), U->getAddressByteSize()); return any_of(Expression, [](DWARFExpression::Operation &Op) { return !Op.isError() && (Op.getCode() == DW_OP_addr || @@ -1289,7 +1295,7 @@ static bool isVariableIndexable(const DWARFDie &Die, DWARFContext &DCtx) { if (Optional> Expr = 
Location->getAsBlock()) { // Inlined location. - if (ContainsInterestingOperators(toStringRef(*Expr))) + if (ContainsInterestingOperators(*Expr)) return true; } else if (Optional Offset = Location->getAsSectionOffset()) { // Location list. @@ -1297,7 +1303,7 @@ static bool isVariableIndexable(const DWARFDie &Die, DWARFContext &DCtx) { if (const DWARFDebugLoc::LocationList *LocList = DebugLoc->getLocationListAtOffset(*Offset)) { if (any_of(LocList->Entries, [&](const DWARFDebugLoc::Entry &E) { - return ContainsInterestingOperators({E.Loc.data(), E.Loc.size()}); + return ContainsInterestingOperators(E.Loc); })) return true; } @@ -1455,7 +1461,7 @@ unsigned DWARFVerifier::verifyDebugNames(const DWARFSection &AccelSection, bool DWARFVerifier::handleAccelTables() { const DWARFObject &D = DCtx.getDWARFObj(); - DataExtractor StrData(D.getStringSection(), DCtx.isLittleEndian(), 0); + DataExtractor StrData(D.getStrSection(), DCtx.isLittleEndian(), 0); unsigned NumErrors = 0; if (!D.getAppleNamesSection().Data.empty()) NumErrors += verifyAppleAccelTable(&D.getAppleNamesSection(), &StrData, @@ -1470,8 +1476,8 @@ bool DWARFVerifier::handleAccelTables() { NumErrors += verifyAppleAccelTable(&D.getAppleObjCSection(), &StrData, ".apple_objc"); - if (!D.getDebugNamesSection().Data.empty()) - NumErrors += verifyDebugNames(D.getDebugNamesSection(), StrData); + if (!D.getNamesSection().Data.empty()) + NumErrors += verifyDebugNames(D.getNamesSection(), StrData); return NumErrors == 0; } diff --git a/lib/DebugInfo/GSYM/FileWriter.cpp b/lib/DebugInfo/GSYM/FileWriter.cpp new file mode 100644 index 000000000000..4b30dcb60a7b --- /dev/null +++ b/lib/DebugInfo/GSYM/FileWriter.cpp @@ -0,0 +1,78 @@ +//===- FileWriter.cpp -------------------------------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#include "llvm/DebugInfo/GSYM/FileWriter.h" +#include "llvm/Support/LEB128.h" +#include "llvm/Support/raw_ostream.h" +#include + +using namespace llvm; +using namespace gsym; + +FileWriter::~FileWriter() { OS.flush(); } + +void FileWriter::writeSLEB(int64_t S) { + uint8_t Bytes[32]; + auto Length = encodeSLEB128(S, Bytes); + assert(Length < sizeof(Bytes)); + OS.write(reinterpret_cast(Bytes), Length); +} + +void FileWriter::writeULEB(uint64_t U) { + uint8_t Bytes[32]; + auto Length = encodeULEB128(U, Bytes); + assert(Length < sizeof(Bytes)); + OS.write(reinterpret_cast(Bytes), Length); +} + +void FileWriter::writeU8(uint8_t U) { + OS.write(reinterpret_cast(&U), sizeof(U)); +} + +void FileWriter::writeU16(uint16_t U) { + const uint16_t Swapped = support::endian::byte_swap(U, ByteOrder); + OS.write(reinterpret_cast(&Swapped), sizeof(Swapped)); +} + +void FileWriter::writeU32(uint32_t U) { + const uint32_t Swapped = support::endian::byte_swap(U, ByteOrder); + OS.write(reinterpret_cast(&Swapped), sizeof(Swapped)); +} + +void FileWriter::writeU64(uint64_t U) { + const uint64_t Swapped = support::endian::byte_swap(U, ByteOrder); + OS.write(reinterpret_cast(&Swapped), sizeof(Swapped)); +} + +void FileWriter::fixup32(uint32_t U, uint64_t Offset) { + const uint32_t Swapped = support::endian::byte_swap(U, ByteOrder); + OS.pwrite(reinterpret_cast(&Swapped), sizeof(Swapped), + Offset); +} + +void FileWriter::writeData(llvm::ArrayRef Data) { + OS.write(reinterpret_cast(Data.data()), Data.size()); +} + +void FileWriter::writeNullTerminated(llvm::StringRef Str) { + OS << Str << '\0'; +} + +uint64_t FileWriter::tell() { + return OS.tell(); +} + +void FileWriter::alignTo(size_t Align) { + off_t Offset = OS.tell(); + off_t AlignedOffset = (Offset + Align - 1) / Align * Align; + if (AlignedOffset == Offset) + return; + off_t PadCount = AlignedOffset - Offset; + OS.write_zeros(PadCount); +} diff --git a/lib/DebugInfo/GSYM/FunctionInfo.cpp b/lib/DebugInfo/GSYM/FunctionInfo.cpp index 55c36a55b4be..ad022fec9e32 100644 --- a/lib/DebugInfo/GSYM/FunctionInfo.cpp +++ b/lib/DebugInfo/GSYM/FunctionInfo.cpp @@ -1,22 +1,147 @@ -//===- FunctionInfo.cpp -----------------------------------------*- C++ -*-===// +//===- FunctionInfo.cpp ---------------------------------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// #include "llvm/DebugInfo/GSYM/FunctionInfo.h" +#include "llvm/DebugInfo/GSYM/FileWriter.h" +#include "llvm/DebugInfo/GSYM/LineTable.h" +#include "llvm/DebugInfo/GSYM/InlineInfo.h" +#include "llvm/Support/DataExtractor.h" using namespace llvm; using namespace gsym; +/// FunctionInfo information type that is used to encode the optional data +/// that is associated with a FunctionInfo object. 
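// [Illustrative sketch, not part of the vendored patch.] FileWriter::alignTo()
// above rounds the stream position up with the usual integer trick and then
// pads with zero bytes; the arithmetic works for any positive alignment (GSYM
// uses 4-byte alignment throughout).
#include <cstdint>

constexpr uint64_t roundUp(uint64_t Offset, uint64_t Align) {
  return (Offset + Align - 1) / Align * Align;
}

static_assert(roundUp(0, 4) == 0, "");
static_assert(roundUp(1, 4) == 4, "");
static_assert(roundUp(4, 4) == 4, "");
static_assert(roundUp(7, 4) == 8, "");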
+enum InfoType : uint32_t { + EndOfList = 0u, + LineTableInfo = 1u, + InlineInfo = 2u +}; + raw_ostream &llvm::gsym::operator<<(raw_ostream &OS, const FunctionInfo &FI) { OS << '[' << HEX64(FI.Range.Start) << '-' << HEX64(FI.Range.End) << "): " - << "Name=" << HEX32(FI.Name) << '\n'; - for (const auto &Line : FI.Lines) - OS << Line << '\n'; - OS << FI.Inline; + << "Name=" << HEX32(FI.Name) << '\n' << FI.OptLineTable << FI.Inline; return OS; } + +llvm::Expected FunctionInfo::decode(DataExtractor &Data, + uint64_t BaseAddr) { + FunctionInfo FI; + FI.Range.Start = BaseAddr; + uint64_t Offset = 0; + if (!Data.isValidOffsetForDataOfSize(Offset, 4)) + return createStringError(std::errc::io_error, + "0x%8.8" PRIx64 ": missing FunctionInfo Size", Offset); + FI.Range.End = FI.Range.Start + Data.getU32(&Offset); + if (!Data.isValidOffsetForDataOfSize(Offset, 4)) + return createStringError(std::errc::io_error, + "0x%8.8" PRIx64 ": missing FunctionInfo Name", Offset); + FI.Name = Data.getU32(&Offset); + if (FI.Name == 0) + return createStringError(std::errc::io_error, + "0x%8.8" PRIx64 ": invalid FunctionInfo Name value 0x%8.8x", + Offset - 4, FI.Name); + bool Done = false; + while (!Done) { + if (!Data.isValidOffsetForDataOfSize(Offset, 4)) + return createStringError(std::errc::io_error, + "0x%8.8" PRIx64 ": missing FunctionInfo InfoType value", Offset); + const uint32_t IT = Data.getU32(&Offset); + if (!Data.isValidOffsetForDataOfSize(Offset, 4)) + return createStringError(std::errc::io_error, + "0x%8.8" PRIx64 ": missing FunctionInfo InfoType length", Offset); + const uint32_t InfoLength = Data.getU32(&Offset); + if (!Data.isValidOffsetForDataOfSize(Offset, InfoLength)) + return createStringError(std::errc::io_error, + "0x%8.8" PRIx64 ": missing FunctionInfo data for InfoType %u", + Offset, IT); + DataExtractor InfoData(Data.getData().substr(Offset, InfoLength), + Data.isLittleEndian(), + Data.getAddressSize()); + switch (IT) { + case InfoType::EndOfList: + Done = true; + break; + + case InfoType::LineTableInfo: + if (Expected LT = LineTable::decode(InfoData, BaseAddr)) + FI.OptLineTable = std::move(LT.get()); + else + return LT.takeError(); + break; + + case InfoType::InlineInfo: + if (Expected II = InlineInfo::decode(InfoData, BaseAddr)) + FI.Inline = std::move(II.get()); + else + return II.takeError(); + break; + + default: + return createStringError(std::errc::io_error, + "0x%8.8" PRIx64 ": unsupported InfoType %u", + Offset-8, IT); + } + Offset += InfoLength; + } + return std::move(FI); +} + +llvm::Expected FunctionInfo::encode(FileWriter &O) const { + if (!isValid()) + return createStringError(std::errc::invalid_argument, + "attempted to encode invalid FunctionInfo object"); + // Align FunctionInfo data to a 4 byte alignment. + O.alignTo(4); + const uint64_t FuncInfoOffset = O.tell(); + // Write the size in bytes of this function as a uint32_t. This can be zero + // if we just have a symbol from a symbol table and that symbol has no size. + O.writeU32(size()); + // Write the name of this function as a uint32_t string table offset. + O.writeU32(Name); + + if (OptLineTable.hasValue()) { + O.writeU32(InfoType::LineTableInfo); + // Write a uint32_t length as zero for now, we will fix this up after + // writing the LineTable out with the number of bytes that were written. 
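// [Illustrative sketch, not part of the vendored patch.] At this point in
// FunctionInfo::encode() each optional piece of data is framed as
// [u32 InfoType][u32 InfoLength][payload bytes]; the payload size is not known
// up front, so a zero length is written first and patched afterwards with
// FileWriter::fixup32(). writeChunk() and its payload callback are
// illustrative names; the real code also rejects payloads over UINT32_MAX.
#include "llvm/DebugInfo/GSYM/FileWriter.h"
#include <cstdint>
#include <functional>

static void
writeChunk(llvm::gsym::FileWriter &O, uint32_t Type,
           const std::function<void(llvm::gsym::FileWriter &)> &Payload) {
  O.writeU32(Type);           // InfoType
  O.writeU32(0);              // InfoLength placeholder
  const uint64_t Start = O.tell();
  Payload(O);                 // stream the chunk body
  const uint64_t Length = O.tell() - Start;
  O.fixup32(static_cast<uint32_t>(Length), Start - 4); // patch the placeholder
}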
+ O.writeU32(0); + const auto StartOffset = O.tell(); + llvm::Error err = OptLineTable->encode(O, Range.Start); + if (err) + return std::move(err); + const off_t Length = O.tell() - StartOffset; + if (Length > UINT32_MAX) + return createStringError(std::errc::invalid_argument, + "LineTable length is greater than UINT32_MAX"); + // Fixup the size of the LineTable data with the correct size. + O.fixup32(static_cast(Length), StartOffset - 4); + } + + // Write out the inline function info if we have any and if it is valid. + if (Inline.hasValue()) { + O.writeU32(InfoType::InlineInfo); + // Write a uint32_t length as zero for now, we will fix this up after + // writing the LineTable out with the number of bytes that were written. + O.writeU32(0); + const auto StartOffset = O.tell(); + llvm::Error err = Inline->encode(O, Range.Start); + if (err) + return std::move(err); + const off_t Length = O.tell() - StartOffset; + if (Length > UINT32_MAX) + return createStringError(std::errc::invalid_argument, + "InlineInfo length is greater than UINT32_MAX"); + // Fixup the size of the InlineInfo data with the correct size. + O.fixup32(static_cast(Length), StartOffset - 4); + } + + // Terminate the data chunks with and end of list with zero size + O.writeU32(InfoType::EndOfList); + O.writeU32(0); + return FuncInfoOffset; +} diff --git a/lib/DebugInfo/GSYM/GsymCreator.cpp b/lib/DebugInfo/GSYM/GsymCreator.cpp new file mode 100644 index 000000000000..f371426f2010 --- /dev/null +++ b/lib/DebugInfo/GSYM/GsymCreator.cpp @@ -0,0 +1,275 @@ +//===- GsymCreator.cpp ----------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +//===----------------------------------------------------------------------===// + +#include "llvm/DebugInfo/GSYM/GsymCreator.h" +#include "llvm/DebugInfo/GSYM/FileWriter.h" +#include "llvm/DebugInfo/GSYM/Header.h" +#include "llvm/DebugInfo/GSYM/LineTable.h" +#include "llvm/MC/StringTableBuilder.h" +#include "llvm/Support/raw_ostream.h" + +#include +#include +#include +#include + +using namespace llvm; +using namespace gsym; + + +GsymCreator::GsymCreator() : StrTab(StringTableBuilder::ELF) { + insertFile(StringRef()); +} + +uint32_t GsymCreator::insertFile(StringRef Path, + llvm::sys::path::Style Style) { + llvm::StringRef directory = llvm::sys::path::parent_path(Path, Style); + llvm::StringRef filename = llvm::sys::path::filename(Path, Style); + FileEntry FE(insertString(directory), insertString(filename)); + + std::lock_guard Guard(Mutex); + const auto NextIndex = Files.size(); + // Find FE in hash map and insert if not present. 
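// [Illustrative sketch, not part of the vendored patch.] The insert-or-reuse
// idiom at this point in GsymCreator::insertFile(): try to insert
// (entry -> next index); if the entry already exists, map insert() leaves the
// table alone and returns the existing slot, so duplicates collapse to one
// index. The same shape on plain standard containers:
#include <cstdint>
#include <string>
#include <unordered_map>
#include <vector>

static uint32_t internString(std::unordered_map<std::string, uint32_t> &Index,
                             std::vector<std::string> &Table,
                             const std::string &S) {
  auto R = Index.insert({S, static_cast<uint32_t>(Table.size())});
  if (R.second)           // newly inserted: append to the side table
    Table.push_back(S);
  return R.first->second; // index of the existing or freshly added entry
}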
+ auto R = FileEntryToIndex.insert(std::make_pair(FE, NextIndex)); + if (R.second) + Files.emplace_back(FE); + return R.first->second; +} + +llvm::Error GsymCreator::save(StringRef Path, + llvm::support::endianness ByteOrder) const { + std::error_code EC; + raw_fd_ostream OutStrm(Path, EC); + if (EC) + return llvm::errorCodeToError(EC); + FileWriter O(OutStrm, ByteOrder); + return encode(O); +} + +llvm::Error GsymCreator::encode(FileWriter &O) const { + std::lock_guard Guard(Mutex); + if (Funcs.empty()) + return createStringError(std::errc::invalid_argument, + "no functions to encode"); + if (!Finalized) + return createStringError(std::errc::invalid_argument, + "GsymCreator wasn't finalized prior to encoding"); + + if (Funcs.size() > UINT32_MAX) + return createStringError(std::errc::invalid_argument, + "too many FunctionInfos"); + const uint64_t MinAddr = Funcs.front().startAddress(); + const uint64_t MaxAddr = Funcs.back().startAddress(); + const uint64_t AddrDelta = MaxAddr - MinAddr; + Header Hdr; + Hdr.Magic = GSYM_MAGIC; + Hdr.Version = GSYM_VERSION; + Hdr.AddrOffSize = 0; + Hdr.UUIDSize = static_cast(UUID.size()); + Hdr.BaseAddress = MinAddr; + Hdr.NumAddresses = static_cast(Funcs.size()); + Hdr.StrtabOffset = 0; // We will fix this up later. + Hdr.StrtabOffset = 0; // We will fix this up later. + memset(Hdr.UUID, 0, sizeof(Hdr.UUID)); + if (UUID.size() > sizeof(Hdr.UUID)) + return createStringError(std::errc::invalid_argument, + "invalid UUID size %u", (uint32_t)UUID.size()); + // Set the address offset size correctly in the GSYM header. + if (AddrDelta <= UINT8_MAX) + Hdr.AddrOffSize = 1; + else if (AddrDelta <= UINT16_MAX) + Hdr.AddrOffSize = 2; + else if (AddrDelta <= UINT32_MAX) + Hdr.AddrOffSize = 4; + else + Hdr.AddrOffSize = 8; + // Copy the UUID value if we have one. + if (UUID.size() > 0) + memcpy(Hdr.UUID, UUID.data(), UUID.size()); + // Write out the header. + llvm::Error Err = Hdr.encode(O); + if (Err) + return Err; + + // Write out the address offsets. + O.alignTo(Hdr.AddrOffSize); + for (const auto &FuncInfo : Funcs) { + uint64_t AddrOffset = FuncInfo.startAddress() - Hdr.BaseAddress; + switch(Hdr.AddrOffSize) { + case 1: O.writeU8(static_cast(AddrOffset)); break; + case 2: O.writeU16(static_cast(AddrOffset)); break; + case 4: O.writeU32(static_cast(AddrOffset)); break; + case 8: O.writeU64(AddrOffset); break; + } + } + + // Write out all zeros for the AddrInfoOffsets. + O.alignTo(4); + const off_t AddrInfoOffsetsOffset = O.tell(); + for (size_t i = 0, n = Funcs.size(); i < n; ++i) + O.writeU32(0); + + // Write out the file table + O.alignTo(4); + assert(!Files.empty()); + assert(Files[0].Dir == 0); + assert(Files[0].Base == 0); + size_t NumFiles = Files.size(); + if (NumFiles > UINT32_MAX) + return createStringError(std::errc::invalid_argument, + "too many files"); + O.writeU32(static_cast(NumFiles)); + for (auto File: Files) { + O.writeU32(File.Dir); + O.writeU32(File.Base); + } + + // Write out the sting table. + const off_t StrtabOffset = O.tell(); + StrTab.write(O.get_stream()); + const off_t StrtabSize = O.tell() - StrtabOffset; + std::vector AddrInfoOffsets; + + // Write out the address infos for each function info. 
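+  // The layout written so far is: Header, the address offset table (one entry
+  // of AddrOffSize bytes per function, relative to Hdr.BaseAddress), the
+  // zero-filled AddrInfoOffsets table, the file table, and the string table.
+  // Each FunctionInfo is now encoded in order, and the offset it returns is
+  // remembered so the matching AddrInfoOffsets entry can be fixed up below.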
+ for (const auto &FuncInfo : Funcs) { + if (Expected OffsetOrErr = FuncInfo.encode(O)) + AddrInfoOffsets.push_back(OffsetOrErr.get()); + else + return OffsetOrErr.takeError(); + } + // Fixup the string table offset and size in the header + O.fixup32((uint32_t)StrtabOffset, offsetof(Header, StrtabOffset)); + O.fixup32((uint32_t)StrtabSize, offsetof(Header, StrtabSize)); + + // Fixup all address info offsets + uint64_t Offset = 0; + for (auto AddrInfoOffset: AddrInfoOffsets) { + O.fixup32(AddrInfoOffset, AddrInfoOffsetsOffset + Offset); + Offset += 4; + } + return ErrorSuccess(); +} + +llvm::Error GsymCreator::finalize(llvm::raw_ostream &OS) { + std::lock_guard Guard(Mutex); + if (Finalized) + return createStringError(std::errc::invalid_argument, + "already finalized"); + Finalized = true; + + // Sort function infos so we can emit sorted functions. + llvm::sort(Funcs.begin(), Funcs.end()); + + // Don't let the string table indexes change by finalizing in order. + StrTab.finalizeInOrder(); + + // Remove duplicates function infos that have both entries from debug info + // (DWARF or Breakpad) and entries from the SymbolTable. + // + // Also handle overlapping function. Usually there shouldn't be any, but they + // can and do happen in some rare cases. + // + // (a) (b) (c) + // ^ ^ ^ ^ + // |X |Y |X ^ |X + // | | | |Y | ^ + // | | | v v |Y + // v v v v + // + // In (a) and (b), Y is ignored and X will be reported for the full range. + // In (c), both functions will be included in the result and lookups for an + // address in the intersection will return Y because of binary search. + // + // Note that in case of (b), we cannot include Y in the result because then + // we wouldn't find any function for range (end of Y, end of X) + // with binary search + auto NumBefore = Funcs.size(); + auto Curr = Funcs.begin(); + auto Prev = Funcs.end(); + while (Curr != Funcs.end()) { + // Can't check for overlaps or same address ranges if we don't have a + // previous entry + if (Prev != Funcs.end()) { + if (Prev->Range.intersects(Curr->Range)) { + // Overlapping address ranges. + if (Prev->Range == Curr->Range) { + // Same address range. Check if one is from debug info and the other + // is from a symbol table. If so, then keep the one with debug info. + // Our sorting guarantees that entries with matching address ranges + // that have debug info are last in the sort. + if (*Prev == *Curr) { + // FunctionInfo entries match exactly (range, lines, inlines) + OS << "warning: duplicate function info entries, removing " + "duplicate:\n" + << *Curr << '\n'; + Curr = Funcs.erase(Prev); + } else { + if (!Prev->hasRichInfo() && Curr->hasRichInfo()) { + // Same address range, one with no debug info (symbol) and the + // next with debug info. Keep the latter. + Curr = Funcs.erase(Prev); + } else { + OS << "warning: same address range contains different debug " + << "info. 
Removing:\n" + << *Prev << "\nIn favor of this one:\n" + << *Curr << "\n"; + Curr = Funcs.erase(Prev); + } + } + } else { + // print warnings about overlaps + OS << "warning: function ranges overlap:\n" + << *Prev << "\n" + << *Curr << "\n"; + } + } else if (Prev->Range.size() == 0 && + Curr->Range.contains(Prev->Range.Start)) { + OS << "warning: removing symbol:\n" + << *Prev << "\nKeeping:\n" + << *Curr << "\n"; + Curr = Funcs.erase(Prev); + } + } + if (Curr == Funcs.end()) + break; + Prev = Curr++; + } + + OS << "Pruned " << NumBefore - Funcs.size() << " functions, ended with " + << Funcs.size() << " total\n"; + return Error::success(); +} + +uint32_t GsymCreator::insertString(StringRef S) { + std::lock_guard Guard(Mutex); + if (S.empty()) + return 0; + return StrTab.add(S); +} + +void GsymCreator::addFunctionInfo(FunctionInfo &&FI) { + std::lock_guard Guard(Mutex); + Funcs.emplace_back(FI); +} + +void GsymCreator::forEachFunctionInfo( + std::function const &Callback) { + std::lock_guard Guard(Mutex); + for (auto &FI : Funcs) { + if (!Callback(FI)) + break; + } +} + +void GsymCreator::forEachFunctionInfo( + std::function const &Callback) const { + std::lock_guard Guard(Mutex); + for (const auto &FI : Funcs) { + if (!Callback(FI)) + break; + } +} diff --git a/lib/DebugInfo/GSYM/GsymReader.cpp b/lib/DebugInfo/GSYM/GsymReader.cpp new file mode 100644 index 000000000000..1b448cf80b70 --- /dev/null +++ b/lib/DebugInfo/GSYM/GsymReader.cpp @@ -0,0 +1,265 @@ +//===- GsymReader.cpp -----------------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "llvm/DebugInfo/GSYM/GsymReader.h" + +#include +#include +#include +#include + +#include "llvm/DebugInfo/GSYM/GsymCreator.h" +#include "llvm/DebugInfo/GSYM/InlineInfo.h" +#include "llvm/DebugInfo/GSYM/LineTable.h" +#include "llvm/Support/BinaryStreamReader.h" +#include "llvm/Support/DataExtractor.h" +#include "llvm/Support/MemoryBuffer.h" + +using namespace llvm; +using namespace gsym; + +GsymReader::GsymReader(std::unique_ptr Buffer) : + MemBuffer(std::move(Buffer)), + Endian(support::endian::system_endianness()) {} + + GsymReader::GsymReader(GsymReader &&RHS) = default; + +GsymReader::~GsymReader() = default; + +llvm::Expected GsymReader::openFile(StringRef Filename) { + // Open the input file and return an appropriate error if needed. + ErrorOr> BuffOrErr = + MemoryBuffer::getFileOrSTDIN(Filename); + auto Err = BuffOrErr.getError(); + if (Err) + return llvm::errorCodeToError(Err); + return create(BuffOrErr.get()); +} + +llvm::Expected GsymReader::copyBuffer(StringRef Bytes) { + auto MemBuffer = MemoryBuffer::getMemBufferCopy(Bytes, "GSYM bytes"); + return create(MemBuffer); +} + +llvm::Expected +GsymReader::create(std::unique_ptr &MemBuffer) { + if (!MemBuffer.get()) + return createStringError(std::errc::invalid_argument, + "invalid memory buffer"); + GsymReader GR(std::move(MemBuffer)); + llvm::Error Err = GR.parse(); + if (Err) + return std::move(Err); + return std::move(GR); +} + +llvm::Error +GsymReader::parse() { + BinaryStreamReader FileData(MemBuffer->getBuffer(), + support::endian::system_endianness()); + // Check for the magic bytes. This file format is designed to be mmap'ed + // into a process and accessed as read only. 
This is done for performance + // and efficiency for symbolicating and parsing GSYM data. + if (FileData.readObject(Hdr)) + return createStringError(std::errc::invalid_argument, + "not enough data for a GSYM header"); + + const auto HostByteOrder = support::endian::system_endianness(); + switch (Hdr->Magic) { + case GSYM_MAGIC: + Endian = HostByteOrder; + break; + case GSYM_CIGAM: + // This is a GSYM file, but not native endianness. + Endian = sys::IsBigEndianHost ? support::little : support::big; + Swap.reset(new SwappedData); + break; + default: + return createStringError(std::errc::invalid_argument, + "not a GSYM file"); + } + + bool DataIsLittleEndian = HostByteOrder != support::little; + // Read a correctly byte swapped header if we need to. + if (Swap) { + DataExtractor Data(MemBuffer->getBuffer(), DataIsLittleEndian, 4); + if (auto ExpectedHdr = Header::decode(Data)) + Swap->Hdr = ExpectedHdr.get(); + else + return ExpectedHdr.takeError(); + Hdr = &Swap->Hdr; + } + + // Detect errors in the header and report any that are found. If we make it + // past this without errors, we know we have a good magic value, a supported + // version number, verified address offset size and a valid UUID size. + if (Error Err = Hdr->checkForError()) + return Err; + + if (!Swap) { + // This is the native endianness case that is most common and optimized for + // efficient lookups. Here we just grab pointers to the native data and + // use ArrayRef objects to allow efficient read only access. + + // Read the address offsets. + if (FileData.padToAlignment(Hdr->AddrOffSize) || + FileData.readArray(AddrOffsets, + Hdr->NumAddresses * Hdr->AddrOffSize)) + return createStringError(std::errc::invalid_argument, + "failed to read address table"); + + // Read the address info offsets. + if (FileData.padToAlignment(4) || + FileData.readArray(AddrInfoOffsets, Hdr->NumAddresses)) + return createStringError(std::errc::invalid_argument, + "failed to read address info offsets table"); + + // Read the file table. + uint32_t NumFiles = 0; + if (FileData.readInteger(NumFiles) || FileData.readArray(Files, NumFiles)) + return createStringError(std::errc::invalid_argument, + "failed to read file table"); + + // Get the string table. + FileData.setOffset(Hdr->StrtabOffset); + if (FileData.readFixedString(StrTab.Data, Hdr->StrtabSize)) + return createStringError(std::errc::invalid_argument, + "failed to read string table"); +} else { + // This is the non native endianness case that is not common and not + // optimized for lookups. Here we decode the important tables into local + // storage and then set the ArrayRef objects to point to these swapped + // copies of the read only data so lookups can be as efficient as possible. + DataExtractor Data(MemBuffer->getBuffer(), DataIsLittleEndian, 4); + + // Read the address offsets. 
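+    // In this byte-swapped path each table is decoded with the DataExtractor
+    // into vectors owned by Swap, and the ArrayRef members are then pointed
+    // at those copies, so code after parse() can use the same views in both
+    // the native and non-native endian cases.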
+ uint64_t Offset = alignTo(sizeof(Header), Hdr->AddrOffSize); + Swap->AddrOffsets.resize(Hdr->NumAddresses * Hdr->AddrOffSize); + switch (Hdr->AddrOffSize) { + case 1: + if (!Data.getU8(&Offset, Swap->AddrOffsets.data(), Hdr->NumAddresses)) + return createStringError(std::errc::invalid_argument, + "failed to read address table"); + break; + case 2: + if (!Data.getU16(&Offset, + reinterpret_cast(Swap->AddrOffsets.data()), + Hdr->NumAddresses)) + return createStringError(std::errc::invalid_argument, + "failed to read address table"); + break; + case 4: + if (!Data.getU32(&Offset, + reinterpret_cast(Swap->AddrOffsets.data()), + Hdr->NumAddresses)) + return createStringError(std::errc::invalid_argument, + "failed to read address table"); + break; + case 8: + if (!Data.getU64(&Offset, + reinterpret_cast(Swap->AddrOffsets.data()), + Hdr->NumAddresses)) + return createStringError(std::errc::invalid_argument, + "failed to read address table"); + } + AddrOffsets = ArrayRef(Swap->AddrOffsets); + + // Read the address info offsets. + Offset = alignTo(Offset, 4); + Swap->AddrInfoOffsets.resize(Hdr->NumAddresses); + if (Data.getU32(&Offset, Swap->AddrInfoOffsets.data(), Hdr->NumAddresses)) + AddrInfoOffsets = ArrayRef(Swap->AddrInfoOffsets); + else + return createStringError(std::errc::invalid_argument, + "failed to read address table"); + // Read the file table. + const uint32_t NumFiles = Data.getU32(&Offset); + if (NumFiles > 0) { + Swap->Files.resize(NumFiles); + if (Data.getU32(&Offset, &Swap->Files[0].Dir, NumFiles*2)) + Files = ArrayRef(Swap->Files); + else + return createStringError(std::errc::invalid_argument, + "failed to read file table"); + } + // Get the string table. + StrTab.Data = MemBuffer->getBuffer().substr(Hdr->StrtabOffset, + Hdr->StrtabSize); + if (StrTab.Data.empty()) + return createStringError(std::errc::invalid_argument, + "failed to read string table"); + } + return Error::success(); + +} + +const Header &GsymReader::getHeader() const { + // The only way to get a GsymReader is from GsymReader::openFile(...) or + // GsymReader::copyBuffer() and the header must be valid and initialized to + // a valid pointer value, so the assert below should not trigger. 
+ assert(Hdr); + return *Hdr; +} + +Optional GsymReader::getAddress(size_t Index) const { + switch (Hdr->AddrOffSize) { + case 1: return addressForIndex(Index); + case 2: return addressForIndex(Index); + case 4: return addressForIndex(Index); + case 8: return addressForIndex(Index); + } + return llvm::None; +} + +Optional GsymReader::getAddressInfoOffset(size_t Index) const { + const auto NumAddrInfoOffsets = AddrInfoOffsets.size(); + if (Index < NumAddrInfoOffsets) + return AddrInfoOffsets[Index]; + return llvm::None; +} + +Expected +GsymReader::getAddressIndex(const uint64_t Addr) const { + if (Addr < Hdr->BaseAddress) + return createStringError(std::errc::invalid_argument, + "address 0x%" PRIx64 " not in GSYM", Addr); + const uint64_t AddrOffset = Addr - Hdr->BaseAddress; + switch (Hdr->AddrOffSize) { + case 1: return getAddressOffsetIndex(AddrOffset); + case 2: return getAddressOffsetIndex(AddrOffset); + case 4: return getAddressOffsetIndex(AddrOffset); + case 8: return getAddressOffsetIndex(AddrOffset); + default: break; + } + return createStringError(std::errc::invalid_argument, + "unsupported address offset size %u", + Hdr->AddrOffSize); +} + +llvm::Expected GsymReader::getFunctionInfo(uint64_t Addr) const { + Expected AddressIndex = getAddressIndex(Addr); + if (!AddressIndex) + return AddressIndex.takeError(); + // Address info offsets size should have been checked in parse(). + assert(*AddressIndex < AddrInfoOffsets.size()); + auto AddrInfoOffset = AddrInfoOffsets[*AddressIndex]; + DataExtractor Data(MemBuffer->getBuffer().substr(AddrInfoOffset), Endian, 4); + if (Optional OptAddr = getAddress(*AddressIndex)) { + auto ExpectedFI = FunctionInfo::decode(Data, *OptAddr); + if (ExpectedFI) { + if (ExpectedFI->Range.contains(Addr) || ExpectedFI->Range.size() == 0) + return ExpectedFI; + return createStringError(std::errc::invalid_argument, + "address 0x%" PRIx64 " not in GSYM", Addr); + } + } + return createStringError(std::errc::invalid_argument, + "failed to extract address[%" PRIu64 "]", + *AddressIndex); +} diff --git a/lib/DebugInfo/GSYM/Header.cpp b/lib/DebugInfo/GSYM/Header.cpp new file mode 100644 index 000000000000..0b3fb9c49894 --- /dev/null +++ b/lib/DebugInfo/GSYM/Header.cpp @@ -0,0 +1,109 @@ +//===- Header.cpp -----------------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/DebugInfo/GSYM/Header.h" +#include "llvm/DebugInfo/GSYM/FileWriter.h" +#include "llvm/Support/DataExtractor.h" +#include "llvm/Support/Format.h" +#include "llvm/Support/raw_ostream.h" + +#define HEX8(v) llvm::format_hex(v, 4) +#define HEX16(v) llvm::format_hex(v, 6) +#define HEX32(v) llvm::format_hex(v, 10) +#define HEX64(v) llvm::format_hex(v, 18) + +using namespace llvm; +using namespace gsym; + +raw_ostream &llvm::gsym::operator<<(raw_ostream &OS, const Header &H) { + OS << "Header:\n"; + OS << " Magic = " << HEX32(H.Magic) << "\n"; + OS << " Version = " << HEX16(H.Version) << '\n'; + OS << " AddrOffSize = " << HEX8(H.AddrOffSize) << '\n'; + OS << " UUIDSize = " << HEX8(H.UUIDSize) << '\n'; + OS << " BaseAddress = " << HEX64(H.BaseAddress) << '\n'; + OS << " NumAddresses = " << HEX32(H.NumAddresses) << '\n'; + OS << " StrtabOffset = " << HEX32(H.StrtabOffset) << '\n'; + OS << " StrtabSize = " << HEX32(H.StrtabSize) << '\n'; + OS << " UUID = "; + for (uint8_t I = 0; I < H.UUIDSize; ++I) + OS << format_hex_no_prefix(H.UUID[I], 2); + OS << '\n'; + return OS; +} + +/// Check the header and detect any errors. +llvm::Error Header::checkForError() const { + if (Magic != GSYM_MAGIC) + return createStringError(std::errc::invalid_argument, + "invalid GSYM magic 0x%8.8x", Magic); + if (Version != GSYM_VERSION) + return createStringError(std::errc::invalid_argument, + "unsupported GSYM version %u", Version); + switch (AddrOffSize) { + case 1: break; + case 2: break; + case 4: break; + case 8: break; + default: + return createStringError(std::errc::invalid_argument, + "invalid address offset size %u", + AddrOffSize); + } + if (UUIDSize > GSYM_MAX_UUID_SIZE) + return createStringError(std::errc::invalid_argument, + "invalid UUID size %u", UUIDSize); + return Error::success(); +} + +llvm::Expected
Header::decode(DataExtractor &Data) { + uint64_t Offset = 0; + // The header is stored as a single blob of data that has a fixed byte size. + if (!Data.isValidOffsetForDataOfSize(Offset, sizeof(Header))) + return createStringError(std::errc::invalid_argument, + "not enough data for a gsym::Header"); + Header H; + H.Magic = Data.getU32(&Offset); + H.Version = Data.getU16(&Offset); + H.AddrOffSize = Data.getU8(&Offset); + H.UUIDSize = Data.getU8(&Offset); + H.BaseAddress = Data.getU64(&Offset); + H.NumAddresses = Data.getU32(&Offset); + H.StrtabOffset = Data.getU32(&Offset); + H.StrtabSize = Data.getU32(&Offset); + Data.getU8(&Offset, H.UUID, GSYM_MAX_UUID_SIZE); + if (llvm::Error Err = H.checkForError()) + return std::move(Err); + return H; +} + +llvm::Error Header::encode(FileWriter &O) const { + // Users must verify the Header is valid prior to calling this funtion. + if (llvm::Error Err = checkForError()) + return Err; + O.writeU32(Magic); + O.writeU16(Version); + O.writeU8(AddrOffSize); + O.writeU8(UUIDSize); + O.writeU64(BaseAddress); + O.writeU32(NumAddresses); + O.writeU32(StrtabOffset); + O.writeU32(StrtabSize); + O.writeData(llvm::ArrayRef(UUID)); + return Error::success(); +} + +bool llvm::gsym::operator==(const Header &LHS, const Header &RHS) { + return LHS.Magic == RHS.Magic && LHS.Version == RHS.Version && + LHS.AddrOffSize == RHS.AddrOffSize && LHS.UUIDSize == RHS.UUIDSize && + LHS.BaseAddress == RHS.BaseAddress && + LHS.NumAddresses == RHS.NumAddresses && + LHS.StrtabOffset == RHS.StrtabOffset && + LHS.StrtabSize == RHS.StrtabSize && + memcmp(LHS.UUID, RHS.UUID, LHS.UUIDSize) == 0; +} diff --git a/lib/DebugInfo/GSYM/InlineInfo.cpp b/lib/DebugInfo/GSYM/InlineInfo.cpp index 781c1755241d..32ed2c709575 100644 --- a/lib/DebugInfo/GSYM/InlineInfo.cpp +++ b/lib/DebugInfo/GSYM/InlineInfo.cpp @@ -8,7 +8,9 @@ //===----------------------------------------------------------------------===// #include "llvm/DebugInfo/GSYM/FileEntry.h" +#include "llvm/DebugInfo/GSYM/FileWriter.h" #include "llvm/DebugInfo/GSYM/InlineInfo.h" +#include "llvm/Support/DataExtractor.h" #include #include @@ -57,3 +59,101 @@ llvm::Optional InlineInfo::getInlineStack(uint64_t Addr return Result; return llvm::None; } + +/// Decode an InlineInfo in Data at the specified offset. +/// +/// A local helper function to decode InlineInfo objects. This function is +/// called recursively when parsing child InlineInfo objects. +/// +/// \param Data The data extractor to decode from. +/// \param Offset The offset within \a Data to decode from. +/// \param BaseAddr The base address to use when decoding address ranges. +/// \returns An InlineInfo or an error describing the issue that was +/// encountered during decoding. 
+static llvm::Expected decode(DataExtractor &Data, uint64_t &Offset, + uint64_t BaseAddr) { + InlineInfo Inline; + if (!Data.isValidOffset(Offset)) + return createStringError(std::errc::io_error, + "0x%8.8" PRIx64 ": missing InlineInfo address ranges data", Offset); + Inline.Ranges.decode(Data, BaseAddr, Offset); + if (Inline.Ranges.empty()) + return Inline; + if (!Data.isValidOffsetForDataOfSize(Offset, 1)) + return createStringError(std::errc::io_error, + "0x%8.8" PRIx64 ": missing InlineInfo uint8_t indicating children", + Offset); + bool HasChildren = Data.getU8(&Offset) != 0; + if (!Data.isValidOffsetForDataOfSize(Offset, 4)) + return createStringError(std::errc::io_error, + "0x%8.8" PRIx64 ": missing InlineInfo uint32_t for name", Offset); + Inline.Name = Data.getU32(&Offset); + if (!Data.isValidOffset(Offset)) + return createStringError(std::errc::io_error, + "0x%8.8" PRIx64 ": missing ULEB128 for InlineInfo call file", Offset); + Inline.CallFile = (uint32_t)Data.getULEB128(&Offset); + if (!Data.isValidOffset(Offset)) + return createStringError(std::errc::io_error, + "0x%8.8" PRIx64 ": missing ULEB128 for InlineInfo call line", Offset); + Inline.CallLine = (uint32_t)Data.getULEB128(&Offset); + if (HasChildren) { + // Child address ranges are encoded relative to the first address in the + // parent InlineInfo object. + const auto ChildBaseAddr = Inline.Ranges[0].Start; + while (true) { + llvm::Expected Child = decode(Data, Offset, ChildBaseAddr); + if (!Child) + return Child.takeError(); + // InlineInfo with empty Ranges termintes a child sibling chain. + if (Child.get().Ranges.empty()) + break; + Inline.Children.emplace_back(std::move(*Child)); + } + } + return Inline; +} + +llvm::Expected InlineInfo::decode(DataExtractor &Data, + uint64_t BaseAddr) { + uint64_t Offset = 0; + return ::decode(Data, Offset, BaseAddr); +} + +llvm::Error InlineInfo::encode(FileWriter &O, uint64_t BaseAddr) const { + // Users must verify the InlineInfo is valid prior to calling this funtion. + // We don't want to emit any InlineInfo objects if they are not valid since + // it will waste space in the GSYM file. + if (!isValid()) + return createStringError(std::errc::invalid_argument, + "attempted to encode invalid InlineInfo object"); + Ranges.encode(O, BaseAddr); + bool HasChildren = !Children.empty(); + O.writeU8(HasChildren); + O.writeU32(Name); + O.writeULEB(CallFile); + O.writeULEB(CallLine); + if (HasChildren) { + // Child address ranges are encoded as relative to the first + // address in the Ranges for this object. This keeps the offsets + // small and allows for efficient encoding using ULEB offsets. + const uint64_t ChildBaseAddr = Ranges[0].Start; + for (const auto &Child : Children) { + // Make sure all child address ranges are contained in the parent address + // ranges. + for (const auto &ChildRange: Child.Ranges) { + if (!Ranges.contains(ChildRange)) + return createStringError(std::errc::invalid_argument, + "child range not contained in parent"); + } + llvm::Error Err = Child.encode(O, ChildBaseAddr); + if (Err) + return Err; + } + + // Terminate child sibling chain by emitting a zero. This zero will cause + // the decodeAll() function above to return false and stop the decoding + // of child InlineInfo objects that are siblings. 
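+    // On the decode side the static decode() helper reads this zero as an
+    // empty AddressRanges list and returns an InlineInfo with no ranges,
+    // which is what ends the sibling loop in the parent's decode.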
+ O.writeULEB(0); + } + return Error::success(); +} diff --git a/lib/DebugInfo/GSYM/LineTable.cpp b/lib/DebugInfo/GSYM/LineTable.cpp new file mode 100644 index 000000000000..824c0041be9f --- /dev/null +++ b/lib/DebugInfo/GSYM/LineTable.cpp @@ -0,0 +1,287 @@ +//===- LineTable.cpp --------------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/DebugInfo/GSYM/LineTable.h" +#include "llvm/DebugInfo/GSYM/FileWriter.h" +#include "llvm/Support/DataExtractor.h" + +using namespace llvm; +using namespace gsym; + +enum LineTableOpCode { + EndSequence = 0x00, ///< End of the line table. + SetFile = 0x01, ///< Set LineTableRow.file_idx, don't push a row. + AdvancePC = 0x02, ///< Increment LineTableRow.address, and push a row. + AdvanceLine = 0x03, ///< Set LineTableRow.file_line, don't push a row. + FirstSpecial = 0x04, ///< All special opcodes push a row. +}; + +struct DeltaInfo { + int64_t Delta; + uint32_t Count; + DeltaInfo(int64_t D, uint32_t C) : Delta(D), Count(C) {} +}; + +inline bool operator<(const DeltaInfo &LHS, int64_t Delta) { + return LHS.Delta < Delta; +} + +static bool encodeSpecial(int64_t MinLineDelta, int64_t MaxLineDelta, + int64_t LineDelta, uint64_t AddrDelta, + uint8_t &SpecialOp) { + if (LineDelta < MinLineDelta) + return false; + if (LineDelta > MaxLineDelta) + return false; + int64_t LineRange = MaxLineDelta - MinLineDelta + 1; + int64_t AdjustedOp = ((LineDelta - MinLineDelta) + AddrDelta * LineRange); + int64_t Op = AdjustedOp + FirstSpecial; + if (Op < 0) + return false; + if (Op > 255) + return false; + SpecialOp = (uint8_t)Op; + return true; +} + +typedef std::function LineEntryCallback; + +static llvm::Error parse(DataExtractor &Data, uint64_t BaseAddr, + LineEntryCallback const &Callback) { + uint64_t Offset = 0; + if (!Data.isValidOffset(Offset)) + return createStringError(std::errc::io_error, + "0x%8.8" PRIx64 ": missing LineTable MinDelta", Offset); + int64_t MinDelta = Data.getSLEB128(&Offset); + if (!Data.isValidOffset(Offset)) + return createStringError(std::errc::io_error, + "0x%8.8" PRIx64 ": missing LineTable MaxDelta", Offset); + int64_t MaxDelta = Data.getSLEB128(&Offset); + int64_t LineRange = MaxDelta - MinDelta + 1; + if (!Data.isValidOffset(Offset)) + return createStringError(std::errc::io_error, + "0x%8.8" PRIx64 ": missing LineTable FirstLine", Offset); + const uint32_t FirstLine = (uint32_t)Data.getULEB128(&Offset); + LineEntry Row(BaseAddr, 1, FirstLine); + bool Done = false; + while (!Done) { + if (!Data.isValidOffset(Offset)) + return createStringError(std::errc::io_error, + "0x%8.8" PRIx64 ": EOF found before EndSequence", Offset); + uint8_t Op = Data.getU8(&Offset); + switch (Op) { + case EndSequence: + Done = true; + break; + case SetFile: + if (!Data.isValidOffset(Offset)) + return createStringError(std::errc::io_error, + "0x%8.8" PRIx64 ": EOF found before SetFile value", + Offset); + Row.File = (uint32_t)Data.getULEB128(&Offset); + break; + case AdvancePC: + if (!Data.isValidOffset(Offset)) + return createStringError(std::errc::io_error, + "0x%8.8" PRIx64 ": EOF found before AdvancePC value", + Offset); + Row.Addr += Data.getULEB128(&Offset); + // If the function callback returns false, we stop parsing. 
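+      // Note that AdvancePC (like the special opcodes handled below) pushes a
+      // row by invoking the callback, while SetFile and AdvanceLine only
+      // update the pending row state.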
+ if (Callback(Row) == false) + return Error::success(); + break; + case AdvanceLine: + if (!Data.isValidOffset(Offset)) + return createStringError(std::errc::io_error, + "0x%8.8" PRIx64 ": EOF found before AdvanceLine value", + Offset); + Row.Line += Data.getSLEB128(&Offset); + break; + default: { + // A byte that contains both address and line increment. + uint8_t AdjustedOp = Op - FirstSpecial; + int64_t LineDelta = MinDelta + (AdjustedOp % LineRange); + uint64_t AddrDelta = (AdjustedOp / LineRange); + Row.Line += LineDelta; + Row.Addr += AddrDelta; + // If the function callback returns false, we stop parsing. + if (Callback(Row) == false) + return Error::success(); + break; + } + } + } + return Error::success(); +} + +llvm::Error LineTable::encode(FileWriter &Out, uint64_t BaseAddr) const { + // Users must verify the LineTable is valid prior to calling this funtion. + // We don't want to emit any LineTable objects if they are not valid since + // it will waste space in the GSYM file. + if (!isValid()) + return createStringError(std::errc::invalid_argument, + "attempted to encode invalid LineTable object"); + + int64_t MinLineDelta = INT64_MAX; + int64_t MaxLineDelta = INT64_MIN; + std::vector DeltaInfos; + if (Lines.size() == 1) { + MinLineDelta = 0; + MaxLineDelta = 0; + } else { + int64_t PrevLine = 1; + bool First = true; + for (const auto &line_entry : Lines) { + if (First) + First = false; + else { + int64_t LineDelta = (int64_t)line_entry.Line - PrevLine; + auto End = DeltaInfos.end(); + auto Pos = std::lower_bound(DeltaInfos.begin(), End, LineDelta); + if (Pos != End && Pos->Delta == LineDelta) + ++Pos->Count; + else + DeltaInfos.insert(Pos, DeltaInfo(LineDelta, 1)); + if (LineDelta < MinLineDelta) + MinLineDelta = LineDelta; + if (LineDelta > MaxLineDelta) + MaxLineDelta = LineDelta; + } + PrevLine = (int64_t)line_entry.Line; + } + assert(MinLineDelta <= MaxLineDelta); + } + // Set the min and max line delta intelligently based on the counts of + // the line deltas. if our range is too large. + const int64_t MaxLineRange = 14; + if (MaxLineDelta - MinLineDelta > MaxLineRange) { + uint32_t BestIndex = 0; + uint32_t BestEndIndex = 0; + uint32_t BestCount = 0; + const size_t NumDeltaInfos = DeltaInfos.size(); + for (uint32_t I = 0; I < NumDeltaInfos; ++I) { + const int64_t FirstDelta = DeltaInfos[I].Delta; + uint32_t CurrCount = 0; + uint32_t J; + for (J = I; J < NumDeltaInfos; ++J) { + auto LineRange = DeltaInfos[J].Delta - FirstDelta; + if (LineRange > MaxLineRange) + break; + CurrCount += DeltaInfos[J].Count; + } + if (CurrCount > BestCount) { + BestIndex = I; + BestEndIndex = J - 1; + BestCount = CurrCount; + } + } + MinLineDelta = DeltaInfos[BestIndex].Delta; + MaxLineDelta = DeltaInfos[BestEndIndex].Delta; + } + if (MinLineDelta == MaxLineDelta && MinLineDelta > 0 && + MinLineDelta < MaxLineRange) + MinLineDelta = 0; + assert(MinLineDelta <= MaxLineDelta); + + // Initialize the line entry state as a starting point. All line entries + // will be deltas from this. + LineEntry Prev(BaseAddr, 1, Lines.front().Line); + + // Write out the min and max line delta as signed LEB128. + Out.writeSLEB(MinLineDelta); + Out.writeSLEB(MaxLineDelta); + // Write out the starting line number as a unsigned LEB128. 
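+  // The encoded table therefore starts with SLEB128(MinLineDelta),
+  // SLEB128(MaxLineDelta) and ULEB128(first line number). Each row after that
+  // is either a single special opcode byte, when
+  //   Op = (LineDelta - MinLineDelta) + AddrDelta * LineRange + FirstSpecial
+  // fits in a byte (see encodeSpecial above), or an explicit AdvanceLine /
+  // AdvancePC pair. For example, with MinLineDelta = -1 and MaxLineDelta = 12
+  // (so LineRange = 14), a row that advances 4 address units and 1 line is
+  // the single byte (1 + 1) + 4 * 14 + 4 = 62.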
+ Out.writeULEB(Prev.Line); + + for (const auto &Curr : Lines) { + if (Curr.Addr < BaseAddr) + return createStringError(std::errc::invalid_argument, + "LineEntry has address 0x%" PRIx64 " which is " + "less than the function start address 0x%" + PRIx64, Curr.Addr, BaseAddr); + if (Curr.Addr < Prev.Addr) + return createStringError(std::errc::invalid_argument, + "LineEntry in LineTable not in ascending order"); + const uint64_t AddrDelta = Curr.Addr - Prev.Addr; + int64_t LineDelta = 0; + if (Curr.Line > Prev.Line) + LineDelta = Curr.Line - Prev.Line; + else if (Prev.Line > Curr.Line) + LineDelta = -((int32_t)(Prev.Line - Curr.Line)); + + // Set the file if it doesn't match the current one. + if (Curr.File != Prev.File) { + Out.writeU8(SetFile); + Out.writeULEB(Curr.File); + } + + uint8_t SpecialOp; + if (encodeSpecial(MinLineDelta, MaxLineDelta, LineDelta, AddrDelta, + SpecialOp)) { + // Advance the PC and line and push a row. + Out.writeU8(SpecialOp); + } else { + // We can't encode the address delta and line delta into + // a single special opcode, we must do them separately. + + // Advance the line. + if (LineDelta != 0) { + Out.writeU8(AdvanceLine); + Out.writeSLEB(LineDelta); + } + + // Advance the PC and push a row. + Out.writeU8(AdvancePC); + Out.writeULEB(AddrDelta); + } + Prev = Curr; + } + Out.writeU8(EndSequence); + return Error::success(); +} + +// Parse all line table entries into the "LineTable" vector. We can +// cache the results of this if needed, or we can call LineTable::lookup() +// below. +llvm::Expected LineTable::decode(DataExtractor &Data, + uint64_t BaseAddr) { + LineTable LT; + llvm::Error Err = parse(Data, BaseAddr, [&](const LineEntry &Row) -> bool { + LT.Lines.push_back(Row); + return true; // Keep parsing by returning true. + }); + if (Err) + return std::move(Err); + return LT; +} +// Parse the line table on the fly and find the row we are looking for. +// We will need to determine if we need to cache the line table by calling +// LineTable::parseAllEntries(...) or just call this function each time. +// There is a CPU vs memory tradeoff we will need to determine. +LineEntry LineTable::lookup(DataExtractor &Data, uint64_t BaseAddr, uint64_t Addr) { + LineEntry Result; + llvm::Error Err = parse(Data, BaseAddr, + [Addr, &Result](const LineEntry &Row) -> bool { + if (Addr < Row.Addr) + return false; // Stop parsing, result contains the line table row! + Result = Row; + if (Addr == Row.Addr) { + // Stop parsing, this is the row we are looking for since the address + // matches. + return false; + } + return true; // Keep parsing till we find the right row. 
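+        // Note: Result is only updated for rows with Row.Addr <= Addr; once
+        // Addr < Row.Addr we stop, so Result holds the last row at or below
+        // Addr, which is the row that describes it.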
+ }); + return Result; +} + +raw_ostream &llvm::gsym::operator<<(raw_ostream &OS, const LineTable <) { + for (const auto &LineEntry : LT) + OS << LineEntry << '\n'; + return OS; +} diff --git a/lib/DebugInfo/GSYM/Range.cpp b/lib/DebugInfo/GSYM/Range.cpp index ca61984dacbd..19ab700fdd57 100644 --- a/lib/DebugInfo/GSYM/Range.cpp +++ b/lib/DebugInfo/GSYM/Range.cpp @@ -8,6 +8,8 @@ //===----------------------------------------------------------------------===// #include "llvm/DebugInfo/GSYM/Range.h" +#include "llvm/DebugInfo/GSYM/FileWriter.h" +#include "llvm/Support/DataExtractor.h" #include #include @@ -40,6 +42,17 @@ bool AddressRanges::contains(uint64_t Addr) const { return It != Ranges.begin() && Addr < It[-1].End; } +bool AddressRanges::contains(AddressRange Range) const { + if (Range.size() == 0) + return false; + auto It = std::partition_point( + Ranges.begin(), Ranges.end(), + [=](const AddressRange &R) { return R.Start <= Range.Start; }); + if (It == Ranges.begin()) + return false; + return Range.End <= It[-1].End; +} + raw_ostream &llvm::gsym::operator<<(raw_ostream &OS, const AddressRange &R) { return OS << '[' << HEX64(R.Start) << " - " << HEX64(R.End) << ")"; } @@ -53,3 +66,37 @@ raw_ostream &llvm::gsym::operator<<(raw_ostream &OS, const AddressRanges &AR) { } return OS; } + +void AddressRange::encode(FileWriter &O, uint64_t BaseAddr) const { + assert(Start >= BaseAddr); + O.writeULEB(Start - BaseAddr); + O.writeULEB(size()); +} + +void AddressRange::decode(DataExtractor &Data, uint64_t BaseAddr, + uint64_t &Offset) { + const uint64_t AddrOffset = Data.getULEB128(&Offset); + const uint64_t Size = Data.getULEB128(&Offset); + const uint64_t StartAddr = BaseAddr + AddrOffset; + Start = StartAddr; + End = StartAddr + Size; +} + +void AddressRanges::encode(FileWriter &O, uint64_t BaseAddr) const { + O.writeULEB(Ranges.size()); + if (Ranges.empty()) + return; + for (auto Range : Ranges) + Range.encode(O, BaseAddr); +} + +void AddressRanges::decode(DataExtractor &Data, uint64_t BaseAddr, + uint64_t &Offset) { + clear(); + uint64_t NumRanges = Data.getULEB128(&Offset); + if (NumRanges == 0) + return; + Ranges.resize(NumRanges); + for (auto &Range : Ranges) + Range.decode(Data, BaseAddr, Offset); +} diff --git a/lib/DebugInfo/MSF/MappedBlockStream.cpp b/lib/DebugInfo/MSF/MappedBlockStream.cpp index df925771f0d9..5dc9c86b34fd 100644 --- a/lib/DebugInfo/MSF/MappedBlockStream.cpp +++ b/lib/DebugInfo/MSF/MappedBlockStream.cpp @@ -52,7 +52,7 @@ MappedBlockStream::MappedBlockStream(uint32_t BlockSize, std::unique_ptr MappedBlockStream::createStream( uint32_t BlockSize, const MSFStreamLayout &Layout, BinaryStreamRef MsfData, BumpPtrAllocator &Allocator) { - return llvm::make_unique>( + return std::make_unique>( BlockSize, Layout, MsfData, Allocator); } @@ -63,7 +63,7 @@ std::unique_ptr MappedBlockStream::createIndexedStream( MSFStreamLayout SL; SL.Blocks = Layout.StreamMap[StreamIndex]; SL.Length = Layout.StreamSizes[StreamIndex]; - return llvm::make_unique>( + return std::make_unique>( Layout.SB->BlockSize, SL, MsfData, Allocator); } @@ -318,7 +318,7 @@ WritableMappedBlockStream::createStream(uint32_t BlockSize, const MSFStreamLayout &Layout, WritableBinaryStreamRef MsfData, BumpPtrAllocator &Allocator) { - return llvm::make_unique>( + return std::make_unique>( BlockSize, Layout, MsfData, Allocator); } diff --git a/lib/DebugInfo/PDB/DIA/DIARawSymbol.cpp b/lib/DebugInfo/PDB/DIA/DIARawSymbol.cpp index a8ae076e1d6c..c2552f55703c 100644 --- a/lib/DebugInfo/PDB/DIA/DIARawSymbol.cpp +++ 
b/lib/DebugInfo/PDB/DIA/DIARawSymbol.cpp @@ -405,7 +405,7 @@ DIARawSymbol::findChildren(PDB_SymType Type) const { return nullptr; } - return llvm::make_unique(Session, DiaEnumerator); + return std::make_unique(Session, DiaEnumerator); } std::unique_ptr @@ -423,7 +423,7 @@ DIARawSymbol::findChildren(PDB_SymType Type, StringRef Name, Symbol->findChildrenEx(EnumVal, Name16Str, CompareFlags, &DiaEnumerator)) return nullptr; - return llvm::make_unique(Session, DiaEnumerator); + return std::make_unique(Session, DiaEnumerator); } std::unique_ptr @@ -443,7 +443,7 @@ DIARawSymbol::findChildrenByAddr(PDB_SymType Type, StringRef Name, Section, Offset, &DiaEnumerator)) return nullptr; - return llvm::make_unique(Session, DiaEnumerator); + return std::make_unique(Session, DiaEnumerator); } std::unique_ptr @@ -462,7 +462,7 @@ DIARawSymbol::findChildrenByVA(PDB_SymType Type, StringRef Name, &DiaEnumerator)) return nullptr; - return llvm::make_unique(Session, DiaEnumerator); + return std::make_unique(Session, DiaEnumerator); } std::unique_ptr @@ -480,7 +480,7 @@ DIARawSymbol::findChildrenByRVA(PDB_SymType Type, StringRef Name, &DiaEnumerator)) return nullptr; - return llvm::make_unique(Session, DiaEnumerator); + return std::make_unique(Session, DiaEnumerator); } std::unique_ptr @@ -489,7 +489,7 @@ DIARawSymbol::findInlineFramesByAddr(uint32_t Section, uint32_t Offset) const { if (S_OK != Symbol->findInlineFramesByAddr(Section, Offset, &DiaEnumerator)) return nullptr; - return llvm::make_unique(Session, DiaEnumerator); + return std::make_unique(Session, DiaEnumerator); } std::unique_ptr @@ -498,7 +498,7 @@ DIARawSymbol::findInlineFramesByRVA(uint32_t RVA) const { if (S_OK != Symbol->findInlineFramesByRVA(RVA, &DiaEnumerator)) return nullptr; - return llvm::make_unique(Session, DiaEnumerator); + return std::make_unique(Session, DiaEnumerator); } std::unique_ptr @@ -507,7 +507,7 @@ DIARawSymbol::findInlineFramesByVA(uint64_t VA) const { if (S_OK != Symbol->findInlineFramesByVA(VA, &DiaEnumerator)) return nullptr; - return llvm::make_unique(Session, DiaEnumerator); + return std::make_unique(Session, DiaEnumerator); } std::unique_ptr DIARawSymbol::findInlineeLines() const { @@ -515,7 +515,7 @@ std::unique_ptr DIARawSymbol::findInlineeLines() const { if (S_OK != Symbol->findInlineeLines(&DiaEnumerator)) return nullptr; - return llvm::make_unique(DiaEnumerator); + return std::make_unique(DiaEnumerator); } std::unique_ptr @@ -526,7 +526,7 @@ DIARawSymbol::findInlineeLinesByAddr(uint32_t Section, uint32_t Offset, Symbol->findInlineeLinesByAddr(Section, Offset, Length, &DiaEnumerator)) return nullptr; - return llvm::make_unique(DiaEnumerator); + return std::make_unique(DiaEnumerator); } std::unique_ptr @@ -535,7 +535,7 @@ DIARawSymbol::findInlineeLinesByRVA(uint32_t RVA, uint32_t Length) const { if (S_OK != Symbol->findInlineeLinesByRVA(RVA, Length, &DiaEnumerator)) return nullptr; - return llvm::make_unique(DiaEnumerator); + return std::make_unique(DiaEnumerator); } std::unique_ptr @@ -544,7 +544,7 @@ DIARawSymbol::findInlineeLinesByVA(uint64_t VA, uint32_t Length) const { if (S_OK != Symbol->findInlineeLinesByVA(VA, Length, &DiaEnumerator)) return nullptr; - return llvm::make_unique(DiaEnumerator); + return std::make_unique(DiaEnumerator); } void DIARawSymbol::getDataBytes(llvm::SmallVector &bytes) const { @@ -776,7 +776,7 @@ std::unique_ptr DIARawSymbol::getSrcLineOnTypeDefn() const { if (FAILED(Symbol->getSrcLineOnTypeDefn(&LineNumber)) || !LineNumber) return nullptr; - return llvm::make_unique(LineNumber); + 
return std::make_unique(LineNumber); } uint32_t DIARawSymbol::getStride() const { @@ -871,7 +871,7 @@ DIARawSymbol::getVirtualBaseTableType() const { if (FAILED(Symbol->get_virtualBaseTableType(&TableType)) || !TableType) return nullptr; - auto RawVT = llvm::make_unique(Session, TableType); + auto RawVT = std::make_unique(Session, TableType); auto Pointer = PDBSymbol::createAs(Session, std::move(RawVT)); return unique_dyn_cast(Pointer->getPointeeType()); diff --git a/lib/DebugInfo/PDB/DIA/DIASectionContrib.cpp b/lib/DebugInfo/PDB/DIA/DIASectionContrib.cpp index e2d928f2c4b2..4f0e078e6712 100644 --- a/lib/DebugInfo/PDB/DIA/DIASectionContrib.cpp +++ b/lib/DebugInfo/PDB/DIA/DIASectionContrib.cpp @@ -23,7 +23,7 @@ std::unique_ptr DIASectionContrib::getCompiland() const { if (FAILED(Section->get_compiland(&Symbol))) return nullptr; - auto RawSymbol = llvm::make_unique(Session, Symbol); + auto RawSymbol = std::make_unique(Session, Symbol); return PDBSymbol::createAs(Session, std::move(RawSymbol)); } diff --git a/lib/DebugInfo/PDB/DIA/DIASession.cpp b/lib/DebugInfo/PDB/DIA/DIASession.cpp index 4e0b8587c613..64ffa776bbd6 100644 --- a/lib/DebugInfo/PDB/DIA/DIASession.cpp +++ b/lib/DebugInfo/PDB/DIA/DIASession.cpp @@ -73,15 +73,7 @@ static Error LoadDIA(CComPtr &DiaDataSource) { #if !defined(_MSC_VER) return llvm::make_error(pdb_error_code::dia_failed_loading); #else - const wchar_t *msdia_dll = nullptr; -#if _MSC_VER >= 1900 && _MSC_VER < 2000 - msdia_dll = L"msdia140.dll"; // VS2015 -#elif _MSC_VER >= 1800 - msdia_dll = L"msdia120.dll"; // VS2013 -#else -#error "Unknown Visual Studio version." -#endif - + const wchar_t *msdia_dll = L"msdia140.dll"; HRESULT HR; if (FAILED(HR = NoRegCoCreate(msdia_dll, CLSID_DiaSource, IID_IDiaDataSource, reinterpret_cast(&DiaDataSource)))) @@ -158,7 +150,7 @@ std::unique_ptr DIASession::getGlobalScope() { if (S_OK != Session->get_globalScope(&GlobalScope)) return nullptr; - auto RawSymbol = llvm::make_unique(*this, GlobalScope); + auto RawSymbol = std::make_unique(*this, GlobalScope); auto PdbSymbol(PDBSymbol::create(*this, std::move(RawSymbol))); std::unique_ptr ExeSymbol( static_cast(PdbSymbol.release())); @@ -193,7 +185,7 @@ DIASession::getSymbolById(SymIndexId SymbolId) const { if (S_OK != Session->symbolById(SymbolId, &LocatedSymbol)) return nullptr; - auto RawSymbol = llvm::make_unique(*this, LocatedSymbol); + auto RawSymbol = std::make_unique(*this, LocatedSymbol); return PDBSymbol::create(*this, std::move(RawSymbol)); } @@ -210,7 +202,7 @@ DIASession::findSymbolByAddress(uint64_t Address, PDB_SymType Type) const { if (S_OK != Session->findSymbolByRVA(RVA, EnumVal, &Symbol)) return nullptr; } - auto RawSymbol = llvm::make_unique(*this, Symbol); + auto RawSymbol = std::make_unique(*this, Symbol); return PDBSymbol::create(*this, std::move(RawSymbol)); } @@ -222,7 +214,7 @@ std::unique_ptr DIASession::findSymbolByRVA(uint32_t RVA, if (S_OK != Session->findSymbolByRVA(RVA, EnumVal, &Symbol)) return nullptr; - auto RawSymbol = llvm::make_unique(*this, Symbol); + auto RawSymbol = std::make_unique(*this, Symbol); return PDBSymbol::create(*this, std::move(RawSymbol)); } @@ -235,7 +227,7 @@ DIASession::findSymbolBySectOffset(uint32_t Sect, uint32_t Offset, if (S_OK != Session->findSymbolByAddr(Sect, Offset, EnumVal, &Symbol)) return nullptr; - auto RawSymbol = llvm::make_unique(*this, Symbol); + auto RawSymbol = std::make_unique(*this, Symbol); return PDBSymbol::create(*this, std::move(RawSymbol)); } @@ -251,7 +243,7 @@ DIASession::findLineNumbers(const 
PDBSymbolCompiland &Compiland, RawFile.getDiaFile(), &LineNumbers)) return nullptr; - return llvm::make_unique(LineNumbers); + return std::make_unique(LineNumbers); } std::unique_ptr @@ -265,7 +257,7 @@ DIASession::findLineNumbersByAddress(uint64_t Address, uint32_t Length) const { if (S_OK != Session->findLinesByRVA(RVA, Length, &LineNumbers)) return nullptr; } - return llvm::make_unique(LineNumbers); + return std::make_unique(LineNumbers); } std::unique_ptr @@ -274,7 +266,7 @@ DIASession::findLineNumbersByRVA(uint32_t RVA, uint32_t Length) const { if (S_OK != Session->findLinesByRVA(RVA, Length, &LineNumbers)) return nullptr; - return llvm::make_unique(LineNumbers); + return std::make_unique(LineNumbers); } std::unique_ptr @@ -284,7 +276,7 @@ DIASession::findLineNumbersBySectOffset(uint32_t Section, uint32_t Offset, if (S_OK != Session->findLinesByAddr(Section, Offset, Length, &LineNumbers)) return nullptr; - return llvm::make_unique(LineNumbers); + return std::make_unique(LineNumbers); } std::unique_ptr @@ -306,7 +298,7 @@ DIASession::findSourceFiles(const PDBSymbolCompiland *Compiland, if (S_OK != Session->findFile(DiaCompiland, Utf16Pattern.m_str, Flags, &SourceFiles)) return nullptr; - return llvm::make_unique(*this, SourceFiles); + return std::make_unique(*this, SourceFiles); } std::unique_ptr @@ -342,7 +334,7 @@ std::unique_ptr DIASession::getAllSourceFiles() const { if (S_OK != Session->findFile(nullptr, nullptr, nsNone, &Files)) return nullptr; - return llvm::make_unique(*this, Files); + return std::make_unique(*this, Files); } std::unique_ptr DIASession::getSourceFilesForCompiland( @@ -355,7 +347,7 @@ std::unique_ptr DIASession::getSourceFilesForCompiland( Session->findFile(RawSymbol.getDiaSymbol(), nullptr, nsNone, &Files)) return nullptr; - return llvm::make_unique(*this, Files); + return std::make_unique(*this, Files); } std::unique_ptr @@ -364,7 +356,7 @@ DIASession::getSourceFileById(uint32_t FileId) const { if (S_OK != Session->findFileById(FileId, &LocatedFile)) return nullptr; - return llvm::make_unique(*this, LocatedFile); + return std::make_unique(*this, LocatedFile); } std::unique_ptr DIASession::getDebugStreams() const { @@ -372,7 +364,7 @@ std::unique_ptr DIASession::getDebugStreams() const { if (S_OK != Session->getEnumDebugStreams(&DiaEnumerator)) return nullptr; - return llvm::make_unique(DiaEnumerator); + return std::make_unique(DiaEnumerator); } std::unique_ptr DIASession::getEnumTables() const { @@ -380,7 +372,7 @@ std::unique_ptr DIASession::getEnumTables() const { if (S_OK != Session->getEnumTables(&DiaEnumerator)) return nullptr; - return llvm::make_unique(DiaEnumerator); + return std::make_unique(DiaEnumerator); } template static CComPtr getTableEnumerator(IDiaSession &Session) { @@ -407,7 +399,7 @@ DIASession::getInjectedSources() const { if (!Files) return nullptr; - return llvm::make_unique(Files); + return std::make_unique(Files); } std::unique_ptr @@ -417,7 +409,7 @@ DIASession::getSectionContribs() const { if (!Sections) return nullptr; - return llvm::make_unique(*this, Sections); + return std::make_unique(*this, Sections); } std::unique_ptr @@ -427,5 +419,5 @@ DIASession::getFrameData() const { if (!FD) return nullptr; - return llvm::make_unique(FD); + return std::make_unique(FD); } diff --git a/lib/DebugInfo/PDB/GenericError.cpp b/lib/DebugInfo/PDB/GenericError.cpp index 70dc094c42ec..0e4cba3174b2 100644 --- a/lib/DebugInfo/PDB/GenericError.cpp +++ b/lib/DebugInfo/PDB/GenericError.cpp @@ -34,8 +34,8 @@ public: return "The PDB file path is an 
invalid UTF8 sequence."; case pdb_error_code::signature_out_of_date: return "The signature does not match; the file(s) might be out of date."; - case pdb_error_code::external_cmdline_ref: - return "The path to this file must be provided on the command-line."; + case pdb_error_code::no_matching_pch: + return "No matching precompiled header could be located."; } llvm_unreachable("Unrecognized generic_error_code"); } diff --git a/lib/DebugInfo/PDB/Native/DbiModuleDescriptorBuilder.cpp b/lib/DebugInfo/PDB/Native/DbiModuleDescriptorBuilder.cpp index 20b6c6142547..419734771ccd 100644 --- a/lib/DebugInfo/PDB/Native/DbiModuleDescriptorBuilder.cpp +++ b/lib/DebugInfo/PDB/Native/DbiModuleDescriptorBuilder.cpp @@ -180,12 +180,12 @@ Error DbiModuleDescriptorBuilder::commit(BinaryStreamWriter &ModiWriter, void DbiModuleDescriptorBuilder::addDebugSubsection( std::shared_ptr Subsection) { assert(Subsection); - C13Builders.push_back(llvm::make_unique( + C13Builders.push_back(std::make_unique( std::move(Subsection), CodeViewContainer::Pdb)); } void DbiModuleDescriptorBuilder::addDebugSubsection( const DebugSubsectionRecord &SubsectionContents) { - C13Builders.push_back(llvm::make_unique( + C13Builders.push_back(std::make_unique( SubsectionContents, CodeViewContainer::Pdb)); } diff --git a/lib/DebugInfo/PDB/Native/DbiStreamBuilder.cpp b/lib/DebugInfo/PDB/Native/DbiStreamBuilder.cpp index b7ade0072ee5..0e00c2f7ff98 100644 --- a/lib/DebugInfo/PDB/Native/DbiStreamBuilder.cpp +++ b/lib/DebugInfo/PDB/Native/DbiStreamBuilder.cpp @@ -114,7 +114,7 @@ Expected DbiStreamBuilder::addModuleInfo(StringRef ModuleName) { uint32_t Index = ModiList.size(); ModiList.push_back( - llvm::make_unique(ModuleName, Index, Msf)); + std::make_unique(ModuleName, Index, Msf)); return *ModiList.back(); } diff --git a/lib/DebugInfo/PDB/Native/GSIStreamBuilder.cpp b/lib/DebugInfo/PDB/Native/GSIStreamBuilder.cpp index 8ed5b8b44c59..432f1e9b24d3 100644 --- a/lib/DebugInfo/PDB/Native/GSIStreamBuilder.cpp +++ b/lib/DebugInfo/PDB/Native/GSIStreamBuilder.cpp @@ -183,8 +183,8 @@ void GSIHashStreamBuilder::finalizeBuckets(uint32_t RecordZeroOffset) { } GSIStreamBuilder::GSIStreamBuilder(msf::MSFBuilder &Msf) - : Msf(Msf), PSH(llvm::make_unique()), - GSH(llvm::make_unique()) {} + : Msf(Msf), PSH(std::make_unique()), + GSH(std::make_unique()) {} GSIStreamBuilder::~GSIStreamBuilder() {} diff --git a/lib/DebugInfo/PDB/Native/Hash.cpp b/lib/DebugInfo/PDB/Native/Hash.cpp index b5c139ecbec0..7fb6b4bd5d31 100644 --- a/lib/DebugInfo/PDB/Native/Hash.cpp +++ b/lib/DebugInfo/PDB/Native/Hash.cpp @@ -8,8 +8,8 @@ #include "llvm/DebugInfo/PDB/Native/Hash.h" #include "llvm/ADT/ArrayRef.h" +#include "llvm/Support/CRC.h" #include "llvm/Support/Endian.h" -#include "llvm/Support/JamCRC.h" #include using namespace llvm; @@ -79,7 +79,6 @@ uint32_t pdb::hashStringV2(StringRef Str) { // Corresponds to `SigForPbCb` in langapi/shared/crc32.h. 
uint32_t pdb::hashBufferV8(ArrayRef Buf) { JamCRC JC(/*Init=*/0U); - JC.update(makeArrayRef(reinterpret_cast(Buf.data()), - Buf.size())); + JC.update(Buf); return JC.getCRC(); } diff --git a/lib/DebugInfo/PDB/Native/NativeEnumInjectedSources.cpp b/lib/DebugInfo/PDB/Native/NativeEnumInjectedSources.cpp index f17ff5bb01f2..2f6a5bc3d574 100644 --- a/lib/DebugInfo/PDB/Native/NativeEnumInjectedSources.cpp +++ b/lib/DebugInfo/PDB/Native/NativeEnumInjectedSources.cpp @@ -46,30 +46,31 @@ public: uint64_t getCodeByteSize() const override { return Entry.FileSize; } std::string getFileName() const override { - auto Name = Strings.getStringForID(Entry.FileNI); - assert(Name && "InjectedSourceStream should have rejected this"); - return *Name; + StringRef Ret = cantFail(Strings.getStringForID(Entry.FileNI), + "InjectedSourceStream should have rejected this"); + return Ret; } std::string getObjectFileName() const override { - auto ObjName = Strings.getStringForID(Entry.ObjNI); - assert(ObjName && "InjectedSourceStream should have rejected this"); - return *ObjName; + StringRef Ret = cantFail(Strings.getStringForID(Entry.ObjNI), + "InjectedSourceStream should have rejected this"); + return Ret; } std::string getVirtualFileName() const override { - auto VName = Strings.getStringForID(Entry.VFileNI); - assert(VName && "InjectedSourceStream should have rejected this"); - return *VName; + StringRef Ret = cantFail(Strings.getStringForID(Entry.VFileNI), + "InjectedSourceStream should have rejected this"); + return Ret; } uint32_t getCompression() const override { return Entry.Compression; } std::string getCode() const override { // Get name of stream storing the data. - auto VName = Strings.getStringForID(Entry.VFileNI); - assert(VName && "InjectedSourceStream should have rejected this"); - std::string StreamName = ("/src/files/" + *VName).str(); + StringRef VName = + cantFail(Strings.getStringForID(Entry.VFileNI), + "InjectedSourceStream should have rejected this"); + std::string StreamName = ("/src/files/" + VName).str(); // Find stream with that name and read its data. 
// FIXME: Consider validating (or even loading) all this in @@ -104,14 +105,14 @@ std::unique_ptr NativeEnumInjectedSources::getChildAtIndex(uint32_t N) const { if (N >= getChildCount()) return nullptr; - return make_unique(std::next(Stream.begin(), N)->second, + return std::make_unique(std::next(Stream.begin(), N)->second, File, Strings); } std::unique_ptr NativeEnumInjectedSources::getNext() { if (Cur == Stream.end()) return nullptr; - return make_unique((Cur++)->second, File, Strings); + return std::make_unique((Cur++)->second, File, Strings); } void NativeEnumInjectedSources::reset() { Cur = Stream.begin(); } diff --git a/lib/DebugInfo/PDB/Native/NativeRawSymbol.cpp b/lib/DebugInfo/PDB/Native/NativeRawSymbol.cpp index 8e43cf24495a..2ad552470b61 100644 --- a/lib/DebugInfo/PDB/Native/NativeRawSymbol.cpp +++ b/lib/DebugInfo/PDB/Native/NativeRawSymbol.cpp @@ -30,68 +30,68 @@ void NativeRawSymbol::dump(raw_ostream &OS, int Indent, std::unique_ptr NativeRawSymbol::findChildren(PDB_SymType Type) const { - return llvm::make_unique>(); + return std::make_unique>(); } std::unique_ptr NativeRawSymbol::findChildren(PDB_SymType Type, StringRef Name, PDB_NameSearchFlags Flags) const { - return llvm::make_unique>(); + return std::make_unique>(); } std::unique_ptr NativeRawSymbol::findChildrenByAddr(PDB_SymType Type, StringRef Name, PDB_NameSearchFlags Flags, uint32_t Section, uint32_t Offset) const { - return llvm::make_unique>(); + return std::make_unique>(); } std::unique_ptr NativeRawSymbol::findChildrenByVA(PDB_SymType Type, StringRef Name, PDB_NameSearchFlags Flags, uint64_t VA) const { - return llvm::make_unique>(); + return std::make_unique>(); } std::unique_ptr NativeRawSymbol::findChildrenByRVA(PDB_SymType Type, StringRef Name, PDB_NameSearchFlags Flags, uint32_t RVA) const { - return llvm::make_unique>(); + return std::make_unique>(); } std::unique_ptr NativeRawSymbol::findInlineFramesByAddr(uint32_t Section, uint32_t Offset) const { - return llvm::make_unique>(); + return std::make_unique>(); } std::unique_ptr NativeRawSymbol::findInlineFramesByRVA(uint32_t RVA) const { - return llvm::make_unique>(); + return std::make_unique>(); } std::unique_ptr NativeRawSymbol::findInlineFramesByVA(uint64_t VA) const { - return llvm::make_unique>(); + return std::make_unique>(); } std::unique_ptr NativeRawSymbol::findInlineeLines() const { - return llvm::make_unique>(); + return std::make_unique>(); } std::unique_ptr NativeRawSymbol::findInlineeLinesByAddr(uint32_t Section, uint32_t Offset, uint32_t Length) const { - return llvm::make_unique>(); + return std::make_unique>(); } std::unique_ptr NativeRawSymbol::findInlineeLinesByRVA(uint32_t RVA, uint32_t Length) const { - return llvm::make_unique>(); + return std::make_unique>(); } std::unique_ptr NativeRawSymbol::findInlineeLinesByVA(uint64_t VA, uint32_t Length) const { - return llvm::make_unique>(); + return std::make_unique>(); } void NativeRawSymbol::getDataBytes(SmallVector &bytes) const { diff --git a/lib/DebugInfo/PDB/Native/NativeSession.cpp b/lib/DebugInfo/PDB/Native/NativeSession.cpp index 8a49cb1c5963..b45a5881dcb5 100644 --- a/lib/DebugInfo/PDB/Native/NativeSession.cpp +++ b/lib/DebugInfo/PDB/Native/NativeSession.cpp @@ -59,18 +59,18 @@ NativeSession::~NativeSession() = default; Error NativeSession::createFromPdb(std::unique_ptr Buffer, std::unique_ptr &Session) { StringRef Path = Buffer->getBufferIdentifier(); - auto Stream = llvm::make_unique( + auto Stream = std::make_unique( std::move(Buffer), llvm::support::little); - auto Allocator = 
llvm::make_unique(); - auto File = llvm::make_unique(Path, std::move(Stream), *Allocator); + auto Allocator = std::make_unique(); + auto File = std::make_unique(Path, std::move(Stream), *Allocator); if (auto EC = File->parseFileHeaders()) return EC; if (auto EC = File->parseStreamData()) return EC; Session = - llvm::make_unique(std::move(File), std::move(Allocator)); + std::make_unique(std::move(File), std::move(Allocator)); return Error::success(); } @@ -202,7 +202,7 @@ NativeSession::getInjectedSources() const { consumeError(Strings.takeError()); return nullptr; } - return make_unique(*Pdb, *ISS, *Strings); + return std::make_unique(*Pdb, *ISS, *Strings); } std::unique_ptr diff --git a/lib/DebugInfo/PDB/Native/NativeTypeEnum.cpp b/lib/DebugInfo/PDB/Native/NativeTypeEnum.cpp index 9f5e86281a23..26ccb7daece0 100644 --- a/lib/DebugInfo/PDB/Native/NativeTypeEnum.cpp +++ b/lib/DebugInfo/PDB/Native/NativeTypeEnum.cpp @@ -163,14 +163,14 @@ void NativeTypeEnum::dump(raw_ostream &OS, int Indent, std::unique_ptr NativeTypeEnum::findChildren(PDB_SymType Type) const { if (Type != PDB_SymType::Data) - return llvm::make_unique>(); + return std::make_unique>(); const NativeTypeEnum *ClassParent = nullptr; if (!Modifiers) ClassParent = this; else ClassParent = UnmodifiedType; - return llvm::make_unique(Session, *ClassParent); + return std::make_unique(Session, *ClassParent); } PDB_SymType NativeTypeEnum::getSymTag() const { return PDB_SymType::Enum; } diff --git a/lib/DebugInfo/PDB/Native/NativeTypeFunctionSig.cpp b/lib/DebugInfo/PDB/Native/NativeTypeFunctionSig.cpp index 405303469c18..f98a4c3043eb 100644 --- a/lib/DebugInfo/PDB/Native/NativeTypeFunctionSig.cpp +++ b/lib/DebugInfo/PDB/Native/NativeTypeFunctionSig.cpp @@ -65,7 +65,7 @@ private: std::unique_ptr wrap(std::unique_ptr S) const { if (!S) return nullptr; - auto NTFA = llvm::make_unique(Session, std::move(S)); + auto NTFA = std::make_unique(Session, std::move(S)); return PDBSymbol::create(Session, std::move(NTFA)); } NativeSession &Session; @@ -133,9 +133,9 @@ void NativeTypeFunctionSig::dump(raw_ostream &OS, int Indent, std::unique_ptr NativeTypeFunctionSig::findChildren(PDB_SymType Type) const { if (Type != PDB_SymType::FunctionArg) - return llvm::make_unique>(); + return std::make_unique>(); - auto NET = llvm::make_unique(Session, + auto NET = std::make_unique(Session, /* copy */ ArgList.ArgIndices); return std::unique_ptr( new NativeEnumFunctionArgs(Session, std::move(NET))); diff --git a/lib/DebugInfo/PDB/Native/PDBFile.cpp b/lib/DebugInfo/PDB/Native/PDBFile.cpp index 983031dfcb78..9ac226b89139 100644 --- a/lib/DebugInfo/PDB/Native/PDBFile.cpp +++ b/lib/DebugInfo/PDB/Native/PDBFile.cpp @@ -264,7 +264,7 @@ Expected PDBFile::getPDBGlobalsStream() { safelyCreateIndexedStream(DbiS->getGlobalSymbolStreamIndex()); if (!GlobalS) return GlobalS.takeError(); - auto TempGlobals = llvm::make_unique(std::move(*GlobalS)); + auto TempGlobals = std::make_unique(std::move(*GlobalS)); if (auto EC = TempGlobals->reload()) return std::move(EC); Globals = std::move(TempGlobals); @@ -277,7 +277,7 @@ Expected PDBFile::getPDBInfoStream() { auto InfoS = safelyCreateIndexedStream(StreamPDB); if (!InfoS) return InfoS.takeError(); - auto TempInfo = llvm::make_unique(std::move(*InfoS)); + auto TempInfo = std::make_unique(std::move(*InfoS)); if (auto EC = TempInfo->reload()) return std::move(EC); Info = std::move(TempInfo); @@ -290,7 +290,7 @@ Expected PDBFile::getPDBDbiStream() { auto DbiS = safelyCreateIndexedStream(StreamDBI); if (!DbiS) return DbiS.takeError(); - 
auto TempDbi = llvm::make_unique(std::move(*DbiS)); + auto TempDbi = std::make_unique(std::move(*DbiS)); if (auto EC = TempDbi->reload(this)) return std::move(EC); Dbi = std::move(TempDbi); @@ -303,7 +303,7 @@ Expected PDBFile::getPDBTpiStream() { auto TpiS = safelyCreateIndexedStream(StreamTPI); if (!TpiS) return TpiS.takeError(); - auto TempTpi = llvm::make_unique(*this, std::move(*TpiS)); + auto TempTpi = std::make_unique(*this, std::move(*TpiS)); if (auto EC = TempTpi->reload()) return std::move(EC); Tpi = std::move(TempTpi); @@ -319,7 +319,7 @@ Expected PDBFile::getPDBIpiStream() { auto IpiS = safelyCreateIndexedStream(StreamIPI); if (!IpiS) return IpiS.takeError(); - auto TempIpi = llvm::make_unique(*this, std::move(*IpiS)); + auto TempIpi = std::make_unique(*this, std::move(*IpiS)); if (auto EC = TempIpi->reload()) return std::move(EC); Ipi = std::move(TempIpi); @@ -337,7 +337,7 @@ Expected PDBFile::getPDBPublicsStream() { safelyCreateIndexedStream(DbiS->getPublicSymbolStreamIndex()); if (!PublicS) return PublicS.takeError(); - auto TempPublics = llvm::make_unique(std::move(*PublicS)); + auto TempPublics = std::make_unique(std::move(*PublicS)); if (auto EC = TempPublics->reload()) return std::move(EC); Publics = std::move(TempPublics); @@ -356,7 +356,7 @@ Expected PDBFile::getPDBSymbolStream() { if (!SymbolS) return SymbolS.takeError(); - auto TempSymbols = llvm::make_unique(std::move(*SymbolS)); + auto TempSymbols = std::make_unique(std::move(*SymbolS)); if (auto EC = TempSymbols->reload()) return std::move(EC); Symbols = std::move(TempSymbols); @@ -370,7 +370,7 @@ Expected PDBFile::getStringTable() { if (!NS) return NS.takeError(); - auto N = llvm::make_unique(); + auto N = std::make_unique(); BinaryStreamReader Reader(**NS); if (auto EC = N->reload(Reader)) return std::move(EC); @@ -391,7 +391,7 @@ Expected PDBFile::getInjectedSourceStream() { if (!Strings) return Strings.takeError(); - auto IJ = llvm::make_unique(std::move(*IJS)); + auto IJ = std::make_unique(std::move(*IJS)); if (auto EC = IJ->reload(*Strings)) return std::move(EC); InjectedSources = std::move(IJ); diff --git a/lib/DebugInfo/PDB/Native/PDBFileBuilder.cpp b/lib/DebugInfo/PDB/Native/PDBFileBuilder.cpp index 8f5a048ea4b5..aa3288724390 100644 --- a/lib/DebugInfo/PDB/Native/PDBFileBuilder.cpp +++ b/lib/DebugInfo/PDB/Native/PDBFileBuilder.cpp @@ -22,7 +22,7 @@ #include "llvm/DebugInfo/PDB/Native/TpiStreamBuilder.h" #include "llvm/Support/BinaryStream.h" #include "llvm/Support/BinaryStreamWriter.h" -#include "llvm/Support/JamCRC.h" +#include "llvm/Support/CRC.h" #include "llvm/Support/Path.h" #include "llvm/Support/xxhash.h" @@ -42,7 +42,7 @@ Error PDBFileBuilder::initialize(uint32_t BlockSize) { auto ExpectedMsf = MSFBuilder::create(Allocator, BlockSize); if (!ExpectedMsf) return ExpectedMsf.takeError(); - Msf = llvm::make_unique(std::move(*ExpectedMsf)); + Msf = std::make_unique(std::move(*ExpectedMsf)); return Error::success(); } @@ -50,25 +50,25 @@ MSFBuilder &PDBFileBuilder::getMsfBuilder() { return *Msf; } InfoStreamBuilder &PDBFileBuilder::getInfoBuilder() { if (!Info) - Info = llvm::make_unique(*Msf, NamedStreams); + Info = std::make_unique(*Msf, NamedStreams); return *Info; } DbiStreamBuilder &PDBFileBuilder::getDbiBuilder() { if (!Dbi) - Dbi = llvm::make_unique(*Msf); + Dbi = std::make_unique(*Msf); return *Dbi; } TpiStreamBuilder &PDBFileBuilder::getTpiBuilder() { if (!Tpi) - Tpi = llvm::make_unique(*Msf, StreamTPI); + Tpi = std::make_unique(*Msf, StreamTPI); return *Tpi; } TpiStreamBuilder 
&PDBFileBuilder::getIpiBuilder() { if (!Ipi) - Ipi = llvm::make_unique(*Msf, StreamIPI); + Ipi = std::make_unique(*Msf, StreamIPI); return *Ipi; } @@ -78,7 +78,7 @@ PDBStringTableBuilder &PDBFileBuilder::getStringTableBuilder() { GSIStreamBuilder &PDBFileBuilder::getGsiBuilder() { if (!Gsi) - Gsi = llvm::make_unique(*Msf); + Gsi = std::make_unique(*Msf); return *Gsi; } @@ -174,8 +174,7 @@ Error PDBFileBuilder::finalizeMsfLayout() { if (!InjectedSources.empty()) { for (const auto &IS : InjectedSources) { JamCRC CRC(0); - CRC.update(makeArrayRef(IS.Content->getBufferStart(), - IS.Content->getBufferSize())); + CRC.update(arrayRefFromStringRef(IS.Content->getBuffer())); SrcHeaderBlockEntry Entry; ::memset(&Entry, 0, sizeof(SrcHeaderBlockEntry)); diff --git a/lib/DebugInfo/PDB/Native/TpiHashing.cpp b/lib/DebugInfo/PDB/Native/TpiHashing.cpp index b21b82bf76fd..b71b2b158144 100644 --- a/lib/DebugInfo/PDB/Native/TpiHashing.cpp +++ b/lib/DebugInfo/PDB/Native/TpiHashing.cpp @@ -10,7 +10,7 @@ #include "llvm/DebugInfo/CodeView/TypeDeserializer.h" #include "llvm/DebugInfo/PDB/Native/Hash.h" -#include "llvm/Support/JamCRC.h" +#include "llvm/Support/CRC.h" using namespace llvm; using namespace llvm::codeview; @@ -124,8 +124,6 @@ Expected llvm::pdb::hashTypeRecord(const CVType &Rec) { // Run CRC32 over the bytes. This corresponds to `hashBufv8`. JamCRC JC(/*Init=*/0U); - ArrayRef Bytes(reinterpret_cast(Rec.data().data()), - Rec.data().size()); - JC.update(Bytes); + JC.update(Rec.data()); return JC.getCRC(); } diff --git a/lib/DebugInfo/PDB/Native/TpiStream.cpp b/lib/DebugInfo/PDB/Native/TpiStream.cpp index 8ee7f897b8bb..ac19db03fab2 100644 --- a/lib/DebugInfo/PDB/Native/TpiStream.cpp +++ b/lib/DebugInfo/PDB/Native/TpiStream.cpp @@ -112,7 +112,7 @@ Error TpiStream::reload() { HashStream = std::move(*HS); } - Types = llvm::make_unique( + Types = std::make_unique( TypeRecords, getNumTypeRecords(), getTypeIndexOffsets()); return Error::success(); } diff --git a/lib/DebugInfo/PDB/Native/TpiStreamBuilder.cpp b/lib/DebugInfo/PDB/Native/TpiStreamBuilder.cpp index 6b308453c2de..4f10f8524a9b 100644 --- a/lib/DebugInfo/PDB/Native/TpiStreamBuilder.cpp +++ b/lib/DebugInfo/PDB/Native/TpiStreamBuilder.cpp @@ -135,7 +135,7 @@ Error TpiStreamBuilder::finalizeMsfLayout() { reinterpret_cast(HashBuffer.data()), calculateHashBufferSize()); HashValueStream = - llvm::make_unique(Bytes, llvm::support::little); + std::make_unique(Bytes, llvm::support::little); } return Error::success(); } diff --git a/lib/DebugInfo/PDB/PDBSymbolFunc.cpp b/lib/DebugInfo/PDB/PDBSymbolFunc.cpp index 7c3ba981fd6b..cb0329bc0ed7 100644 --- a/lib/DebugInfo/PDB/PDBSymbolFunc.cpp +++ b/lib/DebugInfo/PDB/PDBSymbolFunc.cpp @@ -79,7 +79,7 @@ private: std::unique_ptr> PDBSymbolFunc::getArguments() const { - return llvm::make_unique(Session, *this); + return std::make_unique(Session, *this); } void PDBSymbolFunc::dump(PDBSymDumper &Dumper) const { Dumper.dump(*this); } diff --git a/lib/DebugInfo/PDB/PDBSymbolTypeFunctionSig.cpp b/lib/DebugInfo/PDB/PDBSymbolTypeFunctionSig.cpp index 292320a6fe6d..1373615522eb 100644 --- a/lib/DebugInfo/PDB/PDBSymbolTypeFunctionSig.cpp +++ b/lib/DebugInfo/PDB/PDBSymbolTypeFunctionSig.cpp @@ -63,7 +63,7 @@ private: std::unique_ptr PDBSymbolTypeFunctionSig::getArguments() const { - return llvm::make_unique(Session, *this); + return std::make_unique(Session, *this); } void PDBSymbolTypeFunctionSig::dump(PDBSymDumper &Dumper) const { diff --git a/lib/DebugInfo/PDB/UDTLayout.cpp b/lib/DebugInfo/PDB/UDTLayout.cpp index 
acb1599480b0..a8e1d0a619ca 100644 --- a/lib/DebugInfo/PDB/UDTLayout.cpp +++ b/lib/DebugInfo/PDB/UDTLayout.cpp @@ -71,7 +71,7 @@ DataMemberLayoutItem::DataMemberLayoutItem( DataMember(std::move(Member)) { auto Type = DataMember->getType(); if (auto UDT = unique_dyn_cast(Type)) { - UdtLayout = llvm::make_unique(std::move(UDT)); + UdtLayout = std::make_unique(std::move(UDT)); UsedBytes = UdtLayout->usedBytes(); } } @@ -84,7 +84,7 @@ VBPtrLayoutItem::VBPtrLayoutItem(const UDTLayoutBase &Parent, } const PDBSymbolData &DataMemberLayoutItem::getDataMember() { - return *dyn_cast(Symbol); + return *cast(Symbol); } bool DataMemberLayoutItem::hasUDTLayout() const { return UdtLayout != nullptr; } @@ -205,7 +205,7 @@ void UDTLayoutBase::initializeChildren(const PDBSymbol &Sym) { for (auto &Base : Bases) { uint32_t Offset = Base->getOffset(); // Non-virtual bases never get elided. - auto BL = llvm::make_unique(*this, Offset, false, + auto BL = std::make_unique(*this, Offset, false, std::move(Base)); AllBases.push_back(BL.get()); @@ -216,7 +216,7 @@ void UDTLayoutBase::initializeChildren(const PDBSymbol &Sym) { assert(VTables.size() <= 1); if (!VTables.empty()) { auto VTLayout = - llvm::make_unique(*this, std::move(VTables[0])); + std::make_unique(*this, std::move(VTables[0])); VTable = VTLayout.get(); @@ -224,7 +224,7 @@ void UDTLayoutBase::initializeChildren(const PDBSymbol &Sym) { } for (auto &Data : Members) { - auto DM = llvm::make_unique(*this, std::move(Data)); + auto DM = std::make_unique(*this, std::move(Data)); addChildToLayout(std::move(DM)); } @@ -236,7 +236,7 @@ void UDTLayoutBase::initializeChildren(const PDBSymbol &Sym) { int VBPO = VB->getVirtualBasePointerOffset(); if (!hasVBPtrAtOffset(VBPO)) { if (auto VBP = VB->getRawSymbol().getVirtualBaseTableType()) { - auto VBPL = llvm::make_unique(*this, std::move(VBP), + auto VBPL = std::make_unique(*this, std::move(VBP), VBPO, VBP->getLength()); VBPtr = VBPL.get(); addChildToLayout(std::move(VBPL)); @@ -250,7 +250,7 @@ void UDTLayoutBase::initializeChildren(const PDBSymbol &Sym) { uint32_t Offset = UsedBytes.find_last() + 1; bool Elide = (Parent != nullptr); auto BL = - llvm::make_unique(*this, Offset, Elide, std::move(VB)); + std::make_unique(*this, Offset, Elide, std::move(VB)); AllBases.push_back(BL.get()); // Only lay this virtual base out directly inside of *this* class if this diff --git a/lib/DebugInfo/Symbolize/DIPrinter.cpp b/lib/DebugInfo/Symbolize/DIPrinter.cpp index b2bfef251485..b1a80cbc4580 100644 --- a/lib/DebugInfo/Symbolize/DIPrinter.cpp +++ b/lib/DebugInfo/Symbolize/DIPrinter.cpp @@ -30,11 +30,6 @@ namespace llvm { namespace symbolize { -// By default, DILineInfo contains "" for function/filename it -// cannot fetch. We replace it to "??" to make our output closer to addr2line. -static const char kDILineInfoBadString[] = ""; -static const char kBadString[] = "??"; - // Prints source code around in the FileName the Line. void DIPrinter::printContext(const std::string &FileName, int64_t Line) { if (PrintSourceContext <= 0) @@ -68,16 +63,16 @@ void DIPrinter::printContext(const std::string &FileName, int64_t Line) { void DIPrinter::print(const DILineInfo &Info, bool Inlined) { if (PrintFunctionNames) { std::string FunctionName = Info.FunctionName; - if (FunctionName == kDILineInfoBadString) - FunctionName = kBadString; + if (FunctionName == DILineInfo::BadString) + FunctionName = DILineInfo::Addr2LineBadString; StringRef Delimiter = PrintPretty ? " at " : "\n"; StringRef Prefix = (PrintPretty && Inlined) ? 
" (inlined by) " : ""; OS << Prefix << FunctionName << Delimiter; } std::string Filename = Info.FileName; - if (Filename == kDILineInfoBadString) - Filename = kBadString; + if (Filename == DILineInfo::BadString) + Filename = DILineInfo::Addr2LineBadString; else if (Basenames) Filename = llvm::sys::path::filename(Filename); if (!Verbose) { @@ -115,8 +110,8 @@ DIPrinter &DIPrinter::operator<<(const DIInliningInfo &Info) { DIPrinter &DIPrinter::operator<<(const DIGlobal &Global) { std::string Name = Global.Name; - if (Name == kDILineInfoBadString) - Name = kBadString; + if (Name == DILineInfo::BadString) + Name = DILineInfo::Addr2LineBadString; OS << Name << "\n"; OS << Global.Start << " " << Global.Size << "\n"; return *this; diff --git a/lib/DebugInfo/Symbolize/SymbolizableObjectFile.cpp b/lib/DebugInfo/Symbolize/SymbolizableObjectFile.cpp index 2765bf44d504..b4d49d9ff958 100644 --- a/lib/DebugInfo/Symbolize/SymbolizableObjectFile.cpp +++ b/lib/DebugInfo/Symbolize/SymbolizableObjectFile.cpp @@ -43,20 +43,22 @@ getDILineInfoSpecifier(FunctionNameKind FNKind) { ErrorOr> SymbolizableObjectFile::create(const object::ObjectFile *Obj, - std::unique_ptr DICtx) { + std::unique_ptr DICtx, + bool UntagAddresses) { assert(DICtx); std::unique_ptr res( - new SymbolizableObjectFile(Obj, std::move(DICtx))); + new SymbolizableObjectFile(Obj, std::move(DICtx), UntagAddresses)); std::unique_ptr OpdExtractor; uint64_t OpdAddress = 0; // Find the .opd (function descriptor) section if any, for big-endian // PowerPC64 ELF. if (Obj->getArch() == Triple::ppc64) { for (section_iterator Section : Obj->sections()) { - StringRef Name; - if (auto EC = Section->getName(Name)) - return EC; - if (Name == ".opd") { + Expected NameOrErr = Section->getName(); + if (!NameOrErr) + return errorToErrorCode(NameOrErr.takeError()); + + if (*NameOrErr == ".opd") { Expected E = Section->getContents(); if (!E) return errorToErrorCode(E.takeError()); @@ -103,8 +105,10 @@ SymbolizableObjectFile::create(const object::ObjectFile *Obj, } SymbolizableObjectFile::SymbolizableObjectFile(const ObjectFile *Obj, - std::unique_ptr DICtx) - : Module(Obj), DebugInfoContext(std::move(DICtx)) {} + std::unique_ptr DICtx, + bool UntagAddresses) + : Module(Obj), DebugInfoContext(std::move(DICtx)), + UntagAddresses(UntagAddresses) {} namespace { @@ -172,6 +176,12 @@ std::error_code SymbolizableObjectFile::addSymbol(const SymbolRef &Symbol, if (!SymbolAddressOrErr) return errorToErrorCode(SymbolAddressOrErr.takeError()); uint64_t SymbolAddress = *SymbolAddressOrErr; + if (UntagAddresses) { + // For kernel addresses, bits 56-63 need to be set, so we sign extend bit 55 + // into bits 56-63 instead of masking them out. + SymbolAddress &= (1ull << 56) - 1; + SymbolAddress = (int64_t(SymbolAddress) << 8) >> 8; + } if (OpdExtractor) { // For big-endian PowerPC64 ELF, symbols in the .opd section refer to // function descriptors. The first word of the descriptor is a pointer to @@ -179,10 +189,8 @@ std::error_code SymbolizableObjectFile::addSymbol(const SymbolRef &Symbol, // For the purposes of symbolization, pretend the symbol's address is that // of the function's code, not the descriptor. 
uint64_t OpdOffset = SymbolAddress - OpdAddress; - uint32_t OpdOffset32 = OpdOffset; - if (OpdOffset == OpdOffset32 && - OpdExtractor->isValidOffsetForAddress(OpdOffset32)) - SymbolAddress = OpdExtractor->getAddress(&OpdOffset32); + if (OpdExtractor->isValidOffsetForAddress(OpdOffset)) + SymbolAddress = OpdExtractor->getAddress(&OpdOffset); } Expected SymbolNameOrErr = Symbol.getName(); if (!SymbolNameOrErr) diff --git a/lib/DebugInfo/Symbolize/SymbolizableObjectFile.h b/lib/DebugInfo/Symbolize/SymbolizableObjectFile.h index 9cab94178c1b..b5b9793a44d9 100644 --- a/lib/DebugInfo/Symbolize/SymbolizableObjectFile.h +++ b/lib/DebugInfo/Symbolize/SymbolizableObjectFile.h @@ -31,7 +31,8 @@ namespace symbolize { class SymbolizableObjectFile : public SymbolizableModule { public: static ErrorOr> - create(const object::ObjectFile *Obj, std::unique_ptr DICtx); + create(const object::ObjectFile *Obj, std::unique_ptr DICtx, + bool UntagAddresses); DILineInfo symbolizeCode(object::SectionedAddress ModuleOffset, FunctionNameKind FNKind, @@ -70,6 +71,7 @@ private: const object::ObjectFile *Module; std::unique_ptr DebugInfoContext; + bool UntagAddresses; struct SymbolDesc { uint64_t Addr; @@ -85,7 +87,8 @@ private: std::vector> Objects; SymbolizableObjectFile(const object::ObjectFile *Obj, - std::unique_ptr DICtx); + std::unique_ptr DICtx, + bool UntagAddresses); }; } // end namespace symbolize diff --git a/lib/DebugInfo/Symbolize/Symbolize.cpp b/lib/DebugInfo/Symbolize/Symbolize.cpp index 6a619f8f2f37..be79d9e637c1 100644 --- a/lib/DebugInfo/Symbolize/Symbolize.cpp +++ b/lib/DebugInfo/Symbolize/Symbolize.cpp @@ -35,19 +35,6 @@ #include #include -#if defined(_MSC_VER) -#include - -// This must be included after windows.h. -#include -#pragma comment(lib, "dbghelp.lib") - -// Windows.h conflicts with our COFF header definitions. -#ifdef IMAGE_FILE_MACHINE_I386 -#undef IMAGE_FILE_MACHINE_I386 -#endif -#endif - namespace llvm { namespace symbolize { @@ -205,7 +192,7 @@ bool checkFileCRC(StringRef Path, uint32_t CRCHash) { MemoryBuffer::getFileOrSTDIN(Path); if (!MB) return false; - return CRCHash == llvm::crc32(0, MB.get()->getBuffer()); + return CRCHash == llvm::crc32(arrayRefFromStringRef(MB.get()->getBuffer())); } bool findDebugBinary(const std::string &OrigPath, @@ -259,7 +246,11 @@ bool getGNUDebuglinkContents(const ObjectFile *Obj, std::string &DebugName, return false; for (const SectionRef &Section : Obj->sections()) { StringRef Name; - Section.getName(Name); + if (Expected NameOrErr = Section.getName()) + Name = *NameOrErr; + else + consumeError(NameOrErr.takeError()); + Name = Name.substr(Name.find_first_not_of("._")); if (Name == "gnu_debuglink") { Expected ContentsOrErr = Section.getContents(); @@ -268,7 +259,7 @@ bool getGNUDebuglinkContents(const ObjectFile *Obj, std::string &DebugName, return false; } DataExtractor DE(*ContentsOrErr, Obj->isLittleEndian(), 0); - uint32_t Offset = 0; + uint64_t Offset = 0; if (const char *DebugNameStr = DE.getCStr(&Offset)) { // 4-byte align the offset. 
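// [Editor's sketch -- not part of the imported diff.] The UntagAddresses hunk
// added to addSymbol() above strips an AArch64 top-byte memory tag before the
// symbol address is used. Kernel addresses keep bits 56-63 set, so bit 55 is
// sign-extended back into the top byte rather than simply masked off. A
// standalone equivalent (using an explicit mask instead of the shift pair):
#include <cassert>
#include <cstdint>

static uint64_t untagAddress(uint64_t SymbolAddress) {
  SymbolAddress &= (1ull << 56) - 1;  // drop the tag byte (bits 56-63)
  if (SymbolAddress & (1ull << 55))   // kernel half: propagate bit 55 upward
    SymbolAddress |= 0xffull << 56;
  return SymbolAddress;
}

int main() {
  // Tagged user-space address: the tag 0x2a is simply removed.
  assert(untagAddress(0x2a00007fffff1234ull) == 0x0000007fffff1234ull);
  // Tagged kernel address: bits 56-63 are restored from bit 55.
  assert(untagAddress(0x2aff800000001234ull) == 0xffff800000001234ull);
  return 0;
}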
Offset = (Offset + 3) & ~0x3; @@ -397,7 +388,7 @@ LLVMSymbolizer::getOrCreateObject(const std::string &Path, return I->second.get(); Expected> ObjOrErr = - UB->getObjectForArch(ArchName); + UB->getMachOObjectForArch(ArchName); if (!ObjOrErr) { ObjectForUBPathAndArch.emplace(std::make_pair(Path, ArchName), std::unique_ptr()); @@ -418,8 +409,8 @@ Expected LLVMSymbolizer::createModuleInfo(const ObjectFile *Obj, std::unique_ptr Context, StringRef ModuleName) { - auto InfoOrErr = - SymbolizableObjectFile::create(Obj, std::move(Context)); + auto InfoOrErr = SymbolizableObjectFile::create(Obj, std::move(Context), + Opts.UntagAddresses); std::unique_ptr SymMod; if (InfoOrErr) SymMod = std::move(*InfoOrErr); @@ -530,21 +521,20 @@ LLVMSymbolizer::DemangleName(const std::string &Name, return Result; } -#if defined(_MSC_VER) if (!Name.empty() && Name.front() == '?') { // Only do MSVC C++ demangling on symbols starting with '?'. - char DemangledName[1024] = {0}; - DWORD result = ::UnDecorateSymbolName( - Name.c_str(), DemangledName, 1023, - UNDNAME_NO_ACCESS_SPECIFIERS | // Strip public, private, protected - UNDNAME_NO_ALLOCATION_LANGUAGE | // Strip __thiscall, __stdcall, etc - UNDNAME_NO_THROW_SIGNATURES | // Strip throw() specifications - UNDNAME_NO_MEMBER_TYPE | // Strip virtual, static, etc specifiers - UNDNAME_NO_MS_KEYWORDS | // Strip all MS extension keywords - UNDNAME_NO_FUNCTION_RETURNS); // Strip function return types - return (result == 0) ? Name : std::string(DemangledName); + int status = 0; + char *DemangledName = microsoftDemangle( + Name.c_str(), nullptr, nullptr, &status, + MSDemangleFlags(MSDF_NoAccessSpecifier | MSDF_NoCallingConvention | + MSDF_NoMemberType | MSDF_NoReturnType)); + if (status != 0) + return Name; + std::string Result = DemangledName; + free(DemangledName); + return Result; } -#endif + if (DbiModuleDescriptor && DbiModuleDescriptor->isWin32Module()) return std::string(demanglePE32ExternCFunc(Name)); return Name; diff --git a/lib/Demangle/ItaniumDemangle.cpp b/lib/Demangle/ItaniumDemangle.cpp index 5c99c70e3cc6..760d28b3ab9d 100644 --- a/lib/Demangle/ItaniumDemangle.cpp +++ b/lib/Demangle/ItaniumDemangle.cpp @@ -174,6 +174,16 @@ struct DumpVisitor { return printStr("SpecialSubKind::iostream"); } } + void print(TemplateParamKind TPK) { + switch (TPK) { + case TemplateParamKind::Type: + return printStr("TemplateParamKind::Type"); + case TemplateParamKind::NonType: + return printStr("TemplateParamKind::NonType"); + case TemplateParamKind::Template: + return printStr("TemplateParamKind::Template"); + } + } void newLine() { printStr("\n"); diff --git a/lib/Demangle/MicrosoftDemangle.cpp b/lib/Demangle/MicrosoftDemangle.cpp index bf7d77638f34..c681d6e25b87 100644 --- a/lib/Demangle/MicrosoftDemangle.cpp +++ b/lib/Demangle/MicrosoftDemangle.cpp @@ -783,8 +783,26 @@ SymbolNode *Demangler::demangleMD5Name(StringView &MangledName) { return S; } +SymbolNode *Demangler::demangleTypeinfoName(StringView &MangledName) { + assert(MangledName.startsWith('.')); + MangledName.consumeFront('.'); + + TypeNode *T = demangleType(MangledName, QualifierMangleMode::Result); + if (Error || !MangledName.empty()) { + Error = true; + return nullptr; + } + return synthesizeVariable(Arena, T, "`RTTI Type Descriptor Name'"); +} + // Parser entry point. SymbolNode *Demangler::parse(StringView &MangledName) { + // Typeinfo names are strings stored in RTTI data. They're not symbol names. + // It's still useful to demangle them. They're the only demangled entity + // that doesn't start with a "?" 
but a ".". + if (MangledName.startsWith('.')) + return demangleTypeinfoName(MangledName); + if (MangledName.startsWith("??@")) return demangleMD5Name(MangledName); @@ -2161,7 +2179,7 @@ NodeArrayNode *Demangler::demangleFunctionParameterList(StringView &MangledName, NodeArrayNode * Demangler::demangleTemplateParameterList(StringView &MangledName) { - NodeList *Head; + NodeList *Head = nullptr; NodeList **Current = &Head; size_t Count = 0; @@ -2328,12 +2346,22 @@ char *llvm::microsoftDemangle(const char *MangledName, char *Buf, size_t *N, if (Flags & MSDF_DumpBackrefs) D.dumpBackReferences(); + OutputFlags OF = OF_Default; + if (Flags & MSDF_NoCallingConvention) + OF = OutputFlags(OF | OF_NoCallingConvention); + if (Flags & MSDF_NoAccessSpecifier) + OF = OutputFlags(OF | OF_NoAccessSpecifier); + if (Flags & MSDF_NoReturnType) + OF = OutputFlags(OF | OF_NoReturnType); + if (Flags & MSDF_NoMemberType) + OF = OutputFlags(OF | OF_NoMemberType); + if (D.Error) InternalStatus = demangle_invalid_mangled_name; else if (!initializeOutputStream(Buf, N, S, 1024)) InternalStatus = demangle_memory_alloc_failure; else { - AST->output(S, OF_Default); + AST->output(S, OF); S += '\0'; if (N != nullptr) *N = S.getCurrentPosition(); diff --git a/lib/Demangle/MicrosoftDemangleNodes.cpp b/lib/Demangle/MicrosoftDemangleNodes.cpp index 63ca475ec1fe..9cee975231a2 100644 --- a/lib/Demangle/MicrosoftDemangleNodes.cpp +++ b/lib/Demangle/MicrosoftDemangleNodes.cpp @@ -120,8 +120,6 @@ std::string Node::toString(OutputFlags Flags) const { return {OS.getBuffer()}; } -void TypeNode::outputQuals(bool SpaceBefore, bool SpaceAfter) const {} - void PrimitiveTypeNode::outputPre(OutputStream &OS, OutputFlags Flags) const { switch (PrimKind) { OUTPUT_ENUM_CLASS_VALUE(PrimitiveKind, Void, "void"); @@ -380,24 +378,28 @@ void LiteralOperatorIdentifierNode::output(OutputStream &OS, void FunctionSignatureNode::outputPre(OutputStream &OS, OutputFlags Flags) const { - if (FunctionClass & FC_Public) - OS << "public: "; - if (FunctionClass & FC_Protected) - OS << "protected: "; - if (FunctionClass & FC_Private) - OS << "private: "; - - if (!(FunctionClass & FC_Global)) { - if (FunctionClass & FC_Static) - OS << "static "; + if (!(Flags & OF_NoAccessSpecifier)) { + if (FunctionClass & FC_Public) + OS << "public: "; + if (FunctionClass & FC_Protected) + OS << "protected: "; + if (FunctionClass & FC_Private) + OS << "private: "; } - if (FunctionClass & FC_Virtual) - OS << "virtual "; - if (FunctionClass & FC_ExternC) - OS << "extern \"C\" "; + if (!(Flags & OF_NoMemberType)) { + if (!(FunctionClass & FC_Global)) { + if (FunctionClass & FC_Static) + OS << "static "; + } + if (FunctionClass & FC_Virtual) + OS << "virtual "; + + if (FunctionClass & FC_ExternC) + OS << "extern \"C\" "; + } - if (ReturnType) { + if (!(Flags & OF_NoReturnType) && ReturnType) { ReturnType->outputPre(OS, Flags); OS << " "; } @@ -440,7 +442,7 @@ void FunctionSignatureNode::outputPost(OutputStream &OS, else if (RefQualifier == FunctionRefQualifier::RValueReference) OS << " &&"; - if (ReturnType) + if (!(Flags & OF_NoReturnType) && ReturnType) ReturnType->outputPost(OS, Flags); } @@ -582,19 +584,26 @@ void FunctionSymbolNode::output(OutputStream &OS, OutputFlags Flags) const { } void VariableSymbolNode::output(OutputStream &OS, OutputFlags Flags) const { + const char *AccessSpec = nullptr; + bool IsStatic = true; switch (SC) { case StorageClass::PrivateStatic: - OS << "private: static "; + AccessSpec = "private"; break; case StorageClass::PublicStatic: - OS << 
"public: static "; + AccessSpec = "public"; break; case StorageClass::ProtectedStatic: - OS << "protected: static "; + AccessSpec = "protected"; break; default: + IsStatic = false; break; } + if (!(Flags & OF_NoAccessSpecifier) && AccessSpec) + OS << AccessSpec << ": "; + if (!(Flags & OF_NoMemberType) && IsStatic) + OS << "static "; if (Type) { Type->outputPre(OS, Flags); diff --git a/lib/ExecutionEngine/ExecutionEngine.cpp b/lib/ExecutionEngine/ExecutionEngine.cpp index 1c6c0406d048..ee7a7cb60bc9 100644 --- a/lib/ExecutionEngine/ExecutionEngine.cpp +++ b/lib/ExecutionEngine/ExecutionEngine.cpp @@ -32,12 +32,12 @@ #include "llvm/Support/DynamicLibrary.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/Host.h" -#include "llvm/Support/MutexGuard.h" #include "llvm/Support/TargetRegistry.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetMachine.h" #include #include +#include using namespace llvm; #define DEBUG_TYPE "jit" @@ -191,7 +191,7 @@ uint64_t ExecutionEngineState::RemoveMapping(StringRef Name) { std::string ExecutionEngine::getMangledName(const GlobalValue *GV) { assert(GV->hasName() && "Global must have name."); - MutexGuard locked(lock); + std::lock_guard locked(lock); SmallString<128> FullName; const DataLayout &DL = @@ -204,12 +204,12 @@ std::string ExecutionEngine::getMangledName(const GlobalValue *GV) { } void ExecutionEngine::addGlobalMapping(const GlobalValue *GV, void *Addr) { - MutexGuard locked(lock); + std::lock_guard locked(lock); addGlobalMapping(getMangledName(GV), (uint64_t) Addr); } void ExecutionEngine::addGlobalMapping(StringRef Name, uint64_t Addr) { - MutexGuard locked(lock); + std::lock_guard locked(lock); assert(!Name.empty() && "Empty GlobalMapping symbol name!"); @@ -228,14 +228,14 @@ void ExecutionEngine::addGlobalMapping(StringRef Name, uint64_t Addr) { } void ExecutionEngine::clearAllGlobalMappings() { - MutexGuard locked(lock); + std::lock_guard locked(lock); EEState.getGlobalAddressMap().clear(); EEState.getGlobalAddressReverseMap().clear(); } void ExecutionEngine::clearGlobalMappingsFromModule(Module *M) { - MutexGuard locked(lock); + std::lock_guard locked(lock); for (GlobalObject &GO : M->global_objects()) EEState.RemoveMapping(getMangledName(&GO)); @@ -243,12 +243,12 @@ void ExecutionEngine::clearGlobalMappingsFromModule(Module *M) { uint64_t ExecutionEngine::updateGlobalMapping(const GlobalValue *GV, void *Addr) { - MutexGuard locked(lock); + std::lock_guard locked(lock); return updateGlobalMapping(getMangledName(GV), (uint64_t) Addr); } uint64_t ExecutionEngine::updateGlobalMapping(StringRef Name, uint64_t Addr) { - MutexGuard locked(lock); + std::lock_guard locked(lock); ExecutionEngineState::GlobalAddressMapTy &Map = EEState.getGlobalAddressMap(); @@ -275,7 +275,7 @@ uint64_t ExecutionEngine::updateGlobalMapping(StringRef Name, uint64_t Addr) { } uint64_t ExecutionEngine::getAddressToGlobalIfAvailable(StringRef S) { - MutexGuard locked(lock); + std::lock_guard locked(lock); uint64_t Address = 0; ExecutionEngineState::GlobalAddressMapTy::iterator I = EEState.getGlobalAddressMap().find(S); @@ -286,19 +286,19 @@ uint64_t ExecutionEngine::getAddressToGlobalIfAvailable(StringRef S) { void *ExecutionEngine::getPointerToGlobalIfAvailable(StringRef S) { - MutexGuard locked(lock); + std::lock_guard locked(lock); if (void* Address = (void *) getAddressToGlobalIfAvailable(S)) return Address; return nullptr; } void *ExecutionEngine::getPointerToGlobalIfAvailable(const GlobalValue *GV) { - MutexGuard locked(lock); + 
std::lock_guard locked(lock); return getPointerToGlobalIfAvailable(getMangledName(GV)); } const GlobalValue *ExecutionEngine::getGlobalValueAtAddress(void *Addr) { - MutexGuard locked(lock); + std::lock_guard locked(lock); // If we haven't computed the reverse mapping yet, do so first. if (EEState.getGlobalAddressReverseMap().empty()) { @@ -340,14 +340,14 @@ void *ArgvArray::reset(LLVMContext &C, ExecutionEngine *EE, Values.clear(); // Free the old contents. Values.reserve(InputArgv.size()); unsigned PtrSize = EE->getDataLayout().getPointerSize(); - Array = make_unique((InputArgv.size()+1)*PtrSize); + Array = std::make_unique((InputArgv.size()+1)*PtrSize); LLVM_DEBUG(dbgs() << "JIT: ARGV = " << (void *)Array.get() << "\n"); Type *SBytePtr = Type::getInt8PtrTy(C); for (unsigned i = 0; i != InputArgv.size(); ++i) { unsigned Size = InputArgv[i].size()+1; - auto Dest = make_unique(Size); + auto Dest = std::make_unique(Size); LLVM_DEBUG(dbgs() << "JIT: ARGV[" << i << "] = " << (void *)Dest.get() << "\n"); @@ -575,7 +575,7 @@ void *ExecutionEngine::getPointerToGlobal(const GlobalValue *GV) { if (Function *F = const_cast(dyn_cast(GV))) return getPointerToFunction(F); - MutexGuard locked(lock); + std::lock_guard locked(lock); if (void* P = getPointerToGlobalIfAvailable(GV)) return P; @@ -626,7 +626,7 @@ GenericValue ExecutionEngine::getConstantValue(const Constant *C) { break; case Type::VectorTyID: // if the whole vector is 'undef' just reserve memory for the value. - auto* VTy = dyn_cast(C->getType()); + auto* VTy = cast(C->getType()); Type *ElemTy = VTy->getElementType(); unsigned int elemNum = VTy->getNumElements(); Result.AggregateVal.resize(elemNum); @@ -925,7 +925,7 @@ GenericValue ExecutionEngine::getConstantValue(const Constant *C) { elemNum = CDV->getNumElements(); ElemTy = CDV->getElementType(); } else if (CV || CAZ) { - VectorType* VTy = dyn_cast(C->getType()); + auto* VTy = cast(C->getType()); elemNum = VTy->getNumElements(); ElemTy = VTy->getElementType(); } else { diff --git a/lib/ExecutionEngine/GDBRegistrationListener.cpp b/lib/ExecutionEngine/GDBRegistrationListener.cpp index 08d20156a590..7ed025fbb481 100644 --- a/lib/ExecutionEngine/GDBRegistrationListener.cpp +++ b/lib/ExecutionEngine/GDBRegistrationListener.cpp @@ -14,7 +14,7 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/ManagedStatic.h" #include "llvm/Support/Mutex.h" -#include "llvm/Support/MutexGuard.h" +#include using namespace llvm; using namespace llvm::object; @@ -135,7 +135,7 @@ void NotifyDebugger(jit_code_entry* JITCodeEntry) { GDBJITRegistrationListener::~GDBJITRegistrationListener() { // Free all registered object files. 
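// [Editor's sketch -- not part of the imported diff.] These hunks retire the
// hand-rolled MutexGuard / unique_lock wrappers from Support in favour of the
// standard <mutex> guards: each "MutexGuard locked(lock)" becomes a
// std::lock_guard over the same sys::Mutex, and the interpreter's guard becomes
// std::unique_lock. A standalone analogue of the pattern, with a plain
// std::recursive_mutex standing in for llvm::sys::Mutex:
#include <cstdint>
#include <mutex>
#include <string>
#include <unordered_map>

class AddressMap {
  std::recursive_mutex Lock;
  std::unordered_map<std::string, uint64_t> Map;

public:
  void add(const std::string &Name, uint64_t Addr) {
    std::lock_guard<std::recursive_mutex> Locked(Lock); // was: MutexGuard locked(lock);
    Map[Name] = Addr;
  }

  uint64_t lookup(const std::string &Name) {
    std::lock_guard<std::recursive_mutex> Locked(Lock);
    auto I = Map.find(Name);
    return I == Map.end() ? 0 : I->second;
  }
};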
- llvm::MutexGuard locked(*JITDebugLock); + std::lock_guard locked(*JITDebugLock); for (RegisteredObjectBufferMap::iterator I = ObjectBufferMap.begin(), E = ObjectBufferMap.end(); I != E; ++I) { @@ -159,7 +159,7 @@ void GDBJITRegistrationListener::notifyObjectLoaded( const char *Buffer = DebugObj.getBinary()->getMemoryBufferRef().getBufferStart(); size_t Size = DebugObj.getBinary()->getMemoryBufferRef().getBufferSize(); - llvm::MutexGuard locked(*JITDebugLock); + std::lock_guard locked(*JITDebugLock); assert(ObjectBufferMap.find(K) == ObjectBufferMap.end() && "Second attempt to perform debug registration."); jit_code_entry* JITCodeEntry = new jit_code_entry(); @@ -178,7 +178,7 @@ void GDBJITRegistrationListener::notifyObjectLoaded( } void GDBJITRegistrationListener::notifyFreeingObject(ObjectKey K) { - llvm::MutexGuard locked(*JITDebugLock); + std::lock_guard locked(*JITDebugLock); RegisteredObjectBufferMap::iterator I = ObjectBufferMap.find(K); if (I != ObjectBufferMap.end()) { diff --git a/lib/ExecutionEngine/Interpreter/ExternalFunctions.cpp b/lib/ExecutionEngine/Interpreter/ExternalFunctions.cpp index c3a2ccc582c9..71b7f893d712 100644 --- a/lib/ExecutionEngine/Interpreter/ExternalFunctions.cpp +++ b/lib/ExecutionEngine/Interpreter/ExternalFunctions.cpp @@ -32,7 +32,6 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/ManagedStatic.h" #include "llvm/Support/Mutex.h" -#include "llvm/Support/UniqueLock.h" #include "llvm/Support/raw_ostream.h" #include #include @@ -41,6 +40,7 @@ #include #include #include +#include #include #include #include @@ -258,7 +258,7 @@ GenericValue Interpreter::callExternalFunction(Function *F, ArrayRef ArgVals) { TheInterpreter = this; - unique_lock Guard(*FunctionsLock); + std::unique_lock Guard(*FunctionsLock); // Do a lookup to see if the function is in our cache... this should just be a // deferred annotation! diff --git a/lib/ExecutionEngine/JITLink/BasicGOTAndStubsBuilder.h b/lib/ExecutionEngine/JITLink/BasicGOTAndStubsBuilder.h index 1271ad962b38..b47a798c7603 100644 --- a/lib/ExecutionEngine/JITLink/BasicGOTAndStubsBuilder.h +++ b/lib/ExecutionEngine/JITLink/BasicGOTAndStubsBuilder.h @@ -20,24 +20,23 @@ namespace jitlink { template class BasicGOTAndStubsBuilder { public: - BasicGOTAndStubsBuilder(AtomGraph &G) : G(G) {} + BasicGOTAndStubsBuilder(LinkGraph &G) : G(G) {} void run() { - // We're going to be adding new atoms, but we don't want to iterate over - // the newly added ones, so just copy the existing atoms out. - std::vector DAs(G.defined_atoms().begin(), - G.defined_atoms().end()); + // We're going to be adding new blocks, but we don't want to iterate over + // the newly added ones, so just copy the existing blocks out. 
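// [Editor's sketch -- not part of the imported diff.] BasicGOTAndStubsBuilder
// keeps its CRTP shape through the Atom -> Block/Symbol rename: the base class
// walks the graph and forwards target-specific work through
// impl() = static_cast<BuilderImpl &>(*this), caching one GOT entry / stub per
// target name. Standalone analogue of that pattern (all names invented):
#include <iostream>
#include <map>
#include <string>

template <typename DerivedT> class PerNameEntryBuilder {
public:
  // Return the cached entry for Name, creating it on first use through the
  // derived class's createEntry() hook.
  const std::string &getOrCreate(const std::string &Name) {
    auto I = Entries.find(Name);
    if (I == Entries.end())
      I = Entries.emplace(Name, impl().createEntry(Name)).first;
    return I->second;
  }

private:
  DerivedT &impl() { return static_cast<DerivedT &>(*this); }
  std::map<std::string, std::string> Entries;
};

class GOTBuilder : public PerNameEntryBuilder<GOTBuilder> {
public:
  std::string createEntry(const std::string &Target) {
    return "got." + Target; // the real builder adds a GOT block to the graph
  }
};

int main() {
  GOTBuilder B;
  std::cout << B.getOrCreate("printf") << '\n'; // creates "got.printf"
  std::cout << B.getOrCreate("printf") << '\n'; // second call hits the cache
  return 0;
}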
+ std::vector Blocks(G.blocks().begin(), G.blocks().end()); - for (auto *DA : DAs) - for (auto &E : DA->edges()) + for (auto *B : Blocks) + for (auto &E : B->edges()) if (impl().isGOTEdge(E)) - impl().fixGOTEdge(E, getGOTEntryAtom(E.getTarget())); + impl().fixGOTEdge(E, getGOTEntrySymbol(E.getTarget())); else if (impl().isExternalBranchEdge(E)) - impl().fixExternalBranchEdge(E, getStubAtom(E.getTarget())); + impl().fixExternalBranchEdge(E, getStubSymbol(E.getTarget())); } protected: - Atom &getGOTEntryAtom(Atom &Target) { + Symbol &getGOTEntrySymbol(Symbol &Target) { assert(Target.hasName() && "GOT edge cannot point to anonymous target"); auto GOTEntryI = GOTEntries.find(Target.getName()); @@ -49,31 +48,31 @@ protected: GOTEntries.insert(std::make_pair(Target.getName(), &GOTEntry)).first; } - assert(GOTEntryI != GOTEntries.end() && "Could not get GOT entry atom"); + assert(GOTEntryI != GOTEntries.end() && "Could not get GOT entry symbol"); return *GOTEntryI->second; } - Atom &getStubAtom(Atom &Target) { + Symbol &getStubSymbol(Symbol &Target) { assert(Target.hasName() && "External branch edge can not point to an anonymous target"); auto StubI = Stubs.find(Target.getName()); if (StubI == Stubs.end()) { - auto &StubAtom = impl().createStub(Target); - StubI = Stubs.insert(std::make_pair(Target.getName(), &StubAtom)).first; + auto &StubSymbol = impl().createStub(Target); + StubI = Stubs.insert(std::make_pair(Target.getName(), &StubSymbol)).first; } - assert(StubI != Stubs.end() && "Count not get stub atom"); + assert(StubI != Stubs.end() && "Count not get stub symbol"); return *StubI->second; } - AtomGraph &G; + LinkGraph &G; private: BuilderImpl &impl() { return static_cast(*this); } - DenseMap GOTEntries; - DenseMap Stubs; + DenseMap GOTEntries; + DenseMap Stubs; }; } // end namespace jitlink diff --git a/lib/ExecutionEngine/JITLink/EHFrameSupport.cpp b/lib/ExecutionEngine/JITLink/EHFrameSupport.cpp index 25f0e9040ffe..f80b0e7f8909 100644 --- a/lib/ExecutionEngine/JITLink/EHFrameSupport.cpp +++ b/lib/ExecutionEngine/JITLink/EHFrameSupport.cpp @@ -17,18 +17,14 @@ namespace llvm { namespace jitlink { -EHFrameParser::EHFrameParser(AtomGraph &G, Section &EHFrameSection, - StringRef EHFrameContent, - JITTargetAddress EHFrameAddress, - Edge::Kind FDEToCIERelocKind, - Edge::Kind FDEToTargetRelocKind) - : G(G), EHFrameSection(EHFrameSection), EHFrameContent(EHFrameContent), - EHFrameAddress(EHFrameAddress), - EHFrameReader(EHFrameContent, G.getEndianness()), - FDEToCIERelocKind(FDEToCIERelocKind), - FDEToTargetRelocKind(FDEToTargetRelocKind) {} - -Error EHFrameParser::atomize() { +EHFrameBinaryParser::EHFrameBinaryParser(JITTargetAddress EHFrameAddress, + StringRef EHFrameContent, + unsigned PointerSize, + support::endianness Endianness) + : EHFrameAddress(EHFrameAddress), EHFrameContent(EHFrameContent), + PointerSize(PointerSize), EHFrameReader(EHFrameContent, Endianness) {} + +Error EHFrameBinaryParser::addToGraph() { while (!EHFrameReader.empty()) { size_t RecordOffset = EHFrameReader.getOffset(); @@ -38,44 +34,39 @@ Error EHFrameParser::atomize() { << " (offset " << RecordOffset << ")\n"; }); - size_t CIELength = 0; - uint32_t CIELengthField; - if (auto Err = EHFrameReader.readInteger(CIELengthField)) + size_t RecordLength = 0; + uint32_t RecordLengthField; + if (auto Err = EHFrameReader.readInteger(RecordLengthField)) return Err; - // Process CIE length/extended-length fields to build the atom. + // Process CIE/FDE length/extended-length fields to build the blocks. 
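// [Editor's sketch -- not part of the imported diff.] The RecordLength handling
// above follows the .eh_frame length encoding: a 32-bit length of 0 terminates
// the section, 0xffffffff announces a 64-bit extended length, and the stored
// value excludes the length field itself, so the whole record occupies
// length + 4 bytes (or extended length + 12). Standalone decoder, assuming a
// little-endian buffer (the real code reads through BinaryStreamReader with the
// graph's endianness):
#include <cstdint>
#include <cstring>
#include <optional>

// Returns the total record size including the length field(s), 0 for the
// end-of-section marker, or std::nullopt if the buffer is too short.
static std::optional<uint64_t> ehFrameRecordSize(const uint8_t *Buf,
                                                 size_t Avail) {
  if (Avail < 4)
    return std::nullopt;
  uint32_t Length;
  std::memcpy(&Length, Buf, 4);
  if (Length == 0)
    return 0;                    // end of the eh-frame section
  if (Length != 0xffffffff)
    return uint64_t(Length) + 4; // regular record
  if (Avail < 12)
    return std::nullopt;
  uint64_t Extended;
  std::memcpy(&Extended, Buf + 4, 8);
  return Extended + 12;          // extended-length record
}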
// // The value of these fields describe the length of the *rest* of the CIE // (not including data up to the end of the field itself) so we have to - // bump CIELength to include the data up to the end of the field: 4 bytes + // bump RecordLength to include the data up to the end of the field: 4 bytes // for Length, or 12 bytes (4 bytes + 8 bytes) for ExtendedLength. - if (CIELengthField == 0) // Length 0 means end of __eh_frame section. + if (RecordLengthField == 0) // Length 0 means end of __eh_frame section. break; // If the regular length field's value is 0xffffffff, use extended length. - if (CIELengthField == 0xffffffff) { - uint64_t CIEExtendedLengthField; - if (auto Err = EHFrameReader.readInteger(CIEExtendedLengthField)) + if (RecordLengthField == 0xffffffff) { + uint64_t ExtendedLengthField; + if (auto Err = EHFrameReader.readInteger(ExtendedLengthField)) return Err; - if (CIEExtendedLengthField > EHFrameReader.bytesRemaining()) + if (ExtendedLengthField > EHFrameReader.bytesRemaining()) return make_error("CIE record extends past the end of " "the __eh_frame section"); - if (CIEExtendedLengthField + 12 > std::numeric_limits::max()) + if (ExtendedLengthField + 12 > std::numeric_limits::max()) return make_error("CIE record too large to process"); - CIELength = CIEExtendedLengthField + 12; + RecordLength = ExtendedLengthField + 12; } else { - if (CIELengthField > EHFrameReader.bytesRemaining()) + if (RecordLengthField > EHFrameReader.bytesRemaining()) return make_error("CIE record extends past the end of " "the __eh_frame section"); - CIELength = CIELengthField + 4; + RecordLength = RecordLengthField + 4; } - LLVM_DEBUG(dbgs() << " length: " << CIELength << "\n"); - - // Add an atom for this record. - CurRecordAtom = &G.addAnonymousAtom( - EHFrameSection, EHFrameAddress + RecordOffset, G.getPointerSize()); - CurRecordAtom->setContent(EHFrameContent.substr(RecordOffset, CIELength)); + LLVM_DEBUG(dbgs() << " length: " << RecordLength << "\n"); // Read the CIE Pointer. size_t CIEPointerAddress = EHFrameAddress + EHFrameReader.getOffset(); @@ -85,21 +76,24 @@ Error EHFrameParser::atomize() { // Based on the CIE pointer value, parse this as a CIE or FDE record. 
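// [Editor's sketch -- not part of the imported diff.] The word following the
// length is the "CIE pointer": zero marks the record as a CIE, anything else
// marks it as an FDE whose parent CIE sits CIEPointer bytes *before* that
// field -- hence the CIEInfos lookup at CIEPointerAddress - CIEPointer below.
#include <cstdint>

struct EHRecordKind {
  bool IsCIE;
  uint64_t ParentCIEAddress; // meaningful only when IsCIE is false
};

static EHRecordKind classifyEHRecord(uint64_t CIEPointerFieldAddress,
                                     uint32_t CIEPointer) {
  if (CIEPointer == 0)
    return {true, 0};
  return {false, CIEPointerFieldAddress - CIEPointer};
}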
if (CIEPointer == 0) { - if (auto Err = processCIE()) + if (auto Err = processCIE(RecordOffset, RecordLength)) return Err; } else { - if (auto Err = processFDE(CIEPointerAddress, CIEPointer)) + if (auto Err = processFDE(RecordOffset, RecordLength, CIEPointerAddress, + CIEPointer)) return Err; } - EHFrameReader.setOffset(RecordOffset + CIELength); + EHFrameReader.setOffset(RecordOffset + RecordLength); } return Error::success(); } -Expected -EHFrameParser::parseAugmentationString() { +void EHFrameBinaryParser::anchor() {} + +Expected +EHFrameBinaryParser::parseAugmentationString() { AugmentationInfo AugInfo; uint8_t NextChar; uint8_t *NextField = &AugInfo.Fields[0]; @@ -139,14 +133,14 @@ EHFrameParser::parseAugmentationString() { return std::move(AugInfo); } -Expected EHFrameParser::readAbsolutePointer() { +Expected EHFrameBinaryParser::readAbsolutePointer() { static_assert(sizeof(JITTargetAddress) == sizeof(uint64_t), "Result must be able to hold a uint64_t"); JITTargetAddress Addr; - if (G.getPointerSize() == 8) { + if (PointerSize == 8) { if (auto Err = EHFrameReader.readInteger(Addr)) return std::move(Err); - } else if (G.getPointerSize() == 4) { + } else if (PointerSize == 4) { uint32_t Addr32; if (auto Err = EHFrameReader.readInteger(Addr32)) return std::move(Err); @@ -156,14 +150,19 @@ Expected EHFrameParser::readAbsolutePointer() { return Addr; } -Error EHFrameParser::processCIE() { +Error EHFrameBinaryParser::processCIE(size_t RecordOffset, + size_t RecordLength) { // Use the dwarf namespace for convenient access to pointer encoding // constants. using namespace dwarf; LLVM_DEBUG(dbgs() << " Record is CIE\n"); - CIEInformation CIEInfo(*CurRecordAtom); + auto &CIESymbol = + createCIERecord(EHFrameAddress + RecordOffset, + EHFrameContent.substr(RecordOffset, RecordLength)); + + CIEInformation CIEInfo(CIESymbol); uint8_t Version = 0; if (auto Err = EHFrameReader.readInteger(Version)) @@ -179,7 +178,7 @@ Error EHFrameParser::processCIE() { // Skip the EH Data field if present. if (AugInfo->EHDataFieldPresent) - if (auto Err = EHFrameReader.skip(G.getPointerSize())) + if (auto Err = EHFrameReader.skip(PointerSize)) return Err; // Read and sanity check the code alignment factor. 
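// [Editor's sketch -- not part of the imported diff.] The code alignment factor
// read here (and the augmentation data size later) is ULEB128-encoded, which is
// what EHFrameReader.readULEB128() decodes: seven value bits per byte, least
// significant group first, high bit set on every byte except the last. (The
// data alignment factor uses the signed SLEB128 variant.) Standalone decoder:
#include <cstddef>
#include <cstdint>

// Decodes a ULEB128 value starting at Buf[*Offset] and advances *Offset past
// it. Returns false if the buffer ends before the terminating byte.
static bool decodeULEB128(const uint8_t *Buf, size_t Size, size_t *Offset,
                          uint64_t *Value) {
  uint64_t Result = 0;
  unsigned Shift = 0;
  while (*Offset < Size && Shift < 64) {
    uint8_t Byte = Buf[(*Offset)++];
    Result |= uint64_t(Byte & 0x7f) << Shift;
    if ((Byte & 0x80) == 0) {
      *Value = Result;
      return true;
    }
    Shift += 7;
  }
  return false; // truncated (or implausibly long) encoding
}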
@@ -226,7 +225,7 @@ Error EHFrameParser::processCIE() { return make_error( "Unsupported LSDA pointer encoding " + formatv("{0:x2}", LSDAPointerEncoding) + " in CIE at " + - formatv("{0:x16}", CurRecordAtom->getAddress())); + formatv("{0:x16}", CIESymbol.getAddress())); break; } case 'P': { @@ -239,7 +238,7 @@ Error EHFrameParser::processCIE() { "Unspported personality pointer " "encoding " + formatv("{0:x2}", PersonalityPointerEncoding) + " in CIE at " + - formatv("{0:x16}", CurRecordAtom->getAddress())); + formatv("{0:x16}", CIESymbol.getAddress())); uint32_t PersonalityPointerAddress; if (auto Err = EHFrameReader.readInteger(PersonalityPointerAddress)) return Err; @@ -254,7 +253,7 @@ Error EHFrameParser::processCIE() { "Unsupported FDE address pointer " "encoding " + formatv("{0:x2}", FDEPointerEncoding) + " in CIE at " + - formatv("{0:x16}", CurRecordAtom->getAddress())); + formatv("{0:x16}", CIESymbol.getAddress())); break; } default: @@ -267,15 +266,16 @@ Error EHFrameParser::processCIE() { return make_error("Read past the end of the augmentation " "data while parsing fields"); - assert(!CIEInfos.count(CurRecordAtom->getAddress()) && + assert(!CIEInfos.count(CIESymbol.getAddress()) && "Multiple CIEs recorded at the same address?"); - CIEInfos[CurRecordAtom->getAddress()] = std::move(CIEInfo); + CIEInfos[CIESymbol.getAddress()] = std::move(CIEInfo); return Error::success(); } -Error EHFrameParser::processFDE(JITTargetAddress CIEPointerAddress, - uint32_t CIEPointer) { +Error EHFrameBinaryParser::processFDE(size_t RecordOffset, size_t RecordLength, + JITTargetAddress CIEPointerAddress, + uint32_t CIEPointer) { LLVM_DEBUG(dbgs() << " Record is FDE\n"); LLVM_DEBUG({ @@ -286,16 +286,11 @@ Error EHFrameParser::processFDE(JITTargetAddress CIEPointerAddress, auto CIEInfoItr = CIEInfos.find(CIEPointerAddress - CIEPointer); if (CIEInfoItr == CIEInfos.end()) return make_error( - "FDE at " + formatv("{0:x16}", CurRecordAtom->getAddress()) + + "FDE at " + formatv("{0:x16}", EHFrameAddress + RecordOffset) + " points to non-existant CIE at " + formatv("{0:x16}", CIEPointerAddress - CIEPointer)); auto &CIEInfo = CIEInfoItr->second; - // The CIEPointer looks good. Add a relocation. - CurRecordAtom->addEdge(FDEToCIERelocKind, - CIEPointerAddress - CurRecordAtom->getAddress(), - *CIEInfo.CIEAtom, 0); - // Read and sanity check the PC-start pointer and size. JITTargetAddress PCBeginAddress = EHFrameAddress + EHFrameReader.getOffset(); @@ -305,83 +300,68 @@ Error EHFrameParser::processFDE(JITTargetAddress CIEPointerAddress, JITTargetAddress PCBegin = PCBeginAddress + *PCBeginDelta; LLVM_DEBUG({ - dbgs() << " PC begin: " << format("0x%016" PRIx64, PCBegin) << "\n"; + dbgs() << " PC begin: " << format("0x%016" PRIx64, PCBegin) << "\n"; }); - auto *TargetAtom = G.getAtomByAddress(PCBegin); + auto *TargetSymbol = getSymbolAtAddress(PCBegin); - if (!TargetAtom) + if (!TargetSymbol) return make_error("FDE PC-begin " + formatv("{0:x16}", PCBegin) + - " does not point at atom"); + " does not point at symbol"); - if (TargetAtom->getAddress() != PCBegin) + if (TargetSymbol->getAddress() != PCBegin) return make_error( "FDE PC-begin " + formatv("{0:x16}", PCBegin) + - " does not point to start of atom at " + - formatv("{0:x16}", TargetAtom->getAddress())); - - LLVM_DEBUG(dbgs() << " FDE target: " << *TargetAtom << "\n"); + " does not point to start of symbol at " + + formatv("{0:x16}", TargetSymbol->getAddress())); - // The PC-start pointer and size look good. Add relocations. 
- CurRecordAtom->addEdge(FDEToTargetRelocKind, - PCBeginAddress - CurRecordAtom->getAddress(), - *TargetAtom, 0); - - // Add a keep-alive relocation from the function to the FDE to ensure it is - // not dead stripped. - TargetAtom->addEdge(Edge::KeepAlive, 0, *CurRecordAtom, 0); + LLVM_DEBUG(dbgs() << " FDE target: " << *TargetSymbol << "\n"); // Skip over the PC range size field. - if (auto Err = EHFrameReader.skip(G.getPointerSize())) + if (auto Err = EHFrameReader.skip(PointerSize)) return Err; + Symbol *LSDASymbol = nullptr; + JITTargetAddress LSDAAddress = 0; if (CIEInfo.FDEsHaveLSDAField) { uint64_t AugmentationDataSize; if (auto Err = EHFrameReader.readULEB128(AugmentationDataSize)) return Err; - if (AugmentationDataSize != G.getPointerSize()) + if (AugmentationDataSize != PointerSize) return make_error( "Unexpected FDE augmentation data size (expected " + - Twine(G.getPointerSize()) + ", got " + Twine(AugmentationDataSize) + - ") for FDE at " + formatv("{0:x16}", CurRecordAtom->getAddress())); - JITTargetAddress LSDAAddress = EHFrameAddress + EHFrameReader.getOffset(); + Twine(PointerSize) + ", got " + Twine(AugmentationDataSize) + + ") for FDE at " + formatv("{0:x16}", EHFrameAddress + RecordOffset)); + LSDAAddress = EHFrameAddress + EHFrameReader.getOffset(); auto LSDADelta = readAbsolutePointer(); if (!LSDADelta) return LSDADelta.takeError(); JITTargetAddress LSDA = LSDAAddress + *LSDADelta; - auto *LSDAAtom = G.getAtomByAddress(LSDA); + LSDASymbol = getSymbolAtAddress(LSDA); - if (!LSDAAtom) + if (!LSDASymbol) return make_error("FDE LSDA " + formatv("{0:x16}", LSDA) + - " does not point at atom"); + " does not point at symbol"); - if (LSDAAtom->getAddress() != LSDA) + if (LSDASymbol->getAddress() != LSDA) return make_error( "FDE LSDA " + formatv("{0:x16}", LSDA) + - " does not point to start of atom at " + - formatv("{0:x16}", LSDAAtom->getAddress())); - - LLVM_DEBUG(dbgs() << " FDE LSDA: " << *LSDAAtom << "\n"); + " does not point to start of symbol at " + + formatv("{0:x16}", LSDASymbol->getAddress())); - // LSDA looks good. Add relocations. - CurRecordAtom->addEdge(FDEToTargetRelocKind, - LSDAAddress - CurRecordAtom->getAddress(), *LSDAAtom, - 0); + LLVM_DEBUG(dbgs() << " FDE LSDA: " << *LSDASymbol << "\n"); } - return Error::success(); -} + JITTargetAddress RecordAddress = EHFrameAddress + RecordOffset; + auto FDESymbol = createFDERecord( + RecordAddress, EHFrameContent.substr(RecordOffset, RecordLength), + *CIEInfo.CIESymbol, CIEPointerAddress - RecordAddress, *TargetSymbol, + PCBeginAddress - RecordAddress, LSDASymbol, LSDAAddress - RecordAddress); -Error addEHFrame(AtomGraph &G, Section &EHFrameSection, - StringRef EHFrameContent, JITTargetAddress EHFrameAddress, - Edge::Kind FDEToCIERelocKind, - Edge::Kind FDEToTargetRelocKind) { - return EHFrameParser(G, EHFrameSection, EHFrameContent, EHFrameAddress, - FDEToCIERelocKind, FDEToTargetRelocKind) - .atomize(); + return FDESymbol.takeError(); } // Determine whether we can register EH tables. @@ -451,11 +431,13 @@ static Error deregisterFrameWrapper(const void *P) { template Error walkAppleEHFrameSection(const char *const SectionStart, + size_t SectionSize, HandleFDEFn HandleFDE) { const char *CurCFIRecord = SectionStart; + const char *End = SectionStart + SectionSize; uint64_t Size = *reinterpret_cast(CurCFIRecord); - while (Size != 0) { + while (CurCFIRecord != End && Size != 0) { const char *OffsetField = CurCFIRecord + (Size == 0xffffffff ? 
12 : 4); if (Size == 0xffffffff) Size = *reinterpret_cast(CurCFIRecord + 4) + 12; @@ -484,10 +466,12 @@ Error walkAppleEHFrameSection(const char *const SectionStart, #endif // __APPLE__ -Error registerEHFrameSection(const void *EHFrameSectionAddr) { +Error registerEHFrameSection(const void *EHFrameSectionAddr, + size_t EHFrameSectionSize) { #ifdef __APPLE__ // On Darwin __register_frame has to be called for each FDE entry. return walkAppleEHFrameSection(static_cast(EHFrameSectionAddr), + EHFrameSectionSize, registerFrameWrapper); #else // On Linux __register_frame takes a single argument: @@ -499,9 +483,11 @@ Error registerEHFrameSection(const void *EHFrameSectionAddr) { #endif } -Error deregisterEHFrameSection(const void *EHFrameSectionAddr) { +Error deregisterEHFrameSection(const void *EHFrameSectionAddr, + size_t EHFrameSectionSize) { #ifdef __APPLE__ return walkAppleEHFrameSection(static_cast(EHFrameSectionAddr), + EHFrameSectionSize, deregisterFrameWrapper); #else return deregisterFrameWrapper(EHFrameSectionAddr); @@ -517,23 +503,31 @@ InProcessEHFrameRegistrar &InProcessEHFrameRegistrar::getInstance() { InProcessEHFrameRegistrar::InProcessEHFrameRegistrar() {} -AtomGraphPassFunction +LinkGraphPassFunction createEHFrameRecorderPass(const Triple &TT, - StoreFrameAddressFunction StoreFrameAddress) { + StoreFrameRangeFunction StoreRangeAddress) { const char *EHFrameSectionName = nullptr; if (TT.getObjectFormat() == Triple::MachO) EHFrameSectionName = "__eh_frame"; else EHFrameSectionName = ".eh_frame"; - auto RecordEHFrame = [EHFrameSectionName, - StoreFrameAddress](AtomGraph &G) -> Error { - // Search for a non-empty eh-frame and record the address of the first atom - // in it. + auto RecordEHFrame = + [EHFrameSectionName, + StoreFrameRange = std::move(StoreRangeAddress)](LinkGraph &G) -> Error { + // Search for a non-empty eh-frame and record the address of the first + // symbol in it. JITTargetAddress Addr = 0; - if (auto *S = G.findSectionByName(EHFrameSectionName)) - Addr = S->getRange().getStart(); - StoreFrameAddress(Addr); + size_t Size = 0; + if (auto *S = G.findSectionByName(EHFrameSectionName)) { + auto R = SectionRange(*S); + Addr = R.getStart(); + Size = R.getSize(); + } + if (Addr == 0 && Size != 0) + return make_error("__eh_frame section can not have zero " + "address with non-zero size"); + StoreFrameRange(Addr, Size); return Error::success(); }; diff --git a/lib/ExecutionEngine/JITLink/EHFrameSupportImpl.h b/lib/ExecutionEngine/JITLink/EHFrameSupportImpl.h index d679edef7ea6..6f9f68ad8382 100644 --- a/lib/ExecutionEngine/JITLink/EHFrameSupportImpl.h +++ b/lib/ExecutionEngine/JITLink/EHFrameSupportImpl.h @@ -21,18 +21,31 @@ namespace llvm { namespace jitlink { -/// A generic parser for eh-frame sections. +/// A generic binary parser for eh-frame sections. /// -/// Adds atoms representing CIE and FDE entries, using the given FDE-to-CIE and -/// FDEToTarget relocation kinds. -class EHFrameParser { +/// Adds blocks and symbols representing CIE and FDE entries to a JITLink graph. +/// +/// This parser assumes that the user has already verified that the EH-frame's +/// address range does not overlap any other section/symbol, so that generated +/// CIE/FDE records do not overlap other sections/symbols. 
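// [Editor's sketch -- not part of the imported diff.] The reworked
// EHFrameBinaryParser keeps the generic CIE/FDE walking logic and leaves graph
// construction to pure-virtual hooks (getSymbolAtAddress, createCIERecord,
// createFDERecord) that each object-format backend overrides. A standalone
// analogue of that split, with invented names:
#include <cstdint>
#include <iostream>

class FrameWalkerBase {
public:
  virtual ~FrameWalkerBase() = default;

  // Format-agnostic driver: the real parser decodes records here; this toy
  // version just feeds two fixed addresses to the hooks.
  void walk() {
    onCIE(0x1000);
    onFDE(0x1020, /*ParentCIE=*/0x1000);
  }

private:
  virtual void onCIE(uint64_t Address) = 0;
  virtual void onFDE(uint64_t Address, uint64_t ParentCIE) = 0;
};

class MachOFrameWalker final : public FrameWalkerBase {
  void onCIE(uint64_t Address) override {
    std::cout << "CIE record at 0x" << std::hex << Address << '\n';
  }
  void onFDE(uint64_t Address, uint64_t ParentCIE) override {
    std::cout << "FDE record at 0x" << std::hex << Address << " -> CIE 0x"
              << ParentCIE << '\n';
  }
};

int main() { MachOFrameWalker().walk(); return 0; }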
+class EHFrameBinaryParser { public: - EHFrameParser(AtomGraph &G, Section &EHFrameSection, StringRef EHFrameContent, - JITTargetAddress EHFrameAddress, Edge::Kind FDEToCIERelocKind, - Edge::Kind FDEToTargetRelocKind); - Error atomize(); + EHFrameBinaryParser(JITTargetAddress EHFrameAddress, StringRef EHFrameContent, + unsigned PointerSize, support::endianness Endianness); + virtual ~EHFrameBinaryParser() {} + + Error addToGraph(); private: + virtual void anchor(); + virtual Symbol *getSymbolAtAddress(JITTargetAddress Addr) = 0; + virtual Symbol &createCIERecord(JITTargetAddress RecordAddr, + StringRef RecordContent) = 0; + virtual Expected + createFDERecord(JITTargetAddress RecordAddr, StringRef RecordContent, + Symbol &CIE, size_t CIEOffset, Symbol &Func, + size_t FuncOffset, Symbol *LSDA, size_t LSDAOffset) = 0; + struct AugmentationInfo { bool AugmentationDataPresent = false; bool EHDataFieldPresent = false; @@ -41,31 +54,24 @@ private: Expected parseAugmentationString(); Expected readAbsolutePointer(); - Error processCIE(); - Error processFDE(JITTargetAddress CIEPointerAddress, uint32_t CIEPointer); + Error processCIE(size_t RecordOffset, size_t RecordLength); + Error processFDE(size_t RecordOffset, size_t RecordLength, + JITTargetAddress CIEPointerOffset, uint32_t CIEPointer); struct CIEInformation { CIEInformation() = default; - CIEInformation(DefinedAtom &CIEAtom) : CIEAtom(&CIEAtom) {} - DefinedAtom *CIEAtom = nullptr; + CIEInformation(Symbol &CIESymbol) : CIESymbol(&CIESymbol) {} + Symbol *CIESymbol = nullptr; bool FDEsHaveLSDAField = false; }; - AtomGraph &G; - Section &EHFrameSection; - StringRef EHFrameContent; JITTargetAddress EHFrameAddress; + StringRef EHFrameContent; + unsigned PointerSize; BinaryStreamReader EHFrameReader; - DefinedAtom *CurRecordAtom = nullptr; DenseMap CIEInfos; - Edge::Kind FDEToCIERelocKind; - Edge::Kind FDEToTargetRelocKind; }; -Error addEHFrame(AtomGraph &G, Section &EHFrameSection, - StringRef EHFrameContent, JITTargetAddress EHFrameAddress, - Edge::Kind FDEToCIERelocKind, Edge::Kind FDEToTargetRelocKind); - } // end namespace jitlink } // end namespace llvm diff --git a/lib/ExecutionEngine/JITLink/JITLink.cpp b/lib/ExecutionEngine/JITLink/JITLink.cpp index 9d0a7459dc09..1e19038951ac 100644 --- a/lib/ExecutionEngine/JITLink/JITLink.cpp +++ b/lib/ExecutionEngine/JITLink/JITLink.cpp @@ -56,95 +56,151 @@ std::error_code JITLinkError::convertToErrorCode() const { return std::error_code(GenericJITLinkError, *JITLinkerErrorCategory); } -const StringRef getGenericEdgeKindName(Edge::Kind K) { +const char *getGenericEdgeKindName(Edge::Kind K) { switch (K) { case Edge::Invalid: return "INVALID RELOCATION"; case Edge::KeepAlive: return "Keep-Alive"; - case Edge::LayoutNext: - return "Layout-Next"; default: llvm_unreachable("Unrecognized relocation kind"); } } -raw_ostream &operator<<(raw_ostream &OS, const Atom &A) { +const char *getLinkageName(Linkage L) { + switch (L) { + case Linkage::Strong: + return "strong"; + case Linkage::Weak: + return "weak"; + } + llvm_unreachable("Unrecognized llvm.jitlink.Linkage enum"); +} + +const char *getScopeName(Scope S) { + switch (S) { + case Scope::Default: + return "default"; + case Scope::Hidden: + return "hidden"; + case Scope::Local: + return "local"; + } + llvm_unreachable("Unrecognized llvm.jitlink.Scope enum"); +} + +raw_ostream &operator<<(raw_ostream &OS, const Block &B) { + return OS << formatv("{0:x16}", B.getAddress()) << " -- " + << formatv("{0:x16}", B.getAddress() + B.getSize()) << ": " + << 
(B.isZeroFill() ? "zero-fill" : "content") + << ", align = " << B.getAlignment() + << ", align-ofs = " << B.getAlignmentOffset() + << ", section = " << B.getSection().getName(); +} + +raw_ostream &operator<<(raw_ostream &OS, const Symbol &Sym) { OS << "<"; - if (A.getName().empty()) - OS << "anon@" << format("0x%016" PRIx64, A.getAddress()); + if (Sym.getName().empty()) + OS << "*anon*"; else - OS << A.getName(); - OS << " ["; - if (A.isDefined()) { - auto &DA = static_cast(A); - OS << " section=" << DA.getSection().getName(); - if (DA.isLive()) - OS << " live"; - if (DA.shouldDiscard()) - OS << " should-discard"; - } else - OS << " external"; - OS << " ]>"; + OS << Sym.getName(); + OS << ": flags = "; + switch (Sym.getLinkage()) { + case Linkage::Strong: + OS << 'S'; + break; + case Linkage::Weak: + OS << 'W'; + break; + } + switch (Sym.getScope()) { + case Scope::Default: + OS << 'D'; + break; + case Scope::Hidden: + OS << 'H'; + break; + case Scope::Local: + OS << 'L'; + break; + } + OS << (Sym.isLive() ? '+' : '-') + << ", size = " << formatv("{0:x8}", Sym.getSize()) + << ", addr = " << formatv("{0:x16}", Sym.getAddress()) << " (" + << formatv("{0:x16}", Sym.getAddressable().getAddress()) << " + " + << formatv("{0:x8}", Sym.getOffset()); + if (Sym.isDefined()) + OS << " " << Sym.getBlock().getSection().getName(); + OS << ")>"; return OS; } -void printEdge(raw_ostream &OS, const Atom &FixupAtom, const Edge &E, +void printEdge(raw_ostream &OS, const Block &B, const Edge &E, StringRef EdgeKindName) { - OS << "edge@" << formatv("{0:x16}", FixupAtom.getAddress() + E.getOffset()) - << ": " << FixupAtom << " + " << E.getOffset() << " -- " << EdgeKindName - << " -> " << E.getTarget() << " + " << E.getAddend(); + OS << "edge@" << formatv("{0:x16}", B.getAddress() + E.getOffset()) << ": " + << formatv("{0:x16}", B.getAddress()) << " + " << E.getOffset() << " -- " + << EdgeKindName << " -> " << E.getTarget() << " + " << E.getAddend(); } Section::~Section() { - for (auto *DA : DefinedAtoms) - DA->~DefinedAtom(); + for (auto *Sym : Symbols) + Sym->~Symbol(); } -void AtomGraph::dump(raw_ostream &OS, +LinkGraph::~LinkGraph() { + // Destroy blocks. + for (auto *B : Blocks) + B->~Block(); +} + +void LinkGraph::dump(raw_ostream &OS, std::function EdgeKindToName) { if (!EdgeKindToName) EdgeKindToName = [](Edge::Kind K) { return StringRef(); }; - OS << "Defined atoms:\n"; - for (auto *DA : defined_atoms()) { - OS << " " << format("0x%016" PRIx64, DA->getAddress()) << ": " << *DA + OS << "Symbols:\n"; + for (auto *Sym : defined_symbols()) { + OS << " " << format("0x%016" PRIx64, Sym->getAddress()) << ": " << *Sym << "\n"; - for (auto &E : DA->edges()) { - OS << " "; - StringRef EdgeName = (E.getKind() < Edge::FirstRelocation - ? getGenericEdgeKindName(E.getKind()) - : EdgeKindToName(E.getKind())); - - if (!EdgeName.empty()) - printEdge(OS, *DA, E, EdgeName); - else { - auto EdgeNumberString = std::to_string(E.getKind()); - printEdge(OS, *DA, E, EdgeNumberString); + if (Sym->isDefined()) { + for (auto &E : Sym->getBlock().edges()) { + OS << " "; + StringRef EdgeName = (E.getKind() < Edge::FirstRelocation + ? 
getGenericEdgeKindName(E.getKind()) + : EdgeKindToName(E.getKind())); + + if (!EdgeName.empty()) + printEdge(OS, Sym->getBlock(), E, EdgeName); + else { + auto EdgeNumberString = std::to_string(E.getKind()); + printEdge(OS, Sym->getBlock(), E, EdgeNumberString); + } + OS << "\n"; } - OS << "\n"; } } - OS << "Absolute atoms:\n"; - for (auto *A : absolute_atoms()) - OS << " " << format("0x%016" PRIx64, A->getAddress()) << ": " << *A + OS << "Absolute symbols:\n"; + for (auto *Sym : absolute_symbols()) + OS << " " << format("0x%016" PRIx64, Sym->getAddress()) << ": " << *Sym << "\n"; - OS << "External atoms:\n"; - for (auto *A : external_atoms()) - OS << " " << format("0x%016" PRIx64, A->getAddress()) << ": " << *A + OS << "External symbols:\n"; + for (auto *Sym : external_symbols()) + OS << " " << format("0x%016" PRIx64, Sym->getAddress()) << ": " << *Sym << "\n"; } +void JITLinkAsyncLookupContinuation::anchor() {} + JITLinkContext::~JITLinkContext() {} bool JITLinkContext::shouldAddDefaultTargetPasses(const Triple &TT) const { return true; } -AtomGraphPassFunction JITLinkContext::getMarkLivePass(const Triple &TT) const { - return AtomGraphPassFunction(); +LinkGraphPassFunction JITLinkContext::getMarkLivePass(const Triple &TT) const { + return LinkGraphPassFunction(); } Error JITLinkContext::modifyPassConfig(const Triple &TT, @@ -152,9 +208,9 @@ Error JITLinkContext::modifyPassConfig(const Triple &TT, return Error::success(); } -Error markAllAtomsLive(AtomGraph &G) { - for (auto *DA : G.defined_atoms()) - DA->setLive(true); +Error markAllSymbolsLive(LinkGraph &G) { + for (auto *Sym : G.defined_symbols()) + Sym->setLive(true); return Error::success(); } diff --git a/lib/ExecutionEngine/JITLink/JITLinkGeneric.cpp b/lib/ExecutionEngine/JITLink/JITLinkGeneric.cpp index 96e074da122b..d4270b5aa796 100644 --- a/lib/ExecutionEngine/JITLink/JITLinkGeneric.cpp +++ b/lib/ExecutionEngine/JITLink/JITLinkGeneric.cpp @@ -11,7 +11,6 @@ //===----------------------------------------------------------------------===// #include "JITLinkGeneric.h" -#include "EHFrameSupportImpl.h" #include "llvm/Support/BinaryStreamReader.h" #include "llvm/Support/MemoryBuffer.h" @@ -25,7 +24,7 @@ JITLinkerBase::~JITLinkerBase() {} void JITLinkerBase::linkPhase1(std::unique_ptr Self) { - // Build the atom graph. + // Build the link graph. if (auto GraphOrErr = buildGraph(Ctx->getObjectBuffer())) G = std::move(*GraphOrErr); else @@ -33,33 +32,33 @@ void JITLinkerBase::linkPhase1(std::unique_ptr Self) { assert(G && "Graph should have been created by buildGraph above"); // Prune and optimize the graph. - if (auto Err = runPasses(Passes.PrePrunePasses, *G)) + if (auto Err = runPasses(Passes.PrePrunePasses)) return Ctx->notifyFailed(std::move(Err)); LLVM_DEBUG({ - dbgs() << "Atom graph \"" << G->getName() << "\" pre-pruning:\n"; + dbgs() << "Link graph \"" << G->getName() << "\" pre-pruning:\n"; dumpGraph(dbgs()); }); prune(*G); LLVM_DEBUG({ - dbgs() << "Atom graph \"" << G->getName() << "\" post-pruning:\n"; + dbgs() << "Link graph \"" << G->getName() << "\" post-pruning:\n"; dumpGraph(dbgs()); }); // Run post-pruning passes. - if (auto Err = runPasses(Passes.PostPrunePasses, *G)) + if (auto Err = runPasses(Passes.PostPrunePasses)) return Ctx->notifyFailed(std::move(Err)); - // Sort atoms into segments. - layOutAtoms(); + // Sort blocks into segments. + auto Layout = layOutBlocks(); // Allocate memory for segments. 
if (auto Err = allocateSegments(Layout)) return Ctx->notifyFailed(std::move(Err)); - // Notify client that the defined atoms have been assigned addresses. + // Notify client that the defined symbols have been assigned addresses. Ctx->notifyResolved(*G); auto ExternalSymbols = getExternalSymbolNames(); @@ -74,42 +73,42 @@ void JITLinkerBase::linkPhase1(std::unique_ptr Self) { // [Self=std::move(Self)](Expected Result) { // Self->linkPhase2(std::move(Self), std::move(Result)); // }); - // - // FIXME: Use move capture once we have c++14. auto *TmpCtx = Ctx.get(); - auto *UnownedSelf = Self.release(); - auto Phase2Continuation = - [UnownedSelf](Expected LookupResult) { - std::unique_ptr Self(UnownedSelf); - UnownedSelf->linkPhase2(std::move(Self), std::move(LookupResult)); - }; - TmpCtx->lookup(std::move(ExternalSymbols), std::move(Phase2Continuation)); + TmpCtx->lookup(std::move(ExternalSymbols), + createLookupContinuation( + [S = std::move(Self), L = std::move(Layout)]( + Expected LookupResult) mutable { + auto &TmpSelf = *S; + TmpSelf.linkPhase2(std::move(S), std::move(LookupResult), + std::move(L)); + })); } void JITLinkerBase::linkPhase2(std::unique_ptr Self, - Expected LR) { + Expected LR, + SegmentLayoutMap Layout) { // If the lookup failed, bail out. if (!LR) return deallocateAndBailOut(LR.takeError()); - // Assign addresses to external atoms. + // Assign addresses to external addressables. applyLookupResult(*LR); LLVM_DEBUG({ - dbgs() << "Atom graph \"" << G->getName() << "\" before copy-and-fixup:\n"; + dbgs() << "Link graph \"" << G->getName() << "\" before copy-and-fixup:\n"; dumpGraph(dbgs()); }); - // Copy atom content to working memory and fix up. - if (auto Err = copyAndFixUpAllAtoms(Layout, *Alloc)) + // Copy block content to working memory and fix up. + if (auto Err = copyAndFixUpBlocks(Layout, *Alloc)) return deallocateAndBailOut(std::move(Err)); LLVM_DEBUG({ - dbgs() << "Atom graph \"" << G->getName() << "\" after copy-and-fixup:\n"; + dbgs() << "Link graph \"" << G->getName() << "\" after copy-and-fixup:\n"; dumpGraph(dbgs()); }); - if (auto Err = runPasses(Passes.PostFixupPasses, *G)) + if (auto Err = runPasses(Passes.PostFixupPasses)) return deallocateAndBailOut(std::move(Err)); // FIXME: Use move capture once we have c++14. @@ -128,82 +127,38 @@ void JITLinkerBase::linkPhase3(std::unique_ptr Self, Error Err) { Ctx->notifyFinalized(std::move(Alloc)); } -Error JITLinkerBase::runPasses(AtomGraphPassList &Passes, AtomGraph &G) { +Error JITLinkerBase::runPasses(LinkGraphPassList &Passes) { for (auto &P : Passes) - if (auto Err = P(G)) + if (auto Err = P(*G)) return Err; return Error::success(); } -void JITLinkerBase::layOutAtoms() { - // Group sections by protections, and whether or not they're zero-fill. - for (auto &S : G->sections()) { +JITLinkerBase::SegmentLayoutMap JITLinkerBase::layOutBlocks() { - // Skip empty sections. - if (S.atoms_empty()) - continue; + SegmentLayoutMap Layout; - auto &SL = Layout[S.getProtectionFlags()]; - if (S.isZeroFill()) - SL.ZeroFillSections.push_back(SegmentLayout::SectionLayout(S)); + /// Partition blocks based on permissions and content vs. zero-fill. + for (auto *B : G->blocks()) { + auto &SegLists = Layout[B->getSection().getProtectionFlags()]; + if (!B->isZeroFill()) + SegLists.ContentBlocks.push_back(B); else - SL.ContentSections.push_back(SegmentLayout::SectionLayout(S)); + SegLists.ZeroFillBlocks.push_back(B); } - // Sort sections within the layout by ordinal. 
- { - auto CompareByOrdinal = [](const SegmentLayout::SectionLayout &LHS, - const SegmentLayout::SectionLayout &RHS) { - return LHS.S->getSectionOrdinal() < RHS.S->getSectionOrdinal(); + /// Sort blocks within each list. + for (auto &KV : Layout) { + + auto CompareBlocks = [](const Block *LHS, const Block *RHS) { + if (LHS->getSection().getOrdinal() != RHS->getSection().getOrdinal()) + return LHS->getSection().getOrdinal() < RHS->getSection().getOrdinal(); + return LHS->getOrdinal() < RHS->getOrdinal(); }; - for (auto &KV : Layout) { - auto &SL = KV.second; - std::sort(SL.ContentSections.begin(), SL.ContentSections.end(), - CompareByOrdinal); - std::sort(SL.ZeroFillSections.begin(), SL.ZeroFillSections.end(), - CompareByOrdinal); - } - } - // Add atoms to the sections. - for (auto &KV : Layout) { - auto &SL = KV.second; - for (auto *SIList : {&SL.ContentSections, &SL.ZeroFillSections}) { - for (auto &SI : *SIList) { - // First build the set of layout-heads (i.e. "heads" of layout-next - // chains) by copying the section atoms, then eliminating any that - // appear as layout-next targets. - DenseSet LayoutHeads; - for (auto *DA : SI.S->atoms()) - LayoutHeads.insert(DA); - - for (auto *DA : SI.S->atoms()) - if (DA->hasLayoutNext()) - LayoutHeads.erase(&DA->getLayoutNext()); - - // Next, sort the layout heads by address order. - std::vector OrderedLayoutHeads; - OrderedLayoutHeads.reserve(LayoutHeads.size()); - for (auto *DA : LayoutHeads) - OrderedLayoutHeads.push_back(DA); - - // Now sort the list of layout heads by address. - std::sort(OrderedLayoutHeads.begin(), OrderedLayoutHeads.end(), - [](const DefinedAtom *LHS, const DefinedAtom *RHS) { - return LHS->getAddress() < RHS->getAddress(); - }); - - // Now populate the SI.Atoms field by appending each of the chains. - for (auto *DA : OrderedLayoutHeads) { - SI.Atoms.push_back(DA); - while (DA->hasLayoutNext()) { - auto &Next = DA->getLayoutNext(); - SI.Atoms.push_back(&Next); - DA = &Next; - } - } - } - } + auto &SegLists = KV.second; + llvm::sort(SegLists.ContentBlocks, CompareBlocks); + llvm::sort(SegLists.ZeroFillBlocks, CompareBlocks); } LLVM_DEBUG({ @@ -213,18 +168,16 @@ void JITLinkerBase::layOutAtoms() { << static_cast(KV.first) << ":\n"; auto &SL = KV.second; for (auto &SIEntry : - {std::make_pair(&SL.ContentSections, "content sections"), - std::make_pair(&SL.ZeroFillSections, "zero-fill sections")}) { - auto &SIList = *SIEntry.first; + {std::make_pair(&SL.ContentBlocks, "content block"), + std::make_pair(&SL.ZeroFillBlocks, "zero-fill block")}) { dbgs() << " " << SIEntry.second << ":\n"; - for (auto &SI : SIList) { - dbgs() << " " << SI.S->getName() << ":\n"; - for (auto *DA : SI.Atoms) - dbgs() << " " << *DA << "\n"; - } + for (auto *B : *SIEntry.first) + dbgs() << " " << *B << "\n"; } } }); + + return Layout; } Error JITLinkerBase::allocateSegments(const SegmentLayoutMap &Layout) { @@ -234,74 +187,36 @@ Error JITLinkerBase::allocateSegments(const SegmentLayoutMap &Layout) { JITLinkMemoryManager::SegmentsRequestMap Segments; for (auto &KV : Layout) { auto &Prot = KV.first; - auto &SegLayout = KV.second; + auto &SegLists = KV.second; + + uint64_t SegAlign = 1; // Calculate segment content size. size_t SegContentSize = 0; - for (auto &SI : SegLayout.ContentSections) { - assert(!SI.S->atoms_empty() && "Sections in layout must not be empty"); - assert(!SI.Atoms.empty() && "Section layouts must not be empty"); - - // Bump to section alignment before processing atoms. 
- SegContentSize = alignTo(SegContentSize, SI.S->getAlignment()); - - for (auto *DA : SI.Atoms) { - SegContentSize = alignTo(SegContentSize, DA->getAlignment()); - SegContentSize += DA->getSize(); - } + for (auto *B : SegLists.ContentBlocks) { + SegAlign = std::max(SegAlign, B->getAlignment()); + SegContentSize = alignToBlock(SegContentSize, *B); + SegContentSize += B->getSize(); } - // Get segment content alignment. - unsigned SegContentAlign = 1; - if (!SegLayout.ContentSections.empty()) { - auto &FirstContentSection = SegLayout.ContentSections.front(); - SegContentAlign = - std::max(FirstContentSection.S->getAlignment(), - FirstContentSection.Atoms.front()->getAlignment()); - } + uint64_t SegZeroFillStart = SegContentSize; + uint64_t SegZeroFillEnd = SegZeroFillStart; - // Calculate segment zero-fill size. - uint64_t SegZeroFillSize = 0; - for (auto &SI : SegLayout.ZeroFillSections) { - assert(!SI.S->atoms_empty() && "Sections in layout must not be empty"); - assert(!SI.Atoms.empty() && "Section layouts must not be empty"); - - // Bump to section alignment before processing atoms. - SegZeroFillSize = alignTo(SegZeroFillSize, SI.S->getAlignment()); - - for (auto *DA : SI.Atoms) { - SegZeroFillSize = alignTo(SegZeroFillSize, DA->getAlignment()); - SegZeroFillSize += DA->getSize(); - } - } - - // Calculate segment zero-fill alignment. - uint32_t SegZeroFillAlign = 1; - - if (!SegLayout.ZeroFillSections.empty()) { - auto &FirstZeroFillSection = SegLayout.ZeroFillSections.front(); - SegZeroFillAlign = - std::max(FirstZeroFillSection.S->getAlignment(), - FirstZeroFillSection.Atoms.front()->getAlignment()); + for (auto *B : SegLists.ZeroFillBlocks) { + SegAlign = std::max(SegAlign, B->getAlignment()); + SegZeroFillEnd = alignToBlock(SegZeroFillEnd, *B); + SegZeroFillEnd += B->getSize(); } - if (SegContentSize == 0) - SegContentAlign = SegZeroFillAlign; - - if (SegContentAlign % SegZeroFillAlign != 0) - return make_error("First content atom alignment does not " - "accommodate first zero-fill atom " - "alignment"); - - Segments[Prot] = {SegContentSize, SegContentAlign, SegZeroFillSize, - SegZeroFillAlign}; + Segments[Prot] = {SegAlign, SegContentSize, + SegZeroFillEnd - SegZeroFillStart}; LLVM_DEBUG({ dbgs() << (&KV == &*Layout.begin() ? "" : "; ") - << static_cast(Prot) << ": " - << SegContentSize << " content bytes (alignment " - << SegContentAlign << ") + " << SegZeroFillSize - << " zero-fill bytes (alignment " << SegZeroFillAlign << ")"; + << static_cast(Prot) + << ": alignment = " << SegAlign + << ", content size = " << SegContentSize + << ", zero-fill size = " << (SegZeroFillEnd - SegZeroFillStart); }); } LLVM_DEBUG(dbgs() << " }\n"); @@ -320,22 +235,19 @@ Error JITLinkerBase::allocateSegments(const SegmentLayoutMap &Layout) { } }); - // Update atom target addresses. + // Update block target addresses. 
for (auto &KV : Layout) { auto &Prot = KV.first; auto &SL = KV.second; - JITTargetAddress AtomTargetAddr = + JITTargetAddress NextBlockAddr = Alloc->getTargetMemory(static_cast(Prot)); - for (auto *SIList : {&SL.ContentSections, &SL.ZeroFillSections}) - for (auto &SI : *SIList) { - AtomTargetAddr = alignTo(AtomTargetAddr, SI.S->getAlignment()); - for (auto *DA : SI.Atoms) { - AtomTargetAddr = alignTo(AtomTargetAddr, DA->getAlignment()); - DA->setAddress(AtomTargetAddr); - AtomTargetAddr += DA->getSize(); - } + for (auto *SIList : {&SL.ContentBlocks, &SL.ZeroFillBlocks}) + for (auto *B : *SIList) { + NextBlockAddr = alignToBlock(NextBlockAddr, *B); + B->setAddress(NextBlockAddr); + NextBlockAddr += B->getSize(); } } @@ -343,34 +255,35 @@ Error JITLinkerBase::allocateSegments(const SegmentLayoutMap &Layout) { } DenseSet JITLinkerBase::getExternalSymbolNames() const { - // Identify unresolved external atoms. + // Identify unresolved external symbols. DenseSet UnresolvedExternals; - for (auto *DA : G->external_atoms()) { - assert(DA->getAddress() == 0 && + for (auto *Sym : G->external_symbols()) { + assert(Sym->getAddress() == 0 && "External has already been assigned an address"); - assert(DA->getName() != StringRef() && DA->getName() != "" && + assert(Sym->getName() != StringRef() && Sym->getName() != "" && "Externals must be named"); - UnresolvedExternals.insert(DA->getName()); + UnresolvedExternals.insert(Sym->getName()); } return UnresolvedExternals; } void JITLinkerBase::applyLookupResult(AsyncLookupResult Result) { - for (auto &KV : Result) { - Atom &A = G->getAtomByName(KV.first); - assert(A.getAddress() == 0 && "Atom already resolved"); - A.setAddress(KV.second.getAddress()); + for (auto *Sym : G->external_symbols()) { + assert(Sym->getAddress() == 0 && "Symbol already resolved"); + assert(!Sym->isDefined() && "Symbol being resolved is already defined"); + assert(Result.count(Sym->getName()) && "Missing resolution for symbol"); + Sym->getAddressable().setAddress(Result[Sym->getName()].getAddress()); } LLVM_DEBUG({ dbgs() << "Externals after applying lookup result:\n"; - for (auto *A : G->external_atoms()) - dbgs() << " " << A->getName() << ": " - << formatv("{0:x16}", A->getAddress()) << "\n"; + for (auto *Sym : G->external_symbols()) + dbgs() << " " << Sym->getName() << ": " + << formatv("{0:x16}", Sym->getAddress()) << "\n"; }); - assert(llvm::all_of(G->external_atoms(), - [](Atom *A) { return A->getAddress() != 0; }) && - "All atoms should have been resolved by this point"); + assert(llvm::all_of(G->external_symbols(), + [](Symbol *Sym) { return Sym->getAddress() != 0; }) && + "All symbols should have been resolved by this point"); } void JITLinkerBase::deallocateAndBailOut(Error Err) { @@ -384,96 +297,60 @@ void JITLinkerBase::dumpGraph(raw_ostream &OS) { G->dump(dbgs(), [this](Edge::Kind K) { return getEdgeKindName(K); }); } -void prune(AtomGraph &G) { - std::vector Worklist; - DenseMap> EdgesToUpdate; - - // Build the initial worklist from all atoms initially live. - for (auto *DA : G.defined_atoms()) { - if (!DA->isLive() || DA->shouldDiscard()) - continue; - - for (auto &E : DA->edges()) { - if (!E.getTarget().isDefined()) - continue; +void prune(LinkGraph &G) { + std::vector Worklist; + DenseSet VisitedBlocks; - auto &EDT = static_cast(E.getTarget()); + // Build the initial worklist from all symbols initially live. 
+ for (auto *Sym : G.defined_symbols()) + if (Sym->isLive()) + Worklist.push_back(Sym); - if (EDT.shouldDiscard()) - EdgesToUpdate[&EDT].push_back(&E); - else if (E.isKeepAlive() && !EDT.isLive()) - Worklist.push_back(&EDT); - } - } - - // Propagate live flags to all atoms reachable from the initial live set. + // Propagate live flags to all symbols reachable from the initial live set. while (!Worklist.empty()) { - DefinedAtom &NextLive = *Worklist.back(); + auto *Sym = Worklist.back(); Worklist.pop_back(); - assert(!NextLive.shouldDiscard() && - "should-discard nodes should never make it into the worklist"); + auto &B = Sym->getBlock(); - // If this atom has already been marked as live, or is marked to be - // discarded, then skip it. - if (NextLive.isLive()) + // Skip addressables that we've visited before. + if (VisitedBlocks.count(&B)) continue; - // Otherwise set it as live and add any non-live atoms that it points to - // to the worklist. - NextLive.setLive(true); - - for (auto &E : NextLive.edges()) { - if (!E.getTarget().isDefined()) - continue; + VisitedBlocks.insert(&B); - auto &EDT = static_cast(E.getTarget()); - - if (EDT.shouldDiscard()) - EdgesToUpdate[&EDT].push_back(&E); - else if (E.isKeepAlive() && !EDT.isLive()) - Worklist.push_back(&EDT); + for (auto &E : Sym->getBlock().edges()) { + if (E.getTarget().isDefined() && !E.getTarget().isLive()) { + E.getTarget().setLive(true); + Worklist.push_back(&E.getTarget()); + } } } - // Collect atoms to remove, then remove them from the graph. - std::vector AtomsToRemove; - for (auto *DA : G.defined_atoms()) - if (DA->shouldDiscard() || !DA->isLive()) - AtomsToRemove.push_back(DA); - - LLVM_DEBUG(dbgs() << "Pruning atoms:\n"); - for (auto *DA : AtomsToRemove) { - LLVM_DEBUG(dbgs() << " " << *DA << "... "); - - // Check whether we need to replace this atom with an external atom. - // - // We replace if all of the following hold: - // (1) The atom is marked should-discard, - // (2) it has live edges (i.e. edges from live atoms) pointing to it. - // - // Otherwise we simply delete the atom. - - G.removeDefinedAtom(*DA); - - auto EdgesToUpdateItr = EdgesToUpdate.find(DA); - if (EdgesToUpdateItr != EdgesToUpdate.end()) { - auto &ExternalReplacement = G.addExternalAtom(DA->getName()); - for (auto *EdgeToUpdate : EdgesToUpdateItr->second) - EdgeToUpdate->setTarget(ExternalReplacement); - LLVM_DEBUG(dbgs() << "replaced with " << ExternalReplacement << "\n"); - } else - LLVM_DEBUG(dbgs() << "deleted\n"); + // Collect all the symbols to remove, then remove them. + { + LLVM_DEBUG(dbgs() << "Dead-stripping symbols:\n"); + std::vector SymbolsToRemove; + for (auto *Sym : G.defined_symbols()) + if (!Sym->isLive()) + SymbolsToRemove.push_back(Sym); + for (auto *Sym : SymbolsToRemove) { + LLVM_DEBUG(dbgs() << " " << *Sym << "...\n"); + G.removeDefinedSymbol(*Sym); + } } - // Finally, discard any absolute symbols that were marked should-discard. + // Delete any unused blocks. 
{ - std::vector AbsoluteAtomsToRemove; - for (auto *A : G.absolute_atoms()) - if (A->shouldDiscard() || A->isLive()) - AbsoluteAtomsToRemove.push_back(A); - for (auto *A : AbsoluteAtomsToRemove) - G.removeAbsoluteAtom(*A); + LLVM_DEBUG(dbgs() << "Dead-stripping blocks:\n"); + std::vector BlocksToRemove; + for (auto *B : G.blocks()) + if (!VisitedBlocks.count(B)) + BlocksToRemove.push_back(B); + for (auto *B : BlocksToRemove) { + LLVM_DEBUG(dbgs() << " " << *B << "...\n"); + G.removeBlock(*B); + } } } diff --git a/lib/ExecutionEngine/JITLink/JITLinkGeneric.h b/lib/ExecutionEngine/JITLink/JITLinkGeneric.h index e6fd6e38f7a6..07dee6cee200 100644 --- a/lib/ExecutionEngine/JITLink/JITLinkGeneric.h +++ b/lib/ExecutionEngine/JITLink/JITLinkGeneric.h @@ -41,39 +41,32 @@ public: protected: struct SegmentLayout { - using SectionAtomsList = std::vector; - struct SectionLayout { - SectionLayout(Section &S) : S(&S) {} + using BlocksList = std::vector; - Section *S; - SectionAtomsList Atoms; - }; - - using SectionLayoutList = std::vector; - - SectionLayoutList ContentSections; - SectionLayoutList ZeroFillSections; + BlocksList ContentBlocks; + BlocksList ZeroFillBlocks; }; using SegmentLayoutMap = DenseMap; // Phase 1: - // 1.1: Build atom graph + // 1.1: Build link graph // 1.2: Run pre-prune passes // 1.2: Prune graph // 1.3: Run post-prune passes - // 1.4: Sort atoms into segments + // 1.4: Sort blocks into segments // 1.5: Allocate segment memory // 1.6: Identify externals and make an async call to resolve function void linkPhase1(std::unique_ptr Self); // Phase 2: // 2.1: Apply resolution results - // 2.2: Fix up atom contents + // 2.2: Fix up block contents // 2.3: Call OnResolved callback // 2.3: Make an async call to transfer and finalize memory. void linkPhase2(std::unique_ptr Self, - Expected LookupResult); + Expected LookupResult, + SegmentLayoutMap Layout); // Phase 3: // 3.1: Call OnFinalized callback, handing off allocation. @@ -81,24 +74,37 @@ protected: // Build a graph from the given object buffer. // To be implemented by the client. - virtual Expected> + virtual Expected> buildGraph(MemoryBufferRef ObjBuffer) = 0; - // For debug dumping of the atom graph. + // For debug dumping of the link graph. virtual StringRef getEdgeKindName(Edge::Kind K) const = 0; + // Alight a JITTargetAddress to conform with block alignment requirements. + static JITTargetAddress alignToBlock(JITTargetAddress Addr, Block &B) { + uint64_t Delta = (B.getAlignmentOffset() - Addr) % B.getAlignment(); + return Addr + Delta; + } + + // Alight a pointer to conform with block alignment requirements. + static char *alignToBlock(char *P, Block &B) { + uint64_t PAddr = static_cast(reinterpret_cast(P)); + uint64_t Delta = (B.getAlignmentOffset() - PAddr) % B.getAlignment(); + return P + Delta; + } + private: // Run all passes in the given pass list, bailing out immediately if any pass // returns an error. - Error runPasses(AtomGraphPassList &Passes, AtomGraph &G); + Error runPasses(LinkGraphPassList &Passes); - // Copy atom contents and apply relocations. + // Copy block contents and apply relocations. // Implemented in JITLinker. 
virtual Error - copyAndFixUpAllAtoms(const SegmentLayoutMap &Layout, - JITLinkMemoryManager::Allocation &Alloc) const = 0; + copyAndFixUpBlocks(const SegmentLayoutMap &Layout, + JITLinkMemoryManager::Allocation &Alloc) const = 0; - void layOutAtoms(); + SegmentLayoutMap layOutBlocks(); Error allocateSegments(const SegmentLayoutMap &Layout); DenseSet getExternalSymbolNames() const; void applyLookupResult(AsyncLookupResult LR); @@ -108,8 +114,7 @@ private: std::unique_ptr Ctx; PassConfiguration Passes; - std::unique_ptr G; - SegmentLayoutMap Layout; + std::unique_ptr G; std::unique_ptr Alloc; }; @@ -121,7 +126,7 @@ public: /// Link should be called with the constructor arguments for LinkerImpl, which /// will be forwarded to the constructor. template static void link(ArgTs &&... Args) { - auto L = llvm::make_unique(std::forward(Args)...); + auto L = std::make_unique(std::forward(Args)...); // Ownership of the linker is passed into the linker's doLink function to // allow it to be passed on to async continuations. @@ -140,17 +145,17 @@ private: } Error - copyAndFixUpAllAtoms(const SegmentLayoutMap &Layout, - JITLinkMemoryManager::Allocation &Alloc) const override { - LLVM_DEBUG(dbgs() << "Copying and fixing up atoms:\n"); + copyAndFixUpBlocks(const SegmentLayoutMap &Layout, + JITLinkMemoryManager::Allocation &Alloc) const override { + LLVM_DEBUG(dbgs() << "Copying and fixing up blocks:\n"); for (auto &KV : Layout) { auto &Prot = KV.first; auto &SegLayout = KV.second; auto SegMem = Alloc.getWorkingMemory( static_cast(Prot)); - char *LastAtomEnd = SegMem.data(); - char *AtomDataPtr = LastAtomEnd; + char *LastBlockEnd = SegMem.data(); + char *BlockDataPtr = LastBlockEnd; LLVM_DEBUG({ dbgs() << " Processing segment " @@ -160,93 +165,79 @@ private: << " ]\n Processing content sections:\n"; }); - for (auto &SI : SegLayout.ContentSections) { - LLVM_DEBUG(dbgs() << " " << SI.S->getName() << ":\n"); + for (auto *B : SegLayout.ContentBlocks) { + LLVM_DEBUG(dbgs() << " " << *B << ":\n"); + + // Pad to alignment/alignment-offset. + BlockDataPtr = alignToBlock(BlockDataPtr, *B); - AtomDataPtr += alignmentAdjustment(AtomDataPtr, SI.S->getAlignment()); + LLVM_DEBUG({ + dbgs() << " Bumped block pointer to " + << (const void *)BlockDataPtr << " to meet block alignment " + << B->getAlignment() << " and alignment offset " + << B->getAlignmentOffset() << "\n"; + }); + // Zero pad up to alignment. LLVM_DEBUG({ - dbgs() << " Bumped atom pointer to " << (const void *)AtomDataPtr - << " to meet section alignment " - << " of " << SI.S->getAlignment() << "\n"; + if (LastBlockEnd != BlockDataPtr) + dbgs() << " Zero padding from " << (const void *)LastBlockEnd + << " to " << (const void *)BlockDataPtr << "\n"; }); - for (auto *DA : SI.Atoms) { - - // Align. - AtomDataPtr += alignmentAdjustment(AtomDataPtr, DA->getAlignment()); - LLVM_DEBUG({ - dbgs() << " Bumped atom pointer to " - << (const void *)AtomDataPtr << " to meet alignment of " - << DA->getAlignment() << "\n"; - }); - - // Zero pad up to alignment. - LLVM_DEBUG({ - if (LastAtomEnd != AtomDataPtr) - dbgs() << " Zero padding from " << (const void *)LastAtomEnd - << " to " << (const void *)AtomDataPtr << "\n"; - }); - while (LastAtomEnd != AtomDataPtr) - *LastAtomEnd++ = 0; - - // Copy initial atom content. 
- LLVM_DEBUG({ - dbgs() << " Copying atom " << *DA << " content, " - << DA->getContent().size() << " bytes, from " - << (const void *)DA->getContent().data() << " to " - << (const void *)AtomDataPtr << "\n"; - }); - memcpy(AtomDataPtr, DA->getContent().data(), DA->getContent().size()); - - // Copy atom data and apply fixups. - LLVM_DEBUG(dbgs() << " Applying fixups.\n"); - for (auto &E : DA->edges()) { - - // Skip non-relocation edges. - if (!E.isRelocation()) - continue; - - // Dispatch to LinkerImpl for fixup. - if (auto Err = impl().applyFixup(*DA, E, AtomDataPtr)) - return Err; - } - - // Point the atom's content to the fixed up buffer. - DA->setContent(StringRef(AtomDataPtr, DA->getContent().size())); - - // Update atom end pointer. - LastAtomEnd = AtomDataPtr + DA->getContent().size(); - AtomDataPtr = LastAtomEnd; + while (LastBlockEnd != BlockDataPtr) + *LastBlockEnd++ = 0; + + // Copy initial block content. + LLVM_DEBUG({ + dbgs() << " Copying block " << *B << " content, " + << B->getContent().size() << " bytes, from " + << (const void *)B->getContent().data() << " to " + << (const void *)BlockDataPtr << "\n"; + }); + memcpy(BlockDataPtr, B->getContent().data(), B->getContent().size()); + + // Copy Block data and apply fixups. + LLVM_DEBUG(dbgs() << " Applying fixups.\n"); + for (auto &E : B->edges()) { + + // Skip non-relocation edges. + if (!E.isRelocation()) + continue; + + // Dispatch to LinkerImpl for fixup. + if (auto Err = impl().applyFixup(*B, E, BlockDataPtr)) + return Err; } + + // Point the block's content to the fixed up buffer. + B->setContent(StringRef(BlockDataPtr, B->getContent().size())); + + // Update block end pointer. + LastBlockEnd = BlockDataPtr + B->getContent().size(); + BlockDataPtr = LastBlockEnd; } // Zero pad the rest of the segment. LLVM_DEBUG({ dbgs() << " Zero padding end of segment from " - << (const void *)LastAtomEnd << " to " + << (const void *)LastBlockEnd << " to " << (const void *)((char *)SegMem.data() + SegMem.size()) << "\n"; }); - while (LastAtomEnd != SegMem.data() + SegMem.size()) - *LastAtomEnd++ = 0; + while (LastBlockEnd != SegMem.data() + SegMem.size()) + *LastBlockEnd++ = 0; } return Error::success(); } }; -/// Dead strips and replaces discarded definitions with external atoms. +/// Removes dead symbols/blocks/addressables. /// -/// Finds the set of nodes reachable from any node initially marked live -/// (nodes marked should-discard are treated as not live, even if they are -/// reachable). All nodes not marked as live at the end of this process, -/// are deleted. Nodes that are live, but marked should-discard are replaced -/// with external atoms and all edges to them are re-written. -void prune(AtomGraph &G); - -Error addEHFrame(AtomGraph &G, Section &EHFrameSection, - StringRef EHFrameContent, JITTargetAddress EHFrameAddress, - Edge::Kind FDEToCIERelocKind, Edge::Kind FDEToTargetRelocKind); +/// Finds the set of symbols and addressables reachable from any symbol +/// initially marked live. All symbols/addressables not marked live at the end +/// of this process are removed. 
+void prune(LinkGraph &G); } // end namespace jitlink } // end namespace llvm diff --git a/lib/ExecutionEngine/JITLink/JITLinkMemoryManager.cpp b/lib/ExecutionEngine/JITLink/JITLinkMemoryManager.cpp index 267307cfde05..9e0d207e8bdb 100644 --- a/lib/ExecutionEngine/JITLink/JITLinkMemoryManager.cpp +++ b/lib/ExecutionEngine/JITLink/JITLinkMemoryManager.cpp @@ -38,9 +38,21 @@ InProcessMemoryManager::allocate(const SegmentsRequestMap &Request) { OnFinalize(applyProtections()); } Error deallocate() override { - for (auto &KV : SegBlocks) - if (auto EC = sys::Memory::releaseMappedMemory(KV.second)) - return errorCodeToError(EC); + if (SegBlocks.empty()) + return Error::success(); + void *SlabStart = SegBlocks.begin()->second.base(); + char *SlabEnd = (char *)SlabStart; + for (auto &KV : SegBlocks) { + SlabStart = std::min(SlabStart, KV.second.base()); + SlabEnd = std::max(SlabEnd, (char *)(KV.second.base()) + + KV.second.allocatedSize()); + } + size_t SlabSize = SlabEnd - (char *)SlabStart; + assert((SlabSize % sys::Process::getPageSizeEstimate()) == 0 && + "Slab size is not a multiple of page size"); + sys::MemoryBlock Slab(SlabStart, SlabSize); + if (auto EC = sys::Memory::releaseMappedMemory(Slab)) + return errorCodeToError(EC); return Error::success(); } @@ -61,37 +73,52 @@ InProcessMemoryManager::allocate(const SegmentsRequestMap &Request) { AllocationMap SegBlocks; }; + if (!isPowerOf2_64((uint64_t)sys::Process::getPageSizeEstimate())) + return make_error("Page size is not a power of 2", + inconvertibleErrorCode()); + AllocationMap Blocks; const sys::Memory::ProtectionFlags ReadWrite = static_cast(sys::Memory::MF_READ | sys::Memory::MF_WRITE); + // Compute the total number of pages to allocate. + size_t TotalSize = 0; for (auto &KV : Request) { - auto &Seg = KV.second; + const auto &Seg = KV.second; - if (Seg.getContentAlignment() > sys::Process::getPageSizeEstimate()) + if (Seg.getAlignment() > sys::Process::getPageSizeEstimate()) return make_error("Cannot request higher than page " "alignment", inconvertibleErrorCode()); - if (sys::Process::getPageSizeEstimate() % Seg.getContentAlignment() != 0) - return make_error("Page size is not a multiple of " - "alignment", - inconvertibleErrorCode()); + TotalSize = alignTo(TotalSize, sys::Process::getPageSizeEstimate()); + TotalSize += Seg.getContentSize(); + TotalSize += Seg.getZeroFillSize(); + } + + // Allocate one slab to cover all the segments. + std::error_code EC; + auto SlabRemaining = + sys::Memory::allocateMappedMemory(TotalSize, nullptr, ReadWrite, EC); + + if (EC) + return errorCodeToError(EC); + + // Allocate segment memory from the slab. + for (auto &KV : Request) { - uint64_t ZeroFillStart = - alignTo(Seg.getContentSize(), Seg.getZeroFillAlignment()); - uint64_t SegmentSize = ZeroFillStart + Seg.getZeroFillSize(); + const auto &Seg = KV.second; - std::error_code EC; - auto SegMem = - sys::Memory::allocateMappedMemory(SegmentSize, nullptr, ReadWrite, EC); + uint64_t SegmentSize = alignTo(Seg.getContentSize() + Seg.getZeroFillSize(), + sys::Process::getPageSizeEstimate()); - if (EC) - return errorCodeToError(EC); + sys::MemoryBlock SegMem(SlabRemaining.base(), SegmentSize); + SlabRemaining = sys::MemoryBlock((char *)SlabRemaining.base() + SegmentSize, + SegmentSize); // Zero out the zero-fill memory. - memset(static_cast(SegMem.base()) + ZeroFillStart, 0, + memset(static_cast(SegMem.base()) + Seg.getContentSize(), 0, Seg.getZeroFillSize()); // Record the block for this segment. 
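A minimal standalone sketch of the slab-size arithmetic relied on by the allocate() hunk above, assuming a 4096-byte page and two hypothetical segment requests (the real code queries sys::Process::getPageSizeEstimate() and takes sizes from the SegmentsRequestMap): each request is rounded up to a page boundary before the next one is added, one slab of TotalSize bytes is mapped, and segments are then carved off the front of the slab.

// Sketch only: mirrors the page-rounding arithmetic of the single-slab
// allocator above with plain integers instead of sys::Memory, so it can be
// checked in isolation. Page size and segment sizes are made-up examples.
#include <cstdint>
#include <cstdio>
#include <vector>

static uint64_t alignTo(uint64_t Value, uint64_t Align) {
  // Round Value up to the next multiple of Align.
  return (Value + Align - 1) / Align * Align;
}

int main() {
  const uint64_t PageSize = 4096; // assumed; the patch asks the OS instead

  struct SegmentRequest {
    const char *Name;
    uint64_t ContentSize;
    uint64_t ZeroFillSize;
  };
  // Hypothetical read/exec and read/write segment requests.
  const std::vector<SegmentRequest> Request = {
      {"RX", 0x1a40, 0}, {"RW", 0x200, 0x3000}};

  // Mirrors the TotalSize loop: bump to a page boundary, then add the
  // segment's content and zero-fill bytes.
  uint64_t TotalSize = 0;
  for (const auto &Seg : Request) {
    TotalSize = alignTo(TotalSize, PageSize);
    TotalSize += Seg.ContentSize;
    TotalSize += Seg.ZeroFillSize;
  }
  std::printf("slab request: %#llx bytes\n", (unsigned long long)TotalSize);

  // Mirrors the carve-out loop: each segment takes a page-aligned chunk off
  // the front of the slab; zero-fill memory starts right after the content.
  uint64_t Offset = 0;
  for (const auto &Seg : Request) {
    const uint64_t SegmentSize =
        alignTo(Seg.ContentSize + Seg.ZeroFillSize, PageSize);
    std::printf("%s: slab offset %#llx, size %#llx, zero-fill at +%#llx\n",
                Seg.Name, (unsigned long long)Offset,
                (unsigned long long)SegmentSize,
                (unsigned long long)Seg.ContentSize);
    Offset += SegmentSize;
  }
  return 0;
}

Allocating a single contiguous slab is also what lets the new deallocate() above release everything with one releaseMappedMemory call over the [min base, max end) range of the recorded segment blocks, instead of releasing each segment mapping separately.
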
diff --git a/lib/ExecutionEngine/JITLink/MachO.cpp b/lib/ExecutionEngine/JITLink/MachO.cpp index 15995b8ce98f..58bc0f56e155 100644 --- a/lib/ExecutionEngine/JITLink/MachO.cpp +++ b/lib/ExecutionEngine/JITLink/MachO.cpp @@ -14,6 +14,7 @@ #include "llvm/ExecutionEngine/JITLink/MachO.h" #include "llvm/BinaryFormat/MachO.h" +#include "llvm/ExecutionEngine/JITLink/MachO_arm64.h" #include "llvm/ExecutionEngine/JITLink/MachO_x86_64.h" #include "llvm/Support/Endian.h" #include "llvm/Support/Format.h" @@ -64,6 +65,8 @@ void jitLink_MachO(std::unique_ptr Ctx) { }); switch (Header.cputype) { + case MachO::CPU_TYPE_ARM64: + return jitLink_MachO_arm64(std::move(Ctx)); case MachO::CPU_TYPE_X86_64: return jitLink_MachO_x86_64(std::move(Ctx)); } diff --git a/lib/ExecutionEngine/JITLink/MachOAtomGraphBuilder.cpp b/lib/ExecutionEngine/JITLink/MachOAtomGraphBuilder.cpp deleted file mode 100644 index 1501c7ad0bc5..000000000000 --- a/lib/ExecutionEngine/JITLink/MachOAtomGraphBuilder.cpp +++ /dev/null @@ -1,411 +0,0 @@ -//=--------- MachOAtomGraphBuilder.cpp - MachO AtomGraph builder ----------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// Generic MachO AtomGraph buliding code. -// -//===----------------------------------------------------------------------===// - -#include "MachOAtomGraphBuilder.h" - -#define DEBUG_TYPE "jitlink" - -namespace llvm { -namespace jitlink { - -MachOAtomGraphBuilder::~MachOAtomGraphBuilder() {} - -Expected> MachOAtomGraphBuilder::buildGraph() { - if (auto Err = parseSections()) - return std::move(Err); - - if (auto Err = addAtoms()) - return std::move(Err); - - if (auto Err = addRelocations()) - return std::move(Err); - - return std::move(G); -} - -MachOAtomGraphBuilder::MachOAtomGraphBuilder(const object::MachOObjectFile &Obj) - : Obj(Obj), - G(llvm::make_unique(Obj.getFileName(), getPointerSize(Obj), - getEndianness(Obj))) {} - -void MachOAtomGraphBuilder::addCustomAtomizer(StringRef SectionName, - CustomAtomizeFunction Atomizer) { - assert(!CustomAtomizeFunctions.count(SectionName) && - "Custom atomizer for this section already exists"); - CustomAtomizeFunctions[SectionName] = std::move(Atomizer); -} - -bool MachOAtomGraphBuilder::areLayoutLocked(const Atom &A, const Atom &B) { - // If these atoms are the same then they're trivially "locked". - if (&A == &B) - return true; - - // If A and B are different, check whether either is undefined. (in which - // case they are not locked). - if (!A.isDefined() || !B.isDefined()) - return false; - - // A and B are different, but they're both defined atoms. We need to check - // whether they're part of the same alt_entry chain. - auto &DA = static_cast(A); - auto &DB = static_cast(B); - - auto AStartItr = AltEntryStarts.find(&DA); - if (AStartItr == AltEntryStarts.end()) // If A is not in a chain bail out. - return false; - - auto BStartItr = AltEntryStarts.find(&DB); - if (BStartItr == AltEntryStarts.end()) // If B is not in a chain bail out. - return false; - - // A and B are layout locked if they're in the same chain. - return AStartItr->second == BStartItr->second; -} - -unsigned -MachOAtomGraphBuilder::getPointerSize(const object::MachOObjectFile &Obj) { - return Obj.is64Bit() ? 
8 : 4; -} - -support::endianness -MachOAtomGraphBuilder::getEndianness(const object::MachOObjectFile &Obj) { - return Obj.isLittleEndian() ? support::little : support::big; -} - -MachOAtomGraphBuilder::MachOSection &MachOAtomGraphBuilder::getCommonSection() { - if (!CommonSymbolsSection) { - auto Prot = static_cast( - sys::Memory::MF_READ | sys::Memory::MF_WRITE); - auto &GenericSection = G->createSection("", 1, Prot, true); - CommonSymbolsSection = MachOSection(GenericSection); - } - return *CommonSymbolsSection; -} - -Error MachOAtomGraphBuilder::parseSections() { - for (auto &SecRef : Obj.sections()) { - assert((SecRef.getAlignment() <= std::numeric_limits::max()) && - "Section alignment does not fit in 32 bits"); - - StringRef Name; - if (auto EC = SecRef.getName(Name)) - return errorCodeToError(EC); - - unsigned SectionIndex = SecRef.getIndex() + 1; - - uint32_t Align = SecRef.getAlignment(); - if (!isPowerOf2_32(Align)) - return make_error("Section " + Name + - " has non-power-of-2 " - "alignment"); - - // FIXME: Get real section permissions - // How, exactly, on MachO? - sys::Memory::ProtectionFlags Prot; - if (SecRef.isText()) - Prot = static_cast(sys::Memory::MF_READ | - sys::Memory::MF_EXEC); - else - Prot = static_cast(sys::Memory::MF_READ | - sys::Memory::MF_WRITE); - - auto &GenericSection = G->createSection(Name, Align, Prot, SecRef.isBSS()); - - LLVM_DEBUG({ - dbgs() << "Adding section " << Name << ": " - << format("0x%016" PRIx64, SecRef.getAddress()) - << ", align: " << SecRef.getAlignment() << "\n"; - }); - - assert(!Sections.count(SectionIndex) && "Section index already in use"); - - auto &MachOSec = - Sections - .try_emplace(SectionIndex, GenericSection, SecRef.getAddress(), - SecRef.getAlignment()) - .first->second; - - if (!SecRef.isVirtual()) { - // If this section has content then record it. - Expected Content = SecRef.getContents(); - if (!Content) - return Content.takeError(); - if (Content->size() != SecRef.getSize()) - return make_error("Section content size does not match " - "declared size for " + - Name); - MachOSec.setContent(*Content); - } else { - // If this is a zero-fill section then just record the size. - MachOSec.setZeroFill(SecRef.getSize()); - } - - uint32_t SectionFlags = - Obj.is64Bit() ? Obj.getSection64(SecRef.getRawDataRefImpl()).flags - : Obj.getSection(SecRef.getRawDataRefImpl()).flags; - - MachOSec.setNoDeadStrip(SectionFlags & MachO::S_ATTR_NO_DEAD_STRIP); - } - - return Error::success(); -} - -// Adds atoms with identified start addresses (but not lengths) for all named -// atoms. -// Also, for every section that contains named atoms, but does not have an -// atom at offset zero of that section, constructs an anonymous atom covering -// that range. -Error MachOAtomGraphBuilder::addNonCustomAtoms() { - using AddrToAtomMap = std::map; - DenseMap SecToAtoms; - - DenseMap FirstOrdinal; - std::vector AltEntryAtoms; - - DenseSet ProcessedSymbols; // Used to check for duplicate defs. - - for (auto SymI = Obj.symbol_begin(), SymE = Obj.symbol_end(); SymI != SymE; - ++SymI) { - object::SymbolRef Sym(SymI->getRawDataRefImpl(), &Obj); - - auto Name = Sym.getName(); - if (!Name) - return Name.takeError(); - - // Bail out on duplicate definitions: There should never be more than one - // definition for a symbol in a given object file. 
- if (ProcessedSymbols.count(*Name)) - return make_error("Duplicate definition within object: " + - *Name); - else - ProcessedSymbols.insert(*Name); - - auto Addr = Sym.getAddress(); - if (!Addr) - return Addr.takeError(); - - auto SymType = Sym.getType(); - if (!SymType) - return SymType.takeError(); - - auto Flags = Sym.getFlags(); - - if (Flags & object::SymbolRef::SF_Undefined) { - LLVM_DEBUG(dbgs() << "Adding undef atom \"" << *Name << "\"\n"); - G->addExternalAtom(*Name); - continue; - } else if (Flags & object::SymbolRef::SF_Absolute) { - LLVM_DEBUG(dbgs() << "Adding absolute \"" << *Name << "\" addr: " - << format("0x%016" PRIx64, *Addr) << "\n"); - auto &A = G->addAbsoluteAtom(*Name, *Addr); - A.setGlobal(Flags & object::SymbolRef::SF_Global); - A.setExported(Flags & object::SymbolRef::SF_Exported); - A.setWeak(Flags & object::SymbolRef::SF_Weak); - continue; - } else if (Flags & object::SymbolRef::SF_Common) { - LLVM_DEBUG({ - dbgs() << "Adding common \"" << *Name - << "\" addr: " << format("0x%016" PRIx64, *Addr) << "\n"; - }); - auto &A = - G->addCommonAtom(getCommonSection().getGenericSection(), *Name, *Addr, - std::max(Sym.getAlignment(), 1U), - Obj.getCommonSymbolSize(Sym.getRawDataRefImpl())); - A.setGlobal(Flags & object::SymbolRef::SF_Global); - A.setExported(Flags & object::SymbolRef::SF_Exported); - continue; - } - - LLVM_DEBUG(dbgs() << "Adding defined atom \"" << *Name << "\"\n"); - - // This atom is neither undefined nor absolute, so it must be defined in - // this object. Get its section index. - auto SecItr = Sym.getSection(); - if (!SecItr) - return SecItr.takeError(); - - uint64_t SectionIndex = (*SecItr)->getIndex() + 1; - - LLVM_DEBUG(dbgs() << " to section index " << SectionIndex << "\n"); - - auto SecByIndexItr = Sections.find(SectionIndex); - if (SecByIndexItr == Sections.end()) - return make_error("Unrecognized section index in macho"); - - auto &Sec = SecByIndexItr->second; - - auto &DA = G->addDefinedAtom(Sec.getGenericSection(), *Name, *Addr, - std::max(Sym.getAlignment(), 1U)); - - DA.setGlobal(Flags & object::SymbolRef::SF_Global); - DA.setExported(Flags & object::SymbolRef::SF_Exported); - DA.setWeak(Flags & object::SymbolRef::SF_Weak); - - DA.setCallable(*SymType & object::SymbolRef::ST_Function); - - // Check NDesc flags. - { - uint16_t NDesc = 0; - if (Obj.is64Bit()) - NDesc = Obj.getSymbol64TableEntry(SymI->getRawDataRefImpl()).n_desc; - else - NDesc = Obj.getSymbolTableEntry(SymI->getRawDataRefImpl()).n_desc; - - // Record atom for alt-entry post-processing (where the layout-next - // constraints will be added). - if (NDesc & MachO::N_ALT_ENTRY) - AltEntryAtoms.push_back(&DA); - - // If this atom has a no-dead-strip attr attached then mark it live. - if (NDesc & MachO::N_NO_DEAD_STRIP) - DA.setLive(true); - } - - LLVM_DEBUG({ - dbgs() << " Added " << *Name - << " addr: " << format("0x%016" PRIx64, *Addr) - << ", align: " << DA.getAlignment() - << ", section: " << Sec.getGenericSection().getName() << "\n"; - }); - - auto &SecAtoms = SecToAtoms[&Sec]; - SecAtoms[DA.getAddress() - Sec.getAddress()] = &DA; - } - - // Add anonymous atoms. - for (auto &KV : Sections) { - auto &S = KV.second; - - // Skip empty sections. - if (S.empty()) - continue; - - // Skip sections with custom handling. - if (CustomAtomizeFunctions.count(S.getName())) - continue; - - auto SAI = SecToAtoms.find(&S); - - // If S is not in the SecToAtoms map then it contained no named atom. Add - // one anonymous atom to cover the whole section. 
- if (SAI == SecToAtoms.end()) { - SecToAtoms[&S][0] = &G->addAnonymousAtom( - S.getGenericSection(), S.getAddress(), S.getAlignment()); - continue; - } - - // Otherwise, check whether this section had an atom covering offset zero. - // If not, add one. - auto &SecAtoms = SAI->second; - if (!SecAtoms.count(0)) - SecAtoms[0] = &G->addAnonymousAtom(S.getGenericSection(), S.getAddress(), - S.getAlignment()); - } - - LLVM_DEBUG(dbgs() << "MachOGraphBuilder setting atom content\n"); - - // Set atom contents and any section-based flags. - for (auto &KV : SecToAtoms) { - auto &S = *KV.first; - auto &SecAtoms = KV.second; - - // Iterate the atoms in reverse order and set up their contents. - JITTargetAddress LastAtomAddr = S.getSize(); - for (auto I = SecAtoms.rbegin(), E = SecAtoms.rend(); I != E; ++I) { - auto Offset = I->first; - auto &A = *I->second; - LLVM_DEBUG({ - dbgs() << " " << A << " to [ " << S.getAddress() + Offset << " .. " - << S.getAddress() + LastAtomAddr << " ]\n"; - }); - - if (S.isZeroFill()) - A.setZeroFill(LastAtomAddr - Offset); - else - A.setContent(S.getContent().substr(Offset, LastAtomAddr - Offset)); - - // If the section has no-dead-strip set then mark the atom as live. - if (S.isNoDeadStrip()) - A.setLive(true); - - LastAtomAddr = Offset; - } - } - - LLVM_DEBUG(dbgs() << "Adding alt-entry starts\n"); - - // Sort alt-entry atoms by address in ascending order. - llvm::sort(AltEntryAtoms.begin(), AltEntryAtoms.end(), - [](const DefinedAtom *LHS, const DefinedAtom *RHS) { - return LHS->getAddress() < RHS->getAddress(); - }); - - // Process alt-entry atoms in address order to build the table of alt-entry - // atoms to alt-entry chain starts. - for (auto *DA : AltEntryAtoms) { - assert(!AltEntryStarts.count(DA) && "Duplicate entry in AltEntryStarts"); - - // DA is an alt-entry atom. Look for the predecessor atom that it is locked - // to, bailing out if we do not find one. - auto AltEntryPred = G->findAtomByAddress(DA->getAddress() - 1); - if (!AltEntryPred) - return AltEntryPred.takeError(); - - // Add a LayoutNext edge from the predecessor to this atom. - AltEntryPred->setLayoutNext(*DA); - - // Check to see whether the predecessor itself is an alt-entry atom. - auto AltEntryStartItr = AltEntryStarts.find(&*AltEntryPred); - if (AltEntryStartItr != AltEntryStarts.end()) { - // If the predecessor was an alt-entry atom then re-use its value. - LLVM_DEBUG({ - dbgs() << " " << *DA << " -> " << *AltEntryStartItr->second - << " (based on existing entry for " << *AltEntryPred << ")\n"; - }); - AltEntryStarts[DA] = AltEntryStartItr->second; - } else { - // If the predecessor does not have an entry then add an entry for this - // atom (i.e. the alt_entry atom) and a self-reference entry for the - /// predecessory atom that is the start of this chain. - LLVM_DEBUG({ - dbgs() << " " << *AltEntryPred << " -> " << *AltEntryPred << "\n" - << " " << *DA << " -> " << *AltEntryPred << "\n"; - }); - AltEntryStarts[&*AltEntryPred] = &*AltEntryPred; - AltEntryStarts[DA] = &*AltEntryPred; - } - } - - return Error::success(); -} - -Error MachOAtomGraphBuilder::addAtoms() { - // Add all named atoms. - if (auto Err = addNonCustomAtoms()) - return Err; - - // Process special sections. 
- for (auto &KV : Sections) { - auto &S = KV.second; - auto HI = CustomAtomizeFunctions.find(S.getGenericSection().getName()); - if (HI != CustomAtomizeFunctions.end()) { - auto &Atomize = HI->second; - if (auto Err = Atomize(S)) - return Err; - } - } - - return Error::success(); -} - -} // end namespace jitlink -} // end namespace llvm diff --git a/lib/ExecutionEngine/JITLink/MachOAtomGraphBuilder.h b/lib/ExecutionEngine/JITLink/MachOAtomGraphBuilder.h deleted file mode 100644 index 72d441b24d06..000000000000 --- a/lib/ExecutionEngine/JITLink/MachOAtomGraphBuilder.h +++ /dev/null @@ -1,138 +0,0 @@ -//===----- MachOAtomGraphBuilder.h - MachO AtomGraph builder ----*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// Generic MachO AtomGraph building code. -// -//===----------------------------------------------------------------------===// - -#ifndef LIB_EXECUTIONENGINE_JITLINK_MACHOATOMGRAPHBUILDER_H -#define LIB_EXECUTIONENGINE_JITLINK_MACHOATOMGRAPHBUILDER_H - -#include "llvm/ExecutionEngine/JITLink/JITLink.h" - -#include "JITLinkGeneric.h" - -#include "llvm/Object/MachO.h" - -namespace llvm { -namespace jitlink { - -class MachOAtomGraphBuilder { -public: - virtual ~MachOAtomGraphBuilder(); - Expected> buildGraph(); - -protected: - using OffsetToAtomMap = std::map; - - class MachOSection { - public: - MachOSection() = default; - - /// Create a MachO section with the given address and alignment. - MachOSection(Section &GenericSection, JITTargetAddress Address, - unsigned Alignment) - : Address(Address), GenericSection(&GenericSection), - Alignment(Alignment) {} - - /// Create a section without address, content or size (used for common - /// symbol sections). 
- MachOSection(Section &GenericSection) : GenericSection(&GenericSection) {} - - Section &getGenericSection() const { - assert(GenericSection && "Section is null"); - return *GenericSection; - } - - StringRef getName() const { - assert(GenericSection && "No generic section attached"); - return GenericSection->getName(); - } - - MachOSection &setContent(StringRef Content) { - assert(!ContentPtr && !Size && "Content/zeroFill already set"); - ContentPtr = Content.data(); - Size = Content.size(); - return *this; - } - - MachOSection &setZeroFill(uint64_t Size) { - assert(!ContentPtr && !this->Size && "Content/zeroFill already set"); - this->Size = Size; - return *this; - } - - bool isZeroFill() const { return !ContentPtr; } - - bool empty() const { return getSize() == 0; } - - size_t getSize() const { return Size; } - - StringRef getContent() const { - assert(ContentPtr && "getContent() called on zero-fill section"); - return {ContentPtr, static_cast(Size)}; - } - - JITTargetAddress getAddress() const { return Address; } - - unsigned getAlignment() const { return Alignment; } - - MachOSection &setNoDeadStrip(bool NoDeadStrip) { - this->NoDeadStrip = NoDeadStrip; - return *this; - } - - bool isNoDeadStrip() const { return NoDeadStrip; } - - private: - JITTargetAddress Address = 0; - Section *GenericSection = nullptr; - const char *ContentPtr = nullptr; - uint64_t Size = 0; - unsigned Alignment = 0; - bool NoDeadStrip = false; - }; - - using CustomAtomizeFunction = std::function; - - MachOAtomGraphBuilder(const object::MachOObjectFile &Obj); - - AtomGraph &getGraph() const { return *G; } - - const object::MachOObjectFile &getObject() const { return Obj; } - - void addCustomAtomizer(StringRef SectionName, CustomAtomizeFunction Atomizer); - - virtual Error addRelocations() = 0; - - /// Returns true if Atom A and Atom B are at a fixed offset from one another - /// (i.e. if they're part of the same alt-entry chain). - bool areLayoutLocked(const Atom &A, const Atom &B); - -private: - static unsigned getPointerSize(const object::MachOObjectFile &Obj); - static support::endianness getEndianness(const object::MachOObjectFile &Obj); - - MachOSection &getCommonSection(); - - Error parseSections(); - Error addNonCustomAtoms(); - Error addAtoms(); - - const object::MachOObjectFile &Obj; - std::unique_ptr G; - DenseMap AltEntryStarts; - DenseMap Sections; - StringMap CustomAtomizeFunctions; - Optional CommonSymbolsSection; -}; - -} // end namespace jitlink -} // end namespace llvm - -#endif // LIB_EXECUTIONENGINE_JITLINK_MACHOATOMGRAPHBUILDER_H diff --git a/lib/ExecutionEngine/JITLink/MachOLinkGraphBuilder.cpp b/lib/ExecutionEngine/JITLink/MachOLinkGraphBuilder.cpp new file mode 100644 index 000000000000..7366f53ebf36 --- /dev/null +++ b/lib/ExecutionEngine/JITLink/MachOLinkGraphBuilder.cpp @@ -0,0 +1,535 @@ +//=--------- MachOLinkGraphBuilder.cpp - MachO LinkGraph builder ----------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Generic MachO LinkGraph buliding code. 
+// +//===----------------------------------------------------------------------===// + +#include "MachOLinkGraphBuilder.h" + +#define DEBUG_TYPE "jitlink" + +static const char *CommonSectionName = "__common"; + +namespace llvm { +namespace jitlink { + +MachOLinkGraphBuilder::~MachOLinkGraphBuilder() {} + +Expected> MachOLinkGraphBuilder::buildGraph() { + + // Sanity check: we only operate on relocatable objects. + if (!Obj.isRelocatableObject()) + return make_error("Object is not a relocatable MachO"); + + if (auto Err = createNormalizedSections()) + return std::move(Err); + + if (auto Err = createNormalizedSymbols()) + return std::move(Err); + + if (auto Err = graphifyRegularSymbols()) + return std::move(Err); + + if (auto Err = graphifySectionsWithCustomParsers()) + return std::move(Err); + + if (auto Err = addRelocations()) + return std::move(Err); + + return std::move(G); +} + +MachOLinkGraphBuilder::MachOLinkGraphBuilder(const object::MachOObjectFile &Obj) + : Obj(Obj), + G(std::make_unique(Obj.getFileName(), getPointerSize(Obj), + getEndianness(Obj))) {} + +void MachOLinkGraphBuilder::addCustomSectionParser( + StringRef SectionName, SectionParserFunction Parser) { + assert(!CustomSectionParserFunctions.count(SectionName) && + "Custom parser for this section already exists"); + CustomSectionParserFunctions[SectionName] = std::move(Parser); +} + +Linkage MachOLinkGraphBuilder::getLinkage(uint16_t Desc) { + if ((Desc & MachO::N_WEAK_DEF) || (Desc & MachO::N_WEAK_REF)) + return Linkage::Weak; + return Linkage::Strong; +} + +Scope MachOLinkGraphBuilder::getScope(StringRef Name, uint8_t Type) { + if (Name.startswith("l")) + return Scope::Local; + if (Type & MachO::N_PEXT) + return Scope::Hidden; + if (Type & MachO::N_EXT) + return Scope::Default; + return Scope::Local; +} + +bool MachOLinkGraphBuilder::isAltEntry(const NormalizedSymbol &NSym) { + return NSym.Desc & MachO::N_ALT_ENTRY; +} + +unsigned +MachOLinkGraphBuilder::getPointerSize(const object::MachOObjectFile &Obj) { + return Obj.is64Bit() ? 8 : 4; +} + +support::endianness +MachOLinkGraphBuilder::getEndianness(const object::MachOObjectFile &Obj) { + return Obj.isLittleEndian() ? support::little : support::big; +} + +Section &MachOLinkGraphBuilder::getCommonSection() { + if (!CommonSection) { + auto Prot = static_cast( + sys::Memory::MF_READ | sys::Memory::MF_WRITE); + CommonSection = &G->createSection(CommonSectionName, Prot); + } + return *CommonSection; +} + +Error MachOLinkGraphBuilder::createNormalizedSections() { + // Build normalized sections. Verifies that section data is in-range (for + // sections with content) and that address ranges are non-overlapping. 
+ + LLVM_DEBUG(dbgs() << "Creating normalized sections...\n"); + + for (auto &SecRef : Obj.sections()) { + NormalizedSection NSec; + uint32_t DataOffset = 0; + + auto SecIndex = Obj.getSectionIndex(SecRef.getRawDataRefImpl()); + + auto Name = SecRef.getName(); + if (!Name) + return Name.takeError(); + + if (Obj.is64Bit()) { + const MachO::section_64 &Sec64 = + Obj.getSection64(SecRef.getRawDataRefImpl()); + + NSec.Address = Sec64.addr; + NSec.Size = Sec64.size; + NSec.Alignment = 1ULL << Sec64.align; + NSec.Flags = Sec64.flags; + DataOffset = Sec64.offset; + } else { + const MachO::section &Sec32 = Obj.getSection(SecRef.getRawDataRefImpl()); + NSec.Address = Sec32.addr; + NSec.Size = Sec32.size; + NSec.Alignment = 1ULL << Sec32.align; + NSec.Flags = Sec32.flags; + DataOffset = Sec32.offset; + } + + LLVM_DEBUG({ + dbgs() << " " << *Name << ": " << formatv("{0:x16}", NSec.Address) + << " -- " << formatv("{0:x16}", NSec.Address + NSec.Size) + << ", align: " << NSec.Alignment << ", index: " << SecIndex + << "\n"; + }); + + // Get the section data if any. + { + unsigned SectionType = NSec.Flags & MachO::SECTION_TYPE; + if (SectionType != MachO::S_ZEROFILL && + SectionType != MachO::S_GB_ZEROFILL) { + + if (DataOffset + NSec.Size > Obj.getData().size()) + return make_error( + "Section data extends past end of file"); + + NSec.Data = Obj.getData().data() + DataOffset; + } + } + + // Get prot flags. + // FIXME: Make sure this test is correct (it's probably missing cases + // as-is). + sys::Memory::ProtectionFlags Prot; + if (NSec.Flags & MachO::S_ATTR_PURE_INSTRUCTIONS) + Prot = static_cast(sys::Memory::MF_READ | + sys::Memory::MF_EXEC); + else + Prot = static_cast(sys::Memory::MF_READ | + sys::Memory::MF_WRITE); + + NSec.GraphSection = &G->createSection(*Name, Prot); + IndexToSection.insert(std::make_pair(SecIndex, std::move(NSec))); + } + + std::vector Sections; + Sections.reserve(IndexToSection.size()); + for (auto &KV : IndexToSection) + Sections.push_back(&KV.second); + + // If we didn't end up creating any sections then bail out. The code below + // assumes that we have at least one section. 
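The non-overlap part of the verification follows immediately below: once the normalized sections are sorted by start address, only adjacent pairs can overlap. The same idea in a self-contained sketch (a hypothetical Range type rather than NormalizedSection):

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <vector>

struct Range {
  uint64_t Address;
  uint64_t Size;
};

// After an ascending sort by start address, two ranges overlap exactly when
// the next one starts before the current one ends -- the adjacent-pair test
// used by createNormalizedSections().
bool anyOverlap(std::vector<Range> Rs) {
  std::sort(Rs.begin(), Rs.end(),
            [](const Range &L, const Range &R) { return L.Address < R.Address; });
  for (std::size_t I = 0; I + 1 < Rs.size(); ++I)
    if (Rs[I + 1].Address < Rs[I].Address + Rs[I].Size)
      return true;
  return false;
}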
+ if (Sections.empty()) + return Error::success(); + + llvm::sort(Sections, + [](const NormalizedSection *LHS, const NormalizedSection *RHS) { + assert(LHS && RHS && "Null section?"); + return LHS->Address < RHS->Address; + }); + + for (unsigned I = 0, E = Sections.size() - 1; I != E; ++I) { + auto &Cur = *Sections[I]; + auto &Next = *Sections[I + 1]; + if (Next.Address < Cur.Address + Cur.Size) + return make_error( + "Address range for section " + Cur.GraphSection->getName() + + formatv(" [ {0:x16} -- {1:x16} ] ", Cur.Address, + Cur.Address + Cur.Size) + + "overlaps " + + formatv(" [ {0:x16} -- {1:x16} ] ", Next.Address, + Next.Address + Next.Size)); + } + + return Error::success(); +} + +Error MachOLinkGraphBuilder::createNormalizedSymbols() { + LLVM_DEBUG(dbgs() << "Creating normalized symbols...\n"); + + for (auto &SymRef : Obj.symbols()) { + + unsigned SymbolIndex = Obj.getSymbolIndex(SymRef.getRawDataRefImpl()); + uint64_t Value; + uint32_t NStrX; + uint8_t Type; + uint8_t Sect; + uint16_t Desc; + + if (Obj.is64Bit()) { + const MachO::nlist_64 &NL64 = + Obj.getSymbol64TableEntry(SymRef.getRawDataRefImpl()); + Value = NL64.n_value; + NStrX = NL64.n_strx; + Type = NL64.n_type; + Sect = NL64.n_sect; + Desc = NL64.n_desc; + } else { + const MachO::nlist &NL32 = + Obj.getSymbolTableEntry(SymRef.getRawDataRefImpl()); + Value = NL32.n_value; + NStrX = NL32.n_strx; + Type = NL32.n_type; + Sect = NL32.n_sect; + Desc = NL32.n_desc; + } + + // Skip stabs. + // FIXME: Are there other symbols we should be skipping? + if (Type & MachO::N_STAB) + continue; + + Optional Name; + if (NStrX) { + if (auto NameOrErr = SymRef.getName()) + Name = *NameOrErr; + else + return NameOrErr.takeError(); + } + + LLVM_DEBUG({ + dbgs() << " "; + if (!Name) + dbgs() << ""; + else + dbgs() << *Name; + dbgs() << ": value = " << formatv("{0:x16}", Value) + << ", type = " << formatv("{0:x2}", Type) + << ", desc = " << formatv("{0:x4}", Desc) << ", sect = "; + if (Sect) + dbgs() << static_cast(Sect - 1); + else + dbgs() << "none"; + dbgs() << "\n"; + }); + + // If this symbol has a section, sanity check that the addresses line up. + NormalizedSection *NSec = nullptr; + if (Sect != 0) { + if (auto NSecOrErr = findSectionByIndex(Sect - 1)) + NSec = &*NSecOrErr; + else + return NSecOrErr.takeError(); + + if (Value < NSec->Address || Value > NSec->Address + NSec->Size) + return make_error("Symbol address does not fall within " + "section"); + } + + IndexToSymbol[SymbolIndex] = + &createNormalizedSymbol(*Name, Value, Type, Sect, Desc, + getLinkage(Type), getScope(*Name, Type)); + } + + return Error::success(); +} + +void MachOLinkGraphBuilder::addSectionStartSymAndBlock( + Section &GraphSec, uint64_t Address, const char *Data, uint64_t Size, + uint32_t Alignment, bool IsLive) { + Block &B = + Data ? G->createContentBlock(GraphSec, StringRef(Data, Size), Address, + Alignment, 0) + : G->createZeroFillBlock(GraphSec, Size, Address, Alignment, 0); + auto &Sym = G->addAnonymousSymbol(B, 0, Size, false, IsLive); + assert(!AddrToCanonicalSymbol.count(Sym.getAddress()) && + "Anonymous block start symbol clashes with existing symbol address"); + AddrToCanonicalSymbol[Sym.getAddress()] = &Sym; +} + +Error MachOLinkGraphBuilder::graphifyRegularSymbols() { + + LLVM_DEBUG(dbgs() << "Creating graph symbols...\n"); + + /// We only have 256 section indexes: Use a vector rather than a map. 
+ std::vector> SecIndexToSymbols; + SecIndexToSymbols.resize(256); + + // Create commons, externs, and absolutes, and partition all other symbols by + // section. + for (auto &KV : IndexToSymbol) { + auto &NSym = *KV.second; + + switch (NSym.Type & MachO::N_TYPE) { + case MachO::N_UNDF: + if (NSym.Value) { + if (!NSym.Name) + return make_error("Anonymous common symbol at index " + + Twine(KV.first)); + NSym.GraphSymbol = &G->addCommonSymbol( + *NSym.Name, NSym.S, getCommonSection(), NSym.Value, 0, + 1ull << MachO::GET_COMM_ALIGN(NSym.Desc), + NSym.Desc & MachO::N_NO_DEAD_STRIP); + } else { + if (!NSym.Name) + return make_error("Anonymous external symbol at " + "index " + + Twine(KV.first)); + NSym.GraphSymbol = &G->addExternalSymbol(*NSym.Name, 0); + } + break; + case MachO::N_ABS: + if (!NSym.Name) + return make_error("Anonymous absolute symbol at index " + + Twine(KV.first)); + NSym.GraphSymbol = &G->addAbsoluteSymbol( + *NSym.Name, NSym.Value, 0, Linkage::Strong, Scope::Default, + NSym.Desc & MachO::N_NO_DEAD_STRIP); + break; + case MachO::N_SECT: + SecIndexToSymbols[NSym.Sect - 1].push_back(&NSym); + break; + case MachO::N_PBUD: + return make_error( + "Unupported N_PBUD symbol " + + (NSym.Name ? ("\"" + *NSym.Name + "\"") : Twine("")) + + " at index " + Twine(KV.first)); + case MachO::N_INDR: + return make_error( + "Unupported N_INDR symbol " + + (NSym.Name ? ("\"" + *NSym.Name + "\"") : Twine("")) + + " at index " + Twine(KV.first)); + default: + return make_error( + "Unrecognized symbol type " + Twine(NSym.Type & MachO::N_TYPE) + + " for symbol " + + (NSym.Name ? ("\"" + *NSym.Name + "\"") : Twine("")) + + " at index " + Twine(KV.first)); + } + } + + // Loop over sections performing regular graphification for those that + // don't have custom parsers. + for (auto &KV : IndexToSection) { + auto SecIndex = KV.first; + auto &NSec = KV.second; + + // Skip sections with custom parsers. + if (CustomSectionParserFunctions.count(NSec.GraphSection->getName())) { + LLVM_DEBUG({ + dbgs() << " Skipping section " << NSec.GraphSection->getName() + << " as it has a custom parser.\n"; + }); + continue; + } else + LLVM_DEBUG({ + dbgs() << " Processing section " << NSec.GraphSection->getName() + << "...\n"; + }); + + bool SectionIsNoDeadStrip = NSec.Flags & MachO::S_ATTR_NO_DEAD_STRIP; + bool SectionIsText = NSec.Flags & MachO::S_ATTR_PURE_INSTRUCTIONS; + + auto &SecNSymStack = SecIndexToSymbols[SecIndex]; + + // If this section is non-empty but there are no symbols covering it then + // create one block and anonymous symbol to cover the entire section. + if (SecNSymStack.empty()) { + if (NSec.Size > 0) { + LLVM_DEBUG({ + dbgs() << " Section non-empty, but contains no symbols. " + "Creating anonymous block to cover " + << formatv("{0:x16}", NSec.Address) << " -- " + << formatv("{0:x16}", NSec.Address + NSec.Size) << "\n"; + }); + addSectionStartSymAndBlock(*NSec.GraphSection, NSec.Address, NSec.Data, + NSec.Size, NSec.Alignment, + SectionIsNoDeadStrip); + } else + LLVM_DEBUG({ + dbgs() << " Section empty and contains no symbols. Skipping.\n"; + }); + continue; + } + + // Sort the symbol stack in by address, alt-entry status, scope, and name. + // We sort in reverse order so that symbols will be visited in the right + // order when we pop off the stack below. 
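The effect of the reverse sort below is easiest to see in isolation: sorting in descending address order means pop_back() later hands the symbols back in ascending address order. A minimal sketch with plain integers standing in for symbol addresses (the real comparator additionally breaks ties on alt-entry status, scope, and name):

#include <algorithm>
#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  std::vector<uint64_t> Stack = {0x10, 0x40, 0x20, 0x30};

  // Descending ("reverse") sort, as in the llvm::sort call that follows.
  std::sort(Stack.begin(), Stack.end(),
            [](uint64_t L, uint64_t R) { return L > R; });

  // Popping off the back now visits the addresses in ascending order:
  // 0x10, 0x20, 0x30, 0x40.
  while (!Stack.empty()) {
    std::cout << std::hex << Stack.back() << "\n";
    Stack.pop_back();
  }
}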
+ llvm::sort(SecNSymStack, [](const NormalizedSymbol *LHS, + const NormalizedSymbol *RHS) { + if (LHS->Value != RHS->Value) + return LHS->Value > RHS->Value; + if (isAltEntry(*LHS) != isAltEntry(*RHS)) + return isAltEntry(*RHS); + if (LHS->S != RHS->S) + return static_cast(LHS->S) < static_cast(RHS->S); + return LHS->Name < RHS->Name; + }); + + // The first symbol in a section can not be an alt-entry symbol. + if (!SecNSymStack.empty() && isAltEntry(*SecNSymStack.back())) + return make_error( + "First symbol in " + NSec.GraphSection->getName() + " is alt-entry"); + + // If the section is non-empty but there is no symbol covering the start + // address then add an anonymous one. + if (SecNSymStack.back()->Value != NSec.Address) { + auto AnonBlockSize = SecNSymStack.back()->Value - NSec.Address; + LLVM_DEBUG({ + dbgs() << " Section start not covered by symbol. " + << "Creating anonymous block to cover [ " + << formatv("{0:x16}", NSec.Address) << " -- " + << formatv("{0:x16}", NSec.Address + AnonBlockSize) << " ]\n"; + }); + addSectionStartSymAndBlock(*NSec.GraphSection, NSec.Address, NSec.Data, + AnonBlockSize, NSec.Alignment, + SectionIsNoDeadStrip); + } + + // Visit section symbols in order by popping off the reverse-sorted stack, + // building blocks for each alt-entry chain and creating symbols as we go. + while (!SecNSymStack.empty()) { + SmallVector BlockSyms; + + BlockSyms.push_back(SecNSymStack.back()); + SecNSymStack.pop_back(); + while (!SecNSymStack.empty() && + (isAltEntry(*SecNSymStack.back()) || + SecNSymStack.back()->Value == BlockSyms.back()->Value)) { + BlockSyms.push_back(SecNSymStack.back()); + SecNSymStack.pop_back(); + } + + // BlockNSyms now contains the block symbols in reverse canonical order. + JITTargetAddress BlockStart = BlockSyms.front()->Value; + JITTargetAddress BlockEnd = SecNSymStack.empty() + ? NSec.Address + NSec.Size + : SecNSymStack.back()->Value; + JITTargetAddress BlockOffset = BlockStart - NSec.Address; + JITTargetAddress BlockSize = BlockEnd - BlockStart; + + LLVM_DEBUG({ + dbgs() << " Creating block for " << formatv("{0:x16}", BlockStart) + << " -- " << formatv("{0:x16}", BlockEnd) << ": " + << NSec.GraphSection->getName() << " + " + << formatv("{0:x16}", BlockOffset) << " with " + << BlockSyms.size() << " symbol(s)...\n"; + }); + + Block &B = + NSec.Data + ? G->createContentBlock( + *NSec.GraphSection, + StringRef(NSec.Data + BlockOffset, BlockSize), BlockStart, + NSec.Alignment, BlockStart % NSec.Alignment) + : G->createZeroFillBlock(*NSec.GraphSection, BlockSize, + BlockStart, NSec.Alignment, + BlockStart % NSec.Alignment); + + Optional LastCanonicalAddr; + JITTargetAddress SymEnd = BlockEnd; + while (!BlockSyms.empty()) { + auto &NSym = *BlockSyms.back(); + BlockSyms.pop_back(); + + bool SymLive = + (NSym.Desc & MachO::N_NO_DEAD_STRIP) || SectionIsNoDeadStrip; + + LLVM_DEBUG({ + dbgs() << " " << formatv("{0:x16}", NSym.Value) << " -- " + << formatv("{0:x16}", SymEnd) << ": "; + if (!NSym.Name) + dbgs() << ""; + else + dbgs() << NSym.Name; + if (SymLive) + dbgs() << " [no-dead-strip]"; + if (LastCanonicalAddr == NSym.Value) + dbgs() << " [non-canonical]"; + dbgs() << "\n"; + }); + + auto &Sym = + NSym.Name + ? 
G->addDefinedSymbol(B, NSym.Value - BlockStart, *NSym.Name, + SymEnd - NSym.Value, NSym.L, NSym.S, + SectionIsText, SymLive) + : G->addAnonymousSymbol(B, NSym.Value - BlockStart, + SymEnd - NSym.Value, SectionIsText, + SymLive); + NSym.GraphSymbol = &Sym; + if (LastCanonicalAddr != Sym.getAddress()) { + if (LastCanonicalAddr) + SymEnd = *LastCanonicalAddr; + LastCanonicalAddr = Sym.getAddress(); + setCanonicalSymbol(Sym); + } + } + } + } + + return Error::success(); +} + +Error MachOLinkGraphBuilder::graphifySectionsWithCustomParsers() { + // Graphify special sections. + for (auto &KV : IndexToSection) { + auto &NSec = KV.second; + + auto HI = CustomSectionParserFunctions.find(NSec.GraphSection->getName()); + if (HI != CustomSectionParserFunctions.end()) { + auto &Parse = HI->second; + if (auto Err = Parse(NSec)) + return Err; + } + } + + return Error::success(); +} + +} // end namespace jitlink +} // end namespace llvm diff --git a/lib/ExecutionEngine/JITLink/MachOLinkGraphBuilder.h b/lib/ExecutionEngine/JITLink/MachOLinkGraphBuilder.h new file mode 100644 index 000000000000..e1123cd11048 --- /dev/null +++ b/lib/ExecutionEngine/JITLink/MachOLinkGraphBuilder.h @@ -0,0 +1,269 @@ +//===----- MachOLinkGraphBuilder.h - MachO LinkGraph builder ----*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Generic MachO LinkGraph building code. +// +//===----------------------------------------------------------------------===// + +#ifndef LIB_EXECUTIONENGINE_JITLINK_MACHOLINKGRAPHBUILDER_H +#define LIB_EXECUTIONENGINE_JITLINK_MACHOLINKGRAPHBUILDER_H + +#include "llvm/ExecutionEngine/JITLink/JITLink.h" + +#include "EHFrameSupportImpl.h" +#include "JITLinkGeneric.h" +#include "llvm/Object/MachO.h" + +#include + +namespace llvm { +namespace jitlink { + +class MachOLinkGraphBuilder { +public: + virtual ~MachOLinkGraphBuilder(); + Expected> buildGraph(); + +protected: + class MachOEHFrameBinaryParser : public EHFrameBinaryParser { + public: + MachOEHFrameBinaryParser(MachOLinkGraphBuilder &Builder, + JITTargetAddress EHFrameAddress, + StringRef EHFrameContent, Section &EHFrameSection, + uint64_t CIEAlignment, uint64_t FDEAlignment, + Edge::Kind FDEToCIERelocKind, + Edge::Kind FDEToTargetRelocKind) + : EHFrameBinaryParser(EHFrameAddress, EHFrameContent, + Builder.getGraph().getPointerSize(), + Builder.getGraph().getEndianness()), + Builder(Builder), EHFrameSection(EHFrameSection), + CIEAlignment(CIEAlignment), FDEAlignment(FDEAlignment), + FDEToCIERelocKind(FDEToCIERelocKind), + FDEToTargetRelocKind(FDEToTargetRelocKind) {} + + Symbol *getSymbolAtAddress(JITTargetAddress Address) override { + if (auto *Sym = Builder.getSymbolByAddress(Address)) + if (Sym->getAddress() == Address) + return Sym; + return nullptr; + } + + Symbol &createCIERecord(JITTargetAddress RecordAddr, + StringRef RecordContent) override { + auto &G = Builder.getGraph(); + auto &B = G.createContentBlock(EHFrameSection, RecordContent, RecordAddr, + CIEAlignment, 0); + auto &CIESymbol = + G.addAnonymousSymbol(B, 0, RecordContent.size(), false, false); + Builder.setCanonicalSymbol(CIESymbol); + return CIESymbol; + } + + Expected createFDERecord(JITTargetAddress RecordAddr, + StringRef RecordContent, Symbol &CIE, + size_t CIEOffset, Symbol &Func, + size_t FuncOffset, Symbol *LSDA, 
+ size_t LSDAOffset) override { + auto &G = Builder.getGraph(); + auto &B = G.createContentBlock(EHFrameSection, RecordContent, RecordAddr, + FDEAlignment, 0); + + // Add edges to CIE, Func, and (conditionally) LSDA. + B.addEdge(FDEToCIERelocKind, CIEOffset, CIE, 0); + B.addEdge(FDEToTargetRelocKind, FuncOffset, Func, 0); + + if (LSDA) + B.addEdge(FDEToTargetRelocKind, LSDAOffset, *LSDA, 0); + + auto &FDESymbol = + G.addAnonymousSymbol(B, 0, RecordContent.size(), false, false); + + // Add a keep-alive relocation from the function to the FDE to ensure it + // is not dead stripped. + Func.getBlock().addEdge(Edge::KeepAlive, 0, FDESymbol, 0); + + return FDESymbol; + } + + private: + MachOLinkGraphBuilder &Builder; + Section &EHFrameSection; + uint64_t CIEAlignment; + uint64_t FDEAlignment; + Edge::Kind FDEToCIERelocKind; + Edge::Kind FDEToTargetRelocKind; + }; + + struct NormalizedSymbol { + friend class MachOLinkGraphBuilder; + + private: + NormalizedSymbol(Optional Name, uint64_t Value, uint8_t Type, + uint8_t Sect, uint16_t Desc, Linkage L, Scope S) + : Name(Name), Value(Value), Type(Type), Sect(Sect), Desc(Desc), L(L), + S(S) { + assert((!Name || !Name->empty()) && "Name must be none or non-empty"); + } + + public: + NormalizedSymbol(const NormalizedSymbol &) = delete; + NormalizedSymbol &operator=(const NormalizedSymbol &) = delete; + NormalizedSymbol(NormalizedSymbol &&) = delete; + NormalizedSymbol &operator=(NormalizedSymbol &&) = delete; + + Optional Name; + uint64_t Value = 0; + uint8_t Type = 0; + uint8_t Sect = 0; + uint16_t Desc = 0; + Linkage L = Linkage::Strong; + Scope S = Scope::Default; + Symbol *GraphSymbol = nullptr; + }; + + class NormalizedSection { + friend class MachOLinkGraphBuilder; + + private: + NormalizedSection() = default; + + public: + Section *GraphSection = nullptr; + uint64_t Address = 0; + uint64_t Size = 0; + uint64_t Alignment = 0; + uint32_t Flags = 0; + const char *Data = nullptr; + }; + + using SectionParserFunction = std::function; + + MachOLinkGraphBuilder(const object::MachOObjectFile &Obj); + + LinkGraph &getGraph() const { return *G; } + + const object::MachOObjectFile &getObject() const { return Obj; } + + void addCustomSectionParser(StringRef SectionName, + SectionParserFunction Parse); + + virtual Error addRelocations() = 0; + + /// Create a symbol. + template + NormalizedSymbol &createNormalizedSymbol(ArgTs &&... Args) { + NormalizedSymbol *Sym = reinterpret_cast( + Allocator.Allocate()); + new (Sym) NormalizedSymbol(std::forward(Args)...); + return *Sym; + } + + /// Index is zero-based (MachO section indexes are usually one-based) and + /// assumed to be in-range. Client is responsible for checking. + NormalizedSection &getSectionByIndex(unsigned Index) { + auto I = IndexToSection.find(Index); + assert(I != IndexToSection.end() && "No section recorded at index"); + return I->second; + } + + /// Try to get the section at the given index. Will return an error if the + /// given index is out of range, or if no section has been added for the given + /// index. + Expected findSectionByIndex(unsigned Index) { + auto I = IndexToSection.find(Index); + if (I == IndexToSection.end()) + return make_error("No section recorded for index " + + formatv("{0:u}", Index)); + return I->second; + } + + /// Try to get the symbol at the given index. Will return an error if the + /// given index is out of range, or if no symbol has been added for the given + /// index. 
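The address-based lookup a few lines further down (getSymbolByAddress) uses the classic upper_bound-then-step-back idiom on an ordered map to find the greatest recorded address not exceeding the query. A standalone sketch with std::map and plain int values standing in for Symbol pointers:

#include <cstdint>
#include <iterator>
#include <map>

// Returns the mapped value for the greatest key <= Address, or nullptr if
// every recorded key is above Address -- the same shape as
// getSymbolByAddress() below.
const int *lookupByAddress(const std::map<uint64_t, int> &AddrMap,
                           uint64_t Address) {
  auto I = AddrMap.upper_bound(Address); // first key strictly greater
  if (I == AddrMap.begin())
    return nullptr;
  return &std::prev(I)->second;
}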
+ Expected findSymbolByIndex(uint64_t Index) { + if (Index >= IndexToSymbol.size()) + return make_error("Symbol index out of range"); + auto *Sym = IndexToSymbol[Index]; + if (!Sym) + return make_error("No symbol at index " + + formatv("{0:u}", Index)); + return *Sym; + } + + /// Returns the symbol with the highest address not greater than the search + /// address, or null if no such symbol exists. + Symbol *getSymbolByAddress(JITTargetAddress Address) { + auto I = AddrToCanonicalSymbol.upper_bound(Address); + if (I == AddrToCanonicalSymbol.begin()) + return nullptr; + return std::prev(I)->second; + } + + /// Returns the symbol with the highest address not greater than the search + /// address, or an error if no such symbol exists. + Expected findSymbolByAddress(JITTargetAddress Address) { + auto *Sym = getSymbolByAddress(Address); + if (Sym) + if (Address < Sym->getAddress() + Sym->getSize()) + return *Sym; + return make_error("No symbol covering address " + + formatv("{0:x16}", Address)); + } + + static Linkage getLinkage(uint16_t Desc); + static Scope getScope(StringRef Name, uint8_t Type); + static bool isAltEntry(const NormalizedSymbol &NSym); + +private: + static unsigned getPointerSize(const object::MachOObjectFile &Obj); + static support::endianness getEndianness(const object::MachOObjectFile &Obj); + + void setCanonicalSymbol(Symbol &Sym) { + auto *&CanonicalSymEntry = AddrToCanonicalSymbol[Sym.getAddress()]; + // There should be no symbol at this address, or, if there is, + // it should be a zero-sized symbol from an empty section (which + // we can safely override). + assert((!CanonicalSymEntry || CanonicalSymEntry->getSize() == 0) && + "Duplicate canonical symbol at address"); + CanonicalSymEntry = &Sym; + } + + Section &getCommonSection(); + void addSectionStartSymAndBlock(Section &GraphSec, uint64_t Address, + const char *Data, uint64_t Size, + uint32_t Alignment, bool IsLive); + + Error createNormalizedSections(); + Error createNormalizedSymbols(); + + /// Create graph blocks and symbols for externals, absolutes, commons and + /// all defined symbols in sections without custom parsers. + Error graphifyRegularSymbols(); + + /// Create graph blocks and symbols for all sections. + Error graphifySectionsWithCustomParsers(); + + // Put the BumpPtrAllocator first so that we don't free any of the underlying + // memory until the Symbol/Addressable destructors have been run. + BumpPtrAllocator Allocator; + + const object::MachOObjectFile &Obj; + std::unique_ptr G; + + DenseMap IndexToSection; + Section *CommonSection = nullptr; + + DenseMap IndexToSymbol; + std::map AddrToCanonicalSymbol; + StringMap CustomSectionParserFunctions; +}; + +} // end namespace jitlink +} // end namespace llvm + +#endif // LIB_EXECUTIONENGINE_JITLINK_MACHOLINKGRAPHBUILDER_H diff --git a/lib/ExecutionEngine/JITLink/MachO_arm64.cpp b/lib/ExecutionEngine/JITLink/MachO_arm64.cpp new file mode 100644 index 000000000000..945343bff89d --- /dev/null +++ b/lib/ExecutionEngine/JITLink/MachO_arm64.cpp @@ -0,0 +1,736 @@ +//===---- MachO_arm64.cpp - JIT linker implementation for MachO/arm64 -----===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// MachO/arm64 jit-link implementation. 
+// +//===----------------------------------------------------------------------===// + +#include "llvm/ExecutionEngine/JITLink/MachO_arm64.h" + +#include "BasicGOTAndStubsBuilder.h" +#include "MachOLinkGraphBuilder.h" + +#define DEBUG_TYPE "jitlink" + +using namespace llvm; +using namespace llvm::jitlink; +using namespace llvm::jitlink::MachO_arm64_Edges; + +namespace { + +class MachOLinkGraphBuilder_arm64 : public MachOLinkGraphBuilder { +public: + MachOLinkGraphBuilder_arm64(const object::MachOObjectFile &Obj) + : MachOLinkGraphBuilder(Obj), + NumSymbols(Obj.getSymtabLoadCommand().nsyms) { + addCustomSectionParser( + "__eh_frame", [this](NormalizedSection &EHFrameSection) { + if (!EHFrameSection.Data) + return make_error( + "__eh_frame section is marked zero-fill"); + return MachOEHFrameBinaryParser( + *this, EHFrameSection.Address, + StringRef(EHFrameSection.Data, EHFrameSection.Size), + *EHFrameSection.GraphSection, 8, 4, NegDelta32, Delta64) + .addToGraph(); + }); + } + +private: + static Expected + getRelocationKind(const MachO::relocation_info &RI) { + switch (RI.r_type) { + case MachO::ARM64_RELOC_UNSIGNED: + if (!RI.r_pcrel) { + if (RI.r_length == 3) + return RI.r_extern ? Pointer64 : Pointer64Anon; + else if (RI.r_length == 2) + return Pointer32; + } + break; + case MachO::ARM64_RELOC_SUBTRACTOR: + // SUBTRACTOR must be non-pc-rel, extern, with length 2 or 3. + // Initially represent SUBTRACTOR relocations with 'Delta'. + // They may be turned into NegDelta by parsePairRelocation. + if (!RI.r_pcrel && RI.r_extern) { + if (RI.r_length == 2) + return Delta32; + else if (RI.r_length == 3) + return Delta64; + } + break; + case MachO::ARM64_RELOC_BRANCH26: + if (RI.r_pcrel && RI.r_extern && RI.r_length == 2) + return Branch26; + break; + case MachO::ARM64_RELOC_PAGE21: + if (RI.r_pcrel && RI.r_extern && RI.r_length == 2) + return Page21; + break; + case MachO::ARM64_RELOC_PAGEOFF12: + if (!RI.r_pcrel && RI.r_extern && RI.r_length == 2) + return PageOffset12; + break; + case MachO::ARM64_RELOC_GOT_LOAD_PAGE21: + if (RI.r_pcrel && RI.r_extern && RI.r_length == 2) + return GOTPage21; + break; + case MachO::ARM64_RELOC_GOT_LOAD_PAGEOFF12: + if (!RI.r_pcrel && RI.r_extern && RI.r_length == 2) + return GOTPageOffset12; + break; + case MachO::ARM64_RELOC_POINTER_TO_GOT: + if (RI.r_pcrel && RI.r_extern && RI.r_length == 2) + return PointerToGOT; + break; + case MachO::ARM64_RELOC_ADDEND: + if (!RI.r_pcrel && !RI.r_extern && RI.r_length == 2) + return PairedAddend; + break; + } + + return make_error( + "Unsupported arm64 relocation: address=" + + formatv("{0:x8}", RI.r_address) + + ", symbolnum=" + formatv("{0:x6}", RI.r_symbolnum) + + ", kind=" + formatv("{0:x1}", RI.r_type) + + ", pc_rel=" + (RI.r_pcrel ? "true" : "false") + + ", extern=" + (RI.r_extern ? "true" : "false") + + ", length=" + formatv("{0:d}", RI.r_length)); + } + + MachO::relocation_info + getRelocationInfo(const object::relocation_iterator RelItr) { + MachO::any_relocation_info ARI = + getObject().getRelocation(RelItr->getRawDataRefImpl()); + MachO::relocation_info RI; + memcpy(&RI, &ARI, sizeof(MachO::relocation_info)); + return RI; + } + + using PairRelocInfo = + std::tuple; + + // Parses paired SUBTRACTOR/UNSIGNED relocations and, on success, + // returns the edge kind and addend to be used. 
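At fixup time the Delta and NegDelta edges produced from a SUBTRACTOR/UNSIGNED pair differ only in the sign of the (target - fixup) term; the rest of the original A - B expression is carried in the addend. A standalone sketch of the resolution arithmetic (illustrative names, not the JITLink API):

#include <cstdint>

// Delta32 / Delta64: the value written at the fixup is Target - Fixup + Addend.
int64_t resolveDelta(uint64_t TargetAddr, uint64_t FixupAddr, int64_t Addend) {
  return (int64_t)TargetAddr - (int64_t)FixupAddr + Addend;
}

// NegDelta32 / NegDelta64: same magnitude, opposite orientation.
int64_t resolveNegDelta(uint64_t TargetAddr, uint64_t FixupAddr, int64_t Addend) {
  return (int64_t)FixupAddr - (int64_t)TargetAddr + Addend;
}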
+ Expected + parsePairRelocation(Block &BlockToFix, Edge::Kind SubtractorKind, + const MachO::relocation_info &SubRI, + JITTargetAddress FixupAddress, const char *FixupContent, + object::relocation_iterator &UnsignedRelItr, + object::relocation_iterator &RelEnd) { + using namespace support; + + assert(((SubtractorKind == Delta32 && SubRI.r_length == 2) || + (SubtractorKind == Delta64 && SubRI.r_length == 3)) && + "Subtractor kind should match length"); + assert(SubRI.r_extern && "SUBTRACTOR reloc symbol should be extern"); + assert(!SubRI.r_pcrel && "SUBTRACTOR reloc should not be PCRel"); + + if (UnsignedRelItr == RelEnd) + return make_error("arm64 SUBTRACTOR without paired " + "UNSIGNED relocation"); + + auto UnsignedRI = getRelocationInfo(UnsignedRelItr); + + if (SubRI.r_address != UnsignedRI.r_address) + return make_error("arm64 SUBTRACTOR and paired UNSIGNED " + "point to different addresses"); + + if (SubRI.r_length != UnsignedRI.r_length) + return make_error("length of arm64 SUBTRACTOR and paired " + "UNSIGNED reloc must match"); + + Symbol *FromSymbol; + if (auto FromSymbolOrErr = findSymbolByIndex(SubRI.r_symbolnum)) + FromSymbol = FromSymbolOrErr->GraphSymbol; + else + return FromSymbolOrErr.takeError(); + + // Read the current fixup value. + uint64_t FixupValue = 0; + if (SubRI.r_length == 3) + FixupValue = *(const little64_t *)FixupContent; + else + FixupValue = *(const little32_t *)FixupContent; + + // Find 'ToSymbol' using symbol number or address, depending on whether the + // paired UNSIGNED relocation is extern. + Symbol *ToSymbol = nullptr; + if (UnsignedRI.r_extern) { + // Find target symbol by symbol index. + if (auto ToSymbolOrErr = findSymbolByIndex(UnsignedRI.r_symbolnum)) + ToSymbol = ToSymbolOrErr->GraphSymbol; + else + return ToSymbolOrErr.takeError(); + } else { + if (auto ToSymbolOrErr = findSymbolByAddress(FixupValue)) + ToSymbol = &*ToSymbolOrErr; + else + return ToSymbolOrErr.takeError(); + FixupValue -= ToSymbol->getAddress(); + } + + MachOARM64RelocationKind DeltaKind; + Symbol *TargetSymbol; + uint64_t Addend; + if (&BlockToFix == &FromSymbol->getAddressable()) { + TargetSymbol = ToSymbol; + DeltaKind = (SubRI.r_length == 3) ? Delta64 : Delta32; + Addend = FixupValue + (FixupAddress - FromSymbol->getAddress()); + // FIXME: handle extern 'from'. + } else if (&BlockToFix == &ToSymbol->getAddressable()) { + TargetSymbol = &*FromSymbol; + DeltaKind = (SubRI.r_length == 3) ? NegDelta64 : NegDelta32; + Addend = FixupValue - (FixupAddress - ToSymbol->getAddress()); + } else { + // BlockToFix was neither FromSymbol nor ToSymbol. + return make_error("SUBTRACTOR relocation must fix up " + "either 'A' or 'B' (or a symbol in one " + "of their alt-entry groups)"); + } + + return PairRelocInfo(DeltaKind, TargetSymbol, Addend); + } + + Error addRelocations() override { + using namespace support; + auto &Obj = getObject(); + + for (auto &S : Obj.sections()) { + + JITTargetAddress SectionAddress = S.getAddress(); + + for (auto RelItr = S.relocation_begin(), RelEnd = S.relocation_end(); + RelItr != RelEnd; ++RelItr) { + + MachO::relocation_info RI = getRelocationInfo(RelItr); + + // Sanity check the relocation kind. + auto Kind = getRelocationKind(RI); + if (!Kind) + return Kind.takeError(); + + // Find the address of the value to fix up. 
+ JITTargetAddress FixupAddress = SectionAddress + (uint32_t)RI.r_address; + + LLVM_DEBUG({ + dbgs() << "Processing " << getMachOARM64RelocationKindName(*Kind) + << " relocation at " << format("0x%016" PRIx64, FixupAddress) + << "\n"; + }); + + // Find the block that the fixup points to. + Block *BlockToFix = nullptr; + { + auto SymbolToFixOrErr = findSymbolByAddress(FixupAddress); + if (!SymbolToFixOrErr) + return SymbolToFixOrErr.takeError(); + BlockToFix = &SymbolToFixOrErr->getBlock(); + } + + if (FixupAddress + static_cast(1ULL << RI.r_length) > + BlockToFix->getAddress() + BlockToFix->getContent().size()) + return make_error( + "Relocation content extends past end of fixup block"); + + // Get a pointer to the fixup content. + const char *FixupContent = BlockToFix->getContent().data() + + (FixupAddress - BlockToFix->getAddress()); + + // The target symbol and addend will be populated by the switch below. + Symbol *TargetSymbol = nullptr; + uint64_t Addend = 0; + + if (*Kind == PairedAddend) { + // If this is an Addend relocation then process it and move to the + // paired reloc. + + Addend = RI.r_symbolnum; + + if (RelItr == RelEnd) + return make_error("Unpaired Addend reloc at " + + formatv("{0:x16}", FixupAddress)); + ++RelItr; + RI = getRelocationInfo(RelItr); + + Kind = getRelocationKind(RI); + if (!Kind) + return Kind.takeError(); + + if (*Kind != Branch26 && *Kind != Page21 && *Kind != PageOffset12) + return make_error( + "Invalid relocation pair: Addend + " + + getMachOARM64RelocationKindName(*Kind)); + else + LLVM_DEBUG({ + dbgs() << " pair is " << getMachOARM64RelocationKindName(*Kind) + << "`\n"; + }); + + // Find the address of the value to fix up. + JITTargetAddress PairedFixupAddress = + SectionAddress + (uint32_t)RI.r_address; + if (PairedFixupAddress != FixupAddress) + return make_error("Paired relocation points at " + "different target"); + } + + switch (*Kind) { + case Branch26: { + if (auto TargetSymbolOrErr = findSymbolByIndex(RI.r_symbolnum)) + TargetSymbol = TargetSymbolOrErr->GraphSymbol; + else + return TargetSymbolOrErr.takeError(); + uint32_t Instr = *(const ulittle32_t *)FixupContent; + if ((Instr & 0x7fffffff) != 0x14000000) + return make_error("BRANCH26 target is not a B or BL " + "instruction with a zero addend"); + break; + } + case Pointer32: + if (auto TargetSymbolOrErr = findSymbolByIndex(RI.r_symbolnum)) + TargetSymbol = TargetSymbolOrErr->GraphSymbol; + else + return TargetSymbolOrErr.takeError(); + Addend = *(const ulittle32_t *)FixupContent; + break; + case Pointer64: + if (auto TargetSymbolOrErr = findSymbolByIndex(RI.r_symbolnum)) + TargetSymbol = TargetSymbolOrErr->GraphSymbol; + else + return TargetSymbolOrErr.takeError(); + Addend = *(const ulittle64_t *)FixupContent; + break; + case Pointer64Anon: { + JITTargetAddress TargetAddress = *(const ulittle64_t *)FixupContent; + if (auto TargetSymbolOrErr = findSymbolByAddress(TargetAddress)) + TargetSymbol = &*TargetSymbolOrErr; + else + return TargetSymbolOrErr.takeError(); + Addend = TargetAddress - TargetSymbol->getAddress(); + break; + } + case Page21: + case GOTPage21: { + if (auto TargetSymbolOrErr = findSymbolByIndex(RI.r_symbolnum)) + TargetSymbol = TargetSymbolOrErr->GraphSymbol; + else + return TargetSymbolOrErr.takeError(); + uint32_t Instr = *(const ulittle32_t *)FixupContent; + if ((Instr & 0xffffffe0) != 0x90000000) + return make_error("PAGE21/GOTPAGE21 target is not an " + "ADRP instruction with a zero " + "addend"); + break; + } + case PageOffset12: { + if (auto TargetSymbolOrErr = 
findSymbolByIndex(RI.r_symbolnum)) + TargetSymbol = TargetSymbolOrErr->GraphSymbol; + else + return TargetSymbolOrErr.takeError(); + break; + } + case GOTPageOffset12: { + if (auto TargetSymbolOrErr = findSymbolByIndex(RI.r_symbolnum)) + TargetSymbol = TargetSymbolOrErr->GraphSymbol; + else + return TargetSymbolOrErr.takeError(); + uint32_t Instr = *(const ulittle32_t *)FixupContent; + if ((Instr & 0xfffffc00) != 0xf9400000) + return make_error("GOTPAGEOFF12 target is not an LDR " + "immediate instruction with a zero " + "addend"); + break; + } + case PointerToGOT: + if (auto TargetSymbolOrErr = findSymbolByIndex(RI.r_symbolnum)) + TargetSymbol = TargetSymbolOrErr->GraphSymbol; + else + return TargetSymbolOrErr.takeError(); + break; + case Delta32: + case Delta64: { + // We use Delta32/Delta64 to represent SUBTRACTOR relocations. + // parsePairRelocation handles the paired reloc, and returns the + // edge kind to be used (either Delta32/Delta64, or + // NegDelta32/NegDelta64, depending on the direction of the + // subtraction) along with the addend. + auto PairInfo = + parsePairRelocation(*BlockToFix, *Kind, RI, FixupAddress, + FixupContent, ++RelItr, RelEnd); + if (!PairInfo) + return PairInfo.takeError(); + std::tie(*Kind, TargetSymbol, Addend) = *PairInfo; + assert(TargetSymbol && "No target symbol from parsePairRelocation?"); + break; + } + default: + llvm_unreachable("Special relocation kind should not appear in " + "mach-o file"); + } + + LLVM_DEBUG({ + Edge GE(*Kind, FixupAddress - BlockToFix->getAddress(), *TargetSymbol, + Addend); + printEdge(dbgs(), *BlockToFix, GE, + getMachOARM64RelocationKindName(*Kind)); + dbgs() << "\n"; + }); + BlockToFix->addEdge(*Kind, FixupAddress - BlockToFix->getAddress(), + *TargetSymbol, Addend); + } + } + return Error::success(); + } + + unsigned NumSymbols = 0; +}; + +class MachO_arm64_GOTAndStubsBuilder + : public BasicGOTAndStubsBuilder { +public: + MachO_arm64_GOTAndStubsBuilder(LinkGraph &G) + : BasicGOTAndStubsBuilder(G) {} + + bool isGOTEdge(Edge &E) const { + return E.getKind() == GOTPage21 || E.getKind() == GOTPageOffset12 || + E.getKind() == PointerToGOT; + } + + Symbol &createGOTEntry(Symbol &Target) { + auto &GOTEntryBlock = G.createContentBlock( + getGOTSection(), getGOTEntryBlockContent(), 0, 8, 0); + GOTEntryBlock.addEdge(Pointer64, 0, Target, 0); + return G.addAnonymousSymbol(GOTEntryBlock, 0, 8, false, false); + } + + void fixGOTEdge(Edge &E, Symbol &GOTEntry) { + if (E.getKind() == GOTPage21 || E.getKind() == GOTPageOffset12) { + // Update the target, but leave the edge addend as-is. + E.setTarget(GOTEntry); + } else if (E.getKind() == PointerToGOT) { + E.setTarget(GOTEntry); + E.setKind(Delta32); + } else + llvm_unreachable("Not a GOT edge?"); + } + + bool isExternalBranchEdge(Edge &E) { + return E.getKind() == Branch26 && !E.getTarget().isDefined(); + } + + Symbol &createStub(Symbol &Target) { + auto &StubContentBlock = + G.createContentBlock(getStubsSection(), getStubBlockContent(), 0, 1, 0); + // Re-use GOT entries for stub targets. 
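getGOTEntrySymbol(Target), called on the next line, comes from BasicGOTAndStubsBuilder; the reuse it provides is essentially get-or-create memoisation keyed on the target symbol, along the lines of this sketch (hypothetical types, standard library only):

#include <string>
#include <unordered_map>

struct Entry { std::string Target; };

// Get-or-create cache: every stub and every GOT-relative edge that refers to
// the same target ends up sharing a single GOT entry.
class GOTEntryCache {
  std::unordered_map<std::string, Entry> Entries;

public:
  Entry &getOrCreate(const std::string &Target) {
    auto It = Entries.find(Target);
    if (It != Entries.end())
      return It->second;                          // reuse the existing entry
    return Entries.emplace(Target, Entry{Target}).first->second;
  }
};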
+ auto &GOTEntrySymbol = getGOTEntrySymbol(Target); + StubContentBlock.addEdge(LDRLiteral19, 0, GOTEntrySymbol, 0); + return G.addAnonymousSymbol(StubContentBlock, 0, 8, true, false); + } + + void fixExternalBranchEdge(Edge &E, Symbol &Stub) { + assert(E.getKind() == Branch26 && "Not a Branch32 edge?"); + assert(E.getAddend() == 0 && "Branch32 edge has non-zero addend?"); + E.setTarget(Stub); + } + +private: + Section &getGOTSection() { + if (!GOTSection) + GOTSection = &G.createSection("$__GOT", sys::Memory::MF_READ); + return *GOTSection; + } + + Section &getStubsSection() { + if (!StubsSection) { + auto StubsProt = static_cast( + sys::Memory::MF_READ | sys::Memory::MF_EXEC); + StubsSection = &G.createSection("$__STUBS", StubsProt); + } + return *StubsSection; + } + + StringRef getGOTEntryBlockContent() { + return StringRef(reinterpret_cast(NullGOTEntryContent), + sizeof(NullGOTEntryContent)); + } + + StringRef getStubBlockContent() { + return StringRef(reinterpret_cast(StubContent), + sizeof(StubContent)); + } + + static const uint8_t NullGOTEntryContent[8]; + static const uint8_t StubContent[8]; + Section *GOTSection = nullptr; + Section *StubsSection = nullptr; +}; + +const uint8_t MachO_arm64_GOTAndStubsBuilder::NullGOTEntryContent[8] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}; +const uint8_t MachO_arm64_GOTAndStubsBuilder::StubContent[8] = { + 0x10, 0x00, 0x00, 0x58, // LDR x16, + 0x00, 0x02, 0x1f, 0xd6 // BR x16 +}; + +} // namespace + +namespace llvm { +namespace jitlink { + +class MachOJITLinker_arm64 : public JITLinker { + friend class JITLinker; + +public: + MachOJITLinker_arm64(std::unique_ptr Ctx, + PassConfiguration PassConfig) + : JITLinker(std::move(Ctx), std::move(PassConfig)) {} + +private: + StringRef getEdgeKindName(Edge::Kind R) const override { + return getMachOARM64RelocationKindName(R); + } + + Expected> + buildGraph(MemoryBufferRef ObjBuffer) override { + auto MachOObj = object::ObjectFile::createMachOObjectFile(ObjBuffer); + if (!MachOObj) + return MachOObj.takeError(); + return MachOLinkGraphBuilder_arm64(**MachOObj).buildGraph(); + } + + static Error targetOutOfRangeError(const Block &B, const Edge &E) { + std::string ErrMsg; + { + raw_string_ostream ErrStream(ErrMsg); + ErrStream << "Relocation target out of range: "; + printEdge(ErrStream, B, E, getMachOARM64RelocationKindName(E.getKind())); + ErrStream << "\n"; + } + return make_error(std::move(ErrMsg)); + } + + static unsigned getPageOffset12Shift(uint32_t Instr) { + constexpr uint32_t LDRLiteralMask = 0x3ffffc00; + + // Check for a GPR LDR immediate with a zero embedded literal. + // If found, the top two bits contain the shift. + if ((Instr & LDRLiteralMask) == 0x39400000) + return Instr >> 30; + + // Check for a Neon LDR immediate of size 64-bit or less with a zero + // embedded literal. If found, the top two bits contain the shift. + if ((Instr & LDRLiteralMask) == 0x3d400000) + return Instr >> 30; + + // Check for a Neon LDR immediate of size 128-bit with a zero embedded + // literal. 
+ constexpr uint32_t SizeBitsMask = 0xc0000000; + if ((Instr & (LDRLiteralMask | SizeBitsMask)) == 0x3dc00000) + return 4; + + return 0; + } + + Error applyFixup(Block &B, const Edge &E, char *BlockWorkingMem) const { + using namespace support; + + char *FixupPtr = BlockWorkingMem + E.getOffset(); + JITTargetAddress FixupAddress = B.getAddress() + E.getOffset(); + + switch (E.getKind()) { + case Branch26: { + assert((FixupAddress & 0x3) == 0 && "Branch-inst is not 32-bit aligned"); + + int64_t Value = E.getTarget().getAddress() - FixupAddress + E.getAddend(); + + if (static_cast(Value) & 0x3) + return make_error("Branch26 target is not 32-bit " + "aligned"); + + if (Value < -(1 << 27) || Value > ((1 << 27) - 1)) + return targetOutOfRangeError(B, E); + + uint32_t RawInstr = *(little32_t *)FixupPtr; + assert((RawInstr & 0x7fffffff) == 0x14000000 && + "RawInstr isn't a B or BR immediate instruction"); + uint32_t Imm = (static_cast(Value) & ((1 << 28) - 1)) >> 2; + uint32_t FixedInstr = RawInstr | Imm; + *(little32_t *)FixupPtr = FixedInstr; + break; + } + case Pointer32: { + uint64_t Value = E.getTarget().getAddress() + E.getAddend(); + if (Value > std::numeric_limits::max()) + return targetOutOfRangeError(B, E); + *(ulittle32_t *)FixupPtr = Value; + break; + } + case Pointer64: { + uint64_t Value = E.getTarget().getAddress() + E.getAddend(); + *(ulittle64_t *)FixupPtr = Value; + break; + } + case Page21: + case GOTPage21: { + assert(E.getAddend() == 0 && "PAGE21/GOTPAGE21 with non-zero addend"); + uint64_t TargetPage = + E.getTarget().getAddress() & ~static_cast(4096 - 1); + uint64_t PCPage = B.getAddress() & ~static_cast(4096 - 1); + + int64_t PageDelta = TargetPage - PCPage; + if (PageDelta < -(1 << 30) || PageDelta > ((1 << 30) - 1)) + return targetOutOfRangeError(B, E); + + uint32_t RawInstr = *(ulittle32_t *)FixupPtr; + assert((RawInstr & 0xffffffe0) == 0x90000000 && + "RawInstr isn't an ADRP instruction"); + uint32_t ImmLo = (static_cast(PageDelta) >> 12) & 0x3; + uint32_t ImmHi = (static_cast(PageDelta) >> 14) & 0x7ffff; + uint32_t FixedInstr = RawInstr | (ImmLo << 29) | (ImmHi << 5); + *(ulittle32_t *)FixupPtr = FixedInstr; + break; + } + case PageOffset12: { + assert(E.getAddend() == 0 && "PAGEOFF12 with non-zero addend"); + uint64_t TargetOffset = E.getTarget().getAddress() & 0xfff; + + uint32_t RawInstr = *(ulittle32_t *)FixupPtr; + unsigned ImmShift = getPageOffset12Shift(RawInstr); + + if (TargetOffset & ((1 << ImmShift) - 1)) + return make_error("PAGEOFF12 target is not aligned"); + + uint32_t EncodedImm = (TargetOffset >> ImmShift) << 10; + uint32_t FixedInstr = RawInstr | EncodedImm; + *(ulittle32_t *)FixupPtr = FixedInstr; + break; + } + case GOTPageOffset12: { + assert(E.getAddend() == 0 && "GOTPAGEOF12 with non-zero addend"); + + uint32_t RawInstr = *(ulittle32_t *)FixupPtr; + assert((RawInstr & 0xfffffc00) == 0xf9400000 && + "RawInstr isn't a 64-bit LDR immediate"); + + uint32_t TargetOffset = E.getTarget().getAddress() & 0xfff; + assert((TargetOffset & 0x7) == 0 && "GOT entry is not 8-byte aligned"); + uint32_t EncodedImm = (TargetOffset >> 3) << 10; + uint32_t FixedInstr = RawInstr | EncodedImm; + *(ulittle32_t *)FixupPtr = FixedInstr; + break; + } + case LDRLiteral19: { + assert((FixupAddress & 0x3) == 0 && "LDR is not 32-bit aligned"); + assert(E.getAddend() == 0 && "LDRLiteral19 with non-zero addend"); + uint32_t RawInstr = *(ulittle32_t *)FixupPtr; + assert(RawInstr == 0x58000010 && "RawInstr isn't a 64-bit LDR literal"); + int64_t Delta = E.getTarget().getAddress() 
- FixupAddress; + if (Delta & 0x3) + return make_error("LDR literal target is not 32-bit " + "aligned"); + if (Delta < -(1 << 20) || Delta > ((1 << 20) - 1)) + return targetOutOfRangeError(B, E); + + uint32_t EncodedImm = (static_cast(Delta) >> 2) << 5; + uint32_t FixedInstr = RawInstr | EncodedImm; + *(ulittle32_t *)FixupPtr = FixedInstr; + break; + } + case Delta32: + case Delta64: + case NegDelta32: + case NegDelta64: { + int64_t Value; + if (E.getKind() == Delta32 || E.getKind() == Delta64) + Value = E.getTarget().getAddress() - FixupAddress + E.getAddend(); + else + Value = FixupAddress - E.getTarget().getAddress() + E.getAddend(); + + if (E.getKind() == Delta32 || E.getKind() == NegDelta32) { + if (Value < std::numeric_limits::min() || + Value > std::numeric_limits::max()) + return targetOutOfRangeError(B, E); + *(little32_t *)FixupPtr = Value; + } else + *(little64_t *)FixupPtr = Value; + break; + } + default: + llvm_unreachable("Unrecognized edge kind"); + } + + return Error::success(); + } + + uint64_t NullValue = 0; +}; + +void jitLink_MachO_arm64(std::unique_ptr Ctx) { + PassConfiguration Config; + Triple TT("arm64-apple-ios"); + + if (Ctx->shouldAddDefaultTargetPasses(TT)) { + // Add a mark-live pass. + if (auto MarkLive = Ctx->getMarkLivePass(TT)) + Config.PrePrunePasses.push_back(std::move(MarkLive)); + else + Config.PrePrunePasses.push_back(markAllSymbolsLive); + + // Add an in-place GOT/Stubs pass. + Config.PostPrunePasses.push_back([](LinkGraph &G) -> Error { + MachO_arm64_GOTAndStubsBuilder(G).run(); + return Error::success(); + }); + } + + if (auto Err = Ctx->modifyPassConfig(TT, Config)) + return Ctx->notifyFailed(std::move(Err)); + + // Construct a JITLinker and run the link function. + MachOJITLinker_arm64::link(std::move(Ctx), std::move(Config)); +} + +StringRef getMachOARM64RelocationKindName(Edge::Kind R) { + switch (R) { + case Branch26: + return "Branch26"; + case Pointer64: + return "Pointer64"; + case Pointer64Anon: + return "Pointer64Anon"; + case Page21: + return "Page21"; + case PageOffset12: + return "PageOffset12"; + case GOTPage21: + return "GOTPage21"; + case GOTPageOffset12: + return "GOTPageOffset12"; + case PointerToGOT: + return "PointerToGOT"; + case PairedAddend: + return "PairedAddend"; + case LDRLiteral19: + return "LDRLiteral19"; + case Delta32: + return "Delta32"; + case Delta64: + return "Delta64"; + case NegDelta32: + return "NegDelta32"; + case NegDelta64: + return "NegDelta64"; + default: + return getGenericEdgeKindName(static_cast(R)); + } +} + +} // end namespace jitlink +} // end namespace llvm diff --git a/lib/ExecutionEngine/JITLink/MachO_x86_64.cpp b/lib/ExecutionEngine/JITLink/MachO_x86_64.cpp index 4010678c6d33..d83787ffd598 100644 --- a/lib/ExecutionEngine/JITLink/MachO_x86_64.cpp +++ b/lib/ExecutionEngine/JITLink/MachO_x86_64.cpp @@ -13,7 +13,7 @@ #include "llvm/ExecutionEngine/JITLink/MachO_x86_64.h" #include "BasicGOTAndStubsBuilder.h" -#include "MachOAtomGraphBuilder.h" +#include "MachOLinkGraphBuilder.h" #define DEBUG_TYPE "jitlink" @@ -23,16 +23,21 @@ using namespace llvm::jitlink::MachO_x86_64_Edges; namespace { -class MachOAtomGraphBuilder_x86_64 : public MachOAtomGraphBuilder { +class MachOLinkGraphBuilder_x86_64 : public MachOLinkGraphBuilder { public: - MachOAtomGraphBuilder_x86_64(const object::MachOObjectFile &Obj) - : MachOAtomGraphBuilder(Obj), - NumSymbols(Obj.getSymtabLoadCommand().nsyms) { - addCustomAtomizer("__eh_frame", [this](MachOSection &EHFrameSection) { - return addEHFrame(getGraph(), 
EHFrameSection.getGenericSection(), - EHFrameSection.getContent(), - EHFrameSection.getAddress(), NegDelta32, Delta64); - }); + MachOLinkGraphBuilder_x86_64(const object::MachOObjectFile &Obj) + : MachOLinkGraphBuilder(Obj) { + addCustomSectionParser( + "__eh_frame", [this](NormalizedSection &EHFrameSection) { + if (!EHFrameSection.Data) + return make_error( + "__eh_frame section is marked zero-fill"); + return MachOEHFrameBinaryParser( + *this, EHFrameSection.Address, + StringRef(EHFrameSection.Data, EHFrameSection.Size), + *EHFrameSection.GraphSection, 8, 4, NegDelta32, Delta64) + .addToGraph(); + }); } private: @@ -40,8 +45,12 @@ private: getRelocationKind(const MachO::relocation_info &RI) { switch (RI.r_type) { case MachO::X86_64_RELOC_UNSIGNED: - if (!RI.r_pcrel && RI.r_length == 3) - return RI.r_extern ? Pointer64 : Pointer64Anon; + if (!RI.r_pcrel) { + if (RI.r_length == 3) + return RI.r_extern ? Pointer64 : Pointer64Anon; + else if (RI.r_extern && RI.r_length == 2) + return Pointer32; + } break; case MachO::X86_64_RELOC_SIGNED: if (RI.r_pcrel && RI.r_length == 2) @@ -94,21 +103,10 @@ private: ", symbolnum=" + formatv("{0:x6}", RI.r_symbolnum) + ", kind=" + formatv("{0:x1}", RI.r_type) + ", pc_rel=" + (RI.r_pcrel ? "true" : "false") + - ", extern= " + (RI.r_extern ? "true" : "false") + + ", extern=" + (RI.r_extern ? "true" : "false") + ", length=" + formatv("{0:d}", RI.r_length)); } - Expected findAtomBySymbolIndex(const MachO::relocation_info &RI) { - auto &Obj = getObject(); - if (RI.r_symbolnum >= NumSymbols) - return make_error("Symbol index out of range"); - auto SymI = Obj.getSymbolByIndex(RI.r_symbolnum); - auto Name = SymI->getName(); - if (!Name) - return Name.takeError(); - return getGraph().getAtomByName(*Name); - } - MachO::relocation_info getRelocationInfo(const object::relocation_iterator RelItr) { MachO::any_relocation_info ARI = @@ -118,12 +116,12 @@ private: return RI; } - using PairRelocInfo = std::tuple; + using PairRelocInfo = std::tuple; // Parses paired SUBTRACTOR/UNSIGNED relocations and, on success, // returns the edge kind and addend to be used. Expected - parsePairRelocation(DefinedAtom &AtomToFix, Edge::Kind SubtractorKind, + parsePairRelocation(Block &BlockToFix, Edge::Kind SubtractorKind, const MachO::relocation_info &SubRI, JITTargetAddress FixupAddress, const char *FixupContent, object::relocation_iterator &UnsignedRelItr, @@ -150,9 +148,11 @@ private: return make_error("length of x86_64 SUBTRACTOR and paired " "UNSIGNED reloc must match"); - auto FromAtom = findAtomBySymbolIndex(SubRI); - if (!FromAtom) - return FromAtom.takeError(); + Symbol *FromSymbol; + if (auto FromSymbolOrErr = findSymbolByIndex(SubRI.r_symbolnum)) + FromSymbol = FromSymbolOrErr->GraphSymbol; + else + return FromSymbolOrErr.takeError(); // Read the current fixup value. uint64_t FixupValue = 0; @@ -161,54 +161,60 @@ private: else FixupValue = *(const little32_t *)FixupContent; - // Find 'ToAtom' using symbol number or address, depending on whether the + // Find 'ToSymbol' using symbol number or address, depending on whether the // paired UNSIGNED relocation is extern. - Atom *ToAtom = nullptr; + Symbol *ToSymbol = nullptr; if (UnsignedRI.r_extern) { - // Find target atom by symbol index. - if (auto ToAtomOrErr = findAtomBySymbolIndex(UnsignedRI)) - ToAtom = &*ToAtomOrErr; + // Find target symbol by symbol index. 
+ if (auto ToSymbolOrErr = findSymbolByIndex(UnsignedRI.r_symbolnum)) + ToSymbol = ToSymbolOrErr->GraphSymbol; else - return ToAtomOrErr.takeError(); + return ToSymbolOrErr.takeError(); } else { - if (auto ToAtomOrErr = getGraph().findAtomByAddress(FixupValue)) - ToAtom = &*ToAtomOrErr; + if (auto ToSymbolOrErr = findSymbolByAddress(FixupValue)) + ToSymbol = &*ToSymbolOrErr; else - return ToAtomOrErr.takeError(); - FixupValue -= ToAtom->getAddress(); + return ToSymbolOrErr.takeError(); + FixupValue -= ToSymbol->getAddress(); } MachOX86RelocationKind DeltaKind; - Atom *TargetAtom; + Symbol *TargetSymbol; uint64_t Addend; - if (areLayoutLocked(AtomToFix, *FromAtom)) { - TargetAtom = ToAtom; + if (&BlockToFix == &FromSymbol->getAddressable()) { + TargetSymbol = ToSymbol; DeltaKind = (SubRI.r_length == 3) ? Delta64 : Delta32; - Addend = FixupValue + (FixupAddress - FromAtom->getAddress()); + Addend = FixupValue + (FixupAddress - FromSymbol->getAddress()); // FIXME: handle extern 'from'. - } else if (areLayoutLocked(AtomToFix, *ToAtom)) { - TargetAtom = &*FromAtom; + } else if (&BlockToFix == &ToSymbol->getAddressable()) { + TargetSymbol = FromSymbol; DeltaKind = (SubRI.r_length == 3) ? NegDelta64 : NegDelta32; - Addend = FixupValue - (FixupAddress - ToAtom->getAddress()); + Addend = FixupValue - (FixupAddress - ToSymbol->getAddress()); } else { - // AtomToFix was neither FromAtom nor ToAtom. + // BlockToFix was neither FromSymbol nor ToSymbol. return make_error("SUBTRACTOR relocation must fix up " - "either 'A' or 'B' (or an atom in one " - "of their alt-entry groups)"); + "either 'A' or 'B' (or a symbol in one " + "of their alt-entry chains)"); } - return PairRelocInfo(DeltaKind, TargetAtom, Addend); + return PairRelocInfo(DeltaKind, TargetSymbol, Addend); } Error addRelocations() override { using namespace support; - auto &G = getGraph(); auto &Obj = getObject(); for (auto &S : Obj.sections()) { JITTargetAddress SectionAddress = S.getAddress(); + if (S.isVirtual()) { + if (S.relocation_begin() != S.relocation_end()) + return make_error("Virtual section contains " + "relocations"); + continue; + } + for (auto RelItr = S.relocation_begin(), RelEnd = S.relocation_end(); RelItr != RelEnd; ++RelItr) { @@ -227,26 +233,26 @@ private: << format("0x%016" PRIx64, FixupAddress) << "\n"; }); - // Find the atom that the fixup points to. - DefinedAtom *AtomToFix = nullptr; + // Find the block that the fixup points to. + Block *BlockToFix = nullptr; { - auto AtomToFixOrErr = G.findAtomByAddress(FixupAddress); - if (!AtomToFixOrErr) - return AtomToFixOrErr.takeError(); - AtomToFix = &*AtomToFixOrErr; + auto SymbolToFixOrErr = findSymbolByAddress(FixupAddress); + if (!SymbolToFixOrErr) + return SymbolToFixOrErr.takeError(); + BlockToFix = &SymbolToFixOrErr->getBlock(); } if (FixupAddress + static_cast(1ULL << RI.r_length) > - AtomToFix->getAddress() + AtomToFix->getContent().size()) + BlockToFix->getAddress() + BlockToFix->getContent().size()) return make_error( - "Relocation content extends past end of fixup atom"); + "Relocation extends past end of fixup block"); // Get a pointer to the fixup content. - const char *FixupContent = AtomToFix->getContent().data() + - (FixupAddress - AtomToFix->getAddress()); + const char *FixupContent = BlockToFix->getContent().data() + + (FixupAddress - BlockToFix->getAddress()); - // The target atom and addend will be populated by the switch below. - Atom *TargetAtom = nullptr; + // The target symbol and addend will be populated by the switch below. 
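For the *Anon kinds handled in the switch below, the value stored at the fixup is an address rather than a symbol index; the target symbol is found by address and the leftover distance becomes the edge addend. A sketch of that split for the PCRel32Anon case (plain integers, not the JITLink types):

#include <cstdint>

// PCRel32Anon: the stored 32-bit value is relative to the end of the 4-byte
// fixup, so the absolute target address is recovered first...
uint64_t anonTargetAddress(uint64_t FixupAddress, int32_t StoredValue) {
  return FixupAddress + 4 + (int64_t)StoredValue;
}

// ...and the covering symbol's address is then peeled off, leaving the addend.
int64_t anonAddend(uint64_t TargetAddress, uint64_t CoveringSymbolAddress) {
  return (int64_t)(TargetAddress - CoveringSymbolAddress);
}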
+ Symbol *TargetSymbol = nullptr; uint64_t Addend = 0; switch (*Kind) { @@ -254,46 +260,53 @@ private: case PCRel32: case PCRel32GOTLoad: case PCRel32GOT: - if (auto TargetAtomOrErr = findAtomBySymbolIndex(RI)) - TargetAtom = &*TargetAtomOrErr; + if (auto TargetSymbolOrErr = findSymbolByIndex(RI.r_symbolnum)) + TargetSymbol = TargetSymbolOrErr->GraphSymbol; + else + return TargetSymbolOrErr.takeError(); + Addend = *(const ulittle32_t *)FixupContent; + break; + case Pointer32: + if (auto TargetSymbolOrErr = findSymbolByIndex(RI.r_symbolnum)) + TargetSymbol = TargetSymbolOrErr->GraphSymbol; else - return TargetAtomOrErr.takeError(); + return TargetSymbolOrErr.takeError(); Addend = *(const ulittle32_t *)FixupContent; break; case Pointer64: - if (auto TargetAtomOrErr = findAtomBySymbolIndex(RI)) - TargetAtom = &*TargetAtomOrErr; + if (auto TargetSymbolOrErr = findSymbolByIndex(RI.r_symbolnum)) + TargetSymbol = TargetSymbolOrErr->GraphSymbol; else - return TargetAtomOrErr.takeError(); + return TargetSymbolOrErr.takeError(); Addend = *(const ulittle64_t *)FixupContent; break; case Pointer64Anon: { JITTargetAddress TargetAddress = *(const ulittle64_t *)FixupContent; - if (auto TargetAtomOrErr = G.findAtomByAddress(TargetAddress)) - TargetAtom = &*TargetAtomOrErr; + if (auto TargetSymbolOrErr = findSymbolByAddress(TargetAddress)) + TargetSymbol = &*TargetSymbolOrErr; else - return TargetAtomOrErr.takeError(); - Addend = TargetAddress - TargetAtom->getAddress(); + return TargetSymbolOrErr.takeError(); + Addend = TargetAddress - TargetSymbol->getAddress(); break; } case PCRel32Minus1: case PCRel32Minus2: case PCRel32Minus4: - if (auto TargetAtomOrErr = findAtomBySymbolIndex(RI)) - TargetAtom = &*TargetAtomOrErr; + if (auto TargetSymbolOrErr = findSymbolByIndex(RI.r_symbolnum)) + TargetSymbol = TargetSymbolOrErr->GraphSymbol; else - return TargetAtomOrErr.takeError(); + return TargetSymbolOrErr.takeError(); Addend = *(const ulittle32_t *)FixupContent + (1 << (*Kind - PCRel32Minus1)); break; case PCRel32Anon: { JITTargetAddress TargetAddress = FixupAddress + 4 + *(const ulittle32_t *)FixupContent; - if (auto TargetAtomOrErr = G.findAtomByAddress(TargetAddress)) - TargetAtom = &*TargetAtomOrErr; + if (auto TargetSymbolOrErr = findSymbolByAddress(TargetAddress)) + TargetSymbol = &*TargetSymbolOrErr; else - return TargetAtomOrErr.takeError(); - Addend = TargetAddress - TargetAtom->getAddress(); + return TargetSymbolOrErr.takeError(); + Addend = TargetAddress - TargetSymbol->getAddress(); break; } case PCRel32Minus1Anon: @@ -303,11 +316,11 @@ private: static_cast(1ULL << (*Kind - PCRel32Minus1Anon)); JITTargetAddress TargetAddress = FixupAddress + 4 + Delta + *(const ulittle32_t *)FixupContent; - if (auto TargetAtomOrErr = G.findAtomByAddress(TargetAddress)) - TargetAtom = &*TargetAtomOrErr; + if (auto TargetSymbolOrErr = findSymbolByAddress(TargetAddress)) + TargetSymbol = &*TargetSymbolOrErr; else - return TargetAtomOrErr.takeError(); - Addend = TargetAddress - TargetAtom->getAddress(); + return TargetSymbolOrErr.takeError(); + Addend = TargetAddress - TargetSymbol->getAddress(); break; } case Delta32: @@ -318,12 +331,12 @@ private: // NegDelta32/NegDelta64, depending on the direction of the // subtraction) along with the addend. 
auto PairInfo = - parsePairRelocation(*AtomToFix, *Kind, RI, FixupAddress, + parsePairRelocation(*BlockToFix, *Kind, RI, FixupAddress, FixupContent, ++RelItr, RelEnd); if (!PairInfo) return PairInfo.takeError(); - std::tie(*Kind, TargetAtom, Addend) = *PairInfo; - assert(TargetAtom && "No target atom from parsePairRelocation?"); + std::tie(*Kind, TargetSymbol, Addend) = *PairInfo; + assert(TargetSymbol && "No target symbol from parsePairRelocation?"); break; } default: @@ -332,41 +345,38 @@ private: } LLVM_DEBUG({ - Edge GE(*Kind, FixupAddress - AtomToFix->getAddress(), *TargetAtom, + Edge GE(*Kind, FixupAddress - BlockToFix->getAddress(), *TargetSymbol, Addend); - printEdge(dbgs(), *AtomToFix, GE, + printEdge(dbgs(), *BlockToFix, GE, getMachOX86RelocationKindName(*Kind)); dbgs() << "\n"; }); - AtomToFix->addEdge(*Kind, FixupAddress - AtomToFix->getAddress(), - *TargetAtom, Addend); + BlockToFix->addEdge(*Kind, FixupAddress - BlockToFix->getAddress(), + *TargetSymbol, Addend); } } return Error::success(); } - - unsigned NumSymbols = 0; }; class MachO_x86_64_GOTAndStubsBuilder : public BasicGOTAndStubsBuilder { public: - MachO_x86_64_GOTAndStubsBuilder(AtomGraph &G) + MachO_x86_64_GOTAndStubsBuilder(LinkGraph &G) : BasicGOTAndStubsBuilder(G) {} bool isGOTEdge(Edge &E) const { return E.getKind() == PCRel32GOT || E.getKind() == PCRel32GOTLoad; } - DefinedAtom &createGOTEntry(Atom &Target) { - auto &GOTEntryAtom = G.addAnonymousAtom(getGOTSection(), 0x0, 8); - GOTEntryAtom.setContent( - StringRef(reinterpret_cast(NullGOTEntryContent), 8)); - GOTEntryAtom.addEdge(Pointer64, 0, Target, 0); - return GOTEntryAtom; + Symbol &createGOTEntry(Symbol &Target) { + auto &GOTEntryBlock = G.createContentBlock( + getGOTSection(), getGOTEntryBlockContent(), 0, 8, 0); + GOTEntryBlock.addEdge(Pointer64, 0, Target, 0); + return G.addAnonymousSymbol(GOTEntryBlock, 0, 8, false, false); } - void fixGOTEdge(Edge &E, Atom &GOTEntry) { + void fixGOTEdge(Edge &E, Symbol &GOTEntry) { assert((E.getKind() == PCRel32GOT || E.getKind() == PCRel32GOTLoad) && "Not a GOT edge?"); E.setKind(PCRel32); @@ -378,19 +388,16 @@ public: return E.getKind() == Branch32 && !E.getTarget().isDefined(); } - DefinedAtom &createStub(Atom &Target) { - auto &StubAtom = G.addAnonymousAtom(getStubsSection(), 0x0, 2); - StubAtom.setContent( - StringRef(reinterpret_cast(StubContent), 6)); - + Symbol &createStub(Symbol &Target) { + auto &StubContentBlock = + G.createContentBlock(getStubsSection(), getStubBlockContent(), 0, 1, 0); // Re-use GOT entries for stub targets. 
- auto &GOTEntryAtom = getGOTEntryAtom(Target); - StubAtom.addEdge(PCRel32, 2, GOTEntryAtom, 0); - - return StubAtom; + auto &GOTEntrySymbol = getGOTEntrySymbol(Target); + StubContentBlock.addEdge(PCRel32, 2, GOTEntrySymbol, 0); + return G.addAnonymousSymbol(StubContentBlock, 0, 6, true, false); } - void fixExternalBranchEdge(Edge &E, Atom &Stub) { + void fixExternalBranchEdge(Edge &E, Symbol &Stub) { assert(E.getKind() == Branch32 && "Not a Branch32 edge?"); assert(E.getAddend() == 0 && "Branch32 edge has non-zero addend?"); E.setTarget(Stub); @@ -399,7 +406,7 @@ public: private: Section &getGOTSection() { if (!GOTSection) - GOTSection = &G.createSection("$__GOT", 8, sys::Memory::MF_READ, false); + GOTSection = &G.createSection("$__GOT", sys::Memory::MF_READ); return *GOTSection; } @@ -407,11 +414,21 @@ private: if (!StubsSection) { auto StubsProt = static_cast( sys::Memory::MF_READ | sys::Memory::MF_EXEC); - StubsSection = &G.createSection("$__STUBS", 8, StubsProt, false); + StubsSection = &G.createSection("$__STUBS", StubsProt); } return *StubsSection; } + StringRef getGOTEntryBlockContent() { + return StringRef(reinterpret_cast(NullGOTEntryContent), + sizeof(NullGOTEntryContent)); + } + + StringRef getStubBlockContent() { + return StringRef(reinterpret_cast(StubContent), + sizeof(StubContent)); + } + static const uint8_t NullGOTEntryContent[8]; static const uint8_t StubContent[6]; Section *GOTSection = nullptr; @@ -440,30 +457,31 @@ private: return getMachOX86RelocationKindName(R); } - Expected> + Expected> buildGraph(MemoryBufferRef ObjBuffer) override { auto MachOObj = object::ObjectFile::createMachOObjectFile(ObjBuffer); if (!MachOObj) return MachOObj.takeError(); - return MachOAtomGraphBuilder_x86_64(**MachOObj).buildGraph(); + return MachOLinkGraphBuilder_x86_64(**MachOObj).buildGraph(); } - static Error targetOutOfRangeError(const Atom &A, const Edge &E) { + static Error targetOutOfRangeError(const Block &B, const Edge &E) { std::string ErrMsg; { raw_string_ostream ErrStream(ErrMsg); ErrStream << "Relocation target out of range: "; - printEdge(ErrStream, A, E, getMachOX86RelocationKindName(E.getKind())); + printEdge(ErrStream, B, E, getMachOX86RelocationKindName(E.getKind())); ErrStream << "\n"; } return make_error(std::move(ErrMsg)); } - Error applyFixup(DefinedAtom &A, const Edge &E, char *AtomWorkingMem) const { + Error applyFixup(Block &B, const Edge &E, char *BlockWorkingMem) const { + using namespace support; - char *FixupPtr = AtomWorkingMem + E.getOffset(); - JITTargetAddress FixupAddress = A.getAddress() + E.getOffset(); + char *FixupPtr = BlockWorkingMem + E.getOffset(); + JITTargetAddress FixupAddress = B.getAddress() + E.getOffset(); switch (E.getKind()) { case Branch32: @@ -473,7 +491,7 @@ private: E.getTarget().getAddress() - (FixupAddress + 4) + E.getAddend(); if (Value < std::numeric_limits::min() || Value > std::numeric_limits::max()) - return targetOutOfRangeError(A, E); + return targetOutOfRangeError(B, E); *(little32_t *)FixupPtr = Value; break; } @@ -491,7 +509,7 @@ private: E.getTarget().getAddress() - (FixupAddress + Delta) + E.getAddend(); if (Value < std::numeric_limits::min() || Value > std::numeric_limits::max()) - return targetOutOfRangeError(A, E); + return targetOutOfRangeError(B, E); *(little32_t *)FixupPtr = Value; break; } @@ -503,7 +521,7 @@ private: E.getTarget().getAddress() - (FixupAddress + Delta) + E.getAddend(); if (Value < std::numeric_limits::min() || Value > std::numeric_limits::max()) - return targetOutOfRangeError(A, E); + return 
targetOutOfRangeError(B, E); *(little32_t *)FixupPtr = Value; break; } @@ -520,12 +538,19 @@ private: if (E.getKind() == Delta32 || E.getKind() == NegDelta32) { if (Value < std::numeric_limits::min() || Value > std::numeric_limits::max()) - return targetOutOfRangeError(A, E); + return targetOutOfRangeError(B, E); *(little32_t *)FixupPtr = Value; } else *(little64_t *)FixupPtr = Value; break; } + case Pointer32: { + uint64_t Value = E.getTarget().getAddress() + E.getAddend(); + if (Value > std::numeric_limits::max()) + return targetOutOfRangeError(B, E); + *(ulittle32_t *)FixupPtr = Value; + break; + } default: llvm_unreachable("Unrecognized edge kind"); } @@ -545,10 +570,10 @@ void jitLink_MachO_x86_64(std::unique_ptr Ctx) { if (auto MarkLive = Ctx->getMarkLivePass(TT)) Config.PrePrunePasses.push_back(std::move(MarkLive)); else - Config.PrePrunePasses.push_back(markAllAtomsLive); + Config.PrePrunePasses.push_back(markAllSymbolsLive); // Add an in-place GOT/Stubs pass. - Config.PostPrunePasses.push_back([](AtomGraph &G) -> Error { + Config.PostPrunePasses.push_back([](LinkGraph &G) -> Error { MachO_x86_64_GOTAndStubsBuilder(G).run(); return Error::success(); }); @@ -565,6 +590,8 @@ StringRef getMachOX86RelocationKindName(Edge::Kind R) { switch (R) { case Branch32: return "Branch32"; + case Pointer32: + return "Pointer32"; case Pointer64: return "Pointer64"; case Pointer64Anon: diff --git a/lib/ExecutionEngine/MCJIT/MCJIT.cpp b/lib/ExecutionEngine/MCJIT/MCJIT.cpp index 08815b7a80ae..94741f5f01d5 100644 --- a/lib/ExecutionEngine/MCJIT/MCJIT.cpp +++ b/lib/ExecutionEngine/MCJIT/MCJIT.cpp @@ -23,7 +23,7 @@ #include "llvm/Support/DynamicLibrary.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MemoryBuffer.h" -#include "llvm/Support/MutexGuard.h" +#include using namespace llvm; @@ -88,7 +88,7 @@ MCJIT::MCJIT(std::unique_ptr M, std::unique_ptr TM, } MCJIT::~MCJIT() { - MutexGuard locked(lock); + std::lock_guard locked(lock); Dyld.deregisterEHFrames(); @@ -100,7 +100,7 @@ MCJIT::~MCJIT() { } void MCJIT::addModule(std::unique_ptr M) { - MutexGuard locked(lock); + std::lock_guard locked(lock); if (M->getDataLayout().isDefault()) M->setDataLayout(getDataLayout()); @@ -109,7 +109,7 @@ void MCJIT::addModule(std::unique_ptr M) { } bool MCJIT::removeModule(Module *M) { - MutexGuard locked(lock); + std::lock_guard locked(lock); return OwnedModules.removeModule(M); } @@ -136,14 +136,14 @@ void MCJIT::addArchive(object::OwningBinary A) { } void MCJIT::setObjectCache(ObjectCache* NewCache) { - MutexGuard locked(lock); + std::lock_guard locked(lock); ObjCache = NewCache; } std::unique_ptr MCJIT::emitObject(Module *M) { assert(M && "Can not emit a null module"); - MutexGuard locked(lock); + std::lock_guard locked(lock); // Materialize all globals in the module if they have not been // materialized already. @@ -185,7 +185,7 @@ std::unique_ptr MCJIT::emitObject(Module *M) { void MCJIT::generateCodeForModule(Module *M) { // Get a thread lock to make sure we aren't trying to load multiple times - MutexGuard locked(lock); + std::lock_guard locked(lock); // This must be a module which has already been added to this MCJIT instance. assert(OwnedModules.ownsModule(M) && @@ -234,7 +234,7 @@ void MCJIT::generateCodeForModule(Module *M) { } void MCJIT::finalizeLoadedModules() { - MutexGuard locked(lock); + std::lock_guard locked(lock); // Resolve any outstanding relocations. Dyld.resolveRelocations(); @@ -250,7 +250,7 @@ void MCJIT::finalizeLoadedModules() { // FIXME: Rename this. 
void MCJIT::finalizeObject() { - MutexGuard locked(lock); + std::lock_guard locked(lock); // Generate code for module is going to move objects out of the 'added' list, // so we need to copy that out before using it: @@ -265,7 +265,7 @@ void MCJIT::finalizeObject() { } void MCJIT::finalizeModule(Module *M) { - MutexGuard locked(lock); + std::lock_guard locked(lock); // This must be a module which has already been added to this MCJIT instance. assert(OwnedModules.ownsModule(M) && "MCJIT::finalizeModule: Unknown module."); @@ -292,7 +292,7 @@ Module *MCJIT::findModuleForSymbol(const std::string &Name, if (DemangledName[0] == getDataLayout().getGlobalPrefix()) DemangledName = DemangledName.substr(1); - MutexGuard locked(lock); + std::lock_guard locked(lock); // If it hasn't already been generated, see if it's in one of our modules. for (ModulePtrSet::iterator I = OwnedModules.begin_added(), @@ -332,7 +332,7 @@ uint64_t MCJIT::getSymbolAddress(const std::string &Name, JITSymbol MCJIT::findSymbol(const std::string &Name, bool CheckFunctionsOnly) { - MutexGuard locked(lock); + std::lock_guard locked(lock); // First, check to see if we already have this symbol. if (auto Sym = findExistingSymbol(Name)) @@ -388,7 +388,7 @@ JITSymbol MCJIT::findSymbol(const std::string &Name, } uint64_t MCJIT::getGlobalValueAddress(const std::string &Name) { - MutexGuard locked(lock); + std::lock_guard locked(lock); uint64_t Result = getSymbolAddress(Name, false); if (Result != 0) finalizeLoadedModules(); @@ -396,7 +396,7 @@ uint64_t MCJIT::getGlobalValueAddress(const std::string &Name) { } uint64_t MCJIT::getFunctionAddress(const std::string &Name) { - MutexGuard locked(lock); + std::lock_guard locked(lock); uint64_t Result = getSymbolAddress(Name, true); if (Result != 0) finalizeLoadedModules(); @@ -405,7 +405,7 @@ uint64_t MCJIT::getFunctionAddress(const std::string &Name) { // Deprecated. Use getFunctionAddress instead. 
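The MutexGuard changes in MCJIT.cpp above, and in the OProfile files further down, all follow one mechanical pattern: llvm/Support/MutexGuard.h is dropped, <mutex> is included instead, and each llvm::MutexGuard is replaced by a std::lock_guard over the same pre-existing sys::Mutex member. A minimal sketch of that pattern, using a hypothetical class rather than code from this patch:

    #include "llvm/Support/Mutex.h"
    #include <mutex>

    class Counter {
      llvm::sys::Mutex lock;   // the same kind of mutex member MCJIT already has
      unsigned Value = 0;

    public:
      unsigned next() {
        // Previously: llvm::MutexGuard locked(lock);
        std::lock_guard<llvm::sys::Mutex> locked(lock);
        return ++Value;
      }
    };
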
void *MCJIT::getPointerToFunction(Function *F) { - MutexGuard locked(lock); + std::lock_guard locked(lock); Mangler Mang; SmallString<128> Name; @@ -632,14 +632,14 @@ void *MCJIT::getPointerToNamedFunction(StringRef Name, bool AbortOnFailure) { void MCJIT::RegisterJITEventListener(JITEventListener *L) { if (!L) return; - MutexGuard locked(lock); + std::lock_guard locked(lock); EventListeners.push_back(L); } void MCJIT::UnregisterJITEventListener(JITEventListener *L) { if (!L) return; - MutexGuard locked(lock); + std::lock_guard locked(lock); auto I = find(reverse(EventListeners), L); if (I != EventListeners.rend()) { std::swap(*I, EventListeners.back()); @@ -651,7 +651,7 @@ void MCJIT::notifyObjectLoaded(const object::ObjectFile &Obj, const RuntimeDyld::LoadedObjectInfo &L) { uint64_t Key = static_cast(reinterpret_cast(Obj.getData().data())); - MutexGuard locked(lock); + std::lock_guard locked(lock); MemMgr->notifyObjectLoaded(this, Obj); for (unsigned I = 0, S = EventListeners.size(); I < S; ++I) { EventListeners[I]->notifyObjectLoaded(Key, Obj, L); @@ -661,7 +661,7 @@ void MCJIT::notifyObjectLoaded(const object::ObjectFile &Obj, void MCJIT::notifyFreeingObject(const object::ObjectFile &Obj) { uint64_t Key = static_cast(reinterpret_cast(Obj.getData().data())); - MutexGuard locked(lock); + std::lock_guard locked(lock); for (JITEventListener *L : EventListeners) L->notifyFreeingObject(Key); } diff --git a/lib/ExecutionEngine/OProfileJIT/OProfileJITEventListener.cpp b/lib/ExecutionEngine/OProfileJIT/OProfileJITEventListener.cpp index 2ad9d24555f3..bb5d96051da9 100644 --- a/lib/ExecutionEngine/OProfileJIT/OProfileJITEventListener.cpp +++ b/lib/ExecutionEngine/OProfileJIT/OProfileJITEventListener.cpp @@ -177,7 +177,7 @@ void OProfileJITEventListener::notifyFreeingObject(ObjectKey Key) { namespace llvm { JITEventListener *JITEventListener::createOProfileJITEventListener() { - return new OProfileJITEventListener(llvm::make_unique()); + return new OProfileJITEventListener(std::make_unique()); } } // namespace llvm diff --git a/lib/ExecutionEngine/OProfileJIT/OProfileWrapper.cpp b/lib/ExecutionEngine/OProfileJIT/OProfileWrapper.cpp index 1a2667736926..b78d2531382d 100644 --- a/lib/ExecutionEngine/OProfileJIT/OProfileWrapper.cpp +++ b/lib/ExecutionEngine/OProfileJIT/OProfileWrapper.cpp @@ -17,11 +17,11 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/DynamicLibrary.h" #include "llvm/Support/Mutex.h" -#include "llvm/Support/MutexGuard.h" #include "llvm/Support/raw_ostream.h" #include #include #include +#include #include #include #include @@ -54,7 +54,7 @@ bool OProfileWrapper::initialize() { using namespace llvm; using namespace llvm::sys; - MutexGuard Guard(OProfileInitializationMutex); + std::lock_guard Guard(OProfileInitializationMutex); if (Initialized) return OpenAgentFunc != 0; diff --git a/lib/ExecutionEngine/Orc/CompileOnDemandLayer.cpp b/lib/ExecutionEngine/Orc/CompileOnDemandLayer.cpp index 99bf53bc3afa..75ddbc30445d 100644 --- a/lib/ExecutionEngine/Orc/CompileOnDemandLayer.cpp +++ b/lib/ExecutionEngine/Orc/CompileOnDemandLayer.cpp @@ -54,11 +54,12 @@ static ThreadSafeModule extractSubModule(ThreadSafeModule &TSM, llvm_unreachable("Unsupported global type"); }; - auto NewTSMod = cloneToNewContext(TSM, ShouldExtract, DeleteExtractedDefs); - auto &M = *NewTSMod.getModule(); - M.setModuleIdentifier((M.getModuleIdentifier() + Suffix).str()); + auto NewTSM = cloneToNewContext(TSM, ShouldExtract, DeleteExtractedDefs); + NewTSM.withModuleDo([&](Module &M) { + 
M.setModuleIdentifier((M.getModuleIdentifier() + Suffix).str()); + }); - return NewTSMod; + return NewTSM; } namespace llvm { @@ -117,39 +118,44 @@ void CompileOnDemandLayer::setPartitionFunction(PartitionFunction Partition) { this->Partition = std::move(Partition); } +void CompileOnDemandLayer::setImplMap(ImplSymbolMap *Imp) { + this->AliaseeImpls = Imp; +} void CompileOnDemandLayer::emit(MaterializationResponsibility R, ThreadSafeModule TSM) { - assert(TSM.getModule() && "Null module"); + assert(TSM && "Null module"); auto &ES = getExecutionSession(); - auto &M = *TSM.getModule(); - - // First, do some cleanup on the module: - cleanUpModule(M); - // Now sort the callables and non-callables, build re-exports and lodge the + // Sort the callables and non-callables, build re-exports and lodge the // actual module with the implementation dylib. auto &PDR = getPerDylibResources(R.getTargetJITDylib()); - MangleAndInterner Mangle(ES, M.getDataLayout()); SymbolAliasMap NonCallables; SymbolAliasMap Callables; - for (auto &GV : M.global_values()) { - if (GV.isDeclaration() || GV.hasLocalLinkage() || GV.hasAppendingLinkage()) - continue; - - auto Name = Mangle(GV.getName()); - auto Flags = JITSymbolFlags::fromGlobalValue(GV); - if (Flags.isCallable()) - Callables[Name] = SymbolAliasMapEntry(Name, Flags); - else - NonCallables[Name] = SymbolAliasMapEntry(Name, Flags); - } + TSM.withModuleDo([&](Module &M) { + // First, do some cleanup on the module: + cleanUpModule(M); + + MangleAndInterner Mangle(ES, M.getDataLayout()); + for (auto &GV : M.global_values()) { + if (GV.isDeclaration() || GV.hasLocalLinkage() || + GV.hasAppendingLinkage()) + continue; + + auto Name = Mangle(GV.getName()); + auto Flags = JITSymbolFlags::fromGlobalValue(GV); + if (Flags.isCallable()) + Callables[Name] = SymbolAliasMapEntry(Name, Flags); + else + NonCallables[Name] = SymbolAliasMapEntry(Name, Flags); + } + }); // Create a partitioning materialization unit and lodge it with the // implementation dylib. if (auto Err = PDR.getImplDylib().define( - llvm::make_unique( + std::make_unique( ES, std::move(TSM), R.getVModuleKey(), *this))) { ES.reportError(std::move(Err)); R.failMaterialization(); @@ -158,7 +164,7 @@ void CompileOnDemandLayer::emit(MaterializationResponsibility R, R.replace(reexports(PDR.getImplDylib(), std::move(NonCallables), true)); R.replace(lazyReexports(LCTMgr, PDR.getISManager(), PDR.getImplDylib(), - std::move(Callables))); + std::move(Callables), AliaseeImpls)); } CompileOnDemandLayer::PerDylibResources & @@ -239,14 +245,16 @@ void CompileOnDemandLayer::emitPartition( // memory manager instance to the linking layer. auto &ES = getExecutionSession(); - GlobalValueSet RequestedGVs; for (auto &Name : R.getRequestedSymbols()) { assert(Defs.count(Name) && "No definition for symbol"); RequestedGVs.insert(Defs[Name]); } - auto GVsToExtract = Partition(RequestedGVs); + /// Perform partitioning with the context lock held, since the partition + /// function is allowed to access the globals to compute the partition. + auto GVsToExtract = + TSM.withModuleDo([&](Module &M) { return Partition(RequestedGVs); }); // Take a 'None' partition to mean the whole module (as opposed to an empty // partition, which means "materialize nothing"). Emit the whole module @@ -259,43 +267,52 @@ void CompileOnDemandLayer::emitPartition( // If the partition is empty, return the whole module to the symbol table. 
if (GVsToExtract->empty()) { - R.replace(llvm::make_unique( + R.replace(std::make_unique( std::move(TSM), R.getSymbols(), std::move(Defs), *this)); return; } // Ok -- we actually need to partition the symbols. Promote the symbol - // linkages/names. - // FIXME: We apply this once per partitioning. It's safe, but overkill. - { - auto PromotedGlobals = PromoteSymbols(*TSM.getModule()); - if (!PromotedGlobals.empty()) { - MangleAndInterner Mangle(ES, TSM.getModule()->getDataLayout()); - SymbolFlagsMap SymbolFlags; - for (auto &GV : PromotedGlobals) - SymbolFlags[Mangle(GV->getName())] = - JITSymbolFlags::fromGlobalValue(*GV); - if (auto Err = R.defineMaterializing(SymbolFlags)) { - ES.reportError(std::move(Err)); - R.failMaterialization(); - return; - } - } + // linkages/names, expand the partition to include any required symbols + // (i.e. symbols that can't be separated from our partition), and + // then extract the partition. + // + // FIXME: We apply this promotion once per partitioning. It's safe, but + // overkill. + + auto ExtractedTSM = + TSM.withModuleDo([&](Module &M) -> Expected { + auto PromotedGlobals = PromoteSymbols(M); + if (!PromotedGlobals.empty()) { + MangleAndInterner Mangle(ES, M.getDataLayout()); + SymbolFlagsMap SymbolFlags; + for (auto &GV : PromotedGlobals) + SymbolFlags[Mangle(GV->getName())] = + JITSymbolFlags::fromGlobalValue(*GV); + if (auto Err = R.defineMaterializing(SymbolFlags)) + return std::move(Err); + } + + expandPartition(*GVsToExtract); + + // Extract the requested partiton (plus any necessary aliases) and + // put the rest back into the impl dylib. + auto ShouldExtract = [&](const GlobalValue &GV) -> bool { + return GVsToExtract->count(&GV); + }; + + return extractSubModule(TSM, ".submodule", ShouldExtract); + }); + + if (!ExtractedTSM) { + ES.reportError(ExtractedTSM.takeError()); + R.failMaterialization(); + return; } - expandPartition(*GVsToExtract); - - // Extract the requested partiton (plus any necessary aliases) and - // put the rest back into the impl dylib. 
- auto ShouldExtract = [&](const GlobalValue &GV) -> bool { - return GVsToExtract->count(&GV); - }; - - auto ExtractedTSM = extractSubModule(TSM, ".submodule", ShouldExtract); - R.replace(llvm::make_unique( + R.replace(std::make_unique( ES, std::move(TSM), R.getVModuleKey(), *this)); - - BaseLayer.emit(std::move(R), std::move(ExtractedTSM)); + BaseLayer.emit(std::move(R), std::move(*ExtractedTSM)); } } // end namespace orc diff --git a/lib/ExecutionEngine/Orc/CompileUtils.cpp b/lib/ExecutionEngine/Orc/CompileUtils.cpp index d46b6fcf9a5f..f8251627a4ef 100644 --- a/lib/ExecutionEngine/Orc/CompileUtils.cpp +++ b/lib/ExecutionEngine/Orc/CompileUtils.cpp @@ -42,7 +42,7 @@ SimpleCompiler::CompileResult SimpleCompiler::operator()(Module &M) { PM.run(M); } - auto ObjBuffer = llvm::make_unique( + auto ObjBuffer = std::make_unique( std::move(ObjBufferSV), ""); diff --git a/lib/ExecutionEngine/Orc/Core.cpp b/lib/ExecutionEngine/Orc/Core.cpp index dac37e030e0c..5c7d888c2d6e 100644 --- a/lib/ExecutionEngine/Orc/Core.cpp +++ b/lib/ExecutionEngine/Orc/Core.cpp @@ -151,6 +151,8 @@ raw_ostream &operator<<(raw_ostream &OS, const SymbolNameSet &Symbols) { } raw_ostream &operator<<(raw_ostream &OS, const JITSymbolFlags &Flags) { + if (Flags.hasError()) + OS << "[*ERROR*]"; if (Flags.isCallable()) OS << "[Callable]"; else @@ -224,7 +226,7 @@ raw_ostream &operator<<(raw_ostream &OS, const SymbolAliasMap &Aliases) { for (auto &KV : Aliases) OS << " " << *KV.first << ": " << KV.second.Aliasee << " " << KV.second.AliasFlags; - OS << " }\n"; + OS << " }"; return OS; } @@ -238,15 +240,18 @@ raw_ostream &operator<<(raw_ostream &OS, const SymbolState &S) { return OS << "Materializing"; case SymbolState::Resolved: return OS << "Resolved"; + case SymbolState::Emitted: + return OS << "Emitted"; case SymbolState::Ready: return OS << "Ready"; } llvm_unreachable("Invalid state"); } -FailedToMaterialize::FailedToMaterialize(SymbolNameSet Symbols) +FailedToMaterialize::FailedToMaterialize( + std::shared_ptr Symbols) : Symbols(std::move(Symbols)) { - assert(!this->Symbols.empty() && "Can not fail to resolve an empty set"); + assert(!this->Symbols->empty() && "Can not fail to resolve an empty set"); } std::error_code FailedToMaterialize::convertToErrorCode() const { @@ -254,7 +259,7 @@ std::error_code FailedToMaterialize::convertToErrorCode() const { } void FailedToMaterialize::log(raw_ostream &OS) const { - OS << "Failed to materialize symbols: " << Symbols; + OS << "Failed to materialize symbols: " << *Symbols; } SymbolsNotFound::SymbolsNotFound(SymbolNameSet Symbols) @@ -367,35 +372,35 @@ SymbolNameSet MaterializationResponsibility::getRequestedSymbols() const { return JD.getRequestedSymbols(SymbolFlags); } -void MaterializationResponsibility::notifyResolved(const SymbolMap &Symbols) { +Error MaterializationResponsibility::notifyResolved(const SymbolMap &Symbols) { LLVM_DEBUG({ dbgs() << "In " << JD.getName() << " resolving " << Symbols << "\n"; }); #ifndef NDEBUG for (auto &KV : Symbols) { + auto WeakFlags = JITSymbolFlags::Weak | JITSymbolFlags::Common; auto I = SymbolFlags.find(KV.first); assert(I != SymbolFlags.end() && "Resolving symbol outside this responsibility set"); - if (I->second.isWeak()) - assert(I->second == (KV.second.getFlags() | JITSymbolFlags::Weak) && - "Resolving symbol with incorrect flags"); - else - assert(I->second == KV.second.getFlags() && - "Resolving symbol with incorrect flags"); + assert((KV.second.getFlags() & ~WeakFlags) == (I->second & ~WeakFlags) && + "Resolving symbol with incorrect 
flags"); } #endif - JD.resolve(Symbols); + return JD.resolve(Symbols); } -void MaterializationResponsibility::notifyEmitted() { +Error MaterializationResponsibility::notifyEmitted() { LLVM_DEBUG({ dbgs() << "In " << JD.getName() << " emitting " << SymbolFlags << "\n"; }); - JD.emit(SymbolFlags); + if (auto Err = JD.emit(SymbolFlags)) + return Err; + SymbolFlags.clear(); + return Error::success(); } Error MaterializationResponsibility::defineMaterializing( @@ -417,12 +422,13 @@ void MaterializationResponsibility::failMaterialization() { << SymbolFlags << "\n"; }); - SymbolNameSet FailedSymbols; - for (auto &KV : SymbolFlags) - FailedSymbols.insert(KV.first); + JITDylib::FailedSymbolsWorklist Worklist; - JD.notifyFailed(FailedSymbols); + for (auto &KV : SymbolFlags) + Worklist.push_back(std::make_pair(&JD, KV.first)); SymbolFlags.clear(); + + JD.notifyFailed(std::move(Worklist)); } void MaterializationResponsibility::replace( @@ -485,8 +491,9 @@ StringRef AbsoluteSymbolsMaterializationUnit::getName() const { void AbsoluteSymbolsMaterializationUnit::materialize( MaterializationResponsibility R) { - R.notifyResolved(Symbols); - R.notifyEmitted(); + // No dependencies, so these calls can't fail. + cantFail(R.notifyResolved(Symbols)); + cantFail(R.notifyEmitted()); } void AbsoluteSymbolsMaterializationUnit::discard(const JITDylib &JD, @@ -625,6 +632,7 @@ void ReExportsMaterializationUnit::materialize( }; auto OnComplete = [QueryInfo](Expected Result) { + auto &ES = QueryInfo->R.getTargetJITDylib().getExecutionSession(); if (Result) { SymbolMap ResolutionMap; for (auto &KV : QueryInfo->Aliases) { @@ -633,10 +641,17 @@ void ReExportsMaterializationUnit::materialize( ResolutionMap[KV.first] = JITEvaluatedSymbol( (*Result)[KV.second.Aliasee].getAddress(), KV.second.AliasFlags); } - QueryInfo->R.notifyResolved(ResolutionMap); - QueryInfo->R.notifyEmitted(); + if (auto Err = QueryInfo->R.notifyResolved(ResolutionMap)) { + ES.reportError(std::move(Err)); + QueryInfo->R.failMaterialization(); + return; + } + if (auto Err = QueryInfo->R.notifyEmitted()) { + ES.reportError(std::move(Err)); + QueryInfo->R.failMaterialization(); + return; + } } else { - auto &ES = QueryInfo->R.getTargetJITDylib().getExecutionSession(); ES.reportError(Result.takeError()); QueryInfo->R.failMaterialization(); } @@ -694,7 +709,7 @@ ReexportsGenerator::ReexportsGenerator(JITDylib &SourceJD, Allow(std::move(Allow)) {} Expected -ReexportsGenerator::operator()(JITDylib &JD, const SymbolNameSet &Names) { +ReexportsGenerator::tryToGenerate(JITDylib &JD, const SymbolNameSet &Names) { orc::SymbolNameSet Added; orc::SymbolAliasMap AliasMap; @@ -716,6 +731,19 @@ ReexportsGenerator::operator()(JITDylib &JD, const SymbolNameSet &Names) { return Added; } +JITDylib::DefinitionGenerator::~DefinitionGenerator() {} + +void JITDylib::removeGenerator(DefinitionGenerator &G) { + ES.runSessionLocked([&]() { + auto I = std::find_if(DefGenerators.begin(), DefGenerators.end(), + [&](const std::unique_ptr &H) { + return H.get() == &G; + }); + assert(I != DefGenerators.end() && "Generator not found"); + DefGenerators.erase(I); + }); +} + Error JITDylib::defineMaterializing(const SymbolFlagsMap &SymbolFlags) { return ES.runSessionLocked([&]() -> Error { std::vector AddedSyms; @@ -823,26 +851,52 @@ void JITDylib::addDependencies(const SymbolStringPtr &Name, assert(Symbols[Name].isInMaterializationPhase() && "Can not add dependencies for a symbol that is not materializing"); + // If Name is already in an error state then just bail out. 
+ if (Symbols[Name].getFlags().hasError()) + return; + auto &MI = MaterializingInfos[Name]; - assert(!MI.IsEmitted && "Can not add dependencies to an emitted symbol"); + assert(Symbols[Name].getState() != SymbolState::Emitted && + "Can not add dependencies to an emitted symbol"); + bool DependsOnSymbolInErrorState = false; + + // Register dependencies, record whether any depenendency is in the error + // state. for (auto &KV : Dependencies) { assert(KV.first && "Null JITDylib in dependency?"); auto &OtherJITDylib = *KV.first; auto &DepsOnOtherJITDylib = MI.UnemittedDependencies[&OtherJITDylib]; for (auto &OtherSymbol : KV.second) { + + // Check the sym entry for the dependency. + auto OtherSymI = OtherJITDylib.Symbols.find(OtherSymbol); + #ifndef NDEBUG - // Assert that this symbol exists and has not been emitted already. - auto SymI = OtherJITDylib.Symbols.find(OtherSymbol); - assert(SymI != OtherJITDylib.Symbols.end() && - (SymI->second.getState() != SymbolState::Ready && - "Dependency on emitted symbol")); + // Assert that this symbol exists and has not reached the ready state + // already. + assert(OtherSymI != OtherJITDylib.Symbols.end() && + (OtherSymI->second.getState() != SymbolState::Ready && + "Dependency on emitted/ready symbol")); #endif + auto &OtherSymEntry = OtherSymI->second; + + // If the dependency is in an error state then note this and continue, + // we will move this symbol to the error state below. + if (OtherSymEntry.getFlags().hasError()) { + DependsOnSymbolInErrorState = true; + continue; + } + + // If the dependency was not in the error state then add it to + // our list of dependencies. + assert(OtherJITDylib.MaterializingInfos.count(OtherSymbol) && + "No MaterializingInfo for dependency"); auto &OtherMI = OtherJITDylib.MaterializingInfos[OtherSymbol]; - if (OtherMI.IsEmitted) + if (OtherSymEntry.getState() == SymbolState::Emitted) transferEmittedNodeDependencies(MI, Name, OtherMI); else if (&OtherJITDylib != this || OtherSymbol != Name) { OtherMI.Dependants[this].insert(Name); @@ -853,63 +907,142 @@ void JITDylib::addDependencies(const SymbolStringPtr &Name, if (DepsOnOtherJITDylib.empty()) MI.UnemittedDependencies.erase(&OtherJITDylib); } + + // If this symbol dependended on any symbols in the error state then move + // this symbol to the error state too. + if (DependsOnSymbolInErrorState) + Symbols[Name].setFlags(Symbols[Name].getFlags() | JITSymbolFlags::HasError); } -void JITDylib::resolve(const SymbolMap &Resolved) { - auto CompletedQueries = ES.runSessionLocked([&, this]() { - AsynchronousSymbolQuerySet CompletedQueries; +Error JITDylib::resolve(const SymbolMap &Resolved) { + SymbolNameSet SymbolsInErrorState; + AsynchronousSymbolQuerySet CompletedQueries; + + ES.runSessionLocked([&, this]() { + struct WorklistEntry { + SymbolTable::iterator SymI; + JITEvaluatedSymbol ResolvedSym; + }; + + std::vector Worklist; + Worklist.reserve(Resolved.size()); + + // Build worklist and check for any symbols in the error state. 
for (const auto &KV : Resolved) { - auto &Name = KV.first; - auto Sym = KV.second; - auto I = Symbols.find(Name); + assert(!KV.second.getFlags().hasError() && + "Resolution result can not have error flag set"); - assert(I != Symbols.end() && "Symbol not found"); - assert(!I->second.hasMaterializerAttached() && + auto SymI = Symbols.find(KV.first); + + assert(SymI != Symbols.end() && "Symbol not found"); + assert(!SymI->second.hasMaterializerAttached() && "Resolving symbol with materializer attached?"); - assert(I->second.getState() == SymbolState::Materializing && + assert(SymI->second.getState() == SymbolState::Materializing && "Symbol should be materializing"); - assert(I->second.getAddress() == 0 && "Symbol has already been resolved"); + assert(SymI->second.getAddress() == 0 && + "Symbol has already been resolved"); + + if (SymI->second.getFlags().hasError()) + SymbolsInErrorState.insert(KV.first); + else { + auto Flags = KV.second.getFlags(); + Flags &= ~(JITSymbolFlags::Weak | JITSymbolFlags::Common); + assert(Flags == (SymI->second.getFlags() & + ~(JITSymbolFlags::Weak | JITSymbolFlags::Common)) && + "Resolved flags should match the declared flags"); + + Worklist.push_back( + {SymI, JITEvaluatedSymbol(KV.second.getAddress(), Flags)}); + } + } + + // If any symbols were in the error state then bail out. + if (!SymbolsInErrorState.empty()) + return; + + while (!Worklist.empty()) { + auto SymI = Worklist.back().SymI; + auto ResolvedSym = Worklist.back().ResolvedSym; + Worklist.pop_back(); - assert((Sym.getFlags() & ~JITSymbolFlags::Weak) == - (I->second.getFlags() & ~JITSymbolFlags::Weak) && - "Resolved flags should match the declared flags"); + auto &Name = SymI->first; - // Once resolved, symbols can never be weak. - JITSymbolFlags ResolvedFlags = Sym.getFlags(); - ResolvedFlags &= ~JITSymbolFlags::Weak; - I->second.setAddress(Sym.getAddress()); - I->second.setFlags(ResolvedFlags); - I->second.setState(SymbolState::Resolved); + // Resolved symbols can not be weak: discard the weak flag. + JITSymbolFlags ResolvedFlags = ResolvedSym.getFlags(); + SymI->second.setAddress(ResolvedSym.getAddress()); + SymI->second.setFlags(ResolvedFlags); + SymI->second.setState(SymbolState::Resolved); auto &MI = MaterializingInfos[Name]; for (auto &Q : MI.takeQueriesMeeting(SymbolState::Resolved)) { - Q->notifySymbolMetRequiredState(Name, Sym); + Q->notifySymbolMetRequiredState(Name, ResolvedSym); + Q->removeQueryDependence(*this, Name); if (Q->isComplete()) CompletedQueries.insert(std::move(Q)); } } - - return CompletedQueries; }); + assert((SymbolsInErrorState.empty() || CompletedQueries.empty()) && + "Can't fail symbols and completed queries at the same time"); + + // If we failed any symbols then return an error. + if (!SymbolsInErrorState.empty()) { + auto FailedSymbolsDepMap = std::make_shared(); + (*FailedSymbolsDepMap)[this] = std::move(SymbolsInErrorState); + return make_error(std::move(FailedSymbolsDepMap)); + } + + // Otherwise notify all the completed queries. 
for (auto &Q : CompletedQueries) { assert(Q->isComplete() && "Q not completed"); Q->handleComplete(); } + + return Error::success(); } -void JITDylib::emit(const SymbolFlagsMap &Emitted) { - auto CompletedQueries = ES.runSessionLocked([&, this]() { - AsynchronousSymbolQuerySet CompletedQueries; +Error JITDylib::emit(const SymbolFlagsMap &Emitted) { + AsynchronousSymbolQuerySet CompletedQueries; + SymbolNameSet SymbolsInErrorState; + ES.runSessionLocked([&, this]() { + std::vector Worklist; + + // Scan to build worklist, record any symbols in the erorr state. for (const auto &KV : Emitted) { - const auto &Name = KV.first; + auto &Name = KV.first; + + auto SymI = Symbols.find(Name); + assert(SymI != Symbols.end() && "No symbol table entry for Name"); + + if (SymI->second.getFlags().hasError()) + SymbolsInErrorState.insert(Name); + else + Worklist.push_back(SymI); + } + + // If any symbols were in the error state then bail out. + if (!SymbolsInErrorState.empty()) + return; + + // Otherwise update dependencies and move to the emitted state. + while (!Worklist.empty()) { + auto SymI = Worklist.back(); + Worklist.pop_back(); + + auto &Name = SymI->first; + auto &SymEntry = SymI->second; + + // Move symbol to the emitted state. + assert(SymEntry.getState() == SymbolState::Resolved && + "Emitting from state other than Resolved"); + SymEntry.setState(SymbolState::Emitted); auto MII = MaterializingInfos.find(Name); assert(MII != MaterializingInfos.end() && "Missing MaterializingInfo entry"); - auto &MI = MII->second; // For each dependant, transfer this node's emitted dependencies to @@ -926,8 +1059,12 @@ void JITDylib::emit(const SymbolFlagsMap &Emitted) { auto &DependantMI = DependantMII->second; // Remove the dependant's dependency on this node. + assert(DependantMI.UnemittedDependencies.count(this) && + "Dependant does not have an unemitted dependencies record for " + "this JITDylib"); assert(DependantMI.UnemittedDependencies[this].count(Name) && "Dependant does not count this symbol as a dependency?"); + DependantMI.UnemittedDependencies[this].erase(Name); if (DependantMI.UnemittedDependencies[this].empty()) DependantMI.UnemittedDependencies.erase(this); @@ -936,20 +1073,22 @@ void JITDylib::emit(const SymbolFlagsMap &Emitted) { DependantJD.transferEmittedNodeDependencies(DependantMI, DependantName, MI); + auto DependantSymI = DependantJD.Symbols.find(DependantName); + assert(DependantSymI != DependantJD.Symbols.end() && + "Dependant has no entry in the Symbols table"); + auto &DependantSymEntry = DependantSymI->second; + // If the dependant is emitted and this node was the last of its // unemitted dependencies then the dependant node is now ready, so // notify any pending queries on the dependant node. - if (DependantMI.IsEmitted && + if (DependantSymEntry.getState() == SymbolState::Emitted && DependantMI.UnemittedDependencies.empty()) { assert(DependantMI.Dependants.empty() && "Dependants should be empty by now"); // Since this dependant is now ready, we erase its MaterializingInfo // and update its materializing state. 
- auto DependantSymI = DependantJD.Symbols.find(DependantName); - assert(DependantSymI != DependantJD.Symbols.end() && - "Dependant has no entry in the Symbols table"); - DependantSymI->second.setState(SymbolState::Ready); + DependantSymEntry.setState(SymbolState::Ready); for (auto &Q : DependantMI.takeQueriesMeeting(SymbolState::Ready)) { Q->notifySymbolMetRequiredState( @@ -963,12 +1102,9 @@ void JITDylib::emit(const SymbolFlagsMap &Emitted) { } } } - MI.Dependants.clear(); - MI.IsEmitted = true; + MI.Dependants.clear(); if (MI.UnemittedDependencies.empty()) { - auto SymI = Symbols.find(Name); - assert(SymI != Symbols.end() && "Symbol has no entry in Symbols table"); SymI->second.setState(SymbolState::Ready); for (auto &Q : MI.takeQueriesMeeting(SymbolState::Ready)) { Q->notifySymbolMetRequiredState(Name, SymI->second.getSymbol()); @@ -979,80 +1115,138 @@ void JITDylib::emit(const SymbolFlagsMap &Emitted) { MaterializingInfos.erase(MII); } } - - return CompletedQueries; }); + assert((SymbolsInErrorState.empty() || CompletedQueries.empty()) && + "Can't fail symbols and completed queries at the same time"); + + // If we failed any symbols then return an error. + if (!SymbolsInErrorState.empty()) { + auto FailedSymbolsDepMap = std::make_shared(); + (*FailedSymbolsDepMap)[this] = std::move(SymbolsInErrorState); + return make_error(std::move(FailedSymbolsDepMap)); + } + + // Otherwise notify all the completed queries. for (auto &Q : CompletedQueries) { assert(Q->isComplete() && "Q is not complete"); Q->handleComplete(); } + + return Error::success(); } -void JITDylib::notifyFailed(const SymbolNameSet &FailedSymbols) { +void JITDylib::notifyFailed(FailedSymbolsWorklist Worklist) { + AsynchronousSymbolQuerySet FailedQueries; + auto FailedSymbolsMap = std::make_shared(); - // FIXME: This should fail any transitively dependant symbols too. + // Failing no symbols is a no-op. + if (Worklist.empty()) + return; - auto FailedQueriesToNotify = ES.runSessionLocked([&, this]() { - AsynchronousSymbolQuerySet FailedQueries; - std::vector MIIsToRemove; + auto &ES = Worklist.front().first->getExecutionSession(); - for (auto &Name : FailedSymbols) { - auto I = Symbols.find(Name); - assert(I != Symbols.end() && "Symbol not present in this JITDylib"); - Symbols.erase(I); + ES.runSessionLocked([&]() { + while (!Worklist.empty()) { + assert(Worklist.back().first && "Failed JITDylib can not be null"); + auto &JD = *Worklist.back().first; + auto Name = std::move(Worklist.back().second); + Worklist.pop_back(); - auto MII = MaterializingInfos.find(Name); + (*FailedSymbolsMap)[&JD].insert(Name); + + assert(JD.Symbols.count(Name) && "No symbol table entry for Name"); + auto &Sym = JD.Symbols[Name]; - // If we have not created a MaterializingInfo for this symbol yet then - // there is nobody to notify. - if (MII == MaterializingInfos.end()) + // Move the symbol into the error state. + // Note that this may be redundant: The symbol might already have been + // moved to this state in response to the failure of a dependence. + Sym.setFlags(Sym.getFlags() | JITSymbolFlags::HasError); + + // FIXME: Come up with a sane mapping of state to + // presence-of-MaterializingInfo so that we can assert presence / absence + // here, rather than testing it. + auto MII = JD.MaterializingInfos.find(Name); + + if (MII == JD.MaterializingInfos.end()) continue; - // Remove this symbol from the dependants list of any dependencies. 
- for (auto &KV : MII->second.UnemittedDependencies) { - auto *DependencyJD = KV.first; - auto &Dependencies = KV.second; - for (auto &DependencyName : Dependencies) { - auto DependencyMII = - DependencyJD->MaterializingInfos.find(DependencyName); - assert(DependencyMII != DependencyJD->MaterializingInfos.end() && - "Unemitted dependency must have a MaterializingInfo entry"); - assert(DependencyMII->second.Dependants.count(this) && - "Dependency's dependants list does not contain this JITDylib"); - assert(DependencyMII->second.Dependants[this].count(Name) && - "Dependency's dependants list does not contain dependant"); - DependencyMII->second.Dependants[this].erase(Name); + auto &MI = MII->second; + + // Move all dependants to the error state and disconnect from them. + for (auto &KV : MI.Dependants) { + auto &DependantJD = *KV.first; + for (auto &DependantName : KV.second) { + assert(DependantJD.Symbols.count(DependantName) && + "No symbol table entry for DependantName"); + auto &DependantSym = DependantJD.Symbols[DependantName]; + DependantSym.setFlags(DependantSym.getFlags() | + JITSymbolFlags::HasError); + + assert(DependantJD.MaterializingInfos.count(DependantName) && + "No MaterializingInfo for dependant"); + auto &DependantMI = DependantJD.MaterializingInfos[DependantName]; + + auto UnemittedDepI = DependantMI.UnemittedDependencies.find(&JD); + assert(UnemittedDepI != DependantMI.UnemittedDependencies.end() && + "No UnemittedDependencies entry for this JITDylib"); + assert(UnemittedDepI->second.count(Name) && + "No UnemittedDependencies entry for this symbol"); + UnemittedDepI->second.erase(Name); + if (UnemittedDepI->second.empty()) + DependantMI.UnemittedDependencies.erase(UnemittedDepI); + + // If this symbol is already in the emitted state then we need to + // take responsibility for failing its queries, so add it to the + // worklist. + if (DependantSym.getState() == SymbolState::Emitted) { + assert(DependantMI.Dependants.empty() && + "Emitted symbol should not have dependants"); + Worklist.push_back(std::make_pair(&DependantJD, DependantName)); + } } } + MI.Dependants.clear(); - // Copy all the queries to the FailedQueries list, then abandon them. - // This has to be a copy, and the copy has to come before the abandon - // operation: Each Q.detach() call will reach back into this - // PendingQueries list to remove Q. - for (auto &Q : MII->second.pendingQueries()) - FailedQueries.insert(Q); - - MIIsToRemove.push_back(std::move(MII)); - } - - // Detach failed queries. - for (auto &Q : FailedQueries) - Q->detach(); + // Disconnect from all unemitted depenencies. + for (auto &KV : MI.UnemittedDependencies) { + auto &UnemittedDepJD = *KV.first; + for (auto &UnemittedDepName : KV.second) { + auto UnemittedDepMII = + UnemittedDepJD.MaterializingInfos.find(UnemittedDepName); + assert(UnemittedDepMII != UnemittedDepJD.MaterializingInfos.end() && + "Missing MII for unemitted dependency"); + assert(UnemittedDepMII->second.Dependants.count(&JD) && + "JD not listed as a dependant of unemitted dependency"); + assert(UnemittedDepMII->second.Dependants[&JD].count(Name) && + "Name is not listed as a dependant of unemitted dependency"); + UnemittedDepMII->second.Dependants[&JD].erase(Name); + if (UnemittedDepMII->second.Dependants[&JD].empty()) + UnemittedDepMII->second.Dependants.erase(&JD); + } + } + MI.UnemittedDependencies.clear(); - // Remove the MaterializingInfos. 
- for (auto &MII : MIIsToRemove) { - assert(!MII->second.hasQueriesPending() && - "Queries remain after symbol was failed"); + // Collect queries to be failed for this MII. + for (auto &Q : MII->second.pendingQueries()) { + // Add the query to the list to be failed and detach it. + FailedQueries.insert(Q); + Q->detach(); + } - MaterializingInfos.erase(MII); + assert(MI.Dependants.empty() && + "Can not delete MaterializingInfo with dependants still attached"); + assert(MI.UnemittedDependencies.empty() && + "Can not delete MaterializingInfo with unemitted dependencies " + "still attached"); + assert(!MI.hasQueriesPending() && + "Can not delete MaterializingInfo with queries pending"); + JD.MaterializingInfos.erase(MII); } - - return FailedQueries; }); - for (auto &Q : FailedQueriesToNotify) - Q->handleFailed(make_error(FailedSymbols)); + for (auto &Q : FailedQueries) + Q->handleFailed(make_error(FailedSymbolsMap)); } void JITDylib::setSearchOrder(JITDylibSearchList NewSearchOrder, @@ -1159,10 +1353,18 @@ Expected JITDylib::lookupFlags(const SymbolNameSet &Names) { if (!Unresolved) return Unresolved.takeError(); - if (DefGenerator && !Unresolved->empty()) { - auto NewDefs = DefGenerator(*this, *Unresolved); + /// Run any definition generators. + for (auto &DG : DefGenerators) { + + // Bail out early if we've resolved everything. + if (Unresolved->empty()) + break; + + // Run this generator. + auto NewDefs = DG->tryToGenerate(*this, *Unresolved); if (!NewDefs) return NewDefs.takeError(); + if (!NewDefs->empty()) { auto Unresolved2 = lookupFlagsImpl(Result, *NewDefs); if (!Unresolved2) @@ -1171,7 +1373,10 @@ Expected JITDylib::lookupFlags(const SymbolNameSet &Names) { assert(Unresolved2->empty() && "All fallback defs should have been found by lookupFlagsImpl"); } - }; + + for (auto &Name : *NewDefs) + Unresolved->erase(Name); + } return Result; }); } @@ -1197,15 +1402,34 @@ Error JITDylib::lodgeQuery(std::shared_ptr &Q, MaterializationUnitList &MUs) { assert(Q && "Query can not be null"); - lodgeQueryImpl(Q, Unresolved, MatchNonExported, MUs); - if (DefGenerator && !Unresolved.empty()) { - auto NewDefs = DefGenerator(*this, Unresolved); + if (auto Err = lodgeQueryImpl(Q, Unresolved, MatchNonExported, MUs)) + return Err; + + // Run any definition generators. + for (auto &DG : DefGenerators) { + + // Bail out early if we have resolved everything. + if (Unresolved.empty()) + break; + + // Run the generator. + auto NewDefs = DG->tryToGenerate(*this, Unresolved); + + // If the generator returns an error then bail out. if (!NewDefs) return NewDefs.takeError(); + + // If the generator was able to generate new definitions for any of the + // unresolved symbols then lodge the query against them. if (!NewDefs->empty()) { for (auto &D : *NewDefs) Unresolved.erase(D); - lodgeQueryImpl(Q, *NewDefs, MatchNonExported, MUs); + + // Lodge query. This can not fail as any new definitions were added + // by the generator under the session locked. Since they can't have + // started materializing yet the can not have failed. + cantFail(lodgeQueryImpl(Q, *NewDefs, MatchNonExported, MUs)); + assert(NewDefs->empty() && "All fallback defs should have been found by lookupImpl"); } @@ -1214,7 +1438,7 @@ Error JITDylib::lodgeQuery(std::shared_ptr &Q, return Error::success(); } -void JITDylib::lodgeQueryImpl( +Error JITDylib::lodgeQueryImpl( std::shared_ptr &Q, SymbolNameSet &Unresolved, bool MatchNonExported, std::vector> &MUs) { @@ -1235,6 +1459,14 @@ void JITDylib::lodgeQueryImpl( // Unresolved set. 
ToRemove.push_back(Name); + // If we matched against this symbol but it is in the error state then + // bail out and treat it as a failure to materialize. + if (SymI->second.getFlags().hasError()) { + auto FailedSymbolsMap = std::make_shared(); + (*FailedSymbolsMap)[this] = {Name}; + return make_error(std::move(FailedSymbolsMap)); + } + // If this symbol already meets the required state for then notify the // query and continue. if (SymI->second.getState() >= Q->getRequiredState()) { @@ -1277,6 +1509,8 @@ void JITDylib::lodgeQueryImpl( // Remove any symbols that we found. for (auto &Name : ToRemove) Unresolved.erase(Name); + + return Error::success(); } Expected @@ -1292,9 +1526,16 @@ JITDylib::legacyLookup(std::shared_ptr Q, SymbolNameSet Unresolved = std::move(Names); auto Err = ES.runSessionLocked([&, this]() -> Error { QueryComplete = lookupImpl(Q, MUs, Unresolved); - if (DefGenerator && !Unresolved.empty()) { + + // Run any definition generators. + for (auto &DG : DefGenerators) { + + // Bail out early if we have resolved everything. + if (Unresolved.empty()) + break; + assert(!QueryComplete && "query complete but unresolved symbols remain?"); - auto NewDefs = DefGenerator(*this, Unresolved); + auto NewDefs = DG->tryToGenerate(*this, Unresolved); if (!NewDefs) return NewDefs.takeError(); if (!NewDefs->empty()) { @@ -1432,8 +1673,6 @@ void JITDylib::dump(raw_ostream &OS) { OS << " MaterializingInfos entries:\n"; for (auto &KV : MaterializingInfos) { OS << " \"" << *KV.first << "\":\n" - << " IsEmitted = " << (KV.second.IsEmitted ? "true" : "false") - << "\n" << " " << KV.second.pendingQueries().size() << " pending queries: { "; for (const auto &Q : KV.second.pendingQueries()) @@ -1486,13 +1725,6 @@ JITDylib::MaterializingInfo::takeQueriesMeeting(SymbolState RequiredState) { return Result; } -JITDylib::AsynchronousSymbolQueryList -JITDylib::MaterializingInfo::takeAllQueries() { - AsynchronousSymbolQueryList Result; - std::swap(Result, PendingQueries); - return Result; -} - JITDylib::JITDylib(ExecutionSession &ES, std::string Name) : ES(ES), JITDylibName(std::move(Name)) { SearchOrder.push_back({this, true}); diff --git a/lib/ExecutionEngine/Orc/ExecutionUtils.cpp b/lib/ExecutionEngine/Orc/ExecutionUtils.cpp index f7fc5f8f1797..4a886ac0597c 100644 --- a/lib/ExecutionEngine/Orc/ExecutionUtils.cpp +++ b/lib/ExecutionEngine/Orc/ExecutionUtils.cpp @@ -8,6 +8,7 @@ #include "llvm/ExecutionEngine/Orc/ExecutionUtils.h" +#include "llvm/ExecutionEngine/Orc/Layer.h" #include "llvm/IR/Constants.h" #include "llvm/IR/Function.h" #include "llvm/IR/GlobalVariable.h" @@ -67,7 +68,7 @@ CtorDtorIterator::Element CtorDtorIterator::operator*() const { } } - ConstantInt *Priority = dyn_cast(CS->getOperand(0)); + auto *Priority = cast(CS->getOperand(0)); Value *Data = CS->getNumOperands() == 3 ? 
CS->getOperand(2) : nullptr; if (Data && !isa(Data)) Data = nullptr; @@ -87,7 +88,7 @@ iterator_range getDestructors(const Module &M) { } void CtorDtorRunner::add(iterator_range CtorDtors) { - if (empty(CtorDtors)) + if (CtorDtors.empty()) return; MangleAndInterner Mangle( @@ -178,20 +179,20 @@ DynamicLibrarySearchGenerator::DynamicLibrarySearchGenerator( : Dylib(std::move(Dylib)), Allow(std::move(Allow)), GlobalPrefix(GlobalPrefix) {} -Expected +Expected> DynamicLibrarySearchGenerator::Load(const char *FileName, char GlobalPrefix, SymbolPredicate Allow) { std::string ErrMsg; auto Lib = sys::DynamicLibrary::getPermanentLibrary(FileName, &ErrMsg); if (!Lib.isValid()) return make_error(std::move(ErrMsg), inconvertibleErrorCode()); - return DynamicLibrarySearchGenerator(std::move(Lib), GlobalPrefix, - std::move(Allow)); + return std::make_unique( + std::move(Lib), GlobalPrefix, std::move(Allow)); } Expected -DynamicLibrarySearchGenerator::operator()(JITDylib &JD, - const SymbolNameSet &Names) { +DynamicLibrarySearchGenerator::tryToGenerate(JITDylib &JD, + const SymbolNameSet &Names) { orc::SymbolNameSet Added; orc::SymbolMap NewSymbols; @@ -226,5 +227,82 @@ DynamicLibrarySearchGenerator::operator()(JITDylib &JD, return Added; } +Expected> +StaticLibraryDefinitionGenerator::Load(ObjectLayer &L, const char *FileName) { + auto ArchiveBuffer = errorOrToExpected(MemoryBuffer::getFile(FileName)); + + if (!ArchiveBuffer) + return ArchiveBuffer.takeError(); + + return Create(L, std::move(*ArchiveBuffer)); +} + +Expected> +StaticLibraryDefinitionGenerator::Create( + ObjectLayer &L, std::unique_ptr ArchiveBuffer) { + Error Err = Error::success(); + + std::unique_ptr ADG( + new StaticLibraryDefinitionGenerator(L, std::move(ArchiveBuffer), Err)); + + if (Err) + return std::move(Err); + + return std::move(ADG); +} + +Expected +StaticLibraryDefinitionGenerator::tryToGenerate(JITDylib &JD, + const SymbolNameSet &Names) { + + DenseSet> ChildBufferInfos; + SymbolNameSet NewDefs; + + for (const auto &Name : Names) { + auto Child = Archive.findSym(*Name); + if (!Child) + return Child.takeError(); + if (*Child == None) + continue; + auto ChildBuffer = (*Child)->getMemoryBufferRef(); + if (!ChildBuffer) + return ChildBuffer.takeError(); + ChildBufferInfos.insert( + {ChildBuffer->getBuffer(), ChildBuffer->getBufferIdentifier()}); + NewDefs.insert(Name); + } + + for (auto ChildBufferInfo : ChildBufferInfos) { + MemoryBufferRef ChildBufferRef(ChildBufferInfo.first, + ChildBufferInfo.second); + + if (auto Err = + L.add(JD, MemoryBuffer::getMemBuffer(ChildBufferRef), VModuleKey())) + return std::move(Err); + + --UnrealizedObjects; + } + + return NewDefs; +} + +StaticLibraryDefinitionGenerator::StaticLibraryDefinitionGenerator( + ObjectLayer &L, std::unique_ptr ArchiveBuffer, Error &Err) + : L(L), ArchiveBuffer(std::move(ArchiveBuffer)), + Archive(*this->ArchiveBuffer, Err) { + + if (Err) + return; + + Error Err2 = Error::success(); + for (auto _ : Archive.children(Err2)) { + (void)_; + ++UnrealizedObjects; + } + + // No need to check this: We will leave it to the caller. + Err = std::move(Err2); +} + } // End namespace orc. } // End namespace llvm. 
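Both generators in ExecutionUtils.cpp above are now constructed through static Load factories that return Expected<std::unique_ptr<...>> and are handed to a JITDylib rather than invoked directly. A rough usage sketch, assuming the templated JITDylib::addGenerator declared in the headers is the counterpart of the removeGenerator added in Core.cpp above; JD, ObjLayer and the library paths are placeholders, not part of this patch:

    #include "llvm/ExecutionEngine/Orc/Core.h"
    #include "llvm/ExecutionEngine/Orc/ExecutionUtils.h"
    #include "llvm/ExecutionEngine/Orc/Layer.h"
    using namespace llvm;

    static Error installGenerators(orc::JITDylib &JD, orc::ObjectLayer &ObjLayer) {
      // Resolve otherwise-unknown symbols from a shared library.
      auto DLSG = orc::DynamicLibrarySearchGenerator::Load("/path/to/libfoo.so",
                                                           /*GlobalPrefix=*/'\0');
      if (!DLSG)
        return DLSG.takeError();
      JD.addGenerator(std::move(*DLSG));

      // Pull archive members defining requested symbols into the object layer.
      auto SLDG =
          orc::StaticLibraryDefinitionGenerator::Load(ObjLayer, "/path/to/libbar.a");
      if (!SLDG)
        return SLDG.takeError();
      JD.addGenerator(std::move(*SLDG));

      return Error::success();
    }
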
diff --git a/lib/ExecutionEngine/Orc/IRCompileLayer.cpp b/lib/ExecutionEngine/Orc/IRCompileLayer.cpp index 81dfc02f55b2..d311f34179c7 100644 --- a/lib/ExecutionEngine/Orc/IRCompileLayer.cpp +++ b/lib/ExecutionEngine/Orc/IRCompileLayer.cpp @@ -22,9 +22,9 @@ void IRCompileLayer::setNotifyCompiled(NotifyCompiledFunction NotifyCompiled) { void IRCompileLayer::emit(MaterializationResponsibility R, ThreadSafeModule TSM) { - assert(TSM.getModule() && "Module must not be null"); + assert(TSM && "Module must not be null"); - if (auto Obj = Compile(*TSM.getModule())) { + if (auto Obj = TSM.withModuleDo(Compile)) { { std::lock_guard Lock(IRLayerMutex); if (NotifyCompiled) diff --git a/lib/ExecutionEngine/Orc/IRTransformLayer.cpp b/lib/ExecutionEngine/Orc/IRTransformLayer.cpp index e3519284613e..845ecc71eb87 100644 --- a/lib/ExecutionEngine/Orc/IRTransformLayer.cpp +++ b/lib/ExecutionEngine/Orc/IRTransformLayer.cpp @@ -19,7 +19,7 @@ IRTransformLayer::IRTransformLayer(ExecutionSession &ES, void IRTransformLayer::emit(MaterializationResponsibility R, ThreadSafeModule TSM) { - assert(TSM.getModule() && "Module must not be null"); + assert(TSM && "Module must not be null"); if (auto TransformedTSM = Transform(std::move(TSM), R)) BaseLayer.emit(std::move(R), std::move(*TransformedTSM)); diff --git a/lib/ExecutionEngine/Orc/IndirectionUtils.cpp b/lib/ExecutionEngine/Orc/IndirectionUtils.cpp index cc3656fe5dc5..0295db7633dd 100644 --- a/lib/ExecutionEngine/Orc/IndirectionUtils.cpp +++ b/lib/ExecutionEngine/Orc/IndirectionUtils.cpp @@ -37,8 +37,9 @@ private: void materialize(MaterializationResponsibility R) override { SymbolMap Result; Result[Name] = JITEvaluatedSymbol(Compile(), JITSymbolFlags::Exported); - R.notifyResolved(Result); - R.notifyEmitted(); + // No dependencies, so these calls cannot fail. 
+ cantFail(R.notifyResolved(Result)); + cantFail(R.notifyEmitted()); } void discard(const JITDylib &JD, const SymbolStringPtr &Name) override { @@ -66,7 +67,7 @@ JITCompileCallbackManager::getCompileCallback(CompileFunction Compile) { std::lock_guard Lock(CCMgrMutex); AddrToSymbol[*TrampolineAddr] = CallbackName; cantFail(CallbacksJD.define( - llvm::make_unique( + std::make_unique( std::move(CallbackName), std::move(Compile), ES.allocateVModule()))); return *TrampolineAddr; @@ -119,7 +120,8 @@ createLocalCompileCallbackManager(const Triple &T, ExecutionSession &ES, return make_error( std::string("No callback manager available for ") + T.str(), inconvertibleErrorCode()); - case Triple::aarch64: { + case Triple::aarch64: + case Triple::aarch64_32: { typedef orc::LocalJITCompileCallbackManager CCMgrT; return CCMgrT::Create(ES, ErrorHandlerAddress); } @@ -162,50 +164,51 @@ createLocalIndirectStubsManagerBuilder(const Triple &T) { switch (T.getArch()) { default: return [](){ - return llvm::make_unique< + return std::make_unique< orc::LocalIndirectStubsManager>(); }; case Triple::aarch64: + case Triple::aarch64_32: return [](){ - return llvm::make_unique< + return std::make_unique< orc::LocalIndirectStubsManager>(); }; case Triple::x86: return [](){ - return llvm::make_unique< + return std::make_unique< orc::LocalIndirectStubsManager>(); }; case Triple::mips: return [](){ - return llvm::make_unique< + return std::make_unique< orc::LocalIndirectStubsManager>(); }; case Triple::mipsel: return [](){ - return llvm::make_unique< + return std::make_unique< orc::LocalIndirectStubsManager>(); }; case Triple::mips64: case Triple::mips64el: return [](){ - return llvm::make_unique< + return std::make_unique< orc::LocalIndirectStubsManager>(); }; case Triple::x86_64: if (T.getOS() == Triple::OSType::Win32) { return [](){ - return llvm::make_unique< + return std::make_unique< orc::LocalIndirectStubsManager>(); }; } else { return [](){ - return llvm::make_unique< + return std::make_unique< orc::LocalIndirectStubsManager>(); }; } diff --git a/lib/ExecutionEngine/Orc/JITTargetMachineBuilder.cpp b/lib/ExecutionEngine/Orc/JITTargetMachineBuilder.cpp index df23547a9de3..1d3e6db913e2 100644 --- a/lib/ExecutionEngine/Orc/JITTargetMachineBuilder.cpp +++ b/lib/ExecutionEngine/Orc/JITTargetMachineBuilder.cpp @@ -8,6 +8,7 @@ #include "llvm/ExecutionEngine/Orc/JITTargetMachineBuilder.h" +#include "llvm/Support/Host.h" #include "llvm/Support/TargetRegistry.h" namespace llvm { @@ -22,7 +23,21 @@ JITTargetMachineBuilder::JITTargetMachineBuilder(Triple TT) Expected JITTargetMachineBuilder::detectHost() { // FIXME: getProcessTriple is bogus. It returns the host LLVM was compiled on, // rather than a valid triple for the current process. - return JITTargetMachineBuilder(Triple(sys::getProcessTriple())); + JITTargetMachineBuilder TMBuilder((Triple(sys::getProcessTriple()))); + + // Retrieve host CPU name and sub-target features and add them to builder. + // Relocation model, code model and codegen opt level are kept to default + // values. 
+ llvm::SubtargetFeatures SubtargetFeatures; + llvm::StringMap FeatureMap; + llvm::sys::getHostCPUFeatures(FeatureMap); + for (auto &Feature : FeatureMap) + SubtargetFeatures.AddFeature(Feature.first(), Feature.second); + + TMBuilder.setCPU(llvm::sys::getHostCPUName()); + TMBuilder.addFeatures(SubtargetFeatures.getFeatures()); + + return TMBuilder; } Expected> diff --git a/lib/ExecutionEngine/Orc/LLJIT.cpp b/lib/ExecutionEngine/Orc/LLJIT.cpp index b120691faf07..a80f78afe80f 100644 --- a/lib/ExecutionEngine/Orc/LLJIT.cpp +++ b/lib/ExecutionEngine/Orc/LLJIT.cpp @@ -41,7 +41,8 @@ Error LLJIT::defineAbsolute(StringRef Name, JITEvaluatedSymbol Sym) { Error LLJIT::addIRModule(JITDylib &JD, ThreadSafeModule TSM) { assert(TSM && "Can not add null module"); - if (auto Err = applyDataLayout(*TSM.getModule())) + if (auto Err = + TSM.withModuleDo([&](Module &M) { return applyDataLayout(M); })) return Err; return CompileLayer->add(JD, std::move(TSM), ES->allocateVModule()); @@ -63,12 +64,21 @@ LLJIT::createObjectLinkingLayer(LLJITBuilderState &S, ExecutionSession &ES) { // If the config state provided an ObjectLinkingLayer factory then use it. if (S.CreateObjectLinkingLayer) - return S.CreateObjectLinkingLayer(ES); + return S.CreateObjectLinkingLayer(ES, S.JTMB->getTargetTriple()); // Otherwise default to creating an RTDyldObjectLinkingLayer that constructs // a new SectionMemoryManager for each object. - auto GetMemMgr = []() { return llvm::make_unique(); }; - return llvm::make_unique(ES, std::move(GetMemMgr)); + auto GetMemMgr = []() { return std::make_unique(); }; + auto ObjLinkingLayer = + std::make_unique(ES, std::move(GetMemMgr)); + + if (S.JTMB->getTargetTriple().isOSBinFormatCOFF()) + ObjLinkingLayer->setOverrideObjectFlagsWithResponsibilityFlags(true); + + // FIXME: Explicit conversion to std::unique_ptr added to silence + // errors from some GCC / libstdc++ bots. Remove this conversion (i.e. + // just return ObjLinkingLayer) once those bots are upgraded. + return std::unique_ptr(std::move(ObjLinkingLayer)); } Expected @@ -92,7 +102,7 @@ LLJIT::createCompileFunction(LLJITBuilderState &S, } LLJIT::LLJIT(LLJITBuilderState &S, Error &Err) - : ES(S.ES ? std::move(S.ES) : llvm::make_unique()), + : ES(S.ES ? std::move(S.ES) : std::make_unique()), Main(this->ES->getMainJITDylib()), DL(""), CtorRunner(Main), DtorRunner(Main) { @@ -113,13 +123,13 @@ LLJIT::LLJIT(LLJITBuilderState &S, Error &Err) Err = CompileFunction.takeError(); return; } - CompileLayer = llvm::make_unique( + CompileLayer = std::make_unique( *ES, *ObjLinkingLayer, std::move(*CompileFunction)); } if (S.NumCompileThreads > 0) { CompileLayer->setCloneToNewContextOnEmit(true); - CompileThreads = llvm::make_unique(S.NumCompileThreads); + CompileThreads = std::make_unique(S.NumCompileThreads); ES->setDispatchMaterialization( [this](JITDylib &JD, std::unique_ptr MU) { // FIXME: Switch to move capture once we have c++14. 
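The recurring ThreadSafeModule change throughout the Orc layers in this patch (CompileOnDemandLayer, IRCompileLayer, LLJIT, Layer) is that the module is no longer reached through a raw getModule() pointer with a separately obtained context lock: it is only touched inside TSM.withModuleDo(...), which runs the callable under the context lock and forwards its return value (an Error, an Expected, or a plain value, as the diffs show). A small sketch of the pattern; renameModule is a made-up helper, not code from this patch:

    #include "llvm/ExecutionEngine/Orc/ThreadSafeModule.h"
    #include "llvm/IR/Module.h"
    #include "llvm/Support/Error.h"
    using namespace llvm;

    static Error renameModule(orc::ThreadSafeModule &TSM, StringRef NewName) {
      if (!TSM)
        return make_error<StringError>("null ThreadSafeModule",
                                       inconvertibleErrorCode());
      // The lambda runs with the underlying ThreadSafeContext lock held.
      return TSM.withModuleDo([&](Module &M) -> Error {
        M.setModuleIdentifier(NewName);
        return Error::success();
      });
    }
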
@@ -166,10 +176,14 @@ Error LLLazyJITBuilderState::prepareForConstruction() { Error LLLazyJIT::addLazyIRModule(JITDylib &JD, ThreadSafeModule TSM) { assert(TSM && "Can not add null module"); - if (auto Err = applyDataLayout(*TSM.getModule())) - return Err; + if (auto Err = TSM.withModuleDo([&](Module &M) -> Error { + if (auto Err = applyDataLayout(M)) + return Err; - recordCtorDtors(*TSM.getModule()); + recordCtorDtors(M); + return Error::success(); + })) + return Err; return CODLayer->add(JD, std::move(TSM), ES->allocateVModule()); } @@ -212,10 +226,10 @@ LLLazyJIT::LLLazyJIT(LLLazyJITBuilderState &S, Error &Err) : LLJIT(S, Err) { } // Create the transform layer. - TransformLayer = llvm::make_unique(*ES, *CompileLayer); + TransformLayer = std::make_unique(*ES, *CompileLayer); // Create the COD layer. - CODLayer = llvm::make_unique( + CODLayer = std::make_unique( *ES, *TransformLayer, *LCTMgr, std::move(ISMBuilder)); if (S.NumCompileThreads > 0) diff --git a/lib/ExecutionEngine/Orc/Layer.cpp b/lib/ExecutionEngine/Orc/Layer.cpp index 3ed2dabf4545..580e2682ec8c 100644 --- a/lib/ExecutionEngine/Orc/Layer.cpp +++ b/lib/ExecutionEngine/Orc/Layer.cpp @@ -19,7 +19,7 @@ IRLayer::IRLayer(ExecutionSession &ES) : ES(ES) {} IRLayer::~IRLayer() {} Error IRLayer::add(JITDylib &JD, ThreadSafeModule TSM, VModuleKey K) { - return JD.define(llvm::make_unique( + return JD.define(std::make_unique( *this, std::move(K), std::move(TSM))); } @@ -29,15 +29,17 @@ IRMaterializationUnit::IRMaterializationUnit(ExecutionSession &ES, assert(this->TSM && "Module must not be null"); - MangleAndInterner Mangle(ES, this->TSM.getModule()->getDataLayout()); - for (auto &G : this->TSM.getModule()->global_values()) { - if (G.hasName() && !G.isDeclaration() && !G.hasLocalLinkage() && - !G.hasAvailableExternallyLinkage() && !G.hasAppendingLinkage()) { - auto MangledName = Mangle(G.getName()); - SymbolFlags[MangledName] = JITSymbolFlags::fromGlobalValue(G); - SymbolToDefinition[MangledName] = &G; + MangleAndInterner Mangle(ES, this->TSM.getModuleUnlocked()->getDataLayout()); + this->TSM.withModuleDo([&](Module &M) { + for (auto &G : M.global_values()) { + if (G.hasName() && !G.isDeclaration() && !G.hasLocalLinkage() && + !G.hasAvailableExternallyLinkage() && !G.hasAppendingLinkage()) { + auto MangledName = Mangle(G.getName()); + SymbolFlags[MangledName] = JITSymbolFlags::fromGlobalValue(G); + SymbolToDefinition[MangledName] = &G; + } } - } + }); } IRMaterializationUnit::IRMaterializationUnit( @@ -47,8 +49,9 @@ IRMaterializationUnit::IRMaterializationUnit( TSM(std::move(TSM)), SymbolToDefinition(std::move(SymbolToDefinition)) {} StringRef IRMaterializationUnit::getName() const { - if (TSM.getModule()) - return TSM.getModule()->getModuleIdentifier(); + if (TSM) + return TSM.withModuleDo( + [](const Module &M) -> StringRef { return M.getModuleIdentifier(); }); return ""; } @@ -90,7 +93,6 @@ void BasicIRLayerMaterializationUnit::materialize( auto &N = R.getTargetJITDylib().getName(); #endif // NDEBUG - auto Lock = TSM.getContextLock(); LLVM_DEBUG(ES.runSessionLocked( [&]() { dbgs() << "Emitting, for " << N << ", " << *this << "\n"; });); L.emit(std::move(R), std::move(TSM)); diff --git a/lib/ExecutionEngine/Orc/LazyReexports.cpp b/lib/ExecutionEngine/Orc/LazyReexports.cpp index fc8205845654..93aabd817d60 100644 --- a/lib/ExecutionEngine/Orc/LazyReexports.cpp +++ b/lib/ExecutionEngine/Orc/LazyReexports.cpp @@ -50,7 +50,6 @@ LazyCallThroughManager::callThroughToSymbol(JITTargetAddress TrampolineAddr) { SourceJD = I->second.first; 
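The addLazyIRModule and IRMaterializationUnit hunks above replace direct getModule() access with ThreadSafeModule::withModuleDo, which runs the callback under the module's context lock. A minimal sketch of the idiom, assuming nothing beyond the ORC headers:

#include "llvm/ExecutionEngine/Orc/ThreadSafeModule.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include <memory>
#include <string>

using namespace llvm;

// Build a ThreadSafeModule from a fresh context and an empty module.
static orc::ThreadSafeModule makeEmptyModule() {
  auto Ctx = std::make_unique<LLVMContext>();
  auto M = std::make_unique<Module>("empty", *Ctx);
  return orc::ThreadSafeModule(std::move(M), std::move(Ctx));
}

// Access the module only through withModuleDo; the lambda runs with the
// underlying LLVMContext locked, so no explicit getContextLock() is needed.
static std::string identify(orc::ThreadSafeModule &TSM) {
  return TSM.withModuleDo(
      [](Module &M) -> std::string { return M.getModuleIdentifier(); });
}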
SymbolName = I->second.second; } - auto LookupResult = ES.lookup(JITDylibSearchList({{SourceJD, true}}), SymbolName); @@ -91,6 +90,7 @@ createLocalLazyCallThroughManager(const Triple &T, ExecutionSession &ES, inconvertibleErrorCode()); case Triple::aarch64: + case Triple::aarch64_32: return LocalLazyCallThroughManager::Create(ES, ErrorHandlerAddr); @@ -121,7 +121,8 @@ createLocalLazyCallThroughManager(const Triple &T, ExecutionSession &ES, LazyReexportsMaterializationUnit::LazyReexportsMaterializationUnit( LazyCallThroughManager &LCTManager, IndirectStubsManager &ISManager, - JITDylib &SourceJD, SymbolAliasMap CallableAliases, VModuleKey K) + JITDylib &SourceJD, SymbolAliasMap CallableAliases, ImplSymbolMap *SrcJDLoc, + VModuleKey K) : MaterializationUnit(extractFlags(CallableAliases), std::move(K)), LCTManager(LCTManager), ISManager(ISManager), SourceJD(SourceJD), CallableAliases(std::move(CallableAliases)), @@ -129,7 +130,8 @@ LazyReexportsMaterializationUnit::LazyReexportsMaterializationUnit( [&ISManager](JITDylib &JD, const SymbolStringPtr &SymbolName, JITTargetAddress ResolvedAddr) { return ISManager.updatePointer(*SymbolName, ResolvedAddr); - })) {} + })), + AliaseeTable(SrcJDLoc) {} StringRef LazyReexportsMaterializationUnit::getName() const { return ""; @@ -149,7 +151,7 @@ void LazyReexportsMaterializationUnit::materialize( if (!CallableAliases.empty()) R.replace(lazyReexports(LCTManager, ISManager, SourceJD, - std::move(CallableAliases))); + std::move(CallableAliases), AliaseeTable)); IndirectStubsManager::StubInitsMap StubInits; for (auto &Alias : RequestedAliases) { @@ -168,6 +170,9 @@ void LazyReexportsMaterializationUnit::materialize( std::make_pair(*CallThroughTrampoline, Alias.second.AliasFlags); } + if (AliaseeTable != nullptr && !RequestedAliases.empty()) + AliaseeTable->trackImpls(RequestedAliases, &SourceJD); + if (auto Err = ISManager.createStubs(StubInits)) { SourceJD.getExecutionSession().reportError(std::move(Err)); R.failMaterialization(); @@ -178,8 +183,9 @@ void LazyReexportsMaterializationUnit::materialize( for (auto &Alias : RequestedAliases) Stubs[Alias.first] = ISManager.findStub(*Alias.first, false); - R.notifyResolved(Stubs); - R.notifyEmitted(); + // No registered dependencies, so these calls cannot fail. 
+ cantFail(R.notifyResolved(Stubs)); + cantFail(R.notifyEmitted()); } void LazyReexportsMaterializationUnit::discard(const JITDylib &JD, diff --git a/lib/ExecutionEngine/Orc/Legacy.cpp b/lib/ExecutionEngine/Orc/Legacy.cpp index ce6368b57a89..9f9a6730b2c3 100644 --- a/lib/ExecutionEngine/Orc/Legacy.cpp +++ b/lib/ExecutionEngine/Orc/Legacy.cpp @@ -23,7 +23,8 @@ void JITSymbolResolverAdapter::lookup(const LookupSet &Symbols, for (auto &S : Symbols) InternedSymbols.insert(ES.intern(S)); - auto OnResolvedWithUnwrap = [OnResolved](Expected InternedResult) { + auto OnResolvedWithUnwrap = [OnResolved = std::move(OnResolved)]( + Expected InternedResult) mutable { if (!InternedResult) { OnResolved(InternedResult.takeError()); return; @@ -36,7 +37,7 @@ void JITSymbolResolverAdapter::lookup(const LookupSet &Symbols, }; auto Q = std::make_shared( - InternedSymbols, SymbolState::Resolved, OnResolvedWithUnwrap); + InternedSymbols, SymbolState::Resolved, std::move(OnResolvedWithUnwrap)); auto Unresolved = R.lookup(Q, InternedSymbols); if (Unresolved.empty()) { diff --git a/lib/ExecutionEngine/Orc/ObjectLinkingLayer.cpp b/lib/ExecutionEngine/Orc/ObjectLinkingLayer.cpp index def0b300eca1..874decb2ade0 100644 --- a/lib/ExecutionEngine/Orc/ObjectLinkingLayer.cpp +++ b/lib/ExecutionEngine/Orc/ObjectLinkingLayer.cpp @@ -29,6 +29,13 @@ public: std::unique_ptr ObjBuffer) : Layer(Layer), MR(std::move(MR)), ObjBuffer(std::move(ObjBuffer)) {} + ~ObjectLinkingLayerJITLinkContext() { + // If there is an object buffer return function then use it to + // return ownership of the buffer. + if (Layer.ReturnObjectBuffer) + Layer.ReturnObjectBuffer(std::move(ObjBuffer)); + } + JITLinkMemoryManager &getMemoryManager() override { return Layer.MemMgr; } MemoryBufferRef getObjectBuffer() const override { @@ -41,7 +48,7 @@ public: } void lookup(const DenseSet &Symbols, - JITLinkAsyncLookupContinuation LookupContinuation) override { + std::unique_ptr LC) override { JITDylibSearchList SearchOrder; MR.getTargetJITDylib().withSearchOrderDo( @@ -54,18 +61,16 @@ public: InternedSymbols.insert(ES.intern(S)); // OnResolve -- De-intern the symbols and pass the result to the linker. - // FIXME: Capture LookupContinuation by move once we have c++14. 
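Several hunks above reflect notifyResolved() and notifyEmitted() now returning Error: call sites either propagate the error and fail materialization, or wrap the call in cantFail() where, as the comment notes, no registered dependencies can make it fail. A small sketch of both patterns with a stand-in function (mayFail is hypothetical, not an ORC API):

#include "llvm/Support/Error.h"

using namespace llvm;

// Stand-in for an API that was changed to return Error.
static Error mayFail(bool Fail) {
  if (Fail)
    return createStringError(inconvertibleErrorCode(), "materialization failed");
  return Error::success();
}

// Pattern 1: propagate the error to the caller.
static Error propagate() {
  if (Error Err = mayFail(true))
    return Err;
  return Error::success();
}

// Pattern 2: assert success; cantFail aborts if an error is actually returned.
static void assumeSuccess() { cantFail(mayFail(false)); }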
- auto SharedLookupContinuation = - std::make_shared( - std::move(LookupContinuation)); - auto OnResolve = [SharedLookupContinuation](Expected Result) { + auto OnResolve = [this, LookupContinuation = std::move(LC)]( + Expected Result) mutable { + auto Main = Layer.getExecutionSession().intern("_main"); if (!Result) - (*SharedLookupContinuation)(Result.takeError()); + LookupContinuation->run(Result.takeError()); else { AsyncLookupResult LR; for (auto &KV : *Result) LR[*KV.first] = KV.second; - (*SharedLookupContinuation)(std::move(LR)); + LookupContinuation->run(std::move(LR)); } }; @@ -75,29 +80,25 @@ public: }); } - void notifyResolved(AtomGraph &G) override { + void notifyResolved(LinkGraph &G) override { auto &ES = Layer.getExecutionSession(); SymbolFlagsMap ExtraSymbolsToClaim; bool AutoClaim = Layer.AutoClaimObjectSymbols; SymbolMap InternedResult; - for (auto *DA : G.defined_atoms()) - if (DA->hasName() && DA->isGlobal()) { - auto InternedName = ES.intern(DA->getName()); + for (auto *Sym : G.defined_symbols()) + if (Sym->hasName() && Sym->getScope() != Scope::Local) { + auto InternedName = ES.intern(Sym->getName()); JITSymbolFlags Flags; - if (DA->isExported()) - Flags |= JITSymbolFlags::Exported; - if (DA->isWeak()) - Flags |= JITSymbolFlags::Weak; - if (DA->isCallable()) + if (Sym->isCallable()) Flags |= JITSymbolFlags::Callable; - if (DA->isCommon()) - Flags |= JITSymbolFlags::Common; + if (Sym->getScope() == Scope::Default) + Flags |= JITSymbolFlags::Exported; InternedResult[InternedName] = - JITEvaluatedSymbol(DA->getAddress(), Flags); + JITEvaluatedSymbol(Sym->getAddress(), Flags); if (AutoClaim && !MR.getSymbols().count(InternedName)) { assert(!ExtraSymbolsToClaim.count(InternedName) && "Duplicate symbol to claim?"); @@ -105,17 +106,17 @@ public: } } - for (auto *A : G.absolute_atoms()) - if (A->hasName()) { - auto InternedName = ES.intern(A->getName()); + for (auto *Sym : G.absolute_symbols()) + if (Sym->hasName()) { + auto InternedName = ES.intern(Sym->getName()); JITSymbolFlags Flags; Flags |= JITSymbolFlags::Absolute; - if (A->isWeak()) - Flags |= JITSymbolFlags::Weak; - if (A->isCallable()) + if (Sym->isCallable()) Flags |= JITSymbolFlags::Callable; + if (Sym->getLinkage() == Linkage::Weak) + Flags |= JITSymbolFlags::Weak; InternedResult[InternedName] = - JITEvaluatedSymbol(A->getAddress(), Flags); + JITEvaluatedSymbol(Sym->getAddress(), Flags); if (AutoClaim && !MR.getSymbols().count(InternedName)) { assert(!ExtraSymbolsToClaim.count(InternedName) && "Duplicate symbol to claim?"); @@ -126,35 +127,38 @@ public: if (!ExtraSymbolsToClaim.empty()) if (auto Err = MR.defineMaterializing(ExtraSymbolsToClaim)) return notifyFailed(std::move(Err)); - - MR.notifyResolved(InternedResult); - + if (auto Err = MR.notifyResolved(InternedResult)) { + Layer.getExecutionSession().reportError(std::move(Err)); + MR.failMaterialization(); + return; + } Layer.notifyLoaded(MR); } void notifyFinalized( std::unique_ptr A) override { - if (auto Err = Layer.notifyEmitted(MR, std::move(A))) { Layer.getExecutionSession().reportError(std::move(Err)); MR.failMaterialization(); - return; } - MR.notifyEmitted(); + if (auto Err = MR.notifyEmitted()) { + Layer.getExecutionSession().reportError(std::move(Err)); + MR.failMaterialization(); + } } - AtomGraphPassFunction getMarkLivePass(const Triple &TT) const override { - return [this](AtomGraph &G) { return markResponsibilitySymbolsLive(G); }; + LinkGraphPassFunction getMarkLivePass(const Triple &TT) const override { + return [this](LinkGraph &G) { return 
markResponsibilitySymbolsLive(G); }; } Error modifyPassConfig(const Triple &TT, PassConfiguration &Config) override { // Add passes to mark duplicate defs as should-discard, and to walk the - // atom graph to build the symbol dependence graph. + // link graph to build the symbol dependence graph. Config.PrePrunePasses.push_back( - [this](AtomGraph &G) { return markSymbolsToDiscard(G); }); + [this](LinkGraph &G) { return externalizeWeakAndCommonSymbols(G); }); Config.PostPrunePasses.push_back( - [this](AtomGraph &G) { return computeNamedSymbolDependencies(G); }); + [this](LinkGraph &G) { return computeNamedSymbolDependencies(G); }); Layer.modifyPassConfig(MR, TT, Config); @@ -162,65 +166,59 @@ public: } private: - using AnonAtomNamedDependenciesMap = - DenseMap; + using AnonToNamedDependenciesMap = DenseMap; - Error markSymbolsToDiscard(AtomGraph &G) { + Error externalizeWeakAndCommonSymbols(LinkGraph &G) { auto &ES = Layer.getExecutionSession(); - for (auto *DA : G.defined_atoms()) - if (DA->isWeak() && DA->hasName()) { - auto S = ES.intern(DA->getName()); - auto I = MR.getSymbols().find(S); - if (I == MR.getSymbols().end()) - DA->setShouldDiscard(true); + for (auto *Sym : G.defined_symbols()) + if (Sym->hasName() && Sym->getLinkage() == Linkage::Weak) { + if (!MR.getSymbols().count(ES.intern(Sym->getName()))) + G.makeExternal(*Sym); } - for (auto *A : G.absolute_atoms()) - if (A->isWeak() && A->hasName()) { - auto S = ES.intern(A->getName()); - auto I = MR.getSymbols().find(S); - if (I == MR.getSymbols().end()) - A->setShouldDiscard(true); + for (auto *Sym : G.absolute_symbols()) + if (Sym->hasName() && Sym->getLinkage() == Linkage::Weak) { + if (!MR.getSymbols().count(ES.intern(Sym->getName()))) + G.makeExternal(*Sym); } return Error::success(); } - Error markResponsibilitySymbolsLive(AtomGraph &G) const { + Error markResponsibilitySymbolsLive(LinkGraph &G) const { auto &ES = Layer.getExecutionSession(); - for (auto *DA : G.defined_atoms()) - if (DA->hasName() && - MR.getSymbols().count(ES.intern(DA->getName()))) - DA->setLive(true); + for (auto *Sym : G.defined_symbols()) + if (Sym->hasName() && MR.getSymbols().count(ES.intern(Sym->getName()))) + Sym->setLive(true); return Error::success(); } - Error computeNamedSymbolDependencies(AtomGraph &G) { + Error computeNamedSymbolDependencies(LinkGraph &G) { auto &ES = MR.getTargetJITDylib().getExecutionSession(); auto AnonDeps = computeAnonDeps(G); - for (auto *DA : G.defined_atoms()) { + for (auto *Sym : G.defined_symbols()) { // Skip anonymous and non-global atoms: we do not need dependencies for // these. 
- if (!DA->hasName() || !DA->isGlobal()) + if (Sym->getScope() == Scope::Local) continue; - auto DAName = ES.intern(DA->getName()); - SymbolNameSet &DADeps = NamedSymbolDeps[DAName]; + auto SymName = ES.intern(Sym->getName()); + SymbolNameSet &SymDeps = NamedSymbolDeps[SymName]; - for (auto &E : DA->edges()) { - auto &TA = E.getTarget(); + for (auto &E : Sym->getBlock().edges()) { + auto &TargetSym = E.getTarget(); - if (TA.hasName()) - DADeps.insert(ES.intern(TA.getName())); + if (TargetSym.getScope() != Scope::Local) + SymDeps.insert(ES.intern(TargetSym.getName())); else { - assert(TA.isDefined() && "Anonymous atoms must be defined"); - auto &DTA = static_cast(TA); - auto I = AnonDeps.find(&DTA); + assert(TargetSym.isDefined() && + "Anonymous/local symbols must be defined"); + auto I = AnonDeps.find(&TargetSym); if (I != AnonDeps.end()) for (auto &S : I->second) - DADeps.insert(S); + SymDeps.insert(S); } } } @@ -228,58 +226,59 @@ private: return Error::success(); } - AnonAtomNamedDependenciesMap computeAnonDeps(AtomGraph &G) { + AnonToNamedDependenciesMap computeAnonDeps(LinkGraph &G) { auto &ES = MR.getTargetJITDylib().getExecutionSession(); - AnonAtomNamedDependenciesMap DepMap; + AnonToNamedDependenciesMap DepMap; - // For all anonymous atoms: + // For all anonymous symbols: // (1) Add their named dependencies. // (2) Add them to the worklist for further iteration if they have any - // depend on any other anonymous atoms. + // depend on any other anonymous symbols. struct WorklistEntry { - WorklistEntry(DefinedAtom *DA, DenseSet DAAnonDeps) - : DA(DA), DAAnonDeps(std::move(DAAnonDeps)) {} + WorklistEntry(Symbol *Sym, DenseSet SymAnonDeps) + : Sym(Sym), SymAnonDeps(std::move(SymAnonDeps)) {} - DefinedAtom *DA = nullptr; - DenseSet DAAnonDeps; + Symbol *Sym = nullptr; + DenseSet SymAnonDeps; }; std::vector Worklist; - for (auto *DA : G.defined_atoms()) - if (!DA->hasName()) { - auto &DANamedDeps = DepMap[DA]; - DenseSet DAAnonDeps; - - for (auto &E : DA->edges()) { - auto &TA = E.getTarget(); - if (TA.hasName()) - DANamedDeps.insert(ES.intern(TA.getName())); + for (auto *Sym : G.defined_symbols()) + if (!Sym->hasName()) { + auto &SymNamedDeps = DepMap[Sym]; + DenseSet SymAnonDeps; + + for (auto &E : Sym->getBlock().edges()) { + auto &TargetSym = E.getTarget(); + if (TargetSym.hasName()) + SymNamedDeps.insert(ES.intern(TargetSym.getName())); else { - assert(TA.isDefined() && "Anonymous atoms must be defined"); - DAAnonDeps.insert(static_cast(&TA)); + assert(TargetSym.isDefined() && + "Anonymous symbols must be defined"); + SymAnonDeps.insert(&TargetSym); } } - if (!DAAnonDeps.empty()) - Worklist.push_back(WorklistEntry(DA, std::move(DAAnonDeps))); + if (!SymAnonDeps.empty()) + Worklist.push_back(WorklistEntry(Sym, std::move(SymAnonDeps))); } - // Loop over all anonymous atoms with anonymous dependencies, propagating + // Loop over all anonymous symbols with anonymous dependencies, propagating // their respective *named* dependencies. Iterate until we hit a stable // state. 
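The do/while loop that follows implements the fixed-point propagation described in the comment above: anonymous symbols inherit the named dependencies of the anonymous symbols they reference, iterated until nothing changes. A standalone sketch of the same idea on plain integers and strings (no JITLink types):

#include <map>
#include <set>
#include <string>

int main() {
  using Node = int;
  // Node 1 has a named dependency; 2 depends on 1, 3 depends on 2.
  std::map<Node, std::set<std::string>> NamedDeps = {{1, {"foo"}}, {2, {}}, {3, {}}};
  std::map<Node, std::set<Node>> AnonDeps = {{2, {1}}, {3, {2}}};

  bool Changed;
  do {
    Changed = false;
    for (auto &KV : AnonDeps)
      for (Node Target : KV.second)
        for (const std::string &Name : NamedDeps[Target])
          Changed |= NamedDeps[KV.first].insert(Name).second;
  } while (Changed);

  // After convergence, node 3 transitively carries the named dependency "foo".
  return NamedDeps[3].count("foo") ? 0 : 1;
}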
bool Changed; do { Changed = false; for (auto &WLEntry : Worklist) { - auto *DA = WLEntry.DA; - auto &DANamedDeps = DepMap[DA]; - auto &DAAnonDeps = WLEntry.DAAnonDeps; + auto *Sym = WLEntry.Sym; + auto &SymNamedDeps = DepMap[Sym]; + auto &SymAnonDeps = WLEntry.SymAnonDeps; - for (auto *TA : DAAnonDeps) { - auto I = DepMap.find(TA); + for (auto *TargetSym : SymAnonDeps) { + auto I = DepMap.find(TargetSym); if (I != DepMap.end()) for (const auto &S : I->second) - Changed |= DANamedDeps.insert(S).second; + Changed |= SymNamedDeps.insert(S).second; } } } while (Changed); @@ -330,7 +329,7 @@ ObjectLinkingLayer::~ObjectLinkingLayer() { void ObjectLinkingLayer::emit(MaterializationResponsibility R, std::unique_ptr O) { assert(O && "Object must not be null"); - jitLink(llvm::make_unique( + jitLink(std::make_unique( *this, std::move(R), std::move(O))); } @@ -410,7 +409,7 @@ Error ObjectLinkingLayer::removeAllModules() { } EHFrameRegistrationPlugin::EHFrameRegistrationPlugin( - jitlink::EHFrameRegistrar &Registrar) + EHFrameRegistrar &Registrar) : Registrar(Registrar) {} void EHFrameRegistrationPlugin::modifyPassConfig( @@ -419,61 +418,66 @@ void EHFrameRegistrationPlugin::modifyPassConfig( assert(!InProcessLinks.count(&MR) && "Link for MR already being tracked?"); PassConfig.PostFixupPasses.push_back( - createEHFrameRecorderPass(TT, [this, &MR](JITTargetAddress Addr) { + createEHFrameRecorderPass(TT, [this, &MR](JITTargetAddress Addr, + size_t Size) { if (Addr) - InProcessLinks[&MR] = Addr; + InProcessLinks[&MR] = { Addr, Size }; })); } Error EHFrameRegistrationPlugin::notifyEmitted( MaterializationResponsibility &MR) { - auto EHFrameAddrItr = InProcessLinks.find(&MR); - if (EHFrameAddrItr == InProcessLinks.end()) + auto EHFrameRangeItr = InProcessLinks.find(&MR); + if (EHFrameRangeItr == InProcessLinks.end()) return Error::success(); - auto EHFrameAddr = EHFrameAddrItr->second; - assert(EHFrameAddr && "eh-frame addr to register can not be null"); + auto EHFrameRange = EHFrameRangeItr->second; + assert(EHFrameRange.Addr && + "eh-frame addr to register can not be null"); - InProcessLinks.erase(EHFrameAddrItr); + InProcessLinks.erase(EHFrameRangeItr); if (auto Key = MR.getVModuleKey()) - TrackedEHFrameAddrs[Key] = EHFrameAddr; + TrackedEHFrameRanges[Key] = EHFrameRange; else - UntrackedEHFrameAddrs.push_back(EHFrameAddr); + UntrackedEHFrameRanges.push_back(EHFrameRange); - return Registrar.registerEHFrames(EHFrameAddr); + return Registrar.registerEHFrames(EHFrameRange.Addr, EHFrameRange.Size); } Error EHFrameRegistrationPlugin::notifyRemovingModule(VModuleKey K) { - auto EHFrameAddrItr = TrackedEHFrameAddrs.find(K); - if (EHFrameAddrItr == TrackedEHFrameAddrs.end()) + auto EHFrameRangeItr = TrackedEHFrameRanges.find(K); + if (EHFrameRangeItr == TrackedEHFrameRanges.end()) return Error::success(); - auto EHFrameAddr = EHFrameAddrItr->second; - assert(EHFrameAddr && "Tracked eh-frame addr must not be null"); + auto EHFrameRange = EHFrameRangeItr->second; + assert(EHFrameRange.Addr && "Tracked eh-frame range must not be null"); - TrackedEHFrameAddrs.erase(EHFrameAddrItr); + TrackedEHFrameRanges.erase(EHFrameRangeItr); - return Registrar.deregisterEHFrames(EHFrameAddr); + return Registrar.deregisterEHFrames(EHFrameRange.Addr, EHFrameRange.Size); } Error EHFrameRegistrationPlugin::notifyRemovingAllModules() { - std::vector EHFrameAddrs = std::move(UntrackedEHFrameAddrs); - EHFrameAddrs.reserve(EHFrameAddrs.size() + TrackedEHFrameAddrs.size()); + std::vector EHFrameRanges = + 
std::move(UntrackedEHFrameRanges); + EHFrameRanges.reserve(EHFrameRanges.size() + TrackedEHFrameRanges.size()); - for (auto &KV : TrackedEHFrameAddrs) - EHFrameAddrs.push_back(KV.second); + for (auto &KV : TrackedEHFrameRanges) + EHFrameRanges.push_back(KV.second); - TrackedEHFrameAddrs.clear(); + TrackedEHFrameRanges.clear(); Error Err = Error::success(); - while (!EHFrameAddrs.empty()) { - auto EHFrameAddr = EHFrameAddrs.back(); - assert(EHFrameAddr && "Untracked eh-frame addr must not be null"); - EHFrameAddrs.pop_back(); - Err = joinErrors(std::move(Err), Registrar.deregisterEHFrames(EHFrameAddr)); + while (!EHFrameRanges.empty()) { + auto EHFrameRange = EHFrameRanges.back(); + assert(EHFrameRange.Addr && "Untracked eh-frame range must not be null"); + EHFrameRanges.pop_back(); + Err = joinErrors(std::move(Err), + Registrar.deregisterEHFrames(EHFrameRange.Addr, + EHFrameRange.Size)); } return Err; diff --git a/lib/ExecutionEngine/Orc/OrcCBindingsStack.h b/lib/ExecutionEngine/Orc/OrcCBindingsStack.h index 98129e1690d2..e0af3df9d010 100644 --- a/lib/ExecutionEngine/Orc/OrcCBindingsStack.h +++ b/lib/ExecutionEngine/Orc/OrcCBindingsStack.h @@ -97,7 +97,7 @@ public: template std::unique_ptr> createGenericLayer(LayerT &Layer) { - return llvm::make_unique>(Layer); + return std::make_unique>(Layer); } } // end namespace detail @@ -316,7 +316,8 @@ public: if (auto Err = CtorRunner.runViaLayer(*this)) return std::move(Err); - IRStaticDestructorRunners.emplace_back(std::move(DtorNames), K); + IRStaticDestructorRunners.emplace_back(AcknowledgeORCv1Deprecation, + std::move(DtorNames), K); return K; } @@ -326,7 +327,7 @@ public: LLVMOrcSymbolResolverFn ExternalResolver, void *ExternalResolverCtx) { return addIRModule(CompileLayer, std::move(M), - llvm::make_unique(), + std::make_unique(), std::move(ExternalResolver), ExternalResolverCtx); } @@ -340,7 +341,7 @@ public: inconvertibleErrorCode()); return addIRModule(*CODLayer, std::move(M), - llvm::make_unique(), + std::make_unique(), std::move(ExternalResolver), ExternalResolverCtx); } @@ -468,7 +469,7 @@ private: if (!CCMgr) return nullptr; - return llvm::make_unique( + return std::make_unique( AcknowledgeORCv1Deprecation, ES, CompileLayer, [&Resolvers](orc::VModuleKey K) { auto ResolverI = Resolvers.find(K); diff --git a/lib/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.cpp b/lib/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.cpp index b22ecd5f80a1..939cd539d1fb 100644 --- a/lib/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.cpp +++ b/lib/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.cpp @@ -27,9 +27,9 @@ public: // Build an OnResolve callback to unwrap the interned strings and pass them // to the OnResolved callback. - // FIXME: Switch to move capture of OnResolved once we have c++14. auto OnResolvedWithUnwrap = - [OnResolved](Expected InternedResult) { + [OnResolved = std::move(OnResolved)]( + Expected InternedResult) mutable { if (!InternedResult) { OnResolved(InternedResult.takeError()); return; @@ -50,7 +50,7 @@ public: MR.getTargetJITDylib().withSearchOrderDo( [&](const JITDylibSearchList &JDs) { SearchOrder = JDs; }); ES.lookup(SearchOrder, InternedSymbols, SymbolState::Resolved, - OnResolvedWithUnwrap, RegisterDependencies); + std::move(OnResolvedWithUnwrap), RegisterDependencies); } Expected getResponsibilitySet(const LookupSet &Symbols) { @@ -133,8 +133,6 @@ void RTDyldObjectLinkingLayer::emit(MaterializationResponsibility R, JITDylibSearchOrderResolver Resolver(*SharedR); - // FIXME: Switch to move-capture for the 'O' buffer once we have c++14. 
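The FIXME removed above, and the release()/re-wrap workaround deleted just below, are resolved with C++14 init-captures: the buffer is moved straight into the continuation, which makes the lambda move-only and is why these callbacks are typed as llvm::unique_function rather than std::function. A minimal sketch of the idiom (makeContinuation is illustrative, not patch code):

#include "llvm/ADT/FunctionExtras.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/raw_ostream.h"
#include <memory>
#include <utility>

using namespace llvm;

// Move a MemoryBuffer into a callback; the result can be moved but not copied.
static unique_function<void()>
makeContinuation(std::unique_ptr<MemoryBuffer> Buf) {
  return [Buf = std::move(Buf)]() mutable {
    outs() << "object buffer holds " << Buf->getBufferSize() << " bytes\n";
  };
}

// Usage sketch:
//   auto C = makeContinuation(MemoryBuffer::getMemBuffer("example"));
//   C();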
- MemoryBuffer *UnownedObjBuffer = O.release(); jitLinkForORC( **Obj, std::move(O), *MemMgr, Resolver, ProcessAllSections, [this, K, SharedR, &Obj, InternalSymbols]( @@ -143,9 +141,8 @@ void RTDyldObjectLinkingLayer::emit(MaterializationResponsibility R, return onObjLoad(K, *SharedR, **Obj, std::move(LoadedObjInfo), ResolvedSymbols, *InternalSymbols); }, - [this, K, SharedR, UnownedObjBuffer](Error Err) { - std::unique_ptr ObjBuffer(UnownedObjBuffer); - onObjEmit(K, std::move(ObjBuffer), *SharedR, std::move(Err)); + [this, K, SharedR, O = std::move(O)](Error Err) mutable { + onObjEmit(K, std::move(O), *SharedR, std::move(Err)); }); } @@ -184,7 +181,10 @@ Error RTDyldObjectLinkingLayer::onObjLoad( if (auto Err = R.defineMaterializing(ExtraSymbolsToClaim)) return Err; - R.notifyResolved(Symbols); + if (auto Err = R.notifyResolved(Symbols)) { + R.failMaterialization(); + return Err; + } if (NotifyLoaded) NotifyLoaded(K, Obj, *LoadedObjInfo); @@ -201,7 +201,11 @@ void RTDyldObjectLinkingLayer::onObjEmit( return; } - R.notifyEmitted(); + if (auto Err = R.notifyEmitted()) { + getExecutionSession().reportError(std::move(Err)); + R.failMaterialization(); + return; + } if (NotifyEmitted) NotifyEmitted(K, std::move(ObjBuffer)); diff --git a/lib/ExecutionEngine/Orc/SpeculateAnalyses.cpp b/lib/ExecutionEngine/Orc/SpeculateAnalyses.cpp new file mode 100644 index 000000000000..f22acf50419d --- /dev/null +++ b/lib/ExecutionEngine/Orc/SpeculateAnalyses.cpp @@ -0,0 +1,307 @@ +//===-- SpeculateAnalyses.cpp --*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/ExecutionEngine/Orc/SpeculateAnalyses.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/Analysis/BlockFrequencyInfo.h" +#include "llvm/Analysis/BranchProbabilityInfo.h" +#include "llvm/Analysis/CFG.h" +#include "llvm/IR/PassManager.h" +#include "llvm/Passes/PassBuilder.h" +#include "llvm/Support/ErrorHandling.h" + +#include + +namespace { +using namespace llvm; +SmallVector findBBwithCalls(const Function &F, + bool IndirectCall = false) { + SmallVector BBs; + + auto findCallInst = [&IndirectCall](const Instruction &I) { + if (auto Call = dyn_cast(&I)) + return Call->isIndirectCall() ? IndirectCall : true; + else + return false; + }; + for (auto &BB : F) + if (findCallInst(*BB.getTerminator()) || + llvm::any_of(BB.instructionsWithoutDebug(), findCallInst)) + BBs.emplace_back(&BB); + + return BBs; +} +} // namespace + +// Implementations of Queries shouldn't need to lock the resources +// such as LLVMContext, each argument (function) has a non-shared LLVMContext +// Plus, if Queries contain states necessary locking scheme should be provided. 
+namespace llvm { +namespace orc { + +// Collect direct calls only +void SpeculateQuery::findCalles(const BasicBlock *BB, + DenseSet &CallesNames) { + assert(BB != nullptr && "Traversing Null BB to find calls?"); + + auto getCalledFunction = [&CallesNames](const CallBase *Call) { + auto CalledValue = Call->getCalledOperand()->stripPointerCasts(); + if (auto DirectCall = dyn_cast(CalledValue)) + CallesNames.insert(DirectCall->getName()); + }; + for (auto &I : BB->instructionsWithoutDebug()) + if (auto CI = dyn_cast(&I)) + getCalledFunction(CI); + + if (auto II = dyn_cast(BB->getTerminator())) + getCalledFunction(II); +} + +bool SpeculateQuery::isStraightLine(const Function &F) { + return llvm::all_of(F.getBasicBlockList(), [](const BasicBlock &BB) { + return BB.getSingleSuccessor() != nullptr; + }); +} + +// BlockFreqQuery Implementations + +size_t BlockFreqQuery::numBBToGet(size_t numBB) { + // small CFG + if (numBB < 4) + return numBB; + // mid-size CFG + else if (numBB < 20) + return (numBB / 2); + else + return (numBB / 2) + (numBB / 4); +} + +BlockFreqQuery::ResultTy BlockFreqQuery::operator()(Function &F) { + DenseMap> CallerAndCalles; + DenseSet Calles; + SmallVector, 8> BBFreqs; + + PassBuilder PB; + FunctionAnalysisManager FAM; + PB.registerFunctionAnalyses(FAM); + + auto IBBs = findBBwithCalls(F); + + if (IBBs.empty()) + return None; + + auto &BFI = FAM.getResult(F); + + for (const auto I : IBBs) + BBFreqs.push_back({I, BFI.getBlockFreq(I).getFrequency()}); + + assert(IBBs.size() == BBFreqs.size() && "BB Count Mismatch"); + + llvm::sort(BBFreqs.begin(), BBFreqs.end(), + [](decltype(BBFreqs)::const_reference BBF, + decltype(BBFreqs)::const_reference BBS) { + return BBF.second > BBS.second ? true : false; + }); + + // ignoring number of direct calls in a BB + auto Topk = numBBToGet(BBFreqs.size()); + + for (size_t i = 0; i < Topk; i++) + findCalles(BBFreqs[i].first, Calles); + + assert(!Calles.empty() && "Running Analysis on Function with no calls?"); + + CallerAndCalles.insert({F.getName(), std::move(Calles)}); + + return CallerAndCalles; +} + +// SequenceBBQuery Implementation +std::size_t SequenceBBQuery::getHottestBlocks(std::size_t TotalBlocks) { + if (TotalBlocks == 1) + return TotalBlocks; + return TotalBlocks / 2; +} + +// FIXME : find good implementation. +SequenceBBQuery::BlockListTy +SequenceBBQuery::rearrangeBB(const Function &F, const BlockListTy &BBList) { + BlockListTy RearrangedBBSet; + + for (auto &Block : F.getBasicBlockList()) + if (llvm::is_contained(BBList, &Block)) + RearrangedBBSet.push_back(&Block); + + assert(RearrangedBBSet.size() == BBList.size() && + "BasicBlock missing while rearranging?"); + return RearrangedBBSet; +} + +void SequenceBBQuery::traverseToEntryBlock(const BasicBlock *AtBB, + const BlockListTy &CallerBlocks, + const BackEdgesInfoTy &BackEdgesInfo, + const BranchProbabilityInfo *BPI, + VisitedBlocksInfoTy &VisitedBlocks) { + auto Itr = VisitedBlocks.find(AtBB); + if (Itr != VisitedBlocks.end()) { // already visited. + if (!Itr->second.Upward) + return; + Itr->second.Upward = false; + } else { + // Create hint for newly discoverd blocks. 
+ WalkDirection BlockHint; + BlockHint.Upward = false; + // FIXME: Expensive Check + if (llvm::is_contained(CallerBlocks, AtBB)) + BlockHint.CallerBlock = true; + VisitedBlocks.insert(std::make_pair(AtBB, BlockHint)); + } + + const_pred_iterator PIt = pred_begin(AtBB), EIt = pred_end(AtBB); + // Move this check to top, when we have code setup to launch speculative + // compiles for function in entry BB, this triggers the speculative compiles + // before running the program. + if (PIt == EIt) // No Preds. + return; + + DenseSet PredSkipNodes; + + // Since we are checking for predecessor's backedges, this Block + // occurs in second position. + for (auto &I : BackEdgesInfo) + if (I.second == AtBB) + PredSkipNodes.insert(I.first); + + // Skip predecessors which source of back-edges. + for (; PIt != EIt; ++PIt) + // checking EdgeHotness is cheaper + if (BPI->isEdgeHot(*PIt, AtBB) && !PredSkipNodes.count(*PIt)) + traverseToEntryBlock(*PIt, CallerBlocks, BackEdgesInfo, BPI, + VisitedBlocks); +} + +void SequenceBBQuery::traverseToExitBlock(const BasicBlock *AtBB, + const BlockListTy &CallerBlocks, + const BackEdgesInfoTy &BackEdgesInfo, + const BranchProbabilityInfo *BPI, + VisitedBlocksInfoTy &VisitedBlocks) { + auto Itr = VisitedBlocks.find(AtBB); + if (Itr != VisitedBlocks.end()) { // already visited. + if (!Itr->second.Downward) + return; + Itr->second.Downward = false; + } else { + // Create hint for newly discoverd blocks. + WalkDirection BlockHint; + BlockHint.Downward = false; + // FIXME: Expensive Check + if (llvm::is_contained(CallerBlocks, AtBB)) + BlockHint.CallerBlock = true; + VisitedBlocks.insert(std::make_pair(AtBB, BlockHint)); + } + + succ_const_iterator PIt = succ_begin(AtBB), EIt = succ_end(AtBB); + if (PIt == EIt) // No succs. + return; + + // If there are hot edges, then compute SuccSkipNodes. + DenseSet SuccSkipNodes; + + // Since we are checking for successor's backedges, this Block + // occurs in first position. + for (auto &I : BackEdgesInfo) + if (I.first == AtBB) + SuccSkipNodes.insert(I.second); + + for (; PIt != EIt; ++PIt) + if (BPI->isEdgeHot(AtBB, *PIt) && !SuccSkipNodes.count(*PIt)) + traverseToExitBlock(*PIt, CallerBlocks, BackEdgesInfo, BPI, + VisitedBlocks); +} + +// Get Block frequencies for blocks and take most frquently executed block, +// walk towards the entry block from those blocks and discover the basic blocks +// with call. +SequenceBBQuery::BlockListTy +SequenceBBQuery::queryCFG(Function &F, const BlockListTy &CallerBlocks) { + + BlockFreqInfoTy BBFreqs; + VisitedBlocksInfoTy VisitedBlocks; + BackEdgesInfoTy BackEdgesInfo; + + PassBuilder PB; + FunctionAnalysisManager FAM; + PB.registerFunctionAnalyses(FAM); + + auto &BFI = FAM.getResult(F); + + llvm::FindFunctionBackedges(F, BackEdgesInfo); + + for (const auto I : CallerBlocks) + BBFreqs.push_back({I, BFI.getBlockFreq(I).getFrequency()}); + + llvm::sort(BBFreqs, [](decltype(BBFreqs)::const_reference Bbf, + decltype(BBFreqs)::const_reference Bbs) { + return Bbf.second > Bbs.second; + }); + + ArrayRef> HotBlocksRef(BBFreqs); + HotBlocksRef = + HotBlocksRef.drop_back(BBFreqs.size() - getHottestBlocks(BBFreqs.size())); + + BranchProbabilityInfo *BPI = + FAM.getCachedResult(F); + + // visit NHotBlocks, + // traverse upwards to entry + // traverse downwards to end. 
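BlockFreqQuery and SequenceBBQuery::queryCFG above obtain BlockFrequencyInfo by registering the stock function analyses with a standalone FunctionAnalysisManager. A minimal sketch of that setup, reduced to finding the hottest block of a function (hottestBlockFreq is illustrative only):

#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/PassManager.h"
#include "llvm/Passes/PassBuilder.h"
#include <algorithm>
#include <cstdint>

using namespace llvm;

static uint64_t hottestBlockFreq(Function &F) {
  // Register the default function analyses, as the queries above do.
  PassBuilder PB;
  FunctionAnalysisManager FAM;
  PB.registerFunctionAnalyses(FAM);

  // Ask for block frequencies and scan for the maximum.
  auto &BFI = FAM.getResult<BlockFrequencyAnalysis>(F);
  uint64_t Max = 0;
  for (BasicBlock &BB : F)
    Max = std::max(Max, BFI.getBlockFreq(&BB).getFrequency());
  return Max;
}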
+ + for (auto I : HotBlocksRef) { + traverseToEntryBlock(I.first, CallerBlocks, BackEdgesInfo, BPI, + VisitedBlocks); + traverseToExitBlock(I.first, CallerBlocks, BackEdgesInfo, BPI, + VisitedBlocks); + } + + BlockListTy MinCallerBlocks; + for (auto &I : VisitedBlocks) + if (I.second.CallerBlock) + MinCallerBlocks.push_back(std::move(I.first)); + + return rearrangeBB(F, MinCallerBlocks); +} + +SpeculateQuery::ResultTy SequenceBBQuery::operator()(Function &F) { + // reduce the number of lists! + DenseMap> CallerAndCalles; + DenseSet Calles; + BlockListTy SequencedBlocks; + BlockListTy CallerBlocks; + + CallerBlocks = findBBwithCalls(F); + if (CallerBlocks.empty()) + return None; + + if (isStraightLine(F)) + SequencedBlocks = rearrangeBB(F, CallerBlocks); + else + SequencedBlocks = queryCFG(F, CallerBlocks); + + for (auto BB : SequencedBlocks) + findCalles(BB, Calles); + + CallerAndCalles.insert({F.getName(), std::move(Calles)}); + return CallerAndCalles; +} + +} // namespace orc +} // namespace llvm diff --git a/lib/ExecutionEngine/Orc/Speculation.cpp b/lib/ExecutionEngine/Orc/Speculation.cpp new file mode 100644 index 000000000000..f29201c147a1 --- /dev/null +++ b/lib/ExecutionEngine/Orc/Speculation.cpp @@ -0,0 +1,146 @@ +//===---------- speculation.cpp - Utilities for Speculation ----------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/ExecutionEngine/Orc/Speculation.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/Verifier.h" +#include "llvm/Support/Debug.h" + +#include + +namespace llvm { + +namespace orc { + +// ImplSymbolMap methods +void ImplSymbolMap::trackImpls(SymbolAliasMap ImplMaps, JITDylib *SrcJD) { + assert(SrcJD && "Tracking on Null Source .impl dylib"); + std::lock_guard Lockit(ConcurrentAccess); + for (auto &I : ImplMaps) { + auto It = Maps.insert({I.first, {I.second.Aliasee, SrcJD}}); + // check rationale when independent dylibs have same symbol name? + assert(It.second && "ImplSymbols are already tracked for this Symbol?"); + (void)(It); + } +} + +// Trigger Speculative Compiles. +void Speculator::speculateForEntryPoint(Speculator *Ptr, uint64_t StubId) { + assert(Ptr && " Null Address Received in orc_speculate_for "); + Ptr->speculateFor(StubId); +} + +Error Speculator::addSpeculationRuntime(JITDylib &JD, + MangleAndInterner &Mangle) { + JITEvaluatedSymbol ThisPtr(pointerToJITTargetAddress(this), + JITSymbolFlags::Exported); + JITEvaluatedSymbol SpeculateForEntryPtr( + pointerToJITTargetAddress(&speculateForEntryPoint), + JITSymbolFlags::Exported); + return JD.define(absoluteSymbols({ + {Mangle("__orc_speculator"), ThisPtr}, // Data Symbol + {Mangle("__orc_speculate_for"), SpeculateForEntryPtr} // Callable Symbol + })); +} + +// If two modules, share the same LLVMContext, different threads must +// not access them concurrently without locking the associated LLVMContext +// this implementation follows this contract. 
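addSpeculationRuntime above publishes the speculator object and its entry point to JITed code via absoluteSymbols. A sketch of the same mechanism for an arbitrary host function; hook, exposeHook, and the "__my_hook" name are placeholders, not part of the patch:

#include "llvm/ExecutionEngine/JITSymbol.h"
#include "llvm/ExecutionEngine/Orc/Core.h"

using namespace llvm;
using namespace llvm::orc;

static void hook() {}

// Define the host function's address as an absolute, exported symbol in JD so
// JITed code can call it by (mangled) name.
static Error exposeHook(JITDylib &JD, MangleAndInterner &Mangle) {
  JITEvaluatedSymbol HookSym(pointerToJITTargetAddress(&hook),
                             JITSymbolFlags::Exported);
  return JD.define(absoluteSymbols({{Mangle("__my_hook"), HookSym}}));
}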
+void IRSpeculationLayer::emit(MaterializationResponsibility R, + ThreadSafeModule TSM) { + + assert(TSM && "Speculation Layer received Null Module ?"); + assert(TSM.getContext().getContext() != nullptr && + "Module with null LLVMContext?"); + + // Instrumentation of runtime calls, lock the Module + TSM.withModuleDo([this, &R](Module &M) { + auto &MContext = M.getContext(); + auto SpeculatorVTy = StructType::create(MContext, "Class.Speculator"); + auto RuntimeCallTy = FunctionType::get( + Type::getVoidTy(MContext), + {SpeculatorVTy->getPointerTo(), Type::getInt64Ty(MContext)}, false); + auto RuntimeCall = + Function::Create(RuntimeCallTy, Function::LinkageTypes::ExternalLinkage, + "__orc_speculate_for", &M); + auto SpeclAddr = new GlobalVariable( + M, SpeculatorVTy, false, GlobalValue::LinkageTypes::ExternalLinkage, + nullptr, "__orc_speculator"); + + IRBuilder<> Mutator(MContext); + + // QueryAnalysis allowed to transform the IR source, one such example is + // Simplify CFG helps the static branch prediction heuristics! + for (auto &Fn : M.getFunctionList()) { + if (!Fn.isDeclaration()) { + + auto IRNames = QueryAnalysis(Fn); + // Instrument and register if Query has result + if (IRNames.hasValue()) { + + // Emit globals for each function. + auto LoadValueTy = Type::getInt8Ty(MContext); + auto SpeculatorGuard = new GlobalVariable( + M, LoadValueTy, false, GlobalValue::LinkageTypes::InternalLinkage, + ConstantInt::get(LoadValueTy, 0), + "__orc_speculate.guard.for." + Fn.getName()); + SpeculatorGuard->setAlignment(Align::None()); + SpeculatorGuard->setUnnamedAddr(GlobalValue::UnnamedAddr::Local); + + BasicBlock &ProgramEntry = Fn.getEntryBlock(); + // Create BasicBlocks before the program's entry basicblock + BasicBlock *SpeculateBlock = BasicBlock::Create( + MContext, "__orc_speculate.block", &Fn, &ProgramEntry); + BasicBlock *SpeculateDecisionBlock = BasicBlock::Create( + MContext, "__orc_speculate.decision.block", &Fn, SpeculateBlock); + + assert(SpeculateDecisionBlock == &Fn.getEntryBlock() && + "SpeculateDecisionBlock not updated?"); + Mutator.SetInsertPoint(SpeculateDecisionBlock); + + auto LoadGuard = + Mutator.CreateLoad(LoadValueTy, SpeculatorGuard, "guard.value"); + // if just loaded value equal to 0,return true. 
+ auto CanSpeculate = + Mutator.CreateICmpEQ(LoadGuard, ConstantInt::get(LoadValueTy, 0), + "compare.to.speculate"); + Mutator.CreateCondBr(CanSpeculate, SpeculateBlock, &ProgramEntry); + + Mutator.SetInsertPoint(SpeculateBlock); + auto ImplAddrToUint = + Mutator.CreatePtrToInt(&Fn, Type::getInt64Ty(MContext)); + Mutator.CreateCall(RuntimeCallTy, RuntimeCall, + {SpeclAddr, ImplAddrToUint}); + Mutator.CreateStore(ConstantInt::get(LoadValueTy, 1), + SpeculatorGuard); + Mutator.CreateBr(&ProgramEntry); + + assert(Mutator.GetInsertBlock()->getParent() == &Fn && + "IR builder association mismatch?"); + S.registerSymbols(internToJITSymbols(IRNames.getValue()), + &R.getTargetJITDylib()); + } + } + } + }); + + assert(!TSM.withModuleDo([](const Module &M) { return verifyModule(M); }) && + "Speculation Instrumentation breaks IR?"); + + NextLayer.emit(std::move(R), std::move(TSM)); +} + +} // namespace orc +} // namespace llvm diff --git a/lib/ExecutionEngine/Orc/ThreadSafeModule.cpp b/lib/ExecutionEngine/Orc/ThreadSafeModule.cpp index 4cb7376758a7..1f4e6f132115 100644 --- a/lib/ExecutionEngine/Orc/ThreadSafeModule.cpp +++ b/lib/ExecutionEngine/Orc/ThreadSafeModule.cpp @@ -23,41 +23,41 @@ ThreadSafeModule cloneToNewContext(ThreadSafeModule &TSM, if (!ShouldCloneDef) ShouldCloneDef = [](const GlobalValue &) { return true; }; - auto Lock = TSM.getContextLock(); + return TSM.withModuleDo([&](Module &M) { + SmallVector ClonedModuleBuffer; - SmallVector ClonedModuleBuffer; + { + std::set ClonedDefsInSrc; + ValueToValueMapTy VMap; + auto Tmp = CloneModule(M, VMap, [&](const GlobalValue *GV) { + if (ShouldCloneDef(*GV)) { + ClonedDefsInSrc.insert(const_cast(GV)); + return true; + } + return false; + }); - { - std::set ClonedDefsInSrc; - ValueToValueMapTy VMap; - auto Tmp = CloneModule(*TSM.getModule(), VMap, [&](const GlobalValue *GV) { - if (ShouldCloneDef(*GV)) { - ClonedDefsInSrc.insert(const_cast(GV)); - return true; - } - return false; - }); + if (UpdateClonedDefSource) + for (auto *GV : ClonedDefsInSrc) + UpdateClonedDefSource(*GV); - if (UpdateClonedDefSource) - for (auto *GV : ClonedDefsInSrc) - UpdateClonedDefSource(*GV); + BitcodeWriter BCWriter(ClonedModuleBuffer); - BitcodeWriter BCWriter(ClonedModuleBuffer); + BCWriter.writeModule(*Tmp); + BCWriter.writeSymtab(); + BCWriter.writeStrtab(); + } - BCWriter.writeModule(*Tmp); - BCWriter.writeSymtab(); - BCWriter.writeStrtab(); - } + MemoryBufferRef ClonedModuleBufferRef( + StringRef(ClonedModuleBuffer.data(), ClonedModuleBuffer.size()), + "cloned module buffer"); + ThreadSafeContext NewTSCtx(std::make_unique()); - MemoryBufferRef ClonedModuleBufferRef( - StringRef(ClonedModuleBuffer.data(), ClonedModuleBuffer.size()), - "cloned module buffer"); - ThreadSafeContext NewTSCtx(llvm::make_unique()); - - auto ClonedModule = - cantFail(parseBitcodeFile(ClonedModuleBufferRef, *NewTSCtx.getContext())); - ClonedModule->setModuleIdentifier(TSM.getModule()->getName()); - return ThreadSafeModule(std::move(ClonedModule), std::move(NewTSCtx)); + auto ClonedModule = cantFail( + parseBitcodeFile(ClonedModuleBufferRef, *NewTSCtx.getContext())); + ClonedModule->setModuleIdentifier(M.getName()); + return ThreadSafeModule(std::move(ClonedModule), std::move(NewTSCtx)); + }); } } // end namespace orc diff --git a/lib/ExecutionEngine/PerfJITEvents/PerfJITEventListener.cpp b/lib/ExecutionEngine/PerfJITEvents/PerfJITEventListener.cpp index 5606421a3cb0..184388dc4d7a 100644 --- a/lib/ExecutionEngine/PerfJITEvents/PerfJITEventListener.cpp +++ 
b/lib/ExecutionEngine/PerfJITEvents/PerfJITEventListener.cpp @@ -26,11 +26,11 @@ #include "llvm/Support/FileSystem.h" #include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/Mutex.h" -#include "llvm/Support/MutexGuard.h" #include "llvm/Support/Path.h" #include "llvm/Support/Process.h" #include "llvm/Support/Threading.h" #include "llvm/Support/raw_ostream.h" +#include #include // mmap() #include // getpid() @@ -203,7 +203,7 @@ PerfJITEventListener::PerfJITEventListener() : Pid(::getpid()) { return; } - Dumpstream = make_unique(DumpFd, true); + Dumpstream = std::make_unique(DumpFd, true); LLVMPerfJitHeader Header = {0}; if (!FillMachine(Header)) @@ -420,7 +420,7 @@ void PerfJITEventListener::NotifyCode(Expected &Symbol, rec.Tid = get_threadid(); // avoid interspersing output - MutexGuard Guard(Mutex); + std::lock_guard Guard(Mutex); rec.CodeIndex = CodeGeneration++; // under lock! @@ -462,7 +462,7 @@ void PerfJITEventListener::NotifyDebug(uint64_t CodeAddr, // * char name[n] : source file name in ASCII, including null termination // avoid interspersing output - MutexGuard Guard(Mutex); + std::lock_guard Guard(Mutex); Dumpstream->write(reinterpret_cast(&rec), sizeof(rec)); diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp index e26e6ce45db4..2df71a5e5e74 100644 --- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp +++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp @@ -17,10 +17,11 @@ #include "RuntimeDyldMachO.h" #include "llvm/Object/COFF.h" #include "llvm/Object/ELFObjectFile.h" +#include "llvm/Support/Alignment.h" #include "llvm/Support/MSVCErrorWorkarounds.h" #include "llvm/Support/ManagedStatic.h" #include "llvm/Support/MathExtras.h" -#include "llvm/Support/MutexGuard.h" +#include #include @@ -120,7 +121,7 @@ static void dumpSectionMemory(const SectionEntry &S, StringRef State) { // Resolve the relocations for all symbols we currently know about. void RuntimeDyldImpl::resolveRelocations() { - MutexGuard locked(lock); + std::lock_guard locked(lock); // Print out the sections prior to relocation. 
LLVM_DEBUG(for (int i = 0, e = Sections.size(); i != e; ++i) @@ -156,7 +157,7 @@ void RuntimeDyldImpl::resolveLocalRelocations() { void RuntimeDyldImpl::mapSectionAddress(const void *LocalAddress, uint64_t TargetAddress) { - MutexGuard locked(lock); + std::lock_guard locked(lock); for (unsigned i = 0, e = Sections.size(); i != e; ++i) { if (Sections[i].getAddress() == LocalAddress) { reassignSectionAddress(i, TargetAddress); @@ -177,7 +178,7 @@ static Error getOffset(const SymbolRef &Sym, SectionRef Sec, Expected RuntimeDyldImpl::loadObjectImpl(const object::ObjectFile &Obj) { - MutexGuard locked(lock); + std::lock_guard locked(lock); // Save information about our target Arch = (Triple::ArchType)Obj.getArch(); @@ -347,8 +348,12 @@ RuntimeDyldImpl::loadObjectImpl(const object::ObjectFile &Obj) { for (section_iterator SI = Obj.section_begin(), SE = Obj.section_end(); SI != SE; ++SI) { StubMap Stubs; - section_iterator RelocatedSection = SI->getRelocatedSection(); + Expected RelSecOrErr = SI->getRelocatedSection(); + if (!RelSecOrErr) + return RelSecOrErr.takeError(); + + section_iterator RelocatedSection = *RelSecOrErr; if (RelocatedSection == SE) continue; @@ -535,9 +540,10 @@ Error RuntimeDyldImpl::computeTotalAllocSize(const ObjectFile &Obj, bool IsCode = Section.isText(); bool IsReadOnly = isReadOnlyData(Section); - StringRef Name; - if (auto EC = Section.getName(Name)) - return errorCodeToError(EC); + Expected NameOrErr = Section.getName(); + if (!NameOrErr) + return NameOrErr.takeError(); + StringRef Name = *NameOrErr; uint64_t StubBufSize = computeSectionStubBufSize(Obj, Section); @@ -646,7 +652,12 @@ unsigned RuntimeDyldImpl::computeSectionStubBufSize(const ObjectFile &Obj, unsigned StubBufSize = 0; for (section_iterator SI = Obj.section_begin(), SE = Obj.section_end(); SI != SE; ++SI) { - section_iterator RelSecI = SI->getRelocatedSection(); + + Expected RelSecOrErr = SI->getRelocatedSection(); + if (!RelSecOrErr) + report_fatal_error(toString(RelSecOrErr.takeError())); + + section_iterator RelSecI = *RelSecOrErr; if (!(RelSecI == Section)) continue; @@ -727,16 +738,17 @@ Error RuntimeDyldImpl::emitCommonSymbols(const ObjectFile &Obj, // Assign the address of each symbol for (auto &Sym : SymbolsToAllocate) { - uint32_t Align = Sym.getAlignment(); + uint32_t Alignment = Sym.getAlignment(); uint64_t Size = Sym.getCommonSize(); StringRef Name; if (auto NameOrErr = Sym.getName()) Name = *NameOrErr; else return NameOrErr.takeError(); - if (Align) { + if (Alignment) { // This symbol has an alignment requirement. - uint64_t AlignOffset = OffsetToAlignment((uint64_t)Addr, Align); + uint64_t AlignOffset = + offsetToAlignment((uint64_t)Addr, Align(Alignment)); Addr += AlignOffset; Offset += AlignOffset; } @@ -777,9 +789,10 @@ RuntimeDyldImpl::emitSection(const ObjectFile &Obj, // anyway, so we should guarantee that the alignment is always at least 1. 
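The emitCommonSymbols hunk above switches from the old OffsetToAlignment helper to llvm::offsetToAlignment with the new Align type. A tiny self-contained sketch of what that call computes:

#include "llvm/Support/Alignment.h"
#include <cassert>
#include <cstdint>

int main() {
  // Padding needed to move 0x1003 up to the next 16-byte boundary (0x1010).
  uint64_t Addr = 0x1003;
  uint64_t Pad = llvm::offsetToAlignment(Addr, llvm::Align(16));
  assert(Pad == 13 && (Addr + Pad) % 16 == 0);
  (void)Pad;
  return 0;
}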
Alignment = std::max(1u, Alignment); - StringRef Name; - if (auto EC = Section.getName(Name)) - return errorCodeToError(EC); + Expected NameOrErr = Section.getName(); + if (!NameOrErr) + return NameOrErr.takeError(); + StringRef Name = *NameOrErr; StubBufSize = computeSectionStubBufSize(Obj, Section); @@ -917,7 +930,8 @@ void RuntimeDyldImpl::addRelocationForSymbol(const RelocationEntry &RE, uint8_t *RuntimeDyldImpl::createStubFunction(uint8_t *Addr, unsigned AbiVariant) { - if (Arch == Triple::aarch64 || Arch == Triple::aarch64_be) { + if (Arch == Triple::aarch64 || Arch == Triple::aarch64_be || + Arch == Triple::aarch64_32) { // This stub has to be able to access the full address space, // since symbol lookup won't necessarily find a handy, in-range, // PLT stub for functions which could be anywhere. @@ -1175,17 +1189,15 @@ Error RuntimeDyldImpl::resolveExternalSymbols() { } void RuntimeDyldImpl::finalizeAsync( - std::unique_ptr This, std::function OnEmitted, + std::unique_ptr This, + unique_function OnEmitted, std::unique_ptr UnderlyingBuffer) { - // FIXME: Move-capture OnRelocsApplied and UnderlyingBuffer once we have - // c++14. - auto SharedUnderlyingBuffer = - std::shared_ptr(std::move(UnderlyingBuffer)); auto SharedThis = std::shared_ptr(std::move(This)); auto PostResolveContinuation = - [SharedThis, OnEmitted, SharedUnderlyingBuffer]( - Expected Result) { + [SharedThis, OnEmitted = std::move(OnEmitted), + UnderlyingBuffer = std::move(UnderlyingBuffer)]( + Expected Result) mutable { if (!Result) { OnEmitted(Result.takeError()); return; @@ -1219,7 +1231,7 @@ void RuntimeDyldImpl::finalizeAsync( } if (!Symbols.empty()) { - SharedThis->Resolver.lookup(Symbols, PostResolveContinuation); + SharedThis->Resolver.lookup(Symbols, std::move(PostResolveContinuation)); } else PostResolveContinuation(std::map()); } @@ -1395,11 +1407,11 @@ void jitLinkForORC(object::ObjectFile &Obj, std::unique_ptr UnderlyingBuffer, RuntimeDyld::MemoryManager &MemMgr, JITSymbolResolver &Resolver, bool ProcessAllSections, - std::function LoadedObj, std::map)> OnLoaded, - std::function OnEmitted) { + unique_function OnEmitted) { RuntimeDyld RTDyld(MemMgr, Resolver); RTDyld.setProcessAllSections(ProcessAllSections); diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldCOFF.cpp b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldCOFF.cpp index d4e3b0ba7670..27a7690db34f 100644 --- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldCOFF.cpp +++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldCOFF.cpp @@ -50,18 +50,18 @@ llvm::RuntimeDyldCOFF::create(Triple::ArchType Arch, switch (Arch) { default: llvm_unreachable("Unsupported target for RuntimeDyldCOFF."); case Triple::x86: - return make_unique(MemMgr, Resolver); + return std::make_unique(MemMgr, Resolver); case Triple::thumb: - return make_unique(MemMgr, Resolver); + return std::make_unique(MemMgr, Resolver); case Triple::x86_64: - return make_unique(MemMgr, Resolver); + return std::make_unique(MemMgr, Resolver); } } std::unique_ptr RuntimeDyldCOFF::loadObject(const object::ObjectFile &O) { if (auto ObjSectionToIDOrErr = loadObjectImpl(O)) { - return llvm::make_unique(*this, *ObjSectionToIDOrErr); + return std::make_unique(*this, *ObjSectionToIDOrErr); } else { HasError = true; raw_string_ostream ErrStream(ErrorStr); diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldChecker.cpp b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldChecker.cpp index ec31ea4e573c..b9c5a12e08d8 100644 --- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldChecker.cpp +++ 
b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldChecker.cpp @@ -851,7 +851,7 @@ RuntimeDyldChecker::RuntimeDyldChecker( GetGOTInfoFunction GetGOTInfo, support::endianness Endianness, MCDisassembler *Disassembler, MCInstPrinter *InstPrinter, raw_ostream &ErrStream) - : Impl(::llvm::make_unique( + : Impl(::std::make_unique( std::move(IsSymbolValid), std::move(GetSymbolInfo), std::move(GetSectionInfo), std::move(GetStubInfo), std::move(GetGOTInfo), Endianness, Disassembler, InstPrinter, diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp index 60041a45e2b8..440ab4174a56 100644 --- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp +++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp @@ -160,9 +160,13 @@ createRTDyldELFObject(MemoryBufferRef Buffer, const ObjectFile &SourceObject, // Iterate over all sections in the object. auto SI = SourceObject.section_begin(); for (const auto &Sec : Obj->sections()) { - StringRef SectionName; - Sec.getName(SectionName); - if (SectionName != "") { + Expected NameOrErr = Sec.getName(); + if (!NameOrErr) { + consumeError(NameOrErr.takeError()); + continue; + } + + if (*NameOrErr != "") { DataRefImpl ShdrRef = Sec.getRawDataRefImpl(); Elf_Shdr *shdr = const_cast( reinterpret_cast(ShdrRef.p)); @@ -238,19 +242,19 @@ llvm::RuntimeDyldELF::create(Triple::ArchType Arch, JITSymbolResolver &Resolver) { switch (Arch) { default: - return make_unique(MemMgr, Resolver); + return std::make_unique(MemMgr, Resolver); case Triple::mips: case Triple::mipsel: case Triple::mips64: case Triple::mips64el: - return make_unique(MemMgr, Resolver); + return std::make_unique(MemMgr, Resolver); } } std::unique_ptr RuntimeDyldELF::loadObject(const object::ObjectFile &O) { if (auto ObjSectionToIDOrErr = loadObjectImpl(O)) - return llvm::make_unique(*this, *ObjSectionToIDOrErr); + return std::make_unique(*this, *ObjSectionToIDOrErr); else { HasError = true; raw_string_ostream ErrStream(ErrorStr); @@ -567,10 +571,11 @@ Error RuntimeDyldELF::findPPC64TOCSection(const ELFObjectFileBase &Obj, // The TOC consists of sections .got, .toc, .tocbss, .plt in that // order. The TOC starts where the first of these sections starts. 
- for (auto &Section: Obj.sections()) { - StringRef SectionName; - if (auto EC = Section.getName(SectionName)) - return errorCodeToError(EC); + for (auto &Section : Obj.sections()) { + Expected NameOrErr = Section.getName(); + if (!NameOrErr) + return NameOrErr.takeError(); + StringRef SectionName = *NameOrErr; if (SectionName == ".got" || SectionName == ".toc" @@ -601,13 +606,19 @@ Error RuntimeDyldELF::findOPDEntrySection(const ELFObjectFileBase &Obj, // .opd entries for (section_iterator si = Obj.section_begin(), se = Obj.section_end(); si != se; ++si) { - section_iterator RelSecI = si->getRelocatedSection(); + + Expected RelSecOrErr = si->getRelocatedSection(); + if (!RelSecOrErr) + report_fatal_error(toString(RelSecOrErr.takeError())); + + section_iterator RelSecI = *RelSecOrErr; if (RelSecI == Obj.section_end()) continue; - StringRef RelSectionName; - if (auto EC = RelSecI->getName(RelSectionName)) - return errorCodeToError(EC); + Expected NameOrErr = RelSecI->getName(); + if (!NameOrErr) + return NameOrErr.takeError(); + StringRef RelSectionName = *NameOrErr; if (RelSectionName != ".opd") continue; @@ -1865,7 +1876,12 @@ Error RuntimeDyldELF::finalizeLoad(const ObjectFile &Obj, for (section_iterator SI = Obj.section_begin(), SE = Obj.section_end(); SI != SE; ++SI) { if (SI->relocation_begin() != SI->relocation_end()) { - section_iterator RelocatedSection = SI->getRelocatedSection(); + Expected RelSecOrErr = SI->getRelocatedSection(); + if (!RelSecOrErr) + return make_error( + toString(RelSecOrErr.takeError())); + + section_iterator RelocatedSection = *RelSecOrErr; ObjSectionToIDMap::iterator i = SectionMap.find(*RelocatedSection); assert (i != SectionMap.end()); SectionToGOTMap[i->second] = GOTSectionID; @@ -1879,8 +1895,14 @@ Error RuntimeDyldELF::finalizeLoad(const ObjectFile &Obj, ObjSectionToIDMap::iterator i, e; for (i = SectionMap.begin(), e = SectionMap.end(); i != e; ++i) { const SectionRef &Section = i->first; + StringRef Name; - Section.getName(Name); + Expected NameOrErr = Section.getName(); + if (NameOrErr) + Name = *NameOrErr; + else + consumeError(NameOrErr.takeError()); + if (Name == ".eh_frame") { UnregisteredEHFrameSections.push_back(i->second); break; diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h index 68b3468fbc9d..cec7b92b8c48 100644 --- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h +++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h @@ -549,7 +549,7 @@ public: void resolveLocalRelocations(); static void finalizeAsync(std::unique_ptr This, - std::function OnEmitted, + unique_function OnEmitted, std::unique_ptr UnderlyingBuffer); void reassignSectionAddress(unsigned SectionID, uint64_t Addr); diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.cpp b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.cpp index 202c3ca1c507..9ca76602ea18 100644 --- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.cpp +++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.cpp @@ -233,7 +233,10 @@ RuntimeDyldMachOCRTPBase::finalizeLoad(const ObjectFile &Obj, for (const auto &Section : Obj.sections()) { StringRef Name; - Section.getName(Name); + if (Expected NameOrErr = Section.getName()) + Name = *NameOrErr; + else + consumeError(NameOrErr.takeError()); // Force emission of the __text, __eh_frame, and __gcc_except_tab sections // if they're present. 
Otherwise call down to the impl to handle other @@ -351,20 +354,22 @@ RuntimeDyldMachO::create(Triple::ArchType Arch, llvm_unreachable("Unsupported target for RuntimeDyldMachO."); break; case Triple::arm: - return make_unique(MemMgr, Resolver); + return std::make_unique(MemMgr, Resolver); case Triple::aarch64: - return make_unique(MemMgr, Resolver); + return std::make_unique(MemMgr, Resolver); + case Triple::aarch64_32: + return std::make_unique(MemMgr, Resolver); case Triple::x86: - return make_unique(MemMgr, Resolver); + return std::make_unique(MemMgr, Resolver); case Triple::x86_64: - return make_unique(MemMgr, Resolver); + return std::make_unique(MemMgr, Resolver); } } std::unique_ptr RuntimeDyldMachO::loadObject(const object::ObjectFile &O) { if (auto ObjSectionToIDOrErr = loadObjectImpl(O)) - return llvm::make_unique(*this, + return std::make_unique(*this, *ObjSectionToIDOrErr); else { HasError = true; diff --git a/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFX86_64.h b/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFX86_64.h index d2d74534cf90..dc4af08583de 100644 --- a/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFX86_64.h +++ b/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFX86_64.h @@ -284,14 +284,14 @@ public: // Look for and record the EH frame section IDs. for (const auto &SectionPair : SectionMap) { const object::SectionRef &Section = SectionPair.first; - StringRef Name; - if (auto EC = Section.getName(Name)) - return errorCodeToError(EC); + Expected NameOrErr = Section.getName(); + if (!NameOrErr) + return NameOrErr.takeError(); // Note unwind info is stored in .pdata but often points to .xdata // with an IMAGE_REL_AMD64_ADDR32NB relocation. Using a memory manager // that keeps sections ordered in relation to __ImageBase is necessary. 
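The recurring make_unique edits above are part of the move to C++14: the llvm::make_unique polyfill is dropped in favor of std::make_unique. A tiny stand-alone sketch (Widget is a placeholder type, not from the patch):

#include <memory>

struct Widget {
  Widget(int A, int B) : Sum(A + B) {}
  int Sum;
};

int main() {
  // Same call shape as before, only the namespace changes:
  //   llvm::make_unique<Widget>(1, 2)  ->  std::make_unique<Widget>(1, 2)
  std::unique_ptr<Widget> W = std::make_unique<Widget>(1, 2);
  return W->Sum == 3 ? 0 : 1;
}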
- if (Name == ".pdata") + if ((*NameOrErr) == ".pdata") UnregisteredEHFrameSections.push_back(SectionPair.second); } return Error::success(); diff --git a/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOARM.h b/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOARM.h index 3bec8b979f7d..a76958a9e2c2 100644 --- a/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOARM.h +++ b/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOARM.h @@ -289,7 +289,10 @@ public: Error finalizeSection(const ObjectFile &Obj, unsigned SectionID, const SectionRef &Section) { StringRef Name; - Section.getName(Name); + if (Expected NameOrErr = Section.getName()) + Name = *NameOrErr; + else + consumeError(NameOrErr.takeError()); if (Name == "__nl_symbol_ptr") return populateIndirectSymbolPointersSection(cast(Obj), diff --git a/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOI386.h b/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOI386.h index f0de27ba14bb..523deb29b723 100644 --- a/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOI386.h +++ b/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOI386.h @@ -128,7 +128,10 @@ public: Error finalizeSection(const ObjectFile &Obj, unsigned SectionID, const SectionRef &Section) { StringRef Name; - Section.getName(Name); + if (Expected NameOrErr = Section.getName()) + Name = *NameOrErr; + else + consumeError(NameOrErr.takeError()); if (Name == "__jump_table") return populateJumpTable(cast(Obj), Section, SectionID); diff --git a/lib/FuzzMutate/FuzzerCLI.cpp b/lib/FuzzMutate/FuzzerCLI.cpp index 63d31c035390..f2368ea7f26b 100644 --- a/lib/FuzzMutate/FuzzerCLI.cpp +++ b/lib/FuzzMutate/FuzzerCLI.cpp @@ -171,7 +171,7 @@ std::unique_ptr llvm::parseModule( if (Size <= 1) // We get bogus data given an empty corpus - just create a new module. 
- return llvm::make_unique("M", Context); + return std::make_unique("M", Context); auto Buffer = MemoryBuffer::getMemBuffer( StringRef(reinterpret_cast(Data), Size), "Fuzzer input", diff --git a/lib/IR/AsmWriter.cpp b/lib/IR/AsmWriter.cpp index eb5760daecb3..b0c26e0ecaf5 100644 --- a/lib/IR/AsmWriter.cpp +++ b/lib/IR/AsmWriter.cpp @@ -352,6 +352,7 @@ static void PrintCallingConv(unsigned cc, raw_ostream &Out) { case CallingConv::PreserveAll: Out << "preserve_allcc"; break; case CallingConv::CXX_FAST_TLS: Out << "cxx_fast_tlscc"; break; case CallingConv::GHC: Out << "ghccc"; break; + case CallingConv::Tail: Out << "tailcc"; break; case CallingConv::X86_StdCall: Out << "x86_stdcallcc"; break; case CallingConv::X86_FastCall: Out << "x86_fastcallcc"; break; case CallingConv::X86_ThisCall: Out << "x86_thiscallcc"; break; @@ -835,7 +836,7 @@ SlotTracker *ModuleSlotTracker::getMachine() { ShouldCreateStorage = false; MachineStorage = - llvm::make_unique(M, ShouldInitializeAllMetadata); + std::make_unique(M, ShouldInitializeAllMetadata); Machine = MachineStorage.get(); return Machine; } @@ -2312,7 +2313,7 @@ static void WriteAsOperandInternal(raw_ostream &Out, const Metadata *MD, if (const MDNode *N = dyn_cast(MD)) { std::unique_ptr MachineStorage; if (!Machine) { - MachineStorage = make_unique(Context); + MachineStorage = std::make_unique(Context); Machine = MachineStorage.get(); } int Slot = Machine->getMetadataSlot(N); @@ -2950,7 +2951,7 @@ void AssemblyWriter::printFunctionSummary(const FunctionSummary *FS) { FunctionSummary::FFlags FFlags = FS->fflags(); if (FFlags.ReadNone | FFlags.ReadOnly | FFlags.NoRecurse | - FFlags.ReturnDoesNotAlias) { + FFlags.ReturnDoesNotAlias | FFlags.NoInline) { Out << ", funcFlags: ("; Out << "readNone: " << FFlags.ReadNone; Out << ", readOnly: " << FFlags.ReadOnly; @@ -3553,6 +3554,10 @@ void AssemblyWriter::printArgument(const Argument *Arg, AttributeSet Attrs) { if (Arg->hasName()) { Out << ' '; PrintLLVMName(Out, Arg); + } else { + int Slot = Machine.getLocalSlot(Arg); + assert(Slot != -1 && "expect argument in function here"); + Out << " %" << Slot; } } diff --git a/lib/IR/AttributeImpl.h b/lib/IR/AttributeImpl.h index f989fa3b910e..15e488bbb13b 100644 --- a/lib/IR/AttributeImpl.h +++ b/lib/IR/AttributeImpl.h @@ -159,7 +159,7 @@ public: }; class TypeAttributeImpl : public EnumAttributeImpl { - virtual void anchor(); + void anchor() override; Type *Ty; @@ -208,8 +208,8 @@ public: Attribute getAttribute(Attribute::AttrKind Kind) const; Attribute getAttribute(StringRef Kind) const; - unsigned getAlignment() const; - unsigned getStackAlignment() const; + MaybeAlign getAlignment() const; + MaybeAlign getStackAlignment() const; uint64_t getDereferenceableBytes() const; uint64_t getDereferenceableOrNullBytes() const; std::pair> getAllocSizeArgs() const; diff --git a/lib/IR/Attributes.cpp b/lib/IR/Attributes.cpp index bb90bcd7dd74..cc370e628e9a 100644 --- a/lib/IR/Attributes.cpp +++ b/lib/IR/Attributes.cpp @@ -142,17 +142,14 @@ Attribute Attribute::get(LLVMContext &Context, Attribute::AttrKind Kind, return Attribute(PA); } -Attribute Attribute::getWithAlignment(LLVMContext &Context, uint64_t Align) { - assert(isPowerOf2_32(Align) && "Alignment must be a power of two."); - assert(Align <= 0x40000000 && "Alignment too large."); - return get(Context, Alignment, Align); +Attribute Attribute::getWithAlignment(LLVMContext &Context, Align A) { + assert(A <= 0x40000000 && "Alignment too large."); + return get(Context, Alignment, A.value()); } -Attribute 
Attribute::getWithStackAlignment(LLVMContext &Context, - uint64_t Align) { - assert(isPowerOf2_32(Align) && "Alignment must be a power of two."); - assert(Align <= 0x100 && "Alignment too large."); - return get(Context, StackAlignment, Align); +Attribute Attribute::getWithStackAlignment(LLVMContext &Context, Align A) { + assert(A <= 0x100 && "Alignment too large."); + return get(Context, StackAlignment, A.value()); } Attribute Attribute::getWithDereferenceableBytes(LLVMContext &Context, @@ -244,16 +241,16 @@ bool Attribute::hasAttribute(StringRef Kind) const { return pImpl && pImpl->hasAttribute(Kind); } -unsigned Attribute::getAlignment() const { +MaybeAlign Attribute::getAlignment() const { assert(hasAttribute(Attribute::Alignment) && "Trying to get alignment from non-alignment attribute!"); - return pImpl->getValueAsInt(); + return MaybeAlign(pImpl->getValueAsInt()); } -unsigned Attribute::getStackAlignment() const { +MaybeAlign Attribute::getStackAlignment() const { assert(hasAttribute(Attribute::StackAlignment) && "Trying to get alignment from non-alignment attribute!"); - return pImpl->getValueAsInt(); + return MaybeAlign(pImpl->getValueAsInt()); } uint64_t Attribute::getDereferenceableBytes() const { @@ -670,12 +667,12 @@ Attribute AttributeSet::getAttribute(StringRef Kind) const { return SetNode ? SetNode->getAttribute(Kind) : Attribute(); } -unsigned AttributeSet::getAlignment() const { - return SetNode ? SetNode->getAlignment() : 0; +MaybeAlign AttributeSet::getAlignment() const { + return SetNode ? SetNode->getAlignment() : None; } -unsigned AttributeSet::getStackAlignment() const { - return SetNode ? SetNode->getStackAlignment() : 0; +MaybeAlign AttributeSet::getStackAlignment() const { + return SetNode ? SetNode->getStackAlignment() : None; } uint64_t AttributeSet::getDereferenceableBytes() const { @@ -782,10 +779,12 @@ AttributeSetNode *AttributeSetNode::get(LLVMContext &C, const AttrBuilder &B) { Attr = Attribute::getWithByValType(C, B.getByValType()); break; case Attribute::Alignment: - Attr = Attribute::getWithAlignment(C, B.getAlignment()); + assert(B.getAlignment() && "Alignment must be set"); + Attr = Attribute::getWithAlignment(C, *B.getAlignment()); break; case Attribute::StackAlignment: - Attr = Attribute::getWithStackAlignment(C, B.getStackAlignment()); + assert(B.getStackAlignment() && "StackAlignment must be set"); + Attr = Attribute::getWithStackAlignment(C, *B.getStackAlignment()); break; case Attribute::Dereferenceable: Attr = Attribute::getWithDereferenceableBytes( @@ -836,18 +835,18 @@ Attribute AttributeSetNode::getAttribute(StringRef Kind) const { return {}; } -unsigned AttributeSetNode::getAlignment() const { +MaybeAlign AttributeSetNode::getAlignment() const { for (const auto I : *this) if (I.hasAttribute(Attribute::Alignment)) return I.getAlignment(); - return 0; + return None; } -unsigned AttributeSetNode::getStackAlignment() const { +MaybeAlign AttributeSetNode::getStackAlignment() const { for (const auto I : *this) if (I.hasAttribute(Attribute::StackAlignment)) return I.getStackAlignment(); - return 0; + return None; } Type *AttributeSetNode::getByValType() const { @@ -1164,8 +1163,8 @@ AttributeList AttributeList::addAttributes(LLVMContext &C, unsigned Index, #ifndef NDEBUG // FIXME it is not obvious how this should work for alignment. For now, say // we can't change a known alignment. 
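The Attributes.cpp hunks above switch alignment attributes over to llvm::Align for construction and llvm::MaybeAlign for queries. A short usage sketch under the new signatures (illustrative, not part of the patch):

#include "llvm/IR/Attributes.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/Support/Alignment.h"
using namespace llvm;

int main() {
  LLVMContext Ctx;
  // Construction now takes a validated Align, so the old power-of-two assert
  // on a raw integer disappears from the callee.
  Attribute A = Attribute::getWithAlignment(Ctx, Align(16));
  // Queries hand back MaybeAlign; an absent attribute would be llvm::None.
  MaybeAlign MA = A.getAlignment();
  return (MA && MA->value() == 16) ? 0 : 1;
}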
- unsigned OldAlign = getAttributes(Index).getAlignment(); - unsigned NewAlign = B.getAlignment(); + const MaybeAlign OldAlign = getAttributes(Index).getAlignment(); + const MaybeAlign NewAlign = B.getAlignment(); assert((!OldAlign || !NewAlign || OldAlign == NewAlign) && "Attempt to change alignment!"); #endif @@ -1349,11 +1348,11 @@ Attribute AttributeList::getAttribute(unsigned Index, StringRef Kind) const { return getAttributes(Index).getAttribute(Kind); } -unsigned AttributeList::getRetAlignment() const { +MaybeAlign AttributeList::getRetAlignment() const { return getAttributes(ReturnIndex).getAlignment(); } -unsigned AttributeList::getParamAlignment(unsigned ArgNo) const { +MaybeAlign AttributeList::getParamAlignment(unsigned ArgNo) const { return getAttributes(ArgNo + FirstArgIndex).getAlignment(); } @@ -1361,8 +1360,7 @@ Type *AttributeList::getParamByValType(unsigned Index) const { return getAttributes(Index+FirstArgIndex).getByValType(); } - -unsigned AttributeList::getStackAlignment(unsigned Index) const { +MaybeAlign AttributeList::getStackAlignment(unsigned Index) const { return getAttributes(Index).getStackAlignment(); } @@ -1438,7 +1436,9 @@ AttrBuilder::AttrBuilder(AttributeSet AS) { void AttrBuilder::clear() { Attrs.reset(); TargetDepAttrs.clear(); - Alignment = StackAlignment = DerefBytes = DerefOrNullBytes = 0; + Alignment.reset(); + StackAlignment.reset(); + DerefBytes = DerefOrNullBytes = 0; AllocSizeArgs = 0; ByValType = nullptr; } @@ -1486,9 +1486,9 @@ AttrBuilder &AttrBuilder::removeAttribute(Attribute::AttrKind Val) { Attrs[Val] = false; if (Val == Attribute::Alignment) - Alignment = 0; + Alignment.reset(); else if (Val == Attribute::StackAlignment) - StackAlignment = 0; + StackAlignment.reset(); else if (Val == Attribute::ByVal) ByValType = nullptr; else if (Val == Attribute::Dereferenceable) @@ -1517,23 +1517,23 @@ std::pair> AttrBuilder::getAllocSizeArgs() const { return unpackAllocSizeArgs(AllocSizeArgs); } -AttrBuilder &AttrBuilder::addAlignmentAttr(unsigned Align) { - if (Align == 0) return *this; +AttrBuilder &AttrBuilder::addAlignmentAttr(MaybeAlign Align) { + if (!Align) + return *this; - assert(isPowerOf2_32(Align) && "Alignment must be a power of two."); - assert(Align <= 0x40000000 && "Alignment too large."); + assert(*Align <= 0x40000000 && "Alignment too large."); Attrs[Attribute::Alignment] = true; Alignment = Align; return *this; } -AttrBuilder &AttrBuilder::addStackAlignmentAttr(unsigned Align) { +AttrBuilder &AttrBuilder::addStackAlignmentAttr(MaybeAlign Align) { // Default alignment, allow the target to define how to align it. - if (Align == 0) return *this; + if (!Align) + return *this; - assert(isPowerOf2_32(Align) && "Alignment must be a power of two."); - assert(Align <= 0x100 && "Alignment too large."); + assert(*Align <= 0x100 && "Alignment too large."); Attrs[Attribute::StackAlignment] = true; StackAlignment = Align; @@ -1610,10 +1610,10 @@ AttrBuilder &AttrBuilder::merge(const AttrBuilder &B) { AttrBuilder &AttrBuilder::remove(const AttrBuilder &B) { // FIXME: What if both have alignments, but they don't match?! 
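Likewise, AttrBuilder now carries MaybeAlign internally, and addAlignmentAttr treats an unset value as a no-op. A minimal sketch of that behaviour (illustrative only):

#include "llvm/IR/Attributes.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/Support/Alignment.h"
using namespace llvm;

int main() {
  LLVMContext Ctx;
  AttrBuilder B;
  B.addAlignmentAttr(MaybeAlign());    // unset alignment: silently ignored
  B.addAlignmentAttr(MaybeAlign(32));  // records align 32
  AttributeSet AS = AttributeSet::get(Ctx, B);
  return (AS.getAlignment() && *AS.getAlignment() == Align(32)) ? 0 : 1;
}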
if (B.Alignment) - Alignment = 0; + Alignment.reset(); if (B.StackAlignment) - StackAlignment = 0; + StackAlignment.reset(); if (B.DerefBytes) DerefBytes = 0; diff --git a/lib/IR/AutoUpgrade.cpp b/lib/IR/AutoUpgrade.cpp index a2d820352825..79f580d0e14d 100644 --- a/lib/IR/AutoUpgrade.cpp +++ b/lib/IR/AutoUpgrade.cpp @@ -490,12 +490,6 @@ static bool UpgradeX86IntrinsicFunction(Function *F, StringRef Name, static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) { assert(F && "Illegal to upgrade a non-existent Function."); - // Upgrade intrinsics "clang.arc.use" which doesn't start with "llvm.". - if (F->getName() == "clang.arc.use") { - NewFn = nullptr; - return true; - } - // Quickly eliminate it, if it's not a candidate. StringRef Name = F->getName(); if (Name.size() <= 8 || !Name.startswith("llvm.")) @@ -528,7 +522,7 @@ static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) { F->arg_begin()->getType()); return true; } - Regex vldRegex("^arm\\.neon\\.vld([1234]|[234]lane)\\.v[a-z0-9]*$"); + static const Regex vldRegex("^arm\\.neon\\.vld([1234]|[234]lane)\\.v[a-z0-9]*$"); if (vldRegex.match(Name)) { auto fArgs = F->getFunctionType()->params(); SmallVector Tys(fArgs.begin(), fArgs.end()); @@ -539,7 +533,7 @@ static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) { "llvm." + Name + ".p0i8", F->getParent()); return true; } - Regex vstRegex("^arm\\.neon\\.vst([1234]|[234]lane)\\.v[a-z0-9]*$"); + static const Regex vstRegex("^arm\\.neon\\.vst([1234]|[234]lane)\\.v[a-z0-9]*$"); if (vstRegex.match(Name)) { static const Intrinsic::ID StoreInts[] = {Intrinsic::arm_neon_vst1, Intrinsic::arm_neon_vst2, @@ -604,7 +598,7 @@ static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) { } case 'e': { SmallVector Groups; - Regex R("^experimental.vector.reduce.([a-z]+)\\.[fi][0-9]+"); + static const Regex R("^experimental.vector.reduce.([a-z]+)\\.[fi][0-9]+"); if (R.match(Name, &Groups)) { Intrinsic::ID ID = Intrinsic::not_intrinsic; if (Groups[1] == "fadd") @@ -789,6 +783,19 @@ static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) { } break; + case 'p': + if (Name == "prefetch") { + // Handle address space overloading. + Type *Tys[] = {F->arg_begin()->getType()}; + if (F->getName() != Intrinsic::getName(Intrinsic::prefetch, Tys)) { + rename(F); + NewFn = + Intrinsic::getDeclaration(F->getParent(), Intrinsic::prefetch, Tys); + return true; + } + } + break; + case 's': if (Name == "stackprotectorcheck") { NewFn = nullptr; @@ -1648,14 +1655,6 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) { // Get the Function's name. StringRef Name = F->getName(); - // clang.arc.use is an old name for llvm.arc.clang.arc.use. It is dropped - // from upgrader because the optimizer now only recognizes intrinsics for - // ARC runtime calls. - if (Name == "clang.arc.use") { - CI->eraseFromParent(); - return; - } - assert(Name.startswith("llvm.") && "Intrinsic doesn't start with 'llvm.'"); Name = Name.substr(5); @@ -3831,7 +3830,9 @@ bool llvm::UpgradeDebugInfo(Module &M) { return Modified; } -bool llvm::UpgradeRetainReleaseMarker(Module &M) { +/// This checks for objc retain release marker which should be upgraded. It +/// returns true if module is modified. 
+static bool UpgradeRetainReleaseMarker(Module &M) { bool Changed = false; const char *MarkerKey = "clang.arc.retainAutoreleasedReturnValueMarker"; NamedMDNode *ModRetainReleaseMarker = M.getNamedMetadata(MarkerKey); @@ -3855,6 +3856,106 @@ bool llvm::UpgradeRetainReleaseMarker(Module &M) { return Changed; } +void llvm::UpgradeARCRuntime(Module &M) { + // This lambda converts normal function calls to ARC runtime functions to + // intrinsic calls. + auto UpgradeToIntrinsic = [&](const char *OldFunc, + llvm::Intrinsic::ID IntrinsicFunc) { + Function *Fn = M.getFunction(OldFunc); + + if (!Fn) + return; + + Function *NewFn = llvm::Intrinsic::getDeclaration(&M, IntrinsicFunc); + + for (auto I = Fn->user_begin(), E = Fn->user_end(); I != E;) { + CallInst *CI = dyn_cast(*I++); + if (!CI || CI->getCalledFunction() != Fn) + continue; + + IRBuilder<> Builder(CI->getParent(), CI->getIterator()); + FunctionType *NewFuncTy = NewFn->getFunctionType(); + SmallVector Args; + + for (unsigned I = 0, E = CI->getNumArgOperands(); I != E; ++I) { + Value *Arg = CI->getArgOperand(I); + // Bitcast argument to the parameter type of the new function if it's + // not a variadic argument. + if (I < NewFuncTy->getNumParams()) + Arg = Builder.CreateBitCast(Arg, NewFuncTy->getParamType(I)); + Args.push_back(Arg); + } + + // Create a call instruction that calls the new function. + CallInst *NewCall = Builder.CreateCall(NewFuncTy, NewFn, Args); + NewCall->setTailCallKind(cast(CI)->getTailCallKind()); + NewCall->setName(CI->getName()); + + // Bitcast the return value back to the type of the old call. + Value *NewRetVal = Builder.CreateBitCast(NewCall, CI->getType()); + + if (!CI->use_empty()) + CI->replaceAllUsesWith(NewRetVal); + CI->eraseFromParent(); + } + + if (Fn->use_empty()) + Fn->eraseFromParent(); + }; + + // Unconditionally convert a call to "clang.arc.use" to a call to + // "llvm.objc.clang.arc.use". + UpgradeToIntrinsic("clang.arc.use", llvm::Intrinsic::objc_clang_arc_use); + + // Upgrade the retain release marker. If there is no need to upgrade + // the marker, that means either the module is already new enough to contain + // new intrinsics or it is not ARC. There is no need to upgrade runtime call. 
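The UpgradeToIntrinsic lambda above follows a reusable shape: walk the old declaration's users, bitcast arguments to the new callee's parameter types, emit the replacement call, and replace all uses of the old one. A self-contained sketch of that shape (rewriteCallsTo is a made-up name; this is not the patch's code verbatim):

#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

static void rewriteCallsTo(Function *OldFn, Function *NewFn) {
  for (auto UI = OldFn->user_begin(), UE = OldFn->user_end(); UI != UE;) {
    auto *CI = dyn_cast<CallInst>(*UI++); // advance before mutating the use list
    if (!CI || CI->getCalledFunction() != OldFn)
      continue;

    IRBuilder<> Builder(CI);
    FunctionType *NewTy = NewFn->getFunctionType();
    SmallVector<Value *, 4> Args;
    for (unsigned I = 0, E = CI->getNumArgOperands(); I != E; ++I) {
      Value *Arg = CI->getArgOperand(I);
      if (I < NewTy->getNumParams())      // leave trailing varargs untouched
        Arg = Builder.CreateBitCast(Arg, NewTy->getParamType(I));
      Args.push_back(Arg);
    }
    CallInst *NewCall = Builder.CreateCall(NewTy, NewFn, Args);
    NewCall->setTailCallKind(CI->getTailCallKind());
    // Cast the result back so existing users keep their expected type.
    Value *Ret = Builder.CreateBitCast(NewCall, CI->getType());
    if (!CI->use_empty())
      CI->replaceAllUsesWith(Ret);
    CI->eraseFromParent();
  }
}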
+ if (!UpgradeRetainReleaseMarker(M)) + return; + + std::pair RuntimeFuncs[] = { + {"objc_autorelease", llvm::Intrinsic::objc_autorelease}, + {"objc_autoreleasePoolPop", llvm::Intrinsic::objc_autoreleasePoolPop}, + {"objc_autoreleasePoolPush", llvm::Intrinsic::objc_autoreleasePoolPush}, + {"objc_autoreleaseReturnValue", + llvm::Intrinsic::objc_autoreleaseReturnValue}, + {"objc_copyWeak", llvm::Intrinsic::objc_copyWeak}, + {"objc_destroyWeak", llvm::Intrinsic::objc_destroyWeak}, + {"objc_initWeak", llvm::Intrinsic::objc_initWeak}, + {"objc_loadWeak", llvm::Intrinsic::objc_loadWeak}, + {"objc_loadWeakRetained", llvm::Intrinsic::objc_loadWeakRetained}, + {"objc_moveWeak", llvm::Intrinsic::objc_moveWeak}, + {"objc_release", llvm::Intrinsic::objc_release}, + {"objc_retain", llvm::Intrinsic::objc_retain}, + {"objc_retainAutorelease", llvm::Intrinsic::objc_retainAutorelease}, + {"objc_retainAutoreleaseReturnValue", + llvm::Intrinsic::objc_retainAutoreleaseReturnValue}, + {"objc_retainAutoreleasedReturnValue", + llvm::Intrinsic::objc_retainAutoreleasedReturnValue}, + {"objc_retainBlock", llvm::Intrinsic::objc_retainBlock}, + {"objc_storeStrong", llvm::Intrinsic::objc_storeStrong}, + {"objc_storeWeak", llvm::Intrinsic::objc_storeWeak}, + {"objc_unsafeClaimAutoreleasedReturnValue", + llvm::Intrinsic::objc_unsafeClaimAutoreleasedReturnValue}, + {"objc_retainedObject", llvm::Intrinsic::objc_retainedObject}, + {"objc_unretainedObject", llvm::Intrinsic::objc_unretainedObject}, + {"objc_unretainedPointer", llvm::Intrinsic::objc_unretainedPointer}, + {"objc_retain_autorelease", llvm::Intrinsic::objc_retain_autorelease}, + {"objc_sync_enter", llvm::Intrinsic::objc_sync_enter}, + {"objc_sync_exit", llvm::Intrinsic::objc_sync_exit}, + {"objc_arc_annotation_topdown_bbstart", + llvm::Intrinsic::objc_arc_annotation_topdown_bbstart}, + {"objc_arc_annotation_topdown_bbend", + llvm::Intrinsic::objc_arc_annotation_topdown_bbend}, + {"objc_arc_annotation_bottomup_bbstart", + llvm::Intrinsic::objc_arc_annotation_bottomup_bbstart}, + {"objc_arc_annotation_bottomup_bbend", + llvm::Intrinsic::objc_arc_annotation_bottomup_bbend}}; + + for (auto &I : RuntimeFuncs) + UpgradeToIntrinsic(I.first, I.second); +} + bool llvm::UpgradeModuleFlags(Module &M) { NamedMDNode *ModFlags = M.getModuleFlagsMetadata(); if (!ModFlags) @@ -4012,3 +4113,23 @@ MDNode *llvm::upgradeInstructionLoopAttachment(MDNode &N) { return MDTuple::get(T->getContext(), Ops); } + +std::string llvm::UpgradeDataLayoutString(StringRef DL, StringRef TT) { + std::string AddrSpaces = "-p270:32:32-p271:32:32-p272:64:64"; + + // If X86, and the datalayout matches the expected format, add pointer size + // address spaces to the datalayout. 
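The new UpgradeDataLayoutString entry point (its body continues just below) splices the p270/p271/p272 address-space pointer sizes into existing x86 datalayouts. A hedged usage sketch, assuming the declaration lands in llvm/IR/AutoUpgrade.h as part of this import:

#include "llvm/IR/AutoUpgrade.h"
#include <cassert>
#include <string>

int main() {
  std::string Old = "e-m:e-i64:64-f80:128-n8:16:32:64-S128";
  std::string New =
      llvm::UpgradeDataLayoutString(Old, "x86_64-unknown-linux-gnu");
  // The address-space pointer sizes are inserted after the mangling prefix:
  //   e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128
  assert(New.find("-p270:32:32-p271:32:32-p272:64:64") != std::string::npos);
  (void)New;
  return 0;
}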
+ Triple::ArchType Arch = Triple(TT).getArch(); + if ((Arch != llvm::Triple::x86 && Arch != llvm::Triple::x86_64) || + DL.contains(AddrSpaces)) + return DL; + + SmallVector Groups; + Regex R("(e-m:[a-z](-p:32:32)?)(-[if]64:.*$)"); + if (!R.match(DL, &Groups)) + return DL; + + SmallString<1024> Buf; + std::string Res = (Groups[1] + AddrSpaces + Groups[3]).toStringRef(Buf).str(); + return Res; +} diff --git a/lib/IR/BasicBlock.cpp b/lib/IR/BasicBlock.cpp index 34410712645d..bdee6990f932 100644 --- a/lib/IR/BasicBlock.cpp +++ b/lib/IR/BasicBlock.cpp @@ -107,6 +107,13 @@ BasicBlock::instructionsWithoutDebug() { return make_filter_range(*this, Fn); } +filter_iterator>::difference_type +BasicBlock::sizeWithoutDebug() const { + return std::distance(instructionsWithoutDebug().begin(), + instructionsWithoutDebug().end()); +} + void BasicBlock::removeFromParent() { getParent()->getBasicBlockList().remove(getIterator()); } diff --git a/lib/IR/ConstantFold.cpp b/lib/IR/ConstantFold.cpp index 835fbb3443b8..71fa795ec294 100644 --- a/lib/IR/ConstantFold.cpp +++ b/lib/IR/ConstantFold.cpp @@ -746,7 +746,7 @@ Constant *llvm::ConstantFoldSelectInstruction(Constant *Cond, ConstantInt::get(Ty, i)); Constant *V2Element = ConstantExpr::getExtractElement(V2, ConstantInt::get(Ty, i)); - Constant *Cond = dyn_cast(CondV->getOperand(i)); + auto *Cond = cast(CondV->getOperand(i)); if (V1Element == V2Element) { V = V1Element; } else if (isa(Cond)) { @@ -787,12 +787,9 @@ Constant *llvm::ConstantFoldSelectInstruction(Constant *Cond, Constant *llvm::ConstantFoldExtractElementInstruction(Constant *Val, Constant *Idx) { - if (isa(Val)) // ee(undef, x) -> undef - return UndefValue::get(Val->getType()->getVectorElementType()); - if (Val->isNullValue()) // ee(zero, x) -> zero - return Constant::getNullValue(Val->getType()->getVectorElementType()); - // ee({w,x,y,z}, undef) -> undef - if (isa(Idx)) + // extractelt undef, C -> undef + // extractelt C, undef -> undef + if (isa(Val) || isa(Idx)) return UndefValue::get(Val->getType()->getVectorElementType()); if (ConstantInt *CIdx = dyn_cast(Idx)) { @@ -1125,7 +1122,7 @@ Constant *llvm::ConstantFoldBinaryInstruction(unsigned Opcode, Constant *C1, isa(CE1->getOperand(0))) { GlobalValue *GV = cast(CE1->getOperand(0)); - unsigned GVAlign; + MaybeAlign GVAlign; if (Module *TheModule = GV->getParent()) { GVAlign = GV->getPointerAlignment(TheModule->getDataLayout()); @@ -1139,19 +1136,19 @@ Constant *llvm::ConstantFoldBinaryInstruction(unsigned Opcode, Constant *C1, // increased code size (see https://reviews.llvm.org/D55115) // FIXME: This code should be deleted once existing targets have // appropriate defaults - if (GVAlign == 0U && isa(GV)) - GVAlign = 4U; + if (!GVAlign && isa(GV)) + GVAlign = Align(4); } else if (isa(GV)) { // Without a datalayout we have to assume the worst case: that the // function pointer isn't aligned at all. - GVAlign = 0U; + GVAlign = llvm::None; } else { - GVAlign = GV->getAlignment(); + GVAlign = MaybeAlign(GV->getAlignment()); } - if (GVAlign > 1) { + if (GVAlign && *GVAlign > 1) { unsigned DstWidth = CI2->getType()->getBitWidth(); - unsigned SrcWidth = std::min(DstWidth, Log2_32(GVAlign)); + unsigned SrcWidth = std::min(DstWidth, Log2(*GVAlign)); APInt BitsNotSet(APInt::getLowBitsSet(DstWidth, SrcWidth)); // If checking bits we know are clear, return zero. 
diff --git a/lib/IR/ConstantRange.cpp b/lib/IR/ConstantRange.cpp index 920fdc01a14f..642bf0f39342 100644 --- a/lib/IR/ConstantRange.cpp +++ b/lib/IR/ConstantRange.cpp @@ -269,6 +269,27 @@ ConstantRange::makeGuaranteedNoWrapRegion(Instruction::BinaryOps BinOp, return makeExactMulNSWRegion(Other.getSignedMin()) .intersectWith(makeExactMulNSWRegion(Other.getSignedMax())); + + case Instruction::Shl: { + // For given range of shift amounts, if we ignore all illegal shift amounts + // (that always produce poison), what shift amount range is left? + ConstantRange ShAmt = Other.intersectWith( + ConstantRange(APInt(BitWidth, 0), APInt(BitWidth, (BitWidth - 1) + 1))); + if (ShAmt.isEmptySet()) { + // If the entire range of shift amounts is already poison-producing, + // then we can freely add more poison-producing flags ontop of that. + return getFull(BitWidth); + } + // There are some legal shift amounts, we can compute conservatively-correct + // range of no-wrap inputs. Note that by now we have clamped the ShAmtUMax + // to be at most bitwidth-1, which results in most conservative range. + APInt ShAmtUMax = ShAmt.getUnsignedMax(); + if (Unsigned) + return getNonEmpty(APInt::getNullValue(BitWidth), + APInt::getMaxValue(BitWidth).lshr(ShAmtUMax) + 1); + return getNonEmpty(APInt::getSignedMinValue(BitWidth).ashr(ShAmtUMax), + APInt::getSignedMaxValue(BitWidth).ashr(ShAmtUMax) + 1); + } } } @@ -815,14 +836,55 @@ ConstantRange::add(const ConstantRange &Other) const { return X; } -ConstantRange ConstantRange::addWithNoSignedWrap(const APInt &Other) const { - // Calculate the subset of this range such that "X + Other" is - // guaranteed not to wrap (overflow) for all X in this subset. - auto NSWRange = ConstantRange::makeExactNoWrapRegion( - BinaryOperator::Add, Other, OverflowingBinaryOperator::NoSignedWrap); - auto NSWConstrainedRange = intersectWith(NSWRange); +ConstantRange ConstantRange::addWithNoWrap(const ConstantRange &Other, + unsigned NoWrapKind, + PreferredRangeType RangeType) const { + // Calculate the range for "X + Y" which is guaranteed not to wrap(overflow). 
+ // (X is from this, and Y is from Other) + if (isEmptySet() || Other.isEmptySet()) + return getEmpty(); + if (isFullSet() && Other.isFullSet()) + return getFull(); + + using OBO = OverflowingBinaryOperator; + ConstantRange Result = add(Other); + + auto addWithNoUnsignedWrap = [this](const ConstantRange &Other) { + APInt LMin = getUnsignedMin(), LMax = getUnsignedMax(); + APInt RMin = Other.getUnsignedMin(), RMax = Other.getUnsignedMax(); + bool Overflow; + APInt NewMin = LMin.uadd_ov(RMin, Overflow); + if (Overflow) + return getEmpty(); + APInt NewMax = LMax.uadd_sat(RMax); + return getNonEmpty(std::move(NewMin), std::move(NewMax) + 1); + }; + + auto addWithNoSignedWrap = [this](const ConstantRange &Other) { + APInt LMin = getSignedMin(), LMax = getSignedMax(); + APInt RMin = Other.getSignedMin(), RMax = Other.getSignedMax(); + if (LMin.isNonNegative()) { + bool Overflow; + APInt Temp = LMin.sadd_ov(RMin, Overflow); + if (Overflow) + return getEmpty(); + } + if (LMax.isNegative()) { + bool Overflow; + APInt Temp = LMax.sadd_ov(RMax, Overflow); + if (Overflow) + return getEmpty(); + } + APInt NewMin = LMin.sadd_sat(RMin); + APInt NewMax = LMax.sadd_sat(RMax); + return getNonEmpty(std::move(NewMin), std::move(NewMax) + 1); + }; - return NSWConstrainedRange.add(ConstantRange(Other)); + if (NoWrapKind & OBO::NoSignedWrap) + Result = Result.intersectWith(addWithNoSignedWrap(Other), RangeType); + if (NoWrapKind & OBO::NoUnsignedWrap) + Result = Result.intersectWith(addWithNoUnsignedWrap(Other), RangeType); + return Result; } ConstantRange diff --git a/lib/IR/Constants.cpp b/lib/IR/Constants.cpp index ff551da29ae6..f792f01efc1a 100644 --- a/lib/IR/Constants.cpp +++ b/lib/IR/Constants.cpp @@ -22,6 +22,7 @@ #include "llvm/IR/Instructions.h" #include "llvm/IR/Module.h" #include "llvm/IR/Operator.h" +#include "llvm/IR/PatternMatch.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/ManagedStatic.h" @@ -250,6 +251,20 @@ bool Constant::isNaN() const { return true; } +bool Constant::isElementWiseEqual(Value *Y) const { + // Are they fully identical? + if (this == Y) + return true; + // They may still be identical element-wise (if they have `undef`s). + auto *Cy = dyn_cast(Y); + if (!Cy) + return false; + return PatternMatch::match(ConstantExpr::getICmp(ICmpInst::Predicate::ICMP_EQ, + const_cast(this), + Cy), + PatternMatch::m_One()); +} + bool Constant::containsUndefElement() const { if (!getType()->isVectorTy()) return false; @@ -502,22 +517,32 @@ bool Constant::needsRelocation() const { if (const BlockAddress *BA = dyn_cast(this)) return BA->getFunction()->needsRelocation(); - // While raw uses of blockaddress need to be relocated, differences between - // two of them don't when they are for labels in the same function. This is a - // common idiom when creating a table for the indirect goto extension, so we - // handle it efficiently here. 
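For the ConstantRange hunk above: addWithNoSignedWrap(APInt) is generalized into addWithNoWrap, which takes another range plus the overflowing-binary-operator no-wrap flags. A small usage sketch under the new signature (values chosen purely for illustration):

#include "llvm/ADT/APInt.h"
#include "llvm/IR/ConstantRange.h"
#include "llvm/IR/Operator.h"
using namespace llvm;

int main() {
  ConstantRange X(APInt(8, 100), APInt(8, 121)); // i8 values [100, 120]
  ConstantRange Y(APInt(8, 10), APInt(8, 21));   // i8 values [10, 20]
  ConstantRange R = X.addWithNoWrap(Y, OverflowingBinaryOperator::NoSignedWrap,
                                    ConstantRange::Smallest);
  // A plain add would reach 140 and wrap the signed i8 range; the nsw version
  // clamps the upper bound at the signed maximum instead.
  return R.getSignedMax() == APInt(8, 127) ? 0 : 1;
}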
- if (const ConstantExpr *CE = dyn_cast(this)) + if (const ConstantExpr *CE = dyn_cast(this)) { if (CE->getOpcode() == Instruction::Sub) { ConstantExpr *LHS = dyn_cast(CE->getOperand(0)); ConstantExpr *RHS = dyn_cast(CE->getOperand(1)); if (LHS && RHS && LHS->getOpcode() == Instruction::PtrToInt && - RHS->getOpcode() == Instruction::PtrToInt && - isa(LHS->getOperand(0)) && - isa(RHS->getOperand(0)) && - cast(LHS->getOperand(0))->getFunction() == - cast(RHS->getOperand(0))->getFunction()) - return false; + RHS->getOpcode() == Instruction::PtrToInt) { + Constant *LHSOp0 = LHS->getOperand(0); + Constant *RHSOp0 = RHS->getOperand(0); + + // While raw uses of blockaddress need to be relocated, differences + // between two of them don't when they are for labels in the same + // function. This is a common idiom when creating a table for the + // indirect goto extension, so we handle it efficiently here. + if (isa(LHSOp0) && isa(RHSOp0) && + cast(LHSOp0)->getFunction() == + cast(RHSOp0)->getFunction()) + return false; + + // Relative pointers do not need to be dynamically relocated. + if (auto *LHSGV = dyn_cast(LHSOp0->stripPointerCasts())) + if (auto *RHSGV = dyn_cast(RHSOp0->stripPointerCasts())) + if (LHSGV->isDSOLocal() && RHSGV->isDSOLocal()) + return false; + } } + } bool Result = false; for (unsigned i = 0, e = getNumOperands(); i != e; ++i) @@ -563,13 +588,10 @@ void Constant::removeDeadConstantUsers() const { } // If the constant was dead, then the iterator is invalidated. - if (LastNonDeadUser == E) { + if (LastNonDeadUser == E) I = user_begin(); - if (I == E) break; - } else { - I = LastNonDeadUser; - ++I; - } + else + I = std::next(LastNonDeadUser); } } diff --git a/lib/IR/ConstantsContext.h b/lib/IR/ConstantsContext.h index 7614dab9f15d..1ec9087551f8 100644 --- a/lib/IR/ConstantsContext.h +++ b/lib/IR/ConstantsContext.h @@ -480,14 +480,16 @@ struct ConstantExprKeyType { : Opcode(CE->getOpcode()), SubclassOptionalData(CE->getRawSubclassOptionalData()), SubclassData(CE->isCompare() ? CE->getPredicate() : 0), Ops(Operands), - Indexes(CE->hasIndices() ? CE->getIndices() : ArrayRef()) {} + Indexes(CE->hasIndices() ? CE->getIndices() : ArrayRef()), + ExplicitTy(nullptr) {} ConstantExprKeyType(const ConstantExpr *CE, SmallVectorImpl &Storage) : Opcode(CE->getOpcode()), SubclassOptionalData(CE->getRawSubclassOptionalData()), SubclassData(CE->isCompare() ? CE->getPredicate() : 0), - Indexes(CE->hasIndices() ? CE->getIndices() : ArrayRef()) { + Indexes(CE->hasIndices() ? CE->getIndices() : ArrayRef()), + ExplicitTy(nullptr) { assert(Storage.empty() && "Expected empty storage"); for (unsigned I = 0, E = CE->getNumOperands(); I != E; ++I) Storage.push_back(CE->getOperand(I)); @@ -676,9 +678,9 @@ public: /// Hash once, and reuse it for the lookup and the insertion if needed. LookupKeyHashed Lookup(MapInfo::getHashValue(Key), Key); - auto I = Map.find_as(Lookup); - if (I != Map.end()) - return *I; + auto ItMap = Map.find_as(Lookup); + if (ItMap != Map.end()) + return *ItMap; // Update to the new value. Optimize for the case when we have a single // operand that we're changing, but handle bulk updates efficiently. 
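The Constant::isElementWiseEqual helper added in the Constants.cpp hunk above treats constants that differ only in undef lanes as equal. A hedged sketch of the intended behaviour (illustrative; the comparison is folded through ConstantExpr::getICmp exactly as shown above):

#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/LLVMContext.h"
using namespace llvm;

int main() {
  LLVMContext Ctx;
  Type *I32 = Type::getInt32Ty(Ctx);
  Constant *One = ConstantInt::get(I32, 1);
  Constant *Undef = UndefValue::get(I32);
  Constant *A = ConstantVector::get({One, One});
  Constant *B = ConstantVector::get({One, Undef});
  // <1, 1> vs <1, undef>: identical in every non-undef lane.
  return A->isElementWiseEqual(B) ? 0 : 1;
}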
diff --git a/lib/IR/Core.cpp b/lib/IR/Core.cpp index 310935b5213a..a5f46b16e600 100644 --- a/lib/IR/Core.cpp +++ b/lib/IR/Core.cpp @@ -140,7 +140,16 @@ unsigned LLVMGetLastEnumAttributeKind(void) { LLVMAttributeRef LLVMCreateEnumAttribute(LLVMContextRef C, unsigned KindID, uint64_t Val) { - return wrap(Attribute::get(*unwrap(C), (Attribute::AttrKind)KindID, Val)); + auto &Ctx = *unwrap(C); + auto AttrKind = (Attribute::AttrKind)KindID; + + if (AttrKind == Attribute::AttrKind::ByVal) { + // After r362128, byval attributes need to have a type attribute. Provide a + // NULL one until a proper API is added for this. + return wrap(Attribute::getWithByValType(Ctx, NULL)); + } else { + return wrap(Attribute::get(Ctx, AttrKind, Val)); + } } unsigned LLVMGetEnumAttributeKind(LLVMAttributeRef A) { @@ -386,7 +395,7 @@ void LLVMDumpModule(LLVMModuleRef M) { LLVMBool LLVMPrintModuleToFile(LLVMModuleRef M, const char *Filename, char **ErrorMessage) { std::error_code EC; - raw_fd_ostream dest(Filename, EC, sys::fs::F_Text); + raw_fd_ostream dest(Filename, EC, sys::fs::OF_Text); if (EC) { *ErrorMessage = strdup(EC.message().c_str()); return true; @@ -1999,13 +2008,13 @@ unsigned LLVMGetAlignment(LLVMValueRef V) { void LLVMSetAlignment(LLVMValueRef V, unsigned Bytes) { Value *P = unwrap(V); if (GlobalObject *GV = dyn_cast(P)) - GV->setAlignment(Bytes); + GV->setAlignment(MaybeAlign(Bytes)); else if (AllocaInst *AI = dyn_cast(P)) - AI->setAlignment(Bytes); + AI->setAlignment(MaybeAlign(Bytes)); else if (LoadInst *LI = dyn_cast(P)) - LI->setAlignment(Bytes); + LI->setAlignment(MaybeAlign(Bytes)); else if (StoreInst *SI = dyn_cast(P)) - SI->setAlignment(Bytes); + SI->setAlignment(MaybeAlign(Bytes)); else llvm_unreachable( "only GlobalValue, AllocaInst, LoadInst and StoreInst have alignment"); @@ -2480,7 +2489,7 @@ LLVMValueRef LLVMGetPreviousParam(LLVMValueRef Arg) { void LLVMSetParamAlignment(LLVMValueRef Arg, unsigned align) { Argument *A = unwrap(Arg); - A->addAttr(Attribute::getWithAlignment(A->getContext(), align)); + A->addAttr(Attribute::getWithAlignment(A->getContext(), Align(align))); } /*--.. 
Operations on ifuncs ................................................--*/ @@ -2779,7 +2788,8 @@ void LLVMSetInstructionCallConv(LLVMValueRef Instr, unsigned CC) { void LLVMSetInstrParamAlignment(LLVMValueRef Instr, unsigned index, unsigned align) { auto *Call = unwrap(Instr); - Attribute AlignAttr = Attribute::getWithAlignment(Call->getContext(), align); + Attribute AlignAttr = + Attribute::getWithAlignment(Call->getContext(), Align(align)); Call->addAttribute(index, AlignAttr); } @@ -3518,6 +3528,47 @@ static LLVMAtomicOrdering mapToLLVMOrdering(AtomicOrdering Ordering) { llvm_unreachable("Invalid AtomicOrdering value!"); } +static AtomicRMWInst::BinOp mapFromLLVMRMWBinOp(LLVMAtomicRMWBinOp BinOp) { + switch (BinOp) { + case LLVMAtomicRMWBinOpXchg: return AtomicRMWInst::Xchg; + case LLVMAtomicRMWBinOpAdd: return AtomicRMWInst::Add; + case LLVMAtomicRMWBinOpSub: return AtomicRMWInst::Sub; + case LLVMAtomicRMWBinOpAnd: return AtomicRMWInst::And; + case LLVMAtomicRMWBinOpNand: return AtomicRMWInst::Nand; + case LLVMAtomicRMWBinOpOr: return AtomicRMWInst::Or; + case LLVMAtomicRMWBinOpXor: return AtomicRMWInst::Xor; + case LLVMAtomicRMWBinOpMax: return AtomicRMWInst::Max; + case LLVMAtomicRMWBinOpMin: return AtomicRMWInst::Min; + case LLVMAtomicRMWBinOpUMax: return AtomicRMWInst::UMax; + case LLVMAtomicRMWBinOpUMin: return AtomicRMWInst::UMin; + case LLVMAtomicRMWBinOpFAdd: return AtomicRMWInst::FAdd; + case LLVMAtomicRMWBinOpFSub: return AtomicRMWInst::FSub; + } + + llvm_unreachable("Invalid LLVMAtomicRMWBinOp value!"); +} + +static LLVMAtomicRMWBinOp mapToLLVMRMWBinOp(AtomicRMWInst::BinOp BinOp) { + switch (BinOp) { + case AtomicRMWInst::Xchg: return LLVMAtomicRMWBinOpXchg; + case AtomicRMWInst::Add: return LLVMAtomicRMWBinOpAdd; + case AtomicRMWInst::Sub: return LLVMAtomicRMWBinOpSub; + case AtomicRMWInst::And: return LLVMAtomicRMWBinOpAnd; + case AtomicRMWInst::Nand: return LLVMAtomicRMWBinOpNand; + case AtomicRMWInst::Or: return LLVMAtomicRMWBinOpOr; + case AtomicRMWInst::Xor: return LLVMAtomicRMWBinOpXor; + case AtomicRMWInst::Max: return LLVMAtomicRMWBinOpMax; + case AtomicRMWInst::Min: return LLVMAtomicRMWBinOpMin; + case AtomicRMWInst::UMax: return LLVMAtomicRMWBinOpUMax; + case AtomicRMWInst::UMin: return LLVMAtomicRMWBinOpUMin; + case AtomicRMWInst::FAdd: return LLVMAtomicRMWBinOpFAdd; + case AtomicRMWInst::FSub: return LLVMAtomicRMWBinOpFSub; + default: break; + } + + llvm_unreachable("Invalid AtomicRMWBinOp value!"); +} + // TODO: Should this and other atomic instructions support building with // "syncscope"? 
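The two bin-op mapping helpers above back a set of new C-API accessors added further down in this file (LLVMGetAtomicRMWBinOp/LLVMSetAtomicRMWBinOp, plus the cmpxchg weak flag). A brief usage sketch from the C API, callable from C++; the builder and operands are assumed to already exist:

#include "llvm-c/Core.h"

static void demo(LLVMBuilderRef B, LLVMValueRef Ptr, LLVMValueRef Val) {
  LLVMValueRef RMW =
      LLVMBuildAtomicRMW(B, LLVMAtomicRMWBinOpAdd, Ptr, Val,
                         LLVMAtomicOrderingSequentiallyConsistent,
                         /*singleThread=*/0);
  // The operation of an existing atomicrmw can now be inspected and rewritten.
  if (LLVMGetAtomicRMWBinOp(RMW) == LLVMAtomicRMWBinOpAdd)
    LLVMSetAtomicRMWBinOp(RMW, LLVMAtomicRMWBinOpSub);
}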
LLVMValueRef LLVMBuildFence(LLVMBuilderRef B, LLVMAtomicOrdering Ordering, @@ -3593,14 +3644,30 @@ LLVMBool LLVMGetVolatile(LLVMValueRef MemAccessInst) { Value *P = unwrap(MemAccessInst); if (LoadInst *LI = dyn_cast(P)) return LI->isVolatile(); - return cast(P)->isVolatile(); + if (StoreInst *SI = dyn_cast(P)) + return SI->isVolatile(); + if (AtomicRMWInst *AI = dyn_cast(P)) + return AI->isVolatile(); + return cast(P)->isVolatile(); } void LLVMSetVolatile(LLVMValueRef MemAccessInst, LLVMBool isVolatile) { Value *P = unwrap(MemAccessInst); if (LoadInst *LI = dyn_cast(P)) return LI->setVolatile(isVolatile); - return cast(P)->setVolatile(isVolatile); + if (StoreInst *SI = dyn_cast(P)) + return SI->setVolatile(isVolatile); + if (AtomicRMWInst *AI = dyn_cast(P)) + return AI->setVolatile(isVolatile); + return cast(P)->setVolatile(isVolatile); +} + +LLVMBool LLVMGetWeak(LLVMValueRef CmpXchgInst) { + return unwrap(CmpXchgInst)->isWeak(); +} + +void LLVMSetWeak(LLVMValueRef CmpXchgInst, LLVMBool isWeak) { + return unwrap(CmpXchgInst)->setWeak(isWeak); } LLVMAtomicOrdering LLVMGetOrdering(LLVMValueRef MemAccessInst) { @@ -3608,8 +3675,10 @@ LLVMAtomicOrdering LLVMGetOrdering(LLVMValueRef MemAccessInst) { AtomicOrdering O; if (LoadInst *LI = dyn_cast(P)) O = LI->getOrdering(); + else if (StoreInst *SI = dyn_cast(P)) + O = SI->getOrdering(); else - O = cast(P)->getOrdering(); + O = cast(P)->getOrdering(); return mapToLLVMOrdering(O); } @@ -3622,6 +3691,14 @@ void LLVMSetOrdering(LLVMValueRef MemAccessInst, LLVMAtomicOrdering Ordering) { return cast(P)->setOrdering(O); } +LLVMAtomicRMWBinOp LLVMGetAtomicRMWBinOp(LLVMValueRef Inst) { + return mapToLLVMRMWBinOp(unwrap(Inst)->getOperation()); +} + +void LLVMSetAtomicRMWBinOp(LLVMValueRef Inst, LLVMAtomicRMWBinOp BinOp) { + unwrap(Inst)->setOperation(mapFromLLVMRMWBinOp(BinOp)); +} + /*--.. Casts ...............................................................--*/ LLVMValueRef LLVMBuildTrunc(LLVMBuilderRef B, LLVMValueRef Val, @@ -3840,20 +3917,7 @@ LLVMValueRef LLVMBuildAtomicRMW(LLVMBuilderRef B,LLVMAtomicRMWBinOp op, LLVMValueRef PTR, LLVMValueRef Val, LLVMAtomicOrdering ordering, LLVMBool singleThread) { - AtomicRMWInst::BinOp intop; - switch (op) { - case LLVMAtomicRMWBinOpXchg: intop = AtomicRMWInst::Xchg; break; - case LLVMAtomicRMWBinOpAdd: intop = AtomicRMWInst::Add; break; - case LLVMAtomicRMWBinOpSub: intop = AtomicRMWInst::Sub; break; - case LLVMAtomicRMWBinOpAnd: intop = AtomicRMWInst::And; break; - case LLVMAtomicRMWBinOpNand: intop = AtomicRMWInst::Nand; break; - case LLVMAtomicRMWBinOpOr: intop = AtomicRMWInst::Or; break; - case LLVMAtomicRMWBinOpXor: intop = AtomicRMWInst::Xor; break; - case LLVMAtomicRMWBinOpMax: intop = AtomicRMWInst::Max; break; - case LLVMAtomicRMWBinOpMin: intop = AtomicRMWInst::Min; break; - case LLVMAtomicRMWBinOpUMax: intop = AtomicRMWInst::UMax; break; - case LLVMAtomicRMWBinOpUMin: intop = AtomicRMWInst::UMin; break; - } + AtomicRMWInst::BinOp intop = mapFromLLVMRMWBinOp(op); return wrap(unwrap(B)->CreateAtomicRMW(intop, unwrap(PTR), unwrap(Val), mapFromLLVMOrdering(ordering), singleThread ? 
SyncScope::SingleThread : SyncScope::System)); diff --git a/lib/IR/DIBuilder.cpp b/lib/IR/DIBuilder.cpp index 2493c6cbe532..5d5671227430 100644 --- a/lib/IR/DIBuilder.cpp +++ b/lib/IR/DIBuilder.cpp @@ -25,7 +25,7 @@ using namespace llvm; using namespace llvm::dwarf; -cl::opt +static cl::opt UseDbgAddr("use-dbg-addr", llvm::cl::desc("Use llvm.dbg.addr for all local variables"), cl::init(false), cl::Hidden); diff --git a/lib/IR/DataLayout.cpp b/lib/IR/DataLayout.cpp index 6e0ebbd4a730..5fe7a2e94b6a 100644 --- a/lib/IR/DataLayout.cpp +++ b/lib/IR/DataLayout.cpp @@ -29,6 +29,7 @@ #include "llvm/Support/Casting.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" +#include "llvm/Support/TypeSize.h" #include #include #include @@ -44,7 +45,6 @@ using namespace llvm; StructLayout::StructLayout(StructType *ST, const DataLayout &DL) { assert(!ST->isOpaque() && "Cannot get layout of opaque structs"); - StructAlignment = 0; StructSize = 0; IsPadded = false; NumElements = ST->getNumElements(); @@ -52,10 +52,10 @@ StructLayout::StructLayout(StructType *ST, const DataLayout &DL) { // Loop over each of the elements, placing them in memory. for (unsigned i = 0, e = NumElements; i != e; ++i) { Type *Ty = ST->getElementType(i); - unsigned TyAlign = ST->isPacked() ? 1 : DL.getABITypeAlignment(Ty); + const Align TyAlign(ST->isPacked() ? 1 : DL.getABITypeAlignment(Ty)); // Add padding if necessary to align the data element properly. - if ((StructSize & (TyAlign-1)) != 0) { + if (!isAligned(TyAlign, StructSize)) { IsPadded = true; StructSize = alignTo(StructSize, TyAlign); } @@ -67,12 +67,9 @@ StructLayout::StructLayout(StructType *ST, const DataLayout &DL) { StructSize += DL.getTypeAllocSize(Ty); // Consume space for this data item } - // Empty structures have alignment of 1 byte. - if (StructAlignment == 0) StructAlignment = 1; - // Add padding to the end of the struct so that it could be put in an array // and all array elements would be aligned correctly. 
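The StructLayout hunks above replace the manual mask arithmetic with the Alignment.h helpers. A stand-alone sketch of the two calls involved (placeField is a made-up name, not from the patch):

#include "llvm/Support/Alignment.h"
#include <cstdint>

// Mirrors the padding step: test with isAligned instead of "Size & (Align-1)",
// then round up with the Align-typed alignTo overload.
static uint64_t placeField(uint64_t StructSize, llvm::Align FieldAlign,
                           uint64_t FieldSize) {
  if (!llvm::isAligned(FieldAlign, StructSize))
    StructSize = llvm::alignTo(StructSize, FieldAlign);
  return StructSize + FieldSize;
}

// For example, placeField(5, llvm::Align(4), 4) yields 12: offset 5 is first
// padded up to 8.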
- if ((StructSize & (StructAlignment-1)) != 0) { + if (!isAligned(StructAlignment, StructSize)) { IsPadded = true; StructSize = alignTo(StructSize, StructAlignment); } @@ -102,9 +99,8 @@ unsigned StructLayout::getElementContainingOffset(uint64_t Offset) const { // LayoutAlignElem, LayoutAlign support //===----------------------------------------------------------------------===// -LayoutAlignElem -LayoutAlignElem::get(AlignTypeEnum align_type, unsigned abi_align, - unsigned pref_align, uint32_t bit_width) { +LayoutAlignElem LayoutAlignElem::get(AlignTypeEnum align_type, Align abi_align, + Align pref_align, uint32_t bit_width) { assert(abi_align <= pref_align && "Preferred alignment worse than ABI!"); LayoutAlignElem retval; retval.AlignType = align_type; @@ -126,10 +122,9 @@ LayoutAlignElem::operator==(const LayoutAlignElem &rhs) const { // PointerAlignElem, PointerAlign support //===----------------------------------------------------------------------===// -PointerAlignElem -PointerAlignElem::get(uint32_t AddressSpace, unsigned ABIAlign, - unsigned PrefAlign, uint32_t TypeByteWidth, - uint32_t IndexWidth) { +PointerAlignElem PointerAlignElem::get(uint32_t AddressSpace, Align ABIAlign, + Align PrefAlign, uint32_t TypeByteWidth, + uint32_t IndexWidth) { assert(ABIAlign <= PrefAlign && "Preferred alignment worse than ABI!"); PointerAlignElem retval; retval.AddressSpace = AddressSpace; @@ -162,18 +157,18 @@ const char *DataLayout::getManglingComponent(const Triple &T) { } static const LayoutAlignElem DefaultAlignments[] = { - { INTEGER_ALIGN, 1, 1, 1 }, // i1 - { INTEGER_ALIGN, 8, 1, 1 }, // i8 - { INTEGER_ALIGN, 16, 2, 2 }, // i16 - { INTEGER_ALIGN, 32, 4, 4 }, // i32 - { INTEGER_ALIGN, 64, 4, 8 }, // i64 - { FLOAT_ALIGN, 16, 2, 2 }, // half - { FLOAT_ALIGN, 32, 4, 4 }, // float - { FLOAT_ALIGN, 64, 8, 8 }, // double - { FLOAT_ALIGN, 128, 16, 16 }, // ppcf128, quad, ... - { VECTOR_ALIGN, 64, 8, 8 }, // v2i32, v1i64, ... - { VECTOR_ALIGN, 128, 16, 16 }, // v16i8, v8i16, v4i32, ... - { AGGREGATE_ALIGN, 0, 0, 8 } // struct + {INTEGER_ALIGN, 1, Align(1), Align(1)}, // i1 + {INTEGER_ALIGN, 8, Align(1), Align(1)}, // i8 + {INTEGER_ALIGN, 16, Align(2), Align(2)}, // i16 + {INTEGER_ALIGN, 32, Align(4), Align(4)}, // i32 + {INTEGER_ALIGN, 64, Align(4), Align(8)}, // i64 + {FLOAT_ALIGN, 16, Align(2), Align(2)}, // half + {FLOAT_ALIGN, 32, Align(4), Align(4)}, // float + {FLOAT_ALIGN, 64, Align(8), Align(8)}, // double + {FLOAT_ALIGN, 128, Align(16), Align(16)}, // ppcf128, quad, ... + {VECTOR_ALIGN, 64, Align(8), Align(8)}, // v2i32, v1i64, ... + {VECTOR_ALIGN, 128, Align(16), Align(16)}, // v16i8, v8i16, v4i32, ... 
+ {AGGREGATE_ALIGN, 0, Align(1), Align(8)} // struct }; void DataLayout::reset(StringRef Desc) { @@ -182,9 +177,9 @@ void DataLayout::reset(StringRef Desc) { LayoutMap = nullptr; BigEndian = false; AllocaAddrSpace = 0; - StackNaturalAlign = 0; + StackNaturalAlign.reset(); ProgramAddrSpace = 0; - FunctionPtrAlign = 0; + FunctionPtrAlign.reset(); TheFunctionPtrAlignType = FunctionPtrAlignType::Independent; ManglingMode = MM_None; NonIntegralAddressSpaces.clear(); @@ -194,7 +189,7 @@ void DataLayout::reset(StringRef Desc) { setAlignment((AlignTypeEnum)E.AlignType, E.ABIAlign, E.PrefAlign, E.TypeBitWidth); } - setPointerAlignment(0, 8, 8, 8, 8); + setPointerAlignment(0, Align(8), Align(8), 8, 8); parseSpecifier(Desc); } @@ -320,8 +315,9 @@ void DataLayout::parseSpecifier(StringRef Desc) { report_fatal_error("Invalid index size of 0 bytes"); } } - setPointerAlignment(AddrSpace, PointerABIAlign, PointerPrefAlign, - PointerMemSize, IndexSize); + setPointerAlignment(AddrSpace, assumeAligned(PointerABIAlign), + assumeAligned(PointerPrefAlign), PointerMemSize, + IndexSize); break; } case 'i': @@ -349,11 +345,16 @@ void DataLayout::parseSpecifier(StringRef Desc) { report_fatal_error( "Missing alignment specification in datalayout string"); Split = split(Rest, ':'); - unsigned ABIAlign = inBytes(getInt(Tok)); + const unsigned ABIAlign = inBytes(getInt(Tok)); if (AlignType != AGGREGATE_ALIGN && !ABIAlign) report_fatal_error( "ABI alignment specification must be >0 for non-aggregate types"); + if (!isUInt<16>(ABIAlign)) + report_fatal_error("Invalid ABI alignment, must be a 16bit integer"); + if (ABIAlign != 0 && !isPowerOf2_64(ABIAlign)) + report_fatal_error("Invalid ABI alignment, must be a power of 2"); + // Preferred alignment. unsigned PrefAlign = ABIAlign; if (!Rest.empty()) { @@ -361,7 +362,14 @@ void DataLayout::parseSpecifier(StringRef Desc) { PrefAlign = inBytes(getInt(Tok)); } - setAlignment(AlignType, ABIAlign, PrefAlign, Size); + if (!isUInt<16>(PrefAlign)) + report_fatal_error( + "Invalid preferred alignment, must be a 16bit integer"); + if (PrefAlign != 0 && !isPowerOf2_64(PrefAlign)) + report_fatal_error("Invalid preferred alignment, must be a power of 2"); + + setAlignment(AlignType, assumeAligned(ABIAlign), assumeAligned(PrefAlign), + Size); break; } @@ -378,7 +386,10 @@ void DataLayout::parseSpecifier(StringRef Desc) { } break; case 'S': { // Stack natural alignment. - StackNaturalAlign = inBytes(getInt(Tok)); + uint64_t Alignment = inBytes(getInt(Tok)); + if (Alignment != 0 && !llvm::isPowerOf2_64(Alignment)) + report_fatal_error("Alignment is neither 0 nor a power of 2"); + StackNaturalAlign = MaybeAlign(Alignment); break; } case 'F': { @@ -394,7 +405,10 @@ void DataLayout::parseSpecifier(StringRef Desc) { "datalayout string"); } Tok = Tok.substr(1); - FunctionPtrAlign = inBytes(getInt(Tok)); + uint64_t Alignment = inBytes(getInt(Tok)); + if (Alignment != 0 && !llvm::isPowerOf2_64(Alignment)) + report_fatal_error("Alignment is neither 0 nor a power of 2"); + FunctionPtrAlign = MaybeAlign(Alignment); break; } case 'P': { // Function address space. 
@@ -468,20 +482,15 @@ DataLayout::findAlignmentLowerBound(AlignTypeEnum AlignType, }); } -void -DataLayout::setAlignment(AlignTypeEnum align_type, unsigned abi_align, - unsigned pref_align, uint32_t bit_width) { +void DataLayout::setAlignment(AlignTypeEnum align_type, Align abi_align, + Align pref_align, uint32_t bit_width) { + // AlignmentsTy::ABIAlign and AlignmentsTy::PrefAlign were once stored as + // uint16_t, it is unclear if there are requirements for alignment to be less + // than 2^16 other than storage. In the meantime we leave the restriction as + // an assert. See D67400 for context. + assert(Log2(abi_align) < 16 && Log2(pref_align) < 16 && "Alignment too big"); if (!isUInt<24>(bit_width)) report_fatal_error("Invalid bit width, must be a 24bit integer"); - if (!isUInt<16>(abi_align)) - report_fatal_error("Invalid ABI alignment, must be a 16bit integer"); - if (!isUInt<16>(pref_align)) - report_fatal_error("Invalid preferred alignment, must be a 16bit integer"); - if (abi_align != 0 && !isPowerOf2_64(abi_align)) - report_fatal_error("Invalid ABI alignment, must be a power of 2"); - if (pref_align != 0 && !isPowerOf2_64(pref_align)) - report_fatal_error("Invalid preferred alignment, must be a power of 2"); - if (pref_align < abi_align) report_fatal_error( "Preferred alignment cannot be less than the ABI alignment"); @@ -507,8 +516,8 @@ DataLayout::findPointerLowerBound(uint32_t AddressSpace) { }); } -void DataLayout::setPointerAlignment(uint32_t AddrSpace, unsigned ABIAlign, - unsigned PrefAlign, uint32_t TypeByteWidth, +void DataLayout::setPointerAlignment(uint32_t AddrSpace, Align ABIAlign, + Align PrefAlign, uint32_t TypeByteWidth, uint32_t IndexWidth) { if (PrefAlign < ABIAlign) report_fatal_error( @@ -528,9 +537,8 @@ void DataLayout::setPointerAlignment(uint32_t AddrSpace, unsigned ABIAlign, /// getAlignmentInfo - Return the alignment (either ABI if ABIInfo = true or /// preferred if ABIInfo = false) the layout wants for the specified datatype. -unsigned DataLayout::getAlignmentInfo(AlignTypeEnum AlignType, - uint32_t BitWidth, bool ABIInfo, - Type *Ty) const { +Align DataLayout::getAlignmentInfo(AlignTypeEnum AlignType, uint32_t BitWidth, + bool ABIInfo, Type *Ty) const { AlignmentsTy::const_iterator I = findAlignmentLowerBound(AlignType, BitWidth); // See if we found an exact match. Of if we are looking for an integer type, // but don't have an exact match take the next largest integer. This is where @@ -549,10 +557,11 @@ unsigned DataLayout::getAlignmentInfo(AlignTypeEnum AlignType, } else if (AlignType == VECTOR_ALIGN) { // By default, use natural alignment for vector types. This is consistent // with what clang and llvm-gcc do. - unsigned Align = getTypeAllocSize(cast(Ty)->getElementType()); - Align *= cast(Ty)->getNumElements(); - Align = PowerOf2Ceil(Align); - return Align; + unsigned Alignment = + getTypeAllocSize(cast(Ty)->getElementType()); + Alignment *= cast(Ty)->getNumElements(); + Alignment = PowerOf2Ceil(Alignment); + return Align(Alignment); } // If we still couldn't find a reasonable default alignment, fall back @@ -561,9 +570,9 @@ unsigned DataLayout::getAlignmentInfo(AlignTypeEnum AlignType, // approximation of reality, and if the user wanted something less // less conservative, they should have specified it explicitly in the data // layout. 
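Although getAlignmentInfo and friends now traffic in Align internally (above and below), getABITypeAlignment and getPrefTypeAlignment still return plain unsigned byte values, unwrapped from the Align-based internals. A minimal sketch of that caller-facing behaviour (layout string chosen for illustration):

#include "llvm/IR/DataLayout.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Type.h"
using namespace llvm;

int main() {
  LLVMContext Ctx;
  DataLayout DL("e-m:e-i64:64-f80:128-n8:16:32:64-S128");
  Type *I64 = Type::getInt64Ty(Ctx);
  // "i64:64" gives i64 an ABI alignment of 64 bits, i.e. 8 bytes.
  return DL.getABITypeAlignment(I64) == 8 ? 0 : 1;
}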
- unsigned Align = getTypeStoreSize(Ty); - Align = PowerOf2Ceil(Align); - return Align; + unsigned Alignment = getTypeStoreSize(Ty); + Alignment = PowerOf2Ceil(Alignment); + return Align(Alignment); } namespace { @@ -624,7 +633,7 @@ const StructLayout *DataLayout::getStructLayout(StructType *Ty) const { return L; } -unsigned DataLayout::getPointerABIAlignment(unsigned AS) const { +Align DataLayout::getPointerABIAlignment(unsigned AS) const { PointersTy::const_iterator I = findPointerLowerBound(AS); if (I == Pointers.end() || I->AddressSpace != AS) { I = findPointerLowerBound(0); @@ -633,7 +642,7 @@ unsigned DataLayout::getPointerABIAlignment(unsigned AS) const { return I->ABIAlign; } -unsigned DataLayout::getPointerPrefAlignment(unsigned AS) const { +Align DataLayout::getPointerPrefAlignment(unsigned AS) const { PointersTy::const_iterator I = findPointerLowerBound(AS); if (I == Pointers.end() || I->AddressSpace != AS) { I = findPointerLowerBound(0); @@ -690,21 +699,18 @@ unsigned DataLayout::getIndexTypeSizeInBits(Type *Ty) const { Get the ABI (\a abi_or_pref == true) or preferred alignment (\a abi_or_pref == false) for the requested type \a Ty. */ -unsigned DataLayout::getAlignment(Type *Ty, bool abi_or_pref) const { +Align DataLayout::getAlignment(Type *Ty, bool abi_or_pref) const { AlignTypeEnum AlignType; assert(Ty->isSized() && "Cannot getTypeInfo() on a type that is unsized!"); switch (Ty->getTypeID()) { // Early escape for the non-numeric types. case Type::LabelTyID: - return (abi_or_pref - ? getPointerABIAlignment(0) - : getPointerPrefAlignment(0)); + return abi_or_pref ? getPointerABIAlignment(0) : getPointerPrefAlignment(0); case Type::PointerTyID: { unsigned AS = cast(Ty)->getAddressSpace(); - return (abi_or_pref - ? getPointerABIAlignment(AS) - : getPointerPrefAlignment(AS)); + return abi_or_pref ? getPointerABIAlignment(AS) + : getPointerPrefAlignment(AS); } case Type::ArrayTyID: return getAlignment(cast(Ty)->getElementType(), abi_or_pref); @@ -712,11 +718,11 @@ unsigned DataLayout::getAlignment(Type *Ty, bool abi_or_pref) const { case Type::StructTyID: { // Packed structure types always have an ABI alignment of one. if (cast(Ty)->isPacked() && abi_or_pref) - return 1; + return Align::None(); // Get the layout annotation... which is lazily created on demand. const StructLayout *Layout = getStructLayout(cast(Ty)); - unsigned Align = getAlignmentInfo(AGGREGATE_ALIGN, 0, abi_or_pref, Ty); + const Align Align = getAlignmentInfo(AGGREGATE_ALIGN, 0, abi_or_pref, Ty); return std::max(Align, Layout->getAlignment()); } case Type::IntegerTyID: @@ -740,27 +746,24 @@ unsigned DataLayout::getAlignment(Type *Ty, bool abi_or_pref) const { llvm_unreachable("Bad type for getAlignment!!!"); } - return getAlignmentInfo(AlignType, getTypeSizeInBits(Ty), abi_or_pref, Ty); + // If we're dealing with a scalable vector, we just need the known minimum + // size for determining alignment. If not, we'll get the exact size. + return getAlignmentInfo(AlignType, getTypeSizeInBits(Ty).getKnownMinSize(), + abi_or_pref, Ty); } unsigned DataLayout::getABITypeAlignment(Type *Ty) const { - return getAlignment(Ty, true); + return getAlignment(Ty, true).value(); } /// getABIIntegerTypeAlignment - Return the minimum ABI-required alignment for /// an integer type of the specified bitwidth. 
-unsigned DataLayout::getABIIntegerTypeAlignment(unsigned BitWidth) const { +Align DataLayout::getABIIntegerTypeAlignment(unsigned BitWidth) const { return getAlignmentInfo(INTEGER_ALIGN, BitWidth, true, nullptr); } unsigned DataLayout::getPrefTypeAlignment(Type *Ty) const { - return getAlignment(Ty, false); -} - -unsigned DataLayout::getPreferredTypeAlignmentShift(Type *Ty) const { - unsigned Align = getPrefTypeAlignment(Ty); - assert(!(Align & (Align-1)) && "Alignment is not a power of two!"); - return Log2_32(Align); + return getAlignment(Ty, false).value(); } IntegerType *DataLayout::getIntPtrType(LLVMContext &C, diff --git a/lib/IR/DebugInfo.cpp b/lib/IR/DebugInfo.cpp index ce47ef207434..1bbe6b85d260 100644 --- a/lib/IR/DebugInfo.cpp +++ b/lib/IR/DebugInfo.cpp @@ -279,7 +279,7 @@ bool DebugInfoFinder::addScope(DIScope *Scope) { } static MDNode *stripDebugLocFromLoopID(MDNode *N) { - assert(!empty(N->operands()) && "Missing self reference?"); + assert(!N->operands().empty() && "Missing self reference?"); // if there is no debug location, we do not have to rewrite this MDNode. if (std::none_of(N->op_begin() + 1, N->op_end(), [](const MDOperand &Op) { @@ -929,6 +929,26 @@ const char *LLVMDIFileGetSource(LLVMMetadataRef File, unsigned *Len) { return ""; } +LLVMMetadataRef LLVMDIBuilderCreateMacro(LLVMDIBuilderRef Builder, + LLVMMetadataRef ParentMacroFile, + unsigned Line, + LLVMDWARFMacinfoRecordType RecordType, + const char *Name, size_t NameLen, + const char *Value, size_t ValueLen) { + return wrap( + unwrap(Builder)->createMacro(unwrapDI(ParentMacroFile), Line, + static_cast(RecordType), + {Name, NameLen}, {Value, ValueLen})); +} + +LLVMMetadataRef +LLVMDIBuilderCreateTempMacroFile(LLVMDIBuilderRef Builder, + LLVMMetadataRef ParentMacroFile, unsigned Line, + LLVMMetadataRef File) { + return wrap(unwrap(Builder)->createTempMacroFile( + unwrapDI(ParentMacroFile), Line, unwrapDI(File))); +} + LLVMMetadataRef LLVMDIBuilderCreateEnumerator(LLVMDIBuilderRef Builder, const char *Name, size_t NameLen, int64_t Value, diff --git a/lib/IR/DebugInfoMetadata.cpp b/lib/IR/DebugInfoMetadata.cpp index 900df27d1d33..94ec3abfa7a2 100644 --- a/lib/IR/DebugInfoMetadata.cpp +++ b/lib/IR/DebugInfoMetadata.cpp @@ -828,15 +828,23 @@ DIExpression *DIExpression::getImpl(LLVMContext &Context, } unsigned DIExpression::ExprOperand::getSize() const { - switch (getOp()) { + uint64_t Op = getOp(); + + if (Op >= dwarf::DW_OP_breg0 && Op <= dwarf::DW_OP_breg31) + return 2; + + switch (Op) { case dwarf::DW_OP_LLVM_convert: case dwarf::DW_OP_LLVM_fragment: + case dwarf::DW_OP_bregx: return 3; case dwarf::DW_OP_constu: + case dwarf::DW_OP_consts: case dwarf::DW_OP_deref_size: case dwarf::DW_OP_plus_uconst: case dwarf::DW_OP_LLVM_tag_offset: - case dwarf::DW_OP_entry_value: + case dwarf::DW_OP_LLVM_entry_value: + case dwarf::DW_OP_regx: return 2; default: return 1; @@ -849,8 +857,13 @@ bool DIExpression::isValid() const { if (I->get() + I->getSize() > E->get()) return false; + uint64_t Op = I->getOp(); + if ((Op >= dwarf::DW_OP_reg0 && Op <= dwarf::DW_OP_reg31) || + (Op >= dwarf::DW_OP_breg0 && Op <= dwarf::DW_OP_breg31)) + return true; + // Check that the operand is valid. 
- switch (I->getOp()) { + switch (Op) { default: return false; case dwarf::DW_OP_LLVM_fragment: @@ -877,10 +890,12 @@ bool DIExpression::isValid() const { return false; break; } - case dwarf::DW_OP_entry_value: { - // An entry value operator must appear at the begin and the size - // of following expression should be 1, because we support only - // entry values of a simple register location. + case dwarf::DW_OP_LLVM_entry_value: { + // An entry value operator must appear at the beginning and the number of + // operations it cover can currently only be 1, because we support only + // entry values of a simple register location. One reason for this is that + // we currently can't calculate the size of the resulting DWARF block for + // other expressions. return I->get() == expr_op_begin()->get() && I->getArg(0) == 1 && getNumElements() == 2; } @@ -905,6 +920,8 @@ bool DIExpression::isValid() const { case dwarf::DW_OP_lit0: case dwarf::DW_OP_not: case dwarf::DW_OP_dup: + case dwarf::DW_OP_regx: + case dwarf::DW_OP_bregx: break; } } @@ -1035,7 +1052,7 @@ DIExpression *DIExpression::prependOpcodes(const DIExpression *Expr, assert(Expr && "Can't prepend ops to this expression"); if (EntryValue) { - Ops.push_back(dwarf::DW_OP_entry_value); + Ops.push_back(dwarf::DW_OP_LLVM_entry_value); // Add size info needed for entry value expression. // Add plus one for target register operand. Ops.push_back(Expr->getNumElements() + 1); @@ -1146,6 +1163,7 @@ Optional DIExpression::createFragmentExpression( Op.appendToVector(Ops); } } + assert(Expr && "Unknown DIExpression"); Ops.push_back(dwarf::DW_OP_LLVM_fragment); Ops.push_back(OffsetInBits); Ops.push_back(SizeInBits); diff --git a/lib/IR/DiagnosticInfo.cpp b/lib/IR/DiagnosticInfo.cpp index 4a8e3cca3493..99d5aec3f043 100644 --- a/lib/IR/DiagnosticInfo.cpp +++ b/lib/IR/DiagnosticInfo.cpp @@ -370,5 +370,16 @@ std::string DiagnosticInfoOptimizationBase::getMsg() const { return OS.str(); } +DiagnosticInfoMisExpect::DiagnosticInfoMisExpect(const Instruction *Inst, + Twine &Msg) + : DiagnosticInfoWithLocationBase(DK_MisExpect, DS_Warning, + *Inst->getParent()->getParent(), + Inst->getDebugLoc()), + Msg(Msg) {} + +void DiagnosticInfoMisExpect::print(DiagnosticPrinter &DP) const { + DP << getLocationStr() << ": " << getMsg(); +} + void OptimizationRemarkAnalysisFPCommute::anchor() {} void OptimizationRemarkAnalysisAliasing::anchor() {} diff --git a/lib/IR/Function.cpp b/lib/IR/Function.cpp index dc28d22548dd..3f70d2c904e5 100644 --- a/lib/IR/Function.cpp +++ b/lib/IR/Function.cpp @@ -251,7 +251,7 @@ Function::Function(FunctionType *Ty, LinkageTypes Linkage, unsigned AddrSpace, // We only need a symbol table for a function if the context keeps value names if (!getContext().shouldDiscardValueNames()) - SymTab = make_unique(); + SymTab = std::make_unique(); // If the function has arguments, mark them as lazily built. if (Ty->getNumParams()) @@ -293,7 +293,8 @@ void Function::BuildLazyArguments() const { // Clear the lazy arguments bit. unsigned SDC = getSubclassDataFromValue(); - const_cast(this)->setValueSubclassData(SDC &= ~(1<<0)); + SDC &= ~(1 << 0); + const_cast(this)->setValueSubclassData(SDC); assert(!hasLazyArguments()); } @@ -611,9 +612,11 @@ static std::string getMangledTypeStr(Type* Ty) { Result += "vararg"; // Ensure nested function types are distinguishable. 
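The DW_OP_LLVM_entry_value handling above only accepts expressions where the operator comes first and covers a single operation. A small sketch of the accepted shape (Ctx is a placeholder LLVMContext; the helper name is invented for illustration):

    #include <cassert>
    #include "llvm/BinaryFormat/Dwarf.h"
    #include "llvm/IR/DebugInfoMetadata.h"
    using namespace llvm;

    DIExpression *makeEntryValueExpr(LLVMContext &Ctx) {
      // {DW_OP_LLVM_entry_value, 1}: "the value this register had on entry",
      // covering exactly one following operation (the register location itself).
      DIExpression *E =
          DIExpression::get(Ctx, {dwarf::DW_OP_LLVM_entry_value, 1});
      assert(E->isValid() && E->getNumElements() == 2);
      return E;
    }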
Result += "f"; - } else if (isa(Ty)) { - Result += "v" + utostr(Ty->getVectorNumElements()) + - getMangledTypeStr(Ty->getVectorElementType()); + } else if (VectorType* VTy = dyn_cast(Ty)) { + if (VTy->isScalable()) + Result += "nx"; + Result += "v" + utostr(VTy->getVectorNumElements()) + + getMangledTypeStr(VTy->getVectorElementType()); } else if (Ty) { switch (Ty->getTypeID()) { default: llvm_unreachable("Unhandled type"); @@ -700,7 +703,11 @@ enum IIT_Info { IIT_STRUCT7 = 39, IIT_STRUCT8 = 40, IIT_F128 = 41, - IIT_VEC_ELEMENT = 42 + IIT_VEC_ELEMENT = 42, + IIT_SCALABLE_VEC = 43, + IIT_SUBDIVIDE2_ARG = 44, + IIT_SUBDIVIDE4_ARG = 45, + IIT_VEC_OF_BITCASTS_TO_INT = 46 }; static void DecodeIITType(unsigned &NextElt, ArrayRef Infos, @@ -865,12 +872,36 @@ static void DecodeIITType(unsigned &NextElt, ArrayRef Infos, DecodeIITType(NextElt, Infos, OutputTable); return; } + case IIT_SUBDIVIDE2_ARG: { + unsigned ArgInfo = (NextElt == Infos.size() ? 0 : Infos[NextElt++]); + OutputTable.push_back(IITDescriptor::get(IITDescriptor::Subdivide2Argument, + ArgInfo)); + return; + } + case IIT_SUBDIVIDE4_ARG: { + unsigned ArgInfo = (NextElt == Infos.size() ? 0 : Infos[NextElt++]); + OutputTable.push_back(IITDescriptor::get(IITDescriptor::Subdivide4Argument, + ArgInfo)); + return; + } case IIT_VEC_ELEMENT: { unsigned ArgInfo = (NextElt == Infos.size() ? 0 : Infos[NextElt++]); OutputTable.push_back(IITDescriptor::get(IITDescriptor::VecElementArgument, ArgInfo)); return; } + case IIT_SCALABLE_VEC: { + OutputTable.push_back(IITDescriptor::get(IITDescriptor::ScalableVecArgument, + 0)); + DecodeIITType(NextElt, Infos, OutputTable); + return; + } + case IIT_VEC_OF_BITCASTS_TO_INT: { + unsigned ArgInfo = (NextElt == Infos.size() ? 0 : Infos[NextElt++]); + OutputTable.push_back(IITDescriptor::get(IITDescriptor::VecOfBitcastsToInt, + ArgInfo)); + return; + } } llvm_unreachable("unhandled"); } @@ -961,6 +992,14 @@ static Type *DecodeFixedType(ArrayRef &Infos, assert(ITy->getBitWidth() % 2 == 0); return IntegerType::get(Context, ITy->getBitWidth() / 2); } + case IITDescriptor::Subdivide2Argument: + case IITDescriptor::Subdivide4Argument: { + Type *Ty = Tys[D.getArgumentNumber()]; + VectorType *VTy = dyn_cast(Ty); + assert(VTy && "Expected an argument of Vector Type"); + int SubDivs = D.Kind == IITDescriptor::Subdivide2Argument ? 
1 : 2; + return VectorType::getSubdividedVectorType(VTy, SubDivs); + } case IITDescriptor::HalfVecArgument: return VectorType::getHalfElementsVectorType(cast( Tys[D.getArgumentNumber()])); @@ -968,7 +1007,7 @@ static Type *DecodeFixedType(ArrayRef &Infos, Type *EltTy = DecodeFixedType(Infos, Tys, Context); Type *Ty = Tys[D.getArgumentNumber()]; if (auto *VTy = dyn_cast(Ty)) - return VectorType::get(EltTy, VTy->getNumElements()); + return VectorType::get(EltTy, VTy->getElementCount()); return EltTy; } case IITDescriptor::PtrToArgument: { @@ -989,9 +1028,20 @@ static Type *DecodeFixedType(ArrayRef &Infos, return VTy->getElementType(); llvm_unreachable("Expected an argument of Vector Type"); } + case IITDescriptor::VecOfBitcastsToInt: { + Type *Ty = Tys[D.getArgumentNumber()]; + VectorType *VTy = dyn_cast(Ty); + assert(VTy && "Expected an argument of Vector Type"); + return VectorType::getInteger(VTy); + } case IITDescriptor::VecOfAnyPtrsToElt: // Return the overloaded type (which determines the pointers address space) return Tys[D.getOverloadArgNumber()]; + case IITDescriptor::ScalableVecArgument: { + Type *Ty = DecodeFixedType(Infos, Tys, Context); + return VectorType::get(Ty->getVectorElementType(), + { Ty->getVectorNumElements(), true }); + } } llvm_unreachable("unhandled"); } @@ -1174,8 +1224,9 @@ static bool matchIntrinsicType( } case IITDescriptor::HalfVecArgument: // If this is a forward reference, defer the check for later. - return D.getArgumentNumber() >= ArgTys.size() || - !isa(ArgTys[D.getArgumentNumber()]) || + if (D.getArgumentNumber() >= ArgTys.size()) + return IsDeferredCheck || DeferCheck(Ty); + return !isa(ArgTys[D.getArgumentNumber()]) || VectorType::getHalfElementsVectorType( cast(ArgTys[D.getArgumentNumber()])) != Ty; case IITDescriptor::SameVecWidthArgument: { @@ -1191,8 +1242,8 @@ static bool matchIntrinsicType( return true; Type *EltTy = Ty; if (ThisArgType) { - if (ReferenceType->getVectorNumElements() != - ThisArgType->getVectorNumElements()) + if (ReferenceType->getElementCount() != + ThisArgType->getElementCount()) return true; EltTy = ThisArgType->getVectorElementType(); } @@ -1255,6 +1306,36 @@ static bool matchIntrinsicType( auto *ReferenceType = dyn_cast(ArgTys[D.getArgumentNumber()]); return !ReferenceType || Ty != ReferenceType->getElementType(); } + case IITDescriptor::Subdivide2Argument: + case IITDescriptor::Subdivide4Argument: { + // If this is a forward reference, defer the check for later. + if (D.getArgumentNumber() >= ArgTys.size()) + return IsDeferredCheck || DeferCheck(Ty); + + Type *NewTy = ArgTys[D.getArgumentNumber()]; + if (auto *VTy = dyn_cast(NewTy)) { + int SubDivs = D.Kind == IITDescriptor::Subdivide2Argument ? 
1 : 2; + NewTy = VectorType::getSubdividedVectorType(VTy, SubDivs); + return Ty != NewTy; + } + return true; + } + case IITDescriptor::ScalableVecArgument: { + VectorType *VTy = dyn_cast(Ty); + if (!VTy || !VTy->isScalable()) + return true; + return matchIntrinsicType(VTy, Infos, ArgTys, DeferredChecks, + IsDeferredCheck); + } + case IITDescriptor::VecOfBitcastsToInt: { + if (D.getArgumentNumber() >= ArgTys.size()) + return IsDeferredCheck || DeferCheck(Ty); + auto *ReferenceType = dyn_cast(ArgTys[D.getArgumentNumber()]); + auto *ThisArgVecTy = dyn_cast(Ty); + if (!ThisArgVecTy || !ReferenceType) + return true; + return ThisArgVecTy != VectorType::getInteger(ReferenceType); + } } llvm_unreachable("unhandled"); } diff --git a/lib/IR/Globals.cpp b/lib/IR/Globals.cpp index e2bfc0420bc5..46a9696b2944 100644 --- a/lib/IR/Globals.cpp +++ b/lib/IR/Globals.cpp @@ -114,18 +114,22 @@ unsigned GlobalValue::getAddressSpace() const { } void GlobalObject::setAlignment(unsigned Align) { - assert((Align & (Align-1)) == 0 && "Alignment is not a power of 2!"); - assert(Align <= MaximumAlignment && + setAlignment(MaybeAlign(Align)); +} + +void GlobalObject::setAlignment(MaybeAlign Align) { + assert((!Align || Align <= MaximumAlignment) && "Alignment is greater than MaximumAlignment!"); - unsigned AlignmentData = Log2_32(Align) + 1; + unsigned AlignmentData = encode(Align); unsigned OldData = getGlobalValueSubClassData(); setGlobalValueSubClassData((OldData & ~AlignmentMask) | AlignmentData); - assert(getAlignment() == Align && "Alignment representation error!"); + assert(MaybeAlign(getAlignment()) == Align && + "Alignment representation error!"); } void GlobalObject::copyAttributesFrom(const GlobalObject *Src) { GlobalValue::copyAttributesFrom(Src); - setAlignment(Src->getAlignment()); + setAlignment(MaybeAlign(Src->getAlignment())); setSection(Src->getSection()); } @@ -427,6 +431,43 @@ GlobalIndirectSymbol::GlobalIndirectSymbol(Type *Ty, ValueTy VTy, Op<0>() = Symbol; } +static const GlobalObject * +findBaseObject(const Constant *C, DenseSet &Aliases) { + if (auto *GO = dyn_cast(C)) + return GO; + if (auto *GA = dyn_cast(C)) + if (Aliases.insert(GA).second) + return findBaseObject(GA->getOperand(0), Aliases); + if (auto *CE = dyn_cast(C)) { + switch (CE->getOpcode()) { + case Instruction::Add: { + auto *LHS = findBaseObject(CE->getOperand(0), Aliases); + auto *RHS = findBaseObject(CE->getOperand(1), Aliases); + if (LHS && RHS) + return nullptr; + return LHS ? 
LHS : RHS; + } + case Instruction::Sub: { + if (findBaseObject(CE->getOperand(1), Aliases)) + return nullptr; + return findBaseObject(CE->getOperand(0), Aliases); + } + case Instruction::IntToPtr: + case Instruction::PtrToInt: + case Instruction::BitCast: + case Instruction::GetElementPtr: + return findBaseObject(CE->getOperand(0), Aliases); + default: + break; + } + } + return nullptr; +} + +const GlobalObject *GlobalIndirectSymbol::getBaseObject() const { + DenseSet Aliases; + return findBaseObject(getOperand(0), Aliases); +} //===----------------------------------------------------------------------===// // GlobalAlias Implementation diff --git a/lib/IR/IRBuilder.cpp b/lib/IR/IRBuilder.cpp index 0c6461c9078f..b782012e9731 100644 --- a/lib/IR/IRBuilder.cpp +++ b/lib/IR/IRBuilder.cpp @@ -49,7 +49,7 @@ GlobalVariable *IRBuilderBase::CreateGlobalString(StringRef Str, nullptr, GlobalVariable::NotThreadLocal, AddressSpace); GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); - GV->setAlignment(1); + GV->setAlignment(Align::None()); return GV; } @@ -289,8 +289,10 @@ CallInst *IRBuilderBase::CreateElementUnorderedAtomicMemMove( CallInst *CI = createCallHelper(TheFn, Ops, this); // Set the alignment of the pointer args. - CI->addParamAttr(0, Attribute::getWithAlignment(CI->getContext(), DstAlign)); - CI->addParamAttr(1, Attribute::getWithAlignment(CI->getContext(), SrcAlign)); + CI->addParamAttr( + 0, Attribute::getWithAlignment(CI->getContext(), Align(DstAlign))); + CI->addParamAttr( + 1, Attribute::getWithAlignment(CI->getContext(), Align(SrcAlign))); // Set the TBAA info if present. if (TBAATag) diff --git a/lib/IR/IRPrintingPasses.cpp b/lib/IR/IRPrintingPasses.cpp index 35b06135a828..953cf9410162 100644 --- a/lib/IR/IRPrintingPasses.cpp +++ b/lib/IR/IRPrintingPasses.cpp @@ -26,14 +26,22 @@ PrintModulePass::PrintModulePass(raw_ostream &OS, const std::string &Banner, ShouldPreserveUseListOrder(ShouldPreserveUseListOrder) {} PreservedAnalyses PrintModulePass::run(Module &M, ModuleAnalysisManager &) { - if (!Banner.empty()) - OS << Banner << "\n"; - if (llvm::isFunctionInPrintList("*")) + if (llvm::isFunctionInPrintList("*")) { + if (!Banner.empty()) + OS << Banner << "\n"; M.print(OS, nullptr, ShouldPreserveUseListOrder); + } else { - for(const auto &F : M.functions()) - if (llvm::isFunctionInPrintList(F.getName())) + bool BannerPrinted = false; + for(const auto &F : M.functions()) { + if (llvm::isFunctionInPrintList(F.getName())) { + if (!BannerPrinted && !Banner.empty()) { + OS << Banner << "\n"; + BannerPrinted = true; + } F.print(OS); + } + } } return PreservedAnalyses::all(); } diff --git a/lib/IR/InlineAsm.cpp b/lib/IR/InlineAsm.cpp index 99da7caaccf0..fd732f9eda8b 100644 --- a/lib/IR/InlineAsm.cpp +++ b/lib/IR/InlineAsm.cpp @@ -181,6 +181,16 @@ bool InlineAsm::ConstraintInfo::Parse(StringRef Str, // FIXME: For now assuming these are 2-character constraints. pCodes->push_back(StringRef(I+1, 2)); I += 3; + } else if (*I == '@') { + // Multi-letter constraint + ++I; + unsigned char C = static_cast(*I); + assert(isdigit(C) && "Expected a digit!"); + int N = C - '0'; + assert(N > 0 && "Found a zero letter constraint!"); + ++I; + pCodes->push_back(StringRef(I, N)); + I += N; } else { // Single letter constraint. 
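The '@' branch added to the constraint parser above introduces an encoding of the form '@', one digit N, then exactly N characters. A sketch of what the parser is expected to produce (the constraint string "@3foo" is purely illustrative, not a real target constraint):

    #include "llvm/IR/InlineAsm.h"
    using namespace llvm;

    bool parsesMultiLetterCode() {
      InlineAsm::ConstraintInfoVector CV =
          InlineAsm::ParseConstraints("=r,@3foo");
      // The second operand should carry the three-character code "foo".
      return CV.size() == 2 && CV[1].Codes.size() == 1 &&
             CV[1].Codes[0] == "foo";
    }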
pCodes->push_back(StringRef(I, 1)); diff --git a/lib/IR/Instruction.cpp b/lib/IR/Instruction.cpp index ba5629d1662b..b157c7bb34bf 100644 --- a/lib/IR/Instruction.cpp +++ b/lib/IR/Instruction.cpp @@ -524,7 +524,7 @@ bool Instruction::mayReadFromMemory() const { case Instruction::Call: case Instruction::Invoke: case Instruction::CallBr: - return !cast(this)->doesNotAccessMemory(); + return !cast(this)->doesNotReadMemory(); case Instruction::Store: return !cast(this)->isUnordered(); } diff --git a/lib/IR/Instructions.cpp b/lib/IR/Instructions.cpp index 2e7cad103c12..245c7628b08e 100644 --- a/lib/IR/Instructions.cpp +++ b/lib/IR/Instructions.cpp @@ -38,6 +38,7 @@ #include "llvm/Support/Casting.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" +#include "llvm/Support/TypeSize.h" #include #include #include @@ -45,12 +46,6 @@ using namespace llvm; -static cl::opt SwitchInstProfUpdateWrapperStrict( - "switch-inst-prof-update-wrapper-strict", cl::Hidden, - cl::desc("Assert that prof branch_weights metadata is valid when creating " - "an instance of SwitchInstProfUpdateWrapper"), - cl::init(false)); - //===----------------------------------------------------------------------===// // AllocaInst Class //===----------------------------------------------------------------------===// @@ -822,6 +817,17 @@ void CallBrInst::init(FunctionType *FTy, Value *Fn, BasicBlock *Fallthrough, setName(NameStr); } +void CallBrInst::updateArgBlockAddresses(unsigned i, BasicBlock *B) { + assert(getNumIndirectDests() > i && "IndirectDest # out of range for callbr"); + if (BasicBlock *OldBB = getIndirectDest(i)) { + BlockAddress *Old = BlockAddress::get(OldBB); + BlockAddress *New = BlockAddress::get(B); + for (unsigned ArgNo = 0, e = getNumArgOperands(); ArgNo != e; ++ArgNo) + if (dyn_cast(getArgOperand(ArgNo)) == Old) + setArgOperand(ArgNo, New); + } +} + CallBrInst::CallBrInst(const CallBrInst &CBI) : CallBase(CBI.Attrs, CBI.FTy, CBI.getType(), Instruction::CallBr, OperandTraits::op_end(this) - CBI.getNumOperands(), @@ -1223,7 +1229,7 @@ AllocaInst::AllocaInst(Type *Ty, unsigned AddrSpace, Value *ArraySize, : UnaryInstruction(PointerType::get(Ty, AddrSpace), Alloca, getAISize(Ty->getContext(), ArraySize), InsertBefore), AllocatedType(Ty) { - setAlignment(Align); + setAlignment(MaybeAlign(Align)); assert(!Ty->isVoidTy() && "Cannot allocate void!"); setName(Name); } @@ -1234,18 +1240,21 @@ AllocaInst::AllocaInst(Type *Ty, unsigned AddrSpace, Value *ArraySize, : UnaryInstruction(PointerType::get(Ty, AddrSpace), Alloca, getAISize(Ty->getContext(), ArraySize), InsertAtEnd), AllocatedType(Ty) { - setAlignment(Align); + setAlignment(MaybeAlign(Align)); assert(!Ty->isVoidTy() && "Cannot allocate void!"); setName(Name); } -void AllocaInst::setAlignment(unsigned Align) { - assert((Align & (Align-1)) == 0 && "Alignment is not a power of 2!"); - assert(Align <= MaximumAlignment && +void AllocaInst::setAlignment(MaybeAlign Align) { + assert((!Align || *Align <= MaximumAlignment) && "Alignment is greater than MaximumAlignment!"); setInstructionSubclassData((getSubclassDataFromInstruction() & ~31) | - (Log2_32(Align) + 1)); - assert(getAlignment() == Align && "Alignment representation error!"); + encode(Align)); + if (Align) + assert(getAlignment() == Align->value() && + "Alignment representation error!"); + else + assert(getAlignment() == 0 && "Alignment representation error!"); } bool AllocaInst::isArrayAllocation() const { @@ -1287,36 +1296,36 @@ LoadInst::LoadInst(Type *Ty, Value *Ptr, const Twine 
&Name, LoadInst::LoadInst(Type *Ty, Value *Ptr, const Twine &Name, bool isVolatile, Instruction *InsertBef) - : LoadInst(Ty, Ptr, Name, isVolatile, /*Align=*/0, InsertBef) {} + : LoadInst(Ty, Ptr, Name, isVolatile, /*Align=*/None, InsertBef) {} LoadInst::LoadInst(Type *Ty, Value *Ptr, const Twine &Name, bool isVolatile, BasicBlock *InsertAE) - : LoadInst(Ty, Ptr, Name, isVolatile, /*Align=*/0, InsertAE) {} + : LoadInst(Ty, Ptr, Name, isVolatile, /*Align=*/None, InsertAE) {} LoadInst::LoadInst(Type *Ty, Value *Ptr, const Twine &Name, bool isVolatile, - unsigned Align, Instruction *InsertBef) + MaybeAlign Align, Instruction *InsertBef) : LoadInst(Ty, Ptr, Name, isVolatile, Align, AtomicOrdering::NotAtomic, SyncScope::System, InsertBef) {} LoadInst::LoadInst(Type *Ty, Value *Ptr, const Twine &Name, bool isVolatile, - unsigned Align, BasicBlock *InsertAE) + MaybeAlign Align, BasicBlock *InsertAE) : LoadInst(Ty, Ptr, Name, isVolatile, Align, AtomicOrdering::NotAtomic, SyncScope::System, InsertAE) {} LoadInst::LoadInst(Type *Ty, Value *Ptr, const Twine &Name, bool isVolatile, - unsigned Align, AtomicOrdering Order, - SyncScope::ID SSID, Instruction *InsertBef) + MaybeAlign Align, AtomicOrdering Order, SyncScope::ID SSID, + Instruction *InsertBef) : UnaryInstruction(Ty, Load, Ptr, InsertBef) { assert(Ty == cast(Ptr->getType())->getElementType()); setVolatile(isVolatile); - setAlignment(Align); + setAlignment(MaybeAlign(Align)); setAtomic(Order, SSID); AssertOK(); setName(Name); } LoadInst::LoadInst(Type *Ty, Value *Ptr, const Twine &Name, bool isVolatile, - unsigned Align, AtomicOrdering Order, SyncScope::ID SSID, + MaybeAlign Align, AtomicOrdering Order, SyncScope::ID SSID, BasicBlock *InsertAE) : UnaryInstruction(Ty, Load, Ptr, InsertAE) { assert(Ty == cast(Ptr->getType())->getElementType()); @@ -1327,13 +1336,16 @@ LoadInst::LoadInst(Type *Ty, Value *Ptr, const Twine &Name, bool isVolatile, setName(Name); } -void LoadInst::setAlignment(unsigned Align) { - assert((Align & (Align-1)) == 0 && "Alignment is not a power of 2!"); - assert(Align <= MaximumAlignment && +void LoadInst::setAlignment(MaybeAlign Align) { + assert((!Align || *Align <= MaximumAlignment) && "Alignment is greater than MaximumAlignment!"); setInstructionSubclassData((getSubclassDataFromInstruction() & ~(31 << 1)) | - ((Log2_32(Align)+1)<<1)); - assert(getAlignment() == Align && "Alignment representation error!"); + (encode(Align) << 1)); + if (Align) + assert(getAlignment() == Align->value() && + "Alignment representation error!"); + else + assert(getAlignment() == 0 && "Alignment representation error!"); } //===----------------------------------------------------------------------===// @@ -1359,30 +1371,28 @@ StoreInst::StoreInst(Value *val, Value *addr, BasicBlock *InsertAtEnd) StoreInst::StoreInst(Value *val, Value *addr, bool isVolatile, Instruction *InsertBefore) - : StoreInst(val, addr, isVolatile, /*Align=*/0, InsertBefore) {} + : StoreInst(val, addr, isVolatile, /*Align=*/None, InsertBefore) {} StoreInst::StoreInst(Value *val, Value *addr, bool isVolatile, BasicBlock *InsertAtEnd) - : StoreInst(val, addr, isVolatile, /*Align=*/0, InsertAtEnd) {} + : StoreInst(val, addr, isVolatile, /*Align=*/None, InsertAtEnd) {} -StoreInst::StoreInst(Value *val, Value *addr, bool isVolatile, unsigned Align, +StoreInst::StoreInst(Value *val, Value *addr, bool isVolatile, MaybeAlign Align, Instruction *InsertBefore) : StoreInst(val, addr, isVolatile, Align, AtomicOrdering::NotAtomic, SyncScope::System, InsertBefore) {} 
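A minimal sketch of driving the MaybeAlign-based LoadInst/StoreInst constructors above (Ty, Ptr, Val and InsertPt are placeholders for values a caller already has):

    #include "llvm/IR/Instructions.h"
    #include "llvm/Support/Alignment.h"
    using namespace llvm;

    Instruction *emitAlignedPair(Type *Ty, Value *Ptr, Value *Val,
                                 Instruction *InsertPt) {
      // Explicit 16-byte alignment on the load.
      auto *LI = new LoadInst(Ty, Ptr, "ld", /*isVolatile=*/false,
                              MaybeAlign(16), InsertPt);
      // Unknown alignment on the store: pass None rather than the old magic 0.
      new StoreInst(Val, Ptr, /*isVolatile=*/false, /*Align=*/None, InsertPt);
      return LI;
    }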
-StoreInst::StoreInst(Value *val, Value *addr, bool isVolatile, unsigned Align, +StoreInst::StoreInst(Value *val, Value *addr, bool isVolatile, MaybeAlign Align, BasicBlock *InsertAtEnd) : StoreInst(val, addr, isVolatile, Align, AtomicOrdering::NotAtomic, SyncScope::System, InsertAtEnd) {} -StoreInst::StoreInst(Value *val, Value *addr, bool isVolatile, - unsigned Align, AtomicOrdering Order, - SyncScope::ID SSID, +StoreInst::StoreInst(Value *val, Value *addr, bool isVolatile, MaybeAlign Align, + AtomicOrdering Order, SyncScope::ID SSID, Instruction *InsertBefore) - : Instruction(Type::getVoidTy(val->getContext()), Store, - OperandTraits::op_begin(this), - OperandTraits::operands(this), - InsertBefore) { + : Instruction(Type::getVoidTy(val->getContext()), Store, + OperandTraits::op_begin(this), + OperandTraits::operands(this), InsertBefore) { Op<0>() = val; Op<1>() = addr; setVolatile(isVolatile); @@ -1391,14 +1401,12 @@ StoreInst::StoreInst(Value *val, Value *addr, bool isVolatile, AssertOK(); } -StoreInst::StoreInst(Value *val, Value *addr, bool isVolatile, - unsigned Align, AtomicOrdering Order, - SyncScope::ID SSID, +StoreInst::StoreInst(Value *val, Value *addr, bool isVolatile, MaybeAlign Align, + AtomicOrdering Order, SyncScope::ID SSID, BasicBlock *InsertAtEnd) - : Instruction(Type::getVoidTy(val->getContext()), Store, - OperandTraits::op_begin(this), - OperandTraits::operands(this), - InsertAtEnd) { + : Instruction(Type::getVoidTy(val->getContext()), Store, + OperandTraits::op_begin(this), + OperandTraits::operands(this), InsertAtEnd) { Op<0>() = val; Op<1>() = addr; setVolatile(isVolatile); @@ -1407,13 +1415,16 @@ StoreInst::StoreInst(Value *val, Value *addr, bool isVolatile, AssertOK(); } -void StoreInst::setAlignment(unsigned Align) { - assert((Align & (Align-1)) == 0 && "Alignment is not a power of 2!"); - assert(Align <= MaximumAlignment && +void StoreInst::setAlignment(MaybeAlign Align) { + assert((!Align || *Align <= MaximumAlignment) && "Alignment is greater than MaximumAlignment!"); setInstructionSubclassData((getSubclassDataFromInstruction() & ~(31 << 1)) | - ((Log2_32(Align)+1) << 1)); - assert(getAlignment() == Align && "Alignment representation error!"); + (encode(Align) << 1)); + if (Align) + assert(getAlignment() == Align->value() && + "Alignment representation error!"); + else + assert(getAlignment() == 0 && "Alignment representation error!"); } //===----------------------------------------------------------------------===// @@ -1778,7 +1789,7 @@ ShuffleVectorInst::ShuffleVectorInst(Value *V1, Value *V2, Value *Mask, const Twine &Name, Instruction *InsertBefore) : Instruction(VectorType::get(cast(V1->getType())->getElementType(), - cast(Mask->getType())->getNumElements()), + cast(Mask->getType())->getElementCount()), ShuffleVector, OperandTraits::op_begin(this), OperandTraits::operands(this), @@ -1795,7 +1806,7 @@ ShuffleVectorInst::ShuffleVectorInst(Value *V1, Value *V2, Value *Mask, const Twine &Name, BasicBlock *InsertAtEnd) : Instruction(VectorType::get(cast(V1->getType())->getElementType(), - cast(Mask->getType())->getNumElements()), + cast(Mask->getType())->getElementCount()), ShuffleVector, OperandTraits::op_begin(this), OperandTraits::operands(this), @@ -2968,8 +2979,8 @@ bool CastInst::isCastable(Type *SrcTy, Type *DestTy) { } // Get the bit sizes, we'll need these - unsigned SrcBits = SrcTy->getPrimitiveSizeInBits(); // 0 for ptr - unsigned DestBits = DestTy->getPrimitiveSizeInBits(); // 0 for ptr + TypeSize SrcBits = SrcTy->getPrimitiveSizeInBits(); // 0 
for ptr + TypeSize DestBits = DestTy->getPrimitiveSizeInBits(); // 0 for ptr // Run through the possibilities ... if (DestTy->isIntegerTy()) { // Casting to integral @@ -3016,7 +3027,7 @@ bool CastInst::isBitCastable(Type *SrcTy, Type *DestTy) { if (VectorType *SrcVecTy = dyn_cast(SrcTy)) { if (VectorType *DestVecTy = dyn_cast(DestTy)) { - if (SrcVecTy->getNumElements() == DestVecTy->getNumElements()) { + if (SrcVecTy->getElementCount() == DestVecTy->getElementCount()) { // An element by element cast. Valid if casting the elements is valid. SrcTy = SrcVecTy->getElementType(); DestTy = DestVecTy->getElementType(); @@ -3030,12 +3041,12 @@ bool CastInst::isBitCastable(Type *SrcTy, Type *DestTy) { } } - unsigned SrcBits = SrcTy->getPrimitiveSizeInBits(); // 0 for ptr - unsigned DestBits = DestTy->getPrimitiveSizeInBits(); // 0 for ptr + TypeSize SrcBits = SrcTy->getPrimitiveSizeInBits(); // 0 for ptr + TypeSize DestBits = DestTy->getPrimitiveSizeInBits(); // 0 for ptr // Could still have vectors of pointers if the number of elements doesn't // match - if (SrcBits == 0 || DestBits == 0) + if (SrcBits.getKnownMinSize() == 0 || DestBits.getKnownMinSize() == 0) return false; if (SrcBits != DestBits) @@ -3886,7 +3897,7 @@ SwitchInstProfUpdateWrapper::getProfBranchWeightsMD(const SwitchInst &SI) { } MDNode *SwitchInstProfUpdateWrapper::buildProfBranchWeightsMD() { - assert(State == Changed && "called only if metadata has changed"); + assert(Changed && "called only if metadata has changed"); if (!Weights) return nullptr; @@ -3905,17 +3916,12 @@ MDNode *SwitchInstProfUpdateWrapper::buildProfBranchWeightsMD() { void SwitchInstProfUpdateWrapper::init() { MDNode *ProfileData = getProfBranchWeightsMD(SI); - if (!ProfileData) { - State = Initialized; + if (!ProfileData) return; - } if (ProfileData->getNumOperands() != SI.getNumSuccessors() + 1) { - State = Invalid; - if (SwitchInstProfUpdateWrapperStrict) - llvm_unreachable("number of prof branch_weights metadata operands does " - "not correspond to number of succesors"); - return; + llvm_unreachable("number of prof branch_weights metadata operands does " + "not correspond to number of succesors"); } SmallVector Weights; @@ -3924,7 +3930,6 @@ void SwitchInstProfUpdateWrapper::init() { uint32_t CW = C->getValue().getZExtValue(); Weights.push_back(CW); } - State = Initialized; this->Weights = std::move(Weights); } @@ -3933,7 +3938,7 @@ SwitchInstProfUpdateWrapper::removeCase(SwitchInst::CaseIt I) { if (Weights) { assert(SI.getNumSuccessors() == Weights->size() && "num of prof branch_weights must accord with num of successors"); - State = Changed; + Changed = true; // Copy the last case to the place of the removed one and shrink. // This is tightly coupled with the way SwitchInst::removeCase() removes // the cases in SwitchInst::removeCase(CaseIt). @@ -3948,15 +3953,12 @@ void SwitchInstProfUpdateWrapper::addCase( SwitchInstProfUpdateWrapper::CaseWeightOpt W) { SI.addCase(OnVal, Dest); - if (State == Invalid) - return; - if (!Weights && W && *W) { - State = Changed; + Changed = true; Weights = SmallVector(SI.getNumSuccessors(), 0); Weights.getValue()[SI.getNumSuccessors() - 1] = *W; } else if (Weights) { - State = Changed; + Changed = true; Weights.getValue().push_back(W ? *W : 0); } if (Weights) @@ -3967,11 +3969,9 @@ void SwitchInstProfUpdateWrapper::addCase( SymbolTableList::iterator SwitchInstProfUpdateWrapper::eraseFromParent() { // Instruction is erased. Mark as unchanged to not touch it in the destructor. 
- if (State != Invalid) { - State = Initialized; - if (Weights) - Weights->resize(0); - } + Changed = false; + if (Weights) + Weights->resize(0); return SI.eraseFromParent(); } @@ -3984,7 +3984,7 @@ SwitchInstProfUpdateWrapper::getSuccessorWeight(unsigned idx) { void SwitchInstProfUpdateWrapper::setSuccessorWeight( unsigned idx, SwitchInstProfUpdateWrapper::CaseWeightOpt W) { - if (!W || State == Invalid) + if (!W) return; if (!Weights && *W) @@ -3993,7 +3993,7 @@ void SwitchInstProfUpdateWrapper::setSuccessorWeight( if (Weights) { auto &OldW = Weights.getValue()[idx]; if (*W != OldW) { - State = Changed; + Changed = true; OldW = *W; } } @@ -4136,13 +4136,14 @@ AllocaInst *AllocaInst::cloneImpl() const { LoadInst *LoadInst::cloneImpl() const { return new LoadInst(getType(), getOperand(0), Twine(), isVolatile(), - getAlignment(), getOrdering(), getSyncScopeID()); + MaybeAlign(getAlignment()), getOrdering(), + getSyncScopeID()); } StoreInst *StoreInst::cloneImpl() const { return new StoreInst(getOperand(0), getOperand(1), isVolatile(), - getAlignment(), getOrdering(), getSyncScopeID()); - + MaybeAlign(getAlignment()), getOrdering(), + getSyncScopeID()); } AtomicCmpXchgInst *AtomicCmpXchgInst::cloneImpl() const { diff --git a/lib/IR/IntrinsicInst.cpp b/lib/IR/IntrinsicInst.cpp index 7a042326f67f..26ed46a9cd91 100644 --- a/lib/IR/IntrinsicInst.cpp +++ b/lib/IR/IntrinsicInst.cpp @@ -67,13 +67,12 @@ int llvm::Intrinsic::lookupLLVMIntrinsicByName(ArrayRef NameTable, // size 1. During the search, we can skip the prefix that we already know is // identical. By using strncmp we consider names with differing suffixes to // be part of the equal range. - size_t CmpStart = 0; size_t CmpEnd = 4; // Skip the "llvm" component. const char *const *Low = NameTable.begin(); const char *const *High = NameTable.end(); const char *const *LastLow = Low; while (CmpEnd < Name.size() && High - Low > 0) { - CmpStart = CmpEnd; + size_t CmpStart = CmpEnd; CmpEnd = Name.find('.', CmpStart + 1); CmpEnd = CmpEnd == StringRef::npos ? 
Name.size() : CmpEnd; auto Cmp = [CmpStart, CmpEnd](const char *LHS, const char *RHS) { @@ -107,7 +106,7 @@ Optional ConstrainedFPIntrinsic::getRoundingMode() const { unsigned NumOperands = getNumArgOperands(); Metadata *MD = - dyn_cast(getArgOperand(NumOperands - 2))->getMetadata(); + cast(getArgOperand(NumOperands - 2))->getMetadata(); if (!MD || !isa(MD)) return None; return StrToRoundingMode(cast(MD)->getString()); @@ -143,7 +142,7 @@ ConstrainedFPIntrinsic::RoundingModeToStr(RoundingMode UseRounding) { RoundingStr = "round.upward"; break; case ConstrainedFPIntrinsic::rmTowardZero: - RoundingStr = "round.tozero"; + RoundingStr = "round.towardzero"; break; } return RoundingStr; @@ -153,7 +152,7 @@ Optional ConstrainedFPIntrinsic::getExceptionBehavior() const { unsigned NumOperands = getNumArgOperands(); Metadata *MD = - dyn_cast(getArgOperand(NumOperands - 1))->getMetadata(); + cast(getArgOperand(NumOperands - 1))->getMetadata(); if (!MD || !isa(MD)) return None; return StrToExceptionBehavior(cast(MD)->getString()); @@ -189,6 +188,8 @@ bool ConstrainedFPIntrinsic::isUnaryOp() const { switch (getIntrinsicID()) { default: return false; + case Intrinsic::experimental_constrained_fptosi: + case Intrinsic::experimental_constrained_fptoui: case Intrinsic::experimental_constrained_fptrunc: case Intrinsic::experimental_constrained_fpext: case Intrinsic::experimental_constrained_sqrt: @@ -199,10 +200,14 @@ bool ConstrainedFPIntrinsic::isUnaryOp() const { case Intrinsic::experimental_constrained_log: case Intrinsic::experimental_constrained_log10: case Intrinsic::experimental_constrained_log2: + case Intrinsic::experimental_constrained_lrint: + case Intrinsic::experimental_constrained_llrint: case Intrinsic::experimental_constrained_rint: case Intrinsic::experimental_constrained_nearbyint: case Intrinsic::experimental_constrained_ceil: case Intrinsic::experimental_constrained_floor: + case Intrinsic::experimental_constrained_lround: + case Intrinsic::experimental_constrained_llround: case Intrinsic::experimental_constrained_round: case Intrinsic::experimental_constrained_trunc: return true; diff --git a/lib/IR/LLVMContext.cpp b/lib/IR/LLVMContext.cpp index e1cdf6b539db..5e8772186a2a 100644 --- a/lib/IR/LLVMContext.cpp +++ b/lib/IR/LLVMContext.cpp @@ -36,34 +36,9 @@ LLVMContext::LLVMContext() : pImpl(new LLVMContextImpl(*this)) { // Create the fixed metadata kinds. This is done in the same order as the // MD_* enum values so that they correspond. 
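A small sketch of the corrected rounding-mode string above ("round.towardzero", previously "round.tozero"), assuming StrToRoundingMode/RoundingModeToStr are the static helpers declared in IntrinsicInst.h:

    #include "llvm/ADT/StringRef.h"
    #include "llvm/IR/IntrinsicInst.h"
    using namespace llvm;

    bool towardZeroStringRoundTrips() {
      auto Str = ConstrainedFPIntrinsic::RoundingModeToStr(
          ConstrainedFPIntrinsic::rmTowardZero);
      auto RM = ConstrainedFPIntrinsic::StrToRoundingMode("round.towardzero");
      return Str == StringRef("round.towardzero") &&
             RM == ConstrainedFPIntrinsic::rmTowardZero;
    }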
std::pair MDKinds[] = { - {MD_dbg, "dbg"}, - {MD_tbaa, "tbaa"}, - {MD_prof, "prof"}, - {MD_fpmath, "fpmath"}, - {MD_range, "range"}, - {MD_tbaa_struct, "tbaa.struct"}, - {MD_invariant_load, "invariant.load"}, - {MD_alias_scope, "alias.scope"}, - {MD_noalias, "noalias"}, - {MD_nontemporal, "nontemporal"}, - {MD_mem_parallel_loop_access, "llvm.mem.parallel_loop_access"}, - {MD_nonnull, "nonnull"}, - {MD_dereferenceable, "dereferenceable"}, - {MD_dereferenceable_or_null, "dereferenceable_or_null"}, - {MD_make_implicit, "make.implicit"}, - {MD_unpredictable, "unpredictable"}, - {MD_invariant_group, "invariant.group"}, - {MD_align, "align"}, - {MD_loop, "llvm.loop"}, - {MD_type, "type"}, - {MD_section_prefix, "section_prefix"}, - {MD_absolute_symbol, "absolute_symbol"}, - {MD_associated, "associated"}, - {MD_callees, "callees"}, - {MD_irr_loop, "irr_loop"}, - {MD_access_group, "llvm.access.group"}, - {MD_callback, "callback"}, - {MD_preserve_access_index, "llvm.preserve.access.index"}, +#define LLVM_FIXED_MD_KIND(EnumID, Name, Value) {EnumID, Name}, +#include "llvm/IR/FixedMetadataKinds.def" +#undef LLVM_FIXED_MD_KIND }; for (auto &MDKind : MDKinds) { diff --git a/lib/IR/LLVMContextImpl.cpp b/lib/IR/LLVMContextImpl.cpp index c6ab2c6f213a..5f9782714170 100644 --- a/lib/IR/LLVMContextImpl.cpp +++ b/lib/IR/LLVMContextImpl.cpp @@ -21,7 +21,7 @@ using namespace llvm; LLVMContextImpl::LLVMContextImpl(LLVMContext &C) - : DiagHandler(llvm::make_unique()), + : DiagHandler(std::make_unique()), VoidTy(C, Type::VoidTyID), LabelTy(C, Type::LabelTyID), HalfTy(C, Type::HalfTyID), diff --git a/lib/IR/LegacyPassManager.cpp b/lib/IR/LegacyPassManager.cpp index c575d6e782b9..3a03c493100b 100644 --- a/lib/IR/LegacyPassManager.cpp +++ b/lib/IR/LegacyPassManager.cpp @@ -1680,7 +1680,6 @@ bool FPPassManager::runOnFunction(Function &F) { bool FPPassManager::runOnModule(Module &M) { bool Changed = false; - llvm::TimeTraceScope TimeScope("OptModule", M.getName()); for (Function &F : M) Changed |= runOnFunction(F); @@ -1999,10 +1998,28 @@ void FunctionPass::assignPassManager(PMStack &PMS, FPP->add(this); } +void BasicBlockPass::preparePassManager(PMStack &PMS) { + // Find BBPassManager + while (!PMS.empty() && + PMS.top()->getPassManagerType() > PMT_BasicBlockPassManager) + PMS.pop(); + + // If this pass is destroying high level information that is used + // by other passes that are managed by BBPM then do not insert + // this pass in current BBPM. Use new BBPassManager. + if (PMS.top()->getPassManagerType() == PMT_BasicBlockPassManager && + !PMS.top()->preserveHigherLevelAnalysis(this)) + PMS.pop(); +} + /// Find appropriate Basic Pass Manager or Call Graph Pass Manager /// in the PM Stack and add self into that manager. void BasicBlockPass::assignPassManager(PMStack &PMS, PassManagerType PreferredType) { + while (!PMS.empty() && + PMS.top()->getPassManagerType() > PMT_BasicBlockPassManager) + PMS.pop(); + BBPassManager *BBP; // Basic Pass Manager is a leaf pass manager. 
It does not handle @@ -2018,6 +2035,7 @@ void BasicBlockPass::assignPassManager(PMStack &PMS, // [1] Create new Basic Block Manager BBP = new BBPassManager(); + BBP->populateInheritedAnalysis(PMS); // [2] Set up new manager's top level manager // Basic Block Pass Manager does not live by itself diff --git a/lib/IR/MDBuilder.cpp b/lib/IR/MDBuilder.cpp index 14bcb3a29b07..7bdb85ace522 100644 --- a/lib/IR/MDBuilder.cpp +++ b/lib/IR/MDBuilder.cpp @@ -309,3 +309,15 @@ MDNode *MDBuilder::createIrrLoopHeaderWeight(uint64_t Weight) { }; return MDNode::get(Context, Vals); } + +MDNode *MDBuilder::createMisExpect(uint64_t Index, uint64_t LikleyWeight, + uint64_t UnlikleyWeight) { + auto *IntType = Type::getInt64Ty(Context); + Metadata *Vals[] = { + createString("misexpect"), + createConstant(ConstantInt::get(IntType, Index)), + createConstant(ConstantInt::get(IntType, LikleyWeight)), + createConstant(ConstantInt::get(IntType, UnlikleyWeight)), + }; + return MDNode::get(Context, Vals); +} diff --git a/lib/IR/Metadata.cpp b/lib/IR/Metadata.cpp index 748a2238e642..62c2aa86f3b0 100644 --- a/lib/IR/Metadata.cpp +++ b/lib/IR/Metadata.cpp @@ -1497,6 +1497,24 @@ void GlobalObject::addTypeMetadata(unsigned Offset, Metadata *TypeID) { TypeID})); } +void GlobalObject::addVCallVisibilityMetadata(VCallVisibility Visibility) { + addMetadata(LLVMContext::MD_vcall_visibility, + *MDNode::get(getContext(), + {ConstantAsMetadata::get(ConstantInt::get( + Type::getInt64Ty(getContext()), Visibility))})); +} + +GlobalObject::VCallVisibility GlobalObject::getVCallVisibility() const { + if (MDNode *MD = getMetadata(LLVMContext::MD_vcall_visibility)) { + uint64_t Val = cast( + cast(MD->getOperand(0))->getValue()) + ->getZExtValue(); + assert(Val <= 2 && "unknown vcall visibility!"); + return (VCallVisibility)Val; + } + return VCallVisibility::VCallVisibilityPublic; +} + void Function::setSubprogram(DISubprogram *SP) { setMetadata(LLVMContext::MD_dbg, SP); } diff --git a/lib/IR/Module.cpp b/lib/IR/Module.cpp index dbf4035ac7c1..25efd009194f 100644 --- a/lib/IR/Module.cpp +++ b/lib/IR/Module.cpp @@ -604,7 +604,7 @@ GlobalVariable *llvm::collectUsedGlobalVariables( const ConstantArray *Init = cast(GV->getInitializer()); for (Value *Op : Init->operands()) { - GlobalValue *G = cast(Op->stripPointerCastsNoFollowAliases()); + GlobalValue *G = cast(Op->stripPointerCasts()); Set.insert(G); } return GV; diff --git a/lib/IR/RemarkStreamer.cpp b/lib/IR/RemarkStreamer.cpp index 5b4c7e72b479..0fcc06b961f3 100644 --- a/lib/IR/RemarkStreamer.cpp +++ b/lib/IR/RemarkStreamer.cpp @@ -15,15 +15,17 @@ #include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/Function.h" #include "llvm/IR/GlobalValue.h" +#include "llvm/Remarks/BitstreamRemarkSerializer.h" #include "llvm/Remarks/RemarkFormat.h" +#include "llvm/Remarks/RemarkSerializer.h" using namespace llvm; -RemarkStreamer::RemarkStreamer(StringRef Filename, - std::unique_ptr Serializer) - : Filename(Filename), PassFilter(), Serializer(std::move(Serializer)) { - assert(!Filename.empty() && "This needs to be a real filename."); -} +RemarkStreamer::RemarkStreamer( + std::unique_ptr RemarkSerializer, + Optional FilenameIn) + : PassFilter(), RemarkSerializer(std::move(RemarkSerializer)), + Filename(FilenameIn ? Optional(FilenameIn->str()) : None) {} Error RemarkStreamer::setFilter(StringRef Filter) { Regex R = Regex(Filter); @@ -99,24 +101,13 @@ void RemarkStreamer::emit(const DiagnosticInfoOptimizationBase &Diag) { // First, convert the diagnostic to a remark. 
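For the new !vcall_visibility helpers above, a hedged sketch (VCallVisibilityLinkageUnit is assumed to be one of the GlobalObject::VCallVisibility enumerators alongside the VCallVisibilityPublic default shown in the hunk; GV is a placeholder vtable global):

    #include "llvm/IR/GlobalObject.h"
    using namespace llvm;

    void restrictVCallsToLinkageUnit(GlobalObject &GV) {
      // With no metadata attached, getVCallVisibility() reports Public.
      if (GV.getVCallVisibility() == GlobalObject::VCallVisibilityPublic)
        GV.addVCallVisibilityMetadata(GlobalObject::VCallVisibilityLinkageUnit);
    }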
remarks::Remark R = toRemark(Diag); // Then, emit the remark through the serializer. - Serializer->emit(R); + RemarkSerializer->emit(R); } char RemarkSetupFileError::ID = 0; char RemarkSetupPatternError::ID = 0; char RemarkSetupFormatError::ID = 0; -static std::unique_ptr -formatToSerializer(remarks::Format RemarksFormat, raw_ostream &OS) { - switch (RemarksFormat) { - default: - llvm_unreachable("Unknown remark serializer format."); - return nullptr; - case remarks::Format::YAML: - return llvm::make_unique(OS); - }; -} - Expected> llvm::setupOptimizationRemarks(LLVMContext &Context, StringRef RemarksFilename, StringRef RemarksPasses, StringRef RemarksFormat, @@ -131,24 +122,63 @@ llvm::setupOptimizationRemarks(LLVMContext &Context, StringRef RemarksFilename, if (RemarksFilename.empty()) return nullptr; + Expected Format = remarks::parseFormat(RemarksFormat); + if (Error E = Format.takeError()) + return make_error(std::move(E)); + std::error_code EC; + auto Flags = *Format == remarks::Format::YAML ? sys::fs::OF_Text + : sys::fs::OF_None; auto RemarksFile = - llvm::make_unique(RemarksFilename, EC, sys::fs::F_None); + std::make_unique(RemarksFilename, EC, Flags); // We don't use llvm::FileError here because some diagnostics want the file // name separately. if (EC) return make_error(errorCodeToError(EC)); + Expected> RemarkSerializer = + remarks::createRemarkSerializer( + *Format, remarks::SerializerMode::Separate, RemarksFile->os()); + if (Error E = RemarkSerializer.takeError()) + return make_error(std::move(E)); + + Context.setRemarkStreamer(std::make_unique( + std::move(*RemarkSerializer), RemarksFilename)); + + if (!RemarksPasses.empty()) + if (Error E = Context.getRemarkStreamer()->setFilter(RemarksPasses)) + return make_error(std::move(E)); + + return std::move(RemarksFile); +} + +Error llvm::setupOptimizationRemarks(LLVMContext &Context, raw_ostream &OS, + StringRef RemarksPasses, + StringRef RemarksFormat, + bool RemarksWithHotness, + unsigned RemarksHotnessThreshold) { + if (RemarksWithHotness) + Context.setDiagnosticsHotnessRequested(true); + + if (RemarksHotnessThreshold) + Context.setDiagnosticsHotnessThreshold(RemarksHotnessThreshold); + Expected Format = remarks::parseFormat(RemarksFormat); if (Error E = Format.takeError()) return make_error(std::move(E)); - Context.setRemarkStreamer(llvm::make_unique( - RemarksFilename, formatToSerializer(*Format, RemarksFile->os()))); + Expected> RemarkSerializer = + remarks::createRemarkSerializer(*Format, + remarks::SerializerMode::Separate, OS); + if (Error E = RemarkSerializer.takeError()) + return make_error(std::move(E)); + + Context.setRemarkStreamer( + std::make_unique(std::move(*RemarkSerializer))); if (!RemarksPasses.empty()) if (Error E = Context.getRemarkStreamer()->setFilter(RemarksPasses)) return make_error(std::move(E)); - return std::move(RemarksFile); + return Error::success(); } diff --git a/lib/IR/SafepointIRVerifier.cpp b/lib/IR/SafepointIRVerifier.cpp index 7f3dea5e6a6d..c90347ec48fd 100644 --- a/lib/IR/SafepointIRVerifier.cpp +++ b/lib/IR/SafepointIRVerifier.cpp @@ -102,11 +102,11 @@ public: } bool isDeadEdge(const Use *U) const { - assert(dyn_cast(U->getUser())->isTerminator() && + assert(cast(U->getUser())->isTerminator() && "edge must be operand of terminator"); assert(cast_or_null(U->get()) && "edge must refer to basic block"); - assert(!isDeadBlock(dyn_cast(U->getUser())->getParent()) && + assert(!isDeadBlock(cast(U->getUser())->getParent()) && "isDeadEdge() must be applied to edge from live block"); return 
DeadEdges.count(U); } diff --git a/lib/IR/Type.cpp b/lib/IR/Type.cpp index 8ece7f223dd2..3eab5042b542 100644 --- a/lib/IR/Type.cpp +++ b/lib/IR/Type.cpp @@ -26,6 +26,7 @@ #include "llvm/Support/Casting.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Support/TypeSize.h" #include #include @@ -111,18 +112,22 @@ bool Type::isEmptyTy() const { return false; } -unsigned Type::getPrimitiveSizeInBits() const { +TypeSize Type::getPrimitiveSizeInBits() const { switch (getTypeID()) { - case Type::HalfTyID: return 16; - case Type::FloatTyID: return 32; - case Type::DoubleTyID: return 64; - case Type::X86_FP80TyID: return 80; - case Type::FP128TyID: return 128; - case Type::PPC_FP128TyID: return 128; - case Type::X86_MMXTyID: return 64; - case Type::IntegerTyID: return cast(this)->getBitWidth(); - case Type::VectorTyID: return cast(this)->getBitWidth(); - default: return 0; + case Type::HalfTyID: return TypeSize::Fixed(16); + case Type::FloatTyID: return TypeSize::Fixed(32); + case Type::DoubleTyID: return TypeSize::Fixed(64); + case Type::X86_FP80TyID: return TypeSize::Fixed(80); + case Type::FP128TyID: return TypeSize::Fixed(128); + case Type::PPC_FP128TyID: return TypeSize::Fixed(128); + case Type::X86_MMXTyID: return TypeSize::Fixed(64); + case Type::IntegerTyID: + return TypeSize::Fixed(cast(this)->getBitWidth()); + case Type::VectorTyID: { + const VectorType *VTy = cast(this); + return TypeSize(VTy->getBitWidth(), VTy->isScalable()); + } + default: return TypeSize::Fixed(0); } } diff --git a/lib/IR/Value.cpp b/lib/IR/Value.cpp index b7f77dc3043e..3c8a5b536695 100644 --- a/lib/IR/Value.cpp +++ b/lib/IR/Value.cpp @@ -444,15 +444,11 @@ void Value::replaceUsesOutsideBlock(Value *New, BasicBlock *BB) { "replaceUses of value with new value of different type!"); assert(BB && "Basic block that may contain a use of 'New' must be defined\n"); - use_iterator UI = use_begin(), E = use_end(); - for (; UI != E;) { - Use &U = *UI; - ++UI; - auto *Usr = dyn_cast(U.getUser()); - if (Usr && Usr->getParent() == BB) - continue; - U.set(New); - } + replaceUsesWithIf(New, [BB](Use &U) { + auto *I = dyn_cast(U.getUser()); + // Don't replace if it's an instruction in the BB basic block. 
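Since getPrimitiveSizeInBits() above now returns a TypeSize rather than an unsigned, callers that may see scalable vectors compare the known minimum size instead of an exact bit count. A minimal sketch (fitsInMinBits is invented for illustration):

    #include "llvm/IR/Type.h"
    #include "llvm/Support/TypeSize.h"
    using namespace llvm;

    bool fitsInMinBits(Type *Ty, uint64_t Bits) {
      TypeSize TS = Ty->getPrimitiveSizeInBits();
      // <4 x i32> yields a fixed 128; <vscale x 4 x i32> yields a minimum of
      // 128 with the scalable flag set, so only the minimum is compared here.
      return TS.getKnownMinSize() <= Bits;
    }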
+ return !I || I->getParent() != BB; + }); } namespace { @@ -460,8 +456,8 @@ namespace { enum PointerStripKind { PSK_ZeroIndices, PSK_ZeroIndicesAndAliases, - PSK_ZeroIndicesAndAliasesSameRepresentation, - PSK_ZeroIndicesAndAliasesAndInvariantGroups, + PSK_ZeroIndicesSameRepresentation, + PSK_ZeroIndicesAndInvariantGroups, PSK_InBoundsConstantIndices, PSK_InBounds }; @@ -479,10 +475,10 @@ static const Value *stripPointerCastsAndOffsets(const Value *V) { do { if (auto *GEP = dyn_cast(V)) { switch (StripKind) { - case PSK_ZeroIndicesAndAliases: - case PSK_ZeroIndicesAndAliasesSameRepresentation: - case PSK_ZeroIndicesAndAliasesAndInvariantGroups: case PSK_ZeroIndices: + case PSK_ZeroIndicesAndAliases: + case PSK_ZeroIndicesSameRepresentation: + case PSK_ZeroIndicesAndInvariantGroups: if (!GEP->hasAllZeroIndices()) return V; break; @@ -498,15 +494,13 @@ static const Value *stripPointerCastsAndOffsets(const Value *V) { V = GEP->getPointerOperand(); } else if (Operator::getOpcode(V) == Instruction::BitCast) { V = cast(V)->getOperand(0); - } else if (StripKind != PSK_ZeroIndicesAndAliasesSameRepresentation && + } else if (StripKind != PSK_ZeroIndicesSameRepresentation && Operator::getOpcode(V) == Instruction::AddrSpaceCast) { // TODO: If we know an address space cast will not change the // representation we could look through it here as well. V = cast(V)->getOperand(0); - } else if (auto *GA = dyn_cast(V)) { - if (StripKind == PSK_ZeroIndices || GA->isInterposable()) - return V; - V = GA->getAliasee(); + } else if (StripKind == PSK_ZeroIndicesAndAliases && isa(V)) { + V = cast(V)->getAliasee(); } else { if (const auto *Call = dyn_cast(V)) { if (const Value *RV = Call->getReturnedArgOperand()) { @@ -516,7 +510,7 @@ static const Value *stripPointerCastsAndOffsets(const Value *V) { // The result of launder.invariant.group must alias it's argument, // but it can't be marked with returned attribute, that's why it needs // special case. 
- if (StripKind == PSK_ZeroIndicesAndAliasesAndInvariantGroups && + if (StripKind == PSK_ZeroIndicesAndInvariantGroups && (Call->getIntrinsicID() == Intrinsic::launder_invariant_group || Call->getIntrinsicID() == Intrinsic::strip_invariant_group)) { V = Call->getArgOperand(0); @@ -533,16 +527,15 @@ static const Value *stripPointerCastsAndOffsets(const Value *V) { } // end anonymous namespace const Value *Value::stripPointerCasts() const { - return stripPointerCastsAndOffsets(this); + return stripPointerCastsAndOffsets(this); } -const Value *Value::stripPointerCastsSameRepresentation() const { - return stripPointerCastsAndOffsets< - PSK_ZeroIndicesAndAliasesSameRepresentation>(this); +const Value *Value::stripPointerCastsAndAliases() const { + return stripPointerCastsAndOffsets(this); } -const Value *Value::stripPointerCastsNoFollowAliases() const { - return stripPointerCastsAndOffsets(this); +const Value *Value::stripPointerCastsSameRepresentation() const { + return stripPointerCastsAndOffsets(this); } const Value *Value::stripInBoundsConstantOffsets() const { @@ -550,8 +543,7 @@ const Value *Value::stripInBoundsConstantOffsets() const { } const Value *Value::stripPointerCastsAndInvariantGroups() const { - return stripPointerCastsAndOffsets( - this); + return stripPointerCastsAndOffsets(this); } const Value * @@ -650,6 +642,19 @@ uint64_t Value::getPointerDereferenceableBytes(const DataLayout &DL, } CanBeNull = true; } + } else if (auto *IP = dyn_cast(this)) { + if (MDNode *MD = IP->getMetadata(LLVMContext::MD_dereferenceable)) { + ConstantInt *CI = mdconst::extract(MD->getOperand(0)); + DerefBytes = CI->getLimitedValue(); + } + if (DerefBytes == 0) { + if (MDNode *MD = + IP->getMetadata(LLVMContext::MD_dereferenceable_or_null)) { + ConstantInt *CI = mdconst::extract(MD->getOperand(0)); + DerefBytes = CI->getLimitedValue(); + } + CanBeNull = true; + } } else if (auto *AI = dyn_cast(this)) { if (!AI->isArrayAllocation()) { DerefBytes = DL.getTypeStoreSize(AI->getAllocatedType()); @@ -666,21 +671,21 @@ uint64_t Value::getPointerDereferenceableBytes(const DataLayout &DL, return DerefBytes; } -unsigned Value::getPointerAlignment(const DataLayout &DL) const { +MaybeAlign Value::getPointerAlignment(const DataLayout &DL) const { assert(getType()->isPointerTy() && "must be pointer"); - - unsigned Align = 0; if (auto *GO = dyn_cast(this)) { if (isa(GO)) { + const MaybeAlign FunctionPtrAlign = DL.getFunctionPtrAlign(); switch (DL.getFunctionPtrAlignType()) { case DataLayout::FunctionPtrAlignType::Independent: - return DL.getFunctionPtrAlign(); + return FunctionPtrAlign; case DataLayout::FunctionPtrAlignType::MultipleOfFunctionAlign: - return std::max(DL.getFunctionPtrAlign(), GO->getAlignment()); + return std::max(FunctionPtrAlign, MaybeAlign(GO->getAlignment())); } + llvm_unreachable("Unhandled FunctionPtrAlignType"); } - Align = GO->getAlignment(); - if (Align == 0) { + const MaybeAlign Alignment(GO->getAlignment()); + if (!Alignment) { if (auto *GVar = dyn_cast(GO)) { Type *ObjectType = GVar->getValueType(); if (ObjectType->isSized()) { @@ -688,37 +693,43 @@ unsigned Value::getPointerAlignment(const DataLayout &DL) const { // it the preferred alignment. Otherwise, we have to assume that it // may only have the minimum ABI alignment. 
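A short sketch of the renamed stripping helpers above: stripPointerCasts() no longer follows aliases, and the old alias-following behaviour now lives in stripPointerCastsAndAliases() (V is a placeholder pointer value):

    #include "llvm/IR/Value.h"
    using namespace llvm;

    const Value *baseNoAliases(const Value *V) {
      // Looks through bitcasts, all-zero GEPs and address space casts only.
      return V->stripPointerCasts();
    }
    const Value *baseThroughAliases(const Value *V) {
      // Additionally looks through GlobalAlias definitions.
      return V->stripPointerCastsAndAliases();
    }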
if (GVar->isStrongDefinitionForLinker()) - Align = DL.getPreferredAlignment(GVar); + return MaybeAlign(DL.getPreferredAlignment(GVar)); else - Align = DL.getABITypeAlignment(ObjectType); + return Align(DL.getABITypeAlignment(ObjectType)); } } } + return Alignment; } else if (const Argument *A = dyn_cast(this)) { - Align = A->getParamAlignment(); - - if (!Align && A->hasStructRetAttr()) { + const MaybeAlign Alignment(A->getParamAlignment()); + if (!Alignment && A->hasStructRetAttr()) { // An sret parameter has at least the ABI alignment of the return type. Type *EltTy = cast(A->getType())->getElementType(); if (EltTy->isSized()) - Align = DL.getABITypeAlignment(EltTy); + return Align(DL.getABITypeAlignment(EltTy)); } + return Alignment; } else if (const AllocaInst *AI = dyn_cast(this)) { - Align = AI->getAlignment(); - if (Align == 0) { + const MaybeAlign Alignment(AI->getAlignment()); + if (!Alignment) { Type *AllocatedType = AI->getAllocatedType(); if (AllocatedType->isSized()) - Align = DL.getPrefTypeAlignment(AllocatedType); + return MaybeAlign(DL.getPrefTypeAlignment(AllocatedType)); } - } else if (const auto *Call = dyn_cast(this)) - Align = Call->getAttributes().getRetAlignment(); - else if (const LoadInst *LI = dyn_cast(this)) + return Alignment; + } else if (const auto *Call = dyn_cast(this)) { + const MaybeAlign Alignment(Call->getRetAlignment()); + if (!Alignment && Call->getCalledFunction()) + return MaybeAlign( + Call->getCalledFunction()->getAttributes().getRetAlignment()); + return Alignment; + } else if (const LoadInst *LI = dyn_cast(this)) { if (MDNode *MD = LI->getMetadata(LLVMContext::MD_align)) { ConstantInt *CI = mdconst::extract(MD->getOperand(0)); - Align = CI->getLimitedValue(); + return MaybeAlign(CI->getLimitedValue()); } - - return Align; + } + return llvm::None; } const Value *Value::DoPHITranslation(const BasicBlock *CurBB, diff --git a/lib/IR/Verifier.cpp b/lib/IR/Verifier.cpp index 9346c8bda75d..b17fc433ed74 100644 --- a/lib/IR/Verifier.cpp +++ b/lib/IR/Verifier.cpp @@ -119,6 +119,7 @@ struct VerifierSupport { raw_ostream *OS; const Module &M; ModuleSlotTracker MST; + Triple TT; const DataLayout &DL; LLVMContext &Context; @@ -130,7 +131,8 @@ struct VerifierSupport { bool TreatBrokenDebugInfoAsError = true; explicit VerifierSupport(raw_ostream *OS, const Module &M) - : OS(OS), M(M), MST(&M), DL(M.getDataLayout()), Context(M.getContext()) {} + : OS(OS), M(M), MST(&M), TT(M.getTargetTriple()), DL(M.getDataLayout()), + Context(M.getContext()) {} private: void Write(const Module *M) { @@ -416,6 +418,7 @@ private: void visitBasicBlock(BasicBlock &BB); void visitRangeMetadata(Instruction &I, MDNode *Range, Type *Ty); void visitDereferenceableMetadata(Instruction &I, MDNode *MD); + void visitProfMetadata(Instruction &I, MDNode *MD); template bool isValidMetadataArray(const MDTuple &N); #define HANDLE_SPECIALIZED_MDNODE_LEAF(CLASS) void visit##CLASS(const CLASS &N); @@ -515,6 +518,7 @@ private: DIExpression::FragmentInfo Fragment, ValueOrMetadata *Desc); void verifyFnArgs(const DbgVariableIntrinsic &I); + void verifyNotEntryValue(const DbgVariableIntrinsic &I); /// Module-level debug info verification... 
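Because getPointerAlignment() above now returns MaybeAlign, an unknown alignment is reported as an empty optional rather than 0. A minimal handling sketch (knownAlignOrOne is invented for illustration):

    #include "llvm/IR/DataLayout.h"
    #include "llvm/IR/Value.h"
    #include "llvm/Support/Alignment.h"
    using namespace llvm;

    uint64_t knownAlignOrOne(const Value *Ptr, const DataLayout &DL) {
      if (MaybeAlign A = Ptr->getPointerAlignment(DL))
        return A->value();   // a known power-of-two alignment
      return 1;              // unknown: assume only byte alignment
    }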
void verifyCompileUnits(); @@ -670,7 +674,7 @@ void Verifier::visitGlobalVariable(const GlobalVariable &GV) { Assert(InitArray, "wrong initalizer for intrinsic global variable", Init); for (Value *Op : InitArray->operands()) { - Value *V = Op->stripPointerCastsNoFollowAliases(); + Value *V = Op->stripPointerCasts(); Assert(isa(V) || isa(V) || isa(V), "invalid llvm.used member", V); @@ -979,6 +983,9 @@ void Verifier::visitDICompositeType(const DICompositeType &N) { N.getRawVTableHolder()); AssertDI(!hasConflictingReferenceFlags(N.getFlags()), "invalid reference flags", &N); + unsigned DIBlockByRefStruct = 1 << 4; + AssertDI((N.getFlags() & DIBlockByRefStruct) == 0, + "DIBlockByRefStruct on DICompositeType is no longer supported", &N); if (N.isVector()) { const DINodeArray Elements = N.getElements(); @@ -1306,11 +1313,12 @@ void Verifier::visitDIImportedEntity(const DIImportedEntity &N) { } void Verifier::visitComdat(const Comdat &C) { - // The Module is invalid if the GlobalValue has private linkage. Entities - // with private linkage don't have entries in the symbol table. - if (const GlobalValue *GV = M.getNamedValue(C.getName())) - Assert(!GV->hasPrivateLinkage(), "comdat global value has private linkage", - GV); + // In COFF the Module is invalid if the GlobalValue has private linkage. + // Entities with private linkage don't have entries in the symbol table. + if (TT.isOSBinFormatCOFF()) + if (const GlobalValue *GV = M.getNamedValue(C.getName())) + Assert(!GV->hasPrivateLinkage(), + "comdat global value has private linkage", GV); } void Verifier::visitModuleIdents(const Module &M) { @@ -2497,6 +2505,15 @@ void Verifier::visitCallBrInst(CallBrInst &CBI) { Assert(CBI.getOperand(i) != CBI.getOperand(j), "Duplicate callbr destination!", &CBI); } + { + SmallPtrSet ArgBBs; + for (Value *V : CBI.args()) + if (auto *BA = dyn_cast(V)) + ArgBBs.insert(BA->getBasicBlock()); + for (BasicBlock *BB : CBI.getIndirectDests()) + Assert(ArgBBs.find(BB) != ArgBBs.end(), + "Indirect label missing from arglist.", &CBI); + } visitTerminator(CBI); } @@ -2715,8 +2732,8 @@ void Verifier::visitPtrToIntInst(PtrToIntInst &I) { &I); if (SrcTy->isVectorTy()) { - VectorType *VSrc = dyn_cast(SrcTy); - VectorType *VDest = dyn_cast(DestTy); + VectorType *VSrc = cast(SrcTy); + VectorType *VDest = cast(DestTy); Assert(VSrc->getNumElements() == VDest->getNumElements(), "PtrToInt Vector width mismatch", &I); } @@ -2740,8 +2757,8 @@ void Verifier::visitIntToPtrInst(IntToPtrInst &I) { Assert(SrcTy->isVectorTy() == DestTy->isVectorTy(), "IntToPtr type mismatch", &I); if (SrcTy->isVectorTy()) { - VectorType *VSrc = dyn_cast(SrcTy); - VectorType *VDest = dyn_cast(DestTy); + VectorType *VSrc = cast(SrcTy); + VectorType *VDest = cast(DestTy); Assert(VSrc->getNumElements() == VDest->getNumElements(), "IntToPtr Vector width mismatch", &I); } @@ -3983,9 +4000,9 @@ void Verifier::verifyDominatesUse(Instruction &I, unsigned i) { void Verifier::visitDereferenceableMetadata(Instruction& I, MDNode* MD) { Assert(I.getType()->isPointerTy(), "dereferenceable, dereferenceable_or_null " "apply only to pointer types", &I); - Assert(isa(I), + Assert((isa(I) || isa(I)), "dereferenceable, dereferenceable_or_null apply only to load" - " instructions, use attributes for calls or invokes", &I); + " and inttoptr instructions, use attributes for calls or invokes", &I); Assert(MD->getNumOperands() == 1, "dereferenceable, dereferenceable_or_null " "take one operand!", &I); ConstantInt *CI = mdconst::dyn_extract(MD->getOperand(0)); @@ -3993,6 +4010,45 @@ 
void Verifier::visitDereferenceableMetadata(Instruction& I, MDNode* MD) { "dereferenceable_or_null metadata value must be an i64!", &I); } +void Verifier::visitProfMetadata(Instruction &I, MDNode *MD) { + Assert(MD->getNumOperands() >= 2, + "!prof annotations should have no less than 2 operands", MD); + + // Check first operand. + Assert(MD->getOperand(0) != nullptr, "first operand should not be null", MD); + Assert(isa(MD->getOperand(0)), + "expected string with name of the !prof annotation", MD); + MDString *MDS = cast(MD->getOperand(0)); + StringRef ProfName = MDS->getString(); + + // Check consistency of !prof branch_weights metadata. + if (ProfName.equals("branch_weights")) { + unsigned ExpectedNumOperands = 0; + if (BranchInst *BI = dyn_cast(&I)) + ExpectedNumOperands = BI->getNumSuccessors(); + else if (SwitchInst *SI = dyn_cast(&I)) + ExpectedNumOperands = SI->getNumSuccessors(); + else if (isa(&I) || isa(&I)) + ExpectedNumOperands = 1; + else if (IndirectBrInst *IBI = dyn_cast(&I)) + ExpectedNumOperands = IBI->getNumDestinations(); + else if (isa(&I)) + ExpectedNumOperands = 2; + else + CheckFailed("!prof branch_weights are not allowed for this instruction", + MD); + + Assert(MD->getNumOperands() == 1 + ExpectedNumOperands, + "Wrong number of operands", MD); + for (unsigned i = 1; i < MD->getNumOperands(); ++i) { + auto &MDO = MD->getOperand(i); + Assert(MDO, "second operand should not be null", MD); + Assert(mdconst::dyn_extract(MDO), + "!prof branch_weights operand is not a const int"); + } + } +} + /// verifyInstruction - Verify that an instruction is well formed. /// void Verifier::visitInstruction(Instruction &I) { @@ -4150,13 +4206,18 @@ void Verifier::visitInstruction(Instruction &I) { "alignment is larger that implementation defined limit", &I); } + if (MDNode *MD = I.getMetadata(LLVMContext::MD_prof)) + visitProfMetadata(I, MD); + if (MDNode *N = I.getDebugLoc().getAsMDNode()) { AssertDI(isa(N), "invalid !dbg metadata attachment", &I, N); visitMDNode(*N); } - if (auto *DII = dyn_cast(&I)) + if (auto *DII = dyn_cast(&I)) { verifyFragmentExpression(*DII); + verifyNotEntryValue(*DII); + } InstsInThisBlock.insert(&I); } @@ -4236,6 +4297,8 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) { case Intrinsic::experimental_constrained_fdiv: case Intrinsic::experimental_constrained_frem: case Intrinsic::experimental_constrained_fma: + case Intrinsic::experimental_constrained_fptosi: + case Intrinsic::experimental_constrained_fptoui: case Intrinsic::experimental_constrained_fptrunc: case Intrinsic::experimental_constrained_fpext: case Intrinsic::experimental_constrained_sqrt: @@ -4248,12 +4311,16 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) { case Intrinsic::experimental_constrained_log: case Intrinsic::experimental_constrained_log10: case Intrinsic::experimental_constrained_log2: + case Intrinsic::experimental_constrained_lrint: + case Intrinsic::experimental_constrained_llrint: case Intrinsic::experimental_constrained_rint: case Intrinsic::experimental_constrained_nearbyint: case Intrinsic::experimental_constrained_maxnum: case Intrinsic::experimental_constrained_minnum: case Intrinsic::experimental_constrained_ceil: case Intrinsic::experimental_constrained_floor: + case Intrinsic::experimental_constrained_lround: + case Intrinsic::experimental_constrained_llround: case Intrinsic::experimental_constrained_round: case Intrinsic::experimental_constrained_trunc: visitConstrainedFPIntrinsic(cast(Call)); @@ -4623,7 +4690,8 @@ void
Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) { } case Intrinsic::smul_fix: case Intrinsic::smul_fix_sat: - case Intrinsic::umul_fix: { + case Intrinsic::umul_fix: + case Intrinsic::umul_fix_sat: { Value *Op1 = Call.getArgOperand(0); Value *Op2 = Call.getArgOperand(1); Assert(Op1->getType()->isIntOrIntVectorTy(), @@ -4705,6 +4773,31 @@ void Verifier::visitConstrainedFPIntrinsic(ConstrainedFPIntrinsic &FPI) { HasRoundingMD = true; break; + case Intrinsic::experimental_constrained_lrint: + case Intrinsic::experimental_constrained_llrint: { + Assert((NumOperands == 3), "invalid arguments for constrained FP intrinsic", + &FPI); + Type *ValTy = FPI.getArgOperand(0)->getType(); + Type *ResultTy = FPI.getType(); + Assert(!ValTy->isVectorTy() && !ResultTy->isVectorTy(), + "Intrinsic does not support vectors", &FPI); + HasExceptionMD = true; + HasRoundingMD = true; + } + break; + + case Intrinsic::experimental_constrained_lround: + case Intrinsic::experimental_constrained_llround: { + Assert((NumOperands == 2), "invalid arguments for constrained FP intrinsic", + &FPI); + Type *ValTy = FPI.getArgOperand(0)->getType(); + Type *ResultTy = FPI.getType(); + Assert(!ValTy->isVectorTy() && !ResultTy->isVectorTy(), + "Intrinsic does not support vectors", &FPI); + HasExceptionMD = true; + break; + } + case Intrinsic::experimental_constrained_fma: Assert((NumOperands == 5), "invalid arguments for constrained FP intrinsic", &FPI); @@ -4727,6 +4820,33 @@ void Verifier::visitConstrainedFPIntrinsic(ConstrainedFPIntrinsic &FPI) { HasRoundingMD = true; break; + case Intrinsic::experimental_constrained_fptosi: + case Intrinsic::experimental_constrained_fptoui: { + Assert((NumOperands == 2), + "invalid arguments for constrained FP intrinsic", &FPI); + HasExceptionMD = true; + + Value *Operand = FPI.getArgOperand(0); + uint64_t NumSrcElem = 0; + Assert(Operand->getType()->isFPOrFPVectorTy(), + "Intrinsic first argument must be floating point", &FPI); + if (auto *OperandT = dyn_cast(Operand->getType())) { + NumSrcElem = OperandT->getNumElements(); + } + + Operand = &FPI; + Assert((NumSrcElem > 0) == Operand->getType()->isVectorTy(), + "Intrinsic first argument and result disagree on vector use", &FPI); + Assert(Operand->getType()->isIntOrIntVectorTy(), + "Intrinsic result must be an integer", &FPI); + if (auto *OperandT = dyn_cast(Operand->getType())) { + Assert(NumSrcElem == OperandT->getNumElements(), + "Intrinsic first argument and result vector lengths must be equal", + &FPI); + } + } + break; + case Intrinsic::experimental_constrained_fptrunc: case Intrinsic::experimental_constrained_fpext: { if (FPI.getIntrinsicID() == Intrinsic::experimental_constrained_fptrunc) { @@ -4826,11 +4946,6 @@ void Verifier::visitDbgIntrinsic(StringRef Kind, DbgVariableIntrinsic &DII) { // This check is redundant with one in visitLocalVariable(). AssertDI(isType(Var->getRawType()), "invalid type ref", Var, Var->getRawType()); - if (auto *Type = dyn_cast_or_null(Var->getRawType())) - if (Type->isBlockByrefStruct()) - AssertDI(DII.getExpression() && DII.getExpression()->getNumElements(), - "BlockByRef variable without complex expression", Var, &DII); - verifyFnArgs(DII); } @@ -4935,6 +5050,16 @@ void Verifier::verifyFnArgs(const DbgVariableIntrinsic &I) { Prev, Var); } +void Verifier::verifyNotEntryValue(const DbgVariableIntrinsic &I) { + DIExpression *E = dyn_cast_or_null(I.getRawExpression()); + + // We don't know whether this intrinsic verified correctly. 
+ if (!E || !E->isValid()) + return; + + AssertDI(!E->isEntryValue(), "Entry values are only allowed in MIR", &I); +} + void Verifier::verifyCompileUnits() { // When more than one Module is imported into the same context, such as during // an LTO build before linking the modules, ODR type uniquing may cause types @@ -5021,7 +5146,7 @@ struct VerifierLegacyPass : public FunctionPass { } bool doInitialization(Module &M) override { - V = llvm::make_unique( + V = std::make_unique( &dbgs(), /*ShouldTreatBrokenDebugInfoAsError=*/false, M); return false; } diff --git a/lib/LTO/Caching.cpp b/lib/LTO/Caching.cpp index 000ab91dba7c..12dcd182de2d 100644 --- a/lib/LTO/Caching.cpp +++ b/lib/LTO/Caching.cpp @@ -142,8 +142,8 @@ Expected lto::localCache(StringRef CacheDirectoryPath, } // This CacheStream will move the temporary file into the cache when done. - return llvm::make_unique( - llvm::make_unique(Temp->FD, /* ShouldClose */ false), + return std::make_unique( + std::make_unique(Temp->FD, /* ShouldClose */ false), AddBuffer, std::move(*Temp), EntryPath.str(), Task); }; }; diff --git a/lib/LTO/LTO.cpp b/lib/LTO/LTO.cpp index 64506890956a..1e345e7dd89e 100644 --- a/lib/LTO/LTO.cpp +++ b/lib/LTO/LTO.cpp @@ -44,6 +44,7 @@ #include "llvm/Target/TargetOptions.h" #include "llvm/Transforms/IPO.h" #include "llvm/Transforms/IPO/PassManagerBuilder.h" +#include "llvm/Transforms/IPO/WholeProgramDevirt.h" #include "llvm/Transforms/Utils/FunctionImportUtils.h" #include "llvm/Transforms/Utils/SplitModule.h" @@ -383,7 +384,9 @@ static bool isWeakObjectWithRWAccess(GlobalValueSummary *GVS) { static void thinLTOInternalizeAndPromoteGUID( GlobalValueSummaryList &GVSummaryList, GlobalValue::GUID GUID, - function_ref isExported) { + function_ref isExported, + function_ref + isPrevailing) { for (auto &S : GVSummaryList) { if (isExported(S->modulePath(), GUID)) { if (GlobalValue::isLocalLinkage(S->linkage())) @@ -392,6 +395,8 @@ static void thinLTOInternalizeAndPromoteGUID( // Ignore local and appending linkage values since the linker // doesn't resolve them. !GlobalValue::isLocalLinkage(S->linkage()) && + (!GlobalValue::isInterposableLinkage(S->linkage()) || + isPrevailing(GUID, S.get())) && S->linkage() != GlobalValue::AppendingLinkage && // We can't internalize available_externally globals because this // can break function pointer equality. @@ -410,9 +415,12 @@ static void thinLTOInternalizeAndPromoteGUID( // as external and non-exported values as internal. void llvm::thinLTOInternalizeAndPromoteInIndex( ModuleSummaryIndex &Index, - function_ref isExported) { + function_ref isExported, + function_ref + isPrevailing) { for (auto &I : Index) - thinLTOInternalizeAndPromoteGUID(I.second.SummaryList, I.first, isExported); + thinLTOInternalizeAndPromoteGUID(I.second.SummaryList, I.first, isExported, + isPrevailing); } // Requires a destructor for std::vector. 
@@ -459,8 +467,8 @@ BitcodeModule &InputFile::getSingleBitcodeModule() { LTO::RegularLTOState::RegularLTOState(unsigned ParallelCodeGenParallelismLevel, Config &Conf) : ParallelCodeGenParallelismLevel(ParallelCodeGenParallelismLevel), - Ctx(Conf), CombinedModule(llvm::make_unique("ld-temp.o", Ctx)), - Mover(llvm::make_unique(*CombinedModule)) {} + Ctx(Conf), CombinedModule(std::make_unique("ld-temp.o", Ctx)), + Mover(std::make_unique(*CombinedModule)) {} LTO::ThinLTOState::ThinLTOState(ThinBackend Backend) : Backend(Backend), CombinedIndex(/*HaveGVs*/ false) { @@ -754,7 +762,8 @@ LTO::addRegularLTO(BitcodeModule BM, ArrayRef Syms, // For now they aren't reported correctly by ModuleSymbolTable. auto &CommonRes = RegularLTO.Commons[Sym.getIRName()]; CommonRes.Size = std::max(CommonRes.Size, Sym.getCommonSize()); - CommonRes.Align = std::max(CommonRes.Align, Sym.getCommonAlignment()); + CommonRes.Align = + std::max(CommonRes.Align, MaybeAlign(Sym.getCommonAlignment())); CommonRes.Prevailing |= Res.Prevailing; } @@ -899,8 +908,7 @@ Error LTO::run(AddStreamFn AddStream, NativeObjectCache Cache) { GlobalValue::dropLLVMManglingEscape(Res.second.IRName)); if (Res.second.VisibleOutsideSummary && Res.second.Prevailing) - GUIDPreservedSymbols.insert(GlobalValue::getGUID( - GlobalValue::dropLLVMManglingEscape(Res.second.IRName))); + GUIDPreservedSymbols.insert(GUID); GUIDPrevailingResolutions[GUID] = Res.second.Prevailing ? PrevailingType::Yes : PrevailingType::No; @@ -996,6 +1004,8 @@ Error LTO::runRegularLTO(AddStreamFn AddStream) { GV->setLinkage(GlobalValue::InternalLinkage); } + RegularLTO.CombinedModule->addModuleFlag(Module::Error, "LTOPostLink", 1); + if (Conf.PostInternalizeModuleHook && !Conf.PostInternalizeModuleHook(0, *RegularLTO.CombinedModule)) return Error::success(); @@ -1004,6 +1014,16 @@ Error LTO::runRegularLTO(AddStreamFn AddStream) { std::move(RegularLTO.CombinedModule), ThinLTO.CombinedIndex); } +static const char *libcallRoutineNames[] = { +#define HANDLE_LIBCALL(code, name) name, +#include "llvm/IR/RuntimeLibcalls.def" +#undef HANDLE_LIBCALL +}; + +ArrayRef LTO::getRuntimeLibcallSymbols() { + return makeArrayRef(libcallRoutineNames); +} + /// This class defines the interface to the ThinLTO backend. 
class lto::ThinBackendProc { protected: @@ -1141,7 +1161,7 @@ ThinBackend lto::createInProcessThinBackend(unsigned ParallelismLevel) { return [=](Config &Conf, ModuleSummaryIndex &CombinedIndex, const StringMap &ModuleToDefinedGVSummaries, AddStreamFn AddStream, NativeObjectCache Cache) { - return llvm::make_unique( + return std::make_unique( Conf, CombinedIndex, ParallelismLevel, ModuleToDefinedGVSummaries, AddStream, Cache); }; @@ -1204,7 +1224,7 @@ public: std::error_code EC; raw_fd_ostream OS(NewModulePath + ".thinlto.bc", EC, - sys::fs::OpenFlags::F_None); + sys::fs::OpenFlags::OF_None); if (EC) return errorCodeToError(EC); WriteIndexToFile(CombinedIndex, OS, &ModuleToSummariesForIndex); @@ -1231,7 +1251,7 @@ ThinBackend lto::createWriteIndexesThinBackend( return [=](Config &Conf, ModuleSummaryIndex &CombinedIndex, const StringMap &ModuleToDefinedGVSummaries, AddStreamFn AddStream, NativeObjectCache Cache) { - return llvm::make_unique( + return std::make_unique( Conf, CombinedIndex, ModuleToDefinedGVSummaries, OldPrefix, NewPrefix, ShouldEmitImportsFiles, LinkedObjectsFile, OnWrite); }; @@ -1274,6 +1294,15 @@ Error LTO::runThinLTO(AddStreamFn AddStream, NativeObjectCache Cache, if (DumpThinCGSCCs) ThinLTO.CombinedIndex.dumpSCCs(outs()); + std::set ExportedGUIDs; + + // Perform index-based WPD. This will return immediately if there are + // no index entries in the typeIdMetadata map (e.g. if we are instead + // performing IR-based WPD in hybrid regular/thin LTO mode). + std::map> LocalWPDTargetsMap; + runWholeProgramDevirtOnIndex(ThinLTO.CombinedIndex, ExportedGUIDs, + LocalWPDTargetsMap); + if (Conf.OptLevel > 0) ComputeCrossModuleImport(ThinLTO.CombinedIndex, ModuleToDefinedGVSummaries, ImportLists, ExportLists); @@ -1282,7 +1311,6 @@ Error LTO::runThinLTO(AddStreamFn AddStream, NativeObjectCache Cache, // at -O0 because summary-based DCE is implemented using internalization, and // we must apply DCE consistently with the full LTO module in order to avoid // undefined references during the final link. - std::set ExportedGUIDs; for (auto &Res : GlobalResolutions) { // If the symbol does not have external references or it is not prevailing, // then not need to mark it as exported from a ThinLTO partition. @@ -1308,12 +1336,19 @@ Error LTO::runThinLTO(AddStreamFn AddStream, NativeObjectCache Cache, ExportList->second.count(GUID)) || ExportedGUIDs.count(GUID); }; - thinLTOInternalizeAndPromoteInIndex(ThinLTO.CombinedIndex, isExported); + + // Update local devirtualized targets that were exported by cross-module + // importing or by other devirtualizations marked in the ExportedGUIDs set. 
+ updateIndexWPDForExports(ThinLTO.CombinedIndex, isExported, + LocalWPDTargetsMap); auto isPrevailing = [&](GlobalValue::GUID GUID, const GlobalValueSummary *S) { return ThinLTO.PrevailingModuleForGUID[GUID] == S->modulePath(); }; + thinLTOInternalizeAndPromoteInIndex(ThinLTO.CombinedIndex, isExported, + isPrevailing); + auto recordNewLinkage = [&](StringRef ModuleIdentifier, GlobalValue::GUID GUID, GlobalValue::LinkageTypes NewLinkage) { @@ -1368,7 +1403,7 @@ lto::setupStatsFile(StringRef StatsFilename) { llvm::EnableStatistics(false); std::error_code EC; auto StatsFile = - llvm::make_unique(StatsFilename, EC, sys::fs::F_None); + std::make_unique(StatsFilename, EC, sys::fs::OF_None); if (EC) return errorCodeToError(EC); diff --git a/lib/LTO/LTOBackend.cpp b/lib/LTO/LTOBackend.cpp index 7456e7175163..2761f8367b0d 100644 --- a/lib/LTO/LTOBackend.cpp +++ b/lib/LTO/LTOBackend.cpp @@ -28,6 +28,7 @@ #include "llvm/MC/SubtargetFeature.h" #include "llvm/Object/ModuleSymbolTable.h" #include "llvm/Passes/PassBuilder.h" +#include "llvm/Passes/StandardInstrumentations.h" #include "llvm/Support/Error.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/MemoryBuffer.h" @@ -57,8 +58,8 @@ Error Config::addSaveTemps(std::string OutputFileName, ShouldDiscardValueNames = false; std::error_code EC; - ResolutionFile = llvm::make_unique( - OutputFileName + "resolution.txt", EC, sys::fs::OpenFlags::F_Text); + ResolutionFile = std::make_unique( + OutputFileName + "resolution.txt", EC, sys::fs::OpenFlags::OF_Text); if (EC) return errorCodeToError(EC); @@ -83,7 +84,7 @@ Error Config::addSaveTemps(std::string OutputFileName, PathPrefix = M.getModuleIdentifier() + "."; std::string Path = PathPrefix + PathSuffix + ".bc"; std::error_code EC; - raw_fd_ostream OS(Path, EC, sys::fs::OpenFlags::F_None); + raw_fd_ostream OS(Path, EC, sys::fs::OpenFlags::OF_None); // Because -save-temps is a debugging feature, we report the error // directly and exit. if (EC) @@ -103,7 +104,7 @@ Error Config::addSaveTemps(std::string OutputFileName, CombinedIndexHook = [=](const ModuleSummaryIndex &Index) { std::string Path = OutputFileName + "index.bc"; std::error_code EC; - raw_fd_ostream OS(Path, EC, sys::fs::OpenFlags::F_None); + raw_fd_ostream OS(Path, EC, sys::fs::OpenFlags::OF_None); // Because -save-temps is a debugging feature, we report the error // directly and exit. if (EC) @@ -111,7 +112,7 @@ Error Config::addSaveTemps(std::string OutputFileName, WriteIndexToFile(Index, OS); Path = OutputFileName + "index.dot"; - raw_fd_ostream OSDot(Path, EC, sys::fs::OpenFlags::F_None); + raw_fd_ostream OSDot(Path, EC, sys::fs::OpenFlags::OF_None); if (EC) reportOpenError(Path, EC.message()); Index.exportToDot(OSDot); @@ -165,7 +166,10 @@ static void runNewPMPasses(Config &Conf, Module &Mod, TargetMachine *TM, PGOOptions::IRUse, PGOOptions::CSIRUse); } - PassBuilder PB(TM, PipelineTuningOptions(), PGOOpt); + PassInstrumentationCallbacks PIC; + StandardInstrumentations SI; + SI.registerCallbacks(PIC); + PassBuilder PB(TM, PipelineTuningOptions(),PGOOpt, &PIC); AAManager AA; // Parse a custom AA pipeline if asked to. 
@@ -329,7 +333,7 @@ void codegen(Config &Conf, TargetMachine *TM, AddStreamFn AddStream, if (!DwoFile.empty()) { std::error_code EC; - DwoOut = llvm::make_unique(DwoFile, EC, sys::fs::F_None); + DwoOut = std::make_unique(DwoFile, EC, sys::fs::OF_None); if (EC) report_fatal_error("Failed to open " + DwoFile + ": " + EC.message()); } diff --git a/lib/LTO/LTOCodeGenerator.cpp b/lib/LTO/LTOCodeGenerator.cpp index 6bb3bfaefc9c..882192892867 100644 --- a/lib/LTO/LTOCodeGenerator.cpp +++ b/lib/LTO/LTOCodeGenerator.cpp @@ -151,7 +151,7 @@ void LTOCodeGenerator::initializeLTOPasses() { void LTOCodeGenerator::setAsmUndefinedRefs(LTOModule *Mod) { const std::vector &undefs = Mod->getAsmUndefinedRefs(); for (int i = 0, e = undefs.size(); i != e; ++i) - AsmUndefinedRefs[undefs[i]] = 1; + AsmUndefinedRefs.insert(undefs[i]); } bool LTOCodeGenerator::addModule(LTOModule *Mod) { @@ -174,7 +174,7 @@ void LTOCodeGenerator::setModule(std::unique_ptr Mod) { AsmUndefinedRefs.clear(); MergedModule = Mod->takeModule(); - TheLinker = make_unique(*MergedModule); + TheLinker = std::make_unique(*MergedModule); setAsmUndefinedRefs(&*Mod); // We've just changed the input, so let's make sure we verify it. @@ -229,7 +229,7 @@ bool LTOCodeGenerator::writeMergedModules(StringRef Path) { // create output file std::error_code EC; - ToolOutputFile Out(Path, EC, sys::fs::F_None); + ToolOutputFile Out(Path, EC, sys::fs::OF_None); if (EC) { std::string ErrMsg = "could not open bitcode file for writing: "; ErrMsg += Path.str() + ": " + EC.message(); @@ -365,7 +365,8 @@ bool LTOCodeGenerator::determineTarget() { MCpu = "core2"; else if (Triple.getArch() == llvm::Triple::x86) MCpu = "yonah"; - else if (Triple.getArch() == llvm::Triple::aarch64) + else if (Triple.getArch() == llvm::Triple::aarch64 || + Triple.getArch() == llvm::Triple::aarch64_32) MCpu = "cyclone"; } @@ -462,6 +463,8 @@ void LTOCodeGenerator::applyScopeRestrictions() { internalizeModule(*MergedModule, mustPreserveGV); + MergedModule->addModuleFlag(Module::Error, "LTOPostLink", 1); + ScopeRestrictionsDone = true; } @@ -690,7 +693,7 @@ LTOCodeGenerator::setDiagnosticHandler(lto_diagnostic_handler_t DiagHandler, return Context.setDiagnosticHandler(nullptr); // Register the LTOCodeGenerator stub in the LLVMContext to forward the // diagnostic to the external DiagHandler. 
- Context.setDiagnosticHandler(llvm::make_unique(this), + Context.setDiagnosticHandler(std::make_unique(this), true); } diff --git a/lib/LTO/LTOModule.cpp b/lib/LTO/LTOModule.cpp index 7ffe7bf84ba8..587b332e7064 100644 --- a/lib/LTO/LTOModule.cpp +++ b/lib/LTO/LTOModule.cpp @@ -220,7 +220,8 @@ LTOModule::makeLTOModule(MemoryBufferRef Buffer, const TargetOptions &options, CPU = "core2"; else if (Triple.getArch() == llvm::Triple::x86) CPU = "yonah"; - else if (Triple.getArch() == llvm::Triple::aarch64) + else if (Triple.getArch() == llvm::Triple::aarch64 || + Triple.getArch() == llvm::Triple::aarch64_32) CPU = "cyclone"; } diff --git a/lib/LTO/SummaryBasedOptimizations.cpp b/lib/LTO/SummaryBasedOptimizations.cpp index e919fd530fb0..6db495de003b 100644 --- a/lib/LTO/SummaryBasedOptimizations.cpp +++ b/lib/LTO/SummaryBasedOptimizations.cpp @@ -18,7 +18,7 @@ using namespace llvm; -cl::opt ThinLTOSynthesizeEntryCounts( +static cl::opt ThinLTOSynthesizeEntryCounts( "thinlto-synthesize-entry-counts", cl::init(false), cl::Hidden, cl::desc("Synthesize entry counts based on the summary")); diff --git a/lib/LTO/ThinLTOCodeGenerator.cpp b/lib/LTO/ThinLTOCodeGenerator.cpp index 1c52218836ca..d151de17896f 100644 --- a/lib/LTO/ThinLTOCodeGenerator.cpp +++ b/lib/LTO/ThinLTOCodeGenerator.cpp @@ -39,6 +39,7 @@ #include "llvm/Support/CachePruning.h" #include "llvm/Support/Debug.h" #include "llvm/Support/Error.h" +#include "llvm/Support/FileUtilities.h" #include "llvm/Support/Path.h" #include "llvm/Support/SHA1.h" #include "llvm/Support/SmallVectorMemoryBuffer.h" @@ -52,6 +53,7 @@ #include "llvm/Transforms/IPO/FunctionImport.h" #include "llvm/Transforms/IPO/Internalize.h" #include "llvm/Transforms/IPO/PassManagerBuilder.h" +#include "llvm/Transforms/IPO/WholeProgramDevirt.h" #include "llvm/Transforms/ObjCARC.h" #include "llvm/Transforms/Utils/FunctionImportUtils.h" @@ -89,7 +91,7 @@ static void saveTempBitcode(const Module &TheModule, StringRef TempDir, // User asked to save temps, let dump the bitcode file after import. std::string SaveTempPath = (TempDir + llvm::Twine(count) + Suffix).str(); std::error_code EC; - raw_fd_ostream OS(SaveTempPath, EC, sys::fs::F_None); + raw_fd_ostream OS(SaveTempPath, EC, sys::fs::OF_None); if (EC) report_fatal_error(Twine("Failed to open ") + SaveTempPath + " to save optimized bitcode\n"); @@ -224,7 +226,8 @@ crossImportIntoModule(Module &TheModule, const ModuleSummaryIndex &Index, } static void optimizeModule(Module &TheModule, TargetMachine &TM, - unsigned OptLevel, bool Freestanding) { + unsigned OptLevel, bool Freestanding, + ModuleSummaryIndex *Index) { // Populate the PassManager PassManagerBuilder PMB; PMB.LibraryInfo = new TargetLibraryInfoImpl(TM.getTargetTriple()); @@ -238,6 +241,7 @@ static void optimizeModule(Module &TheModule, TargetMachine &TM, // Already did this in verifyLoadedModule(). PMB.VerifyInput = false; PMB.VerifyOutput = false; + PMB.ImportSummary = Index; legacy::PassManager PM; @@ -295,7 +299,7 @@ std::unique_ptr codegenModule(Module &TheModule, // Run codegen now. resulting binary is in OutputBuffer. PM.run(TheModule); } - return make_unique(std::move(OutputBuffer)); + return std::make_unique(std::move(OutputBuffer)); } /// Manage caching for a single Module. 
@@ -368,23 +372,26 @@ public: // Write to a temporary to avoid race condition SmallString<128> TempFilename; SmallString<128> CachePath(EntryPath); - int TempFD; llvm::sys::path::remove_filename(CachePath); sys::path::append(TempFilename, CachePath, "Thin-%%%%%%.tmp.o"); - std::error_code EC = - sys::fs::createUniqueFile(TempFilename, TempFD, TempFilename); - if (EC) { - errs() << "Error: " << EC.message() << "\n"; - report_fatal_error("ThinLTO: Can't get a temporary file"); - } - { - raw_fd_ostream OS(TempFD, /* ShouldClose */ true); - OS << OutputBuffer.getBuffer(); + + if (auto Err = handleErrors( + llvm::writeFileAtomically(TempFilename, EntryPath, + OutputBuffer.getBuffer()), + [](const llvm::AtomicFileWriteError &E) { + std::string ErrorMsgBuffer; + llvm::raw_string_ostream S(ErrorMsgBuffer); + E.log(S); + + if (E.Error == + llvm::atomic_write_error::failed_to_create_uniq_file) { + errs() << "Error: " << ErrorMsgBuffer << "\n"; + report_fatal_error("ThinLTO: Can't get a temporary file"); + } + })) { + // FIXME + consumeError(std::move(Err)); } - // Rename temp file to final destination; rename is atomic - EC = sys::fs::rename(TempFilename, EntryPath); - if (EC) - sys::fs::remove(TempFilename); } }; @@ -429,7 +436,7 @@ ProcessThinLTOModule(Module &TheModule, ModuleSummaryIndex &Index, saveTempBitcode(TheModule, SaveTempsDir, count, ".3.imported.bc"); } - optimizeModule(TheModule, TM, OptLevel, Freestanding); + optimizeModule(TheModule, TM, OptLevel, Freestanding, &Index); saveTempBitcode(TheModule, SaveTempsDir, count, ".4.opt.bc"); @@ -442,7 +449,7 @@ ProcessThinLTOModule(Module &TheModule, ModuleSummaryIndex &Index, auto Index = buildModuleSummaryIndex(TheModule, nullptr, &PSI); WriteBitcodeToFile(TheModule, OS, true, &Index); } - return make_unique(std::move(OutputBuffer)); + return std::make_unique(std::move(OutputBuffer)); } return codegenModule(TheModule, TM); @@ -457,10 +464,9 @@ static void resolvePrevailingInIndex( ModuleSummaryIndex &Index, StringMap> &ResolvedODR, - const DenseSet &GUIDPreservedSymbols) { - - DenseMap PrevailingCopy; - computePrevailingCopies(Index, PrevailingCopy); + const DenseSet &GUIDPreservedSymbols, + const DenseMap + &PrevailingCopy) { auto isPrevailing = [&](GlobalValue::GUID GUID, const GlobalValueSummary *S) { const auto &Prevailing = PrevailingCopy.find(GUID); @@ -490,7 +496,8 @@ static void initTMBuilder(TargetMachineBuilder &TMBuilder, TMBuilder.MCpu = "core2"; else if (TheTriple.getArch() == llvm::Triple::x86) TMBuilder.MCpu = "yonah"; - else if (TheTriple.getArch() == llvm::Triple::aarch64) + else if (TheTriple.getArch() == llvm::Triple::aarch64 || + TheTriple.getArch() == llvm::Triple::aarch64_32) TMBuilder.MCpu = "cyclone"; } TMBuilder.TheTriple = std::move(TheTriple); @@ -557,7 +564,7 @@ std::unique_ptr TargetMachineBuilder::create() const { */ std::unique_ptr ThinLTOCodeGenerator::linkCombinedIndex() { std::unique_ptr CombinedIndex = - llvm::make_unique(/*HaveGVs=*/false); + std::make_unique(/*HaveGVs=*/false); uint64_t NextModuleId = 0; for (auto &Mod : Modules) { auto &M = Mod->getSingleBitcodeModule(); @@ -573,19 +580,36 @@ std::unique_ptr ThinLTOCodeGenerator::linkCombinedIndex() { return CombinedIndex; } -static void internalizeAndPromoteInIndex( - const StringMap &ExportLists, - const DenseSet &GUIDPreservedSymbols, - ModuleSummaryIndex &Index) { - auto isExported = [&](StringRef ModuleIdentifier, GlobalValue::GUID GUID) { +struct IsExported { + const StringMap &ExportLists; + const DenseSet &GUIDPreservedSymbols; + + 
IsExported(const StringMap &ExportLists, + const DenseSet &GUIDPreservedSymbols) + : ExportLists(ExportLists), GUIDPreservedSymbols(GUIDPreservedSymbols) {} + + bool operator()(StringRef ModuleIdentifier, GlobalValue::GUID GUID) const { const auto &ExportList = ExportLists.find(ModuleIdentifier); return (ExportList != ExportLists.end() && ExportList->second.count(GUID)) || GUIDPreservedSymbols.count(GUID); - }; + } +}; - thinLTOInternalizeAndPromoteInIndex(Index, isExported); -} +struct IsPrevailing { + const DenseMap &PrevailingCopy; + IsPrevailing(const DenseMap + &PrevailingCopy) + : PrevailingCopy(PrevailingCopy) {} + + bool operator()(GlobalValue::GUID GUID, const GlobalValueSummary *S) const { + const auto &Prevailing = PrevailingCopy.find(GUID); + // Not in map means that there was only one copy, which must be prevailing. + if (Prevailing == PrevailingCopy.end()) + return true; + return Prevailing->second == S; + }; +}; static void computeDeadSymbolsInIndex( ModuleSummaryIndex &Index, @@ -629,16 +653,22 @@ void ThinLTOCodeGenerator::promote(Module &TheModule, ModuleSummaryIndex &Index, ComputeCrossModuleImport(Index, ModuleToDefinedGVSummaries, ImportLists, ExportLists); + DenseMap PrevailingCopy; + computePrevailingCopies(Index, PrevailingCopy); + // Resolve prevailing symbols StringMap> ResolvedODR; - resolvePrevailingInIndex(Index, ResolvedODR, GUIDPreservedSymbols); + resolvePrevailingInIndex(Index, ResolvedODR, GUIDPreservedSymbols, + PrevailingCopy); thinLTOResolvePrevailingInModule( TheModule, ModuleToDefinedGVSummaries[ModuleIdentifier]); // Promote the exported values in the index, so that they are promoted // in the module. - internalizeAndPromoteInIndex(ExportLists, GUIDPreservedSymbols, Index); + thinLTOInternalizeAndPromoteInIndex( + Index, IsExported(ExportLists, GUIDPreservedSymbols), + IsPrevailing(PrevailingCopy)); promoteModule(TheModule, Index); } @@ -785,13 +815,19 @@ void ThinLTOCodeGenerator::internalize(Module &TheModule, if (ExportList.empty() && GUIDPreservedSymbols.empty()) return; + DenseMap PrevailingCopy; + computePrevailingCopies(Index, PrevailingCopy); + // Resolve prevailing symbols StringMap> ResolvedODR; - resolvePrevailingInIndex(Index, ResolvedODR, GUIDPreservedSymbols); + resolvePrevailingInIndex(Index, ResolvedODR, GUIDPreservedSymbols, + PrevailingCopy); // Promote the exported values in the index, so that they are promoted // in the module. - internalizeAndPromoteInIndex(ExportLists, GUIDPreservedSymbols, Index); + thinLTOInternalizeAndPromoteInIndex( + Index, IsExported(ExportLists, GUIDPreservedSymbols), + IsPrevailing(PrevailingCopy)); promoteModule(TheModule, Index); @@ -810,7 +846,8 @@ void ThinLTOCodeGenerator::optimize(Module &TheModule) { initTMBuilder(TMBuilder, Triple(TheModule.getTargetTriple())); // Optimize now - optimizeModule(TheModule, *TMBuilder.create(), OptLevel, Freestanding); + optimizeModule(TheModule, *TMBuilder.create(), OptLevel, Freestanding, + nullptr); } /// Write out the generated object file, either from CacheEntryPath or from @@ -845,7 +882,7 @@ ThinLTOCodeGenerator::writeGeneratedObject(int count, StringRef CacheEntryPath, } // No cache entry, just write out the buffer. 
std::error_code Err; - raw_fd_ostream OS(OutputPath, Err, sys::fs::F_None); + raw_fd_ostream OS(OutputPath, Err, sys::fs::OF_None); if (Err) report_fatal_error("Can't open output '" + OutputPath + "'\n"); OS << OutputBuffer.getBuffer(); @@ -900,7 +937,7 @@ void ThinLTOCodeGenerator::run() { if (!SaveTempsDir.empty()) { auto SaveTempPath = SaveTempsDir + "index.bc"; std::error_code EC; - raw_fd_ostream OS(SaveTempPath, EC, sys::fs::F_None); + raw_fd_ostream OS(SaveTempPath, EC, sys::fs::OF_None); if (EC) report_fatal_error(Twine("Failed to open ") + SaveTempPath + " to save optimized bitcode\n"); @@ -931,6 +968,15 @@ void ThinLTOCodeGenerator::run() { // Synthesize entry counts for functions in the combined index. computeSyntheticCounts(*Index); + // Perform index-based WPD. This will return immediately if there are + // no index entries in the typeIdMetadata map (e.g. if we are instead + // performing IR-based WPD in hybrid regular/thin LTO mode). + std::map> LocalWPDTargetsMap; + std::set ExportedGUIDs; + runWholeProgramDevirtOnIndex(*Index, ExportedGUIDs, LocalWPDTargetsMap); + for (auto GUID : ExportedGUIDs) + GUIDPreservedSymbols.insert(GUID); + // Collect the import/export lists for all modules from the call-graph in the // combined index. StringMap ImportLists(ModuleCount); @@ -944,14 +990,23 @@ void ThinLTOCodeGenerator::run() { // on the index, and nuke this map. StringMap> ResolvedODR; + DenseMap PrevailingCopy; + computePrevailingCopies(*Index, PrevailingCopy); + // Resolve prevailing symbols, this has to be computed early because it // impacts the caching. - resolvePrevailingInIndex(*Index, ResolvedODR, GUIDPreservedSymbols); + resolvePrevailingInIndex(*Index, ResolvedODR, GUIDPreservedSymbols, + PrevailingCopy); // Use global summary-based analysis to identify symbols that can be // internalized (because they aren't exported or preserved as per callback). // Changes are made in the index, consumed in the ThinLTO backends. - internalizeAndPromoteInIndex(ExportLists, GUIDPreservedSymbols, *Index); + updateIndexWPDForExports(*Index, + IsExported(ExportLists, GUIDPreservedSymbols), + LocalWPDTargetsMap); + thinLTOInternalizeAndPromoteInIndex( + *Index, IsExported(ExportLists, GUIDPreservedSymbols), + IsPrevailing(PrevailingCopy)); // Make sure that every module has an entry in the ExportLists, ImportList, // GVSummary and ResolvedODR maps to enable threaded access to these maps diff --git a/lib/Linker/IRMover.cpp b/lib/Linker/IRMover.cpp index 37515d93ed50..6784d81595e5 100644 --- a/lib/Linker/IRMover.cpp +++ b/lib/Linker/IRMover.cpp @@ -398,7 +398,7 @@ class IRLinker { /// due to the use of Value handles which the Linker doesn't actually need, /// but this allows us to reuse the ValueMapper code. ValueToValueMapTy ValueMap; - ValueToValueMapTy AliasValueMap; + ValueToValueMapTy IndirectSymbolValueMap; DenseSet ValuesToLink; std::vector Worklist; @@ -437,7 +437,7 @@ class IRLinker { /// Entry point for mapping values and alternate context for mapping aliases. ValueMapper Mapper; - unsigned AliasMCID; + unsigned IndirectSymbolMCID; /// Handles cloning of a global values from the source module into /// the destination module, including setting the attributes and visibility. @@ -480,13 +480,15 @@ class IRLinker { /// /// Note this code may call the client-provided \p AddLazyFor. 
bool shouldLink(GlobalValue *DGV, GlobalValue &SGV); - Expected linkGlobalValueProto(GlobalValue *GV, bool ForAlias); + Expected linkGlobalValueProto(GlobalValue *GV, + bool ForIndirectSymbol); Error linkModuleFlagsMetadata(); void linkGlobalVariable(GlobalVariable &Dst, GlobalVariable &Src); Error linkFunctionBody(Function &Dst, Function &Src); - void linkAliasBody(GlobalAlias &Dst, GlobalAlias &Src); + void linkIndirectSymbolBody(GlobalIndirectSymbol &Dst, + GlobalIndirectSymbol &Src); Error linkGlobalValueBody(GlobalValue &Dst, GlobalValue &Src); /// Replace all types in the source AttributeList with the @@ -497,7 +499,7 @@ class IRLinker { /// into the destination module. GlobalVariable *copyGlobalVariableProto(const GlobalVariable *SGVar); Function *copyFunctionProto(const Function *SF); - GlobalValue *copyGlobalAliasProto(const GlobalAlias *SGA); + GlobalValue *copyGlobalIndirectSymbolProto(const GlobalIndirectSymbol *SGIS); /// Perform "replace all uses with" operations. These work items need to be /// performed as part of materialization, but we postpone them to happen after @@ -524,8 +526,8 @@ public: SharedMDs(SharedMDs), IsPerformingImport(IsPerformingImport), Mapper(ValueMap, RF_MoveDistinctMDs | RF_IgnoreMissingLocals, &TypeMap, &GValMaterializer), - AliasMCID(Mapper.registerAlternateMappingContext(AliasValueMap, - &LValMaterializer)) { + IndirectSymbolMCID(Mapper.registerAlternateMappingContext( + IndirectSymbolValueMap, &LValMaterializer)) { ValueMap.getMDMap() = std::move(SharedMDs); for (GlobalValue *GV : ValuesToLink) maybeAdd(GV); @@ -535,7 +537,7 @@ public: ~IRLinker() { SharedMDs = std::move(*ValueMap.getMDMap()); } Error run(); - Value *materialize(Value *V, bool ForAlias); + Value *materialize(Value *V, bool ForIndirectSymbol); }; } @@ -568,12 +570,12 @@ Value *LocalValueMaterializer::materialize(Value *SGV) { return TheIRLinker.materialize(SGV, true); } -Value *IRLinker::materialize(Value *V, bool ForAlias) { +Value *IRLinker::materialize(Value *V, bool ForIndirectSymbol) { auto *SGV = dyn_cast(V); if (!SGV) return nullptr; - Expected NewProto = linkGlobalValueProto(SGV, ForAlias); + Expected NewProto = linkGlobalValueProto(SGV, ForIndirectSymbol); if (!NewProto) { setError(NewProto.takeError()); return nullptr; @@ -593,23 +595,23 @@ Value *IRLinker::materialize(Value *V, bool ForAlias) { if (V->hasInitializer() || V->hasAppendingLinkage()) return New; } else { - auto *A = cast(New); - if (A->getAliasee()) + auto *IS = cast(New); + if (IS->getIndirectSymbol()) return New; } - // When linking a global for an alias, it will always be linked. However we - // need to check if it was not already scheduled to satisfy a reference from a - // regular global value initializer. We know if it has been schedule if the - // "New" GlobalValue that is mapped here for the alias is the same as the one - // already mapped. If there is an entry in the ValueMap but the value is - // different, it means that the value already had a definition in the - // destination module (linkonce for instance), but we need a new definition - // for the alias ("New" will be different. - if (ForAlias && ValueMap.lookup(SGV) == New) + // When linking a global for an indirect symbol, it will always be linked. + // However we need to check if it was not already scheduled to satisfy a + // reference from a regular global value initializer. We know if it has been + // schedule if the "New" GlobalValue that is mapped here for the indirect + // symbol is the same as the one already mapped. 
If there is an entry in the + // ValueMap but the value is different, it means that the value already had a + // definition in the destination module (linkonce for instance), but we need a + // new definition for the indirect symbol ("New" will be different. + if (ForIndirectSymbol && ValueMap.lookup(SGV) == New) return New; - if (ForAlias || shouldLink(New, *SGV)) + if (ForIndirectSymbol || shouldLink(New, *SGV)) setError(linkGlobalValueBody(*New, *SGV)); return New; @@ -627,7 +629,7 @@ GlobalVariable *IRLinker::copyGlobalVariableProto(const GlobalVariable *SGVar) { /*init*/ nullptr, SGVar->getName(), /*insertbefore*/ nullptr, SGVar->getThreadLocalMode(), SGVar->getType()->getAddressSpace()); - NewDGV->setAlignment(SGVar->getAlignment()); + NewDGV->setAlignment(MaybeAlign(SGVar->getAlignment())); NewDGV->copyAttributesFrom(SGVar); return NewDGV; } @@ -660,16 +662,24 @@ Function *IRLinker::copyFunctionProto(const Function *SF) { return F; } -/// Set up prototypes for any aliases that come over from the source module. -GlobalValue *IRLinker::copyGlobalAliasProto(const GlobalAlias *SGA) { +/// Set up prototypes for any indirect symbols that come over from the source +/// module. +GlobalValue * +IRLinker::copyGlobalIndirectSymbolProto(const GlobalIndirectSymbol *SGIS) { // If there is no linkage to be performed or we're linking from the source, // bring over SGA. - auto *Ty = TypeMap.get(SGA->getValueType()); - auto *GA = - GlobalAlias::create(Ty, SGA->getType()->getPointerAddressSpace(), - GlobalValue::ExternalLinkage, SGA->getName(), &DstM); - GA->copyAttributesFrom(SGA); - return GA; + auto *Ty = TypeMap.get(SGIS->getValueType()); + GlobalIndirectSymbol *GIS; + if (isa(SGIS)) + GIS = GlobalAlias::create(Ty, SGIS->getType()->getPointerAddressSpace(), + GlobalValue::ExternalLinkage, SGIS->getName(), + &DstM); + else + GIS = GlobalIFunc::create(Ty, SGIS->getType()->getPointerAddressSpace(), + GlobalValue::ExternalLinkage, SGIS->getName(), + nullptr, &DstM); + GIS->copyAttributesFrom(SGIS); + return GIS; } GlobalValue *IRLinker::copyGlobalValueProto(const GlobalValue *SGV, @@ -681,7 +691,7 @@ GlobalValue *IRLinker::copyGlobalValueProto(const GlobalValue *SGV, NewGV = copyFunctionProto(SF); } else { if (ForDefinition) - NewGV = copyGlobalAliasProto(cast(SGV)); + NewGV = copyGlobalIndirectSymbolProto(cast(SGV)); else if (SGV->getValueType()->isFunctionTy()) NewGV = Function::Create(cast(TypeMap.get(SGV->getValueType())), @@ -748,8 +758,18 @@ void IRLinker::computeTypeMapping() { } for (GlobalValue &SGV : *SrcM) - if (GlobalValue *DGV = getLinkedToGlobal(&SGV)) + if (GlobalValue *DGV = getLinkedToGlobal(&SGV)) { + if (DGV->getType() == SGV.getType()) { + // If the types of DGV and SGV are the same, it means that DGV is from + // the source module and got added to DstM from a shared metadata. We + // shouldn't map this type to itself in case the type's components get + // remapped to a new type from DstM (for instance, during the loop over + // SrcM->getIdentifiedStructTypes() below). 
+ continue; + } + TypeMap.addTypeMapping(DGV->getType(), SGV.getType()); + } for (GlobalValue &SGV : SrcM->aliases()) if (GlobalValue *DGV = getLinkedToGlobal(&SGV)) @@ -940,7 +960,7 @@ bool IRLinker::shouldLink(GlobalValue *DGV, GlobalValue &SGV) { } Expected IRLinker::linkGlobalValueProto(GlobalValue *SGV, - bool ForAlias) { + bool ForIndirectSymbol) { GlobalValue *DGV = getLinkedToGlobal(SGV); bool ShouldLink = shouldLink(DGV, *SGV); @@ -951,12 +971,12 @@ Expected IRLinker::linkGlobalValueProto(GlobalValue *SGV, if (I != ValueMap.end()) return cast(I->second); - I = AliasValueMap.find(SGV); - if (I != AliasValueMap.end()) + I = IndirectSymbolValueMap.find(SGV); + if (I != IndirectSymbolValueMap.end()) return cast(I->second); } - if (!ShouldLink && ForAlias) + if (!ShouldLink && ForIndirectSymbol) DGV = nullptr; // Handle the ultra special appending linkage case first. @@ -975,8 +995,8 @@ Expected IRLinker::linkGlobalValueProto(GlobalValue *SGV, if (DoneLinkingBodies) return nullptr; - NewGV = copyGlobalValueProto(SGV, ShouldLink || ForAlias); - if (ShouldLink || !ForAlias) + NewGV = copyGlobalValueProto(SGV, ShouldLink || ForIndirectSymbol); + if (ShouldLink || !ForIndirectSymbol) forceRenaming(NewGV, SGV->getName()); } @@ -987,7 +1007,7 @@ Expected IRLinker::linkGlobalValueProto(GlobalValue *SGV, if (auto Remangled = Intrinsic::remangleIntrinsicFunction(F)) NewGV = Remangled.getValue(); - if (ShouldLink || ForAlias) { + if (ShouldLink || ForIndirectSymbol) { if (const Comdat *SC = SGV->getComdat()) { if (auto *GO = dyn_cast(NewGV)) { Comdat *DC = DstM.getOrInsertComdat(SC->getName()); @@ -997,7 +1017,7 @@ Expected IRLinker::linkGlobalValueProto(GlobalValue *SGV, } } - if (!ShouldLink && ForAlias) + if (!ShouldLink && ForIndirectSymbol) NewGV->setLinkage(GlobalValue::InternalLinkage); Constant *C = NewGV; @@ -1060,8 +1080,10 @@ Error IRLinker::linkFunctionBody(Function &Dst, Function &Src) { return Error::success(); } -void IRLinker::linkAliasBody(GlobalAlias &Dst, GlobalAlias &Src) { - Mapper.scheduleMapGlobalAliasee(Dst, *Src.getAliasee(), AliasMCID); +void IRLinker::linkIndirectSymbolBody(GlobalIndirectSymbol &Dst, + GlobalIndirectSymbol &Src) { + Mapper.scheduleMapGlobalIndirectSymbol(Dst, *Src.getIndirectSymbol(), + IndirectSymbolMCID); } Error IRLinker::linkGlobalValueBody(GlobalValue &Dst, GlobalValue &Src) { @@ -1071,7 +1093,7 @@ Error IRLinker::linkGlobalValueBody(GlobalValue &Dst, GlobalValue &Src) { linkGlobalVariable(cast(Dst), *GVar); return Error::success(); } - linkAliasBody(cast(Dst), cast(Src)); + linkIndirectSymbolBody(cast(Dst), cast(Src)); return Error::success(); } @@ -1411,7 +1433,7 @@ Error IRLinker::run() { // Already mapped. 
if (ValueMap.find(GV) != ValueMap.end() || - AliasValueMap.find(GV) != AliasValueMap.end()) + IndirectSymbolValueMap.find(GV) != IndirectSymbolValueMap.end()) continue; assert(!GV->isDeclaration()); diff --git a/lib/Linker/LinkModules.cpp b/lib/Linker/LinkModules.cpp index a18f4cc25bcc..35d6290e901b 100644 --- a/lib/Linker/LinkModules.cpp +++ b/lib/Linker/LinkModules.cpp @@ -351,7 +351,8 @@ bool ModuleLinker::linkIfNeeded(GlobalValue &GV) { SGVar->setConstant(false); } if (DGVar->hasCommonLinkage() && SGVar->hasCommonLinkage()) { - unsigned Align = std::max(DGVar->getAlignment(), SGVar->getAlignment()); + MaybeAlign Align( + std::max(DGVar->getAlignment(), SGVar->getAlignment())); SGVar->setAlignment(Align); DGVar->setAlignment(Align); } diff --git a/lib/MC/ELFObjectWriter.cpp b/lib/MC/ELFObjectWriter.cpp index 2c68723a12f8..6f160e491cea 100644 --- a/lib/MC/ELFObjectWriter.cpp +++ b/lib/MC/ELFObjectWriter.cpp @@ -36,6 +36,7 @@ #include "llvm/MC/MCSymbolELF.h" #include "llvm/MC/MCValue.h" #include "llvm/MC/StringTableBuilder.h" +#include "llvm/Support/Alignment.h" #include "llvm/Support/Allocator.h" #include "llvm/Support/Casting.h" #include "llvm/Support/Compression.h" @@ -336,7 +337,7 @@ public: } // end anonymous namespace void ELFWriter::align(unsigned Alignment) { - uint64_t Padding = OffsetToAlignment(W.OS.tell(), Alignment); + uint64_t Padding = offsetToAlignment(W.OS.tell(), Align(Alignment)); W.OS.write_zeros(Padding); } @@ -511,6 +512,19 @@ static uint8_t mergeTypeForSet(uint8_t origType, uint8_t newType) { return Type; } +static bool isIFunc(const MCSymbolELF *Symbol) { + while (Symbol->getType() != ELF::STT_GNU_IFUNC) { + const MCSymbolRefExpr *Value; + if (!Symbol->isVariable() || + !(Value = dyn_cast(Symbol->getVariableValue())) || + Value->getKind() != MCSymbolRefExpr::VK_None || + mergeTypeForSet(Symbol->getType(), ELF::STT_GNU_IFUNC) != ELF::STT_GNU_IFUNC) + return false; + Symbol = &cast(Value->getSymbol()); + } + return true; +} + void ELFWriter::writeSymbol(SymbolTableWriter &Writer, uint32_t StringIndex, ELFSymbolData &MSD, const MCAsmLayout &Layout) { const auto &Symbol = cast(*MSD.Symbol); @@ -524,6 +538,8 @@ void ELFWriter::writeSymbol(SymbolTableWriter &Writer, uint32_t StringIndex, // Binding and Type share the same byte as upper and lower nibbles uint8_t Binding = Symbol.getBinding(); uint8_t Type = Symbol.getType(); + if (isIFunc(&Symbol)) + Type = ELF::STT_GNU_IFUNC; if (Base) { Type = mergeTypeForSet(Type, Base->getType()); } @@ -622,7 +638,7 @@ void ELFWriter::computeSymbolTable( unsigned EntrySize = is64Bit() ? ELF::SYMENTRY_SIZE64 : ELF::SYMENTRY_SIZE32; MCSectionELF *SymtabSection = Ctx.getELFSection(".symtab", ELF::SHT_SYMTAB, 0, EntrySize, ""); - SymtabSection->setAlignment(is64Bit() ? 8 : 4); + SymtabSection->setAlignment(is64Bit() ? Align(8) : Align(4)); SymbolTableIndex = addToSectionTable(SymtabSection); align(SymtabSection->getAlignment()); @@ -720,7 +736,7 @@ void ELFWriter::computeSymbolTable( MCSectionELF *SymtabShndxSection = Ctx.getELFSection(".symtab_shndx", ELF::SHT_SYMTAB_SHNDX, 0, 4, ""); SymtabShndxSectionIndex = addToSectionTable(SymtabShndxSection); - SymtabShndxSection->setAlignment(4); + SymtabShndxSection->setAlignment(Align(4)); } ArrayRef FileNames = Asm.getFileNames(); @@ -808,7 +824,7 @@ MCSectionELF *ELFWriter::createRelocationSection(MCContext &Ctx, MCSectionELF *RelaSection = Ctx.createELFRelSection( RelaSectionName, hasRelocationAddend() ? 
ELF::SHT_RELA : ELF::SHT_REL, Flags, EntrySize, Sec.getGroup(), &Sec); - RelaSection->setAlignment(is64Bit() ? 8 : 4); + RelaSection->setAlignment(is64Bit() ? Align(8) : Align(4)); return RelaSection; } @@ -895,7 +911,7 @@ void ELFWriter::writeSectionData(const MCAssembler &Asm, MCSection &Sec, Section.setFlags(Section.getFlags() | ELF::SHF_COMPRESSED); // Alignment field should reflect the requirements of // the compressed section header. - Section.setAlignment(is64Bit() ? 8 : 4); + Section.setAlignment(is64Bit() ? Align(8) : Align(4)); } else { // Add "z" prefix to section name. This is zlib-gnu style. MC.renameELFSection(&Section, (".z" + SectionName.drop_front(1)).str()); @@ -1119,7 +1135,7 @@ uint64_t ELFWriter::writeObject(MCAssembler &Asm, const MCAsmLayout &Layout) { if (!GroupIdx) { MCSectionELF *Group = Ctx.createELFGroupSection(SignatureSymbol); GroupIdx = addToSectionTable(Group); - Group->setAlignment(4); + Group->setAlignment(Align(4)); Groups.push_back(Group); } std::vector &Members = @@ -1437,22 +1453,7 @@ void ELFObjectWriter::recordRelocation(MCAssembler &Asm, MCContext &Ctx = Asm.getContext(); if (const MCSymbolRefExpr *RefB = Target.getSymB()) { - // Let A, B and C being the components of Target and R be the location of - // the fixup. If the fixup is not pcrel, we want to compute (A - B + C). - // If it is pcrel, we want to compute (A - B + C - R). - - // In general, ELF has no relocations for -B. It can only represent (A + C) - // or (A + C - R). If B = R + K and the relocation is not pcrel, we can - // replace B to implement it: (A - R - K + C) - if (IsPCRel) { - Ctx.reportError( - Fixup.getLoc(), - "No relocation available to represent this relative expression"); - return; - } - const auto &SymB = cast(RefB->getSymbol()); - if (SymB.isUndefined()) { Ctx.reportError(Fixup.getLoc(), Twine("symbol '") + SymB.getName() + @@ -1468,10 +1469,9 @@ void ELFObjectWriter::recordRelocation(MCAssembler &Asm, return; } - uint64_t SymBOffset = Layout.getSymbolOffset(SymB); - uint64_t K = SymBOffset - FixupOffset; + assert(!IsPCRel && "should have been folded"); IsPCRel = true; - C -= K; + C += FixupOffset - Layout.getSymbolOffset(SymB); } // We either rejected the fixup or folded B into C at this point. @@ -1489,38 +1489,35 @@ void ELFObjectWriter::recordRelocation(MCAssembler &Asm, } } - unsigned Type = TargetObjectWriter->getRelocType(Ctx, Target, Fixup, IsPCRel); - uint64_t OriginalC = C; - bool RelocateWithSymbol = shouldRelocateWithSymbol(Asm, RefA, SymA, C, Type); - if (!RelocateWithSymbol && SymA && !SymA->isUndefined()) - C += Layout.getSymbolOffset(*SymA); - - uint64_t Addend = 0; - if (hasRelocationAddend()) { - Addend = C; - C = 0; - } - - FixedValue = C; - const MCSectionELF *SecA = (SymA && SymA->isInSection()) ? cast(&SymA->getSection()) : nullptr; if (!checkRelocation(Ctx, Fixup.getLoc(), &FixupSection, SecA)) return; + unsigned Type = TargetObjectWriter->getRelocType(Ctx, Target, Fixup, IsPCRel); + bool RelocateWithSymbol = shouldRelocateWithSymbol(Asm, RefA, SymA, C, Type); + uint64_t Addend = 0; + + FixedValue = !RelocateWithSymbol && SymA && !SymA->isUndefined() + ? C + Layout.getSymbolOffset(*SymA) + : C; + if (hasRelocationAddend()) { + Addend = FixedValue; + FixedValue = 0; + } + if (!RelocateWithSymbol) { const auto *SectionSymbol = SecA ? 
cast(SecA->getBeginSymbol()) : nullptr; if (SectionSymbol) SectionSymbol->setUsedInReloc(); - ELFRelocationEntry Rec(FixupOffset, SectionSymbol, Type, Addend, SymA, - OriginalC); + ELFRelocationEntry Rec(FixupOffset, SectionSymbol, Type, Addend, SymA, C); Relocations[&FixupSection].push_back(Rec); return; } - const auto *RenamedSymA = SymA; + const MCSymbolELF *RenamedSymA = SymA; if (SymA) { if (const MCSymbolELF *R = Renames.lookup(SymA)) RenamedSymA = R; @@ -1530,8 +1527,7 @@ void ELFObjectWriter::recordRelocation(MCAssembler &Asm, else RenamedSymA->setUsedInReloc(); } - ELFRelocationEntry Rec(FixupOffset, RenamedSymA, Type, Addend, SymA, - OriginalC); + ELFRelocationEntry Rec(FixupOffset, RenamedSymA, Type, Addend, SymA, C); Relocations[&FixupSection].push_back(Rec); } @@ -1551,7 +1547,7 @@ bool ELFObjectWriter::isSymbolRefDifferenceFullyResolvedImpl( std::unique_ptr llvm::createELFObjectWriter(std::unique_ptr MOTW, raw_pwrite_stream &OS, bool IsLittleEndian) { - return llvm::make_unique(std::move(MOTW), OS, + return std::make_unique(std::move(MOTW), OS, IsLittleEndian); } @@ -1559,6 +1555,6 @@ std::unique_ptr llvm::createELFDwoObjectWriter(std::unique_ptr MOTW, raw_pwrite_stream &OS, raw_pwrite_stream &DwoOS, bool IsLittleEndian) { - return llvm::make_unique(std::move(MOTW), OS, DwoOS, + return std::make_unique(std::move(MOTW), OS, DwoOS, IsLittleEndian); } diff --git a/lib/MC/MCAsmBackend.cpp b/lib/MC/MCAsmBackend.cpp index 9b1102cbe7d1..b800e9caee22 100644 --- a/lib/MC/MCAsmBackend.cpp +++ b/lib/MC/MCAsmBackend.cpp @@ -73,6 +73,7 @@ const MCFixupKindInfo &MCAsmBackend::getFixupKindInfo(MCFixupKind Kind) const { {"FK_Data_2", 0, 16, 0}, {"FK_Data_4", 0, 32, 0}, {"FK_Data_8", 0, 64, 0}, + {"FK_Data_6b", 0, 6, 0}, {"FK_PCRel_1", 0, 8, MCFixupKindInfo::FKF_IsPCRel}, {"FK_PCRel_2", 0, 16, MCFixupKindInfo::FKF_IsPCRel}, {"FK_PCRel_4", 0, 32, MCFixupKindInfo::FKF_IsPCRel}, @@ -93,10 +94,12 @@ const MCFixupKindInfo &MCAsmBackend::getFixupKindInfo(MCFixupKind Kind) const { {"FK_Data_Add_2", 0, 16, 0}, {"FK_Data_Add_4", 0, 32, 0}, {"FK_Data_Add_8", 0, 64, 0}, + {"FK_Data_Add_6b", 0, 6, 0}, {"FK_Data_Sub_1", 0, 8, 0}, {"FK_Data_Sub_2", 0, 16, 0}, {"FK_Data_Sub_4", 0, 32, 0}, - {"FK_Data_Sub_8", 0, 64, 0}}; + {"FK_Data_Sub_8", 0, 64, 0}, + {"FK_Data_Sub_6b", 0, 6, 0}}; assert((size_t)Kind <= array_lengthof(Builtins) && "Unknown fixup kind"); return Builtins[Kind]; diff --git a/lib/MC/MCAsmInfoXCOFF.cpp b/lib/MC/MCAsmInfoXCOFF.cpp index 74c21f0c9e6d..65fe8848e20f 100644 --- a/lib/MC/MCAsmInfoXCOFF.cpp +++ b/lib/MC/MCAsmInfoXCOFF.cpp @@ -15,4 +15,21 @@ void MCAsmInfoXCOFF::anchor() {} MCAsmInfoXCOFF::MCAsmInfoXCOFF() { IsLittleEndian = false; HasDotTypeDotSizeDirective = false; + COMMDirectiveAlignmentIsInBytes = false; + LCOMMDirectiveAlignmentType = LCOMM::Log2Alignment; + UseDotAlignForAlignment = true; + AsciiDirective = nullptr; // not supported + AscizDirective = nullptr; // not supported + NeedsFunctionDescriptors = true; + HasDotLGloblDirective = true; + Data64bitsDirective = "\t.llong\t"; + SupportsQuotedNames = false; +} + +bool MCAsmInfoXCOFF::isValidUnquotedName(StringRef Name) const { + // FIXME: Remove this function when we stop using "TOC[TC0]" as a symbol name. 
+ if (Name.equals("TOC[TC0]")) + return true; + + return MCAsmInfo::isValidUnquotedName(Name); } diff --git a/lib/MC/MCAsmMacro.cpp b/lib/MC/MCAsmMacro.cpp index ba4fb7d4f387..186a68b02a29 100644 --- a/lib/MC/MCAsmMacro.cpp +++ b/lib/MC/MCAsmMacro.cpp @@ -11,6 +11,7 @@ using namespace llvm; +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void MCAsmMacroParameter::dump(raw_ostream &OS) const { OS << "\"" << Name << "\""; if (Required) @@ -39,3 +40,4 @@ void MCAsmMacro::dump(raw_ostream &OS) const { } OS << " (BEGIN BODY)" << Body << "(END BODY)\n"; } +#endif diff --git a/lib/MC/MCAsmStreamer.cpp b/lib/MC/MCAsmStreamer.cpp index 7a2b0b8a1220..2d9c2cb21255 100644 --- a/lib/MC/MCAsmStreamer.cpp +++ b/lib/MC/MCAsmStreamer.cpp @@ -11,6 +11,7 @@ #include "llvm/ADT/SmallString.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/Twine.h" +#include "llvm/DebugInfo/CodeView/SymbolRecord.h" #include "llvm/MC/MCAsmBackend.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCAssembler.h" @@ -23,6 +24,7 @@ #include "llvm/MC/MCInstPrinter.h" #include "llvm/MC/MCObjectFileInfo.h" #include "llvm/MC/MCObjectWriter.h" +#include "llvm/MC/MCRegister.h" #include "llvm/MC/MCRegisterInfo.h" #include "llvm/MC/MCSectionMachO.h" #include "llvm/MC/MCStreamer.h" @@ -66,7 +68,7 @@ public: std::unique_ptr asmbackend, bool showInst) : MCStreamer(Context), OSOwner(std::move(os)), OS(*OSOwner), MAI(Context.getAsmInfo()), InstPrinter(printer), - Assembler(llvm::make_unique( + Assembler(std::make_unique( Context, std::move(asmbackend), std::move(emitter), (asmbackend) ? asmbackend->createObjectWriter(NullStream) : nullptr)), @@ -162,6 +164,8 @@ public: void EmitCOFFSectionIndex(MCSymbol const *Symbol) override; void EmitCOFFSecRel32(MCSymbol const *Symbol, uint64_t Offset) override; void EmitCOFFImgRel32(MCSymbol const *Symbol, int64_t Offset) override; + void EmitXCOFFLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size, + unsigned ByteAlign) override; void emitELFSize(MCSymbol *Symbol, const MCExpr *Value) override; void EmitCommonSymbol(MCSymbol *Symbol, uint64_t Size, unsigned ByteAlignment) override; @@ -254,9 +258,26 @@ public: unsigned SourceLineNum, const MCSymbol *FnStartSym, const MCSymbol *FnEndSym) override; + + void PrintCVDefRangePrefix( + ArrayRef> Ranges); + + void EmitCVDefRangeDirective( + ArrayRef> Ranges, + codeview::DefRangeRegisterRelHeader DRHdr) override; + + void EmitCVDefRangeDirective( + ArrayRef> Ranges, + codeview::DefRangeSubfieldRegisterHeader DRHdr) override; + + void EmitCVDefRangeDirective( + ArrayRef> Ranges, + codeview::DefRangeRegisterHeader DRHdr) override; + void EmitCVDefRangeDirective( ArrayRef> Ranges, - StringRef FixedSizePortion) override; + codeview::DefRangeFramePointerRelHeader DRHdr) override; + void EmitCVStringTableDirective() override; void EmitCVFileChecksumsDirective() override; void EmitCVFileChecksumOffsetDirective(unsigned FileNo) override; @@ -291,13 +312,13 @@ public: void EmitWinCFIFuncletOrFuncEnd(SMLoc Loc) override; void EmitWinCFIStartChained(SMLoc Loc) override; void EmitWinCFIEndChained(SMLoc Loc) override; - void EmitWinCFIPushReg(unsigned Register, SMLoc Loc) override; - void EmitWinCFISetFrame(unsigned Register, unsigned Offset, + void EmitWinCFIPushReg(MCRegister Register, SMLoc Loc) override; + void EmitWinCFISetFrame(MCRegister Register, unsigned Offset, SMLoc Loc) override; void EmitWinCFIAllocStack(unsigned Size, SMLoc Loc) override; - void EmitWinCFISaveReg(unsigned Register, unsigned Offset, + void EmitWinCFISaveReg(MCRegister Register, unsigned 
Offset, SMLoc Loc) override; - void EmitWinCFISaveXMM(unsigned Register, unsigned Offset, + void EmitWinCFISaveXMM(MCRegister Register, unsigned Offset, SMLoc Loc) override; void EmitWinCFIPushFrame(bool Code, SMLoc Loc) override; void EmitWinCFIEndProlog(SMLoc Loc) override; @@ -630,6 +651,7 @@ bool MCAsmStreamer::EmitSymbolAttribute(MCSymbol *Symbol, case MCSA_Global: // .globl/.global OS << MAI->getGlobalDirective(); break; + case MCSA_LGlobal: OS << "\t.lglobl\t"; break; case MCSA_Hidden: OS << "\t.hidden\t"; break; case MCSA_IndirectSymbol: OS << "\t.indirect_symbol\t"; break; case MCSA_Internal: OS << "\t.internal\t"; break; @@ -740,6 +762,24 @@ void MCAsmStreamer::EmitCOFFImgRel32(MCSymbol const *Symbol, int64_t Offset) { EmitEOL(); } +// We need an XCOFF-specific version of this directive as the AIX syntax +// requires a QualName argument identifying the csect name and storage mapping +// class to appear before the alignment if we are specifying it. +void MCAsmStreamer::EmitXCOFFLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size, + unsigned ByteAlignment) { + assert(MAI->getLCOMMDirectiveAlignmentType() == LCOMM::Log2Alignment && + "We only support writing log base-2 alignment format with XCOFF."); + assert(isPowerOf2_32(ByteAlignment) && "Alignment must be a power of 2."); + + OS << "\t.lcomm\t"; + Symbol->print(OS, MAI); + OS << ',' << Size; + OS << ',' << Symbol->getName(); + OS << ',' << Log2_32(ByteAlignment); + + EmitEOL(); +} + void MCAsmStreamer::emitELFSize(MCSymbol *Symbol, const MCExpr *Value) { assert(MAI->hasDotTypeDotSizeDirective()); OS << "\t.size\t"; @@ -1082,6 +1122,16 @@ void MCAsmStreamer::emitFill(const MCExpr &NumValues, int64_t Size, void MCAsmStreamer::EmitValueToAlignment(unsigned ByteAlignment, int64_t Value, unsigned ValueSize, unsigned MaxBytesToEmit) { + if (MAI->useDotAlignForAlignment()) { + if (!isPowerOf2_32(ByteAlignment)) + report_fatal_error("Only power-of-two alignments are supported " + "with .align."); + OS << "\t.align\t"; + OS << Log2_32(ByteAlignment); + EmitEOL(); + return; + } + // Some assemblers don't support non-power of two alignments, so we always // emit alignments as a power of two if possible. 
if (isPowerOf2_32(ByteAlignment)) { @@ -1376,9 +1426,8 @@ void MCAsmStreamer::EmitCVInlineLinetableDirective(unsigned PrimaryFunctionId, PrimaryFunctionId, SourceFileId, SourceLineNum, FnStartSym, FnEndSym); } -void MCAsmStreamer::EmitCVDefRangeDirective( - ArrayRef> Ranges, - StringRef FixedSizePortion) { +void MCAsmStreamer::PrintCVDefRangePrefix( + ArrayRef> Ranges) { OS << "\t.cv_def_range\t"; for (std::pair Range : Ranges) { OS << ' '; @@ -1386,10 +1435,43 @@ void MCAsmStreamer::EmitCVDefRangeDirective( OS << ' '; Range.second->print(OS, MAI); } - OS << ", "; - PrintQuotedString(FixedSizePortion, OS); +} + +void MCAsmStreamer::EmitCVDefRangeDirective( + ArrayRef> Ranges, + codeview::DefRangeRegisterRelHeader DRHdr) { + PrintCVDefRangePrefix(Ranges); + OS << ", reg_rel, "; + OS << DRHdr.Register << ", " << DRHdr.Flags << ", " + << DRHdr.BasePointerOffset; + EmitEOL(); +} + +void MCAsmStreamer::EmitCVDefRangeDirective( + ArrayRef> Ranges, + codeview::DefRangeSubfieldRegisterHeader DRHdr) { + PrintCVDefRangePrefix(Ranges); + OS << ", subfield_reg, "; + OS << DRHdr.Register << ", " << DRHdr.OffsetInParent; + EmitEOL(); +} + +void MCAsmStreamer::EmitCVDefRangeDirective( + ArrayRef> Ranges, + codeview::DefRangeRegisterHeader DRHdr) { + PrintCVDefRangePrefix(Ranges); + OS << ", reg, "; + OS << DRHdr.Register; + EmitEOL(); +} + +void MCAsmStreamer::EmitCVDefRangeDirective( + ArrayRef> Ranges, + codeview::DefRangeFramePointerRelHeader DRHdr) { + PrintCVDefRangePrefix(Ranges); + OS << ", frame_ptr_rel, "; + OS << DRHdr.Offset; EmitEOL(); - this->MCStreamer::EmitCVDefRangeDirective(Ranges, FixedSizePortion); } void MCAsmStreamer::EmitCVStringTableDirective() { @@ -1453,9 +1535,8 @@ void MCAsmStreamer::EmitRegisterName(int64_t Register) { // just ones that map to LLVM register numbers and have known names. // Fall back to using the original number directly if no name is known. const MCRegisterInfo *MRI = getContext().getRegisterInfo(); - int LLVMRegister = MRI->getLLVMRegNumFromEH(Register); - if (LLVMRegister != -1) { - InstPrinter->printRegName(OS, LLVMRegister); + if (Optional LLVMRegister = MRI->getLLVMRegNum(Register, true)) { + InstPrinter->printRegName(OS, *LLVMRegister); return; } } @@ -1668,6 +1749,12 @@ void MCAsmStreamer::EmitWinEHHandlerData(SMLoc Loc) { // We only do this so the section switch that terminates the handler // data block is visible. WinEH::FrameInfo *CurFrame = getCurrentWinFrameInfo(); + + // Do nothing if no frame is open. MCStreamer should've already reported an + // error. 
+ if (!CurFrame) + return; + MCSection *TextSec = &CurFrame->Function->getSection(); MCSection *XData = getAssociatedXDataSection(TextSec); SwitchSectionNoChange(XData); @@ -1676,18 +1763,21 @@ void MCAsmStreamer::EmitWinEHHandlerData(SMLoc Loc) { EmitEOL(); } -void MCAsmStreamer::EmitWinCFIPushReg(unsigned Register, SMLoc Loc) { +void MCAsmStreamer::EmitWinCFIPushReg(MCRegister Register, SMLoc Loc) { MCStreamer::EmitWinCFIPushReg(Register, Loc); - OS << "\t.seh_pushreg " << Register; + OS << "\t.seh_pushreg "; + InstPrinter->printRegName(OS, Register); EmitEOL(); } -void MCAsmStreamer::EmitWinCFISetFrame(unsigned Register, unsigned Offset, +void MCAsmStreamer::EmitWinCFISetFrame(MCRegister Register, unsigned Offset, SMLoc Loc) { MCStreamer::EmitWinCFISetFrame(Register, Offset, Loc); - OS << "\t.seh_setframe " << Register << ", " << Offset; + OS << "\t.seh_setframe "; + InstPrinter->printRegName(OS, Register); + OS << ", " << Offset; EmitEOL(); } @@ -1698,19 +1788,23 @@ void MCAsmStreamer::EmitWinCFIAllocStack(unsigned Size, SMLoc Loc) { EmitEOL(); } -void MCAsmStreamer::EmitWinCFISaveReg(unsigned Register, unsigned Offset, +void MCAsmStreamer::EmitWinCFISaveReg(MCRegister Register, unsigned Offset, SMLoc Loc) { MCStreamer::EmitWinCFISaveReg(Register, Offset, Loc); - OS << "\t.seh_savereg " << Register << ", " << Offset; + OS << "\t.seh_savereg "; + InstPrinter->printRegName(OS, Register); + OS << ", " << Offset; EmitEOL(); } -void MCAsmStreamer::EmitWinCFISaveXMM(unsigned Register, unsigned Offset, +void MCAsmStreamer::EmitWinCFISaveXMM(MCRegister Register, unsigned Offset, SMLoc Loc) { MCStreamer::EmitWinCFISaveXMM(Register, Offset, Loc); - OS << "\t.seh_savexmm " << Register << ", " << Offset; + OS << "\t.seh_savexmm "; + InstPrinter->printRegName(OS, Register); + OS << ", " << Offset; EmitEOL(); } diff --git a/lib/MC/MCAssembler.cpp b/lib/MC/MCAssembler.cpp index c4f4d4c2870e..cf42fe85b8e5 100644 --- a/lib/MC/MCAssembler.cpp +++ b/lib/MC/MCAssembler.cpp @@ -30,6 +30,7 @@ #include "llvm/MC/MCSectionELF.h" #include "llvm/MC/MCSymbol.h" #include "llvm/MC/MCValue.h" +#include "llvm/Support/Alignment.h" #include "llvm/Support/Casting.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" @@ -321,7 +322,7 @@ uint64_t MCAssembler::computeFragmentSize(const MCAsmLayout &Layout, case MCFragment::FT_Align: { const MCAlignFragment &AF = cast(F); unsigned Offset = Layout.getFragmentOffset(&AF); - unsigned Size = OffsetToAlignment(Offset, AF.getAlignment()); + unsigned Size = offsetToAlignment(Offset, Align(AF.getAlignment())); // Insert extra Nops for code alignment if the target define // shouldInsertExtraNopBytesForCodeAlign target hook. 
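As a rough illustration of the alignment handling in the two hunks above — MC moving onto the Align type, and the useDotAlignForAlignment path printing the alignment as a log2 operand of ".align" — here is a minimal standalone sketch of the same arithmetic. It uses no LLVM headers; the helper names are illustrative, not LLVM's.

#include <cassert>
#include <cstdint>

// Padding needed to advance Offset to the next multiple of Alignment
// (the quantity offsetToAlignment computes); Alignment must be a power of two.
uint64_t paddingTo(uint64_t Offset, uint64_t Alignment) {
  assert(Alignment && (Alignment & (Alignment - 1)) == 0);
  return (-Offset) & (Alignment - 1);
}

// Log2 of a power-of-two alignment, i.e. the operand printed by the
// ".align <n>" form selected when useDotAlignForAlignment() is true.
unsigned log2Alignment(uint64_t Alignment) {
  unsigned N = 0;
  while (Alignment > 1) {
    Alignment >>= 1;
    ++N;
  }
  return N;
}

For example, paddingTo(10, 8) is 6, and a 16-byte alignment request is printed as ".align 4".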
@@ -840,6 +841,10 @@ void MCAssembler::layout(MCAsmLayout &Layout) { getBackend().shouldInsertFixupForCodeAlign(*this, Layout, *AF); } continue; + } else if (auto *FragWithFixups = + dyn_cast(&Frag)) { + Fixups = FragWithFixups->getFixups(); + Contents = FragWithFixups->getContents(); } else llvm_unreachable("Unknown fragment with fixups!"); for (const MCFixup &Fixup : Fixups) { @@ -969,13 +974,9 @@ bool MCAssembler::relaxDwarfLineAddr(MCAsmLayout &Layout, MCContext &Context = Layout.getAssembler().getContext(); uint64_t OldSize = DF.getContents().size(); int64_t AddrDelta; - bool Abs; - if (getBackend().requiresDiffExpressionRelocations()) - Abs = DF.getAddrDelta().evaluateAsAbsolute(AddrDelta, Layout); - else { - Abs = DF.getAddrDelta().evaluateKnownAbsolute(AddrDelta, Layout); - assert(Abs && "We created a line delta with an invalid expression"); - } + bool Abs = DF.getAddrDelta().evaluateKnownAbsolute(AddrDelta, Layout); + assert(Abs && "We created a line delta with an invalid expression"); + (void)Abs; int64_t LineDelta; LineDelta = DF.getLineDelta(); SmallVectorImpl &Data = DF.getContents(); @@ -983,7 +984,7 @@ bool MCAssembler::relaxDwarfLineAddr(MCAsmLayout &Layout, raw_svector_ostream OSE(Data); DF.getFixups().clear(); - if (Abs) { + if (!getBackend().requiresDiffExpressionRelocations()) { MCDwarfLineAddr::Encode(Context, getDWARFLinetableParams(), LineDelta, AddrDelta, OSE); } else { @@ -1017,10 +1018,25 @@ bool MCAssembler::relaxDwarfCallFrameFragment(MCAsmLayout &Layout, bool Abs = DF.getAddrDelta().evaluateKnownAbsolute(AddrDelta, Layout); assert(Abs && "We created call frame with an invalid expression"); (void) Abs; - SmallString<8> &Data = DF.getContents(); + SmallVectorImpl &Data = DF.getContents(); Data.clear(); raw_svector_ostream OSE(Data); - MCDwarfFrameEmitter::EncodeAdvanceLoc(Context, AddrDelta, OSE); + DF.getFixups().clear(); + + if (getBackend().requiresDiffExpressionRelocations()) { + uint32_t Offset; + uint32_t Size; + MCDwarfFrameEmitter::EncodeAdvanceLoc(Context, AddrDelta, OSE, &Offset, + &Size); + if (Size) { + DF.getFixups().push_back(MCFixup::create( + Offset, &DF.getAddrDelta(), + MCFixup::getKindForSizeInBits(Size /*In bits.*/, false /*isPCRel*/))); + } + } else { + MCDwarfFrameEmitter::EncodeAdvanceLoc(Context, AddrDelta, OSE); + } + return OldSize != Data.size(); } diff --git a/lib/MC/MCContext.cpp b/lib/MC/MCContext.cpp index 0dc2e2d37caf..a69ee19e1a1a 100644 --- a/lib/MC/MCContext.cpp +++ b/lib/MC/MCContext.cpp @@ -58,11 +58,12 @@ AsSecureLogFileName("as-secure-log-file-name", MCContext::MCContext(const MCAsmInfo *mai, const MCRegisterInfo *mri, const MCObjectFileInfo *mofi, const SourceMgr *mgr, - bool DoAutoReset) + MCTargetOptions const *TargetOpts, bool DoAutoReset) : SrcMgr(mgr), InlineSrcMgr(nullptr), MAI(mai), MRI(mri), MOFI(mofi), Symbols(Allocator), UsedNames(Allocator), + InlineAsmUsedLabelNames(Allocator), CurrentDwarfLoc(0, 0, 0, DWARF2_FLAG_IS_STMT, 0, 0), - AutoReset(DoAutoReset) { + AutoReset(DoAutoReset), TargetOptions(TargetOpts) { SecureLogFile = AsSecureLogFileName; if (SrcMgr && SrcMgr->getNumBuffers()) @@ -90,6 +91,7 @@ void MCContext::reset() { XCOFFAllocator.DestroyAll(); MCSubtargetAllocator.DestroyAll(); + InlineAsmUsedLabelNames.clear(); UsedNames.clear(); Symbols.clear(); Allocator.Reset(); @@ -272,6 +274,10 @@ void MCContext::setSymbolValue(MCStreamer &Streamer, Streamer.EmitAssignment(Symbol, MCConstantExpr::create(Val, *this)); } +void MCContext::registerInlineAsmLabel(MCSymbol *Sym) { + 
InlineAsmUsedLabelNames[Sym->getName()] = Sym; +} + //===----------------------------------------------------------------------===// // Section Management //===----------------------------------------------------------------------===// @@ -531,6 +537,8 @@ MCSectionWasm *MCContext::getWasmSection(const Twine &Section, SectionKind Kind, MCSectionXCOFF *MCContext::getXCOFFSection(StringRef Section, XCOFF::StorageMappingClass SMC, + XCOFF::SymbolType Type, + XCOFF::StorageClass SC, SectionKind Kind, const char *BeginSymName) { // Do the lookup. If we have a hit, return it. @@ -548,7 +556,7 @@ MCSectionXCOFF *MCContext::getXCOFFSection(StringRef Section, Begin = createTempSymbol(BeginSymName, false); MCSectionXCOFF *Result = new (XCOFFAllocator.Allocate()) - MCSectionXCOFF(CachedName, SMC, Kind, Begin); + MCSectionXCOFF(CachedName, SMC, Type, SC, Kind, Begin); Entry.second = Result; auto *F = new MCDataFragment(); @@ -690,6 +698,21 @@ void MCContext::reportError(SMLoc Loc, const Twine &Msg) { report_fatal_error(Msg, false); } +void MCContext::reportWarning(SMLoc Loc, const Twine &Msg) { + if (TargetOptions && TargetOptions->MCNoWarn) + return; + if (TargetOptions && TargetOptions->MCFatalWarnings) + reportError(Loc, Msg); + else { + // If we have a source manager use it. Otherwise, try using the inline + // source manager. + if (SrcMgr) + SrcMgr->PrintMessage(Loc, SourceMgr::DK_Warning, Msg); + else if (InlineSrcMgr) + InlineSrcMgr->PrintMessage(Loc, SourceMgr::DK_Warning, Msg); + } +} + void MCContext::reportFatalError(SMLoc Loc, const Twine &Msg) { reportError(Loc, Msg); diff --git a/lib/MC/MCDwarf.cpp b/lib/MC/MCDwarf.cpp index aae6fdf90931..bcc7c45afc01 100644 --- a/lib/MC/MCDwarf.cpp +++ b/lib/MC/MCDwarf.cpp @@ -544,8 +544,8 @@ Expected MCDwarfLineTable::tryGetFile(StringRef &Directory, FileNumber); } -bool isRootFile(const MCDwarfFile &RootFile, StringRef &Directory, - StringRef &FileName, Optional Checksum) { +static bool isRootFile(const MCDwarfFile &RootFile, StringRef &Directory, + StringRef &FileName, Optional Checksum) { if (RootFile.Name.empty() || RootFile.Name != FileName.data()) return false; return RootFile.Checksum == Checksum; @@ -1897,26 +1897,54 @@ void MCDwarfFrameEmitter::EmitAdvanceLoc(MCObjectStreamer &Streamer, } void MCDwarfFrameEmitter::EncodeAdvanceLoc(MCContext &Context, - uint64_t AddrDelta, - raw_ostream &OS) { + uint64_t AddrDelta, raw_ostream &OS, + uint32_t *Offset, uint32_t *Size) { // Scale the address delta by the minimum instruction length. AddrDelta = ScaleAddrDelta(Context, AddrDelta); + bool WithFixups = false; + if (Offset && Size) + WithFixups = true; + support::endianness E = Context.getAsmInfo()->isLittleEndian() ? 
support::little : support::big; if (AddrDelta == 0) { + if (WithFixups) { + *Offset = 0; + *Size = 0; + } } else if (isUIntN(6, AddrDelta)) { uint8_t Opcode = dwarf::DW_CFA_advance_loc | AddrDelta; - OS << Opcode; + if (WithFixups) { + *Offset = OS.tell(); + *Size = 6; + OS << uint8_t(dwarf::DW_CFA_advance_loc); + } else + OS << Opcode; } else if (isUInt<8>(AddrDelta)) { OS << uint8_t(dwarf::DW_CFA_advance_loc1); - OS << uint8_t(AddrDelta); + if (WithFixups) { + *Offset = OS.tell(); + *Size = 8; + OS.write_zeros(1); + } else + OS << uint8_t(AddrDelta); } else if (isUInt<16>(AddrDelta)) { OS << uint8_t(dwarf::DW_CFA_advance_loc2); - support::endian::write(OS, AddrDelta, E); + if (WithFixups) { + *Offset = OS.tell(); + *Size = 16; + OS.write_zeros(2); + } else + support::endian::write(OS, AddrDelta, E); } else { assert(isUInt<32>(AddrDelta)); OS << uint8_t(dwarf::DW_CFA_advance_loc4); - support::endian::write(OS, AddrDelta, E); + if (WithFixups) { + *Offset = OS.tell(); + *Size = 32; + OS.write_zeros(4); + } else + support::endian::write(OS, AddrDelta, E); } } diff --git a/lib/MC/MCELFStreamer.cpp b/lib/MC/MCELFStreamer.cpp index 245dd063004f..fa2133078bfe 100644 --- a/lib/MC/MCELFStreamer.cpp +++ b/lib/MC/MCELFStreamer.cpp @@ -139,7 +139,7 @@ static void setSectionAlignmentForBundling(const MCAssembler &Assembler, MCSection *Section) { if (Section && Assembler.isBundlingEnabled() && Section->hasInstructions() && Section->getAlignment() < Assembler.getBundleAlignSize()) - Section->setAlignment(Assembler.getBundleAlignSize()); + Section->setAlignment(Align(Assembler.getBundleAlignSize())); } void MCELFStreamer::ChangeSection(MCSection *Section, @@ -277,6 +277,9 @@ bool MCELFStreamer::EmitSymbolAttribute(MCSymbol *S, MCSymbolAttr Attribute) { case MCSA_AltEntry: llvm_unreachable("ELF doesn't support the .alt_entry attribute"); + + case MCSA_LGlobal: + llvm_unreachable("ELF doesn't support the .lglobl attribute"); } return true; @@ -306,7 +309,7 @@ void MCELFStreamer::EmitCommonSymbol(MCSymbol *S, uint64_t Size, // Update the maximum alignment of the section if necessary. 
if (ByteAlignment > Section.getAlignment()) - Section.setAlignment(ByteAlignment); + Section.setAlignment(Align(ByteAlignment)); SwitchSection(P.first, P.second); } else { diff --git a/lib/MC/MCExpr.cpp b/lib/MC/MCExpr.cpp index ab53ed42778e..813c00f6f3bb 100644 --- a/lib/MC/MCExpr.cpp +++ b/lib/MC/MCExpr.cpp @@ -259,6 +259,8 @@ StringRef MCSymbolRefExpr::getVariantKindName(VariantKind Kind) { case VK_PPC_TOC_LO: return "toc@l"; case VK_PPC_TOC_HI: return "toc@h"; case VK_PPC_TOC_HA: return "toc@ha"; + case VK_PPC_U: return "u"; + case VK_PPC_L: return "l"; case VK_PPC_DTPMOD: return "dtpmod"; case VK_PPC_TPREL_LO: return "tprel@l"; case VK_PPC_TPREL_HI: return "tprel@h"; @@ -373,6 +375,8 @@ MCSymbolRefExpr::getVariantKindForName(StringRef Name) { .Case("toc@l", VK_PPC_TOC_LO) .Case("toc@h", VK_PPC_TOC_HI) .Case("toc@ha", VK_PPC_TOC_HA) + .Case("u", VK_PPC_U) + .Case("l", VK_PPC_L) .Case("tls", VK_PPC_TLS) .Case("dtpmod", VK_PPC_DTPMOD) .Case("tprel@l", VK_PPC_TPREL_LO) @@ -453,26 +457,28 @@ void MCTargetExpr::anchor() {} /* *** */ bool MCExpr::evaluateAsAbsolute(int64_t &Res) const { - return evaluateAsAbsolute(Res, nullptr, nullptr, nullptr); + return evaluateAsAbsolute(Res, nullptr, nullptr, nullptr, false); } bool MCExpr::evaluateAsAbsolute(int64_t &Res, const MCAsmLayout &Layout) const { - return evaluateAsAbsolute(Res, &Layout.getAssembler(), &Layout, nullptr); + return evaluateAsAbsolute(Res, &Layout.getAssembler(), &Layout, nullptr, false); } bool MCExpr::evaluateAsAbsolute(int64_t &Res, const MCAsmLayout &Layout, const SectionAddrMap &Addrs) const { - return evaluateAsAbsolute(Res, &Layout.getAssembler(), &Layout, &Addrs); + // Setting InSet causes us to absolutize differences across sections and that + // is what the MachO writer uses Addrs for. + return evaluateAsAbsolute(Res, &Layout.getAssembler(), &Layout, &Addrs, true); } bool MCExpr::evaluateAsAbsolute(int64_t &Res, const MCAssembler &Asm) const { - return evaluateAsAbsolute(Res, &Asm, nullptr, nullptr); + return evaluateAsAbsolute(Res, &Asm, nullptr, nullptr, false); } bool MCExpr::evaluateAsAbsolute(int64_t &Res, const MCAssembler *Asm) const { - return evaluateAsAbsolute(Res, Asm, nullptr, nullptr); + return evaluateAsAbsolute(Res, Asm, nullptr, nullptr, false); } bool MCExpr::evaluateKnownAbsolute(int64_t &Res, @@ -481,15 +487,6 @@ bool MCExpr::evaluateKnownAbsolute(int64_t &Res, true); } -bool MCExpr::evaluateAsAbsolute(int64_t &Res, const MCAssembler *Asm, - const MCAsmLayout *Layout, - const SectionAddrMap *Addrs) const { - // FIXME: The use if InSet = Addrs is a hack. Setting InSet causes us - // absolutize differences across sections and that is what the MachO writer - // uses Addrs for. - return evaluateAsAbsolute(Res, Asm, Layout, Addrs, Addrs); -} - bool MCExpr::evaluateAsAbsolute(int64_t &Res, const MCAssembler *Asm, const MCAsmLayout *Layout, const SectionAddrMap *Addrs, bool InSet) const { @@ -577,6 +574,24 @@ static void AttemptToFoldSymbolOffsetDifference( A = B = nullptr; } +static bool canFold(const MCAssembler *Asm, const MCSymbolRefExpr *A, + const MCSymbolRefExpr *B, bool InSet) { + if (InSet) + return true; + + if (!Asm->getBackend().requiresDiffExpressionRelocations()) + return true; + + const MCSymbol &CheckSym = A ? A->getSymbol() : B->getSymbol(); + if (!CheckSym.isInSection()) + return true; + + if (!CheckSym.getSection().hasInstructions()) + return true; + + return false; +} + /// Evaluate the result of an add between (conceptually) two MCValues. 
/// /// This routine conceptually attempts to construct an MCValue: @@ -617,8 +632,7 @@ EvaluateSymbolicAdd(const MCAssembler *Asm, const MCAsmLayout *Layout, // the backend requires this to be emitted as individual relocations, unless // the InSet flag is set to get the current difference anyway (used for // example to calculate symbol sizes). - if (Asm && - (InSet || !Asm->getBackend().requiresDiffExpressionRelocations())) { + if (Asm && canFold(Asm, LHS_A, LHS_B, InSet)) { // First, fold out any differences which are fully resolved. By // reassociating terms in // Result = (LHS_A - LHS_B + LHS_Cst) + (RHS_A - RHS_B + RHS_Cst). diff --git a/lib/MC/MCInstPrinter.cpp b/lib/MC/MCInstPrinter.cpp index 159f4070fe9f..c5c06f323e68 100644 --- a/lib/MC/MCInstPrinter.cpp +++ b/lib/MC/MCInstPrinter.cpp @@ -64,12 +64,6 @@ StringRef MCInstPrinter::markup(StringRef s) const { else return ""; } -StringRef MCInstPrinter::markup(StringRef a, StringRef b) const { - if (getUseMarkup()) - return a; - else - return b; -} // For asm-style hex (e.g. 0ffh) the first digit always has to be a number. static bool needsLeadingZero(uint64_t Value) @@ -89,24 +83,25 @@ format_object MCInstPrinter::formatDec(int64_t Value) const { } format_object MCInstPrinter::formatHex(int64_t Value) const { - switch(PrintHexStyle) { + switch (PrintHexStyle) { case HexStyle::C: - if (Value < 0) + if (Value < 0) { + if (Value == std::numeric_limits::min()) + return format("-0x8000000000000000", Value); return format("-0x%" PRIx64, -Value); - else - return format("0x%" PRIx64, Value); + } + return format("0x%" PRIx64, Value); case HexStyle::Asm: if (Value < 0) { - if (needsLeadingZero((uint64_t)(-Value))) + if (Value == std::numeric_limits::min()) + return format("-8000000000000000h", Value); + if (needsLeadingZero(-(uint64_t)(Value))) return format("-0%" PRIx64 "h", -Value); - else - return format("-%" PRIx64 "h", -Value); - } else { - if (needsLeadingZero((uint64_t)(Value))) - return format("0%" PRIx64 "h", Value); - else - return format("%" PRIx64 "h", Value); + return format("-%" PRIx64 "h", -Value); } + if (needsLeadingZero((uint64_t)(Value))) + return format("0%" PRIx64 "h", Value); + return format("%" PRIx64 "h", Value); } llvm_unreachable("unsupported print style"); } diff --git a/lib/MC/MCInstrAnalysis.cpp b/lib/MC/MCInstrAnalysis.cpp index eca87f940bf5..54741fdd686d 100644 --- a/lib/MC/MCInstrAnalysis.cpp +++ b/lib/MC/MCInstrAnalysis.cpp @@ -33,3 +33,9 @@ bool MCInstrAnalysis::evaluateBranch(const MCInst &Inst, uint64_t Addr, Target = Addr+Size+Imm; return true; } + +Optional +MCInstrAnalysis::evaluateMemoryOperandAddress(const MCInst &Inst, uint64_t Addr, + uint64_t Size) const { + return None; +} diff --git a/lib/MC/MCMachOStreamer.cpp b/lib/MC/MCMachOStreamer.cpp index 613f255a4ea4..8e558a36b7a1 100644 --- a/lib/MC/MCMachOStreamer.cpp +++ b/lib/MC/MCMachOStreamer.cpp @@ -330,6 +330,7 @@ bool MCMachOStreamer::EmitSymbolAttribute(MCSymbol *Sym, case MCSA_Protected: case MCSA_Weak: case MCSA_Local: + case MCSA_LGlobal: return false; case MCSA_Global: diff --git a/lib/MC/MCObjectFileInfo.cpp b/lib/MC/MCObjectFileInfo.cpp index 9f555abe1404..70c0409ece7a 100644 --- a/lib/MC/MCObjectFileInfo.cpp +++ b/lib/MC/MCObjectFileInfo.cpp @@ -28,7 +28,7 @@ static bool useCompactUnwind(const Triple &T) { return false; // aarch64 always has it. - if (T.getArch() == Triple::aarch64) + if (T.getArch() == Triple::aarch64 || T.getArch() == Triple::aarch64_32) return true; // armv7k always has it. 
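Among the hunks above, MCInstPrinter::formatHex gains a special case for the most negative 64-bit value, since negating it overflows. A rough standalone illustration of the same guard for the C-style branch follows; it is plain C++ with a hypothetical function name, not the LLVM formatter itself.

#include <cinttypes>
#include <cstdint>
#include <cstdio>
#include <limits>
#include <string>

// Print a signed value in C-style hex. INT64_MIN cannot be negated safely,
// so it is handled with a fixed string, mirroring the check added above.
std::string toCHex(int64_t Value) {
  if (Value == std::numeric_limits<int64_t>::min())
    return "-0x8000000000000000";
  char Buf[32];
  if (Value < 0)
    std::snprintf(Buf, sizeof(Buf), "-0x%" PRIx64, (uint64_t)-Value);
  else
    std::snprintf(Buf, sizeof(Buf), "0x%" PRIx64, (uint64_t)Value);
  return Buf;
}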
@@ -57,7 +57,8 @@ void MCObjectFileInfo::initMachOMCObjectFileInfo(const Triple &T) { MachO::S_ATTR_STRIP_STATIC_SYMS | MachO::S_ATTR_LIVE_SUPPORT, SectionKind::getReadOnly()); - if (T.isOSDarwin() && T.getArch() == Triple::aarch64) + if (T.isOSDarwin() && + (T.getArch() == Triple::aarch64 || T.getArch() == Triple::aarch64_32)) SupportsCompactUnwindWithoutEHFrame = true; if (T.isWatchABI()) @@ -193,7 +194,7 @@ void MCObjectFileInfo::initMachOMCObjectFileInfo(const Triple &T) { if (T.getArch() == Triple::x86_64 || T.getArch() == Triple::x86) CompactUnwindDwarfEHFrameOnly = 0x04000000; // UNWIND_X86_64_MODE_DWARF - else if (T.getArch() == Triple::aarch64) + else if (T.getArch() == Triple::aarch64 || T.getArch() == Triple::aarch64_32) CompactUnwindDwarfEHFrameOnly = 0x03000000; // UNWIND_ARM64_MODE_DWARF else if (T.getArch() == Triple::arm || T.getArch() == Triple::thumb) CompactUnwindDwarfEHFrameOnly = 0x04000000; // UNWIND_ARM_MODE_DWARF @@ -768,7 +769,12 @@ void MCObjectFileInfo::initXCOFFMCObjectFileInfo(const Triple &T) { // the ABI or object file format. For example, the XL compiler uses an unnamed // csect for program code. TextSection = Ctx->getXCOFFSection( - ".text", XCOFF::StorageMappingClass::XMC_PR, SectionKind::getText()); + ".text", XCOFF::StorageMappingClass::XMC_PR, XCOFF::XTY_SD, + XCOFF::C_HIDEXT, SectionKind::getText()); + + DataSection = Ctx->getXCOFFSection( + ".data", XCOFF::StorageMappingClass::XMC_RW, XCOFF::XTY_SD, + XCOFF::C_HIDEXT, SectionKind::getData()); } void MCObjectFileInfo::InitMCObjectFileInfo(const Triple &TheTriple, bool PIC, diff --git a/lib/MC/MCObjectStreamer.cpp b/lib/MC/MCObjectStreamer.cpp index 1587d8498666..83f6ab8fe332 100644 --- a/lib/MC/MCObjectStreamer.cpp +++ b/lib/MC/MCObjectStreamer.cpp @@ -27,7 +27,7 @@ MCObjectStreamer::MCObjectStreamer(MCContext &Context, std::unique_ptr OW, std::unique_ptr Emitter) : MCStreamer(Context), - Assembler(llvm::make_unique( + Assembler(std::make_unique( Context, std::move(TAB), std::move(Emitter), std::move(OW))), EmitEHFrame(true), EmitDebugFrame(false) {} @@ -539,7 +539,7 @@ void MCObjectStreamer::EmitValueToAlignment(unsigned ByteAlignment, // Update the maximum alignment on the current section if necessary. MCSection *CurSec = getCurrentSectionOnly(); if (ByteAlignment > CurSec->getAlignment()) - CurSec->setAlignment(ByteAlignment); + CurSec->setAlignment(Align(ByteAlignment)); } void MCObjectStreamer::EmitCodeAlignment(unsigned ByteAlignment, diff --git a/lib/MC/MCParser/AsmParser.cpp b/lib/MC/MCParser/AsmParser.cpp index 084f6a7a2e14..b59ac08ad6cc 100644 --- a/lib/MC/MCParser/AsmParser.cpp +++ b/lib/MC/MCParser/AsmParser.cpp @@ -22,6 +22,7 @@ #include "llvm/ADT/StringRef.h" #include "llvm/ADT/Twine.h" #include "llvm/BinaryFormat/Dwarf.h" +#include "llvm/DebugInfo/CodeView/SymbolRecord.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCCodeView.h" #include "llvm/MC/MCContext.h" @@ -524,6 +525,19 @@ private: /// directives parsed by this class. StringMap DirectiveKindMap; + // Codeview def_range type parsing. + enum CVDefRangeType { + CVDR_DEFRANGE = 0, // Placeholder + CVDR_DEFRANGE_REGISTER, + CVDR_DEFRANGE_FRAMEPOINTER_REL, + CVDR_DEFRANGE_SUBFIELD_REGISTER, + CVDR_DEFRANGE_REGISTER_REL + }; + + /// Maps Codeview def_range types --> CVDefRangeType enum, for + /// Codeview def_range types parsed by this class. 
+ StringMap CVDefRangeTypeMap; + // ".ascii", ".asciz", ".string" bool parseDirectiveAscii(StringRef IDVal, bool ZeroTerminated); bool parseDirectiveReloc(SMLoc DirectiveLoc); // ".reloc" @@ -671,6 +685,7 @@ private: bool parseDirectiveAddrsigSym(); void initializeDirectiveKindMap(); + void initializeCVDefRangeTypeMap(); }; } // end anonymous namespace @@ -714,12 +729,14 @@ AsmParser::AsmParser(SourceMgr &SM, MCContext &Ctx, MCStreamer &Out, PlatformParser.reset(createWasmAsmParser()); break; case MCObjectFileInfo::IsXCOFF: - // TODO: Need to implement createXCOFFAsmParser for XCOFF format. + report_fatal_error( + "Need to implement createXCOFFAsmParser for XCOFF format."); break; } PlatformParser->Initialize(*this); initializeDirectiveKindMap(); + initializeCVDefRangeTypeMap(); NumOfMacroInstantiations = 0; } @@ -1142,7 +1159,9 @@ bool AsmParser::parsePrimaryExpr(const MCExpr *&Res, SMLoc &EndLoc) { } } - MCSymbol *Sym = getContext().getOrCreateSymbol(SymbolName); + MCSymbol *Sym = getContext().getInlineAsmLabel(SymbolName); + if (!Sym) + Sym = getContext().getOrCreateSymbol(SymbolName); // If this is an absolute variable reference, substitute it now to preserve // semantics in the face of reassignment. @@ -1737,6 +1756,7 @@ bool AsmParser::parseStatement(ParseStatementInfo &Info, StringMap::const_iterator DirKindIt = DirectiveKindMap.find(IDVal); DirectiveKind DirKind = (DirKindIt == DirectiveKindMap.end()) + ? DK_NO_DIRECTIVE : DirKindIt->getValue(); switch (DirKind) { @@ -2895,11 +2915,27 @@ bool AsmParser::parseEscapedString(std::string &Data) { } // Recognize escaped characters. Note that this escape semantics currently - // loosely follows Darwin 'as'. Notably, it doesn't support hex escapes. + // loosely follows Darwin 'as'. ++i; if (i == e) return TokError("unexpected backslash at end of string"); + // Recognize hex sequences similarly to GNU 'as'. + if (Str[i] == 'x' || Str[i] == 'X') { + size_t length = Str.size(); + if (i + 1 >= length || !isHexDigit(Str[i + 1])) + return TokError("invalid hexadecimal escape sequence"); + + // Consume hex characters. GNU 'as' reads all hexadecimal characters and + // then truncates to the lower 16 bits. Seems reasonable. + unsigned Value = 0; + while (i + 1 < length && isHexDigit(Str[i + 1])) + Value = Value * 16 + hexDigitValue(Str[++i]); + + Data += (unsigned char)(Value & 0xFF); + continue; + } + // Recognize octal sequences. if ((unsigned)(Str[i] - '0') <= 7) { // Consume up to three octal characters. 
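The parseEscapedString hunk above adds GNU-as-style "\x" escapes: every hex digit after the "\x" is consumed and only the low byte of the accumulated value is kept. A self-contained sketch of that scanning loop is below; helper names are hypothetical and, unlike the real parser, malformed escapes are passed through instead of producing a diagnostic.

#include <cctype>
#include <string>

// Illustrative only: digit value of a hex character.
static int hexValue(char C) {
  if (C >= '0' && C <= '9') return C - '0';
  if (C >= 'a' && C <= 'f') return C - 'a' + 10;
  return C - 'A' + 10;
}

// Expand \x.. escapes the way the new code does: read all hex digits,
// then keep only the low 8 bits of the accumulated value.
std::string expandHexEscapes(const std::string &Str) {
  std::string Out;
  for (size_t i = 0; i < Str.size(); ++i) {
    bool IsHexEscape = Str[i] == '\\' && i + 2 < Str.size() &&
                       (Str[i + 1] == 'x' || Str[i + 1] == 'X') &&
                       std::isxdigit((unsigned char)Str[i + 2]);
    if (!IsHexEscape) {
      Out += Str[i];
      continue;
    }
    ++i; // now at 'x'
    unsigned Value = 0;
    while (i + 1 < Str.size() && std::isxdigit((unsigned char)Str[i + 1]))
      Value = Value * 16 + hexValue(Str[++i]);
    Out += (char)(Value & 0xFF);
  }
  return Out;
}

With input \x41\x6c the function produces "Al", and a longer run such as \x1234 keeps only 0x34.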
@@ -3825,6 +3861,13 @@ bool AsmParser::parseDirectiveCVInlineLinetable() { return false; } +void AsmParser::initializeCVDefRangeTypeMap() { + CVDefRangeTypeMap["reg"] = CVDR_DEFRANGE_REGISTER; + CVDefRangeTypeMap["frame_ptr_rel"] = CVDR_DEFRANGE_FRAMEPOINTER_REL; + CVDefRangeTypeMap["subfield_reg"] = CVDR_DEFRANGE_SUBFIELD_REGISTER; + CVDefRangeTypeMap["reg_rel"] = CVDR_DEFRANGE_REGISTER_REL; +} + /// parseDirectiveCVDefRange /// ::= .cv_def_range RangeStart RangeEnd (GapStart GapEnd)*, bytes* bool AsmParser::parseDirectiveCVDefRange() { @@ -3846,13 +3889,92 @@ bool AsmParser::parseDirectiveCVDefRange() { Ranges.push_back({GapStartSym, GapEndSym}); } - std::string FixedSizePortion; - if (parseToken(AsmToken::Comma, "unexpected token in directive") || - parseEscapedString(FixedSizePortion)) - return true; - - getStreamer().EmitCVDefRangeDirective(Ranges, FixedSizePortion); - return false; + StringRef CVDefRangeTypeStr; + if (parseToken( + AsmToken::Comma, + "expected comma before def_range type in .cv_def_range directive") || + parseIdentifier(CVDefRangeTypeStr)) + return Error(Loc, "expected def_range type in directive"); + + StringMap::const_iterator CVTypeIt = + CVDefRangeTypeMap.find(CVDefRangeTypeStr); + CVDefRangeType CVDRType = (CVTypeIt == CVDefRangeTypeMap.end()) + ? CVDR_DEFRANGE + : CVTypeIt->getValue(); + switch (CVDRType) { + case CVDR_DEFRANGE_REGISTER: { + int64_t DRRegister; + if (parseToken(AsmToken::Comma, "expected comma before register number in " + ".cv_def_range directive") || + parseAbsoluteExpression(DRRegister)) + return Error(Loc, "expected register number"); + + codeview::DefRangeRegisterHeader DRHdr; + DRHdr.Register = DRRegister; + DRHdr.MayHaveNoName = 0; + getStreamer().EmitCVDefRangeDirective(Ranges, DRHdr); + break; + } + case CVDR_DEFRANGE_FRAMEPOINTER_REL: { + int64_t DROffset; + if (parseToken(AsmToken::Comma, + "expected comma before offset in .cv_def_range directive") || + parseAbsoluteExpression(DROffset)) + return Error(Loc, "expected offset value"); + + codeview::DefRangeFramePointerRelHeader DRHdr; + DRHdr.Offset = DROffset; + getStreamer().EmitCVDefRangeDirective(Ranges, DRHdr); + break; + } + case CVDR_DEFRANGE_SUBFIELD_REGISTER: { + int64_t DRRegister; + int64_t DROffsetInParent; + if (parseToken(AsmToken::Comma, "expected comma before register number in " + ".cv_def_range directive") || + parseAbsoluteExpression(DRRegister)) + return Error(Loc, "expected register number"); + if (parseToken(AsmToken::Comma, + "expected comma before offset in .cv_def_range directive") || + parseAbsoluteExpression(DROffsetInParent)) + return Error(Loc, "expected offset value"); + + codeview::DefRangeSubfieldRegisterHeader DRHdr; + DRHdr.Register = DRRegister; + DRHdr.MayHaveNoName = 0; + DRHdr.OffsetInParent = DROffsetInParent; + getStreamer().EmitCVDefRangeDirective(Ranges, DRHdr); + break; + } + case CVDR_DEFRANGE_REGISTER_REL: { + int64_t DRRegister; + int64_t DRFlags; + int64_t DRBasePointerOffset; + if (parseToken(AsmToken::Comma, "expected comma before register number in " + ".cv_def_range directive") || + parseAbsoluteExpression(DRRegister)) + return Error(Loc, "expected register value"); + if (parseToken( + AsmToken::Comma, + "expected comma before flag value in .cv_def_range directive") || + parseAbsoluteExpression(DRFlags)) + return Error(Loc, "expected flag value"); + if (parseToken(AsmToken::Comma, "expected comma before base pointer offset " + "in .cv_def_range directive") || + parseAbsoluteExpression(DRBasePointerOffset)) + return Error(Loc, "expected 
base pointer offset value"); + + codeview::DefRangeRegisterRelHeader DRHdr; + DRHdr.Register = DRRegister; + DRHdr.Flags = DRFlags; + DRHdr.BasePointerOffset = DRBasePointerOffset; + getStreamer().EmitCVDefRangeDirective(Ranges, DRHdr); + break; + } + default: + return Error(Loc, "unexpected def_range type in .cv_def_range directive"); + } + return true; } /// parseDirectiveCVString diff --git a/lib/MC/MCParser/COFFAsmParser.cpp b/lib/MC/MCParser/COFFAsmParser.cpp index 1217ea99e465..06f8310ae061 100644 --- a/lib/MC/MCParser/COFFAsmParser.cpp +++ b/lib/MC/MCParser/COFFAsmParser.cpp @@ -69,6 +69,7 @@ class COFFAsmParser : public MCAsmParserExtension { addDirectiveHandler<&COFFAsmParser::ParseDirectiveSecIdx>(".secidx"); addDirectiveHandler<&COFFAsmParser::ParseDirectiveLinkOnce>(".linkonce"); addDirectiveHandler<&COFFAsmParser::ParseDirectiveRVA>(".rva"); + addDirectiveHandler<&COFFAsmParser::ParseDirectiveSymbolAttribute>(".weak"); // Win64 EH directives. addDirectiveHandler<&COFFAsmParser::ParseSEHDirectiveStartProc>( @@ -83,21 +84,10 @@ class COFFAsmParser : public MCAsmParserExtension { ".seh_handler"); addDirectiveHandler<&COFFAsmParser::ParseSEHDirectiveHandlerData>( ".seh_handlerdata"); - addDirectiveHandler<&COFFAsmParser::ParseSEHDirectivePushReg>( - ".seh_pushreg"); - addDirectiveHandler<&COFFAsmParser::ParseSEHDirectiveSetFrame>( - ".seh_setframe"); addDirectiveHandler<&COFFAsmParser::ParseSEHDirectiveAllocStack>( ".seh_stackalloc"); - addDirectiveHandler<&COFFAsmParser::ParseSEHDirectiveSaveReg>( - ".seh_savereg"); - addDirectiveHandler<&COFFAsmParser::ParseSEHDirectiveSaveXMM>( - ".seh_savexmm"); - addDirectiveHandler<&COFFAsmParser::ParseSEHDirectivePushFrame>( - ".seh_pushframe"); addDirectiveHandler<&COFFAsmParser::ParseSEHDirectiveEndProlog>( ".seh_endprologue"); - addDirectiveHandler<&COFFAsmParser::ParseDirectiveSymbolAttribute>(".weak"); } bool ParseSectionDirectiveText(StringRef, SMLoc) { @@ -143,12 +133,7 @@ class COFFAsmParser : public MCAsmParserExtension { bool ParseSEHDirectiveEndChained(StringRef, SMLoc); bool ParseSEHDirectiveHandler(StringRef, SMLoc); bool ParseSEHDirectiveHandlerData(StringRef, SMLoc); - bool ParseSEHDirectivePushReg(StringRef, SMLoc); - bool ParseSEHDirectiveSetFrame(StringRef, SMLoc); bool ParseSEHDirectiveAllocStack(StringRef, SMLoc); - bool ParseSEHDirectiveSaveReg(StringRef, SMLoc); - bool ParseSEHDirectiveSaveXMM(StringRef, SMLoc); - bool ParseSEHDirectivePushFrame(StringRef, SMLoc); bool ParseSEHDirectiveEndProlog(StringRef, SMLoc); bool ParseAtUnwindOrAtExcept(bool &unwind, bool &except); @@ -682,39 +667,6 @@ bool COFFAsmParser::ParseSEHDirectiveHandlerData(StringRef, SMLoc Loc) { return false; } -bool COFFAsmParser::ParseSEHDirectivePushReg(StringRef, SMLoc Loc) { - unsigned Reg = 0; - if (ParseSEHRegisterNumber(Reg)) - return true; - - if (getLexer().isNot(AsmToken::EndOfStatement)) - return TokError("unexpected token in directive"); - - Lex(); - getStreamer().EmitWinCFIPushReg(Reg, Loc); - return false; -} - -bool COFFAsmParser::ParseSEHDirectiveSetFrame(StringRef, SMLoc Loc) { - unsigned Reg = 0; - int64_t Off; - if (ParseSEHRegisterNumber(Reg)) - return true; - if (getLexer().isNot(AsmToken::Comma)) - return TokError("you must specify a stack pointer offset"); - - Lex(); - if (getParser().parseAbsoluteExpression(Off)) - return true; - - if (getLexer().isNot(AsmToken::EndOfStatement)) - return TokError("unexpected token in directive"); - - Lex(); - getStreamer().EmitWinCFISetFrame(Reg, Off, Loc); - return false; -} - bool 
COFFAsmParser::ParseSEHDirectiveAllocStack(StringRef, SMLoc Loc) { int64_t Size; if (getParser().parseAbsoluteExpression(Size)) @@ -728,71 +680,6 @@ bool COFFAsmParser::ParseSEHDirectiveAllocStack(StringRef, SMLoc Loc) { return false; } -bool COFFAsmParser::ParseSEHDirectiveSaveReg(StringRef, SMLoc Loc) { - unsigned Reg = 0; - int64_t Off; - if (ParseSEHRegisterNumber(Reg)) - return true; - if (getLexer().isNot(AsmToken::Comma)) - return TokError("you must specify an offset on the stack"); - - Lex(); - if (getParser().parseAbsoluteExpression(Off)) - return true; - - if (getLexer().isNot(AsmToken::EndOfStatement)) - return TokError("unexpected token in directive"); - - Lex(); - // FIXME: Err on %xmm* registers - getStreamer().EmitWinCFISaveReg(Reg, Off, Loc); - return false; -} - -// FIXME: This method is inherently x86-specific. It should really be in the -// x86 backend. -bool COFFAsmParser::ParseSEHDirectiveSaveXMM(StringRef, SMLoc Loc) { - unsigned Reg = 0; - int64_t Off; - if (ParseSEHRegisterNumber(Reg)) - return true; - if (getLexer().isNot(AsmToken::Comma)) - return TokError("you must specify an offset on the stack"); - - Lex(); - if (getParser().parseAbsoluteExpression(Off)) - return true; - - if (getLexer().isNot(AsmToken::EndOfStatement)) - return TokError("unexpected token in directive"); - - Lex(); - // FIXME: Err on non-%xmm* registers - getStreamer().EmitWinCFISaveXMM(Reg, Off, Loc); - return false; -} - -bool COFFAsmParser::ParseSEHDirectivePushFrame(StringRef, SMLoc Loc) { - bool Code = false; - StringRef CodeID; - if (getLexer().is(AsmToken::At)) { - SMLoc startLoc = getLexer().getLoc(); - Lex(); - if (!getParser().parseIdentifier(CodeID)) { - if (CodeID != "code") - return Error(startLoc, "expected @code"); - Code = true; - } - } - - if (getLexer().isNot(AsmToken::EndOfStatement)) - return TokError("unexpected token in directive"); - - Lex(); - getStreamer().EmitWinCFIPushFrame(Code, Loc); - return false; -} - bool COFFAsmParser::ParseSEHDirectiveEndProlog(StringRef, SMLoc Loc) { Lex(); getStreamer().EmitWinCFIEndProlog(Loc); @@ -816,46 +703,6 @@ bool COFFAsmParser::ParseAtUnwindOrAtExcept(bool &unwind, bool &except) { return false; } -bool COFFAsmParser::ParseSEHRegisterNumber(unsigned &RegNo) { - SMLoc startLoc = getLexer().getLoc(); - if (getLexer().is(AsmToken::Percent)) { - const MCRegisterInfo *MRI = getContext().getRegisterInfo(); - SMLoc endLoc; - unsigned LLVMRegNo; - if (getParser().getTargetParser().ParseRegister(LLVMRegNo,startLoc,endLoc)) - return true; - -#if 0 - // FIXME: TargetAsmInfo::getCalleeSavedRegs() commits a serious layering - // violation so this validation code is disabled. - - // Check that this is a non-volatile register. 
- const unsigned *NVRegs = TAI.getCalleeSavedRegs(); - unsigned i; - for (i = 0; NVRegs[i] != 0; ++i) - if (NVRegs[i] == LLVMRegNo) - break; - if (NVRegs[i] == 0) - return Error(startLoc, "expected non-volatile register"); -#endif - - int SEHRegNo = MRI->getSEHRegNum(LLVMRegNo); - if (SEHRegNo < 0) - return Error(startLoc,"register can't be represented in SEH unwind info"); - RegNo = SEHRegNo; - } - else { - int64_t n; - if (getParser().parseAbsoluteExpression(n)) - return true; - if (n > 15) - return Error(startLoc, "register number is too high"); - RegNo = n; - } - - return false; -} - namespace llvm { MCAsmParserExtension *createCOFFAsmParser() { diff --git a/lib/MC/MCParser/DarwinAsmParser.cpp b/lib/MC/MCParser/DarwinAsmParser.cpp index 1160934dc62c..bd66e5f39c0d 100644 --- a/lib/MC/MCParser/DarwinAsmParser.cpp +++ b/lib/MC/MCParser/DarwinAsmParser.cpp @@ -778,8 +778,8 @@ bool DarwinAsmParser::parseDirectiveSecureLogUnique(StringRef, SMLoc IDLoc) { raw_fd_ostream *OS = getContext().getSecureLog(); if (!OS) { std::error_code EC; - auto NewOS = llvm::make_unique( - StringRef(SecureLogFile), EC, sys::fs::F_Append | sys::fs::F_Text); + auto NewOS = std::make_unique( + StringRef(SecureLogFile), EC, sys::fs::OF_Append | sys::fs::OF_Text); if (EC) return Error(IDLoc, Twine("can't open secure log file: ") + SecureLogFile + " (" + EC.message() + ")"); diff --git a/lib/MC/MCParser/WasmAsmParser.cpp b/lib/MC/MCParser/WasmAsmParser.cpp index 28d4459fecd4..0c242aed706d 100644 --- a/lib/MC/MCParser/WasmAsmParser.cpp +++ b/lib/MC/MCParser/WasmAsmParser.cpp @@ -123,6 +123,7 @@ public: // See use of .init_array in WasmObjectWriter and // TargetLoweringObjectFileWasm .StartsWith(".init_array", SectionKind::getData()) + .StartsWith(".debug_", SectionKind::getMetadata()) .Default(Optional()); if (!Kind.hasValue()) return Parser->Error(Lexer->getLoc(), "unknown section kind: " + Name); diff --git a/lib/MC/MCRegisterInfo.cpp b/lib/MC/MCRegisterInfo.cpp index 4273b876b7bb..d491c0eb7e06 100644 --- a/lib/MC/MCRegisterInfo.cpp +++ b/lib/MC/MCRegisterInfo.cpp @@ -20,15 +20,16 @@ using namespace llvm; -unsigned MCRegisterInfo::getMatchingSuperReg(unsigned Reg, unsigned SubIdx, - const MCRegisterClass *RC) const { +MCRegister +MCRegisterInfo::getMatchingSuperReg(MCRegister Reg, unsigned SubIdx, + const MCRegisterClass *RC) const { for (MCSuperRegIterator Supers(Reg, this); Supers.isValid(); ++Supers) if (RC->contains(*Supers) && Reg == getSubReg(*Supers, SubIdx)) return *Supers; return 0; } -unsigned MCRegisterInfo::getSubReg(unsigned Reg, unsigned Idx) const { +MCRegister MCRegisterInfo::getSubReg(MCRegister Reg, unsigned Idx) const { assert(Idx && Idx < getNumSubRegIndices() && "This is not a subregister index"); // Get a pointer to the corresponding SubRegIndices list. This list has the @@ -40,7 +41,8 @@ unsigned MCRegisterInfo::getSubReg(unsigned Reg, unsigned Idx) const { return 0; } -unsigned MCRegisterInfo::getSubRegIndex(unsigned Reg, unsigned SubReg) const { +unsigned MCRegisterInfo::getSubRegIndex(MCRegister Reg, + MCRegister SubReg) const { assert(SubReg && SubReg < getNumRegs() && "This is not a register"); // Get a pointer to the corresponding SubRegIndices list. This list has the // name of each sub-register in the same order as MCSubRegIterator. 
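The MCRegisterInfo hunks that follow replace the old -1 sentinel of getLLVMRegNum with an Optional result from a sorted-table lookup. A small sketch of that lookup pattern, using std::optional in place of llvm::Optional and illustrative container types, is shown here.

#include <algorithm>
#include <optional>
#include <utility>
#include <vector>

// Table sorted by DWARF register number; a miss returns nullopt instead of
// -1, so callers cannot confuse the sentinel with a real register number.
using RegPair = std::pair<unsigned, unsigned>; // {DwarfReg, LLVMReg}

std::optional<unsigned> lookupLLVMReg(const std::vector<RegPair> &Map,
                                      unsigned DwarfReg) {
  auto It = std::lower_bound(Map.begin(), Map.end(), RegPair{DwarfReg, 0});
  if (It != Map.end() && It->first == DwarfReg)
    return It->second;
  return std::nullopt;
}

A caller then writes "if (auto R = lookupLLVMReg(Map, Reg)) use(*R);", which is the shape the EmitRegisterName change earlier in this patch adopts.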
@@ -63,7 +65,7 @@ unsigned MCRegisterInfo::getSubRegIdxOffset(unsigned Idx) const { return SubRegIdxRanges[Idx].Offset; } -int MCRegisterInfo::getDwarfRegNum(unsigned RegNum, bool isEH) const { +int MCRegisterInfo::getDwarfRegNum(MCRegister RegNum, bool isEH) const { const DwarfLLVMRegPair *M = isEH ? EHL2DwarfRegs : L2DwarfRegs; unsigned Size = isEH ? EHL2DwarfRegsSize : L2DwarfRegsSize; @@ -76,29 +78,18 @@ int MCRegisterInfo::getDwarfRegNum(unsigned RegNum, bool isEH) const { return I->ToReg; } -int MCRegisterInfo::getLLVMRegNum(unsigned RegNum, bool isEH) const { +Optional MCRegisterInfo::getLLVMRegNum(unsigned RegNum, + bool isEH) const { const DwarfLLVMRegPair *M = isEH ? EHDwarf2LRegs : Dwarf2LRegs; unsigned Size = isEH ? EHDwarf2LRegsSize : Dwarf2LRegsSize; if (!M) - return -1; - DwarfLLVMRegPair Key = { RegNum, 0 }; - const DwarfLLVMRegPair *I = std::lower_bound(M, M+Size, Key); - assert(I != M+Size && I->FromReg == RegNum && "Invalid RegNum"); - return I->ToReg; -} - -int MCRegisterInfo::getLLVMRegNumFromEH(unsigned RegNum) const { - const DwarfLLVMRegPair *M = EHDwarf2LRegs; - unsigned Size = EHDwarf2LRegsSize; - - if (!M) - return -1; + return None; DwarfLLVMRegPair Key = { RegNum, 0 }; const DwarfLLVMRegPair *I = std::lower_bound(M, M+Size, Key); - if (I == M+Size || I->FromReg != RegNum) - return -1; - return I->ToReg; + if (I != M + Size && I->FromReg == RegNum) + return I->ToReg; + return None; } int MCRegisterInfo::getDwarfRegNumFromDwarfEHRegNum(unsigned RegNum) const { @@ -110,22 +101,21 @@ int MCRegisterInfo::getDwarfRegNumFromDwarfEHRegNum(unsigned RegNum) const { // a corresponding LLVM register number at all. So if we can't map the // EH register number to an LLVM register number, assume it's just a // valid DWARF register number as is. - int LRegNum = getLLVMRegNumFromEH(RegNum); - if (LRegNum != -1) - return getDwarfRegNum(LRegNum, false); + if (Optional LRegNum = getLLVMRegNum(RegNum, true)) + return getDwarfRegNum(*LRegNum, false); return RegNum; } -int MCRegisterInfo::getSEHRegNum(unsigned RegNum) const { - const DenseMap::const_iterator I = L2SEHRegs.find(RegNum); +int MCRegisterInfo::getSEHRegNum(MCRegister RegNum) const { + const DenseMap::const_iterator I = L2SEHRegs.find(RegNum); if (I == L2SEHRegs.end()) return (int)RegNum; return I->second; } -int MCRegisterInfo::getCodeViewRegNum(unsigned RegNum) const { +int MCRegisterInfo::getCodeViewRegNum(MCRegister RegNum) const { if (L2CVRegs.empty()) report_fatal_error("target does not implement codeview register mapping"); - const DenseMap::const_iterator I = L2CVRegs.find(RegNum); + const DenseMap::const_iterator I = L2CVRegs.find(RegNum); if (I == L2CVRegs.end()) report_fatal_error("unknown codeview register " + (RegNum < getNumRegs() ? 
getName(RegNum) diff --git a/lib/MC/MCSectionXCOFF.cpp b/lib/MC/MCSectionXCOFF.cpp index d1a637345024..d52959f15f92 100644 --- a/lib/MC/MCSectionXCOFF.cpp +++ b/lib/MC/MCSectionXCOFF.cpp @@ -15,19 +15,65 @@ using namespace llvm; MCSectionXCOFF::~MCSectionXCOFF() = default; +static StringRef getMappingClassString(XCOFF::StorageMappingClass SMC) { + switch (SMC) { + case XCOFF::XMC_DS: + return "DS"; + case XCOFF::XMC_RW: + return "RW"; + case XCOFF::XMC_PR: + return "PR"; + default: + report_fatal_error("Unhandled storage-mapping class."); + } +} + void MCSectionXCOFF::PrintSwitchToSection(const MCAsmInfo &MAI, const Triple &T, raw_ostream &OS, const MCExpr *Subsection) const { if (getKind().isText()) { + if (getMappingClass() != XCOFF::XMC_PR) + report_fatal_error("Unhandled storage-mapping class for .text csect"); + OS << "\t.csect " << getSectionName() << "[" - << "PR" + << getMappingClassString(getMappingClass()) << "]" << '\n'; return; } + if (getKind().isData()) { + switch (getMappingClass()) { + case XCOFF::XMC_RW: + case XCOFF::XMC_DS: + OS << "\t.csect " << getSectionName() << "[" + << getMappingClassString(getMappingClass()) << "]" << '\n'; + break; + case XCOFF::XMC_TC0: + OS << "\t.toc\n"; + break; + default: + report_fatal_error( + "Unhandled storage-mapping class for .data csect."); + } + return; + } + + if (getKind().isBSSLocal() || getKind().isCommon()) { + assert((getMappingClass() == XCOFF::XMC_RW || + getMappingClass() == XCOFF::XMC_BS) && + "Generated a storage-mapping class for a common/bss csect we don't " + "understand how to switch to."); + assert(getCSectType() == XCOFF::XTY_CM && + "wrong csect type for .bss csect"); + // Don't have to print a directive for switching to section for commons. + // '.comm' and '.lcomm' directives for the variable will create the needed + // csect. + return; + } + report_fatal_error("Printing for this SectionKind is unimplemented."); } bool MCSectionXCOFF::UseCodeAlign() const { return getKind().isText(); } -bool MCSectionXCOFF::isVirtualSection() const { return !getKind().isCommon(); } +bool MCSectionXCOFF::isVirtualSection() const { return XCOFF::XTY_CM == Type; } diff --git a/lib/MC/MCStreamer.cpp b/lib/MC/MCStreamer.cpp index decbb96817e3..b8278cb11079 100644 --- a/lib/MC/MCStreamer.cpp +++ b/lib/MC/MCStreamer.cpp @@ -12,6 +12,7 @@ #include "llvm/ADT/StringRef.h" #include "llvm/ADT/Twine.h" #include "llvm/BinaryFormat/COFF.h" +#include "llvm/DebugInfo/CodeView/SymbolRecord.h" #include "llvm/MC/MCAsmBackend.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCCodeView.h" @@ -21,6 +22,8 @@ #include "llvm/MC/MCInst.h" #include "llvm/MC/MCInstPrinter.h" #include "llvm/MC/MCObjectFileInfo.h" +#include "llvm/MC/MCRegister.h" +#include "llvm/MC/MCRegisterInfo.h" #include "llvm/MC/MCSection.h" #include "llvm/MC/MCSectionCOFF.h" #include "llvm/MC/MCSymbol.h" @@ -327,10 +330,56 @@ void MCStreamer::EmitCVInlineLinetableDirective(unsigned PrimaryFunctionId, const MCSymbol *FnStartSym, const MCSymbol *FnEndSym) {} +/// Only call this on endian-specific types like ulittle16_t and little32_t, or +/// structs composed of them. 
+template +static void copyBytesForDefRange(SmallString<20> &BytePrefix, + codeview::SymbolKind SymKind, + const T &DefRangeHeader) { + BytePrefix.resize(2 + sizeof(T)); + codeview::ulittle16_t SymKindLE = codeview::ulittle16_t(SymKind); + memcpy(&BytePrefix[0], &SymKindLE, 2); + memcpy(&BytePrefix[2], &DefRangeHeader, sizeof(T)); +} + void MCStreamer::EmitCVDefRangeDirective( ArrayRef> Ranges, StringRef FixedSizePortion) {} +void MCStreamer::EmitCVDefRangeDirective( + ArrayRef> Ranges, + codeview::DefRangeRegisterRelHeader DRHdr) { + SmallString<20> BytePrefix; + copyBytesForDefRange(BytePrefix, codeview::S_DEFRANGE_REGISTER_REL, DRHdr); + EmitCVDefRangeDirective(Ranges, BytePrefix); +} + +void MCStreamer::EmitCVDefRangeDirective( + ArrayRef> Ranges, + codeview::DefRangeSubfieldRegisterHeader DRHdr) { + SmallString<20> BytePrefix; + copyBytesForDefRange(BytePrefix, codeview::S_DEFRANGE_SUBFIELD_REGISTER, + DRHdr); + EmitCVDefRangeDirective(Ranges, BytePrefix); +} + +void MCStreamer::EmitCVDefRangeDirective( + ArrayRef> Ranges, + codeview::DefRangeRegisterHeader DRHdr) { + SmallString<20> BytePrefix; + copyBytesForDefRange(BytePrefix, codeview::S_DEFRANGE_REGISTER, DRHdr); + EmitCVDefRangeDirective(Ranges, BytePrefix); +} + +void MCStreamer::EmitCVDefRangeDirective( + ArrayRef> Ranges, + codeview::DefRangeFramePointerRelHeader DRHdr) { + SmallString<20> BytePrefix; + copyBytesForDefRange(BytePrefix, codeview::S_DEFRANGE_FRAMEPOINTER_REL, + DRHdr); + EmitCVDefRangeDirective(Ranges, BytePrefix); +} + void MCStreamer::EmitEHSymAttributes(const MCSymbol *Symbol, MCSymbol *EHSymbol) { } @@ -631,7 +680,7 @@ void MCStreamer::EmitWinCFIStartProc(const MCSymbol *Symbol, SMLoc Loc) { MCSymbol *StartProc = EmitCFILabel(); WinFrameInfos.emplace_back( - llvm::make_unique(Symbol, StartProc)); + std::make_unique(Symbol, StartProc)); CurrentWinFrameInfo = WinFrameInfos.back().get(); CurrentWinFrameInfo->TextSection = getCurrentSectionOnly(); } @@ -665,7 +714,7 @@ void MCStreamer::EmitWinCFIStartChained(SMLoc Loc) { MCSymbol *StartProc = EmitCFILabel(); - WinFrameInfos.emplace_back(llvm::make_unique( + WinFrameInfos.emplace_back(std::make_unique( CurFrame->Function, StartProc, CurFrame)); CurrentWinFrameInfo = WinFrameInfos.back().get(); CurrentWinFrameInfo->TextSection = getCurrentSectionOnly(); @@ -763,18 +812,23 @@ MCSection *MCStreamer::getAssociatedXDataSection(const MCSection *TextSec) { void MCStreamer::EmitSyntaxDirective() {} -void MCStreamer::EmitWinCFIPushReg(unsigned Register, SMLoc Loc) { +static unsigned encodeSEHRegNum(MCContext &Ctx, MCRegister Reg) { + return Ctx.getRegisterInfo()->getSEHRegNum(Reg); +} + +void MCStreamer::EmitWinCFIPushReg(MCRegister Register, SMLoc Loc) { WinEH::FrameInfo *CurFrame = EnsureValidWinFrameInfo(Loc); if (!CurFrame) return; MCSymbol *Label = EmitCFILabel(); - WinEH::Instruction Inst = Win64EH::Instruction::PushNonVol(Label, Register); + WinEH::Instruction Inst = Win64EH::Instruction::PushNonVol( + Label, encodeSEHRegNum(Context, Register)); CurFrame->Instructions.push_back(Inst); } -void MCStreamer::EmitWinCFISetFrame(unsigned Register, unsigned Offset, +void MCStreamer::EmitWinCFISetFrame(MCRegister Register, unsigned Offset, SMLoc Loc) { WinEH::FrameInfo *CurFrame = EnsureValidWinFrameInfo(Loc); if (!CurFrame) @@ -790,8 +844,8 @@ void MCStreamer::EmitWinCFISetFrame(unsigned Register, unsigned Offset, MCSymbol *Label = EmitCFILabel(); - WinEH::Instruction Inst = - Win64EH::Instruction::SetFPReg(Label, Register, Offset); + WinEH::Instruction Inst = 
Win64EH::Instruction::SetFPReg( + Label, encodeSEHRegNum(getContext(), Register), Offset); CurFrame->LastFrameInst = CurFrame->Instructions.size(); CurFrame->Instructions.push_back(Inst); } @@ -813,7 +867,7 @@ void MCStreamer::EmitWinCFIAllocStack(unsigned Size, SMLoc Loc) { CurFrame->Instructions.push_back(Inst); } -void MCStreamer::EmitWinCFISaveReg(unsigned Register, unsigned Offset, +void MCStreamer::EmitWinCFISaveReg(MCRegister Register, unsigned Offset, SMLoc Loc) { WinEH::FrameInfo *CurFrame = EnsureValidWinFrameInfo(Loc); if (!CurFrame) @@ -825,12 +879,12 @@ void MCStreamer::EmitWinCFISaveReg(unsigned Register, unsigned Offset, MCSymbol *Label = EmitCFILabel(); - WinEH::Instruction Inst = - Win64EH::Instruction::SaveNonVol(Label, Register, Offset); + WinEH::Instruction Inst = Win64EH::Instruction::SaveNonVol( + Label, encodeSEHRegNum(Context, Register), Offset); CurFrame->Instructions.push_back(Inst); } -void MCStreamer::EmitWinCFISaveXMM(unsigned Register, unsigned Offset, +void MCStreamer::EmitWinCFISaveXMM(MCRegister Register, unsigned Offset, SMLoc Loc) { WinEH::FrameInfo *CurFrame = EnsureValidWinFrameInfo(Loc); if (!CurFrame) @@ -840,8 +894,8 @@ void MCStreamer::EmitWinCFISaveXMM(unsigned Register, unsigned Offset, MCSymbol *Label = EmitCFILabel(); - WinEH::Instruction Inst = - Win64EH::Instruction::SaveXMM(Label, Register, Offset); + WinEH::Instruction Inst = Win64EH::Instruction::SaveXMM( + Label, encodeSEHRegNum(Context, Register), Offset); CurFrame->Instructions.push_back(Inst); } @@ -1009,6 +1063,10 @@ void MCStreamer::EmitCOFFSymbolStorageClass(int StorageClass) { void MCStreamer::EmitCOFFSymbolType(int Type) { llvm_unreachable("this directive only supported on COFF targets"); } +void MCStreamer::EmitXCOFFLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size, + unsigned ByteAlign) { + llvm_unreachable("this directive only supported on XCOFF targets"); +} void MCStreamer::emitELFSize(MCSymbol *Symbol, const MCExpr *Value) {} void MCStreamer::emitELFSymverDirective(StringRef AliasName, const MCSymbol *Aliasee) {} diff --git a/lib/MC/MCSubtargetInfo.cpp b/lib/MC/MCSubtargetInfo.cpp index 5fd48d9e1010..c8678df02bfd 100644 --- a/lib/MC/MCSubtargetInfo.cpp +++ b/lib/MC/MCSubtargetInfo.cpp @@ -315,3 +315,28 @@ void MCSubtargetInfo::initInstrItins(InstrItineraryData &InstrItins) const { InstrItins = InstrItineraryData(getSchedModel(), Stages, OperandCycles, ForwardingPaths); } + +Optional MCSubtargetInfo::getCacheSize(unsigned Level) const { + return Optional(); +} + +Optional +MCSubtargetInfo::getCacheAssociativity(unsigned Level) const { + return Optional(); +} + +Optional MCSubtargetInfo::getCacheLineSize(unsigned Level) const { + return Optional(); +} + +unsigned MCSubtargetInfo::getPrefetchDistance() const { + return 0; +} + +unsigned MCSubtargetInfo::getMaxPrefetchIterationsAhead() const { + return UINT_MAX; +} + +unsigned MCSubtargetInfo::getMinPrefetchStride() const { + return 1; +} diff --git a/lib/MC/MCWasmObjectTargetWriter.cpp b/lib/MC/MCWasmObjectTargetWriter.cpp index e46257823e34..1ccb3a58d5c1 100644 --- a/lib/MC/MCWasmObjectTargetWriter.cpp +++ b/lib/MC/MCWasmObjectTargetWriter.cpp @@ -10,8 +10,9 @@ using namespace llvm; -MCWasmObjectTargetWriter::MCWasmObjectTargetWriter(bool Is64Bit) - : Is64Bit(Is64Bit) {} +MCWasmObjectTargetWriter::MCWasmObjectTargetWriter(bool Is64Bit, + bool IsEmscripten) + : Is64Bit(Is64Bit), IsEmscripten(IsEmscripten) {} // Pin the vtable to this object file MCWasmObjectTargetWriter::~MCWasmObjectTargetWriter() = default; diff --git 
a/lib/MC/MCWasmStreamer.cpp b/lib/MC/MCWasmStreamer.cpp index 86fa72197855..e7e96ecbb3a0 100644 --- a/lib/MC/MCWasmStreamer.cpp +++ b/lib/MC/MCWasmStreamer.cpp @@ -122,7 +122,7 @@ bool MCWasmStreamer::EmitSymbolAttribute(MCSymbol *S, MCSymbolAttr Attribute) { break; case MCSA_NoDeadStrip: - Symbol->setExported(); + Symbol->setNoStrip(); break; default: diff --git a/lib/MC/MCWinCOFFStreamer.cpp b/lib/MC/MCWinCOFFStreamer.cpp index 04d5f100a2ff..c5a21312140b 100644 --- a/lib/MC/MCWinCOFFStreamer.cpp +++ b/lib/MC/MCWinCOFFStreamer.cpp @@ -88,7 +88,19 @@ void MCWinCOFFStreamer::EmitLabel(MCSymbol *S, SMLoc Loc) { } void MCWinCOFFStreamer::EmitAssemblerFlag(MCAssemblerFlag Flag) { - llvm_unreachable("not implemented"); + // Let the target do whatever target specific stuff it needs to do. + getAssembler().getBackend().handleAssemblerFlag(Flag); + + switch (Flag) { + // None of these require COFF specific handling. + case MCAF_SyntaxUnified: + case MCAF_Code16: + case MCAF_Code32: + case MCAF_Code64: + break; + case MCAF_SubsectionsViaSymbols: + llvm_unreachable("COFF doesn't support .subsections_via_symbols"); + } } void MCWinCOFFStreamer::EmitThumbFunc(MCSymbol *Func) { @@ -180,7 +192,7 @@ void MCWinCOFFStreamer::EmitCOFFSafeSEH(MCSymbol const *Symbol) { MCSection *SXData = getContext().getObjectFileInfo()->getSXDataSection(); getAssembler().registerSection(*SXData); if (SXData->getAlignment() < 4) - SXData->setAlignment(4); + SXData->setAlignment(Align(4)); new MCSymbolIdFragment(Symbol, SXData); @@ -197,7 +209,7 @@ void MCWinCOFFStreamer::EmitCOFFSymbolIndex(MCSymbol const *Symbol) { MCSection *Sec = getCurrentSectionOnly(); getAssembler().registerSection(*Sec); if (Sec->getAlignment() < 4) - Sec->setAlignment(4); + Sec->setAlignment(Align(4)); new MCSymbolIdFragment(Symbol, getCurrentSectionOnly()); diff --git a/lib/MC/MCXCOFFStreamer.cpp b/lib/MC/MCXCOFFStreamer.cpp index 071de024a3fa..50937d6adc0c 100644 --- a/lib/MC/MCXCOFFStreamer.cpp +++ b/lib/MC/MCXCOFFStreamer.cpp @@ -10,10 +10,12 @@ // //===----------------------------------------------------------------------===// -#include "llvm/MC/MCXCOFFStreamer.h" +#include "llvm/BinaryFormat/XCOFF.h" #include "llvm/MC/MCAsmBackend.h" #include "llvm/MC/MCCodeEmitter.h" #include "llvm/MC/MCObjectWriter.h" +#include "llvm/MC/MCSymbolXCOFF.h" +#include "llvm/MC/MCXCOFFStreamer.h" #include "llvm/Support/TargetRegistry.h" using namespace llvm; @@ -25,14 +27,38 @@ MCXCOFFStreamer::MCXCOFFStreamer(MCContext &Context, : MCObjectStreamer(Context, std::move(MAB), std::move(OW), std::move(Emitter)) {} -bool MCXCOFFStreamer::EmitSymbolAttribute(MCSymbol *Symbol, +bool MCXCOFFStreamer::EmitSymbolAttribute(MCSymbol *Sym, MCSymbolAttr Attribute) { - report_fatal_error("Symbol attributes not implemented for XCOFF."); + auto *Symbol = cast(Sym); + getAssembler().registerSymbol(*Symbol); + + switch (Attribute) { + case MCSA_Global: + Symbol->setStorageClass(XCOFF::C_EXT); + Symbol->setExternal(true); + break; + default: + report_fatal_error("Not implemented yet."); + } + return true; } void MCXCOFFStreamer::EmitCommonSymbol(MCSymbol *Symbol, uint64_t Size, unsigned ByteAlignment) { - report_fatal_error("Emiting common symbols not implemented for XCOFF."); + getAssembler().registerSymbol(*Symbol); + Symbol->setExternal(cast(Symbol)->getStorageClass() != + XCOFF::C_HIDEXT); + Symbol->setCommon(Size, ByteAlignment); + + // Need to add this symbol to the current Fragment which will belong to the + // containing CSECT. 
+ auto *F = dyn_cast_or_null(getCurrentFragment()); + assert(F && "Expected a valid section with a fragment set."); + Symbol->setFragment(F); + + // Emit the alignment and storage for the variable to the section. + EmitValueToAlignment(ByteAlignment); + EmitZeros(Size); } void MCXCOFFStreamer::EmitZerofill(MCSection *Section, MCSymbol *Symbol, @@ -42,8 +68,18 @@ void MCXCOFFStreamer::EmitZerofill(MCSection *Section, MCSymbol *Symbol, } void MCXCOFFStreamer::EmitInstToData(const MCInst &Inst, - const MCSubtargetInfo &) { - report_fatal_error("Instruction emission not implemented for XCOFF."); + const MCSubtargetInfo &STI) { + MCAssembler &Assembler = getAssembler(); + SmallVector Fixups; + SmallString<256> Code; + raw_svector_ostream VecOS(Code); + Assembler.getEmitter().encodeInstruction(Inst, VecOS, Fixups, STI); + + // TODO: Handle Fixups later + + MCDataFragment *DF = getOrCreateDataFragment(&STI); + DF->setHasInstructions(STI); + DF->getContents().append(Code.begin(), Code.end()); } MCStreamer *llvm::createXCOFFStreamer(MCContext &Context, @@ -57,3 +93,9 @@ MCStreamer *llvm::createXCOFFStreamer(MCContext &Context, S->getAssembler().setRelaxAll(true); return S; } + +void MCXCOFFStreamer::EmitXCOFFLocalCommonSymbol(MCSymbol *Symbol, + uint64_t Size, + unsigned ByteAlignment) { + EmitCommonSymbol(Symbol, Size, ByteAlignment); +} diff --git a/lib/MC/MachObjectWriter.cpp b/lib/MC/MachObjectWriter.cpp index f0ceb86b25af..9f6af981aca1 100644 --- a/lib/MC/MachObjectWriter.cpp +++ b/lib/MC/MachObjectWriter.cpp @@ -25,6 +25,7 @@ #include "llvm/MC/MCSymbol.h" #include "llvm/MC/MCSymbolMachO.h" #include "llvm/MC/MCValue.h" +#include "llvm/Support/Alignment.h" #include "llvm/Support/Casting.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" @@ -126,7 +127,7 @@ uint64_t MachObjectWriter::getPaddingSize(const MCSection *Sec, const MCSection &NextSec = *Layout.getSectionOrder()[Next]; if (NextSec.isVirtualSection()) return 0; - return OffsetToAlignment(EndAddr, NextSec.getAlignment()); + return offsetToAlignment(EndAddr, Align(NextSec.getAlignment())); } void MachObjectWriter::writeHeader(MachO::HeaderFileType Type, @@ -444,7 +445,8 @@ void MachObjectWriter::writeLinkerOptionsLoadCommand( } // Pad to a multiple of the pointer size. - W.OS.write_zeros(OffsetToAlignment(BytesWritten, is64Bit() ? 8 : 4)); + W.OS.write_zeros( + offsetToAlignment(BytesWritten, is64Bit() ? Align(8) : Align(4))); assert(W.OS.tell() - Start == Size); } @@ -832,7 +834,8 @@ uint64_t MachObjectWriter::writeObject(MCAssembler &Asm, // The section data is padded to 4 bytes. // // FIXME: Is this machine dependent? - unsigned SectionDataPadding = OffsetToAlignment(SectionDataFileSize, 4); + unsigned SectionDataPadding = + offsetToAlignment(SectionDataFileSize, Align(4)); SectionDataFileSize += SectionDataPadding; // Write the prolog, starting with the header and load command... @@ -997,7 +1000,8 @@ uint64_t MachObjectWriter::writeObject(MCAssembler &Asm, #endif Asm.getLOHContainer().emit(*this, Layout); // Pad to a multiple of the pointer size. - W.OS.write_zeros(OffsetToAlignment(LOHRawSize, is64Bit() ? 8 : 4)); + W.OS.write_zeros( + offsetToAlignment(LOHRawSize, is64Bit() ? 
Align(8) : Align(4))); assert(W.OS.tell() - Start == LOHSize); } @@ -1043,6 +1047,6 @@ uint64_t MachObjectWriter::writeObject(MCAssembler &Asm, std::unique_ptr llvm::createMachObjectWriter(std::unique_ptr MOTW, raw_pwrite_stream &OS, bool IsLittleEndian) { - return llvm::make_unique(std::move(MOTW), OS, + return std::make_unique(std::move(MOTW), OS, IsLittleEndian); } diff --git a/lib/MC/StringTableBuilder.cpp b/lib/MC/StringTableBuilder.cpp index cb3db8e2268c..c9c88ec58432 100644 --- a/lib/MC/StringTableBuilder.cpp +++ b/lib/MC/StringTableBuilder.cpp @@ -38,6 +38,7 @@ void StringTableBuilder::initSize() { // Start the table with a NUL byte. Size = 1; break; + case XCOFF: case WinCOFF: // Make room to write the table size later. Size = 4; @@ -67,9 +68,12 @@ void StringTableBuilder::write(uint8_t *Buf) const { if (!Data.empty()) memcpy(Buf + P.second, Data.data(), Data.size()); } - if (K != WinCOFF) - return; - support::endian::write32le(Buf, Size); + // The COFF formats store the size of the string table in the first 4 bytes. + // For Windows, the format is little-endian; for AIX, it is big-endian. + if (K == WinCOFF) + support::endian::write32le(Buf, Size); + else if (K == XCOFF) + support::endian::write32be(Buf, Size); } // Returns the character at Pos from end of a string. diff --git a/lib/MC/WasmObjectWriter.cpp b/lib/MC/WasmObjectWriter.cpp index 098343cd0107..c1ff3cc2480c 100644 --- a/lib/MC/WasmObjectWriter.cpp +++ b/lib/MC/WasmObjectWriter.cpp @@ -258,6 +258,7 @@ class WasmObjectWriter : public MCObjectWriter { // TargetObjectWriter wrappers. bool is64Bit() const { return TargetObjectWriter->is64Bit(); } + bool isEmscripten() const { return TargetObjectWriter->isEmscripten(); } void startSection(SectionBookkeeping &Section, unsigned SectionId); void startCustomSection(SectionBookkeeping &Section, StringRef Name); @@ -426,9 +427,10 @@ void WasmObjectWriter::recordRelocation(MCAssembler &Asm, const MCFragment *Fragment, const MCFixup &Fixup, MCValue Target, uint64_t &FixedValue) { - MCAsmBackend &Backend = Asm.getBackend(); - bool IsPCRel = Backend.getFixupKindInfo(Fixup.getKind()).Flags & - MCFixupKindInfo::FKF_IsPCRel; + // The WebAssembly backend should never generate FKF_IsPCRel fixups + assert(!(Asm.getBackend().getFixupKindInfo(Fixup.getKind()).Flags & + MCFixupKindInfo::FKF_IsPCRel)); + const auto &FixupSection = cast(*Fragment->getParent()); uint64_t C = Target.getConstant(); uint64_t FixupOffset = Layout.getFragmentOffset(Fragment) + Fixup.getOffset(); @@ -439,51 +441,22 @@ void WasmObjectWriter::recordRelocation(MCAssembler &Asm, return; if (const MCSymbolRefExpr *RefB = Target.getSymB()) { - assert(RefB->getKind() == MCSymbolRefExpr::VK_None && - "Should not have constructed this"); - - // Let A, B and C being the components of Target and R be the location of - // the fixup. If the fixup is not pcrel, we want to compute (A - B + C). - // If it is pcrel, we want to compute (A - B + C - R). - - // In general, Wasm has no relocations for -B. It can only represent (A + C) - // or (A + C - R). If B = R + K and the relocation is not pcrel, we can - // replace B to implement it: (A - R - K + C) - if (IsPCRel) { - Ctx.reportError( - Fixup.getLoc(), - "No relocation available to represent this relative expression"); - return; - } - + // To get here the A - B expression must have failed evaluateAsRelocatable. + // This means either A or B must be undefined and in WebAssembly we can't + // support either of those cases. 
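// A rough model of why the error path below fires for A - B expressions that
// survive to relocation time: the difference is a link-time constant only when
// both symbols are defined in the same section (in which case it was already
// folded by relocatable evaluation); anything else would need a relocation for
// "-B", which the Wasm format does not have. Types and values here are
// invented for illustration, not the MC evaluation logic.
#include <cstdint>
#include <optional>

struct Sym {
  bool Defined;
  int Section;     // arbitrary section id
  uint64_t Offset; // offset within that section
};

static std::optional<int64_t> foldDifference(const Sym &A, const Sym &B) {
  if (!A.Defined || !B.Defined || A.Section != B.Section)
    return std::nullopt; // must be rejected, as in recordRelocation
  return static_cast<int64_t>(A.Offset) - static_cast<int64_t>(B.Offset);
}

int main() {
  Sym A{true, 1, 0x40}, B{true, 1, 0x10}, Undef{false, 0, 0};
  return (foldDifference(A, B).value_or(-1) == 0x30 && // foldable
          !foldDifference(A, Undef))                   // reported as an error
             ? 0
             : 1;
}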
const auto &SymB = cast(RefB->getSymbol()); - - if (SymB.isUndefined()) { - Ctx.reportError(Fixup.getLoc(), - Twine("symbol '") + SymB.getName() + - "' can not be undefined in a subtraction expression"); - return; - } - - assert(!SymB.isAbsolute() && "Should have been folded"); - const MCSection &SecB = SymB.getSection(); - if (&SecB != &FixupSection) { - Ctx.reportError(Fixup.getLoc(), - "Cannot represent a difference across sections"); - return; - } - - uint64_t SymBOffset = Layout.getSymbolOffset(SymB); - uint64_t K = SymBOffset - FixupOffset; - IsPCRel = true; - C -= K; + Ctx.reportError( + Fixup.getLoc(), + Twine("symbol '") + SymB.getName() + + "': unsupported subtraction expression used in relocation."); + return; } // We either rejected the fixup or folded B into C at this point. const MCSymbolRefExpr *RefA = Target.getSymA(); - const auto *SymA = RefA ? cast(&RefA->getSymbol()) : nullptr; + const auto *SymA = cast(&RefA->getSymbol()); - if (SymA && SymA->isVariable()) { + if (SymA->isVariable()) { const MCExpr *Expr = SymA->getVariableValue(); const auto *Inner = cast(Expr); if (Inner->getKind() == MCSymbolRefExpr::VK_WEAKREF) @@ -496,8 +469,6 @@ void WasmObjectWriter::recordRelocation(MCAssembler &Asm, FixedValue = 0; unsigned Type = TargetObjectWriter->getRelocType(Target, Fixup); - assert(!IsPCRel); - assert(SymA); // Absolute offset within a section or a function. // Currently only supported for for metadata sections. @@ -1296,12 +1267,12 @@ uint64_t WasmObjectWriter::writeObject(MCAssembler &Asm, // Separate out the producers and target features sections if (Name == "producers") { - ProducersSection = llvm::make_unique(Name, &Section); + ProducersSection = std::make_unique(Name, &Section); continue; } if (Name == "target_features") { TargetFeaturesSection = - llvm::make_unique(Name, &Section); + std::make_unique(Name, &Section); continue; } @@ -1379,7 +1350,9 @@ uint64_t WasmObjectWriter::writeObject(MCAssembler &Asm, report_fatal_error(".size expression must be evaluatable"); auto &DataSection = static_cast(WS.getSection()); - assert(DataSection.isWasmData()); + if (!DataSection.isWasmData()) + report_fatal_error("data symbols must live in a data section: " + + WS.getName()); // For each data symbol, export it in the symtab as a reference to the // corresponding Wasm data segment. 
@@ -1473,8 +1446,12 @@ uint64_t WasmObjectWriter::writeObject(MCAssembler &Asm, Flags |= wasm::WASM_SYMBOL_BINDING_LOCAL; if (WS.isUndefined()) Flags |= wasm::WASM_SYMBOL_UNDEFINED; - if (WS.isExported()) - Flags |= wasm::WASM_SYMBOL_EXPORTED; + if (WS.isNoStrip()) { + Flags |= wasm::WASM_SYMBOL_NO_STRIP; + if (isEmscripten()) { + Flags |= wasm::WASM_SYMBOL_EXPORTED; + } + } if (WS.getName() != WS.getImportName()) Flags |= wasm::WASM_SYMBOL_EXPLICIT_NAME; @@ -1618,5 +1595,5 @@ uint64_t WasmObjectWriter::writeObject(MCAssembler &Asm, std::unique_ptr llvm::createWasmObjectWriter(std::unique_ptr MOTW, raw_pwrite_stream &OS) { - return llvm::make_unique(std::move(MOTW), OS); + return std::make_unique(std::move(MOTW), OS); } diff --git a/lib/MC/WinCOFFObjectWriter.cpp b/lib/MC/WinCOFFObjectWriter.cpp index 0e6c05bc726d..749ed8badfaa 100644 --- a/lib/MC/WinCOFFObjectWriter.cpp +++ b/lib/MC/WinCOFFObjectWriter.cpp @@ -31,10 +31,10 @@ #include "llvm/MC/MCValue.h" #include "llvm/MC/MCWinCOFFObjectWriter.h" #include "llvm/MC/StringTableBuilder.h" +#include "llvm/Support/CRC.h" #include "llvm/Support/Casting.h" #include "llvm/Support/Endian.h" #include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/JamCRC.h" #include "llvm/Support/LEB128.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" @@ -239,7 +239,7 @@ WinCOFFObjectWriter::WinCOFFObjectWriter( } COFFSymbol *WinCOFFObjectWriter::createSymbol(StringRef Name) { - Symbols.push_back(make_unique(Name)); + Symbols.push_back(std::make_unique(Name)); return Symbols.back().get(); } @@ -251,7 +251,7 @@ COFFSymbol *WinCOFFObjectWriter::GetOrCreateCOFFSymbol(const MCSymbol *Symbol) { } COFFSection *WinCOFFObjectWriter::createSection(StringRef Name) { - Sections.emplace_back(make_unique(Name)); + Sections.emplace_back(std::make_unique(Name)); return Sections.back().get(); } @@ -605,7 +605,7 @@ uint32_t WinCOFFObjectWriter::writeSectionContents(MCAssembler &Asm, // Calculate our CRC with an initial value of '0', this is not how // JamCRC is specified but it aligns with the expected output. JamCRC JC(/*Init=*/0); - JC.update(Buf); + JC.update(makeArrayRef(reinterpret_cast(Buf.data()), Buf.size())); return JC.getCRC(); } @@ -1098,5 +1098,5 @@ void MCWinCOFFObjectTargetWriter::anchor() {} std::unique_ptr llvm::createWinCOFFObjectWriter( std::unique_ptr MOTW, raw_pwrite_stream &OS) { - return llvm::make_unique(std::move(MOTW), OS); + return std::make_unique(std::move(MOTW), OS); } diff --git a/lib/MC/XCOFFObjectWriter.cpp b/lib/MC/XCOFFObjectWriter.cpp index 9b9a7b6c118c..353c21068735 100644 --- a/lib/MC/XCOFFObjectWriter.cpp +++ b/lib/MC/XCOFFObjectWriter.cpp @@ -10,18 +10,135 @@ // //===----------------------------------------------------------------------===// +#include "llvm/BinaryFormat/XCOFF.h" +#include "llvm/MC/MCAsmLayout.h" #include "llvm/MC/MCAssembler.h" #include "llvm/MC/MCObjectWriter.h" +#include "llvm/MC/MCSectionXCOFF.h" +#include "llvm/MC/MCSymbolXCOFF.h" #include "llvm/MC/MCValue.h" #include "llvm/MC/MCXCOFFObjectWriter.h" +#include "llvm/MC/StringTableBuilder.h" +#include "llvm/Support/Error.h" +#include "llvm/Support/MathExtras.h" + +#include using namespace llvm; +// An XCOFF object file has a limited set of predefined sections. The most +// important ones for us (right now) are: +// .text --> contains program code and read-only data. +// .data --> contains initialized data, function descriptors, and the TOC. +// .bss --> contains uninitialized data. 
+// Each of these sections is composed of 'Control Sections'. A Control Section +// is more commonly referred to as a csect. A csect is an indivisible unit of +// code or data, and acts as a container for symbols. A csect is mapped +// into a section based on its storage-mapping class, with the exception of +// XMC_RW which gets mapped to either .data or .bss based on whether it's +// explicitly initialized or not. +// +// We don't represent the sections in the MC layer as there is nothing +// interesting about them at at that level: they carry information that is +// only relevant to the ObjectWriter, so we materialize them in this class. namespace { +constexpr unsigned DefaultSectionAlign = 4; + +// Packs the csect's alignment and type into a byte. +uint8_t getEncodedType(const MCSectionXCOFF *); + +// Wrapper around an MCSymbolXCOFF. +struct Symbol { + const MCSymbolXCOFF *const MCSym; + uint32_t SymbolTableIndex; + + XCOFF::StorageClass getStorageClass() const { + return MCSym->getStorageClass(); + } + StringRef getName() const { return MCSym->getName(); } + Symbol(const MCSymbolXCOFF *MCSym) : MCSym(MCSym), SymbolTableIndex(-1) {} +}; + +// Wrapper for an MCSectionXCOFF. +struct ControlSection { + const MCSectionXCOFF *const MCCsect; + uint32_t SymbolTableIndex; + uint32_t Address; + uint32_t Size; + + SmallVector Syms; + StringRef getName() const { return MCCsect->getSectionName(); } + ControlSection(const MCSectionXCOFF *MCSec) + : MCCsect(MCSec), SymbolTableIndex(-1), Address(-1), Size(0) {} +}; + +// Represents the data related to a section excluding the csects that make up +// the raw data of the section. The csects are stored separately as not all +// sections contain csects, and some sections contain csects which are better +// stored separately, e.g. the .data section containing read-write, descriptor, +// TOCBase and TOC-entry csects. +struct Section { + char Name[XCOFF::NameSize]; + // The physical/virtual address of the section. For an object file + // these values are equivalent. + uint32_t Address; + uint32_t Size; + uint32_t FileOffsetToData; + uint32_t FileOffsetToRelocations; + uint32_t RelocationCount; + int32_t Flags; + + int16_t Index; + + // Virtual sections do not need storage allocated in the object file. + const bool IsVirtual; + + void reset() { + Address = 0; + Size = 0; + FileOffsetToData = 0; + FileOffsetToRelocations = 0; + RelocationCount = 0; + Index = -1; + } + + Section(const char *N, XCOFF::SectionTypeFlags Flags, bool IsVirtual) + : Address(0), Size(0), FileOffsetToData(0), FileOffsetToRelocations(0), + RelocationCount(0), Flags(Flags), Index(-1), IsVirtual(IsVirtual) { + strncpy(Name, N, XCOFF::NameSize); + } +}; + class XCOFFObjectWriter : public MCObjectWriter { + // Type to be used for a container representing a set of csects with + // (approximately) the same storage mapping class. For example all the csects + // with a storage mapping class of `xmc_pr` will get placed into the same + // container. + using CsectGroup = std::deque; + support::endian::Writer W; std::unique_ptr TargetObjectWriter; + StringTableBuilder Strings; + + // The non-empty sections, in the order they will appear in the section header + // table. + std::vector
Sections; + + // The Predefined sections. + Section Text; + Section BSS; + + // CsectGroups. These store the csects which make up different parts of + // the sections. Should have one for each set of csects that get mapped into + // the same section and get handled in a 'similar' way. + CsectGroup ProgramCodeCsects; + CsectGroup BSSCsects; + + uint32_t SymbolTableEntryCount = 0; + uint32_t SymbolTableOffset = 0; + + virtual void reset() override; void executePostLayoutBinding(MCAssembler &, const MCAsmLayout &) override; @@ -30,6 +147,40 @@ class XCOFFObjectWriter : public MCObjectWriter { uint64_t writeObject(MCAssembler &, const MCAsmLayout &) override; + static bool nameShouldBeInStringTable(const StringRef &); + void writeSymbolName(const StringRef &); + void writeSymbolTableEntryForCsectMemberLabel(const Symbol &, + const ControlSection &, int16_t, + uint64_t); + void writeSymbolTableEntryForControlSection(const ControlSection &, int16_t, + XCOFF::StorageClass); + void writeFileHeader(); + void writeSectionHeaderTable(); + void writeSections(const MCAssembler &Asm, const MCAsmLayout &Layout); + void writeSymbolTable(const MCAsmLayout &Layout); + + // Called after all the csects and symbols have been processed by + // `executePostLayoutBinding`, this function handles building up the majority + // of the structures in the object file representation. Namely: + // *) Calculates physical/virtual addresses, raw-pointer offsets, and section + // sizes. + // *) Assigns symbol table indices. + // *) Builds up the section header table by adding any non-empty sections to + // `Sections`. + void assignAddressesAndIndices(const MCAsmLayout &); + + bool + needsAuxiliaryHeader() const { /* TODO aux header support not implemented. */ + return false; + } + + // Returns the size of the auxiliary header to be written to the object file. + size_t auxiliaryHeaderSize() const { + assert(!needsAuxiliaryHeader() && + "Auxiliary header support not implemented."); + return 0; + } + public: XCOFFObjectWriter(std::unique_ptr MOTW, raw_pwrite_stream &OS); @@ -37,11 +188,100 @@ public: XCOFFObjectWriter::XCOFFObjectWriter( std::unique_ptr MOTW, raw_pwrite_stream &OS) - : W(OS, support::big), TargetObjectWriter(std::move(MOTW)) {} + : W(OS, support::big), TargetObjectWriter(std::move(MOTW)), + Strings(StringTableBuilder::XCOFF), + Text(".text", XCOFF::STYP_TEXT, /* IsVirtual */ false), + BSS(".bss", XCOFF::STYP_BSS, /* IsVirtual */ true) {} + +void XCOFFObjectWriter::reset() { + // Reset any sections we have written to, and empty the section header table. + for (auto *Sec : Sections) + Sec->reset(); + Sections.clear(); + + // Clear any csects we have stored. + ProgramCodeCsects.clear(); + BSSCsects.clear(); + + // Reset the symbol table and string table. + SymbolTableEntryCount = 0; + SymbolTableOffset = 0; + Strings.clear(); + + MCObjectWriter::reset(); +} + +void XCOFFObjectWriter::executePostLayoutBinding(MCAssembler &Asm, + const MCAsmLayout &Layout) { + if (TargetObjectWriter->is64Bit()) + report_fatal_error("64-bit XCOFF object files are not supported yet."); + + // Maps the MC Section representation to its corresponding ControlSection + // wrapper. Needed for finding the ControlSection to insert an MCSymbol into + // from its containing MCSectionXCOFF. 
+ DenseMap WrapperMap; + + for (const auto &S : Asm) { + const auto *MCSec = cast(&S); + assert(WrapperMap.find(MCSec) == WrapperMap.end() && + "Cannot add a csect twice."); -void XCOFFObjectWriter::executePostLayoutBinding(MCAssembler &, - const MCAsmLayout &) { - // TODO Implement once we have sections and symbols to handle. + // If the name does not fit in the storage provided in the symbol table + // entry, add it to the string table. + if (nameShouldBeInStringTable(MCSec->getSectionName())) + Strings.add(MCSec->getSectionName()); + + switch (MCSec->getMappingClass()) { + case XCOFF::XMC_PR: + assert(XCOFF::XTY_SD == MCSec->getCSectType() && + "Only an initialized csect can contain program code."); + ProgramCodeCsects.emplace_back(MCSec); + WrapperMap[MCSec] = &ProgramCodeCsects.back(); + break; + case XCOFF::XMC_RW: + if (XCOFF::XTY_CM == MCSec->getCSectType()) { + BSSCsects.emplace_back(MCSec); + WrapperMap[MCSec] = &BSSCsects.back(); + break; + } + report_fatal_error("Unhandled mapping of read-write csect to section."); + case XCOFF::XMC_TC0: + // TODO FIXME Handle emiting the TOC base. + break; + case XCOFF::XMC_BS: + assert(XCOFF::XTY_CM == MCSec->getCSectType() && + "Mapping invalid csect. CSECT with bss storage class must be " + "common type."); + BSSCsects.emplace_back(MCSec); + WrapperMap[MCSec] = &BSSCsects.back(); + break; + default: + report_fatal_error("Unhandled mapping of csect to section."); + } + } + + for (const MCSymbol &S : Asm.symbols()) { + // Nothing to do for temporary symbols. + if (S.isTemporary()) + continue; + const MCSymbolXCOFF *XSym = cast(&S); + + // Map the symbol into its containing csect. + const MCSectionXCOFF *ContainingCsect = XSym->getContainingCsect(); + assert(WrapperMap.find(ContainingCsect) != WrapperMap.end() && + "Expected containing csect to exist in map"); + + // Lookup the containing csect and add the symbol to it. + WrapperMap[ContainingCsect]->Syms.emplace_back(XSym); + + // If the name does not fit in the storage provided in the symbol table + // entry, add it to the string table. + if (nameShouldBeInStringTable(XSym->getName())) + Strings.add(XSym->getName()); + } + + Strings.finalize(); + assignAddressesAndIndices(Layout); } void XCOFFObjectWriter::recordRelocation(MCAssembler &, const MCAsmLayout &, @@ -50,7 +290,29 @@ void XCOFFObjectWriter::recordRelocation(MCAssembler &, const MCAsmLayout &, report_fatal_error("XCOFF relocations not supported."); } -uint64_t XCOFFObjectWriter::writeObject(MCAssembler &Asm, const MCAsmLayout &) { +void XCOFFObjectWriter::writeSections(const MCAssembler &Asm, + const MCAsmLayout &Layout) { + // Write the program code control sections one at a time. + uint32_t CurrentAddressLocation = Text.Address; + for (const auto &Csect : ProgramCodeCsects) { + if (uint32_t PaddingSize = Csect.Address - CurrentAddressLocation) + W.OS.write_zeros(PaddingSize); + Asm.writeSectionData(W.OS, Csect.MCCsect, Layout); + CurrentAddressLocation = Csect.Address + Csect.Size; + } + + if (Text.Index != -1) { + // The size of the tail padding in a section is the end virtual address of + // the current section minus the the end virtual address of the last csect + // in that section. 
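// A small worked model of the layout math this writer uses in writeSections()
// and assignAddressesAndIndices(): each csect starts at the next address that
// satisfies its alignment, gaps between csects are written as zeros, and the
// section's tail padding is (Section.Address + Section.Size) minus the end
// address of its last csect. Values are purely illustrative.
#include <cassert>
#include <cstdint>
#include <vector>

static uint32_t alignTo(uint32_t Value, uint32_t Align) {
  return (Value + Align - 1) & ~(Align - 1); // Align must be a power of two
}

int main() {
  struct Csect { uint32_t Align, Size, Address; };
  std::vector<Csect> Csects = {{4, 10, 0}, {8, 3, 0}};

  uint32_t Address = 0;
  for (Csect &C : Csects) {
    C.Address = alignTo(Address, C.Align);
    Address = C.Address + C.Size;
  }
  // The section size is padded to the default 4-byte section alignment.
  uint32_t SectionAddress = Csects.front().Address; // 0
  uint32_t SectionEnd = alignTo(Address, 4);        // alignTo(19, 4) == 20
  // Gap before the second csect: 16 - 10 = 6 zero bytes; tail padding: 1 byte.
  uint32_t TailPadding = SectionEnd - Address;
  assert(Csects[1].Address == 16 && SectionEnd == 20);
  assert(TailPadding == 1 && SectionAddress == 0);
  return 0;
}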
+ if (uint32_t PaddingSize = + Text.Address + Text.Size - CurrentAddressLocation) + W.OS.write_zeros(PaddingSize); + } +} + +uint64_t XCOFFObjectWriter::writeObject(MCAssembler &Asm, + const MCAsmLayout &Layout) { // We always emit a timestamp of 0 for reproducibility, so ensure incremental // linking is not enabled, in case, like with Windows COFF, such a timestamp // is incompatible with incremental linking of XCOFF. @@ -62,27 +324,274 @@ uint64_t XCOFFObjectWriter::writeObject(MCAssembler &Asm, const MCAsmLayout &) { uint64_t StartOffset = W.OS.tell(); - // TODO FIXME Assign section numbers/finalize sections. + writeFileHeader(); + writeSectionHeaderTable(); + writeSections(Asm, Layout); + // TODO writeRelocations(); - // TODO FIXME Finalize symbols. + writeSymbolTable(Layout); + // Write the string table. + Strings.write(W.OS); + + return W.OS.tell() - StartOffset; +} +bool XCOFFObjectWriter::nameShouldBeInStringTable(const StringRef &SymbolName) { + return SymbolName.size() > XCOFF::NameSize; +} + +void XCOFFObjectWriter::writeSymbolName(const StringRef &SymbolName) { + if (nameShouldBeInStringTable(SymbolName)) { + W.write(0); + W.write(Strings.getOffset(SymbolName)); + } else { + char Name[XCOFF::NameSize]; + std::strncpy(Name, SymbolName.data(), XCOFF::NameSize); + ArrayRef NameRef(Name, XCOFF::NameSize); + W.write(NameRef); + } +} + +void XCOFFObjectWriter::writeSymbolTableEntryForCsectMemberLabel( + const Symbol &SymbolRef, const ControlSection &CSectionRef, + int16_t SectionIndex, uint64_t SymbolOffset) { + // Name or Zeros and string table offset + writeSymbolName(SymbolRef.getName()); + assert(SymbolOffset <= UINT32_MAX - CSectionRef.Address && + "Symbol address overflows."); + W.write(CSectionRef.Address + SymbolOffset); + W.write(SectionIndex); + // Basic/Derived type. See the description of the n_type field for symbol + // table entries for a detailed description. Since we don't yet support + // visibility, and all other bits are either optionally set or reserved, this + // is always zero. + // TODO FIXME How to assert a symbol's visibilty is default? + // TODO Set the function indicator (bit 10, 0x0020) for functions + // when debugging is enabled. + W.write(0); + W.write(SymbolRef.getStorageClass()); + // Always 1 aux entry for now. + W.write(1); + + // Now output the auxiliary entry. + W.write(CSectionRef.SymbolTableIndex); + // Parameter typecheck hash. Not supported. + W.write(0); + // Typecheck section number. Not supported. + W.write(0); + // Symbol type: Label + W.write(XCOFF::XTY_LD); + // Storage mapping class. + W.write(CSectionRef.MCCsect->getMappingClass()); + // Reserved (x_stab). + W.write(0); + // Reserved (x_snstab). + W.write(0); +} + +void XCOFFObjectWriter::writeSymbolTableEntryForControlSection( + const ControlSection &CSectionRef, int16_t SectionIndex, + XCOFF::StorageClass StorageClass) { + // n_name, n_zeros, n_offset + writeSymbolName(CSectionRef.getName()); + // n_value + W.write(CSectionRef.Address); + // n_scnum + W.write(SectionIndex); + // Basic/Derived type. See the description of the n_type field for symbol + // table entries for a detailed description. Since we don't yet support + // visibility, and all other bits are either optionally set or reserved, this + // is always zero. + // TODO FIXME How to assert a symbol's visibilty is default? + // TODO Set the function indicator (bit 10, 0x0020) for functions + // when debugging is enabled. + W.write(0); + // n_sclass + W.write(StorageClass); + // Always 1 aux entry for now. 
+ W.write(1); + + // Now output the auxiliary entry. + W.write(CSectionRef.Size); + // Parameter typecheck hash. Not supported. + W.write(0); + // Typecheck section number. Not supported. + W.write(0); + // Symbol type. + W.write(getEncodedType(CSectionRef.MCCsect)); + // Storage mapping class. + W.write(CSectionRef.MCCsect->getMappingClass()); + // Reserved (x_stab). + W.write(0); + // Reserved (x_snstab). + W.write(0); +} + +void XCOFFObjectWriter::writeFileHeader() { // Magic. W.write(0x01df); // Number of sections. - W.write(0); + W.write(Sections.size()); // Timestamp field. For reproducible output we write a 0, which represents no // timestamp. W.write(0); // Byte Offset to the start of the symbol table. - W.write(0); + W.write(SymbolTableOffset); // Number of entries in the symbol table. - W.write(0); + W.write(SymbolTableEntryCount); // Size of the optional header. W.write(0); // Flags. W.write(0); +} - return W.OS.tell() - StartOffset; +void XCOFFObjectWriter::writeSectionHeaderTable() { + for (const auto *Sec : Sections) { + // Write Name. + ArrayRef NameRef(Sec->Name, XCOFF::NameSize); + W.write(NameRef); + + // Write the Physical Address and Virtual Address. In an object file these + // are the same. + W.write(Sec->Address); + W.write(Sec->Address); + + W.write(Sec->Size); + W.write(Sec->FileOffsetToData); + + // Relocation pointer and Lineno pointer. Not supported yet. + W.write(0); + W.write(0); + + // Relocation and line-number counts. Not supported yet. + W.write(0); + W.write(0); + + W.write(Sec->Flags); + } +} + +void XCOFFObjectWriter::writeSymbolTable(const MCAsmLayout &Layout) { + // Print out symbol table for the program code. + for (const auto &Csect : ProgramCodeCsects) { + // Write out the control section first and then each symbol in it. + writeSymbolTableEntryForControlSection(Csect, Text.Index, + Csect.MCCsect->getStorageClass()); + for (const auto &Sym : Csect.Syms) + writeSymbolTableEntryForCsectMemberLabel( + Sym, Csect, Text.Index, Layout.getSymbolOffset(*Sym.MCSym)); + } + + // The BSS Section is special in that the csects must contain a single symbol, + // and the contained symbol cannot be represented in the symbol table as a + // label definition. + for (auto &Csect : BSSCsects) { + assert(Csect.Syms.size() == 1 && + "Uninitialized csect cannot contain more then 1 symbol."); + Symbol &Sym = Csect.Syms.back(); + writeSymbolTableEntryForControlSection(Csect, BSS.Index, + Sym.getStorageClass()); + } +} + +void XCOFFObjectWriter::assignAddressesAndIndices(const MCAsmLayout &Layout) { + // The address corrresponds to the address of sections and symbols in the + // object file. We place the shared address 0 immediately after the + // section header table. + uint32_t Address = 0; + // Section indices are 1-based in XCOFF. + int16_t SectionIndex = 1; + // The first symbol table entry is for the file name. We are not emitting it + // yet, so start at index 0. + uint32_t SymbolTableIndex = 0; + + // Text section comes first. + if (!ProgramCodeCsects.empty()) { + Sections.push_back(&Text); + Text.Index = SectionIndex++; + for (auto &Csect : ProgramCodeCsects) { + const MCSectionXCOFF *MCSec = Csect.MCCsect; + Csect.Address = alignTo(Address, MCSec->getAlignment()); + Csect.Size = Layout.getSectionAddressSize(MCSec); + Address = Csect.Address + Csect.Size; + Csect.SymbolTableIndex = SymbolTableIndex; + // 1 main and 1 auxiliary symbol table entry for the csect. 
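// The index bookkeeping here advances by two for every csect and for every
// label it contains, because each XCOFF symbol is written as one main entry
// plus one auxiliary (csect) entry. A tiny model of the counting, with
// made-up csect and label counts:
#include <cassert>
#include <cstdint>
#include <vector>

int main() {
  // Each element is the number of labels contained in one csect.
  std::vector<unsigned> LabelsPerCsect = {2, 0, 1};

  uint32_t SymbolTableIndex = 0; // no C_FILE entry is emitted yet, so start at 0
  for (unsigned Labels : LabelsPerCsect) {
    SymbolTableIndex += 2;          // csect: main + auxiliary entry
    SymbolTableIndex += 2 * Labels; // each label: main + auxiliary entry
  }
  // 3 csects and 3 labels -> (3 + 3) * 2 = 12 symbol table entries in total.
  assert(SymbolTableIndex == 12);
  return 0;
}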
+ SymbolTableIndex += 2; + for (auto &Sym : Csect.Syms) { + Sym.SymbolTableIndex = SymbolTableIndex; + // 1 main and 1 auxiliary symbol table entry for each contained symbol + SymbolTableIndex += 2; + } + } + Address = alignTo(Address, DefaultSectionAlign); + + // The first csect of a section can be aligned by adjusting the virtual + // address of its containing section instead of writing zeroes into the + // object file. + Text.Address = ProgramCodeCsects.front().Address; + + Text.Size = Address - Text.Address; + } + + // Data section Second. TODO + + // BSS Section third. + if (!BSSCsects.empty()) { + Sections.push_back(&BSS); + BSS.Index = SectionIndex++; + for (auto &Csect : BSSCsects) { + const MCSectionXCOFF *MCSec = Csect.MCCsect; + Csect.Address = alignTo(Address, MCSec->getAlignment()); + Csect.Size = Layout.getSectionAddressSize(MCSec); + Address = Csect.Address + Csect.Size; + Csect.SymbolTableIndex = SymbolTableIndex; + // 1 main and 1 auxiliary symbol table entry for the csect. + SymbolTableIndex += 2; + + assert(Csect.Syms.size() == 1 && + "csect in the BSS can only contain a single symbol."); + Csect.Syms[0].SymbolTableIndex = Csect.SymbolTableIndex; + } + // Pad out Address to the default alignment. This is to match how the system + // assembler handles the .bss section. Its size is always a multiple of 4. + Address = alignTo(Address, DefaultSectionAlign); + + BSS.Address = BSSCsects.front().Address; + BSS.Size = Address - BSS.Address; + } + + SymbolTableEntryCount = SymbolTableIndex; + + // Calculate the RawPointer value for each section. + uint64_t RawPointer = sizeof(XCOFF::FileHeader32) + auxiliaryHeaderSize() + + Sections.size() * sizeof(XCOFF::SectionHeader32); + for (auto *Sec : Sections) { + if (!Sec->IsVirtual) { + Sec->FileOffsetToData = RawPointer; + RawPointer += Sec->Size; + } + } + + // TODO Add in Relocation storage to the RawPointer Calculation. + // TODO What to align the SymbolTable to? + // TODO Error check that the number of symbol table entries fits in 32-bits + // signed ... + if (SymbolTableEntryCount) + SymbolTableOffset = RawPointer; +} + +// Takes the log base 2 of the alignment and shifts the result into the 5 most +// significant bits of a byte, then or's in the csect type into the least +// significant 3 bits. +uint8_t getEncodedType(const MCSectionXCOFF *Sec) { + unsigned Align = Sec->getAlignment(); + assert(isPowerOf2_32(Align) && "Alignment must be a power of 2."); + unsigned Log2Align = Log2_32(Align); + // Result is a number in the range [0, 31] which fits in the 5 least + // significant bits. Shift this value into the 5 most significant bits, and + // bitwise-or in the csect type. + uint8_t EncodedAlign = Log2Align << 3; + return EncodedAlign | Sec->getCSectType(); } } // end anonymous namespace @@ -90,5 +599,5 @@ uint64_t XCOFFObjectWriter::writeObject(MCAssembler &Asm, const MCAsmLayout &) { std::unique_ptr llvm::createXCOFFObjectWriter(std::unique_ptr MOTW, raw_pwrite_stream &OS) { - return llvm::make_unique(std::move(MOTW), OS); + return std::make_unique(std::move(MOTW), OS); } diff --git a/lib/MCA/CodeEmitter.cpp b/lib/MCA/CodeEmitter.cpp new file mode 100644 index 000000000000..294107219cb0 --- /dev/null +++ b/lib/MCA/CodeEmitter.cpp @@ -0,0 +1,37 @@ +//===--------------------- CodeEmitter.cpp ----------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements the CodeEmitter API. +// +//===----------------------------------------------------------------------===// + +#include "llvm/MCA/CodeEmitter.h" + +namespace llvm { +namespace mca { + +CodeEmitter::EncodingInfo +CodeEmitter::getOrCreateEncodingInfo(unsigned MCID) { + EncodingInfo &EI = Encodings[MCID]; + if (EI.second) + return EI; + + SmallVector Fixups; + const MCInst &Inst = Sequence[MCID]; + MCInst Relaxed(Sequence[MCID]); + if (MAB.mayNeedRelaxation(Inst, STI)) + MAB.relaxInstruction(Inst, STI, Relaxed); + + EI.first = Code.size(); + MCE.encodeInstruction(Relaxed, VecOS, Fixups, STI); + EI.second = Code.size() - EI.first; + return EI; +} + +} // namespace mca +} // namespace llvm diff --git a/lib/MCA/Context.cpp b/lib/MCA/Context.cpp index f0e8dfab8680..0160e1f9f787 100644 --- a/lib/MCA/Context.cpp +++ b/lib/MCA/Context.cpp @@ -28,24 +28,23 @@ namespace llvm { namespace mca { std::unique_ptr -Context::createDefaultPipeline(const PipelineOptions &Opts, InstrBuilder &IB, - SourceMgr &SrcMgr) { +Context::createDefaultPipeline(const PipelineOptions &Opts, SourceMgr &SrcMgr) { const MCSchedModel &SM = STI.getSchedModel(); // Create the hardware units defining the backend. - auto RCU = llvm::make_unique(SM); - auto PRF = llvm::make_unique(SM, MRI, Opts.RegisterFileSize); - auto LSU = llvm::make_unique(SM, Opts.LoadQueueSize, + auto RCU = std::make_unique(SM); + auto PRF = std::make_unique(SM, MRI, Opts.RegisterFileSize); + auto LSU = std::make_unique(SM, Opts.LoadQueueSize, Opts.StoreQueueSize, Opts.AssumeNoAlias); - auto HWS = llvm::make_unique(SM, *LSU); + auto HWS = std::make_unique(SM, *LSU); // Create the pipeline stages. - auto Fetch = llvm::make_unique(SrcMgr); - auto Dispatch = llvm::make_unique(STI, MRI, Opts.DispatchWidth, + auto Fetch = std::make_unique(SrcMgr); + auto Dispatch = std::make_unique(STI, MRI, Opts.DispatchWidth, *RCU, *PRF); auto Execute = - llvm::make_unique(*HWS, Opts.EnableBottleneckAnalysis); - auto Retire = llvm::make_unique(*RCU, *PRF); + std::make_unique(*HWS, Opts.EnableBottleneckAnalysis); + auto Retire = std::make_unique(*RCU, *PRF, *LSU); // Pass the ownership of all the hardware units to this Context. addHardwareUnit(std::move(RCU)); @@ -54,10 +53,10 @@ Context::createDefaultPipeline(const PipelineOptions &Opts, InstrBuilder &IB, addHardwareUnit(std::move(HWS)); // Build the pipeline. 
- auto StagePipeline = llvm::make_unique(); + auto StagePipeline = std::make_unique(); StagePipeline->appendStage(std::move(Fetch)); if (Opts.MicroOpQueueSize) - StagePipeline->appendStage(llvm::make_unique( + StagePipeline->appendStage(std::make_unique( Opts.MicroOpQueueSize, Opts.DecodersThroughput)); StagePipeline->appendStage(std::move(Dispatch)); StagePipeline->appendStage(std::move(Execute)); diff --git a/lib/MCA/HardwareUnits/LSUnit.cpp b/lib/MCA/HardwareUnits/LSUnit.cpp index ac1a6a36547b..0ee084c7ce1a 100644 --- a/lib/MCA/HardwareUnits/LSUnit.cpp +++ b/lib/MCA/HardwareUnits/LSUnit.cpp @@ -29,12 +29,12 @@ LSUnitBase::LSUnitBase(const MCSchedModel &SM, unsigned LQ, unsigned SQ, const MCExtraProcessorInfo &EPI = SM.getExtraProcessorInfo(); if (!LQSize && EPI.LoadQueueID) { const MCProcResourceDesc &LdQDesc = *SM.getProcResource(EPI.LoadQueueID); - LQSize = LdQDesc.BufferSize; + LQSize = std::max(0, LdQDesc.BufferSize); } if (!SQSize && EPI.StoreQueueID) { const MCProcResourceDesc &StQDesc = *SM.getProcResource(EPI.StoreQueueID); - SQSize = StQDesc.BufferSize; + SQSize = std::max(0, StQDesc.BufferSize); } } } @@ -72,9 +72,9 @@ unsigned LSUnit::dispatch(const InstRef &IR) { assert((Desc.MayLoad || Desc.MayStore) && "Not a memory operation!"); if (Desc.MayLoad) - assignLQSlot(); + acquireLQSlot(); if (Desc.MayStore) - assignSQSlot(); + acquireSQSlot(); if (Desc.MayStore) { // Always create a new group for store operations. @@ -160,26 +160,28 @@ LSUnit::Status LSUnit::isAvailable(const InstRef &IR) const { } void LSUnitBase::onInstructionExecuted(const InstRef &IR) { - const InstrDesc &Desc = IR.getInstruction()->getDesc(); - bool IsALoad = Desc.MayLoad; - bool IsAStore = Desc.MayStore; - assert((IsALoad || IsAStore) && "Expected a memory operation!"); - unsigned GroupID = IR.getInstruction()->getLSUTokenID(); auto It = Groups.find(GroupID); + assert(It != Groups.end() && "Instruction not dispatched to the LS unit"); It->second->onInstructionExecuted(); - if (It->second->isExecuted()) { + if (It->second->isExecuted()) Groups.erase(It); - } +} + +void LSUnitBase::onInstructionRetired(const InstRef &IR) { + const InstrDesc &Desc = IR.getInstruction()->getDesc(); + bool IsALoad = Desc.MayLoad; + bool IsAStore = Desc.MayStore; + assert((IsALoad || IsAStore) && "Expected a memory operation!"); if (IsALoad) { - UsedLQEntries--; + releaseLQSlot(); LLVM_DEBUG(dbgs() << "[LSUnit]: Instruction idx=" << IR.getSourceIndex() << " has been removed from the load queue.\n"); } if (IsAStore) { - UsedSQEntries--; + releaseSQSlot(); LLVM_DEBUG(dbgs() << "[LSUnit]: Instruction idx=" << IR.getSourceIndex() << " has been removed from the store queue.\n"); } diff --git a/lib/MCA/HardwareUnits/RegisterFile.cpp b/lib/MCA/HardwareUnits/RegisterFile.cpp index 86a888ea8cae..7ea5506f11d6 100644 --- a/lib/MCA/HardwareUnits/RegisterFile.cpp +++ b/lib/MCA/HardwareUnits/RegisterFile.cpp @@ -147,7 +147,7 @@ void RegisterFile::freePhysRegs(const RegisterRenamingInfo &Entry, void RegisterFile::addRegisterWrite(WriteRef Write, MutableArrayRef UsedPhysRegs) { WriteState &WS = *Write.getWriteState(); - unsigned RegID = WS.getRegisterID(); + MCPhysReg RegID = WS.getRegisterID(); assert(RegID && "Adding an invalid register definition?"); LLVM_DEBUG({ @@ -194,7 +194,7 @@ void RegisterFile::addRegisterWrite(WriteRef Write, } // Update zero registers. - unsigned ZeroRegisterID = + MCPhysReg ZeroRegisterID = WS.clearsSuperRegisters() ? 
RegID : WS.getRegisterID(); if (IsWriteZero) { ZeroRegisters.setBit(ZeroRegisterID); @@ -247,7 +247,7 @@ void RegisterFile::removeRegisterWrite( if (WS.isEliminated()) return; - unsigned RegID = WS.getRegisterID(); + MCPhysReg RegID = WS.getRegisterID(); assert(RegID != 0 && "Invalidating an already invalid register?"); assert(WS.getCyclesLeft() != UNKNOWN_CYCLES && @@ -255,7 +255,7 @@ void RegisterFile::removeRegisterWrite( assert(WS.getCyclesLeft() <= 0 && "Invalid cycles left for this write!"); bool ShouldFreePhysRegs = !WS.isWriteZero(); - unsigned RenameAs = RegisterMappings[RegID].second.RenameAs; + MCPhysReg RenameAs = RegisterMappings[RegID].second.RenameAs; if (RenameAs && RenameAs != RegID) { RegID = RenameAs; @@ -355,7 +355,7 @@ bool RegisterFile::tryEliminateMove(WriteState &WS, ReadState &RS) { void RegisterFile::collectWrites(const ReadState &RS, SmallVectorImpl &Writes) const { - unsigned RegID = RS.getRegisterID(); + MCPhysReg RegID = RS.getRegisterID(); assert(RegID && RegID < RegisterMappings.size()); LLVM_DEBUG(dbgs() << "RegisterFile: collecting writes for register " << MRI.getName(RegID) << '\n'); @@ -397,7 +397,7 @@ void RegisterFile::collectWrites(const ReadState &RS, void RegisterFile::addRegisterRead(ReadState &RS, const MCSubtargetInfo &STI) const { - unsigned RegID = RS.getRegisterID(); + MCPhysReg RegID = RS.getRegisterID(); const RegisterRenamingInfo &RRI = RegisterMappings[RegID].second; RS.setPRF(RRI.IndexPlusCost.first); if (RS.isIndependentFromDef()) @@ -424,11 +424,11 @@ void RegisterFile::addRegisterRead(ReadState &RS, } } -unsigned RegisterFile::isAvailable(ArrayRef Regs) const { +unsigned RegisterFile::isAvailable(ArrayRef Regs) const { SmallVector NumPhysRegs(getNumRegisterFiles()); // Find how many new mappings must be created for each register file. - for (const unsigned RegID : Regs) { + for (const MCPhysReg RegID : Regs) { const RegisterRenamingInfo &RRI = RegisterMappings[RegID].second; const IndexPlusCostPairTy &Entry = RRI.IndexPlusCost; if (Entry.first) diff --git a/lib/MCA/HardwareUnits/ResourceManager.cpp b/lib/MCA/HardwareUnits/ResourceManager.cpp index 06f2476353d6..088aea3e23c6 100644 --- a/lib/MCA/HardwareUnits/ResourceManager.cpp +++ b/lib/MCA/HardwareUnits/ResourceManager.cpp @@ -104,7 +104,7 @@ void ResourceState::dump() const { static std::unique_ptr getStrategyFor(const ResourceState &RS) { if (RS.isAResourceGroup() || RS.getNumUnits() > 1) - return llvm::make_unique(RS.getReadyMask()); + return std::make_unique(RS.getReadyMask()); return std::unique_ptr(nullptr); } @@ -114,7 +114,8 @@ ResourceManager::ResourceManager(const MCSchedModel &SM) Resource2Groups(SM.getNumProcResourceKinds() - 1, 0), ProcResID2Mask(SM.getNumProcResourceKinds(), 0), ResIndex2ProcResID(SM.getNumProcResourceKinds() - 1, 0), - ProcResUnitMask(0), ReservedResourceGroups(0) { + ProcResUnitMask(0), ReservedResourceGroups(0), + AvailableBuffers(~0ULL), ReservedBuffers(0) { computeProcResourceMasks(SM, ProcResID2Mask); // initialize vector ResIndex2ProcResID. 
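// The hunks that follow replace per-buffer vectors with 64-bit masks:
// AvailableBuffers and ReservedBuffers are bitsets indexed by resource-state
// index, availability becomes a couple of AND tests, and individual buffers
// are visited by repeatedly peeling off the lowest set bit (Mask & -Mask).
// A stand-alone model of that idiom, with invented mask values:
#include <cassert>
#include <cstdint>

enum class Event { Available, Unavailable, Reserved };

static Event canBeDispatched(uint64_t Consumed, uint64_t Available,
                             uint64_t Reserved) {
  if (Consumed & Reserved)
    return Event::Reserved;
  if (Consumed & ~Available)
    return Event::Unavailable;
  return Event::Available;
}

int main() {
  uint64_t Consumed = 0b1010, Available = ~0ULL, Reserved = 0;
  assert(canBeDispatched(Consumed, Available, Reserved) == Event::Available);

  // Visit each consumed buffer, lowest bit first: 0b0010, then 0b1000.
  unsigned Visited = 0;
  for (uint64_t M = Consumed; M; M ^= (M & -M))
    ++Visited;
  assert(Visited == 2);
  return 0;
}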
@@ -127,7 +128,7 @@ ResourceManager::ResourceManager(const MCSchedModel &SM) uint64_t Mask = ProcResID2Mask[I]; unsigned Index = getResourceStateIndex(Mask); Resources[Index] = - llvm::make_unique(*SM.getProcResource(I), I, Mask); + std::make_unique(*SM.getProcResource(I), I, Mask); Strategies[Index] = getStrategyFor(*Resources[Index]); } @@ -241,33 +242,41 @@ void ResourceManager::release(const ResourceRef &RR) { } ResourceStateEvent -ResourceManager::canBeDispatched(ArrayRef Buffers) const { - ResourceStateEvent Result = ResourceStateEvent::RS_BUFFER_AVAILABLE; - for (uint64_t Buffer : Buffers) { - ResourceState &RS = *Resources[getResourceStateIndex(Buffer)]; - Result = RS.isBufferAvailable(); - if (Result != ResourceStateEvent::RS_BUFFER_AVAILABLE) - break; - } - return Result; +ResourceManager::canBeDispatched(uint64_t ConsumedBuffers) const { + if (ConsumedBuffers & ReservedBuffers) + return ResourceStateEvent::RS_RESERVED; + if (ConsumedBuffers & (~AvailableBuffers)) + return ResourceStateEvent::RS_BUFFER_UNAVAILABLE; + return ResourceStateEvent::RS_BUFFER_AVAILABLE; } -void ResourceManager::reserveBuffers(ArrayRef Buffers) { - for (const uint64_t Buffer : Buffers) { - ResourceState &RS = *Resources[getResourceStateIndex(Buffer)]; +void ResourceManager::reserveBuffers(uint64_t ConsumedBuffers) { + while (ConsumedBuffers) { + uint64_t CurrentBuffer = ConsumedBuffers & (-ConsumedBuffers); + ResourceState &RS = *Resources[getResourceStateIndex(CurrentBuffer)]; + ConsumedBuffers ^= CurrentBuffer; assert(RS.isBufferAvailable() == ResourceStateEvent::RS_BUFFER_AVAILABLE); - RS.reserveBuffer(); - + if (!RS.reserveBuffer()) + AvailableBuffers ^= CurrentBuffer; if (RS.isADispatchHazard()) { - assert(!RS.isReserved()); - RS.setReserved(); + // Reserve this buffer now, and release it once pipeline resources + // consumed by the instruction become available again. + // We do this to simulate an in-order dispatch/issue of instructions. + ReservedBuffers ^= CurrentBuffer; } } } -void ResourceManager::releaseBuffers(ArrayRef Buffers) { - for (const uint64_t R : Buffers) - Resources[getResourceStateIndex(R)]->releaseBuffer(); +void ResourceManager::releaseBuffers(uint64_t ConsumedBuffers) { + AvailableBuffers |= ConsumedBuffers; + while (ConsumedBuffers) { + uint64_t CurrentBuffer = ConsumedBuffers & (-ConsumedBuffers); + ResourceState &RS = *Resources[getResourceStateIndex(CurrentBuffer)]; + ConsumedBuffers ^= CurrentBuffer; + RS.releaseBuffer(); + // Do not unreserve dispatch hazard resource buffers. Wait until all + // pipeline resources have been freed too. + } } uint64_t ResourceManager::checkAvailability(const InstrDesc &Desc) const { @@ -322,7 +331,6 @@ void ResourceManager::cycleEvent(SmallVectorImpl &ResourcesFreed) { if (countPopulation(RR.first) == 1) release(RR); - releaseResource(RR.first); ResourcesFreed.push_back(RR); } @@ -336,7 +344,7 @@ void ResourceManager::reserveResource(uint64_t ResourceID) { const unsigned Index = getResourceStateIndex(ResourceID); ResourceState &Resource = *Resources[Index]; assert(Resource.isAResourceGroup() && !Resource.isReserved() && - "Unexpected resource found!"); + "Unexpected resource state found!"); Resource.setReserved(); ReservedResourceGroups ^= 1ULL << Index; } @@ -347,6 +355,9 @@ void ResourceManager::releaseResource(uint64_t ResourceID) { Resource.clearReserved(); if (Resource.isAResourceGroup()) ReservedResourceGroups ^= 1ULL << Index; + // Now it is safe to release dispatch/issue resources. 
+ if (Resource.isADispatchHazard()) + ReservedBuffers ^= 1ULL << Index; } } // namespace mca diff --git a/lib/MCA/HardwareUnits/RetireControlUnit.cpp b/lib/MCA/HardwareUnits/RetireControlUnit.cpp index 068c5062ccdf..de519d7fd94a 100644 --- a/lib/MCA/HardwareUnits/RetireControlUnit.cpp +++ b/lib/MCA/HardwareUnits/RetireControlUnit.cpp @@ -21,65 +21,78 @@ namespace mca { RetireControlUnit::RetireControlUnit(const MCSchedModel &SM) : NextAvailableSlotIdx(0), CurrentInstructionSlotIdx(0), - AvailableSlots(SM.MicroOpBufferSize), MaxRetirePerCycle(0) { + NumROBEntries(SM.MicroOpBufferSize), + AvailableEntries(SM.MicroOpBufferSize), MaxRetirePerCycle(0) { // Check if the scheduling model provides extra information about the machine // processor. If so, then use that information to set the reorder buffer size // and the maximum number of instructions retired per cycle. if (SM.hasExtraProcessorInfo()) { const MCExtraProcessorInfo &EPI = SM.getExtraProcessorInfo(); if (EPI.ReorderBufferSize) - AvailableSlots = EPI.ReorderBufferSize; + AvailableEntries = EPI.ReorderBufferSize; MaxRetirePerCycle = EPI.MaxRetirePerCycle; } - - assert(AvailableSlots && "Invalid reorder buffer size!"); - Queue.resize(AvailableSlots); + NumROBEntries = AvailableEntries; + assert(NumROBEntries && "Invalid reorder buffer size!"); + Queue.resize(2 * NumROBEntries); } // Reserves a number of slots, and returns a new token. -unsigned RetireControlUnit::reserveSlot(const InstRef &IR, - unsigned NumMicroOps) { - assert(isAvailable(NumMicroOps) && "Reorder Buffer unavailable!"); - unsigned NormalizedQuantity = - std::min(NumMicroOps, static_cast(Queue.size())); - // Zero latency instructions may have zero uOps. Artificially bump this - // value to 1. Although zero latency instructions don't consume scheduler - // resources, they still consume one slot in the retire queue. 
- NormalizedQuantity = std::max(NormalizedQuantity, 1U); +unsigned RetireControlUnit::dispatch(const InstRef &IR) { + const Instruction &Inst = *IR.getInstruction(); + unsigned Entries = normalizeQuantity(Inst.getNumMicroOps()); + assert((AvailableEntries >= Entries) && "Reorder Buffer unavailable!"); + unsigned TokenID = NextAvailableSlotIdx; - Queue[NextAvailableSlotIdx] = {IR, NormalizedQuantity, false}; - NextAvailableSlotIdx += NormalizedQuantity; + Queue[NextAvailableSlotIdx] = {IR, Entries, false}; + NextAvailableSlotIdx += std::max(1U, Entries); NextAvailableSlotIdx %= Queue.size(); - AvailableSlots -= NormalizedQuantity; + + AvailableEntries -= Entries; return TokenID; } -const RetireControlUnit::RUToken &RetireControlUnit::peekCurrentToken() const { - return Queue[CurrentInstructionSlotIdx]; +const RetireControlUnit::RUToken &RetireControlUnit::getCurrentToken() const { + const RetireControlUnit::RUToken &Current = Queue[CurrentInstructionSlotIdx]; +#ifndef NDEBUG + const Instruction *Inst = Current.IR.getInstruction(); + assert(Inst && "Invalid RUToken in the RCU queue."); +#endif + return Current; +} + +unsigned RetireControlUnit::computeNextSlotIdx() const { + const RetireControlUnit::RUToken &Current = getCurrentToken(); + unsigned NextSlotIdx = CurrentInstructionSlotIdx + std::max(1U, Current.NumSlots); + return NextSlotIdx % Queue.size(); +} + +const RetireControlUnit::RUToken &RetireControlUnit::peekNextToken() const { + return Queue[computeNextSlotIdx()]; } void RetireControlUnit::consumeCurrentToken() { RetireControlUnit::RUToken &Current = Queue[CurrentInstructionSlotIdx]; - assert(Current.NumSlots && "Reserved zero slots?"); - assert(Current.IR && "Invalid RUToken in the RCU queue."); Current.IR.getInstruction()->retire(); // Update the slot index to be the next item in the circular queue. - CurrentInstructionSlotIdx += Current.NumSlots; + CurrentInstructionSlotIdx += std::max(1U, Current.NumSlots); CurrentInstructionSlotIdx %= Queue.size(); - AvailableSlots += Current.NumSlots; + AvailableEntries += Current.NumSlots; + Current = { InstRef(), 0U, false }; } void RetireControlUnit::onInstructionExecuted(unsigned TokenID) { assert(Queue.size() > TokenID); - assert(Queue[TokenID].Executed == false && Queue[TokenID].IR); + assert(Queue[TokenID].IR.getInstruction() && "Instruction was not dispatched!"); + assert(Queue[TokenID].Executed == false && "Instruction already executed!"); Queue[TokenID].Executed = true; } #ifndef NDEBUG void RetireControlUnit::dump() const { - dbgs() << "Retire Unit: { Total Slots=" << Queue.size() - << ", Available Slots=" << AvailableSlots << " }\n"; + dbgs() << "Retire Unit: { Total ROB Entries =" << NumROBEntries + << ", Available ROB entries=" << AvailableEntries << " }\n"; } #endif diff --git a/lib/MCA/HardwareUnits/Scheduler.cpp b/lib/MCA/HardwareUnits/Scheduler.cpp index 0f0f2ffb8325..8730336c6669 100644 --- a/lib/MCA/HardwareUnits/Scheduler.cpp +++ b/lib/MCA/HardwareUnits/Scheduler.cpp @@ -21,7 +21,7 @@ namespace mca { void Scheduler::initializeStrategy(std::unique_ptr S) { // Ensure we have a valid (non-null) strategy object. - Strategy = S ? std::move(S) : llvm::make_unique(); + Strategy = S ? std::move(S) : std::make_unique(); } // Anchor the vtable of SchedulerStrategy and DefaultSchedulerStrategy. 
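// A compact model of the circular reorder-buffer bookkeeping introduced in
// RetireControlUnit above: the token queue holds 2 * NumROBEntries slots, a
// dispatched instruction charges its normalized micro-op count against
// AvailableEntries, and the slot index advances by at least one and wraps
// modulo the queue size. The normalization as min(uops, NumROBEntries) is an
// assumption here, and all numbers are invented for illustration.
#include <algorithm>
#include <cassert>

int main() {
  const unsigned NumROBEntries = 4;
  const unsigned QueueSize = 2 * NumROBEntries;
  unsigned NextAvailableSlotIdx = 0;
  unsigned AvailableEntries = NumROBEntries;

  auto dispatch = [&](unsigned MicroOps) {
    unsigned Entries = std::min(MicroOps, NumROBEntries); // normalized quantity
    assert(AvailableEntries >= Entries && "reorder buffer unavailable");
    unsigned TokenID = NextAvailableSlotIdx;
    NextAvailableSlotIdx =
        (NextAvailableSlotIdx + std::max(1U, Entries)) % QueueSize;
    AvailableEntries -= Entries;
    return TokenID;
  };

  unsigned T0 = dispatch(3); // occupies slots 0..2
  unsigned T1 = dispatch(0); // a zero-uop instruction still takes a slot index
  assert(T0 == 0 && T1 == 3 && NextAvailableSlotIdx == 4 &&
         AvailableEntries == 1);
  return 0;
}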
@@ -38,9 +38,8 @@ void Scheduler::dump() const { #endif Scheduler::Status Scheduler::isAvailable(const InstRef &IR) { - const InstrDesc &Desc = IR.getInstruction()->getDesc(); - - ResourceStateEvent RSE = Resources->canBeDispatched(Desc.Buffers); + ResourceStateEvent RSE = + Resources->canBeDispatched(IR.getInstruction()->getUsedBuffers()); HadTokenStall = RSE != RS_BUFFER_AVAILABLE; switch (RSE) { @@ -106,7 +105,7 @@ void Scheduler::issueInstruction( bool HasDependentUsers = Inst.hasDependentUsers(); HasDependentUsers |= Inst.isMemOp() && LSU.hasDependentUsers(IR); - Resources->releaseBuffers(Inst.getDesc().Buffers); + Resources->releaseBuffers(Inst.getUsedBuffers()); issueInstructionImpl(IR, UsedResources); // Instructions that have been issued during this cycle might have unblocked // other dependent instructions. Dependent instructions may be issued during @@ -300,8 +299,7 @@ bool Scheduler::mustIssueImmediately(const InstRef &IR) const { bool Scheduler::dispatch(InstRef &IR) { Instruction &IS = *IR.getInstruction(); - const InstrDesc &Desc = IS.getDesc(); - Resources->reserveBuffers(Desc.Buffers); + Resources->reserveBuffers(IS.getUsedBuffers()); // If necessary, reserve queue entries in the load-store unit (LSU). if (IS.isMemOp()) diff --git a/lib/MCA/InstrBuilder.cpp b/lib/MCA/InstrBuilder.cpp index 829920366c90..bd28c733535c 100644 --- a/lib/MCA/InstrBuilder.cpp +++ b/lib/MCA/InstrBuilder.cpp @@ -80,7 +80,7 @@ static void initializeUsedResources(InstrDesc &ID, if (PR.BufferSize < 0) { AllInOrderResources = false; } else { - Buffers.setBit(PRE->ProcResourceIdx); + Buffers.setBit(getResourceStateIndex(Mask)); AnyDispatchHazards |= (PR.BufferSize == 0); AllInOrderResources &= (PR.BufferSize <= 1); } @@ -139,9 +139,6 @@ static void initializeUsedResources(InstrDesc &ID, } } - ID.UsedProcResUnits = UsedResourceUnits; - ID.UsedProcResGroups = UsedResourceGroups; - // A SchedWrite may specify a number of cycles in which a resource group // is reserved. For example (on target x86; cpu Haswell): // @@ -177,20 +174,13 @@ static void initializeUsedResources(InstrDesc &ID, uint64_t Mask = ProcResourceMasks[I]; if (Mask != SR.first && ((Mask & SR.first) == SR.first)) - Buffers.setBit(I); + Buffers.setBit(getResourceStateIndex(Mask)); } } - // Now set the buffers. 
- if (unsigned NumBuffers = Buffers.countPopulation()) { - ID.Buffers.resize(NumBuffers); - for (unsigned I = 0, E = NumProcResources; I < E && NumBuffers; ++I) { - if (Buffers[I]) { - --NumBuffers; - ID.Buffers[NumBuffers] = ProcResourceMasks[I]; - } - } - } + ID.UsedBuffers = Buffers.getZExtValue(); + ID.UsedProcResUnits = UsedResourceUnits; + ID.UsedProcResGroups = UsedResourceGroups; LLVM_DEBUG({ for (const std::pair &R : ID.Resources) @@ -198,8 +188,12 @@ static void initializeUsedResources(InstrDesc &ID, << "Reserved=" << R.second.isReserved() << ", " << "#Units=" << R.second.NumUnits << ", " << "cy=" << R.second.size() << '\n'; - for (const uint64_t R : ID.Buffers) - dbgs() << "\t\tBuffer Mask=" << format_hex(R, 16) << '\n'; + uint64_t BufferIDs = ID.UsedBuffers; + while (BufferIDs) { + uint64_t Current = BufferIDs & (-BufferIDs); + dbgs() << "\t\tBuffer Mask=" << format_hex(Current, 16) << '\n'; + BufferIDs ^= Current; + } dbgs() << "\t\t Used Units=" << format_hex(ID.UsedProcResUnits, 16) << '\n'; dbgs() << "\t\tUsed Groups=" << format_hex(ID.UsedProcResGroups, 16) << '\n'; @@ -464,9 +458,8 @@ void InstrBuilder::populateReads(InstrDesc &ID, const MCInst &MCI, // FIXME: If an instruction opcode is marked as 'mayLoad', and it has no // "unmodeledSideEffects", then this logic optimistically assumes that any - // extra register operands in the variadic sequence are not register + // extra register operand in the variadic sequence is not a register // definition. - bool AssumeDefsOnly = !MCDesc.mayStore() && MCDesc.mayLoad() && !MCDesc.hasUnmodeledSideEffects(); for (unsigned I = 0, OpIndex = MCDesc.getNumOperands(); @@ -493,7 +486,7 @@ Error InstrBuilder::verifyInstrDesc(const InstrDesc &ID, return ErrorSuccess(); bool UsesMemory = ID.MayLoad || ID.MayStore; - bool UsesBuffers = !ID.Buffers.empty(); + bool UsesBuffers = ID.UsedBuffers; bool UsesResources = !ID.Resources.empty(); if (!UsesMemory && !UsesBuffers && !UsesResources) return ErrorSuccess(); @@ -550,7 +543,7 @@ InstrBuilder::createInstrDescImpl(const MCInst &MCI) { LLVM_DEBUG(dbgs() << "\t\tSchedClassID=" << SchedClassID << '\n'); // Create a new empty descriptor. - std::unique_ptr ID = llvm::make_unique(); + std::unique_ptr ID = std::make_unique(); ID->NumMicroOps = SCDesc.NumMicroOps; ID->SchedClassID = SchedClassID; @@ -619,7 +612,7 @@ InstrBuilder::createInstruction(const MCInst &MCI) { if (!DescOrErr) return DescOrErr.takeError(); const InstrDesc &D = *DescOrErr; - std::unique_ptr NewIS = llvm::make_unique(D); + std::unique_ptr NewIS = std::make_unique(D); // Check if this is a dependency breaking instruction. APInt Mask; @@ -636,8 +629,8 @@ InstrBuilder::createInstruction(const MCInst &MCI) { } // Initialize Reads first. + MCPhysReg RegID = 0; for (const ReadDescriptor &RD : D.Reads) { - int RegID = -1; if (!RD.isImplicitRead()) { // explicit read. const MCOperand &Op = MCI.getOperand(RD.OpIndex); @@ -655,7 +648,6 @@ InstrBuilder::createInstruction(const MCInst &MCI) { continue; // Okay, this is a register operand. Create a ReadState for it. - assert(RegID > 0 && "Invalid register ID found!"); NewIS->getUses().emplace_back(RD, RegID); ReadState &RS = NewIS->getUses().back(); @@ -696,8 +688,8 @@ InstrBuilder::createInstruction(const MCInst &MCI) { // Initialize writes. unsigned WriteIndex = 0; for (const WriteDescriptor &WD : D.Writes) { - unsigned RegID = WD.isImplicitWrite() ? WD.RegisterID - : MCI.getOperand(WD.OpIndex).getReg(); + RegID = WD.isImplicitWrite() ? 
WD.RegisterID + : MCI.getOperand(WD.OpIndex).getReg(); // Check if this is a optional definition that references NoReg. if (WD.IsOptionalDef && !RegID) { ++WriteIndex; diff --git a/lib/MCA/Instruction.cpp b/lib/MCA/Instruction.cpp index 001842bca318..e5f2c4fd1eec 100644 --- a/lib/MCA/Instruction.cpp +++ b/lib/MCA/Instruction.cpp @@ -18,7 +18,7 @@ namespace llvm { namespace mca { -void WriteState::writeStartEvent(unsigned IID, unsigned RegID, +void WriteState::writeStartEvent(unsigned IID, MCPhysReg RegID, unsigned Cycles) { CRD.IID = IID; CRD.RegID = RegID; @@ -27,7 +27,7 @@ void WriteState::writeStartEvent(unsigned IID, unsigned RegID, DependentWrite = nullptr; } -void ReadState::writeStartEvent(unsigned IID, unsigned RegID, unsigned Cycles) { +void ReadState::writeStartEvent(unsigned IID, MCPhysReg RegID, unsigned Cycles) { assert(DependentWrites); assert(CyclesLeft == UNKNOWN_CYCLES); diff --git a/lib/MCA/Stages/DispatchStage.cpp b/lib/MCA/Stages/DispatchStage.cpp index 7334a268e9a6..3a3d82259160 100644 --- a/lib/MCA/Stages/DispatchStage.cpp +++ b/lib/MCA/Stages/DispatchStage.cpp @@ -44,7 +44,7 @@ void DispatchStage::notifyInstructionDispatched(const InstRef &IR, } bool DispatchStage::checkPRF(const InstRef &IR) const { - SmallVector RegDefs; + SmallVector RegDefs; for (const WriteState &RegDef : IR.getInstruction()->getDefs()) RegDefs.emplace_back(RegDef.getRegisterID()); @@ -60,7 +60,7 @@ bool DispatchStage::checkPRF(const InstRef &IR) const { } bool DispatchStage::checkRCU(const InstRef &IR) const { - const unsigned NumMicroOps = IR.getInstruction()->getDesc().NumMicroOps; + const unsigned NumMicroOps = IR.getInstruction()->getNumMicroOps(); if (RCU.isAvailable(NumMicroOps)) return true; notifyEvent( @@ -79,7 +79,7 @@ Error DispatchStage::dispatch(InstRef IR) { assert(!CarryOver && "Cannot dispatch another instruction!"); Instruction &IS = *IR.getInstruction(); const InstrDesc &Desc = IS.getDesc(); - const unsigned NumMicroOps = Desc.NumMicroOps; + const unsigned NumMicroOps = IS.getNumMicroOps(); if (NumMicroOps > DispatchWidth) { assert(AvailableEntries == DispatchWidth); AvailableEntries = 0; @@ -123,9 +123,10 @@ Error DispatchStage::dispatch(InstRef IR) { for (WriteState &WS : IS.getDefs()) PRF.addRegisterWrite(WriteRef(IR.getSourceIndex(), &WS), RegisterFiles); - // Reserve slots in the RCU, and notify the instruction that it has been - // dispatched to the schedulers for execution. - IS.dispatch(RCU.reserveSlot(IR, NumMicroOps)); + // Reserve entries in the reorder buffer. + unsigned RCUTokenID = RCU.dispatch(IR); + // Notify the instruction that it has been dispatched. + IS.dispatch(RCUTokenID); // Notify listeners of the "instruction dispatched" event, // and move IR to the next stage. 
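// A small model of the micro-op accounting in DispatchStage::dispatch() above
// and isAvailable() in the next hunk: an instruction needs
// min(NumMicroOps, DispatchWidth) dispatch slots in the current cycle, and an
// instruction wider than the dispatch width consumes the whole cycle, with the
// remaining micro-ops carried over (the CarryOver handling itself is not shown
// here). Numbers are invented, this is not the MCA implementation.
#include <algorithm>
#include <cassert>

int main() {
  const unsigned DispatchWidth = 4;
  unsigned AvailableEntries = DispatchWidth; // refilled at every cycle start

  auto required = [&](unsigned NumMicroOps) {
    return std::min(NumMicroOps, DispatchWidth);
  };

  // A 2-uop instruction fits: it consumes 2 of the 4 slots.
  assert(required(2) <= AvailableEntries);
  AvailableEntries -= required(2); // 2 slots left this cycle

  // A 6-uop instruction is wider than the dispatch width: it can only start on
  // a fresh cycle, empties it, and 6 - 4 = 2 micro-ops carry over.
  unsigned NumMicroOps = 6;
  AvailableEntries = DispatchWidth; // new cycle
  assert(required(NumMicroOps) <= AvailableEntries);
  unsigned CarryOver = NumMicroOps - DispatchWidth;
  AvailableEntries = 0; // the cycle is fully consumed
  assert(CarryOver == 2 && AvailableEntries == 0);
  return 0;
}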
@@ -155,8 +156,10 @@ Error DispatchStage::cycleStart() { } bool DispatchStage::isAvailable(const InstRef &IR) const { - const InstrDesc &Desc = IR.getInstruction()->getDesc(); - unsigned Required = std::min(Desc.NumMicroOps, DispatchWidth); + const Instruction &Inst = *IR.getInstruction(); + unsigned NumMicroOps = Inst.getNumMicroOps(); + const InstrDesc &Desc = Inst.getDesc(); + unsigned Required = std::min(NumMicroOps, DispatchWidth); if (Required > AvailableEntries) return false; diff --git a/lib/MCA/Stages/EntryStage.cpp b/lib/MCA/Stages/EntryStage.cpp index d2f5613a0fb6..66135790a4cd 100644 --- a/lib/MCA/Stages/EntryStage.cpp +++ b/lib/MCA/Stages/EntryStage.cpp @@ -33,7 +33,7 @@ void EntryStage::getNextInstruction() { if (!SM.hasNext()) return; SourceRef SR = SM.peekNext(); - std::unique_ptr Inst = llvm::make_unique(SR.second); + std::unique_ptr Inst = std::make_unique(SR.second); CurrentInstruction = InstRef(SR.first, Inst.get()); Instructions.emplace_back(std::move(Inst)); SM.updateNext(); diff --git a/lib/MCA/Stages/ExecuteStage.cpp b/lib/MCA/Stages/ExecuteStage.cpp index a2b361fcd1bf..2284ed7f2816 100644 --- a/lib/MCA/Stages/ExecuteStage.cpp +++ b/lib/MCA/Stages/ExecuteStage.cpp @@ -56,12 +56,13 @@ Error ExecuteStage::issueInstruction(InstRef &IR) { SmallVector Ready; HWS.issueInstruction(IR, Used, Pending, Ready); - NumIssuedOpcodes += IR.getInstruction()->getDesc().NumMicroOps; + Instruction &IS = *IR.getInstruction(); + NumIssuedOpcodes += IS.getNumMicroOps(); notifyReservedOrReleasedBuffers(IR, /* Reserved */ false); notifyInstructionIssued(IR, Used); - if (IR.getInstruction()->isExecuted()) { + if (IS.isExecuted()) { notifyInstructionExecuted(IR); // FIXME: add a buffer of executed instructions. if (Error S = moveToTheNextStage(IR)) @@ -199,7 +200,8 @@ Error ExecuteStage::execute(InstRef &IR) { // units have been consumed. 
bool IsReadyInstruction = HWS.dispatch(IR); const Instruction &Inst = *IR.getInstruction(); - NumDispatchedOpcodes += Inst.getDesc().NumMicroOps; + unsigned NumMicroOps = Inst.getNumMicroOps(); + NumDispatchedOpcodes += NumMicroOps; notifyReservedOrReleasedBuffers(IR, /* Reserved */ true); if (!IsReadyInstruction) { @@ -269,13 +271,17 @@ void ExecuteStage::notifyInstructionIssued( void ExecuteStage::notifyReservedOrReleasedBuffers(const InstRef &IR, bool Reserved) const { - const InstrDesc &Desc = IR.getInstruction()->getDesc(); - if (Desc.Buffers.empty()) + uint64_t UsedBuffers = IR.getInstruction()->getDesc().UsedBuffers; + if (!UsedBuffers) return; - SmallVector BufferIDs(Desc.Buffers.begin(), Desc.Buffers.end()); - std::transform(Desc.Buffers.begin(), Desc.Buffers.end(), BufferIDs.begin(), - [&](uint64_t Op) { return HWS.getResourceID(Op); }); + SmallVector BufferIDs(countPopulation(UsedBuffers), 0); + for (unsigned I = 0, E = BufferIDs.size(); I < E; ++I) { + uint64_t CurrentBufferMask = UsedBuffers & (-UsedBuffers); + BufferIDs[I] = HWS.getResourceID(CurrentBufferMask); + UsedBuffers ^= CurrentBufferMask; + } + if (Reserved) { for (HWEventListener *Listener : getListeners()) Listener->onReservedBuffers(IR, BufferIDs); diff --git a/lib/MCA/Stages/RetireStage.cpp b/lib/MCA/Stages/RetireStage.cpp index e1789dd7fa2a..f792af748bce 100644 --- a/lib/MCA/Stages/RetireStage.cpp +++ b/lib/MCA/Stages/RetireStage.cpp @@ -31,11 +31,11 @@ llvm::Error RetireStage::cycleStart() { while (!RCU.isEmpty()) { if (MaxRetirePerCycle != 0 && NumRetired == MaxRetirePerCycle) break; - const RetireControlUnit::RUToken &Current = RCU.peekCurrentToken(); + const RetireControlUnit::RUToken &Current = RCU.getCurrentToken(); if (!Current.Executed) break; - RCU.consumeCurrentToken(); notifyInstructionRetired(Current.IR); + RCU.consumeCurrentToken(); NumRetired++; } @@ -52,6 +52,10 @@ void RetireStage::notifyInstructionRetired(const InstRef &IR) const { llvm::SmallVector FreedRegs(PRF.getNumRegisterFiles()); const Instruction &Inst = *IR.getInstruction(); + // Release the load/store queue entries. 
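The UsedBuffers loop in the ExecuteStage hunk above peels one buffer mask per iteration with the lowest-set-bit idiom. A self-contained sketch of that walk on plain uint64_t values (no llvm-mca types; __builtin_popcountll assumed as the popcount, matching GCC/Clang):

#include <cstdint>
#include <vector>

static std::vector<uint64_t> expandBufferMask(uint64_t UsedBuffers) {
  std::vector<uint64_t> Masks;
  Masks.reserve(__builtin_popcountll(UsedBuffers)); // one entry per set bit
  while (UsedBuffers) {
    uint64_t Lowest = UsedBuffers & (~UsedBuffers + 1); // x & -x: lowest set bit
    Masks.push_back(Lowest);
    UsedBuffers ^= Lowest; // clear the bit just extracted
  }
  return Masks;
}
// expandBufferMask(0b10110) yields {0b10, 0b100, 0b10000}.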
+ if (Inst.isMemOp()) + LSU.onInstructionRetired(IR); + for (const WriteState &WS : Inst.getDefs()) PRF.removeRegisterWrite(WS, FreedRegs); notifyEvent(HWInstructionRetiredEvent(IR, FreedRegs)); diff --git a/lib/Object/Archive.cpp b/lib/Object/Archive.cpp index 49e66f46ab3f..148c011d9cd4 100644 --- a/lib/Object/Archive.cpp +++ b/lib/Object/Archive.cpp @@ -223,8 +223,8 @@ Expected ArchiveMemberHeader::getName(uint64_t Size) const { return Name.drop_back(1); } -Expected ArchiveMemberHeader::getSize() const { - uint32_t Ret; +Expected ArchiveMemberHeader::getSize() const { + uint64_t Ret; if (StringRef(ArMemHdr->Size, sizeof(ArMemHdr->Size)).rtrim(" ").getAsInteger(10, Ret)) { std::string Buf; @@ -550,7 +550,7 @@ Archive::Archive(MemoryBufferRef Source, Error &Err) } else if (Buffer.startswith(Magic)) { IsThin = false; } else { - Err = make_error("File too small to be an archive", + Err = make_error("file too small to be an archive", object_error::invalid_file_type); return; } diff --git a/lib/Object/ArchiveWriter.cpp b/lib/Object/ArchiveWriter.cpp index 228f6b40c5ec..5234b0e18233 100644 --- a/lib/Object/ArchiveWriter.cpp +++ b/lib/Object/ArchiveWriter.cpp @@ -16,8 +16,10 @@ #include "llvm/BinaryFormat/Magic.h" #include "llvm/IR/LLVMContext.h" #include "llvm/Object/Archive.h" +#include "llvm/Object/Error.h" #include "llvm/Object/ObjectFile.h" #include "llvm/Object/SymbolicFile.h" +#include "llvm/Support/Alignment.h" #include "llvm/Support/EndianStream.h" #include "llvm/Support/Errc.h" #include "llvm/Support/ErrorHandling.h" @@ -147,7 +149,7 @@ static void print(raw_ostream &Out, object::Archive::Kind Kind, T Val) { static void printRestOfMemberHeader( raw_ostream &Out, const sys::TimePoint &ModTime, - unsigned UID, unsigned GID, unsigned Perms, unsigned Size) { + unsigned UID, unsigned GID, unsigned Perms, uint64_t Size) { printWithSpacePadding(Out, sys::toTimeT(ModTime), 12); // The format has only 6 chars for uid and gid. Truncate if the provided @@ -164,7 +166,7 @@ static void printGNUSmallMemberHeader(raw_ostream &Out, StringRef Name, const sys::TimePoint &ModTime, unsigned UID, unsigned GID, unsigned Perms, - unsigned Size) { + uint64_t Size) { printWithSpacePadding(Out, Twine(Name) + "/", 16); printRestOfMemberHeader(Out, ModTime, UID, GID, Perms, Size); } @@ -172,11 +174,10 @@ printGNUSmallMemberHeader(raw_ostream &Out, StringRef Name, static void printBSDMemberHeader(raw_ostream &Out, uint64_t Pos, StringRef Name, const sys::TimePoint &ModTime, - unsigned UID, unsigned GID, unsigned Perms, - unsigned Size) { + unsigned UID, unsigned GID, unsigned Perms, uint64_t Size) { uint64_t PosAfterHeader = Pos + 60 + Name.size(); // Pad so that even 64 bit object files are aligned. 
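The printBSDMemberHeader hunk that follows, like the string-table and symbol-table hunks after it, switches from OffsetToAlignment(Value, N) to offsetToAlignment(Value, Align(N)); in both spellings the result is the number of padding bytes needed to reach the next N-byte boundary. A self-contained sketch of that arithmetic without the llvm::Align type, shown only to make the padding rule explicit:

#include <cassert>
#include <cstdint>

static uint64_t paddingTo(uint64_t Offset, uint64_t Alignment) {
  assert(Alignment != 0 && (Alignment & (Alignment - 1)) == 0 &&
         "alignment must be a power of two");
  return (0 - Offset) & (Alignment - 1); // 0 when Offset is already aligned
}
// paddingTo(61, 8) == 3: a 61-byte header is padded out to 64 bytes.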
- unsigned Pad = OffsetToAlignment(PosAfterHeader, 8); + unsigned Pad = offsetToAlignment(PosAfterHeader, Align(8)); unsigned NameWithPadding = Name.size() + Pad; printWithSpacePadding(Out, Twine("#1/") + Twine(NameWithPadding), 16); printRestOfMemberHeader(Out, ModTime, UID, GID, Perms, @@ -208,7 +209,7 @@ static void printMemberHeader(raw_ostream &Out, uint64_t Pos, raw_ostream &StringTable, StringMap &MemberNames, object::Archive::Kind Kind, bool Thin, const NewArchiveMember &M, - sys::TimePoint ModTime, unsigned Size) { + sys::TimePoint ModTime, uint64_t Size) { if (isBSDLike(Kind)) return printBSDMemberHeader(Out, Pos, M.MemberName, ModTime, M.UID, M.GID, M.Perms, Size); @@ -243,7 +244,7 @@ struct MemberData { static MemberData computeStringTable(StringRef Names) { unsigned Size = Names.size(); - unsigned Pad = OffsetToAlignment(Size, 2); + unsigned Pad = offsetToAlignment(Size, Align(2)); std::string Header; raw_string_ostream Out(Header); printWithSpacePadding(Out, "//", 48); @@ -307,8 +308,8 @@ static void writeSymbolTable(raw_ostream &Out, object::Archive::Kind Kind, // least 4-byte aligned for 32-bit content. Opt for the larger encoding // uniformly. // We do this for all bsd formats because it simplifies aligning members. - unsigned Alignment = isBSDLike(Kind) ? 8 : 2; - unsigned Pad = OffsetToAlignment(Size, Alignment); + const Align Alignment(isBSDLike(Kind) ? 8 : 2); + unsigned Pad = offsetToAlignment(Size, Alignment); Size += Pad; if (isBSDLike(Kind)) { @@ -464,8 +465,9 @@ computeMemberData(raw_ostream &StringTable, raw_ostream &SymNames, // uniformly. This matches the behaviour with cctools and ensures that ld64 // is happy with archives that we generate. unsigned MemberPadding = - isDarwin(Kind) ? OffsetToAlignment(Data.size(), 8) : 0; - unsigned TailPadding = OffsetToAlignment(Data.size() + MemberPadding, 2); + isDarwin(Kind) ? 
offsetToAlignment(Data.size(), Align(8)) : 0; + unsigned TailPadding = + offsetToAlignment(Data.size() + MemberPadding, Align(2)); StringRef Padding = StringRef(PaddingData, MemberPadding + TailPadding); sys::TimePoint ModTime; @@ -474,8 +476,17 @@ computeMemberData(raw_ostream &StringTable, raw_ostream &SymNames, ModTime = sys::toTimePoint(FilenameCount[M.MemberName]++); else ModTime = M.ModTime; + + uint64_t Size = Buf.getBufferSize() + MemberPadding; + if (Size > object::Archive::MaxMemberSize) { + std::string StringMsg = + "File " + M.MemberName.str() + " exceeds size limit"; + return make_error( + std::move(StringMsg), object::object_error::parse_failed); + } + printMemberHeader(Out, Pos, StringTable, MemberNames, Kind, Thin, M, - ModTime, Buf.getBufferSize() + MemberPadding); + ModTime, Size); Out.flush(); Expected> Symbols = diff --git a/lib/Object/Binary.cpp b/lib/Object/Binary.cpp index a953c1d8cb80..944d2bc1bca7 100644 --- a/lib/Object/Binary.cpp +++ b/lib/Object/Binary.cpp @@ -18,6 +18,7 @@ #include "llvm/Object/MachOUniversal.h" #include "llvm/Object/Minidump.h" #include "llvm/Object/ObjectFile.h" +#include "llvm/Object/TapiUniversal.h" #include "llvm/Object/WindowsResource.h" #include "llvm/Support/Error.h" #include "llvm/Support/ErrorHandling.h" @@ -86,6 +87,8 @@ Expected> object::createBinary(MemoryBufferRef Buffer, return errorCodeToError(object_error::invalid_file_type); case file_magic::minidump: return MinidumpFile::create(Buffer); + case file_magic::tapi_file: + return TapiUniversal::create(Buffer); } llvm_unreachable("Unexpected Binary File Type"); } diff --git a/lib/Object/COFFObjectFile.cpp b/lib/Object/COFFObjectFile.cpp index 854664e679df..2c0f6dc2b1e9 100644 --- a/lib/Object/COFFObjectFile.cpp +++ b/lib/Object/COFFObjectFile.cpp @@ -936,29 +936,6 @@ iterator_range COFFObjectFile::base_relocs() const { return make_range(base_reloc_begin(), base_reloc_end()); } -std::error_code -COFFObjectFile::getCOFFHeader(const coff_file_header *&Res) const { - Res = COFFHeader; - return std::error_code(); -} - -std::error_code -COFFObjectFile::getCOFFBigObjHeader(const coff_bigobj_file_header *&Res) const { - Res = COFFBigObjHeader; - return std::error_code(); -} - -std::error_code COFFObjectFile::getPE32Header(const pe32_header *&Res) const { - Res = PE32Header; - return std::error_code(); -} - -std::error_code -COFFObjectFile::getPE32PlusHeader(const pe32plus_header *&Res) const { - Res = PE32PlusHeader; - return std::error_code(); -} - std::error_code COFFObjectFile::getDataDirectory(uint32_t Index, const data_directory *&Res) const { @@ -994,11 +971,12 @@ std::error_code COFFObjectFile::getSection(int32_t Index, std::error_code COFFObjectFile::getSection(StringRef SectionName, const coff_section *&Result) const { Result = nullptr; - StringRef SecName; for (const SectionRef &Section : sections()) { - if (std::error_code E = Section.getName(SecName)) - return E; - if (SecName == SectionName) { + auto NameOrErr = Section.getName(); + if (!NameOrErr) + return errorToErrorCode(NameOrErr.takeError()); + + if (*NameOrErr == SectionName) { Result = getCOFFSection(Section); return std::error_code(); } @@ -1684,9 +1662,12 @@ std::error_code BaseRelocRef::getRVA(uint32_t &Result) const { return std::error_code(); } -#define RETURN_IF_ERROR(E) \ - if (E) \ - return E; +#define RETURN_IF_ERROR(Expr) \ + do { \ + Error E = (Expr); \ + if (E) \ + return std::move(E); \ + } while (0) Expected> ResourceSectionRef::getDirStringAtOffset(uint32_t Offset) { @@ -1715,11 +1696,168 @@ 
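The reworked RETURN_IF_ERROR above wraps its body in do { ... } while (0) so the macro expands to a single statement (safe after an unbraced if), and returns std::move(E) because llvm::Error is move-only. A minimal illustration with a hypothetical helper, not taken from this file:

#include "llvm/Support/Error.h"

#define RETURN_IF_ERROR_SKETCH(Expr)                                           \
  do {                                                                         \
    llvm::Error E = (Expr);                                                    \
    if (E)                                                                     \
      return std::move(E);                                                     \
  } while (0)

static llvm::Error mightFail(bool Fail) { // hypothetical helper
  if (Fail)
    return llvm::createStringError(llvm::inconvertibleErrorCode(), "boom");
  return llvm::Error::success();
}

static llvm::Error caller(bool Fail) {
  if (Fail)
    RETURN_IF_ERROR_SKETCH(mightFail(Fail)); // expands to one statement
  return llvm::Error::success();
}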
ResourceSectionRef::getTableAtOffset(uint32_t Offset) { return *Table; } +Expected +ResourceSectionRef::getTableEntryAtOffset(uint32_t Offset) { + const coff_resource_dir_entry *Entry = nullptr; + + BinaryStreamReader Reader(BBS); + Reader.setOffset(Offset); + RETURN_IF_ERROR(Reader.readObject(Entry)); + assert(Entry != nullptr); + return *Entry; +} + +Expected +ResourceSectionRef::getDataEntryAtOffset(uint32_t Offset) { + const coff_resource_data_entry *Entry = nullptr; + + BinaryStreamReader Reader(BBS); + Reader.setOffset(Offset); + RETURN_IF_ERROR(Reader.readObject(Entry)); + assert(Entry != nullptr); + return *Entry; +} + Expected ResourceSectionRef::getEntrySubDir(const coff_resource_dir_entry &Entry) { + assert(Entry.Offset.isSubDir()); return getTableAtOffset(Entry.Offset.value()); } +Expected +ResourceSectionRef::getEntryData(const coff_resource_dir_entry &Entry) { + assert(!Entry.Offset.isSubDir()); + return getDataEntryAtOffset(Entry.Offset.value()); +} + Expected ResourceSectionRef::getBaseTable() { return getTableAtOffset(0); } + +Expected +ResourceSectionRef::getTableEntry(const coff_resource_dir_table &Table, + uint32_t Index) { + if (Index >= (uint32_t)(Table.NumberOfNameEntries + Table.NumberOfIDEntries)) + return createStringError(object_error::parse_failed, "index out of range"); + const uint8_t *TablePtr = reinterpret_cast(&Table); + ptrdiff_t TableOffset = TablePtr - BBS.data().data(); + return getTableEntryAtOffset(TableOffset + sizeof(Table) + + Index * sizeof(coff_resource_dir_entry)); +} + +Error ResourceSectionRef::load(const COFFObjectFile *O) { + for (const SectionRef &S : O->sections()) { + Expected Name = S.getName(); + if (!Name) + return Name.takeError(); + + if (*Name == ".rsrc" || *Name == ".rsrc$01") + return load(O, S); + } + return createStringError(object_error::parse_failed, + "no resource section found"); +} + +Error ResourceSectionRef::load(const COFFObjectFile *O, const SectionRef &S) { + Obj = O; + Section = S; + Expected Contents = Section.getContents(); + if (!Contents) + return Contents.takeError(); + BBS = BinaryByteStream(*Contents, support::little); + const coff_section *COFFSect = Obj->getCOFFSection(Section); + ArrayRef OrigRelocs = Obj->getRelocations(COFFSect); + Relocs.reserve(OrigRelocs.size()); + for (const coff_relocation &R : OrigRelocs) + Relocs.push_back(&R); + std::sort(Relocs.begin(), Relocs.end(), + [](const coff_relocation *A, const coff_relocation *B) { + return A->VirtualAddress < B->VirtualAddress; + }); + return Error::success(); +} + +Expected +ResourceSectionRef::getContents(const coff_resource_data_entry &Entry) { + if (!Obj) + return createStringError(object_error::parse_failed, "no object provided"); + + // Find a potential relocation at the DataRVA field (first member of + // the coff_resource_data_entry struct). + const uint8_t *EntryPtr = reinterpret_cast(&Entry); + ptrdiff_t EntryOffset = EntryPtr - BBS.data().data(); + coff_relocation RelocTarget{ulittle32_t(EntryOffset), ulittle32_t(0), + ulittle16_t(0)}; + auto RelocsForOffset = + std::equal_range(Relocs.begin(), Relocs.end(), &RelocTarget, + [](const coff_relocation *A, const coff_relocation *B) { + return A->VirtualAddress < B->VirtualAddress; + }); + + if (RelocsForOffset.first != RelocsForOffset.second) { + // We found a relocation with the right offset. Check that it does have + // the expected type. 
+ const coff_relocation &R = **RelocsForOffset.first; + uint16_t RVAReloc; + switch (Obj->getMachine()) { + case COFF::IMAGE_FILE_MACHINE_I386: + RVAReloc = COFF::IMAGE_REL_I386_DIR32NB; + break; + case COFF::IMAGE_FILE_MACHINE_AMD64: + RVAReloc = COFF::IMAGE_REL_AMD64_ADDR32NB; + break; + case COFF::IMAGE_FILE_MACHINE_ARMNT: + RVAReloc = COFF::IMAGE_REL_ARM_ADDR32NB; + break; + case COFF::IMAGE_FILE_MACHINE_ARM64: + RVAReloc = COFF::IMAGE_REL_ARM64_ADDR32NB; + break; + default: + return createStringError(object_error::parse_failed, + "unsupported architecture"); + } + if (R.Type != RVAReloc) + return createStringError(object_error::parse_failed, + "unexpected relocation type"); + // Get the relocation's symbol + Expected Sym = Obj->getSymbol(R.SymbolTableIndex); + if (!Sym) + return Sym.takeError(); + const coff_section *Section = nullptr; + // And the symbol's section + if (std::error_code EC = Obj->getSection(Sym->getSectionNumber(), Section)) + return errorCodeToError(EC); + // Add the initial value of DataRVA to the symbol's offset to find the + // data it points at. + uint64_t Offset = Entry.DataRVA + Sym->getValue(); + ArrayRef Contents; + if (Error E = Obj->getSectionContents(Section, Contents)) + return std::move(E); + if (Offset + Entry.DataSize > Contents.size()) + return createStringError(object_error::parse_failed, + "data outside of section"); + // Return a reference to the data inside the section. + return StringRef(reinterpret_cast(Contents.data()) + Offset, + Entry.DataSize); + } else { + // Relocatable objects need a relocation for the DataRVA field. + if (Obj->isRelocatableObject()) + return createStringError(object_error::parse_failed, + "no relocation found for DataRVA"); + + // Locate the section that contains the address that DataRVA points at. + uint64_t VA = Entry.DataRVA + Obj->getImageBase(); + for (const SectionRef &S : Obj->sections()) { + if (VA >= S.getAddress() && + VA + Entry.DataSize <= S.getAddress() + S.getSize()) { + uint64_t Offset = VA - S.getAddress(); + Expected Contents = S.getContents(); + if (!Contents) + return Contents.takeError(); + return Contents->slice(Offset, Offset + Entry.DataSize); + } + } + return createStringError(object_error::parse_failed, + "address not found in image"); + } +} diff --git a/lib/Object/Decompressor.cpp b/lib/Object/Decompressor.cpp index ec15e6f69ada..11efd857d1a1 100644 --- a/lib/Object/Decompressor.cpp +++ b/lib/Object/Decompressor.cpp @@ -56,7 +56,7 @@ Error Decompressor::consumeCompressedZLibHeader(bool Is64Bit, return createError("corrupted compressed section header"); DataExtractor Extractor(SectionData, IsLittleEndian, 0); - uint32_t Offset = 0; + uint64_t Offset = 0; if (Extractor.getUnsigned(&Offset, Is64Bit ? 
sizeof(Elf64_Word) : sizeof(Elf32_Word)) != ELFCOMPRESS_ZLIB) @@ -77,10 +77,15 @@ bool Decompressor::isGnuStyle(StringRef Name) { } bool Decompressor::isCompressed(const object::SectionRef &Section) { - StringRef Name; - if (Section.getName(Name)) - return false; - return Section.isCompressed() || isGnuStyle(Name); + if (Section.isCompressed()) + return true; + + Expected SecNameOrErr = Section.getName(); + if (SecNameOrErr) + return isGnuStyle(*SecNameOrErr); + + consumeError(SecNameOrErr.takeError()); + return false; } bool Decompressor::isCompressedELFSection(uint64_t Flags, StringRef Name) { diff --git a/lib/Object/ELF.cpp b/lib/Object/ELF.cpp index 8660b1a64bdd..d491288579df 100644 --- a/lib/Object/ELF.cpp +++ b/lib/Object/ELF.cpp @@ -255,6 +255,8 @@ StringRef llvm::object::getELFSectionTypeName(uint32_t Machine, unsigned Type) { STRINGIFY_ENUM_CASE(ELF, SHT_LLVM_ADDRSIG); STRINGIFY_ENUM_CASE(ELF, SHT_LLVM_DEPENDENT_LIBRARIES); STRINGIFY_ENUM_CASE(ELF, SHT_LLVM_SYMPART); + STRINGIFY_ENUM_CASE(ELF, SHT_LLVM_PART_EHDR); + STRINGIFY_ENUM_CASE(ELF, SHT_LLVM_PART_PHDR); STRINGIFY_ENUM_CASE(ELF, SHT_GNU_ATTRIBUTES); STRINGIFY_ENUM_CASE(ELF, SHT_GNU_HASH); STRINGIFY_ENUM_CASE(ELF, SHT_GNU_verdef); diff --git a/lib/Object/ELFObjectFile.cpp b/lib/Object/ELFObjectFile.cpp index c7b715793048..bf6ffd6c37b9 100644 --- a/lib/Object/ELFObjectFile.cpp +++ b/lib/Object/ELFObjectFile.cpp @@ -43,7 +43,16 @@ const EnumEntry llvm::object::ElfSymbolTypes[NumElfSymbolTypes] = { {"File", "FILE", ELF::STT_FILE}, {"Common", "COMMON", ELF::STT_COMMON}, {"TLS", "TLS", ELF::STT_TLS}, - {"GNU_IFunc", "IFUNC", ELF::STT_GNU_IFUNC}}; + {"Unknown", ": 7", 7}, + {"Unknown", ": 8", 8}, + {"Unknown", ": 9", 9}, + {"GNU_IFunc", "IFUNC", ELF::STT_GNU_IFUNC}, + {"OS Specific", ": 11", 11}, + {"OS Specific", ": 12", 12}, + {"Proc Specific", ": 13", 13}, + {"Proc Specific", ": 14", 14}, + {"Proc Specific", ": 15", 15} +}; ELFObjectFileBase::ELFObjectFileBase(unsigned int Type, MemoryBufferRef Source) : ObjectFile(Type, Source) {} @@ -54,7 +63,7 @@ createPtr(MemoryBufferRef Object) { auto Ret = ELFObjectFile::create(Object); if (Error E = Ret.takeError()) return std::move(E); - return make_unique>(std::move(*Ret)); + return std::make_unique>(std::move(*Ret)); } Expected> @@ -194,7 +203,7 @@ SubtargetFeatures ELFObjectFileBase::getARMFeatures() const { default: break; case ARMBuildAttrs::Not_Allowed: - Features.AddFeature("vfp2d16sp", false); + Features.AddFeature("vfp2sp", false); Features.AddFeature("vfp3d16sp", false); Features.AddFeature("vfp4d16sp", false); break; @@ -347,6 +356,21 @@ void ELFObjectFileBase::setARMSubArch(Triple &TheTriple) const { case ARMBuildAttrs::v7E_M: Triple += "v7em"; break; + case ARMBuildAttrs::v8_A: + Triple += "v8a"; + break; + case ARMBuildAttrs::v8_R: + Triple += "v8r"; + break; + case ARMBuildAttrs::v8_M_Base: + Triple += "v8m.base"; + break; + case ARMBuildAttrs::v8_M_Main: + Triple += "v8m.main"; + break; + case ARMBuildAttrs::v8_1_M_Main: + Triple += "v8.1m.main"; + break; } } if (!isLittleEndian()) @@ -383,9 +407,13 @@ ELFObjectFileBase::getPltAddresses() const { return {}; Optional Plt = None, RelaPlt = None, GotPlt = None; for (const SectionRef &Section : sections()) { - StringRef Name; - if (Section.getName(Name)) + Expected NameOrErr = Section.getName(); + if (!NameOrErr) { + consumeError(NameOrErr.takeError()); continue; + } + StringRef Name = *NameOrErr; + if (Name == ".plt") Plt = Section; else if (Name == ".rela.plt" || Name == ".rel.plt") diff --git 
a/lib/Object/MachOObjectFile.cpp b/lib/Object/MachOObjectFile.cpp index 5aec844003c0..c0c873f97354 100644 --- a/lib/Object/MachOObjectFile.cpp +++ b/lib/Object/MachOObjectFile.cpp @@ -57,12 +57,6 @@ namespace { } // end anonymous namespace -static const std::array validArchs = { - "i386", "x86_64", "x86_64h", "armv4t", "arm", "armv5e", - "armv6", "armv6m", "armv7", "armv7em", "armv7k", "armv7m", - "armv7s", "arm64", "arm64_32", "ppc", "ppc64", -}; - static Error malformedError(const Twine &Msg) { return make_error("truncated or malformed object (" + Msg + ")", @@ -1951,6 +1945,11 @@ uint64_t MachOObjectFile::getSectionSize(DataRefImpl Sec) const { return SectSize; } +ArrayRef MachOObjectFile::getSectionContents(uint32_t Offset, + uint64_t Size) const { + return arrayRefFromStringRef(getData().substr(Offset, Size)); +} + Expected> MachOObjectFile::getSectionContents(DataRefImpl Sec) const { uint32_t Offset; @@ -1966,7 +1965,7 @@ MachOObjectFile::getSectionContents(DataRefImpl Sec) const { Size = Sect.size; } - return arrayRefFromStringRef(getData().substr(Offset, Size)); + return getSectionContents(Offset, Size); } uint64_t MachOObjectFile::getSectionAlignment(DataRefImpl Sec) const { @@ -1992,13 +1991,12 @@ Expected MachOObjectFile::getSection(unsigned SectionIndex) const { } Expected MachOObjectFile::getSection(StringRef SectionName) const { - StringRef SecName; for (const SectionRef &Section : sections()) { - if (std::error_code E = Section.getName(SecName)) - return errorCodeToError(E); - if (SecName == SectionName) { + auto NameOrErr = Section.getName(); + if (!NameOrErr) + return NameOrErr.takeError(); + if (*NameOrErr == SectionName) return Section; - } } return errorCodeToError(object_error::parse_failed); } @@ -2724,11 +2722,19 @@ Triple MachOObjectFile::getHostArch() { } bool MachOObjectFile::isValidArch(StringRef ArchFlag) { - return std::find(validArchs.cbegin(), validArchs.cend(), ArchFlag) != - validArchs.cend(); + auto validArchs = getValidArchs(); + return llvm::find(validArchs, ArchFlag) != validArchs.end(); } -ArrayRef MachOObjectFile::getValidArchs() { return validArchs; } +ArrayRef MachOObjectFile::getValidArchs() { + static const std::array validArchs = {{ + "i386", "x86_64", "x86_64h", "armv4t", "arm", "armv5e", + "armv6", "armv6m", "armv7", "armv7em", "armv7k", "armv7m", + "armv7s", "arm64", "arm64_32", "ppc", "ppc64", + }}; + + return validArchs; +} Triple::ArchType MachOObjectFile::getArch() const { return getArch(getCPUType(*this)); @@ -3427,7 +3433,7 @@ iterator_range MachOObjectFile::rebaseTable(Error &Err, MachOObjectFile *O, ArrayRef Opcodes, bool is64) { if (O->BindRebaseSectionTable == nullptr) - O->BindRebaseSectionTable = llvm::make_unique(O); + O->BindRebaseSectionTable = std::make_unique(O); MachORebaseEntry Start(&Err, O, Opcodes, is64); Start.moveToFirst(); @@ -3993,7 +3999,11 @@ BindRebaseSegInfo::BindRebaseSegInfo(const object::MachOObjectFile *Obj) { uint64_t CurSegAddress; for (const SectionRef &Section : Obj->sections()) { SectionInfo Info; - Section.getName(Info.SectionName); + Expected NameOrErr = Section.getName(); + if (!NameOrErr) + consumeError(NameOrErr.takeError()); + else + Info.SectionName = *NameOrErr; Info.Address = Section.getAddress(); Info.Size = Section.getSize(); Info.SegmentName = @@ -4094,7 +4104,7 @@ MachOObjectFile::bindTable(Error &Err, MachOObjectFile *O, ArrayRef Opcodes, bool is64, MachOBindEntry::Kind BKind) { if (O->BindRebaseSectionTable == nullptr) - O->BindRebaseSectionTable = llvm::make_unique(O); + 
O->BindRebaseSectionTable = std::make_unique(O); MachOBindEntry Start(&Err, O, Opcodes, is64, BKind); Start.moveToFirst(); @@ -4610,7 +4620,7 @@ void MachOObjectFile::ReadULEB128s(uint64_t Index, SmallVectorImpl &Out) const { DataExtractor extractor(ObjectFile::getData(), true, 0); - uint32_t offset = Index; + uint64_t offset = Index; uint64_t data = 0; while (uint64_t delta = extractor.getULEB128(&offset)) { data += delta; diff --git a/lib/Object/MachOUniversal.cpp b/lib/Object/MachOUniversal.cpp index b3f0993412c6..a178ecde949e 100644 --- a/lib/Object/MachOUniversal.cpp +++ b/lib/Object/MachOUniversal.cpp @@ -155,15 +155,16 @@ MachOUniversalBinary::MachOUniversalBinary(MemoryBufferRef Source, Error &Err) ") extends past the end of the file"); return; } -#define MAXSECTALIGN 15 /* 2**15 or 0x8000 */ - if (A.getAlign() > MAXSECTALIGN) { - Err = malformedError("align (2^" + Twine(A.getAlign()) + ") too large " - "for cputype (" + Twine(A.getCPUType()) + ") cpusubtype (" + - Twine(A.getCPUSubType() & ~MachO::CPU_SUBTYPE_MASK) + - ") (maximum 2^" + Twine(MAXSECTALIGN) + ")"); + + if (A.getAlign() > MaxSectionAlignment) { + Err = malformedError("align (2^" + Twine(A.getAlign()) + + ") too large for cputype (" + Twine(A.getCPUType()) + + ") cpusubtype (" + + Twine(A.getCPUSubType() & ~MachO::CPU_SUBTYPE_MASK) + + ") (maximum 2^" + Twine(MaxSectionAlignment) + ")"); return; } - if(A.getOffset() % (1 << A.getAlign()) != 0){ + if(A.getOffset() % (1ull << A.getAlign()) != 0){ Err = malformedError("offset: " + Twine(A.getOffset()) + " for cputype (" + Twine(A.getCPUType()) + ") cpusubtype (" + Twine(A.getCPUSubType() & ~MachO::CPU_SUBTYPE_MASK) + @@ -209,19 +210,34 @@ MachOUniversalBinary::MachOUniversalBinary(MemoryBufferRef Source, Error &Err) Err = Error::success(); } -Expected> +Expected MachOUniversalBinary::getObjectForArch(StringRef ArchName) const { if (Triple(ArchName).getArch() == Triple::ArchType::UnknownArch) return make_error("Unknown architecture " "named: " + ArchName, object_error::arch_not_found); - - for (auto &Obj : objects()) + for (const auto &Obj : objects()) if (Obj.getArchFlagName() == ArchName) - return Obj.getAsObjectFile(); + return Obj; return make_error("fat file does not " "contain " + ArchName, object_error::arch_not_found); } + +Expected> +MachOUniversalBinary::getMachOObjectForArch(StringRef ArchName) const { + Expected O = getObjectForArch(ArchName); + if (!O) + return O.takeError(); + return O->getAsObjectFile(); +} + +Expected> +MachOUniversalBinary::getArchiveForArch(StringRef ArchName) const { + Expected O = getObjectForArch(ArchName); + if (!O) + return O.takeError(); + return O->getAsArchive(); +} diff --git a/lib/Object/Minidump.cpp b/lib/Object/Minidump.cpp index 7b5b21558699..3e932fe7be28 100644 --- a/lib/Object/Minidump.cpp +++ b/lib/Object/Minidump.cpp @@ -53,13 +53,30 @@ Expected MinidumpFile::getString(size_t Offset) const { return Result; } +Expected> +MinidumpFile::getMemoryInfoList() const { + Optional> Stream = getRawStream(StreamType::MemoryInfoList); + if (!Stream) + return createError("No such stream"); + auto ExpectedHeader = + getDataSliceAs(*Stream, 0, 1); + if (!ExpectedHeader) + return ExpectedHeader.takeError(); + const minidump::MemoryInfoListHeader &H = ExpectedHeader.get()[0]; + Expected> Data = + getDataSlice(*Stream, H.SizeOfHeader, H.SizeOfEntry * H.NumberOfEntries); + if (!Data) + return Data.takeError(); + return make_range(MemoryInfoIterator(*Data, H.SizeOfEntry), + MemoryInfoIterator({}, H.SizeOfEntry)); +} + template -Expected> 
MinidumpFile::getListStream(StreamType Stream) const { - auto OptionalStream = getRawStream(Stream); - if (!OptionalStream) +Expected> MinidumpFile::getListStream(StreamType Type) const { + Optional> Stream = getRawStream(Type); + if (!Stream) return createError("No such stream"); - auto ExpectedSize = - getDataSliceAs(*OptionalStream, 0, 1); + auto ExpectedSize = getDataSliceAs(*Stream, 0, 1); if (!ExpectedSize) return ExpectedSize.takeError(); @@ -69,10 +86,10 @@ Expected> MinidumpFile::getListStream(StreamType Stream) const { // Some producers insert additional padding bytes to align the list to an // 8-byte boundary. Check for that by comparing the list size with the overall // stream size. - if (ListOffset + sizeof(T) * ListSize < OptionalStream->size()) + if (ListOffset + sizeof(T) * ListSize < Stream->size()) ListOffset = 8; - return getDataSliceAs(*OptionalStream, ListOffset, ListSize); + return getDataSliceAs(*Stream, ListOffset, ListSize); } template Expected> MinidumpFile::getListStream(StreamType) const; @@ -109,13 +126,14 @@ MinidumpFile::create(MemoryBufferRef Source) { return ExpectedStreams.takeError(); DenseMap StreamMap; - for (const auto &Stream : llvm::enumerate(*ExpectedStreams)) { - StreamType Type = Stream.value().Type; - const LocationDescriptor &Loc = Stream.value().Location; + for (const auto &StreamDescriptor : llvm::enumerate(*ExpectedStreams)) { + StreamType Type = StreamDescriptor.value().Type; + const LocationDescriptor &Loc = StreamDescriptor.value().Location; - auto ExpectedStream = getDataSlice(Data, Loc.RVA, Loc.DataSize); - if (!ExpectedStream) - return ExpectedStream.takeError(); + Expected> Stream = + getDataSlice(Data, Loc.RVA, Loc.DataSize); + if (!Stream) + return Stream.takeError(); if (Type == StreamType::Unused && Loc.DataSize == 0) { // Ignore dummy streams. This is technically ill-formed, but a number of @@ -128,7 +146,7 @@ MinidumpFile::create(MemoryBufferRef Source) { return createError("Cannot handle one of the minidump streams"); // Update the directory map, checking for duplicate stream types. 
- if (!StreamMap.try_emplace(Type, Stream.index()).second) + if (!StreamMap.try_emplace(Type, StreamDescriptor.index()).second) return createError("Duplicate stream type"); } diff --git a/lib/Object/Object.cpp b/lib/Object/Object.cpp index d84798cc6dd0..b486e9f5c9a8 100644 --- a/lib/Object/Object.cpp +++ b/lib/Object/Object.cpp @@ -138,7 +138,7 @@ LLVMBinaryRef LLVMMachOUniversalBinaryCopyObjectForArch(LLVMBinaryRef BR, char **ErrorMessage) { auto universal = cast(unwrap(BR)); Expected> ObjOrErr( - universal->getObjectForArch({Arch, ArchLen})); + universal->getMachOObjectForArch({Arch, ArchLen})); if (!ObjOrErr) { *ErrorMessage = strdup(toString(ObjOrErr.takeError()).c_str()); return nullptr; @@ -251,10 +251,10 @@ void LLVMMoveToNextSymbol(LLVMSymbolIteratorRef SI) { // SectionRef accessors const char *LLVMGetSectionName(LLVMSectionIteratorRef SI) { - StringRef ret; - if (std::error_code ec = (*unwrap(SI))->getName(ret)) - report_fatal_error(ec.message()); - return ret.data(); + auto NameOrErr = (*unwrap(SI))->getName(); + if (!NameOrErr) + report_fatal_error(NameOrErr.takeError()); + return NameOrErr->data(); } uint64_t LLVMGetSectionSize(LLVMSectionIteratorRef SI) { diff --git a/lib/Object/ObjectFile.cpp b/lib/Object/ObjectFile.cpp index 101f5dcc0821..e0e63a5a7d76 100644 --- a/lib/Object/ObjectFile.cpp +++ b/lib/Object/ObjectFile.cpp @@ -67,8 +67,10 @@ Error ObjectFile::printSymbolName(raw_ostream &OS, DataRefImpl Symb) const { uint32_t ObjectFile::getSymbolAlignment(DataRefImpl DRI) const { return 0; } bool ObjectFile::isSectionBitcode(DataRefImpl Sec) const { - if (Expected NameOrErr = getSectionName(Sec)) + Expected NameOrErr = getSectionName(Sec); + if (NameOrErr) return *NameOrErr == ".llvmbc"; + consumeError(NameOrErr.takeError()); return false; } @@ -82,7 +84,8 @@ bool ObjectFile::isBerkeleyData(DataRefImpl Sec) const { return isSectionData(Sec); } -section_iterator ObjectFile::getRelocatedSection(DataRefImpl Sec) const { +Expected +ObjectFile::getRelocatedSection(DataRefImpl Sec) const { return section_iterator(SectionRef(Sec, this)); } @@ -103,7 +106,7 @@ Triple ObjectFile::makeTriple() const { TheTriple.setObjectFormat(Triple::MachO); if (isCOFF()) { - const auto COFFObj = dyn_cast(this); + const auto COFFObj = cast(this); if (COFFObj->getArch() == Triple::thumb) TheTriple.setTriple("thumbv7-windows"); } @@ -127,6 +130,8 @@ ObjectFile::createObjectFile(MemoryBufferRef Object, file_magic Type) { case file_magic::pdb: case file_magic::minidump: return errorCodeToError(object_error::invalid_file_type); + case file_magic::tapi_file: + return errorCodeToError(object_error::invalid_file_type); case file_magic::elf: case file_magic::elf_relocatable: case file_magic::elf_executable: diff --git a/lib/Object/RelocationResolver.cpp b/lib/Object/RelocationResolver.cpp index 0a243f32e12c..ca89f5671b8a 100644 --- a/lib/Object/RelocationResolver.cpp +++ b/lib/Object/RelocationResolver.cpp @@ -30,6 +30,7 @@ static bool supportsX86_64(uint64_t Type) { case ELF::R_X86_64_DTPOFF32: case ELF::R_X86_64_DTPOFF64: case ELF::R_X86_64_PC32: + case ELF::R_X86_64_PC64: case ELF::R_X86_64_32: case ELF::R_X86_64_32S: return true; @@ -47,6 +48,7 @@ static uint64_t resolveX86_64(RelocationRef R, uint64_t S, uint64_t A) { case ELF::R_X86_64_DTPOFF64: return S + getELFAddend(R); case ELF::R_X86_64_PC32: + case ELF::R_X86_64_PC64: return S + getELFAddend(R) - R.getOffset(); case ELF::R_X86_64_32: case ELF::R_X86_64_32S: @@ -90,9 +92,9 @@ static bool supportsBPF(uint64_t Type) { static uint64_t 
resolveBPF(RelocationRef R, uint64_t S, uint64_t A) { switch (R.getType()) { case ELF::R_BPF_64_32: - return S & 0xFFFFFFFF; + return (S + A) & 0xFFFFFFFF; case ELF::R_BPF_64_64: - return S; + return S + A; default: llvm_unreachable("Invalid relocation type"); } @@ -335,6 +337,8 @@ static bool supportsRISCV(uint64_t Type) { case ELF::R_RISCV_NONE: case ELF::R_RISCV_32: case ELF::R_RISCV_64: + case ELF::R_RISCV_SET6: + case ELF::R_RISCV_SUB6: case ELF::R_RISCV_ADD8: case ELF::R_RISCV_SUB8: case ELF::R_RISCV_ADD16: @@ -358,6 +362,10 @@ static uint64_t resolveRISCV(RelocationRef R, uint64_t S, uint64_t A) { return (S + RA) & 0xFFFFFFFF; case ELF::R_RISCV_64: return S + RA; + case ELF::R_RISCV_SET6: + return (A + (S + RA)) & 0xFF; + case ELF::R_RISCV_SUB6: + return (A - (S + RA)) & 0xFF; case ELF::R_RISCV_ADD8: return (A + (S + RA)) & 0xFF; case ELF::R_RISCV_SUB8: @@ -420,6 +428,47 @@ static uint64_t resolveCOFFX86_64(RelocationRef R, uint64_t S, uint64_t A) { } } +static bool supportsCOFFARM(uint64_t Type) { + switch (Type) { + case COFF::IMAGE_REL_ARM_SECREL: + case COFF::IMAGE_REL_ARM_ADDR32: + return true; + default: + return false; + } +} + +static uint64_t resolveCOFFARM(RelocationRef R, uint64_t S, uint64_t A) { + switch (R.getType()) { + case COFF::IMAGE_REL_ARM_SECREL: + case COFF::IMAGE_REL_ARM_ADDR32: + return (S + A) & 0xFFFFFFFF; + default: + llvm_unreachable("Invalid relocation type"); + } +} + +static bool supportsCOFFARM64(uint64_t Type) { + switch (Type) { + case COFF::IMAGE_REL_ARM64_SECREL: + case COFF::IMAGE_REL_ARM64_ADDR64: + return true; + default: + return false; + } +} + +static uint64_t resolveCOFFARM64(RelocationRef R, uint64_t S, uint64_t A) { + switch (R.getType()) { + case COFF::IMAGE_REL_ARM64_SECREL: + return (S + A) & 0xFFFFFFFF; + case COFF::IMAGE_REL_ARM64_ADDR64: + return S + A; + default: + llvm_unreachable("Invalid relocation type"); + } +} + static bool supportsMachOX86_64(uint64_t Type) { return Type == MachO::X86_64_RELOC_UNSIGNED; } @@ -472,9 +521,19 @@ static uint64_t resolveWasm32(RelocationRef R, uint64_t S, uint64_t A) { std::pair getRelocationResolver(const ObjectFile &Obj) { if (Obj.isCOFF()) { - if (Obj.getBytesInAddress() == 8) + switch (Obj.getArch()) { + case Triple::x86_64: return {supportsCOFFX86_64, resolveCOFFX86_64}; - return {supportsCOFFX86, resolveCOFFX86}; + case Triple::x86: + return {supportsCOFFX86, resolveCOFFX86}; + case Triple::arm: + case Triple::thumb: + return {supportsCOFFARM, resolveCOFFARM}; + case Triple::aarch64: + return {supportsCOFFARM64, resolveCOFFARM64}; + default: + return {nullptr, nullptr}; + } } else if (Obj.isELF()) { if (Obj.getBytesInAddress() == 8) { switch (Obj.getArch()) { diff --git a/lib/Object/SymbolicFile.cpp b/lib/Object/SymbolicFile.cpp index 2b152b7d8da3..3db4ad9ed14b 100644 --- a/lib/Object/SymbolicFile.cpp +++ b/lib/Object/SymbolicFile.cpp @@ -53,6 +53,7 @@ SymbolicFile::createSymbolicFile(MemoryBufferRef Object, file_magic Type, case file_magic::windows_resource: case file_magic::pdb: case file_magic::minidump: + case file_magic::tapi_file: return errorCodeToError(object_error::invalid_file_type); case file_magic::elf: case file_magic::elf_executable: diff --git a/lib/Object/TapiFile.cpp b/lib/Object/TapiFile.cpp new file mode 100644 index 000000000000..c409bd8e5995 --- /dev/null +++ b/lib/Object/TapiFile.cpp @@ -0,0 +1,104 @@ +//===- TapiFile.cpp -------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines the Text-based Dynamcic Library Stub format. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Object/TapiFile.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Object/Error.h" +#include "llvm/Support/MemoryBuffer.h" + +using namespace llvm; +using namespace MachO; +using namespace object; + +static constexpr StringLiteral ObjC1ClassNamePrefix = ".objc_class_name_"; +static constexpr StringLiteral ObjC2ClassNamePrefix = "_OBJC_CLASS_$_"; +static constexpr StringLiteral ObjC2MetaClassNamePrefix = "_OBJC_METACLASS_$_"; +static constexpr StringLiteral ObjC2EHTypePrefix = "_OBJC_EHTYPE_$_"; +static constexpr StringLiteral ObjC2IVarPrefix = "_OBJC_IVAR_$_"; + +static uint32_t getFlags(const Symbol *Sym) { + uint32_t Flags = BasicSymbolRef::SF_Global; + if (Sym->isUndefined()) + Flags |= BasicSymbolRef::SF_Undefined; + else + Flags |= BasicSymbolRef::SF_Exported; + + if (Sym->isWeakDefined() || Sym->isWeakReferenced()) + Flags |= BasicSymbolRef::SF_Weak; + + return Flags; +} + +TapiFile::TapiFile(MemoryBufferRef Source, const InterfaceFile &interface, + Architecture Arch) + : SymbolicFile(ID_TapiFile, Source) { + for (const auto *Symbol : interface.symbols()) { + if (!Symbol->getArchitectures().has(Arch)) + continue; + + switch (Symbol->getKind()) { + case SymbolKind::GlobalSymbol: + Symbols.emplace_back(StringRef(), Symbol->getName(), getFlags(Symbol)); + break; + case SymbolKind::ObjectiveCClass: + if (interface.getPlatforms().count(PlatformKind::macOS) && + Arch == AK_i386) { + Symbols.emplace_back(ObjC1ClassNamePrefix, Symbol->getName(), + getFlags(Symbol)); + } else { + Symbols.emplace_back(ObjC2ClassNamePrefix, Symbol->getName(), + getFlags(Symbol)); + Symbols.emplace_back(ObjC2MetaClassNamePrefix, Symbol->getName(), + getFlags(Symbol)); + } + break; + case SymbolKind::ObjectiveCClassEHType: + Symbols.emplace_back(ObjC2EHTypePrefix, Symbol->getName(), + getFlags(Symbol)); + break; + case SymbolKind::ObjectiveCInstanceVariable: + Symbols.emplace_back(ObjC2IVarPrefix, Symbol->getName(), + getFlags(Symbol)); + break; + } + } +} + +TapiFile::~TapiFile() = default; + +void TapiFile::moveSymbolNext(DataRefImpl &DRI) const { + const auto *Sym = reinterpret_cast(DRI.p); + DRI.p = reinterpret_cast(++Sym); +} + +Error TapiFile::printSymbolName(raw_ostream &OS, DataRefImpl DRI) const { + const auto *Sym = reinterpret_cast(DRI.p); + OS << Sym->Prefix << Sym->Name; + return Error::success(); +} + +uint32_t TapiFile::getSymbolFlags(DataRefImpl DRI) const { + const auto *Sym = reinterpret_cast(DRI.p); + return Sym->Flags; +} + +basic_symbol_iterator TapiFile::symbol_begin() const { + DataRefImpl DRI; + DRI.p = reinterpret_cast(&*Symbols.begin()); + return BasicSymbolRef{DRI, this}; +} + +basic_symbol_iterator TapiFile::symbol_end() const { + DataRefImpl DRI; + DRI.p = reinterpret_cast(&*Symbols.end()); + return BasicSymbolRef{DRI, this}; +} diff --git a/lib/Object/TapiUniversal.cpp b/lib/Object/TapiUniversal.cpp new file mode 100644 index 000000000000..b3273e345a61 --- /dev/null +++ b/lib/Object/TapiUniversal.cpp @@ -0,0 +1,54 @@ +//===- TapiUniversal.cpp --------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines the Text-based Dynamic Library Stub format. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Object/TapiUniversal.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Object/Error.h" +#include "llvm/Support/MemoryBuffer.h" +#include "llvm/TextAPI/MachO/TextAPIReader.h" + +using namespace llvm; +using namespace MachO; +using namespace object; + +TapiUniversal::TapiUniversal(MemoryBufferRef Source, Error &Err) + : Binary(ID_TapiUniversal, Source) { + auto Result = TextAPIReader::get(Source); + ErrorAsOutParameter ErrAsOuParam(&Err); + if (!Result) { + Err = Result.takeError(); + return; + } + ParsedFile = std::move(Result.get()); + + auto Archs = ParsedFile->getArchitectures(); + for (auto Arch : Archs) + Architectures.emplace_back(Arch); +} + +TapiUniversal::~TapiUniversal() = default; + +Expected> +TapiUniversal::ObjectForArch::getAsObjectFile() const { + return std::unique_ptr(new TapiFile(Parent->getMemoryBufferRef(), + *Parent->ParsedFile.get(), + Parent->Architectures[Index])); +} + +Expected> +TapiUniversal::create(MemoryBufferRef Source) { + Error Err = Error::success(); + std::unique_ptr Ret(new TapiUniversal(Source, Err)); + if (Err) + return std::move(Err); + return std::move(Ret); +} diff --git a/lib/Object/WasmObjectFile.cpp b/lib/Object/WasmObjectFile.cpp index 82aa1830dced..014b403556df 100644 --- a/lib/Object/WasmObjectFile.cpp +++ b/lib/Object/WasmObjectFile.cpp @@ -56,7 +56,7 @@ LLVM_DUMP_METHOD void WasmSymbol::dump() const { print(dbgs()); } Expected> ObjectFile::createWasmObjectFile(MemoryBufferRef Buffer) { Error Err = Error::success(); - auto ObjectFile = llvm::make_unique(Buffer, Err); + auto ObjectFile = std::make_unique(Buffer, Err); if (Err) return std::move(Err); @@ -781,7 +781,7 @@ Error WasmObjectFile::parseRelocSection(StringRef Name, ReadContext &Ctx) { break; case wasm::R_WASM_GLOBAL_INDEX_LEB: // R_WASM_GLOBAL_INDEX_LEB are can be used against function and data - // symbols to refer to thier GOT enties. + // symbols to refer to their GOT entries. 
if (!isValidGlobalSymbol(Reloc.Index) && !isValidDataSymbol(Reloc.Index) && !isValidFunctionSymbol(Reloc.Index)) @@ -881,12 +881,9 @@ Error WasmObjectFile::parseTypeSection(ReadContext &Ctx) { Sig.Params.push_back(wasm::ValType(ParamType)); } uint32_t ReturnCount = readVaruint32(Ctx); - if (ReturnCount) { - if (ReturnCount != 1) { - return make_error( - "Multiple return types not supported", object_error::parse_failed); - } - Sig.Returns.push_back(wasm::ValType(readUint8(Ctx))); + while (ReturnCount--) { + uint32_t ReturnType = readUint8(Ctx); + Sig.Returns.push_back(wasm::ValType(ReturnType)); } Signatures.push_back(std::move(Sig)); } diff --git a/lib/Object/WindowsResource.cpp b/lib/Object/WindowsResource.cpp index d76e1231684c..10717718b201 100644 --- a/lib/Object/WindowsResource.cpp +++ b/lib/Object/WindowsResource.cpp @@ -30,15 +30,24 @@ namespace object { if (auto EC = X) \ return EC; +#define UNWRAP_REF_OR_RETURN(Name, Expr) \ + auto Name##OrErr = Expr; \ + if (!Name##OrErr) \ + return Name##OrErr.takeError(); \ + const auto &Name = *Name##OrErr; + +#define UNWRAP_OR_RETURN(Name, Expr) \ + auto Name##OrErr = Expr; \ + if (!Name##OrErr) \ + return Name##OrErr.takeError(); \ + auto Name = *Name##OrErr; + const uint32_t MIN_HEADER_SIZE = 7 * sizeof(uint32_t) + 2 * sizeof(uint16_t); // COFF files seem to be inconsistent with alignment between sections, just use // 8-byte because it makes everyone happy. const uint32_t SECTION_ALIGNMENT = sizeof(uint64_t); -uint32_t WindowsResourceParser::TreeNode::StringCount = 0; -uint32_t WindowsResourceParser::TreeNode::DataCount = 0; - WindowsResource::WindowsResource(MemoryBufferRef Source) : Binary(Binary::ID_WinRes, Source) { size_t LeadingSize = WIN_RES_MAGIC_SIZE + WIN_RES_NULL_ENTRY_SIZE; @@ -128,7 +137,8 @@ Error ResourceEntryRef::loadNext() { return Error::success(); } -WindowsResourceParser::WindowsResourceParser() : Root(false) {} +WindowsResourceParser::WindowsResourceParser(bool MinGW) + : Root(false), MinGW(MinGW) {} void printResourceTypeName(uint16_t TypeID, raw_ostream &OS) { switch (TypeID) { @@ -200,6 +210,122 @@ static std::string makeDuplicateResourceError( return OS.str(); } +static void printStringOrID(const WindowsResourceParser::StringOrID &S, + raw_string_ostream &OS, bool IsType, bool IsID) { + if (S.IsString) { + std::string UTF8; + if (!convertUTF16LEToUTF8String(S.String, UTF8)) + UTF8 = "(failed conversion from UTF16)"; + OS << '\"' << UTF8 << '\"'; + } else if (IsType) + printResourceTypeName(S.ID, OS); + else if (IsID) + OS << "ID " << S.ID; + else + OS << S.ID; +} + +static std::string makeDuplicateResourceError( + const std::vector &Context, + StringRef File1, StringRef File2) { + std::string Ret; + raw_string_ostream OS(Ret); + + OS << "duplicate resource:"; + + if (Context.size() >= 1) { + OS << " type "; + printStringOrID(Context[0], OS, /* IsType */ true, /* IsID */ true); + } + + if (Context.size() >= 2) { + OS << "/name "; + printStringOrID(Context[1], OS, /* IsType */ false, /* IsID */ true); + } + + if (Context.size() >= 3) { + OS << "/language "; + printStringOrID(Context[2], OS, /* IsType */ false, /* IsID */ false); + } + OS << ", in " << File1 << " and in " << File2; + + return OS.str(); +} + +// MinGW specific. Remove default manifests (with language zero) if there are +// other manifests present, and report an error if there are more than one +// manifest with a non-zero language code. +// GCC has the concept of a default manifest resource object, which gets +// linked in implicitly if present. 
This default manifest has got language +// id zero, and should be dropped silently if there's another manifest present. +// If the user resources surprisignly had a manifest with language id zero, +// we should also ignore the duplicate default manifest. +void WindowsResourceParser::cleanUpManifests( + std::vector &Duplicates) { + auto TypeIt = Root.IDChildren.find(/* RT_MANIFEST */ 24); + if (TypeIt == Root.IDChildren.end()) + return; + + TreeNode *TypeNode = TypeIt->second.get(); + auto NameIt = + TypeNode->IDChildren.find(/* CREATEPROCESS_MANIFEST_RESOURCE_ID */ 1); + if (NameIt == TypeNode->IDChildren.end()) + return; + + TreeNode *NameNode = NameIt->second.get(); + if (NameNode->IDChildren.size() <= 1) + return; // None or one manifest present, all good. + + // If we have more than one manifest, drop the language zero one if present, + // and check again. + auto LangZeroIt = NameNode->IDChildren.find(0); + if (LangZeroIt != NameNode->IDChildren.end() && + LangZeroIt->second->IsDataNode) { + uint32_t RemovedIndex = LangZeroIt->second->DataIndex; + NameNode->IDChildren.erase(LangZeroIt); + Data.erase(Data.begin() + RemovedIndex); + Root.shiftDataIndexDown(RemovedIndex); + + // If we're now down to one manifest, all is good. + if (NameNode->IDChildren.size() <= 1) + return; + } + + // More than one non-language-zero manifest + auto FirstIt = NameNode->IDChildren.begin(); + uint32_t FirstLang = FirstIt->first; + TreeNode *FirstNode = FirstIt->second.get(); + auto LastIt = NameNode->IDChildren.rbegin(); + uint32_t LastLang = LastIt->first; + TreeNode *LastNode = LastIt->second.get(); + Duplicates.push_back( + ("duplicate non-default manifests with languages " + Twine(FirstLang) + + " in " + InputFilenames[FirstNode->Origin] + " and " + Twine(LastLang) + + " in " + InputFilenames[LastNode->Origin]) + .str()); +} + +// Ignore duplicates of manifests with language zero (the default manifest), +// in case the user has provided a manifest with that language id. See +// the function comment above for context. Only returns true if MinGW is set +// to true. 
+bool WindowsResourceParser::shouldIgnoreDuplicate( + const ResourceEntryRef &Entry) const { + return MinGW && !Entry.checkTypeString() && + Entry.getTypeID() == /* RT_MANIFEST */ 24 && + !Entry.checkNameString() && + Entry.getNameID() == /* CREATEPROCESS_MANIFEST_RESOURCE_ID */ 1 && + Entry.getLanguage() == 0; +} + +bool WindowsResourceParser::shouldIgnoreDuplicate( + const std::vector &Context) const { + return MinGW && Context.size() == 3 && !Context[0].IsString && + Context[0].ID == /* RT_MANIFEST */ 24 && !Context[1].IsString && + Context[1].ID == /* CREATEPROCESS_MANIFEST_RESOURCE_ID */ 1 && + !Context[2].IsString && Context[2].ID == 0; +} + Error WindowsResourceParser::parse(WindowsResource *WR, std::vector &Duplicates) { auto EntryOrErr = WR->getHeadEntry(); @@ -219,112 +345,176 @@ Error WindowsResourceParser::parse(WindowsResource *WR, } ResourceEntryRef Entry = EntryOrErr.get(); + uint32_t Origin = InputFilenames.size(); + InputFilenames.push_back(WR->getFileName()); bool End = false; while (!End) { - Data.push_back(Entry.getData()); - bool IsNewTypeString = false; - bool IsNewNameString = false; - - TreeNode* Node; - bool IsNewNode = Root.addEntry(Entry, InputFilenames.size(), - IsNewTypeString, IsNewNameString, Node); - InputFilenames.push_back(WR->getFileName()); + TreeNode *Node; + bool IsNewNode = Root.addEntry(Entry, Origin, Data, StringTable, Node); if (!IsNewNode) { - Duplicates.push_back(makeDuplicateResourceError( - Entry, InputFilenames[Node->Origin], WR->getFileName())); + if (!shouldIgnoreDuplicate(Entry)) + Duplicates.push_back(makeDuplicateResourceError( + Entry, InputFilenames[Node->Origin], WR->getFileName())); } - if (IsNewTypeString) - StringTable.push_back(Entry.getTypeString()); - - if (IsNewNameString) - StringTable.push_back(Entry.getNameString()); - RETURN_IF_ERROR(Entry.moveNext(End)); } return Error::success(); } +Error WindowsResourceParser::parse(ResourceSectionRef &RSR, StringRef Filename, + std::vector &Duplicates) { + UNWRAP_REF_OR_RETURN(BaseTable, RSR.getBaseTable()); + uint32_t Origin = InputFilenames.size(); + InputFilenames.push_back(Filename); + std::vector Context; + return addChildren(Root, RSR, BaseTable, Origin, Context, Duplicates); +} + void WindowsResourceParser::printTree(raw_ostream &OS) const { ScopedPrinter Writer(OS); Root.print(Writer, "Resource Tree"); } -bool WindowsResourceParser::TreeNode::addEntry(const ResourceEntryRef &Entry, - uint32_t Origin, - bool &IsNewTypeString, - bool &IsNewNameString, - TreeNode *&Result) { - TreeNode &TypeNode = addTypeNode(Entry, IsNewTypeString); - TreeNode &NameNode = TypeNode.addNameNode(Entry, IsNewNameString); - return NameNode.addLanguageNode(Entry, Origin, Result); +bool WindowsResourceParser::TreeNode::addEntry( + const ResourceEntryRef &Entry, uint32_t Origin, + std::vector> &Data, + std::vector> &StringTable, TreeNode *&Result) { + TreeNode &TypeNode = addTypeNode(Entry, StringTable); + TreeNode &NameNode = TypeNode.addNameNode(Entry, StringTable); + return NameNode.addLanguageNode(Entry, Origin, Data, Result); } -WindowsResourceParser::TreeNode::TreeNode(bool IsStringNode) { - if (IsStringNode) - StringIndex = StringCount++; +Error WindowsResourceParser::addChildren(TreeNode &Node, + ResourceSectionRef &RSR, + const coff_resource_dir_table &Table, + uint32_t Origin, + std::vector &Context, + std::vector &Duplicates) { + + for (int i = 0; i < Table.NumberOfNameEntries + Table.NumberOfIDEntries; + i++) { + UNWRAP_REF_OR_RETURN(Entry, RSR.getTableEntry(Table, i)); + TreeNode *Child; + + 
if (Entry.Offset.isSubDir()) { + + // Create a new subdirectory and recurse + if (i < Table.NumberOfNameEntries) { + UNWRAP_OR_RETURN(NameString, RSR.getEntryNameString(Entry)); + Child = &Node.addNameChild(NameString, StringTable); + Context.push_back(StringOrID(NameString)); + } else { + Child = &Node.addIDChild(Entry.Identifier.ID); + Context.push_back(StringOrID(Entry.Identifier.ID)); + } + + UNWRAP_REF_OR_RETURN(NextTable, RSR.getEntrySubDir(Entry)); + Error E = + addChildren(*Child, RSR, NextTable, Origin, Context, Duplicates); + if (E) + return E; + Context.pop_back(); + + } else { + + // Data leaves are supposed to have a numeric ID as identifier (language). + if (Table.NumberOfNameEntries > 0) + return createStringError(object_error::parse_failed, + "unexpected string key for data object"); + + // Try adding a data leaf + UNWRAP_REF_OR_RETURN(DataEntry, RSR.getEntryData(Entry)); + TreeNode *Child; + Context.push_back(StringOrID(Entry.Identifier.ID)); + bool Added = Node.addDataChild(Entry.Identifier.ID, Table.MajorVersion, + Table.MinorVersion, Table.Characteristics, + Origin, Data.size(), Child); + if (Added) { + UNWRAP_OR_RETURN(Contents, RSR.getContents(DataEntry)); + Data.push_back(ArrayRef( + reinterpret_cast(Contents.data()), + Contents.size())); + } else { + if (!shouldIgnoreDuplicate(Context)) + Duplicates.push_back(makeDuplicateResourceError( + Context, InputFilenames[Child->Origin], InputFilenames.back())); + } + Context.pop_back(); + + } + } + return Error::success(); } +WindowsResourceParser::TreeNode::TreeNode(uint32_t StringIndex) + : StringIndex(StringIndex) {} + WindowsResourceParser::TreeNode::TreeNode(uint16_t MajorVersion, uint16_t MinorVersion, uint32_t Characteristics, - uint32_t Origin) - : IsDataNode(true), MajorVersion(MajorVersion), MinorVersion(MinorVersion), - Characteristics(Characteristics), Origin(Origin) { - DataIndex = DataCount++; -} + uint32_t Origin, uint32_t DataIndex) + : IsDataNode(true), DataIndex(DataIndex), MajorVersion(MajorVersion), + MinorVersion(MinorVersion), Characteristics(Characteristics), + Origin(Origin) {} std::unique_ptr -WindowsResourceParser::TreeNode::createStringNode() { - return std::unique_ptr(new TreeNode(true)); +WindowsResourceParser::TreeNode::createStringNode(uint32_t Index) { + return std::unique_ptr(new TreeNode(Index)); } std::unique_ptr WindowsResourceParser::TreeNode::createIDNode() { - return std::unique_ptr(new TreeNode(false)); + return std::unique_ptr(new TreeNode(0)); } std::unique_ptr WindowsResourceParser::TreeNode::createDataNode(uint16_t MajorVersion, uint16_t MinorVersion, uint32_t Characteristics, - uint32_t Origin) { - return std::unique_ptr( - new TreeNode(MajorVersion, MinorVersion, Characteristics, Origin)); + uint32_t Origin, + uint32_t DataIndex) { + return std::unique_ptr(new TreeNode( + MajorVersion, MinorVersion, Characteristics, Origin, DataIndex)); } -WindowsResourceParser::TreeNode & -WindowsResourceParser::TreeNode::addTypeNode(const ResourceEntryRef &Entry, - bool &IsNewTypeString) { +WindowsResourceParser::TreeNode &WindowsResourceParser::TreeNode::addTypeNode( + const ResourceEntryRef &Entry, + std::vector> &StringTable) { if (Entry.checkTypeString()) - return addNameChild(Entry.getTypeString(), IsNewTypeString); + return addNameChild(Entry.getTypeString(), StringTable); else return addIDChild(Entry.getTypeID()); } -WindowsResourceParser::TreeNode & -WindowsResourceParser::TreeNode::addNameNode(const ResourceEntryRef &Entry, - bool &IsNewNameString) { +WindowsResourceParser::TreeNode 
&WindowsResourceParser::TreeNode::addNameNode( + const ResourceEntryRef &Entry, + std::vector> &StringTable) { if (Entry.checkNameString()) - return addNameChild(Entry.getNameString(), IsNewNameString); + return addNameChild(Entry.getNameString(), StringTable); else return addIDChild(Entry.getNameID()); } bool WindowsResourceParser::TreeNode::addLanguageNode( - const ResourceEntryRef &Entry, uint32_t Origin, TreeNode *&Result) { - return addDataChild(Entry.getLanguage(), Entry.getMajorVersion(), - Entry.getMinorVersion(), Entry.getCharacteristics(), - Origin, Result); + const ResourceEntryRef &Entry, uint32_t Origin, + std::vector> &Data, TreeNode *&Result) { + bool Added = addDataChild(Entry.getLanguage(), Entry.getMajorVersion(), + Entry.getMinorVersion(), Entry.getCharacteristics(), + Origin, Data.size(), Result); + if (Added) + Data.push_back(Entry.getData()); + return Added; } bool WindowsResourceParser::TreeNode::addDataChild( uint32_t ID, uint16_t MajorVersion, uint16_t MinorVersion, - uint32_t Characteristics, uint32_t Origin, TreeNode *&Result) { - auto NewChild = - createDataNode(MajorVersion, MinorVersion, Characteristics, Origin); + uint32_t Characteristics, uint32_t Origin, uint32_t DataIndex, + TreeNode *&Result) { + auto NewChild = createDataNode(MajorVersion, MinorVersion, Characteristics, + Origin, DataIndex); auto ElementInserted = IDChildren.emplace(ID, std::move(NewChild)); Result = ElementInserted.first->second.get(); return ElementInserted.second; @@ -342,16 +532,15 @@ WindowsResourceParser::TreeNode &WindowsResourceParser::TreeNode::addIDChild( return *(Child->second); } -WindowsResourceParser::TreeNode & -WindowsResourceParser::TreeNode::addNameChild(ArrayRef NameRef, - bool &IsNewString) { +WindowsResourceParser::TreeNode &WindowsResourceParser::TreeNode::addNameChild( + ArrayRef NameRef, std::vector> &StringTable) { std::string NameString; convertUTF16LEToUTF8String(NameRef, NameString); auto Child = StringChildren.find(NameString); if (Child == StringChildren.end()) { - auto NewChild = createStringNode(); - IsNewString = true; + auto NewChild = createStringNode(StringTable.size()); + StringTable.push_back(NameRef); WindowsResourceParser::TreeNode &Node = *NewChild; StringChildren.emplace(NameString, std::move(NewChild)); return Node; @@ -396,6 +585,19 @@ uint32_t WindowsResourceParser::TreeNode::getTreeSize() const { return Size; } +// Shift DataIndex of all data children with an Index greater or equal to the +// given one, to fill a gap from removing an entry from the Data vector. +void WindowsResourceParser::TreeNode::shiftDataIndexDown(uint32_t Index) { + if (IsDataNode && DataIndex >= Index) { + DataIndex--; + } else { + for (auto &Child : IDChildren) + Child.second->shiftDataIndexDown(Index); + for (auto &Child : StringChildren) + Child.second->shiftDataIndexDown(Index); + } +} + class WindowsResourceCOFFWriter { public: WindowsResourceCOFFWriter(COFF::MachineTypes MachineType, @@ -515,6 +717,14 @@ WindowsResourceCOFFWriter::write(uint32_t TimeDateStamp) { return std::move(OutputBuffer); } +// According to COFF specification, if the Src has a size equal to Dest, +// it's okay to *not* copy the trailing zero. +static void coffnamecpy(char (&Dest)[COFF::NameSize], StringRef Src) { + assert(Src.size() <= COFF::NameSize && + "Src is not larger than COFF::NameSize"); + strncpy(Dest, Src.data(), (size_t)COFF::NameSize); +} + void WindowsResourceCOFFWriter::writeCOFFHeader(uint32_t TimeDateStamp) { // Write the COFF header. 
auto *Header = reinterpret_cast(BufferStart); @@ -534,7 +744,7 @@ void WindowsResourceCOFFWriter::writeFirstSectionHeader() { CurrentOffset += sizeof(coff_file_header); auto *SectionOneHeader = reinterpret_cast(BufferStart + CurrentOffset); - strncpy(SectionOneHeader->Name, ".rsrc$01", (size_t)COFF::NameSize); + coffnamecpy(SectionOneHeader->Name, ".rsrc$01"); SectionOneHeader->VirtualSize = 0; SectionOneHeader->VirtualAddress = 0; SectionOneHeader->SizeOfRawData = SectionOneSize; @@ -552,7 +762,7 @@ void WindowsResourceCOFFWriter::writeSecondSectionHeader() { CurrentOffset += sizeof(coff_section); auto *SectionTwoHeader = reinterpret_cast(BufferStart + CurrentOffset); - strncpy(SectionTwoHeader->Name, ".rsrc$02", (size_t)COFF::NameSize); + coffnamecpy(SectionTwoHeader->Name, ".rsrc$02"); SectionTwoHeader->VirtualSize = 0; SectionTwoHeader->VirtualAddress = 0; SectionTwoHeader->SizeOfRawData = SectionTwoSize; @@ -590,7 +800,7 @@ void WindowsResourceCOFFWriter::writeSymbolTable() { // Now write the symbol table. // First, the feat symbol. auto *Symbol = reinterpret_cast(BufferStart + CurrentOffset); - strncpy(Symbol->Name.ShortName, "@feat.00", (size_t)COFF::NameSize); + coffnamecpy(Symbol->Name.ShortName, "@feat.00"); Symbol->Value = 0x11; Symbol->SectionNumber = 0xffff; Symbol->Type = COFF::IMAGE_SYM_DTYPE_NULL; @@ -600,7 +810,7 @@ void WindowsResourceCOFFWriter::writeSymbolTable() { // Now write the .rsrc1 symbol + aux. Symbol = reinterpret_cast(BufferStart + CurrentOffset); - strncpy(Symbol->Name.ShortName, ".rsrc$01", (size_t)COFF::NameSize); + coffnamecpy(Symbol->Name.ShortName, ".rsrc$01"); Symbol->Value = 0; Symbol->SectionNumber = 1; Symbol->Type = COFF::IMAGE_SYM_DTYPE_NULL; @@ -619,7 +829,7 @@ void WindowsResourceCOFFWriter::writeSymbolTable() { // Now write the .rsrc2 symbol + aux. Symbol = reinterpret_cast(BufferStart + CurrentOffset); - strncpy(Symbol->Name.ShortName, ".rsrc$02", (size_t)COFF::NameSize); + coffnamecpy(Symbol->Name.ShortName, ".rsrc$02"); Symbol->Value = 0; Symbol->SectionNumber = 2; Symbol->Type = COFF::IMAGE_SYM_DTYPE_NULL; @@ -640,7 +850,7 @@ void WindowsResourceCOFFWriter::writeSymbolTable() { for (unsigned i = 0; i < Data.size(); i++) { auto RelocationName = formatv("$R{0:X-6}", i & 0xffffff).sstr(); Symbol = reinterpret_cast(BufferStart + CurrentOffset); - memcpy(Symbol->Name.ShortName, RelocationName.data(), (size_t) COFF::NameSize); + coffnamecpy(Symbol->Name.ShortName, RelocationName); Symbol->Value = DataOffsets[i]; Symbol->SectionNumber = 2; Symbol->Type = COFF::IMAGE_SYM_DTYPE_NULL; diff --git a/lib/Object/XCOFFObjectFile.cpp b/lib/Object/XCOFFObjectFile.cpp index 602b7357986a..98782c2701c1 100644 --- a/lib/Object/XCOFFObjectFile.cpp +++ b/lib/Object/XCOFFObjectFile.cpp @@ -11,17 +11,14 @@ //===----------------------------------------------------------------------===// #include "llvm/Object/XCOFFObjectFile.h" -#include "llvm/ADT/ArrayRef.h" -#include "llvm/Support/BinaryStreamReader.h" -#include "llvm/Support/Endian.h" -#include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/MathExtras.h" #include #include namespace llvm { namespace object { +enum { FUNCTION_SYM = 0x20, SYM_TYPE_MASK = 0x07, RELOC_OVERFLOW = 65535 }; + // Checks that [Ptr, Ptr + Size) bytes fall inside the memory buffer // 'M'. Returns a pointer to the underlying object on success. 
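As a side note on the containment check described in the comment above: it reduces to pointer arithmetic with an overflow guard, comparing the requested size against the space remaining in the buffer rather than computing an end pointer that could wrap. A rough standalone sketch with simplified error handling (the real getObject returns an llvm::Expected; the helper name here is hypothetical):

#include <cstddef>
#include <cstdint>

// Returns true when [Ptr, Ptr + Size) lies entirely inside the buffer
// [BufStart, BufStart + BufSize).
static bool rangeInsideBuffer(uintptr_t BufStart, size_t BufSize,
                              uintptr_t Ptr, size_t Size) {
  if (Ptr < BufStart)
    return false;
  uintptr_t OffsetInBuf = Ptr - BufStart;
  if (OffsetInBuf > BufSize)
    return false;
  // Compare against the remaining space instead of Ptr + Size, so an
  // oversized Size cannot overflow and wrap around.
  return Size <= BufSize - OffsetInBuf;
}

int main() {
  unsigned char Buf[64] = {};
  uintptr_t Base = reinterpret_cast<uintptr_t>(Buf);
  bool Ok = rangeInsideBuffer(Base, sizeof(Buf), Base + 16, 32) && // fits
            !rangeInsideBuffer(Base, sizeof(Buf), Base + 60, 8);   // spills over
  return Ok ? 0 : 1;
}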
template @@ -42,10 +39,25 @@ template static const T *viewAs(uintptr_t in) { return reinterpret_cast(in); } -static StringRef generateStringRef(const char *Name, uint64_t Size) { - auto NulCharPtr = static_cast(memchr(Name, '\0', Size)); +static StringRef generateXCOFFFixedNameStringRef(const char *Name) { + auto NulCharPtr = + static_cast(memchr(Name, '\0', XCOFF::NameSize)); return NulCharPtr ? StringRef(Name, NulCharPtr - Name) - : StringRef(Name, Size); + : StringRef(Name, XCOFF::NameSize); +} + +bool XCOFFRelocation32::isRelocationSigned() const { + return Info & XR_SIGN_INDICATOR_MASK; +} + +bool XCOFFRelocation32::isFixupIndicated() const { + return Info & XR_FIXUP_INDICATOR_MASK; +} + +uint8_t XCOFFRelocation32::getRelocatedLength() const { + // The relocation encodes the bit length being relocated minus 1. Add back + // the 1 to get the actual length being relocated. + return (Info & XR_BIASED_LENGTH_MASK) + 1; } void XCOFFObjectFile::checkSectionAddress(uintptr_t Addr, @@ -83,6 +95,9 @@ XCOFFObjectFile::toSection64(DataRefImpl Ref) const { const XCOFFSymbolEntry *XCOFFObjectFile::toSymbolEntry(DataRefImpl Ref) const { assert(!is64Bit() && "Symbol table support not implemented for 64-bit."); assert(Ref.p != 0 && "Symbol table pointer can not be nullptr!"); +#ifndef NDEBUG + checkSymbolEntryPointer(Ref.p); +#endif auto SymEntPtr = viewAs(Ref.p); return SymEntPtr; } @@ -112,23 +127,19 @@ XCOFFObjectFile::sectionHeaderTable64() const { void XCOFFObjectFile::moveSymbolNext(DataRefImpl &Symb) const { const XCOFFSymbolEntry *SymEntPtr = toSymbolEntry(Symb); SymEntPtr += SymEntPtr->NumberOfAuxEntries + 1; +#ifndef NDEBUG + // This function is used by basic_symbol_iterator, which allows to + // point to the end-of-symbol-table address. + if (reinterpret_cast(SymEntPtr) != getEndOfSymbolTableAddress()) + checkSymbolEntryPointer(reinterpret_cast(SymEntPtr)); +#endif Symb.p = reinterpret_cast(SymEntPtr); } -Expected XCOFFObjectFile::getSymbolName(DataRefImpl Symb) const { - const XCOFFSymbolEntry *SymEntPtr = toSymbolEntry(Symb); - - if (SymEntPtr->NameInStrTbl.Magic != XCOFFSymbolEntry::NAME_IN_STR_TBL_MAGIC) - return generateStringRef(SymEntPtr->SymbolName, XCOFF::SymbolNameSize); - - // A storage class value with the high-order bit on indicates that the name is - // a symbolic debugger stabstring. - if (SymEntPtr->StorageClass & 0x80) - return StringRef("Unimplemented Debug Name"); - - uint32_t Offset = SymEntPtr->NameInStrTbl.Offset; - // The byte offset is relative to the start of the string table - // or .debug section. A byte offset value of 0 is a null or zero-length symbol +Expected +XCOFFObjectFile::getStringTableEntry(uint32_t Offset) const { + // The byte offset is relative to the start of the string table. + // A byte offset value of 0 is a null or zero-length symbol // name. A byte offset in the range 1 to 3 (inclusive) points into the length // field; as a soft-error recovery mechanism, we treat such cases as having an // offset of 0. 
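To make the offset rules in the preceding comment concrete: the XCOFF string table starts with a 4-byte length field, so genuine string data can only begin at offset 4 or later, offset 0 means an empty name, and offsets 1 through 3 fall inside the length field. A small illustrative resolver, simplified relative to the real getStringTableEntry (which works on the parsed in-memory table and reports a parse error for an out-of-range offset):

#include <cstdint>
#include <cstring>
#include <string>

// Resolve a name offset against an XCOFF string table whose first four
// bytes hold the table length. Offsets 0..3 are treated here as "no name".
static std::string resolveName(const char *StrTab, uint32_t StrTabSize,
                               uint32_t Offset) {
  if (Offset < 4 || Offset >= StrTabSize)
    return std::string();
  // Entries are NUL-terminated; clamp to the table size just in case.
  const char *P = StrTab + Offset;
  const void *Nul = std::memchr(P, '\0', StrTabSize - Offset);
  size_t Len = Nul ? static_cast<const char *>(Nul) - P : StrTabSize - Offset;
  return std::string(P, Len);
}

int main() {
  // 4-byte length field (not consulted by this sketch), then "foo\0".
  const char Table[] = {0, 0, 0, 8, 'f', 'o', 'o', '\0'};
  bool Ok = resolveName(Table, sizeof(Table), 4) == "foo" &&
            resolveName(Table, sizeof(Table), 2).empty();
  return Ok ? 0 : 1;
}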
@@ -138,10 +149,32 @@ Expected XCOFFObjectFile::getSymbolName(DataRefImpl Symb) const { if (StringTable.Data != nullptr && StringTable.Size > Offset) return (StringTable.Data + Offset); - return make_error("Symbol Name parse failed", + return make_error("Bad offset for string table entry", object_error::parse_failed); } +Expected +XCOFFObjectFile::getCFileName(const XCOFFFileAuxEnt *CFileEntPtr) const { + if (CFileEntPtr->NameInStrTbl.Magic != + XCOFFSymbolEntry::NAME_IN_STR_TBL_MAGIC) + return generateXCOFFFixedNameStringRef(CFileEntPtr->Name); + return getStringTableEntry(CFileEntPtr->NameInStrTbl.Offset); +} + +Expected XCOFFObjectFile::getSymbolName(DataRefImpl Symb) const { + const XCOFFSymbolEntry *SymEntPtr = toSymbolEntry(Symb); + + // A storage class value with the high-order bit on indicates that the name is + // a symbolic debugger stabstring. + if (SymEntPtr->StorageClass & 0x80) + return StringRef("Unimplemented Debug Name"); + + if (SymEntPtr->NameInStrTbl.Magic != XCOFFSymbolEntry::NAME_IN_STR_TBL_MAGIC) + return generateXCOFFFixedNameStringRef(SymEntPtr->SymbolName); + + return getStringTableEntry(SymEntPtr->NameInStrTbl.Offset); +} + Expected XCOFFObjectFile::getSymbolAddress(DataRefImpl Symb) const { uint64_t Result = 0; llvm_unreachable("Not yet implemented!"); @@ -149,6 +182,7 @@ Expected XCOFFObjectFile::getSymbolAddress(DataRefImpl Symb) const { } uint64_t XCOFFObjectFile::getSymbolValueImpl(DataRefImpl Symb) const { + assert(!is64Bit() && "Symbol table support not implemented for 64-bit."); return toSymbolEntry(Symb)->Value; } @@ -185,7 +219,7 @@ void XCOFFObjectFile::moveSectionNext(DataRefImpl &Sec) const { } Expected XCOFFObjectFile::getSectionName(DataRefImpl Sec) const { - return generateStringRef(getSectionNameInternal(Sec), XCOFF::SectionNameSize); + return generateXCOFFFixedNameStringRef(getSectionNameInternal(Sec)); } uint64_t XCOFFObjectFile::getSectionAddress(DataRefImpl Sec) const { @@ -393,8 +427,8 @@ XCOFFObjectFile::getSymbolSectionName(const XCOFFSymbolEntry *SymEntPtr) const { default: Expected SecRef = getSectionByNum(SectionNum); if (SecRef) - return generateStringRef(getSectionNameInternal(SecRef.get()), - XCOFF::SectionNameSize); + return generateXCOFFFixedNameStringRef( + getSectionNameInternal(SecRef.get())); return SecRef.takeError(); } } @@ -442,6 +476,48 @@ uint32_t XCOFFObjectFile::getNumberOfSymbolTableEntries64() const { return fileHeader64()->NumberOfSymTableEntries; } +uintptr_t XCOFFObjectFile::getEndOfSymbolTableAddress() const { + uint32_t NumberOfSymTableEntries = + is64Bit() ? 
getNumberOfSymbolTableEntries64() + : getLogicalNumberOfSymbolTableEntries32(); + return getWithOffset(reinterpret_cast(SymbolTblPtr), + XCOFF::SymbolTableEntrySize * NumberOfSymTableEntries); +} + +void XCOFFObjectFile::checkSymbolEntryPointer(uintptr_t SymbolEntPtr) const { + if (SymbolEntPtr < reinterpret_cast(SymbolTblPtr)) + report_fatal_error("Symbol table entry is outside of symbol table."); + + if (SymbolEntPtr >= getEndOfSymbolTableAddress()) + report_fatal_error("Symbol table entry is outside of symbol table."); + + ptrdiff_t Offset = reinterpret_cast(SymbolEntPtr) - + reinterpret_cast(SymbolTblPtr); + + if (Offset % XCOFF::SymbolTableEntrySize != 0) + report_fatal_error( + "Symbol table entry position is not valid inside of symbol table."); +} + +uint32_t XCOFFObjectFile::getSymbolIndex(uintptr_t SymbolEntPtr) const { + return (reinterpret_cast(SymbolEntPtr) - + reinterpret_cast(SymbolTblPtr)) / + XCOFF::SymbolTableEntrySize; +} + +Expected +XCOFFObjectFile::getSymbolNameByIndex(uint32_t Index) const { + if (is64Bit()) + report_fatal_error("64-bit symbol table support not implemented yet."); + + if (Index >= getLogicalNumberOfSymbolTableEntries32()) + return errorCodeToError(object_error::invalid_symbol_index); + + DataRefImpl SymDRI; + SymDRI.p = reinterpret_cast(getPointerToSymbolTable() + Index); + return getSymbolName(SymDRI); +} + uint16_t XCOFFObjectFile::getFlags() const { return is64Bit() ? fileHeader64()->Flags : fileHeader32()->Flags; } @@ -477,6 +553,46 @@ ArrayRef XCOFFObjectFile::sections32() const { TablePtr + getNumberOfSections()); } +// In an XCOFF32 file, when the field value is 65535, then an STYP_OVRFLO +// section header contains the actual count of relocation entries in the s_paddr +// field. STYP_OVRFLO headers contain the section index of their corresponding +// sections as their raw "NumberOfRelocations" field value. +Expected XCOFFObjectFile::getLogicalNumberOfRelocationEntries( + const XCOFFSectionHeader32 &Sec) const { + + uint16_t SectionIndex = &Sec - sectionHeaderTable32() + 1; + + if (Sec.NumberOfRelocations < RELOC_OVERFLOW) + return Sec.NumberOfRelocations; + for (const auto &Sec : sections32()) { + if (Sec.Flags == XCOFF::STYP_OVRFLO && + Sec.NumberOfRelocations == SectionIndex) + return Sec.PhysicalAddress; + } + return errorCodeToError(object_error::parse_failed); +} + +Expected> +XCOFFObjectFile::relocations(const XCOFFSectionHeader32 &Sec) const { + uintptr_t RelocAddr = getWithOffset(reinterpret_cast(FileHeader), + Sec.FileOffsetToRelocationInfo); + auto NumRelocEntriesOrErr = getLogicalNumberOfRelocationEntries(Sec); + if (Error E = NumRelocEntriesOrErr.takeError()) + return std::move(E); + + uint32_t NumRelocEntries = NumRelocEntriesOrErr.get(); + + auto RelocationOrErr = + getObject(Data, reinterpret_cast(RelocAddr), + NumRelocEntries * sizeof(XCOFFRelocation32)); + if (Error E = RelocationOrErr.takeError()) + return std::move(E); + + const XCOFFRelocation32 *StartReloc = RelocationOrErr.get(); + + return ArrayRef(StartReloc, StartReloc + NumRelocEntries); +} + Expected XCOFFObjectFile::parseStringTable(const XCOFFObjectFile *Obj, uint64_t Offset) { // If there is a string table, then the buffer must contain at least 4 bytes @@ -507,7 +623,7 @@ XCOFFObjectFile::parseStringTable(const XCOFFObjectFile *Obj, uint64_t Offset) { Expected> XCOFFObjectFile::create(unsigned Type, MemoryBufferRef MBR) { - // Can't use make_unique because of the private constructor. + // Can't use std::make_unique because of the private constructor. 
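The comment above touches a general C++ point that is easy to trip over: std::make_unique has to invoke the constructor itself, so a private constructor is out of reach even when make_unique is called from inside a member function, hence the reset(new ...) pattern. A minimal illustration with a made-up class, not taken from the patch:

#include <memory>

class Widget {
  explicit Widget(int V) : Value(V) {} // private constructor
  int Value;

public:
  static std::unique_ptr<Widget> create(int V) {
    // std::make_unique<Widget>(V) does not compile: make_unique itself has
    // to call the private constructor, and it is neither a member nor a
    // friend. Calling new directly inside a member function is fine.
    return std::unique_ptr<Widget>(new Widget(V));
  }
  int value() const { return Value; }
};

int main() { return Widget::create(7)->value() == 7 ? 0 : 1; }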
std::unique_ptr Obj; Obj.reset(new XCOFFObjectFile(Type, MBR)); @@ -573,11 +689,77 @@ ObjectFile::createXCOFFObjectFile(MemoryBufferRef MemBufRef, } StringRef XCOFFSectionHeader32::getName() const { - return generateStringRef(Name, XCOFF::SectionNameSize); + return generateXCOFFFixedNameStringRef(Name); } StringRef XCOFFSectionHeader64::getName() const { - return generateStringRef(Name, XCOFF::SectionNameSize); + return generateXCOFFFixedNameStringRef(Name); +} + +XCOFF::StorageClass XCOFFSymbolRef::getStorageClass() const { + return OwningObjectPtr->toSymbolEntry(SymEntDataRef)->StorageClass; +} + +uint8_t XCOFFSymbolRef::getNumberOfAuxEntries() const { + return OwningObjectPtr->toSymbolEntry(SymEntDataRef)->NumberOfAuxEntries; +} + +const XCOFFCsectAuxEnt32 *XCOFFSymbolRef::getXCOFFCsectAuxEnt32() const { + assert(!OwningObjectPtr->is64Bit() && + "32-bit interface called on 64-bit object file."); + assert(hasCsectAuxEnt() && "No Csect Auxiliary Entry is found."); + + // In XCOFF32, the csect auxilliary entry is always the last auxiliary + // entry for the symbol. + uintptr_t AuxAddr = getWithOffset( + SymEntDataRef.p, XCOFF::SymbolTableEntrySize * getNumberOfAuxEntries()); + +#ifndef NDEBUG + OwningObjectPtr->checkSymbolEntryPointer(AuxAddr); +#endif + + return reinterpret_cast(AuxAddr); +} + +uint16_t XCOFFSymbolRef::getType() const { + return OwningObjectPtr->toSymbolEntry(SymEntDataRef)->SymbolType; +} + +int16_t XCOFFSymbolRef::getSectionNumber() const { + return OwningObjectPtr->toSymbolEntry(SymEntDataRef)->SectionNumber; +} + +bool XCOFFSymbolRef::hasCsectAuxEnt() const { + XCOFF::StorageClass SC = getStorageClass(); + return (SC == XCOFF::C_EXT || SC == XCOFF::C_WEAKEXT || + SC == XCOFF::C_HIDEXT); +} + +bool XCOFFSymbolRef::isFunction() const { + if (OwningObjectPtr->is64Bit()) + report_fatal_error("64-bit support is unimplemented yet."); + + if (getType() & FUNCTION_SYM) + return true; + + if (!hasCsectAuxEnt()) + return false; + + const XCOFFCsectAuxEnt32 *CsectAuxEnt = getXCOFFCsectAuxEnt32(); + + // A function definition should be a label definition. + if ((CsectAuxEnt->SymbolAlignmentAndType & SYM_TYPE_MASK) != XCOFF::XTY_LD) + return false; + + if (CsectAuxEnt->StorageMappingClass != XCOFF::XMC_PR) + return false; + + int16_t SectNum = getSectionNumber(); + Expected SI = OwningObjectPtr->getSectionByNum(SectNum); + if (!SI) + return false; + + return (OwningObjectPtr->getSectionFlags(SI.get()) & XCOFF::STYP_TEXT); } } // namespace object diff --git a/lib/ObjectYAML/COFFEmitter.cpp b/lib/ObjectYAML/COFFEmitter.cpp new file mode 100644 index 000000000000..efcdc51e1670 --- /dev/null +++ b/lib/ObjectYAML/COFFEmitter.cpp @@ -0,0 +1,622 @@ +//===- yaml2coff - Convert YAML to a COFF object file ---------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// The COFF component of yaml2obj. 
+/// +//===----------------------------------------------------------------------===// + +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/StringMap.h" +#include "llvm/ADT/StringSwitch.h" +#include "llvm/DebugInfo/CodeView/DebugStringTableSubsection.h" +#include "llvm/DebugInfo/CodeView/StringsAndChecksums.h" +#include "llvm/Object/COFF.h" +#include "llvm/ObjectYAML/ObjectYAML.h" +#include "llvm/ObjectYAML/yaml2obj.h" +#include "llvm/Support/Endian.h" +#include "llvm/Support/MemoryBuffer.h" +#include "llvm/Support/SourceMgr.h" +#include "llvm/Support/WithColor.h" +#include "llvm/Support/raw_ostream.h" +#include + +using namespace llvm; + +namespace { + +/// This parses a yaml stream that represents a COFF object file. +/// See docs/yaml2obj for the yaml scheema. +struct COFFParser { + COFFParser(COFFYAML::Object &Obj, yaml::ErrorHandler EH) + : Obj(Obj), SectionTableStart(0), SectionTableSize(0), ErrHandler(EH) { + // A COFF string table always starts with a 4 byte size field. Offsets into + // it include this size, so allocate it now. + StringTable.append(4, char(0)); + } + + bool useBigObj() const { + return static_cast(Obj.Sections.size()) > + COFF::MaxNumberOfSections16; + } + + bool isPE() const { return Obj.OptionalHeader.hasValue(); } + bool is64Bit() const { + return Obj.Header.Machine == COFF::IMAGE_FILE_MACHINE_AMD64 || + Obj.Header.Machine == COFF::IMAGE_FILE_MACHINE_ARM64; + } + + uint32_t getFileAlignment() const { + return Obj.OptionalHeader->Header.FileAlignment; + } + + unsigned getHeaderSize() const { + return useBigObj() ? COFF::Header32Size : COFF::Header16Size; + } + + unsigned getSymbolSize() const { + return useBigObj() ? COFF::Symbol32Size : COFF::Symbol16Size; + } + + bool parseSections() { + for (std::vector::iterator i = Obj.Sections.begin(), + e = Obj.Sections.end(); + i != e; ++i) { + COFFYAML::Section &Sec = *i; + + // If the name is less than 8 bytes, store it in place, otherwise + // store it in the string table. + StringRef Name = Sec.Name; + + if (Name.size() <= COFF::NameSize) { + std::copy(Name.begin(), Name.end(), Sec.Header.Name); + } else { + // Add string to the string table and format the index for output. + unsigned Index = getStringIndex(Name); + std::string str = utostr(Index); + if (str.size() > 7) { + ErrHandler("string table got too large"); + return false; + } + Sec.Header.Name[0] = '/'; + std::copy(str.begin(), str.end(), Sec.Header.Name + 1); + } + + if (Sec.Alignment) { + if (Sec.Alignment > 8192) { + ErrHandler("section alignment is too large"); + return false; + } + if (!isPowerOf2_32(Sec.Alignment)) { + ErrHandler("section alignment is not a power of 2"); + return false; + } + Sec.Header.Characteristics |= (Log2_32(Sec.Alignment) + 1) << 20; + } + } + return true; + } + + bool parseSymbols() { + for (std::vector::iterator i = Obj.Symbols.begin(), + e = Obj.Symbols.end(); + i != e; ++i) { + COFFYAML::Symbol &Sym = *i; + + // If the name is less than 8 bytes, store it in place, otherwise + // store it in the string table. + StringRef Name = Sym.Name; + if (Name.size() <= COFF::NameSize) { + std::copy(Name.begin(), Name.end(), Sym.Header.Name); + } else { + // Add string to the string table and format the index for output. 
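For reference, the symbol-name handling here follows the standard COFF rule: a name longer than 8 bytes is not stored inline; the first four bytes of the name field stay zero and the last four hold a little-endian offset into the string table (section headers use a different convention, '/' followed by a decimal offset, as handled in parseSections above). A tiny standalone sketch of the symbol variant, with a hypothetical helper name:

#include <cstdint>
#include <cstring>

// Encode a long symbol name reference into the fixed 8-byte COFF name
// field: bytes 0..3 are zero, bytes 4..7 hold the string-table offset in
// little-endian byte order.
static void encodeLongSymbolName(unsigned char (&Name)[8], uint32_t Offset) {
  std::memset(Name, 0, 4);
  Name[4] = static_cast<unsigned char>(Offset);
  Name[5] = static_cast<unsigned char>(Offset >> 8);
  Name[6] = static_cast<unsigned char>(Offset >> 16);
  Name[7] = static_cast<unsigned char>(Offset >> 24);
}

int main() {
  unsigned char Name[8];
  encodeLongSymbolName(Name, 0x10); // name lives at string-table offset 16
  return Name[0] == 0 && Name[4] == 0x10 && Name[7] == 0 ? 0 : 1;
}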
+ unsigned Index = getStringIndex(Name); + *reinterpret_cast(Sym.Header.Name + 4) = + Index; + } + + Sym.Header.Type = Sym.SimpleType; + Sym.Header.Type |= Sym.ComplexType << COFF::SCT_COMPLEX_TYPE_SHIFT; + } + return true; + } + + bool parse() { + if (!parseSections()) + return false; + if (!parseSymbols()) + return false; + return true; + } + + unsigned getStringIndex(StringRef Str) { + StringMap::iterator i = StringTableMap.find(Str); + if (i == StringTableMap.end()) { + unsigned Index = StringTable.size(); + StringTable.append(Str.begin(), Str.end()); + StringTable.push_back(0); + StringTableMap[Str] = Index; + return Index; + } + return i->second; + } + + COFFYAML::Object &Obj; + + codeview::StringsAndChecksums StringsAndChecksums; + BumpPtrAllocator Allocator; + StringMap StringTableMap; + std::string StringTable; + uint32_t SectionTableStart; + uint32_t SectionTableSize; + + yaml::ErrorHandler ErrHandler; +}; + +enum { DOSStubSize = 128 }; + +} // end anonymous namespace + +// Take a CP and assign addresses and sizes to everything. Returns false if the +// layout is not valid to do. +static bool layoutOptionalHeader(COFFParser &CP) { + if (!CP.isPE()) + return true; + unsigned PEHeaderSize = CP.is64Bit() ? sizeof(object::pe32plus_header) + : sizeof(object::pe32_header); + CP.Obj.Header.SizeOfOptionalHeader = + PEHeaderSize + + sizeof(object::data_directory) * (COFF::NUM_DATA_DIRECTORIES + 1); + return true; +} + +static yaml::BinaryRef +toDebugS(ArrayRef Subsections, + const codeview::StringsAndChecksums &SC, BumpPtrAllocator &Allocator) { + using namespace codeview; + ExitOnError Err("Error occurred writing .debug$S section"); + auto CVSS = + Err(CodeViewYAML::toCodeViewSubsectionList(Allocator, Subsections, SC)); + + std::vector Builders; + uint32_t Size = sizeof(uint32_t); + for (auto &SS : CVSS) { + DebugSubsectionRecordBuilder B(SS, CodeViewContainer::ObjectFile); + Size += B.calculateSerializedLength(); + Builders.push_back(std::move(B)); + } + uint8_t *Buffer = Allocator.Allocate(Size); + MutableArrayRef Output(Buffer, Size); + BinaryStreamWriter Writer(Output, support::little); + + Err(Writer.writeInteger(COFF::DEBUG_SECTION_MAGIC)); + for (const auto &B : Builders) { + Err(B.commit(Writer)); + } + return {Output}; +} + +// Take a CP and assign addresses and sizes to everything. Returns false if the +// layout is not valid to do. +static bool layoutCOFF(COFFParser &CP) { + // The section table starts immediately after the header, including the + // optional header. + CP.SectionTableStart = + CP.getHeaderSize() + CP.Obj.Header.SizeOfOptionalHeader; + if (CP.isPE()) + CP.SectionTableStart += DOSStubSize + sizeof(COFF::PEMagic); + CP.SectionTableSize = COFF::SectionSize * CP.Obj.Sections.size(); + + uint32_t CurrentSectionDataOffset = + CP.SectionTableStart + CP.SectionTableSize; + + for (COFFYAML::Section &S : CP.Obj.Sections) { + // We support specifying exactly one of SectionData or Subsections. So if + // there is already some SectionData, then we don't need to do any of this. + if (S.Name == ".debug$S" && S.SectionData.binary_size() == 0) { + CodeViewYAML::initializeStringsAndChecksums(S.DebugS, + CP.StringsAndChecksums); + if (CP.StringsAndChecksums.hasChecksums() && + CP.StringsAndChecksums.hasStrings()) + break; + } + } + + // Assign each section data address consecutively. 
+ for (COFFYAML::Section &S : CP.Obj.Sections) { + if (S.Name == ".debug$S") { + if (S.SectionData.binary_size() == 0) { + assert(CP.StringsAndChecksums.hasStrings() && + "Object file does not have debug string table!"); + + S.SectionData = + toDebugS(S.DebugS, CP.StringsAndChecksums, CP.Allocator); + } + } else if (S.Name == ".debug$T") { + if (S.SectionData.binary_size() == 0) + S.SectionData = CodeViewYAML::toDebugT(S.DebugT, CP.Allocator, S.Name); + } else if (S.Name == ".debug$P") { + if (S.SectionData.binary_size() == 0) + S.SectionData = CodeViewYAML::toDebugT(S.DebugP, CP.Allocator, S.Name); + } else if (S.Name == ".debug$H") { + if (S.DebugH.hasValue() && S.SectionData.binary_size() == 0) + S.SectionData = CodeViewYAML::toDebugH(*S.DebugH, CP.Allocator); + } + + if (S.SectionData.binary_size() > 0) { + CurrentSectionDataOffset = alignTo(CurrentSectionDataOffset, + CP.isPE() ? CP.getFileAlignment() : 4); + S.Header.SizeOfRawData = S.SectionData.binary_size(); + if (CP.isPE()) + S.Header.SizeOfRawData = + alignTo(S.Header.SizeOfRawData, CP.getFileAlignment()); + S.Header.PointerToRawData = CurrentSectionDataOffset; + CurrentSectionDataOffset += S.Header.SizeOfRawData; + if (!S.Relocations.empty()) { + S.Header.PointerToRelocations = CurrentSectionDataOffset; + S.Header.NumberOfRelocations = S.Relocations.size(); + CurrentSectionDataOffset += + S.Header.NumberOfRelocations * COFF::RelocationSize; + } + } else { + // Leave SizeOfRawData unaltered. For .bss sections in object files, it + // carries the section size. + S.Header.PointerToRawData = 0; + } + } + + uint32_t SymbolTableStart = CurrentSectionDataOffset; + + // Calculate number of symbols. + uint32_t NumberOfSymbols = 0; + for (std::vector::iterator i = CP.Obj.Symbols.begin(), + e = CP.Obj.Symbols.end(); + i != e; ++i) { + uint32_t NumberOfAuxSymbols = 0; + if (i->FunctionDefinition) + NumberOfAuxSymbols += 1; + if (i->bfAndefSymbol) + NumberOfAuxSymbols += 1; + if (i->WeakExternal) + NumberOfAuxSymbols += 1; + if (!i->File.empty()) + NumberOfAuxSymbols += + (i->File.size() + CP.getSymbolSize() - 1) / CP.getSymbolSize(); + if (i->SectionDefinition) + NumberOfAuxSymbols += 1; + if (i->CLRToken) + NumberOfAuxSymbols += 1; + i->Header.NumberOfAuxSymbols = NumberOfAuxSymbols; + NumberOfSymbols += 1 + NumberOfAuxSymbols; + } + + // Store all the allocated start addresses in the header. 
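One detail of the symbol counting above worth spelling out: a .file auxiliary name is spread across as many auxiliary records as needed, so the count is a ceiling division of the name length by the symbol record size (18 bytes for the classic COFF symbol record, 20 bytes in big-object mode). A quick sketch of that arithmetic, illustration only:

#include <cstddef>
#include <cstdint>
#include <cstdio>

// Number of auxiliary records needed to hold a .file name of Len bytes
// when each symbol record is RecordSize bytes (ceiling division).
static uint32_t fileAuxRecordCount(std::size_t Len, std::size_t RecordSize) {
  return static_cast<uint32_t>((Len + RecordSize - 1) / RecordSize);
}

int main() {
  std::printf("%u\n", fileAuxRecordCount(5, 18));  // "a.cpp" -> 1 record
  std::printf("%u\n", fileAuxRecordCount(19, 18)); // 19 bytes -> 2 records
  return 0;
}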
+ CP.Obj.Header.NumberOfSections = CP.Obj.Sections.size(); + CP.Obj.Header.NumberOfSymbols = NumberOfSymbols; + if (NumberOfSymbols > 0 || CP.StringTable.size() > 4) + CP.Obj.Header.PointerToSymbolTable = SymbolTableStart; + else + CP.Obj.Header.PointerToSymbolTable = 0; + + *reinterpret_cast(&CP.StringTable[0]) = + CP.StringTable.size(); + + return true; +} + +template struct binary_le_impl { + value_type Value; + binary_le_impl(value_type V) : Value(V) {} +}; + +template +raw_ostream &operator<<(raw_ostream &OS, + const binary_le_impl &BLE) { + char Buffer[sizeof(BLE.Value)]; + support::endian::write( + Buffer, BLE.Value); + OS.write(Buffer, sizeof(BLE.Value)); + return OS; +} + +template +binary_le_impl binary_le(value_type V) { + return binary_le_impl(V); +} + +template struct zeros_impl {}; + +template +raw_ostream &operator<<(raw_ostream &OS, const zeros_impl &) { + char Buffer[NumBytes]; + memset(Buffer, 0, sizeof(Buffer)); + OS.write(Buffer, sizeof(Buffer)); + return OS; +} + +template zeros_impl zeros(const T &) { + return zeros_impl(); +} + +template +static uint32_t initializeOptionalHeader(COFFParser &CP, uint16_t Magic, + T Header) { + memset(Header, 0, sizeof(*Header)); + Header->Magic = Magic; + Header->SectionAlignment = CP.Obj.OptionalHeader->Header.SectionAlignment; + Header->FileAlignment = CP.Obj.OptionalHeader->Header.FileAlignment; + uint32_t SizeOfCode = 0, SizeOfInitializedData = 0, + SizeOfUninitializedData = 0; + uint32_t SizeOfHeaders = alignTo(CP.SectionTableStart + CP.SectionTableSize, + Header->FileAlignment); + uint32_t SizeOfImage = alignTo(SizeOfHeaders, Header->SectionAlignment); + uint32_t BaseOfData = 0; + for (const COFFYAML::Section &S : CP.Obj.Sections) { + if (S.Header.Characteristics & COFF::IMAGE_SCN_CNT_CODE) + SizeOfCode += S.Header.SizeOfRawData; + if (S.Header.Characteristics & COFF::IMAGE_SCN_CNT_INITIALIZED_DATA) + SizeOfInitializedData += S.Header.SizeOfRawData; + if (S.Header.Characteristics & COFF::IMAGE_SCN_CNT_UNINITIALIZED_DATA) + SizeOfUninitializedData += S.Header.SizeOfRawData; + if (S.Name.equals(".text")) + Header->BaseOfCode = S.Header.VirtualAddress; // RVA + else if (S.Name.equals(".data")) + BaseOfData = S.Header.VirtualAddress; // RVA + if (S.Header.VirtualAddress) + SizeOfImage += alignTo(S.Header.VirtualSize, Header->SectionAlignment); + } + Header->SizeOfCode = SizeOfCode; + Header->SizeOfInitializedData = SizeOfInitializedData; + Header->SizeOfUninitializedData = SizeOfUninitializedData; + Header->AddressOfEntryPoint = + CP.Obj.OptionalHeader->Header.AddressOfEntryPoint; // RVA + Header->ImageBase = CP.Obj.OptionalHeader->Header.ImageBase; + Header->MajorOperatingSystemVersion = + CP.Obj.OptionalHeader->Header.MajorOperatingSystemVersion; + Header->MinorOperatingSystemVersion = + CP.Obj.OptionalHeader->Header.MinorOperatingSystemVersion; + Header->MajorImageVersion = CP.Obj.OptionalHeader->Header.MajorImageVersion; + Header->MinorImageVersion = CP.Obj.OptionalHeader->Header.MinorImageVersion; + Header->MajorSubsystemVersion = + CP.Obj.OptionalHeader->Header.MajorSubsystemVersion; + Header->MinorSubsystemVersion = + CP.Obj.OptionalHeader->Header.MinorSubsystemVersion; + Header->SizeOfImage = SizeOfImage; + Header->SizeOfHeaders = SizeOfHeaders; + Header->Subsystem = CP.Obj.OptionalHeader->Header.Subsystem; + Header->DLLCharacteristics = CP.Obj.OptionalHeader->Header.DLLCharacteristics; + Header->SizeOfStackReserve = CP.Obj.OptionalHeader->Header.SizeOfStackReserve; + Header->SizeOfStackCommit = 
CP.Obj.OptionalHeader->Header.SizeOfStackCommit; + Header->SizeOfHeapReserve = CP.Obj.OptionalHeader->Header.SizeOfHeapReserve; + Header->SizeOfHeapCommit = CP.Obj.OptionalHeader->Header.SizeOfHeapCommit; + Header->NumberOfRvaAndSize = COFF::NUM_DATA_DIRECTORIES + 1; + return BaseOfData; +} + +static bool writeCOFF(COFFParser &CP, raw_ostream &OS) { + if (CP.isPE()) { + // PE files start with a DOS stub. + object::dos_header DH; + memset(&DH, 0, sizeof(DH)); + + // DOS EXEs start with "MZ" magic. + DH.Magic[0] = 'M'; + DH.Magic[1] = 'Z'; + // Initializing the AddressOfRelocationTable is strictly optional but + // mollifies certain tools which expect it to have a value greater than + // 0x40. + DH.AddressOfRelocationTable = sizeof(DH); + // This is the address of the PE signature. + DH.AddressOfNewExeHeader = DOSStubSize; + + // Write out our DOS stub. + OS.write(reinterpret_cast(&DH), sizeof(DH)); + // Write padding until we reach the position of where our PE signature + // should live. + OS.write_zeros(DOSStubSize - sizeof(DH)); + // Write out the PE signature. + OS.write(COFF::PEMagic, sizeof(COFF::PEMagic)); + } + if (CP.useBigObj()) { + OS << binary_le(static_cast(COFF::IMAGE_FILE_MACHINE_UNKNOWN)) + << binary_le(static_cast(0xffff)) + << binary_le( + static_cast(COFF::BigObjHeader::MinBigObjectVersion)) + << binary_le(CP.Obj.Header.Machine) + << binary_le(CP.Obj.Header.TimeDateStamp); + OS.write(COFF::BigObjMagic, sizeof(COFF::BigObjMagic)); + OS << zeros(uint32_t(0)) << zeros(uint32_t(0)) << zeros(uint32_t(0)) + << zeros(uint32_t(0)) << binary_le(CP.Obj.Header.NumberOfSections) + << binary_le(CP.Obj.Header.PointerToSymbolTable) + << binary_le(CP.Obj.Header.NumberOfSymbols); + } else { + OS << binary_le(CP.Obj.Header.Machine) + << binary_le(static_cast(CP.Obj.Header.NumberOfSections)) + << binary_le(CP.Obj.Header.TimeDateStamp) + << binary_le(CP.Obj.Header.PointerToSymbolTable) + << binary_le(CP.Obj.Header.NumberOfSymbols) + << binary_le(CP.Obj.Header.SizeOfOptionalHeader) + << binary_le(CP.Obj.Header.Characteristics); + } + if (CP.isPE()) { + if (CP.is64Bit()) { + object::pe32plus_header PEH; + initializeOptionalHeader(CP, COFF::PE32Header::PE32_PLUS, &PEH); + OS.write(reinterpret_cast(&PEH), sizeof(PEH)); + } else { + object::pe32_header PEH; + uint32_t BaseOfData = + initializeOptionalHeader(CP, COFF::PE32Header::PE32, &PEH); + PEH.BaseOfData = BaseOfData; + OS.write(reinterpret_cast(&PEH), sizeof(PEH)); + } + for (const Optional &DD : + CP.Obj.OptionalHeader->DataDirectories) { + if (!DD.hasValue()) { + OS << zeros(uint32_t(0)); + OS << zeros(uint32_t(0)); + } else { + OS << binary_le(DD->RelativeVirtualAddress); + OS << binary_le(DD->Size); + } + } + OS << zeros(uint32_t(0)); + OS << zeros(uint32_t(0)); + } + + assert(OS.tell() == CP.SectionTableStart); + // Output section table. 
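The binary_le and zeros stream helpers that writeCOFF relies on (defined a little earlier) are thin wrappers that serialize integer fields in little-endian byte order regardless of the host's native byte order. A minimal standalone equivalent without the llvm::support machinery, using hypothetical names:

#include <cstddef>
#include <cstdint>
#include <iostream>
#include <sstream>
#include <type_traits>

// Write an unsigned integer to a byte stream in little-endian order,
// independent of the host's endianness.
template <typename T> static void writeLE(std::ostream &OS, T Value) {
  static_assert(std::is_unsigned<T>::value, "expects an unsigned type");
  for (std::size_t I = 0; I < sizeof(T); ++I)
    OS.put(static_cast<char>((Value >> (8 * I)) & 0xff));
}

int main() {
  std::ostringstream OS;
  writeLE<uint16_t>(OS, 0x8664); // e.g. an IMAGE_FILE_MACHINE_AMD64 field
  writeLE<uint32_t>(OS, 0);      // a zeroed 32-bit field
  std::cout << OS.str().size() << " bytes\n"; // 6 bytes
  return 0;
}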
+ for (std::vector::iterator i = CP.Obj.Sections.begin(), + e = CP.Obj.Sections.end(); + i != e; ++i) { + OS.write(i->Header.Name, COFF::NameSize); + OS << binary_le(i->Header.VirtualSize) + << binary_le(i->Header.VirtualAddress) + << binary_le(i->Header.SizeOfRawData) + << binary_le(i->Header.PointerToRawData) + << binary_le(i->Header.PointerToRelocations) + << binary_le(i->Header.PointerToLineNumbers) + << binary_le(i->Header.NumberOfRelocations) + << binary_le(i->Header.NumberOfLineNumbers) + << binary_le(i->Header.Characteristics); + } + assert(OS.tell() == CP.SectionTableStart + CP.SectionTableSize); + + unsigned CurSymbol = 0; + StringMap SymbolTableIndexMap; + for (std::vector::iterator I = CP.Obj.Symbols.begin(), + E = CP.Obj.Symbols.end(); + I != E; ++I) { + SymbolTableIndexMap[I->Name] = CurSymbol; + CurSymbol += 1 + I->Header.NumberOfAuxSymbols; + } + + // Output section data. + for (const COFFYAML::Section &S : CP.Obj.Sections) { + if (S.Header.SizeOfRawData == 0 || S.Header.PointerToRawData == 0) + continue; + assert(S.Header.PointerToRawData >= OS.tell()); + OS.write_zeros(S.Header.PointerToRawData - OS.tell()); + S.SectionData.writeAsBinary(OS); + assert(S.Header.SizeOfRawData >= S.SectionData.binary_size()); + OS.write_zeros(S.Header.SizeOfRawData - S.SectionData.binary_size()); + for (const COFFYAML::Relocation &R : S.Relocations) { + uint32_t SymbolTableIndex; + if (R.SymbolTableIndex) { + if (!R.SymbolName.empty()) + WithColor::error() + << "Both SymbolName and SymbolTableIndex specified\n"; + SymbolTableIndex = *R.SymbolTableIndex; + } else { + SymbolTableIndex = SymbolTableIndexMap[R.SymbolName]; + } + OS << binary_le(R.VirtualAddress) << binary_le(SymbolTableIndex) + << binary_le(R.Type); + } + } + + // Output symbol table. + + for (std::vector::const_iterator i = CP.Obj.Symbols.begin(), + e = CP.Obj.Symbols.end(); + i != e; ++i) { + OS.write(i->Header.Name, COFF::NameSize); + OS << binary_le(i->Header.Value); + if (CP.useBigObj()) + OS << binary_le(i->Header.SectionNumber); + else + OS << binary_le(static_cast(i->Header.SectionNumber)); + OS << binary_le(i->Header.Type) << binary_le(i->Header.StorageClass) + << binary_le(i->Header.NumberOfAuxSymbols); + + if (i->FunctionDefinition) { + OS << binary_le(i->FunctionDefinition->TagIndex) + << binary_le(i->FunctionDefinition->TotalSize) + << binary_le(i->FunctionDefinition->PointerToLinenumber) + << binary_le(i->FunctionDefinition->PointerToNextFunction) + << zeros(i->FunctionDefinition->unused); + OS.write_zeros(CP.getSymbolSize() - COFF::Symbol16Size); + } + if (i->bfAndefSymbol) { + OS << zeros(i->bfAndefSymbol->unused1) + << binary_le(i->bfAndefSymbol->Linenumber) + << zeros(i->bfAndefSymbol->unused2) + << binary_le(i->bfAndefSymbol->PointerToNextFunction) + << zeros(i->bfAndefSymbol->unused3); + OS.write_zeros(CP.getSymbolSize() - COFF::Symbol16Size); + } + if (i->WeakExternal) { + OS << binary_le(i->WeakExternal->TagIndex) + << binary_le(i->WeakExternal->Characteristics) + << zeros(i->WeakExternal->unused); + OS.write_zeros(CP.getSymbolSize() - COFF::Symbol16Size); + } + if (!i->File.empty()) { + unsigned SymbolSize = CP.getSymbolSize(); + uint32_t NumberOfAuxRecords = + (i->File.size() + SymbolSize - 1) / SymbolSize; + uint32_t NumberOfAuxBytes = NumberOfAuxRecords * SymbolSize; + uint32_t NumZeros = NumberOfAuxBytes - i->File.size(); + OS.write(i->File.data(), i->File.size()); + OS.write_zeros(NumZeros); + } + if (i->SectionDefinition) { + OS << binary_le(i->SectionDefinition->Length) + << 
binary_le(i->SectionDefinition->NumberOfRelocations) + << binary_le(i->SectionDefinition->NumberOfLinenumbers) + << binary_le(i->SectionDefinition->CheckSum) + << binary_le(static_cast(i->SectionDefinition->Number)) + << binary_le(i->SectionDefinition->Selection) + << zeros(i->SectionDefinition->unused) + << binary_le(static_cast(i->SectionDefinition->Number >> 16)); + OS.write_zeros(CP.getSymbolSize() - COFF::Symbol16Size); + } + if (i->CLRToken) { + OS << binary_le(i->CLRToken->AuxType) << zeros(i->CLRToken->unused1) + << binary_le(i->CLRToken->SymbolTableIndex) + << zeros(i->CLRToken->unused2); + OS.write_zeros(CP.getSymbolSize() - COFF::Symbol16Size); + } + } + + // Output string table. + if (CP.Obj.Header.PointerToSymbolTable) + OS.write(&CP.StringTable[0], CP.StringTable.size()); + return true; +} + +namespace llvm { +namespace yaml { + +bool yaml2coff(llvm::COFFYAML::Object &Doc, raw_ostream &Out, + ErrorHandler ErrHandler) { + COFFParser CP(Doc, ErrHandler); + if (!CP.parse()) { + ErrHandler("failed to parse YAML file"); + return false; + } + + if (!layoutOptionalHeader(CP)) { + ErrHandler("failed to layout optional header for COFF file"); + return false; + } + + if (!layoutCOFF(CP)) { + ErrHandler("failed to layout COFF file"); + return false; + } + if (!writeCOFF(CP, Out)) { + ErrHandler("failed to write COFF file"); + return false; + } + return true; +} + +} // namespace yaml +} // namespace llvm diff --git a/lib/ObjectYAML/CodeViewYAMLSymbols.cpp b/lib/ObjectYAML/CodeViewYAMLSymbols.cpp index 227107c051dd..95409fdc3300 100644 --- a/lib/ObjectYAML/CodeViewYAMLSymbols.cpp +++ b/lib/ObjectYAML/CodeViewYAMLSymbols.cpp @@ -391,7 +391,7 @@ template <> void SymbolRecordImpl::map(IO &IO) { } template <> void SymbolRecordImpl::map(IO &IO) { - IO.mapRequired("Offset", Symbol.Offset); + IO.mapRequired("Offset", Symbol.Hdr.Offset); IO.mapRequired("Range", Symbol.Range); IO.mapRequired("Gaps", Symbol.Gaps); } diff --git a/lib/ObjectYAML/ELFEmitter.cpp b/lib/ObjectYAML/ELFEmitter.cpp new file mode 100644 index 000000000000..e0faed256f6b --- /dev/null +++ b/lib/ObjectYAML/ELFEmitter.cpp @@ -0,0 +1,1152 @@ +//===- yaml2elf - Convert YAML to a ELF object file -----------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// The ELF component of yaml2obj. +/// +//===----------------------------------------------------------------------===// + +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/StringSet.h" +#include "llvm/BinaryFormat/ELF.h" +#include "llvm/MC/StringTableBuilder.h" +#include "llvm/Object/ELFObjectFile.h" +#include "llvm/ObjectYAML/ELFYAML.h" +#include "llvm/ObjectYAML/yaml2obj.h" +#include "llvm/Support/EndianStream.h" +#include "llvm/Support/LEB128.h" +#include "llvm/Support/MemoryBuffer.h" +#include "llvm/Support/WithColor.h" +#include "llvm/Support/YAMLTraits.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +// This class is used to build up a contiguous binary blob while keeping +// track of an offset in the output (which notionally begins at +// `InitialOffset`). +namespace { +class ContiguousBlobAccumulator { + const uint64_t InitialOffset; + SmallVector Buf; + raw_svector_ostream OS; + + /// \returns The new offset. 
+ uint64_t padToAlignment(unsigned Align) { + if (Align == 0) + Align = 1; + uint64_t CurrentOffset = InitialOffset + OS.tell(); + uint64_t AlignedOffset = alignTo(CurrentOffset, Align); + OS.write_zeros(AlignedOffset - CurrentOffset); + return AlignedOffset; // == CurrentOffset; + } + +public: + ContiguousBlobAccumulator(uint64_t InitialOffset_) + : InitialOffset(InitialOffset_), Buf(), OS(Buf) {} + template + raw_ostream &getOSAndAlignedOffset(Integer &Offset, unsigned Align) { + Offset = padToAlignment(Align); + return OS; + } + void writeBlobToStream(raw_ostream &Out) { Out << OS.str(); } +}; + +// Used to keep track of section and symbol names, so that in the YAML file +// sections and symbols can be referenced by name instead of by index. +class NameToIdxMap { + StringMap Map; + +public: + /// \Returns false if name is already present in the map. + bool addName(StringRef Name, unsigned Ndx) { + return Map.insert({Name, Ndx}).second; + } + /// \Returns false if name is not present in the map. + bool lookup(StringRef Name, unsigned &Idx) const { + auto I = Map.find(Name); + if (I == Map.end()) + return false; + Idx = I->getValue(); + return true; + } + /// Asserts if name is not present in the map. + unsigned get(StringRef Name) const { + unsigned Idx; + if (lookup(Name, Idx)) + return Idx; + assert(false && "Expected section not found in index"); + return 0; + } + unsigned size() const { return Map.size(); } +}; + +/// "Single point of truth" for the ELF file construction. +/// TODO: This class still has a ways to go before it is truly a "single +/// point of truth". +template class ELFState { + typedef typename ELFT::Ehdr Elf_Ehdr; + typedef typename ELFT::Phdr Elf_Phdr; + typedef typename ELFT::Shdr Elf_Shdr; + typedef typename ELFT::Sym Elf_Sym; + typedef typename ELFT::Rel Elf_Rel; + typedef typename ELFT::Rela Elf_Rela; + typedef typename ELFT::Relr Elf_Relr; + typedef typename ELFT::Dyn Elf_Dyn; + + enum class SymtabType { Static, Dynamic }; + + /// The future ".strtab" section. + StringTableBuilder DotStrtab{StringTableBuilder::ELF}; + + /// The future ".shstrtab" section. + StringTableBuilder DotShStrtab{StringTableBuilder::ELF}; + + /// The future ".dynstr" section. 
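The padToAlignment helper above is the usual round-up-to-a-multiple computation followed by a zero fill, applied to the notional file offset (InitialOffset plus whatever has already been streamed). A bare-bones sketch of just the arithmetic, with no LLVM types:

#include <cassert>
#include <cstdint>

// Round Offset up to the next multiple of Align; an Align of 0 is treated
// as 1 (no alignment requirement), matching padToAlignment above and
// similar in spirit to llvm::alignTo.
static uint64_t alignUp(uint64_t Offset, uint64_t Align) {
  if (Align == 0)
    Align = 1;
  return (Offset + Align - 1) / Align * Align;
}

int main() {
  assert(alignUp(0x41, 16) == 0x50);
  assert(alignUp(0x40, 16) == 0x40); // already aligned: unchanged
  assert(alignUp(7, 0) == 7);        // Align 0 behaves like 1
  return 0;
}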
+ StringTableBuilder DotDynstr{StringTableBuilder::ELF}; + + NameToIdxMap SN2I; + NameToIdxMap SymN2I; + NameToIdxMap DynSymN2I; + ELFYAML::Object &Doc; + + bool HasError = false; + yaml::ErrorHandler ErrHandler; + void reportError(const Twine &Msg); + + std::vector toELFSymbols(ArrayRef Symbols, + const StringTableBuilder &Strtab); + unsigned toSectionIndex(StringRef S, StringRef LocSec, StringRef LocSym = ""); + unsigned toSymbolIndex(StringRef S, StringRef LocSec, bool IsDynamic); + + void buildSectionIndex(); + void buildSymbolIndexes(); + void initProgramHeaders(std::vector &PHeaders); + bool initImplicitHeader(ContiguousBlobAccumulator &CBA, Elf_Shdr &Header, + StringRef SecName, ELFYAML::Section *YAMLSec); + void initSectionHeaders(std::vector &SHeaders, + ContiguousBlobAccumulator &CBA); + void initSymtabSectionHeader(Elf_Shdr &SHeader, SymtabType STType, + ContiguousBlobAccumulator &CBA, + ELFYAML::Section *YAMLSec); + void initStrtabSectionHeader(Elf_Shdr &SHeader, StringRef Name, + StringTableBuilder &STB, + ContiguousBlobAccumulator &CBA, + ELFYAML::Section *YAMLSec); + void setProgramHeaderLayout(std::vector &PHeaders, + std::vector &SHeaders); + void finalizeStrings(); + void writeELFHeader(ContiguousBlobAccumulator &CBA, raw_ostream &OS); + void writeSectionContent(Elf_Shdr &SHeader, + const ELFYAML::RawContentSection &Section, + ContiguousBlobAccumulator &CBA); + void writeSectionContent(Elf_Shdr &SHeader, + const ELFYAML::RelocationSection &Section, + ContiguousBlobAccumulator &CBA); + void writeSectionContent(Elf_Shdr &SHeader, const ELFYAML::Group &Group, + ContiguousBlobAccumulator &CBA); + void writeSectionContent(Elf_Shdr &SHeader, + const ELFYAML::SymtabShndxSection &Shndx, + ContiguousBlobAccumulator &CBA); + void writeSectionContent(Elf_Shdr &SHeader, + const ELFYAML::SymverSection &Section, + ContiguousBlobAccumulator &CBA); + void writeSectionContent(Elf_Shdr &SHeader, + const ELFYAML::VerneedSection &Section, + ContiguousBlobAccumulator &CBA); + void writeSectionContent(Elf_Shdr &SHeader, + const ELFYAML::VerdefSection &Section, + ContiguousBlobAccumulator &CBA); + void writeSectionContent(Elf_Shdr &SHeader, + const ELFYAML::MipsABIFlags &Section, + ContiguousBlobAccumulator &CBA); + void writeSectionContent(Elf_Shdr &SHeader, + const ELFYAML::DynamicSection &Section, + ContiguousBlobAccumulator &CBA); + void writeSectionContent(Elf_Shdr &SHeader, + const ELFYAML::StackSizesSection &Section, + ContiguousBlobAccumulator &CBA); + void writeSectionContent(Elf_Shdr &SHeader, + const ELFYAML::HashSection &Section, + ContiguousBlobAccumulator &CBA); + void writeSectionContent(Elf_Shdr &SHeader, + const ELFYAML::AddrsigSection &Section, + ContiguousBlobAccumulator &CBA); + + ELFState(ELFYAML::Object &D, yaml::ErrorHandler EH); + +public: + static bool writeELF(raw_ostream &OS, ELFYAML::Object &Doc, + yaml::ErrorHandler EH); +}; +} // end anonymous namespace + +template static size_t arrayDataSize(ArrayRef A) { + return A.size() * sizeof(T); +} + +template static void writeArrayData(raw_ostream &OS, ArrayRef A) { + OS.write((const char *)A.data(), arrayDataSize(A)); +} + +template static void zero(T &Obj) { memset(&Obj, 0, sizeof(Obj)); } + +template +ELFState::ELFState(ELFYAML::Object &D, yaml::ErrorHandler EH) + : Doc(D), ErrHandler(EH) { + StringSet<> DocSections; + for (std::unique_ptr &D : Doc.Sections) { + if (!D->Name.empty()) + DocSections.insert(D->Name); + + // Some sections wants to link to .symtab by default. 
+ // That means we want to create the symbol table for them. + if (D->Type == llvm::ELF::SHT_REL || D->Type == llvm::ELF::SHT_RELA) + if (!Doc.Symbols && D->Link.empty()) + Doc.Symbols.emplace(); + } + + // Insert SHT_NULL section implicitly when it is not defined in YAML. + if (Doc.Sections.empty() || Doc.Sections.front()->Type != ELF::SHT_NULL) + Doc.Sections.insert( + Doc.Sections.begin(), + std::make_unique( + ELFYAML::Section::SectionKind::RawContent, /*IsImplicit=*/true)); + + std::vector ImplicitSections; + if (Doc.Symbols) + ImplicitSections.push_back(".symtab"); + ImplicitSections.insert(ImplicitSections.end(), {".strtab", ".shstrtab"}); + + if (!Doc.DynamicSymbols.empty()) + ImplicitSections.insert(ImplicitSections.end(), {".dynsym", ".dynstr"}); + + // Insert placeholders for implicit sections that are not + // defined explicitly in YAML. + for (StringRef SecName : ImplicitSections) { + if (DocSections.count(SecName)) + continue; + + std::unique_ptr Sec = std::make_unique( + ELFYAML::Section::SectionKind::RawContent, true /*IsImplicit*/); + Sec->Name = SecName; + Doc.Sections.push_back(std::move(Sec)); + } +} + +template +void ELFState::writeELFHeader(ContiguousBlobAccumulator &CBA, raw_ostream &OS) { + using namespace llvm::ELF; + + Elf_Ehdr Header; + zero(Header); + Header.e_ident[EI_MAG0] = 0x7f; + Header.e_ident[EI_MAG1] = 'E'; + Header.e_ident[EI_MAG2] = 'L'; + Header.e_ident[EI_MAG3] = 'F'; + Header.e_ident[EI_CLASS] = ELFT::Is64Bits ? ELFCLASS64 : ELFCLASS32; + Header.e_ident[EI_DATA] = Doc.Header.Data; + Header.e_ident[EI_VERSION] = EV_CURRENT; + Header.e_ident[EI_OSABI] = Doc.Header.OSABI; + Header.e_ident[EI_ABIVERSION] = Doc.Header.ABIVersion; + Header.e_type = Doc.Header.Type; + Header.e_machine = Doc.Header.Machine; + Header.e_version = EV_CURRENT; + Header.e_entry = Doc.Header.Entry; + Header.e_phoff = Doc.ProgramHeaders.size() ? sizeof(Header) : 0; + Header.e_flags = Doc.Header.Flags; + Header.e_ehsize = sizeof(Elf_Ehdr); + Header.e_phentsize = Doc.ProgramHeaders.size() ? sizeof(Elf_Phdr) : 0; + Header.e_phnum = Doc.ProgramHeaders.size(); + + Header.e_shentsize = + Doc.Header.SHEntSize ? (uint16_t)*Doc.Header.SHEntSize : sizeof(Elf_Shdr); + // Immediately following the ELF header and program headers. + // Align the start of the section header and write the ELF header. + uint64_t SHOff; + CBA.getOSAndAlignedOffset(SHOff, sizeof(typename ELFT::uint)); + Header.e_shoff = + Doc.Header.SHOff ? typename ELFT::uint(*Doc.Header.SHOff) : SHOff; + Header.e_shnum = + Doc.Header.SHNum ? (uint16_t)*Doc.Header.SHNum : Doc.Sections.size(); + Header.e_shstrndx = Doc.Header.SHStrNdx ? 
(uint16_t)*Doc.Header.SHStrNdx + : SN2I.get(".shstrtab"); + + OS.write((const char *)&Header, sizeof(Header)); +} + +template +void ELFState::initProgramHeaders(std::vector &PHeaders) { + for (const auto &YamlPhdr : Doc.ProgramHeaders) { + Elf_Phdr Phdr; + Phdr.p_type = YamlPhdr.Type; + Phdr.p_flags = YamlPhdr.Flags; + Phdr.p_vaddr = YamlPhdr.VAddr; + Phdr.p_paddr = YamlPhdr.PAddr; + PHeaders.push_back(Phdr); + } +} + +template +unsigned ELFState::toSectionIndex(StringRef S, StringRef LocSec, + StringRef LocSym) { + unsigned Index; + if (SN2I.lookup(S, Index) || to_integer(S, Index)) + return Index; + + assert(LocSec.empty() || LocSym.empty()); + if (!LocSym.empty()) + reportError("unknown section referenced: '" + S + "' by YAML symbol '" + + LocSym + "'"); + else + reportError("unknown section referenced: '" + S + "' by YAML section '" + + LocSec + "'"); + return 0; +} + +template +unsigned ELFState::toSymbolIndex(StringRef S, StringRef LocSec, + bool IsDynamic) { + const NameToIdxMap &SymMap = IsDynamic ? DynSymN2I : SymN2I; + unsigned Index; + // Here we try to look up S in the symbol table. If it is not there, + // treat its value as a symbol index. + if (!SymMap.lookup(S, Index) && !to_integer(S, Index)) { + reportError("unknown symbol referenced: '" + S + "' by YAML section '" + + LocSec + "'"); + return 0; + } + return Index; +} + +template +bool ELFState::initImplicitHeader(ContiguousBlobAccumulator &CBA, + Elf_Shdr &Header, StringRef SecName, + ELFYAML::Section *YAMLSec) { + // Check if the header was already initialized. + if (Header.sh_offset) + return false; + + if (SecName == ".symtab") + initSymtabSectionHeader(Header, SymtabType::Static, CBA, YAMLSec); + else if (SecName == ".strtab") + initStrtabSectionHeader(Header, SecName, DotStrtab, CBA, YAMLSec); + else if (SecName == ".shstrtab") + initStrtabSectionHeader(Header, SecName, DotShStrtab, CBA, YAMLSec); + else if (SecName == ".dynsym") + initSymtabSectionHeader(Header, SymtabType::Dynamic, CBA, YAMLSec); + else if (SecName == ".dynstr") + initStrtabSectionHeader(Header, SecName, DotDynstr, CBA, YAMLSec); + else + return false; + + // Override the fields if requested. + if (YAMLSec) { + if (YAMLSec->ShName) + Header.sh_name = *YAMLSec->ShName; + if (YAMLSec->ShOffset) + Header.sh_offset = *YAMLSec->ShOffset; + if (YAMLSec->ShSize) + Header.sh_size = *YAMLSec->ShSize; + } + + return true; +} + +StringRef llvm::ELFYAML::dropUniqueSuffix(StringRef S) { + size_t SuffixPos = S.rfind(" ["); + if (SuffixPos == StringRef::npos) + return S; + return S.substr(0, SuffixPos); +} + +template +void ELFState::initSectionHeaders(std::vector &SHeaders, + ContiguousBlobAccumulator &CBA) { + // Ensure SHN_UNDEF entry is present. An all-zero section header is a + // valid SHN_UNDEF entry since SHT_NULL == 0. + SHeaders.resize(Doc.Sections.size()); + + for (size_t I = 0; I < Doc.Sections.size(); ++I) { + ELFYAML::Section *Sec = Doc.Sections[I].get(); + if (I == 0 && Sec->IsImplicit) + continue; + + // We have a few sections like string or symbol tables that are usually + // added implicitly to the end. However, if they are explicitly specified + // in the YAML, we need to write them here. This ensures the file offset + // remains correct. + Elf_Shdr &SHeader = SHeaders[I]; + if (initImplicitHeader(CBA, SHeader, Sec->Name, + Sec->IsImplicit ? nullptr : Sec)) + continue; + + assert(Sec && "It can't be null unless it is an implicit section. 
But all " + "implicit sections should already have been handled above."); + + SHeader.sh_name = + DotShStrtab.getOffset(ELFYAML::dropUniqueSuffix(Sec->Name)); + SHeader.sh_type = Sec->Type; + if (Sec->Flags) + SHeader.sh_flags = *Sec->Flags; + SHeader.sh_addr = Sec->Address; + SHeader.sh_addralign = Sec->AddressAlign; + + if (!Sec->Link.empty()) + SHeader.sh_link = toSectionIndex(Sec->Link, Sec->Name); + + if (I == 0) { + if (auto RawSec = dyn_cast(Sec)) { + // We do not write any content for special SHN_UNDEF section. + if (RawSec->Size) + SHeader.sh_size = *RawSec->Size; + if (RawSec->Info) + SHeader.sh_info = *RawSec->Info; + } + if (Sec->EntSize) + SHeader.sh_entsize = *Sec->EntSize; + } else if (auto S = dyn_cast(Sec)) { + writeSectionContent(SHeader, *S, CBA); + } else if (auto S = dyn_cast(Sec)) { + writeSectionContent(SHeader, *S, CBA); + } else if (auto S = dyn_cast(Sec)) { + writeSectionContent(SHeader, *S, CBA); + } else if (auto S = dyn_cast(Sec)) { + writeSectionContent(SHeader, *S, CBA); + } else if (auto S = dyn_cast(Sec)) { + writeSectionContent(SHeader, *S, CBA); + } else if (auto S = dyn_cast(Sec)) { + SHeader.sh_entsize = 0; + SHeader.sh_size = S->Size; + // SHT_NOBITS section does not have content + // so just to setup the section offset. + CBA.getOSAndAlignedOffset(SHeader.sh_offset, SHeader.sh_addralign); + } else if (auto S = dyn_cast(Sec)) { + writeSectionContent(SHeader, *S, CBA); + } else if (auto S = dyn_cast(Sec)) { + writeSectionContent(SHeader, *S, CBA); + } else if (auto S = dyn_cast(Sec)) { + writeSectionContent(SHeader, *S, CBA); + } else if (auto S = dyn_cast(Sec)) { + writeSectionContent(SHeader, *S, CBA); + } else if (auto S = dyn_cast(Sec)) { + writeSectionContent(SHeader, *S, CBA); + } else if (auto S = dyn_cast(Sec)) { + writeSectionContent(SHeader, *S, CBA); + } else if (auto S = dyn_cast(Sec)) { + writeSectionContent(SHeader, *S, CBA); + } else { + llvm_unreachable("Unknown section type"); + } + + // Override the fields if requested. + if (Sec) { + if (Sec->ShName) + SHeader.sh_name = *Sec->ShName; + if (Sec->ShOffset) + SHeader.sh_offset = *Sec->ShOffset; + if (Sec->ShSize) + SHeader.sh_size = *Sec->ShSize; + } + } +} + +static size_t findFirstNonGlobal(ArrayRef Symbols) { + for (size_t I = 0; I < Symbols.size(); ++I) + if (Symbols[I].Binding.value != ELF::STB_LOCAL) + return I; + return Symbols.size(); +} + +static uint64_t writeContent(raw_ostream &OS, + const Optional &Content, + const Optional &Size) { + size_t ContentSize = 0; + if (Content) { + Content->writeAsBinary(OS); + ContentSize = Content->binary_size(); + } + + if (!Size) + return ContentSize; + + OS.write_zeros(*Size - ContentSize); + return *Size; +} + +template +std::vector +ELFState::toELFSymbols(ArrayRef Symbols, + const StringTableBuilder &Strtab) { + std::vector Ret; + Ret.resize(Symbols.size() + 1); + + size_t I = 0; + for (const auto &Sym : Symbols) { + Elf_Sym &Symbol = Ret[++I]; + + // If NameIndex, which contains the name offset, is explicitly specified, we + // use it. This is useful for preparing broken objects. Otherwise, we add + // the specified Name to the string table builder to get its offset. 
+ if (Sym.NameIndex) + Symbol.st_name = *Sym.NameIndex; + else if (!Sym.Name.empty()) + Symbol.st_name = Strtab.getOffset(ELFYAML::dropUniqueSuffix(Sym.Name)); + + Symbol.setBindingAndType(Sym.Binding, Sym.Type); + if (!Sym.Section.empty()) + Symbol.st_shndx = toSectionIndex(Sym.Section, "", Sym.Name); + else if (Sym.Index) + Symbol.st_shndx = *Sym.Index; + + Symbol.st_value = Sym.Value; + Symbol.st_other = Sym.Other ? *Sym.Other : 0; + Symbol.st_size = Sym.Size; + } + + return Ret; +} + +template +void ELFState::initSymtabSectionHeader(Elf_Shdr &SHeader, + SymtabType STType, + ContiguousBlobAccumulator &CBA, + ELFYAML::Section *YAMLSec) { + + bool IsStatic = STType == SymtabType::Static; + ArrayRef Symbols; + if (IsStatic && Doc.Symbols) + Symbols = *Doc.Symbols; + else if (!IsStatic) + Symbols = Doc.DynamicSymbols; + + ELFYAML::RawContentSection *RawSec = + dyn_cast_or_null(YAMLSec); + if (RawSec && !Symbols.empty() && (RawSec->Content || RawSec->Size)) { + if (RawSec->Content) + reportError("cannot specify both `Content` and " + + (IsStatic ? Twine("`Symbols`") : Twine("`DynamicSymbols`")) + + " for symbol table section '" + RawSec->Name + "'"); + if (RawSec->Size) + reportError("cannot specify both `Size` and " + + (IsStatic ? Twine("`Symbols`") : Twine("`DynamicSymbols`")) + + " for symbol table section '" + RawSec->Name + "'"); + return; + } + + zero(SHeader); + SHeader.sh_name = DotShStrtab.getOffset(IsStatic ? ".symtab" : ".dynsym"); + + if (YAMLSec) + SHeader.sh_type = YAMLSec->Type; + else + SHeader.sh_type = IsStatic ? ELF::SHT_SYMTAB : ELF::SHT_DYNSYM; + + if (RawSec && !RawSec->Link.empty()) { + // If the Link field is explicitly defined in the document, + // we should use it. + SHeader.sh_link = toSectionIndex(RawSec->Link, RawSec->Name); + } else { + // When we describe the .dynsym section in the document explicitly, it is + // allowed to omit the "DynamicSymbols" tag. In this case .dynstr is not + // added implicitly and we should be able to leave the Link zeroed if + // .dynstr is not defined. + unsigned Link = 0; + if (IsStatic) + Link = SN2I.get(".strtab"); + else + SN2I.lookup(".dynstr", Link); + SHeader.sh_link = Link; + } + + if (YAMLSec && YAMLSec->Flags) + SHeader.sh_flags = *YAMLSec->Flags; + else if (!IsStatic) + SHeader.sh_flags = ELF::SHF_ALLOC; + + // If the symbol table section is explicitly described in the YAML + // then we should set the fields requested. + SHeader.sh_info = (RawSec && RawSec->Info) ? (unsigned)(*RawSec->Info) + : findFirstNonGlobal(Symbols) + 1; + SHeader.sh_entsize = (YAMLSec && YAMLSec->EntSize) + ? (uint64_t)(*YAMLSec->EntSize) + : sizeof(Elf_Sym); + SHeader.sh_addralign = YAMLSec ? (uint64_t)YAMLSec->AddressAlign : 8; + SHeader.sh_addr = YAMLSec ? (uint64_t)YAMLSec->Address : 0; + + auto &OS = CBA.getOSAndAlignedOffset(SHeader.sh_offset, SHeader.sh_addralign); + if (RawSec && (RawSec->Content || RawSec->Size)) { + assert(Symbols.empty()); + SHeader.sh_size = writeContent(OS, RawSec->Content, RawSec->Size); + return; + } + + std::vector Syms = + toELFSymbols(Symbols, IsStatic ? DotStrtab : DotDynstr); + writeArrayData(OS, makeArrayRef(Syms)); + SHeader.sh_size = arrayDataSize(makeArrayRef(Syms)); +} + +template +void ELFState::initStrtabSectionHeader(Elf_Shdr &SHeader, StringRef Name, + StringTableBuilder &STB, + ContiguousBlobAccumulator &CBA, + ELFYAML::Section *YAMLSec) { + zero(SHeader); + SHeader.sh_name = DotShStrtab.getOffset(Name); + SHeader.sh_type = YAMLSec ? YAMLSec->Type : ELF::SHT_STRTAB; + SHeader.sh_addralign = YAMLSec ? 
(uint64_t)YAMLSec->AddressAlign : 1; + + ELFYAML::RawContentSection *RawSec = + dyn_cast_or_null(YAMLSec); + + auto &OS = CBA.getOSAndAlignedOffset(SHeader.sh_offset, SHeader.sh_addralign); + if (RawSec && (RawSec->Content || RawSec->Size)) { + SHeader.sh_size = writeContent(OS, RawSec->Content, RawSec->Size); + } else { + STB.write(OS); + SHeader.sh_size = STB.getSize(); + } + + if (YAMLSec && YAMLSec->EntSize) + SHeader.sh_entsize = *YAMLSec->EntSize; + + if (RawSec && RawSec->Info) + SHeader.sh_info = *RawSec->Info; + + if (YAMLSec && YAMLSec->Flags) + SHeader.sh_flags = *YAMLSec->Flags; + else if (Name == ".dynstr") + SHeader.sh_flags = ELF::SHF_ALLOC; + + // If the section is explicitly described in the YAML + // then we want to use its section address. + if (YAMLSec) + SHeader.sh_addr = YAMLSec->Address; +} + +template void ELFState::reportError(const Twine &Msg) { + ErrHandler(Msg); + HasError = true; +} + +template +void ELFState::setProgramHeaderLayout(std::vector &PHeaders, + std::vector &SHeaders) { + uint32_t PhdrIdx = 0; + for (auto &YamlPhdr : Doc.ProgramHeaders) { + Elf_Phdr &PHeader = PHeaders[PhdrIdx++]; + + std::vector Sections; + for (const ELFYAML::SectionName &SecName : YamlPhdr.Sections) { + unsigned Index; + if (!SN2I.lookup(SecName.Section, Index)) { + reportError("unknown section referenced: '" + SecName.Section + + "' by program header"); + continue; + } + Sections.push_back(&SHeaders[Index]); + } + + if (YamlPhdr.Offset) { + PHeader.p_offset = *YamlPhdr.Offset; + } else { + if (YamlPhdr.Sections.size()) + PHeader.p_offset = UINT32_MAX; + else + PHeader.p_offset = 0; + + // Find the minimum offset for the program header. + for (Elf_Shdr *SHeader : Sections) + PHeader.p_offset = std::min(PHeader.p_offset, SHeader->sh_offset); + } + + // Find the maximum offset of the end of a section in order to set p_filesz + // and p_memsz. When setting p_filesz, trailing SHT_NOBITS sections are not + // counted. + uint64_t FileOffset = PHeader.p_offset, MemOffset = PHeader.p_offset; + for (Elf_Shdr *SHeader : Sections) { + uint64_t End = SHeader->sh_offset + SHeader->sh_size; + MemOffset = std::max(MemOffset, End); + + if (SHeader->sh_type != llvm::ELF::SHT_NOBITS) + FileOffset = std::max(FileOffset, End); + } + + // Set the file size and the memory size if not set explicitly. + PHeader.p_filesz = YamlPhdr.FileSize ? uint64_t(*YamlPhdr.FileSize) + : FileOffset - PHeader.p_offset; + PHeader.p_memsz = YamlPhdr.MemSize ? uint64_t(*YamlPhdr.MemSize) + : MemOffset - PHeader.p_offset; + + if (YamlPhdr.Align) { + PHeader.p_align = *YamlPhdr.Align; + } else { + // Set the alignment of the segment to be the maximum alignment of the + // sections so that by default the segment has a valid and sensible + // alignment. 
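Tying together the segment fields computed above and just below: p_filesz and p_memsz are taken from the furthest section end inside the segment, with SHT_NOBITS sections such as .bss contributing only to the memory size, and the default p_align is the largest section alignment. A simplified sketch with made-up numbers, not part of the patch:

#include <algorithm>
#include <cassert>
#include <cstdint>

// A stripped-down section record: file offset, size, alignment, and
// whether it occupies file space (SHT_NOBITS sections such as .bss do not).
struct Sec {
  uint64_t Offset, Size, Addralign;
  bool Nobits;
};

int main() {
  const uint64_t SegOffset = 0x1000;
  const Sec Sections[] = {
      {0x1000, 0x200, 8, false}, // .data: ends at 0x1200 in the file
      {0x1200, 0x300, 16, true}, // .bss:  occupies memory only
  };

  uint64_t FileEnd = SegOffset, MemEnd = SegOffset, MaxAlign = 1;
  for (const Sec &S : Sections) {
    uint64_t End = S.Offset + S.Size;
    MemEnd = std::max(MemEnd, End);
    if (!S.Nobits) // NOBITS sections do not extend the file size
      FileEnd = std::max(FileEnd, End);
    MaxAlign = std::max(MaxAlign, S.Addralign);
  }

  assert(FileEnd - SegOffset == 0x200); // p_filesz
  assert(MemEnd - SegOffset == 0x500);  // p_memsz
  assert(MaxAlign == 16);               // default p_align
  return 0;
}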
+ PHeader.p_align = 1; + for (Elf_Shdr *SHeader : Sections) + PHeader.p_align = std::max(PHeader.p_align, SHeader->sh_addralign); + } + } +} + +template +void ELFState::writeSectionContent( + Elf_Shdr &SHeader, const ELFYAML::RawContentSection &Section, + ContiguousBlobAccumulator &CBA) { + raw_ostream &OS = + CBA.getOSAndAlignedOffset(SHeader.sh_offset, SHeader.sh_addralign); + SHeader.sh_size = writeContent(OS, Section.Content, Section.Size); + + if (Section.EntSize) + SHeader.sh_entsize = *Section.EntSize; + else if (Section.Type == llvm::ELF::SHT_RELR) + SHeader.sh_entsize = sizeof(Elf_Relr); + else + SHeader.sh_entsize = 0; + + if (Section.Info) + SHeader.sh_info = *Section.Info; +} + +static bool isMips64EL(const ELFYAML::Object &Doc) { + return Doc.Header.Machine == ELFYAML::ELF_EM(llvm::ELF::EM_MIPS) && + Doc.Header.Class == ELFYAML::ELF_ELFCLASS(ELF::ELFCLASS64) && + Doc.Header.Data == ELFYAML::ELF_ELFDATA(ELF::ELFDATA2LSB); +} + +template +void ELFState::writeSectionContent( + Elf_Shdr &SHeader, const ELFYAML::RelocationSection &Section, + ContiguousBlobAccumulator &CBA) { + assert((Section.Type == llvm::ELF::SHT_REL || + Section.Type == llvm::ELF::SHT_RELA) && + "Section type is not SHT_REL nor SHT_RELA"); + + bool IsRela = Section.Type == llvm::ELF::SHT_RELA; + SHeader.sh_entsize = IsRela ? sizeof(Elf_Rela) : sizeof(Elf_Rel); + SHeader.sh_size = SHeader.sh_entsize * Section.Relocations.size(); + + // For relocation section set link to .symtab by default. + if (Section.Link.empty()) + SHeader.sh_link = SN2I.get(".symtab"); + + if (!Section.RelocatableSec.empty()) + SHeader.sh_info = toSectionIndex(Section.RelocatableSec, Section.Name); + + auto &OS = CBA.getOSAndAlignedOffset(SHeader.sh_offset, SHeader.sh_addralign); + for (const auto &Rel : Section.Relocations) { + unsigned SymIdx = Rel.Symbol ? toSymbolIndex(*Rel.Symbol, Section.Name, + Section.Link == ".dynsym") + : 0; + if (IsRela) { + Elf_Rela REntry; + zero(REntry); + REntry.r_offset = Rel.Offset; + REntry.r_addend = Rel.Addend; + REntry.setSymbolAndType(SymIdx, Rel.Type, isMips64EL(Doc)); + OS.write((const char *)&REntry, sizeof(REntry)); + } else { + Elf_Rel REntry; + zero(REntry); + REntry.r_offset = Rel.Offset; + REntry.setSymbolAndType(SymIdx, Rel.Type, isMips64EL(Doc)); + OS.write((const char *)&REntry, sizeof(REntry)); + } + } +} + +template +void ELFState::writeSectionContent( + Elf_Shdr &SHeader, const ELFYAML::SymtabShndxSection &Shndx, + ContiguousBlobAccumulator &CBA) { + raw_ostream &OS = + CBA.getOSAndAlignedOffset(SHeader.sh_offset, SHeader.sh_addralign); + + for (uint32_t E : Shndx.Entries) + support::endian::write(OS, E, ELFT::TargetEndianness); + + SHeader.sh_entsize = Shndx.EntSize ? 
(uint64_t)*Shndx.EntSize : 4; + SHeader.sh_size = Shndx.Entries.size() * SHeader.sh_entsize; +} + +template +void ELFState::writeSectionContent(Elf_Shdr &SHeader, + const ELFYAML::Group &Section, + ContiguousBlobAccumulator &CBA) { + assert(Section.Type == llvm::ELF::SHT_GROUP && + "Section type is not SHT_GROUP"); + + SHeader.sh_entsize = 4; + SHeader.sh_size = SHeader.sh_entsize * Section.Members.size(); + SHeader.sh_info = + toSymbolIndex(Section.Signature, Section.Name, /*IsDynamic=*/false); + + raw_ostream &OS = + CBA.getOSAndAlignedOffset(SHeader.sh_offset, SHeader.sh_addralign); + + for (const ELFYAML::SectionOrType &Member : Section.Members) { + unsigned int SectionIndex = 0; + if (Member.sectionNameOrType == "GRP_COMDAT") + SectionIndex = llvm::ELF::GRP_COMDAT; + else + SectionIndex = toSectionIndex(Member.sectionNameOrType, Section.Name); + support::endian::write(OS, SectionIndex, ELFT::TargetEndianness); + } +} + +template +void ELFState::writeSectionContent(Elf_Shdr &SHeader, + const ELFYAML::SymverSection &Section, + ContiguousBlobAccumulator &CBA) { + raw_ostream &OS = + CBA.getOSAndAlignedOffset(SHeader.sh_offset, SHeader.sh_addralign); + for (uint16_t Version : Section.Entries) + support::endian::write(OS, Version, ELFT::TargetEndianness); + + SHeader.sh_entsize = Section.EntSize ? (uint64_t)*Section.EntSize : 2; + SHeader.sh_size = Section.Entries.size() * SHeader.sh_entsize; +} + +template +void ELFState::writeSectionContent( + Elf_Shdr &SHeader, const ELFYAML::StackSizesSection &Section, + ContiguousBlobAccumulator &CBA) { + using uintX_t = typename ELFT::uint; + raw_ostream &OS = + CBA.getOSAndAlignedOffset(SHeader.sh_offset, SHeader.sh_addralign); + + if (Section.Content || Section.Size) { + SHeader.sh_size = writeContent(OS, Section.Content, Section.Size); + return; + } + + for (const ELFYAML::StackSizeEntry &E : *Section.Entries) { + support::endian::write(OS, E.Address, ELFT::TargetEndianness); + SHeader.sh_size += sizeof(uintX_t) + encodeULEB128(E.Size, OS); + } +} + +template +void ELFState::writeSectionContent(Elf_Shdr &SHeader, + const ELFYAML::HashSection &Section, + ContiguousBlobAccumulator &CBA) { + raw_ostream &OS = + CBA.getOSAndAlignedOffset(SHeader.sh_offset, SHeader.sh_addralign); + + unsigned Link = 0; + if (Section.Link.empty() && SN2I.lookup(".dynsym", Link)) + SHeader.sh_link = Link; + + if (Section.Content || Section.Size) { + SHeader.sh_size = writeContent(OS, Section.Content, Section.Size); + return; + } + + support::endian::write(OS, Section.Bucket->size(), + ELFT::TargetEndianness); + support::endian::write(OS, Section.Chain->size(), + ELFT::TargetEndianness); + for (uint32_t Val : *Section.Bucket) + support::endian::write(OS, Val, ELFT::TargetEndianness); + for (uint32_t Val : *Section.Chain) + support::endian::write(OS, Val, ELFT::TargetEndianness); + + SHeader.sh_size = (2 + Section.Bucket->size() + Section.Chain->size()) * 4; +} + +template +void ELFState::writeSectionContent(Elf_Shdr &SHeader, + const ELFYAML::VerdefSection &Section, + ContiguousBlobAccumulator &CBA) { + typedef typename ELFT::Verdef Elf_Verdef; + typedef typename ELFT::Verdaux Elf_Verdaux; + raw_ostream &OS = + CBA.getOSAndAlignedOffset(SHeader.sh_offset, SHeader.sh_addralign); + + uint64_t AuxCnt = 0; + for (size_t I = 0; I < Section.Entries.size(); ++I) { + const ELFYAML::VerdefEntry &E = Section.Entries[I]; + + Elf_Verdef VerDef; + VerDef.vd_version = E.Version; + VerDef.vd_flags = E.Flags; + VerDef.vd_ndx = E.VersionNdx; + VerDef.vd_hash = E.Hash; + VerDef.vd_aux = 
sizeof(Elf_Verdef); + VerDef.vd_cnt = E.VerNames.size(); + if (I == Section.Entries.size() - 1) + VerDef.vd_next = 0; + else + VerDef.vd_next = + sizeof(Elf_Verdef) + E.VerNames.size() * sizeof(Elf_Verdaux); + OS.write((const char *)&VerDef, sizeof(Elf_Verdef)); + + for (size_t J = 0; J < E.VerNames.size(); ++J, ++AuxCnt) { + Elf_Verdaux VernAux; + VernAux.vda_name = DotDynstr.getOffset(E.VerNames[J]); + if (J == E.VerNames.size() - 1) + VernAux.vda_next = 0; + else + VernAux.vda_next = sizeof(Elf_Verdaux); + OS.write((const char *)&VernAux, sizeof(Elf_Verdaux)); + } + } + + SHeader.sh_size = Section.Entries.size() * sizeof(Elf_Verdef) + + AuxCnt * sizeof(Elf_Verdaux); + SHeader.sh_info = Section.Info; +} + +template +void ELFState::writeSectionContent(Elf_Shdr &SHeader, + const ELFYAML::VerneedSection &Section, + ContiguousBlobAccumulator &CBA) { + typedef typename ELFT::Verneed Elf_Verneed; + typedef typename ELFT::Vernaux Elf_Vernaux; + + auto &OS = CBA.getOSAndAlignedOffset(SHeader.sh_offset, SHeader.sh_addralign); + + uint64_t AuxCnt = 0; + for (size_t I = 0; I < Section.VerneedV.size(); ++I) { + const ELFYAML::VerneedEntry &VE = Section.VerneedV[I]; + + Elf_Verneed VerNeed; + VerNeed.vn_version = VE.Version; + VerNeed.vn_file = DotDynstr.getOffset(VE.File); + if (I == Section.VerneedV.size() - 1) + VerNeed.vn_next = 0; + else + VerNeed.vn_next = + sizeof(Elf_Verneed) + VE.AuxV.size() * sizeof(Elf_Vernaux); + VerNeed.vn_cnt = VE.AuxV.size(); + VerNeed.vn_aux = sizeof(Elf_Verneed); + OS.write((const char *)&VerNeed, sizeof(Elf_Verneed)); + + for (size_t J = 0; J < VE.AuxV.size(); ++J, ++AuxCnt) { + const ELFYAML::VernauxEntry &VAuxE = VE.AuxV[J]; + + Elf_Vernaux VernAux; + VernAux.vna_hash = VAuxE.Hash; + VernAux.vna_flags = VAuxE.Flags; + VernAux.vna_other = VAuxE.Other; + VernAux.vna_name = DotDynstr.getOffset(VAuxE.Name); + if (J == VE.AuxV.size() - 1) + VernAux.vna_next = 0; + else + VernAux.vna_next = sizeof(Elf_Vernaux); + OS.write((const char *)&VernAux, sizeof(Elf_Vernaux)); + } + } + + SHeader.sh_size = Section.VerneedV.size() * sizeof(Elf_Verneed) + + AuxCnt * sizeof(Elf_Vernaux); + SHeader.sh_info = Section.Info; +} + +template +void ELFState::writeSectionContent(Elf_Shdr &SHeader, + const ELFYAML::MipsABIFlags &Section, + ContiguousBlobAccumulator &CBA) { + assert(Section.Type == llvm::ELF::SHT_MIPS_ABIFLAGS && + "Section type is not SHT_MIPS_ABIFLAGS"); + + object::Elf_Mips_ABIFlags Flags; + zero(Flags); + SHeader.sh_entsize = sizeof(Flags); + SHeader.sh_size = SHeader.sh_entsize; + + auto &OS = CBA.getOSAndAlignedOffset(SHeader.sh_offset, SHeader.sh_addralign); + Flags.version = Section.Version; + Flags.isa_level = Section.ISALevel; + Flags.isa_rev = Section.ISARevision; + Flags.gpr_size = Section.GPRSize; + Flags.cpr1_size = Section.CPR1Size; + Flags.cpr2_size = Section.CPR2Size; + Flags.fp_abi = Section.FpABI; + Flags.isa_ext = Section.ISAExtension; + Flags.ases = Section.ASEs; + Flags.flags1 = Section.Flags1; + Flags.flags2 = Section.Flags2; + OS.write((const char *)&Flags, sizeof(Flags)); +} + +template +void ELFState::writeSectionContent(Elf_Shdr &SHeader, + const ELFYAML::DynamicSection &Section, + ContiguousBlobAccumulator &CBA) { + typedef typename ELFT::uint uintX_t; + + assert(Section.Type == llvm::ELF::SHT_DYNAMIC && + "Section type is not SHT_DYNAMIC"); + + if (!Section.Entries.empty() && Section.Content) + reportError("cannot specify both raw content and explicit entries " + "for dynamic section '" + + Section.Name + "'"); + + if (Section.Content) + 
SHeader.sh_size = Section.Content->binary_size(); + else + SHeader.sh_size = 2 * sizeof(uintX_t) * Section.Entries.size(); + if (Section.EntSize) + SHeader.sh_entsize = *Section.EntSize; + else + SHeader.sh_entsize = sizeof(Elf_Dyn); + + raw_ostream &OS = + CBA.getOSAndAlignedOffset(SHeader.sh_offset, SHeader.sh_addralign); + for (const ELFYAML::DynamicEntry &DE : Section.Entries) { + support::endian::write(OS, DE.Tag, ELFT::TargetEndianness); + support::endian::write(OS, DE.Val, ELFT::TargetEndianness); + } + if (Section.Content) + Section.Content->writeAsBinary(OS); +} + +template +void ELFState::writeSectionContent(Elf_Shdr &SHeader, + const ELFYAML::AddrsigSection &Section, + ContiguousBlobAccumulator &CBA) { + raw_ostream &OS = + CBA.getOSAndAlignedOffset(SHeader.sh_offset, SHeader.sh_addralign); + + unsigned Link = 0; + if (Section.Link.empty() && SN2I.lookup(".symtab", Link)) + SHeader.sh_link = Link; + + if (Section.Content || Section.Size) { + SHeader.sh_size = writeContent(OS, Section.Content, Section.Size); + return; + } + + for (const ELFYAML::AddrsigSymbol &Sym : *Section.Symbols) { + uint64_t Val = + Sym.Name ? toSymbolIndex(*Sym.Name, Section.Name, /*IsDynamic=*/false) + : (uint32_t)*Sym.Index; + SHeader.sh_size += encodeULEB128(Val, OS); + } +} + +template void ELFState::buildSectionIndex() { + for (unsigned I = 0, E = Doc.Sections.size(); I != E; ++I) { + StringRef Name = Doc.Sections[I]->Name; + if (Name.empty()) + continue; + + DotShStrtab.add(ELFYAML::dropUniqueSuffix(Name)); + if (!SN2I.addName(Name, I)) + reportError("repeated section name: '" + Name + + "' at YAML section number " + Twine(I)); + } + + DotShStrtab.finalize(); +} + +template void ELFState::buildSymbolIndexes() { + auto Build = [this](ArrayRef V, NameToIdxMap &Map) { + for (size_t I = 0, S = V.size(); I < S; ++I) { + const ELFYAML::Symbol &Sym = V[I]; + if (!Sym.Name.empty() && !Map.addName(Sym.Name, I + 1)) + reportError("repeated symbol name: '" + Sym.Name + "'"); + } + }; + + if (Doc.Symbols) + Build(*Doc.Symbols, SymN2I); + Build(Doc.DynamicSymbols, DynSymN2I); +} + +template void ELFState::finalizeStrings() { + // Add the regular symbol names to .strtab section. + if (Doc.Symbols) + for (const ELFYAML::Symbol &Sym : *Doc.Symbols) + DotStrtab.add(ELFYAML::dropUniqueSuffix(Sym.Name)); + DotStrtab.finalize(); + + // Add the dynamic symbol names to .dynstr section. + for (const ELFYAML::Symbol &Sym : Doc.DynamicSymbols) + DotDynstr.add(ELFYAML::dropUniqueSuffix(Sym.Name)); + + // SHT_GNU_verdef and SHT_GNU_verneed sections might also + // add strings to .dynstr section. + for (const std::unique_ptr &Sec : Doc.Sections) { + if (auto VerNeed = dyn_cast(Sec.get())) { + for (const ELFYAML::VerneedEntry &VE : VerNeed->VerneedV) { + DotDynstr.add(VE.File); + for (const ELFYAML::VernauxEntry &Aux : VE.AuxV) + DotDynstr.add(Aux.Name); + } + } else if (auto VerDef = dyn_cast(Sec.get())) { + for (const ELFYAML::VerdefEntry &E : VerDef->Entries) + for (StringRef Name : E.VerNames) + DotDynstr.add(Name); + } + } + + DotDynstr.finalize(); +} + +template +bool ELFState::writeELF(raw_ostream &OS, ELFYAML::Object &Doc, + yaml::ErrorHandler EH) { + ELFState State(Doc, EH); + + // Finalize .strtab and .dynstr sections. We do that early because want to + // finalize the string table builders before writing the content of the + // sections that might want to use them. 
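+ // (StringTableBuilder only hands out valid offsets after finalize(), so the
+ // symbol and string table writers below can call getOffset() safely.)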
+ State.finalizeStrings(); + + State.buildSectionIndex(); + State.buildSymbolIndexes(); + + std::vector PHeaders; + State.initProgramHeaders(PHeaders); + + // XXX: This offset is tightly coupled with the order that we write + // things to `OS`. + const size_t SectionContentBeginOffset = + sizeof(Elf_Ehdr) + sizeof(Elf_Phdr) * Doc.ProgramHeaders.size(); + ContiguousBlobAccumulator CBA(SectionContentBeginOffset); + + std::vector SHeaders; + State.initSectionHeaders(SHeaders, CBA); + + // Now we can decide segment offsets + State.setProgramHeaderLayout(PHeaders, SHeaders); + + if (State.HasError) + return false; + + State.writeELFHeader(CBA, OS); + writeArrayData(OS, makeArrayRef(PHeaders)); + CBA.writeBlobToStream(OS); + writeArrayData(OS, makeArrayRef(SHeaders)); + return true; +} + +namespace llvm { +namespace yaml { + +bool yaml2elf(llvm::ELFYAML::Object &Doc, raw_ostream &Out, ErrorHandler EH) { + bool IsLE = Doc.Header.Data == ELFYAML::ELF_ELFDATA(ELF::ELFDATA2LSB); + bool Is64Bit = Doc.Header.Class == ELFYAML::ELF_ELFCLASS(ELF::ELFCLASS64); + if (Is64Bit) { + if (IsLE) + return ELFState::writeELF(Out, Doc, EH); + return ELFState::writeELF(Out, Doc, EH); + } + if (IsLE) + return ELFState::writeELF(Out, Doc, EH); + return ELFState::writeELF(Out, Doc, EH); +} + +} // namespace yaml +} // namespace llvm diff --git a/lib/ObjectYAML/ELFYAML.cpp b/lib/ObjectYAML/ELFYAML.cpp index 7497154c757d..29585abe6e80 100644 --- a/lib/ObjectYAML/ELFYAML.cpp +++ b/lib/ObjectYAML/ELFYAML.cpp @@ -11,12 +11,14 @@ //===----------------------------------------------------------------------===// #include "llvm/ObjectYAML/ELFYAML.h" +#include "llvm/ADT/MapVector.h" #include "llvm/ADT/StringRef.h" #include "llvm/BinaryFormat/ELF.h" #include "llvm/Support/Casting.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MipsABIFlags.h" #include "llvm/Support/YAMLTraits.h" +#include "llvm/Support/WithColor.h" #include #include @@ -50,6 +52,8 @@ void ScalarEnumerationTraits::enumeration( ECase(PT_PHDR); ECase(PT_TLS); ECase(PT_GNU_EH_FRAME); + ECase(PT_GNU_STACK); + ECase(PT_GNU_RELRO); #undef ECase IO.enumFallback(Value); } @@ -217,6 +221,7 @@ void ScalarEnumerationTraits::enumeration( ECase(EM_LANAI); ECase(EM_BPF); #undef ECase + IO.enumFallback(Value); } void ScalarEnumerationTraits::enumeration( @@ -459,6 +464,9 @@ void ScalarEnumerationTraits::enumeration( ECase(SHT_LLVM_CALL_GRAPH_PROFILE); ECase(SHT_LLVM_ADDRSIG); ECase(SHT_LLVM_DEPENDENT_LIBRARIES); + ECase(SHT_LLVM_SYMPART); + ECase(SHT_LLVM_PART_EHDR); + ECase(SHT_LLVM_PART_PHDR); ECase(SHT_GNU_ATTRIBUTES); ECase(SHT_GNU_HASH); ECase(SHT_GNU_verdef); @@ -563,7 +571,7 @@ void ScalarEnumerationTraits::enumeration( ECase(SHN_HEXAGON_SCOMMON_4); ECase(SHN_HEXAGON_SCOMMON_8); #undef ECase - IO.enumFallback(Value); + IO.enumFallback(Value); } void ScalarEnumerationTraits::enumeration( @@ -592,34 +600,6 @@ void ScalarEnumerationTraits::enumeration( IO.enumFallback(Value); } -void ScalarEnumerationTraits::enumeration( - IO &IO, ELFYAML::ELF_STV &Value) { -#define ECase(X) IO.enumCase(Value, #X, ELF::X) - ECase(STV_DEFAULT); - ECase(STV_INTERNAL); - ECase(STV_HIDDEN); - ECase(STV_PROTECTED); -#undef ECase -} - -void ScalarBitSetTraits::bitset(IO &IO, - ELFYAML::ELF_STO &Value) { - const auto *Object = static_cast(IO.getContext()); - assert(Object && "The IO context is not initialized"); -#define BCase(X) IO.bitSetCase(Value, #X, ELF::X) - switch (Object->Header.Machine) { - case ELF::EM_MIPS: - BCase(STO_MIPS_OPTIONAL); - BCase(STO_MIPS_PLT); - 
BCase(STO_MIPS_PIC); - BCase(STO_MIPS_MICROMIPS); - break; - default: - break; // Nothing to do - } -#undef BCase -#undef BCaseMask -} void ScalarEnumerationTraits::enumeration( IO &IO, ELFYAML::ELF_RSS &Value) { @@ -671,8 +651,12 @@ void ScalarEnumerationTraits::enumeration( case ELF::EM_BPF: #include "llvm/BinaryFormat/ELFRelocs/BPF.def" break; + case ELF::EM_PPC64: +#include "llvm/BinaryFormat/ELFRelocs/PowerPC64.def" + break; default: - llvm_unreachable("Unsupported architecture"); + // Nothing to do. + break; } #undef ELF_RELOC IO.enumFallback(Value); @@ -845,7 +829,7 @@ void MappingTraits::mapping(IO &IO, IO.mapOptional("Entry", FileHdr.Entry, Hex64(0)); IO.mapOptional("SHEntSize", FileHdr.SHEntSize); - IO.mapOptional("SHOffset", FileHdr.SHOffset); + IO.mapOptional("SHOff", FileHdr.SHOff); IO.mapOptional("SHNum", FileHdr.SHNum); IO.mapOptional("SHStrNdx", FileHdr.SHStrNdx); } @@ -863,18 +847,111 @@ void MappingTraits::mapping( IO.mapOptional("Offset", Phdr.Offset); } +LLVM_YAML_STRONG_TYPEDEF(StringRef, StOtherPiece) + +template <> struct ScalarTraits { + static void output(const StOtherPiece &Val, void *, raw_ostream &Out) { + Out << Val; + } + static StringRef input(StringRef Scalar, void *, StOtherPiece &Val) { + Val = Scalar; + return {}; + } + static QuotingType mustQuote(StringRef) { return QuotingType::None; } +}; +template <> struct SequenceElementTraits { + static const bool flow = true; +}; + namespace { struct NormalizedOther { - NormalizedOther(IO &) - : Visibility(ELFYAML::ELF_STV(0)), Other(ELFYAML::ELF_STO(0)) {} - NormalizedOther(IO &, uint8_t Original) - : Visibility(Original & 0x3), Other(Original & ~0x3) {} + NormalizedOther(IO &IO) : YamlIO(IO) {} + NormalizedOther(IO &IO, Optional Original) : YamlIO(IO) { + assert(Original && "This constructor is only used for outputting YAML and " + "assumes a non-empty Original"); + std::vector Ret; + const auto *Object = static_cast(YamlIO.getContext()); + for (std::pair &P : + getFlags(Object->Header.Machine).takeVector()) { + uint8_t FlagValue = P.second; + if ((*Original & FlagValue) != FlagValue) + continue; + *Original &= ~FlagValue; + Ret.push_back({P.first}); + } + + if (*Original != 0) { + UnknownFlagsHolder = std::to_string(*Original); + Ret.push_back({UnknownFlagsHolder}); + } + + if (!Ret.empty()) + Other = std::move(Ret); + } + + uint8_t toValue(StringRef Name) { + const auto *Object = static_cast(YamlIO.getContext()); + MapVector Flags = getFlags(Object->Header.Machine); - uint8_t denormalize(IO &) { return Visibility | Other; } + auto It = Flags.find(Name); + if (It != Flags.end()) + return It->second; + + uint8_t Val; + if (to_integer(Name, Val)) + return Val; + + YamlIO.setError("an unknown value is used for symbol's 'Other' field: " + + Name); + return 0; + } - ELFYAML::ELF_STV Visibility; - ELFYAML::ELF_STO Other; + Optional denormalize(IO &) { + if (!Other) + return None; + uint8_t Ret = 0; + for (StOtherPiece &Val : *Other) + Ret |= toValue(Val); + return Ret; + } + + // st_other field is used to encode symbol visibility and platform-dependent + // flags and values. This method returns a name to value map that is used for + // parsing and encoding this field. + MapVector getFlags(unsigned EMachine) { + MapVector Map; + // STV_* values are just enumeration values. 
We add them in a reversed order + // because when we convert the st_other to named constants when printing + // YAML we want to use a maximum number of bits on each step: + // when we have st_other == 3, we want to print it as STV_PROTECTED (3), but + // not as STV_HIDDEN (2) + STV_INTERNAL (1). + Map["STV_PROTECTED"] = ELF::STV_PROTECTED; + Map["STV_HIDDEN"] = ELF::STV_HIDDEN; + Map["STV_INTERNAL"] = ELF::STV_INTERNAL; + // STV_DEFAULT is used to represent the default visibility and has a value + // 0. We want to be able to read it from YAML documents, but there is no + // reason to print it. + if (!YamlIO.outputting()) + Map["STV_DEFAULT"] = ELF::STV_DEFAULT; + + // MIPS is not consistent. All of the STO_MIPS_* values are bit flags, + // except STO_MIPS_MIPS16 which overlaps them. It should be checked and + // consumed first when we print the output, because we do not want to print + // any other flags that have the same bits instead. + if (EMachine == ELF::EM_MIPS) { + Map["STO_MIPS_MIPS16"] = ELF::STO_MIPS_MIPS16; + Map["STO_MIPS_MICROMIPS"] = ELF::STO_MIPS_MICROMIPS; + Map["STO_MIPS_PIC"] = ELF::STO_MIPS_PIC; + Map["STO_MIPS_PLT"] = ELF::STO_MIPS_PLT; + Map["STO_MIPS_OPTIONAL"] = ELF::STO_MIPS_OPTIONAL; + } + return Map; + } + + IO &YamlIO; + Optional> Other; + std::string UnknownFlagsHolder; }; } // end anonymous namespace @@ -888,17 +965,21 @@ void MappingTraits::mapping(IO &IO, ELFYAML::Symbol &Symbol) { IO.mapOptional("Binding", Symbol.Binding, ELFYAML::ELF_STB(0)); IO.mapOptional("Value", Symbol.Value, Hex64(0)); IO.mapOptional("Size", Symbol.Size, Hex64(0)); - MappingNormalization Keys(IO, Symbol.Other); - IO.mapOptional("Visibility", Keys->Visibility, ELFYAML::ELF_STV(0)); - IO.mapOptional("Other", Keys->Other, ELFYAML::ELF_STO(0)); + + // Symbol's Other field is a bit special. It is usually a field that + // represents st_other and holds the symbol visibility. However, on some + // platforms, it can contain bit fields and regular values, or even sometimes a + // crazy mix of them (see comments for NormalizedOther). Because of this, we + // need special handling. + MappingNormalization> Keys(IO, + Symbol.Other); + IO.mapOptional("Other", Keys->Other); } StringRef MappingTraits::validate(IO &IO, ELFYAML::Symbol &Symbol) { if (Symbol.Index && Symbol.Section.data()) return "Index and Section cannot both be specified for Symbol"; - if (Symbol.Index && *Symbol.Index == ELFYAML::ELF_SHN(ELF::SHN_XINDEX)) - return "Large indexes are not supported"; if (Symbol.NameIndex && !Symbol.Name.empty()) return "Name and NameIndex cannot both be specified for Symbol"; return StringRef(); @@ -914,10 +995,11 @@ static void commonSectionMapping(IO &IO, ELFYAML::Section &Section) { IO.mapOptional("EntSize", Section.EntSize); // obj2yaml does not dump these fields. They are expected to be empty when we - // are producing YAML, because yaml2obj sets appropriate values for sh_offset - // and sh_size automatically when they are not explicitly defined. + // are producing YAML, because yaml2obj sets appropriate values for them + // automatically when they are not explicitly defined. 
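As a usage sketch for the st_other handling introduced above and for the yaml2elf() entry point added earlier in this patch, the standalone driver below parses a minimal document straight into ELFYAML::Object and hands it to the writer. This bypasses the yaml2obj front end, and it assumes that yaml::ErrorHandler accepts a callable taking const Twine &; the keys and flag names (FileHeader, Symbols, Other, STV_PROTECTED, STO_MIPS_MICROMIPS) are the ones mapped in this file, and the MIPS flag is only accepted because Machine is EM_MIPS.

#include "llvm/ADT/Twine.h"
#include "llvm/ObjectYAML/ELFYAML.h"
#include "llvm/ObjectYAML/yaml2obj.h"
#include "llvm/Support/YAMLTraits.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

// A minimal relocatable MIPS object with one symbol whose st_other field is
// given as a list of named flags (parsed by NormalizedOther::denormalize).
static const char DocText[] = R"(--- !ELF
FileHeader:
  Class:   ELFCLASS64
  Data:    ELFDATA2LSB
  Type:    ET_REL
  Machine: EM_MIPS
Symbols:
  - Name:  foo
    Other: [ STV_PROTECTED, STO_MIPS_MICROMIPS ]
)";

int main() {
  ELFYAML::Object Doc;
  yaml::Input YIn(DocText);
  YIn >> Doc;                      // Parsed via MappingTraits<ELFYAML::Object>.
  if (YIn.error())
    return 1;

  // yaml2elf() dispatches on Class/Data to the matching ELFState instantiation.
  bool OK = yaml::yaml2elf(Doc, outs(), [](const Twine &Msg) {
    errs() << "yaml2elf: " << Msg << "\n";
  });
  return OK ? 0 : 1;
}

On output, NormalizedOther performs the reverse mapping: named constants are emitted for the bits they cover, and any leftover bits are printed as a plain integer.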
assert(!IO.outputting() || (!Section.ShOffset.hasValue() && !Section.ShSize.hasValue())); + IO.mapOptional("ShName", Section.ShName); IO.mapOptional("ShOffset", Section.ShOffset); IO.mapOptional("ShSize", Section.ShSize); } @@ -935,6 +1017,21 @@ static void sectionMapping(IO &IO, ELFYAML::RawContentSection &Section) { IO.mapOptional("Info", Section.Info); } +static void sectionMapping(IO &IO, ELFYAML::StackSizesSection &Section) { + commonSectionMapping(IO, Section); + IO.mapOptional("Content", Section.Content); + IO.mapOptional("Size", Section.Size); + IO.mapOptional("Entries", Section.Entries); +} + +static void sectionMapping(IO &IO, ELFYAML::HashSection &Section) { + commonSectionMapping(IO, Section); + IO.mapOptional("Content", Section.Content); + IO.mapOptional("Bucket", Section.Bucket); + IO.mapOptional("Chain", Section.Chain); + IO.mapOptional("Size", Section.Size); +} + static void sectionMapping(IO &IO, ELFYAML::NoBitsSection &Section) { commonSectionMapping(IO, Section); IO.mapOptional("Size", Section.Size, Hex64(0)); @@ -969,6 +1066,18 @@ static void groupSectionMapping(IO &IO, ELFYAML::Group &Group) { IO.mapRequired("Members", Group.Members); } +static void sectionMapping(IO &IO, ELFYAML::SymtabShndxSection &Section) { + commonSectionMapping(IO, Section); + IO.mapRequired("Entries", Section.Entries); +} + +static void sectionMapping(IO &IO, ELFYAML::AddrsigSection &Section) { + commonSectionMapping(IO, Section); + IO.mapOptional("Content", Section.Content); + IO.mapOptional("Size", Section.Size); + IO.mapOptional("Symbols", Section.Symbols); +} + void MappingTraits::mapping( IO &IO, ELFYAML::SectionOrType §ionOrType) { IO.mapRequired("SectionOrType", sectionOrType.sectionNameOrType); @@ -1029,6 +1138,11 @@ void MappingTraits>::mapping( Section.reset(new ELFYAML::NoBitsSection()); sectionMapping(IO, *cast(Section.get())); break; + case ELF::SHT_HASH: + if (!IO.outputting()) + Section.reset(new ELFYAML::HashSection()); + sectionMapping(IO, *cast(Section.get())); + break; case ELF::SHT_MIPS_ABIFLAGS: if (!IO.outputting()) Section.reset(new ELFYAML::MipsABIFlags()); @@ -1049,21 +1163,113 @@ void MappingTraits>::mapping( Section.reset(new ELFYAML::VerneedSection()); sectionMapping(IO, *cast(Section.get())); break; - default: + case ELF::SHT_SYMTAB_SHNDX: if (!IO.outputting()) - Section.reset(new ELFYAML::RawContentSection()); - sectionMapping(IO, *cast(Section.get())); + Section.reset(new ELFYAML::SymtabShndxSection()); + sectionMapping(IO, *cast(Section.get())); + break; + case ELF::SHT_LLVM_ADDRSIG: + if (!IO.outputting()) + Section.reset(new ELFYAML::AddrsigSection()); + sectionMapping(IO, *cast(Section.get())); + break; + default: + if (!IO.outputting()) { + StringRef Name; + IO.mapOptional("Name", Name, StringRef()); + Name = ELFYAML::dropUniqueSuffix(Name); + + if (ELFYAML::StackSizesSection::nameMatches(Name)) + Section = std::make_unique(); + else + Section = std::make_unique(); + } + + if (auto S = dyn_cast(Section.get())) + sectionMapping(IO, *S); + else + sectionMapping(IO, *cast(Section.get())); } } StringRef MappingTraits>::validate( IO &io, std::unique_ptr &Section) { - const auto *RawSection = dyn_cast(Section.get()); - if (!RawSection) + if (const auto *RawSection = + dyn_cast(Section.get())) { + if (RawSection->Size && RawSection->Content && + (uint64_t)(*RawSection->Size) < RawSection->Content->binary_size()) + return "Section size must be greater than or equal to the content size"; return {}; - if (RawSection->Size && RawSection->Content && - 
(uint64_t)(*RawSection->Size) < RawSection->Content->binary_size()) - return "Section size must be greater than or equal to the content size"; + } + + if (const auto *SS = dyn_cast(Section.get())) { + if (!SS->Entries && !SS->Content && !SS->Size) + return ".stack_sizes: one of Content, Entries and Size must be specified"; + + if (SS->Size && SS->Content && + (uint64_t)(*SS->Size) < SS->Content->binary_size()) + return ".stack_sizes: Size must be greater than or equal to the content " + "size"; + + // We accept Content, Size or both together when there are no Entries. + if (!SS->Entries) + return {}; + + if (SS->Size) + return ".stack_sizes: Size and Entries cannot be used together"; + if (SS->Content) + return ".stack_sizes: Content and Entries cannot be used together"; + return {}; + } + + if (const auto *HS = dyn_cast(Section.get())) { + if (!HS->Content && !HS->Bucket && !HS->Chain && !HS->Size) + return "one of \"Content\", \"Size\", \"Bucket\" or \"Chain\" must be " + "specified"; + + if (HS->Content || HS->Size) { + if (HS->Size && HS->Content && + (uint64_t)*HS->Size < HS->Content->binary_size()) + return "\"Size\" must be greater than or equal to the content " + "size"; + + if (HS->Bucket) + return "\"Bucket\" cannot be used with \"Content\" or \"Size\""; + if (HS->Chain) + return "\"Chain\" cannot be used with \"Content\" or \"Size\""; + return {}; + } + + if ((HS->Bucket && !HS->Chain) || (!HS->Bucket && HS->Chain)) + return "\"Bucket\" and \"Chain\" must be used together"; + return {}; + } + + if (const auto *Sec = dyn_cast(Section.get())) { + if (!Sec->Symbols && !Sec->Content && !Sec->Size) + return "one of \"Content\", \"Size\" or \"Symbols\" must be specified"; + + if (Sec->Content || Sec->Size) { + if (Sec->Size && Sec->Content && + (uint64_t)*Sec->Size < Sec->Content->binary_size()) + return "\"Size\" must be greater than or equal to the content " + "size"; + + if (Sec->Symbols) + return "\"Symbols\" cannot be used with \"Content\" or \"Size\""; + return {}; + } + + if (!Sec->Symbols) + return {}; + + for (const ELFYAML::AddrsigSymbol &AS : *Sec->Symbols) + if (AS.Index && AS.Name) + return "\"Index\" and \"Name\" cannot be used together when defining a " + "symbol"; + return {}; + } + return {}; } @@ -1092,6 +1298,13 @@ struct NormalizedMips64RelType { } // end anonymous namespace +void MappingTraits::mapping( + IO &IO, ELFYAML::StackSizeEntry &E) { + assert(IO.getContext() && "The IO context is not initialized"); + IO.mapOptional("Address", E.Address, Hex64(0)); + IO.mapRequired("Size", E.Size); +} + void MappingTraits::mapping(IO &IO, ELFYAML::DynamicEntry &Rel) { assert(IO.getContext() && "The IO context is not initialized"); @@ -1164,6 +1377,12 @@ void MappingTraits::mapping(IO &IO, ELFYAML::Object &Object) { IO.setContext(nullptr); } +void MappingTraits::mapping(IO &IO, ELFYAML::AddrsigSymbol &Sym) { + assert(IO.getContext() && "The IO context is not initialized"); + IO.mapOptional("Name", Sym.Name); + IO.mapOptional("Index", Sym.Index); +} + LLVM_YAML_STRONG_TYPEDEF(uint8_t, MIPS_AFL_REG) LLVM_YAML_STRONG_TYPEDEF(uint8_t, MIPS_ABI_FP) LLVM_YAML_STRONG_TYPEDEF(uint32_t, MIPS_AFL_EXT) diff --git a/lib/ObjectYAML/MachOEmitter.cpp b/lib/ObjectYAML/MachOEmitter.cpp new file mode 100644 index 000000000000..b56f811ce67d --- /dev/null +++ b/lib/ObjectYAML/MachOEmitter.cpp @@ -0,0 +1,580 @@ +//===- yaml2macho - Convert YAML to a Mach object file --------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// The Mach component of yaml2obj. +/// +//===----------------------------------------------------------------------===// + +#include "llvm/BinaryFormat/MachO.h" +#include "llvm/ObjectYAML/DWARFEmitter.h" +#include "llvm/ObjectYAML/ObjectYAML.h" +#include "llvm/ObjectYAML/yaml2obj.h" +#include "llvm/Support/LEB128.h" +#include "llvm/Support/YAMLTraits.h" +#include "llvm/Support/raw_ostream.h" + +#include "llvm/Support/Format.h" + +using namespace llvm; + +namespace { + +class MachOWriter { +public: + MachOWriter(MachOYAML::Object &Obj) : Obj(Obj), is64Bit(true), fileStart(0) { + is64Bit = Obj.Header.magic == MachO::MH_MAGIC_64 || + Obj.Header.magic == MachO::MH_CIGAM_64; + memset(reinterpret_cast(&Header), 0, sizeof(MachO::mach_header_64)); + } + + void writeMachO(raw_ostream &OS); + +private: + void writeHeader(raw_ostream &OS); + void writeLoadCommands(raw_ostream &OS); + void writeSectionData(raw_ostream &OS); + void writeLinkEditData(raw_ostream &OS); + + void writeBindOpcodes(raw_ostream &OS, + std::vector &BindOpcodes); + // LinkEdit writers + void writeRebaseOpcodes(raw_ostream &OS); + void writeBasicBindOpcodes(raw_ostream &OS); + void writeWeakBindOpcodes(raw_ostream &OS); + void writeLazyBindOpcodes(raw_ostream &OS); + void writeNameList(raw_ostream &OS); + void writeStringTable(raw_ostream &OS); + void writeExportTrie(raw_ostream &OS); + + void dumpExportEntry(raw_ostream &OS, MachOYAML::ExportEntry &Entry); + void ZeroToOffset(raw_ostream &OS, size_t offset); + + MachOYAML::Object &Obj; + bool is64Bit; + uint64_t fileStart; + + MachO::mach_header_64 Header; +}; + +void MachOWriter::writeMachO(raw_ostream &OS) { + fileStart = OS.tell(); + writeHeader(OS); + writeLoadCommands(OS); + writeSectionData(OS); +} + +void MachOWriter::writeHeader(raw_ostream &OS) { + Header.magic = Obj.Header.magic; + Header.cputype = Obj.Header.cputype; + Header.cpusubtype = Obj.Header.cpusubtype; + Header.filetype = Obj.Header.filetype; + Header.ncmds = Obj.Header.ncmds; + Header.sizeofcmds = Obj.Header.sizeofcmds; + Header.flags = Obj.Header.flags; + Header.reserved = Obj.Header.reserved; + + if (Obj.IsLittleEndian != sys::IsLittleEndianHost) + MachO::swapStruct(Header); + + auto header_size = + is64Bit ? 
sizeof(MachO::mach_header_64) : sizeof(MachO::mach_header); + OS.write((const char *)&Header, header_size); +} + +template +SectionType constructSection(MachOYAML::Section Sec) { + SectionType TempSec; + memcpy(reinterpret_cast(&TempSec.sectname[0]), &Sec.sectname[0], 16); + memcpy(reinterpret_cast(&TempSec.segname[0]), &Sec.segname[0], 16); + TempSec.addr = Sec.addr; + TempSec.size = Sec.size; + TempSec.offset = Sec.offset; + TempSec.align = Sec.align; + TempSec.reloff = Sec.reloff; + TempSec.nreloc = Sec.nreloc; + TempSec.flags = Sec.flags; + TempSec.reserved1 = Sec.reserved1; + TempSec.reserved2 = Sec.reserved2; + return TempSec; +} + +template +size_t writeLoadCommandData(MachOYAML::LoadCommand &LC, raw_ostream &OS, + bool IsLittleEndian) { + return 0; +} + +template <> +size_t writeLoadCommandData(MachOYAML::LoadCommand &LC, + raw_ostream &OS, + bool IsLittleEndian) { + size_t BytesWritten = 0; + for (const auto &Sec : LC.Sections) { + auto TempSec = constructSection(Sec); + if (IsLittleEndian != sys::IsLittleEndianHost) + MachO::swapStruct(TempSec); + OS.write(reinterpret_cast(&(TempSec)), + sizeof(MachO::section)); + BytesWritten += sizeof(MachO::section); + } + return BytesWritten; +} + +template <> +size_t writeLoadCommandData( + MachOYAML::LoadCommand &LC, raw_ostream &OS, bool IsLittleEndian) { + size_t BytesWritten = 0; + for (const auto &Sec : LC.Sections) { + auto TempSec = constructSection(Sec); + TempSec.reserved3 = Sec.reserved3; + if (IsLittleEndian != sys::IsLittleEndianHost) + MachO::swapStruct(TempSec); + OS.write(reinterpret_cast(&(TempSec)), + sizeof(MachO::section_64)); + BytesWritten += sizeof(MachO::section_64); + } + return BytesWritten; +} + +size_t writePayloadString(MachOYAML::LoadCommand &LC, raw_ostream &OS) { + size_t BytesWritten = 0; + if (!LC.PayloadString.empty()) { + OS.write(LC.PayloadString.c_str(), LC.PayloadString.length()); + BytesWritten = LC.PayloadString.length(); + } + return BytesWritten; +} + +template <> +size_t writeLoadCommandData(MachOYAML::LoadCommand &LC, + raw_ostream &OS, + bool IsLittleEndian) { + return writePayloadString(LC, OS); +} + +template <> +size_t writeLoadCommandData(MachOYAML::LoadCommand &LC, + raw_ostream &OS, + bool IsLittleEndian) { + return writePayloadString(LC, OS); +} + +template <> +size_t writeLoadCommandData(MachOYAML::LoadCommand &LC, + raw_ostream &OS, + bool IsLittleEndian) { + return writePayloadString(LC, OS); +} + +template <> +size_t writeLoadCommandData( + MachOYAML::LoadCommand &LC, raw_ostream &OS, bool IsLittleEndian) { + size_t BytesWritten = 0; + for (const auto &T : LC.Tools) { + struct MachO::build_tool_version tool = T; + if (IsLittleEndian != sys::IsLittleEndianHost) + MachO::swapStruct(tool); + OS.write(reinterpret_cast(&tool), + sizeof(MachO::build_tool_version)); + BytesWritten += sizeof(MachO::build_tool_version); + } + return BytesWritten; +} + +void ZeroFillBytes(raw_ostream &OS, size_t Size) { + std::vector FillData; + FillData.insert(FillData.begin(), Size, 0); + OS.write(reinterpret_cast(FillData.data()), Size); +} + +void Fill(raw_ostream &OS, size_t Size, uint32_t Data) { + std::vector FillData; + FillData.insert(FillData.begin(), (Size / 4) + 1, Data); + OS.write(reinterpret_cast(FillData.data()), Size); +} + +void MachOWriter::ZeroToOffset(raw_ostream &OS, size_t Offset) { + auto currOffset = OS.tell() - fileStart; + if (currOffset < Offset) + ZeroFillBytes(OS, Offset - currOffset); +} + +void MachOWriter::writeLoadCommands(raw_ostream &OS) { + for (auto &LC : Obj.LoadCommands) { 
+ size_t BytesWritten = 0; + llvm::MachO::macho_load_command Data = LC.Data; + +#define HANDLE_LOAD_COMMAND(LCName, LCValue, LCStruct) \ + case MachO::LCName: \ + if (Obj.IsLittleEndian != sys::IsLittleEndianHost) \ + MachO::swapStruct(Data.LCStruct##_data); \ + OS.write(reinterpret_cast(&(Data.LCStruct##_data)), \ + sizeof(MachO::LCStruct)); \ + BytesWritten = sizeof(MachO::LCStruct); \ + BytesWritten += \ + writeLoadCommandData(LC, OS, Obj.IsLittleEndian); \ + break; + + switch (LC.Data.load_command_data.cmd) { + default: + if (Obj.IsLittleEndian != sys::IsLittleEndianHost) + MachO::swapStruct(Data.load_command_data); + OS.write(reinterpret_cast(&(Data.load_command_data)), + sizeof(MachO::load_command)); + BytesWritten = sizeof(MachO::load_command); + BytesWritten += + writeLoadCommandData(LC, OS, Obj.IsLittleEndian); + break; +#include "llvm/BinaryFormat/MachO.def" + } + + if (LC.PayloadBytes.size() > 0) { + OS.write(reinterpret_cast(LC.PayloadBytes.data()), + LC.PayloadBytes.size()); + BytesWritten += LC.PayloadBytes.size(); + } + + if (LC.ZeroPadBytes > 0) { + ZeroFillBytes(OS, LC.ZeroPadBytes); + BytesWritten += LC.ZeroPadBytes; + } + + // Fill remaining bytes with 0. This will only get hit in partially + // specified test cases. + auto BytesRemaining = LC.Data.load_command_data.cmdsize - BytesWritten; + if (BytesRemaining > 0) { + ZeroFillBytes(OS, BytesRemaining); + } + } +} + +void MachOWriter::writeSectionData(raw_ostream &OS) { + bool FoundLinkEditSeg = false; + for (auto &LC : Obj.LoadCommands) { + switch (LC.Data.load_command_data.cmd) { + case MachO::LC_SEGMENT: + case MachO::LC_SEGMENT_64: + uint64_t segOff = is64Bit ? LC.Data.segment_command_64_data.fileoff + : LC.Data.segment_command_data.fileoff; + if (0 == + strncmp(&LC.Data.segment_command_data.segname[0], "__LINKEDIT", 16)) { + FoundLinkEditSeg = true; + writeLinkEditData(OS); + } + for (auto &Sec : LC.Sections) { + ZeroToOffset(OS, Sec.offset); + // Zero Fill any data between the end of the last thing we wrote and the + // start of this section. + assert((OS.tell() - fileStart <= Sec.offset || + Sec.offset == (uint32_t)0) && + "Wrote too much data somewhere, section offsets don't line up."); + if (0 == strncmp(&Sec.segname[0], "__DWARF", 16)) { + if (0 == strncmp(&Sec.sectname[0], "__debug_str", 16)) { + DWARFYAML::EmitDebugStr(OS, Obj.DWARF); + } else if (0 == strncmp(&Sec.sectname[0], "__debug_abbrev", 16)) { + DWARFYAML::EmitDebugAbbrev(OS, Obj.DWARF); + } else if (0 == strncmp(&Sec.sectname[0], "__debug_aranges", 16)) { + DWARFYAML::EmitDebugAranges(OS, Obj.DWARF); + } else if (0 == strncmp(&Sec.sectname[0], "__debug_pubnames", 16)) { + DWARFYAML::EmitPubSection(OS, Obj.DWARF.PubNames, + Obj.IsLittleEndian); + } else if (0 == strncmp(&Sec.sectname[0], "__debug_pubtypes", 16)) { + DWARFYAML::EmitPubSection(OS, Obj.DWARF.PubTypes, + Obj.IsLittleEndian); + } else if (0 == strncmp(&Sec.sectname[0], "__debug_info", 16)) { + DWARFYAML::EmitDebugInfo(OS, Obj.DWARF); + } else if (0 == strncmp(&Sec.sectname[0], "__debug_line", 16)) { + DWARFYAML::EmitDebugLine(OS, Obj.DWARF); + } + + continue; + } + + // Skip if it's a virtual section. + if (MachO::isVirtualSection(Sec.flags & MachO::SECTION_TYPE)) + continue; + + if (Sec.content) { + yaml::BinaryRef Content = *Sec.content; + Content.writeAsBinary(OS); + ZeroFillBytes(OS, Sec.size - Content.binary_size()); + } else { + // Fill section data with 0xDEADBEEF. + Fill(OS, Sec.size, 0xDEADBEEFu); + } + } + uint64_t segSize = is64Bit ? 
LC.Data.segment_command_64_data.filesize + : LC.Data.segment_command_data.filesize; + ZeroToOffset(OS, segOff + segSize); + break; + } + } + // Old PPC Object Files didn't have __LINKEDIT segments, the data was just + // stuck at the end of the file. + if (!FoundLinkEditSeg) + writeLinkEditData(OS); +} + +void MachOWriter::writeBindOpcodes( + raw_ostream &OS, std::vector &BindOpcodes) { + + for (auto Opcode : BindOpcodes) { + uint8_t OpByte = Opcode.Opcode | Opcode.Imm; + OS.write(reinterpret_cast(&OpByte), 1); + for (auto Data : Opcode.ULEBExtraData) { + encodeULEB128(Data, OS); + } + for (auto Data : Opcode.SLEBExtraData) { + encodeSLEB128(Data, OS); + } + if (!Opcode.Symbol.empty()) { + OS.write(Opcode.Symbol.data(), Opcode.Symbol.size()); + OS.write('\0'); + } + } +} + +void MachOWriter::dumpExportEntry(raw_ostream &OS, + MachOYAML::ExportEntry &Entry) { + encodeSLEB128(Entry.TerminalSize, OS); + if (Entry.TerminalSize > 0) { + encodeSLEB128(Entry.Flags, OS); + if (Entry.Flags & MachO::EXPORT_SYMBOL_FLAGS_REEXPORT) { + encodeSLEB128(Entry.Other, OS); + OS << Entry.ImportName; + OS.write('\0'); + } else { + encodeSLEB128(Entry.Address, OS); + if (Entry.Flags & MachO::EXPORT_SYMBOL_FLAGS_STUB_AND_RESOLVER) + encodeSLEB128(Entry.Other, OS); + } + } + OS.write(static_cast(Entry.Children.size())); + for (auto EE : Entry.Children) { + OS << EE.Name; + OS.write('\0'); + encodeSLEB128(EE.NodeOffset, OS); + } + for (auto EE : Entry.Children) + dumpExportEntry(OS, EE); +} + +void MachOWriter::writeExportTrie(raw_ostream &OS) { + dumpExportEntry(OS, Obj.LinkEdit.ExportTrie); +} + +template +void writeNListEntry(MachOYAML::NListEntry &NLE, raw_ostream &OS, + bool IsLittleEndian) { + NListType ListEntry; + ListEntry.n_strx = NLE.n_strx; + ListEntry.n_type = NLE.n_type; + ListEntry.n_sect = NLE.n_sect; + ListEntry.n_desc = NLE.n_desc; + ListEntry.n_value = NLE.n_value; + + if (IsLittleEndian != sys::IsLittleEndianHost) + MachO::swapStruct(ListEntry); + OS.write(reinterpret_cast(&ListEntry), sizeof(NListType)); +} + +void MachOWriter::writeLinkEditData(raw_ostream &OS) { + typedef void (MachOWriter::*writeHandler)(raw_ostream &); + typedef std::pair writeOperation; + std::vector WriteQueue; + + MachO::dyld_info_command *DyldInfoOnlyCmd = 0; + MachO::symtab_command *SymtabCmd = 0; + for (auto &LC : Obj.LoadCommands) { + switch (LC.Data.load_command_data.cmd) { + case MachO::LC_SYMTAB: + SymtabCmd = &LC.Data.symtab_command_data; + WriteQueue.push_back( + std::make_pair(SymtabCmd->symoff, &MachOWriter::writeNameList)); + WriteQueue.push_back( + std::make_pair(SymtabCmd->stroff, &MachOWriter::writeStringTable)); + break; + case MachO::LC_DYLD_INFO_ONLY: + DyldInfoOnlyCmd = &LC.Data.dyld_info_command_data; + WriteQueue.push_back(std::make_pair(DyldInfoOnlyCmd->rebase_off, + &MachOWriter::writeRebaseOpcodes)); + WriteQueue.push_back(std::make_pair(DyldInfoOnlyCmd->bind_off, + &MachOWriter::writeBasicBindOpcodes)); + WriteQueue.push_back(std::make_pair(DyldInfoOnlyCmd->weak_bind_off, + &MachOWriter::writeWeakBindOpcodes)); + WriteQueue.push_back(std::make_pair(DyldInfoOnlyCmd->lazy_bind_off, + &MachOWriter::writeLazyBindOpcodes)); + WriteQueue.push_back(std::make_pair(DyldInfoOnlyCmd->export_off, + &MachOWriter::writeExportTrie)); + break; + } + } + + llvm::sort(WriteQueue, [](const writeOperation &a, const writeOperation &b) { + return a.first < b.first; + }); + + for (auto writeOp : WriteQueue) { + ZeroToOffset(OS, writeOp.first); + (this->*writeOp.second)(OS); + } +} + +void 
MachOWriter::writeRebaseOpcodes(raw_ostream &OS) { + MachOYAML::LinkEditData &LinkEdit = Obj.LinkEdit; + + for (auto Opcode : LinkEdit.RebaseOpcodes) { + uint8_t OpByte = Opcode.Opcode | Opcode.Imm; + OS.write(reinterpret_cast(&OpByte), 1); + for (auto Data : Opcode.ExtraData) + encodeULEB128(Data, OS); + } +} + +void MachOWriter::writeBasicBindOpcodes(raw_ostream &OS) { + writeBindOpcodes(OS, Obj.LinkEdit.BindOpcodes); +} + +void MachOWriter::writeWeakBindOpcodes(raw_ostream &OS) { + writeBindOpcodes(OS, Obj.LinkEdit.WeakBindOpcodes); +} + +void MachOWriter::writeLazyBindOpcodes(raw_ostream &OS) { + writeBindOpcodes(OS, Obj.LinkEdit.LazyBindOpcodes); +} + +void MachOWriter::writeNameList(raw_ostream &OS) { + for (auto NLE : Obj.LinkEdit.NameList) { + if (is64Bit) + writeNListEntry(NLE, OS, Obj.IsLittleEndian); + else + writeNListEntry(NLE, OS, Obj.IsLittleEndian); + } +} + +void MachOWriter::writeStringTable(raw_ostream &OS) { + for (auto Str : Obj.LinkEdit.StringTable) { + OS.write(Str.data(), Str.size()); + OS.write('\0'); + } +} + +class UniversalWriter { +public: + UniversalWriter(yaml::YamlObjectFile &ObjectFile) + : ObjectFile(ObjectFile), fileStart(0) {} + + void writeMachO(raw_ostream &OS); + +private: + void writeFatHeader(raw_ostream &OS); + void writeFatArchs(raw_ostream &OS); + + void ZeroToOffset(raw_ostream &OS, size_t offset); + + yaml::YamlObjectFile &ObjectFile; + uint64_t fileStart; +}; + +void UniversalWriter::writeMachO(raw_ostream &OS) { + fileStart = OS.tell(); + if (ObjectFile.MachO) { + MachOWriter Writer(*ObjectFile.MachO); + Writer.writeMachO(OS); + return; + } + + writeFatHeader(OS); + writeFatArchs(OS); + + auto &FatFile = *ObjectFile.FatMachO; + assert(FatFile.FatArchs.size() == FatFile.Slices.size()); + for (size_t i = 0; i < FatFile.Slices.size(); i++) { + ZeroToOffset(OS, FatFile.FatArchs[i].offset); + MachOWriter Writer(FatFile.Slices[i]); + Writer.writeMachO(OS); + + auto SliceEnd = FatFile.FatArchs[i].offset + FatFile.FatArchs[i].size; + ZeroToOffset(OS, SliceEnd); + } +} + +void UniversalWriter::writeFatHeader(raw_ostream &OS) { + auto &FatFile = *ObjectFile.FatMachO; + MachO::fat_header header; + header.magic = FatFile.Header.magic; + header.nfat_arch = FatFile.Header.nfat_arch; + if (sys::IsLittleEndianHost) + swapStruct(header); + OS.write(reinterpret_cast(&header), sizeof(MachO::fat_header)); +} + +template +FatArchType constructFatArch(MachOYAML::FatArch &Arch) { + FatArchType FatArch; + FatArch.cputype = Arch.cputype; + FatArch.cpusubtype = Arch.cpusubtype; + FatArch.offset = Arch.offset; + FatArch.size = Arch.size; + FatArch.align = Arch.align; + return FatArch; +} + +template +void writeFatArch(MachOYAML::FatArch &LC, raw_ostream &OS) {} + +template <> +void writeFatArch(MachOYAML::FatArch &Arch, raw_ostream &OS) { + auto FatArch = constructFatArch(Arch); + if (sys::IsLittleEndianHost) + swapStruct(FatArch); + OS.write(reinterpret_cast(&FatArch), sizeof(MachO::fat_arch)); +} + +template <> +void writeFatArch(MachOYAML::FatArch &Arch, + raw_ostream &OS) { + auto FatArch = constructFatArch(Arch); + FatArch.reserved = Arch.reserved; + if (sys::IsLittleEndianHost) + swapStruct(FatArch); + OS.write(reinterpret_cast(&FatArch), + sizeof(MachO::fat_arch_64)); +} + +void UniversalWriter::writeFatArchs(raw_ostream &OS) { + auto &FatFile = *ObjectFile.FatMachO; + bool is64Bit = FatFile.Header.magic == MachO::FAT_MAGIC_64; + for (auto Arch : FatFile.FatArchs) { + if (is64Bit) + writeFatArch(Arch, OS); + else + writeFatArch(Arch, OS); + } +} + +void 
UniversalWriter::ZeroToOffset(raw_ostream &OS, size_t Offset) { + auto currOffset = OS.tell() - fileStart; + if (currOffset < Offset) + ZeroFillBytes(OS, Offset - currOffset); +} + +} // end anonymous namespace + +namespace llvm { +namespace yaml { + +bool yaml2macho(YamlObjectFile &Doc, raw_ostream &Out, ErrorHandler /*EH*/) { + UniversalWriter Writer(Doc); + Writer.writeMachO(Out); + return true; +} + +} // namespace yaml +} // namespace llvm diff --git a/lib/ObjectYAML/MachOYAML.cpp b/lib/ObjectYAML/MachOYAML.cpp index d12f12cf4435..0f7cd1e1495c 100644 --- a/lib/ObjectYAML/MachOYAML.cpp +++ b/lib/ObjectYAML/MachOYAML.cpp @@ -287,6 +287,15 @@ void MappingTraits::mapping(IO &IO, IO.mapRequired("reserved1", Section.reserved1); IO.mapRequired("reserved2", Section.reserved2); IO.mapOptional("reserved3", Section.reserved3); + IO.mapOptional("content", Section.content); +} + +StringRef +MappingTraits::validate(IO &IO, + MachOYAML::Section &Section) { + if (Section.content && Section.size < Section.content->binary_size()) + return "Section size must be greater than or equal to the content size"; + return {}; } void MappingTraits::mapping( diff --git a/lib/ObjectYAML/MinidumpEmitter.cpp b/lib/ObjectYAML/MinidumpEmitter.cpp new file mode 100644 index 000000000000..bbfd2cd8cbab --- /dev/null +++ b/lib/ObjectYAML/MinidumpEmitter.cpp @@ -0,0 +1,247 @@ +//===- yaml2minidump.cpp - Convert a YAML file to a minidump file ---------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/ObjectYAML/MinidumpYAML.h" +#include "llvm/ObjectYAML/yaml2obj.h" +#include "llvm/Support/ConvertUTF.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; +using namespace llvm::minidump; +using namespace llvm::MinidumpYAML; + +namespace { +/// A helper class to manage the placement of various structures into the final +/// minidump binary. Space for objects can be allocated via various allocate*** +/// methods, while the final minidump file is written by calling the writeTo +/// method. The plain versions of allocation functions take a reference to the +/// data which is to be written (and hence the data must be available until +/// writeTo is called), while the "New" versions allocate the data in an +/// allocator-managed buffer, which is available until the allocator object is +/// destroyed. For both kinds of functions, it is possible to modify the +/// data for which the space has been "allocated" until the final writeTo call. +/// This is useful for "linking" the allocated structures via their offsets. 
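+/// For instance, yaml2minidump() below allocates space for the header first,
+/// then assigns the header's StreamDirectoryRVA from a later allocateArray()
+/// call; the header bytes emitted by writeTo() pick up that assignment.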
+class BlobAllocator { +public: + size_t tell() const { return NextOffset; } + + size_t allocateCallback(size_t Size, + std::function Callback) { + size_t Offset = NextOffset; + NextOffset += Size; + Callbacks.push_back(std::move(Callback)); + return Offset; + } + + size_t allocateBytes(ArrayRef Data) { + return allocateCallback( + Data.size(), [Data](raw_ostream &OS) { OS << toStringRef(Data); }); + } + + size_t allocateBytes(yaml::BinaryRef Data) { + return allocateCallback(Data.binary_size(), [Data](raw_ostream &OS) { + Data.writeAsBinary(OS); + }); + } + + template size_t allocateArray(ArrayRef Data) { + return allocateBytes({reinterpret_cast(Data.data()), + sizeof(T) * Data.size()}); + } + + template + std::pair> + allocateNewArray(const iterator_range &Range); + + template size_t allocateObject(const T &Data) { + return allocateArray(makeArrayRef(Data)); + } + + template + std::pair allocateNewObject(Types &&... Args) { + T *Object = new (Temporaries.Allocate()) T(std::forward(Args)...); + return {allocateObject(*Object), Object}; + } + + size_t allocateString(StringRef Str); + + void writeTo(raw_ostream &OS) const; + +private: + size_t NextOffset = 0; + + BumpPtrAllocator Temporaries; + std::vector> Callbacks; +}; +} // namespace + +template +std::pair> +BlobAllocator::allocateNewArray(const iterator_range &Range) { + size_t Num = std::distance(Range.begin(), Range.end()); + MutableArrayRef Array(Temporaries.Allocate(Num), Num); + std::uninitialized_copy(Range.begin(), Range.end(), Array.begin()); + return {allocateArray(Array), Array}; +} + +size_t BlobAllocator::allocateString(StringRef Str) { + SmallVector WStr; + bool OK = convertUTF8ToUTF16String(Str, WStr); + assert(OK && "Invalid UTF8 in Str?"); + (void)OK; + + // The utf16 string is null-terminated, but the terminator is not counted in + // the string size. + WStr.push_back(0); + size_t Result = + allocateNewObject(2 * (WStr.size() - 1)).first; + allocateNewArray(make_range(WStr.begin(), WStr.end())); + return Result; +} + +void BlobAllocator::writeTo(raw_ostream &OS) const { + size_t BeginOffset = OS.tell(); + for (const auto &Callback : Callbacks) + Callback(OS); + assert(OS.tell() == BeginOffset + NextOffset && + "Callbacks wrote an unexpected number of bytes."); + (void)BeginOffset; +} + +static LocationDescriptor layout(BlobAllocator &File, yaml::BinaryRef Data) { + return {support::ulittle32_t(Data.binary_size()), + support::ulittle32_t(File.allocateBytes(Data))}; +} + +static size_t layout(BlobAllocator &File, MinidumpYAML::ExceptionStream &S) { + File.allocateObject(S.MDExceptionStream); + + size_t DataEnd = File.tell(); + + // Lay out the thread context data, (which is not a part of the stream). + // TODO: This usually (always?) matches the thread context of the + // corresponding thread, and may overlap memory regions as well. We could + // add a level of indirection to the MinidumpYAML format (like an array of + // Blobs that the LocationDescriptors index into) to be able to distinguish + // the cases where location descriptions overlap vs happen to reference + // identical data. 
+ S.MDExceptionStream.ThreadContext = layout(File, S.ThreadContext); + + return DataEnd; +} + +static void layout(BlobAllocator &File, MemoryListStream::entry_type &Range) { + Range.Entry.Memory = layout(File, Range.Content); +} + +static void layout(BlobAllocator &File, ModuleListStream::entry_type &M) { + M.Entry.ModuleNameRVA = File.allocateString(M.Name); + + M.Entry.CvRecord = layout(File, M.CvRecord); + M.Entry.MiscRecord = layout(File, M.MiscRecord); +} + +static void layout(BlobAllocator &File, ThreadListStream::entry_type &T) { + T.Entry.Stack.Memory = layout(File, T.Stack); + T.Entry.Context = layout(File, T.Context); +} + +template +static size_t layout(BlobAllocator &File, + MinidumpYAML::detail::ListStream &S) { + + File.allocateNewObject(S.Entries.size()); + for (auto &E : S.Entries) + File.allocateObject(E.Entry); + + size_t DataEnd = File.tell(); + + // Lay out the auxiliary data, (which is not a part of the stream). + DataEnd = File.tell(); + for (auto &E : S.Entries) + layout(File, E); + + return DataEnd; +} + +static Directory layout(BlobAllocator &File, Stream &S) { + Directory Result; + Result.Type = S.Type; + Result.Location.RVA = File.tell(); + Optional DataEnd; + switch (S.Kind) { + case Stream::StreamKind::Exception: + DataEnd = layout(File, cast(S)); + break; + case Stream::StreamKind::MemoryInfoList: { + MemoryInfoListStream &InfoList = cast(S); + File.allocateNewObject( + sizeof(minidump::MemoryInfoListHeader), sizeof(minidump::MemoryInfo), + InfoList.Infos.size()); + File.allocateArray(makeArrayRef(InfoList.Infos)); + break; + } + case Stream::StreamKind::MemoryList: + DataEnd = layout(File, cast(S)); + break; + case Stream::StreamKind::ModuleList: + DataEnd = layout(File, cast(S)); + break; + case Stream::StreamKind::RawContent: { + RawContentStream &Raw = cast(S); + File.allocateCallback(Raw.Size, [&Raw](raw_ostream &OS) { + Raw.Content.writeAsBinary(OS); + assert(Raw.Content.binary_size() <= Raw.Size); + OS << std::string(Raw.Size - Raw.Content.binary_size(), '\0'); + }); + break; + } + case Stream::StreamKind::SystemInfo: { + SystemInfoStream &SystemInfo = cast(S); + File.allocateObject(SystemInfo.Info); + // The CSD string is not a part of the stream. + DataEnd = File.tell(); + SystemInfo.Info.CSDVersionRVA = File.allocateString(SystemInfo.CSDVersion); + break; + } + case Stream::StreamKind::TextContent: + File.allocateArray(arrayRefFromStringRef(cast(S).Text)); + break; + case Stream::StreamKind::ThreadList: + DataEnd = layout(File, cast(S)); + break; + } + // If DataEnd is not set, we assume everything we generated is a part of the + // stream. 
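+ // (When DataEnd is set, anything laid out after it is excluded: in the
+ // SystemInfo case above, the CSD version string still lands in the file but
+ // is not counted in the directory entry's DataSize.)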
+ Result.Location.DataSize = + DataEnd.getValueOr(File.tell()) - Result.Location.RVA; + return Result; +} + +namespace llvm { +namespace yaml { + +bool yaml2minidump(MinidumpYAML::Object &Obj, raw_ostream &Out, + ErrorHandler /*EH*/) { + BlobAllocator File; + File.allocateObject(Obj.Header); + + std::vector StreamDirectory(Obj.Streams.size()); + Obj.Header.StreamDirectoryRVA = + File.allocateArray(makeArrayRef(StreamDirectory)); + Obj.Header.NumberOfStreams = StreamDirectory.size(); + + for (auto &Stream : enumerate(Obj.Streams)) + StreamDirectory[Stream.index()] = layout(File, *Stream.value()); + + File.writeTo(Out); + return true; +} + +} // namespace yaml +} // namespace llvm diff --git a/lib/ObjectYAML/MinidumpYAML.cpp b/lib/ObjectYAML/MinidumpYAML.cpp index f5f2acd0cc4b..21b2a4d78629 100644 --- a/lib/ObjectYAML/MinidumpYAML.cpp +++ b/lib/ObjectYAML/MinidumpYAML.cpp @@ -8,110 +8,11 @@ #include "llvm/ObjectYAML/MinidumpYAML.h" #include "llvm/Support/Allocator.h" -#include "llvm/Support/ConvertUTF.h" using namespace llvm; using namespace llvm::MinidumpYAML; using namespace llvm::minidump; -namespace { -/// A helper class to manage the placement of various structures into the final -/// minidump binary. Space for objects can be allocated via various allocate*** -/// methods, while the final minidump file is written by calling the writeTo -/// method. The plain versions of allocation functions take a reference to the -/// data which is to be written (and hence the data must be available until -/// writeTo is called), while the "New" versions allocate the data in an -/// allocator-managed buffer, which is available until the allocator object is -/// destroyed. For both kinds of functions, it is possible to modify the -/// data for which the space has been "allocated" until the final writeTo call. -/// This is useful for "linking" the allocated structures via their offsets. -class BlobAllocator { -public: - size_t tell() const { return NextOffset; } - - size_t allocateCallback(size_t Size, - std::function Callback) { - size_t Offset = NextOffset; - NextOffset += Size; - Callbacks.push_back(std::move(Callback)); - return Offset; - } - - size_t allocateBytes(ArrayRef Data) { - return allocateCallback( - Data.size(), [Data](raw_ostream &OS) { OS << toStringRef(Data); }); - } - - size_t allocateBytes(yaml::BinaryRef Data) { - return allocateCallback(Data.binary_size(), [Data](raw_ostream &OS) { - Data.writeAsBinary(OS); - }); - } - - template size_t allocateArray(ArrayRef Data) { - return allocateBytes({reinterpret_cast(Data.data()), - sizeof(T) * Data.size()}); - } - - template - std::pair> - allocateNewArray(const iterator_range &Range); - - template size_t allocateObject(const T &Data) { - return allocateArray(makeArrayRef(Data)); - } - - template - std::pair allocateNewObject(Types &&... 
Args) { - T *Object = new (Temporaries.Allocate()) T(std::forward(Args)...); - return {allocateObject(*Object), Object}; - } - - size_t allocateString(StringRef Str); - - void writeTo(raw_ostream &OS) const; - -private: - size_t NextOffset = 0; - - BumpPtrAllocator Temporaries; - std::vector> Callbacks; -}; -} // namespace - -template -std::pair> -BlobAllocator::allocateNewArray(const iterator_range &Range) { - size_t Num = std::distance(Range.begin(), Range.end()); - MutableArrayRef Array(Temporaries.Allocate(Num), Num); - std::uninitialized_copy(Range.begin(), Range.end(), Array.begin()); - return {allocateArray(Array), Array}; -} - -size_t BlobAllocator::allocateString(StringRef Str) { - SmallVector WStr; - bool OK = convertUTF8ToUTF16String(Str, WStr); - assert(OK && "Invalid UTF8 in Str?"); - (void)OK; - - // The utf16 string is null-terminated, but the terminator is not counted in - // the string size. - WStr.push_back(0); - size_t Result = - allocateNewObject(2 * (WStr.size() - 1)).first; - allocateNewArray(make_range(WStr.begin(), WStr.end())); - return Result; -} - -void BlobAllocator::writeTo(raw_ostream &OS) const { - size_t BeginOffset = OS.tell(); - for (const auto &Callback : Callbacks) - Callback(OS); - assert(OS.tell() == BeginOffset + NextOffset && - "Callbacks wrote an unexpected number of bytes."); - (void)BeginOffset; -} - /// Perform an optional yaml-mapping of an endian-aware type EndianType. The /// only purpose of this function is to avoid casting the Default value to the /// endian type; @@ -168,6 +69,10 @@ Stream::~Stream() = default; Stream::StreamKind Stream::getKind(StreamType Type) { switch (Type) { + case StreamType::Exception: + return StreamKind::Exception; + case StreamType::MemoryInfoList: + return StreamKind::MemoryInfoList; case StreamType::MemoryList: return StreamKind::MemoryList; case StreamType::ModuleList: @@ -192,22 +97,45 @@ Stream::StreamKind Stream::getKind(StreamType Type) { std::unique_ptr Stream::create(StreamType Type) { StreamKind Kind = getKind(Type); switch (Kind) { + case StreamKind::Exception: + return std::make_unique(); + case StreamKind::MemoryInfoList: + return std::make_unique(); case StreamKind::MemoryList: - return llvm::make_unique(); + return std::make_unique(); case StreamKind::ModuleList: - return llvm::make_unique(); + return std::make_unique(); case StreamKind::RawContent: - return llvm::make_unique(Type); + return std::make_unique(Type); case StreamKind::SystemInfo: - return llvm::make_unique(); + return std::make_unique(); case StreamKind::TextContent: - return llvm::make_unique(Type); + return std::make_unique(Type); case StreamKind::ThreadList: - return llvm::make_unique(); + return std::make_unique(); } llvm_unreachable("Unhandled stream kind!"); } +void yaml::ScalarBitSetTraits::bitset( + IO &IO, MemoryProtection &Protect) { +#define HANDLE_MDMP_PROTECT(CODE, NAME, NATIVENAME) \ + IO.bitSetCase(Protect, #NATIVENAME, MemoryProtection::NAME); +#include "llvm/BinaryFormat/MinidumpConstants.def" +} + +void yaml::ScalarBitSetTraits::bitset(IO &IO, MemoryState &State) { +#define HANDLE_MDMP_MEMSTATE(CODE, NAME, NATIVENAME) \ + IO.bitSetCase(State, #NATIVENAME, MemoryState::NAME); +#include "llvm/BinaryFormat/MinidumpConstants.def" +} + +void yaml::ScalarBitSetTraits::bitset(IO &IO, MemoryType &Type) { +#define HANDLE_MDMP_MEMTYPE(CODE, NAME, NATIVENAME) \ + IO.bitSetCase(Type, #NATIVENAME, MemoryType::NAME); +#include "llvm/BinaryFormat/MinidumpConstants.def" +} + void yaml::ScalarEnumerationTraits::enumeration( IO &IO, 
ProcessorArchitecture &Arch) { #define HANDLE_MDMP_ARCH(CODE, NAME) \ @@ -314,6 +242,20 @@ void yaml::MappingTraits::mapping(IO &IO, mapOptionalHex(IO, "AMD Extended Features", Info.AMDExtendedFeatures, 0); } +void yaml::MappingTraits::mapping(IO &IO, MemoryInfo &Info) { + mapRequiredHex(IO, "Base Address", Info.BaseAddress); + mapOptionalHex(IO, "Allocation Base", Info.AllocationBase, Info.BaseAddress); + mapRequiredAs(IO, "Allocation Protect", + Info.AllocationProtect); + mapOptionalHex(IO, "Reserved0", Info.Reserved0, 0); + mapRequiredHex(IO, "Region Size", Info.RegionSize); + mapRequiredAs(IO, "State", Info.State); + mapOptionalAs(IO, "Protect", Info.Protect, + Info.AllocationProtect); + mapRequiredAs(IO, "Type", Info.Type); + mapOptionalHex(IO, "Reserved1", Info.Reserved1, 0); +} + void yaml::MappingTraits::mapping(IO &IO, VSFixedFileInfo &Info) { mapOptionalHex(IO, "Signature", Info.Signature, 0); @@ -336,8 +278,7 @@ void yaml::MappingTraits::mapping( mapRequiredHex(IO, "Base of Image", M.Entry.BaseOfImage); mapRequiredHex(IO, "Size of Image", M.Entry.SizeOfImage); mapOptionalHex(IO, "Checksum", M.Entry.Checksum, 0); - IO.mapOptional("Time Date Stamp", M.Entry.TimeDateStamp, - support::ulittle32_t(0)); + mapOptional(IO, "Time Date Stamp", M.Entry.TimeDateStamp, 0); IO.mapRequired("Module Name", M.Name); IO.mapOptional("Version Info", M.Entry.VersionInfo, VSFixedFileInfo()); IO.mapRequired("CodeView Record", M.CvRecord); @@ -363,6 +304,10 @@ void yaml::MappingTraits::mapping( IO, Range.Entry, Range.Content); } +static void streamMapping(yaml::IO &IO, MemoryInfoListStream &Stream) { + IO.mapRequired("Memory Ranges", Stream.Infos); +} + static void streamMapping(yaml::IO &IO, MemoryListStream &Stream) { IO.mapRequired("Memory Ranges", Stream.Entries); } @@ -425,6 +370,32 @@ static void streamMapping(yaml::IO &IO, ThreadListStream &Stream) { IO.mapRequired("Threads", Stream.Entries); } +static void streamMapping(yaml::IO &IO, MinidumpYAML::ExceptionStream &Stream) { + mapRequiredHex(IO, "Thread ID", Stream.MDExceptionStream.ThreadId); + IO.mapRequired("Exception Record", Stream.MDExceptionStream.ExceptionRecord); + IO.mapRequired("Thread Context", Stream.ThreadContext); +} + +void yaml::MappingTraits::mapping( + yaml::IO &IO, minidump::Exception &Exception) { + mapRequiredHex(IO, "Exception Code", Exception.ExceptionCode); + mapOptionalHex(IO, "Exception Flags", Exception.ExceptionFlags, 0); + mapOptionalHex(IO, "Exception Record", Exception.ExceptionRecord, 0); + mapOptionalHex(IO, "Exception Address", Exception.ExceptionAddress, 0); + mapOptional(IO, "Number of Parameters", Exception.NumberParameters, 0); + + for (size_t Index = 0; Index < Exception.MaxParameters; ++Index) { + SmallString<16> Name("Parameter "); + Twine(Index).toVector(Name); + support::ulittle64_t &Field = Exception.ExceptionInformation[Index]; + + if (Index < Exception.NumberParameters) + mapRequiredHex(IO, Name.c_str(), Field); + else + mapOptionalHex(IO, Name.c_str(), Field, 0); + } +} + void yaml::MappingTraits>::mapping( yaml::IO &IO, std::unique_ptr &S) { StreamType Type; @@ -435,6 +406,12 @@ void yaml::MappingTraits>::mapping( if (!IO.outputting()) S = MinidumpYAML::Stream::create(Type); switch (S->Kind) { + case MinidumpYAML::Stream::StreamKind::Exception: + streamMapping(IO, llvm::cast(*S)); + break; + case MinidumpYAML::Stream::StreamKind::MemoryInfoList: + streamMapping(IO, llvm::cast(*S)); + break; case MinidumpYAML::Stream::StreamKind::MemoryList: streamMapping(IO, llvm::cast(*S)); break; @@ -461,6 +438,8 
@@ StringRef yaml::MappingTraits>::validate( switch (S->Kind) { case MinidumpYAML::Stream::StreamKind::RawContent: return streamValidate(cast(*S)); + case MinidumpYAML::Stream::StreamKind::Exception: + case MinidumpYAML::Stream::StreamKind::MemoryInfoList: case MinidumpYAML::Stream::StreamKind::MemoryList: case MinidumpYAML::Stream::StreamKind::ModuleList: case MinidumpYAML::Stream::StreamKind::SystemInfo: @@ -479,118 +458,28 @@ void yaml::MappingTraits::mapping(IO &IO, Object &O) { IO.mapRequired("Streams", O.Streams); } -static LocationDescriptor layout(BlobAllocator &File, yaml::BinaryRef Data) { - return {support::ulittle32_t(Data.binary_size()), - support::ulittle32_t(File.allocateBytes(Data))}; -} - -static void layout(BlobAllocator &File, MemoryListStream::entry_type &Range) { - Range.Entry.Memory = layout(File, Range.Content); -} - -static void layout(BlobAllocator &File, ModuleListStream::entry_type &M) { - M.Entry.ModuleNameRVA = File.allocateString(M.Name); - - M.Entry.CvRecord = layout(File, M.CvRecord); - M.Entry.MiscRecord = layout(File, M.MiscRecord); -} - -static void layout(BlobAllocator &File, ThreadListStream::entry_type &T) { - T.Entry.Stack.Memory = layout(File, T.Stack); - T.Entry.Context = layout(File, T.Context); -} - -template -static size_t layout(BlobAllocator &File, - MinidumpYAML::detail::ListStream &S) { - - File.allocateNewObject(S.Entries.size()); - for (auto &E : S.Entries) - File.allocateObject(E.Entry); - - size_t DataEnd = File.tell(); - - // Lay out the auxiliary data, (which is not a part of the stream). - DataEnd = File.tell(); - for (auto &E : S.Entries) - layout(File, E); - - return DataEnd; -} - -static Directory layout(BlobAllocator &File, Stream &S) { - Directory Result; - Result.Type = S.Type; - Result.Location.RVA = File.tell(); - Optional DataEnd; - switch (S.Kind) { - case Stream::StreamKind::MemoryList: - DataEnd = layout(File, cast(S)); - break; - case Stream::StreamKind::ModuleList: - DataEnd = layout(File, cast(S)); - break; - case Stream::StreamKind::RawContent: { - RawContentStream &Raw = cast(S); - File.allocateCallback(Raw.Size, [&Raw](raw_ostream &OS) { - Raw.Content.writeAsBinary(OS); - assert(Raw.Content.binary_size() <= Raw.Size); - OS << std::string(Raw.Size - Raw.Content.binary_size(), '\0'); - }); - break; - } - case Stream::StreamKind::SystemInfo: { - SystemInfoStream &SystemInfo = cast(S); - File.allocateObject(SystemInfo.Info); - // The CSD string is not a part of the stream. - DataEnd = File.tell(); - SystemInfo.Info.CSDVersionRVA = File.allocateString(SystemInfo.CSDVersion); - break; - } - case Stream::StreamKind::TextContent: - File.allocateArray(arrayRefFromStringRef(cast(S).Text)); - break; - case Stream::StreamKind::ThreadList: - DataEnd = layout(File, cast(S)); - break; - } - // If DataEnd is not set, we assume everything we generated is a part of the - // stream. 
- Result.Location.DataSize = - DataEnd.getValueOr(File.tell()) - Result.Location.RVA; - return Result; -} - -void MinidumpYAML::writeAsBinary(Object &Obj, raw_ostream &OS) { - BlobAllocator File; - File.allocateObject(Obj.Header); - - std::vector StreamDirectory(Obj.Streams.size()); - Obj.Header.StreamDirectoryRVA = - File.allocateArray(makeArrayRef(StreamDirectory)); - Obj.Header.NumberOfStreams = StreamDirectory.size(); - - for (auto &Stream : enumerate(Obj.Streams)) - StreamDirectory[Stream.index()] = layout(File, *Stream.value()); - - File.writeTo(OS); -} - -Error MinidumpYAML::writeAsBinary(StringRef Yaml, raw_ostream &OS) { - yaml::Input Input(Yaml); - Object Obj; - Input >> Obj; - if (std::error_code EC = Input.error()) - return errorCodeToError(EC); - - writeAsBinary(Obj, OS); - return Error::success(); -} - Expected> Stream::create(const Directory &StreamDesc, const object::MinidumpFile &File) { StreamKind Kind = getKind(StreamDesc.Type); switch (Kind) { + case StreamKind::Exception: { + Expected ExpectedExceptionStream = + File.getExceptionStream(); + if (!ExpectedExceptionStream) + return ExpectedExceptionStream.takeError(); + Expected> ExpectedThreadContext = + File.getRawData(ExpectedExceptionStream->ThreadContext); + if (!ExpectedThreadContext) + return ExpectedThreadContext.takeError(); + return std::make_unique(*ExpectedExceptionStream, + *ExpectedThreadContext); + } + case StreamKind::MemoryInfoList: { + if (auto ExpectedList = File.getMemoryInfoList()) + return std::make_unique(*ExpectedList); + else + return ExpectedList.takeError(); + } case StreamKind::MemoryList: { auto ExpectedList = File.getMemoryList(); if (!ExpectedList) @@ -602,7 +491,7 @@ Stream::create(const Directory &StreamDesc, const object::MinidumpFile &File) { return ExpectedContent.takeError(); Ranges.push_back({MD, *ExpectedContent}); } - return llvm::make_unique(std::move(Ranges)); + return std::make_unique(std::move(Ranges)); } case StreamKind::ModuleList: { auto ExpectedList = File.getModuleList(); @@ -622,10 +511,10 @@ Stream::create(const Directory &StreamDesc, const object::MinidumpFile &File) { Modules.push_back( {M, std::move(*ExpectedName), *ExpectedCv, *ExpectedMisc}); } - return llvm::make_unique(std::move(Modules)); + return std::make_unique(std::move(Modules)); } case StreamKind::RawContent: - return llvm::make_unique(StreamDesc.Type, + return std::make_unique(StreamDesc.Type, File.getRawStream(StreamDesc)); case StreamKind::SystemInfo: { auto ExpectedInfo = File.getSystemInfo(); @@ -634,11 +523,11 @@ Stream::create(const Directory &StreamDesc, const object::MinidumpFile &File) { auto ExpectedCSDVersion = File.getString(ExpectedInfo->CSDVersionRVA); if (!ExpectedCSDVersion) return ExpectedInfo.takeError(); - return llvm::make_unique(*ExpectedInfo, + return std::make_unique(*ExpectedInfo, std::move(*ExpectedCSDVersion)); } case StreamKind::TextContent: - return llvm::make_unique( + return std::make_unique( StreamDesc.Type, toStringRef(File.getRawStream(StreamDesc))); case StreamKind::ThreadList: { auto ExpectedList = File.getThreadList(); @@ -654,7 +543,7 @@ Stream::create(const Directory &StreamDesc, const object::MinidumpFile &File) { return ExpectedContext.takeError(); Threads.push_back({T, *ExpectedStack, *ExpectedContext}); } - return llvm::make_unique(std::move(Threads)); + return std::make_unique(std::move(Threads)); } } llvm_unreachable("Unhandled stream kind!"); diff --git a/lib/ObjectYAML/WasmEmitter.cpp b/lib/ObjectYAML/WasmEmitter.cpp new file mode 100644 index 
000000000000..debc040587a8 --- /dev/null +++ b/lib/ObjectYAML/WasmEmitter.cpp @@ -0,0 +1,633 @@ +//===- yaml2wasm - Convert YAML to a Wasm object file --------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// The Wasm component of yaml2obj. +/// +//===----------------------------------------------------------------------===// +// + +#include "llvm/Object/Wasm.h" +#include "llvm/ObjectYAML/ObjectYAML.h" +#include "llvm/ObjectYAML/yaml2obj.h" +#include "llvm/Support/Endian.h" +#include "llvm/Support/LEB128.h" + +using namespace llvm; + +namespace { +/// This parses a yaml stream that represents a Wasm object file. +/// See docs/yaml2obj for the yaml scheema. +class WasmWriter { +public: + WasmWriter(WasmYAML::Object &Obj, yaml::ErrorHandler EH) + : Obj(Obj), ErrHandler(EH) {} + bool writeWasm(raw_ostream &OS); + +private: + void writeRelocSection(raw_ostream &OS, WasmYAML::Section &Sec, + uint32_t SectionIndex); + + void writeInitExpr(raw_ostream &OS, const wasm::WasmInitExpr &InitExpr); + + void writeSectionContent(raw_ostream &OS, WasmYAML::CustomSection &Section); + void writeSectionContent(raw_ostream &OS, WasmYAML::TypeSection &Section); + void writeSectionContent(raw_ostream &OS, WasmYAML::ImportSection &Section); + void writeSectionContent(raw_ostream &OS, WasmYAML::FunctionSection &Section); + void writeSectionContent(raw_ostream &OS, WasmYAML::TableSection &Section); + void writeSectionContent(raw_ostream &OS, WasmYAML::MemorySection &Section); + void writeSectionContent(raw_ostream &OS, WasmYAML::GlobalSection &Section); + void writeSectionContent(raw_ostream &OS, WasmYAML::EventSection &Section); + void writeSectionContent(raw_ostream &OS, WasmYAML::ExportSection &Section); + void writeSectionContent(raw_ostream &OS, WasmYAML::StartSection &Section); + void writeSectionContent(raw_ostream &OS, WasmYAML::ElemSection &Section); + void writeSectionContent(raw_ostream &OS, WasmYAML::CodeSection &Section); + void writeSectionContent(raw_ostream &OS, WasmYAML::DataSection &Section); + void writeSectionContent(raw_ostream &OS, WasmYAML::DataCountSection &Section); + + // Custom section types + void writeSectionContent(raw_ostream &OS, WasmYAML::DylinkSection &Section); + void writeSectionContent(raw_ostream &OS, WasmYAML::NameSection &Section); + void writeSectionContent(raw_ostream &OS, WasmYAML::LinkingSection &Section); + void writeSectionContent(raw_ostream &OS, WasmYAML::ProducersSection &Section); + void writeSectionContent(raw_ostream &OS, + WasmYAML::TargetFeaturesSection &Section); + WasmYAML::Object &Obj; + uint32_t NumImportedFunctions = 0; + uint32_t NumImportedGlobals = 0; + uint32_t NumImportedEvents = 0; + + bool HasError = false; + yaml::ErrorHandler ErrHandler; + void reportError(const Twine &Msg); +}; + +class SubSectionWriter { + raw_ostream &OS; + std::string OutString; + raw_string_ostream StringStream; + +public: + SubSectionWriter(raw_ostream &OS) : OS(OS), StringStream(OutString) {} + + void done() { + StringStream.flush(); + encodeULEB128(OutString.size(), OS); + OS << OutString; + OutString.clear(); + } + + raw_ostream &getStream() { return StringStream; } +}; + +} // end anonymous namespace + +static int writeUint64(raw_ostream &OS, uint64_t Value) { + char Data[sizeof(Value)]; + 
support::endian::write64le(Data, Value); + OS.write(Data, sizeof(Data)); + return 0; +} + +static int writeUint32(raw_ostream &OS, uint32_t Value) { + char Data[sizeof(Value)]; + support::endian::write32le(Data, Value); + OS.write(Data, sizeof(Data)); + return 0; +} + +static int writeUint8(raw_ostream &OS, uint8_t Value) { + char Data[sizeof(Value)]; + memcpy(Data, &Value, sizeof(Data)); + OS.write(Data, sizeof(Data)); + return 0; +} + +static int writeStringRef(const StringRef &Str, raw_ostream &OS) { + encodeULEB128(Str.size(), OS); + OS << Str; + return 0; +} + +static int writeLimits(const WasmYAML::Limits &Lim, raw_ostream &OS) { + writeUint8(OS, Lim.Flags); + encodeULEB128(Lim.Initial, OS); + if (Lim.Flags & wasm::WASM_LIMITS_FLAG_HAS_MAX) + encodeULEB128(Lim.Maximum, OS); + return 0; +} + +void WasmWriter::reportError(const Twine &Msg) { + ErrHandler(Msg); + HasError = true; +} + +void WasmWriter::writeInitExpr(raw_ostream &OS, + const wasm::WasmInitExpr &InitExpr) { + writeUint8(OS, InitExpr.Opcode); + switch (InitExpr.Opcode) { + case wasm::WASM_OPCODE_I32_CONST: + encodeSLEB128(InitExpr.Value.Int32, OS); + break; + case wasm::WASM_OPCODE_I64_CONST: + encodeSLEB128(InitExpr.Value.Int64, OS); + break; + case wasm::WASM_OPCODE_F32_CONST: + writeUint32(OS, InitExpr.Value.Float32); + break; + case wasm::WASM_OPCODE_F64_CONST: + writeUint64(OS, InitExpr.Value.Float64); + break; + case wasm::WASM_OPCODE_GLOBAL_GET: + encodeULEB128(InitExpr.Value.Global, OS); + break; + default: + reportError("unknown opcode in init_expr: " + Twine(InitExpr.Opcode)); + return; + } + writeUint8(OS, wasm::WASM_OPCODE_END); +} + +void WasmWriter::writeSectionContent(raw_ostream &OS, + WasmYAML::DylinkSection &Section) { + writeStringRef(Section.Name, OS); + encodeULEB128(Section.MemorySize, OS); + encodeULEB128(Section.MemoryAlignment, OS); + encodeULEB128(Section.TableSize, OS); + encodeULEB128(Section.TableAlignment, OS); + encodeULEB128(Section.Needed.size(), OS); + for (StringRef Needed : Section.Needed) + writeStringRef(Needed, OS); +} + +void WasmWriter::writeSectionContent(raw_ostream &OS, + WasmYAML::LinkingSection &Section) { + writeStringRef(Section.Name, OS); + encodeULEB128(Section.Version, OS); + + SubSectionWriter SubSection(OS); + + // SYMBOL_TABLE subsection + if (Section.SymbolTable.size()) { + writeUint8(OS, wasm::WASM_SYMBOL_TABLE); + + encodeULEB128(Section.SymbolTable.size(), SubSection.getStream()); +#ifndef NDEBUG + uint32_t SymbolIndex = 0; +#endif + for (const WasmYAML::SymbolInfo &Info : Section.SymbolTable) { + assert(Info.Index == SymbolIndex++); + writeUint8(SubSection.getStream(), Info.Kind); + encodeULEB128(Info.Flags, SubSection.getStream()); + switch (Info.Kind) { + case wasm::WASM_SYMBOL_TYPE_FUNCTION: + case wasm::WASM_SYMBOL_TYPE_GLOBAL: + case wasm::WASM_SYMBOL_TYPE_EVENT: + encodeULEB128(Info.ElementIndex, SubSection.getStream()); + if ((Info.Flags & wasm::WASM_SYMBOL_UNDEFINED) == 0 || + (Info.Flags & wasm::WASM_SYMBOL_EXPLICIT_NAME) != 0) + writeStringRef(Info.Name, SubSection.getStream()); + break; + case wasm::WASM_SYMBOL_TYPE_DATA: + writeStringRef(Info.Name, SubSection.getStream()); + if ((Info.Flags & wasm::WASM_SYMBOL_UNDEFINED) == 0) { + encodeULEB128(Info.DataRef.Segment, SubSection.getStream()); + encodeULEB128(Info.DataRef.Offset, SubSection.getStream()); + encodeULEB128(Info.DataRef.Size, SubSection.getStream()); + } + break; + case wasm::WASM_SYMBOL_TYPE_SECTION: + encodeULEB128(Info.ElementIndex, SubSection.getStream()); + break; + default: + 
llvm_unreachable("unexpected kind"); + } + } + + SubSection.done(); + } + + // SEGMENT_NAMES subsection + if (Section.SegmentInfos.size()) { + writeUint8(OS, wasm::WASM_SEGMENT_INFO); + encodeULEB128(Section.SegmentInfos.size(), SubSection.getStream()); + for (const WasmYAML::SegmentInfo &SegmentInfo : Section.SegmentInfos) { + writeStringRef(SegmentInfo.Name, SubSection.getStream()); + encodeULEB128(SegmentInfo.Alignment, SubSection.getStream()); + encodeULEB128(SegmentInfo.Flags, SubSection.getStream()); + } + SubSection.done(); + } + + // INIT_FUNCS subsection + if (Section.InitFunctions.size()) { + writeUint8(OS, wasm::WASM_INIT_FUNCS); + encodeULEB128(Section.InitFunctions.size(), SubSection.getStream()); + for (const WasmYAML::InitFunction &Func : Section.InitFunctions) { + encodeULEB128(Func.Priority, SubSection.getStream()); + encodeULEB128(Func.Symbol, SubSection.getStream()); + } + SubSection.done(); + } + + // COMDAT_INFO subsection + if (Section.Comdats.size()) { + writeUint8(OS, wasm::WASM_COMDAT_INFO); + encodeULEB128(Section.Comdats.size(), SubSection.getStream()); + for (const auto &C : Section.Comdats) { + writeStringRef(C.Name, SubSection.getStream()); + encodeULEB128(0, SubSection.getStream()); // flags for future use + encodeULEB128(C.Entries.size(), SubSection.getStream()); + for (const WasmYAML::ComdatEntry &Entry : C.Entries) { + writeUint8(SubSection.getStream(), Entry.Kind); + encodeULEB128(Entry.Index, SubSection.getStream()); + } + } + SubSection.done(); + } +} + +void WasmWriter::writeSectionContent(raw_ostream &OS, + WasmYAML::NameSection &Section) { + writeStringRef(Section.Name, OS); + if (Section.FunctionNames.size()) { + writeUint8(OS, wasm::WASM_NAMES_FUNCTION); + + SubSectionWriter SubSection(OS); + + encodeULEB128(Section.FunctionNames.size(), SubSection.getStream()); + for (const WasmYAML::NameEntry &NameEntry : Section.FunctionNames) { + encodeULEB128(NameEntry.Index, SubSection.getStream()); + writeStringRef(NameEntry.Name, SubSection.getStream()); + } + + SubSection.done(); + } +} + +void WasmWriter::writeSectionContent(raw_ostream &OS, + WasmYAML::ProducersSection &Section) { + writeStringRef(Section.Name, OS); + int Fields = int(!Section.Languages.empty()) + int(!Section.Tools.empty()) + + int(!Section.SDKs.empty()); + if (Fields == 0) + return; + encodeULEB128(Fields, OS); + for (auto &Field : {std::make_pair(StringRef("language"), &Section.Languages), + std::make_pair(StringRef("processed-by"), &Section.Tools), + std::make_pair(StringRef("sdk"), &Section.SDKs)}) { + if (Field.second->empty()) + continue; + writeStringRef(Field.first, OS); + encodeULEB128(Field.second->size(), OS); + for (auto &Entry : *Field.second) { + writeStringRef(Entry.Name, OS); + writeStringRef(Entry.Version, OS); + } + } +} + +void WasmWriter::writeSectionContent(raw_ostream &OS, + WasmYAML::TargetFeaturesSection &Section) { + writeStringRef(Section.Name, OS); + encodeULEB128(Section.Features.size(), OS); + for (auto &E : Section.Features) { + writeUint8(OS, E.Prefix); + writeStringRef(E.Name, OS); + } +} + +void WasmWriter::writeSectionContent(raw_ostream &OS, + WasmYAML::CustomSection &Section) { + if (auto S = dyn_cast(&Section)) { + writeSectionContent(OS, *S); + } else if (auto S = dyn_cast(&Section)) { + writeSectionContent(OS, *S); + } else if (auto S = dyn_cast(&Section)) { + writeSectionContent(OS, *S); + } else if (auto S = dyn_cast(&Section)) { + writeSectionContent(OS, *S); + } else if (auto S = dyn_cast(&Section)) { + writeSectionContent(OS, *S); + } else { + 
writeStringRef(Section.Name, OS); + Section.Payload.writeAsBinary(OS); + } +} + +void WasmWriter::writeSectionContent(raw_ostream &OS, + WasmYAML::TypeSection &Section) { + encodeULEB128(Section.Signatures.size(), OS); + uint32_t ExpectedIndex = 0; + for (const WasmYAML::Signature &Sig : Section.Signatures) { + if (Sig.Index != ExpectedIndex) { + reportError("unexpected type index: " + Twine(Sig.Index)); + return; + } + ++ExpectedIndex; + writeUint8(OS, Sig.Form); + encodeULEB128(Sig.ParamTypes.size(), OS); + for (auto ParamType : Sig.ParamTypes) + writeUint8(OS, ParamType); + encodeULEB128(Sig.ReturnTypes.size(), OS); + for (auto ReturnType : Sig.ReturnTypes) + writeUint8(OS, ReturnType); + } +} + +void WasmWriter::writeSectionContent(raw_ostream &OS, + WasmYAML::ImportSection &Section) { + encodeULEB128(Section.Imports.size(), OS); + for (const WasmYAML::Import &Import : Section.Imports) { + writeStringRef(Import.Module, OS); + writeStringRef(Import.Field, OS); + writeUint8(OS, Import.Kind); + switch (Import.Kind) { + case wasm::WASM_EXTERNAL_FUNCTION: + encodeULEB128(Import.SigIndex, OS); + NumImportedFunctions++; + break; + case wasm::WASM_EXTERNAL_GLOBAL: + writeUint8(OS, Import.GlobalImport.Type); + writeUint8(OS, Import.GlobalImport.Mutable); + NumImportedGlobals++; + break; + case wasm::WASM_EXTERNAL_EVENT: + writeUint32(OS, Import.EventImport.Attribute); + writeUint32(OS, Import.EventImport.SigIndex); + NumImportedGlobals++; + break; + case wasm::WASM_EXTERNAL_MEMORY: + writeLimits(Import.Memory, OS); + break; + case wasm::WASM_EXTERNAL_TABLE: + writeUint8(OS, Import.TableImport.ElemType); + writeLimits(Import.TableImport.TableLimits, OS); + break; + default: + reportError("unknown import type: " +Twine(Import.Kind)); + return; + } + } +} + +void WasmWriter::writeSectionContent(raw_ostream &OS, + WasmYAML::FunctionSection &Section) { + encodeULEB128(Section.FunctionTypes.size(), OS); + for (uint32_t FuncType : Section.FunctionTypes) + encodeULEB128(FuncType, OS); +} + +void WasmWriter::writeSectionContent(raw_ostream &OS, + WasmYAML::ExportSection &Section) { + encodeULEB128(Section.Exports.size(), OS); + for (const WasmYAML::Export &Export : Section.Exports) { + writeStringRef(Export.Name, OS); + writeUint8(OS, Export.Kind); + encodeULEB128(Export.Index, OS); + } +} + +void WasmWriter::writeSectionContent(raw_ostream &OS, + WasmYAML::StartSection &Section) { + encodeULEB128(Section.StartFunction, OS); +} + +void WasmWriter::writeSectionContent(raw_ostream &OS, + WasmYAML::TableSection &Section) { + encodeULEB128(Section.Tables.size(), OS); + for (auto &Table : Section.Tables) { + writeUint8(OS, Table.ElemType); + writeLimits(Table.TableLimits, OS); + } +} + +void WasmWriter::writeSectionContent(raw_ostream &OS, + WasmYAML::MemorySection &Section) { + encodeULEB128(Section.Memories.size(), OS); + for (const WasmYAML::Limits &Mem : Section.Memories) + writeLimits(Mem, OS); +} + +void WasmWriter::writeSectionContent(raw_ostream &OS, + WasmYAML::GlobalSection &Section) { + encodeULEB128(Section.Globals.size(), OS); + uint32_t ExpectedIndex = NumImportedGlobals; + for (auto &Global : Section.Globals) { + if (Global.Index != ExpectedIndex) { + reportError("unexpected global index: " + Twine(Global.Index)); + return; + } + ++ExpectedIndex; + writeUint8(OS, Global.Type); + writeUint8(OS, Global.Mutable); + writeInitExpr(OS, Global.InitExpr); + } +} + +void WasmWriter::writeSectionContent(raw_ostream &OS, + WasmYAML::EventSection &Section) { + encodeULEB128(Section.Events.size(), OS); + 
uint32_t ExpectedIndex = NumImportedEvents; + for (auto &Event : Section.Events) { + if (Event.Index != ExpectedIndex) { + reportError("unexpected event index: " + Twine(Event.Index)); + return; + } + ++ExpectedIndex; + encodeULEB128(Event.Attribute, OS); + encodeULEB128(Event.SigIndex, OS); + } +} + +void WasmWriter::writeSectionContent(raw_ostream &OS, + WasmYAML::ElemSection &Section) { + encodeULEB128(Section.Segments.size(), OS); + for (auto &Segment : Section.Segments) { + encodeULEB128(Segment.TableIndex, OS); + writeInitExpr(OS, Segment.Offset); + + encodeULEB128(Segment.Functions.size(), OS); + for (auto &Function : Segment.Functions) + encodeULEB128(Function, OS); + } +} + +void WasmWriter::writeSectionContent(raw_ostream &OS, + WasmYAML::CodeSection &Section) { + encodeULEB128(Section.Functions.size(), OS); + uint32_t ExpectedIndex = NumImportedFunctions; + for (auto &Func : Section.Functions) { + std::string OutString; + raw_string_ostream StringStream(OutString); + if (Func.Index != ExpectedIndex) { + reportError("unexpected function index: " + Twine(Func.Index)); + return; + } + ++ExpectedIndex; + + encodeULEB128(Func.Locals.size(), StringStream); + for (auto &LocalDecl : Func.Locals) { + encodeULEB128(LocalDecl.Count, StringStream); + writeUint8(StringStream, LocalDecl.Type); + } + + Func.Body.writeAsBinary(StringStream); + + // Write the section size followed by the content + StringStream.flush(); + encodeULEB128(OutString.size(), OS); + OS << OutString; + } +} + +void WasmWriter::writeSectionContent(raw_ostream &OS, + WasmYAML::DataSection &Section) { + encodeULEB128(Section.Segments.size(), OS); + for (auto &Segment : Section.Segments) { + encodeULEB128(Segment.InitFlags, OS); + if (Segment.InitFlags & wasm::WASM_SEGMENT_HAS_MEMINDEX) + encodeULEB128(Segment.MemoryIndex, OS); + if ((Segment.InitFlags & wasm::WASM_SEGMENT_IS_PASSIVE) == 0) + writeInitExpr(OS, Segment.Offset); + encodeULEB128(Segment.Content.binary_size(), OS); + Segment.Content.writeAsBinary(OS); + } +} + +void WasmWriter::writeSectionContent(raw_ostream &OS, + WasmYAML::DataCountSection &Section) { + encodeULEB128(Section.Count, OS); +} + +void WasmWriter::writeRelocSection(raw_ostream &OS, WasmYAML::Section &Sec, + uint32_t SectionIndex) { + switch (Sec.Type) { + case wasm::WASM_SEC_CODE: + writeStringRef("reloc.CODE", OS); + break; + case wasm::WASM_SEC_DATA: + writeStringRef("reloc.DATA", OS); + break; + case wasm::WASM_SEC_CUSTOM: { + auto *CustomSection = cast(&Sec); + writeStringRef(("reloc." 
+ CustomSection->Name).str(), OS); + break; + } + default: + llvm_unreachable("not yet implemented"); + } + + encodeULEB128(SectionIndex, OS); + encodeULEB128(Sec.Relocations.size(), OS); + + for (auto Reloc : Sec.Relocations) { + writeUint8(OS, Reloc.Type); + encodeULEB128(Reloc.Offset, OS); + encodeULEB128(Reloc.Index, OS); + switch (Reloc.Type) { + case wasm::R_WASM_MEMORY_ADDR_LEB: + case wasm::R_WASM_MEMORY_ADDR_SLEB: + case wasm::R_WASM_MEMORY_ADDR_I32: + case wasm::R_WASM_FUNCTION_OFFSET_I32: + case wasm::R_WASM_SECTION_OFFSET_I32: + encodeULEB128(Reloc.Addend, OS); + } + } +} + +bool WasmWriter::writeWasm(raw_ostream &OS) { + // Write headers + OS.write(wasm::WasmMagic, sizeof(wasm::WasmMagic)); + writeUint32(OS, Obj.Header.Version); + + // Write each section + llvm::object::WasmSectionOrderChecker Checker; + for (const std::unique_ptr &Sec : Obj.Sections) { + StringRef SecName = ""; + if (auto S = dyn_cast(Sec.get())) + SecName = S->Name; + if (!Checker.isValidSectionOrder(Sec->Type, SecName)) { + reportError("out of order section type: " + Twine(Sec->Type)); + return false; + } + encodeULEB128(Sec->Type, OS); + std::string OutString; + raw_string_ostream StringStream(OutString); + if (auto S = dyn_cast(Sec.get())) + writeSectionContent(StringStream, *S); + else if (auto S = dyn_cast(Sec.get())) + writeSectionContent(StringStream, *S); + else if (auto S = dyn_cast(Sec.get())) + writeSectionContent(StringStream, *S); + else if (auto S = dyn_cast(Sec.get())) + writeSectionContent(StringStream, *S); + else if (auto S = dyn_cast(Sec.get())) + writeSectionContent(StringStream, *S); + else if (auto S = dyn_cast(Sec.get())) + writeSectionContent(StringStream, *S); + else if (auto S = dyn_cast(Sec.get())) + writeSectionContent(StringStream, *S); + else if (auto S = dyn_cast(Sec.get())) + writeSectionContent(StringStream, *S); + else if (auto S = dyn_cast(Sec.get())) + writeSectionContent(StringStream, *S); + else if (auto S = dyn_cast(Sec.get())) + writeSectionContent(StringStream, *S); + else if (auto S = dyn_cast(Sec.get())) + writeSectionContent(StringStream, *S); + else if (auto S = dyn_cast(Sec.get())) + writeSectionContent(StringStream, *S); + else if (auto S = dyn_cast(Sec.get())) + writeSectionContent(StringStream, *S); + else if (auto S = dyn_cast(Sec.get())) + writeSectionContent(StringStream, *S); + else + reportError("unknown section type: " + Twine(Sec->Type)); + + if (HasError) + return false; + + StringStream.flush(); + + // Write the section size followed by the content + encodeULEB128(OutString.size(), OS); + OS << OutString; + } + + // write reloc sections for any section that have relocations + uint32_t SectionIndex = 0; + for (const std::unique_ptr &Sec : Obj.Sections) { + if (Sec->Relocations.empty()) { + SectionIndex++; + continue; + } + + writeUint8(OS, wasm::WASM_SEC_CUSTOM); + std::string OutString; + raw_string_ostream StringStream(OutString); + writeRelocSection(StringStream, *Sec, SectionIndex++); + StringStream.flush(); + + encodeULEB128(OutString.size(), OS); + OS << OutString; + } + + return true; +} + +namespace llvm { +namespace yaml { + +bool yaml2wasm(WasmYAML::Object &Doc, raw_ostream &Out, ErrorHandler EH) { + WasmWriter Writer(Doc, EH); + return Writer.writeWasm(Out); +} + +} // namespace yaml +} // namespace llvm diff --git a/lib/ObjectYAML/WasmYAML.cpp b/lib/ObjectYAML/WasmYAML.cpp index 88491d955c49..232d5122004a 100644 --- a/lib/ObjectYAML/WasmYAML.cpp +++ b/lib/ObjectYAML/WasmYAML.cpp @@ -295,8 +295,8 @@ void ScalarEnumerationTraits::enumeration( 
void MappingTraits::mapping( IO &IO, WasmYAML::Signature &Signature) { IO.mapRequired("Index", Signature.Index); - IO.mapRequired("ReturnType", Signature.ReturnType); IO.mapRequired("ParamTypes", Signature.ParamTypes); + IO.mapRequired("ReturnTypes", Signature.ReturnTypes); } void MappingTraits::mapping(IO &IO, WasmYAML::Table &Table) { @@ -535,6 +535,7 @@ void ScalarBitSetTraits::bitset( BCaseMask(UNDEFINED, UNDEFINED); BCaseMask(EXPORTED, EXPORTED); BCaseMask(EXPLICIT_NAME, EXPLICIT_NAME); + BCaseMask(NO_STRIP, NO_STRIP); #undef BCaseMask } @@ -559,7 +560,6 @@ void ScalarEnumerationTraits::enumeration( ECase(V128); ECase(FUNCREF); ECase(FUNC); - ECase(NORESULT); #undef ECase } diff --git a/lib/ObjectYAML/yaml2obj.cpp b/lib/ObjectYAML/yaml2obj.cpp new file mode 100644 index 000000000000..c18fa5cfdb5e --- /dev/null +++ b/lib/ObjectYAML/yaml2obj.cpp @@ -0,0 +1,77 @@ +//===-- yaml2obj.cpp ------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/ObjectYAML/yaml2obj.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/Twine.h" +#include "llvm/Object/ObjectFile.h" +#include "llvm/ObjectYAML/ObjectYAML.h" +#include "llvm/Support/Errc.h" +#include "llvm/Support/WithColor.h" +#include "llvm/Support/YAMLTraits.h" + +namespace llvm { +namespace yaml { + +bool convertYAML(yaml::Input &YIn, raw_ostream &Out, ErrorHandler ErrHandler, + unsigned DocNum) { + unsigned CurDocNum = 0; + do { + if (++CurDocNum != DocNum) + continue; + + yaml::YamlObjectFile Doc; + YIn >> Doc; + if (std::error_code EC = YIn.error()) { + ErrHandler("failed to parse YAML input: " + EC.message()); + return false; + } + + if (Doc.Elf) + return yaml2elf(*Doc.Elf, Out, ErrHandler); + if (Doc.Coff) + return yaml2coff(*Doc.Coff, Out, ErrHandler); + if (Doc.MachO || Doc.FatMachO) + return yaml2macho(Doc, Out, ErrHandler); + if (Doc.Minidump) + return yaml2minidump(*Doc.Minidump, Out, ErrHandler); + if (Doc.Wasm) + return yaml2wasm(*Doc.Wasm, Out, ErrHandler); + + ErrHandler("unknown document type"); + return false; + + } while (YIn.nextDocument()); + + ErrHandler("cannot find the " + Twine(DocNum) + + getOrdinalSuffix(DocNum).data() + " document"); + return false; +} + +std::unique_ptr +yaml2ObjectFile(SmallVectorImpl &Storage, StringRef Yaml, + ErrorHandler ErrHandler) { + Storage.clear(); + raw_svector_ostream OS(Storage); + + yaml::Input YIn(Yaml); + if (!convertYAML(YIn, OS, ErrHandler)) + return {}; + + Expected> ObjOrErr = + object::ObjectFile::createObjectFile( + MemoryBufferRef(OS.str(), "YamlObject")); + if (ObjOrErr) + return std::move(*ObjOrErr); + + ErrHandler(toString(ObjOrErr.takeError())); + return {}; +} + +} // namespace yaml +} // namespace llvm diff --git a/lib/Option/ArgList.cpp b/lib/Option/ArgList.cpp index f37c142da69b..09e921502eb6 100644 --- a/lib/Option/ArgList.cpp +++ b/lib/Option/ArgList.cpp @@ -241,7 +241,7 @@ void DerivedArgList::AddSynthesizedArg(Arg *A) { Arg *DerivedArgList::MakeFlagArg(const Arg *BaseArg, const Option Opt) const { SynthesizedArgs.push_back( - make_unique(Opt, MakeArgString(Opt.getPrefix() + Opt.getName()), + std::make_unique(Opt, MakeArgString(Opt.getPrefix() + Opt.getName()), BaseArgs.MakeIndex(Opt.getName()), BaseArg)); return SynthesizedArgs.back().get(); } @@ -250,7 
+250,7 @@ Arg *DerivedArgList::MakePositionalArg(const Arg *BaseArg, const Option Opt, StringRef Value) const { unsigned Index = BaseArgs.MakeIndex(Value); SynthesizedArgs.push_back( - make_unique(Opt, MakeArgString(Opt.getPrefix() + Opt.getName()), + std::make_unique(Opt, MakeArgString(Opt.getPrefix() + Opt.getName()), Index, BaseArgs.getArgString(Index), BaseArg)); return SynthesizedArgs.back().get(); } @@ -259,7 +259,7 @@ Arg *DerivedArgList::MakeSeparateArg(const Arg *BaseArg, const Option Opt, StringRef Value) const { unsigned Index = BaseArgs.MakeIndex(Opt.getName(), Value); SynthesizedArgs.push_back( - make_unique(Opt, MakeArgString(Opt.getPrefix() + Opt.getName()), + std::make_unique(Opt, MakeArgString(Opt.getPrefix() + Opt.getName()), Index, BaseArgs.getArgString(Index + 1), BaseArg)); return SynthesizedArgs.back().get(); } @@ -267,7 +267,7 @@ Arg *DerivedArgList::MakeSeparateArg(const Arg *BaseArg, const Option Opt, Arg *DerivedArgList::MakeJoinedArg(const Arg *BaseArg, const Option Opt, StringRef Value) const { unsigned Index = BaseArgs.MakeIndex((Opt.getName() + Value).str()); - SynthesizedArgs.push_back(make_unique( + SynthesizedArgs.push_back(std::make_unique( Opt, MakeArgString(Opt.getPrefix() + Opt.getName()), Index, BaseArgs.getArgString(Index) + Opt.getName().size(), BaseArg)); return SynthesizedArgs.back().get(); diff --git a/lib/Passes/PassBuilder.cpp b/lib/Passes/PassBuilder.cpp index e2b2a2b25268..1aaccb510f8c 100644 --- a/lib/Passes/PassBuilder.cpp +++ b/lib/Passes/PassBuilder.cpp @@ -27,6 +27,7 @@ #include "llvm/Analysis/CFLSteensAliasAnalysis.h" #include "llvm/Analysis/CGSCCPassManager.h" #include "llvm/Analysis/CallGraph.h" +#include "llvm/Analysis/DDG.h" #include "llvm/Analysis/DemandedBits.h" #include "llvm/Analysis/DependenceAnalysis.h" #include "llvm/Analysis/DominanceFrontier.h" @@ -35,6 +36,7 @@ #include "llvm/Analysis/LazyCallGraph.h" #include "llvm/Analysis/LazyValueInfo.h" #include "llvm/Analysis/LoopAccessAnalysis.h" +#include "llvm/Analysis/LoopCacheAnalysis.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/MemoryDependenceAnalysis.h" #include "llvm/Analysis/MemorySSA.h" @@ -51,6 +53,7 @@ #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/TypeBasedAliasAnalysis.h" +#include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/PreISelIntrinsicLowering.h" #include "llvm/CodeGen/UnreachableBlockElim.h" #include "llvm/IR/Dominators.h" @@ -101,6 +104,7 @@ #include "llvm/Transforms/Instrumentation/MemorySanitizer.h" #include "llvm/Transforms/Instrumentation/PGOInstrumentation.h" #include "llvm/Transforms/Instrumentation/PoisonChecking.h" +#include "llvm/Transforms/Instrumentation/SanitizerCoverage.h" #include "llvm/Transforms/Instrumentation/ThreadSanitizer.h" #include "llvm/Transforms/Scalar/ADCE.h" #include "llvm/Transforms/Scalar/AlignmentFromAssumptions.h" @@ -138,13 +142,14 @@ #include "llvm/Transforms/Scalar/LoopUnrollAndJamPass.h" #include "llvm/Transforms/Scalar/LoopUnrollPass.h" #include "llvm/Transforms/Scalar/LowerAtomic.h" +#include "llvm/Transforms/Scalar/LowerConstantIntrinsics.h" #include "llvm/Transforms/Scalar/LowerExpectIntrinsic.h" #include "llvm/Transforms/Scalar/LowerGuardIntrinsic.h" #include "llvm/Transforms/Scalar/LowerWidenableCondition.h" #include "llvm/Transforms/Scalar/MakeGuardsExplicit.h" #include "llvm/Transforms/Scalar/MemCpyOptimizer.h" -#include "llvm/Transforms/Scalar/MergedLoadStoreMotion.h" #include "llvm/Transforms/Scalar/MergeICmps.h" 
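Stepping back to the new lib/ObjectYAML/yaml2obj.cpp above: convertYAML and yaml2ObjectFile are the entry points the yaml2obj tool and unit tests build on. A minimal usage sketch, not part of the imported sources; the wrapper name makeTestObject and the error-printing lambda are assumptions, while the yaml2ObjectFile call follows the signature introduced in that file (Storage must outlive the returned object, which refers into it).

#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Object/ObjectFile.h"
#include "llvm/ObjectYAML/yaml2obj.h"
#include "llvm/Support/raw_ostream.h"

// Build an in-memory ObjectFile from a YAML description of an object file.
static std::unique_ptr<llvm::object::ObjectFile>
makeTestObject(llvm::SmallVectorImpl<char> &Storage, llvm::StringRef Yaml) {
  auto PrintError = [](const llvm::Twine &Msg) {
    llvm::errs() << "yaml2obj: " << Msg << "\n";
  };
  // Parses the YAML, emits the binary into Storage, and re-reads it as an
  // ObjectFile; returns nullptr (after calling PrintError) on failure.
  return llvm::yaml::yaml2ObjectFile(Storage, Yaml, PrintError);
}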
+#include "llvm/Transforms/Scalar/MergedLoadStoreMotion.h" #include "llvm/Transforms/Scalar/NaryReassociate.h" #include "llvm/Transforms/Scalar/NewGVN.h" #include "llvm/Transforms/Scalar/PartiallyInlineLibCalls.h" @@ -206,7 +211,7 @@ static cl::opt EnableSyntheticCounts( cl::desc("Run synthetic function entry count generation " "pass")); -static Regex DefaultAliasRegex( +static const Regex DefaultAliasRegex( "^(default|thinlto-pre-link|thinlto|lto-pre-link|lto)<(O[0123sz])>$"); // This option is used in simplifying testing SampleFDO optimizations for @@ -466,8 +471,8 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level, if ((Phase != ThinLTOPhase::PreLink || !PGOOpt || PGOOpt->Action != PGOOptions::SampleUse) && PTO.LoopUnrolling) - LPM2.addPass( - LoopFullUnrollPass(Level, false, PTO.ForgetAllSCEVInLoopUnroll)); + LPM2.addPass(LoopFullUnrollPass(Level, /*OnlyWhenForced=*/false, + PTO.ForgetAllSCEVInLoopUnroll)); for (auto &C : LoopOptimizerEndEPCallbacks) C(LPM2, Level); @@ -475,10 +480,15 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level, // We provide the opt remark emitter pass for LICM to use. We only need to do // this once as it is immutable. FPM.addPass(RequireAnalysisPass()); - FPM.addPass(createFunctionToLoopPassAdaptor(std::move(LPM1), DebugLogging)); + FPM.addPass(createFunctionToLoopPassAdaptor( + std::move(LPM1), EnableMSSALoopDependency, DebugLogging)); FPM.addPass(SimplifyCFGPass()); FPM.addPass(InstCombinePass()); - FPM.addPass(createFunctionToLoopPassAdaptor(std::move(LPM2), DebugLogging)); + // The loop passes in LPM2 (IndVarSimplifyPass, LoopIdiomRecognizePass, + // LoopDeletionPass and LoopFullUnrollPass) do not preserve MemorySSA. + // *All* loop passes must preserve it, in order to be able to use it. + FPM.addPass(createFunctionToLoopPassAdaptor( + std::move(LPM2), /*UseMemorySSA=*/false, DebugLogging)); // Eliminate redundancies. if (Level != O1) { @@ -515,7 +525,7 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level, FPM.addPass(DSEPass()); FPM.addPass(createFunctionToLoopPassAdaptor( LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap), - DebugLogging)); + EnableMSSALoopDependency, DebugLogging)); for (auto &C : ScalarOptimizerLateEPCallbacks) C(FPM, Level); @@ -540,6 +550,7 @@ void PassBuilder::addPGOInstrPasses(ModulePassManager &MPM, bool DebugLogging, bool RunProfileGen, bool IsCS, std::string ProfileFile, std::string ProfileRemappingFile) { + assert(Level != O0 && "Not expecting O0 here!"); // Generally running simplification passes and the inliner with an high // threshold results in smaller executables, but there may be cases where // the size grows, so let's be conservative here and skip this simplification @@ -570,34 +581,63 @@ void PassBuilder::addPGOInstrPasses(ModulePassManager &MPM, bool DebugLogging, CGPipeline.addPass(createCGSCCToFunctionPassAdaptor(std::move(FPM))); MPM.addPass(createModuleToPostOrderCGSCCPassAdaptor(std::move(CGPipeline))); + + // Delete anything that is now dead to make sure that we don't instrument + // dead code. Instrumentation can end up keeping dead code around and + // dramatically increase code size. + MPM.addPass(GlobalDCEPass()); } - // Delete anything that is now dead to make sure that we don't instrument - // dead code. Instrumentation can end up keeping dead code around and - // dramatically increase code size. 
- MPM.addPass(GlobalDCEPass()); + if (!RunProfileGen) { + assert(!ProfileFile.empty() && "Profile use expecting a profile file!"); + MPM.addPass(PGOInstrumentationUse(ProfileFile, ProfileRemappingFile, IsCS)); + // Cache ProfileSummaryAnalysis once to avoid the potential need to insert + // RequireAnalysisPass for PSI before subsequent non-module passes. + MPM.addPass(RequireAnalysisPass()); + return; + } - if (RunProfileGen) { - MPM.addPass(PGOInstrumentationGen(IsCS)); + // Perform PGO instrumentation. + MPM.addPass(PGOInstrumentationGen(IsCS)); - FunctionPassManager FPM; - FPM.addPass( - createFunctionToLoopPassAdaptor(LoopRotatePass(), DebugLogging)); - MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM))); - - // Add the profile lowering pass. - InstrProfOptions Options; - if (!ProfileFile.empty()) - Options.InstrProfileOutput = ProfileFile; - Options.DoCounterPromotion = true; - Options.UseBFIInPromotion = IsCS; - MPM.addPass(InstrProfiling(Options, IsCS)); - } else if (!ProfileFile.empty()) { + FunctionPassManager FPM; + FPM.addPass(createFunctionToLoopPassAdaptor( + LoopRotatePass(), EnableMSSALoopDependency, DebugLogging)); + MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM))); + + // Add the profile lowering pass. + InstrProfOptions Options; + if (!ProfileFile.empty()) + Options.InstrProfileOutput = ProfileFile; + // Do counter promotion at Level greater than O0. + Options.DoCounterPromotion = true; + Options.UseBFIInPromotion = IsCS; + MPM.addPass(InstrProfiling(Options, IsCS)); +} + +void PassBuilder::addPGOInstrPassesForO0(ModulePassManager &MPM, + bool DebugLogging, bool RunProfileGen, + bool IsCS, std::string ProfileFile, + std::string ProfileRemappingFile) { + if (!RunProfileGen) { + assert(!ProfileFile.empty() && "Profile use expecting a profile file!"); MPM.addPass(PGOInstrumentationUse(ProfileFile, ProfileRemappingFile, IsCS)); // Cache ProfileSummaryAnalysis once to avoid the potential need to insert // RequireAnalysisPass for PSI before subsequent non-module passes. MPM.addPass(RequireAnalysisPass()); + return; } + + // Perform PGO instrumentation. + MPM.addPass(PGOInstrumentationGen(IsCS)); + // Add the profile lowering pass. + InstrProfOptions Options; + if (!ProfileFile.empty()) + Options.InstrProfileOutput = ProfileFile; + // Do not do counter promotion at O0. + Options.DoCounterPromotion = false; + Options.UseBFIInPromotion = IsCS; + MPM.addPass(InstrProfiling(Options, IsCS)); } static InlineParams @@ -852,6 +892,8 @@ ModulePassManager PassBuilder::buildModuleOptimizationPipeline( FunctionPassManager OptimizePM(DebugLogging); OptimizePM.addPass(Float2IntPass()); + OptimizePM.addPass(LowerConstantIntrinsicsPass()); + // FIXME: We need to run some loop optimizations to re-rotate loops after // simplify-cfg and others undo their rotation. @@ -863,8 +905,8 @@ ModulePassManager PassBuilder::buildModuleOptimizationPipeline( C(OptimizePM, Level); // First rotate loops that may have been un-rotated by prior passes. - OptimizePM.addPass( - createFunctionToLoopPassAdaptor(LoopRotatePass(), DebugLogging)); + OptimizePM.addPass(createFunctionToLoopPassAdaptor( + LoopRotatePass(), EnableMSSALoopDependency, DebugLogging)); // Distribute loops to allow partial vectorization. I.e. isolate dependences // into separate loop that would otherwise inhibit vectorization. This is @@ -911,19 +953,19 @@ ModulePassManager PassBuilder::buildModuleOptimizationPipeline( // combiner for cleanup here so that the unrolling and LICM can be pipelined // across the loop nests. 
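Several call sites in this file now pass an extra boolean to createFunctionToLoopPassAdaptor selecting MemorySSA-backed loop execution, and as the new comment notes, every pass run under such an adaptor must preserve MemorySSA. A minimal sketch of that pattern, assuming the LICMPass default constructor and the three-argument adaptor used in this hunk:

#include "llvm/IR/PassManager.h"
#include "llvm/Transforms/Scalar/LICM.h"
#include "llvm/Transforms/Scalar/LoopPassManager.h"
using namespace llvm;

// Run LICM under a MemorySSA-enabled loop adaptor; the middle argument is
// the UseMemorySSA flag threaded through PassBuilder in this patch.
static void addLICMWithMSSA(FunctionPassManager &FPM, bool DebugLogging) {
  LoopPassManager LPM(DebugLogging);
  LPM.addPass(LICMPass()); // LICM preserves MemorySSA, so this is legal here.
  FPM.addPass(createFunctionToLoopPassAdaptor(std::move(LPM),
                                              /*UseMemorySSA=*/true,
                                              DebugLogging));
}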
// We do UnrollAndJam in a separate LPM to ensure it happens before unroll - if (EnableUnrollAndJam) { + if (EnableUnrollAndJam && PTO.LoopUnrolling) { OptimizePM.addPass( createFunctionToLoopPassAdaptor(LoopUnrollAndJamPass(Level))); } - if (PTO.LoopUnrolling) - OptimizePM.addPass(LoopUnrollPass( - LoopUnrollOptions(Level, false, PTO.ForgetAllSCEVInLoopUnroll))); + OptimizePM.addPass(LoopUnrollPass( + LoopUnrollOptions(Level, /*OnlyWhenForced=*/!PTO.LoopUnrolling, + PTO.ForgetAllSCEVInLoopUnroll))); OptimizePM.addPass(WarnMissedTransformationsPass()); OptimizePM.addPass(InstCombinePass()); OptimizePM.addPass(RequireAnalysisPass()); OptimizePM.addPass(createFunctionToLoopPassAdaptor( LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap), - DebugLogging)); + EnableMSSALoopDependency, DebugLogging)); // Now that we've vectorized and unrolled loops, we may have more refined // alignment information, try to re-derive it here. @@ -1422,12 +1464,23 @@ Expected parseLoopUnrollOptions(StringRef Params) { UnrollOpts.setOptLevel(OptLevel); continue; } + if (ParamName.consume_front("full-unroll-max=")) { + int Count; + if (ParamName.getAsInteger(0, Count)) + return make_error( + formatv("invalid LoopUnrollPass parameter '{0}' ", ParamName).str(), + inconvertibleErrorCode()); + UnrollOpts.setFullUnrollMaxCount(Count); + continue; + } bool Enable = !ParamName.consume_front("no-"); if (ParamName == "partial") { UnrollOpts.setPartial(Enable); } else if (ParamName == "peeling") { UnrollOpts.setPeeling(Enable); + } else if (ParamName == "profile-peeling") { + UnrollOpts.setProfileBasedPeeling(Enable); } else if (ParamName == "runtime") { UnrollOpts.setRuntime(Enable); } else if (ParamName == "upperbound") { @@ -1542,6 +1595,26 @@ Expected parseLoopUnswitchOptions(StringRef Params) { } return Result; } + +Expected parseMergedLoadStoreMotionOptions(StringRef Params) { + bool Result = false; + while (!Params.empty()) { + StringRef ParamName; + std::tie(ParamName, Params) = Params.split(';'); + + bool Enable = !ParamName.consume_front("no-"); + if (ParamName == "split-footer-bb") { + Result = Enable; + } else { + return make_error( + formatv("invalid MergedLoadStoreMotion pass parameter '{0}' ", + ParamName) + .str(), + inconvertibleErrorCode()); + } + } + return Result; +} } // namespace /// Tests whether a pass name starts with a valid prefix for a default pipeline @@ -1629,7 +1702,7 @@ static bool isFunctionPassName(StringRef Name, CallbacksT &Callbacks) { // Explicitly handle pass manager names. if (Name == "function") return true; - if (Name == "loop") + if (Name == "loop" || Name == "loop-mssa") return true; // Explicitly handle custom-parsed pass names. @@ -1653,7 +1726,7 @@ static bool isFunctionPassName(StringRef Name, CallbacksT &Callbacks) { template static bool isLoopPassName(StringRef Name, CallbacksT &Callbacks) { // Explicitly handle pass manager names. - if (Name == "loop") + if (Name == "loop" || Name == "loop-mssa") return true; // Explicitly handle custom-parsed pass names. @@ -1800,9 +1873,19 @@ Error PassBuilder::parseModulePass(ModulePassManager &MPM, .Case("O3", O3) .Case("Os", Os) .Case("Oz", Oz); - if (L == O0) - // At O0 we do nothing at all! + if (L == O0) { + // Add instrumentation PGO passes -- at O0 we can still do PGO. 
+ if (PGOOpt && Matches[1] != "thinlto" && + (PGOOpt->Action == PGOOptions::IRInstr || + PGOOpt->Action == PGOOptions::IRUse)) + addPGOInstrPassesForO0( + MPM, DebugLogging, + /* RunProfileGen */ (PGOOpt->Action == PGOOptions::IRInstr), + /* IsCS */ false, PGOOpt->ProfileFile, + PGOOpt->ProfileRemappingFile); + // Do nothing else at all! return Error::success(); + } if (Matches[1] == "default") { MPM.addPass(buildPerModuleDefaultPipeline(L, DebugLogging)); @@ -1947,14 +2030,15 @@ Error PassBuilder::parseFunctionPass(FunctionPassManager &FPM, FPM.addPass(std::move(NestedFPM)); return Error::success(); } - if (Name == "loop") { + if (Name == "loop" || Name == "loop-mssa") { LoopPassManager LPM(DebugLogging); if (auto Err = parseLoopPassPipeline(LPM, InnerPipeline, VerifyEachPass, DebugLogging)) return Err; // Add the nested pass manager with the appropriate adaptor. - FPM.addPass( - createFunctionToLoopPassAdaptor(std::move(LPM), DebugLogging)); + bool UseMemorySSA = (Name == "loop-mssa"); + FPM.addPass(createFunctionToLoopPassAdaptor(std::move(LPM), UseMemorySSA, + DebugLogging)); return Error::success(); } if (auto Count = parseRepeatPassName(Name)) { diff --git a/lib/Passes/PassRegistry.def b/lib/Passes/PassRegistry.def index 347f75870eb3..1fa274d172b1 100644 --- a/lib/Passes/PassRegistry.def +++ b/lib/Passes/PassRegistry.def @@ -24,7 +24,6 @@ MODULE_ANALYSIS("module-summary", ModuleSummaryIndexAnalysis()) MODULE_ANALYSIS("no-op-module", NoOpModuleAnalysis()) MODULE_ANALYSIS("profile-summary", ProfileSummaryAnalysis()) MODULE_ANALYSIS("stack-safety", StackSafetyGlobalAnalysis()) -MODULE_ANALYSIS("targetlibinfo", TargetLibraryAnalysis()) MODULE_ANALYSIS("verify", VerifierAnalysis()) MODULE_ANALYSIS("pass-instrumentation", PassInstrumentationAnalysis(PIC)) MODULE_ANALYSIS("asan-globals-md", ASanGlobalsMetadataAnalysis()) @@ -87,7 +86,10 @@ MODULE_PASS("synthetic-counts-propagation", SyntheticCountsPropagation()) MODULE_PASS("wholeprogramdevirt", WholeProgramDevirtPass(nullptr, nullptr)) MODULE_PASS("verify", VerifierPass()) MODULE_PASS("asan-module", ModuleAddressSanitizerPass(/*CompileKernel=*/false, false, true, false)) +MODULE_PASS("msan-module", MemorySanitizerPass({})) +MODULE_PASS("tsan-module", ThreadSanitizerPass()) MODULE_PASS("kasan-module", ModuleAddressSanitizerPass(/*CompileKernel=*/true, false, true, false)) +MODULE_PASS("sancov-module", ModuleSanitizerCoveragePass()) MODULE_PASS("poison-checking", PoisonCheckingPass()) #undef MODULE_PASS @@ -185,6 +187,7 @@ FUNCTION_PASS("libcalls-shrinkwrap", LibCallsShrinkWrapPass()) FUNCTION_PASS("loweratomic", LowerAtomicPass()) FUNCTION_PASS("lower-expect", LowerExpectIntrinsicPass()) FUNCTION_PASS("lower-guard-intrinsic", LowerGuardIntrinsicPass()) +FUNCTION_PASS("lower-constant-intrinsics", LowerConstantIntrinsicsPass()) FUNCTION_PASS("lower-widenable-condition", LowerWidenableConditionPass()) FUNCTION_PASS("guard-widening", GuardWideningPass()) FUNCTION_PASS("gvn", GVN()) @@ -195,7 +198,6 @@ FUNCTION_PASS("lowerinvoke", LowerInvokePass()) FUNCTION_PASS("mem2reg", PromotePass()) FUNCTION_PASS("memcpyopt", MemCpyOptPass()) FUNCTION_PASS("mergeicmps", MergeICmpsPass()) -FUNCTION_PASS("mldst-motion", MergedLoadStoreMotionPass()) FUNCTION_PASS("nary-reassociate", NaryReassociatePass()) FUNCTION_PASS("newgvn", NewGVNPass()) FUNCTION_PASS("jump-threading", JumpThreadingPass()) @@ -270,6 +272,11 @@ FUNCTION_PASS_WITH_PARAMS("loop-vectorize", return LoopVectorizePass(Opts); }, parseLoopVectorizeOptions) 
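The registry entries in this file ("loop-mssa" as a loop-manager name, the parameterized "mldst-motion" added below, and the new unroll parameters) are reachable through the textual pipeline parser. A rough, test-style sketch, not part of the patch: the pipeline string is illustrative, and it assumes "licm" is registered as a loop pass elsewhere in PassRegistry.def.

#include "llvm/IR/PassManager.h"
#include "llvm/Passes/PassBuilder.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;

// Parse a textual pipeline that exercises the MemorySSA-enabled loop manager
// and the parameterized merged-load-store-motion pass.
static bool buildExamplePipeline(ModulePassManager &MPM) {
  PassBuilder PB;
  LoopAnalysisManager LAM;
  FunctionAnalysisManager FAM;
  CGSCCAnalysisManager CGAM;
  ModuleAnalysisManager MAM;
  PB.registerModuleAnalyses(MAM);
  PB.registerCGSCCAnalyses(CGAM);
  PB.registerFunctionAnalyses(FAM);
  PB.registerLoopAnalyses(LAM);
  PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);

  if (Error Err = PB.parsePassPipeline(
          MPM, "function(loop-mssa(licm),mldst-motion<split-footer-bb>)")) {
    logAllUnhandledErrors(std::move(Err), errs());
    return false;
  }
  return true;
}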
+FUNCTION_PASS_WITH_PARAMS("mldst-motion", + [](MergedLoadStoreMotionOptions Opts) { + return MergedLoadStoreMotionPass(Opts); + }, + parseMergedLoadStoreMotionOptions) #undef FUNCTION_PASS_WITH_PARAMS #ifndef LOOP_ANALYSIS @@ -277,6 +284,7 @@ FUNCTION_PASS_WITH_PARAMS("loop-vectorize", #endif LOOP_ANALYSIS("no-op-loop", NoOpLoopAnalysis()) LOOP_ANALYSIS("access-info", LoopAccessAnalysis()) +LOOP_ANALYSIS("ddg", DDGAnalysis()) LOOP_ANALYSIS("ivusers", IVUsersAnalysis()) LOOP_ANALYSIS("pass-instrumentation", PassInstrumentationAnalysis(PIC)) #undef LOOP_ANALYSIS @@ -299,7 +307,9 @@ LOOP_PASS("irce", IRCEPass()) LOOP_PASS("unroll-and-jam", LoopUnrollAndJamPass()) LOOP_PASS("unroll-full", LoopFullUnrollPass()) LOOP_PASS("print-access-info", LoopAccessInfoPrinterPass(dbgs())) +LOOP_PASS("print", DDGAnalysisPrinterPass(dbgs())) LOOP_PASS("print", IVUsersPrinterPass(dbgs())) +LOOP_PASS("print", LoopCachePrinterPass(dbgs())) LOOP_PASS("loop-predication", LoopPredicationPass()) LOOP_PASS("guard-widening", GuardWideningPass()) #undef LOOP_PASS diff --git a/lib/ProfileData/Coverage/CoverageMapping.cpp b/lib/ProfileData/Coverage/CoverageMapping.cpp index afd6618e7cb3..8d5e56e26c0f 100644 --- a/lib/ProfileData/Coverage/CoverageMapping.cpp +++ b/lib/ProfileData/Coverage/CoverageMapping.cpp @@ -194,6 +194,15 @@ void FunctionRecordIterator::skipOtherFiles() { *this = FunctionRecordIterator(); } +ArrayRef CoverageMapping::getImpreciseRecordIndicesForFilename( + StringRef Filename) const { + size_t FilenameHash = hash_value(Filename); + auto RecordIt = FilenameHash2RecordIndices.find(FilenameHash); + if (RecordIt == FilenameHash2RecordIndices.end()) + return {}; + return RecordIt->second; +} + Error CoverageMapping::loadFunctionRecord( const CoverageMappingRecord &Record, IndexedInstrProfReader &ProfileReader) { @@ -249,6 +258,20 @@ Error CoverageMapping::loadFunctionRecord( return Error::success(); Functions.push_back(std::move(Function)); + + // Performance optimization: keep track of the indices of the function records + // which correspond to each filename. This can be used to substantially speed + // up queries for coverage info in a file. + unsigned RecordIndex = Functions.size() - 1; + for (StringRef Filename : Record.Filenames) { + auto &RecordIndices = FilenameHash2RecordIndices[hash_value(Filename)]; + // Note that there may be duplicates in the filename set for a function + // record, because of e.g. macro expansions in the function in which both + // the macro and the function are defined in the same file. + if (RecordIndices.empty() || RecordIndices.back() != RecordIndex) + RecordIndices.push_back(RecordIndex); + } + return Error::success(); } @@ -270,6 +293,16 @@ Expected> CoverageMapping::load( return std::move(Coverage); } +// If E is a no_data_found error, returns success. Otherwise returns E. 
+static Error handleMaybeNoDataFoundError(Error E) { + return handleErrors( + std::move(E), [](const CoverageMapError &CME) { + if (CME.get() == coveragemap_error::no_data_found) + return static_cast(Error::success()); + return make_error(CME.get()); + }); +} + Expected> CoverageMapping::load(ArrayRef ObjectFilenames, StringRef ProfileFilename, ArrayRef Arches) { @@ -289,12 +322,21 @@ CoverageMapping::load(ArrayRef ObjectFilenames, CovMappingBufOrErr.get()->getMemBufferRef(); auto CoverageReadersOrErr = BinaryCoverageReader::create(CovMappingBufRef, Arch, Buffers); - if (Error E = CoverageReadersOrErr.takeError()) - return std::move(E); + if (Error E = CoverageReadersOrErr.takeError()) { + E = handleMaybeNoDataFoundError(std::move(E)); + if (E) + return std::move(E); + // E == success (originally a no_data_found error). + continue; + } for (auto &Reader : CoverageReadersOrErr.get()) Readers.push_back(std::move(Reader)); Buffers.push_back(std::move(CovMappingBufOrErr.get())); } + // If no readers were created, either no objects were provided or none of them + // had coverage data. Return an error in the latter case. + if (Readers.empty() && !ObjectFilenames.empty()) + return make_error(coveragemap_error::no_data_found); return load(Readers, *ProfileReader); } @@ -607,7 +649,12 @@ CoverageData CoverageMapping::getCoverageForFile(StringRef Filename) const { CoverageData FileCoverage(Filename); std::vector Regions; - for (const auto &Function : Functions) { + // Look up the function records in the given file. Due to hash collisions on + // the filename, we may get back some records that are not in the file. + ArrayRef RecordIndices = + getImpreciseRecordIndicesForFilename(Filename); + for (unsigned RecordIndex : RecordIndices) { + const FunctionRecord &Function = Functions[RecordIndex]; auto MainFileID = findMainViewFileID(Filename, Function); auto FileIDs = gatherFileIDs(Filename, Function); for (const auto &CR : Function.CountedRegions) @@ -627,7 +674,12 @@ CoverageData CoverageMapping::getCoverageForFile(StringRef Filename) const { std::vector CoverageMapping::getInstantiationGroups(StringRef Filename) const { FunctionInstantiationSetCollector InstantiationSetCollector; - for (const auto &Function : Functions) { + // Look up the function records in the given file. Due to hash collisions on + // the filename, we may get back some records that are not in the file. + ArrayRef RecordIndices = + getImpreciseRecordIndicesForFilename(Filename); + for (unsigned RecordIndex : RecordIndices) { + const FunctionRecord &Function = Functions[RecordIndex]; auto MainFileID = findMainViewFileID(Filename, Function); if (!MainFileID) continue; diff --git a/lib/ProfileData/Coverage/CoverageMappingReader.cpp b/lib/ProfileData/Coverage/CoverageMappingReader.cpp index e193e10f91d9..679ff3525eeb 100644 --- a/lib/ProfileData/Coverage/CoverageMappingReader.cpp +++ b/lib/ProfileData/Coverage/CoverageMappingReader.cpp @@ -506,7 +506,7 @@ public: return make_error(coveragemap_error::malformed); // Each coverage map has an alignment of 8, so we need to adjust alignment // before reading the next map. 
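getImpreciseRecordIndicesForFilename and the indexing added in loadFunctionRecord above bucket function-record indices under a hash of each filename, so getCoverageForFile and getInstantiationGroups only walk records that may touch the file. Collisions make the lookup deliberately imprecise, and duplicate filenames within one record are indexed only once. A standalone sketch of the same idea, with std::hash standing in for llvm::hash_value and all names illustrative:

    #include <functional>
    #include <iostream>
    #include <string>
    #include <unordered_map>
    #include <vector>

    struct FunctionRecord {
      std::string Name;
      std::vector<std::string> Filenames; // may repeat, e.g. macro expansions
    };

    class CoverageIndex {
      std::vector<FunctionRecord> Functions;
      // Filename hash -> indices of records touching a file with that hash.
      std::unordered_map<size_t, std::vector<unsigned>> HashToRecordIndices;

    public:
      void add(FunctionRecord FR) {
        Functions.push_back(std::move(FR));
        unsigned RecordIndex = Functions.size() - 1;
        for (const std::string &Filename : Functions.back().Filenames) {
          auto &Indices =
              HashToRecordIndices[std::hash<std::string>{}(Filename)];
          // Avoid pushing the same record twice when a filename repeats.
          if (Indices.empty() || Indices.back() != RecordIndex)
            Indices.push_back(RecordIndex);
        }
      }

      // "Imprecise" lookup: hash collisions may return extra records, so
      // callers still check each record against the filename they care about.
      std::vector<const FunctionRecord *>
      recordsForFile(const std::string &File) const {
        std::vector<const FunctionRecord *> Result;
        auto It = HashToRecordIndices.find(std::hash<std::string>{}(File));
        if (It == HashToRecordIndices.end())
          return Result;
        for (unsigned Idx : It->second)
          for (const std::string &F : Functions[Idx].Filenames)
            if (F == File) {
              Result.push_back(&Functions[Idx]);
              break;
            }
        return Result;
      }
    };

    int main() {
      CoverageIndex Index;
      Index.add({"foo", {"a.cpp", "a.cpp"}}); // duplicate filename, indexed once
      Index.add({"bar", {"b.cpp"}});
      std::cout << "records in a.cpp: " << Index.recordsForFile("a.cpp").size()
                << "\n";
    }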
- Buf += alignmentAdjustment(Buf, 8); + Buf += offsetToAlignedAddr(Buf, Align(8)); auto CFR = reinterpret_cast(FunBuf); while ((const char *)CFR < FunEnd) { @@ -539,7 +539,7 @@ Expected> CovMapFuncRecordReader::get( switch (Version) { case CovMapVersion::Version1: - return llvm::make_unique>(P, R, F); case CovMapVersion::Version2: case CovMapVersion::Version3: @@ -547,10 +547,10 @@ Expected> CovMapFuncRecordReader::get( if (Error E = P.create(P.getNameData())) return std::move(E); if (Version == CovMapVersion::Version2) - return llvm::make_unique>(P, R, F); else - return llvm::make_unique>(P, R, F); } llvm_unreachable("Unsupported version"); @@ -648,7 +648,7 @@ loadTestingFormat(StringRef Data) { // Skip the padding bytes because coverage map data has an alignment of 8. if (CoverageMapping.empty()) return make_error(coveragemap_error::truncated); - size_t Pad = alignmentAdjustment(CoverageMapping.data(), 8); + size_t Pad = offsetToAlignedAddr(CoverageMapping.data(), Align(8)); if (CoverageMapping.size() < Pad) return make_error(coveragemap_error::malformed); CoverageMapping = CoverageMapping.substr(Pad); @@ -666,11 +666,11 @@ static Expected lookupSection(ObjectFile &OF, StringRef Name) { }; Name = stripSuffix(Name); - StringRef FoundName; for (const auto &Section : OF.sections()) { - if (auto EC = Section.getName(FoundName)) - return errorCodeToError(EC); - if (stripSuffix(FoundName) == Name) + Expected NameOrErr = Section.getName(); + if (!NameOrErr) + return NameOrErr.takeError(); + if (stripSuffix(*NameOrErr) == Name) return Section; } return make_error(coveragemap_error::no_data_found); @@ -682,7 +682,7 @@ loadBinaryFormat(std::unique_ptr Bin, StringRef Arch) { if (auto *Universal = dyn_cast(Bin.get())) { // If we have a universal binary, try to look up the object for the // appropriate architecture. 
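The switch from alignmentAdjustment(Buf, 8) to offsetToAlignedAddr(Buf, Align(8)) above is the same computation under a newer API: how many padding bytes separate an address from the next 8-byte boundary. A tiny standalone sketch of that arithmetic (paddingTo is an illustrative name, not an LLVM helper):

    #include <cassert>
    #include <cstdint>
    #include <iostream>

    // Number of bytes to add to Addr so it becomes a multiple of Align.
    // Align must be a power of two.
    static uint64_t paddingTo(uint64_t Addr, uint64_t Align) {
      assert(Align && (Align & (Align - 1)) == 0 &&
             "alignment must be a power of two");
      return (-Addr) & (Align - 1);
    }

    int main() {
      // Coverage maps are 8-byte aligned, so after reading a map that ends at
      // offset 21 we must skip 3 bytes before reading the next one.
      std::cout << paddingTo(21, 8) << "\n"; // prints 3
      std::cout << paddingTo(24, 8) << "\n"; // prints 0
    }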
- auto ObjectFileOrErr = Universal->getObjectForArch(Arch); + auto ObjectFileOrErr = Universal->getMachOObjectForArch(Arch); if (!ObjectFileOrErr) return ObjectFileOrErr.takeError(); OF = std::move(ObjectFileOrErr.get()); diff --git a/lib/ProfileData/Coverage/CoverageMappingWriter.cpp b/lib/ProfileData/Coverage/CoverageMappingWriter.cpp index 432b20f217ca..d75854a60d1e 100644 --- a/lib/ProfileData/Coverage/CoverageMappingWriter.cpp +++ b/lib/ProfileData/Coverage/CoverageMappingWriter.cpp @@ -24,6 +24,16 @@ using namespace llvm; using namespace coverage; +CoverageFilenamesSectionWriter::CoverageFilenamesSectionWriter( + ArrayRef Filenames) + : Filenames(Filenames) { +#ifndef NDEBUG + StringSet<> NameSet; + for (StringRef Name : Filenames) + assert(NameSet.insert(Name).second && "Duplicate filename"); +#endif +} + void CoverageFilenamesSectionWriter::write(raw_ostream &OS) { encodeULEB128(Filenames.size(), OS); for (const auto &Filename : Filenames) { diff --git a/lib/ProfileData/GCOV.cpp b/lib/ProfileData/GCOV.cpp index fa4e433d7aa6..00e6294c57a6 100644 --- a/lib/ProfileData/GCOV.cpp +++ b/lib/ProfileData/GCOV.cpp @@ -40,7 +40,7 @@ bool GCOVFile::readGCNO(GCOVBuffer &Buffer) { while (true) { if (!Buffer.readFunctionTag()) break; - auto GFun = make_unique(*this); + auto GFun = std::make_unique(*this); if (!GFun->readGCNO(Buffer, Version)) return false; Functions.push_back(std::move(GFun)); @@ -164,7 +164,7 @@ bool GCOVFunction::readGCNO(GCOVBuffer &Buff, GCOV::GCOVVersion Version) { for (uint32_t i = 0, e = BlockCount; i != e; ++i) { if (!Buff.readInt(Dummy)) return false; // Block flags; - Blocks.push_back(make_unique(*this, i)); + Blocks.push_back(std::make_unique(*this, i)); } // read edges. @@ -185,7 +185,7 @@ bool GCOVFunction::readGCNO(GCOVBuffer &Buff, GCOV::GCOVVersion Version) { uint32_t Dst; if (!Buff.readInt(Dst)) return false; - Edges.push_back(make_unique(*Blocks[BlockNo], *Blocks[Dst])); + Edges.push_back(std::make_unique(*Blocks[BlockNo], *Blocks[Dst])); GCOVEdge *Edge = Edges.back().get(); Blocks[BlockNo]->addDstEdge(Edge); Blocks[Dst]->addSrcEdge(Edge); @@ -702,14 +702,14 @@ std::string FileInfo::getCoveragePath(StringRef Filename, std::unique_ptr FileInfo::openCoveragePath(StringRef CoveragePath) { if (Options.NoOutput) - return llvm::make_unique(); + return std::make_unique(); std::error_code EC; auto OS = - llvm::make_unique(CoveragePath, EC, sys::fs::F_Text); + std::make_unique(CoveragePath, EC, sys::fs::OF_Text); if (EC) { errs() << EC.message() << "\n"; - return llvm::make_unique(); + return std::make_unique(); } return std::move(OS); } diff --git a/lib/ProfileData/InstrProf.cpp b/lib/ProfileData/InstrProf.cpp index 510fd9887d9a..57d4fbc59f83 100644 --- a/lib/ProfileData/InstrProf.cpp +++ b/lib/ProfileData/InstrProf.cpp @@ -478,7 +478,7 @@ Error readPGOFuncNameStrings(StringRef NameStrings, InstrProfSymtab &Symtab) { return Error::success(); } -void InstrProfRecord::accumuateCounts(CountSumOrPercent &Sum) const { +void InstrProfRecord::accumulateCounts(CountSumOrPercent &Sum) const { uint64_t FuncSum = 0; Sum.NumEntries += Counts.size(); for (size_t F = 0, E = Counts.size(); F < E; ++F) @@ -552,7 +552,7 @@ void InstrProfRecord::overlap(InstrProfRecord &Other, OverlapStats &Overlap, uint64_t ValueCutoff) { // FuncLevel CountSum for other should already computed and nonzero. 
assert(FuncLevelOverlap.Test.CountSum >= 1.0f); - accumuateCounts(FuncLevelOverlap.Base); + accumulateCounts(FuncLevelOverlap.Base); bool Mismatch = (Counts.size() != Other.Counts.size()); // Check if the value profiles mismatch. @@ -1078,12 +1078,10 @@ bool isIRPGOFlagSet(const Module *M) { if (!IRInstrVar->hasInitializer()) return false; - const Constant *InitVal = IRInstrVar->getInitializer(); + auto *InitVal = dyn_cast_or_null(IRInstrVar->getInitializer()); if (!InitVal) return false; - - return (dyn_cast(InitVal)->getZExtValue() & - VARIANT_MASK_IR_PROF) != 0; + return (InitVal->getZExtValue() & VARIANT_MASK_IR_PROF) != 0; } // Check if we can safely rename this Comdat function. @@ -1166,9 +1164,9 @@ void createProfileFileNameVar(Module &M, StringRef InstrProfileOutput) { } } -Error OverlapStats::accumuateCounts(const std::string &BaseFilename, - const std::string &TestFilename, - bool IsCS) { +Error OverlapStats::accumulateCounts(const std::string &BaseFilename, + const std::string &TestFilename, + bool IsCS) { auto getProfileSum = [IsCS](const std::string &Filename, CountSumOrPercent &Sum) -> Error { auto ReaderOrErr = InstrProfReader::create(Filename); @@ -1176,7 +1174,7 @@ Error OverlapStats::accumuateCounts(const std::string &BaseFilename, return E; } auto Reader = std::move(ReaderOrErr.get()); - Reader->accumuateCounts(Sum, IsCS); + Reader->accumulateCounts(Sum, IsCS); return Error::success(); }; auto Ret = getProfileSum(BaseFilename, Base); diff --git a/lib/ProfileData/InstrProfReader.cpp b/lib/ProfileData/InstrProfReader.cpp index fec1c152991c..23d078a3ddee 100644 --- a/lib/ProfileData/InstrProfReader.cpp +++ b/lib/ProfileData/InstrProfReader.cpp @@ -119,7 +119,7 @@ IndexedInstrProfReader::create(std::unique_ptr Buffer, // Create the reader. if (!IndexedInstrProfReader::hasFormat(*Buffer)) return make_error(instrprof_error::bad_magic); - auto Result = llvm::make_unique( + auto Result = std::make_unique( std::move(Buffer), std::move(RemappingBuffer)); // Initialize the reader and return the result. @@ -385,7 +385,7 @@ Error RawInstrProfReader::readHeader( NamesStart = Start + NamesOffset; ValueDataStart = reinterpret_cast(Start + ValueDataOffset); - std::unique_ptr NewSymtab = make_unique(); + std::unique_ptr NewSymtab = std::make_unique(); if (Error E = createSymtab(*NewSymtab.get())) return E; @@ -413,13 +413,19 @@ Error RawInstrProfReader::readRawCounts( if (NumCounters == 0) return error(instrprof_error::malformed); - auto RawCounts = makeArrayRef(getCounter(CounterPtr), NumCounters); auto *NamesStartAsCounter = reinterpret_cast(NamesStart); + ptrdiff_t MaxNumCounters = NamesStartAsCounter - CountersStart; - // Check bounds. - if (RawCounts.data() < CountersStart || - RawCounts.data() + RawCounts.size() > NamesStartAsCounter) + // Check bounds. Note that the counter pointer embedded in the data record + // may itself be corrupt. + if (NumCounters > MaxNumCounters) return error(instrprof_error::malformed); + ptrdiff_t CounterOffset = getCounterOffset(CounterPtr); + if (CounterOffset < 0 || CounterOffset > MaxNumCounters || + (CounterOffset + NumCounters) > MaxNumCounters) + return error(instrprof_error::malformed); + + auto RawCounts = makeArrayRef(getCounter(CounterOffset), NumCounters); if (ShouldSwapBytes) { Record.Counts.clear(); @@ -767,7 +773,7 @@ IndexedInstrProfReader::readSummary(IndexedInstrProf::ProfVersion Version, UseCS ? this->CS_Summary : this->Summary; // initialize InstrProfSummary using the SummaryData from disk. 
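The rewritten readRawCounts above validates the counter offset derived from the (possibly corrupt) on-disk record against the section bounds before building the ArrayRef, rather than checking raw pointers afterwards. A standalone sketch of that validate-offset-and-length-first pattern, with illustrative names and std::optional standing in for the error return:

    #include <cstdint>
    #include <iostream>
    #include <optional>
    #include <vector>

    // A slice of a counter section: [Begin, Begin + Size).
    struct CounterSection {
      const uint64_t *Begin;
      uint64_t Size; // number of counters in the section
    };

    // Return the requested counters only if both the offset and the length fit
    // inside the section; otherwise report malformed data by returning nullopt.
    static std::optional<std::vector<uint64_t>>
    readCounters(const CounterSection &Sec, int64_t CounterOffset,
                 uint64_t NumCounters) {
      if (NumCounters == 0)
        return std::nullopt; // a record must own at least one counter
      if (NumCounters > Sec.Size)
        return std::nullopt;
      if (CounterOffset < 0 || static_cast<uint64_t>(CounterOffset) > Sec.Size ||
          static_cast<uint64_t>(CounterOffset) + NumCounters > Sec.Size)
        return std::nullopt;
      return std::vector<uint64_t>(Sec.Begin + CounterOffset,
                                   Sec.Begin + CounterOffset + NumCounters);
    }

    int main() {
      std::vector<uint64_t> Raw = {1, 2, 3, 4, 5, 6, 7, 8};
      CounterSection Sec{Raw.data(), Raw.size()};
      auto Good = readCounters(Sec, 2, 3);          // counters 3, 4, 5
      auto Bad = readCounters(Sec, 6, 4);           // would run past the section
      std::cout << (Good ? "ok\n" : "malformed\n"); // ok
      std::cout << (Bad ? "ok\n" : "malformed\n");  // malformed
    }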
- Summary = llvm::make_unique( + Summary = std::make_unique( UseCS ? ProfileSummary::PSK_CSInstr : ProfileSummary::PSK_Instr, DetailedSummary, SummaryData->get(Summary::TotalBlockCount), SummaryData->get(Summary::MaxBlockCount), @@ -777,13 +783,13 @@ IndexedInstrProfReader::readSummary(IndexedInstrProf::ProfVersion Version, SummaryData->get(Summary::TotalNumFunctions)); return Cur + SummarySize; } else { - // For older version of profile data, we need to compute on the fly: - using namespace IndexedInstrProf; - + // The older versions do not support a profile summary. This just computes + // an empty summary, which will not result in accurate hot/cold detection. + // We would need to call addRecord for all NamedInstrProfRecords to get the + // correct summary. However, this version is old (prior to early 2016) and + // has not been supporting an accurate summary for several years. InstrProfSummaryBuilder Builder(ProfileSummaryBuilder::DefaultCutoffs); - // FIXME: This only computes an empty summary. Need to call addRecord for - // all NamedInstrProfRecords to get the correct summary. - this->Summary = Builder.getSummary(); + Summary = Builder.getSummary(); return Cur; } } @@ -827,18 +833,18 @@ Error IndexedInstrProfReader::readHeader() { // The rest of the file is an on disk hash table. auto IndexPtr = - llvm::make_unique>( + std::make_unique>( Start + HashOffset, Cur, Start, HashType, FormatVersion); // Load the remapping table now if requested. if (RemappingBuffer) { - Remapper = llvm::make_unique< + Remapper = std::make_unique< InstrProfReaderItaniumRemapper>( std::move(RemappingBuffer), *IndexPtr); if (Error E = Remapper->populateRemappings()) return E; } else { - Remapper = llvm::make_unique(*IndexPtr); + Remapper = std::make_unique(*IndexPtr); } Index = std::move(IndexPtr); @@ -849,7 +855,7 @@ InstrProfSymtab &IndexedInstrProfReader::getSymtab() { if (Symtab.get()) return *Symtab.get(); - std::unique_ptr NewSymtab = make_unique(); + std::unique_ptr NewSymtab = std::make_unique(); if (Error E = Index->populateSymtab(*NewSymtab.get())) { consumeError(error(InstrProfError::take(std::move(E)))); } @@ -901,7 +907,7 @@ Error IndexedInstrProfReader::readNextRecord(NamedInstrProfRecord &Record) { return success(); } -void InstrProfReader::accumuateCounts(CountSumOrPercent &Sum, bool IsCS) { +void InstrProfReader::accumulateCounts(CountSumOrPercent &Sum, bool IsCS) { uint64_t NumFuncs = 0; for (const auto &Func : *this) { if (isIRLevelProfile()) { @@ -909,7 +915,7 @@ void InstrProfReader::accumuateCounts(CountSumOrPercent &Sum, bool IsCS) { if (FuncIsCS != IsCS) continue; } - Func.accumuateCounts(Sum); + Func.accumulateCounts(Sum); ++NumFuncs; } Sum.NumEntries = NumFuncs; diff --git a/lib/ProfileData/InstrProfWriter.cpp b/lib/ProfileData/InstrProfWriter.cpp index 4ca2defd26da..ccb270e0b719 100644 --- a/lib/ProfileData/InstrProfWriter.cpp +++ b/lib/ProfileData/InstrProfWriter.cpp @@ -193,7 +193,7 @@ void InstrProfWriter::overlapRecord(NamedInstrProfRecord &&Other, const OverlapFuncFilters &FuncFilter) { auto Name = Other.Name; auto Hash = Other.Hash; - Other.accumuateCounts(FuncLevelOverlap.Test); + Other.accumulateCounts(FuncLevelOverlap.Test); if (FunctionData.find(Name) == FunctionData.end()) { Overlap.addOneUnique(FuncLevelOverlap.Test); return; diff --git a/lib/ProfileData/ProfileSummaryBuilder.cpp b/lib/ProfileData/ProfileSummaryBuilder.cpp index 4d5b00935742..3299b5f92069 100644 --- a/lib/ProfileData/ProfileSummaryBuilder.cpp +++ b/lib/ProfileData/ProfileSummaryBuilder.cpp @@ -93,14 
+93,14 @@ void ProfileSummaryBuilder::computeDetailedSummary() { std::unique_ptr SampleProfileSummaryBuilder::getSummary() { computeDetailedSummary(); - return llvm::make_unique( + return std::make_unique( ProfileSummary::PSK_Sample, DetailedSummary, TotalCount, MaxCount, 0, MaxFunctionCount, NumCounts, NumFunctions); } std::unique_ptr InstrProfSummaryBuilder::getSummary() { computeDetailedSummary(); - return llvm::make_unique( + return std::make_unique( ProfileSummary::PSK_Instr, DetailedSummary, TotalCount, MaxCount, MaxInternalBlockCount, MaxFunctionCount, NumCounts, NumFunctions); } diff --git a/lib/ProfileData/SampleProf.cpp b/lib/ProfileData/SampleProf.cpp index e17865cd15a4..003e8d4d4296 100644 --- a/lib/ProfileData/SampleProf.cpp +++ b/lib/ProfileData/SampleProf.cpp @@ -16,7 +16,9 @@ #include "llvm/IR/DebugInfoMetadata.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/Error.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/LEB128.h" #include "llvm/Support/ManagedStatic.h" #include "llvm/Support/raw_ostream.h" #include @@ -28,8 +30,6 @@ using namespace sampleprof; namespace llvm { namespace sampleprof { SampleProfileFormat FunctionSamples::Format; -DenseMap FunctionSamples::GUIDToFuncNameMap; -Module *FunctionSamples::CurrentModule; } // namespace sampleprof } // namespace llvm @@ -68,6 +68,12 @@ class SampleProfErrorCategoryType : public std::error_category { return "Counter overflow"; case sampleprof_error::ostream_seek_unsupported: return "Ostream does not support seek"; + case sampleprof_error::compress_failed: + return "Compress failure"; + case sampleprof_error::uncompress_failed: + return "Uncompress failure"; + case sampleprof_error::zlib_unavailable: + return "Zlib is unavailable"; } llvm_unreachable("A value of sampleprof_error has no message."); } @@ -102,8 +108,8 @@ void SampleRecord::print(raw_ostream &OS, unsigned Indent) const { OS << NumSamples; if (hasCalls()) { OS << ", calls:"; - for (const auto &I : getCallTargets()) - OS << " " << I.first() << ":" << I.second; + for (const auto &I : getSortedCallTargets()) + OS << " " << I.first << ":" << I.second; } OS << "\n"; } @@ -149,6 +155,7 @@ void FunctionSamples::print(raw_ostream &OS, unsigned Indent) const { FS.second.print(OS, Indent + 4); } } + OS.indent(Indent); OS << "}\n"; } else { OS << "No inlined callsites in this function\n"; @@ -190,3 +197,44 @@ FunctionSamples::findFunctionSamples(const DILocation *DIL) const { #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) LLVM_DUMP_METHOD void FunctionSamples::dump() const { print(dbgs(), 0); } #endif + +std::error_code ProfileSymbolList::read(const uint8_t *Data, + uint64_t ListSize) { + const char *ListStart = reinterpret_cast(Data); + uint64_t Size = 0; + while (Size < ListSize) { + StringRef Str(ListStart + Size); + add(Str); + Size += Str.size() + 1; + } + if (Size != ListSize) + return sampleprof_error::malformed; + return sampleprof_error::success; +} + +std::error_code ProfileSymbolList::write(raw_ostream &OS) { + // Sort the symbols before output. If doing compression. + // It will make the compression much more effective. 
+ std::vector SortedList; + SortedList.insert(SortedList.begin(), Syms.begin(), Syms.end()); + llvm::sort(SortedList); + + std::string OutputString; + for (auto &Sym : SortedList) { + OutputString.append(Sym.str()); + OutputString.append(1, '\0'); + } + + OS << OutputString; + return sampleprof_error::success; +} + +void ProfileSymbolList::dump(raw_ostream &OS) const { + OS << "======== Dump profile symbol list ========\n"; + std::vector SortedList; + SortedList.insert(SortedList.begin(), Syms.begin(), Syms.end()); + llvm::sort(SortedList); + + for (auto &Sym : SortedList) + OS << Sym << "\n"; +} diff --git a/lib/ProfileData/SampleProfReader.cpp b/lib/ProfileData/SampleProfReader.cpp index 192b6c711562..001aafce7bfd 100644 --- a/lib/ProfileData/SampleProfReader.cpp +++ b/lib/ProfileData/SampleProfReader.cpp @@ -26,6 +26,7 @@ #include "llvm/IR/ProfileSummary.h" #include "llvm/ProfileData/ProfileCommon.h" #include "llvm/ProfileData/SampleProf.h" +#include "llvm/Support/Compression.h" #include "llvm/Support/ErrorOr.h" #include "llvm/Support/LEB128.h" #include "llvm/Support/LineIterator.h" @@ -190,7 +191,7 @@ static bool ParseLine(const StringRef &Input, bool &IsCallsite, uint32_t &Depth, /// the expected format. /// /// \returns true if the file was loaded successfully, false otherwise. -std::error_code SampleProfileReaderText::read() { +std::error_code SampleProfileReaderText::readImpl() { line_iterator LineIt(*Buffer, /*SkipBlanks=*/true, '#'); sampleprof_error Result = sampleprof_error::success; @@ -345,7 +346,7 @@ inline ErrorOr SampleProfileReaderBinary::readStringIndex(T &Table) { return *Idx; } -ErrorOr SampleProfileReaderRawBinary::readStringFromTable() { +ErrorOr SampleProfileReaderBinary::readStringFromTable() { auto Idx = readStringIndex(NameTable); if (std::error_code EC = Idx.getError()) return EC; @@ -438,7 +439,9 @@ SampleProfileReaderBinary::readProfile(FunctionSamples &FProfile) { return sampleprof_error::success; } -std::error_code SampleProfileReaderBinary::readFuncProfile() { +std::error_code +SampleProfileReaderBinary::readFuncProfile(const uint8_t *Start) { + Data = Start; auto NumHeadSamples = readNumber(); if (std::error_code EC = NumHeadSamples.getError()) return EC; @@ -458,25 +461,210 @@ std::error_code SampleProfileReaderBinary::readFuncProfile() { return sampleprof_error::success; } -std::error_code SampleProfileReaderBinary::read() { +std::error_code SampleProfileReaderBinary::readImpl() { while (!at_eof()) { - if (std::error_code EC = readFuncProfile()) + if (std::error_code EC = readFuncProfile(Data)) + return EC; + } + + return sampleprof_error::success; +} + +std::error_code +SampleProfileReaderExtBinary::readOneSection(const uint8_t *Start, + uint64_t Size, SecType Type) { + Data = Start; + End = Start + Size; + switch (Type) { + case SecProfSummary: + if (std::error_code EC = readSummary()) + return EC; + break; + case SecNameTable: + if (std::error_code EC = readNameTable()) + return EC; + break; + case SecLBRProfile: + if (std::error_code EC = readFuncProfiles()) + return EC; + break; + case SecProfileSymbolList: + if (std::error_code EC = readProfileSymbolList()) return EC; + break; + case SecFuncOffsetTable: + if (std::error_code EC = readFuncOffsetTable()) + return EC; + break; + default: + break; } + return sampleprof_error::success; +} + +void SampleProfileReaderExtBinary::collectFuncsFrom(const Module &M) { + UseAllFuncs = false; + FuncsToUse.clear(); + for (auto &F : M) + FuncsToUse.insert(FunctionSamples::getCanonicalFnName(F)); +} + 
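ProfileSymbolList::write and read above serialize the symbol list as sorted, NUL-terminated strings; sorting first groups similar names so a later compression pass works better, and the reader walks the blob string by string and checks that the sizes add up exactly. A standalone round-trip sketch of that encoding (writeSymbols/readSymbols are illustrative names):

    #include <algorithm>
    #include <iostream>
    #include <set>
    #include <string>
    #include <vector>

    // Serialize a symbol set as sorted, '\0'-terminated strings. Sorting groups
    // similar names together, which helps a subsequent compression pass.
    static std::string writeSymbols(const std::set<std::string> &Syms) {
      std::vector<std::string> Sorted(Syms.begin(), Syms.end());
      std::sort(Sorted.begin(), Sorted.end());
      std::string Out;
      for (const std::string &S : Sorted) {
        Out += S;
        Out.push_back('\0');
      }
      return Out;
    }

    // Walk the blob string by string; the total size must match exactly,
    // otherwise the list is malformed.
    static bool readSymbols(const std::string &Blob, std::set<std::string> &Syms) {
      size_t Pos = 0;
      while (Pos < Blob.size()) {
        std::string S(Blob.c_str() + Pos); // reads up to the next '\0'
        Syms.insert(S);
        Pos += S.size() + 1;
      }
      return Pos == Blob.size();
    }

    int main() {
      std::string Blob = writeSymbols({"_Zfoo", "_Zbar", "main"});
      std::set<std::string> RoundTrip;
      std::cout << (readSymbols(Blob, RoundTrip) ? "ok" : "malformed")
                << ", symbols: " << RoundTrip.size() << "\n";
    }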
+std::error_code SampleProfileReaderExtBinary::readFuncOffsetTable() { + auto Size = readNumber(); + if (std::error_code EC = Size.getError()) + return EC; + + FuncOffsetTable.reserve(*Size); + for (uint32_t I = 0; I < *Size; ++I) { + auto FName(readStringFromTable()); + if (std::error_code EC = FName.getError()) + return EC; + auto Offset = readNumber(); + if (std::error_code EC = Offset.getError()) + return EC; + + FuncOffsetTable[*FName] = *Offset; + } return sampleprof_error::success; } -std::error_code SampleProfileReaderCompactBinary::read() { - for (auto Name : FuncsToUse) { - auto GUID = std::to_string(MD5Hash(Name)); - auto iter = FuncOffsetTable.find(StringRef(GUID)); - if (iter == FuncOffsetTable.end()) +std::error_code SampleProfileReaderExtBinary::readFuncProfiles() { + const uint8_t *Start = Data; + if (UseAllFuncs) { + while (Data < End) { + if (std::error_code EC = readFuncProfile(Data)) + return EC; + } + assert(Data == End && "More data is read than expected"); + return sampleprof_error::success; + } + + if (Remapper) { + for (auto Name : FuncsToUse) { + Remapper->insert(Name); + } + } + + for (auto NameOffset : FuncOffsetTable) { + auto FuncName = NameOffset.first; + if (!FuncsToUse.count(FuncName) && + (!Remapper || !Remapper->exist(FuncName))) continue; + const uint8_t *FuncProfileAddr = Start + NameOffset.second; + assert(FuncProfileAddr < End && "out of LBRProfile section"); + if (std::error_code EC = readFuncProfile(FuncProfileAddr)) + return EC; + } + + Data = End; + return sampleprof_error::success; +} + +std::error_code SampleProfileReaderExtBinary::readProfileSymbolList() { + if (!ProfSymList) + ProfSymList = std::make_unique(); + + if (std::error_code EC = ProfSymList->read(Data, End - Data)) + return EC; + + Data = End; + return sampleprof_error::success; +} + +std::error_code SampleProfileReaderExtBinaryBase::decompressSection( + const uint8_t *SecStart, const uint64_t SecSize, + const uint8_t *&DecompressBuf, uint64_t &DecompressBufSize) { + Data = SecStart; + End = SecStart + SecSize; + auto DecompressSize = readNumber(); + if (std::error_code EC = DecompressSize.getError()) + return EC; + DecompressBufSize = *DecompressSize; + + auto CompressSize = readNumber(); + if (std::error_code EC = CompressSize.getError()) + return EC; + + if (!llvm::zlib::isAvailable()) + return sampleprof_error::zlib_unavailable; + + StringRef CompressedStrings(reinterpret_cast(Data), + *CompressSize); + char *Buffer = Allocator.Allocate(DecompressBufSize); + size_t UCSize = DecompressBufSize; + llvm::Error E = + zlib::uncompress(CompressedStrings, Buffer, UCSize); + if (E) + return sampleprof_error::uncompress_failed; + DecompressBuf = reinterpret_cast(Buffer); + return sampleprof_error::success; +} + +std::error_code SampleProfileReaderExtBinaryBase::readImpl() { + const uint8_t *BufStart = + reinterpret_cast(Buffer->getBufferStart()); + + for (auto &Entry : SecHdrTable) { + // Skip empty section. + if (!Entry.Size) + continue; + + const uint8_t *SecStart = BufStart + Entry.Offset; + uint64_t SecSize = Entry.Size; + + // If the section is compressed, decompress it into a buffer + // DecompressBuf before reading the actual data. The pointee of + // 'Data' will be changed to buffer hold by DecompressBuf + // temporarily when reading the actual data. 
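readFuncOffsetTable and readFuncProfiles above record a per-function offset into the LBRProfile section and then, once collectFuncsFrom has narrowed the set of interesting functions, jump straight to those profiles instead of decoding the whole section. A standalone sketch of that offset-table pattern; indices stand in for byte offsets and all names are illustrative:

    #include <cstdint>
    #include <iostream>
    #include <map>
    #include <set>
    #include <string>
    #include <vector>

    struct Profile {
      std::string Func;
      uint64_t Samples;
    };

    // "Section" of profiles plus an offset table mapping name -> position.
    // In the real format the offsets are byte offsets into a serialized
    // section; vector indices keep this sketch simple.
    struct ProfileSection {
      std::vector<Profile> Data;
      std::map<std::string, size_t> FuncOffsetTable;

      void add(Profile P) {
        FuncOffsetTable[P.Func] = Data.size();
        Data.push_back(std::move(P));
      }

      // Load only the profiles whose functions appear in FuncsToUse, jumping
      // directly to each one through the offset table.
      std::vector<Profile> loadFor(const std::set<std::string> &FuncsToUse) const {
        std::vector<Profile> Loaded;
        for (const auto &Entry : FuncOffsetTable)
          if (FuncsToUse.count(Entry.first))
            Loaded.push_back(Data[Entry.second]);
        return Loaded;
      }
    };

    int main() {
      ProfileSection Section;
      Section.add({"foo", 100});
      Section.add({"bar", 20});
      Section.add({"baz", 7});
      // Pretend the module being compiled only defines foo and baz.
      auto Loaded = Section.loadFor({"foo", "baz"});
      std::cout << "loaded " << Loaded.size() << " of " << Section.Data.size()
                << " profiles\n"; // loaded 2 of 3 profiles
    }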
+ bool isCompressed = hasSecFlag(Entry, SecFlagCompress); + if (isCompressed) { + const uint8_t *DecompressBuf; + uint64_t DecompressBufSize; + if (std::error_code EC = decompressSection( + SecStart, SecSize, DecompressBuf, DecompressBufSize)) + return EC; + SecStart = DecompressBuf; + SecSize = DecompressBufSize; + } + + if (std::error_code EC = readOneSection(SecStart, SecSize, Entry.Type)) + return EC; + if (Data != SecStart + SecSize) + return sampleprof_error::malformed; + + // Change the pointee of 'Data' from DecompressBuf to original Buffer. + if (isCompressed) { + Data = BufStart + Entry.Offset; + End = BufStart + Buffer->getBufferSize(); + } + } + + return sampleprof_error::success; +} + +std::error_code SampleProfileReaderCompactBinary::readImpl() { + std::vector OffsetsToUse; + if (UseAllFuncs) { + for (auto FuncEntry : FuncOffsetTable) { + OffsetsToUse.push_back(FuncEntry.second); + } + } + else { + for (auto Name : FuncsToUse) { + auto GUID = std::to_string(MD5Hash(Name)); + auto iter = FuncOffsetTable.find(StringRef(GUID)); + if (iter == FuncOffsetTable.end()) + continue; + OffsetsToUse.push_back(iter->second); + } + } + + for (auto Offset : OffsetsToUse) { const uint8_t *SavedData = Data; - Data = reinterpret_cast(Buffer->getBufferStart()) + - iter->second; - if (std::error_code EC = readFuncProfile()) + if (std::error_code EC = readFuncProfile( + reinterpret_cast(Buffer->getBufferStart()) + + Offset)) return EC; Data = SavedData; } @@ -489,6 +677,12 @@ std::error_code SampleProfileReaderRawBinary::verifySPMagic(uint64_t Magic) { return sampleprof_error::bad_magic; } +std::error_code SampleProfileReaderExtBinary::verifySPMagic(uint64_t Magic) { + if (Magic == SPMagic(SPF_Ext_Binary)) + return sampleprof_error::success; + return sampleprof_error::bad_magic; +} + std::error_code SampleProfileReaderCompactBinary::verifySPMagic(uint64_t Magic) { if (Magic == SPMagic(SPF_Compact_Binary)) @@ -496,7 +690,7 @@ SampleProfileReaderCompactBinary::verifySPMagic(uint64_t Magic) { return sampleprof_error::bad_magic; } -std::error_code SampleProfileReaderRawBinary::readNameTable() { +std::error_code SampleProfileReaderBinary::readNameTable() { auto Size = readNumber(); if (std::error_code EC = Size.getError()) return EC; @@ -525,10 +719,98 @@ std::error_code SampleProfileReaderCompactBinary::readNameTable() { return sampleprof_error::success; } -std::error_code SampleProfileReaderBinary::readHeader() { - Data = reinterpret_cast(Buffer->getBufferStart()); - End = Data + Buffer->getBufferSize(); +std::error_code SampleProfileReaderExtBinaryBase::readSecHdrTableEntry() { + SecHdrTableEntry Entry; + auto Type = readUnencodedNumber(); + if (std::error_code EC = Type.getError()) + return EC; + Entry.Type = static_cast(*Type); + auto Flags = readUnencodedNumber(); + if (std::error_code EC = Flags.getError()) + return EC; + Entry.Flags = *Flags; + + auto Offset = readUnencodedNumber(); + if (std::error_code EC = Offset.getError()) + return EC; + Entry.Offset = *Offset; + + auto Size = readUnencodedNumber(); + if (std::error_code EC = Size.getError()) + return EC; + Entry.Size = *Size; + + SecHdrTable.push_back(std::move(Entry)); + return sampleprof_error::success; +} + +std::error_code SampleProfileReaderExtBinaryBase::readSecHdrTable() { + auto EntryNum = readUnencodedNumber(); + if (std::error_code EC = EntryNum.getError()) + return EC; + + for (uint32_t i = 0; i < (*EntryNum); i++) + if (std::error_code EC = readSecHdrTableEntry()) + return EC; + + return sampleprof_error::success; +} + 
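decompressSection and readImpl above frame each compressed section as a decompressed size, a compressed size, and the zlib payload, inflate it into a side buffer, and then restore Data/End to the original buffer afterwards. Below is a standalone sketch of that framing against the plain zlib C API (compress/uncompress, link with -lz), assuming zlib is available; it stores the two sizes as fixed-width integers rather than LEB128 to stay short, so it is not the on-disk format itself.

    #include <cstdint>
    #include <cstring>
    #include <iostream>
    #include <string>
    #include <vector>
    #include <zlib.h>

    // Frame: [uncompressed size][compressed size][compressed payload].
    static std::vector<unsigned char> writeSection(const std::string &Payload) {
      uLongf CompSize = compressBound(Payload.size());
      std::vector<unsigned char> Comp(CompSize);
      if (compress(Comp.data(), &CompSize,
                   reinterpret_cast<const Bytef *>(Payload.data()),
                   Payload.size()) != Z_OK)
        return {};
      uint64_t Sizes[2] = {Payload.size(), CompSize};
      std::vector<unsigned char> Out(sizeof(Sizes));
      std::memcpy(Out.data(), Sizes, sizeof(Sizes));
      Out.insert(Out.end(), Comp.begin(), Comp.begin() + CompSize);
      return Out;
    }

    // Read the two sizes, then inflate the payload into a separate buffer,
    // mirroring how the reader temporarily points Data/End at DecompressBuf.
    static bool readSection(const std::vector<unsigned char> &Section,
                            std::string &Payload) {
      if (Section.size() < 2 * sizeof(uint64_t))
        return false;
      uint64_t Sizes[2];
      std::memcpy(Sizes, Section.data(), sizeof(Sizes));
      if (Section.size() < 2 * sizeof(uint64_t) + Sizes[1])
        return false;
      Payload.resize(Sizes[0]);
      uLongf DecompSize = Sizes[0];
      if (uncompress(reinterpret_cast<Bytef *>(&Payload[0]), &DecompSize,
                     Section.data() + 2 * sizeof(uint64_t), Sizes[1]) != Z_OK)
        return false;
      return DecompSize == Sizes[0];
    }

    int main() {
      std::string Original(1000, 'p'); // highly compressible payload
      auto Section = writeSection(Original);
      std::string RoundTrip;
      std::cout << (readSection(Section, RoundTrip) && RoundTrip == Original
                        ? "round trip ok"
                        : "error")
                << ", section bytes: " << Section.size() << "\n";
    }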
+std::error_code SampleProfileReaderExtBinaryBase::readHeader() { + const uint8_t *BufStart = + reinterpret_cast(Buffer->getBufferStart()); + Data = BufStart; + End = BufStart + Buffer->getBufferSize(); + + if (std::error_code EC = readMagicIdent()) + return EC; + + if (std::error_code EC = readSecHdrTable()) + return EC; + + return sampleprof_error::success; +} + +uint64_t SampleProfileReaderExtBinaryBase::getSectionSize(SecType Type) { + for (auto &Entry : SecHdrTable) { + if (Entry.Type == Type) + return Entry.Size; + } + return 0; +} + +uint64_t SampleProfileReaderExtBinaryBase::getFileSize() { + // Sections in SecHdrTable is not necessarily in the same order as + // sections in the profile because section like FuncOffsetTable needs + // to be written after section LBRProfile but needs to be read before + // section LBRProfile, so we cannot simply use the last entry in + // SecHdrTable to calculate the file size. + uint64_t FileSize = 0; + for (auto &Entry : SecHdrTable) { + FileSize = std::max(Entry.Offset + Entry.Size, FileSize); + } + return FileSize; +} + +bool SampleProfileReaderExtBinaryBase::dumpSectionInfo(raw_ostream &OS) { + uint64_t TotalSecsSize = 0; + for (auto &Entry : SecHdrTable) { + OS << getSecName(Entry.Type) << " - Offset: " << Entry.Offset + << ", Size: " << Entry.Size << "\n"; + TotalSecsSize += getSectionSize(Entry.Type); + } + uint64_t HeaderSize = SecHdrTable.front().Offset; + assert(HeaderSize + TotalSecsSize == getFileSize() && + "Size of 'header + sections' doesn't match the total size of profile"); + + OS << "Header Size: " << HeaderSize << "\n"; + OS << "Total Sections Size: " << TotalSecsSize << "\n"; + OS << "File Size: " << getFileSize() << "\n"; + return true; +} + +std::error_code SampleProfileReaderBinary::readMagicIdent() { // Read and check the magic identifier. 
auto Magic = readNumber(); if (std::error_code EC = Magic.getError()) @@ -543,6 +825,16 @@ std::error_code SampleProfileReaderBinary::readHeader() { else if (*Version != SPVersion()) return sampleprof_error::unsupported_version; + return sampleprof_error::success; +} + +std::error_code SampleProfileReaderBinary::readHeader() { + Data = reinterpret_cast(Buffer->getBufferStart()); + End = Data + Buffer->getBufferSize(); + + if (std::error_code EC = readMagicIdent()) + return EC; + if (std::error_code EC = readSummary()) return EC; @@ -590,12 +882,11 @@ std::error_code SampleProfileReaderCompactBinary::readFuncOffsetTable() { return sampleprof_error::success; } -void SampleProfileReaderCompactBinary::collectFuncsToUse(const Module &M) { +void SampleProfileReaderCompactBinary::collectFuncsFrom(const Module &M) { + UseAllFuncs = false; FuncsToUse.clear(); - for (auto &F : M) { - StringRef CanonName = FunctionSamples::getCanonicalFnName(F); - FuncsToUse.insert(CanonName); - } + for (auto &F : M) + FuncsToUse.insert(FunctionSamples::getCanonicalFnName(F)); } std::error_code SampleProfileReaderBinary::readSummaryEntry( @@ -647,7 +938,7 @@ std::error_code SampleProfileReaderBinary::readSummary() { if (EC != sampleprof_error::success) return EC; } - Summary = llvm::make_unique( + Summary = std::make_unique( ProfileSummary::PSK_Sample, Entries, *TotalCount, *MaxBlockCount, 0, *MaxFunctionCount, *NumBlocks, *NumFunctions); @@ -661,6 +952,13 @@ bool SampleProfileReaderRawBinary::hasFormat(const MemoryBuffer &Buffer) { return Magic == SPMagic(); } +bool SampleProfileReaderExtBinary::hasFormat(const MemoryBuffer &Buffer) { + const uint8_t *Data = + reinterpret_cast(Buffer.getBufferStart()); + uint64_t Magic = decodeULEB128(Data); + return Magic == SPMagic(SPF_Ext_Binary); +} + bool SampleProfileReaderCompactBinary::hasFormat(const MemoryBuffer &Buffer) { const uint8_t *Data = reinterpret_cast(Buffer.getBufferStart()); @@ -894,7 +1192,7 @@ std::error_code SampleProfileReaderGCC::readOneFunctionProfile( /// /// This format is generated by the Linux Perf conversion tool at /// https://github.com/google/autofdo. -std::error_code SampleProfileReaderGCC::read() { +std::error_code SampleProfileReaderGCC::readImpl() { // Read the string table. if (std::error_code EC = readNameTable()) return EC; @@ -911,38 +1209,31 @@ bool SampleProfileReaderGCC::hasFormat(const MemoryBuffer &Buffer) { return Magic == "adcg*704"; } -std::error_code SampleProfileReaderItaniumRemapper::read() { - // If the underlying data is in compact format, we can't remap it because +void SampleProfileReaderItaniumRemapper::applyRemapping(LLVMContext &Ctx) { + // If the reader is in compact format, we can't remap it because // we don't know what the original function names were. 
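readMagicIdent above, like the rest of these binary profile formats, stores the magic and version as ULEB128-encoded integers. A standalone sketch of ULEB128 encoding and decoding follows; the real code uses llvm/Support/LEB128.h, and the two functions below are local reimplementations for illustration only:

    #include <cstdint>
    #include <iostream>
    #include <vector>

    // Encode Value as ULEB128: 7 data bits per byte, high bit set on all but
    // the last byte.
    static void encodeULEB128(uint64_t Value, std::vector<uint8_t> &Out) {
      do {
        uint8_t Byte = Value & 0x7f;
        Value >>= 7;
        if (Value != 0)
          Byte |= 0x80; // more bytes follow
        Out.push_back(Byte);
      } while (Value != 0);
    }

    // Decode a ULEB128 value starting at Pos; advances Pos past the bytes read.
    static uint64_t decodeULEB128(const std::vector<uint8_t> &In, size_t &Pos) {
      uint64_t Value = 0;
      unsigned Shift = 0;
      while (Pos < In.size()) {
        uint8_t Byte = In[Pos++];
        Value |= uint64_t(Byte & 0x7f) << Shift;
        if ((Byte & 0x80) == 0)
          break;
        Shift += 7;
      }
      return Value;
    }

    int main() {
      // Round-trip a magic-like constant and a small version number.
      std::vector<uint8_t> Buf;
      encodeULEB128(0x1122334455667788ULL, Buf);
      encodeULEB128(103, Buf);
      size_t Pos = 0;
      uint64_t Magic = decodeULEB128(Buf, Pos);
      uint64_t Version = decodeULEB128(Buf, Pos);
      std::cout << std::hex << Magic << " " << std::dec << Version << "\n";
    }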
- if (getFormat() == SPF_Compact_Binary) { + if (Reader.getFormat() == SPF_Compact_Binary) { Ctx.diagnose(DiagnosticInfoSampleProfile( - Buffer->getBufferIdentifier(), + Reader.getBuffer()->getBufferIdentifier(), "Profile data remapping cannot be applied to profile data " "in compact format (original mangled names are not available).", DS_Warning)); - return sampleprof_error::success; - } - - if (Error E = Remappings.read(*Buffer)) { - handleAllErrors( - std::move(E), [&](const SymbolRemappingParseError &ParseError) { - reportError(ParseError.getLineNum(), ParseError.getMessage()); - }); - return sampleprof_error::malformed; + return; } - for (auto &Sample : getProfiles()) - if (auto Key = Remappings.insert(Sample.first())) + assert(Remappings && "should be initialized while creating remapper"); + for (auto &Sample : Reader.getProfiles()) + if (auto Key = Remappings->insert(Sample.first())) SampleMap.insert({Key, &Sample.second}); - return sampleprof_error::success; + RemappingApplied = true; } FunctionSamples * SampleProfileReaderItaniumRemapper::getSamplesFor(StringRef Fname) { - if (auto Key = Remappings.lookup(Fname)) + if (auto Key = Remappings->lookup(Fname)) return SampleMap.lookup(Key); - return SampleProfileReader::getSamplesFor(Fname); + return nullptr; } /// Prepare a memory buffer for the contents of \p Filename. @@ -968,13 +1259,16 @@ setupMemoryBuffer(const Twine &Filename) { /// /// \param C The LLVM context to use to emit diagnostics. /// +/// \param RemapFilename The file used for profile remapping. +/// /// \returns an error code indicating the status of the created reader. ErrorOr> -SampleProfileReader::create(const Twine &Filename, LLVMContext &C) { +SampleProfileReader::create(const std::string Filename, LLVMContext &C, + const std::string RemapFilename) { auto BufferOrError = setupMemoryBuffer(Filename); if (std::error_code EC = BufferOrError.getError()) return EC; - return create(BufferOrError.get(), C); + return create(BufferOrError.get(), C, RemapFilename); } /// Create a sample profile remapper from the given input, to remap the @@ -982,20 +1276,48 @@ SampleProfileReader::create(const Twine &Filename, LLVMContext &C) { /// /// \param Filename The file to open. /// -/// \param C The LLVM context to use to emit diagnostics. +/// \param Reader The profile reader the remapper is going to be applied to. /// -/// \param Underlying The underlying profile data reader to remap. +/// \param C The LLVM context to use to emit diagnostics. /// /// \returns an error code indicating the status of the created reader. -ErrorOr> -SampleProfileReaderItaniumRemapper::create( - const Twine &Filename, LLVMContext &C, - std::unique_ptr Underlying) { +ErrorOr> +SampleProfileReaderItaniumRemapper::create(const std::string Filename, + SampleProfileReader &Reader, + LLVMContext &C) { auto BufferOrError = setupMemoryBuffer(Filename); if (std::error_code EC = BufferOrError.getError()) return EC; - return llvm::make_unique( - std::move(BufferOrError.get()), C, std::move(Underlying)); + return create(BufferOrError.get(), Reader, C); +} + +/// Create a sample profile remapper from the given input, to remap the +/// function names in the given profile data. +/// +/// \param B The memory buffer to create the reader from (assumes ownership). +/// +/// \param C The LLVM context to use to emit diagnostics. +/// +/// \param Reader The profile reader the remapper is going to be applied to. +/// +/// \returns an error code indicating the status of the created reader. 
+ErrorOr> +SampleProfileReaderItaniumRemapper::create(std::unique_ptr &B, + SampleProfileReader &Reader, + LLVMContext &C) { + auto Remappings = std::make_unique(); + if (Error E = Remappings->read(*B.get())) { + handleAllErrors( + std::move(E), [&](const SymbolRemappingParseError &ParseError) { + C.diagnose(DiagnosticInfoSampleProfile(B->getBufferIdentifier(), + ParseError.getLineNum(), + ParseError.getMessage())); + }); + return sampleprof_error::malformed; + } + + return std::make_unique( + std::move(B), std::move(Remappings), Reader); } /// Create a sample profile reader based on the format of the input data. @@ -1004,12 +1326,17 @@ SampleProfileReaderItaniumRemapper::create( /// /// \param C The LLVM context to use to emit diagnostics. /// +/// \param RemapFilename The file used for profile remapping. +/// /// \returns an error code indicating the status of the created reader. ErrorOr> -SampleProfileReader::create(std::unique_ptr &B, LLVMContext &C) { +SampleProfileReader::create(std::unique_ptr &B, LLVMContext &C, + const std::string RemapFilename) { std::unique_ptr Reader; if (SampleProfileReaderRawBinary::hasFormat(*B)) Reader.reset(new SampleProfileReaderRawBinary(std::move(B), C)); + else if (SampleProfileReaderExtBinary::hasFormat(*B)) + Reader.reset(new SampleProfileReaderExtBinary(std::move(B), C)); else if (SampleProfileReaderCompactBinary::hasFormat(*B)) Reader.reset(new SampleProfileReaderCompactBinary(std::move(B), C)); else if (SampleProfileReaderGCC::hasFormat(*B)) @@ -1019,9 +1346,21 @@ SampleProfileReader::create(std::unique_ptr &B, LLVMContext &C) { else return sampleprof_error::unrecognized_format; + if (!RemapFilename.empty()) { + auto ReaderOrErr = + SampleProfileReaderItaniumRemapper::create(RemapFilename, *Reader, C); + if (std::error_code EC = ReaderOrErr.getError()) { + std::string Msg = "Could not create remapper: " + EC.message(); + C.diagnose(DiagnosticInfoSampleProfile(RemapFilename, Msg)); + return EC; + } + Reader->Remapper = std::move(ReaderOrErr.get()); + } + FunctionSamples::Format = Reader->getFormat(); - if (std::error_code EC = Reader->readHeader()) + if (std::error_code EC = Reader->readHeader()) { return EC; + } return std::move(Reader); } diff --git a/lib/ProfileData/SampleProfWriter.cpp b/lib/ProfileData/SampleProfWriter.cpp index 8b876e0aa5d9..8d09af31f94b 100644 --- a/lib/ProfileData/SampleProfWriter.cpp +++ b/lib/ProfileData/SampleProfWriter.cpp @@ -21,6 +21,7 @@ #include "llvm/ADT/StringRef.h" #include "llvm/ProfileData/ProfileCommon.h" #include "llvm/ProfileData/SampleProf.h" +#include "llvm/Support/Compression.h" #include "llvm/Support/Endian.h" #include "llvm/Support/EndianStream.h" #include "llvm/Support/ErrorOr.h" @@ -39,11 +40,8 @@ using namespace llvm; using namespace sampleprof; -std::error_code -SampleProfileWriter::write(const StringMap &ProfileMap) { - if (std::error_code EC = writeHeader(ProfileMap)) - return EC; - +std::error_code SampleProfileWriter::writeFuncProfiles( + const StringMap &ProfileMap) { // Sort the ProfileMap by total samples. 
typedef std::pair NameFunctionSamples; std::vector V; @@ -58,12 +56,161 @@ SampleProfileWriter::write(const StringMap &ProfileMap) { }); for (const auto &I : V) { - if (std::error_code EC = write(*I.second)) + if (std::error_code EC = writeSample(*I.second)) return EC; } return sampleprof_error::success; } +std::error_code +SampleProfileWriter::write(const StringMap &ProfileMap) { + if (std::error_code EC = writeHeader(ProfileMap)) + return EC; + + if (std::error_code EC = writeFuncProfiles(ProfileMap)) + return EC; + + return sampleprof_error::success; +} + +SecHdrTableEntry & +SampleProfileWriterExtBinaryBase::getEntryInLayout(SecType Type) { + auto SecIt = std::find_if( + SectionHdrLayout.begin(), SectionHdrLayout.end(), + [=](const auto &Entry) -> bool { return Entry.Type == Type; }); + return *SecIt; +} + +/// Return the current position and prepare to use it as the start +/// position of a section. +uint64_t SampleProfileWriterExtBinaryBase::markSectionStart(SecType Type) { + uint64_t SectionStart = OutputStream->tell(); + auto &Entry = getEntryInLayout(Type); + // Use LocalBuf as a temporary output for writting data. + if (hasSecFlag(Entry, SecFlagCompress)) + LocalBufStream.swap(OutputStream); + return SectionStart; +} + +std::error_code SampleProfileWriterExtBinaryBase::compressAndOutput() { + if (!llvm::zlib::isAvailable()) + return sampleprof_error::zlib_unavailable; + std::string &UncompressedStrings = + static_cast(LocalBufStream.get())->str(); + if (UncompressedStrings.size() == 0) + return sampleprof_error::success; + auto &OS = *OutputStream; + SmallString<128> CompressedStrings; + llvm::Error E = zlib::compress(UncompressedStrings, CompressedStrings, + zlib::BestSizeCompression); + if (E) + return sampleprof_error::compress_failed; + encodeULEB128(UncompressedStrings.size(), OS); + encodeULEB128(CompressedStrings.size(), OS); + OS << CompressedStrings.str(); + UncompressedStrings.clear(); + return sampleprof_error::success; +} + +/// Add a new section into section header table. +std::error_code +SampleProfileWriterExtBinaryBase::addNewSection(SecType Type, + uint64_t SectionStart) { + auto Entry = getEntryInLayout(Type); + if (hasSecFlag(Entry, SecFlagCompress)) { + LocalBufStream.swap(OutputStream); + if (std::error_code EC = compressAndOutput()) + return EC; + } + SecHdrTable.push_back({Type, Entry.Flags, SectionStart - FileStart, + OutputStream->tell() - SectionStart}); + return sampleprof_error::success; +} + +std::error_code SampleProfileWriterExtBinaryBase::write( + const StringMap &ProfileMap) { + if (std::error_code EC = writeHeader(ProfileMap)) + return EC; + + std::string LocalBuf; + LocalBufStream = std::make_unique(LocalBuf); + if (std::error_code EC = writeSections(ProfileMap)) + return EC; + + if (std::error_code EC = writeSecHdrTable()) + return EC; + + return sampleprof_error::success; +} + +std::error_code +SampleProfileWriterExtBinary::writeSample(const FunctionSamples &S) { + uint64_t Offset = OutputStream->tell(); + StringRef Name = S.getName(); + FuncOffsetTable[Name] = Offset - SecLBRProfileStart; + encodeULEB128(S.getHeadSamples(), *OutputStream); + return writeBody(S); +} + +std::error_code SampleProfileWriterExtBinary::writeFuncOffsetTable() { + auto &OS = *OutputStream; + + // Write out the table size. + encodeULEB128(FuncOffsetTable.size(), OS); + + // Write out FuncOffsetTable. 
+ for (auto entry : FuncOffsetTable) { + writeNameIdx(entry.first); + encodeULEB128(entry.second, OS); + } + return sampleprof_error::success; +} + +std::error_code SampleProfileWriterExtBinary::writeSections( + const StringMap &ProfileMap) { + uint64_t SectionStart = markSectionStart(SecProfSummary); + computeSummary(ProfileMap); + if (auto EC = writeSummary()) + return EC; + if (std::error_code EC = addNewSection(SecProfSummary, SectionStart)) + return EC; + + // Generate the name table for all the functions referenced in the profile. + SectionStart = markSectionStart(SecNameTable); + for (const auto &I : ProfileMap) { + addName(I.first()); + addNames(I.second); + } + writeNameTable(); + if (std::error_code EC = addNewSection(SecNameTable, SectionStart)) + return EC; + + SectionStart = markSectionStart(SecLBRProfile); + SecLBRProfileStart = OutputStream->tell(); + if (std::error_code EC = writeFuncProfiles(ProfileMap)) + return EC; + if (std::error_code EC = addNewSection(SecLBRProfile, SectionStart)) + return EC; + + if (ProfSymList && ProfSymList->toCompress()) + setToCompressSection(SecProfileSymbolList); + + SectionStart = markSectionStart(SecProfileSymbolList); + if (ProfSymList && ProfSymList->size() > 0) + if (std::error_code EC = ProfSymList->write(*OutputStream)) + return EC; + if (std::error_code EC = addNewSection(SecProfileSymbolList, SectionStart)) + return EC; + + SectionStart = markSectionStart(SecFuncOffsetTable); + if (std::error_code EC = writeFuncOffsetTable()) + return EC; + if (std::error_code EC = addNewSection(SecFuncOffsetTable, SectionStart)) + return EC; + + return sampleprof_error::success; +} + std::error_code SampleProfileWriterCompactBinary::write( const StringMap &ProfileMap) { if (std::error_code EC = SampleProfileWriter::write(ProfileMap)) @@ -81,7 +228,7 @@ std::error_code SampleProfileWriterCompactBinary::write( /// /// The format used here is more structured and deliberate because /// it needs to be parsed by the SampleProfileReaderText class. -std::error_code SampleProfileWriterText::write(const FunctionSamples &S) { +std::error_code SampleProfileWriterText::writeSample(const FunctionSamples &S) { auto &OS = *OutputStream; OS << S.getName() << ":" << S.getTotalSamples(); if (Indent == 0) @@ -100,8 +247,8 @@ std::error_code SampleProfileWriterText::write(const FunctionSamples &S) { OS << Sample.getSamples(); - for (const auto &J : Sample.getCallTargets()) - OS << " " << J.first() << ":" << J.second; + for (const auto &J : Sample.getSortedCallTargets()) + OS << " " << J.first << ":" << J.second; OS << "\n"; } @@ -117,7 +264,7 @@ std::error_code SampleProfileWriterText::write(const FunctionSamples &S) { OS << Loc.LineOffset << ": "; else OS << Loc.LineOffset << "." << Loc.Discriminator << ": "; - if (std::error_code EC = write(CalleeSamples)) + if (std::error_code EC = writeSample(CalleeSamples)) return EC; } Indent -= 1; @@ -163,7 +310,7 @@ void SampleProfileWriterBinary::stablizeNameTable(std::set &V) { NameTable[N] = i++; } -std::error_code SampleProfileWriterRawBinary::writeNameTable() { +std::error_code SampleProfileWriterBinary::writeNameTable() { auto &OS = *OutputStream; std::set V; stablizeNameTable(V); @@ -214,25 +361,18 @@ std::error_code SampleProfileWriterCompactBinary::writeNameTable() { return sampleprof_error::success; } -std::error_code SampleProfileWriterRawBinary::writeMagicIdent() { - auto &OS = *OutputStream; - // Write file magic identifier. 
- encodeULEB128(SPMagic(), OS); - encodeULEB128(SPVersion(), OS); - return sampleprof_error::success; -} - -std::error_code SampleProfileWriterCompactBinary::writeMagicIdent() { +std::error_code +SampleProfileWriterBinary::writeMagicIdent(SampleProfileFormat Format) { auto &OS = *OutputStream; // Write file magic identifier. - encodeULEB128(SPMagic(SPF_Compact_Binary), OS); + encodeULEB128(SPMagic(Format), OS); encodeULEB128(SPVersion(), OS); return sampleprof_error::success; } std::error_code SampleProfileWriterBinary::writeHeader( const StringMap &ProfileMap) { - writeMagicIdent(); + writeMagicIdent(Format); computeSummary(ProfileMap); if (auto EC = writeSummary()) @@ -248,6 +388,82 @@ std::error_code SampleProfileWriterBinary::writeHeader( return sampleprof_error::success; } +void SampleProfileWriterExtBinaryBase::setToCompressAllSections() { + for (auto &Entry : SectionHdrLayout) + addSecFlags(Entry, SecFlagCompress); +} + +void SampleProfileWriterExtBinaryBase::setToCompressSection(SecType Type) { + addSectionFlags(Type, SecFlagCompress); +} + +void SampleProfileWriterExtBinaryBase::addSectionFlags(SecType Type, + SecFlags Flags) { + for (auto &Entry : SectionHdrLayout) { + if (Entry.Type == Type) + addSecFlags(Entry, Flags); + } +} + +void SampleProfileWriterExtBinaryBase::allocSecHdrTable() { + support::endian::Writer Writer(*OutputStream, support::little); + + Writer.write(static_cast(SectionHdrLayout.size())); + SecHdrTableOffset = OutputStream->tell(); + for (uint32_t i = 0; i < SectionHdrLayout.size(); i++) { + Writer.write(static_cast(-1)); + Writer.write(static_cast(-1)); + Writer.write(static_cast(-1)); + Writer.write(static_cast(-1)); + } +} + +std::error_code SampleProfileWriterExtBinaryBase::writeSecHdrTable() { + auto &OFS = static_cast(*OutputStream); + uint64_t Saved = OutputStream->tell(); + + // Set OutputStream to the location saved in SecHdrTableOffset. + if (OFS.seek(SecHdrTableOffset) == (uint64_t)-1) + return sampleprof_error::ostream_seek_unsupported; + support::endian::Writer Writer(*OutputStream, support::little); + + DenseMap IndexMap; + for (uint32_t i = 0; i < SecHdrTable.size(); i++) { + IndexMap.insert({static_cast(SecHdrTable[i].Type), i}); + } + + // Write the section header table in the order specified in + // SectionHdrLayout. That is the sections order Reader will see. + // Note that the sections order in which Reader expects to read + // may be different from the order in which Writer is able to + // write, so we need to adjust the order in SecHdrTable to be + // consistent with SectionHdrLayout when we write SecHdrTable + // to the memory. + for (uint32_t i = 0; i < SectionHdrLayout.size(); i++) { + uint32_t idx = IndexMap[static_cast(SectionHdrLayout[i].Type)]; + Writer.write(static_cast(SecHdrTable[idx].Type)); + Writer.write(static_cast(SecHdrTable[idx].Flags)); + Writer.write(static_cast(SecHdrTable[idx].Offset)); + Writer.write(static_cast(SecHdrTable[idx].Size)); + } + + // Reset OutputStream. 
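allocSecHdrTable and writeSecHdrTable above reserve fixed-size header slots up front, write each section while recording its type, offset, and size, and finally seek back to patch the reserved slots in the layout order the reader expects. A standalone sketch of that reserve-then-patch pattern on an in-memory buffer, with illustrative names and without the flags and reader/writer ordering details:

    #include <cstdint>
    #include <cstring>
    #include <iostream>
    #include <string>
    #include <vector>

    struct SecHdrEntry {
      uint64_t Type, Offset, Size;
    };

    class SectionWriter {
      std::vector<uint8_t> Buf;
      size_t HdrOffset = 0;
      size_t NumSlots = 0;
      std::vector<SecHdrEntry> Entries;

      void writeU64(size_t At, uint64_t V) {
        std::memcpy(&Buf[At], &V, sizeof(V));
      }

    public:
      // Reserve NumSections fixed-size header slots; their contents are not
      // known until every section has been written.
      void allocHeader(size_t NumSections) {
        HdrOffset = Buf.size();
        NumSlots = NumSections;
        Buf.resize(Buf.size() + NumSections * 3 * sizeof(uint64_t), 0xff);
      }

      // Append a section body and remember where it landed.
      void writeSection(uint64_t Type, const std::string &Body) {
        Entries.push_back({Type, Buf.size(), Body.size()});
        Buf.insert(Buf.end(), Body.begin(), Body.end());
      }

      // "Seek back" and patch the reserved slots now that offsets and sizes
      // are known.
      void patchHeader() {
        for (size_t I = 0; I < Entries.size() && I < NumSlots; ++I) {
          size_t Slot = HdrOffset + I * 3 * sizeof(uint64_t);
          writeU64(Slot, Entries[I].Type);
          writeU64(Slot + 8, Entries[I].Offset);
          writeU64(Slot + 16, Entries[I].Size);
        }
      }

      const std::vector<uint8_t> &buffer() const { return Buf; }
    };

    int main() {
      SectionWriter W;
      W.allocHeader(2);
      W.writeSection(/*Type=*/1, "name table bytes");
      W.writeSection(/*Type=*/2, "profile bytes");
      W.patchHeader();
      std::cout << "total bytes: " << W.buffer().size() << "\n";
    }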
+ if (OFS.seek(Saved) == (uint64_t)-1) + return sampleprof_error::ostream_seek_unsupported; + + return sampleprof_error::success; +} + +std::error_code SampleProfileWriterExtBinaryBase::writeHeader( + const StringMap &ProfileMap) { + auto &OS = *OutputStream; + FileStart = OS.tell(); + writeMagicIdent(Format); + + allocSecHdrTable(); + return sampleprof_error::success; +} + std::error_code SampleProfileWriterCompactBinary::writeHeader( const StringMap &ProfileMap) { support::endian::Writer Writer(*OutputStream, support::little); @@ -294,8 +510,8 @@ std::error_code SampleProfileWriterBinary::writeBody(const FunctionSamples &S) { encodeULEB128(Loc.Discriminator, OS); encodeULEB128(Sample.getSamples(), OS); encodeULEB128(Sample.getCallTargets().size(), OS); - for (const auto &J : Sample.getCallTargets()) { - StringRef Callee = J.first(); + for (const auto &J : Sample.getSortedCallTargets()) { + StringRef Callee = J.first; uint64_t CalleeSamples = J.second; if (std::error_code EC = writeNameIdx(Callee)) return EC; @@ -324,13 +540,14 @@ std::error_code SampleProfileWriterBinary::writeBody(const FunctionSamples &S) { /// Write samples of a top-level function to a binary file. /// /// \returns true if the samples were written successfully, false otherwise. -std::error_code SampleProfileWriterBinary::write(const FunctionSamples &S) { +std::error_code +SampleProfileWriterBinary::writeSample(const FunctionSamples &S) { encodeULEB128(S.getHeadSamples(), *OutputStream); return writeBody(S); } std::error_code -SampleProfileWriterCompactBinary::write(const FunctionSamples &S) { +SampleProfileWriterCompactBinary::writeSample(const FunctionSamples &S) { uint64_t Offset = OutputStream->tell(); StringRef Name = S.getName(); FuncOffsetTable[Name] = Offset; @@ -349,10 +566,11 @@ ErrorOr> SampleProfileWriter::create(StringRef Filename, SampleProfileFormat Format) { std::error_code EC; std::unique_ptr OS; - if (Format == SPF_Binary || Format == SPF_Compact_Binary) - OS.reset(new raw_fd_ostream(Filename, EC, sys::fs::F_None)); + if (Format == SPF_Binary || Format == SPF_Ext_Binary || + Format == SPF_Compact_Binary) + OS.reset(new raw_fd_ostream(Filename, EC, sys::fs::OF_None)); else - OS.reset(new raw_fd_ostream(Filename, EC, sys::fs::F_Text)); + OS.reset(new raw_fd_ostream(Filename, EC, sys::fs::OF_Text)); if (EC) return EC; @@ -374,6 +592,8 @@ SampleProfileWriter::create(std::unique_ptr &OS, if (Format == SPF_Binary) Writer.reset(new SampleProfileWriterRawBinary(OS)); + else if (Format == SPF_Ext_Binary) + Writer.reset(new SampleProfileWriterExtBinary(OS)); else if (Format == SPF_Compact_Binary) Writer.reset(new SampleProfileWriterCompactBinary(OS)); else if (Format == SPF_Text) @@ -386,6 +606,7 @@ SampleProfileWriter::create(std::unique_ptr &OS, if (EC) return EC; + Writer->Format = Format; return std::move(Writer); } diff --git a/lib/Remarks/BitstreamRemarkParser.cpp b/lib/Remarks/BitstreamRemarkParser.cpp new file mode 100644 index 000000000000..99a82e1ee3af --- /dev/null +++ b/lib/Remarks/BitstreamRemarkParser.cpp @@ -0,0 +1,597 @@ +//===- BitstreamRemarkParser.cpp ------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file provides utility methods used by clients that want to use the +// parser for remark diagnostics in LLVM. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Remarks/BitstreamRemarkParser.h" +#include "BitstreamRemarkParser.h" +#include "llvm/Remarks/BitstreamRemarkContainer.h" +#include "llvm/Support/MemoryBuffer.h" +#include "llvm/Support/Path.h" + +using namespace llvm; +using namespace llvm::remarks; + +static Error unknownRecord(const char *BlockName, unsigned RecordID) { + return createStringError( + std::make_error_code(std::errc::illegal_byte_sequence), + "Error while parsing %s: unknown record entry (%lu).", BlockName, + RecordID); +} + +static Error malformedRecord(const char *BlockName, const char *RecordName) { + return createStringError( + std::make_error_code(std::errc::illegal_byte_sequence), + "Error while parsing %s: malformed record entry (%s).", BlockName, + RecordName); +} + +BitstreamMetaParserHelper::BitstreamMetaParserHelper( + BitstreamCursor &Stream, BitstreamBlockInfo &BlockInfo) + : Stream(Stream), BlockInfo(BlockInfo) {} + +/// Parse a record and fill in the fields in the parser. +static Error parseRecord(BitstreamMetaParserHelper &Parser, unsigned Code) { + BitstreamCursor &Stream = Parser.Stream; + // Note: 2 is used here because it's the max number of fields we have per + // record. + SmallVector Record; + StringRef Blob; + Expected RecordID = Stream.readRecord(Code, Record, &Blob); + if (!RecordID) + return RecordID.takeError(); + + switch (*RecordID) { + case RECORD_META_CONTAINER_INFO: { + if (Record.size() != 2) + return malformedRecord("BLOCK_META", "RECORD_META_CONTAINER_INFO"); + Parser.ContainerVersion = Record[0]; + Parser.ContainerType = Record[1]; + break; + } + case RECORD_META_REMARK_VERSION: { + if (Record.size() != 1) + return malformedRecord("BLOCK_META", "RECORD_META_REMARK_VERSION"); + Parser.RemarkVersion = Record[0]; + break; + } + case RECORD_META_STRTAB: { + if (Record.size() != 0) + return malformedRecord("BLOCK_META", "RECORD_META_STRTAB"); + Parser.StrTabBuf = Blob; + break; + } + case RECORD_META_EXTERNAL_FILE: { + if (Record.size() != 0) + return malformedRecord("BLOCK_META", "RECORD_META_EXTERNAL_FILE"); + Parser.ExternalFilePath = Blob; + break; + } + default: + return unknownRecord("BLOCK_META", *RecordID); + } + return Error::success(); +} + +BitstreamRemarkParserHelper::BitstreamRemarkParserHelper( + BitstreamCursor &Stream) + : Stream(Stream) {} + +/// Parse a record and fill in the fields in the parser. +static Error parseRecord(BitstreamRemarkParserHelper &Parser, unsigned Code) { + BitstreamCursor &Stream = Parser.Stream; + // Note: 5 is used here because it's the max number of fields we have per + // record. 
+ SmallVector Record; + StringRef Blob; + Expected RecordID = Stream.readRecord(Code, Record, &Blob); + if (!RecordID) + return RecordID.takeError(); + + switch (*RecordID) { + case RECORD_REMARK_HEADER: { + if (Record.size() != 4) + return malformedRecord("BLOCK_REMARK", "RECORD_REMARK_HEADER"); + Parser.Type = Record[0]; + Parser.RemarkNameIdx = Record[1]; + Parser.PassNameIdx = Record[2]; + Parser.FunctionNameIdx = Record[3]; + break; + } + case RECORD_REMARK_DEBUG_LOC: { + if (Record.size() != 3) + return malformedRecord("BLOCK_REMARK", "RECORD_REMARK_DEBUG_LOC"); + Parser.SourceFileNameIdx = Record[0]; + Parser.SourceLine = Record[1]; + Parser.SourceColumn = Record[2]; + break; + } + case RECORD_REMARK_HOTNESS: { + if (Record.size() != 1) + return malformedRecord("BLOCK_REMARK", "RECORD_REMARK_HOTNESS"); + Parser.Hotness = Record[0]; + break; + } + case RECORD_REMARK_ARG_WITH_DEBUGLOC: { + if (Record.size() != 5) + return malformedRecord("BLOCK_REMARK", "RECORD_REMARK_ARG_WITH_DEBUGLOC"); + // Create a temporary argument. Use that as a valid memory location for this + // argument entry. + Parser.TmpArgs.emplace_back(); + Parser.TmpArgs.back().KeyIdx = Record[0]; + Parser.TmpArgs.back().ValueIdx = Record[1]; + Parser.TmpArgs.back().SourceFileNameIdx = Record[2]; + Parser.TmpArgs.back().SourceLine = Record[3]; + Parser.TmpArgs.back().SourceColumn = Record[4]; + Parser.Args = + ArrayRef(Parser.TmpArgs); + break; + } + case RECORD_REMARK_ARG_WITHOUT_DEBUGLOC: { + if (Record.size() != 2) + return malformedRecord("BLOCK_REMARK", + "RECORD_REMARK_ARG_WITHOUT_DEBUGLOC"); + // Create a temporary argument. Use that as a valid memory location for this + // argument entry. + Parser.TmpArgs.emplace_back(); + Parser.TmpArgs.back().KeyIdx = Record[0]; + Parser.TmpArgs.back().ValueIdx = Record[1]; + Parser.Args = + ArrayRef(Parser.TmpArgs); + break; + } + default: + return unknownRecord("BLOCK_REMARK", *RecordID); + } + return Error::success(); +} + +template +static Error parseBlock(T &ParserHelper, unsigned BlockID, + const char *BlockName) { + BitstreamCursor &Stream = ParserHelper.Stream; + Expected Next = Stream.advance(); + if (!Next) + return Next.takeError(); + if (Next->Kind != BitstreamEntry::SubBlock || Next->ID != BlockID) + return createStringError( + std::make_error_code(std::errc::illegal_byte_sequence), + "Error while parsing %s: expecting [ENTER_SUBBLOCK, %s, ...].", + BlockName, BlockName); + if (Stream.EnterSubBlock(BlockID)) + return createStringError( + std::make_error_code(std::errc::illegal_byte_sequence), + "Error while entering %s.", BlockName); + + // Stop when there is nothing to read anymore or when we encounter an + // END_BLOCK. + while (!Stream.AtEndOfStream()) { + Expected Next = Stream.advance(); + if (!Next) + return Next.takeError(); + switch (Next->Kind) { + case BitstreamEntry::EndBlock: + return Error::success(); + case BitstreamEntry::Error: + case BitstreamEntry::SubBlock: + return createStringError( + std::make_error_code(std::errc::illegal_byte_sequence), + "Error while parsing %s: expecting records.", BlockName); + case BitstreamEntry::Record: + if (Error E = parseRecord(ParserHelper, Next->ID)) + return E; + continue; + } + } + // If we're here, it means we didn't get an END_BLOCK yet, but we're at the + // end of the stream. In this case, error. 
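parseRecord above accepts a record only if its ID is known and its field count matches what that ID requires, reporting unknownRecord or malformedRecord otherwise. A standalone sketch of that validation style over plain integer records; the record IDs and field counts below are illustrative, not the real remark container layout:

    #include <cstdint>
    #include <iostream>
    #include <string>
    #include <vector>

    enum RecordID : unsigned {
      RECORD_HEADER = 1,    // expects 4 fields
      RECORD_DEBUG_LOC = 2, // expects 3 fields
      RECORD_HOTNESS = 3,   // expects 1 field
    };

    struct ParsedRemark {
      uint64_t Type = 0, NameIdx = 0, PassIdx = 0, FuncIdx = 0;
      uint64_t File = 0, Line = 0, Column = 0;
      uint64_t Hotness = 0;
    };

    // Returns an empty string on success, otherwise a diagnostic in the spirit
    // of unknownRecord()/malformedRecord() above.
    static std::string parseRecord(unsigned ID,
                                   const std::vector<uint64_t> &Fields,
                                   ParsedRemark &R) {
      auto Malformed = [](const char *Name) {
        return std::string("malformed record entry (") + Name + ")";
      };
      switch (ID) {
      case RECORD_HEADER:
        if (Fields.size() != 4)
          return Malformed("RECORD_HEADER");
        R.Type = Fields[0]; R.NameIdx = Fields[1];
        R.PassIdx = Fields[2]; R.FuncIdx = Fields[3];
        return "";
      case RECORD_DEBUG_LOC:
        if (Fields.size() != 3)
          return Malformed("RECORD_DEBUG_LOC");
        R.File = Fields[0]; R.Line = Fields[1]; R.Column = Fields[2];
        return "";
      case RECORD_HOTNESS:
        if (Fields.size() != 1)
          return Malformed("RECORD_HOTNESS");
        R.Hotness = Fields[0];
        return "";
      default:
        return "unknown record entry (" + std::to_string(ID) + ")";
      }
    }

    int main() {
      ParsedRemark R;
      std::cout << parseRecord(RECORD_HEADER, {0, 1, 2, 3}, R).empty() << "\n"; // 1
      std::cout << parseRecord(RECORD_HOTNESS, {7, 8}, R) << "\n"; // malformed ...
      std::cout << parseRecord(99, {1}, R) << "\n";                // unknown ...
    }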
+  return createStringError(
+      std::make_error_code(std::errc::illegal_byte_sequence),
+      "Error while parsing %s: unterminated block.", BlockName);
+}
+
+Error BitstreamMetaParserHelper::parse() {
+  return parseBlock(*this, META_BLOCK_ID, "META_BLOCK");
+}
+
+Error BitstreamRemarkParserHelper::parse() {
+  return parseBlock(*this, REMARK_BLOCK_ID, "REMARK_BLOCK");
+}
+
+BitstreamParserHelper::BitstreamParserHelper(StringRef Buffer)
+    : Stream(Buffer) {}
+
+Expected<std::array<char, 4>> BitstreamParserHelper::parseMagic() {
+  std::array<char, 4> Result;
+  for (unsigned i = 0; i < 4; ++i)
+    if (Expected<unsigned> R = Stream.Read(8))
+      Result[i] = *R;
+    else
+      return R.takeError();
+  return Result;
+}
+
+Error BitstreamParserHelper::parseBlockInfoBlock() {
+  Expected<BitstreamEntry> Next = Stream.advance();
+  if (!Next)
+    return Next.takeError();
+  if (Next->Kind != BitstreamEntry::SubBlock ||
+      Next->ID != llvm::bitc::BLOCKINFO_BLOCK_ID)
+    return createStringError(
+        std::make_error_code(std::errc::illegal_byte_sequence),
+        "Error while parsing BLOCKINFO_BLOCK: expecting [ENTER_SUBBLOCK, "
+        "BLOCKINFO_BLOCK, ...].");
+
+  Expected<Optional<BitstreamBlockInfo>> MaybeBlockInfo =
+      Stream.ReadBlockInfoBlock();
+  if (!MaybeBlockInfo)
+    return MaybeBlockInfo.takeError();
+
+  if (!*MaybeBlockInfo)
+    return createStringError(
+        std::make_error_code(std::errc::illegal_byte_sequence),
+        "Error while parsing BLOCKINFO_BLOCK.");
+
+  BlockInfo = **MaybeBlockInfo;
+
+  Stream.setBlockInfo(&BlockInfo);
+  return Error::success();
+}
+
+static Expected<bool> isBlock(BitstreamCursor &Stream, unsigned BlockID) {
+  bool Result = false;
+  uint64_t PreviousBitNo = Stream.GetCurrentBitNo();
+  Expected<BitstreamEntry> Next = Stream.advance();
+  if (!Next)
+    return Next.takeError();
+  switch (Next->Kind) {
+  case BitstreamEntry::SubBlock:
+    // Check for the block id.
+    Result = Next->ID == BlockID;
+    break;
+  case BitstreamEntry::Error:
+    return createStringError(
+        std::make_error_code(std::errc::illegal_byte_sequence),
+        "Unexpected error while parsing bitstream.");
+  default:
+    Result = false;
+    break;
+  }
+  if (Error E = Stream.JumpToBit(PreviousBitNo))
+    return std::move(E);
+  return Result;
+}
+
+Expected<bool> BitstreamParserHelper::isMetaBlock() {
+  return isBlock(Stream, META_BLOCK_ID);
+}
+
+Expected<bool> BitstreamParserHelper::isRemarkBlock() {
+  return isBlock(Stream, REMARK_BLOCK_ID);
+}
+
+static Error validateMagicNumber(StringRef Magic) {
+  if (Magic != remarks::ContainerMagic)
+    return createStringError(std::make_error_code(std::errc::invalid_argument),
+                             "Unknown magic number: expecting %s, got %.4s.",
+                             remarks::ContainerMagic.data(), Magic.data());
+  return Error::success();
+}
+
+static Error advanceToMetaBlock(BitstreamParserHelper &Helper) {
+  Expected<std::array<char, 4>> Magic = Helper.parseMagic();
+  if (!Magic)
+    return Magic.takeError();
+  if (Error E = validateMagicNumber(StringRef(Magic->data(), Magic->size())))
+    return E;
+  if (Error E = Helper.parseBlockInfoBlock())
+    return E;
+  Expected<bool> isMetaBlock = Helper.isMetaBlock();
+  if (!isMetaBlock)
+    return isMetaBlock.takeError();
+  if (!*isMetaBlock)
+    return createStringError(
+        std::make_error_code(std::errc::illegal_byte_sequence),
+        "Expecting META_BLOCK after the BLOCKINFO_BLOCK.");
+  return Error::success();
+}
+
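The public entry point added later in this patch, createRemarkParserFromMeta() in lib/Remarks/RemarkParser.cpp, forwards to the factory defined just below. For orientation, a minimal consumer might look like the following sketch. It is illustrative only and not part of the patch: dumpRemarkNames is a hypothetical helper, and the end-of-stream handling assumes the EndOfFileError sentinel that next() reports once the container is exhausted.

#include "llvm/Remarks/Remark.h"
#include "llvm/Remarks/RemarkParser.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
using namespace llvm::remarks;

// Print the name of every remark contained in Buffer, which is expected to
// hold a bitstream remark container. Any error other than end-of-file is
// propagated to the caller.
static Error dumpRemarkNames(StringRef Buffer, raw_ostream &OS) {
  Expected<std::unique_ptr<RemarkParser>> MaybeParser =
      createRemarkParserFromMeta(Format::Bitstream, Buffer,
                                 /*StrTab=*/None,
                                 /*ExternalFilePrependPath=*/None);
  if (!MaybeParser)
    return MaybeParser.takeError();
  while (true) {
    Expected<std::unique_ptr<Remark>> MaybeRemark = (*MaybeParser)->next();
    if (!MaybeRemark) {
      Error E = MaybeRemark.takeError();
      if (E.isA<EndOfFileError>()) { // The parser signals a clean end of stream.
        consumeError(std::move(E));
        return Error::success();
      }
      return E;
    }
    OS << (*MaybeRemark)->RemarkName << "\n";
  }
}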
+Expected<std::unique_ptr<BitstreamRemarkParser>>
+remarks::createBitstreamParserFromMeta(
+    StringRef Buf, Optional<ParsedStringTable> StrTab,
+    Optional<StringRef> ExternalFilePrependPath) {
+  BitstreamParserHelper Helper(Buf);
+  Expected<std::array<char, 4>> Magic = Helper.parseMagic();
+  if (!Magic)
+    return Magic.takeError();
+
+  if (Error E = validateMagicNumber(StringRef(Magic->data(), Magic->size())))
+    return std::move(E);
+
+  auto Parser =
+      StrTab ? std::make_unique<BitstreamRemarkParser>(Buf, std::move(*StrTab))
+             : std::make_unique<BitstreamRemarkParser>(Buf);
+
+  if (ExternalFilePrependPath)
+    Parser->ExternalFilePrependPath = *ExternalFilePrependPath;
+
+  return std::move(Parser);
+}
+
+Expected<std::unique_ptr<Remark>> BitstreamRemarkParser::next() {
+  if (ParserHelper.atEndOfStream())
+    return make_error<EndOfFileError>();
+
+  if (!ReadyToParseRemarks) {
+    if (Error E = parseMeta())
+      return std::move(E);
+    ReadyToParseRemarks = true;
+  }
+
+  return parseRemark();
+}
+
+Error BitstreamRemarkParser::parseMeta() {
+  // Advance to the meta block.
+  if (Error E = advanceToMetaBlock(ParserHelper))
+    return E;
+
+  BitstreamMetaParserHelper MetaHelper(ParserHelper.Stream,
+                                       ParserHelper.BlockInfo);
+  if (Error E = MetaHelper.parse())
+    return E;
+
+  if (Error E = processCommonMeta(MetaHelper))
+    return E;
+
+  switch (ContainerType) {
+  case BitstreamRemarkContainerType::Standalone:
+    return processStandaloneMeta(MetaHelper);
+  case BitstreamRemarkContainerType::SeparateRemarksFile:
+    return processSeparateRemarksFileMeta(MetaHelper);
+  case BitstreamRemarkContainerType::SeparateRemarksMeta:
+    return processSeparateRemarksMetaMeta(MetaHelper);
+  }
+  llvm_unreachable("Unknown BitstreamRemarkContainerType enum");
+}
+
+Error BitstreamRemarkParser::processCommonMeta(
+    BitstreamMetaParserHelper &MetaHelper) {
+  if (Optional<uint64_t> Version = MetaHelper.ContainerVersion)
+    ContainerVersion = *Version;
+  else
+    return createStringError(
+        std::make_error_code(std::errc::illegal_byte_sequence),
+        "Error while parsing BLOCK_META: missing container version.");
+
+  if (Optional<uint64_t> Type = MetaHelper.ContainerType) {
+    // Always >= BitstreamRemarkContainerType::First since it's unsigned.
+    if (*Type > static_cast<uint64_t>(BitstreamRemarkContainerType::Last))
+      return createStringError(
+          std::make_error_code(std::errc::illegal_byte_sequence),
+          "Error while parsing BLOCK_META: invalid container type.");
+
+    ContainerType = static_cast<BitstreamRemarkContainerType>(*Type);
+  } else
+    return createStringError(
+        std::make_error_code(std::errc::illegal_byte_sequence),
+        "Error while parsing BLOCK_META: missing container type.");
+
+  return Error::success();
+}
+
+static Error processStrTab(BitstreamRemarkParser &P,
+                           Optional<StringRef> StrTabBuf) {
+  if (!StrTabBuf)
+    return createStringError(
+        std::make_error_code(std::errc::illegal_byte_sequence),
+        "Error while parsing BLOCK_META: missing string table.");
+  // Parse and assign the string table.
+  P.StrTab.emplace(*StrTabBuf);
+  return Error::success();
+}
+
+static Error processRemarkVersion(BitstreamRemarkParser &P,
+                                  Optional<uint64_t> RemarkVersion) {
+  if (!RemarkVersion)
+    return createStringError(
+        std::make_error_code(std::errc::illegal_byte_sequence),
+        "Error while parsing BLOCK_META: missing remark version.");
+  P.RemarkVersion = *RemarkVersion;
+  return Error::success();
+}
+
+Error BitstreamRemarkParser::processExternalFilePath(
+    Optional<StringRef> ExternalFilePath) {
+  if (!ExternalFilePath)
+    return createStringError(
+        std::make_error_code(std::errc::illegal_byte_sequence),
+        "Error while parsing BLOCK_META: missing external file path.");
+
+  SmallString<80> FullPath(ExternalFilePrependPath);
+  sys::path::append(FullPath, *ExternalFilePath);
+
+  // External file: open the external file, parse it, check if its metadata
+  // matches the one from the separate metadata, then replace the current parser
+  // with the one parsing the remarks.
+  ErrorOr<std::unique_ptr<MemoryBuffer>> BufferOrErr =
+      MemoryBuffer::getFile(FullPath);
+  if (std::error_code EC = BufferOrErr.getError())
+    return createFileError(FullPath, EC);
+  TmpRemarkBuffer = std::move(*BufferOrErr);
+
+  // Create a separate parser used for parsing the separate file.
+  ParserHelper = BitstreamParserHelper(TmpRemarkBuffer->getBuffer());
+  // Advance and check until we can parse the meta block.
+  if (Error E = advanceToMetaBlock(ParserHelper))
+    return E;
+  // Parse the meta from the separate file.
+  // Note: here we overwrite the BlockInfo with the one from the file. This will
+  // be used to parse the rest of the file.
+  BitstreamMetaParserHelper SeparateMetaHelper(ParserHelper.Stream,
+                                               ParserHelper.BlockInfo);
+  if (Error E = SeparateMetaHelper.parse())
+    return E;
+
+  uint64_t PreviousContainerVersion = ContainerVersion;
+  if (Error E = processCommonMeta(SeparateMetaHelper))
+    return E;
+
+  if (ContainerType != BitstreamRemarkContainerType::SeparateRemarksFile)
+    return createStringError(
+        std::make_error_code(std::errc::illegal_byte_sequence),
+        "Error while parsing external file's BLOCK_META: wrong container "
+        "type.");
+
+  if (PreviousContainerVersion != ContainerVersion)
+    return createStringError(
+        std::make_error_code(std::errc::illegal_byte_sequence),
+        "Error while parsing external file's BLOCK_META: mismatching versions: "
+        "original meta: %lu, external file meta: %lu.",
+        PreviousContainerVersion, ContainerVersion);
+
+  // Process the meta from the separate file.
+  return processSeparateRemarksFileMeta(SeparateMetaHelper);
+}
+
+Error BitstreamRemarkParser::processStandaloneMeta(
+    BitstreamMetaParserHelper &Helper) {
+  if (Error E = processStrTab(*this, Helper.StrTabBuf))
+    return E;
+  return processRemarkVersion(*this, Helper.RemarkVersion);
+}
+
+Error BitstreamRemarkParser::processSeparateRemarksFileMeta(
+    BitstreamMetaParserHelper &Helper) {
+  return processRemarkVersion(*this, Helper.RemarkVersion);
+}
+
+Error BitstreamRemarkParser::processSeparateRemarksMetaMeta(
+    BitstreamMetaParserHelper &Helper) {
+  if (Error E = processStrTab(*this, Helper.StrTabBuf))
+    return E;
+  return processExternalFilePath(Helper.ExternalFilePath);
+}
+
+Expected<std::unique_ptr<Remark>> BitstreamRemarkParser::parseRemark() {
+  BitstreamRemarkParserHelper RemarkHelper(ParserHelper.Stream);
+  if (Error E = RemarkHelper.parse())
+    return std::move(E);
+
+  return processRemark(RemarkHelper);
+}
+
+Expected<std::unique_ptr<Remark>>
+BitstreamRemarkParser::processRemark(BitstreamRemarkParserHelper &Helper) {
+  std::unique_ptr<Remark> Result = std::make_unique<Remark>();
+  Remark &R = *Result;
+
+  if (StrTab == None)
+    return createStringError(
+        std::make_error_code(std::errc::invalid_argument),
+        "Error while parsing BLOCK_REMARK: missing string table.");
+
+  if (!Helper.Type)
+    return createStringError(
+        std::make_error_code(std::errc::illegal_byte_sequence),
+        "Error while parsing BLOCK_REMARK: missing remark type.");
+
+  // Always >= Type::First since it's unsigned.
+  if (*Helper.Type > static_cast<uint64_t>(Type::Last))
+    return createStringError(
+        std::make_error_code(std::errc::illegal_byte_sequence),
+        "Error while parsing BLOCK_REMARK: unknown remark type.");
+
+  R.RemarkType = static_cast<Type>(*Helper.Type);
+
+  if (!Helper.RemarkNameIdx)
+    return createStringError(
+        std::make_error_code(std::errc::illegal_byte_sequence),
+        "Error while parsing BLOCK_REMARK: missing remark name.");
+
+  if (Expected<StringRef> RemarkName = (*StrTab)[*Helper.RemarkNameIdx])
+    R.RemarkName = *RemarkName;
+  else
+    return RemarkName.takeError();
+
+  if (!Helper.PassNameIdx)
+    return createStringError(
+        std::make_error_code(std::errc::illegal_byte_sequence),
+        "Error while parsing BLOCK_REMARK: missing remark pass.");
+
+  if (Expected<StringRef> PassName = (*StrTab)[*Helper.PassNameIdx])
+    R.PassName = *PassName;
+  else
+    return PassName.takeError();
+
+  if (!Helper.FunctionNameIdx)
+    return createStringError(
+        std::make_error_code(std::errc::illegal_byte_sequence),
+        "Error while parsing BLOCK_REMARK: missing remark function name.");
+  if (Expected<StringRef> FunctionName = (*StrTab)[*Helper.FunctionNameIdx])
+    R.FunctionName = *FunctionName;
+  else
+    return FunctionName.takeError();
+
+  if (Helper.SourceFileNameIdx && Helper.SourceLine && Helper.SourceColumn) {
+    Expected<StringRef> SourceFileName = (*StrTab)[*Helper.SourceFileNameIdx];
+    if (!SourceFileName)
+      return SourceFileName.takeError();
+    R.Loc.emplace();
+    R.Loc->SourceFilePath = *SourceFileName;
+    R.Loc->SourceLine = *Helper.SourceLine;
+    R.Loc->SourceColumn = *Helper.SourceColumn;
+  }
+
+  if (Helper.Hotness)
+    R.Hotness = *Helper.Hotness;
+
+  if (!Helper.Args)
+    return std::move(Result);
+
+  for (const BitstreamRemarkParserHelper::Argument &Arg : *Helper.Args) {
+    if (!Arg.KeyIdx)
+      return createStringError(
+          std::make_error_code(std::errc::illegal_byte_sequence),
+          "Error while parsing BLOCK_REMARK: missing key in remark argument.");
+    if (!Arg.ValueIdx)
+      return createStringError(
+          std::make_error_code(std::errc::illegal_byte_sequence),
+          "Error while parsing BLOCK_REMARK: missing value in remark "
+          "argument.");
+
+    // We have at least a key and a value, create an entry.
+    R.Args.emplace_back();
+
+    if (Expected<StringRef> Key = (*StrTab)[*Arg.KeyIdx])
+      R.Args.back().Key = *Key;
+    else
+      return Key.takeError();
+
+    if (Expected<StringRef> Value = (*StrTab)[*Arg.ValueIdx])
+      R.Args.back().Val = *Value;
+    else
+      return Value.takeError();
+
+    if (Arg.SourceFileNameIdx && Arg.SourceLine && Arg.SourceColumn) {
+      if (Expected<StringRef> SourceFileName =
+              (*StrTab)[*Arg.SourceFileNameIdx]) {
+        R.Args.back().Loc.emplace();
+        R.Args.back().Loc->SourceFilePath = *SourceFileName;
+        R.Args.back().Loc->SourceLine = *Arg.SourceLine;
+        R.Args.back().Loc->SourceColumn = *Arg.SourceColumn;
+      } else
+        return SourceFileName.takeError();
+    }
+  }
+
+  return std::move(Result);
+}
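processRemark() above never reads strings directly from the record: every name is an integer index that gets resolved through the parsed string table. The sketch below shows that indirection in isolation. It is illustrative only and not part of the patch: stringTableIndirection is a hypothetical helper, the blob is made-up test data, and cantFail() is used only because the indices are known to be in bounds here.

#include "llvm/Remarks/RemarkParser.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
using namespace llvm::remarks;

// Remark records store integer indices; the parser resolves them through a
// ParsedStringTable built over a blob of '\0'-terminated strings.
static void stringTableIndirection() {
  // Three entries, referred to by indices 0, 1 and 2 in remark records.
  StringRef Blob("inline\0NotInlined\0foo\0", 22);
  ParsedStringTable StrTab(Blob);
  StringRef PassName = cantFail(StrTab[0]);     // "inline"
  StringRef RemarkName = cantFail(StrTab[1]);   // "NotInlined"
  StringRef FunctionName = cantFail(StrTab[2]); // "foo"
  outs() << PassName << ":" << RemarkName << ":" << FunctionName << "\n";
}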
diff --git a/lib/Remarks/BitstreamRemarkParser.h b/lib/Remarks/BitstreamRemarkParser.h
new file mode 100644
index 000000000000..7c9cc2f1e7db
--- /dev/null
+++ b/lib/Remarks/BitstreamRemarkParser.h
@@ -0,0 +1,83 @@
+//===-- BitstreamRemarkParser.h - Parser for Bitstream remarks --*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides the implementation of the Bitstream remark parser.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_REMARKS_BITSTREAM_REMARK_PARSER_H
+#define LLVM_LIB_REMARKS_BITSTREAM_REMARK_PARSER_H
+
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Remarks/BitstreamRemarkParser.h"
+#include "llvm/Remarks/RemarkFormat.h"
+#include "llvm/Remarks/RemarkParser.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cstdint>
+#include <memory>
+
+namespace llvm {
+namespace remarks {
+/// Parses and holds the state of the latest parsed remark.
+struct BitstreamRemarkParser : public RemarkParser {
+  /// The buffer to parse.
+  BitstreamParserHelper ParserHelper;
+  /// The string table used for parsing strings.
+  Optional<ParsedStringTable> StrTab;
+  /// Temporary remark buffer used when the remarks are stored separately.
+  std::unique_ptr<MemoryBuffer> TmpRemarkBuffer;
+  /// The common metadata used to decide how to parse the buffer.
+  /// This is filled when parsing the metadata block.
+  uint64_t ContainerVersion;
+  uint64_t RemarkVersion;
+  BitstreamRemarkContainerType ContainerType;
+  /// Whether the parser is ready to parse remarks.
+  bool ReadyToParseRemarks = false;
+
+  /// Create a parser that expects to find a string table embedded in the
+  /// stream.
+  BitstreamRemarkParser(StringRef Buf)
+      : RemarkParser(Format::Bitstream), ParserHelper(Buf) {}
+
+  /// Create a parser that uses a pre-parsed string table.
+  BitstreamRemarkParser(StringRef Buf, ParsedStringTable StrTab)
+      : RemarkParser(Format::Bitstream), ParserHelper(Buf),
+        StrTab(std::move(StrTab)) {}
+
+  Expected<std::unique_ptr<Remark>> next() override;
+
+  static bool classof(const RemarkParser *P) {
+    return P->ParserFormat == Format::Bitstream;
+  }
+
+  /// Parse and process the metadata of the buffer.
+  Error parseMeta();
+
+  /// Parse a Bitstream remark.
+  Expected<std::unique_ptr<Remark>> parseRemark();
+
+private:
+  /// Helper functions.
+  Error processCommonMeta(BitstreamMetaParserHelper &Helper);
+  Error processStandaloneMeta(BitstreamMetaParserHelper &Helper);
+  Error processSeparateRemarksFileMeta(BitstreamMetaParserHelper &Helper);
+  Error processSeparateRemarksMetaMeta(BitstreamMetaParserHelper &Helper);
+  Expected<std::unique_ptr<Remark>>
+  processRemark(BitstreamRemarkParserHelper &Helper);
+  Error processExternalFilePath(Optional<StringRef> ExternalFilePath);
+};
+
+Expected<std::unique_ptr<BitstreamRemarkParser>> createBitstreamParserFromMeta(
+    StringRef Buf, Optional<ParsedStringTable> StrTab = None,
+    Optional<StringRef> ExternalFilePrependPath = None);
+
+} // end namespace remarks
+} // end namespace llvm
+
+#endif /* LLVM_LIB_REMARKS_BITSTREAM_REMARK_PARSER_H */
diff --git a/lib/Remarks/BitstreamRemarkSerializer.cpp b/lib/Remarks/BitstreamRemarkSerializer.cpp
new file mode 100644
index 000000000000..d02782c7954d
--- /dev/null
+++ b/lib/Remarks/BitstreamRemarkSerializer.cpp
@@ -0,0 +1,386 @@
+//===- BitstreamRemarkSerializer.cpp --------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides the implementation of the LLVM bitstream remark serializer
+// using LLVM's bitstream writer.
+// +//===----------------------------------------------------------------------===// + +#include "llvm/Remarks/BitstreamRemarkSerializer.h" + +using namespace llvm; +using namespace llvm::remarks; + +BitstreamRemarkSerializerHelper::BitstreamRemarkSerializerHelper( + BitstreamRemarkContainerType ContainerType) + : Encoded(), R(), Bitstream(Encoded), ContainerType(ContainerType) {} + +static void push(SmallVectorImpl &R, StringRef Str) { + for (const char C : Str) + R.push_back(C); +} + +static void setRecordName(unsigned RecordID, BitstreamWriter &Bitstream, + SmallVectorImpl &R, StringRef Str) { + R.clear(); + R.push_back(RecordID); + push(R, Str); + Bitstream.EmitRecord(bitc::BLOCKINFO_CODE_SETRECORDNAME, R); +} + +static void initBlock(unsigned BlockID, BitstreamWriter &Bitstream, + SmallVectorImpl &R, StringRef Str) { + R.clear(); + R.push_back(BlockID); + Bitstream.EmitRecord(bitc::BLOCKINFO_CODE_SETBID, R); + + R.clear(); + push(R, Str); + Bitstream.EmitRecord(bitc::BLOCKINFO_CODE_BLOCKNAME, R); +} + +void BitstreamRemarkSerializerHelper::setupMetaBlockInfo() { + // Setup the metadata block. + initBlock(META_BLOCK_ID, Bitstream, R, MetaBlockName); + + // The container information. + setRecordName(RECORD_META_CONTAINER_INFO, Bitstream, R, + MetaContainerInfoName); + + auto Abbrev = std::make_shared(); + Abbrev->Add(BitCodeAbbrevOp(RECORD_META_CONTAINER_INFO)); + Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 32)); // Version. + Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 2)); // Type. + RecordMetaContainerInfoAbbrevID = + Bitstream.EmitBlockInfoAbbrev(META_BLOCK_ID, Abbrev); +} + +void BitstreamRemarkSerializerHelper::setupMetaRemarkVersion() { + setRecordName(RECORD_META_REMARK_VERSION, Bitstream, R, + MetaRemarkVersionName); + + auto Abbrev = std::make_shared(); + Abbrev->Add(BitCodeAbbrevOp(RECORD_META_REMARK_VERSION)); + Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 32)); // Version. + RecordMetaRemarkVersionAbbrevID = + Bitstream.EmitBlockInfoAbbrev(META_BLOCK_ID, Abbrev); +} + +void BitstreamRemarkSerializerHelper::emitMetaRemarkVersion( + uint64_t RemarkVersion) { + // The remark version is emitted only if we emit remarks. + R.clear(); + R.push_back(RECORD_META_REMARK_VERSION); + R.push_back(RemarkVersion); + Bitstream.EmitRecordWithAbbrev(RecordMetaRemarkVersionAbbrevID, R); +} + +void BitstreamRemarkSerializerHelper::setupMetaStrTab() { + setRecordName(RECORD_META_STRTAB, Bitstream, R, MetaStrTabName); + + auto Abbrev = std::make_shared(); + Abbrev->Add(BitCodeAbbrevOp(RECORD_META_STRTAB)); + Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Blob)); // Raw table. + RecordMetaStrTabAbbrevID = + Bitstream.EmitBlockInfoAbbrev(META_BLOCK_ID, Abbrev); +} + +void BitstreamRemarkSerializerHelper::emitMetaStrTab( + const StringTable &StrTab) { + // The string table is not emitted if we emit remarks separately. + R.clear(); + R.push_back(RECORD_META_STRTAB); + + // Serialize to a blob. + std::string Buf; + raw_string_ostream OS(Buf); + StrTab.serialize(OS); + StringRef Blob = OS.str(); + Bitstream.EmitRecordWithBlob(RecordMetaStrTabAbbrevID, R, Blob); +} + +void BitstreamRemarkSerializerHelper::setupMetaExternalFile() { + setRecordName(RECORD_META_EXTERNAL_FILE, Bitstream, R, MetaExternalFileName); + + auto Abbrev = std::make_shared(); + Abbrev->Add(BitCodeAbbrevOp(RECORD_META_EXTERNAL_FILE)); + Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Blob)); // Filename. 
+ RecordMetaExternalFileAbbrevID = + Bitstream.EmitBlockInfoAbbrev(META_BLOCK_ID, Abbrev); +} + +void BitstreamRemarkSerializerHelper::emitMetaExternalFile(StringRef Filename) { + // The external file is emitted only if we emit the separate metadata. + R.clear(); + R.push_back(RECORD_META_EXTERNAL_FILE); + Bitstream.EmitRecordWithBlob(RecordMetaExternalFileAbbrevID, R, Filename); +} + +void BitstreamRemarkSerializerHelper::setupRemarkBlockInfo() { + // Setup the remark block. + initBlock(REMARK_BLOCK_ID, Bitstream, R, RemarkBlockName); + + // The header of a remark. + { + setRecordName(RECORD_REMARK_HEADER, Bitstream, R, RemarkHeaderName); + + auto Abbrev = std::make_shared(); + Abbrev->Add(BitCodeAbbrevOp(RECORD_REMARK_HEADER)); + Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 3)); // Type + Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // Remark Name + Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // Pass name + Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // Function name + RecordRemarkHeaderAbbrevID = + Bitstream.EmitBlockInfoAbbrev(REMARK_BLOCK_ID, Abbrev); + } + + // The location of a remark. + { + setRecordName(RECORD_REMARK_DEBUG_LOC, Bitstream, R, RemarkDebugLocName); + + auto Abbrev = std::make_shared(); + Abbrev->Add(BitCodeAbbrevOp(RECORD_REMARK_DEBUG_LOC)); + Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 7)); // File + Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 32)); // Line + Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 32)); // Column + RecordRemarkDebugLocAbbrevID = + Bitstream.EmitBlockInfoAbbrev(REMARK_BLOCK_ID, Abbrev); + } + + // The hotness of a remark. + { + setRecordName(RECORD_REMARK_HOTNESS, Bitstream, R, RemarkHotnessName); + + auto Abbrev = std::make_shared(); + Abbrev->Add(BitCodeAbbrevOp(RECORD_REMARK_HOTNESS)); + Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // Hotness + RecordRemarkHotnessAbbrevID = + Bitstream.EmitBlockInfoAbbrev(REMARK_BLOCK_ID, Abbrev); + } + + // An argument entry with a debug location attached. + { + setRecordName(RECORD_REMARK_ARG_WITH_DEBUGLOC, Bitstream, R, + RemarkArgWithDebugLocName); + + auto Abbrev = std::make_shared(); + Abbrev->Add(BitCodeAbbrevOp(RECORD_REMARK_ARG_WITH_DEBUGLOC)); + Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 7)); // Key + Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 7)); // Value + Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 7)); // File + Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 32)); // Line + Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 32)); // Column + RecordRemarkArgWithDebugLocAbbrevID = + Bitstream.EmitBlockInfoAbbrev(REMARK_BLOCK_ID, Abbrev); + } + + // An argument entry with no debug location attached. + { + setRecordName(RECORD_REMARK_ARG_WITHOUT_DEBUGLOC, Bitstream, R, + RemarkArgWithoutDebugLocName); + + auto Abbrev = std::make_shared(); + Abbrev->Add(BitCodeAbbrevOp(RECORD_REMARK_ARG_WITHOUT_DEBUGLOC)); + Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 7)); // Key + Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 7)); // Value + RecordRemarkArgWithoutDebugLocAbbrevID = + Bitstream.EmitBlockInfoAbbrev(REMARK_BLOCK_ID, Abbrev); + } +} + +void BitstreamRemarkSerializerHelper::setupBlockInfo() { + // Emit magic number. + for (const char C : ContainerMagic) + Bitstream.Emit(static_cast(C), 8); + + Bitstream.EnterBlockInfoBlock(); + + // Setup the main metadata. Depending on the container type, we'll setup the + // required records next. 
+ setupMetaBlockInfo(); + + switch (ContainerType) { + case BitstreamRemarkContainerType::SeparateRemarksMeta: + // Needs a string table that the separate remark file is using. + setupMetaStrTab(); + // Needs to know where the external remarks file is. + setupMetaExternalFile(); + break; + case BitstreamRemarkContainerType::SeparateRemarksFile: + // Contains remarks: emit the version. + setupMetaRemarkVersion(); + // Contains remarks: emit the remark abbrevs. + setupRemarkBlockInfo(); + break; + case BitstreamRemarkContainerType::Standalone: + // Contains remarks: emit the version. + setupMetaRemarkVersion(); + // Needs a string table. + setupMetaStrTab(); + // Contains remarks: emit the remark abbrevs. + setupRemarkBlockInfo(); + break; + } + + Bitstream.ExitBlock(); +} + +void BitstreamRemarkSerializerHelper::emitMetaBlock( + uint64_t ContainerVersion, Optional RemarkVersion, + Optional StrTab, Optional Filename) { + // Emit the meta block + Bitstream.EnterSubblock(META_BLOCK_ID, 3); + + // The container version and type. + R.clear(); + R.push_back(RECORD_META_CONTAINER_INFO); + R.push_back(ContainerVersion); + R.push_back(static_cast(ContainerType)); + Bitstream.EmitRecordWithAbbrev(RecordMetaContainerInfoAbbrevID, R); + + switch (ContainerType) { + case BitstreamRemarkContainerType::SeparateRemarksMeta: + assert(StrTab != None && *StrTab != nullptr); + emitMetaStrTab(**StrTab); + assert(Filename != None); + emitMetaExternalFile(*Filename); + break; + case BitstreamRemarkContainerType::SeparateRemarksFile: + assert(RemarkVersion != None); + emitMetaRemarkVersion(*RemarkVersion); + break; + case BitstreamRemarkContainerType::Standalone: + assert(RemarkVersion != None); + emitMetaRemarkVersion(*RemarkVersion); + assert(StrTab != None && *StrTab != nullptr); + emitMetaStrTab(**StrTab); + break; + } + + Bitstream.ExitBlock(); +} + +void BitstreamRemarkSerializerHelper::emitRemarkBlock(const Remark &Remark, + StringTable &StrTab) { + Bitstream.EnterSubblock(REMARK_BLOCK_ID, 4); + + R.clear(); + R.push_back(RECORD_REMARK_HEADER); + R.push_back(static_cast(Remark.RemarkType)); + R.push_back(StrTab.add(Remark.RemarkName).first); + R.push_back(StrTab.add(Remark.PassName).first); + R.push_back(StrTab.add(Remark.FunctionName).first); + Bitstream.EmitRecordWithAbbrev(RecordRemarkHeaderAbbrevID, R); + + if (const Optional &Loc = Remark.Loc) { + R.clear(); + R.push_back(RECORD_REMARK_DEBUG_LOC); + R.push_back(StrTab.add(Loc->SourceFilePath).first); + R.push_back(Loc->SourceLine); + R.push_back(Loc->SourceColumn); + Bitstream.EmitRecordWithAbbrev(RecordRemarkDebugLocAbbrevID, R); + } + + if (Optional Hotness = Remark.Hotness) { + R.clear(); + R.push_back(RECORD_REMARK_HOTNESS); + R.push_back(*Hotness); + Bitstream.EmitRecordWithAbbrev(RecordRemarkHotnessAbbrevID, R); + } + + for (const Argument &Arg : Remark.Args) { + R.clear(); + unsigned Key = StrTab.add(Arg.Key).first; + unsigned Val = StrTab.add(Arg.Val).first; + bool HasDebugLoc = Arg.Loc != None; + R.push_back(HasDebugLoc ? RECORD_REMARK_ARG_WITH_DEBUGLOC + : RECORD_REMARK_ARG_WITHOUT_DEBUGLOC); + R.push_back(Key); + R.push_back(Val); + if (HasDebugLoc) { + R.push_back(StrTab.add(Arg.Loc->SourceFilePath).first); + R.push_back(Arg.Loc->SourceLine); + R.push_back(Arg.Loc->SourceColumn); + } + Bitstream.EmitRecordWithAbbrev(HasDebugLoc + ? 
RecordRemarkArgWithDebugLocAbbrevID + : RecordRemarkArgWithoutDebugLocAbbrevID, + R); + } + Bitstream.ExitBlock(); +} + +void BitstreamRemarkSerializerHelper::flushToStream(raw_ostream &OS) { + OS.write(Encoded.data(), Encoded.size()); + Encoded.clear(); +} + +StringRef BitstreamRemarkSerializerHelper::getBuffer() { + return StringRef(Encoded.data(), Encoded.size()); +} + +BitstreamRemarkSerializer::BitstreamRemarkSerializer(raw_ostream &OS, + SerializerMode Mode) + : RemarkSerializer(Format::Bitstream, OS, Mode), + Helper(BitstreamRemarkContainerType::SeparateRemarksFile) { + assert(Mode == SerializerMode::Separate && + "For SerializerMode::Standalone, a pre-filled string table needs to " + "be provided."); + // We always use a string table with bitstream. + StrTab.emplace(); +} + +BitstreamRemarkSerializer::BitstreamRemarkSerializer(raw_ostream &OS, + SerializerMode Mode, + StringTable StrTabIn) + : RemarkSerializer(Format::Bitstream, OS, Mode), + Helper(Mode == SerializerMode::Separate + ? BitstreamRemarkContainerType::SeparateRemarksFile + : BitstreamRemarkContainerType::Standalone) { + StrTab = std::move(StrTabIn); +} + +void BitstreamRemarkSerializer::emit(const Remark &Remark) { + if (!DidSetUp) { + // Emit the metadata that is embedded in the remark file. + // If we're in standalone mode, serialize the string table as well. + bool IsStandalone = + Helper.ContainerType == BitstreamRemarkContainerType::Standalone; + BitstreamMetaSerializer MetaSerializer( + OS, Helper, + IsStandalone ? &*StrTab : Optional(None)); + MetaSerializer.emit(); + DidSetUp = true; + } + + assert(DidSetUp && + "The Block info block and the meta block were not emitted yet."); + Helper.emitRemarkBlock(Remark, *StrTab); + + Helper.flushToStream(OS); +} + +std::unique_ptr BitstreamRemarkSerializer::metaSerializer( + raw_ostream &OS, Optional ExternalFilename) { + assert(Helper.ContainerType != + BitstreamRemarkContainerType::SeparateRemarksMeta); + bool IsStandalone = + Helper.ContainerType == BitstreamRemarkContainerType::Standalone; + return std::make_unique( + OS, + IsStandalone ? 
BitstreamRemarkContainerType::Standalone + : BitstreamRemarkContainerType::SeparateRemarksMeta, + &*StrTab, ExternalFilename); +} + +void BitstreamMetaSerializer::emit() { + Helper->setupBlockInfo(); + Helper->emitMetaBlock(CurrentContainerVersion, CurrentRemarkVersion, StrTab, + ExternalFilename); + Helper->flushToStream(OS); +} diff --git a/lib/Remarks/RemarkFormat.cpp b/lib/Remarks/RemarkFormat.cpp index bcd0f753ff64..f2d0331ec6a8 100644 --- a/lib/Remarks/RemarkFormat.cpp +++ b/lib/Remarks/RemarkFormat.cpp @@ -19,11 +19,13 @@ using namespace llvm::remarks; Expected llvm::remarks::parseFormat(StringRef FormatStr) { auto Result = StringSwitch(FormatStr) .Cases("", "yaml", Format::YAML) + .Case("yaml-strtab", Format::YAMLStrTab) + .Case("bitstream", Format::Bitstream) .Default(Format::Unknown); if (Result == Format::Unknown) return createStringError(std::make_error_code(std::errc::invalid_argument), - "Unknown remark serializer format: '%s'", + "Unknown remark format: '%s'", FormatStr.data()); return Result; diff --git a/lib/Remarks/RemarkParser.cpp b/lib/Remarks/RemarkParser.cpp index f67464073bd1..c5c3d0badd3e 100644 --- a/lib/Remarks/RemarkParser.cpp +++ b/lib/Remarks/RemarkParser.cpp @@ -12,6 +12,7 @@ //===----------------------------------------------------------------------===// #include "llvm/Remarks/RemarkParser.h" +#include "BitstreamRemarkParser.h" #include "YAMLRemarkParser.h" #include "llvm-c/Remarks.h" #include "llvm/ADT/STLExtras.h" @@ -47,32 +48,81 @@ Expected ParsedStringTable::operator[](size_t Index) const { return StringRef(Buffer.data() + Offset, NextOffset - Offset - 1); } -Expected> +Expected> +llvm::remarks::createRemarkParser(Format ParserFormat, StringRef Buf) { + switch (ParserFormat) { + case Format::YAML: + return std::make_unique(Buf); + case Format::YAMLStrTab: + return createStringError( + std::make_error_code(std::errc::invalid_argument), + "The YAML with string table format requires a parsed string table."); + case Format::Bitstream: + return std::make_unique(Buf); + case Format::Unknown: + return createStringError(std::make_error_code(std::errc::invalid_argument), + "Unknown remark parser format."); + } + llvm_unreachable("unhandled ParseFormat"); +} + +Expected> llvm::remarks::createRemarkParser(Format ParserFormat, StringRef Buf, - Optional StrTab) { + ParsedStringTable StrTab) { + switch (ParserFormat) { + case Format::YAML: + return createStringError(std::make_error_code(std::errc::invalid_argument), + "The YAML format can't be used with a string " + "table. Use yaml-strtab instead."); + case Format::YAMLStrTab: + return std::make_unique(Buf, std::move(StrTab)); + case Format::Bitstream: + return std::make_unique(Buf, std::move(StrTab)); + case Format::Unknown: + return createStringError(std::make_error_code(std::errc::invalid_argument), + "Unknown remark parser format."); + } + llvm_unreachable("unhandled ParseFormat"); +} + +Expected> +llvm::remarks::createRemarkParserFromMeta( + Format ParserFormat, StringRef Buf, Optional StrTab, + Optional ExternalFilePrependPath) { switch (ParserFormat) { + // Depending on the metadata, the format can be either yaml or yaml-strtab, + // regardless of the input argument. 
case Format::YAML: - return llvm::make_unique(Buf, StrTab); + case Format::YAMLStrTab: + return createYAMLParserFromMeta(Buf, std::move(StrTab), + std::move(ExternalFilePrependPath)); + case Format::Bitstream: + return createBitstreamParserFromMeta(Buf, std::move(StrTab), + std::move(ExternalFilePrependPath)); case Format::Unknown: return createStringError(std::make_error_code(std::errc::invalid_argument), "Unknown remark parser format."); } - llvm_unreachable("unknown format"); + llvm_unreachable("unhandled ParseFormat"); } +namespace { // Wrapper that holds the state needed to interact with the C API. struct CParser { - std::unique_ptr TheParser; + std::unique_ptr TheParser; Optional Err; CParser(Format ParserFormat, StringRef Buf, - Optional StrTab = None) - : TheParser(cantFail(createRemarkParser(ParserFormat, Buf, StrTab))) {} + Optional StrTab = None) + : TheParser(cantFail( + StrTab ? createRemarkParser(ParserFormat, Buf, std::move(*StrTab)) + : createRemarkParser(ParserFormat, Buf))) {} void handleError(Error E) { Err.emplace(toString(std::move(E))); } bool hasError() const { return Err.hasValue(); } const char *getMessage() const { return Err ? Err->c_str() : nullptr; }; }; +} // namespace // Create wrappers for C Binding types (see CBindingWrapping.h). DEFINE_SIMPLE_CONVERSION_FUNCTIONS(CParser, LLVMRemarkParserRef) @@ -83,10 +133,16 @@ extern "C" LLVMRemarkParserRef LLVMRemarkParserCreateYAML(const void *Buf, StringRef(static_cast(Buf), Size))); } +extern "C" LLVMRemarkParserRef LLVMRemarkParserCreateBitstream(const void *Buf, + uint64_t Size) { + return wrap(new CParser(Format::Bitstream, + StringRef(static_cast(Buf), Size))); +} + extern "C" LLVMRemarkEntryRef LLVMRemarkParserGetNext(LLVMRemarkParserRef Parser) { CParser &TheCParser = *unwrap(Parser); - remarks::Parser &TheParser = *TheCParser.TheParser; + remarks::RemarkParser &TheParser = *TheCParser.TheParser; Expected> MaybeRemark = TheParser.next(); if (Error E = MaybeRemark.takeError()) { diff --git a/lib/Remarks/RemarkSerializer.cpp b/lib/Remarks/RemarkSerializer.cpp new file mode 100644 index 000000000000..ab19c84bbadb --- /dev/null +++ b/lib/Remarks/RemarkSerializer.cpp @@ -0,0 +1,54 @@ +//===- RemarkSerializer.cpp -----------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file provides tools for serializing remarks. 
+// +//===----------------------------------------------------------------------===// + +#include "llvm/Remarks/RemarkSerializer.h" +#include "llvm/Remarks/BitstreamRemarkSerializer.h" +#include "llvm/Remarks/YAMLRemarkSerializer.h" + +using namespace llvm; +using namespace llvm::remarks; + +Expected> +remarks::createRemarkSerializer(Format RemarksFormat, SerializerMode Mode, + raw_ostream &OS) { + switch (RemarksFormat) { + case Format::Unknown: + return createStringError(std::errc::invalid_argument, + "Unknown remark serializer format."); + case Format::YAML: + return std::make_unique(OS, Mode); + case Format::YAMLStrTab: + return std::make_unique(OS, Mode); + case Format::Bitstream: + return std::make_unique(OS, Mode); + } + llvm_unreachable("Unknown remarks::Format enum"); +} + +Expected> +remarks::createRemarkSerializer(Format RemarksFormat, SerializerMode Mode, + raw_ostream &OS, remarks::StringTable StrTab) { + switch (RemarksFormat) { + case Format::Unknown: + return createStringError(std::errc::invalid_argument, + "Unknown remark serializer format."); + case Format::YAML: + return std::make_unique(OS, Mode, std::move(StrTab)); + case Format::YAMLStrTab: + return std::make_unique(OS, Mode, + std::move(StrTab)); + case Format::Bitstream: + return std::make_unique(OS, Mode, + std::move(StrTab)); + } + llvm_unreachable("Unknown remarks::Format enum"); +} diff --git a/lib/Remarks/RemarkStringTable.cpp b/lib/Remarks/RemarkStringTable.cpp index 984aa5b33b48..51156465be51 100644 --- a/lib/Remarks/RemarkStringTable.cpp +++ b/lib/Remarks/RemarkStringTable.cpp @@ -11,6 +11,8 @@ //===----------------------------------------------------------------------===// #include "llvm/Remarks/RemarkStringTable.h" +#include "llvm/Remarks/Remark.h" +#include "llvm/Remarks/RemarkParser.h" #include "llvm/Support/EndianStream.h" #include "llvm/Support/Error.h" #include @@ -18,6 +20,14 @@ using namespace llvm; using namespace llvm::remarks; +StringTable::StringTable(const ParsedStringTable &Other) : StrTab() { + for (unsigned i = 0, e = Other.size(); i < e; ++i) + if (Expected MaybeStr = Other[i]) + add(*MaybeStr); + else + llvm_unreachable("Unexpected error while building remarks string table."); +} + std::pair StringTable::add(StringRef Str) { size_t NextID = StrTab.size(); auto KV = StrTab.insert({Str, NextID}); @@ -28,10 +38,22 @@ std::pair StringTable::add(StringRef Str) { return {KV.first->second, KV.first->first()}; } +void StringTable::internalize(Remark &R) { + auto Impl = [&](StringRef &S) { S = add(S).second; }; + Impl(R.PassName); + Impl(R.RemarkName); + Impl(R.FunctionName); + if (R.Loc) + Impl(R.Loc->SourceFilePath); + for (Argument &Arg : R.Args) { + Impl(Arg.Key); + Impl(Arg.Val); + if (Arg.Loc) + Impl(Arg.Loc->SourceFilePath); + } +} + void StringTable::serialize(raw_ostream &OS) const { - // Emit the number of strings. - uint64_t StrTabSize = SerializedSize; - support::endian::write(OS, StrTabSize, support::little); // Emit the sequence of strings. 
for (StringRef Str : serialize()) { OS << Str; diff --git a/lib/Remarks/YAMLRemarkParser.cpp b/lib/Remarks/YAMLRemarkParser.cpp index ed78b7ba5d95..dd834d85676e 100644 --- a/lib/Remarks/YAMLRemarkParser.cpp +++ b/lib/Remarks/YAMLRemarkParser.cpp @@ -14,6 +14,8 @@ #include "YAMLRemarkParser.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/Remarks/RemarkParser.h" +#include "llvm/Support/Endian.h" +#include "llvm/Support/Path.h" using namespace llvm; using namespace llvm::remarks; @@ -54,9 +56,123 @@ static SourceMgr setupSM(std::string &LastErrorMessage) { return SM; } +// Parse the magic number. This function returns true if this represents remark +// metadata, false otherwise. +static Expected parseMagic(StringRef &Buf) { + if (!Buf.consume_front(remarks::Magic)) + return false; + + if (Buf.size() < 1 || !Buf.consume_front(StringRef("\0", 1))) + return createStringError(std::errc::illegal_byte_sequence, + "Expecting \\0 after magic number."); + return true; +} + +static Expected parseVersion(StringRef &Buf) { + if (Buf.size() < sizeof(uint64_t)) + return createStringError(std::errc::illegal_byte_sequence, + "Expecting version number."); + + uint64_t Version = + support::endian::read( + Buf.data()); + if (Version != remarks::CurrentRemarkVersion) + return createStringError(std::errc::illegal_byte_sequence, + "Mismatching remark version. Got %" PRId64 + ", expected %" PRId64 ".", + Version, remarks::CurrentRemarkVersion); + Buf = Buf.drop_front(sizeof(uint64_t)); + return Version; +} + +static Expected parseStrTabSize(StringRef &Buf) { + if (Buf.size() < sizeof(uint64_t)) + return createStringError(std::errc::illegal_byte_sequence, + "Expecting string table size."); + uint64_t StrTabSize = + support::endian::read( + Buf.data()); + Buf = Buf.drop_front(sizeof(uint64_t)); + return StrTabSize; +} + +static Expected parseStrTab(StringRef &Buf, + uint64_t StrTabSize) { + if (Buf.size() < StrTabSize) + return createStringError(std::errc::illegal_byte_sequence, + "Expecting string table."); + + // Attach the string table to the parser. + ParsedStringTable Result(StringRef(Buf.data(), StrTabSize)); + Buf = Buf.drop_front(StrTabSize); + return Expected(std::move(Result)); +} + +Expected> +remarks::createYAMLParserFromMeta(StringRef Buf, + Optional StrTab, + Optional ExternalFilePrependPath) { + // We now have a magic number. The metadata has to be correct. + Expected isMeta = parseMagic(Buf); + if (!isMeta) + return isMeta.takeError(); + // If it's not recognized as metadata, roll back. + std::unique_ptr SeparateBuf; + if (*isMeta) { + Expected Version = parseVersion(Buf); + if (!Version) + return Version.takeError(); + + Expected StrTabSize = parseStrTabSize(Buf); + if (!StrTabSize) + return StrTabSize.takeError(); + + // If the size of string table is not 0, try to build one. + if (*StrTabSize != 0) { + if (StrTab) + return createStringError(std::errc::illegal_byte_sequence, + "String table already provided."); + Expected MaybeStrTab = parseStrTab(Buf, *StrTabSize); + if (!MaybeStrTab) + return MaybeStrTab.takeError(); + StrTab = std::move(*MaybeStrTab); + } + // If it starts with "---", there is no external file. + if (!Buf.startswith("---")) { + // At this point, we expect Buf to contain the external file path. + StringRef ExternalFilePath = Buf; + SmallString<80> FullPath; + if (ExternalFilePrependPath) + FullPath = *ExternalFilePrependPath; + sys::path::append(FullPath, ExternalFilePath); + + // Try to open the file and start parsing from there. 
+ ErrorOr> BufferOrErr = + MemoryBuffer::getFile(FullPath); + if (std::error_code EC = BufferOrErr.getError()) + return createFileError(FullPath, EC); + + // Keep the buffer alive. + SeparateBuf = std::move(*BufferOrErr); + Buf = SeparateBuf->getBuffer(); + } + } + + std::unique_ptr Result = + StrTab + ? std::make_unique(Buf, std::move(*StrTab)) + : std::make_unique(Buf); + if (SeparateBuf) + Result->SeparateBuf = std::move(SeparateBuf); + return std::move(Result); +} + +YAMLRemarkParser::YAMLRemarkParser(StringRef Buf) + : YAMLRemarkParser(Buf, None) {} + YAMLRemarkParser::YAMLRemarkParser(StringRef Buf, - Optional StrTab) - : Parser{Format::YAML}, StrTab(StrTab), LastErrorMessage(), + Optional StrTab) + : RemarkParser{Format::YAML}, StrTab(std::move(StrTab)), LastErrorMessage(), SM(setupSM(LastErrorMessage)), Stream(Buf, SM), YAMLIt(Stream.begin()) {} Error YAMLRemarkParser::error(StringRef Message, yaml::Node &Node) { @@ -86,7 +202,7 @@ YAMLRemarkParser::parseRemark(yaml::Document &RemarkEntry) { if (!Root) return error("document root is not of mapping type.", *YAMLRoot); - std::unique_ptr Result = llvm::make_unique(); + std::unique_ptr Result = std::make_unique(); Remark &TheRemark = *Result; // First, the type. It needs special handling since is not part of the @@ -179,22 +295,7 @@ Expected YAMLRemarkParser::parseStr(yaml::KeyValueNode &Node) { auto *Value = dyn_cast(Node.getValue()); if (!Value) return error("expected a value of scalar type.", Node); - StringRef Result; - if (!StrTab) { - Result = Value->getRawValue(); - } else { - // If we have a string table, parse it as an unsigned. - unsigned StrID = 0; - if (Expected MaybeStrID = parseUnsigned(Node)) - StrID = *MaybeStrID; - else - return MaybeStrID.takeError(); - - if (Expected Str = (**StrTab)[StrID]) - Result = *Str; - else - return Str.takeError(); - } + StringRef Result = Value->getRawValue(); if (Result.front() == '\'') Result = Result.drop_front(); @@ -325,3 +426,29 @@ Expected> YAMLRemarkParser::next() { return std::move(*MaybeResult); } + +Expected YAMLStrTabRemarkParser::parseStr(yaml::KeyValueNode &Node) { + auto *Value = dyn_cast(Node.getValue()); + if (!Value) + return error("expected a value of scalar type.", Node); + StringRef Result; + // If we have a string table, parse it as an unsigned. + unsigned StrID = 0; + if (Expected MaybeStrID = parseUnsigned(Node)) + StrID = *MaybeStrID; + else + return MaybeStrID.takeError(); + + if (Expected Str = (*StrTab)[StrID]) + Result = *Str; + else + return Str.takeError(); + + if (Result.front() == '\'') + Result = Result.drop_front(); + + if (Result.back() == '\'') + Result = Result.drop_back(); + + return Result; +} diff --git a/lib/Remarks/YAMLRemarkParser.h b/lib/Remarks/YAMLRemarkParser.h index cea76e63e75c..03707433bc03 100644 --- a/lib/Remarks/YAMLRemarkParser.h +++ b/lib/Remarks/YAMLRemarkParser.h @@ -18,6 +18,7 @@ #include "llvm/Remarks/Remark.h" #include "llvm/Remarks/RemarkParser.h" #include "llvm/Support/Error.h" +#include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/SourceMgr.h" #include "llvm/Support/YAMLParser.h" #include "llvm/Support/YAMLTraits.h" @@ -46,9 +47,9 @@ private: }; /// Regular YAML to Remark parser. -struct YAMLRemarkParser : public Parser { +struct YAMLRemarkParser : public RemarkParser { /// The string table used for parsing strings. - Optional StrTab; + Optional StrTab; /// Last error message that can come from the YAML parser diagnostics. /// We need this for catching errors in the constructor. 
std::string LastErrorMessage; @@ -58,17 +59,20 @@ struct YAMLRemarkParser : public Parser { yaml::Stream Stream; /// Iterator in the YAML stream. yaml::document_iterator YAMLIt; + /// If we parse remark metadata in separate mode, we need to open a new file + /// and parse that. + std::unique_ptr SeparateBuf; - YAMLRemarkParser(StringRef Buf, - Optional StrTab = None); + YAMLRemarkParser(StringRef Buf); Expected> next() override; - static bool classof(const Parser *P) { + static bool classof(const RemarkParser *P) { return P->ParserFormat == Format::YAML; } -private: +protected: + YAMLRemarkParser(StringRef Buf, Optional StrTab); /// Create a YAMLParseError error from an existing error generated by the YAML /// parser. /// If there is no error, this returns Success. @@ -82,7 +86,7 @@ private: /// Parse one key to a string. Expected parseKey(yaml::KeyValueNode &Node); /// Parse one value to a string. - Expected parseStr(yaml::KeyValueNode &Node); + virtual Expected parseStr(yaml::KeyValueNode &Node); /// Parse one value to an unsigned. Expected parseUnsigned(yaml::KeyValueNode &Node); /// Parse a debug location. @@ -90,6 +94,26 @@ private: /// Parse an argument. Expected parseArg(yaml::Node &Node); }; + +/// YAML with a string table to Remark parser. +struct YAMLStrTabRemarkParser : public YAMLRemarkParser { + YAMLStrTabRemarkParser(StringRef Buf, ParsedStringTable StrTab) + : YAMLRemarkParser(Buf, std::move(StrTab)) {} + + static bool classof(const RemarkParser *P) { + return P->ParserFormat == Format::YAMLStrTab; + } + +protected: + /// Parse one value to a string. + Expected parseStr(yaml::KeyValueNode &Node) override; +}; + +Expected> +createYAMLParserFromMeta(StringRef Buf, + Optional StrTab = None, + Optional ExternalFilePrependPath = None); + } // end namespace remarks } // end namespace llvm diff --git a/lib/Remarks/YAMLRemarkSerializer.cpp b/lib/Remarks/YAMLRemarkSerializer.cpp index d64ae8e12ab0..3a42fe0678eb 100644 --- a/lib/Remarks/YAMLRemarkSerializer.cpp +++ b/lib/Remarks/YAMLRemarkSerializer.cpp @@ -11,16 +11,12 @@ // //===----------------------------------------------------------------------===// -#include "llvm/Remarks/RemarkSerializer.h" +#include "llvm/Remarks/YAMLRemarkSerializer.h" #include "llvm/Support/CommandLine.h" using namespace llvm; using namespace llvm::remarks; -cl::opt RemarksYAMLStringTable( - "remarks-yaml-string-table", cl::init(false), cl::Hidden, - cl::desc("Enable the usage of a string table with YAML remarks.")); - // Use the same keys whether we use a string table or not (respectively, T is an // unsigned or a StringRef). 
template @@ -60,11 +56,14 @@ template <> struct MappingTraits { else llvm_unreachable("Unknown remark type"); - if (Optional &StrTab = - reinterpret_cast(io.getContext())->StrTab) { - unsigned PassID = StrTab->add(Remark->PassName).first; - unsigned NameID = StrTab->add(Remark->RemarkName).first; - unsigned FunctionID = StrTab->add(Remark->FunctionName).first; + if (auto *Serializer = dyn_cast( + reinterpret_cast(io.getContext()))) { + assert(Serializer->StrTab.hasValue() && + "YAMLStrTabSerializer with no StrTab."); + StringTable &StrTab = *Serializer->StrTab; + unsigned PassID = StrTab.add(Remark->PassName).first; + unsigned NameID = StrTab.add(Remark->RemarkName).first; + unsigned FunctionID = StrTab.add(Remark->FunctionName).first; mapRemarkHeader(io, PassID, NameID, Remark->Loc, FunctionID, Remark->Hotness, Remark->Args); } else { @@ -82,9 +81,12 @@ template <> struct MappingTraits { unsigned Line = RL.SourceLine; unsigned Col = RL.SourceColumn; - if (Optional &StrTab = - reinterpret_cast(io.getContext())->StrTab) { - unsigned FileID = StrTab->add(File).first; + if (auto *Serializer = dyn_cast( + reinterpret_cast(io.getContext()))) { + assert(Serializer->StrTab.hasValue() && + "YAMLStrTabSerializer with no StrTab."); + StringTable &StrTab = *Serializer->StrTab; + unsigned FileID = StrTab.add(File).first; io.mapRequired("File", FileID); } else { io.mapRequired("File", File); @@ -101,7 +103,7 @@ template <> struct MappingTraits { /// newlines in strings. struct StringBlockVal { StringRef Value; - StringBlockVal(const std::string &Value) : Value(Value) {} + StringBlockVal(StringRef R) : Value(R) {} }; template <> struct BlockScalarTraits { @@ -134,9 +136,12 @@ template <> struct MappingTraits { static void mapping(IO &io, Argument &A) { assert(io.outputting() && "input not yet implemented"); - if (Optional &StrTab = - reinterpret_cast(io.getContext())->StrTab) { - auto ValueID = StrTab->add(A.Val).first; + if (auto *Serializer = dyn_cast( + reinterpret_cast(io.getContext()))) { + assert(Serializer->StrTab.hasValue() && + "YAMLStrTabSerializer with no StrTab."); + StringTable &StrTab = *Serializer->StrTab; + auto ValueID = StrTab.add(A.Val).first; io.mapRequired(A.Key.data(), ValueID); } else if (StringRef(A.Val).count('\n') > 1) { StringBlockVal S(A.Val); @@ -153,15 +158,100 @@ template <> struct MappingTraits { LLVM_YAML_IS_SEQUENCE_VECTOR(Argument) -YAMLSerializer::YAMLSerializer(raw_ostream &OS, UseStringTable UseStringTable) - : Serializer(OS), YAMLOutput(OS, reinterpret_cast(this)) { - if (UseStringTable == remarks::UseStringTable::Yes || RemarksYAMLStringTable) - StrTab.emplace(); +YAMLRemarkSerializer::YAMLRemarkSerializer(raw_ostream &OS, SerializerMode Mode, + Optional StrTabIn) + : YAMLRemarkSerializer(Format::YAML, OS, Mode, std::move(StrTabIn)) {} + +YAMLRemarkSerializer::YAMLRemarkSerializer(Format SerializerFormat, + raw_ostream &OS, SerializerMode Mode, + Optional StrTabIn) + : RemarkSerializer(SerializerFormat, OS, Mode), + YAMLOutput(OS, reinterpret_cast(this)) { + StrTab = std::move(StrTabIn); } -void YAMLSerializer::emit(const Remark &Remark) { +void YAMLRemarkSerializer::emit(const Remark &Remark) { // Again, YAMLTraits expect a non-const object for inputting, but we're not // using that here. 
auto R = const_cast(&Remark); YAMLOutput << R; } + +std::unique_ptr +YAMLRemarkSerializer::metaSerializer(raw_ostream &OS, + Optional ExternalFilename) { + return std::make_unique(OS, ExternalFilename); +} + +void YAMLStrTabRemarkSerializer::emit(const Remark &Remark) { + // In standalone mode, for the serializer with a string table, emit the + // metadata first and set DidEmitMeta to avoid emitting it again. + if (Mode == SerializerMode::Standalone && !DidEmitMeta) { + std::unique_ptr MetaSerializer = + metaSerializer(OS, /*ExternalFilename=*/None); + MetaSerializer->emit(); + DidEmitMeta = true; + } + + // Then do the usual remark emission. + YAMLRemarkSerializer::emit(Remark); +} + +std::unique_ptr YAMLStrTabRemarkSerializer::metaSerializer( + raw_ostream &OS, Optional ExternalFilename) { + assert(StrTab); + return std::make_unique(OS, ExternalFilename, + *StrTab); +} + +static void emitMagic(raw_ostream &OS) { + // Emit the magic number. + OS << remarks::Magic; + // Explicitly emit a '\0'. + OS.write('\0'); +} + +static void emitVersion(raw_ostream &OS) { + // Emit the version number: little-endian uint64_t. + std::array Version; + support::endian::write64le(Version.data(), remarks::CurrentRemarkVersion); + OS.write(Version.data(), Version.size()); +} + +static void emitStrTab(raw_ostream &OS, Optional StrTab) { + // Emit the string table in the section. + uint64_t StrTabSize = StrTab ? (*StrTab)->SerializedSize : 0; + // Emit the total size of the string table (the size itself excluded): + // little-endian uint64_t. + // Note: even if no string table is used, emit 0. + std::array StrTabSizeBuf; + support::endian::write64le(StrTabSizeBuf.data(), StrTabSize); + OS.write(StrTabSizeBuf.data(), StrTabSizeBuf.size()); + if (StrTab) + (*StrTab)->serialize(OS); +} + +static void emitExternalFile(raw_ostream &OS, StringRef Filename) { + // Emit the null-terminated absolute path to the remark file. + SmallString<128> FilenameBuf = Filename; + sys::fs::make_absolute(FilenameBuf); + assert(!FilenameBuf.empty() && "The filename can't be empty."); + OS.write(FilenameBuf.data(), FilenameBuf.size()); + OS.write('\0'); +} + +void YAMLMetaSerializer::emit() { + emitMagic(OS); + emitVersion(OS); + emitStrTab(OS, None); + if (ExternalFilename) + emitExternalFile(OS, *ExternalFilename); +} + +void YAMLStrTabMetaSerializer::emit() { + emitMagic(OS); + emitVersion(OS); + emitStrTab(OS, &StrTab); + if (ExternalFilename) + emitExternalFile(OS, *ExternalFilename); +} diff --git a/lib/Support/AArch64TargetParser.cpp b/lib/Support/AArch64TargetParser.cpp index df4caa1f07fd..6f1d6d50eee2 100644 --- a/lib/Support/AArch64TargetParser.cpp +++ b/lib/Support/AArch64TargetParser.cpp @@ -96,8 +96,8 @@ bool AArch64::getExtensionFeatures(unsigned Extensions, Features.push_back("+sve2-sm4"); if (Extensions & AEK_SVE2SHA3) Features.push_back("+sve2-sha3"); - if (Extensions & AEK_BITPERM) - Features.push_back("+bitperm"); + if (Extensions & AEK_SVE2BITPERM) + Features.push_back("+sve2-bitperm"); if (Extensions & AEK_RCPC) Features.push_back("+rcpc"); diff --git a/lib/Support/ABIBreak.cpp b/lib/Support/ABIBreak.cpp new file mode 100644 index 000000000000..247b635e02b8 --- /dev/null +++ b/lib/Support/ABIBreak.cpp @@ -0,0 +1,24 @@ +//===----- lib/Support/ABIBreak.cpp - EnableABIBreakingChecks -------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/Config/abi-breaking.h" + +#ifndef _MSC_VER +namespace llvm { + +// One of these two variables will be referenced by a symbol defined in +// llvm-config.h. We provide a link-time (or load time for DSO) failure when +// there is a mismatch in the build configuration of the API client and LLVM. +#if LLVM_ENABLE_ABI_BREAKING_CHECKS +int EnableABIBreakingChecks; +#else +int DisableABIBreakingChecks; +#endif + +} // end namespace llvm +#endif diff --git a/lib/Support/APInt.cpp b/lib/Support/APInt.cpp index 43173311cd80..758fe8b4f866 100644 --- a/lib/Support/APInt.cpp +++ b/lib/Support/APInt.cpp @@ -401,6 +401,33 @@ void APInt::insertBits(const APInt &subBits, unsigned bitPosition) { } } +void APInt::insertBits(uint64_t subBits, unsigned bitPosition, unsigned numBits) { + uint64_t maskBits = maskTrailingOnes(numBits); + subBits &= maskBits; + if (isSingleWord()) { + U.VAL &= ~(maskBits << bitPosition); + U.VAL |= subBits << bitPosition; + return; + } + + unsigned loBit = whichBit(bitPosition); + unsigned loWord = whichWord(bitPosition); + unsigned hiWord = whichWord(bitPosition + numBits - 1); + if (loWord == hiWord) { + U.pVal[loWord] &= ~(maskBits << loBit); + U.pVal[loWord] |= subBits << loBit; + return; + } + + static_assert(8 * sizeof(WordType) <= 64, "This code assumes only two words affected"); + unsigned wordBits = 8 * sizeof(WordType); + U.pVal[loWord] &= ~(maskBits << loBit); + U.pVal[loWord] |= subBits << loBit; + + U.pVal[hiWord] &= ~(maskBits >> (wordBits - loBit)); + U.pVal[hiWord] |= subBits >> (wordBits - loBit); +} + APInt APInt::extractBits(unsigned numBits, unsigned bitPosition) const { assert(numBits > 0 && "Can't extract zero bits"); assert(bitPosition < BitWidth && (numBits + bitPosition) <= BitWidth && @@ -438,6 +465,31 @@ APInt APInt::extractBits(unsigned numBits, unsigned bitPosition) const { return Result.clearUnusedBits(); } +uint64_t APInt::extractBitsAsZExtValue(unsigned numBits, + unsigned bitPosition) const { + assert(numBits > 0 && "Can't extract zero bits"); + assert(bitPosition < BitWidth && (numBits + bitPosition) <= BitWidth && + "Illegal bit extraction"); + assert(numBits <= 64 && "Illegal bit extraction"); + + uint64_t maskBits = maskTrailingOnes(numBits); + if (isSingleWord()) + return (U.VAL >> bitPosition) & maskBits; + + unsigned loBit = whichBit(bitPosition); + unsigned loWord = whichWord(bitPosition); + unsigned hiWord = whichWord(bitPosition + numBits - 1); + if (loWord == hiWord) + return (U.pVal[loWord] >> loBit) & maskBits; + + static_assert(8 * sizeof(WordType) <= 64, "This code assumes only two words affected"); + unsigned wordBits = 8 * sizeof(WordType); + uint64_t retBits = U.pVal[loWord] >> loBit; + retBits |= U.pVal[hiWord] << (wordBits - loBit); + retBits &= maskBits; + return retBits; +} + unsigned APInt::getBitsNeeded(StringRef str, uint8_t radix) { assert(!str.empty() && "Invalid string length"); assert((radix == 10 || radix == 8 || radix == 16 || radix == 2 || diff --git a/lib/Support/ARMTargetParser.cpp b/lib/Support/ARMTargetParser.cpp index be948cfc95d4..ce5daa7fe58c 100644 --- a/lib/Support/ARMTargetParser.cpp +++ b/lib/Support/ARMTargetParser.cpp @@ -176,10 +176,8 @@ bool ARM::getFPUFeatures(unsigned FPUKind, std::vector &Features) { // exist). 
{"+fpregs", "-fpregs", FPUVersion::VFPV2, FPURestriction::SP_D16}, - {"+vfp2", "-vfp2", FPUVersion::VFPV2, FPURestriction::None}, - {"+vfp2d16", "-vfp2d16", FPUVersion::VFPV2, FPURestriction::D16}, - {"+vfp2d16sp", "-vfp2d16sp", FPUVersion::VFPV2, FPURestriction::SP_D16}, - {"+vfp2sp", "-vfp2sp", FPUVersion::VFPV2, FPURestriction::None}, + {"+vfp2", "-vfp2", FPUVersion::VFPV2, FPURestriction::D16}, + {"+vfp2sp", "-vfp2sp", FPUVersion::VFPV2, FPURestriction::SP_D16}, {"+vfp3", "-vfp3", FPUVersion::VFPV3, FPURestriction::None}, {"+vfp3d16", "-vfp3d16", FPUVersion::VFPV3, FPURestriction::D16}, {"+vfp3d16sp", "-vfp3d16sp", FPUVersion::VFPV3, FPURestriction::SP_D16}, @@ -195,7 +193,7 @@ bool ARM::getFPUFeatures(unsigned FPUKind, std::vector &Features) { {"+fp-armv8sp", "-fp-armv8sp", FPUVersion::VFPV5, FPURestriction::None}, {"+fullfp16", "-fullfp16", FPUVersion::VFPV5_FULLFP16, FPURestriction::SP_D16}, {"+fp64", "-fp64", FPUVersion::VFPV2, FPURestriction::D16}, - {"+d32", "-d32", FPUVersion::VFPV2, FPURestriction::None}, + {"+d32", "-d32", FPUVersion::VFPV3, FPURestriction::None}, }; for (const auto &Info: FPUFeatureInfoList) { diff --git a/lib/Support/CRC.cpp b/lib/Support/CRC.cpp index fd98f3a24003..7c008d3b599d 100644 --- a/lib/Support/CRC.cpp +++ b/lib/Support/CRC.cpp @@ -6,63 +6,94 @@ // //===----------------------------------------------------------------------===// // -// This file implements llvm::crc32 function. +// This file contains implementations of CRC functions. +// +// The implementation technique is the one mentioned in: +// D. V. Sarwate. 1988. Computation of cyclic redundancy checks via table +// look-up. Commun. ACM 31, 8 (August 1988) +// +// See also Ross N. Williams "A Painless Guide to CRC Error Detection +// Algorithms" (https://zlib.net/crc_v3.txt) or Hacker's Delight (2nd ed.) +// Chapter 14 (Figure 14-7 in particular) for how the algorithm works. // //===----------------------------------------------------------------------===// #include "llvm/Support/CRC.h" + +#include "llvm/ADT/ArrayRef.h" #include "llvm/Config/config.h" -#include "llvm/ADT/StringRef.h" -#include "llvm/Support/Threading.h" -#include using namespace llvm; #if LLVM_ENABLE_ZLIB == 0 || !HAVE_ZLIB_H -using CRC32Table = std::array; - -static void initCRC32Table(CRC32Table *Tbl) { - auto Shuffle = [](uint32_t V) { - return (V & 1) ? 
(V >> 1) ^ 0xEDB88320U : V >> 1; - }; - - for (size_t I = 0; I < Tbl->size(); ++I) { - uint32_t V = Shuffle(I); - V = Shuffle(V); - V = Shuffle(V); - V = Shuffle(V); - V = Shuffle(V); - V = Shuffle(V); - V = Shuffle(V); - (*Tbl)[I] = Shuffle(V); - } -} -uint32_t llvm::crc32(uint32_t CRC, StringRef S) { - static llvm::once_flag InitFlag; - static CRC32Table Tbl; - llvm::call_once(InitFlag, initCRC32Table, &Tbl); +static const uint32_t CRCTable[256] = { + 0x00000000, 0x77073096, 0xee0e612c, 0x990951ba, 0x076dc419, 0x706af48f, + 0xe963a535, 0x9e6495a3, 0x0edb8832, 0x79dcb8a4, 0xe0d5e91e, 0x97d2d988, + 0x09b64c2b, 0x7eb17cbd, 0xe7b82d07, 0x90bf1d91, 0x1db71064, 0x6ab020f2, + 0xf3b97148, 0x84be41de, 0x1adad47d, 0x6ddde4eb, 0xf4d4b551, 0x83d385c7, + 0x136c9856, 0x646ba8c0, 0xfd62f97a, 0x8a65c9ec, 0x14015c4f, 0x63066cd9, + 0xfa0f3d63, 0x8d080df5, 0x3b6e20c8, 0x4c69105e, 0xd56041e4, 0xa2677172, + 0x3c03e4d1, 0x4b04d447, 0xd20d85fd, 0xa50ab56b, 0x35b5a8fa, 0x42b2986c, + 0xdbbbc9d6, 0xacbcf940, 0x32d86ce3, 0x45df5c75, 0xdcd60dcf, 0xabd13d59, + 0x26d930ac, 0x51de003a, 0xc8d75180, 0xbfd06116, 0x21b4f4b5, 0x56b3c423, + 0xcfba9599, 0xb8bda50f, 0x2802b89e, 0x5f058808, 0xc60cd9b2, 0xb10be924, + 0x2f6f7c87, 0x58684c11, 0xc1611dab, 0xb6662d3d, 0x76dc4190, 0x01db7106, + 0x98d220bc, 0xefd5102a, 0x71b18589, 0x06b6b51f, 0x9fbfe4a5, 0xe8b8d433, + 0x7807c9a2, 0x0f00f934, 0x9609a88e, 0xe10e9818, 0x7f6a0dbb, 0x086d3d2d, + 0x91646c97, 0xe6635c01, 0x6b6b51f4, 0x1c6c6162, 0x856530d8, 0xf262004e, + 0x6c0695ed, 0x1b01a57b, 0x8208f4c1, 0xf50fc457, 0x65b0d9c6, 0x12b7e950, + 0x8bbeb8ea, 0xfcb9887c, 0x62dd1ddf, 0x15da2d49, 0x8cd37cf3, 0xfbd44c65, + 0x4db26158, 0x3ab551ce, 0xa3bc0074, 0xd4bb30e2, 0x4adfa541, 0x3dd895d7, + 0xa4d1c46d, 0xd3d6f4fb, 0x4369e96a, 0x346ed9fc, 0xad678846, 0xda60b8d0, + 0x44042d73, 0x33031de5, 0xaa0a4c5f, 0xdd0d7cc9, 0x5005713c, 0x270241aa, + 0xbe0b1010, 0xc90c2086, 0x5768b525, 0x206f85b3, 0xb966d409, 0xce61e49f, + 0x5edef90e, 0x29d9c998, 0xb0d09822, 0xc7d7a8b4, 0x59b33d17, 0x2eb40d81, + 0xb7bd5c3b, 0xc0ba6cad, 0xedb88320, 0x9abfb3b6, 0x03b6e20c, 0x74b1d29a, + 0xead54739, 0x9dd277af, 0x04db2615, 0x73dc1683, 0xe3630b12, 0x94643b84, + 0x0d6d6a3e, 0x7a6a5aa8, 0xe40ecf0b, 0x9309ff9d, 0x0a00ae27, 0x7d079eb1, + 0xf00f9344, 0x8708a3d2, 0x1e01f268, 0x6906c2fe, 0xf762575d, 0x806567cb, + 0x196c3671, 0x6e6b06e7, 0xfed41b76, 0x89d32be0, 0x10da7a5a, 0x67dd4acc, + 0xf9b9df6f, 0x8ebeeff9, 0x17b7be43, 0x60b08ed5, 0xd6d6a3e8, 0xa1d1937e, + 0x38d8c2c4, 0x4fdff252, 0xd1bb67f1, 0xa6bc5767, 0x3fb506dd, 0x48b2364b, + 0xd80d2bda, 0xaf0a1b4c, 0x36034af6, 0x41047a60, 0xdf60efc3, 0xa867df55, + 0x316e8eef, 0x4669be79, 0xcb61b38c, 0xbc66831a, 0x256fd2a0, 0x5268e236, + 0xcc0c7795, 0xbb0b4703, 0x220216b9, 0x5505262f, 0xc5ba3bbe, 0xb2bd0b28, + 0x2bb45a92, 0x5cb36a04, 0xc2d7ffa7, 0xb5d0cf31, 0x2cd99e8b, 0x5bdeae1d, + 0x9b64c2b0, 0xec63f226, 0x756aa39c, 0x026d930a, 0x9c0906a9, 0xeb0e363f, + 0x72076785, 0x05005713, 0x95bf4a82, 0xe2b87a14, 0x7bb12bae, 0x0cb61b38, + 0x92d28e9b, 0xe5d5be0d, 0x7cdcefb7, 0x0bdbdf21, 0x86d3d2d4, 0xf1d4e242, + 0x68ddb3f8, 0x1fda836e, 0x81be16cd, 0xf6b9265b, 0x6fb077e1, 0x18b74777, + 0x88085ae6, 0xff0f6a70, 0x66063bca, 0x11010b5c, 0x8f659eff, 0xf862ae69, + 0x616bffd3, 0x166ccf45, 0xa00ae278, 0xd70dd2ee, 0x4e048354, 0x3903b3c2, + 0xa7672661, 0xd06016f7, 0x4969474d, 0x3e6e77db, 0xaed16a4a, 0xd9d65adc, + 0x40df0b66, 0x37d83bf0, 0xa9bcae53, 0xdebb9ec5, 0x47b2cf7f, 0x30b5ffe9, + 0xbdbdf21c, 0xcabac28a, 0x53b39330, 0x24b4a3a6, 0xbad03605, 0xcdd70693, + 0x54de5729, 0x23d967bf, 0xb3667a2e, 0xc4614ab8, 0x5d681b02, 
0x2a6f2b94, + 0xb40bbe37, 0xc30c8ea1, 0x5a05df1b, 0x2d02ef8d}; - const uint8_t *P = reinterpret_cast(S.data()); - size_t Len = S.size(); +uint32_t llvm::crc32(uint32_t CRC, ArrayRef Data) { CRC ^= 0xFFFFFFFFU; - for (; Len >= 8; Len -= 8) { - CRC = Tbl[(CRC ^ *P++) & 0xFF] ^ (CRC >> 8); - CRC = Tbl[(CRC ^ *P++) & 0xFF] ^ (CRC >> 8); - CRC = Tbl[(CRC ^ *P++) & 0xFF] ^ (CRC >> 8); - CRC = Tbl[(CRC ^ *P++) & 0xFF] ^ (CRC >> 8); - CRC = Tbl[(CRC ^ *P++) & 0xFF] ^ (CRC >> 8); - CRC = Tbl[(CRC ^ *P++) & 0xFF] ^ (CRC >> 8); - CRC = Tbl[(CRC ^ *P++) & 0xFF] ^ (CRC >> 8); - CRC = Tbl[(CRC ^ *P++) & 0xFF] ^ (CRC >> 8); + for (uint8_t Byte : Data) { + int TableIdx = (CRC ^ Byte) & 0xff; + CRC = CRCTable[TableIdx] ^ (CRC >> 8); } - while (Len--) - CRC = Tbl[(CRC ^ *P++) & 0xFF] ^ (CRC >> 8); return CRC ^ 0xFFFFFFFFU; } + #else + #include -uint32_t llvm::crc32(uint32_t CRC, StringRef S) { - return ::crc32(CRC, (const Bytef *)S.data(), S.size()); +uint32_t llvm::crc32(uint32_t CRC, ArrayRef Data) { + return ::crc32(CRC, (const Bytef *)Data.data(), Data.size()); } + #endif + +uint32_t llvm::crc32(ArrayRef Data) { return crc32(0, Data); } + +void JamCRC::update(ArrayRef Data) { + CRC ^= 0xFFFFFFFFU; // Undo CRC-32 Init. + CRC = crc32(CRC, Data); + CRC ^= 0xFFFFFFFFU; // Undo CRC-32 XorOut. +} diff --git a/lib/Support/CachePruning.cpp b/lib/Support/CachePruning.cpp index 9813eec0e433..7a2f6c53435a 100644 --- a/lib/Support/CachePruning.cpp +++ b/lib/Support/CachePruning.cpp @@ -45,7 +45,7 @@ struct FileInfo { /// interval option. static void writeTimestampFile(StringRef TimestampFile) { std::error_code EC; - raw_fd_ostream Out(TimestampFile.str(), EC, sys::fs::F_None); + raw_fd_ostream Out(TimestampFile.str(), EC, sys::fs::OF_None); } static Expected parseDuration(StringRef Duration) { diff --git a/lib/Support/CodeGenCoverage.cpp b/lib/Support/CodeGenCoverage.cpp index f39eb7533b43..2db4193ce382 100644 --- a/lib/Support/CodeGenCoverage.cpp +++ b/lib/Support/CodeGenCoverage.cpp @@ -101,9 +101,9 @@ bool CodeGenCoverage::emit(StringRef CoveragePrefix, std::string CoverageFilename = (CoveragePrefix + Pid).str(); std::error_code EC; - sys::fs::OpenFlags OpenFlags = sys::fs::F_Append; + sys::fs::OpenFlags OpenFlags = sys::fs::OF_Append; std::unique_ptr CoverageFile = - llvm::make_unique(CoverageFilename, EC, OpenFlags); + std::make_unique(CoverageFilename, EC, OpenFlags); if (EC) return false; diff --git a/lib/Support/CommandLine.cpp b/lib/Support/CommandLine.cpp index 25510fa58ff5..620f7ffd4c9f 100644 --- a/lib/Support/CommandLine.cpp +++ b/lib/Support/CommandLine.cpp @@ -692,7 +692,7 @@ static inline bool ProvideOption(Option *Handler, StringRef ArgName, return false; } -static bool ProvidePositionalOption(Option *Handler, StringRef Arg, int i) { +bool llvm::cl::ProvidePositionalOption(Option *Handler, StringRef Arg, int i) { int Dummy = i; return ProvideOption(Handler, Handler->ArgStr, Arg, 0, nullptr, Dummy); } diff --git a/lib/Support/CrashRecoveryContext.cpp b/lib/Support/CrashRecoveryContext.cpp index c2459256f8fe..9d13fce9cc52 100644 --- a/lib/Support/CrashRecoveryContext.cpp +++ b/lib/Support/CrashRecoveryContext.cpp @@ -10,8 +10,8 @@ #include "llvm/Config/llvm-config.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/ManagedStatic.h" -#include "llvm/Support/Mutex.h" #include "llvm/Support/ThreadLocal.h" +#include #include using namespace llvm; @@ -71,7 +71,7 @@ public: } -static ManagedStatic gCrashRecoveryContextMutex; +static ManagedStatic gCrashRecoveryContextMutex; static bool 
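The rewritten CRC.cpp above replaces the lazily built table with a precomputed Sarwate lookup table and exposes a byte-oriented llvm::crc32 plus the stateful JamCRC (CRC-32 without the final xor). A short usage sketch; the one-argument crc32 overload and JamCRC::update come from this patch, while JamCRC's default constructor and getCRC() accessor are assumed from the accompanying CRC.h header:

#include "llvm/ADT/ArrayRef.h"
#include "llvm/Support/CRC.h"
#include <cstdint>

uint32_t checksumExamples() {
  const uint8_t Bytes[] = {'h', 'e', 'l', 'l', 'o'};
  llvm::ArrayRef<uint8_t> Data(Bytes);

  // One-shot CRC-32; init and final xor are handled inside.
  uint32_t Whole = llvm::crc32(Data);

  // Incremental CRC-32 of the same bytes via the seeded overload; because the
  // init/xor-out are undone and redone on each call, chaining matches the
  // one-shot result, as with zlib's crc32().
  uint32_t Incremental = llvm::crc32(0, Data.take_front(2));
  Incremental = llvm::crc32(Incremental, Data.drop_front(2));

  // JamCRC: same table, but no final xor applied to the running value.
  llvm::JamCRC JC;        // default-initialized state (assumed from CRC.h)
  JC.update(Data);
  return Whole ^ Incremental ^ JC.getCRC();
}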
gCrashRecoveryEnabled = false; static ManagedStatic> @@ -116,7 +116,7 @@ CrashRecoveryContext *CrashRecoveryContext::GetCurrent() { } void CrashRecoveryContext::Enable() { - sys::ScopedLock L(*gCrashRecoveryContextMutex); + std::lock_guard L(*gCrashRecoveryContextMutex); // FIXME: Shouldn't this be a refcount or something? if (gCrashRecoveryEnabled) return; @@ -125,7 +125,7 @@ void CrashRecoveryContext::Enable() { } void CrashRecoveryContext::Disable() { - sys::ScopedLock L(*gCrashRecoveryContextMutex); + std::lock_guard L(*gCrashRecoveryContextMutex); if (!gCrashRecoveryEnabled) return; gCrashRecoveryEnabled = false; diff --git a/lib/Support/DataExtractor.cpp b/lib/Support/DataExtractor.cpp index 673bbb4d06f4..a98297cdb35f 100644 --- a/lib/Support/DataExtractor.cpp +++ b/lib/Support/DataExtractor.cpp @@ -7,111 +7,137 @@ //===----------------------------------------------------------------------===// #include "llvm/Support/DataExtractor.h" +#include "llvm/Support/Errc.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/Host.h" -#include "llvm/Support/SwapByteOrder.h" #include "llvm/Support/LEB128.h" +#include "llvm/Support/SwapByteOrder.h" + using namespace llvm; +static void unexpectedEndReached(Error *E) { + if (E) + *E = createStringError(errc::illegal_byte_sequence, + "unexpected end of data"); +} + +static bool isError(Error *E) { return E && *E; } + template -static T getU(uint32_t *offset_ptr, const DataExtractor *de, - bool isLittleEndian, const char *Data) { +static T getU(uint64_t *offset_ptr, const DataExtractor *de, + bool isLittleEndian, const char *Data, llvm::Error *Err) { + ErrorAsOutParameter ErrAsOut(Err); T val = 0; - uint32_t offset = *offset_ptr; - if (de->isValidOffsetForDataOfSize(offset, sizeof(val))) { - std::memcpy(&val, &Data[offset], sizeof(val)); - if (sys::IsLittleEndianHost != isLittleEndian) - sys::swapByteOrder(val); - - // Advance the offset - *offset_ptr += sizeof(val); + if (isError(Err)) + return val; + + uint64_t offset = *offset_ptr; + if (!de->isValidOffsetForDataOfSize(offset, sizeof(T))) { + unexpectedEndReached(Err); + return val; } + std::memcpy(&val, &Data[offset], sizeof(val)); + if (sys::IsLittleEndianHost != isLittleEndian) + sys::swapByteOrder(val); + + // Advance the offset + *offset_ptr += sizeof(val); return val; } template -static T *getUs(uint32_t *offset_ptr, T *dst, uint32_t count, - const DataExtractor *de, bool isLittleEndian, const char *Data){ - uint32_t offset = *offset_ptr; - - if (count > 0 && de->isValidOffsetForDataOfSize(offset, sizeof(*dst)*count)) { - for (T *value_ptr = dst, *end = dst + count; value_ptr != end; - ++value_ptr, offset += sizeof(*dst)) - *value_ptr = getU(offset_ptr, de, isLittleEndian, Data); - // Advance the offset - *offset_ptr = offset; - // Return a non-NULL pointer to the converted data as an indicator of - // success - return dst; +static T *getUs(uint64_t *offset_ptr, T *dst, uint32_t count, + const DataExtractor *de, bool isLittleEndian, const char *Data, + llvm::Error *Err) { + ErrorAsOutParameter ErrAsOut(Err); + if (isError(Err)) + return nullptr; + + uint64_t offset = *offset_ptr; + + if (!de->isValidOffsetForDataOfSize(offset, sizeof(*dst) * count)) { + unexpectedEndReached(Err); + return nullptr; } - return nullptr; + for (T *value_ptr = dst, *end = dst + count; value_ptr != end; + ++value_ptr, offset += sizeof(*dst)) + *value_ptr = getU(offset_ptr, de, isLittleEndian, Data, Err); + // Advance the offset + *offset_ptr = offset; + // Return a non-NULL pointer to the converted 
data as an indicator of + // success + return dst; } -uint8_t DataExtractor::getU8(uint32_t *offset_ptr) const { - return getU(offset_ptr, this, IsLittleEndian, Data.data()); +uint8_t DataExtractor::getU8(uint64_t *offset_ptr, llvm::Error *Err) const { + return getU(offset_ptr, this, IsLittleEndian, Data.data(), Err); } uint8_t * -DataExtractor::getU8(uint32_t *offset_ptr, uint8_t *dst, uint32_t count) const { +DataExtractor::getU8(uint64_t *offset_ptr, uint8_t *dst, uint32_t count) const { return getUs(offset_ptr, dst, count, this, IsLittleEndian, - Data.data()); + Data.data(), nullptr); } +uint8_t *DataExtractor::getU8(Cursor &C, uint8_t *Dst, uint32_t Count) const { + return getUs(&C.Offset, Dst, Count, this, IsLittleEndian, + Data.data(), &C.Err); +} -uint16_t DataExtractor::getU16(uint32_t *offset_ptr) const { - return getU(offset_ptr, this, IsLittleEndian, Data.data()); +uint16_t DataExtractor::getU16(uint64_t *offset_ptr, llvm::Error *Err) const { + return getU(offset_ptr, this, IsLittleEndian, Data.data(), Err); } -uint16_t *DataExtractor::getU16(uint32_t *offset_ptr, uint16_t *dst, +uint16_t *DataExtractor::getU16(uint64_t *offset_ptr, uint16_t *dst, uint32_t count) const { return getUs(offset_ptr, dst, count, this, IsLittleEndian, - Data.data()); + Data.data(), nullptr); } -uint32_t DataExtractor::getU24(uint32_t *offset_ptr) const { +uint32_t DataExtractor::getU24(uint64_t *offset_ptr) const { uint24_t ExtractedVal = - getU(offset_ptr, this, IsLittleEndian, Data.data()); + getU(offset_ptr, this, IsLittleEndian, Data.data(), nullptr); // The 3 bytes are in the correct byte order for the host. return ExtractedVal.getAsUint32(sys::IsLittleEndianHost); } -uint32_t DataExtractor::getU32(uint32_t *offset_ptr) const { - return getU(offset_ptr, this, IsLittleEndian, Data.data()); +uint32_t DataExtractor::getU32(uint64_t *offset_ptr, llvm::Error *Err) const { + return getU(offset_ptr, this, IsLittleEndian, Data.data(), Err); } -uint32_t *DataExtractor::getU32(uint32_t *offset_ptr, uint32_t *dst, +uint32_t *DataExtractor::getU32(uint64_t *offset_ptr, uint32_t *dst, uint32_t count) const { return getUs(offset_ptr, dst, count, this, IsLittleEndian, - Data.data()); + Data.data(), nullptr); } -uint64_t DataExtractor::getU64(uint32_t *offset_ptr) const { - return getU(offset_ptr, this, IsLittleEndian, Data.data()); +uint64_t DataExtractor::getU64(uint64_t *offset_ptr, llvm::Error *Err) const { + return getU(offset_ptr, this, IsLittleEndian, Data.data(), Err); } -uint64_t *DataExtractor::getU64(uint32_t *offset_ptr, uint64_t *dst, +uint64_t *DataExtractor::getU64(uint64_t *offset_ptr, uint64_t *dst, uint32_t count) const { return getUs(offset_ptr, dst, count, this, IsLittleEndian, - Data.data()); + Data.data(), nullptr); } -uint64_t -DataExtractor::getUnsigned(uint32_t *offset_ptr, uint32_t byte_size) const { +uint64_t DataExtractor::getUnsigned(uint64_t *offset_ptr, uint32_t byte_size, + llvm::Error *Err) const { switch (byte_size) { case 1: - return getU8(offset_ptr); + return getU8(offset_ptr, Err); case 2: - return getU16(offset_ptr); + return getU16(offset_ptr, Err); case 4: - return getU32(offset_ptr); + return getU32(offset_ptr, Err); case 8: - return getU64(offset_ptr); + return getU64(offset_ptr, Err); } llvm_unreachable("getUnsigned unhandled case!"); } int64_t -DataExtractor::getSigned(uint32_t *offset_ptr, uint32_t byte_size) const { +DataExtractor::getSigned(uint64_t *offset_ptr, uint32_t byte_size) const { switch (byte_size) { case 1: return (int8_t)getU8(offset_ptr); @@ -125,8 
+151,8 @@ DataExtractor::getSigned(uint32_t *offset_ptr, uint32_t byte_size) const { llvm_unreachable("getSigned unhandled case!"); } -const char *DataExtractor::getCStr(uint32_t *offset_ptr) const { - uint32_t offset = *offset_ptr; +const char *DataExtractor::getCStr(uint64_t *offset_ptr) const { + uint64_t offset = *offset_ptr; StringRef::size_type pos = Data.find('\0', offset); if (pos != StringRef::npos) { *offset_ptr = pos + 1; @@ -135,31 +161,38 @@ const char *DataExtractor::getCStr(uint32_t *offset_ptr) const { return nullptr; } -StringRef DataExtractor::getCStrRef(uint32_t *OffsetPtr) const { - uint32_t Start = *OffsetPtr; +StringRef DataExtractor::getCStrRef(uint64_t *offset_ptr) const { + uint64_t Start = *offset_ptr; StringRef::size_type Pos = Data.find('\0', Start); if (Pos != StringRef::npos) { - *OffsetPtr = Pos + 1; + *offset_ptr = Pos + 1; return StringRef(Data.data() + Start, Pos - Start); } return StringRef(); } -uint64_t DataExtractor::getULEB128(uint32_t *offset_ptr) const { +uint64_t DataExtractor::getULEB128(uint64_t *offset_ptr, + llvm::Error *Err) const { assert(*offset_ptr <= Data.size()); + ErrorAsOutParameter ErrAsOut(Err); + if (isError(Err)) + return 0; const char *error; unsigned bytes_read; uint64_t result = decodeULEB128( reinterpret_cast(Data.data() + *offset_ptr), &bytes_read, reinterpret_cast(Data.data() + Data.size()), &error); - if (error) + if (error) { + if (Err) + *Err = createStringError(errc::illegal_byte_sequence, error); return 0; + } *offset_ptr += bytes_read; return result; } -int64_t DataExtractor::getSLEB128(uint32_t *offset_ptr) const { +int64_t DataExtractor::getSLEB128(uint64_t *offset_ptr) const { assert(*offset_ptr <= Data.size()); const char *error; @@ -172,3 +205,14 @@ int64_t DataExtractor::getSLEB128(uint32_t *offset_ptr) const { *offset_ptr += bytes_read; return result; } + +void DataExtractor::skip(Cursor &C, uint64_t Length) const { + ErrorAsOutParameter ErrAsOut(&C.Err); + if (isError(&C.Err)) + return; + + if (isValidOffsetForDataOfSize(C.Offset, Length)) + C.Offset += Length; + else + unexpectedEndReached(&C.Err); +} diff --git a/lib/Support/Error.cpp b/lib/Support/Error.cpp index 72bc08af2ddb..9ea08c37478e 100644 --- a/lib/Support/Error.cpp +++ b/lib/Support/Error.cpp @@ -87,7 +87,7 @@ std::error_code FileError::convertToErrorCode() const { Error errorCodeToError(std::error_code EC) { if (!EC) return Error::success(); - return Error(llvm::make_unique(ECError(EC))); + return Error(std::make_unique(ECError(EC))); } std::error_code errorToErrorCode(Error Err) { @@ -167,18 +167,3 @@ void LLVMDisposeErrorMessage(char *ErrMsg) { delete[] ErrMsg; } LLVMErrorTypeId LLVMGetStringErrorTypeId() { return reinterpret_cast(&StringError::ID); } - -#ifndef _MSC_VER -namespace llvm { - -// One of these two variables will be referenced by a symbol defined in -// llvm-config.h. We provide a link-time (or load time for DSO) failure when -// there is a mismatch in the build configuration of the API client and LLVM. 
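The DataExtractor changes above widen offsets to uint64_t and add Error out-parameter overloads (and a Cursor convenience wrapper, defined in the public header, that bundles an offset with an Error). Several reads can share one Error slot; once it is set, later reads become no-ops returning 0, so a caller checks once at the end. A minimal sketch using only the overloads visible in this hunk:

#include "llvm/ADT/StringRef.h"
#include "llvm/Support/DataExtractor.h"
#include "llvm/Support/Error.h"

// Parse a tiny hypothetical header: u16 version, u64 length, ULEB128 count.
llvm::Error readHeader(llvm::StringRef Bytes) {
  llvm::DataExtractor DE(Bytes, /*IsLittleEndian=*/true, /*AddressSize=*/8);
  uint64_t Offset = 0;
  llvm::Error Err = llvm::Error::success();

  uint16_t Version = DE.getU16(&Offset, &Err);
  uint64_t Length = DE.getU64(&Offset, &Err);
  uint64_t Count = DE.getULEB128(&Offset, &Err);
  (void)Version; (void)Length; (void)Count;

  return Err; // Holds the first failure, if any read ran off the end.
}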
-#if LLVM_ENABLE_ABI_BREAKING_CHECKS -int EnableABIBreakingChecks; -#else -int DisableABIBreakingChecks; -#endif - -} // end namespace llvm -#endif diff --git a/lib/Support/FileCheck.cpp b/lib/Support/FileCheck.cpp index e0f17787bdf8..841e406a7b69 100644 --- a/lib/Support/FileCheck.cpp +++ b/lib/Support/FileCheck.cpp @@ -14,31 +14,22 @@ //===----------------------------------------------------------------------===// #include "llvm/Support/FileCheck.h" +#include "FileCheckImpl.h" #include "llvm/ADT/StringSet.h" +#include "llvm/ADT/Twine.h" #include "llvm/Support/FormatVariadic.h" #include #include -#include #include #include using namespace llvm; -void FileCheckNumericVariable::setValue(uint64_t NewValue) { - assert(!Value && "Overwriting numeric variable's value is not allowed"); - Value = NewValue; -} - -void FileCheckNumericVariable::clearValue() { - if (!Value) - return; - Value = None; -} - Expected FileCheckNumericVariableUse::eval() const { Optional Value = NumericVariable->getValue(); if (Value) return *Value; + return make_error(Name); } @@ -109,7 +100,7 @@ FileCheckPattern::parseVariable(StringRef &Str, const SourceMgr &SM) { // StringRef holding all characters considered as horizontal whitespaces by // FileCheck input canonicalization. -StringRef SpaceChars = " \t"; +constexpr StringLiteral SpaceChars = " \t"; // Parsing helper function that strips the first character in S and returns it. static char popFront(StringRef &S) { @@ -159,7 +150,9 @@ FileCheckPattern::parseNumericVariableDefinition( Expected> FileCheckPattern::parseNumericVariableUse(StringRef Name, bool IsPseudo, - const SourceMgr &SM) const { + Optional LineNumber, + FileCheckPatternContext *Context, + const SourceMgr &SM) { if (IsPseudo && !Name.equals("@LINE")) return FileCheckErrorDiagnostic::get( SM, Name, "invalid pseudo numeric variable '" + Name + "'"); @@ -185,21 +178,25 @@ FileCheckPattern::parseNumericVariableUse(StringRef Name, bool IsPseudo, if (DefLineNumber && LineNumber && *DefLineNumber == *LineNumber) return FileCheckErrorDiagnostic::get( SM, Name, - "numeric variable '" + Name + "' defined on the same line as used"); + "numeric variable '" + Name + + "' defined earlier in the same CHECK directive"); - return llvm::make_unique(Name, NumericVariable); + return std::make_unique(Name, NumericVariable); } Expected> FileCheckPattern::parseNumericOperand(StringRef &Expr, AllowedOperand AO, - const SourceMgr &SM) const { + Optional LineNumber, + FileCheckPatternContext *Context, + const SourceMgr &SM) { if (AO == AllowedOperand::LineVar || AO == AllowedOperand::Any) { // Try to parse as a numeric variable use. Expected ParseVarResult = parseVariable(Expr, SM); if (ParseVarResult) return parseNumericVariableUse(ParseVarResult->Name, - ParseVarResult->IsPseudo, SM); + ParseVarResult->IsPseudo, LineNumber, + Context, SM); if (AO == AllowedOperand::LineVar) return ParseVarResult.takeError(); // Ignore the error and retry parsing as a literal. @@ -209,7 +206,7 @@ FileCheckPattern::parseNumericOperand(StringRef &Expr, AllowedOperand AO, // Otherwise, parse it as a literal. 
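Stepping back briefly: the block removed from Error.cpp above is the same one that the new ABIBreak.cpp provides, so the ABI-configuration check no longer depends on Error.cpp being linked in. The library defines exactly one of the two symbols, and a header seen by every client emits a reference to whichever symbol matches the client's configuration, turning a mismatch into an undefined-symbol error at link or load time. A minimal sketch of that pattern with hypothetical names (not the actual contents of abi-breaking.h):

// Library side: exactly one symbol is defined, depending on the library's own
// build configuration.
#ifdef EXAMPLE_ABI_CHECKS
int ExampleEnableChecks;
#else
int ExampleDisableChecks;
#endif

// Client side, in a header every user includes: reference whichever symbol
// matches the client's configuration. On a mismatch the reference has no
// definition and linking fails immediately. Weak linkage keeps the anchor
// from colliding across the client's translation units.
#ifdef EXAMPLE_ABI_CHECKS
extern int ExampleEnableChecks;
__attribute__((weak)) int *ExampleAbiAnchor = &ExampleEnableChecks;
#else
extern int ExampleDisableChecks;
__attribute__((weak)) int *ExampleAbiAnchor = &ExampleDisableChecks;
#endif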
uint64_t LiteralValue; if (!Expr.consumeInteger(/*Radix=*/10, LiteralValue)) - return llvm::make_unique(LiteralValue); + return std::make_unique(LiteralValue); return FileCheckErrorDiagnostic::get(SM, Expr, "invalid operand format '" + Expr + "'"); @@ -223,10 +220,10 @@ static uint64_t sub(uint64_t LeftOp, uint64_t RightOp) { return LeftOp - RightOp; } -Expected> -FileCheckPattern::parseBinop(StringRef &Expr, - std::unique_ptr LeftOp, - bool IsLegacyLineExpr, const SourceMgr &SM) const { +Expected> FileCheckPattern::parseBinop( + StringRef &Expr, std::unique_ptr LeftOp, + bool IsLegacyLineExpr, Optional LineNumber, + FileCheckPatternContext *Context, const SourceMgr &SM) { Expr = Expr.ltrim(SpaceChars); if (Expr.empty()) return std::move(LeftOp); @@ -257,12 +254,12 @@ FileCheckPattern::parseBinop(StringRef &Expr, AllowedOperand AO = IsLegacyLineExpr ? AllowedOperand::Literal : AllowedOperand::Any; Expected> RightOpResult = - parseNumericOperand(Expr, AO, SM); + parseNumericOperand(Expr, AO, LineNumber, Context, SM); if (!RightOpResult) return RightOpResult; Expr = Expr.ltrim(SpaceChars); - return llvm::make_unique(EvalBinop, std::move(LeftOp), + return std::make_unique(EvalBinop, std::move(LeftOp), std::move(*RightOpResult)); } @@ -270,56 +267,60 @@ Expected> FileCheckPattern::parseNumericSubstitutionBlock( StringRef Expr, Optional &DefinedNumericVariable, - bool IsLegacyLineExpr, const SourceMgr &SM) const { - // Parse the numeric variable definition. + bool IsLegacyLineExpr, Optional LineNumber, + FileCheckPatternContext *Context, const SourceMgr &SM) { + std::unique_ptr ExpressionAST = nullptr; + StringRef DefExpr = StringRef(); DefinedNumericVariable = None; + // Save variable definition expression if any. size_t DefEnd = Expr.find(':'); if (DefEnd != StringRef::npos) { - StringRef DefExpr = Expr.substr(0, DefEnd); - StringRef UseExpr = Expr.substr(DefEnd + 1); + DefExpr = Expr.substr(0, DefEnd); + Expr = Expr.substr(DefEnd + 1); + } - UseExpr = UseExpr.ltrim(SpaceChars); - if (!UseExpr.empty()) - return FileCheckErrorDiagnostic::get( - SM, UseExpr, - "unexpected string after variable definition: '" + UseExpr + "'"); + // Parse the expression itself. + Expr = Expr.ltrim(SpaceChars); + if (!Expr.empty()) { + // The first operand in a legacy @LINE expression is always the @LINE + // pseudo variable. + AllowedOperand AO = + IsLegacyLineExpr ? AllowedOperand::LineVar : AllowedOperand::Any; + Expected> ParseResult = + parseNumericOperand(Expr, AO, LineNumber, Context, SM); + while (ParseResult && !Expr.empty()) { + ParseResult = parseBinop(Expr, std::move(*ParseResult), IsLegacyLineExpr, + LineNumber, Context, SM); + // Legacy @LINE expressions only allow 2 operands. + if (ParseResult && IsLegacyLineExpr && !Expr.empty()) + return FileCheckErrorDiagnostic::get( + SM, Expr, + "unexpected characters at end of expression '" + Expr + "'"); + } + if (!ParseResult) + return ParseResult; + ExpressionAST = std::move(*ParseResult); + } + // Parse the numeric variable definition. + if (DefEnd != StringRef::npos) { DefExpr = DefExpr.ltrim(SpaceChars); Expected ParseResult = parseNumericVariableDefinition(DefExpr, Context, LineNumber, SM); + if (!ParseResult) return ParseResult.takeError(); DefinedNumericVariable = *ParseResult; - - return nullptr; } - // Parse the expression itself. - Expr = Expr.ltrim(SpaceChars); - // The first operand in a legacy @LINE expression is always the @LINE pseudo - // variable. - AllowedOperand AO = - IsLegacyLineExpr ? 
AllowedOperand::LineVar : AllowedOperand::Any; - Expected> ParseResult = - parseNumericOperand(Expr, AO, SM); - while (ParseResult && !Expr.empty()) { - ParseResult = - parseBinop(Expr, std::move(*ParseResult), IsLegacyLineExpr, SM); - // Legacy @LINE expressions only allow 2 operands. - if (ParseResult && IsLegacyLineExpr && !Expr.empty()) - return FileCheckErrorDiagnostic::get( - SM, Expr, - "unexpected characters at end of expression '" + Expr + "'"); - } - if (!ParseResult) - return ParseResult; - return std::move(*ParseResult); + return std::move(ExpressionAST); } bool FileCheckPattern::parsePattern(StringRef PatternStr, StringRef Prefix, SourceMgr &SM, const FileCheckRequest &Req) { bool MatchFullLinesHere = Req.MatchFullLines && CheckTy != Check::CheckNot; + IgnoreCase = Req.IgnoreCase; PatternLoc = SMLoc::getFromPointer(PatternStr.data()); @@ -396,14 +397,15 @@ bool FileCheckPattern::parsePattern(StringRef PatternStr, StringRef Prefix, continue; } - // String and numeric substitution blocks. String substitution blocks come + // String and numeric substitution blocks. Pattern substitution blocks come // in two forms: [[foo:.*]] and [[foo]]. The former matches .* (or some // other regex) and assigns it to the string variable 'foo'. The latter - // substitutes foo's value. Numeric substitution blocks work the same way - // as string ones, but start with a '#' sign after the double brackets. - // Both string and numeric variable names must satisfy the regular - // expression "[a-zA-Z_][0-9a-zA-Z_]*" to be valid, as this helps catch - // some common errors. + // substitutes foo's value. Numeric substitution blocks recognize the same + // form as string ones, but start with a '#' sign after the double + // brackets. They also accept a combined form which sets a numeric variable + // to the evaluation of an expression. Both string and numeric variable + // names must satisfy the regular expression "[a-zA-Z_][0-9a-zA-Z_]*" to be + // valid, as this helps catch some common errors. if (PatternStr.startswith("[[")) { StringRef UnparsedPatternStr = PatternStr.substr(2); // Find the closing bracket pair ending the match. End is going to be an @@ -424,6 +426,7 @@ bool FileCheckPattern::parsePattern(StringRef PatternStr, StringRef Prefix, PatternStr = UnparsedPatternStr.substr(End + 2); bool IsDefinition = false; + bool SubstNeeded = false; // Whether the substitution block is a legacy use of @LINE with string // substitution block syntax. 
bool IsLegacyLineExpr = false; @@ -454,6 +457,7 @@ bool FileCheckPattern::parsePattern(StringRef PatternStr, StringRef Prefix, bool IsPseudo = ParseVarResult->IsPseudo; IsDefinition = (VarEndIdx != StringRef::npos); + SubstNeeded = !IsDefinition; if (IsDefinition) { if ((IsPseudo || !MatchStr.consume_front(":"))) { SM.PrintMessage(SMLoc::getFromPointer(Name.data()), @@ -488,22 +492,61 @@ bool FileCheckPattern::parsePattern(StringRef PatternStr, StringRef Prefix, if (IsNumBlock) { Expected> ParseResult = parseNumericSubstitutionBlock(MatchStr, DefinedNumericVariable, - IsLegacyLineExpr, SM); + IsLegacyLineExpr, LineNumber, Context, + SM); if (!ParseResult) { logAllUnhandledErrors(ParseResult.takeError(), errs()); return true; } ExpressionAST = std::move(*ParseResult); + SubstNeeded = ExpressionAST != nullptr; if (DefinedNumericVariable) { IsDefinition = true; DefName = (*DefinedNumericVariable)->getName(); - MatchRegexp = StringRef("[0-9]+"); - } else + } + if (SubstNeeded) SubstStr = MatchStr; + else + MatchRegexp = "[0-9]+"; } + // Handle variable definition: [[:(...)]] and [[#(...):(...)]]. + if (IsDefinition) { + RegExStr += '('; + ++SubstInsertIdx; + + if (IsNumBlock) { + FileCheckNumericVariableMatch NumericVariableDefinition = { + *DefinedNumericVariable, CurParen}; + NumericVariableDefs[DefName] = NumericVariableDefinition; + // This store is done here rather than in match() to allow + // parseNumericVariableUse() to get the pointer to the class instance + // of the right variable definition corresponding to a given numeric + // variable use. + Context->GlobalNumericVariableTable[DefName] = + *DefinedNumericVariable; + } else { + VariableDefs[DefName] = CurParen; + // Mark string variable as defined to detect collisions between + // string and numeric variables in parseNumericVariableUse() and + // defineCmdlineVariables() when the latter is created later than the + // former. We cannot reuse GlobalVariableTable for this by populating + // it with an empty string since we would then lose the ability to + // detect the use of an undefined variable in match(). + Context->DefinedVariableTable[DefName] = true; + } + + ++CurParen; + } + + if (!MatchRegexp.empty() && AddRegExToRegEx(MatchRegexp, CurParen, SM)) + return true; + + if (IsDefinition) + RegExStr += ')'; + // Handle substitutions: [[foo]] and [[#]]. - if (!IsDefinition) { + if (SubstNeeded) { // Handle substitution of string variables that were defined earlier on // the same line by emitting a backreference. Expressions do not // support substituting a numeric variable defined on the same line. @@ -526,37 +569,7 @@ bool FileCheckPattern::parsePattern(StringRef PatternStr, StringRef Prefix, : Context->makeStringSubstitution(SubstStr, SubstInsertIdx); Substitutions.push_back(Substitution); } - continue; - } - - // Handle variable definitions: [[:(...)]] and - // [[#(...):(...)]]. - if (IsNumBlock) { - FileCheckNumericVariableMatch NumericVariableDefinition = { - *DefinedNumericVariable, CurParen}; - NumericVariableDefs[DefName] = NumericVariableDefinition; - // This store is done here rather than in match() to allow - // parseNumericVariableUse() to get the pointer to the class instance - // of the right variable definition corresponding to a given numeric - // variable use. 
- Context->GlobalNumericVariableTable[DefName] = *DefinedNumericVariable; - } else { - VariableDefs[DefName] = CurParen; - // Mark the string variable as defined to detect collisions between - // string and numeric variables in parseNumericVariableUse() and - // DefineCmdlineVariables() when the latter is created later than the - // former. We cannot reuse GlobalVariableTable for this by populating - // it with an empty string since we would then lose the ability to - // detect the use of an undefined variable in match(). - Context->DefinedVariableTable[DefName] = true; } - RegExStr += '('; - ++CurParen; - - if (AddRegExToRegEx(MatchRegexp, CurParen, SM)) - return true; - - RegExStr += ')'; } // Handle fixed string matches. @@ -607,7 +620,8 @@ Expected FileCheckPattern::match(StringRef Buffer, size_t &MatchLen, // If this is a fixed string pattern, just match it now. if (!FixedStr.empty()) { MatchLen = FixedStr.size(); - size_t Pos = Buffer.find(FixedStr); + size_t Pos = IgnoreCase ? Buffer.find_lower(FixedStr) + : Buffer.find(FixedStr); if (Pos == StringRef::npos) return make_error(); return Pos; @@ -631,10 +645,8 @@ Expected FileCheckPattern::match(StringRef Buffer, size_t &MatchLen, for (const auto &Substitution : Substitutions) { // Substitute and check for failure (e.g. use of undefined variable). Expected Value = Substitution->getResult(); - if (!Value) { - Context->LineVariable->clearValue(); + if (!Value) return Value.takeError(); - } // Plop it into the regex at the adjusted offset. TmpStr.insert(TmpStr.begin() + Substitution->getIndex() + InsertOffset, @@ -644,11 +656,13 @@ Expected FileCheckPattern::match(StringRef Buffer, size_t &MatchLen, // Match the newly constructed regex. RegExToMatch = TmpStr; - Context->LineVariable->clearValue(); } SmallVector MatchInfo; - if (!Regex(RegExToMatch, Regex::Newline).match(Buffer, &MatchInfo)) + unsigned int Flags = Regex::Newline; + if (IgnoreCase) + Flags |= Regex::IgnoreCase; + if (!Regex(RegExToMatch, Flags).match(Buffer, &MatchInfo)) return make_error(); // Successful regex match. @@ -824,7 +838,7 @@ template FileCheckNumericVariable * FileCheckPatternContext::makeNumericVariable(Types... 
args) { NumericVariables.push_back( - llvm::make_unique(args...)); + std::make_unique(args...)); return NumericVariables.back().get(); } @@ -832,14 +846,14 @@ FileCheckSubstitution * FileCheckPatternContext::makeStringSubstitution(StringRef VarName, size_t InsertIdx) { Substitutions.push_back( - llvm::make_unique(this, VarName, InsertIdx)); + std::make_unique(this, VarName, InsertIdx)); return Substitutions.back().get(); } FileCheckSubstitution *FileCheckPatternContext::makeNumericSubstitution( StringRef ExpressionStr, std::unique_ptr ExpressionAST, size_t InsertIdx) { - Substitutions.push_back(llvm::make_unique( + Substitutions.push_back(std::make_unique( this, ExpressionStr, std::move(ExpressionAST), InsertIdx)); return Substitutions.back().get(); } @@ -1108,16 +1122,22 @@ void FileCheckPatternContext::createLineVariable() { GlobalNumericVariableTable[LineName] = LineVariable; } -bool FileCheck::ReadCheckFile(SourceMgr &SM, StringRef Buffer, Regex &PrefixRE, - std::vector &CheckStrings) { +FileCheck::FileCheck(FileCheckRequest Req) + : Req(Req), PatternContext(std::make_unique()), + CheckStrings(std::make_unique>()) {} + +FileCheck::~FileCheck() = default; + +bool FileCheck::readCheckFile(SourceMgr &SM, StringRef Buffer, + Regex &PrefixRE) { Error DefineError = - PatternContext.defineCmdlineVariables(Req.GlobalDefines, SM); + PatternContext->defineCmdlineVariables(Req.GlobalDefines, SM); if (DefineError) { logAllUnhandledErrors(std::move(DefineError), errs()); return true; } - PatternContext.createLineVariable(); + PatternContext->createLineVariable(); std::vector ImplicitNegativeChecks; for (const auto &PatternString : Req.ImplicitCheckNot) { @@ -1133,7 +1153,7 @@ bool FileCheck::ReadCheckFile(SourceMgr &SM, StringRef Buffer, Regex &PrefixRE, SM.AddNewSourceBuffer(std::move(CmdLine), SMLoc()); ImplicitNegativeChecks.push_back( - FileCheckPattern(Check::CheckNot, &PatternContext)); + FileCheckPattern(Check::CheckNot, PatternContext.get())); ImplicitNegativeChecks.back().parsePattern(PatternInBuffer, "IMPLICIT-CHECK", SM, Req); } @@ -1196,7 +1216,7 @@ bool FileCheck::ReadCheckFile(SourceMgr &SM, StringRef Buffer, Regex &PrefixRE, SMLoc PatternLoc = SMLoc::getFromPointer(Buffer.data()); // Parse the pattern. - FileCheckPattern P(CheckTy, &PatternContext, LineNumber); + FileCheckPattern P(CheckTy, PatternContext.get(), LineNumber); if (P.parsePattern(Buffer.substr(0, EOL), UsedPrefix, SM, Req)) return true; @@ -1214,7 +1234,7 @@ bool FileCheck::ReadCheckFile(SourceMgr &SM, StringRef Buffer, Regex &PrefixRE, // Verify that CHECK-NEXT/SAME/EMPTY lines have at least one CHECK line before them. if ((CheckTy == Check::CheckNext || CheckTy == Check::CheckSame || CheckTy == Check::CheckEmpty) && - CheckStrings.empty()) { + CheckStrings->empty()) { StringRef Type = CheckTy == Check::CheckNext ? "NEXT" : CheckTy == Check::CheckEmpty ? "EMPTY" : "SAME"; @@ -1232,21 +1252,21 @@ bool FileCheck::ReadCheckFile(SourceMgr &SM, StringRef Buffer, Regex &PrefixRE, } // Okay, add the string we captured to the output vector and move on. - CheckStrings.emplace_back(P, UsedPrefix, PatternLoc); - std::swap(DagNotMatches, CheckStrings.back().DagNotStrings); + CheckStrings->emplace_back(P, UsedPrefix, PatternLoc); + std::swap(DagNotMatches, CheckStrings->back().DagNotStrings); DagNotMatches = ImplicitNegativeChecks; } // Add an EOF pattern for any trailing CHECK-DAG/-NOTs, and use the first // prefix as a filler for the error message. 
if (!DagNotMatches.empty()) { - CheckStrings.emplace_back( - FileCheckPattern(Check::CheckEOF, &PatternContext, LineNumber + 1), + CheckStrings->emplace_back( + FileCheckPattern(Check::CheckEOF, PatternContext.get(), LineNumber + 1), *Req.CheckPrefixes.begin(), SMLoc::getFromPointer(Buffer.data())); - std::swap(DagNotMatches, CheckStrings.back().DagNotStrings); + std::swap(DagNotMatches, CheckStrings->back().DagNotStrings); } - if (CheckStrings.empty()) { + if (CheckStrings->empty()) { errs() << "error: no check strings found with prefix" << (Req.CheckPrefixes.size() > 1 ? "es " : " "); auto I = Req.CheckPrefixes.begin(); @@ -1704,7 +1724,7 @@ FileCheckString::CheckDag(const SourceMgr &SM, StringRef Buffer, // A check prefix must contain only alphanumeric, hyphens and underscores. static bool ValidateCheckPrefix(StringRef CheckPrefix) { - Regex Validator("^[a-zA-Z0-9_-]*$"); + static const Regex Validator("^[a-zA-Z0-9_-]*$"); return Validator.match(CheckPrefix); } @@ -1759,11 +1779,32 @@ Error FileCheckPatternContext::defineCmdlineVariables( unsigned I = 0; Error Errs = Error::success(); std::string CmdlineDefsDiag; - StringRef Prefix1 = "Global define #"; - StringRef Prefix2 = ": "; - for (StringRef CmdlineDef : CmdlineDefines) - CmdlineDefsDiag += - (Prefix1 + Twine(++I) + Prefix2 + CmdlineDef + "\n").str(); + SmallVector, 4> CmdlineDefsIndices; + for (StringRef CmdlineDef : CmdlineDefines) { + std::string DefPrefix = ("Global define #" + Twine(++I) + ": ").str(); + size_t EqIdx = CmdlineDef.find('='); + if (EqIdx == StringRef::npos) { + CmdlineDefsIndices.push_back(std::make_pair(CmdlineDefsDiag.size(), 0)); + continue; + } + // Numeric variable definition. + if (CmdlineDef[0] == '#') { + // Append a copy of the command-line definition adapted to use the same + // format as in the input file to be able to reuse + // parseNumericSubstitutionBlock. + CmdlineDefsDiag += (DefPrefix + CmdlineDef + " (parsed as: [[").str(); + std::string SubstitutionStr = CmdlineDef; + SubstitutionStr[EqIdx] = ':'; + CmdlineDefsIndices.push_back( + std::make_pair(CmdlineDefsDiag.size(), SubstitutionStr.size())); + CmdlineDefsDiag += (SubstitutionStr + Twine("]])\n")).str(); + } else { + CmdlineDefsDiag += DefPrefix; + CmdlineDefsIndices.push_back( + std::make_pair(CmdlineDefsDiag.size(), CmdlineDef.size())); + CmdlineDefsDiag += (CmdlineDef + "\n").str(); + } + } // Create a buffer with fake command line content in order to display // parsing diagnostic with location information and point to the @@ -1773,14 +1814,10 @@ Error FileCheckPatternContext::defineCmdlineVariables( StringRef CmdlineDefsDiagRef = CmdLineDefsDiagBuffer->getBuffer(); SM.AddNewSourceBuffer(std::move(CmdLineDefsDiagBuffer), SMLoc()); - SmallVector CmdlineDefsDiagVec; - CmdlineDefsDiagRef.split(CmdlineDefsDiagVec, '\n', -1 /*MaxSplit*/, - false /*KeepEmpty*/); - for (StringRef CmdlineDefDiag : CmdlineDefsDiagVec) { - unsigned DefStart = CmdlineDefDiag.find(Prefix2) + Prefix2.size(); - StringRef CmdlineDef = CmdlineDefDiag.substr(DefStart); - size_t EqIdx = CmdlineDef.find('='); - if (EqIdx == StringRef::npos) { + for (std::pair CmdlineDefIndices : CmdlineDefsIndices) { + StringRef CmdlineDef = CmdlineDefsDiagRef.substr(CmdlineDefIndices.first, + CmdlineDefIndices.second); + if (CmdlineDef.empty()) { Errs = joinErrors( std::move(Errs), FileCheckErrorDiagnostic::get( @@ -1790,31 +1827,35 @@ Error FileCheckPatternContext::defineCmdlineVariables( // Numeric variable definition. 
if (CmdlineDef[0] == '#') { - StringRef CmdlineName = CmdlineDef.substr(1, EqIdx - 1); - Expected ParseResult = - FileCheckPattern::parseNumericVariableDefinition(CmdlineName, this, - None, SM); - if (!ParseResult) { - Errs = joinErrors(std::move(Errs), ParseResult.takeError()); + // Now parse the definition both to check that the syntax is correct and + // to create the necessary class instance. + StringRef CmdlineDefExpr = CmdlineDef.substr(1); + Optional DefinedNumericVariable; + Expected> ExpressionASTResult = + FileCheckPattern::parseNumericSubstitutionBlock( + CmdlineDefExpr, DefinedNumericVariable, false, None, this, SM); + if (!ExpressionASTResult) { + Errs = joinErrors(std::move(Errs), ExpressionASTResult.takeError()); continue; } - - StringRef CmdlineVal = CmdlineDef.substr(EqIdx + 1); - uint64_t Val; - if (CmdlineVal.getAsInteger(10, Val)) { - Errs = joinErrors(std::move(Errs), - FileCheckErrorDiagnostic::get( - SM, CmdlineVal, - "invalid value in numeric variable definition '" + - CmdlineVal + "'")); + std::unique_ptr ExpressionAST = + std::move(*ExpressionASTResult); + // Now evaluate the expression whose value this variable should be set + // to, since the expression of a command-line variable definition should + // only use variables defined earlier on the command-line. If not, this + // is an error and we report it. + Expected Value = ExpressionAST->eval(); + if (!Value) { + Errs = joinErrors(std::move(Errs), Value.takeError()); continue; } - FileCheckNumericVariable *DefinedNumericVariable = *ParseResult; - DefinedNumericVariable->setValue(Val); + + assert(DefinedNumericVariable && "No variable defined"); + (*DefinedNumericVariable)->setValue(*Value); // Record this variable definition. - GlobalNumericVariableTable[DefinedNumericVariable->getName()] = - DefinedNumericVariable; + GlobalNumericVariableTable[(*DefinedNumericVariable)->getName()] = + *DefinedNumericVariable; } else { // String variable definition. std::pair CmdlineNameVal = CmdlineDef.split('='); @@ -1851,7 +1892,7 @@ Error FileCheckPatternContext::defineCmdlineVariables( } GlobalVariableTable.insert(CmdlineNameVal); // Mark the string variable as defined to detect collisions between - // string and numeric variables in DefineCmdlineVariables when the latter + // string and numeric variables in defineCmdlineVariables when the latter // is created later than the former. We cannot reuse GlobalVariableTable // for this by populating it with an empty string since we would then // lose the ability to detect the use of an undefined variable in @@ -1887,18 +1928,17 @@ void FileCheckPatternContext::clearLocalVars() { GlobalNumericVariableTable.erase(Var); } -bool FileCheck::CheckInput(SourceMgr &SM, StringRef Buffer, - ArrayRef CheckStrings, +bool FileCheck::checkInput(SourceMgr &SM, StringRef Buffer, std::vector *Diags) { bool ChecksFailed = false; - unsigned i = 0, j = 0, e = CheckStrings.size(); + unsigned i = 0, j = 0, e = CheckStrings->size(); while (true) { StringRef CheckRegion; if (j == e) { CheckRegion = Buffer; } else { - const FileCheckString &CheckLabelStr = CheckStrings[j]; + const FileCheckString &CheckLabelStr = (*CheckStrings)[j]; if (CheckLabelStr.Pat.getCheckTy() != Check::CheckLabel) { ++j; continue; @@ -1921,10 +1961,10 @@ bool FileCheck::CheckInput(SourceMgr &SM, StringRef Buffer, // CHECK-LABEL and it would clear variables defined on the command-line // before they get used. 
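The FileCheck driver surface reworked above now owns its pattern context and check strings, so a client constructs a FileCheck from a FileCheckRequest and calls readCheckFile followed by checkInput. A sketch of that flow under stated assumptions: the request is left default-constructed for brevity, how the prefix regex is built is outside this sketch, and the optional diagnostics parameter of checkInput is passed explicitly as nullptr.

#include "llvm/Support/FileCheck.h"
#include "llvm/Support/Regex.h"
#include "llvm/Support/SourceMgr.h"

// Returns true on failure, mirroring the error conventions visible above
// (readCheckFile returns true on a parse error; checkInput returns true when
// all checks pass).
bool runFileCheck(llvm::SourceMgr &SM, llvm::StringRef CheckFileText,
                  llvm::StringRef InputText, llvm::Regex &PrefixRE) {
  llvm::FileCheckRequest Req;   // normally populated from command-line options
  llvm::FileCheck FC(Req);
  if (FC.readCheckFile(SM, CheckFileText, PrefixRE))
    return true;                // malformed check file, already diagnosed
  return !FC.checkInput(SM, InputText, /*Diags=*/nullptr);
}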
if (i != 0 && Req.EnableVarScope) - PatternContext.clearLocalVars(); + PatternContext->clearLocalVars(); for (; i != j; ++i) { - const FileCheckString &CheckStr = CheckStrings[i]; + const FileCheckString &CheckStr = (*CheckStrings)[i]; // Check each string within the scanned region, including a second check // of any final CHECK-LABEL (to verify CHECK-NOT and CHECK-DAG) diff --git a/lib/Support/FileCheckImpl.h b/lib/Support/FileCheckImpl.h new file mode 100644 index 000000000000..06ce8301cec4 --- /dev/null +++ b/lib/Support/FileCheckImpl.h @@ -0,0 +1,624 @@ +//===-- FileCheckImpl.h - Private FileCheck Interface ------------*- C++ -*-==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines the private interfaces of FileCheck. Its purpose is to +// allow unit testing of FileCheck and to separate the interface from the +// implementation. It is only meant to be used by FileCheck. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_SUPPORT_FILECHECKIMPL_H +#define LLVM_LIB_SUPPORT_FILECHECKIMPL_H + +#include "llvm/ADT/Optional.h" +#include "llvm/ADT/StringMap.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/Error.h" +#include "llvm/Support/SourceMgr.h" +#include +#include +#include + +namespace llvm { + +//===----------------------------------------------------------------------===// +// Numeric substitution handling code. +//===----------------------------------------------------------------------===// + +/// Base class representing the AST of a given expression. +class FileCheckExpressionAST { +public: + virtual ~FileCheckExpressionAST() = default; + + /// Evaluates and \returns the value of the expression represented by this + /// AST or an error if evaluation fails. + virtual Expected eval() const = 0; +}; + +/// Class representing an unsigned literal in the AST of an expression. +class FileCheckExpressionLiteral : public FileCheckExpressionAST { +private: + /// Actual value of the literal. + uint64_t Value; + +public: + /// Constructs a literal with the specified value. + FileCheckExpressionLiteral(uint64_t Val) : Value(Val) {} + + /// \returns the literal's value. + Expected eval() const { return Value; } +}; + +/// Class to represent an undefined variable error, which quotes that +/// variable's name when printed. +class FileCheckUndefVarError : public ErrorInfo { +private: + StringRef VarName; + +public: + static char ID; + + FileCheckUndefVarError(StringRef VarName) : VarName(VarName) {} + + StringRef getVarName() const { return VarName; } + + std::error_code convertToErrorCode() const override { + return inconvertibleErrorCode(); + } + + /// Print name of variable associated with this error. + void log(raw_ostream &OS) const override { + OS << "\""; + OS.write_escaped(VarName) << "\""; + } +}; + +/// Class representing a numeric variable and its associated current value. +class FileCheckNumericVariable { +private: + /// Name of the numeric variable. + StringRef Name; + + /// Value of numeric variable, if defined, or None otherwise. + Optional Value; + + /// Line number where this variable is defined, or None if defined before + /// input is parsed. Used to determine whether a variable is defined on the + /// same line as a given use. 
+ Optional DefLineNumber; + +public: + /// Constructor for a variable \p Name defined at line \p DefLineNumber or + /// defined before input is parsed if \p DefLineNumber is None. + explicit FileCheckNumericVariable(StringRef Name, + Optional DefLineNumber = None) + : Name(Name), DefLineNumber(DefLineNumber) {} + + /// \returns name of this numeric variable. + StringRef getName() const { return Name; } + + /// \returns this variable's value. + Optional getValue() const { return Value; } + + /// Sets value of this numeric variable to \p NewValue. + void setValue(uint64_t NewValue) { Value = NewValue; } + + /// Clears value of this numeric variable, regardless of whether it is + /// currently defined or not. + void clearValue() { Value = None; } + + /// \returns the line number where this variable is defined, if any, or None + /// if defined before input is parsed. + Optional getDefLineNumber() { return DefLineNumber; } +}; + +/// Class representing the use of a numeric variable in the AST of an +/// expression. +class FileCheckNumericVariableUse : public FileCheckExpressionAST { +private: + /// Name of the numeric variable. + StringRef Name; + + /// Pointer to the class instance for the variable this use is about. + FileCheckNumericVariable *NumericVariable; + +public: + FileCheckNumericVariableUse(StringRef Name, + FileCheckNumericVariable *NumericVariable) + : Name(Name), NumericVariable(NumericVariable) {} + + /// \returns the value of the variable referenced by this instance. + Expected eval() const; +}; + +/// Type of functions evaluating a given binary operation. +using binop_eval_t = uint64_t (*)(uint64_t, uint64_t); + +/// Class representing a single binary operation in the AST of an expression. +class FileCheckASTBinop : public FileCheckExpressionAST { +private: + /// Left operand. + std::unique_ptr LeftOperand; + + /// Right operand. + std::unique_ptr RightOperand; + + /// Pointer to function that can evaluate this binary operation. + binop_eval_t EvalBinop; + +public: + FileCheckASTBinop(binop_eval_t EvalBinop, + std::unique_ptr LeftOp, + std::unique_ptr RightOp) + : EvalBinop(EvalBinop) { + LeftOperand = std::move(LeftOp); + RightOperand = std::move(RightOp); + } + + /// Evaluates the value of the binary operation represented by this AST, + /// using EvalBinop on the result of recursively evaluating the operands. + /// \returns the expression value or an error if an undefined numeric + /// variable is used in one of the operands. + Expected eval() const; +}; + +class FileCheckPatternContext; + +/// Class representing a substitution to perform in the RegExStr string. +class FileCheckSubstitution { +protected: + /// Pointer to a class instance holding, among other things, the table with + /// the values of live string variables at the start of any given CHECK line. + /// Used for substituting string variables with the text they were defined + /// as. Expressions are linked to the numeric variables they use at + /// parse time and directly access the value of the numeric variable to + /// evaluate their value. + FileCheckPatternContext *Context; + + /// The string that needs to be substituted for something else. For a + /// string variable this is its name, otherwise this is the whole expression. + StringRef FromStr; + + // Index in RegExStr of where to do the substitution. 
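The expression AST classes declared above compose in the obvious way: leaves are literals or numeric-variable uses, and FileCheckASTBinop applies a function pointer to its recursively evaluated operands. A small sketch building "N + 3" and evaluating it; the add() helper mirrors the static evaluator in FileCheck.cpp, and the lost template arguments are assumed to be FileCheckExpressionAST and uint64_t as in the upstream header:

#include "FileCheckImpl.h"
#include "llvm/Support/Error.h"
#include <memory>

static uint64_t add(uint64_t L, uint64_t R) { return L + R; }

llvm::Expected<uint64_t> evalNPlus3() {
  llvm::FileCheckNumericVariable N("N"); // defined before input is parsed
  N.setValue(39);

  auto Use = std::make_unique<llvm::FileCheckNumericVariableUse>("N", &N);
  auto Three = std::make_unique<llvm::FileCheckExpressionLiteral>(3);
  llvm::FileCheckASTBinop Sum(add, std::move(Use), std::move(Three));
  return Sum.eval(); // 42, or an undefined-variable error had N been unset
}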
+ size_t InsertIdx; + +public: + FileCheckSubstitution(FileCheckPatternContext *Context, StringRef VarName, + size_t InsertIdx) + : Context(Context), FromStr(VarName), InsertIdx(InsertIdx) {} + + virtual ~FileCheckSubstitution() = default; + + /// \returns the string to be substituted for something else. + StringRef getFromString() const { return FromStr; } + + /// \returns the index where the substitution is to be performed in RegExStr. + size_t getIndex() const { return InsertIdx; } + + /// \returns a string containing the result of the substitution represented + /// by this class instance or an error if substitution failed. + virtual Expected getResult() const = 0; +}; + +class FileCheckStringSubstitution : public FileCheckSubstitution { +public: + FileCheckStringSubstitution(FileCheckPatternContext *Context, + StringRef VarName, size_t InsertIdx) + : FileCheckSubstitution(Context, VarName, InsertIdx) {} + + /// \returns the text that the string variable in this substitution matched + /// when defined, or an error if the variable is undefined. + Expected getResult() const override; +}; + +class FileCheckNumericSubstitution : public FileCheckSubstitution { +private: + /// Pointer to the class representing the expression whose value is to be + /// substituted. + std::unique_ptr ExpressionAST; + +public: + FileCheckNumericSubstitution(FileCheckPatternContext *Context, StringRef Expr, + std::unique_ptr ExprAST, + size_t InsertIdx) + : FileCheckSubstitution(Context, Expr, InsertIdx) { + ExpressionAST = std::move(ExprAST); + } + + /// \returns a string containing the result of evaluating the expression in + /// this substitution, or an error if evaluation failed. + Expected getResult() const override; +}; + +//===----------------------------------------------------------------------===// +// Pattern handling code. +//===----------------------------------------------------------------------===// + +struct FileCheckDiag; + +/// Class holding the FileCheckPattern global state, shared by all patterns: +/// tables holding values of variables and whether they are defined or not at +/// any given time in the matching process. +class FileCheckPatternContext { + friend class FileCheckPattern; + +private: + /// When matching a given pattern, this holds the value of all the string + /// variables defined in previous patterns. In a pattern, only the last + /// definition for a given variable is recorded in this table. + /// Back-references are used for uses after any the other definition. + StringMap GlobalVariableTable; + + /// Map of all string variables defined so far. Used at parse time to detect + /// a name conflict between a numeric variable and a string variable when + /// the former is defined on a later line than the latter. + StringMap DefinedVariableTable; + + /// When matching a given pattern, this holds the pointers to the classes + /// representing the numeric variables defined in previous patterns. When + /// matching a pattern all definitions for that pattern are recorded in the + /// NumericVariableDefs table in the FileCheckPattern instance of that + /// pattern. + StringMap GlobalNumericVariableTable; + + /// Pointer to the class instance representing the @LINE pseudo variable for + /// easily updating its value. + FileCheckNumericVariable *LineVariable = nullptr; + + /// Vector holding pointers to all parsed numeric variables. Used to + /// automatically free them once they are guaranteed to no longer be used. 
+ std::vector> NumericVariables; + + /// Vector holding pointers to all substitutions. Used to automatically free + /// them once they are guaranteed to no longer be used. + std::vector> Substitutions; + +public: + /// \returns the value of string variable \p VarName or an error if no such + /// variable has been defined. + Expected getPatternVarValue(StringRef VarName); + + /// Defines string and numeric variables from definitions given on the + /// command line, passed as a vector of [#]VAR=VAL strings in + /// \p CmdlineDefines. \returns an error list containing diagnostics against + /// \p SM for all definition parsing failures, if any, or Success otherwise. + Error defineCmdlineVariables(std::vector &CmdlineDefines, + SourceMgr &SM); + + /// Create @LINE pseudo variable. Value is set when pattern are being + /// matched. + void createLineVariable(); + + /// Undefines local variables (variables whose name does not start with a '$' + /// sign), i.e. removes them from GlobalVariableTable and from + /// GlobalNumericVariableTable and also clears the value of numeric + /// variables. + void clearLocalVars(); + +private: + /// Makes a new numeric variable and registers it for destruction when the + /// context is destroyed. + template + FileCheckNumericVariable *makeNumericVariable(Types... args); + + /// Makes a new string substitution and registers it for destruction when the + /// context is destroyed. + FileCheckSubstitution *makeStringSubstitution(StringRef VarName, + size_t InsertIdx); + + /// Makes a new numeric substitution and registers it for destruction when + /// the context is destroyed. + FileCheckSubstitution * + makeNumericSubstitution(StringRef ExpressionStr, + std::unique_ptr ExpressionAST, + size_t InsertIdx); +}; + +/// Class to represent an error holding a diagnostic with location information +/// used when printing it. +class FileCheckErrorDiagnostic : public ErrorInfo { +private: + SMDiagnostic Diagnostic; + +public: + static char ID; + + FileCheckErrorDiagnostic(SMDiagnostic &&Diag) : Diagnostic(Diag) {} + + std::error_code convertToErrorCode() const override { + return inconvertibleErrorCode(); + } + + /// Print diagnostic associated with this error when printing the error. + void log(raw_ostream &OS) const override { Diagnostic.print(nullptr, OS); } + + static Error get(const SourceMgr &SM, SMLoc Loc, const Twine &ErrMsg) { + return make_error( + SM.GetMessage(Loc, SourceMgr::DK_Error, ErrMsg)); + } + + static Error get(const SourceMgr &SM, StringRef Buffer, const Twine &ErrMsg) { + return get(SM, SMLoc::getFromPointer(Buffer.data()), ErrMsg); + } +}; + +class FileCheckNotFoundError : public ErrorInfo { +public: + static char ID; + + std::error_code convertToErrorCode() const override { + return inconvertibleErrorCode(); + } + + /// Print diagnostic associated with this error when printing the error. + void log(raw_ostream &OS) const override { + OS << "String not found in input"; + } +}; + +class FileCheckPattern { + SMLoc PatternLoc; + + /// A fixed string to match as the pattern or empty if this pattern requires + /// a regex match. + StringRef FixedStr; + + /// A regex string to match as the pattern or empty if this pattern requires + /// a fixed string to match. + std::string RegExStr; + + /// Entries in this vector represent a substitution of a string variable or + /// an expression in the RegExStr regex at match time. 
For example, in the + /// case of a CHECK directive with the pattern "foo[[bar]]baz[[#N+1]]", + /// RegExStr will contain "foobaz" and we'll get two entries in this vector + /// that tells us to insert the value of string variable "bar" at offset 3 + /// and the value of expression "N+1" at offset 6. + std::vector Substitutions; + + /// Maps names of string variables defined in a pattern to the number of + /// their parenthesis group in RegExStr capturing their last definition. + /// + /// E.g. for the pattern "foo[[bar:.*]]baz([[bar]][[QUUX]][[bar:.*]])", + /// RegExStr will be "foo(.*)baz(\1(.*))" where is + /// the value captured for QUUX on the earlier line where it was defined, and + /// VariableDefs will map "bar" to the third parenthesis group which captures + /// the second definition of "bar". + /// + /// Note: uses std::map rather than StringMap to be able to get the key when + /// iterating over values. + std::map VariableDefs; + + /// Structure representing the definition of a numeric variable in a pattern. + /// It holds the pointer to the class representing the numeric variable whose + /// value is being defined and the number of the parenthesis group in + /// RegExStr to capture that value. + struct FileCheckNumericVariableMatch { + /// Pointer to class representing the numeric variable whose value is being + /// defined. + FileCheckNumericVariable *DefinedNumericVariable; + + /// Number of the parenthesis group in RegExStr that captures the value of + /// this numeric variable definition. + unsigned CaptureParenGroup; + }; + + /// Holds the number of the parenthesis group in RegExStr and pointer to the + /// corresponding FileCheckNumericVariable class instance of all numeric + /// variable definitions. Used to set the matched value of all those + /// variables. + StringMap NumericVariableDefs; + + /// Pointer to a class instance holding the global state shared by all + /// patterns: + /// - separate tables with the values of live string and numeric variables + /// respectively at the start of any given CHECK line; + /// - table holding whether a string variable has been defined at any given + /// point during the parsing phase. + FileCheckPatternContext *Context; + + Check::FileCheckType CheckTy; + + /// Line number for this CHECK pattern or None if it is an implicit pattern. + /// Used to determine whether a variable definition is made on an earlier + /// line to the one with this CHECK. + Optional LineNumber; + + /// Ignore case while matching if set to true. + bool IgnoreCase = false; + +public: + FileCheckPattern(Check::FileCheckType Ty, FileCheckPatternContext *Context, + Optional Line = None) + : Context(Context), CheckTy(Ty), LineNumber(Line) {} + + /// \returns the location in source code. + SMLoc getLoc() const { return PatternLoc; } + + /// \returns the pointer to the global state for all patterns in this + /// FileCheck instance. + FileCheckPatternContext *getContext() const { return Context; } + + /// \returns whether \p C is a valid first character for a variable name. + static bool isValidVarNameStart(char C); + + /// Parsing information about a variable. + struct VariableProperties { + StringRef Name; + bool IsPseudo; + }; + + /// Parses the string at the start of \p Str for a variable name. \returns + /// a VariableProperties structure holding the variable name and whether it + /// is the name of a pseudo variable, or an error holding a diagnostic + /// against \p SM if parsing fail. 
If parsing was successful, also strips + /// \p Str from the variable name. + static Expected parseVariable(StringRef &Str, + const SourceMgr &SM); + /// Parses \p Expr for a numeric substitution block at line \p LineNumber, + /// or before input is parsed if \p LineNumber is None. Parameter + /// \p IsLegacyLineExpr indicates whether \p Expr should be a legacy @LINE + /// expression and \p Context points to the class instance holding the live + /// string and numeric variables. \returns a pointer to the class instance + /// representing the AST of the expression whose value must be substitued, or + /// an error holding a diagnostic against \p SM if parsing fails. If + /// substitution was successful, sets \p DefinedNumericVariable to point to + /// the class representing the numeric variable defined in this numeric + /// substitution block, or None if this block does not define any variable. + static Expected> + parseNumericSubstitutionBlock( + StringRef Expr, + Optional &DefinedNumericVariable, + bool IsLegacyLineExpr, Optional LineNumber, + FileCheckPatternContext *Context, const SourceMgr &SM); + /// Parses the pattern in \p PatternStr and initializes this FileCheckPattern + /// instance accordingly. + /// + /// \p Prefix provides which prefix is being matched, \p Req describes the + /// global options that influence the parsing such as whitespace + /// canonicalization, \p SM provides the SourceMgr used for error reports. + /// \returns true in case of an error, false otherwise. + bool parsePattern(StringRef PatternStr, StringRef Prefix, SourceMgr &SM, + const FileCheckRequest &Req); + /// Matches the pattern string against the input buffer \p Buffer + /// + /// \returns the position that is matched or an error indicating why matching + /// failed. If there is a match, updates \p MatchLen with the size of the + /// matched string. + /// + /// The GlobalVariableTable StringMap in the FileCheckPatternContext class + /// instance provides the current values of FileCheck string variables and + /// is updated if this match defines new values. Likewise, the + /// GlobalNumericVariableTable StringMap in the same class provides the + /// current values of FileCheck numeric variables and is updated if this + /// match defines new numeric values. + Expected match(StringRef Buffer, size_t &MatchLen, + const SourceMgr &SM) const; + /// Prints the value of successful substitutions or the name of the undefined + /// string or numeric variables preventing a successful substitution. + void printSubstitutions(const SourceMgr &SM, StringRef Buffer, + SMRange MatchRange = None) const; + void printFuzzyMatch(const SourceMgr &SM, StringRef Buffer, + std::vector *Diags) const; + + bool hasVariable() const { + return !(Substitutions.empty() && VariableDefs.empty()); + } + + Check::FileCheckType getCheckTy() const { return CheckTy; } + + int getCount() const { return CheckTy.getCount(); } + +private: + bool AddRegExToRegEx(StringRef RS, unsigned &CurParen, SourceMgr &SM); + void AddBackrefToRegEx(unsigned BackrefNum); + /// Computes an arbitrary estimate for the quality of matching this pattern + /// at the start of \p Buffer; a distance of zero should correspond to a + /// perfect match. + unsigned computeMatchDistance(StringRef Buffer) const; + /// Finds the closing sequence of a regex variable usage or definition. + /// + /// \p Str has to point in the beginning of the definition (right after the + /// opening sequence). \p SM holds the SourceMgr used for error repporting. 
+ /// \returns the offset of the closing sequence within Str, or npos if it + /// was not found. + size_t FindRegexVarEnd(StringRef Str, SourceMgr &SM); + + /// Parses \p Expr for the name of a numeric variable to be defined at line + /// \p LineNumber, or before input is parsed if \p LineNumber is None. + /// \returns a pointer to the class instance representing that variable, + /// creating it if needed, or an error holding a diagnostic against \p SM + /// should defining such a variable be invalid. + static Expected parseNumericVariableDefinition( + StringRef &Expr, FileCheckPatternContext *Context, + Optional LineNumber, const SourceMgr &SM); + /// Parses \p Name as a (pseudo if \p IsPseudo is true) numeric variable use + /// at line \p LineNumber, or before input is parsed if \p LineNumber is + /// None. Parameter \p Context points to the class instance holding the live + /// string and numeric variables. \returns the pointer to the class instance + /// representing that variable if successful, or an error holding a + /// diagnostic against \p SM otherwise. + static Expected> + parseNumericVariableUse(StringRef Name, bool IsPseudo, + Optional LineNumber, + FileCheckPatternContext *Context, + const SourceMgr &SM); + enum class AllowedOperand { LineVar, Literal, Any }; + /// Parses \p Expr for use of a numeric operand at line \p LineNumber, or + /// before input is parsed if \p LineNumber is None. Accepts both literal + /// values and numeric variables, depending on the value of \p AO. Parameter + /// \p Context points to the class instance holding the live string and + /// numeric variables. \returns the class representing that operand in the + /// AST of the expression or an error holding a diagnostic against \p SM + /// otherwise. + static Expected> + parseNumericOperand(StringRef &Expr, AllowedOperand AO, + Optional LineNumber, + FileCheckPatternContext *Context, const SourceMgr &SM); + /// Parses \p Expr for a binary operation at line \p LineNumber, or before + /// input is parsed if \p LineNumber is None. The left operand of this binary + /// operation is given in \p LeftOp and \p IsLegacyLineExpr indicates whether + /// we are parsing a legacy @LINE expression. Parameter \p Context points to + /// the class instance holding the live string and numeric variables. + /// \returns the class representing the binary operation in the AST of the + /// expression, or an error holding a diagnostic against \p SM otherwise. + static Expected> + parseBinop(StringRef &Expr, std::unique_ptr LeftOp, + bool IsLegacyLineExpr, Optional LineNumber, + FileCheckPatternContext *Context, const SourceMgr &SM); +}; + +//===----------------------------------------------------------------------===// +// Check Strings. +//===----------------------------------------------------------------------===// + +/// A check that we found in the input file. +struct FileCheckString { + /// The pattern to match. + FileCheckPattern Pat; + + /// Which prefix name this check matched. + StringRef Prefix; + + /// The location in the match file that the check string was specified. + SMLoc Loc; + + /// All of the strings that are disallowed from occurring between this match + /// string and the previous one (or start of file). + std::vector DagNotStrings; + + FileCheckString(const FileCheckPattern &P, StringRef S, SMLoc L) + : Pat(P), Prefix(S), Loc(L) {} + + /// Matches check string and its "not strings" and/or "dag strings". 
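A minimal driver sketch for the parse and match entry points declared above, assuming the pattern text lives in a buffer already registered with the SourceMgr so diagnostics have somewhere to point; the helper name and the choice of CHECK prefix are illustrative only, not part of this patch:

  #include "llvm/Support/Error.h"
  #include "llvm/Support/FileCheck.h"
  #include "llvm/Support/SourceMgr.h"
  using namespace llvm;

  // Parse a single CHECK pattern and try to match it against InputText.
  static bool matchOnce(SourceMgr &SM, StringRef PatternText, StringRef InputText) {
    FileCheckRequest Req;                  // default global options
    FileCheckPatternContext Ctx;
    Ctx.createLineVariable();              // makes @LINE usable in expressions
    FileCheckPattern Pat(Check::CheckPlain, &Ctx, /*Line=*/1);
    if (Pat.parsePattern(PatternText, "CHECK", SM, Req))
      return false;                        // parse error, already reported via SM
    size_t MatchLen = 0;
    Expected<size_t> Pos = Pat.match(InputText, MatchLen, SM);
    if (!Pos) {
      consumeError(Pos.takeError());       // no match or undefined variable
      return false;
    }
    return true;                           // matched at offset *Pos, length MatchLen
  }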
+ size_t Check(const SourceMgr &SM, StringRef Buffer, bool IsLabelScanMode, + size_t &MatchLen, FileCheckRequest &Req, + std::vector *Diags) const; + + /// Verifies that there is a single line in the given \p Buffer. Errors are + /// reported against \p SM. + bool CheckNext(const SourceMgr &SM, StringRef Buffer) const; + /// Verifies that there is no newline in the given \p Buffer. Errors are + /// reported against \p SM. + bool CheckSame(const SourceMgr &SM, StringRef Buffer) const; + /// Verifies that none of the strings in \p NotStrings are found in the given + /// \p Buffer. Errors are reported against \p SM and diagnostics recorded in + /// \p Diags according to the verbosity level set in \p Req. + bool CheckNot(const SourceMgr &SM, StringRef Buffer, + const std::vector &NotStrings, + const FileCheckRequest &Req, + std::vector *Diags) const; + /// Matches "dag strings" and their mixed "not strings". + size_t CheckDag(const SourceMgr &SM, StringRef Buffer, + std::vector &NotStrings, + const FileCheckRequest &Req, + std::vector *Diags) const; +}; + +} // namespace llvm + +#endif diff --git a/lib/Support/FileCollector.cpp b/lib/Support/FileCollector.cpp new file mode 100644 index 000000000000..47fca6413722 --- /dev/null +++ b/lib/Support/FileCollector.cpp @@ -0,0 +1,268 @@ +//===-- FileCollector.cpp ---------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/Support/FileCollector.h" +#include "llvm/ADT/SmallString.h" +#include "llvm/Support/FileSystem.h" +#include "llvm/Support/Path.h" +#include "llvm/Support/Process.h" + +using namespace llvm; + +static bool isCaseSensitivePath(StringRef Path) { + SmallString<256> TmpDest = Path, UpperDest, RealDest; + + // Remove component traversals, links, etc. + if (!sys::fs::real_path(Path, TmpDest)) + return true; // Current default value in vfs.yaml + Path = TmpDest; + + // Change path to all upper case and ask for its real path, if the latter + // exists and is equal to path, it's not case sensitive. Default to case + // sensitive in the absence of real_path, since this is the YAMLVFSWriter + // default. + UpperDest = Path.upper(); + if (sys::fs::real_path(UpperDest, RealDest) && Path.equals(RealDest)) + return false; + return true; +} + +FileCollector::FileCollector(std::string Root, std::string OverlayRoot) + : Root(std::move(Root)), OverlayRoot(std::move(OverlayRoot)) { + sys::fs::create_directories(this->Root, true); +} + +bool FileCollector::getRealPath(StringRef SrcPath, + SmallVectorImpl &Result) { + SmallString<256> RealPath; + StringRef FileName = sys::path::filename(SrcPath); + std::string Directory = sys::path::parent_path(SrcPath).str(); + auto DirWithSymlink = SymlinkMap.find(Directory); + + // Use real_path to fix any symbolic link component present in a path. + // Computing the real path is expensive, cache the search through the parent + // path Directory. 
+ if (DirWithSymlink == SymlinkMap.end()) { + auto EC = sys::fs::real_path(Directory, RealPath); + if (EC) + return false; + SymlinkMap[Directory] = RealPath.str(); + } else { + RealPath = DirWithSymlink->second; + } + + sys::path::append(RealPath, FileName); + Result.swap(RealPath); + return true; +} + +void FileCollector::addFile(const Twine &file) { + std::lock_guard lock(Mutex); + std::string FileStr = file.str(); + if (markAsSeen(FileStr)) + addFileImpl(FileStr); +} + +void FileCollector::addFileImpl(StringRef SrcPath) { + // We need an absolute src path to append to the root. + SmallString<256> AbsoluteSrc = SrcPath; + sys::fs::make_absolute(AbsoluteSrc); + + // Canonicalize src to a native path to avoid mixed separator styles. + sys::path::native(AbsoluteSrc); + + // Remove redundant leading "./" pieces and consecutive separators. + AbsoluteSrc = sys::path::remove_leading_dotslash(AbsoluteSrc); + + // Canonicalize the source path by removing "..", "." components. + SmallString<256> VirtualPath = AbsoluteSrc; + sys::path::remove_dots(VirtualPath, /*remove_dot_dot=*/true); + + // If a ".." component is present after a symlink component, remove_dots may + // lead to the wrong real destination path. Let the source be canonicalized + // like that but make sure we always use the real path for the destination. + SmallString<256> CopyFrom; + if (!getRealPath(AbsoluteSrc, CopyFrom)) + CopyFrom = VirtualPath; + + SmallString<256> DstPath = StringRef(Root); + sys::path::append(DstPath, sys::path::relative_path(CopyFrom)); + + // Always map a canonical src path to its real path into the YAML, by doing + // this we map different virtual src paths to the same entry in the VFS + // overlay, which is a way to emulate symlink inside the VFS; this is also + // needed for correctness, not doing that can lead to module redefinition + // errors. + addFileToMapping(VirtualPath, DstPath); +} + +/// Set the access and modification time for the given file from the given +/// status object. +static std::error_code +copyAccessAndModificationTime(StringRef Filename, + const sys::fs::file_status &Stat) { + int FD; + + if (auto EC = + sys::fs::openFileForWrite(Filename, FD, sys::fs::CD_OpenExisting)) + return EC; + + if (auto EC = sys::fs::setLastAccessAndModificationTime( + FD, Stat.getLastAccessedTime(), Stat.getLastModificationTime())) + return EC; + + if (auto EC = sys::Process::SafelyCloseFileDescriptor(FD)) + return EC; + + return {}; +} + +std::error_code FileCollector::copyFiles(bool StopOnError) { + for (auto &entry : VFSWriter.getMappings()) { + // Create directory tree. + if (std::error_code EC = + sys::fs::create_directories(sys::path::parent_path(entry.RPath), + /*IgnoreExisting=*/true)) { + if (StopOnError) + return EC; + } + + // Get the status of the original file/directory. + sys::fs::file_status Stat; + if (std::error_code EC = sys::fs::status(entry.VPath, Stat)) { + if (StopOnError) + return EC; + continue; + } + + if (Stat.type() == sys::fs::file_type::directory_file) { + // Construct a directory when it's just a directory entry. + if (std::error_code EC = + sys::fs::create_directories(entry.RPath, + /*IgnoreExisting=*/true)) { + if (StopOnError) + return EC; + } + continue; + } + + // Copy file over. + if (std::error_code EC = sys::fs::copy_file(entry.VPath, entry.RPath)) { + if (StopOnError) + return EC; + } + + // Copy over permissions. 
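Once permissions and timestamps have been copied, a client has everything it needs; end to end, the collector is typically used as sketched here, together with writeMapping() and createCollectorVFS() defined further down in this file. The paths and the tolerant StopOnError choice are invented for illustration:

  #include "llvm/Support/FileCollector.h"
  #include "llvm/Support/VirtualFileSystem.h"
  using namespace llvm;

  void collectReproducer() {
    auto Collector = std::make_shared<FileCollector>(
        /*Root=*/"/tmp/repro/root", /*OverlayRoot=*/"/tmp/repro");

    // Wrap the real file system so every file the tool touches is recorded.
    IntrusiveRefCntPtr<vfs::FileSystem> FS =
        FileCollector::createCollectorVFS(vfs::getRealFileSystem(), Collector);

    // ... run the tool against FS; any status/open/dir access records the path ...
    (void)FS->status("/etc/hosts");

    // Snapshot the collected files and emit the VFS overlay describing them.
    (void)Collector->copyFiles(/*StopOnError=*/false);
    (void)Collector->writeMapping("/tmp/repro/vfs.yaml");
  }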
+ if (auto perms = sys::fs::getPermissions(entry.VPath)) { + if (std::error_code EC = sys::fs::setPermissions(entry.RPath, *perms)) { + if (StopOnError) + return EC; + } + } + + // Copy over modification time. + copyAccessAndModificationTime(entry.RPath, Stat); + } + return {}; +} + +std::error_code FileCollector::writeMapping(StringRef mapping_file) { + std::lock_guard lock(Mutex); + + VFSWriter.setOverlayDir(OverlayRoot); + VFSWriter.setCaseSensitivity(isCaseSensitivePath(OverlayRoot)); + VFSWriter.setUseExternalNames(false); + + std::error_code EC; + raw_fd_ostream os(mapping_file, EC, sys::fs::OF_Text); + if (EC) + return EC; + + VFSWriter.write(os); + + return {}; +} + +namespace { + +class FileCollectorFileSystem : public vfs::FileSystem { +public: + explicit FileCollectorFileSystem(IntrusiveRefCntPtr FS, + std::shared_ptr Collector) + : FS(std::move(FS)), Collector(std::move(Collector)) {} + + llvm::ErrorOr status(const Twine &Path) override { + auto Result = FS->status(Path); + if (Result && Result->exists()) + Collector->addFile(Path); + return Result; + } + + llvm::ErrorOr> + openFileForRead(const Twine &Path) override { + auto Result = FS->openFileForRead(Path); + if (Result && *Result) + Collector->addFile(Path); + return Result; + } + + llvm::vfs::directory_iterator dir_begin(const llvm::Twine &Dir, + std::error_code &EC) override { + auto It = FS->dir_begin(Dir, EC); + if (EC) + return It; + // Collect everything that's listed in case the user needs it. + Collector->addFile(Dir); + for (; !EC && It != llvm::vfs::directory_iterator(); It.increment(EC)) { + if (It->type() == sys::fs::file_type::regular_file || + It->type() == sys::fs::file_type::directory_file || + It->type() == sys::fs::file_type::symlink_file) { + Collector->addFile(It->path()); + } + } + if (EC) + return It; + // Return a new iterator. + return FS->dir_begin(Dir, EC); + } + + std::error_code getRealPath(const Twine &Path, + SmallVectorImpl &Output) const override { + auto EC = FS->getRealPath(Path, Output); + if (!EC) { + Collector->addFile(Path); + if (Output.size() > 0) + Collector->addFile(Output); + } + return EC; + } + + std::error_code isLocal(const Twine &Path, bool &Result) override { + return FS->isLocal(Path, Result); + } + + llvm::ErrorOr getCurrentWorkingDirectory() const override { + return FS->getCurrentWorkingDirectory(); + } + + std::error_code setCurrentWorkingDirectory(const llvm::Twine &Path) override { + return FS->setCurrentWorkingDirectory(Path); + } + +private: + IntrusiveRefCntPtr FS; + std::shared_ptr Collector; +}; + +} // end anonymous namespace + +IntrusiveRefCntPtr +FileCollector::createCollectorVFS(IntrusiveRefCntPtr BaseFS, + std::shared_ptr Collector) { + return new FileCollectorFileSystem(std::move(BaseFS), std::move(Collector)); +} diff --git a/lib/Support/FileOutputBuffer.cpp b/lib/Support/FileOutputBuffer.cpp index 3d6b569f2993..024dd3e57a40 100644 --- a/lib/Support/FileOutputBuffer.cpp +++ b/lib/Support/FileOutputBuffer.cpp @@ -121,7 +121,7 @@ createInMemoryBuffer(StringRef Path, size_t Size, unsigned Mode) { Size, nullptr, sys::Memory::MF_READ | sys::Memory::MF_WRITE, EC); if (EC) return errorCodeToError(EC); - return llvm::make_unique(Path, MB, Size, Mode); + return std::make_unique(Path, MB, Size, Mode); } static Expected> @@ -146,7 +146,7 @@ createOnDiskBuffer(StringRef Path, size_t Size, unsigned Mode) { // Mmap it. 
std::error_code EC; - auto MappedFile = llvm::make_unique( + auto MappedFile = std::make_unique( fs::convertFDToNativeFile(File.FD), fs::mapped_file_region::readwrite, Size, 0, EC); @@ -157,7 +157,7 @@ createOnDiskBuffer(StringRef Path, size_t Size, unsigned Mode) { return createInMemoryBuffer(Path, Size, Mode); } - return llvm::make_unique(Path, std::move(File), + return std::make_unique(Path, std::move(File), std::move(MappedFile)); } diff --git a/lib/Support/FileUtilities.cpp b/lib/Support/FileUtilities.cpp index 62eb7bfda195..d11fbb54dc0d 100644 --- a/lib/Support/FileUtilities.cpp +++ b/lib/Support/FileUtilities.cpp @@ -12,9 +12,12 @@ //===----------------------------------------------------------------------===// #include "llvm/Support/FileUtilities.h" +#include "llvm/ADT/ScopeExit.h" #include "llvm/ADT/SmallString.h" +#include "llvm/Support/Error.h" #include "llvm/Support/ErrorOr.h" #include "llvm/Support/MemoryBuffer.h" +#include "llvm/Support/Path.h" #include "llvm/Support/raw_ostream.h" #include #include @@ -264,3 +267,66 @@ int llvm::DiffFilesWithTolerance(StringRef NameA, return CompareFailed; } + +void llvm::AtomicFileWriteError::log(raw_ostream &OS) const { + OS << "atomic_write_error: "; + switch (Error) { + case atomic_write_error::failed_to_create_uniq_file: + OS << "failed_to_create_uniq_file"; + return; + case atomic_write_error::output_stream_error: + OS << "output_stream_error"; + return; + case atomic_write_error::failed_to_rename_temp_file: + OS << "failed_to_rename_temp_file"; + return; + } + llvm_unreachable("unknown atomic_write_error value in " + "failed_to_rename_temp_file::log()"); +} + +llvm::Error llvm::writeFileAtomically(StringRef TempPathModel, + StringRef FinalPath, StringRef Buffer) { + return writeFileAtomically(TempPathModel, FinalPath, + [&Buffer](llvm::raw_ostream &OS) { + OS.write(Buffer.data(), Buffer.size()); + return llvm::Error::success(); + }); +} + +llvm::Error llvm::writeFileAtomically( + StringRef TempPathModel, StringRef FinalPath, + std::function Writer) { + SmallString<128> GeneratedUniqPath; + int TempFD; + if (sys::fs::createUniqueFile(TempPathModel.str(), TempFD, + GeneratedUniqPath)) { + return llvm::make_error( + atomic_write_error::failed_to_create_uniq_file); + } + llvm::FileRemover RemoveTmpFileOnFail(GeneratedUniqPath); + + raw_fd_ostream OS(TempFD, /*shouldClose=*/true); + if (llvm::Error Err = Writer(OS)) { + return Err; + } + + OS.close(); + if (OS.has_error()) { + OS.clear_error(); + return llvm::make_error( + atomic_write_error::output_stream_error); + } + + if (const std::error_code Error = + sys::fs::rename(/*from=*/GeneratedUniqPath.c_str(), + /*to=*/FinalPath.str().c_str())) { + return llvm::make_error( + atomic_write_error::failed_to_rename_temp_file); + } + + RemoveTmpFileOnFail.releaseFile(); + return Error::success(); +} + +char llvm::AtomicFileWriteError::ID; diff --git a/lib/Support/GlobPattern.cpp b/lib/Support/GlobPattern.cpp index 6011be86d77f..8dae6941ec77 100644 --- a/lib/Support/GlobPattern.cpp +++ b/lib/Support/GlobPattern.cpp @@ -19,7 +19,7 @@ using namespace llvm; static bool hasWildcard(StringRef S) { - return S.find_first_of("?*[") != StringRef::npos; + return S.find_first_of("?*[\\") != StringRef::npos; } // Expands character ranges and returns a bitmap. @@ -60,8 +60,9 @@ static Expected expand(StringRef S, StringRef Original) { } // This is a scanner for the glob pattern. -// A glob pattern token is one of "*", "?", "[]", "[^]" -// (which is a negative form of "[]"), or a non-meta character. 
+// A glob pattern token is one of "*", "?", "\", "[]", "[^]" +// (which is a negative form of "[]"), "[!]" (which is +// equivalent to "[^]"), or a non-meta character. // This function returns the first token in S. static Expected scan(StringRef &S, StringRef Original) { switch (S[0]) { @@ -74,14 +75,16 @@ static Expected scan(StringRef &S, StringRef Original) { S = S.substr(1); return BitVector(256, true); case '[': { - size_t End = S.find(']', 1); + // ']' is allowed as the first character of a character class. '[]' is + // invalid. So, just skip the first character. + size_t End = S.find(']', 2); if (End == StringRef::npos) return make_error("invalid glob pattern: " + Original, errc::invalid_argument); StringRef Chars = S.substr(1, End - 1); S = S.substr(End + 1); - if (Chars.startswith("^")) { + if (Chars.startswith("^") || Chars.startswith("!")) { Expected BV = expand(Chars.substr(1), Original); if (!BV) return BV.takeError(); @@ -89,6 +92,11 @@ static Expected scan(StringRef &S, StringRef Original) { } return expand(Chars, Original); } + case '\\': + // Eat this character and fall through below to treat it like a non-meta + // character. + S = S.substr(1); + LLVM_FALLTHROUGH; default: BitVector BV(256, false); BV[(uint8_t)S[0]] = true; @@ -107,8 +115,9 @@ Expected GlobPattern::create(StringRef S) { return Pat; } - // S is something like "foo*". We can use startswith(). - if (S.endswith("*") && !hasWildcard(S.drop_back())) { + // S is something like "foo*", and the "* is not escaped. We can use + // startswith(). + if (S.endswith("*") && !S.endswith("\\*") && !hasWildcard(S.drop_back())) { Pat.Prefix = S.drop_back(); return Pat; } diff --git a/lib/Support/Host.cpp b/lib/Support/Host.cpp index d491912bdc0c..2a473a1994c2 100644 --- a/lib/Support/Host.cpp +++ b/lib/Support/Host.cpp @@ -316,7 +316,7 @@ StringRef sys::detail::getHostCPUNameForS390x(StringRef ProcCpuinfoContent) { unsigned int Id; if (!Lines[I].drop_front(Pos).getAsInteger(10, Id)) { if (Id >= 8561 && HaveVectorSupport) - return "arch13"; + return "z15"; if (Id >= 3906 && HaveVectorSupport) return "z14"; if (Id >= 2964 && HaveVectorSupport) @@ -680,7 +680,7 @@ getIntelProcessorTypeAndSubtype(unsigned Family, unsigned Model, // Skylake Xeon: case 0x55: *Type = X86::INTEL_COREI7; - if (Features3 & (1 << (X86::FEATURE_AVX512BF16 - 64))) + if (Features2 & (1 << (X86::FEATURE_AVX512BF16 - 32))) *Subtype = X86::INTEL_COREI7_COOPERLAKE; // "cooperlake" else if (Features2 & (1 << (X86::FEATURE_AVX512VNNI - 32))) *Subtype = X86::INTEL_COREI7_CASCADELAKE; // "cascadelake" @@ -746,6 +746,13 @@ getIntelProcessorTypeAndSubtype(unsigned Family, unsigned Model, break; default: // Unknown family 6 CPU, try to guess. 
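The Features, Features2 and Features3 words above pack the X86::FEATURE_* enumeration in 32-bit chunks, which is why the Cooperlake check now consults Features2 with a 32-bit offset for AVX512BF16. A small hypothetical helper makes the indexing explicit:

  // Feature enums 0-31 live in Features, 32-63 in Features2, 64 and up in Features3.
  static bool testFeature(unsigned Features, unsigned Features2,
                          unsigned Features3, unsigned FeatureIdx) {
    if (FeatureIdx < 32)
      return (Features & (1u << FeatureIdx)) != 0;
    if (FeatureIdx < 64)
      return (Features2 & (1u << (FeatureIdx - 32))) != 0;
    return (Features3 & (1u << (FeatureIdx - 64))) != 0;
  }
  // e.g. testFeature(F, F2, F3, X86::FEATURE_AVX512BF16) now reads the second word.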
+ // TODO detect tigerlake host + if (Features3 & (1 << (X86::FEATURE_AVX512VP2INTERSECT - 64))) { + *Type = X86::INTEL_COREI7; + *Subtype = X86::INTEL_COREI7_TIGERLAKE; + break; + } + if (Features & (1 << X86::FEATURE_AVX512VBMI2)) { *Type = X86::INTEL_COREI7; *Subtype = X86::INTEL_COREI7_ICELAKE_CLIENT; @@ -758,7 +765,7 @@ getIntelProcessorTypeAndSubtype(unsigned Family, unsigned Model, break; } - if (Features3 & (1 << (X86::FEATURE_AVX512BF16 - 64))) { + if (Features2 & (1 << (X86::FEATURE_AVX512BF16 - 32))) { *Type = X86::INTEL_COREI7; *Subtype = X86::INTEL_COREI7_COOPERLAKE; break; @@ -1034,7 +1041,7 @@ static void getAvailableFeatures(unsigned ECX, unsigned EDX, unsigned MaxLeaf, setFeature(X86::FEATURE_BMI); if (HasLeaf7 && ((EBX >> 5) & 1) && HasAVX) setFeature(X86::FEATURE_AVX2); - if (HasLeaf7 && ((EBX >> 9) & 1)) + if (HasLeaf7 && ((EBX >> 8) & 1)) setFeature(X86::FEATURE_BMI2); if (HasLeaf7 && ((EBX >> 16) & 1) && HasAVX512Save) setFeature(X86::FEATURE_AVX512F); @@ -1078,6 +1085,13 @@ static void getAvailableFeatures(unsigned ECX, unsigned EDX, unsigned MaxLeaf, setFeature(X86::FEATURE_AVX5124VNNIW); if (HasLeaf7 && ((EDX >> 3) & 1) && HasAVX512Save) setFeature(X86::FEATURE_AVX5124FMAPS); + if (HasLeaf7 && ((EDX >> 8) & 1) && HasAVX512Save) + setFeature(X86::FEATURE_AVX512VP2INTERSECT); + + bool HasLeaf7Subleaf1 = + MaxLeaf >= 7 && !getX86CpuIDAndInfoEx(0x7, 0x1, &EAX, &EBX, &ECX, &EDX); + if (HasLeaf7Subleaf1 && ((EAX >> 5) & 1) && HasAVX512Save) + setFeature(X86::FEATURE_AVX512BF16); unsigned MaxExtLevel; getX86CpuIDAndInfo(0x80000000, &MaxExtLevel, &EBX, &ECX, &EDX); @@ -1369,7 +1383,6 @@ bool sys::getHostCPUFeatures(StringMap &Features) { Features["bmi2"] = HasLeaf7 && ((EBX >> 8) & 1); Features["invpcid"] = HasLeaf7 && ((EBX >> 10) & 1); Features["rtm"] = HasLeaf7 && ((EBX >> 11) & 1); - Features["mpx"] = HasLeaf7 && ((EBX >> 14) & 1); // AVX512 is only supported if the OS supports the context save for it. Features["avx512f"] = HasLeaf7 && ((EBX >> 16) & 1) && HasAVX512Save; Features["avx512dq"] = HasLeaf7 && ((EBX >> 17) & 1) && HasAVX512Save; @@ -1499,6 +1512,17 @@ bool sys::getHostCPUFeatures(StringMap &Features) { return true; } +#elif defined(_WIN32) && (defined(__aarch64__) || defined(_M_ARM64)) +bool sys::getHostCPUFeatures(StringMap &Features) { + if (IsProcessorFeaturePresent(PF_ARM_NEON_INSTRUCTIONS_AVAILABLE)) + Features["neon"] = true; + if (IsProcessorFeaturePresent(PF_ARM_V8_CRC32_INSTRUCTIONS_AVAILABLE)) + Features["crc"] = true; + if (IsProcessorFeaturePresent(PF_ARM_V8_CRYPTO_INSTRUCTIONS_AVAILABLE)) + Features["crypto"] = true; + + return true; +} #else bool sys::getHostCPUFeatures(StringMap &Features) { return false; } #endif diff --git a/lib/Support/JSON.cpp b/lib/Support/JSON.cpp index 95e5ed654277..16b1d11efd08 100644 --- a/lib/Support/JSON.cpp +++ b/lib/Support/JSON.cpp @@ -502,7 +502,7 @@ bool Parser::parseError(const char *Msg) { } } Err.emplace( - llvm::make_unique(Msg, Line, P - StartOfLine, P - Start)); + std::make_unique(Msg, Line, P - StartOfLine, P - Start)); return false; } } // namespace diff --git a/lib/Support/JamCRC.cpp b/lib/Support/JamCRC.cpp deleted file mode 100644 index e043a3c33c28..000000000000 --- a/lib/Support/JamCRC.cpp +++ /dev/null @@ -1,96 +0,0 @@ -//===-- JamCRC.cpp - Cyclic Redundancy Check --------------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file contains an implementation of JamCRC. -// -//===----------------------------------------------------------------------===// -// -// The implementation technique is the one mentioned in: -// D. V. Sarwate. 1988. Computation of cyclic redundancy checks via table -// look-up. Commun. ACM 31, 8 (August 1988) -// -//===----------------------------------------------------------------------===// - -#include "llvm/Support/JamCRC.h" -#include "llvm/ADT/ArrayRef.h" - -using namespace llvm; - -static const uint32_t CRCTable[256] = { - 0x00000000, 0x77073096, 0xee0e612c, 0x990951ba, - 0x076dc419, 0x706af48f, 0xe963a535, 0x9e6495a3, - 0x0edb8832, 0x79dcb8a4, 0xe0d5e91e, 0x97d2d988, - 0x09b64c2b, 0x7eb17cbd, 0xe7b82d07, 0x90bf1d91, - 0x1db71064, 0x6ab020f2, 0xf3b97148, 0x84be41de, - 0x1adad47d, 0x6ddde4eb, 0xf4d4b551, 0x83d385c7, - 0x136c9856, 0x646ba8c0, 0xfd62f97a, 0x8a65c9ec, - 0x14015c4f, 0x63066cd9, 0xfa0f3d63, 0x8d080df5, - 0x3b6e20c8, 0x4c69105e, 0xd56041e4, 0xa2677172, - 0x3c03e4d1, 0x4b04d447, 0xd20d85fd, 0xa50ab56b, - 0x35b5a8fa, 0x42b2986c, 0xdbbbc9d6, 0xacbcf940, - 0x32d86ce3, 0x45df5c75, 0xdcd60dcf, 0xabd13d59, - 0x26d930ac, 0x51de003a, 0xc8d75180, 0xbfd06116, - 0x21b4f4b5, 0x56b3c423, 0xcfba9599, 0xb8bda50f, - 0x2802b89e, 0x5f058808, 0xc60cd9b2, 0xb10be924, - 0x2f6f7c87, 0x58684c11, 0xc1611dab, 0xb6662d3d, - 0x76dc4190, 0x01db7106, 0x98d220bc, 0xefd5102a, - 0x71b18589, 0x06b6b51f, 0x9fbfe4a5, 0xe8b8d433, - 0x7807c9a2, 0x0f00f934, 0x9609a88e, 0xe10e9818, - 0x7f6a0dbb, 0x086d3d2d, 0x91646c97, 0xe6635c01, - 0x6b6b51f4, 0x1c6c6162, 0x856530d8, 0xf262004e, - 0x6c0695ed, 0x1b01a57b, 0x8208f4c1, 0xf50fc457, - 0x65b0d9c6, 0x12b7e950, 0x8bbeb8ea, 0xfcb9887c, - 0x62dd1ddf, 0x15da2d49, 0x8cd37cf3, 0xfbd44c65, - 0x4db26158, 0x3ab551ce, 0xa3bc0074, 0xd4bb30e2, - 0x4adfa541, 0x3dd895d7, 0xa4d1c46d, 0xd3d6f4fb, - 0x4369e96a, 0x346ed9fc, 0xad678846, 0xda60b8d0, - 0x44042d73, 0x33031de5, 0xaa0a4c5f, 0xdd0d7cc9, - 0x5005713c, 0x270241aa, 0xbe0b1010, 0xc90c2086, - 0x5768b525, 0x206f85b3, 0xb966d409, 0xce61e49f, - 0x5edef90e, 0x29d9c998, 0xb0d09822, 0xc7d7a8b4, - 0x59b33d17, 0x2eb40d81, 0xb7bd5c3b, 0xc0ba6cad, - 0xedb88320, 0x9abfb3b6, 0x03b6e20c, 0x74b1d29a, - 0xead54739, 0x9dd277af, 0x04db2615, 0x73dc1683, - 0xe3630b12, 0x94643b84, 0x0d6d6a3e, 0x7a6a5aa8, - 0xe40ecf0b, 0x9309ff9d, 0x0a00ae27, 0x7d079eb1, - 0xf00f9344, 0x8708a3d2, 0x1e01f268, 0x6906c2fe, - 0xf762575d, 0x806567cb, 0x196c3671, 0x6e6b06e7, - 0xfed41b76, 0x89d32be0, 0x10da7a5a, 0x67dd4acc, - 0xf9b9df6f, 0x8ebeeff9, 0x17b7be43, 0x60b08ed5, - 0xd6d6a3e8, 0xa1d1937e, 0x38d8c2c4, 0x4fdff252, - 0xd1bb67f1, 0xa6bc5767, 0x3fb506dd, 0x48b2364b, - 0xd80d2bda, 0xaf0a1b4c, 0x36034af6, 0x41047a60, - 0xdf60efc3, 0xa867df55, 0x316e8eef, 0x4669be79, - 0xcb61b38c, 0xbc66831a, 0x256fd2a0, 0x5268e236, - 0xcc0c7795, 0xbb0b4703, 0x220216b9, 0x5505262f, - 0xc5ba3bbe, 0xb2bd0b28, 0x2bb45a92, 0x5cb36a04, - 0xc2d7ffa7, 0xb5d0cf31, 0x2cd99e8b, 0x5bdeae1d, - 0x9b64c2b0, 0xec63f226, 0x756aa39c, 0x026d930a, - 0x9c0906a9, 0xeb0e363f, 0x72076785, 0x05005713, - 0x95bf4a82, 0xe2b87a14, 0x7bb12bae, 0x0cb61b38, - 0x92d28e9b, 0xe5d5be0d, 0x7cdcefb7, 0x0bdbdf21, - 0x86d3d2d4, 0xf1d4e242, 0x68ddb3f8, 0x1fda836e, - 0x81be16cd, 0xf6b9265b, 0x6fb077e1, 0x18b74777, - 0x88085ae6, 0xff0f6a70, 0x66063bca, 0x11010b5c, - 0x8f659eff, 0xf862ae69, 0x616bffd3, 0x166ccf45, - 0xa00ae278, 0xd70dd2ee, 0x4e048354, 0x3903b3c2, - 
0xa7672661, 0xd06016f7, 0x4969474d, 0x3e6e77db, - 0xaed16a4a, 0xd9d65adc, 0x40df0b66, 0x37d83bf0, - 0xa9bcae53, 0xdebb9ec5, 0x47b2cf7f, 0x30b5ffe9, - 0xbdbdf21c, 0xcabac28a, 0x53b39330, 0x24b4a3a6, - 0xbad03605, 0xcdd70693, 0x54de5729, 0x23d967bf, - 0xb3667a2e, 0xc4614ab8, 0x5d681b02, 0x2a6f2b94, - 0xb40bbe37, 0xc30c8ea1, 0x5a05df1b, 0x2d02ef8d -}; - -void JamCRC::update(ArrayRef Data) { - for (char Byte : Data) { - int TableIdx = (CRC ^ Byte) & 0xff; - CRC = CRCTable[TableIdx] ^ (CRC >> 8); - } -} diff --git a/lib/Support/ManagedStatic.cpp b/lib/Support/ManagedStatic.cpp index 28ceb1a70e42..053493f72fb5 100644 --- a/lib/Support/ManagedStatic.cpp +++ b/lib/Support/ManagedStatic.cpp @@ -12,21 +12,20 @@ #include "llvm/Support/ManagedStatic.h" #include "llvm/Config/config.h" -#include "llvm/Support/Mutex.h" -#include "llvm/Support/MutexGuard.h" #include "llvm/Support/Threading.h" #include +#include using namespace llvm; static const ManagedStaticBase *StaticList = nullptr; -static sys::Mutex *ManagedStaticMutex = nullptr; +static std::recursive_mutex *ManagedStaticMutex = nullptr; static llvm::once_flag mutex_init_flag; static void initializeMutex() { - ManagedStaticMutex = new sys::Mutex(); + ManagedStaticMutex = new std::recursive_mutex(); } -static sys::Mutex* getManagedStaticMutex() { +static std::recursive_mutex *getManagedStaticMutex() { llvm::call_once(mutex_init_flag, initializeMutex); return ManagedStaticMutex; } @@ -35,7 +34,7 @@ void ManagedStaticBase::RegisterManagedStatic(void *(*Creator)(), void (*Deleter)(void*)) const { assert(Creator); if (llvm_is_multithreaded()) { - MutexGuard Lock(*getManagedStaticMutex()); + std::lock_guard Lock(*getManagedStaticMutex()); if (!Ptr.load(std::memory_order_relaxed)) { void *Tmp = Creator(); @@ -77,7 +76,7 @@ void ManagedStaticBase::destroy() const { /// llvm_shutdown - Deallocate and destroy all ManagedStatic variables. void llvm::llvm_shutdown() { - MutexGuard Lock(*getManagedStaticMutex()); + std::lock_guard Lock(*getManagedStaticMutex()); while (StaticList) StaticList->destroy(); diff --git a/lib/Support/MemoryBuffer.cpp b/lib/Support/MemoryBuffer.cpp index d0e5bb154c1a..e4027ca7bbfd 100644 --- a/lib/Support/MemoryBuffer.cpp +++ b/lib/Support/MemoryBuffer.cpp @@ -211,15 +211,17 @@ static ErrorOr> getMemoryBufferForStream(sys::fs::file_t FD, const Twine &BufferName) { const ssize_t ChunkSize = 4096*4; SmallString Buffer; - size_t ReadBytes; // Read into Buffer until we hit EOF. - do { + for (;;) { Buffer.reserve(Buffer.size() + ChunkSize); - if (auto EC = sys::fs::readNativeFile( - FD, makeMutableArrayRef(Buffer.end(), ChunkSize), &ReadBytes)) - return EC; - Buffer.set_size(Buffer.size() + ReadBytes); - } while (ReadBytes != 0); + Expected ReadBytes = sys::fs::readNativeFile( + FD, makeMutableArrayRef(Buffer.end(), ChunkSize)); + if (!ReadBytes) + return errorToErrorCode(ReadBytes.takeError()); + if (*ReadBytes == 0) + break; + Buffer.set_size(Buffer.size() + *ReadBytes); + } return getMemBufferCopyImpl(Buffer, BufferName); } @@ -458,7 +460,20 @@ getOpenFileImpl(sys::fs::file_t FD, const Twine &Filename, uint64_t FileSize, return make_error_code(errc::not_enough_memory); } - sys::fs::readNativeFileSlice(FD, Buf->getBuffer(), Offset); + // Read until EOF, zero-initialize the rest. 
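The loop that follows applies the new Expected<size_t> readNativeFile contract (bytes read on success, zero at end of file) to a fixed-size destination; the same contract also supports a free-standing drain helper, sketched here with an invented name over an already-open native file handle:

  #include "llvm/ADT/ArrayRef.h"
  #include "llvm/ADT/SmallVector.h"
  #include "llvm/Support/Error.h"
  #include "llvm/Support/FileSystem.h"
  using namespace llvm;

  // Append everything remaining on FD to Out, 16 KiB at a time.
  static Error drainFile(sys::fs::file_t FD, SmallVectorImpl<char> &Out) {
    const size_t ChunkSize = 16384;
    for (;;) {
      Out.reserve(Out.size() + ChunkSize);
      Expected<size_t> NRead =
          sys::fs::readNativeFile(FD, makeMutableArrayRef(Out.end(), ChunkSize));
      if (!NRead)
        return NRead.takeError();
      if (*NRead == 0)
        return Error::success(); // end of file
      Out.set_size(Out.size() + *NRead);
    }
  }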
+ MutableArrayRef ToRead = Buf->getBuffer(); + while (!ToRead.empty()) { + Expected ReadBytes = + sys::fs::readNativeFileSlice(FD, ToRead, Offset); + if (!ReadBytes) + return errorToErrorCode(ReadBytes.takeError()); + if (*ReadBytes == 0) { + std::memset(ToRead.data(), 0, ToRead.size()); + break; + } + ToRead = ToRead.drop_front(*ReadBytes); + Offset += *ReadBytes; + } return std::move(Buf); } diff --git a/lib/Support/Mutex.cpp b/lib/Support/Mutex.cpp deleted file mode 100644 index 69b7b8126ab1..000000000000 --- a/lib/Support/Mutex.cpp +++ /dev/null @@ -1,123 +0,0 @@ -//===- Mutex.cpp - Mutual Exclusion Lock ------------------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file implements the llvm::sys::Mutex class. -// -//===----------------------------------------------------------------------===// - -#include "llvm/Support/Mutex.h" -#include "llvm/Config/config.h" -#include "llvm/Support/ErrorHandling.h" - -//===----------------------------------------------------------------------===// -//=== WARNING: Implementation here must contain only TRULY operating system -//=== independent code. -//===----------------------------------------------------------------------===// - -#if !defined(LLVM_ENABLE_THREADS) || LLVM_ENABLE_THREADS == 0 -// Define all methods as no-ops if threading is explicitly disabled -namespace llvm { -using namespace sys; -MutexImpl::MutexImpl( bool recursive) { } -MutexImpl::~MutexImpl() { } -bool MutexImpl::acquire() { return true; } -bool MutexImpl::release() { return true; } -bool MutexImpl::tryacquire() { return true; } -} -#else - -#if defined(HAVE_PTHREAD_H) && defined(HAVE_PTHREAD_MUTEX_LOCK) - -#include -#include -#include - -namespace llvm { -using namespace sys; - -// Construct a Mutex using pthread calls -MutexImpl::MutexImpl( bool recursive) - : data_(nullptr) -{ - // Declare the pthread_mutex data structures - pthread_mutex_t* mutex = - static_cast(safe_malloc(sizeof(pthread_mutex_t))); - - pthread_mutexattr_t attr; - - // Initialize the mutex attributes - int errorcode = pthread_mutexattr_init(&attr); - assert(errorcode == 0); (void)errorcode; - - // Initialize the mutex as a recursive mutex, if requested, or normal - // otherwise. - int kind = ( recursive ? 
PTHREAD_MUTEX_RECURSIVE : PTHREAD_MUTEX_NORMAL ); - errorcode = pthread_mutexattr_settype(&attr, kind); - assert(errorcode == 0); - - // Initialize the mutex - errorcode = pthread_mutex_init(mutex, &attr); - assert(errorcode == 0); - - // Destroy the attributes - errorcode = pthread_mutexattr_destroy(&attr); - assert(errorcode == 0); - - // Assign the data member - data_ = mutex; -} - -// Destruct a Mutex -MutexImpl::~MutexImpl() -{ - pthread_mutex_t* mutex = static_cast(data_); - assert(mutex != nullptr); - pthread_mutex_destroy(mutex); - free(mutex); -} - -bool -MutexImpl::acquire() -{ - pthread_mutex_t* mutex = static_cast(data_); - assert(mutex != nullptr); - - int errorcode = pthread_mutex_lock(mutex); - return errorcode == 0; -} - -bool -MutexImpl::release() -{ - pthread_mutex_t* mutex = static_cast(data_); - assert(mutex != nullptr); - - int errorcode = pthread_mutex_unlock(mutex); - return errorcode == 0; -} - -bool -MutexImpl::tryacquire() -{ - pthread_mutex_t* mutex = static_cast(data_); - assert(mutex != nullptr); - - int errorcode = pthread_mutex_trylock(mutex); - return errorcode == 0; -} - -} - -#elif defined(LLVM_ON_UNIX) -#include "Unix/Mutex.inc" -#elif defined( _WIN32) -#include "Windows/Mutex.inc" -#else -#warning Neither LLVM_ON_UNIX nor _WIN32 was set in Support/Mutex.cpp -#endif -#endif diff --git a/lib/Support/Parallel.cpp b/lib/Support/Parallel.cpp index 621bccbf2a4c..355c64b7d079 100644 --- a/lib/Support/Parallel.cpp +++ b/lib/Support/Parallel.cpp @@ -32,34 +32,6 @@ public: static Executor *getDefaultExecutor(); }; -#if defined(_MSC_VER) -/// An Executor that runs tasks via ConcRT. -class ConcRTExecutor : public Executor { - struct Taskish { - Taskish(std::function Task) : Task(Task) {} - - std::function Task; - - static void run(void *P) { - Taskish *Self = static_cast(P); - Self->Task(); - concurrency::Free(Self); - } - }; - -public: - virtual void add(std::function F) { - Concurrency::CurrentScheduler::ScheduleTask( - Taskish::run, new (concurrency::Alloc(sizeof(Taskish))) Taskish(F)); - } -}; - -Executor *Executor::getDefaultExecutor() { - static ConcRTExecutor exec; - return &exec; -} - -#else /// An implementation of an Executor that runs closures on a thread pool /// in filo order. class ThreadPoolExecutor : public Executor { @@ -117,8 +89,7 @@ Executor *Executor::getDefaultExecutor() { static ThreadPoolExecutor exec; return &exec; } -#endif -} +} // namespace static std::atomic TaskGroupInstances; diff --git a/lib/Support/Path.cpp b/lib/Support/Path.cpp index c49260125dba..14def83802da 100644 --- a/lib/Support/Path.cpp +++ b/lib/Support/Path.cpp @@ -855,11 +855,11 @@ void make_absolute(const Twine ¤t_directory, StringRef p(path.data(), path.size()); bool rootDirectory = path::has_root_directory(p); - bool rootName = - (real_style(Style::native) != Style::windows) || path::has_root_name(p); + bool rootName = path::has_root_name(p); // Already absolute. - if (rootName && rootDirectory) + if ((rootName || real_style(Style::native) != Style::windows) && + rootDirectory) return; // All of the following conditions will need the current directory. 
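The tightened check in make_absolute relies on the fact that a Windows-style path can have a root directory without a root name, in which case it is still relative to the current drive; a quick illustration with explicit path styles:

  #include "llvm/Support/Path.h"
  using namespace llvm;

  void rootNameExamples() {
    using sys::path::Style;
    bool A = sys::path::has_root_name("C:\\tmp\\x", Style::windows);      // true
    bool B = sys::path::has_root_directory("C:\\tmp\\x", Style::windows); // true  -> already absolute
    bool C = sys::path::has_root_name("\\tmp\\x", Style::windows);        // false -> still needs a drive
    bool D = sys::path::has_root_directory("\\tmp\\x", Style::windows);   // true
    bool E = sys::path::has_root_directory("/tmp/x", Style::posix);       // true  -> absolute on POSIX
    (void)A; (void)B; (void)C; (void)D; (void)E;
  }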
diff --git a/lib/Support/PrettyStackTrace.cpp b/lib/Support/PrettyStackTrace.cpp index aec00baec0e3..bfb238cc8539 100644 --- a/lib/Support/PrettyStackTrace.cpp +++ b/lib/Support/PrettyStackTrace.cpp @@ -121,31 +121,63 @@ extern "C" const char *__crashreporter_info__ asm(".desc ___crashreporter_info__, 0x10"); #endif -/// CrashHandler - This callback is run if a fatal signal is delivered to the -/// process, it prints the pretty stack trace. +static void setCrashLogMessage(const char *msg) LLVM_ATTRIBUTE_UNUSED; +static void setCrashLogMessage(const char *msg) { +#ifdef HAVE_CRASHREPORTERCLIENT_H + (void)CRSetCrashLogMessage(msg); +#elif HAVE_CRASHREPORTER_INFO + __crashreporter_info__ = msg; +#endif + // Don't reorder subsequent operations: whatever comes after might crash and + // we want the system crash handling to see the message we just set. + std::atomic_signal_fence(std::memory_order_seq_cst); +} + +#ifdef __APPLE__ +using CrashHandlerString = SmallString<2048>; +using CrashHandlerStringStorage = + std::aligned_storage::type; +static CrashHandlerStringStorage crashHandlerStringStorage; +#endif + +/// This callback is run if a fatal signal is delivered to the process, it +/// prints the pretty stack trace. static void CrashHandler(void *) { #ifndef __APPLE__ // On non-apple systems, just emit the crash stack trace to stderr. PrintCurStackTrace(errs()); #else - // Otherwise, emit to a smallvector of chars, send *that* to stderr, but also - // put it into __crashreporter_info__. - SmallString<2048> TmpStr; + // Emit the crash stack trace to a SmallString, put it where the system crash + // handling will find it, and also send it to stderr. + // + // The SmallString is fairly large in the hope that we don't allocate (we're + // handling a fatal signal, something is already pretty wrong, allocation + // might not work). Further, we don't use a magic static in case that's also + // borked. We leak any allocation that does occur because the program is about + // to die anyways. This is technically racy if we were handling two fatal + // signals, however if we're in that situation a race is the least of our + // worries. + auto &crashHandlerString = + *new (&crashHandlerStringStorage) CrashHandlerString; + + // If we crash while trying to print the stack trace, we still want the system + // crash handling to have some partial information. That'll work out as long + // as the SmallString doesn't allocate. If it does allocate then the system + // crash handling will see some garbage because the inline buffer now contains + // a pointer. + setCrashLogMessage(crashHandlerString.c_str()); + { - raw_svector_ostream Stream(TmpStr); + raw_svector_ostream Stream(crashHandlerString); PrintCurStackTrace(Stream); } - if (!TmpStr.empty()) { -#ifdef HAVE_CRASHREPORTERCLIENT_H - // Cast to void to avoid warning. 
- (void)CRSetCrashLogMessage(TmpStr.c_str()); -#elif HAVE_CRASHREPORTER_INFO - __crashreporter_info__ = strdup(TmpStr.c_str()); -#endif - errs() << TmpStr.str(); - } - + if (!crashHandlerString.empty()) { + setCrashLogMessage(crashHandlerString.c_str()); + errs() << crashHandlerString.str(); + } else + setCrashLogMessage("No crash information."); #endif } diff --git a/lib/Support/RWMutex.cpp b/lib/Support/RWMutex.cpp index 7ce856b716c6..5accf73e5f94 100644 --- a/lib/Support/RWMutex.cpp +++ b/lib/Support/RWMutex.cpp @@ -14,24 +14,20 @@ #include "llvm/Support/RWMutex.h" #include "llvm/Config/config.h" -//===----------------------------------------------------------------------===// -//=== WARNING: Implementation here must contain only TRULY operating system -//=== independent code. -//===----------------------------------------------------------------------===// +#if defined(LLVM_USE_RW_MUTEX_IMPL) +using namespace llvm; +using namespace sys; #if !defined(LLVM_ENABLE_THREADS) || LLVM_ENABLE_THREADS == 0 // Define all methods as no-ops if threading is explicitly disabled -using namespace llvm; -using namespace sys; - RWMutexImpl::RWMutexImpl() = default; RWMutexImpl::~RWMutexImpl() = default; -bool RWMutexImpl::reader_acquire() { return true; } -bool RWMutexImpl::reader_release() { return true; } -bool RWMutexImpl::writer_acquire() { return true; } -bool RWMutexImpl::writer_release() { return true; } +bool RWMutexImpl::lock_shared() { return true; } +bool RWMutexImpl::unlock_shared() { return true; } +bool RWMutexImpl::lock() { return true; } +bool RWMutexImpl::unlock() { return true; } #else @@ -41,9 +37,6 @@ bool RWMutexImpl::writer_release() { return true; } #include #include -using namespace llvm; -using namespace sys; - // Construct a RWMutex using pthread calls RWMutexImpl::RWMutexImpl() { @@ -75,7 +68,7 @@ RWMutexImpl::~RWMutexImpl() } bool -RWMutexImpl::reader_acquire() +RWMutexImpl::lock_shared() { pthread_rwlock_t* rwlock = static_cast(data_); assert(rwlock != nullptr); @@ -85,7 +78,7 @@ RWMutexImpl::reader_acquire() } bool -RWMutexImpl::reader_release() +RWMutexImpl::unlock_shared() { pthread_rwlock_t* rwlock = static_cast(data_); assert(rwlock != nullptr); @@ -95,7 +88,7 @@ RWMutexImpl::reader_release() } bool -RWMutexImpl::writer_acquire() +RWMutexImpl::lock() { pthread_rwlock_t* rwlock = static_cast(data_); assert(rwlock != nullptr); @@ -105,7 +98,7 @@ RWMutexImpl::writer_acquire() } bool -RWMutexImpl::writer_release() +RWMutexImpl::unlock() { pthread_rwlock_t* rwlock = static_cast(data_); assert(rwlock != nullptr); @@ -114,11 +107,30 @@ RWMutexImpl::writer_release() return errorcode == 0; } -#elif defined(LLVM_ON_UNIX) -#include "Unix/RWMutex.inc" -#elif defined( _WIN32) -#include "Windows/RWMutex.inc" #else -#warning Neither LLVM_ON_UNIX nor _WIN32 was set in Support/Mutex.cpp + +RWMutexImpl::RWMutexImpl() : data_(new MutexImpl(false)) { } + +RWMutexImpl::~RWMutexImpl() { + delete static_cast(data_); +} + +bool RWMutexImpl::lock_shared() { + return static_cast(data_)->acquire(); +} + +bool RWMutexImpl::unlock_shared() { + return static_cast(data_)->release(); +} + +bool RWMutexImpl::lock() { + return static_cast(data_)->acquire(); +} + +bool RWMutexImpl::unlock() { + return static_cast(data_)->release(); +} + +#endif #endif #endif diff --git a/lib/Support/Regex.cpp b/lib/Support/Regex.cpp index 4c1b07038024..8da345d4f140 100644 --- a/lib/Support/Regex.cpp +++ b/lib/Support/Regex.cpp @@ -52,14 +52,24 @@ Regex::~Regex() { } } -bool Regex::isValid(std::string &Error) const { - if 
(!error) - return true; +namespace { +/// Utility to convert a regex error code into a human-readable string. +void RegexErrorToString(int error, struct llvm_regex *preg, + std::string &Error) { size_t len = llvm_regerror(error, preg, nullptr, 0); Error.resize(len - 1); llvm_regerror(error, preg, &Error[0], len); +} + +} // namespace + +bool Regex::isValid(std::string &Error) const { + if (!error) + return true; + + RegexErrorToString(error, preg, Error); return false; } @@ -69,8 +79,14 @@ unsigned Regex::getNumMatches() const { return preg->re_nsub; } -bool Regex::match(StringRef String, SmallVectorImpl *Matches){ - if (error) +bool Regex::match(StringRef String, SmallVectorImpl *Matches, + std::string *Error) const { + // Reset error, if given. + if (Error && !Error->empty()) + *Error = ""; + + // Check if the regex itself didn't successfully compile. + if (Error ? !isValid(*Error) : !isValid()) return false; unsigned nmatch = Matches ? preg->re_nsub+1 : 0; @@ -83,11 +99,13 @@ bool Regex::match(StringRef String, SmallVectorImpl *Matches){ int rc = llvm_regexec(preg, String.data(), nmatch, pm.data(), REG_STARTEND); + // Failure to match is not an error, it's just a normal return value. + // Any other error code is considered abnormal, and is logged in the Error. if (rc == REG_NOMATCH) return false; if (rc != 0) { - // regexec can fail due to invalid pattern or running out of memory. - error = rc; + if (Error) + RegexErrorToString(error, preg, *Error); return false; } @@ -112,14 +130,11 @@ bool Regex::match(StringRef String, SmallVectorImpl *Matches){ } std::string Regex::sub(StringRef Repl, StringRef String, - std::string *Error) { + std::string *Error) const { SmallVector Matches; - // Reset error, if given. - if (Error && !Error->empty()) *Error = ""; - // Return the input if there was no match. - if (!match(String, &Matches)) + if (!match(String, &Matches, Error)) return String; // Otherwise splice in the replacement string, starting with the prefix before diff --git a/lib/Support/Signposts.cpp b/lib/Support/Signposts.cpp index d456f41d2fa6..aa159e1da2ae 100644 --- a/lib/Support/Signposts.cpp +++ b/lib/Support/Signposts.cpp @@ -78,6 +78,8 @@ public: #if LLVM_SUPPORT_XCODE_SIGNPOSTS #define HAVE_ANY_SIGNPOST_IMPL 1 +#else +#define HAVE_ANY_SIGNPOST_IMPL 0 #endif SignpostEmitter::SignpostEmitter() { diff --git a/lib/Support/SpecialCaseList.cpp b/lib/Support/SpecialCaseList.cpp index 96e09f9552bb..9bd1f18a4ee7 100644 --- a/lib/Support/SpecialCaseList.cpp +++ b/lib/Support/SpecialCaseList.cpp @@ -53,7 +53,7 @@ bool SpecialCaseList::Matcher::insert(std::string Regexp, return false; RegExes.emplace_back( - std::make_pair(make_unique(std::move(CheckRE)), LineNumber)); + std::make_pair(std::make_unique(std::move(CheckRE)), LineNumber)); return true; } @@ -175,7 +175,7 @@ bool SpecialCaseList::parse(const MemoryBuffer *MB, // Create this section if it has not been seen before. if (SectionsMap.find(Section) == SectionsMap.end()) { - std::unique_ptr M = make_unique(); + std::unique_ptr M = std::make_unique(); std::string REError; if (!M->insert(Section, LineNo, REError)) { Error = (Twine("malformed section ") + Section + ": '" + REError).str(); diff --git a/lib/Support/Statistic.cpp b/lib/Support/Statistic.cpp index e4f0535d21aa..8b4177c7fba6 100644 --- a/lib/Support/Statistic.cpp +++ b/lib/Support/Statistic.cpp @@ -57,7 +57,7 @@ namespace { /// This class is also used to look up statistic values from applications that /// use LLVM. 
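The Regex::match() overload introduced above (lib/Support/Regex.cpp) now threads an optional std::string through to report compile and regexec failures to the caller; a short usage sketch with an invented helper:

  #include "llvm/ADT/SmallVector.h"
  #include "llvm/ADT/StringRef.h"
  #include "llvm/Support/Regex.h"
  #include <string>
  using namespace llvm;

  static bool parseVersion(StringRef S, StringRef &Major, StringRef &Minor) {
    Regex Re("^([0-9]+)\\.([0-9]+)$");
    std::string Err;
    SmallVector<StringRef, 3> Groups;
    if (!Re.match(S, &Groups, &Err))
      return false;          // Err is non-empty only for abnormal failures
    Major = Groups[1];       // Groups[0] is the whole match
    Minor = Groups[2];
    return true;
  }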
class StatisticInfo { - std::vector Stats; + std::vector Stats; friend void llvm::PrintStatistics(); friend void llvm::PrintStatistics(raw_ostream &OS); @@ -66,14 +66,12 @@ class StatisticInfo { /// Sort statistics by debugtype,name,description. void sort(); public: - using const_iterator = std::vector::const_iterator; + using const_iterator = std::vector::const_iterator; StatisticInfo(); ~StatisticInfo(); - void addStatistic(Statistic *S) { - Stats.push_back(S); - } + void addStatistic(TrackingStatistic *S) { Stats.push_back(S); } const_iterator begin() const { return Stats.begin(); } const_iterator end() const { return Stats.end(); } @@ -90,7 +88,7 @@ static ManagedStatic > StatLock; /// RegisterStatistic - The first time a statistic is bumped, this method is /// called. -void Statistic::RegisterStatistic() { +void TrackingStatistic::RegisterStatistic() { // If stats are enabled, inform StatInfo that this statistic should be // printed. // llvm_shutdown calls destructors while holding the ManagedStatic mutex. @@ -135,15 +133,16 @@ bool llvm::AreStatisticsEnabled() { } void StatisticInfo::sort() { - llvm::stable_sort(Stats, [](const Statistic *LHS, const Statistic *RHS) { - if (int Cmp = std::strcmp(LHS->getDebugType(), RHS->getDebugType())) - return Cmp < 0; + llvm::stable_sort( + Stats, [](const TrackingStatistic *LHS, const TrackingStatistic *RHS) { + if (int Cmp = std::strcmp(LHS->getDebugType(), RHS->getDebugType())) + return Cmp < 0; - if (int Cmp = std::strcmp(LHS->getName(), RHS->getName())) - return Cmp < 0; + if (int Cmp = std::strcmp(LHS->getName(), RHS->getName())) + return Cmp < 0; - return std::strcmp(LHS->getDesc(), RHS->getDesc()) < 0; - }); + return std::strcmp(LHS->getDesc(), RHS->getDesc()) < 0; + }); } void StatisticInfo::reset() { @@ -207,7 +206,7 @@ void llvm::PrintStatisticsJSON(raw_ostream &OS) { // Print all of the statistics. 
OS << "{\n"; const char *delim = ""; - for (const Statistic *Stat : Stats.Stats) { + for (const TrackingStatistic *Stat : Stats.Stats) { OS << delim; assert(yaml::needsQuotes(Stat->getDebugType()) == yaml::QuotingType::None && "Statistic group/type name is simple."); diff --git a/lib/Support/StringExtras.cpp b/lib/Support/StringExtras.cpp index bf28b2be5657..af8dd463e125 100644 --- a/lib/Support/StringExtras.cpp +++ b/lib/Support/StringExtras.cpp @@ -60,7 +60,9 @@ void llvm::SplitString(StringRef Source, void llvm::printEscapedString(StringRef Name, raw_ostream &Out) { for (unsigned i = 0, e = Name.size(); i != e; ++i) { unsigned char C = Name[i]; - if (isPrint(C) && C != '\\' && C != '"') + if (C == '\\') + Out << '\\' << C; + else if (isPrint(C) && C != '"') Out << C; else Out << '\\' << hexdigit(C >> 4) << hexdigit(C & 0x0F); diff --git a/lib/Support/TimeProfiler.cpp b/lib/Support/TimeProfiler.cpp index bc2340815645..ca9119e30b65 100644 --- a/lib/Support/TimeProfiler.cpp +++ b/lib/Support/TimeProfiler.cpp @@ -24,29 +24,38 @@ using namespace std::chrono; namespace llvm { -static cl::opt TimeTraceGranularity( - "time-trace-granularity", - cl::desc( - "Minimum time granularity (in microseconds) traced by time profiler"), - cl::init(500)); - TimeTraceProfiler *TimeTraceProfilerInstance = nullptr; typedef duration DurationType; +typedef time_point TimePointType; typedef std::pair CountAndDurationType; typedef std::pair NameAndCountAndDurationType; struct Entry { - time_point Start; - DurationType Duration; + TimePointType Start; + TimePointType End; std::string Name; std::string Detail; - Entry(time_point &&S, DurationType &&D, std::string &&N, - std::string &&Dt) - : Start(std::move(S)), Duration(std::move(D)), Name(std::move(N)), + Entry(TimePointType &&S, TimePointType &&E, std::string &&N, std::string &&Dt) + : Start(std::move(S)), End(std::move(E)), Name(std::move(N)), Detail(std::move(Dt)){}; + + // Calculate timings for FlameGraph. Cast time points to microsecond precision + // rather than casting duration. This avoid truncation issues causing inner + // scopes overruning outer scopes. + steady_clock::rep getFlameGraphStartUs(TimePointType StartTime) const { + return (time_point_cast(Start) - + time_point_cast(StartTime)) + .count(); + } + + steady_clock::rep getFlameGraphDurUs() const { + return (time_point_cast(End) - + time_point_cast(Start)) + .count(); + } }; struct TimeTraceProfiler { @@ -55,17 +64,27 @@ struct TimeTraceProfiler { } void begin(std::string Name, llvm::function_ref Detail) { - Stack.emplace_back(steady_clock::now(), DurationType{}, std::move(Name), + Stack.emplace_back(steady_clock::now(), TimePointType(), std::move(Name), Detail()); } void end() { assert(!Stack.empty() && "Must call begin() first"); auto &E = Stack.back(); - E.Duration = steady_clock::now() - E.Start; + E.End = steady_clock::now(); + + // Check that end times monotonically increase. + assert((Entries.empty() || + (E.getFlameGraphStartUs(StartTime) + E.getFlameGraphDurUs() >= + Entries.back().getFlameGraphStartUs(StartTime) + + Entries.back().getFlameGraphDurUs())) && + "TimeProfiler scope ended earlier than previous scope"); - // Only include sections longer than TimeTraceGranularity msec. - if (duration_cast(E.Duration).count() > TimeTraceGranularity) + // Calculate duration at full precision for overall counts. + DurationType Duration = E.End - E.Start; + + // Only include sections longer or equal to TimeTraceGranularity msec. 
+ if (duration_cast(Duration).count() >= TimeTraceGranularity) Entries.emplace_back(E); // Track total time taken by each "name", but only the topmost levels of @@ -78,7 +97,7 @@ struct TimeTraceProfiler { }) == Stack.rend()) { auto &CountAndTotal = CountAndTotalPerName[E.Name]; CountAndTotal.first++; - CountAndTotal.second += E.Duration; + CountAndTotal.second += Duration; } Stack.pop_back(); @@ -94,8 +113,8 @@ struct TimeTraceProfiler { // Emit all events for the main flame graph. for (const auto &E : Entries) { - auto StartUs = duration_cast(E.Start - StartTime).count(); - auto DurUs = duration_cast(E.Duration).count(); + auto StartUs = E.getFlameGraphStartUs(StartTime); + auto DurUs = E.getFlameGraphDurUs(); J.object([&]{ J.attribute("pid", 1); @@ -160,13 +179,17 @@ struct TimeTraceProfiler { SmallVector Stack; SmallVector Entries; StringMap CountAndTotalPerName; - time_point StartTime; + TimePointType StartTime; + + // Minimum time granularity (in microseconds) + unsigned TimeTraceGranularity; }; -void timeTraceProfilerInitialize() { +void timeTraceProfilerInitialize(unsigned TimeTraceGranularity) { assert(TimeTraceProfilerInstance == nullptr && "Profiler should not be initialized"); TimeTraceProfilerInstance = new TimeTraceProfiler(); + TimeTraceProfilerInstance->TimeTraceGranularity = TimeTraceGranularity; } void timeTraceProfilerCleanup() { diff --git a/lib/Support/Timer.cpp b/lib/Support/Timer.cpp index 2a7ff1eaaf63..10c9b8e0b329 100644 --- a/lib/Support/Timer.cpp +++ b/lib/Support/Timer.cpp @@ -58,23 +58,23 @@ namespace { std::unique_ptr llvm::CreateInfoOutputFile() { const std::string &OutputFilename = getLibSupportInfoOutputFilename(); if (OutputFilename.empty()) - return llvm::make_unique(2, false); // stderr. + return std::make_unique(2, false); // stderr. if (OutputFilename == "-") - return llvm::make_unique(1, false); // stdout. + return std::make_unique(1, false); // stdout. // Append mode is used because the info output file is opened and closed // each time -stats or -time-passes wants to print output to it. To // compensate for this, the test-suite Makefiles have code to delete the // info output file before running commands which write to it. std::error_code EC; - auto Result = llvm::make_unique( - OutputFilename, EC, sys::fs::F_Append | sys::fs::F_Text); + auto Result = std::make_unique( + OutputFilename, EC, sys::fs::OF_Append | sys::fs::OF_Text); if (!EC) return Result; errs() << "Error opening info-output-file '" << OutputFilename << " for appending!\n"; - return llvm::make_unique(2, false); // stderr. + return std::make_unique(2, false); // stderr. 
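[Editor's note] The TimeProfiler changes above switch Entry from storing a duration to storing Start/End time points and truncate the points to microseconds when emitting the flame graph, so a nested scope can never appear to outlive its parent after rounding. A minimal sketch of that arithmetic in plain std::chrono; the Scope struct and helper names are illustrative stand-ins, not the real Entry API:

#include <cassert>
#include <chrono>

using namespace std::chrono;
using TimePoint = time_point<steady_clock, nanoseconds>;

struct Scope {
  TimePoint Start, End;

  // Truncate the time points to whole microseconds, then subtract, as
  // getFlameGraphStartUs/getFlameGraphDurUs do above.
  long long startUs(TimePoint Origin) const {
    return (time_point_cast<microseconds>(Start) -
            time_point_cast<microseconds>(Origin)).count();
  }
  long long durUs() const {
    return (time_point_cast<microseconds>(End) -
            time_point_cast<microseconds>(Start)).count();
  }
};

int main() {
  const TimePoint Origin = time_point_cast<nanoseconds>(steady_clock::now());
  Scope Outer{Origin, Origin + microseconds(1200)};
  Scope Inner{Origin + nanoseconds(300700), Origin + microseconds(1200)};

  // Rounding is applied point-wise, so start + duration is computed from the
  // same truncated end point for both scopes and the inner one cannot overrun
  // the outer one; this is the invariant the new assert in end() checks.
  assert(Inner.startUs(Origin) + Inner.durUs() <=
         Outer.startUs(Origin) + Outer.durUs());
  return 0;
}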
} namespace { diff --git a/lib/Support/Unix/Memory.inc b/lib/Support/Unix/Memory.inc index a0927da50e48..05f8e32896fa 100644 --- a/lib/Support/Unix/Memory.inc +++ b/lib/Support/Unix/Memory.inc @@ -176,7 +176,7 @@ Memory::releaseMappedMemory(MemoryBlock &M) { std::error_code Memory::protectMappedMemory(const MemoryBlock &M, unsigned Flags) { - static const size_t PageSize = Process::getPageSizeEstimate(); + static const Align PageSize = Align(Process::getPageSizeEstimate()); if (M.Address == nullptr || M.AllocatedSize == 0) return std::error_code(); @@ -184,8 +184,8 @@ Memory::protectMappedMemory(const MemoryBlock &M, unsigned Flags) { return std::error_code(EINVAL, std::generic_category()); int Protect = getPosixProtectionFlags(Flags); - uintptr_t Start = alignAddr((uint8_t *)M.Address - PageSize + 1, PageSize); - uintptr_t End = alignAddr((uint8_t *)M.Address + M.AllocatedSize, PageSize); + uintptr_t Start = alignAddr((const uint8_t *)M.Address - PageSize.value() + 1, PageSize); + uintptr_t End = alignAddr((const uint8_t *)M.Address + M.AllocatedSize, PageSize); bool InvalidateCache = (Flags & MF_EXEC); diff --git a/lib/Support/Unix/Mutex.inc b/lib/Support/Unix/Mutex.inc deleted file mode 100644 index 2c982b38d6ff..000000000000 --- a/lib/Support/Unix/Mutex.inc +++ /dev/null @@ -1,42 +0,0 @@ -//===- llvm/Support/Unix/Mutex.inc - Unix Mutex Implementation ---*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file implements the Unix specific (non-pthread) Mutex class. -// -//===----------------------------------------------------------------------===// - -//===----------------------------------------------------------------------===// -//=== WARNING: Implementation here must contain only generic UNIX code that -//=== is guaranteed to work on *all* UNIX variants. -//===----------------------------------------------------------------------===// - -namespace llvm -{ -using namespace sys; - -MutexImpl::MutexImpl( bool recursive) -{ -} - -MutexImpl::~MutexImpl() -{ -} - -bool -MutexImpl::release() -{ - return true; -} - -bool -MutexImpl::tryacquire( void ) -{ - return true; -} - -} diff --git a/lib/Support/Unix/Path.inc b/lib/Support/Unix/Path.inc index e80880c6b3cb..a617eca3566a 100644 --- a/lib/Support/Unix/Path.inc +++ b/lib/Support/Unix/Path.inc @@ -186,12 +186,12 @@ std::string getMainExecutable(const char *argv0, void *MainAddr) { #elif defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || \ defined(__minix) || defined(__DragonFly__) || \ defined(__FreeBSD_kernel__) || defined(_AIX) - StringRef curproc("/proc/curproc/file"); + const char *curproc = "/proc/curproc/file"; char exe_path[PATH_MAX]; // /proc is not mounted by default under FreeBSD, but gives more accurate // information than argv[0] when it is. if (sys::fs::exists(curproc)) { - ssize_t len = readlink(curproc.str().c_str(), exe_path, sizeof(exe_path)); + ssize_t len = readlink(curproc, exe_path, sizeof(exe_path)); if (len > 0) { // Null terminate the string for realpath. readlink never null // terminates its output. 
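[Editor's note] The protectMappedMemory hunk above widens the block to the smallest page-aligned [Start, End) range, since mprotect only accepts page-aligned addresses; rounding (Addr - PageSize + 1) up is how it rounds the start down. A small arithmetic check of that identity; alignUp is a stand-in for llvm::alignAddr and assumes a power-of-two page size:

#include <cassert>
#include <cstdint>

static uintptr_t alignUp(uintptr_t Addr, uintptr_t PageSize) {
  return (Addr + PageSize - 1) & ~(PageSize - 1); // power-of-two alignment
}

static void pageAlignedRange(uintptr_t Addr, uintptr_t Size, uintptr_t PageSize,
                             uintptr_t &Start, uintptr_t &End) {
  // Rounding (Addr - PageSize + 1) up lands on the page containing Addr,
  // i.e. it rounds Addr down to its page boundary.
  Start = alignUp(Addr - PageSize + 1, PageSize);
  End = alignUp(Addr + Size, PageSize);
}

int main() {
  uintptr_t Start = 0, End = 0;
  pageAlignedRange(0x1234, 0x100, 0x1000, Start, End);
  assert(Start == 0x1000 && End == 0x2000); // covers [0x1234, 0x1334)
  return 0;
}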
@@ -205,10 +205,10 @@ std::string getMainExecutable(const char *argv0, void *MainAddr) { return exe_path; #elif defined(__linux__) || defined(__CYGWIN__) char exe_path[MAXPATHLEN]; - StringRef aPath("/proc/self/exe"); + const char *aPath = "/proc/self/exe"; if (sys::fs::exists(aPath)) { // /proc is not always mounted under Linux (chroot for example). - ssize_t len = readlink(aPath.str().c_str(), exe_path, sizeof(exe_path)); + ssize_t len = readlink(aPath, exe_path, sizeof(exe_path)); if (len < 0) return ""; @@ -443,7 +443,7 @@ static bool is_local_impl(struct STATVFS &Vfs) { std::unique_ptr Buf; int Tries = 3; while (Tries--) { - Buf = llvm::make_unique(BufSize); + Buf = std::make_unique(BufSize); Ret = mntctl(MCTL_QUERY, BufSize, Buf.get()); if (Ret != 0) break; @@ -833,7 +833,10 @@ std::error_code detail::directory_iterator_destruct(detail::DirIterState &it) { static file_type direntType(dirent* Entry) { // Most platforms provide the file type in the dirent: Linux/BSD/Mac. // The DTTOIF macro lets us reuse our status -> type conversion. -#if defined(_DIRENT_HAVE_D_TYPE) && defined(DTTOIF) + // Note that while glibc provides a macro to see if this is supported, + // _DIRENT_HAVE_D_TYPE, it's not defined on BSD/Mac, so we test for the + // d_type-to-mode_t conversion macro instead. +#if defined(DTTOIF) return typeForMode(DTTOIF(Entry->d_type)); #else // Other platforms such as Solaris require a stat() to get the type. @@ -884,9 +887,9 @@ static int nativeOpenFlags(CreationDisposition Disp, OpenFlags Flags, else if (Access == (FA_Read | FA_Write)) Result |= O_RDWR; - // This is for compatibility with old code that assumed F_Append implied + // This is for compatibility with old code that assumed OF_Append implied // would open an existing file. See Windows/Path.inc for a longer comment. - if (Flags & F_Append) + if (Flags & OF_Append) Disp = CD_OpenAlways; if (Disp == CD_CreateNew) { @@ -901,7 +904,7 @@ static int nativeOpenFlags(CreationDisposition Disp, OpenFlags Flags, // Nothing special, just don't add O_CREAT and we get these semantics. } - if (Flags & F_Append) + if (Flags & OF_Append) Result |= O_APPEND; #ifdef O_CLOEXEC @@ -996,44 +999,28 @@ file_t getStdinHandle() { return 0; } file_t getStdoutHandle() { return 1; } file_t getStderrHandle() { return 2; } -std::error_code readNativeFile(file_t FD, MutableArrayRef Buf, - size_t *BytesRead) { - *BytesRead = sys::RetryAfterSignal(-1, ::read, FD, Buf.data(), Buf.size()); - if (ssize_t(*BytesRead) == -1) - return std::error_code(errno, std::generic_category()); - return std::error_code(); +Expected readNativeFile(file_t FD, MutableArrayRef Buf) { + ssize_t NumRead = + sys::RetryAfterSignal(-1, ::read, FD, Buf.data(), Buf.size()); + if (ssize_t(NumRead) == -1) + return errorCodeToError(std::error_code(errno, std::generic_category())); + return NumRead; } -std::error_code readNativeFileSlice(file_t FD, MutableArrayRef Buf, - size_t Offset) { - char *BufPtr = Buf.data(); - size_t BytesLeft = Buf.size(); - -#ifndef HAVE_PREAD - // If we don't have pread, seek to Offset. 
- if (lseek(FD, Offset, SEEK_SET) == -1) - return std::error_code(errno, std::generic_category()); -#endif - - while (BytesLeft) { +Expected readNativeFileSlice(file_t FD, MutableArrayRef Buf, + uint64_t Offset) { #ifdef HAVE_PREAD - ssize_t NumRead = sys::RetryAfterSignal(-1, ::pread, FD, BufPtr, BytesLeft, - Buf.size() - BytesLeft + Offset); + ssize_t NumRead = + sys::RetryAfterSignal(-1, ::pread, FD, Buf.data(), Buf.size(), Offset); #else - ssize_t NumRead = sys::RetryAfterSignal(-1, ::read, FD, BufPtr, BytesLeft); + if (lseek(FD, Offset, SEEK_SET) == -1) + return errorCodeToError(std::error_code(errno, std::generic_category())); + ssize_t NumRead = + sys::RetryAfterSignal(-1, ::read, FD, Buf.data(), Buf.size()); #endif - if (NumRead == -1) { - // Error while reading. - return std::error_code(errno, std::generic_category()); - } - if (NumRead == 0) { - memset(BufPtr, 0, BytesLeft); // zero-initialize rest of the buffer. - break; - } - BytesLeft -= NumRead; - BufPtr += NumRead; - } - return std::error_code(); + if (NumRead == -1) + return errorCodeToError(std::error_code(errno, std::generic_category())); + return NumRead; } std::error_code closeFile(file_t &F) { @@ -1200,7 +1187,7 @@ namespace fs { /// implementation. std::error_code copy_file(const Twine &From, const Twine &To) { uint32_t Flag = COPYFILE_DATA; -#if __has_builtin(__builtin_available) +#if __has_builtin(__builtin_available) && defined(COPYFILE_CLONE) if (__builtin_available(macos 10.12, *)) { bool IsSymlink; if (std::error_code Error = is_symlink_file(From, IsSymlink)) diff --git a/lib/Support/Unix/Process.inc b/lib/Support/Unix/Process.inc index 4115ee396582..dfe81d7e2833 100644 --- a/lib/Support/Unix/Process.inc +++ b/lib/Support/Unix/Process.inc @@ -15,8 +15,7 @@ #include "llvm/ADT/StringRef.h" #include "llvm/Config/config.h" #include "llvm/Support/ManagedStatic.h" -#include "llvm/Support/Mutex.h" -#include "llvm/Support/MutexGuard.h" +#include #if HAVE_FCNTL_H #include #endif @@ -327,13 +326,13 @@ extern "C" int tigetnum(char *capname); #endif #ifdef HAVE_TERMINFO -static ManagedStatic TermColorMutex; +static ManagedStatic TermColorMutex; #endif static bool terminalHasColors(int fd) { #ifdef HAVE_TERMINFO // First, acquire a global lock because these C routines are thread hostile. - MutexGuard G(*TermColorMutex); + std::lock_guard G(*TermColorMutex); int errret = 0; if (setupterm(nullptr, fd, &errret) != 0) diff --git a/lib/Support/Unix/Program.inc b/lib/Support/Unix/Program.inc index c4123a64046f..520685a0e987 100644 --- a/lib/Support/Unix/Program.inc +++ b/lib/Support/Unix/Program.inc @@ -136,7 +136,7 @@ static bool RedirectIO_PS(const std::string *Path, int FD, std::string *ErrMsg, if (int Err = posix_spawn_file_actions_addopen( FileActions, FD, File, FD == 0 ? 
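[Editor's note] The new readNativeFileSlice above reads at an explicit offset with pread, which never moves the descriptor's file position, and retries when the call is interrupted by a signal (that is what RetryAfterSignal provides). A bare-bones POSIX sketch; readSliceAt is an invented helper, not an LLVM API, and short reads are left to the caller:

#include <cerrno>
#include <cstddef>
#include <cstdint>
#include <unistd.h>

static ssize_t readSliceAt(int FD, void *Buf, size_t Size, uint64_t Offset) {
  ssize_t NumRead;
  do {
    // pread reads from Offset without disturbing the shared file position.
    NumRead = ::pread(FD, Buf, Size, static_cast<off_t>(Offset));
  } while (NumRead == -1 && errno == EINTR); // retry interrupted reads
  return NumRead; // -1 with errno set on a real error; may be a short read
}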
O_RDONLY : O_WRONLY | O_CREAT, 0666)) - return MakeErrMsg(ErrMsg, "Cannot dup2", Err); + return MakeErrMsg(ErrMsg, "Cannot posix_spawn_file_actions_addopen", Err); return false; } #endif @@ -444,7 +444,7 @@ std::error_code llvm::sys::writeFileWithEncoding(StringRef FileName, StringRef Contents, WindowsEncodingMethod Encoding /*unused*/) { std::error_code EC; - llvm::raw_fd_ostream OS(FileName, EC, llvm::sys::fs::OpenFlags::F_Text); + llvm::raw_fd_ostream OS(FileName, EC, llvm::sys::fs::OpenFlags::OF_Text); if (EC) return EC; diff --git a/lib/Support/Unix/RWMutex.inc b/lib/Support/Unix/RWMutex.inc deleted file mode 100644 index 8b47dfa0f85c..000000000000 --- a/lib/Support/Unix/RWMutex.inc +++ /dev/null @@ -1,50 +0,0 @@ -//= llvm/Support/Unix/RWMutex.inc - Unix Reader/Writer Mutual Exclusion Lock =// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file implements the Unix specific (non-pthread) RWMutex class. -// -//===----------------------------------------------------------------------===// - -//===----------------------------------------------------------------------===// -//=== WARNING: Implementation here must contain only generic UNIX code that -//=== is guaranteed to work on *all* UNIX variants. -//===----------------------------------------------------------------------===// - -#include "llvm/Support/Mutex.h" - -namespace llvm { - -using namespace sys; - -// This naive implementation treats readers the same as writers. This -// will therefore deadlock if a thread tries to acquire a read lock -// multiple times. - -RWMutexImpl::RWMutexImpl() : data_(new MutexImpl(false)) { } - -RWMutexImpl::~RWMutexImpl() { - delete static_cast(data_); -} - -bool RWMutexImpl::reader_acquire() { - return static_cast(data_)->acquire(); -} - -bool RWMutexImpl::reader_release() { - return static_cast(data_)->release(); -} - -bool RWMutexImpl::writer_acquire() { - return static_cast(data_)->acquire(); -} - -bool RWMutexImpl::writer_release() { - return static_cast(data_)->release(); -} - -} diff --git a/lib/Support/Unix/Signals.inc b/lib/Support/Unix/Signals.inc index 634c16aa36c7..5e0cde4a81ed 100644 --- a/lib/Support/Unix/Signals.inc +++ b/lib/Support/Unix/Signals.inc @@ -43,7 +43,6 @@ #include "llvm/Support/Mutex.h" #include "llvm/Support/Program.h" #include "llvm/Support/SaveAndRestore.h" -#include "llvm/Support/UniqueLock.h" #include "llvm/Support/raw_ostream.h" #include #include @@ -83,12 +82,18 @@ using namespace llvm; static RETSIGTYPE SignalHandler(int Sig); // defined below. static RETSIGTYPE InfoSignalHandler(int Sig); // defined below. +static void DefaultPipeSignalFunction() { + exit(EX_IOERR); +} + using SignalHandlerFunctionType = void (*)(); /// The function to call if ctrl-c is pressed. static std::atomic InterruptFunction = ATOMIC_VAR_INIT(nullptr); static std::atomic InfoSignalFunction = ATOMIC_VAR_INIT(nullptr); +static std::atomic PipeSignalFunction = + ATOMIC_VAR_INIT(DefaultPipeSignalFunction); namespace { /// Signal-safe removal of files. @@ -364,7 +369,8 @@ static RETSIGTYPE SignalHandler(int Sig) { // Send a special return code that drivers can check for, from sysexits.h. 
if (Sig == SIGPIPE) - exit(EX_IOERR); + if (SignalHandlerFunctionType CurrentPipeFunction = PipeSignalFunction) + CurrentPipeFunction(); raise(Sig); // Execute the default handler. return; @@ -404,6 +410,11 @@ void llvm::sys::SetInfoSignalFunction(void (*Handler)()) { RegisterHandlers(); } +void llvm::sys::SetPipeSignalFunction(void (*Handler)()) { + PipeSignalFunction.exchange(Handler); + RegisterHandlers(); +} + // The public API bool llvm::sys::RemoveFileOnSignal(StringRef Filename, std::string* ErrMsg) { diff --git a/lib/Support/VirtualFileSystem.cpp b/lib/Support/VirtualFileSystem.cpp index 5d3480e97148..c390cb1b2227 100644 --- a/lib/Support/VirtualFileSystem.cpp +++ b/lib/Support/VirtualFileSystem.cpp @@ -176,9 +176,9 @@ class RealFile : public File { Status S; std::string RealName; - RealFile(file_t FD, StringRef NewName, StringRef NewRealPathName) - : FD(FD), S(NewName, {}, {}, {}, {}, {}, - llvm::sys::fs::file_type::status_error, {}), + RealFile(file_t RawFD, StringRef NewName, StringRef NewRealPathName) + : FD(RawFD), S(NewName, {}, {}, {}, {}, {}, + llvm::sys::fs::file_type::status_error, {}), RealName(NewRealPathName.str()) { assert(FD != kInvalidFile && "Invalid or inactive file descriptor"); } @@ -349,7 +349,7 @@ IntrusiveRefCntPtr vfs::getRealFileSystem() { } std::unique_ptr vfs::createPhysicalFileSystem() { - return llvm::make_unique(false); + return std::make_unique(false); } namespace { @@ -754,7 +754,7 @@ bool InMemoryFileSystem::addFile(const Twine &P, time_t ModificationTime, ResolvedUser, ResolvedGroup, 0, sys::fs::file_type::directory_file, NewDirectoryPerms); Dir = cast(Dir->addChild( - Name, llvm::make_unique(std::move(Stat)))); + Name, std::make_unique(std::move(Stat)))); continue; } @@ -989,6 +989,16 @@ std::error_code InMemoryFileSystem::isLocal(const Twine &Path, bool &Result) { // RedirectingFileSystem implementation //===-----------------------------------------------------------------------===/ +RedirectingFileSystem::RedirectingFileSystem(IntrusiveRefCntPtr FS) + : ExternalFS(std::move(FS)) { + if (ExternalFS) + if (auto ExternalWorkingDirectory = + ExternalFS->getCurrentWorkingDirectory()) { + WorkingDirectory = *ExternalWorkingDirectory; + ExternalFSValidWD = true; + } +} + // FIXME: reuse implementation common with OverlayFSDirIterImpl as these // iterators are conceptually similar. class llvm::vfs::VFSFromYamlDirIterImpl @@ -1035,12 +1045,27 @@ public: llvm::ErrorOr RedirectingFileSystem::getCurrentWorkingDirectory() const { - return ExternalFS->getCurrentWorkingDirectory(); + return WorkingDirectory; } std::error_code RedirectingFileSystem::setCurrentWorkingDirectory(const Twine &Path) { - return ExternalFS->setCurrentWorkingDirectory(Path); + // Don't change the working directory if the path doesn't exist. + if (!exists(Path)) + return errc::no_such_file_or_directory; + + // Always change the external FS but ignore its result. 
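[Editor's note] The Signals.inc change above replaces the hard-coded exit(EX_IOERR) on SIGPIPE with an overridable hook held in an atomic function pointer. A rough sketch of that shape only; defaultPipeAction, setPipeAction and onSignal are invented names, and 74 is EX_IOERR from <sysexits.h>:

#include <atomic>
#include <csignal>
#include <cstdlib>

using HandlerFn = void (*)();

// Default action: die with the sysexits.h I/O-error code, as before.
static void defaultPipeAction() { std::_Exit(74 /* EX_IOERR */); }

// Atomic so a replacement installed by the client is published safely to the
// asynchronous signal handler.
static std::atomic<HandlerFn> PipeAction{defaultPipeAction};

void setPipeAction(HandlerFn Fn) { PipeAction.exchange(Fn); }

// Registered with std::signal(SIGPIPE, onSignal); runs whatever action is
// currently installed instead of unconditionally exiting.
void onSignal(int Sig) {
  if (Sig == SIGPIPE)
    if (HandlerFn Fn = PipeAction.load())
      Fn();
}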
+ if (ExternalFS) { + auto EC = ExternalFS->setCurrentWorkingDirectory(Path); + ExternalFSValidWD = !static_cast(EC); + } + + SmallString<128> AbsolutePath; + Path.toVector(AbsolutePath); + if (std::error_code EC = makeAbsolute(AbsolutePath)) + return EC; + WorkingDirectory = AbsolutePath.str(); + return {}; } std::error_code RedirectingFileSystem::isLocal(const Twine &Path, @@ -1053,7 +1078,7 @@ directory_iterator RedirectingFileSystem::dir_begin(const Twine &Dir, ErrorOr E = lookupPath(Dir); if (!E) { EC = E.getError(); - if (IsFallthrough && EC == errc::no_such_file_or_directory) + if (shouldUseExternalFS() && EC == errc::no_such_file_or_directory) return ExternalFS->dir_begin(Dir, EC); return {}; } @@ -1071,7 +1096,7 @@ directory_iterator RedirectingFileSystem::dir_begin(const Twine &Dir, auto *D = cast(*E); return directory_iterator(std::make_shared( Dir, D->contents_begin(), D->contents_end(), - /*IterateExternalFS=*/IsFallthrough, *ExternalFS, EC)); + /*IterateExternalFS=*/shouldUseExternalFS(), *ExternalFS, EC)); } void RedirectingFileSystem::setExternalContentsPrefixDir(StringRef PrefixDir) { @@ -1082,20 +1107,19 @@ StringRef RedirectingFileSystem::getExternalContentsPrefixDir() const { return ExternalContentsPrefixDir; } -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -LLVM_DUMP_METHOD void RedirectingFileSystem::dump() const { +void RedirectingFileSystem::dump(raw_ostream &OS) const { for (const auto &Root : Roots) - dumpEntry(Root.get()); + dumpEntry(OS, Root.get()); } -LLVM_DUMP_METHOD void -RedirectingFileSystem::dumpEntry(RedirectingFileSystem::Entry *E, - int NumSpaces) const { +void RedirectingFileSystem::dumpEntry(raw_ostream &OS, + RedirectingFileSystem::Entry *E, + int NumSpaces) const { StringRef Name = E->getName(); for (int i = 0, e = NumSpaces; i < e; ++i) - dbgs() << " "; - dbgs() << "'" << Name.str().c_str() << "'" - << "\n"; + OS << " "; + OS << "'" << Name.str().c_str() << "'" + << "\n"; if (E->getKind() == RedirectingFileSystem::EK_Directory) { auto *DE = dyn_cast(E); @@ -1103,9 +1127,12 @@ RedirectingFileSystem::dumpEntry(RedirectingFileSystem::Entry *E, for (std::unique_ptr &SubEntry : llvm::make_range(DE->contents_begin(), DE->contents_end())) - dumpEntry(SubEntry.get(), NumSpaces + 2); + dumpEntry(OS, SubEntry.get(), NumSpaces + 2); } } + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +LLVM_DUMP_METHOD void RedirectingFileSystem::dump() const { dump(dbgs()); } #endif /// A helper class to hold the common YAML parsing state. @@ -1209,7 +1236,7 @@ class llvm::vfs::RedirectingFileSystemParser { // ... or create a new one std::unique_ptr E = - llvm::make_unique( + std::make_unique( Name, Status("", getNextVirtualUniqueID(), std::chrono::system_clock::now(), 0, 0, 0, file_type::directory_file, sys::fs::all_all)); @@ -1221,7 +1248,7 @@ class llvm::vfs::RedirectingFileSystemParser { } auto *DE = - dyn_cast(ParentEntry); + cast(ParentEntry); DE->addContent(std::move(E)); return DE->getLastContent(); } @@ -1232,9 +1259,7 @@ class llvm::vfs::RedirectingFileSystemParser { StringRef Name = SrcE->getName(); switch (SrcE->getKind()) { case RedirectingFileSystem::EK_Directory: { - auto *DE = - dyn_cast(SrcE); - assert(DE && "Must be a directory"); + auto *DE = cast(SrcE); // Empty directories could be present in the YAML as a way to // describe a file for a current directory after some of its subdir // is parsed. This only leads to redundant walks, ignore it. 
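[Editor's note] Several RedirectingFileSystem hunks here replace a bare IsFallthrough test with shouldUseExternalFS() before delegating to the external filesystem. The two-layer lookup shape, sketched with a made-up Layer interface rather than the real llvm::vfs API: consult the overlay first and fall through only when the path is simply not mapped, never when it fails for another reason:

#include <string>
#include <system_error>

struct Layer {
  virtual ~Layer() = default;
  virtual std::error_code statPath(const std::string &Path) = 0;
};

std::error_code statWithFallthrough(Layer &Overlay, Layer *External,
                                    bool AllowFallthrough,
                                    const std::string &Path) {
  std::error_code EC = Overlay.statPath(Path);
  if (EC == std::errc::no_such_file_or_directory && AllowFallthrough &&
      External)
    return External->statPath(Path); // not mapped: ask the real filesystem
  return EC; // mapped entry, or an error that must not be masked
}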
@@ -1246,13 +1271,12 @@ class llvm::vfs::RedirectingFileSystemParser { break; } case RedirectingFileSystem::EK_File: { - auto *FE = dyn_cast(SrcE); - assert(FE && "Must be a file"); assert(NewParentE && "Parent entry must exist"); - auto *DE = dyn_cast( - NewParentE); + auto *FE = cast(SrcE); + auto *DE = + cast(NewParentE); DE->addContent( - llvm::make_unique( + std::make_unique( Name, FE->getExternalContentsPath(), FE->getUseName())); break; } @@ -1423,12 +1447,12 @@ class llvm::vfs::RedirectingFileSystemParser { std::unique_ptr Result; switch (Kind) { case RedirectingFileSystem::EK_File: - Result = llvm::make_unique( + Result = std::make_unique( LastComponent, std::move(ExternalContentsPath), UseExternalName); break; case RedirectingFileSystem::EK_Directory: Result = - llvm::make_unique( + std::make_unique( LastComponent, std::move(EntryArrayContents), Status("", getNextVirtualUniqueID(), std::chrono::system_clock::now(), 0, 0, 0, @@ -1447,7 +1471,7 @@ class llvm::vfs::RedirectingFileSystemParser { std::vector> Entries; Entries.push_back(std::move(Result)); Result = - llvm::make_unique( + std::make_unique( *I, std::move(Entries), Status("", getNextVirtualUniqueID(), std::chrono::system_clock::now(), 0, 0, 0, @@ -1573,7 +1597,7 @@ RedirectingFileSystem::create(std::unique_ptr Buffer, RedirectingFileSystemParser P(Stream); std::unique_ptr FS( - new RedirectingFileSystem(std::move(ExternalFS))); + new RedirectingFileSystem(ExternalFS)); if (!YAMLFilePath.empty()) { // Use the YAML path from -ivfsoverlay to compute the dir to be prefixed @@ -1702,7 +1726,7 @@ ErrorOr RedirectingFileSystem::status(const Twine &Path, ErrorOr RedirectingFileSystem::status(const Twine &Path) { ErrorOr Result = lookupPath(Path); if (!Result) { - if (IsFallthrough && + if (shouldUseExternalFS() && Result.getError() == llvm::errc::no_such_file_or_directory) { return ExternalFS->status(Path); } @@ -1740,7 +1764,7 @@ ErrorOr> RedirectingFileSystem::openFileForRead(const Twine &Path) { ErrorOr E = lookupPath(Path); if (!E) { - if (IsFallthrough && + if (shouldUseExternalFS() && E.getError() == llvm::errc::no_such_file_or_directory) { return ExternalFS->openFileForRead(Path); } @@ -1763,7 +1787,7 @@ RedirectingFileSystem::openFileForRead(const Twine &Path) { Status S = getRedirectedFileStatus(Path, F->useExternalName(UseExternalNames), *ExternalStatus); return std::unique_ptr( - llvm::make_unique(std::move(*Result), S)); + std::make_unique(std::move(*Result), S)); } std::error_code @@ -1771,7 +1795,7 @@ RedirectingFileSystem::getRealPath(const Twine &Path, SmallVectorImpl &Output) const { ErrorOr Result = lookupPath(Path); if (!Result) { - if (IsFallthrough && + if (shouldUseExternalFS() && Result.getError() == llvm::errc::no_such_file_or_directory) { return ExternalFS->getRealPath(Path, Output); } @@ -1784,8 +1808,8 @@ RedirectingFileSystem::getRealPath(const Twine &Path, } // Even if there is a directory entry, fall back to ExternalFS if allowed, // because directories don't have a single external contents path. - return IsFallthrough ? ExternalFS->getRealPath(Path, Output) - : llvm::errc::invalid_argument; + return shouldUseExternalFS() ? 
ExternalFS->getRealPath(Path, Output) + : llvm::errc::invalid_argument; } IntrusiveRefCntPtr diff --git a/lib/Support/Windows/Mutex.inc b/lib/Support/Windows/Mutex.inc deleted file mode 100644 index b55b14febf2c..000000000000 --- a/lib/Support/Windows/Mutex.inc +++ /dev/null @@ -1,56 +0,0 @@ -//===- llvm/Support/Win32/Mutex.inc - Win32 Mutex Implementation -*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file implements the Win32 specific (non-pthread) Mutex class. -// -//===----------------------------------------------------------------------===// - -//===----------------------------------------------------------------------===// -//=== WARNING: Implementation here must contain only generic Win32 code that -//=== is guaranteed to work on *all* Win32 variants. -//===----------------------------------------------------------------------===// - -#include "WindowsSupport.h" -#include "llvm/Support/Mutex.h" - -namespace llvm { - -sys::MutexImpl::MutexImpl(bool /*recursive*/) -{ - data_ = new CRITICAL_SECTION; - InitializeCriticalSection((LPCRITICAL_SECTION)data_); -} - -sys::MutexImpl::~MutexImpl() -{ - DeleteCriticalSection((LPCRITICAL_SECTION)data_); - delete (LPCRITICAL_SECTION)data_; - data_ = 0; -} - -bool -sys::MutexImpl::acquire() -{ - EnterCriticalSection((LPCRITICAL_SECTION)data_); - return true; -} - -bool -sys::MutexImpl::release() -{ - LeaveCriticalSection((LPCRITICAL_SECTION)data_); - return true; -} - -bool -sys::MutexImpl::tryacquire() -{ - return TryEnterCriticalSection((LPCRITICAL_SECTION)data_); -} - -} diff --git a/lib/Support/Windows/Path.inc b/lib/Support/Windows/Path.inc index 5704930aeecc..c3b13abef5de 100644 --- a/lib/Support/Windows/Path.inc +++ b/lib/Support/Windows/Path.inc @@ -371,13 +371,19 @@ static std::error_code realPathFromHandle(HANDLE H, if (std::error_code EC = realPathFromHandle(H, Buffer)) return EC; - const wchar_t *Data = Buffer.data(); + // Strip the \\?\ prefix. We don't want it ending up in output, and such + // paths don't get canonicalized by file APIs. + wchar_t *Data = Buffer.data(); DWORD CountChars = Buffer.size(); - if (CountChars >= 4) { - if (0 == ::memcmp(Data, L"\\\\?\\", 8)) { - CountChars -= 4; - Data += 4; - } + if (CountChars >= 8 && ::memcmp(Data, L"\\\\?\\UNC\\", 16) == 0) { + // Convert \\?\UNC\foo\bar to \\foo\bar + CountChars -= 6; + Data += 6; + Data[0] = '\\'; + } else if (CountChars >= 4 && ::memcmp(Data, L"\\\\?\\", 8) == 0) { + // Convert \\?\c:\foo to c:\foo + CountChars -= 4; + Data += 4; } // Convert the result from UTF-16 to UTF-8. @@ -1217,57 +1223,34 @@ file_t getStdinHandle() { return ::GetStdHandle(STD_INPUT_HANDLE); } file_t getStdoutHandle() { return ::GetStdHandle(STD_OUTPUT_HANDLE); } file_t getStderrHandle() { return ::GetStdHandle(STD_ERROR_HANDLE); } -std::error_code readNativeFileImpl(file_t FileHandle, char *BufPtr, size_t BytesToRead, - size_t *BytesRead, OVERLAPPED *Overlap) { +Expected readNativeFileImpl(file_t FileHandle, + MutableArrayRef Buf, + OVERLAPPED *Overlap) { // ReadFile can only read 2GB at a time. The caller should check the number of // bytes and read in a loop until termination. 
- DWORD BytesToRead32 = - std::min(size_t(std::numeric_limits::max()), BytesToRead); - DWORD BytesRead32 = 0; - bool Success = - ::ReadFile(FileHandle, BufPtr, BytesToRead32, &BytesRead32, Overlap); - *BytesRead = BytesRead32; - if (!Success) { - DWORD Err = ::GetLastError(); - // Pipe EOF is not an error. - if (Err == ERROR_BROKEN_PIPE) - return std::error_code(); - return mapWindowsError(Err); - } - return std::error_code(); -} - -std::error_code readNativeFile(file_t FileHandle, MutableArrayRef Buf, - size_t *BytesRead) { - return readNativeFileImpl(FileHandle, Buf.data(), Buf.size(), BytesRead, - /*Overlap=*/nullptr); -} - -std::error_code readNativeFileSlice(file_t FileHandle, - MutableArrayRef Buf, size_t Offset) { - char *BufPtr = Buf.data(); - size_t BytesLeft = Buf.size(); - - while (BytesLeft) { - uint64_t CurOff = Buf.size() - BytesLeft + Offset; - OVERLAPPED Overlapped = {}; - Overlapped.Offset = uint32_t(CurOff); - Overlapped.OffsetHigh = uint32_t(uint64_t(CurOff) >> 32); - - size_t BytesRead = 0; - if (auto EC = readNativeFileImpl(FileHandle, BufPtr, BytesLeft, &BytesRead, - &Overlapped)) - return EC; - - // Once we reach EOF, zero the remaining bytes in the buffer. - if (BytesRead == 0) { - memset(BufPtr, 0, BytesLeft); - break; - } - BytesLeft -= BytesRead; - BufPtr += BytesRead; - } - return std::error_code(); + DWORD BytesToRead = + std::min(size_t(std::numeric_limits::max()), Buf.size()); + DWORD BytesRead = 0; + if (::ReadFile(FileHandle, Buf.data(), BytesToRead, &BytesRead, Overlap)) + return BytesRead; + DWORD Err = ::GetLastError(); + // EOF is not an error. + if (Err == ERROR_BROKEN_PIPE || Err == ERROR_HANDLE_EOF) + return BytesRead; + return errorCodeToError(mapWindowsError(Err)); +} + +Expected readNativeFile(file_t FileHandle, MutableArrayRef Buf) { + return readNativeFileImpl(FileHandle, Buf, /*Overlap=*/nullptr); +} + +Expected readNativeFileSlice(file_t FileHandle, + MutableArrayRef Buf, + uint64_t Offset) { + OVERLAPPED Overlapped = {}; + Overlapped.Offset = uint32_t(Offset); + Overlapped.OffsetHigh = uint32_t(Offset >> 32); + return readNativeFileImpl(FileHandle, Buf, &Overlapped); } std::error_code closeFile(file_t &F) { diff --git a/lib/Support/Windows/Program.inc b/lib/Support/Windows/Program.inc index 0f54e59ee55b..a23ed95fc390 100644 --- a/lib/Support/Windows/Program.inc +++ b/lib/Support/Windows/Program.inc @@ -470,7 +470,7 @@ std::error_code llvm::sys::writeFileWithEncoding(StringRef FileName, StringRef Contents, WindowsEncodingMethod Encoding) { std::error_code EC; - llvm::raw_fd_ostream OS(FileName, EC, llvm::sys::fs::F_Text); + llvm::raw_fd_ostream OS(FileName, EC, llvm::sys::fs::OF_Text); if (EC) return EC; diff --git a/lib/Support/Windows/RWMutex.inc b/lib/Support/Windows/RWMutex.inc deleted file mode 100644 index 8df9bc394160..000000000000 --- a/lib/Support/Windows/RWMutex.inc +++ /dev/null @@ -1,128 +0,0 @@ -//= llvm/Support/Win32/Mutex.inc - Win32 Reader/Writer Mutual Exclusion Lock =// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file implements the Win32 specific (non-pthread) RWMutex class. 
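[Editor's note] The Windows readNativeFileSlice above passes the slice offset through OVERLAPPED, which stores the 64-bit position as two 32-bit fields. A small, platform-neutral check of that split; FakeOverlapped merely mimics the Offset/OffsetHigh pair so the round-trip can be verified without <windows.h>:

#include <cassert>
#include <cstdint>

struct FakeOverlapped {
  uint32_t Offset;     // low 32 bits of the file position
  uint32_t OffsetHigh; // high 32 bits of the file position
};

int main() {
  const uint64_t SliceOffset = 0x123456789ULL;

  FakeOverlapped Ov = {};
  Ov.Offset = static_cast<uint32_t>(SliceOffset);            // truncating cast
  Ov.OffsetHigh = static_cast<uint32_t>(SliceOffset >> 32);  // upper half

  const uint64_t Rebuilt =
      (static_cast<uint64_t>(Ov.OffsetHigh) << 32) | Ov.Offset;
  assert(Rebuilt == SliceOffset);
  return 0;
}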
-// -//===----------------------------------------------------------------------===// - -//===----------------------------------------------------------------------===// -//=== WARNING: Implementation here must contain only generic Win32 code that -//=== is guaranteed to work on *all* Win32 variants. -//===----------------------------------------------------------------------===// - -#include "WindowsSupport.h" - -namespace llvm { - -// Windows has slim read-writer lock support on Vista and higher, so we -// will attempt to load the APIs. If they exist, we will use them, and -// if not, we will fall back on critical sections. When we drop support -// for XP, we can stop lazy-loading these APIs and just use them directly. -#if defined(__MINGW32__) - // Taken from WinNT.h - typedef struct _RTL_SRWLOCK { - PVOID Ptr; - } RTL_SRWLOCK, *PRTL_SRWLOCK; - - // Taken from WinBase.h - typedef RTL_SRWLOCK SRWLOCK, *PSRWLOCK; -#endif - -static VOID (WINAPI *fpInitializeSRWLock)(PSRWLOCK lock) = NULL; -static VOID (WINAPI *fpAcquireSRWLockExclusive)(PSRWLOCK lock) = NULL; -static VOID (WINAPI *fpAcquireSRWLockShared)(PSRWLOCK lock) = NULL; -static VOID (WINAPI *fpReleaseSRWLockExclusive)(PSRWLOCK lock) = NULL; -static VOID (WINAPI *fpReleaseSRWLockShared)(PSRWLOCK lock) = NULL; - -static bool sHasSRW = false; - -static bool loadSRW() { - static bool sChecked = false; - if (!sChecked) { - sChecked = true; - - if (HMODULE hLib = ::GetModuleHandleW(L"Kernel32.dll")) { - fpInitializeSRWLock = - (VOID (WINAPI *)(PSRWLOCK))::GetProcAddress(hLib, - "InitializeSRWLock"); - fpAcquireSRWLockExclusive = - (VOID (WINAPI *)(PSRWLOCK))::GetProcAddress(hLib, - "AcquireSRWLockExclusive"); - fpAcquireSRWLockShared = - (VOID (WINAPI *)(PSRWLOCK))::GetProcAddress(hLib, - "AcquireSRWLockShared"); - fpReleaseSRWLockExclusive = - (VOID (WINAPI *)(PSRWLOCK))::GetProcAddress(hLib, - "ReleaseSRWLockExclusive"); - fpReleaseSRWLockShared = - (VOID (WINAPI *)(PSRWLOCK))::GetProcAddress(hLib, - "ReleaseSRWLockShared"); - - if (fpInitializeSRWLock != NULL) { - sHasSRW = true; - } - } - } - return sHasSRW; -} - -sys::RWMutexImpl::RWMutexImpl() { - if (loadSRW()) { - data_ = safe_calloc(1, sizeof(SRWLOCK)); - fpInitializeSRWLock(static_cast(data_)); - } else { - data_ = safe_calloc(1, sizeof(CRITICAL_SECTION)); - InitializeCriticalSection(static_cast(data_)); - } -} - -sys::RWMutexImpl::~RWMutexImpl() { - if (!sHasSRW) - DeleteCriticalSection(static_cast(data_)); - // Nothing to do in the case of slim reader/writers except free the memory. 
- free(data_); -} - -bool sys::RWMutexImpl::reader_acquire() { - if (sHasSRW) { - fpAcquireSRWLockShared(static_cast(data_)); - } else { - EnterCriticalSection(static_cast(data_)); - } - return true; -} - -bool sys::RWMutexImpl::reader_release() { - if (sHasSRW) { - fpReleaseSRWLockShared(static_cast(data_)); - } else { - LeaveCriticalSection(static_cast(data_)); - } - return true; -} - -bool sys::RWMutexImpl::writer_acquire() { - if (sHasSRW) { - fpAcquireSRWLockExclusive(static_cast(data_)); - } else { - EnterCriticalSection(static_cast(data_)); - } - return true; -} - -bool sys::RWMutexImpl::writer_release() { - if (sHasSRW) { - fpReleaseSRWLockExclusive(static_cast(data_)); - } else { - LeaveCriticalSection(static_cast(data_)); - } - return true; -} - - -} diff --git a/lib/Support/Windows/Signals.inc b/lib/Support/Windows/Signals.inc index 6a820ef22b1e..d962daf79348 100644 --- a/lib/Support/Windows/Signals.inc +++ b/lib/Support/Windows/Signals.inc @@ -560,6 +560,9 @@ void llvm::sys::SetInfoSignalFunction(void (*Handler)()) { // Unimplemented. } +void llvm::sys::SetPipeSignalFunction(void (*Handler)()) { + // Unimplemented. +} /// Add a function to be called when a signal is delivered to the process. The /// handler can have a cookie passed to it to identify what instance of the diff --git a/lib/Support/Windows/WindowsSupport.h b/lib/Support/Windows/WindowsSupport.h index fed9b2f462ef..2e2e97430b76 100644 --- a/lib/Support/Windows/WindowsSupport.h +++ b/lib/Support/Windows/WindowsSupport.h @@ -38,6 +38,7 @@ #include "llvm/ADT/StringRef.h" #include "llvm/ADT/Twine.h" #include "llvm/Config/config.h" // Get build system configuration settings +#include "llvm/Support/Allocator.h" #include "llvm/Support/Chrono.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/VersionTuple.h" diff --git a/lib/Support/Windows/explicit_symbols.inc b/lib/Support/Windows/explicit_symbols.inc index bbbf7ea6a777..0a4fda1d4e8c 100644 --- a/lib/Support/Windows/explicit_symbols.inc +++ b/lib/Support/Windows/explicit_symbols.inc @@ -90,12 +90,6 @@ INLINE_DEF_FLOAT_SYMBOL(tanf, 1) INLINE_DEF_FLOAT_SYMBOL(tanhf, 1) - // These were added in VS 2013. 
-#if (1800 <= _MSC_VER && _MSC_VER < 1900) - INLINE_DEF_FLOAT_SYMBOL(copysignf, 2) - INLINE_DEF_FLOAT_SYMBOL(fminf, 2) - INLINE_DEF_FLOAT_SYMBOL(fmaxf, 2) -#endif #undef INLINE_DEF_FLOAT_SYMBOL #endif diff --git a/lib/Support/YAMLTraits.cpp b/lib/Support/YAMLTraits.cpp index 09eb36943de9..eba22fd14725 100644 --- a/lib/Support/YAMLTraits.cpp +++ b/lib/Support/YAMLTraits.cpp @@ -40,7 +40,7 @@ IO::IO(void *Context) : Ctxt(Context) {} IO::~IO() = default; -void *IO::getContext() { +void *IO::getContext() const { return Ctxt; } @@ -79,7 +79,7 @@ void Input::ScalarHNode::anchor() {} void Input::MapHNode::anchor() {} void Input::SequenceHNode::anchor() {} -bool Input::outputting() { +bool Input::outputting() const { return false; } @@ -377,12 +377,12 @@ std::unique_ptr Input::createHNodes(Node *N) { // Copy string to permanent storage KeyStr = StringStorage.str().copy(StringAllocator); } - return llvm::make_unique(N, KeyStr); + return std::make_unique(N, KeyStr); } else if (BlockScalarNode *BSN = dyn_cast(N)) { StringRef ValueCopy = BSN->getValue().copy(StringAllocator); - return llvm::make_unique(N, ValueCopy); + return std::make_unique(N, ValueCopy); } else if (SequenceNode *SQ = dyn_cast(N)) { - auto SQHNode = llvm::make_unique(N); + auto SQHNode = std::make_unique(N); for (Node &SN : *SQ) { auto Entry = createHNodes(&SN); if (EC) @@ -391,7 +391,7 @@ std::unique_ptr Input::createHNodes(Node *N) { } return std::move(SQHNode); } else if (MappingNode *Map = dyn_cast(N)) { - auto mapHNode = llvm::make_unique(N); + auto mapHNode = std::make_unique(N); for (KeyValueNode &KVN : *Map) { Node *KeyNode = KVN.getKey(); ScalarNode *Key = dyn_cast(KeyNode); @@ -416,7 +416,7 @@ std::unique_ptr Input::createHNodes(Node *N) { } return std::move(mapHNode); } else if (isa(N)) { - return llvm::make_unique(N); + return std::make_unique(N); } else { setError(N, "unknown node kind"); return nullptr; @@ -440,7 +440,7 @@ Output::Output(raw_ostream &yout, void *context, int WrapColumn) Output::~Output() = default; -bool Output::outputting() { +bool Output::outputting() const { return true; } diff --git a/lib/Support/Z3Solver.cpp b/lib/Support/Z3Solver.cpp index f1a6fdf87cf2..a83d0f441a4b 100644 --- a/lib/Support/Z3Solver.cpp +++ b/lib/Support/Z3Solver.cpp @@ -886,7 +886,7 @@ public: llvm::SMTSolverRef llvm::CreateZ3Solver() { #if LLVM_WITH_Z3 - return llvm::make_unique(); + return std::make_unique(); #else llvm::report_fatal_error("LLVM was not compiled with Z3 support, rebuild " "with -DLLVM_ENABLE_Z3_SOLVER=ON", diff --git a/lib/Support/raw_ostream.cpp b/lib/Support/raw_ostream.cpp index 2baccaa0cbd7..b9989371f5ea 100644 --- a/lib/Support/raw_ostream.cpp +++ b/lib/Support/raw_ostream.cpp @@ -65,6 +65,17 @@ using namespace llvm; +const raw_ostream::Colors raw_ostream::BLACK; +const raw_ostream::Colors raw_ostream::RED; +const raw_ostream::Colors raw_ostream::GREEN; +const raw_ostream::Colors raw_ostream::YELLOW; +const raw_ostream::Colors raw_ostream::BLUE; +const raw_ostream::Colors raw_ostream::MAGENTA; +const raw_ostream::Colors raw_ostream::CYAN; +const raw_ostream::Colors raw_ostream::WHITE; +const raw_ostream::Colors raw_ostream::SAVEDCOLOR; +const raw_ostream::Colors raw_ostream::RESET; + raw_ostream::~raw_ostream() { // raw_ostream's subclasses should take care to flush the buffer // in their destructors. 
@@ -133,6 +144,14 @@ raw_ostream &raw_ostream::write_hex(unsigned long long N) { return *this; } +raw_ostream &raw_ostream::operator<<(Colors C) { + if (C == Colors::RESET) + resetColor(); + else + changeColor(C); + return *this; +} + raw_ostream &raw_ostream::write_uuid(const uuid_t UUID) { for (int Idx = 0; Idx < 16; ++Idx) { *this << format("%02" PRIX32, UUID[Idx]); @@ -784,11 +803,15 @@ size_t raw_fd_ostream::preferred_buffer_size() const { raw_ostream &raw_fd_ostream::changeColor(enum Colors colors, bool bold, bool bg) { + if (!ColorEnabled) + return *this; + if (sys::Process::ColorNeedsFlush()) flush(); const char *colorcode = - (colors == SAVEDCOLOR) ? sys::Process::OutputBold(bg) - : sys::Process::OutputColor(colors, bold, bg); + (colors == SAVEDCOLOR) + ? sys::Process::OutputBold(bg) + : sys::Process::OutputColor(static_cast(colors), bold, bg); if (colorcode) { size_t len = strlen(colorcode); write(colorcode, len); @@ -799,6 +822,9 @@ raw_ostream &raw_fd_ostream::changeColor(enum Colors colors, bool bold, } raw_ostream &raw_fd_ostream::resetColor() { + if (!ColorEnabled) + return *this; + if (sys::Process::ColorNeedsFlush()) flush(); const char *colorcode = sys::Process::ResetColor(); @@ -812,6 +838,9 @@ raw_ostream &raw_fd_ostream::resetColor() { } raw_ostream &raw_fd_ostream::reverseColor() { + if (!ColorEnabled) + return *this; + if (sys::Process::ColorNeedsFlush()) flush(); const char *colorcode = sys::Process::OutputReverse(); @@ -843,7 +872,7 @@ void raw_fd_ostream::anchor() {} raw_ostream &llvm::outs() { // Set buffer settings to model stdout behavior. std::error_code EC; - static raw_fd_ostream S("-", EC, sys::fs::F_None); + static raw_fd_ostream S("-", EC, sys::fs::OF_None); assert(!EC); return S; } diff --git a/lib/Support/regcomp.c b/lib/Support/regcomp.c index 12669ab75d1a..ee2a1d87a267 100644 --- a/lib/Support/regcomp.c +++ b/lib/Support/regcomp.c @@ -48,6 +48,7 @@ #include "regex2.h" #include "llvm/Config/config.h" +#include "llvm/Support/Compiler.h" /* character-class table */ static struct cclass { @@ -537,7 +538,7 @@ p_ere_exp(struct parse *p) break; case '{': /* okay as ordinary except if digit follows */ REQUIRE(!MORE() || !isdigit((uch)PEEK()), REG_BADRPT); - /* FALLTHROUGH */ + LLVM_FALLTHROUGH; default: ordinary(p, c); break; @@ -733,7 +734,7 @@ p_simp_re(struct parse *p, break; case '*': REQUIRE(starordinary, REG_BADRPT); - /* FALLTHROUGH */ + LLVM_FALLTHROUGH; default: ordinary(p, (char)c); break; @@ -1635,7 +1636,7 @@ findmust(struct parse *p, struct re_guts *g) return; } } while (OP(s) != O_QUEST && OP(s) != O_CH); - /* fallthrough */ + LLVM_FALLTHROUGH; default: /* things that break a sequence */ if (newlen > g->mlen) { /* ends one */ start = newstart; diff --git a/lib/TableGen/Error.cpp b/lib/TableGen/Error.cpp index 7523b32ca0e5..54b063cb4f8d 100644 --- a/lib/TableGen/Error.cpp +++ b/lib/TableGen/Error.cpp @@ -39,6 +39,8 @@ static void PrintMessage(ArrayRef Loc, SourceMgr::DiagKind Kind, "instantiated from multiclass"); } +void PrintNote(const Twine &Msg) { WithColor::note() << Msg << "\n"; } + void PrintNote(ArrayRef NoteLoc, const Twine &Msg) { PrintMessage(NoteLoc, SourceMgr::DK_Note, Msg); } diff --git a/lib/TableGen/Main.cpp b/lib/TableGen/Main.cpp index bcd39584e450..48ded6c45a46 100644 --- a/lib/TableGen/Main.cpp +++ b/lib/TableGen/Main.cpp @@ -49,6 +49,9 @@ static cl::list MacroNames("D", cl::desc("Name of the macro to be defined"), cl::value_desc("macro name"), cl::Prefix); +static cl::opt +WriteIfChanged("write-if-changed", cl::desc("Only 
write output if it changed")); + static int reportError(const char *ProgName, Twine Msg) { errs() << ProgName << ": " << Msg; errs().flush(); @@ -64,7 +67,7 @@ static int createDependencyFile(const TGParser &Parser, const char *argv0) { return reportError(argv0, "the option -d must be used together with -o\n"); std::error_code EC; - ToolOutputFile DepOut(DependFilename, EC, sys::fs::F_Text); + ToolOutputFile DepOut(DependFilename, EC, sys::fs::OF_None); if (EC) return reportError(argv0, "error opening " + DependFilename + ":" + EC.message() + "\n"); @@ -114,15 +117,17 @@ int llvm::TableGenMain(char *argv0, TableGenMainFn *MainFn) { return Ret; } - // Only updates the real output file if there are any differences. - // This prevents recompilation of all the files depending on it if there - // aren't any. - if (auto ExistingOrErr = MemoryBuffer::getFile(OutputFilename)) - if (std::move(ExistingOrErr.get())->getBuffer() == Out.str()) - return 0; + if (WriteIfChanged) { + // Only updates the real output file if there are any differences. + // This prevents recompilation of all the files depending on it if there + // aren't any. + if (auto ExistingOrErr = MemoryBuffer::getFile(OutputFilename)) + if (std::move(ExistingOrErr.get())->getBuffer() == Out.str()) + return 0; + } std::error_code EC; - ToolOutputFile OutFile(OutputFilename, EC, sys::fs::F_Text); + ToolOutputFile OutFile(OutputFilename, EC, sys::fs::OF_None); if (EC) return reportError(argv0, "error opening " + OutputFilename + ":" + EC.message() + "\n"); diff --git a/lib/TableGen/Record.cpp b/lib/TableGen/Record.cpp index 27d1bdc7f4c3..835ef8c7141b 100644 --- a/lib/TableGen/Record.cpp +++ b/lib/TableGen/Record.cpp @@ -438,7 +438,7 @@ Init *BitsInit::resolveReferences(Resolver &R) const { CachedBitVarRef = CurBitVar->getBitVar(); CachedBitVarResolved = CachedBitVarRef->resolveReferences(R); } - + assert(CachedBitVarResolved && "Unresolved bitvar reference"); NewBit = CachedBitVarResolved->getBit(CurBitVar->getBitNum()); } else { // getBit(0) implicitly converts int and bits<1> values to bit. 
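[Editor's note] The -write-if-changed path above skips rewriting the TableGen output when the generated text is identical, so the file's timestamp stays put and targets that depend on it are not rebuilt. A rough equivalent with plain iostreams, assuming nothing about MemoryBuffer or ToolOutputFile; writeIfChanged is an invented helper:

#include <fstream>
#include <sstream>
#include <string>

// Returns false if the file already holds exactly NewContents (nothing to do).
static bool writeIfChanged(const std::string &Path,
                           const std::string &NewContents) {
  std::ifstream In(Path, std::ios::binary);
  if (In) {
    std::ostringstream Existing;
    Existing << In.rdbuf();
    if (Existing.str() == NewContents)
      return false; // unchanged: keep the old timestamp
  }
  std::ofstream Out(Path, std::ios::binary | std::ios::trunc);
  Out << NewContents;
  return true;
}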
@@ -1616,7 +1616,7 @@ void VarDefInit::Profile(FoldingSetNodeID &ID) const { DefInit *VarDefInit::instantiate() { if (!Def) { RecordKeeper &Records = Class->getRecords(); - auto NewRecOwner = make_unique(Records.getNewAnonymousName(), + auto NewRecOwner = std::make_unique(Records.getNewAnonymousName(), Class->getLoc(), Records, /*IsAnonymous=*/true); Record *NewRec = NewRecOwner.get(); @@ -1930,6 +1930,13 @@ void DagInit::Profile(FoldingSetNodeID &ID) const { ProfileDagInit(ID, Val, ValName, makeArrayRef(getTrailingObjects(), NumArgs), makeArrayRef(getTrailingObjects(), NumArgNames)); } +Record *DagInit::getOperatorAsDef(ArrayRef Loc) const { + if (DefInit *DefI = dyn_cast(Val)) + return DefI->getDef(); + PrintFatalError(Loc, "Expected record as operator"); + return nullptr; +} + Init *DagInit::resolveReferences(Resolver &R) const { SmallVector NewArgs; NewArgs.reserve(arg_size()); diff --git a/lib/TableGen/SetTheory.cpp b/lib/TableGen/SetTheory.cpp index a870e41d58f8..5a30ee98cce9 100644 --- a/lib/TableGen/SetTheory.cpp +++ b/lib/TableGen/SetTheory.cpp @@ -255,16 +255,16 @@ void SetTheory::Operator::anchor() {} void SetTheory::Expander::anchor() {} SetTheory::SetTheory() { - addOperator("add", llvm::make_unique()); - addOperator("sub", llvm::make_unique()); - addOperator("and", llvm::make_unique()); - addOperator("shl", llvm::make_unique()); - addOperator("trunc", llvm::make_unique()); - addOperator("rotl", llvm::make_unique(false)); - addOperator("rotr", llvm::make_unique(true)); - addOperator("decimate", llvm::make_unique()); - addOperator("interleave", llvm::make_unique()); - addOperator("sequence", llvm::make_unique()); + addOperator("add", std::make_unique()); + addOperator("sub", std::make_unique()); + addOperator("and", std::make_unique()); + addOperator("shl", std::make_unique()); + addOperator("trunc", std::make_unique()); + addOperator("rotl", std::make_unique(false)); + addOperator("rotr", std::make_unique(true)); + addOperator("decimate", std::make_unique()); + addOperator("interleave", std::make_unique()); + addOperator("sequence", std::make_unique()); } void SetTheory::addOperator(StringRef Name, std::unique_ptr Op) { @@ -276,7 +276,7 @@ void SetTheory::addExpander(StringRef ClassName, std::unique_ptr E) { } void SetTheory::addFieldExpander(StringRef ClassName, StringRef FieldName) { - addExpander(ClassName, llvm::make_unique(FieldName)); + addExpander(ClassName, std::make_unique(FieldName)); } void SetTheory::evaluate(Init *Expr, RecSet &Elts, ArrayRef Loc) { diff --git a/lib/TableGen/TGLexer.cpp b/lib/TableGen/TGLexer.cpp index d28c62b3133d..da2286e41fe5 100644 --- a/lib/TableGen/TGLexer.cpp +++ b/lib/TableGen/TGLexer.cpp @@ -51,7 +51,7 @@ TGLexer::TGLexer(SourceMgr &SM, ArrayRef Macros) : SrcMgr(SM) { // Pretend that we enter the "top-level" include file. PrepIncludeStack.push_back( - make_unique>()); + std::make_unique>()); // Put all macros defined in the command line into the DefinedMacros set. 
std::for_each(Macros.begin(), Macros.end(), @@ -393,7 +393,7 @@ bool TGLexer::LexInclude() { CurPtr = CurBuf.begin(); PrepIncludeStack.push_back( - make_unique>()); + std::make_unique>()); return false; } diff --git a/lib/TableGen/TGParser.cpp b/lib/TableGen/TGParser.cpp index a9ace152d59e..c373e2899a5d 100644 --- a/lib/TableGen/TGParser.cpp +++ b/lib/TableGen/TGParser.cpp @@ -378,7 +378,7 @@ bool TGParser::resolve(const ForeachLoop &Loop, SubstStack &Substs, auto LI = dyn_cast(List); if (!LI) { if (!Final) { - Dest->emplace_back(make_unique(Loop.Loc, Loop.IterVar, + Dest->emplace_back(std::make_unique(Loop.Loc, Loop.IterVar, List)); return resolve(Loop.Entries, Substs, Final, &Dest->back().Loop->Entries, Loc); @@ -413,7 +413,7 @@ bool TGParser::resolve(const std::vector &Source, if (E.Loop) { Error = resolve(*E.Loop, Substs, Final, Dest); } else { - auto Rec = make_unique(*E.Rec); + auto Rec = std::make_unique(*E.Rec); if (Loc) Rec->appendLoc(*Loc); @@ -1147,9 +1147,9 @@ Init *TGParser::ParseOperation(Record *CurRec, RecTy *ItemType) { if (!InitList.back()) return nullptr; // All BinOps require their arguments to be of compatible types. - TypedInit *TI = dyn_cast(InitList.back()); + RecTy *ListType = cast(InitList.back())->getType(); if (!ArgType) { - ArgType = TI->getType(); + ArgType = ListType; switch (Code) { case BinOpInit::LISTCONCAT: @@ -1198,11 +1198,11 @@ Init *TGParser::ParseOperation(Record *CurRec, RecTy *ItemType) { default: llvm_unreachable("other ops have fixed argument types"); } } else { - RecTy *Resolved = resolveTypes(ArgType, TI->getType()); + RecTy *Resolved = resolveTypes(ArgType, ListType); if (!Resolved) { Error(InitLoc, Twine("expected value of type '") + - ArgType->getAsString() + "', got '" + - TI->getType()->getAsString() + "'"); + ArgType->getAsString() + "', got '" + + ListType->getAsString() + "'"); return nullptr; } if (Code != BinOpInit::ADD && Code != BinOpInit::AND && @@ -1330,7 +1330,7 @@ Init *TGParser::ParseOperation(Record *CurRec, RecTy *ItemType) { std::unique_ptr ParseRecTmp; Record *ParseRec = CurRec; if (!ParseRec) { - ParseRecTmp = make_unique(".parse", ArrayRef{}, Records); + ParseRecTmp = std::make_unique(".parse", ArrayRef{}, Records); ParseRec = ParseRecTmp.get(); } @@ -1597,7 +1597,7 @@ Init *TGParser::ParseOperation(Record *CurRec, RecTy *ItemType) { std::unique_ptr ParseRecTmp; Record *ParseRec = CurRec; if (!ParseRec) { - ParseRecTmp = make_unique(".parse", ArrayRef{}, Records); + ParseRecTmp = std::make_unique(".parse", ArrayRef{}, Records); ParseRec = ParseRecTmp.get(); } @@ -2702,10 +2702,10 @@ bool TGParser::ParseDef(MultiClass *CurMultiClass) { return true; if (isa(Name)) - CurRec = make_unique(Records.getNewAnonymousName(), DefLoc, Records, + CurRec = std::make_unique(Records.getNewAnonymousName(), DefLoc, Records, /*Anonymous=*/true); else - CurRec = make_unique(Name, DefLoc, Records); + CurRec = std::make_unique(Name, DefLoc, Records); if (ParseObjectBody(CurRec.get())) return true; @@ -2783,7 +2783,7 @@ bool TGParser::ParseForeach(MultiClass *CurMultiClass) { Lex.Lex(); // Eat the in // Create a loop object and remember it. - Loops.push_back(llvm::make_unique(Loc, IterName, ListValue)); + Loops.push_back(std::make_unique(Loc, IterName, ListValue)); if (Lex.getCode() != tgtok::l_brace) { // FOREACH Declaration IN Object @@ -2834,7 +2834,7 @@ bool TGParser::ParseClass() { } else { // If this is the first reference to this class, create and add it. 
auto NewRec = - llvm::make_unique(Lex.getCurStrVal(), Lex.getLoc(), Records, + std::make_unique(Lex.getCurStrVal(), Lex.getLoc(), Records, /*Class=*/true); CurRec = NewRec.get(); Records.addClass(std::move(NewRec)); @@ -2963,7 +2963,7 @@ bool TGParser::ParseMultiClass() { auto Result = MultiClasses.insert(std::make_pair(Name, - llvm::make_unique(Name, Lex.getLoc(),Records))); + std::make_unique(Name, Lex.getLoc(),Records))); if (!Result.second) return TokError("multiclass '" + Name + "' already defined"); diff --git a/lib/Target/AArch64/AArch64.h b/lib/Target/AArch64/AArch64.h index 6965403a25ab..ac765ebcddc0 100644 --- a/lib/Target/AArch64/AArch64.h +++ b/lib/Target/AArch64/AArch64.h @@ -55,8 +55,9 @@ FunctionPass *createAArch64CollectLOHPass(); InstructionSelector * createAArch64InstructionSelector(const AArch64TargetMachine &, AArch64Subtarget &, AArch64RegisterBankInfo &); -FunctionPass *createAArch64PreLegalizeCombiner(); -FunctionPass *createAArch64StackTaggingPass(); +FunctionPass *createAArch64PreLegalizeCombiner(bool IsOptNone); +FunctionPass *createAArch64StackTaggingPass(bool MergeInit); +FunctionPass *createAArch64StackTaggingPreRAPass(); void initializeAArch64A53Fix835769Pass(PassRegistry&); void initializeAArch64A57FPLoadBalancingPass(PassRegistry&); @@ -80,6 +81,7 @@ void initializeFalkorHWPFFixPass(PassRegistry&); void initializeFalkorMarkStridedAccessesLegacyPass(PassRegistry&); void initializeLDTLSCleanupPass(PassRegistry&); void initializeAArch64StackTaggingPass(PassRegistry&); +void initializeAArch64StackTaggingPreRAPass(PassRegistry&); } // end namespace llvm #endif diff --git a/lib/Target/AArch64/AArch64.td b/lib/Target/AArch64/AArch64.td index e39c6995e367..5b4c9e2149da 100644 --- a/lib/Target/AArch64/AArch64.td +++ b/lib/Target/AArch64/AArch64.td @@ -115,11 +115,12 @@ def FeatureSVE2SM4 : SubtargetFeature<"sve2-sm4", "HasSVE2SM4", "true", def FeatureSVE2SHA3 : SubtargetFeature<"sve2-sha3", "HasSVE2SHA3", "true", "Enable SHA3 SVE2 instructions", [FeatureSVE2, FeatureSHA3]>; -def FeatureSVE2BitPerm : SubtargetFeature<"bitperm", "HasSVE2BitPerm", "true", +def FeatureSVE2BitPerm : SubtargetFeature<"sve2-bitperm", "HasSVE2BitPerm", "true", "Enable bit permutation SVE2 instructions", [FeatureSVE2]>; def FeatureZCRegMove : SubtargetFeature<"zcm", "HasZeroCycleRegMove", "true", "Has zero-cycle register moves">; + def FeatureZCZeroingGP : SubtargetFeature<"zcz-gp", "HasZeroCycleZeroingGP", "true", "Has zero-cycle zeroing instructions for generic registers">; @@ -284,6 +285,10 @@ def FeatureSEL2 : SubtargetFeature< "sel2", "HasSEL2", "true", "Enable v8.4-A Secure Exception Level 2 extension">; +def FeaturePMU : SubtargetFeature< + "pmu", "HasPMU", "true", + "Enable v8.4-A PMU extension">; + def FeatureTLB_RMI : SubtargetFeature< "tlb-rmi", "HasTLB_RMI", "true", "Enable v8.4-A TLB Range and Maintenance Instructions">; @@ -345,6 +350,21 @@ def FeatureRandGen : SubtargetFeature<"rand", "HasRandGen", def FeatureMTE : SubtargetFeature<"mte", "HasMTE", "true", "Enable Memory Tagging Extension" >; +def FeatureTRBE : SubtargetFeature<"trbe", "HasTRBE", + "true", "Enable Trace Buffer Extension">; + +def FeatureETE : SubtargetFeature<"ete", "HasETE", + "true", "Enable Embedded Trace Extension", + [FeatureTRBE]>; + +def FeatureTME : SubtargetFeature<"tme", "HasTME", + "true", "Enable Transactional Memory Extension" >; + +def FeatureTaggedGlobals : SubtargetFeature<"tagged-globals", + "AllowTaggedGlobals", + "true", "Use an instruction sequence for taking the address of a global " + 
"that allows a memory tag in the upper address bits">; + //===----------------------------------------------------------------------===// // Architectures. // @@ -354,7 +374,7 @@ def HasV8_1aOps : SubtargetFeature<"v8.1a", "HasV8_1aOps", "true", FeaturePAN, FeatureLOR, FeatureVH]>; def HasV8_2aOps : SubtargetFeature<"v8.2a", "HasV8_2aOps", "true", - "Support ARM v8.2a instructions", [HasV8_1aOps, FeaturePsUAO, + "Support ARM v8.2a instructions", [HasV8_1aOps, FeaturePsUAO, FeaturePAN_RWV, FeatureRAS, FeatureCCPP]>; def HasV8_3aOps : SubtargetFeature<"v8.3a", "HasV8_3aOps", "true", @@ -364,7 +384,7 @@ def HasV8_3aOps : SubtargetFeature<"v8.3a", "HasV8_3aOps", "true", def HasV8_4aOps : SubtargetFeature<"v8.4a", "HasV8_4aOps", "true", "Support ARM v8.4a instructions", [HasV8_3aOps, FeatureDotProd, FeatureNV, FeatureRASv8_4, FeatureMPAM, FeatureDIT, - FeatureTRACEV8_4, FeatureAM, FeatureSEL2, FeatureTLB_RMI, + FeatureTRACEV8_4, FeatureAM, FeatureSEL2, FeaturePMU, FeatureTLB_RMI, FeatureFMI, FeatureRCPC_IMMO]>; def HasV8_5aOps : SubtargetFeature< @@ -390,6 +410,7 @@ include "AArch64Schedule.td" include "AArch64InstrInfo.td" include "AArch64SchedPredicates.td" include "AArch64SchedPredExynos.td" +include "AArch64Combine.td" def AArch64InstrInfo : InstrInfo; @@ -484,6 +505,19 @@ def ProcA57 : SubtargetFeature<"a57", "ARMProcFamily", "CortexA57", FeaturePredictableSelectIsExpensive ]>; +def ProcA65 : SubtargetFeature<"a65", "ARMProcFamily", "CortexA65", + "Cortex-A65 ARM processors", [ + HasV8_2aOps, + FeatureCrypto, + FeatureDotProd, + FeatureFPARMv8, + FeatureFullFP16, + FeatureNEON, + FeatureRAS, + FeatureRCPC, + FeatureSSBS, + ]>; + def ProcA72 : SubtargetFeature<"a72", "ARMProcFamily", "CortexA72", "Cortex-A72 ARM processors", [ FeatureCRC, @@ -641,6 +675,33 @@ def ProcFalkor : SubtargetFeature<"falkor", "ARMProcFamily", "Falkor", FeatureSlowSTRQro ]>; +def ProcNeoverseE1 : SubtargetFeature<"neoversee1", "ARMProcFamily", + "NeoverseE1", + "Neoverse E1 ARM processors", [ + HasV8_2aOps, + FeatureCrypto, + FeatureDotProd, + FeatureFPARMv8, + FeatureFullFP16, + FeatureNEON, + FeatureRCPC, + FeatureSSBS, + ]>; + +def ProcNeoverseN1 : SubtargetFeature<"neoversen1", "ARMProcFamily", + "NeoverseN1", + "Neoverse N1 ARM processors", [ + HasV8_2aOps, + FeatureCrypto, + FeatureDotProd, + FeatureFPARMv8, + FeatureFullFP16, + FeatureNEON, + FeatureRCPC, + FeatureSPE, + FeatureSSBS, + ]>; + def ProcSaphira : SubtargetFeature<"saphira", "ARMProcFamily", "Saphira", "Qualcomm Saphira processors", [ FeatureCrypto, @@ -732,19 +793,28 @@ def : ProcessorModel<"generic", NoSchedModel, [ FeatureFuseAES, FeatureNEON, FeaturePerfMon, - FeaturePostRAScheduler + FeaturePostRAScheduler, +// ETE and TRBE are future architecture extensions. We temporariliy enable them +// by default for users targeting generic AArch64, until it is decided in which +// armv8.x-a architecture revision they will end up. The extensions do not +// affect code generated by the compiler and can be used only by explicitly +// mentioning the new system register names in assembly. + FeatureETE ]>; -// FIXME: Cortex-A35 and Cortex-A55 are currently modeled as a Cortex-A53. 
def : ProcessorModel<"cortex-a35", CortexA53Model, [ProcA35]>; def : ProcessorModel<"cortex-a53", CortexA53Model, [ProcA53]>; def : ProcessorModel<"cortex-a55", CortexA53Model, [ProcA55]>; def : ProcessorModel<"cortex-a57", CortexA57Model, [ProcA57]>; +def : ProcessorModel<"cortex-a65", CortexA53Model, [ProcA65]>; +def : ProcessorModel<"cortex-a65ae", CortexA53Model, [ProcA65]>; def : ProcessorModel<"cortex-a72", CortexA57Model, [ProcA72]>; def : ProcessorModel<"cortex-a73", CortexA57Model, [ProcA73]>; def : ProcessorModel<"cortex-a75", CortexA57Model, [ProcA75]>; def : ProcessorModel<"cortex-a76", CortexA57Model, [ProcA76]>; def : ProcessorModel<"cortex-a76ae", CortexA57Model, [ProcA76]>; +def : ProcessorModel<"neoverse-e1", CortexA53Model, [ProcNeoverseE1]>; +def : ProcessorModel<"neoverse-n1", CortexA57Model, [ProcNeoverseN1]>; def : ProcessorModel<"cyclone", CycloneModel, [ProcCyclone]>; def : ProcessorModel<"exynos-m1", ExynosM1Model, [ProcExynosM1]>; def : ProcessorModel<"exynos-m2", ExynosM1Model, [ProcExynosM2]>; diff --git a/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp b/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp index 92c8c4955d50..13d389cec7a0 100644 --- a/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp +++ b/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp @@ -552,7 +552,7 @@ bool AArch64A57FPLoadBalancing::colorChain(Chain *G, Color C, std::vector ToErase; for (auto &U : I.operands()) { if (U.isReg() && U.isUse() && Substs.find(U.getReg()) != Substs.end()) { - unsigned OrigReg = U.getReg(); + Register OrigReg = U.getReg(); U.setReg(Substs[OrigReg]); if (U.isKill()) // Don't erase straight away, because there may be other operands @@ -611,12 +611,12 @@ void AArch64A57FPLoadBalancing::scanInstruction( // Create a new chain. Multiplies don't require forwarding so can go on any // unit. - unsigned DestReg = MI->getOperand(0).getReg(); + Register DestReg = MI->getOperand(0).getReg(); LLVM_DEBUG(dbgs() << "New chain started for register " << printReg(DestReg, TRI) << " at " << *MI); - auto G = llvm::make_unique(MI, Idx, getColor(DestReg)); + auto G = std::make_unique(MI, Idx, getColor(DestReg)); ActiveChains[DestReg] = G.get(); AllChains.push_back(std::move(G)); @@ -624,8 +624,8 @@ void AArch64A57FPLoadBalancing::scanInstruction( // It is beneficial to keep MLAs on the same functional unit as their // accumulator operand. 
- unsigned DestReg = MI->getOperand(0).getReg(); - unsigned AccumReg = MI->getOperand(3).getReg(); + Register DestReg = MI->getOperand(0).getReg(); + Register AccumReg = MI->getOperand(3).getReg(); maybeKillChain(MI->getOperand(1), Idx, ActiveChains); maybeKillChain(MI->getOperand(2), Idx, ActiveChains); @@ -661,7 +661,7 @@ void AArch64A57FPLoadBalancing::scanInstruction( LLVM_DEBUG(dbgs() << "Creating new chain for dest register " << printReg(DestReg, TRI) << "\n"); - auto G = llvm::make_unique(MI, Idx, getColor(DestReg)); + auto G = std::make_unique(MI, Idx, getColor(DestReg)); ActiveChains[DestReg] = G.get(); AllChains.push_back(std::move(G)); diff --git a/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp b/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp index 89404463e1f0..981b366c14b1 100644 --- a/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp +++ b/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp @@ -105,14 +105,14 @@ static bool isGPR64(unsigned Reg, unsigned SubReg, const MachineRegisterInfo *MRI) { if (SubReg) return false; - if (TargetRegisterInfo::isVirtualRegister(Reg)) + if (Register::isVirtualRegister(Reg)) return MRI->getRegClass(Reg)->hasSuperClassEq(&AArch64::GPR64RegClass); return AArch64::GPR64RegClass.contains(Reg); } static bool isFPR64(unsigned Reg, unsigned SubReg, const MachineRegisterInfo *MRI) { - if (TargetRegisterInfo::isVirtualRegister(Reg)) + if (Register::isVirtualRegister(Reg)) return (MRI->getRegClass(Reg)->hasSuperClassEq(&AArch64::FPR64RegClass) && SubReg == 0) || (MRI->getRegClass(Reg)->hasSuperClassEq(&AArch64::FPR128RegClass) && @@ -201,8 +201,8 @@ bool AArch64AdvSIMDScalar::isProfitableToTransform( unsigned NumNewCopies = 3; unsigned NumRemovableCopies = 0; - unsigned OrigSrc0 = MI.getOperand(1).getReg(); - unsigned OrigSrc1 = MI.getOperand(2).getReg(); + Register OrigSrc0 = MI.getOperand(1).getReg(); + Register OrigSrc1 = MI.getOperand(2).getReg(); unsigned SubReg0; unsigned SubReg1; if (!MRI->def_empty(OrigSrc0)) { @@ -236,7 +236,7 @@ bool AArch64AdvSIMDScalar::isProfitableToTransform( // any of the uses is a transformable instruction, it's likely the tranforms // will chain, enabling us to save a copy there, too. This is an aggressive // heuristic that approximates the graph based cost analysis described above. - unsigned Dst = MI.getOperand(0).getReg(); + Register Dst = MI.getOperand(0).getReg(); bool AllUsesAreCopies = true; for (MachineRegisterInfo::use_instr_nodbg_iterator Use = MRI->use_instr_nodbg_begin(Dst), @@ -293,8 +293,8 @@ void AArch64AdvSIMDScalar::transformInstruction(MachineInstr &MI) { assert(OldOpc != NewOpc && "transform an instruction to itself?!"); // Check if we need a copy for the source registers. - unsigned OrigSrc0 = MI.getOperand(1).getReg(); - unsigned OrigSrc1 = MI.getOperand(2).getReg(); + Register OrigSrc0 = MI.getOperand(1).getReg(); + Register OrigSrc1 = MI.getOperand(2).getReg(); unsigned Src0 = 0, SubReg0; unsigned Src1 = 0, SubReg1; bool KillSrc0 = false, KillSrc1 = false; @@ -354,7 +354,7 @@ void AArch64AdvSIMDScalar::transformInstruction(MachineInstr &MI) { // Create a vreg for the destination. // FIXME: No need to do this if the ultimate user expects an FPR64. // Check for that and avoid the copy if possible. 
- unsigned Dst = MRI->createVirtualRegister(&AArch64::FPR64RegClass); + Register Dst = MRI->createVirtualRegister(&AArch64::FPR64RegClass); // For now, all of the new instructions have the same simple three-register // form, so no need to special case based on what instruction we're diff --git a/lib/Target/AArch64/AArch64AsmPrinter.cpp b/lib/Target/AArch64/AArch64AsmPrinter.cpp index 094fbd999523..7ea7915c2ca6 100644 --- a/lib/Target/AArch64/AArch64AsmPrinter.cpp +++ b/lib/Target/AArch64/AArch64AsmPrinter.cpp @@ -99,7 +99,8 @@ public: void LowerPATCHABLE_FUNCTION_EXIT(const MachineInstr &MI); void LowerPATCHABLE_TAIL_CALL(const MachineInstr &MI); - std::map, MCSymbol *> HwasanMemaccessSymbols; + typedef std::tuple HwasanMemaccessTuple; + std::map HwasanMemaccessSymbols; void LowerHWASAN_CHECK_MEMACCESS(const MachineInstr &MI); void EmitHwasanMemaccessSymbols(Module &M); @@ -150,7 +151,7 @@ private: void printOperand(const MachineInstr *MI, unsigned OpNum, raw_ostream &O); bool printAsmMRegister(const MachineOperand &MO, char Mode, raw_ostream &O); bool printAsmRegInClass(const MachineOperand &MO, - const TargetRegisterClass *RC, bool isVector, + const TargetRegisterClass *RC, unsigned AltName, raw_ostream &O); bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNum, @@ -236,9 +237,12 @@ void AArch64AsmPrinter::EmitSled(const MachineInstr &MI, SledKind Kind) } void AArch64AsmPrinter::LowerHWASAN_CHECK_MEMACCESS(const MachineInstr &MI) { - unsigned Reg = MI.getOperand(0).getReg(); + Register Reg = MI.getOperand(0).getReg(); + bool IsShort = + MI.getOpcode() == AArch64::HWASAN_CHECK_MEMACCESS_SHORTGRANULES; uint32_t AccessInfo = MI.getOperand(1).getImm(); - MCSymbol *&Sym = HwasanMemaccessSymbols[{Reg, AccessInfo}]; + MCSymbol *&Sym = + HwasanMemaccessSymbols[HwasanMemaccessTuple(Reg, IsShort, AccessInfo)]; if (!Sym) { // FIXME: Make this work on non-ELF. if (!TM.getTargetTriple().isOSBinFormatELF()) @@ -246,6 +250,8 @@ void AArch64AsmPrinter::LowerHWASAN_CHECK_MEMACCESS(const MachineInstr &MI) { std::string SymName = "__hwasan_check_x" + utostr(Reg - AArch64::X0) + "_" + utostr(AccessInfo); + if (IsShort) + SymName += "_short"; Sym = OutContext.getOrCreateSymbol(SymName); } @@ -263,15 +269,22 @@ void AArch64AsmPrinter::EmitHwasanMemaccessSymbols(Module &M) { std::unique_ptr STI( TM.getTarget().createMCSubtargetInfo(TT.str(), "", "")); - MCSymbol *HwasanTagMismatchSym = + MCSymbol *HwasanTagMismatchV1Sym = OutContext.getOrCreateSymbol("__hwasan_tag_mismatch"); + MCSymbol *HwasanTagMismatchV2Sym = + OutContext.getOrCreateSymbol("__hwasan_tag_mismatch_v2"); - const MCSymbolRefExpr *HwasanTagMismatchRef = - MCSymbolRefExpr::create(HwasanTagMismatchSym, OutContext); + const MCSymbolRefExpr *HwasanTagMismatchV1Ref = + MCSymbolRefExpr::create(HwasanTagMismatchV1Sym, OutContext); + const MCSymbolRefExpr *HwasanTagMismatchV2Ref = + MCSymbolRefExpr::create(HwasanTagMismatchV2Sym, OutContext); for (auto &P : HwasanMemaccessSymbols) { - unsigned Reg = P.first.first; - uint32_t AccessInfo = P.first.second; + unsigned Reg = std::get<0>(P.first); + bool IsShort = std::get<1>(P.first); + uint32_t AccessInfo = std::get<2>(P.first); + const MCSymbolRefExpr *HwasanTagMismatchRef = + IsShort ? 
HwasanTagMismatchV2Ref : HwasanTagMismatchV1Ref; MCSymbol *Sym = P.second; OutStreamer->SwitchSection(OutContext.getELFSection( @@ -304,82 +317,86 @@ void AArch64AsmPrinter::EmitHwasanMemaccessSymbols(Module &M) { .addReg(Reg) .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSR, 56)), *STI); - MCSymbol *HandlePartialSym = OutContext.createTempSymbol(); + MCSymbol *HandleMismatchOrPartialSym = OutContext.createTempSymbol(); OutStreamer->EmitInstruction( MCInstBuilder(AArch64::Bcc) .addImm(AArch64CC::NE) - .addExpr(MCSymbolRefExpr::create(HandlePartialSym, OutContext)), + .addExpr(MCSymbolRefExpr::create(HandleMismatchOrPartialSym, + OutContext)), *STI); MCSymbol *ReturnSym = OutContext.createTempSymbol(); OutStreamer->EmitLabel(ReturnSym); OutStreamer->EmitInstruction( MCInstBuilder(AArch64::RET).addReg(AArch64::LR), *STI); + OutStreamer->EmitLabel(HandleMismatchOrPartialSym); - OutStreamer->EmitLabel(HandlePartialSym); - OutStreamer->EmitInstruction(MCInstBuilder(AArch64::SUBSWri) - .addReg(AArch64::WZR) - .addReg(AArch64::W16) - .addImm(15) - .addImm(0), - *STI); - MCSymbol *HandleMismatchSym = OutContext.createTempSymbol(); - OutStreamer->EmitInstruction( - MCInstBuilder(AArch64::Bcc) - .addImm(AArch64CC::HI) - .addExpr(MCSymbolRefExpr::create(HandleMismatchSym, OutContext)), - *STI); - - OutStreamer->EmitInstruction( - MCInstBuilder(AArch64::ANDXri) - .addReg(AArch64::X17) - .addReg(Reg) - .addImm(AArch64_AM::encodeLogicalImmediate(0xf, 64)), - *STI); - unsigned Size = 1 << (AccessInfo & 0xf); - if (Size != 1) - OutStreamer->EmitInstruction(MCInstBuilder(AArch64::ADDXri) - .addReg(AArch64::X17) - .addReg(AArch64::X17) - .addImm(Size - 1) + if (IsShort) { + OutStreamer->EmitInstruction(MCInstBuilder(AArch64::SUBSWri) + .addReg(AArch64::WZR) + .addReg(AArch64::W16) + .addImm(15) .addImm(0), *STI); - OutStreamer->EmitInstruction(MCInstBuilder(AArch64::SUBSWrs) - .addReg(AArch64::WZR) - .addReg(AArch64::W16) - .addReg(AArch64::W17) - .addImm(0), - *STI); - OutStreamer->EmitInstruction( - MCInstBuilder(AArch64::Bcc) - .addImm(AArch64CC::LS) - .addExpr(MCSymbolRefExpr::create(HandleMismatchSym, OutContext)), - *STI); - - OutStreamer->EmitInstruction( - MCInstBuilder(AArch64::ORRXri) - .addReg(AArch64::X16) - .addReg(Reg) - .addImm(AArch64_AM::encodeLogicalImmediate(0xf, 64)), - *STI); - OutStreamer->EmitInstruction(MCInstBuilder(AArch64::LDRBBui) - .addReg(AArch64::W16) - .addReg(AArch64::X16) - .addImm(0), - *STI); - OutStreamer->EmitInstruction( - MCInstBuilder(AArch64::SUBSXrs) - .addReg(AArch64::XZR) - .addReg(AArch64::X16) - .addReg(Reg) - .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSR, 56)), - *STI); - OutStreamer->EmitInstruction( - MCInstBuilder(AArch64::Bcc) - .addImm(AArch64CC::EQ) - .addExpr(MCSymbolRefExpr::create(ReturnSym, OutContext)), - *STI); + MCSymbol *HandleMismatchSym = OutContext.createTempSymbol(); + OutStreamer->EmitInstruction( + MCInstBuilder(AArch64::Bcc) + .addImm(AArch64CC::HI) + .addExpr(MCSymbolRefExpr::create(HandleMismatchSym, OutContext)), + *STI); + + OutStreamer->EmitInstruction( + MCInstBuilder(AArch64::ANDXri) + .addReg(AArch64::X17) + .addReg(Reg) + .addImm(AArch64_AM::encodeLogicalImmediate(0xf, 64)), + *STI); + unsigned Size = 1 << (AccessInfo & 0xf); + if (Size != 1) + OutStreamer->EmitInstruction(MCInstBuilder(AArch64::ADDXri) + .addReg(AArch64::X17) + .addReg(AArch64::X17) + .addImm(Size - 1) + .addImm(0), + *STI); + OutStreamer->EmitInstruction(MCInstBuilder(AArch64::SUBSWrs) + .addReg(AArch64::WZR) + .addReg(AArch64::W16) + .addReg(AArch64::W17) 
+ .addImm(0), + *STI); + OutStreamer->EmitInstruction( + MCInstBuilder(AArch64::Bcc) + .addImm(AArch64CC::LS) + .addExpr(MCSymbolRefExpr::create(HandleMismatchSym, OutContext)), + *STI); + + OutStreamer->EmitInstruction( + MCInstBuilder(AArch64::ORRXri) + .addReg(AArch64::X16) + .addReg(Reg) + .addImm(AArch64_AM::encodeLogicalImmediate(0xf, 64)), + *STI); + OutStreamer->EmitInstruction(MCInstBuilder(AArch64::LDRBBui) + .addReg(AArch64::W16) + .addReg(AArch64::X16) + .addImm(0), + *STI); + OutStreamer->EmitInstruction( + MCInstBuilder(AArch64::SUBSXrs) + .addReg(AArch64::XZR) + .addReg(AArch64::X16) + .addReg(Reg) + .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSR, 56)), + *STI); + OutStreamer->EmitInstruction( + MCInstBuilder(AArch64::Bcc) + .addImm(AArch64CC::EQ) + .addExpr(MCSymbolRefExpr::create(ReturnSym, OutContext)), + *STI); + + OutStreamer->EmitLabel(HandleMismatchSym); + } - OutStreamer->EmitLabel(HandleMismatchSym); OutStreamer->EmitInstruction(MCInstBuilder(AArch64::STPXpre) .addReg(AArch64::SP) .addReg(AArch64::X0) @@ -414,16 +431,16 @@ void AArch64AsmPrinter::EmitHwasanMemaccessSymbols(Module &M) { MCInstBuilder(AArch64::ADRP) .addReg(AArch64::X16) .addExpr(AArch64MCExpr::create( - HwasanTagMismatchRef, - AArch64MCExpr::VariantKind::VK_GOT_PAGE, OutContext)), + HwasanTagMismatchRef, AArch64MCExpr::VariantKind::VK_GOT_PAGE, + OutContext)), *STI); OutStreamer->EmitInstruction( MCInstBuilder(AArch64::LDRXui) .addReg(AArch64::X16) .addReg(AArch64::X16) .addExpr(AArch64MCExpr::create( - HwasanTagMismatchRef, - AArch64MCExpr::VariantKind::VK_GOT_LO12, OutContext)), + HwasanTagMismatchRef, AArch64MCExpr::VariantKind::VK_GOT_LO12, + OutContext)), *STI); OutStreamer->EmitInstruction( MCInstBuilder(AArch64::BR).addReg(AArch64::X16), *STI); @@ -485,15 +502,14 @@ void AArch64AsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNum, default: llvm_unreachable(""); case MachineOperand::MO_Register: { - unsigned Reg = MO.getReg(); - assert(TargetRegisterInfo::isPhysicalRegister(Reg)); + Register Reg = MO.getReg(); + assert(Register::isPhysicalRegister(Reg)); assert(!MO.getSubReg() && "Subregs should be eliminated!"); O << AArch64InstPrinter::getRegisterName(Reg); break; } case MachineOperand::MO_Immediate: { - int64_t Imm = MO.getImm(); - O << '#' << Imm; + O << MO.getImm(); break; } case MachineOperand::MO_GlobalAddress: { @@ -510,7 +526,7 @@ void AArch64AsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNum, bool AArch64AsmPrinter::printAsmMRegister(const MachineOperand &MO, char Mode, raw_ostream &O) { - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); switch (Mode) { default: return true; // Unknown mode. @@ -531,14 +547,13 @@ bool AArch64AsmPrinter::printAsmMRegister(const MachineOperand &MO, char Mode, // printing. bool AArch64AsmPrinter::printAsmRegInClass(const MachineOperand &MO, const TargetRegisterClass *RC, - bool isVector, raw_ostream &O) { + unsigned AltName, raw_ostream &O) { assert(MO.isReg() && "Should only get here with a register!"); const TargetRegisterInfo *RI = STI->getRegisterInfo(); - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); unsigned RegToPrint = RC->getRegister(RI->getEncodingValue(Reg)); assert(RI->regsOverlap(RegToPrint, Reg)); - O << AArch64InstPrinter::getRegisterName( - RegToPrint, isVector ? 
AArch64::vreg : AArch64::NoRegAltName); + O << AArch64InstPrinter::getRegisterName(RegToPrint, AltName); return false; } @@ -574,6 +589,7 @@ bool AArch64AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum, case 's': // Print S register. case 'd': // Print D register. case 'q': // Print Q register. + case 'z': // Print Z register. if (MO.isReg()) { const TargetRegisterClass *RC; switch (ExtraCode[0]) { @@ -592,10 +608,13 @@ bool AArch64AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum, case 'q': RC = &AArch64::FPR128RegClass; break; + case 'z': + RC = &AArch64::ZPRRegClass; + break; default: return true; } - return printAsmRegInClass(MO, RC, false /* vector */, O); + return printAsmRegInClass(MO, RC, AArch64::NoRegAltName, O); } printOperand(MI, OpNum, O); return false; @@ -605,16 +624,26 @@ bool AArch64AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum, // According to ARM, we should emit x and v registers unless we have a // modifier. if (MO.isReg()) { - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); // If this is a w or x register, print an x register. if (AArch64::GPR32allRegClass.contains(Reg) || AArch64::GPR64allRegClass.contains(Reg)) return printAsmMRegister(MO, 'x', O); + unsigned AltName = AArch64::NoRegAltName; + const TargetRegisterClass *RegClass; + if (AArch64::ZPRRegClass.contains(Reg)) { + RegClass = &AArch64::ZPRRegClass; + } else if (AArch64::PPRRegClass.contains(Reg)) { + RegClass = &AArch64::PPRRegClass; + } else { + RegClass = &AArch64::FPR128RegClass; + AltName = AArch64::vreg; + } + // If this is a b, h, s, d, or q register, print it as a v register. - return printAsmRegInClass(MO, &AArch64::FPR128RegClass, true /* vector */, - O); + return printAsmRegInClass(MO, RegClass, AltName, O); } printOperand(MI, OpNum, O); @@ -682,7 +711,7 @@ void AArch64AsmPrinter::EmitJumpTableInfo() { if (JTBBs.empty()) continue; unsigned Size = AFI->getJumpTableEntrySize(JTI); - EmitAlignment(Log2_32(Size)); + EmitAlignment(Align(Size)); OutStreamer->EmitLabel(GetJTISymbol(JTI)); for (auto *JTBB : JTBBs) @@ -725,12 +754,12 @@ void AArch64AsmPrinter::emitJumpTableEntry(const MachineJumpTableInfo *MJTI, /// add xDest, xDest, xScratch, lsl #2 void AArch64AsmPrinter::LowerJumpTableDestSmall(llvm::MCStreamer &OutStreamer, const llvm::MachineInstr &MI) { - unsigned DestReg = MI.getOperand(0).getReg(); - unsigned ScratchReg = MI.getOperand(1).getReg(); - unsigned ScratchRegW = + Register DestReg = MI.getOperand(0).getReg(); + Register ScratchReg = MI.getOperand(1).getReg(); + Register ScratchRegW = STI->getRegisterInfo()->getSubReg(ScratchReg, AArch64::sub_32); - unsigned TableReg = MI.getOperand(2).getReg(); - unsigned EntryReg = MI.getOperand(3).getReg(); + Register TableReg = MI.getOperand(2).getReg(); + Register EntryReg = MI.getOperand(3).getReg(); int JTIdx = MI.getOperand(4).getIndex(); bool IsByteEntry = MI.getOpcode() == AArch64::JumpTableDest8; @@ -800,7 +829,7 @@ void AArch64AsmPrinter::LowerPATCHPOINT(MCStreamer &OutStreamer, StackMaps &SM, if (CallTarget) { assert((CallTarget & 0xFFFFFFFFFFFF) == CallTarget && "High 16 bits of call target should be zero."); - unsigned ScratchReg = MI.getOperand(Opers.getNextScratchIdx()).getReg(); + Register ScratchReg = MI.getOperand(Opers.getNextScratchIdx()).getReg(); EncodedBytes = 16; // Materialize the jump address: EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::MOVZXi) @@ -830,7 +859,7 @@ void AArch64AsmPrinter::LowerPATCHPOINT(MCStreamer &OutStreamer, StackMaps &SM, } void 
AArch64AsmPrinter::EmitFMov0(const MachineInstr &MI) { - unsigned DestReg = MI.getOperand(0).getReg(); + Register DestReg = MI.getOperand(0).getReg(); if (STI->hasZeroCycleZeroingFP() && !STI->hasZeroCycleZeroingFPWorkaround()) { // Convert H/S/D register to corresponding Q register if (AArch64::H0 <= DestReg && DestReg <= AArch64::H31) @@ -894,32 +923,32 @@ void AArch64AsmPrinter::EmitInstruction(const MachineInstr *MI) { default: break; case AArch64::MOVMCSym: { - unsigned DestReg = MI->getOperand(0).getReg(); - const MachineOperand &MO_Sym = MI->getOperand(1); - MachineOperand Hi_MOSym(MO_Sym), Lo_MOSym(MO_Sym); - MCOperand Hi_MCSym, Lo_MCSym; - - Hi_MOSym.setTargetFlags(AArch64II::MO_G1 | AArch64II::MO_S); - Lo_MOSym.setTargetFlags(AArch64II::MO_G0 | AArch64II::MO_NC); - - MCInstLowering.lowerOperand(Hi_MOSym, Hi_MCSym); - MCInstLowering.lowerOperand(Lo_MOSym, Lo_MCSym); - - MCInst MovZ; - MovZ.setOpcode(AArch64::MOVZXi); - MovZ.addOperand(MCOperand::createReg(DestReg)); - MovZ.addOperand(Hi_MCSym); - MovZ.addOperand(MCOperand::createImm(16)); - EmitToStreamer(*OutStreamer, MovZ); - - MCInst MovK; - MovK.setOpcode(AArch64::MOVKXi); - MovK.addOperand(MCOperand::createReg(DestReg)); - MovK.addOperand(MCOperand::createReg(DestReg)); - MovK.addOperand(Lo_MCSym); - MovK.addOperand(MCOperand::createImm(0)); - EmitToStreamer(*OutStreamer, MovK); - return; + Register DestReg = MI->getOperand(0).getReg(); + const MachineOperand &MO_Sym = MI->getOperand(1); + MachineOperand Hi_MOSym(MO_Sym), Lo_MOSym(MO_Sym); + MCOperand Hi_MCSym, Lo_MCSym; + + Hi_MOSym.setTargetFlags(AArch64II::MO_G1 | AArch64II::MO_S); + Lo_MOSym.setTargetFlags(AArch64II::MO_G0 | AArch64II::MO_NC); + + MCInstLowering.lowerOperand(Hi_MOSym, Hi_MCSym); + MCInstLowering.lowerOperand(Lo_MOSym, Lo_MCSym); + + MCInst MovZ; + MovZ.setOpcode(AArch64::MOVZXi); + MovZ.addOperand(MCOperand::createReg(DestReg)); + MovZ.addOperand(Hi_MCSym); + MovZ.addOperand(MCOperand::createImm(16)); + EmitToStreamer(*OutStreamer, MovZ); + + MCInst MovK; + MovK.setOpcode(AArch64::MOVKXi); + MovK.addOperand(MCOperand::createReg(DestReg)); + MovK.addOperand(MCOperand::createReg(DestReg)); + MovK.addOperand(Lo_MCSym); + MovK.addOperand(MCOperand::createImm(0)); + EmitToStreamer(*OutStreamer, MovK); + return; } case AArch64::MOVIv2d_ns: // If the target has , lower this @@ -1084,6 +1113,7 @@ void AArch64AsmPrinter::EmitInstruction(const MachineInstr *MI) { return; case AArch64::HWASAN_CHECK_MEMACCESS: + case AArch64::HWASAN_CHECK_MEMACCESS_SHORTGRANULES: LowerHWASAN_CHECK_MEMACCESS(*MI); return; @@ -1193,4 +1223,6 @@ extern "C" void LLVMInitializeAArch64AsmPrinter() { RegisterAsmPrinter X(getTheAArch64leTarget()); RegisterAsmPrinter Y(getTheAArch64beTarget()); RegisterAsmPrinter Z(getTheARM64Target()); + RegisterAsmPrinter W(getTheARM64_32Target()); + RegisterAsmPrinter V(getTheAArch64_32Target()); } diff --git a/lib/Target/AArch64/AArch64CallLowering.cpp b/lib/Target/AArch64/AArch64CallLowering.cpp index 59757769c89a..ed93d02aa615 100644 --- a/lib/Target/AArch64/AArch64CallLowering.cpp +++ b/lib/Target/AArch64/AArch64CallLowering.cpp @@ -99,7 +99,7 @@ struct IncomingArgHandler : public CallLowering::ValueHandler { /// (it's an implicit-def of the BL). 
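
Aside, not part of the imported sources: the HWASAN changes in the AsmPrinter hunks above key the outlined check on (register, short-granules, access info) and route the short-granule flavour to __hwasan_tag_mismatch_v2. A small sketch of how the check symbol name and access size fall out of that lowering; the register number and access-info value in the comment are made up:

    // Illustrative only; mirrors the string and size arithmetic used by
    // LowerHWASAN_CHECK_MEMACCESS / EmitHwasanMemaccessSymbols above.
    #include <cstdint>
    #include <string>

    std::string hwasanCheckSymbol(unsigned XRegNo, uint32_t AccessInfo,
                                  bool ShortGranules) {
      std::string Name = "__hwasan_check_x" + std::to_string(XRegNo) + "_" +
                         std::to_string(AccessInfo);
      if (ShortGranules)
        Name += "_short"; // this variant ends up calling __hwasan_tag_mismatch_v2
      return Name;        // e.g. (3, 18, false) -> "__hwasan_check_x3_18"
    }

    unsigned hwasanAccessSize(uint32_t AccessInfo) {
      return 1u << (AccessInfo & 0xf); // low nibble encodes log2 of the access size
    }
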
virtual void markPhysRegUsed(unsigned PhysReg) = 0; - bool isArgumentHandler() const override { return true; } + bool isIncomingArgumentHandler() const override { return true; } uint64_t StackUsed; }; @@ -110,6 +110,7 @@ struct FormalArgHandler : public IncomingArgHandler { : IncomingArgHandler(MIRBuilder, MRI, AssignFn) {} void markPhysRegUsed(unsigned PhysReg) override { + MIRBuilder.getMRI()->addLiveIn(PhysReg); MIRBuilder.getMBB().addLiveIn(PhysReg); } }; @@ -129,14 +130,29 @@ struct CallReturnHandler : public IncomingArgHandler { struct OutgoingArgHandler : public CallLowering::ValueHandler { OutgoingArgHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI, MachineInstrBuilder MIB, CCAssignFn *AssignFn, - CCAssignFn *AssignFnVarArg) + CCAssignFn *AssignFnVarArg, bool IsTailCall = false, + int FPDiff = 0) : ValueHandler(MIRBuilder, MRI, AssignFn), MIB(MIB), - AssignFnVarArg(AssignFnVarArg), StackSize(0) {} + AssignFnVarArg(AssignFnVarArg), IsTailCall(IsTailCall), FPDiff(FPDiff), + StackSize(0) {} + + bool isIncomingArgumentHandler() const override { return false; } Register getStackAddress(uint64_t Size, int64_t Offset, MachinePointerInfo &MPO) override { + MachineFunction &MF = MIRBuilder.getMF(); LLT p0 = LLT::pointer(0, 64); LLT s64 = LLT::scalar(64); + + if (IsTailCall) { + Offset += FPDiff; + int FI = MF.getFrameInfo().CreateFixedObject(Size, Offset, true); + Register FIReg = MRI.createGenericVirtualRegister(p0); + MIRBuilder.buildFrameIndex(FIReg, FI); + MPO = MachinePointerInfo::getFixedStack(MF, FI); + return FIReg; + } + Register SPReg = MRI.createGenericVirtualRegister(p0); MIRBuilder.buildCopy(SPReg, Register(AArch64::SP)); @@ -146,7 +162,7 @@ struct OutgoingArgHandler : public CallLowering::ValueHandler { Register AddrReg = MRI.createGenericVirtualRegister(p0); MIRBuilder.buildGEP(AddrReg, SPReg, OffsetReg); - MPO = MachinePointerInfo::getStack(MIRBuilder.getMF(), Offset); + MPO = MachinePointerInfo::getStack(MF, Offset); return AddrReg; } @@ -173,12 +189,13 @@ struct OutgoingArgHandler : public CallLowering::ValueHandler { bool assignArg(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, const CallLowering::ArgInfo &Info, + ISD::ArgFlagsTy Flags, CCState &State) override { bool Res; if (Info.IsFixed) - Res = AssignFn(ValNo, ValVT, LocVT, LocInfo, Info.Flags, State); + Res = AssignFn(ValNo, ValVT, LocVT, LocInfo, Flags, State); else - Res = AssignFnVarArg(ValNo, ValVT, LocVT, LocInfo, Info.Flags, State); + Res = AssignFnVarArg(ValNo, ValVT, LocVT, LocInfo, Flags, State); StackSize = State.getNextStackOffset(); return Res; @@ -186,10 +203,19 @@ struct OutgoingArgHandler : public CallLowering::ValueHandler { MachineInstrBuilder MIB; CCAssignFn *AssignFnVarArg; + bool IsTailCall; + + /// For tail calls, the byte offset of the call's argument area from the + /// callee's. Unused elsewhere. + int FPDiff; uint64_t StackSize; }; } // namespace +static bool doesCalleeRestoreStack(CallingConv::ID CallConv, bool TailCallOpt) { + return CallConv == CallingConv::Fast && TailCallOpt; +} + void AArch64CallLowering::splitToValueTypes( const ArgInfo &OrigArg, SmallVectorImpl &SplitArgs, const DataLayout &DL, MachineRegisterInfo &MRI, CallingConv::ID CallConv) const { @@ -207,7 +233,7 @@ void AArch64CallLowering::splitToValueTypes( // No splitting to do, but we want to replace the original type (e.g. [1 x // double] -> double). 
SplitArgs.emplace_back(OrigArg.Regs[0], SplitVTs[0].getTypeForEVT(Ctx), - OrigArg.Flags, OrigArg.IsFixed); + OrigArg.Flags[0], OrigArg.IsFixed); return; } @@ -218,13 +244,13 @@ void AArch64CallLowering::splitToValueTypes( OrigArg.Ty, CallConv, false); for (unsigned i = 0, e = SplitVTs.size(); i < e; ++i) { Type *SplitTy = SplitVTs[i].getTypeForEVT(Ctx); - SplitArgs.emplace_back(OrigArg.Regs[i], SplitTy, OrigArg.Flags, + SplitArgs.emplace_back(OrigArg.Regs[i], SplitTy, OrigArg.Flags[0], OrigArg.IsFixed); if (NeedsRegBlock) - SplitArgs.back().Flags.setInConsecutiveRegs(); + SplitArgs.back().Flags[0].setInConsecutiveRegs(); } - SplitArgs.back().Flags.setInConsecutiveRegsLast(); + SplitArgs.back().Flags[0].setInConsecutiveRegsLast(); } bool AArch64CallLowering::lowerReturn(MachineIRBuilder &MIRBuilder, @@ -344,6 +370,49 @@ bool AArch64CallLowering::lowerReturn(MachineIRBuilder &MIRBuilder, return Success; } +/// Helper function to compute forwarded registers for musttail calls. Computes +/// the forwarded registers, sets MBB liveness, and emits COPY instructions that +/// can be used to save + restore registers later. +static void handleMustTailForwardedRegisters(MachineIRBuilder &MIRBuilder, + CCAssignFn *AssignFn) { + MachineBasicBlock &MBB = MIRBuilder.getMBB(); + MachineFunction &MF = MIRBuilder.getMF(); + MachineFrameInfo &MFI = MF.getFrameInfo(); + + if (!MFI.hasMustTailInVarArgFunc()) + return; + + AArch64FunctionInfo *FuncInfo = MF.getInfo(); + const Function &F = MF.getFunction(); + assert(F.isVarArg() && "Expected F to be vararg?"); + + // Compute the set of forwarded registers. The rest are scratch. + SmallVector ArgLocs; + CCState CCInfo(F.getCallingConv(), /*IsVarArg=*/true, MF, ArgLocs, + F.getContext()); + SmallVector RegParmTypes; + RegParmTypes.push_back(MVT::i64); + RegParmTypes.push_back(MVT::f128); + + // Later on, we can use this vector to restore the registers if necessary. + SmallVectorImpl &Forwards = + FuncInfo->getForwardedMustTailRegParms(); + CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, AssignFn); + + // Conservatively forward X8, since it might be used for an aggregate + // return. + if (!CCInfo.isAllocated(AArch64::X8)) { + unsigned X8VReg = MF.addLiveIn(AArch64::X8, &AArch64::GPR64RegClass); + Forwards.push_back(ForwardedRegister(X8VReg, AArch64::X8, MVT::i64)); + } + + // Add the forwards to the MachineBasicBlock and MachineFunction. + for (const auto &F : Forwards) { + MBB.addLiveIn(F.PReg); + MIRBuilder.buildCopy(Register(F.VReg), Register(F.PReg)); + } +} + bool AArch64CallLowering::lowerFormalArguments( MachineIRBuilder &MIRBuilder, const Function &F, ArrayRef> VRegs) const { @@ -376,64 +445,530 @@ bool AArch64CallLowering::lowerFormalArguments( if (!handleAssignments(MIRBuilder, SplitArgs, Handler)) return false; + AArch64FunctionInfo *FuncInfo = MF.getInfo(); + uint64_t StackOffset = Handler.StackUsed; if (F.isVarArg()) { - if (!MF.getSubtarget().isTargetDarwin()) { - // FIXME: we need to reimplement saveVarArgsRegisters from + auto &Subtarget = MF.getSubtarget(); + if (!Subtarget.isTargetDarwin()) { + // FIXME: we need to reimplement saveVarArgsRegisters from // AArch64ISelLowering. return false; } - // We currently pass all varargs at 8-byte alignment. - uint64_t StackOffset = alignTo(Handler.StackUsed, 8); + // We currently pass all varargs at 8-byte alignment, or 4 in ILP32. + StackOffset = alignTo(Handler.StackUsed, Subtarget.isTargetILP32() ? 
4 : 8); auto &MFI = MIRBuilder.getMF().getFrameInfo(); - AArch64FunctionInfo *FuncInfo = MF.getInfo(); FuncInfo->setVarArgsStackIndex(MFI.CreateFixedObject(4, StackOffset, true)); } + if (doesCalleeRestoreStack(F.getCallingConv(), + MF.getTarget().Options.GuaranteedTailCallOpt)) { + // We have a non-standard ABI, so why not make full use of the stack that + // we're going to pop? It must be aligned to 16 B in any case. + StackOffset = alignTo(StackOffset, 16); + + // If we're expected to restore the stack (e.g. fastcc), then we'll be + // adding a multiple of 16. + FuncInfo->setArgumentStackToRestore(StackOffset); + + // Our own callers will guarantee that the space is free by giving an + // aligned value to CALLSEQ_START. + } + + // When we tail call, we need to check if the callee's arguments + // will fit on the caller's stack. So, whenever we lower formal arguments, + // we should keep track of this information, since we might lower a tail call + // in this function later. + FuncInfo->setBytesInStackArgArea(StackOffset); + auto &Subtarget = MF.getSubtarget(); if (Subtarget.hasCustomCallingConv()) Subtarget.getRegisterInfo()->UpdateCustomCalleeSavedRegs(MF); + handleMustTailForwardedRegisters(MIRBuilder, AssignFn); + // Move back to the end of the basic block. MIRBuilder.setMBB(MBB); return true; } +/// Return true if the calling convention is one that we can guarantee TCO for. +static bool canGuaranteeTCO(CallingConv::ID CC) { + return CC == CallingConv::Fast; +} + +/// Return true if we might ever do TCO for calls with this calling convention. +static bool mayTailCallThisCC(CallingConv::ID CC) { + switch (CC) { + case CallingConv::C: + case CallingConv::PreserveMost: + case CallingConv::Swift: + return true; + default: + return canGuaranteeTCO(CC); + } +} + +/// Returns a pair containing the fixed CCAssignFn and the vararg CCAssignFn for +/// CC. +static std::pair +getAssignFnsForCC(CallingConv::ID CC, const AArch64TargetLowering &TLI) { + return {TLI.CCAssignFnForCall(CC, false), TLI.CCAssignFnForCall(CC, true)}; +} + +bool AArch64CallLowering::doCallerAndCalleePassArgsTheSameWay( + CallLoweringInfo &Info, MachineFunction &MF, + SmallVectorImpl &InArgs) const { + const Function &CallerF = MF.getFunction(); + CallingConv::ID CalleeCC = Info.CallConv; + CallingConv::ID CallerCC = CallerF.getCallingConv(); + + // If the calling conventions match, then everything must be the same. + if (CalleeCC == CallerCC) + return true; + + // Check if the caller and callee will handle arguments in the same way. + const AArch64TargetLowering &TLI = *getTLI(); + CCAssignFn *CalleeAssignFnFixed; + CCAssignFn *CalleeAssignFnVarArg; + std::tie(CalleeAssignFnFixed, CalleeAssignFnVarArg) = + getAssignFnsForCC(CalleeCC, TLI); + + CCAssignFn *CallerAssignFnFixed; + CCAssignFn *CallerAssignFnVarArg; + std::tie(CallerAssignFnFixed, CallerAssignFnVarArg) = + getAssignFnsForCC(CallerCC, TLI); + + if (!resultsCompatible(Info, MF, InArgs, *CalleeAssignFnFixed, + *CalleeAssignFnVarArg, *CallerAssignFnFixed, + *CallerAssignFnVarArg)) + return false; + + // Make sure that the caller and callee preserve all of the same registers. 
+ auto TRI = MF.getSubtarget().getRegisterInfo(); + const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC); + const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC); + if (MF.getSubtarget().hasCustomCallingConv()) { + TRI->UpdateCustomCallPreservedMask(MF, &CallerPreserved); + TRI->UpdateCustomCallPreservedMask(MF, &CalleePreserved); + } + + return TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved); +} + +bool AArch64CallLowering::areCalleeOutgoingArgsTailCallable( + CallLoweringInfo &Info, MachineFunction &MF, + SmallVectorImpl &OutArgs) const { + // If there are no outgoing arguments, then we are done. + if (OutArgs.empty()) + return true; + + const Function &CallerF = MF.getFunction(); + CallingConv::ID CalleeCC = Info.CallConv; + CallingConv::ID CallerCC = CallerF.getCallingConv(); + const AArch64TargetLowering &TLI = *getTLI(); + + CCAssignFn *AssignFnFixed; + CCAssignFn *AssignFnVarArg; + std::tie(AssignFnFixed, AssignFnVarArg) = getAssignFnsForCC(CalleeCC, TLI); + + // We have outgoing arguments. Make sure that we can tail call with them. + SmallVector OutLocs; + CCState OutInfo(CalleeCC, false, MF, OutLocs, CallerF.getContext()); + + if (!analyzeArgInfo(OutInfo, OutArgs, *AssignFnFixed, *AssignFnVarArg)) { + LLVM_DEBUG(dbgs() << "... Could not analyze call operands.\n"); + return false; + } + + // Make sure that they can fit on the caller's stack. + const AArch64FunctionInfo *FuncInfo = MF.getInfo(); + if (OutInfo.getNextStackOffset() > FuncInfo->getBytesInStackArgArea()) { + LLVM_DEBUG(dbgs() << "... Cannot fit call operands on caller's stack.\n"); + return false; + } + + // Verify that the parameters in callee-saved registers match. + // TODO: Port this over to CallLowering as general code once swiftself is + // supported. + auto TRI = MF.getSubtarget().getRegisterInfo(); + const uint32_t *CallerPreservedMask = TRI->getCallPreservedMask(MF, CallerCC); + MachineRegisterInfo &MRI = MF.getRegInfo(); + + for (unsigned i = 0; i < OutLocs.size(); ++i) { + auto &ArgLoc = OutLocs[i]; + // If it's not a register, it's fine. + if (!ArgLoc.isRegLoc()) { + if (Info.IsVarArg) { + // Be conservative and disallow variadic memory operands to match SDAG's + // behaviour. + // FIXME: If the caller's calling convention is C, then we can + // potentially use its argument area. However, for cases like fastcc, + // we can't do anything. + LLVM_DEBUG( + dbgs() + << "... Cannot tail call vararg function with stack arguments\n"); + return false; + } + continue; + } + + Register Reg = ArgLoc.getLocReg(); + + // Only look at callee-saved registers. + if (MachineOperand::clobbersPhysReg(CallerPreservedMask, Reg)) + continue; + + LLVM_DEBUG( + dbgs() + << "... Call has an argument passed in a callee-saved register.\n"); + + // Check if it was copied from. + ArgInfo &OutInfo = OutArgs[i]; + + if (OutInfo.Regs.size() > 1) { + LLVM_DEBUG( + dbgs() << "... Cannot handle arguments in multiple registers.\n"); + return false; + } + + // Check if we copy the register, walking through copies from virtual + // registers. Note that getDefIgnoringCopies does not ignore copies from + // physical registers. + MachineInstr *RegDef = getDefIgnoringCopies(OutInfo.Regs[0], MRI); + if (!RegDef || RegDef->getOpcode() != TargetOpcode::COPY) { + LLVM_DEBUG( + dbgs() + << "... Parameter was not copied into a VReg, cannot tail call.\n"); + return false; + } + + // Got a copy. Verify that it's the same as the register we want. 
+ Register CopyRHS = RegDef->getOperand(1).getReg(); + if (CopyRHS != Reg) { + LLVM_DEBUG(dbgs() << "... Callee-saved register was not copied into " + "VReg, cannot tail call.\n"); + return false; + } + } + + return true; +} + +bool AArch64CallLowering::isEligibleForTailCallOptimization( + MachineIRBuilder &MIRBuilder, CallLoweringInfo &Info, + SmallVectorImpl &InArgs, + SmallVectorImpl &OutArgs) const { + + // Must pass all target-independent checks in order to tail call optimize. + if (!Info.IsTailCall) + return false; + + CallingConv::ID CalleeCC = Info.CallConv; + MachineFunction &MF = MIRBuilder.getMF(); + const Function &CallerF = MF.getFunction(); + + LLVM_DEBUG(dbgs() << "Attempting to lower call as tail call\n"); + + if (Info.SwiftErrorVReg) { + // TODO: We should handle this. + // Note that this is also handled by the check for no outgoing arguments. + // Proactively disabling this though, because the swifterror handling in + // lowerCall inserts a COPY *after* the location of the call. + LLVM_DEBUG(dbgs() << "... Cannot handle tail calls with swifterror yet.\n"); + return false; + } + + if (!mayTailCallThisCC(CalleeCC)) { + LLVM_DEBUG(dbgs() << "... Calling convention cannot be tail called.\n"); + return false; + } + + // Byval parameters hand the function a pointer directly into the stack area + // we want to reuse during a tail call. Working around this *is* possible (see + // X86). + // + // FIXME: In AArch64ISelLowering, this isn't worked around. Can/should we try + // it? + // + // On Windows, "inreg" attributes signify non-aggregate indirect returns. + // In this case, it is necessary to save/restore X0 in the callee. Tail + // call opt interferes with this. So we disable tail call opt when the + // caller has an argument with "inreg" attribute. + // + // FIXME: Check whether the callee also has an "inreg" argument. + // + // When the caller has a swifterror argument, we don't want to tail call + // because would have to move into the swifterror register before the + // tail call. + if (any_of(CallerF.args(), [](const Argument &A) { + return A.hasByValAttr() || A.hasInRegAttr() || A.hasSwiftErrorAttr(); + })) { + LLVM_DEBUG(dbgs() << "... Cannot tail call from callers with byval, " + "inreg, or swifterror arguments\n"); + return false; + } + + // Externally-defined functions with weak linkage should not be + // tail-called on AArch64 when the OS does not support dynamic + // pre-emption of symbols, as the AAELF spec requires normal calls + // to undefined weak functions to be replaced with a NOP or jump to the + // next instruction. The behaviour of branch instructions in this + // situation (as used for tail calls) is implementation-defined, so we + // cannot rely on the linker replacing the tail call with a return. + if (Info.Callee.isGlobal()) { + const GlobalValue *GV = Info.Callee.getGlobal(); + const Triple &TT = MF.getTarget().getTargetTriple(); + if (GV->hasExternalWeakLinkage() && + (!TT.isOSWindows() || TT.isOSBinFormatELF() || + TT.isOSBinFormatMachO())) { + LLVM_DEBUG(dbgs() << "... Cannot tail call externally-defined function " + "with weak linkage for this OS.\n"); + return false; + } + } + + // If we have -tailcallopt, then we're done. + if (MF.getTarget().Options.GuaranteedTailCallOpt) + return canGuaranteeTCO(CalleeCC) && CalleeCC == CallerF.getCallingConv(); + + // We don't have -tailcallopt, so we're allowed to change the ABI (sibcall). + // Try to find cases where we can do that. 
+ + // I want anyone implementing a new calling convention to think long and hard + // about this assert. + assert((!Info.IsVarArg || CalleeCC == CallingConv::C) && + "Unexpected variadic calling convention"); + + // Verify that the incoming and outgoing arguments from the callee are + // safe to tail call. + if (!doCallerAndCalleePassArgsTheSameWay(Info, MF, InArgs)) { + LLVM_DEBUG( + dbgs() + << "... Caller and callee have incompatible calling conventions.\n"); + return false; + } + + if (!areCalleeOutgoingArgsTailCallable(Info, MF, OutArgs)) + return false; + + LLVM_DEBUG( + dbgs() << "... Call is eligible for tail call optimization.\n"); + return true; +} + +static unsigned getCallOpcode(const Function &CallerF, bool IsIndirect, + bool IsTailCall) { + if (!IsTailCall) + return IsIndirect ? AArch64::BLR : AArch64::BL; + + if (!IsIndirect) + return AArch64::TCRETURNdi; + + // When BTI is enabled, we need to use TCRETURNriBTI to make sure that we use + // x16 or x17. + if (CallerF.hasFnAttribute("branch-target-enforcement")) + return AArch64::TCRETURNriBTI; + + return AArch64::TCRETURNri; +} + +bool AArch64CallLowering::lowerTailCall( + MachineIRBuilder &MIRBuilder, CallLoweringInfo &Info, + SmallVectorImpl &OutArgs) const { + MachineFunction &MF = MIRBuilder.getMF(); + const Function &F = MF.getFunction(); + MachineRegisterInfo &MRI = MF.getRegInfo(); + const AArch64TargetLowering &TLI = *getTLI(); + AArch64FunctionInfo *FuncInfo = MF.getInfo(); + + // True when we're tail calling, but without -tailcallopt. + bool IsSibCall = !MF.getTarget().Options.GuaranteedTailCallOpt; + + // TODO: Right now, regbankselect doesn't know how to handle the rtcGPR64 + // register class. Until we can do that, we should fall back here. + if (F.hasFnAttribute("branch-target-enforcement")) { + LLVM_DEBUG( + dbgs() << "Cannot lower indirect tail calls with BTI enabled yet.\n"); + return false; + } + + // Find out which ABI gets to decide where things go. + CallingConv::ID CalleeCC = Info.CallConv; + CCAssignFn *AssignFnFixed; + CCAssignFn *AssignFnVarArg; + std::tie(AssignFnFixed, AssignFnVarArg) = getAssignFnsForCC(CalleeCC, TLI); + + MachineInstrBuilder CallSeqStart; + if (!IsSibCall) + CallSeqStart = MIRBuilder.buildInstr(AArch64::ADJCALLSTACKDOWN); + + unsigned Opc = getCallOpcode(F, Info.Callee.isReg(), true); + auto MIB = MIRBuilder.buildInstrNoInsert(Opc); + MIB.add(Info.Callee); + + // Byte offset for the tail call. When we are sibcalling, this will always + // be 0. + MIB.addImm(0); + + // Tell the call which registers are clobbered. + auto TRI = MF.getSubtarget().getRegisterInfo(); + const uint32_t *Mask = TRI->getCallPreservedMask(MF, F.getCallingConv()); + if (MF.getSubtarget().hasCustomCallingConv()) + TRI->UpdateCustomCallPreservedMask(MF, &Mask); + MIB.addRegMask(Mask); + + if (TRI->isAnyArgRegReserved(MF)) + TRI->emitReservedArgRegCallError(MF); + + // FPDiff is the byte offset of the call's argument area from the callee's. + // Stores to callee stack arguments will be placed in FixedStackSlots offset + // by this amount for a tail call. In a sibling call it must be 0 because the + // caller will deallocate the entire stack and the callee still expects its + // arguments to begin at SP+0. + int FPDiff = 0; + + // This will be 0 for sibcalls, potentially nonzero for tail calls produced + // by -tailcallopt. For sibcalls, the memory operands for the call are + // already available in the caller's incoming argument space. 
+ unsigned NumBytes = 0; + if (!IsSibCall) { + // We aren't sibcalling, so we need to compute FPDiff. We need to do this + // before handling assignments, because FPDiff must be known for memory + // arguments. + unsigned NumReusableBytes = FuncInfo->getBytesInStackArgArea(); + SmallVector OutLocs; + CCState OutInfo(CalleeCC, false, MF, OutLocs, F.getContext()); + analyzeArgInfo(OutInfo, OutArgs, *AssignFnFixed, *AssignFnVarArg); + + // The callee will pop the argument stack as a tail call. Thus, we must + // keep it 16-byte aligned. + NumBytes = alignTo(OutInfo.getNextStackOffset(), 16); + + // FPDiff will be negative if this tail call requires more space than we + // would automatically have in our incoming argument space. Positive if we + // actually shrink the stack. + FPDiff = NumReusableBytes - NumBytes; + + // The stack pointer must be 16-byte aligned at all times it's used for a + // memory operation, which in practice means at *all* times and in + // particular across call boundaries. Therefore our own arguments started at + // a 16-byte aligned SP and the delta applied for the tail call should + // satisfy the same constraint. + assert(FPDiff % 16 == 0 && "unaligned stack on tail call"); + } + + const auto &Forwards = FuncInfo->getForwardedMustTailRegParms(); + + // Do the actual argument marshalling. + SmallVector PhysRegs; + OutgoingArgHandler Handler(MIRBuilder, MRI, MIB, AssignFnFixed, + AssignFnVarArg, true, FPDiff); + if (!handleAssignments(MIRBuilder, OutArgs, Handler)) + return false; + + if (Info.IsVarArg && Info.IsMustTailCall) { + // Now we know what's being passed to the function. Add uses to the call for + // the forwarded registers that we *aren't* passing as parameters. This will + // preserve the copies we build earlier. + for (const auto &F : Forwards) { + Register ForwardedReg = F.PReg; + // If the register is already passed, or aliases a register which is + // already being passed, then skip it. + if (any_of(MIB->uses(), [&ForwardedReg, &TRI](const MachineOperand &Use) { + if (!Use.isReg()) + return false; + return TRI->regsOverlap(Use.getReg(), ForwardedReg); + })) + continue; + + // We aren't passing it already, so we should add it to the call. + MIRBuilder.buildCopy(ForwardedReg, Register(F.VReg)); + MIB.addReg(ForwardedReg, RegState::Implicit); + } + } + + // If we have -tailcallopt, we need to adjust the stack. We'll do the call + // sequence start and end here. + if (!IsSibCall) { + MIB->getOperand(1).setImm(FPDiff); + CallSeqStart.addImm(NumBytes).addImm(0); + // End the call sequence *before* emitting the call. Normally, we would + // tidy the frame up after the call. However, here, we've laid out the + // parameters so that when SP is reset, they will be in the correct + // location. + MIRBuilder.buildInstr(AArch64::ADJCALLSTACKUP).addImm(NumBytes).addImm(0); + } + + // Now we can add the actual call instruction to the correct basic block. + MIRBuilder.insertInstr(MIB); + + // If Callee is a reg, since it is used by a target specific instruction, + // it must have a register class matching the constraint of that instruction. 
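
Aside, not part of the imported sources: the FPDiff bookkeeping above is the core of the -tailcallopt path. The callee's outgoing argument bytes, rounded up to 16, are compared against the stack argument area the caller already owns, and the difference is applied when SP is reset. A small worked sketch with hypothetical byte counts:

    // Illustrative only; mirrors the computation in lowerTailCall above.
    #include <cassert>

    int computeFPDiff(unsigned BytesInStackArgArea, unsigned NextStackOffset) {
      unsigned NumBytes = (NextStackOffset + 15u) & ~15u; // alignTo(.., 16)
      int FPDiff = int(BytesInStackArgArea) - int(NumBytes); // negative when the
                                                             // callee needs more
      assert(FPDiff % 16 == 0 && "unaligned stack on tail call");
      return FPDiff;
    }
    // Example: the caller received 32 bytes of stack arguments and the callee
    // needs 44, so NumBytes = 48 and FPDiff = 32 - 48 = -16.
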
+ if (Info.Callee.isReg()) + MIB->getOperand(0).setReg(constrainOperandRegClass( + MF, *TRI, MRI, *MF.getSubtarget().getInstrInfo(), + *MF.getSubtarget().getRegBankInfo(), *MIB, MIB->getDesc(), Info.Callee, + 0)); + + MF.getFrameInfo().setHasTailCall(); + Info.LoweredTailCall = true; + return true; +} + bool AArch64CallLowering::lowerCall(MachineIRBuilder &MIRBuilder, - CallingConv::ID CallConv, - const MachineOperand &Callee, - const ArgInfo &OrigRet, - ArrayRef OrigArgs, - Register SwiftErrorVReg) const { + CallLoweringInfo &Info) const { MachineFunction &MF = MIRBuilder.getMF(); const Function &F = MF.getFunction(); MachineRegisterInfo &MRI = MF.getRegInfo(); auto &DL = F.getParent()->getDataLayout(); + const AArch64TargetLowering &TLI = *getTLI(); - SmallVector SplitArgs; - for (auto &OrigArg : OrigArgs) { - splitToValueTypes(OrigArg, SplitArgs, DL, MRI, CallConv); + SmallVector OutArgs; + for (auto &OrigArg : Info.OrigArgs) { + splitToValueTypes(OrigArg, OutArgs, DL, MRI, Info.CallConv); // AAPCS requires that we zero-extend i1 to 8 bits by the caller. if (OrigArg.Ty->isIntegerTy(1)) - SplitArgs.back().Flags.setZExt(); + OutArgs.back().Flags[0].setZExt(); + } + + SmallVector InArgs; + if (!Info.OrigRet.Ty->isVoidTy()) + splitToValueTypes(Info.OrigRet, InArgs, DL, MRI, F.getCallingConv()); + + // If we can lower as a tail call, do that instead. + bool CanTailCallOpt = + isEligibleForTailCallOptimization(MIRBuilder, Info, InArgs, OutArgs); + + // We must emit a tail call if we have musttail. + if (Info.IsMustTailCall && !CanTailCallOpt) { + // There are types of incoming/outgoing arguments we can't handle yet, so + // it doesn't make sense to actually die here like in ISelLowering. Instead, + // fall back to SelectionDAG and let it try to handle this. + LLVM_DEBUG(dbgs() << "Failed to lower musttail call as tail call\n"); + return false; } + if (CanTailCallOpt) + return lowerTailCall(MIRBuilder, Info, OutArgs); + // Find out which ABI gets to decide where things go. - const AArch64TargetLowering &TLI = *getTLI(); - CCAssignFn *AssignFnFixed = - TLI.CCAssignFnForCall(CallConv, /*IsVarArg=*/false); - CCAssignFn *AssignFnVarArg = - TLI.CCAssignFnForCall(CallConv, /*IsVarArg=*/true); + CCAssignFn *AssignFnFixed; + CCAssignFn *AssignFnVarArg; + std::tie(AssignFnFixed, AssignFnVarArg) = + getAssignFnsForCC(Info.CallConv, TLI); - auto CallSeqStart = MIRBuilder.buildInstr(AArch64::ADJCALLSTACKDOWN); + MachineInstrBuilder CallSeqStart; + CallSeqStart = MIRBuilder.buildInstr(AArch64::ADJCALLSTACKDOWN); // Create a temporarily-floating call instruction so we can add the implicit // uses of arg registers. - auto MIB = MIRBuilder.buildInstrNoInsert(Callee.isReg() ? AArch64::BLR - : AArch64::BL); - MIB.add(Callee); + unsigned Opc = getCallOpcode(F, Info.Callee.isReg(), false); + + auto MIB = MIRBuilder.buildInstrNoInsert(Opc); + MIB.add(Info.Callee); // Tell the call which registers are clobbered. auto TRI = MF.getSubtarget().getRegisterInfo(); @@ -448,8 +983,8 @@ bool AArch64CallLowering::lowerCall(MachineIRBuilder &MIRBuilder, // Do the actual argument marshalling. SmallVector PhysRegs; OutgoingArgHandler Handler(MIRBuilder, MRI, MIB, AssignFnFixed, - AssignFnVarArg); - if (!handleAssignments(MIRBuilder, SplitArgs, Handler)) + AssignFnVarArg, false); + if (!handleAssignments(MIRBuilder, OutArgs, Handler)) return false; // Now we can add the actual call instruction to the correct basic block. 
@@ -458,34 +993,37 @@ bool AArch64CallLowering::lowerCall(MachineIRBuilder &MIRBuilder, // If Callee is a reg, since it is used by a target specific // instruction, it must have a register class matching the // constraint of that instruction. - if (Callee.isReg()) + if (Info.Callee.isReg()) MIB->getOperand(0).setReg(constrainOperandRegClass( MF, *TRI, MRI, *MF.getSubtarget().getInstrInfo(), - *MF.getSubtarget().getRegBankInfo(), *MIB, MIB->getDesc(), Callee, 0)); + *MF.getSubtarget().getRegBankInfo(), *MIB, MIB->getDesc(), Info.Callee, + 0)); // Finally we can copy the returned value back into its virtual-register. In // symmetry with the arugments, the physical register must be an // implicit-define of the call instruction. - CCAssignFn *RetAssignFn = TLI.CCAssignFnForReturn(F.getCallingConv()); - if (!OrigRet.Ty->isVoidTy()) { - SplitArgs.clear(); - - splitToValueTypes(OrigRet, SplitArgs, DL, MRI, F.getCallingConv()); - + if (!Info.OrigRet.Ty->isVoidTy()) { + CCAssignFn *RetAssignFn = TLI.CCAssignFnForReturn(F.getCallingConv()); CallReturnHandler Handler(MIRBuilder, MRI, MIB, RetAssignFn); - if (!handleAssignments(MIRBuilder, SplitArgs, Handler)) + if (!handleAssignments(MIRBuilder, InArgs, Handler)) return false; } - if (SwiftErrorVReg) { + if (Info.SwiftErrorVReg) { MIB.addDef(AArch64::X21, RegState::Implicit); - MIRBuilder.buildCopy(SwiftErrorVReg, Register(AArch64::X21)); + MIRBuilder.buildCopy(Info.SwiftErrorVReg, Register(AArch64::X21)); } + uint64_t CalleePopBytes = + doesCalleeRestoreStack(Info.CallConv, + MF.getTarget().Options.GuaranteedTailCallOpt) + ? alignTo(Handler.StackSize, 16) + : 0; + CallSeqStart.addImm(Handler.StackSize).addImm(0); MIRBuilder.buildInstr(AArch64::ADJCALLSTACKUP) .addImm(Handler.StackSize) - .addImm(0); + .addImm(CalleePopBytes); return true; } diff --git a/lib/Target/AArch64/AArch64CallLowering.h b/lib/Target/AArch64/AArch64CallLowering.h index 4f428f254537..b0c601c7062c 100644 --- a/lib/Target/AArch64/AArch64CallLowering.h +++ b/lib/Target/AArch64/AArch64CallLowering.h @@ -40,16 +40,15 @@ public: bool lowerFormalArguments(MachineIRBuilder &MIRBuilder, const Function &F, ArrayRef> VRegs) const override; - bool lowerCall(MachineIRBuilder &MIRBuilder, CallingConv::ID CallConv, - const MachineOperand &Callee, const ArgInfo &OrigRet, - ArrayRef OrigArgs, - Register SwiftErrorVReg) const override; + bool lowerCall(MachineIRBuilder &MIRBuilder, + CallLoweringInfo &Info) const override; - bool lowerCall(MachineIRBuilder &MIRBuilder, CallingConv::ID CallConv, - const MachineOperand &Callee, const ArgInfo &OrigRet, - ArrayRef OrigArgs) const override { - return lowerCall(MIRBuilder, CallConv, Callee, OrigRet, OrigArgs, 0); - } + /// Returns true if the call can be lowered as a tail call. 
+ bool + isEligibleForTailCallOptimization(MachineIRBuilder &MIRBuilder, + CallLoweringInfo &Info, + SmallVectorImpl &InArgs, + SmallVectorImpl &OutArgs) const; bool supportSwiftError() const override { return true; } @@ -64,6 +63,18 @@ private: SmallVectorImpl &SplitArgs, const DataLayout &DL, MachineRegisterInfo &MRI, CallingConv::ID CallConv) const; + + bool lowerTailCall(MachineIRBuilder &MIRBuilder, CallLoweringInfo &Info, + SmallVectorImpl &OutArgs) const; + + bool + doCallerAndCalleePassArgsTheSameWay(CallLoweringInfo &Info, + MachineFunction &MF, + SmallVectorImpl &InArgs) const; + + bool + areCalleeOutgoingArgsTailCallable(CallLoweringInfo &Info, MachineFunction &MF, + SmallVectorImpl &OutArgs) const; }; } // end namespace llvm diff --git a/lib/Target/AArch64/AArch64CallingConvention.cpp b/lib/Target/AArch64/AArch64CallingConvention.cpp index 02538a187611..a0695cef615f 100644 --- a/lib/Target/AArch64/AArch64CallingConvention.cpp +++ b/lib/Target/AArch64/AArch64CallingConvention.cpp @@ -40,12 +40,14 @@ static bool finishStackBlock(SmallVectorImpl &PendingMembers, MVT LocVT, ISD::ArgFlagsTy &ArgFlags, CCState &State, unsigned SlotAlign) { unsigned Size = LocVT.getSizeInBits() / 8; - unsigned StackAlign = + const Align StackAlign = State.getMachineFunction().getDataLayout().getStackAlignment(); - unsigned Align = std::min(ArgFlags.getOrigAlign(), StackAlign); + const Align OrigAlign(ArgFlags.getOrigAlign()); + const Align Align = std::min(OrigAlign, StackAlign); for (auto &It : PendingMembers) { - It.convertToMem(State.AllocateStack(Size, std::max(Align, SlotAlign))); + It.convertToMem(State.AllocateStack( + Size, std::max((unsigned)Align.value(), SlotAlign))); State.addLoc(It); SlotAlign = 1; } @@ -79,10 +81,14 @@ static bool CC_AArch64_Custom_Stack_Block( static bool CC_AArch64_Custom_Block(unsigned &ValNo, MVT &ValVT, MVT &LocVT, CCValAssign::LocInfo &LocInfo, ISD::ArgFlagsTy &ArgFlags, CCState &State) { + const AArch64Subtarget &Subtarget = static_cast( + State.getMachineFunction().getSubtarget()); + bool IsDarwinILP32 = Subtarget.isTargetILP32() && Subtarget.isTargetMachO(); + // Try to allocate a contiguous block of registers, each of the correct // size to hold one member. ArrayRef RegList; - if (LocVT.SimpleTy == MVT::i64) + if (LocVT.SimpleTy == MVT::i64 || (IsDarwinILP32 && LocVT.SimpleTy == MVT::i32)) RegList = XRegList; else if (LocVT.SimpleTy == MVT::f16) RegList = HRegList; @@ -107,8 +113,12 @@ static bool CC_AArch64_Custom_Block(unsigned &ValNo, MVT &ValVT, MVT &LocVT, if (!ArgFlags.isInConsecutiveRegsLast()) return true; - unsigned RegResult = State.AllocateRegBlock(RegList, PendingMembers.size()); - if (RegResult) { + // [N x i32] arguments get packed into x-registers on Darwin's arm64_32 + // because that's how the armv7k Clang front-end emits small structs. + unsigned EltsPerReg = (IsDarwinILP32 && LocVT.SimpleTy == MVT::i32) ? 2 : 1; + unsigned RegResult = State.AllocateRegBlock( + RegList, alignTo(PendingMembers.size(), EltsPerReg) / EltsPerReg); + if (RegResult && EltsPerReg == 1) { for (auto &It : PendingMembers) { It.convertToReg(RegResult); State.addLoc(It); @@ -116,14 +126,26 @@ static bool CC_AArch64_Custom_Block(unsigned &ValNo, MVT &ValVT, MVT &LocVT, } PendingMembers.clear(); return true; + } else if (RegResult) { + assert(EltsPerReg == 2 && "unexpected ABI"); + bool UseHigh = false; + CCValAssign::LocInfo Info; + for (auto &It : PendingMembers) { + Info = UseHigh ? 
CCValAssign::AExtUpper : CCValAssign::ZExt; + State.addLoc(CCValAssign::getReg(It.getValNo(), MVT::i32, RegResult, + MVT::i64, Info)); + UseHigh = !UseHigh; + if (!UseHigh) + ++RegResult; + } + PendingMembers.clear(); + return true; } // Mark all regs in the class as unavailable for (auto Reg : RegList) State.AllocateReg(Reg); - const AArch64Subtarget &Subtarget = static_cast( - State.getMachineFunction().getSubtarget()); unsigned SlotAlign = Subtarget.isTargetDarwin() ? 1 : 8; return finishStackBlock(PendingMembers, LocVT, ArgFlags, State, SlotAlign); diff --git a/lib/Target/AArch64/AArch64CallingConvention.h b/lib/Target/AArch64/AArch64CallingConvention.h index 13cc0c583fd2..5a55d090d7c8 100644 --- a/lib/Target/AArch64/AArch64CallingConvention.h +++ b/lib/Target/AArch64/AArch64CallingConvention.h @@ -25,6 +25,9 @@ bool CC_AArch64_DarwinPCS_VarArg(unsigned ValNo, MVT ValVT, MVT LocVT, bool CC_AArch64_DarwinPCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State); +bool CC_AArch64_DarwinPCS_ILP32_VarArg(unsigned ValNo, MVT ValVT, MVT LocVT, + CCValAssign::LocInfo LocInfo, + ISD::ArgFlagsTy ArgFlags, CCState &State); bool CC_AArch64_Win64_VarArg(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State); diff --git a/lib/Target/AArch64/AArch64CallingConvention.td b/lib/Target/AArch64/AArch64CallingConvention.td index d969a9e1ab3a..bccbbd4591ed 100644 --- a/lib/Target/AArch64/AArch64CallingConvention.td +++ b/lib/Target/AArch64/AArch64CallingConvention.td @@ -17,6 +17,10 @@ class CCIfAlign : class CCIfBigEndian : CCIf<"State.getMachineFunction().getDataLayout().isBigEndian()", A>; +class CCIfILP32 : + CCIf<"State.getMachineFunction().getDataLayout().getPointerSize() == 4", A>; + + //===----------------------------------------------------------------------===// // ARM AAPCS64 Calling Convention //===----------------------------------------------------------------------===// @@ -70,6 +74,18 @@ def CC_AArch64_AAPCS : CallingConv<[ CCIfConsecutiveRegs>, + CCIfType<[nxv16i8, nxv8i16, nxv4i32, nxv2i64, nxv2f16, nxv4f16, nxv8f16, + nxv1f32, nxv2f32, nxv4f32, nxv1f64, nxv2f64], + CCAssignToReg<[Z0, Z1, Z2, Z3, Z4, Z5, Z6, Z7]>>, + CCIfType<[nxv16i8, nxv8i16, nxv4i32, nxv2i64, nxv2f16, nxv4f16, nxv8f16, + nxv1f32, nxv2f32, nxv4f32, nxv1f64, nxv2f64], + CCPassIndirect>, + + CCIfType<[nxv2i1, nxv4i1, nxv8i1, nxv16i1], + CCAssignToReg<[P0, P1, P2, P3]>>, + CCIfType<[nxv2i1, nxv4i1, nxv8i1, nxv16i1], + CCPassIndirect>, + // Handle i1, i8, i16, i32, i64, f32, f64 and v2f64 by passing in registers, // up to eight each of GPR and FPR. 
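
Aside, not part of the imported sources: on Darwin's arm64_32 (ILP32), the CC_AArch64_Custom_Block change above packs [N x i32] block members two per x-register, the first member as the ZExt low half and the second as the AExtUpper high half, to match how the armv7k front-end emits small structs. A minimal sketch of the register-count arithmetic; the member count in the example is hypothetical:

    // Illustrative only; mirrors AllocateRegBlock(RegList,
    //   alignTo(PendingMembers.size(), EltsPerReg) / EltsPerReg) above.
    unsigned xRegsForI32Block(unsigned NumMembers) {
      const unsigned EltsPerReg = 2; // two i32 halves per 64-bit X register
      return (NumMembers + EltsPerReg - 1) / EltsPerReg;
    }
    // e.g. a [3 x i32] block needs xRegsForI32Block(3) == 2 registers: members
    // 0 and 1 share the first (low/ZExt, high/AExtUpper), member 2 takes the
    // low half of the next.
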
CCIfType<[i1, i8, i16], CCPromoteToType>, @@ -111,6 +127,7 @@ def RetCC_AArch64_AAPCS : CallingConv<[ CCIfType<[v2f32], CCBitConvertToType>, CCIfType<[v2f64, v4f32], CCBitConvertToType>, + CCIfConsecutiveRegs>, CCIfSwiftError>>, // Big endian vectors must be passed as if they were 1-element vectors so that @@ -135,7 +152,14 @@ def RetCC_AArch64_AAPCS : CallingConv<[ CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7], [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, CCIfType<[f128, v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16], - CCAssignToReg<[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>> + CCAssignToReg<[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, + + CCIfType<[nxv16i8, nxv8i16, nxv4i32, nxv2i64, nxv2f16, nxv4f16, nxv8f16, + nxv1f32, nxv2f32, nxv4f32, nxv1f64, nxv2f64], + CCAssignToReg<[Z0, Z1, Z2, Z3, Z4, Z5, Z6, Z7]>>, + + CCIfType<[nxv2i1, nxv4i1, nxv8i1, nxv16i1], + CCAssignToReg<[P0, P1, P2, P3]>> ]>; // Vararg functions on windows pass floats in integer registers @@ -202,6 +226,12 @@ def CC_AArch64_DarwinPCS : CallingConv<[ CCIf<"ValVT == MVT::i1 || ValVT == MVT::i8", CCAssignToStack<1, 1>>, CCIf<"ValVT == MVT::i16 || ValVT == MVT::f16", CCAssignToStack<2, 2>>, CCIfType<[i32, f32], CCAssignToStack<4, 4>>, + + // Re-demote pointers to 32-bits so we don't end up storing 64-bit + // values and clobbering neighbouring stack locations. Not very pretty. + CCIfPtr>>, + CCIfPtr>>, + CCIfType<[i64, f64, v1f64, v2f32, v1i64, v2i32, v4i16, v8i8, v4f16], CCAssignToStack<8, 8>>, CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16], @@ -229,6 +259,29 @@ def CC_AArch64_DarwinPCS_VarArg : CallingConv<[ CCAssignToStack<16, 16>> ]>; +// In the ILP32 world, the minimum stack slot size is 4 bytes. Otherwise the +// same as the normal Darwin VarArgs handling. +let Entry = 1 in +def CC_AArch64_DarwinPCS_ILP32_VarArg : CallingConv<[ + CCIfType<[v2f32], CCBitConvertToType>, + CCIfType<[v2f64, v4f32, f128], CCBitConvertToType>, + + // Handle all scalar types as either i32 or f32. + CCIfType<[i8, i16], CCPromoteToType>, + CCIfType<[f16], CCPromoteToType>, + + // Everything is on the stack. + // i128 is split to two i64s, and its stack alignment is 16 bytes. + CCIfPtr>>, + CCIfType<[i32, f32], CCAssignToStack<4, 4>>, + CCIfType<[i64], CCIfSplit>>, + CCIfType<[i64, f64, v1i64, v2i32, v4i16, v8i8, v1f64, v2f32, v4f16], + CCAssignToStack<8, 8>>, + CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16], + CCAssignToStack<16, 16>> +]>; + + // The WebKit_JS calling convention only passes the first argument (the callee) // in register and the remaining arguments on stack. We allow 32bit stack slots, // so that WebKit can write partial values in the stack and define the other @@ -298,6 +351,12 @@ def CC_AArch64_GHC : CallingConv<[ CCIfType<[i64], CCAssignToReg<[X19, X20, X21, X22, X23, X24, X25, X26, X27, X28]>> ]>; +// The order of the callee-saves in this file is important, because the +// FrameLowering code will use this order to determine the layout the +// callee-save area in the stack frame. As can be observed below, Darwin +// requires the frame-record (LR, FP) to be at the top the callee-save area, +// whereas for other platforms they are at the bottom. + // FIXME: LR is only callee-saved in the sense that *we* preserve it and are // presumably a callee to someone. 
External functions may not do so, but this // is currently safe since BL has LR as an implicit-def and what happens after a @@ -306,7 +365,13 @@ def CC_AArch64_GHC : CallingConv<[ // It would be better to model its preservation semantics properly (create a // vreg on entry, use it in RET & tail call generation; make that vreg def if we // end up saving LR as part of a call frame). Watch this space... -def CSR_AArch64_AAPCS : CalleeSavedRegs<(add LR, FP, X19, X20, X21, X22, +def CSR_AArch64_AAPCS : CalleeSavedRegs<(add X19, X20, X21, X22, X23, X24, + X25, X26, X27, X28, LR, FP, + D8, D9, D10, D11, + D12, D13, D14, D15)>; + +// Darwin puts the frame-record at the top of the callee-save area. +def CSR_Darwin_AArch64_AAPCS : CalleeSavedRegs<(add LR, FP, X19, X20, X21, X22, X23, X24, X25, X26, X27, X28, D8, D9, D10, D11, D12, D13, D14, D15)>; @@ -314,17 +379,24 @@ def CSR_AArch64_AAPCS : CalleeSavedRegs<(add LR, FP, X19, X20, X21, X22, // Win64 has unwinding codes for an (FP,LR) pair, save_fplr and save_fplr_x. // We put FP before LR, so that frame lowering logic generates (FP,LR) pairs, // and not (LR,FP) pairs. -def CSR_Win_AArch64_AAPCS : CalleeSavedRegs<(add FP, LR, X19, X20, X21, X22, - X23, X24, X25, X26, X27, X28, +def CSR_Win_AArch64_AAPCS : CalleeSavedRegs<(add X19, X20, X21, X22, X23, X24, + X25, X26, X27, X28, FP, LR, D8, D9, D10, D11, D12, D13, D14, D15)>; // AArch64 PCS for vector functions (VPCS) // must (additionally) preserve full Q8-Q23 registers -def CSR_AArch64_AAVPCS : CalleeSavedRegs<(add LR, FP, X19, X20, X21, X22, - X23, X24, X25, X26, X27, X28, +def CSR_AArch64_AAVPCS : CalleeSavedRegs<(add X19, X20, X21, X22, X23, X24, + X25, X26, X27, X28, LR, FP, (sequence "Q%u", 8, 23))>; +// Functions taking SVE arguments or returning an SVE type +// must (additionally) preserve full Z8-Z23 and predicate registers P4-P15 +def CSR_AArch64_SVE_AAPCS : CalleeSavedRegs<(add X19, X20, X21, X22, X23, X24, + X25, X26, X27, X28, LR, FP, + (sequence "Z%u", 8, 23), + (sequence "P%u", 4, 15))>; + // Constructors and destructors return 'this' in the iOS 64-bit C++ ABI; since // 'this' and the pointer return value are both passed in X0 in these cases, // this can be partially modelled by treating X0 as a callee-saved register; @@ -336,7 +408,7 @@ def CSR_AArch64_AAVPCS : CalleeSavedRegs<(add LR, FP, X19, X20, X21, X22, def CSR_AArch64_AAPCS_ThisReturn : CalleeSavedRegs<(add CSR_AArch64_AAPCS, X0)>; def CSR_AArch64_AAPCS_SwiftError - : CalleeSavedRegs<(sub CSR_AArch64_AAPCS, X21)>; + : CalleeSavedRegs<(sub CSR_Darwin_AArch64_AAPCS, X21)>; // The function used by Darwin to obtain the address of a thread-local variable // guarantees more than a normal AAPCS function. x16 and x17 are used on the @@ -352,7 +424,7 @@ def CSR_AArch64_TLS_Darwin // fast path calls a function that follows CSR_AArch64_TLS_Darwin, // CSR_AArch64_CXX_TLS_Darwin should be a subset of CSR_AArch64_TLS_Darwin. 
def CSR_AArch64_CXX_TLS_Darwin - : CalleeSavedRegs<(add CSR_AArch64_AAPCS, + : CalleeSavedRegs<(add CSR_Darwin_AArch64_AAPCS, (sub (sequence "X%u", 1, 28), X15, X16, X17, X18), (sequence "D%u", 0, 31))>; diff --git a/lib/Target/AArch64/AArch64CollectLOH.cpp b/lib/Target/AArch64/AArch64CollectLOH.cpp index 9f324b433209..35e6fef24363 100644 --- a/lib/Target/AArch64/AArch64CollectLOH.cpp +++ b/lib/Target/AArch64/AArch64CollectLOH.cpp @@ -103,6 +103,7 @@ #include "llvm/ADT/BitVector.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/MapVector.h" +#include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/CodeGen/MachineBasicBlock.h" @@ -181,6 +182,7 @@ static bool canDefBePartOfLOH(const MachineInstr &MI) { case AArch64::ADDXri: return canAddBePartOfLOH(MI); case AArch64::LDRXui: + case AArch64::LDRWui: // Check immediate to see if the immediate is an address. switch (MI.getOperand(2).getType()) { default: @@ -312,7 +314,8 @@ static void handleUse(const MachineInstr &MI, const MachineOperand &MO, Info.Type = MCLOH_AdrpAdd; Info.IsCandidate = true; Info.MI0 = &MI; - } else if (MI.getOpcode() == AArch64::LDRXui && + } else if ((MI.getOpcode() == AArch64::LDRXui || + MI.getOpcode() == AArch64::LDRWui) && MI.getOperand(2).getTargetFlags() & AArch64II::MO_GOT) { Info.Type = MCLOH_AdrpLdrGot; Info.IsCandidate = true; @@ -357,7 +360,9 @@ static bool handleMiddleInst(const MachineInstr &MI, LOHInfo &DefInfo, return true; } } else { - assert(MI.getOpcode() == AArch64::LDRXui && "Expect LDRXui"); + assert((MI.getOpcode() == AArch64::LDRXui || + MI.getOpcode() == AArch64::LDRWui) && + "Expect LDRXui or LDRWui"); assert((MI.getOperand(2).getTargetFlags() & AArch64II::MO_GOT) && "Expected GOT relocation"); if (OpInfo.Type == MCLOH_AdrpAddStr && OpInfo.MI1 == nullptr) { @@ -474,13 +479,23 @@ static void handleNormalInst(const MachineInstr &MI, LOHInfo *LOHInfos) { handleClobber(LOHInfos[Idx]); } // Handle uses. + + SmallSet UsesSeen; for (const MachineOperand &MO : MI.uses()) { if (!MO.isReg() || !MO.readsReg()) continue; int Idx = mapRegToGPRIndex(MO.getReg()); if (Idx < 0) continue; - handleUse(MI, MO, LOHInfos[Idx]); + + // Multiple uses of the same register within a single instruction don't + // count as MultiUser or block optimization. This is especially important on + // arm64_32, where any memory operation is likely to be an explicit use of + // xN and an implicit use of wN (the base address register). + if (!UsesSeen.count(Idx)) { + handleUse(MI, MO, LOHInfos[Idx]); + UsesSeen.insert(Idx); + } } } @@ -512,6 +527,7 @@ bool AArch64CollectLOH::runOnMachineFunction(MachineFunction &MF) { switch (Opcode) { case AArch64::ADDXri: case AArch64::LDRXui: + case AArch64::LDRWui: if (canDefBePartOfLOH(MI)) { const MachineOperand &Def = MI.getOperand(0); const MachineOperand &Op = MI.getOperand(1); diff --git a/lib/Target/AArch64/AArch64Combine.td b/lib/Target/AArch64/AArch64Combine.td new file mode 100644 index 000000000000..bb99f2516ecf --- /dev/null +++ b/lib/Target/AArch64/AArch64Combine.td @@ -0,0 +1,18 @@ +//=- AArch64.td - Define AArch64 Combine Rules ---------------*- tablegen -*-=// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// +//===----------------------------------------------------------------------===// + +include "llvm/Target/GlobalISel/Combine.td" + +def AArch64PreLegalizerCombinerHelper: GICombinerHelper< + "AArch64GenPreLegalizerCombinerHelper", [all_combines, + elide_br_by_inverting_cond]> { + let DisableRuleOption = "aarch64prelegalizercombiner-disable-rule"; +} diff --git a/lib/Target/AArch64/AArch64CondBrTuning.cpp b/lib/Target/AArch64/AArch64CondBrTuning.cpp index 453132e09669..25e23e4623de 100644 --- a/lib/Target/AArch64/AArch64CondBrTuning.cpp +++ b/lib/Target/AArch64/AArch64CondBrTuning.cpp @@ -78,7 +78,7 @@ void AArch64CondBrTuning::getAnalysisUsage(AnalysisUsage &AU) const { } MachineInstr *AArch64CondBrTuning::getOperandDef(const MachineOperand &MO) { - if (!TargetRegisterInfo::isVirtualRegister(MO.getReg())) + if (!Register::isVirtualRegister(MO.getReg())) return nullptr; return MRI->getUniqueVRegDef(MO.getReg()); } @@ -98,7 +98,7 @@ MachineInstr *AArch64CondBrTuning::convertToFlagSetting(MachineInstr &MI, } bool Is64Bit; unsigned NewOpc = TII->convertToFlagSettingOpc(MI.getOpcode(), Is64Bit); - unsigned NewDestReg = MI.getOperand(0).getReg(); + Register NewDestReg = MI.getOperand(0).getReg(); if (MRI->hasOneNonDBGUse(MI.getOperand(0).getReg())) NewDestReg = Is64Bit ? AArch64::XZR : AArch64::WZR; diff --git a/lib/Target/AArch64/AArch64ConditionalCompares.cpp b/lib/Target/AArch64/AArch64ConditionalCompares.cpp index 2cfbcc592d6a..43ae9f8ec47f 100644 --- a/lib/Target/AArch64/AArch64ConditionalCompares.cpp +++ b/lib/Target/AArch64/AArch64ConditionalCompares.cpp @@ -220,7 +220,7 @@ bool SSACCmpConv::trivialTailPHIs() { // PHI operands come in (VReg, MBB) pairs. for (unsigned oi = 1, oe = I.getNumOperands(); oi != oe; oi += 2) { MachineBasicBlock *MBB = I.getOperand(oi + 1).getMBB(); - unsigned Reg = I.getOperand(oi).getReg(); + Register Reg = I.getOperand(oi).getReg(); if (MBB == Head) { assert((!HeadReg || HeadReg == Reg) && "Inconsistent PHI operands"); HeadReg = Reg; @@ -259,7 +259,7 @@ bool SSACCmpConv::isDeadDef(unsigned DstReg) { // Writes to the zero register are dead. if (DstReg == AArch64::WZR || DstReg == AArch64::XZR) return true; - if (!TargetRegisterInfo::isVirtualRegister(DstReg)) + if (!Register::isVirtualRegister(DstReg)) return false; // A virtual register def without any uses will be marked dead later, and // eventually replaced by the zero register. @@ -631,7 +631,7 @@ void SSACCmpConv::convert(SmallVectorImpl &RemovedBlocks) { } const MCInstrDesc &MCID = TII->get(Opc); // Create a dummy virtual register for the SUBS def. - unsigned DestReg = + Register DestReg = MRI->createVirtualRegister(TII->getRegClass(MCID, 0, TRI, *MF)); // Insert a SUBS Rn, #0 instruction instead of the cbz / cbnz. BuildMI(*Head, Head->end(), TermDL, MCID) diff --git a/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp b/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp index a43077cb88ec..bc3808df1dbc 100644 --- a/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp +++ b/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp @@ -145,8 +145,8 @@ void AArch64DeadRegisterDefinitions::processMachineBasicBlock( continue; // We should not have any relevant physreg defs that are replacable by // zero before register allocation. So we just check for dead vreg defs. 
- unsigned Reg = MO.getReg(); - if (!TargetRegisterInfo::isVirtualRegister(Reg) || + Register Reg = MO.getReg(); + if (!Register::isVirtualRegister(Reg) || (!MO.isDead() && !MRI->use_nodbg_empty(Reg))) continue; assert(!MO.isImplicit() && "Unexpected implicit def!"); diff --git a/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp index 210c10eb1842..082e17e44d04 100644 --- a/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp +++ b/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp @@ -109,7 +109,7 @@ bool AArch64ExpandPseudo::expandMOVImm(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, unsigned BitSize) { MachineInstr &MI = *MBBI; - unsigned DstReg = MI.getOperand(0).getReg(); + Register DstReg = MI.getOperand(0).getReg(); uint64_t Imm = MI.getOperand(1).getImm(); if (DstReg == AArch64::XZR || DstReg == AArch64::WZR) { @@ -150,7 +150,7 @@ bool AArch64ExpandPseudo::expandMOVImm(MachineBasicBlock &MBB, } break; case AArch64::MOVKWi: case AArch64::MOVKXi: { - unsigned DstReg = MI.getOperand(0).getReg(); + Register DstReg = MI.getOperand(0).getReg(); bool DstIsDead = MI.getOperand(0).isDead(); MIBS.push_back(BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(I->Opcode)) .addReg(DstReg, @@ -174,14 +174,14 @@ bool AArch64ExpandPseudo::expandCMP_SWAP( MachineInstr &MI = *MBBI; DebugLoc DL = MI.getDebugLoc(); const MachineOperand &Dest = MI.getOperand(0); - unsigned StatusReg = MI.getOperand(1).getReg(); + Register StatusReg = MI.getOperand(1).getReg(); bool StatusDead = MI.getOperand(1).isDead(); // Duplicating undef operands into 2 instructions does not guarantee the same // value on both; However undef should be replaced by xzr anyway. assert(!MI.getOperand(2).isUndef() && "cannot handle undef"); - unsigned AddrReg = MI.getOperand(2).getReg(); - unsigned DesiredReg = MI.getOperand(3).getReg(); - unsigned NewReg = MI.getOperand(4).getReg(); + Register AddrReg = MI.getOperand(2).getReg(); + Register DesiredReg = MI.getOperand(3).getReg(); + Register NewReg = MI.getOperand(4).getReg(); MachineFunction *MF = MBB.getParent(); auto LoadCmpBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock()); @@ -254,16 +254,16 @@ bool AArch64ExpandPseudo::expandCMP_SWAP_128( DebugLoc DL = MI.getDebugLoc(); MachineOperand &DestLo = MI.getOperand(0); MachineOperand &DestHi = MI.getOperand(1); - unsigned StatusReg = MI.getOperand(2).getReg(); + Register StatusReg = MI.getOperand(2).getReg(); bool StatusDead = MI.getOperand(2).isDead(); // Duplicating undef operands into 2 instructions does not guarantee the same // value on both; However undef should be replaced by xzr anyway. 
assert(!MI.getOperand(3).isUndef() && "cannot handle undef"); - unsigned AddrReg = MI.getOperand(3).getReg(); - unsigned DesiredLoReg = MI.getOperand(4).getReg(); - unsigned DesiredHiReg = MI.getOperand(5).getReg(); - unsigned NewLoReg = MI.getOperand(6).getReg(); - unsigned NewHiReg = MI.getOperand(7).getReg(); + Register AddrReg = MI.getOperand(3).getReg(); + Register DesiredLoReg = MI.getOperand(4).getReg(); + Register DesiredHiReg = MI.getOperand(5).getReg(); + Register NewLoReg = MI.getOperand(6).getReg(); + Register NewHiReg = MI.getOperand(7).getReg(); MachineFunction *MF = MBB.getParent(); auto LoadCmpBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock()); @@ -475,7 +475,7 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB, case AArch64::LOADgot: { MachineFunction *MF = MBB.getParent(); - unsigned DstReg = MI.getOperand(0).getReg(); + Register DstReg = MI.getOperand(0).getReg(); const MachineOperand &MO1 = MI.getOperand(1); unsigned Flags = MO1.getTargetFlags(); @@ -495,12 +495,26 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB, } } else { // Small codemodel expand into ADRP + LDR. + MachineFunction &MF = *MI.getParent()->getParent(); + DebugLoc DL = MI.getDebugLoc(); MachineInstrBuilder MIB1 = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ADRP), DstReg); - MachineInstrBuilder MIB2 = - BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::LDRXui)) - .add(MI.getOperand(0)) - .addReg(DstReg); + + MachineInstrBuilder MIB2; + if (MF.getSubtarget().isTargetILP32()) { + auto TRI = MBB.getParent()->getSubtarget().getRegisterInfo(); + unsigned Reg32 = TRI->getSubReg(DstReg, AArch64::sub_32); + unsigned DstFlags = MI.getOperand(0).getTargetFlags(); + MIB2 = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::LDRWui)) + .addDef(Reg32) + .addReg(DstReg, RegState::Kill) + .addReg(DstReg, DstFlags | RegState::Implicit); + } else { + unsigned DstReg = MI.getOperand(0).getReg(); + MIB2 = BuildMI(MBB, MBBI, DL, TII->get(AArch64::LDRXui)) + .add(MI.getOperand(0)) + .addUse(DstReg, RegState::Kill); + } if (MO1.isGlobal()) { MIB1.addGlobalAddress(MO1.getGlobal(), 0, Flags | AArch64II::MO_PAGE); @@ -534,11 +548,28 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB, case AArch64::MOVaddrTLS: case AArch64::MOVaddrEXT: { // Expand into ADRP + ADD. - unsigned DstReg = MI.getOperand(0).getReg(); + Register DstReg = MI.getOperand(0).getReg(); MachineInstrBuilder MIB1 = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ADRP), DstReg) .add(MI.getOperand(1)); + if (MI.getOperand(1).getTargetFlags() & AArch64II::MO_TAGGED) { + // MO_TAGGED on the page indicates a tagged address. Set the tag now. + // We do so by creating a MOVK that sets bits 48-63 of the register to + // (global address + 0x100000000 - PC) >> 48. This assumes that we're in + // the small code model so we can assume a binary size of <= 4GB, which + // makes the untagged PC relative offset positive. The binary must also be + // loaded into address range [0, 2^48). Both of these properties need to + // be ensured at runtime when using tagged addresses. 
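// Illustrative sketch (not part of the patch): what a MOVK with a logical
// shift of 48 does to its destination, using a simplified arithmetic model
// rather than the real MC layer. The MOVKXi built just below splices the
// 16-bit tag ((global address + 0x100000000 - PC) >> 48) into bits 48-63 of
// DstReg while leaving the ADRP-derived bits 0-47 untouched.
#include <cstdint>
static uint64_t movkLsl48(uint64_t Reg, uint16_t Imm16) {
  // MOVK Xd, #Imm16, lsl #48: keep bits 0-47, replace bits 48-63.
  return (Reg & 0x0000FFFFFFFFFFFFULL) | (uint64_t(Imm16) << 48);
}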
+ auto Tag = MI.getOperand(1); + Tag.setTargetFlags(AArch64II::MO_PREL | AArch64II::MO_G3); + Tag.setOffset(0x100000000); + BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::MOVKXi), DstReg) + .addReg(DstReg) + .add(Tag) + .addImm(48); + } + MachineInstrBuilder MIB2 = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ADDXri)) .add(MI.getOperand(0)) @@ -561,7 +592,7 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB, return true; case AArch64::MOVbaseTLS: { - unsigned DstReg = MI.getOperand(0).getReg(); + Register DstReg = MI.getOperand(0).getReg(); auto SysReg = AArch64SysReg::TPIDR_EL0; MachineFunction *MF = MBB.getParent(); if (MF->getTarget().getTargetTriple().isOSFuchsia() && @@ -642,11 +673,12 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB, // instruction sequence. int BaseOffset = -AFI->getTaggedBasePointerOffset(); unsigned FrameReg; - int FrameRegOffset = TFI->resolveFrameOffsetReference( - MF, BaseOffset, false /*isFixed*/, FrameReg, /*PreferFP=*/false, + StackOffset FrameRegOffset = TFI->resolveFrameOffsetReference( + MF, BaseOffset, false /*isFixed*/, false /*isSVE*/, FrameReg, + /*PreferFP=*/false, /*ForSimm=*/true); Register SrcReg = FrameReg; - if (FrameRegOffset != 0) { + if (FrameRegOffset) { // Use output register as temporary. SrcReg = MI.getOperand(0).getReg(); emitFrameOffset(MBB, &MI, MI.getDebugLoc(), SrcReg, FrameReg, diff --git a/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp b/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp index 3b3182128c4c..b54fc2e51bac 100644 --- a/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp +++ b/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp @@ -642,7 +642,7 @@ static Optional getLoadInfo(const MachineInstr &MI) { } // Loads from the stack pointer don't get prefetched. - unsigned BaseReg = MI.getOperand(BaseRegIdx).getReg(); + Register BaseReg = MI.getOperand(BaseRegIdx).getReg(); if (BaseReg == AArch64::SP || BaseReg == AArch64::WSP) return None; diff --git a/lib/Target/AArch64/AArch64FastISel.cpp b/lib/Target/AArch64/AArch64FastISel.cpp index 8dc2768b9597..277a3052f1e5 100644 --- a/lib/Target/AArch64/AArch64FastISel.cpp +++ b/lib/Target/AArch64/AArch64FastISel.cpp @@ -459,7 +459,7 @@ unsigned AArch64FastISel::materializeGV(const GlobalValue *GV) { if (!Subtarget->useSmallAddressing() && !Subtarget->isTargetMachO()) return 0; - unsigned char OpFlags = Subtarget->ClassifyGlobalReference(GV, TM); + unsigned OpFlags = Subtarget->ClassifyGlobalReference(GV, TM); EVT DestEVT = TLI.getValueType(DL, GV->getType(), true); if (!DestEVT.isSimple()) @@ -474,12 +474,32 @@ unsigned AArch64FastISel::materializeGV(const GlobalValue *GV) { ADRPReg) .addGlobalAddress(GV, 0, AArch64II::MO_PAGE | OpFlags); - ResultReg = createResultReg(&AArch64::GPR64RegClass); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::LDRXui), + unsigned LdrOpc; + if (Subtarget->isTargetILP32()) { + ResultReg = createResultReg(&AArch64::GPR32RegClass); + LdrOpc = AArch64::LDRWui; + } else { + ResultReg = createResultReg(&AArch64::GPR64RegClass); + LdrOpc = AArch64::LDRXui; + } + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(LdrOpc), ResultReg) - .addReg(ADRPReg) - .addGlobalAddress(GV, 0, - AArch64II::MO_PAGEOFF | AArch64II::MO_NC | OpFlags); + .addReg(ADRPReg) + .addGlobalAddress(GV, 0, AArch64II::MO_GOT | AArch64II::MO_PAGEOFF | + AArch64II::MO_NC | OpFlags); + if (!Subtarget->isTargetILP32()) + return ResultReg; + + // LDRWui produces a 32-bit register, but pointers in-register are 64-bits + // so we must extend the result on 
ILP32. + unsigned Result64 = createResultReg(&AArch64::GPR64RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::SUBREG_TO_REG)) + .addDef(Result64) + .addImm(0) + .addReg(ResultReg, RegState::Kill) + .addImm(AArch64::sub_32); + return Result64; } else { // ADRP + ADDX BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ADRP), @@ -504,6 +524,15 @@ unsigned AArch64FastISel::fastMaterializeConstant(const Constant *C) { if (!CEVT.isSimple()) return 0; MVT VT = CEVT.getSimpleVT(); + // arm64_32 has 32-bit pointers held in 64-bit registers. Because of that, + // 'null' pointers need to have a somewhat special treatment. + if (const auto *CPN = dyn_cast(C)) { + (void)CPN; + assert(CPN->getType()->getPointerAddressSpace() == 0 && + "Unexpected address space"); + assert(VT == MVT::i64 && "Expected 64-bit pointers"); + return materializeInt(ConstantInt::get(Type::getInt64Ty(*Context), 0), VT); + } if (const auto *CI = dyn_cast(C)) return materializeInt(CI, VT); @@ -946,6 +975,9 @@ bool AArch64FastISel::computeCallAddress(const Value *V, Address &Addr) { bool AArch64FastISel::isTypeLegal(Type *Ty, MVT &VT) { EVT evt = TLI.getValueType(DL, Ty, true); + if (Subtarget->isTargetILP32() && Ty->isPointerTy()) + return false; + // Only handle simple types. if (evt == MVT::Other || !evt.isSimple()) return false; @@ -988,6 +1020,9 @@ bool AArch64FastISel::isValueAvailable(const Value *V) const { } bool AArch64FastISel::simplifyAddress(Address &Addr, MVT VT) { + if (Subtarget->isTargetILP32()) + return false; + unsigned ScaleFactor = getImplicitScaleFactor(VT); if (!ScaleFactor) return false; @@ -3165,6 +3200,11 @@ bool AArch64FastISel::fastLowerCall(CallLoweringInfo &CLI) { if (IsTailCall) return false; + // FIXME: we could and should support this, but for now correctness at -O0 is + // more important. + if (Subtarget->isTargetILP32()) + return false; + CodeModel::Model CM = TM.getCodeModel(); // Only support the small-addressing and large code models. if (CM != CodeModel::Large && !Subtarget->useSmallAddressing()) @@ -3434,8 +3474,8 @@ bool AArch64FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) { MFI.setFrameAddressIsTaken(true); const AArch64RegisterInfo *RegInfo = Subtarget->getRegisterInfo(); - unsigned FramePtr = RegInfo->getFrameRegister(*(FuncInfo.MF)); - unsigned SrcReg = MRI.createVirtualRegister(&AArch64::GPR64RegClass); + Register FramePtr = RegInfo->getFrameRegister(*(FuncInfo.MF)); + Register SrcReg = MRI.createVirtualRegister(&AArch64::GPR64RegClass); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::COPY), SrcReg).addReg(FramePtr); // Recursively load frame address @@ -3796,6 +3836,11 @@ bool AArch64FastISel::selectRet(const Instruction *I) { if (!FuncInfo.CanLowerReturn) return false; + // FIXME: in principle it could. Mostly just a case of zero extending outgoing + // pointers. + if (Subtarget->isTargetILP32()) + return false; + if (F.isVarArg()) return false; @@ -3842,7 +3887,7 @@ bool AArch64FastISel::selectRet(const Instruction *I) { return false; unsigned SrcReg = Reg + VA.getValNo(); - unsigned DestReg = VA.getLocReg(); + Register DestReg = VA.getLocReg(); // Avoid a cross-class copy. This is very unlikely. if (!MRI.getRegClass(SrcReg)->contains(DestReg)) return false; @@ -3970,7 +4015,7 @@ unsigned AArch64FastISel::emiti1Ext(unsigned SrcReg, MVT DestVT, bool IsZExt) { if (DestVT == MVT::i64) { // We're ZExt i1 to i64. The ANDWri Wd, Ws, #1 implicitly clears the // upper 32 bits. 
Emit a SUBREG_TO_REG to extend from Wd to Xd. - unsigned Reg64 = MRI.createVirtualRegister(&AArch64::GPR64RegClass); + Register Reg64 = MRI.createVirtualRegister(&AArch64::GPR64RegClass); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::SUBREG_TO_REG), Reg64) .addImm(0) @@ -4123,7 +4168,7 @@ unsigned AArch64FastISel::emitLSL_ri(MVT RetVT, MVT SrcVT, unsigned Op0, }; unsigned Opc = OpcTable[IsZExt][Is64Bit]; if (SrcVT.SimpleTy <= MVT::i32 && RetVT == MVT::i64) { - unsigned TmpReg = MRI.createVirtualRegister(RC); + Register TmpReg = MRI.createVirtualRegister(RC); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::SUBREG_TO_REG), TmpReg) .addImm(0) @@ -4244,7 +4289,7 @@ unsigned AArch64FastISel::emitLSR_ri(MVT RetVT, MVT SrcVT, unsigned Op0, }; unsigned Opc = OpcTable[IsZExt][Is64Bit]; if (SrcVT.SimpleTy <= MVT::i32 && RetVT == MVT::i64) { - unsigned TmpReg = MRI.createVirtualRegister(RC); + Register TmpReg = MRI.createVirtualRegister(RC); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::SUBREG_TO_REG), TmpReg) .addImm(0) @@ -4353,7 +4398,7 @@ unsigned AArch64FastISel::emitASR_ri(MVT RetVT, MVT SrcVT, unsigned Op0, }; unsigned Opc = OpcTable[IsZExt][Is64Bit]; if (SrcVT.SimpleTy <= MVT::i32 && RetVT == MVT::i64) { - unsigned TmpReg = MRI.createVirtualRegister(RC); + Register TmpReg = MRI.createVirtualRegister(RC); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::SUBREG_TO_REG), TmpReg) .addImm(0) @@ -4412,7 +4457,7 @@ unsigned AArch64FastISel::emitIntExt(MVT SrcVT, unsigned SrcReg, MVT DestVT, if (DestVT == MVT::i8 || DestVT == MVT::i16) DestVT = MVT::i32; else if (DestVT == MVT::i64) { - unsigned Src64 = MRI.createVirtualRegister(&AArch64::GPR64RegClass); + Register Src64 = MRI.createVirtualRegister(&AArch64::GPR64RegClass); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::SUBREG_TO_REG), Src64) .addImm(0) @@ -4495,7 +4540,7 @@ bool AArch64FastISel::optimizeIntExtLoad(const Instruction *I, MVT RetVT, const auto *LoadMI = MI; if (LoadMI->getOpcode() == TargetOpcode::COPY && LoadMI->getOperand(1).getSubReg() == AArch64::sub_32) { - unsigned LoadReg = MI->getOperand(1).getReg(); + Register LoadReg = MI->getOperand(1).getReg(); LoadMI = MRI.getUniqueVRegDef(LoadReg); assert(LoadMI && "Expected valid instruction"); } diff --git a/lib/Target/AArch64/AArch64FrameLowering.cpp b/lib/Target/AArch64/AArch64FrameLowering.cpp index 8c6e5cbd5c13..68e1e6a30224 100644 --- a/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -44,11 +44,19 @@ // | | // |-----------------------------------| // | | -// | prev_fp, prev_lr | +// | callee-saved gpr registers | <--. +// | | | On Darwin platforms these +// |- - - - - - - - - - - - - - - - - -| | callee saves are swapped, +// | | | (frame record first) +// | prev_fp, prev_lr | <--' // | (a.k.a. "frame record") | // |-----------------------------------| <- fp(=x29) // | | -// | other callee-saved registers | +// | callee-saved fp/simd/SVE regs | +// | | +// |-----------------------------------| +// | | +// | SVE stack objects | // | | // |-----------------------------------| // |.empty.space.to.make.part.below....| @@ -80,6 +88,20 @@ // * A frame pointer is definitely needed when there are local variables with // more-than-default alignment requirements. 
// +// For Darwin platforms the frame-record (fp, lr) is stored at the top of the +// callee-saved area, since the unwind encoding does not allow for encoding +// this dynamically and existing tools depend on this layout. For other +// platforms, the frame-record is stored at the bottom of the (gpr) callee-saved +// area to allow SVE stack objects (allocated directly below the callee-saves, +// if available) to be accessed directly from the framepointer. +// The SVE spill/fill instructions have VL-scaled addressing modes such +// as: +// ldr z8, [fp, #-7 mul vl] +// For SVE the size of the vector length (VL) is not known at compile-time, so +// '#-7 mul vl' is an offset that can only be evaluated at runtime. With this +// layout, we don't need to add an unscaled offset to the framepointer before +// accessing the SVE object in the frame. +// // In some cases when a base pointer is not strictly needed, it is generated // anyway when offsets from the frame pointer to access local variables become // so large that the offset can't be encoded in the immediate fields of loads @@ -94,6 +116,7 @@ #include "AArch64InstrInfo.h" #include "AArch64MachineFunctionInfo.h" #include "AArch64RegisterInfo.h" +#include "AArch64StackOffset.h" #include "AArch64Subtarget.h" #include "AArch64TargetMachine.h" #include "MCTargetDesc/AArch64AddressingModes.h" @@ -173,7 +196,7 @@ static unsigned estimateRSStackSizeLimit(MachineFunction &MF) { if (!MO.isFI()) continue; - int Offset = 0; + StackOffset Offset; if (isAArch64FrameOffsetLegal(MI, Offset, nullptr, nullptr, nullptr) == AArch64FrameOffsetCannotUpdate) return 0; @@ -183,6 +206,12 @@ static unsigned estimateRSStackSizeLimit(MachineFunction &MF) { return DefaultSafeSPDisplacement; } +/// Returns the size of the entire SVE stackframe (calleesaves + spills). +static StackOffset getSVEStackSize(const MachineFunction &MF) { + const AArch64FunctionInfo *AFI = MF.getInfo(); + return {(int64_t)AFI->getStackSizeSVE(), MVT::nxv1i8}; +} + bool AArch64FrameLowering::canUseRedZone(const MachineFunction &MF) const { if (!EnableRedZone) return false; @@ -195,7 +224,8 @@ bool AArch64FrameLowering::canUseRedZone(const MachineFunction &MF) const { const AArch64FunctionInfo *AFI = MF.getInfo(); unsigned NumBytes = AFI->getLocalStackSize(); - return !(MFI.hasCalls() || hasFP(MF) || NumBytes > 128); + return !(MFI.hasCalls() || hasFP(MF) || NumBytes > 128 || + getSVEStackSize(MF)); } /// hasFP - Return true if the specified function should have a dedicated frame @@ -273,14 +303,15 @@ MachineBasicBlock::iterator AArch64FrameLowering::eliminateCallFramePseudoInstr( // Most call frames will be allocated at the start of a function so // this is OK, but it is a limitation that needs dealing with. assert(Amount > -0xffffff && Amount < 0xffffff && "call frame too large"); - emitFrameOffset(MBB, I, DL, AArch64::SP, AArch64::SP, Amount, TII); + emitFrameOffset(MBB, I, DL, AArch64::SP, AArch64::SP, {Amount, MVT::i8}, + TII); } } else if (CalleePopAmount != 0) { // If the calling convention demands that the callee pops arguments from the // stack, we want to add it back if we have a reserved call frame. 
assert(CalleePopAmount < 0xffffff && "call frame too large"); - emitFrameOffset(MBB, I, DL, AArch64::SP, AArch64::SP, -CalleePopAmount, - TII); + emitFrameOffset(MBB, I, DL, AArch64::SP, AArch64::SP, + {-(int64_t)CalleePopAmount, MVT::i8}, TII); } return MBB.erase(I); } @@ -416,6 +447,9 @@ bool AArch64FrameLowering::shouldCombineCSRLocalStackBump( const AArch64Subtarget &Subtarget = MF.getSubtarget(); const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); + if (MF.getFunction().hasOptSize()) + return false; + if (AFI->getLocalStackSize() == 0) return false; @@ -436,6 +470,11 @@ bool AArch64FrameLowering::shouldCombineCSRLocalStackBump( if (canUseRedZone(MF)) return false; + // When there is an SVE area on the stack, always allocate the + // callee-saves and spills/locals separately. + if (getSVEStackSize(MF)) + return false; + return true; } @@ -474,8 +513,8 @@ static MachineBasicBlock::iterator InsertSEH(MachineBasicBlock::iterator MBBI, Imm = -Imm; LLVM_FALLTHROUGH; case AArch64::STPXpre: { - unsigned Reg0 = MBBI->getOperand(1).getReg(); - unsigned Reg1 = MBBI->getOperand(2).getReg(); + Register Reg0 = MBBI->getOperand(1).getReg(); + Register Reg1 = MBBI->getOperand(2).getReg(); if (Reg0 == AArch64::FP && Reg1 == AArch64::LR) MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveFPLR_X)) .addImm(Imm * 8) @@ -523,8 +562,8 @@ static MachineBasicBlock::iterator InsertSEH(MachineBasicBlock::iterator MBBI, } case AArch64::STPXi: case AArch64::LDPXi: { - unsigned Reg0 = MBBI->getOperand(0).getReg(); - unsigned Reg1 = MBBI->getOperand(1).getReg(); + Register Reg0 = MBBI->getOperand(0).getReg(); + Register Reg1 = MBBI->getOperand(1).getReg(); if (Reg0 == AArch64::FP && Reg1 == AArch64::LR) MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveFPLR)) .addImm(Imm * 8) @@ -791,6 +830,10 @@ static bool needsWinCFI(const MachineFunction &MF) { F.needsUnwindTableEntry(); } +static bool isTargetDarwin(const MachineFunction &MF) { + return MF.getSubtarget().isTargetDarwin(); +} + void AArch64FrameLowering::emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const { MachineBasicBlock::iterator MBBI = MBB.begin(); @@ -846,6 +889,8 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, // Ideally it should match SP value after prologue. AFI->setTaggedBasePointerOffset(MFI.getStackSize()); + const StackOffset &SVEStackSize = getSVEStackSize(MF); + // getStackSize() includes all the locals in its size calculation. We don't // include these locals when computing the stack size of a funclet, as they // are allocated in the parent's stack frame and accessed via the frame @@ -856,6 +901,8 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, : (int)MFI.getStackSize(); if (!AFI->hasStackFrame() && !windowsRequiresStackProbe(MF, NumBytes)) { assert(!HasFP && "unexpected function without stack frame but with FP"); + assert(!SVEStackSize && + "unexpected function without stack frame but with SVE objects"); // All of the stack allocation is for locals. 
AFI->setLocalStackSize(NumBytes); if (!NumBytes) @@ -866,8 +913,9 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, AFI->setHasRedZone(true); ++NumRedZoneFunctions; } else { - emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP, -NumBytes, TII, - MachineInstr::FrameSetup, false, NeedsWinCFI, &HasWinCFI); + emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP, + {-NumBytes, MVT::i8}, TII, MachineInstr::FrameSetup, + false, NeedsWinCFI, &HasWinCFI); if (!NeedsWinCFI) { // Label used to tie together the PROLOG_LABEL and the MachineMoves. MCSymbol *FrameLabel = MMI.getContext().createTempSymbol(); @@ -901,8 +949,10 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, AFI->setLocalStackSize(NumBytes - PrologueSaveSize); bool CombineSPBump = shouldCombineCSRLocalStackBump(MF, NumBytes); if (CombineSPBump) { - emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP, -NumBytes, TII, - MachineInstr::FrameSetup, false, NeedsWinCFI, &HasWinCFI); + assert(!SVEStackSize && "Cannot combine SP bump with SVE"); + emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP, + {-NumBytes, MVT::i8}, TII, MachineInstr::FrameSetup, false, + NeedsWinCFI, &HasWinCFI); NumBytes = 0; } else if (PrologueSaveSize != 0) { MBBI = convertCalleeSaveRestoreToSPPrePostIncDec( @@ -948,9 +998,9 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, } if (HasFP) { - // Only set up FP if we actually need to. Frame pointer is fp = - // sp - fixedobject - 16. - int FPOffset = AFI->getCalleeSavedStackSize() - 16; + // Only set up FP if we actually need to. + int FPOffset = isTargetDarwin(MF) ? (AFI->getCalleeSavedStackSize() - 16) : 0; + if (CombineSPBump) FPOffset += AFI->getLocalStackSize(); @@ -958,8 +1008,9 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, // mov fp,sp when FPOffset is zero. // Note: All stores of callee-saved registers are marked as "FrameSetup". // This code marks the instruction(s) that set the FP also. - emitFrameOffset(MBB, MBBI, DL, AArch64::FP, AArch64::SP, FPOffset, TII, - MachineInstr::FrameSetup, false, NeedsWinCFI, &HasWinCFI); + emitFrameOffset(MBB, MBBI, DL, AArch64::FP, AArch64::SP, + {FPOffset, MVT::i8}, TII, MachineInstr::FrameSetup, false, + NeedsWinCFI, &HasWinCFI); } if (windowsRequiresStackProbe(MF, NumBytes)) { @@ -1056,6 +1107,9 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, NumBytes = 0; } + emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP, -SVEStackSize, TII, + MachineInstr::FrameSetup); + // Allocate space for the rest of the frame. if (NumBytes) { const bool NeedsRealignment = RegInfo->needsStackRealignment(MF); @@ -1071,8 +1125,9 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, // FIXME: in the case of dynamic re-alignment, NumBytes doesn't have // the correct value here, as NumBytes also includes padding bytes, // which shouldn't be counted here. 
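// Illustrative sketch (not part of the patch): a minimal model of the
// StackOffset values that emitFrameOffset now takes, e.g. {-NumBytes, MVT::i8}
// just below or {StackSizeSVE, MVT::nxv1i8} for the SVE area. The real
// AArch64StackOffset class is richer; this only shows the idea of carrying the
// fixed byte part and the vector-length-scaled part separately, so an offset
// like "#16 + #-7 mul vl" stays a single value until it is materialized.
#include <cstdint>
struct StackOffsetSketch {
  int64_t Bytes;         // fixed part, known at compile time
  int64_t ScalableBytes; // part multiplied by the runtime SVE vector length
  StackOffsetSketch(int64_t B = 0, int64_t SB = 0)
      : Bytes(B), ScalableBytes(SB) {}
  // Mirrors sums such as SVEStackSize + StackOffset(ObjectOffset, MVT::nxv1i8).
  StackOffsetSketch operator+(const StackOffsetSketch &O) const {
    return StackOffsetSketch(Bytes + O.Bytes, ScalableBytes + O.ScalableBytes);
  }
  // Mirrors tests such as "if (SVEStackSize)" / "if (FrameRegOffset)" above.
  explicit operator bool() const { return Bytes != 0 || ScalableBytes != 0; }
};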
- emitFrameOffset(MBB, MBBI, DL, scratchSPReg, AArch64::SP, -NumBytes, TII, - MachineInstr::FrameSetup, false, NeedsWinCFI, &HasWinCFI); + emitFrameOffset(MBB, MBBI, DL, scratchSPReg, AArch64::SP, + {-NumBytes, MVT::i8}, TII, MachineInstr::FrameSetup, + false, NeedsWinCFI, &HasWinCFI); if (NeedsRealignment) { const unsigned Alignment = MFI.getMaxAlignment(); @@ -1130,8 +1185,10 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, if (needsFrameMoves) { const DataLayout &TD = MF.getDataLayout(); - const int StackGrowth = -TD.getPointerSize(0); - unsigned FramePtr = RegInfo->getFrameRegister(MF); + const int StackGrowth = isTargetDarwin(MF) + ? (2 * -TD.getPointerSize(0)) + : -AFI->getCalleeSavedStackSize(); + Register FramePtr = RegInfo->getFrameRegister(MF); // An example of the prologue: // // .globl __foo @@ -1202,7 +1259,7 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, // Define the current CFA rule to use the provided FP. unsigned Reg = RegInfo->getDwarfRegNum(FramePtr, true); unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createDefCfa( - nullptr, Reg, 2 * StackGrowth - FixedObject)); + nullptr, Reg, StackGrowth - FixedObject)); BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) .addCFIIndex(CFIIndex) .setMIFlags(MachineInstr::FrameSetup); @@ -1401,11 +1458,14 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF, .setMIFlag(MachineInstr::FrameDestroy); } + const StackOffset &SVEStackSize = getSVEStackSize(MF); + // If there is a single SP update, insert it before the ret and we're done. if (CombineSPBump) { + assert(!SVEStackSize && "Cannot combine SP bump with SVE"); emitFrameOffset(MBB, MBB.getFirstTerminator(), DL, AArch64::SP, AArch64::SP, - NumBytes + AfterCSRPopSize, TII, MachineInstr::FrameDestroy, - false, NeedsWinCFI, &HasWinCFI); + {NumBytes + (int64_t)AfterCSRPopSize, MVT::i8}, TII, + MachineInstr::FrameDestroy, false, NeedsWinCFI, &HasWinCFI); if (NeedsWinCFI && HasWinCFI) BuildMI(MBB, MBB.getFirstTerminator(), DL, TII->get(AArch64::SEH_EpilogEnd)) @@ -1416,6 +1476,12 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF, NumBytes -= PrologueSaveSize; assert(NumBytes >= 0 && "Negative stack allocation size!?"); + // Deallocate the SVE area. + if (SVEStackSize) + if (!AFI->isStackRealigned()) + emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP, SVEStackSize, + TII, MachineInstr::FrameDestroy); + if (!hasFP(MF)) { bool RedZone = canUseRedZone(MF); // If this was a redzone leaf function, we don't need to restore the @@ -1437,8 +1503,8 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF, adaptForLdStOpt(MBB, MBB.getFirstTerminator(), LastPopI); emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP, - StackRestoreBytes, TII, MachineInstr::FrameDestroy, false, - NeedsWinCFI, &HasWinCFI); + {StackRestoreBytes, MVT::i8}, TII, + MachineInstr::FrameDestroy, false, NeedsWinCFI, &HasWinCFI); if (Done) { if (NeedsWinCFI) { HasWinCFI = true; @@ -1456,13 +1522,16 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF, // FIXME: Rather than doing the math here, we should instead just use // non-post-indexed loads for the restores if we aren't actually going to // be able to save any instructions. - if (!IsFunclet && (MFI.hasVarSizedObjects() || AFI->isStackRealigned())) + if (!IsFunclet && (MFI.hasVarSizedObjects() || AFI->isStackRealigned())) { + int64_t OffsetToFrameRecord = + isTargetDarwin(MF) ? 
(-(int64_t)AFI->getCalleeSavedStackSize() + 16) : 0; emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::FP, - -AFI->getCalleeSavedStackSize() + 16, TII, - MachineInstr::FrameDestroy, false, NeedsWinCFI); - else if (NumBytes) - emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP, NumBytes, TII, - MachineInstr::FrameDestroy, false, NeedsWinCFI); + {OffsetToFrameRecord, MVT::i8}, + TII, MachineInstr::FrameDestroy, false, NeedsWinCFI); + } else if (NumBytes) + emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP, + {NumBytes, MVT::i8}, TII, MachineInstr::FrameDestroy, false, + NeedsWinCFI); // This must be placed after the callee-save restore code because that code // assumes the SP is at the same location as it was after the callee-save save @@ -1483,8 +1552,8 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF, adaptForLdStOpt(MBB, FirstSPPopI, LastPopI); emitFrameOffset(MBB, FirstSPPopI, DL, AArch64::SP, AArch64::SP, - AfterCSRPopSize, TII, MachineInstr::FrameDestroy, false, - NeedsWinCFI, &HasWinCFI); + {(int64_t)AfterCSRPopSize, MVT::i8}, TII, + MachineInstr::FrameDestroy, false, NeedsWinCFI, &HasWinCFI); } if (NeedsWinCFI && HasWinCFI) BuildMI(MBB, MBB.getFirstTerminator(), DL, TII->get(AArch64::SEH_EpilogEnd)) @@ -1501,10 +1570,11 @@ int AArch64FrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI, unsigned &FrameReg) const { return resolveFrameIndexReference( - MF, FI, FrameReg, - /*PreferFP=*/ - MF.getFunction().hasFnAttribute(Attribute::SanitizeHWAddress), - /*ForSimm=*/false); + MF, FI, FrameReg, + /*PreferFP=*/ + MF.getFunction().hasFnAttribute(Attribute::SanitizeHWAddress), + /*ForSimm=*/false) + .getBytes(); } int AArch64FrameLowering::getNonLocalFrameIndexReference( @@ -1512,18 +1582,19 @@ int AArch64FrameLowering::getNonLocalFrameIndexReference( return getSEHFrameIndexOffset(MF, FI); } -static int getFPOffset(const MachineFunction &MF, int ObjectOffset) { +static StackOffset getFPOffset(const MachineFunction &MF, int ObjectOffset) { const auto *AFI = MF.getInfo(); const auto &Subtarget = MF.getSubtarget(); bool IsWin64 = Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv()); unsigned FixedObject = IsWin64 ? alignTo(AFI->getVarArgsGPRSize(), 16) : 0; - return ObjectOffset + FixedObject + 16; + unsigned FPAdjust = isTargetDarwin(MF) ? 16 : AFI->getCalleeSavedStackSize(); + return {ObjectOffset + FixedObject + FPAdjust, MVT::i8}; } -static int getStackOffset(const MachineFunction &MF, int ObjectOffset) { +static StackOffset getStackOffset(const MachineFunction &MF, int ObjectOffset) { const auto &MFI = MF.getFrameInfo(); - return ObjectOffset + MFI.getStackSize(); + return {ObjectOffset + (int)MFI.getStackSize(), MVT::i8}; } int AArch64FrameLowering::getSEHFrameIndexOffset(const MachineFunction &MF, @@ -1532,23 +1603,23 @@ int AArch64FrameLowering::getSEHFrameIndexOffset(const MachineFunction &MF, MF.getSubtarget().getRegisterInfo()); int ObjectOffset = MF.getFrameInfo().getObjectOffset(FI); return RegInfo->getLocalAddressRegister(MF) == AArch64::FP - ? getFPOffset(MF, ObjectOffset) - : getStackOffset(MF, ObjectOffset); + ? 
getFPOffset(MF, ObjectOffset).getBytes() + : getStackOffset(MF, ObjectOffset).getBytes(); } -int AArch64FrameLowering::resolveFrameIndexReference(const MachineFunction &MF, - int FI, unsigned &FrameReg, - bool PreferFP, - bool ForSimm) const { +StackOffset AArch64FrameLowering::resolveFrameIndexReference( + const MachineFunction &MF, int FI, unsigned &FrameReg, bool PreferFP, + bool ForSimm) const { const auto &MFI = MF.getFrameInfo(); int ObjectOffset = MFI.getObjectOffset(FI); bool isFixed = MFI.isFixedObjectIndex(FI); - return resolveFrameOffsetReference(MF, ObjectOffset, isFixed, FrameReg, + bool isSVE = MFI.getStackID(FI) == TargetStackID::SVEVector; + return resolveFrameOffsetReference(MF, ObjectOffset, isFixed, isSVE, FrameReg, PreferFP, ForSimm); } -int AArch64FrameLowering::resolveFrameOffsetReference( - const MachineFunction &MF, int ObjectOffset, bool isFixed, +StackOffset AArch64FrameLowering::resolveFrameOffsetReference( + const MachineFunction &MF, int ObjectOffset, bool isFixed, bool isSVE, unsigned &FrameReg, bool PreferFP, bool ForSimm) const { const auto &MFI = MF.getFrameInfo(); const auto *RegInfo = static_cast( @@ -1556,17 +1627,23 @@ int AArch64FrameLowering::resolveFrameOffsetReference( const auto *AFI = MF.getInfo(); const auto &Subtarget = MF.getSubtarget(); - int FPOffset = getFPOffset(MF, ObjectOffset); - int Offset = getStackOffset(MF, ObjectOffset); + int FPOffset = getFPOffset(MF, ObjectOffset).getBytes(); + int Offset = getStackOffset(MF, ObjectOffset).getBytes(); bool isCSR = !isFixed && ObjectOffset >= -((int)AFI->getCalleeSavedStackSize()); + const StackOffset &SVEStackSize = getSVEStackSize(MF); + // Use frame pointer to reference fixed objects. Use it for locals if // there are VLAs or a dynamically realigned SP (and thus the SP isn't // reliable as a base). Make sure useFPForScavengingIndex() does the // right thing for the emergency spill slot. bool UseFP = false; - if (AFI->hasStackFrame()) { + if (AFI->hasStackFrame() && !isSVE) { + // We shouldn't prefer using the FP when there is an SVE area + // in between the FP and the non-SVE locals/spills. + PreferFP &= !SVEStackSize; + // Note: Keeping the following as multiple 'if' statements rather than // merging to a single expression for readability. // @@ -1594,8 +1671,10 @@ int AArch64FrameLowering::resolveFrameOffsetReference( bool CanUseBP = RegInfo->hasBasePointer(MF); if (FPOffsetFits && CanUseBP) // Both are ok. Pick the best. UseFP = PreferFP; - else if (!CanUseBP) // Can't use BP. Forced to use FP. + else if (!CanUseBP) { // Can't use BP. Forced to use FP. + assert(!SVEStackSize && "Expected BP to be available"); UseFP = true; + } // else we can use BP and FP, but the offset from FP won't fit. // That will make us scavenge registers which we can probably avoid by // using BP. If it won't fit for BP either, we'll scavenge anyway. @@ -1625,9 +1704,36 @@ int AArch64FrameLowering::resolveFrameOffsetReference( "In the presence of dynamic stack pointer realignment, " "non-argument/CSR objects cannot be accessed through the frame pointer"); + if (isSVE) { + int64_t OffsetToSVEArea = + MFI.getStackSize() - AFI->getCalleeSavedStackSize(); + StackOffset FPOffset = {ObjectOffset, MVT::nxv1i8}; + StackOffset SPOffset = SVEStackSize + + StackOffset(ObjectOffset, MVT::nxv1i8) + + StackOffset(OffsetToSVEArea, MVT::i8); + // Always use the FP for SVE spills if available and beneficial. 
+ if (hasFP(MF) && + (SPOffset.getBytes() || + FPOffset.getScalableBytes() < SPOffset.getScalableBytes() || + RegInfo->needsStackRealignment(MF))) { + FrameReg = RegInfo->getFrameRegister(MF); + return FPOffset; + } + + FrameReg = RegInfo->hasBasePointer(MF) ? RegInfo->getBaseRegister() + : (unsigned)AArch64::SP; + return SPOffset; + } + + StackOffset ScalableOffset = {}; + if (UseFP && !(isFixed || isCSR)) + ScalableOffset = -SVEStackSize; + if (!UseFP && (isFixed || isCSR)) + ScalableOffset = SVEStackSize; + if (UseFP) { FrameReg = RegInfo->getFrameRegister(MF); - return FPOffset; + return StackOffset(FPOffset, MVT::i8) + ScalableOffset; } // Use the base pointer if we have one. @@ -1644,7 +1750,7 @@ int AArch64FrameLowering::resolveFrameOffsetReference( Offset -= AFI->getLocalStackSize(); } - return Offset; + return StackOffset(Offset, MVT::i8) + ScalableOffset; } static unsigned getPrologueDeath(MachineFunction &MF, unsigned Reg) { @@ -1682,6 +1788,23 @@ static bool invalidateWindowsRegisterPairing(unsigned Reg1, unsigned Reg2, return true; } +/// Returns true if Reg1 and Reg2 cannot be paired using a ldp/stp instruction. +/// WindowsCFI requires that only consecutive registers can be paired. +/// LR and FP need to be allocated together when the frame needs to save +/// the frame-record. This means any other register pairing with LR is invalid. +static bool invalidateRegisterPairing(unsigned Reg1, unsigned Reg2, + bool NeedsWinCFI, bool NeedsFrameRecord) { + if (NeedsWinCFI) + return invalidateWindowsRegisterPairing(Reg1, Reg2, true); + + // If we need to store the frame record, don't pair any register + // with LR other than FP. + if (NeedsFrameRecord) + return Reg2 == AArch64::LR; + + return false; +} + namespace { struct RegPairInfo { @@ -1701,7 +1824,7 @@ struct RegPairInfo { static void computeCalleeSaveRegisterPairs( MachineFunction &MF, const std::vector &CSI, const TargetRegisterInfo *TRI, SmallVectorImpl &RegPairs, - bool &NeedShadowCallStackProlog) { + bool &NeedShadowCallStackProlog, bool NeedsFrameRecord) { if (CSI.empty()) return; @@ -1743,7 +1866,8 @@ static void computeCalleeSaveRegisterPairs( switch (RPI.Type) { case RegPairInfo::GPR: if (AArch64::GPR64RegClass.contains(NextReg) && - !invalidateWindowsRegisterPairing(RPI.Reg1, NextReg, NeedsWinCFI)) + !invalidateRegisterPairing(RPI.Reg1, NextReg, NeedsWinCFI, + NeedsFrameRecord)) RPI.Reg2 = NextReg; break; case RegPairInfo::FPR64: @@ -1777,6 +1901,10 @@ static void computeCalleeSaveRegisterPairs( (CSI[i].getFrameIdx() + 1 == CSI[i + 1].getFrameIdx())) && "Out of order callee saved regs!"); + assert((!RPI.isPaired() || !NeedsFrameRecord || RPI.Reg2 != AArch64::FP || + RPI.Reg1 == AArch64::LR) && + "FrameRecord must be allocated together with LR"); + // MachO's compact unwind format relies on all registers being stored in // adjacent register pairs. 
assert((!produceCompactUnwindFrame(MF) || @@ -1825,7 +1953,7 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters( bool NeedShadowCallStackProlog = false; computeCalleeSaveRegisterPairs(MF, CSI, TRI, RegPairs, - NeedShadowCallStackProlog); + NeedShadowCallStackProlog, hasFP(MF)); const MachineRegisterInfo &MRI = MF.getRegInfo(); if (NeedShadowCallStackProlog) { @@ -1955,7 +2083,7 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters( bool NeedShadowCallStackProlog = false; computeCalleeSaveRegisterPairs(MF, CSI, TRI, RegPairs, - NeedShadowCallStackProlog); + NeedShadowCallStackProlog, hasFP(MF)); auto EmitMI = [&](const RegPairInfo &RPI) { unsigned Reg1 = RPI.Reg1; @@ -2113,19 +2241,26 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF, SavedRegs.set(AArch64::LR); } - LLVM_DEBUG(dbgs() << "*** determineCalleeSaves\nUsed CSRs:"; + LLVM_DEBUG(dbgs() << "*** determineCalleeSaves\nSaved CSRs:"; for (unsigned Reg : SavedRegs.set_bits()) dbgs() << ' ' << printReg(Reg, RegInfo); dbgs() << "\n";); // If any callee-saved registers are used, the frame cannot be eliminated. - bool CanEliminateFrame = SavedRegs.count() == 0; + unsigned MaxAlign = getStackAlignment(); + int64_t SVEStackSize = + alignTo(determineSVEStackSize(MFI, MaxAlign), MaxAlign); + assert(MaxAlign <= 16 && "Cannot align scalable vectors more than 16 bytes"); + bool CanEliminateFrame = (SavedRegs.count() == 0) && !SVEStackSize; // The CSR spill slots have not been allocated yet, so estimateStackSize // won't include them. unsigned EstimatedStackSizeLimit = estimateRSStackSizeLimit(MF); - bool BigStack = (EstimatedStackSize + CSStackSize) > EstimatedStackSizeLimit; + + // Conservatively always assume BigStack when there are SVE spills. + bool BigStack = SVEStackSize || + (EstimatedStackSize + CSStackSize) > EstimatedStackSizeLimit; if (BigStack || !CanEliminateFrame || RegInfo->cannotEliminateFrame(MF)) AFI->setHasStackFrame(true); @@ -2145,7 +2280,7 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF, // store the pair. if (produceCompactUnwindFrame(MF)) SavedRegs.set(UnspilledCSGPRPaired); - ExtraCSSpill = UnspilledCSGPRPaired; + ExtraCSSpill = UnspilledCSGPR; } // If we didn't find an extra callee-saved register to spill, create @@ -2181,14 +2316,42 @@ bool AArch64FrameLowering::enableStackSlotScavenging( return AFI->hasCalleeSaveStackFreeSpace(); } +int64_t AArch64FrameLowering::determineSVEStackSize(MachineFrameInfo &MFI, + unsigned &MaxAlign) const { + // Process all fixed stack objects. + int64_t Offset = 0; + for (int I = MFI.getObjectIndexBegin(); I != 0; ++I) + if (MFI.getStackID(I) == TargetStackID::SVEVector) { + int64_t FixedOffset = -MFI.getObjectOffset(I); + if (FixedOffset > Offset) + Offset = FixedOffset; + } + + // Note: We don't take allocatable stack objects into + // account yet, because allocation for those is not yet + // implemented. 
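// Illustrative sketch (not part of the patch): the rounding the callers apply
// to the raw SVE offset returned below, via alignTo(SVEStackSize, MaxAlign).
// For the power-of-two alignments used here (at most 16 bytes) it is the
// usual round-up-to-a-multiple computation.
#include <cstdint>
static uint64_t alignToSketch(uint64_t Value, uint64_t Alignment) {
  // Assumes Alignment is a power of two, e.g. alignToSketch(20, 16) == 32.
  return (Value + Alignment - 1) & ~(Alignment - 1);
}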
+ return Offset; +} + void AArch64FrameLowering::processFunctionBeforeFrameFinalized( MachineFunction &MF, RegScavenger *RS) const { + MachineFrameInfo &MFI = MF.getFrameInfo(); + + assert(getStackGrowthDirection() == TargetFrameLowering::StackGrowsDown && + "Upwards growing stack unsupported"); + + unsigned MaxAlign = getStackAlignment(); + int64_t SVEStackSize = determineSVEStackSize(MFI, MaxAlign); + + AArch64FunctionInfo *AFI = MF.getInfo(); + AFI->setStackSizeSVE(alignTo(SVEStackSize, MaxAlign)); + assert(MaxAlign <= 16 && "Cannot align scalable vectors more than 16 bytes"); + // If this function isn't doing Win64-style C++ EH, we don't need to do // anything. if (!MF.hasEHFunclets()) return; const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); - MachineFrameInfo &MFI = MF.getFrameInfo(); WinEHFuncInfo &EHInfo = *MF.getWinEHFuncInfo(); MachineBasicBlock &MBB = MF.front(); diff --git a/lib/Target/AArch64/AArch64FrameLowering.h b/lib/Target/AArch64/AArch64FrameLowering.h index 6dbd34b2189f..ac150e86c9eb 100644 --- a/lib/Target/AArch64/AArch64FrameLowering.h +++ b/lib/Target/AArch64/AArch64FrameLowering.h @@ -13,6 +13,7 @@ #ifndef LLVM_LIB_TARGET_AARCH64_AARCH64FRAMELOWERING_H #define LLVM_LIB_TARGET_AARCH64_AARCH64FRAMELOWERING_H +#include "AArch64StackOffset.h" #include "llvm/CodeGen/TargetFrameLowering.h" namespace llvm { @@ -20,7 +21,7 @@ namespace llvm { class AArch64FrameLowering : public TargetFrameLowering { public: explicit AArch64FrameLowering() - : TargetFrameLowering(StackGrowsDown, 16, 0, 16, + : TargetFrameLowering(StackGrowsDown, Align(16), 0, Align(16), true /*StackRealignable*/) {} void emitCalleeSavedFrameMoves(MachineBasicBlock &MBB, @@ -39,12 +40,13 @@ public: int getFrameIndexReference(const MachineFunction &MF, int FI, unsigned &FrameReg) const override; - int resolveFrameIndexReference(const MachineFunction &MF, int FI, - unsigned &FrameReg, bool PreferFP, - bool ForSimm) const; - int resolveFrameOffsetReference(const MachineFunction &MF, int ObjectOffset, - bool isFixed, unsigned &FrameReg, - bool PreferFP, bool ForSimm) const; + StackOffset resolveFrameIndexReference(const MachineFunction &MF, int FI, + unsigned &FrameReg, bool PreferFP, + bool ForSimm) const; + StackOffset resolveFrameOffsetReference(const MachineFunction &MF, + int ObjectOffset, bool isFixed, + bool isSVE, unsigned &FrameReg, + bool PreferFP, bool ForSimm) const; bool spillCalleeSavedRegisters(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const std::vector &CSI, @@ -85,9 +87,21 @@ public: int FI) const override; int getSEHFrameIndexOffset(const MachineFunction &MF, int FI) const; + bool isSupportedStackID(TargetStackID::Value ID) const override { + switch (ID) { + default: + return false; + case TargetStackID::Default: + case TargetStackID::SVEVector: + case TargetStackID::NoAlloc: + return true; + } + } + private: bool shouldCombineCSRLocalStackBump(MachineFunction &MF, unsigned StackBumpBytes) const; + int64_t determineSVEStackSize(MachineFrameInfo &MF, unsigned &MaxAlign) const; }; } // End llvm namespace diff --git a/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp index cd7e927ac80c..1f08505f37e7 100644 --- a/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp +++ b/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp @@ -2053,7 +2053,7 @@ static void getUsefulBitsForUse(SDNode *UserNode, APInt &UsefulBits, } static void getUsefulBits(SDValue Op, APInt &UsefulBits, unsigned Depth) { - if (Depth >= 6) + if (Depth >= SelectionDAG::MaxRecursionDepth) 
return; // Initialize UsefulBits if (!Depth) { @@ -2913,49 +2913,6 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { return; break; - case ISD::EXTRACT_VECTOR_ELT: { - // Extracting lane zero is a special case where we can just use a plain - // EXTRACT_SUBREG instruction, which will become FMOV. This is easier for - // the rest of the compiler, especially the register allocator and copyi - // propagation, to reason about, so is preferred when it's possible to - // use it. - ConstantSDNode *LaneNode = cast(Node->getOperand(1)); - // Bail and use the default Select() for non-zero lanes. - if (LaneNode->getZExtValue() != 0) - break; - // If the element type is not the same as the result type, likewise - // bail and use the default Select(), as there's more to do than just - // a cross-class COPY. This catches extracts of i8 and i16 elements - // since they will need an explicit zext. - if (VT != Node->getOperand(0).getValueType().getVectorElementType()) - break; - unsigned SubReg; - switch (Node->getOperand(0) - .getValueType() - .getVectorElementType() - .getSizeInBits()) { - default: - llvm_unreachable("Unexpected vector element type!"); - case 64: - SubReg = AArch64::dsub; - break; - case 32: - SubReg = AArch64::ssub; - break; - case 16: - SubReg = AArch64::hsub; - break; - case 8: - llvm_unreachable("unexpected zext-requiring extract element!"); - } - SDValue Extract = CurDAG->getTargetExtractSubreg(SubReg, SDLoc(Node), VT, - Node->getOperand(0)); - LLVM_DEBUG(dbgs() << "ISEL: Custom selection!\n=> "); - LLVM_DEBUG(Extract->dumpr(CurDAG)); - LLVM_DEBUG(dbgs() << "\n"); - ReplaceNode(Node, Extract.getNode()); - return; - } case ISD::Constant: { // Materialize zero constants as copies from WZR/XZR. This allows // the coalescer to propagate these into other instructions. 
diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp index 7becc99fb5c7..2746117e8ee5 100644 --- a/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -23,6 +23,7 @@ #include "llvm/ADT/APInt.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringRef.h" @@ -161,6 +162,29 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, addQRTypeForNEON(MVT::v8f16); } + if (Subtarget->hasSVE()) { + // Add legal sve predicate types + addRegisterClass(MVT::nxv2i1, &AArch64::PPRRegClass); + addRegisterClass(MVT::nxv4i1, &AArch64::PPRRegClass); + addRegisterClass(MVT::nxv8i1, &AArch64::PPRRegClass); + addRegisterClass(MVT::nxv16i1, &AArch64::PPRRegClass); + + // Add legal sve data types + addRegisterClass(MVT::nxv16i8, &AArch64::ZPRRegClass); + addRegisterClass(MVT::nxv8i16, &AArch64::ZPRRegClass); + addRegisterClass(MVT::nxv4i32, &AArch64::ZPRRegClass); + addRegisterClass(MVT::nxv2i64, &AArch64::ZPRRegClass); + + addRegisterClass(MVT::nxv2f16, &AArch64::ZPRRegClass); + addRegisterClass(MVT::nxv4f16, &AArch64::ZPRRegClass); + addRegisterClass(MVT::nxv8f16, &AArch64::ZPRRegClass); + addRegisterClass(MVT::nxv1f32, &AArch64::ZPRRegClass); + addRegisterClass(MVT::nxv2f32, &AArch64::ZPRRegClass); + addRegisterClass(MVT::nxv4f32, &AArch64::ZPRRegClass); + addRegisterClass(MVT::nxv1f64, &AArch64::ZPRRegClass); + addRegisterClass(MVT::nxv2f64, &AArch64::ZPRRegClass); + } + // Compute derived properties from the register classes computeRegisterProperties(Subtarget->getRegisterInfo()); @@ -283,7 +307,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, // AArch64 lacks both left-rotate and popcount instructions. setOperationAction(ISD::ROTL, MVT::i32, Expand); setOperationAction(ISD::ROTL, MVT::i64, Expand); - for (MVT VT : MVT::vector_valuetypes()) { + for (MVT VT : MVT::fixedlen_vector_valuetypes()) { setOperationAction(ISD::ROTL, VT, Expand); setOperationAction(ISD::ROTR, VT, Expand); } @@ -297,7 +321,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::SDIVREM, MVT::i32, Expand); setOperationAction(ISD::SDIVREM, MVT::i64, Expand); - for (MVT VT : MVT::vector_valuetypes()) { + for (MVT VT : MVT::fixedlen_vector_valuetypes()) { setOperationAction(ISD::SDIVREM, VT, Expand); setOperationAction(ISD::UDIVREM, VT, Expand); } @@ -606,6 +630,10 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, MaxStoresPerMemmoveOptSize = MaxStoresPerMemmove = 4; + MaxLoadsPerMemcmpOptSize = 4; + MaxLoadsPerMemcmp = Subtarget->requiresStrictAlign() + ? MaxLoadsPerMemcmpOptSize : 8; + setStackPointerRegisterToSaveRestore(AArch64::SP); setSchedulingPreference(Sched::Hybrid); @@ -613,10 +641,10 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, EnableExtLdPromotion = true; // Set required alignment. - setMinFunctionAlignment(2); + setMinFunctionAlignment(Align(4)); // Set preferred alignments. - setPrefFunctionAlignment(STI.getPrefFunctionAlignment()); - setPrefLoopAlignment(STI.getPrefLoopAlignment()); + setPrefLoopAlignment(Align(1ULL << STI.getPrefLoopLogAlignment())); + setPrefFunctionAlignment(Align(1ULL << STI.getPrefFunctionLogAlignment())); // Only change the limit for entries in a jump table if specified by // the sub target, but not at the command line. 
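A note on the alignment hunk above: the setters now take explicit Align values while the subtarget reports log2 alignments. A minimal standalone sketch of that conversion, with a hypothetical helper name (not the LLVM API):

    #include <cstdint>

    // Convert a log2 alignment (as returned by getPrefLoopLogAlignment above)
    // into a byte alignment, mirroring Align(1ULL << Log2). The old
    // setMinFunctionAlignment(2) encoded log2 = 2, i.e. the 4 bytes now
    // written explicitly as Align(4).
    uint64_t alignFromLog2(unsigned Log2) { return 1ULL << Log2; }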
@@ -725,7 +753,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand); // Likewise, narrowing and extending vector loads/stores aren't handled // directly. - for (MVT VT : MVT::vector_valuetypes()) { + for (MVT VT : MVT::fixedlen_vector_valuetypes()) { setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand); if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32) { @@ -741,7 +769,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::BSWAP, VT, Expand); setOperationAction(ISD::CTTZ, VT, Expand); - for (MVT InnerVT : MVT::vector_valuetypes()) { + for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) { setTruncStoreAction(VT, InnerVT, Expand); setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand); setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand); @@ -773,6 +801,13 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setTruncStoreAction(MVT::v4i16, MVT::v4i8, Custom); } + if (Subtarget->hasSVE()) { + for (MVT VT : MVT::integer_scalable_vector_valuetypes()) { + if (isTypeLegal(VT) && VT.getVectorElementType() != MVT::i1) + setOperationAction(ISD::SPLAT_VECTOR, VT, Custom); + } + } + PredictableSelectIsExpensive = Subtarget->predictableSelectIsExpensive(); } @@ -1025,6 +1060,14 @@ void AArch64TargetLowering::computeKnownBitsForTargetNode( Known.One &= Known2.One; break; } + case AArch64ISD::LOADgot: + case AArch64ISD::ADDlow: { + if (!Subtarget->isTargetILP32()) + break; + // In ILP32 mode all valid pointers are in the low 4GB of the address-space. + Known.Zero = APInt::getHighBitsSet(64, 32); + break; + } case ISD::INTRINSIC_W_CHAIN: { ConstantSDNode *CN = cast(Op->getOperand(1)); Intrinsic::ID IntID = static_cast(CN->getZExtValue()); @@ -1100,6 +1143,32 @@ bool AArch64TargetLowering::allowsMisalignedMemoryAccesses( return true; } +// Same as above but handling LLTs instead. +bool AArch64TargetLowering::allowsMisalignedMemoryAccesses( + LLT Ty, unsigned AddrSpace, unsigned Align, MachineMemOperand::Flags Flags, + bool *Fast) const { + if (Subtarget->requiresStrictAlign()) + return false; + + if (Fast) { + // Some CPUs are fine with unaligned stores except for 128-bit ones. + *Fast = !Subtarget->isMisaligned128StoreSlow() || + Ty.getSizeInBytes() != 16 || + // See comments in performSTORECombine() for more details about + // these conditions. + + // Code that uses clang vector extensions can mark that it + // wants unaligned accesses to be treated as fast by + // underspecifying alignment to be 1 or 2. + Align <= 2 || + + // Disregard v2i64. Memcpy lowering produces those and splitting + // them regresses performance on micro-benchmarks and olden/bh. 
+ Ty == LLT::vector(2, 64); + } + return true; +} + FastISel * AArch64TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo) const { @@ -1238,6 +1307,10 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const { case AArch64ISD::STZG: return "AArch64ISD::STZG"; case AArch64ISD::ST2G: return "AArch64ISD::ST2G"; case AArch64ISD::STZ2G: return "AArch64ISD::STZ2G"; + case AArch64ISD::SUNPKHI: return "AArch64ISD::SUNPKHI"; + case AArch64ISD::SUNPKLO: return "AArch64ISD::SUNPKLO"; + case AArch64ISD::UUNPKHI: return "AArch64ISD::UUNPKHI"; + case AArch64ISD::UUNPKLO: return "AArch64ISD::UUNPKLO"; } return nullptr; } @@ -1263,9 +1336,9 @@ AArch64TargetLowering::EmitF128CSEL(MachineInstr &MI, DebugLoc DL = MI.getDebugLoc(); MachineFunction::iterator It = ++MBB->getIterator(); - unsigned DestReg = MI.getOperand(0).getReg(); - unsigned IfTrueReg = MI.getOperand(1).getReg(); - unsigned IfFalseReg = MI.getOperand(2).getReg(); + Register DestReg = MI.getOperand(0).getReg(); + Register IfTrueReg = MI.getOperand(1).getReg(); + Register IfFalseReg = MI.getOperand(2).getReg(); unsigned CondCode = MI.getOperand(3).getImm(); bool NZCVKilled = MI.getOperand(4).isKill(); @@ -2140,7 +2213,8 @@ getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG) { SDValue AArch64TargetLowering::LowerF128Call(SDValue Op, SelectionDAG &DAG, RTLIB::Libcall Call) const { SmallVector Ops(Op->op_begin(), Op->op_end()); - return makeLibCall(DAG, Call, MVT::f128, Ops, false, SDLoc(Op)).first; + MakeLibCallOptions CallOptions; + return makeLibCall(DAG, Call, MVT::f128, Ops, CallOptions, SDLoc(Op)).first; } // Returns true if the given Op is the overflow flag result of an overflow @@ -2349,7 +2423,8 @@ SDValue AArch64TargetLowering::LowerFP_ROUND(SDValue Op, // precise. That doesn't take part in the LibCall so we can't directly use // LowerF128Call. 
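The LLT overload of allowsMisalignedMemoryAccesses above reports most unaligned accesses as fast. A standalone sketch of that predicate, with plain bools standing in for the Subtarget and LLT queries (assumed parameter names):

    // Unaligned accesses count as fast unless this is a 16-byte access on a
    // core where misaligned 128-bit stores are slow, with the two exceptions
    // called out in the comments above (a requested alignment of 1 or 2, and
    // the v2i64 case produced by memcpy lowering).
    bool isFastUnaligned(bool Misaligned128StoreSlow, unsigned SizeInBytes,
                         unsigned RequestedAlign, bool IsV2i64) {
      return !Misaligned128StoreSlow || SizeInBytes != 16 ||
             RequestedAlign <= 2 || IsV2i64;
    }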
SDValue SrcVal = Op.getOperand(0); - return makeLibCall(DAG, LC, Op.getValueType(), SrcVal, /*isSigned*/ false, + MakeLibCallOptions CallOptions; + return makeLibCall(DAG, LC, Op.getValueType(), SrcVal, CallOptions, SDLoc(Op)).first; } @@ -2419,7 +2494,8 @@ SDValue AArch64TargetLowering::LowerFP_TO_INT(SDValue Op, LC = RTLIB::getFPTOUINT(Op.getOperand(0).getValueType(), Op.getValueType()); SmallVector Ops(Op->op_begin(), Op->op_end()); - return makeLibCall(DAG, LC, Op.getValueType(), Ops, false, SDLoc(Op)).first; + MakeLibCallOptions CallOptions; + return makeLibCall(DAG, LC, Op.getValueType(), Ops, CallOptions, SDLoc(Op)).first; } static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) { @@ -2773,6 +2849,19 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, return DAG.getNode(ISD::UMIN, dl, Op.getValueType(), Op.getOperand(1), Op.getOperand(2)); + case Intrinsic::aarch64_sve_sunpkhi: + return DAG.getNode(AArch64ISD::SUNPKHI, dl, Op.getValueType(), + Op.getOperand(1)); + case Intrinsic::aarch64_sve_sunpklo: + return DAG.getNode(AArch64ISD::SUNPKLO, dl, Op.getValueType(), + Op.getOperand(1)); + case Intrinsic::aarch64_sve_uunpkhi: + return DAG.getNode(AArch64ISD::UUNPKHI, dl, Op.getValueType(), + Op.getOperand(1)); + case Intrinsic::aarch64_sve_uunpklo: + return DAG.getNode(AArch64ISD::UUNPKLO, dl, Op.getValueType(), + Op.getOperand(1)); + case Intrinsic::localaddress: { const auto &MF = DAG.getMachineFunction(); const auto *RegInfo = Subtarget->getRegisterInfo(); @@ -2937,6 +3026,8 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op, return LowerBUILD_VECTOR(Op, DAG); case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG); + case ISD::SPLAT_VECTOR: + return LowerSPLAT_VECTOR(Op, DAG); case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG); case ISD::SRA: @@ -3014,8 +3105,11 @@ CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC, return CC_AArch64_Win64_VarArg; if (!Subtarget->isTargetDarwin()) return CC_AArch64_AAPCS; - return IsVarArg ? CC_AArch64_DarwinPCS_VarArg : CC_AArch64_DarwinPCS; - case CallingConv::Win64: + if (!IsVarArg) + return CC_AArch64_DarwinPCS; + return Subtarget->isTargetILP32() ? CC_AArch64_DarwinPCS_ILP32_VarArg + : CC_AArch64_DarwinPCS_VarArg; + case CallingConv::Win64: return IsVarArg ? CC_AArch64_Win64_VarArg : CC_AArch64_AAPCS; case CallingConv::AArch64_VectorCall: return CC_AArch64_AAPCS; @@ -3038,6 +3132,7 @@ SDValue AArch64TargetLowering::LowerFormalArguments( // Assign locations to all of the incoming arguments. SmallVector ArgLocs; + DenseMap CopiedRegs; CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs, *DAG.getContext()); @@ -3094,11 +3189,10 @@ SDValue AArch64TargetLowering::LowerFormalArguments( continue; } + SDValue ArgValue; if (VA.isRegLoc()) { // Arguments stored in registers. 
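The CCAssignFnForCall hunk above gives Darwin ILP32 varargs calls their own assignment function. A trivial sketch of that selection, using a hypothetical enum rather than the real CC_AArch64_* tables:

    enum class DarwinCC { DarwinPCS, DarwinPCS_VarArg, DarwinPCS_ILP32_VarArg };

    // Mirrors the new branch: fixed-argument calls keep DarwinPCS, varargs
    // calls pick the ILP32 variant when the target is ILP32.
    DarwinCC darwinCCForCall(bool IsVarArg, bool IsILP32) {
      if (!IsVarArg)
        return DarwinCC::DarwinPCS;
      return IsILP32 ? DarwinCC::DarwinPCS_ILP32_VarArg
                     : DarwinCC::DarwinPCS_VarArg;
    }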
EVT RegVT = VA.getLocVT(); - - SDValue ArgValue; const TargetRegisterClass *RC; if (RegVT == MVT::i32) @@ -3113,6 +3207,11 @@ SDValue AArch64TargetLowering::LowerFormalArguments( RC = &AArch64::FPR64RegClass; else if (RegVT == MVT::f128 || RegVT.is128BitVector()) RC = &AArch64::FPR128RegClass; + else if (RegVT.isScalableVector() && + RegVT.getVectorElementType() == MVT::i1) + RC = &AArch64::PPRRegClass; + else if (RegVT.isScalableVector()) + RC = &AArch64::ZPRRegClass; else llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering"); @@ -3128,20 +3227,23 @@ SDValue AArch64TargetLowering::LowerFormalArguments( llvm_unreachable("Unknown loc info!"); case CCValAssign::Full: break; + case CCValAssign::Indirect: + assert(VA.getValVT().isScalableVector() && + "Only scalable vectors can be passed indirectly"); + llvm_unreachable("Spilling of SVE vectors not yet implemented"); case CCValAssign::BCvt: ArgValue = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), ArgValue); break; case CCValAssign::AExt: case CCValAssign::SExt: case CCValAssign::ZExt: - // SelectionDAGBuilder will insert appropriate AssertZExt & AssertSExt - // nodes after our lowering. - assert(RegVT == Ins[i].VT && "incorrect register location selected"); + break; + case CCValAssign::AExtUpper: + ArgValue = DAG.getNode(ISD::SRL, DL, RegVT, ArgValue, + DAG.getConstant(32, DL, RegVT)); + ArgValue = DAG.getZExtOrTrunc(ArgValue, DL, VA.getValVT()); break; } - - InVals.push_back(ArgValue); - } else { // VA.isRegLoc() assert(VA.isMemLoc() && "CCValAssign is neither reg nor mem"); unsigned ArgOffset = VA.getLocMemOffset(); @@ -3156,7 +3258,6 @@ SDValue AArch64TargetLowering::LowerFormalArguments( // Create load nodes to retrieve arguments from the stack. SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout())); - SDValue ArgValue; // For NON_EXTLOAD, generic code in getLoad assert(ValVT == MemVT) ISD::LoadExtType ExtType = ISD::NON_EXTLOAD; @@ -3165,9 +3266,14 @@ SDValue AArch64TargetLowering::LowerFormalArguments( switch (VA.getLocInfo()) { default: break; + case CCValAssign::Trunc: case CCValAssign::BCvt: MemVT = VA.getLocVT(); break; + case CCValAssign::Indirect: + assert(VA.getValVT().isScalableVector() && + "Only scalable vectors can be passed indirectly"); + llvm_unreachable("Spilling of SVE vectors not yet implemented"); case CCValAssign::SExt: ExtType = ISD::SEXTLOAD; break; @@ -3184,8 +3290,11 @@ SDValue AArch64TargetLowering::LowerFormalArguments( MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), MemVT); - InVals.push_back(ArgValue); } + if (Subtarget->isTargetILP32() && Ins[i].Flags.isPointer()) + ArgValue = DAG.getNode(ISD::AssertZext, DL, ArgValue.getValueType(), + ArgValue, DAG.getValueType(MVT::i32)); + InVals.push_back(ArgValue); } // varargs @@ -3202,8 +3311,8 @@ SDValue AArch64TargetLowering::LowerFormalArguments( // This will point to the next argument passed via stack. unsigned StackOffset = CCInfo.getNextStackOffset(); - // We currently pass all varargs at 8-byte alignment. - StackOffset = ((StackOffset + 7) & ~7); + // We currently pass all varargs at 8-byte alignment, or 4 for ILP32 + StackOffset = alignTo(StackOffset, Subtarget->isTargetILP32() ? 
4 : 8); FuncInfo->setVarArgsStackIndex(MFI.CreateFixedObject(4, StackOffset, true)); if (MFI.hasMustTailInVarArgFunc()) { @@ -3233,8 +3342,8 @@ SDValue AArch64TargetLowering::LowerFormalArguments( assert(!FuncInfo->getSRetReturnReg()); MVT PtrTy = getPointerTy(DAG.getDataLayout()); - unsigned Reg = - MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy)); + Register Reg = + MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy)); FuncInfo->setSRetReturnReg(Reg); SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), DL, Reg, InVals[I]); @@ -3366,6 +3475,7 @@ SDValue AArch64TargetLowering::LowerCallResult( : RetCC_AArch64_AAPCS; // Assign locations to each value returned by this call. SmallVector RVLocs; + DenseMap CopiedRegs; CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs, *DAG.getContext()); CCInfo.AnalyzeCallResult(Ins, RetCC); @@ -3383,10 +3493,16 @@ SDValue AArch64TargetLowering::LowerCallResult( continue; } - SDValue Val = - DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InFlag); - Chain = Val.getValue(1); - InFlag = Val.getValue(2); + // Avoid copying a physreg twice since RegAllocFast is incompetent and only + // allows one use of a physreg per block. + SDValue Val = CopiedRegs.lookup(VA.getLocReg()); + if (!Val) { + Val = + DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InFlag); + Chain = Val.getValue(1); + InFlag = Val.getValue(2); + CopiedRegs[VA.getLocReg()] = Val; + } switch (VA.getLocInfo()) { default: @@ -3396,6 +3512,15 @@ SDValue AArch64TargetLowering::LowerCallResult( case CCValAssign::BCvt: Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val); break; + case CCValAssign::AExtUpper: + Val = DAG.getNode(ISD::SRL, DL, VA.getLocVT(), Val, + DAG.getConstant(32, DL, VA.getLocVT())); + LLVM_FALLTHROUGH; + case CCValAssign::AExt: + LLVM_FALLTHROUGH; + case CCValAssign::ZExt: + Val = DAG.getZExtOrTrunc(Val, DL, VA.getValVT()); + break; } InVals.push_back(Val); @@ -3593,6 +3718,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, bool IsVarArg = CLI.IsVarArg; MachineFunction &MF = DAG.getMachineFunction(); + MachineFunction::CallSiteInfo CSInfo; bool IsThisReturn = false; AArch64FunctionInfo *FuncInfo = MF.getInfo(); @@ -3709,6 +3835,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, getPointerTy(DAG.getDataLayout())); SmallVector, 8> RegsToPass; + SmallSet RegsUsed; SmallVector MemOpChains; auto PtrVT = getPointerTy(DAG.getDataLayout()); @@ -3716,7 +3843,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, const auto &Forwards = FuncInfo->getForwardedMustTailRegParms(); for (const auto &F : Forwards) { SDValue Val = DAG.getCopyFromReg(Chain, DL, F.VReg, F.VT); - RegsToPass.push_back(std::make_pair(unsigned(F.PReg), Val)); + RegsToPass.emplace_back(F.PReg, Val); } } @@ -3747,12 +3874,25 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, } Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg); break; + case CCValAssign::AExtUpper: + assert(VA.getValVT() == MVT::i32 && "only expect 32 -> 64 upper bits"); + Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg); + Arg = DAG.getNode(ISD::SHL, DL, VA.getLocVT(), Arg, + DAG.getConstant(32, DL, VA.getLocVT())); + break; case CCValAssign::BCvt: - Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg); + Arg = DAG.getBitcast(VA.getLocVT(), Arg); + break; + case CCValAssign::Trunc: + Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT()); break; case CCValAssign::FPExt: Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg); break; + case 
CCValAssign::Indirect: + assert(VA.getValVT().isScalableVector() && + "Only scalable vectors can be passed indirectly"); + llvm_unreachable("Spilling of SVE vectors not yet implemented"); } if (VA.isRegLoc()) { @@ -3764,7 +3904,33 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, "unexpected use of 'returned'"); IsThisReturn = true; } - RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); + if (RegsUsed.count(VA.getLocReg())) { + // If this register has already been used then we're trying to pack + // parts of an [N x i32] into an X-register. The extension type will + // take care of putting the two halves in the right place but we have to + // combine them. + SDValue &Bits = + std::find_if(RegsToPass.begin(), RegsToPass.end(), + [=](const std::pair &Elt) { + return Elt.first == VA.getLocReg(); + }) + ->second; + Bits = DAG.getNode(ISD::OR, DL, Bits.getValueType(), Bits, Arg); + // Call site info is used for function's parameter entry value + // tracking. For now we track only simple cases when parameter + // is transferred through whole register. + CSInfo.erase(std::remove_if(CSInfo.begin(), CSInfo.end(), + [&VA](MachineFunction::ArgRegPair ArgReg) { + return ArgReg.Reg == VA.getLocReg(); + }), + CSInfo.end()); + } else { + RegsToPass.emplace_back(VA.getLocReg(), Arg); + RegsUsed.insert(VA.getLocReg()); + const TargetOptions &Options = DAG.getTarget().Options; + if (Options.EnableDebugEntryValues) + CSInfo.emplace_back(VA.getLocReg(), i); + } } else { assert(VA.isMemLoc()); @@ -3899,6 +4065,20 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, Ops.push_back(DAG.getRegister(RegToPass.first, RegToPass.second.getValueType())); + // Check callee args/returns for SVE registers and set calling convention + // accordingly. + if (CallConv == CallingConv::C) { + bool CalleeOutSVE = any_of(Outs, [](ISD::OutputArg &Out){ + return Out.VT.isScalableVector(); + }); + bool CalleeInSVE = any_of(Ins, [](ISD::InputArg &In){ + return In.VT.isScalableVector(); + }); + + if (CalleeInSVE || CalleeOutSVE) + CallConv = CallingConv::AArch64_SVE_VectorCall; + } + // Add a register mask operand representing the call-preserved registers. const uint32_t *Mask; const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo(); @@ -3930,12 +4110,15 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, // actual call instruction. if (IsTailCall) { MF.getFrameInfo().setHasTailCall(); - return DAG.getNode(AArch64ISD::TC_RETURN, DL, NodeTys, Ops); + SDValue Ret = DAG.getNode(AArch64ISD::TC_RETURN, DL, NodeTys, Ops); + DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo)); + return Ret; } // Returns a chain and a flag for retval copy to use. Chain = DAG.getNode(AArch64ISD::CALL, DL, NodeTys, Ops); InFlag = Chain.getValue(1); + DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo)); uint64_t CalleePopBytes = DoesCalleeRestoreStack(CallConv, TailCallOpt) ? alignTo(NumBytes, 16) : 0; @@ -3983,7 +4166,8 @@ AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, // Copy the result values into the output registers. 
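The AExtUpper handling above shifts the upper half of a split value into place and ORs it with the half already assigned to the same X register. A self-contained sketch of that bit packing:

    #include <cstdint>

    // Place Hi in bits [63:32] and Lo in bits [31:0], matching the SHL-by-32
    // followed by the OR performed in the lowering code above when packing
    // parts of an [N x i32] into one register.
    uint64_t packHalves(uint32_t Lo, uint32_t Hi) {
      return static_cast<uint64_t>(Lo) | (static_cast<uint64_t>(Hi) << 32);
    }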
SDValue Flag; - SmallVector RetOps(1, Chain); + SmallVector, 4> RetVals; + SmallSet RegsUsed; for (unsigned i = 0, realRVLocIdx = 0; i != RVLocs.size(); ++i, ++realRVLocIdx) { CCValAssign &VA = RVLocs[i]; @@ -4005,11 +4189,38 @@ AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, case CCValAssign::BCvt: Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg); break; + case CCValAssign::AExt: + case CCValAssign::ZExt: + Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT()); + break; + case CCValAssign::AExtUpper: + assert(VA.getValVT() == MVT::i32 && "only expect 32 -> 64 upper bits"); + Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT()); + Arg = DAG.getNode(ISD::SHL, DL, VA.getLocVT(), Arg, + DAG.getConstant(32, DL, VA.getLocVT())); + break; } - Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Flag); + if (RegsUsed.count(VA.getLocReg())) { + SDValue &Bits = + std::find_if(RetVals.begin(), RetVals.end(), + [=](const std::pair &Elt) { + return Elt.first == VA.getLocReg(); + }) + ->second; + Bits = DAG.getNode(ISD::OR, DL, Bits.getValueType(), Bits, Arg); + } else { + RetVals.emplace_back(VA.getLocReg(), Arg); + RegsUsed.insert(VA.getLocReg()); + } + } + + SmallVector RetOps(1, Chain); + for (auto &RetVal : RetVals) { + Chain = DAG.getCopyToReg(Chain, DL, RetVal.first, RetVal.second, Flag); Flag = Chain.getValue(1); - RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); + RetOps.push_back( + DAG.getRegister(RetVal.first, RetVal.second.getValueType())); } // Windows AArch64 ABIs require that for returning structs by value we copy @@ -4139,8 +4350,7 @@ SDValue AArch64TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const { GlobalAddressSDNode *GN = cast(Op); const GlobalValue *GV = GN->getGlobal(); - unsigned char OpFlags = - Subtarget->ClassifyGlobalReference(GV, getTargetMachine()); + unsigned OpFlags = Subtarget->ClassifyGlobalReference(GV, getTargetMachine()); if (OpFlags != AArch64II::MO_NO_FLAG) assert(cast(Op)->getOffset() == 0 && @@ -4204,6 +4414,7 @@ AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op, SDLoc DL(Op); MVT PtrVT = getPointerTy(DAG.getDataLayout()); + MVT PtrMemVT = getPointerMemTy(DAG.getDataLayout()); const GlobalValue *GV = cast(Op)->getGlobal(); SDValue TLVPAddr = @@ -4214,13 +4425,15 @@ AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op, // to obtain the address of the variable. SDValue Chain = DAG.getEntryNode(); SDValue FuncTLVGet = DAG.getLoad( - MVT::i64, DL, Chain, DescAddr, + PtrMemVT, DL, Chain, DescAddr, MachinePointerInfo::getGOT(DAG.getMachineFunction()), - /* Alignment = */ 8, - MachineMemOperand::MONonTemporal | MachineMemOperand::MOInvariant | - MachineMemOperand::MODereferenceable); + /* Alignment = */ PtrMemVT.getSizeInBits() / 8, + MachineMemOperand::MOInvariant | MachineMemOperand::MODereferenceable); Chain = FuncTLVGet.getValue(1); + // Extend loaded pointer if necessary (i.e. if ILP32) to DAG pointer. + FuncTLVGet = DAG.getZExtOrTrunc(FuncTLVGet, DL, PtrVT); + MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); MFI.setAdjustsStack(true); @@ -4470,7 +4683,7 @@ SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const { // value of a libcall against zero, which is just what the rest of LowerBR_CC // is expecting to deal with. 
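LowerReturn above now collects (register, value) pairs and ORs together any values assigned to a register that is already in use, emitting the CopyToReg nodes in a second loop. A standalone sketch of that bookkeeping, with plain integers standing in for SDValues:

    #include <algorithm>
    #include <cstdint>
    #include <set>
    #include <utility>
    #include <vector>

    // If Reg was seen before, fold the new bits into the existing entry (the
    // halves were already extended and shifted into disjoint bit positions);
    // otherwise record a fresh pair, as RetVals/RegsUsed do above.
    void addRetVal(std::vector<std::pair<unsigned, uint64_t>> &RetVals,
                   std::set<unsigned> &RegsUsed, unsigned Reg, uint64_t Bits) {
      if (RegsUsed.count(Reg)) {
        auto It = std::find_if(RetVals.begin(), RetVals.end(),
                               [Reg](const std::pair<unsigned, uint64_t> &E) {
                                 return E.first == Reg;
                               });
        It->second |= Bits;
      } else {
        RetVals.emplace_back(Reg, Bits);
        RegsUsed.insert(Reg);
      }
    }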
if (LHS.getValueType() == MVT::f128) { - softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl); + softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl, LHS, RHS); // If softenSetCCOperands returned a scalar, we need to compare the result // against zero to select between true and false values. @@ -4736,7 +4949,7 @@ SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { // Handle f128 first, since one possible outcome is a normal integer // comparison which gets picked up by the next if statement. if (LHS.getValueType() == MVT::f128) { - softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl); + softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl, LHS, RHS); // If softenSetCCOperands returned a scalar, use it. if (!RHS.getNode()) { @@ -4798,7 +5011,7 @@ SDValue AArch64TargetLowering::LowerSELECT_CC(ISD::CondCode CC, SDValue LHS, // Handle f128 first, because it will result in a comparison of some RTLIB // call result against zero. if (LHS.getValueType() == MVT::f128) { - softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl); + softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl, LHS, RHS); // If softenSetCCOperands returned a scalar, we need to compare the result // against zero to select between true and false values. @@ -5096,6 +5309,7 @@ SDValue AArch64TargetLowering::LowerDarwin_VASTART(SDValue Op, SDLoc DL(Op); SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(), getPointerTy(DAG.getDataLayout())); + FR = DAG.getZExtOrTrunc(FR, DL, getPointerMemTy(DAG.getDataLayout())); const Value *SV = cast(Op.getOperand(2))->getValue(); return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1), MachinePointerInfo(SV)); @@ -5202,15 +5416,15 @@ SDValue AArch64TargetLowering::LowerVACOPY(SDValue Op, // AAPCS has three pointers and two ints (= 32 bytes), Darwin has single // pointer. SDLoc DL(Op); - unsigned VaListSize = - Subtarget->isTargetDarwin() || Subtarget->isTargetWindows() ? 8 : 32; + unsigned PtrSize = Subtarget->isTargetILP32() ? 4 : 8; + unsigned VaListSize = (Subtarget->isTargetDarwin() || + Subtarget->isTargetWindows()) ? PtrSize : 32; const Value *DestSV = cast(Op.getOperand(3))->getValue(); const Value *SrcSV = cast(Op.getOperand(4))->getValue(); - return DAG.getMemcpy(Op.getOperand(0), DL, Op.getOperand(1), - Op.getOperand(2), - DAG.getConstant(VaListSize, DL, MVT::i32), - 8, false, false, false, MachinePointerInfo(DestSV), + return DAG.getMemcpy(Op.getOperand(0), DL, Op.getOperand(1), Op.getOperand(2), + DAG.getConstant(VaListSize, DL, MVT::i32), PtrSize, + false, false, false, MachinePointerInfo(DestSV), MachinePointerInfo(SrcSV)); } @@ -5224,12 +5438,15 @@ SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { SDValue Chain = Op.getOperand(0); SDValue Addr = Op.getOperand(1); unsigned Align = Op.getConstantOperandVal(3); + unsigned MinSlotSize = Subtarget->isTargetILP32() ? 
4 : 8; auto PtrVT = getPointerTy(DAG.getDataLayout()); - - SDValue VAList = DAG.getLoad(PtrVT, DL, Chain, Addr, MachinePointerInfo(V)); + auto PtrMemVT = getPointerMemTy(DAG.getDataLayout()); + SDValue VAList = + DAG.getLoad(PtrMemVT, DL, Chain, Addr, MachinePointerInfo(V)); Chain = VAList.getValue(1); + VAList = DAG.getZExtOrTrunc(VAList, DL, PtrVT); - if (Align > 8) { + if (Align > MinSlotSize) { assert(((Align & (Align - 1)) == 0) && "Expected Align to be a power of 2"); VAList = DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getConstant(Align - 1, DL, PtrVT)); @@ -5238,14 +5455,14 @@ SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { } Type *ArgTy = VT.getTypeForEVT(*DAG.getContext()); - uint64_t ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy); + unsigned ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy); // Scalar integer and FP values smaller than 64 bits are implicitly extended // up to 64 bits. At the very least, we have to increase the striding of the // vaargs list to match this, and for FP values we need to introduce // FP_ROUND nodes as well. if (VT.isInteger() && !VT.isVector()) - ArgSize = 8; + ArgSize = std::max(ArgSize, MinSlotSize); bool NeedFPTrunc = false; if (VT.isFloatingPoint() && !VT.isVector() && VT != MVT::f64) { ArgSize = 8; @@ -5255,6 +5472,8 @@ SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { // Increment the pointer, VAList, to the next vaarg SDValue VANext = DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getConstant(ArgSize, DL, PtrVT)); + VANext = DAG.getZExtOrTrunc(VANext, DL, PtrMemVT); + // Store the incremented VAList to the legalized pointer SDValue APStore = DAG.getStore(Chain, DL, VANext, Addr, MachinePointerInfo(V)); @@ -5284,10 +5503,15 @@ SDValue AArch64TargetLowering::LowerFRAMEADDR(SDValue Op, SDLoc DL(Op); unsigned Depth = cast(Op.getOperand(0))->getZExtValue(); SDValue FrameAddr = - DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, VT); + DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, MVT::i64); while (Depth--) FrameAddr = DAG.getLoad(VT, DL, DAG.getEntryNode(), FrameAddr, MachinePointerInfo()); + + if (Subtarget->isTargetILP32()) + FrameAddr = DAG.getNode(ISD::AssertZext, DL, MVT::i64, FrameAddr, + DAG.getValueType(VT)); + return FrameAddr; } @@ -5306,9 +5530,9 @@ SDValue AArch64TargetLowering::LowerSPONENTRY(SDValue Op, // FIXME? Maybe this could be a TableGen attribute on some registers and // this table could be generated automatically from RegInfo. -unsigned AArch64TargetLowering::getRegisterByName(const char* RegName, EVT VT, - SelectionDAG &DAG) const { - unsigned Reg = MatchRegisterName(RegName); +Register AArch64TargetLowering:: +getRegisterByName(const char* RegName, EVT VT, const MachineFunction &MF) const { + Register Reg = MatchRegisterName(RegName); if (AArch64::X1 <= Reg && Reg <= AArch64::X28) { const MCRegisterInfo *MRI = Subtarget->getRegisterInfo(); unsigned DwarfRegNum = MRI->getDwarfRegNum(Reg, false); @@ -5653,6 +5877,21 @@ const char *AArch64TargetLowering::LowerXConstraint(EVT ConstraintVT) const { return "r"; } +enum PredicateConstraint { + Upl, + Upa, + Invalid +}; + +static PredicateConstraint parsePredicateConstraint(StringRef Constraint) { + PredicateConstraint P = PredicateConstraint::Invalid; + if (Constraint == "Upa") + P = PredicateConstraint::Upa; + if (Constraint == "Upl") + P = PredicateConstraint::Upl; + return P; +} + /// getConstraintType - Given a constraint letter, return the type of /// constraint it is for this target. 
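In the LowerVAARG hunk above, an over-aligned va_arg slot bumps the list pointer up to the next multiple of the (power-of-two) alignment before the load; the visible ADD of Align - 1 is the first half of the usual add-then-mask rounding. A minimal sketch of that rounding, with a hypothetical helper name:

    #include <cstdint>

    // Round VAList up to a multiple of Align; Align must be a power of two,
    // as the assert in the code above requires.
    uint64_t alignVAList(uint64_t VAList, uint64_t Align) {
      return (VAList + Align - 1) & ~(Align - 1);
    }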
AArch64TargetLowering::ConstraintType @@ -5661,19 +5900,30 @@ AArch64TargetLowering::getConstraintType(StringRef Constraint) const { switch (Constraint[0]) { default: break; - case 'z': - return C_Other; case 'x': case 'w': + case 'y': return C_RegisterClass; // An address with a single base register. Due to the way we // currently handle addresses it is the same as 'r'. case 'Q': return C_Memory; + case 'I': + case 'J': + case 'K': + case 'L': + case 'M': + case 'N': + case 'Y': + case 'Z': + return C_Immediate; + case 'z': case 'S': // A symbolic address return C_Other; } - } + } else if (parsePredicateConstraint(Constraint) != + PredicateConstraint::Invalid) + return C_RegisterClass; return TargetLowering::getConstraintType(Constraint); } @@ -5697,12 +5947,17 @@ AArch64TargetLowering::getSingleConstraintMatchWeight( break; case 'x': case 'w': + case 'y': if (type->isFloatingPointTy() || type->isVectorTy()) weight = CW_Register; break; case 'z': weight = CW_Constant; break; + case 'U': + if (parsePredicateConstraint(constraint) != PredicateConstraint::Invalid) + weight = CW_Register; + break; } return weight; } @@ -5719,6 +5974,8 @@ AArch64TargetLowering::getRegForInlineAsmConstraint( case 'w': if (!Subtarget->hasFPARMv8()) break; + if (VT.isScalableVector()) + return std::make_pair(0U, &AArch64::ZPRRegClass); if (VT.getSizeInBits() == 16) return std::make_pair(0U, &AArch64::FPR16RegClass); if (VT.getSizeInBits() == 32) @@ -5733,9 +5990,25 @@ AArch64TargetLowering::getRegForInlineAsmConstraint( case 'x': if (!Subtarget->hasFPARMv8()) break; + if (VT.isScalableVector()) + return std::make_pair(0U, &AArch64::ZPR_4bRegClass); if (VT.getSizeInBits() == 128) return std::make_pair(0U, &AArch64::FPR128_loRegClass); break; + case 'y': + if (!Subtarget->hasFPARMv8()) + break; + if (VT.isScalableVector()) + return std::make_pair(0U, &AArch64::ZPR_3bRegClass); + break; + } + } else { + PredicateConstraint PC = parsePredicateConstraint(Constraint); + if (PC != PredicateConstraint::Invalid) { + assert(VT.isScalableVector()); + bool restricted = (PC == PredicateConstraint::Upl); + return restricted ? std::make_pair(0U, &AArch64::PPR_3bRegClass) + : std::make_pair(0U, &AArch64::PPRRegClass); } } if (StringRef("{cc}").equals_lower(Constraint)) @@ -6279,6 +6552,8 @@ static bool isREVMask(ArrayRef M, EVT VT, unsigned BlockSize) { static bool isZIPMask(ArrayRef M, EVT VT, unsigned &WhichResult) { unsigned NumElts = VT.getVectorNumElements(); + if (NumElts % 2 != 0) + return false; WhichResult = (M[0] == 0 ? 0 : 1); unsigned Idx = WhichResult * NumElts / 2; for (unsigned i = 0; i != NumElts; i += 2) { @@ -6446,8 +6721,7 @@ static SDValue tryFormConcatFromShuffle(SDValue Op, SelectionDAG &DAG) { if (!isConcatMask(Mask, VT, SplitV0)) return SDValue(); - EVT CastVT = EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), - VT.getVectorNumElements() / 2); + EVT CastVT = VT.getHalfNumVectorElementsVT(*DAG.getContext()); if (SplitV0) { V0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V0, DAG.getConstant(0, DL, MVT::i64)); @@ -6790,6 +7064,41 @@ SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, return GenerateTBL(Op, ShuffleMask, DAG); } +SDValue AArch64TargetLowering::LowerSPLAT_VECTOR(SDValue Op, + SelectionDAG &DAG) const { + SDLoc dl(Op); + EVT VT = Op.getValueType(); + EVT ElemVT = VT.getScalarType(); + + SDValue SplatVal = Op.getOperand(0); + + // Extend input splat value where needed to fit into a GPR (32b or 64b only) + // FPRs don't have this restriction. 
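The isZIPMask early-out added above rejects odd element counts because the loop walks the shuffle mask two entries at a time. A self-contained sketch of the zip shuffle shape being matched (simplified: the in-tree check also accepts undef, i.e. negative, mask entries):

    #include <vector>

    // ZIP1 interleaves the low halves of the two sources and ZIP2 the high
    // halves, so the mask must look like { Idx, Idx+N, Idx+1, Idx+1+N, ... }
    // with Idx = 0 (ZIP1) or N/2 (ZIP2).
    bool isZipMask(const std::vector<int> &M, unsigned NumElts) {
      if (NumElts % 2 != 0 || M.size() != NumElts)
        return false;
      unsigned Which = (M[0] == 0) ? 0 : 1;
      unsigned Idx = Which * NumElts / 2;
      for (unsigned i = 0; i != NumElts; i += 2, ++Idx)
        if ((unsigned)M[i] != Idx || (unsigned)M[i + 1] != Idx + NumElts)
          return false;
      return true;
    }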
+ switch (ElemVT.getSimpleVT().SimpleTy) { + case MVT::i8: + case MVT::i16: + SplatVal = DAG.getAnyExtOrTrunc(SplatVal, dl, MVT::i32); + break; + case MVT::i64: + SplatVal = DAG.getAnyExtOrTrunc(SplatVal, dl, MVT::i64); + break; + case MVT::i32: + // Fine as is + break; + // TODO: we can support splats of i1s and float types, but haven't added + // patterns yet. + case MVT::i1: + case MVT::f16: + case MVT::f32: + case MVT::f64: + default: + llvm_unreachable("Unsupported SPLAT_VECTOR input operand type"); + break; + } + + return DAG.getNode(AArch64ISD::DUP, dl, VT, SplatVal); +} + static bool resolveBuildVector(BuildVectorSDNode *BVN, APInt &CnstBits, APInt &UndefBits) { EVT VT = BVN->getValueType(0); @@ -8063,7 +8372,7 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1); Info.offset = 0; - Info.align = 0; + Info.align.reset(); // volatile loads with NEON intrinsics not supported Info.flags = MachineMemOperand::MOLoad; return true; @@ -8089,7 +8398,7 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1); Info.offset = 0; - Info.align = 0; + Info.align.reset(); // volatile stores with NEON intrinsics not supported Info.flags = MachineMemOperand::MOStore; return true; @@ -8101,7 +8410,7 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.memVT = MVT::getVT(PtrTy->getElementType()); Info.ptrVal = I.getArgOperand(0); Info.offset = 0; - Info.align = DL.getABITypeAlignment(PtrTy->getElementType()); + Info.align = MaybeAlign(DL.getABITypeAlignment(PtrTy->getElementType())); Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile; return true; } @@ -8112,7 +8421,7 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.memVT = MVT::getVT(PtrTy->getElementType()); Info.ptrVal = I.getArgOperand(1); Info.offset = 0; - Info.align = DL.getABITypeAlignment(PtrTy->getElementType()); + Info.align = MaybeAlign(DL.getABITypeAlignment(PtrTy->getElementType())); Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile; return true; } @@ -8122,7 +8431,7 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.memVT = MVT::i128; Info.ptrVal = I.getArgOperand(0); Info.offset = 0; - Info.align = 16; + Info.align = Align(16); Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile; return true; case Intrinsic::aarch64_stlxp: @@ -8131,7 +8440,7 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.memVT = MVT::i128; Info.ptrVal = I.getArgOperand(2); Info.offset = 0; - Info.align = 16; + Info.align = Align(16); Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile; return true; default: @@ -8278,7 +8587,7 @@ bool AArch64TargetLowering::isExtFreeImpl(const Instruction *Ext) const { // Get the shift amount based on the scaling factor: // log2(sizeof(IdxTy)) - log2(8). uint64_t ShiftAmt = - countTrailingZeros(DL.getTypeStoreSizeInBits(IdxTy)) - 3; + countTrailingZeros(DL.getTypeStoreSizeInBits(IdxTy).getFixedSize()) - 3; // Is the constant foldable in the shift of the addressing mode? // I.e., shift amount is between 1 and 4 inclusive. 
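The scaling-factor computation above derives the addressing-mode shift from the index type's store size, i.e. log2(bits) - 3. A standalone sketch, assuming a power-of-two size of at least 8 bits:

    #include <cstdint>

    // countTrailingZeros of a power of two is its log2, so a 64-bit index
    // type yields 6 - 3 = 3, the log2 of the element size in bytes.
    unsigned shiftForIndexType(uint64_t StoreSizeInBits) {
      unsigned Log2Bits = 0;
      while (StoreSizeInBits > 1) {
        StoreSizeInBits >>= 1;
        ++Log2Bits;
      }
      return Log2Bits - 3;
    }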
if (ShiftAmt == 0 || ShiftAmt > 4) @@ -8739,6 +9048,39 @@ EVT AArch64TargetLowering::getOptimalMemOpType( return MVT::Other; } +LLT AArch64TargetLowering::getOptimalMemOpLLT( + uint64_t Size, unsigned DstAlign, unsigned SrcAlign, bool IsMemset, + bool ZeroMemset, bool MemcpyStrSrc, + const AttributeList &FuncAttributes) const { + bool CanImplicitFloat = + !FuncAttributes.hasFnAttribute(Attribute::NoImplicitFloat); + bool CanUseNEON = Subtarget->hasNEON() && CanImplicitFloat; + bool CanUseFP = Subtarget->hasFPARMv8() && CanImplicitFloat; + // Only use AdvSIMD to implement memset of 32-byte and above. It would have + // taken one instruction to materialize the v2i64 zero and one store (with + // restrictive addressing mode). Just do i64 stores. + bool IsSmallMemset = IsMemset && Size < 32; + auto AlignmentIsAcceptable = [&](EVT VT, unsigned AlignCheck) { + if (memOpAlign(SrcAlign, DstAlign, AlignCheck)) + return true; + bool Fast; + return allowsMisalignedMemoryAccesses(VT, 0, 1, MachineMemOperand::MONone, + &Fast) && + Fast; + }; + + if (CanUseNEON && IsMemset && !IsSmallMemset && + AlignmentIsAcceptable(MVT::v2i64, 16)) + return LLT::vector(2, 64); + if (CanUseFP && !IsSmallMemset && AlignmentIsAcceptable(MVT::f128, 16)) + return LLT::scalar(128); + if (Size >= 8 && AlignmentIsAcceptable(MVT::i64, 8)) + return LLT::scalar(64); + if (Size >= 4 && AlignmentIsAcceptable(MVT::i32, 4)) + return LLT::scalar(32); + return LLT(); +} + // 12-bit optionally shifted immediates are legal for adds. bool AArch64TargetLowering::isLegalAddImmediate(int64_t Immed) const { if (Immed == std::numeric_limits::min()) { @@ -10065,6 +10407,14 @@ static SDValue tryCombineShiftImm(unsigned IID, SDNode *N, SelectionDAG &DAG) { Opcode = AArch64ISD::SQSHLU_I; IsRightShift = false; break; + case Intrinsic::aarch64_neon_sshl: + case Intrinsic::aarch64_neon_ushl: + // For positive shift amounts we can use SHL, as ushl/sshl perform a regular + // left shift for positive shift amounts. Below, we only replace the current + // node with VSHL, if this condition is met. + Opcode = AArch64ISD::VSHL; + IsRightShift = false; + break; } if (IsRightShift && ShiftAmount <= -1 && ShiftAmount >= -(int)ElemBits) { @@ -10151,6 +10501,8 @@ static SDValue performIntrinsicCombine(SDNode *N, case Intrinsic::aarch64_neon_sqshlu: case Intrinsic::aarch64_neon_srshl: case Intrinsic::aarch64_neon_urshl: + case Intrinsic::aarch64_neon_sshl: + case Intrinsic::aarch64_neon_ushl: return tryCombineShiftImm(IID, N, DAG); case Intrinsic::aarch64_crc32b: case Intrinsic::aarch64_crc32cb: @@ -10482,10 +10834,10 @@ static SDValue splitStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, return ReplacedSplat; SDLoc DL(S); - unsigned NumElts = VT.getVectorNumElements() / 2; + // Split VT into two. - EVT HalfVT = - EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), NumElts); + EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext()); + unsigned NumElts = HalfVT.getVectorNumElements(); SDValue SubVector0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal, DAG.getConstant(0, DL, MVT::i64)); SDValue SubVector1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal, @@ -10567,7 +10919,7 @@ static SDValue performPostLD1Combine(SDNode *N, // are predecessors to each other or the Vector. 
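The post-increment combine above guards against creating cycles by walking predecessor edges with the visited set and worklist declared in the lines that follow. A generic, self-contained sketch of that pattern, with plain structs rather than SDNodes:

    #include <set>
    #include <vector>

    struct Node { std::vector<Node *> Preds; };

    // Starting from the seed nodes, follow predecessor edges and report
    // whether Target is reachable; the Visited set keeps the walk finite.
    bool reachesViaPreds(std::vector<Node *> Worklist, const Node *Target) {
      std::set<const Node *> Visited;
      while (!Worklist.empty()) {
        Node *N = Worklist.back();
        Worklist.pop_back();
        if (!Visited.insert(N).second)
          continue;
        if (N == Target)
          return true;
        for (Node *P : N->Preds)
          Worklist.push_back(P);
      }
      return false;
    }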
SmallPtrSet Visited; SmallVector Worklist; - Visited.insert(N); + Visited.insert(Addr.getNode()); Worklist.push_back(User); Worklist.push_back(LD); Worklist.push_back(Vector.getNode()); @@ -11983,6 +12335,27 @@ bool AArch64TargetLowering::isMaskAndCmp0FoldingBeneficial( return Mask->getValue().isPowerOf2(); } +bool AArch64TargetLowering:: + shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd( + SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y, + unsigned OldShiftOpcode, unsigned NewShiftOpcode, + SelectionDAG &DAG) const { + // Does baseline recommend not to perform the fold by default? + if (!TargetLowering::shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd( + X, XC, CC, Y, OldShiftOpcode, NewShiftOpcode, DAG)) + return false; + // Else, if this is a vector shift, prefer 'shl'. + return X.getValueType().isScalarInteger() || NewShiftOpcode == ISD::SHL; +} + +bool AArch64TargetLowering::shouldExpandShift(SelectionDAG &DAG, + SDNode *N) const { + if (DAG.getMachineFunction().getFunction().hasMinSize() && + !Subtarget->isTargetWindows()) + return false; + return true; +} + void AArch64TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const { // Update IsSplitCSR in AArch64unctionInfo. AArch64FunctionInfo *AFI = Entry->getParent()->getInfo(); @@ -12009,7 +12382,7 @@ void AArch64TargetLowering::insertCopiesSplitCSR( else llvm_unreachable("Unexpected register class in CSRsViaCopy!"); - unsigned NewVR = MRI->createVirtualRegister(RC); + Register NewVR = MRI->createVirtualRegister(RC); // Create copy from CSR to a virtual register. // FIXME: this currently does not emit CFI pseudo-instructions, it works // fine for CXX_FAST_TLS since the C++-style TLS access functions should be diff --git a/lib/Target/AArch64/AArch64ISelLowering.h b/lib/Target/AArch64/AArch64ISelLowering.h index 4421c31f65c9..00fa96bc4e6d 100644 --- a/lib/Target/AArch64/AArch64ISelLowering.h +++ b/lib/Target/AArch64/AArch64ISelLowering.h @@ -191,6 +191,11 @@ enum NodeType : unsigned { FRECPE, FRECPS, FRSQRTE, FRSQRTS, + SUNPKHI, + SUNPKLO, + UUNPKHI, + UUNPKLO, + // NEON Load/Store with post-increment base updates LD2post = ISD::FIRST_TARGET_MEMORY_OPCODE, LD3post, @@ -261,6 +266,14 @@ public: const SelectionDAG &DAG, unsigned Depth = 0) const override; + MVT getPointerTy(const DataLayout &DL, uint32_t AS = 0) const override { + // Returning i64 unconditionally here (i.e. even for ILP32) means that the + // *DAG* representation of pointers will always be 64-bits. They will be + // truncated and extended when transferred to memory, but the 64-bit DAG + // allows us to use AArch64's addressing modes much more easily. + return MVT::getIntegerVT(64); + } + bool targetShrinkDemandedConstant(SDValue Op, const APInt &Demanded, TargetLoweringOpt &TLO) const override; @@ -272,6 +285,10 @@ public: EVT VT, unsigned AddrSpace = 0, unsigned Align = 1, MachineMemOperand::Flags Flags = MachineMemOperand::MONone, bool *Fast = nullptr) const override; + /// LLT variant. + bool allowsMisalignedMemoryAccesses( + LLT Ty, unsigned AddrSpace, unsigned Align, MachineMemOperand::Flags Flags, + bool *Fast = nullptr) const override; /// Provide custom lowering hooks for some operations. 
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override; @@ -358,6 +375,10 @@ public: bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc, const AttributeList &FuncAttributes) const override; + LLT getOptimalMemOpLLT(uint64_t Size, unsigned DstAlign, unsigned SrcAlign, + bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc, + const AttributeList &FuncAttributes) const override; + /// Return true if the addressing mode represented by AM is legal for this /// target, for a load/store of the specified type. bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, @@ -480,11 +501,12 @@ public: return VT.getSizeInBits() >= 64; // vector 'bic' } - bool shouldExpandShift(SelectionDAG &DAG, SDNode *N) const override { - if (DAG.getMachineFunction().getFunction().hasMinSize()) - return false; - return true; - } + bool shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd( + SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y, + unsigned OldShiftOpcode, unsigned NewShiftOpcode, + SelectionDAG &DAG) const override; + + bool shouldExpandShift(SelectionDAG &DAG, SDNode *N) const override; bool shouldTransformSignedTruncationCheck(EVT XVT, unsigned KeptBits) const override { @@ -655,6 +677,7 @@ private: SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSPLAT_VECTOR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVectorSRA_SRL_SHL(SDValue Op, SelectionDAG &DAG) const; SDValue LowerShiftLeftParts(SDValue Op, SelectionDAG &DAG) const; @@ -690,8 +713,8 @@ private: unsigned combineRepeatedFPDivisors() const override; ConstraintType getConstraintType(StringRef Constraint) const override; - unsigned getRegisterByName(const char* RegName, EVT VT, - SelectionDAG &DAG) const override; + Register getRegisterByName(const char* RegName, EVT VT, + const MachineFunction &MF) const override; /// Examine constraint string and operand type and determine a weight value. /// The operand object must already have been set up with the operand type. 
diff --git a/lib/Target/AArch64/AArch64InstrAtomics.td b/lib/Target/AArch64/AArch64InstrAtomics.td index e22cb44d81ae..459b53923625 100644 --- a/lib/Target/AArch64/AArch64InstrAtomics.td +++ b/lib/Target/AArch64/AArch64InstrAtomics.td @@ -204,19 +204,27 @@ def : Pat<(relaxed_store def ldxr_1 : PatFrag<(ops node:$ptr), (int_aarch64_ldxr node:$ptr), [{ return cast(N)->getMemoryVT() == MVT::i8; -}]>; +}]> { + let GISelPredicateCode = [{ return isLoadStoreOfNumBytes(MI, 1); }]; +} def ldxr_2 : PatFrag<(ops node:$ptr), (int_aarch64_ldxr node:$ptr), [{ return cast(N)->getMemoryVT() == MVT::i16; -}]>; +}]> { + let GISelPredicateCode = [{ return isLoadStoreOfNumBytes(MI, 2); }]; +} def ldxr_4 : PatFrag<(ops node:$ptr), (int_aarch64_ldxr node:$ptr), [{ return cast(N)->getMemoryVT() == MVT::i32; -}]>; +}]> { + let GISelPredicateCode = [{ return isLoadStoreOfNumBytes(MI, 4); }]; +} def ldxr_8 : PatFrag<(ops node:$ptr), (int_aarch64_ldxr node:$ptr), [{ return cast(N)->getMemoryVT() == MVT::i64; -}]>; +}]> { + let GISelPredicateCode = [{ return isLoadStoreOfNumBytes(MI, 8); }]; +} def : Pat<(ldxr_1 GPR64sp:$addr), (SUBREG_TO_REG (i64 0), (LDXRB GPR64sp:$addr), sub_32)>; @@ -237,19 +245,27 @@ def : Pat<(and (ldxr_4 GPR64sp:$addr), 0xffffffff), def ldaxr_1 : PatFrag<(ops node:$ptr), (int_aarch64_ldaxr node:$ptr), [{ return cast(N)->getMemoryVT() == MVT::i8; -}]>; +}]> { + let GISelPredicateCode = [{ return isLoadStoreOfNumBytes(MI, 1); }]; +} def ldaxr_2 : PatFrag<(ops node:$ptr), (int_aarch64_ldaxr node:$ptr), [{ return cast(N)->getMemoryVT() == MVT::i16; -}]>; +}]> { + let GISelPredicateCode = [{ return isLoadStoreOfNumBytes(MI, 2); }]; +} def ldaxr_4 : PatFrag<(ops node:$ptr), (int_aarch64_ldaxr node:$ptr), [{ return cast(N)->getMemoryVT() == MVT::i32; -}]>; +}]> { + let GISelPredicateCode = [{ return isLoadStoreOfNumBytes(MI, 4); }]; +} def ldaxr_8 : PatFrag<(ops node:$ptr), (int_aarch64_ldaxr node:$ptr), [{ return cast(N)->getMemoryVT() == MVT::i64; -}]>; +}]> { + let GISelPredicateCode = [{ return isLoadStoreOfNumBytes(MI, 8); }]; +} def : Pat<(ldaxr_1 GPR64sp:$addr), (SUBREG_TO_REG (i64 0), (LDAXRB GPR64sp:$addr), sub_32)>; @@ -271,22 +287,30 @@ def : Pat<(and (ldaxr_4 GPR64sp:$addr), 0xffffffff), def stxr_1 : PatFrag<(ops node:$val, node:$ptr), (int_aarch64_stxr node:$val, node:$ptr), [{ return cast(N)->getMemoryVT() == MVT::i8; -}]>; +}]> { + let GISelPredicateCode = [{ return isLoadStoreOfNumBytes(MI, 1); }]; +} def stxr_2 : PatFrag<(ops node:$val, node:$ptr), (int_aarch64_stxr node:$val, node:$ptr), [{ return cast(N)->getMemoryVT() == MVT::i16; -}]>; +}]> { + let GISelPredicateCode = [{ return isLoadStoreOfNumBytes(MI, 2); }]; +} def stxr_4 : PatFrag<(ops node:$val, node:$ptr), (int_aarch64_stxr node:$val, node:$ptr), [{ return cast(N)->getMemoryVT() == MVT::i32; -}]>; +}]> { + let GISelPredicateCode = [{ return isLoadStoreOfNumBytes(MI, 4); }]; +} def stxr_8 : PatFrag<(ops node:$val, node:$ptr), (int_aarch64_stxr node:$val, node:$ptr), [{ return cast(N)->getMemoryVT() == MVT::i64; -}]>; +}]> { + let GISelPredicateCode = [{ return isLoadStoreOfNumBytes(MI, 8); }]; +} def : Pat<(stxr_1 GPR64:$val, GPR64sp:$addr), @@ -317,22 +341,30 @@ def : Pat<(stxr_4 (and GPR64:$val, 0xffffffff), GPR64sp:$addr), def stlxr_1 : PatFrag<(ops node:$val, node:$ptr), (int_aarch64_stlxr node:$val, node:$ptr), [{ return cast(N)->getMemoryVT() == MVT::i8; -}]>; +}]> { + let GISelPredicateCode = [{ return isLoadStoreOfNumBytes(MI, 1); }]; +} def stlxr_2 : PatFrag<(ops node:$val, node:$ptr), (int_aarch64_stlxr 
node:$val, node:$ptr), [{ return cast(N)->getMemoryVT() == MVT::i16; -}]>; +}]> { + let GISelPredicateCode = [{ return isLoadStoreOfNumBytes(MI, 2); }]; +} def stlxr_4 : PatFrag<(ops node:$val, node:$ptr), (int_aarch64_stlxr node:$val, node:$ptr), [{ return cast(N)->getMemoryVT() == MVT::i32; -}]>; +}]> { + let GISelPredicateCode = [{ return isLoadStoreOfNumBytes(MI, 4); }]; +} def stlxr_8 : PatFrag<(ops node:$val, node:$ptr), (int_aarch64_stlxr node:$val, node:$ptr), [{ return cast(N)->getMemoryVT() == MVT::i64; -}]>; +}]> { + let GISelPredicateCode = [{ return isLoadStoreOfNumBytes(MI, 8); }]; +} def : Pat<(stlxr_1 GPR64:$val, GPR64sp:$addr), @@ -422,4 +454,3 @@ let Predicates = [HasLSE] in { defm : LDOPregister_patterns_mod<"LDADD", "atomic_load_sub", "SUB">; defm : LDOPregister_patterns_mod<"LDCLR", "atomic_load_and", "ORN">; } - diff --git a/lib/Target/AArch64/AArch64InstrFormats.td b/lib/Target/AArch64/AArch64InstrFormats.td index d619137b55c5..f555e4123307 100644 --- a/lib/Target/AArch64/AArch64InstrFormats.td +++ b/lib/Target/AArch64/AArch64InstrFormats.td @@ -480,76 +480,40 @@ def BranchTarget14Operand : BranchTarget<14>; def BranchTarget26Operand : BranchTarget<26>; def PCRelLabel19Operand : PCRelLabel<19>; -def MovZSymbolG3AsmOperand : AsmOperandClass { - let Name = "MovZSymbolG3"; +def MovWSymbolG3AsmOperand : AsmOperandClass { + let Name = "MovWSymbolG3"; let RenderMethod = "addImmOperands"; } -def movz_symbol_g3 : Operand { - let ParserMatchClass = MovZSymbolG3AsmOperand; +def movw_symbol_g3 : Operand { + let ParserMatchClass = MovWSymbolG3AsmOperand; } -def MovZSymbolG2AsmOperand : AsmOperandClass { - let Name = "MovZSymbolG2"; +def MovWSymbolG2AsmOperand : AsmOperandClass { + let Name = "MovWSymbolG2"; let RenderMethod = "addImmOperands"; } -def movz_symbol_g2 : Operand { - let ParserMatchClass = MovZSymbolG2AsmOperand; +def movw_symbol_g2 : Operand { + let ParserMatchClass = MovWSymbolG2AsmOperand; } -def MovZSymbolG1AsmOperand : AsmOperandClass { - let Name = "MovZSymbolG1"; +def MovWSymbolG1AsmOperand : AsmOperandClass { + let Name = "MovWSymbolG1"; let RenderMethod = "addImmOperands"; } -def movz_symbol_g1 : Operand { - let ParserMatchClass = MovZSymbolG1AsmOperand; +def movw_symbol_g1 : Operand { + let ParserMatchClass = MovWSymbolG1AsmOperand; } -def MovZSymbolG0AsmOperand : AsmOperandClass { - let Name = "MovZSymbolG0"; +def MovWSymbolG0AsmOperand : AsmOperandClass { + let Name = "MovWSymbolG0"; let RenderMethod = "addImmOperands"; } -def movz_symbol_g0 : Operand { - let ParserMatchClass = MovZSymbolG0AsmOperand; -} - -def MovKSymbolG3AsmOperand : AsmOperandClass { - let Name = "MovKSymbolG3"; - let RenderMethod = "addImmOperands"; -} - -def movk_symbol_g3 : Operand { - let ParserMatchClass = MovKSymbolG3AsmOperand; -} - -def MovKSymbolG2AsmOperand : AsmOperandClass { - let Name = "MovKSymbolG2"; - let RenderMethod = "addImmOperands"; -} - -def movk_symbol_g2 : Operand { - let ParserMatchClass = MovKSymbolG2AsmOperand; -} - -def MovKSymbolG1AsmOperand : AsmOperandClass { - let Name = "MovKSymbolG1"; - let RenderMethod = "addImmOperands"; -} - -def movk_symbol_g1 : Operand { - let ParserMatchClass = MovKSymbolG1AsmOperand; -} - -def MovKSymbolG0AsmOperand : AsmOperandClass { - let Name = "MovKSymbolG0"; - let RenderMethod = "addImmOperands"; -} - -def movk_symbol_g0 : Operand { - let ParserMatchClass = MovKSymbolG0AsmOperand; +def movw_symbol_g0 : Operand { + let ParserMatchClass = MovWSymbolG0AsmOperand; } class fixedpoint_i32 @@ -673,6 +637,11 @@ def 
logical_imm64_XFORM : SDNodeXFormgetTargetConstant(enc, SDLoc(N), MVT::i32); }]>; +def gi_logical_imm32_XFORM : GICustomOperandRenderer<"renderLogicalImm32">, + GISDNodeXFormEquiv; +def gi_logical_imm64_XFORM : GICustomOperandRenderer<"renderLogicalImm64">, + GISDNodeXFormEquiv; + let DiagnosticType = "LogicalSecondSource" in { def LogicalImm32Operand : AsmOperandClass { let Name = "LogicalImm32"; @@ -714,12 +683,15 @@ def logical_imm64_not : Operand { let ParserMatchClass = LogicalImm64NotOperand; } -// imm0_65535 predicate - True if the immediate is in the range [0,65535]. -def imm0_65535 : Operand, ImmLeaf, PrintMethod = "printImmHex" in { +def i32_imm0_65535 : Operand, TImmLeaf { - let ParserMatchClass = AsmImmRange<0, 65535>; - let PrintMethod = "printImmHex"; +}]>; + +def i64_imm0_65535 : Operand, TImmLeaf; } // imm0_255 predicate - True if the immediate is in the range [0,255]. @@ -815,6 +787,14 @@ class arith_shifted_reg def arith_shifted_reg32 : arith_shifted_reg; def arith_shifted_reg64 : arith_shifted_reg; +def gi_arith_shifted_reg32 : + GIComplexOperandMatcher, + GIComplexPatternEquiv; + +def gi_arith_shifted_reg64 : + GIComplexOperandMatcher, + GIComplexPatternEquiv; + // An arithmetic shifter operand: // {7-6} - shift type: 00 = lsl, 01 = lsr, 10 = asr, 11 = ror // {5-0} - imm6 @@ -837,6 +817,14 @@ class logical_shifted_reg def logical_shifted_reg32 : logical_shifted_reg; def logical_shifted_reg64 : logical_shifted_reg; +def gi_logical_shifted_reg32 : + GIComplexOperandMatcher, + GIComplexPatternEquiv; + +def gi_logical_shifted_reg64 : + GIComplexOperandMatcher, + GIComplexPatternEquiv; + // A logical vector shifter operand: // {7-6} - shift type: 00 = lsl // {5-0} - imm6: #0, #8, #16, or #24 @@ -918,6 +906,14 @@ class neg_addsub_shifted_imm def neg_addsub_shifted_imm32 : neg_addsub_shifted_imm; def neg_addsub_shifted_imm64 : neg_addsub_shifted_imm; +def gi_neg_addsub_shifted_imm32 : + GIComplexOperandMatcher, + GIComplexPatternEquiv; + +def gi_neg_addsub_shifted_imm64 : + GIComplexOperandMatcher, + GIComplexPatternEquiv; + // An extend operand: // {5-3} - extend type // {2-0} - imm3 @@ -948,6 +944,21 @@ class arith_extended_reg32to64 : Operand, let MIOperandInfo = (ops GPR32, arith_extend64); } +def arith_extended_reg32_i32 : arith_extended_reg32; +def gi_arith_extended_reg32_i32 : + GIComplexOperandMatcher, + GIComplexPatternEquiv; + +def arith_extended_reg32_i64 : arith_extended_reg32; +def gi_arith_extended_reg32_i64 : + GIComplexOperandMatcher, + GIComplexPatternEquiv; + +def arith_extended_reg32to64_i64 : arith_extended_reg32to64; +def gi_arith_extended_reg32to64_i64 : + GIComplexOperandMatcher, + GIComplexPatternEquiv; + // Floating-point immediate. 
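The shifted-register operand classes above document their encoding as bits {7-6} for the shift type and {5-0} for the 6-bit amount. A small C++ sketch of packing such an operand, for illustration only (this is not how TableGen emits the field):

    // Shift type: 00 = lsl, 01 = lsr, 10 = asr, 11 = ror; amount is 6 bits.
    unsigned encodeShiftedRegOperand(unsigned ShiftType, unsigned Amount) {
      return ((ShiftType & 0x3) << 6) | (Amount & 0x3f);
    }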
def fpimm16 : Operand, FPImmLeaf : AsmOperandClass { let RenderMethod = "addVectorIndexOperands"; } -class AsmVectorIndexOpnd - : Operand, ImmLeaf { +class AsmVectorIndexOpnd + : Operand, ImmLeaf { let ParserMatchClass = mc; let PrintMethod = "printVectorIndex"; } @@ -1012,11 +1023,17 @@ def VectorIndexHOperand : AsmVectorIndex<0, 7>; def VectorIndexSOperand : AsmVectorIndex<0, 3>; def VectorIndexDOperand : AsmVectorIndex<0, 1>; -def VectorIndex1 : AsmVectorIndexOpnd; -def VectorIndexB : AsmVectorIndexOpnd; -def VectorIndexH : AsmVectorIndexOpnd; -def VectorIndexS : AsmVectorIndexOpnd; -def VectorIndexD : AsmVectorIndexOpnd; +def VectorIndex1 : AsmVectorIndexOpnd; +def VectorIndexB : AsmVectorIndexOpnd; +def VectorIndexH : AsmVectorIndexOpnd; +def VectorIndexS : AsmVectorIndexOpnd; +def VectorIndexD : AsmVectorIndexOpnd; + +def VectorIndex132b : AsmVectorIndexOpnd; +def VectorIndexB32b : AsmVectorIndexOpnd; +def VectorIndexH32b : AsmVectorIndexOpnd; +def VectorIndexS32b : AsmVectorIndexOpnd; +def VectorIndexD32b : AsmVectorIndexOpnd; def SVEVectorIndexExtDupBOperand : AsmVectorIndex<0, 63, "SVE">; def SVEVectorIndexExtDupHOperand : AsmVectorIndex<0, 31, "SVE">; @@ -1025,15 +1042,15 @@ def SVEVectorIndexExtDupDOperand : AsmVectorIndex<0, 7, "SVE">; def SVEVectorIndexExtDupQOperand : AsmVectorIndex<0, 3, "SVE">; def sve_elm_idx_extdup_b - : AsmVectorIndexOpnd; + : AsmVectorIndexOpnd; def sve_elm_idx_extdup_h - : AsmVectorIndexOpnd; + : AsmVectorIndexOpnd; def sve_elm_idx_extdup_s - : AsmVectorIndexOpnd; + : AsmVectorIndexOpnd; def sve_elm_idx_extdup_d - : AsmVectorIndexOpnd; + : AsmVectorIndexOpnd; def sve_elm_idx_extdup_q - : AsmVectorIndexOpnd; + : AsmVectorIndexOpnd; // 8-bit immediate for AdvSIMD where 64-bit values of the form: // aaaaaaaa bbbbbbbb cccccccc dddddddd eeeeeeee ffffffff gggggggg hhhhhhhh @@ -1082,6 +1099,45 @@ class RtSystemI let Inst{4-0} = Rt; } +// System instructions for transactional memory extension +class TMBaseSystemI CRm, bits<3> op2, dag oops, dag iops, + string asm, string operands, list pattern> + : BaseSystemI, + Sched<[WriteSys]> { + let Inst{20-12} = 0b000110011; + let Inst{11-8} = CRm; + let Inst{7-5} = op2; + let DecoderMethod = ""; + + let mayLoad = 1; + let mayStore = 1; +} + +// System instructions for transactional memory - single input operand +class TMSystemI CRm, string asm, list pattern> + : TMBaseSystemI<0b1, CRm, 0b011, + (outs GPR64:$Rt), (ins), asm, "\t$Rt", pattern> { + bits<5> Rt; + let Inst{4-0} = Rt; +} + +// System instructions for transactional memory - no operand +class TMSystemINoOperand CRm, string asm, list pattern> + : TMBaseSystemI<0b0, CRm, 0b011, (outs), (ins), asm, "", pattern> { + let Inst{4-0} = 0b11111; +} + +// System instructions for exit from transactions +class TMSystemException op1, string asm, list pattern> + : I<(outs), (ins i64_imm0_65535:$imm), asm, "\t$imm", "", pattern>, + Sched<[WriteSys]> { + bits<16> imm; + let Inst{31-24} = 0b11010100; + let Inst{23-21} = op1; + let Inst{20-5} = imm; + let Inst{4-0} = 0b00000; +} + // Hint instructions that take both a CRm and a 3-bit immediate. 
// NOTE: ideally, this would have mayStore = 0, mayLoad = 0, but we cannot // model patterns with sufficiently fine granularity @@ -2180,11 +2236,11 @@ multiclass AddSub, mnemonic, OpNode> { + arith_extended_reg32_i32, mnemonic, OpNode> { let Inst{31} = 0; } def Xrx : BaseAddSubEReg, mnemonic, OpNode> { + arith_extended_reg32to64_i64, mnemonic, OpNode> { let Inst{31} = 1; } } @@ -2254,11 +2310,11 @@ multiclass AddSubS, mnemonic, OpNode> { + arith_extended_reg32_i32, mnemonic, OpNode> { let Inst{31} = 0; } def Xrx : BaseAddSubEReg, mnemonic, OpNode> { + arith_extended_reg32_i64, mnemonic, OpNode> { let Inst{31} = 1; } } @@ -2969,6 +3025,22 @@ def ro_Xindexed32 : ComplexPattern", []>; def ro_Xindexed64 : ComplexPattern", []>; def ro_Xindexed128 : ComplexPattern", []>; +def gi_ro_Xindexed8 : + GIComplexOperandMatcher">, + GIComplexPatternEquiv; +def gi_ro_Xindexed16 : + GIComplexOperandMatcher">, + GIComplexPatternEquiv; +def gi_ro_Xindexed32 : + GIComplexOperandMatcher">, + GIComplexPatternEquiv; +def gi_ro_Xindexed64 : + GIComplexOperandMatcher">, + GIComplexPatternEquiv; +def gi_ro_Xindexed128 : + GIComplexOperandMatcher">, + GIComplexPatternEquiv; + def ro_Windexed8 : ComplexPattern", []>; def ro_Windexed16 : ComplexPattern", []>; def ro_Windexed32 : ComplexPattern", []>; @@ -4086,7 +4158,7 @@ multiclass MemTagStore opc1, string insn> { let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in class ExceptionGeneration op1, bits<2> ll, string asm> - : I<(outs), (ins imm0_65535:$imm), asm, "\t$imm", "", []>, + : I<(outs), (ins i32_imm0_65535:$imm), asm, "\t$imm", "", []>, Sched<[WriteSys]> { bits<16> imm; let Inst{31-24} = 0b11010100; diff --git a/lib/Target/AArch64/AArch64InstrInfo.cpp b/lib/Target/AArch64/AArch64InstrInfo.cpp index 215e96a82d0e..5c35e5bcdd30 100644 --- a/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -32,6 +32,7 @@ #include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/IR/DebugLoc.h" #include "llvm/IR/GlobalValue.h" +#include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/MCInstrDesc.h" #include "llvm/Support/Casting.h" @@ -82,6 +83,10 @@ unsigned AArch64InstrInfo::getInstSizeInBytes(const MachineInstr &MI) const { return getInlineAsmLength(MI.getOperand(0).getSymbolName(), *MAI); } + // Meta-instructions emit no code. + if (MI.isMetaInstruction()) + return 0; + // FIXME: We currently only handle pseudoinstructions that don't get expanded // before the assembly printer. unsigned NumBytes = 0; @@ -91,12 +96,6 @@ unsigned AArch64InstrInfo::getInstSizeInBytes(const MachineInstr &MI) const { // Anything not explicitly designated otherwise is a normal 4-byte insn. NumBytes = 4; break; - case TargetOpcode::DBG_VALUE: - case TargetOpcode::EH_LABEL: - case TargetOpcode::IMPLICIT_DEF: - case TargetOpcode::KILL: - NumBytes = 0; - break; case TargetOpcode::STACKMAP: // The upper bound for a stackmap intrinsic is the full length of its shadow NumBytes = StackMapOpers(&MI).getNumPatchBytes(); @@ -416,7 +415,7 @@ unsigned AArch64InstrInfo::insertBranch( // Find the original register that VReg is copied from. 
static unsigned removeCopies(const MachineRegisterInfo &MRI, unsigned VReg) { - while (TargetRegisterInfo::isVirtualRegister(VReg)) { + while (Register::isVirtualRegister(VReg)) { const MachineInstr *DefMI = MRI.getVRegDef(VReg); if (!DefMI->isFullCopy()) return VReg; @@ -431,7 +430,7 @@ static unsigned removeCopies(const MachineRegisterInfo &MRI, unsigned VReg) { static unsigned canFoldIntoCSel(const MachineRegisterInfo &MRI, unsigned VReg, unsigned *NewVReg = nullptr) { VReg = removeCopies(MRI, VReg); - if (!TargetRegisterInfo::isVirtualRegister(VReg)) + if (!Register::isVirtualRegister(VReg)) return 0; bool Is64Bit = AArch64::GPR64allRegClass.hasSubClassEq(MRI.getRegClass(VReg)); @@ -574,7 +573,7 @@ void AArch64InstrInfo::insertSelect(MachineBasicBlock &MBB, CC = AArch64CC::NE; break; } - unsigned SrcReg = Cond[2].getReg(); + Register SrcReg = Cond[2].getReg(); if (Is64Bit) { // cmp reg, #0 is actually subs xzr, reg, #0. MRI.constrainRegClass(SrcReg, &AArch64::GPR64spRegClass); @@ -930,7 +929,7 @@ bool AArch64InstrInfo::isCoalescableExtInstr(const MachineInstr &MI, } bool AArch64InstrInfo::areMemAccessesTriviallyDisjoint( - const MachineInstr &MIa, const MachineInstr &MIb, AliasAnalysis *AA) const { + const MachineInstr &MIa, const MachineInstr &MIb) const { const TargetRegisterInfo *TRI = &getRegisterInfo(); const MachineOperand *BaseOpA = nullptr, *BaseOpB = nullptr; int64_t OffsetA = 0, OffsetB = 0; @@ -1071,8 +1070,8 @@ static bool UpdateOperandRegClass(MachineInstr &Instr) { assert(MO.isReg() && "Operand has register constraints without being a register!"); - unsigned Reg = MO.getReg(); - if (TargetRegisterInfo::isPhysicalRegister(Reg)) { + Register Reg = MO.getReg(); + if (Register::isPhysicalRegister(Reg)) { if (!OpRegCstraints->contains(Reg)) return false; } else if (!OpRegCstraints->hasSubClassEq(MRI->getRegClass(Reg)) && @@ -1472,6 +1471,8 @@ bool AArch64InstrInfo::expandPostRAPseudo(MachineInstr &MI) const { return false; MachineBasicBlock &MBB = *MI.getParent(); + auto &Subtarget = MBB.getParent()->getSubtarget(); + auto TRI = Subtarget.getRegisterInfo(); DebugLoc DL = MI.getDebugLoc(); if (MI.getOpcode() == AArch64::CATCHRET) { @@ -1497,21 +1498,32 @@ bool AArch64InstrInfo::expandPostRAPseudo(MachineInstr &MI) const { return true; } - unsigned Reg = MI.getOperand(0).getReg(); + Register Reg = MI.getOperand(0).getReg(); const GlobalValue *GV = cast((*MI.memoperands_begin())->getValue()); const TargetMachine &TM = MBB.getParent()->getTarget(); - unsigned char OpFlags = Subtarget.ClassifyGlobalReference(GV, TM); + unsigned OpFlags = Subtarget.ClassifyGlobalReference(GV, TM); const unsigned char MO_NC = AArch64II::MO_NC; if ((OpFlags & AArch64II::MO_GOT) != 0) { BuildMI(MBB, MI, DL, get(AArch64::LOADgot), Reg) .addGlobalAddress(GV, 0, OpFlags); - BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg) - .addReg(Reg, RegState::Kill) - .addImm(0) - .addMemOperand(*MI.memoperands_begin()); + if (Subtarget.isTargetILP32()) { + unsigned Reg32 = TRI->getSubReg(Reg, AArch64::sub_32); + BuildMI(MBB, MI, DL, get(AArch64::LDRWui)) + .addDef(Reg32, RegState::Dead) + .addUse(Reg, RegState::Kill) + .addImm(0) + .addMemOperand(*MI.memoperands_begin()) + .addDef(Reg, RegState::Implicit); + } else { + BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg) + .addReg(Reg, RegState::Kill) + .addImm(0) + .addMemOperand(*MI.memoperands_begin()); + } } else if (TM.getCodeModel() == CodeModel::Large) { + assert(!Subtarget.isTargetILP32() && "how can large exist in ILP32?"); BuildMI(MBB, MI, DL, 
get(AArch64::MOVZXi), Reg) .addGlobalAddress(GV, 0, AArch64II::MO_G0 | MO_NC) .addImm(0); @@ -1538,10 +1550,20 @@ bool AArch64InstrInfo::expandPostRAPseudo(MachineInstr &MI) const { BuildMI(MBB, MI, DL, get(AArch64::ADRP), Reg) .addGlobalAddress(GV, 0, OpFlags | AArch64II::MO_PAGE); unsigned char LoFlags = OpFlags | AArch64II::MO_PAGEOFF | MO_NC; - BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg) - .addReg(Reg, RegState::Kill) - .addGlobalAddress(GV, 0, LoFlags) - .addMemOperand(*MI.memoperands_begin()); + if (Subtarget.isTargetILP32()) { + unsigned Reg32 = TRI->getSubReg(Reg, AArch64::sub_32); + BuildMI(MBB, MI, DL, get(AArch64::LDRWui)) + .addDef(Reg32, RegState::Dead) + .addUse(Reg, RegState::Kill) + .addGlobalAddress(GV, 0, LoFlags) + .addMemOperand(*MI.memoperands_begin()) + .addDef(Reg, RegState::Implicit); + } else { + BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg) + .addReg(Reg, RegState::Kill) + .addGlobalAddress(GV, 0, LoFlags) + .addMemOperand(*MI.memoperands_begin()); + } } MBB.erase(MI); @@ -1581,7 +1603,7 @@ bool AArch64InstrInfo::isGPRCopy(const MachineInstr &MI) { break; case TargetOpcode::COPY: { // GPR32 copies will by lowered to ORRXrs - unsigned DstReg = MI.getOperand(0).getReg(); + Register DstReg = MI.getOperand(0).getReg(); return (AArch64::GPR32RegClass.contains(DstReg) || AArch64::GPR64RegClass.contains(DstReg)); } @@ -1611,7 +1633,7 @@ bool AArch64InstrInfo::isFPRCopy(const MachineInstr &MI) { break; case TargetOpcode::COPY: { // FPR64 copies will by lowered to ORR.16b - unsigned DstReg = MI.getOperand(0).getReg(); + Register DstReg = MI.getOperand(0).getReg(); return (AArch64::FPR64RegClass.contains(DstReg) || AArch64::FPR128RegClass.contains(DstReg)); } @@ -1917,7 +1939,7 @@ bool AArch64InstrInfo::isCandidateToMergeOrPair(const MachineInstr &MI) const { // e.g., ldr x0, [x0] // This case will never occur with an FI base. if (MI.getOperand(1).isReg()) { - unsigned BaseReg = MI.getOperand(1).getReg(); + Register BaseReg = MI.getOperand(1).getReg(); const TargetRegisterInfo *TRI = &getRegisterInfo(); if (MI.modifiesRegister(BaseReg, TRI)) return false; @@ -1928,6 +1950,17 @@ bool AArch64InstrInfo::isCandidateToMergeOrPair(const MachineInstr &MI) const { if (isLdStPairSuppressed(MI)) return false; + // Do not pair any callee-save store/reload instructions in the + // prologue/epilogue if the CFI information encoded the operations as separate + // instructions, as that will cause the size of the actual prologue to mismatch + // with the prologue size recorded in the Windows CFI. + const MCAsmInfo *MAI = MI.getMF()->getTarget().getMCAsmInfo(); + bool NeedsWinCFI = MAI->usesWindowsCFI() && + MI.getMF()->getFunction().needsUnwindTableEntry(); + if (NeedsWinCFI && (MI.getFlag(MachineInstr::FrameSetup) || + MI.getFlag(MachineInstr::FrameDestroy))) + return false; + // On some CPUs quad load/store pairs are slower than two single load/stores. 
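// Editorial sketch (not part of the patch): the new Windows-CFI guard in
// isCandidateToMergeOrPair() above, restated as a free-standing predicate with
// the reasoning spelled out. All calls are taken from the hunk itself; treat
// this as an illustration rather than the in-tree code.
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/IR/Function.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/Target/TargetMachine.h"

static bool mustNotPairForWinCFI(const llvm::MachineInstr &MI) {
  // When SEH unwind info is emitted, the unwind opcodes describe the prologue
  // and epilogue instruction by instruction. Merging two frame-setup stores,
  //   str x19, [sp, #16]
  //   str x20, [sp, #24]
  // into a single stp would shrink the actual prologue and desynchronize it
  // from the size recorded in the Windows CFI, so such instructions must not
  // be paired.
  const llvm::MCAsmInfo *MAI = MI.getMF()->getTarget().getMCAsmInfo();
  bool NeedsWinCFI = MAI->usesWindowsCFI() &&
                     MI.getMF()->getFunction().needsUnwindTableEntry();
  return NeedsWinCFI && (MI.getFlag(llvm::MachineInstr::FrameSetup) ||
                         MI.getFlag(llvm::MachineInstr::FrameDestroy));
}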
if (Subtarget.isPaired128Slow()) { switch (MI.getOpcode()) { @@ -2165,6 +2198,18 @@ bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, unsigned &Scale, MinOffset = -256; MaxOffset = 255; break; + case AArch64::LDR_PXI: + case AArch64::STR_PXI: + Scale = Width = 2; + MinOffset = -256; + MaxOffset = 255; + break; + case AArch64::LDR_ZXI: + case AArch64::STR_ZXI: + Scale = Width = 16; + MinOffset = -256; + MaxOffset = 255; + break; case AArch64::ST2GOffset: case AArch64::STZ2GOffset: Scale = 16; @@ -2350,7 +2395,7 @@ static const MachineInstrBuilder &AddSubReg(const MachineInstrBuilder &MIB, if (!SubIdx) return MIB.addReg(Reg, State); - if (TargetRegisterInfo::isPhysicalRegister(Reg)) + if (Register::isPhysicalRegister(Reg)) return MIB.addReg(TRI->getSubReg(Reg, SubIdx), State); return MIB.addReg(Reg, State, SubIdx); } @@ -2474,6 +2519,27 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB, return; } + // Copy a Predicate register by ORRing with itself. + if (AArch64::PPRRegClass.contains(DestReg) && + AArch64::PPRRegClass.contains(SrcReg)) { + assert(Subtarget.hasSVE() && "Unexpected SVE register."); + BuildMI(MBB, I, DL, get(AArch64::ORR_PPzPP), DestReg) + .addReg(SrcReg) // Pg + .addReg(SrcReg) + .addReg(SrcReg, getKillRegState(KillSrc)); + return; + } + + // Copy a Z register by ORRing with itself. + if (AArch64::ZPRRegClass.contains(DestReg) && + AArch64::ZPRRegClass.contains(SrcReg)) { + assert(Subtarget.hasSVE() && "Unexpected SVE register."); + BuildMI(MBB, I, DL, get(AArch64::ORR_ZZZ), DestReg) + .addReg(SrcReg) + .addReg(SrcReg, getKillRegState(KillSrc)); + return; + } + if (AArch64::GPR64spRegClass.contains(DestReg) && (AArch64::GPR64spRegClass.contains(SrcReg) || SrcReg == AArch64::XZR)) { if (DestReg == AArch64::SP || SrcReg == AArch64::SP) { @@ -2722,7 +2788,7 @@ static void storeRegPairToStackSlot(const TargetRegisterInfo &TRI, MachineMemOperand *MMO) { unsigned SrcReg0 = SrcReg; unsigned SrcReg1 = SrcReg; - if (TargetRegisterInfo::isPhysicalRegister(SrcReg)) { + if (Register::isPhysicalRegister(SrcReg)) { SrcReg0 = TRI.getSubReg(SrcReg, SubIdx0); SubIdx0 = 0; SrcReg1 = TRI.getSubReg(SrcReg, SubIdx1); @@ -2761,7 +2827,7 @@ void AArch64InstrInfo::storeRegToStackSlot( case 4: if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) { Opc = AArch64::STRWui; - if (TargetRegisterInfo::isVirtualRegister(SrcReg)) + if (Register::isVirtualRegister(SrcReg)) MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR32RegClass); else assert(SrcReg != AArch64::WSP); @@ -2771,7 +2837,7 @@ void AArch64InstrInfo::storeRegToStackSlot( case 8: if (AArch64::GPR64allRegClass.hasSubClassEq(RC)) { Opc = AArch64::STRXui; - if (TargetRegisterInfo::isVirtualRegister(SrcReg)) + if (Register::isVirtualRegister(SrcReg)) MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR64RegClass); else assert(SrcReg != AArch64::SP); @@ -2852,7 +2918,7 @@ static void loadRegPairFromStackSlot(const TargetRegisterInfo &TRI, unsigned DestReg0 = DestReg; unsigned DestReg1 = DestReg; bool IsUndef = true; - if (TargetRegisterInfo::isPhysicalRegister(DestReg)) { + if (Register::isPhysicalRegister(DestReg)) { DestReg0 = TRI.getSubReg(DestReg, SubIdx0); SubIdx0 = 0; DestReg1 = TRI.getSubReg(DestReg, SubIdx1); @@ -2892,7 +2958,7 @@ void AArch64InstrInfo::loadRegFromStackSlot( case 4: if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) { Opc = AArch64::LDRWui; - if (TargetRegisterInfo::isVirtualRegister(DestReg)) + if (Register::isVirtualRegister(DestReg)) MF.getRegInfo().constrainRegClass(DestReg, &AArch64::GPR32RegClass); else 
assert(DestReg != AArch64::WSP); @@ -2902,7 +2968,7 @@ void AArch64InstrInfo::loadRegFromStackSlot( case 8: if (AArch64::GPR64allRegClass.hasSubClassEq(RC)) { Opc = AArch64::LDRXui; - if (TargetRegisterInfo::isVirtualRegister(DestReg)) + if (Register::isVirtualRegister(DestReg)) MF.getRegInfo().constrainRegClass(DestReg, &AArch64::GPR64RegClass); else assert(DestReg != AArch64::SP); @@ -2972,21 +3038,39 @@ void AArch64InstrInfo::loadRegFromStackSlot( MI.addMemOperand(MMO); } -void llvm::emitFrameOffset(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, const DebugLoc &DL, - unsigned DestReg, unsigned SrcReg, int Offset, - const TargetInstrInfo *TII, - MachineInstr::MIFlag Flag, bool SetNZCV, - bool NeedsWinCFI, bool *HasWinCFI) { - if (DestReg == SrcReg && Offset == 0) - return; - - assert((DestReg != AArch64::SP || Offset % 16 == 0) && - "SP increment/decrement not 16-byte aligned"); - - bool isSub = Offset < 0; - if (isSub) - Offset = -Offset; +// Helper function to emit a frame offset adjustment from a given +// pointer (SrcReg), stored into DestReg. This function is explicit +// in that it requires the opcode. +static void emitFrameOffsetAdj(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + const DebugLoc &DL, unsigned DestReg, + unsigned SrcReg, int64_t Offset, unsigned Opc, + const TargetInstrInfo *TII, + MachineInstr::MIFlag Flag, bool NeedsWinCFI, + bool *HasWinCFI) { + int Sign = 1; + unsigned MaxEncoding, ShiftSize; + switch (Opc) { + case AArch64::ADDXri: + case AArch64::ADDSXri: + case AArch64::SUBXri: + case AArch64::SUBSXri: + MaxEncoding = 0xfff; + ShiftSize = 12; + break; + case AArch64::ADDVL_XXI: + case AArch64::ADDPL_XXI: + MaxEncoding = 31; + ShiftSize = 0; + if (Offset < 0) { + MaxEncoding = 32; + Sign = -1; + Offset = -Offset; + } + break; + default: + llvm_unreachable("Unsupported opcode"); + } // FIXME: If the offset won't fit in 24-bits, compute the offset into a // scratch register. If DestReg is a virtual register, use it as the @@ -2999,65 +3083,94 @@ void llvm::emitFrameOffset(MachineBasicBlock &MBB, // of code. // assert(Offset < (1 << 24) && "unimplemented reg plus immediate"); - unsigned Opc; - if (SetNZCV) - Opc = isSub ? AArch64::SUBSXri : AArch64::ADDSXri; - else - Opc = isSub ? 
AArch64::SUBXri : AArch64::ADDXri; - const unsigned MaxEncoding = 0xfff; - const unsigned ShiftSize = 12; const unsigned MaxEncodableValue = MaxEncoding << ShiftSize; - while (((unsigned)Offset) >= (1 << ShiftSize)) { - unsigned ThisVal; - if (((unsigned)Offset) > MaxEncodableValue) { - ThisVal = MaxEncodableValue; - } else { - ThisVal = Offset & MaxEncodableValue; + do { + unsigned ThisVal = std::min(Offset, MaxEncodableValue); + unsigned LocalShiftSize = 0; + if (ThisVal > MaxEncoding) { + ThisVal = ThisVal >> ShiftSize; + LocalShiftSize = ShiftSize; } assert((ThisVal >> ShiftSize) <= MaxEncoding && "Encoding cannot handle value that big"); - BuildMI(MBB, MBBI, DL, TII->get(Opc), DestReg) - .addReg(SrcReg) - .addImm(ThisVal >> ShiftSize) - .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftSize)) - .setMIFlag(Flag); - - if (NeedsWinCFI && SrcReg == AArch64::SP && DestReg == AArch64::SP) { + auto MBI = BuildMI(MBB, MBBI, DL, TII->get(Opc), DestReg) + .addReg(SrcReg) + .addImm(Sign * (int)ThisVal); + if (ShiftSize) + MBI = MBI.addImm( + AArch64_AM::getShifterImm(AArch64_AM::LSL, LocalShiftSize)); + MBI = MBI.setMIFlag(Flag); + + if (NeedsWinCFI) { + assert(Sign == 1 && "SEH directives should always have a positive sign"); + int Imm = (int)(ThisVal << LocalShiftSize); + if ((DestReg == AArch64::FP && SrcReg == AArch64::SP) || + (SrcReg == AArch64::FP && DestReg == AArch64::SP)) { + if (HasWinCFI) + *HasWinCFI = true; + if (Imm == 0) + BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_SetFP)).setMIFlag(Flag); + else + BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_AddFP)) + .addImm(Imm) + .setMIFlag(Flag); + assert((Offset - Imm) == 0 && "Expected remaining offset to be zero to " + "emit a single SEH directive"); + } else if (DestReg == AArch64::SP) { + if (HasWinCFI) + *HasWinCFI = true; + assert(SrcReg == AArch64::SP && "Unexpected SrcReg for SEH_StackAlloc"); + BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_StackAlloc)) + .addImm(Imm) + .setMIFlag(Flag); + } if (HasWinCFI) *HasWinCFI = true; - BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_StackAlloc)) - .addImm(ThisVal) - .setMIFlag(Flag); } SrcReg = DestReg; - Offset -= ThisVal; - if (Offset == 0) - return; - } - BuildMI(MBB, MBBI, DL, TII->get(Opc), DestReg) - .addReg(SrcReg) - .addImm(Offset) - .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)) - .setMIFlag(Flag); + Offset -= ThisVal << LocalShiftSize; + } while (Offset); +} - if (NeedsWinCFI) { - if ((DestReg == AArch64::FP && SrcReg == AArch64::SP) || - (SrcReg == AArch64::FP && DestReg == AArch64::SP)) { - if (HasWinCFI) - *HasWinCFI = true; - if (Offset == 0) - BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_SetFP)). - setMIFlag(Flag); - else - BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_AddFP)). - addImm(Offset).setMIFlag(Flag); - } else if (DestReg == AArch64::SP) { - if (HasWinCFI) - *HasWinCFI = true; - BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_StackAlloc)). - addImm(Offset).setMIFlag(Flag); +void llvm::emitFrameOffset(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, const DebugLoc &DL, + unsigned DestReg, unsigned SrcReg, + StackOffset Offset, const TargetInstrInfo *TII, + MachineInstr::MIFlag Flag, bool SetNZCV, + bool NeedsWinCFI, bool *HasWinCFI) { + int64_t Bytes, NumPredicateVectors, NumDataVectors; + Offset.getForFrameOffset(Bytes, NumPredicateVectors, NumDataVectors); + + // First emit non-scalable frame offsets, or a simple 'mov'. 
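// Editorial sketch (not part of the patch): the splitting performed by the
// emitFrameOffsetAdj() loop above for the ADDXri/SUBXri forms, reduced to a
// standalone function. Each emitted immediate is at most 12 bits, optionally
// shifted left by 12, so e.g. an offset of 0x12345 is materialized as
// ADD #0x12, LSL #12 followed by ADD #0x345.
#include <algorithm>
#include <cstdint>
#include <utility>
#include <vector>

// Returns the (immediate, LSL shift) pairs the loop would emit.
static std::vector<std::pair<unsigned, unsigned>>
splitAddSubImmediate(uint64_t Offset) {
  const unsigned MaxEncoding = 0xfff, ShiftSize = 12;
  const uint64_t MaxEncodableValue = (uint64_t)MaxEncoding << ShiftSize;
  std::vector<std::pair<unsigned, unsigned>> Pieces;
  do {
    uint64_t ThisVal = std::min(Offset, MaxEncodableValue);
    unsigned LocalShiftSize = 0;
    if (ThisVal > MaxEncoding) {
      ThisVal >>= ShiftSize;   // keep only the top 12 bits of this chunk
      LocalShiftSize = ShiftSize;
    }
    Pieces.emplace_back((unsigned)ThisVal, LocalShiftSize);
    Offset -= ThisVal << LocalShiftSize;
  } while (Offset);
  return Pieces;
}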
+ if (Bytes || (!Offset && SrcReg != DestReg)) { + assert((DestReg != AArch64::SP || Bytes % 16 == 0) && + "SP increment/decrement not 16-byte aligned"); + unsigned Opc = SetNZCV ? AArch64::ADDSXri : AArch64::ADDXri; + if (Bytes < 0) { + Bytes = -Bytes; + Opc = SetNZCV ? AArch64::SUBSXri : AArch64::SUBXri; } + emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, Bytes, Opc, TII, Flag, + NeedsWinCFI, HasWinCFI); + SrcReg = DestReg; + } + + assert(!(SetNZCV && (NumPredicateVectors || NumDataVectors)) && + "SetNZCV not supported with SVE vectors"); + assert(!(NeedsWinCFI && (NumPredicateVectors || NumDataVectors)) && + "WinCFI not supported with SVE vectors"); + + if (NumDataVectors) { + emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, NumDataVectors, + AArch64::ADDVL_XXI, TII, Flag, NeedsWinCFI, nullptr); + SrcReg = DestReg; + } + + if (NumPredicateVectors) { + assert(DestReg != AArch64::SP && "Unaligned access to SP"); + emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, NumPredicateVectors, + AArch64::ADDPL_XXI, TII, Flag, NeedsWinCFI, nullptr); } } @@ -3079,15 +3192,13 @@ MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl( // // if (MI.isFullCopy()) { - unsigned DstReg = MI.getOperand(0).getReg(); - unsigned SrcReg = MI.getOperand(1).getReg(); - if (SrcReg == AArch64::SP && - TargetRegisterInfo::isVirtualRegister(DstReg)) { + Register DstReg = MI.getOperand(0).getReg(); + Register SrcReg = MI.getOperand(1).getReg(); + if (SrcReg == AArch64::SP && Register::isVirtualRegister(DstReg)) { MF.getRegInfo().constrainRegClass(DstReg, &AArch64::GPR64RegClass); return nullptr; } - if (DstReg == AArch64::SP && - TargetRegisterInfo::isVirtualRegister(SrcReg)) { + if (DstReg == AArch64::SP && Register::isVirtualRegister(SrcReg)) { MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR64RegClass); return nullptr; } @@ -3127,14 +3238,13 @@ MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl( MachineBasicBlock &MBB = *MI.getParent(); const MachineOperand &DstMO = MI.getOperand(0); const MachineOperand &SrcMO = MI.getOperand(1); - unsigned DstReg = DstMO.getReg(); - unsigned SrcReg = SrcMO.getReg(); + Register DstReg = DstMO.getReg(); + Register SrcReg = SrcMO.getReg(); // This is slightly expensive to compute for physical regs since // getMinimalPhysRegClass is slow. auto getRegClass = [&](unsigned Reg) { - return TargetRegisterInfo::isVirtualRegister(Reg) - ? MRI.getRegClass(Reg) - : TRI.getMinimalPhysRegClass(Reg); + return Register::isVirtualRegister(Reg) ? 
MRI.getRegClass(Reg) + : TRI.getMinimalPhysRegClass(Reg); }; if (DstMO.getSubReg() == 0 && SrcMO.getSubReg() == 0) { @@ -3159,8 +3269,7 @@ MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl( // // STRXui %xzr, %stack.0 // - if (IsSpill && DstMO.isUndef() && - TargetRegisterInfo::isPhysicalRegister(SrcReg)) { + if (IsSpill && DstMO.isUndef() && Register::isPhysicalRegister(SrcReg)) { assert(SrcMO.getSubReg() == 0 && "Unexpected subreg on physical register"); const TargetRegisterClass *SpillRC; @@ -3243,10 +3352,23 @@ MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl( return nullptr; } -int llvm::isAArch64FrameOffsetLegal(const MachineInstr &MI, int &Offset, +static bool isSVEScaledImmInstruction(unsigned Opcode) { + switch (Opcode) { + case AArch64::LDR_ZXI: + case AArch64::STR_ZXI: + case AArch64::LDR_PXI: + case AArch64::STR_PXI: + return true; + default: + return false; + } +} + +int llvm::isAArch64FrameOffsetLegal(const MachineInstr &MI, + StackOffset &SOffset, bool *OutUseUnscaledOp, unsigned *OutUnscaledOp, - int *EmittableOffset) { + int64_t *EmittableOffset) { // Set output values in case of early exit. if (EmittableOffset) *EmittableOffset = 0; @@ -3285,6 +3407,10 @@ int llvm::isAArch64FrameOffsetLegal(const MachineInstr &MI, int &Offset, llvm_unreachable("unhandled opcode in isAArch64FrameOffsetLegal"); // Construct the complete offset. + bool IsMulVL = isSVEScaledImmInstruction(MI.getOpcode()); + int64_t Offset = + IsMulVL ? (SOffset.getScalableBytes()) : (SOffset.getBytes()); + const MachineOperand &ImmOpnd = MI.getOperand(AArch64InstrInfo::getLoadStoreImmIdx(MI.getOpcode())); Offset += ImmOpnd.getImm() * Scale; @@ -3304,7 +3430,7 @@ int llvm::isAArch64FrameOffsetLegal(const MachineInstr &MI, int &Offset, "Cannot have remainder when using unscaled op"); assert(MinOff < MaxOff && "Unexpected Min/Max offsets"); - int NewOffset = Offset / Scale; + int64_t NewOffset = Offset / Scale; if (MinOff <= NewOffset && NewOffset <= MaxOff) Offset = Remainder; else { @@ -3319,27 +3445,33 @@ int llvm::isAArch64FrameOffsetLegal(const MachineInstr &MI, int &Offset, if (OutUnscaledOp && UnscaledOp) *OutUnscaledOp = *UnscaledOp; + if (IsMulVL) + SOffset = StackOffset(Offset, MVT::nxv1i8) + + StackOffset(SOffset.getBytes(), MVT::i8); + else + SOffset = StackOffset(Offset, MVT::i8) + + StackOffset(SOffset.getScalableBytes(), MVT::nxv1i8); return AArch64FrameOffsetCanUpdate | - (Offset == 0 ? AArch64FrameOffsetIsLegal : 0); + (SOffset ? 
0 : AArch64FrameOffsetIsLegal); } bool llvm::rewriteAArch64FrameIndex(MachineInstr &MI, unsigned FrameRegIdx, - unsigned FrameReg, int &Offset, + unsigned FrameReg, StackOffset &Offset, const AArch64InstrInfo *TII) { unsigned Opcode = MI.getOpcode(); unsigned ImmIdx = FrameRegIdx + 1; if (Opcode == AArch64::ADDSXri || Opcode == AArch64::ADDXri) { - Offset += MI.getOperand(ImmIdx).getImm(); + Offset += StackOffset(MI.getOperand(ImmIdx).getImm(), MVT::i8); emitFrameOffset(*MI.getParent(), MI, MI.getDebugLoc(), MI.getOperand(0).getReg(), FrameReg, Offset, TII, MachineInstr::NoFlags, (Opcode == AArch64::ADDSXri)); MI.eraseFromParent(); - Offset = 0; + Offset = StackOffset(); return true; } - int NewOffset; + int64_t NewOffset; unsigned UnscaledOp; bool UseUnscaledOp; int Status = isAArch64FrameOffsetLegal(MI, Offset, &UseUnscaledOp, @@ -3352,7 +3484,7 @@ bool llvm::rewriteAArch64FrameIndex(MachineInstr &MI, unsigned FrameRegIdx, MI.setDesc(TII->get(UnscaledOp)); MI.getOperand(ImmIdx).ChangeToImmediate(NewOffset); - return Offset == 0; + return !Offset; } return false; @@ -3428,13 +3560,19 @@ static bool isCombineInstrCandidateFP(const MachineInstr &Inst) { switch (Inst.getOpcode()) { default: break; + case AArch64::FADDHrr: case AArch64::FADDSrr: case AArch64::FADDDrr: + case AArch64::FADDv4f16: + case AArch64::FADDv8f16: case AArch64::FADDv2f32: case AArch64::FADDv2f64: case AArch64::FADDv4f32: + case AArch64::FSUBHrr: case AArch64::FSUBSrr: case AArch64::FSUBDrr: + case AArch64::FSUBv4f16: + case AArch64::FSUBv8f16: case AArch64::FSUBv2f32: case AArch64::FSUBv2f64: case AArch64::FSUBv4f32: @@ -3459,7 +3597,7 @@ static bool canCombine(MachineBasicBlock &MBB, MachineOperand &MO, MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); MachineInstr *MI = nullptr; - if (MO.isReg() && TargetRegisterInfo::isVirtualRegister(MO.getReg())) + if (MO.isReg() && Register::isVirtualRegister(MO.getReg())) MI = MRI.getUniqueVRegDef(MO.getReg()); // And it needs to be in the trace (otherwise, it won't have a depth). 
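// Editorial sketch (not part of the patch): the shape of the refactor applied
// in the getMaddPatterns()/getFMAPatterns() hunks below. The repeated
// "if (canCombine...) { Patterns.push_back(...); Found = true; }" blocks are
// folded into one local lambda, so every opcode case shrinks to a single call.
// The types and the canCombine callback here are stand-ins, not LLVM APIs.
#include <vector>

enum class Pattern { MulAddOp1, MulAddOp2 };

static bool collectPatterns(bool (*canCombine)(int Opcode, int Operand),
                            std::vector<Pattern> &Patterns) {
  bool Found = false;
  auto setFound = [&](int Opcode, int Operand, Pattern P) {
    if (canCombine(Opcode, Operand)) {
      Patterns.push_back(P);
      Found = true;
    }
  };
  setFound(/*Opcode=*/1, /*Operand=*/1, Pattern::MulAddOp1);
  setFound(/*Opcode=*/1, /*Operand=*/2, Pattern::MulAddOp2);
  return Found;
}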
if (!MI || MI->getParent() != &MBB || (unsigned)MI->getOpcode() != CombineOpc) @@ -3544,86 +3682,48 @@ static bool getMaddPatterns(MachineInstr &Root, Opc = NewOpc; } + auto setFound = [&](int Opcode, int Operand, unsigned ZeroReg, + MachineCombinerPattern Pattern) { + if (canCombineWithMUL(MBB, Root.getOperand(Operand), Opcode, ZeroReg)) { + Patterns.push_back(Pattern); + Found = true; + } + }; + + typedef MachineCombinerPattern MCP; + switch (Opc) { default: break; case AArch64::ADDWrr: assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() && "ADDWrr does not have register operands"); - if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDWrrr, - AArch64::WZR)) { - Patterns.push_back(MachineCombinerPattern::MULADDW_OP1); - Found = true; - } - if (canCombineWithMUL(MBB, Root.getOperand(2), AArch64::MADDWrrr, - AArch64::WZR)) { - Patterns.push_back(MachineCombinerPattern::MULADDW_OP2); - Found = true; - } + setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULADDW_OP1); + setFound(AArch64::MADDWrrr, 2, AArch64::WZR, MCP::MULADDW_OP2); break; case AArch64::ADDXrr: - if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDXrrr, - AArch64::XZR)) { - Patterns.push_back(MachineCombinerPattern::MULADDX_OP1); - Found = true; - } - if (canCombineWithMUL(MBB, Root.getOperand(2), AArch64::MADDXrrr, - AArch64::XZR)) { - Patterns.push_back(MachineCombinerPattern::MULADDX_OP2); - Found = true; - } + setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULADDX_OP1); + setFound(AArch64::MADDXrrr, 2, AArch64::XZR, MCP::MULADDX_OP2); break; case AArch64::SUBWrr: - if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDWrrr, - AArch64::WZR)) { - Patterns.push_back(MachineCombinerPattern::MULSUBW_OP1); - Found = true; - } - if (canCombineWithMUL(MBB, Root.getOperand(2), AArch64::MADDWrrr, - AArch64::WZR)) { - Patterns.push_back(MachineCombinerPattern::MULSUBW_OP2); - Found = true; - } + setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULSUBW_OP1); + setFound(AArch64::MADDWrrr, 2, AArch64::WZR, MCP::MULSUBW_OP2); break; case AArch64::SUBXrr: - if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDXrrr, - AArch64::XZR)) { - Patterns.push_back(MachineCombinerPattern::MULSUBX_OP1); - Found = true; - } - if (canCombineWithMUL(MBB, Root.getOperand(2), AArch64::MADDXrrr, - AArch64::XZR)) { - Patterns.push_back(MachineCombinerPattern::MULSUBX_OP2); - Found = true; - } + setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULSUBX_OP1); + setFound(AArch64::MADDXrrr, 2, AArch64::XZR, MCP::MULSUBX_OP2); break; case AArch64::ADDWri: - if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDWrrr, - AArch64::WZR)) { - Patterns.push_back(MachineCombinerPattern::MULADDWI_OP1); - Found = true; - } + setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULADDWI_OP1); break; case AArch64::ADDXri: - if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDXrrr, - AArch64::XZR)) { - Patterns.push_back(MachineCombinerPattern::MULADDXI_OP1); - Found = true; - } + setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULADDXI_OP1); break; case AArch64::SUBWri: - if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDWrrr, - AArch64::WZR)) { - Patterns.push_back(MachineCombinerPattern::MULSUBWI_OP1); - Found = true; - } + setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULSUBWI_OP1); break; case AArch64::SUBXri: - if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDXrrr, - AArch64::XZR)) { - Patterns.push_back(MachineCombinerPattern::MULSUBXI_OP1); - Found = true; - } + 
setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULSUBXI_OP1); break; } return Found; @@ -3640,204 +3740,135 @@ static bool getFMAPatterns(MachineInstr &Root, MachineBasicBlock &MBB = *Root.getParent(); bool Found = false; + auto Match = [&](int Opcode, int Operand, + MachineCombinerPattern Pattern) -> bool { + if (canCombineWithFMUL(MBB, Root.getOperand(Operand), Opcode)) { + Patterns.push_back(Pattern); + return true; + } + return false; + }; + + typedef MachineCombinerPattern MCP; + switch (Root.getOpcode()) { default: assert(false && "Unsupported FP instruction in combiner\n"); break; + case AArch64::FADDHrr: + assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() && + "FADDHrr does not have register operands"); + + Found = Match(AArch64::FMULHrr, 1, MCP::FMULADDH_OP1); + Found |= Match(AArch64::FMULHrr, 2, MCP::FMULADDH_OP2); + break; case AArch64::FADDSrr: assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() && - "FADDWrr does not have register operands"); - if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FMULSrr)) { - Patterns.push_back(MachineCombinerPattern::FMULADDS_OP1); - Found = true; - } else if (canCombineWithFMUL(MBB, Root.getOperand(1), - AArch64::FMULv1i32_indexed)) { - Patterns.push_back(MachineCombinerPattern::FMLAv1i32_indexed_OP1); - Found = true; - } - if (canCombineWithFMUL(MBB, Root.getOperand(2), AArch64::FMULSrr)) { - Patterns.push_back(MachineCombinerPattern::FMULADDS_OP2); - Found = true; - } else if (canCombineWithFMUL(MBB, Root.getOperand(2), - AArch64::FMULv1i32_indexed)) { - Patterns.push_back(MachineCombinerPattern::FMLAv1i32_indexed_OP2); - Found = true; - } + "FADDSrr does not have register operands"); + + Found |= Match(AArch64::FMULSrr, 1, MCP::FMULADDS_OP1) || + Match(AArch64::FMULv1i32_indexed, 1, MCP::FMLAv1i32_indexed_OP1); + + Found |= Match(AArch64::FMULSrr, 2, MCP::FMULADDS_OP2) || + Match(AArch64::FMULv1i32_indexed, 2, MCP::FMLAv1i32_indexed_OP2); break; case AArch64::FADDDrr: - if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FMULDrr)) { - Patterns.push_back(MachineCombinerPattern::FMULADDD_OP1); - Found = true; - } else if (canCombineWithFMUL(MBB, Root.getOperand(1), - AArch64::FMULv1i64_indexed)) { - Patterns.push_back(MachineCombinerPattern::FMLAv1i64_indexed_OP1); - Found = true; - } - if (canCombineWithFMUL(MBB, Root.getOperand(2), AArch64::FMULDrr)) { - Patterns.push_back(MachineCombinerPattern::FMULADDD_OP2); - Found = true; - } else if (canCombineWithFMUL(MBB, Root.getOperand(2), - AArch64::FMULv1i64_indexed)) { - Patterns.push_back(MachineCombinerPattern::FMLAv1i64_indexed_OP2); - Found = true; - } + Found |= Match(AArch64::FMULDrr, 1, MCP::FMULADDD_OP1) || + Match(AArch64::FMULv1i64_indexed, 1, MCP::FMLAv1i64_indexed_OP1); + + Found |= Match(AArch64::FMULDrr, 2, MCP::FMULADDD_OP2) || + Match(AArch64::FMULv1i64_indexed, 2, MCP::FMLAv1i64_indexed_OP2); + break; + case AArch64::FADDv4f16: + Found |= Match(AArch64::FMULv4i16_indexed, 1, MCP::FMLAv4i16_indexed_OP1) || + Match(AArch64::FMULv4f16, 1, MCP::FMLAv4f16_OP1); + + Found |= Match(AArch64::FMULv4i16_indexed, 2, MCP::FMLAv4i16_indexed_OP2) || + Match(AArch64::FMULv4f16, 2, MCP::FMLAv4f16_OP2); + break; + case AArch64::FADDv8f16: + Found |= Match(AArch64::FMULv8i16_indexed, 1, MCP::FMLAv8i16_indexed_OP1) || + Match(AArch64::FMULv8f16, 1, MCP::FMLAv8f16_OP1); + + Found |= Match(AArch64::FMULv8i16_indexed, 2, MCP::FMLAv8i16_indexed_OP2) || + Match(AArch64::FMULv8f16, 2, MCP::FMLAv8f16_OP2); break; case AArch64::FADDv2f32: - if 
(canCombineWithFMUL(MBB, Root.getOperand(1), - AArch64::FMULv2i32_indexed)) { - Patterns.push_back(MachineCombinerPattern::FMLAv2i32_indexed_OP1); - Found = true; - } else if (canCombineWithFMUL(MBB, Root.getOperand(1), - AArch64::FMULv2f32)) { - Patterns.push_back(MachineCombinerPattern::FMLAv2f32_OP1); - Found = true; - } - if (canCombineWithFMUL(MBB, Root.getOperand(2), - AArch64::FMULv2i32_indexed)) { - Patterns.push_back(MachineCombinerPattern::FMLAv2i32_indexed_OP2); - Found = true; - } else if (canCombineWithFMUL(MBB, Root.getOperand(2), - AArch64::FMULv2f32)) { - Patterns.push_back(MachineCombinerPattern::FMLAv2f32_OP2); - Found = true; - } + Found |= Match(AArch64::FMULv2i32_indexed, 1, MCP::FMLAv2i32_indexed_OP1) || + Match(AArch64::FMULv2f32, 1, MCP::FMLAv2f32_OP1); + + Found |= Match(AArch64::FMULv2i32_indexed, 2, MCP::FMLAv2i32_indexed_OP2) || + Match(AArch64::FMULv2f32, 2, MCP::FMLAv2f32_OP2); break; case AArch64::FADDv2f64: - if (canCombineWithFMUL(MBB, Root.getOperand(1), - AArch64::FMULv2i64_indexed)) { - Patterns.push_back(MachineCombinerPattern::FMLAv2i64_indexed_OP1); - Found = true; - } else if (canCombineWithFMUL(MBB, Root.getOperand(1), - AArch64::FMULv2f64)) { - Patterns.push_back(MachineCombinerPattern::FMLAv2f64_OP1); - Found = true; - } - if (canCombineWithFMUL(MBB, Root.getOperand(2), - AArch64::FMULv2i64_indexed)) { - Patterns.push_back(MachineCombinerPattern::FMLAv2i64_indexed_OP2); - Found = true; - } else if (canCombineWithFMUL(MBB, Root.getOperand(2), - AArch64::FMULv2f64)) { - Patterns.push_back(MachineCombinerPattern::FMLAv2f64_OP2); - Found = true; - } + Found |= Match(AArch64::FMULv2i64_indexed, 1, MCP::FMLAv2i64_indexed_OP1) || + Match(AArch64::FMULv2f64, 1, MCP::FMLAv2f64_OP1); + + Found |= Match(AArch64::FMULv2i64_indexed, 2, MCP::FMLAv2i64_indexed_OP2) || + Match(AArch64::FMULv2f64, 2, MCP::FMLAv2f64_OP2); break; case AArch64::FADDv4f32: - if (canCombineWithFMUL(MBB, Root.getOperand(1), - AArch64::FMULv4i32_indexed)) { - Patterns.push_back(MachineCombinerPattern::FMLAv4i32_indexed_OP1); - Found = true; - } else if (canCombineWithFMUL(MBB, Root.getOperand(1), - AArch64::FMULv4f32)) { - Patterns.push_back(MachineCombinerPattern::FMLAv4f32_OP1); - Found = true; - } - if (canCombineWithFMUL(MBB, Root.getOperand(2), - AArch64::FMULv4i32_indexed)) { - Patterns.push_back(MachineCombinerPattern::FMLAv4i32_indexed_OP2); - Found = true; - } else if (canCombineWithFMUL(MBB, Root.getOperand(2), - AArch64::FMULv4f32)) { - Patterns.push_back(MachineCombinerPattern::FMLAv4f32_OP2); - Found = true; - } - break; + Found |= Match(AArch64::FMULv4i32_indexed, 1, MCP::FMLAv4i32_indexed_OP1) || + Match(AArch64::FMULv4f32, 1, MCP::FMLAv4f32_OP1); + Found |= Match(AArch64::FMULv4i32_indexed, 2, MCP::FMLAv4i32_indexed_OP2) || + Match(AArch64::FMULv4f32, 2, MCP::FMLAv4f32_OP2); + break; + case AArch64::FSUBHrr: + Found = Match(AArch64::FMULHrr, 1, MCP::FMULSUBH_OP1); + Found |= Match(AArch64::FMULHrr, 2, MCP::FMULSUBH_OP2); + Found |= Match(AArch64::FNMULHrr, 1, MCP::FNMULSUBH_OP1); + break; case AArch64::FSUBSrr: - if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FMULSrr)) { - Patterns.push_back(MachineCombinerPattern::FMULSUBS_OP1); - Found = true; - } - if (canCombineWithFMUL(MBB, Root.getOperand(2), AArch64::FMULSrr)) { - Patterns.push_back(MachineCombinerPattern::FMULSUBS_OP2); - Found = true; - } else if (canCombineWithFMUL(MBB, Root.getOperand(2), - AArch64::FMULv1i32_indexed)) { - Patterns.push_back(MachineCombinerPattern::FMLSv1i32_indexed_OP2); - Found = 
true; - } - if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FNMULSrr)) { - Patterns.push_back(MachineCombinerPattern::FNMULSUBS_OP1); - Found = true; - } + Found = Match(AArch64::FMULSrr, 1, MCP::FMULSUBS_OP1); + + Found |= Match(AArch64::FMULSrr, 2, MCP::FMULSUBS_OP2) || + Match(AArch64::FMULv1i32_indexed, 2, MCP::FMLSv1i32_indexed_OP2); + + Found |= Match(AArch64::FNMULSrr, 1, MCP::FNMULSUBS_OP1); break; case AArch64::FSUBDrr: - if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FMULDrr)) { - Patterns.push_back(MachineCombinerPattern::FMULSUBD_OP1); - Found = true; - } - if (canCombineWithFMUL(MBB, Root.getOperand(2), AArch64::FMULDrr)) { - Patterns.push_back(MachineCombinerPattern::FMULSUBD_OP2); - Found = true; - } else if (canCombineWithFMUL(MBB, Root.getOperand(2), - AArch64::FMULv1i64_indexed)) { - Patterns.push_back(MachineCombinerPattern::FMLSv1i64_indexed_OP2); - Found = true; - } - if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FNMULDrr)) { - Patterns.push_back(MachineCombinerPattern::FNMULSUBD_OP1); - Found = true; - } + Found = Match(AArch64::FMULDrr, 1, MCP::FMULSUBD_OP1); + + Found |= Match(AArch64::FMULDrr, 2, MCP::FMULSUBD_OP2) || + Match(AArch64::FMULv1i64_indexed, 2, MCP::FMLSv1i64_indexed_OP2); + + Found |= Match(AArch64::FNMULDrr, 1, MCP::FNMULSUBD_OP1); + break; + case AArch64::FSUBv4f16: + Found |= Match(AArch64::FMULv4i16_indexed, 2, MCP::FMLSv4i16_indexed_OP2) || + Match(AArch64::FMULv4f16, 2, MCP::FMLSv4f16_OP2); + + Found |= Match(AArch64::FMULv4i16_indexed, 1, MCP::FMLSv4i16_indexed_OP1) || + Match(AArch64::FMULv4f16, 1, MCP::FMLSv4f16_OP1); + break; + case AArch64::FSUBv8f16: + Found |= Match(AArch64::FMULv8i16_indexed, 2, MCP::FMLSv8i16_indexed_OP2) || + Match(AArch64::FMULv8f16, 2, MCP::FMLSv8f16_OP2); + + Found |= Match(AArch64::FMULv8i16_indexed, 1, MCP::FMLSv8i16_indexed_OP1) || + Match(AArch64::FMULv8f16, 1, MCP::FMLSv8f16_OP1); break; case AArch64::FSUBv2f32: - if (canCombineWithFMUL(MBB, Root.getOperand(2), - AArch64::FMULv2i32_indexed)) { - Patterns.push_back(MachineCombinerPattern::FMLSv2i32_indexed_OP2); - Found = true; - } else if (canCombineWithFMUL(MBB, Root.getOperand(2), - AArch64::FMULv2f32)) { - Patterns.push_back(MachineCombinerPattern::FMLSv2f32_OP2); - Found = true; - } - if (canCombineWithFMUL(MBB, Root.getOperand(1), - AArch64::FMULv2i32_indexed)) { - Patterns.push_back(MachineCombinerPattern::FMLSv2i32_indexed_OP1); - Found = true; - } else if (canCombineWithFMUL(MBB, Root.getOperand(1), - AArch64::FMULv2f32)) { - Patterns.push_back(MachineCombinerPattern::FMLSv2f32_OP1); - Found = true; - } + Found |= Match(AArch64::FMULv2i32_indexed, 2, MCP::FMLSv2i32_indexed_OP2) || + Match(AArch64::FMULv2f32, 2, MCP::FMLSv2f32_OP2); + + Found |= Match(AArch64::FMULv2i32_indexed, 1, MCP::FMLSv2i32_indexed_OP1) || + Match(AArch64::FMULv2f32, 1, MCP::FMLSv2f32_OP1); break; case AArch64::FSUBv2f64: - if (canCombineWithFMUL(MBB, Root.getOperand(2), - AArch64::FMULv2i64_indexed)) { - Patterns.push_back(MachineCombinerPattern::FMLSv2i64_indexed_OP2); - Found = true; - } else if (canCombineWithFMUL(MBB, Root.getOperand(2), - AArch64::FMULv2f64)) { - Patterns.push_back(MachineCombinerPattern::FMLSv2f64_OP2); - Found = true; - } - if (canCombineWithFMUL(MBB, Root.getOperand(1), - AArch64::FMULv2i64_indexed)) { - Patterns.push_back(MachineCombinerPattern::FMLSv2i64_indexed_OP1); - Found = true; - } else if (canCombineWithFMUL(MBB, Root.getOperand(1), - AArch64::FMULv2f64)) { - Patterns.push_back(MachineCombinerPattern::FMLSv2f64_OP1); - 
Found = true; - } + Found |= Match(AArch64::FMULv2i64_indexed, 2, MCP::FMLSv2i64_indexed_OP2) || + Match(AArch64::FMULv2f64, 2, MCP::FMLSv2f64_OP2); + + Found |= Match(AArch64::FMULv2i64_indexed, 1, MCP::FMLSv2i64_indexed_OP1) || + Match(AArch64::FMULv2f64, 1, MCP::FMLSv2f64_OP1); break; case AArch64::FSUBv4f32: - if (canCombineWithFMUL(MBB, Root.getOperand(2), - AArch64::FMULv4i32_indexed)) { - Patterns.push_back(MachineCombinerPattern::FMLSv4i32_indexed_OP2); - Found = true; - } else if (canCombineWithFMUL(MBB, Root.getOperand(2), - AArch64::FMULv4f32)) { - Patterns.push_back(MachineCombinerPattern::FMLSv4f32_OP2); - Found = true; - } - if (canCombineWithFMUL(MBB, Root.getOperand(1), - AArch64::FMULv4i32_indexed)) { - Patterns.push_back(MachineCombinerPattern::FMLSv4i32_indexed_OP1); - Found = true; - } else if (canCombineWithFMUL(MBB, Root.getOperand(1), - AArch64::FMULv4f32)) { - Patterns.push_back(MachineCombinerPattern::FMLSv4f32_OP1); - Found = true; - } + Found |= Match(AArch64::FMULv4i32_indexed, 2, MCP::FMLSv4i32_indexed_OP2) || + Match(AArch64::FMULv4f32, 2, MCP::FMLSv4f32_OP2); + + Found |= Match(AArch64::FMULv4i32_indexed, 1, MCP::FMLSv4i32_indexed_OP1) || + Match(AArch64::FMULv4f32, 1, MCP::FMLSv4f32_OP1); break; } return Found; @@ -3851,6 +3882,10 @@ bool AArch64InstrInfo::isThroughputPattern( switch (Pattern) { default: break; + case MachineCombinerPattern::FMULADDH_OP1: + case MachineCombinerPattern::FMULADDH_OP2: + case MachineCombinerPattern::FMULSUBH_OP1: + case MachineCombinerPattern::FMULSUBH_OP2: case MachineCombinerPattern::FMULADDS_OP1: case MachineCombinerPattern::FMULADDS_OP2: case MachineCombinerPattern::FMULSUBS_OP1: @@ -3859,12 +3894,21 @@ bool AArch64InstrInfo::isThroughputPattern( case MachineCombinerPattern::FMULADDD_OP2: case MachineCombinerPattern::FMULSUBD_OP1: case MachineCombinerPattern::FMULSUBD_OP2: + case MachineCombinerPattern::FNMULSUBH_OP1: case MachineCombinerPattern::FNMULSUBS_OP1: case MachineCombinerPattern::FNMULSUBD_OP1: + case MachineCombinerPattern::FMLAv4i16_indexed_OP1: + case MachineCombinerPattern::FMLAv4i16_indexed_OP2: + case MachineCombinerPattern::FMLAv8i16_indexed_OP1: + case MachineCombinerPattern::FMLAv8i16_indexed_OP2: case MachineCombinerPattern::FMLAv1i32_indexed_OP1: case MachineCombinerPattern::FMLAv1i32_indexed_OP2: case MachineCombinerPattern::FMLAv1i64_indexed_OP1: case MachineCombinerPattern::FMLAv1i64_indexed_OP2: + case MachineCombinerPattern::FMLAv4f16_OP2: + case MachineCombinerPattern::FMLAv4f16_OP1: + case MachineCombinerPattern::FMLAv8f16_OP1: + case MachineCombinerPattern::FMLAv8f16_OP2: case MachineCombinerPattern::FMLAv2f32_OP2: case MachineCombinerPattern::FMLAv2f32_OP1: case MachineCombinerPattern::FMLAv2f64_OP1: @@ -3877,10 +3921,18 @@ bool AArch64InstrInfo::isThroughputPattern( case MachineCombinerPattern::FMLAv4f32_OP2: case MachineCombinerPattern::FMLAv4i32_indexed_OP1: case MachineCombinerPattern::FMLAv4i32_indexed_OP2: + case MachineCombinerPattern::FMLSv4i16_indexed_OP1: + case MachineCombinerPattern::FMLSv4i16_indexed_OP2: + case MachineCombinerPattern::FMLSv8i16_indexed_OP1: + case MachineCombinerPattern::FMLSv8i16_indexed_OP2: case MachineCombinerPattern::FMLSv1i32_indexed_OP2: case MachineCombinerPattern::FMLSv1i64_indexed_OP2: case MachineCombinerPattern::FMLSv2i32_indexed_OP2: case MachineCombinerPattern::FMLSv2i64_indexed_OP2: + case MachineCombinerPattern::FMLSv4f16_OP1: + case MachineCombinerPattern::FMLSv4f16_OP2: + case MachineCombinerPattern::FMLSv8f16_OP1: + case 
MachineCombinerPattern::FMLSv8f16_OP2: case MachineCombinerPattern::FMLSv2f32_OP2: case MachineCombinerPattern::FMLSv2f64_OP2: case MachineCombinerPattern::FMLSv4i32_indexed_OP2: @@ -3933,15 +3985,15 @@ genFusedMultiply(MachineFunction &MF, MachineRegisterInfo &MRI, SmallVectorImpl &InsInstrs, unsigned IdxMulOpd, unsigned MaddOpc, const TargetRegisterClass *RC, FMAInstKind kind = FMAInstKind::Default, - const unsigned *ReplacedAddend = nullptr) { + const Register *ReplacedAddend = nullptr) { assert(IdxMulOpd == 1 || IdxMulOpd == 2); unsigned IdxOtherOpd = IdxMulOpd == 1 ? 2 : 1; MachineInstr *MUL = MRI.getUniqueVRegDef(Root.getOperand(IdxMulOpd).getReg()); - unsigned ResultReg = Root.getOperand(0).getReg(); - unsigned SrcReg0 = MUL->getOperand(1).getReg(); + Register ResultReg = Root.getOperand(0).getReg(); + Register SrcReg0 = MUL->getOperand(1).getReg(); bool Src0IsKill = MUL->getOperand(1).isKill(); - unsigned SrcReg1 = MUL->getOperand(2).getReg(); + Register SrcReg1 = MUL->getOperand(2).getReg(); bool Src1IsKill = MUL->getOperand(2).isKill(); unsigned SrcReg2; @@ -3955,13 +4007,13 @@ genFusedMultiply(MachineFunction &MF, MachineRegisterInfo &MRI, Src2IsKill = Root.getOperand(IdxOtherOpd).isKill(); } - if (TargetRegisterInfo::isVirtualRegister(ResultReg)) + if (Register::isVirtualRegister(ResultReg)) MRI.constrainRegClass(ResultReg, RC); - if (TargetRegisterInfo::isVirtualRegister(SrcReg0)) + if (Register::isVirtualRegister(SrcReg0)) MRI.constrainRegClass(SrcReg0, RC); - if (TargetRegisterInfo::isVirtualRegister(SrcReg1)) + if (Register::isVirtualRegister(SrcReg1)) MRI.constrainRegClass(SrcReg1, RC); - if (TargetRegisterInfo::isVirtualRegister(SrcReg2)) + if (Register::isVirtualRegister(SrcReg2)) MRI.constrainRegClass(SrcReg2, RC); MachineInstrBuilder MIB; @@ -4015,19 +4067,19 @@ static MachineInstr *genMaddR(MachineFunction &MF, MachineRegisterInfo &MRI, assert(IdxMulOpd == 1 || IdxMulOpd == 2); MachineInstr *MUL = MRI.getUniqueVRegDef(Root.getOperand(IdxMulOpd).getReg()); - unsigned ResultReg = Root.getOperand(0).getReg(); - unsigned SrcReg0 = MUL->getOperand(1).getReg(); + Register ResultReg = Root.getOperand(0).getReg(); + Register SrcReg0 = MUL->getOperand(1).getReg(); bool Src0IsKill = MUL->getOperand(1).isKill(); - unsigned SrcReg1 = MUL->getOperand(2).getReg(); + Register SrcReg1 = MUL->getOperand(2).getReg(); bool Src1IsKill = MUL->getOperand(2).isKill(); - if (TargetRegisterInfo::isVirtualRegister(ResultReg)) + if (Register::isVirtualRegister(ResultReg)) MRI.constrainRegClass(ResultReg, RC); - if (TargetRegisterInfo::isVirtualRegister(SrcReg0)) + if (Register::isVirtualRegister(SrcReg0)) MRI.constrainRegClass(SrcReg0, RC); - if (TargetRegisterInfo::isVirtualRegister(SrcReg1)) + if (Register::isVirtualRegister(SrcReg1)) MRI.constrainRegClass(SrcReg1, RC); - if (TargetRegisterInfo::isVirtualRegister(VR)) + if (Register::isVirtualRegister(VR)) MRI.constrainRegClass(VR, RC); MachineInstrBuilder MIB = @@ -4116,7 +4168,7 @@ void AArch64InstrInfo::genAlternativeCodeSequence( Opc = AArch64::MADDXrrr; RC = &AArch64::GPR64RegClass; } - unsigned NewVR = MRI.createVirtualRegister(OrrRC); + Register NewVR = MRI.createVirtualRegister(OrrRC); uint64_t Imm = Root.getOperand(2).getImm(); if (Root.getOperand(3).isImm()) { @@ -4158,7 +4210,7 @@ void AArch64InstrInfo::genAlternativeCodeSequence( Opc = AArch64::MADDXrrr; RC = &AArch64::GPR64RegClass; } - unsigned NewVR = MRI.createVirtualRegister(SubRC); + Register NewVR = MRI.createVirtualRegister(SubRC); // SUB NewVR, 0, C MachineInstrBuilder 
MIB1 = BuildMI(MF, Root.getDebugLoc(), TII->get(SubOpc), NewVR) @@ -4208,7 +4260,7 @@ void AArch64InstrInfo::genAlternativeCodeSequence( Opc = AArch64::MADDXrrr; RC = &AArch64::GPR64RegClass; } - unsigned NewVR = MRI.createVirtualRegister(OrrRC); + Register NewVR = MRI.createVirtualRegister(OrrRC); uint64_t Imm = Root.getOperand(2).getImm(); if (Root.getOperand(3).isImm()) { unsigned Val = Root.getOperand(3).getImm(); @@ -4228,34 +4280,35 @@ void AArch64InstrInfo::genAlternativeCodeSequence( break; } // Floating Point Support + case MachineCombinerPattern::FMULADDH_OP1: + Opc = AArch64::FMADDHrrr; + RC = &AArch64::FPR16RegClass; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); + break; case MachineCombinerPattern::FMULADDS_OP1: + Opc = AArch64::FMADDSrrr; + RC = &AArch64::FPR32RegClass; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); + break; case MachineCombinerPattern::FMULADDD_OP1: - // MUL I=A,B,0 - // ADD R,I,C - // ==> MADD R,A,B,C - // --- Create(MADD); - if (Pattern == MachineCombinerPattern::FMULADDS_OP1) { - Opc = AArch64::FMADDSrrr; - RC = &AArch64::FPR32RegClass; - } else { - Opc = AArch64::FMADDDrrr; - RC = &AArch64::FPR64RegClass; - } + Opc = AArch64::FMADDDrrr; + RC = &AArch64::FPR64RegClass; MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); break; + + case MachineCombinerPattern::FMULADDH_OP2: + Opc = AArch64::FMADDHrrr; + RC = &AArch64::FPR16RegClass; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); + break; case MachineCombinerPattern::FMULADDS_OP2: + Opc = AArch64::FMADDSrrr; + RC = &AArch64::FPR32RegClass; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); + break; case MachineCombinerPattern::FMULADDD_OP2: - // FMUL I=A,B,0 - // FADD R,C,I - // ==> FMADD R,A,B,C - // --- Create(FMADD); - if (Pattern == MachineCombinerPattern::FMULADDS_OP2) { - Opc = AArch64::FMADDSrrr; - RC = &AArch64::FPR32RegClass; - } else { - Opc = AArch64::FMADDDrrr; - RC = &AArch64::FPR64RegClass; - } + Opc = AArch64::FMADDDrrr; + RC = &AArch64::FPR64RegClass; MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); break; @@ -4285,6 +4338,31 @@ void AArch64InstrInfo::genAlternativeCodeSequence( FMAInstKind::Indexed); break; + case MachineCombinerPattern::FMLAv4i16_indexed_OP1: + RC = &AArch64::FPR64RegClass; + Opc = AArch64::FMLAv4i16_indexed; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, + FMAInstKind::Indexed); + break; + case MachineCombinerPattern::FMLAv4f16_OP1: + RC = &AArch64::FPR64RegClass; + Opc = AArch64::FMLAv4f16; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, + FMAInstKind::Accumulator); + break; + case MachineCombinerPattern::FMLAv4i16_indexed_OP2: + RC = &AArch64::FPR64RegClass; + Opc = AArch64::FMLAv4i16_indexed; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, + FMAInstKind::Indexed); + break; + case MachineCombinerPattern::FMLAv4f16_OP2: + RC = &AArch64::FPR64RegClass; + Opc = AArch64::FMLAv4f16; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, + FMAInstKind::Accumulator); + break; + case MachineCombinerPattern::FMLAv2i32_indexed_OP1: case MachineCombinerPattern::FMLAv2f32_OP1: RC = &AArch64::FPR64RegClass; @@ -4312,6 +4390,31 @@ void AArch64InstrInfo::genAlternativeCodeSequence( } break; + case MachineCombinerPattern::FMLAv8i16_indexed_OP1: + RC = &AArch64::FPR128RegClass; + Opc = AArch64::FMLAv8i16_indexed; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, + 
FMAInstKind::Indexed); + break; + case MachineCombinerPattern::FMLAv8f16_OP1: + RC = &AArch64::FPR128RegClass; + Opc = AArch64::FMLAv8f16; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, + FMAInstKind::Accumulator); + break; + case MachineCombinerPattern::FMLAv8i16_indexed_OP2: + RC = &AArch64::FPR128RegClass; + Opc = AArch64::FMLAv8i16_indexed; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, + FMAInstKind::Indexed); + break; + case MachineCombinerPattern::FMLAv8f16_OP2: + RC = &AArch64::FPR128RegClass; + Opc = AArch64::FMLAv8f16; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, + FMAInstKind::Accumulator); + break; + case MachineCombinerPattern::FMLAv2i64_indexed_OP1: case MachineCombinerPattern::FMLAv2f64_OP1: RC = &AArch64::FPR128RegClass; @@ -4367,56 +4470,53 @@ void AArch64InstrInfo::genAlternativeCodeSequence( } break; + case MachineCombinerPattern::FMULSUBH_OP1: + Opc = AArch64::FNMSUBHrrr; + RC = &AArch64::FPR16RegClass; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); + break; case MachineCombinerPattern::FMULSUBS_OP1: - case MachineCombinerPattern::FMULSUBD_OP1: { - // FMUL I=A,B,0 - // FSUB R,I,C - // ==> FNMSUB R,A,B,C // = -C + A*B - // --- Create(FNMSUB); - if (Pattern == MachineCombinerPattern::FMULSUBS_OP1) { - Opc = AArch64::FNMSUBSrrr; - RC = &AArch64::FPR32RegClass; - } else { - Opc = AArch64::FNMSUBDrrr; - RC = &AArch64::FPR64RegClass; - } + Opc = AArch64::FNMSUBSrrr; + RC = &AArch64::FPR32RegClass; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); + break; + case MachineCombinerPattern::FMULSUBD_OP1: + Opc = AArch64::FNMSUBDrrr; + RC = &AArch64::FPR64RegClass; MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); break; - } + case MachineCombinerPattern::FNMULSUBH_OP1: + Opc = AArch64::FNMADDHrrr; + RC = &AArch64::FPR16RegClass; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); + break; case MachineCombinerPattern::FNMULSUBS_OP1: - case MachineCombinerPattern::FNMULSUBD_OP1: { - // FNMUL I=A,B,0 - // FSUB R,I,C - // ==> FNMADD R,A,B,C // = -A*B - C - // --- Create(FNMADD); - if (Pattern == MachineCombinerPattern::FNMULSUBS_OP1) { - Opc = AArch64::FNMADDSrrr; - RC = &AArch64::FPR32RegClass; - } else { - Opc = AArch64::FNMADDDrrr; - RC = &AArch64::FPR64RegClass; - } + Opc = AArch64::FNMADDSrrr; + RC = &AArch64::FPR32RegClass; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); + break; + case MachineCombinerPattern::FNMULSUBD_OP1: + Opc = AArch64::FNMADDDrrr; + RC = &AArch64::FPR64RegClass; MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); break; - } + case MachineCombinerPattern::FMULSUBH_OP2: + Opc = AArch64::FMSUBHrrr; + RC = &AArch64::FPR16RegClass; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); + break; case MachineCombinerPattern::FMULSUBS_OP2: - case MachineCombinerPattern::FMULSUBD_OP2: { - // FMUL I=A,B,0 - // FSUB R,C,I - // ==> FMSUB R,A,B,C (computes C - A*B) - // --- Create(FMSUB); - if (Pattern == MachineCombinerPattern::FMULSUBS_OP2) { - Opc = AArch64::FMSUBSrrr; - RC = &AArch64::FPR32RegClass; - } else { - Opc = AArch64::FMSUBDrrr; - RC = &AArch64::FPR64RegClass; - } + Opc = AArch64::FMSUBSrrr; + RC = &AArch64::FPR32RegClass; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); + break; + case MachineCombinerPattern::FMULSUBD_OP2: + Opc = AArch64::FMSUBDrrr; + RC = &AArch64::FPR64RegClass; MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 
break; - } case MachineCombinerPattern::FMLSv1i32_indexed_OP2: Opc = AArch64::FMLSv1i32_indexed; @@ -4432,6 +4532,39 @@ void AArch64InstrInfo::genAlternativeCodeSequence( FMAInstKind::Indexed); break; + case MachineCombinerPattern::FMLSv4f16_OP1: + case MachineCombinerPattern::FMLSv4i16_indexed_OP1: { + RC = &AArch64::FPR64RegClass; + Register NewVR = MRI.createVirtualRegister(RC); + MachineInstrBuilder MIB1 = + BuildMI(MF, Root.getDebugLoc(), TII->get(AArch64::FNEGv4f16), NewVR) + .add(Root.getOperand(2)); + InsInstrs.push_back(MIB1); + InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0)); + if (Pattern == MachineCombinerPattern::FMLSv4f16_OP1) { + Opc = AArch64::FMLAv4f16; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, + FMAInstKind::Accumulator, &NewVR); + } else { + Opc = AArch64::FMLAv4i16_indexed; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, + FMAInstKind::Indexed, &NewVR); + } + break; + } + case MachineCombinerPattern::FMLSv4f16_OP2: + RC = &AArch64::FPR64RegClass; + Opc = AArch64::FMLSv4f16; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, + FMAInstKind::Accumulator); + break; + case MachineCombinerPattern::FMLSv4i16_indexed_OP2: + RC = &AArch64::FPR64RegClass; + Opc = AArch64::FMLSv4i16_indexed; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, + FMAInstKind::Indexed); + break; + case MachineCombinerPattern::FMLSv2f32_OP2: case MachineCombinerPattern::FMLSv2i32_indexed_OP2: RC = &AArch64::FPR64RegClass; @@ -4446,6 +4579,39 @@ void AArch64InstrInfo::genAlternativeCodeSequence( } break; + case MachineCombinerPattern::FMLSv8f16_OP1: + case MachineCombinerPattern::FMLSv8i16_indexed_OP1: { + RC = &AArch64::FPR128RegClass; + Register NewVR = MRI.createVirtualRegister(RC); + MachineInstrBuilder MIB1 = + BuildMI(MF, Root.getDebugLoc(), TII->get(AArch64::FNEGv8f16), NewVR) + .add(Root.getOperand(2)); + InsInstrs.push_back(MIB1); + InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0)); + if (Pattern == MachineCombinerPattern::FMLSv8f16_OP1) { + Opc = AArch64::FMLAv8f16; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, + FMAInstKind::Accumulator, &NewVR); + } else { + Opc = AArch64::FMLAv8i16_indexed; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, + FMAInstKind::Indexed, &NewVR); + } + break; + } + case MachineCombinerPattern::FMLSv8f16_OP2: + RC = &AArch64::FPR128RegClass; + Opc = AArch64::FMLSv8f16; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, + FMAInstKind::Accumulator); + break; + case MachineCombinerPattern::FMLSv8i16_indexed_OP2: + RC = &AArch64::FPR128RegClass; + Opc = AArch64::FMLSv8i16_indexed; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, + FMAInstKind::Indexed); + break; + case MachineCombinerPattern::FMLSv2f64_OP2: case MachineCombinerPattern::FMLSv2i64_indexed_OP2: RC = &AArch64::FPR128RegClass; @@ -4476,7 +4642,7 @@ void AArch64InstrInfo::genAlternativeCodeSequence( case MachineCombinerPattern::FMLSv2f32_OP1: case MachineCombinerPattern::FMLSv2i32_indexed_OP1: { RC = &AArch64::FPR64RegClass; - unsigned NewVR = MRI.createVirtualRegister(RC); + Register NewVR = MRI.createVirtualRegister(RC); MachineInstrBuilder MIB1 = BuildMI(MF, Root.getDebugLoc(), TII->get(AArch64::FNEGv2f32), NewVR) .add(Root.getOperand(2)); @@ -4496,7 +4662,7 @@ void AArch64InstrInfo::genAlternativeCodeSequence( case MachineCombinerPattern::FMLSv4f32_OP1: case MachineCombinerPattern::FMLSv4i32_indexed_OP1: { RC = &AArch64::FPR128RegClass; - 
unsigned NewVR = MRI.createVirtualRegister(RC); + Register NewVR = MRI.createVirtualRegister(RC); MachineInstrBuilder MIB1 = BuildMI(MF, Root.getDebugLoc(), TII->get(AArch64::FNEGv4f32), NewVR) .add(Root.getOperand(2)); @@ -4516,7 +4682,7 @@ void AArch64InstrInfo::genAlternativeCodeSequence( case MachineCombinerPattern::FMLSv2f64_OP1: case MachineCombinerPattern::FMLSv2i64_indexed_OP1: { RC = &AArch64::FPR128RegClass; - unsigned NewVR = MRI.createVirtualRegister(RC); + Register NewVR = MRI.createVirtualRegister(RC); MachineInstrBuilder MIB1 = BuildMI(MF, Root.getDebugLoc(), TII->get(AArch64::FNEGv2f64), NewVR) .add(Root.getOperand(2)); @@ -4617,15 +4783,15 @@ bool AArch64InstrInfo::optimizeCondBranch(MachineInstr &MI) const { MachineBasicBlock *MBB = MI.getParent(); MachineFunction *MF = MBB->getParent(); MachineRegisterInfo *MRI = &MF->getRegInfo(); - unsigned VReg = MI.getOperand(0).getReg(); - if (!TargetRegisterInfo::isVirtualRegister(VReg)) + Register VReg = MI.getOperand(0).getReg(); + if (!Register::isVirtualRegister(VReg)) return false; MachineInstr *DefMI = MRI->getVRegDef(VReg); // Look through COPY instructions to find definition. while (DefMI->isCopy()) { - unsigned CopyVReg = DefMI->getOperand(1).getReg(); + Register CopyVReg = DefMI->getOperand(1).getReg(); if (!MRI->hasOneNonDBGUse(CopyVReg)) return false; if (!MRI->hasOneDef(CopyVReg)) @@ -4653,8 +4819,8 @@ bool AArch64InstrInfo::optimizeCondBranch(MachineInstr &MI) const { return false; MachineOperand &MO = DefMI->getOperand(1); - unsigned NewReg = MO.getReg(); - if (!TargetRegisterInfo::isVirtualRegister(NewReg)) + Register NewReg = MO.getReg(); + if (!Register::isVirtualRegister(NewReg)) return false; assert(!MRI->def_empty(NewReg) && "Register must be defined."); @@ -4737,9 +4903,13 @@ AArch64InstrInfo::getSerializableBitmaskMachineOperandTargetFlags() const { static const std::pair TargetFlags[] = { {MO_COFFSTUB, "aarch64-coffstub"}, - {MO_GOT, "aarch64-got"}, {MO_NC, "aarch64-nc"}, - {MO_S, "aarch64-s"}, {MO_TLS, "aarch64-tls"}, - {MO_DLLIMPORT, "aarch64-dllimport"}}; + {MO_GOT, "aarch64-got"}, + {MO_NC, "aarch64-nc"}, + {MO_S, "aarch64-s"}, + {MO_TLS, "aarch64-tls"}, + {MO_DLLIMPORT, "aarch64-dllimport"}, + {MO_PREL, "aarch64-prel"}, + {MO_TAGGED, "aarch64-tagged"}}; return makeArrayRef(TargetFlags); } diff --git a/lib/Target/AArch64/AArch64InstrInfo.h b/lib/Target/AArch64/AArch64InstrInfo.h index 7be4daba7dc4..1688045e4fb8 100644 --- a/lib/Target/AArch64/AArch64InstrInfo.h +++ b/lib/Target/AArch64/AArch64InstrInfo.h @@ -15,6 +15,7 @@ #include "AArch64.h" #include "AArch64RegisterInfo.h" +#include "AArch64StackOffset.h" #include "llvm/ADT/Optional.h" #include "llvm/CodeGen/MachineCombinerPattern.h" #include "llvm/CodeGen/TargetInstrInfo.h" @@ -55,8 +56,7 @@ public: bool areMemAccessesTriviallyDisjoint(const MachineInstr &MIa, - const MachineInstr &MIb, - AliasAnalysis *AA = nullptr) const override; + const MachineInstr &MIb) const override; unsigned isLoadFromStackSlot(const MachineInstr &MI, int &FrameIndex) const override; @@ -299,7 +299,7 @@ private: /// if necessary, to be replaced by the scavenger at the end of PEI. 
void emitFrameOffset(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, unsigned DestReg, unsigned SrcReg, - int Offset, const TargetInstrInfo *TII, + StackOffset Offset, const TargetInstrInfo *TII, MachineInstr::MIFlag = MachineInstr::NoFlags, bool SetNZCV = false, bool NeedsWinCFI = false, bool *HasWinCFI = nullptr); @@ -308,7 +308,7 @@ void emitFrameOffset(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, /// FP. Return false if the offset could not be handled directly in MI, and /// return the left-over portion by reference. bool rewriteAArch64FrameIndex(MachineInstr &MI, unsigned FrameRegIdx, - unsigned FrameReg, int &Offset, + unsigned FrameReg, StackOffset &Offset, const AArch64InstrInfo *TII); /// Use to report the frame offset status in isAArch64FrameOffsetLegal. @@ -332,10 +332,10 @@ enum AArch64FrameOffsetStatus { /// If set, @p EmittableOffset contains the amount that can be set in @p MI /// (possibly with @p OutUnscaledOp if OutUseUnscaledOp is true) and that /// is a legal offset. -int isAArch64FrameOffsetLegal(const MachineInstr &MI, int &Offset, +int isAArch64FrameOffsetLegal(const MachineInstr &MI, StackOffset &Offset, bool *OutUseUnscaledOp = nullptr, unsigned *OutUnscaledOp = nullptr, - int *EmittableOffset = nullptr); + int64_t *EmittableOffset = nullptr); static inline bool isUncondBranchOpcode(int Opc) { return Opc == AArch64::B; } diff --git a/lib/Target/AArch64/AArch64InstrInfo.td b/lib/Target/AArch64/AArch64InstrInfo.td index eed53f36d574..1981bd5d3bf0 100644 --- a/lib/Target/AArch64/AArch64InstrInfo.td +++ b/lib/Target/AArch64/AArch64InstrInfo.td @@ -62,6 +62,9 @@ def HasAM : Predicate<"Subtarget->hasAM()">, def HasSEL2 : Predicate<"Subtarget->hasSEL2()">, AssemblerPredicate<"FeatureSEL2", "sel2">; +def HasPMU : Predicate<"Subtarget->hasPMU()">, + AssemblerPredicate<"FeaturePMU", "pmu">; + def HasTLB_RMI : Predicate<"Subtarget->hasTLB_RMI()">, AssemblerPredicate<"FeatureTLB_RMI", "tlb-rmi">; @@ -116,7 +119,7 @@ def HasSVE2SM4 : Predicate<"Subtarget->hasSVE2SM4()">, def HasSVE2SHA3 : Predicate<"Subtarget->hasSVE2SHA3()">, AssemblerPredicate<"FeatureSVE2SHA3", "sve2-sha3">; def HasSVE2BitPerm : Predicate<"Subtarget->hasSVE2BitPerm()">, - AssemblerPredicate<"FeatureSVE2BitPerm", "bitperm">; + AssemblerPredicate<"FeatureSVE2BitPerm", "sve2-bitperm">; def HasRCPC : Predicate<"Subtarget->hasRCPC()">, AssemblerPredicate<"FeatureRCPC", "rcpc">; def HasAltNZCV : Predicate<"Subtarget->hasAlternativeNZCV()">, @@ -133,6 +136,12 @@ def HasBTI : Predicate<"Subtarget->hasBTI()">, AssemblerPredicate<"FeatureBranchTargetId", "bti">; def HasMTE : Predicate<"Subtarget->hasMTE()">, AssemblerPredicate<"FeatureMTE", "mte">; +def HasTME : Predicate<"Subtarget->hasTME()">, + AssemblerPredicate<"FeatureTME", "tme">; +def HasETE : Predicate<"Subtarget->hasETE()">, + AssemblerPredicate<"FeatureETE", "ete">; +def HasTRBE : Predicate<"Subtarget->hasTRBE()">, + AssemblerPredicate<"FeatureTRBE", "trbe">; def IsLE : Predicate<"Subtarget->isLittleEndian()">; def IsBE : Predicate<"!Subtarget->isLittleEndian()">; def IsWindows : Predicate<"Subtarget->isTargetWindows()">; @@ -415,6 +424,14 @@ def AArch64stzg : SDNode<"AArch64ISD::STZG", SDT_AArch64SETTAG, [SDNPHasChain, S def AArch64st2g : SDNode<"AArch64ISD::ST2G", SDT_AArch64SETTAG, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; def AArch64stz2g : SDNode<"AArch64ISD::STZ2G", SDT_AArch64SETTAG, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; +def SDT_AArch64unpk : SDTypeProfile<1, 1, [ + SDTCisInt<0>, SDTCisInt<1>, 
SDTCisOpSmallerThanOp<1, 0> +]>; +def AArch64sunpkhi : SDNode<"AArch64ISD::SUNPKHI", SDT_AArch64unpk>; +def AArch64sunpklo : SDNode<"AArch64ISD::SUNPKLO", SDT_AArch64unpk>; +def AArch64uunpkhi : SDNode<"AArch64ISD::UUNPKHI", SDT_AArch64unpk>; +def AArch64uunpklo : SDNode<"AArch64ISD::UUNPKLO", SDT_AArch64unpk>; + //===----------------------------------------------------------------------===// //===----------------------------------------------------------------------===// @@ -431,6 +448,13 @@ let RecomputePerFunction = 1 in { def UseBTI : Predicate<[{ MF->getFunction().hasFnAttribute("branch-target-enforcement") }]>; def NotUseBTI : Predicate<[{ !MF->getFunction().hasFnAttribute("branch-target-enforcement") }]>; + + // Toggles patterns which aren't beneficial in GlobalISel when we aren't + // optimizing. This allows us to selectively use patterns without impacting + // SelectionDAG's behaviour. + // FIXME: One day there will probably be a nicer way to check for this, but + // today is not that day. + def OptimizedGISelOrOtherSelector : Predicate<"!MF->getFunction().hasOptNone() || MF->getProperties().hasProperty(MachineFunctionProperties::Property::FailedISel) || !MF->getProperties().hasProperty(MachineFunctionProperties::Property::Legalized)">; } include "AArch64InstrFormats.td" @@ -785,7 +809,11 @@ def MOVbaseTLS : Pseudo<(outs GPR64:$dst), (ins), let Uses = [ X9 ], Defs = [ X16, X17, LR, NZCV ] in { def HWASAN_CHECK_MEMACCESS : Pseudo< (outs), (ins GPR64noip:$ptr, i32imm:$accessinfo), - [(int_hwasan_check_memaccess X9, GPR64noip:$ptr, (i32 imm:$accessinfo))]>, + [(int_hwasan_check_memaccess X9, GPR64noip:$ptr, (i32 timm:$accessinfo))]>, + Sched<[]>; +def HWASAN_CHECK_MEMACCESS_SHORTGRANULES : Pseudo< + (outs), (ins GPR64noip:$ptr, i32imm:$accessinfo), + [(int_hwasan_check_memaccess_shortgranules X9, GPR64noip:$ptr, (i32 timm:$accessinfo))]>, Sched<[]>; } @@ -804,6 +832,23 @@ def : InstAlias<"sys $op1, $Cn, $Cm, $op2", (SYSxt imm0_7:$op1, sys_cr_op:$Cn, sys_cr_op:$Cm, imm0_7:$op2, XZR)>; + +let Predicates = [HasTME] in { + +def TSTART : TMSystemI<0b0000, "tstart", + [(set GPR64:$Rt, (int_aarch64_tstart))]>; + +def TCOMMIT : TMSystemINoOperand<0b0000, "tcommit", [(int_aarch64_tcommit)]>; + +def TCANCEL : TMSystemException<0b011, "tcancel", + [(int_aarch64_tcancel i64_imm0_65535:$imm)]>; + +def TTEST : TMSystemI<0b0001, "ttest", [(set GPR64:$Rt, (int_aarch64_ttest))]> { + let mayLoad = 0; + let mayStore = 0; +} +} // HasTME + //===----------------------------------------------------------------------===// // Move immediate instructions. //===----------------------------------------------------------------------===// @@ -815,37 +860,37 @@ let PostEncoderMethod = "fixMOVZ" in defm MOVZ : MoveImmediate<0b10, "movz">; // First group of aliases covers an implicit "lsl #0". 
-def : InstAlias<"movk $dst, $imm", (MOVKWi GPR32:$dst, imm0_65535:$imm, 0), 0>; -def : InstAlias<"movk $dst, $imm", (MOVKXi GPR64:$dst, imm0_65535:$imm, 0), 0>; -def : InstAlias<"movn $dst, $imm", (MOVNWi GPR32:$dst, imm0_65535:$imm, 0)>; -def : InstAlias<"movn $dst, $imm", (MOVNXi GPR64:$dst, imm0_65535:$imm, 0)>; -def : InstAlias<"movz $dst, $imm", (MOVZWi GPR32:$dst, imm0_65535:$imm, 0)>; -def : InstAlias<"movz $dst, $imm", (MOVZXi GPR64:$dst, imm0_65535:$imm, 0)>; +def : InstAlias<"movk $dst, $imm", (MOVKWi GPR32:$dst, i32_imm0_65535:$imm, 0), 0>; +def : InstAlias<"movk $dst, $imm", (MOVKXi GPR64:$dst, i32_imm0_65535:$imm, 0), 0>; +def : InstAlias<"movn $dst, $imm", (MOVNWi GPR32:$dst, i32_imm0_65535:$imm, 0)>; +def : InstAlias<"movn $dst, $imm", (MOVNXi GPR64:$dst, i32_imm0_65535:$imm, 0)>; +def : InstAlias<"movz $dst, $imm", (MOVZWi GPR32:$dst, i32_imm0_65535:$imm, 0)>; +def : InstAlias<"movz $dst, $imm", (MOVZXi GPR64:$dst, i32_imm0_65535:$imm, 0)>; // Next, we have various ELF relocations with the ":XYZ_g0:sym" syntax. -def : InstAlias<"movz $Rd, $sym", (MOVZXi GPR64:$Rd, movz_symbol_g3:$sym, 48)>; -def : InstAlias<"movz $Rd, $sym", (MOVZXi GPR64:$Rd, movz_symbol_g2:$sym, 32)>; -def : InstAlias<"movz $Rd, $sym", (MOVZXi GPR64:$Rd, movz_symbol_g1:$sym, 16)>; -def : InstAlias<"movz $Rd, $sym", (MOVZXi GPR64:$Rd, movz_symbol_g0:$sym, 0)>; +def : InstAlias<"movz $Rd, $sym", (MOVZXi GPR64:$Rd, movw_symbol_g3:$sym, 48)>; +def : InstAlias<"movz $Rd, $sym", (MOVZXi GPR64:$Rd, movw_symbol_g2:$sym, 32)>; +def : InstAlias<"movz $Rd, $sym", (MOVZXi GPR64:$Rd, movw_symbol_g1:$sym, 16)>; +def : InstAlias<"movz $Rd, $sym", (MOVZXi GPR64:$Rd, movw_symbol_g0:$sym, 0)>; -def : InstAlias<"movn $Rd, $sym", (MOVNXi GPR64:$Rd, movz_symbol_g3:$sym, 48)>; -def : InstAlias<"movn $Rd, $sym", (MOVNXi GPR64:$Rd, movz_symbol_g2:$sym, 32)>; -def : InstAlias<"movn $Rd, $sym", (MOVNXi GPR64:$Rd, movz_symbol_g1:$sym, 16)>; -def : InstAlias<"movn $Rd, $sym", (MOVNXi GPR64:$Rd, movz_symbol_g0:$sym, 0)>; +def : InstAlias<"movn $Rd, $sym", (MOVNXi GPR64:$Rd, movw_symbol_g3:$sym, 48)>; +def : InstAlias<"movn $Rd, $sym", (MOVNXi GPR64:$Rd, movw_symbol_g2:$sym, 32)>; +def : InstAlias<"movn $Rd, $sym", (MOVNXi GPR64:$Rd, movw_symbol_g1:$sym, 16)>; +def : InstAlias<"movn $Rd, $sym", (MOVNXi GPR64:$Rd, movw_symbol_g0:$sym, 0)>; -def : InstAlias<"movk $Rd, $sym", (MOVKXi GPR64:$Rd, movk_symbol_g3:$sym, 48), 0>; -def : InstAlias<"movk $Rd, $sym", (MOVKXi GPR64:$Rd, movk_symbol_g2:$sym, 32), 0>; -def : InstAlias<"movk $Rd, $sym", (MOVKXi GPR64:$Rd, movk_symbol_g1:$sym, 16), 0>; -def : InstAlias<"movk $Rd, $sym", (MOVKXi GPR64:$Rd, movk_symbol_g0:$sym, 0), 0>; +def : InstAlias<"movk $Rd, $sym", (MOVKXi GPR64:$Rd, movw_symbol_g3:$sym, 48), 0>; +def : InstAlias<"movk $Rd, $sym", (MOVKXi GPR64:$Rd, movw_symbol_g2:$sym, 32), 0>; +def : InstAlias<"movk $Rd, $sym", (MOVKXi GPR64:$Rd, movw_symbol_g1:$sym, 16), 0>; +def : InstAlias<"movk $Rd, $sym", (MOVKXi GPR64:$Rd, movw_symbol_g0:$sym, 0), 0>; -def : InstAlias<"movz $Rd, $sym", (MOVZWi GPR32:$Rd, movz_symbol_g1:$sym, 16)>; -def : InstAlias<"movz $Rd, $sym", (MOVZWi GPR32:$Rd, movz_symbol_g0:$sym, 0)>; +def : InstAlias<"movz $Rd, $sym", (MOVZWi GPR32:$Rd, movw_symbol_g1:$sym, 16)>; +def : InstAlias<"movz $Rd, $sym", (MOVZWi GPR32:$Rd, movw_symbol_g0:$sym, 0)>; -def : InstAlias<"movn $Rd, $sym", (MOVNWi GPR32:$Rd, movz_symbol_g1:$sym, 16)>; -def : InstAlias<"movn $Rd, $sym", (MOVNWi GPR32:$Rd, movz_symbol_g0:$sym, 0)>; +def : InstAlias<"movn $Rd, $sym", (MOVNWi GPR32:$Rd, 
movw_symbol_g1:$sym, 16)>; +def : InstAlias<"movn $Rd, $sym", (MOVNWi GPR32:$Rd, movw_symbol_g0:$sym, 0)>; -def : InstAlias<"movk $Rd, $sym", (MOVKWi GPR32:$Rd, movk_symbol_g1:$sym, 16), 0>; -def : InstAlias<"movk $Rd, $sym", (MOVKWi GPR32:$Rd, movk_symbol_g0:$sym, 0), 0>; +def : InstAlias<"movk $Rd, $sym", (MOVKWi GPR32:$Rd, movw_symbol_g1:$sym, 16), 0>; +def : InstAlias<"movk $Rd, $sym", (MOVKWi GPR32:$Rd, movw_symbol_g0:$sym, 0), 0>; // Final group of aliases covers true "mov $Rd, $imm" cases. multiclass movw_mov_alias, GISDNodeXFormEquiv; +let Predicates = [OptimizedGISelOrOtherSelector] in { +// The SUBREG_TO_REG isn't eliminated at -O0, which can result in pointless +// copies. def : Pat<(i64 i64imm_32bit:$src), (SUBREG_TO_REG (i64 0), (MOVi32imm (trunc_imm imm:$src)), sub_32)>; +} // Materialize FP constants via MOVi32imm/MOVi64imm (MachO large code model). def bitcast_fpimm_to_i32 : SDNodeXForm; let AddedComplexity = 1 in { -def : Pat<(sub GPR32sp:$R2, arith_extended_reg32:$R3), - (SUBSWrx GPR32sp:$R2, arith_extended_reg32:$R3)>; -def : Pat<(sub GPR64sp:$R2, arith_extended_reg32to64:$R3), - (SUBSXrx GPR64sp:$R2, arith_extended_reg32to64:$R3)>; +def : Pat<(sub GPR32sp:$R2, arith_extended_reg32_i32:$R3), + (SUBSWrx GPR32sp:$R2, arith_extended_reg32_i32:$R3)>; +def : Pat<(sub GPR64sp:$R2, arith_extended_reg32to64_i64:$R3), + (SUBSXrx GPR64sp:$R2, arith_extended_reg32to64_i64:$R3)>; } // Because of the immediate format for add/sub-imm instructions, the @@ -2165,8 +2214,8 @@ def : InstAlias<"prfm $Rt, [$Rn]", (PRFMui prfop:$Rt, GPR64sp:$Rn, 0)>; def alignedglobal : PatLeaf<(iPTR iPTR:$label), [{ if (auto *G = dyn_cast(N)) { const DataLayout &DL = MF->getDataLayout(); - unsigned Align = G->getGlobal()->getPointerAlignment(DL); - return Align >= 4 && G->getOffset() % 4 == 0; + MaybeAlign Align = G->getGlobal()->getPointerAlignment(DL); + return Align && *Align >= 4 && G->getOffset() % 4 == 0; } if (auto *C = dyn_cast(N)) return C->getAlignment() >= 4 && C->getOffset() % 4 == 0; @@ -3281,20 +3330,37 @@ defm FNMSUB : ThreeOperandFPData<1, 1, "fnmsub", // N.b. FMSUB etc have the accumulator at the *end* of (outs), unlike // the NEON variant. + +// Here we handle first -(a + b*c) for FNMADD: + +let Predicates = [HasNEON, HasFullFP16] in +def : Pat<(f16 (fma (fneg FPR16:$Rn), FPR16:$Rm, FPR16:$Ra)), + (FMSUBHrrr FPR16:$Rn, FPR16:$Rm, FPR16:$Ra)>; + def : Pat<(f32 (fma (fneg FPR32:$Rn), FPR32:$Rm, FPR32:$Ra)), (FMSUBSrrr FPR32:$Rn, FPR32:$Rm, FPR32:$Ra)>; def : Pat<(f64 (fma (fneg FPR64:$Rn), FPR64:$Rm, FPR64:$Ra)), (FMSUBDrrr FPR64:$Rn, FPR64:$Rm, FPR64:$Ra)>; -// We handled -(a + b*c) for FNMADD above, now it's time for "(-a) + (-b)*c" and -// "(-a) + b*(-c)". 
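// Compact summary of the three scalar fneg/fma blocks in this hunk (the f16
// patterns added here mirror the existing f32/f64 ones, differing only in the
// register class):
//   fma(-a,  b,  c) =  c - a*b  ==>  FMSUB
//   fma(-a,  b, -c) = -c - a*b  ==>  FNMADD
//   fma( a, -b, -c) = -c - a*b  ==>  FNMADD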
+// Now it's time for "(-a) + (-b)*c" + +let Predicates = [HasNEON, HasFullFP16] in +def : Pat<(f16 (fma (fneg FPR16:$Rn), FPR16:$Rm, (fneg FPR16:$Ra))), + (FNMADDHrrr FPR16:$Rn, FPR16:$Rm, FPR16:$Ra)>; + def : Pat<(f32 (fma (fneg FPR32:$Rn), FPR32:$Rm, (fneg FPR32:$Ra))), (FNMADDSrrr FPR32:$Rn, FPR32:$Rm, FPR32:$Ra)>; def : Pat<(f64 (fma (fneg FPR64:$Rn), FPR64:$Rm, (fneg FPR64:$Ra))), (FNMADDDrrr FPR64:$Rn, FPR64:$Rm, FPR64:$Ra)>; +// And here "(-a) + b*(-c)" + +let Predicates = [HasNEON, HasFullFP16] in +def : Pat<(f16 (fma FPR16:$Rn, (fneg FPR16:$Rm), (fneg FPR16:$Ra))), + (FNMADDHrrr FPR16:$Rn, FPR16:$Rm, FPR16:$Ra)>; + def : Pat<(f32 (fma FPR32:$Rn, (fneg FPR32:$Rm), (fneg FPR32:$Ra))), (FNMADDSrrr FPR32:$Rn, FPR32:$Rm, FPR32:$Ra)>; @@ -6939,5 +7005,124 @@ def : Pat<(AArch64tcret texternalsym:$dst, (i32 timm:$FPDiff)), def MOVMCSym : Pseudo<(outs GPR64:$dst), (ins i64imm:$sym), []>, Sched<[]>; def : Pat<(i64 (AArch64LocalRecover mcsym:$sym)), (MOVMCSym mcsym:$sym)>; +// Extracting lane zero is a special case where we can just use a plain +// EXTRACT_SUBREG instruction, which will become FMOV. This is easier for the +// rest of the compiler, especially the register allocator and copy propagation, +// to reason about, so is preferred when it's possible to use it. +let AddedComplexity = 10 in { + def : Pat<(i64 (extractelt (v2i64 V128:$V), (i64 0))), (EXTRACT_SUBREG V128:$V, dsub)>; + def : Pat<(i32 (extractelt (v4i32 V128:$V), (i64 0))), (EXTRACT_SUBREG V128:$V, ssub)>; + def : Pat<(i32 (extractelt (v2i32 V64:$V), (i64 0))), (EXTRACT_SUBREG V64:$V, ssub)>; +} + +// dot_v4i8 +class mul_v4i8 : + PatFrag<(ops node:$Rn, node:$Rm, node:$offset), + (mul (ldop (add node:$Rn, node:$offset)), + (ldop (add node:$Rm, node:$offset)))>; +class mulz_v4i8 : + PatFrag<(ops node:$Rn, node:$Rm), + (mul (ldop node:$Rn), (ldop node:$Rm))>; + +def load_v4i8 : + OutPatFrag<(ops node:$R), + (INSERT_SUBREG + (v2i32 (IMPLICIT_DEF)), + (i32 (COPY_TO_REGCLASS (LDRWui node:$R, (i64 0)), FPR32)), + ssub)>; + +class dot_v4i8 : + Pat<(i32 (add (mul_v4i8 GPR64sp:$Rn, GPR64sp:$Rm, (i64 3)), + (add (mul_v4i8 GPR64sp:$Rn, GPR64sp:$Rm, (i64 2)), + (add (mul_v4i8 GPR64sp:$Rn, GPR64sp:$Rm, (i64 1)), + (mulz_v4i8 GPR64sp:$Rn, GPR64sp:$Rm))))), + (EXTRACT_SUBREG (i64 (DOT (DUPv2i32gpr WZR), + (load_v4i8 GPR64sp:$Rn), + (load_v4i8 GPR64sp:$Rm))), + sub_32)>, Requires<[HasDotProd]>; + +// dot_v8i8 +class ee_v8i8 : + PatFrag<(ops node:$V, node:$K), + (v4i16 (extract_subvector (v8i16 (extend node:$V)), node:$K))>; + +class mul_v8i8 : + PatFrag<(ops node:$M, node:$N, node:$K), + (mulop (v4i16 (ee_v8i8 node:$M, node:$K)), + (v4i16 (ee_v8i8 node:$N, node:$K)))>; + +class idot_v8i8 : + PatFrag<(ops node:$M, node:$N), + (i32 (extractelt + (v4i32 (AArch64uaddv + (add (mul_v8i8 node:$M, node:$N, (i64 0)), + (mul_v8i8 node:$M, node:$N, (i64 4))))), + (i64 0)))>; + +// vaddv_[su]32 is special; -> ADDP Vd.2S,Vn.2S,Vm.2S; return Vd.s[0];Vn==Vm +def VADDV_32 : OutPatFrag<(ops node:$R), (ADDPv2i32 node:$R, node:$R)>; + +class odot_v8i8 : + OutPatFrag<(ops node:$Vm, node:$Vn), + (EXTRACT_SUBREG + (VADDV_32 + (i64 (DOT (DUPv2i32gpr WZR), + (v8i8 node:$Vm), + (v8i8 node:$Vn)))), + sub_32)>; + +class dot_v8i8 : + Pat<(idot_v8i8 V64:$Vm, V64:$Vn), + (odot_v8i8 V64:$Vm, V64:$Vn)>, + Requires<[HasDotProd]>; + +// dot_v16i8 +class ee_v16i8 : + PatFrag<(ops node:$V, node:$K1, node:$K2), + (v4i16 (extract_subvector + (v8i16 (extend + (v8i8 (extract_subvector node:$V, node:$K1)))), node:$K2))>; + +class mul_v16i8 : + PatFrag<(ops node:$M, node:$N, 
node:$K1, node:$K2), + (v4i32 + (mulop (v4i16 (ee_v16i8 node:$M, node:$K1, node:$K2)), + (v4i16 (ee_v16i8 node:$N, node:$K1, node:$K2))))>; + +class idot_v16i8 : + PatFrag<(ops node:$M, node:$N), + (i32 (extractelt + (v4i32 (AArch64uaddv + (add + (add (mul_v16i8 node:$M, node:$N, (i64 0), (i64 0)), + (mul_v16i8 node:$M, node:$N, (i64 8), (i64 0))), + (add (mul_v16i8 node:$M, node:$N, (i64 0), (i64 4)), + (mul_v16i8 node:$M, node:$N, (i64 8), (i64 4)))))), + (i64 0)))>; + +class odot_v16i8 : + OutPatFrag<(ops node:$Vm, node:$Vn), + (i32 (ADDVv4i32v + (DOT (DUPv4i32gpr WZR), node:$Vm, node:$Vn)))>; + +class dot_v16i8 : + Pat<(idot_v16i8 V128:$Vm, V128:$Vn), + (odot_v16i8 V128:$Vm, V128:$Vn)>, + Requires<[HasDotProd]>; + +let AddedComplexity = 10 in { + def : dot_v4i8; + def : dot_v4i8; + def : dot_v8i8; + def : dot_v8i8; + def : dot_v16i8; + def : dot_v16i8; + + // FIXME: add patterns to generate vector by element dot product. + // FIXME: add SVE dot-product patterns. +} + include "AArch64InstrAtomics.td" include "AArch64SVEInstrInfo.td" diff --git a/lib/Target/AArch64/AArch64InstructionSelector.cpp b/lib/Target/AArch64/AArch64InstructionSelector.cpp index 4e13fb8e2027..961f38cad1e4 100644 --- a/lib/Target/AArch64/AArch64InstructionSelector.cpp +++ b/lib/Target/AArch64/AArch64InstructionSelector.cpp @@ -51,9 +51,19 @@ public: const AArch64Subtarget &STI, const AArch64RegisterBankInfo &RBI); - bool select(MachineInstr &I, CodeGenCoverage &CoverageInfo) const override; + bool select(MachineInstr &I) override; static const char *getName() { return DEBUG_TYPE; } + void setupMF(MachineFunction &MF, GISelKnownBits &KB, + CodeGenCoverage &CoverageInfo) override { + InstructionSelector::setupMF(MF, KB, CoverageInfo); + + // hasFnAttribute() is expensive to call on every BRCOND selection, so + // cache it here for each run of the selector. + ProduceNonFlagSettingCondBr = + !MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening); + } + private: /// tblgen-erated 'select' implementation, used as the initial selector for /// the patterns that don't require complex C++. @@ -68,6 +78,10 @@ private: bool earlySelectSHL(MachineInstr &I, MachineRegisterInfo &MRI) const; + /// Eliminate same-sized cross-bank copies into stores before selectImpl(). 
+ void contractCrossBankCopyIntoStore(MachineInstr &I, + MachineRegisterInfo &MRI) const; + bool selectVaStartAAPCS(MachineInstr &I, MachineFunction &MF, MachineRegisterInfo &MRI) const; bool selectVaStartDarwin(MachineInstr &I, MachineFunction &MF, @@ -101,8 +115,6 @@ private: bool selectMergeValues(MachineInstr &I, MachineRegisterInfo &MRI) const; bool selectUnmergeValues(MachineInstr &I, MachineRegisterInfo &MRI) const; - void collectShuffleMaskIndices(MachineInstr &I, MachineRegisterInfo &MRI, - SmallVectorImpl> &Idxs) const; bool selectShuffleVector(MachineInstr &I, MachineRegisterInfo &MRI) const; bool selectExtractElt(MachineInstr &I, MachineRegisterInfo &MRI) const; bool selectConcatVectors(MachineInstr &I, MachineRegisterInfo &MRI) const; @@ -116,6 +128,7 @@ private: bool selectIntrinsicRound(MachineInstr &I, MachineRegisterInfo &MRI) const; bool selectJumpTable(MachineInstr &I, MachineRegisterInfo &MRI) const; bool selectBrJT(MachineInstr &I, MachineRegisterInfo &MRI) const; + bool selectTLSGlobalValue(MachineInstr &I, MachineRegisterInfo &MRI) const; unsigned emitConstantPoolEntry(Constant *CPVal, MachineFunction &MF) const; MachineInstr *emitLoadFromConstantPool(Constant *CPVal, @@ -128,6 +141,8 @@ private: MachineInstr *emitIntegerCompare(MachineOperand &LHS, MachineOperand &RHS, MachineOperand &Predicate, MachineIRBuilder &MIRBuilder) const; + MachineInstr *emitADD(Register DefReg, MachineOperand &LHS, MachineOperand &RHS, + MachineIRBuilder &MIRBuilder) const; MachineInstr *emitCMN(MachineOperand &LHS, MachineOperand &RHS, MachineIRBuilder &MIRBuilder) const; MachineInstr *emitTST(const Register &LHS, const Register &RHS, @@ -155,7 +170,9 @@ private: ComplexRendererFns selectShiftA_64(const MachineOperand &Root) const; ComplexRendererFns selectShiftB_64(const MachineOperand &Root) const; + ComplexRendererFns select12BitValueWithLeftShift(uint64_t Immed) const; ComplexRendererFns selectArithImmed(MachineOperand &Root) const; + ComplexRendererFns selectNegArithImmed(MachineOperand &Root) const; ComplexRendererFns selectAddrModeUnscaled(MachineOperand &Root, unsigned Size) const; @@ -183,11 +200,48 @@ private: return selectAddrModeIndexed(Root, Width / 8); } + bool isWorthFoldingIntoExtendedReg(MachineInstr &MI, + const MachineRegisterInfo &MRI) const; + ComplexRendererFns + selectAddrModeShiftedExtendXReg(MachineOperand &Root, + unsigned SizeInBytes) const; + ComplexRendererFns selectAddrModeRegisterOffset(MachineOperand &Root) const; + ComplexRendererFns selectAddrModeXRO(MachineOperand &Root, + unsigned SizeInBytes) const; + template + ComplexRendererFns selectAddrModeXRO(MachineOperand &Root) const { + return selectAddrModeXRO(Root, Width / 8); + } + + ComplexRendererFns selectShiftedRegister(MachineOperand &Root) const; + + ComplexRendererFns selectArithShiftedRegister(MachineOperand &Root) const { + return selectShiftedRegister(Root); + } + + ComplexRendererFns selectLogicalShiftedRegister(MachineOperand &Root) const { + // TODO: selectShiftedRegister should allow for rotates on logical shifts. + // For now, make them the same. The only difference between the two is that + // logical shifts are allowed to fold in rotates. Otherwise, these are + // functionally the same. + return selectShiftedRegister(Root); + } + + /// Instructions that accept extend modifiers like UXTW expect the register + /// being extended to be a GPR32. Narrow ExtReg to a 32-bit register using a + /// subregister copy if necessary. Return either ExtReg, or the result of the + /// new copy. 
+ Register narrowExtendRegIfNeeded(Register ExtReg, + MachineIRBuilder &MIB) const; + ComplexRendererFns selectArithExtendedRegister(MachineOperand &Root) const; + void renderTruncImm(MachineInstrBuilder &MIB, const MachineInstr &MI) const; + void renderLogicalImm32(MachineInstrBuilder &MIB, const MachineInstr &I) const; + void renderLogicalImm64(MachineInstrBuilder &MIB, const MachineInstr &I) const; // Materialize a GlobalValue or BlockAddress using a movz+movk sequence. void materializeLargeCMVal(MachineInstr &I, const Value *V, - unsigned char OpFlags) const; + unsigned OpFlags) const; // Optimization methods. bool tryOptVectorShuffle(MachineInstr &I) const; @@ -197,12 +251,22 @@ private: MachineOperand &Predicate, MachineIRBuilder &MIRBuilder) const; + /// Return true if \p MI is a load or store of \p NumBytes bytes. + bool isLoadStoreOfNumBytes(const MachineInstr &MI, unsigned NumBytes) const; + + /// Returns true if \p MI is guaranteed to have the high-half of a 64-bit + /// register zeroed out. In other words, the result of MI has been explicitly + /// zero extended. + bool isDef32(const MachineInstr &MI) const; + const AArch64TargetMachine &TM; const AArch64Subtarget &STI; const AArch64InstrInfo &TII; const AArch64RegisterInfo &TRI; const AArch64RegisterBankInfo &RBI; + bool ProduceNonFlagSettingCondBr = false; + #define GET_GLOBALISEL_PREDICATES_DECL #include "AArch64GenGlobalISel.inc" #undef GET_GLOBALISEL_PREDICATES_DECL @@ -312,7 +376,7 @@ static bool getSubRegForClass(const TargetRegisterClass *RC, SubReg = AArch64::hsub; break; case 32: - if (RC == &AArch64::GPR32RegClass) + if (RC != &AArch64::FPR32RegClass) SubReg = AArch64::sub_32; else SubReg = AArch64::ssub; @@ -357,7 +421,7 @@ static bool unsupportedBinOp(const MachineInstr &I, // so, this will need to be taught about that, and we'll need to get the // bank out of the minimal class for the register. // Either way, this needs to be documented (and possibly verified). - if (!TargetRegisterInfo::isVirtualRegister(MO.getReg())) { + if (!Register::isVirtualRegister(MO.getReg())) { LLVM_DEBUG(dbgs() << "Generic inst has physical register operand\n"); return true; } @@ -492,8 +556,8 @@ static bool isValidCopy(const MachineInstr &I, const RegisterBank &DstBank, const MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI, const RegisterBankInfo &RBI) { - const unsigned DstReg = I.getOperand(0).getReg(); - const unsigned SrcReg = I.getOperand(1).getReg(); + const Register DstReg = I.getOperand(0).getReg(); + const Register SrcReg = I.getOperand(1).getReg(); const unsigned DstSize = RBI.getSizeInBits(DstReg, MRI, TRI); const unsigned SrcSize = RBI.getSizeInBits(SrcReg, MRI, TRI); @@ -502,7 +566,7 @@ static bool isValidCopy(const MachineInstr &I, const RegisterBank &DstBank, (DstSize == SrcSize || // Copies are a mean to setup initial types, the number of // bits may not exactly match. - (TargetRegisterInfo::isPhysicalRegister(SrcReg) && DstSize <= SrcSize) || + (Register::isPhysicalRegister(SrcReg) && DstSize <= SrcSize) || // Copies are a mean to copy bits around, as long as we are // on the same register class, that's fine. Otherwise, that // means we need some SUBREG_TO_REG or AND & co. 
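The size rule buried in the isValidCopy hunk above is easy to miss in the diff. Below is a minimal standalone sketch of that rule (an illustration only, not code from the patch; the helper name is made up, and the same-register-class escape hatch mentioned in the trailing comment is left out):

// Sketch of the copy-size rule enforced by isValidCopy: accept the copy when
// it is already known valid, when source and destination sizes match exactly,
// or when the source is a physical register at least as wide as the
// destination.
bool copySizeIsValid(bool KnownValid, bool SrcIsPhysReg,
                     unsigned DstSize, unsigned SrcSize) {
  if (KnownValid)
    return true;
  if (DstSize == SrcSize)
    return true;
  return SrcIsPhysReg && DstSize <= SrcSize;
}

For instance, copySizeIsValid(false, true, 32, 64) is true, corresponding to the "physical source wider than the destination" case accepted above.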
@@ -526,7 +590,7 @@ static bool isValidCopy(const MachineInstr &I, const RegisterBank &DstBank, /// SubRegCopy (To class) = COPY CopyReg:SubReg /// Dst = COPY SubRegCopy static bool selectSubregisterCopy(MachineInstr &I, MachineRegisterInfo &MRI, - const RegisterBankInfo &RBI, unsigned SrcReg, + const RegisterBankInfo &RBI, Register SrcReg, const TargetRegisterClass *From, const TargetRegisterClass *To, unsigned SubReg) { @@ -539,7 +603,7 @@ static bool selectSubregisterCopy(MachineInstr &I, MachineRegisterInfo &MRI, // It's possible that the destination register won't be constrained. Make // sure that happens. - if (!TargetRegisterInfo::isPhysicalRegister(I.getOperand(0).getReg())) + if (!Register::isPhysicalRegister(I.getOperand(0).getReg())) RBI.constrainGenericRegister(I.getOperand(0).getReg(), *To, MRI); return true; @@ -553,8 +617,8 @@ static std::pair getRegClassesForCopy(MachineInstr &I, const TargetInstrInfo &TII, MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI, const RegisterBankInfo &RBI) { - unsigned DstReg = I.getOperand(0).getReg(); - unsigned SrcReg = I.getOperand(1).getReg(); + Register DstReg = I.getOperand(0).getReg(); + Register SrcReg = I.getOperand(1).getReg(); const RegisterBank &DstRegBank = *RBI.getRegBank(DstReg, MRI, TRI); const RegisterBank &SrcRegBank = *RBI.getRegBank(SrcReg, MRI, TRI); unsigned DstSize = RBI.getSizeInBits(DstReg, MRI, TRI); @@ -579,8 +643,8 @@ static bool selectCopy(MachineInstr &I, const TargetInstrInfo &TII, MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI, const RegisterBankInfo &RBI) { - unsigned DstReg = I.getOperand(0).getReg(); - unsigned SrcReg = I.getOperand(1).getReg(); + Register DstReg = I.getOperand(0).getReg(); + Register SrcReg = I.getOperand(1).getReg(); const RegisterBank &DstRegBank = *RBI.getRegBank(DstReg, MRI, TRI); const RegisterBank &SrcRegBank = *RBI.getRegBank(SrcReg, MRI, TRI); @@ -607,11 +671,10 @@ static bool selectCopy(MachineInstr &I, const TargetInstrInfo &TII, // result. auto CheckCopy = [&]() { // If we have a bitcast or something, we can't have physical registers. - assert( - (I.isCopy() || - (!TargetRegisterInfo::isPhysicalRegister(I.getOperand(0).getReg()) && - !TargetRegisterInfo::isPhysicalRegister(I.getOperand(1).getReg()))) && - "No phys reg on generic operator!"); + assert((I.isCopy() || + (!Register::isPhysicalRegister(I.getOperand(0).getReg()) && + !Register::isPhysicalRegister(I.getOperand(1).getReg()))) && + "No phys reg on generic operator!"); assert(KnownValid || isValidCopy(I, DstRegBank, MRI, TRI, RBI)); (void)KnownValid; return true; @@ -626,38 +689,38 @@ static bool selectCopy(MachineInstr &I, const TargetInstrInfo &TII, return false; } - // Is this a cross-bank copy? - if (DstRegBank.getID() != SrcRegBank.getID()) { - // If we're doing a cross-bank copy on different-sized registers, we need - // to do a bit more work. - unsigned SrcSize = TRI.getRegSizeInBits(*SrcRC); - unsigned DstSize = TRI.getRegSizeInBits(*DstRC); - - if (SrcSize > DstSize) { - // We're doing a cross-bank copy into a smaller register. We need a - // subregister copy. First, get a register class that's on the same bank - // as the destination, but the same size as the source. - const TargetRegisterClass *SubregRC = - getMinClassForRegBank(DstRegBank, SrcSize, true); - assert(SubregRC && "Didn't get a register class for subreg?"); - - // Get the appropriate subregister for the destination. 
- unsigned SubReg = 0; - if (!getSubRegForClass(DstRC, TRI, SubReg)) { - LLVM_DEBUG(dbgs() << "Couldn't determine subregister for copy.\n"); - return false; - } - - // Now, insert a subregister copy using the new register class. - selectSubregisterCopy(I, MRI, RBI, SrcReg, SubregRC, DstRC, SubReg); - return CheckCopy(); + unsigned SrcSize = TRI.getRegSizeInBits(*SrcRC); + unsigned DstSize = TRI.getRegSizeInBits(*DstRC); + + // If we're doing a cross-bank copy on different-sized registers, we need + // to do a bit more work. + if (SrcSize > DstSize) { + // We're doing a cross-bank copy into a smaller register. We need a + // subregister copy. First, get a register class that's on the same bank + // as the destination, but the same size as the source. + const TargetRegisterClass *SubregRC = + getMinClassForRegBank(DstRegBank, SrcSize, true); + assert(SubregRC && "Didn't get a register class for subreg?"); + + // Get the appropriate subregister for the destination. + unsigned SubReg = 0; + if (!getSubRegForClass(DstRC, TRI, SubReg)) { + LLVM_DEBUG(dbgs() << "Couldn't determine subregister for copy.\n"); + return false; } - else if (DstRegBank.getID() == AArch64::GPRRegBankID && DstSize == 32 && - SrcSize == 16) { + // Now, insert a subregister copy using the new register class. + selectSubregisterCopy(I, MRI, RBI, SrcReg, SubregRC, DstRC, SubReg); + return CheckCopy(); + } + + // Is this a cross-bank copy? + if (DstRegBank.getID() != SrcRegBank.getID()) { + if (DstRegBank.getID() == AArch64::GPRRegBankID && DstSize == 32 && + SrcSize == 16) { // Special case for FPR16 to GPR32. // FIXME: This can probably be generalized like the above case. - unsigned PromoteReg = + Register PromoteReg = MRI.createVirtualRegister(&AArch64::FPR32RegClass); BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(AArch64::SUBREG_TO_REG), PromoteReg) @@ -674,7 +737,7 @@ static bool selectCopy(MachineInstr &I, const TargetInstrInfo &TII, // If the destination is a physical register, then there's nothing to // change, so we're done. 
- if (TargetRegisterInfo::isPhysicalRegister(DstReg)) + if (Register::isPhysicalRegister(DstReg)) return CheckCopy(); } @@ -955,7 +1018,9 @@ bool AArch64InstructionSelector::selectVectorSHL( return false; unsigned Opc = 0; - if (Ty == LLT::vector(4, 32)) { + if (Ty == LLT::vector(2, 64)) { + Opc = AArch64::USHLv2i64; + } else if (Ty == LLT::vector(4, 32)) { Opc = AArch64::USHLv4i32; } else if (Ty == LLT::vector(2, 32)) { Opc = AArch64::USHLv2i32; @@ -989,7 +1054,11 @@ bool AArch64InstructionSelector::selectVectorASHR( unsigned Opc = 0; unsigned NegOpc = 0; const TargetRegisterClass *RC = nullptr; - if (Ty == LLT::vector(4, 32)) { + if (Ty == LLT::vector(2, 64)) { + Opc = AArch64::SSHLv2i64; + NegOpc = AArch64::NEGv2i64; + RC = &AArch64::FPR128RegClass; + } else if (Ty == LLT::vector(4, 32)) { Opc = AArch64::SSHLv4i32; NegOpc = AArch64::NEGv4i32; RC = &AArch64::FPR128RegClass; @@ -1044,7 +1113,7 @@ bool AArch64InstructionSelector::selectVaStartDarwin( } void AArch64InstructionSelector::materializeLargeCMVal( - MachineInstr &I, const Value *V, unsigned char OpFlags) const { + MachineInstr &I, const Value *V, unsigned OpFlags) const { MachineBasicBlock &MBB = *I.getParent(); MachineFunction &MF = *MBB.getParent(); MachineRegisterInfo &MRI = MF.getRegInfo(); @@ -1097,8 +1166,8 @@ void AArch64InstructionSelector::preISelLower(MachineInstr &I) const { // some reason we receive input GMIR that has an s64 shift amount that's not // a G_CONSTANT, insert a truncate so that we can still select the s32 // register-register variant. - unsigned SrcReg = I.getOperand(1).getReg(); - unsigned ShiftReg = I.getOperand(2).getReg(); + Register SrcReg = I.getOperand(1).getReg(); + Register ShiftReg = I.getOperand(2).getReg(); const LLT ShiftTy = MRI.getType(ShiftReg); const LLT SrcTy = MRI.getType(SrcReg); if (SrcTy.isVector()) @@ -1118,6 +1187,9 @@ void AArch64InstructionSelector::preISelLower(MachineInstr &I) const { } return; } + case TargetOpcode::G_STORE: + contractCrossBankCopyIntoStore(I, MRI); + return; default: return; } @@ -1158,6 +1230,48 @@ bool AArch64InstructionSelector::earlySelectSHL( return constrainSelectedInstRegOperands(*NewI, TII, TRI, RBI); } +void AArch64InstructionSelector::contractCrossBankCopyIntoStore( + MachineInstr &I, MachineRegisterInfo &MRI) const { + assert(I.getOpcode() == TargetOpcode::G_STORE && "Expected G_STORE"); + // If we're storing a scalar, it doesn't matter what register bank that + // scalar is on. All that matters is the size. + // + // So, if we see something like this (with a 32-bit scalar as an example): + // + // %x:gpr(s32) = ... something ... + // %y:fpr(s32) = COPY %x:gpr(s32) + // G_STORE %y:fpr(s32) + // + // We can fix this up into something like this: + // + // G_STORE %x:gpr(s32) + // + // And then continue the selection process normally. + MachineInstr *Def = getDefIgnoringCopies(I.getOperand(0).getReg(), MRI); + if (!Def) + return; + Register DefDstReg = Def->getOperand(0).getReg(); + LLT DefDstTy = MRI.getType(DefDstReg); + Register StoreSrcReg = I.getOperand(0).getReg(); + LLT StoreSrcTy = MRI.getType(StoreSrcReg); + + // If we get something strange like a physical register, then we shouldn't + // go any further. + if (!DefDstTy.isValid()) + return; + + // Are the source and dst types the same size? + if (DefDstTy.getSizeInBits() != StoreSrcTy.getSizeInBits()) + return; + + if (RBI.getRegBank(StoreSrcReg, MRI, TRI) == + RBI.getRegBank(DefDstReg, MRI, TRI)) + return; + + // We have a cross-bank copy, which is entering a store. Let's fold it. 
+ I.getOperand(0).setReg(DefDstReg); +} + bool AArch64InstructionSelector::earlySelect(MachineInstr &I) const { assert(I.getParent() && "Instruction should be in a basic block!"); assert(I.getParent()->getParent() && "Instruction should be in a function!"); @@ -1169,13 +1283,37 @@ bool AArch64InstructionSelector::earlySelect(MachineInstr &I) const { switch (I.getOpcode()) { case TargetOpcode::G_SHL: return earlySelectSHL(I, MRI); + case TargetOpcode::G_CONSTANT: { + bool IsZero = false; + if (I.getOperand(1).isCImm()) + IsZero = I.getOperand(1).getCImm()->getZExtValue() == 0; + else if (I.getOperand(1).isImm()) + IsZero = I.getOperand(1).getImm() == 0; + + if (!IsZero) + return false; + + Register DefReg = I.getOperand(0).getReg(); + LLT Ty = MRI.getType(DefReg); + if (Ty != LLT::scalar(64) && Ty != LLT::scalar(32)) + return false; + + if (Ty == LLT::scalar(64)) { + I.getOperand(1).ChangeToRegister(AArch64::XZR, false); + RBI.constrainGenericRegister(DefReg, AArch64::GPR64RegClass, MRI); + } else { + I.getOperand(1).ChangeToRegister(AArch64::WZR, false); + RBI.constrainGenericRegister(DefReg, AArch64::GPR32RegClass, MRI); + } + I.setDesc(TII.get(TargetOpcode::COPY)); + return true; + } default: return false; } } -bool AArch64InstructionSelector::select(MachineInstr &I, - CodeGenCoverage &CoverageInfo) const { +bool AArch64InstructionSelector::select(MachineInstr &I) { assert(I.getParent() && "Instruction should be in a basic block!"); assert(I.getParent()->getParent() && "Instruction should be in a function!"); @@ -1244,7 +1382,7 @@ bool AArch64InstructionSelector::select(MachineInstr &I, if (earlySelect(I)) return true; - if (selectImpl(I, CoverageInfo)) + if (selectImpl(I, *CoverageInfo)) return true; LLT Ty = @@ -1439,14 +1577,43 @@ bool AArch64InstructionSelector::select(MachineInstr &I, return true; } case TargetOpcode::G_EXTRACT: { - LLT SrcTy = MRI.getType(I.getOperand(1).getReg()); - LLT DstTy = MRI.getType(I.getOperand(0).getReg()); + Register DstReg = I.getOperand(0).getReg(); + Register SrcReg = I.getOperand(1).getReg(); + LLT SrcTy = MRI.getType(SrcReg); + LLT DstTy = MRI.getType(DstReg); (void)DstTy; unsigned SrcSize = SrcTy.getSizeInBits(); - // Larger extracts are vectors, same-size extracts should be something else - // by now (either split up or simplified to a COPY). - if (SrcTy.getSizeInBits() > 64 || Ty.getSizeInBits() > 32) - return false; + + if (SrcTy.getSizeInBits() > 64) { + // This should be an extract of an s128, which is like a vector extract. + if (SrcTy.getSizeInBits() != 128) + return false; + // Only support extracting 64 bits from an s128 at the moment. + if (DstTy.getSizeInBits() != 64) + return false; + + const RegisterBank &SrcRB = *RBI.getRegBank(SrcReg, MRI, TRI); + const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI); + // Check we have the right regbank always. + assert(SrcRB.getID() == AArch64::FPRRegBankID && + DstRB.getID() == AArch64::FPRRegBankID && + "Wrong extract regbank!"); + (void)SrcRB; + + // Emit the same code as a vector extract. + // Offset must be a multiple of 64. + unsigned Offset = I.getOperand(2).getImm(); + if (Offset % 64 != 0) + return false; + unsigned LaneIdx = Offset / 64; + MachineIRBuilder MIB(I); + MachineInstr *Extract = emitExtractVectorElt( + DstReg, DstRB, LLT::scalar(64), SrcReg, LaneIdx, MIB); + if (!Extract) + return false; + I.eraseFromParent(); + return true; + } I.setDesc(TII.get(SrcSize == 64 ? 
AArch64::UBFMXri : AArch64::UBFMWri)); MachineInstrBuilder(MF, I).addImm(I.getOperand(2).getImm() + @@ -1458,7 +1625,7 @@ bool AArch64InstructionSelector::select(MachineInstr &I, return constrainSelectedInstRegOperands(I, TII, TRI, RBI); } - Register DstReg = MRI.createGenericVirtualRegister(LLT::scalar(64)); + DstReg = MRI.createGenericVirtualRegister(LLT::scalar(64)); MIB.setInsertPt(MIB.getMBB(), std::next(I.getIterator())); MIB.buildInstr(TargetOpcode::COPY, {I.getOperand(0).getReg()}, {}) .addReg(DstReg, 0, AArch64::sub_32); @@ -1521,11 +1688,10 @@ bool AArch64InstructionSelector::select(MachineInstr &I, case TargetOpcode::G_GLOBAL_VALUE: { auto GV = I.getOperand(1).getGlobal(); - if (GV->isThreadLocal()) { - // FIXME: we don't support TLS yet. - return false; - } - unsigned char OpFlags = STI.ClassifyGlobalReference(GV, TM); + if (GV->isThreadLocal()) + return selectTLSGlobalValue(I, MRI); + + unsigned OpFlags = STI.ClassifyGlobalReference(GV, TM); if (OpFlags & AArch64II::MO_GOT) { I.setDesc(TII.get(AArch64::LOADgot)); I.getOperand(1).setTargetFlags(OpFlags); @@ -1562,8 +1728,15 @@ bool AArch64InstructionSelector::select(MachineInstr &I, } auto &MemOp = **I.memoperands_begin(); - if (MemOp.getOrdering() != AtomicOrdering::NotAtomic) { - LLVM_DEBUG(dbgs() << "Atomic load/store not supported yet\n"); + if (MemOp.isAtomic()) { + // For now we just support s8 acquire loads to be able to compile stack + // protector code. + if (MemOp.getOrdering() == AtomicOrdering::Acquire && + MemOp.getSize() == 1) { + I.setDesc(TII.get(AArch64::LDARB)); + return constrainSelectedInstRegOperands(I, TII, TRI, RBI); + } + LLVM_DEBUG(dbgs() << "Atomic load/store not fully supported yet\n"); return false; } unsigned MemSizeInBits = MemOp.getSize() * 8; @@ -1598,7 +1771,7 @@ bool AArch64InstructionSelector::select(MachineInstr &I, const unsigned Size = MemSizeInBits / 8; const unsigned Scale = Log2_32(Size); if ((Imm & (Size - 1)) == 0 && Imm >= 0 && Imm < (0x1000 << Scale)) { - unsigned Ptr2Reg = PtrMI->getOperand(1).getReg(); + Register Ptr2Reg = PtrMI->getOperand(1).getReg(); I.getOperand(1).setReg(Ptr2Reg); PtrMI = MRI.getVRegDef(Ptr2Reg); Offset = Imm / Size; @@ -1688,8 +1861,7 @@ bool AArch64InstructionSelector::select(MachineInstr &I, return selectVectorSHL(I, MRI); LLVM_FALLTHROUGH; case TargetOpcode::G_OR: - case TargetOpcode::G_LSHR: - case TargetOpcode::G_GEP: { + case TargetOpcode::G_LSHR: { // Reject the various things we don't support yet. if (unsupportedBinOp(I, RBI, MRI, TRI)) return false; @@ -1711,6 +1883,13 @@ bool AArch64InstructionSelector::select(MachineInstr &I, return constrainSelectedInstRegOperands(I, TII, TRI, RBI); } + case TargetOpcode::G_GEP: { + MachineIRBuilder MIRBuilder(I); + emitADD(I.getOperand(0).getReg(), I.getOperand(1), I.getOperand(2), + MIRBuilder); + I.eraseFromParent(); + return true; + } case TargetOpcode::G_UADDO: { // TODO: Support other types. 
unsigned OpSize = Ty.getSizeInBits(); @@ -1816,6 +1995,16 @@ bool AArch64InstructionSelector::select(MachineInstr &I, constrainSelectedInstRegOperands(I, TII, TRI, RBI); return true; } + + if (!SrcTy.isVector() && SrcTy.getSizeInBits() == 128) { + MachineIRBuilder MIB(I); + MachineInstr *Extract = emitExtractVectorElt( + DstReg, DstRB, LLT::scalar(DstTy.getSizeInBits()), SrcReg, 0, MIB); + if (!Extract) + return false; + I.eraseFromParent(); + return true; + } } return false; @@ -1868,21 +2057,41 @@ bool AArch64InstructionSelector::select(MachineInstr &I, case TargetOpcode::G_ZEXT: case TargetOpcode::G_SEXT: { unsigned Opcode = I.getOpcode(); - const LLT DstTy = MRI.getType(I.getOperand(0).getReg()), - SrcTy = MRI.getType(I.getOperand(1).getReg()); - const bool isSigned = Opcode == TargetOpcode::G_SEXT; + const bool IsSigned = Opcode == TargetOpcode::G_SEXT; const Register DefReg = I.getOperand(0).getReg(); const Register SrcReg = I.getOperand(1).getReg(); - const RegisterBank &RB = *RBI.getRegBank(DefReg, MRI, TRI); + const LLT DstTy = MRI.getType(DefReg); + const LLT SrcTy = MRI.getType(SrcReg); + unsigned DstSize = DstTy.getSizeInBits(); + unsigned SrcSize = SrcTy.getSizeInBits(); - if (RB.getID() != AArch64::GPRRegBankID) { - LLVM_DEBUG(dbgs() << TII.getName(I.getOpcode()) << " on bank: " << RB - << ", expected: GPR\n"); - return false; - } + assert((*RBI.getRegBank(DefReg, MRI, TRI)).getID() == + AArch64::GPRRegBankID && + "Unexpected ext regbank"); + MachineIRBuilder MIB(I); MachineInstr *ExtI; - if (DstTy == LLT::scalar(64)) { + if (DstTy.isVector()) + return false; // Should be handled by imported patterns. + + // First check if we're extending the result of a load which has a dest type + // smaller than 32 bits, then this zext is redundant. GPR32 is the smallest + // GPR register on AArch64 and all loads which are smaller automatically + // zero-extend the upper bits. E.g. + // %v(s8) = G_LOAD %p, :: (load 1) + // %v2(s32) = G_ZEXT %v(s8) + if (!IsSigned) { + auto *LoadMI = getOpcodeDef(TargetOpcode::G_LOAD, SrcReg, MRI); + if (LoadMI && + RBI.getRegBank(SrcReg, MRI, TRI)->getID() == AArch64::GPRRegBankID) { + const MachineMemOperand *MemOp = *LoadMI->memoperands_begin(); + unsigned BytesLoaded = MemOp->getSize(); + if (BytesLoaded < 4 && SrcTy.getSizeInBytes() == BytesLoaded) + return selectCopy(I, TII, MRI, TRI, RBI); + } + } + + if (DstSize == 64) { // FIXME: Can we avoid manually doing this? if (!RBI.constrainGenericRegister(SrcReg, AArch64::GPR32RegClass, MRI)) { LLVM_DEBUG(dbgs() << "Failed to constrain " << TII.getName(Opcode) @@ -1890,33 +2099,26 @@ bool AArch64InstructionSelector::select(MachineInstr &I, return false; } - const Register SrcXReg = - MRI.createVirtualRegister(&AArch64::GPR64RegClass); - BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::SUBREG_TO_REG)) - .addDef(SrcXReg) - .addImm(0) - .addUse(SrcReg) - .addImm(AArch64::sub_32); - - const unsigned NewOpc = isSigned ? AArch64::SBFMXri : AArch64::UBFMXri; - ExtI = BuildMI(MBB, I, I.getDebugLoc(), TII.get(NewOpc)) - .addDef(DefReg) - .addUse(SrcXReg) - .addImm(0) - .addImm(SrcTy.getSizeInBits() - 1); - } else if (DstTy.isScalar() && DstTy.getSizeInBits() <= 32) { - const unsigned NewOpc = isSigned ? 
AArch64::SBFMWri : AArch64::UBFMWri; - ExtI = BuildMI(MBB, I, I.getDebugLoc(), TII.get(NewOpc)) - .addDef(DefReg) - .addUse(SrcReg) - .addImm(0) - .addImm(SrcTy.getSizeInBits() - 1); + auto SubregToReg = + MIB.buildInstr(AArch64::SUBREG_TO_REG, {&AArch64::GPR64RegClass}, {}) + .addImm(0) + .addUse(SrcReg) + .addImm(AArch64::sub_32); + + ExtI = MIB.buildInstr(IsSigned ? AArch64::SBFMXri : AArch64::UBFMXri, + {DefReg}, {SubregToReg}) + .addImm(0) + .addImm(SrcSize - 1); + } else if (DstSize <= 32) { + ExtI = MIB.buildInstr(IsSigned ? AArch64::SBFMWri : AArch64::UBFMWri, + {DefReg}, {SrcReg}) + .addImm(0) + .addImm(SrcSize - 1); } else { return false; } constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI); - I.eraseFromParent(); return true; } @@ -2163,6 +2365,37 @@ bool AArch64InstructionSelector::selectJumpTable( return constrainSelectedInstRegOperands(*MovMI, TII, TRI, RBI); } +bool AArch64InstructionSelector::selectTLSGlobalValue( + MachineInstr &I, MachineRegisterInfo &MRI) const { + if (!STI.isTargetMachO()) + return false; + MachineFunction &MF = *I.getParent()->getParent(); + MF.getFrameInfo().setAdjustsStack(true); + + const GlobalValue &GV = *I.getOperand(1).getGlobal(); + MachineIRBuilder MIB(I); + + MIB.buildInstr(AArch64::LOADgot, {AArch64::X0}, {}) + .addGlobalAddress(&GV, 0, AArch64II::MO_TLS); + + auto Load = MIB.buildInstr(AArch64::LDRXui, {&AArch64::GPR64commonRegClass}, + {Register(AArch64::X0)}) + .addImm(0); + + // TLS calls preserve all registers except those that absolutely must be + // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be + // silly). + MIB.buildInstr(AArch64::BLR, {}, {Load}) + .addDef(AArch64::X0, RegState::Implicit) + .addRegMask(TRI.getTLSCallPreservedMask()); + + MIB.buildCopy(I.getOperand(0).getReg(), Register(AArch64::X0)); + RBI.constrainGenericRegister(I.getOperand(0).getReg(), AArch64::GPR64RegClass, + MRI); + I.eraseFromParent(); + return true; +} + bool AArch64InstructionSelector::selectIntrinsicTrunc( MachineInstr &I, MachineRegisterInfo &MRI) const { const LLT SrcTy = MRI.getType(I.getOperand(0).getReg()); @@ -2478,16 +2711,40 @@ bool AArch64InstructionSelector::selectMergeValues( const LLT DstTy = MRI.getType(I.getOperand(0).getReg()); const LLT SrcTy = MRI.getType(I.getOperand(1).getReg()); assert(!DstTy.isVector() && !SrcTy.isVector() && "invalid merge operation"); + const RegisterBank &RB = *RBI.getRegBank(I.getOperand(1).getReg(), MRI, TRI); - // At the moment we only support merging two s32s into an s64. if (I.getNumOperands() != 3) return false; - if (DstTy.getSizeInBits() != 64 || SrcTy.getSizeInBits() != 32) - return false; - const RegisterBank &RB = *RBI.getRegBank(I.getOperand(1).getReg(), MRI, TRI); + + // Merging 2 s64s into an s128. 
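  // For example (illustrative GMIR -> MIR):
  //   %r:fpr(s128) = G_MERGE_VALUES %lo:fpr(s64), %hi:fpr(s64)
  // is selected as an IMPLICIT_DEF of the 128-bit register followed by two
  // 64-bit lane inserts, placing %lo in lane 0 and %hi in lane 1.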
+ if (DstTy == LLT::scalar(128)) { + if (SrcTy.getSizeInBits() != 64) + return false; + MachineIRBuilder MIB(I); + Register DstReg = I.getOperand(0).getReg(); + Register Src1Reg = I.getOperand(1).getReg(); + Register Src2Reg = I.getOperand(2).getReg(); + auto Tmp = MIB.buildInstr(TargetOpcode::IMPLICIT_DEF, {DstTy}, {}); + MachineInstr *InsMI = + emitLaneInsert(None, Tmp.getReg(0), Src1Reg, /* LaneIdx */ 0, RB, MIB); + if (!InsMI) + return false; + MachineInstr *Ins2MI = emitLaneInsert(DstReg, InsMI->getOperand(0).getReg(), + Src2Reg, /* LaneIdx */ 1, RB, MIB); + if (!Ins2MI) + return false; + constrainSelectedInstRegOperands(*InsMI, TII, TRI, RBI); + constrainSelectedInstRegOperands(*Ins2MI, TII, TRI, RBI); + I.eraseFromParent(); + return true; + } + if (RB.getID() != AArch64::GPRRegBankID) return false; + if (DstTy.getSizeInBits() != 64 || SrcTy.getSizeInBits() != 32) + return false; + auto *DstRC = &AArch64::GPR64RegClass; Register SubToRegDef = MRI.createVirtualRegister(DstRC); MachineInstr &SubRegMI = *BuildMI(*I.getParent(), I, I.getDebugLoc(), @@ -2695,7 +2952,8 @@ bool AArch64InstructionSelector::selectUnmergeValues( const LLT NarrowTy = MRI.getType(I.getOperand(0).getReg()); const LLT WideTy = MRI.getType(SrcReg); (void)WideTy; - assert(WideTy.isVector() && "can only unmerge from vector types!"); + assert((WideTy.isVector() || WideTy.getSizeInBits() == 128) && + "can only unmerge from vector or s128 types!"); assert(WideTy.getSizeInBits() > NarrowTy.getSizeInBits() && "source register size too small!"); @@ -2802,29 +3060,6 @@ bool AArch64InstructionSelector::selectConcatVectors( return true; } -void AArch64InstructionSelector::collectShuffleMaskIndices( - MachineInstr &I, MachineRegisterInfo &MRI, - SmallVectorImpl> &Idxs) const { - MachineInstr *MaskDef = MRI.getVRegDef(I.getOperand(3).getReg()); - assert( - MaskDef->getOpcode() == TargetOpcode::G_BUILD_VECTOR && - "G_SHUFFLE_VECTOR should have a constant mask operand as G_BUILD_VECTOR"); - // Find the constant indices. - for (unsigned i = 1, e = MaskDef->getNumOperands(); i < e; ++i) { - // Look through copies. - MachineInstr *ScalarDef = - getDefIgnoringCopies(MaskDef->getOperand(i).getReg(), MRI); - assert(ScalarDef && "Could not find vreg def of shufflevec index op"); - if (ScalarDef->getOpcode() != TargetOpcode::G_CONSTANT) { - // This be an undef if not a constant. - assert(ScalarDef->getOpcode() == TargetOpcode::G_IMPLICIT_DEF); - Idxs.push_back(None); - } else { - Idxs.push_back(ScalarDef->getOperand(1).getCImm()->getSExtValue()); - } - } -} - unsigned AArch64InstructionSelector::emitConstantPoolEntry(Constant *CPVal, MachineFunction &MF) const { @@ -2905,6 +3140,31 @@ getInsertVecEltOpInfo(const RegisterBank &RB, unsigned EltSize) { return std::make_pair(Opc, SubregIdx); } +MachineInstr * +AArch64InstructionSelector::emitADD(Register DefReg, MachineOperand &LHS, + MachineOperand &RHS, + MachineIRBuilder &MIRBuilder) const { + assert(LHS.isReg() && RHS.isReg() && "Expected LHS and RHS to be registers!"); + MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo(); + static const unsigned OpcTable[2][2]{{AArch64::ADDXrr, AArch64::ADDXri}, + {AArch64::ADDWrr, AArch64::ADDWri}}; + bool Is32Bit = MRI.getType(LHS.getReg()).getSizeInBits() == 32; + auto ImmFns = selectArithImmed(RHS); + unsigned Opc = OpcTable[Is32Bit][ImmFns.hasValue()]; + auto AddMI = MIRBuilder.buildInstr(Opc, {DefReg}, {LHS.getReg()}); + + // If we matched a valid constant immediate, add those operands. 
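  // (For example, a 64-bit G_GEP whose offset is a foldable arithmetic
  // immediate selects ADDXri above and has the immediate and shift rendered
  // here; otherwise the offset register is appended and ADDXrr is used. The
  // 32-bit case picks ADDWri/ADDWrr in the same way.)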
+ if (ImmFns) { + for (auto &RenderFn : *ImmFns) + RenderFn(AddMI); + } else { + AddMI.addUse(RHS.getReg()); + } + + constrainSelectedInstRegOperands(*AddMI, TII, TRI, RBI); + return &*AddMI; +} + MachineInstr * AArch64InstructionSelector::emitCMN(MachineOperand &LHS, MachineOperand &RHS, MachineIRBuilder &MIRBuilder) const { @@ -3151,7 +3411,7 @@ bool AArch64InstructionSelector::tryOptSelect(MachineInstr &I) const { // Can't see past copies from physregs. if (Opc == TargetOpcode::COPY && - TargetRegisterInfo::isPhysicalRegister(CondDef->getOperand(1).getReg())) + Register::isPhysicalRegister(CondDef->getOperand(1).getReg())) return false; CondDef = MRI.getVRegDef(CondDef->getOperand(1).getReg()); @@ -3342,16 +3602,9 @@ bool AArch64InstructionSelector::tryOptVectorDup(MachineInstr &I) const { return false; // The shuffle's second operand doesn't matter if the mask is all zero. - auto *ZeroVec = getOpcodeDef(G_BUILD_VECTOR, I.getOperand(3).getReg(), MRI); - if (!ZeroVec) + const Constant *Mask = I.getOperand(3).getShuffleMask(); + if (!isa(Mask)) return false; - int64_t Zero = 0; - if (!mi_match(ZeroVec->getOperand(1).getReg(), MRI, m_ICst(Zero)) || Zero) - return false; - for (unsigned i = 1, e = ZeroVec->getNumOperands() - 1; i < e; ++i) { - if (ZeroVec->getOperand(i).getReg() != ZeroVec->getOperand(1).getReg()) - return false; // This wasn't an all zeros vector. - } // We're done, now find out what kind of splat we need. LLT VecTy = MRI.getType(I.getOperand(0).getReg()); @@ -3399,19 +3652,14 @@ bool AArch64InstructionSelector::selectShuffleVector( const LLT Src1Ty = MRI.getType(Src1Reg); Register Src2Reg = I.getOperand(2).getReg(); const LLT Src2Ty = MRI.getType(Src2Reg); + const Constant *ShuffleMask = I.getOperand(3).getShuffleMask(); MachineBasicBlock &MBB = *I.getParent(); MachineFunction &MF = *MBB.getParent(); LLVMContext &Ctx = MF.getFunction().getContext(); - // G_SHUFFLE_VECTOR doesn't really have a strictly enforced constant mask - // operand, it comes in as a normal vector value which we have to analyze to - // find the mask indices. If the mask element is undef, then - // collectShuffleMaskIndices() will add a None entry for that index into - // the list. - SmallVector, 8> Mask; - collectShuffleMaskIndices(I, MRI, Mask); - assert(!Mask.empty() && "Expected to find mask indices"); + SmallVector Mask; + ShuffleVectorInst::getShuffleMask(ShuffleMask, Mask); // G_SHUFFLE_VECTOR is weird in that the source operands can be scalars, if // it's originated from a <1 x T> type. Those should have been lowered into @@ -3424,10 +3672,10 @@ bool AArch64InstructionSelector::selectShuffleVector( unsigned BytesPerElt = DstTy.getElementType().getSizeInBits() / 8; SmallVector CstIdxs; - for (auto &MaybeVal : Mask) { + for (int Val : Mask) { // For now, any undef indexes we'll just assume to be 0. This should be // optimized in future, e.g. to select DUP etc. - int Val = MaybeVal.hasValue() ? *MaybeVal : 0; + Val = Val < 0 ? 0 : Val; for (unsigned Byte = 0; Byte < BytesPerElt; ++Byte) { unsigned Offset = Byte + Val * BytesPerElt; CstIdxs.emplace_back(ConstantInt::get(Type::getInt8Ty(Ctx), Offset)); @@ -3684,21 +3932,6 @@ static unsigned findIntrinsicID(MachineInstr &I) { return IntrinOp->getIntrinsicID(); } -/// Helper function to emit the correct opcode for a llvm.aarch64.stlxr -/// intrinsic. -static unsigned getStlxrOpcode(unsigned NumBytesToStore) { - switch (NumBytesToStore) { - // TODO: 1, 2, and 4 byte stores. 
- case 8: - return AArch64::STLXRX; - default: - LLVM_DEBUG(dbgs() << "Unexpected number of bytes to store! (" - << NumBytesToStore << ")\n"); - break; - } - return 0; -} - bool AArch64InstructionSelector::selectIntrinsicWithSideEffects( MachineInstr &I, MachineRegisterInfo &MRI) const { // Find the intrinsic ID. @@ -3719,32 +3952,6 @@ bool AArch64InstructionSelector::selectIntrinsicWithSideEffects( return false; MIRBuilder.buildInstr(AArch64::BRK, {}, {}).addImm(0xF000); break; - case Intrinsic::aarch64_stlxr: - Register StatReg = I.getOperand(0).getReg(); - assert(RBI.getSizeInBits(StatReg, MRI, TRI) == 32 && - "Status register must be 32 bits!"); - Register SrcReg = I.getOperand(2).getReg(); - - if (RBI.getSizeInBits(SrcReg, MRI, TRI) != 64) { - LLVM_DEBUG(dbgs() << "Only support 64-bit sources right now.\n"); - return false; - } - - Register PtrReg = I.getOperand(3).getReg(); - assert(MRI.getType(PtrReg).isPointer() && "Expected pointer operand"); - - // Expect only one memory operand. - if (!I.hasOneMemOperand()) - return false; - - const MachineMemOperand *MemOp = *I.memoperands_begin(); - unsigned NumBytesToStore = MemOp->getSize(); - unsigned Opc = getStlxrOpcode(NumBytesToStore); - if (!Opc) - return false; - - auto StoreMI = MIRBuilder.buildInstr(Opc, {StatReg}, {SrcReg, PtrReg}); - constrainSelectedInstRegOperands(*StoreMI, TII, TRI, RBI); } I.eraseFromParent(); @@ -3860,6 +4067,30 @@ AArch64InstructionSelector::selectShiftB_64(const MachineOperand &Root) const { return {{[=](MachineInstrBuilder &MIB) { MIB.addImm(Enc); }}}; } +/// Helper to select an immediate value that can be represented as a 12-bit +/// value shifted left by either 0 or 12. If it is possible to do so, return +/// the immediate and shift value. If not, return None. +/// +/// Used by selectArithImmed and selectNegArithImmed. +InstructionSelector::ComplexRendererFns +AArch64InstructionSelector::select12BitValueWithLeftShift( + uint64_t Immed) const { + unsigned ShiftAmt; + if (Immed >> 12 == 0) { + ShiftAmt = 0; + } else if ((Immed & 0xfff) == 0 && Immed >> 24 == 0) { + ShiftAmt = 12; + Immed = Immed >> 12; + } else + return None; + + unsigned ShVal = AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftAmt); + return {{ + [=](MachineInstrBuilder &MIB) { MIB.addImm(Immed); }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(ShVal); }, + }}; +} + /// SelectArithImmed - Select an immediate value that can be represented as /// a 12-bit value shifted left by either 0 or 12. If so, return true with /// Val set to the 12-bit value and Shift set to the shifter operand. @@ -3871,24 +4102,231 @@ AArch64InstructionSelector::selectArithImmed(MachineOperand &Root) const { // here because the ComplexPattern opcode list is only used in // root-level opcode matching. auto MaybeImmed = getImmedFromMO(Root); + if (MaybeImmed == None) + return None; + return select12BitValueWithLeftShift(*MaybeImmed); +} + +/// SelectNegArithImmed - As above, but negates the value before trying to +/// select it. +InstructionSelector::ComplexRendererFns +AArch64InstructionSelector::selectNegArithImmed(MachineOperand &Root) const { + // We need a register here, because we need to know if we have a 64 or 32 + // bit immediate. 
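select12BitValueWithLeftShift and selectArithImmed above encode the usual AArch64 rule that an arithmetic immediate is a 12-bit value optionally shifted left by 12. The same test as a self-contained sketch, using plain integers instead of MachineOperand and ComplexRendererFns:

    #include <cstdint>
    #include <optional>
    #include <utility>

    // An AArch64 arithmetic immediate is a 12-bit value, optionally LSL #12.
    static std::optional<std::pair<uint64_t, unsigned>>
    encodeArithImmed(uint64_t Immed) {
      if (Immed >> 12 == 0)
        return std::make_pair(Immed, 0u);            // fits as-is, LSL #0
      if ((Immed & 0xfff) == 0 && Immed >> 24 == 0)
        return std::make_pair(Immed >> 12, 12u);     // fits after LSL #12
      return std::nullopt;                           // not representable
    }

    int main() {
      auto A = encodeArithImmed(0xabc);     // (0xabc, 0)
      auto B = encodeArithImmed(0xabc000);  // (0xabc, 12)
      auto C = encodeArithImmed(0xabc123);  // not encodable
      return (A && B && !C) ? 0 : 1;
    }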
+ if (!Root.isReg()) + return None; + auto MaybeImmed = getImmedFromMO(Root); if (MaybeImmed == None) return None; uint64_t Immed = *MaybeImmed; - unsigned ShiftAmt; - if (Immed >> 12 == 0) { - ShiftAmt = 0; - } else if ((Immed & 0xfff) == 0 && Immed >> 24 == 0) { - ShiftAmt = 12; - Immed = Immed >> 12; - } else + // This negation is almost always valid, but "cmp wN, #0" and "cmn wN, #0" + // have the opposite effect on the C flag, so this pattern mustn't match under + // those circumstances. + if (Immed == 0) return None; - unsigned ShVal = AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftAmt); - return {{ - [=](MachineInstrBuilder &MIB) { MIB.addImm(Immed); }, - [=](MachineInstrBuilder &MIB) { MIB.addImm(ShVal); }, - }}; + // Check if we're dealing with a 32-bit type on the root or a 64-bit type on + // the root. + MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo(); + if (MRI.getType(Root.getReg()).getSizeInBits() == 32) + Immed = ~((uint32_t)Immed) + 1; + else + Immed = ~Immed + 1ULL; + + if (Immed & 0xFFFFFFFFFF000000ULL) + return None; + + Immed &= 0xFFFFFFULL; + return select12BitValueWithLeftShift(Immed); +} + +/// Return true if it is worth folding MI into an extended register. That is, +/// if it's safe to pull it into the addressing mode of a load or store as a +/// shift. +bool AArch64InstructionSelector::isWorthFoldingIntoExtendedReg( + MachineInstr &MI, const MachineRegisterInfo &MRI) const { + // Always fold if there is one use, or if we're optimizing for size. + Register DefReg = MI.getOperand(0).getReg(); + if (MRI.hasOneUse(DefReg) || + MI.getParent()->getParent()->getFunction().hasMinSize()) + return true; + + // It's better to avoid folding and recomputing shifts when we don't have a + // fastpath. + if (!STI.hasLSLFast()) + return false; + + // We have a fastpath, so folding a shift in and potentially computing it + // many times may be beneficial. Check if this is only used in memory ops. + // If it is, then we should fold. + return all_of(MRI.use_instructions(DefReg), + [](MachineInstr &Use) { return Use.mayLoadOrStore(); }); +} + +/// This is used for computing addresses like this: +/// +/// ldr x1, [x2, x3, lsl #3] +/// +/// Where x2 is the base register, and x3 is an offset register. The shift-left +/// is a constant value specific to this load instruction. That is, we'll never +/// see anything other than a 3 here (which corresponds to the size of the +/// element being loaded.) +InstructionSelector::ComplexRendererFns +AArch64InstructionSelector::selectAddrModeShiftedExtendXReg( + MachineOperand &Root, unsigned SizeInBytes) const { + if (!Root.isReg()) + return None; + MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo(); + + // Make sure that the memory op is a valid size. + int64_t LegalShiftVal = Log2_32(SizeInBytes); + if (LegalShiftVal == 0) + return None; + + // We want to find something like this: + // + // val = G_CONSTANT LegalShiftVal + // shift = G_SHL off_reg val + // ptr = G_GEP base_reg shift + // x = G_LOAD ptr + // + // And fold it into this addressing mode: + // + // ldr x, [base_reg, off_reg, lsl #LegalShiftVal] + + // Check if we can find the G_GEP. + MachineInstr *Gep = getOpcodeDef(TargetOpcode::G_GEP, Root.getReg(), MRI); + if (!Gep || !isWorthFoldingIntoExtendedReg(*Gep, MRI)) + return None; + + // Now, try to match an opcode which will match our specific offset. + // We want a G_SHL or a G_MUL. 
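selectNegArithImmed above retries that encoding after negating the constant at the operand's width, so that, for example, a compare against a negative constant can be selected as CMN of the corresponding positive value. A plain-C++ sketch of that path (encodeArithImmed repeats the check from the previous sketch):

    #include <cstdint>
    #include <optional>
    #include <utility>

    static std::optional<std::pair<uint64_t, unsigned>>
    encodeArithImmed(uint64_t Immed) {
      if (Immed >> 12 == 0)
        return std::make_pair(Immed, 0u);
      if ((Immed & 0xfff) == 0 && Immed >> 24 == 0)
        return std::make_pair(Immed >> 12, 12u);
      return std::nullopt;
    }

    // Negate at the operand's width, then check the negated value is itself
    // a valid arithmetic immediate.
    static std::optional<std::pair<uint64_t, unsigned>>
    encodeNegArithImmed(uint64_t Immed, bool Is32Bit) {
      if (Immed == 0)                       // cmp/cmn #0 differ on the C flag
        return std::nullopt;
      Immed = Is32Bit ? uint64_t(~uint32_t(Immed) + 1u) : ~Immed + 1ULL;
      if (Immed & 0xFFFFFFFFFF000000ULL)    // must fit in the low 24 bits
        return std::nullopt;
      return encodeArithImmed(Immed & 0xFFFFFFULL);
    }

    int main() {
      // -5 as a 32-bit constant negates back to 5, a legal immediate.
      return encodeNegArithImmed(0xFFFFFFFBu, /*Is32Bit=*/true) ? 0 : 1;
    }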
+ MachineInstr *OffsetInst = getDefIgnoringCopies(Gep->getOperand(2).getReg(), MRI); + if (!OffsetInst) + return None; + + unsigned OffsetOpc = OffsetInst->getOpcode(); + if (OffsetOpc != TargetOpcode::G_SHL && OffsetOpc != TargetOpcode::G_MUL) + return None; + + if (!isWorthFoldingIntoExtendedReg(*OffsetInst, MRI)) + return None; + + // Now, try to find the specific G_CONSTANT. Start by assuming that the + // register we will offset is the LHS, and the register containing the + // constant is the RHS. + Register OffsetReg = OffsetInst->getOperand(1).getReg(); + Register ConstantReg = OffsetInst->getOperand(2).getReg(); + auto ValAndVReg = getConstantVRegValWithLookThrough(ConstantReg, MRI); + if (!ValAndVReg) { + // We didn't get a constant on the RHS. If the opcode is a shift, then + // we're done. + if (OffsetOpc == TargetOpcode::G_SHL) + return None; + + // If we have a G_MUL, we can use either register. Try looking at the RHS. + std::swap(OffsetReg, ConstantReg); + ValAndVReg = getConstantVRegValWithLookThrough(ConstantReg, MRI); + if (!ValAndVReg) + return None; + } + + // The value must fit into 3 bits, and must be positive. Make sure that is + // true. + int64_t ImmVal = ValAndVReg->Value; + + // Since we're going to pull this into a shift, the constant value must be + // a power of 2. If we got a multiply, then we need to check this. + if (OffsetOpc == TargetOpcode::G_MUL) { + if (!isPowerOf2_32(ImmVal)) + return None; + + // Got a power of 2. So, the amount we'll shift is the log base-2 of that. + ImmVal = Log2_32(ImmVal); + } + + if ((ImmVal & 0x7) != ImmVal) + return None; + + // We are only allowed to shift by LegalShiftVal. This shift value is built + // into the instruction, so we can't just use whatever we want. + if (ImmVal != LegalShiftVal) + return None; + + // We can use the LHS of the GEP as the base, and the LHS of the shift as an + // offset. Signify that we are shifting by setting the shift flag to 1. + return {{[=](MachineInstrBuilder &MIB) { + MIB.addUse(Gep->getOperand(1).getReg()); + }, + [=](MachineInstrBuilder &MIB) { MIB.addUse(OffsetReg); }, + [=](MachineInstrBuilder &MIB) { + // Need to add both immediates here to make sure that they are both + // added to the instruction. + MIB.addImm(0); + MIB.addImm(1); + }}}; +} + +/// This is used for computing addresses like this: +/// +/// ldr x1, [x2, x3] +/// +/// Where x2 is the base register, and x3 is an offset register. +/// +/// When possible (or profitable) to fold a G_GEP into the address calculation, +/// this will do so. Otherwise, it will return None. +InstructionSelector::ComplexRendererFns +AArch64InstructionSelector::selectAddrModeRegisterOffset( + MachineOperand &Root) const { + MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo(); + + // We need a GEP. + MachineInstr *Gep = MRI.getVRegDef(Root.getReg()); + if (!Gep || Gep->getOpcode() != TargetOpcode::G_GEP) + return None; + + // If this is used more than once, let's not bother folding. + // TODO: Check if they are memory ops. If they are, then we can still fold + // without having to recompute anything. + if (!MRI.hasOneUse(Gep->getOperand(0).getReg())) + return None; + + // Base is the GEP's LHS, offset is its RHS. + return {{[=](MachineInstrBuilder &MIB) { + MIB.addUse(Gep->getOperand(1).getReg()); + }, + [=](MachineInstrBuilder &MIB) { + MIB.addUse(Gep->getOperand(2).getReg()); + }, + [=](MachineInstrBuilder &MIB) { + // Need to add both immediates here to make sure that they are both + // added to the instruction. 
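The addressing-mode fold in selectAddrModeShiftedExtendXReg only accepts an offset scaled by exactly the access size, since the scale in "ldr x1, [x2, x3, lsl #N]" is fixed by the element width; a G_MUL by a power of two is treated as the equivalent shift. A standalone sketch of that acceptance test, with log2u standing in for llvm::Log2_32:

    #include <cstdint>

    // Integer log2 of a power of two (stand-in for llvm::Log2_32).
    static int log2u(uint32_t V) {
      int L = -1;
      while (V) { V >>= 1; ++L; }
      return L;
    }

    // The offset register may only be scaled by the access size; a multiply
    // must be by a power of two whose log2 equals that legal shift.
    static bool canFoldScaledOffset(unsigned SizeInBytes, bool IsMul,
                                    int64_t Amount) {
      int LegalShift = log2u(SizeInBytes);
      if (LegalShift <= 0)               // byte accesses take no shift here
        return false;
      if (IsMul) {
        if (Amount <= 0 || (Amount & (Amount - 1)) != 0)
          return false;                  // multiplier must be a power of two
        Amount = log2u(uint32_t(Amount));
      }
      return (Amount & 0x7) == Amount && Amount == LegalShift;
    }

    int main() {
      bool A = canFoldScaledOffset(8, /*IsMul=*/false, 3);  // shl #3, 8-byte load
      bool B = canFoldScaledOffset(8, /*IsMul=*/true, 8);   // mul by 8 == shl #3
      bool C = canFoldScaledOffset(8, /*IsMul=*/false, 2);  // wrong scale
      return (A && B && !C) ? 0 : 1;
    }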
+ MIB.addImm(0); + MIB.addImm(0); + }}}; +} + +/// This is intended to be equivalent to selectAddrModeXRO in +/// AArch64ISelDAGtoDAG. It's used for selecting X register offset loads. +InstructionSelector::ComplexRendererFns +AArch64InstructionSelector::selectAddrModeXRO(MachineOperand &Root, + unsigned SizeInBytes) const { + MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo(); + + // If we have a constant offset, then we probably don't want to match a + // register offset. + if (isBaseWithConstantOffset(Root, MRI)) + return None; + + // Try to fold shifts into the addressing mode. + auto AddrModeFns = selectAddrModeShiftedExtendXReg(Root, SizeInBytes); + if (AddrModeFns) + return AddrModeFns; + + // If that doesn't work, see if it's possible to fold in registers from + // a GEP. + return selectAddrModeRegisterOffset(Root); } /// Select a "register plus unscaled signed 9-bit immediate" address. This @@ -3994,6 +4432,205 @@ AArch64InstructionSelector::selectAddrModeIndexed(MachineOperand &Root, }}; } +/// Given a shift instruction, return the correct shift type for that +/// instruction. +static AArch64_AM::ShiftExtendType getShiftTypeForInst(MachineInstr &MI) { + // TODO: Handle AArch64_AM::ROR + switch (MI.getOpcode()) { + default: + return AArch64_AM::InvalidShiftExtend; + case TargetOpcode::G_SHL: + return AArch64_AM::LSL; + case TargetOpcode::G_LSHR: + return AArch64_AM::LSR; + case TargetOpcode::G_ASHR: + return AArch64_AM::ASR; + } +} + +/// Select a "shifted register" operand. If the value is not shifted, set the +/// shift operand to a default value of "lsl 0". +/// +/// TODO: Allow shifted register to be rotated in logical instructions. +InstructionSelector::ComplexRendererFns +AArch64InstructionSelector::selectShiftedRegister(MachineOperand &Root) const { + if (!Root.isReg()) + return None; + MachineRegisterInfo &MRI = + Root.getParent()->getParent()->getParent()->getRegInfo(); + + // Check if the operand is defined by an instruction which corresponds to + // a ShiftExtendType. E.g. a G_SHL, G_LSHR, etc. + // + // TODO: Handle AArch64_AM::ROR for logical instructions. + MachineInstr *ShiftInst = MRI.getVRegDef(Root.getReg()); + if (!ShiftInst) + return None; + AArch64_AM::ShiftExtendType ShType = getShiftTypeForInst(*ShiftInst); + if (ShType == AArch64_AM::InvalidShiftExtend) + return None; + if (!isWorthFoldingIntoExtendedReg(*ShiftInst, MRI)) + return None; + + // Need an immediate on the RHS. + MachineOperand &ShiftRHS = ShiftInst->getOperand(2); + auto Immed = getImmedFromMO(ShiftRHS); + if (!Immed) + return None; + + // We have something that we can fold. Fold in the shift's LHS and RHS into + // the instruction. + MachineOperand &ShiftLHS = ShiftInst->getOperand(1); + Register ShiftReg = ShiftLHS.getReg(); + + unsigned NumBits = MRI.getType(ShiftReg).getSizeInBits(); + unsigned Val = *Immed & (NumBits - 1); + unsigned ShiftVal = AArch64_AM::getShifterImm(ShType, Val); + + return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(ShiftReg); }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(ShiftVal); }}}; +} + +/// Get the correct ShiftExtendType for an extend instruction. +static AArch64_AM::ShiftExtendType +getExtendTypeForInst(MachineInstr &MI, MachineRegisterInfo &MRI) { + unsigned Opc = MI.getOpcode(); + + // Handle explicit extend instructions first. 
+ if (Opc == TargetOpcode::G_SEXT || Opc == TargetOpcode::G_SEXT_INREG) { + unsigned Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits(); + assert(Size != 64 && "Extend from 64 bits?"); + switch (Size) { + case 8: + return AArch64_AM::SXTB; + case 16: + return AArch64_AM::SXTH; + case 32: + return AArch64_AM::SXTW; + default: + return AArch64_AM::InvalidShiftExtend; + } + } + + if (Opc == TargetOpcode::G_ZEXT || Opc == TargetOpcode::G_ANYEXT) { + unsigned Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits(); + assert(Size != 64 && "Extend from 64 bits?"); + switch (Size) { + case 8: + return AArch64_AM::UXTB; + case 16: + return AArch64_AM::UXTH; + case 32: + return AArch64_AM::UXTW; + default: + return AArch64_AM::InvalidShiftExtend; + } + } + + // Don't have an explicit extend. Try to handle a G_AND with a constant mask + // on the RHS. + if (Opc != TargetOpcode::G_AND) + return AArch64_AM::InvalidShiftExtend; + + Optional MaybeAndMask = getImmedFromMO(MI.getOperand(2)); + if (!MaybeAndMask) + return AArch64_AM::InvalidShiftExtend; + uint64_t AndMask = *MaybeAndMask; + switch (AndMask) { + default: + return AArch64_AM::InvalidShiftExtend; + case 0xFF: + return AArch64_AM::UXTB; + case 0xFFFF: + return AArch64_AM::UXTH; + case 0xFFFFFFFF: + return AArch64_AM::UXTW; + } +} + +Register AArch64InstructionSelector::narrowExtendRegIfNeeded( + Register ExtReg, MachineIRBuilder &MIB) const { + MachineRegisterInfo &MRI = *MIB.getMRI(); + if (MRI.getType(ExtReg).getSizeInBits() == 32) + return ExtReg; + + // Insert a copy to move ExtReg to GPR32. + Register NarrowReg = MRI.createVirtualRegister(&AArch64::GPR32RegClass); + auto Copy = MIB.buildCopy({NarrowReg}, {ExtReg}); + + // Select the copy into a subregister copy. + selectCopy(*Copy, TII, MRI, TRI, RBI); + return Copy.getReg(0); +} + +/// Select an "extended register" operand. This operand folds in an extend +/// followed by an optional left shift. +InstructionSelector::ComplexRendererFns +AArch64InstructionSelector::selectArithExtendedRegister( + MachineOperand &Root) const { + if (!Root.isReg()) + return None; + MachineRegisterInfo &MRI = + Root.getParent()->getParent()->getParent()->getRegInfo(); + + uint64_t ShiftVal = 0; + Register ExtReg; + AArch64_AM::ShiftExtendType Ext; + MachineInstr *RootDef = getDefIgnoringCopies(Root.getReg(), MRI); + if (!RootDef) + return None; + + if (!isWorthFoldingIntoExtendedReg(*RootDef, MRI)) + return None; + + // Check if we can fold a shift and an extend. + if (RootDef->getOpcode() == TargetOpcode::G_SHL) { + // Look for a constant on the RHS of the shift. + MachineOperand &RHS = RootDef->getOperand(2); + Optional MaybeShiftVal = getImmedFromMO(RHS); + if (!MaybeShiftVal) + return None; + ShiftVal = *MaybeShiftVal; + if (ShiftVal > 4) + return None; + // Look for a valid extend instruction on the LHS of the shift. + MachineOperand &LHS = RootDef->getOperand(1); + MachineInstr *ExtDef = getDefIgnoringCopies(LHS.getReg(), MRI); + if (!ExtDef) + return None; + Ext = getExtendTypeForInst(*ExtDef, MRI); + if (Ext == AArch64_AM::InvalidShiftExtend) + return None; + ExtReg = ExtDef->getOperand(1).getReg(); + } else { + // Didn't get a shift. Try just folding an extend. + Ext = getExtendTypeForInst(*RootDef, MRI); + if (Ext == AArch64_AM::InvalidShiftExtend) + return None; + ExtReg = RootDef->getOperand(1).getReg(); + + // If we have a 32 bit instruction which zeroes out the high half of a + // register, we get an implicit zero extend for free. Check if we have one. 
+ // FIXME: We actually emit the extend right now even though we don't have + // to. + if (Ext == AArch64_AM::UXTW && MRI.getType(ExtReg).getSizeInBits() == 32) { + MachineInstr *ExtInst = MRI.getVRegDef(ExtReg); + if (ExtInst && isDef32(*ExtInst)) + return None; + } + } + + // We require a GPR32 here. Narrow the ExtReg if needed using a subregister + // copy. + MachineIRBuilder MIB(*RootDef); + ExtReg = narrowExtendRegIfNeeded(ExtReg, MIB); + + return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(ExtReg); }, + [=](MachineInstrBuilder &MIB) { + MIB.addImm(getArithExtendImm(Ext, ShiftVal)); + }}}; +} + void AArch64InstructionSelector::renderTruncImm(MachineInstrBuilder &MIB, const MachineInstr &MI) const { const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); @@ -4003,6 +4640,51 @@ void AArch64InstructionSelector::renderTruncImm(MachineInstrBuilder &MIB, MIB.addImm(CstVal.getValue()); } +void AArch64InstructionSelector::renderLogicalImm32( + MachineInstrBuilder &MIB, const MachineInstr &I) const { + assert(I.getOpcode() == TargetOpcode::G_CONSTANT && "Expected G_CONSTANT"); + uint64_t CstVal = I.getOperand(1).getCImm()->getZExtValue(); + uint64_t Enc = AArch64_AM::encodeLogicalImmediate(CstVal, 32); + MIB.addImm(Enc); +} + +void AArch64InstructionSelector::renderLogicalImm64( + MachineInstrBuilder &MIB, const MachineInstr &I) const { + assert(I.getOpcode() == TargetOpcode::G_CONSTANT && "Expected G_CONSTANT"); + uint64_t CstVal = I.getOperand(1).getCImm()->getZExtValue(); + uint64_t Enc = AArch64_AM::encodeLogicalImmediate(CstVal, 64); + MIB.addImm(Enc); +} + +bool AArch64InstructionSelector::isLoadStoreOfNumBytes( + const MachineInstr &MI, unsigned NumBytes) const { + if (!MI.mayLoadOrStore()) + return false; + assert(MI.hasOneMemOperand() && + "Expected load/store to have only one mem op!"); + return (*MI.memoperands_begin())->getSize() == NumBytes; +} + +bool AArch64InstructionSelector::isDef32(const MachineInstr &MI) const { + const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); + if (MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() != 32) + return false; + + // Only return true if we know the operation will zero-out the high half of + // the 64-bit register. Truncates can be subregister copies, which don't + // zero out the high bits. Copies and other copy-like instructions can be + // fed by truncates, or could be lowered as subregister copies. 
+ switch (MI.getOpcode()) { + default: + return true; + case TargetOpcode::COPY: + case TargetOpcode::G_BITCAST: + case TargetOpcode::G_TRUNC: + case TargetOpcode::G_PHI: + return false; + } +} + namespace llvm { InstructionSelector * createAArch64InstructionSelector(const AArch64TargetMachine &TM, diff --git a/lib/Target/AArch64/AArch64LegalizerInfo.cpp b/lib/Target/AArch64/AArch64LegalizerInfo.cpp index a985b330eafa..7a1901bd5b1e 100644 --- a/lib/Target/AArch64/AArch64LegalizerInfo.cpp +++ b/lib/Target/AArch64/AArch64LegalizerInfo.cpp @@ -13,7 +13,9 @@ #include "AArch64LegalizerInfo.h" #include "AArch64Subtarget.h" +#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h" #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" +#include "llvm/CodeGen/GlobalISel/Utils.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/TargetOpcodes.h" @@ -50,6 +52,12 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) { const LLT v2s64 = LLT::vector(2, 64); const LLT v2p0 = LLT::vector(2, p0); + // FIXME: support subtargets which have neon/fp-armv8 disabled. + if (!ST.hasNEON() || !ST.hasFPARMv8()) { + computeTables(); + return; + } + getActionDefinitionsBuilder(G_IMPLICIT_DEF) .legalFor({p0, s1, s8, s16, s32, s64, v4s32, v2s64}) .clampScalar(0, s1, s64) @@ -74,7 +82,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) { getActionDefinitionsBuilder(G_BSWAP) .legalFor({s32, s64, v4s32, v2s32, v2s64}) - .clampScalar(0, s16, s64) + .clampScalar(0, s32, s64) .widenScalarToNextPow2(0); getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL, G_AND, G_OR, G_XOR}) @@ -104,6 +112,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) { getActionDefinitionsBuilder({G_SDIV, G_UDIV}) .legalFor({s32, s64}) + .libcallFor({s128}) .clampScalar(0, s32, s64) .widenScalarToNextPow2(0) .scalarize(0); @@ -115,8 +124,12 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) { return !SrcTy.isVector() && SrcTy.getSizeInBits() == 32 && AmtTy.getSizeInBits() == 32; }) - .legalFor( - {{s32, s32}, {s32, s64}, {s64, s64}, {v2s32, v2s32}, {v4s32, v4s32}}) + .legalFor({{s32, s32}, + {s32, s64}, + {s64, s64}, + {v2s32, v2s32}, + {v4s32, v4s32}, + {v2s64, v2s64}}) .clampScalar(1, s32, s64) .clampScalar(0, s32, s64) .minScalarSameAs(1, 0); @@ -191,14 +204,14 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) { .legalIf([=](const LegalityQuery &Query) { const LLT &Ty0 = Query.Types[0]; const LLT &Ty1 = Query.Types[1]; - if (Ty1 != s32 && Ty1 != s64) + if (Ty1 != s32 && Ty1 != s64 && Ty1 != s128) return false; if (Ty1 == p0) return true; return isPowerOf2_32(Ty0.getSizeInBits()) && (Ty0.getSizeInBits() == 1 || Ty0.getSizeInBits() >= 8); }) - .clampScalar(1, s32, s64) + .clampScalar(1, s32, s128) .widenScalarToNextPow2(1) .maxScalarIf(typeInSet(1, {s32}), 0, s16) .maxScalarIf(typeInSet(1, {s64}), 0, s32) @@ -236,6 +249,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) { {s32, p0, 32, 8}, {s64, p0, 64, 8}, {p0, p0, 64, 8}, + {s128, p0, 128, 8}, {v8s8, p0, 64, 8}, {v16s8, p0, 128, 8}, {v4s16, p0, 64, 8}, @@ -247,14 +261,12 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) { .legalForTypesWithMemDesc({{s32, p0, 8, 8}, {s32, p0, 16, 8}}) .clampScalar(0, s8, s64) - .widenScalarToNextPow2(0) - // TODO: We could support sum-of-pow2's but the lowering code doesn't know - // how to do that yet. 
- .unsupportedIfMemSizeNotPow2() + .lowerIfMemSizeNotPow2() // Lower any any-extending loads left into G_ANYEXT and G_LOAD .lowerIf([=](const LegalityQuery &Query) { return Query.Types[0].getSizeInBits() != Query.MMODescrs[0].SizeInBits; }) + .widenScalarToNextPow2(0) .clampMaxNumElements(0, s32, 2) .clampMaxNumElements(0, s64, 1) .customIf(IsPtrVecPred); @@ -262,9 +274,12 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) { getActionDefinitionsBuilder(G_STORE) .legalForTypesWithMemDesc({{s8, p0, 8, 8}, {s16, p0, 16, 8}, + {s32, p0, 8, 8}, + {s32, p0, 16, 8}, {s32, p0, 32, 8}, {s64, p0, 64, 8}, {p0, p0, 64, 8}, + {s128, p0, 128, 8}, {v16s8, p0, 128, 8}, {v4s16, p0, 64, 8}, {v8s16, p0, 128, 8}, @@ -272,10 +287,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) { {v4s32, p0, 128, 8}, {v2s64, p0, 128, 8}}) .clampScalar(0, s8, s64) - .widenScalarToNextPow2(0) - // TODO: We could support sum-of-pow2's but the lowering code doesn't know - // how to do that yet. - .unsupportedIfMemSizeNotPow2() + .lowerIfMemSizeNotPow2() .lowerIf([=](const LegalityQuery &Query) { return Query.Types[0].isScalar() && Query.Types[0].getSizeInBits() != Query.MMODescrs[0].SizeInBits; @@ -305,8 +317,8 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) { {v8s16, v8s16}, {v8s8, v8s8}, {v16s8, v16s8}}) - .clampScalar(0, s32, s32) .clampScalar(1, s32, s64) + .clampScalar(0, s32, s32) .minScalarEltSameAsIf( [=](const LegalityQuery &Query) { const LLT &Ty = Query.Types[0]; @@ -330,33 +342,40 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) { .widenScalarToNextPow2(1); // Extensions - getActionDefinitionsBuilder({G_ZEXT, G_SEXT, G_ANYEXT}) - .legalIf([=](const LegalityQuery &Query) { - unsigned DstSize = Query.Types[0].getSizeInBits(); - - // Make sure that we have something that will fit in a register, and - // make sure it's a power of 2. - if (DstSize < 8 || DstSize > 128 || !isPowerOf2_32(DstSize)) - return false; + auto ExtLegalFunc = [=](const LegalityQuery &Query) { + unsigned DstSize = Query.Types[0].getSizeInBits(); + + if (DstSize == 128 && !Query.Types[0].isVector()) + return false; // Extending to a scalar s128 needs narrowing. + + // Make sure that we have something that will fit in a register, and + // make sure it's a power of 2. + if (DstSize < 8 || DstSize > 128 || !isPowerOf2_32(DstSize)) + return false; - const LLT &SrcTy = Query.Types[1]; + const LLT &SrcTy = Query.Types[1]; - // Special case for s1. - if (SrcTy == s1) - return true; + // Special case for s1. + if (SrcTy == s1) + return true; - // Make sure we fit in a register otherwise. Don't bother checking that - // the source type is below 128 bits. We shouldn't be allowing anything - // through which is wider than the destination in the first place. - unsigned SrcSize = SrcTy.getSizeInBits(); - if (SrcSize < 8 || !isPowerOf2_32(SrcSize)) - return false; + // Make sure we fit in a register otherwise. Don't bother checking that + // the source type is below 128 bits. We shouldn't be allowing anything + // through which is wider than the destination in the first place. + unsigned SrcSize = SrcTy.getSizeInBits(); + if (SrcSize < 8 || !isPowerOf2_32(SrcSize)) + return false; - return true; - }); + return true; + }; + getActionDefinitionsBuilder({G_ZEXT, G_SEXT, G_ANYEXT}) + .legalIf(ExtLegalFunc) + .clampScalar(0, s64, s64); // Just for s128, others are handled above. 
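The extension rule was rewritten above as a named predicate so that a scalar s128 destination can be rejected and then narrowed by the trailing clampScalar(0, s64, s64). Restated as a self-contained predicate over plain bit widths:

    static bool isPow2(unsigned V) { return V && (V & (V - 1)) == 0; }

    // Which (destination, source) sizes G_ZEXT/G_SEXT/G_ANYEXT treat as
    // directly legal; scalar s128 destinations are left to be narrowed.
    static bool isExtLegal(unsigned DstBits, bool DstIsVector, unsigned SrcBits) {
      if (DstBits == 128 && !DstIsVector)
        return false;                              // scalar s128 needs narrowing
      if (DstBits < 8 || DstBits > 128 || !isPow2(DstBits))
        return false;                              // must fit a register, pow-2
      if (SrcBits == 1)
        return true;                               // s1 sources are special-cased
      return SrcBits >= 8 && isPow2(SrcBits);
    }

    int main() {
      bool A = isExtLegal(64, false, 32);   // s32 -> s64: legal
      bool B = isExtLegal(128, false, 64);  // s64 -> s128: not directly legal
      bool C = isExtLegal(32, false, 1);    // s1 -> s32: legal
      return (A && !B && C) ? 0 : 1;
    }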
getActionDefinitionsBuilder(G_TRUNC).alwaysLegal(); + getActionDefinitionsBuilder(G_SEXT_INREG).lower(); + // FP conversions getActionDefinitionsBuilder(G_FPTRUNC).legalFor( {{s16, s32}, {s16, s64}, {s32, s64}, {v4s16, v4s32}, {v2s32, v2s64}}); @@ -591,6 +610,8 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) { return Query.Types[0] == p0 && Query.Types[1] == s64; }); + getActionDefinitionsBuilder(G_DYN_STACKALLOC).lower(); + computeTables(); verify(*ST.getInstrInfo()); } @@ -617,6 +638,24 @@ bool AArch64LegalizerInfo::legalizeCustom(MachineInstr &MI, llvm_unreachable("expected switch to return"); } +bool AArch64LegalizerInfo::legalizeIntrinsic( + MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &MIRBuilder) const { + switch (MI.getIntrinsicID()) { + case Intrinsic::memcpy: + case Intrinsic::memset: + case Intrinsic::memmove: + if (createMemLibcall(MIRBuilder, MRI, MI) == + LegalizerHelper::UnableToLegalize) + return false; + MI.eraseFromParent(); + return true; + default: + break; + } + return true; +} + bool AArch64LegalizerInfo::legalizeShlAshrLshr( MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &MIRBuilder, GISelChangeObserver &Observer) const { @@ -655,7 +694,7 @@ bool AArch64LegalizerInfo::legalizeLoadStore( // legalized. In order to allow further legalization of the inst, we create // a new instruction and erase the existing one. - unsigned ValReg = MI.getOperand(0).getReg(); + Register ValReg = MI.getOperand(0).getReg(); const LLT ValTy = MRI.getType(ValReg); if (!ValTy.isVector() || !ValTy.getElementType().isPointer() || @@ -672,7 +711,7 @@ bool AArch64LegalizerInfo::legalizeLoadStore( auto Bitcast = MIRBuilder.buildBitcast({NewTy}, {ValReg}); MIRBuilder.buildStore(Bitcast.getReg(0), MI.getOperand(1).getReg(), MMO); } else { - unsigned NewReg = MRI.createGenericVirtualRegister(NewTy); + Register NewReg = MRI.createGenericVirtualRegister(NewTy); auto NewLoad = MIRBuilder.buildLoad(NewReg, MI.getOperand(1).getReg(), MMO); MIRBuilder.buildBitcast({ValReg}, {NewLoad}); } diff --git a/lib/Target/AArch64/AArch64LegalizerInfo.h b/lib/Target/AArch64/AArch64LegalizerInfo.h index f3362a18620f..15161bab466c 100644 --- a/lib/Target/AArch64/AArch64LegalizerInfo.h +++ b/lib/Target/AArch64/AArch64LegalizerInfo.h @@ -31,6 +31,9 @@ public: MachineIRBuilder &MIRBuilder, GISelChangeObserver &Observer) const override; + bool legalizeIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &MIRBuilder) const override; + private: bool legalizeVaArg(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &MIRBuilder) const; diff --git a/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp b/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp index 65b5f906e3f6..a0c4a25bb5b9 100644 --- a/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp +++ b/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp @@ -201,8 +201,22 @@ static bool isNarrowStore(unsigned Opc) { } } +// These instruction set memory tag and either keep memory contents unchanged or +// set it to zero, ignoring the address part of the source register. +static bool isTagStore(const MachineInstr &MI) { + switch (MI.getOpcode()) { + default: + return false; + case AArch64::STGOffset: + case AArch64::STZGOffset: + case AArch64::ST2GOffset: + case AArch64::STZ2GOffset: + return true; + } +} + // Scaling factor for unscaled load or store. 
-static int getMemScale(MachineInstr &MI) { +static int getMemScale(const MachineInstr &MI) { switch (MI.getOpcode()) { default: llvm_unreachable("Opcode has unknown scale!"); @@ -255,6 +269,11 @@ static int getMemScale(MachineInstr &MI) { case AArch64::STURQi: case AArch64::LDPQi: case AArch64::STPQi: + case AArch64::STGOffset: + case AArch64::STZGOffset: + case AArch64::ST2GOffset: + case AArch64::STZ2GOffset: + case AArch64::STGPi: return 16; } } @@ -449,6 +468,16 @@ static unsigned getPreIndexedOpcode(unsigned Opc) { return AArch64::STPWpre; case AArch64::STPXi: return AArch64::STPXpre; + case AArch64::STGOffset: + return AArch64::STGPreIndex; + case AArch64::STZGOffset: + return AArch64::STZGPreIndex; + case AArch64::ST2GOffset: + return AArch64::ST2GPreIndex; + case AArch64::STZ2GOffset: + return AArch64::STZ2GPreIndex; + case AArch64::STGPi: + return AArch64::STGPpre; } } @@ -518,6 +547,16 @@ static unsigned getPostIndexedOpcode(unsigned Opc) { return AArch64::STPWpost; case AArch64::STPXi: return AArch64::STPXpost; + case AArch64::STGOffset: + return AArch64::STGPostIndex; + case AArch64::STZGOffset: + return AArch64::STZGPostIndex; + case AArch64::ST2GOffset: + return AArch64::ST2GPostIndex; + case AArch64::STZ2GOffset: + return AArch64::STZ2GPostIndex; + case AArch64::STGPi: + return AArch64::STGPpost; } } @@ -536,10 +575,30 @@ static bool isPairedLdSt(const MachineInstr &MI) { case AArch64::STPQi: case AArch64::STPWi: case AArch64::STPXi: + case AArch64::STGPi: return true; } } +// Returns the scale and offset range of pre/post indexed variants of MI. +static void getPrePostIndexedMemOpInfo(const MachineInstr &MI, int &Scale, + int &MinOffset, int &MaxOffset) { + bool IsPaired = isPairedLdSt(MI); + bool IsTagStore = isTagStore(MI); + // ST*G and all paired ldst have the same scale in pre/post-indexed variants + // as in the "unsigned offset" variant. + // All other pre/post indexed ldst instructions are unscaled. + Scale = (IsTagStore || IsPaired) ? getMemScale(MI) : 1; + + if (IsPaired) { + MinOffset = -64; + MaxOffset = 63; + } else { + MinOffset = -256; + MaxOffset = 255; + } +} + static const MachineOperand &getLdStRegOp(const MachineInstr &MI, unsigned PairedRegOp = 0) { assert(PairedRegOp < 2 && "Unexpected register operand idx."); @@ -618,6 +677,11 @@ static bool isMergeableLdStUpdate(MachineInstr &MI) { case AArch64::LDRWui: case AArch64::LDRHHui: case AArch64::LDRBBui: + case AArch64::STGOffset: + case AArch64::STZGOffset: + case AArch64::ST2GOffset: + case AArch64::STZ2GOffset: + case AArch64::STGPi: // Unscaled instructions. case AArch64::STURSi: case AArch64::STURDi: @@ -808,7 +872,7 @@ AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I, // STRWui %w1, ... // USE kill %w1 ; need to clear kill flag when moving STRWui downwards // STRW %w0 - unsigned Reg = getLdStRegOp(*I).getReg(); + Register Reg = getLdStRegOp(*I).getReg(); for (MachineInstr &MI : make_range(std::next(I), Paired)) MI.clearRegisterKills(Reg, TRI); } @@ -837,9 +901,9 @@ AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I, MachineOperand &DstMO = MIB->getOperand(SExtIdx); // Right now, DstMO has the extended register, since it comes from an // extended opcode. - unsigned DstRegX = DstMO.getReg(); + Register DstRegX = DstMO.getReg(); // Get the W variant of that register. - unsigned DstRegW = TRI->getSubReg(DstRegX, AArch64::sub_32); + Register DstRegW = TRI->getSubReg(DstRegX, AArch64::sub_32); // Update the result of LDP to use the W instead of the X variant. 
DstMO.setReg(DstRegW); LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs())); @@ -882,9 +946,9 @@ AArch64LoadStoreOpt::promoteLoadFromStore(MachineBasicBlock::iterator LoadI, int LoadSize = getMemScale(*LoadI); int StoreSize = getMemScale(*StoreI); - unsigned LdRt = getLdStRegOp(*LoadI).getReg(); + Register LdRt = getLdStRegOp(*LoadI).getReg(); const MachineOperand &StMO = getLdStRegOp(*StoreI); - unsigned StRt = getLdStRegOp(*StoreI).getReg(); + Register StRt = getLdStRegOp(*StoreI).getReg(); bool IsStoreXReg = TRI->getRegClass(AArch64::GPR64RegClassID)->contains(StRt); assert((IsStoreXReg || @@ -933,10 +997,10 @@ AArch64LoadStoreOpt::promoteLoadFromStore(MachineBasicBlock::iterator LoadI, ? getLdStOffsetOp(*StoreI).getImm() : getLdStOffsetOp(*StoreI).getImm() * StoreSize; int Width = LoadSize * 8; - unsigned DestReg = IsStoreXReg - ? TRI->getMatchingSuperReg(LdRt, AArch64::sub_32, - &AArch64::GPR64RegClass) - : LdRt; + unsigned DestReg = + IsStoreXReg ? Register(TRI->getMatchingSuperReg( + LdRt, AArch64::sub_32, &AArch64::GPR64RegClass)) + : LdRt; assert((UnscaledLdOffset >= UnscaledStOffset && (UnscaledLdOffset + LoadSize) <= UnscaledStOffset + StoreSize) && @@ -1042,7 +1106,7 @@ bool AArch64LoadStoreOpt::findMatchingStore( MachineBasicBlock::iterator B = I->getParent()->begin(); MachineBasicBlock::iterator MBBI = I; MachineInstr &LoadMI = *I; - unsigned BaseReg = getLdStBaseOp(LoadMI).getReg(); + Register BaseReg = getLdStBaseOp(LoadMI).getReg(); // If the load is the first instruction in the block, there's obviously // not any matching store. @@ -1156,8 +1220,8 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I, bool MayLoad = FirstMI.mayLoad(); bool IsUnscaled = TII->isUnscaledLdSt(FirstMI); - unsigned Reg = getLdStRegOp(FirstMI).getReg(); - unsigned BaseReg = getLdStBaseOp(FirstMI).getReg(); + Register Reg = getLdStRegOp(FirstMI).getReg(); + Register BaseReg = getLdStBaseOp(FirstMI).getReg(); int Offset = getLdStOffsetOp(FirstMI).getImm(); int OffsetStride = IsUnscaled ? getMemScale(FirstMI) : 1; bool IsPromotableZeroStore = isPromotableZeroStoreInst(FirstMI); @@ -1188,7 +1252,7 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I, // check for +1/-1. Make sure to check the new instruction offset is // actually an immediate and not a symbolic reference destined for // a relocation. - unsigned MIBaseReg = getLdStBaseOp(MI).getReg(); + Register MIBaseReg = getLdStBaseOp(MI).getReg(); int MIOffset = getLdStOffsetOp(MI).getImm(); bool MIIsUnscaled = TII->isUnscaledLdSt(MI); if (IsUnscaled != MIIsUnscaled) { @@ -1328,18 +1392,19 @@ AArch64LoadStoreOpt::mergeUpdateInsn(MachineBasicBlock::iterator I, unsigned NewOpc = IsPreIdx ? getPreIndexedOpcode(I->getOpcode()) : getPostIndexedOpcode(I->getOpcode()); MachineInstrBuilder MIB; + int Scale, MinOffset, MaxOffset; + getPrePostIndexedMemOpInfo(*I, Scale, MinOffset, MaxOffset); if (!isPairedLdSt(*I)) { // Non-paired instruction. MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc)) .add(getLdStRegOp(*Update)) .add(getLdStRegOp(*I)) .add(getLdStBaseOp(*I)) - .addImm(Value) + .addImm(Value / Scale) .setMemRefs(I->memoperands()) .setMIFlags(I->mergeFlagsWith(*Update)); } else { // Paired instruction. 
- int Scale = getMemScale(*I); MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc)) .add(getLdStRegOp(*Update)) .add(getLdStRegOp(*I, 0)) @@ -1395,28 +1460,21 @@ bool AArch64LoadStoreOpt::isMatchingUpdateInsn(MachineInstr &MemMI, MI.getOperand(1).getReg() != BaseReg) break; - bool IsPairedInsn = isPairedLdSt(MemMI); int UpdateOffset = MI.getOperand(2).getImm(); if (MI.getOpcode() == AArch64::SUBXri) UpdateOffset = -UpdateOffset; - // For non-paired load/store instructions, the immediate must fit in a - // signed 9-bit integer. - if (!IsPairedInsn && (UpdateOffset > 255 || UpdateOffset < -256)) + // The immediate must be a multiple of the scaling factor of the pre/post + // indexed instruction. + int Scale, MinOffset, MaxOffset; + getPrePostIndexedMemOpInfo(MemMI, Scale, MinOffset, MaxOffset); + if (UpdateOffset % Scale != 0) break; - // For paired load/store instructions, the immediate must be a multiple of - // the scaling factor. The scaled offset must also fit into a signed 7-bit - // integer. - if (IsPairedInsn) { - int Scale = getMemScale(MemMI); - if (UpdateOffset % Scale != 0) - break; - - int ScaledOffset = UpdateOffset / Scale; - if (ScaledOffset > 63 || ScaledOffset < -64) - break; - } + // Scaled offset must fit in the instruction immediate. + int ScaledOffset = UpdateOffset / Scale; + if (ScaledOffset > MaxOffset || ScaledOffset < MinOffset) + break; // If we have a non-zero Offset, we check that it matches the amount // we're adding to the register. @@ -1433,7 +1491,7 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward( MachineInstr &MemMI = *I; MachineBasicBlock::iterator MBBI = I; - unsigned BaseReg = getLdStBaseOp(MemMI).getReg(); + Register BaseReg = getLdStBaseOp(MemMI).getReg(); int MIUnscaledOffset = getLdStOffsetOp(MemMI).getImm() * getMemScale(MemMI); // Scan forward looking for post-index opportunities. Updating instructions @@ -1442,13 +1500,19 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward( if (MIUnscaledOffset != UnscaledOffset) return E; - // If the base register overlaps a destination register, we can't - // merge the update. - bool IsPairedInsn = isPairedLdSt(MemMI); - for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i) { - unsigned DestReg = getLdStRegOp(MemMI, i).getReg(); - if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg)) - return E; + // If the base register overlaps a source/destination register, we can't + // merge the update. This does not apply to tag store instructions which + // ignore the address part of the source register. + // This does not apply to STGPi as well, which does not have unpredictable + // behavior in this case unlike normal stores, and always performs writeback + // after reading the source register value. + if (!isTagStore(MemMI) && MemMI.getOpcode() != AArch64::STGPi) { + bool IsPairedInsn = isPairedLdSt(MemMI); + for (unsigned i = 0, e = IsPairedInsn ? 
2 : 1; i != e; ++i) { + Register DestReg = getLdStRegOp(MemMI, i).getReg(); + if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg)) + return E; + } } // Track which register units have been modified and used between the first @@ -1487,7 +1551,7 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward( MachineInstr &MemMI = *I; MachineBasicBlock::iterator MBBI = I; - unsigned BaseReg = getLdStBaseOp(MemMI).getReg(); + Register BaseReg = getLdStBaseOp(MemMI).getReg(); int Offset = getLdStOffsetOp(MemMI).getImm(); // If the load/store is the first instruction in the block, there's obviously @@ -1496,11 +1560,13 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward( return E; // If the base register overlaps a destination register, we can't // merge the update. - bool IsPairedInsn = isPairedLdSt(MemMI); - for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i) { - unsigned DestReg = getLdStRegOp(MemMI, i).getReg(); - if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg)) - return E; + if (!isTagStore(MemMI)) { + bool IsPairedInsn = isPairedLdSt(MemMI); + for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i) { + Register DestReg = getLdStRegOp(MemMI, i).getReg(); + if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg)) + return E; + } } // Track which register units have been modified and used between the first @@ -1659,7 +1725,7 @@ bool AArch64LoadStoreOpt::tryToMergeLdStUpdate // however, is not, so adjust here. int UnscaledOffset = getLdStOffsetOp(MI).getImm() * getMemScale(MI); - // Look forward to try to find a post-index instruction. For example, + // Look forward to try to find a pre-index instruction. For example, // ldr x1, [x0, #64] // add x0, x0, #64 // merged into: diff --git a/lib/Target/AArch64/AArch64MCInstLower.cpp b/lib/Target/AArch64/AArch64MCInstLower.cpp index e7d4a2789a28..afd5ae6bcbf2 100644 --- a/lib/Target/AArch64/AArch64MCInstLower.cpp +++ b/lib/Target/AArch64/AArch64MCInstLower.cpp @@ -148,6 +148,8 @@ MCOperand AArch64MCInstLower::lowerSymbolOperandELF(const MachineOperand &MO, RefFlags |= AArch64MCExpr::VK_TLSDESC; break; } + } else if (MO.getTargetFlags() & AArch64II::MO_PREL) { + RefFlags |= AArch64MCExpr::VK_PREL; } else { // No modifier means this is a generic reference, classified as absolute for // the cases where it matters (:abs_g0: etc). diff --git a/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/lib/Target/AArch64/AArch64MachineFunctionInfo.h index 0efeeb272ec1..0009fb7b5520 100644 --- a/lib/Target/AArch64/AArch64MachineFunctionInfo.h +++ b/lib/Target/AArch64/AArch64MachineFunctionInfo.h @@ -19,6 +19,7 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/CodeGen/CallingConvLower.h" #include "llvm/CodeGen/MachineFunction.h" +#include "llvm/IR/Function.h" #include "llvm/MC/MCLinkerOptimizationHint.h" #include @@ -95,6 +96,13 @@ class AArch64FunctionInfo final : public MachineFunctionInfo { /// returned struct in a register. This field holds the virtual register into /// which the sret argument is passed. unsigned SRetReturnReg = 0; + /// SVE stack size (for predicates and data vectors) are maintained here + /// rather than in FrameInfo, as the placement and Stack IDs are target + /// specific. + uint64_t StackSizeSVE = 0; + + /// HasCalculatedStackSizeSVE indicates whether StackSizeSVE is valid. + bool HasCalculatedStackSizeSVE = false; /// Has a value when it is known whether or not the function uses a /// redzone, and no value otherwise. 
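getPrePostIndexedMemOpInfo above centralizes the writeback-immediate rule: paired and tag-store instructions keep a scaled immediate, everything else uses an unscaled signed 9-bit offset, and isMatchingUpdateInsn then rejects update amounts that do not divide by the scale or that fall outside the range. A compact sketch of the combined check:

    #include <cstdint>

    // MemScale is the access size in bytes (e.g. 8 for an X-register load).
    static bool canFoldWritebackOffset(int64_t UpdateOffset, int MemScale,
                                       bool IsPaired, bool IsTagStore) {
      int Scale = (IsPaired || IsTagStore) ? MemScale : 1;
      int MinOffset = IsPaired ? -64 : -256;
      int MaxOffset = IsPaired ? 63 : 255;
      if (UpdateOffset % Scale != 0)
        return false;                      // must be a multiple of the scale
      int64_t Scaled = UpdateOffset / Scale;
      return Scaled >= MinOffset && Scaled <= MaxOffset;
    }

    int main() {
      bool A = canFoldWritebackOffset(16, 8, /*Paired=*/true, false);   // ldp: ok
      bool B = canFoldWritebackOffset(600, 8, /*Paired=*/true, false);  // 75 > 63
      bool C = canFoldWritebackOffset(255, 8, /*Paired=*/false, false); // unscaled
      return (A && !B && C) ? 0 : 1;
    }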
@@ -131,6 +139,15 @@ public: ArgumentStackToRestore = bytes; } + bool hasCalculatedStackSizeSVE() const { return HasCalculatedStackSizeSVE; } + + void setStackSizeSVE(uint64_t S) { + HasCalculatedStackSizeSVE = true; + StackSizeSVE = S; + } + + uint64_t getStackSizeSVE() const { return StackSizeSVE; } + bool hasStackFrame() const { return HasStackFrame; } void setHasStackFrame(bool s) { HasStackFrame = s; } diff --git a/lib/Target/AArch64/AArch64PBQPRegAlloc.cpp b/lib/Target/AArch64/AArch64PBQPRegAlloc.cpp index aff861aae6be..d503c39b1f90 100644 --- a/lib/Target/AArch64/AArch64PBQPRegAlloc.cpp +++ b/lib/Target/AArch64/AArch64PBQPRegAlloc.cpp @@ -162,11 +162,11 @@ bool A57ChainingConstraint::addIntraChainConstraint(PBQPRAGraph &G, unsigned Rd, LiveIntervals &LIs = G.getMetadata().LIS; - if (TRI->isPhysicalRegister(Rd) || TRI->isPhysicalRegister(Ra)) { - LLVM_DEBUG(dbgs() << "Rd is a physical reg:" << TRI->isPhysicalRegister(Rd) - << '\n'); - LLVM_DEBUG(dbgs() << "Ra is a physical reg:" << TRI->isPhysicalRegister(Ra) - << '\n'); + if (Register::isPhysicalRegister(Rd) || Register::isPhysicalRegister(Ra)) { + LLVM_DEBUG(dbgs() << "Rd is a physical reg:" + << Register::isPhysicalRegister(Rd) << '\n'); + LLVM_DEBUG(dbgs() << "Ra is a physical reg:" + << Register::isPhysicalRegister(Ra) << '\n'); return false; } @@ -359,8 +359,8 @@ void A57ChainingConstraint::apply(PBQPRAGraph &G) { case AArch64::FMADDDrrr: case AArch64::FNMSUBDrrr: case AArch64::FNMADDDrrr: { - unsigned Rd = MI.getOperand(0).getReg(); - unsigned Ra = MI.getOperand(3).getReg(); + Register Rd = MI.getOperand(0).getReg(); + Register Ra = MI.getOperand(3).getReg(); if (addIntraChainConstraint(G, Rd, Ra)) addInterChainConstraint(G, Rd, Ra); @@ -369,7 +369,7 @@ void A57ChainingConstraint::apply(PBQPRAGraph &G) { case AArch64::FMLAv2f32: case AArch64::FMLSv2f32: { - unsigned Rd = MI.getOperand(0).getReg(); + Register Rd = MI.getOperand(0).getReg(); addInterChainConstraint(G, Rd, Rd); break; } diff --git a/lib/Target/AArch64/AArch64PreLegalizerCombiner.cpp b/lib/Target/AArch64/AArch64PreLegalizerCombiner.cpp index 5f7245bfbd74..d30ea120bae4 100644 --- a/lib/Target/AArch64/AArch64PreLegalizerCombiner.cpp +++ b/lib/Target/AArch64/AArch64PreLegalizerCombiner.cpp @@ -15,7 +15,9 @@ #include "llvm/CodeGen/GlobalISel/Combiner.h" #include "llvm/CodeGen/GlobalISel/CombinerHelper.h" #include "llvm/CodeGen/GlobalISel/CombinerInfo.h" +#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h" #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" +#include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/Support/Debug.h" @@ -25,12 +27,31 @@ using namespace llvm; using namespace MIPatternMatch; +#define AARCH64PRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS +#include "AArch64GenGICombiner.inc" +#undef AARCH64PRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS + namespace { +#define AARCH64PRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H +#include "AArch64GenGICombiner.inc" +#undef AARCH64PRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H + class AArch64PreLegalizerCombinerInfo : public CombinerInfo { + GISelKnownBits *KB; + MachineDominatorTree *MDT; + public: - AArch64PreLegalizerCombinerInfo() + AArch64GenPreLegalizerCombinerHelper Generated; + + AArch64PreLegalizerCombinerInfo(bool EnableOpt, bool OptSize, bool MinSize, + GISelKnownBits *KB, MachineDominatorTree *MDT) : CombinerInfo(/*AllowIllegalOps*/ true, /*ShouldLegalizeIllegal*/ false, - /*LegalizerInfo*/ nullptr) {} + 
/*LegalizerInfo*/ nullptr, EnableOpt, OptSize, MinSize), + KB(KB), MDT(MDT) { + if (!Generated.parseCommandLineOption()) + report_fatal_error("Invalid rule identifier"); + } + virtual bool combine(GISelChangeObserver &Observer, MachineInstr &MI, MachineIRBuilder &B) const override; }; @@ -38,24 +59,50 @@ public: bool AArch64PreLegalizerCombinerInfo::combine(GISelChangeObserver &Observer, MachineInstr &MI, MachineIRBuilder &B) const { - CombinerHelper Helper(Observer, B); + CombinerHelper Helper(Observer, B, KB, MDT); switch (MI.getOpcode()) { - default: - return false; - case TargetOpcode::COPY: - return Helper.tryCombineCopy(MI); - case TargetOpcode::G_BR: - return Helper.tryCombineBr(MI); + case TargetOpcode::G_CONCAT_VECTORS: + return Helper.tryCombineConcatVectors(MI); + case TargetOpcode::G_SHUFFLE_VECTOR: + return Helper.tryCombineShuffleVector(MI); case TargetOpcode::G_LOAD: case TargetOpcode::G_SEXTLOAD: - case TargetOpcode::G_ZEXTLOAD: - return Helper.tryCombineExtendingLoads(MI); + case TargetOpcode::G_ZEXTLOAD: { + bool Changed = false; + Changed |= Helper.tryCombineExtendingLoads(MI); + Changed |= Helper.tryCombineIndexedLoadStore(MI); + return Changed; + } + case TargetOpcode::G_STORE: + return Helper.tryCombineIndexedLoadStore(MI); + case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS: + switch (MI.getIntrinsicID()) { + case Intrinsic::memcpy: + case Intrinsic::memmove: + case Intrinsic::memset: { + // If we're at -O0 set a maxlen of 32 to inline, otherwise let the other + // heuristics decide. + unsigned MaxLen = EnableOpt ? 0 : 32; + // Try to inline memcpy type calls if optimizations are enabled. + return (!EnableMinSize) ? Helper.tryCombineMemCpyFamily(MI, MaxLen) + : false; + } + default: + break; + } } + if (Generated.tryCombineAll(Observer, MI, B)) + return true; + return false; } +#define AARCH64PRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP +#include "AArch64GenGICombiner.inc" +#undef AARCH64PRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP + // Pass boilerplate // ================ @@ -63,24 +110,33 @@ class AArch64PreLegalizerCombiner : public MachineFunctionPass { public: static char ID; - AArch64PreLegalizerCombiner(); + AArch64PreLegalizerCombiner(bool IsOptNone = false); StringRef getPassName() const override { return "AArch64PreLegalizerCombiner"; } bool runOnMachineFunction(MachineFunction &MF) override; void getAnalysisUsage(AnalysisUsage &AU) const override; +private: + bool IsOptNone; }; -} +} // end anonymous namespace void AArch64PreLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequired(); AU.setPreservesCFG(); getSelectionDAGFallbackAnalysisUsage(AU); + AU.addRequired(); + AU.addPreserved(); + if (!IsOptNone) { + AU.addRequired(); + AU.addPreserved(); + } MachineFunctionPass::getAnalysisUsage(AU); } -AArch64PreLegalizerCombiner::AArch64PreLegalizerCombiner() : MachineFunctionPass(ID) { +AArch64PreLegalizerCombiner::AArch64PreLegalizerCombiner(bool IsOptNone) + : MachineFunctionPass(ID), IsOptNone(IsOptNone) { initializeAArch64PreLegalizerCombinerPass(*PassRegistry::getPassRegistry()); } @@ -89,7 +145,14 @@ bool AArch64PreLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) { MachineFunctionProperties::Property::FailedISel)) return false; auto *TPC = &getAnalysis(); - AArch64PreLegalizerCombinerInfo PCInfo; + const Function &F = MF.getFunction(); + bool EnableOpt = + MF.getTarget().getOptLevel() != CodeGenOpt::None && !skipFunction(F); + GISelKnownBits *KB = &getAnalysis().get(MF); + MachineDominatorTree *MDT = + IsOptNone ? 
nullptr : &getAnalysis(); + AArch64PreLegalizerCombinerInfo PCInfo(EnableOpt, F.hasOptSize(), + F.hasMinSize(), KB, MDT); Combiner C(PCInfo, TPC); return C.combineMachineInstrs(MF, /*CSEInfo*/ nullptr); } @@ -99,13 +162,14 @@ INITIALIZE_PASS_BEGIN(AArch64PreLegalizerCombiner, DEBUG_TYPE, "Combine AArch64 machine instrs before legalization", false, false) INITIALIZE_PASS_DEPENDENCY(TargetPassConfig) +INITIALIZE_PASS_DEPENDENCY(GISelKnownBitsAnalysis) INITIALIZE_PASS_END(AArch64PreLegalizerCombiner, DEBUG_TYPE, "Combine AArch64 machine instrs before legalization", false, false) namespace llvm { -FunctionPass *createAArch64PreLegalizeCombiner() { - return new AArch64PreLegalizerCombiner(); +FunctionPass *createAArch64PreLegalizeCombiner(bool IsOptNone) { + return new AArch64PreLegalizerCombiner(IsOptNone); } } // end namespace llvm diff --git a/lib/Target/AArch64/AArch64RegisterBankInfo.cpp b/lib/Target/AArch64/AArch64RegisterBankInfo.cpp index b52259cc9acd..8ec73aa3c040 100644 --- a/lib/Target/AArch64/AArch64RegisterBankInfo.cpp +++ b/lib/Target/AArch64/AArch64RegisterBankInfo.cpp @@ -563,12 +563,12 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { return getSameKindOfOperandsMapping(MI); } case TargetOpcode::COPY: { - unsigned DstReg = MI.getOperand(0).getReg(); - unsigned SrcReg = MI.getOperand(1).getReg(); + Register DstReg = MI.getOperand(0).getReg(); + Register SrcReg = MI.getOperand(1).getReg(); // Check if one of the register is not a generic register. - if ((TargetRegisterInfo::isPhysicalRegister(DstReg) || + if ((Register::isPhysicalRegister(DstReg) || !MRI.getType(DstReg).isValid()) || - (TargetRegisterInfo::isPhysicalRegister(SrcReg) || + (Register::isPhysicalRegister(SrcReg) || !MRI.getType(SrcReg).isValid())) { const RegisterBank *DstRB = getRegBank(DstReg, MRI, TRI); const RegisterBank *SrcRB = getRegBank(SrcReg, MRI, TRI); @@ -635,6 +635,12 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { // Some of the floating-point instructions have mixed GPR and FPR operands: // fine-tune the computed mapping. switch (Opc) { + case TargetOpcode::G_TRUNC: { + LLT SrcTy = MRI.getType(MI.getOperand(1).getReg()); + if (!SrcTy.isVector() && SrcTy.getSizeInBits() == 128) + OpRegBankIdx = {PMI_FirstFPR, PMI_FirstFPR}; + break; + } case TargetOpcode::G_SITOFP: case TargetOpcode::G_UITOFP: if (MRI.getType(MI.getOperand(0).getReg()).isVector()) @@ -687,7 +693,7 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case TargetOpcode::G_STORE: // Check if that store is fed by fp instructions. if (OpRegBankIdx[0] == PMI_FirstGPR) { - unsigned VReg = MI.getOperand(0).getReg(); + Register VReg = MI.getOperand(0).getReg(); if (!VReg) break; MachineInstr *DefMI = MRI.getVRegDef(VReg); @@ -702,11 +708,10 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { break; // If we're taking in vectors, we have no choice but to put everything on - // FPRs. + // FPRs, except for the condition. The condition must always be on a GPR. LLT SrcTy = MRI.getType(MI.getOperand(2).getReg()); if (SrcTy.isVector()) { - for (unsigned Idx = 0; Idx < 4; ++Idx) - OpRegBankIdx[Idx] = PMI_FirstFPR; + OpRegBankIdx = {PMI_FirstFPR, PMI_FirstGPR, PMI_FirstFPR, PMI_FirstFPR}; break; } @@ -740,7 +745,7 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { // This doesn't check the condition, since it's just whatever is in NZCV. // This isn't passed explicitly in a register to fcsel/csel. 
for (unsigned Idx = 2; Idx < 4; ++Idx) { - unsigned VReg = MI.getOperand(Idx).getReg(); + Register VReg = MI.getOperand(Idx).getReg(); MachineInstr *DefMI = MRI.getVRegDef(VReg); if (getRegBank(VReg, MRI, TRI) == &AArch64::FPRRegBank || onlyDefinesFP(*DefMI, MRI, TRI)) @@ -750,8 +755,7 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { // If we have more FP constraints than not, then move everything over to // FPR. if (NumFP >= 2) - for (unsigned Idx = 0; Idx < 4; ++Idx) - OpRegBankIdx[Idx] = PMI_FirstFPR; + OpRegBankIdx = {PMI_FirstFPR, PMI_FirstGPR, PMI_FirstFPR, PMI_FirstFPR}; break; } @@ -764,7 +768,7 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { LLT SrcTy = MRI.getType(MI.getOperand(MI.getNumOperands()-1).getReg()); // UNMERGE into scalars from a vector should always use FPR. // Likewise if any of the uses are FP instructions. - if (SrcTy.isVector() || + if (SrcTy.isVector() || SrcTy == LLT::scalar(128) || any_of(MRI.use_instructions(MI.getOperand(0).getReg()), [&](MachineInstr &MI) { return onlyUsesFP(MI, MRI, TRI); })) { // Set the register bank of every operand to FPR. @@ -795,12 +799,21 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { // Index needs to be a GPR. OpRegBankIdx[3] = PMI_FirstGPR; break; + case TargetOpcode::G_EXTRACT: { + // For s128 sources we have to use fpr. + LLT SrcTy = MRI.getType(MI.getOperand(1).getReg()); + if (SrcTy.getSizeInBits() == 128) { + OpRegBankIdx[0] = PMI_FirstFPR; + OpRegBankIdx[1] = PMI_FirstFPR; + } + break; + } case TargetOpcode::G_BUILD_VECTOR: // If the first source operand belongs to a FPR register bank, then make // sure that we preserve that. if (OpRegBankIdx[1] != PMI_FirstGPR) break; - unsigned VReg = MI.getOperand(1).getReg(); + Register VReg = MI.getOperand(1).getReg(); if (!VReg) break; diff --git a/lib/Target/AArch64/AArch64RegisterInfo.cpp b/lib/Target/AArch64/AArch64RegisterInfo.cpp index 6d5a4e3d2f76..de176088595d 100644 --- a/lib/Target/AArch64/AArch64RegisterInfo.cpp +++ b/lib/Target/AArch64/AArch64RegisterInfo.cpp @@ -15,6 +15,7 @@ #include "AArch64FrameLowering.h" #include "AArch64InstrInfo.h" #include "AArch64MachineFunctionInfo.h" +#include "AArch64StackOffset.h" #include "AArch64Subtarget.h" #include "MCTargetDesc/AArch64AddressingModes.h" #include "llvm/ADT/BitVector.h" @@ -23,10 +24,10 @@ #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/RegisterScavenging.h" -#include "llvm/IR/Function.h" +#include "llvm/CodeGen/TargetFrameLowering.h" #include "llvm/IR/DiagnosticInfo.h" +#include "llvm/IR/Function.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/CodeGen/TargetFrameLowering.h" #include "llvm/Target/TargetOptions.h" using namespace llvm; @@ -63,8 +64,9 @@ AArch64RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { return CSR_AArch64_AAPCS_SwiftError_SaveList; if (MF->getFunction().getCallingConv() == CallingConv::PreserveMost) return CSR_AArch64_RT_MostRegs_SaveList; - else - return CSR_AArch64_AAPCS_SaveList; + if (MF->getSubtarget().isTargetDarwin()) + return CSR_Darwin_AArch64_AAPCS_SaveList; + return CSR_AArch64_AAPCS_SaveList; } const MCPhysReg *AArch64RegisterInfo::getCalleeSavedRegsViaCopy( @@ -120,6 +122,8 @@ AArch64RegisterInfo::getCallPreservedMask(const MachineFunction &MF, : CSR_AArch64_CXX_TLS_Darwin_RegMask; if (CC == CallingConv::AArch64_VectorCall) return SCS ? 
CSR_AArch64_AAVPCS_SCS_RegMask : CSR_AArch64_AAVPCS_RegMask; + if (CC == CallingConv::AArch64_SVE_VectorCall) + return CSR_AArch64_SVE_AAPCS_RegMask; if (MF.getSubtarget().getTargetLowering() ->supportSwiftError() && MF.getFunction().getAttributes().hasAttrSomewhere(Attribute::SwiftError)) @@ -388,7 +392,7 @@ bool AArch64RegisterInfo::isFrameOffsetLegal(const MachineInstr *MI, int64_t Offset) const { assert(Offset <= INT_MAX && "Offset too big to fit in int."); assert(MI && "Unable to get the legal offset for nil instruction."); - int SaveOffset = Offset; + StackOffset SaveOffset(Offset, MVT::i8); return isAArch64FrameOffsetLegal(*MI, SaveOffset) & AArch64FrameOffsetIsLegal; } @@ -418,7 +422,9 @@ void AArch64RegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB, void AArch64RegisterInfo::resolveFrameIndex(MachineInstr &MI, unsigned BaseReg, int64_t Offset) const { - int Off = Offset; // ARM doesn't need the general 64-bit offsets + // ARM doesn't need the general 64-bit offsets + StackOffset Off(Offset, MVT::i8); + unsigned i = 0; while (!MI.getOperand(i).isFI()) { @@ -441,40 +447,69 @@ void AArch64RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, MachineInstr &MI = *II; MachineBasicBlock &MBB = *MI.getParent(); MachineFunction &MF = *MBB.getParent(); + const MachineFrameInfo &MFI = MF.getFrameInfo(); const AArch64InstrInfo *TII = MF.getSubtarget().getInstrInfo(); const AArch64FrameLowering *TFI = getFrameLowering(MF); int FrameIndex = MI.getOperand(FIOperandNum).getIndex(); + bool Tagged = + MI.getOperand(FIOperandNum).getTargetFlags() & AArch64II::MO_TAGGED; unsigned FrameReg; - int Offset; // Special handling of dbg_value, stackmap and patchpoint instructions. if (MI.isDebugValue() || MI.getOpcode() == TargetOpcode::STACKMAP || MI.getOpcode() == TargetOpcode::PATCHPOINT) { - Offset = TFI->resolveFrameIndexReference(MF, FrameIndex, FrameReg, - /*PreferFP=*/true, - /*ForSimm=*/false); - Offset += MI.getOperand(FIOperandNum + 1).getImm(); + StackOffset Offset = + TFI->resolveFrameIndexReference(MF, FrameIndex, FrameReg, + /*PreferFP=*/true, + /*ForSimm=*/false); + Offset += StackOffset(MI.getOperand(FIOperandNum + 1).getImm(), MVT::i8); MI.getOperand(FIOperandNum).ChangeToRegister(FrameReg, false /*isDef*/); - MI.getOperand(FIOperandNum + 1).ChangeToImmediate(Offset); + MI.getOperand(FIOperandNum + 1).ChangeToImmediate(Offset.getBytes()); return; } if (MI.getOpcode() == TargetOpcode::LOCAL_ESCAPE) { MachineOperand &FI = MI.getOperand(FIOperandNum); - Offset = TFI->getNonLocalFrameIndexReference(MF, FrameIndex); + int Offset = TFI->getNonLocalFrameIndexReference(MF, FrameIndex); FI.ChangeToImmediate(Offset); return; } + StackOffset Offset; if (MI.getOpcode() == AArch64::TAGPstack) { // TAGPstack must use the virtual frame register in its 3rd operand. - const MachineFrameInfo &MFI = MF.getFrameInfo(); const AArch64FunctionInfo *AFI = MF.getInfo(); FrameReg = MI.getOperand(3).getReg(); - Offset = - MFI.getObjectOffset(FrameIndex) + AFI->getTaggedBasePointerOffset(); + Offset = {MFI.getObjectOffset(FrameIndex) + + AFI->getTaggedBasePointerOffset(), + MVT::i8}; + } else if (Tagged) { + StackOffset SPOffset = { + MFI.getObjectOffset(FrameIndex) + (int64_t)MFI.getStackSize(), MVT::i8}; + if (MFI.hasVarSizedObjects() || + isAArch64FrameOffsetLegal(MI, SPOffset, nullptr, nullptr, nullptr) != + (AArch64FrameOffsetCanUpdate | AArch64FrameOffsetIsLegal)) { + // Can't update to SP + offset in place. Precalculate the tagged pointer + // in a scratch register. 
+ Offset = TFI->resolveFrameIndexReference( + MF, FrameIndex, FrameReg, /*PreferFP=*/false, /*ForSimm=*/true); + Register ScratchReg = + MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass); + emitFrameOffset(MBB, II, MI.getDebugLoc(), ScratchReg, FrameReg, Offset, + TII); + BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(AArch64::LDG), ScratchReg) + .addReg(ScratchReg) + .addReg(ScratchReg) + .addImm(0); + MI.getOperand(FIOperandNum) + .ChangeToRegister(ScratchReg, false, false, true); + return; + } + FrameReg = AArch64::SP; + Offset = {MFI.getObjectOffset(FrameIndex) + (int64_t)MFI.getStackSize(), + MVT::i8}; } else { Offset = TFI->resolveFrameIndexReference( MF, FrameIndex, FrameReg, /*PreferFP=*/false, /*ForSimm=*/true); @@ -490,7 +525,7 @@ void AArch64RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, // If we get here, the immediate doesn't fit into the instruction. We folded // as much as possible above. Handle the rest, providing a register that is // SP+LargeImm. - unsigned ScratchReg = + Register ScratchReg = MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass); emitFrameOffset(MBB, II, MI.getDebugLoc(), ScratchReg, FrameReg, Offset, TII); MI.getOperand(FIOperandNum).ChangeToRegister(ScratchReg, false, false, true); diff --git a/lib/Target/AArch64/AArch64SIMDInstrOpt.cpp b/lib/Target/AArch64/AArch64SIMDInstrOpt.cpp index 854670079e40..28a7e680849b 100644 --- a/lib/Target/AArch64/AArch64SIMDInstrOpt.cpp +++ b/lib/Target/AArch64/AArch64SIMDInstrOpt.cpp @@ -426,16 +426,16 @@ bool AArch64SIMDInstrOpt::optimizeVectElement(MachineInstr &MI) { MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); // Get the operands of the current SIMD arithmetic instruction. - unsigned MulDest = MI.getOperand(0).getReg(); - unsigned SrcReg0 = MI.getOperand(1).getReg(); + Register MulDest = MI.getOperand(0).getReg(); + Register SrcReg0 = MI.getOperand(1).getReg(); unsigned Src0IsKill = getKillRegState(MI.getOperand(1).isKill()); - unsigned SrcReg1 = MI.getOperand(2).getReg(); + Register SrcReg1 = MI.getOperand(2).getReg(); unsigned Src1IsKill = getKillRegState(MI.getOperand(2).isKill()); unsigned DupDest; // Instructions of interest have either 4 or 5 operands. if (MI.getNumOperands() == 5) { - unsigned SrcReg2 = MI.getOperand(3).getReg(); + Register SrcReg2 = MI.getOperand(3).getReg(); unsigned Src2IsKill = getKillRegState(MI.getOperand(3).isKill()); unsigned LaneNumber = MI.getOperand(4).getImm(); // Create a new DUP instruction. 
Note that if an equivalent DUP instruction diff --git a/lib/Target/AArch64/AArch64SVEInstrInfo.td b/lib/Target/AArch64/AArch64SVEInstrInfo.td index 79ab42f4c080..b573eac76754 100644 --- a/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ b/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -82,11 +82,11 @@ let Predicates = [HasSVE] in { defm SDIVR_ZPmZ : sve_int_bin_pred_arit_2_div<0b110, "sdivr">; defm UDIVR_ZPmZ : sve_int_bin_pred_arit_2_div<0b111, "udivr">; - defm SDOT_ZZZ : sve_intx_dot<0b0, "sdot">; - defm UDOT_ZZZ : sve_intx_dot<0b1, "udot">; + defm SDOT_ZZZ : sve_intx_dot<0b0, "sdot", int_aarch64_sve_sdot>; + defm UDOT_ZZZ : sve_intx_dot<0b1, "udot", int_aarch64_sve_udot>; - defm SDOT_ZZZI : sve_intx_dot_by_indexed_elem<0b0, "sdot">; - defm UDOT_ZZZI : sve_intx_dot_by_indexed_elem<0b1, "udot">; + defm SDOT_ZZZI : sve_intx_dot_by_indexed_elem<0b0, "sdot", int_aarch64_sve_sdot_lane>; + defm UDOT_ZZZI : sve_intx_dot_by_indexed_elem<0b1, "udot", int_aarch64_sve_udot_lane>; defm SXTB_ZPmZ : sve_int_un_pred_arit_0_h<0b000, "sxtb">; defm UXTB_ZPmZ : sve_int_un_pred_arit_0_h<0b001, "uxtb">; @@ -94,14 +94,14 @@ let Predicates = [HasSVE] in { defm UXTH_ZPmZ : sve_int_un_pred_arit_0_w<0b011, "uxth">; defm SXTW_ZPmZ : sve_int_un_pred_arit_0_d<0b100, "sxtw">; defm UXTW_ZPmZ : sve_int_un_pred_arit_0_d<0b101, "uxtw">; - defm ABS_ZPmZ : sve_int_un_pred_arit_0< 0b110, "abs">; - defm NEG_ZPmZ : sve_int_un_pred_arit_0< 0b111, "neg">; - - defm CLS_ZPmZ : sve_int_un_pred_arit_1< 0b000, "cls">; - defm CLZ_ZPmZ : sve_int_un_pred_arit_1< 0b001, "clz">; - defm CNT_ZPmZ : sve_int_un_pred_arit_1< 0b010, "cnt">; - defm CNOT_ZPmZ : sve_int_un_pred_arit_1< 0b011, "cnot">; - defm NOT_ZPmZ : sve_int_un_pred_arit_1< 0b110, "not">; + defm ABS_ZPmZ : sve_int_un_pred_arit_0< 0b110, "abs", int_aarch64_sve_abs>; + defm NEG_ZPmZ : sve_int_un_pred_arit_0< 0b111, "neg", int_aarch64_sve_neg>; + + defm CLS_ZPmZ : sve_int_un_pred_arit_1< 0b000, "cls", null_frag>; + defm CLZ_ZPmZ : sve_int_un_pred_arit_1< 0b001, "clz", null_frag>; + defm CNT_ZPmZ : sve_int_un_pred_arit_1< 0b010, "cnt", int_aarch64_sve_cnt>; + defm CNOT_ZPmZ : sve_int_un_pred_arit_1< 0b011, "cnot", null_frag>; + defm NOT_ZPmZ : sve_int_un_pred_arit_1< 0b110, "not", null_frag>; defm FABS_ZPmZ : sve_int_un_pred_arit_1_fp<0b100, "fabs">; defm FNEG_ZPmZ : sve_int_un_pred_arit_1_fp<0b101, "fneg">; @@ -138,12 +138,12 @@ let Predicates = [HasSVE] in { defm FDIVR_ZPmZ : sve_fp_2op_p_zds<0b1100, "fdivr">; defm FDIV_ZPmZ : sve_fp_2op_p_zds<0b1101, "fdiv">; - defm FADD_ZZZ : sve_fp_3op_u_zd<0b000, "fadd">; - defm FSUB_ZZZ : sve_fp_3op_u_zd<0b001, "fsub">; - defm FMUL_ZZZ : sve_fp_3op_u_zd<0b010, "fmul">; - defm FTSMUL_ZZZ : sve_fp_3op_u_zd<0b011, "ftsmul">; - defm FRECPS_ZZZ : sve_fp_3op_u_zd<0b110, "frecps">; - defm FRSQRTS_ZZZ : sve_fp_3op_u_zd<0b111, "frsqrts">; + defm FADD_ZZZ : sve_fp_3op_u_zd<0b000, "fadd", fadd>; + defm FSUB_ZZZ : sve_fp_3op_u_zd<0b001, "fsub", null_frag>; + defm FMUL_ZZZ : sve_fp_3op_u_zd<0b010, "fmul", null_frag>; + defm FTSMUL_ZZZ : sve_fp_3op_u_zd<0b011, "ftsmul", null_frag>; + defm FRECPS_ZZZ : sve_fp_3op_u_zd<0b110, "frecps", null_frag>; + defm FRSQRTS_ZZZ : sve_fp_3op_u_zd<0b111, "frsqrts", null_frag>; defm FTSSEL_ZZZ : sve_int_bin_cons_misc_0_b<"ftssel">; @@ -187,7 +187,7 @@ let Predicates = [HasSVE] in { defm FCPY_ZPmI : sve_int_dup_fpimm_pred<"fcpy">; // Splat scalar register (unpredicated, GPR or vector + element index) - defm DUP_ZR : sve_int_perm_dup_r<"dup">; + defm DUP_ZR : sve_int_perm_dup_r<"dup", AArch64dup>; defm DUP_ZZI : 
sve_int_perm_dup_i<"dup">; // Splat scalar register (predicated) @@ -211,13 +211,13 @@ let Predicates = [HasSVE] in { defm REV_PP : sve_int_perm_reverse_p<"rev">; defm REV_ZZ : sve_int_perm_reverse_z<"rev">; - defm SUNPKLO_ZZ : sve_int_perm_unpk<0b00, "sunpklo">; - defm SUNPKHI_ZZ : sve_int_perm_unpk<0b01, "sunpkhi">; - defm UUNPKLO_ZZ : sve_int_perm_unpk<0b10, "uunpklo">; - defm UUNPKHI_ZZ : sve_int_perm_unpk<0b11, "uunpkhi">; + defm SUNPKLO_ZZ : sve_int_perm_unpk<0b00, "sunpklo", AArch64sunpklo>; + defm SUNPKHI_ZZ : sve_int_perm_unpk<0b01, "sunpkhi", AArch64sunpkhi>; + defm UUNPKLO_ZZ : sve_int_perm_unpk<0b10, "uunpklo", AArch64uunpklo>; + defm UUNPKHI_ZZ : sve_int_perm_unpk<0b11, "uunpkhi", AArch64uunpkhi>; - def PUNPKLO_PP : sve_int_perm_punpk<0b0, "punpklo">; - def PUNPKHI_PP : sve_int_perm_punpk<0b1, "punpkhi">; + defm PUNPKLO_PP : sve_int_perm_punpk<0b0, "punpklo", int_aarch64_sve_punpklo>; + defm PUNPKHI_PP : sve_int_perm_punpk<0b1, "punpkhi", int_aarch64_sve_punpkhi>; defm MOVPRFX_ZPzZ : sve_int_movprfx_pred_zero<0b000, "movprfx">; defm MOVPRFX_ZPmZ : sve_int_movprfx_pred_merge<0b001, "movprfx">; @@ -1020,6 +1020,56 @@ let Predicates = [HasSVE] in { (FCMGT_PPzZZ_S PPR32:$Zd, PPR3bAny:$Pg, ZPR32:$Zn, ZPR32:$Zm), 0>; def : InstAlias<"fcmlt $Zd, $Pg/z, $Zm, $Zn", (FCMGT_PPzZZ_D PPR64:$Zd, PPR3bAny:$Pg, ZPR64:$Zn, ZPR64:$Zm), 0>; + + def : Pat<(nxv16i8 (bitconvert (nxv8i16 ZPR:$src))), (nxv16i8 ZPR:$src)>; + def : Pat<(nxv16i8 (bitconvert (nxv4i32 ZPR:$src))), (nxv16i8 ZPR:$src)>; + def : Pat<(nxv16i8 (bitconvert (nxv2i64 ZPR:$src))), (nxv16i8 ZPR:$src)>; + def : Pat<(nxv16i8 (bitconvert (nxv8f16 ZPR:$src))), (nxv16i8 ZPR:$src)>; + def : Pat<(nxv16i8 (bitconvert (nxv4f32 ZPR:$src))), (nxv16i8 ZPR:$src)>; + def : Pat<(nxv16i8 (bitconvert (nxv2f64 ZPR:$src))), (nxv16i8 ZPR:$src)>; + + def : Pat<(nxv8i16 (bitconvert (nxv16i8 ZPR:$src))), (nxv8i16 ZPR:$src)>; + def : Pat<(nxv8i16 (bitconvert (nxv4i32 ZPR:$src))), (nxv8i16 ZPR:$src)>; + def : Pat<(nxv8i16 (bitconvert (nxv2i64 ZPR:$src))), (nxv8i16 ZPR:$src)>; + def : Pat<(nxv8i16 (bitconvert (nxv8f16 ZPR:$src))), (nxv8i16 ZPR:$src)>; + def : Pat<(nxv8i16 (bitconvert (nxv4f32 ZPR:$src))), (nxv8i16 ZPR:$src)>; + def : Pat<(nxv8i16 (bitconvert (nxv2f64 ZPR:$src))), (nxv8i16 ZPR:$src)>; + + def : Pat<(nxv4i32 (bitconvert (nxv16i8 ZPR:$src))), (nxv4i32 ZPR:$src)>; + def : Pat<(nxv4i32 (bitconvert (nxv8i16 ZPR:$src))), (nxv4i32 ZPR:$src)>; + def : Pat<(nxv4i32 (bitconvert (nxv2i64 ZPR:$src))), (nxv4i32 ZPR:$src)>; + def : Pat<(nxv4i32 (bitconvert (nxv8f16 ZPR:$src))), (nxv4i32 ZPR:$src)>; + def : Pat<(nxv4i32 (bitconvert (nxv4f32 ZPR:$src))), (nxv4i32 ZPR:$src)>; + def : Pat<(nxv4i32 (bitconvert (nxv2f64 ZPR:$src))), (nxv4i32 ZPR:$src)>; + + def : Pat<(nxv2i64 (bitconvert (nxv16i8 ZPR:$src))), (nxv2i64 ZPR:$src)>; + def : Pat<(nxv2i64 (bitconvert (nxv8i16 ZPR:$src))), (nxv2i64 ZPR:$src)>; + def : Pat<(nxv2i64 (bitconvert (nxv4i32 ZPR:$src))), (nxv2i64 ZPR:$src)>; + def : Pat<(nxv2i64 (bitconvert (nxv8f16 ZPR:$src))), (nxv2i64 ZPR:$src)>; + def : Pat<(nxv2i64 (bitconvert (nxv4f32 ZPR:$src))), (nxv2i64 ZPR:$src)>; + def : Pat<(nxv2i64 (bitconvert (nxv2f64 ZPR:$src))), (nxv2i64 ZPR:$src)>; + + def : Pat<(nxv8f16 (bitconvert (nxv16i8 ZPR:$src))), (nxv8f16 ZPR:$src)>; + def : Pat<(nxv8f16 (bitconvert (nxv8i16 ZPR:$src))), (nxv8f16 ZPR:$src)>; + def : Pat<(nxv8f16 (bitconvert (nxv4i32 ZPR:$src))), (nxv8f16 ZPR:$src)>; + def : Pat<(nxv8f16 (bitconvert (nxv2i64 ZPR:$src))), (nxv8f16 ZPR:$src)>; + def : Pat<(nxv8f16 (bitconvert (nxv4f32 ZPR:$src))), 
(nxv8f16 ZPR:$src)>; + def : Pat<(nxv8f16 (bitconvert (nxv2f64 ZPR:$src))), (nxv8f16 ZPR:$src)>; + + def : Pat<(nxv4f32 (bitconvert (nxv16i8 ZPR:$src))), (nxv4f32 ZPR:$src)>; + def : Pat<(nxv4f32 (bitconvert (nxv8i16 ZPR:$src))), (nxv4f32 ZPR:$src)>; + def : Pat<(nxv4f32 (bitconvert (nxv4i32 ZPR:$src))), (nxv4f32 ZPR:$src)>; + def : Pat<(nxv4f32 (bitconvert (nxv2i64 ZPR:$src))), (nxv4f32 ZPR:$src)>; + def : Pat<(nxv4f32 (bitconvert (nxv8f16 ZPR:$src))), (nxv4f32 ZPR:$src)>; + def : Pat<(nxv4f32 (bitconvert (nxv2f64 ZPR:$src))), (nxv4f32 ZPR:$src)>; + + def : Pat<(nxv2f64 (bitconvert (nxv16i8 ZPR:$src))), (nxv2f64 ZPR:$src)>; + def : Pat<(nxv2f64 (bitconvert (nxv8i16 ZPR:$src))), (nxv2f64 ZPR:$src)>; + def : Pat<(nxv2f64 (bitconvert (nxv4i32 ZPR:$src))), (nxv2f64 ZPR:$src)>; + def : Pat<(nxv2f64 (bitconvert (nxv2i64 ZPR:$src))), (nxv2f64 ZPR:$src)>; + def : Pat<(nxv2f64 (bitconvert (nxv8f16 ZPR:$src))), (nxv2f64 ZPR:$src)>; + def : Pat<(nxv2f64 (bitconvert (nxv4f32 ZPR:$src))), (nxv2f64 ZPR:$src)>; + } let Predicates = [HasSVE2] in { @@ -1164,6 +1214,13 @@ let Predicates = [HasSVE2] in { defm SQRSHLR_ZPmZ : sve2_int_arith_pred<0b011100, "sqrshlr">; defm UQRSHLR_ZPmZ : sve2_int_arith_pred<0b011110, "uqrshlr">; + // SVE2 predicated shifts + defm SQSHL_ZPmI : sve_int_bin_pred_shift_imm_left< 0b0110, "sqshl">; + defm UQSHL_ZPmI : sve_int_bin_pred_shift_imm_left< 0b0111, "uqshl">; + defm SRSHR_ZPmI : sve_int_bin_pred_shift_imm_right<0b1100, "srshr">; + defm URSHR_ZPmI : sve_int_bin_pred_shift_imm_right<0b1101, "urshr">; + defm SQSHLU_ZPmI : sve_int_bin_pred_shift_imm_left< 0b1111, "sqshlu">; + // SVE2 integer add/subtract long defm SADDLB_ZZZ : sve2_wide_int_arith_long<0b00000, "saddlb">; defm SADDLT_ZZZ : sve2_wide_int_arith_long<0b00001, "saddlt">; @@ -1199,14 +1256,14 @@ let Predicates = [HasSVE2] in { defm PMULLT_ZZZ : sve2_pmul_long<0b1, "pmullt">; // SVE2 bitwise shift and insert - defm SRI_ZZI : sve2_int_bin_cons_shift_imm_right<0b0, "sri">; - defm SLI_ZZI : sve2_int_bin_cons_shift_imm_left< 0b1, "sli">; + defm SRI_ZZI : sve2_int_bin_shift_imm_right<0b0, "sri">; + defm SLI_ZZI : sve2_int_bin_shift_imm_left< 0b1, "sli">; // SVE2 bitwise shift right and accumulate - defm SSRA_ZZI : sve2_int_bin_accum_cons_shift_imm_right<0b00, "ssra">; - defm USRA_ZZI : sve2_int_bin_accum_cons_shift_imm_right<0b01, "usra">; - defm SRSRA_ZZI : sve2_int_bin_accum_cons_shift_imm_right<0b10, "srsra">; - defm URSRA_ZZI : sve2_int_bin_accum_cons_shift_imm_right<0b11, "ursra">; + defm SSRA_ZZI : sve2_int_bin_accum_shift_imm_right<0b00, "ssra">; + defm USRA_ZZI : sve2_int_bin_accum_shift_imm_right<0b01, "usra">; + defm SRSRA_ZZI : sve2_int_bin_accum_shift_imm_right<0b10, "srsra">; + defm URSRA_ZZI : sve2_int_bin_accum_shift_imm_right<0b11, "ursra">; // SVE2 complex integer add defm CADD_ZZI : sve2_int_cadd<0b0, "cadd">; @@ -1228,41 +1285,47 @@ let Predicates = [HasSVE2] in { defm SBCLB_ZZZ : sve2_int_addsub_long_carry<0b10, "sbclb">; defm SBCLT_ZZZ : sve2_int_addsub_long_carry<0b11, "sbclt">; - // SVE2 bitwise shift right narrow - defm SQSHRUNB_ZZI : sve2_int_bin_cons_shift_imm_right_narrow<0b0000, "sqshrunb">; - defm SQSHRUNT_ZZI : sve2_int_bin_cons_shift_imm_right_narrow<0b0001, "sqshrunt">; - defm SQRSHRUNB_ZZI : sve2_int_bin_cons_shift_imm_right_narrow<0b0010, "sqrshrunb">; - defm SQRSHRUNT_ZZI : sve2_int_bin_cons_shift_imm_right_narrow<0b0011, "sqrshrunt">; - defm SHRNB_ZZI : sve2_int_bin_cons_shift_imm_right_narrow<0b0100, "shrnb">; - defm SHRNT_ZZI : sve2_int_bin_cons_shift_imm_right_narrow<0b0101, "shrnt">; 
- defm RSHRNB_ZZI : sve2_int_bin_cons_shift_imm_right_narrow<0b0110, "rshrnb">; - defm RSHRNT_ZZI : sve2_int_bin_cons_shift_imm_right_narrow<0b0111, "rshrnt">; - defm SQSHRNB_ZZI : sve2_int_bin_cons_shift_imm_right_narrow<0b1000, "sqshrnb">; - defm SQSHRNT_ZZI : sve2_int_bin_cons_shift_imm_right_narrow<0b1001, "sqshrnt">; - defm SQRSHRNB_ZZI : sve2_int_bin_cons_shift_imm_right_narrow<0b1010, "sqrshrnb">; - defm SQRSHRNT_ZZI : sve2_int_bin_cons_shift_imm_right_narrow<0b1011, "sqrshrnt">; - defm UQSHRNB_ZZI : sve2_int_bin_cons_shift_imm_right_narrow<0b1100, "uqshrnb">; - defm UQSHRNT_ZZI : sve2_int_bin_cons_shift_imm_right_narrow<0b1101, "uqshrnt">; - defm UQRSHRNB_ZZI : sve2_int_bin_cons_shift_imm_right_narrow<0b1110, "uqrshrnb">; - defm UQRSHRNT_ZZI : sve2_int_bin_cons_shift_imm_right_narrow<0b1111, "uqrshrnt">; - - // SVE2 integer add/subtract narrow high part - defm ADDHNB_ZZZ : sve2_int_addsub_narrow_high<0b000, "addhnb">; - defm ADDHNT_ZZZ : sve2_int_addsub_narrow_high<0b001, "addhnt">; - defm RADDHNB_ZZZ : sve2_int_addsub_narrow_high<0b010, "raddhnb">; - defm RADDHNT_ZZZ : sve2_int_addsub_narrow_high<0b011, "raddhnt">; - defm SUBHNB_ZZZ : sve2_int_addsub_narrow_high<0b100, "subhnb">; - defm SUBHNT_ZZZ : sve2_int_addsub_narrow_high<0b101, "subhnt">; - defm RSUBHNB_ZZZ : sve2_int_addsub_narrow_high<0b110, "rsubhnb">; - defm RSUBHNT_ZZZ : sve2_int_addsub_narrow_high<0b111, "rsubhnt">; - - // SVE2 saturating extract narrow - defm SQXTNB_ZZ : sve2_int_sat_extract_narrow<0b000, "sqxtnb">; - defm SQXTNT_ZZ : sve2_int_sat_extract_narrow<0b001, "sqxtnt">; - defm UQXTNB_ZZ : sve2_int_sat_extract_narrow<0b010, "uqxtnb">; - defm UQXTNT_ZZ : sve2_int_sat_extract_narrow<0b011, "uqxtnt">; - defm SQXTUNB_ZZ : sve2_int_sat_extract_narrow<0b100, "sqxtunb">; - defm SQXTUNT_ZZ : sve2_int_sat_extract_narrow<0b101, "sqxtunt">; + // SVE2 bitwise shift right narrow (bottom) + defm SQSHRUNB_ZZI : sve2_int_bin_shift_imm_right_narrow_bottom<0b000, "sqshrunb">; + defm SQRSHRUNB_ZZI : sve2_int_bin_shift_imm_right_narrow_bottom<0b001, "sqrshrunb">; + defm SHRNB_ZZI : sve2_int_bin_shift_imm_right_narrow_bottom<0b010, "shrnb">; + defm RSHRNB_ZZI : sve2_int_bin_shift_imm_right_narrow_bottom<0b011, "rshrnb">; + defm SQSHRNB_ZZI : sve2_int_bin_shift_imm_right_narrow_bottom<0b100, "sqshrnb">; + defm SQRSHRNB_ZZI : sve2_int_bin_shift_imm_right_narrow_bottom<0b101, "sqrshrnb">; + defm UQSHRNB_ZZI : sve2_int_bin_shift_imm_right_narrow_bottom<0b110, "uqshrnb">; + defm UQRSHRNB_ZZI : sve2_int_bin_shift_imm_right_narrow_bottom<0b111, "uqrshrnb">; + + // SVE2 bitwise shift right narrow (top) + defm SQSHRUNT_ZZI : sve2_int_bin_shift_imm_right_narrow_top<0b000, "sqshrunt">; + defm SQRSHRUNT_ZZI : sve2_int_bin_shift_imm_right_narrow_top<0b001, "sqrshrunt">; + defm SHRNT_ZZI : sve2_int_bin_shift_imm_right_narrow_top<0b010, "shrnt">; + defm RSHRNT_ZZI : sve2_int_bin_shift_imm_right_narrow_top<0b011, "rshrnt">; + defm SQSHRNT_ZZI : sve2_int_bin_shift_imm_right_narrow_top<0b100, "sqshrnt">; + defm SQRSHRNT_ZZI : sve2_int_bin_shift_imm_right_narrow_top<0b101, "sqrshrnt">; + defm UQSHRNT_ZZI : sve2_int_bin_shift_imm_right_narrow_top<0b110, "uqshrnt">; + defm UQRSHRNT_ZZI : sve2_int_bin_shift_imm_right_narrow_top<0b111, "uqrshrnt">; + + // SVE2 integer add/subtract narrow high part (bottom) + defm ADDHNB_ZZZ : sve2_int_addsub_narrow_high_bottom<0b00, "addhnb">; + defm RADDHNB_ZZZ : sve2_int_addsub_narrow_high_bottom<0b01, "raddhnb">; + defm SUBHNB_ZZZ : sve2_int_addsub_narrow_high_bottom<0b10, "subhnb">; + defm RSUBHNB_ZZZ : 
sve2_int_addsub_narrow_high_bottom<0b11, "rsubhnb">; + + // SVE2 integer add/subtract narrow high part (top) + defm ADDHNT_ZZZ : sve2_int_addsub_narrow_high_top<0b00, "addhnt">; + defm RADDHNT_ZZZ : sve2_int_addsub_narrow_high_top<0b01, "raddhnt">; + defm SUBHNT_ZZZ : sve2_int_addsub_narrow_high_top<0b10, "subhnt">; + defm RSUBHNT_ZZZ : sve2_int_addsub_narrow_high_top<0b11, "rsubhnt">; + + // SVE2 saturating extract narrow (bottom) + defm SQXTNB_ZZ : sve2_int_sat_extract_narrow_bottom<0b00, "sqxtnb">; + defm UQXTNB_ZZ : sve2_int_sat_extract_narrow_bottom<0b01, "uqxtnb">; + defm SQXTUNB_ZZ : sve2_int_sat_extract_narrow_bottom<0b10, "sqxtunb">; + + // SVE2 saturating extract narrow (top) + defm SQXTNT_ZZ : sve2_int_sat_extract_narrow_top<0b00, "sqxtnt">; + defm UQXTNT_ZZ : sve2_int_sat_extract_narrow_top<0b01, "uqxtnt">; + defm SQXTUNT_ZZ : sve2_int_sat_extract_narrow_top<0b10, "sqxtunt">; // SVE2 character match defm MATCH_PPzZZ : sve2_char_match<0b0, "match">; @@ -1289,10 +1352,14 @@ let Predicates = [HasSVE2] in { // SVE2 histogram generation (vector) defm HISTCNT_ZPzZZ : sve2_hist_gen_vector<"histcnt">; + // SVE2 floating-point base 2 logarithm as integer + defm FLOGB_ZPmZ : sve2_fp_flogb<"flogb">; + // SVE2 floating-point convert precision defm FCVTXNT_ZPmZ : sve2_fp_convert_down_odd_rounding<"fcvtxnt">; defm FCVTNT_ZPmZ : sve2_fp_convert_down_narrow<"fcvtnt">; defm FCVTLT_ZPmZ : sve2_fp_convert_up_long<"fcvtlt">; + def FCVTX_ZPmZ_DtoS : sve_fp_2op_p_zd<0b0001010, "fcvtx", ZPR64, ZPR32, ElementSizeD>; // SVE2 floating-point pairwise operations defm FADDP_ZPmZZ : sve2_fp_pairwise_pred<0b000, "faddp">; @@ -1321,58 +1388,45 @@ let Predicates = [HasSVE2] in { def BSL2N_ZZZZ_D : sve2_int_bitwise_ternary_op_d<0b101, "bsl2n">; def NBSL_ZZZZ_D : sve2_int_bitwise_ternary_op_d<0b111, "nbsl">; - // sve_int_rotate_imm + // SVE2 bitwise xor and rotate right by immediate defm XAR_ZZZI : sve2_int_rotate_right_imm<"xar">; // SVE2 extract vector (immediate offset, constructive) def EXT_ZZI_B : sve2_int_perm_extract_i_cons<"ext">; - // SVE floating-point convert precision - def FCVTX_ZPmZ_DtoS : sve_fp_2op_p_zd<0b0001010, "fcvtx", ZPR64, ZPR32, ElementSizeD>; - - // SVE floating-point convert to integer - defm FLOGB_ZPmZ : sve2_fp_flogb<"flogb">; - - // Non-temporal contiguous loads (vector + register) - defm LDNT1SB_ZZR_S : sve2_mem_cldnt_vs<0b00000, "ldnt1sb", Z_s, ZPR32>; - defm LDNT1B_ZZR_S : sve2_mem_cldnt_vs<0b00001, "ldnt1b", Z_s, ZPR32>; - defm LDNT1SH_ZZR_S : sve2_mem_cldnt_vs<0b00100, "ldnt1sh", Z_s, ZPR32>; - defm LDNT1H_ZZR_S : sve2_mem_cldnt_vs<0b00101, "ldnt1h", Z_s, ZPR32>; - defm LDNT1W_ZZR_S : sve2_mem_cldnt_vs<0b01001, "ldnt1w", Z_s, ZPR32>; - - defm LDNT1SB_ZZR_D : sve2_mem_cldnt_vs<0b10000, "ldnt1sb", Z_d, ZPR64>; - defm LDNT1B_ZZR_D : sve2_mem_cldnt_vs<0b10010, "ldnt1b", Z_d, ZPR64>; - defm LDNT1SH_ZZR_D : sve2_mem_cldnt_vs<0b10100, "ldnt1sh", Z_d, ZPR64>; - defm LDNT1H_ZZR_D : sve2_mem_cldnt_vs<0b10110, "ldnt1h", Z_d, ZPR64>; - defm LDNT1SW_ZZR_D : sve2_mem_cldnt_vs<0b11000, "ldnt1sw", Z_d, ZPR64>; - defm LDNT1W_ZZR_D : sve2_mem_cldnt_vs<0b11010, "ldnt1w", Z_d, ZPR64>; - defm LDNT1D_ZZR_D : sve2_mem_cldnt_vs<0b11110, "ldnt1d", Z_d, ZPR64>; + // SVE2 non-temporal gather loads + defm LDNT1SB_ZZR_S : sve2_mem_gldnt_vs<0b00000, "ldnt1sb", Z_s, ZPR32>; + defm LDNT1B_ZZR_S : sve2_mem_gldnt_vs<0b00001, "ldnt1b", Z_s, ZPR32>; + defm LDNT1SH_ZZR_S : sve2_mem_gldnt_vs<0b00100, "ldnt1sh", Z_s, ZPR32>; + defm LDNT1H_ZZR_S : sve2_mem_gldnt_vs<0b00101, "ldnt1h", Z_s, ZPR32>; + defm LDNT1W_ZZR_S : 
sve2_mem_gldnt_vs<0b01001, "ldnt1w", Z_s, ZPR32>; + + defm LDNT1SB_ZZR_D : sve2_mem_gldnt_vs<0b10000, "ldnt1sb", Z_d, ZPR64>; + defm LDNT1B_ZZR_D : sve2_mem_gldnt_vs<0b10010, "ldnt1b", Z_d, ZPR64>; + defm LDNT1SH_ZZR_D : sve2_mem_gldnt_vs<0b10100, "ldnt1sh", Z_d, ZPR64>; + defm LDNT1H_ZZR_D : sve2_mem_gldnt_vs<0b10110, "ldnt1h", Z_d, ZPR64>; + defm LDNT1SW_ZZR_D : sve2_mem_gldnt_vs<0b11000, "ldnt1sw", Z_d, ZPR64>; + defm LDNT1W_ZZR_D : sve2_mem_gldnt_vs<0b11010, "ldnt1w", Z_d, ZPR64>; + defm LDNT1D_ZZR_D : sve2_mem_gldnt_vs<0b11110, "ldnt1d", Z_d, ZPR64>; // SVE2 vector splice (constructive) defm SPLICE_ZPZZ : sve2_int_perm_splice_cons<"splice">; - // Predicated shifts - defm SQSHL_ZPmI : sve_int_bin_pred_shift_imm_left< 0b0110, "sqshl">; - defm UQSHL_ZPmI : sve_int_bin_pred_shift_imm_left< 0b0111, "uqshl">; - defm SRSHR_ZPmI : sve_int_bin_pred_shift_imm_right<0b1100, "srshr">; - defm URSHR_ZPmI : sve_int_bin_pred_shift_imm_right<0b1101, "urshr">; - defm SQSHLU_ZPmI : sve_int_bin_pred_shift_imm_left< 0b1111, "sqshlu">; - - // Non-temporal contiguous stores (vector + register) - defm STNT1B_ZZR_S : sve2_mem_cstnt_vs<0b001, "stnt1b", Z_s, ZPR32>; - defm STNT1H_ZZR_S : sve2_mem_cstnt_vs<0b011, "stnt1h", Z_s, ZPR32>; - defm STNT1W_ZZR_S : sve2_mem_cstnt_vs<0b101, "stnt1w", Z_s, ZPR32>; + // SVE2 non-temporal scatter stores + defm STNT1B_ZZR_S : sve2_mem_sstnt_vs<0b001, "stnt1b", Z_s, ZPR32>; + defm STNT1H_ZZR_S : sve2_mem_sstnt_vs<0b011, "stnt1h", Z_s, ZPR32>; + defm STNT1W_ZZR_S : sve2_mem_sstnt_vs<0b101, "stnt1w", Z_s, ZPR32>; - defm STNT1B_ZZR_D : sve2_mem_cstnt_vs<0b000, "stnt1b", Z_d, ZPR64>; - defm STNT1H_ZZR_D : sve2_mem_cstnt_vs<0b010, "stnt1h", Z_d, ZPR64>; - defm STNT1W_ZZR_D : sve2_mem_cstnt_vs<0b100, "stnt1w", Z_d, ZPR64>; - defm STNT1D_ZZR_D : sve2_mem_cstnt_vs<0b110, "stnt1d", Z_d, ZPR64>; + defm STNT1B_ZZR_D : sve2_mem_sstnt_vs<0b000, "stnt1b", Z_d, ZPR64>; + defm STNT1H_ZZR_D : sve2_mem_sstnt_vs<0b010, "stnt1h", Z_d, ZPR64>; + defm STNT1W_ZZR_D : sve2_mem_sstnt_vs<0b100, "stnt1w", Z_d, ZPR64>; + defm STNT1D_ZZR_D : sve2_mem_sstnt_vs<0b110, "stnt1d", Z_d, ZPR64>; - // SVE table lookup (three sources) + // SVE2 table lookup (three sources) defm TBL_ZZZZ : sve2_int_perm_tbl<"tbl">; defm TBX_ZZZ : sve2_int_perm_tbx<"tbx">; - // SVE integer compare scalar count and limit + // SVE2 integer compare scalar count and limit defm WHILEGE_PWW : sve_int_while4_rr<0b000, "whilege">; defm WHILEGT_PWW : sve_int_while4_rr<0b001, "whilegt">; defm WHILEHS_PWW : sve_int_while4_rr<0b100, "whilehs">; @@ -1383,7 +1437,7 @@ let Predicates = [HasSVE2] in { defm WHILEHS_PXX : sve_int_while8_rr<0b100, "whilehs">; defm WHILEHI_PXX : sve_int_while8_rr<0b101, "whilehi">; - // SVE pointer conflict compare + // SVE2 pointer conflict compare defm WHILEWR_PXX : sve2_int_while_rr<0b0, "whilewr">; defm WHILERW_PXX : sve2_int_while_rr<0b1, "whilerw">; } diff --git a/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp b/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp index 60dbace03ca6..ba61ed726e84 100644 --- a/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp +++ b/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp @@ -32,7 +32,7 @@ SDValue AArch64SelectionDAGInfo::EmitTargetCodeForMemset( const AArch64TargetLowering &TLI = *STI.getTargetLowering(); EVT IntPtr = TLI.getPointerTy(DAG.getDataLayout()); - Type *IntPtrTy = DAG.getDataLayout().getIntPtrType(*DAG.getContext()); + Type *IntPtrTy = Type::getInt8PtrTy(*DAG.getContext()); TargetLowering::ArgListTy Args; TargetLowering::ArgListEntry Entry; Entry.Node = Dst; diff --git 
a/lib/Target/AArch64/AArch64SpeculationHardening.cpp b/lib/Target/AArch64/AArch64SpeculationHardening.cpp index 3087e6ce441d..7307961ddb5f 100644 --- a/lib/Target/AArch64/AArch64SpeculationHardening.cpp +++ b/lib/Target/AArch64/AArch64SpeculationHardening.cpp @@ -106,6 +106,7 @@ #include "llvm/IR/DebugLoc.h" #include "llvm/Pass.h" #include "llvm/Support/CodeGen.h" +#include "llvm/Support/Debug.h" #include "llvm/Target/TargetMachine.h" #include @@ -115,9 +116,9 @@ using namespace llvm; #define AARCH64_SPECULATION_HARDENING_NAME "AArch64 speculation hardening pass" -cl::opt HardenLoads("aarch64-slh-loads", cl::Hidden, - cl::desc("Sanitize loads from memory."), - cl::init(true)); +static cl::opt HardenLoads("aarch64-slh-loads", cl::Hidden, + cl::desc("Sanitize loads from memory."), + cl::init(true)); namespace { @@ -521,7 +522,7 @@ bool AArch64SpeculationHardening::slhLoads(MachineBasicBlock &MBB) { for (auto Use : MI.uses()) { if (!Use.isReg()) continue; - unsigned Reg = Use.getReg(); + Register Reg = Use.getReg(); // Some loads of floating point data have implicit defs/uses on a // super register of that floating point data. Some examples: // $s0 = LDRSui $sp, 22, implicit-def $q0 @@ -561,8 +562,8 @@ bool AArch64SpeculationHardening::expandSpeculationSafeValue( // miss-speculation isn't happening because we're already inserting barriers // to guarantee that. if (!UseControlFlowSpeculationBarrier && !UsesFullSpeculationBarrier) { - unsigned DstReg = MI.getOperand(0).getReg(); - unsigned SrcReg = MI.getOperand(1).getReg(); + Register DstReg = MI.getOperand(0).getReg(); + Register SrcReg = MI.getOperand(1).getReg(); // Mark this register and all its aliasing registers as needing to be // value speculation hardened before its next use, by using a CSDB // barrier instruction. diff --git a/lib/Target/AArch64/AArch64StackOffset.h b/lib/Target/AArch64/AArch64StackOffset.h new file mode 100644 index 000000000000..13f12a6c9c30 --- /dev/null +++ b/lib/Target/AArch64/AArch64StackOffset.h @@ -0,0 +1,138 @@ +//==--AArch64StackOffset.h ---------------------------------------*- C++ -*-==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains the declaration of the StackOffset class, which is used to +// describe scalable and non-scalable offsets during frame lowering. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64STACKOFFSET_H +#define LLVM_LIB_TARGET_AARCH64_AARCH64STACKOFFSET_H + +#include "llvm/Support/MachineValueType.h" + +namespace llvm { + +/// StackOffset is a wrapper around scalable and non-scalable offsets and is +/// used in several functions such as 'isAArch64FrameOffsetLegal' and +/// 'emitFrameOffset()'. StackOffsets are described by MVTs, e.g. +// +/// StackOffset(1, MVT::nxv16i8) +// +/// would describe an offset as being the size of a single SVE vector. +/// +/// The class also implements simple arithmetic (addition/subtraction) on these +/// offsets, e.g. +// +/// StackOffset(1, MVT::nxv16i8) + StackOffset(1, MVT::i64) +// +/// describes an offset that spans the combined storage required for an SVE +/// vector and a 64bit GPR. 
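// A standalone, hypothetical sketch (not taken from this patch) of the
// bookkeeping the class below implements: fixed-size and scalable
// contributions are kept in separate byte counters, so the example from the
// comment above, StackOffset(1, MVT::nxv16i8) + StackOffset(1, MVT::i64),
// comes out as 16 scalable bytes plus 8 ordinary bytes. MiniOffset and the
// hard-coded type sizes are illustrative only.
#include <cassert>
#include <cstdint>

struct MiniOffset {
  int64_t Bytes;         // non-scalable part, in bytes
  int64_t ScalableBytes; // scaled by the runtime vector length at execution

  static MiniOffset fixed(int64_t N, int64_t TypeBytes) {
    return {N * TypeBytes, 0};
  }
  static MiniOffset scalable(int64_t N, int64_t TypeBytes) {
    return {0, N * TypeBytes};
  }
  MiniOffset operator+(const MiniOffset &O) const {
    return {Bytes + O.Bytes, ScalableBytes + O.ScalableBytes};
  }
};

int main() {
  // One SVE data vector (nxv16i8 is 16 scalable bytes) plus one 64-bit GPR
  // slot (8 fixed bytes).
  MiniOffset Off = MiniOffset::scalable(1, 16) + MiniOffset::fixed(1, 8);
  assert(Off.Bytes == 8 && Off.ScalableBytes == 16);
  return 0;
}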
+class StackOffset { + int64_t Bytes; + int64_t ScalableBytes; + + explicit operator int() const; + +public: + using Part = std::pair; + + StackOffset() : Bytes(0), ScalableBytes(0) {} + + StackOffset(int64_t Offset, MVT::SimpleValueType T) : StackOffset() { + assert(MVT(T).getSizeInBits() % 8 == 0 && + "Offset type is not a multiple of bytes"); + *this += Part(Offset, T); + } + + StackOffset(const StackOffset &Other) + : Bytes(Other.Bytes), ScalableBytes(Other.ScalableBytes) {} + + StackOffset &operator=(const StackOffset &) = default; + + StackOffset &operator+=(const StackOffset::Part &Other) { + int64_t OffsetInBytes = Other.first * (Other.second.getSizeInBits() / 8); + if (Other.second.isScalableVector()) + ScalableBytes += OffsetInBytes; + else + Bytes += OffsetInBytes; + return *this; + } + + StackOffset &operator+=(const StackOffset &Other) { + Bytes += Other.Bytes; + ScalableBytes += Other.ScalableBytes; + return *this; + } + + StackOffset operator+(const StackOffset &Other) const { + StackOffset Res(*this); + Res += Other; + return Res; + } + + StackOffset &operator-=(const StackOffset &Other) { + Bytes -= Other.Bytes; + ScalableBytes -= Other.ScalableBytes; + return *this; + } + + StackOffset operator-(const StackOffset &Other) const { + StackOffset Res(*this); + Res -= Other; + return Res; + } + + StackOffset operator-() const { + StackOffset Res = {}; + const StackOffset Other(*this); + Res -= Other; + return Res; + } + + /// Returns the scalable part of the offset in bytes. + int64_t getScalableBytes() const { return ScalableBytes; } + + /// Returns the non-scalable part of the offset in bytes. + int64_t getBytes() const { return Bytes; } + + /// Returns the offset in parts to which this frame offset can be + /// decomposed for the purpose of describing a frame offset. + /// For non-scalable offsets this is simply its byte size. + void getForFrameOffset(int64_t &NumBytes, int64_t &NumPredicateVectors, + int64_t &NumDataVectors) const { + assert(isValid() && "Invalid frame offset"); + + NumBytes = Bytes; + NumDataVectors = 0; + NumPredicateVectors = ScalableBytes / 2; + // This method is used to get the offsets to adjust the frame offset. + // If the function requires ADDPL to be used and needs more than two ADDPL + // instructions, part of the offset is folded into NumDataVectors so that it + // uses ADDVL for part of it, reducing the number of ADDPL instructions. + if (NumPredicateVectors % 8 == 0 || NumPredicateVectors < -64 || + NumPredicateVectors > 62) { + NumDataVectors = NumPredicateVectors / 8; + NumPredicateVectors -= NumDataVectors * 8; + } + } + + /// Returns whether the offset is known zero. + explicit operator bool() const { return Bytes || ScalableBytes; } + + bool isValid() const { + // The smallest scalable element supported by scaled SVE addressing + // modes are predicates, which are 2 scalable bytes in size. So the scalable + // byte offset must always be a multiple of 2. 
+ return ScalableBytes % 2 == 0; + } +}; + +} // end namespace llvm + +#endif diff --git a/lib/Target/AArch64/AArch64StackTagging.cpp b/lib/Target/AArch64/AArch64StackTagging.cpp index 6e99c48bf1d7..e6dbe01d3807 100644 --- a/lib/Target/AArch64/AArch64StackTagging.cpp +++ b/lib/Target/AArch64/AArch64StackTagging.cpp @@ -19,6 +19,7 @@ #include "llvm/ADT/Optional.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/CFG.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" @@ -55,9 +56,215 @@ using namespace llvm; #define DEBUG_TYPE "stack-tagging" -static constexpr unsigned kTagGranuleSize = 16; +static cl::opt ClMergeInit( + "stack-tagging-merge-init", cl::Hidden, cl::init(true), cl::ZeroOrMore, + cl::desc("merge stack variable initializers with tagging when possible")); + +static cl::opt ClScanLimit("stack-tagging-merge-init-scan-limit", + cl::init(40), cl::Hidden); + +static const Align kTagGranuleSize = Align(16); namespace { + +class InitializerBuilder { + uint64_t Size; + const DataLayout *DL; + Value *BasePtr; + Function *SetTagFn; + Function *SetTagZeroFn; + Function *StgpFn; + + // List of initializers sorted by start offset. + struct Range { + uint64_t Start, End; + Instruction *Inst; + }; + SmallVector Ranges; + // 8-aligned offset => 8-byte initializer + // Missing keys are zero initialized. + std::map Out; + +public: + InitializerBuilder(uint64_t Size, const DataLayout *DL, Value *BasePtr, + Function *SetTagFn, Function *SetTagZeroFn, + Function *StgpFn) + : Size(Size), DL(DL), BasePtr(BasePtr), SetTagFn(SetTagFn), + SetTagZeroFn(SetTagZeroFn), StgpFn(StgpFn) {} + + bool addRange(uint64_t Start, uint64_t End, Instruction *Inst) { + auto I = std::lower_bound( + Ranges.begin(), Ranges.end(), Start, + [](const Range &LHS, uint64_t RHS) { return LHS.End <= RHS; }); + if (I != Ranges.end() && End > I->Start) { + // Overlap - bail. + return false; + } + Ranges.insert(I, {Start, End, Inst}); + return true; + } + + bool addStore(uint64_t Offset, StoreInst *SI, const DataLayout *DL) { + int64_t StoreSize = DL->getTypeStoreSize(SI->getOperand(0)->getType()); + if (!addRange(Offset, Offset + StoreSize, SI)) + return false; + IRBuilder<> IRB(SI); + applyStore(IRB, Offset, Offset + StoreSize, SI->getOperand(0)); + return true; + } + + bool addMemSet(uint64_t Offset, MemSetInst *MSI) { + uint64_t StoreSize = cast(MSI->getLength())->getZExtValue(); + if (!addRange(Offset, Offset + StoreSize, MSI)) + return false; + IRBuilder<> IRB(MSI); + applyMemSet(IRB, Offset, Offset + StoreSize, + cast(MSI->getValue())); + return true; + } + + void applyMemSet(IRBuilder<> &IRB, int64_t Start, int64_t End, + ConstantInt *V) { + // Out[] does not distinguish between zero and undef, and we already know + // that this memset does not overlap with any other initializer. Nothing to + // do for memset(0). + if (V->isZero()) + return; + for (int64_t Offset = Start - Start % 8; Offset < End; Offset += 8) { + uint64_t Cst = 0x0101010101010101UL; + int LowBits = Offset < Start ? (Start - Offset) * 8 : 0; + if (LowBits) + Cst = (Cst >> LowBits) << LowBits; + int HighBits = End - Offset < 8 ? 
(8 - (End - Offset)) * 8 : 0; + if (HighBits) + Cst = (Cst << HighBits) >> HighBits; + ConstantInt *C = + ConstantInt::get(IRB.getInt64Ty(), Cst * V->getZExtValue()); + + Value *&CurrentV = Out[Offset]; + if (!CurrentV) { + CurrentV = C; + } else { + CurrentV = IRB.CreateOr(CurrentV, C); + } + } + } + + // Take a 64-bit slice of the value starting at the given offset (in bytes). + // Offset can be negative. Pad with zeroes on both sides when necessary. + Value *sliceValue(IRBuilder<> &IRB, Value *V, int64_t Offset) { + if (Offset > 0) { + V = IRB.CreateLShr(V, Offset * 8); + V = IRB.CreateZExtOrTrunc(V, IRB.getInt64Ty()); + } else if (Offset < 0) { + V = IRB.CreateZExtOrTrunc(V, IRB.getInt64Ty()); + V = IRB.CreateShl(V, -Offset * 8); + } else { + V = IRB.CreateZExtOrTrunc(V, IRB.getInt64Ty()); + } + return V; + } + + void applyStore(IRBuilder<> &IRB, int64_t Start, int64_t End, + Value *StoredValue) { + StoredValue = flatten(IRB, StoredValue); + for (int64_t Offset = Start - Start % 8; Offset < End; Offset += 8) { + Value *V = sliceValue(IRB, StoredValue, Offset - Start); + Value *&CurrentV = Out[Offset]; + if (!CurrentV) { + CurrentV = V; + } else { + CurrentV = IRB.CreateOr(CurrentV, V); + } + } + } + + void generate(IRBuilder<> &IRB) { + LLVM_DEBUG(dbgs() << "Combined initializer\n"); + // No initializers => the entire allocation is undef. + if (Ranges.empty()) { + emitUndef(IRB, 0, Size); + return; + } + + // Look through 8-byte initializer list 16 bytes at a time; + // If one of the two 8-byte halfs is non-zero non-undef, emit STGP. + // Otherwise, emit zeroes up to next available item. + uint64_t LastOffset = 0; + for (uint64_t Offset = 0; Offset < Size; Offset += 16) { + auto I1 = Out.find(Offset); + auto I2 = Out.find(Offset + 8); + if (I1 == Out.end() && I2 == Out.end()) + continue; + + if (Offset > LastOffset) + emitZeroes(IRB, LastOffset, Offset - LastOffset); + + Value *Store1 = I1 == Out.end() ? Constant::getNullValue(IRB.getInt64Ty()) + : I1->second; + Value *Store2 = I2 == Out.end() ? Constant::getNullValue(IRB.getInt64Ty()) + : I2->second; + emitPair(IRB, Offset, Store1, Store2); + LastOffset = Offset + 16; + } + + // memset(0) does not update Out[], therefore the tail can be either undef + // or zero. 
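// A standalone, hypothetical sketch (not taken from this patch) of the
// pairing walk in generate() above: 8-byte initializer slots are visited 16
// bytes at a time, gaps since the last emitted pair are zero-tagged, each
// populated pair becomes one tagged-pair store (a missing half is stored as
// zero), and an uninitialized tail is zero-tagged as the code right below
// does. Action and plan() are made-up names.
#include <cassert>
#include <cstdint>
#include <map>
#include <string>
#include <vector>

struct Action {
  std::string Kind; // "undef", "zero" or "stgp"
  uint64_t Offset;
  uint64_t Size;
};

std::vector<Action> plan(const std::map<uint64_t, uint64_t> &Out,
                         uint64_t Size) {
  if (Out.empty()) // nothing was initialized: the whole allocation is undef
    return {{"undef", 0, Size}};
  std::vector<Action> Plan;
  uint64_t LastOffset = 0;
  for (uint64_t Offset = 0; Offset < Size; Offset += 16) {
    bool HaveLo = Out.count(Offset) != 0;
    bool HaveHi = Out.count(Offset + 8) != 0;
    if (!HaveLo && !HaveHi)
      continue; // leave the gap for a later zero fill
    if (Offset > LastOffset)
      Plan.push_back({"zero", LastOffset, Offset - LastOffset});
    Plan.push_back({"stgp", Offset, 16});
    LastOffset = Offset + 16;
  }
  if (LastOffset < Size) // tail was never written: zero it
    Plan.push_back({"zero", LastOffset, Size - LastOffset});
  return Plan;
}

int main() {
  // A 48-byte alloca whose only initializer is an 8-byte store at offset 16.
  auto P = plan({{16, 0x00000000deadbeefULL}}, 48);
  assert(P.size() == 3);
  assert(P[0].Kind == "zero" && P[0].Offset == 0 && P[0].Size == 16);
  assert(P[1].Kind == "stgp" && P[1].Offset == 16);
  assert(P[2].Kind == "zero" && P[2].Offset == 32 && P[2].Size == 16);
  return 0;
}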
+ if (LastOffset < Size) + emitZeroes(IRB, LastOffset, Size - LastOffset); + + for (const auto &R : Ranges) { + R.Inst->eraseFromParent(); + } + } + + void emitZeroes(IRBuilder<> &IRB, uint64_t Offset, uint64_t Size) { + LLVM_DEBUG(dbgs() << " [" << Offset << ", " << Offset + Size + << ") zero\n"); + Value *Ptr = BasePtr; + if (Offset) + Ptr = IRB.CreateConstGEP1_32(Ptr, Offset); + IRB.CreateCall(SetTagZeroFn, + {Ptr, ConstantInt::get(IRB.getInt64Ty(), Size)}); + } + + void emitUndef(IRBuilder<> &IRB, uint64_t Offset, uint64_t Size) { + LLVM_DEBUG(dbgs() << " [" << Offset << ", " << Offset + Size + << ") undef\n"); + Value *Ptr = BasePtr; + if (Offset) + Ptr = IRB.CreateConstGEP1_32(Ptr, Offset); + IRB.CreateCall(SetTagFn, {Ptr, ConstantInt::get(IRB.getInt64Ty(), Size)}); + } + + void emitPair(IRBuilder<> &IRB, uint64_t Offset, Value *A, Value *B) { + LLVM_DEBUG(dbgs() << " [" << Offset << ", " << Offset + 16 << "):\n"); + LLVM_DEBUG(dbgs() << " " << *A << "\n " << *B << "\n"); + Value *Ptr = BasePtr; + if (Offset) + Ptr = IRB.CreateConstGEP1_32(Ptr, Offset); + IRB.CreateCall(StgpFn, {Ptr, A, B}); + } + + Value *flatten(IRBuilder<> &IRB, Value *V) { + if (V->getType()->isIntegerTy()) + return V; + // vector of pointers -> vector of ints + if (VectorType *VecTy = dyn_cast(V->getType())) { + LLVMContext &Ctx = IRB.getContext(); + Type *EltTy = VecTy->getElementType(); + if (EltTy->isPointerTy()) { + uint32_t EltSize = DL->getTypeSizeInBits(EltTy); + Type *NewTy = VectorType::get(IntegerType::get(Ctx, EltSize), + VecTy->getNumElements()); + V = IRB.CreatePointerCast(V, NewTy); + } + } + return IRB.CreateBitOrPointerCast( + V, IRB.getIntNTy(DL->getTypeStoreSize(V->getType()) * 8)); + } +}; + class AArch64StackTagging : public FunctionPass { struct AllocaInfo { AllocaInst *AI; @@ -67,10 +274,15 @@ class AArch64StackTagging : public FunctionPass { int Tag; // -1 for non-tagged allocations }; + bool MergeInit; + public: static char ID; // Pass ID, replacement for typeid - AArch64StackTagging() : FunctionPass(ID) { + AArch64StackTagging(bool MergeInit = true) + : FunctionPass(ID), + MergeInit(ClMergeInit.getNumOccurrences() > 0 ? 
ClMergeInit + : MergeInit) { initializeAArch64StackTaggingPass(*PassRegistry::getPassRegistry()); } @@ -81,6 +293,9 @@ public: uint64_t Size); void untagAlloca(AllocaInst *AI, Instruction *InsertBefore, uint64_t Size); + Instruction *collectInitializers(Instruction *StartInst, Value *StartPtr, + uint64_t Size, InitializerBuilder &IB); + Instruction * insertBaseTaggedPointer(const MapVector &Allocas, const DominatorTree *DT); @@ -92,9 +307,12 @@ private: Function *F; Function *SetTagFunc; const DataLayout *DL; + AAResults *AA; void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); + if (MergeInit) + AU.addRequired(); } }; @@ -107,8 +325,68 @@ INITIALIZE_PASS_BEGIN(AArch64StackTagging, DEBUG_TYPE, "AArch64 Stack Tagging", INITIALIZE_PASS_END(AArch64StackTagging, DEBUG_TYPE, "AArch64 Stack Tagging", false, false) -FunctionPass *llvm::createAArch64StackTaggingPass() { - return new AArch64StackTagging(); +FunctionPass *llvm::createAArch64StackTaggingPass(bool MergeInit) { + return new AArch64StackTagging(MergeInit); +} + +Instruction *AArch64StackTagging::collectInitializers(Instruction *StartInst, + Value *StartPtr, + uint64_t Size, + InitializerBuilder &IB) { + MemoryLocation AllocaLoc{StartPtr, Size}; + Instruction *LastInst = StartInst; + BasicBlock::iterator BI(StartInst); + + unsigned Count = 0; + for (; Count < ClScanLimit && !BI->isTerminator(); ++BI) { + if (!isa(*BI)) + ++Count; + + if (isNoModRef(AA->getModRefInfo(&*BI, AllocaLoc))) + continue; + + if (!isa(BI) && !isa(BI)) { + // If the instruction is readnone, ignore it, otherwise bail out. We + // don't even allow readonly here because we don't want something like: + // A[1] = 2; strlen(A); A[2] = 2; -> memcpy(A, ...); strlen(A). + if (BI->mayWriteToMemory() || BI->mayReadFromMemory()) + break; + continue; + } + + if (StoreInst *NextStore = dyn_cast(BI)) { + if (!NextStore->isSimple()) + break; + + // Check to see if this store is to a constant offset from the start ptr. + Optional Offset = + isPointerOffset(StartPtr, NextStore->getPointerOperand(), *DL); + if (!Offset) + break; + + if (!IB.addStore(*Offset, NextStore, DL)) + break; + LastInst = NextStore; + } else { + MemSetInst *MSI = cast(BI); + + if (MSI->isVolatile() || !isa(MSI->getLength())) + break; + + if (!isa(MSI->getValue())) + break; + + // Check to see if this store is to a constant offset from the start ptr. + Optional Offset = isPointerOffset(StartPtr, MSI->getDest(), *DL); + if (!Offset) + break; + + if (!IB.addMemSet(*Offset, MSI)) + break; + LastInst = MSI; + } + } + return LastInst; } bool AArch64StackTagging::isInterestingAlloca(const AllocaInst &AI) { @@ -127,8 +405,23 @@ bool AArch64StackTagging::isInterestingAlloca(const AllocaInst &AI) { void AArch64StackTagging::tagAlloca(AllocaInst *AI, Instruction *InsertBefore, Value *Ptr, uint64_t Size) { + auto SetTagZeroFunc = + Intrinsic::getDeclaration(F->getParent(), Intrinsic::aarch64_settag_zero); + auto StgpFunc = + Intrinsic::getDeclaration(F->getParent(), Intrinsic::aarch64_stgp); + + InitializerBuilder IB(Size, DL, Ptr, SetTagFunc, SetTagZeroFunc, StgpFunc); + bool LittleEndian = + Triple(AI->getModule()->getTargetTriple()).isLittleEndian(); + // Current implementation of initializer merging assumes little endianness. 
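// A standalone, hypothetical sketch (not taken from this patch) of why the
// merge guarded below is restricted to little-endian targets: the combined
// 64-bit slot is built by zero-extending each store and shifting it left by
// its byte offset times eight, which reproduces the bytes the stores would
// have written only when the low byte is stored first.
#include <cassert>
#include <cstdint>
#include <cstring>

int main() {
  uint32_t Lo = 0x11223344, Hi = 0x55667788;
  // Slot value the merge would materialize for stores at offsets 0 and 4.
  uint64_t Slot = (uint64_t)Lo | ((uint64_t)Hi << 32);

  // The same two stores performed directly on the host.
  unsigned char Mem[8];
  std::memcpy(Mem + 0, &Lo, 4);
  std::memcpy(Mem + 4, &Hi, 4);
  uint64_t Direct;
  std::memcpy(&Direct, Mem, 8);

  uint16_t Probe = 1;
  unsigned char FirstByte;
  std::memcpy(&FirstByte, &Probe, 1);
  if (FirstByte == 1) // little-endian host: the two images agree
    assert(Direct == Slot);
  return 0;
}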
+ if (MergeInit && !F->hasOptNone() && LittleEndian) { + LLVM_DEBUG(dbgs() << "collecting initializers for " << *AI + << ", size = " << Size << "\n"); + InsertBefore = collectInitializers(InsertBefore, Ptr, Size, IB); + } + IRBuilder<> IRB(InsertBefore); - IRB.CreateCall(SetTagFunc, {Ptr, ConstantInt::get(IRB.getInt64Ty(), Size)}); + IB.generate(IRB); } void AArch64StackTagging::untagAlloca(AllocaInst *AI, Instruction *InsertBefore, @@ -166,7 +459,8 @@ Instruction *AArch64StackTagging::insertBaseTaggedPointer( } void AArch64StackTagging::alignAndPadAlloca(AllocaInfo &Info) { - unsigned NewAlignment = std::max(Info.AI->getAlignment(), kTagGranuleSize); + const Align NewAlignment = + max(MaybeAlign(Info.AI->getAlignment()), kTagGranuleSize); Info.AI->setAlignment(NewAlignment); uint64_t Size = Info.AI->getAllocationSizeInBits(*DL).getValue() / 8; @@ -179,7 +473,7 @@ void AArch64StackTagging::alignAndPadAlloca(AllocaInfo &Info) { Info.AI->isArrayAllocation() ? ArrayType::get( Info.AI->getAllocatedType(), - dyn_cast(Info.AI->getArraySize())->getZExtValue()) + cast(Info.AI->getArraySize())->getZExtValue()) : Info.AI->getAllocatedType(); Type *PaddingType = ArrayType::get(Type::getInt8Ty(F->getContext()), AlignedSize - Size); @@ -187,7 +481,7 @@ void AArch64StackTagging::alignAndPadAlloca(AllocaInfo &Info) { auto *NewAI = new AllocaInst( TypeWithPadding, Info.AI->getType()->getAddressSpace(), nullptr, "", Info.AI); NewAI->takeName(Info.AI); - NewAI->setAlignment(Info.AI->getAlignment()); + NewAI->setAlignment(MaybeAlign(Info.AI->getAlignment())); NewAI->setUsedWithInAlloca(Info.AI->isUsedWithInAlloca()); NewAI->setSwiftError(Info.AI->isSwiftError()); NewAI->copyMetadata(*Info.AI); @@ -198,6 +492,24 @@ void AArch64StackTagging::alignAndPadAlloca(AllocaInfo &Info) { Info.AI = NewAI; } +// Helper function to check for post-dominance. 
+static bool postDominates(const PostDominatorTree *PDT, const IntrinsicInst *A, + const IntrinsicInst *B) { + const BasicBlock *ABB = A->getParent(); + const BasicBlock *BBB = B->getParent(); + + if (ABB != BBB) + return PDT->dominates(ABB, BBB); + + for (const Instruction &I : *ABB) { + if (&I == B) + return true; + if (&I == A) + return false; + } + llvm_unreachable("Corrupt instruction list"); +} + // FIXME: check for MTE extension bool AArch64StackTagging::runOnFunction(Function &Fn) { if (!Fn.hasFnAttribute(Attribute::SanitizeMemTag)) @@ -205,6 +517,8 @@ bool AArch64StackTagging::runOnFunction(Function &Fn) { F = &Fn; DL = &Fn.getParent()->getDataLayout(); + if (MergeInit) + AA = &getAnalysis().getAAResults(); MapVector Allocas; // need stable iteration order SmallVector RetVec; @@ -270,23 +584,31 @@ bool AArch64StackTagging::runOnFunction(Function &Fn) { if (NumInterestingAllocas == 0) return true; + std::unique_ptr DeleteDT; + DominatorTree *DT = nullptr; + if (auto *P = getAnalysisIfAvailable()) + DT = &P->getDomTree(); + + if (DT == nullptr && (NumInterestingAllocas > 1 || + !F->hasFnAttribute(Attribute::OptimizeNone))) { + DeleteDT = std::make_unique(*F); + DT = DeleteDT.get(); + } + + std::unique_ptr DeletePDT; + PostDominatorTree *PDT = nullptr; + if (auto *P = getAnalysisIfAvailable()) + PDT = &P->getPostDomTree(); + + if (PDT == nullptr && !F->hasFnAttribute(Attribute::OptimizeNone)) { + DeletePDT = std::make_unique(*F); + PDT = DeletePDT.get(); + } + SetTagFunc = Intrinsic::getDeclaration(F->getParent(), Intrinsic::aarch64_settag); - // Compute DT only if the function has the attribute, there are more than 1 - // interesting allocas, and it is not available for free. - Instruction *Base; - if (NumInterestingAllocas > 1) { - auto *DTWP = getAnalysisIfAvailable(); - if (DTWP) { - Base = insertBaseTaggedPointer(Allocas, &DTWP->getDomTree()); - } else { - DominatorTree DT(*F); - Base = insertBaseTaggedPointer(Allocas, &DT); - } - } else { - Base = insertBaseTaggedPointer(Allocas, nullptr); - } + Instruction *Base = insertBaseTaggedPointer(Allocas, DT); for (auto &I : Allocas) { const AllocaInfo &Info = I.second; @@ -309,11 +631,37 @@ bool AArch64StackTagging::runOnFunction(Function &Fn) { if (UnrecognizedLifetimes.empty() && Info.LifetimeStart.size() == 1 && Info.LifetimeEnd.size() == 1) { IntrinsicInst *Start = Info.LifetimeStart[0]; + IntrinsicInst *End = Info.LifetimeEnd[0]; uint64_t Size = dyn_cast(Start->getArgOperand(0))->getZExtValue(); Size = alignTo(Size, kTagGranuleSize); tagAlloca(AI, Start->getNextNode(), Start->getArgOperand(1), Size); - untagAlloca(AI, Info.LifetimeEnd[0], Size); + // We need to ensure that if we tag some object, we certainly untag it + // before the function exits. + if (PDT != nullptr && postDominates(PDT, End, Start)) { + untagAlloca(AI, End, Size); + } else { + SmallVector ReachableRetVec; + unsigned NumCoveredExits = 0; + for (auto &RI : RetVec) { + if (!isPotentiallyReachable(Start, RI, nullptr, DT)) + continue; + ReachableRetVec.push_back(RI); + if (DT != nullptr && DT->dominates(End, RI)) + ++NumCoveredExits; + } + // If there's a mix of covered and non-covered exits, just put the untag + // on exits, so we avoid the redundancy of untagging twice. + if (NumCoveredExits == ReachableRetVec.size()) { + untagAlloca(AI, End, Size); + } else { + for (auto &RI : ReachableRetVec) + untagAlloca(AI, RI, Size); + // We may have inserted untag outside of the lifetime interval. + // Remove the lifetime end call for this alloca. 
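// A standalone, hypothetical sketch (not taken from this patch) of the
// untag-placement rule implemented above: untag at the lifetime.end only
// when it is certain to execute after the tagging (it post-dominates the
// start, or it dominates every reachable return); otherwise untag on each
// reachable return and drop the lifetime.end, as the next statement does.
// The function name and its boolean inputs are illustrative only.
#include <cassert>
#include <string>
#include <vector>

// EndPostDominatesStart: every path from the tagging reaches lifetime.end.
// RetDominatedByEnd[i]: reachable return i is always preceded by lifetime.end.
std::string pickUntagPoints(bool EndPostDominatesStart,
                            const std::vector<bool> &RetDominatedByEnd) {
  if (EndPostDominatesStart)
    return "untag at lifetime.end";
  size_t Covered = 0;
  for (bool Dominated : RetDominatedByEnd)
    Covered += Dominated ? 1 : 0;
  if (Covered == RetDominatedByEnd.size())
    return "untag at lifetime.end";
  return "untag at each reachable return";
}

int main() {
  assert(pickUntagPoints(true, {}) == "untag at lifetime.end");
  // One return can be reached without passing lifetime.end: fall back to
  // untagging at the returns so the object is never left tagged on exit.
  assert(pickUntagPoints(false, {true, false}) ==
         "untag at each reachable return");
  return 0;
}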
+ End->eraseFromParent(); + } + } } else { uint64_t Size = Info.AI->getAllocationSizeInBits(*DL).getValue() / 8; Value *Ptr = IRB.CreatePointerCast(TagPCall, IRB.getInt8PtrTy()); diff --git a/lib/Target/AArch64/AArch64StackTaggingPreRA.cpp b/lib/Target/AArch64/AArch64StackTaggingPreRA.cpp new file mode 100644 index 000000000000..3cc556f74aea --- /dev/null +++ b/lib/Target/AArch64/AArch64StackTaggingPreRA.cpp @@ -0,0 +1,209 @@ +//===-- AArch64StackTaggingPreRA.cpp --- Stack Tagging for AArch64 -----===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + + +#include "AArch64.h" +#include "AArch64MachineFunctionInfo.h" +#include "AArch64InstrInfo.h" +#include "llvm/ADT/DepthFirstIterator.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/ADT/MapVector.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/MachineBranchProbabilityInfo.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/MachineTraceMetrics.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/TargetInstrInfo.h" +#include "llvm/CodeGen/TargetRegisterInfo.h" +#include "llvm/CodeGen/TargetSubtargetInfo.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +#define DEBUG_TYPE "aarch64-stack-tagging-pre-ra" + +enum UncheckedLdStMode { UncheckedNever, UncheckedSafe, UncheckedAlways }; + +cl::opt ClUncheckedLdSt( + "stack-tagging-unchecked-ld-st", cl::Hidden, + cl::init(UncheckedSafe), + cl::desc( + "Unconditionally apply unchecked-ld-st optimization (even for large " + "stack frames, or in the presence of variable sized allocas)."), + cl::values( + clEnumValN(UncheckedNever, "never", "never apply unchecked-ld-st"), + clEnumValN( + UncheckedSafe, "safe", + "apply unchecked-ld-st when the target is definitely within range"), + clEnumValN(UncheckedAlways, "always", "always apply unchecked-ld-st"))); + +namespace { + +class AArch64StackTaggingPreRA : public MachineFunctionPass { + MachineFunction *MF; + AArch64FunctionInfo *AFI; + MachineFrameInfo *MFI; + MachineRegisterInfo *MRI; + const AArch64RegisterInfo *TRI; + const AArch64InstrInfo *TII; + + SmallVector ReTags; + +public: + static char ID; + AArch64StackTaggingPreRA() : MachineFunctionPass(ID) { + initializeAArch64StackTaggingPreRAPass(*PassRegistry::getPassRegistry()); + } + + bool mayUseUncheckedLoadStore(); + void uncheckUsesOf(unsigned TaggedReg, int FI); + void uncheckLoadsAndStores(); + + bool runOnMachineFunction(MachineFunction &Func) override; + StringRef getPassName() const override { + return "AArch64 Stack Tagging PreRA"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); + } +}; +} // end anonymous namespace + +char AArch64StackTaggingPreRA::ID = 0; + +INITIALIZE_PASS_BEGIN(AArch64StackTaggingPreRA, "aarch64-stack-tagging-pre-ra", + "AArch64 Stack Tagging PreRA Pass", false, false) +INITIALIZE_PASS_END(AArch64StackTaggingPreRA, "aarch64-stack-tagging-pre-ra", + "AArch64 Stack Tagging PreRA Pass", false, false) + +FunctionPass 
*llvm::createAArch64StackTaggingPreRAPass() { + return new AArch64StackTaggingPreRA(); +} + +static bool isUncheckedLoadOrStoreOpcode(unsigned Opcode) { + switch (Opcode) { + case AArch64::LDRWui: + case AArch64::LDRSHWui: + case AArch64::LDRXui: + case AArch64::LDRBui: + case AArch64::LDRBBui: + case AArch64::LDRHui: + case AArch64::LDRSui: + case AArch64::LDRDui: + case AArch64::LDRQui: + case AArch64::STRWui: + case AArch64::STRXui: + case AArch64::STRBui: + case AArch64::STRBBui: + case AArch64::STRHui: + case AArch64::STRSui: + case AArch64::STRDui: + case AArch64::STRQui: + return true; + default: + return false; + } +} + +bool AArch64StackTaggingPreRA::mayUseUncheckedLoadStore() { + if (ClUncheckedLdSt == UncheckedNever) + return false; + else if (ClUncheckedLdSt == UncheckedAlways) + return true; + + // This estimate can be improved if we had harder guarantees about stack frame + // layout. With LocalStackAllocation we can estimate SP offset to any + // preallocated slot. AArch64FrameLowering::orderFrameObjects could put tagged + // objects ahead of non-tagged ones, but that's not always desirable. + // + // Underestimating SP offset here may require the use of LDG to materialize + // the tagged address of the stack slot, along with a scratch register + // allocation (post-regalloc!). + // + // For now we do the safe thing here and require that the entire stack frame + // is within range of the shortest of the unchecked instructions. + unsigned FrameSize = 0; + for (unsigned i = 0, e = MFI->getObjectIndexEnd(); i != e; ++i) + FrameSize += MFI->getObjectSize(i); + bool EntireFrameReachableFromSP = FrameSize < 0xf00; + return !MFI->hasVarSizedObjects() && EntireFrameReachableFromSP; +} + +void AArch64StackTaggingPreRA::uncheckUsesOf(unsigned TaggedReg, int FI) { + for (auto UI = MRI->use_instr_begin(TaggedReg), E = MRI->use_instr_end(); + UI != E;) { + MachineInstr *UseI = &*(UI++); + if (isUncheckedLoadOrStoreOpcode(UseI->getOpcode())) { + // FI operand is always the one before the immediate offset. + unsigned OpIdx = TII->getLoadStoreImmIdx(UseI->getOpcode()) - 1; + if (UseI->getOperand(OpIdx).isReg() && + UseI->getOperand(OpIdx).getReg() == TaggedReg) { + UseI->getOperand(OpIdx).ChangeToFrameIndex(FI); + UseI->getOperand(OpIdx).setTargetFlags(AArch64II::MO_TAGGED); + } + } else if (UseI->isCopy() && + Register::isVirtualRegister(UseI->getOperand(0).getReg())) { + uncheckUsesOf(UseI->getOperand(0).getReg(), FI); + } + } +} + +void AArch64StackTaggingPreRA::uncheckLoadsAndStores() { + for (auto *I : ReTags) { + unsigned TaggedReg = I->getOperand(0).getReg(); + int FI = I->getOperand(1).getIndex(); + uncheckUsesOf(TaggedReg, FI); + } +} + +bool AArch64StackTaggingPreRA::runOnMachineFunction(MachineFunction &Func) { + MF = &Func; + MRI = &MF->getRegInfo(); + AFI = MF->getInfo(); + TII = static_cast(MF->getSubtarget().getInstrInfo()); + TRI = static_cast( + MF->getSubtarget().getRegisterInfo()); + MFI = &MF->getFrameInfo(); + ReTags.clear(); + + assert(MRI->isSSA()); + + LLVM_DEBUG(dbgs() << "********** AArch64 Stack Tagging PreRA **********\n" + << "********** Function: " << MF->getName() << '\n'); + + SmallSetVector TaggedSlots; + for (auto &BB : *MF) { + for (auto &I : BB) { + if (I.getOpcode() == AArch64::TAGPstack) { + ReTags.push_back(&I); + int FI = I.getOperand(1).getIndex(); + TaggedSlots.insert(FI); + // There should be no offsets in TAGP yet. 
+ assert(I.getOperand(2).getImm() == 0); + } + } + } + + if (ReTags.empty()) + return false; + + if (mayUseUncheckedLoadStore()) + uncheckLoadsAndStores(); + + return true; +} diff --git a/lib/Target/AArch64/AArch64StorePairSuppress.cpp b/lib/Target/AArch64/AArch64StorePairSuppress.cpp index 0e84a00df006..5deb601822b8 100644 --- a/lib/Target/AArch64/AArch64StorePairSuppress.cpp +++ b/lib/Target/AArch64/AArch64StorePairSuppress.cpp @@ -151,7 +151,7 @@ bool AArch64StorePairSuppress::runOnMachineFunction(MachineFunction &MF) { int64_t Offset; if (TII->getMemOperandWithOffset(MI, BaseOp, Offset, TRI) && BaseOp->isReg()) { - unsigned BaseReg = BaseOp->getReg(); + Register BaseReg = BaseOp->getReg(); if (PrevBaseReg == BaseReg) { // If this block can take STPs, skip ahead to the next block. if (!SuppressSTP && shouldAddSTPToBlock(MI.getParent())) diff --git a/lib/Target/AArch64/AArch64Subtarget.cpp b/lib/Target/AArch64/AArch64Subtarget.cpp index 3bc89b91c3f7..558bea368eff 100644 --- a/lib/Target/AArch64/AArch64Subtarget.cpp +++ b/lib/Target/AArch64/AArch64Subtarget.cpp @@ -71,19 +71,22 @@ void AArch64Subtarget::initializeProperties() { case CortexA35: break; case CortexA53: - PrefFunctionAlignment = 3; + PrefFunctionLogAlignment = 3; break; case CortexA55: break; case CortexA57: MaxInterleaveFactor = 4; - PrefFunctionAlignment = 4; + PrefFunctionLogAlignment = 4; + break; + case CortexA65: + PrefFunctionLogAlignment = 3; break; case CortexA72: case CortexA73: case CortexA75: case CortexA76: - PrefFunctionAlignment = 4; + PrefFunctionLogAlignment = 4; break; case Cyclone: CacheLineSize = 64; @@ -94,14 +97,14 @@ void AArch64Subtarget::initializeProperties() { case ExynosM1: MaxInterleaveFactor = 4; MaxJumpTableSize = 8; - PrefFunctionAlignment = 4; - PrefLoopAlignment = 3; + PrefFunctionLogAlignment = 4; + PrefLoopLogAlignment = 3; break; case ExynosM3: MaxInterleaveFactor = 4; MaxJumpTableSize = 20; - PrefFunctionAlignment = 5; - PrefLoopAlignment = 4; + PrefFunctionLogAlignment = 5; + PrefLoopLogAlignment = 4; break; case Falkor: MaxInterleaveFactor = 4; @@ -122,6 +125,12 @@ void AArch64Subtarget::initializeProperties() { // FIXME: remove this to enable 64-bit SLP if performance looks good. MinVectorRegisterBitWidth = 128; break; + case NeoverseE1: + PrefFunctionLogAlignment = 3; + break; + case NeoverseN1: + PrefFunctionLogAlignment = 4; + break; case Saphira: MaxInterleaveFactor = 4; // FIXME: remove this to enable 64-bit SLP if performance looks good. @@ -129,8 +138,8 @@ void AArch64Subtarget::initializeProperties() { break; case ThunderX2T99: CacheLineSize = 64; - PrefFunctionAlignment = 3; - PrefLoopAlignment = 2; + PrefFunctionLogAlignment = 3; + PrefLoopLogAlignment = 2; MaxInterleaveFactor = 4; PrefetchDistance = 128; MinPrefetchStride = 1024; @@ -143,15 +152,15 @@ void AArch64Subtarget::initializeProperties() { case ThunderXT81: case ThunderXT83: CacheLineSize = 128; - PrefFunctionAlignment = 3; - PrefLoopAlignment = 2; + PrefFunctionLogAlignment = 3; + PrefLoopLogAlignment = 2; // FIXME: remove this to enable 64-bit SLP if performance looks good. 
MinVectorRegisterBitWidth = 128; break; case TSV110: CacheLineSize = 64; - PrefFunctionAlignment = 4; - PrefLoopAlignment = 2; + PrefFunctionLogAlignment = 4; + PrefLoopLogAlignment = 2; break; } } @@ -187,7 +196,7 @@ const CallLowering *AArch64Subtarget::getCallLowering() const { return CallLoweringInfo.get(); } -const InstructionSelector *AArch64Subtarget::getInstructionSelector() const { +InstructionSelector *AArch64Subtarget::getInstructionSelector() const { return InstSelector.get(); } @@ -201,7 +210,7 @@ const RegisterBankInfo *AArch64Subtarget::getRegBankInfo() const { /// Find the target operand flags that describe how a global value should be /// referenced for the current subtarget. -unsigned char +unsigned AArch64Subtarget::ClassifyGlobalReference(const GlobalValue *GV, const TargetMachine &TM) const { // MachO large model always goes via a GOT, simply to get a single 8-byte @@ -224,10 +233,17 @@ AArch64Subtarget::ClassifyGlobalReference(const GlobalValue *GV, GV->hasExternalWeakLinkage()) return AArch64II::MO_GOT; + // References to tagged globals are marked with MO_NC | MO_TAGGED to indicate + // that their nominal addresses are tagged and outside of the code model. In + // AArch64ExpandPseudo::expandMI we emit an additional instruction to set the + // tag if necessary based on MO_TAGGED. + if (AllowTaggedGlobals && !isa(GV->getValueType())) + return AArch64II::MO_NC | AArch64II::MO_TAGGED; + return AArch64II::MO_NO_FLAG; } -unsigned char AArch64Subtarget::classifyGlobalFunctionReference( +unsigned AArch64Subtarget::classifyGlobalFunctionReference( const GlobalValue *GV, const TargetMachine &TM) const { // MachO large model always goes via a GOT, because we don't have the // relocations available to do anything else.. @@ -275,7 +291,7 @@ bool AArch64Subtarget::supportsAddressTopByteIgnored() const { std::unique_ptr AArch64Subtarget::getCustomPBQPConstraints() const { - return balanceFPOps() ? llvm::make_unique() : nullptr; + return balanceFPOps() ? std::make_unique() : nullptr; } void AArch64Subtarget::mirFileLoaded(MachineFunction &MF) const { diff --git a/lib/Target/AArch64/AArch64Subtarget.h b/lib/Target/AArch64/AArch64Subtarget.h index 0c84cfb8329a..f3212fae8e5e 100644 --- a/lib/Target/AArch64/AArch64Subtarget.h +++ b/lib/Target/AArch64/AArch64Subtarget.h @@ -42,6 +42,7 @@ public: CortexA53, CortexA55, CortexA57, + CortexA65, CortexA72, CortexA73, CortexA75, @@ -51,6 +52,8 @@ public: ExynosM3, Falkor, Kryo, + NeoverseE1, + NeoverseN1, Saphira, ThunderX2T99, ThunderX, @@ -113,6 +116,7 @@ protected: bool HasTRACEV8_4 = false; bool HasAM = false; bool HasSEL2 = false; + bool HasPMU = false; bool HasTLB_RMI = false; bool HasFMI = false; bool HasRCPC_IMMO = false; @@ -134,6 +138,7 @@ protected: bool HasBTI = false; bool HasRandGen = false; bool HasMTE = false; + bool HasTME = false; // Arm SVE2 extensions bool HasSVE2AES = false; @@ -141,6 +146,10 @@ protected: bool HasSVE2SHA3 = false; bool HasSVE2BitPerm = false; + // Future architecture extensions. + bool HasETE = false; + bool HasTRBE = false; + // HasZeroCycleRegMove - Has zero-cycle register mov instructions. 
bool HasZeroCycleRegMove = false; @@ -183,14 +192,15 @@ protected: bool UseEL1ForTP = false; bool UseEL2ForTP = false; bool UseEL3ForTP = false; + bool AllowTaggedGlobals = false; uint8_t MaxInterleaveFactor = 2; uint8_t VectorInsertExtractBaseCost = 3; uint16_t CacheLineSize = 0; uint16_t PrefetchDistance = 0; uint16_t MinPrefetchStride = 1; unsigned MaxPrefetchIterationsAhead = UINT_MAX; - unsigned PrefFunctionAlignment = 0; - unsigned PrefLoopAlignment = 0; + unsigned PrefFunctionLogAlignment = 0; + unsigned PrefLoopLogAlignment = 0; unsigned MaxJumpTableSize = 0; unsigned WideningBaseCost = 0; @@ -247,7 +257,7 @@ public: return &getInstrInfo()->getRegisterInfo(); } const CallLowering *getCallLowering() const override; - const InstructionSelector *getInstructionSelector() const override; + InstructionSelector *getInstructionSelector() const override; const LegalizerInfo *getLegalizerInfo() const override; const RegisterBankInfo *getRegBankInfo() const override; const Triple &getTargetTriple() const { return TargetTriple; } @@ -344,14 +354,16 @@ public: unsigned getVectorInsertExtractBaseCost() const { return VectorInsertExtractBaseCost; } - unsigned getCacheLineSize() const { return CacheLineSize; } - unsigned getPrefetchDistance() const { return PrefetchDistance; } - unsigned getMinPrefetchStride() const { return MinPrefetchStride; } - unsigned getMaxPrefetchIterationsAhead() const { + unsigned getCacheLineSize() const override { return CacheLineSize; } + unsigned getPrefetchDistance() const override { return PrefetchDistance; } + unsigned getMinPrefetchStride() const override { return MinPrefetchStride; } + unsigned getMaxPrefetchIterationsAhead() const override { return MaxPrefetchIterationsAhead; } - unsigned getPrefFunctionAlignment() const { return PrefFunctionAlignment; } - unsigned getPrefLoopAlignment() const { return PrefLoopAlignment; } + unsigned getPrefFunctionLogAlignment() const { + return PrefFunctionLogAlignment; + } + unsigned getPrefLoopLogAlignment() const { return PrefLoopLogAlignment; } unsigned getMaximumJumpTableSize() const { return MaxJumpTableSize; } @@ -380,6 +392,7 @@ public: bool hasBTI() const { return HasBTI; } bool hasRandGen() const { return HasRandGen; } bool hasMTE() const { return HasMTE; } + bool hasTME() const { return HasTME; } // Arm SVE2 extensions bool hasSVE2AES() const { return HasSVE2AES; } bool hasSVE2SM4() const { return HasSVE2SM4; } @@ -399,6 +412,8 @@ public: bool isTargetELF() const { return TargetTriple.isOSBinFormatELF(); } bool isTargetMachO() const { return TargetTriple.isOSBinFormatMachO(); } + bool isTargetILP32() const { return TargetTriple.isArch32Bit(); } + bool useAA() const override { return UseAA; } bool hasVH() const { return HasVH; } @@ -421,10 +436,17 @@ public: bool hasTRACEV8_4() const { return HasTRACEV8_4; } bool hasAM() const { return HasAM; } bool hasSEL2() const { return HasSEL2; } + bool hasPMU() const { return HasPMU; } bool hasTLB_RMI() const { return HasTLB_RMI; } bool hasFMI() const { return HasFMI; } bool hasRCPC_IMMO() const { return HasRCPC_IMMO; } + bool addrSinkUsingGEPs() const override { + // Keeping GEPs inbounds is important for exploiting AArch64 + // addressing-modes in ILP32 mode. 
+ return useAA() || isTargetILP32(); + } + bool useSmallAddressing() const { switch (TLInfo.getTargetMachine().getCodeModel()) { case CodeModel::Kernel: @@ -443,11 +465,11 @@ public: /// ClassifyGlobalReference - Find the target operand flags that describe /// how a global value should be referenced for the current subtarget. - unsigned char ClassifyGlobalReference(const GlobalValue *GV, - const TargetMachine &TM) const; + unsigned ClassifyGlobalReference(const GlobalValue *GV, + const TargetMachine &TM) const; - unsigned char classifyGlobalFunctionReference(const GlobalValue *GV, - const TargetMachine &TM) const; + unsigned classifyGlobalFunctionReference(const GlobalValue *GV, + const TargetMachine &TM) const; void overrideSchedPolicy(MachineSchedPolicy &Policy, unsigned NumRegionInstrs) const override; diff --git a/lib/Target/AArch64/AArch64SystemOperands.td b/lib/Target/AArch64/AArch64SystemOperands.td index 536a6591478b..05249a4ea6a8 100644 --- a/lib/Target/AArch64/AArch64SystemOperands.td +++ b/lib/Target/AArch64/AArch64SystemOperands.td @@ -612,6 +612,7 @@ def : ROSysReg<"ISR_EL1", 0b11, 0b000, 0b1100, 0b0001, 0b000>; def : ROSysReg<"CNTPCT_EL0", 0b11, 0b011, 0b1110, 0b0000, 0b001>; def : ROSysReg<"CNTVCT_EL0", 0b11, 0b011, 0b1110, 0b0000, 0b010>; def : ROSysReg<"ID_MMFR4_EL1", 0b11, 0b000, 0b0000, 0b0010, 0b110>; +def : ROSysReg<"ID_MMFR5_EL1", 0b11, 0b000, 0b0000, 0b0011, 0b110>; // Trace registers // Op0 Op1 CRn CRm Op2 @@ -1321,6 +1322,12 @@ def : RWSysReg<"CNTHPS_CTL_EL2", 0b11, 0b100, 0b1110, 0b0101, 0b001>; def : RWSysReg<"SDER32_EL2", 0b11, 0b100, 0b0001, 0b0011, 0b001>; } // FeatureSEL2 +// v8.4a PMU registers +// Op0 Op1 CRn CRm Op2 +let Requires = [{ {AArch64::FeaturePMU} }] in { +def : RWSysReg<"PMMIR_EL1", 0b11, 0b000, 0b1001, 0b1110, 0b110>; +} // FeaturePMU + // v8.4a RAS registers // Op0 Op1 CRn CRm Op2 let Requires = [{ {AArch64::FeatureRASv8_4} }] in { @@ -1452,14 +1459,37 @@ let Requires = [{ {AArch64::FeatureMTE} }] in { def : RWSysReg<"TCO", 0b11, 0b011, 0b0100, 0b0010, 0b111>; def : RWSysReg<"GCR_EL1", 0b11, 0b000, 0b0001, 0b0000, 0b110>; def : RWSysReg<"RGSR_EL1", 0b11, 0b000, 0b0001, 0b0000, 0b101>; -def : RWSysReg<"TFSR_EL1", 0b11, 0b000, 0b0110, 0b0101, 0b000>; -def : RWSysReg<"TFSR_EL2", 0b11, 0b100, 0b0110, 0b0101, 0b000>; -def : RWSysReg<"TFSR_EL3", 0b11, 0b110, 0b0110, 0b0110, 0b000>; -def : RWSysReg<"TFSR_EL12", 0b11, 0b101, 0b0110, 0b0110, 0b000>; -def : RWSysReg<"TFSRE0_EL1", 0b11, 0b000, 0b0110, 0b0110, 0b001>; +def : RWSysReg<"TFSR_EL1", 0b11, 0b000, 0b0101, 0b0110, 0b000>; +def : RWSysReg<"TFSR_EL2", 0b11, 0b100, 0b0101, 0b0110, 0b000>; +def : RWSysReg<"TFSR_EL3", 0b11, 0b110, 0b0101, 0b0110, 0b000>; +def : RWSysReg<"TFSR_EL12", 0b11, 0b101, 0b0101, 0b0110, 0b000>; +def : RWSysReg<"TFSRE0_EL1", 0b11, 0b000, 0b0101, 0b0110, 0b001>; def : ROSysReg<"GMID_EL1", 0b11, 0b001, 0b0000, 0b0000, 0b100>; } // HasMTE +// Embedded Trace Extension R/W System registers +let Requires = [{ {AArch64::FeatureETE} }] in { +// Name Op0 Op1 CRn CRm Op2 +def : RWSysReg<"TRCRSR", 0b10, 0b001, 0b0000, 0b1010, 0b000>; +// TRCEXTINSELR0 has the same encoding as ETM TRCEXTINSELR +def : RWSysReg<"TRCEXTINSELR0", 0b10, 0b001, 0b0000, 0b1000, 0b100>; +def : RWSysReg<"TRCEXTINSELR1", 0b10, 0b001, 0b0000, 0b1001, 0b100>; +def : RWSysReg<"TRCEXTINSELR2", 0b10, 0b001, 0b0000, 0b1010, 0b100>; +def : RWSysReg<"TRCEXTINSELR3", 0b10, 0b001, 0b0000, 0b1011, 0b100>; +} // FeatureETE + +// Trace Buffer Extension System registers +let Requires = [{ {AArch64::FeatureTRBE} }] in { +// Name 
Op0 Op1 CRn CRm Op2 +def : RWSysReg<"TRBLIMITR_EL1", 0b11, 0b000, 0b1001, 0b1011, 0b000>; +def : RWSysReg<"TRBPTR_EL1", 0b11, 0b000, 0b1001, 0b1011, 0b001>; +def : RWSysReg<"TRBBASER_EL1", 0b11, 0b000, 0b1001, 0b1011, 0b010>; +def : RWSysReg<"TRBSR_EL1", 0b11, 0b000, 0b1001, 0b1011, 0b011>; +def : RWSysReg<"TRBMAR_EL1", 0b11, 0b000, 0b1001, 0b1011, 0b100>; +def : RWSysReg<"TRBTRG_EL1", 0b11, 0b000, 0b1001, 0b1011, 0b110>; +def : ROSysReg<"TRBIDR_EL1", 0b11, 0b000, 0b1001, 0b1011, 0b111>; +} // FeatureTRBE + // Cyclone specific system registers // Op0 Op1 CRn CRm Op2 let Requires = [{ {AArch64::ProcCyclone} }] in diff --git a/lib/Target/AArch64/AArch64TargetMachine.cpp b/lib/Target/AArch64/AArch64TargetMachine.cpp index 865461480499..b3ed96e815be 100644 --- a/lib/Target/AArch64/AArch64TargetMachine.cpp +++ b/lib/Target/AArch64/AArch64TargetMachine.cpp @@ -157,6 +157,8 @@ extern "C" void LLVMInitializeAArch64Target() { RegisterTargetMachine X(getTheAArch64leTarget()); RegisterTargetMachine Y(getTheAArch64beTarget()); RegisterTargetMachine Z(getTheARM64Target()); + RegisterTargetMachine W(getTheARM64_32Target()); + RegisterTargetMachine V(getTheAArch64_32Target()); auto PR = PassRegistry::getPassRegistry(); initializeGlobalISel(*PR); initializeAArch64A53Fix835769Pass(*PR); @@ -180,6 +182,7 @@ extern "C" void LLVMInitializeAArch64Target() { initializeLDTLSCleanupPass(*PR); initializeAArch64SpeculationHardeningPass(*PR); initializeAArch64StackTaggingPass(*PR); + initializeAArch64StackTaggingPreRAPass(*PR); } //===----------------------------------------------------------------------===// @@ -187,11 +190,11 @@ extern "C" void LLVMInitializeAArch64Target() { //===----------------------------------------------------------------------===// static std::unique_ptr createTLOF(const Triple &TT) { if (TT.isOSBinFormatMachO()) - return llvm::make_unique(); + return std::make_unique(); if (TT.isOSBinFormatCOFF()) - return llvm::make_unique(); + return std::make_unique(); - return llvm::make_unique(); + return std::make_unique(); } // Helper function to build a DataLayout string @@ -200,8 +203,11 @@ static std::string computeDataLayout(const Triple &TT, bool LittleEndian) { if (Options.getABIName() == "ilp32") return "e-m:e-p:32:32-i8:8-i16:16-i64:64-S128"; - if (TT.isOSBinFormatMachO()) + if (TT.isOSBinFormatMachO()) { + if (TT.getArch() == Triple::aarch64_32) + return "e-m:o-p:32:32-i64:64-i128:128-n32:64-S128"; return "e-m:o-i64:64-i128:128-n32:64-S128"; + } if (TT.isOSBinFormatCOFF()) return "e-m:w-p:64:64-i32:32-i64:64-i128:128-n32:64-S128"; if (LittleEndian) @@ -277,8 +283,11 @@ AArch64TargetMachine::AArch64TargetMachine(const Target &T, const Triple &TT, this->Options.TrapUnreachable = true; } - // Enable GlobalISel at or below EnableGlobalISelAt0. - if (getOptLevel() <= EnableGlobalISelAtO) { + // Enable GlobalISel at or below EnableGlobalISelAt0, unless this is + // MachO/CodeModel::Large, which GlobalISel does not support. + if (getOptLevel() <= EnableGlobalISelAtO && + TT.getArch() != Triple::aarch64_32 && + !(getCodeModel() == CodeModel::Large && TT.isOSBinFormatMachO())) { setGlobalISel(true); setGlobalISelAbort(GlobalISelAbortMode::Disable); } @@ -310,7 +319,7 @@ AArch64TargetMachine::getSubtargetImpl(const Function &F) const { // creation will depend on the TM and the code generation flags on the // function that reside in TargetOptions. 
resetTargetOptions(F); - I = llvm::make_unique(TargetTriple, CPU, FS, *this, + I = std::make_unique(TargetTriple, CPU, FS, *this, isLittle); } return I.get(); @@ -448,7 +457,8 @@ void AArch64PassConfig::addIRPasses() { addPass(createLICMPass()); } - addPass(createAArch64StackTaggingPass()); + addPass(createAArch64StackTaggingPass(/* MergeInit = */ TM->getOptLevel() != + CodeGenOpt::None)); } // Pass Pipeline Configuration @@ -502,7 +512,8 @@ bool AArch64PassConfig::addIRTranslator() { } void AArch64PassConfig::addPreLegalizeMachineIR() { - addPass(createAArch64PreLegalizeCombiner()); + bool IsOptNone = getOptLevel() == CodeGenOpt::None; + addPass(createAArch64PreLegalizeCombiner(IsOptNone)); } bool AArch64PassConfig::addLegalizeMachineIR() { @@ -516,9 +527,7 @@ bool AArch64PassConfig::addRegBankSelect() { } void AArch64PassConfig::addPreGlobalInstructionSelect() { - // Workaround the deficiency of the fast register allocator. - if (TM->getOptLevel() == CodeGenOpt::None) - addPass(new Localizer()); + addPass(new Localizer()); } bool AArch64PassConfig::addGlobalInstructionSelect() { @@ -540,6 +549,8 @@ bool AArch64PassConfig::addILPOpts() { if (EnableStPairSuppress) addPass(createAArch64StorePairSuppressPass()); addPass(createAArch64SIMDInstrOptPass()); + if (TM->getOptLevel() != CodeGenOpt::None) + addPass(createAArch64StackTaggingPreRAPass()); return true; } diff --git a/lib/Target/AArch64/AArch64TargetObjectFile.cpp b/lib/Target/AArch64/AArch64TargetObjectFile.cpp index 1c3d5d0743ad..54562094fcf5 100644 --- a/lib/Target/AArch64/AArch64TargetObjectFile.cpp +++ b/lib/Target/AArch64/AArch64TargetObjectFile.cpp @@ -59,8 +59,8 @@ MCSymbol *AArch64_MachoTargetObjectFile::getCFIPersonalitySymbol( } const MCExpr *AArch64_MachoTargetObjectFile::getIndirectSymViaGOTPCRel( - const MCSymbol *Sym, const MCValue &MV, int64_t Offset, - MachineModuleInfo *MMI, MCStreamer &Streamer) const { + const GlobalValue *GV, const MCSymbol *Sym, const MCValue &MV, + int64_t Offset, MachineModuleInfo *MMI, MCStreamer &Streamer) const { assert((Offset+MV.getConstant() == 0) && "Arch64 does not support GOT PC rel with extra offset"); // On ARM64 Darwin, we can reference symbols with foo@GOT-., which diff --git a/lib/Target/AArch64/AArch64TargetObjectFile.h b/lib/Target/AArch64/AArch64TargetObjectFile.h index 7ead363d42fe..1cb4c028c80d 100644 --- a/lib/Target/AArch64/AArch64TargetObjectFile.h +++ b/lib/Target/AArch64/AArch64TargetObjectFile.h @@ -35,7 +35,8 @@ public: const TargetMachine &TM, MachineModuleInfo *MMI) const override; - const MCExpr *getIndirectSymViaGOTPCRel(const MCSymbol *Sym, + const MCExpr *getIndirectSymViaGOTPCRel(const GlobalValue *GV, + const MCSymbol *Sym, const MCValue &MV, int64_t Offset, MachineModuleInfo *MMI, MCStreamer &Streamer) const override; diff --git a/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index a4b78f2a7d6b..dc916a7b3407 100644 --- a/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -618,6 +618,19 @@ int AArch64TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, I); } +AArch64TTIImpl::TTI::MemCmpExpansionOptions +AArch64TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const { + TTI::MemCmpExpansionOptions Options; + Options.AllowOverlappingLoads = !ST->requiresStrictAlign(); + Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize); + Options.NumLoadsPerBlock = Options.MaxNumLoads; + // 
TODO: Though vector loads usually perform well on AArch64, in some targets + // they may wake up the FP unit, which raises the power consumption. Perhaps + // they could be used with no holds barred (-O3). + Options.LoadSizes = {8, 4, 2, 1}; + return Options; +} + int AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Ty, unsigned Alignment, unsigned AddressSpace, const Instruction *I) { @@ -879,22 +892,6 @@ bool AArch64TTIImpl::shouldConsiderAddressTypePromotion( return Considerable; } -unsigned AArch64TTIImpl::getCacheLineSize() { - return ST->getCacheLineSize(); -} - -unsigned AArch64TTIImpl::getPrefetchDistance() { - return ST->getPrefetchDistance(); -} - -unsigned AArch64TTIImpl::getMinPrefetchStride() { - return ST->getMinPrefetchStride(); -} - -unsigned AArch64TTIImpl::getMaxPrefetchIterationsAhead() { - return ST->getMaxPrefetchIterationsAhead(); -} - bool AArch64TTIImpl::useReductionIntrinsic(unsigned Opcode, Type *Ty, TTI::ReductionFlags Flags) const { assert(isa(Ty) && "Expected Ty to be a vector type"); diff --git a/lib/Target/AArch64/AArch64TargetTransformInfo.h b/lib/Target/AArch64/AArch64TargetTransformInfo.h index 10c15a139b4c..32c59f41e1c3 100644 --- a/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ b/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -85,7 +85,8 @@ public: bool enableInterleavedAccessVectorization() { return true; } - unsigned getNumberOfRegisters(bool Vector) { + unsigned getNumberOfRegisters(unsigned ClassID) const { + bool Vector = (ClassID == 1); if (Vector) { if (ST->hasNEON()) return 32; @@ -130,6 +131,9 @@ public: int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, const Instruction *I = nullptr); + TTI::MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize, + bool IsZeroCmp) const; + int getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, unsigned AddressSpace, const Instruction *I = nullptr); @@ -153,14 +157,6 @@ public: shouldConsiderAddressTypePromotion(const Instruction &I, bool &AllowPromotionWithoutCommonHeader); - unsigned getCacheLineSize(); - - unsigned getPrefetchDistance(); - - unsigned getMinPrefetchStride(); - - unsigned getMaxPrefetchIterationsAhead(); - bool shouldExpandReduction(const IntrinsicInst *II) const { return false; } diff --git a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp index f4c55d48d215..4fb409f020d9 100644 --- a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp +++ b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp @@ -935,48 +935,34 @@ public: return false; } - bool isMovZSymbolG3() const { - return isMovWSymbol(AArch64MCExpr::VK_ABS_G3); + bool isMovWSymbolG3() const { + return isMovWSymbol({AArch64MCExpr::VK_ABS_G3, AArch64MCExpr::VK_PREL_G3}); } - bool isMovZSymbolG2() const { - return isMovWSymbol({AArch64MCExpr::VK_ABS_G2, AArch64MCExpr::VK_ABS_G2_S, - AArch64MCExpr::VK_TPREL_G2, - AArch64MCExpr::VK_DTPREL_G2}); - } - - bool isMovZSymbolG1() const { - return isMovWSymbol({ - AArch64MCExpr::VK_ABS_G1, AArch64MCExpr::VK_ABS_G1_S, - AArch64MCExpr::VK_GOTTPREL_G1, AArch64MCExpr::VK_TPREL_G1, - AArch64MCExpr::VK_DTPREL_G1, - }); - } - - bool isMovZSymbolG0() const { - return isMovWSymbol({AArch64MCExpr::VK_ABS_G0, AArch64MCExpr::VK_ABS_G0_S, - AArch64MCExpr::VK_TPREL_G0, - AArch64MCExpr::VK_DTPREL_G0}); - } - - bool isMovKSymbolG3() const { - return isMovWSymbol(AArch64MCExpr::VK_ABS_G3); - } - - bool isMovKSymbolG2() const { - return isMovWSymbol(AArch64MCExpr::VK_ABS_G2_NC); + bool isMovWSymbolG2() const { + return 
isMovWSymbol( + {AArch64MCExpr::VK_ABS_G2, AArch64MCExpr::VK_ABS_G2_S, + AArch64MCExpr::VK_ABS_G2_NC, AArch64MCExpr::VK_PREL_G2, + AArch64MCExpr::VK_PREL_G2_NC, AArch64MCExpr::VK_TPREL_G2, + AArch64MCExpr::VK_DTPREL_G2}); } - bool isMovKSymbolG1() const { - return isMovWSymbol({AArch64MCExpr::VK_ABS_G1_NC, - AArch64MCExpr::VK_TPREL_G1_NC, - AArch64MCExpr::VK_DTPREL_G1_NC}); + bool isMovWSymbolG1() const { + return isMovWSymbol( + {AArch64MCExpr::VK_ABS_G1, AArch64MCExpr::VK_ABS_G1_S, + AArch64MCExpr::VK_ABS_G1_NC, AArch64MCExpr::VK_PREL_G1, + AArch64MCExpr::VK_PREL_G1_NC, AArch64MCExpr::VK_GOTTPREL_G1, + AArch64MCExpr::VK_TPREL_G1, AArch64MCExpr::VK_TPREL_G1_NC, + AArch64MCExpr::VK_DTPREL_G1, AArch64MCExpr::VK_DTPREL_G1_NC}); } - bool isMovKSymbolG0() const { + bool isMovWSymbolG0() const { return isMovWSymbol( - {AArch64MCExpr::VK_ABS_G0_NC, AArch64MCExpr::VK_GOTTPREL_G0_NC, - AArch64MCExpr::VK_TPREL_G0_NC, AArch64MCExpr::VK_DTPREL_G0_NC}); + {AArch64MCExpr::VK_ABS_G0, AArch64MCExpr::VK_ABS_G0_S, + AArch64MCExpr::VK_ABS_G0_NC, AArch64MCExpr::VK_PREL_G0, + AArch64MCExpr::VK_PREL_G0_NC, AArch64MCExpr::VK_GOTTPREL_G0_NC, + AArch64MCExpr::VK_TPREL_G0, AArch64MCExpr::VK_TPREL_G0_NC, + AArch64MCExpr::VK_DTPREL_G0, AArch64MCExpr::VK_DTPREL_G0_NC}); } template @@ -1814,7 +1800,7 @@ public: static std::unique_ptr CreateToken(StringRef Str, bool IsSuffix, SMLoc S, MCContext &Ctx) { - auto Op = make_unique(k_Token, Ctx); + auto Op = std::make_unique(k_Token, Ctx); Op->Tok.Data = Str.data(); Op->Tok.Length = Str.size(); Op->Tok.IsSuffix = IsSuffix; @@ -1829,7 +1815,7 @@ public: AArch64_AM::ShiftExtendType ExtTy = AArch64_AM::LSL, unsigned ShiftAmount = 0, unsigned HasExplicitAmount = false) { - auto Op = make_unique(k_Register, Ctx); + auto Op = std::make_unique(k_Register, Ctx); Op->Reg.RegNum = RegNum; Op->Reg.Kind = Kind; Op->Reg.ElementWidth = 0; @@ -1861,7 +1847,7 @@ public: CreateVectorList(unsigned RegNum, unsigned Count, unsigned NumElements, unsigned ElementWidth, RegKind RegisterKind, SMLoc S, SMLoc E, MCContext &Ctx) { - auto Op = make_unique(k_VectorList, Ctx); + auto Op = std::make_unique(k_VectorList, Ctx); Op->VectorList.RegNum = RegNum; Op->VectorList.Count = Count; Op->VectorList.NumElements = NumElements; @@ -1874,7 +1860,7 @@ public: static std::unique_ptr CreateVectorIndex(unsigned Idx, SMLoc S, SMLoc E, MCContext &Ctx) { - auto Op = make_unique(k_VectorIndex, Ctx); + auto Op = std::make_unique(k_VectorIndex, Ctx); Op->VectorIndex.Val = Idx; Op->StartLoc = S; Op->EndLoc = E; @@ -1883,7 +1869,7 @@ public: static std::unique_ptr CreateImm(const MCExpr *Val, SMLoc S, SMLoc E, MCContext &Ctx) { - auto Op = make_unique(k_Immediate, Ctx); + auto Op = std::make_unique(k_Immediate, Ctx); Op->Imm.Val = Val; Op->StartLoc = S; Op->EndLoc = E; @@ -1894,7 +1880,7 @@ public: unsigned ShiftAmount, SMLoc S, SMLoc E, MCContext &Ctx) { - auto Op = make_unique(k_ShiftedImm, Ctx); + auto Op = std::make_unique(k_ShiftedImm, Ctx); Op->ShiftedImm .Val = Val; Op->ShiftedImm.ShiftAmount = ShiftAmount; Op->StartLoc = S; @@ -1904,7 +1890,7 @@ public: static std::unique_ptr CreateCondCode(AArch64CC::CondCode Code, SMLoc S, SMLoc E, MCContext &Ctx) { - auto Op = make_unique(k_CondCode, Ctx); + auto Op = std::make_unique(k_CondCode, Ctx); Op->CondCode.Code = Code; Op->StartLoc = S; Op->EndLoc = E; @@ -1913,7 +1899,7 @@ public: static std::unique_ptr CreateFPImm(APFloat Val, bool IsExact, SMLoc S, MCContext &Ctx) { - auto Op = make_unique(k_FPImm, Ctx); + auto Op = std::make_unique(k_FPImm, Ctx); Op->FPImm.Val 
= Val.bitcastToAPInt().getSExtValue(); Op->FPImm.IsExact = IsExact; Op->StartLoc = S; @@ -1925,7 +1911,7 @@ public: StringRef Str, SMLoc S, MCContext &Ctx) { - auto Op = make_unique(k_Barrier, Ctx); + auto Op = std::make_unique(k_Barrier, Ctx); Op->Barrier.Val = Val; Op->Barrier.Data = Str.data(); Op->Barrier.Length = Str.size(); @@ -1939,7 +1925,7 @@ public: uint32_t MSRReg, uint32_t PStateField, MCContext &Ctx) { - auto Op = make_unique(k_SysReg, Ctx); + auto Op = std::make_unique(k_SysReg, Ctx); Op->SysReg.Data = Str.data(); Op->SysReg.Length = Str.size(); Op->SysReg.MRSReg = MRSReg; @@ -1952,7 +1938,7 @@ public: static std::unique_ptr CreateSysCR(unsigned Val, SMLoc S, SMLoc E, MCContext &Ctx) { - auto Op = make_unique(k_SysCR, Ctx); + auto Op = std::make_unique(k_SysCR, Ctx); Op->SysCRImm.Val = Val; Op->StartLoc = S; Op->EndLoc = E; @@ -1963,7 +1949,7 @@ public: StringRef Str, SMLoc S, MCContext &Ctx) { - auto Op = make_unique(k_Prefetch, Ctx); + auto Op = std::make_unique(k_Prefetch, Ctx); Op->Prefetch.Val = Val; Op->Barrier.Data = Str.data(); Op->Barrier.Length = Str.size(); @@ -1976,7 +1962,7 @@ public: StringRef Str, SMLoc S, MCContext &Ctx) { - auto Op = make_unique(k_PSBHint, Ctx); + auto Op = std::make_unique(k_PSBHint, Ctx); Op->PSBHint.Val = Val; Op->PSBHint.Data = Str.data(); Op->PSBHint.Length = Str.size(); @@ -1989,7 +1975,7 @@ public: StringRef Str, SMLoc S, MCContext &Ctx) { - auto Op = make_unique(k_BTIHint, Ctx); + auto Op = std::make_unique(k_BTIHint, Ctx); Op->BTIHint.Val = Val << 1 | 32; Op->BTIHint.Data = Str.data(); Op->BTIHint.Length = Str.size(); @@ -2001,7 +1987,7 @@ public: static std::unique_ptr CreateShiftExtend(AArch64_AM::ShiftExtendType ShOp, unsigned Val, bool HasExplicitAmount, SMLoc S, SMLoc E, MCContext &Ctx) { - auto Op = make_unique(k_ShiftExtend, Ctx); + auto Op = std::make_unique(k_ShiftExtend, Ctx); Op->ShiftExtend.Type = ShOp; Op->ShiftExtend.Amount = Val; Op->ShiftExtend.HasExplicitAmount = HasExplicitAmount; @@ -2840,7 +2826,7 @@ static const struct Extension { {"sve2-aes", {AArch64::FeatureSVE2AES}}, {"sve2-sm4", {AArch64::FeatureSVE2SM4}}, {"sve2-sha3", {AArch64::FeatureSVE2SHA3}}, - {"bitperm", {AArch64::FeatureSVE2BitPerm}}, + {"sve2-bitperm", {AArch64::FeatureSVE2BitPerm}}, // FIXME: Unsupported extensions {"pan", {}}, {"lor", {}}, @@ -3260,6 +3246,13 @@ bool AArch64AsmParser::parseSymbolicImmVal(const MCExpr *&ImmVal) { .Case("abs_g0", AArch64MCExpr::VK_ABS_G0) .Case("abs_g0_s", AArch64MCExpr::VK_ABS_G0_S) .Case("abs_g0_nc", AArch64MCExpr::VK_ABS_G0_NC) + .Case("prel_g3", AArch64MCExpr::VK_PREL_G3) + .Case("prel_g2", AArch64MCExpr::VK_PREL_G2) + .Case("prel_g2_nc", AArch64MCExpr::VK_PREL_G2_NC) + .Case("prel_g1", AArch64MCExpr::VK_PREL_G1) + .Case("prel_g1_nc", AArch64MCExpr::VK_PREL_G1_NC) + .Case("prel_g0", AArch64MCExpr::VK_PREL_G0) + .Case("prel_g0_nc", AArch64MCExpr::VK_PREL_G0_NC) .Case("dtprel_g2", AArch64MCExpr::VK_DTPREL_G2) .Case("dtprel_g1", AArch64MCExpr::VK_DTPREL_G1) .Case("dtprel_g1_nc", AArch64MCExpr::VK_DTPREL_G1_NC) @@ -5283,7 +5276,7 @@ bool AArch64AsmParser::parseDirectiveInst(SMLoc Loc) { auto parseOp = [&]() -> bool { SMLoc L = getLoc(); - const MCExpr *Expr; + const MCExpr *Expr = nullptr; if (check(getParser().parseExpression(Expr), L, "expected expression")) return true; const MCConstantExpr *Value = dyn_cast_or_null(Expr); @@ -5542,43 +5535,43 @@ unsigned AArch64AsmParser::validateTargetOperandClass(MCParsedAsmOperand &AsmOp, switch (Kind) { default: return Match_InvalidOperand; - case MCK__35_0: + case 
MCK__HASH_0: ExpectedVal = 0; break; - case MCK__35_1: + case MCK__HASH_1: ExpectedVal = 1; break; - case MCK__35_12: + case MCK__HASH_12: ExpectedVal = 12; break; - case MCK__35_16: + case MCK__HASH_16: ExpectedVal = 16; break; - case MCK__35_2: + case MCK__HASH_2: ExpectedVal = 2; break; - case MCK__35_24: + case MCK__HASH_24: ExpectedVal = 24; break; - case MCK__35_3: + case MCK__HASH_3: ExpectedVal = 3; break; - case MCK__35_32: + case MCK__HASH_32: ExpectedVal = 32; break; - case MCK__35_4: + case MCK__HASH_4: ExpectedVal = 4; break; - case MCK__35_48: + case MCK__HASH_48: ExpectedVal = 48; break; - case MCK__35_6: + case MCK__HASH_6: ExpectedVal = 6; break; - case MCK__35_64: + case MCK__HASH_64: ExpectedVal = 64; break; - case MCK__35_8: + case MCK__HASH_8: ExpectedVal = 8; break; } diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp index 6418211a4f55..21ce5785ea5e 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp +++ b/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp @@ -153,9 +153,8 @@ static unsigned AdrImmBits(unsigned Value) { static uint64_t adjustFixupValue(const MCFixup &Fixup, const MCValue &Target, uint64_t Value, MCContext &Ctx, const Triple &TheTriple, bool IsResolved) { - unsigned Kind = Fixup.getKind(); int64_t SignedValue = static_cast(Value); - switch (Kind) { + switch (Fixup.getTargetKind()) { default: llvm_unreachable("Unknown fixup kind!"); case AArch64::fixup_aarch64_pcrel_adr_imm21: @@ -574,7 +573,7 @@ public: case MCCFIInstruction::OpDefCfa: { // Defines a frame pointer. unsigned XReg = - getXRegFromWReg(MRI.getLLVMRegNum(Inst.getRegister(), true)); + getXRegFromWReg(*MRI.getLLVMRegNum(Inst.getRegister(), true)); // Other CFA registers than FP are not supported by compact unwind. // Fallback on DWARF. @@ -593,8 +592,8 @@ public: assert(FPPush.getOperation() == MCCFIInstruction::OpOffset && "Frame pointer not pushed!"); - unsigned LRReg = MRI.getLLVMRegNum(LRPush.getRegister(), true); - unsigned FPReg = MRI.getLLVMRegNum(FPPush.getRegister(), true); + unsigned LRReg = *MRI.getLLVMRegNum(LRPush.getRegister(), true); + unsigned FPReg = *MRI.getLLVMRegNum(FPPush.getRegister(), true); LRReg = getXRegFromWReg(LRReg); FPReg = getXRegFromWReg(FPReg); @@ -615,14 +614,14 @@ public: case MCCFIInstruction::OpOffset: { // Registers are saved in pairs. We expect there to be two consecutive // `.cfi_offset' instructions with the appropriate registers specified. - unsigned Reg1 = MRI.getLLVMRegNum(Inst.getRegister(), true); + unsigned Reg1 = *MRI.getLLVMRegNum(Inst.getRegister(), true); if (i + 1 == e) return CU::UNWIND_ARM64_MODE_DWARF; const MCCFIInstruction &Inst2 = Instrs[++i]; if (Inst2.getOperation() != MCCFIInstruction::OpOffset) return CU::UNWIND_ARM64_MODE_DWARF; - unsigned Reg2 = MRI.getLLVMRegNum(Inst2.getRegister(), true); + unsigned Reg2 = *MRI.getLLVMRegNum(Inst2.getRegister(), true); // N.B. The encodings must be in register number order, and the X // registers before the D registers. 
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp index c871e2c62eac..0fd1ca187be7 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp +++ b/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp @@ -57,7 +57,7 @@ AArch64ELFObjectWriter::AArch64ELFObjectWriter(uint8_t OSABI, bool IsILP32) static bool isNonILP32reloc(const MCFixup &Fixup, AArch64MCExpr::VariantKind RefKind, MCContext &Ctx) { - if ((unsigned)Fixup.getKind() != AArch64::fixup_aarch64_movw) + if (Fixup.getTargetKind() != AArch64::fixup_aarch64_movw) return false; switch (RefKind) { case AArch64MCExpr::VK_ABS_G3: @@ -120,7 +120,7 @@ unsigned AArch64ELFObjectWriter::getRelocType(MCContext &Ctx, "Should only be expression-level modifiers here"); if (IsPCRel) { - switch ((unsigned)Fixup.getKind()) { + switch (Fixup.getTargetKind()) { case FK_Data_1: Ctx.reportError(Fixup.getLoc(), "1-byte data relocations not supported"); return ELF::R_AARCH64_NONE; @@ -184,7 +184,7 @@ unsigned AArch64ELFObjectWriter::getRelocType(MCContext &Ctx, } else { if (IsILP32 && isNonILP32reloc(Fixup, RefKind, Ctx)) return ELF::R_AARCH64_NONE; - switch ((unsigned)Fixup.getKind()) { + switch (Fixup.getTargetKind()) { case FK_NONE: return ELF::R_AARCH64_NONE; case FK_Data_1: @@ -394,6 +394,20 @@ unsigned AArch64ELFObjectWriter::getRelocType(MCContext &Ctx, return R_CLS(MOVW_SABS_G0); if (RefKind == AArch64MCExpr::VK_ABS_G0_NC) return R_CLS(MOVW_UABS_G0_NC); + if (RefKind == AArch64MCExpr::VK_PREL_G3) + return ELF::R_AARCH64_MOVW_PREL_G3; + if (RefKind == AArch64MCExpr::VK_PREL_G2) + return ELF::R_AARCH64_MOVW_PREL_G2; + if (RefKind == AArch64MCExpr::VK_PREL_G2_NC) + return ELF::R_AARCH64_MOVW_PREL_G2_NC; + if (RefKind == AArch64MCExpr::VK_PREL_G1) + return R_CLS(MOVW_PREL_G1); + if (RefKind == AArch64MCExpr::VK_PREL_G1_NC) + return ELF::R_AARCH64_MOVW_PREL_G1_NC; + if (RefKind == AArch64MCExpr::VK_PREL_G0) + return R_CLS(MOVW_PREL_G0); + if (RefKind == AArch64MCExpr::VK_PREL_G0_NC) + return R_CLS(MOVW_PREL_G0_NC); if (RefKind == AArch64MCExpr::VK_DTPREL_G2) return ELF::R_AARCH64_TLSLD_MOVW_DTPREL_G2; if (RefKind == AArch64MCExpr::VK_DTPREL_G1) @@ -434,5 +448,5 @@ unsigned AArch64ELFObjectWriter::getRelocType(MCContext &Ctx, std::unique_ptr llvm::createAArch64ELFObjectWriter(uint8_t OSABI, bool IsILP32) { - return llvm::make_unique(OSABI, IsILP32); + return std::make_unique(OSABI, IsILP32); } diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp index d0a544273b8b..1a16468484ad 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp +++ b/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp @@ -172,7 +172,8 @@ void AArch64InstPrinter::printInst(const MCInst *MI, raw_ostream &O, int ImmS = MI->getOperand(4).getImm(); if ((Op2.getReg() == AArch64::WZR || Op2.getReg() == AArch64::XZR) && - (ImmR == 0 || ImmS < ImmR)) { + (ImmR == 0 || ImmS < ImmR) && + STI.getFeatureBits()[AArch64::HasV8_2aOps]) { // BFC takes precedence over its entire range, sligtly differently to BFI. int BitWidth = Opcode == AArch64::BFMXri ? 
64 : 32; int LSB = (BitWidth - ImmR) % BitWidth; diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp index ecff1ab0a8b3..5926a4f81616 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp +++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp @@ -30,7 +30,7 @@ static cl::opt AsmWriterVariant( cl::values(clEnumValN(Generic, "generic", "Emit generic NEON assembly"), clEnumValN(Apple, "apple", "Emit Apple-style NEON assembly"))); -AArch64MCAsmInfoDarwin::AArch64MCAsmInfoDarwin() { +AArch64MCAsmInfoDarwin::AArch64MCAsmInfoDarwin(bool IsILP32) { // We prefer NEON instructions to be printed in the short, Apple-specific // form when targeting Darwin. AssemblerDialect = AsmWriterVariant == Default ? Apple : AsmWriterVariant; @@ -39,7 +39,8 @@ AArch64MCAsmInfoDarwin::AArch64MCAsmInfoDarwin() { PrivateLabelPrefix = "L"; SeparatorString = "%%"; CommentString = ";"; - CodePointerSize = CalleeSaveStackSlotSize = 8; + CalleeSaveStackSlotSize = 8; + CodePointerSize = IsILP32 ? 4 : 8; AlignmentIsInBytes = false; UsesELFSectionDirectiveForBSS = true; diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h index 36ae92afc8c1..7274ae79f74a 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h +++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h @@ -23,7 +23,7 @@ class Target; class Triple; struct AArch64MCAsmInfoDarwin : public MCAsmInfoDarwin { - explicit AArch64MCAsmInfoDarwin(); + explicit AArch64MCAsmInfoDarwin(bool IsILP32); const MCExpr * getExprForPersonalitySymbol(const MCSymbol *Sym, unsigned Encoding, MCStreamer &Streamer) const override; diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp index 0a529321edc8..548e399e05a3 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp +++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp @@ -42,6 +42,13 @@ StringRef AArch64MCExpr::getVariantKindName() const { case VK_ABS_G0: return ":abs_g0:"; case VK_ABS_G0_S: return ":abs_g0_s:"; case VK_ABS_G0_NC: return ":abs_g0_nc:"; + case VK_PREL_G3: return ":prel_g3:"; + case VK_PREL_G2: return ":prel_g2:"; + case VK_PREL_G2_NC: return ":prel_g2_nc:"; + case VK_PREL_G1: return ":prel_g1:"; + case VK_PREL_G1_NC: return ":prel_g1_nc:"; + case VK_PREL_G0: return ":prel_g0:"; + case VK_PREL_G0_NC: return ":prel_g0_nc:"; case VK_DTPREL_G2: return ":dtprel_g2:"; case VK_DTPREL_G1: return ":dtprel_g1:"; case VK_DTPREL_G1_NC: return ":dtprel_g1_nc:"; diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h b/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h index ec9c95911628..a82ff2e91426 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h +++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h @@ -27,12 +27,13 @@ public: // symbol. E.g. direct, via the GOT, ... 
VK_ABS = 0x001, VK_SABS = 0x002, - VK_GOT = 0x003, - VK_DTPREL = 0x004, - VK_GOTTPREL = 0x005, - VK_TPREL = 0x006, - VK_TLSDESC = 0x007, - VK_SECREL = 0x008, + VK_PREL = 0x003, + VK_GOT = 0x004, + VK_DTPREL = 0x005, + VK_GOTTPREL = 0x006, + VK_TPREL = 0x007, + VK_TLSDESC = 0x008, + VK_SECREL = 0x009, VK_SymLocBits = 0x00f, // Variants specifying which part of the final address calculation is @@ -72,6 +73,13 @@ public: VK_ABS_G0_S = VK_SABS | VK_G0, VK_ABS_G0_NC = VK_ABS | VK_G0 | VK_NC, VK_LO12 = VK_ABS | VK_PAGEOFF | VK_NC, + VK_PREL_G3 = VK_PREL | VK_G3, + VK_PREL_G2 = VK_PREL | VK_G2, + VK_PREL_G2_NC = VK_PREL | VK_G2 | VK_NC, + VK_PREL_G1 = VK_PREL | VK_G1, + VK_PREL_G1_NC = VK_PREL | VK_G1 | VK_NC, + VK_PREL_G0 = VK_PREL | VK_G0, + VK_PREL_G0_NC = VK_PREL | VK_G0 | VK_NC, VK_GOT_LO12 = VK_GOT | VK_PAGEOFF | VK_NC, VK_GOT_PAGE = VK_GOT | VK_PAGE, VK_DTPREL_G2 = VK_DTPREL | VK_G2, diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp index df12274d9470..1d583ec0087b 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp +++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp @@ -241,7 +241,7 @@ static MCAsmInfo *createAArch64MCAsmInfo(const MCRegisterInfo &MRI, const Triple &TheTriple) { MCAsmInfo *MAI; if (TheTriple.isOSBinFormatMachO()) - MAI = new AArch64MCAsmInfoDarwin(); + MAI = new AArch64MCAsmInfoDarwin(TheTriple.getArch() == Triple::aarch64_32); else if (TheTriple.isWindowsMSVCEnvironment()) MAI = new AArch64MCAsmInfoMicrosoftCOFF(); else if (TheTriple.isOSBinFormatCOFF()) diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp index b3ce5ef22eef..fc04d37eb362 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp +++ b/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp @@ -54,7 +54,7 @@ bool AArch64MachObjectWriter::getAArch64FixupKindMachOInfo( RelocType = unsigned(MachO::ARM64_RELOC_UNSIGNED); Log2Size = ~0U; - switch ((unsigned)Fixup.getKind()) { + switch (Fixup.getTargetKind()) { default: return false; @@ -406,6 +406,6 @@ void AArch64MachObjectWriter::recordRelocation( std::unique_ptr llvm::createAArch64MachObjectWriter(uint32_t CPUType, uint32_t CPUSubtype, bool IsILP32) { - return llvm::make_unique(CPUType, CPUSubtype, + return std::make_unique(CPUType, CPUSubtype, IsILP32); } diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp index a45880a07427..aa50bd05cb71 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp +++ b/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp @@ -120,7 +120,7 @@ bool AArch64WinCOFFObjectWriter::recordRelocation(const MCFixup &Fixup) const { namespace llvm { std::unique_ptr createAArch64WinCOFFObjectWriter() { - return llvm::make_unique(); + return std::make_unique(); } } // end namespace llvm diff --git a/lib/Target/AArch64/SVEInstrFormats.td b/lib/Target/AArch64/SVEInstrFormats.td index 808e59467081..8ccf6aa675ba 100644 --- a/lib/Target/AArch64/SVEInstrFormats.td +++ b/lib/Target/AArch64/SVEInstrFormats.td @@ -279,6 +279,19 @@ let Predicates = [HasSVE] in { defm PTRUES : sve_int_ptrue<0b001, "ptrues">; } +//===----------------------------------------------------------------------===// +// SVE pattern match helpers. 
+//===----------------------------------------------------------------------===// + +class SVE_1_Op_Pat +: Pat<(vtd (op vt1:$Op1)), + (inst $Op1)>; + +class SVE_3_Op_Pat +: Pat<(vtd (op vt1:$Op1, vt2:$Op2, vt3:$Op3)), + (inst $Op1, $Op2, $Op3)>; //===----------------------------------------------------------------------===// // SVE Predicate Misc Group @@ -403,12 +416,12 @@ multiclass sve_int_count_r_x64 opc, string asm> { } class sve_int_count_v sz8_64, bits<5> opc, string asm, - ZPRRegOp zprty> -: I<(outs zprty:$Zdn), (ins zprty:$_Zdn, PPRAny:$Pg), - asm, "\t$Zdn, $Pg", + ZPRRegOp zprty, PPRRegOp pprty> +: I<(outs zprty:$Zdn), (ins zprty:$_Zdn, pprty:$Pm), + asm, "\t$Zdn, $Pm", "", []>, Sched<[]> { - bits<4> Pg; + bits<4> Pm; bits<5> Zdn; let Inst{31-24} = 0b00100101; let Inst{23-22} = sz8_64; @@ -416,7 +429,7 @@ class sve_int_count_v sz8_64, bits<5> opc, string asm, let Inst{18-16} = opc{4-2}; let Inst{15-11} = 0b10000; let Inst{10-9} = opc{1-0}; - let Inst{8-5} = Pg; + let Inst{8-5} = Pm; let Inst{4-0} = Zdn; let Constraints = "$Zdn = $_Zdn"; @@ -425,9 +438,16 @@ class sve_int_count_v sz8_64, bits<5> opc, string asm, } multiclass sve_int_count_v opc, string asm> { - def _H : sve_int_count_v<0b01, opc, asm, ZPR16>; - def _S : sve_int_count_v<0b10, opc, asm, ZPR32>; - def _D : sve_int_count_v<0b11, opc, asm, ZPR64>; + def _H : sve_int_count_v<0b01, opc, asm, ZPR16, PPR16>; + def _S : sve_int_count_v<0b10, opc, asm, ZPR32, PPR32>; + def _D : sve_int_count_v<0b11, opc, asm, ZPR64, PPR64>; + + def : InstAlias(NAME # "_H") ZPR16:$Zdn, PPRAny:$Pm), 0>; + def : InstAlias(NAME # "_S") ZPR32:$Zdn, PPRAny:$Pm), 0>; + def : InstAlias(NAME # "_D") ZPR64:$Zdn, PPRAny:$Pm), 0>; } class sve_int_pcount_pred sz8_64, bits<4> opc, string asm, @@ -609,11 +629,12 @@ multiclass sve_int_pred_pattern_b_x64 opc, string asm> { //===----------------------------------------------------------------------===// class sve_int_perm_dup_r sz8_64, string asm, ZPRRegOp zprty, - RegisterClass srcRegType> + ValueType vt, RegisterClass srcRegType, + SDPatternOperator op> : I<(outs zprty:$Zd), (ins srcRegType:$Rn), asm, "\t$Zd, $Rn", "", - []>, Sched<[]> { + [(set (vt zprty:$Zd), (op srcRegType:$Rn))]>, Sched<[]> { bits<5> Rn; bits<5> Zd; let Inst{31-24} = 0b00000101; @@ -623,11 +644,11 @@ class sve_int_perm_dup_r sz8_64, string asm, ZPRRegOp zprty, let Inst{4-0} = Zd; } -multiclass sve_int_perm_dup_r { - def _B : sve_int_perm_dup_r<0b00, asm, ZPR8, GPR32sp>; - def _H : sve_int_perm_dup_r<0b01, asm, ZPR16, GPR32sp>; - def _S : sve_int_perm_dup_r<0b10, asm, ZPR32, GPR32sp>; - def _D : sve_int_perm_dup_r<0b11, asm, ZPR64, GPR64sp>; +multiclass sve_int_perm_dup_r { + def _B : sve_int_perm_dup_r<0b00, asm, ZPR8, nxv16i8, GPR32sp, op>; + def _H : sve_int_perm_dup_r<0b01, asm, ZPR16, nxv8i16, GPR32sp, op>; + def _S : sve_int_perm_dup_r<0b10, asm, ZPR32, nxv4i32, GPR32sp, op>; + def _D : sve_int_perm_dup_r<0b11, asm, ZPR64, nxv2i64, GPR64sp, op>; def : InstAlias<"mov $Zd, $Rn", (!cast(NAME # _B) ZPR8:$Zd, GPR32sp:$Rn), 1>; @@ -744,7 +765,7 @@ multiclass sve2_int_perm_tbl { } class sve2_int_perm_tbx sz8_64, string asm, ZPRRegOp zprty> -: I<(outs zprty:$Zd), (ins zprty:$Zn, zprty:$Zm), +: I<(outs zprty:$Zd), (ins zprty:$_Zd, zprty:$Zn, zprty:$Zm), asm, "\t$Zd, $Zn, $Zm", "", []>, Sched<[]> { @@ -758,6 +779,8 @@ class sve2_int_perm_tbx sz8_64, string asm, ZPRRegOp zprty> let Inst{15-10} = 0b001011; let Inst{9-5} = Zn; let Inst{4-0} = Zd; + + let Constraints = "$Zd = $_Zd"; } multiclass sve2_int_perm_tbx { @@ -826,10 +849,14 @@ class 
sve_int_perm_unpk sz16_64, bits<2> opc, string asm, let Inst{4-0} = Zd; } -multiclass sve_int_perm_unpk opc, string asm> { +multiclass sve_int_perm_unpk opc, string asm, SDPatternOperator op> { def _H : sve_int_perm_unpk<0b01, opc, asm, ZPR16, ZPR8>; def _S : sve_int_perm_unpk<0b10, opc, asm, ZPR32, ZPR16>; def _D : sve_int_perm_unpk<0b11, opc, asm, ZPR64, ZPR32>; + + def : SVE_1_Op_Pat(NAME # _H)>; + def : SVE_1_Op_Pat(NAME # _S)>; + def : SVE_1_Op_Pat(NAME # _D)>; } class sve_int_perm_insrs sz8_64, string asm, ZPRRegOp zprty, @@ -1197,10 +1224,12 @@ multiclass sve_fp_ftmad { //===----------------------------------------------------------------------===// class sve_fp_3op_u_zd sz, bits<3> opc, string asm, - ZPRRegOp zprty> + ZPRRegOp zprty, + ValueType vt, ValueType vt2, SDPatternOperator op> : I<(outs zprty:$Zd), (ins zprty:$Zn, zprty:$Zm), asm, "\t$Zd, $Zn, $Zm", - "", []>, Sched<[]> { + "", + [(set (vt zprty:$Zd), (op (vt zprty:$Zn), (vt2 zprty:$Zm)))]>, Sched<[]> { bits<5> Zd; bits<5> Zm; bits<5> Zn; @@ -1214,10 +1243,10 @@ class sve_fp_3op_u_zd sz, bits<3> opc, string asm, let Inst{4-0} = Zd; } -multiclass sve_fp_3op_u_zd opc, string asm> { - def _H : sve_fp_3op_u_zd<0b01, opc, asm, ZPR16>; - def _S : sve_fp_3op_u_zd<0b10, opc, asm, ZPR32>; - def _D : sve_fp_3op_u_zd<0b11, opc, asm, ZPR64>; +multiclass sve_fp_3op_u_zd opc, string asm, SDPatternOperator op> { + def _H : sve_fp_3op_u_zd<0b01, opc, asm, ZPR16, nxv8f16, nxv8f16, op>; + def _S : sve_fp_3op_u_zd<0b10, opc, asm, ZPR32, nxv4f32, nxv4f32, op>; + def _D : sve_fp_3op_u_zd<0b11, opc, asm, ZPR64, nxv2f64, nxv2f64, op>; } //===----------------------------------------------------------------------===// @@ -1489,7 +1518,7 @@ multiclass sve_fp_fcadd { class sve2_fp_convert_precision opc, string asm, ZPRRegOp zprty1, ZPRRegOp zprty2> -: I<(outs zprty1:$Zd), (ins PPR3bAny:$Pg, zprty2:$Zn), +: I<(outs zprty1:$Zd), (ins zprty1:$_Zd, PPR3bAny:$Pg, zprty2:$Zn), asm, "\t$Zd, $Pg/m, $Zn", "", []>, Sched<[]> { @@ -1504,6 +1533,8 @@ class sve2_fp_convert_precision opc, string asm, let Inst{12-10} = Pg; let Inst{9-5} = Zn; let Inst{4-0} = Zd; + + let Constraints = "$Zd = $_Zd"; } multiclass sve2_fp_convert_down_narrow { @@ -1998,12 +2029,14 @@ class sve_intx_dot { +multiclass sve_intx_dot { def _S : sve_intx_dot<0b0, opc, asm, ZPR32, ZPR8>; def _D : sve_intx_dot<0b1, opc, asm, ZPR64, ZPR16>; + + def : SVE_3_Op_Pat(NAME # _S)>; + def : SVE_3_Op_Pat(NAME # _D)>; } //===----------------------------------------------------------------------===// @@ -2028,22 +2061,27 @@ class sve_intx_dot_by_indexed_elem { - def _S : sve_intx_dot_by_indexed_elem<0b0, opc, asm, ZPR32, ZPR8, ZPR3b8, VectorIndexS> { +multiclass sve_intx_dot_by_indexed_elem { + def _S : sve_intx_dot_by_indexed_elem<0b0, opc, asm, ZPR32, ZPR8, ZPR3b8, VectorIndexS32b> { bits<2> iop; bits<3> Zm; let Inst{20-19} = iop; let Inst{18-16} = Zm; } - def _D : sve_intx_dot_by_indexed_elem<0b1, opc, asm, ZPR64, ZPR16, ZPR4b16, VectorIndexD> { + def _D : sve_intx_dot_by_indexed_elem<0b1, opc, asm, ZPR64, ZPR16, ZPR4b16, VectorIndexD32b> { bits<1> iop; bits<4> Zm; let Inst{20} = iop; let Inst{19-16} = Zm; } + + def : Pat<(nxv4i32 (op nxv4i32:$Op1, nxv16i8:$Op2, nxv16i8:$Op3, (i32 VectorIndexS32b:$idx))), + (!cast(NAME # _S) $Op1, $Op2, $Op3, VectorIndexS32b:$idx)>; + def : Pat<(nxv2i64 (op nxv2i64:$Op1, nxv8i16:$Op2, nxv8i16:$Op3, (i32 VectorIndexD32b:$idx))), + (!cast(NAME # _D) $Op1, $Op2, $Op3, VectorIndexD32b:$idx)>; } 
//===----------------------------------------------------------------------===// @@ -2399,21 +2437,40 @@ multiclass sve2_misc_bitwise opc, string asm> { def _D : sve2_misc<0b11, opc, asm, ZPR64, ZPR64>; } -multiclass sve2_bitwise_xor_interleaved { - let DestructiveInstType = Destructive, ElementSize = ElementSizeNone in { - def _B : sve2_misc<0b00, { 0b010, opc }, asm, ZPR8, ZPR8>; - def _H : sve2_misc<0b01, { 0b010, opc }, asm, ZPR16, ZPR16>; - def _S : sve2_misc<0b10, { 0b010, opc }, asm, ZPR32, ZPR32>; - def _D : sve2_misc<0b11, { 0b010, opc }, asm, ZPR64, ZPR64>; - } -} - multiclass sve2_misc_int_addsub_long_interleaved opc, string asm> { def _H : sve2_misc<0b01, { 0b00, opc }, asm, ZPR16, ZPR8>; def _S : sve2_misc<0b10, { 0b00, opc }, asm, ZPR32, ZPR16>; def _D : sve2_misc<0b11, { 0b00, opc }, asm, ZPR64, ZPR32>; } +class sve2_bitwise_xor_interleaved sz, bits<1> opc, string asm, + ZPRRegOp zprty1, ZPRRegOp zprty2> +: I<(outs zprty1:$Zd), (ins zprty1:$_Zd, zprty2:$Zn, zprty2:$Zm), + asm, "\t$Zd, $Zn, $Zm", "", []>, Sched<[]> { + bits<5> Zd; + bits<5> Zn; + bits<5> Zm; + let Inst{31-24} = 0b01000101; + let Inst{23-22} = sz; + let Inst{21} = 0b0; + let Inst{20-16} = Zm; + let Inst{15-11} = 0b10010; + let Inst{10} = opc; + let Inst{9-5} = Zn; + let Inst{4-0} = Zd; + + let Constraints = "$Zd = $_Zd"; + let DestructiveInstType = Destructive; + let ElementSize = ElementSizeNone; +} + +multiclass sve2_bitwise_xor_interleaved { + def _B : sve2_bitwise_xor_interleaved<0b00, opc, asm, ZPR8, ZPR8>; + def _H : sve2_bitwise_xor_interleaved<0b01, opc, asm, ZPR16, ZPR16>; + def _S : sve2_bitwise_xor_interleaved<0b10, opc, asm, ZPR32, ZPR32>; + def _D : sve2_bitwise_xor_interleaved<0b11, opc, asm, ZPR64, ZPR64>; +} + class sve2_bitwise_shift_left_long tsz8_64, bits<2> opc, string asm, ZPRRegOp zprty1, ZPRRegOp zprty2, Operand immtype> @@ -2451,9 +2508,9 @@ multiclass sve2_bitwise_shift_left_long opc, string asm> { // SVE2 Accumulate Group //===----------------------------------------------------------------------===// -class sve2_int_bin_cons_shift_imm tsz8_64, bit opc, string asm, - ZPRRegOp zprty, Operand immtype> -: I<(outs zprty:$Zd), (ins zprty:$Zn, immtype:$imm), +class sve2_int_bin_shift_imm tsz8_64, bit opc, string asm, + ZPRRegOp zprty, Operand immtype> +: I<(outs zprty:$Zd), (ins zprty:$_Zd, zprty:$Zn, immtype:$imm), asm, "\t$Zd, $Zn, $imm", "", []>, Sched<[]> { bits<5> Zd; @@ -2468,38 +2525,40 @@ class sve2_int_bin_cons_shift_imm tsz8_64, bit opc, string asm, let Inst{10} = opc; let Inst{9-5} = Zn; let Inst{4-0} = Zd; + + let Constraints = "$Zd = $_Zd"; } -multiclass sve2_int_bin_cons_shift_imm_left { - def _B : sve2_int_bin_cons_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftL8>; - def _H : sve2_int_bin_cons_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftL16> { +multiclass sve2_int_bin_shift_imm_left { + def _B : sve2_int_bin_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftL8>; + def _H : sve2_int_bin_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftL16> { let Inst{19} = imm{3}; } - def _S : sve2_int_bin_cons_shift_imm<{0,1,?,?}, opc, asm, ZPR32, vecshiftL32> { + def _S : sve2_int_bin_shift_imm<{0,1,?,?}, opc, asm, ZPR32, vecshiftL32> { let Inst{20-19} = imm{4-3}; } - def _D : sve2_int_bin_cons_shift_imm<{1,?,?,?}, opc, asm, ZPR64, vecshiftL64> { + def _D : sve2_int_bin_shift_imm<{1,?,?,?}, opc, asm, ZPR64, vecshiftL64> { let Inst{22} = imm{5}; let Inst{20-19} = imm{4-3}; } } -multiclass sve2_int_bin_cons_shift_imm_right { - def _B : sve2_int_bin_cons_shift_imm<{0,0,0,1}, opc, asm, ZPR8, 
vecshiftR8>; - def _H : sve2_int_bin_cons_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftR16> { +multiclass sve2_int_bin_shift_imm_right { + def _B : sve2_int_bin_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftR8>; + def _H : sve2_int_bin_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftR16> { let Inst{19} = imm{3}; } - def _S : sve2_int_bin_cons_shift_imm<{0,1,?,?}, opc, asm, ZPR32, vecshiftR32> { + def _S : sve2_int_bin_shift_imm<{0,1,?,?}, opc, asm, ZPR32, vecshiftR32> { let Inst{20-19} = imm{4-3}; } - def _D : sve2_int_bin_cons_shift_imm<{1,?,?,?}, opc, asm, ZPR64, vecshiftR64> { + def _D : sve2_int_bin_shift_imm<{1,?,?,?}, opc, asm, ZPR64, vecshiftR64> { let Inst{22} = imm{5}; let Inst{20-19} = imm{4-3}; } } -class sve2_int_bin_accum_cons_shift_imm tsz8_64, bits<2> opc, string asm, - ZPRRegOp zprty, Operand immtype> +class sve2_int_bin_accum_shift_imm tsz8_64, bits<2> opc, string asm, + ZPRRegOp zprty, Operand immtype> : I<(outs zprty:$Zda), (ins zprty:$_Zda, zprty:$Zn, immtype:$imm), asm, "\t$Zda, $Zn, $imm", "", []>, Sched<[]> { @@ -2521,15 +2580,15 @@ class sve2_int_bin_accum_cons_shift_imm tsz8_64, bits<2> opc, string asm let ElementSize = ElementSizeNone; } -multiclass sve2_int_bin_accum_cons_shift_imm_right opc, string asm> { - def _B : sve2_int_bin_accum_cons_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftR8>; - def _H : sve2_int_bin_accum_cons_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftR16> { +multiclass sve2_int_bin_accum_shift_imm_right opc, string asm> { + def _B : sve2_int_bin_accum_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftR8>; + def _H : sve2_int_bin_accum_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftR16> { let Inst{19} = imm{3}; } - def _S : sve2_int_bin_accum_cons_shift_imm<{0,1,?,?}, opc, asm, ZPR32, vecshiftR32> { + def _S : sve2_int_bin_accum_shift_imm<{0,1,?,?}, opc, asm, ZPR32, vecshiftR32> { let Inst{20-19} = imm{4-3}; } - def _D : sve2_int_bin_accum_cons_shift_imm<{1,?,?,?}, opc, asm, ZPR64, vecshiftR64> { + def _D : sve2_int_bin_accum_shift_imm<{1,?,?,?}, opc, asm, ZPR64, vecshiftR64> { let Inst{22} = imm{5}; let Inst{20-19} = imm{4-3}; } @@ -2607,9 +2666,9 @@ multiclass sve2_int_addsub_long_carry opc, string asm> { // SVE2 Narrowing Group //===----------------------------------------------------------------------===// -class sve2_int_bin_cons_shift_imm_narrow tsz8_64, bits<4> opc, - string asm, ZPRRegOp zprty1, - ZPRRegOp zprty2, Operand immtype> +class sve2_int_bin_shift_imm_narrow_bottom tsz8_64, bits<3> opc, + string asm, ZPRRegOp zprty1, + ZPRRegOp zprty2, Operand immtype> : I<(outs zprty1:$Zd), (ins zprty2:$Zn, immtype:$imm), asm, "\t$Zd, $Zn, $imm", "", []>, Sched<[]> { @@ -2622,26 +2681,63 @@ class sve2_int_bin_cons_shift_imm_narrow tsz8_64, bits<4> opc, let Inst{20-19} = tsz8_64{1-0}; let Inst{18-16} = imm{2-0}; // imm3 let Inst{15-14} = 0b00; - let Inst{13-10} = opc; + let Inst{13-11} = opc; + let Inst{10} = 0b0; let Inst{9-5} = Zn; let Inst{4-0} = Zd; } -multiclass sve2_int_bin_cons_shift_imm_right_narrow opc, string asm> { - def _B : sve2_int_bin_cons_shift_imm_narrow<{0,0,1}, opc, asm, ZPR8, ZPR16, - vecshiftR8>; - def _H : sve2_int_bin_cons_shift_imm_narrow<{0,1,?}, opc, asm, ZPR16, ZPR32, - vecshiftR16> { +multiclass sve2_int_bin_shift_imm_right_narrow_bottom opc, string asm> { + def _B : sve2_int_bin_shift_imm_narrow_bottom<{0,0,1}, opc, asm, ZPR8, ZPR16, + vecshiftR8>; + def _H : sve2_int_bin_shift_imm_narrow_bottom<{0,1,?}, opc, asm, ZPR16, ZPR32, + vecshiftR16> { let Inst{19} = imm{3}; } - def _S : sve2_int_bin_cons_shift_imm_narrow<{1,?,?}, 
opc, asm, ZPR32, ZPR64, - vecshiftR32> { + def _S : sve2_int_bin_shift_imm_narrow_bottom<{1,?,?}, opc, asm, ZPR32, ZPR64, + vecshiftR32> { let Inst{20-19} = imm{4-3}; } } -class sve2_int_addsub_narrow_high sz, bits<3> opc, string asm, - ZPRRegOp zprty1, ZPRRegOp zprty2> +class sve2_int_bin_shift_imm_narrow_top tsz8_64, bits<3> opc, + string asm, ZPRRegOp zprty1, + ZPRRegOp zprty2, Operand immtype> +: I<(outs zprty1:$Zd), (ins zprty1:$_Zd, zprty2:$Zn, immtype:$imm), + asm, "\t$Zd, $Zn, $imm", + "", []>, Sched<[]> { + bits<5> Zd; + bits<5> Zn; + bits<5> imm; + let Inst{31-23} = 0b010001010; + let Inst{22} = tsz8_64{2}; + let Inst{21} = 0b1; + let Inst{20-19} = tsz8_64{1-0}; + let Inst{18-16} = imm{2-0}; // imm3 + let Inst{15-14} = 0b00; + let Inst{13-11} = opc; + let Inst{10} = 0b1; + let Inst{9-5} = Zn; + let Inst{4-0} = Zd; + + let Constraints = "$Zd = $_Zd"; +} + +multiclass sve2_int_bin_shift_imm_right_narrow_top opc, string asm> { + def _B : sve2_int_bin_shift_imm_narrow_top<{0,0,1}, opc, asm, ZPR8, ZPR16, + vecshiftR8>; + def _H : sve2_int_bin_shift_imm_narrow_top<{0,1,?}, opc, asm, ZPR16, ZPR32, + vecshiftR16> { + let Inst{19} = imm{3}; + } + def _S : sve2_int_bin_shift_imm_narrow_top<{1,?,?}, opc, asm, ZPR32, ZPR64, + vecshiftR32> { + let Inst{20-19} = imm{4-3}; + } +} + +class sve2_int_addsub_narrow_high_bottom sz, bits<2> opc, string asm, + ZPRRegOp zprty1, ZPRRegOp zprty2> : I<(outs zprty1:$Zd), (ins zprty2:$Zn, zprty2:$Zm), asm, "\t$Zd, $Zn, $Zm", "", []>, Sched<[]> { bits<5> Zd; @@ -2652,19 +2748,46 @@ class sve2_int_addsub_narrow_high sz, bits<3> opc, string asm, let Inst{21} = 0b1; let Inst{20-16} = Zm; let Inst{15-13} = 0b011; - let Inst{12-10} = opc; // S, R, T + let Inst{12-11} = opc; // S, R + let Inst{10} = 0b0; // Top let Inst{9-5} = Zn; let Inst{4-0} = Zd; } -multiclass sve2_int_addsub_narrow_high opc, string asm> { - def _B : sve2_int_addsub_narrow_high<0b01, opc, asm, ZPR8, ZPR16>; - def _H : sve2_int_addsub_narrow_high<0b10, opc, asm, ZPR16, ZPR32>; - def _S : sve2_int_addsub_narrow_high<0b11, opc, asm, ZPR32, ZPR64>; +multiclass sve2_int_addsub_narrow_high_bottom opc, string asm> { + def _B : sve2_int_addsub_narrow_high_bottom<0b01, opc, asm, ZPR8, ZPR16>; + def _H : sve2_int_addsub_narrow_high_bottom<0b10, opc, asm, ZPR16, ZPR32>; + def _S : sve2_int_addsub_narrow_high_bottom<0b11, opc, asm, ZPR32, ZPR64>; +} + +class sve2_int_addsub_narrow_high_top sz, bits<2> opc, string asm, + ZPRRegOp zprty1, ZPRRegOp zprty2> +: I<(outs zprty1:$Zd), (ins zprty1:$_Zd, zprty2:$Zn, zprty2:$Zm), + asm, "\t$Zd, $Zn, $Zm", "", []>, Sched<[]> { + bits<5> Zd; + bits<5> Zn; + bits<5> Zm; + let Inst{31-24} = 0b01000101; + let Inst{23-22} = sz; + let Inst{21} = 0b1; + let Inst{20-16} = Zm; + let Inst{15-13} = 0b011; + let Inst{12-11} = opc; // S, R + let Inst{10} = 0b1; // Top + let Inst{9-5} = Zn; + let Inst{4-0} = Zd; + + let Constraints = "$Zd = $_Zd"; +} + +multiclass sve2_int_addsub_narrow_high_top opc, string asm> { + def _B : sve2_int_addsub_narrow_high_top<0b01, opc, asm, ZPR8, ZPR16>; + def _H : sve2_int_addsub_narrow_high_top<0b10, opc, asm, ZPR16, ZPR32>; + def _S : sve2_int_addsub_narrow_high_top<0b11, opc, asm, ZPR32, ZPR64>; } -class sve2_int_sat_extract_narrow tsz8_64, bits<3> opc, string asm, - ZPRRegOp zprty1, ZPRRegOp zprty2> +class sve2_int_sat_extract_narrow_bottom tsz8_64, bits<2> opc, string asm, + ZPRRegOp zprty1, ZPRRegOp zprty2> : I<(outs zprty1:$Zd), (ins zprty2:$Zn), asm, "\t$Zd, $Zn", "", []>, Sched<[]> { bits<5> Zd; @@ -2674,15 +2797,41 @@ class 
sve2_int_sat_extract_narrow tsz8_64, bits<3> opc, string asm, let Inst{21} = 0b1; let Inst{20-19} = tsz8_64{1-0}; let Inst{18-13} = 0b000010; - let Inst{12-10} = opc; + let Inst{12-11} = opc; + let Inst{10} = 0b0; let Inst{9-5} = Zn; let Inst{4-0} = Zd; } -multiclass sve2_int_sat_extract_narrow opc, string asm> { - def _B : sve2_int_sat_extract_narrow<0b001, opc, asm, ZPR8, ZPR16>; - def _H : sve2_int_sat_extract_narrow<0b010, opc, asm, ZPR16, ZPR32>; - def _S : sve2_int_sat_extract_narrow<0b100, opc, asm, ZPR32, ZPR64>; +multiclass sve2_int_sat_extract_narrow_bottom opc, string asm> { + def _B : sve2_int_sat_extract_narrow_bottom<0b001, opc, asm, ZPR8, ZPR16>; + def _H : sve2_int_sat_extract_narrow_bottom<0b010, opc, asm, ZPR16, ZPR32>; + def _S : sve2_int_sat_extract_narrow_bottom<0b100, opc, asm, ZPR32, ZPR64>; +} + +class sve2_int_sat_extract_narrow_top tsz8_64, bits<2> opc, string asm, + ZPRRegOp zprty1, ZPRRegOp zprty2> +: I<(outs zprty1:$Zd), (ins zprty1:$_Zd, zprty2:$Zn), + asm, "\t$Zd, $Zn", "", []>, Sched<[]> { + bits<5> Zd; + bits<5> Zn; + let Inst{31-23} = 0b010001010; + let Inst{22} = tsz8_64{2}; + let Inst{21} = 0b1; + let Inst{20-19} = tsz8_64{1-0}; + let Inst{18-13} = 0b000010; + let Inst{12-11} = opc; + let Inst{10} = 0b1; + let Inst{9-5} = Zn; + let Inst{4-0} = Zd; + + let Constraints = "$Zd = $_Zd"; +} + +multiclass sve2_int_sat_extract_narrow_top opc, string asm> { + def _B : sve2_int_sat_extract_narrow_top<0b001, opc, asm, ZPR8, ZPR16>; + def _H : sve2_int_sat_extract_narrow_top<0b010, opc, asm, ZPR16, ZPR32>; + def _S : sve2_int_sat_extract_narrow_top<0b100, opc, asm, ZPR32, ZPR64>; } //===----------------------------------------------------------------------===// @@ -2713,11 +2862,17 @@ class sve_int_un_pred_arit sz8_64, bits<4> opc, let ElementSize = zprty.ElementSize; } -multiclass sve_int_un_pred_arit_0 opc, string asm> { +multiclass sve_int_un_pred_arit_0 opc, string asm, + SDPatternOperator op> { def _B : sve_int_un_pred_arit<0b00, { opc, 0b0 }, asm, ZPR8>; def _H : sve_int_un_pred_arit<0b01, { opc, 0b0 }, asm, ZPR16>; def _S : sve_int_un_pred_arit<0b10, { opc, 0b0 }, asm, ZPR32>; def _D : sve_int_un_pred_arit<0b11, { opc, 0b0 }, asm, ZPR64>; + + def : SVE_3_Op_Pat(NAME # _B)>; + def : SVE_3_Op_Pat(NAME # _H)>; + def : SVE_3_Op_Pat(NAME # _S)>; + def : SVE_3_Op_Pat(NAME # _D)>; } multiclass sve_int_un_pred_arit_0_h opc, string asm> { @@ -2735,11 +2890,21 @@ multiclass sve_int_un_pred_arit_0_d opc, string asm> { def _D : sve_int_un_pred_arit<0b11, { opc, 0b0 }, asm, ZPR64>; } -multiclass sve_int_un_pred_arit_1 opc, string asm> { +multiclass sve_int_un_pred_arit_1 opc, string asm, + SDPatternOperator op> { def _B : sve_int_un_pred_arit<0b00, { opc, 0b1 }, asm, ZPR8>; def _H : sve_int_un_pred_arit<0b01, { opc, 0b1 }, asm, ZPR16>; def _S : sve_int_un_pred_arit<0b10, { opc, 0b1 }, asm, ZPR32>; def _D : sve_int_un_pred_arit<0b11, { opc, 0b1 }, asm, ZPR64>; + + def : SVE_3_Op_Pat(NAME # _B)>; + def : SVE_3_Op_Pat(NAME # _H)>; + def : SVE_3_Op_Pat(NAME # _S)>; + def : SVE_3_Op_Pat(NAME # _D)>; + + def : SVE_3_Op_Pat(NAME # _H)>; + def : SVE_3_Op_Pat(NAME # _S)>; + def : SVE_3_Op_Pat(NAME # _D)>; } multiclass sve_int_un_pred_arit_1_fp opc, string asm> { @@ -3886,9 +4051,9 @@ multiclass sve_mem_cstnt_ss msz, string asm, RegisterOperand listty, (!cast(NAME) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, gprty:$Rm), 0>; } -class sve2_mem_cstnt_vs_base opc, dag iops, string asm, - RegisterOperand VecList> -: I<(outs VecList:$Zt), iops, +class sve2_mem_sstnt_vs_base opc, string asm, + 
RegisterOperand listty, ZPRRegOp zprty> +: I<(outs), (ins listty:$Zt, PPR3bAny:$Pg, zprty:$Zn, GPR64:$Rm), asm, "\t$Zt, $Pg, [$Zn, $Rm]", "", []>, Sched<[]> { @@ -3908,17 +4073,14 @@ class sve2_mem_cstnt_vs_base opc, dag iops, string asm, let mayStore = 1; } -multiclass sve2_mem_cstnt_vs opc, string asm, +multiclass sve2_mem_sstnt_vs opc, string asm, RegisterOperand listty, ZPRRegOp zprty> { - def _REAL : sve2_mem_cstnt_vs_base; + def _REAL : sve2_mem_sstnt_vs_base; def : InstAlias(NAME # _REAL) zprty:$Zt, PPR3bAny:$Pg, zprty:$Zn, GPR64:$Rm), 0>; def : InstAlias(NAME # _REAL) zprty:$Zt, PPR3bAny:$Pg, zprty:$Zn, XZR), 0>; - def : InstAlias(NAME # _REAL) listty:$Zt, PPR3bAny:$Pg, zprty:$Zn, GPR64:$Rm), 0>; def : InstAlias(NAME # _REAL) listty:$Zt, PPR3bAny:$Pg, zprty:$Zn, XZR), 1>; } @@ -4147,6 +4309,14 @@ class sve_int_perm_punpk let Inst{3-0} = Pd; } +multiclass sve_int_perm_punpk { + def NAME : sve_int_perm_punpk; + + def : SVE_1_Op_Pat(NAME)>; + def : SVE_1_Op_Pat(NAME)>; + def : SVE_1_Op_Pat(NAME)>; +} + class sve_int_rdffr_pred : I<(outs PPR8:$Pd), (ins PPRAny:$Pg), asm, "\t$Pd, $Pg/z", @@ -5094,7 +5264,7 @@ multiclass sve_mem_p_fill { (!cast(NAME) PPRAny:$Pt, GPR64sp:$Rn, 0), 1>; } -class sve2_mem_cldnt_vs_base opc, dag iops, string asm, +class sve2_mem_gldnt_vs_base opc, dag iops, string asm, RegisterOperand VecList> : I<(outs VecList:$Zt), iops, asm, "\t$Zt, $Pg/z, [$Zn, $Rm]", @@ -5119,17 +5289,15 @@ class sve2_mem_cldnt_vs_base opc, dag iops, string asm, let mayLoad = 1; } -multiclass sve2_mem_cldnt_vs opc, string asm, +multiclass sve2_mem_gldnt_vs opc, string asm, RegisterOperand listty, ZPRRegOp zprty> { - def _REAL : sve2_mem_cldnt_vs_base; def : InstAlias(NAME # _REAL) zprty:$Zt, PPR3bAny:$Pg, zprty:$Zn, GPR64:$Rm), 0>; def : InstAlias(NAME # _REAL) zprty:$Zt, PPR3bAny:$Pg, zprty:$Zn, XZR), 0>; - def : InstAlias(NAME # _REAL) listty:$Zt, PPR3bAny:$Pg, zprty:$Zn, GPR64:$Rm), 0>; def : InstAlias(NAME # _REAL) listty:$Zt, PPR3bAny:$Pg, zprty:$Zn, XZR), 1>; } diff --git a/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp b/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp index 7bb075c36e79..c27fc7a112ec 100644 --- a/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp +++ b/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp @@ -125,7 +125,7 @@ namespace llvm { uint32_t AArch64SysReg::parseGenericRegister(StringRef Name) { // Try to parse an S____ register name - Regex GenericRegPattern("^S([0-3])_([0-7])_C([0-9]|1[0-5])_C([0-9]|1[0-5])_([0-7])$"); + static const Regex GenericRegPattern("^S([0-3])_([0-7])_C([0-9]|1[0-5])_C([0-9]|1[0-5])_([0-7])$"); std::string UpperName = Name.upper(); SmallVector Ops; diff --git a/lib/Target/AArch64/Utils/AArch64BaseInfo.h b/lib/Target/AArch64/Utils/AArch64BaseInfo.h index e5e2fc2cb0df..7a4fcac09ec4 100644 --- a/lib/Target/AArch64/Utils/AArch64BaseInfo.h +++ b/lib/Target/AArch64/Utils/AArch64BaseInfo.h @@ -313,9 +313,9 @@ struct SysAlias { uint16_t Encoding; FeatureBitset FeaturesRequired; - SysAlias (const char *N, uint16_t E) : Name(N), Encoding(E) {}; - SysAlias (const char *N, uint16_t E, FeatureBitset F) : - Name(N), Encoding(E), FeaturesRequired(F) {}; + constexpr SysAlias(const char *N, uint16_t E) : Name(N), Encoding(E) {} + constexpr SysAlias(const char *N, uint16_t E, FeatureBitset F) + : Name(N), Encoding(E), FeaturesRequired(F) {} bool haveFeatures(FeatureBitset ActiveFeatures) const { return (FeaturesRequired & ActiveFeatures) == FeaturesRequired; @@ -326,9 +326,10 @@ struct SysAlias { struct SysAliasReg : SysAlias { bool NeedsReg; - SysAliasReg(const char 
*N, uint16_t E, bool R) : SysAlias(N, E), NeedsReg(R) {}; - SysAliasReg(const char *N, uint16_t E, bool R, FeatureBitset F) : SysAlias(N, E, F), - NeedsReg(R) {}; + constexpr SysAliasReg(const char *N, uint16_t E, bool R) + : SysAlias(N, E), NeedsReg(R) {} + constexpr SysAliasReg(const char *N, uint16_t E, bool R, FeatureBitset F) + : SysAlias(N, E, F), NeedsReg(R) {} }; namespace AArch64AT{ @@ -627,6 +628,18 @@ namespace AArch64II { /// MO_S - Indicates that the bits of the symbol operand represented by /// MO_G0 etc are signed. MO_S = 0x100, + + /// MO_PREL - Indicates that the bits of the symbol operand represented by + /// MO_G0 etc are PC relative. + MO_PREL = 0x200, + + /// MO_TAGGED - With MO_PAGE, indicates that the page includes a memory tag + /// in bits 56-63. + /// On a FrameIndex operand, indicates that the underlying memory is tagged + /// with an unknown tag value (MTE); this needs to be lowered either to an + /// SP-relative load or store instruction (which do not check tags), or to + /// an LDG instruction to obtain the tag value. + MO_TAGGED = 0x400, }; } // end namespace AArch64II diff --git a/lib/Target/AMDGPU/AMDGPU.h b/lib/Target/AMDGPU/AMDGPU.h index 19a8bd901629..b64422ae5427 100644 --- a/lib/Target/AMDGPU/AMDGPU.h +++ b/lib/Target/AMDGPU/AMDGPU.h @@ -188,6 +188,10 @@ ModulePass *createAMDGPUAlwaysInlinePass(bool GlobalOpt = true); ModulePass *createR600OpenCLImageTypeLoweringPass(); FunctionPass *createAMDGPUAnnotateUniformValues(); +ModulePass *createAMDGPUPrintfRuntimeBinding(); +void initializeAMDGPUPrintfRuntimeBindingPass(PassRegistry&); +extern char &AMDGPUPrintfRuntimeBindingID; + ModulePass* createAMDGPUUnifyMetadataPass(); void initializeAMDGPUUnifyMetadataPass(PassRegistry&); extern char &AMDGPUUnifyMetadataID; diff --git a/lib/Target/AMDGPU/AMDGPU.td b/lib/Target/AMDGPU/AMDGPU.td index baeba534012c..42b477e07b3b 100644 --- a/lib/Target/AMDGPU/AMDGPU.td +++ b/lib/Target/AMDGPU/AMDGPU.td @@ -10,6 +10,15 @@ include "llvm/TableGen/SearchableTable.td" include "llvm/Target/Target.td" include "AMDGPUFeatures.td" +def p0 : PtrValueType; +def p1 : PtrValueType; +def p2 : PtrValueType; +def p3 : PtrValueType; +def p4 : PtrValueType; +def p5 : PtrValueType; +def p6 : PtrValueType; + + class BoolToList { list ret = !if(Value, [1], []); } @@ -145,6 +154,12 @@ def FeatureLdsMisalignedBug : SubtargetFeature<"lds-misaligned-bug", "Some GFX10 bug with misaligned multi-dword LDS access in WGP mode" >; +def FeatureMFMAInlineLiteralBug : SubtargetFeature<"mfma-inline-literal-bug", + "HasMFMAInlineLiteralBug", + "true", + "MFMA cannot use inline literal as SrcC" +>; + def FeatureVcmpxPermlaneHazard : SubtargetFeature<"vcmpx-permlane-hazard", "HasVcmpxPermlaneHazard", "true", @@ -802,6 +817,7 @@ def FeatureISAVersion9_0_8 : FeatureSet< FeaturePkFmacF16Inst, FeatureAtomicFaddInsts, FeatureSRAMECC, + FeatureMFMAInlineLiteralBug, FeatureCodeObjectV3]>; def FeatureISAVersion9_0_9 : FeatureSet< diff --git a/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp b/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp index 419ebb2240ad..e72b3f4fde63 100644 --- a/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp +++ b/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp @@ -173,6 +173,9 @@ static StringRef intrinsicToAttrName(Intrinsic::ID ID, case Intrinsic::amdgcn_implicitarg_ptr: return "amdgpu-implicitarg-ptr"; case Intrinsic::amdgcn_queue_ptr: + case Intrinsic::amdgcn_is_shared: + case Intrinsic::amdgcn_is_private: + // TODO: Does not require queue ptr on gfx9+ case Intrinsic::trap: 
case Intrinsic::debugtrap: IsQueuePtr = true; @@ -194,18 +197,12 @@ static bool handleAttr(Function &Parent, const Function &Callee, static void copyFeaturesToFunction(Function &Parent, const Function &Callee, bool &NeedQueuePtr) { // X ids unnecessarily propagated to kernels. - static const StringRef AttrNames[] = { - { "amdgpu-work-item-id-x" }, - { "amdgpu-work-item-id-y" }, - { "amdgpu-work-item-id-z" }, - { "amdgpu-work-group-id-x" }, - { "amdgpu-work-group-id-y" }, - { "amdgpu-work-group-id-z" }, - { "amdgpu-dispatch-ptr" }, - { "amdgpu-dispatch-id" }, - { "amdgpu-kernarg-segment-ptr" }, - { "amdgpu-implicitarg-ptr" } - }; + static constexpr StringLiteral AttrNames[] = { + "amdgpu-work-item-id-x", "amdgpu-work-item-id-y", + "amdgpu-work-item-id-z", "amdgpu-work-group-id-x", + "amdgpu-work-group-id-y", "amdgpu-work-group-id-z", + "amdgpu-dispatch-ptr", "amdgpu-dispatch-id", + "amdgpu-kernarg-segment-ptr", "amdgpu-implicitarg-ptr"}; if (handleAttr(Parent, Callee, "amdgpu-queue-ptr")) NeedQueuePtr = true; diff --git a/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h b/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h index 097730441ed8..f0e7ee910f95 100644 --- a/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h +++ b/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h @@ -48,8 +48,8 @@ public: return ArgDescriptor(Reg, Mask, false, true); } - static ArgDescriptor createStack(Register Reg, unsigned Mask = ~0u) { - return ArgDescriptor(Reg, Mask, true, true); + static ArgDescriptor createStack(unsigned Offset, unsigned Mask = ~0u) { + return ArgDescriptor(Offset, Mask, true, true); } static ArgDescriptor createArg(const ArgDescriptor &Arg, unsigned Mask) { diff --git a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp index 743ac64b8f10..f2d903c8e7b1 100644 --- a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -229,7 +229,7 @@ void AMDGPUAsmPrinter::EmitFunctionBodyEnd() { // alignment. Streamer.EmitValueToAlignment(64, 0, 1, 0); if (ReadOnlySection.getAlignment() < 64) - ReadOnlySection.setAlignment(64); + ReadOnlySection.setAlignment(Align(64)); const MCSubtargetInfo &STI = MF->getSubtarget(); @@ -273,7 +273,7 @@ void AMDGPUAsmPrinter::EmitFunctionEntryLabel() { AsmPrinter::EmitFunctionEntryLabel(); } -void AMDGPUAsmPrinter::EmitBasicBlockStart(const MachineBasicBlock &MBB) const { +void AMDGPUAsmPrinter::EmitBasicBlockStart(const MachineBasicBlock &MBB) { if (DumpCodeInstEmitter && !isBlockOnlyReachableByFallthrough(&MBB)) { // Write a line for the basic block label if it is not only fallthrough. DisasmLines.push_back( @@ -342,6 +342,8 @@ bool AMDGPUAsmPrinter::doFinalization(Module &M) { // Print comments that apply to both callable functions and entry points. 
void AMDGPUAsmPrinter::emitCommonFunctionComments( uint32_t NumVGPR, + Optional NumAGPR, + uint32_t TotalNumVGPR, uint32_t NumSGPR, uint64_t ScratchSize, uint64_t CodeSize, @@ -349,6 +351,11 @@ void AMDGPUAsmPrinter::emitCommonFunctionComments( OutStreamer->emitRawComment(" codeLenInByte = " + Twine(CodeSize), false); OutStreamer->emitRawComment(" NumSgprs: " + Twine(NumSGPR), false); OutStreamer->emitRawComment(" NumVgprs: " + Twine(NumVGPR), false); + if (NumAGPR) { + OutStreamer->emitRawComment(" NumAgprs: " + Twine(*NumAGPR), false); + OutStreamer->emitRawComment(" TotalNumVgprs: " + Twine(TotalNumVGPR), + false); + } OutStreamer->emitRawComment(" ScratchSize: " + Twine(ScratchSize), false); OutStreamer->emitRawComment(" MemoryBound: " + Twine(MFI->isMemoryBound()), false); @@ -417,7 +424,7 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { // The starting address of all shader programs must be 256 bytes aligned. // Regular functions just need the basic required instruction alignment. - MF.setAlignment(MFI->isEntryFunction() ? 8 : 2); + MF.setAlignment(MFI->isEntryFunction() ? Align(256) : Align(4)); SetupMachineFunction(MF); @@ -474,6 +481,8 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { SIFunctionResourceInfo &Info = CallGraphResourceInfo[&MF.getFunction()]; emitCommonFunctionComments( Info.NumVGPR, + STM.hasMAIInsts() ? Info.NumAGPR : Optional(), + Info.getTotalNumVGPRs(STM), Info.getTotalNumSGPRs(MF.getSubtarget()), Info.PrivateSegmentSize, getFunctionCodeSize(MF), MFI); @@ -481,7 +490,11 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { } OutStreamer->emitRawComment(" Kernel info:", false); - emitCommonFunctionComments(CurrentProgramInfo.NumVGPR, + emitCommonFunctionComments(CurrentProgramInfo.NumArchVGPR, + STM.hasMAIInsts() + ? CurrentProgramInfo.NumAccVGPR + : Optional(), + CurrentProgramInfo.NumVGPR, CurrentProgramInfo.NumSGPR, CurrentProgramInfo.ScratchSize, getFunctionCodeSize(MF), MFI); @@ -506,6 +519,10 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { " NumVGPRsForWavesPerEU: " + Twine(CurrentProgramInfo.NumVGPRsForWavesPerEU), false); + OutStreamer->emitRawComment( + " Occupancy: " + + Twine(CurrentProgramInfo.Occupancy), false); + OutStreamer->emitRawComment( " WaveLimiterHint : " + Twine(MFI->needsWaveLimiter()), false); @@ -588,6 +605,11 @@ int32_t AMDGPUAsmPrinter::SIFunctionResourceInfo::getTotalNumSGPRs( UsesVCC, UsesFlatScratch); } +int32_t AMDGPUAsmPrinter::SIFunctionResourceInfo::getTotalNumVGPRs( + const GCNSubtarget &ST) const { + return std::max(NumVGPR, NumAGPR); +} + AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage( const MachineFunction &MF) const { SIFunctionResourceInfo Info; @@ -634,11 +656,18 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage( HighestVGPRReg = Reg; break; } - MCPhysReg AReg = AMDGPU::AGPR0 + TRI.getHWRegIndex(Reg); - if (MRI.isPhysRegUsed(AReg)) { - HighestVGPRReg = AReg; - break; + } + + if (ST.hasMAIInsts()) { + MCPhysReg HighestAGPRReg = AMDGPU::NoRegister; + for (MCPhysReg Reg : reverse(AMDGPU::AGPR_32RegClass.getRegisters())) { + if (MRI.isPhysRegUsed(Reg)) { + HighestAGPRReg = Reg; + break; + } } + Info.NumAGPR = HighestAGPRReg == AMDGPU::NoRegister ? 
0 : + TRI.getHWRegIndex(HighestAGPRReg) + 1; } MCPhysReg HighestSGPRReg = AMDGPU::NoRegister; @@ -660,6 +689,7 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage( } int32_t MaxVGPR = -1; + int32_t MaxAGPR = -1; int32_t MaxSGPR = -1; uint64_t CalleeFrameSize = 0; @@ -669,11 +699,12 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage( for (const MachineOperand &MO : MI.operands()) { unsigned Width = 0; bool IsSGPR = false; + bool IsAGPR = false; if (!MO.isReg()) continue; - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); switch (Reg) { case AMDGPU::EXEC: case AMDGPU::EXEC_LO: @@ -744,6 +775,7 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage( Width = 1; } else if (AMDGPU::AGPR_32RegClass.contains(Reg)) { IsSGPR = false; + IsAGPR = true; Width = 1; } else if (AMDGPU::SReg_64RegClass.contains(Reg)) { assert(!AMDGPU::TTMP_64RegClass.contains(Reg) && @@ -755,6 +787,7 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage( Width = 2; } else if (AMDGPU::AReg_64RegClass.contains(Reg)) { IsSGPR = false; + IsAGPR = true; Width = 2; } else if (AMDGPU::VReg_96RegClass.contains(Reg)) { IsSGPR = false; @@ -771,6 +804,7 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage( Width = 4; } else if (AMDGPU::AReg_128RegClass.contains(Reg)) { IsSGPR = false; + IsAGPR = true; Width = 4; } else if (AMDGPU::SReg_256RegClass.contains(Reg)) { assert(!AMDGPU::TTMP_256RegClass.contains(Reg) && @@ -790,6 +824,7 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage( Width = 16; } else if (AMDGPU::AReg_512RegClass.contains(Reg)) { IsSGPR = false; + IsAGPR = true; Width = 16; } else if (AMDGPU::SReg_1024RegClass.contains(Reg)) { IsSGPR = true; @@ -799,6 +834,7 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage( Width = 32; } else if (AMDGPU::AReg_1024RegClass.contains(Reg)) { IsSGPR = false; + IsAGPR = true; Width = 32; } else { llvm_unreachable("Unknown register class"); @@ -807,6 +843,8 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage( int MaxUsed = HWReg + Width - 1; if (IsSGPR) { MaxSGPR = MaxUsed > MaxSGPR ? MaxUsed : MaxSGPR; + } else if (IsAGPR) { + MaxAGPR = MaxUsed > MaxAGPR ? MaxUsed : MaxAGPR; } else { MaxVGPR = MaxUsed > MaxVGPR ? 
MaxUsed : MaxVGPR; } @@ -828,6 +866,7 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage( 47 - IsaInfo::getNumExtraSGPRs(&ST, true, ST.hasFlatAddressSpace()); MaxSGPR = std::max(MaxSGPR, MaxSGPRGuess); MaxVGPR = std::max(MaxVGPR, 23); + MaxAGPR = std::max(MaxAGPR, 23); CalleeFrameSize = std::max(CalleeFrameSize, UINT64_C(16384)); Info.UsesVCC = true; @@ -852,6 +891,7 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage( MaxSGPR = std::max(I->second.NumExplicitSGPR - 1, MaxSGPR); MaxVGPR = std::max(I->second.NumVGPR - 1, MaxVGPR); + MaxAGPR = std::max(I->second.NumAGPR - 1, MaxAGPR); CalleeFrameSize = std::max(I->second.PrivateSegmentSize, CalleeFrameSize); Info.UsesVCC |= I->second.UsesVCC; @@ -868,6 +908,7 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage( Info.NumExplicitSGPR = MaxSGPR + 1; Info.NumVGPR = MaxVGPR + 1; + Info.NumAGPR = MaxAGPR + 1; Info.PrivateSegmentSize += CalleeFrameSize; return Info; @@ -876,8 +917,11 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage( void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, const MachineFunction &MF) { SIFunctionResourceInfo Info = analyzeResourceUsage(MF); + const GCNSubtarget &STM = MF.getSubtarget(); - ProgInfo.NumVGPR = Info.NumVGPR; + ProgInfo.NumArchVGPR = Info.NumVGPR; + ProgInfo.NumAccVGPR = Info.NumAGPR; + ProgInfo.NumVGPR = Info.getTotalNumVGPRs(STM); ProgInfo.NumSGPR = Info.NumExplicitSGPR; ProgInfo.ScratchSize = Info.PrivateSegmentSize; ProgInfo.VCCUsed = Info.UsesVCC; @@ -890,7 +934,6 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, MF.getFunction().getContext().diagnose(DiagStackSize); } - const GCNSubtarget &STM = MF.getSubtarget(); const SIMachineFunctionInfo *MFI = MF.getInfo(); // TODO(scott.linder): The calculations related to SGPR/VGPR blocks are @@ -1057,6 +1100,10 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, // For AMDHSA, LDS_SIZE must be zero, as it is populated by the CP. S_00B84C_LDS_SIZE(STM.isAmdHsaOS() ? 0 : ProgInfo.LDSBlocks) | S_00B84C_EXCP_EN(0); + + ProgInfo.Occupancy = STM.computeOccupancy(MF, ProgInfo.LDSSize, + ProgInfo.NumSGPRsForWavesPerEU, + ProgInfo.NumVGPRsForWavesPerEU); } static unsigned getRsrcReg(CallingConv::ID CallConv) { @@ -1214,17 +1261,16 @@ void AMDGPUAsmPrinter::getAmdKernelCode(amd_kernel_code_t &Out, if (STM.isXNACKEnabled()) Out.code_properties |= AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED; - unsigned MaxKernArgAlign; + Align MaxKernArgAlign; Out.kernarg_segment_byte_size = STM.getKernArgSegmentSize(F, MaxKernArgAlign); Out.wavefront_sgpr_count = CurrentProgramInfo.NumSGPR; Out.workitem_vgpr_count = CurrentProgramInfo.NumVGPR; Out.workitem_private_segment_byte_size = CurrentProgramInfo.ScratchSize; Out.workgroup_group_segment_byte_size = CurrentProgramInfo.LDSSize; - // These alignment values are specified in powers of two, so alignment = - // 2^n. The minimum alignment is 2^4 = 16. - Out.kernarg_segment_alignment = std::max(4, - countTrailingZeros(MaxKernArgAlign)); + // kernarg_segment_alignment is specified as log of the alignment. + // The minimum alignment is 16. 
+ Out.kernarg_segment_alignment = Log2(std::max(Align(16), MaxKernArgAlign)); } bool AMDGPUAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, diff --git a/lib/Target/AMDGPU/AMDGPUAsmPrinter.h b/lib/Target/AMDGPU/AMDGPUAsmPrinter.h index cf77034329ef..c50c19a4609c 100644 --- a/lib/Target/AMDGPU/AMDGPUAsmPrinter.h +++ b/lib/Target/AMDGPU/AMDGPUAsmPrinter.h @@ -43,6 +43,7 @@ private: // Track the number of explicitly used VGPRs. Special registers reserved at // the end are tracked separately. int32_t NumVGPR = 0; + int32_t NumAGPR = 0; int32_t NumExplicitSGPR = 0; uint64_t PrivateSegmentSize = 0; bool UsesVCC = false; @@ -51,6 +52,7 @@ private: bool HasRecursion = false; int32_t getTotalNumSGPRs(const GCNSubtarget &ST) const; + int32_t getTotalNumVGPRs(const GCNSubtarget &ST) const; }; SIProgramInfo CurrentProgramInfo; @@ -77,6 +79,8 @@ private: void EmitPALMetadata(const MachineFunction &MF, const SIProgramInfo &KernelInfo); void emitCommonFunctionComments(uint32_t NumVGPR, + Optional NumAGPR, + uint32_t TotalNumVGPR, uint32_t NumSGPR, uint64_t ScratchSize, uint64_t CodeSize, @@ -125,7 +129,7 @@ public: void EmitFunctionEntryLabel() override; - void EmitBasicBlockStart(const MachineBasicBlock &MBB) const override; + void EmitBasicBlockStart(const MachineBasicBlock &MBB) override; void EmitGlobalVariable(const GlobalVariable *GV) override; @@ -140,8 +144,8 @@ public: const char *ExtraCode, raw_ostream &O) override; protected: - mutable std::vector DisasmLines, HexLines; - mutable size_t DisasmLineMaxLen; + std::vector DisasmLines, HexLines; + size_t DisasmLineMaxLen; }; } // end namespace llvm diff --git a/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp b/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp index 8a92e7d923fb..ba8343142c63 100644 --- a/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp +++ b/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp @@ -15,6 +15,7 @@ #include "AMDGPU.h" #include "AMDGPUSubtarget.h" +#include "SIDefines.h" #include "llvm/Analysis/LegacyDivergenceAnalysis.h" #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/IR/IRBuilder.h" @@ -24,20 +25,10 @@ #define DEBUG_TYPE "amdgpu-atomic-optimizer" using namespace llvm; +using namespace llvm::AMDGPU; namespace { -enum DPP_CTRL { - DPP_ROW_SR1 = 0x111, - DPP_ROW_SR2 = 0x112, - DPP_ROW_SR3 = 0x113, - DPP_ROW_SR4 = 0x114, - DPP_ROW_SR8 = 0x118, - DPP_WF_SR1 = 0x138, - DPP_ROW_BCAST15 = 0x142, - DPP_ROW_BCAST31 = 0x143 -}; - struct ReplacementInfo { Instruction *I; AtomicRMWInst::BinOp Op; @@ -52,9 +43,12 @@ private: const LegacyDivergenceAnalysis *DA; const DataLayout *DL; DominatorTree *DT; - bool HasDPP; + const GCNSubtarget *ST; bool IsPixelShader; + Value *buildScan(IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value *V, + Value *const Identity) const; + Value *buildShiftRight(IRBuilder<> &B, Value *V, Value *const Identity) const; void optimizeAtomic(Instruction &I, AtomicRMWInst::BinOp Op, unsigned ValIdx, bool ValDivergent) const; @@ -93,8 +87,7 @@ bool AMDGPUAtomicOptimizer::runOnFunction(Function &F) { DT = DTW ? 
&DTW->getDomTree() : nullptr; const TargetPassConfig &TPC = getAnalysis(); const TargetMachine &TM = TPC.getTM(); - const GCNSubtarget &ST = TM.getSubtarget(F); - HasDPP = ST.hasDPP(); + ST = &TM.getSubtarget(F); IsPixelShader = F.getCallingConv() == CallingConv::AMDGPU_PS; visit(F); @@ -142,17 +135,18 @@ void AMDGPUAtomicOptimizer::visitAtomicRMWInst(AtomicRMWInst &I) { // If the pointer operand is divergent, then each lane is doing an atomic // operation on a different address, and we cannot optimize that. - if (DA->isDivergent(I.getOperand(PtrIdx))) { + if (DA->isDivergentUse(&I.getOperandUse(PtrIdx))) { return; } - const bool ValDivergent = DA->isDivergent(I.getOperand(ValIdx)); + const bool ValDivergent = DA->isDivergentUse(&I.getOperandUse(ValIdx)); // If the value operand is divergent, each lane is contributing a different // value to the atomic calculation. We can only optimize divergent values if // we have DPP available on our subtarget, and the atomic operation is 32 // bits. - if (ValDivergent && (!HasDPP || (DL->getTypeSizeInBits(I.getType()) != 32))) { + if (ValDivergent && + (!ST->hasDPP() || DL->getTypeSizeInBits(I.getType()) != 32)) { return; } @@ -219,20 +213,21 @@ void AMDGPUAtomicOptimizer::visitIntrinsicInst(IntrinsicInst &I) { const unsigned ValIdx = 0; - const bool ValDivergent = DA->isDivergent(I.getOperand(ValIdx)); + const bool ValDivergent = DA->isDivergentUse(&I.getOperandUse(ValIdx)); // If the value operand is divergent, each lane is contributing a different // value to the atomic calculation. We can only optimize divergent values if // we have DPP available on our subtarget, and the atomic operation is 32 // bits. - if (ValDivergent && (!HasDPP || (DL->getTypeSizeInBits(I.getType()) != 32))) { + if (ValDivergent && + (!ST->hasDPP() || DL->getTypeSizeInBits(I.getType()) != 32)) { return; } // If any of the other arguments to the intrinsic are divergent, we can't // optimize the operation. for (unsigned Idx = 1; Idx < I.getNumOperands(); Idx++) { - if (DA->isDivergent(I.getOperand(Idx))) { + if (DA->isDivergentUse(&I.getOperandUse(Idx))) { return; } } @@ -282,6 +277,111 @@ static Value *buildNonAtomicBinOp(IRBuilder<> &B, AtomicRMWInst::BinOp Op, return B.CreateSelect(Cond, LHS, RHS); } +// Use the builder to create an inclusive scan of V across the wavefront, with +// all lanes active. +Value *AMDGPUAtomicOptimizer::buildScan(IRBuilder<> &B, AtomicRMWInst::BinOp Op, + Value *V, Value *const Identity) const { + Type *const Ty = V->getType(); + Module *M = B.GetInsertBlock()->getModule(); + Function *UpdateDPP = + Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, Ty); + Function *PermLaneX16 = + Intrinsic::getDeclaration(M, Intrinsic::amdgcn_permlanex16, {}); + Function *ReadLane = + Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, {}); + + for (unsigned Idx = 0; Idx < 4; Idx++) { + V = buildNonAtomicBinOp( + B, Op, V, + B.CreateCall(UpdateDPP, + {Identity, V, B.getInt32(DPP::ROW_SHR0 | 1 << Idx), + B.getInt32(0xf), B.getInt32(0xf), B.getFalse()})); + } + if (ST->hasDPPBroadcasts()) { + // GFX9 has DPP row broadcast operations. + V = buildNonAtomicBinOp( + B, Op, V, + B.CreateCall(UpdateDPP, + {Identity, V, B.getInt32(DPP::BCAST15), B.getInt32(0xa), + B.getInt32(0xf), B.getFalse()})); + V = buildNonAtomicBinOp( + B, Op, V, + B.CreateCall(UpdateDPP, + {Identity, V, B.getInt32(DPP::BCAST31), B.getInt32(0xc), + B.getInt32(0xf), B.getFalse()})); + } else { + // On GFX10 all DPP operations are confined to a single row. 
To get cross- + // row operations we have to use permlane or readlane. + + // Combine lane 15 into lanes 16..31 (and, for wave 64, lane 47 into lanes + // 48..63). + Value *const PermX = + B.CreateCall(PermLaneX16, {V, V, B.getInt32(-1), B.getInt32(-1), + B.getFalse(), B.getFalse()}); + V = buildNonAtomicBinOp( + B, Op, V, + B.CreateCall(UpdateDPP, + {Identity, PermX, B.getInt32(DPP::QUAD_PERM_ID), + B.getInt32(0xa), B.getInt32(0xf), B.getFalse()})); + if (!ST->isWave32()) { + // Combine lane 31 into lanes 32..63. + Value *const Lane31 = B.CreateCall(ReadLane, {V, B.getInt32(31)}); + V = buildNonAtomicBinOp( + B, Op, V, + B.CreateCall(UpdateDPP, + {Identity, Lane31, B.getInt32(DPP::QUAD_PERM_ID), + B.getInt32(0xc), B.getInt32(0xf), B.getFalse()})); + } + } + return V; +} + +// Use the builder to create a shift right of V across the wavefront, with all +// lanes active, to turn an inclusive scan into an exclusive scan. +Value *AMDGPUAtomicOptimizer::buildShiftRight(IRBuilder<> &B, Value *V, + Value *const Identity) const { + Type *const Ty = V->getType(); + Module *M = B.GetInsertBlock()->getModule(); + Function *UpdateDPP = + Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, Ty); + Function *ReadLane = + Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, {}); + Function *WriteLane = + Intrinsic::getDeclaration(M, Intrinsic::amdgcn_writelane, {}); + + if (ST->hasDPPWavefrontShifts()) { + // GFX9 has DPP wavefront shift operations. + V = B.CreateCall(UpdateDPP, + {Identity, V, B.getInt32(DPP::WAVE_SHR1), B.getInt32(0xf), + B.getInt32(0xf), B.getFalse()}); + } else { + // On GFX10 all DPP operations are confined to a single row. To get cross- + // row operations we have to use permlane or readlane. + Value *Old = V; + V = B.CreateCall(UpdateDPP, + {Identity, V, B.getInt32(DPP::ROW_SHR0 + 1), + B.getInt32(0xf), B.getInt32(0xf), B.getFalse()}); + + // Copy the old lane 15 to the new lane 16. + V = B.CreateCall(WriteLane, {B.CreateCall(ReadLane, {Old, B.getInt32(15)}), + B.getInt32(16), V}); + + if (!ST->isWave32()) { + // Copy the old lane 31 to the new lane 32. + V = B.CreateCall( + WriteLane, + {B.CreateCall(ReadLane, {Old, B.getInt32(31)}), B.getInt32(32), V}); + + // Copy the old lane 47 to the new lane 48. + V = B.CreateCall( + WriteLane, + {B.CreateCall(ReadLane, {Old, B.getInt32(47)}), B.getInt32(48), V}); + } + } + + return V; +} + static APInt getIdentityValueForAtomicOp(AtomicRMWInst::BinOp Op, unsigned BitWidth) { switch (Op) { @@ -345,23 +445,29 @@ void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I, // We need to know how many lanes are active within the wavefront, and we do // this by doing a ballot of active lanes. + Type *const WaveTy = B.getIntNTy(ST->getWavefrontSize()); CallInst *const Ballot = B.CreateIntrinsic( - Intrinsic::amdgcn_icmp, {B.getInt64Ty(), B.getInt32Ty()}, + Intrinsic::amdgcn_icmp, {WaveTy, B.getInt32Ty()}, {B.getInt32(1), B.getInt32(0), B.getInt32(CmpInst::ICMP_NE)}); // We need to know how many lanes are active within the wavefront that are // below us. If we counted each lane linearly starting from 0, a lane is // below us only if its associated index was less than ours. We do this by // using the mbcnt intrinsic. 
- Value *const BitCast = B.CreateBitCast(Ballot, VecTy); - Value *const ExtractLo = B.CreateExtractElement(BitCast, B.getInt32(0)); - Value *const ExtractHi = B.CreateExtractElement(BitCast, B.getInt32(1)); - CallInst *const PartialMbcnt = B.CreateIntrinsic( - Intrinsic::amdgcn_mbcnt_lo, {}, {ExtractLo, B.getInt32(0)}); - Value *const Mbcnt = - B.CreateIntCast(B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_hi, {}, - {ExtractHi, PartialMbcnt}), - Ty, false); + Value *Mbcnt; + if (ST->isWave32()) { + Mbcnt = B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {}, + {Ballot, B.getInt32(0)}); + } else { + Value *const BitCast = B.CreateBitCast(Ballot, VecTy); + Value *const ExtractLo = B.CreateExtractElement(BitCast, B.getInt32(0)); + Value *const ExtractHi = B.CreateExtractElement(BitCast, B.getInt32(1)); + Mbcnt = B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {}, + {ExtractLo, B.getInt32(0)}); + Mbcnt = + B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_hi, {}, {ExtractHi, Mbcnt}); + } + Mbcnt = B.CreateIntCast(Mbcnt, Ty, false); Value *const Identity = B.getInt(getIdentityValueForAtomicOp(Op, TyBitWidth)); @@ -373,47 +479,25 @@ void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I, if (ValDivergent) { // First we need to set all inactive invocations to the identity value, so // that they can correctly contribute to the final result. - CallInst *const SetInactive = - B.CreateIntrinsic(Intrinsic::amdgcn_set_inactive, Ty, {V, Identity}); - - CallInst *const FirstDPP = - B.CreateIntrinsic(Intrinsic::amdgcn_update_dpp, Ty, - {Identity, SetInactive, B.getInt32(DPP_WF_SR1), - B.getInt32(0xf), B.getInt32(0xf), B.getFalse()}); - ExclScan = FirstDPP; - - const unsigned Iters = 7; - const unsigned DPPCtrl[Iters] = { - DPP_ROW_SR1, DPP_ROW_SR2, DPP_ROW_SR3, DPP_ROW_SR4, - DPP_ROW_SR8, DPP_ROW_BCAST15, DPP_ROW_BCAST31}; - const unsigned RowMask[Iters] = {0xf, 0xf, 0xf, 0xf, 0xf, 0xa, 0xc}; - const unsigned BankMask[Iters] = {0xf, 0xf, 0xf, 0xe, 0xc, 0xf, 0xf}; - - // This loop performs an exclusive scan across the wavefront, with all lanes - // active (by using the WWM intrinsic). - for (unsigned Idx = 0; Idx < Iters; Idx++) { - Value *const UpdateValue = Idx < 3 ? FirstDPP : ExclScan; - CallInst *const DPP = B.CreateIntrinsic( - Intrinsic::amdgcn_update_dpp, Ty, - {Identity, UpdateValue, B.getInt32(DPPCtrl[Idx]), - B.getInt32(RowMask[Idx]), B.getInt32(BankMask[Idx]), B.getFalse()}); - - ExclScan = buildNonAtomicBinOp(B, Op, ExclScan, DPP); - } + NewV = B.CreateIntrinsic(Intrinsic::amdgcn_set_inactive, Ty, {V, Identity}); - NewV = buildNonAtomicBinOp(B, Op, SetInactive, ExclScan); + const AtomicRMWInst::BinOp ScanOp = + Op == AtomicRMWInst::Sub ? AtomicRMWInst::Add : Op; + NewV = buildScan(B, ScanOp, NewV, Identity); + ExclScan = buildShiftRight(B, NewV, Identity); // Read the value from the last lane, which has accumlated the values of // each active lane in the wavefront. This will be our new value which we // will provide to the atomic operation. 
+ Value *const LastLaneIdx = B.getInt32(ST->getWavefrontSize() - 1); if (TyBitWidth == 64) { Value *const ExtractLo = B.CreateTrunc(NewV, B.getInt32Ty()); Value *const ExtractHi = - B.CreateTrunc(B.CreateLShr(NewV, B.getInt64(32)), B.getInt32Ty()); + B.CreateTrunc(B.CreateLShr(NewV, 32), B.getInt32Ty()); CallInst *const ReadLaneLo = B.CreateIntrinsic( - Intrinsic::amdgcn_readlane, {}, {ExtractLo, B.getInt32(63)}); + Intrinsic::amdgcn_readlane, {}, {ExtractLo, LastLaneIdx}); CallInst *const ReadLaneHi = B.CreateIntrinsic( - Intrinsic::amdgcn_readlane, {}, {ExtractHi, B.getInt32(63)}); + Intrinsic::amdgcn_readlane, {}, {ExtractHi, LastLaneIdx}); Value *const PartialInsert = B.CreateInsertElement( UndefValue::get(VecTy), ReadLaneLo, B.getInt32(0)); Value *const Insert = @@ -421,7 +505,7 @@ void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I, NewV = B.CreateBitCast(Insert, Ty); } else if (TyBitWidth == 32) { NewV = B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {}, - {NewV, B.getInt32(63)}); + {NewV, LastLaneIdx}); } else { llvm_unreachable("Unhandled atomic bit width"); } @@ -493,77 +577,80 @@ void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I, // original instruction. B.SetInsertPoint(&I); - // Create a PHI node to get our new atomic result into the exit block. - PHINode *const PHI = B.CreatePHI(Ty, 2); - PHI->addIncoming(UndefValue::get(Ty), EntryBB); - PHI->addIncoming(NewI, SingleLaneTerminator->getParent()); - - // We need to broadcast the value who was the lowest active lane (the first - // lane) to all other lanes in the wavefront. We use an intrinsic for this, - // but have to handle 64-bit broadcasts with two calls to this intrinsic. - Value *BroadcastI = nullptr; - - if (TyBitWidth == 64) { - Value *const ExtractLo = B.CreateTrunc(PHI, B.getInt32Ty()); - Value *const ExtractHi = - B.CreateTrunc(B.CreateLShr(PHI, B.getInt64(32)), B.getInt32Ty()); - CallInst *const ReadFirstLaneLo = - B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, ExtractLo); - CallInst *const ReadFirstLaneHi = - B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, ExtractHi); - Value *const PartialInsert = B.CreateInsertElement( - UndefValue::get(VecTy), ReadFirstLaneLo, B.getInt32(0)); - Value *const Insert = - B.CreateInsertElement(PartialInsert, ReadFirstLaneHi, B.getInt32(1)); - BroadcastI = B.CreateBitCast(Insert, Ty); - } else if (TyBitWidth == 32) { - - BroadcastI = B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, PHI); - } else { - llvm_unreachable("Unhandled atomic bit width"); - } + const bool NeedResult = !I.use_empty(); + if (NeedResult) { + // Create a PHI node to get our new atomic result into the exit block. + PHINode *const PHI = B.CreatePHI(Ty, 2); + PHI->addIncoming(UndefValue::get(Ty), EntryBB); + PHI->addIncoming(NewI, SingleLaneTerminator->getParent()); - // Now that we have the result of our single atomic operation, we need to - // get our individual lane's slice into the result. We use the lane offset we - // previously calculated combined with the atomic result value we got from the - // first lane, to get our lane's index into the atomic result. 
- Value *LaneOffset = nullptr; - if (ValDivergent) { - LaneOffset = B.CreateIntrinsic(Intrinsic::amdgcn_wwm, Ty, ExclScan); - } else { - switch (Op) { - default: - llvm_unreachable("Unhandled atomic op"); - case AtomicRMWInst::Add: - case AtomicRMWInst::Sub: - LaneOffset = B.CreateMul(V, Mbcnt); - break; - case AtomicRMWInst::And: - case AtomicRMWInst::Or: - case AtomicRMWInst::Max: - case AtomicRMWInst::Min: - case AtomicRMWInst::UMax: - case AtomicRMWInst::UMin: - LaneOffset = B.CreateSelect(Cond, Identity, V); - break; - case AtomicRMWInst::Xor: - LaneOffset = B.CreateMul(V, B.CreateAnd(Mbcnt, 1)); - break; + // We need to broadcast the value who was the lowest active lane (the first + // lane) to all other lanes in the wavefront. We use an intrinsic for this, + // but have to handle 64-bit broadcasts with two calls to this intrinsic. + Value *BroadcastI = nullptr; + + if (TyBitWidth == 64) { + Value *const ExtractLo = B.CreateTrunc(PHI, B.getInt32Ty()); + Value *const ExtractHi = + B.CreateTrunc(B.CreateLShr(PHI, 32), B.getInt32Ty()); + CallInst *const ReadFirstLaneLo = + B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, ExtractLo); + CallInst *const ReadFirstLaneHi = + B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, ExtractHi); + Value *const PartialInsert = B.CreateInsertElement( + UndefValue::get(VecTy), ReadFirstLaneLo, B.getInt32(0)); + Value *const Insert = + B.CreateInsertElement(PartialInsert, ReadFirstLaneHi, B.getInt32(1)); + BroadcastI = B.CreateBitCast(Insert, Ty); + } else if (TyBitWidth == 32) { + + BroadcastI = B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, PHI); + } else { + llvm_unreachable("Unhandled atomic bit width"); } - } - Value *const Result = buildNonAtomicBinOp(B, Op, BroadcastI, LaneOffset); - if (IsPixelShader) { - // Need a final PHI to reconverge to above the helper lane branch mask. - B.SetInsertPoint(PixelExitBB->getFirstNonPHI()); + // Now that we have the result of our single atomic operation, we need to + // get our individual lane's slice into the result. We use the lane offset + // we previously calculated combined with the atomic result value we got + // from the first lane, to get our lane's index into the atomic result. + Value *LaneOffset = nullptr; + if (ValDivergent) { + LaneOffset = B.CreateIntrinsic(Intrinsic::amdgcn_wwm, Ty, ExclScan); + } else { + switch (Op) { + default: + llvm_unreachable("Unhandled atomic op"); + case AtomicRMWInst::Add: + case AtomicRMWInst::Sub: + LaneOffset = B.CreateMul(V, Mbcnt); + break; + case AtomicRMWInst::And: + case AtomicRMWInst::Or: + case AtomicRMWInst::Max: + case AtomicRMWInst::Min: + case AtomicRMWInst::UMax: + case AtomicRMWInst::UMin: + LaneOffset = B.CreateSelect(Cond, Identity, V); + break; + case AtomicRMWInst::Xor: + LaneOffset = B.CreateMul(V, B.CreateAnd(Mbcnt, 1)); + break; + } + } + Value *const Result = buildNonAtomicBinOp(B, Op, BroadcastI, LaneOffset); - PHINode *const PHI = B.CreatePHI(Ty, 2); - PHI->addIncoming(UndefValue::get(Ty), PixelEntryBB); - PHI->addIncoming(Result, I.getParent()); - I.replaceAllUsesWith(PHI); - } else { - // Replace the original atomic instruction with the new one. - I.replaceAllUsesWith(Result); + if (IsPixelShader) { + // Need a final PHI to reconverge to above the helper lane branch mask. 
+ B.SetInsertPoint(PixelExitBB->getFirstNonPHI()); + + PHINode *const PHI = B.CreatePHI(Ty, 2); + PHI->addIncoming(UndefValue::get(Ty), PixelEntryBB); + PHI->addIncoming(Result, I.getParent()); + I.replaceAllUsesWith(PHI); + } else { + // Replace the original atomic instruction with the new one. + I.replaceAllUsesWith(Result); + } } // And delete the original. diff --git a/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/lib/Target/AMDGPU/AMDGPUCallLowering.cpp index b107c357196d..58c44acde1a7 100644 --- a/lib/Target/AMDGPU/AMDGPUCallLowering.cpp +++ b/lib/Target/AMDGPU/AMDGPUCallLowering.cpp @@ -30,13 +30,15 @@ using namespace llvm; namespace { -struct OutgoingArgHandler : public CallLowering::ValueHandler { - OutgoingArgHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI, - MachineInstrBuilder MIB, CCAssignFn *AssignFn) - : ValueHandler(MIRBuilder, MRI, AssignFn), MIB(MIB) {} +struct OutgoingValueHandler : public CallLowering::ValueHandler { + OutgoingValueHandler(MachineIRBuilder &B, MachineRegisterInfo &MRI, + MachineInstrBuilder MIB, CCAssignFn *AssignFn) + : ValueHandler(B, MRI, AssignFn), MIB(MIB) {} MachineInstrBuilder MIB; + bool isIncomingArgumentHandler() const override { return false; } + Register getStackAddress(uint64_t Size, int64_t Offset, MachinePointerInfo &MPO) override { llvm_unreachable("not implemented"); @@ -49,15 +51,96 @@ struct OutgoingArgHandler : public CallLowering::ValueHandler { void assignValueToReg(Register ValVReg, Register PhysReg, CCValAssign &VA) override { - MIB.addUse(PhysReg); - MIRBuilder.buildCopy(PhysReg, ValVReg); + Register ExtReg; + if (VA.getLocVT().getSizeInBits() < 32) { + // 16-bit types are reported as legal for 32-bit registers. We need to + // extend and do a 32-bit copy to avoid the verifier complaining about it. + ExtReg = MIRBuilder.buildAnyExt(LLT::scalar(32), ValVReg).getReg(0); + } else + ExtReg = extendRegister(ValVReg, VA); + + MIRBuilder.buildCopy(PhysReg, ExtReg); + MIB.addUse(PhysReg, RegState::Implicit); } bool assignArg(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, const CallLowering::ArgInfo &Info, + ISD::ArgFlagsTy Flags, CCState &State) override { - return AssignFn(ValNo, ValVT, LocVT, LocInfo, Info.Flags, State); + return AssignFn(ValNo, ValVT, LocVT, LocInfo, Flags, State); + } +}; + +struct IncomingArgHandler : public CallLowering::ValueHandler { + uint64_t StackUsed = 0; + + IncomingArgHandler(MachineIRBuilder &B, MachineRegisterInfo &MRI, + CCAssignFn *AssignFn) + : ValueHandler(B, MRI, AssignFn) {} + + Register getStackAddress(uint64_t Size, int64_t Offset, + MachinePointerInfo &MPO) override { + auto &MFI = MIRBuilder.getMF().getFrameInfo(); + int FI = MFI.CreateFixedObject(Size, Offset, true); + MPO = MachinePointerInfo::getFixedStack(MIRBuilder.getMF(), FI); + Register AddrReg = MRI.createGenericVirtualRegister( + LLT::pointer(AMDGPUAS::PRIVATE_ADDRESS, 32)); + MIRBuilder.buildFrameIndex(AddrReg, FI); + StackUsed = std::max(StackUsed, Size + Offset); + return AddrReg; + } + + void assignValueToReg(Register ValVReg, Register PhysReg, + CCValAssign &VA) override { + markPhysRegUsed(PhysReg); + + if (VA.getLocVT().getSizeInBits() < 32) { + // 16-bit types are reported as legal for 32-bit registers. We need to do + // a 32-bit copy, and truncate to avoid the verifier complaining about it. 
+ auto Copy = MIRBuilder.buildCopy(LLT::scalar(32), PhysReg); + MIRBuilder.buildTrunc(ValVReg, Copy); + return; + } + + switch (VA.getLocInfo()) { + case CCValAssign::LocInfo::SExt: + case CCValAssign::LocInfo::ZExt: + case CCValAssign::LocInfo::AExt: { + auto Copy = MIRBuilder.buildCopy(LLT{VA.getLocVT()}, PhysReg); + MIRBuilder.buildTrunc(ValVReg, Copy); + break; + } + default: + MIRBuilder.buildCopy(ValVReg, PhysReg); + break; + } + } + + void assignValueToAddress(Register ValVReg, Register Addr, uint64_t Size, + MachinePointerInfo &MPO, CCValAssign &VA) override { + // FIXME: Get alignment + auto MMO = MIRBuilder.getMF().getMachineMemOperand( + MPO, MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant, Size, 1); + MIRBuilder.buildLoad(ValVReg, Addr, *MMO); + } + + /// How the physical register gets marked varies between formal + /// parameters (it's a basic-block live-in), and a call instruction + /// (it's an implicit-def of the BL). + virtual void markPhysRegUsed(unsigned PhysReg) = 0; + + // FIXME: What is the point of this being a callback? + bool isIncomingArgumentHandler() const override { return true; } +}; + +struct FormalArgHandler : public IncomingArgHandler { + FormalArgHandler(MachineIRBuilder &B, MachineRegisterInfo &MRI, + CCAssignFn *AssignFn) + : IncomingArgHandler(B, MRI, AssignFn) {} + + void markPhysRegUsed(unsigned PhysReg) override { + MIRBuilder.getMBB().addLiveIn(PhysReg); } }; @@ -67,55 +150,198 @@ AMDGPUCallLowering::AMDGPUCallLowering(const AMDGPUTargetLowering &TLI) : CallLowering(&TLI) { } -bool AMDGPUCallLowering::lowerReturn(MachineIRBuilder &MIRBuilder, +void AMDGPUCallLowering::splitToValueTypes( + const ArgInfo &OrigArg, SmallVectorImpl &SplitArgs, + const DataLayout &DL, MachineRegisterInfo &MRI, CallingConv::ID CallConv, + SplitArgTy PerformArgSplit) const { + const SITargetLowering &TLI = *getTLI(); + LLVMContext &Ctx = OrigArg.Ty->getContext(); + + if (OrigArg.Ty->isVoidTy()) + return; + + SmallVector SplitVTs; + ComputeValueVTs(TLI, DL, OrigArg.Ty, SplitVTs); + + assert(OrigArg.Regs.size() == SplitVTs.size()); + + int SplitIdx = 0; + for (EVT VT : SplitVTs) { + unsigned NumParts = TLI.getNumRegistersForCallingConv(Ctx, CallConv, VT); + Type *Ty = VT.getTypeForEVT(Ctx); + + + + if (NumParts == 1) { + // No splitting to do, but we want to replace the original type (e.g. [1 x + // double] -> double). + SplitArgs.emplace_back(OrigArg.Regs[SplitIdx], Ty, + OrigArg.Flags, OrigArg.IsFixed); + + ++SplitIdx; + continue; + } + + LLT LLTy = getLLTForType(*Ty, DL); + + SmallVector SplitRegs; + + EVT PartVT = TLI.getRegisterTypeForCallingConv(Ctx, CallConv, VT); + Type *PartTy = PartVT.getTypeForEVT(Ctx); + LLT PartLLT = getLLTForType(*PartTy, DL); + + // FIXME: Should we be reporting all of the part registers for a single + // argument, and let handleAssignments take care of the repacking? + for (unsigned i = 0; i < NumParts; ++i) { + Register PartReg = MRI.createGenericVirtualRegister(PartLLT); + SplitRegs.push_back(PartReg); + SplitArgs.emplace_back(ArrayRef(PartReg), PartTy, OrigArg.Flags); + } + + PerformArgSplit(SplitRegs, LLTy, PartLLT, SplitIdx); + + ++SplitIdx; + } +} + +// Get the appropriate type to make \p OrigTy \p Factor times bigger. 
+static LLT getMultipleType(LLT OrigTy, int Factor) { + if (OrigTy.isVector()) { + return LLT::vector(OrigTy.getNumElements() * Factor, + OrigTy.getElementType()); + } + + return LLT::scalar(OrigTy.getSizeInBits() * Factor); +} + +// TODO: Move to generic code +static void unpackRegsToOrigType(MachineIRBuilder &B, + ArrayRef DstRegs, + Register SrcReg, + LLT SrcTy, + LLT PartTy) { + assert(DstRegs.size() > 1 && "Nothing to unpack"); + + MachineFunction &MF = B.getMF(); + MachineRegisterInfo &MRI = MF.getRegInfo(); + + const unsigned SrcSize = SrcTy.getSizeInBits(); + const unsigned PartSize = PartTy.getSizeInBits(); + + if (SrcTy.isVector() && !PartTy.isVector() && + PartSize > SrcTy.getElementType().getSizeInBits()) { + // Vector was scalarized, and the elements extended. + auto UnmergeToEltTy = B.buildUnmerge(SrcTy.getElementType(), + SrcReg); + for (int i = 0, e = DstRegs.size(); i != e; ++i) + B.buildAnyExt(DstRegs[i], UnmergeToEltTy.getReg(i)); + return; + } + + if (SrcSize % PartSize == 0) { + B.buildUnmerge(DstRegs, SrcReg); + return; + } + + const int NumRoundedParts = (SrcSize + PartSize - 1) / PartSize; + + LLT BigTy = getMultipleType(PartTy, NumRoundedParts); + auto ImpDef = B.buildUndef(BigTy); + + Register BigReg = MRI.createGenericVirtualRegister(BigTy); + B.buildInsert(BigReg, ImpDef.getReg(0), SrcReg, 0).getReg(0); + + int64_t Offset = 0; + for (unsigned i = 0, e = DstRegs.size(); i != e; ++i, Offset += PartSize) + B.buildExtract(DstRegs[i], BigReg, Offset); +} + +/// Lower the return value for the already existing \p Ret. This assumes that +/// \p B's insertion point is correct. +bool AMDGPUCallLowering::lowerReturnVal(MachineIRBuilder &B, + const Value *Val, ArrayRef VRegs, + MachineInstrBuilder &Ret) const { + if (!Val) + return true; + + auto &MF = B.getMF(); + const auto &F = MF.getFunction(); + const DataLayout &DL = MF.getDataLayout(); + + CallingConv::ID CC = F.getCallingConv(); + const SITargetLowering &TLI = *getTLI(); + MachineRegisterInfo &MRI = MF.getRegInfo(); + + ArgInfo OrigRetInfo(VRegs, Val->getType()); + setArgFlags(OrigRetInfo, AttributeList::ReturnIndex, DL, F); + SmallVector SplitRetInfos; + + splitToValueTypes( + OrigRetInfo, SplitRetInfos, DL, MRI, CC, + [&](ArrayRef Regs, LLT LLTy, LLT PartLLT, int VTSplitIdx) { + unpackRegsToOrigType(B, Regs, VRegs[VTSplitIdx], LLTy, PartLLT); + }); + + CCAssignFn *AssignFn = TLI.CCAssignFnForReturn(CC, F.isVarArg()); + + OutgoingValueHandler RetHandler(B, MF.getRegInfo(), Ret, AssignFn); + return handleAssignments(B, SplitRetInfos, RetHandler); +} + +bool AMDGPUCallLowering::lowerReturn(MachineIRBuilder &B, const Value *Val, ArrayRef VRegs) const { - MachineFunction &MF = MIRBuilder.getMF(); + MachineFunction &MF = B.getMF(); MachineRegisterInfo &MRI = MF.getRegInfo(); SIMachineFunctionInfo *MFI = MF.getInfo(); MFI->setIfReturnsVoid(!Val); - if (!Val) { - MIRBuilder.buildInstr(AMDGPU::S_ENDPGM).addImm(0); + assert(!Val == VRegs.empty() && "Return value without a vreg"); + + CallingConv::ID CC = B.getMF().getFunction().getCallingConv(); + const bool IsShader = AMDGPU::isShader(CC); + const bool IsWaveEnd = (IsShader && MFI->returnsVoid()) || + AMDGPU::isKernel(CC); + if (IsWaveEnd) { + B.buildInstr(AMDGPU::S_ENDPGM) + .addImm(0); return true; } - Register VReg = VRegs[0]; - - const Function &F = MF.getFunction(); - auto &DL = F.getParent()->getDataLayout(); - if (!AMDGPU::isShader(F.getCallingConv())) - return false; + auto const &ST = B.getMF().getSubtarget(); + unsigned ReturnOpc = + IsShader ? 
AMDGPU::SI_RETURN_TO_EPILOG : AMDGPU::S_SETPC_B64_return; - const AMDGPUTargetLowering &TLI = *getTLI(); - SmallVector SplitVTs; - SmallVector Offsets; - ArgInfo OrigArg{VReg, Val->getType()}; - setArgFlags(OrigArg, AttributeList::ReturnIndex, DL, F); - ComputeValueVTs(TLI, DL, OrigArg.Ty, SplitVTs, &Offsets, 0); - - SmallVector SplitArgs; - CCAssignFn *AssignFn = CCAssignFnForReturn(F.getCallingConv(), false); - for (unsigned i = 0, e = Offsets.size(); i != e; ++i) { - Type *SplitTy = SplitVTs[i].getTypeForEVT(F.getContext()); - SplitArgs.push_back({VRegs[i], SplitTy, OrigArg.Flags, OrigArg.IsFixed}); + auto Ret = B.buildInstrNoInsert(ReturnOpc); + Register ReturnAddrVReg; + if (ReturnOpc == AMDGPU::S_SETPC_B64_return) { + ReturnAddrVReg = MRI.createVirtualRegister(&AMDGPU::CCR_SGPR_64RegClass); + Ret.addUse(ReturnAddrVReg); } - auto RetInstr = MIRBuilder.buildInstrNoInsert(AMDGPU::SI_RETURN_TO_EPILOG); - OutgoingArgHandler Handler(MIRBuilder, MRI, RetInstr, AssignFn); - if (!handleAssignments(MIRBuilder, SplitArgs, Handler)) + + if (!lowerReturnVal(B, Val, VRegs, Ret)) return false; - MIRBuilder.insertInstr(RetInstr); + if (ReturnOpc == AMDGPU::S_SETPC_B64_return) { + const SIRegisterInfo *TRI = ST.getRegisterInfo(); + Register LiveInReturn = MF.addLiveIn(TRI->getReturnAddressReg(MF), + &AMDGPU::SGPR_64RegClass); + B.buildCopy(ReturnAddrVReg, LiveInReturn); + } + + // TODO: Handle CalleeSavedRegsViaCopy. + + B.insertInstr(Ret); return true; } -Register AMDGPUCallLowering::lowerParameterPtr(MachineIRBuilder &MIRBuilder, +Register AMDGPUCallLowering::lowerParameterPtr(MachineIRBuilder &B, Type *ParamTy, uint64_t Offset) const { - MachineFunction &MF = MIRBuilder.getMF(); + MachineFunction &MF = B.getMF(); const SIMachineFunctionInfo *MFI = MF.getInfo(); MachineRegisterInfo &MRI = MF.getRegInfo(); const Function &F = MF.getFunction(); @@ -128,79 +354,37 @@ Register AMDGPUCallLowering::lowerParameterPtr(MachineIRBuilder &MIRBuilder, Register KernArgSegmentVReg = MRI.getLiveInVirtReg(KernArgSegmentPtr); Register OffsetReg = MRI.createGenericVirtualRegister(LLT::scalar(64)); - MIRBuilder.buildConstant(OffsetReg, Offset); + B.buildConstant(OffsetReg, Offset); - MIRBuilder.buildGEP(DstReg, KernArgSegmentVReg, OffsetReg); + B.buildGEP(DstReg, KernArgSegmentVReg, OffsetReg); return DstReg; } -void AMDGPUCallLowering::lowerParameter(MachineIRBuilder &MIRBuilder, +void AMDGPUCallLowering::lowerParameter(MachineIRBuilder &B, Type *ParamTy, uint64_t Offset, unsigned Align, Register DstReg) const { - MachineFunction &MF = MIRBuilder.getMF(); + MachineFunction &MF = B.getMF(); const Function &F = MF.getFunction(); const DataLayout &DL = F.getParent()->getDataLayout(); PointerType *PtrTy = PointerType::get(ParamTy, AMDGPUAS::CONSTANT_ADDRESS); MachinePointerInfo PtrInfo(UndefValue::get(PtrTy)); unsigned TypeSize = DL.getTypeStoreSize(ParamTy); - Register PtrReg = lowerParameterPtr(MIRBuilder, ParamTy, Offset); + Register PtrReg = lowerParameterPtr(B, ParamTy, Offset); MachineMemOperand *MMO = MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOLoad | - MachineMemOperand::MONonTemporal | + MachineMemOperand::MODereferenceable | MachineMemOperand::MOInvariant, TypeSize, Align); - MIRBuilder.buildLoad(DstReg, PtrReg, *MMO); -} - -static Register findFirstFreeSGPR(CCState &CCInfo) { - unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs(); - for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) { - if (!CCInfo.isAllocated(AMDGPU::SGPR0 + Reg)) { - return AMDGPU::SGPR0 + Reg; - } - } - llvm_unreachable("Cannot 
allocate sgpr"); -} - -static void allocateSpecialEntryInputVGPRs(CCState &CCInfo, - MachineFunction &MF, - const SIRegisterInfo &TRI, - SIMachineFunctionInfo &Info) { - const LLT S32 = LLT::scalar(32); - MachineRegisterInfo &MRI = MF.getRegInfo(); - - if (Info.hasWorkItemIDX()) { - Register Reg = AMDGPU::VGPR0; - MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32); - - CCInfo.AllocateReg(Reg); - Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg)); - } - - if (Info.hasWorkItemIDY()) { - Register Reg = AMDGPU::VGPR1; - MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32); - - CCInfo.AllocateReg(Reg); - Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg)); - } - - if (Info.hasWorkItemIDZ()) { - Register Reg = AMDGPU::VGPR2; - MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32); - - CCInfo.AllocateReg(Reg); - Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg)); - } + B.buildLoad(DstReg, PtrReg, *MMO); } // Allocate special inputs passed in user SGPRs. static void allocateHSAUserSGPRs(CCState &CCInfo, - MachineIRBuilder &MIRBuilder, + MachineIRBuilder &B, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) { @@ -229,8 +413,8 @@ static void allocateHSAUserSGPRs(CCState &CCInfo, const LLT P4 = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); Register VReg = MRI.createGenericVirtualRegister(P4); MRI.addLiveIn(InputPtrReg, VReg); - MIRBuilder.getMBB().addLiveIn(InputPtrReg); - MIRBuilder.buildCopy(VReg, InputPtrReg); + B.getMBB().addLiveIn(InputPtrReg); + B.buildCopy(VReg, InputPtrReg); CCInfo.AllocateReg(InputPtrReg); } @@ -250,74 +434,22 @@ static void allocateHSAUserSGPRs(CCState &CCInfo, // these from the dispatch pointer. } -static void allocateSystemSGPRs(CCState &CCInfo, - MachineFunction &MF, - SIMachineFunctionInfo &Info, - CallingConv::ID CallConv, - bool IsShader) { - const LLT S32 = LLT::scalar(32); - MachineRegisterInfo &MRI = MF.getRegInfo(); - - if (Info.hasWorkGroupIDX()) { - Register Reg = Info.addWorkGroupIDX(); - MRI.setType(MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass), S32); - CCInfo.AllocateReg(Reg); - } - - if (Info.hasWorkGroupIDY()) { - Register Reg = Info.addWorkGroupIDY(); - MRI.setType(MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass), S32); - CCInfo.AllocateReg(Reg); - } - - if (Info.hasWorkGroupIDZ()) { - unsigned Reg = Info.addWorkGroupIDZ(); - MRI.setType(MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass), S32); - CCInfo.AllocateReg(Reg); - } - - if (Info.hasWorkGroupInfo()) { - unsigned Reg = Info.addWorkGroupInfo(); - MRI.setType(MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass), S32); - CCInfo.AllocateReg(Reg); - } - - if (Info.hasPrivateSegmentWaveByteOffset()) { - // Scratch wave offset passed in system SGPR. - unsigned PrivateSegmentWaveByteOffsetReg; - - if (IsShader) { - PrivateSegmentWaveByteOffsetReg = - Info.getPrivateSegmentWaveByteOffsetSystemSGPR(); - - // This is true if the scratch wave byte offset doesn't have a fixed - // location. 
- if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) { - PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo); - Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg); - } - } else - PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset(); - - MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass); - CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg); - } -} - bool AMDGPUCallLowering::lowerFormalArgumentsKernel( - MachineIRBuilder &MIRBuilder, const Function &F, + MachineIRBuilder &B, const Function &F, ArrayRef> VRegs) const { - MachineFunction &MF = MIRBuilder.getMF(); + MachineFunction &MF = B.getMF(); const GCNSubtarget *Subtarget = &MF.getSubtarget(); MachineRegisterInfo &MRI = MF.getRegInfo(); SIMachineFunctionInfo *Info = MF.getInfo(); - const SIRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); + const SIRegisterInfo *TRI = Subtarget->getRegisterInfo(); + const SITargetLowering &TLI = *getTLI(); + const DataLayout &DL = F.getParent()->getDataLayout(); SmallVector ArgLocs; CCState CCInfo(F.getCallingConv(), F.isVarArg(), MF, ArgLocs, F.getContext()); - allocateHSAUserSGPRs(CCInfo, MIRBuilder, MF, *TRI, *Info); + allocateHSAUserSGPRs(CCInfo, B, MF, *TRI, *Info); unsigned i = 0; const unsigned KernArgBaseAlign = 16; @@ -343,123 +475,242 @@ bool AMDGPUCallLowering::lowerFormalArgumentsKernel( : MRI.createGenericVirtualRegister(getLLTForType(*ArgTy, DL)); unsigned Align = MinAlign(KernArgBaseAlign, ArgOffset); ArgOffset = alignTo(ArgOffset, DL.getABITypeAlignment(ArgTy)); - lowerParameter(MIRBuilder, ArgTy, ArgOffset, Align, ArgReg); + lowerParameter(B, ArgTy, ArgOffset, Align, ArgReg); if (OrigArgRegs.size() > 1) - unpackRegs(OrigArgRegs, ArgReg, ArgTy, MIRBuilder); + unpackRegs(OrigArgRegs, ArgReg, ArgTy, B); ++i; } - allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info); - allocateSystemSGPRs(CCInfo, MF, *Info, F.getCallingConv(), false); + TLI.allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info); + TLI.allocateSystemSGPRs(CCInfo, MF, *Info, F.getCallingConv(), false); return true; } +// TODO: Move this to generic code +static void packSplitRegsToOrigType(MachineIRBuilder &B, + ArrayRef OrigRegs, + ArrayRef Regs, + LLT LLTy, + LLT PartLLT) { + if (!LLTy.isVector() && !PartLLT.isVector()) { + B.buildMerge(OrigRegs[0], Regs); + return; + } + + if (LLTy.isVector() && PartLLT.isVector()) { + assert(LLTy.getElementType() == PartLLT.getElementType()); + + int DstElts = LLTy.getNumElements(); + int PartElts = PartLLT.getNumElements(); + if (DstElts % PartElts == 0) + B.buildConcatVectors(OrigRegs[0], Regs); + else { + // Deal with v3s16 split into v2s16 + assert(PartElts == 2 && DstElts % 2 != 0); + int RoundedElts = PartElts * ((DstElts + PartElts - 1) / PartElts); + + LLT RoundedDestTy = LLT::vector(RoundedElts, PartLLT.getElementType()); + auto RoundedConcat = B.buildConcatVectors(RoundedDestTy, Regs); + B.buildExtract(OrigRegs[0], RoundedConcat, 0); + } + + return; + } + + assert(LLTy.isVector() && !PartLLT.isVector()); + + LLT DstEltTy = LLTy.getElementType(); + if (DstEltTy == PartLLT) { + // Vector was trivially scalarized. + B.buildBuildVector(OrigRegs[0], Regs); + } else if (DstEltTy.getSizeInBits() > PartLLT.getSizeInBits()) { + // Deal with vector with 64-bit elements decomposed to 32-bit + // registers. Need to create intermediate 64-bit elements. 
+ SmallVector EltMerges; + int PartsPerElt = DstEltTy.getSizeInBits() / PartLLT.getSizeInBits(); + + assert(DstEltTy.getSizeInBits() % PartLLT.getSizeInBits() == 0); + + for (int I = 0, NumElts = LLTy.getNumElements(); I != NumElts; ++I) { + auto Merge = B.buildMerge(DstEltTy, + Regs.take_front(PartsPerElt)); + EltMerges.push_back(Merge.getReg(0)); + Regs = Regs.drop_front(PartsPerElt); + } + + B.buildBuildVector(OrigRegs[0], EltMerges); + } else { + // Vector was split, and elements promoted to a wider type. + LLT BVType = LLT::vector(LLTy.getNumElements(), PartLLT); + auto BV = B.buildBuildVector(BVType, Regs); + B.buildTrunc(OrigRegs[0], BV); + } +} + bool AMDGPUCallLowering::lowerFormalArguments( - MachineIRBuilder &MIRBuilder, const Function &F, + MachineIRBuilder &B, const Function &F, ArrayRef> VRegs) const { + CallingConv::ID CC = F.getCallingConv(); + // The infrastructure for normal calling convention lowering is essentially // useless for kernels. We want to avoid any kind of legalization or argument // splitting. - if (F.getCallingConv() == CallingConv::AMDGPU_KERNEL) - return lowerFormalArgumentsKernel(MIRBuilder, F, VRegs); + if (CC == CallingConv::AMDGPU_KERNEL) + return lowerFormalArgumentsKernel(B, F, VRegs); - // AMDGPU_GS and AMDGP_HS are not supported yet. - if (F.getCallingConv() == CallingConv::AMDGPU_GS || - F.getCallingConv() == CallingConv::AMDGPU_HS) - return false; + const bool IsShader = AMDGPU::isShader(CC); + const bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CC); - MachineFunction &MF = MIRBuilder.getMF(); + MachineFunction &MF = B.getMF(); + MachineBasicBlock &MBB = B.getMBB(); MachineRegisterInfo &MRI = MF.getRegInfo(); SIMachineFunctionInfo *Info = MF.getInfo(); - const SIRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); + const GCNSubtarget &Subtarget = MF.getSubtarget(); + const SIRegisterInfo *TRI = Subtarget.getRegisterInfo(); const DataLayout &DL = F.getParent()->getDataLayout(); - bool IsShader = AMDGPU::isShader(F.getCallingConv()); SmallVector ArgLocs; - CCState CCInfo(F.getCallingConv(), F.isVarArg(), MF, ArgLocs, F.getContext()); + CCState CCInfo(CC, F.isVarArg(), MF, ArgLocs, F.getContext()); + + if (!IsEntryFunc) { + Register ReturnAddrReg = TRI->getReturnAddressReg(MF); + Register LiveInReturn = MF.addLiveIn(ReturnAddrReg, + &AMDGPU::SGPR_64RegClass); + MBB.addLiveIn(ReturnAddrReg); + B.buildCopy(LiveInReturn, ReturnAddrReg); + } if (Info->hasImplicitBufferPtr()) { - unsigned ImplicitBufferPtrReg = Info->addImplicitBufferPtr(*TRI); + Register ImplicitBufferPtrReg = Info->addImplicitBufferPtr(*TRI); MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass); CCInfo.AllocateReg(ImplicitBufferPtrReg); } - unsigned NumArgs = F.arg_size(); - Function::const_arg_iterator CurOrigArg = F.arg_begin(); - const AMDGPUTargetLowering &TLI = *getTLI(); + + SmallVector SplitArgs; + unsigned Idx = 0; unsigned PSInputNum = 0; - BitVector Skipped(NumArgs); - for (unsigned i = 0; i != NumArgs; ++i, ++CurOrigArg) { - EVT ValEVT = TLI.getValueType(DL, CurOrigArg->getType()); - - // We can only hanlde simple value types at the moment. 
- ISD::ArgFlagsTy Flags; - assert(VRegs[i].size() == 1 && "Can't lower into more than one register"); - ArgInfo OrigArg{VRegs[i][0], CurOrigArg->getType()}; - setArgFlags(OrigArg, i + 1, DL, F); - Flags.setOrigAlign(DL.getABITypeAlignment(CurOrigArg->getType())); - - if (F.getCallingConv() == CallingConv::AMDGPU_PS && - !OrigArg.Flags.isInReg() && !OrigArg.Flags.isByVal() && - PSInputNum <= 15) { - if (CurOrigArg->use_empty() && !Info->isPSInputAllocated(PSInputNum)) { - Skipped.set(i); - ++PSInputNum; + + for (auto &Arg : F.args()) { + if (DL.getTypeStoreSize(Arg.getType()) == 0) + continue; + + const bool InReg = Arg.hasAttribute(Attribute::InReg); + + // SGPR arguments to functions not implemented. + if (!IsShader && InReg) + return false; + + if (Arg.hasAttribute(Attribute::SwiftSelf) || + Arg.hasAttribute(Attribute::SwiftError) || + Arg.hasAttribute(Attribute::Nest)) + return false; + + if (CC == CallingConv::AMDGPU_PS && !InReg && PSInputNum <= 15) { + const bool ArgUsed = !Arg.use_empty(); + bool SkipArg = !ArgUsed && !Info->isPSInputAllocated(PSInputNum); + + if (!SkipArg) { + Info->markPSInputAllocated(PSInputNum); + if (ArgUsed) + Info->markPSInputEnabled(PSInputNum); + } + + ++PSInputNum; + + if (SkipArg) { + for (int I = 0, E = VRegs[Idx].size(); I != E; ++I) + B.buildUndef(VRegs[Idx][I]); + + ++Idx; continue; } + } - Info->markPSInputAllocated(PSInputNum); - if (!CurOrigArg->use_empty()) - Info->markPSInputEnabled(PSInputNum); + ArgInfo OrigArg(VRegs[Idx], Arg.getType()); + setArgFlags(OrigArg, Idx + AttributeList::FirstArgIndex, DL, F); - ++PSInputNum; + splitToValueTypes( + OrigArg, SplitArgs, DL, MRI, CC, + // FIXME: We should probably be passing multiple registers to + // handleAssignments to do this + [&](ArrayRef Regs, LLT LLTy, LLT PartLLT, int VTSplitIdx) { + packSplitRegsToOrigType(B, VRegs[Idx][VTSplitIdx], Regs, + LLTy, PartLLT); + }); + + ++Idx; + } + + // At least one interpolation mode must be enabled or else the GPU will + // hang. + // + // Check PSInputAddr instead of PSInputEnable. The idea is that if the user + // set PSInputAddr, the user wants to enable some bits after the compilation + // based on run-time states. Since we can't know what the final PSInputEna + // will look like, so we shouldn't do anything here and the user should take + // responsibility for the correct programming. + // + // Otherwise, the following restrictions apply: + // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled. + // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be + // enabled too. + if (CC == CallingConv::AMDGPU_PS) { + if ((Info->getPSInputAddr() & 0x7F) == 0 || + ((Info->getPSInputAddr() & 0xF) == 0 && + Info->isPSInputAllocated(11))) { + CCInfo.AllocateReg(AMDGPU::VGPR0); + CCInfo.AllocateReg(AMDGPU::VGPR1); + Info->markPSInputAllocated(0); + Info->markPSInputEnabled(0); } - CCAssignFn *AssignFn = CCAssignFnForCall(F.getCallingConv(), - /*IsVarArg=*/false); - - if (ValEVT.isVector()) { - EVT ElemVT = ValEVT.getVectorElementType(); - if (!ValEVT.isSimple()) - return false; - MVT ValVT = ElemVT.getSimpleVT(); - bool Res = AssignFn(i, ValVT, ValVT, CCValAssign::Full, - OrigArg.Flags, CCInfo); - if (!Res) - return false; - } else { - MVT ValVT = ValEVT.getSimpleVT(); - if (!ValEVT.isSimple()) - return false; - bool Res = - AssignFn(i, ValVT, ValVT, CCValAssign::Full, OrigArg.Flags, CCInfo); - - // Fail if we don't know how to handle this type. 
- if (Res) - return false; + if (Subtarget.isAmdPalOS()) { + // For isAmdPalOS, the user does not enable some bits after compilation + // based on run-time states; the register values being generated here are + // the final ones set in hardware. Therefore we need to apply the + // workaround to PSInputAddr and PSInputEnable together. (The case where + // a bit is set in PSInputAddr but not PSInputEnable is where the frontend + // set up an input arg for a particular interpolation mode, but nothing + // uses that input arg. Really we should have an earlier pass that removes + // such an arg.) + unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable(); + if ((PsInputBits & 0x7F) == 0 || + ((PsInputBits & 0xF) == 0 && + (PsInputBits >> 11 & 1))) + Info->markPSInputEnabled( + countTrailingZeros(Info->getPSInputAddr(), ZB_Undefined)); } } - Function::const_arg_iterator Arg = F.arg_begin(); - - if (F.getCallingConv() == CallingConv::AMDGPU_VS || - F.getCallingConv() == CallingConv::AMDGPU_PS) { - for (unsigned i = 0, OrigArgIdx = 0; - OrigArgIdx != NumArgs && i != ArgLocs.size(); ++Arg, ++OrigArgIdx) { - if (Skipped.test(OrigArgIdx)) - continue; - assert(VRegs[OrigArgIdx].size() == 1 && - "Can't lower into more than 1 reg"); - CCValAssign &VA = ArgLocs[i++]; - MRI.addLiveIn(VA.getLocReg(), VRegs[OrigArgIdx][0]); - MIRBuilder.getMBB().addLiveIn(VA.getLocReg()); - MIRBuilder.buildCopy(VRegs[OrigArgIdx][0], VA.getLocReg()); - } + const SITargetLowering &TLI = *getTLI(); + CCAssignFn *AssignFn = TLI.CCAssignFnForCall(CC, F.isVarArg()); - allocateSystemSGPRs(CCInfo, MF, *Info, F.getCallingConv(), IsShader); - return true; + if (!MBB.empty()) + B.setInstr(*MBB.begin()); + + FormalArgHandler Handler(B, MRI, AssignFn); + if (!handleAssignments(CCInfo, ArgLocs, B, SplitArgs, Handler)) + return false; + + if (!IsEntryFunc) { + // Special inputs come after user arguments. + TLI.allocateSpecialInputVGPRs(CCInfo, MF, *TRI, *Info); + } + + // Start adding system SGPRs. + if (IsEntryFunc) { + TLI.allocateSystemSGPRs(CCInfo, MF, *Info, CC, IsShader); + } else { + CCInfo.AllocateReg(Info->getScratchRSrcReg()); + CCInfo.AllocateReg(Info->getScratchWaveOffsetReg()); + CCInfo.AllocateReg(Info->getFrameOffsetReg()); + TLI.allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info); } - return false; + // Move back to the end of the basic block. + B.setMBB(MBB); + + return true; } diff --git a/lib/Target/AMDGPU/AMDGPUCallLowering.h b/lib/Target/AMDGPU/AMDGPUCallLowering.h index 3599659cac6a..53a562586bc0 100644 --- a/lib/Target/AMDGPU/AMDGPUCallLowering.h +++ b/lib/Target/AMDGPU/AMDGPUCallLowering.h @@ -20,26 +20,37 @@ namespace llvm { class AMDGPUTargetLowering; +class MachineInstrBuilder; class AMDGPUCallLowering: public CallLowering { - Register lowerParameterPtr(MachineIRBuilder &MIRBuilder, Type *ParamTy, + Register lowerParameterPtr(MachineIRBuilder &B, Type *ParamTy, uint64_t Offset) const; - void lowerParameter(MachineIRBuilder &MIRBuilder, Type *ParamTy, - uint64_t Offset, unsigned Align, - Register DstReg) const; + void lowerParameter(MachineIRBuilder &B, Type *ParamTy, uint64_t Offset, + unsigned Align, Register DstReg) const; - public: + /// A function of this type is used to perform value split action. 
+ using SplitArgTy = std::function, LLT, LLT, int)>; + + void splitToValueTypes(const ArgInfo &OrigArgInfo, + SmallVectorImpl &SplitArgs, + const DataLayout &DL, MachineRegisterInfo &MRI, + CallingConv::ID CallConv, + SplitArgTy SplitArg) const; + + bool lowerReturnVal(MachineIRBuilder &B, const Value *Val, + ArrayRef VRegs, MachineInstrBuilder &Ret) const; + +public: AMDGPUCallLowering(const AMDGPUTargetLowering &TLI); - bool lowerReturn(MachineIRBuilder &MIRBuilder, const Value *Val, + bool lowerReturn(MachineIRBuilder &B, const Value *Val, ArrayRef VRegs) const override; - bool lowerFormalArgumentsKernel(MachineIRBuilder &MIRBuilder, - const Function &F, + bool lowerFormalArgumentsKernel(MachineIRBuilder &B, const Function &F, ArrayRef> VRegs) const; - bool lowerFormalArguments(MachineIRBuilder &MIRBuilder, const Function &F, + bool lowerFormalArguments(MachineIRBuilder &B, const Function &F, ArrayRef> VRegs) const override; static CCAssignFn *CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg); static CCAssignFn *CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg); diff --git a/lib/Target/AMDGPU/AMDGPUCallingConv.td b/lib/Target/AMDGPU/AMDGPUCallingConv.td index 3688cd77542e..f8a54a61aac2 100644 --- a/lib/Target/AMDGPU/AMDGPUCallingConv.td +++ b/lib/Target/AMDGPU/AMDGPUCallingConv.td @@ -24,22 +24,9 @@ def CC_SI : CallingConv<[ SGPR16, SGPR17, SGPR18, SGPR19, SGPR20, SGPR21, SGPR22, SGPR23, SGPR24, SGPR25, SGPR26, SGPR27, SGPR28, SGPR29, SGPR30, SGPR31, SGPR32, SGPR33, SGPR34, SGPR35, SGPR36, SGPR37, SGPR38, SGPR39, - SGPR40, SGPR41, SGPR42, SGPR43, SGPR44, SGPR45, SGPR46, SGPR47, - SGPR48, SGPR49, SGPR50, SGPR51, SGPR52, SGPR53, SGPR54, SGPR55, - SGPR56, SGPR57, SGPR58, SGPR59, SGPR60, SGPR61, SGPR62, SGPR63, - SGPR64, SGPR65, SGPR66, SGPR67, SGPR68, SGPR69, SGPR70, SGPR71, - SGPR72, SGPR73, SGPR74, SGPR75, SGPR76, SGPR77, SGPR78, SGPR79, - SGPR80, SGPR81, SGPR82, SGPR83, SGPR84, SGPR85, SGPR86, SGPR87, - SGPR88, SGPR89, SGPR90, SGPR91, SGPR92, SGPR93, SGPR94, SGPR95, - SGPR96, SGPR97, SGPR98, SGPR99, SGPR100, SGPR101, SGPR102, SGPR103, - SGPR104, SGPR105 + SGPR40, SGPR41, SGPR42, SGPR43 ]>>>, - // We have no way of referring to the generated register tuples - // here, so use a custom function. - CCIfInReg>>, - CCIfByVal>>, - // 32*4 + 4 is the minimum for a fetch shader consumer with 32 inputs. CCIfNotInReg>, // 32*4 + 4 is the minimum for a fetch shader with 32 outputs. 
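// --- Illustrative sketch (not part of the imported sources) ---
// The SplitArgTy/splitToValueTypes interface declared in AMDGPUCallLowering.h
// above hands every split of an original IR value back through a
// caller-supplied callback, so the same splitting logic serves both
// directions: lowerReturnVal unpacks one wide register into parts, while
// lowerFormalArguments packs the parts back together. The standalone program
// below (all names hypothetical, 32-bit parts assumed) shows that callback
// pattern in miniature.
#include <cstdio>
#include <functional>
#include <vector>

using Part = unsigned; // stands in for a per-part virtual register

// Callback: receives the parts covering one original value, the bit widths
// needed to reassemble or unpack it, and the index of that value.
using SplitCallback =
    std::function<void(const std::vector<Part> &Parts, unsigned OrigBits,
                       unsigned PartBits, unsigned ValueIdx)>;

// Split a value of OrigBits into 32-bit parts and report them via CB.
static void splitToParts(unsigned OrigBits, unsigned ValueIdx,
                         const SplitCallback &CB) {
  const unsigned PartBits = 32;
  std::vector<Part> Parts;
  for (unsigned Covered = 0; Covered < OrigBits; Covered += PartBits)
    Parts.push_back(Covered / PartBits); // placeholder "register" per part
  CB(Parts, OrigBits, PartBits, ValueIdx);
}

int main() {
  // A 96-bit value splits into three 32-bit parts; the callback decides what
  // to do with them (here it only prints; a real caller packs or unpacks).
  splitToParts(96, 0,
               [](const std::vector<Part> &Parts, unsigned Orig,
                  unsigned PartBits, unsigned Idx) {
                 std::printf("value %u: %zu parts of %u bits (orig %u bits)\n",
                             Idx, Parts.size(), PartBits, Orig);
               });
  return 0;
}
// --- End of illustrative sketch ---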
@@ -138,7 +117,6 @@ def CC_AMDGPU_Func : CallingConv<[ VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15, VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23, VGPR24, VGPR25, VGPR26, VGPR27, VGPR28, VGPR29, VGPR30, VGPR31]>>, - CCIfType<[i64, f64, v2i32, v2f32, v3i32, v3f32, v4i32, v4f32, v5i32, v5f32, v8i32, v8f32, v16i32, v16f32, v2i64, v2f64, v4i16, v4f16], CCCustom<"allocateVGPRTuple">>, CCIfType<[i32, f32, v2i16, v2f16, i16, f16, i1], CCAssignToStack<4, 4>>, CCIfType<[i64, f64, v2i32, v2f32], CCAssignToStack<8, 4>>, CCIfType<[v3i32, v3f32], CCAssignToStack<12, 4>>, @@ -157,7 +135,6 @@ def RetCC_AMDGPU_Func : CallingConv<[ VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15, VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23, VGPR24, VGPR25, VGPR26, VGPR27, VGPR28, VGPR29, VGPR30, VGPR31]>>, - CCIfType<[i64, f64, v2i32, v2f32, v4i32, v4f32, v8i32, v8f32, v16i32, v16f32, v2i64, v2f64, v4i16, v4f16], CCCustom<"allocateVGPRTuple">> ]>; def CC_AMDGPU : CallingConv<[ diff --git a/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp index b750c6b5f6d2..1640a4a59ee2 100644 --- a/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp +++ b/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp @@ -55,6 +55,12 @@ static cl::opt WidenLoads( cl::ReallyHidden, cl::init(true)); +static cl::opt UseMul24Intrin( + "amdgpu-codegenprepare-mul24", + cl::desc("Introduce mul24 intrinsics in AMDGPUCodeGenPrepare"), + cl::ReallyHidden, + cl::init(true)); + class AMDGPUCodeGenPrepare : public FunctionPass, public InstVisitor { const GCNSubtarget *ST = nullptr; @@ -509,7 +515,9 @@ bool AMDGPUCodeGenPrepare::replaceMulWithMul24(BinaryOperator &I) const { } } - I.replaceAllUsesWith(insertValues(Builder, Ty, ResultVals)); + Value *NewVal = insertValues(Builder, Ty, ResultVals); + NewVal->takeName(&I); + I.replaceAllUsesWith(NewVal); I.eraseFromParent(); return true; @@ -879,7 +887,7 @@ bool AMDGPUCodeGenPrepare::visitBinaryOperator(BinaryOperator &I) { DA->isUniform(&I) && promoteUniformOpToI32(I)) return true; - if (replaceMulWithMul24(I)) + if (UseMul24Intrin && replaceMulWithMul24(I)) return true; bool Changed = false; diff --git a/lib/Target/AMDGPU/AMDGPUFrameLowering.cpp b/lib/Target/AMDGPU/AMDGPUFrameLowering.cpp index e80797736363..61ce83b30e00 100644 --- a/lib/Target/AMDGPU/AMDGPUFrameLowering.cpp +++ b/lib/Target/AMDGPU/AMDGPUFrameLowering.cpp @@ -13,9 +13,9 @@ #include "AMDGPUFrameLowering.h" using namespace llvm; -AMDGPUFrameLowering::AMDGPUFrameLowering(StackDirection D, unsigned StackAl, - int LAO, unsigned TransAl) - : TargetFrameLowering(D, StackAl, LAO, TransAl) { } +AMDGPUFrameLowering::AMDGPUFrameLowering(StackDirection D, Align StackAl, + int LAO, Align TransAl) + : TargetFrameLowering(D, StackAl, LAO, TransAl) {} AMDGPUFrameLowering::~AMDGPUFrameLowering() = default; diff --git a/lib/Target/AMDGPU/AMDGPUFrameLowering.h b/lib/Target/AMDGPU/AMDGPUFrameLowering.h index 48b64488303e..92e256cf2829 100644 --- a/lib/Target/AMDGPU/AMDGPUFrameLowering.h +++ b/lib/Target/AMDGPU/AMDGPUFrameLowering.h @@ -25,8 +25,8 @@ namespace llvm { /// See TargetFrameInfo for more comments. 
class AMDGPUFrameLowering : public TargetFrameLowering { public: - AMDGPUFrameLowering(StackDirection D, unsigned StackAl, int LAO, - unsigned TransAl = 1); + AMDGPUFrameLowering(StackDirection D, Align StackAl, int LAO, + Align TransAl = Align::None()); ~AMDGPUFrameLowering() override; /// \returns The number of 32-bit sub-registers that are used when storing diff --git a/lib/Target/AMDGPU/AMDGPUGISel.td b/lib/Target/AMDGPU/AMDGPUGISel.td index cad4c2ef404c..f2be1ca44d34 100644 --- a/lib/Target/AMDGPU/AMDGPUGISel.td +++ b/lib/Target/AMDGPU/AMDGPUGISel.td @@ -12,10 +12,6 @@ include "AMDGPU.td" -def p0 : PtrValueType; -def p1 : PtrValueType; -def p4 : PtrValueType; - def sd_vsrc0 : ComplexPattern; def gi_vsrc0 : GIComplexOperandMatcher, @@ -38,6 +34,18 @@ def gi_vop3omods : GIComplexOperandMatcher, GIComplexPatternEquiv; +def gi_vop3omods0clamp0omod : + GIComplexOperandMatcher, + GIComplexPatternEquiv; + +def gi_vop3opselmods0 : + GIComplexOperandMatcher, + GIComplexPatternEquiv; + +def gi_vop3opselmods : + GIComplexOperandMatcher, + GIComplexPatternEquiv; + def gi_smrd_imm : GIComplexOperandMatcher, GIComplexPatternEquiv; @@ -50,12 +58,19 @@ def gi_smrd_sgpr : GIComplexOperandMatcher, GIComplexPatternEquiv; +// FIXME: Why are the atomic versions separated? def gi_flat_offset : GIComplexOperandMatcher, GIComplexPatternEquiv; def gi_flat_offset_signed : GIComplexOperandMatcher, GIComplexPatternEquiv; +def gi_flat_atomic : + GIComplexOperandMatcher, + GIComplexPatternEquiv; +def gi_flat_signed_atomic : + GIComplexOperandMatcher, + GIComplexPatternEquiv; def gi_mubuf_scratch_offset : GIComplexOperandMatcher, @@ -64,6 +79,44 @@ def gi_mubuf_scratch_offen : GIComplexOperandMatcher, GIComplexPatternEquiv; +def gi_ds_1addr_1offset : + GIComplexOperandMatcher, + GIComplexPatternEquiv; + + +// Separate load nodes are defined to glue m0 initialization in +// SelectionDAG. The GISel selector can just insert m0 initialization +// directly before before selecting a glue-less load, so hide this +// distinction. + +def : GINodeEquiv { + let CheckMMOIsNonAtomic = 1; +} + +def : GINodeEquiv { + let CheckMMOIsNonAtomic = 1; +} + +def : GINodeEquiv { + bit CheckMMOIsAtomic = 1; +} + + + +def : GINodeEquiv; +def : GINodeEquiv; +def : GINodeEquiv; +def : GINodeEquiv; +def : GINodeEquiv; +def : GINodeEquiv; +def : GINodeEquiv; +def : GINodeEquiv; +def : GINodeEquiv; +def : GINodeEquiv; +def : GINodeEquiv; +def : GINodeEquiv; + +def : GINodeEquiv; class GISelSop2Pat < SDPatternOperator node, @@ -143,20 +196,6 @@ multiclass GISelVop2IntrPat < def : GISelSop2Pat ; def : GISelVop2Pat ; -// FIXME: We can't re-use SelectionDAG patterns here because they match -// against a custom SDNode and we would need to create a generic machine -// instruction that is equivalent to the custom SDNode. This would also require -// us to custom legalize the intrinsic to the new generic machine instruction, -// but I can't get custom legalizing of intrinsic to work and I'm not sure if -// this is even supported yet. -def : GISelVop3Pat2ModsPat < - int_amdgcn_cvt_pkrtz, V_CVT_PKRTZ_F16_F32_e64, v2f16, f32>; - -defm : GISelVop2IntrPat ; -def : GISelVop3Pat2ModsPat ; -defm : GISelVop2IntrPat ; -def : GISelVop3Pat2ModsPat ; - // Since GlobalISel is more flexible then SelectionDAG, I think we can get // away with adding patterns for integer types and not legalizing all // loads and stores to vector types. 
This should help simplify the load/store @@ -164,3 +203,6 @@ def : GISelVop3Pat2ModsPat ; foreach Ty = [i64, p0, p1, p4] in { defm : SMRD_Pattern <"S_LOAD_DWORDX2", Ty>; } + +def gi_as_i32timm : GICustomOperandRenderer<"renderTruncImm32">, + GISDNodeXFormEquiv; diff --git a/lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def b/lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def index 0a1f48231b18..85d1ad349157 100644 --- a/lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def +++ b/lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def @@ -22,15 +22,17 @@ enum PartialMappingIdx { PM_SGPR128 = 9, PM_SGPR256 = 10, PM_SGPR512 = 11, - PM_VGPR1 = 12, - PM_VGPR16 = 16, - PM_VGPR32 = 17, - PM_VGPR64 = 18, - PM_VGPR128 = 19, - PM_VGPR256 = 20, - PM_VGPR512 = 21, - PM_SGPR96 = 22, - PM_VGPR96 = 23 + PM_SGPR1024 = 12, + PM_VGPR1 = 13, + PM_VGPR16 = 17, + PM_VGPR32 = 18, + PM_VGPR64 = 19, + PM_VGPR128 = 20, + PM_VGPR256 = 21, + PM_VGPR512 = 22, + PM_VGPR1024 = 23, + PM_SGPR96 = 24, + PM_VGPR96 = 25 }; const RegisterBankInfo::PartialMapping PartMappings[] { @@ -45,6 +47,7 @@ const RegisterBankInfo::PartialMapping PartMappings[] { {0, 128, SGPRRegBank}, {0, 256, SGPRRegBank}, {0, 512, SGPRRegBank}, + {0, 1024, SGPRRegBank}, {0, 1, VGPRRegBank}, // VGPR begin {0, 16, VGPRRegBank}, @@ -53,8 +56,9 @@ const RegisterBankInfo::PartialMapping PartMappings[] { {0, 128, VGPRRegBank}, {0, 256, VGPRRegBank}, {0, 512, VGPRRegBank}, + {0, 1024, VGPRRegBank}, {0, 96, SGPRRegBank}, - {0, 96, VGPRRegBank}, + {0, 96, VGPRRegBank} }; const RegisterBankInfo::ValueMapping ValMappings[] { @@ -65,41 +69,43 @@ const RegisterBankInfo::ValueMapping ValMappings[] { {&PartMappings[1], 1}, // SGPRs - {&PartMappings[2], 1}, + {&PartMappings[2], 1}, // 1 {nullptr, 0}, // Illegal power of 2 sizes {nullptr, 0}, {nullptr, 0}, - {&PartMappings[3], 1}, - {&PartMappings[4], 1}, - {&PartMappings[5], 1}, - {&PartMappings[6], 1}, - {&PartMappings[7], 1}, - {&PartMappings[8], 1}, - - // VGPRs - {&PartMappings[9], 1}, + {&PartMappings[3], 1}, // 16 + {&PartMappings[4], 1}, // 32 + {&PartMappings[5], 1}, // 64 + {&PartMappings[6], 1}, // 128 + {&PartMappings[7], 1}, // 256 + {&PartMappings[8], 1}, // 512 + {&PartMappings[9], 1}, // 1024 + + // VGPRs + {&PartMappings[10], 1}, // 1 {nullptr, 0}, {nullptr, 0}, {nullptr, 0}, - {&PartMappings[10], 1}, - {&PartMappings[11], 1}, - {&PartMappings[12], 1}, - {&PartMappings[13], 1}, - {&PartMappings[14], 1}, - {&PartMappings[15], 1}, - {&PartMappings[16], 1}, - {&PartMappings[17], 1} + {&PartMappings[11], 1}, // 16 + {&PartMappings[12], 1}, // 32 + {&PartMappings[13], 1}, // 64 + {&PartMappings[14], 1}, // 128 + {&PartMappings[15], 1}, // 256 + {&PartMappings[16], 1}, // 512 + {&PartMappings[17], 1}, // 1024 + {&PartMappings[18], 1}, + {&PartMappings[19], 1} }; const RegisterBankInfo::PartialMapping SGPROnly64BreakDown[] { - /*32-bit op*/ {0, 32, SGPRRegBank}, - /*2x32-bit op*/ {0, 32, SGPRRegBank}, - {32, 32, SGPRRegBank}, -/*<2x32-bit> op*/ {0, 64, SGPRRegBank}, - - /*32-bit op*/ {0, 32, VGPRRegBank}, - /*2x32-bit op*/ {0, 32, VGPRRegBank}, - {32, 32, VGPRRegBank}, + {0, 32, SGPRRegBank}, // 32-bit op + {0, 32, SGPRRegBank}, // 2x32-bit op + {32, 32, SGPRRegBank}, + {0, 64, SGPRRegBank}, // <2x32-bit> op + + {0, 32, VGPRRegBank}, // 32-bit op + {0, 32, VGPRRegBank}, // 2x32-bit op + {32, 32, VGPRRegBank}, }; @@ -116,7 +122,7 @@ const RegisterBankInfo::ValueMapping ValMappingsSGPR64OnlyVGPR32[] { enum ValueMappingIdx { SCCStartIdx = 0, SGPRStartIdx = 2, - VGPRStartIdx = 12 + VGPRStartIdx = 13 }; const 
RegisterBankInfo::ValueMapping *getValueMapping(unsigned BankID, diff --git a/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp b/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp index b31de0af5018..9f5bcd8ff5f0 100644 --- a/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp +++ b/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp @@ -218,12 +218,13 @@ MetadataStreamerV2::getHSACodeProps(const MachineFunction &MF, assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL || F.getCallingConv() == CallingConv::SPIR_KERNEL); - unsigned MaxKernArgAlign; + Align MaxKernArgAlign; HSACodeProps.mKernargSegmentSize = STM.getKernArgSegmentSize(F, MaxKernArgAlign); HSACodeProps.mGroupSegmentFixedSize = ProgramInfo.LDSSize; HSACodeProps.mPrivateSegmentFixedSize = ProgramInfo.ScratchSize; - HSACodeProps.mKernargSegmentAlign = std::max(MaxKernArgAlign, 4u); + HSACodeProps.mKernargSegmentAlign = + std::max(MaxKernArgAlign, Align(4)).value(); HSACodeProps.mWavefrontSize = STM.getWavefrontSize(); HSACodeProps.mNumSGPRs = ProgramInfo.NumSGPR; HSACodeProps.mNumVGPRs = ProgramInfo.NumVGPR; @@ -883,7 +884,7 @@ MetadataStreamerV3::getHSAKernelProps(const MachineFunction &MF, auto Kern = HSAMetadataDoc->getMapNode(); - unsigned MaxKernArgAlign; + Align MaxKernArgAlign; Kern[".kernarg_segment_size"] = Kern.getDocument()->getNode( STM.getKernArgSegmentSize(F, MaxKernArgAlign)); Kern[".group_segment_fixed_size"] = @@ -891,7 +892,7 @@ MetadataStreamerV3::getHSAKernelProps(const MachineFunction &MF, Kern[".private_segment_fixed_size"] = Kern.getDocument()->getNode(ProgramInfo.ScratchSize); Kern[".kernarg_segment_align"] = - Kern.getDocument()->getNode(std::max(uint32_t(4), MaxKernArgAlign)); + Kern.getDocument()->getNode(std::max(Align(4), MaxKernArgAlign).value()); Kern[".wavefront_size"] = Kern.getDocument()->getNode(STM.getWavefrontSize()); Kern[".sgpr_count"] = Kern.getDocument()->getNode(ProgramInfo.NumSGPR); diff --git a/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h b/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h index 2eecddbd7b01..80ac8ca67bcd 100644 --- a/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h +++ b/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h @@ -52,7 +52,7 @@ public: class MetadataStreamerV3 final : public MetadataStreamer { private: std::unique_ptr HSAMetadataDoc = - llvm::make_unique(); + std::make_unique(); void dump(StringRef HSAMetadataString) const; diff --git a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index ea730539f834..f330bd7ebcdd 100644 --- a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -172,8 +172,9 @@ private: MachineSDNode *buildSMovImm64(SDLoc &DL, uint64_t Val, EVT VT) const; - SDNode *glueCopyToM0LDSInit(SDNode *N) const; + SDNode *glueCopyToOp(SDNode *N, SDValue NewChain, SDValue Glue) const; SDNode *glueCopyToM0(SDNode *N, SDValue Val) const; + SDNode *glueCopyToM0LDSInit(SDNode *N) const; const TargetRegisterClass *getOperandRegClass(SDNode *N, unsigned OpNo) const; virtual bool SelectADDRVTX_READ(SDValue Addr, SDValue &Base, SDValue &Offset); @@ -186,10 +187,11 @@ private: bool SelectMUBUF(SDValue Addr, SDValue &SRsrc, SDValue &VAddr, SDValue &SOffset, SDValue &Offset, SDValue &Offen, SDValue &Idxen, SDValue &Addr64, SDValue &GLC, SDValue &SLC, - SDValue &TFE, SDValue &DLC) const; + SDValue &TFE, SDValue &DLC, SDValue &SWZ) const; bool SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, SDValue &VAddr, SDValue &SOffset, SDValue &Offset, SDValue &GLC, - SDValue &SLC, SDValue &TFE, SDValue &DLC) const; + 
SDValue &SLC, SDValue &TFE, SDValue &DLC, + SDValue &SWZ) const; bool SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, SDValue &VAddr, SDValue &SOffset, SDValue &Offset, SDValue &SLC) const; @@ -202,21 +204,20 @@ private: bool SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &SOffset, SDValue &Offset, SDValue &GLC, SDValue &SLC, - SDValue &TFE, SDValue &DLC) const; + SDValue &TFE, SDValue &DLC, SDValue &SWZ) const; bool SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &Soffset, SDValue &Offset, SDValue &SLC) const; bool SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &Soffset, SDValue &Offset) const; + template + bool SelectFlatOffset(SDNode *N, SDValue Addr, SDValue &VAddr, + SDValue &Offset, SDValue &SLC) const; bool SelectFlatAtomic(SDNode *N, SDValue Addr, SDValue &VAddr, SDValue &Offset, SDValue &SLC) const; bool SelectFlatAtomicSigned(SDNode *N, SDValue Addr, SDValue &VAddr, SDValue &Offset, SDValue &SLC) const; - template - bool SelectFlatOffset(SDNode *N, SDValue Addr, SDValue &VAddr, - SDValue &Offset, SDValue &SLC) const; - bool SelectSMRDOffset(SDValue ByteOffsetNode, SDValue &Offset, bool &Imm) const; SDValue Expand32BitAddress(SDValue Addr) const; @@ -262,6 +263,8 @@ private: SDValue getHi16Elt(SDValue In) const; + SDValue getMaterializedScalarImm32(int64_t Val, const SDLoc &DL) const; + void SelectADD_SUB_I64(SDNode *N); void SelectAddcSubb(SDNode *N); void SelectUADDO_USUBO(SDNode *N); @@ -282,6 +285,7 @@ private: void SelectDSAppendConsume(SDNode *N, unsigned IntrID); void SelectDS_GWS(SDNode *N, unsigned IntrID); void SelectINTRINSIC_W_CHAIN(SDNode *N); + void SelectINTRINSIC_WO_CHAIN(SDNode *N); void SelectINTRINSIC_VOID(SDNode *N); protected: @@ -543,7 +547,7 @@ const TargetRegisterClass *AMDGPUDAGToDAGISel::getOperandRegClass(SDNode *N, if (!N->isMachineOpcode()) { if (N->getOpcode() == ISD::CopyToReg) { unsigned Reg = cast(N->getOperand(1))->getReg(); - if (TargetRegisterInfo::isVirtualRegister(Reg)) { + if (Register::isVirtualRegister(Reg)) { MachineRegisterInfo &MRI = CurDAG->getMachineFunction().getRegInfo(); return MRI.getRegClass(Reg); } @@ -582,19 +586,10 @@ const TargetRegisterClass *AMDGPUDAGToDAGISel::getOperandRegClass(SDNode *N, } } -SDNode *AMDGPUDAGToDAGISel::glueCopyToM0(SDNode *N, SDValue Val) const { - const SITargetLowering& Lowering = - *static_cast(getTargetLowering()); - - assert(N->getOperand(0).getValueType() == MVT::Other && "Expected chain"); - - SDValue M0 = Lowering.copyToM0(*CurDAG, N->getOperand(0), SDLoc(N), - Val); - - SDValue Glue = M0.getValue(1); - +SDNode *AMDGPUDAGToDAGISel::glueCopyToOp(SDNode *N, SDValue NewChain, + SDValue Glue) const { SmallVector Ops; - Ops.push_back(M0); // Replace the chain. + Ops.push_back(NewChain); // Replace the chain. 
for (unsigned i = 1, e = N->getNumOperands(); i != e; ++i) Ops.push_back(N->getOperand(i)); @@ -602,6 +597,16 @@ SDNode *AMDGPUDAGToDAGISel::glueCopyToM0(SDNode *N, SDValue Val) const { return CurDAG->MorphNodeTo(N, N->getOpcode(), N->getVTList(), Ops); } +SDNode *AMDGPUDAGToDAGISel::glueCopyToM0(SDNode *N, SDValue Val) const { + const SITargetLowering& Lowering = + *static_cast(getTargetLowering()); + + assert(N->getOperand(0).getValueType() == MVT::Other && "Expected chain"); + + SDValue M0 = Lowering.copyToM0(*CurDAG, N->getOperand(0), SDLoc(N), Val); + return glueCopyToOp(N, M0, M0.getValue(1)); +} + SDNode *AMDGPUDAGToDAGISel::glueCopyToM0LDSInit(SDNode *N) const { unsigned AS = cast(N)->getAddressSpace(); if (AS == AMDGPUAS::LOCAL_ADDRESS) { @@ -635,13 +640,13 @@ MachineSDNode *AMDGPUDAGToDAGISel::buildSMovImm64(SDLoc &DL, uint64_t Imm, static unsigned selectSGPRVectorRegClassID(unsigned NumVectorElts) { switch (NumVectorElts) { case 1: - return AMDGPU::SReg_32_XM0RegClassID; + return AMDGPU::SReg_32RegClassID; case 2: return AMDGPU::SReg_64RegClassID; case 3: return AMDGPU::SGPR_96RegClassID; case 4: - return AMDGPU::SReg_128RegClassID; + return AMDGPU::SGPR_128RegClassID; case 5: return AMDGPU::SGPR_160RegClassID; case 8: @@ -713,12 +718,17 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) { return; // Already selected. } - if (isa(N) || + // isa almost works but is slightly too permissive for some DS + // intrinsics. + if (Opc == ISD::LOAD || Opc == ISD::STORE || isa(N) || (Opc == AMDGPUISD::ATOMIC_INC || Opc == AMDGPUISD::ATOMIC_DEC || Opc == ISD::ATOMIC_LOAD_FADD || Opc == AMDGPUISD::ATOMIC_LOAD_FMIN || - Opc == AMDGPUISD::ATOMIC_LOAD_FMAX)) + Opc == AMDGPUISD::ATOMIC_LOAD_FMAX)) { N = glueCopyToM0LDSInit(N); + SelectCode(N); + return; + } switch (Opc) { default: @@ -781,7 +791,7 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) { SDValue RC, SubReg0, SubReg1; SDLoc DL(N); if (N->getValueType(0) == MVT::i128) { - RC = CurDAG->getTargetConstant(AMDGPU::SReg_128RegClassID, DL, MVT::i32); + RC = CurDAG->getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32); SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32); SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32); } else if (N->getValueType(0) == MVT::i64) { @@ -815,14 +825,6 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) { ReplaceNode(N, buildSMovImm64(DL, Imm, N->getValueType(0))); return; } - case ISD::LOAD: - case ISD::STORE: - case ISD::ATOMIC_LOAD: - case ISD::ATOMIC_STORE: { - N = glueCopyToM0LDSInit(N); - break; - } - case AMDGPUISD::BFE_I32: case AMDGPUISD::BFE_U32: { // There is a scalar version available, but unlike the vector version which @@ -908,6 +910,10 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) { SelectINTRINSIC_W_CHAIN(N); return; } + case ISD::INTRINSIC_WO_CHAIN: { + SelectINTRINSIC_WO_CHAIN(N); + return; + } case ISD::INTRINSIC_VOID: { SelectINTRINSIC_VOID(N); return; @@ -961,6 +967,14 @@ bool AMDGPUDAGToDAGISel::SelectADDRIndirect(SDValue Addr, SDValue &Base, return true; } +SDValue AMDGPUDAGToDAGISel::getMaterializedScalarImm32(int64_t Val, + const SDLoc &DL) const { + SDNode *Mov = CurDAG->getMachineNode( + AMDGPU::S_MOV_B32, DL, MVT::i32, + CurDAG->getTargetConstant(Val, DL, MVT::i32)); + return SDValue(Mov, 0); +} + // FIXME: Should only handle addcarry/subcarry void AMDGPUDAGToDAGISel::SelectADD_SUB_I64(SDNode *N) { SDLoc DL(N); @@ -1308,7 +1322,8 @@ bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr, SDValue &Offset, SDValue &Offen, SDValue &Idxen, SDValue 
&Addr64, SDValue &GLC, SDValue &SLC, - SDValue &TFE, SDValue &DLC) const { + SDValue &TFE, SDValue &DLC, + SDValue &SWZ) const { // Subtarget prefers to use flat instruction if (Subtarget->useFlatForGlobal()) return false; @@ -1321,6 +1336,7 @@ bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr, SLC = CurDAG->getTargetConstant(0, DL, MVT::i1); TFE = CurDAG->getTargetConstant(0, DL, MVT::i1); DLC = CurDAG->getTargetConstant(0, DL, MVT::i1); + SWZ = CurDAG->getTargetConstant(0, DL, MVT::i1); Idxen = CurDAG->getTargetConstant(0, DL, MVT::i1); Offen = CurDAG->getTargetConstant(0, DL, MVT::i1); @@ -1400,7 +1416,7 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, SDValue &VAddr, SDValue &SOffset, SDValue &Offset, SDValue &GLC, SDValue &SLC, SDValue &TFE, - SDValue &DLC) const { + SDValue &DLC, SDValue &SWZ) const { SDValue Ptr, Offen, Idxen, Addr64; // addr64 bit was removed for volcanic islands. @@ -1408,7 +1424,7 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, return false; if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64, - GLC, SLC, TFE, DLC)) + GLC, SLC, TFE, DLC, SWZ)) return false; ConstantSDNode *C = cast(Addr64); @@ -1430,9 +1446,9 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, SDValue &Offset, SDValue &SLC) const { SLC = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i1); - SDValue GLC, TFE, DLC; + SDValue GLC, TFE, DLC, SWZ; - return SelectMUBUFAddr64(Addr, SRsrc, VAddr, SOffset, Offset, GLC, SLC, TFE, DLC); + return SelectMUBUFAddr64(Addr, SRsrc, VAddr, SOffset, Offset, GLC, SLC, TFE, DLC, SWZ); } static bool isStackPtrRelative(const MachinePointerInfo &PtrInfo) { @@ -1557,13 +1573,14 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffset(SDNode *Parent, bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &SOffset, SDValue &Offset, SDValue &GLC, SDValue &SLC, - SDValue &TFE, SDValue &DLC) const { + SDValue &TFE, SDValue &DLC, + SDValue &SWZ) const { SDValue Ptr, VAddr, Offen, Idxen, Addr64; const SIInstrInfo *TII = static_cast(Subtarget->getInstrInfo()); if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64, - GLC, SLC, TFE, DLC)) + GLC, SLC, TFE, DLC, SWZ)) return false; if (!cast(Offen)->getSExtValue() && @@ -1585,16 +1602,30 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &Soffset, SDValue &Offset ) const { - SDValue GLC, SLC, TFE, DLC; + SDValue GLC, SLC, TFE, DLC, SWZ; - return SelectMUBUFOffset(Addr, SRsrc, Soffset, Offset, GLC, SLC, TFE, DLC); + return SelectMUBUFOffset(Addr, SRsrc, Soffset, Offset, GLC, SLC, TFE, DLC, SWZ); } bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &Soffset, SDValue &Offset, SDValue &SLC) const { - SDValue GLC, TFE, DLC; + SDValue GLC, TFE, DLC, SWZ; + + return SelectMUBUFOffset(Addr, SRsrc, Soffset, Offset, GLC, SLC, TFE, DLC, SWZ); +} - return SelectMUBUFOffset(Addr, SRsrc, Soffset, Offset, GLC, SLC, TFE, DLC); +// Find a load or store from corresponding pattern root. +// Roots may be build_vector, bitconvert or their combinations. 
+static MemSDNode* findMemSDNode(SDNode *N) { + N = AMDGPUTargetLowering::stripBitcast(SDValue(N,0)).getNode(); + if (MemSDNode *MN = dyn_cast(N)) + return MN; + assert(isa(N)); + for (SDValue V : N->op_values()) + if (MemSDNode *MN = + dyn_cast(AMDGPUTargetLowering::stripBitcast(V))) + return MN; + llvm_unreachable("cannot find MemSDNode in the pattern!"); } template @@ -1603,8 +1634,95 @@ bool AMDGPUDAGToDAGISel::SelectFlatOffset(SDNode *N, SDValue &VAddr, SDValue &Offset, SDValue &SLC) const { - return static_cast(getTargetLowering())-> - SelectFlatOffset(IsSigned, *CurDAG, N, Addr, VAddr, Offset, SLC); + int64_t OffsetVal = 0; + + if (Subtarget->hasFlatInstOffsets() && + (!Subtarget->hasFlatSegmentOffsetBug() || + findMemSDNode(N)->getAddressSpace() != AMDGPUAS::FLAT_ADDRESS) && + CurDAG->isBaseWithConstantOffset(Addr)) { + SDValue N0 = Addr.getOperand(0); + SDValue N1 = Addr.getOperand(1); + uint64_t COffsetVal = cast(N1)->getSExtValue(); + + const SIInstrInfo *TII = Subtarget->getInstrInfo(); + unsigned AS = findMemSDNode(N)->getAddressSpace(); + if (TII->isLegalFLATOffset(COffsetVal, AS, IsSigned)) { + Addr = N0; + OffsetVal = COffsetVal; + } else { + // If the offset doesn't fit, put the low bits into the offset field and + // add the rest. + + SDLoc DL(N); + uint64_t ImmField; + const unsigned NumBits = TII->getNumFlatOffsetBits(AS, IsSigned); + if (IsSigned) { + ImmField = SignExtend64(COffsetVal, NumBits); + + // Don't use a negative offset field if the base offset is positive. + // Since the scheduler currently relies on the offset field, doing so + // could result in strange scheduling decisions. + + // TODO: Should we not do this in the opposite direction as well? + if (static_cast(COffsetVal) > 0) { + if (static_cast(ImmField) < 0) { + const uint64_t OffsetMask = maskTrailingOnes(NumBits - 1); + ImmField = COffsetVal & OffsetMask; + } + } + } else { + // TODO: Should we do this for a negative offset? + const uint64_t OffsetMask = maskTrailingOnes(NumBits); + ImmField = COffsetVal & OffsetMask; + } + + uint64_t RemainderOffset = COffsetVal - ImmField; + + assert(TII->isLegalFLATOffset(ImmField, AS, IsSigned)); + assert(RemainderOffset + ImmField == COffsetVal); + + OffsetVal = ImmField; + + // TODO: Should this try to use a scalar add pseudo if the base address is + // uniform and saddr is usable? 
+ SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32); + SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32); + + SDNode *N0Lo = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, + DL, MVT::i32, N0, Sub0); + SDNode *N0Hi = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, + DL, MVT::i32, N0, Sub1); + + SDValue AddOffsetLo + = getMaterializedScalarImm32(Lo_32(RemainderOffset), DL); + SDValue AddOffsetHi + = getMaterializedScalarImm32(Hi_32(RemainderOffset), DL); + + SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i1); + SDValue Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1); + + SDNode *Add = CurDAG->getMachineNode( + AMDGPU::V_ADD_I32_e64, DL, VTs, + {AddOffsetLo, SDValue(N0Lo, 0), Clamp}); + + SDNode *Addc = CurDAG->getMachineNode( + AMDGPU::V_ADDC_U32_e64, DL, VTs, + {AddOffsetHi, SDValue(N0Hi, 0), SDValue(Add, 1), Clamp}); + + SDValue RegSequenceArgs[] = { + CurDAG->getTargetConstant(AMDGPU::VReg_64RegClassID, DL, MVT::i32), + SDValue(Add, 0), Sub0, SDValue(Addc, 0), Sub1 + }; + + Addr = SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL, + MVT::i64, RegSequenceArgs), 0); + } + } + + VAddr = Addr; + Offset = CurDAG->getTargetConstant(OffsetVal, SDLoc(), MVT::i16); + SLC = CurDAG->getTargetConstant(0, SDLoc(), MVT::i1); + return true; } bool AMDGPUDAGToDAGISel::SelectFlatAtomic(SDNode *N, @@ -1616,10 +1734,10 @@ bool AMDGPUDAGToDAGISel::SelectFlatAtomic(SDNode *N, } bool AMDGPUDAGToDAGISel::SelectFlatAtomicSigned(SDNode *N, - SDValue Addr, - SDValue &VAddr, - SDValue &Offset, - SDValue &SLC) const { + SDValue Addr, + SDValue &VAddr, + SDValue &Offset, + SDValue &SLC) const { return SelectFlatOffset(N, Addr, VAddr, Offset, SLC); } @@ -2158,10 +2276,12 @@ void AMDGPUDAGToDAGISel::SelectDS_GWS(SDNode *N, unsigned IntrID) { // offset field) % 64. Some versions of the programming guide omit the m0 // part, or claim it's from offset 0. if (ConstantSDNode *ConstOffset = dyn_cast(BaseOffset)) { - // If we have a constant offset, try to use the default value for m0 as a - // base to possibly avoid setting it up. - glueCopyToM0(N, CurDAG->getTargetConstant(-1, SL, MVT::i32)); - ImmOffset = ConstOffset->getZExtValue() + 1; + // If we have a constant offset, try to use the 0 in m0 as the base. + // TODO: Look into changing the default m0 initialization value. If the + // default -1 only set the low 16-bits, we could leave it as-is and add 1 to + // the immediate offset. + glueCopyToM0(N, CurDAG->getTargetConstant(0, SL, MVT::i32)); + ImmOffset = ConstOffset->getZExtValue(); } else { if (CurDAG->isBaseWithConstantOffset(BaseOffset)) { ImmOffset = BaseOffset.getConstantOperandVal(1); @@ -2182,22 +2302,7 @@ void AMDGPUDAGToDAGISel::SelectDS_GWS(SDNode *N, unsigned IntrID) { glueCopyToM0(N, SDValue(M0Base, 0)); } - SDValue V0; SDValue Chain = N->getOperand(0); - SDValue Glue; - if (HasVSrc) { - SDValue VSrc0 = N->getOperand(2); - - // The manual doesn't mention this, but it seems only v0 works. - V0 = CurDAG->getRegister(AMDGPU::VGPR0, MVT::i32); - - SDValue CopyToV0 = CurDAG->getCopyToReg( - N->getOperand(0), SL, V0, VSrc0, - N->getOperand(N->getNumOperands() - 1)); - Chain = CopyToV0; - Glue = CopyToV0.getValue(1); - } - SDValue OffsetField = CurDAG->getTargetConstant(ImmOffset, SL, MVT::i32); // TODO: Can this just be removed from the instruction? 
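// --- Illustrative sketch (not part of the imported sources) ---
// The SelectFlatOffset change above splits a constant address offset that
// does not fit the FLAT immediate field into an in-range immediate plus a
// remainder that is materialized and added to the base address
// (V_ADD_I32/V_ADDC_U32 + REG_SEQUENCE). The standalone arithmetic below
// (hypothetical helper names, signed case only) mirrors that split, including
// the rule of avoiding a negative immediate when the whole offset is positive.
#include <cassert>
#include <cstdint>
#include <cstdio>

// Sign-extend the low NumBits of V (same idea as llvm::SignExtend64).
static int64_t signExtendLow(uint64_t V, unsigned NumBits) {
  assert(NumBits > 0 && NumBits < 64);
  return int64_t(V << (64 - NumBits)) >> (64 - NumBits);
}

// Split Offset so that Imm fits a signed NumBits field and Imm + Rem == Offset.
static void splitFlatOffset(int64_t Offset, unsigned NumBits, int64_t &Imm,
                            int64_t &Rem) {
  Imm = signExtendLow(uint64_t(Offset), NumBits);
  // Don't use a negative immediate field when the full offset is positive.
  if (Offset > 0 && Imm < 0)
    Imm = Offset & int64_t((1ull << (NumBits - 1)) - 1);
  Rem = Offset - Imm;
  assert(Imm + Rem == Offset && "split must reassemble to the original offset");
}

int main() {
  int64_t Imm, Rem;
  splitFlatOffset(/*Offset=*/0x12345, /*NumBits=*/13, Imm, Rem);
  std::printf("imm=%lld remainder=%lld\n", (long long)Imm, (long long)Rem);
  return 0;
}
// --- End of illustrative sketch ---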
@@ -2206,14 +2311,11 @@ void AMDGPUDAGToDAGISel::SelectDS_GWS(SDNode *N, unsigned IntrID) { const unsigned Opc = gwsIntrinToOpcode(IntrID); SmallVector Ops; if (HasVSrc) - Ops.push_back(V0); + Ops.push_back(N->getOperand(2)); Ops.push_back(OffsetField); Ops.push_back(GDS); Ops.push_back(Chain); - if (HasVSrc) - Ops.push_back(Glue); - SDNode *Selected = CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops); CurDAG->setNodeMemRefs(cast(Selected), {MMO}); } @@ -2233,6 +2335,28 @@ void AMDGPUDAGToDAGISel::SelectINTRINSIC_W_CHAIN(SDNode *N) { SelectCode(N); } +void AMDGPUDAGToDAGISel::SelectINTRINSIC_WO_CHAIN(SDNode *N) { + unsigned IntrID = cast(N->getOperand(0))->getZExtValue(); + unsigned Opcode; + switch (IntrID) { + case Intrinsic::amdgcn_wqm: + Opcode = AMDGPU::WQM; + break; + case Intrinsic::amdgcn_softwqm: + Opcode = AMDGPU::SOFT_WQM; + break; + case Intrinsic::amdgcn_wwm: + Opcode = AMDGPU::WWM; + break; + default: + SelectCode(N); + return; + } + + SDValue Src = N->getOperand(1); + CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), {Src}); +} + void AMDGPUDAGToDAGISel::SelectINTRINSIC_VOID(SDNode *N) { unsigned IntrID = cast(N->getOperand(1))->getZExtValue(); switch (IntrID) { diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 39016ed37193..1115d8c23620 100644 --- a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -12,10 +12,6 @@ // //===----------------------------------------------------------------------===// -#define AMDGPU_LOG2E_F 1.44269504088896340735992468100189214f -#define AMDGPU_LN2_F 0.693147180559945309417232121458176568f -#define AMDGPU_LN10_F 2.30258509299404568401799145468436421f - #include "AMDGPUISelLowering.h" #include "AMDGPU.h" #include "AMDGPUCallLowering.h" @@ -37,82 +33,9 @@ #include "llvm/IR/DataLayout.h" #include "llvm/IR/DiagnosticInfo.h" #include "llvm/Support/KnownBits.h" +#include "llvm/Support/MathExtras.h" using namespace llvm; -static bool allocateCCRegs(unsigned ValNo, MVT ValVT, MVT LocVT, - CCValAssign::LocInfo LocInfo, - ISD::ArgFlagsTy ArgFlags, CCState &State, - const TargetRegisterClass *RC, - unsigned NumRegs) { - ArrayRef RegList = makeArrayRef(RC->begin(), NumRegs); - unsigned RegResult = State.AllocateReg(RegList); - if (RegResult == AMDGPU::NoRegister) - return false; - - State.addLoc(CCValAssign::getReg(ValNo, ValVT, RegResult, LocVT, LocInfo)); - return true; -} - -static bool allocateSGPRTuple(unsigned ValNo, MVT ValVT, MVT LocVT, - CCValAssign::LocInfo LocInfo, - ISD::ArgFlagsTy ArgFlags, CCState &State) { - switch (LocVT.SimpleTy) { - case MVT::i64: - case MVT::f64: - case MVT::v2i32: - case MVT::v2f32: - case MVT::v4i16: - case MVT::v4f16: { - // Up to SGPR0-SGPR105 - return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State, - &AMDGPU::SGPR_64RegClass, 53); - } - default: - return false; - } -} - -// Allocate up to VGPR31. -// -// TODO: Since there are no VGPR alignent requirements would it be better to -// split into individual scalar registers? 
-static bool allocateVGPRTuple(unsigned ValNo, MVT ValVT, MVT LocVT, - CCValAssign::LocInfo LocInfo, - ISD::ArgFlagsTy ArgFlags, CCState &State) { - switch (LocVT.SimpleTy) { - case MVT::i64: - case MVT::f64: - case MVT::v2i32: - case MVT::v2f32: - case MVT::v4i16: - case MVT::v4f16: { - return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State, - &AMDGPU::VReg_64RegClass, 31); - } - case MVT::v4i32: - case MVT::v4f32: - case MVT::v2i64: - case MVT::v2f64: { - return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State, - &AMDGPU::VReg_128RegClass, 29); - } - case MVT::v8i32: - case MVT::v8f32: { - return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State, - &AMDGPU::VReg_256RegClass, 25); - - } - case MVT::v16i32: - case MVT::v16f32: { - return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State, - &AMDGPU::VReg_512RegClass, 17); - - } - default: - return false; - } -} - #include "AMDGPUGenCallingConv.inc" // Find a larger type to do a load / store of a vector with. @@ -208,7 +131,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, setLoadExtAction(ISD::EXTLOAD, VT, MVT::i32, Expand); } - for (MVT VT : MVT::integer_vector_valuetypes()) { + for (MVT VT : MVT::integer_fixedlen_vector_valuetypes()) { setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Expand); setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Expand); setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v2i8, Expand); @@ -218,6 +141,9 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Expand); setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Expand); setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v2i16, Expand); + setLoadExtAction(ISD::EXTLOAD, VT, MVT::v3i16, Expand); + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v3i16, Expand); + setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v3i16, Expand); setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Expand); setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i16, Expand); setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v4i16, Expand); @@ -225,8 +151,11 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand); setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand); + setLoadExtAction(ISD::EXTLOAD, MVT::v3f32, MVT::v3f16, Expand); setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand); setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Expand); + setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16f16, Expand); + setLoadExtAction(ISD::EXTLOAD, MVT::v32f32, MVT::v32f16, Expand); setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand); setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand); @@ -286,8 +215,11 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, setTruncStoreAction(MVT::f32, MVT::f16, Expand); setTruncStoreAction(MVT::v2f32, MVT::v2f16, Expand); + setTruncStoreAction(MVT::v3f32, MVT::v3f16, Expand); setTruncStoreAction(MVT::v4f32, MVT::v4f16, Expand); setTruncStoreAction(MVT::v8f32, MVT::v8f16, Expand); + setTruncStoreAction(MVT::v16f32, MVT::v16f16, Expand); + setTruncStoreAction(MVT::v32f32, MVT::v32f16, Expand); setTruncStoreAction(MVT::f64, MVT::f16, Expand); setTruncStoreAction(MVT::f64, MVT::f32, Expand); @@ -571,6 +503,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, setTargetDAGCombine(ISD::FABS); setTargetDAGCombine(ISD::AssertZext); setTargetDAGCombine(ISD::AssertSext); + setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN); } 
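// --- Illustrative sketch (not part of the imported sources) ---
// Registering ISD::INTRINSIC_WO_CHAIN as a combine target above lets the
// performIntrinsicWOChainCombine hook added later in this patch route
// amdgcn.mul.i24 / amdgcn.mul.u24 through simplifyI24, which demands only the
// low 24 bits of each operand. The standalone code below (hypothetical
// helpers, not LLVM code) shows why bits above bit 23 of the sources are
// irrelevant: the inputs are sign- or zero-extended from 24 bits before the
// multiply, and the instruction keeps 32 bits of the resulting product.
#include <cstdint>
#include <cstdio>

// Sign-extend the low 24 bits of V (written without shifts of negatives).
static int64_t sext24(int64_t V) {
  return (V & 0x7FFFFF) - (V & 0x800000);
}

// Full product of the sign-extended 24-bit inputs; mul_i24 keeps the low
// 32 bits of this product, mul_hi_i24 the high bits.
static int64_t mulI24(int64_t A, int64_t B) { return sext24(A) * sext24(B); }

// Unsigned variant: zero-extend the low 24 bits of each operand.
static uint64_t mulU24(uint64_t A, uint64_t B) {
  return (A & 0xFFFFFF) * (B & 0xFFFFFF);
}

int main() {
  // Garbage in the upper 8 bits of either operand does not change the result,
  // which is what allows the combine to narrow the operands to 24 bits.
  std::printf("%lld == %lld\n", (long long)mulI24(0x7F001234, 0x1234),
              (long long)mulI24(0x1234, 0x1234));
  std::printf("%llu == %llu\n", (unsigned long long)mulU24(0xFF000042, 7),
              (unsigned long long)mulU24(0x42, 7));
  return 0;
}
// --- End of illustrative sketch ---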
//===----------------------------------------------------------------------===// @@ -630,15 +563,26 @@ static bool hasSourceMods(const SDNode *N) { case ISD::FREM: case ISD::INLINEASM: case ISD::INLINEASM_BR: - case AMDGPUISD::INTERP_P1: - case AMDGPUISD::INTERP_P2: case AMDGPUISD::DIV_SCALE: + case ISD::INTRINSIC_W_CHAIN: // TODO: Should really be looking at the users of the bitcast. These are // problematic because bitcasts are used to legalize all stores to integer // types. case ISD::BITCAST: return false; + case ISD::INTRINSIC_WO_CHAIN: { + switch (cast(N->getOperand(0))->getZExtValue()) { + case Intrinsic::amdgcn_interp_p1: + case Intrinsic::amdgcn_interp_p2: + case Intrinsic::amdgcn_interp_mov: + case Intrinsic::amdgcn_interp_p1_f16: + case Intrinsic::amdgcn_interp_p2_f16: + return false; + default: + return true; + } + } default: return true; } @@ -745,8 +689,9 @@ bool AMDGPUTargetLowering::isLoadBitCastBeneficial(EVT LoadTy, EVT CastTy, return false; bool Fast = false; - return allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), CastTy, - MMO, &Fast) && Fast; + return allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(), + CastTy, MMO, &Fast) && + Fast; } // SI+ has instructions for cttz / ctlz for 32-bit values. This is probably also @@ -782,9 +727,8 @@ bool AMDGPUTargetLowering::isSDNodeAlwaysUniform(const SDNode * N) const { break; case ISD::LOAD: { - const LoadSDNode * L = dyn_cast(N); - if (L->getMemOperand()->getAddrSpace() - == AMDGPUAS::CONSTANT_ADDRESS_32BIT) + if (cast(N)->getMemOperand()->getAddrSpace() == + AMDGPUAS::CONSTANT_ADDRESS_32BIT) return true; return false; } @@ -1199,9 +1143,9 @@ SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op, case ISD::FROUND: return LowerFROUND(Op, DAG); case ISD::FFLOOR: return LowerFFLOOR(Op, DAG); case ISD::FLOG: - return LowerFLOG(Op, DAG, 1 / AMDGPU_LOG2E_F); + return LowerFLOG(Op, DAG, 1.0F / numbers::log2ef); case ISD::FLOG10: - return LowerFLOG(Op, DAG, AMDGPU_LN2_F / AMDGPU_LN10_F); + return LowerFLOG(Op, DAG, numbers::ln2f / numbers::ln10f); case ISD::FEXP: return lowerFEXP(Op, DAG); case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG); @@ -1236,7 +1180,7 @@ void AMDGPUTargetLowering::ReplaceNodeResults(SDNode *N, } } -static bool hasDefinedInitializer(const GlobalValue *GV) { +bool AMDGPUTargetLowering::hasDefinedInitializer(const GlobalValue *GV) { const GlobalVariable *GVar = dyn_cast(GV); if (!GVar || !GVar->hasInitializer()) return false; @@ -2349,30 +2293,13 @@ SDValue AMDGPUTargetLowering::LowerFLOG(SDValue Op, SelectionDAG &DAG, return DAG.getNode(ISD::FMUL, SL, VT, Log2Operand, Log2BaseInvertedOperand); } -// Return M_LOG2E of appropriate type -static SDValue getLog2EVal(SelectionDAG &DAG, const SDLoc &SL, EVT VT) { - switch (VT.getScalarType().getSimpleVT().SimpleTy) { - case MVT::f32: - return DAG.getConstantFP(1.44269504088896340735992468100189214f, SL, VT); - case MVT::f16: - return DAG.getConstantFP( - APFloat(APFloat::IEEEhalf(), "1.44269504088896340735992468100189214"), - SL, VT); - case MVT::f64: - return DAG.getConstantFP( - APFloat(APFloat::IEEEdouble(), "0x1.71547652b82fep+0"), SL, VT); - default: - llvm_unreachable("unsupported fp type"); - } -} - // exp2(M_LOG2E_F * f); SDValue AMDGPUTargetLowering::lowerFEXP(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); SDLoc SL(Op); SDValue Src = Op.getOperand(0); - const SDValue K = getLog2EVal(DAG, SL, VT); + const SDValue K = DAG.getConstantFP(numbers::log2e, SL, VT); SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, 
Src, K, Op->getFlags()); return DAG.getNode(ISD::FEXP2, SL, VT, Mul, Op->getFlags()); } @@ -2836,8 +2763,16 @@ static bool isI24(SDValue Op, SelectionDAG &DAG) { static SDValue simplifyI24(SDNode *Node24, TargetLowering::DAGCombinerInfo &DCI) { SelectionDAG &DAG = DCI.DAG; - SDValue LHS = Node24->getOperand(0); - SDValue RHS = Node24->getOperand(1); + bool IsIntrin = Node24->getOpcode() == ISD::INTRINSIC_WO_CHAIN; + + SDValue LHS = IsIntrin ? Node24->getOperand(1) : Node24->getOperand(0); + SDValue RHS = IsIntrin ? Node24->getOperand(2) : Node24->getOperand(1); + unsigned NewOpcode = Node24->getOpcode(); + if (IsIntrin) { + unsigned IID = cast<ConstantSDNode>(Node24->getOperand(0))->getZExtValue(); + NewOpcode = IID == Intrinsic::amdgcn_mul_i24 ? + AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24; + } APInt Demanded = APInt::getLowBitsSet(LHS.getValueSizeInBits(), 24); @@ -2847,7 +2782,7 @@ static SDValue simplifyI24(SDNode *Node24, SDValue DemandedLHS = DAG.GetDemandedBits(LHS, Demanded); SDValue DemandedRHS = DAG.GetDemandedBits(RHS, Demanded); if (DemandedLHS || DemandedRHS) - return DAG.getNode(Node24->getOpcode(), SDLoc(Node24), Node24->getVTList(), + return DAG.getNode(NewOpcode, SDLoc(Node24), Node24->getVTList(), DemandedLHS ? DemandedLHS : LHS, DemandedRHS ? DemandedRHS : RHS); @@ -2904,54 +2839,6 @@ bool AMDGPUTargetLowering::shouldCombineMemoryType(EVT VT) const { return true; } -// Find a load or store from corresponding pattern root. -// Roots may be build_vector, bitconvert or their combinations. -static MemSDNode* findMemSDNode(SDNode *N) { - N = AMDGPUTargetLowering::stripBitcast(SDValue(N,0)).getNode(); - if (MemSDNode *MN = dyn_cast<MemSDNode>(N)) - return MN; - assert(isa<BuildVectorSDNode>(N)); - for (SDValue V : N->op_values()) - if (MemSDNode *MN = - dyn_cast<MemSDNode>(AMDGPUTargetLowering::stripBitcast(V))) - return MN; - llvm_unreachable("cannot find MemSDNode in the pattern!"); } - -bool AMDGPUTargetLowering::SelectFlatOffset(bool IsSigned, - SelectionDAG &DAG, - SDNode *N, - SDValue Addr, - SDValue &VAddr, - SDValue &Offset, - SDValue &SLC) const { - const GCNSubtarget &ST = - DAG.getMachineFunction().getSubtarget<GCNSubtarget>(); - int64_t OffsetVal = 0; - - if (ST.hasFlatInstOffsets() && - (!ST.hasFlatSegmentOffsetBug() || - findMemSDNode(N)->getAddressSpace() != AMDGPUAS::FLAT_ADDRESS) && - DAG.isBaseWithConstantOffset(Addr)) { - SDValue N0 = Addr.getOperand(0); - SDValue N1 = Addr.getOperand(1); - int64_t COffsetVal = cast<ConstantSDNode>(N1)->getSExtValue(); - - const SIInstrInfo *TII = ST.getInstrInfo(); - if (TII->isLegalFLATOffset(COffsetVal, findMemSDNode(N)->getAddressSpace(), - IsSigned)) { - Addr = N0; - OffsetVal = COffsetVal; - } - } - - VAddr = Addr; - Offset = DAG.getTargetConstant(OffsetVal, SDLoc(), MVT::i16); - SLC = DAG.getTargetConstant(0, SDLoc(), MVT::i1); - - return true; -} - // Replace load of an illegal type with a store of a bitcast to a friendlier // type. SDValue AMDGPUTargetLowering::performLoadCombine(SDNode *N, @@ -3085,6 +2972,19 @@ SDValue AMDGPUTargetLowering::performAssertSZExtCombine(SDNode *N, return SDValue(); } + +SDValue AMDGPUTargetLowering::performIntrinsicWOChainCombine( + SDNode *N, DAGCombinerInfo &DCI) const { + unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue(); + switch (IID) { + case Intrinsic::amdgcn_mul_i24: + case Intrinsic::amdgcn_mul_u24: + return simplifyI24(N, DCI); + default: + return SDValue(); + } +} + /// Split the 64-bit value \p LHS into two 32-bit components, and perform the /// binary operation \p Opc to it with the corresponding constant operands.
SDValue AMDGPUTargetLowering::splitBinaryBitConstantOpImpl( @@ -4173,6 +4073,8 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N, case ISD::AssertZext: case ISD::AssertSext: return performAssertSZExtCombine(N, DCI); + case ISD::INTRINSIC_WO_CHAIN: + return performIntrinsicWOChainCombine(N, DCI); } return SDValue(); } @@ -4203,14 +4105,28 @@ SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG, return DAG.getCopyFromReg(DAG.getEntryNode(), SL, VReg, VT); } +// This may be called multiple times, and nothing prevents creating multiple +// objects at the same offset. See if we already defined this object. +static int getOrCreateFixedStackObject(MachineFrameInfo &MFI, unsigned Size, + int64_t Offset) { + for (int I = MFI.getObjectIndexBegin(); I < 0; ++I) { + if (MFI.getObjectOffset(I) == Offset) { + assert(MFI.getObjectSize(I) == Size); + return I; + } + } + + return MFI.CreateFixedObject(Size, Offset, true); +} + SDValue AMDGPUTargetLowering::loadStackInputValue(SelectionDAG &DAG, EVT VT, const SDLoc &SL, int64_t Offset) const { MachineFunction &MF = DAG.getMachineFunction(); MachineFrameInfo &MFI = MF.getFrameInfo(); + int FI = getOrCreateFixedStackObject(MFI, VT.getStoreSize(), Offset); - int FI = MFI.CreateFixedObject(VT.getStoreSize(), Offset, true); auto SrcPtrInfo = MachinePointerInfo::getStack(MF, Offset); SDValue Ptr = DAG.getFrameIndex(FI, MVT::i32); @@ -4260,7 +4176,7 @@ uint32_t AMDGPUTargetLowering::getImplicitParameterOffset( const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(getTargetMachine(), MF.getFunction()); unsigned ExplicitArgOffset = ST.getExplicitKernelArgOffset(MF.getFunction()); - unsigned Alignment = ST.getAlignmentForImplicitArgPtr(); + const Align Alignment = ST.getAlignmentForImplicitArgPtr(); uint64_t ArgOffset = alignTo(MFI->getExplicitKernArgSize(), Alignment) + ExplicitArgOffset; switch (Param) { @@ -4295,6 +4211,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(FRACT) NODE_NAME_CASE(SETCC) NODE_NAME_CASE(SETREG) + NODE_NAME_CASE(DENORM_MODE) NODE_NAME_CASE(FMA_W_CHAIN) NODE_NAME_CASE(FMUL_W_CHAIN) NODE_NAME_CASE(CLAMP) @@ -4377,13 +4294,6 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(KILL) NODE_NAME_CASE(DUMMY_CHAIN) case AMDGPUISD::FIRST_MEM_OPCODE_NUMBER: break; - NODE_NAME_CASE(INIT_EXEC) - NODE_NAME_CASE(INIT_EXEC_FROM_INPUT) - NODE_NAME_CASE(SENDMSG) - NODE_NAME_CASE(SENDMSGHALT) - NODE_NAME_CASE(INTERP_MOV) - NODE_NAME_CASE(INTERP_P1) - NODE_NAME_CASE(INTERP_P2) NODE_NAME_CASE(INTERP_P1LL_F16) NODE_NAME_CASE(INTERP_P1LV_F16) NODE_NAME_CASE(INTERP_P2_F16) @@ -4428,6 +4338,8 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(BUFFER_ATOMIC_AND) NODE_NAME_CASE(BUFFER_ATOMIC_OR) NODE_NAME_CASE(BUFFER_ATOMIC_XOR) + NODE_NAME_CASE(BUFFER_ATOMIC_INC) + NODE_NAME_CASE(BUFFER_ATOMIC_DEC) NODE_NAME_CASE(BUFFER_ATOMIC_CMPSWAP) NODE_NAME_CASE(BUFFER_ATOMIC_FADD) NODE_NAME_CASE(BUFFER_ATOMIC_PK_FADD) @@ -4576,9 +4488,9 @@ void AMDGPUTargetLowering::computeKnownBitsForTargetNode( Known.One |= ((LHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I; Known.Zero |= ((LHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I; } else if (SelBits == 0x0c) { - Known.Zero |= 0xff << I; + Known.Zero |= 0xFFull << I; } else if (SelBits > 0x0c) { - Known.One |= 0xff << I; + Known.One |= 0xFFull << I; } Sel >>= 8; } diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.h b/lib/Target/AMDGPU/AMDGPUISelLowering.h index 
fe7ad694943d..dea0d1d4343a 100644 --- a/lib/Target/AMDGPU/AMDGPUISelLowering.h +++ b/lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -38,6 +38,7 @@ private: public: static unsigned numBitsUnsigned(SDValue Op, SelectionDAG &DAG); static unsigned numBitsSigned(SDValue Op, SelectionDAG &DAG); + static bool hasDefinedInitializer(const GlobalValue *GV); protected: SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const; @@ -78,6 +79,7 @@ protected: SDValue performLoadCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performStoreCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performAssertSZExtCombine(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue performIntrinsicWOChainCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue splitBinaryBitConstantOpImpl(DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS, @@ -324,10 +326,6 @@ public: } AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override; - - bool SelectFlatOffset(bool IsSigned, SelectionDAG &DAG, SDNode *N, - SDValue Addr, SDValue &VAddr, SDValue &Offset, - SDValue &SLC) const; }; namespace AMDGPUISD { @@ -369,6 +367,9 @@ enum NodeType : unsigned { // result bit per item in the wavefront. SETCC, SETREG, + + DENORM_MODE, + // FP ops with input and output chain. FMA_W_CHAIN, FMUL_W_CHAIN, @@ -475,13 +476,6 @@ enum NodeType : unsigned { BUILD_VERTICAL_VECTOR, /// Pointer to the start of the shader's constant data. CONST_DATA_PTR, - INIT_EXEC, - INIT_EXEC_FROM_INPUT, - SENDMSG, - SENDMSGHALT, - INTERP_MOV, - INTERP_P1, - INTERP_P2, INTERP_P1LL_F16, INTERP_P1LV_F16, INTERP_P2_F16, @@ -532,6 +526,8 @@ enum NodeType : unsigned { BUFFER_ATOMIC_AND, BUFFER_ATOMIC_OR, BUFFER_ATOMIC_XOR, + BUFFER_ATOMIC_INC, + BUFFER_ATOMIC_DEC, BUFFER_ATOMIC_CMPSWAP, BUFFER_ATOMIC_FADD, BUFFER_ATOMIC_PK_FADD, diff --git a/lib/Target/AMDGPU/AMDGPUInline.cpp b/lib/Target/AMDGPU/AMDGPUInline.cpp index f4df20b8f03e..a83ec23ec054 100644 --- a/lib/Target/AMDGPU/AMDGPUInline.cpp +++ b/lib/Target/AMDGPU/AMDGPUInline.cpp @@ -51,7 +51,7 @@ ArgAllocaCutoff("amdgpu-inline-arg-alloca-cutoff", cl::Hidden, cl::init(256), // Inliner constraint to achieve reasonable compilation time static cl::opt -MaxBB("amdgpu-inline-max-bb", cl::Hidden, cl::init(300), +MaxBB("amdgpu-inline-max-bb", cl::Hidden, cl::init(1100), cl::desc("Maximum BB number allowed in a function after inlining" " (compile time constraint)")); diff --git a/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/lib/Target/AMDGPU/AMDGPUInstrInfo.td index 4a8446955496..cf0ce5659951 100644 --- a/lib/Target/AMDGPU/AMDGPUInstrInfo.td +++ b/lib/Target/AMDGPU/AMDGPUInstrInfo.td @@ -110,39 +110,38 @@ def AMDGPUdwordaddr : SDNode<"AMDGPUISD::DWORDADDR", SDTIntUnaryOp>; // Force dependencies for vector trunc stores def R600dummy_chain : SDNode<"AMDGPUISD::DUMMY_CHAIN", SDTNone, [SDNPHasChain]>; -def AMDGPUcos : SDNode<"AMDGPUISD::COS_HW", SDTFPUnaryOp>; -def AMDGPUsin : SDNode<"AMDGPUISD::SIN_HW", SDTFPUnaryOp>; - +def AMDGPUcos_impl : SDNode<"AMDGPUISD::COS_HW", SDTFPUnaryOp>; +def AMDGPUsin_impl : SDNode<"AMDGPUISD::SIN_HW", SDTFPUnaryOp>; // out = a - floor(a) -def AMDGPUfract : SDNode<"AMDGPUISD::FRACT", SDTFPUnaryOp>; +def AMDGPUfract_impl : SDNode<"AMDGPUISD::FRACT", SDTFPUnaryOp>; // out = 1.0 / a -def AMDGPUrcp : SDNode<"AMDGPUISD::RCP", SDTFPUnaryOp>; +def AMDGPUrcp_impl : SDNode<"AMDGPUISD::RCP", SDTFPUnaryOp>; // out = 1.0 / sqrt(a) -def AMDGPUrsq : SDNode<"AMDGPUISD::RSQ", SDTFPUnaryOp>; +def AMDGPUrsq_impl : SDNode<"AMDGPUISD::RSQ", SDTFPUnaryOp>; // out = 1.0 / sqrt(a) -def 
AMDGPUrcp_legacy : SDNode<"AMDGPUISD::RCP_LEGACY", SDTFPUnaryOp>; -def AMDGPUrsq_legacy : SDNode<"AMDGPUISD::RSQ_LEGACY", SDTFPUnaryOp>; +def AMDGPUrsq_legacy_impl : SDNode<"AMDGPUISD::RSQ_LEGACY", SDTFPUnaryOp>; +def AMDGPUrcp_legacy_impl : SDNode<"AMDGPUISD::RCP_LEGACY", SDTFPUnaryOp>; def AMDGPUrcp_iflag : SDNode<"AMDGPUISD::RCP_IFLAG", SDTFPUnaryOp>; // out = 1.0 / sqrt(a) result clamped to +/- max_float. -def AMDGPUrsq_clamp : SDNode<"AMDGPUISD::RSQ_CLAMP", SDTFPUnaryOp>; +def AMDGPUrsq_clamp_impl : SDNode<"AMDGPUISD::RSQ_CLAMP", SDTFPUnaryOp>; -def AMDGPUldexp : SDNode<"AMDGPUISD::LDEXP", AMDGPULdExpOp>; +def AMDGPUldexp_impl : SDNode<"AMDGPUISD::LDEXP", AMDGPULdExpOp>; -def AMDGPUpkrtz_f16_f32 : SDNode<"AMDGPUISD::CVT_PKRTZ_F16_F32", AMDGPUFPPackOp>; -def AMDGPUpknorm_i16_f32 : SDNode<"AMDGPUISD::CVT_PKNORM_I16_F32", AMDGPUFPPackOp>; -def AMDGPUpknorm_u16_f32 : SDNode<"AMDGPUISD::CVT_PKNORM_U16_F32", AMDGPUFPPackOp>; -def AMDGPUpk_i16_i32 : SDNode<"AMDGPUISD::CVT_PK_I16_I32", AMDGPUIntPackOp>; -def AMDGPUpk_u16_u32 : SDNode<"AMDGPUISD::CVT_PK_U16_U32", AMDGPUIntPackOp>; +def AMDGPUpkrtz_f16_f32_impl : SDNode<"AMDGPUISD::CVT_PKRTZ_F16_F32", AMDGPUFPPackOp>; +def AMDGPUpknorm_i16_f32_impl : SDNode<"AMDGPUISD::CVT_PKNORM_I16_F32", AMDGPUFPPackOp>; +def AMDGPUpknorm_u16_f32_impl : SDNode<"AMDGPUISD::CVT_PKNORM_U16_F32", AMDGPUFPPackOp>; +def AMDGPUpk_i16_i32_impl : SDNode<"AMDGPUISD::CVT_PK_I16_I32", AMDGPUIntPackOp>; +def AMDGPUpk_u16_u32_impl : SDNode<"AMDGPUISD::CVT_PK_U16_U32", AMDGPUIntPackOp>; def AMDGPUfp_to_f16 : SDNode<"AMDGPUISD::FP_TO_FP16" , SDTFPToIntOp>; def AMDGPUfp16_zext : SDNode<"AMDGPUISD::FP16_ZEXT" , SDTFPToIntOp>; -def AMDGPUfp_class : SDNode<"AMDGPUISD::FP_CLASS", AMDGPUFPClassOp>; +def AMDGPUfp_class_impl : SDNode<"AMDGPUISD::FP_CLASS", AMDGPUFPClassOp>; // out = max(a, b) a and b are floats, where a nan comparison fails. 
// This is not commutative because this gives the second operand: @@ -285,7 +284,7 @@ def AMDGPUbfi : SDNode<"AMDGPUISD::BFI", AMDGPUDTIntTernaryOp>; def AMDGPUbfm : SDNode<"AMDGPUISD::BFM", SDTIntBinOp>; def AMDGPUffbh_u32 : SDNode<"AMDGPUISD::FFBH_U32", SDTIntUnaryOp>; -def AMDGPUffbh_i32 : SDNode<"AMDGPUISD::FFBH_I32", SDTIntUnaryOp>; +def AMDGPUffbh_i32_impl : SDNode<"AMDGPUISD::FFBH_I32", SDTIntUnaryOp>; def AMDGPUffbl_b32 : SDNode<"AMDGPUISD::FFBL_B32", SDTIntUnaryOp>; @@ -320,7 +319,7 @@ def AMDGPUumed3 : SDNode<"AMDGPUISD::UMED3", AMDGPUDTIntTernaryOp, [] >; -def AMDGPUfmed3 : SDNode<"AMDGPUISD::FMED3", SDTFPTernaryOp, []>; +def AMDGPUfmed3_impl : SDNode<"AMDGPUISD::FMED3", SDTFPTernaryOp, []>; def AMDGPUfdot2 : SDNode<"AMDGPUISD::FDOT2", SDTypeProfile<1, 4, [SDTCisSameAs<0, 3>, SDTCisSameAs<1, 2>, @@ -330,35 +329,6 @@ def AMDGPUfdot2 : SDNode<"AMDGPUISD::FDOT2", def AMDGPUperm : SDNode<"AMDGPUISD::PERM", AMDGPUDTIntTernaryOp, []>; -def AMDGPUinit_exec : SDNode<"AMDGPUISD::INIT_EXEC", - SDTypeProfile<0, 1, [SDTCisInt<0>]>, - [SDNPHasChain, SDNPInGlue]>; - -def AMDGPUinit_exec_from_input : SDNode<"AMDGPUISD::INIT_EXEC_FROM_INPUT", - SDTypeProfile<0, 2, - [SDTCisInt<0>, SDTCisInt<1>]>, - [SDNPHasChain, SDNPInGlue]>; - -def AMDGPUsendmsg : SDNode<"AMDGPUISD::SENDMSG", - SDTypeProfile<0, 1, [SDTCisInt<0>]>, - [SDNPHasChain, SDNPInGlue]>; - -def AMDGPUsendmsghalt : SDNode<"AMDGPUISD::SENDMSGHALT", - SDTypeProfile<0, 1, [SDTCisInt<0>]>, - [SDNPHasChain, SDNPInGlue]>; - -def AMDGPUinterp_mov : SDNode<"AMDGPUISD::INTERP_MOV", - SDTypeProfile<1, 3, [SDTCisFP<0>]>, - [SDNPInGlue]>; - -def AMDGPUinterp_p1 : SDNode<"AMDGPUISD::INTERP_P1", - SDTypeProfile<1, 3, [SDTCisFP<0>]>, - [SDNPInGlue, SDNPOutGlue]>; - -def AMDGPUinterp_p2 : SDNode<"AMDGPUISD::INTERP_P2", - SDTypeProfile<1, 4, [SDTCisFP<0>]>, - [SDNPInGlue]>; - def AMDGPUinterp_p1ll_f16 : SDNode<"AMDGPUISD::INTERP_P1LL_F16", SDTypeProfile<1, 7, [SDTCisFP<0>]>, [SDNPInGlue, SDNPOutGlue]>; @@ -425,3 +395,65 @@ def AMDGPUreturn_to_epilog : SDNode<"AMDGPUISD::RETURN_TO_EPILOG", SDTNone, def AMDGPUret_flag : SDNode<"AMDGPUISD::RET_FLAG", SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>, [SDNPHasChain, SDNPOptInGlue, SDNPVariadic] >; + + +//===----------------------------------------------------------------------===// +// Intrinsic/Custom node compatability PatFrags +//===----------------------------------------------------------------------===// + +def AMDGPUrcp : PatFrags<(ops node:$src), [(int_amdgcn_rcp node:$src), + (AMDGPUrcp_impl node:$src)]>; +def AMDGPUrcp_legacy : PatFrags<(ops node:$src), [(int_amdgcn_rcp_legacy node:$src), + (AMDGPUrcp_legacy_impl node:$src)]>; + +def AMDGPUrsq_legacy : PatFrags<(ops node:$src), [(int_amdgcn_rsq_legacy node:$src), + (AMDGPUrsq_legacy_impl node:$src)]>; + +def AMDGPUrsq : PatFrags<(ops node:$src), [(int_amdgcn_rsq node:$src), + (AMDGPUrsq_impl node:$src)]>; + +def AMDGPUrsq_clamp : PatFrags<(ops node:$src), [(int_amdgcn_rsq_clamp node:$src), + (AMDGPUrsq_clamp_impl node:$src)]>; + +def AMDGPUsin : PatFrags<(ops node:$src), [(int_amdgcn_sin node:$src), + (AMDGPUsin_impl node:$src)]>; +def AMDGPUcos : PatFrags<(ops node:$src), [(int_amdgcn_cos node:$src), + (AMDGPUcos_impl node:$src)]>; +def AMDGPUfract : PatFrags<(ops node:$src), [(int_amdgcn_fract node:$src), + (AMDGPUfract_impl node:$src)]>; + +def AMDGPUldexp : PatFrags<(ops node:$src0, node:$src1), + [(int_amdgcn_ldexp node:$src0, node:$src1), + (AMDGPUldexp_impl node:$src0, node:$src1)]>; + +def AMDGPUfp_class : PatFrags<(ops node:$src0, node:$src1), + 
[(int_amdgcn_class node:$src0, node:$src1), + (AMDGPUfp_class_impl node:$src0, node:$src1)]>; + +def AMDGPUfmed3 : PatFrags<(ops node:$src0, node:$src1, node:$src2), + [(int_amdgcn_fmed3 node:$src0, node:$src1, node:$src2), + (AMDGPUfmed3_impl node:$src0, node:$src1, node:$src2)]>; + +def AMDGPUffbh_i32 : PatFrags<(ops node:$src), + [(int_amdgcn_sffbh node:$src), + (AMDGPUffbh_i32_impl node:$src)]>; + +def AMDGPUpkrtz_f16_f32 : PatFrags<(ops node:$src0, node:$src1), + [(int_amdgcn_cvt_pkrtz node:$src0, node:$src1), + (AMDGPUpkrtz_f16_f32_impl node:$src0, node:$src1)]>; + +def AMDGPUpknorm_i16_f32 : PatFrags<(ops node:$src0, node:$src1), + [(int_amdgcn_cvt_pknorm_i16 node:$src0, node:$src1), + (AMDGPUpknorm_i16_f32_impl node:$src0, node:$src1)]>; + +def AMDGPUpknorm_u16_f32 : PatFrags<(ops node:$src0, node:$src1), + [(int_amdgcn_cvt_pknorm_u16 node:$src0, node:$src1), + (AMDGPUpknorm_u16_f32_impl node:$src0, node:$src1)]>; + +def AMDGPUpk_i16_i32 : PatFrags<(ops node:$src0, node:$src1), + [(int_amdgcn_cvt_pk_i16 node:$src0, node:$src1), + (AMDGPUpk_i16_i32_impl node:$src0, node:$src1)]>; + +def AMDGPUpk_u16_u32 : PatFrags<(ops node:$src0, node:$src1), + [(int_amdgcn_cvt_pk_u16 node:$src0, node:$src1), + (AMDGPUpk_u16_u32_impl node:$src0, node:$src1)]>; diff --git a/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index 901a2eaa8829..3cfa9d57ec46 100644 --- a/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -19,8 +19,10 @@ #include "AMDGPUTargetMachine.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "SIMachineFunctionInfo.h" +#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h" #include "llvm/CodeGen/GlobalISel/InstructionSelector.h" #include "llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h" +#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" #include "llvm/CodeGen/GlobalISel/Utils.h" #include "llvm/CodeGen/MachineBasicBlock.h" @@ -61,8 +63,14 @@ AMDGPUInstructionSelector::AMDGPUInstructionSelector( const char *AMDGPUInstructionSelector::getName() { return DEBUG_TYPE; } +void AMDGPUInstructionSelector::setupMF(MachineFunction &MF, GISelKnownBits &KB, + CodeGenCoverage &CoverageInfo) { + MRI = &MF.getRegInfo(); + InstructionSelector::setupMF(MF, KB, CoverageInfo); +} + static bool isSCC(Register Reg, const MachineRegisterInfo &MRI) { - if (TargetRegisterInfo::isPhysicalRegister(Reg)) + if (Register::isPhysicalRegister(Reg)) return Reg == AMDGPU::SCC; auto &RegClassOrBank = MRI.getRegClassOrRegBank(Reg); @@ -71,7 +79,9 @@ static bool isSCC(Register Reg, const MachineRegisterInfo &MRI) { if (RC) { // FIXME: This is ambiguous for wave32. This could be SCC or VCC, but the // context of the register bank has been lost. - if (RC->getID() != AMDGPU::SReg_32_XM0RegClassID) + // Has a hack getRegClassForSizeOnBank uses exactly SGPR_32RegClass, which + // won't ever beconstrained any further. 
+ if (RC != &AMDGPU::SGPR_32RegClass) return false; const LLT Ty = MRI.getType(Reg); return Ty.isValid() && Ty.getSizeInBits() == 1; @@ -83,7 +93,7 @@ static bool isSCC(Register Reg, const MachineRegisterInfo &MRI) { bool AMDGPUInstructionSelector::isVCC(Register Reg, const MachineRegisterInfo &MRI) const { - if (TargetRegisterInfo::isPhysicalRegister(Reg)) + if (Register::isPhysicalRegister(Reg)) return Reg == TRI.getVCC(); auto &RegClassOrBank = MRI.getRegClassOrRegBank(Reg); @@ -102,8 +112,6 @@ bool AMDGPUInstructionSelector::isVCC(Register Reg, bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const { const DebugLoc &DL = I.getDebugLoc(); MachineBasicBlock *BB = I.getParent(); - MachineFunction *MF = BB->getParent(); - MachineRegisterInfo &MRI = MF->getRegInfo(); I.setDesc(TII.get(TargetOpcode::COPY)); const MachineOperand &Src = I.getOperand(1); @@ -111,33 +119,33 @@ bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const { Register DstReg = Dst.getReg(); Register SrcReg = Src.getReg(); - if (isVCC(DstReg, MRI)) { + if (isVCC(DstReg, *MRI)) { if (SrcReg == AMDGPU::SCC) { const TargetRegisterClass *RC - = TRI.getConstrainedRegClassForOperand(Dst, MRI); + = TRI.getConstrainedRegClassForOperand(Dst, *MRI); if (!RC) return true; - return RBI.constrainGenericRegister(DstReg, *RC, MRI); + return RBI.constrainGenericRegister(DstReg, *RC, *MRI); } - if (!isVCC(SrcReg, MRI)) { + if (!isVCC(SrcReg, *MRI)) { // TODO: Should probably leave the copy and let copyPhysReg expand it. - if (!RBI.constrainGenericRegister(DstReg, *TRI.getBoolRC(), MRI)) + if (!RBI.constrainGenericRegister(DstReg, *TRI.getBoolRC(), *MRI)) return false; BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U32_e64), DstReg) .addImm(0) .addReg(SrcReg); - if (!MRI.getRegClassOrNull(SrcReg)) - MRI.setRegClass(SrcReg, TRI.getConstrainedRegClassForOperand(Src, MRI)); + if (!MRI->getRegClassOrNull(SrcReg)) + MRI->setRegClass(SrcReg, TRI.getConstrainedRegClassForOperand(Src, *MRI)); I.eraseFromParent(); return true; } const TargetRegisterClass *RC = - TRI.getConstrainedRegClassForOperand(Dst, MRI); - if (RC && !RBI.constrainGenericRegister(DstReg, *RC, MRI)) + TRI.getConstrainedRegClassForOperand(Dst, *MRI); + if (RC && !RBI.constrainGenericRegister(DstReg, *RC, *MRI)) return false; // Don't constrain the source register to a class so the def instruction @@ -148,8 +156,8 @@ bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const { // with size 1. An SReg_32 with size 1 is ambiguous with wave32. 
if (Src.isUndef()) { const TargetRegisterClass *SrcRC = - TRI.getConstrainedRegClassForOperand(Src, MRI); - if (SrcRC && !RBI.constrainGenericRegister(SrcReg, *SrcRC, MRI)) + TRI.getConstrainedRegClassForOperand(Src, *MRI); + if (SrcRC && !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI)) return false; } @@ -157,30 +165,26 @@ bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const { } for (const MachineOperand &MO : I.operands()) { - if (TargetRegisterInfo::isPhysicalRegister(MO.getReg())) + if (Register::isPhysicalRegister(MO.getReg())) continue; const TargetRegisterClass *RC = - TRI.getConstrainedRegClassForOperand(MO, MRI); + TRI.getConstrainedRegClassForOperand(MO, *MRI); if (!RC) continue; - RBI.constrainGenericRegister(MO.getReg(), *RC, MRI); + RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI); } return true; } bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const { - MachineBasicBlock *BB = I.getParent(); - MachineFunction *MF = BB->getParent(); - MachineRegisterInfo &MRI = MF->getRegInfo(); - const Register DefReg = I.getOperand(0).getReg(); - const LLT DefTy = MRI.getType(DefReg); + const LLT DefTy = MRI->getType(DefReg); // TODO: Verify this doesn't have insane operands (i.e. VGPR to SGPR copy) const RegClassOrRegBank &RegClassOrBank = - MRI.getRegClassOrRegBank(DefReg); + MRI->getRegClassOrRegBank(DefReg); const TargetRegisterClass *DefRC = RegClassOrBank.dyn_cast(); @@ -196,7 +200,7 @@ bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const { return false; } - DefRC = TRI.getRegClassForTypeOnBank(DefTy, RB, MRI); + DefRC = TRI.getRegClassForTypeOnBank(DefTy, RB, *MRI); if (!DefRC) { LLVM_DEBUG(dbgs() << "PHI operand has unexpected size/bank\n"); return false; @@ -204,7 +208,7 @@ bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const { } I.setDesc(TII.get(TargetOpcode::PHI)); - return RBI.constrainGenericRegister(DefReg, *DefRC, MRI); + return RBI.constrainGenericRegister(DefReg, *DefRC, *MRI); } MachineOperand @@ -214,13 +218,11 @@ AMDGPUInstructionSelector::getSubOperand64(MachineOperand &MO, MachineInstr *MI = MO.getParent(); MachineBasicBlock *BB = MO.getParent()->getParent(); - MachineFunction *MF = BB->getParent(); - MachineRegisterInfo &MRI = MF->getRegInfo(); - Register DstReg = MRI.createVirtualRegister(&SubRC); + Register DstReg = MRI->createVirtualRegister(&SubRC); if (MO.isReg()) { unsigned ComposedSubIdx = TRI.composeSubRegIndices(MO.getSubReg(), SubIdx); - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); BuildMI(*BB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), DstReg) .addReg(Reg, 0, ComposedSubIdx); @@ -244,10 +246,6 @@ AMDGPUInstructionSelector::getSubOperand64(MachineOperand &MO, } } -static int64_t getConstant(const MachineInstr *MI) { - return MI->getOperand(1).getCImm()->getSExtValue(); -} - static unsigned getLogicalBitOpcode(unsigned Opc, bool Is64) { switch (Opc) { case AMDGPU::G_AND: @@ -262,16 +260,13 @@ static unsigned getLogicalBitOpcode(unsigned Opc, bool Is64) { } bool AMDGPUInstructionSelector::selectG_AND_OR_XOR(MachineInstr &I) const { - MachineBasicBlock *BB = I.getParent(); - MachineFunction *MF = BB->getParent(); - MachineRegisterInfo &MRI = MF->getRegInfo(); MachineOperand &Dst = I.getOperand(0); MachineOperand &Src0 = I.getOperand(1); MachineOperand &Src1 = I.getOperand(2); Register DstReg = Dst.getReg(); - unsigned Size = RBI.getSizeInBits(DstReg, MRI, TRI); + unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI); - const RegisterBank *DstRB = RBI.getRegBank(DstReg, MRI, TRI); + const 
RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI); if (DstRB->getID() == AMDGPU::VCCRegBankID) { const TargetRegisterClass *RC = TRI.getBoolRC(); unsigned InstOpc = getLogicalBitOpcode(I.getOpcode(), @@ -282,12 +277,12 @@ bool AMDGPUInstructionSelector::selectG_AND_OR_XOR(MachineInstr &I) const { // The selector for G_ICMP relies on seeing the register bank for the result // is VCC. In wave32 if we constrain the registers to SReg_32 here, it will // be ambiguous whether it's a scalar or vector bool. - if (Src0.isUndef() && !MRI.getRegClassOrNull(Src0.getReg())) - MRI.setRegClass(Src0.getReg(), RC); - if (Src1.isUndef() && !MRI.getRegClassOrNull(Src1.getReg())) - MRI.setRegClass(Src1.getReg(), RC); + if (Src0.isUndef() && !MRI->getRegClassOrNull(Src0.getReg())) + MRI->setRegClass(Src0.getReg(), RC); + if (Src1.isUndef() && !MRI->getRegClassOrNull(Src1.getReg())) + MRI->setRegClass(Src1.getReg(), RC); - return RBI.constrainGenericRegister(DstReg, *RC, MRI); + return RBI.constrainGenericRegister(DstReg, *RC, *MRI); } // TODO: Should this allow an SCC bank result, and produce a copy from SCC for @@ -295,14 +290,7 @@ bool AMDGPUInstructionSelector::selectG_AND_OR_XOR(MachineInstr &I) const { if (DstRB->getID() == AMDGPU::SGPRRegBankID) { unsigned InstOpc = getLogicalBitOpcode(I.getOpcode(), Size > 32); I.setDesc(TII.get(InstOpc)); - - const TargetRegisterClass *RC - = TRI.getConstrainedRegClassForOperand(Dst, MRI); - if (!RC) - return false; - return RBI.constrainGenericRegister(DstReg, *RC, MRI) && - RBI.constrainGenericRegister(Src0.getReg(), *RC, MRI) && - RBI.constrainGenericRegister(Src1.getReg(), *RC, MRI); + return constrainSelectedInstRegOperands(I, TII, TRI, RBI); } return false; @@ -311,11 +299,10 @@ bool AMDGPUInstructionSelector::selectG_AND_OR_XOR(MachineInstr &I) const { bool AMDGPUInstructionSelector::selectG_ADD_SUB(MachineInstr &I) const { MachineBasicBlock *BB = I.getParent(); MachineFunction *MF = BB->getParent(); - MachineRegisterInfo &MRI = MF->getRegInfo(); Register DstReg = I.getOperand(0).getReg(); const DebugLoc &DL = I.getDebugLoc(); - unsigned Size = RBI.getSizeInBits(DstReg, MRI, TRI); - const RegisterBank *DstRB = RBI.getRegBank(DstReg, MRI, TRI); + unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI); + const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI); const bool IsSALU = DstRB->getID() == AMDGPU::SGPRRegBankID; const bool Sub = I.getOpcode() == TargetOpcode::G_SUB; @@ -340,7 +327,7 @@ bool AMDGPUInstructionSelector::selectG_ADD_SUB(MachineInstr &I) const { const unsigned Opc = Sub ? 
AMDGPU::V_SUB_I32_e64 : AMDGPU::V_ADD_I32_e64; - Register UnusedCarry = MRI.createVirtualRegister(TRI.getWaveMaskRegClass()); + Register UnusedCarry = MRI->createVirtualRegister(TRI.getWaveMaskRegClass()); MachineInstr *Add = BuildMI(*BB, &I, DL, TII.get(Opc), DstReg) .addDef(UnusedCarry, RegState::Dead) @@ -363,8 +350,8 @@ bool AMDGPUInstructionSelector::selectG_ADD_SUB(MachineInstr &I) const { MachineOperand Hi1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub1)); MachineOperand Hi2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub1)); - Register DstLo = MRI.createVirtualRegister(&HalfRC); - Register DstHi = MRI.createVirtualRegister(&HalfRC); + Register DstLo = MRI->createVirtualRegister(&HalfRC); + Register DstHi = MRI->createVirtualRegister(&HalfRC); if (IsSALU) { BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_U32), DstLo) @@ -375,14 +362,14 @@ bool AMDGPUInstructionSelector::selectG_ADD_SUB(MachineInstr &I) const { .add(Hi2); } else { const TargetRegisterClass *CarryRC = TRI.getWaveMaskRegClass(); - Register CarryReg = MRI.createVirtualRegister(CarryRC); + Register CarryReg = MRI->createVirtualRegister(CarryRC); BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADD_I32_e64), DstLo) .addDef(CarryReg) .add(Lo1) .add(Lo2) .addImm(0); MachineInstr *Addc = BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADDC_U32_e64), DstHi) - .addDef(MRI.createVirtualRegister(CarryRC), RegState::Dead) + .addDef(MRI->createVirtualRegister(CarryRC), RegState::Dead) .add(Hi1) .add(Hi2) .addReg(CarryReg, RegState::Kill) @@ -399,19 +386,61 @@ bool AMDGPUInstructionSelector::selectG_ADD_SUB(MachineInstr &I) const { .addImm(AMDGPU::sub1); - if (!RBI.constrainGenericRegister(DstReg, RC, MRI)) + if (!RBI.constrainGenericRegister(DstReg, RC, *MRI)) return false; I.eraseFromParent(); return true; } -bool AMDGPUInstructionSelector::selectG_EXTRACT(MachineInstr &I) const { +bool AMDGPUInstructionSelector::selectG_UADDO_USUBO(MachineInstr &I) const { MachineBasicBlock *BB = I.getParent(); MachineFunction *MF = BB->getParent(); MachineRegisterInfo &MRI = MF->getRegInfo(); - assert(I.getOperand(2).getImm() % 32 == 0); - unsigned SubReg = TRI.getSubRegFromChannel(I.getOperand(2).getImm() / 32); + const DebugLoc &DL = I.getDebugLoc(); + Register Dst0Reg = I.getOperand(0).getReg(); + Register Dst1Reg = I.getOperand(1).getReg(); + const bool IsAdd = I.getOpcode() == AMDGPU::G_UADDO; + + if (!isSCC(Dst1Reg, MRI)) { + // The name of the opcodes are misleading. v_add_i32/v_sub_i32 have unsigned + // carry out despite the _i32 name. These were renamed in VI to _U32. + // FIXME: We should probably rename the opcodes here. + unsigned NewOpc = IsAdd ? AMDGPU::V_ADD_I32_e64 : AMDGPU::V_SUB_I32_e64; + I.setDesc(TII.get(NewOpc)); + I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true)); + I.addOperand(*MF, MachineOperand::CreateImm(0)); + return constrainSelectedInstRegOperands(I, TII, TRI, RBI); + } + + Register Src0Reg = I.getOperand(2).getReg(); + Register Src1Reg = I.getOperand(3).getReg(); + unsigned NewOpc = IsAdd ? 
AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32; + BuildMI(*BB, &I, DL, TII.get(NewOpc), Dst0Reg) + .add(I.getOperand(2)) + .add(I.getOperand(3)); + BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst1Reg) + .addReg(AMDGPU::SCC); + + if (!MRI.getRegClassOrNull(Dst1Reg)) + MRI.setRegClass(Dst1Reg, &AMDGPU::SReg_32RegClass); + + if (!RBI.constrainGenericRegister(Dst0Reg, AMDGPU::SReg_32RegClass, MRI) || + !RBI.constrainGenericRegister(Src0Reg, AMDGPU::SReg_32RegClass, MRI) || + !RBI.constrainGenericRegister(Src1Reg, AMDGPU::SReg_32RegClass, MRI)) + return false; + + I.eraseFromParent(); + return true; +} + +bool AMDGPUInstructionSelector::selectG_EXTRACT(MachineInstr &I) const { + MachineBasicBlock *BB = I.getParent(); + unsigned Offset = I.getOperand(2).getImm(); + if (Offset % 32 != 0) + return false; + + unsigned SubReg = TRI.getSubRegFromChannel(Offset / 32); const DebugLoc &DL = I.getDebugLoc(); MachineInstr *Copy = BuildMI(*BB, &I, DL, TII.get(TargetOpcode::COPY), I.getOperand(0).getReg()) @@ -419,10 +448,10 @@ bool AMDGPUInstructionSelector::selectG_EXTRACT(MachineInstr &I) const { for (const MachineOperand &MO : Copy->operands()) { const TargetRegisterClass *RC = - TRI.getConstrainedRegClassForOperand(MO, MRI); + TRI.getConstrainedRegClassForOperand(MO, *MRI); if (!RC) continue; - RBI.constrainGenericRegister(MO.getReg(), *RC, MRI); + RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI); } I.eraseFromParent(); return true; @@ -430,21 +459,19 @@ bool AMDGPUInstructionSelector::selectG_EXTRACT(MachineInstr &I) const { bool AMDGPUInstructionSelector::selectG_MERGE_VALUES(MachineInstr &MI) const { MachineBasicBlock *BB = MI.getParent(); - MachineFunction *MF = BB->getParent(); - MachineRegisterInfo &MRI = MF->getRegInfo(); Register DstReg = MI.getOperand(0).getReg(); - LLT DstTy = MRI.getType(DstReg); - LLT SrcTy = MRI.getType(MI.getOperand(1).getReg()); + LLT DstTy = MRI->getType(DstReg); + LLT SrcTy = MRI->getType(MI.getOperand(1).getReg()); const unsigned SrcSize = SrcTy.getSizeInBits(); if (SrcSize < 32) return false; const DebugLoc &DL = MI.getDebugLoc(); - const RegisterBank *DstBank = RBI.getRegBank(DstReg, MRI, TRI); + const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI); const unsigned DstSize = DstTy.getSizeInBits(); const TargetRegisterClass *DstRC = - TRI.getRegClassForSizeOnBank(DstSize, *DstBank, MRI); + TRI.getRegClassForSizeOnBank(DstSize, *DstBank, *MRI); if (!DstRC) return false; @@ -457,12 +484,12 @@ bool AMDGPUInstructionSelector::selectG_MERGE_VALUES(MachineInstr &MI) const { MIB.addImm(SubRegs[I]); const TargetRegisterClass *SrcRC - = TRI.getConstrainedRegClassForOperand(Src, MRI); - if (SrcRC && !RBI.constrainGenericRegister(Src.getReg(), *SrcRC, MRI)) + = TRI.getConstrainedRegClassForOperand(Src, *MRI); + if (SrcRC && !RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI)) return false; } - if (!RBI.constrainGenericRegister(DstReg, *DstRC, MRI)) + if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI)) return false; MI.eraseFromParent(); @@ -471,25 +498,23 @@ bool AMDGPUInstructionSelector::selectG_MERGE_VALUES(MachineInstr &MI) const { bool AMDGPUInstructionSelector::selectG_UNMERGE_VALUES(MachineInstr &MI) const { MachineBasicBlock *BB = MI.getParent(); - MachineFunction *MF = BB->getParent(); - MachineRegisterInfo &MRI = MF->getRegInfo(); const int NumDst = MI.getNumOperands() - 1; MachineOperand &Src = MI.getOperand(NumDst); Register SrcReg = Src.getReg(); Register DstReg0 = MI.getOperand(0).getReg(); - LLT DstTy = MRI.getType(DstReg0); - LLT SrcTy = 
MRI.getType(SrcReg); + LLT DstTy = MRI->getType(DstReg0); + LLT SrcTy = MRI->getType(SrcReg); const unsigned DstSize = DstTy.getSizeInBits(); const unsigned SrcSize = SrcTy.getSizeInBits(); const DebugLoc &DL = MI.getDebugLoc(); - const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, MRI, TRI); + const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI); const TargetRegisterClass *SrcRC = - TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank, MRI); - if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, MRI)) + TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank, *MRI); + if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI)) return false; const unsigned SrcFlags = getUndefRegState(Src.isUndef()); @@ -504,8 +529,8 @@ bool AMDGPUInstructionSelector::selectG_UNMERGE_VALUES(MachineInstr &MI) const { .addReg(SrcReg, SrcFlags, SubRegs[I]); const TargetRegisterClass *DstRC = - TRI.getConstrainedRegClassForOperand(Dst, MRI); - if (DstRC && !RBI.constrainGenericRegister(Dst.getReg(), *DstRC, MRI)) + TRI.getConstrainedRegClassForOperand(Dst, *MRI); + if (DstRC && !RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI)) return false; } @@ -518,16 +543,13 @@ bool AMDGPUInstructionSelector::selectG_GEP(MachineInstr &I) const { } bool AMDGPUInstructionSelector::selectG_IMPLICIT_DEF(MachineInstr &I) const { - MachineBasicBlock *BB = I.getParent(); - MachineFunction *MF = BB->getParent(); - MachineRegisterInfo &MRI = MF->getRegInfo(); const MachineOperand &MO = I.getOperand(0); // FIXME: Interface for getConstrainedRegClassForOperand needs work. The // regbank check here is to know why getConstrainedRegClassForOperand failed. - const TargetRegisterClass *RC = TRI.getConstrainedRegClassForOperand(MO, MRI); - if ((!RC && !MRI.getRegBankOrNull(MO.getReg())) || - (RC && RBI.constrainGenericRegister(MO.getReg(), *RC, MRI))) { + const TargetRegisterClass *RC = TRI.getConstrainedRegClassForOperand(MO, *MRI); + if ((!RC && !MRI->getRegBankOrNull(MO.getReg())) || + (RC && RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI))) { I.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF)); return true; } @@ -537,44 +559,62 @@ bool AMDGPUInstructionSelector::selectG_IMPLICIT_DEF(MachineInstr &I) const { bool AMDGPUInstructionSelector::selectG_INSERT(MachineInstr &I) const { MachineBasicBlock *BB = I.getParent(); - MachineFunction *MF = BB->getParent(); - MachineRegisterInfo &MRI = MF->getRegInfo(); - unsigned SubReg = TRI.getSubRegFromChannel(I.getOperand(3).getImm() / 32); - DebugLoc DL = I.getDebugLoc(); - MachineInstr *Ins = BuildMI(*BB, &I, DL, TII.get(TargetOpcode::INSERT_SUBREG)) - .addDef(I.getOperand(0).getReg()) - .addReg(I.getOperand(1).getReg()) - .addReg(I.getOperand(2).getReg()) - .addImm(SubReg); - - for (const MachineOperand &MO : Ins->operands()) { - if (!MO.isReg()) - continue; - if (TargetRegisterInfo::isPhysicalRegister(MO.getReg())) - continue; - const TargetRegisterClass *RC = - TRI.getConstrainedRegClassForOperand(MO, MRI); - if (!RC) - continue; - RBI.constrainGenericRegister(MO.getReg(), *RC, MRI); - } + Register DstReg = I.getOperand(0).getReg(); + Register Src0Reg = I.getOperand(1).getReg(); + Register Src1Reg = I.getOperand(2).getReg(); + LLT Src1Ty = MRI->getType(Src1Reg); + + unsigned DstSize = MRI->getType(DstReg).getSizeInBits(); + unsigned InsSize = Src1Ty.getSizeInBits(); + + int64_t Offset = I.getOperand(3).getImm(); + if (Offset % 32 != 0) + return false; + + unsigned SubReg = TRI.getSubRegFromChannel(Offset / 32, InsSize / 32); + if (SubReg == AMDGPU::NoSubRegister) + return 
false; + + const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI); + const TargetRegisterClass *DstRC = + TRI.getRegClassForSizeOnBank(DstSize, *DstBank, *MRI); + if (!DstRC) + return false; + + const RegisterBank *Src0Bank = RBI.getRegBank(Src0Reg, *MRI, TRI); + const RegisterBank *Src1Bank = RBI.getRegBank(Src1Reg, *MRI, TRI); + const TargetRegisterClass *Src0RC = + TRI.getRegClassForSizeOnBank(DstSize, *Src0Bank, *MRI); + const TargetRegisterClass *Src1RC = + TRI.getRegClassForSizeOnBank(InsSize, *Src1Bank, *MRI); + + // Deal with weird cases where the class only partially supports the subreg + // index. + Src0RC = TRI.getSubClassWithSubReg(Src0RC, SubReg); + if (!Src0RC) + return false; + + if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) || + !RBI.constrainGenericRegister(Src0Reg, *Src0RC, *MRI) || + !RBI.constrainGenericRegister(Src1Reg, *Src1RC, *MRI)) + return false; + + const DebugLoc &DL = I.getDebugLoc(); + BuildMI(*BB, &I, DL, TII.get(TargetOpcode::INSERT_SUBREG), DstReg) + .addReg(Src0Reg) + .addReg(Src1Reg) + .addImm(SubReg); + I.eraseFromParent(); return true; } -bool AMDGPUInstructionSelector::selectG_INTRINSIC( - MachineInstr &I, CodeGenCoverage &CoverageInfo) const { - unsigned IntrinsicID = I.getOperand(I.getNumExplicitDefs()).getIntrinsicID(); +bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const { + unsigned IntrinsicID = I.getIntrinsicID(); switch (IntrinsicID) { - case Intrinsic::maxnum: - case Intrinsic::minnum: - case Intrinsic::amdgcn_cvt_pkrtz: - return selectImpl(I, CoverageInfo); case Intrinsic::amdgcn_if_break: { MachineBasicBlock *BB = I.getParent(); - MachineFunction *MF = BB->getParent(); - MachineRegisterInfo &MRI = MF->getRegInfo(); // FIXME: Manually selecting to avoid dealiing with the SReg_1 trick // SelectionDAG uses for wave32 vs wave64. 
@@ -589,15 +629,13 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC( I.eraseFromParent(); - for (Register Reg : { DstReg, Src0Reg, Src1Reg }) { - if (!MRI.getRegClassOrNull(Reg)) - MRI.setRegClass(Reg, TRI.getWaveMaskRegClass()); - } + for (Register Reg : { DstReg, Src0Reg, Src1Reg }) + MRI->setRegClass(Reg, TRI.getWaveMaskRegClass()); return true; } default: - return selectImpl(I, CoverageInfo); + return selectImpl(I, *CoverageInfo); } } @@ -677,17 +715,15 @@ int AMDGPUInstructionSelector::getS_CMPOpcode(CmpInst::Predicate P, bool AMDGPUInstructionSelector::selectG_ICMP(MachineInstr &I) const { MachineBasicBlock *BB = I.getParent(); - MachineFunction *MF = BB->getParent(); - MachineRegisterInfo &MRI = MF->getRegInfo(); const DebugLoc &DL = I.getDebugLoc(); - unsigned SrcReg = I.getOperand(2).getReg(); - unsigned Size = RBI.getSizeInBits(SrcReg, MRI, TRI); + Register SrcReg = I.getOperand(2).getReg(); + unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI); auto Pred = (CmpInst::Predicate)I.getOperand(1).getPredicate(); - unsigned CCReg = I.getOperand(0).getReg(); - if (isSCC(CCReg, MRI)) { + Register CCReg = I.getOperand(0).getReg(); + if (isSCC(CCReg, *MRI)) { int Opcode = getS_CMPOpcode(Pred, Size); if (Opcode == -1) return false; @@ -698,7 +734,7 @@ bool AMDGPUInstructionSelector::selectG_ICMP(MachineInstr &I) const { .addReg(AMDGPU::SCC); bool Ret = constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI) && - RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32RegClass, MRI); + RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32RegClass, *MRI); I.eraseFromParent(); return Ret; } @@ -712,7 +748,7 @@ bool AMDGPUInstructionSelector::selectG_ICMP(MachineInstr &I) const { .add(I.getOperand(2)) .add(I.getOperand(3)); RBI.constrainGenericRegister(ICmp->getOperand(0).getReg(), - *TRI.getBoolRC(), MRI); + *TRI.getBoolRC(), *MRI); bool Ret = constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI); I.eraseFromParent(); return Ret; @@ -736,19 +772,273 @@ buildEXP(const TargetInstrInfo &TII, MachineInstr *Insert, unsigned Tgt, .addImm(Enabled); } +static bool isZero(Register Reg, MachineRegisterInfo &MRI) { + int64_t C; + if (mi_match(Reg, MRI, m_ICst(C)) && C == 0) + return true; + + // FIXME: matcher should ignore copies + return mi_match(Reg, MRI, m_Copy(m_ICst(C))) && C == 0; +} + +static unsigned extractGLC(unsigned AuxiliaryData) { + return AuxiliaryData & 1; +} + +static unsigned extractSLC(unsigned AuxiliaryData) { + return (AuxiliaryData >> 1) & 1; +} + +static unsigned extractDLC(unsigned AuxiliaryData) { + return (AuxiliaryData >> 2) & 1; +} + +static unsigned extractSWZ(unsigned AuxiliaryData) { + return (AuxiliaryData >> 3) & 1; +} + +// Returns Base register, constant offset, and offset def point. 
+static std::tuple +getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg) { + MachineInstr *Def = getDefIgnoringCopies(Reg, MRI); + if (!Def) + return std::make_tuple(Reg, 0, nullptr); + + if (Def->getOpcode() == AMDGPU::G_CONSTANT) { + unsigned Offset; + const MachineOperand &Op = Def->getOperand(1); + if (Op.isImm()) + Offset = Op.getImm(); + else + Offset = Op.getCImm()->getZExtValue(); + + return std::make_tuple(Register(), Offset, Def); + } + + int64_t Offset; + if (Def->getOpcode() == AMDGPU::G_ADD) { + // TODO: Handle G_OR used for add case + if (mi_match(Def->getOperand(1).getReg(), MRI, m_ICst(Offset))) + return std::make_tuple(Def->getOperand(0).getReg(), Offset, Def); + + // FIXME: matcher should ignore copies + if (mi_match(Def->getOperand(1).getReg(), MRI, m_Copy(m_ICst(Offset)))) + return std::make_tuple(Def->getOperand(0).getReg(), Offset, Def); + } + + return std::make_tuple(Reg, 0, Def); +} + +static unsigned getBufferStoreOpcode(LLT Ty, + const unsigned MemSize, + const bool Offen) { + const int Size = Ty.getSizeInBits(); + switch (8 * MemSize) { + case 8: + return Offen ? AMDGPU::BUFFER_STORE_BYTE_OFFEN_exact : + AMDGPU::BUFFER_STORE_BYTE_OFFSET_exact; + case 16: + return Offen ? AMDGPU::BUFFER_STORE_SHORT_OFFEN_exact : + AMDGPU::BUFFER_STORE_SHORT_OFFSET_exact; + default: + unsigned Opc = Offen ? AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact : + AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact; + if (Size > 32) + Opc = AMDGPU::getMUBUFOpcode(Opc, Size / 32); + return Opc; + } +} + +static unsigned getBufferStoreFormatOpcode(LLT Ty, + const unsigned MemSize, + const bool Offen) { + bool IsD16Packed = Ty.getScalarSizeInBits() == 16; + bool IsD16Unpacked = 8 * MemSize < Ty.getSizeInBits(); + int NumElts = Ty.isVector() ? Ty.getNumElements() : 1; + + if (IsD16Packed) { + switch (NumElts) { + case 1: + return Offen ? AMDGPU::BUFFER_STORE_FORMAT_D16_X_OFFEN_exact : + AMDGPU::BUFFER_STORE_FORMAT_D16_X_OFFSET_exact; + case 2: + return Offen ? AMDGPU::BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact : + AMDGPU::BUFFER_STORE_FORMAT_D16_XY_OFFSET_exact; + case 3: + return Offen ? AMDGPU::BUFFER_STORE_FORMAT_D16_XYZ_OFFEN_exact : + AMDGPU::BUFFER_STORE_FORMAT_D16_XYZ_OFFSET_exact; + case 4: + return Offen ? AMDGPU::BUFFER_STORE_FORMAT_D16_XYZW_OFFEN_exact : + AMDGPU::BUFFER_STORE_FORMAT_D16_XYZW_OFFSET_exact; + default: + return -1; + } + } + + if (IsD16Unpacked) { + switch (NumElts) { + case 1: + return Offen ? AMDGPU::BUFFER_STORE_FORMAT_D16_X_OFFEN_exact : + AMDGPU::BUFFER_STORE_FORMAT_D16_X_OFFSET_exact; + case 2: + return Offen ? AMDGPU::BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact : + AMDGPU::BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFSET_exact; + case 3: + return Offen ? AMDGPU::BUFFER_STORE_FORMAT_D16_XYZ_gfx80_OFFEN_exact : + AMDGPU::BUFFER_STORE_FORMAT_D16_XYZ_gfx80_OFFSET_exact; + case 4: + return Offen ? AMDGPU::BUFFER_STORE_FORMAT_D16_XYZW_gfx80_OFFEN_exact : + AMDGPU::BUFFER_STORE_FORMAT_D16_XYZW_gfx80_OFFSET_exact; + default: + return -1; + } + } + + switch (NumElts) { + case 1: + return Offen ? AMDGPU::BUFFER_STORE_FORMAT_X_OFFEN_exact : + AMDGPU::BUFFER_STORE_FORMAT_X_OFFSET_exact; + case 2: + return Offen ? AMDGPU::BUFFER_STORE_FORMAT_XY_OFFEN_exact : + AMDGPU::BUFFER_STORE_FORMAT_XY_OFFSET_exact; + case 3: + return Offen ? AMDGPU::BUFFER_STORE_FORMAT_XYZ_OFFEN_exact : + AMDGPU::BUFFER_STORE_FORMAT_XYZ_OFFSET_exact; + case 4: + return Offen ? 
AMDGPU::BUFFER_STORE_FORMAT_XYZW_OFFEN_exact : + AMDGPU::BUFFER_STORE_FORMAT_XYZW_OFFSET_exact; + default: + return -1; + } + + llvm_unreachable("unhandled buffer store"); +} + +// TODO: Move this to combiner +// Returns base register, imm offset, total constant offset. +std::tuple +AMDGPUInstructionSelector::splitBufferOffsets(MachineIRBuilder &B, + Register OrigOffset) const { + const unsigned MaxImm = 4095; + Register BaseReg; + unsigned TotalConstOffset; + MachineInstr *OffsetDef; + + std::tie(BaseReg, TotalConstOffset, OffsetDef) + = getBaseWithConstantOffset(*MRI, OrigOffset); + + unsigned ImmOffset = TotalConstOffset; + + // If the immediate value is too big for the immoffset field, put the value + // and -4096 into the immoffset field so that the value that is copied/added + // for the voffset field is a multiple of 4096, and it stands more chance + // of being CSEd with the copy/add for another similar load/store.f + // However, do not do that rounding down to a multiple of 4096 if that is a + // negative number, as it appears to be illegal to have a negative offset + // in the vgpr, even if adding the immediate offset makes it positive. + unsigned Overflow = ImmOffset & ~MaxImm; + ImmOffset -= Overflow; + if ((int32_t)Overflow < 0) { + Overflow += ImmOffset; + ImmOffset = 0; + } + + if (Overflow != 0) { + // In case this is in a waterfall loop, insert offset code at the def point + // of the offset, not inside the loop. + MachineBasicBlock::iterator OldInsPt = B.getInsertPt(); + MachineBasicBlock &OldMBB = B.getMBB(); + B.setInstr(*OffsetDef); + + if (!BaseReg) { + BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); + B.buildInstr(AMDGPU::V_MOV_B32_e32) + .addDef(BaseReg) + .addImm(Overflow); + } else { + Register OverflowVal = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); + B.buildInstr(AMDGPU::V_MOV_B32_e32) + .addDef(OverflowVal) + .addImm(Overflow); + + Register NewBaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); + TII.getAddNoCarry(B.getMBB(), B.getInsertPt(), B.getDebugLoc(), NewBaseReg) + .addReg(BaseReg) + .addReg(OverflowVal, RegState::Kill) + .addImm(0); + BaseReg = NewBaseReg; + } + + B.setInsertPt(OldMBB, OldInsPt); + } + + return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset); +} + +bool AMDGPUInstructionSelector::selectStoreIntrinsic(MachineInstr &MI, + bool IsFormat) const { + MachineIRBuilder B(MI); + MachineFunction &MF = B.getMF(); + Register VData = MI.getOperand(1).getReg(); + LLT Ty = MRI->getType(VData); + + int Size = Ty.getSizeInBits(); + if (Size % 32 != 0) + return false; + + // FIXME: Verifier should enforce 1 MMO for these intrinsics. + MachineMemOperand *MMO = *MI.memoperands_begin(); + const int MemSize = MMO->getSize(); + + Register RSrc = MI.getOperand(2).getReg(); + Register VOffset = MI.getOperand(3).getReg(); + Register SOffset = MI.getOperand(4).getReg(); + unsigned AuxiliaryData = MI.getOperand(5).getImm(); + unsigned ImmOffset; + unsigned TotalOffset; + + std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); + if (TotalOffset != 0) + MMO = MF.getMachineMemOperand(MMO, TotalOffset, MemSize); + + const bool Offen = !isZero(VOffset, *MRI); + + int Opc = IsFormat ? 
getBufferStoreFormatOpcode(Ty, MemSize, Offen) : + getBufferStoreOpcode(Ty, MemSize, Offen); + if (Opc == -1) + return false; + + MachineInstrBuilder MIB = B.buildInstr(Opc) + .addUse(VData); + + if (Offen) + MIB.addUse(VOffset); + + MIB.addUse(RSrc) + .addUse(SOffset) + .addImm(ImmOffset) + .addImm(extractGLC(AuxiliaryData)) + .addImm(extractSLC(AuxiliaryData)) + .addImm(0) // tfe: FIXME: Remove from inst + .addImm(extractDLC(AuxiliaryData)) + .addImm(extractSWZ(AuxiliaryData)) + .addMemOperand(MMO); + + MI.eraseFromParent(); + + return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); +} + bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS( - MachineInstr &I, CodeGenCoverage &CoverageInfo) const { + MachineInstr &I) const { MachineBasicBlock *BB = I.getParent(); - MachineFunction *MF = BB->getParent(); - MachineRegisterInfo &MRI = MF->getRegInfo(); - - unsigned IntrinsicID = I.getOperand(0).getIntrinsicID(); + unsigned IntrinsicID = I.getIntrinsicID(); switch (IntrinsicID) { case Intrinsic::amdgcn_exp: { - int64_t Tgt = getConstant(MRI.getVRegDef(I.getOperand(1).getReg())); - int64_t Enabled = getConstant(MRI.getVRegDef(I.getOperand(2).getReg())); - int64_t Done = getConstant(MRI.getVRegDef(I.getOperand(7).getReg())); - int64_t VM = getConstant(MRI.getVRegDef(I.getOperand(8).getReg())); + int64_t Tgt = I.getOperand(1).getImm(); + int64_t Enabled = I.getOperand(2).getImm(); + int64_t Done = I.getOperand(7).getImm(); + int64_t VM = I.getOperand(8).getImm(); MachineInstr *Exp = buildEXP(TII, &I, Tgt, I.getOperand(3).getReg(), I.getOperand(4).getReg(), @@ -761,13 +1051,13 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS( } case Intrinsic::amdgcn_exp_compr: { const DebugLoc &DL = I.getDebugLoc(); - int64_t Tgt = getConstant(MRI.getVRegDef(I.getOperand(1).getReg())); - int64_t Enabled = getConstant(MRI.getVRegDef(I.getOperand(2).getReg())); - unsigned Reg0 = I.getOperand(3).getReg(); - unsigned Reg1 = I.getOperand(4).getReg(); - unsigned Undef = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); - int64_t Done = getConstant(MRI.getVRegDef(I.getOperand(5).getReg())); - int64_t VM = getConstant(MRI.getVRegDef(I.getOperand(6).getReg())); + int64_t Tgt = I.getOperand(1).getImm(); + int64_t Enabled = I.getOperand(2).getImm(); + Register Reg0 = I.getOperand(3).getReg(); + Register Reg1 = I.getOperand(4).getReg(); + Register Undef = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); + int64_t Done = I.getOperand(5).getImm(); + int64_t VM = I.getOperand(6).getImm(); BuildMI(*BB, &I, DL, TII.get(AMDGPU::IMPLICIT_DEF), Undef); MachineInstr *Exp = buildEXP(TII, &I, Tgt, Reg0, Reg1, Undef, Undef, VM, @@ -786,27 +1076,29 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS( Register Reg = I.getOperand(1).getReg(); I.eraseFromParent(); - if (!MRI.getRegClassOrNull(Reg)) - MRI.setRegClass(Reg, TRI.getWaveMaskRegClass()); + if (!MRI->getRegClassOrNull(Reg)) + MRI->setRegClass(Reg, TRI.getWaveMaskRegClass()); return true; } + case Intrinsic::amdgcn_raw_buffer_store: + return selectStoreIntrinsic(I, false); + case Intrinsic::amdgcn_raw_buffer_store_format: + return selectStoreIntrinsic(I, true); default: - return selectImpl(I, CoverageInfo); + return selectImpl(I, *CoverageInfo); } } bool AMDGPUInstructionSelector::selectG_SELECT(MachineInstr &I) const { MachineBasicBlock *BB = I.getParent(); - MachineFunction *MF = BB->getParent(); - MachineRegisterInfo &MRI = MF->getRegInfo(); const DebugLoc &DL = I.getDebugLoc(); - unsigned DstReg = 
I.getOperand(0).getReg(); - unsigned Size = RBI.getSizeInBits(DstReg, MRI, TRI); + Register DstReg = I.getOperand(0).getReg(); + unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI); assert(Size <= 32 || Size == 64); const MachineOperand &CCOp = I.getOperand(1); - unsigned CCReg = CCOp.getReg(); - if (isSCC(CCReg, MRI)) { + Register CCReg = CCOp.getReg(); + if (isSCC(CCReg, *MRI)) { unsigned SelectOpcode = Size == 64 ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32; MachineInstr *CopySCC = BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC) @@ -815,8 +1107,8 @@ bool AMDGPUInstructionSelector::selectG_SELECT(MachineInstr &I) const { // The generic constrainSelectedInstRegOperands doesn't work for the scc register // bank, because it does not cover the register class that we used to represent // for it. So we need to manually set the register class here. - if (!MRI.getRegClassOrNull(CCReg)) - MRI.setRegClass(CCReg, TRI.getConstrainedRegClassForOperand(CCOp, MRI)); + if (!MRI->getRegClassOrNull(CCReg)) + MRI->setRegClass(CCReg, TRI.getConstrainedRegClassForOperand(CCOp, *MRI)); MachineInstr *Select = BuildMI(*BB, &I, DL, TII.get(SelectOpcode), DstReg) .add(I.getOperand(2)) .add(I.getOperand(3)); @@ -845,52 +1137,8 @@ bool AMDGPUInstructionSelector::selectG_SELECT(MachineInstr &I) const { } bool AMDGPUInstructionSelector::selectG_STORE(MachineInstr &I) const { - MachineBasicBlock *BB = I.getParent(); - MachineFunction *MF = BB->getParent(); - MachineRegisterInfo &MRI = MF->getRegInfo(); - DebugLoc DL = I.getDebugLoc(); - unsigned PtrSize = RBI.getSizeInBits(I.getOperand(1).getReg(), MRI, TRI); - if (PtrSize != 64) { - LLVM_DEBUG(dbgs() << "Unhandled address space\n"); - return false; - } - - unsigned StoreSize = RBI.getSizeInBits(I.getOperand(0).getReg(), MRI, TRI); - unsigned Opcode; - - // FIXME: Remove this when integers > s32 naturally selected. - switch (StoreSize) { - default: - return false; - case 32: - Opcode = AMDGPU::FLAT_STORE_DWORD; - break; - case 64: - Opcode = AMDGPU::FLAT_STORE_DWORDX2; - break; - case 96: - Opcode = AMDGPU::FLAT_STORE_DWORDX3; - break; - case 128: - Opcode = AMDGPU::FLAT_STORE_DWORDX4; - break; - } - - MachineInstr *Flat = BuildMI(*BB, &I, DL, TII.get(Opcode)) - .add(I.getOperand(1)) - .add(I.getOperand(0)) - .addImm(0) // offset - .addImm(0) // glc - .addImm(0) // slc - .addImm(0); // dlc - - - // Now that we selected an opcode, we need to constrain the register - // operands to use appropriate classes. 
- bool Ret = constrainSelectedInstRegOperands(*Flat, TII, TRI, RBI); - - I.eraseFromParent(); - return Ret; + initM0(I); + return selectImpl(I, *CoverageInfo); } static int sizeToSubRegIndex(unsigned Size) { @@ -915,19 +1163,15 @@ static int sizeToSubRegIndex(unsigned Size) { } bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const { - MachineBasicBlock *BB = I.getParent(); - MachineFunction *MF = BB->getParent(); - MachineRegisterInfo &MRI = MF->getRegInfo(); - - unsigned DstReg = I.getOperand(0).getReg(); - unsigned SrcReg = I.getOperand(1).getReg(); - const LLT DstTy = MRI.getType(DstReg); - const LLT SrcTy = MRI.getType(SrcReg); + Register DstReg = I.getOperand(0).getReg(); + Register SrcReg = I.getOperand(1).getReg(); + const LLT DstTy = MRI->getType(DstReg); + const LLT SrcTy = MRI->getType(SrcReg); if (!DstTy.isScalar()) return false; - const RegisterBank *DstRB = RBI.getRegBank(DstReg, MRI, TRI); - const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, MRI, TRI); + const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI); + const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI); if (SrcRB != DstRB) return false; @@ -935,9 +1179,9 @@ bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const { unsigned SrcSize = SrcTy.getSizeInBits(); const TargetRegisterClass *SrcRC - = TRI.getRegClassForSizeOnBank(SrcSize, *SrcRB, MRI); + = TRI.getRegClassForSizeOnBank(SrcSize, *SrcRB, *MRI); const TargetRegisterClass *DstRC - = TRI.getRegClassForSizeOnBank(DstSize, *DstRB, MRI); + = TRI.getRegClassForSizeOnBank(DstSize, *DstRB, *MRI); if (SrcSize > 32) { int SubRegIdx = sizeToSubRegIndex(DstSize); @@ -953,8 +1197,8 @@ bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const { I.getOperand(1).setSubReg(SubRegIdx); } - if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, MRI) || - !RBI.constrainGenericRegister(DstReg, *DstRC, MRI)) { + if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) || + !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI)) { LLVM_DEBUG(dbgs() << "Failed to constrain G_TRUNC\n"); return false; } @@ -974,20 +1218,18 @@ bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const { bool Signed = I.getOpcode() == AMDGPU::G_SEXT; const DebugLoc &DL = I.getDebugLoc(); MachineBasicBlock &MBB = *I.getParent(); - MachineFunction &MF = *MBB.getParent(); - MachineRegisterInfo &MRI = MF.getRegInfo(); - const unsigned DstReg = I.getOperand(0).getReg(); - const unsigned SrcReg = I.getOperand(1).getReg(); + const Register DstReg = I.getOperand(0).getReg(); + const Register SrcReg = I.getOperand(1).getReg(); - const LLT DstTy = MRI.getType(DstReg); - const LLT SrcTy = MRI.getType(SrcReg); + const LLT DstTy = MRI->getType(DstReg); + const LLT SrcTy = MRI->getType(SrcReg); const LLT S1 = LLT::scalar(1); const unsigned SrcSize = SrcTy.getSizeInBits(); const unsigned DstSize = DstTy.getSizeInBits(); if (!DstTy.isScalar()) return false; - const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, MRI, TRI); + const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI); if (SrcBank->getID() == AMDGPU::SCCRegBankID) { if (SrcTy != S1 || DstSize > 64) // Invalid @@ -1000,7 +1242,7 @@ bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const { // FIXME: Create an extra copy to avoid incorrectly constraining the result // of the scc producer. 
- unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); + Register TmpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); BuildMI(MBB, I, DL, TII.get(AMDGPU::COPY), TmpReg) .addReg(SrcReg); BuildMI(MBB, I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC) @@ -1010,7 +1252,8 @@ bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const { BuildMI(MBB, I, DL, TII.get(Opcode), DstReg) .addImm(0) .addImm(Signed ? -1 : 1); - return RBI.constrainGenericRegister(DstReg, *DstRC, MRI); + I.eraseFromParent(); + return RBI.constrainGenericRegister(DstReg, *DstRC, *MRI); } if (SrcBank->getID() == AMDGPU::VCCRegBankID && DstSize <= 32) { @@ -1024,6 +1267,7 @@ bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const { .addImm(0) // src1_modifiers .addImm(Signed ? -1 : 1) // src1 .addUse(SrcReg); + I.eraseFromParent(); return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI); } @@ -1040,6 +1284,7 @@ bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const { BuildMI(MBB, I, DL, TII.get(AMDGPU::V_AND_B32_e32), DstReg) .addImm(Mask) .addReg(SrcReg); + I.eraseFromParent(); return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI); } @@ -1049,11 +1294,12 @@ bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const { .addReg(SrcReg) .addImm(0) // Offset .addImm(SrcSize); // Width + I.eraseFromParent(); return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI); } if (SrcBank->getID() == AMDGPU::SGPRRegBankID && DstSize <= 64) { - if (!RBI.constrainGenericRegister(SrcReg, AMDGPU::SReg_32RegClass, MRI)) + if (!RBI.constrainGenericRegister(SrcReg, AMDGPU::SReg_32RegClass, *MRI)) return false; if (Signed && DstSize == 32 && (SrcSize == 8 || SrcSize == 16)) { @@ -1061,7 +1307,8 @@ bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const { AMDGPU::S_SEXT_I32_I8 : AMDGPU::S_SEXT_I32_I16; BuildMI(MBB, I, DL, TII.get(SextOpc), DstReg) .addReg(SrcReg); - return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, MRI); + I.eraseFromParent(); + return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI); } const unsigned BFE64 = Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64; @@ -1070,10 +1317,8 @@ bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const { // Scalar BFE is encoded as S1[5:0] = offset, S1[22:16]= width. if (DstSize > 32 && SrcSize <= 32) { // We need a 64-bit register source, but the high bits don't matter. 
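// A minimal standalone sketch, not part of the patch, of how the scalar BFE
// source-1 immediate used just below is packed: bits [5:0] hold the offset and
// bits [22:16] hold the width, so a zero-offset extract of SrcSize bits is
// simply `SrcSize << 16`. The packScalarBFEImm helper name is hypothetical.
#include <cstdint>

static constexpr uint32_t packScalarBFEImm(uint32_t Offset, uint32_t Width) {
  return (Offset & 0x3f) | ((Width & 0x7f) << 16); // S1[5:0] = offset, S1[22:16] = width
}

static_assert(packScalarBFEImm(0, 8) == 0x00080000u, "s8 source extend");
static_assert(packScalarBFEImm(0, 16) == 0x00100000u, "s16 source extend");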
- unsigned ExtReg - = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); - unsigned UndefReg - = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); + Register ExtReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass); + Register UndefReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg); BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), ExtReg) .addReg(SrcReg) @@ -1085,7 +1330,8 @@ bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const { .addReg(ExtReg) .addImm(SrcSize << 16); - return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass, MRI); + I.eraseFromParent(); + return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass, *MRI); } unsigned Mask; @@ -1099,16 +1345,58 @@ bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const { .addImm(SrcSize << 16); } - return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, MRI); + I.eraseFromParent(); + return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI); } return false; } +static int64_t getFPTrueImmVal(unsigned Size, bool Signed) { + switch (Size) { + case 16: + return Signed ? 0xBC00 : 0x3C00; + case 32: + return Signed ? 0xbf800000 : 0x3f800000; + case 64: + return Signed ? 0xbff0000000000000 : 0x3ff0000000000000; + default: + llvm_unreachable("Invalid FP type size"); + } +} + +bool AMDGPUInstructionSelector::selectG_SITOFP_UITOFP(MachineInstr &I) const { + MachineBasicBlock *MBB = I.getParent(); + MachineFunction *MF = MBB->getParent(); + MachineRegisterInfo &MRI = MF->getRegInfo(); + Register Src = I.getOperand(1).getReg(); + if (!isSCC(Src, MRI)) + return selectImpl(I, *CoverageInfo); + + bool Signed = I.getOpcode() == AMDGPU::G_SITOFP; + Register DstReg = I.getOperand(0).getReg(); + const LLT DstTy = MRI.getType(DstReg); + const unsigned DstSize = DstTy.getSizeInBits(); + const DebugLoc &DL = I.getDebugLoc(); + + BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC) + .addReg(Src); + + unsigned NewOpc = + DstSize > 32 ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32; + auto MIB = BuildMI(*MBB, I, DL, TII.get(NewOpc), DstReg) + .addImm(0) + .addImm(getFPTrueImmVal(DstSize, Signed)); + + if (!constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI)) + return false; + + I.eraseFromParent(); + return true; +} + bool AMDGPUInstructionSelector::selectG_CONSTANT(MachineInstr &I) const { MachineBasicBlock *BB = I.getParent(); - MachineFunction *MF = BB->getParent(); - MachineRegisterInfo &MRI = MF->getRegInfo(); MachineOperand &ImmOp = I.getOperand(1); // The AMDGPU backend only supports Imm operands and not CImm or FPImm. 
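// A quick standalone check, not part of the patch, of the bit patterns returned
// by getFPTrueImmVal above: they are the IEEE-754 encodings of +/-1.0 in the
// destination width, so the S_CSELECT materializes 0.0 or +/-1.0 from a boolean.
#include <cassert>
#include <cstdint>
#include <cstring>

static uint32_t bitsOf(float F) {
  uint32_t B;
  std::memcpy(&B, &F, sizeof(B));
  return B;
}

static uint64_t bitsOf(double D) {
  uint64_t B;
  std::memcpy(&B, &D, sizeof(B));
  return B;
}

int main() {
  assert(bitsOf(1.0f) == 0x3f800000u);           // f32, unsigned (G_UITOFP) true value
  assert(bitsOf(-1.0f) == 0xbf800000u);          // f32, signed (G_SITOFP) true value
  assert(bitsOf(1.0) == 0x3ff0000000000000ull);  // f64, unsigned true value
  assert(bitsOf(-1.0) == 0xbff0000000000000ull); // f64, signed true value
  // 0x3C00 / 0xBC00 are likewise the f16 encodings of +1.0 / -1.0.
  return 0;
}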
@@ -1119,15 +1407,15 @@ bool AMDGPUInstructionSelector::selectG_CONSTANT(MachineInstr &I) const { ImmOp.ChangeToImmediate(ImmOp.getCImm()->getZExtValue()); } - unsigned DstReg = I.getOperand(0).getReg(); + Register DstReg = I.getOperand(0).getReg(); unsigned Size; bool IsSgpr; - const RegisterBank *RB = MRI.getRegBankOrNull(I.getOperand(0).getReg()); + const RegisterBank *RB = MRI->getRegBankOrNull(I.getOperand(0).getReg()); if (RB) { IsSgpr = RB->getID() == AMDGPU::SGPRRegBankID; - Size = MRI.getType(DstReg).getSizeInBits(); + Size = MRI->getType(DstReg).getSizeInBits(); } else { - const TargetRegisterClass *RC = TRI.getRegClassForReg(MRI, DstReg); + const TargetRegisterClass *RC = TRI.getRegClassForReg(*MRI, DstReg); IsSgpr = TRI.isSGPRClass(RC); Size = TRI.getRegSizeInBits(*RC); } @@ -1142,34 +1430,41 @@ bool AMDGPUInstructionSelector::selectG_CONSTANT(MachineInstr &I) const { return constrainSelectedInstRegOperands(I, TII, TRI, RBI); } - DebugLoc DL = I.getDebugLoc(); - const TargetRegisterClass *RC = IsSgpr ? &AMDGPU::SReg_32_XM0RegClass : - &AMDGPU::VGPR_32RegClass; - unsigned LoReg = MRI.createVirtualRegister(RC); - unsigned HiReg = MRI.createVirtualRegister(RC); - const APInt &Imm = APInt(Size, I.getOperand(1).getImm()); - - BuildMI(*BB, &I, DL, TII.get(Opcode), LoReg) - .addImm(Imm.trunc(32).getZExtValue()); + const DebugLoc &DL = I.getDebugLoc(); - BuildMI(*BB, &I, DL, TII.get(Opcode), HiReg) - .addImm(Imm.ashr(32).getZExtValue()); + APInt Imm(Size, I.getOperand(1).getImm()); - const MachineInstr *RS = - BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg) - .addReg(LoReg) - .addImm(AMDGPU::sub0) - .addReg(HiReg) - .addImm(AMDGPU::sub1); + MachineInstr *ResInst; + if (IsSgpr && TII.isInlineConstant(Imm)) { + ResInst = BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B64), DstReg) + .addImm(I.getOperand(1).getImm()); + } else { + const TargetRegisterClass *RC = IsSgpr ? + &AMDGPU::SReg_32RegClass : &AMDGPU::VGPR_32RegClass; + Register LoReg = MRI->createVirtualRegister(RC); + Register HiReg = MRI->createVirtualRegister(RC); + + BuildMI(*BB, &I, DL, TII.get(Opcode), LoReg) + .addImm(Imm.trunc(32).getZExtValue()); + + BuildMI(*BB, &I, DL, TII.get(Opcode), HiReg) + .addImm(Imm.ashr(32).getZExtValue()); + + ResInst = BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg) + .addReg(LoReg) + .addImm(AMDGPU::sub0) + .addReg(HiReg) + .addImm(AMDGPU::sub1); + } // We can't call constrainSelectedInstRegOperands here, because it doesn't // work for target independent opcodes I.eraseFromParent(); const TargetRegisterClass *DstRC = - TRI.getConstrainedRegClassForOperand(RS->getOperand(0), MRI); + TRI.getConstrainedRegClassForOperand(ResInst->getOperand(0), *MRI); if (!DstRC) return true; - return RBI.constrainGenericRegister(DstReg, *DstRC, MRI); + return RBI.constrainGenericRegister(DstReg, *DstRC, *MRI); } static bool isConstant(const MachineInstr &MI) { @@ -1188,13 +1483,13 @@ void AMDGPUInstructionSelector::getAddrModeInfo(const MachineInstr &Load, GEPInfo GEPInfo(*PtrMI); - for (unsigned i = 1, e = 3; i < e; ++i) { + for (unsigned i = 1; i != 3; ++i) { const MachineOperand &GEPOp = PtrMI->getOperand(i); const MachineInstr *OpDef = MRI.getUniqueVRegDef(GEPOp.getReg()); assert(OpDef); - if (isConstant(*OpDef)) { - // FIXME: Is it possible to have multiple Imm parts? Maybe if we - // are lacking other optimizations. + if (i == 2 && isConstant(*OpDef)) { + // TODO: Could handle constant base + variable offset, but a combine + // probably should have commuted it. 
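// A standalone sketch, not part of the patch, of the 64-bit immediate split done
// above with APInt::trunc(32) and APInt::ashr(32): the low half feeds the sub0
// input of the REG_SEQUENCE and the high half feeds sub1.
#include <cassert>
#include <cstdint>

int main() {
  const uint64_t Imm = 0x123456789ABCDEF0ull;
  const uint32_t Lo = static_cast<uint32_t>(Imm);       // -> S_MOV_B32 %lo (sub0)
  const uint32_t Hi = static_cast<uint32_t>(Imm >> 32); // -> S_MOV_B32 %hi (sub1)
  assert(Lo == 0x9ABCDEF0u);
  assert(Hi == 0x12345678u);
  assert(((static_cast<uint64_t>(Hi) << 32) | Lo) == Imm); // REG_SEQUENCE reassembles it
  return 0;
}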
assert(GEPInfo.Imm == 0); GEPInfo.Imm = OpDef->getOperand(1).getCImm()->getSExtValue(); continue; @@ -1240,16 +1535,26 @@ bool AMDGPUInstructionSelector::hasVgprParts(ArrayRef AddrInfo) const { return false; } -bool AMDGPUInstructionSelector::selectG_LOAD(MachineInstr &I) const { - // TODO: Can/should we insert m0 initialization here for DS instructions and - // call the normal selector? - return false; +void AMDGPUInstructionSelector::initM0(MachineInstr &I) const { + MachineBasicBlock *BB = I.getParent(); + + const LLT PtrTy = MRI->getType(I.getOperand(1).getReg()); + unsigned AS = PtrTy.getAddressSpace(); + if ((AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) && + STI.ldsRequiresM0Init()) { + // If DS instructions require M0 initializtion, insert it before selecting. + BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0) + .addImm(-1); + } +} + +bool AMDGPUInstructionSelector::selectG_LOAD_ATOMICRMW(MachineInstr &I) const { + initM0(I); + return selectImpl(I, *CoverageInfo); } bool AMDGPUInstructionSelector::selectG_BRCOND(MachineInstr &I) const { MachineBasicBlock *BB = I.getParent(); - MachineFunction *MF = BB->getParent(); - MachineRegisterInfo &MRI = MF->getRegInfo(); MachineOperand &CondOp = I.getOperand(0); Register CondReg = CondOp.getReg(); const DebugLoc &DL = I.getDebugLoc(); @@ -1263,11 +1568,12 @@ bool AMDGPUInstructionSelector::selectG_BRCOND(MachineInstr &I) const { // GlobalISel, we should push that decision into RegBankSelect. Assume for now // RegBankSelect knows what it's doing if the branch condition is scc, even // though it currently does not. - if (isSCC(CondReg, MRI)) { + if (isSCC(CondReg, *MRI)) { CondPhysReg = AMDGPU::SCC; BrOpcode = AMDGPU::S_CBRANCH_SCC1; - ConstrainRC = &AMDGPU::SReg_32_XM0RegClass; - } else if (isVCC(CondReg, MRI)) { + // FIXME: Hack for isSCC tests + ConstrainRC = &AMDGPU::SGPR_32RegClass; + } else if (isVCC(CondReg, *MRI)) { // FIXME: Do we have to insert an and with exec here, like in SelectionDAG? // We sort of know that a VCC producer based on the register bank, that ands // inactive lanes with 0. What if there was a logical operation with vcc @@ -1279,8 +1585,8 @@ bool AMDGPUInstructionSelector::selectG_BRCOND(MachineInstr &I) const { } else return false; - if (!MRI.getRegClassOrNull(CondReg)) - MRI.setRegClass(CondReg, ConstrainRC); + if (!MRI->getRegClassOrNull(CondReg)) + MRI->setRegClass(CondReg, ConstrainRC); BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CondPhysReg) .addReg(CondReg); @@ -1292,27 +1598,83 @@ bool AMDGPUInstructionSelector::selectG_BRCOND(MachineInstr &I) const { } bool AMDGPUInstructionSelector::selectG_FRAME_INDEX(MachineInstr &I) const { - MachineBasicBlock *BB = I.getParent(); - MachineFunction *MF = BB->getParent(); - MachineRegisterInfo &MRI = MF->getRegInfo(); - Register DstReg = I.getOperand(0).getReg(); - const RegisterBank *DstRB = RBI.getRegBank(DstReg, MRI, TRI); + const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI); const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID; I.setDesc(TII.get(IsVGPR ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32)); if (IsVGPR) I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true)); return RBI.constrainGenericRegister( - DstReg, IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass, MRI); + DstReg, IsVGPR ? 
AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass, *MRI); +} + +bool AMDGPUInstructionSelector::selectG_PTR_MASK(MachineInstr &I) const { + uint64_t Align = I.getOperand(2).getImm(); + const uint64_t Mask = ~((UINT64_C(1) << Align) - 1); + + MachineBasicBlock *BB = I.getParent(); + + Register DstReg = I.getOperand(0).getReg(); + Register SrcReg = I.getOperand(1).getReg(); + + const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI); + const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI); + const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID; + unsigned NewOpc = IsVGPR ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32; + unsigned MovOpc = IsVGPR ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32; + const TargetRegisterClass &RegRC + = IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass; + + LLT Ty = MRI->getType(DstReg); + + const TargetRegisterClass *DstRC = TRI.getRegClassForTypeOnBank(Ty, *DstRB, + *MRI); + const TargetRegisterClass *SrcRC = TRI.getRegClassForTypeOnBank(Ty, *SrcRB, + *MRI); + if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) || + !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI)) + return false; + + const DebugLoc &DL = I.getDebugLoc(); + Register ImmReg = MRI->createVirtualRegister(&RegRC); + BuildMI(*BB, &I, DL, TII.get(MovOpc), ImmReg) + .addImm(Mask); + + if (Ty.getSizeInBits() == 32) { + BuildMI(*BB, &I, DL, TII.get(NewOpc), DstReg) + .addReg(SrcReg) + .addReg(ImmReg); + I.eraseFromParent(); + return true; + } + + Register HiReg = MRI->createVirtualRegister(&RegRC); + Register LoReg = MRI->createVirtualRegister(&RegRC); + Register MaskLo = MRI->createVirtualRegister(&RegRC); + + BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), LoReg) + .addReg(SrcReg, 0, AMDGPU::sub0); + BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), HiReg) + .addReg(SrcReg, 0, AMDGPU::sub1); + + BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskLo) + .addReg(LoReg) + .addReg(ImmReg); + BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg) + .addReg(MaskLo) + .addImm(AMDGPU::sub0) + .addReg(HiReg) + .addImm(AMDGPU::sub1); + I.eraseFromParent(); + return true; } -bool AMDGPUInstructionSelector::select(MachineInstr &I, - CodeGenCoverage &CoverageInfo) const { +bool AMDGPUInstructionSelector::select(MachineInstr &I) { if (I.isPHI()) return selectPHI(I); - if (!isPreISelGenericOpcode(I.getOpcode())) { + if (!I.isPreISelOpcode()) { if (I.isCopy()) return selectCOPY(I); return true; @@ -1324,16 +1686,18 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I, case TargetOpcode::G_XOR: if (selectG_AND_OR_XOR(I)) return true; - return selectImpl(I, CoverageInfo); + return selectImpl(I, *CoverageInfo); case TargetOpcode::G_ADD: case TargetOpcode::G_SUB: - if (selectG_ADD_SUB(I)) + if (selectImpl(I, *CoverageInfo)) return true; - LLVM_FALLTHROUGH; - default: - return selectImpl(I, CoverageInfo); + return selectG_ADD_SUB(I); + case TargetOpcode::G_UADDO: + case TargetOpcode::G_USUBO: + return selectG_UADDO_USUBO(I); case TargetOpcode::G_INTTOPTR: case TargetOpcode::G_BITCAST: + case TargetOpcode::G_PTRTOINT: return selectCOPY(I); case TargetOpcode::G_CONSTANT: case TargetOpcode::G_FCONSTANT: @@ -1353,32 +1717,40 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I, case TargetOpcode::G_INSERT: return selectG_INSERT(I); case TargetOpcode::G_INTRINSIC: - return selectG_INTRINSIC(I, CoverageInfo); + return selectG_INTRINSIC(I); case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS: - return selectG_INTRINSIC_W_SIDE_EFFECTS(I, CoverageInfo); + return selectG_INTRINSIC_W_SIDE_EFFECTS(I); case 
TargetOpcode::G_ICMP: if (selectG_ICMP(I)) return true; - return selectImpl(I, CoverageInfo); + return selectImpl(I, *CoverageInfo); case TargetOpcode::G_LOAD: - return selectImpl(I, CoverageInfo); + case TargetOpcode::G_ATOMIC_CMPXCHG: + case TargetOpcode::G_ATOMICRMW_XCHG: + case TargetOpcode::G_ATOMICRMW_ADD: + case TargetOpcode::G_ATOMICRMW_SUB: + case TargetOpcode::G_ATOMICRMW_AND: + case TargetOpcode::G_ATOMICRMW_OR: + case TargetOpcode::G_ATOMICRMW_XOR: + case TargetOpcode::G_ATOMICRMW_MIN: + case TargetOpcode::G_ATOMICRMW_MAX: + case TargetOpcode::G_ATOMICRMW_UMIN: + case TargetOpcode::G_ATOMICRMW_UMAX: + case TargetOpcode::G_ATOMICRMW_FADD: + return selectG_LOAD_ATOMICRMW(I); case TargetOpcode::G_SELECT: return selectG_SELECT(I); case TargetOpcode::G_STORE: - if (selectImpl(I, CoverageInfo)) - return true; return selectG_STORE(I); case TargetOpcode::G_TRUNC: return selectG_TRUNC(I); case TargetOpcode::G_SEXT: case TargetOpcode::G_ZEXT: case TargetOpcode::G_ANYEXT: - if (selectG_SZA_EXT(I)) { - I.eraseFromParent(); - return true; - } - - return false; + return selectG_SZA_EXT(I); + case TargetOpcode::G_SITOFP: + case TargetOpcode::G_UITOFP: + return selectG_SITOFP_UITOFP(I); case TargetOpcode::G_BRCOND: return selectG_BRCOND(I); case TargetOpcode::G_FRAME_INDEX: @@ -1388,6 +1760,10 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I, // is checking for G_CONSTANT I.setDesc(TII.get(AMDGPU::ATOMIC_FENCE)); return true; + case TargetOpcode::G_PTR_MASK: + return selectG_PTR_MASK(I); + default: + return selectImpl(I, *CoverageInfo); } return false; } @@ -1402,14 +1778,14 @@ AMDGPUInstructionSelector::selectVCSRC(MachineOperand &Root) const { std::pair AMDGPUInstructionSelector::selectVOP3ModsImpl( - Register Src, const MachineRegisterInfo &MRI) const { + Register Src) const { unsigned Mods = 0; - MachineInstr *MI = MRI.getVRegDef(Src); + MachineInstr *MI = MRI->getVRegDef(Src); if (MI && MI->getOpcode() == AMDGPU::G_FNEG) { Src = MI->getOperand(1).getReg(); Mods |= SISrcMods::NEG; - MI = MRI.getVRegDef(Src); + MI = MRI->getVRegDef(Src); } if (MI && MI->getOpcode() == AMDGPU::G_FABS) { @@ -1432,12 +1808,23 @@ AMDGPUInstructionSelector::selectVSRC0(MachineOperand &Root) const { InstructionSelector::ComplexRendererFns AMDGPUInstructionSelector::selectVOP3Mods0(MachineOperand &Root) const { - MachineRegisterInfo &MRI - = Root.getParent()->getParent()->getParent()->getRegInfo(); + Register Src; + unsigned Mods; + std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg()); + + return {{ + [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods + [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp + [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod + }}; +} +InstructionSelector::ComplexRendererFns +AMDGPUInstructionSelector::selectVOP3Mods0Clamp0OMod(MachineOperand &Root) const { Register Src; unsigned Mods; - std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg(), MRI); + std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg()); return {{ [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }, @@ -1446,6 +1833,7 @@ AMDGPUInstructionSelector::selectVOP3Mods0(MachineOperand &Root) const { [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod }}; } + InstructionSelector::ComplexRendererFns AMDGPUInstructionSelector::selectVOP3OMods(MachineOperand &Root) const { return {{ @@ -1457,12 +1845,9 @@ AMDGPUInstructionSelector::selectVOP3OMods(MachineOperand &Root) const { InstructionSelector::ComplexRendererFns 
AMDGPUInstructionSelector::selectVOP3Mods(MachineOperand &Root) const { - MachineRegisterInfo &MRI - = Root.getParent()->getParent()->getParent()->getRegInfo(); - Register Src; unsigned Mods; - std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg(), MRI); + std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg()); return {{ [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }, @@ -1471,12 +1856,28 @@ AMDGPUInstructionSelector::selectVOP3Mods(MachineOperand &Root) const { } InstructionSelector::ComplexRendererFns -AMDGPUInstructionSelector::selectSmrdImm(MachineOperand &Root) const { - MachineRegisterInfo &MRI = - Root.getParent()->getParent()->getParent()->getRegInfo(); +AMDGPUInstructionSelector::selectVOP3OpSelMods0(MachineOperand &Root) const { + // FIXME: Handle clamp and op_sel + return {{ + [=](MachineInstrBuilder &MIB) { MIB.addReg(Root.getReg()); }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // src_mods + [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // clamp + }}; +} +InstructionSelector::ComplexRendererFns +AMDGPUInstructionSelector::selectVOP3OpSelMods(MachineOperand &Root) const { + // FIXME: Handle op_sel + return {{ + [=](MachineInstrBuilder &MIB) { MIB.addReg(Root.getReg()); }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // src_mods + }}; +} + +InstructionSelector::ComplexRendererFns +AMDGPUInstructionSelector::selectSmrdImm(MachineOperand &Root) const { SmallVector AddrInfo; - getAddrModeInfo(*Root.getParent(), MRI, AddrInfo); + getAddrModeInfo(*Root.getParent(), *MRI, AddrInfo); if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1) return None; @@ -1496,11 +1897,8 @@ AMDGPUInstructionSelector::selectSmrdImm(MachineOperand &Root) const { InstructionSelector::ComplexRendererFns AMDGPUInstructionSelector::selectSmrdImm32(MachineOperand &Root) const { - MachineRegisterInfo &MRI = - Root.getParent()->getParent()->getParent()->getRegInfo(); - SmallVector AddrInfo; - getAddrModeInfo(*Root.getParent(), MRI, AddrInfo); + getAddrModeInfo(*Root.getParent(), *MRI, AddrInfo); if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1) return None; @@ -1521,10 +1919,9 @@ InstructionSelector::ComplexRendererFns AMDGPUInstructionSelector::selectSmrdSgpr(MachineOperand &Root) const { MachineInstr *MI = Root.getParent(); MachineBasicBlock *MBB = MI->getParent(); - MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); SmallVector AddrInfo; - getAddrModeInfo(*MI, MRI, AddrInfo); + getAddrModeInfo(*MI, *MRI, AddrInfo); // FIXME: We should shrink the GEP if the offset is known to be <= 32-bits, // then we can select all ptr + 32-bit offsets not just immediate offsets. @@ -1540,7 +1937,7 @@ AMDGPUInstructionSelector::selectSmrdSgpr(MachineOperand &Root) const { // failed trying to select this load into one of the _IMM variants since // the _IMM Patterns are considered before the _SGPR patterns. 
unsigned PtrReg = GEPInfo.SgprParts[0]; - unsigned OffsetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); + Register OffsetReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), OffsetReg) .addImm(GEPInfo.Imm); return {{ @@ -1553,8 +1950,6 @@ template InstructionSelector::ComplexRendererFns AMDGPUInstructionSelector::selectFlatOffsetImpl(MachineOperand &Root) const { MachineInstr *MI = Root.getParent(); - MachineBasicBlock *MBB = MI->getParent(); - MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); InstructionSelector::ComplexRendererFns Default = {{ [=](MachineInstrBuilder &MIB) { MIB.addReg(Root.getReg()); }, @@ -1565,12 +1960,12 @@ AMDGPUInstructionSelector::selectFlatOffsetImpl(MachineOperand &Root) const { if (!STI.hasFlatInstOffsets()) return Default; - const MachineInstr *OpDef = MRI.getVRegDef(Root.getReg()); + const MachineInstr *OpDef = MRI->getVRegDef(Root.getReg()); if (!OpDef || OpDef->getOpcode() != AMDGPU::G_GEP) return Default; Optional Offset = - getConstantVRegVal(OpDef->getOperand(2).getReg(), MRI); + getConstantVRegVal(OpDef->getOperand(2).getReg(), *MRI); if (!Offset.hasValue()) return Default; @@ -1597,12 +1992,6 @@ AMDGPUInstructionSelector::selectFlatOffsetSigned(MachineOperand &Root) const { return selectFlatOffsetImpl(Root); } -// FIXME: Implement -static bool signBitIsZero(const MachineOperand &Op, - const MachineRegisterInfo &MRI) { - return false; -} - static bool isStackPtrRelative(const MachinePointerInfo &PtrInfo) { auto PSV = PtrInfo.V.dyn_cast(); return PSV && PSV->isStack(); @@ -1613,12 +2002,11 @@ AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const { MachineInstr *MI = Root.getParent(); MachineBasicBlock *MBB = MI->getParent(); MachineFunction *MF = MBB->getParent(); - MachineRegisterInfo &MRI = MF->getRegInfo(); const SIMachineFunctionInfo *Info = MF->getInfo(); int64_t Offset = 0; - if (mi_match(Root.getReg(), MRI, m_ICst(Offset))) { - Register HighBits = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + if (mi_match(Root.getReg(), *MRI, m_ICst(Offset))) { + Register HighBits = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); // TODO: Should this be inside the render function? The iterator seems to // move. @@ -1652,18 +2040,18 @@ AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const { // offsets. 
Optional FI; Register VAddr = Root.getReg(); - if (const MachineInstr *RootDef = MRI.getVRegDef(Root.getReg())) { - if (isBaseWithConstantOffset(Root, MRI)) { + if (const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg())) { + if (isBaseWithConstantOffset(Root, *MRI)) { const MachineOperand &LHS = RootDef->getOperand(1); const MachineOperand &RHS = RootDef->getOperand(2); - const MachineInstr *LHSDef = MRI.getVRegDef(LHS.getReg()); - const MachineInstr *RHSDef = MRI.getVRegDef(RHS.getReg()); + const MachineInstr *LHSDef = MRI->getVRegDef(LHS.getReg()); + const MachineInstr *RHSDef = MRI->getVRegDef(RHS.getReg()); if (LHSDef && RHSDef) { int64_t PossibleOffset = RHSDef->getOperand(1).getCImm()->getSExtValue(); if (SIInstrInfo::isLegalMUBUFImmOffset(PossibleOffset) && (!STI.privateMemoryResourceIsRangeChecked() || - signBitIsZero(LHS, MRI))) { + KnownBits->signBitIsZero(LHS.getReg()))) { if (LHSDef->getOpcode() == AMDGPU::G_FRAME_INDEX) FI = LHSDef->getOperand(1).getIndex(); else @@ -1700,15 +2088,30 @@ AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const { }}}; } +bool AMDGPUInstructionSelector::isDSOffsetLegal(const MachineRegisterInfo &MRI, + const MachineOperand &Base, + int64_t Offset, + unsigned OffsetBits) const { + if ((OffsetBits == 16 && !isUInt<16>(Offset)) || + (OffsetBits == 8 && !isUInt<8>(Offset))) + return false; + + if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled()) + return true; + + // On Southern Islands instruction with a negative base value and an offset + // don't seem to work. + return KnownBits->signBitIsZero(Base.getReg()); +} + InstructionSelector::ComplexRendererFns AMDGPUInstructionSelector::selectMUBUFScratchOffset( MachineOperand &Root) const { MachineInstr *MI = Root.getParent(); MachineBasicBlock *MBB = MI->getParent(); - MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); int64_t Offset = 0; - if (!mi_match(Root.getReg(), MRI, m_ICst(Offset)) || + if (!mi_match(Root.getReg(), *MRI, m_ICst(Offset)) || !SIInstrInfo::isLegalMUBUFImmOffset(Offset)) return {}; @@ -1728,3 +2131,54 @@ AMDGPUInstructionSelector::selectMUBUFScratchOffset( [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset }}; } + +InstructionSelector::ComplexRendererFns +AMDGPUInstructionSelector::selectDS1Addr1Offset(MachineOperand &Root) const { + const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg()); + if (!RootDef) { + return {{ + [=](MachineInstrBuilder &MIB) { MIB.add(Root); }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } + }}; + } + + int64_t ConstAddr = 0; + if (isBaseWithConstantOffset(Root, *MRI)) { + const MachineOperand &LHS = RootDef->getOperand(1); + const MachineOperand &RHS = RootDef->getOperand(2); + const MachineInstr *LHSDef = MRI->getVRegDef(LHS.getReg()); + const MachineInstr *RHSDef = MRI->getVRegDef(RHS.getReg()); + if (LHSDef && RHSDef) { + int64_t PossibleOffset = + RHSDef->getOperand(1).getCImm()->getSExtValue(); + if (isDSOffsetLegal(*MRI, LHS, PossibleOffset, 16)) { + // (add n0, c0) + return {{ + [=](MachineInstrBuilder &MIB) { MIB.add(LHS); }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(PossibleOffset); } + }}; + } + } + } else if (RootDef->getOpcode() == AMDGPU::G_SUB) { + + + + } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) { + + + } + + return {{ + [=](MachineInstrBuilder &MIB) { MIB.add(Root); }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } + }}; +} + +void AMDGPUInstructionSelector::renderTruncImm32(MachineInstrBuilder &MIB, + const MachineInstr &MI) const { + const 
MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); + assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && "Expected G_CONSTANT"); + Optional CstVal = getConstantVRegVal(MI.getOperand(0).getReg(), MRI); + assert(CstVal && "Expected constant value"); + MIB.addImm(CstVal.getValue()); +} diff --git a/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/lib/Target/AMDGPU/AMDGPUInstructionSelector.h index 4f489ddfb23d..d3c83a6a872a 100644 --- a/lib/Target/AMDGPU/AMDGPUInstructionSelector.h +++ b/lib/Target/AMDGPU/AMDGPUInstructionSelector.h @@ -35,6 +35,7 @@ class AMDGPUInstrInfo; class AMDGPURegisterBankInfo; class GCNSubtarget; class MachineInstr; +class MachineIRBuilder; class MachineOperand; class MachineRegisterInfo; class SIInstrInfo; @@ -42,14 +43,20 @@ class SIMachineFunctionInfo; class SIRegisterInfo; class AMDGPUInstructionSelector : public InstructionSelector { +private: + MachineRegisterInfo *MRI; + public: AMDGPUInstructionSelector(const GCNSubtarget &STI, const AMDGPURegisterBankInfo &RBI, const AMDGPUTargetMachine &TM); - bool select(MachineInstr &I, CodeGenCoverage &CoverageInfo) const override; + bool select(MachineInstr &I) override; static const char *getName(); + void setupMF(MachineFunction &MF, GISelKnownBits &KB, + CodeGenCoverage &CoverageInfo) override; + private: struct GEPInfo { const MachineInstr &GEP; @@ -72,32 +79,42 @@ private: bool selectPHI(MachineInstr &I) const; bool selectG_TRUNC(MachineInstr &I) const; bool selectG_SZA_EXT(MachineInstr &I) const; + bool selectG_SITOFP_UITOFP(MachineInstr &I) const; bool selectG_CONSTANT(MachineInstr &I) const; bool selectG_AND_OR_XOR(MachineInstr &I) const; bool selectG_ADD_SUB(MachineInstr &I) const; + bool selectG_UADDO_USUBO(MachineInstr &I) const; bool selectG_EXTRACT(MachineInstr &I) const; bool selectG_MERGE_VALUES(MachineInstr &I) const; bool selectG_UNMERGE_VALUES(MachineInstr &I) const; bool selectG_GEP(MachineInstr &I) const; bool selectG_IMPLICIT_DEF(MachineInstr &I) const; bool selectG_INSERT(MachineInstr &I) const; - bool selectG_INTRINSIC(MachineInstr &I, CodeGenCoverage &CoverageInfo) const; - bool selectG_INTRINSIC_W_SIDE_EFFECTS(MachineInstr &I, - CodeGenCoverage &CoverageInfo) const; + bool selectG_INTRINSIC(MachineInstr &I) const; + + std::tuple + splitBufferOffsets(MachineIRBuilder &B, Register OrigOffset) const; + + bool selectStoreIntrinsic(MachineInstr &MI, bool IsFormat) const; + + bool selectG_INTRINSIC_W_SIDE_EFFECTS(MachineInstr &I) const; int getS_CMPOpcode(CmpInst::Predicate P, unsigned Size) const; bool selectG_ICMP(MachineInstr &I) const; bool hasVgprParts(ArrayRef AddrInfo) const; void getAddrModeInfo(const MachineInstr &Load, const MachineRegisterInfo &MRI, SmallVectorImpl &AddrInfo) const; bool selectSMRD(MachineInstr &I, ArrayRef AddrInfo) const; - bool selectG_LOAD(MachineInstr &I) const; - bool selectG_SELECT(MachineInstr &I) const; + + void initM0(MachineInstr &I) const; + bool selectG_LOAD_ATOMICRMW(MachineInstr &I) const; bool selectG_STORE(MachineInstr &I) const; + bool selectG_SELECT(MachineInstr &I) const; bool selectG_BRCOND(MachineInstr &I) const; bool selectG_FRAME_INDEX(MachineInstr &I) const; + bool selectG_PTR_MASK(MachineInstr &I) const; std::pair - selectVOP3ModsImpl(Register Src, const MachineRegisterInfo &MRI) const; + selectVOP3ModsImpl(Register Src) const; InstructionSelector::ComplexRendererFns selectVCSRC(MachineOperand &Root) const; @@ -108,10 +125,17 @@ private: InstructionSelector::ComplexRendererFns selectVOP3Mods0(MachineOperand &Root) const; 
InstructionSelector::ComplexRendererFns + selectVOP3Mods0Clamp0OMod(MachineOperand &Root) const; + InstructionSelector::ComplexRendererFns selectVOP3OMods(MachineOperand &Root) const; InstructionSelector::ComplexRendererFns selectVOP3Mods(MachineOperand &Root) const; + InstructionSelector::ComplexRendererFns + selectVOP3OpSelMods0(MachineOperand &Root) const; + InstructionSelector::ComplexRendererFns + selectVOP3OpSelMods(MachineOperand &Root) const; + InstructionSelector::ComplexRendererFns selectSmrdImm(MachineOperand &Root) const; InstructionSelector::ComplexRendererFns @@ -133,6 +157,16 @@ private: InstructionSelector::ComplexRendererFns selectMUBUFScratchOffset(MachineOperand &Root) const; + bool isDSOffsetLegal(const MachineRegisterInfo &MRI, + const MachineOperand &Base, + int64_t Offset, unsigned OffsetBits) const; + + InstructionSelector::ComplexRendererFns + selectDS1Addr1Offset(MachineOperand &Root) const; + + void renderTruncImm32(MachineInstrBuilder &MIB, + const MachineInstr &MI) const; + const SIInstrInfo &TII; const SIRegisterInfo &TRI; const AMDGPURegisterBankInfo &RBI; diff --git a/lib/Target/AMDGPU/AMDGPUInstructions.td b/lib/Target/AMDGPU/AMDGPUInstructions.td index 61bc415c839d..846e7f577a28 100644 --- a/lib/Target/AMDGPU/AMDGPUInstructions.td +++ b/lib/Target/AMDGPU/AMDGPUInstructions.td @@ -75,7 +75,7 @@ class ILFormat pattern> let isCodeGenOnly = 1; } -def TruePredicate : Predicate<"true">; +def TruePredicate : Predicate<"">; class PredicateControl { Predicate SubtargetPredicate = TruePredicate; @@ -220,80 +220,48 @@ def hi_f16_elt : PatLeaf< // PatLeafs for floating-point comparisons //===----------------------------------------------------------------------===// -def COND_OEQ : PatLeaf < - (cond), - [{return N->get() == ISD::SETOEQ || N->get() == ISD::SETEQ;}] ->; - -def COND_ONE : PatLeaf < - (cond), - [{return N->get() == ISD::SETONE || N->get() == ISD::SETNE;}] ->; - -def COND_OGT : PatLeaf < - (cond), - [{return N->get() == ISD::SETOGT || N->get() == ISD::SETGT;}] ->; - -def COND_OGE : PatLeaf < - (cond), - [{return N->get() == ISD::SETOGE || N->get() == ISD::SETGE;}] ->; - -def COND_OLT : PatLeaf < - (cond), - [{return N->get() == ISD::SETOLT || N->get() == ISD::SETLT;}] ->; - -def COND_OLE : PatLeaf < - (cond), - [{return N->get() == ISD::SETOLE || N->get() == ISD::SETLE;}] ->; - -def COND_O : PatLeaf <(cond), [{return N->get() == ISD::SETO;}]>; -def COND_UO : PatLeaf <(cond), [{return N->get() == ISD::SETUO;}]>; +def COND_OEQ : PatFrags<(ops), [(OtherVT SETOEQ), (OtherVT SETEQ)]>; +def COND_ONE : PatFrags<(ops), [(OtherVT SETONE), (OtherVT SETNE)]>; +def COND_OGT : PatFrags<(ops), [(OtherVT SETOGT), (OtherVT SETGT)]>; +def COND_OGE : PatFrags<(ops), [(OtherVT SETOGE), (OtherVT SETGE)]>; +def COND_OLT : PatFrags<(ops), [(OtherVT SETOLT), (OtherVT SETLT)]>; +def COND_OLE : PatFrags<(ops), [(OtherVT SETOLE), (OtherVT SETLE)]>; +def COND_O : PatFrags<(ops), [(OtherVT SETO)]>; +def COND_UO : PatFrags<(ops), [(OtherVT SETUO)]>; //===----------------------------------------------------------------------===// // PatLeafs for unsigned / unordered comparisons //===----------------------------------------------------------------------===// -def COND_UEQ : PatLeaf <(cond), [{return N->get() == ISD::SETUEQ;}]>; -def COND_UNE : PatLeaf <(cond), [{return N->get() == ISD::SETUNE;}]>; -def COND_UGT : PatLeaf <(cond), [{return N->get() == ISD::SETUGT;}]>; -def COND_UGE : PatLeaf <(cond), [{return N->get() == ISD::SETUGE;}]>; -def COND_ULT : PatLeaf <(cond), [{return N->get() 
== ISD::SETULT;}]>; -def COND_ULE : PatLeaf <(cond), [{return N->get() == ISD::SETULE;}]>; +def COND_UEQ : PatFrag<(ops), (OtherVT SETUEQ)>; +def COND_UNE : PatFrag<(ops), (OtherVT SETUNE)>; +def COND_UGT : PatFrag<(ops), (OtherVT SETUGT)>; +def COND_UGE : PatFrag<(ops), (OtherVT SETUGE)>; +def COND_ULT : PatFrag<(ops), (OtherVT SETULT)>; +def COND_ULE : PatFrag<(ops), (OtherVT SETULE)>; // XXX - For some reason R600 version is preferring to use unordered // for setne? -def COND_UNE_NE : PatLeaf < - (cond), - [{return N->get() == ISD::SETUNE || N->get() == ISD::SETNE;}] ->; +def COND_UNE_NE : PatFrags<(ops), [(OtherVT SETUNE), (OtherVT SETNE)]>; //===----------------------------------------------------------------------===// // PatLeafs for signed comparisons //===----------------------------------------------------------------------===// -def COND_SGT : PatLeaf <(cond), [{return N->get() == ISD::SETGT;}]>; -def COND_SGE : PatLeaf <(cond), [{return N->get() == ISD::SETGE;}]>; -def COND_SLT : PatLeaf <(cond), [{return N->get() == ISD::SETLT;}]>; -def COND_SLE : PatLeaf <(cond), [{return N->get() == ISD::SETLE;}]>; +def COND_SGT : PatFrag<(ops), (OtherVT SETGT)>; +def COND_SGE : PatFrag<(ops), (OtherVT SETGE)>; +def COND_SLT : PatFrag<(ops), (OtherVT SETLT)>; +def COND_SLE : PatFrag<(ops), (OtherVT SETLE)>; //===----------------------------------------------------------------------===// // PatLeafs for integer equality //===----------------------------------------------------------------------===// -def COND_EQ : PatLeaf < - (cond), - [{return N->get() == ISD::SETEQ || N->get() == ISD::SETUEQ;}] ->; - -def COND_NE : PatLeaf < - (cond), - [{return N->get() == ISD::SETNE || N->get() == ISD::SETUNE;}] ->; +def COND_EQ : PatFrags<(ops), [(OtherVT SETEQ), (OtherVT SETUEQ)]>; +def COND_NE : PatFrags<(ops), [(OtherVT SETNE), (OtherVT SETUNE)]>; +// FIXME: Should not need code predicate +//def COND_NULL : PatLeaf<(OtherVT null_frag)>; def COND_NULL : PatLeaf < (cond), [{(void)N; return false;}] @@ -335,17 +303,17 @@ def TEX_SHADOW_ARRAY : PatLeaf< // Load/Store Pattern Fragments //===----------------------------------------------------------------------===// +def atomic_cmp_swap_glue : SDNode <"ISD::ATOMIC_CMP_SWAP", SDTAtomic3, + [SDNPHasChain, SDNPMayStore, SDNPMayLoad, SDNPMemOperand, SDNPInGlue] +>; + class AddressSpaceList AS> { list AddrSpaces = AS; } -class Aligned8Bytes : PatFrag (N)->getAlignment() % 8 == 0; -}]>; - -class Aligned16Bytes : PatFrag (N)->getAlignment() >= 16; -}]>; +class Aligned { + int MinAlignment = Bytes; +} class LoadFrag : PatFrag<(ops node:$ptr), (op node:$ptr)>; @@ -502,6 +470,35 @@ defm atomic_store_#as : binary_atomic_op; } // End foreach AddrSpace +multiclass ret_noret_binary_atomic_op { + foreach as = [ "global", "flat", "constant", "local", "private", "region" ] in { + let AddressSpaces = !cast("LoadAddress_"#as).AddrSpaces in { + defm "_"#as : binary_atomic_op; + + let PredicateCode = [{return (SDValue(N, 0).use_empty());}] in { + defm "_"#as#"_noret" : binary_atomic_op; + } + + let PredicateCode = [{return !(SDValue(N, 0).use_empty());}] in { + defm "_"#as#"_ret" : binary_atomic_op; + } + } + } +} + +defm atomic_swap : ret_noret_binary_atomic_op; +defm atomic_load_add : ret_noret_binary_atomic_op; +defm atomic_load_and : ret_noret_binary_atomic_op; +defm atomic_load_max : ret_noret_binary_atomic_op; +defm atomic_load_min : ret_noret_binary_atomic_op; +defm atomic_load_or : ret_noret_binary_atomic_op; +defm atomic_load_sub : ret_noret_binary_atomic_op; +defm 
atomic_load_umax : ret_noret_binary_atomic_op; +defm atomic_load_umin : ret_noret_binary_atomic_op; +defm atomic_load_xor : ret_noret_binary_atomic_op; +defm atomic_load_fadd : ret_noret_binary_atomic_op; + + def store_hi16_private : StoreHi16 , PrivateAddress; def truncstorei8_hi16_private : StoreHi16, PrivateAddress; @@ -513,21 +510,31 @@ def store_local_hi16 : StoreHi16 , LocalAddress; def truncstorei8_local_hi16 : StoreHi16, LocalAddress; def atomic_store_local : LocalStore ; -def load_align8_local : Aligned8Bytes < - (ops node:$ptr), (load_local node:$ptr) ->; -def load_align16_local : Aligned16Bytes < - (ops node:$ptr), (load_local node:$ptr) ->; +def load_align8_local : PatFrag <(ops node:$ptr), (load_local node:$ptr)> { + let IsLoad = 1; + let IsNonExtLoad = 1; + let MinAlignment = 8; +} -def store_align8_local : Aligned8Bytes < - (ops node:$val, node:$ptr), (store_local node:$val, node:$ptr) ->; +def load_align16_local : PatFrag <(ops node:$ptr), (load_local node:$ptr)> { + let IsLoad = 1; + let IsNonExtLoad = 1; + let MinAlignment = 16; +} + +def store_align8_local: PatFrag<(ops node:$val, node:$ptr), + (store_local node:$val, node:$ptr)>, Aligned<8> { + let IsStore = 1; + let IsTruncStore = 0; +} + +def store_align16_local: PatFrag<(ops node:$val, node:$ptr), + (store_local node:$val, node:$ptr)>, Aligned<16> { + let IsStore = 1; + let IsTruncStore = 0; +} -def store_align16_local : Aligned16Bytes < - (ops node:$val, node:$ptr), (store_local node:$val, node:$ptr) ->; def atomic_store_flat : FlatStore ; def truncstorei8_hi16_flat : StoreHi16, FlatStoreAddress; @@ -547,69 +554,26 @@ class region_binary_atomic_op : }]>; -def atomic_swap_local : local_binary_atomic_op; -def atomic_load_add_local : local_binary_atomic_op; -def atomic_load_sub_local : local_binary_atomic_op; -def atomic_load_and_local : local_binary_atomic_op; -def atomic_load_or_local : local_binary_atomic_op; -def atomic_load_xor_local : local_binary_atomic_op; -def atomic_load_nand_local : local_binary_atomic_op; -def atomic_load_min_local : local_binary_atomic_op; -def atomic_load_max_local : local_binary_atomic_op; -def atomic_load_umin_local : local_binary_atomic_op; -def atomic_load_umax_local : local_binary_atomic_op; - def mskor_global : PatFrag<(ops node:$val, node:$ptr), (AMDGPUstore_mskor node:$val, node:$ptr), [{ return cast(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS; }]>; -class AtomicCmpSwapLocal : PatFrag< - (ops node:$ptr, node:$cmp, node:$swap), - (cmp_swap_node node:$ptr, node:$cmp, node:$swap), [{ - AtomicSDNode *AN = cast(N); - return AN->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS; -}]>; - -class AtomicCmpSwapRegion : PatFrag< - (ops node:$ptr, node:$cmp, node:$swap), - (cmp_swap_node node:$ptr, node:$cmp, node:$swap), [{ - AtomicSDNode *AN = cast(N); - return AN->getAddressSpace() == AMDGPUAS::REGION_ADDRESS; -}]>; +let AddressSpaces = StoreAddress_local.AddrSpaces in { +defm atomic_cmp_swap_local : ternary_atomic_op; +defm atomic_cmp_swap_local_m0 : ternary_atomic_op; +} -def atomic_cmp_swap_local : AtomicCmpSwapLocal ; +let AddressSpaces = StoreAddress_region.AddrSpaces in { +defm atomic_cmp_swap_region : ternary_atomic_op; +defm atomic_cmp_swap_region_m0 : ternary_atomic_op; +} class global_binary_atomic_op_frag : PatFrag< (ops node:$ptr, node:$value), (atomic_op node:$ptr, node:$value), [{return cast(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS;}]>; -multiclass global_binary_atomic_op { - def "" : global_binary_atomic_op_frag; - - def _noret : PatFrag< - (ops node:$ptr, 
node:$value), - (atomic_op node:$ptr, node:$value), - [{return cast(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS && (SDValue(N, 0).use_empty());}]>; - - def _ret : PatFrag< - (ops node:$ptr, node:$value), - (atomic_op node:$ptr, node:$value), - [{return cast(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS && (!SDValue(N, 0).use_empty());}]>; -} - -defm atomic_swap_global : global_binary_atomic_op; -defm atomic_add_global : global_binary_atomic_op; -defm atomic_and_global : global_binary_atomic_op; -defm atomic_max_global : global_binary_atomic_op; -defm atomic_min_global : global_binary_atomic_op; -defm atomic_or_global : global_binary_atomic_op; -defm atomic_sub_global : global_binary_atomic_op; -defm atomic_umax_global : global_binary_atomic_op; -defm atomic_umin_global : global_binary_atomic_op; -defm atomic_xor_global : global_binary_atomic_op; - // Legacy. def AMDGPUatomic_cmp_swap_global : PatFrag< (ops node:$ptr, node:$value), diff --git a/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index 670f6225fbf7..5aba35a19ced 100644 --- a/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -11,6 +11,13 @@ /// \todo This should be generated by TableGen. //===----------------------------------------------------------------------===// +#if defined(_MSC_VER) || defined(__MINGW32__) +// According to Microsoft, one must set _USE_MATH_DEFINES in order to get M_PI +// from the Visual C++ cmath / math.h headers: +// https://docs.microsoft.com/en-us/cpp/c-runtime-library/math-constants?view=vs-2019 +#define _USE_MATH_DEFINES +#endif + #include "AMDGPU.h" #include "AMDGPULegalizerInfo.h" #include "AMDGPUTargetMachine.h" @@ -20,6 +27,7 @@ #include "llvm/CodeGen/TargetOpcodes.h" #include "llvm/CodeGen/ValueTypes.h" #include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/Type.h" #include "llvm/Support/Debug.h" @@ -32,7 +40,7 @@ using namespace LegalityPredicates; static LegalityPredicate isMultiple32(unsigned TypeIdx, - unsigned MaxSize = 512) { + unsigned MaxSize = 1024) { return [=](const LegalityQuery &Query) { const LLT Ty = Query.Types[TypeIdx]; const LLT EltTy = Ty.getScalarType(); @@ -40,12 +48,27 @@ static LegalityPredicate isMultiple32(unsigned TypeIdx, }; } +static LegalityPredicate sizeIs(unsigned TypeIdx, unsigned Size) { + return [=](const LegalityQuery &Query) { + return Query.Types[TypeIdx].getSizeInBits() == Size; + }; +} + static LegalityPredicate isSmallOddVector(unsigned TypeIdx) { return [=](const LegalityQuery &Query) { const LLT Ty = Query.Types[TypeIdx]; return Ty.isVector() && Ty.getNumElements() % 2 != 0 && - Ty.getElementType().getSizeInBits() < 32; + Ty.getElementType().getSizeInBits() < 32 && + Ty.getSizeInBits() % 32 != 0; + }; +} + +static LegalityPredicate isWideVec16(unsigned TypeIdx) { + return [=](const LegalityQuery &Query) { + const LLT Ty = Query.Types[TypeIdx]; + const LLT EltTy = Ty.getScalarType(); + return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2; }; } @@ -68,6 +91,31 @@ static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) { }; } +// Increase the number of vector elements to reach the next multiple of 32-bit +// type. 
+static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) { + return [=](const LegalityQuery &Query) { + const LLT Ty = Query.Types[TypeIdx]; + + const LLT EltTy = Ty.getElementType(); + const int Size = Ty.getSizeInBits(); + const int EltSize = EltTy.getSizeInBits(); + const int NextMul32 = (Size + 31) / 32; + + assert(EltSize < 32); + + const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize; + return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy)); + }; +} + +static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) { + return [=](const LegalityQuery &Query) { + const LLT QueryTy = Query.Types[TypeIdx]; + return QueryTy.isVector() && QueryTy.getSizeInBits() < Size; + }; +} + static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) { return [=](const LegalityQuery &Query) { const LLT QueryTy = Query.Types[TypeIdx]; @@ -82,7 +130,7 @@ static LegalityPredicate numElementsNotEven(unsigned TypeIdx) { }; } -// Any combination of 32 or 64-bit elements up to 512 bits, and multiples of +// Any combination of 32 or 64-bit elements up to 1024 bits, and multiples of // v2s16. static LegalityPredicate isRegisterType(unsigned TypeIdx) { return [=](const LegalityQuery &Query) { @@ -94,7 +142,21 @@ static LegalityPredicate isRegisterType(unsigned TypeIdx) { EltSize == 128 || EltSize == 256; } - return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 512; + return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 1024; + }; +} + +static LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT Type) { + return [=](const LegalityQuery &Query) { + return Query.Types[TypeIdx].getElementType() == Type; + }; +} + +static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) { + return [=](const LegalityQuery &Query) { + const LLT Ty = Query.Types[TypeIdx]; + return !Ty.isVector() && Ty.getSizeInBits() > 32 && + Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits(); }; } @@ -112,9 +174,10 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, const LLT S16 = LLT::scalar(16); const LLT S32 = LLT::scalar(32); const LLT S64 = LLT::scalar(64); + const LLT S96 = LLT::scalar(96); const LLT S128 = LLT::scalar(128); const LLT S256 = LLT::scalar(256); - const LLT S512 = LLT::scalar(512); + const LLT S1024 = LLT::scalar(1024); const LLT V2S16 = LLT::vector(2, 16); const LLT V4S16 = LLT::vector(4, 16); @@ -134,6 +197,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, const LLT V14S32 = LLT::vector(14, 32); const LLT V15S32 = LLT::vector(15, 32); const LLT V16S32 = LLT::vector(16, 32); + const LLT V32S32 = LLT::vector(32, 32); const LLT V2S64 = LLT::vector(2, 64); const LLT V3S64 = LLT::vector(3, 64); @@ -142,16 +206,19 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, const LLT V6S64 = LLT::vector(6, 64); const LLT V7S64 = LLT::vector(7, 64); const LLT V8S64 = LLT::vector(8, 64); + const LLT V16S64 = LLT::vector(16, 64); std::initializer_list AllS32Vectors = {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32, - V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32}; + V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32}; std::initializer_list AllS64Vectors = - {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64}; + {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64}; const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS); const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS); + const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT); 
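// A quick standalone check, not part of the patch, of the arithmetic in the
// moreEltsToNext32Bit mutation defined earlier in this hunk: small-element
// vectors are padded out to the next 32-bit multiple, e.g. v3s16 -> v4s16.
static constexpr int newNumElts(int NumElts, int EltSize) {
  const int Size = NumElts * EltSize;
  const int NextMul32 = (Size + 31) / 32;          // total size rounded up to 32-bit units
  return (32 * NextMul32 + EltSize - 1) / EltSize; // elements needed to cover that size
}

static_assert(newNumElts(3, 16) == 4, "v3s16 (48 bits) -> v4s16 (64 bits)");
static_assert(newNumElts(5, 8) == 8, "v5s8 (40 bits) -> v8s8 (64 bits)");
static_assert(newNumElts(3, 8) == 4, "v3s8 (24 bits) -> v4s8 (32 bits)");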
const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS); + const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS); const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS); const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS); @@ -162,7 +229,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, }; const std::initializer_list AddrSpaces32 = { - LocalPtr, PrivatePtr + LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr }; const std::initializer_list FPTypesBase = { @@ -216,37 +283,34 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16}) .clampScalar(0, S32, S64) .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) - .fewerElementsIf(vectorWiderThan(0, 32), fewerEltsToSize64Vector(0)) + .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0)) .widenScalarToNextPow2(0) .scalarize(0); - getActionDefinitionsBuilder({G_UADDO, G_SADDO, G_USUBO, G_SSUBO, + getActionDefinitionsBuilder({G_UADDO, G_USUBO, G_UADDE, G_SADDE, G_USUBE, G_SSUBE}) .legalFor({{S32, S1}}) - .clampScalar(0, S32, S32); + .clampScalar(0, S32, S32) + .scalarize(0); // TODO: Implement. + + getActionDefinitionsBuilder({G_SADDO, G_SSUBO}) + .lower(); getActionDefinitionsBuilder(G_BITCAST) - .legalForCartesianProduct({S32, V2S16}) - .legalForCartesianProduct({S64, V2S32, V4S16}) - .legalForCartesianProduct({V2S64, V4S32}) // Don't worry about the size constraint. - .legalIf(all(isPointer(0), isPointer(1))); + .legalIf(all(isRegisterType(0), isRegisterType(1))) + // FIXME: Testing hack + .legalForCartesianProduct({S16, LLT::vector(2, 8), }); - if (ST.has16BitInsts()) { - getActionDefinitionsBuilder(G_FCONSTANT) - .legalFor({S32, S64, S16}) - .clampScalar(0, S16, S64); - } else { - getActionDefinitionsBuilder(G_FCONSTANT) - .legalFor({S32, S64}) - .clampScalar(0, S32, S64); - } + getActionDefinitionsBuilder(G_FCONSTANT) + .legalFor({S32, S64, S16}) + .clampScalar(0, S16, S64); getActionDefinitionsBuilder(G_IMPLICIT_DEF) - .legalFor({S1, S32, S64, V2S32, V4S32, V2S16, V4S16, GlobalPtr, + .legalFor({S1, S32, S64, S16, V2S32, V4S32, V2S16, V4S16, GlobalPtr, ConstantPtr, LocalPtr, FlatPtr, PrivatePtr}) .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) - .clampScalarOrElt(0, S32, S512) + .clampScalarOrElt(0, S32, S1024) .legalIf(isMultiple32(0)) .widenScalarToNextPow2(0, 32) .clampMaxNumElements(0, S32, 16); @@ -256,23 +320,33 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, // values may not be legal. We need to figure out how to distinguish // between these two scenarios. 
getActionDefinitionsBuilder(G_CONSTANT) - .legalFor({S1, S32, S64, GlobalPtr, + .legalFor({S1, S32, S64, S16, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr }) .clampScalar(0, S32, S64) .widenScalarToNextPow2(0) .legalIf(isPointer(0)); setAction({G_FRAME_INDEX, PrivatePtr}, Legal); + getActionDefinitionsBuilder(G_GLOBAL_VALUE) + .customFor({LocalPtr, GlobalPtr, ConstantPtr, Constant32Ptr}); + auto &FPOpActions = getActionDefinitionsBuilder( - { G_FADD, G_FMUL, G_FNEG, G_FABS, G_FMA, G_FCANONICALIZE}) + { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE}) .legalFor({S32, S64}); + auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS}) + .customFor({S32, S64}); + auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV) + .customFor({S32, S64}); if (ST.has16BitInsts()) { if (ST.hasVOP3PInsts()) FPOpActions.legalFor({S16, V2S16}); else FPOpActions.legalFor({S16}); + + TrigActions.customFor({S16}); + FDIVActions.customFor({S16}); } auto &MinNumMaxNum = getActionDefinitionsBuilder({ @@ -293,22 +367,37 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, .scalarize(0); } - // TODO: Implement - getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}).lower(); - if (ST.hasVOP3PInsts()) FPOpActions.clampMaxNumElements(0, S16, 2); + FPOpActions .scalarize(0) .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); + TrigActions + .scalarize(0) + .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); + + FDIVActions + .scalarize(0) + .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); + + getActionDefinitionsBuilder({G_FNEG, G_FABS}) + .legalFor(FPTypesPK16) + .clampMaxNumElements(0, S16, 2) + .scalarize(0) + .clampScalar(0, S16, S64); + + // TODO: Implement + getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}).lower(); + if (ST.has16BitInsts()) { - getActionDefinitionsBuilder(G_FSQRT) + getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR}) .legalFor({S32, S64, S16}) .scalarize(0) .clampScalar(0, S16, S64); } else { - getActionDefinitionsBuilder(G_FSQRT) + getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR}) .legalFor({S32, S64}) .scalarize(0) .clampScalar(0, S32, S64); @@ -334,23 +423,43 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, .scalarize(0) .clampScalar(0, S32, S64); + // Whether this is legal depends on the floating point mode for the function. + auto &FMad = getActionDefinitionsBuilder(G_FMAD); + if (ST.hasMadF16()) + FMad.customFor({S32, S16}); + else + FMad.customFor({S32}); + FMad.scalarize(0) + .lower(); + getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT}) .legalFor({{S64, S32}, {S32, S16}, {S64, S16}, {S32, S1}, {S64, S1}, {S16, S1}, + {S96, S32}, // FIXME: Hack {S64, LLT::scalar(33)}, {S32, S8}, {S128, S32}, {S128, S64}, {S32, LLT::scalar(24)}}) .scalarize(0); - getActionDefinitionsBuilder({G_SITOFP, G_UITOFP}) - .legalFor({{S32, S32}, {S64, S32}}) + // TODO: Split s1->s64 during regbankselect for VALU. 
+ auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP}) + .legalFor({{S32, S32}, {S64, S32}, {S16, S32}, {S32, S1}, {S16, S1}, {S64, S1}}) .lowerFor({{S32, S64}}) - .customFor({{S64, S64}}) - .scalarize(0); + .customFor({{S64, S64}}); + if (ST.has16BitInsts()) + IToFP.legalFor({{S16, S16}}); + IToFP.clampScalar(1, S32, S64) + .scalarize(0); + + auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI}) + .legalFor({{S32, S32}, {S32, S64}, {S32, S16}}); + if (ST.has16BitInsts()) + FPToI.legalFor({{S16, S16}}); + else + FPToI.minScalar(1, S32); - getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI}) - .legalFor({{S32, S32}, {S32, S64}}) - .scalarize(0); + FPToI.minScalar(0, S32) + .scalarize(0); getActionDefinitionsBuilder(G_INTRINSIC_ROUND) .legalFor({S32, S64}) @@ -374,6 +483,10 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, .legalForCartesianProduct(AddrSpaces32, {S32}) .scalarize(0); + getActionDefinitionsBuilder(G_PTR_MASK) + .scalarize(0) + .alwaysLegal(); + setAction({G_BLOCK_ADDR, CodePtr}, Legal); auto &CmpBuilder = @@ -415,7 +528,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, .widenScalarToNextPow2(1, 32); // TODO: Expand for > s32 - getActionDefinitionsBuilder(G_BSWAP) + getActionDefinitionsBuilder({G_BSWAP, G_BITREVERSE}) .legalFor({S32}) .clampScalar(0, S32, S32) .scalarize(0); @@ -491,87 +604,239 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); }); - if (ST.hasFlatAddressSpace()) { - getActionDefinitionsBuilder(G_ADDRSPACE_CAST) - .scalarize(0) - .custom(); - } + getActionDefinitionsBuilder(G_ADDRSPACE_CAST) + .scalarize(0) + .custom(); // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we // handle some operations by just promoting the register during // selection. There are also d16 loads on GFX9+ which preserve the high bits. - getActionDefinitionsBuilder({G_LOAD, G_STORE}) - .narrowScalarIf([](const LegalityQuery &Query) { - unsigned Size = Query.Types[0].getSizeInBits(); - unsigned MemSize = Query.MMODescrs[0].SizeInBits; - return (Size > 32 && MemSize < Size); - }, - [](const LegalityQuery &Query) { - return std::make_pair(0, LLT::scalar(32)); - }) - .fewerElementsIf([=](const LegalityQuery &Query) { - unsigned MemSize = Query.MMODescrs[0].SizeInBits; - return (MemSize == 96) && - Query.Types[0].isVector() && - !ST.hasDwordx3LoadStores(); - }, - [=](const LegalityQuery &Query) { - return std::make_pair(0, V2S32); - }) - .legalIf([=](const LegalityQuery &Query) { - const LLT &Ty0 = Query.Types[0]; - - unsigned Size = Ty0.getSizeInBits(); - unsigned MemSize = Query.MMODescrs[0].SizeInBits; - if (Size < 32 || (Size > 32 && MemSize < Size)) - return false; - - if (Ty0.isVector() && Size != MemSize) - return false; - - // TODO: Decompose private loads into 4-byte components. - // TODO: Illegal flat loads on SI - switch (MemSize) { - case 8: - case 16: - return Size == 32; - case 32: - case 64: - case 128: - return true; + auto maxSizeForAddrSpace = [this](unsigned AS) -> unsigned { + switch (AS) { + // FIXME: Private element size. + case AMDGPUAS::PRIVATE_ADDRESS: + return 32; + // FIXME: Check subtarget + case AMDGPUAS::LOCAL_ADDRESS: + return ST.useDS128() ? 128 : 64; + + // Treat constant and global as identical. SMRD loads are sometimes usable + // for global loads (ideally constant address space should be eliminated) + // depending on the context. 
Legality cannot be context dependent, but + // RegBankSelect can split the load as necessary depending on the pointer + // register bank/uniformity and if the memory is invariant or not written in + // a kernel. + case AMDGPUAS::CONSTANT_ADDRESS: + case AMDGPUAS::GLOBAL_ADDRESS: + return 512; + default: + return 128; + } + }; - case 96: - return ST.hasDwordx3LoadStores(); - - case 256: - case 512: - // TODO: Possibly support loads of i256 and i512 . This will require - // adding i256 and i512 types to MVT in order for to be able to use - // TableGen. - // TODO: Add support for other vector types, this will require - // defining more value mappings for the new types. - return Ty0.isVector() && (Ty0.getScalarType().getSizeInBits() == 32 || - Ty0.getScalarType().getSizeInBits() == 64); - - default: - return false; - } - }) - .clampScalar(0, S32, S64); + const auto needToSplitLoad = [=](const LegalityQuery &Query) -> bool { + const LLT DstTy = Query.Types[0]; + + // Split vector extloads. + unsigned MemSize = Query.MMODescrs[0].SizeInBits; + if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize) + return true; + + const LLT PtrTy = Query.Types[1]; + unsigned AS = PtrTy.getAddressSpace(); + if (MemSize > maxSizeForAddrSpace(AS)) + return true; + + // Catch weird sized loads that don't evenly divide into the access sizes + // TODO: May be able to widen depending on alignment etc. + unsigned NumRegs = MemSize / 32; + if (NumRegs == 3 && !ST.hasDwordx3LoadStores()) + return true; + + unsigned Align = Query.MMODescrs[0].AlignInBits; + if (Align < MemSize) { + const SITargetLowering *TLI = ST.getTargetLowering(); + return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8); + } + + return false; + }; + unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32; + unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16; + unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8; + + // TODO: Refine based on subtargets which support unaligned access or 128-bit + // LDS + // TODO: Unsupported flat for SI. + + for (unsigned Op : {G_LOAD, G_STORE}) { + const bool IsStore = Op == G_STORE; + + auto &Actions = getActionDefinitionsBuilder(Op); + // Whitelist the common cases. 
+ // TODO: Pointer loads + // TODO: Wide constant loads + // TODO: Only CI+ has 3x loads + // TODO: Loads to s16 on gfx9 + Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32}, + {V2S32, GlobalPtr, 64, GlobalAlign32}, + {V3S32, GlobalPtr, 96, GlobalAlign32}, + {S96, GlobalPtr, 96, GlobalAlign32}, + {V4S32, GlobalPtr, 128, GlobalAlign32}, + {S128, GlobalPtr, 128, GlobalAlign32}, + {S64, GlobalPtr, 64, GlobalAlign32}, + {V2S64, GlobalPtr, 128, GlobalAlign32}, + {V2S16, GlobalPtr, 32, GlobalAlign32}, + {S32, GlobalPtr, 8, GlobalAlign8}, + {S32, GlobalPtr, 16, GlobalAlign16}, + + {S32, LocalPtr, 32, 32}, + {S64, LocalPtr, 64, 32}, + {V2S32, LocalPtr, 64, 32}, + {S32, LocalPtr, 8, 8}, + {S32, LocalPtr, 16, 16}, + {V2S16, LocalPtr, 32, 32}, + + {S32, PrivatePtr, 32, 32}, + {S32, PrivatePtr, 8, 8}, + {S32, PrivatePtr, 16, 16}, + {V2S16, PrivatePtr, 32, 32}, + + {S32, FlatPtr, 32, GlobalAlign32}, + {S32, FlatPtr, 16, GlobalAlign16}, + {S32, FlatPtr, 8, GlobalAlign8}, + {V2S16, FlatPtr, 32, GlobalAlign32}, + + {S32, ConstantPtr, 32, GlobalAlign32}, + {V2S32, ConstantPtr, 64, GlobalAlign32}, + {V3S32, ConstantPtr, 96, GlobalAlign32}, + {V4S32, ConstantPtr, 128, GlobalAlign32}, + {S64, ConstantPtr, 64, GlobalAlign32}, + {S128, ConstantPtr, 128, GlobalAlign32}, + {V2S32, ConstantPtr, 32, GlobalAlign32}}); + Actions + .customIf(typeIs(1, Constant32Ptr)) + .narrowScalarIf( + [=](const LegalityQuery &Query) -> bool { + return !Query.Types[0].isVector() && needToSplitLoad(Query); + }, + [=](const LegalityQuery &Query) -> std::pair { + const LLT DstTy = Query.Types[0]; + const LLT PtrTy = Query.Types[1]; + + const unsigned DstSize = DstTy.getSizeInBits(); + unsigned MemSize = Query.MMODescrs[0].SizeInBits; + + // Split extloads. + if (DstSize > MemSize) + return std::make_pair(0, LLT::scalar(MemSize)); + + if (DstSize > 32 && (DstSize % 32 != 0)) { + // FIXME: Need a way to specify non-extload of larger size if + // suitably aligned. + return std::make_pair(0, LLT::scalar(32 * (DstSize / 32))); + } + + unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace()); + if (MemSize > MaxSize) + return std::make_pair(0, LLT::scalar(MaxSize)); + + unsigned Align = Query.MMODescrs[0].AlignInBits; + return std::make_pair(0, LLT::scalar(Align)); + }) + .fewerElementsIf( + [=](const LegalityQuery &Query) -> bool { + return Query.Types[0].isVector() && needToSplitLoad(Query); + }, + [=](const LegalityQuery &Query) -> std::pair { + const LLT DstTy = Query.Types[0]; + const LLT PtrTy = Query.Types[1]; + + LLT EltTy = DstTy.getElementType(); + unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace()); + + // Split if it's too large for the address space. + if (Query.MMODescrs[0].SizeInBits > MaxSize) { + unsigned NumElts = DstTy.getNumElements(); + unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize; + + // FIXME: Refine when odd breakdowns handled + // The scalars will need to be re-legalized. + if (NumPieces == 1 || NumPieces >= NumElts || + NumElts % NumPieces != 0) + return std::make_pair(0, EltTy); + + return std::make_pair(0, + LLT::vector(NumElts / NumPieces, EltTy)); + } + + // Need to split because of alignment. + unsigned Align = Query.MMODescrs[0].AlignInBits; + unsigned EltSize = EltTy.getSizeInBits(); + if (EltSize > Align && + (EltSize / Align < DstTy.getNumElements())) { + return std::make_pair(0, LLT::vector(EltSize / Align, EltTy)); + } + + // May need relegalization for the scalars. 
+ return std::make_pair(0, EltTy); + }) + .minScalar(0, S32); + + if (IsStore) + Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32)); + + // TODO: Need a bitcast lower option? + Actions + .legalIf([=](const LegalityQuery &Query) { + const LLT Ty0 = Query.Types[0]; + unsigned Size = Ty0.getSizeInBits(); + unsigned MemSize = Query.MMODescrs[0].SizeInBits; + unsigned Align = Query.MMODescrs[0].AlignInBits; + + // No extending vector loads. + if (Size > MemSize && Ty0.isVector()) + return false; + + // FIXME: Widening store from alignment not valid. + if (MemSize < Size) + MemSize = std::max(MemSize, Align); + + switch (MemSize) { + case 8: + case 16: + return Size == 32; + case 32: + case 64: + case 128: + return true; + case 96: + return ST.hasDwordx3LoadStores(); + case 256: + case 512: + return true; + default: + return false; + } + }) + .widenScalarToNextPow2(0) + // TODO: v3s32->v4s32 with alignment + .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0)); + } - // FIXME: Handle alignment requirements. auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD}) - .legalForTypesWithMemDesc({ - {S32, GlobalPtr, 8, 8}, - {S32, GlobalPtr, 16, 8}, - {S32, LocalPtr, 8, 8}, - {S32, LocalPtr, 16, 8}, - {S32, PrivatePtr, 8, 8}, - {S32, PrivatePtr, 16, 8}}); + .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8}, + {S32, GlobalPtr, 16, 2 * 8}, + {S32, LocalPtr, 8, 8}, + {S32, LocalPtr, 16, 16}, + {S32, PrivatePtr, 8, 8}, + {S32, PrivatePtr, 16, 16}, + {S32, ConstantPtr, 8, 8}, + {S32, ConstantPtr, 16, 2 * 8}}); if (ST.hasFlatAddressSpace()) { - ExtLoads.legalForTypesWithMemDesc({{S32, FlatPtr, 8, 8}, - {S32, FlatPtr, 16, 8}}); + ExtLoads.legalForTypesWithMemDesc( + {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}}); } ExtLoads.clampScalar(0, S32, S32) @@ -590,6 +855,12 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}}); } + getActionDefinitionsBuilder(G_ATOMICRMW_FADD) + .legalFor({{S32, LocalPtr}}); + + getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG_WITH_SUCCESS) + .lower(); + // TODO: Pointer types, any 32-bit or 64-bit vector getActionDefinitionsBuilder(G_SELECT) .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, @@ -643,7 +914,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, return (EltTy.getSizeInBits() == 16 || EltTy.getSizeInBits() % 32 == 0) && VecTy.getSizeInBits() % 32 == 0 && - VecTy.getSizeInBits() <= 512 && + VecTy.getSizeInBits() <= 1024 && IdxTy.getSizeInBits() == 32; }) .clampScalar(EltTypeIdx, S32, S64) @@ -663,6 +934,8 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, // FIXME: Doesn't handle extract of illegal sizes. getActionDefinitionsBuilder(Op) + .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32))) + // FIXME: Multiples of 16 should not be legal. 
.legalIf([=](const LegalityQuery &Query) { const LLT BigTy = Query.Types[BigTyIdx]; const LLT LitTy = Query.Types[LitTyIdx]; @@ -686,18 +959,36 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, } - getActionDefinitionsBuilder(G_BUILD_VECTOR) - .legalForCartesianProduct(AllS32Vectors, {S32}) - .legalForCartesianProduct(AllS64Vectors, {S64}) - .clampNumElements(0, V16S32, V16S32) - .clampNumElements(0, V2S64, V8S64) - .minScalarSameAs(1, 0) - .legalIf(isRegisterType(0)) - .minScalarOrElt(0, S32); + auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR) + .legalForCartesianProduct(AllS32Vectors, {S32}) + .legalForCartesianProduct(AllS64Vectors, {S64}) + .clampNumElements(0, V16S32, V32S32) + .clampNumElements(0, V2S64, V16S64) + .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16)); + + if (ST.hasScalarPackInsts()) + BuildVector.legalFor({V2S16, S32}); + + BuildVector + .minScalarSameAs(1, 0) + .legalIf(isRegisterType(0)) + .minScalarOrElt(0, S32); + + if (ST.hasScalarPackInsts()) { + getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC) + .legalFor({V2S16, S32}) + .lower(); + } else { + getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC) + .lower(); + } getActionDefinitionsBuilder(G_CONCAT_VECTORS) .legalIf(isRegisterType(0)); + // TODO: Don't fully scalarize v2s16 pieces + getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower(); + // Merge/Unmerge for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) { unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1; @@ -715,14 +1006,17 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, return false; }; - getActionDefinitionsBuilder(Op) + auto &Builder = getActionDefinitionsBuilder(Op) .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16) // Clamp the little scalar to s8-s256 and make it a power of 2. It's not // worth considering the multiples of 64 since 2*192 and 2*384 are not // valid. .clampScalar(LitTyIdx, S16, S256) .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32) - + .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx)) + .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32), + elementTypeIs(1, S16)), + changeTo(1, V2S16)) // Break up vectors with weird elements into scalars .fewerElementsIf( [=](const LegalityQuery &Query) { return notValidElt(Query, 0); }, @@ -730,25 +1024,37 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, .fewerElementsIf( [=](const LegalityQuery &Query) { return notValidElt(Query, 1); }, scalarize(1)) - .clampScalar(BigTyIdx, S32, S512) - .widenScalarIf( + .clampScalar(BigTyIdx, S32, S1024) + .lowerFor({{S16, V2S16}}); + + if (Op == G_MERGE_VALUES) { + Builder.widenScalarIf( + // TODO: Use 16-bit shifts if legal for 8-bit values? [=](const LegalityQuery &Query) { - const LLT &Ty = Query.Types[BigTyIdx]; - return !isPowerOf2_32(Ty.getSizeInBits()) && - Ty.getSizeInBits() % 16 != 0; + const LLT Ty = Query.Types[LitTyIdx]; + return Ty.getSizeInBits() < 32; }, - [=](const LegalityQuery &Query) { - // Pick the next power of 2, or a multiple of 64 over 128. - // Whichever is smaller. 
- const LLT &Ty = Query.Types[BigTyIdx]; - unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1); - if (NewSizeInBits >= 256) { - unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1); - if (RoundedTo < NewSizeInBits) - NewSizeInBits = RoundedTo; - } - return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits)); - }) + changeTo(LitTyIdx, S32)); + } + + Builder.widenScalarIf( + [=](const LegalityQuery &Query) { + const LLT Ty = Query.Types[BigTyIdx]; + return !isPowerOf2_32(Ty.getSizeInBits()) && + Ty.getSizeInBits() % 16 != 0; + }, + [=](const LegalityQuery &Query) { + // Pick the next power of 2, or a multiple of 64 over 128. + // Whichever is smaller. + const LLT &Ty = Query.Types[BigTyIdx]; + unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1); + if (NewSizeInBits >= 256) { + unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1); + if (RoundedTo < NewSizeInBits) + NewSizeInBits = RoundedTo; + } + return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits)); + }) .legalIf([=](const LegalityQuery &Query) { const LLT &BigTy = Query.Types[BigTyIdx]; const LLT &LitTy = Query.Types[LitTyIdx]; @@ -760,43 +1066,56 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, return BigTy.getSizeInBits() % 16 == 0 && LitTy.getSizeInBits() % 16 == 0 && - BigTy.getSizeInBits() <= 512; + BigTy.getSizeInBits() <= 1024; }) // Any vectors left are the wrong size. Scalarize them. .scalarize(0) .scalarize(1); } + getActionDefinitionsBuilder(G_SEXT_INREG).lower(); + computeTables(); verify(*ST.getInstrInfo()); } bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI, MachineRegisterInfo &MRI, - MachineIRBuilder &MIRBuilder, + MachineIRBuilder &B, GISelChangeObserver &Observer) const { switch (MI.getOpcode()) { case TargetOpcode::G_ADDRSPACE_CAST: - return legalizeAddrSpaceCast(MI, MRI, MIRBuilder); + return legalizeAddrSpaceCast(MI, MRI, B); case TargetOpcode::G_FRINT: - return legalizeFrint(MI, MRI, MIRBuilder); + return legalizeFrint(MI, MRI, B); case TargetOpcode::G_FCEIL: - return legalizeFceil(MI, MRI, MIRBuilder); + return legalizeFceil(MI, MRI, B); case TargetOpcode::G_INTRINSIC_TRUNC: - return legalizeIntrinsicTrunc(MI, MRI, MIRBuilder); + return legalizeIntrinsicTrunc(MI, MRI, B); case TargetOpcode::G_SITOFP: - return legalizeITOFP(MI, MRI, MIRBuilder, true); + return legalizeITOFP(MI, MRI, B, true); case TargetOpcode::G_UITOFP: - return legalizeITOFP(MI, MRI, MIRBuilder, false); + return legalizeITOFP(MI, MRI, B, false); case TargetOpcode::G_FMINNUM: case TargetOpcode::G_FMAXNUM: case TargetOpcode::G_FMINNUM_IEEE: case TargetOpcode::G_FMAXNUM_IEEE: - return legalizeMinNumMaxNum(MI, MRI, MIRBuilder); + return legalizeMinNumMaxNum(MI, MRI, B); case TargetOpcode::G_EXTRACT_VECTOR_ELT: - return legalizeExtractVectorElt(MI, MRI, MIRBuilder); + return legalizeExtractVectorElt(MI, MRI, B); case TargetOpcode::G_INSERT_VECTOR_ELT: - return legalizeInsertVectorElt(MI, MRI, MIRBuilder); + return legalizeInsertVectorElt(MI, MRI, B); + case TargetOpcode::G_FSIN: + case TargetOpcode::G_FCOS: + return legalizeSinCos(MI, MRI, B); + case TargetOpcode::G_GLOBAL_VALUE: + return legalizeGlobalValue(MI, MRI, B); + case TargetOpcode::G_LOAD: + return legalizeLoad(MI, MRI, B, Observer); + case TargetOpcode::G_FMAD: + return legalizeFMad(MI, MRI, B); + case TargetOpcode::G_FDIV: + return legalizeFDIV(MI, MRI, B); default: return false; } @@ -807,11 +1126,13 @@ bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI, Register AMDGPULegalizerInfo::getSegmentAperture( unsigned 
AS, MachineRegisterInfo &MRI, - MachineIRBuilder &MIRBuilder) const { - MachineFunction &MF = MIRBuilder.getMF(); + MachineIRBuilder &B) const { + MachineFunction &MF = B.getMF(); const GCNSubtarget &ST = MF.getSubtarget(); const LLT S32 = LLT::scalar(32); + assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS); + if (ST.hasApertureRegs()) { // FIXME: Use inline constants (src_{shared, private}_base) instead of // getreg. @@ -829,13 +1150,13 @@ Register AMDGPULegalizerInfo::getSegmentAperture( Register ApertureReg = MRI.createGenericVirtualRegister(S32); Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); - MIRBuilder.buildInstr(AMDGPU::S_GETREG_B32) + B.buildInstr(AMDGPU::S_GETREG_B32) .addDef(GetReg) .addImm(Encoding); MRI.setType(GetReg, S32); - auto ShiftAmt = MIRBuilder.buildConstant(S32, WidthM1 + 1); - MIRBuilder.buildInstr(TargetOpcode::G_SHL) + auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1); + B.buildInstr(TargetOpcode::G_SHL) .addDef(ApertureReg) .addUse(GetReg) .addUse(ShiftAmt.getReg(0)); @@ -846,8 +1167,9 @@ Register AMDGPULegalizerInfo::getSegmentAperture( Register QueuePtr = MRI.createGenericVirtualRegister( LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); - // FIXME: Placeholder until we can track the input registers. - MIRBuilder.buildConstant(QueuePtr, 0xdeadbeef); + const SIMachineFunctionInfo *MFI = MF.getInfo(); + if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr)) + return Register(); // Offset into amd_queue_t for group_segment_aperture_base_hi / // private_segment_aperture_base_hi. @@ -870,18 +1192,19 @@ Register AMDGPULegalizerInfo::getSegmentAperture( Register LoadResult = MRI.createGenericVirtualRegister(S32); Register LoadAddr; - MIRBuilder.materializeGEP(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset); - MIRBuilder.buildLoad(LoadResult, LoadAddr, *MMO); + B.materializeGEP(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset); + B.buildLoad(LoadResult, LoadAddr, *MMO); return LoadResult; } bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( MachineInstr &MI, MachineRegisterInfo &MRI, - MachineIRBuilder &MIRBuilder) const { - MachineFunction &MF = MIRBuilder.getMF(); + MachineIRBuilder &B) const { + MachineFunction &MF = B.getMF(); - MIRBuilder.setInstr(MI); + B.setInstr(MI); + const LLT S32 = LLT::scalar(32); Register Dst = MI.getOperand(0).getReg(); Register Src = MI.getOperand(1).getReg(); @@ -899,7 +1222,28 @@ bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( const GCNSubtarget &ST = MF.getSubtarget(); if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) { - MI.setDesc(MIRBuilder.getTII().get(TargetOpcode::G_BITCAST)); + MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST)); + return true; + } + + if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { + // Truncate. + B.buildExtract(Dst, Src, 0); + MI.eraseFromParent(); + return true; + } + + if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { + const SIMachineFunctionInfo *Info = MF.getInfo(); + uint32_t AddrHiVal = Info->get32BitAddressHighBits(); + + // FIXME: This is a bit ugly due to creating a merge of 2 pointers to + // another. Merge operands are required to be the same type, but creating an + // extra ptrtoint would be kind of pointless. 
+ auto HighAddr = B.buildConstant( + LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal); + B.buildMerge(Dst, {Src, HighAddr.getReg(0)}); + MI.eraseFromParent(); return true; } @@ -908,47 +1252,52 @@ bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( DestAS == AMDGPUAS::PRIVATE_ADDRESS); unsigned NullVal = TM.getNullPointerValue(DestAS); - auto SegmentNull = MIRBuilder.buildConstant(DstTy, NullVal); - auto FlatNull = MIRBuilder.buildConstant(SrcTy, 0); + auto SegmentNull = B.buildConstant(DstTy, NullVal); + auto FlatNull = B.buildConstant(SrcTy, 0); Register PtrLo32 = MRI.createGenericVirtualRegister(DstTy); // Extract low 32-bits of the pointer. - MIRBuilder.buildExtract(PtrLo32, Src, 0); + B.buildExtract(PtrLo32, Src, 0); Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1)); - MIRBuilder.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, FlatNull.getReg(0)); - MIRBuilder.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0)); + B.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, FlatNull.getReg(0)); + B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0)); MI.eraseFromParent(); return true; } - assert(SrcAS == AMDGPUAS::LOCAL_ADDRESS || - SrcAS == AMDGPUAS::PRIVATE_ADDRESS); + if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS) + return false; + + if (!ST.hasFlatAddressSpace()) + return false; auto SegmentNull = - MIRBuilder.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS)); + B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS)); auto FlatNull = - MIRBuilder.buildConstant(DstTy, TM.getNullPointerValue(DestAS)); + B.buildConstant(DstTy, TM.getNullPointerValue(DestAS)); - Register ApertureReg = getSegmentAperture(DestAS, MRI, MIRBuilder); + Register ApertureReg = getSegmentAperture(SrcAS, MRI, B); + if (!ApertureReg.isValid()) + return false; Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1)); - MIRBuilder.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, SegmentNull.getReg(0)); + B.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, SegmentNull.getReg(0)); Register BuildPtr = MRI.createGenericVirtualRegister(DstTy); // Coerce the type of the low half of the result so we can use merge_values. - Register SrcAsInt = MRI.createGenericVirtualRegister(LLT::scalar(32)); - MIRBuilder.buildInstr(TargetOpcode::G_PTRTOINT) + Register SrcAsInt = MRI.createGenericVirtualRegister(S32); + B.buildInstr(TargetOpcode::G_PTRTOINT) .addDef(SrcAsInt) .addUse(Src); // TODO: Should we allow mismatched types but matching sizes in merges to // avoid the ptrtoint? 
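// Illustrative sketch (not from the upstream patch): the segment-to-flat path
// of legalizeAddrSpaceCast around this point, written as plain 64-bit
// arithmetic. The low half is the 32-bit segment pointer, the high half comes
// from the aperture register, and the segment null value maps to the flat null
// value. Hypothetical standalone helper, not the MIR-building code itself.
#include <cstdint>

static inline uint64_t segmentToFlat(uint32_t SegPtr, uint32_t ApertureHi,
                                     uint32_t SegmentNull, uint64_t FlatNull) {
  if (SegPtr == SegmentNull)                    // the G_ICMP + G_SELECT pair
    return FlatNull;
  return (uint64_t(ApertureHi) << 32) | SegPtr; // G_MERGE_VALUES of lo/hi
}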
- MIRBuilder.buildMerge(BuildPtr, {SrcAsInt, ApertureReg}); - MIRBuilder.buildSelect(Dst, CmpRes, BuildPtr, FlatNull.getReg(0)); + B.buildMerge(BuildPtr, {SrcAsInt, ApertureReg}); + B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull.getReg(0)); MI.eraseFromParent(); return true; @@ -956,8 +1305,8 @@ bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( bool AMDGPULegalizerInfo::legalizeFrint( MachineInstr &MI, MachineRegisterInfo &MRI, - MachineIRBuilder &MIRBuilder) const { - MIRBuilder.setInstr(MI); + MachineIRBuilder &B) const { + B.setInstr(MI); Register Src = MI.getOperand(1).getReg(); LLT Ty = MRI.getType(Src); @@ -966,18 +1315,18 @@ bool AMDGPULegalizerInfo::legalizeFrint( APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52"); APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51"); - auto C1 = MIRBuilder.buildFConstant(Ty, C1Val); - auto CopySign = MIRBuilder.buildFCopysign(Ty, C1, Src); + auto C1 = B.buildFConstant(Ty, C1Val); + auto CopySign = B.buildFCopysign(Ty, C1, Src); // TODO: Should this propagate fast-math-flags? - auto Tmp1 = MIRBuilder.buildFAdd(Ty, Src, CopySign); - auto Tmp2 = MIRBuilder.buildFSub(Ty, Tmp1, CopySign); + auto Tmp1 = B.buildFAdd(Ty, Src, CopySign); + auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign); - auto C2 = MIRBuilder.buildFConstant(Ty, C2Val); - auto Fabs = MIRBuilder.buildFAbs(Ty, Src); + auto C2 = B.buildFConstant(Ty, C2Val); + auto Fabs = B.buildFAbs(Ty, Src); - auto Cond = MIRBuilder.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2); - MIRBuilder.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2); + auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2); + B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2); return true; } @@ -1124,7 +1473,7 @@ bool AMDGPULegalizerInfo::legalizeMinNumMaxNum( MachineIRBuilder HelperBuilder(MI); GISelObserverWrapper DummyObserver; LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); - HelperBuilder.setMBB(*MI.getParent()); + HelperBuilder.setInstr(MI); return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized; } @@ -1187,6 +1536,194 @@ bool AMDGPULegalizerInfo::legalizeInsertVectorElt( return true; } +bool AMDGPULegalizerInfo::legalizeSinCos( + MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &B) const { + B.setInstr(MI); + + Register DstReg = MI.getOperand(0).getReg(); + Register SrcReg = MI.getOperand(1).getReg(); + LLT Ty = MRI.getType(DstReg); + unsigned Flags = MI.getFlags(); + + Register TrigVal; + auto OneOver2Pi = B.buildFConstant(Ty, 0.5 / M_PI); + if (ST.hasTrigReducedRange()) { + auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags); + TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false) + .addUse(MulVal.getReg(0)) + .setMIFlags(Flags).getReg(0); + } else + TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0); + + Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ? 
+ Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos; + B.buildIntrinsic(TrigIntrin, makeArrayRef(DstReg), false) + .addUse(TrigVal) + .setMIFlags(Flags); + MI.eraseFromParent(); + return true; +} + +bool AMDGPULegalizerInfo::buildPCRelGlobalAddress( + Register DstReg, LLT PtrTy, + MachineIRBuilder &B, const GlobalValue *GV, + unsigned Offset, unsigned GAFlags) const { + // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered + // to the following code sequence: + // + // For constant address space: + // s_getpc_b64 s[0:1] + // s_add_u32 s0, s0, $symbol + // s_addc_u32 s1, s1, 0 + // + // s_getpc_b64 returns the address of the s_add_u32 instruction and then + // a fixup or relocation is emitted to replace $symbol with a literal + // constant, which is a pc-relative offset from the encoding of the $symbol + // operand to the global variable. + // + // For global address space: + // s_getpc_b64 s[0:1] + // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo + // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi + // + // s_getpc_b64 returns the address of the s_add_u32 instruction and then + // fixups or relocations are emitted to replace $symbol@*@lo and + // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant, + // which is a 64-bit pc-relative offset from the encoding of the $symbol + // operand to the global variable. + // + // What we want here is an offset from the value returned by s_getpc + // (which is the address of the s_add_u32 instruction) to the global + // variable, but since the encoding of $symbol starts 4 bytes after the start + // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too + // small. This requires us to add 4 to the global variable offset in order to + // compute the correct address. + + LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); + + Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg : + B.getMRI()->createGenericVirtualRegister(ConstPtrTy); + + MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET) + .addDef(PCReg); + + MIB.addGlobalAddress(GV, Offset + 4, GAFlags); + if (GAFlags == SIInstrInfo::MO_NONE) + MIB.addImm(0); + else + MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1); + + B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass); + + if (PtrTy.getSizeInBits() == 32) + B.buildExtract(DstReg, PCReg, 0); + return true; + } + +bool AMDGPULegalizerInfo::legalizeGlobalValue( + MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &B) const { + Register DstReg = MI.getOperand(0).getReg(); + LLT Ty = MRI.getType(DstReg); + unsigned AS = Ty.getAddressSpace(); + + const GlobalValue *GV = MI.getOperand(1).getGlobal(); + MachineFunction &MF = B.getMF(); + SIMachineFunctionInfo *MFI = MF.getInfo(); + B.setInstr(MI); + + if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { + if (!MFI->isEntryFunction()) { + const Function &Fn = MF.getFunction(); + DiagnosticInfoUnsupported BadLDSDecl( + Fn, "local memory global used by non-kernel function", MI.getDebugLoc()); + Fn.getContext().diagnose(BadLDSDecl); + } + + // TODO: We could emit code to handle the initialization somewhere. 
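// Illustrative sketch (not from the upstream patch): why buildPCRelGlobalAddress
// above folds "+ 4" into the global-address operand. s_getpc_b64 yields the
// address of the following s_add_u32, but the $symbol literal is encoded 4
// bytes into that instruction and the fixup is resolved relative to the
// literal's own location. Plain arithmetic with hypothetical names:
#include <cstdint>

static inline uint64_t materializedAddress(uint64_t GlobalAddr,
                                           uint64_t GetPCResult,
                                           int64_t Offset) {
  const uint64_t LiteralAddr = GetPCResult + 4; // where $symbol is encoded
  // Fixup value written into the literal: the target (with the +4 addend)
  // minus the literal's own address.
  const int64_t Literal =
      int64_t(GlobalAddr + Offset + 4) - int64_t(LiteralAddr);
  // What the s_add_u32/s_addc_u32 pair computes: exactly GlobalAddr + Offset.
  return GetPCResult + uint64_t(Literal);
}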
+ if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) { + B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), *GV)); + MI.eraseFromParent(); + return true; + } + + const Function &Fn = MF.getFunction(); + DiagnosticInfoUnsupported BadInit( + Fn, "unsupported initializer for address space", MI.getDebugLoc()); + Fn.getContext().diagnose(BadInit); + return true; + } + + const SITargetLowering *TLI = ST.getTargetLowering(); + + if (TLI->shouldEmitFixup(GV)) { + buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0); + MI.eraseFromParent(); + return true; + } + + if (TLI->shouldEmitPCReloc(GV)) { + buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32); + MI.eraseFromParent(); + return true; + } + + LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); + Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy); + + MachineMemOperand *GOTMMO = MF.getMachineMemOperand( + MachinePointerInfo::getGOT(MF), + MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | + MachineMemOperand::MOInvariant, + 8 /*Size*/, 8 /*Align*/); + + buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32); + + if (Ty.getSizeInBits() == 32) { + // Truncate if this is a 32-bit constant address. + auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO); + B.buildExtract(DstReg, Load, 0); + } else + B.buildLoad(DstReg, GOTAddr, *GOTMMO); + + MI.eraseFromParent(); + return true; +} + +bool AMDGPULegalizerInfo::legalizeLoad( + MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &B, GISelChangeObserver &Observer) const { + B.setInstr(MI); + LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); + auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg()); + Observer.changingInstr(MI); + MI.getOperand(1).setReg(Cast.getReg(0)); + Observer.changedInstr(MI); + return true; +} + +bool AMDGPULegalizerInfo::legalizeFMad( + MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &B) const { + LLT Ty = MRI.getType(MI.getOperand(0).getReg()); + assert(Ty.isScalar()); + + // TODO: Always legal with future ftz flag. + if (Ty == LLT::scalar(32) && !ST.hasFP32Denormals()) + return true; + if (Ty == LLT::scalar(16) && !ST.hasFP16Denormals()) + return true; + + MachineFunction &MF = B.getMF(); + + MachineIRBuilder HelperBuilder(MI); + GISelObserverWrapper DummyObserver; + LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); + HelperBuilder.setMBB(*MI.getParent()); + return Helper.lowerFMad(MI) == LegalizerHelper::Legalized; +} + // Return the use branch instruction, otherwise null if the usage is invalid.
static MachineInstr *verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI) { @@ -1212,10 +1749,9 @@ Register AMDGPULegalizerInfo::getLiveInRegister(MachineRegisterInfo &MRI, bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B, const ArgDescriptor *Arg) const { - if (!Arg->isRegister()) + if (!Arg->isRegister() || !Arg->getRegister().isValid()) return false; // TODO: Handle these - assert(Arg->getRegister() != 0); assert(Arg->getRegister().isPhysical()); MachineRegisterInfo &MRI = *B.getMRI(); @@ -1229,19 +1765,30 @@ bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B, const unsigned Mask = Arg->getMask(); const unsigned Shift = countTrailingZeros(Mask); - auto ShiftAmt = B.buildConstant(S32, Shift); - auto LShr = B.buildLShr(S32, LiveIn, ShiftAmt); - B.buildAnd(DstReg, LShr, B.buildConstant(S32, Mask >> Shift)); + Register AndMaskSrc = LiveIn; + + if (Shift != 0) { + auto ShiftAmt = B.buildConstant(S32, Shift); + AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0); + } + + B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift)); } else B.buildCopy(DstReg, LiveIn); // Insert the argument copy if it doens't already exist. // FIXME: It seems EmitLiveInCopies isn't called anywhere? if (!MRI.getVRegDef(LiveIn)) { + // FIXME: Should have scoped insert pt + MachineBasicBlock &OrigInsBB = B.getMBB(); + auto OrigInsPt = B.getInsertPt(); + MachineBasicBlock &EntryMBB = B.getMF().front(); EntryMBB.addLiveIn(Arg->getRegister()); B.setInsertPt(EntryMBB, EntryMBB.begin()); B.buildCopy(LiveIn, Arg->getRegister()); + + B.setInsertPt(OrigInsBB, OrigInsPt); } return true; @@ -1272,6 +1819,113 @@ bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin( return false; } +bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI, + MachineRegisterInfo &MRI, + MachineIRBuilder &B) const { + B.setInstr(MI); + + if (legalizeFastUnsafeFDIV(MI, MRI, B)) + return true; + + return false; +} + +bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, + MachineRegisterInfo &MRI, + MachineIRBuilder &B) const { + Register Res = MI.getOperand(0).getReg(); + Register LHS = MI.getOperand(1).getReg(); + Register RHS = MI.getOperand(2).getReg(); + + uint16_t Flags = MI.getFlags(); + + LLT ResTy = MRI.getType(Res); + LLT S32 = LLT::scalar(32); + LLT S64 = LLT::scalar(64); + + const MachineFunction &MF = B.getMF(); + bool Unsafe = + MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp); + + if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64) + return false; + + if (!Unsafe && ResTy == S32 && ST.hasFP32Denormals()) + return false; + + if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) { + // 1 / x -> RCP(x) + if (CLHS->isExactlyValue(1.0)) { + B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) + .addUse(RHS) + .setMIFlags(Flags); + + MI.eraseFromParent(); + return true; + } + + // -1 / x -> RCP( FNEG(x) ) + if (CLHS->isExactlyValue(-1.0)) { + auto FNeg = B.buildFNeg(ResTy, RHS, Flags); + B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) + .addUse(FNeg.getReg(0)) + .setMIFlags(Flags); + + MI.eraseFromParent(); + return true; + } + } + + // x / y -> x * (1.0 / y) + if (Unsafe) { + auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) + .addUse(RHS) + .setMIFlags(Flags); + B.buildFMul(Res, LHS, RCP, Flags); + + MI.eraseFromParent(); + return true; + } + + return false; +} + +bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI, + MachineRegisterInfo &MRI, + MachineIRBuilder &B) const { + 
B.setInstr(MI); + Register Res = MI.getOperand(0).getReg(); + Register LHS = MI.getOperand(2).getReg(); + Register RHS = MI.getOperand(3).getReg(); + uint16_t Flags = MI.getFlags(); + + LLT S32 = LLT::scalar(32); + LLT S1 = LLT::scalar(1); + + auto Abs = B.buildFAbs(S32, RHS, Flags); + const APFloat C0Val(1.0f); + + auto C0 = B.buildConstant(S32, 0x6f800000); + auto C1 = B.buildConstant(S32, 0x2f800000); + auto C2 = B.buildConstant(S32, FloatToBits(1.0f)); + + auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags); + auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags); + + auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags); + + auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) + .addUse(Mul0.getReg(0)) + .setMIFlags(Flags); + + auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags); + + B.buildFMul(Res, Sel, Mul1, Flags); + + MI.eraseFromParent(); + return true; +} + bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { @@ -1306,11 +1960,79 @@ bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI, return true; } +bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI, + MachineRegisterInfo &MRI, + MachineIRBuilder &B, + unsigned AddrSpace) const { + B.setInstr(MI); + Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B); + auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32); + B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg); + MI.eraseFromParent(); + return true; +} + +/// Handle register layout difference for f16 images for some subtargets. +Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B, + MachineRegisterInfo &MRI, + Register Reg) const { + if (!ST.hasUnpackedD16VMem()) + return Reg; + + const LLT S16 = LLT::scalar(16); + const LLT S32 = LLT::scalar(32); + LLT StoreVT = MRI.getType(Reg); + assert(StoreVT.isVector() && StoreVT.getElementType() == S16); + + auto Unmerge = B.buildUnmerge(S16, Reg); + + SmallVector WideRegs; + for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) + WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0)); + + int NumElts = StoreVT.getNumElements(); + + return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0); +} + +bool AMDGPULegalizerInfo::legalizeRawBufferStore(MachineInstr &MI, + MachineRegisterInfo &MRI, + MachineIRBuilder &B, + bool IsFormat) const { + // TODO: Reject f16 format on targets where unsupported. + Register VData = MI.getOperand(1).getReg(); + LLT Ty = MRI.getType(VData); + + B.setInstr(MI); + + const LLT S32 = LLT::scalar(32); + const LLT S16 = LLT::scalar(16); + + // Fixup illegal register types for i8 stores. + if (Ty == LLT::scalar(8) || Ty == S16) { + Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0); + MI.getOperand(1).setReg(AnyExt); + return true; + } + + if (Ty.isVector()) { + if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) { + if (IsFormat) + MI.getOperand(1).setReg(handleD16VData(B, MRI, VData)); + return true; + } + + return Ty.getElementType() == S32 && Ty.getNumElements() <= 4; + } + + return Ty == S32; +} + bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { // Replace the use G_BRCOND with the exec manipulate and branch pseudos. 
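// Illustrative sketch (not from the upstream patch): the scalar math behind
// legalizeFDIVFastIntrin above. 0x6f800000 and 0x2f800000 are the IEEE
// single-precision bit patterns of 2^96 and 2^-32; when |y| is very large the
// divisor is pre-scaled so its reciprocal stays representable, and the same
// scale factor is applied to the final product. Reference-only code; the real
// lowering uses the hardware reciprocal approximation via amdgcn_rcp.
#include <cmath>

static inline float fdivFastReference(float X, float Y) {
  const float Huge = std::ldexp(1.0f, 96);          // bit pattern 0x6f800000
  const float Scale =
      std::fabs(Y) > Huge ? std::ldexp(1.0f, -32)   // bit pattern 0x2f800000
                          : 1.0f;                   // FloatToBits(1.0f)
  const float Rcp = 1.0f / (Y * Scale);             // rcp(y * scale)
  return Scale * (X * Rcp);                         // == x / y up to rcp error
}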
- switch (MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID()) { + switch (MI.getIntrinsicID()) { case Intrinsic::amdgcn_if: { if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI)) { const SIRegisterInfo *TRI @@ -1386,6 +2108,22 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI, case Intrinsic::amdgcn_dispatch_id: return legalizePreloadedArgIntrin(MI, MRI, B, AMDGPUFunctionArgInfo::DISPATCH_ID); + case Intrinsic::amdgcn_fdiv_fast: + return legalizeFDIVFastIntrin(MI, MRI, B); + case Intrinsic::amdgcn_is_shared: + return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS); + case Intrinsic::amdgcn_is_private: + return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS); + case Intrinsic::amdgcn_wavefrontsize: { + B.setInstr(MI); + B.buildConstant(MI.getOperand(0), ST.getWavefrontSize()); + MI.eraseFromParent(); + return true; + } + case Intrinsic::amdgcn_raw_buffer_store: + return legalizeRawBufferStore(MI, MRI, B, false); + case Intrinsic::amdgcn_raw_buffer_store_format: + return legalizeRawBufferStore(MI, MRI, B, true); default: return true; } diff --git a/lib/Target/AMDGPU/AMDGPULegalizerInfo.h b/lib/Target/AMDGPU/AMDGPULegalizerInfo.h index 3f1cc1d265dd..d0fba23a8686 100644 --- a/lib/Target/AMDGPU/AMDGPULegalizerInfo.h +++ b/lib/Target/AMDGPU/AMDGPULegalizerInfo.h @@ -16,6 +16,7 @@ #include "llvm/CodeGen/GlobalISel/LegalizerInfo.h" #include "AMDGPUArgumentUsageInfo.h" +#include "SIInstrInfo.h" namespace llvm { @@ -32,29 +33,44 @@ public: const GCNTargetMachine &TM); bool legalizeCustom(MachineInstr &MI, MachineRegisterInfo &MRI, - MachineIRBuilder &MIRBuilder, + MachineIRBuilder &B, GISelChangeObserver &Observer) const override; Register getSegmentAperture(unsigned AddrSpace, MachineRegisterInfo &MRI, - MachineIRBuilder &MIRBuilder) const; + MachineIRBuilder &B) const; bool legalizeAddrSpaceCast(MachineInstr &MI, MachineRegisterInfo &MRI, - MachineIRBuilder &MIRBuilder) const; + MachineIRBuilder &B) const; bool legalizeFrint(MachineInstr &MI, MachineRegisterInfo &MRI, - MachineIRBuilder &MIRBuilder) const; + MachineIRBuilder &B) const; bool legalizeFceil(MachineInstr &MI, MachineRegisterInfo &MRI, - MachineIRBuilder &MIRBuilder) const; + MachineIRBuilder &B) const; bool legalizeIntrinsicTrunc(MachineInstr &MI, MachineRegisterInfo &MRI, - MachineIRBuilder &MIRBuilder) const; + MachineIRBuilder &B) const; bool legalizeITOFP(MachineInstr &MI, MachineRegisterInfo &MRI, - MachineIRBuilder &MIRBuilder, bool Signed) const; + MachineIRBuilder &B, bool Signed) const; bool legalizeMinNumMaxNum(MachineInstr &MI, MachineRegisterInfo &MRI, - MachineIRBuilder &MIRBuilder) const; + MachineIRBuilder &B) const; bool legalizeExtractVectorElt(MachineInstr &MI, MachineRegisterInfo &MRI, - MachineIRBuilder &MIRBuilder) const; + MachineIRBuilder &B) const; bool legalizeInsertVectorElt(MachineInstr &MI, MachineRegisterInfo &MRI, - MachineIRBuilder &MIRBuilder) const; + MachineIRBuilder &B) const; + bool legalizeSinCos(MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &B) const; + + bool buildPCRelGlobalAddress( + Register DstReg, LLT PtrTy, MachineIRBuilder &B, const GlobalValue *GV, + unsigned Offset, unsigned GAFlags = SIInstrInfo::MO_NONE) const; + + bool legalizeGlobalValue(MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &B) const; + bool legalizeLoad(MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &B, + GISelChangeObserver &Observer) const; + + bool legalizeFMad(MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &B) 
const; Register getLiveInRegister(MachineRegisterInfo &MRI, Register Reg, LLT Ty) const; @@ -65,10 +81,24 @@ public: MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const; + bool legalizeFDIV(MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &B) const; + bool legalizeFastUnsafeFDIV(MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &B) const; + bool legalizeFDIVFastIntrin(MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &B) const; + bool legalizeImplicitArgPtr(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const; + bool legalizeIsAddrSpace(MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &B, unsigned AddrSpace) const; + + Register handleD16VData(MachineIRBuilder &B, MachineRegisterInfo &MRI, + Register Reg) const; + bool legalizeRawBufferStore(MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &B, bool IsFormat) const; bool legalizeIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, - MachineIRBuilder &MIRBuilder) const override; + MachineIRBuilder &B) const override; }; } // End llvm namespace. diff --git a/lib/Target/AMDGPU/AMDGPULibCalls.cpp b/lib/Target/AMDGPU/AMDGPULibCalls.cpp index ce0a9db7c7f4..2c94e0046651 100644 --- a/lib/Target/AMDGPU/AMDGPULibCalls.cpp +++ b/lib/Target/AMDGPU/AMDGPULibCalls.cpp @@ -30,6 +30,7 @@ #include "llvm/IR/Module.h" #include "llvm/IR/ValueSymbolTable.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetOptions.h" @@ -48,18 +49,10 @@ static cl::list UseNative("amdgpu-use-native", cl::CommaSeparated, cl::ValueOptional, cl::Hidden); -#define MATH_PI 3.14159265358979323846264338327950288419716939937511 -#define MATH_E 2.71828182845904523536028747135266249775724709369996 -#define MATH_SQRT2 1.41421356237309504880168872420969807856967187537695 - -#define MATH_LOG2E 1.4426950408889634073599246810018921374266459541529859 -#define MATH_LOG10E 0.4342944819032518276511289189166050822943970058036665 -// Value of log2(10) -#define MATH_LOG2_10 3.3219280948873623478703194294893901758648313930245806 -// Value of 1 / log2(10) -#define MATH_RLOG2_10 0.3010299956639811952137388947244930267681898814621085 -// Value of 1 / M_LOG2E_F = 1 / log2(e) -#define MATH_RLOG2_E 0.6931471805599453094172321214581765680755001343602552 +#define MATH_PI numbers::pi +#define MATH_E numbers::e +#define MATH_SQRT2 numbers::sqrt2 +#define MATH_SQRT1_2 numbers::inv_sqrt2 namespace llvm { @@ -254,8 +247,8 @@ struct TableEntry { /* a list of {result, input} */ static const TableEntry tbl_acos[] = { - {MATH_PI/2.0, 0.0}, - {MATH_PI/2.0, -0.0}, + {MATH_PI / 2.0, 0.0}, + {MATH_PI / 2.0, -0.0}, {0.0, 1.0}, {MATH_PI, -1.0} }; @@ -271,8 +264,8 @@ static const TableEntry tbl_acospi[] = { static const TableEntry tbl_asin[] = { {0.0, 0.0}, {-0.0, -0.0}, - {MATH_PI/2.0, 1.0}, - {-MATH_PI/2.0, -1.0} + {MATH_PI / 2.0, 1.0}, + {-MATH_PI / 2.0, -1.0} }; static const TableEntry tbl_asinh[] = { {0.0, 0.0}, @@ -287,8 +280,8 @@ static const TableEntry tbl_asinpi[] = { static const TableEntry tbl_atan[] = { {0.0, 0.0}, {-0.0, -0.0}, - {MATH_PI/4.0, 1.0}, - {-MATH_PI/4.0, -1.0} + {MATH_PI / 4.0, 1.0}, + {-MATH_PI / 4.0, -1.0} }; static const TableEntry tbl_atanh[] = { {0.0, 0.0}, @@ -359,7 +352,7 @@ static const TableEntry tbl_log10[] = { }; static const TableEntry tbl_rsqrt[] = { {1.0, 1.0}, - {1.0/MATH_SQRT2, 2.0} + {MATH_SQRT1_2, 2.0} }; 
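// Illustrative sketch (not from the upstream patch): how a {result, input}
// table such as tbl_acos or tbl_rsqrt above can be consulted to fold a library
// call whose argument is a known constant. The helper below is a hypothetical
// stand-in for the lookup done inside AMDGPULibCalls; it ignores the
// signed-zero distinction that the real tables encode with separate 0.0 and
// -0.0 entries.
#include <cstddef>

struct FoldEntry { double Result, Input; };

static inline bool foldFromTable(const FoldEntry *Tbl, size_t N, double Arg,
                                 double &Out) {
  for (size_t I = 0; I != N; ++I) {
    if (Tbl[I].Input == Arg) { // e.g. acos(1.0) -> 0.0, rsqrt(2.0) -> 1/sqrt(2)
      Out = Tbl[I].Result;
      return true;
    }
  }
  return false;                // no table entry: leave the call in place
}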
static const TableEntry tbl_sin[] = { {0.0, 0.0}, @@ -868,7 +861,7 @@ static double log2(double V) { #if _XOPEN_SOURCE >= 600 || defined(_ISOC99_SOURCE) || _POSIX_C_SOURCE >= 200112L return ::log2(V); #else - return log(V) / 0.693147180559945309417; + return log(V) / numbers::ln2; #endif } } @@ -1430,8 +1423,8 @@ AllocaInst* AMDGPULibCalls::insertAlloca(CallInst *UI, IRBuilder<> &B, B.SetInsertPoint(&*ItNew); AllocaInst *Alloc = B.CreateAlloca(RetType, 0, std::string(prefix) + UI->getName()); - Alloc->setAlignment(UCallee->getParent()->getDataLayout() - .getTypeAllocSize(RetType)); + Alloc->setAlignment(MaybeAlign( + UCallee->getParent()->getDataLayout().getTypeAllocSize(RetType))); return Alloc; } diff --git a/lib/Target/AMDGPU/AMDGPULibFunc.cpp b/lib/Target/AMDGPU/AMDGPULibFunc.cpp index a5bac25701a0..e1ae496d9cbc 100644 --- a/lib/Target/AMDGPU/AMDGPULibFunc.cpp +++ b/lib/Target/AMDGPU/AMDGPULibFunc.cpp @@ -55,7 +55,7 @@ enum EManglingParam { }; struct ManglingRule { - StringRef const Name; + const char *Name; unsigned char Lead[2]; unsigned char Param[5]; @@ -69,7 +69,7 @@ struct ManglingRule { // Information about library functions with unmangled names. class UnmangledFuncInfo { - StringRef const Name; + const char *Name; unsigned NumArgs; // Table for all lib functions with unmangled names. @@ -82,7 +82,7 @@ class UnmangledFuncInfo { public: using ID = AMDGPULibFunc::EFuncId; - UnmangledFuncInfo(StringRef _Name, unsigned _NumArgs) + constexpr UnmangledFuncInfo(const char *_Name, unsigned _NumArgs) : Name(_Name), NumArgs(_NumArgs) {} // Get index to Table by function name. static bool lookup(StringRef Name, ID &Id); @@ -133,8 +133,8 @@ unsigned ManglingRule::getNumArgs() const { // E_ANY - use prev lead type, E_CONSTPTR_ANY - make const pointer out of // prev lead type, etc. see ParamIterator::getNextParam() for details. 
-static const ManglingRule manglingRules[] = { -{ StringRef(), {0}, {0} }, +static constexpr ManglingRule manglingRules[] = { +{ "", {0}, {0} }, { "abs" , {1}, {E_ANY}}, { "abs_diff" , {1}, {E_ANY,E_COPY}}, { "acos" , {1}, {E_ANY}}, @@ -682,9 +682,9 @@ bool AMDGPULibFunc::parse(StringRef FuncName, AMDGPULibFunc &F) { } if (eatTerm(FuncName, "_Z")) - F.Impl = make_unique(); + F.Impl = std::make_unique(); else - F.Impl = make_unique(); + F.Impl = std::make_unique(); if (F.Impl->parseFuncName(FuncName)) return true; diff --git a/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp b/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp index 5dd5b3691e0a..e64542a395f0 100644 --- a/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp +++ b/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp @@ -72,10 +72,10 @@ bool AMDGPULowerKernelArguments::runOnFunction(Function &F) { BasicBlock &EntryBlock = *F.begin(); IRBuilder<> Builder(&*EntryBlock.begin()); - const unsigned KernArgBaseAlign = 16; // FIXME: Increase if necessary + const Align KernArgBaseAlign(16); // FIXME: Increase if necessary const uint64_t BaseOffset = ST.getExplicitKernelArgOffset(F); - unsigned MaxAlign; + Align MaxAlign; // FIXME: Alignment is broken broken with explicit arg offset.; const uint64_t TotalKernArgSize = ST.getKernArgSegmentSize(F, MaxAlign); if (TotalKernArgSize == 0) @@ -94,12 +94,12 @@ bool AMDGPULowerKernelArguments::runOnFunction(Function &F) { for (Argument &Arg : F.args()) { Type *ArgTy = Arg.getType(); - unsigned Align = DL.getABITypeAlignment(ArgTy); + unsigned ABITypeAlign = DL.getABITypeAlignment(ArgTy); unsigned Size = DL.getTypeSizeInBits(ArgTy); unsigned AllocSize = DL.getTypeAllocSize(ArgTy); - uint64_t EltOffset = alignTo(ExplicitArgOffset, Align) + BaseOffset; - ExplicitArgOffset = alignTo(ExplicitArgOffset, Align) + AllocSize; + uint64_t EltOffset = alignTo(ExplicitArgOffset, ABITypeAlign) + BaseOffset; + ExplicitArgOffset = alignTo(ExplicitArgOffset, ABITypeAlign) + AllocSize; if (Arg.use_empty()) continue; @@ -128,8 +128,8 @@ bool AMDGPULowerKernelArguments::runOnFunction(Function &F) { int64_t AlignDownOffset = alignDown(EltOffset, 4); int64_t OffsetDiff = EltOffset - AlignDownOffset; - unsigned AdjustedAlign = MinAlign(DoShiftOpt ? AlignDownOffset : EltOffset, - KernArgBaseAlign); + Align AdjustedAlign = commonAlignment( + KernArgBaseAlign, DoShiftOpt ? 
AlignDownOffset : EltOffset); Value *ArgPtr; Type *AdjustedArgTy; @@ -160,7 +160,7 @@ bool AMDGPULowerKernelArguments::runOnFunction(Function &F) { ArgPtr = Builder.CreateBitCast(ArgPtr, AdjustedArgTy->getPointerTo(AS), ArgPtr->getName() + ".cast"); LoadInst *Load = - Builder.CreateAlignedLoad(AdjustedArgTy, ArgPtr, AdjustedAlign); + Builder.CreateAlignedLoad(AdjustedArgTy, ArgPtr, AdjustedAlign.value()); Load->setMetadata(LLVMContext::MD_invariant_load, MDNode::get(Ctx, {})); MDBuilder MDB(Ctx); @@ -220,8 +220,8 @@ bool AMDGPULowerKernelArguments::runOnFunction(Function &F) { } KernArgSegment->addAttribute( - AttributeList::ReturnIndex, - Attribute::getWithAlignment(Ctx, std::max(KernArgBaseAlign, MaxAlign))); + AttributeList::ReturnIndex, + Attribute::getWithAlignment(Ctx, std::max(KernArgBaseAlign, MaxAlign))); return true; } diff --git a/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp b/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp index ae4c32c258a7..3760aed87a43 100644 --- a/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp +++ b/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp @@ -211,6 +211,10 @@ void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const { lowerOperand(MO, MCOp); OutMI.addOperand(MCOp); } + + int FIIdx = AMDGPU::getNamedOperandIdx(MCOpcode, AMDGPU::OpName::fi); + if (FIIdx >= (int)OutMI.getNumOperands()) + OutMI.addOperand(MCOperand::createImm(0)); } bool AMDGPUAsmPrinter::lowerOperand(const MachineOperand &MO, diff --git a/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp b/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp index 237490957058..ba72f71f4322 100644 --- a/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp +++ b/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp @@ -694,7 +694,7 @@ void LinearizedRegion::storeLiveOutReg(MachineBasicBlock *MBB, unsigned Reg, const MachineRegisterInfo *MRI, const TargetRegisterInfo *TRI, PHILinearize &PHIInfo) { - if (TRI->isVirtualRegister(Reg)) { + if (Register::isVirtualRegister(Reg)) { LLVM_DEBUG(dbgs() << "Considering Register: " << printReg(Reg, TRI) << "\n"); // If this is a source register to a PHI we are chaining, it @@ -734,7 +734,7 @@ void LinearizedRegion::storeLiveOutRegRegion(RegionMRT *Region, unsigned Reg, const MachineRegisterInfo *MRI, const TargetRegisterInfo *TRI, PHILinearize &PHIInfo) { - if (TRI->isVirtualRegister(Reg)) { + if (Register::isVirtualRegister(Reg)) { LLVM_DEBUG(dbgs() << "Considering Register: " << printReg(Reg, TRI) << "\n"); for (auto &UI : MRI->use_operands(Reg)) { @@ -949,7 +949,7 @@ void LinearizedRegion::replaceRegister(unsigned Register, unsigned NewRegister, (IncludeLoopPHI && IsLoopPHI); if (ShouldReplace) { - if (TargetRegisterInfo::isPhysicalRegister(NewRegister)) { + if (Register::isPhysicalRegister(NewRegister)) { LLVM_DEBUG(dbgs() << "Trying to substitute physical register: " << printReg(NewRegister, MRI->getTargetRegisterInfo()) << "\n"); @@ -1016,13 +1016,15 @@ bool LinearizedRegion::hasNoDef(unsigned Reg, MachineRegisterInfo *MRI) { // before are no longer register kills. void LinearizedRegion::removeFalseRegisterKills(MachineRegisterInfo *MRI) { const TargetRegisterInfo *TRI = MRI->getTargetRegisterInfo(); + (void)TRI; // It's used by LLVM_DEBUG. 
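// Illustrative sketch (not from the upstream patch): the offset rounding used
// for sub-dword kernel arguments in the AMDGPULowerKernelArguments hunk above.
// A narrow argument is loaded as the containing dword at alignDown(offset, 4)
// and the wanted bytes are then shifted out by the remainder. Hypothetical
// standalone helper for the arithmetic only.
#include <cstdint>

struct DwordSlot {
  uint64_t LoadOffset; // 4-byte aligned offset that is actually loaded
  uint64_t ShiftBytes; // position of the argument inside that dword
};

static inline DwordSlot roundToDword(uint64_t EltOffset) {
  const uint64_t AlignDown = EltOffset & ~uint64_t(3); // alignDown(EltOffset, 4)
  return {AlignDown, EltOffset - AlignDown};           // OffsetDiff in the pass
}

// For example, an i16 argument at byte offset 38 is read from offset 36 and
// shifted right by 2 bytes (16 bits) after the dword load.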
+ for (auto MBBI : MBBs) { MachineBasicBlock *MBB = MBBI; for (auto &II : *MBB) { for (auto &RI : II.uses()) { if (RI.isReg()) { - unsigned Reg = RI.getReg(); - if (TRI->isVirtualRegister(Reg)) { + Register Reg = RI.getReg(); + if (Register::isVirtualRegister(Reg)) { if (hasNoDef(Reg, MRI)) continue; if (!MRI->hasOneDef(Reg)) { @@ -1402,7 +1404,7 @@ void AMDGPUMachineCFGStructurizer::storePHILinearizationInfoDest( unsigned AMDGPUMachineCFGStructurizer::storePHILinearizationInfo( MachineInstr &PHI, SmallVector *RegionIndices) { unsigned DestReg = getPHIDestReg(PHI); - unsigned LinearizeDestReg = + Register LinearizeDestReg = MRI->createVirtualRegister(MRI->getRegClass(DestReg)); PHIInfo.addDest(LinearizeDestReg, PHI.getDebugLoc()); storePHILinearizationInfoDest(LinearizeDestReg, PHI, RegionIndices); @@ -1890,7 +1892,7 @@ void AMDGPUMachineCFGStructurizer::ensureCondIsNotKilled( if (!Cond[0].isReg()) return; - unsigned CondReg = Cond[0].getReg(); + Register CondReg = Cond[0].getReg(); for (auto UI = MRI->use_begin(CondReg), E = MRI->use_end(); UI != E; ++UI) { (*UI).setIsKill(false); } @@ -1929,8 +1931,8 @@ void AMDGPUMachineCFGStructurizer::rewriteCodeBBTerminator(MachineBasicBlock *Co BBSelectReg, TrueBB->getNumber()); } else { const TargetRegisterClass *RegClass = MRI->getRegClass(BBSelectReg); - unsigned TrueBBReg = MRI->createVirtualRegister(RegClass); - unsigned FalseBBReg = MRI->createVirtualRegister(RegClass); + Register TrueBBReg = MRI->createVirtualRegister(RegClass); + Register FalseBBReg = MRI->createVirtualRegister(RegClass); TII->materializeImmediate(*CodeBB, CodeBB->getFirstTerminator(), DL, TrueBBReg, TrueBB->getNumber()); TII->materializeImmediate(*CodeBB, CodeBB->getFirstTerminator(), DL, @@ -1996,7 +1998,7 @@ void AMDGPUMachineCFGStructurizer::insertChainedPHI(MachineBasicBlock *IfBB, InnerRegion->replaceRegisterOutsideRegion(SourceReg, DestReg, false, MRI); } const TargetRegisterClass *RegClass = MRI->getRegClass(DestReg); - unsigned NextDestReg = MRI->createVirtualRegister(RegClass); + Register NextDestReg = MRI->createVirtualRegister(RegClass); bool IsLastDef = PHIInfo.getNumSources(DestReg) == 1; LLVM_DEBUG(dbgs() << "Insert Chained PHI\n"); insertMergePHI(IfBB, InnerRegion->getExit(), MergeBB, DestReg, NextDestReg, @@ -2056,8 +2058,8 @@ void AMDGPUMachineCFGStructurizer::rewriteLiveOutRegs(MachineBasicBlock *IfBB, // register, unless it is the outgoing BB select register. We have // already creaed phi nodes for these. const TargetRegisterClass *RegClass = MRI->getRegClass(Reg); - unsigned PHIDestReg = MRI->createVirtualRegister(RegClass); - unsigned IfSourceReg = MRI->createVirtualRegister(RegClass); + Register PHIDestReg = MRI->createVirtualRegister(RegClass); + Register IfSourceReg = MRI->createVirtualRegister(RegClass); // Create initializer, this value is never used, but is needed // to satisfy SSA. 
LLVM_DEBUG(dbgs() << "Initializer for reg: " << printReg(Reg) << "\n"); @@ -2172,7 +2174,7 @@ void AMDGPUMachineCFGStructurizer::createEntryPHI(LinearizedRegion *CurrentRegio MachineBasicBlock *PHIDefMBB = PHIDefInstr->getParent(); const TargetRegisterClass *RegClass = MRI->getRegClass(CurrentBackedgeReg); - unsigned NewBackedgeReg = MRI->createVirtualRegister(RegClass); + Register NewBackedgeReg = MRI->createVirtualRegister(RegClass); MachineInstrBuilder BackedgePHI = BuildMI(*PHIDefMBB, PHIDefMBB->instr_begin(), DL, TII->get(TargetOpcode::PHI), NewBackedgeReg); @@ -2230,7 +2232,7 @@ void AMDGPUMachineCFGStructurizer::replaceRegisterWith(unsigned Register, I != E;) { MachineOperand &O = *I; ++I; - if (TargetRegisterInfo::isPhysicalRegister(NewRegister)) { + if (Register::isPhysicalRegister(NewRegister)) { LLVM_DEBUG(dbgs() << "Trying to substitute physical register: " << printReg(NewRegister, MRI->getTargetRegisterInfo()) << "\n"); @@ -2309,7 +2311,7 @@ MachineBasicBlock *AMDGPUMachineCFGStructurizer::createIfRegion( } else { // Handle internal block. const TargetRegisterClass *RegClass = MRI->getRegClass(BBSelectRegIn); - unsigned CodeBBSelectReg = MRI->createVirtualRegister(RegClass); + Register CodeBBSelectReg = MRI->createVirtualRegister(RegClass); rewriteCodeBBTerminator(CodeBB, MergeBB, CodeBBSelectReg); bool IsRegionEntryBB = CurrentRegion->getEntry() == CodeBB; MachineBasicBlock *IfBB = createIfBlock(MergeBB, CodeBB, CodeBB, CodeBB, @@ -2446,7 +2448,7 @@ void AMDGPUMachineCFGStructurizer::splitLoopPHI(MachineInstr &PHI, } const TargetRegisterClass *RegClass = MRI->getRegClass(PHIDest); - unsigned NewDestReg = MRI->createVirtualRegister(RegClass); + Register NewDestReg = MRI->createVirtualRegister(RegClass); LRegion->replaceRegisterInsideRegion(PHIDest, NewDestReg, false, MRI); MachineInstrBuilder MIB = BuildMI(*EntrySucc, EntrySucc->instr_begin(), PHI.getDebugLoc(), @@ -2734,9 +2736,9 @@ bool AMDGPUMachineCFGStructurizer::structurizeComplexRegion(RegionMRT *Region) { } const DebugLoc &DL = NewSucc->findDebugLoc(NewSucc->getFirstNonPHI()); unsigned InReg = LRegion->getBBSelectRegIn(); - unsigned InnerSelectReg = + Register InnerSelectReg = MRI->createVirtualRegister(MRI->getRegClass(InReg)); - unsigned NewInReg = MRI->createVirtualRegister(MRI->getRegClass(InReg)); + Register NewInReg = MRI->createVirtualRegister(MRI->getRegClass(InReg)); TII->materializeImmediate(*(LRegion->getEntry()), LRegion->getEntry()->getFirstTerminator(), DL, NewInReg, Region->getEntry()->getNumber()); diff --git a/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp b/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp index 0d3a1f1a769f..89ca702f577d 100644 --- a/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp +++ b/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp @@ -17,7 +17,6 @@ AMDGPUMachineFunction::AMDGPUMachineFunction(const MachineFunction &MF) : MachineFunctionInfo(), LocalMemoryObjects(), ExplicitKernArgSize(0), - MaxKernArgAlign(0), LDSSize(0), IsEntryFunction(AMDGPU::isEntryFunctionCC(MF.getFunction().getCallingConv())), NoSignedZerosFPMath(MF.getTarget().Options.NoSignedZerosFPMath), diff --git a/lib/Target/AMDGPU/AMDGPUMachineFunction.h b/lib/Target/AMDGPU/AMDGPUMachineFunction.h index 52987e2fa411..9818ab1ef148 100644 --- a/lib/Target/AMDGPU/AMDGPUMachineFunction.h +++ b/lib/Target/AMDGPU/AMDGPUMachineFunction.h @@ -23,7 +23,7 @@ class AMDGPUMachineFunction : public MachineFunctionInfo { protected: uint64_t ExplicitKernArgSize; // Cache for this. - unsigned MaxKernArgAlign; // Cache for this. 
+ Align MaxKernArgAlign; // Cache for this. /// Number of bytes in the LDS that are being used. unsigned LDSSize; @@ -47,9 +47,7 @@ public: return ExplicitKernArgSize; } - unsigned getMaxKernArgAlign() const { - return MaxKernArgAlign; - } + unsigned getMaxKernArgAlign() const { return MaxKernArgAlign.value(); } unsigned getLDSSize() const { return LDSSize; diff --git a/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp b/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp new file mode 100644 index 000000000000..5250bf455d71 --- /dev/null +++ b/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp @@ -0,0 +1,592 @@ +//=== AMDGPUPrintfRuntimeBinding.cpp - OpenCL printf implementation -------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// \file +// +// The pass bind printfs to a kernel arg pointer that will be bound to a buffer +// later by the runtime. +// +// This pass traverses the functions in the module and converts +// each call to printf to a sequence of operations that +// store the following into the printf buffer: +// - format string (passed as a module's metadata unique ID) +// - bitwise copies of printf arguments +// The backend passes will need to store metadata in the kernel +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "llvm/ADT/SmallString.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/Triple.h" +#include "llvm/Analysis/InstructionSimplify.h" +#include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/GlobalVariable.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/Type.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +using namespace llvm; + +#define DEBUG_TYPE "printfToRuntime" +#define DWORD_ALIGN 4 + +namespace { +class LLVM_LIBRARY_VISIBILITY AMDGPUPrintfRuntimeBinding final + : public ModulePass { + +public: + static char ID; + + explicit AMDGPUPrintfRuntimeBinding(); + +private: + bool runOnModule(Module &M) override; + void getConversionSpecifiers(SmallVectorImpl &OpConvSpecifiers, + StringRef fmt, size_t num_ops) const; + + bool shouldPrintAsStr(char Specifier, Type *OpType) const; + bool + lowerPrintfForGpu(Module &M, + function_ref GetTLI); + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired(); + AU.addRequired(); + } + + Value *simplify(Instruction *I, const TargetLibraryInfo *TLI) { + return SimplifyInstruction(I, {*TD, TLI, DT}); + } + + const DataLayout *TD; + const DominatorTree *DT; + SmallVector Printfs; +}; +} // namespace + +char AMDGPUPrintfRuntimeBinding::ID = 0; + +INITIALIZE_PASS_BEGIN(AMDGPUPrintfRuntimeBinding, + "amdgpu-printf-runtime-binding", "AMDGPU Printf lowering", + false, false) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_END(AMDGPUPrintfRuntimeBinding, "amdgpu-printf-runtime-binding", + "AMDGPU Printf lowering", false, false) + +char &llvm::AMDGPUPrintfRuntimeBindingID = 
AMDGPUPrintfRuntimeBinding::ID; + +namespace llvm { +ModulePass *createAMDGPUPrintfRuntimeBinding() { + return new AMDGPUPrintfRuntimeBinding(); +} +} // namespace llvm + +AMDGPUPrintfRuntimeBinding::AMDGPUPrintfRuntimeBinding() + : ModulePass(ID), TD(nullptr), DT(nullptr) { + initializeAMDGPUPrintfRuntimeBindingPass(*PassRegistry::getPassRegistry()); +} + +void AMDGPUPrintfRuntimeBinding::getConversionSpecifiers( + SmallVectorImpl &OpConvSpecifiers, StringRef Fmt, + size_t NumOps) const { + // not all format characters are collected. + // At this time the format characters of interest + // are %p and %s, which use to know if we + // are either storing a literal string or a + // pointer to the printf buffer. + static const char ConvSpecifiers[] = "cdieEfgGaosuxXp"; + size_t CurFmtSpecifierIdx = 0; + size_t PrevFmtSpecifierIdx = 0; + + while ((CurFmtSpecifierIdx = Fmt.find_first_of( + ConvSpecifiers, CurFmtSpecifierIdx)) != StringRef::npos) { + bool ArgDump = false; + StringRef CurFmt = Fmt.substr(PrevFmtSpecifierIdx, + CurFmtSpecifierIdx - PrevFmtSpecifierIdx); + size_t pTag = CurFmt.find_last_of("%"); + if (pTag != StringRef::npos) { + ArgDump = true; + while (pTag && CurFmt[--pTag] == '%') { + ArgDump = !ArgDump; + } + } + + if (ArgDump) + OpConvSpecifiers.push_back(Fmt[CurFmtSpecifierIdx]); + + PrevFmtSpecifierIdx = ++CurFmtSpecifierIdx; + } +} + +bool AMDGPUPrintfRuntimeBinding::shouldPrintAsStr(char Specifier, + Type *OpType) const { + if (Specifier != 's') + return false; + const PointerType *PT = dyn_cast(OpType); + if (!PT || PT->getAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS) + return false; + Type *ElemType = PT->getContainedType(0); + if (ElemType->getTypeID() != Type::IntegerTyID) + return false; + IntegerType *ElemIType = cast(ElemType); + return ElemIType->getBitWidth() == 8; +} + +bool AMDGPUPrintfRuntimeBinding::lowerPrintfForGpu( + Module &M, function_ref GetTLI) { + LLVMContext &Ctx = M.getContext(); + IRBuilder<> Builder(Ctx); + Type *I32Ty = Type::getInt32Ty(Ctx); + unsigned UniqID = 0; + // NB: This is important for this string size to be divizable by 4 + const char NonLiteralStr[4] = "???"; + + for (auto CI : Printfs) { + unsigned NumOps = CI->getNumArgOperands(); + + SmallString<16> OpConvSpecifiers; + Value *Op = CI->getArgOperand(0); + + if (auto LI = dyn_cast(Op)) { + Op = LI->getPointerOperand(); + for (auto Use : Op->users()) { + if (auto SI = dyn_cast(Use)) { + Op = SI->getValueOperand(); + break; + } + } + } + + if (auto I = dyn_cast(Op)) { + Value *Op_simplified = simplify(I, &GetTLI(*I->getFunction())); + if (Op_simplified) + Op = Op_simplified; + } + + ConstantExpr *ConstExpr = dyn_cast(Op); + + if (ConstExpr) { + GlobalVariable *GVar = dyn_cast(ConstExpr->getOperand(0)); + + StringRef Str("unknown"); + if (GVar && GVar->hasInitializer()) { + auto Init = GVar->getInitializer(); + if (auto CA = dyn_cast(Init)) { + if (CA->isString()) + Str = CA->getAsCString(); + } else if (isa(Init)) { + Str = ""; + } + // + // we need this call to ascertain + // that we are printing a string + // or a pointer. 
It takes out the + // specifiers and fills up the first + // arg + getConversionSpecifiers(OpConvSpecifiers, Str, NumOps - 1); + } + // Add metadata for the string + std::string AStreamHolder; + raw_string_ostream Sizes(AStreamHolder); + int Sum = DWORD_ALIGN; + Sizes << CI->getNumArgOperands() - 1; + Sizes << ':'; + for (unsigned ArgCount = 1; ArgCount < CI->getNumArgOperands() && + ArgCount <= OpConvSpecifiers.size(); + ArgCount++) { + Value *Arg = CI->getArgOperand(ArgCount); + Type *ArgType = Arg->getType(); + unsigned ArgSize = TD->getTypeAllocSizeInBits(ArgType); + ArgSize = ArgSize / 8; + // + // ArgSize by design should be a multiple of DWORD_ALIGN, + // expand the arguments that do not follow this rule. + // + if (ArgSize % DWORD_ALIGN != 0) { + llvm::Type *ResType = llvm::Type::getInt32Ty(Ctx); + VectorType *LLVMVecType = llvm::dyn_cast(ArgType); + int NumElem = LLVMVecType ? LLVMVecType->getNumElements() : 1; + if (LLVMVecType && NumElem > 1) + ResType = llvm::VectorType::get(ResType, NumElem); + Builder.SetInsertPoint(CI); + Builder.SetCurrentDebugLocation(CI->getDebugLoc()); + if (OpConvSpecifiers[ArgCount - 1] == 'x' || + OpConvSpecifiers[ArgCount - 1] == 'X' || + OpConvSpecifiers[ArgCount - 1] == 'u' || + OpConvSpecifiers[ArgCount - 1] == 'o') + Arg = Builder.CreateZExt(Arg, ResType); + else + Arg = Builder.CreateSExt(Arg, ResType); + ArgType = Arg->getType(); + ArgSize = TD->getTypeAllocSizeInBits(ArgType); + ArgSize = ArgSize / 8; + CI->setOperand(ArgCount, Arg); + } + if (OpConvSpecifiers[ArgCount - 1] == 'f') { + ConstantFP *FpCons = dyn_cast(Arg); + if (FpCons) + ArgSize = 4; + else { + FPExtInst *FpExt = dyn_cast(Arg); + if (FpExt && FpExt->getType()->isDoubleTy() && + FpExt->getOperand(0)->getType()->isFloatTy()) + ArgSize = 4; + } + } + if (shouldPrintAsStr(OpConvSpecifiers[ArgCount - 1], ArgType)) { + if (ConstantExpr *ConstExpr = dyn_cast(Arg)) { + GlobalVariable *GV = + dyn_cast(ConstExpr->getOperand(0)); + if (GV && GV->hasInitializer()) { + Constant *Init = GV->getInitializer(); + ConstantDataArray *CA = dyn_cast(Init); + if (Init->isZeroValue() || CA->isString()) { + size_t SizeStr = Init->isZeroValue() + ? 1 + : (strlen(CA->getAsCString().data()) + 1); + size_t Rem = SizeStr % DWORD_ALIGN; + size_t NSizeStr = 0; + LLVM_DEBUG(dbgs() << "Printf string original size = " << SizeStr + << '\n'); + if (Rem) { + NSizeStr = SizeStr + (DWORD_ALIGN - Rem); + } else { + NSizeStr = SizeStr; + } + ArgSize = NSizeStr; + } + } else { + ArgSize = sizeof(NonLiteralStr); + } + } else { + ArgSize = sizeof(NonLiteralStr); + } + } + LLVM_DEBUG(dbgs() << "Printf ArgSize (in buffer) = " << ArgSize + << " for type: " << *ArgType << '\n'); + Sizes << ArgSize << ':'; + Sum += ArgSize; + } + LLVM_DEBUG(dbgs() << "Printf format string in source = " << Str.str() + << '\n'); + for (size_t I = 0; I < Str.size(); ++I) { + // Rest of the C escape sequences (e.g. 
\') are handled correctly + // by the MDParser + switch (Str[I]) { + case '\a': + Sizes << "\\a"; + break; + case '\b': + Sizes << "\\b"; + break; + case '\f': + Sizes << "\\f"; + break; + case '\n': + Sizes << "\\n"; + break; + case '\r': + Sizes << "\\r"; + break; + case '\v': + Sizes << "\\v"; + break; + case ':': + // ':' cannot be scanned by Flex, as it is defined as a delimiter + // Replace it with it's octal representation \72 + Sizes << "\\72"; + break; + default: + Sizes << Str[I]; + break; + } + } + + // Insert the printf_alloc call + Builder.SetInsertPoint(CI); + Builder.SetCurrentDebugLocation(CI->getDebugLoc()); + + AttributeList Attr = AttributeList::get(Ctx, AttributeList::FunctionIndex, + Attribute::NoUnwind); + + Type *SizetTy = Type::getInt32Ty(Ctx); + + Type *Tys_alloc[1] = {SizetTy}; + Type *I8Ptr = PointerType::get(Type::getInt8Ty(Ctx), 1); + FunctionType *FTy_alloc = FunctionType::get(I8Ptr, Tys_alloc, false); + FunctionCallee PrintfAllocFn = + M.getOrInsertFunction(StringRef("__printf_alloc"), FTy_alloc, Attr); + + LLVM_DEBUG(dbgs() << "Printf metadata = " << Sizes.str() << '\n'); + std::string fmtstr = itostr(++UniqID) + ":" + Sizes.str().c_str(); + MDString *fmtStrArray = MDString::get(Ctx, fmtstr); + + // Instead of creating global variables, the + // printf format strings are extracted + // and passed as metadata. This avoids + // polluting llvm's symbol tables in this module. + // Metadata is going to be extracted + // by the backend passes and inserted + // into the OpenCL binary as appropriate. + StringRef amd("llvm.printf.fmts"); + NamedMDNode *metaD = M.getOrInsertNamedMetadata(amd); + MDNode *myMD = MDNode::get(Ctx, fmtStrArray); + metaD->addOperand(myMD); + Value *sumC = ConstantInt::get(SizetTy, Sum, false); + SmallVector alloc_args; + alloc_args.push_back(sumC); + CallInst *pcall = + CallInst::Create(PrintfAllocFn, alloc_args, "printf_alloc_fn", CI); + + // + // Insert code to split basicblock with a + // piece of hammock code. 
+ // basicblock splits after buffer overflow check + // + ConstantPointerNull *zeroIntPtr = + ConstantPointerNull::get(PointerType::get(Type::getInt8Ty(Ctx), 1)); + ICmpInst *cmp = + dyn_cast(Builder.CreateICmpNE(pcall, zeroIntPtr, "")); + if (!CI->use_empty()) { + Value *result = + Builder.CreateSExt(Builder.CreateNot(cmp), I32Ty, "printf_res"); + CI->replaceAllUsesWith(result); + } + SplitBlock(CI->getParent(), cmp); + Instruction *Brnch = + SplitBlockAndInsertIfThen(cmp, cmp->getNextNode(), false); + + Builder.SetInsertPoint(Brnch); + + // store unique printf id in the buffer + // + SmallVector ZeroIdxList; + ConstantInt *zeroInt = + ConstantInt::get(Ctx, APInt(32, StringRef("0"), 10)); + ZeroIdxList.push_back(zeroInt); + + GetElementPtrInst *BufferIdx = + dyn_cast(GetElementPtrInst::Create( + nullptr, pcall, ZeroIdxList, "PrintBuffID", Brnch)); + + Type *idPointer = PointerType::get(I32Ty, AMDGPUAS::GLOBAL_ADDRESS); + Value *id_gep_cast = + new BitCastInst(BufferIdx, idPointer, "PrintBuffIdCast", Brnch); + + StoreInst *stbuff = + new StoreInst(ConstantInt::get(I32Ty, UniqID), id_gep_cast); + stbuff->insertBefore(Brnch); // to Remove unused variable warning + + SmallVector FourthIdxList; + ConstantInt *fourInt = + ConstantInt::get(Ctx, APInt(32, StringRef("4"), 10)); + + FourthIdxList.push_back(fourInt); // 1st 4 bytes hold the printf_id + // the following GEP is the buffer pointer + BufferIdx = cast(GetElementPtrInst::Create( + nullptr, pcall, FourthIdxList, "PrintBuffGep", Brnch)); + + Type *Int32Ty = Type::getInt32Ty(Ctx); + Type *Int64Ty = Type::getInt64Ty(Ctx); + for (unsigned ArgCount = 1; ArgCount < CI->getNumArgOperands() && + ArgCount <= OpConvSpecifiers.size(); + ArgCount++) { + Value *Arg = CI->getArgOperand(ArgCount); + Type *ArgType = Arg->getType(); + SmallVector WhatToStore; + if (ArgType->isFPOrFPVectorTy() && + (ArgType->getTypeID() != Type::VectorTyID)) { + Type *IType = (ArgType->isFloatTy()) ? Int32Ty : Int64Ty; + if (OpConvSpecifiers[ArgCount - 1] == 'f') { + ConstantFP *fpCons = dyn_cast(Arg); + if (fpCons) { + APFloat Val(fpCons->getValueAPF()); + bool Lost = false; + Val.convert(APFloat::IEEEsingle(), APFloat::rmNearestTiesToEven, + &Lost); + Arg = ConstantFP::get(Ctx, Val); + IType = Int32Ty; + } else { + FPExtInst *FpExt = dyn_cast(Arg); + if (FpExt && FpExt->getType()->isDoubleTy() && + FpExt->getOperand(0)->getType()->isFloatTy()) { + Arg = FpExt->getOperand(0); + IType = Int32Ty; + } + } + } + Arg = new BitCastInst(Arg, IType, "PrintArgFP", Brnch); + WhatToStore.push_back(Arg); + } else if (ArgType->getTypeID() == Type::PointerTyID) { + if (shouldPrintAsStr(OpConvSpecifiers[ArgCount - 1], ArgType)) { + const char *S = NonLiteralStr; + if (ConstantExpr *ConstExpr = dyn_cast(Arg)) { + GlobalVariable *GV = + dyn_cast(ConstExpr->getOperand(0)); + if (GV && GV->hasInitializer()) { + Constant *Init = GV->getInitializer(); + ConstantDataArray *CA = dyn_cast(Init); + if (Init->isZeroValue() || CA->isString()) { + S = Init->isZeroValue() ? 
"" : CA->getAsCString().data(); + } + } + } + size_t SizeStr = strlen(S) + 1; + size_t Rem = SizeStr % DWORD_ALIGN; + size_t NSizeStr = 0; + if (Rem) { + NSizeStr = SizeStr + (DWORD_ALIGN - Rem); + } else { + NSizeStr = SizeStr; + } + if (S[0]) { + char *MyNewStr = new char[NSizeStr](); + strcpy(MyNewStr, S); + int NumInts = NSizeStr / 4; + int CharC = 0; + while (NumInts) { + int ANum = *(int *)(MyNewStr + CharC); + CharC += 4; + NumInts--; + Value *ANumV = ConstantInt::get(Int32Ty, ANum, false); + WhatToStore.push_back(ANumV); + } + delete[] MyNewStr; + } else { + // Empty string, give a hint to RT it is no NULL + Value *ANumV = ConstantInt::get(Int32Ty, 0xFFFFFF00, false); + WhatToStore.push_back(ANumV); + } + } else { + uint64_t Size = TD->getTypeAllocSizeInBits(ArgType); + assert((Size == 32 || Size == 64) && "unsupported size"); + Type *DstType = (Size == 32) ? Int32Ty : Int64Ty; + Arg = new PtrToIntInst(Arg, DstType, "PrintArgPtr", Brnch); + WhatToStore.push_back(Arg); + } + } else if (ArgType->getTypeID() == Type::VectorTyID) { + Type *IType = NULL; + uint32_t EleCount = cast(ArgType)->getNumElements(); + uint32_t EleSize = ArgType->getScalarSizeInBits(); + uint32_t TotalSize = EleCount * EleSize; + if (EleCount == 3) { + IntegerType *Int32Ty = Type::getInt32Ty(ArgType->getContext()); + Constant *Indices[4] = { + ConstantInt::get(Int32Ty, 0), ConstantInt::get(Int32Ty, 1), + ConstantInt::get(Int32Ty, 2), ConstantInt::get(Int32Ty, 2)}; + Constant *Mask = ConstantVector::get(Indices); + ShuffleVectorInst *Shuffle = new ShuffleVectorInst(Arg, Arg, Mask); + Shuffle->insertBefore(Brnch); + Arg = Shuffle; + ArgType = Arg->getType(); + TotalSize += EleSize; + } + switch (EleSize) { + default: + EleCount = TotalSize / 64; + IType = dyn_cast(Type::getInt64Ty(ArgType->getContext())); + break; + case 8: + if (EleCount >= 8) { + EleCount = TotalSize / 64; + IType = dyn_cast(Type::getInt64Ty(ArgType->getContext())); + } else if (EleCount >= 3) { + EleCount = 1; + IType = dyn_cast(Type::getInt32Ty(ArgType->getContext())); + } else { + EleCount = 1; + IType = dyn_cast(Type::getInt16Ty(ArgType->getContext())); + } + break; + case 16: + if (EleCount >= 3) { + EleCount = TotalSize / 64; + IType = dyn_cast(Type::getInt64Ty(ArgType->getContext())); + } else { + EleCount = 1; + IType = dyn_cast(Type::getInt32Ty(ArgType->getContext())); + } + break; + } + if (EleCount > 1) { + IType = dyn_cast(VectorType::get(IType, EleCount)); + } + Arg = new BitCastInst(Arg, IType, "PrintArgVect", Brnch); + WhatToStore.push_back(Arg); + } else { + WhatToStore.push_back(Arg); + } + for (unsigned I = 0, E = WhatToStore.size(); I != E; ++I) { + Value *TheBtCast = WhatToStore[I]; + unsigned ArgSize = + TD->getTypeAllocSizeInBits(TheBtCast->getType()) / 8; + SmallVector BuffOffset; + BuffOffset.push_back(ConstantInt::get(I32Ty, ArgSize)); + + Type *ArgPointer = PointerType::get(TheBtCast->getType(), 1); + Value *CastedGEP = + new BitCastInst(BufferIdx, ArgPointer, "PrintBuffPtrCast", Brnch); + StoreInst *StBuff = new StoreInst(TheBtCast, CastedGEP, Brnch); + LLVM_DEBUG(dbgs() << "inserting store to printf buffer:\n" + << *StBuff << '\n'); + (void)StBuff; + if (I + 1 == E && ArgCount + 1 == CI->getNumArgOperands()) + break; + BufferIdx = dyn_cast(GetElementPtrInst::Create( + nullptr, BufferIdx, BuffOffset, "PrintBuffNextPtr", Brnch)); + LLVM_DEBUG(dbgs() << "inserting gep to the printf buffer:\n" + << *BufferIdx << '\n'); + } + } + } + } + + // erase the printf calls + for (auto CI : Printfs) + CI->eraseFromParent(); + + 
Printfs.clear(); + return true; +} + +bool AMDGPUPrintfRuntimeBinding::runOnModule(Module &M) { + Triple TT(M.getTargetTriple()); + if (TT.getArch() == Triple::r600) + return false; + + auto PrintfFunction = M.getFunction("printf"); + if (!PrintfFunction) + return false; + + for (auto &U : PrintfFunction->uses()) { + if (auto *CI = dyn_cast(U.getUser())) { + if (CI->isCallee(&U)) + Printfs.push_back(CI); + } + } + + if (Printfs.empty()) + return false; + + TD = &M.getDataLayout(); + auto DTWP = getAnalysisIfAvailable(); + DT = DTWP ? &DTWP->getDomTree() : nullptr; + auto GetTLI = [this](Function &F) -> TargetLibraryInfo & { + return this->getAnalysis().getTLI(F); + }; + + return lowerPrintfForGpu(M, GetTLI); +} diff --git a/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp index e4c9d6685d4a..3e9dcca114a3 100644 --- a/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp +++ b/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp @@ -801,7 +801,7 @@ bool AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I, bool SufficientLDS) { GlobalVariable::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS); GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); - GV->setAlignment(I.getAlignment()); + GV->setAlignment(MaybeAlign(I.getAlignment())); Value *TCntY, *TCntZ; diff --git a/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index 815cbc5e26ee..4d78188b3dc3 100644 --- a/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -17,9 +17,9 @@ #include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "SIMachineFunctionInfo.h" #include "SIRegisterInfo.h" -#include "llvm/ADT/SmallSet.h" #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h" #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" +#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" #include "llvm/CodeGen/GlobalISel/RegisterBank.h" #include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h" #include "llvm/CodeGen/TargetRegisterInfo.h" @@ -33,6 +33,7 @@ #include "AMDGPUGenRegisterBankInfo.def" using namespace llvm; +using namespace MIPatternMatch; namespace { @@ -84,9 +85,11 @@ public: }; } -AMDGPURegisterBankInfo::AMDGPURegisterBankInfo(const TargetRegisterInfo &TRI) +AMDGPURegisterBankInfo::AMDGPURegisterBankInfo(const GCNSubtarget &ST) : AMDGPUGenRegisterBankInfo(), - TRI(static_cast(&TRI)) { + Subtarget(ST), + TRI(Subtarget.getRegisterInfo()), + TII(Subtarget.getInstrInfo()) { // HACK: Until this is fully tablegen'd. static bool AlreadyInit = false; @@ -163,11 +166,10 @@ unsigned AMDGPURegisterBankInfo::getBreakDownCost( const RegisterBank &AMDGPURegisterBankInfo::getRegBankFromRegClass( const TargetRegisterClass &RC) const { + if (&RC == &AMDGPU::SReg_1RegClass) + return AMDGPU::VCCRegBank; - if (TRI->isSGPRClass(&RC)) - return getRegBank(AMDGPU::SGPRRegBankID); - - return getRegBank(AMDGPU::VGPRRegBankID); + return TRI->isSGPRClass(&RC) ? AMDGPU::SGPRRegBank : AMDGPU::VGPRRegBank; } template @@ -192,7 +194,8 @@ AMDGPURegisterBankInfo::addMappingFromTable( Operands[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SizeI); } - unsigned MappingID = 0; + // getInstrMapping's default mapping uses ID 1, so start at 2. 
+ unsigned MappingID = 2; for (const auto &Entry : Table) { for (unsigned I = 0; I < NumOps; ++I) { int OpIdx = RegSrcOpIdx[I]; @@ -210,7 +213,7 @@ AMDGPURegisterBankInfo::addMappingFromTable( RegisterBankInfo::InstructionMappings AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsic( const MachineInstr &MI, const MachineRegisterInfo &MRI) const { - switch (MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID()) { + switch (MI.getIntrinsicID()) { case Intrinsic::amdgcn_readlane: { static const OpRegBankEntry<3> Table[2] = { // Perfectly legal. @@ -251,7 +254,7 @@ RegisterBankInfo::InstructionMappings AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsicWSideEffects( const MachineInstr &MI, const MachineRegisterInfo &MRI) const { - switch (MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID()) { + switch (MI.getIntrinsicID()) { case Intrinsic::amdgcn_buffer_load: { static const OpRegBankEntry<3> Table[4] = { // Perfectly legal. @@ -303,6 +306,7 @@ AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsicWSideEffects( } case Intrinsic::amdgcn_s_sendmsg: case Intrinsic::amdgcn_s_sendmsghalt: { + // FIXME: Should have no register for immediate static const OpRegBankEntry<1> Table[2] = { // Perfectly legal. { { AMDGPU::SGPRRegBankID }, 1 }, @@ -319,12 +323,15 @@ AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsicWSideEffects( } } -static bool isInstrUniform(const MachineInstr &MI) { +// FIXME: Returns uniform if there's no source value information. This is +// probably wrong. +static bool isInstrUniformNonExtLoadAlign4(const MachineInstr &MI) { if (!MI.hasOneMemOperand()) return false; const MachineMemOperand *MMO = *MI.memoperands_begin(); - return AMDGPUInstrInfo::isUniformMMO(MMO); + return MMO->getSize() >= 4 && MMO->getAlignment() >= 4 && + AMDGPUInstrInfo::isUniformMMO(MMO); } RegisterBankInfo::InstructionMappings @@ -337,6 +344,31 @@ AMDGPURegisterBankInfo::getInstrAlternativeMappings( InstructionMappings AltMappings; switch (MI.getOpcode()) { + case TargetOpcode::G_CONSTANT: { + unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); + if (Size == 1) { + static const OpRegBankEntry<1> Table[4] = { + { { AMDGPU::VGPRRegBankID }, 1 }, + { { AMDGPU::SGPRRegBankID }, 1 }, + { { AMDGPU::VCCRegBankID }, 1 }, + { { AMDGPU::SCCRegBankID }, 1 } + }; + + return addMappingFromTable<1>(MI, MRI, {{ 0 }}, Table); + } + + LLVM_FALLTHROUGH; + } + case TargetOpcode::G_FCONSTANT: + case TargetOpcode::G_FRAME_INDEX: + case TargetOpcode::G_GLOBAL_VALUE: { + static const OpRegBankEntry<1> Table[2] = { + { { AMDGPU::VGPRRegBankID }, 1 }, + { { AMDGPU::SGPRRegBankID }, 1 } + }; + + return addMappingFromTable<1>(MI, MRI, {{ 0 }}, Table); + } case TargetOpcode::G_AND: case TargetOpcode::G_OR: case TargetOpcode::G_XOR: { @@ -408,23 +440,29 @@ AMDGPURegisterBankInfo::getInstrAlternativeMappings( AltMappings.push_back(&VSMapping); break; } - case TargetOpcode::G_LOAD: { + case TargetOpcode::G_LOAD: + case TargetOpcode::G_ZEXTLOAD: + case TargetOpcode::G_SEXTLOAD: { unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); + LLT PtrTy = MRI.getType(MI.getOperand(1).getReg()); + unsigned PtrSize = PtrTy.getSizeInBits(); + unsigned AS = PtrTy.getAddressSpace(); LLT LoadTy = MRI.getType(MI.getOperand(0).getReg()); - // FIXME: Should we be hard coding the size for these mappings? 
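The new predicate above only reports a load as a scalar (SMRD) candidate when it is uniform, at least four bytes wide, and four-byte aligned; the callers below additionally rule out the LDS, region and private address spaces. A small sketch of that decision with hypothetical stand-ins for the memory-operand queries (illustrative only, not part of the patch):

// Sketch only: mirrors the conditions checked before offering an SGPR load mapping.
#include <cstdint>

enum class AddrSpace { Global, Local, Region, Private, Constant };  // hypothetical stand-in

struct MemAccess {          // hypothetical stand-in for the single MachineMemOperand
  uint64_t SizeInBytes;
  uint64_t AlignInBytes;
  bool IsUniform;           // corresponds to AMDGPUInstrInfo::isUniformMMO(MMO)
  AddrSpace AS;
};

static bool canUseScalarLoadMapping(const MemAccess &M) {
  if (M.AS == AddrSpace::Local || M.AS == AddrSpace::Region ||
      M.AS == AddrSpace::Private)
    return false;                                   // always VGPR-mapped spaces
  return M.SizeInBytes >= 4 && M.AlignInBytes >= 4 && M.IsUniform;
}

int main() {
  MemAccess A{8, 4, true, AddrSpace::Constant};
  MemAccess B{2, 2, true, AddrSpace::Constant};     // too small / under-aligned
  return canUseScalarLoadMapping(A) && !canUseScalarLoadMapping(B) ? 0 : 1;
}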
- if (isInstrUniform(MI)) { + if ((AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::REGION_ADDRESS && + AS != AMDGPUAS::PRIVATE_ADDRESS) && + isInstrUniformNonExtLoadAlign4(MI)) { const InstructionMapping &SSMapping = getInstructionMapping( 1, 1, getOperandsMapping( {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size), - AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 64)}), + AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize)}), 2); // Num Operands AltMappings.push_back(&SSMapping); } const InstructionMapping &VVMapping = getInstructionMapping( 2, 1, getOperandsMapping( - {AMDGPU::getValueMappingLoadSGPROnly(AMDGPU::VGPRRegBankID, LoadTy), - AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 64)}), + {AMDGPU::getValueMappingLoadSGPROnly(AMDGPU::VGPRRegBankID, LoadTy), + AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize)}), 2); // Num Operands AltMappings.push_back(&VVMapping); @@ -620,57 +658,53 @@ static LLT getHalfSizedType(LLT Ty) { /// /// There is additional complexity to try for compare values to identify the /// unique values used. -void AMDGPURegisterBankInfo::executeInWaterfallLoop( - MachineInstr &MI, MachineRegisterInfo &MRI, - ArrayRef OpIndices) const { - MachineFunction *MF = MI.getParent()->getParent(); - const GCNSubtarget &ST = MF->getSubtarget(); - const SIInstrInfo *TII = ST.getInstrInfo(); - MachineBasicBlock::iterator I(MI); - - MachineBasicBlock &MBB = *MI.getParent(); - const DebugLoc &DL = MI.getDebugLoc(); - - // Use a set to avoid extra readfirstlanes in the case where multiple operands - // are the same register. - SmallSet SGPROperandRegs; - for (unsigned Op : OpIndices) { - assert(MI.getOperand(Op).isUse()); - Register Reg = MI.getOperand(Op).getReg(); - const RegisterBank *OpBank = getRegBank(Reg, MRI, *TRI); - if (OpBank->getID() == AMDGPU::VGPRRegBankID) - SGPROperandRegs.insert(Reg); - } - - // No operands need to be replaced, so no need to loop. - if (SGPROperandRegs.empty()) - return; - - MachineIRBuilder B(MI); +bool AMDGPURegisterBankInfo::executeInWaterfallLoop( + MachineIRBuilder &B, + iterator_range Range, + SmallSet &SGPROperandRegs, + MachineRegisterInfo &MRI) const { SmallVector ResultRegs; SmallVector InitResultRegs; SmallVector PhiRegs; - for (MachineOperand &Def : MI.defs()) { - LLT ResTy = MRI.getType(Def.getReg()); - const RegisterBank *DefBank = getRegBank(Def.getReg(), MRI, *TRI); - ResultRegs.push_back(Def.getReg()); - Register InitReg = B.buildUndef(ResTy).getReg(0); - Register PhiReg = MRI.createGenericVirtualRegister(ResTy); - InitResultRegs.push_back(InitReg); - PhiRegs.push_back(PhiReg); - MRI.setRegBank(PhiReg, *DefBank); - MRI.setRegBank(InitReg, *DefBank); + + MachineBasicBlock &MBB = B.getMBB(); + MachineFunction *MF = &B.getMF(); + + const TargetRegisterClass *WaveRC = TRI->getWaveMaskRegClass(); + const unsigned WaveAndOpc = Subtarget.isWave32() ? + AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64; + const unsigned MovTermOpc = Subtarget.isWave32() ? + AMDGPU::S_MOV_B32_term : AMDGPU::S_MOV_B64_term; + const unsigned XorTermOpc = Subtarget.isWave32() ? + AMDGPU::S_XOR_B32_term : AMDGPU::S_XOR_B64_term; + const unsigned AndSaveExecOpc = Subtarget.isWave32() ? + AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64; + const unsigned ExecReg = Subtarget.isWave32() ? 
+ AMDGPU::EXEC_LO : AMDGPU::EXEC; + + for (MachineInstr &MI : Range) { + for (MachineOperand &Def : MI.defs()) { + LLT ResTy = MRI.getType(Def.getReg()); + const RegisterBank *DefBank = getRegBank(Def.getReg(), MRI, *TRI); + ResultRegs.push_back(Def.getReg()); + Register InitReg = B.buildUndef(ResTy).getReg(0); + Register PhiReg = MRI.createGenericVirtualRegister(ResTy); + InitResultRegs.push_back(InitReg); + PhiRegs.push_back(PhiReg); + MRI.setRegBank(PhiReg, *DefBank); + MRI.setRegBank(InitReg, *DefBank); + } } - Register SaveExecReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass); - Register InitSaveExecReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass); + Register SaveExecReg = MRI.createVirtualRegister(WaveRC); + Register InitSaveExecReg = MRI.createVirtualRegister(WaveRC); // Don't bother using generic instructions/registers for the exec mask. B.buildInstr(TargetOpcode::IMPLICIT_DEF) .addDef(InitSaveExecReg); - Register PhiExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); - Register NewExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); + Register PhiExec = MRI.createVirtualRegister(WaveRC); + Register NewExec = MRI.createVirtualRegister(WaveRC); // To insert the loop we need to split the block. Move everything before this // point to a new block, and insert a new empty block before this instruction. @@ -688,7 +722,7 @@ void AMDGPURegisterBankInfo::executeInWaterfallLoop( // Move the rest of the block into a new block. RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB); - RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end()); + RemainderBB->splice(RemainderBB->begin(), &MBB, Range.end(), MBB.end()); MBB.addSuccessor(LoopBB); RestoreExecBB->addSuccessor(RemainderBB); @@ -711,164 +745,173 @@ void AMDGPURegisterBankInfo::executeInWaterfallLoop( .addMBB(LoopBB); } - // Move the instruction into the loop. - LoopBB->splice(LoopBB->end(), &MBB, I); - I = std::prev(LoopBB->end()); + const DebugLoc &DL = B.getDL(); + + // Figure out the iterator range after splicing the instructions. + auto NewBegin = std::prev(LoopBB->end()); - B.setInstr(*I); + // Move the instruction into the loop. Note we moved everything after + // Range.end() already into a new block, so Range.end() is no longer valid. + LoopBB->splice(LoopBB->end(), &MBB, Range.begin(), MBB.end()); + + auto NewEnd = LoopBB->end(); + + MachineBasicBlock::iterator I = Range.begin(); + B.setInsertPt(*LoopBB, I); Register CondReg; - for (MachineOperand &Op : MI.uses()) { - if (!Op.isReg()) - continue; + for (MachineInstr &MI : make_range(NewBegin, NewEnd)) { + for (MachineOperand &Op : MI.uses()) { + if (!Op.isReg() || Op.isDef()) + continue; - assert(!Op.isDef()); - if (SGPROperandRegs.count(Op.getReg())) { - LLT OpTy = MRI.getType(Op.getReg()); - unsigned OpSize = OpTy.getSizeInBits(); - - // Can only do a readlane of 32-bit pieces. - if (OpSize == 32) { - // Avoid extra copies in the simple case of one 32-bit register. - Register CurrentLaneOpReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); - MRI.setType(CurrentLaneOpReg, OpTy); - - constrainGenericRegister(Op.getReg(), AMDGPU::VGPR_32RegClass, MRI); - // Read the next variant <- also loop target. 
- BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentLaneOpReg) - .addReg(Op.getReg()); - - Register NewCondReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); - bool First = CondReg == AMDGPU::NoRegister; - if (First) - CondReg = NewCondReg; - - // Compare the just read M0 value to all possible Idx values. - B.buildInstr(AMDGPU::V_CMP_EQ_U32_e64) - .addDef(NewCondReg) - .addReg(CurrentLaneOpReg) - .addReg(Op.getReg()); - Op.setReg(CurrentLaneOpReg); - - if (!First) { - Register AndReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass); - - // If there are multiple operands to consider, and the conditions. - B.buildInstr(AMDGPU::S_AND_B64) - .addDef(AndReg) - .addReg(NewCondReg) - .addReg(CondReg); - CondReg = AndReg; - } - } else { - LLT S32 = LLT::scalar(32); - SmallVector ReadlanePieces; + if (SGPROperandRegs.count(Op.getReg())) { + LLT OpTy = MRI.getType(Op.getReg()); + unsigned OpSize = OpTy.getSizeInBits(); - // The compares can be done as 64-bit, but the extract needs to be done - // in 32-bit pieces. + // Can only do a readlane of 32-bit pieces. + if (OpSize == 32) { + // Avoid extra copies in the simple case of one 32-bit register. + Register CurrentLaneOpReg + = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); + MRI.setType(CurrentLaneOpReg, OpTy); - bool Is64 = OpSize % 64 == 0; + constrainGenericRegister(Op.getReg(), AMDGPU::VGPR_32RegClass, MRI); + // Read the next variant <- also loop target. + BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), + CurrentLaneOpReg) + .addReg(Op.getReg()); - LLT UnmergeTy = OpSize % 64 == 0 ? LLT::scalar(64) : LLT::scalar(32); - unsigned CmpOp = OpSize % 64 == 0 ? AMDGPU::V_CMP_EQ_U64_e64 - : AMDGPU::V_CMP_EQ_U32_e64; + Register NewCondReg = MRI.createVirtualRegister(WaveRC); + bool First = CondReg == AMDGPU::NoRegister; + if (First) + CondReg = NewCondReg; - // The compares can be done as 64-bit, but the extract needs to be done - // in 32-bit pieces. + // Compare the just read M0 value to all possible Idx values. + B.buildInstr(AMDGPU::V_CMP_EQ_U32_e64) + .addDef(NewCondReg) + .addReg(CurrentLaneOpReg) + .addReg(Op.getReg()); + Op.setReg(CurrentLaneOpReg); - // Insert the unmerge before the loop. + if (!First) { + Register AndReg = MRI.createVirtualRegister(WaveRC); - B.setMBB(MBB); - auto Unmerge = B.buildUnmerge(UnmergeTy, Op.getReg()); - B.setInstr(*I); + // If there are multiple operands to consider, and the conditions. + B.buildInstr(WaveAndOpc) + .addDef(AndReg) + .addReg(NewCondReg) + .addReg(CondReg); + CondReg = AndReg; + } + } else { + LLT S32 = LLT::scalar(32); + SmallVector ReadlanePieces; - unsigned NumPieces = Unmerge->getNumOperands() - 1; - for (unsigned PieceIdx = 0; PieceIdx != NumPieces; ++PieceIdx) { - unsigned UnmergePiece = Unmerge.getReg(PieceIdx); + // The compares can be done as 64-bit, but the extract needs to be done + // in 32-bit pieces. - Register CurrentLaneOpReg; - if (Is64) { - Register CurrentLaneOpRegLo = MRI.createGenericVirtualRegister(S32); - Register CurrentLaneOpRegHi = MRI.createGenericVirtualRegister(S32); + bool Is64 = OpSize % 64 == 0; - MRI.setRegClass(UnmergePiece, &AMDGPU::VReg_64RegClass); - MRI.setRegClass(CurrentLaneOpRegLo, &AMDGPU::SReg_32_XM0RegClass); - MRI.setRegClass(CurrentLaneOpRegHi, &AMDGPU::SReg_32_XM0RegClass); + LLT UnmergeTy = OpSize % 64 == 0 ? LLT::scalar(64) : LLT::scalar(32); + unsigned CmpOp = OpSize % 64 == 0 ? AMDGPU::V_CMP_EQ_U64_e64 + : AMDGPU::V_CMP_EQ_U32_e64; - // Read the next variant <- also loop target. 
- BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), - CurrentLaneOpRegLo) - .addReg(UnmergePiece, 0, AMDGPU::sub0); + // The compares can be done as 64-bit, but the extract needs to be done + // in 32-bit pieces. - // Read the next variant <- also loop target. - BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), - CurrentLaneOpRegHi) - .addReg(UnmergePiece, 0, AMDGPU::sub1); + // Insert the unmerge before the loop. - CurrentLaneOpReg = - B.buildMerge(LLT::scalar(64), - {CurrentLaneOpRegLo, CurrentLaneOpRegHi}) - .getReg(0); + B.setMBB(MBB); + auto Unmerge = B.buildUnmerge(UnmergeTy, Op.getReg()); + B.setInstr(*I); - MRI.setRegClass(CurrentLaneOpReg, &AMDGPU::SReg_64_XEXECRegClass); + unsigned NumPieces = Unmerge->getNumOperands() - 1; + for (unsigned PieceIdx = 0; PieceIdx != NumPieces; ++PieceIdx) { + Register UnmergePiece = Unmerge.getReg(PieceIdx); - if (OpTy.getScalarSizeInBits() == 64) { - // If we need to produce a 64-bit element vector, so use the - // merged pieces - ReadlanePieces.push_back(CurrentLaneOpReg); + Register CurrentLaneOpReg; + if (Is64) { + Register CurrentLaneOpRegLo = MRI.createGenericVirtualRegister(S32); + Register CurrentLaneOpRegHi = MRI.createGenericVirtualRegister(S32); + + MRI.setRegClass(UnmergePiece, &AMDGPU::VReg_64RegClass); + MRI.setRegClass(CurrentLaneOpRegLo, &AMDGPU::SReg_32_XM0RegClass); + MRI.setRegClass(CurrentLaneOpRegHi, &AMDGPU::SReg_32_XM0RegClass); + + // Read the next variant <- also loop target. + BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), + CurrentLaneOpRegLo) + .addReg(UnmergePiece, 0, AMDGPU::sub0); + + // Read the next variant <- also loop target. + BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), + CurrentLaneOpRegHi) + .addReg(UnmergePiece, 0, AMDGPU::sub1); + + CurrentLaneOpReg = + B.buildMerge(LLT::scalar(64), + {CurrentLaneOpRegLo, CurrentLaneOpRegHi}) + .getReg(0); + + MRI.setRegClass(CurrentLaneOpReg, &AMDGPU::SReg_64_XEXECRegClass); + + if (OpTy.getScalarSizeInBits() == 64) { + // If we need to produce a 64-bit element vector, so use the + // merged pieces + ReadlanePieces.push_back(CurrentLaneOpReg); + } else { + // 32-bit element type. + ReadlanePieces.push_back(CurrentLaneOpRegLo); + ReadlanePieces.push_back(CurrentLaneOpRegHi); + } } else { - // 32-bit element type. - ReadlanePieces.push_back(CurrentLaneOpRegLo); - ReadlanePieces.push_back(CurrentLaneOpRegHi); + CurrentLaneOpReg = MRI.createGenericVirtualRegister(S32); + MRI.setRegClass(UnmergePiece, &AMDGPU::VGPR_32RegClass); + MRI.setRegClass(CurrentLaneOpReg, &AMDGPU::SReg_32_XM0RegClass); + + // Read the next variant <- also loop target. + BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), + CurrentLaneOpReg) + .addReg(UnmergePiece); + ReadlanePieces.push_back(CurrentLaneOpReg); } - } else { - CurrentLaneOpReg = MRI.createGenericVirtualRegister(LLT::scalar(32)); - MRI.setRegClass(UnmergePiece, &AMDGPU::VGPR_32RegClass); - MRI.setRegClass(CurrentLaneOpReg, &AMDGPU::SReg_32_XM0RegClass); - // Read the next variant <- also loop target. 
- BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), - CurrentLaneOpReg) - .addReg(UnmergePiece); - ReadlanePieces.push_back(CurrentLaneOpReg); - } + Register NewCondReg = MRI.createVirtualRegister(WaveRC); + bool First = CondReg == AMDGPU::NoRegister; + if (First) + CondReg = NewCondReg; - Register NewCondReg - = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass); - bool First = CondReg == AMDGPU::NoRegister; - if (First) - CondReg = NewCondReg; + B.buildInstr(CmpOp) + .addDef(NewCondReg) + .addReg(CurrentLaneOpReg) + .addReg(UnmergePiece); - B.buildInstr(CmpOp) - .addDef(NewCondReg) - .addReg(CurrentLaneOpReg) - .addReg(UnmergePiece); + if (!First) { + Register AndReg = MRI.createVirtualRegister(WaveRC); - if (!First) { - Register AndReg - = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass); + // If there are multiple operands to consider, and the conditions. + B.buildInstr(WaveAndOpc) + .addDef(AndReg) + .addReg(NewCondReg) + .addReg(CondReg); + CondReg = AndReg; + } + } - // If there are multiple operands to consider, and the conditions. - B.buildInstr(AMDGPU::S_AND_B64) - .addDef(AndReg) - .addReg(NewCondReg) - .addReg(CondReg); - CondReg = AndReg; + // FIXME: Build merge seems to switch to CONCAT_VECTORS but not + // BUILD_VECTOR + if (OpTy.isVector()) { + auto Merge = B.buildBuildVector(OpTy, ReadlanePieces); + Op.setReg(Merge.getReg(0)); + } else { + auto Merge = B.buildMerge(OpTy, ReadlanePieces); + Op.setReg(Merge.getReg(0)); } - } - // FIXME: Build merge seems to switch to CONCAT_VECTORS but not - // BUILD_VECTOR - if (OpTy.isVector()) { - auto Merge = B.buildBuildVector(OpTy, ReadlanePieces); - Op.setReg(Merge.getReg(0)); - } else { - auto Merge = B.buildMerge(OpTy, ReadlanePieces); - Op.setReg(Merge.getReg(0)); + MRI.setRegBank(Op.getReg(), getRegBank(AMDGPU::SGPRRegBankID)); } - - MRI.setRegBank(Op.getReg(), getRegBank(AMDGPU::SGPRRegBankID)); } } } @@ -876,16 +919,16 @@ void AMDGPURegisterBankInfo::executeInWaterfallLoop( B.setInsertPt(*LoopBB, LoopBB->end()); // Update EXEC, save the original EXEC value to VCC. - B.buildInstr(AMDGPU::S_AND_SAVEEXEC_B64) + B.buildInstr(AndSaveExecOpc) .addDef(NewExec) .addReg(CondReg, RegState::Kill); MRI.setSimpleHint(NewExec, CondReg); // Update EXEC, switch all done bits to 0 and all todo bits to 1. - B.buildInstr(AMDGPU::S_XOR_B64_term) - .addDef(AMDGPU::EXEC) - .addReg(AMDGPU::EXEC) + B.buildInstr(XorTermOpc) + .addDef(ExecReg) + .addReg(ExecReg) .addReg(NewExec); // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use @@ -896,14 +939,60 @@ void AMDGPURegisterBankInfo::executeInWaterfallLoop( .addMBB(LoopBB); // Save the EXEC mask before the loop. - BuildMI(MBB, MBB.end(), DL, TII->get(AMDGPU::S_MOV_B64_term), SaveExecReg) - .addReg(AMDGPU::EXEC); + BuildMI(MBB, MBB.end(), DL, TII->get(MovTermOpc), SaveExecReg) + .addReg(ExecReg); // Restore the EXEC mask after the loop. B.setMBB(*RestoreExecBB); - B.buildInstr(AMDGPU::S_MOV_B64_term) - .addDef(AMDGPU::EXEC) + B.buildInstr(MovTermOpc) + .addDef(ExecReg) .addReg(SaveExecReg); + + // Restore the insert point before the original instruction. + B.setInsertPt(MBB, MBB.end()); + + return true; +} + +// Return any unique registers used by \p MI at \p OpIndices that need to be +// handled in a waterfall loop. Returns these registers in \p +// SGPROperandRegs. Returns true if there are any operansd to handle and a +// waterfall loop is necessary. 
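The loop built above is the usual waterfall pattern: read the first active lane's value with V_READFIRSTLANE, compare it against the VGPR across all lanes, restrict EXEC to the matching lanes with S_AND_SAVEEXEC, run the instruction, then XOR those lanes out of EXEC and branch back until none remain (the wave32/wave64 opcode variants are chosen from Subtarget.isWave32() above). A scalar sketch of that idea, using plain integers for per-lane values and a 64-bit mask for EXEC (illustration only, not a model of the generated MIR):

// Sketch only, not upstream code.
#include <cstdint>
#include <cstdio>
#include <vector>

static void waterfall(const std::vector<uint32_t> &LaneValues) {
  uint64_t Exec =
      LaneValues.size() >= 64 ? ~0ull : ((1ull << LaneValues.size()) - 1);
  while (Exec) {
    // V_READFIRSTLANE: take the value held by the lowest active lane.
    unsigned FirstLane = 0;
    while (!((Exec >> FirstLane) & 1))
      ++FirstLane;
    uint32_t Uniform = LaneValues[FirstLane];

    // V_CMP_EQ + S_AND_SAVEEXEC: this trip runs for every lane that holds the
    // same value, so each distinct value is processed exactly once.
    uint64_t Match = 0;
    for (unsigned L = 0; L < LaneValues.size(); ++L)
      if (((Exec >> L) & 1) && LaneValues[L] == Uniform)
        Match |= 1ull << L;

    std::printf("run with uniform value %u, lane mask 0x%llx\n", Uniform,
                (unsigned long long)Match);

    // S_XOR_term: drop the handled lanes from EXEC and loop while any remain.
    Exec &= ~Match;
  }
}

int main() { waterfall({7, 7, 3, 7, 3, 9}); }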
+bool AMDGPURegisterBankInfo::collectWaterfallOperands( + SmallSet &SGPROperandRegs, MachineInstr &MI, + MachineRegisterInfo &MRI, ArrayRef OpIndices) const { + for (unsigned Op : OpIndices) { + assert(MI.getOperand(Op).isUse()); + Register Reg = MI.getOperand(Op).getReg(); + const RegisterBank *OpBank = getRegBank(Reg, MRI, *TRI); + if (OpBank->getID() == AMDGPU::VGPRRegBankID) + SGPROperandRegs.insert(Reg); + } + + // No operands need to be replaced, so no need to loop. + return !SGPROperandRegs.empty(); +} + +bool AMDGPURegisterBankInfo::executeInWaterfallLoop( + MachineIRBuilder &B, MachineInstr &MI, MachineRegisterInfo &MRI, + ArrayRef OpIndices) const { + // Use a set to avoid extra readfirstlanes in the case where multiple operands + // are the same register. + SmallSet SGPROperandRegs; + + if (!collectWaterfallOperands(SGPROperandRegs, MI, MRI, OpIndices)) + return false; + + MachineBasicBlock::iterator I = MI.getIterator(); + return executeInWaterfallLoop(B, make_range(I, std::next(I)), + SGPROperandRegs, MRI); +} + +bool AMDGPURegisterBankInfo::executeInWaterfallLoop( + MachineInstr &MI, MachineRegisterInfo &MRI, + ArrayRef OpIndices) const { + MachineIRBuilder B(MI); + return executeInWaterfallLoop(B, MI, MRI, OpIndices); } // Legalize an operand that must be an SGPR by inserting a readfirstlane. @@ -960,8 +1049,13 @@ bool AMDGPURegisterBankInfo::applyMappingWideLoad(MachineInstr &MI, SmallVector SrcRegs(OpdMapper.getVRegs(1)); // If the pointer is an SGPR, we have nothing to do. - if (SrcRegs.empty()) - return false; + if (SrcRegs.empty()) { + Register PtrReg = MI.getOperand(1).getReg(); + const RegisterBank *PtrBank = getRegBank(PtrReg, MRI, *TRI); + if (PtrBank == &AMDGPU::SGPRRegBank) + return false; + SrcRegs.push_back(PtrReg); + } assert(LoadSize % MaxNonSmrdLoadSize == 0); @@ -1013,6 +1107,33 @@ bool AMDGPURegisterBankInfo::applyMappingWideLoad(MachineInstr &MI, return true; } +bool AMDGPURegisterBankInfo::applyMappingImage( + MachineInstr &MI, const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper, + MachineRegisterInfo &MRI, int RsrcIdx) const { + const int NumDefs = MI.getNumExplicitDefs(); + + // The reported argument index is relative to the IR intrinsic call arguments, + // so we need to shift by the number of defs and the intrinsic ID. + RsrcIdx += NumDefs + 1; + + // Insert copies to VGPR arguments. + applyDefaultMapping(OpdMapper); + + // Fixup any SGPR arguments. + SmallVector SGPRIndexes; + for (int I = NumDefs, NumOps = MI.getNumOperands(); I != NumOps; ++I) { + if (!MI.getOperand(I).isReg()) + continue; + + // If this intrinsic has a sampler, it immediately follows rsrc. + if (I == RsrcIdx || I == RsrcIdx + 1) + SGPRIndexes.push_back(I); + } + + executeInWaterfallLoop(MI, MRI, SGPRIndexes); + return true; +} + // For cases where only a single copy is inserted for matching register banks. // Replace the register in the instruction operand static void substituteSimpleCopyRegs( @@ -1024,6 +1145,184 @@ static void substituteSimpleCopyRegs( } } +/// Handle register layout difference for f16 images for some subtargets. 
+Register AMDGPURegisterBankInfo::handleD16VData(MachineIRBuilder &B, + MachineRegisterInfo &MRI, + Register Reg) const { + if (!Subtarget.hasUnpackedD16VMem()) + return Reg; + + const LLT S16 = LLT::scalar(16); + LLT StoreVT = MRI.getType(Reg); + if (!StoreVT.isVector() || StoreVT.getElementType() != S16) + return Reg; + + auto Unmerge = B.buildUnmerge(S16, Reg); + + + SmallVector WideRegs; + for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) + WideRegs.push_back(Unmerge.getReg(I)); + + const LLT S32 = LLT::scalar(32); + int NumElts = StoreVT.getNumElements(); + + return B.buildMerge(LLT::vector(NumElts, S32), WideRegs).getReg(0); +} + +static std::pair +getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg) { + int64_t Const; + if (mi_match(Reg, MRI, m_ICst(Const))) + return std::make_pair(Register(), Const); + + Register Base; + if (mi_match(Reg, MRI, m_GAdd(m_Reg(Base), m_ICst(Const)))) + return std::make_pair(Base, Const); + + // TODO: Handle G_OR used for add case + return std::make_pair(Reg, 0); +} + +std::pair +AMDGPURegisterBankInfo::splitBufferOffsets(MachineIRBuilder &B, + Register OrigOffset) const { + const unsigned MaxImm = 4095; + Register BaseReg; + unsigned ImmOffset; + const LLT S32 = LLT::scalar(32); + + std::tie(BaseReg, ImmOffset) = getBaseWithConstantOffset(*B.getMRI(), + OrigOffset); + + unsigned C1 = 0; + if (ImmOffset != 0) { + // If the immediate value is too big for the immoffset field, put the value + // and -4096 into the immoffset field so that the value that is copied/added + // for the voffset field is a multiple of 4096, and it stands more chance + // of being CSEd with the copy/add for another similar load/store. + // However, do not do that rounding down to a multiple of 4096 if that is a + // negative number, as it appears to be illegal to have a negative offset + // in the vgpr, even if adding the immediate offset makes it positive. + unsigned Overflow = ImmOffset & ~MaxImm; + ImmOffset -= Overflow; + if ((int32_t)Overflow < 0) { + Overflow += ImmOffset; + ImmOffset = 0; + } + + C1 = ImmOffset; + if (Overflow != 0) { + if (!BaseReg) + BaseReg = B.buildConstant(S32, Overflow).getReg(0); + else { + auto OverflowVal = B.buildConstant(S32, Overflow); + BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0); + } + } + } + + if (!BaseReg) + BaseReg = B.buildConstant(S32, 0).getReg(0); + + return {BaseReg, C1}; +} + +static bool isZero(Register Reg, MachineRegisterInfo &MRI) { + int64_t C; + return mi_match(Reg, MRI, m_ICst(C)) && C == 0; +} + +static unsigned extractGLC(unsigned CachePolicy) { + return CachePolicy & 1; +} + +static unsigned extractSLC(unsigned CachePolicy) { + return (CachePolicy >> 1) & 1; +} + +static unsigned extractDLC(unsigned CachePolicy) { + return (CachePolicy >> 2) & 1; +} + +MachineInstr * +AMDGPURegisterBankInfo::selectStoreIntrinsic(MachineIRBuilder &B, + MachineInstr &MI) const { + MachineRegisterInfo &MRI = *B.getMRI(); + executeInWaterfallLoop(B, MI, MRI, {2, 4}); + + // FIXME: DAG lowering brokenly changes opcode based on FP vs. integer. + + Register VData = MI.getOperand(1).getReg(); + LLT Ty = MRI.getType(VData); + + int EltSize = Ty.getScalarSizeInBits(); + int Size = Ty.getSizeInBits(); + + // FIXME: Broken integer truncstore. + if (EltSize != 32) + report_fatal_error("unhandled intrinsic store"); + + // FIXME: Verifier should enforce 1 MMO for these intrinsics. 
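splitBufferOffsets above splits a constant buffer offset so that at most 4095 lands in the MUBUF immoffset field while the 4096-aligned remainder is materialized into the voffset register, except that a would-be-negative remainder is folded back so the register part never goes negative. The same arithmetic on plain integers, as a sketch (helper name is illustrative, not part of the patch):

// Sketch only: the constant-offset split performed by splitBufferOffsets.
#include <cassert>
#include <cstdint>
#include <utility>

static std::pair<unsigned, unsigned> splitConstOffset(unsigned ImmOffset) {
  const unsigned MaxImm = 4095;
  unsigned Overflow = ImmOffset & ~MaxImm;  // part that cannot go in immoffset
  ImmOffset -= Overflow;
  if ((int32_t)Overflow < 0) {              // never round a negative value down
    Overflow += ImmOffset;
    ImmOffset = 0;
  }
  return {Overflow, ImmOffset};             // {added to voffset, immoffset field}
}

int main() {
  assert(splitConstOffset(100) == std::make_pair(0u, 100u));
  assert(splitConstOffset(5000) == std::make_pair(4096u, 904u));
  assert(splitConstOffset(0x80000004u) == std::make_pair(0x80000004u, 0u));
  return 0;
}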
+ const int MemSize = (*MI.memoperands_begin())->getSize(); + + + Register RSrc = MI.getOperand(2).getReg(); + Register VOffset = MI.getOperand(3).getReg(); + Register SOffset = MI.getOperand(4).getReg(); + unsigned CachePolicy = MI.getOperand(5).getImm(); + + unsigned ImmOffset; + std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset); + + const bool Offen = !isZero(VOffset, MRI); + + unsigned Opc = AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact; + switch (8 * MemSize) { + case 8: + Opc = Offen ? AMDGPU::BUFFER_STORE_BYTE_OFFEN_exact : + AMDGPU::BUFFER_STORE_BYTE_OFFSET_exact; + break; + case 16: + Opc = Offen ? AMDGPU::BUFFER_STORE_SHORT_OFFEN_exact : + AMDGPU::BUFFER_STORE_SHORT_OFFSET_exact; + break; + default: + Opc = Offen ? AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact : + AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact; + if (Size > 32) + Opc = AMDGPU::getMUBUFOpcode(Opc, Size / 32); + break; + } + + + // Set the insertion point back to the instruction in case it was moved into a + // loop. + B.setInstr(MI); + + MachineInstrBuilder MIB = B.buildInstr(Opc) + .addUse(VData); + + if (Offen) + MIB.addUse(VOffset); + + MIB.addUse(RSrc) + .addUse(SOffset) + .addImm(ImmOffset) + .addImm(extractGLC(CachePolicy)) + .addImm(extractSLC(CachePolicy)) + .addImm(0) // tfe: FIXME: Remove from inst + .addImm(extractDLC(CachePolicy)) + .cloneMemRefs(MI); + + // FIXME: We need a way to report failure from applyMappingImpl. + // Insert constrain copies before inserting the loop. + if (!constrainSelectedInstRegOperands(*MIB, *TII, *TRI, *this)) + report_fatal_error("failed to constrain selected store intrinsic"); + + return MIB; +} + void AMDGPURegisterBankInfo::applyMappingImpl( const OperandsMapper &OpdMapper) const { MachineInstr &MI = OpdMapper.getMI(); @@ -1289,12 +1588,202 @@ void AMDGPURegisterBankInfo::applyMappingImpl( MI.eraseFromParent(); return; } - case AMDGPU::G_EXTRACT_VECTOR_ELT: - applyDefaultMapping(OpdMapper); - executeInWaterfallLoop(MI, MRI, { 2 }); + case AMDGPU::G_BUILD_VECTOR: + case AMDGPU::G_BUILD_VECTOR_TRUNC: { + Register DstReg = MI.getOperand(0).getReg(); + LLT DstTy = MRI.getType(DstReg); + if (DstTy != LLT::vector(2, 16)) + break; + + assert(MI.getNumOperands() == 3 && OpdMapper.getVRegs(0).empty()); + substituteSimpleCopyRegs(OpdMapper, 1); + substituteSimpleCopyRegs(OpdMapper, 2); + + const RegisterBank *DstBank = getRegBank(DstReg, MRI, *TRI); + if (DstBank == &AMDGPU::SGPRRegBank) + break; // Can use S_PACK_* instructions. 
+ + MachineIRBuilder B(MI); + + Register Lo = MI.getOperand(1).getReg(); + Register Hi = MI.getOperand(2).getReg(); + const LLT S32 = LLT::scalar(32); + + const RegisterBank *BankLo = getRegBank(Lo, MRI, *TRI); + const RegisterBank *BankHi = getRegBank(Hi, MRI, *TRI); + + Register ZextLo; + Register ShiftHi; + + if (Opc == AMDGPU::G_BUILD_VECTOR) { + ZextLo = B.buildZExt(S32, Lo).getReg(0); + MRI.setRegBank(ZextLo, *BankLo); + + Register ZextHi = B.buildZExt(S32, Hi).getReg(0); + MRI.setRegBank(ZextHi, *BankHi); + + auto ShiftAmt = B.buildConstant(S32, 16); + MRI.setRegBank(ShiftAmt.getReg(0), *BankHi); + + ShiftHi = B.buildShl(S32, ZextHi, ShiftAmt).getReg(0); + MRI.setRegBank(ShiftHi, *BankHi); + } else { + Register MaskLo = B.buildConstant(S32, 0xffff).getReg(0); + MRI.setRegBank(MaskLo, *BankLo); + + auto ShiftAmt = B.buildConstant(S32, 16); + MRI.setRegBank(ShiftAmt.getReg(0), *BankHi); + + ShiftHi = B.buildShl(S32, Hi, ShiftAmt).getReg(0); + MRI.setRegBank(ShiftHi, *BankHi); + + ZextLo = B.buildAnd(S32, Lo, MaskLo).getReg(0); + MRI.setRegBank(ZextLo, *BankLo); + } + + auto Or = B.buildOr(S32, ZextLo, ShiftHi); + MRI.setRegBank(Or.getReg(0), *DstBank); + + B.buildBitcast(DstReg, Or); + MI.eraseFromParent(); + return; + } + case AMDGPU::G_EXTRACT_VECTOR_ELT: { + SmallVector DstRegs(OpdMapper.getVRegs(0)); + + assert(OpdMapper.getVRegs(1).empty() && OpdMapper.getVRegs(2).empty()); + + if (DstRegs.empty()) { + applyDefaultMapping(OpdMapper); + executeInWaterfallLoop(MI, MRI, { 2 }); + return; + } + + Register DstReg = MI.getOperand(0).getReg(); + Register SrcReg = MI.getOperand(1).getReg(); + Register IdxReg = MI.getOperand(2).getReg(); + LLT DstTy = MRI.getType(DstReg); + (void)DstTy; + + assert(DstTy.getSizeInBits() == 64); + + LLT SrcTy = MRI.getType(SrcReg); + const LLT S32 = LLT::scalar(32); + LLT Vec32 = LLT::vector(2 * SrcTy.getNumElements(), 32); + + MachineIRBuilder B(MI); + auto CastSrc = B.buildBitcast(Vec32, SrcReg); + auto One = B.buildConstant(S32, 1); + + // Split the vector index into 32-bit pieces. Prepare to move all of the + // new instructions into a waterfall loop if necessary. + // + // Don't put the bitcast or constant in the loop. + MachineInstrSpan Span(MachineBasicBlock::iterator(&MI), &B.getMBB()); + + // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1). + auto IdxLo = B.buildShl(S32, IdxReg, One); + auto IdxHi = B.buildAdd(S32, IdxLo, One); + B.buildExtractVectorElement(DstRegs[0], CastSrc, IdxLo); + B.buildExtractVectorElement(DstRegs[1], CastSrc, IdxHi); + + const ValueMapping &DstMapping + = OpdMapper.getInstrMapping().getOperandMapping(0); + + // FIXME: Should be getting from mapping or not? + const RegisterBank *SrcBank = getRegBank(SrcReg, MRI, *TRI); + MRI.setRegBank(DstReg, *DstMapping.BreakDown[0].RegBank); + MRI.setRegBank(CastSrc.getReg(0), *SrcBank); + MRI.setRegBank(One.getReg(0), AMDGPU::SGPRRegBank); + MRI.setRegBank(IdxLo.getReg(0), AMDGPU::SGPRRegBank); + MRI.setRegBank(IdxHi.getReg(0), AMDGPU::SGPRRegBank); + + SmallSet OpsToWaterfall; + if (!collectWaterfallOperands(OpsToWaterfall, MI, MRI, { 2 })) { + MI.eraseFromParent(); + return; + } + + // Remove the original instruction to avoid potentially confusing the + // waterfall loop logic. 
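For a VGPR-banked <2 x s16> G_BUILD_VECTOR or G_BUILD_VECTOR_TRUNC the code above cannot use S_PACK_*, so it widens the halves to 32 bits, shifts the high half into the upper half-word and ORs the pieces together before the final bitcast. A plain-integer sketch of that packing (helper names are illustrative, not part of the patch):

// Sketch only: the bit arithmetic of the VGPR lowering above.
#include <cassert>
#include <cstdint>

static uint32_t packV2I16(uint16_t Lo, uint16_t Hi) {
  // G_BUILD_VECTOR form: zero-extend both halves, shift Hi left by 16, then OR.
  return uint32_t(Lo) | (uint32_t(Hi) << 16);
}

static uint32_t packV2I16Trunc(uint32_t Lo, uint32_t Hi) {
  // G_BUILD_VECTOR_TRUNC form: mask the low half instead of zero-extending it;
  // the shift discards the high bits of Hi by itself.
  return (Lo & 0xffffu) | (Hi << 16);
}

int main() {
  assert(packV2I16(0x1234, 0xabcd) == 0xabcd1234u);
  assert(packV2I16Trunc(0xdead1234u, 0xffffabcdu) == 0xabcd1234u);
  return 0;
}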
+ B.setInstr(*Span.begin()); + MI.eraseFromParent(); + executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()), + OpsToWaterfall, MRI); + return; + } + case AMDGPU::G_INSERT_VECTOR_ELT: { + SmallVector InsRegs(OpdMapper.getVRegs(2)); + + assert(OpdMapper.getVRegs(0).empty()); + assert(OpdMapper.getVRegs(1).empty()); + assert(OpdMapper.getVRegs(3).empty()); + + if (InsRegs.empty()) { + applyDefaultMapping(OpdMapper); + executeInWaterfallLoop(MI, MRI, { 3 }); + return; + } + + Register DstReg = MI.getOperand(0).getReg(); + Register SrcReg = MI.getOperand(1).getReg(); + Register InsReg = MI.getOperand(2).getReg(); + Register IdxReg = MI.getOperand(3).getReg(); + LLT SrcTy = MRI.getType(SrcReg); + LLT InsTy = MRI.getType(InsReg); + (void)InsTy; + + assert(InsTy.getSizeInBits() == 64); + + const LLT S32 = LLT::scalar(32); + LLT Vec32 = LLT::vector(2 * SrcTy.getNumElements(), 32); + + MachineIRBuilder B(MI); + auto CastSrc = B.buildBitcast(Vec32, SrcReg); + auto One = B.buildConstant(S32, 1); + + // Split the vector index into 32-bit pieces. Prepare to move all of the + // new instructions into a waterfall loop if necessary. + // + // Don't put the bitcast or constant in the loop. + MachineInstrSpan Span(MachineBasicBlock::iterator(&MI), &B.getMBB()); + + // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1). + auto IdxLo = B.buildShl(S32, IdxReg, One); + auto IdxHi = B.buildAdd(S32, IdxLo, One); + + auto InsLo = B.buildInsertVectorElement(Vec32, CastSrc, InsRegs[0], IdxLo); + auto InsHi = B.buildInsertVectorElement(Vec32, InsLo, InsRegs[1], IdxHi); + B.buildBitcast(DstReg, InsHi); + + const RegisterBank *DstBank = getRegBank(DstReg, MRI, *TRI); + const RegisterBank *SrcBank = getRegBank(SrcReg, MRI, *TRI); + const RegisterBank *InsSrcBank = getRegBank(InsReg, MRI, *TRI); + + MRI.setRegBank(InsReg, *InsSrcBank); + MRI.setRegBank(CastSrc.getReg(0), *SrcBank); + MRI.setRegBank(InsLo.getReg(0), *DstBank); + MRI.setRegBank(InsHi.getReg(0), *DstBank); + MRI.setRegBank(One.getReg(0), AMDGPU::SGPRRegBank); + MRI.setRegBank(IdxLo.getReg(0), AMDGPU::SGPRRegBank); + MRI.setRegBank(IdxHi.getReg(0), AMDGPU::SGPRRegBank); + + + SmallSet OpsToWaterfall; + if (!collectWaterfallOperands(OpsToWaterfall, MI, MRI, { 3 })) { + MI.eraseFromParent(); + return; + } + + B.setInstr(*Span.begin()); + MI.eraseFromParent(); + + executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()), + OpsToWaterfall, MRI); return; + } case AMDGPU::G_INTRINSIC: { - switch (MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID()) { + switch (MI.getIntrinsicID()) { case Intrinsic::amdgcn_s_buffer_load: { // FIXME: Move to G_INTRINSIC_W_SIDE_EFFECTS executeInWaterfallLoop(MI, MRI, { 2, 3 }); @@ -1303,8 +1792,8 @@ void AMDGPURegisterBankInfo::applyMappingImpl( case Intrinsic::amdgcn_readlane: { substituteSimpleCopyRegs(OpdMapper, 2); - assert(empty(OpdMapper.getVRegs(0))); - assert(empty(OpdMapper.getVRegs(3))); + assert(OpdMapper.getVRegs(0).empty()); + assert(OpdMapper.getVRegs(3).empty()); // Make sure the index is an SGPR. It doesn't make sense to run this in a // waterfall loop, so assume it's a uniform value. 
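The G_EXTRACT_VECTOR_ELT and G_INSERT_VECTOR_ELT paths above handle 64-bit elements by bitcasting to a vector of twice as many 32-bit elements and addressing the pair (2*Idx, 2*Idx + 1). A plain-memory sketch of that index arithmetic (container and helper names are illustrative, not part of the patch; low halves stored first):

// Sketch only: index doubling for 64-bit vector elements.
#include <cassert>
#include <cstdint>
#include <vector>

static uint64_t extract64(const std::vector<uint32_t> &Vec32, unsigned Idx) {
  unsigned IdxLo = Idx << 1;      // 2 * Idx
  unsigned IdxHi = IdxLo + 1;     // 2 * Idx + 1
  return uint64_t(Vec32[IdxLo]) | (uint64_t(Vec32[IdxHi]) << 32);
}

static void insert64(std::vector<uint32_t> &Vec32, unsigned Idx, uint64_t Val) {
  unsigned IdxLo = Idx << 1;
  unsigned IdxHi = IdxLo + 1;
  Vec32[IdxLo] = uint32_t(Val);
  Vec32[IdxHi] = uint32_t(Val >> 32);
}

int main() {
  // A <2 x s64> vector viewed as <4 x s32>.
  std::vector<uint32_t> V{0x11111111u, 0x22222222u, 0x33333333u, 0x44444444u};
  assert(extract64(V, 1) == 0x4444444433333333ull);
  insert64(V, 0, 0xaaaaaaaabbbbbbbbull);
  assert(V[0] == 0xbbbbbbbbu && V[1] == 0xaaaaaaaau);
  return 0;
}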
@@ -1312,9 +1801,9 @@ void AMDGPURegisterBankInfo::applyMappingImpl( return; } case Intrinsic::amdgcn_writelane: { - assert(empty(OpdMapper.getVRegs(0))); - assert(empty(OpdMapper.getVRegs(2))); - assert(empty(OpdMapper.getVRegs(3))); + assert(OpdMapper.getVRegs(0).empty()); + assert(OpdMapper.getVRegs(2).empty()); + assert(OpdMapper.getVRegs(3).empty()); substituteSimpleCopyRegs(OpdMapper, 4); // VGPR input val constrainOpWithReadfirstlane(MI, MRI, 2); // Source value @@ -1327,7 +1816,8 @@ void AMDGPURegisterBankInfo::applyMappingImpl( break; } case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: { - switch (MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID()) { + auto IntrID = MI.getIntrinsicID(); + switch (IntrID) { case Intrinsic::amdgcn_buffer_load: { executeInWaterfallLoop(MI, MRI, { 2 }); return; @@ -1335,23 +1825,70 @@ void AMDGPURegisterBankInfo::applyMappingImpl( case Intrinsic::amdgcn_ds_ordered_add: case Intrinsic::amdgcn_ds_ordered_swap: { // This is only allowed to execute with 1 lane, so readfirstlane is safe. - assert(empty(OpdMapper.getVRegs(0))); + assert(OpdMapper.getVRegs(0).empty()); substituteSimpleCopyRegs(OpdMapper, 3); constrainOpWithReadfirstlane(MI, MRI, 2); // M0 return; } + case Intrinsic::amdgcn_ds_gws_init: + case Intrinsic::amdgcn_ds_gws_barrier: + case Intrinsic::amdgcn_ds_gws_sema_br: { + // Only the first lane is executes, so readfirstlane is safe. + substituteSimpleCopyRegs(OpdMapper, 1); + constrainOpWithReadfirstlane(MI, MRI, 2); // M0 + return; + } + case Intrinsic::amdgcn_ds_gws_sema_v: + case Intrinsic::amdgcn_ds_gws_sema_p: + case Intrinsic::amdgcn_ds_gws_sema_release_all: { + // Only the first lane is executes, so readfirstlane is safe. + constrainOpWithReadfirstlane(MI, MRI, 1); // M0 + return; + } case Intrinsic::amdgcn_s_sendmsg: case Intrinsic::amdgcn_s_sendmsghalt: { // FIXME: Should this use a waterfall loop? constrainOpWithReadfirstlane(MI, MRI, 2); // M0 return; } - default: + case Intrinsic::amdgcn_raw_buffer_load: + case Intrinsic::amdgcn_raw_buffer_load_format: + case Intrinsic::amdgcn_raw_tbuffer_load: + case Intrinsic::amdgcn_raw_buffer_store: + case Intrinsic::amdgcn_raw_buffer_store_format: + case Intrinsic::amdgcn_raw_tbuffer_store: { + applyDefaultMapping(OpdMapper); + executeInWaterfallLoop(MI, MRI, {2, 4}); + return; + } + case Intrinsic::amdgcn_struct_buffer_load: + case Intrinsic::amdgcn_struct_buffer_store: + case Intrinsic::amdgcn_struct_tbuffer_load: + case Intrinsic::amdgcn_struct_tbuffer_store: { + applyDefaultMapping(OpdMapper); + executeInWaterfallLoop(MI, MRI, {2, 5}); + return; + } + default: { + if (const AMDGPU::RsrcIntrinsic *RSrcIntrin = + AMDGPU::lookupRsrcIntrinsic(IntrID)) { + // Non-images can have complications from operands that allow both SGPR + // and VGPR. For now it's too complicated to figure out the final opcode + // to derive the register bank from the MCInstrDesc. 
+ if (RSrcIntrin->IsImage) { + applyMappingImage(MI, OpdMapper, MRI, RSrcIntrin->RsrcArg); + return; + } + } + break; } + } break; } - case AMDGPU::G_LOAD: { + case AMDGPU::G_LOAD: + case AMDGPU::G_ZEXTLOAD: + case AMDGPU::G_SEXTLOAD: { if (applyMappingWideLoad(MI, OpdMapper, MRI)) return; break; @@ -1451,26 +1988,72 @@ AMDGPURegisterBankInfo::getDefaultMappingAllVGPR(const MachineInstr &MI) const { MI.getNumOperands()); } +const RegisterBankInfo::InstructionMapping & +AMDGPURegisterBankInfo::getImageMapping(const MachineRegisterInfo &MRI, + const MachineInstr &MI, + int RsrcIdx) const { + // The reported argument index is relative to the IR intrinsic call arguments, + // so we need to shift by the number of defs and the intrinsic ID. + RsrcIdx += MI.getNumExplicitDefs() + 1; + + const int NumOps = MI.getNumOperands(); + SmallVector OpdsMapping(NumOps); + + // TODO: Should packed/unpacked D16 difference be reported here as part of + // the value mapping? + for (int I = 0; I != NumOps; ++I) { + if (!MI.getOperand(I).isReg()) + continue; + + Register OpReg = MI.getOperand(I).getReg(); + unsigned Size = getSizeInBits(OpReg, MRI, *TRI); + + // FIXME: Probably need a new intrinsic register bank searchable table to + // handle arbitrary intrinsics easily. + // + // If this has a sampler, it immediately follows rsrc. + const bool MustBeSGPR = I == RsrcIdx || I == RsrcIdx + 1; + + if (MustBeSGPR) { + // If this must be an SGPR, so we must report whatever it is as legal. + unsigned NewBank = getRegBankID(OpReg, MRI, *TRI, AMDGPU::SGPRRegBankID); + OpdsMapping[I] = AMDGPU::getValueMapping(NewBank, Size); + } else { + // Some operands must be VGPR, and these are easy to copy to. + OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); + } + } + + return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping), NumOps); +} + const RegisterBankInfo::InstructionMapping & AMDGPURegisterBankInfo::getInstrMappingForLoad(const MachineInstr &MI) const { const MachineFunction &MF = *MI.getParent()->getParent(); const MachineRegisterInfo &MRI = MF.getRegInfo(); - SmallVector OpdsMapping(MI.getNumOperands()); + SmallVector OpdsMapping(2); unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); LLT LoadTy = MRI.getType(MI.getOperand(0).getReg()); - unsigned PtrSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI); + Register PtrReg = MI.getOperand(1).getReg(); + LLT PtrTy = MRI.getType(PtrReg); + unsigned AS = PtrTy.getAddressSpace(); + unsigned PtrSize = PtrTy.getSizeInBits(); const ValueMapping *ValMapping; const ValueMapping *PtrMapping; - if (isInstrUniform(MI)) { + const RegisterBank *PtrBank = getRegBank(PtrReg, MRI, *TRI); + + if (PtrBank == &AMDGPU::SGPRRegBank && + (AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::REGION_ADDRESS && + AS != AMDGPUAS::PRIVATE_ADDRESS) && + isInstrUniformNonExtLoadAlign4(MI)) { // We have a uniform instruction so we want to use an SMRD load ValMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); PtrMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize); } else { ValMapping = AMDGPU::getValueMappingLoadSGPROnly(AMDGPU::VGPRRegBankID, LoadTy); - // FIXME: What would happen if we used SGPRRegBankID here? PtrMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize); } @@ -1494,6 +2077,31 @@ AMDGPURegisterBankInfo::getRegBankID(Register Reg, return Bank ? 
Bank->getID() : Default; } + +static unsigned regBankUnion(unsigned RB0, unsigned RB1) { + return (RB0 == AMDGPU::SGPRRegBankID && RB1 == AMDGPU::SGPRRegBankID) ? + AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID; +} + +const RegisterBankInfo::ValueMapping * +AMDGPURegisterBankInfo::getSGPROpMapping(Register Reg, + const MachineRegisterInfo &MRI, + const TargetRegisterInfo &TRI) const { + // Lie and claim anything is legal, even though this needs to be an SGPR + // applyMapping will have to deal with it as a waterfall loop. + unsigned Bank = getRegBankID(Reg, MRI, TRI, AMDGPU::SGPRRegBankID); + unsigned Size = getSizeInBits(Reg, MRI, TRI); + return AMDGPU::getValueMapping(Bank, Size); +} + +const RegisterBankInfo::ValueMapping * +AMDGPURegisterBankInfo::getVGPROpMapping(Register Reg, + const MachineRegisterInfo &MRI, + const TargetRegisterInfo &TRI) const { + unsigned Size = getSizeInBits(Reg, MRI, TRI); + return AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); +} + /// /// This function must return a legal mapping, because /// AMDGPURegisterBankInfo::getInstrAlternativeMappings() is not called @@ -1536,7 +2144,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { int ResultBank = -1; for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) { - unsigned Reg = MI.getOperand(I).getReg(); + Register Reg = MI.getOperand(I).getReg(); const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI); // FIXME: Assuming VGPR for any undetermined inputs. @@ -1660,7 +2268,6 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { LLVM_FALLTHROUGH; } - case AMDGPU::G_GEP: case AMDGPU::G_ADD: case AMDGPU::G_SUB: @@ -1669,15 +2276,11 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case AMDGPU::G_LSHR: case AMDGPU::G_ASHR: case AMDGPU::G_UADDO: - case AMDGPU::G_SADDO: case AMDGPU::G_USUBO: - case AMDGPU::G_SSUBO: case AMDGPU::G_UADDE: case AMDGPU::G_SADDE: case AMDGPU::G_USUBE: case AMDGPU::G_SSUBE: - case AMDGPU::G_UMULH: - case AMDGPU::G_SMULH: case AMDGPU::G_SMIN: case AMDGPU::G_SMAX: case AMDGPU::G_UMIN: @@ -1692,17 +2295,32 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case AMDGPU::G_FPTOUI: case AMDGPU::G_FMUL: case AMDGPU::G_FMA: + case AMDGPU::G_FMAD: case AMDGPU::G_FSQRT: + case AMDGPU::G_FFLOOR: + case AMDGPU::G_FCEIL: + case AMDGPU::G_FRINT: case AMDGPU::G_SITOFP: case AMDGPU::G_UITOFP: case AMDGPU::G_FPTRUNC: case AMDGPU::G_FPEXT: case AMDGPU::G_FEXP2: case AMDGPU::G_FLOG2: + case AMDGPU::G_FMINNUM: + case AMDGPU::G_FMAXNUM: + case AMDGPU::G_FMINNUM_IEEE: + case AMDGPU::G_FMAXNUM_IEEE: case AMDGPU::G_FCANONICALIZE: case AMDGPU::G_INTRINSIC_TRUNC: case AMDGPU::G_INTRINSIC_ROUND: + case AMDGPU::G_AMDGPU_FFBH_U32: + return getDefaultMappingVOP(MI); + case AMDGPU::G_UMULH: + case AMDGPU::G_SMULH: { + if (Subtarget.hasScalarMulHiInsts() && isSALUMapping(MI)) + return getDefaultMappingSOP(MI); return getDefaultMappingVOP(MI); + } case AMDGPU::G_IMPLICIT_DEF: { unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); @@ -1710,12 +2328,19 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { } case AMDGPU::G_FCONSTANT: case AMDGPU::G_CONSTANT: - case AMDGPU::G_FRAME_INDEX: + case AMDGPU::G_GLOBAL_VALUE: case AMDGPU::G_BLOCK_ADDR: { unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); break; } + case 
AMDGPU::G_FRAME_INDEX: { + // TODO: This should be the same as other constants, but eliminateFrameIndex + // currently assumes VALU uses. + unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); + OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); + break; + } case AMDGPU::G_INSERT: { unsigned BankID = isSALUMapping(MI) ? AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID; @@ -1737,8 +2362,25 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { OpdsMapping[2] = nullptr; break; } - case AMDGPU::G_MERGE_VALUES: case AMDGPU::G_BUILD_VECTOR: + case AMDGPU::G_BUILD_VECTOR_TRUNC: { + LLT DstTy = MRI.getType(MI.getOperand(0).getReg()); + if (DstTy == LLT::vector(2, 16)) { + unsigned DstSize = DstTy.getSizeInBits(); + unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits(); + unsigned Src0BankID = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI); + unsigned Src1BankID = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI); + unsigned DstBankID = regBankUnion(Src0BankID, Src1BankID); + + OpdsMapping[0] = AMDGPU::getValueMapping(DstBankID, DstSize); + OpdsMapping[1] = AMDGPU::getValueMapping(Src0BankID, SrcSize); + OpdsMapping[2] = AMDGPU::getValueMapping(Src1BankID, SrcSize); + break; + } + + LLVM_FALLTHROUGH; + } + case AMDGPU::G_MERGE_VALUES: case AMDGPU::G_CONCAT_VECTORS: { unsigned Bank = isSALUMapping(MI) ? AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID; @@ -1760,6 +2402,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case AMDGPU::G_CTTZ_ZERO_UNDEF: case AMDGPU::G_CTPOP: case AMDGPU::G_BSWAP: + case AMDGPU::G_BITREVERSE: case AMDGPU::G_FABS: case AMDGPU::G_FNEG: { unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); @@ -1848,7 +2491,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { Op3Bank == AMDGPU::SGPRRegBankID && (Size == 32 || (Size == 64 && (Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE) && - MF.getSubtarget().hasScalarCompareEq64())); + Subtarget.hasScalarCompareEq64())); unsigned Op0Bank = CanUseSCC ? AMDGPU::SCCRegBankID : AMDGPU::VCCRegBankID; @@ -1859,14 +2502,16 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { break; } case AMDGPU::G_EXTRACT_VECTOR_ELT: { - unsigned OutputBankID = isSALUMapping(MI) ? - AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID; + // VGPR index can be used for waterfall when indexing a SGPR vector. + unsigned SrcBankID = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI); + unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits(); unsigned IdxSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits(); unsigned IdxBank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI); + unsigned OutputBankID = regBankUnion(SrcBankID, IdxBank); - OpdsMapping[0] = AMDGPU::getValueMapping(OutputBankID, SrcSize); - OpdsMapping[1] = AMDGPU::getValueMapping(OutputBankID, SrcSize); + OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(OutputBankID, DstSize); + OpdsMapping[1] = AMDGPU::getValueMapping(SrcBankID, SrcSize); // The index can be either if the source vector is VGPR. 
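The bank choices in these vector mappings follow the regBankUnion rule introduced earlier in this hunk: a result may stay in an SGPR only when every contributing operand is in an SGPR; any VGPR input forces a VGPR result. A tiny self-contained sketch of that rule (the Bank enum is a stand-in, not LLVM's RegisterBank class):

// Illustrative sketch only; simplified stand-in types, not LLVM's RegisterBank.
#include <cassert>

enum Bank { SGPR, VGPR };

// Same rule as the patch's regBankUnion: SGPR only if both inputs are SGPR.
static Bank regBankUnion(Bank RB0, Bank RB1) {
  return (RB0 == SGPR && RB1 == SGPR) ? SGPR : VGPR;
}

int main() {
  assert(regBankUnion(SGPR, SGPR) == SGPR);  // uniform inputs stay scalar
  assert(regBankUnion(SGPR, VGPR) == VGPR);  // any divergent input forces VGPR
  assert(regBankUnion(VGPR, VGPR) == VGPR);
  return 0;
}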
OpdsMapping[2] = AMDGPU::getValueMapping(IdxBank, IdxSize); @@ -1879,15 +2524,18 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { unsigned VecSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); unsigned InsertSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits(); unsigned IdxSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits(); - unsigned InsertEltBank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI); - unsigned IdxBank = getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI); + unsigned SrcBankID = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI); + unsigned InsertEltBankID = getRegBankID(MI.getOperand(2).getReg(), + MRI, *TRI); + unsigned IdxBankID = getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI); OpdsMapping[0] = AMDGPU::getValueMapping(OutputBankID, VecSize); - OpdsMapping[1] = AMDGPU::getValueMapping(OutputBankID, VecSize); - OpdsMapping[2] = AMDGPU::getValueMapping(InsertEltBank, InsertSize); + OpdsMapping[1] = AMDGPU::getValueMapping(SrcBankID, VecSize); + OpdsMapping[2] = AMDGPU::getValueMappingSGPR64Only(InsertEltBankID, + InsertSize); // The index can be either if the source vector is VGPR. - OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize); + OpdsMapping[3] = AMDGPU::getValueMapping(IdxBankID, IdxSize); break; } case AMDGPU::G_UNMERGE_VALUES: { @@ -1903,11 +2551,9 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { break; } case AMDGPU::G_INTRINSIC: { - switch (MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID()) { + switch (MI.getIntrinsicID()) { default: return getInvalidInstructionMapping(); - case Intrinsic::maxnum: - case Intrinsic::minnum: case Intrinsic::amdgcn_div_fmas: case Intrinsic::amdgcn_trig_preop: case Intrinsic::amdgcn_sin: @@ -1938,6 +2584,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case Intrinsic::amdgcn_mbcnt_hi: case Intrinsic::amdgcn_ubfe: case Intrinsic::amdgcn_sbfe: + case Intrinsic::amdgcn_mul_u24: + case Intrinsic::amdgcn_mul_i24: case Intrinsic::amdgcn_lerp: case Intrinsic::amdgcn_sad_u8: case Intrinsic::amdgcn_msad_u8: @@ -1956,10 +2604,10 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case Intrinsic::amdgcn_udot4: case Intrinsic::amdgcn_sdot8: case Intrinsic::amdgcn_udot8: - case Intrinsic::amdgcn_fdiv_fast: case Intrinsic::amdgcn_wwm: case Intrinsic::amdgcn_wqm: return getDefaultMappingVOP(MI); + case Intrinsic::amdgcn_ds_swizzle: case Intrinsic::amdgcn_ds_permute: case Intrinsic::amdgcn_ds_bpermute: case Intrinsic::amdgcn_update_dpp: @@ -2040,7 +2688,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { } case Intrinsic::amdgcn_readlane: { // This must be an SGPR, but accept a VGPR. 
- unsigned IdxReg = MI.getOperand(3).getReg(); + Register IdxReg = MI.getOperand(3).getReg(); unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits(); unsigned IdxBank = getRegBankID(IdxReg, MRI, *TRI, AMDGPU::SGPRRegBankID); OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize); @@ -2055,10 +2703,10 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { } case Intrinsic::amdgcn_writelane: { unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); - unsigned SrcReg = MI.getOperand(2).getReg(); + Register SrcReg = MI.getOperand(2).getReg(); unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits(); unsigned SrcBank = getRegBankID(SrcReg, MRI, *TRI, AMDGPU::SGPRRegBankID); - unsigned IdxReg = MI.getOperand(3).getReg(); + Register IdxReg = MI.getOperand(3).getReg(); unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits(); unsigned IdxBank = getRegBankID(IdxReg, MRI, *TRI, AMDGPU::SGPRRegBankID); OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize); @@ -2081,9 +2729,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { break; } case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: { - switch (MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID()) { - default: - return getInvalidInstructionMapping(); + auto IntrID = MI.getIntrinsicID(); + switch (IntrID) { case Intrinsic::amdgcn_s_getreg: case Intrinsic::amdgcn_s_memtime: case Intrinsic::amdgcn_s_memrealtime: @@ -2123,18 +2770,11 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { OpdsMapping[6] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32); break; case Intrinsic::amdgcn_exp: - OpdsMapping[0] = nullptr; // IntrinsicID - // FIXME: These are immediate values which can't be read from registers. - OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32); - OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32); // FIXME: Could we support packed types here? OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); OpdsMapping[5] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); OpdsMapping[6] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); - // FIXME: These are immediate values which can't be read from registers. 
- OpdsMapping[7] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32); - OpdsMapping[8] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32); break; case Intrinsic::amdgcn_buffer_load: { Register RSrc = MI.getOperand(2).getReg(); // SGPR @@ -2169,11 +2809,97 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32); break; } - case Intrinsic::amdgcn_end_cf: { + case Intrinsic::amdgcn_end_cf: + case Intrinsic::amdgcn_init_exec: { + unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI); + OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); + break; + } + case Intrinsic::amdgcn_else: { + unsigned WaveSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI); + OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1); + OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize); + OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize); + break; + } + case Intrinsic::amdgcn_kill: { + OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1); + break; + } + case Intrinsic::amdgcn_raw_buffer_load: + case Intrinsic::amdgcn_raw_tbuffer_load: { + // FIXME: Should make intrinsic ID the last operand of the instruction, + // then this would be the same as store + OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); + OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); + OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); + OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); + break; + } + case Intrinsic::amdgcn_raw_buffer_store: + case Intrinsic::amdgcn_raw_buffer_store_format: + case Intrinsic::amdgcn_raw_tbuffer_store: { + OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI); + OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); + OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); + OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); + break; + } + case Intrinsic::amdgcn_struct_buffer_load: + case Intrinsic::amdgcn_struct_tbuffer_load: { + OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); + OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); + OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); + OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); + OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI); + break; + } + case Intrinsic::amdgcn_struct_buffer_store: + case Intrinsic::amdgcn_struct_tbuffer_store: { + OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI); + OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); + OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); + OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); + OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI); + break; + } + case Intrinsic::amdgcn_init_exec_from_input: { unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI); OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); + OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); + break; + } + case Intrinsic::amdgcn_ds_gws_init: + case Intrinsic::amdgcn_ds_gws_barrier: + case Intrinsic::amdgcn_ds_gws_sema_br: { + OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); + + // This must be an SGPR, but 
accept a VGPR. + unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI, + AMDGPU::SGPRRegBankID); + OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32); break; } + case Intrinsic::amdgcn_ds_gws_sema_v: + case Intrinsic::amdgcn_ds_gws_sema_p: + case Intrinsic::amdgcn_ds_gws_sema_release_all: { + // This must be an SGPR, but accept a VGPR. + unsigned Bank = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI, + AMDGPU::SGPRRegBankID); + OpdsMapping[1] = AMDGPU::getValueMapping(Bank, 32); + break; + } + default: + if (const AMDGPU::RsrcIntrinsic *RSrcIntrin = + AMDGPU::lookupRsrcIntrinsic(IntrID)) { + // Non-images can have complications from operands that allow both SGPR + // and VGPR. For now it's too complicated to figure out the final opcode + // to derive the register bank from the MCInstrDesc. + if (RSrcIntrin->IsImage) + return getImageMapping(MRI, MI, RSrcIntrin->RsrcArg); + } + + return getInvalidInstructionMapping(); } break; } @@ -2216,6 +2942,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { } case AMDGPU::G_LOAD: + case AMDGPU::G_ZEXTLOAD: + case AMDGPU::G_SEXTLOAD: return getInstrMappingForLoad(MI); case AMDGPU::G_ATOMICRMW_XCHG: @@ -2228,6 +2956,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case AMDGPU::G_ATOMICRMW_MIN: case AMDGPU::G_ATOMICRMW_UMAX: case AMDGPU::G_ATOMICRMW_UMIN: + case AMDGPU::G_ATOMICRMW_FADD: case AMDGPU::G_ATOMIC_CMPXCHG: { return getDefaultMappingAllVGPR(MI); } @@ -2247,4 +2976,3 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { getOperandsMapping(OpdsMapping), MI.getNumOperands()); } - diff --git a/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h b/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h index f3a96e2a6128..a14b74961118 100644 --- a/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h +++ b/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h @@ -13,6 +13,8 @@ #ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUREGISTERBANKINFO_H #define LLVM_LIB_TARGET_AMDGPU_AMDGPUREGISTERBANKINFO_H +#include "llvm/ADT/SmallSet.h" +#include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/Register.h" #include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h" @@ -23,7 +25,9 @@ namespace llvm { class LLT; +class GCNSubtarget; class MachineIRBuilder; +class SIInstrInfo; class SIRegisterInfo; class TargetRegisterInfo; @@ -36,9 +40,27 @@ protected: #include "AMDGPUGenRegisterBank.inc" }; class AMDGPURegisterBankInfo : public AMDGPUGenRegisterBankInfo { + const GCNSubtarget &Subtarget; const SIRegisterInfo *TRI; - - void executeInWaterfallLoop(MachineInstr &MI, + const SIInstrInfo *TII; + + bool collectWaterfallOperands( + SmallSet &SGPROperandRegs, + MachineInstr &MI, + MachineRegisterInfo &MRI, + ArrayRef OpIndices) const; + + bool executeInWaterfallLoop( + MachineIRBuilder &B, + iterator_range Range, + SmallSet &SGPROperandRegs, + MachineRegisterInfo &MRI) const; + + bool executeInWaterfallLoop(MachineIRBuilder &B, + MachineInstr &MI, + MachineRegisterInfo &MRI, + ArrayRef OpIndices) const; + bool executeInWaterfallLoop(MachineInstr &MI, MachineRegisterInfo &MRI, ArrayRef OpIndices) const; @@ -47,6 +69,19 @@ class AMDGPURegisterBankInfo : public AMDGPUGenRegisterBankInfo { bool applyMappingWideLoad(MachineInstr &MI, const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper, MachineRegisterInfo &MRI) const; + bool + applyMappingImage(MachineInstr &MI, + const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper, + MachineRegisterInfo &MRI, int RSrcIdx) const; + + Register handleD16VData(MachineIRBuilder &B, 
MachineRegisterInfo &MRI, + Register Reg) const; + + std::pair + splitBufferOffsets(MachineIRBuilder &B, Register Offset) const; + + MachineInstr *selectStoreIntrinsic(MachineIRBuilder &B, + MachineInstr &MI) const; /// See RegisterBankInfo::applyMapping. void applyMappingImpl(const OperandsMapper &OpdMapper) const override; @@ -58,6 +93,16 @@ class AMDGPURegisterBankInfo : public AMDGPUGenRegisterBankInfo { const TargetRegisterInfo &TRI, unsigned Default = AMDGPU::VGPRRegBankID) const; + // Return a value mapping for an operand that is required to be an SGPR. + const ValueMapping *getSGPROpMapping(Register Reg, + const MachineRegisterInfo &MRI, + const TargetRegisterInfo &TRI) const; + + // Return a value mapping for an operand that is required to be a VGPR. + const ValueMapping *getVGPROpMapping(Register Reg, + const MachineRegisterInfo &MRI, + const TargetRegisterInfo &TRI) const; + /// Split 64-bit value \p Reg into two 32-bit halves and populate them into \p /// Regs. This appropriately sets the regbank of the new registers. void split64BitValueForMapping(MachineIRBuilder &B, @@ -90,8 +135,13 @@ class AMDGPURegisterBankInfo : public AMDGPUGenRegisterBankInfo { const InstructionMapping &getDefaultMappingVOP(const MachineInstr &MI) const; const InstructionMapping &getDefaultMappingAllVGPR( const MachineInstr &MI) const; + + const InstructionMapping &getImageMapping(const MachineRegisterInfo &MRI, + const MachineInstr &MI, + int RsrcIdx) const; + public: - AMDGPURegisterBankInfo(const TargetRegisterInfo &TRI); + AMDGPURegisterBankInfo(const GCNSubtarget &STI); unsigned copyCost(const RegisterBank &A, const RegisterBank &B, unsigned Size) const override; diff --git a/lib/Target/AMDGPU/AMDGPURegisterBanks.td b/lib/Target/AMDGPU/AMDGPURegisterBanks.td index 9555694fb106..00f53b157577 100644 --- a/lib/Target/AMDGPU/AMDGPURegisterBanks.td +++ b/lib/Target/AMDGPU/AMDGPURegisterBanks.td @@ -7,14 +7,14 @@ //===----------------------------------------------------------------------===// def SGPRRegBank : RegisterBank<"SGPR", - [SReg_32, SReg_64, SReg_128, SReg_256, SReg_512] + [SReg_32, SReg_64, SReg_128, SReg_256, SReg_512, SReg_1024] >; def VGPRRegBank : RegisterBank<"VGPR", - [VGPR_32, VReg_64, VReg_96, VReg_128, VReg_256, VReg_512] + [VGPR_32, VReg_64, VReg_96, VReg_128, VReg_256, VReg_512, VReg_1024] >; def SCCRegBank : RegisterBank <"SCC", [SReg_32, SCC_CLASS]>; // It is helpful to distinguish conditions from ordinary SGPRs. -def VCCRegBank : RegisterBank <"VCC", [SReg_64]>; +def VCCRegBank : RegisterBank <"VCC", [SReg_1]>; diff --git a/lib/Target/AMDGPU/AMDGPURegisterInfo.cpp b/lib/Target/AMDGPU/AMDGPURegisterInfo.cpp index 7cffdf1a4dcf..9806e6b0714f 100644 --- a/lib/Target/AMDGPU/AMDGPURegisterInfo.cpp +++ b/lib/Target/AMDGPU/AMDGPURegisterInfo.cpp @@ -26,19 +26,59 @@ AMDGPURegisterInfo::AMDGPURegisterInfo() : AMDGPUGenRegisterInfo(0) {} // they are not supported at this time. 
//===----------------------------------------------------------------------===// -unsigned AMDGPURegisterInfo::getSubRegFromChannel(unsigned Channel) { - static const unsigned SubRegs[] = { - AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, AMDGPU::sub4, - AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7, AMDGPU::sub8, AMDGPU::sub9, - AMDGPU::sub10, AMDGPU::sub11, AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, - AMDGPU::sub15, AMDGPU::sub16, AMDGPU::sub17, AMDGPU::sub18, AMDGPU::sub19, - AMDGPU::sub20, AMDGPU::sub21, AMDGPU::sub22, AMDGPU::sub23, AMDGPU::sub24, - AMDGPU::sub25, AMDGPU::sub26, AMDGPU::sub27, AMDGPU::sub28, AMDGPU::sub29, - AMDGPU::sub30, AMDGPU::sub31 - }; - - assert(Channel < array_lengthof(SubRegs)); - return SubRegs[Channel]; +// Table of NumRegs sized pieces at every 32-bit offset. +static const uint16_t SubRegFromChannelTable[][32] = { + { AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, + AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7, + AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11, + AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15, + AMDGPU::sub16, AMDGPU::sub17, AMDGPU::sub18, AMDGPU::sub19, + AMDGPU::sub20, AMDGPU::sub21, AMDGPU::sub22, AMDGPU::sub23, + AMDGPU::sub24, AMDGPU::sub25, AMDGPU::sub26, AMDGPU::sub27, + AMDGPU::sub28, AMDGPU::sub29, AMDGPU::sub30, AMDGPU::sub31 + }, + { + AMDGPU::sub0_sub1, AMDGPU::sub1_sub2, AMDGPU::sub2_sub3, AMDGPU::sub3_sub4, + AMDGPU::sub4_sub5, AMDGPU::sub5_sub6, AMDGPU::sub6_sub7, AMDGPU::sub7_sub8, + AMDGPU::sub8_sub9, AMDGPU::sub9_sub10, AMDGPU::sub10_sub11, AMDGPU::sub11_sub12, + AMDGPU::sub12_sub13, AMDGPU::sub13_sub14, AMDGPU::sub14_sub15, AMDGPU::sub15_sub16, + AMDGPU::sub16_sub17, AMDGPU::sub17_sub18, AMDGPU::sub18_sub19, AMDGPU::sub19_sub20, + AMDGPU::sub20_sub21, AMDGPU::sub21_sub22, AMDGPU::sub22_sub23, AMDGPU::sub23_sub24, + AMDGPU::sub24_sub25, AMDGPU::sub25_sub26, AMDGPU::sub26_sub27, AMDGPU::sub27_sub28, + AMDGPU::sub28_sub29, AMDGPU::sub29_sub30, AMDGPU::sub30_sub31, AMDGPU::NoSubRegister + }, + { + AMDGPU::sub0_sub1_sub2, AMDGPU::sub1_sub2_sub3, AMDGPU::sub2_sub3_sub4, AMDGPU::sub3_sub4_sub5, + AMDGPU::sub4_sub5_sub6, AMDGPU::sub5_sub6_sub7, AMDGPU::sub6_sub7_sub8, AMDGPU::sub7_sub8_sub9, + AMDGPU::sub8_sub9_sub10, AMDGPU::sub9_sub10_sub11, AMDGPU::sub10_sub11_sub12, AMDGPU::sub11_sub12_sub13, + AMDGPU::sub12_sub13_sub14, AMDGPU::sub13_sub14_sub15, AMDGPU::sub14_sub15_sub16, AMDGPU::sub15_sub16_sub17, + AMDGPU::sub16_sub17_sub18, AMDGPU::sub17_sub18_sub19, AMDGPU::sub18_sub19_sub20, AMDGPU::sub19_sub20_sub21, + AMDGPU::sub20_sub21_sub22, AMDGPU::sub21_sub22_sub23, AMDGPU::sub22_sub23_sub24, AMDGPU::sub23_sub24_sub25, + AMDGPU::sub24_sub25_sub26, AMDGPU::sub25_sub26_sub27, AMDGPU::sub26_sub27_sub28, AMDGPU::sub27_sub28_sub29, + AMDGPU::sub28_sub29_sub30, AMDGPU::sub29_sub30_sub31, AMDGPU::NoSubRegister, AMDGPU::NoSubRegister + }, + { + AMDGPU::sub0_sub1_sub2_sub3, AMDGPU::sub1_sub2_sub3_sub4, AMDGPU::sub2_sub3_sub4_sub5, AMDGPU::sub3_sub4_sub5_sub6, + AMDGPU::sub4_sub5_sub6_sub7, AMDGPU::sub5_sub6_sub7_sub8, AMDGPU::sub6_sub7_sub8_sub9, AMDGPU::sub7_sub8_sub9_sub10, + AMDGPU::sub8_sub9_sub10_sub11, AMDGPU::sub9_sub10_sub11_sub12, AMDGPU::sub10_sub11_sub12_sub13, AMDGPU::sub11_sub12_sub13_sub14, + AMDGPU::sub12_sub13_sub14_sub15, AMDGPU::sub13_sub14_sub15_sub16, AMDGPU::sub14_sub15_sub16_sub17, AMDGPU::sub15_sub16_sub17_sub18, + AMDGPU::sub16_sub17_sub18_sub19, AMDGPU::sub17_sub18_sub19_sub20, AMDGPU::sub18_sub19_sub20_sub21, AMDGPU::sub19_sub20_sub21_sub22, + 
AMDGPU::sub20_sub21_sub22_sub23, AMDGPU::sub21_sub22_sub23_sub24, AMDGPU::sub22_sub23_sub24_sub25, AMDGPU::sub23_sub24_sub25_sub26, + AMDGPU::sub24_sub25_sub26_sub27, AMDGPU::sub25_sub26_sub27_sub28, AMDGPU::sub26_sub27_sub28_sub29, AMDGPU::sub27_sub28_sub29_sub30, + AMDGPU::sub28_sub29_sub30_sub31, AMDGPU::NoSubRegister, AMDGPU::NoSubRegister, AMDGPU::NoSubRegister + } +}; + +// FIXME: TableGen should generate something to make this manageable for all +// register classes. At a minimum we could use the opposite of +// composeSubRegIndices and go up from the base 32-bit subreg. +unsigned AMDGPURegisterInfo::getSubRegFromChannel(unsigned Channel, unsigned NumRegs) { + const unsigned NumRegIndex = NumRegs - 1; + + assert(NumRegIndex < array_lengthof(SubRegFromChannelTable) && + "Not implemented"); + assert(Channel < array_lengthof(SubRegFromChannelTable[0])); + return SubRegFromChannelTable[NumRegIndex][Channel]; } void AMDGPURegisterInfo::reserveRegisterTuples(BitVector &Reserved, unsigned Reg) const { diff --git a/lib/Target/AMDGPU/AMDGPURegisterInfo.h b/lib/Target/AMDGPU/AMDGPURegisterInfo.h index 3453a8c1b0b3..9e713ca804a1 100644 --- a/lib/Target/AMDGPU/AMDGPURegisterInfo.h +++ b/lib/Target/AMDGPU/AMDGPURegisterInfo.h @@ -28,7 +28,7 @@ struct AMDGPURegisterInfo : public AMDGPUGenRegisterInfo { /// \returns the sub reg enum value for the given \p Channel /// (e.g. getSubRegFromChannel(0) -> AMDGPU::sub0) - static unsigned getSubRegFromChannel(unsigned Channel); + static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs = 1); void reserveRegisterTuples(BitVector &, unsigned Reg) const; }; diff --git a/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/lib/Target/AMDGPU/AMDGPUSearchableTables.td index f8703c36127a..26b8b7840270 100644 --- a/lib/Target/AMDGPU/AMDGPUSearchableTables.td +++ b/lib/Target/AMDGPU/AMDGPUSearchableTables.td @@ -81,6 +81,8 @@ def : SourceOfDivergence; def : SourceOfDivergence; def : SourceOfDivergence; def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; def : SourceOfDivergence; def : SourceOfDivergence; def : SourceOfDivergence; @@ -92,6 +94,8 @@ def : SourceOfDivergence; def : SourceOfDivergence; def : SourceOfDivergence; def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; def : SourceOfDivergence; def : SourceOfDivergence; def : SourceOfDivergence; diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp index 1eb9b83456c5..3bb6dd4571c0 100644 --- a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -175,6 +175,7 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) : HasFminFmaxLegacy(true), EnablePromoteAlloca(false), HasTrigReducedRange(false), + MaxWavesPerEU(10), LocalMemorySize(0), WavefrontSize(0) { } @@ -261,6 +262,7 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS, AddNoCarryInsts(false), HasUnpackedD16VMem(false), LDSMisalignedBug(false), + HasMFMAInlineLiteralBug(false), ScalarizeGlobal(false), @@ -278,9 +280,10 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS, InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)), TLInfo(TM, *this), FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) { + MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(this); CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering())); Legalizer.reset(new AMDGPULegalizerInfo(*this, TM)); - RegBankInfo.reset(new AMDGPURegisterBankInfo(*getRegisterInfo())); + 
RegBankInfo.reset(new AMDGPURegisterBankInfo(*this)); InstSelector.reset(new AMDGPUInstructionSelector( *this, *static_cast(RegBankInfo.get()), TM)); } @@ -489,28 +492,28 @@ bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const { } uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F, - unsigned &MaxAlign) const { + Align &MaxAlign) const { assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL || F.getCallingConv() == CallingConv::SPIR_KERNEL); const DataLayout &DL = F.getParent()->getDataLayout(); uint64_t ExplicitArgBytes = 0; - MaxAlign = 1; + MaxAlign = Align::None(); for (const Argument &Arg : F.args()) { Type *ArgTy = Arg.getType(); - unsigned Align = DL.getABITypeAlignment(ArgTy); + const Align Alignment(DL.getABITypeAlignment(ArgTy)); uint64_t AllocSize = DL.getTypeAllocSize(ArgTy); - ExplicitArgBytes = alignTo(ExplicitArgBytes, Align) + AllocSize; - MaxAlign = std::max(MaxAlign, Align); + ExplicitArgBytes = alignTo(ExplicitArgBytes, Alignment) + AllocSize; + MaxAlign = std::max(MaxAlign, Alignment); } return ExplicitArgBytes; } unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F, - unsigned &MaxAlign) const { + Align &MaxAlign) const { uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign); unsigned ExplicitOffset = getExplicitKernelArgOffset(F); @@ -518,7 +521,7 @@ unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F, uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes; unsigned ImplicitBytes = getImplicitArgNumBytes(F); if (ImplicitBytes != 0) { - unsigned Alignment = getAlignmentForImplicitArgPtr(); + const Align Alignment = getAlignmentForImplicitArgPtr(); TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes; } @@ -566,7 +569,7 @@ bool GCNSubtarget::hasMadF16() const { unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const { if (getGeneration() >= AMDGPUSubtarget::GFX10) - return 10; + return getMaxWavesPerEU(); if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { if (SGPRs <= 80) @@ -591,25 +594,12 @@ unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const { } unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const { - if (VGPRs <= 24) - return 10; - if (VGPRs <= 28) - return 9; - if (VGPRs <= 32) - return 8; - if (VGPRs <= 36) - return 7; - if (VGPRs <= 40) - return 6; - if (VGPRs <= 48) - return 5; - if (VGPRs <= 64) - return 4; - if (VGPRs <= 84) - return 3; - if (VGPRs <= 128) - return 2; - return 1; + unsigned MaxWaves = getMaxWavesPerEU(); + unsigned Granule = getVGPRAllocGranule(); + if (VGPRs < Granule) + return MaxWaves; + unsigned RoundedRegs = ((VGPRs + Granule - 1) / Granule) * Granule; + return std::min(std::max(getTotalNumVGPRs() / RoundedRegs, 1u), MaxWaves); } unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const { @@ -629,6 +619,20 @@ unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const { return 2; // VCC. 
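The rewritten getOccupancyWithNumVGPRs above replaces the hard-coded ladder with arithmetic: round the VGPR count up to the allocation granule, divide the total VGPR budget by the rounded count, and clamp to [1, MaxWavesPerEU]. A standalone sketch with placeholder parameters (granule 4, 256 VGPRs, and 10 waves are example values for illustration, not tied to any particular subtarget):

// Illustrative sketch only; the parameter defaults are example values.
#include <algorithm>
#include <cstdio>

static unsigned occupancyWithNumVGPRs(unsigned VGPRs, unsigned Granule = 4,
                                      unsigned TotalVGPRs = 256,
                                      unsigned MaxWaves = 10) {
  if (VGPRs < Granule)
    return MaxWaves;
  // Round the request up to a whole allocation granule, see how many such
  // allocations fit in the register file, and clamp to [1, MaxWaves].
  unsigned RoundedRegs = ((VGPRs + Granule - 1) / Granule) * Granule;
  return std::min(std::max(TotalVGPRs / RoundedRegs, 1u), MaxWaves);
}

int main() {
  std::printf("%u %u %u\n", occupancyWithNumVGPRs(24),   // 10
              occupancyWithNumVGPRs(84),                 // 3
              occupancyWithNumVGPRs(200));               // 1
  return 0;
}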
} +unsigned GCNSubtarget::computeOccupancy(const MachineFunction &MF, + unsigned LDSSize, + unsigned NumSGPRs, + unsigned NumVGPRs) const { + unsigned Occupancy = + std::min(getMaxWavesPerEU(), + getOccupancyWithLocalMemSize(LDSSize, MF.getFunction())); + if (NumSGPRs) + Occupancy = std::min(Occupancy, getOccupancyWithNumSGPRs(NumSGPRs)); + if (NumVGPRs) + Occupancy = std::min(Occupancy, getOccupancyWithNumVGPRs(NumVGPRs)); + return Occupancy; +} + unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const { const Function &F = MF.getFunction(); const SIMachineFunctionInfo &MFI = *MF.getInfo(); @@ -878,8 +882,8 @@ struct FillMFMAShadowMutation : ScheduleDAGMutation { void GCNSubtarget::getPostRAMutations( std::vector> &Mutations) const { - Mutations.push_back(llvm::make_unique(&InstrInfo)); - Mutations.push_back(llvm::make_unique(&InstrInfo)); + Mutations.push_back(std::make_unique(&InstrInfo)); + Mutations.push_back(std::make_unique(&InstrInfo)); } const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) { diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.h b/lib/Target/AMDGPU/AMDGPUSubtarget.h index 78c3b823946d..936feb00c62b 100644 --- a/lib/Target/AMDGPU/AMDGPUSubtarget.h +++ b/lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -75,6 +75,7 @@ protected: bool HasFminFmaxLegacy; bool EnablePromoteAlloca; bool HasTrigReducedRange; + unsigned MaxWavesPerEU; int LocalMemorySize; unsigned WavefrontSize; @@ -195,8 +196,8 @@ public: return LocalMemorySize; } - unsigned getAlignmentForImplicitArgPtr() const { - return isAmdHsaOS() ? 8 : 4; + Align getAlignmentForImplicitArgPtr() const { + return isAmdHsaOS() ? Align(8) : Align(4); } /// Returns the offset in bytes from the start of the input buffer @@ -223,7 +224,9 @@ public: /// subtarget. virtual unsigned getMinWavesPerEU() const = 0; - unsigned getMaxWavesPerEU() const { return 10; } + /// \returns Maximum number of waves per execution unit supported by the + /// subtarget without any kind of limitation. + unsigned getMaxWavesPerEU() const { return MaxWavesPerEU; } /// Creates value range metadata on an workitemid.* inrinsic call or load. bool makeLIDRangeMetadata(Instruction *I) const; @@ -235,16 +238,17 @@ public: return 16; return AMDGPU::getIntegerAttribute(F, "amdgpu-implicitarg-num-bytes", 0); } - uint64_t getExplicitKernArgSize(const Function &F, - unsigned &MaxAlign) const; - unsigned getKernArgSegmentSize(const Function &F, - unsigned &MaxAlign) const; + uint64_t getExplicitKernArgSize(const Function &F, Align &MaxAlign) const; + unsigned getKernArgSegmentSize(const Function &F, Align &MaxAlign) const; virtual ~AMDGPUSubtarget() {} }; class GCNSubtarget : public AMDGPUGenSubtargetInfo, public AMDGPUSubtarget { + + using AMDGPUSubtarget::getMaxWavesPerEU; + public: enum TrapHandlerAbi { TrapHandlerAbiNone = 0, @@ -362,6 +366,7 @@ protected: bool CaymanISA; bool CFALUBug; bool LDSMisalignedBug; + bool HasMFMAInlineLiteralBug; bool HasVertexCache; short TexVTXClauseSize; bool ScalarizeGlobal; @@ -416,7 +421,7 @@ public: return CallLoweringInfo.get(); } - const InstructionSelector *getInstructionSelector() const override { + InstructionSelector *getInstructionSelector() const override { return InstSelector.get(); } @@ -544,6 +549,14 @@ public: return GFX9Insts; } + bool hasScalarPackInsts() const { + return GFX9Insts; + } + + bool hasScalarMulHiInsts() const { + return GFX9Insts; + } + TrapHandlerAbi getTrapHandlerAbi() const { return isAmdHsaOS() ? 
TrapHandlerAbiHsa : TrapHandlerAbiNone; } @@ -611,6 +624,11 @@ public: return getGeneration() >= AMDGPUSubtarget::GFX9; } + /// \returns If target supports S_DENORM_MODE. + bool hasDenormModeInst() const { + return getGeneration() >= AMDGPUSubtarget::GFX10; + } + bool useFlatForGlobal() const { return FlatForGlobal; } @@ -848,9 +866,7 @@ public: // on the pointer value itself may rely on the alignment / known low bits of // the pointer. Set this to something above the minimum to avoid needing // dynamic realignment in common cases. - unsigned getStackAlignment() const { - return 16; - } + Align getStackAlignment() const { return Align(16); } bool enableMachineScheduler() const override { return true; @@ -881,12 +897,6 @@ public: return AMDGPU::IsaInfo::getMaxWavesPerCU(this, FlatWorkGroupSize); } - /// \returns Maximum number of waves per execution unit supported by the - /// subtarget without any kind of limitation. - unsigned getMaxWavesPerEU() const { - return AMDGPU::IsaInfo::getMaxWavesPerEU(); - } - /// \returns Number of waves per work group supported by the subtarget and /// limited by given \p FlatWorkGroupSize. unsigned getWavesPerWorkGroup(unsigned FlatWorkGroupSize) const { @@ -944,6 +954,14 @@ public: return HasDPP; } + bool hasDPPBroadcasts() const { + return HasDPP && getGeneration() < GFX10; + } + + bool hasDPPWavefrontShifts() const { + return HasDPP && getGeneration() < GFX10; + } + bool hasDPP8() const { return HasDPP8; } @@ -974,6 +992,10 @@ public: return SGPRInitBug; } + bool hasMFMAInlineLiteralBug() const { + return HasMFMAInlineLiteralBug; + } + bool has12DWordStoreHazard() const { return getGeneration() != AMDGPUSubtarget::SOUTHERN_ISLANDS; } @@ -1036,6 +1058,13 @@ public: /// VGPRs unsigned getOccupancyWithNumVGPRs(unsigned VGPRs) const; + /// Return occupancy for the given function. Used LDS and a number of + /// registers if provided. + /// Note, occupancy can be affected by the scratch allocation as well, but + /// we do not have enough information to compute it. + unsigned computeOccupancy(const MachineFunction &MF, unsigned LDSSize = 0, + unsigned NumSGPRs = 0, unsigned NumVGPRs = 0) const; + /// \returns true if the flat_scratch register should be initialized with the /// pointer to the wave's scratch memory rather than a size and offset. 
bool flatScratchIsPointer() const { @@ -1226,9 +1255,7 @@ public: return Gen; } - unsigned getStackAlignment() const { - return 4; - } + Align getStackAlignment() const { return Align(4); } R600Subtarget &initializeSubtargetDependencies(const Triple &TT, StringRef GPU, StringRef FS); diff --git a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 0ea8db04c298..e8cf77161a14 100644 --- a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -238,16 +238,17 @@ extern "C" void LLVMInitializeAMDGPUTarget() { initializeAMDGPUUseNativeCallsPass(*PR); initializeAMDGPUSimplifyLibCallsPass(*PR); initializeAMDGPUInlinerPass(*PR); + initializeAMDGPUPrintfRuntimeBindingPass(*PR); initializeGCNRegBankReassignPass(*PR); initializeGCNNSAReassignPass(*PR); } static std::unique_ptr createTLOF(const Triple &TT) { - return llvm::make_unique(); + return std::make_unique(); } static ScheduleDAGInstrs *createR600MachineScheduler(MachineSchedContext *C) { - return new ScheduleDAGMILive(C, llvm::make_unique()); + return new ScheduleDAGMILive(C, std::make_unique()); } static ScheduleDAGInstrs *createSIMachineScheduler(MachineSchedContext *C) { @@ -257,7 +258,7 @@ static ScheduleDAGInstrs *createSIMachineScheduler(MachineSchedContext *C) { static ScheduleDAGInstrs * createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) { ScheduleDAGMILive *DAG = - new GCNScheduleDAGMILive(C, make_unique(C)); + new GCNScheduleDAGMILive(C, std::make_unique(C)); DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI)); DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI)); DAG->addMutation(createAMDGPUMacroFusionDAGMutation()); @@ -412,6 +413,7 @@ void AMDGPUTargetMachine::adjustPassManager(PassManagerBuilder &Builder) { PM.add(createAMDGPUExternalAAWrapperPass()); } PM.add(createAMDGPUUnifyMetadataPass()); + PM.add(createAMDGPUPrintfRuntimeBinding()); PM.add(createAMDGPUPropagateAttributesLatePass(this)); if (Internalize) { PM.add(createInternalizePass(mustPreserveGV)); @@ -482,7 +484,7 @@ const R600Subtarget *R600TargetMachine::getSubtargetImpl( // creation will depend on the TM and the code generation flags on the // function that reside in TargetOptions. resetTargetOptions(F); - I = llvm::make_unique(TargetTriple, GPU, FS, *this); + I = std::make_unique(TargetTriple, GPU, FS, *this); } return I.get(); @@ -518,7 +520,7 @@ const GCNSubtarget *GCNTargetMachine::getSubtargetImpl(const Function &F) const // creation will depend on the TM and the code generation flags on the // function that reside in TargetOptions. resetTargetOptions(F); - I = llvm::make_unique(TargetTriple, GPU, FS, *this); + I = std::make_unique(TargetTriple, GPU, FS, *this); } I->setScalarizeGlobalBehavior(ScalarizeGlobal); @@ -659,6 +661,8 @@ void AMDGPUPassConfig::addIRPasses() { disablePass(&FuncletLayoutID); disablePass(&PatchableFunctionID); + addPass(createAMDGPUPrintfRuntimeBinding()); + // This must occur before inlining, as the inliner will not look through // bitcast calls. addPass(createAMDGPUFixFunctionBitcastsPass()); @@ -681,12 +685,6 @@ void AMDGPUPassConfig::addIRPasses() { // without ever running any passes on the second. addPass(createBarrierNoopPass()); - if (TM.getTargetTriple().getArch() == Triple::amdgcn) { - // TODO: May want to move later or split into an early and late one. - - addPass(createAMDGPUCodeGenPreparePass()); - } - // Handle uses of OpenCL image2d_t, image3d_t and sampler_t arguments. 
if (TM.getTargetTriple().getArch() == Triple::r600) addPass(createR600OpenCLImageTypeLoweringPass()); @@ -714,6 +712,11 @@ void AMDGPUPassConfig::addIRPasses() { } } + if (TM.getTargetTriple().getArch() == Triple::amdgcn) { + // TODO: May want to move later or split into an early and late one. + addPass(createAMDGPUCodeGenPreparePass()); + } + TargetPassConfig::addIRPasses(); // EarlyCSE is not always strong enough to clean up what LSR produces. For @@ -1046,7 +1049,7 @@ bool GCNTargetMachine::parseMachineFunctionInfo( return true; if (MFI->ScratchRSrcReg != AMDGPU::PRIVATE_RSRC_REG && - !AMDGPU::SReg_128RegClass.contains(MFI->ScratchRSrcReg)) { + !AMDGPU::SGPR_128RegClass.contains(MFI->ScratchRSrcReg)) { return diagnoseRegisterClass(YamlMFI.ScratchRSrcReg); } @@ -1095,7 +1098,7 @@ bool GCNTargetMachine::parseMachineFunctionInfo( if (YamlMFI.ArgInfo && (parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentBuffer, - AMDGPU::SReg_128RegClass, + AMDGPU::SGPR_128RegClass, MFI->ArgInfo.PrivateSegmentBuffer, 4, 0) || parseAndCheckArgument(YamlMFI.ArgInfo->DispatchPtr, AMDGPU::SReg_64RegClass, MFI->ArgInfo.DispatchPtr, diff --git a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp index aaed280a1270..616196ad5ba3 100644 --- a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -57,7 +57,7 @@ using namespace llvm; static cl::opt UnrollThresholdPrivate( "amdgpu-unroll-threshold-private", cl::desc("Unroll threshold for AMDGPU if private memory used in a loop"), - cl::init(2500), cl::Hidden); + cl::init(2000), cl::Hidden); static cl::opt UnrollThresholdLocal( "amdgpu-unroll-threshold-local", @@ -590,6 +590,61 @@ bool GCNTTIImpl::isAlwaysUniform(const Value *V) const { return false; } +bool GCNTTIImpl::collectFlatAddressOperands(SmallVectorImpl &OpIndexes, + Intrinsic::ID IID) const { + switch (IID) { + case Intrinsic::amdgcn_atomic_inc: + case Intrinsic::amdgcn_atomic_dec: + case Intrinsic::amdgcn_ds_fadd: + case Intrinsic::amdgcn_ds_fmin: + case Intrinsic::amdgcn_ds_fmax: + case Intrinsic::amdgcn_is_shared: + case Intrinsic::amdgcn_is_private: + OpIndexes.push_back(0); + return true; + default: + return false; + } +} + +bool GCNTTIImpl::rewriteIntrinsicWithAddressSpace( + IntrinsicInst *II, Value *OldV, Value *NewV) const { + auto IntrID = II->getIntrinsicID(); + switch (IntrID) { + case Intrinsic::amdgcn_atomic_inc: + case Intrinsic::amdgcn_atomic_dec: + case Intrinsic::amdgcn_ds_fadd: + case Intrinsic::amdgcn_ds_fmin: + case Intrinsic::amdgcn_ds_fmax: { + const ConstantInt *IsVolatile = cast(II->getArgOperand(4)); + if (!IsVolatile->isZero()) + return false; + Module *M = II->getParent()->getParent()->getParent(); + Type *DestTy = II->getType(); + Type *SrcTy = NewV->getType(); + Function *NewDecl = + Intrinsic::getDeclaration(M, II->getIntrinsicID(), {DestTy, SrcTy}); + II->setArgOperand(0, NewV); + II->setCalledFunction(NewDecl); + return true; + } + case Intrinsic::amdgcn_is_shared: + case Intrinsic::amdgcn_is_private: { + unsigned TrueAS = IntrID == Intrinsic::amdgcn_is_shared ? + AMDGPUAS::LOCAL_ADDRESS : AMDGPUAS::PRIVATE_ADDRESS; + unsigned NewAS = NewV->getType()->getPointerAddressSpace(); + LLVMContext &Ctx = NewV->getType()->getContext(); + ConstantInt *NewVal = (TrueAS == NewAS) ? 
+ ConstantInt::getTrue(Ctx) : ConstantInt::getFalse(Ctx); + II->replaceAllUsesWith(NewVal); + II->eraseFromParent(); + return true; + } + default: + return false; + } +} + unsigned GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp) { if (ST->hasVOP3PInsts()) { @@ -638,6 +693,39 @@ void GCNTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE, CommonTTI.getUnrollingPreferences(L, SE, UP); } +unsigned GCNTTIImpl::getUserCost(const User *U, + ArrayRef Operands) { + // Estimate extractelement elimination + if (const ExtractElementInst *EE = dyn_cast(U)) { + ConstantInt *CI = dyn_cast(EE->getOperand(1)); + unsigned Idx = -1; + if (CI) + Idx = CI->getZExtValue(); + return getVectorInstrCost(EE->getOpcode(), EE->getOperand(0)->getType(), + Idx); + } + + // Estimate insertelement elimination + if (const InsertElementInst *IE = dyn_cast(U)) { + ConstantInt *CI = dyn_cast(IE->getOperand(2)); + unsigned Idx = -1; + if (CI) + Idx = CI->getZExtValue(); + return getVectorInstrCost(IE->getOpcode(), IE->getType(), Idx); + } + + // Estimate different intrinsics, e.g. llvm.fabs + if (const IntrinsicInst *II = dyn_cast(U)) { + SmallVector Args(II->arg_operands()); + FastMathFlags FMF; + if (auto *FPMO = dyn_cast(II)) + FMF = FPMO->getFastMathFlags(); + return getIntrinsicInstrCost(II->getIntrinsicID(), II->getType(), Args, + FMF); + } + return BaseT::getUserCost(U, Operands); +} + unsigned R600TTIImpl::getHardwareNumberOfRegisters(bool Vec) const { return 4 * 128; // XXX - 4 channels. Should these count as vector instead? } diff --git a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h index 6f1bf5a26f0d..67f7f9074f10 100644 --- a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h +++ b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h @@ -46,10 +46,18 @@ class AMDGPUTTIImpl final : public BasicTTIImplBase { Triple TargetTriple; + const TargetSubtargetInfo *ST; + const TargetLoweringBase *TLI; + + const TargetSubtargetInfo *getST() const { return ST; } + const TargetLoweringBase *getTLI() const { return TLI; } + public: explicit AMDGPUTTIImpl(const AMDGPUTargetMachine *TM, const Function &F) - : BaseT(TM, F.getParent()->getDataLayout()), - TargetTriple(TM->getTargetTriple()) {} + : BaseT(TM, F.getParent()->getDataLayout()), + TargetTriple(TM->getTargetTriple()), + ST(static_cast(TM->getSubtargetImpl(F))), + TLI(ST->getTargetLowering()) {} void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP); @@ -183,6 +191,11 @@ public: return AMDGPUAS::FLAT_ADDRESS; } + bool collectFlatAddressOperands(SmallVectorImpl &OpIndexes, + Intrinsic::ID IID) const; + bool rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, + Value *OldV, Value *NewV) const; + unsigned getVectorSplitCost() { return 0; } unsigned getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, @@ -191,7 +204,7 @@ public: bool areInlineCompatible(const Function *Caller, const Function *Callee) const; - unsigned getInliningThresholdMultiplier() { return 7; } + unsigned getInliningThresholdMultiplier() { return 9; } int getInlinerVectorBonusPercent() { return 0; } @@ -201,6 +214,7 @@ public: int getMinMaxReductionCost(Type *Ty, Type *CondTy, bool IsPairwiseForm, bool IsUnsigned); + unsigned getUserCost(const User *U, ArrayRef Operands); }; class R600TTIImpl final : public BasicTTIImplBase { diff --git a/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp b/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp index 12f2e9519c9e..101ecfc0c87c 100644 --- 
a/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp +++ b/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp @@ -1307,8 +1307,8 @@ int AMDGPUCFGStructurizer::improveSimpleJumpintoIf(MachineBasicBlock *HeadMBB, if (LandBlkHasOtherPred) { report_fatal_error("Extra register needed to handle CFG"); - unsigned CmpResReg = - HeadMBB->getParent()->getRegInfo().createVirtualRegister(I32RC); + Register CmpResReg = + HeadMBB->getParent()->getRegInfo().createVirtualRegister(I32RC); report_fatal_error("Extra compare instruction needed to handle CFG"); insertCondBranchBefore(LandBlk, I, R600::IF_PREDICATE_SET, CmpResReg, DebugLoc()); @@ -1316,8 +1316,8 @@ int AMDGPUCFGStructurizer::improveSimpleJumpintoIf(MachineBasicBlock *HeadMBB, // XXX: We are running this after RA, so creating virtual registers will // cause an assertion failure in the PostRA scheduling pass. - unsigned InitReg = - HeadMBB->getParent()->getRegInfo().createVirtualRegister(I32RC); + Register InitReg = + HeadMBB->getParent()->getRegInfo().createVirtualRegister(I32RC); insertCondBranchBefore(LandBlk, I, R600::IF_PREDICATE_SET, InitReg, DebugLoc()); diff --git a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index 6d678966c98e..9dd511fab57c 100644 --- a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -143,6 +143,7 @@ public: ImmTyDLC, ImmTyGLC, ImmTySLC, + ImmTySWZ, ImmTyTFE, ImmTyD16, ImmTyClampSI, @@ -216,14 +217,15 @@ public: if (Kind == Token) return true; - if (Kind != Expression || !Expr) - return false; - // When parsing operands, we can't always tell if something was meant to be // a token, like 'gds', or an expression that references a global variable. // In this case, we assume the string is an expression, and if we need to // interpret is a token, then we treat the symbol name as the token. 
- return isa(Expr); + return isSymbolRefExpr(); + } + + bool isSymbolRefExpr() const { + return isExpr() && Expr && isa(Expr); } bool isImm() const override { @@ -274,8 +276,10 @@ public: isRegClass(AMDGPU::VReg_64RegClassID) || isRegClass(AMDGPU::VReg_96RegClassID) || isRegClass(AMDGPU::VReg_128RegClassID) || + isRegClass(AMDGPU::VReg_160RegClassID) || isRegClass(AMDGPU::VReg_256RegClassID) || - isRegClass(AMDGPU::VReg_512RegClassID); + isRegClass(AMDGPU::VReg_512RegClassID) || + isRegClass(AMDGPU::VReg_1024RegClassID); } bool isVReg32() const { @@ -286,6 +290,10 @@ public: return isOff() || isVReg32(); } + bool isNull() const { + return isRegKind() && getReg() == AMDGPU::SGPR_NULL; + } + bool isSDWAOperand(MVT type) const; bool isSDWAFP16Operand() const; bool isSDWAFP32Operand() const; @@ -325,6 +333,7 @@ public: bool isDLC() const { return isImmTy(ImmTyDLC); } bool isGLC() const { return isImmTy(ImmTyGLC); } bool isSLC() const { return isImmTy(ImmTySLC); } + bool isSWZ() const { return isImmTy(ImmTySWZ); } bool isTFE() const { return isImmTy(ImmTyTFE); } bool isD16() const { return isImmTy(ImmTyD16); } bool isFORMAT() const { return isImmTy(ImmTyFORMAT) && isUInt<8>(getImm()); } @@ -817,6 +826,7 @@ public: case ImmTyDLC: OS << "DLC"; break; case ImmTyGLC: OS << "GLC"; break; case ImmTySLC: OS << "SLC"; break; + case ImmTySWZ: OS << "SWZ"; break; case ImmTyTFE: OS << "TFE"; break; case ImmTyD16: OS << "D16"; break; case ImmTyFORMAT: OS << "FORMAT"; break; @@ -886,7 +896,7 @@ public: int64_t Val, SMLoc Loc, ImmTy Type = ImmTyNone, bool IsFPImm = false) { - auto Op = llvm::make_unique(Immediate, AsmParser); + auto Op = std::make_unique(Immediate, AsmParser); Op->Imm.Val = Val; Op->Imm.IsFPImm = IsFPImm; Op->Imm.Type = Type; @@ -899,7 +909,7 @@ public: static AMDGPUOperand::Ptr CreateToken(const AMDGPUAsmParser *AsmParser, StringRef Str, SMLoc Loc, bool HasExplicitEncodingSize = true) { - auto Res = llvm::make_unique(Token, AsmParser); + auto Res = std::make_unique(Token, AsmParser); Res->Tok.Data = Str.data(); Res->Tok.Length = Str.size(); Res->StartLoc = Loc; @@ -910,7 +920,7 @@ public: static AMDGPUOperand::Ptr CreateReg(const AMDGPUAsmParser *AsmParser, unsigned RegNo, SMLoc S, SMLoc E) { - auto Op = llvm::make_unique(Register, AsmParser); + auto Op = std::make_unique(Register, AsmParser); Op->Reg.RegNo = RegNo; Op->Reg.Mods = Modifiers(); Op->StartLoc = S; @@ -920,7 +930,7 @@ public: static AMDGPUOperand::Ptr CreateExpr(const AMDGPUAsmParser *AsmParser, const class MCExpr *Expr, SMLoc S) { - auto Op = llvm::make_unique(Expression, AsmParser); + auto Op = std::make_unique(Expression, AsmParser); Op->Expr = Expr; Op->StartLoc = S; Op->EndLoc = S; @@ -1051,11 +1061,23 @@ private: std::string &CollectString); bool AddNextRegisterToList(unsigned& Reg, unsigned& RegWidth, - RegisterKind RegKind, unsigned Reg1, - unsigned RegNum); + RegisterKind RegKind, unsigned Reg1); bool ParseAMDGPURegister(RegisterKind& RegKind, unsigned& Reg, - unsigned& RegNum, unsigned& RegWidth, - unsigned *DwordRegIndex); + unsigned& RegNum, unsigned& RegWidth); + unsigned ParseRegularReg(RegisterKind &RegKind, + unsigned &RegNum, + unsigned &RegWidth); + unsigned ParseSpecialReg(RegisterKind &RegKind, + unsigned &RegNum, + unsigned &RegWidth); + unsigned ParseRegList(RegisterKind &RegKind, + unsigned &RegNum, + unsigned &RegWidth); + bool ParseRegRange(unsigned& Num, unsigned& Width); + unsigned getRegularReg(RegisterKind RegKind, + unsigned RegNum, + unsigned RegWidth); + bool isRegister(); bool isRegister(const 
AsmToken &Token, const AsmToken &NextToken) const; Optional getGprCountSymbolName(RegisterKind RegKind); @@ -1306,6 +1328,7 @@ private: bool validateOpSel(const MCInst &Inst); bool validateVccOperand(unsigned Reg) const; bool validateVOP3Literal(const MCInst &Inst) const; + unsigned getConstantBusLimit(unsigned Opcode) const; bool usesConstantBus(const MCInst &Inst, unsigned OpIdx); bool isInlineConstant(const MCInst &Inst, unsigned OpIdx) const; unsigned findImplicitSGPRReadInVOP(const MCInst &Inst) const; @@ -1321,6 +1344,7 @@ private: void peekTokens(MutableArrayRef Tokens); AsmToken::TokenKind getTokenKind() const; bool parseExpr(int64_t &Imm); + bool parseExpr(OperandVector &Operands); StringRef getTokenStr() const; AsmToken peekToken(); AsmToken getToken() const; @@ -1399,9 +1423,12 @@ public: void cvtSdwaVOP1(MCInst &Inst, const OperandVector &Operands); void cvtSdwaVOP2(MCInst &Inst, const OperandVector &Operands); void cvtSdwaVOP2b(MCInst &Inst, const OperandVector &Operands); + void cvtSdwaVOP2e(MCInst &Inst, const OperandVector &Operands); void cvtSdwaVOPC(MCInst &Inst, const OperandVector &Operands); void cvtSDWA(MCInst &Inst, const OperandVector &Operands, - uint64_t BasicInstType, bool skipVcc = false); + uint64_t BasicInstType, + bool SkipDstVcc = false, + bool SkipSrcVcc = false); AMDGPUOperand::Ptr defaultBLGP() const; AMDGPUOperand::Ptr defaultCBSZ() const; @@ -1636,8 +1663,8 @@ bool AMDGPUOperand::isSDWAInt32Operand() const { } bool AMDGPUOperand::isBoolReg() const { - return AsmParser->getFeatureBits()[AMDGPU::FeatureWavefrontSize64] ? - isSCSrcB64() : isSCSrcB32(); + return (AsmParser->getFeatureBits()[AMDGPU::FeatureWavefrontSize64] && isSCSrcB64()) || + (AsmParser->getFeatureBits()[AMDGPU::FeatureWavefrontSize32] && isSCSrcB32()); } uint64_t AMDGPUOperand::applyInputFPModifiers(uint64_t Val, unsigned Size) const @@ -1849,6 +1876,8 @@ static bool isInlineValue(unsigned Reg) { case AMDGPU::SRC_EXECZ: case AMDGPU::SRC_SCC: return true; + case AMDGPU::SGPR_NULL: + return true; default: return false; } @@ -1870,8 +1899,10 @@ static int getRegClass(RegisterKind Is, unsigned RegWidth) { case 2: return AMDGPU::VReg_64RegClassID; case 3: return AMDGPU::VReg_96RegClassID; case 4: return AMDGPU::VReg_128RegClassID; + case 5: return AMDGPU::VReg_160RegClassID; case 8: return AMDGPU::VReg_256RegClassID; case 16: return AMDGPU::VReg_512RegClassID; + case 32: return AMDGPU::VReg_1024RegClassID; } } else if (Is == IS_TTMP) { switch (RegWidth) { @@ -1944,7 +1975,7 @@ static unsigned getSpecialRegForName(StringRef RegName) { .Case("tba_lo", AMDGPU::TBA_LO) .Case("tba_hi", AMDGPU::TBA_HI) .Case("null", AMDGPU::SGPR_NULL) - .Default(0); + .Default(AMDGPU::NoRegister); } bool AMDGPUAsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc, @@ -1959,8 +1990,7 @@ bool AMDGPUAsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc, } bool AMDGPUAsmParser::AddNextRegisterToList(unsigned &Reg, unsigned &RegWidth, - RegisterKind RegKind, unsigned Reg1, - unsigned RegNum) { + RegisterKind RegKind, unsigned Reg1) { switch (RegKind) { case IS_SPECIAL: if (Reg == AMDGPU::EXEC_LO && Reg1 == AMDGPU::EXEC_HI) { @@ -2008,14 +2038,37 @@ bool AMDGPUAsmParser::AddNextRegisterToList(unsigned &Reg, unsigned &RegWidth, } } -static const StringRef Registers[] = { - { "v" }, - { "s" }, - { "ttmp" }, - { "acc" }, - { "a" }, +struct RegInfo { + StringLiteral Name; + RegisterKind Kind; +}; + +static constexpr RegInfo RegularRegisters[] = { + {{"v"}, IS_VGPR}, + {{"s"}, IS_SGPR}, + {{"ttmp"}, IS_TTMP}, + 
{{"acc"}, IS_AGPR}, + {{"a"}, IS_AGPR}, }; +static bool isRegularReg(RegisterKind Kind) { + return Kind == IS_VGPR || + Kind == IS_SGPR || + Kind == IS_TTMP || + Kind == IS_AGPR; +} + +static const RegInfo* getRegularRegInfo(StringRef Str) { + for (const RegInfo &Reg : RegularRegisters) + if (Str.startswith(Reg.Name)) + return &Reg; + return nullptr; +} + +static bool getRegNum(StringRef Str, unsigned& Num) { + return !Str.getAsInteger(10, Num); +} + bool AMDGPUAsmParser::isRegister(const AsmToken &Token, const AsmToken &NextToken) const { @@ -2029,24 +2082,24 @@ AMDGPUAsmParser::isRegister(const AsmToken &Token, // A single register like s0 or a range of registers like s[0:1] - StringRef RegName = Token.getString(); - - for (StringRef Reg : Registers) { - if (RegName.startswith(Reg)) { - if (Reg.size() < RegName.size()) { - unsigned RegNum; - // A single register with an index: rXX - if (!RegName.substr(Reg.size()).getAsInteger(10, RegNum)) - return true; - } else { - // A range of registers: r[XX:YY]. - if (NextToken.is(AsmToken::LBrac)) - return true; - } + StringRef Str = Token.getString(); + const RegInfo *Reg = getRegularRegInfo(Str); + if (Reg) { + StringRef RegName = Reg->Name; + StringRef RegSuffix = Str.substr(RegName.size()); + if (!RegSuffix.empty()) { + unsigned Num; + // A single register with an index: rXX + if (getRegNum(RegSuffix, Num)) + return true; + } else { + // A range of registers: r[XX:YY]. + if (NextToken.is(AsmToken::LBrac)) + return true; } } - return getSpecialRegForName(RegName); + return getSpecialRegForName(Str) != AMDGPU::NoRegister; } bool @@ -2055,137 +2108,161 @@ AMDGPUAsmParser::isRegister() return isRegister(getToken(), peekToken()); } -bool AMDGPUAsmParser::ParseAMDGPURegister(RegisterKind &RegKind, unsigned &Reg, - unsigned &RegNum, unsigned &RegWidth, - unsigned *DwordRegIndex) { - if (DwordRegIndex) { *DwordRegIndex = 0; } +unsigned +AMDGPUAsmParser::getRegularReg(RegisterKind RegKind, + unsigned RegNum, + unsigned RegWidth) { + + assert(isRegularReg(RegKind)); + + unsigned AlignSize = 1; + if (RegKind == IS_SGPR || RegKind == IS_TTMP) { + // SGPR and TTMP registers must be aligned. + // Max required alignment is 4 dwords. + AlignSize = std::min(RegWidth, 4u); + } + + if (RegNum % AlignSize != 0) + return AMDGPU::NoRegister; + + unsigned RegIdx = RegNum / AlignSize; + int RCID = getRegClass(RegKind, RegWidth); + if (RCID == -1) + return AMDGPU::NoRegister; + const MCRegisterInfo *TRI = getContext().getRegisterInfo(); - if (getLexer().is(AsmToken::Identifier)) { - StringRef RegName = Parser.getTok().getString(); - if ((Reg = getSpecialRegForName(RegName))) { - Parser.Lex(); - RegKind = IS_SPECIAL; - } else { - unsigned RegNumIndex = 0; - if (RegName[0] == 'v') { - RegNumIndex = 1; - RegKind = IS_VGPR; - } else if (RegName[0] == 's') { - RegNumIndex = 1; - RegKind = IS_SGPR; - } else if (RegName[0] == 'a') { - RegNumIndex = RegName.startswith("acc") ? 3 : 1; - RegKind = IS_AGPR; - } else if (RegName.startswith("ttmp")) { - RegNumIndex = strlen("ttmp"); - RegKind = IS_TTMP; - } else { - return false; - } - if (RegName.size() > RegNumIndex) { - // Single 32-bit register: vXX. - if (RegName.substr(RegNumIndex).getAsInteger(10, RegNum)) - return false; - Parser.Lex(); - RegWidth = 1; - } else { - // Range of registers: v[XX:YY]. ":YY" is optional. 
- Parser.Lex(); - int64_t RegLo, RegHi; - if (getLexer().isNot(AsmToken::LBrac)) - return false; - Parser.Lex(); + const MCRegisterClass RC = TRI->getRegClass(RCID); + if (RegIdx >= RC.getNumRegs()) + return AMDGPU::NoRegister; - if (getParser().parseAbsoluteExpression(RegLo)) - return false; + return RC.getRegister(RegIdx); +} - const bool isRBrace = getLexer().is(AsmToken::RBrac); - if (!isRBrace && getLexer().isNot(AsmToken::Colon)) - return false; - Parser.Lex(); +bool +AMDGPUAsmParser::ParseRegRange(unsigned& Num, unsigned& Width) { + int64_t RegLo, RegHi; + if (!trySkipToken(AsmToken::LBrac)) + return false; - if (isRBrace) { - RegHi = RegLo; - } else { - if (getParser().parseAbsoluteExpression(RegHi)) - return false; + if (!parseExpr(RegLo)) + return false; - if (getLexer().isNot(AsmToken::RBrac)) - return false; - Parser.Lex(); - } - RegNum = (unsigned) RegLo; - RegWidth = (RegHi - RegLo) + 1; - } - } - } else if (getLexer().is(AsmToken::LBrac)) { - // List of consecutive registers: [s0,s1,s2,s3] - Parser.Lex(); - if (!ParseAMDGPURegister(RegKind, Reg, RegNum, RegWidth, nullptr)) - return false; - if (RegWidth != 1) + if (trySkipToken(AsmToken::Colon)) { + if (!parseExpr(RegHi)) return false; - RegisterKind RegKind1; - unsigned Reg1, RegNum1, RegWidth1; - do { - if (getLexer().is(AsmToken::Comma)) { - Parser.Lex(); - } else if (getLexer().is(AsmToken::RBrac)) { - Parser.Lex(); - break; - } else if (ParseAMDGPURegister(RegKind1, Reg1, RegNum1, RegWidth1, nullptr)) { - if (RegWidth1 != 1) { - return false; - } - if (RegKind1 != RegKind) { - return false; - } - if (!AddNextRegisterToList(Reg, RegWidth, RegKind1, Reg1, RegNum1)) { - return false; - } - } else { - return false; - } - } while (true); } else { - return false; + RegHi = RegLo; } - switch (RegKind) { - case IS_SPECIAL: + + if (!trySkipToken(AsmToken::RBrac)) + return false; + + if (!isUInt<32>(RegLo) || !isUInt<32>(RegHi) || RegLo > RegHi) + return false; + + Num = static_cast(RegLo); + Width = (RegHi - RegLo) + 1; + return true; +} + +unsigned +AMDGPUAsmParser::ParseSpecialReg(RegisterKind &RegKind, + unsigned &RegNum, + unsigned &RegWidth) { + assert(isToken(AsmToken::Identifier)); + unsigned Reg = getSpecialRegForName(getTokenStr()); + if (Reg) { RegNum = 0; RegWidth = 1; - break; - case IS_VGPR: - case IS_SGPR: - case IS_AGPR: - case IS_TTMP: - { - unsigned Size = 1; - if (RegKind == IS_SGPR || RegKind == IS_TTMP) { - // SGPR and TTMP registers must be aligned. Max required alignment is 4 dwords. - Size = std::min(RegWidth, 4u); - } - if (RegNum % Size != 0) - return false; - if (DwordRegIndex) { *DwordRegIndex = RegNum; } - RegNum = RegNum / Size; - int RCID = getRegClass(RegKind, RegWidth); - if (RCID == -1) - return false; - const MCRegisterClass RC = TRI->getRegClass(RCID); - if (RegNum >= RC.getNumRegs()) - return false; - Reg = RC.getRegister(RegNum); - break; + RegKind = IS_SPECIAL; + lex(); // skip register name + } + return Reg; +} + +unsigned +AMDGPUAsmParser::ParseRegularReg(RegisterKind &RegKind, + unsigned &RegNum, + unsigned &RegWidth) { + assert(isToken(AsmToken::Identifier)); + StringRef RegName = getTokenStr(); + + const RegInfo *RI = getRegularRegInfo(RegName); + if (!RI) + return AMDGPU::NoRegister; + lex(); // skip register name + + RegKind = RI->Kind; + StringRef RegSuffix = RegName.substr(RI->Name.size()); + if (!RegSuffix.empty()) { + // Single 32-bit register: vXX. + if (!getRegNum(RegSuffix, RegNum)) + return AMDGPU::NoRegister; + RegWidth = 1; + } else { + // Range of registers: v[XX:YY]. 
":YY" is optional. + if (!ParseRegRange(RegNum, RegWidth)) + return AMDGPU::NoRegister; } - default: - llvm_unreachable("unexpected register kind"); + return getRegularReg(RegKind, RegNum, RegWidth); +} + +unsigned +AMDGPUAsmParser::ParseRegList(RegisterKind &RegKind, + unsigned &RegNum, + unsigned &RegWidth) { + unsigned Reg = AMDGPU::NoRegister; + + if (!trySkipToken(AsmToken::LBrac)) + return AMDGPU::NoRegister; + + // List of consecutive registers, e.g.: [s0,s1,s2,s3] + + if (!ParseAMDGPURegister(RegKind, Reg, RegNum, RegWidth)) + return AMDGPU::NoRegister; + if (RegWidth != 1) + return AMDGPU::NoRegister; + + for (; trySkipToken(AsmToken::Comma); ) { + RegisterKind NextRegKind; + unsigned NextReg, NextRegNum, NextRegWidth; + + if (!ParseAMDGPURegister(NextRegKind, NextReg, NextRegNum, NextRegWidth)) + return AMDGPU::NoRegister; + if (NextRegWidth != 1) + return AMDGPU::NoRegister; + if (NextRegKind != RegKind) + return AMDGPU::NoRegister; + if (!AddNextRegisterToList(Reg, RegWidth, RegKind, NextReg)) + return AMDGPU::NoRegister; } - if (!subtargetHasRegister(*TRI, Reg)) - return false; - return true; + if (!trySkipToken(AsmToken::RBrac)) + return AMDGPU::NoRegister; + + if (isRegularReg(RegKind)) + Reg = getRegularReg(RegKind, RegNum, RegWidth); + + return Reg; +} + +bool AMDGPUAsmParser::ParseAMDGPURegister(RegisterKind &RegKind, + unsigned &Reg, + unsigned &RegNum, + unsigned &RegWidth) { + Reg = AMDGPU::NoRegister; + + if (isToken(AsmToken::Identifier)) { + Reg = ParseSpecialReg(RegKind, RegNum, RegWidth); + if (Reg == AMDGPU::NoRegister) + Reg = ParseRegularReg(RegKind, RegNum, RegWidth); + } else { + Reg = ParseRegList(RegKind, RegNum, RegWidth); + } + + const MCRegisterInfo *TRI = getContext().getRegisterInfo(); + return Reg != AMDGPU::NoRegister && subtargetHasRegister(*TRI, Reg); } Optional @@ -2241,18 +2318,18 @@ std::unique_ptr AMDGPUAsmParser::parseRegister() { SMLoc StartLoc = Tok.getLoc(); SMLoc EndLoc = Tok.getEndLoc(); RegisterKind RegKind; - unsigned Reg, RegNum, RegWidth, DwordRegIndex; + unsigned Reg, RegNum, RegWidth; - if (!ParseAMDGPURegister(RegKind, Reg, RegNum, RegWidth, &DwordRegIndex)) { + if (!ParseAMDGPURegister(RegKind, Reg, RegNum, RegWidth)) { //FIXME: improve error messages (bug 41303). 
Error(StartLoc, "not a valid operand."); return nullptr; } if (AMDGPU::IsaInfo::hasCodeObjectV3(&getSTI())) { - if (!updateGprCountSymbols(RegKind, DwordRegIndex, RegWidth)) + if (!updateGprCountSymbols(RegKind, RegNum, RegWidth)) return nullptr; } else - KernelScope.usesRegister(RegKind, DwordRegIndex, RegWidth); + KernelScope.usesRegister(RegKind, RegNum, RegWidth); return AMDGPUOperand::CreateReg(this, Reg, StartLoc, EndLoc); } @@ -2648,7 +2725,6 @@ unsigned AMDGPUAsmParser::findImplicitSGPRReadInVOP(const MCInst &Inst) const { case AMDGPU::VCC_LO: case AMDGPU::VCC_HI: case AMDGPU::M0: - case AMDGPU::SGPR_NULL: return Reg; default: break; @@ -2697,13 +2773,38 @@ bool AMDGPUAsmParser::isInlineConstant(const MCInst &Inst, } } +unsigned AMDGPUAsmParser::getConstantBusLimit(unsigned Opcode) const { + if (!isGFX10()) + return 1; + + switch (Opcode) { + // 64-bit shift instructions can use only one scalar value input + case AMDGPU::V_LSHLREV_B64: + case AMDGPU::V_LSHLREV_B64_gfx10: + case AMDGPU::V_LSHL_B64: + case AMDGPU::V_LSHRREV_B64: + case AMDGPU::V_LSHRREV_B64_gfx10: + case AMDGPU::V_LSHR_B64: + case AMDGPU::V_ASHRREV_I64: + case AMDGPU::V_ASHRREV_I64_gfx10: + case AMDGPU::V_ASHR_I64: + return 1; + default: + return 2; + } +} + bool AMDGPUAsmParser::usesConstantBus(const MCInst &Inst, unsigned OpIdx) { const MCOperand &MO = Inst.getOperand(OpIdx); if (MO.isImm()) { return !isInlineConstant(Inst, OpIdx); + } else if (MO.isReg()) { + auto Reg = MO.getReg(); + const MCRegisterInfo *TRI = getContext().getRegisterInfo(); + return isSGPR(mc2PseudoReg(Reg), TRI) && Reg != SGPR_NULL; + } else { + return true; } - return !MO.isReg() || - isSGPR(mc2PseudoReg(MO.getReg()), getContext().getRegisterInfo()); } bool AMDGPUAsmParser::validateConstantBusLimitations(const MCInst &Inst) { @@ -2782,10 +2883,7 @@ bool AMDGPUAsmParser::validateConstantBusLimitations(const MCInst &Inst) { } ConstantBusUseCount += NumLiterals; - if (isGFX10()) - return ConstantBusUseCount <= 2; - - return ConstantBusUseCount <= 1; + return ConstantBusUseCount <= getConstantBusLimit(Opcode); } bool AMDGPUAsmParser::validateEarlyClobberLimitations(const MCInst &Inst) { @@ -3212,6 +3310,7 @@ bool AMDGPUAsmParser::validateSOPLiteral(const MCInst &Inst) const { const int OpIndices[] = { Src0Idx, Src1Idx }; + unsigned NumExprs = 0; unsigned NumLiterals = 0; uint32_t LiteralValue; @@ -3219,19 +3318,21 @@ bool AMDGPUAsmParser::validateSOPLiteral(const MCInst &Inst) const { if (OpIdx == -1) break; const MCOperand &MO = Inst.getOperand(OpIdx); - if (MO.isImm() && - // Exclude special imm operands (like that used by s_set_gpr_idx_on) - AMDGPU::isSISrcOperand(Desc, OpIdx) && - !isInlineConstant(Inst, OpIdx)) { - uint32_t Value = static_cast(MO.getImm()); - if (NumLiterals == 0 || LiteralValue != Value) { - LiteralValue = Value; - ++NumLiterals; + // Exclude special imm operands (like that used by s_set_gpr_idx_on) + if (AMDGPU::isSISrcOperand(Desc, OpIdx)) { + if (MO.isImm() && !isInlineConstant(Inst, OpIdx)) { + uint32_t Value = static_cast(MO.getImm()); + if (NumLiterals == 0 || LiteralValue != Value) { + LiteralValue = Value; + ++NumLiterals; + } + } else if (MO.isExpr()) { + ++NumExprs; } } } - return NumLiterals <= 1; + return NumLiterals + NumExprs <= 1; } bool AMDGPUAsmParser::validateOpSel(const MCInst &Inst) { @@ -3267,6 +3368,7 @@ bool AMDGPUAsmParser::validateVOP3Literal(const MCInst &Inst) const { const int OpIndices[] = { Src0Idx, Src1Idx, Src2Idx }; + unsigned NumExprs = 0; unsigned NumLiterals = 0; uint32_t LiteralValue; @@ 
-3274,17 +3376,26 @@ bool AMDGPUAsmParser::validateVOP3Literal(const MCInst &Inst) const { if (OpIdx == -1) break; const MCOperand &MO = Inst.getOperand(OpIdx); - if (!MO.isImm() || !AMDGPU::isSISrcOperand(Desc, OpIdx)) + if (!MO.isImm() && !MO.isExpr()) + continue; + if (!AMDGPU::isSISrcOperand(Desc, OpIdx)) continue; - if (!isInlineConstant(Inst, OpIdx)) { + if (OpIdx == Src2Idx && (Desc.TSFlags & SIInstrFlags::IsMAI) && + getFeatureBits()[AMDGPU::FeatureMFMAInlineLiteralBug]) + return false; + + if (MO.isImm() && !isInlineConstant(Inst, OpIdx)) { uint32_t Value = static_cast(MO.getImm()); if (NumLiterals == 0 || LiteralValue != Value) { LiteralValue = Value; ++NumLiterals; } + } else if (MO.isExpr()) { + ++NumExprs; } } + NumLiterals += NumExprs; return !NumLiterals || (NumLiterals == 1 && getFeatureBits()[AMDGPU::FeatureVOP3Literal]); @@ -3607,37 +3718,44 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() { PARSE_BITS_ENTRY(KD.kernel_code_properties, KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER, Val, ValRange); - UserSGPRCount += 4; + if (Val) + UserSGPRCount += 4; } else if (ID == ".amdhsa_user_sgpr_dispatch_ptr") { PARSE_BITS_ENTRY(KD.kernel_code_properties, KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR, Val, ValRange); - UserSGPRCount += 2; + if (Val) + UserSGPRCount += 2; } else if (ID == ".amdhsa_user_sgpr_queue_ptr") { PARSE_BITS_ENTRY(KD.kernel_code_properties, KERNEL_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR, Val, ValRange); - UserSGPRCount += 2; + if (Val) + UserSGPRCount += 2; } else if (ID == ".amdhsa_user_sgpr_kernarg_segment_ptr") { PARSE_BITS_ENTRY(KD.kernel_code_properties, KERNEL_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR, Val, ValRange); - UserSGPRCount += 2; + if (Val) + UserSGPRCount += 2; } else if (ID == ".amdhsa_user_sgpr_dispatch_id") { PARSE_BITS_ENTRY(KD.kernel_code_properties, KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID, Val, ValRange); - UserSGPRCount += 2; + if (Val) + UserSGPRCount += 2; } else if (ID == ".amdhsa_user_sgpr_flat_scratch_init") { PARSE_BITS_ENTRY(KD.kernel_code_properties, KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT, Val, ValRange); - UserSGPRCount += 2; + if (Val) + UserSGPRCount += 2; } else if (ID == ".amdhsa_user_sgpr_private_segment_size") { PARSE_BITS_ENTRY(KD.kernel_code_properties, KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE, Val, ValRange); - UserSGPRCount += 1; + if (Val) + UserSGPRCount += 1; } else if (ID == ".amdhsa_wavefront_size32") { if (IVersion.Major < 10) return getParser().Error(IDRange.Start, "directive requires gfx10+", @@ -5224,6 +5342,23 @@ AMDGPUAsmParser::parseExpr(int64_t &Imm) { return !getParser().parseAbsoluteExpression(Imm); } +bool +AMDGPUAsmParser::parseExpr(OperandVector &Operands) { + SMLoc S = getLoc(); + + const MCExpr *Expr; + if (Parser.parseExpression(Expr)) + return false; + + int64_t IntVal; + if (Expr->evaluateAsAbsolute(IntVal)) { + Operands.push_back(AMDGPUOperand::CreateImm(this, IntVal, S)); + } else { + Operands.push_back(AMDGPUOperand::CreateExpr(this, Expr, S)); + } + return true; +} + bool AMDGPUAsmParser::parseString(StringRef &Val, const StringRef ErrMsg) { if (isToken(AsmToken::String)) { @@ -5605,25 +5740,29 @@ bool AMDGPUOperand::isGPRIdxMode() const { OperandMatchResultTy AMDGPUAsmParser::parseSOppBrTarget(OperandVector &Operands) { - SMLoc S = Parser.getTok().getLoc(); - switch (getLexer().getKind()) { - default: return MatchOperand_ParseFail; - case AsmToken::Integer: { - int64_t Imm; - if (getParser().parseAbsoluteExpression(Imm)) - return 
MatchOperand_ParseFail; - Operands.push_back(AMDGPUOperand::CreateImm(this, Imm, S)); - return MatchOperand_Success; - } + // Make sure we are not parsing something + // that looks like a label or an expression but is not. + // This will improve error messages. + if (isRegister() || isModifier()) + return MatchOperand_NoMatch; - case AsmToken::Identifier: - Operands.push_back(AMDGPUOperand::CreateExpr(this, - MCSymbolRefExpr::create(getContext().getOrCreateSymbol( - Parser.getTok().getString()), getContext()), S)); - Parser.Lex(); - return MatchOperand_Success; + if (parseExpr(Operands)) { + + AMDGPUOperand &Opr = ((AMDGPUOperand &)*Operands[Operands.size() - 1]); + assert(Opr.isImm() || Opr.isExpr()); + SMLoc Loc = Opr.getStartLoc(); + + // Currently we do not support arbitrary expressions as branch targets. + // Only labels and absolute expressions are accepted. + if (Opr.isExpr() && !Opr.isSymbolRefExpr()) { + Error(Loc, "expected an absolute expression or a label"); + } else if (Opr.isImm() && !Opr.isS16Imm()) { + Error(Loc, "expected a 16-bit signed jump offset"); + } } + + return MatchOperand_Success; // avoid excessive error messages } //===----------------------------------------------------------------------===// @@ -5908,6 +6047,7 @@ static const OptionalOperand AMDGPUOptionalOperandTable[] = { {"format", AMDGPUOperand::ImmTyFORMAT, false, nullptr}, {"glc", AMDGPUOperand::ImmTyGLC, true, nullptr}, {"slc", AMDGPUOperand::ImmTySLC, true, nullptr}, + {"swz", AMDGPUOperand::ImmTySWZ, true, nullptr}, {"tfe", AMDGPUOperand::ImmTyTFE, true, nullptr}, {"d16", AMDGPUOperand::ImmTyD16, true, nullptr}, {"high", AMDGPUOperand::ImmTyHigh, true, nullptr}, @@ -5941,8 +6081,6 @@ static const OptionalOperand AMDGPUOptionalOperandTable[] = { }; OperandMatchResultTy AMDGPUAsmParser::parseOptionalOperand(OperandVector &Operands) { - unsigned size = Operands.size(); - assert(size > 0); OperandMatchResultTy res = parseOptionalOpr(Operands); @@ -5957,17 +6095,13 @@ OperandMatchResultTy AMDGPUAsmParser::parseOptionalOperand(OperandVector &Operan // to make sure autogenerated parser of custom operands never hit hardcoded // mandatory operands. - if (size == 1 || ((AMDGPUOperand &)*Operands[size - 1]).isRegKind()) { - - // We have parsed the first optional operand. - // Parse as many operands as necessary to skip all mandatory operands. 
+ for (unsigned i = 0; i < MAX_OPR_LOOKAHEAD; ++i) { + if (res != MatchOperand_Success || + isToken(AsmToken::EndOfStatement)) + break; - for (unsigned i = 0; i < MAX_OPR_LOOKAHEAD; ++i) { - if (res != MatchOperand_Success || - getLexer().is(AsmToken::EndOfStatement)) break; - if (getLexer().is(AsmToken::Comma)) Parser.Lex(); - res = parseOptionalOpr(Operands); - } + trySkipToken(AsmToken::Comma); + res = parseOptionalOpr(Operands); } return res; @@ -6682,7 +6816,11 @@ void AMDGPUAsmParser::cvtSdwaVOP2(MCInst &Inst, const OperandVector &Operands) { } void AMDGPUAsmParser::cvtSdwaVOP2b(MCInst &Inst, const OperandVector &Operands) { - cvtSDWA(Inst, Operands, SIInstrFlags::VOP2, true); + cvtSDWA(Inst, Operands, SIInstrFlags::VOP2, true, true); +} + +void AMDGPUAsmParser::cvtSdwaVOP2e(MCInst &Inst, const OperandVector &Operands) { + cvtSDWA(Inst, Operands, SIInstrFlags::VOP2, false, true); } void AMDGPUAsmParser::cvtSdwaVOPC(MCInst &Inst, const OperandVector &Operands) { @@ -6690,11 +6828,14 @@ void AMDGPUAsmParser::cvtSdwaVOPC(MCInst &Inst, const OperandVector &Operands) { } void AMDGPUAsmParser::cvtSDWA(MCInst &Inst, const OperandVector &Operands, - uint64_t BasicInstType, bool skipVcc) { + uint64_t BasicInstType, + bool SkipDstVcc, + bool SkipSrcVcc) { using namespace llvm::AMDGPU::SDWA; OptionalImmIndexMap OptionalIdx; - bool skippedVcc = false; + bool SkipVcc = SkipDstVcc || SkipSrcVcc; + bool SkippedVcc = false; unsigned I = 1; const MCInstrDesc &Desc = MII.get(Inst.getOpcode()); @@ -6704,19 +6845,21 @@ void AMDGPUAsmParser::cvtSDWA(MCInst &Inst, const OperandVector &Operands, for (unsigned E = Operands.size(); I != E; ++I) { AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[I]); - if (skipVcc && !skippedVcc && Op.isReg() && + if (SkipVcc && !SkippedVcc && Op.isReg() && (Op.getReg() == AMDGPU::VCC || Op.getReg() == AMDGPU::VCC_LO)) { // VOP2b (v_add_u32, v_sub_u32 ...) sdwa use "vcc" token as dst. // Skip it if it's 2nd (e.g. v_add_i32_sdwa v1, vcc, v2, v3) // or 4th (v_addc_u32_sdwa v1, vcc, v2, v3, vcc) operand. // Skip VCC only if we didn't skip it on previous iteration. + // Note that src0 and src1 occupy 2 slots each because of modifiers. if (BasicInstType == SIInstrFlags::VOP2 && - (Inst.getNumOperands() == 1 || Inst.getNumOperands() == 5)) { - skippedVcc = true; + ((SkipDstVcc && Inst.getNumOperands() == 1) || + (SkipSrcVcc && Inst.getNumOperands() == 5))) { + SkippedVcc = true; continue; } else if (BasicInstType == SIInstrFlags::VOPC && Inst.getNumOperands() == 0) { - skippedVcc = true; + SkippedVcc = true; continue; } } @@ -6728,7 +6871,7 @@ void AMDGPUAsmParser::cvtSDWA(MCInst &Inst, const OperandVector &Operands, } else { llvm_unreachable("Invalid operand type"); } - skippedVcc = false; + SkippedVcc = false; } if (Inst.getOpcode() != AMDGPU::V_NOP_sdwa_gfx10 && @@ -6849,6 +6992,14 @@ unsigned AMDGPUAsmParser::validateTargetOperandClass(MCParsedAsmOperand &Op, return Operand.isInterpAttr() ? Match_Success : Match_InvalidOperand; case MCK_AttrChan: return Operand.isAttrChan() ? Match_Success : Match_InvalidOperand; + case MCK_SReg_64: + case MCK_SReg_64_XEXEC: + // Null is defined as a 32-bit register but + // it should also be enabled with 64-bit operands. + // The following code enables it for SReg_64 operands + // used as source and destination. Remaining source + // operands are handled in isInlinableImm. + return Operand.isNull() ? 
Match_Success : Match_InvalidOperand; default: return Match_InvalidOperand; } diff --git a/lib/Target/AMDGPU/BUFInstructions.td b/lib/Target/AMDGPU/BUFInstructions.td index 62a19d848af2..1b12550aed88 100644 --- a/lib/Target/AMDGPU/BUFInstructions.td +++ b/lib/Target/AMDGPU/BUFInstructions.td @@ -7,13 +7,13 @@ //===----------------------------------------------------------------------===// def MUBUFAddr32 : ComplexPattern; -def MUBUFAddr64 : ComplexPattern; +def MUBUFAddr64 : ComplexPattern; def MUBUFAddr64Atomic : ComplexPattern; def MUBUFScratchOffen : ComplexPattern; def MUBUFScratchOffset : ComplexPattern; -def MUBUFOffset : ComplexPattern; +def MUBUFOffset : ComplexPattern; def MUBUFOffsetNoGLC : ComplexPattern; def MUBUFOffsetAtomic : ComplexPattern; @@ -54,6 +54,17 @@ class MTBUFAddr64Table { // MTBUF classes //===----------------------------------------------------------------------===// +class MTBUFGetBaseOpcode { + string ret = !subst("FORMAT_XY", "FORMAT_X", + !subst("FORMAT_XYZ", "FORMAT_X", + !subst("FORMAT_XYZW", "FORMAT_X", Op))); +} + +class getMTBUFElements { + int ret = 1; +} + + class MTBUF_Pseudo pattern=[]> : InstSI, @@ -67,6 +78,9 @@ class MTBUF_Pseudo (NAME); + Instruction BaseOpcode = !cast(MTBUFGetBaseOpcode.ret); + let VM_CNT = 1; let EXP_CNT = 1; let MTBUF = 1; @@ -90,6 +104,7 @@ class MTBUF_Pseudo has_offset = 1; bits<1> has_slc = 1; bits<1> has_tfe = 1; + bits<4> elements = 0; } class MTBUF_Real : @@ -126,17 +141,17 @@ class getMTBUFInsDA vdataList, RegisterClass vaddrClass = !if(!empty(vaddrList), ?, !head(vaddrList)); dag InsNoData = !if(!empty(vaddrList), (ins SReg_128:$srsrc, SCSrc_b32:$soffset, - offset:$offset, FORMAT:$format, GLC:$glc, SLC:$slc, TFE:$tfe, DLC:$dlc), + offset:$offset, FORMAT:$format, GLC:$glc, SLC:$slc, TFE:$tfe, DLC:$dlc, SWZ:$swz), (ins vaddrClass:$vaddr, SReg_128:$srsrc, SCSrc_b32:$soffset, - offset:$offset, FORMAT:$format, GLC:$glc, SLC:$slc, TFE:$tfe, DLC:$dlc) + offset:$offset, FORMAT:$format, GLC:$glc, SLC:$slc, TFE:$tfe, DLC:$dlc, SWZ:$swz) ); dag InsData = !if(!empty(vaddrList), (ins vdataClass:$vdata, SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset, FORMAT:$format, GLC:$glc, - SLC:$slc, TFE:$tfe, DLC:$dlc), + SLC:$slc, TFE:$tfe, DLC:$dlc, SWZ:$swz), (ins vdataClass:$vdata, vaddrClass:$vaddr, SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset, FORMAT:$format, GLC:$glc, - SLC:$slc, TFE:$tfe, DLC:$dlc) + SLC:$slc, TFE:$tfe, DLC:$dlc, SWZ:$swz) ); dag ret = !if(!empty(vdataList), InsNoData, InsData); } @@ -181,51 +196,54 @@ class MTBUF_SetupAddr { class MTBUF_Load_Pseudo pattern=[], // Workaround bug bz30254 int addrKindCopy = addrKind> : MTBUF_Pseudo.ret, - " $vdata, " # getMTBUFAsmOps.ret # "$glc$slc$tfe$dlc", + " $vdata, " # getMTBUFAsmOps.ret # "$glc$slc$tfe$dlc$swz", pattern>, MTBUF_SetupAddr { let PseudoInstr = opName # "_" # getAddrName.ret; let mayLoad = 1; let mayStore = 0; + let elements = elems; } multiclass MTBUF_Pseudo_Loads { - def _OFFSET : MTBUF_Load_Pseudo , + i1:$glc, i1:$slc, i1:$tfe, i1:$dlc, i1:$swz)))]>, MTBUFAddr64Table<0, NAME>; - def _ADDR64 : MTBUF_Load_Pseudo , + i8:$format, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc, i1:$swz)))]>, MTBUFAddr64Table<1, NAME>; - def _OFFEN : MTBUF_Load_Pseudo ; - def _IDXEN : MTBUF_Load_Pseudo ; - def _BOTHEN : MTBUF_Load_Pseudo ; + def _OFFEN : MTBUF_Load_Pseudo ; + def _IDXEN : MTBUF_Load_Pseudo ; + def _BOTHEN : MTBUF_Load_Pseudo ; let DisableWQM = 1 in { - def _OFFSET_exact : MTBUF_Load_Pseudo ; - def _OFFEN_exact : MTBUF_Load_Pseudo ; - def _IDXEN_exact : MTBUF_Load_Pseudo 
; - def _BOTHEN_exact : MTBUF_Load_Pseudo ; + def _OFFSET_exact : MTBUF_Load_Pseudo ; + def _OFFEN_exact : MTBUF_Load_Pseudo ; + def _IDXEN_exact : MTBUF_Load_Pseudo ; + def _BOTHEN_exact : MTBUF_Load_Pseudo ; } } class MTBUF_Store_Pseudo pattern=[], // Workaround bug bz30254 int addrKindCopy = addrKind, @@ -233,39 +251,40 @@ class MTBUF_Store_Pseudo .ret, - " $vdata, " # getMTBUFAsmOps.ret # "$glc$slc$tfe$dlc", + " $vdata, " # getMTBUFAsmOps.ret # "$glc$slc$tfe$dlc$swz", pattern>, MTBUF_SetupAddr { let PseudoInstr = opName # "_" # getAddrName.ret; let mayLoad = 0; let mayStore = 1; + let elements = elems; } multiclass MTBUF_Pseudo_Stores { - def _OFFSET : MTBUF_Store_Pseudo , + i1:$slc, i1:$tfe, i1:$dlc, i1:$swz))]>, MTBUFAddr64Table<0, NAME>; - def _ADDR64 : MTBUF_Store_Pseudo , + i1:$slc, i1:$tfe, i1:$dlc, i1:$swz))]>, MTBUFAddr64Table<1, NAME>; - def _OFFEN : MTBUF_Store_Pseudo ; - def _IDXEN : MTBUF_Store_Pseudo ; - def _BOTHEN : MTBUF_Store_Pseudo ; + def _OFFEN : MTBUF_Store_Pseudo ; + def _IDXEN : MTBUF_Store_Pseudo ; + def _BOTHEN : MTBUF_Store_Pseudo ; let DisableWQM = 1 in { - def _OFFSET_exact : MTBUF_Store_Pseudo ; - def _OFFEN_exact : MTBUF_Store_Pseudo ; - def _IDXEN_exact : MTBUF_Store_Pseudo ; - def _BOTHEN_exact : MTBUF_Store_Pseudo ; + def _OFFSET_exact : MTBUF_Store_Pseudo ; + def _OFFEN_exact : MTBUF_Store_Pseudo ; + def _IDXEN_exact : MTBUF_Store_Pseudo ; + def _BOTHEN_exact : MTBUF_Store_Pseudo ; } } @@ -320,7 +339,7 @@ class MUBUF_Pseudo has_offset = 1; bits<1> has_slc = 1; bits<1> has_tfe = 1; - bits<4> dwords = 0; + bits<4> elements = 0; } class MUBUF_Real : @@ -393,18 +412,30 @@ class getMUBUFInsDA vdataList, ); dag ret = !con( !if(!empty(vdataList), InsNoData, InsData), - !if(isLds, (ins DLC:$dlc), (ins TFE:$tfe, DLC:$dlc)) + !if(isLds, (ins DLC:$dlc, SWZ:$swz), (ins TFE:$tfe, DLC:$dlc,SWZ:$swz)) ); } -class getMUBUFDwords { - string regClassAsInt = !cast(regClass); +class getMUBUFElements { + // eq does not support ValueType for some reason. 
+ string vtAsStr = !cast(vt); + int ret = - !if(!eq(regClassAsInt, !cast(VGPR_32)), 1, - !if(!eq(regClassAsInt, !cast(VReg_64)), 2, - !if(!eq(regClassAsInt, !cast(VReg_96)), 3, - !if(!eq(regClassAsInt, !cast(VReg_128)), 4, - 0)))); + !if(!eq(vtAsStr, "f16"), 1, + !if(!eq(vtAsStr, "v2f16"), 2, + !if(!eq(vtAsStr, "v3f16"), 3, + !if(!eq(vtAsStr, "v4f16"), 4, + !if(!eq(vt.Size, 32), 1, + !if(!eq(vt.Size, 64), 2, + !if(!eq(vt.Size, 96), 3, + !if(!eq(vt.Size, 128), 4, 0) + ) + ) + ) + ) + ) + ) + ); } class getMUBUFIns vdataList=[], bit isLds = 0> { @@ -442,18 +473,18 @@ class MUBUF_SetupAddr { class MUBUF_Load_Pseudo pattern=[], // Workaround bug bz30254 int addrKindCopy = addrKind> : MUBUF_Pseudo.ret:$vdata), !con(getMUBUFIns.ret, - !if(HasTiedDest, (ins vdataClass:$vdata_in), (ins))), + !if(HasTiedDest, (ins getVregSrcForVT.ret:$vdata_in), (ins))), " $vdata, " # getMUBUFAsmOps.ret # "$glc$slc" # - !if(isLds, " lds", "$tfe") # "$dlc", + !if(isLds, " lds", "$tfe") # "$dlc" # "$swz", pattern>, MUBUF_SetupAddr { let PseudoInstr = opName # !if(isLds, "_lds", "") # @@ -467,19 +498,19 @@ class MUBUF_Load_Pseudo .ret; + let elements = getMUBUFElements.ret; } class MUBUF_Offset_Load_Pat : Pat < - (load_vt (ld (MUBUFOffset v4i32:$srsrc, i32:$soffset, i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc))), - (load_vt (inst v4i32:$srsrc, i32:$soffset, i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc)) + (load_vt (ld (MUBUFOffset v4i32:$srsrc, i32:$soffset, i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc, i1:$swz))), + (load_vt (inst v4i32:$srsrc, i32:$soffset, i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc, i1:$swz)) >; class MUBUF_Addr64_Load_Pat : Pat < - (load_vt (ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc))), - (load_vt (inst i64:$vaddr, v4i32:$srsrc, i32:$soffset, i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc)) + (load_vt (ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc, i1:$swz))), + (load_vt (inst i64:$vaddr, v4i32:$srsrc, i32:$soffset, i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc, i1:$swz)) >; multiclass MUBUF_Pseudo_Load_Pats { @@ -490,89 +521,87 @@ multiclass MUBUF_Pseudo_Load_Pats { - def _OFFSET : MUBUF_Load_Pseudo , + def _OFFSET : MUBUF_Load_Pseudo , MUBUFAddr64Table<0, NAME # !if(isLds, "_LDS", "")>; - def _ADDR64 : MUBUF_Load_Pseudo , + def _ADDR64 : MUBUF_Load_Pseudo , MUBUFAddr64Table<1, NAME # !if(isLds, "_LDS", "")>; - def _OFFEN : MUBUF_Load_Pseudo ; - def _IDXEN : MUBUF_Load_Pseudo ; - def _BOTHEN : MUBUF_Load_Pseudo ; + def _OFFEN : MUBUF_Load_Pseudo ; + def _IDXEN : MUBUF_Load_Pseudo ; + def _BOTHEN : MUBUF_Load_Pseudo ; let DisableWQM = 1 in { - def _OFFSET_exact : MUBUF_Load_Pseudo ; - def _OFFEN_exact : MUBUF_Load_Pseudo ; - def _IDXEN_exact : MUBUF_Load_Pseudo ; - def _BOTHEN_exact : MUBUF_Load_Pseudo ; + def _OFFSET_exact : MUBUF_Load_Pseudo ; + def _OFFEN_exact : MUBUF_Load_Pseudo ; + def _IDXEN_exact : MUBUF_Load_Pseudo ; + def _BOTHEN_exact : MUBUF_Load_Pseudo ; } } -multiclass MUBUF_Pseudo_Loads_Lds { - defm NAME : MUBUF_Pseudo_Loads; - defm _LDS : MUBUF_Pseudo_Loads; + defm NAME : MUBUF_Pseudo_Loads; + defm _LDS : MUBUF_Pseudo_Loads; } class MUBUF_Store_Pseudo pattern=[], // Workaround bug bz30254 - int addrKindCopy = addrKind, - RegisterClass vdataClassCopy = vdataClass> + int addrKindCopy = addrKind> : MUBUF_Pseudo.ret, - " $vdata, " # getMUBUFAsmOps.ret # "$glc$slc$tfe$dlc", + getMUBUFIns.ret]>.ret, + " $vdata, " # getMUBUFAsmOps.ret # 
"$glc$slc$tfe$dlc$swz", pattern>, MUBUF_SetupAddr { let PseudoInstr = opName # "_" # getAddrName.ret; let mayLoad = 0; let mayStore = 1; let maybeAtomic = 1; - let dwords = getMUBUFDwords.ret; + let elements = getMUBUFElements.ret; } -multiclass MUBUF_Pseudo_Stores { - def _OFFSET : MUBUF_Store_Pseudo , + i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc, i1:$swz))]>, MUBUFAddr64Table<0, NAME>; - def _ADDR64 : MUBUF_Store_Pseudo , + i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc, i1:$swz))]>, MUBUFAddr64Table<1, NAME>; - def _OFFEN : MUBUF_Store_Pseudo ; - def _IDXEN : MUBUF_Store_Pseudo ; - def _BOTHEN : MUBUF_Store_Pseudo ; + def _OFFEN : MUBUF_Store_Pseudo ; + def _IDXEN : MUBUF_Store_Pseudo ; + def _BOTHEN : MUBUF_Store_Pseudo ; let DisableWQM = 1 in { - def _OFFSET_exact : MUBUF_Store_Pseudo ; - def _OFFEN_exact : MUBUF_Store_Pseudo ; - def _IDXEN_exact : MUBUF_Store_Pseudo ; - def _BOTHEN_exact : MUBUF_Store_Pseudo ; + def _OFFSET_exact : MUBUF_Store_Pseudo ; + def _OFFEN_exact : MUBUF_Store_Pseudo ; + def _IDXEN_exact : MUBUF_Store_Pseudo ; + def _BOTHEN_exact : MUBUF_Store_Pseudo ; } } class MUBUF_Pseudo_Store_Lds : MUBUF_Pseudo { + (ins SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset, GLC:$glc, SLC:$slc, SWZ:$swz), + " $srsrc, $soffset$offset lds$glc$slc$swz"> { let mayLoad = 0; let mayStore = 1; let maybeAtomic = 1; @@ -686,7 +715,7 @@ multiclass MUBUF_Pseudo_Atomics_NO_RTN .ret> { + bit isFP = isFloatType.ret> { let FPAtomic = isFP in def _OFFSET : MUBUF_AtomicNoRet_Pseudo , MUBUFAddr64Table <0, NAME>; @@ -710,7 +739,7 @@ multiclass MUBUF_Pseudo_Atomics_RTN .ret> { + bit isFP = isFloatType.ret> { let FPAtomic = isFP in def _OFFSET_RTN : MUBUF_AtomicRet_Pseudo ; defm BUFFER_LOAD_FORMAT_XY : MUBUF_Pseudo_Loads < - "buffer_load_format_xy", VReg_64 + "buffer_load_format_xy", v2f32 >; defm BUFFER_LOAD_FORMAT_XYZ : MUBUF_Pseudo_Loads < - "buffer_load_format_xyz", VReg_96 + "buffer_load_format_xyz", v3f32 >; defm BUFFER_LOAD_FORMAT_XYZW : MUBUF_Pseudo_Loads < - "buffer_load_format_xyzw", VReg_128 + "buffer_load_format_xyzw", v4f32 >; defm BUFFER_STORE_FORMAT_X : MUBUF_Pseudo_Stores < - "buffer_store_format_x", VGPR_32 + "buffer_store_format_x", f32 >; defm BUFFER_STORE_FORMAT_XY : MUBUF_Pseudo_Stores < - "buffer_store_format_xy", VReg_64 + "buffer_store_format_xy", v2f32 >; defm BUFFER_STORE_FORMAT_XYZ : MUBUF_Pseudo_Stores < - "buffer_store_format_xyz", VReg_96 + "buffer_store_format_xyz", v3f32 >; defm BUFFER_STORE_FORMAT_XYZW : MUBUF_Pseudo_Stores < - "buffer_store_format_xyzw", VReg_128 + "buffer_store_format_xyzw", v4f32 >; let SubtargetPredicate = HasUnpackedD16VMem, D16Buf = 1 in { defm BUFFER_LOAD_FORMAT_D16_X_gfx80 : MUBUF_Pseudo_Loads < - "buffer_load_format_d16_x", VGPR_32 + "buffer_load_format_d16_x", i32 >; defm BUFFER_LOAD_FORMAT_D16_XY_gfx80 : MUBUF_Pseudo_Loads < - "buffer_load_format_d16_xy", VReg_64 + "buffer_load_format_d16_xy", v2i32 >; defm BUFFER_LOAD_FORMAT_D16_XYZ_gfx80 : MUBUF_Pseudo_Loads < - "buffer_load_format_d16_xyz", VReg_96 + "buffer_load_format_d16_xyz", v3i32 >; defm BUFFER_LOAD_FORMAT_D16_XYZW_gfx80 : MUBUF_Pseudo_Loads < - "buffer_load_format_d16_xyzw", VReg_128 + "buffer_load_format_d16_xyzw", v4i32 >; defm BUFFER_STORE_FORMAT_D16_X_gfx80 : MUBUF_Pseudo_Stores < - "buffer_store_format_d16_x", VGPR_32 + "buffer_store_format_d16_x", i32 >; defm BUFFER_STORE_FORMAT_D16_XY_gfx80 : MUBUF_Pseudo_Stores < - "buffer_store_format_d16_xy", VReg_64 + "buffer_store_format_d16_xy", v2i32 >; defm BUFFER_STORE_FORMAT_D16_XYZ_gfx80 : MUBUF_Pseudo_Stores < - 
"buffer_store_format_d16_xyz", VReg_96 + "buffer_store_format_d16_xyz", v3i32 >; defm BUFFER_STORE_FORMAT_D16_XYZW_gfx80 : MUBUF_Pseudo_Stores < - "buffer_store_format_d16_xyzw", VReg_128 + "buffer_store_format_d16_xyzw", v4i32 >; } // End HasUnpackedD16VMem. let SubtargetPredicate = HasPackedD16VMem, D16Buf = 1 in { defm BUFFER_LOAD_FORMAT_D16_X : MUBUF_Pseudo_Loads < - "buffer_load_format_d16_x", VGPR_32 + "buffer_load_format_d16_x", f16 >; defm BUFFER_LOAD_FORMAT_D16_XY : MUBUF_Pseudo_Loads < - "buffer_load_format_d16_xy", VGPR_32 + "buffer_load_format_d16_xy", v2f16 >; defm BUFFER_LOAD_FORMAT_D16_XYZ : MUBUF_Pseudo_Loads < - "buffer_load_format_d16_xyz", VReg_64 + "buffer_load_format_d16_xyz", v3f16 >; defm BUFFER_LOAD_FORMAT_D16_XYZW : MUBUF_Pseudo_Loads < - "buffer_load_format_d16_xyzw", VReg_64 + "buffer_load_format_d16_xyzw", v4f16 >; defm BUFFER_STORE_FORMAT_D16_X : MUBUF_Pseudo_Stores < - "buffer_store_format_d16_x", VGPR_32 + "buffer_store_format_d16_x", f16 >; defm BUFFER_STORE_FORMAT_D16_XY : MUBUF_Pseudo_Stores < - "buffer_store_format_d16_xy", VGPR_32 + "buffer_store_format_d16_xy", v2f16 >; defm BUFFER_STORE_FORMAT_D16_XYZ : MUBUF_Pseudo_Stores < - "buffer_store_format_d16_xyz", VReg_64 + "buffer_store_format_d16_xyz", v3f16 >; defm BUFFER_STORE_FORMAT_D16_XYZW : MUBUF_Pseudo_Stores < - "buffer_store_format_d16_xyzw", VReg_64 + "buffer_store_format_d16_xyzw", v4f16 >; } // End HasPackedD16VMem. defm BUFFER_LOAD_UBYTE : MUBUF_Pseudo_Loads_Lds < - "buffer_load_ubyte", VGPR_32, i32 + "buffer_load_ubyte", i32 >; defm BUFFER_LOAD_SBYTE : MUBUF_Pseudo_Loads_Lds < - "buffer_load_sbyte", VGPR_32, i32 + "buffer_load_sbyte", i32 >; defm BUFFER_LOAD_USHORT : MUBUF_Pseudo_Loads_Lds < - "buffer_load_ushort", VGPR_32, i32 + "buffer_load_ushort", i32 >; defm BUFFER_LOAD_SSHORT : MUBUF_Pseudo_Loads_Lds < - "buffer_load_sshort", VGPR_32, i32 + "buffer_load_sshort", i32 >; defm BUFFER_LOAD_DWORD : MUBUF_Pseudo_Loads_Lds < - "buffer_load_dword", VGPR_32, i32 + "buffer_load_dword", i32 >; defm BUFFER_LOAD_DWORDX2 : MUBUF_Pseudo_Loads < - "buffer_load_dwordx2", VReg_64, v2i32 + "buffer_load_dwordx2", v2i32 >; defm BUFFER_LOAD_DWORDX3 : MUBUF_Pseudo_Loads < - "buffer_load_dwordx3", VReg_96, v3i32 + "buffer_load_dwordx3", v3i32 >; defm BUFFER_LOAD_DWORDX4 : MUBUF_Pseudo_Loads < - "buffer_load_dwordx4", VReg_128, v4i32 + "buffer_load_dwordx4", v4i32 >; defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_UBYTE", i32, extloadi8_global>; @@ -867,111 +896,111 @@ defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_DWORDX4", v4i32, load_global>; // in at least GFX8+ chips. See Bug 37653. 
let SubtargetPredicate = isGFX8GFX9 in { defm BUFFER_LOAD_DWORDX2_LDS : MUBUF_Pseudo_Loads < - "buffer_load_dwordx2", VReg_64, v2i32, null_frag, 0, 1 + "buffer_load_dwordx2", v2i32, null_frag, 0, 1 >; defm BUFFER_LOAD_DWORDX3_LDS : MUBUF_Pseudo_Loads < - "buffer_load_dwordx3", VReg_96, untyped, null_frag, 0, 1 + "buffer_load_dwordx3", v3i32, null_frag, 0, 1 >; defm BUFFER_LOAD_DWORDX4_LDS : MUBUF_Pseudo_Loads < - "buffer_load_dwordx4", VReg_128, v4i32, null_frag, 0, 1 + "buffer_load_dwordx4", v4i32, null_frag, 0, 1 >; } defm BUFFER_STORE_BYTE : MUBUF_Pseudo_Stores < - "buffer_store_byte", VGPR_32, i32, truncstorei8_global + "buffer_store_byte", i32, truncstorei8_global >; defm BUFFER_STORE_SHORT : MUBUF_Pseudo_Stores < - "buffer_store_short", VGPR_32, i32, truncstorei16_global + "buffer_store_short", i32, truncstorei16_global >; defm BUFFER_STORE_DWORD : MUBUF_Pseudo_Stores < - "buffer_store_dword", VGPR_32, i32, store_global + "buffer_store_dword", i32, store_global >; defm BUFFER_STORE_DWORDX2 : MUBUF_Pseudo_Stores < - "buffer_store_dwordx2", VReg_64, v2i32, store_global + "buffer_store_dwordx2", v2i32, store_global >; defm BUFFER_STORE_DWORDX3 : MUBUF_Pseudo_Stores < - "buffer_store_dwordx3", VReg_96, v3i32, store_global + "buffer_store_dwordx3", v3i32, store_global >; defm BUFFER_STORE_DWORDX4 : MUBUF_Pseudo_Stores < - "buffer_store_dwordx4", VReg_128, v4i32, store_global + "buffer_store_dwordx4", v4i32, store_global >; defm BUFFER_ATOMIC_SWAP : MUBUF_Pseudo_Atomics < - "buffer_atomic_swap", VGPR_32, i32, atomic_swap_global + "buffer_atomic_swap", VGPR_32, i32, atomic_swap_global_32 >; defm BUFFER_ATOMIC_CMPSWAP : MUBUF_Pseudo_Atomics < "buffer_atomic_cmpswap", VReg_64, v2i32, null_frag >; defm BUFFER_ATOMIC_ADD : MUBUF_Pseudo_Atomics < - "buffer_atomic_add", VGPR_32, i32, atomic_add_global + "buffer_atomic_add", VGPR_32, i32, atomic_load_add_global_32 >; defm BUFFER_ATOMIC_SUB : MUBUF_Pseudo_Atomics < - "buffer_atomic_sub", VGPR_32, i32, atomic_sub_global + "buffer_atomic_sub", VGPR_32, i32, atomic_load_sub_global_32 >; defm BUFFER_ATOMIC_SMIN : MUBUF_Pseudo_Atomics < - "buffer_atomic_smin", VGPR_32, i32, atomic_min_global + "buffer_atomic_smin", VGPR_32, i32, atomic_load_min_global_32 >; defm BUFFER_ATOMIC_UMIN : MUBUF_Pseudo_Atomics < - "buffer_atomic_umin", VGPR_32, i32, atomic_umin_global + "buffer_atomic_umin", VGPR_32, i32, atomic_load_umin_global_32 >; defm BUFFER_ATOMIC_SMAX : MUBUF_Pseudo_Atomics < - "buffer_atomic_smax", VGPR_32, i32, atomic_max_global + "buffer_atomic_smax", VGPR_32, i32, atomic_load_max_global_32 >; defm BUFFER_ATOMIC_UMAX : MUBUF_Pseudo_Atomics < - "buffer_atomic_umax", VGPR_32, i32, atomic_umax_global + "buffer_atomic_umax", VGPR_32, i32, atomic_load_umax_global_32 >; defm BUFFER_ATOMIC_AND : MUBUF_Pseudo_Atomics < - "buffer_atomic_and", VGPR_32, i32, atomic_and_global + "buffer_atomic_and", VGPR_32, i32, atomic_load_and_global_32 >; defm BUFFER_ATOMIC_OR : MUBUF_Pseudo_Atomics < - "buffer_atomic_or", VGPR_32, i32, atomic_or_global + "buffer_atomic_or", VGPR_32, i32, atomic_load_or_global_32 >; defm BUFFER_ATOMIC_XOR : MUBUF_Pseudo_Atomics < - "buffer_atomic_xor", VGPR_32, i32, atomic_xor_global + "buffer_atomic_xor", VGPR_32, i32, atomic_load_xor_global_32 >; defm BUFFER_ATOMIC_INC : MUBUF_Pseudo_Atomics < - "buffer_atomic_inc", VGPR_32, i32, atomic_inc_global + "buffer_atomic_inc", VGPR_32, i32, atomic_inc_global_32 >; defm BUFFER_ATOMIC_DEC : MUBUF_Pseudo_Atomics < - "buffer_atomic_dec", VGPR_32, i32, atomic_dec_global + "buffer_atomic_dec", VGPR_32, 
i32, atomic_dec_global_32 >; defm BUFFER_ATOMIC_SWAP_X2 : MUBUF_Pseudo_Atomics < - "buffer_atomic_swap_x2", VReg_64, i64, atomic_swap_global + "buffer_atomic_swap_x2", VReg_64, i64, atomic_swap_global_64 >; defm BUFFER_ATOMIC_CMPSWAP_X2 : MUBUF_Pseudo_Atomics < "buffer_atomic_cmpswap_x2", VReg_128, v2i64, null_frag >; defm BUFFER_ATOMIC_ADD_X2 : MUBUF_Pseudo_Atomics < - "buffer_atomic_add_x2", VReg_64, i64, atomic_add_global + "buffer_atomic_add_x2", VReg_64, i64, atomic_load_add_global_64 >; defm BUFFER_ATOMIC_SUB_X2 : MUBUF_Pseudo_Atomics < - "buffer_atomic_sub_x2", VReg_64, i64, atomic_sub_global + "buffer_atomic_sub_x2", VReg_64, i64, atomic_load_sub_global_64 >; defm BUFFER_ATOMIC_SMIN_X2 : MUBUF_Pseudo_Atomics < - "buffer_atomic_smin_x2", VReg_64, i64, atomic_min_global + "buffer_atomic_smin_x2", VReg_64, i64, atomic_load_min_global_64 >; defm BUFFER_ATOMIC_UMIN_X2 : MUBUF_Pseudo_Atomics < - "buffer_atomic_umin_x2", VReg_64, i64, atomic_umin_global + "buffer_atomic_umin_x2", VReg_64, i64, atomic_load_umin_global_64 >; defm BUFFER_ATOMIC_SMAX_X2 : MUBUF_Pseudo_Atomics < - "buffer_atomic_smax_x2", VReg_64, i64, atomic_max_global + "buffer_atomic_smax_x2", VReg_64, i64, atomic_load_max_global_64 >; defm BUFFER_ATOMIC_UMAX_X2 : MUBUF_Pseudo_Atomics < - "buffer_atomic_umax_x2", VReg_64, i64, atomic_umax_global + "buffer_atomic_umax_x2", VReg_64, i64, atomic_load_umax_global_64 >; defm BUFFER_ATOMIC_AND_X2 : MUBUF_Pseudo_Atomics < - "buffer_atomic_and_x2", VReg_64, i64, atomic_and_global + "buffer_atomic_and_x2", VReg_64, i64, atomic_load_and_global_64 >; defm BUFFER_ATOMIC_OR_X2 : MUBUF_Pseudo_Atomics < - "buffer_atomic_or_x2", VReg_64, i64, atomic_or_global + "buffer_atomic_or_x2", VReg_64, i64, atomic_load_or_global_64 >; defm BUFFER_ATOMIC_XOR_X2 : MUBUF_Pseudo_Atomics < - "buffer_atomic_xor_x2", VReg_64, i64, atomic_xor_global + "buffer_atomic_xor_x2", VReg_64, i64, atomic_load_xor_global_64 >; defm BUFFER_ATOMIC_INC_X2 : MUBUF_Pseudo_Atomics < - "buffer_atomic_inc_x2", VReg_64, i64, atomic_inc_global + "buffer_atomic_inc_x2", VReg_64, i64, atomic_inc_global_64 >; defm BUFFER_ATOMIC_DEC_X2 : MUBUF_Pseudo_Atomics < - "buffer_atomic_dec_x2", VReg_64, i64, atomic_dec_global + "buffer_atomic_dec_x2", VReg_64, i64, atomic_dec_global_64 >; let SubtargetPredicate = isGFX8GFX9 in { @@ -981,58 +1010,75 @@ def BUFFER_STORE_LDS_DWORD : MUBUF_Pseudo_Store_Lds <"buffer_store_lds_dword">; let SubtargetPredicate = isGFX6 in { // isn't on CI & VI /* defm BUFFER_ATOMIC_RSUB : MUBUF_Pseudo_Atomics <"buffer_atomic_rsub">; -defm BUFFER_ATOMIC_FCMPSWAP : MUBUF_Pseudo_Atomics <"buffer_atomic_fcmpswap">; -defm BUFFER_ATOMIC_FMIN : MUBUF_Pseudo_Atomics <"buffer_atomic_fmin">; -defm BUFFER_ATOMIC_FMAX : MUBUF_Pseudo_Atomics <"buffer_atomic_fmax">; defm BUFFER_ATOMIC_RSUB_X2 : MUBUF_Pseudo_Atomics <"buffer_atomic_rsub_x2">; -defm BUFFER_ATOMIC_FCMPSWAP_X2 : MUBUF_Pseudo_Atomics <"buffer_atomic_fcmpswap_x2">; -defm BUFFER_ATOMIC_FMIN_X2 : MUBUF_Pseudo_Atomics <"buffer_atomic_fmin_x2">; -defm BUFFER_ATOMIC_FMAX_X2 : MUBUF_Pseudo_Atomics <"buffer_atomic_fmax_x2">; */ def BUFFER_WBINVL1_SC : MUBUF_Invalidate <"buffer_wbinvl1_sc", int_amdgcn_buffer_wbinvl1_sc>; } +let SubtargetPredicate = isGFX6GFX7GFX10 in { + +defm BUFFER_ATOMIC_FCMPSWAP : MUBUF_Pseudo_Atomics < + "buffer_atomic_fcmpswap", VReg_64, v2f32, null_frag +>; +defm BUFFER_ATOMIC_FMIN : MUBUF_Pseudo_Atomics < + "buffer_atomic_fmin", VGPR_32, f32, null_frag +>; +defm BUFFER_ATOMIC_FMAX : MUBUF_Pseudo_Atomics < + "buffer_atomic_fmax", VGPR_32, f32, 
null_frag +>; +defm BUFFER_ATOMIC_FCMPSWAP_X2 : MUBUF_Pseudo_Atomics < + "buffer_atomic_fcmpswap_x2", VReg_128, v2f64, null_frag +>; +defm BUFFER_ATOMIC_FMIN_X2 : MUBUF_Pseudo_Atomics < + "buffer_atomic_fmin_x2", VReg_64, f64, null_frag +>; +defm BUFFER_ATOMIC_FMAX_X2 : MUBUF_Pseudo_Atomics < + "buffer_atomic_fmax_x2", VReg_64, f64, null_frag +>; + +} + let SubtargetPredicate = HasD16LoadStore in { defm BUFFER_LOAD_UBYTE_D16 : MUBUF_Pseudo_Loads < - "buffer_load_ubyte_d16", VGPR_32, i32, null_frag, 1 + "buffer_load_ubyte_d16", i32, null_frag, 1 >; defm BUFFER_LOAD_UBYTE_D16_HI : MUBUF_Pseudo_Loads < - "buffer_load_ubyte_d16_hi", VGPR_32, i32, null_frag, 1 + "buffer_load_ubyte_d16_hi", i32, null_frag, 1 >; defm BUFFER_LOAD_SBYTE_D16 : MUBUF_Pseudo_Loads < - "buffer_load_sbyte_d16", VGPR_32, i32, null_frag, 1 + "buffer_load_sbyte_d16", i32, null_frag, 1 >; defm BUFFER_LOAD_SBYTE_D16_HI : MUBUF_Pseudo_Loads < - "buffer_load_sbyte_d16_hi", VGPR_32, i32, null_frag, 1 + "buffer_load_sbyte_d16_hi", i32, null_frag, 1 >; defm BUFFER_LOAD_SHORT_D16 : MUBUF_Pseudo_Loads < - "buffer_load_short_d16", VGPR_32, i32, null_frag, 1 + "buffer_load_short_d16", i32, null_frag, 1 >; defm BUFFER_LOAD_SHORT_D16_HI : MUBUF_Pseudo_Loads < - "buffer_load_short_d16_hi", VGPR_32, i32, null_frag, 1 + "buffer_load_short_d16_hi", i32, null_frag, 1 >; defm BUFFER_STORE_BYTE_D16_HI : MUBUF_Pseudo_Stores < - "buffer_store_byte_d16_hi", VGPR_32, i32 + "buffer_store_byte_d16_hi", i32 >; defm BUFFER_STORE_SHORT_D16_HI : MUBUF_Pseudo_Stores < - "buffer_store_short_d16_hi", VGPR_32, i32 + "buffer_store_short_d16_hi", i32 >; defm BUFFER_LOAD_FORMAT_D16_HI_X : MUBUF_Pseudo_Loads < - "buffer_load_format_d16_hi_x", VGPR_32 + "buffer_load_format_d16_hi_x", i32 >; defm BUFFER_STORE_FORMAT_D16_HI_X : MUBUF_Pseudo_Stores < - "buffer_store_format_d16_hi_x", VGPR_32 + "buffer_store_format_d16_hi_x", i32 >; } // End HasD16LoadStore @@ -1043,10 +1089,10 @@ def BUFFER_WBINVL1 : MUBUF_Invalidate <"buffer_wbinvl1", let SubtargetPredicate = HasAtomicFaddInsts in { defm BUFFER_ATOMIC_ADD_F32 : MUBUF_Pseudo_Atomics_NO_RTN < - "buffer_atomic_add_f32", VGPR_32, f32, atomic_add_global + "buffer_atomic_add_f32", VGPR_32, f32, atomic_fadd_global_noret >; defm BUFFER_ATOMIC_PK_ADD_F16 : MUBUF_Pseudo_Atomics_NO_RTN < - "buffer_atomic_pk_add_f16", VGPR_32, v2f16, atomic_add_global + "buffer_atomic_pk_add_f16", VGPR_32, v2f16, atomic_pk_fadd_global_noret >; } // End SubtargetPredicate = HasAtomicFaddInsts @@ -1055,35 +1101,35 @@ defm BUFFER_ATOMIC_PK_ADD_F16 : MUBUF_Pseudo_Atomics_NO_RTN < // MTBUF Instructions //===----------------------------------------------------------------------===// -defm TBUFFER_LOAD_FORMAT_X : MTBUF_Pseudo_Loads <"tbuffer_load_format_x", VGPR_32>; -defm TBUFFER_LOAD_FORMAT_XY : MTBUF_Pseudo_Loads <"tbuffer_load_format_xy", VReg_64>; -defm TBUFFER_LOAD_FORMAT_XYZ : MTBUF_Pseudo_Loads <"tbuffer_load_format_xyz", VReg_96>; -defm TBUFFER_LOAD_FORMAT_XYZW : MTBUF_Pseudo_Loads <"tbuffer_load_format_xyzw", VReg_128>; -defm TBUFFER_STORE_FORMAT_X : MTBUF_Pseudo_Stores <"tbuffer_store_format_x", VGPR_32>; -defm TBUFFER_STORE_FORMAT_XY : MTBUF_Pseudo_Stores <"tbuffer_store_format_xy", VReg_64>; -defm TBUFFER_STORE_FORMAT_XYZ : MTBUF_Pseudo_Stores <"tbuffer_store_format_xyz", VReg_96>; -defm TBUFFER_STORE_FORMAT_XYZW : MTBUF_Pseudo_Stores <"tbuffer_store_format_xyzw", VReg_128>; +defm TBUFFER_LOAD_FORMAT_X : MTBUF_Pseudo_Loads <"tbuffer_load_format_x", VGPR_32, 1>; +defm TBUFFER_LOAD_FORMAT_XY : MTBUF_Pseudo_Loads 
<"tbuffer_load_format_xy", VReg_64, 2>; +defm TBUFFER_LOAD_FORMAT_XYZ : MTBUF_Pseudo_Loads <"tbuffer_load_format_xyz", VReg_96, 3>; +defm TBUFFER_LOAD_FORMAT_XYZW : MTBUF_Pseudo_Loads <"tbuffer_load_format_xyzw", VReg_128, 4>; +defm TBUFFER_STORE_FORMAT_X : MTBUF_Pseudo_Stores <"tbuffer_store_format_x", VGPR_32, 1>; +defm TBUFFER_STORE_FORMAT_XY : MTBUF_Pseudo_Stores <"tbuffer_store_format_xy", VReg_64, 2>; +defm TBUFFER_STORE_FORMAT_XYZ : MTBUF_Pseudo_Stores <"tbuffer_store_format_xyz", VReg_96, 3>; +defm TBUFFER_STORE_FORMAT_XYZW : MTBUF_Pseudo_Stores <"tbuffer_store_format_xyzw", VReg_128, 4>; let SubtargetPredicate = HasUnpackedD16VMem, D16Buf = 1 in { - defm TBUFFER_LOAD_FORMAT_D16_X_gfx80 : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_x", VGPR_32>; - defm TBUFFER_LOAD_FORMAT_D16_XY_gfx80 : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xy", VReg_64>; - defm TBUFFER_LOAD_FORMAT_D16_XYZ_gfx80 : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xyz", VReg_96>; - defm TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80 : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xyzw", VReg_128>; - defm TBUFFER_STORE_FORMAT_D16_X_gfx80 : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_x", VGPR_32>; - defm TBUFFER_STORE_FORMAT_D16_XY_gfx80 : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xy", VReg_64>; - defm TBUFFER_STORE_FORMAT_D16_XYZ_gfx80 : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xyz", VReg_96>; - defm TBUFFER_STORE_FORMAT_D16_XYZW_gfx80 : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xyzw", VReg_128>; + defm TBUFFER_LOAD_FORMAT_D16_X_gfx80 : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_x", VGPR_32, 1>; + defm TBUFFER_LOAD_FORMAT_D16_XY_gfx80 : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xy", VReg_64, 2>; + defm TBUFFER_LOAD_FORMAT_D16_XYZ_gfx80 : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xyz", VReg_96, 3>; + defm TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80 : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xyzw", VReg_128, 4>; + defm TBUFFER_STORE_FORMAT_D16_X_gfx80 : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_x", VGPR_32, 1>; + defm TBUFFER_STORE_FORMAT_D16_XY_gfx80 : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xy", VReg_64, 2>; + defm TBUFFER_STORE_FORMAT_D16_XYZ_gfx80 : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xyz", VReg_96, 3>; + defm TBUFFER_STORE_FORMAT_D16_XYZW_gfx80 : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xyzw", VReg_128, 4>; } // End HasUnpackedD16VMem. 
let SubtargetPredicate = HasPackedD16VMem, D16Buf = 1 in { - defm TBUFFER_LOAD_FORMAT_D16_X : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_x", VGPR_32>; - defm TBUFFER_LOAD_FORMAT_D16_XY : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xy", VGPR_32>; - defm TBUFFER_LOAD_FORMAT_D16_XYZ : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xyz", VReg_64>; - defm TBUFFER_LOAD_FORMAT_D16_XYZW : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xyzw", VReg_64>; - defm TBUFFER_STORE_FORMAT_D16_X : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_x", VGPR_32>; - defm TBUFFER_STORE_FORMAT_D16_XY : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xy", VGPR_32>; - defm TBUFFER_STORE_FORMAT_D16_XYZ : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xyz", VReg_64>; - defm TBUFFER_STORE_FORMAT_D16_XYZW : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xyzw", VReg_64>; + defm TBUFFER_LOAD_FORMAT_D16_X : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_x", VGPR_32, 1>; + defm TBUFFER_LOAD_FORMAT_D16_XY : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xy", VGPR_32, 2>; + defm TBUFFER_LOAD_FORMAT_D16_XYZ : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xyz", VReg_64, 3>; + defm TBUFFER_LOAD_FORMAT_D16_XYZW : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xyzw", VReg_64, 4>; + defm TBUFFER_STORE_FORMAT_D16_X : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_x", VGPR_32, 1>; + defm TBUFFER_STORE_FORMAT_D16_XY : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xy", VGPR_32, 2>; + defm TBUFFER_STORE_FORMAT_D16_XYZ : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xyz", VReg_64, 3>; + defm TBUFFER_STORE_FORMAT_D16_XYZW : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xyzw", VReg_64, 4>; } // End HasPackedD16VMem. let SubtargetPredicate = isGFX7Plus in { @@ -1118,6 +1164,10 @@ def extract_dlc : SDNodeXFormgetTargetConstant((N->getZExtValue() >> 2) & 1, SDLoc(N), MVT::i8); }]>; +def extract_swz : SDNodeXFormgetTargetConstant((N->getZExtValue() >> 3) & 1, SDLoc(N), MVT::i8); +}]>; + //===----------------------------------------------------------------------===// // buffer_load/store_format patterns //===----------------------------------------------------------------------===// @@ -1125,33 +1175,37 @@ def extract_dlc : SDNodeXForm { def : GCNPat< - (vt (name v4i32:$rsrc, 0, 0, i32:$soffset, imm:$offset, - imm:$cachepolicy, 0)), + (vt (name v4i32:$rsrc, 0, 0, i32:$soffset, timm:$offset, + timm:$auxiliary, 0)), (!cast(opcode # _OFFSET) $rsrc, $soffset, (as_i16imm $offset), - (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy)) + (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary), + (extract_swz $auxiliary)) >; def : GCNPat< - (vt (name v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, imm:$offset, - imm:$cachepolicy, 0)), + (vt (name v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, timm:$offset, + timm:$auxiliary, 0)), (!cast(opcode # _OFFEN) $voffset, $rsrc, $soffset, (as_i16imm $offset), - (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy)) + (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary), + (extract_swz $auxiliary)) >; def : GCNPat< - (vt (name v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, imm:$offset, - imm:$cachepolicy, imm)), + (vt (name v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, timm:$offset, + timm:$auxiliary, timm)), (!cast(opcode # _IDXEN) $vindex, $rsrc, $soffset, (as_i16imm $offset), - (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy)) + (extract_glc $auxiliary), (extract_slc $auxiliary), 
0, (extract_dlc $auxiliary), + (extract_swz $auxiliary)) >; def : GCNPat< - (vt (name v4i32:$rsrc, i32:$vindex, i32:$voffset, i32:$soffset, imm:$offset, - imm:$cachepolicy, imm)), + (vt (name v4i32:$rsrc, i32:$vindex, i32:$voffset, i32:$soffset, timm:$offset, + timm:$auxiliary, timm)), (!cast(opcode # _BOTHEN) (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1), $rsrc, $soffset, (as_i16imm $offset), - (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy)) + (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary), + (extract_swz $auxiliary)) >; } @@ -1182,8 +1236,12 @@ let SubtargetPredicate = HasPackedD16VMem in { defm : MUBUF_LoadIntrinsicPat; defm : MUBUF_LoadIntrinsicPat; +defm : MUBUF_LoadIntrinsicPat; +defm : MUBUF_LoadIntrinsicPat; defm : MUBUF_LoadIntrinsicPat; defm : MUBUF_LoadIntrinsicPat; +defm : MUBUF_LoadIntrinsicPat; +defm : MUBUF_LoadIntrinsicPat; defm : MUBUF_LoadIntrinsicPat; defm : MUBUF_LoadIntrinsicPat; defm : MUBUF_LoadIntrinsicPat; @@ -1196,36 +1254,40 @@ defm : MUBUF_LoadIntrinsicPat; multiclass MUBUF_StoreIntrinsicPat { def : GCNPat< - (name vt:$vdata, v4i32:$rsrc, 0, 0, i32:$soffset, imm:$offset, - imm:$cachepolicy, 0), + (name vt:$vdata, v4i32:$rsrc, 0, 0, i32:$soffset, timm:$offset, + timm:$auxiliary, 0), (!cast(opcode # _OFFSET_exact) $vdata, $rsrc, $soffset, (as_i16imm $offset), - (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy)) + (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary), + (extract_swz $auxiliary)) >; def : GCNPat< - (name vt:$vdata, v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, imm:$offset, - imm:$cachepolicy, 0), + (name vt:$vdata, v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, timm:$offset, + timm:$auxiliary, 0), (!cast(opcode # _OFFEN_exact) $vdata, $voffset, $rsrc, $soffset, - (as_i16imm $offset), (extract_glc $cachepolicy), - (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy)) + (as_i16imm $offset), (extract_glc $auxiliary), + (extract_slc $auxiliary), 0, (extract_dlc $auxiliary), + (extract_swz $auxiliary)) >; def : GCNPat< - (name vt:$vdata, v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, imm:$offset, - imm:$cachepolicy, imm), + (name vt:$vdata, v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, timm:$offset, + timm:$auxiliary, timm), (!cast(opcode # _IDXEN_exact) $vdata, $vindex, $rsrc, $soffset, - (as_i16imm $offset), (extract_glc $cachepolicy), - (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy)) + (as_i16imm $offset), (extract_glc $auxiliary), + (extract_slc $auxiliary), 0, (extract_dlc $auxiliary), + (extract_swz $auxiliary)) >; def : GCNPat< - (name vt:$vdata, v4i32:$rsrc, i32:$vindex, i32:$voffset, i32:$soffset, imm:$offset, - imm:$cachepolicy, imm), + (name vt:$vdata, v4i32:$rsrc, i32:$vindex, i32:$voffset, i32:$soffset, timm:$offset, + timm:$auxiliary, timm), (!cast(opcode # _BOTHEN_exact) $vdata, (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1), - $rsrc, $soffset, (as_i16imm $offset), (extract_glc $cachepolicy), - (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy)) + $rsrc, $soffset, (as_i16imm $offset), (extract_glc $auxiliary), + (extract_slc $auxiliary), 0, (extract_dlc $auxiliary), + (extract_swz $auxiliary)) >; } @@ -1256,8 +1318,12 @@ let SubtargetPredicate = HasPackedD16VMem in { defm : MUBUF_StoreIntrinsicPat; defm : MUBUF_StoreIntrinsicPat; +defm : MUBUF_StoreIntrinsicPat; +defm : MUBUF_StoreIntrinsicPat; defm : MUBUF_StoreIntrinsicPat; defm : MUBUF_StoreIntrinsicPat; +defm : 
MUBUF_StoreIntrinsicPat; +defm : MUBUF_StoreIntrinsicPat; defm : MUBUF_StoreIntrinsicPat; defm : MUBUF_StoreIntrinsicPat; defm : MUBUF_StoreIntrinsicPat; @@ -1273,32 +1339,32 @@ multiclass BufferAtomicPatterns { def : GCNPat< (vt (name vt:$vdata_in, v4i32:$rsrc, 0, - 0, i32:$soffset, imm:$offset, - imm:$cachepolicy, 0)), + 0, i32:$soffset, timm:$offset, + timm:$cachepolicy, 0)), (!cast(opcode # _OFFSET_RTN) $vdata_in, $rsrc, $soffset, (as_i16imm $offset), (extract_slc $cachepolicy)) >; def : GCNPat< (vt (name vt:$vdata_in, v4i32:$rsrc, i32:$vindex, - 0, i32:$soffset, imm:$offset, - imm:$cachepolicy, imm)), + 0, i32:$soffset, timm:$offset, + timm:$cachepolicy, timm)), (!cast(opcode # _IDXEN_RTN) $vdata_in, $vindex, $rsrc, $soffset, (as_i16imm $offset), (extract_slc $cachepolicy)) >; def : GCNPat< (vt (name vt:$vdata_in, v4i32:$rsrc, 0, - i32:$voffset, i32:$soffset, imm:$offset, - imm:$cachepolicy, 0)), + i32:$voffset, i32:$soffset, timm:$offset, + timm:$cachepolicy, 0)), (!cast(opcode # _OFFEN_RTN) $vdata_in, $voffset, $rsrc, $soffset, (as_i16imm $offset), (extract_slc $cachepolicy)) >; def : GCNPat< (vt (name vt:$vdata_in, v4i32:$rsrc, i32:$vindex, - i32:$voffset, i32:$soffset, imm:$offset, - imm:$cachepolicy, imm)), + i32:$voffset, i32:$soffset, timm:$offset, + timm:$cachepolicy, timm)), (!cast(opcode # _BOTHEN_RTN) $vdata_in, (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1), @@ -1316,6 +1382,8 @@ defm : BufferAtomicPatterns; defm : BufferAtomicPatterns; defm : BufferAtomicPatterns; defm : BufferAtomicPatterns; +defm : BufferAtomicPatterns; +defm : BufferAtomicPatterns; defm : BufferAtomicPatterns; defm : BufferAtomicPatterns; defm : BufferAtomicPatterns; @@ -1326,37 +1394,39 @@ defm : BufferAtomicPatterns; defm : BufferAtomicPatterns; defm : BufferAtomicPatterns; defm : BufferAtomicPatterns; +defm : BufferAtomicPatterns; +defm : BufferAtomicPatterns; multiclass BufferAtomicPatterns_NO_RTN { def : GCNPat< (name vt:$vdata_in, v4i32:$rsrc, 0, - 0, i32:$soffset, imm:$offset, - imm:$cachepolicy, 0), + 0, i32:$soffset, timm:$offset, + timm:$cachepolicy, 0), (!cast(opcode # _OFFSET) $vdata_in, $rsrc, $soffset, (as_i16imm $offset), (extract_slc $cachepolicy)) >; def : GCNPat< (name vt:$vdata_in, v4i32:$rsrc, i32:$vindex, - 0, i32:$soffset, imm:$offset, - imm:$cachepolicy, imm), + 0, i32:$soffset, timm:$offset, + timm:$cachepolicy, timm), (!cast(opcode # _IDXEN) $vdata_in, $vindex, $rsrc, $soffset, (as_i16imm $offset), (extract_slc $cachepolicy)) >; def : GCNPat< (name vt:$vdata_in, v4i32:$rsrc, 0, - i32:$voffset, i32:$soffset, imm:$offset, - imm:$cachepolicy, 0), + i32:$voffset, i32:$soffset, timm:$offset, + timm:$cachepolicy, 0), (!cast(opcode # _OFFEN) $vdata_in, $voffset, $rsrc, $soffset, (as_i16imm $offset), (extract_slc $cachepolicy)) >; def : GCNPat< (name vt:$vdata_in, v4i32:$rsrc, i32:$vindex, - i32:$voffset, i32:$soffset, imm:$offset, - imm:$cachepolicy, imm), + i32:$voffset, i32:$soffset, timm:$offset, + timm:$cachepolicy, timm), (!cast(opcode # _BOTHEN) $vdata_in, (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1), @@ -1370,8 +1440,8 @@ defm : BufferAtomicPatterns_NO_RTN : GCNPat < (vt (constant_ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, - i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc))), - (Instr_ADDR64 $vaddr, $srsrc, $soffset, $offset, $glc, $slc, $tfe, $dlc) + i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc, i1:$swz))), + (Instr_ADDR64 $vaddr, $srsrc, $soffset, $offset, $glc, $slc, $tfe, $dlc, $swz) >; multiclass MUBUFLoad_Atomic_Pattern ; def : GCNPat < (vt 
(atomic_ld (MUBUFOffsetNoGLC v4i32:$rsrc, i32:$soffset, i16:$offset))), - (Instr_OFFSET $rsrc, $soffset, (as_i16imm $offset), 0, 0, 0, 0) + (Instr_OFFSET $rsrc, $soffset, (as_i16imm $offset), 0, 0, 0, 0, 0) >; } @@ -1454,8 +1524,8 @@ multiclass MUBUFLoad_Pattern ; } @@ -1478,12 +1548,12 @@ multiclass MUBUFScratchLoadPat ; def : GCNPat < (vt (ld (MUBUFScratchOffset v4i32:$srsrc, i32:$soffset, u16imm:$offset))), - (InstrOffset $srsrc, $soffset, $offset, 0, 0, 0, 0) + (InstrOffset $srsrc, $soffset, $offset, 0, 0, 0, 0, 0) >; } @@ -1493,12 +1563,12 @@ multiclass MUBUFScratchLoadPat_D16 { def : GCNPat < (ld_frag (MUBUFScratchOffen v4i32:$srsrc, i32:$vaddr, i32:$soffset, u16imm:$offset), vt:$in), - (InstrOffen $vaddr, $srsrc, $soffset, $offset, 0, 0, 0, 0, $in) + (InstrOffen $vaddr, $srsrc, $soffset, $offset, 0, 0, 0, 0, 0, $in) >; def : GCNPat < (ld_frag (MUBUFScratchOffset v4i32:$srsrc, i32:$soffset, u16imm:$offset), vt:$in), - (InstrOffset $srsrc, $soffset, $offset, 0, 0, 0, 0, $in) + (InstrOffset $srsrc, $soffset, $offset, 0, 0, 0, 0, 0, $in) >; } @@ -1512,7 +1582,10 @@ defm : MUBUFScratchLoadPat ; defm : MUBUFScratchLoadPat ; defm : MUBUFScratchLoadPat ; + +foreach vt = Reg32Types.types in { defm : MUBUFScratchLoadPat ; +} defm : MUBUFScratchLoadPat ; defm : MUBUFScratchLoadPat ; defm : MUBUFScratchLoadPat ; @@ -1535,16 +1608,16 @@ defm : MUBUFScratchLoadPat_D16 { - // Store follows atomic op convention so address is forst + // Store follows atomic op convention so address is first def : GCNPat < (atomic_st (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i16:$offset, i1:$slc), vt:$val), - (Instr_ADDR64 $val, $vaddr, $srsrc, $soffset, $offset, 0, $slc, 0, 0) + (Instr_ADDR64 $val, $vaddr, $srsrc, $soffset, $offset, 0, $slc, 0, 0, 0) >; def : GCNPat < (atomic_st (MUBUFOffsetNoGLC v4i32:$rsrc, i32:$soffset, i16:$offset), vt:$val), - (Instr_OFFSET $val, $rsrc, $soffset, (as_i16imm $offset), 0, 0, 0, 0) + (Instr_OFFSET $val, $rsrc, $soffset, (as_i16imm $offset), 0, 0, 0, 0, 0) >; } let SubtargetPredicate = isGFX6GFX7 in { @@ -1558,8 +1631,8 @@ multiclass MUBUFStore_Pattern ; } @@ -1573,13 +1646,13 @@ multiclass MUBUFScratchStorePat ; def : GCNPat < (st vt:$value, (MUBUFScratchOffset v4i32:$srsrc, i32:$soffset, u16imm:$offset)), - (InstrOffset rc:$value, $srsrc, $soffset, $offset, 0, 0, 0, 0) + (InstrOffset rc:$value, $srsrc, $soffset, $offset, 0, 0, 0, 0, 0) >; } @@ -1587,7 +1660,11 @@ defm : MUBUFScratchStorePat ; defm : MUBUFScratchStorePat ; defm : MUBUFScratchStorePat ; -defm : MUBUFScratchStorePat ; + +foreach vt = Reg32Types.types in { +defm : MUBUFScratchStorePat ; +} + defm : MUBUFScratchStorePat ; defm : MUBUFScratchStorePat ; defm : MUBUFScratchStorePat ; @@ -1613,37 +1690,41 @@ defm : MUBUFScratchStorePat { def : GCNPat< - (vt (name v4i32:$rsrc, 0, 0, i32:$soffset, imm:$offset, - imm:$format, imm:$cachepolicy, 0)), + (vt (name v4i32:$rsrc, 0, 0, i32:$soffset, timm:$offset, + timm:$format, timm:$auxiliary, 0)), (!cast(opcode # _OFFSET) $rsrc, $soffset, (as_i16imm $offset), (as_i8imm $format), - (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy)) + (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary), + (extract_swz $auxiliary)) >; def : GCNPat< - (vt (name v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, imm:$offset, - imm:$format, imm:$cachepolicy, imm)), + (vt (name v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, timm:$offset, + timm:$format, timm:$auxiliary, timm)), (!cast(opcode # _IDXEN) $vindex, $rsrc, $soffset, (as_i16imm 
$offset), (as_i8imm $format), - (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy)) + (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary), + (extract_swz $auxiliary)) >; def : GCNPat< - (vt (name v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, imm:$offset, - imm:$format, imm:$cachepolicy, 0)), + (vt (name v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, timm:$offset, + timm:$format, timm:$auxiliary, 0)), (!cast(opcode # _OFFEN) $voffset, $rsrc, $soffset, (as_i16imm $offset), (as_i8imm $format), - (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy)) + (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary), + (extract_swz $auxiliary)) >; def : GCNPat< - (vt (name v4i32:$rsrc, i32:$vindex, i32:$voffset, i32:$soffset, imm:$offset, - imm:$format, imm:$cachepolicy, imm)), + (vt (name v4i32:$rsrc, i32:$vindex, i32:$voffset, i32:$soffset, timm:$offset, + timm:$format, timm:$auxiliary, timm)), (!cast(opcode # _BOTHEN) (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1), $rsrc, $soffset, (as_i16imm $offset), (as_i8imm $format), - (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy)) + (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary), + (extract_swz $auxiliary)) >; } @@ -1671,37 +1752,41 @@ let SubtargetPredicate = HasPackedD16VMem in { multiclass MTBUF_StoreIntrinsicPat { def : GCNPat< - (name vt:$vdata, v4i32:$rsrc, 0, 0, i32:$soffset, imm:$offset, - imm:$format, imm:$cachepolicy, 0), + (name vt:$vdata, v4i32:$rsrc, 0, 0, i32:$soffset, timm:$offset, + timm:$format, timm:$auxiliary, 0), (!cast(opcode # _OFFSET_exact) $vdata, $rsrc, $soffset, (as_i16imm $offset), (as_i8imm $format), - (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy)) + (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary), + (extract_swz $auxiliary)) >; def : GCNPat< - (name vt:$vdata, v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, imm:$offset, - imm:$format, imm:$cachepolicy, imm), + (name vt:$vdata, v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, timm:$offset, + timm:$format, timm:$auxiliary, timm), (!cast(opcode # _IDXEN_exact) $vdata, $vindex, $rsrc, $soffset, (as_i16imm $offset), (as_i8imm $format), - (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy)) + (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary), + (extract_swz $auxiliary)) >; def : GCNPat< - (name vt:$vdata, v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, imm:$offset, - imm:$format, imm:$cachepolicy, 0), + (name vt:$vdata, v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, timm:$offset, + timm:$format, timm:$auxiliary, 0), (!cast(opcode # _OFFEN_exact) $vdata, $voffset, $rsrc, $soffset, (as_i16imm $offset), (as_i8imm $format), - (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy)) + (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary), + (extract_swz $auxiliary)) >; def : GCNPat< (name vt:$vdata, v4i32:$rsrc, i32:$vindex, i32:$voffset, i32:$soffset, - imm:$offset, imm:$format, imm:$cachepolicy, imm), + timm:$offset, timm:$format, timm:$auxiliary, timm), (!cast(opcode # _BOTHEN_exact) $vdata, (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1), $rsrc, $soffset, (as_i16imm $offset), (as_i8imm $format), - (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy)) + (extract_glc $auxiliary), 
(extract_slc $auxiliary), 0, (extract_dlc $auxiliary), + (extract_swz $auxiliary)) >; } @@ -1957,10 +2042,9 @@ defm BUFFER_ATOMIC_OR : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x03a>; defm BUFFER_ATOMIC_XOR : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x03b>; defm BUFFER_ATOMIC_INC : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x03c>; defm BUFFER_ATOMIC_DEC : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x03d>; -// FIXME-GFX6-GFX7-GFX10: Add following instructions: -//defm BUFFER_ATOMIC_FCMPSWAP : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x03e>; -//defm BUFFER_ATOMIC_FMIN : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x03f>; -//defm BUFFER_ATOMIC_FMAX : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x040>; +defm BUFFER_ATOMIC_FCMPSWAP : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x03e>; +defm BUFFER_ATOMIC_FMIN : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x03f>; +defm BUFFER_ATOMIC_FMAX : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x040>; defm BUFFER_ATOMIC_SWAP_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x050>; defm BUFFER_ATOMIC_CMPSWAP_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x051>; defm BUFFER_ATOMIC_ADD_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x052>; @@ -1975,10 +2059,9 @@ defm BUFFER_ATOMIC_XOR_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x05b>; defm BUFFER_ATOMIC_INC_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x05c>; defm BUFFER_ATOMIC_DEC_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x05d>; // FIXME-GFX7: Need to handle hazard for BUFFER_ATOMIC_FCMPSWAP_X2 on GFX7. -// FIXME-GFX6-GFX7-GFX10: Add following instructions: -//defm BUFFER_ATOMIC_FCMPSWAP_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x05e>; -//defm BUFFER_ATOMIC_FMIN_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x05f>; -//defm BUFFER_ATOMIC_FMAX_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x060>; +defm BUFFER_ATOMIC_FCMPSWAP_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x05e>; +defm BUFFER_ATOMIC_FMIN_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x05f>; +defm BUFFER_ATOMIC_FMAX_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x060>; defm BUFFER_WBINVL1_SC : MUBUF_Real_gfx6<0x070>; defm BUFFER_WBINVL1_VOL : MUBUF_Real_gfx7<0x070>; @@ -2353,7 +2436,7 @@ let SubtargetPredicate = HasPackedD16VMem in { def MUBUFInfoTable : GenericTable { let FilterClass = "MUBUF_Pseudo"; let CppTypeName = "MUBUFInfo"; - let Fields = ["Opcode", "BaseOpcode", "dwords", "has_vaddr", "has_srsrc", "has_soffset"]; + let Fields = ["Opcode", "BaseOpcode", "elements", "has_vaddr", "has_srsrc", "has_soffset"]; let PrimaryKey = ["Opcode"]; let PrimaryKeyName = "getMUBUFOpcodeHelper"; @@ -2364,7 +2447,26 @@ def getMUBUFInfoFromOpcode : SearchIndex { let Key = ["Opcode"]; } -def getMUBUFInfoFromBaseOpcodeAndDwords : SearchIndex { +def getMUBUFInfoFromBaseOpcodeAndElements : SearchIndex { let Table = MUBUFInfoTable; - let Key = ["BaseOpcode", "dwords"]; + let Key = ["BaseOpcode", "elements"]; +} + +def MTBUFInfoTable : GenericTable { + let FilterClass = "MTBUF_Pseudo"; + let CppTypeName = "MTBUFInfo"; + let Fields = ["Opcode", "BaseOpcode", "elements", "has_vaddr", "has_srsrc", "has_soffset"]; + + let PrimaryKey = ["Opcode"]; + let PrimaryKeyName = "getMTBUFOpcodeHelper"; +} + +def getMTBUFInfoFromOpcode : SearchIndex { + let Table = MTBUFInfoTable; + let Key = ["Opcode"]; +} + +def getMTBUFInfoFromBaseOpcodeAndElements : SearchIndex { + let Table = MTBUFInfoTable; + let Key = ["BaseOpcode", "elements"]; } diff --git a/lib/Target/AMDGPU/DSInstructions.td b/lib/Target/AMDGPU/DSInstructions.td index c52eaaa3fdc5..816ec14a0e98 100644 --- a/lib/Target/AMDGPU/DSInstructions.td +++ b/lib/Target/AMDGPU/DSInstructions.td @@ -81,6 +81,17 @@ class DS_Real : // DS Pseudo instructions +class 
DS_0A1D_NORET +: DS_Pseudo { + + let has_addr = 0; + let has_data1 = 0; + let has_vdst = 0; +} + class DS_1A1D_NORET : DS_Pseudo class DS_GWS_0D : DS_GWS; + (ins offset:$offset, gds:$gds), "$offset gds"> { + let hasSideEffects = 1; +} class DS_GWS_1D : DS_GWS { let has_gws_data0 = 1; + let hasSideEffects = 1; } class DS_VOID : DS_Pseudo; def DS_WRITE_B16_D16_HI : DS_1A1D_NORET<"ds_write_b16_d16_hi">; } +} // End has_m0_read = 0 + let SubtargetPredicate = HasDSAddTid in { -def DS_WRITE_ADDTID_B32 : DS_1A1D_NORET<"ds_write_addtid_b32">; +def DS_WRITE_ADDTID_B32 : DS_0A1D_NORET<"ds_write_addtid_b32">; } -} // End has_m0_read = 0 } // End mayLoad = 0 defm DS_MSKOR_B32 : DS_1A2D_NORET_mc<"ds_mskor_b32">; @@ -540,13 +555,14 @@ def DS_READ_I8_D16_HI : DS_1A_RET_Tied<"ds_read_i8_d16_hi">; def DS_READ_U16_D16 : DS_1A_RET_Tied<"ds_read_u16_d16">; def DS_READ_U16_D16_HI : DS_1A_RET_Tied<"ds_read_u16_d16_hi">; } +} // End has_m0_read = 0 let SubtargetPredicate = HasDSAddTid in { -def DS_READ_ADDTID_B32 : DS_1A_RET<"ds_read_addtid_b32">; -} -} // End has_m0_read = 0 +def DS_READ_ADDTID_B32 : DS_0A_RET<"ds_read_addtid_b32">; } +} // End mayStore = 0 + def DS_CONSUME : DS_0A_RET<"ds_consume">; def DS_APPEND : DS_0A_RET<"ds_append">; def DS_ORDERED_COUNT : DS_1A_RET_GDS<"ds_ordered_count">; @@ -600,13 +616,13 @@ def DS_ADD_SRC2_F32 : DS_1A<"ds_add_src2_f32">; //===----------------------------------------------------------------------===// def : GCNPat < - (int_amdgcn_ds_swizzle i32:$src, imm:$offset16), + (int_amdgcn_ds_swizzle i32:$src, timm:$offset16), (DS_SWIZZLE_B32 $src, (as_i16imm $offset16), (i1 0)) >; class DSReadPat : GCNPat < - (vt (frag (DS1Addr1Offset i32:$ptr, i32:$offset))), - (inst $ptr, (as_i16imm $offset), (i1 gds)) + (vt (frag (DS1Addr1Offset i32:$ptr, i16:$offset))), + (inst $ptr, offset:$offset, (i1 gds)) >; multiclass DSReadPat_mc { @@ -621,8 +637,8 @@ multiclass DSReadPat_mc { } class DSReadPat_D16 : GCNPat < - (frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$in), - (inst $ptr, (as_i16imm $offset), (i1 0), $in) + (frag (DS1Addr1Offset i32:$ptr, i16:$offset), vt:$in), + (inst $ptr, offset:$offset, (i1 0), $in) >; defm : DSReadPat_mc ; @@ -636,13 +652,20 @@ defm : DSReadPat_mc ; defm : DSReadPat_mc ; defm : DSReadPat_mc ; defm : DSReadPat_mc ; -defm : DSReadPat_mc ; + +foreach vt = Reg32Types.types in { +defm : DSReadPat_mc ; +} + defm : DSReadPat_mc ; defm : DSReadPat_mc ; let AddedComplexity = 100 in { -defm : DSReadPat_mc ; +foreach vt = VReg_64.RegTypes in { +defm : DSReadPat_mc ; +} + defm : DSReadPat_mc ; } // End AddedComplexity = 100 @@ -664,8 +687,8 @@ def : DSReadPat_D16; } class DSWritePat : GCNPat < - (frag vt:$value, (DS1Addr1Offset i32:$ptr, i32:$offset)), - (inst $ptr, $value, (as_i16imm $offset), (i1 gds)) + (frag vt:$value, (DS1Addr1Offset i32:$ptr, i16:$offset)), + (inst $ptr, getVregSrcForVT.ret:$value, offset:$offset, (i1 gds)) >; multiclass DSWritePat_mc { @@ -681,8 +704,8 @@ multiclass DSWritePat_mc { // Irritatingly, atomic_store reverses the order of operands from a // normal store. 
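// Aside on the extract_glc/extract_slc/extract_dlc/extract_swz transforms used by the
// buffer intrinsic patterns earlier in this patch: each one peels a single bit out of
// the combined "auxiliary" immediate. The shifts for dlc (bit 2) and the newly added
// swz (bit 3) are taken directly from the SDNodeXForm bodies above; treating glc and
// slc as bits 0 and 1 is an assumption made only for this illustration, not something
// this patch states. A minimal stand-alone sketch of that bit layout:
#include <cstdint>

constexpr uint32_t extractGLC(uint32_t Aux) { return Aux & 1; }        // assumed bit 0
constexpr uint32_t extractSLC(uint32_t Aux) { return (Aux >> 1) & 1; } // assumed bit 1
constexpr uint32_t extractDLC(uint32_t Aux) { return (Aux >> 2) & 1; } // bit 2, per extract_dlc
constexpr uint32_t extractSWZ(uint32_t Aux) { return (Aux >> 3) & 1; } // bit 3, per extract_swz

static_assert(extractSWZ(0x8) == 1 && extractDLC(0x8) == 0, "swz is the new bit-3 field");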
class DSAtomicWritePat : GCNPat < - (frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$value), - (inst $ptr, $value, (as_i16imm $offset), (i1 0)) + (frag (DS1Addr1Offset i32:$ptr, i16:$offset), vt:$value), + (inst $ptr, $value, offset:$offset, (i1 0)) >; multiclass DSAtomicWritePat_mc { @@ -699,9 +722,13 @@ defm : DSWritePat_mc ; defm : DSWritePat_mc ; defm : DSWritePat_mc ; defm : DSWritePat_mc ; -defm : DSWritePat_mc ; -defm : DSAtomicWritePat_mc ; -defm : DSAtomicWritePat_mc ; + +foreach vt = VGPR_32.RegTypes in { +defm : DSWritePat_mc ; +} + +defm : DSAtomicWritePat_mc ; +defm : DSAtomicWritePat_mc ; let OtherPredicates = [D16PreservesUnusedBits] in { def : DSWritePat ; @@ -736,46 +763,49 @@ def : DS64Bit4ByteAlignedWritePat; let AddedComplexity = 100 in { -defm : DSWritePat_mc ; +foreach vt = VReg_64.RegTypes in { +defm : DSWritePat_mc ; +} + defm : DSWritePat_mc ; } // End AddedComplexity = 100 class DSAtomicRetPat : GCNPat < - (frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$value), - (inst $ptr, $value, (as_i16imm $offset), (i1 gds)) + (frag (DS1Addr1Offset i32:$ptr, i16:$offset), vt:$value), + (inst $ptr, getVregSrcForVT.ret:$value, offset:$offset, (i1 gds)) >; multiclass DSAtomicRetPat_mc { let OtherPredicates = [LDSRequiresM0Init] in { - def : DSAtomicRetPat(frag#"_local_m0")>; + def : DSAtomicRetPat(frag#"_local_m0_"#vt.Size)>; } let OtherPredicates = [NotLDSRequiresM0Init] in { def : DSAtomicRetPat(!cast(inst)#"_gfx9"), vt, - !cast(frag#"_local")>; + !cast(frag#"_local_"#vt.Size)>; } - def : DSAtomicRetPat(frag#"_region_m0"), 1>; + def : DSAtomicRetPat(frag#"_region_m0_"#vt.Size), 1>; } class DSAtomicCmpXChg : GCNPat < - (frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$cmp, vt:$swap), - (inst $ptr, $cmp, $swap, (as_i16imm $offset), (i1 gds)) + (frag (DS1Addr1Offset i32:$ptr, i16:$offset), vt:$cmp, vt:$swap), + (inst $ptr, getVregSrcForVT.ret:$cmp, getVregSrcForVT.ret:$swap, offset:$offset, (i1 gds)) >; multiclass DSAtomicCmpXChg_mc { let OtherPredicates = [LDSRequiresM0Init] in { - def : DSAtomicCmpXChg(frag#"_local_m0")>; + def : DSAtomicCmpXChg(frag#"_local_m0_"#vt.Size)>; } let OtherPredicates = [NotLDSRequiresM0Init] in { def : DSAtomicCmpXChg(!cast(inst)#"_gfx9"), vt, - !cast(frag#"_local")>; + !cast(frag#"_local_"#vt.Size)>; } - def : DSAtomicCmpXChg(frag#"_region_m0"), 1>; + def : DSAtomicCmpXChg(frag#"_region_m0_"#vt.Size), 1>; } diff --git a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp index 4ec4be9bc485..ec2e2c4e8b71 100644 --- a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp +++ b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp @@ -1095,6 +1095,7 @@ MCOperand AMDGPUDisassembler::decodeSpecialReg64(unsigned Val) const { case 106: return createRegOperand(VCC); case 108: return createRegOperand(TBA); case 110: return createRegOperand(TMA); + case 125: return createRegOperand(SGPR_NULL); case 126: return createRegOperand(EXEC); case 235: return createRegOperand(SRC_SHARED_BASE); case 236: return createRegOperand(SRC_SHARED_LIMIT); @@ -1172,7 +1173,8 @@ MCOperand AMDGPUDisassembler::decodeSDWAVopcDst(unsigned Val) const { int TTmpIdx = getTTmpIdx(Val); if (TTmpIdx >= 0) { - return createSRegOperand(getTtmpClassId(OPW64), TTmpIdx); + auto TTmpClsId = getTtmpClassId(IsWave64 ? OPW64 : OPW32); + return createSRegOperand(TTmpClsId, TTmpIdx); } else if (Val > SGPR_MAX) { return IsWave64 ? 
decodeSpecialReg64(Val) : decodeSpecialReg32(Val); diff --git a/lib/Target/AMDGPU/EvergreenInstructions.td b/lib/Target/AMDGPU/EvergreenInstructions.td index 0550092ce1d6..792e26d21f98 100644 --- a/lib/Target/AMDGPU/EvergreenInstructions.td +++ b/lib/Target/AMDGPU/EvergreenInstructions.td @@ -322,46 +322,46 @@ def : EGOrCaymanPat<(i32 (atomic_cmp_swap_global_noret i32:$ptr, i32:$cmp, i32:$ defm AtomicSwapPat : AtomicPat ; + atomic_swap_global_ret_32, + atomic_swap_global_noret_32>; defm AtomicAddPat : AtomicPat ; + atomic_load_add_global_ret_32, atomic_load_add_global_noret_32>; defm AtomicSubPat : AtomicPat ; + atomic_load_sub_global_ret_32, atomic_load_sub_global_noret_32>; defm AtomicMinPat : AtomicPat ; + atomic_load_min_global_ret_32, atomic_load_min_global_noret_32>; defm AtomicUMinPat : AtomicPat ; + atomic_load_umin_global_ret_32, atomic_load_umin_global_noret_32>; defm AtomicMaxPat : AtomicPat ; + atomic_load_max_global_ret_32, atomic_load_max_global_noret_32>; defm AtomicUMaxPat : AtomicPat ; + atomic_load_umax_global_ret_32, atomic_load_umax_global_noret_32>; defm AtomicAndPat : AtomicPat ; + atomic_load_and_global_ret_32, atomic_load_and_global_noret_32>; defm AtomicOrPat : AtomicPat ; + atomic_load_or_global_ret_32, atomic_load_or_global_noret_32>; defm AtomicXorPat : AtomicPat ; + atomic_load_xor_global_ret_32, atomic_load_xor_global_noret_32>; defm AtomicIncAddPat : AtomicIncDecPat ; + atomic_load_add_global_ret_32, + atomic_load_add_global_noret_32, 1>; defm AtomicIncSubPat : AtomicIncDecPat ; + atomic_load_sub_global_ret_32, + atomic_load_sub_global_noret_32, -1>; defm AtomicDecAddPat : AtomicIncDecPat ; + atomic_load_add_global_ret_32, + atomic_load_add_global_noret_32, -1>; defm AtomicDecSubPat : AtomicIncDecPat ; + atomic_load_sub_global_ret_32, + atomic_load_sub_global_noret_32, 1>; // Should be predicated on FeatureFP64 // def FMA_64 : R600_3OP < @@ -628,37 +628,37 @@ def LDS_SHORT_WRITE : R600_LDS_1A1D_NORET<0x13, "LDS_SHORT_WRITE", [(truncstorei16_local i32:$src1, i32:$src0)] >; def LDS_ADD_RET : R600_LDS_1A1D_RET <0x20, "LDS_ADD", - [(set i32:$dst, (atomic_load_add_local i32:$src0, i32:$src1))] + [(set i32:$dst, (atomic_load_add_local_32 i32:$src0, i32:$src1))] >; def LDS_SUB_RET : R600_LDS_1A1D_RET <0x21, "LDS_SUB", - [(set i32:$dst, (atomic_load_sub_local i32:$src0, i32:$src1))] + [(set i32:$dst, (atomic_load_sub_local_32 i32:$src0, i32:$src1))] >; def LDS_AND_RET : R600_LDS_1A1D_RET <0x29, "LDS_AND", - [(set i32:$dst, (atomic_load_and_local i32:$src0, i32:$src1))] + [(set i32:$dst, (atomic_load_and_local_32 i32:$src0, i32:$src1))] >; def LDS_OR_RET : R600_LDS_1A1D_RET <0x2a, "LDS_OR", - [(set i32:$dst, (atomic_load_or_local i32:$src0, i32:$src1))] + [(set i32:$dst, (atomic_load_or_local_32 i32:$src0, i32:$src1))] >; def LDS_XOR_RET : R600_LDS_1A1D_RET <0x2b, "LDS_XOR", - [(set i32:$dst, (atomic_load_xor_local i32:$src0, i32:$src1))] + [(set i32:$dst, (atomic_load_xor_local_32 i32:$src0, i32:$src1))] >; def LDS_MIN_INT_RET : R600_LDS_1A1D_RET <0x25, "LDS_MIN_INT", - [(set i32:$dst, (atomic_load_min_local i32:$src0, i32:$src1))] + [(set i32:$dst, (atomic_load_min_local_32 i32:$src0, i32:$src1))] >; def LDS_MAX_INT_RET : R600_LDS_1A1D_RET <0x26, "LDS_MAX_INT", - [(set i32:$dst, (atomic_load_max_local i32:$src0, i32:$src1))] + [(set i32:$dst, (atomic_load_max_local_32 i32:$src0, i32:$src1))] >; def LDS_MIN_UINT_RET : R600_LDS_1A1D_RET <0x27, "LDS_MIN_UINT", - [(set i32:$dst, (atomic_load_umin_local i32:$src0, i32:$src1))] + [(set i32:$dst, (atomic_load_umin_local_32 
i32:$src0, i32:$src1))] >; def LDS_MAX_UINT_RET : R600_LDS_1A1D_RET <0x28, "LDS_MAX_UINT", - [(set i32:$dst, (atomic_load_umax_local i32:$src0, i32:$src1))] + [(set i32:$dst, (atomic_load_umax_local_32 i32:$src0, i32:$src1))] >; def LDS_WRXCHG_RET : R600_LDS_1A1D_RET <0x2d, "LDS_WRXCHG", - [(set i32:$dst, (atomic_swap_local i32:$src0, i32:$src1))] + [(set i32:$dst, (atomic_swap_local_32 i32:$src0, i32:$src1))] >; def LDS_CMPST_RET : R600_LDS_1A2D_RET <0x30, "LDS_CMPST", - [(set i32:$dst, (atomic_cmp_swap_local i32:$src0, i32:$src1, i32:$src2))] + [(set i32:$dst, (atomic_cmp_swap_local_32 i32:$src0, i32:$src1, i32:$src2))] >; def LDS_READ_RET : R600_LDS_1A <0x32, "LDS_READ_RET", [(set (i32 R600_Reg32:$dst), (load_local R600_Reg32:$src0))] diff --git a/lib/Target/AMDGPU/FLATInstructions.td b/lib/Target/AMDGPU/FLATInstructions.td index 889f60dae920..80ee17eba141 100644 --- a/lib/Target/AMDGPU/FLATInstructions.td +++ b/lib/Target/AMDGPU/FLATInstructions.td @@ -270,7 +270,7 @@ multiclass FLAT_Atomic_Pseudo< SDPatternOperator atomic = null_frag, ValueType data_vt = vt, RegisterClass data_rc = vdst_rc, - bit isFP = getIsFP.ret> { + bit isFP = isFloatType.ret> { def "" : FLAT_AtomicNoRet_Pseudo .ret> { + bit isFP = isFloatType.ret> { def "" : FLAT_AtomicNoRet_Pseudo .ret> { + bit isFP = isFloatType.ret> { def _RTN : FLAT_AtomicRet_Pseudo ; defm GLOBAL_ATOMIC_SWAP : FLAT_Global_Atomic_Pseudo <"global_atomic_swap", - VGPR_32, i32, atomic_swap_global>; + VGPR_32, i32, atomic_swap_global_32>; defm GLOBAL_ATOMIC_SWAP_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_swap_x2", - VReg_64, i64, atomic_swap_global>; + VReg_64, i64, atomic_swap_global_64>; defm GLOBAL_ATOMIC_ADD : FLAT_Global_Atomic_Pseudo <"global_atomic_add", - VGPR_32, i32, atomic_add_global>; + VGPR_32, i32, atomic_load_add_global_32>; defm GLOBAL_ATOMIC_SUB : FLAT_Global_Atomic_Pseudo <"global_atomic_sub", - VGPR_32, i32, atomic_sub_global>; + VGPR_32, i32, atomic_load_sub_global_32>; defm GLOBAL_ATOMIC_SMIN : FLAT_Global_Atomic_Pseudo <"global_atomic_smin", - VGPR_32, i32, atomic_min_global>; + VGPR_32, i32, atomic_load_min_global_32>; defm GLOBAL_ATOMIC_UMIN : FLAT_Global_Atomic_Pseudo <"global_atomic_umin", - VGPR_32, i32, atomic_umin_global>; + VGPR_32, i32, atomic_load_umin_global_32>; defm GLOBAL_ATOMIC_SMAX : FLAT_Global_Atomic_Pseudo <"global_atomic_smax", - VGPR_32, i32, atomic_max_global>; + VGPR_32, i32, atomic_load_max_global_32>; defm GLOBAL_ATOMIC_UMAX : FLAT_Global_Atomic_Pseudo <"global_atomic_umax", - VGPR_32, i32, atomic_umax_global>; + VGPR_32, i32, atomic_load_umax_global_32>; defm GLOBAL_ATOMIC_AND : FLAT_Global_Atomic_Pseudo <"global_atomic_and", - VGPR_32, i32, atomic_and_global>; + VGPR_32, i32, atomic_load_and_global_32>; defm GLOBAL_ATOMIC_OR : FLAT_Global_Atomic_Pseudo <"global_atomic_or", - VGPR_32, i32, atomic_or_global>; + VGPR_32, i32, atomic_load_or_global_32>; defm GLOBAL_ATOMIC_XOR : FLAT_Global_Atomic_Pseudo <"global_atomic_xor", - VGPR_32, i32, atomic_xor_global>; + VGPR_32, i32, atomic_load_xor_global_32>; defm GLOBAL_ATOMIC_INC : FLAT_Global_Atomic_Pseudo <"global_atomic_inc", - VGPR_32, i32, atomic_inc_global>; + VGPR_32, i32, atomic_inc_global_32>; defm GLOBAL_ATOMIC_DEC : FLAT_Global_Atomic_Pseudo <"global_atomic_dec", - VGPR_32, i32, atomic_dec_global>; + VGPR_32, i32, atomic_dec_global_32>; defm GLOBAL_ATOMIC_ADD_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_add_x2", - VReg_64, i64, atomic_add_global>; + VReg_64, i64, atomic_load_add_global_64>; defm GLOBAL_ATOMIC_SUB_X2 : 
FLAT_Global_Atomic_Pseudo <"global_atomic_sub_x2", - VReg_64, i64, atomic_sub_global>; + VReg_64, i64, atomic_load_sub_global_64>; defm GLOBAL_ATOMIC_SMIN_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_smin_x2", - VReg_64, i64, atomic_min_global>; + VReg_64, i64, atomic_load_min_global_64>; defm GLOBAL_ATOMIC_UMIN_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_umin_x2", - VReg_64, i64, atomic_umin_global>; + VReg_64, i64, atomic_load_umin_global_64>; defm GLOBAL_ATOMIC_SMAX_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_smax_x2", - VReg_64, i64, atomic_max_global>; + VReg_64, i64, atomic_load_max_global_64>; defm GLOBAL_ATOMIC_UMAX_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_umax_x2", - VReg_64, i64, atomic_umax_global>; + VReg_64, i64, atomic_load_umax_global_64>; defm GLOBAL_ATOMIC_AND_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_and_x2", - VReg_64, i64, atomic_and_global>; + VReg_64, i64, atomic_load_and_global_64>; defm GLOBAL_ATOMIC_OR_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_or_x2", - VReg_64, i64, atomic_or_global>; + VReg_64, i64, atomic_load_or_global_64>; defm GLOBAL_ATOMIC_XOR_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_xor_x2", - VReg_64, i64, atomic_xor_global>; + VReg_64, i64, atomic_load_xor_global_64>; defm GLOBAL_ATOMIC_INC_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_inc_x2", - VReg_64, i64, atomic_inc_global>; + VReg_64, i64, atomic_inc_global_64>; defm GLOBAL_ATOMIC_DEC_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_dec_x2", - VReg_64, i64, atomic_dec_global>; + VReg_64, i64, atomic_dec_global_64>; } // End is_flat_global = 1 } // End SubtargetPredicate = HasFlatGlobalInsts @@ -686,10 +686,10 @@ let SubtargetPredicate = isGFX10Plus, is_flat_global = 1 in { let SubtargetPredicate = HasAtomicFaddInsts, is_flat_global = 1 in { defm GLOBAL_ATOMIC_ADD_F32 : FLAT_Global_Atomic_Pseudo_NO_RTN < - "global_atomic_add_f32", VGPR_32, f32, atomic_add_global + "global_atomic_add_f32", VGPR_32, f32, atomic_fadd_global_noret >; defm GLOBAL_ATOMIC_PK_ADD_F16 : FLAT_Global_Atomic_Pseudo_NO_RTN < - "global_atomic_pk_add_f16", VGPR_32, v2f16, atomic_add_global + "global_atomic_pk_add_f16", VGPR_32, v2f16, atomic_pk_fadd_global_noret >; } // End SubtargetPredicate = HasAtomicFaddInsts @@ -777,8 +777,6 @@ def : FlatLoadPat ; def : FlatLoadPat ; def : FlatLoadPat ; def : FlatLoadPat ; -def : FlatLoadPat ; -def : FlatLoadPat ; def : FlatLoadPat ; def : FlatLoadPat ; @@ -787,41 +785,50 @@ def : FlatLoadAtomicPat ; def : FlatStorePat ; def : FlatStorePat ; -def : FlatStorePat ; -def : FlatStorePat ; + +foreach vt = Reg32Types.types in { +def : FlatLoadPat ; +def : FlatStorePat ; +} + +foreach vt = VReg_64.RegTypes in { +def : FlatStorePat ; +def : FlatLoadPat ; +} + def : FlatStorePat ; def : FlatStorePat ; def : FlatStoreAtomicPat ; def : FlatStoreAtomicPat ; -def : FlatAtomicPat ; -def : FlatAtomicPat ; -def : FlatAtomicPat ; -def : FlatAtomicPat ; -def : FlatAtomicPat ; -def : FlatAtomicPat ; -def : FlatAtomicPat ; -def : FlatAtomicPat ; -def : FlatAtomicPat ; -def : FlatAtomicPat ; -def : FlatAtomicPat ; +def : FlatAtomicPat ; +def : FlatAtomicPat ; +def : FlatAtomicPat ; +def : FlatAtomicPat ; +def : FlatAtomicPat ; +def : FlatAtomicPat ; +def : FlatAtomicPat ; +def : FlatAtomicPat ; +def : FlatAtomicPat ; +def : FlatAtomicPat ; +def : FlatAtomicPat ; def : FlatAtomicPat ; -def : FlatAtomicPat ; - -def : FlatAtomicPat ; -def : FlatAtomicPat ; -def : FlatAtomicPat ; -def : FlatAtomicPat ; -def : FlatAtomicPat ; -def : FlatAtomicPat ; -def : FlatAtomicPat ; -def : 
FlatAtomicPat ; -def : FlatAtomicPat ; -def : FlatAtomicPat ; -def : FlatAtomicPat ; +def : FlatAtomicPat ; + +def : FlatAtomicPat ; +def : FlatAtomicPat ; +def : FlatAtomicPat ; +def : FlatAtomicPat ; +def : FlatAtomicPat ; +def : FlatAtomicPat ; +def : FlatAtomicPat ; +def : FlatAtomicPat ; +def : FlatAtomicPat ; +def : FlatAtomicPat ; +def : FlatAtomicPat ; def : FlatAtomicPat ; -def : FlatAtomicPat ; +def : FlatAtomicPat ; def : FlatStorePat ; def : FlatStorePat ; @@ -847,9 +854,6 @@ def : FlatLoadPat_D16 ; } // End OtherPredicates = [HasFlatAddressSpace] -def atomic_fadd_global : global_binary_atomic_op_frag; -def atomic_pk_fadd_global : global_binary_atomic_op_frag; - let OtherPredicates = [HasFlatGlobalInsts], AddedComplexity = 10 in { def : FlatLoadSignedPat ; @@ -863,8 +867,16 @@ def : FlatLoadSignedPat ; def : FlatLoadSignedPat ; def : FlatLoadSignedPat ; -def : FlatLoadSignedPat ; -def : FlatLoadSignedPat ; +foreach vt = Reg32Types.types in { +def : FlatLoadSignedPat ; +def : FlatStoreSignedPat ; +} + +foreach vt = VReg_64.RegTypes in { +def : FlatLoadSignedPat ; +def : FlatStoreSignedPat ; +} + def : FlatLoadSignedPat ; def : FlatLoadSignedPat ; @@ -875,8 +887,6 @@ def : FlatStoreSignedPat ; def : FlatStoreSignedPat ; def : FlatStoreSignedPat ; def : FlatStoreSignedPat ; -def : FlatStoreSignedPat ; -def : FlatStoreSignedPat ; def : FlatStoreSignedPat ; def : FlatStoreSignedPat ; @@ -902,36 +912,36 @@ def : FlatSignedLoadPat_D16 ; def : FlatStoreSignedAtomicPat ; def : FlatStoreSignedAtomicPat ; -def : FlatSignedAtomicPat ; -def : FlatSignedAtomicPat ; -def : FlatSignedAtomicPat ; -def : FlatSignedAtomicPat ; -def : FlatSignedAtomicPat ; -def : FlatSignedAtomicPat ; -def : FlatSignedAtomicPat ; -def : FlatSignedAtomicPat ; -def : FlatSignedAtomicPat ; -def : FlatSignedAtomicPat ; -def : FlatSignedAtomicPat ; +def : FlatSignedAtomicPat ; +def : FlatSignedAtomicPat ; +def : FlatSignedAtomicPat ; +def : FlatSignedAtomicPat ; +def : FlatSignedAtomicPat ; +def : FlatSignedAtomicPat ; +def : FlatSignedAtomicPat ; +def : FlatSignedAtomicPat ; +def : FlatSignedAtomicPat ; +def : FlatSignedAtomicPat ; +def : FlatSignedAtomicPat ; def : FlatSignedAtomicPat ; -def : FlatSignedAtomicPat ; - -def : FlatSignedAtomicPat ; -def : FlatSignedAtomicPat ; -def : FlatSignedAtomicPat ; -def : FlatSignedAtomicPat ; -def : FlatSignedAtomicPat ; -def : FlatSignedAtomicPat ; -def : FlatSignedAtomicPat ; -def : FlatSignedAtomicPat ; -def : FlatSignedAtomicPat ; -def : FlatSignedAtomicPat ; -def : FlatSignedAtomicPat ; +def : FlatSignedAtomicPat ; + +def : FlatSignedAtomicPat ; +def : FlatSignedAtomicPat ; +def : FlatSignedAtomicPat ; +def : FlatSignedAtomicPat ; +def : FlatSignedAtomicPat ; +def : FlatSignedAtomicPat ; +def : FlatSignedAtomicPat ; +def : FlatSignedAtomicPat ; +def : FlatSignedAtomicPat ; +def : FlatSignedAtomicPat ; +def : FlatSignedAtomicPat ; def : FlatSignedAtomicPat ; -def : FlatSignedAtomicPat ; +def : FlatSignedAtomicPat ; -def : FlatAtomicPatNoRtn ; -def : FlatAtomicPatNoRtn ; +def : FlatAtomicPatNoRtn ; +def : FlatAtomicPatNoRtn ; } // End OtherPredicates = [HasFlatGlobalInsts], AddedComplexity = 10 @@ -1174,7 +1184,7 @@ class FLAT_Real_gfx10 op, FLAT_Pseudo ps> : let AssemblerPredicate = isGFX10Plus; let DecoderNamespace = "GFX10"; - let Inst{11-0} = {offset{12}, offset{10-0}}; + let Inst{11-0} = offset{11-0}; let Inst{12} = !if(ps.has_dlc, dlc, ps.dlcValue); let Inst{54-48} = !if(ps.has_saddr, !if(ps.enabled_saddr, saddr, 0x7d), 0x7d); let Inst{55} = 0; diff --git 
a/lib/Target/AMDGPU/GCNDPPCombine.cpp b/lib/Target/AMDGPU/GCNDPPCombine.cpp index e1845e2e8e87..98678873e37c 100644 --- a/lib/Target/AMDGPU/GCNDPPCombine.cpp +++ b/lib/Target/AMDGPU/GCNDPPCombine.cpp @@ -41,6 +41,7 @@ #include "AMDGPUSubtarget.h" #include "SIInstrInfo.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/CodeGen/MachineBasicBlock.h" @@ -155,8 +156,6 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI, RegSubRegPair CombOldVGPR, bool CombBCZ) const { assert(MovMI.getOpcode() == AMDGPU::V_MOV_B32_dpp); - assert(TII->getNamedOperand(MovMI, AMDGPU::OpName::vdst)->getReg() == - TII->getNamedOperand(OrigMI, AMDGPU::OpName::src0)->getReg()); auto OrigOp = OrigMI.getOpcode(); auto DPPOp = getDPPOp(OrigOp); @@ -178,7 +177,9 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI, if (OldIdx != -1) { assert(OldIdx == NumOperands); assert(isOfRegClass(CombOldVGPR, AMDGPU::VGPR_32RegClass, *MRI)); - DPPInst.addReg(CombOldVGPR.Reg, 0, CombOldVGPR.SubReg); + auto *Def = getVRegSubRegDef(CombOldVGPR, *MRI); + DPPInst.addReg(CombOldVGPR.Reg, Def ? 0 : RegState::Undef, + CombOldVGPR.SubReg); ++NumOperands; } else { // TODO: this discards MAC/FMA instructions for now, let's add it later @@ -195,6 +196,10 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI, assert(0LL == (Mod0->getImm() & ~(SISrcMods::ABS | SISrcMods::NEG))); DPPInst.addImm(Mod0->getImm()); ++NumOperands; + } else if (AMDGPU::getNamedOperandIdx(DPPOp, + AMDGPU::OpName::src0_modifiers) != -1) { + DPPInst.addImm(0); + ++NumOperands; } auto *Src0 = TII->getNamedOperand(MovMI, AMDGPU::OpName::src0); assert(Src0); @@ -214,6 +219,10 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI, assert(0LL == (Mod1->getImm() & ~(SISrcMods::ABS | SISrcMods::NEG))); DPPInst.addImm(Mod1->getImm()); ++NumOperands; + } else if (AMDGPU::getNamedOperandIdx(DPPOp, + AMDGPU::OpName::src1_modifiers) != -1) { + DPPInst.addImm(0); + ++NumOperands; } if (auto *Src1 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1)) { if (!TII->isOperandLegal(*DPPInst.getInstr(), NumOperands, Src1)) { @@ -344,6 +353,10 @@ bool GCNDPPCombine::combineDPPMov(MachineInstr &MovMI) const { auto *DstOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::vdst); assert(DstOpnd && DstOpnd->isReg()); auto DPPMovReg = DstOpnd->getReg(); + if (DPPMovReg.isPhysical()) { + LLVM_DEBUG(dbgs() << " failed: dpp move writes physreg\n"); + return false; + } if (execMayBeModifiedBeforeAnyUse(*MRI, DPPMovReg, MovMI)) { LLVM_DEBUG(dbgs() << " failed: EXEC mask should remain the same" " for all uses\n"); @@ -362,7 +375,13 @@ bool GCNDPPCombine::combineDPPMov(MachineInstr &MovMI) const { bool BoundCtrlZero = BCZOpnd->getImm(); auto *OldOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::old); + auto *SrcOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::src0); assert(OldOpnd && OldOpnd->isReg()); + assert(SrcOpnd && SrcOpnd->isReg()); + if (OldOpnd->getReg().isPhysical() || SrcOpnd->getReg().isPhysical()) { + LLVM_DEBUG(dbgs() << " failed: dpp move reads physreg\n"); + return false; + } auto * const OldOpndValue = getOldOpndValue(*OldOpnd); // OldOpndValue is either undef (IMPLICIT_DEF) or immediate or something else @@ -408,6 +427,7 @@ bool GCNDPPCombine::combineDPPMov(MachineInstr &MovMI) const { dbgs() << ", bound_ctrl=" << CombBCZ << '\n'); SmallVector OrigMIs, DPPMIs; + DenseMap> RegSeqWithOpNos; auto CombOldVGPR = 
getRegSubRegPair(*OldOpnd); // try to reuse previous old reg if its undefined (IMPLICIT_DEF) if (CombBCZ && OldOpndValue) { // CombOldVGPR should be undef @@ -420,13 +440,49 @@ bool GCNDPPCombine::combineDPPMov(MachineInstr &MovMI) const { OrigMIs.push_back(&MovMI); bool Rollback = true; + SmallVector Uses; + for (auto &Use : MRI->use_nodbg_operands(DPPMovReg)) { + Uses.push_back(&Use); + } + + while (!Uses.empty()) { + MachineOperand *Use = Uses.pop_back_val(); Rollback = true; - auto &OrigMI = *Use.getParent(); + auto &OrigMI = *Use->getParent(); LLVM_DEBUG(dbgs() << " try: " << OrigMI); auto OrigOp = OrigMI.getOpcode(); + if (OrigOp == AMDGPU::REG_SEQUENCE) { + Register FwdReg = OrigMI.getOperand(0).getReg(); + unsigned FwdSubReg = 0; + + if (execMayBeModifiedBeforeAnyUse(*MRI, FwdReg, OrigMI)) { + LLVM_DEBUG(dbgs() << " failed: EXEC mask should remain the same" + " for all uses\n"); + break; + } + + unsigned OpNo, E = OrigMI.getNumOperands(); + for (OpNo = 1; OpNo < E; OpNo += 2) { + if (OrigMI.getOperand(OpNo).getReg() == DPPMovReg) { + FwdSubReg = OrigMI.getOperand(OpNo + 1).getImm(); + break; + } + } + + if (!FwdSubReg) + break; + + for (auto &Op : MRI->use_nodbg_operands(FwdReg)) { + if (Op.getSubReg() == FwdSubReg) + Uses.push_back(&Op); + } + RegSeqWithOpNos[&OrigMI].push_back(OpNo); + continue; + } + if (TII->isVOP3(OrigOp)) { if (!TII->hasVALU32BitEncoding(OrigOp)) { LLVM_DEBUG(dbgs() << " failed: VOP3 hasn't e32 equivalent\n"); @@ -447,14 +503,14 @@ bool GCNDPPCombine::combineDPPMov(MachineInstr &MovMI) const { } LLVM_DEBUG(dbgs() << " combining: " << OrigMI); - if (&Use == TII->getNamedOperand(OrigMI, AMDGPU::OpName::src0)) { + if (Use == TII->getNamedOperand(OrigMI, AMDGPU::OpName::src0)) { if (auto *DPPInst = createDPPInst(OrigMI, MovMI, CombOldVGPR, OldOpndValue, CombBCZ)) { DPPMIs.push_back(DPPInst); Rollback = false; } } else if (OrigMI.isCommutable() && - &Use == TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1)) { + Use == TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1)) { auto *BB = OrigMI.getParent(); auto *NewMI = BB->getParent()->CloneMachineInstr(&OrigMI); BB->insert(OrigMI, NewMI); @@ -475,9 +531,22 @@ bool GCNDPPCombine::combineDPPMov(MachineInstr &MovMI) const { OrigMIs.push_back(&OrigMI); } + Rollback |= !Uses.empty(); + for (auto *MI : *(Rollback? 
&DPPMIs : &OrigMIs)) MI->eraseFromParent(); + if (!Rollback) { + for (auto &S : RegSeqWithOpNos) { + if (MRI->use_nodbg_empty(S.first->getOperand(0).getReg())) { + S.first->eraseFromParent(); + continue; + } + while (!S.second.empty()) + S.first->getOperand(S.second.pop_back_val()).setIsUndef(true); + } + } + return !Rollback; } @@ -498,6 +567,13 @@ bool GCNDPPCombine::runOnMachineFunction(MachineFunction &MF) { if (MI.getOpcode() == AMDGPU::V_MOV_B32_dpp && combineDPPMov(MI)) { Changed = true; ++NumDPPMovsCombined; + } else if (MI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO) { + auto Split = TII->expandMovDPP64(MI); + for (auto M : { Split.first, Split.second }) { + if (combineDPPMov(*M)) + ++NumDPPMovsCombined; + } + Changed = true; } } } diff --git a/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/lib/Target/AMDGPU/GCNHazardRecognizer.cpp index 885239e2faed..9528aee4c50e 100644 --- a/lib/Target/AMDGPU/GCNHazardRecognizer.cpp +++ b/lib/Target/AMDGPU/GCNHazardRecognizer.cpp @@ -726,7 +726,7 @@ int GCNHazardRecognizer::checkVALUHazardsHelper(const MachineOperand &Def, if (!TRI->isVGPR(MRI, Def.getReg())) return WaitStatesNeeded; - unsigned Reg = Def.getReg(); + Register Reg = Def.getReg(); auto IsHazardFn = [this, Reg, TRI] (MachineInstr *MI) { int DataIdx = createsVALUHazard(*MI); return DataIdx >= 0 && @@ -792,7 +792,7 @@ int GCNHazardRecognizer::checkRWLaneHazards(MachineInstr *RWLane) { if (!LaneSelectOp->isReg() || !TRI->isSGPRReg(MRI, LaneSelectOp->getReg())) return 0; - unsigned LaneSelectReg = LaneSelectOp->getReg(); + Register LaneSelectReg = LaneSelectOp->getReg(); auto IsHazardFn = [TII] (MachineInstr *MI) { return TII->isVALU(*MI); }; @@ -891,7 +891,7 @@ bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(MachineInstr *MI) { // Use V_MOB_B32 v?, v?. Register must be alive so use src0 of V_PERMLANE* // which is always a VGPR and available. 
auto *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0); - unsigned Reg = Src0->getReg(); + Register Reg = Src0->getReg(); bool IsUndef = Src0->isUndef(); BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::V_MOV_B32_e32)) @@ -952,6 +952,7 @@ bool GCNHazardRecognizer::fixSMEMtoVectorWriteHazards(MachineInstr *MI) { unsigned SDSTName; switch (MI->getOpcode()) { case AMDGPU::V_READLANE_B32: + case AMDGPU::V_READLANE_B32_gfx10: case AMDGPU::V_READFIRSTLANE_B32: SDSTName = AMDGPU::OpName::vdst; break; @@ -976,7 +977,7 @@ bool GCNHazardRecognizer::fixSMEMtoVectorWriteHazards(MachineInstr *MI) { if (!SDST) return false; - const unsigned SDSTReg = SDST->getReg(); + const Register SDSTReg = SDST->getReg(); auto IsHazardFn = [SDSTReg, TRI] (MachineInstr *I) { return SIInstrInfo::isSMRD(*I) && I->readsRegister(SDSTReg, TRI); }; @@ -1251,14 +1252,14 @@ int GCNHazardRecognizer::checkMAIHazards(MachineInstr *MI) { const int MFMA16x16WritesAGPRAccVgprWriteWaitStates = 7; const int MFMA32x32WritesAGPRAccVgprWriteWaitStates = 15; const int MaxWaitStates = 18; - unsigned Reg = Op.getReg(); + Register Reg = Op.getReg(); unsigned HazardDefLatency = 0; auto IsOverlappedMFMAFn = [Reg, &IsMFMAFn, &HazardDefLatency, this] (MachineInstr *MI) { if (!IsMFMAFn(MI)) return false; - unsigned DstReg = MI->getOperand(0).getReg(); + Register DstReg = MI->getOperand(0).getReg(); if (DstReg == Reg) return false; HazardDefLatency = std::max(HazardDefLatency, @@ -1304,7 +1305,7 @@ int GCNHazardRecognizer::checkMAIHazards(MachineInstr *MI) { auto IsAccVgprWriteFn = [Reg, this] (MachineInstr *MI) { if (MI->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32) return false; - unsigned DstReg = MI->getOperand(0).getReg(); + Register DstReg = MI->getOperand(0).getReg(); return TRI.regsOverlap(Reg, DstReg); }; @@ -1330,14 +1331,14 @@ int GCNHazardRecognizer::checkMAIHazards(MachineInstr *MI) { const int MFMA16x16ReadSrcCAccVgprWriteWaitStates = 5; const int MFMA32x32ReadSrcCAccVgprWriteWaitStates = 13; const int MaxWaitStates = 13; - unsigned DstReg = MI->getOperand(0).getReg(); + Register DstReg = MI->getOperand(0).getReg(); unsigned HazardDefLatency = 0; auto IsSrcCMFMAFn = [DstReg, &IsMFMAFn, &HazardDefLatency, this] (MachineInstr *MI) { if (!IsMFMAFn(MI)) return false; - unsigned Reg = TII.getNamedOperand(*MI, AMDGPU::OpName::src2)->getReg(); + Register Reg = TII.getNamedOperand(*MI, AMDGPU::OpName::src2)->getReg(); HazardDefLatency = std::max(HazardDefLatency, TSchedModel.computeInstrLatency(MI)); return TRI.regsOverlap(Reg, DstReg); @@ -1376,7 +1377,7 @@ int GCNHazardRecognizer::checkMAILdStHazards(MachineInstr *MI) { if (!Op.isReg() || !TRI.isVGPR(MF.getRegInfo(), Op.getReg())) continue; - unsigned Reg = Op.getReg(); + Register Reg = Op.getReg(); const int AccVgprReadLdStWaitStates = 2; const int VALUWriteAccVgprReadLdStDepVALUWaitStates = 1; diff --git a/lib/Target/AMDGPU/GCNILPSched.cpp b/lib/Target/AMDGPU/GCNILPSched.cpp index 1eb617640c32..39072af7d871 100644 --- a/lib/Target/AMDGPU/GCNILPSched.cpp +++ b/lib/Target/AMDGPU/GCNILPSched.cpp @@ -11,6 +11,7 @@ //===----------------------------------------------------------------------===// #include "llvm/CodeGen/ScheduleDAG.h" +#include "llvm/Support/Debug.h" using namespace llvm; diff --git a/lib/Target/AMDGPU/GCNIterativeScheduler.cpp b/lib/Target/AMDGPU/GCNIterativeScheduler.cpp index 3525174223bd..90ab6a14ce20 100644 --- a/lib/Target/AMDGPU/GCNIterativeScheduler.cpp +++ b/lib/Target/AMDGPU/GCNIterativeScheduler.cpp @@ -237,7 +237,7 @@ public: 
GCNIterativeScheduler::GCNIterativeScheduler(MachineSchedContext *C, StrategyKind S) - : BaseClass(C, llvm::make_unique()) + : BaseClass(C, std::make_unique()) , Context(C) , Strategy(S) , UPTracker(*LIS) { diff --git a/lib/Target/AMDGPU/GCNNSAReassign.cpp b/lib/Target/AMDGPU/GCNNSAReassign.cpp index 51c4c99cfb18..36a8f74150f5 100644 --- a/lib/Target/AMDGPU/GCNNSAReassign.cpp +++ b/lib/Target/AMDGPU/GCNNSAReassign.cpp @@ -173,11 +173,11 @@ GCNNSAReassign::CheckNSA(const MachineInstr &MI, bool Fast) const { bool NSA = false; for (unsigned I = 0; I < Info->VAddrDwords; ++I) { const MachineOperand &Op = MI.getOperand(VAddr0Idx + I); - unsigned Reg = Op.getReg(); - if (TargetRegisterInfo::isPhysicalRegister(Reg) || !VRM->isAssignedReg(Reg)) + Register Reg = Op.getReg(); + if (Register::isPhysicalRegister(Reg) || !VRM->isAssignedReg(Reg)) return NSA_Status::FIXED; - unsigned PhysReg = VRM->getPhys(Reg); + Register PhysReg = VRM->getPhys(Reg); if (!Fast) { if (!PhysReg) @@ -276,7 +276,7 @@ bool GCNNSAReassign::runOnMachineFunction(MachineFunction &MF) { SlotIndex MinInd, MaxInd; for (unsigned I = 0; I < Info->VAddrDwords; ++I) { const MachineOperand &Op = MI->getOperand(VAddr0Idx + I); - unsigned Reg = Op.getReg(); + Register Reg = Op.getReg(); LiveInterval *LI = &LIS->getInterval(Reg); if (llvm::find(Intervals, LI) != Intervals.end()) { // Same register used, unable to make sequential diff --git a/lib/Target/AMDGPU/GCNRegBankReassign.cpp b/lib/Target/AMDGPU/GCNRegBankReassign.cpp index f0d47eaa4ed1..2927d4eb745a 100644 --- a/lib/Target/AMDGPU/GCNRegBankReassign.cpp +++ b/lib/Target/AMDGPU/GCNRegBankReassign.cpp @@ -230,7 +230,7 @@ private: public: Printable printReg(unsigned Reg, unsigned SubReg = 0) const { return Printable([Reg, SubReg, this](raw_ostream &OS) { - if (TargetRegisterInfo::isPhysicalRegister(Reg)) { + if (Register::isPhysicalRegister(Reg)) { OS << llvm::printReg(Reg, TRI); return; } @@ -275,7 +275,7 @@ char GCNRegBankReassign::ID = 0; char &llvm::GCNRegBankReassignID = GCNRegBankReassign::ID; unsigned GCNRegBankReassign::getPhysRegBank(unsigned Reg) const { - assert (TargetRegisterInfo::isPhysicalRegister(Reg)); + assert(Register::isPhysicalRegister(Reg)); const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); unsigned Size = TRI->getRegSizeInBits(*RC); @@ -293,7 +293,7 @@ unsigned GCNRegBankReassign::getPhysRegBank(unsigned Reg) const { unsigned GCNRegBankReassign::getRegBankMask(unsigned Reg, unsigned SubReg, int Bank) { - if (TargetRegisterInfo::isVirtualRegister(Reg)) { + if (Register::isVirtualRegister(Reg)) { if (!VRM->isAssignedReg(Reg)) return 0; @@ -364,7 +364,7 @@ unsigned GCNRegBankReassign::analyzeInst(const MachineInstr& MI, if (!Op.isReg() || Op.isUndef()) continue; - unsigned R = Op.getReg(); + Register R = Op.getReg(); if (TRI->hasAGPRs(TRI->getRegClassForReg(*MRI, R))) continue; @@ -420,12 +420,12 @@ unsigned GCNRegBankReassign::getOperandGatherWeight(const MachineInstr& MI, } bool GCNRegBankReassign::isReassignable(unsigned Reg) const { - if (TargetRegisterInfo::isPhysicalRegister(Reg) || !VRM->isAssignedReg(Reg)) + if (Register::isPhysicalRegister(Reg) || !VRM->isAssignedReg(Reg)) return false; const MachineInstr *Def = MRI->getUniqueVRegDef(Reg); - unsigned PhysReg = VRM->getPhys(Reg); + Register PhysReg = VRM->getPhys(Reg); if (Def && Def->isCopy() && Def->getOperand(1).getReg() == PhysReg) return false; @@ -654,7 +654,7 @@ unsigned GCNRegBankReassign::tryReassign(Candidate &C) { } std::sort(BankStalls.begin(), BankStalls.end()); - unsigned 
OrigReg = VRM->getPhys(C.Reg); + Register OrigReg = VRM->getPhys(C.Reg); LRM->unassign(LI); while (!BankStalls.empty()) { BankStall BS = BankStalls.pop_back_val(); diff --git a/lib/Target/AMDGPU/GCNRegPressure.cpp b/lib/Target/AMDGPU/GCNRegPressure.cpp index 39460fbd8a84..d593204cba05 100644 --- a/lib/Target/AMDGPU/GCNRegPressure.cpp +++ b/lib/Target/AMDGPU/GCNRegPressure.cpp @@ -40,7 +40,7 @@ void llvm::printLivesAt(SlotIndex SI, << *LIS.getInstructionFromIndex(SI); unsigned Num = 0; for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) { - const unsigned Reg = TargetRegisterInfo::index2VirtReg(I); + const unsigned Reg = Register::index2VirtReg(I); if (!LIS.hasInterval(Reg)) continue; const auto &LI = LIS.getInterval(Reg); @@ -84,7 +84,7 @@ bool llvm::isEqual(const GCNRPTracker::LiveRegSet &S1, unsigned GCNRegPressure::getRegKind(unsigned Reg, const MachineRegisterInfo &MRI) { - assert(TargetRegisterInfo::isVirtualRegister(Reg)); + assert(Register::isVirtualRegister(Reg)); const auto RC = MRI.getRegClass(Reg); auto STI = static_cast(MRI.getTargetRegisterInfo()); return STI->isSGPRClass(RC) ? @@ -183,7 +183,8 @@ bool GCNRegPressure::less(const GCNSubtarget &ST, #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) LLVM_DUMP_METHOD void GCNRegPressure::print(raw_ostream &OS, const GCNSubtarget *ST) const { - OS << "VGPRs: " << getVGPRNum(); + OS << "VGPRs: " << Value[VGPR32] << ' '; + OS << "AGPRs: " << Value[AGPR32]; if (ST) OS << "(O" << ST->getOccupancyWithNumVGPRs(getVGPRNum()) << ')'; OS << ", SGPRs: " << getSGPRNum(); if (ST) OS << "(O" << ST->getOccupancyWithNumSGPRs(getSGPRNum()) << ')'; @@ -196,8 +197,7 @@ void GCNRegPressure::print(raw_ostream &OS, const GCNSubtarget *ST) const { static LaneBitmask getDefRegMask(const MachineOperand &MO, const MachineRegisterInfo &MRI) { - assert(MO.isDef() && MO.isReg() && - TargetRegisterInfo::isVirtualRegister(MO.getReg())); + assert(MO.isDef() && MO.isReg() && Register::isVirtualRegister(MO.getReg())); // We don't rely on read-undef flag because in case of tentative schedule // tracking it isn't set correctly yet. 
This works correctly however since @@ -210,8 +210,7 @@ static LaneBitmask getDefRegMask(const MachineOperand &MO, static LaneBitmask getUsedRegMask(const MachineOperand &MO, const MachineRegisterInfo &MRI, const LiveIntervals &LIS) { - assert(MO.isUse() && MO.isReg() && - TargetRegisterInfo::isVirtualRegister(MO.getReg())); + assert(MO.isUse() && MO.isReg() && Register::isVirtualRegister(MO.getReg())); if (auto SubReg = MO.getSubReg()) return MRI.getTargetRegisterInfo()->getSubRegIndexLaneMask(SubReg); @@ -232,7 +231,7 @@ collectVirtualRegUses(const MachineInstr &MI, const LiveIntervals &LIS, const MachineRegisterInfo &MRI) { SmallVector Res; for (const auto &MO : MI.operands()) { - if (!MO.isReg() || !TargetRegisterInfo::isVirtualRegister(MO.getReg())) + if (!MO.isReg() || !Register::isVirtualRegister(MO.getReg())) continue; if (!MO.isUse() || !MO.readsReg()) continue; @@ -278,7 +277,7 @@ GCNRPTracker::LiveRegSet llvm::getLiveRegs(SlotIndex SI, const MachineRegisterInfo &MRI) { GCNRPTracker::LiveRegSet LiveRegs; for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) { - auto Reg = TargetRegisterInfo::index2VirtReg(I); + auto Reg = Register::index2VirtReg(I); if (!LIS.hasInterval(Reg)) continue; auto LiveMask = getLiveLaneMask(Reg, SI, LIS, MRI); @@ -329,8 +328,7 @@ void GCNUpwardRPTracker::recede(const MachineInstr &MI) { MaxPressure = max(AtMIPressure, MaxPressure); for (const auto &MO : MI.defs()) { - if (!MO.isReg() || !TargetRegisterInfo::isVirtualRegister(MO.getReg()) || - MO.isDead()) + if (!MO.isReg() || !Register::isVirtualRegister(MO.getReg()) || MO.isDead()) continue; auto Reg = MO.getReg(); @@ -408,8 +406,8 @@ void GCNDownwardRPTracker::advanceToNext() { for (const auto &MO : LastTrackedMI->defs()) { if (!MO.isReg()) continue; - unsigned Reg = MO.getReg(); - if (!TargetRegisterInfo::isVirtualRegister(Reg)) + Register Reg = MO.getReg(); + if (!Register::isVirtualRegister(Reg)) continue; auto &LiveMask = LiveRegs[Reg]; auto PrevMask = LiveMask; @@ -500,7 +498,7 @@ void GCNRPTracker::printLiveRegs(raw_ostream &OS, const LiveRegSet& LiveRegs, const MachineRegisterInfo &MRI) { const TargetRegisterInfo *TRI = MRI.getTargetRegisterInfo(); for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) { - unsigned Reg = TargetRegisterInfo::index2VirtReg(I); + unsigned Reg = Register::index2VirtReg(I); auto It = LiveRegs.find(Reg); if (It != LiveRegs.end() && It->second.any()) OS << ' ' << printVRegOrUnit(Reg, TRI) << ':' diff --git a/lib/Target/AMDGPU/GCNRegPressure.h b/lib/Target/AMDGPU/GCNRegPressure.h index e4894418b943..5862cdb04166 100644 --- a/lib/Target/AMDGPU/GCNRegPressure.h +++ b/lib/Target/AMDGPU/GCNRegPressure.h @@ -214,7 +214,7 @@ getLiveRegMap(Range &&R, bool After, LiveIntervals &LIS) { DenseMap LiveRegMap; SmallVector LiveIdxs, SRLiveIdxs; for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) { - auto Reg = TargetRegisterInfo::index2VirtReg(I); + auto Reg = Register::index2VirtReg(I); if (!LIS.hasInterval(Reg)) continue; auto &LI = LIS.getInterval(Reg); diff --git a/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/lib/Target/AMDGPU/GCNSchedStrategy.cpp index 4ea990ae490e..973491a70d3c 100644 --- a/lib/Target/AMDGPU/GCNSchedStrategy.cpp +++ b/lib/Target/AMDGPU/GCNSchedStrategy.cpp @@ -71,8 +71,8 @@ void GCNMaxOccupancySchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU // the tracker, so we need to pass those function a non-const copy. 
RegPressureTracker &TempTracker = const_cast(RPTracker); - std::vector Pressure; - std::vector MaxPressure; + Pressure.clear(); + MaxPressure.clear(); if (AtTop) TempTracker.getDownwardPressure(SU->getInstr(), Pressure, MaxPressure); @@ -103,10 +103,10 @@ void GCNMaxOccupancySchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU // the analysis to look through dependencies to find the path with the least // register pressure. - // We only need to update the RPDelata for instructions that increase - // register pressure. Instructions that decrease or keep reg pressure - // the same will be marked as RegExcess in tryCandidate() when they - // are compared with instructions that increase the register pressure. + // We only need to update the RPDelta for instructions that increase register + // pressure. Instructions that decrease or keep reg pressure the same will be + // marked as RegExcess in tryCandidate() when they are compared with + // instructions that increase the register pressure. if (ShouldTrackVGPRs && NewVGPRPressure >= VGPRExcessLimit) { Cand.RPDelta.Excess = PressureChange(SRI->getVGPRPressureSet()); Cand.RPDelta.Excess.setUnitInc(NewVGPRPressure - VGPRExcessLimit); @@ -160,6 +160,7 @@ void GCNMaxOccupancySchedStrategy::pickNodeFromQueue(SchedBoundary &Zone, if (TryCand.ResDelta == SchedResourceDelta()) TryCand.initResourceDelta(Zone.DAG, SchedModel); Cand.setBest(TryCand); + LLVM_DEBUG(traceCandidate(Cand)); } } } @@ -195,6 +196,15 @@ SUnit *GCNMaxOccupancySchedStrategy::pickNodeBidirectional(bool &IsTopNode) { assert(BotCand.Reason != NoCand && "failed to find the first candidate"); } else { LLVM_DEBUG(traceCandidate(BotCand)); +#ifndef NDEBUG + if (VerifyScheduling) { + SchedCandidate TCand; + TCand.reset(CandPolicy()); + pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), TCand); + assert(TCand.SU == BotCand.SU && + "Last pick result should correspond to re-picking right now"); + } +#endif } // Check if the top Q has a better candidate. @@ -206,6 +216,15 @@ SUnit *GCNMaxOccupancySchedStrategy::pickNodeBidirectional(bool &IsTopNode) { assert(TopCand.Reason != NoCand && "failed to find the first candidate"); } else { LLVM_DEBUG(traceCandidate(TopCand)); +#ifndef NDEBUG + if (VerifyScheduling) { + SchedCandidate TCand; + TCand.reset(CandPolicy()); + pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TCand); + assert(TCand.SU == TopCand.SU && + "Last pick result should correspond to re-picking right now"); + } +#endif } // Pick best from BotCand and TopCand. 
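The GCNSchedStrategy.cpp hunk above stops constructing fresh std::vector locals in initCandidate() and instead clears and reuses class members (declared in the GCNSchedStrategy.h hunk that follows), so the pressure scratch buffers are allocated once per strategy object rather than once per candidate. A minimal C++ sketch of that reuse pattern; the class name here is hypothetical and the element type is assumed to be unsigned:

    #include <vector>

    // Sketch only: keep scratch buffers as members and clear() them per
    // call. clear() preserves capacity, so repeated calls normally avoid
    // reallocation.
    class CandidateScratch {
      std::vector<unsigned> Pressure;    // assumed element type
      std::vector<unsigned> MaxPressure;

    public:
      void initCandidate() {
        Pressure.clear();     // capacity retained from earlier calls
        MaxPressure.clear();
        // ... the pressure tracker then fills both vectors in place ...
      }
    };
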
diff --git a/lib/Target/AMDGPU/GCNSchedStrategy.h b/lib/Target/AMDGPU/GCNSchedStrategy.h index eaf3dee9ba5d..dd687a930c79 100644 --- a/lib/Target/AMDGPU/GCNSchedStrategy.h +++ b/lib/Target/AMDGPU/GCNSchedStrategy.h @@ -40,6 +40,9 @@ class GCNMaxOccupancySchedStrategy final : public GenericScheduler { const SIRegisterInfo *SRI, unsigned SGPRPressure, unsigned VGPRPressure); + std::vector Pressure; + std::vector MaxPressure; + unsigned SGPRExcessLimit; unsigned VGPRExcessLimit; unsigned SGPRCriticalLimit; diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp index 57c0ba26cc3a..1f94ab799122 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp +++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp @@ -109,7 +109,7 @@ static uint64_t adjustFixupValue(const MCFixup &Fixup, uint64_t Value, MCContext *Ctx) { int64_t SignedValue = static_cast(Value); - switch (static_cast(Fixup.getKind())) { + switch (Fixup.getTargetKind()) { case AMDGPU::fixup_si_sopp_br: { int64_t BrImm = (SignedValue - 4) / 4; diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp index 6549a8d7d592..d352219a7a98 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp +++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp @@ -87,7 +87,7 @@ std::unique_ptr llvm::createAMDGPUELFObjectWriter(bool Is64Bit, uint8_t OSABI, bool HasRelocationAddend, uint8_t ABIVersion) { - return llvm::make_unique(Is64Bit, OSABI, + return std::make_unique(Is64Bit, OSABI, HasRelocationAddend, ABIVersion); } diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp index 01b53432cbb7..a9888e6ed924 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp +++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp @@ -196,6 +196,10 @@ void AMDGPUInstPrinter::printSLC(const MCInst *MI, unsigned OpNo, printNamedBit(MI, OpNo, O, "slc"); } +void AMDGPUInstPrinter::printSWZ(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O) { +} + void AMDGPUInstPrinter::printTFE(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O) { printNamedBit(MI, OpNo, O, "tfe"); @@ -292,35 +296,7 @@ void AMDGPUInstPrinter::printRegOperand(unsigned RegNo, raw_ostream &O, } #endif - unsigned AltName = AMDGPU::Reg32; - - if (MRI.getRegClass(AMDGPU::VReg_64RegClassID).contains(RegNo) || - MRI.getRegClass(AMDGPU::SGPR_64RegClassID).contains(RegNo) || - MRI.getRegClass(AMDGPU::AReg_64RegClassID).contains(RegNo)) - AltName = AMDGPU::Reg64; - else if (MRI.getRegClass(AMDGPU::VReg_128RegClassID).contains(RegNo) || - MRI.getRegClass(AMDGPU::SGPR_128RegClassID).contains(RegNo) || - MRI.getRegClass(AMDGPU::AReg_128RegClassID).contains(RegNo)) - AltName = AMDGPU::Reg128; - else if (MRI.getRegClass(AMDGPU::VReg_96RegClassID).contains(RegNo) || - MRI.getRegClass(AMDGPU::SReg_96RegClassID).contains(RegNo)) - AltName = AMDGPU::Reg96; - else if (MRI.getRegClass(AMDGPU::VReg_160RegClassID).contains(RegNo) || - MRI.getRegClass(AMDGPU::SReg_160RegClassID).contains(RegNo)) - AltName = AMDGPU::Reg160; - else if (MRI.getRegClass(AMDGPU::VReg_256RegClassID).contains(RegNo) || - MRI.getRegClass(AMDGPU::SGPR_256RegClassID).contains(RegNo)) - AltName = AMDGPU::Reg256; - else if (MRI.getRegClass(AMDGPU::VReg_512RegClassID).contains(RegNo) || - MRI.getRegClass(AMDGPU::SGPR_512RegClassID).contains(RegNo) || - 
MRI.getRegClass(AMDGPU::AReg_512RegClassID).contains(RegNo)) - AltName = AMDGPU::Reg512; - else if (MRI.getRegClass(AMDGPU::VReg_1024RegClassID).contains(RegNo) || - MRI.getRegClass(AMDGPU::SReg_1024RegClassID).contains(RegNo) || - MRI.getRegClass(AMDGPU::AReg_1024RegClassID).contains(RegNo)) - AltName = AMDGPU::Reg1024; - - O << getRegisterName(RegNo, AltName); + O << getRegisterName(RegNo); } void AMDGPUInstPrinter::printVOPDst(const MCInst *MI, unsigned OpNo, @@ -623,9 +599,11 @@ void AMDGPUInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, case AMDGPU::V_ADD_CO_CI_U32_e32_gfx10: case AMDGPU::V_SUB_CO_CI_U32_e32_gfx10: case AMDGPU::V_SUBREV_CO_CI_U32_e32_gfx10: + case AMDGPU::V_CNDMASK_B32_dpp_gfx10: case AMDGPU::V_ADD_CO_CI_U32_dpp_gfx10: case AMDGPU::V_SUB_CO_CI_U32_dpp_gfx10: case AMDGPU::V_SUBREV_CO_CI_U32_dpp_gfx10: + case AMDGPU::V_CNDMASK_B32_dpp8_gfx10: case AMDGPU::V_ADD_CO_CI_U32_dpp8_gfx10: case AMDGPU::V_SUB_CO_CI_U32_dpp8_gfx10: case AMDGPU::V_SUBREV_CO_CI_U32_dpp8_gfx10: @@ -689,6 +667,7 @@ void AMDGPUInstPrinter::printOperandAndIntInputMods(const MCInst *MI, switch (MI->getOpcode()) { default: break; + case AMDGPU::V_CNDMASK_B32_sdwa_gfx10: case AMDGPU::V_ADD_CO_CI_U32_sdwa_gfx10: case AMDGPU::V_SUB_CO_CI_U32_sdwa_gfx10: case AMDGPU::V_SUBREV_CO_CI_U32_sdwa_gfx10: diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h index b544d1ef3605..66b70831ff9e 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h +++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h @@ -12,7 +12,6 @@ #ifndef LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUINSTPRINTER_H #define LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUINSTPRINTER_H -#include "AMDGPUMCTargetDesc.h" #include "llvm/MC/MCInstPrinter.h" namespace llvm { @@ -26,8 +25,7 @@ public: //Autogenerated by tblgen void printInstruction(const MCInst *MI, const MCSubtargetInfo &STI, raw_ostream &O); - static const char *getRegisterName(unsigned RegNo, - unsigned AltIdx = AMDGPU::NoRegAltName); + static const char *getRegisterName(unsigned RegNo); void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot, const MCSubtargetInfo &STI) override; @@ -74,6 +72,8 @@ private: raw_ostream &O); void printSLC(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); + void printSWZ(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, + raw_ostream &O); void printTFE(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); void printDMask(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp index 8f11433476f4..c15da8075a34 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp +++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp @@ -250,7 +250,7 @@ bool AMDGPUTargetAsmStreamer::EmitHSAMetadata( bool AMDGPUTargetAsmStreamer::EmitCodeEnd() { const uint32_t Encoded_s_code_end = 0xbf9f0000; OS << "\t.p2alignl 6, " << Encoded_s_code_end << '\n'; - OS << "\t.fill 32, 4, " << Encoded_s_code_end << '\n'; + OS << "\t.fill 48, 4, " << Encoded_s_code_end << '\n'; return true; } @@ -602,7 +602,7 @@ bool AMDGPUTargetELFStreamer::EmitCodeEnd() { MCStreamer &OS = getStreamer(); OS.PushSection(); OS.EmitValueToAlignment(64, Encoded_s_code_end, 4); - for (unsigned I = 0; I < 32; ++I) + for (unsigned I = 0; I < 48; ++I) OS.EmitIntValue(Encoded_s_code_end, 4); OS.PopSection(); return true; 
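The AMDGPUTargetStreamer.cpp hunk above widens the trailing s_code_end padding from 32 to 48 dwords in both streamer paths. Assuming Encoded_s_code_end keeps the value 0xbf9f0000 shown in the hunk, the assembly streamer now emits the equivalent of:

    .p2alignl 6, 0xbf9f0000
    .fill 48, 4, 0xbf9f0000

that is, align to a 64-byte boundary padded with the s_code_end encoding, then append 48 more copies of the same 32-bit value; the ELF streamer mirrors this with EmitValueToAlignment(64, Encoded_s_code_end, 4) followed by 48 EmitIntValue(Encoded_s_code_end, 4) calls.
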
diff --git a/lib/Target/AMDGPU/MIMGInstructions.td b/lib/Target/AMDGPU/MIMGInstructions.td index 4735e6cb2446..f33ad950d5d9 100644 --- a/lib/Target/AMDGPU/MIMGInstructions.td +++ b/lib/Target/AMDGPU/MIMGInstructions.td @@ -26,7 +26,7 @@ def MIMGEncoding : GenericEnum { // Represent an ISA-level opcode, independent of the encoding and the // vdata/vaddr size. -class MIMGBaseOpcode { +class MIMGBaseOpcode : PredicateControl { MIMGBaseOpcode BaseOpcode = !cast(NAME); bit Store = 0; bit Atomic = 0; @@ -291,7 +291,7 @@ multiclass MIMG_NoSampler_Src_Helper op, string asm, multiclass MIMG_NoSampler op, string asm, bit has_d16, bit mip = 0, bit isResInfo = 0> { - def "" : MIMGBaseOpcode, PredicateControl { + def "" : MIMGBaseOpcode { let Coordinates = !if(isResInfo, 0, 1); let LodOrClampOrMip = mip; let HasD16 = has_d16; diff --git a/lib/Target/AMDGPU/R600AsmPrinter.cpp b/lib/Target/AMDGPU/R600AsmPrinter.cpp index 3fb18862fca8..b29cd75f75cf 100644 --- a/lib/Target/AMDGPU/R600AsmPrinter.cpp +++ b/lib/Target/AMDGPU/R600AsmPrinter.cpp @@ -104,7 +104,7 @@ bool R600AsmPrinter::runOnMachineFunction(MachineFunction &MF) { // Functions needs to be cacheline (256B) aligned. - MF.ensureAlignment(8); + MF.ensureAlignment(Align(256)); SetupMachineFunction(MF); diff --git a/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp b/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp index 8098b81d1ea2..e4160ac11c86 100644 --- a/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp +++ b/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp @@ -303,7 +303,7 @@ private: if (!MO.isReg()) continue; if (MO.isDef()) { - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); if (R600::R600_Reg128RegClass.contains(Reg)) DstMI = Reg; else @@ -312,7 +312,7 @@ private: &R600::R600_Reg128RegClass); } if (MO.isUse()) { - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); if (R600::R600_Reg128RegClass.contains(Reg)) SrcMI = Reg; else diff --git a/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp b/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp index c6e8a060d8a0..fd75c41040e1 100644 --- a/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp +++ b/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp @@ -135,7 +135,7 @@ bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) { const R600RegisterInfo &TRI = TII->getRegisterInfo(); - unsigned DstReg = MI.getOperand(0).getReg(); + Register DstReg = MI.getOperand(0).getReg(); unsigned DstBase = TRI.getEncodingValue(DstReg) & HW_REG_MASK; for (unsigned Chan = 0; Chan < 4; ++Chan) { @@ -155,12 +155,12 @@ bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) { unsigned Opcode = BMI->getOpcode(); // While not strictly necessary from hw point of view, we force // all src operands of a dot4 inst to belong to the same slot. 
- unsigned Src0 = BMI->getOperand( - TII->getOperandIdx(Opcode, R600::OpName::src0)) - .getReg(); - unsigned Src1 = BMI->getOperand( - TII->getOperandIdx(Opcode, R600::OpName::src1)) - .getReg(); + Register Src0 = + BMI->getOperand(TII->getOperandIdx(Opcode, R600::OpName::src0)) + .getReg(); + Register Src1 = + BMI->getOperand(TII->getOperandIdx(Opcode, R600::OpName::src1)) + .getReg(); (void) Src0; (void) Src1; if ((TRI.getEncodingValue(Src0) & 0xff) < 127 && @@ -205,10 +205,10 @@ bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) { // T0_Z = CUBE T1_X, T1_Z // T0_W = CUBE T1_Y, T1_Z for (unsigned Chan = 0; Chan < 4; Chan++) { - unsigned DstReg = MI.getOperand( - TII->getOperandIdx(MI, R600::OpName::dst)).getReg(); - unsigned Src0 = MI.getOperand( - TII->getOperandIdx(MI, R600::OpName::src0)).getReg(); + Register DstReg = + MI.getOperand(TII->getOperandIdx(MI, R600::OpName::dst)).getReg(); + Register Src0 = + MI.getOperand(TII->getOperandIdx(MI, R600::OpName::src0)).getReg(); unsigned Src1 = 0; // Determine the correct source registers diff --git a/lib/Target/AMDGPU/R600FrameLowering.h b/lib/Target/AMDGPU/R600FrameLowering.h index 950e238f4979..283e4d1935ea 100644 --- a/lib/Target/AMDGPU/R600FrameLowering.h +++ b/lib/Target/AMDGPU/R600FrameLowering.h @@ -15,9 +15,9 @@ namespace llvm { class R600FrameLowering : public AMDGPUFrameLowering { public: - R600FrameLowering(StackDirection D, unsigned StackAl, int LAO, - unsigned TransAl = 1) : - AMDGPUFrameLowering(D, StackAl, LAO, TransAl) {} + R600FrameLowering(StackDirection D, Align StackAl, int LAO, + Align TransAl = Align::None()) + : AMDGPUFrameLowering(D, StackAl, LAO, TransAl) {} ~R600FrameLowering() override; void emitPrologue(MachineFunction &MF, diff --git a/lib/Target/AMDGPU/R600ISelLowering.cpp b/lib/Target/AMDGPU/R600ISelLowering.cpp index f80a53ba1dc6..659458b0b752 100644 --- a/lib/Target/AMDGPU/R600ISelLowering.cpp +++ b/lib/Target/AMDGPU/R600ISelLowering.cpp @@ -41,6 +41,7 @@ #include "llvm/Support/Compiler.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MachineValueType.h" +#include "llvm/Support/MathExtras.h" #include #include #include @@ -334,8 +335,8 @@ R600TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, } case R600::MASK_WRITE: { - unsigned maskedRegister = MI.getOperand(0).getReg(); - assert(TargetRegisterInfo::isVirtualRegister(maskedRegister)); + Register maskedRegister = MI.getOperand(0).getReg(); + assert(Register::isVirtualRegister(maskedRegister)); MachineInstr * defInstr = MRI.getVRegDef(maskedRegister); TII->addFlag(*defInstr, 0, MO_FLAG_MASK); break; @@ -782,7 +783,7 @@ SDValue R600TargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const { return TrigVal; // On R600 hw, COS/SIN input must be between -Pi and Pi. 
return DAG.getNode(ISD::FMUL, DL, VT, TrigVal, - DAG.getConstantFP(3.14159265359, DL, MVT::f32)); + DAG.getConstantFP(numbers::pif, DL, MVT::f32)); } SDValue R600TargetLowering::LowerSHLParts(SDValue Op, SelectionDAG &DAG) const { diff --git a/lib/Target/AMDGPU/R600InstrInfo.cpp b/lib/Target/AMDGPU/R600InstrInfo.cpp index d9e839fe2035..04a5e93f6213 100644 --- a/lib/Target/AMDGPU/R600InstrInfo.cpp +++ b/lib/Target/AMDGPU/R600InstrInfo.cpp @@ -97,8 +97,8 @@ bool R600InstrInfo::isLegalToSplitMBBAt(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) const { for (MachineInstr::const_mop_iterator I = MBBI->operands_begin(), E = MBBI->operands_end(); I != E; ++I) { - if (I->isReg() && !TargetRegisterInfo::isVirtualRegister(I->getReg()) && - I->isUse() && RI.isPhysRegLiveAcrossClauses(I->getReg())) + if (I->isReg() && !Register::isVirtualRegister(I->getReg()) && I->isUse() && + RI.isPhysRegLiveAcrossClauses(I->getReg())) return false; } return true; @@ -242,8 +242,7 @@ bool R600InstrInfo::readsLDSSrcReg(const MachineInstr &MI) const { for (MachineInstr::const_mop_iterator I = MI.operands_begin(), E = MI.operands_end(); I != E; ++I) { - if (!I->isReg() || !I->isUse() || - TargetRegisterInfo::isVirtualRegister(I->getReg())) + if (!I->isReg() || !I->isUse() || Register::isVirtualRegister(I->getReg())) continue; if (R600::R600_LDS_SRC_REGRegClass.contains(I->getReg())) @@ -294,7 +293,7 @@ R600InstrInfo::getSrcs(MachineInstr &MI) const { for (unsigned j = 0; j < 8; j++) { MachineOperand &MO = MI.getOperand(getOperandIdx(MI.getOpcode(), OpTable[j][0])); - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); if (Reg == R600::ALU_CONST) { MachineOperand &Sel = MI.getOperand(getOperandIdx(MI.getOpcode(), OpTable[j][1])); @@ -317,7 +316,7 @@ R600InstrInfo::getSrcs(MachineInstr &MI) const { if (SrcIdx < 0) break; MachineOperand &MO = MI.getOperand(SrcIdx); - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); if (Reg == R600::ALU_CONST) { MachineOperand &Sel = MI.getOperand(getOperandIdx(MI.getOpcode(), OpTable[j][1])); @@ -348,7 +347,7 @@ R600InstrInfo::ExtractSrcs(MachineInstr &MI, unsigned i = 0; for (const auto &Src : getSrcs(MI)) { ++i; - unsigned Reg = Src.first->getReg(); + Register Reg = Src.first->getReg(); int Index = RI.getEncodingValue(Reg) & 0xff; if (Reg == R600::OQAP) { Result.push_back(std::make_pair(Index, 0U)); @@ -865,7 +864,7 @@ bool R600InstrInfo::isPredicated(const MachineInstr &MI) const { if (idx < 0) return false; - unsigned Reg = MI.getOperand(idx).getReg(); + Register Reg = MI.getOperand(idx).getReg(); switch (Reg) { default: return false; case R600::PRED_SEL_ONE: @@ -1038,7 +1037,7 @@ bool R600InstrInfo::expandPostRAPseudo(MachineInstr &MI) const { unsigned RegIndex = MI.getOperand(RegOpIdx).getImm(); unsigned Channel = MI.getOperand(ChanOpIdx).getImm(); unsigned Address = calculateIndirectAddress(RegIndex, Channel); - unsigned OffsetReg = MI.getOperand(OffsetOpIdx).getReg(); + Register OffsetReg = MI.getOperand(OffsetOpIdx).getReg(); if (OffsetReg == R600::INDIRECT_BASE_ADDR) { buildMovInstr(MBB, MI, MI.getOperand(DstOpIdx).getReg(), getIndirectAddrRegClass()->getRegister(Address)); @@ -1052,7 +1051,7 @@ bool R600InstrInfo::expandPostRAPseudo(MachineInstr &MI) const { unsigned RegIndex = MI.getOperand(RegOpIdx).getImm(); unsigned Channel = MI.getOperand(ChanOpIdx).getImm(); unsigned Address = calculateIndirectAddress(RegIndex, Channel); - unsigned OffsetReg = MI.getOperand(OffsetOpIdx).getReg(); + Register OffsetReg = MI.getOperand(OffsetOpIdx).getReg(); if 
(OffsetReg == R600::INDIRECT_BASE_ADDR) { buildMovInstr(MBB, MI, getIndirectAddrRegClass()->getRegister(Address), MI.getOperand(ValOpIdx).getReg()); @@ -1193,8 +1192,7 @@ int R600InstrInfo::getIndirectIndexBegin(const MachineFunction &MF) const { const TargetRegisterClass *IndirectRC = getIndirectAddrRegClass(); for (std::pair LI : MRI.liveins()) { unsigned Reg = LI.first; - if (TargetRegisterInfo::isVirtualRegister(Reg) || - !IndirectRC->contains(Reg)) + if (Register::isVirtualRegister(Reg) || !IndirectRC->contains(Reg)) continue; unsigned RegIndex; diff --git a/lib/Target/AMDGPU/R600MachineScheduler.cpp b/lib/Target/AMDGPU/R600MachineScheduler.cpp index 34267a909b5e..7569a2629539 100644 --- a/lib/Target/AMDGPU/R600MachineScheduler.cpp +++ b/lib/Target/AMDGPU/R600MachineScheduler.cpp @@ -183,7 +183,7 @@ isPhysicalRegCopy(MachineInstr *MI) { if (MI->getOpcode() != R600::COPY) return false; - return !TargetRegisterInfo::isVirtualRegister(MI->getOperand(1).getReg()); + return !Register::isVirtualRegister(MI->getOperand(1).getReg()); } void R600SchedStrategy::releaseTopNode(SUnit *SU) { @@ -209,7 +209,7 @@ void R600SchedStrategy::releaseBottomNode(SUnit *SU) { bool R600SchedStrategy::regBelongsToClass(unsigned Reg, const TargetRegisterClass *RC) const { - if (!TargetRegisterInfo::isVirtualRegister(Reg)) { + if (!Register::isVirtualRegister(Reg)) { return RC->contains(Reg); } else { return MRI->getRegClass(Reg) == RC; @@ -270,7 +270,7 @@ R600SchedStrategy::AluKind R600SchedStrategy::getAluKind(SUnit *SU) const { } // Is the result already member of a X/Y/Z/W class ? - unsigned DestReg = MI->getOperand(0).getReg(); + Register DestReg = MI->getOperand(0).getReg(); if (regBelongsToClass(DestReg, &R600::R600_TReg32_XRegClass) || regBelongsToClass(DestReg, &R600::R600_AddrRegClass)) return AluT_X; @@ -357,7 +357,7 @@ void R600SchedStrategy::AssignSlot(MachineInstr* MI, unsigned Slot) { if (DstIndex == -1) { return; } - unsigned DestReg = MI->getOperand(DstIndex).getReg(); + Register DestReg = MI->getOperand(DstIndex).getReg(); // PressureRegister crashes if an operand is def and used in the same inst // and we try to constraint its regclass for (MachineInstr::mop_iterator It = MI->operands_begin(), diff --git a/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp b/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp index 9f1cb6582b5c..cec7f563f480 100644 --- a/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp +++ b/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp @@ -58,7 +58,7 @@ using namespace llvm; static bool isImplicitlyDef(MachineRegisterInfo &MRI, unsigned Reg) { assert(MRI.isSSA()); - if (TargetRegisterInfo::isPhysicalRegister(Reg)) + if (Register::isPhysicalRegister(Reg)) return false; const MachineInstr *MI = MRI.getUniqueVRegDef(Reg); return MI && MI->isImplicitDef(); @@ -197,17 +197,17 @@ unsigned getReassignedChan( MachineInstr *R600VectorRegMerger::RebuildVector( RegSeqInfo *RSI, const RegSeqInfo *BaseRSI, const std::vector> &RemapChan) const { - unsigned Reg = RSI->Instr->getOperand(0).getReg(); + Register Reg = RSI->Instr->getOperand(0).getReg(); MachineBasicBlock::iterator Pos = RSI->Instr; MachineBasicBlock &MBB = *Pos->getParent(); DebugLoc DL = Pos->getDebugLoc(); - unsigned SrcVec = BaseRSI->Instr->getOperand(0).getReg(); + Register SrcVec = BaseRSI->Instr->getOperand(0).getReg(); DenseMap UpdatedRegToChan = BaseRSI->RegToChan; std::vector UpdatedUndef = BaseRSI->UndefReg; for (DenseMap::iterator It = RSI->RegToChan.begin(), E = RSI->RegToChan.end(); It != E; ++It) { - unsigned 
DstReg = MRI->createVirtualRegister(&R600::R600_Reg128RegClass); + Register DstReg = MRI->createVirtualRegister(&R600::R600_Reg128RegClass); unsigned SubReg = (*It).first; unsigned Swizzle = (*It).second; unsigned Chan = getReassignedChan(RemapChan, Swizzle); @@ -350,7 +350,7 @@ bool R600VectorRegMerger::runOnMachineFunction(MachineFunction &Fn) { MachineInstr &MI = *MII; if (MI.getOpcode() != R600::REG_SEQUENCE) { if (TII->get(MI.getOpcode()).TSFlags & R600_InstFlag::TEX_INST) { - unsigned Reg = MI.getOperand(1).getReg(); + Register Reg = MI.getOperand(1).getReg(); for (MachineRegisterInfo::def_instr_iterator It = MRI->def_instr_begin(Reg), E = MRI->def_instr_end(); It != E; ++It) { @@ -363,7 +363,7 @@ bool R600VectorRegMerger::runOnMachineFunction(MachineFunction &Fn) { RegSeqInfo RSI(*MRI, &MI); // All uses of MI are swizzeable ? - unsigned Reg = MI.getOperand(0).getReg(); + Register Reg = MI.getOperand(0).getReg(); if (!areAllUsesSwizzeable(Reg)) continue; diff --git a/lib/Target/AMDGPU/R600Packetizer.cpp b/lib/Target/AMDGPU/R600Packetizer.cpp index df200baf11c1..176269f9b68c 100644 --- a/lib/Target/AMDGPU/R600Packetizer.cpp +++ b/lib/Target/AMDGPU/R600Packetizer.cpp @@ -90,7 +90,7 @@ private: if (DstIdx == -1) { continue; } - unsigned Dst = BI->getOperand(DstIdx).getReg(); + Register Dst = BI->getOperand(DstIdx).getReg(); if (isTrans || TII->isTransOnly(*BI)) { Result[Dst] = R600::PS; continue; @@ -136,7 +136,7 @@ private: int OperandIdx = TII->getOperandIdx(MI.getOpcode(), Ops[i]); if (OperandIdx < 0) continue; - unsigned Src = MI.getOperand(OperandIdx).getReg(); + Register Src = MI.getOperand(OperandIdx).getReg(); const DenseMap::const_iterator It = PVs.find(Src); if (It != PVs.end()) MI.getOperand(OperandIdx).setReg(It->second); diff --git a/lib/Target/AMDGPU/R600RegisterInfo.cpp b/lib/Target/AMDGPU/R600RegisterInfo.cpp index 685df74490fe..ef12c1d24594 100644 --- a/lib/Target/AMDGPU/R600RegisterInfo.cpp +++ b/lib/Target/AMDGPU/R600RegisterInfo.cpp @@ -93,7 +93,7 @@ const RegClassWeight &R600RegisterInfo::getRegClassWeight( } bool R600RegisterInfo::isPhysRegLiveAcrossClauses(unsigned Reg) const { - assert(!TargetRegisterInfo::isVirtualRegister(Reg)); + assert(!Register::isVirtualRegister(Reg)); switch (Reg) { case R600::OQAP: diff --git a/lib/Target/AMDGPU/SIAddIMGInit.cpp b/lib/Target/AMDGPU/SIAddIMGInit.cpp index f8094e35816c..ee011286b8ff 100644 --- a/lib/Target/AMDGPU/SIAddIMGInit.cpp +++ b/lib/Target/AMDGPU/SIAddIMGInit.cpp @@ -129,7 +129,7 @@ bool SIAddIMGInit::runOnMachineFunction(MachineFunction &MF) { continue; // Create a register for the intialization value. - unsigned PrevDst = + Register PrevDst = MRI.createVirtualRegister(TII->getOpRegClass(MI, DstIdx)); unsigned NewDst = 0; // Final initialized value will be in here @@ -150,7 +150,7 @@ bool SIAddIMGInit::runOnMachineFunction(MachineFunction &MF) { NewDst = MRI.createVirtualRegister(TII->getOpRegClass(MI, DstIdx)); // Initialize dword - unsigned SubReg = + Register SubReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), SubReg) .addImm(0); diff --git a/lib/Target/AMDGPU/SIDefines.h b/lib/Target/AMDGPU/SIDefines.h index a0e1ec6ac235..23ef56afc39c 100644 --- a/lib/Target/AMDGPU/SIDefines.h +++ b/lib/Target/AMDGPU/SIDefines.h @@ -99,7 +99,10 @@ enum : uint64_t { FPAtomic = UINT64_C(1) << 53, // Is a MFMA instruction. - IsMAI = UINT64_C(1) << 54 + IsMAI = UINT64_C(1) << 54, + + // Is a DOT instruction. + IsDOT = UINT64_C(1) << 55 }; // v_cmp_class_* etc. 
use a 10-bit mask for what operation is checked. @@ -444,6 +447,7 @@ namespace DPP { enum DppCtrl : unsigned { QUAD_PERM_FIRST = 0, + QUAD_PERM_ID = 0xE4, // identity permutation QUAD_PERM_LAST = 0xFF, DPP_UNUSED1 = 0x100, ROW_SHL0 = 0x100, diff --git a/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/lib/Target/AMDGPU/SIFixSGPRCopies.cpp index 624953963cf4..65286751c12d 100644 --- a/lib/Target/AMDGPU/SIFixSGPRCopies.cpp +++ b/lib/Target/AMDGPU/SIFixSGPRCopies.cpp @@ -113,10 +113,16 @@ class SIFixSGPRCopies : public MachineFunctionPass { public: static char ID; + MachineRegisterInfo *MRI; + const SIRegisterInfo *TRI; + const SIInstrInfo *TII; + SIFixSGPRCopies() : MachineFunctionPass(ID) {} bool runOnMachineFunction(MachineFunction &MF) override; + void processPHINode(MachineInstr &MI); + StringRef getPassName() const override { return "SI Fix SGPR copies"; } void getAnalysisUsage(AnalysisUsage &AU) const override { @@ -148,7 +154,7 @@ static bool hasVectorOperands(const MachineInstr &MI, const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { if (!MI.getOperand(i).isReg() || - !TargetRegisterInfo::isVirtualRegister(MI.getOperand(i).getReg())) + !Register::isVirtualRegister(MI.getOperand(i).getReg())) continue; if (TRI->hasVectorRegisters(MRI.getRegClass(MI.getOperand(i).getReg()))) @@ -161,21 +167,19 @@ static std::pair getCopyRegClasses(const MachineInstr &Copy, const SIRegisterInfo &TRI, const MachineRegisterInfo &MRI) { - unsigned DstReg = Copy.getOperand(0).getReg(); - unsigned SrcReg = Copy.getOperand(1).getReg(); + Register DstReg = Copy.getOperand(0).getReg(); + Register SrcReg = Copy.getOperand(1).getReg(); - const TargetRegisterClass *SrcRC = - TargetRegisterInfo::isVirtualRegister(SrcReg) ? - MRI.getRegClass(SrcReg) : - TRI.getPhysRegClass(SrcReg); + const TargetRegisterClass *SrcRC = Register::isVirtualRegister(SrcReg) + ? MRI.getRegClass(SrcReg) + : TRI.getPhysRegClass(SrcReg); // We don't really care about the subregister here. // SrcRC = TRI.getSubRegClass(SrcRC, Copy.getOperand(1).getSubReg()); - const TargetRegisterClass *DstRC = - TargetRegisterInfo::isVirtualRegister(DstReg) ? - MRI.getRegClass(DstReg) : - TRI.getPhysRegClass(DstReg); + const TargetRegisterClass *DstRC = Register::isVirtualRegister(DstReg) + ? MRI.getRegClass(DstReg) + : TRI.getPhysRegClass(DstReg); return std::make_pair(SrcRC, DstRC); } @@ -199,10 +203,10 @@ static bool tryChangeVGPRtoSGPRinCopy(MachineInstr &MI, const SIInstrInfo *TII) { MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); auto &Src = MI.getOperand(1); - unsigned DstReg = MI.getOperand(0).getReg(); - unsigned SrcReg = Src.getReg(); - if (!TargetRegisterInfo::isVirtualRegister(SrcReg) || - !TargetRegisterInfo::isVirtualRegister(DstReg)) + Register DstReg = MI.getOperand(0).getReg(); + Register SrcReg = Src.getReg(); + if (!Register::isVirtualRegister(SrcReg) || + !Register::isVirtualRegister(DstReg)) return false; for (const auto &MO : MRI.reg_nodbg_operands(DstReg)) { @@ -238,7 +242,7 @@ static bool foldVGPRCopyIntoRegSequence(MachineInstr &MI, MachineRegisterInfo &MRI) { assert(MI.isRegSequence()); - unsigned DstReg = MI.getOperand(0).getReg(); + Register DstReg = MI.getOperand(0).getReg(); if (!TRI->isSGPRClass(MRI.getRegClass(DstReg))) return false; @@ -250,7 +254,7 @@ static bool foldVGPRCopyIntoRegSequence(MachineInstr &MI, return false; // It is illegal to have vreg inputs to a physreg defining reg_sequence. 
- if (TargetRegisterInfo::isPhysicalRegister(CopyUse.getOperand(0).getReg())) + if (Register::isPhysicalRegister(CopyUse.getOperand(0).getReg())) return false; const TargetRegisterClass *SrcRC, *DstRC; @@ -281,7 +285,7 @@ static bool foldVGPRCopyIntoRegSequence(MachineInstr &MI, bool IsAGPR = TRI->hasAGPRs(DstRC); for (unsigned I = 1, N = MI.getNumOperands(); I != N; I += 2) { - unsigned SrcReg = MI.getOperand(I).getReg(); + Register SrcReg = MI.getOperand(I).getReg(); unsigned SrcSubReg = MI.getOperand(I).getSubReg(); const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg); @@ -291,7 +295,7 @@ static bool foldVGPRCopyIntoRegSequence(MachineInstr &MI, SrcRC = TRI->getSubRegClass(SrcRC, SrcSubReg); const TargetRegisterClass *NewSrcRC = TRI->getEquivalentVGPRClass(SrcRC); - unsigned TmpReg = MRI.createVirtualRegister(NewSrcRC); + Register TmpReg = MRI.createVirtualRegister(NewSrcRC); BuildMI(*MI.getParent(), &MI, MI.getDebugLoc(), TII->get(AMDGPU::COPY), TmpReg) @@ -299,7 +303,7 @@ static bool foldVGPRCopyIntoRegSequence(MachineInstr &MI, if (IsAGPR) { const TargetRegisterClass *NewSrcRC = TRI->getEquivalentAGPRClass(SrcRC); - unsigned TmpAReg = MRI.createVirtualRegister(NewSrcRC); + Register TmpAReg = MRI.createVirtualRegister(NewSrcRC); unsigned Opc = NewSrcRC == &AMDGPU::AGPR_32RegClass ? AMDGPU::V_ACCVGPR_WRITE_B32 : AMDGPU::COPY; BuildMI(*MI.getParent(), &MI, MI.getDebugLoc(), TII->get(Opc), @@ -315,52 +319,6 @@ static bool foldVGPRCopyIntoRegSequence(MachineInstr &MI, return true; } -static bool phiHasVGPROperands(const MachineInstr &PHI, - const MachineRegisterInfo &MRI, - const SIRegisterInfo *TRI, - const SIInstrInfo *TII) { - for (unsigned i = 1; i < PHI.getNumOperands(); i += 2) { - unsigned Reg = PHI.getOperand(i).getReg(); - if (TRI->hasVGPRs(MRI.getRegClass(Reg))) - return true; - } - return false; -} - -static bool phiHasBreakDef(const MachineInstr &PHI, - const MachineRegisterInfo &MRI, - SmallSet &Visited) { - for (unsigned i = 1; i < PHI.getNumOperands(); i += 2) { - unsigned Reg = PHI.getOperand(i).getReg(); - if (Visited.count(Reg)) - continue; - - Visited.insert(Reg); - - MachineInstr *DefInstr = MRI.getVRegDef(Reg); - switch (DefInstr->getOpcode()) { - default: - break; - case AMDGPU::SI_IF_BREAK: - return true; - case AMDGPU::PHI: - if (phiHasBreakDef(*DefInstr, MRI, Visited)) - return true; - } - } - return false; -} - -static bool hasTerminatorThatModifiesExec(const MachineBasicBlock &MBB, - const TargetRegisterInfo &TRI) { - for (MachineBasicBlock::const_iterator I = MBB.getFirstTerminator(), - E = MBB.end(); I != E; ++I) { - if (I->modifiesRegister(AMDGPU::EXEC, &TRI)) - return true; - } - return false; -} - static bool isSafeToFoldImmIntoCopy(const MachineInstr *Copy, const MachineInstr *MoveImm, const SIInstrInfo *TII, @@ -422,12 +380,6 @@ bool searchPredecessors(const MachineBasicBlock *MBB, return false; } -static bool predsHasDivergentTerminator(MachineBasicBlock *MBB, - const TargetRegisterInfo *TRI) { - return searchPredecessors(MBB, nullptr, [TRI](MachineBasicBlock *MBB) { - return hasTerminatorThatModifiesExec(*MBB, *TRI); }); -} - // Checks if there is potential path From instruction To instruction. // If CutOff is specified and it sits in between of that path we ignore // a higher portion of the path and report it is not reachable. @@ -468,6 +420,7 @@ getFirstNonPrologue(MachineBasicBlock *MBB, const TargetInstrInfo *TII) { // executioon. 
static bool hoistAndMergeSGPRInits(unsigned Reg, const MachineRegisterInfo &MRI, + const TargetRegisterInfo *TRI, MachineDominatorTree &MDT, const TargetInstrInfo *TII) { // List of inits by immediate value. @@ -482,7 +435,7 @@ static bool hoistAndMergeSGPRInits(unsigned Reg, for (auto &MI : MRI.def_instructions(Reg)) { MachineOperand *Imm = nullptr; - for (auto &MO: MI.operands()) { + for (auto &MO : MI.operands()) { if ((MO.isReg() && ((MO.isDef() && MO.getReg() != Reg) || !MO.isDef())) || (!MO.isImm() && !MO.isReg()) || (MO.isImm() && Imm)) { Imm = nullptr; @@ -587,8 +540,44 @@ static bool hoistAndMergeSGPRInits(unsigned Reg, } } - for (auto MI : MergedInstrs) - MI->removeFromParent(); + // Remove initializations that were merged into another. + for (auto &Init : Inits) { + auto &Defs = Init.second; + auto I = Defs.begin(); + while (I != Defs.end()) { + if (MergedInstrs.count(*I)) { + (*I)->eraseFromParent(); + I = Defs.erase(I); + } else + ++I; + } + } + + // Try to schedule SGPR initializations as early as possible in the MBB. + for (auto &Init : Inits) { + auto &Defs = Init.second; + for (auto MI : Defs) { + auto MBB = MI->getParent(); + MachineInstr &BoundaryMI = *getFirstNonPrologue(MBB, TII); + MachineBasicBlock::reverse_iterator B(BoundaryMI); + // Check if B should actually be a boundary. If not set the previous + // instruction as the boundary instead. + if (!TII->isBasicBlockPrologue(*B)) + B++; + + auto R = std::next(MI->getReverseIterator()); + const unsigned Threshold = 50; + // Search until B or Threshold for a place to insert the initialization. + for (unsigned I = 0; R != B && I < Threshold; ++R, ++I) + if (R->readsRegister(Reg, TRI) || R->definesRegister(Reg, TRI) || + TII->isSchedulingBoundary(*R, MBB, *MBB->getParent())) + break; + + // Move to directly after R. + if (&*--R != MI) + MBB->splice(*R, MBB, MI); + } + } if (Changed) MRI.clearKillFlags(Reg); @@ -598,9 +587,9 @@ static bool hoistAndMergeSGPRInits(unsigned Reg, bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) { const GCNSubtarget &ST = MF.getSubtarget(); - MachineRegisterInfo &MRI = MF.getRegInfo(); - const SIRegisterInfo *TRI = ST.getRegisterInfo(); - const SIInstrInfo *TII = ST.getInstrInfo(); + MRI = &MF.getRegInfo(); + TRI = ST.getRegisterInfo(); + TII = ST.getInstrInfo(); MDT = &getAnalysis(); SmallVector Worklist; @@ -617,22 +606,39 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) { continue; case AMDGPU::COPY: case AMDGPU::WQM: + case AMDGPU::SOFT_WQM: case AMDGPU::WWM: { - // If the destination register is a physical register there isn't really - // much we can do to fix this. - if (!TargetRegisterInfo::isVirtualRegister(MI.getOperand(0).getReg())) - continue; + Register DstReg = MI.getOperand(0).getReg(); const TargetRegisterClass *SrcRC, *DstRC; - std::tie(SrcRC, DstRC) = getCopyRegClasses(MI, *TRI, MRI); + std::tie(SrcRC, DstRC) = getCopyRegClasses(MI, *TRI, *MRI); + + if (!Register::isVirtualRegister(DstReg)) { + // If the destination register is a physical register there isn't + // really much we can do to fix this. + // Some special instructions use M0 as an input. Some even only use + // the first lane. Insert a readfirstlane and hope for the best. 
+ if (DstReg == AMDGPU::M0 && TRI->hasVectorRegisters(SrcRC)) { + Register TmpReg + = MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); + + BuildMI(MBB, MI, MI.getDebugLoc(), + TII->get(AMDGPU::V_READFIRSTLANE_B32), TmpReg) + .add(MI.getOperand(1)); + MI.getOperand(1).setReg(TmpReg); + } + + continue; + } + if (isVGPRToSGPRCopy(SrcRC, DstRC, *TRI)) { - unsigned SrcReg = MI.getOperand(1).getReg(); - if (!TargetRegisterInfo::isVirtualRegister(SrcReg)) { + Register SrcReg = MI.getOperand(1).getReg(); + if (!Register::isVirtualRegister(SrcReg)) { TII->moveToVALU(MI, MDT); break; } - MachineInstr *DefMI = MRI.getVRegDef(SrcReg); + MachineInstr *DefMI = MRI->getVRegDef(SrcReg); unsigned SMovOp; int64_t Imm; // If we are just copying an immediate, we can replace the copy with @@ -651,70 +657,13 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) { break; } case AMDGPU::PHI: { - unsigned Reg = MI.getOperand(0).getReg(); - if (!TRI->isSGPRClass(MRI.getRegClass(Reg))) - break; - - // We don't need to fix the PHI if the common dominator of the - // two incoming blocks terminates with a uniform branch. - bool HasVGPROperand = phiHasVGPROperands(MI, MRI, TRI, TII); - if (MI.getNumExplicitOperands() == 5 && !HasVGPROperand) { - MachineBasicBlock *MBB0 = MI.getOperand(2).getMBB(); - MachineBasicBlock *MBB1 = MI.getOperand(4).getMBB(); - - if (!predsHasDivergentTerminator(MBB0, TRI) && - !predsHasDivergentTerminator(MBB1, TRI)) { - LLVM_DEBUG(dbgs() - << "Not fixing PHI for uniform branch: " << MI << '\n'); - break; - } - } - - // If a PHI node defines an SGPR and any of its operands are VGPRs, - // then we need to move it to the VALU. - // - // Also, if a PHI node defines an SGPR and has all SGPR operands - // we must move it to the VALU, because the SGPR operands will - // all end up being assigned the same register, which means - // there is a potential for a conflict if different threads take - // different control flow paths. - // - // For Example: - // - // sgpr0 = def; - // ... - // sgpr1 = def; - // ... - // sgpr2 = PHI sgpr0, sgpr1 - // use sgpr2; - // - // Will Become: - // - // sgpr2 = def; - // ... - // sgpr2 = def; - // ... - // use sgpr2 - // - // The one exception to this rule is when one of the operands - // is defined by a SI_BREAK, SI_IF_BREAK, or SI_ELSE_BREAK - // instruction. In this case, there we know the program will - // never enter the second block (the loop) without entering - // the first block (where the condition is computed), so there - // is no chance for values to be over-written. 
- - SmallSet Visited; - if (HasVGPROperand || !phiHasBreakDef(MI, MRI, Visited)) { - LLVM_DEBUG(dbgs() << "Fixing PHI: " << MI); - TII->moveToVALU(MI, MDT); - } - + processPHINode(MI); break; } case AMDGPU::REG_SEQUENCE: if (TRI->hasVectorRegisters(TII->getOpRegClass(MI, 0)) || !hasVectorOperands(MI, TRI)) { - foldVGPRCopyIntoRegSequence(MI, TRI, TII, MRI); + foldVGPRCopyIntoRegSequence(MI, TRI, TII, *MRI); continue; } @@ -724,9 +673,9 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) { break; case AMDGPU::INSERT_SUBREG: { const TargetRegisterClass *DstRC, *Src0RC, *Src1RC; - DstRC = MRI.getRegClass(MI.getOperand(0).getReg()); - Src0RC = MRI.getRegClass(MI.getOperand(1).getReg()); - Src1RC = MRI.getRegClass(MI.getOperand(2).getReg()); + DstRC = MRI->getRegClass(MI.getOperand(0).getReg()); + Src0RC = MRI->getRegClass(MI.getOperand(1).getReg()); + Src1RC = MRI->getRegClass(MI.getOperand(2).getReg()); if (TRI->isSGPRClass(DstRC) && (TRI->hasVectorRegisters(Src0RC) || TRI->hasVectorRegisters(Src1RC))) { @@ -735,12 +684,159 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) { } break; } + case AMDGPU::V_WRITELANE_B32: { + // Some architectures allow more than one constant bus access without + // SGPR restriction + if (ST.getConstantBusLimit(MI.getOpcode()) != 1) + break; + + // Writelane is special in that it can use SGPR and M0 (which would + // normally count as using the constant bus twice - but in this case it + // is allowed since the lane selector doesn't count as a use of the + // constant bus). However, it is still required to abide by the 1 SGPR + // rule. Apply a fix here as we might have multiple SGPRs after + // legalizing VGPRs to SGPRs + int Src0Idx = + AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0); + int Src1Idx = + AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src1); + MachineOperand &Src0 = MI.getOperand(Src0Idx); + MachineOperand &Src1 = MI.getOperand(Src1Idx); + + // Check to see if the instruction violates the 1 SGPR rule + if ((Src0.isReg() && TRI->isSGPRReg(*MRI, Src0.getReg()) && + Src0.getReg() != AMDGPU::M0) && + (Src1.isReg() && TRI->isSGPRReg(*MRI, Src1.getReg()) && + Src1.getReg() != AMDGPU::M0)) { + + // Check for trivially easy constant prop into one of the operands + // If this is the case then perform the operation now to resolve SGPR + // issue. 
If we don't do that here we will always insert a mov to m0 + // that can't be resolved in later operand folding pass + bool Resolved = false; + for (MachineOperand *MO : {&Src0, &Src1}) { + if (Register::isVirtualRegister(MO->getReg())) { + MachineInstr *DefMI = MRI->getVRegDef(MO->getReg()); + if (DefMI && TII->isFoldableCopy(*DefMI)) { + const MachineOperand &Def = DefMI->getOperand(0); + if (Def.isReg() && + MO->getReg() == Def.getReg() && + MO->getSubReg() == Def.getSubReg()) { + const MachineOperand &Copied = DefMI->getOperand(1); + if (Copied.isImm() && + TII->isInlineConstant(APInt(64, Copied.getImm(), true))) { + MO->ChangeToImmediate(Copied.getImm()); + Resolved = true; + break; + } + } + } + } + } + + if (!Resolved) { + // Haven't managed to resolve by replacing an SGPR with an immediate + // Move src1 to be in M0 + BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), + TII->get(AMDGPU::COPY), AMDGPU::M0) + .add(Src1); + Src1.ChangeToRegister(AMDGPU::M0, false); + } + } + break; + } } } } if (MF.getTarget().getOptLevel() > CodeGenOpt::None && EnableM0Merge) - hoistAndMergeSGPRInits(AMDGPU::M0, MRI, *MDT, TII); + hoistAndMergeSGPRInits(AMDGPU::M0, *MRI, TRI, *MDT, TII); return true; } + +void SIFixSGPRCopies::processPHINode(MachineInstr &MI) { + unsigned numVGPRUses = 0; + bool AllAGPRUses = true; + SetVector worklist; + SmallSet Visited; + worklist.insert(&MI); + Visited.insert(&MI); + while (!worklist.empty()) { + const MachineInstr *Instr = worklist.pop_back_val(); + unsigned Reg = Instr->getOperand(0).getReg(); + for (const auto &Use : MRI->use_operands(Reg)) { + const MachineInstr *UseMI = Use.getParent(); + AllAGPRUses &= (UseMI->isCopy() && + TRI->isAGPR(*MRI, UseMI->getOperand(0).getReg())) || + TRI->isAGPR(*MRI, Use.getReg()); + if (UseMI->isCopy() || UseMI->isRegSequence()) { + if (UseMI->isCopy() && + UseMI->getOperand(0).getReg().isPhysical() && + !TRI->isSGPRReg(*MRI, UseMI->getOperand(0).getReg())) { + numVGPRUses++; + } + if (Visited.insert(UseMI).second) + worklist.insert(UseMI); + + continue; + } + + if (UseMI->isPHI()) { + const TargetRegisterClass *UseRC = MRI->getRegClass(Use.getReg()); + if (!TRI->isSGPRReg(*MRI, Use.getReg()) && + UseRC != &AMDGPU::VReg_1RegClass) + numVGPRUses++; + continue; + } + + const TargetRegisterClass *OpRC = + TII->getOpRegClass(*UseMI, UseMI->getOperandNo(&Use)); + if (!TRI->isSGPRClass(OpRC) && OpRC != &AMDGPU::VS_32RegClass && + OpRC != &AMDGPU::VS_64RegClass) { + numVGPRUses++; + } + } + } + + Register PHIRes = MI.getOperand(0).getReg(); + const TargetRegisterClass *RC0 = MRI->getRegClass(PHIRes); + if (AllAGPRUses && numVGPRUses && !TRI->hasAGPRs(RC0)) { + LLVM_DEBUG(dbgs() << "Moving PHI to AGPR: " << MI); + MRI->setRegClass(PHIRes, TRI->getEquivalentAGPRClass(RC0)); + } + + bool hasVGPRInput = false; + for (unsigned i = 1; i < MI.getNumOperands(); i += 2) { + unsigned InputReg = MI.getOperand(i).getReg(); + MachineInstr *Def = MRI->getVRegDef(InputReg); + if (TRI->isVectorRegister(*MRI, InputReg)) { + if (Def->isCopy()) { + unsigned SrcReg = Def->getOperand(1).getReg(); + const TargetRegisterClass *RC = + TRI->getRegClassForReg(*MRI, SrcReg); + if (TRI->isSGPRClass(RC)) + continue; + } + hasVGPRInput = true; + break; + } + else if (Def->isCopy() && + TRI->isVectorRegister(*MRI, Def->getOperand(1).getReg())) { + hasVGPRInput = true; + break; + } + } + + if ((!TRI->isVectorRegister(*MRI, PHIRes) && + RC0 != &AMDGPU::VReg_1RegClass) && + (hasVGPRInput || numVGPRUses > 1)) { + LLVM_DEBUG(dbgs() << "Fixing PHI: " << MI); + 
TII->moveToVALU(MI); + } + else { + LLVM_DEBUG(dbgs() << "Legalizing PHI: " << MI); + TII->legalizeOperands(MI, MDT); + } + +} diff --git a/lib/Target/AMDGPU/SIFixupVectorISel.cpp b/lib/Target/AMDGPU/SIFixupVectorISel.cpp index 5b834c8de13a..a0119297b112 100644 --- a/lib/Target/AMDGPU/SIFixupVectorISel.cpp +++ b/lib/Target/AMDGPU/SIFixupVectorISel.cpp @@ -91,8 +91,7 @@ static bool findSRegBaseAndIndex(MachineOperand *Op, Worklist.push_back(Op); while (!Worklist.empty()) { MachineOperand *WOp = Worklist.pop_back_val(); - if (!WOp->isReg() || - !TargetRegisterInfo::isVirtualRegister(WOp->getReg())) + if (!WOp->isReg() || !Register::isVirtualRegister(WOp->getReg())) continue; MachineInstr *DefInst = MRI.getUniqueVRegDef(WOp->getReg()); switch (DefInst->getOpcode()) { diff --git a/lib/Target/AMDGPU/SIFoldOperands.cpp b/lib/Target/AMDGPU/SIFoldOperands.cpp index 74d77d328019..4eac03168760 100644 --- a/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -142,16 +142,20 @@ static bool isInlineConstantIfFolded(const SIInstrInfo *TII, switch (Opc) { case AMDGPU::V_MAC_F32_e64: case AMDGPU::V_MAC_F16_e64: - case AMDGPU::V_FMAC_F32_e64: { + case AMDGPU::V_FMAC_F32_e64: + case AMDGPU::V_FMAC_F16_e64: { // Special case for mac. Since this is replaced with mad when folded into // src2, we need to check the legality for the final instruction. int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2); if (static_cast(OpNo) == Src2Idx) { - bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e64; - bool IsF32 = Opc == AMDGPU::V_MAC_F32_e64; + bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e64 || + Opc == AMDGPU::V_FMAC_F16_e64; + bool IsF32 = Opc == AMDGPU::V_MAC_F32_e64 || + Opc == AMDGPU::V_FMAC_F32_e64; unsigned Opc = IsFMA ? - AMDGPU::V_FMA_F32 : (IsF32 ? AMDGPU::V_MAD_F32 : AMDGPU::V_MAD_F16); + (IsF32 ? AMDGPU::V_FMA_F32 : AMDGPU::V_FMA_F16_gfx9) : + (IsF32 ? 
AMDGPU::V_MAD_F32 : AMDGPU::V_MAD_F16); const MCInstrDesc &MadDesc = TII->get(Opc); return TII->isInlineConstant(OpToFold, MadDesc.OpInfo[OpNo].OperandType); } @@ -235,9 +239,11 @@ static bool updateOperand(FoldCandidate &Fold, if ((Fold.isImm() || Fold.isFI() || Fold.isGlobal()) && Fold.needsShrink()) { MachineBasicBlock *MBB = MI->getParent(); - auto Liveness = MBB->computeRegisterLiveness(&TRI, AMDGPU::VCC, MI); - if (Liveness != MachineBasicBlock::LQR_Dead) + auto Liveness = MBB->computeRegisterLiveness(&TRI, AMDGPU::VCC, MI, 16); + if (Liveness != MachineBasicBlock::LQR_Dead) { + LLVM_DEBUG(dbgs() << "Not shrinking " << MI << " due to vcc liveness\n"); return false; + } MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); int Op32 = Fold.getShrinkOpcode(); @@ -248,7 +254,7 @@ static bool updateOperand(FoldCandidate &Fold, bool HaveNonDbgCarryUse = !MRI.use_nodbg_empty(Dst1.getReg()); const TargetRegisterClass *Dst0RC = MRI.getRegClass(Dst0.getReg()); - unsigned NewReg0 = MRI.createVirtualRegister(Dst0RC); + Register NewReg0 = MRI.createVirtualRegister(Dst0RC); MachineInstr *Inst32 = TII.buildShrunkInst(*MI, Op32); @@ -314,12 +320,15 @@ static bool tryAddToFoldList(SmallVectorImpl &FoldList, // Special case for v_mac_{f16, f32}_e64 if we are trying to fold into src2 unsigned Opc = MI->getOpcode(); if ((Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 || - Opc == AMDGPU::V_FMAC_F32_e64) && + Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_e64) && (int)OpNo == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)) { - bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e64; - bool IsF32 = Opc == AMDGPU::V_MAC_F32_e64; + bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e64 || + Opc == AMDGPU::V_FMAC_F16_e64; + bool IsF32 = Opc == AMDGPU::V_MAC_F32_e64 || + Opc == AMDGPU::V_FMAC_F32_e64; unsigned NewOpc = IsFMA ? - AMDGPU::V_FMA_F32 : (IsF32 ? AMDGPU::V_MAD_F32 : AMDGPU::V_MAD_F16); + (IsF32 ? AMDGPU::V_FMA_F32 : AMDGPU::V_FMA_F16_gfx9) : + (IsF32 ? AMDGPU::V_MAD_F32 : AMDGPU::V_MAD_F16); // Check if changing this to a v_mad_{f16, f32} instruction will allow us // to fold the operand. @@ -435,7 +444,8 @@ static bool tryToFoldACImm(const SIInstrInfo *TII, OpTy > AMDGPU::OPERAND_REG_INLINE_AC_LAST) return false; - if (OpToFold.isImm() && TII->isInlineConstant(OpToFold, OpTy)) { + if (OpToFold.isImm() && TII->isInlineConstant(OpToFold, OpTy) && + TII->isOperandLegal(*UseMI, UseOpIdx, &OpToFold)) { UseMI->getOperand(UseOpIdx).ChangeToImmediate(OpToFold.getImm()); return true; } @@ -443,8 +453,8 @@ static bool tryToFoldACImm(const SIInstrInfo *TII, if (!OpToFold.isReg()) return false; - unsigned UseReg = OpToFold.getReg(); - if (!TargetRegisterInfo::isVirtualRegister(UseReg)) + Register UseReg = OpToFold.getReg(); + if (!Register::isVirtualRegister(UseReg)) return false; if (llvm::find_if(FoldList, [UseMI](const FoldCandidate &FC) { @@ -481,6 +491,9 @@ static bool tryToFoldACImm(const SIInstrInfo *TII, return false; // Can only fold splat constants } + if (!TII->isOperandLegal(*UseMI, UseOpIdx, Op)) + return false; + FoldList.push_back(FoldCandidate(UseMI, UseOpIdx, Op)); return true; } @@ -518,7 +531,7 @@ void SIFoldOperands::foldOperand( // REG_SEQUENCE instructions, so we have to fold them into the // uses of REG_SEQUENCE. 
if (UseMI->isRegSequence()) { - unsigned RegSeqDstReg = UseMI->getOperand(0).getReg(); + Register RegSeqDstReg = UseMI->getOperand(0).getReg(); unsigned RegSeqDstSubReg = UseMI->getOperand(UseOpIdx + 1).getImm(); MachineRegisterInfo::use_iterator Next; @@ -569,15 +582,18 @@ void SIFoldOperands::foldOperand( OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal(); if (FoldingImmLike && UseMI->isCopy()) { - unsigned DestReg = UseMI->getOperand(0).getReg(); - const TargetRegisterClass *DestRC - = TargetRegisterInfo::isVirtualRegister(DestReg) ? - MRI->getRegClass(DestReg) : - TRI->getPhysRegClass(DestReg); - - unsigned SrcReg = UseMI->getOperand(1).getReg(); - if (TargetRegisterInfo::isVirtualRegister(DestReg) && - TargetRegisterInfo::isVirtualRegister(SrcReg)) { + Register DestReg = UseMI->getOperand(0).getReg(); + + // Don't fold into a copy to a physical register. Doing so would interfere + // with the register coalescer's logic which would avoid redundant + // initalizations. + if (DestReg.isPhysical()) + return; + + const TargetRegisterClass *DestRC = MRI->getRegClass(DestReg); + + Register SrcReg = UseMI->getOperand(1).getReg(); + if (SrcReg.isVirtual()) { // XXX - This can be an assert? const TargetRegisterClass * SrcRC = MRI->getRegClass(SrcReg); if (TRI->isSGPRClass(SrcRC) && TRI->hasVectorRegisters(DestRC)) { MachineRegisterInfo::use_iterator NextUse; @@ -613,10 +629,17 @@ void SIFoldOperands::foldOperand( return; UseMI->setDesc(TII->get(MovOp)); + MachineInstr::mop_iterator ImpOpI = UseMI->implicit_operands().begin(); + MachineInstr::mop_iterator ImpOpE = UseMI->implicit_operands().end(); + while (ImpOpI != ImpOpE) { + MachineInstr::mop_iterator Tmp = ImpOpI; + ImpOpI++; + UseMI->RemoveOperand(UseMI->getOperandNo(Tmp)); + } CopiesToReplace.push_back(UseMI); } else { if (UseMI->isCopy() && OpToFold.isReg() && - TargetRegisterInfo::isVirtualRegister(UseMI->getOperand(0).getReg()) && + Register::isVirtualRegister(UseMI->getOperand(0).getReg()) && TRI->isVectorRegister(*MRI, UseMI->getOperand(0).getReg()) && TRI->isVectorRegister(*MRI, UseMI->getOperand(1).getReg()) && !UseMI->getOperand(1).getSubReg()) { @@ -677,6 +700,9 @@ void SIFoldOperands::foldOperand( // => // %sgpr1 = COPY %sgpr0 UseMI->setDesc(TII->get(AMDGPU::COPY)); + UseMI->getOperand(1).setReg(OpToFold.getReg()); + UseMI->getOperand(1).setSubReg(OpToFold.getSubReg()); + UseMI->getOperand(1).setIsKill(false); UseMI->RemoveOperand(2); // Remove exec read (or src1 for readlane) return; } @@ -708,7 +734,7 @@ void SIFoldOperands::foldOperand( // Split 64-bit constants into 32-bits for folding. if (UseOp.getSubReg() && AMDGPU::getRegBitWidth(FoldRC->getID()) == 64) { - unsigned UseReg = UseOp.getReg(); + Register UseReg = UseOp.getReg(); const TargetRegisterClass *UseRC = MRI->getRegClass(UseReg); if (AMDGPU::getRegBitWidth(UseRC->getID()) != 64) @@ -810,7 +836,7 @@ static MachineOperand *getImmOrMaterializedImm(MachineRegisterInfo &MRI, if (Op.isReg()) { // If this has a subregister, it obviously is a register source. 
if (Op.getSubReg() != AMDGPU::NoSubRegister || - !TargetRegisterInfo::isVirtualRegister(Op.getReg())) + !Register::isVirtualRegister(Op.getReg())) return &Op; MachineInstr *Def = MRI.getVRegDef(Op.getReg()); @@ -1073,6 +1099,13 @@ void SIFoldOperands::foldInstOperand(MachineInstr &MI, Copy->addImplicitDefUseOperands(*MF); for (FoldCandidate &Fold : FoldList) { + if (Fold.isReg() && Register::isVirtualRegister(Fold.OpToFold->getReg())) { + Register Reg = Fold.OpToFold->getReg(); + MachineInstr *DefMI = Fold.OpToFold->getParent(); + if (DefMI->readsRegister(AMDGPU::EXEC, TRI) && + execMayBeModifiedBeforeUse(*MRI, Reg, *DefMI, *Fold.UseMI)) + continue; + } if (updateOperand(Fold, *TII, *TRI, *ST)) { // Clear kill flags. if (Fold.isReg()) { @@ -1316,6 +1349,8 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) { for (MachineBasicBlock *MBB : depth_first(&MF)) { MachineBasicBlock::iterator I, Next; + + MachineOperand *CurrentKnownM0Val = nullptr; for (I = MBB->begin(); I != MBB->end(); I = Next) { Next = std::next(I); MachineInstr &MI = *I; @@ -1328,6 +1363,25 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) { if (IsIEEEMode || (!HasNSZ && !MI.getFlag(MachineInstr::FmNsz)) || !tryFoldOMod(MI)) tryFoldClamp(MI); + + // Saw an unknown clobber of m0, so we no longer know what it is. + if (CurrentKnownM0Val && MI.modifiesRegister(AMDGPU::M0, TRI)) + CurrentKnownM0Val = nullptr; + continue; + } + + // Specially track simple redefs of m0 to the same value in a block, so we + // can erase the later ones. + if (MI.getOperand(0).getReg() == AMDGPU::M0) { + MachineOperand &NewM0Val = MI.getOperand(1); + if (CurrentKnownM0Val && CurrentKnownM0Val->isIdenticalTo(NewM0Val)) { + MI.eraseFromParent(); + continue; + } + + // We aren't tracking other physical registers + CurrentKnownM0Val = (NewM0Val.isReg() && NewM0Val.getReg().isPhysical()) ? + nullptr : &NewM0Val; continue; } @@ -1339,8 +1393,7 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) { if (!FoldingImm && !OpToFold.isReg()) continue; - if (OpToFold.isReg() && - !TargetRegisterInfo::isVirtualRegister(OpToFold.getReg())) + if (OpToFold.isReg() && !Register::isVirtualRegister(OpToFold.getReg())) continue; // Prevent folding operands backwards in the function. For example, @@ -1350,8 +1403,7 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) { // ... // %vgpr0 = V_MOV_B32_e32 1, implicit %exec MachineOperand &Dst = MI.getOperand(0); - if (Dst.isReg() && - !TargetRegisterInfo::isVirtualRegister(Dst.getReg())) + if (Dst.isReg() && !Register::isVirtualRegister(Dst.getReg())) continue; foldInstOperand(MI, OpToFold); diff --git a/lib/Target/AMDGPU/SIFormMemoryClauses.cpp b/lib/Target/AMDGPU/SIFormMemoryClauses.cpp index f3c9ad63a80a..26bae5734df7 100644 --- a/lib/Target/AMDGPU/SIFormMemoryClauses.cpp +++ b/lib/Target/AMDGPU/SIFormMemoryClauses.cpp @@ -120,7 +120,7 @@ static bool isValidClauseInst(const MachineInstr &MI, bool IsVMEMClause) { return false; // If this is a load instruction where the result has been coalesced with an operand, then we cannot clause it. 
for (const MachineOperand &ResMO : MI.defs()) { - unsigned ResReg = ResMO.getReg(); + Register ResReg = ResMO.getReg(); for (const MachineOperand &MO : MI.uses()) { if (!MO.isReg() || MO.isDef()) continue; @@ -144,7 +144,7 @@ static unsigned getMopState(const MachineOperand &MO) { S |= RegState::Kill; if (MO.isEarlyClobber()) S |= RegState::EarlyClobber; - if (TargetRegisterInfo::isPhysicalRegister(MO.getReg()) && MO.isRenamable()) + if (Register::isPhysicalRegister(MO.getReg()) && MO.isRenamable()) S |= RegState::Renamable; return S; } @@ -152,7 +152,7 @@ static unsigned getMopState(const MachineOperand &MO) { template void SIFormMemoryClauses::forAllLanes(unsigned Reg, LaneBitmask LaneMask, Callable Func) const { - if (LaneMask.all() || TargetRegisterInfo::isPhysicalRegister(Reg) || + if (LaneMask.all() || Register::isPhysicalRegister(Reg) || LaneMask == MRI->getMaxLaneMaskForVReg(Reg)) { Func(0); return; @@ -216,7 +216,7 @@ bool SIFormMemoryClauses::canBundle(const MachineInstr &MI, if (!MO.isReg()) continue; - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); // If it is tied we will need to write same register as we read. if (MO.isTied()) @@ -227,7 +227,7 @@ bool SIFormMemoryClauses::canBundle(const MachineInstr &MI, if (Conflict == Map.end()) continue; - if (TargetRegisterInfo::isPhysicalRegister(Reg)) + if (Register::isPhysicalRegister(Reg)) return false; LaneBitmask Mask = TRI->getSubRegIndexLaneMask(MO.getSubReg()); @@ -265,13 +265,13 @@ void SIFormMemoryClauses::collectRegUses(const MachineInstr &MI, for (const MachineOperand &MO : MI.operands()) { if (!MO.isReg()) continue; - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); if (!Reg) continue; - LaneBitmask Mask = TargetRegisterInfo::isVirtualRegister(Reg) ? - TRI->getSubRegIndexLaneMask(MO.getSubReg()) : - LaneBitmask::getAll(); + LaneBitmask Mask = Register::isVirtualRegister(Reg) + ? TRI->getSubRegIndexLaneMask(MO.getSubReg()) + : LaneBitmask::getAll(); RegUse &Map = MO.isDef() ? 
Defs : Uses; auto Loc = Map.find(Reg); @@ -389,7 +389,7 @@ bool SIFormMemoryClauses::runOnMachineFunction(MachineFunction &MF) { for (auto &&R : Defs) { unsigned Reg = R.first; Uses.erase(Reg); - if (TargetRegisterInfo::isPhysicalRegister(Reg)) + if (Register::isPhysicalRegister(Reg)) continue; LIS->removeInterval(Reg); LIS->createAndComputeVirtRegInterval(Reg); @@ -397,7 +397,7 @@ bool SIFormMemoryClauses::runOnMachineFunction(MachineFunction &MF) { for (auto &&R : Uses) { unsigned Reg = R.first; - if (TargetRegisterInfo::isPhysicalRegister(Reg)) + if (Register::isPhysicalRegister(Reg)) continue; LIS->removeInterval(Reg); LIS->createAndComputeVirtRegInterval(Reg); diff --git a/lib/Target/AMDGPU/SIFrameLowering.cpp b/lib/Target/AMDGPU/SIFrameLowering.cpp index feab6bed2603..ed07ed100a19 100644 --- a/lib/Target/AMDGPU/SIFrameLowering.cpp +++ b/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -112,6 +112,7 @@ static void buildPrologSpill(LivePhysRegs &LiveRegs, MachineBasicBlock &MBB, .addImm(0) // slc .addImm(0) // tfe .addImm(0) // dlc + .addImm(0) // swz .addMemOperand(MMO); return; } @@ -132,6 +133,7 @@ static void buildPrologSpill(LivePhysRegs &LiveRegs, MachineBasicBlock &MBB, .addImm(0) // slc .addImm(0) // tfe .addImm(0) // dlc + .addImm(0) // swz .addMemOperand(MMO); } @@ -157,6 +159,7 @@ static void buildEpilogReload(LivePhysRegs &LiveRegs, MachineBasicBlock &MBB, .addImm(0) // slc .addImm(0) // tfe .addImm(0) // dlc + .addImm(0) // swz .addMemOperand(MMO); return; } @@ -177,6 +180,7 @@ static void buildEpilogReload(LivePhysRegs &LiveRegs, MachineBasicBlock &MBB, .addImm(0) // slc .addImm(0) // tfe .addImm(0) // dlc + .addImm(0) // swz .addMemOperand(MMO); } @@ -202,15 +206,15 @@ void SIFrameLowering::emitFlatScratchInit(const GCNSubtarget &ST, DebugLoc DL; MachineBasicBlock::iterator I = MBB.begin(); - unsigned FlatScratchInitReg - = MFI->getPreloadedReg(AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT); + Register FlatScratchInitReg = + MFI->getPreloadedReg(AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT); MachineRegisterInfo &MRI = MF.getRegInfo(); MRI.addLiveIn(FlatScratchInitReg); MBB.addLiveIn(FlatScratchInitReg); - unsigned FlatScrInitLo = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub0); - unsigned FlatScrInitHi = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub1); + Register FlatScrInitLo = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub0); + Register FlatScrInitHi = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub1); unsigned ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg(); @@ -424,8 +428,8 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF, getReservedPrivateSegmentWaveByteOffsetReg(ST, TII, TRI, MFI, MF); // We need to insert initialization of the scratch resource descriptor. 
- unsigned PreloadedScratchWaveOffsetReg = MFI->getPreloadedReg( - AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET); + Register PreloadedScratchWaveOffsetReg = MFI->getPreloadedReg( + AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET); unsigned PreloadedPrivateBufferReg = AMDGPU::NoRegister; if (ST.isAmdHsaOrMesa(F)) { @@ -539,9 +543,9 @@ void SIFrameLowering::emitEntryFunctionScratchSetup(const GCNSubtarget &ST, if (ST.isAmdPalOS()) { // The pointer to the GIT is formed from the offset passed in and either // the amdgpu-git-ptr-high function attribute or the top part of the PC - unsigned RsrcLo = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0); - unsigned RsrcHi = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1); - unsigned Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1); + Register RsrcLo = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0); + Register RsrcHi = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1); + Register Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1); const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32); @@ -601,14 +605,14 @@ void SIFrameLowering::emitEntryFunctionScratchSetup(const GCNSubtarget &ST, assert(!ST.isAmdHsaOrMesa(Fn)); const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32); - unsigned Rsrc2 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2); - unsigned Rsrc3 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3); + Register Rsrc2 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2); + Register Rsrc3 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3); // Use relocations to get the pointer, and setup the other bits manually. uint64_t Rsrc23 = TII->getScratchRsrcWords23(); if (MFI->hasImplicitBufferPtr()) { - unsigned Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1); + Register Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1); if (AMDGPU::isCompute(MF.getFunction().getCallingConv())) { const MCInstrDesc &Mov64 = TII->get(AMDGPU::S_MOV_B64); @@ -640,8 +644,8 @@ void SIFrameLowering::emitEntryFunctionScratchSetup(const GCNSubtarget &ST, MBB.addLiveIn(MFI->getImplicitBufferPtrUserSGPR()); } } else { - unsigned Rsrc0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0); - unsigned Rsrc1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1); + Register Rsrc0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0); + Register Rsrc1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1); BuildMI(MBB, I, DL, SMovB32, Rsrc0) .addExternalSymbol("SCRATCH_RSRC_DWORD0") @@ -669,6 +673,8 @@ bool SIFrameLowering::isSupportedStackID(TargetStackID::Value ID) const { case TargetStackID::NoAlloc: case TargetStackID::SGPRSpill: return true; + case TargetStackID::SVEVector: + return false; } llvm_unreachable("Invalid TargetStackID::Value"); } diff --git a/lib/Target/AMDGPU/SIFrameLowering.h b/lib/Target/AMDGPU/SIFrameLowering.h index c644f4726e2c..d9970fd6b4b8 100644 --- a/lib/Target/AMDGPU/SIFrameLowering.h +++ b/lib/Target/AMDGPU/SIFrameLowering.h @@ -20,9 +20,9 @@ class GCNSubtarget; class SIFrameLowering final : public AMDGPUFrameLowering { public: - SIFrameLowering(StackDirection D, unsigned StackAl, int LAO, - unsigned TransAl = 1) : - AMDGPUFrameLowering(D, StackAl, LAO, TransAl) {} + SIFrameLowering(StackDirection D, Align StackAl, int LAO, + Align TransAl = Align::None()) + : AMDGPUFrameLowering(D, StackAl, LAO, TransAl) {} ~SIFrameLowering() override = default; void emitEntryFunctionPrologue(MachineFunction &MF, diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp index db0782e2bf3e..56ebf9c06741 100644 --- 
a/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/lib/Target/AMDGPU/SIISelLowering.cpp @@ -20,11 +20,11 @@ #include "AMDGPU.h" #include "AMDGPUSubtarget.h" #include "AMDGPUTargetMachine.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "SIDefines.h" #include "SIInstrInfo.h" #include "SIMachineFunctionInfo.h" #include "SIRegisterInfo.h" -#include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "Utils/AMDGPUBaseInfo.h" #include "llvm/ADT/APFloat.h" #include "llvm/ADT/APInt.h" @@ -35,6 +35,7 @@ #include "llvm/ADT/StringRef.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/ADT/Twine.h" +#include "llvm/Analysis/LegacyDivergenceAnalysis.h" #include "llvm/CodeGen/Analysis.h" #include "llvm/CodeGen/CallingConvLower.h" #include "llvm/CodeGen/DAGCombine.h" @@ -44,6 +45,7 @@ #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/MachineMemOperand.h" #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachineOperand.h" @@ -115,7 +117,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass); addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass); - addRegisterClass(MVT::i32, &AMDGPU::SReg_32_XM0RegClass); + addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass); addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass); addRegisterClass(MVT::f64, &AMDGPU::VReg_64RegClass); @@ -125,10 +127,10 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, addRegisterClass(MVT::v3i32, &AMDGPU::SGPR_96RegClass); addRegisterClass(MVT::v3f32, &AMDGPU::VReg_96RegClass); - addRegisterClass(MVT::v2i64, &AMDGPU::SReg_128RegClass); - addRegisterClass(MVT::v2f64, &AMDGPU::SReg_128RegClass); + addRegisterClass(MVT::v2i64, &AMDGPU::SGPR_128RegClass); + addRegisterClass(MVT::v2f64, &AMDGPU::SGPR_128RegClass); - addRegisterClass(MVT::v4i32, &AMDGPU::SReg_128RegClass); + addRegisterClass(MVT::v4i32, &AMDGPU::SGPR_128RegClass); addRegisterClass(MVT::v4f32, &AMDGPU::VReg_128RegClass); addRegisterClass(MVT::v5i32, &AMDGPU::SGPR_160RegClass); @@ -141,12 +143,12 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, addRegisterClass(MVT::v16f32, &AMDGPU::VReg_512RegClass); if (Subtarget->has16BitInsts()) { - addRegisterClass(MVT::i16, &AMDGPU::SReg_32_XM0RegClass); - addRegisterClass(MVT::f16, &AMDGPU::SReg_32_XM0RegClass); + addRegisterClass(MVT::i16, &AMDGPU::SReg_32RegClass); + addRegisterClass(MVT::f16, &AMDGPU::SReg_32RegClass); // Unless there are also VOP3P operations, not operations are really legal. 
- addRegisterClass(MVT::v2i16, &AMDGPU::SReg_32_XM0RegClass); - addRegisterClass(MVT::v2f16, &AMDGPU::SReg_32_XM0RegClass); + addRegisterClass(MVT::v2i16, &AMDGPU::SReg_32RegClass); + addRegisterClass(MVT::v2f16, &AMDGPU::SReg_32RegClass); addRegisterClass(MVT::v4i16, &AMDGPU::SReg_64RegClass); addRegisterClass(MVT::v4f16, &AMDGPU::SReg_64RegClass); } @@ -178,6 +180,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::STORE, MVT::v32i32, Custom); setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand); + setTruncStoreAction(MVT::v3i32, MVT::v3i16, Expand); setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand); setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand); setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand); @@ -215,31 +218,10 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Custom); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Custom); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Custom); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v3i16, Custom); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Custom); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Custom); - setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); - setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f32, Custom); - setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v4f32, Custom); - setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i16, Custom); - setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f16, Custom); - setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2i16, Custom); - setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2f16, Custom); - - setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v2f16, Custom); - setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v4f16, Custom); - setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v8f16, Custom); - setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom); - setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i16, Custom); - setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i8, Custom); - - setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom); - setOperationAction(ISD::INTRINSIC_VOID, MVT::v2i16, Custom); - setOperationAction(ISD::INTRINSIC_VOID, MVT::v2f16, Custom); - setOperationAction(ISD::INTRINSIC_VOID, MVT::v4f16, Custom); - setOperationAction(ISD::INTRINSIC_VOID, MVT::i16, Custom); - setOperationAction(ISD::INTRINSIC_VOID, MVT::i8, Custom); - setOperationAction(ISD::BRCOND, MVT::Other, Custom); setOperationAction(ISD::BR_CC, MVT::i1, Expand); setOperationAction(ISD::BR_CC, MVT::i32, Expand); @@ -653,6 +635,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::FADD, MVT::v4f16, Custom); setOperationAction(ISD::FMUL, MVT::v4f16, Custom); + setOperationAction(ISD::FMA, MVT::v4f16, Custom); setOperationAction(ISD::FMAXNUM, MVT::v2f16, Custom); setOperationAction(ISD::FMINNUM, MVT::v2f16, Custom); @@ -687,6 +670,33 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::SELECT, VT, Custom); } + setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); + setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f32, Custom); + setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v4f32, Custom); + setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i16, Custom); + setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f16, Custom); + setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2i16, Custom); + setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2f16, Custom); + + 
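// Illustrative aside (not part of the patch): marking FMA on v4f16 as Custom
// above pairs with the splitTernaryVectorOp helper added further down in this
// patch, which lowers a 4-element half-precision FMA as two 2-element halves
// that are then concatenated. The toy sketch below shows that
// split-evaluate-concatenate shape on plain arrays (Half2/Half4/fma4 are
// assumptions made for the example, not SelectionDAG types); it sketches the
// idea, not the actual DAG lowering.

#include <array>
#include <cstdio>

using Half2 = std::array<float, 2>;  // stand-in for a v2f16 value
using Half4 = std::array<float, 4>;  // stand-in for a v4f16 value

static Half2 fma2(Half2 A, Half2 B, Half2 C) {
  return {A[0] * B[0] + C[0], A[1] * B[1] + C[1]};
}

// Split each 4-wide operand into a low and a high half, run the 2-wide
// operation on each half, then concatenate the results.
static Half4 fma4(const Half4 &A, const Half4 &B, const Half4 &C) {
  Half2 Lo = fma2({A[0], A[1]}, {B[0], B[1]}, {C[0], C[1]});
  Half2 Hi = fma2({A[2], A[3]}, {B[2], B[3]}, {C[2], C[3]});
  return {Lo[0], Lo[1], Hi[0], Hi[1]};
}

int main() {
  Half4 R = fma4({1, 2, 3, 4}, {2, 2, 2, 2}, {1, 1, 1, 1});
  std::printf("%g %g %g %g\n", R[0], R[1], R[2], R[3]); // prints 3 5 7 9
  return 0;
}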
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v2f16, Custom); + setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v2i16, Custom); + setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v4f16, Custom); + setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v4i16, Custom); + setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v8f16, Custom); + setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom); + setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::f16, Custom); + setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i16, Custom); + setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i8, Custom); + + setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom); + setOperationAction(ISD::INTRINSIC_VOID, MVT::v2i16, Custom); + setOperationAction(ISD::INTRINSIC_VOID, MVT::v2f16, Custom); + setOperationAction(ISD::INTRINSIC_VOID, MVT::v4f16, Custom); + setOperationAction(ISD::INTRINSIC_VOID, MVT::v4i16, Custom); + setOperationAction(ISD::INTRINSIC_VOID, MVT::f16, Custom); + setOperationAction(ISD::INTRINSIC_VOID, MVT::i16, Custom); + setOperationAction(ISD::INTRINSIC_VOID, MVT::i8, Custom); + setTargetDAGCombine(ISD::ADD); setTargetDAGCombine(ISD::ADDCARRY); setTargetDAGCombine(ISD::SUB); @@ -768,19 +778,22 @@ bool SITargetLowering::isShuffleMaskLegal(ArrayRef, EVT) const { MVT SITargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const { - // TODO: Consider splitting all arguments into 32-bit pieces. - if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) { + if (CC == CallingConv::AMDGPU_KERNEL) + return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT); + + if (VT.isVector()) { EVT ScalarVT = VT.getScalarType(); unsigned Size = ScalarVT.getSizeInBits(); if (Size == 32) return ScalarVT.getSimpleVT(); - if (Size == 64) + if (Size > 32) return MVT::i32; if (Size == 16 && Subtarget->has16BitInsts()) return VT.isInteger() ? 
MVT::v2i16 : MVT::v2f16; - } + } else if (VT.getSizeInBits() > 32) + return MVT::i32; return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT); } @@ -788,7 +801,10 @@ MVT SITargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context, unsigned SITargetLowering::getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const { - if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) { + if (CC == CallingConv::AMDGPU_KERNEL) + return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT); + + if (VT.isVector()) { unsigned NumElts = VT.getVectorNumElements(); EVT ScalarVT = VT.getScalarType(); unsigned Size = ScalarVT.getSizeInBits(); @@ -796,12 +812,13 @@ unsigned SITargetLowering::getNumRegistersForCallingConv(LLVMContext &Context, if (Size == 32) return NumElts; - if (Size == 64) - return 2 * NumElts; + if (Size > 32) + return NumElts * ((Size + 31) / 32); if (Size == 16 && Subtarget->has16BitInsts()) - return (VT.getVectorNumElements() + 1) / 2; - } + return (NumElts + 1) / 2; + } else if (VT.getSizeInBits() > 32) + return (VT.getSizeInBits() + 31) / 32; return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT); } @@ -821,10 +838,10 @@ unsigned SITargetLowering::getVectorTypeBreakdownForCallingConv( return NumIntermediates; } - if (Size == 64) { + if (Size > 32) { RegisterVT = MVT::i32; IntermediateVT = RegisterVT; - NumIntermediates = 2 * NumElts; + NumIntermediates = NumElts * ((Size + 31) / 32); return NumIntermediates; } @@ -901,7 +918,7 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.ptrVal = MFI->getImagePSV( *MF.getSubtarget().getInstrInfo(), CI.getArgOperand(RsrcIntr->RsrcArg)); - Info.align = 0; + Info.align.reset(); } else { Info.ptrVal = MFI->getBufferPSV( *MF.getSubtarget().getInstrInfo(), @@ -947,7 +964,7 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.opc = ISD::INTRINSIC_W_CHAIN; Info.memVT = MVT::getVT(CI.getType()); Info.ptrVal = CI.getOperand(0); - Info.align = 0; + Info.align.reset(); Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore; const ConstantInt *Vol = cast(CI.getOperand(4)); @@ -964,7 +981,7 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.ptrVal = MFI->getBufferPSV( *MF.getSubtarget().getInstrInfo(), CI.getArgOperand(1)); - Info.align = 0; + Info.align.reset(); Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore; const ConstantInt *Vol = dyn_cast(CI.getOperand(4)); @@ -978,7 +995,7 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.memVT = MVT::getVT(CI.getOperand(0)->getType() ->getPointerElementType()); Info.ptrVal = CI.getOperand(0); - Info.align = 0; + Info.align.reset(); Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore; return true; @@ -988,7 +1005,7 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.opc = ISD::INTRINSIC_W_CHAIN; Info.memVT = MVT::getVT(CI.getType()); Info.ptrVal = CI.getOperand(0); - Info.align = 0; + Info.align.reset(); Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore; const ConstantInt *Vol = cast(CI.getOperand(1)); @@ -1012,7 +1029,7 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, // This is an abstract access, but we need to specify a type and size. 
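// Illustrative aside (not part of the patch): the calling-convention hunks
// above replace the old 64-bit-only special case with a general rule: any
// scalar wider than 32 bits is passed as ceil(bits/32) i32 pieces, a vector of
// such scalars uses NumElts * ceil(bits/32) registers, and pairs of 16-bit
// elements still share one register. The standalone sketch below reproduces
// only that arithmetic (numRegsFor is an assumption made for the example, not
// the SITargetLowering API).

#include <cassert>
#include <cstdio>

// Rough register count for NumElts elements of ScalarBits each
// (NumElts == 1 for a plain scalar), under the rules sketched above.
static unsigned numRegsFor(unsigned NumElts, unsigned ScalarBits,
                           bool Has16BitInsts = true) {
  if (ScalarBits > 32)
    return NumElts * ((ScalarBits + 31) / 32);  // ceil(bits / 32) per element
  if (ScalarBits == 16 && Has16BitInsts)
    return (NumElts + 1) / 2;                   // two halves packed per register
  return NumElts;                               // 32-bit (and other) elements
}

int main() {
  assert(numRegsFor(1, 64) == 2);  // i64   -> 2 x i32
  assert(numRegsFor(4, 64) == 8);  // v4i64 -> 8 x i32
  assert(numRegsFor(2, 96) == 6);  // 96-bit elements need 3 pieces each
  assert(numRegsFor(5, 16) == 3);  // v5f16 -> 3 packed registers
  std::printf("register-count sketch ok\n");
  return 0;
}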
Info.memVT = MVT::i32; Info.size = 4; - Info.align = 4; + Info.align = Align(4); Info.flags = MachineMemOperand::MOStore; if (IntrID == Intrinsic::amdgcn_ds_gws_barrier) @@ -1215,21 +1232,12 @@ bool SITargetLowering::canMergeStoresTo(unsigned AS, EVT MemVT, return true; } -bool SITargetLowering::allowsMisalignedMemoryAccesses( - EVT VT, unsigned AddrSpace, unsigned Align, MachineMemOperand::Flags Flags, - bool *IsFast) const { +bool SITargetLowering::allowsMisalignedMemoryAccessesImpl( + unsigned Size, unsigned AddrSpace, unsigned Align, + MachineMemOperand::Flags Flags, bool *IsFast) const { if (IsFast) *IsFast = false; - // TODO: I think v3i32 should allow unaligned accesses on CI with DS_READ_B96, - // which isn't a simple VT. - // Until MVT is extended to handle this, simply check for the size and - // rely on the condition below: allow accesses if the size is a multiple of 4. - if (VT == MVT::Other || (VT != MVT::Other && VT.getSizeInBits() > 1024 && - VT.getStoreSize() > 16)) { - return false; - } - if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS || AddrSpace == AMDGPUAS::REGION_ADDRESS) { // ds_read/write_b64 require 8-byte alignment, but we can do a 4 byte @@ -1268,7 +1276,7 @@ bool SITargetLowering::allowsMisalignedMemoryAccesses( } // Smaller than dword value must be aligned. - if (VT.bitsLT(MVT::i32)) + if (Size < 32) return false; // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the @@ -1277,7 +1285,26 @@ bool SITargetLowering::allowsMisalignedMemoryAccesses( if (IsFast) *IsFast = true; - return VT.bitsGT(MVT::i32) && Align % 4 == 0; + return Size >= 32 && Align >= 4; +} + +bool SITargetLowering::allowsMisalignedMemoryAccesses( + EVT VT, unsigned AddrSpace, unsigned Align, MachineMemOperand::Flags Flags, + bool *IsFast) const { + if (IsFast) + *IsFast = false; + + // TODO: I think v3i32 should allow unaligned accesses on CI with DS_READ_B96, + // which isn't a simple VT. + // Until MVT is extended to handle this, simply check for the size and + // rely on the condition below: allow accesses if the size is a multiple of 4. + if (VT == MVT::Other || (VT != MVT::Other && VT.getSizeInBits() > 1024 && + VT.getStoreSize() > 16)) { + return false; + } + + return allowsMisalignedMemoryAccessesImpl(VT.getSizeInBits(), AddrSpace, + Align, Flags, IsFast); } EVT SITargetLowering::getOptimalMemOpType( @@ -1336,9 +1363,9 @@ bool SITargetLowering::isMemOpUniform(const SDNode *N) const { TargetLoweringBase::LegalizeTypeAction SITargetLowering::getPreferredVectorAction(MVT VT) const { - if (VT.getVectorNumElements() != 1 && VT.getScalarType().bitsLE(MVT::i16)) - return TypeSplitVector; - + int NumElts = VT.getVectorNumElements(); + if (NumElts != 1 && VT.getScalarType().bitsLE(MVT::i16)) + return VT.isPow2VectorType() ? TypeSplitVector : TypeWidenVector; return TargetLoweringBase::getPreferredVectorAction(VT); } @@ -1562,7 +1589,8 @@ static void processShaderInputArgs(SmallVectorImpl &Splits, // entire split argument. if (Arg->Flags.isSplit()) { while (!Arg->Flags.isSplitEnd()) { - assert(!Arg->VT.isVector() && + assert((!Arg->VT.isVector() || + Arg->VT.getScalarSizeInBits() == 16) && "unexpected vector split in ps argument type"); if (!SkipArg) Splits.push_back(*Arg); @@ -1589,29 +1617,32 @@ static void processShaderInputArgs(SmallVectorImpl &Splits, } // Allocate special inputs passed in VGPRs. 
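// Illustrative aside (not part of the patch): allowsMisalignedMemoryAccesses
// above is refactored into a size-based helper so the same check can be reused
// without an MVT; for the flat/global tail it boils down to "at least a dword
// and at least 4-byte aligned". The tiny sketch below shows just that final
// rule (fastMisalignedAccess is an assumption made for the example, not the
// real allowsMisalignedMemoryAccessesImpl, which also special-cases the LDS,
// scratch and constant address spaces).

#include <cassert>
#include <cstdio>

static bool fastMisalignedAccess(unsigned SizeInBits, unsigned AlignInBytes) {
  if (SizeInBits < 32)       // sub-dword accesses still need natural alignment
    return false;
  return AlignInBytes >= 4;  // dword-or-larger: low address bits are ignored
}

int main() {
  assert(fastMisalignedAccess(64, 4));    // dword pair, dword aligned: fast
  assert(!fastMisalignedAccess(16, 2));   // sub-dword: reject
  assert(!fastMisalignedAccess(128, 2));  // wide but under-aligned: reject
  std::printf("alignment-rule sketch ok\n");
  return 0;
}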
-static void allocateSpecialEntryInputVGPRs(CCState &CCInfo, - MachineFunction &MF, - const SIRegisterInfo &TRI, - SIMachineFunctionInfo &Info) { +void SITargetLowering::allocateSpecialEntryInputVGPRs(CCState &CCInfo, + MachineFunction &MF, + const SIRegisterInfo &TRI, + SIMachineFunctionInfo &Info) const { + const LLT S32 = LLT::scalar(32); + MachineRegisterInfo &MRI = MF.getRegInfo(); + if (Info.hasWorkItemIDX()) { - unsigned Reg = AMDGPU::VGPR0; - MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass); + Register Reg = AMDGPU::VGPR0; + MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32); CCInfo.AllocateReg(Reg); Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg)); } if (Info.hasWorkItemIDY()) { - unsigned Reg = AMDGPU::VGPR1; - MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass); + Register Reg = AMDGPU::VGPR1; + MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32); CCInfo.AllocateReg(Reg); Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg)); } if (Info.hasWorkItemIDZ()) { - unsigned Reg = AMDGPU::VGPR2; - MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass); + Register Reg = AMDGPU::VGPR2; + MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32); CCInfo.AllocateReg(Reg); Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg)); @@ -1642,7 +1673,8 @@ static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask = ~0u, assert(Reg != AMDGPU::NoRegister); MachineFunction &MF = CCInfo.getMachineFunction(); - MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass); + Register LiveInVReg = MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass); + MF.getRegInfo().setType(LiveInVReg, LLT::scalar(32)); return ArgDescriptor::createRegister(Reg, Mask); } @@ -1671,10 +1703,10 @@ static ArgDescriptor allocateSGPR64Input(CCState &CCInfo) { return allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_64RegClass, 16); } -static void allocateSpecialInputVGPRs(CCState &CCInfo, - MachineFunction &MF, - const SIRegisterInfo &TRI, - SIMachineFunctionInfo &Info) { +void SITargetLowering::allocateSpecialInputVGPRs(CCState &CCInfo, + MachineFunction &MF, + const SIRegisterInfo &TRI, + SIMachineFunctionInfo &Info) const { const unsigned Mask = 0x3ff; ArgDescriptor Arg; @@ -1692,10 +1724,11 @@ static void allocateSpecialInputVGPRs(CCState &CCInfo, Info.setWorkItemIDZ(allocateVGPR32Input(CCInfo, Mask << 20, Arg)); } -static void allocateSpecialInputSGPRs(CCState &CCInfo, - MachineFunction &MF, - const SIRegisterInfo &TRI, - SIMachineFunctionInfo &Info) { +void SITargetLowering::allocateSpecialInputSGPRs( + CCState &CCInfo, + MachineFunction &MF, + const SIRegisterInfo &TRI, + SIMachineFunctionInfo &Info) const { auto &ArgInfo = Info.getArgInfo(); // TODO: Unify handling with private memory pointers. @@ -1728,10 +1761,10 @@ static void allocateSpecialInputSGPRs(CCState &CCInfo, } // Allocate special inputs passed in user SGPRs. 
-static void allocateHSAUserSGPRs(CCState &CCInfo, - MachineFunction &MF, - const SIRegisterInfo &TRI, - SIMachineFunctionInfo &Info) { +void SITargetLowering::allocateHSAUserSGPRs(CCState &CCInfo, + MachineFunction &MF, + const SIRegisterInfo &TRI, + SIMachineFunctionInfo &Info) const { if (Info.hasImplicitBufferPtr()) { unsigned ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI); MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass); @@ -1758,9 +1791,12 @@ static void allocateHSAUserSGPRs(CCState &CCInfo, } if (Info.hasKernargSegmentPtr()) { - unsigned InputPtrReg = Info.addKernargSegmentPtr(TRI); - MF.addLiveIn(InputPtrReg, &AMDGPU::SGPR_64RegClass); + MachineRegisterInfo &MRI = MF.getRegInfo(); + Register InputPtrReg = Info.addKernargSegmentPtr(TRI); CCInfo.AllocateReg(InputPtrReg); + + Register VReg = MF.addLiveIn(InputPtrReg, &AMDGPU::SGPR_64RegClass); + MRI.setType(VReg, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); } if (Info.hasDispatchID()) { @@ -1780,32 +1816,32 @@ static void allocateHSAUserSGPRs(CCState &CCInfo, } // Allocate special input registers that are initialized per-wave. -static void allocateSystemSGPRs(CCState &CCInfo, - MachineFunction &MF, - SIMachineFunctionInfo &Info, - CallingConv::ID CallConv, - bool IsShader) { +void SITargetLowering::allocateSystemSGPRs(CCState &CCInfo, + MachineFunction &MF, + SIMachineFunctionInfo &Info, + CallingConv::ID CallConv, + bool IsShader) const { if (Info.hasWorkGroupIDX()) { unsigned Reg = Info.addWorkGroupIDX(); - MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass); + MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass); CCInfo.AllocateReg(Reg); } if (Info.hasWorkGroupIDY()) { unsigned Reg = Info.addWorkGroupIDY(); - MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass); + MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass); CCInfo.AllocateReg(Reg); } if (Info.hasWorkGroupIDZ()) { unsigned Reg = Info.addWorkGroupIDZ(); - MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass); + MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass); CCInfo.AllocateReg(Reg); } if (Info.hasWorkGroupInfo()) { unsigned Reg = Info.addWorkGroupInfo(); - MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass); + MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass); CCInfo.AllocateReg(Reg); } @@ -1860,7 +1896,7 @@ static void reservePrivateMemoryRegs(const TargetMachine &TM, // resource. For the Code Object V2 ABI, this will be the first 4 user // SGPR inputs. We can reserve those and use them directly. - unsigned PrivateSegmentBufferReg = + Register PrivateSegmentBufferReg = Info.getPreloadedReg(AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER); Info.setScratchRSrcReg(PrivateSegmentBufferReg); } else { @@ -1921,7 +1957,7 @@ static void reservePrivateMemoryRegs(const TargetMachine &TM, // // FIXME: Should not do this if inline asm is reading/writing these // registers. - unsigned PreloadedSP = Info.getPreloadedReg( + Register PreloadedSP = Info.getPreloadedReg( AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET); Info.setStackPtrOffsetReg(PreloadedSP); @@ -1971,7 +2007,7 @@ void SITargetLowering::insertCopiesSplitCSR( else llvm_unreachable("Unexpected register class in CSRsViaCopy!"); - unsigned NewVR = MRI->createVirtualRegister(RC); + Register NewVR = MRI->createVirtualRegister(RC); // Create copy from CSR to a virtual register. 
Entry->addLiveIn(*I); BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR) @@ -2134,7 +2170,7 @@ SDValue SITargetLowering::LowerFormalArguments( assert(VA.isRegLoc() && "Parameter must be in a register!"); - unsigned Reg = VA.getLocReg(); + Register Reg = VA.getLocReg(); const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT); EVT ValVT = VA.getValVT(); @@ -2652,6 +2688,15 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI, bool IsThisReturn = false; MachineFunction &MF = DAG.getMachineFunction(); + if (Callee.isUndef() || isNullConstant(Callee)) { + if (!CLI.IsTailCall) { + for (unsigned I = 0, E = CLI.Ins.size(); I != E; ++I) + InVals.push_back(DAG.getUNDEF(CLI.Ins[I].VT)); + } + + return Chain; + } + if (IsVarArg) { return lowerUnhandledCall(CLI, InVals, "unsupported call to variadic function "); @@ -2782,7 +2827,7 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI, int32_t Offset = LocMemOffset; SDValue PtrOff = DAG.getConstant(Offset, DL, PtrVT); - unsigned Align = 0; + MaybeAlign Alignment; if (IsTailCall) { ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags; @@ -2790,8 +2835,10 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI, Flags.getByValSize() : VA.getValVT().getStoreSize(); // FIXME: We can have better than the minimum byval required alignment. - Align = Flags.isByVal() ? Flags.getByValAlign() : - MinAlign(Subtarget->getStackAlignment(), Offset); + Alignment = + Flags.isByVal() + ? MaybeAlign(Flags.getByValAlign()) + : commonAlignment(Subtarget->getStackAlignment(), Offset); Offset = Offset + FPDiff; int FI = MFI.CreateFixedObject(OpSize, Offset, true); @@ -2810,7 +2857,8 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI, } else { DstAddr = PtrOff; DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset); - Align = MinAlign(Subtarget->getStackAlignment(), LocMemOffset); + Alignment = + commonAlignment(Subtarget->getStackAlignment(), LocMemOffset); } if (Outs[i].Flags.isByVal()) { @@ -2825,7 +2873,8 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI, MemOpChains.push_back(Cpy); } else { - SDValue Store = DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, Align); + SDValue Store = DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, + Alignment ? Alignment->value() : 0); MemOpChains.push_back(Store); } } @@ -2937,9 +2986,9 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI, IsThisReturn ? OutVals[0] : SDValue()); } -unsigned SITargetLowering::getRegisterByName(const char* RegName, EVT VT, - SelectionDAG &DAG) const { - unsigned Reg = StringSwitch(RegName) +Register SITargetLowering::getRegisterByName(const char* RegName, EVT VT, + const MachineFunction &MF) const { + Register Reg = StringSwitch(RegName) .Case("m0", AMDGPU::M0) .Case("exec", AMDGPU::EXEC) .Case("exec_lo", AMDGPU::EXEC_LO) @@ -2947,7 +2996,7 @@ unsigned SITargetLowering::getRegisterByName(const char* RegName, EVT VT, .Case("flat_scratch", AMDGPU::FLAT_SCR) .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO) .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI) - .Default(AMDGPU::NoRegister); + .Default(Register()); if (Reg == AMDGPU::NoRegister) { report_fatal_error(Twine("invalid register name \"" @@ -3055,6 +3104,20 @@ splitBlockForLoop(MachineInstr &MI, MachineBasicBlock &MBB, bool InstInLoop) { return std::make_pair(LoopBB, RemainderBB); } +/// Insert \p MI into a BUNDLE with an S_WAITCNT 0 immediately following it. 
+void SITargetLowering::bundleInstWithWaitcnt(MachineInstr &MI) const { + MachineBasicBlock *MBB = MI.getParent(); + const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); + auto I = MI.getIterator(); + auto E = std::next(I); + + BuildMI(*MBB, E, MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT)) + .addImm(0); + + MIBundleBuilder Bundler(*MBB, I, E); + finalizeBundle(*MBB, Bundler.begin()); +} + MachineBasicBlock * SITargetLowering::emitGWSMemViolTestLoop(MachineInstr &MI, MachineBasicBlock *BB) const { @@ -3066,12 +3129,13 @@ SITargetLowering::emitGWSMemViolTestLoop(MachineInstr &MI, MachineBasicBlock *RemainderBB; const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); - MachineBasicBlock::iterator Prev = std::prev(MI.getIterator()); + // Apparently kill flags are only valid if the def is in the same block? + if (MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0)) + Src->setIsKill(false); std::tie(LoopBB, RemainderBB) = splitBlockForLoop(MI, *BB, true); MachineBasicBlock::iterator I = LoopBB->end(); - MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0); const unsigned EncodedReg = AMDGPU::Hwreg::encodeHwreg( AMDGPU::Hwreg::ID_TRAPSTS, AMDGPU::Hwreg::OFFSET_MEM_VIOL, 1); @@ -3081,23 +3145,9 @@ SITargetLowering::emitGWSMemViolTestLoop(MachineInstr &MI, .addImm(0) .addImm(EncodedReg); - // This is a pain, but we're not allowed to have physical register live-ins - // yet. Insert a pair of copies if the VGPR0 hack is necessary. - if (Src && TargetRegisterInfo::isPhysicalRegister(Src->getReg())) { - unsigned Data0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); - BuildMI(*BB, std::next(Prev), DL, TII->get(AMDGPU::COPY), Data0) - .add(*Src); + bundleInstWithWaitcnt(MI); - BuildMI(*LoopBB, LoopBB->begin(), DL, TII->get(AMDGPU::COPY), Src->getReg()) - .addReg(Data0); - - MRI.setSimpleHint(Data0, Src->getReg()); - } - - BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_WAITCNT)) - .addImm(0); - - unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); + Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); // Load and check TRAP_STS.MEM_VIOL BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_GETREG_B32), Reg) @@ -3138,10 +3188,10 @@ static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop( MachineBasicBlock::iterator I = LoopBB.begin(); const TargetRegisterClass *BoolRC = TRI->getBoolRC(); - unsigned PhiExec = MRI.createVirtualRegister(BoolRC); - unsigned NewExec = MRI.createVirtualRegister(BoolRC); - unsigned CurrentIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); - unsigned CondReg = MRI.createVirtualRegister(BoolRC); + Register PhiExec = MRI.createVirtualRegister(BoolRC); + Register NewExec = MRI.createVirtualRegister(BoolRC); + Register CurrentIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); + Register CondReg = MRI.createVirtualRegister(BoolRC); BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiReg) .addReg(InitReg) @@ -3240,9 +3290,9 @@ static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII, MachineBasicBlock::iterator I(&MI); const auto *BoolXExecRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID); - unsigned DstReg = MI.getOperand(0).getReg(); - unsigned SaveExec = MRI.createVirtualRegister(BoolXExecRC); - unsigned TmpExec = MRI.createVirtualRegister(BoolXExecRC); + Register DstReg = MI.getOperand(0).getReg(); + Register SaveExec = MRI.createVirtualRegister(BoolXExecRC); + Register TmpExec = MRI.createVirtualRegister(BoolXExecRC); unsigned Exec = ST.isWave32() ? 
AMDGPU::EXEC_LO : AMDGPU::EXEC; unsigned MovExecOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; @@ -3315,7 +3365,7 @@ static bool setM0ToIndexFromSGPR(const SIInstrInfo *TII, SetOn->getOperand(3).setIsUndef(); } else { - unsigned Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); + Register Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), Tmp) .add(*Idx) .addImm(Offset); @@ -3351,8 +3401,8 @@ static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI, MachineFunction *MF = MBB.getParent(); MachineRegisterInfo &MRI = MF->getRegInfo(); - unsigned Dst = MI.getOperand(0).getReg(); - unsigned SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg(); + Register Dst = MI.getOperand(0).getReg(); + Register SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg(); int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm(); const TargetRegisterClass *VecRC = MRI.getRegClass(SrcReg); @@ -3390,8 +3440,8 @@ static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI, const DebugLoc &DL = MI.getDebugLoc(); MachineBasicBlock::iterator I(&MI); - unsigned PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); - unsigned InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + Register PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + Register InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), InitReg); @@ -3442,7 +3492,7 @@ static MachineBasicBlock *emitIndirectDst(MachineInstr &MI, MachineFunction *MF = MBB.getParent(); MachineRegisterInfo &MRI = MF->getRegInfo(); - unsigned Dst = MI.getOperand(0).getReg(); + Register Dst = MI.getOperand(0).getReg(); const MachineOperand *SrcVec = TII->getNamedOperand(MI, AMDGPU::OpName::src); const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx); const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val); @@ -3505,7 +3555,7 @@ static MachineBasicBlock *emitIndirectDst(MachineInstr &MI, const DebugLoc &DL = MI.getDebugLoc(); - unsigned PhiReg = MRI.createVirtualRegister(VecRC); + Register PhiReg = MRI.createVirtualRegister(VecRC); auto InsPt = loadM0FromVGPR(TII, MBB, MI, SrcVec->getReg(), PhiReg, Offset, UseGPRIdxMode, false); @@ -3564,22 +3614,22 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter( MachineOperand &Src0 = MI.getOperand(1); MachineOperand &Src1 = MI.getOperand(2); - unsigned DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); - unsigned DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); + Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); + Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(MI, MRI, Src0, BoolRC, AMDGPU::sub0, - &AMDGPU::SReg_32_XM0RegClass); + &AMDGPU::SReg_32RegClass); MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(MI, MRI, Src0, BoolRC, AMDGPU::sub1, - &AMDGPU::SReg_32_XM0RegClass); + &AMDGPU::SReg_32RegClass); MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(MI, MRI, Src1, BoolRC, AMDGPU::sub0, - &AMDGPU::SReg_32_XM0RegClass); + &AMDGPU::SReg_32RegClass); MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(MI, MRI, Src1, BoolRC, AMDGPU::sub1, - &AMDGPU::SReg_32_XM0RegClass); + &AMDGPU::SReg_32RegClass); bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO); @@ -3632,8 +3682,8 @@ MachineBasicBlock 
*SITargetLowering::EmitInstrWithCustomInserter( // S_CMOV_B64 exec, -1 MachineInstr *FirstMI = &*BB->begin(); MachineRegisterInfo &MRI = MF->getRegInfo(); - unsigned InputReg = MI.getOperand(0).getReg(); - unsigned CountReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); + Register InputReg = MI.getOperand(0).getReg(); + Register CountReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); bool Found = false; // Move the COPY of the input reg to the beginning, so that we can use it. @@ -3707,16 +3757,16 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter( const GCNSubtarget &ST = MF->getSubtarget(); const SIRegisterInfo *TRI = ST.getRegisterInfo(); - unsigned Dst = MI.getOperand(0).getReg(); - unsigned Src0 = MI.getOperand(1).getReg(); - unsigned Src1 = MI.getOperand(2).getReg(); + Register Dst = MI.getOperand(0).getReg(); + Register Src0 = MI.getOperand(1).getReg(); + Register Src1 = MI.getOperand(2).getReg(); const DebugLoc &DL = MI.getDebugLoc(); - unsigned SrcCond = MI.getOperand(3).getReg(); + Register SrcCond = MI.getOperand(3).getReg(); - unsigned DstLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); - unsigned DstHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + Register DstLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + Register DstHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); const auto *CondRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID); - unsigned SrcCondCopy = MRI.createVirtualRegister(CondRC); + Register SrcCondCopy = MRI.createVirtualRegister(CondRC); BuildMI(*BB, MI, DL, TII->get(AMDGPU::COPY), SrcCondCopy) .addReg(SrcCond); @@ -3814,8 +3864,12 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter( case AMDGPU::DS_GWS_SEMA_P: case AMDGPU::DS_GWS_SEMA_RELEASE_ALL: case AMDGPU::DS_GWS_BARRIER: - if (getSubtarget()->hasGWSAutoReplay()) + // A s_waitcnt 0 is required to be the instruction immediately following. 
+ if (getSubtarget()->hasGWSAutoReplay()) { + bundleInstWithWaitcnt(MI); return BB; + } + return emitGWSMemViolTestLoop(MI, BB); default: return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB); @@ -3939,6 +3993,30 @@ SDValue SITargetLowering::splitBinaryVectorOp(SDValue Op, return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi); } +SDValue SITargetLowering::splitTernaryVectorOp(SDValue Op, + SelectionDAG &DAG) const { + unsigned Opc = Op.getOpcode(); + EVT VT = Op.getValueType(); + assert(VT == MVT::v4i16 || VT == MVT::v4f16); + + SDValue Lo0, Hi0; + std::tie(Lo0, Hi0) = DAG.SplitVectorOperand(Op.getNode(), 0); + SDValue Lo1, Hi1; + std::tie(Lo1, Hi1) = DAG.SplitVectorOperand(Op.getNode(), 1); + SDValue Lo2, Hi2; + std::tie(Lo2, Hi2) = DAG.SplitVectorOperand(Op.getNode(), 2); + + SDLoc SL(Op); + + SDValue OpLo = DAG.getNode(Opc, SL, Lo0.getValueType(), Lo0, Lo1, Lo2, + Op->getFlags()); + SDValue OpHi = DAG.getNode(Opc, SL, Hi0.getValueType(), Hi0, Hi1, Hi2, + Op->getFlags()); + + return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi); +} + + SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { switch (Op.getOpcode()) { default: return AMDGPUTargetLowering::LowerOperation(Op, DAG); @@ -3991,6 +4069,8 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::FMINNUM: case ISD::FMAXNUM: return lowerFMINNUM_FMAXNUM(Op, DAG); + case ISD::FMA: + return splitTernaryVectorOp(Op, DAG); case ISD::SHL: case ISD::SRA: case ISD::SRL: @@ -4070,6 +4150,41 @@ SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode, return DAG.getMergeValues({ Adjusted, Load.getValue(1) }, DL); } +SDValue SITargetLowering::lowerIntrinsicLoad(MemSDNode *M, bool IsFormat, + SelectionDAG &DAG, + ArrayRef Ops) const { + SDLoc DL(M); + EVT LoadVT = M->getValueType(0); + EVT EltType = LoadVT.getScalarType(); + EVT IntVT = LoadVT.changeTypeToInteger(); + + bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16); + + unsigned Opc = + IsFormat ? 
AMDGPUISD::BUFFER_LOAD_FORMAT : AMDGPUISD::BUFFER_LOAD; + + if (IsD16) { + return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16, M, DAG, Ops); + } + + // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics + if (!IsD16 && !LoadVT.isVector() && EltType.getSizeInBits() < 32) + return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M); + + if (isTypeLegal(LoadVT)) { + return getMemIntrinsicNode(Opc, DL, M->getVTList(), Ops, IntVT, + M->getMemOperand(), DAG); + } + + EVT CastVT = getEquivalentMemType(*DAG.getContext(), LoadVT); + SDVTList VTList = DAG.getVTList(CastVT, MVT::Other); + SDValue MemNode = getMemIntrinsicNode(Opc, DL, VTList, Ops, CastVT, + M->getMemOperand(), DAG); + return DAG.getMergeValues( + {DAG.getNode(ISD::BITCAST, DL, LoadVT, MemNode), MemNode.getValue(1)}, + DL); +} + static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG) { EVT VT = N->getValueType(0); @@ -4196,8 +4311,14 @@ void SITargetLowering::ReplaceNodeResults(SDNode *N, } case ISD::INTRINSIC_W_CHAIN: { if (SDValue Res = LowerINTRINSIC_W_CHAIN(SDValue(N, 0), DAG)) { - Results.push_back(Res); - Results.push_back(Res.getValue(1)); + if (Res.getOpcode() == ISD::MERGE_VALUES) { + // FIXME: Hacky + Results.push_back(Res.getOperand(0)); + Results.push_back(Res.getOperand(1)); + } else { + Results.push_back(Res); + Results.push_back(Res.getValue(1)); + } return; } @@ -4935,11 +5056,8 @@ buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV, // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too // small. This requires us to add 4 to the global variable offset in order to // compute the correct address. - unsigned LoFlags = GAFlags; - if (LoFlags == SIInstrInfo::MO_NONE) - LoFlags = SIInstrInfo::MO_REL32; SDValue PtrLo = - DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset + 4, LoFlags); + DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset + 4, GAFlags); SDValue PtrHi; if (GAFlags == SIInstrInfo::MO_NONE) { PtrHi = DAG.getTargetConstant(0, DL, MVT::i32); @@ -5563,14 +5681,14 @@ SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc, SDVTList VTList = DAG.getVTList({LoadVT, MVT::Glue}); unsigned CachePolicy = cast(GLC)->getZExtValue(); SDValue Ops[] = { - DAG.getEntryNode(), // Chain - Rsrc, // rsrc - DAG.getConstant(0, DL, MVT::i32), // vindex - {}, // voffset - {}, // soffset - {}, // offset - DAG.getConstant(CachePolicy, DL, MVT::i32), // cachepolicy - DAG.getConstant(0, DL, MVT::i1), // idxen + DAG.getEntryNode(), // Chain + Rsrc, // rsrc + DAG.getConstant(0, DL, MVT::i32), // vindex + {}, // voffset + {}, // soffset + {}, // offset + DAG.getTargetConstant(CachePolicy, DL, MVT::i32), // cachepolicy + DAG.getTargetConstant(0, DL, MVT::i1), // idxen }; // Use the alignment to ensure that the required offsets will fit into the @@ -5579,7 +5697,7 @@ SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc, uint64_t InstOffset = cast(Ops[5])->getZExtValue(); for (unsigned i = 0; i < NumLoads; ++i) { - Ops[5] = DAG.getConstant(InstOffset + 16 * i, DL, MVT::i32); + Ops[5] = DAG.getTargetConstant(InstOffset + 16 * i, DL, MVT::i32); Loads.push_back(DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_LOAD, DL, VTList, Ops, LoadVT, MMO)); } @@ -5758,45 +5876,31 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, } case Intrinsic::amdgcn_fdiv_fast: return lowerFDIV_FAST(Op, DAG); - case Intrinsic::amdgcn_interp_mov: { - SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(4)); - SDValue Glue = 
M0.getValue(1); - return DAG.getNode(AMDGPUISD::INTERP_MOV, DL, MVT::f32, Op.getOperand(1), - Op.getOperand(2), Op.getOperand(3), Glue); - } - case Intrinsic::amdgcn_interp_p1: { - SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(4)); - SDValue Glue = M0.getValue(1); - return DAG.getNode(AMDGPUISD::INTERP_P1, DL, MVT::f32, Op.getOperand(1), - Op.getOperand(2), Op.getOperand(3), Glue); - } - case Intrinsic::amdgcn_interp_p2: { - SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(5)); - SDValue Glue = SDValue(M0.getNode(), 1); - return DAG.getNode(AMDGPUISD::INTERP_P2, DL, MVT::f32, Op.getOperand(1), - Op.getOperand(2), Op.getOperand(3), Op.getOperand(4), - Glue); - } case Intrinsic::amdgcn_interp_p1_f16: { - SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(5)); - SDValue Glue = M0.getValue(1); + SDValue ToM0 = DAG.getCopyToReg(DAG.getEntryNode(), DL, AMDGPU::M0, + Op.getOperand(5), SDValue()); if (getSubtarget()->getLDSBankCount() == 16) { // 16 bank LDS - SDValue S = DAG.getNode(AMDGPUISD::INTERP_MOV, DL, MVT::f32, - DAG.getConstant(2, DL, MVT::i32), // P0 - Op.getOperand(2), // Attrchan - Op.getOperand(3), // Attr - Glue); + + // FIXME: This implicitly will insert a second CopyToReg to M0. + SDValue S = DAG.getNode( + ISD::INTRINSIC_WO_CHAIN, DL, MVT::f32, + DAG.getTargetConstant(Intrinsic::amdgcn_interp_mov, DL, MVT::i32), + DAG.getConstant(2, DL, MVT::i32), // P0 + Op.getOperand(2), // Attrchan + Op.getOperand(3), // Attr + Op.getOperand(5)); // m0 + SDValue Ops[] = { Op.getOperand(1), // Src0 Op.getOperand(2), // Attrchan Op.getOperand(3), // Attr - DAG.getConstant(0, DL, MVT::i32), // $src0_modifiers + DAG.getTargetConstant(0, DL, MVT::i32), // $src0_modifiers S, // Src2 - holds two f16 values selected by high - DAG.getConstant(0, DL, MVT::i32), // $src2_modifiers + DAG.getTargetConstant(0, DL, MVT::i32), // $src2_modifiers Op.getOperand(4), // high - DAG.getConstant(0, DL, MVT::i1), // $clamp - DAG.getConstant(0, DL, MVT::i32) // $omod + DAG.getTargetConstant(0, DL, MVT::i1), // $clamp + DAG.getTargetConstant(0, DL, MVT::i32) // $omod }; return DAG.getNode(AMDGPUISD::INTERP_P1LV_F16, DL, MVT::f32, Ops); } else { @@ -5805,28 +5909,28 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, Op.getOperand(1), // Src0 Op.getOperand(2), // Attrchan Op.getOperand(3), // Attr - DAG.getConstant(0, DL, MVT::i32), // $src0_modifiers + DAG.getTargetConstant(0, DL, MVT::i32), // $src0_modifiers Op.getOperand(4), // high - DAG.getConstant(0, DL, MVT::i1), // $clamp - DAG.getConstant(0, DL, MVT::i32), // $omod - Glue + DAG.getTargetConstant(0, DL, MVT::i1), // $clamp + DAG.getTargetConstant(0, DL, MVT::i32), // $omod + ToM0.getValue(1) }; return DAG.getNode(AMDGPUISD::INTERP_P1LL_F16, DL, MVT::f32, Ops); } } case Intrinsic::amdgcn_interp_p2_f16: { - SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(6)); - SDValue Glue = SDValue(M0.getNode(), 1); + SDValue ToM0 = DAG.getCopyToReg(DAG.getEntryNode(), DL, AMDGPU::M0, + Op.getOperand(6), SDValue()); SDValue Ops[] = { Op.getOperand(2), // Src0 Op.getOperand(3), // Attrchan Op.getOperand(4), // Attr - DAG.getConstant(0, DL, MVT::i32), // $src0_modifiers + DAG.getTargetConstant(0, DL, MVT::i32), // $src0_modifiers Op.getOperand(1), // Src2 - DAG.getConstant(0, DL, MVT::i32), // $src2_modifiers + DAG.getTargetConstant(0, DL, MVT::i32), // $src2_modifiers Op.getOperand(5), // high - DAG.getConstant(0, DL, MVT::i1), // $clamp - Glue + DAG.getTargetConstant(0, DL, MVT::i1), // $clamp + 
ToM0.getValue(1) }; return DAG.getNode(AMDGPUISD::INTERP_P2_F16, DL, MVT::f16, Ops); } @@ -5947,16 +6051,6 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, Op.getOperand(1), Op.getOperand(2)); return DAG.getNode(ISD::BITCAST, DL, VT, Node); } - case Intrinsic::amdgcn_wqm: { - SDValue Src = Op.getOperand(1); - return SDValue(DAG.getMachineNode(AMDGPU::WQM, DL, Src.getValueType(), Src), - 0); - } - case Intrinsic::amdgcn_wwm: { - SDValue Src = Op.getOperand(1); - return SDValue(DAG.getMachineNode(AMDGPU::WWM, DL, Src.getValueType(), Src), - 0); - } case Intrinsic::amdgcn_fmad_ftz: return DAG.getNode(AMDGPUISD::FMAD_FTZ, DL, VT, Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); @@ -5977,6 +6071,19 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SIInstrInfo::MO_ABS32_LO); return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0}; } + case Intrinsic::amdgcn_is_shared: + case Intrinsic::amdgcn_is_private: { + SDLoc SL(Op); + unsigned AS = (IntrinsicID == Intrinsic::amdgcn_is_shared) ? + AMDGPUAS::LOCAL_ADDRESS : AMDGPUAS::PRIVATE_ADDRESS; + SDValue Aperture = getSegmentAperture(AS, SL, DAG); + SDValue SrcVec = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, + Op.getOperand(1)); + + SDValue SrcHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, SrcVec, + DAG.getConstant(1, SL, MVT::i32)); + return DAG.getSetCC(SL, MVT::i1, SrcHi, Aperture, ISD::SETEQ); + } default: if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = AMDGPU::getImageDimIntrinsicInfo(IntrinsicID)) @@ -5986,6 +6093,30 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, } } +// This function computes an appropriate offset to pass to +// MachineMemOperand::setOffset() based on the offset inputs to +// an intrinsic. If any of the offsets are non-contstant or +// if VIndex is non-zero then this function returns 0. Otherwise, +// it returns the sum of VOffset, SOffset, and Offset. +static unsigned getBufferOffsetForMMO(SDValue VOffset, + SDValue SOffset, + SDValue Offset, + SDValue VIndex = SDValue()) { + + if (!isa(VOffset) || !isa(SOffset) || + !isa(Offset)) + return 0; + + if (VIndex) { + if (!isa(VIndex) || !cast(VIndex)->isNullValue()) + return 0; + } + + return cast(VOffset)->getSExtValue() + + cast(SOffset)->getSExtValue() + + cast(Offset)->getSExtValue(); +} + SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) const { unsigned IntrID = cast(Op.getOperand(1))->getZExtValue(); @@ -6128,17 +6259,22 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, SDValue(), // voffset -- will be set by setBufferOffsets SDValue(), // soffset -- will be set by setBufferOffsets SDValue(), // offset -- will be set by setBufferOffsets - DAG.getConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy - DAG.getConstant(IdxEn, DL, MVT::i1), // idxen + DAG.getTargetConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy + DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen }; - setBufferOffsets(Op.getOperand(4), DAG, &Ops[3]); + unsigned Offset = setBufferOffsets(Op.getOperand(4), DAG, &Ops[3]); + // We don't know the offset if vindex is non-zero, so clear it. + if (IdxEn) + Offset = 0; + unsigned Opc = (IntrID == Intrinsic::amdgcn_buffer_load) ? 
AMDGPUISD::BUFFER_LOAD : AMDGPUISD::BUFFER_LOAD_FORMAT; EVT VT = Op.getValueType(); EVT IntVT = VT.changeTypeToInteger(); auto *M = cast(Op); + M->getMemOperand()->setOffset(Offset); EVT LoadVT = Op.getValueType(); if (LoadVT.getScalarType() == MVT::f16) @@ -6155,6 +6291,8 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, } case Intrinsic::amdgcn_raw_buffer_load: case Intrinsic::amdgcn_raw_buffer_load_format: { + const bool IsFormat = IntrID == Intrinsic::amdgcn_raw_buffer_load_format; + auto Offsets = splitBufferOffsets(Op.getOperand(3), DAG); SDValue Ops[] = { Op.getOperand(0), // Chain @@ -6163,32 +6301,18 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, Offsets.first, // voffset Op.getOperand(4), // soffset Offsets.second, // offset - Op.getOperand(5), // cachepolicy - DAG.getConstant(0, DL, MVT::i1), // idxen + Op.getOperand(5), // cachepolicy, swizzled buffer + DAG.getTargetConstant(0, DL, MVT::i1), // idxen }; - unsigned Opc = (IntrID == Intrinsic::amdgcn_raw_buffer_load) ? - AMDGPUISD::BUFFER_LOAD : AMDGPUISD::BUFFER_LOAD_FORMAT; - - EVT VT = Op.getValueType(); - EVT IntVT = VT.changeTypeToInteger(); auto *M = cast(Op); - EVT LoadVT = Op.getValueType(); - - if (LoadVT.getScalarType() == MVT::f16) - return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16, - M, DAG, Ops); - - // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics - if (LoadVT.getScalarType() == MVT::i8 || - LoadVT.getScalarType() == MVT::i16) - return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M); - - return getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT, - M->getMemOperand(), DAG); + M->getMemOperand()->setOffset(getBufferOffsetForMMO(Ops[3], Ops[4], Ops[5])); + return lowerIntrinsicLoad(M, IsFormat, DAG, Ops); } case Intrinsic::amdgcn_struct_buffer_load: case Intrinsic::amdgcn_struct_buffer_load_format: { + const bool IsFormat = IntrID == Intrinsic::amdgcn_struct_buffer_load_format; + auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG); SDValue Ops[] = { Op.getOperand(0), // Chain @@ -6197,29 +6321,14 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, Offsets.first, // voffset Op.getOperand(5), // soffset Offsets.second, // offset - Op.getOperand(6), // cachepolicy - DAG.getConstant(1, DL, MVT::i1), // idxen + Op.getOperand(6), // cachepolicy, swizzled buffer + DAG.getTargetConstant(1, DL, MVT::i1), // idxen }; - unsigned Opc = (IntrID == Intrinsic::amdgcn_struct_buffer_load) ? 
- AMDGPUISD::BUFFER_LOAD : AMDGPUISD::BUFFER_LOAD_FORMAT; - - EVT VT = Op.getValueType(); - EVT IntVT = VT.changeTypeToInteger(); auto *M = cast(Op); - EVT LoadVT = Op.getValueType(); - - if (LoadVT.getScalarType() == MVT::f16) - return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16, - M, DAG, Ops); - - // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics - if (LoadVT.getScalarType() == MVT::i8 || - LoadVT.getScalarType() == MVT::i16) - return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M); - - return getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT, - M->getMemOperand(), DAG); + M->getMemOperand()->setOffset(getBufferOffsetForMMO(Ops[3], Ops[4], Ops[5], + Ops[2])); + return lowerIntrinsicLoad(cast(Op), IsFormat, DAG, Ops); } case Intrinsic::amdgcn_tbuffer_load: { MemSDNode *M = cast(Op); @@ -6239,9 +6348,9 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, Op.getOperand(4), // voffset Op.getOperand(5), // soffset Op.getOperand(6), // offset - DAG.getConstant(Dfmt | (Nfmt << 4), DL, MVT::i32), // format - DAG.getConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy - DAG.getConstant(IdxEn, DL, MVT::i1), // idxen + DAG.getTargetConstant(Dfmt | (Nfmt << 4), DL, MVT::i32), // format + DAG.getTargetConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy + DAG.getTargetConstant(IdxEn, DL, MVT::i1) // idxen }; if (LoadVT.getScalarType() == MVT::f16) @@ -6264,8 +6373,8 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, Op.getOperand(4), // soffset Offsets.second, // offset Op.getOperand(5), // format - Op.getOperand(6), // cachepolicy - DAG.getConstant(0, DL, MVT::i1), // idxen + Op.getOperand(6), // cachepolicy, swizzled buffer + DAG.getTargetConstant(0, DL, MVT::i1), // idxen }; if (LoadVT.getScalarType() == MVT::f16) @@ -6288,8 +6397,8 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, Op.getOperand(5), // soffset Offsets.second, // offset Op.getOperand(6), // format - Op.getOperand(7), // cachepolicy - DAG.getConstant(1, DL, MVT::i1), // idxen + Op.getOperand(7), // cachepolicy, swizzled buffer + DAG.getTargetConstant(1, DL, MVT::i1), // idxen }; if (LoadVT.getScalarType() == MVT::f16) @@ -6321,13 +6430,17 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, SDValue(), // voffset -- will be set by setBufferOffsets SDValue(), // soffset -- will be set by setBufferOffsets SDValue(), // offset -- will be set by setBufferOffsets - DAG.getConstant(Slc << 1, DL, MVT::i32), // cachepolicy - DAG.getConstant(IdxEn, DL, MVT::i1), // idxen + DAG.getTargetConstant(Slc << 1, DL, MVT::i32), // cachepolicy + DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen }; - setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]); + unsigned Offset = setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]); + // We don't know the offset if vindex is non-zero, so clear it. 
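// Illustrative aside (not part of the patch): several hunks above start
// attaching a byte offset to the buffer intrinsics' MachineMemOperand via
// getBufferOffsetForMMO, which reports an offset only when voffset, soffset,
// offset (and vindex, if present) are all constants and vindex is zero;
// otherwise it conservatively returns 0. The standalone sketch below mirrors
// that rule with plain optionals (mmoOffset is an assumption made for the
// example, not the helper added by the patch).

#include <cassert>
#include <cstdio>
#include <optional>

using MaybeInt = std::optional<long>;  // nullopt models a non-constant operand

static long mmoOffset(MaybeInt VOffset, MaybeInt SOffset, MaybeInt Offset,
                      MaybeInt VIndex = 0) {
  if (!VOffset || !SOffset || !Offset)
    return 0;                        // some component is not a known constant
  if (!VIndex || *VIndex != 0)
    return 0;                        // indexed access: base offset is unknown
  return *VOffset + *SOffset + *Offset;
}

int main() {
  assert(mmoOffset(16, 4, 8) == 28);               // all constants, no index
  assert(mmoOffset(std::nullopt, 4, 8) == 0);      // non-constant voffset
  assert(mmoOffset(16, 4, 8, /*VIndex=*/2) == 0);  // non-zero index
  std::printf("mmo-offset sketch ok\n");
  return 0;
}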
+ if (IdxEn) + Offset = 0; EVT VT = Op.getValueType(); auto *M = cast(Op); + M->getMemOperand()->setOffset(Offset); unsigned Opcode = 0; switch (IntrID) { @@ -6377,7 +6490,9 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, case Intrinsic::amdgcn_raw_buffer_atomic_umax: case Intrinsic::amdgcn_raw_buffer_atomic_and: case Intrinsic::amdgcn_raw_buffer_atomic_or: - case Intrinsic::amdgcn_raw_buffer_atomic_xor: { + case Intrinsic::amdgcn_raw_buffer_atomic_xor: + case Intrinsic::amdgcn_raw_buffer_atomic_inc: + case Intrinsic::amdgcn_raw_buffer_atomic_dec: { auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG); SDValue Ops[] = { Op.getOperand(0), // Chain @@ -6388,11 +6503,12 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, Op.getOperand(5), // soffset Offsets.second, // offset Op.getOperand(6), // cachepolicy - DAG.getConstant(0, DL, MVT::i1), // idxen + DAG.getTargetConstant(0, DL, MVT::i1), // idxen }; EVT VT = Op.getValueType(); auto *M = cast(Op); + M->getMemOperand()->setOffset(getBufferOffsetForMMO(Ops[4], Ops[5], Ops[6])); unsigned Opcode = 0; switch (IntrID) { @@ -6426,6 +6542,12 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, case Intrinsic::amdgcn_raw_buffer_atomic_xor: Opcode = AMDGPUISD::BUFFER_ATOMIC_XOR; break; + case Intrinsic::amdgcn_raw_buffer_atomic_inc: + Opcode = AMDGPUISD::BUFFER_ATOMIC_INC; + break; + case Intrinsic::amdgcn_raw_buffer_atomic_dec: + Opcode = AMDGPUISD::BUFFER_ATOMIC_DEC; + break; default: llvm_unreachable("unhandled atomic opcode"); } @@ -6442,7 +6564,9 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, case Intrinsic::amdgcn_struct_buffer_atomic_umax: case Intrinsic::amdgcn_struct_buffer_atomic_and: case Intrinsic::amdgcn_struct_buffer_atomic_or: - case Intrinsic::amdgcn_struct_buffer_atomic_xor: { + case Intrinsic::amdgcn_struct_buffer_atomic_xor: + case Intrinsic::amdgcn_struct_buffer_atomic_inc: + case Intrinsic::amdgcn_struct_buffer_atomic_dec: { auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG); SDValue Ops[] = { Op.getOperand(0), // Chain @@ -6453,11 +6577,13 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, Op.getOperand(6), // soffset Offsets.second, // offset Op.getOperand(7), // cachepolicy - DAG.getConstant(1, DL, MVT::i1), // idxen + DAG.getTargetConstant(1, DL, MVT::i1), // idxen }; EVT VT = Op.getValueType(); auto *M = cast(Op); + M->getMemOperand()->setOffset(getBufferOffsetForMMO(Ops[4], Ops[5], Ops[6], + Ops[3])); unsigned Opcode = 0; switch (IntrID) { @@ -6491,6 +6617,12 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, case Intrinsic::amdgcn_struct_buffer_atomic_xor: Opcode = AMDGPUISD::BUFFER_ATOMIC_XOR; break; + case Intrinsic::amdgcn_struct_buffer_atomic_inc: + Opcode = AMDGPUISD::BUFFER_ATOMIC_INC; + break; + case Intrinsic::amdgcn_struct_buffer_atomic_dec: + Opcode = AMDGPUISD::BUFFER_ATOMIC_DEC; + break; default: llvm_unreachable("unhandled atomic opcode"); } @@ -6512,12 +6644,16 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, SDValue(), // voffset -- will be set by setBufferOffsets SDValue(), // soffset -- will be set by setBufferOffsets SDValue(), // offset -- will be set by setBufferOffsets - DAG.getConstant(Slc << 1, DL, MVT::i32), // cachepolicy - DAG.getConstant(IdxEn, DL, MVT::i1), // idxen + DAG.getTargetConstant(Slc << 1, DL, MVT::i32), // cachepolicy + DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen }; - setBufferOffsets(Op.getOperand(6), DAG, &Ops[5]); + unsigned Offset = setBufferOffsets(Op.getOperand(6), 
DAG, &Ops[5]); + // We don't know the offset if vindex is non-zero, so clear it. + if (IdxEn) + Offset = 0; EVT VT = Op.getValueType(); auto *M = cast(Op); + M->getMemOperand()->setOffset(Offset); return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL, Op->getVTList(), Ops, VT, M->getMemOperand()); @@ -6534,10 +6670,11 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, Op.getOperand(6), // soffset Offsets.second, // offset Op.getOperand(7), // cachepolicy - DAG.getConstant(0, DL, MVT::i1), // idxen + DAG.getTargetConstant(0, DL, MVT::i1), // idxen }; EVT VT = Op.getValueType(); auto *M = cast(Op); + M->getMemOperand()->setOffset(getBufferOffsetForMMO(Ops[5], Ops[6], Ops[7])); return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL, Op->getVTList(), Ops, VT, M->getMemOperand()); @@ -6554,10 +6691,12 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, Op.getOperand(7), // soffset Offsets.second, // offset Op.getOperand(8), // cachepolicy - DAG.getConstant(1, DL, MVT::i1), // idxen + DAG.getTargetConstant(1, DL, MVT::i1), // idxen }; EVT VT = Op.getValueType(); auto *M = cast(Op); + M->getMemOperand()->setOffset(getBufferOffsetForMMO(Ops[5], Ops[6], Ops[7], + Ops[4])); return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL, Op->getVTList(), Ops, VT, M->getMemOperand()); @@ -6686,23 +6825,6 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, AMDGPUISD::EXPORT : AMDGPUISD::EXPORT_DONE; return DAG.getNode(Opc, DL, Op->getVTList(), Ops); } - case Intrinsic::amdgcn_s_sendmsg: - case Intrinsic::amdgcn_s_sendmsghalt: { - unsigned NodeOp = (IntrinsicID == Intrinsic::amdgcn_s_sendmsg) ? - AMDGPUISD::SENDMSG : AMDGPUISD::SENDMSGHALT; - Chain = copyToM0(DAG, Chain, DL, Op.getOperand(3)); - SDValue Glue = Chain.getValue(1); - return DAG.getNode(NodeOp, DL, MVT::Other, Chain, - Op.getOperand(2), Glue); - } - case Intrinsic::amdgcn_init_exec: { - return DAG.getNode(AMDGPUISD::INIT_EXEC, DL, MVT::Other, Chain, - Op.getOperand(2)); - } - case Intrinsic::amdgcn_init_exec_from_input: { - return DAG.getNode(AMDGPUISD::INIT_EXEC_FROM_INPUT, DL, MVT::Other, Chain, - Op.getOperand(2), Op.getOperand(3)); - } case Intrinsic::amdgcn_s_barrier: { if (getTargetMachine().getOptLevel() > CodeGenOpt::None) { const GCNSubtarget &ST = MF.getSubtarget(); @@ -6733,9 +6855,9 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, Op.getOperand(5), // voffset Op.getOperand(6), // soffset Op.getOperand(7), // offset - DAG.getConstant(Dfmt | (Nfmt << 4), DL, MVT::i32), // format - DAG.getConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy - DAG.getConstant(IdxEn, DL, MVT::i1), // idexen + DAG.getTargetConstant(Dfmt | (Nfmt << 4), DL, MVT::i32), // format + DAG.getTargetConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy + DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idexen }; unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 : AMDGPUISD::TBUFFER_STORE_FORMAT; @@ -6759,8 +6881,8 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, Op.getOperand(6), // soffset Offsets.second, // offset Op.getOperand(7), // format - Op.getOperand(8), // cachepolicy - DAG.getConstant(1, DL, MVT::i1), // idexen + Op.getOperand(8), // cachepolicy, swizzled buffer + DAG.getTargetConstant(1, DL, MVT::i1), // idexen }; unsigned Opc = IsD16 ? 
AMDGPUISD::TBUFFER_STORE_FORMAT_D16 : AMDGPUISD::TBUFFER_STORE_FORMAT; @@ -6784,8 +6906,8 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, Op.getOperand(5), // soffset Offsets.second, // offset Op.getOperand(6), // format - Op.getOperand(7), // cachepolicy - DAG.getConstant(0, DL, MVT::i1), // idexen + Op.getOperand(7), // cachepolicy, swizzled buffer + DAG.getTargetConstant(0, DL, MVT::i1), // idexen }; unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 : AMDGPUISD::TBUFFER_STORE_FORMAT; @@ -6813,14 +6935,18 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, SDValue(), // voffset -- will be set by setBufferOffsets SDValue(), // soffset -- will be set by setBufferOffsets SDValue(), // offset -- will be set by setBufferOffsets - DAG.getConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy - DAG.getConstant(IdxEn, DL, MVT::i1), // idxen + DAG.getTargetConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy + DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen }; - setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]); + unsigned Offset = setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]); + // We don't know the offset if vindex is non-zero, so clear it. + if (IdxEn) + Offset = 0; unsigned Opc = IntrinsicID == Intrinsic::amdgcn_buffer_store ? AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT; Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc; MemSDNode *M = cast(Op); + M->getMemOperand()->setOffset(Offset); // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics EVT VDataType = VData.getValueType().getScalarType(); @@ -6833,10 +6959,22 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, case Intrinsic::amdgcn_raw_buffer_store: case Intrinsic::amdgcn_raw_buffer_store_format: { + const bool IsFormat = + IntrinsicID == Intrinsic::amdgcn_raw_buffer_store_format; + SDValue VData = Op.getOperand(2); - bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16); + EVT VDataVT = VData.getValueType(); + EVT EltType = VDataVT.getScalarType(); + bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16); if (IsD16) VData = handleD16VData(VData, DAG); + + if (!isTypeLegal(VDataVT)) { + VData = + DAG.getNode(ISD::BITCAST, DL, + getEquivalentMemType(*DAG.getContext(), VDataVT), VData); + } + auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG); SDValue Ops[] = { Chain, @@ -6846,18 +6984,18 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, Offsets.first, // voffset Op.getOperand(5), // soffset Offsets.second, // offset - Op.getOperand(6), // cachepolicy - DAG.getConstant(0, DL, MVT::i1), // idxen + Op.getOperand(6), // cachepolicy, swizzled buffer + DAG.getTargetConstant(0, DL, MVT::i1), // idxen }; - unsigned Opc = IntrinsicID == Intrinsic::amdgcn_raw_buffer_store ? - AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT; + unsigned Opc = + IsFormat ? AMDGPUISD::BUFFER_STORE_FORMAT : AMDGPUISD::BUFFER_STORE; Opc = IsD16 ? 
AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc; MemSDNode *M = cast(Op); + M->getMemOperand()->setOffset(getBufferOffsetForMMO(Ops[4], Ops[5], Ops[6])); // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics - EVT VDataType = VData.getValueType().getScalarType(); - if (VDataType == MVT::i8 || VDataType == MVT::i16) - return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M); + if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32) + return handleByteShortBufferStores(DAG, VDataVT, DL, Ops, M); return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, M->getMemoryVT(), M->getMemOperand()); @@ -6865,10 +7003,23 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, case Intrinsic::amdgcn_struct_buffer_store: case Intrinsic::amdgcn_struct_buffer_store_format: { + const bool IsFormat = + IntrinsicID == Intrinsic::amdgcn_struct_buffer_store_format; + SDValue VData = Op.getOperand(2); - bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16); + EVT VDataVT = VData.getValueType(); + EVT EltType = VDataVT.getScalarType(); + bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16); + if (IsD16) VData = handleD16VData(VData, DAG); + + if (!isTypeLegal(VDataVT)) { + VData = + DAG.getNode(ISD::BITCAST, DL, + getEquivalentMemType(*DAG.getContext(), VDataVT), VData); + } + auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG); SDValue Ops[] = { Chain, @@ -6878,17 +7029,19 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, Offsets.first, // voffset Op.getOperand(6), // soffset Offsets.second, // offset - Op.getOperand(7), // cachepolicy - DAG.getConstant(1, DL, MVT::i1), // idxen + Op.getOperand(7), // cachepolicy, swizzled buffer + DAG.getTargetConstant(1, DL, MVT::i1), // idxen }; unsigned Opc = IntrinsicID == Intrinsic::amdgcn_struct_buffer_store ? AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT; Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc; MemSDNode *M = cast(Op); + M->getMemOperand()->setOffset(getBufferOffsetForMMO(Ops[4], Ops[5], Ops[6], + Ops[3])); // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics EVT VDataType = VData.getValueType().getScalarType(); - if (VDataType == MVT::i8 || VDataType == MVT::i16) + if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32) return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M); return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, @@ -6908,13 +7061,17 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, SDValue(), // voffset -- will be set by setBufferOffsets SDValue(), // soffset -- will be set by setBufferOffsets SDValue(), // offset -- will be set by setBufferOffsets - DAG.getConstant(Slc << 1, DL, MVT::i32), // cachepolicy - DAG.getConstant(IdxEn, DL, MVT::i1), // idxen + DAG.getTargetConstant(Slc << 1, DL, MVT::i32), // cachepolicy + DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen }; - setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]); + unsigned Offset = setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]); + // We don't know the offset if vindex is non-zero, so clear it. + if (IdxEn) + Offset = 0; EVT VT = Op.getOperand(2).getValueType(); auto *M = cast(Op); + M->getMemOperand()->setOffset(Offset); unsigned Opcode = VT.isVector() ? 
           AMDGPUISD::BUFFER_ATOMIC_PK_FADD : AMDGPUISD::BUFFER_ATOMIC_FADD;
@@ -6987,7 +7144,7 @@ std::pair<SDValue, SDValue> SITargetLowering::splitBufferOffsets(
       Overflow += ImmOffset;
       ImmOffset = 0;
     }
-    C1 = cast<ConstantSDNode>(DAG.getConstant(ImmOffset, DL, MVT::i32));
+    C1 = cast<ConstantSDNode>(DAG.getTargetConstant(ImmOffset, DL, MVT::i32));
     if (Overflow) {
       auto OverflowVal = DAG.getConstant(Overflow, DL, MVT::i32);
       if (!N0)
@@ -7001,14 +7158,14 @@ std::pair<SDValue, SDValue> SITargetLowering::splitBufferOffsets(
   if (!N0)
     N0 = DAG.getConstant(0, DL, MVT::i32);
   if (!C1)
-    C1 = cast<ConstantSDNode>(DAG.getConstant(0, DL, MVT::i32));
+    C1 = cast<ConstantSDNode>(DAG.getTargetConstant(0, DL, MVT::i32));
   return {N0, SDValue(C1, 0)};
 }
 
 // Analyze a combined offset from an amdgcn_buffer_ intrinsic and store the
 // three offsets (voffset, soffset and instoffset) into the SDValue[3] array
 // pointed to by Offsets.
-void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
+unsigned SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
                                         SelectionDAG &DAG, SDValue *Offsets,
                                         unsigned Align) const {
   SDLoc DL(CombinedOffset);
@@ -7018,8 +7175,8 @@ void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
     if (AMDGPU::splitMUBUFOffset(Imm, SOffset, ImmOffset, Subtarget, Align)) {
       Offsets[0] = DAG.getConstant(0, DL, MVT::i32);
       Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
-      Offsets[2] = DAG.getConstant(ImmOffset, DL, MVT::i32);
-      return;
+      Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
+      return SOffset + ImmOffset;
     }
   }
   if (DAG.isBaseWithConstantOffset(CombinedOffset)) {
@@ -7031,13 +7188,14 @@ void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
                                 Subtarget, Align)) {
       Offsets[0] = N0;
       Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
-      Offsets[2] = DAG.getConstant(ImmOffset, DL, MVT::i32);
-      return;
+      Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
+      return 0;
     }
   }
   Offsets[0] = CombinedOffset;
   Offsets[1] = DAG.getConstant(0, DL, MVT::i32);
-  Offsets[2] = DAG.getConstant(0, DL, MVT::i32);
+  Offsets[2] = DAG.getTargetConstant(0, DL, MVT::i32);
+  return 0;
 }
 
 // Handle 8 bit and 16 bit buffer loads
@@ -7053,9 +7211,10 @@ SDValue SITargetLowering::handleByteShortBufferLoads(SelectionDAG &DAG,
   SDValue BufferLoad = DAG.getMemIntrinsicNode(Opc, DL, ResList, Ops, IntVT,
                                                M->getMemOperand());
-  SDValue BufferLoadTrunc = DAG.getNode(ISD::TRUNCATE, DL,
-                                        LoadVT.getScalarType(), BufferLoad);
-  return DAG.getMergeValues({BufferLoadTrunc, BufferLoad.getValue(1)}, DL);
+  SDValue LoadVal = DAG.getNode(ISD::TRUNCATE, DL, IntVT, BufferLoad);
+  LoadVal = DAG.getNode(ISD::BITCAST, DL, LoadVT, LoadVal);
+
+  return DAG.getMergeValues({LoadVal, BufferLoad.getValue(1)}, DL);
 }
 
 // Handle 8 bit and 16 bit buffer stores
@@ -7063,6 +7222,9 @@ SDValue SITargetLowering::handleByteShortBufferStores(SelectionDAG &DAG,
                                                       EVT VDataType, SDLoc DL,
                                                       SDValue Ops[],
                                                       MemSDNode *M) const {
+  if (VDataType == MVT::f16)
+    Ops[1] = DAG.getNode(ISD::BITCAST, DL, MVT::i16, Ops[1]);
+
   SDValue BufferStoreExt = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Ops[1]);
   Ops[1] = BufferStoreExt;
   unsigned Opc = (VDataType == MVT::i8) ?
AMDGPUISD::BUFFER_STORE_BYTE : @@ -7215,8 +7377,8 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { assert(Op.getValueType().getVectorElementType() == MVT::i32 && "Custom lowering for non-i32 vectors hasn't been implemented."); - if (!allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), MemVT, - *Load->getMemOperand())) { + if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(), + MemVT, *Load->getMemOperand())) { SDValue Ops[2]; std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG); return DAG.getMergeValues(Ops, DL); @@ -7505,6 +7667,19 @@ SDValue SITargetLowering::lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const { return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul); } +// Returns immediate value for setting the F32 denorm mode when using the +// S_DENORM_MODE instruction. +static const SDValue getSPDenormModeValue(int SPDenormMode, SelectionDAG &DAG, + const SDLoc &SL, const GCNSubtarget *ST) { + assert(ST->hasDenormModeInst() && "Requires S_DENORM_MODE"); + int DPDenormModeDefault = ST->hasFP64Denormals() + ? FP_DENORM_FLUSH_NONE + : FP_DENORM_FLUSH_IN_FLUSH_OUT; + + int Mode = SPDenormMode | (DPDenormModeDefault << 2); + return DAG.getTargetConstant(Mode, SL, MVT::i32); +} + SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const { if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG)) return FastLowered; @@ -7531,16 +7706,26 @@ SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const { const unsigned Denorm32Reg = AMDGPU::Hwreg::ID_MODE | (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) | (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_); - const SDValue BitField = DAG.getTargetConstant(Denorm32Reg, SL, MVT::i16); if (!Subtarget->hasFP32Denormals()) { SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue); - const SDValue EnableDenormValue = DAG.getConstant(FP_DENORM_FLUSH_NONE, - SL, MVT::i32); - SDValue EnableDenorm = DAG.getNode(AMDGPUISD::SETREG, SL, BindParamVTs, - DAG.getEntryNode(), - EnableDenormValue, BitField); + + SDValue EnableDenorm; + if (Subtarget->hasDenormModeInst()) { + const SDValue EnableDenormValue = + getSPDenormModeValue(FP_DENORM_FLUSH_NONE, DAG, SL, Subtarget); + + EnableDenorm = DAG.getNode(AMDGPUISD::DENORM_MODE, SL, BindParamVTs, + DAG.getEntryNode(), EnableDenormValue); + } else { + const SDValue EnableDenormValue = DAG.getConstant(FP_DENORM_FLUSH_NONE, + SL, MVT::i32); + EnableDenorm = DAG.getNode(AMDGPUISD::SETREG, SL, BindParamVTs, + DAG.getEntryNode(), EnableDenormValue, + BitField); + } + SDValue Ops[3] = { NegDivScale0, EnableDenorm.getValue(0), @@ -7562,19 +7747,29 @@ SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const { SDValue Fma2 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Mul, NumeratorScaled, Mul); - SDValue Fma3 = getFPTernOp(DAG, ISD::FMA,SL, MVT::f32, Fma2, Fma1, Mul, Fma2); + SDValue Fma3 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma2, Fma1, Mul, Fma2); SDValue Fma4 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Fma3, NumeratorScaled, Fma3); if (!Subtarget->hasFP32Denormals()) { - const SDValue DisableDenormValue = - DAG.getConstant(FP_DENORM_FLUSH_IN_FLUSH_OUT, SL, MVT::i32); - SDValue DisableDenorm = DAG.getNode(AMDGPUISD::SETREG, SL, MVT::Other, - Fma4.getValue(1), - DisableDenormValue, - BitField, - Fma4.getValue(2)); + + SDValue DisableDenorm; + if (Subtarget->hasDenormModeInst()) { + const SDValue DisableDenormValue = + getSPDenormModeValue(FP_DENORM_FLUSH_IN_FLUSH_OUT, DAG, SL, Subtarget); + + 
DisableDenorm = DAG.getNode(AMDGPUISD::DENORM_MODE, SL, MVT::Other, + Fma4.getValue(1), DisableDenormValue, + Fma4.getValue(2)); + } else { + const SDValue DisableDenormValue = + DAG.getConstant(FP_DENORM_FLUSH_IN_FLUSH_OUT, SL, MVT::i32); + + DisableDenorm = DAG.getNode(AMDGPUISD::SETREG, SL, MVT::Other, + Fma4.getValue(1), DisableDenormValue, + BitField, Fma4.getValue(2)); + } SDValue OutputChain = DAG.getNode(ISD::TokenFactor, SL, MVT::Other, DisableDenorm, DAG.getRoot()); @@ -7684,8 +7879,8 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { assert(VT.isVector() && Store->getValue().getValueType().getScalarType() == MVT::i32); - if (!allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT, - *Store->getMemOperand())) { + if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(), + VT, *Store->getMemOperand())) { return expandUnalignedStore(Store, DAG); } @@ -10065,7 +10260,7 @@ SDNode *SITargetLowering::legalizeTargetIndependentNode(SDNode *Node, // Insert a copy to a VReg_1 virtual register so LowerI1Copies doesn't have // to try understanding copies to physical registers. if (SrcVal.getValueType() == MVT::i1 && - TargetRegisterInfo::isPhysicalRegister(DestReg->getReg())) { + Register::isPhysicalRegister(DestReg->getReg())) { SDLoc SL(Node); MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo(); SDValue VReg = DAG.getRegister( @@ -10218,7 +10413,7 @@ void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI, MachineOperand &Op = MI.getOperand(I); if ((OpInfo[I].RegClass != llvm::AMDGPU::AV_64RegClassID && OpInfo[I].RegClass != llvm::AMDGPU::AV_32RegClassID) || - !TargetRegisterInfo::isVirtualRegister(Op.getReg()) || + !Register::isVirtualRegister(Op.getReg()) || !TRI->isAGPR(MRI, Op.getReg())) continue; auto *Src = MRI.getUniqueVRegDef(Op.getReg()); @@ -10256,7 +10451,7 @@ void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI, Node->use_begin()->isMachineOpcode() && Node->use_begin()->getMachineOpcode() == AMDGPU::EXTRACT_SUBREG && !Node->use_begin()->hasAnyUseOfValue(0))) { - unsigned Def = MI.getOperand(0).getReg(); + Register Def = MI.getOperand(0).getReg(); // Change this into a noret atomic. MI.setDesc(TII->get(NoRetAtomicOp)); @@ -10300,7 +10495,7 @@ MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG, // Combine the constants and the pointer. 
const SDValue Ops1[] = { - DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, DL, MVT::i32), + DAG.getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32), Ptr, DAG.getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32), SubRegHi, @@ -10330,7 +10525,7 @@ MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG, const SDLoc &DL, SDValue DataHi = buildSMovImm32(DAG, DL, RsrcDword2And3 >> 32); const SDValue Ops[] = { - DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, DL, MVT::i32), + DAG.getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32), PtrLo, DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32), PtrHi, @@ -10364,7 +10559,7 @@ SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, return std::make_pair(0U, nullptr); case 32: case 16: - RC = &AMDGPU::SReg_32_XM0RegClass; + RC = &AMDGPU::SReg_32RegClass; break; case 64: RC = &AMDGPU::SGPR_64RegClass; @@ -10373,7 +10568,7 @@ SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, RC = &AMDGPU::SReg_96RegClass; break; case 128: - RC = &AMDGPU::SReg_128RegClass; + RC = &AMDGPU::SGPR_128RegClass; break; case 160: RC = &AMDGPU::SReg_160RegClass; @@ -10415,6 +10610,8 @@ SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, } break; case 'a': + if (!Subtarget->hasMAIInsts()) + break; switch (VT.getSizeInBits()) { default: return std::make_pair(0U, nullptr); @@ -10548,9 +10745,9 @@ void SITargetLowering::computeKnownBitsForFrameIndex(const SDValue Op, Known.Zero.setHighBits(getSubtarget()->getKnownHighZeroBitsForFrameIndex()); } -unsigned SITargetLowering::getPrefLoopAlignment(MachineLoop *ML) const { - const unsigned PrefAlign = TargetLowering::getPrefLoopAlignment(ML); - const unsigned CacheLineAlign = 6; // log2(64) +Align SITargetLowering::getPrefLoopAlignment(MachineLoop *ML) const { + const Align PrefAlign = TargetLowering::getPrefLoopAlignment(ML); + const Align CacheLineAlign = Align(64); // Pre-GFX10 target did not benefit from loop alignment if (!ML || DisableLoopAlignment || @@ -10578,7 +10775,7 @@ unsigned SITargetLowering::getPrefLoopAlignment(MachineLoop *ML) const { // If inner loop block is aligned assume in average half of the alignment // size to be added as nops. if (MBB != Header) - LoopSize += (1 << MBB->getAlignment()) / 2; + LoopSize += MBB->getAlignment().value() / 2; for (const MachineInstr &MI : *MBB) { LoopSize += TII->getInstSizeInBytes(MI); @@ -10644,7 +10841,7 @@ bool SITargetLowering::isSDNodeSourceOfDivergence(const SDNode * N, const MachineRegisterInfo &MRI = MF->getRegInfo(); const SIRegisterInfo &TRI = ST.getInstrInfo()->getRegisterInfo(); unsigned Reg = R->getReg(); - if (TRI.isPhysicalRegister(Reg)) + if (Register::isPhysicalRegister(Reg)) return !TRI.isSGPRReg(MRI, Reg); if (MRI.isLiveIn(Reg)) { @@ -10683,12 +10880,6 @@ bool SITargetLowering::isSDNodeSourceOfDivergence(const SDNode * N, case ISD::INTRINSIC_W_CHAIN: return AMDGPU::isIntrinsicSourceOfDivergence( cast(N->getOperand(1))->getZExtValue()); - // In some cases intrinsics that are a source of divergence have been - // lowered to AMDGPUISD so we also need to check those too. 
-  case AMDGPUISD::INTERP_MOV:
-  case AMDGPUISD::INTERP_P1:
-  case AMDGPUISD::INTERP_P2:
-    return true;
   }
   return false;
 }
@@ -10748,3 +10939,110 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
 
   return AMDGPUTargetLowering::shouldExpandAtomicRMWInIR(RMW);
 }
+
+const TargetRegisterClass *
+SITargetLowering::getRegClassFor(MVT VT, bool isDivergent) const {
+  const TargetRegisterClass *RC = TargetLoweringBase::getRegClassFor(VT, false);
+  const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
+  if (RC == &AMDGPU::VReg_1RegClass && !isDivergent)
+    return Subtarget->getWavefrontSize() == 64 ? &AMDGPU::SReg_64RegClass
+                                               : &AMDGPU::SReg_32RegClass;
+  if (!TRI->isSGPRClass(RC) && !isDivergent)
+    return TRI->getEquivalentSGPRClass(RC);
+  else if (TRI->isSGPRClass(RC) && isDivergent)
+    return TRI->getEquivalentVGPRClass(RC);
+
+  return RC;
+}
+
+static bool hasCFUser(const Value *V, SmallPtrSet<const Value *, 16> &Visited) {
+  if (!Visited.insert(V).second)
+    return false;
+  bool Result = false;
+  for (auto U : V->users()) {
+    if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(U)) {
+      if (V == U->getOperand(1)) {
+        switch (Intrinsic->getIntrinsicID()) {
+        default:
+          Result = false;
+          break;
+        case Intrinsic::amdgcn_if_break:
+        case Intrinsic::amdgcn_if:
+        case Intrinsic::amdgcn_else:
+          Result = true;
+          break;
+        }
+      }
+      if (V == U->getOperand(0)) {
+        switch (Intrinsic->getIntrinsicID()) {
+        default:
+          Result = false;
+          break;
+        case Intrinsic::amdgcn_end_cf:
+        case Intrinsic::amdgcn_loop:
+          Result = true;
+          break;
+        }
+      }
+    } else {
+      Result = hasCFUser(U, Visited);
+    }
+    if (Result)
+      break;
+  }
+  return Result;
+}
+
+bool SITargetLowering::requiresUniformRegister(MachineFunction &MF,
+                                               const Value *V) const {
+  if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V)) {
+    switch (Intrinsic->getIntrinsicID()) {
+    default:
+      return false;
+    case Intrinsic::amdgcn_if_break:
+      return true;
+    }
+  }
+  if (const ExtractValueInst *ExtValue = dyn_cast<ExtractValueInst>(V)) {
+    if (const IntrinsicInst *Intrinsic =
+            dyn_cast<IntrinsicInst>(ExtValue->getOperand(0))) {
+      switch (Intrinsic->getIntrinsicID()) {
+      default:
+        return false;
+      case Intrinsic::amdgcn_if:
+      case Intrinsic::amdgcn_else: {
+        ArrayRef<unsigned> Indices = ExtValue->getIndices();
+        if (Indices.size() == 1 && Indices[0] == 1) {
+          return true;
+        }
+      }
+      }
+    }
+  }
+  if (const CallInst *CI = dyn_cast<CallInst>(V)) {
+    if (isa<InlineAsm>(CI->getCalledValue())) {
+      const SIRegisterInfo *SIRI = Subtarget->getRegisterInfo();
+      ImmutableCallSite CS(CI);
+      TargetLowering::AsmOperandInfoVector TargetConstraints = ParseConstraints(
+          MF.getDataLayout(), Subtarget->getRegisterInfo(), CS);
+      for (auto &TC : TargetConstraints) {
+        if (TC.Type == InlineAsm::isOutput) {
+          ComputeConstraintToUse(TC, SDValue());
+          unsigned AssignedReg;
+          const TargetRegisterClass *RC;
+          std::tie(AssignedReg, RC) = getRegForInlineAsmConstraint(
+              SIRI, TC.ConstraintCode, TC.ConstraintVT);
+          if (RC) {
+            MachineRegisterInfo &MRI = MF.getRegInfo();
+            if (AssignedReg != 0 && SIRI->isSGPRReg(MRI, AssignedReg))
+              return true;
+            else if (SIRI->isSGPRClass(RC))
+              return true;
+          }
+        }
+      }
+    }
+  }
+  SmallPtrSet<const Value *, 16> Visited;
+  return hasCFUser(V, Visited);
+}
diff --git a/lib/Target/AMDGPU/SIISelLowering.h b/lib/Target/AMDGPU/SIISelLowering.h
index 21a215e16ce7..f0102feb65c4 100644
--- a/lib/Target/AMDGPU/SIISelLowering.h
+++ b/lib/Target/AMDGPU/SIISelLowering.h
@@ -94,6 +94,9 @@ private:
                              SelectionDAG &DAG, ArrayRef<SDValue> Ops,
                              bool IsIntrinsic = false) const;
 
+  SDValue lowerIntrinsicLoad(MemSDNode *M, bool IsFormat, SelectionDAG &DAG,
+                             ArrayRef<SDValue> Ops) const;
+
   //
Call DAG.getMemIntrinsicNode for a load, but first widen a dwordx3 type to // dwordx4 if on SI. SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &DL, SDVTList VTList, @@ -183,6 +186,7 @@ private: unsigned isCFIntrinsic(const SDNode *Intr) const; +public: /// \returns True if fixup needs to be emitted for given global value \p GV, /// false otherwise. bool shouldEmitFixup(const GlobalValue *GV) const; @@ -195,11 +199,14 @@ private: /// global value \p GV, false otherwise. bool shouldEmitPCReloc(const GlobalValue *GV) const; +private: // Analyze a combined offset from an amdgcn_buffer_ intrinsic and store the // three offsets (voffset, soffset and instoffset) into the SDValue[3] array // pointed to by Offsets. - void setBufferOffsets(SDValue CombinedOffset, SelectionDAG &DAG, - SDValue *Offsets, unsigned Align = 4) const; + /// \returns 0 If there is a non-constant offset or if the offset is 0. + /// Otherwise returns the constant offset. + unsigned setBufferOffsets(SDValue CombinedOffset, SelectionDAG &DAG, + SDValue *Offsets, unsigned Align = 4) const; // Handle 8 bit and 16 bit buffer loads SDValue handleByteShortBufferLoads(SelectionDAG &DAG, EVT LoadVT, SDLoc DL, @@ -235,6 +242,11 @@ public: bool canMergeStoresTo(unsigned AS, EVT MemVT, const SelectionDAG &DAG) const override; + bool allowsMisalignedMemoryAccessesImpl( + unsigned Size, unsigned AS, unsigned Align, + MachineMemOperand::Flags Flags = MachineMemOperand::MONone, + bool *IsFast = nullptr) const; + bool allowsMisalignedMemoryAccesses( EVT VT, unsigned AS, unsigned Align, MachineMemOperand::Flags Flags = MachineMemOperand::MONone, @@ -309,12 +321,13 @@ public: SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl &InVals) const override; - unsigned getRegisterByName(const char* RegName, EVT VT, - SelectionDAG &DAG) const override; + Register getRegisterByName(const char* RegName, EVT VT, + const MachineFunction &MF) const override; MachineBasicBlock *splitKillBlock(MachineInstr &MI, MachineBasicBlock *BB) const; + void bundleInstWithWaitcnt(MachineInstr &MI) const; MachineBasicBlock *emitGWSMemViolTestLoop(MachineInstr &MI, MachineBasicBlock *BB) const; @@ -330,6 +343,7 @@ public: bool isFMAFasterThanFMulAndFAdd(EVT VT) const override; SDValue splitUnaryVectorOp(SDValue Op, SelectionDAG &DAG) const; SDValue splitBinaryVectorOp(SDValue Op, SelectionDAG &DAG) const; + SDValue splitTernaryVectorOp(SDValue Op, SelectionDAG &DAG) const; SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override; void ReplaceNodeResults(SDNode *N, SmallVectorImpl &Results, @@ -374,7 +388,37 @@ public: unsigned Depth = 0) const override; AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override; - unsigned getPrefLoopAlignment(MachineLoop *ML) const override; + virtual const TargetRegisterClass * + getRegClassFor(MVT VT, bool isDivergent) const override; + virtual bool requiresUniformRegister(MachineFunction &MF, + const Value *V) const override; + Align getPrefLoopAlignment(MachineLoop *ML) const override; + + void allocateHSAUserSGPRs(CCState &CCInfo, + MachineFunction &MF, + const SIRegisterInfo &TRI, + SIMachineFunctionInfo &Info) const; + + void allocateSystemSGPRs(CCState &CCInfo, + MachineFunction &MF, + SIMachineFunctionInfo &Info, + CallingConv::ID CallConv, + bool IsShader) const; + + void allocateSpecialEntryInputVGPRs(CCState &CCInfo, + MachineFunction &MF, + const SIRegisterInfo &TRI, + SIMachineFunctionInfo &Info) const; + void allocateSpecialInputSGPRs( + CCState &CCInfo, + MachineFunction &MF, 
+ const SIRegisterInfo &TRI, + SIMachineFunctionInfo &Info) const; + + void allocateSpecialInputVGPRs(CCState &CCInfo, + MachineFunction &MF, + const SIRegisterInfo &TRI, + SIMachineFunctionInfo &Info) const; }; } // End namespace llvm diff --git a/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index c89d5b71ec5c..dcb04e426584 100644 --- a/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -1483,12 +1483,12 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) { if (BI.Incoming) { if (!Brackets) - Brackets = llvm::make_unique(*BI.Incoming); + Brackets = std::make_unique(*BI.Incoming); else *Brackets = *BI.Incoming; } else { if (!Brackets) - Brackets = llvm::make_unique(ST); + Brackets = std::make_unique(ST); else Brackets->clear(); } @@ -1508,7 +1508,7 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) { if (!MoveBracketsToSucc) { MoveBracketsToSucc = &SuccBI; } else { - SuccBI.Incoming = llvm::make_unique(*Brackets); + SuccBI.Incoming = std::make_unique(*Brackets); } } else if (SuccBI.Incoming->merge(*Brackets)) { SuccBI.Dirty = true; diff --git a/lib/Target/AMDGPU/SIInstrFormats.td b/lib/Target/AMDGPU/SIInstrFormats.td index 561a16c3e351..4dcbe92861f2 100644 --- a/lib/Target/AMDGPU/SIInstrFormats.td +++ b/lib/Target/AMDGPU/SIInstrFormats.td @@ -124,6 +124,9 @@ class InstSI DisableSIDecoder = 0; diff --git a/lib/Target/AMDGPU/SIInstrInfo.cpp b/lib/Target/AMDGPU/SIInstrInfo.cpp index ba8ed6993a56..d97e6a62971b 100644 --- a/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -318,8 +318,25 @@ bool SIInstrInfo::getMemOperandWithOffset(const MachineInstr &LdSt, if (isMUBUF(LdSt) || isMTBUF(LdSt)) { const MachineOperand *SOffset = getNamedOperand(LdSt, AMDGPU::OpName::soffset); - if (SOffset && SOffset->isReg()) - return false; + if (SOffset && SOffset->isReg()) { + // We can only handle this if it's a stack access, as any other resource + // would require reporting multiple base registers. + const MachineOperand *AddrReg = getNamedOperand(LdSt, AMDGPU::OpName::vaddr); + if (AddrReg && !AddrReg->isFI()) + return false; + + const MachineOperand *RSrc = getNamedOperand(LdSt, AMDGPU::OpName::srsrc); + const SIMachineFunctionInfo *MFI + = LdSt.getParent()->getParent()->getInfo(); + if (RSrc->getReg() != MFI->getScratchRSrcReg()) + return false; + + const MachineOperand *OffsetImm = + getNamedOperand(LdSt, AMDGPU::OpName::offset); + BaseOp = SOffset; + Offset = OffsetImm->getImm(); + return true; + } const MachineOperand *AddrReg = getNamedOperand(LdSt, AMDGPU::OpName::vaddr); if (!AddrReg) @@ -458,9 +475,9 @@ bool SIInstrInfo::shouldClusterMemOps(const MachineOperand &BaseOp1, const MachineRegisterInfo &MRI = FirstLdSt.getParent()->getParent()->getRegInfo(); - const unsigned Reg = FirstDst->getReg(); + const Register Reg = FirstDst->getReg(); - const TargetRegisterClass *DstRC = TargetRegisterInfo::isVirtualRegister(Reg) + const TargetRegisterClass *DstRC = Register::isVirtualRegister(Reg) ? 
MRI.getRegClass(Reg) : RI.getPhysRegClass(Reg); @@ -807,7 +824,7 @@ void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB, "Not a VGPR32 reg"); if (Cond.size() == 1) { - unsigned SReg = MRI.createVirtualRegister(BoolXExecRC); + Register SReg = MRI.createVirtualRegister(BoolXExecRC); BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg) .add(Cond[0]); BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) @@ -820,7 +837,7 @@ void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB, assert(Cond[0].isImm() && "Cond[0] is not an immediate"); switch (Cond[0].getImm()) { case SIInstrInfo::SCC_TRUE: { - unsigned SReg = MRI.createVirtualRegister(BoolXExecRC); + Register SReg = MRI.createVirtualRegister(BoolXExecRC); BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32 : AMDGPU::S_CSELECT_B64), SReg) .addImm(-1) @@ -834,7 +851,7 @@ void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB, break; } case SIInstrInfo::SCC_FALSE: { - unsigned SReg = MRI.createVirtualRegister(BoolXExecRC); + Register SReg = MRI.createVirtualRegister(BoolXExecRC); BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32 : AMDGPU::S_CSELECT_B64), SReg) .addImm(0) @@ -850,7 +867,7 @@ void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB, case SIInstrInfo::VCCNZ: { MachineOperand RegOp = Cond[1]; RegOp.setImplicit(false); - unsigned SReg = MRI.createVirtualRegister(BoolXExecRC); + Register SReg = MRI.createVirtualRegister(BoolXExecRC); BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg) .add(RegOp); BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) @@ -864,7 +881,7 @@ void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB, case SIInstrInfo::VCCZ: { MachineOperand RegOp = Cond[1]; RegOp.setImplicit(false); - unsigned SReg = MRI.createVirtualRegister(BoolXExecRC); + Register SReg = MRI.createVirtualRegister(BoolXExecRC); BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg) .add(RegOp); BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) @@ -876,8 +893,8 @@ void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB, break; } case SIInstrInfo::EXECNZ: { - unsigned SReg = MRI.createVirtualRegister(BoolXExecRC); - unsigned SReg2 = MRI.createVirtualRegister(RI.getBoolRC()); + Register SReg = MRI.createVirtualRegister(BoolXExecRC); + Register SReg2 = MRI.createVirtualRegister(RI.getBoolRC()); BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32 : AMDGPU::S_OR_SAVEEXEC_B64), SReg2) .addImm(0); @@ -894,8 +911,8 @@ void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB, break; } case SIInstrInfo::EXECZ: { - unsigned SReg = MRI.createVirtualRegister(BoolXExecRC); - unsigned SReg2 = MRI.createVirtualRegister(RI.getBoolRC()); + Register SReg = MRI.createVirtualRegister(BoolXExecRC); + Register SReg2 = MRI.createVirtualRegister(RI.getBoolRC()); BuildMI(MBB, I, DL, get(ST.isWave32() ? 
AMDGPU::S_OR_SAVEEXEC_B32 : AMDGPU::S_OR_SAVEEXEC_B64), SReg2) .addImm(0); @@ -925,7 +942,7 @@ unsigned SIInstrInfo::insertEQ(MachineBasicBlock *MBB, const DebugLoc &DL, unsigned SrcReg, int Value) const { MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); - unsigned Reg = MRI.createVirtualRegister(RI.getBoolRC()); + Register Reg = MRI.createVirtualRegister(RI.getBoolRC()); BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_EQ_I32_e64), Reg) .addImm(Value) .addReg(SrcReg); @@ -938,7 +955,7 @@ unsigned SIInstrInfo::insertNE(MachineBasicBlock *MBB, const DebugLoc &DL, unsigned SrcReg, int Value) const { MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); - unsigned Reg = MRI.createVirtualRegister(RI.getBoolRC()); + Register Reg = MRI.createVirtualRegister(RI.getBoolRC()); BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_NE_I32_e64), Reg) .addImm(Value) .addReg(SrcReg); @@ -1052,12 +1069,12 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, // The SGPR spill/restore instructions only work on number sgprs, so we need // to make sure we are using the correct register class. - if (TargetRegisterInfo::isVirtualRegister(SrcReg) && SpillSize == 4) { + if (Register::isVirtualRegister(SrcReg) && SpillSize == 4) { MachineRegisterInfo &MRI = MF->getRegInfo(); MRI.constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0RegClass); } - MachineInstrBuilder Spill = BuildMI(MBB, MI, DL, OpDesc) + BuildMI(MBB, MI, DL, OpDesc) .addReg(SrcReg, getKillRegState(isKill)) // data .addFrameIndex(FrameIndex) // addr .addMemOperand(MMO) @@ -1068,11 +1085,6 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, // correctly handled. if (RI.spillSGPRToVGPR()) FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill); - if (ST.hasScalarStores()) { - // m0 is used for offset to scalar stores if used to spill. - Spill.addReg(AMDGPU::M0, RegState::ImplicitDefine | RegState::Dead); - } - return; } @@ -1083,7 +1095,7 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, auto MIB = BuildMI(MBB, MI, DL, get(Opcode)); if (RI.hasAGPRs(RC)) { MachineRegisterInfo &MRI = MF->getRegInfo(); - unsigned Tmp = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + Register Tmp = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); MIB.addReg(Tmp, RegState::Define); } MIB.addReg(SrcReg, getKillRegState(isKill)) // data @@ -1182,24 +1194,18 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, // FIXME: Maybe this should not include a memoperand because it will be // lowered to non-memory instructions. const MCInstrDesc &OpDesc = get(getSGPRSpillRestoreOpcode(SpillSize)); - if (TargetRegisterInfo::isVirtualRegister(DestReg) && SpillSize == 4) { + if (Register::isVirtualRegister(DestReg) && SpillSize == 4) { MachineRegisterInfo &MRI = MF->getRegInfo(); MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0RegClass); } if (RI.spillSGPRToVGPR()) FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill); - MachineInstrBuilder Spill = BuildMI(MBB, MI, DL, OpDesc, DestReg) + BuildMI(MBB, MI, DL, OpDesc, DestReg) .addFrameIndex(FrameIndex) // addr .addMemOperand(MMO) .addReg(MFI->getScratchRSrcReg(), RegState::Implicit) .addReg(MFI->getStackPtrOffsetReg(), RegState::Implicit); - - if (ST.hasScalarStores()) { - // m0 is used for offset to scalar stores if used to spill. 
- Spill.addReg(AMDGPU::M0, RegState::ImplicitDefine | RegState::Dead); - } - return; } @@ -1208,7 +1214,7 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, auto MIB = BuildMI(MBB, MI, DL, get(Opcode), DestReg); if (RI.hasAGPRs(RC)) { MachineRegisterInfo &MRI = MF->getRegInfo(); - unsigned Tmp = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + Register Tmp = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); MIB.addReg(Tmp, RegState::Define); } MIB.addFrameIndex(FrameIndex) // vaddr @@ -1242,13 +1248,13 @@ unsigned SIInstrInfo::calculateLDSSpillAddress( if (!AMDGPU::isShader(MF->getFunction().getCallingConv()) && WorkGroupSize > WavefrontSize) { - unsigned TIDIGXReg - = MFI->getPreloadedReg(AMDGPUFunctionArgInfo::WORKGROUP_ID_X); - unsigned TIDIGYReg - = MFI->getPreloadedReg(AMDGPUFunctionArgInfo::WORKGROUP_ID_Y); - unsigned TIDIGZReg - = MFI->getPreloadedReg(AMDGPUFunctionArgInfo::WORKGROUP_ID_Z); - unsigned InputPtrReg = + Register TIDIGXReg = + MFI->getPreloadedReg(AMDGPUFunctionArgInfo::WORKGROUP_ID_X); + Register TIDIGYReg = + MFI->getPreloadedReg(AMDGPUFunctionArgInfo::WORKGROUP_ID_Y); + Register TIDIGZReg = + MFI->getPreloadedReg(AMDGPUFunctionArgInfo::WORKGROUP_ID_Z); + Register InputPtrReg = MFI->getPreloadedReg(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); for (unsigned Reg : {TIDIGXReg, TIDIGYReg, TIDIGZReg}) { if (!Entry.isLiveIn(Reg)) @@ -1410,9 +1416,9 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { break; case AMDGPU::V_MOV_B64_PSEUDO: { - unsigned Dst = MI.getOperand(0).getReg(); - unsigned DstLo = RI.getSubReg(Dst, AMDGPU::sub0); - unsigned DstHi = RI.getSubReg(Dst, AMDGPU::sub1); + Register Dst = MI.getOperand(0).getReg(); + Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0); + Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1); const MachineOperand &SrcOp = MI.getOperand(1); // FIXME: Will this work for 64-bit floating point immediates? @@ -1437,6 +1443,10 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { MI.eraseFromParent(); break; } + case AMDGPU::V_MOV_B64_DPP_PSEUDO: { + expandMovDPP64(MI); + break; + } case AMDGPU::V_SET_INACTIVE_B32: { unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64; unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; @@ -1469,7 +1479,7 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { case AMDGPU::V_MOVRELD_B32_V8: case AMDGPU::V_MOVRELD_B32_V16: { const MCInstrDesc &MovRelDesc = get(AMDGPU::V_MOVRELD_B32_e32); - unsigned VecReg = MI.getOperand(0).getReg(); + Register VecReg = MI.getOperand(0).getReg(); bool IsUndef = MI.getOperand(1).isUndef(); unsigned SubReg = AMDGPU::sub0 + MI.getOperand(3).getImm(); assert(VecReg == MI.getOperand(1).getReg()); @@ -1492,9 +1502,9 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { } case AMDGPU::SI_PC_ADD_REL_OFFSET: { MachineFunction &MF = *MBB.getParent(); - unsigned Reg = MI.getOperand(0).getReg(); - unsigned RegLo = RI.getSubReg(Reg, AMDGPU::sub0); - unsigned RegHi = RI.getSubReg(Reg, AMDGPU::sub1); + Register Reg = MI.getOperand(0).getReg(); + Register RegLo = RI.getSubReg(Reg, AMDGPU::sub0); + Register RegHi = RI.getSubReg(Reg, AMDGPU::sub1); // Create a bundle so these instructions won't be re-ordered by the // post-RA scheduler. 
@@ -1531,7 +1541,7 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { break; } case TargetOpcode::BUNDLE: { - if (!MI.mayLoad()) + if (!MI.mayLoad() || MI.hasUnmodeledSideEffects()) return false; // If it is a load it must be a memory clause @@ -1550,6 +1560,64 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { return true; } +std::pair +SIInstrInfo::expandMovDPP64(MachineInstr &MI) const { + assert (MI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO); + + MachineBasicBlock &MBB = *MI.getParent(); + DebugLoc DL = MBB.findDebugLoc(MI); + MachineFunction *MF = MBB.getParent(); + MachineRegisterInfo &MRI = MF->getRegInfo(); + Register Dst = MI.getOperand(0).getReg(); + unsigned Part = 0; + MachineInstr *Split[2]; + + + for (auto Sub : { AMDGPU::sub0, AMDGPU::sub1 }) { + auto MovDPP = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_dpp)); + if (Dst.isPhysical()) { + MovDPP.addDef(RI.getSubReg(Dst, Sub)); + } else { + assert(MRI.isSSA()); + auto Tmp = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + MovDPP.addDef(Tmp); + } + + for (unsigned I = 1; I <= 2; ++I) { // old and src operands. + const MachineOperand &SrcOp = MI.getOperand(I); + assert(!SrcOp.isFPImm()); + if (SrcOp.isImm()) { + APInt Imm(64, SrcOp.getImm()); + Imm.ashrInPlace(Part * 32); + MovDPP.addImm(Imm.getLoBits(32).getZExtValue()); + } else { + assert(SrcOp.isReg()); + Register Src = SrcOp.getReg(); + if (Src.isPhysical()) + MovDPP.addReg(RI.getSubReg(Src, Sub)); + else + MovDPP.addReg(Src, SrcOp.isUndef() ? RegState::Undef : 0, Sub); + } + } + + for (unsigned I = 3; I < MI.getNumExplicitOperands(); ++I) + MovDPP.addImm(MI.getOperand(I).getImm()); + + Split[Part] = MovDPP; + ++Part; + } + + if (Dst.isVirtual()) + BuildMI(MBB, MI, DL, get(AMDGPU::REG_SEQUENCE), Dst) + .addReg(Split[0]->getOperand(0).getReg()) + .addImm(AMDGPU::sub0) + .addReg(Split[1]->getOperand(0).getReg()) + .addImm(AMDGPU::sub1); + + MI.eraseFromParent(); + return std::make_pair(Split[0], Split[1]); +} + bool SIInstrInfo::swapSourceModifiers(MachineInstr &MI, MachineOperand &Src0, unsigned Src0OpName, @@ -1574,7 +1642,7 @@ bool SIInstrInfo::swapSourceModifiers(MachineInstr &MI, static MachineInstr *swapRegAndNonRegOperand(MachineInstr &MI, MachineOperand &RegOp, MachineOperand &NonRegOp) { - unsigned Reg = RegOp.getReg(); + Register Reg = RegOp.getReg(); unsigned SubReg = RegOp.getSubReg(); bool IsKill = RegOp.isKill(); bool IsDead = RegOp.isDead(); @@ -1646,7 +1714,8 @@ MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI, // This needs to be implemented because the source modifiers may be inserted // between the true commutable operands, and the base // TargetInstrInfo::commuteInstruction uses it. -bool SIInstrInfo::findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx0, +bool SIInstrInfo::findCommutedOpIndices(const MachineInstr &MI, + unsigned &SrcOpIdx0, unsigned &SrcOpIdx1) const { return findCommutedOpIndices(MI.getDesc(), SrcOpIdx0, SrcOpIdx1); } @@ -1710,7 +1779,7 @@ unsigned SIInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB, // FIXME: Virtual register workaround for RegScavenger not working with empty // blocks. 
- unsigned PCReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); + Register PCReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); auto I = MBB.end(); @@ -2163,7 +2232,7 @@ void SIInstrInfo::insertSelect(MachineBasicBlock &MBB, SmallVector Regs; for (int Idx = 0; Idx != NElts; ++Idx) { - unsigned DstElt = MRI.createVirtualRegister(EltRC); + Register DstElt = MRI.createVirtualRegister(EltRC); Regs.push_back(DstElt); unsigned SubIdx = SubIndices[Idx]; @@ -2327,7 +2396,7 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, UseMI.RemoveOperand( AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::clamp)); - unsigned Src1Reg = Src1->getReg(); + Register Src1Reg = Src1->getReg(); unsigned Src1SubReg = Src1->getSubReg(); Src0->setReg(Src1Reg); Src0->setSubReg(Src1SubReg); @@ -2367,12 +2436,12 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, MRI->hasOneUse(Src0->getReg())) { Src0->ChangeToImmediate(Def->getOperand(1).getImm()); Src0Inlined = true; - } else if ((RI.isPhysicalRegister(Src0->getReg()) && - (ST.getConstantBusLimit(Opc) <= 1 && - RI.isSGPRClass(RI.getPhysRegClass(Src0->getReg())))) || - (RI.isVirtualRegister(Src0->getReg()) && - (ST.getConstantBusLimit(Opc) <= 1 && - RI.isSGPRClass(MRI->getRegClass(Src0->getReg()))))) + } else if ((Register::isPhysicalRegister(Src0->getReg()) && + (ST.getConstantBusLimit(Opc) <= 1 && + RI.isSGPRClass(RI.getPhysRegClass(Src0->getReg())))) || + (Register::isVirtualRegister(Src0->getReg()) && + (ST.getConstantBusLimit(Opc) <= 1 && + RI.isSGPRClass(MRI->getRegClass(Src0->getReg()))))) return false; // VGPR is okay as Src0 - fallthrough } @@ -2385,10 +2454,10 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, MRI->hasOneUse(Src1->getReg()) && commuteInstruction(UseMI)) { Src0->ChangeToImmediate(Def->getOperand(1).getImm()); - } else if ((RI.isPhysicalRegister(Src1->getReg()) && - RI.isSGPRClass(RI.getPhysRegClass(Src1->getReg()))) || - (RI.isVirtualRegister(Src1->getReg()) && - RI.isSGPRClass(MRI->getRegClass(Src1->getReg())))) + } else if ((Register::isPhysicalRegister(Src1->getReg()) && + RI.isSGPRClass(RI.getPhysRegClass(Src1->getReg()))) || + (Register::isVirtualRegister(Src1->getReg()) && + RI.isSGPRClass(MRI->getRegClass(Src1->getReg())))) return false; // VGPR is okay as Src1 - fallthrough } @@ -2472,8 +2541,7 @@ bool SIInstrInfo::checkInstOffsetsDoNotOverlap(const MachineInstr &MIa, } bool SIInstrInfo::areMemAccessesTriviallyDisjoint(const MachineInstr &MIa, - const MachineInstr &MIb, - AliasAnalysis *AA) const { + const MachineInstr &MIb) const { assert((MIa.mayLoad() || MIa.mayStore()) && "MIa must load from or modify a memory location"); assert((MIb.mayLoad() || MIb.mayStore()) && @@ -2664,6 +2732,7 @@ bool SIInstrInfo::isSchedulingBoundary(const MachineInstr &MI, MI.modifiesRegister(AMDGPU::EXEC, &RI) || MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32 || MI.getOpcode() == AMDGPU::S_SETREG_B32 || + MI.getOpcode() == AMDGPU::S_DENORM_MODE || changesVGPRIndexingMode(MI); } @@ -2865,8 +2934,16 @@ bool SIInstrInfo::isImmOperandLegal(const MachineInstr &MI, unsigned OpNo, if (OpInfo.RegClass < 0) return false; - if (MO.isImm() && isInlineConstant(MO, OpInfo)) + const MachineFunction *MF = MI.getParent()->getParent(); + const GCNSubtarget &ST = MF->getSubtarget(); + + if (MO.isImm() && isInlineConstant(MO, OpInfo)) { + if (isMAI(MI) && ST.hasMFMAInlineLiteralBug() && + OpNo ==(unsigned)AMDGPU::getNamedOperandIdx(MI.getOpcode(), + AMDGPU::OpName::src2)) + return false; 
return RI.opCanUseInlineConstant(OpInfo.OperandType); + } if (!RI.opCanUseLiteralConstant(OpInfo.OperandType)) return false; @@ -2874,8 +2951,6 @@ bool SIInstrInfo::isImmOperandLegal(const MachineInstr &MI, unsigned OpNo, if (!isVOP3(MI) || !AMDGPU::isSISrcOperand(InstDesc, OpNo)) return true; - const MachineFunction *MF = MI.getParent()->getParent(); - const GCNSubtarget &ST = MF->getSubtarget(); return ST.hasVOP3Literal(); } @@ -3036,7 +3111,7 @@ bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI, if (!MO.isUse()) return false; - if (TargetRegisterInfo::isVirtualRegister(MO.getReg())) + if (Register::isVirtualRegister(MO.getReg())) return RI.isSGPRClass(MRI.getRegClass(MO.getReg())); // Null is free @@ -3093,7 +3168,8 @@ static bool shouldReadExec(const MachineInstr &MI) { return true; } - if (SIInstrInfo::isGenericOpcode(MI.getOpcode()) || + if (MI.isPreISelOpcode() || + SIInstrInfo::isGenericOpcode(MI.getOpcode()) || SIInstrInfo::isSALU(MI) || SIInstrInfo::isSMRD(MI)) return false; @@ -3104,7 +3180,7 @@ static bool shouldReadExec(const MachineInstr &MI) { static bool isSubRegOf(const SIRegisterInfo &TRI, const MachineOperand &SuperVec, const MachineOperand &SubReg) { - if (TargetRegisterInfo::isPhysicalRegister(SubReg.getReg())) + if (Register::isPhysicalRegister(SubReg.getReg())) return TRI.isSubRegister(SuperVec.getReg(), SubReg.getReg()); return SubReg.getSubReg() != AMDGPU::NoSubRegister && @@ -3144,8 +3220,8 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, if (!Op.isReg()) continue; - unsigned Reg = Op.getReg(); - if (!TargetRegisterInfo::isVirtualRegister(Reg) && !RC->contains(Reg)) { + Register Reg = Op.getReg(); + if (!Register::isVirtualRegister(Reg) && !RC->contains(Reg)) { ErrInfo = "inlineasm operand has incorrect register class."; return false; } @@ -3209,9 +3285,8 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, continue; if (RegClass != -1) { - unsigned Reg = MI.getOperand(i).getReg(); - if (Reg == AMDGPU::NoRegister || - TargetRegisterInfo::isVirtualRegister(Reg)) + Register Reg = MI.getOperand(i).getReg(); + if (Reg == AMDGPU::NoRegister || Register::isVirtualRegister(Reg)) continue; const TargetRegisterClass *RC = RI.getRegClass(RegClass); @@ -3304,7 +3379,7 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, ErrInfo = "Dst register should be tied to implicit use of preserved register"; return false; - } else if (TargetRegisterInfo::isPhysicalRegister(TiedMO.getReg()) && + } else if (Register::isPhysicalRegister(TiedMO.getReg()) && Dst.getReg() != TiedMO.getReg()) { ErrInfo = "Dst register should use same physical register as preserved"; return false; @@ -3409,6 +3484,32 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, } } + // Special case for writelane - this can break the multiple constant bus rule, + // but still can't use more than one SGPR register + if (Desc.getOpcode() == AMDGPU::V_WRITELANE_B32) { + unsigned SGPRCount = 0; + Register SGPRUsed = AMDGPU::NoRegister; + + for (int OpIdx : {Src0Idx, Src1Idx, Src2Idx}) { + if (OpIdx == -1) + break; + + const MachineOperand &MO = MI.getOperand(OpIdx); + + if (usesConstantBus(MRI, MO, MI.getDesc().OpInfo[OpIdx])) { + if (MO.isReg() && MO.getReg() != AMDGPU::M0) { + if (MO.getReg() != SGPRUsed) + ++SGPRCount; + SGPRUsed = MO.getReg(); + } + } + if (SGPRCount > ST.getConstantBusLimit(Opcode)) { + ErrInfo = "WRITELANE instruction violates constant bus restriction"; + return false; + } + } + } + // Verify misc. restrictions on specific instructions. 
if (Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32 || Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F64) { @@ -3609,7 +3710,7 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, if (DC >= DppCtrl::BCAST15 && DC <= DppCtrl::BCAST31 && ST.getGeneration() >= AMDGPUSubtarget::GFX10) { ErrInfo = "Invalid dpp_ctrl value: " - "broadcats are not supported on GFX10+"; + "broadcasts are not supported on GFX10+"; return false; } if (DC >= DppCtrl::ROW_SHARE_FIRST && DC <= DppCtrl::ROW_XMASK_LAST && @@ -3631,6 +3732,7 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const { case AMDGPU::PHI: return AMDGPU::PHI; case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG; case AMDGPU::WQM: return AMDGPU::WQM; + case AMDGPU::SOFT_WQM: return AMDGPU::SOFT_WQM; case AMDGPU::WWM: return AMDGPU::WWM; case AMDGPU::S_MOV_B32: { const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); @@ -3708,9 +3810,9 @@ const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI, const MCInstrDesc &Desc = get(MI.getOpcode()); if (MI.isVariadic() || OpNo >= Desc.getNumOperands() || Desc.OpInfo[OpNo].RegClass == -1) { - unsigned Reg = MI.getOperand(OpNo).getReg(); + Register Reg = MI.getOperand(OpNo).getReg(); - if (TargetRegisterInfo::isVirtualRegister(Reg)) + if (Register::isVirtualRegister(Reg)) return MRI.getRegClass(Reg); return RI.getPhysRegClass(Reg); } @@ -3741,7 +3843,7 @@ void SIInstrInfo::legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const { else VRC = &AMDGPU::VGPR_32RegClass; - unsigned Reg = MRI.createVirtualRegister(VRC); + Register Reg = MRI.createVirtualRegister(VRC); DebugLoc DL = MBB->findDebugLoc(I); BuildMI(*MI.getParent(), I, DL, get(Opcode), Reg).add(MO); MO.ChangeToRegister(Reg, false); @@ -3756,7 +3858,7 @@ unsigned SIInstrInfo::buildExtractSubReg(MachineBasicBlock::iterator MI, const { MachineBasicBlock *MBB = MI->getParent(); DebugLoc DL = MI->getDebugLoc(); - unsigned SubReg = MRI.createVirtualRegister(SubRC); + Register SubReg = MRI.createVirtualRegister(SubRC); if (SuperReg.getSubReg() == AMDGPU::NoSubRegister) { BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg) @@ -3768,7 +3870,7 @@ unsigned SIInstrInfo::buildExtractSubReg(MachineBasicBlock::iterator MI, // value so we don't need to worry about merging its subreg index with the // SubIdx passed to this function. The register coalescer should be able to // eliminate this extra copy. - unsigned NewSuperReg = MRI.createVirtualRegister(SuperRC); + Register NewSuperReg = MRI.createVirtualRegister(SuperRC); BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), NewSuperReg) .addReg(SuperReg.getReg(), 0, SuperReg.getSubReg()); @@ -3814,11 +3916,10 @@ bool SIInstrInfo::isLegalRegOperand(const MachineRegisterInfo &MRI, if (!MO.isReg()) return false; - unsigned Reg = MO.getReg(); - const TargetRegisterClass *RC = - TargetRegisterInfo::isVirtualRegister(Reg) ? - MRI.getRegClass(Reg) : - RI.getPhysRegClass(Reg); + Register Reg = MO.getReg(); + const TargetRegisterClass *RC = Register::isVirtualRegister(Reg) + ? 
MRI.getRegClass(Reg) + : RI.getPhysRegClass(Reg); const SIRegisterInfo *TRI = static_cast(MRI.getTargetRegisterInfo()); @@ -3935,13 +4036,13 @@ void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI, if (Opc == AMDGPU::V_WRITELANE_B32) { const DebugLoc &DL = MI.getDebugLoc(); if (Src0.isReg() && RI.isVGPR(MRI, Src0.getReg())) { - unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); + Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg) .add(Src0); Src0.ChangeToRegister(Reg, false); } if (Src1.isReg() && RI.isVGPR(MRI, Src1.getReg())) { - unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); + Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); const DebugLoc &DL = MI.getDebugLoc(); BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg) .add(Src1); @@ -3967,7 +4068,7 @@ void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI, // select is uniform. if (Opc == AMDGPU::V_READLANE_B32 && Src1.isReg() && RI.isVGPR(MRI, Src1.getReg())) { - unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); + Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); const DebugLoc &DL = MI.getDebugLoc(); BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg) .add(Src1); @@ -4003,7 +4104,7 @@ void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI, MI.setDesc(get(CommutedOpc)); - unsigned Src0Reg = Src0.getReg(); + Register Src0Reg = Src0.getReg(); unsigned Src0SubReg = Src0.getSubReg(); bool Src0Kill = Src0.isKill(); @@ -4039,13 +4140,13 @@ void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI, MachineOperand &Src2 = MI.getOperand(VOP3Idx[2]); const DebugLoc &DL = MI.getDebugLoc(); if (Src1.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src1.getReg()))) { - unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); + Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg) .add(Src1); Src1.ChangeToRegister(Reg, false); } if (Src2.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src2.getReg()))) { - unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); + Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg) .add(Src2); Src2.ChangeToRegister(Reg, false); @@ -4113,12 +4214,12 @@ unsigned SIInstrInfo::readlaneVGPRToSGPR(unsigned SrcReg, MachineInstr &UseMI, MachineRegisterInfo &MRI) const { const TargetRegisterClass *VRC = MRI.getRegClass(SrcReg); const TargetRegisterClass *SRC = RI.getEquivalentSGPRClass(VRC); - unsigned DstReg = MRI.createVirtualRegister(SRC); + Register DstReg = MRI.createVirtualRegister(SRC); unsigned SubRegs = RI.getRegSizeInBits(*VRC) / 32; if (RI.hasAGPRs(VRC)) { VRC = RI.getEquivalentVGPRClass(VRC); - unsigned NewSrcReg = MRI.createVirtualRegister(VRC); + Register NewSrcReg = MRI.createVirtualRegister(VRC); BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(), get(TargetOpcode::COPY), NewSrcReg) .addReg(SrcReg); @@ -4134,7 +4235,7 @@ unsigned SIInstrInfo::readlaneVGPRToSGPR(unsigned SrcReg, MachineInstr &UseMI, SmallVector SRegs; for (unsigned i = 0; i < SubRegs; ++i) { - unsigned SGPR = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); + Register SGPR = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); BuildMI(*UseMI.getParent(), UseMI, 
UseMI.getDebugLoc(), get(AMDGPU::V_READFIRSTLANE_B32), SGPR) .addReg(SrcReg, 0, RI.getSubRegFromChannel(i)); @@ -4176,7 +4277,7 @@ void SIInstrInfo::legalizeGenericOperand(MachineBasicBlock &InsertMBB, MachineOperand &Op, MachineRegisterInfo &MRI, const DebugLoc &DL) const { - unsigned OpReg = Op.getReg(); + Register OpReg = Op.getReg(); unsigned OpSubReg = Op.getSubReg(); const TargetRegisterClass *OpRC = RI.getSubClassWithSubReg( @@ -4186,7 +4287,7 @@ void SIInstrInfo::legalizeGenericOperand(MachineBasicBlock &InsertMBB, if (DstRC == OpRC) return; - unsigned DstReg = MRI.createVirtualRegister(DstRC); + Register DstReg = MRI.createVirtualRegister(DstRC); MachineInstr *Copy = BuildMI(InsertMBB, I, DL, get(AMDGPU::COPY), DstReg).add(Op); @@ -4198,8 +4299,19 @@ void SIInstrInfo::legalizeGenericOperand(MachineBasicBlock &InsertMBB, return; // Try to eliminate the copy if it is copying an immediate value. - if (Def->isMoveImmediate()) + if (Def->isMoveImmediate() && DstRC != &AMDGPU::VReg_1RegClass) FoldImmediate(*Copy, *Def, OpReg, &MRI); + + bool ImpDef = Def->isImplicitDef(); + while (!ImpDef && Def && Def->isCopy()) { + if (Def->getOperand(1).getReg().isPhysical()) + break; + Def = MRI.getUniqueVRegDef(Def->getOperand(1).getReg()); + ImpDef = Def && Def->isImplicitDef(); + } + if (!RI.isSGPRClass(DstRC) && !Copy->readsRegister(AMDGPU::EXEC, &RI) && + !ImpDef) + Copy->addOperand(MachineOperand::CreateReg(AMDGPU::EXEC, false, true)); } // Emit the actual waterfall loop, executing the wrapped instruction for each @@ -4223,18 +4335,18 @@ emitLoadSRsrcFromVGPRLoop(const SIInstrInfo &TII, MachineRegisterInfo &MRI, MachineBasicBlock::iterator I = LoopBB.begin(); - unsigned VRsrc = Rsrc.getReg(); + Register VRsrc = Rsrc.getReg(); unsigned VRsrcUndef = getUndefRegState(Rsrc.isUndef()); - unsigned SaveExec = MRI.createVirtualRegister(BoolXExecRC); - unsigned CondReg0 = MRI.createVirtualRegister(BoolXExecRC); - unsigned CondReg1 = MRI.createVirtualRegister(BoolXExecRC); - unsigned AndCond = MRI.createVirtualRegister(BoolXExecRC); - unsigned SRsrcSub0 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); - unsigned SRsrcSub1 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); - unsigned SRsrcSub2 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); - unsigned SRsrcSub3 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); - unsigned SRsrc = MRI.createVirtualRegister(&AMDGPU::SReg_128RegClass); + Register SaveExec = MRI.createVirtualRegister(BoolXExecRC); + Register CondReg0 = MRI.createVirtualRegister(BoolXExecRC); + Register CondReg1 = MRI.createVirtualRegister(BoolXExecRC); + Register AndCond = MRI.createVirtualRegister(BoolXExecRC); + Register SRsrcSub0 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); + Register SRsrcSub1 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); + Register SRsrcSub2 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); + Register SRsrcSub3 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); + Register SRsrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass); // Beginning of the loop, read the next Rsrc variant. BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), SRsrcSub0) @@ -4302,7 +4414,7 @@ static void loadSRsrcFromVGPR(const SIInstrInfo &TII, MachineInstr &MI, unsigned MovExecOpc = ST.isWave32() ? 
AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; const auto *BoolXExecRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID); - unsigned SaveExec = MRI.createVirtualRegister(BoolXExecRC); + Register SaveExec = MRI.createVirtualRegister(BoolXExecRC); // Save the EXEC mask BuildMI(MBB, I, DL, TII.get(MovExecOpc), SaveExec).addReg(Exec); @@ -4370,10 +4482,10 @@ extractRsrcPtr(const SIInstrInfo &TII, MachineInstr &MI, MachineOperand &Rsrc) { AMDGPU::sub0_sub1, &AMDGPU::VReg_64RegClass); // Create an empty resource descriptor - unsigned Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); - unsigned SRsrcFormatLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); - unsigned SRsrcFormatHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); - unsigned NewSRsrc = MRI.createVirtualRegister(&AMDGPU::SReg_128RegClass); + Register Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); + Register SRsrcFormatLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); + Register SRsrcFormatHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); + Register NewSRsrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass); uint64_t RsrcDataFormat = TII.getDefaultRsrcDataFormat(); // Zero64 = 0 @@ -4430,7 +4542,7 @@ void SIInstrInfo::legalizeOperands(MachineInstr &MI, const TargetRegisterClass *RC = nullptr, *SRC = nullptr, *VRC = nullptr; for (unsigned i = 1, e = MI.getNumOperands(); i != e; i += 2) { if (!MI.getOperand(i).isReg() || - !TargetRegisterInfo::isVirtualRegister(MI.getOperand(i).getReg())) + !Register::isVirtualRegister(MI.getOperand(i).getReg())) continue; const TargetRegisterClass *OpRC = MRI.getRegClass(MI.getOperand(i).getReg()); @@ -4447,8 +4559,16 @@ void SIInstrInfo::legalizeOperands(MachineInstr &MI, if (VRC || !RI.isSGPRClass(getOpRegClass(MI, 0))) { if (!VRC) { assert(SRC); - VRC = RI.hasAGPRs(getOpRegClass(MI, 0)) ? RI.getEquivalentAGPRClass(SRC) - : RI.getEquivalentVGPRClass(SRC); + if (getOpRegClass(MI, 0) == &AMDGPU::VReg_1RegClass) { + VRC = &AMDGPU::VReg_1RegClass; + } else + VRC = RI.hasAGPRs(getOpRegClass(MI, 0)) + ? RI.getEquivalentAGPRClass(SRC) + : RI.getEquivalentVGPRClass(SRC); + } else { + VRC = RI.hasAGPRs(getOpRegClass(MI, 0)) + ? RI.getEquivalentAGPRClass(VRC) + : RI.getEquivalentVGPRClass(VRC); } RC = VRC; } else { @@ -4458,7 +4578,7 @@ void SIInstrInfo::legalizeOperands(MachineInstr &MI, // Update all the operands so they have the same type. for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) { MachineOperand &Op = MI.getOperand(I); - if (!Op.isReg() || !TargetRegisterInfo::isVirtualRegister(Op.getReg())) + if (!Op.isReg() || !Register::isVirtualRegister(Op.getReg())) continue; // MI is a PHI instruction. @@ -4483,7 +4603,7 @@ void SIInstrInfo::legalizeOperands(MachineInstr &MI, // subregister index types e.g. 
sub0_sub1 + sub2 + sub3 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) { MachineOperand &Op = MI.getOperand(I); - if (!Op.isReg() || !TargetRegisterInfo::isVirtualRegister(Op.getReg())) + if (!Op.isReg() || !Register::isVirtualRegister(Op.getReg())) continue; const TargetRegisterClass *OpRC = MRI.getRegClass(Op.getReg()); @@ -4502,8 +4622,8 @@ void SIInstrInfo::legalizeOperands(MachineInstr &MI, // Legalize INSERT_SUBREG // src0 must have the same register class as dst if (MI.getOpcode() == AMDGPU::INSERT_SUBREG) { - unsigned Dst = MI.getOperand(0).getReg(); - unsigned Src0 = MI.getOperand(1).getReg(); + Register Dst = MI.getOperand(0).getReg(); + Register Src0 = MI.getOperand(1).getReg(); const TargetRegisterClass *DstRC = MRI.getRegClass(Dst); const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0); if (DstRC != Src0RC) { @@ -4577,13 +4697,13 @@ void SIInstrInfo::legalizeOperands(MachineInstr &MI, if (VAddr && AMDGPU::getIfAddr64Inst(MI.getOpcode()) != -1) { // This is already an ADDR64 instruction so we need to add the pointer // extracted from the resource descriptor to the current value of VAddr. - unsigned NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); - unsigned NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); - unsigned NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass); + Register NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + Register NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + Register NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass); const auto *BoolXExecRC = RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID); - unsigned CondReg0 = MRI.createVirtualRegister(BoolXExecRC); - unsigned CondReg1 = MRI.createVirtualRegister(BoolXExecRC); + Register CondReg0 = MRI.createVirtualRegister(BoolXExecRC); + Register CondReg1 = MRI.createVirtualRegister(BoolXExecRC); unsigned RsrcPtr, NewSRsrc; std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc); @@ -4623,7 +4743,7 @@ void SIInstrInfo::legalizeOperands(MachineInstr &MI, unsigned RsrcPtr, NewSRsrc; std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc); - unsigned NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass); + Register NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass); MachineOperand *VData = getNamedOperand(MI, AMDGPU::OpName::vdata); MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset); MachineOperand *SOffset = getNamedOperand(MI, AMDGPU::OpName::soffset); @@ -4661,6 +4781,8 @@ void SIInstrInfo::legalizeOperands(MachineInstr &MI, MIB.addImm(TFE->getImm()); } + MIB.addImm(getNamedImmOperand(MI, AMDGPU::OpName::swz)); + MIB.cloneMemRefs(MI); Addr64 = MIB; } else { @@ -4933,8 +5055,8 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst, bool HasDst = Inst.getOperand(0).isReg() && Inst.getOperand(0).isDef(); unsigned NewDstReg = AMDGPU::NoRegister; if (HasDst) { - unsigned DstReg = Inst.getOperand(0).getReg(); - if (TargetRegisterInfo::isPhysicalRegister(DstReg)) + Register DstReg = Inst.getOperand(0).getReg(); + if (Register::isPhysicalRegister(DstReg)) continue; // Update the destination register class. 
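// Illustrative sketch, not part of the patch: emitLoadSRsrcFromVGPRLoop and
// loadSRsrcFromVGPR in the hunks above build the usual "waterfall" loop, i.e.
// read the value held by the first active lane with V_READFIRSTLANE, mask EXEC
// down to the lanes holding that same value, run the wrapped instruction once,
// and repeat until every lane has been served. A scalar C++ model of that idea
// only; waterfall(), runOnce and NumLanes are hypothetical names, not LLVM APIs.
#include <array>
#include <cstddef>
#include <cstdint>
#include <functional>

template <std::size_t NumLanes>
void waterfall(const std::array<uint32_t, NumLanes> &VGPRValue, uint64_t Exec,
               const std::function<void(uint32_t, uint64_t)> &runOnce) {
  // Only model lanes that exist (guard the shift-by-64 case).
  Exec &= (NumLanes == 64) ? ~uint64_t(0) : ((uint64_t(1) << NumLanes) - 1);
  while (Exec) {
    // v_readfirstlane_b32: value of the lowest still-active lane.
    unsigned FirstLane = 0;
    while (!((Exec >> FirstLane) & 1))
      ++FirstLane;
    uint32_t Uniform = VGPRValue[FirstLane];
    // v_cmp_eq / s_and_saveexec: lanes with the same value form this iteration.
    uint64_t SameValue = 0;
    for (unsigned L = 0; L < NumLanes; ++L)
      if (((Exec >> L) & 1) && VGPRValue[L] == Uniform)
        SameValue |= uint64_t(1) << L;
    runOnce(Uniform, SameValue); // execute the wrapped instruction once
    Exec &= ~SameValue;          // s_xor_b64 exec: retire the handled lanes
  }
}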
@@ -4943,7 +5065,7 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst, continue; if (Inst.isCopy() && - TargetRegisterInfo::isVirtualRegister(Inst.getOperand(1).getReg()) && + Register::isVirtualRegister(Inst.getOperand(1).getReg()) && NewDstRC == RI.getRegClassForReg(MRI, Inst.getOperand(1).getReg())) { // Instead of creating a copy where src and dst are the same register // class, we just replace all uses of dst with src. These kinds of @@ -4988,8 +5110,8 @@ bool SIInstrInfo::moveScalarAddSub(SetVectorType &Worklist, MachineInstr &Inst, MachineBasicBlock &MBB = *Inst.getParent(); MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); - unsigned OldDstReg = Inst.getOperand(0).getReg(); - unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + Register OldDstReg = Inst.getOperand(0).getReg(); + Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); unsigned Opc = Inst.getOpcode(); assert(Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32); @@ -5022,8 +5144,8 @@ void SIInstrInfo::lowerScalarAbs(SetVectorType &Worklist, MachineOperand &Dest = Inst.getOperand(0); MachineOperand &Src = Inst.getOperand(1); - unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); - unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); unsigned SubOp = ST.hasAddNoCarry() ? AMDGPU::V_SUB_U32_e32 : AMDGPU::V_SUB_I32_e32; @@ -5052,7 +5174,7 @@ void SIInstrInfo::lowerScalarXnor(SetVectorType &Worklist, MachineOperand &Src1 = Inst.getOperand(2); if (ST.hasDLInsts()) { - unsigned NewDest = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + Register NewDest = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src0, MRI, DL); legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src1, MRI, DL); @@ -5072,8 +5194,8 @@ void SIInstrInfo::lowerScalarXnor(SetVectorType &Worklist, bool Src1IsSGPR = Src1.isReg() && RI.isSGPRClass(MRI.getRegClass(Src1.getReg())); MachineInstr *Xor; - unsigned Temp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); - unsigned NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); + Register Temp = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); + Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); // Build a pair of scalar instructions and add them to the work list. 
// The next iteration over the work list will lower these to the vector @@ -5117,8 +5239,8 @@ void SIInstrInfo::splitScalarNotBinop(SetVectorType &Worklist, MachineOperand &Src0 = Inst.getOperand(1); MachineOperand &Src1 = Inst.getOperand(2); - unsigned NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); - unsigned Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); + Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); + Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), Interm) .add(Src0) @@ -5146,8 +5268,8 @@ void SIInstrInfo::splitScalarBinOpN2(SetVectorType& Worklist, MachineOperand &Src0 = Inst.getOperand(1); MachineOperand &Src1 = Inst.getOperand(2); - unsigned NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); - unsigned Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); + Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); + Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Interm) .add(Src1); @@ -5189,16 +5311,16 @@ void SIInstrInfo::splitScalar64BitUnaryOp( const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC); const TargetRegisterClass *NewDestSubRC = RI.getSubRegClass(NewDestRC, AMDGPU::sub0); - unsigned DestSub0 = MRI.createVirtualRegister(NewDestSubRC); + Register DestSub0 = MRI.createVirtualRegister(NewDestSubRC); MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0).add(SrcReg0Sub0); MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC); - unsigned DestSub1 = MRI.createVirtualRegister(NewDestSubRC); + Register DestSub1 = MRI.createVirtualRegister(NewDestSubRC); MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1).add(SrcReg0Sub1); - unsigned FullDestReg = MRI.createVirtualRegister(NewDestRC); + Register FullDestReg = MRI.createVirtualRegister(NewDestRC); BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg) .addReg(DestSub0) .addImm(AMDGPU::sub0) @@ -5226,12 +5348,12 @@ void SIInstrInfo::splitScalar64BitAddSub(SetVectorType &Worklist, MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); const auto *CarryRC = RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID); - unsigned FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass); - unsigned DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); - unsigned DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + Register FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass); + Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); - unsigned CarryReg = MRI.createVirtualRegister(CarryRC); - unsigned DeadCarryReg = MRI.createVirtualRegister(CarryRC); + Register CarryReg = MRI.createVirtualRegister(CarryRC); + Register DeadCarryReg = MRI.createVirtualRegister(CarryRC); MachineOperand &Dest = Inst.getOperand(0); MachineOperand &Src0 = Inst.getOperand(1); @@ -5327,17 +5449,17 @@ void SIInstrInfo::splitScalar64BitBinaryOp(SetVectorType &Worklist, const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC); const TargetRegisterClass *NewDestSubRC = RI.getSubRegClass(NewDestRC, AMDGPU::sub0); - unsigned DestSub0 = MRI.createVirtualRegister(NewDestSubRC); + Register DestSub0 = MRI.createVirtualRegister(NewDestSubRC); 
MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0) .add(SrcReg0Sub0) .add(SrcReg1Sub0); - unsigned DestSub1 = MRI.createVirtualRegister(NewDestSubRC); + Register DestSub1 = MRI.createVirtualRegister(NewDestSubRC); MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1) .add(SrcReg0Sub1) .add(SrcReg1Sub1); - unsigned FullDestReg = MRI.createVirtualRegister(NewDestRC); + Register FullDestReg = MRI.createVirtualRegister(NewDestRC); BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg) .addReg(DestSub0) .addImm(AMDGPU::sub0) @@ -5368,7 +5490,7 @@ void SIInstrInfo::splitScalar64BitXnor(SetVectorType &Worklist, const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg()); - unsigned Interm = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); + Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); MachineOperand* Op0; MachineOperand* Op1; @@ -5384,7 +5506,7 @@ void SIInstrInfo::splitScalar64BitXnor(SetVectorType &Worklist, BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B64), Interm) .add(*Op0); - unsigned NewDest = MRI.createVirtualRegister(DestRC); + Register NewDest = MRI.createVirtualRegister(DestRC); MachineInstr &Xor = *BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B64), NewDest) .addReg(Interm) @@ -5411,8 +5533,8 @@ void SIInstrInfo::splitScalar64BitBCNT( MRI.getRegClass(Src.getReg()) : &AMDGPU::SGPR_32RegClass; - unsigned MidReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); - unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + Register MidReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); const TargetRegisterClass *SrcSubRC = RI.getSubRegClass(SrcRC, AMDGPU::sub0); @@ -5451,9 +5573,9 @@ void SIInstrInfo::splitScalar64BitBFE(SetVectorType &Worklist, Offset == 0 && "Not implemented"); if (BitWidth < 32) { - unsigned MidRegLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); - unsigned MidRegHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); - unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass); + Register MidRegLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + Register MidRegHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass); BuildMI(MBB, MII, DL, get(AMDGPU::V_BFE_I32), MidRegLo) .addReg(Inst.getOperand(1).getReg(), 0, AMDGPU::sub0) @@ -5476,8 +5598,8 @@ void SIInstrInfo::splitScalar64BitBFE(SetVectorType &Worklist, } MachineOperand &Src = Inst.getOperand(1); - unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); - unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass); + Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass); BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e64), TmpReg) .addImm(31) @@ -5506,6 +5628,7 @@ void SIInstrInfo::addUsersToMoveToVALUWorklist( switch (UseMI.getOpcode()) { case AMDGPU::COPY: case AMDGPU::WQM: + case AMDGPU::SOFT_WQM: case AMDGPU::WWM: case AMDGPU::REG_SEQUENCE: case AMDGPU::PHI: @@ -5531,7 +5654,7 @@ void SIInstrInfo::addUsersToMoveToVALUWorklist( void SIInstrInfo::movePackToVALU(SetVectorType &Worklist, MachineRegisterInfo &MRI, MachineInstr &Inst) const { - unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); MachineBasicBlock *MBB = 
Inst.getParent(); MachineOperand &Src0 = Inst.getOperand(1); MachineOperand &Src1 = Inst.getOperand(2); @@ -5539,8 +5662,8 @@ void SIInstrInfo::movePackToVALU(SetVectorType &Worklist, switch (Inst.getOpcode()) { case AMDGPU::S_PACK_LL_B32_B16: { - unsigned ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); - unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); // FIXME: Can do a lot better if we know the high bits of src0 or src1 are // 0. @@ -5558,7 +5681,7 @@ void SIInstrInfo::movePackToVALU(SetVectorType &Worklist, break; } case AMDGPU::S_PACK_LH_B32_B16: { - unsigned ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg) .addImm(0xffff); BuildMI(*MBB, Inst, DL, get(AMDGPU::V_BFI_B32), ResultReg) @@ -5568,8 +5691,8 @@ void SIInstrInfo::movePackToVALU(SetVectorType &Worklist, break; } case AMDGPU::S_PACK_HH_B32_B16: { - unsigned ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); - unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg) .addImm(16) .add(Src0); @@ -5623,17 +5746,27 @@ const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass( case AMDGPU::REG_SEQUENCE: case AMDGPU::INSERT_SUBREG: case AMDGPU::WQM: + case AMDGPU::SOFT_WQM: case AMDGPU::WWM: { const TargetRegisterClass *SrcRC = getOpRegClass(Inst, 1); if (RI.hasAGPRs(SrcRC)) { if (RI.hasAGPRs(NewDstRC)) return nullptr; - NewDstRC = RI.getEquivalentAGPRClass(NewDstRC); + switch (Inst.getOpcode()) { + case AMDGPU::PHI: + case AMDGPU::REG_SEQUENCE: + case AMDGPU::INSERT_SUBREG: + NewDstRC = RI.getEquivalentAGPRClass(NewDstRC); + break; + default: + NewDstRC = RI.getEquivalentVGPRClass(NewDstRC); + } + if (!NewDstRC) return nullptr; } else { - if (RI.hasVGPRs(NewDstRC)) + if (RI.hasVGPRs(NewDstRC) || NewDstRC == &AMDGPU::VReg_1RegClass) return nullptr; NewDstRC = RI.getEquivalentVGPRClass(NewDstRC); @@ -5686,7 +5819,7 @@ unsigned SIInstrInfo::findUsedSGPR(const MachineInstr &MI, return MO.getReg(); // If this could be a VGPR or an SGPR, Check the dynamic register class. 
- unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); const TargetRegisterClass *RegRC = MRI.getRegClass(Reg); if (RI.isSGPRClass(RegRC)) UsedSGPRs[i] = Reg; @@ -5941,7 +6074,7 @@ void SIInstrInfo::convertNonUniformIfRegion(MachineBasicBlock *IfEntry, MachineRegisterInfo &MRI = IfEntry->getParent()->getRegInfo(); if (Branch->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) { - unsigned DstReg = MRI.createVirtualRegister(RI.getBoolRC()); + Register DstReg = MRI.createVirtualRegister(RI.getBoolRC()); MachineInstr *SIIF = BuildMI(*MF, Branch->getDebugLoc(), get(AMDGPU::SI_IF), DstReg) .add(Branch->getOperand(0)) @@ -5968,8 +6101,8 @@ void SIInstrInfo::convertNonUniformLoopRegion( if (Branch->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) { - unsigned DstReg = MRI.createVirtualRegister(RI.getBoolRC()); - unsigned BackEdgeReg = MRI.createVirtualRegister(RI.getBoolRC()); + Register DstReg = MRI.createVirtualRegister(RI.getBoolRC()); + Register BackEdgeReg = MRI.createVirtualRegister(RI.getBoolRC()); MachineInstrBuilder HeaderPHIBuilder = BuildMI(*(MF), Branch->getDebugLoc(), get(TargetOpcode::PHI), DstReg); for (MachineBasicBlock::pred_iterator PI = LoopEntry->pred_begin(), @@ -5979,7 +6112,7 @@ void SIInstrInfo::convertNonUniformLoopRegion( HeaderPHIBuilder.addReg(BackEdgeReg); } else { MachineBasicBlock *PMBB = *PI; - unsigned ZeroReg = MRI.createVirtualRegister(RI.getBoolRC()); + Register ZeroReg = MRI.createVirtualRegister(RI.getBoolRC()); materializeImmediate(*PMBB, PMBB->getFirstTerminator(), DebugLoc(), ZeroReg, 0); HeaderPHIBuilder.addReg(ZeroReg); @@ -6063,13 +6196,30 @@ SIInstrInfo::getAddNoCarry(MachineBasicBlock &MBB, return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e64), DestReg); MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); - unsigned UnusedCarry = MRI.createVirtualRegister(RI.getBoolRC()); + Register UnusedCarry = MRI.createVirtualRegister(RI.getBoolRC()); MRI.setRegAllocationHint(UnusedCarry, 0, RI.getVCC()); return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_I32_e64), DestReg) .addReg(UnusedCarry, RegState::Define | RegState::Dead); } +MachineInstrBuilder SIInstrInfo::getAddNoCarry(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + const DebugLoc &DL, + Register DestReg, + RegScavenger &RS) const { + if (ST.hasAddNoCarry()) + return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e32), DestReg); + + Register UnusedCarry = RS.scavengeRegister(RI.getBoolRC(), I, 0, false); + // TODO: Users need to deal with this. + if (!UnusedCarry.isValid()) + return MachineInstrBuilder(); + + return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_I32_e64), DestReg) + .addReg(UnusedCarry, RegState::Define | RegState::Dead); +} + bool SIInstrInfo::isKillTerminator(unsigned Opcode) { switch (Opcode) { case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR: @@ -6115,7 +6265,21 @@ bool SIInstrInfo::isBufferSMRD(const MachineInstr &MI) const { return false; const auto RCID = MI.getDesc().OpInfo[Idx].RegClass; - return RCID == AMDGPU::SReg_128RegClassID; + return RI.getRegClass(RCID)->hasSubClassEq(&AMDGPU::SGPR_128RegClass); +} + +unsigned SIInstrInfo::getNumFlatOffsetBits(unsigned AddrSpace, + bool Signed) const { + if (!ST.hasFlatInstOffsets()) + return 0; + + if (ST.hasFlatSegmentOffsetBug() && AddrSpace == AMDGPUAS::FLAT_ADDRESS) + return 0; + + if (ST.getGeneration() >= AMDGPUSubtarget::GFX10) + return Signed ? 12 : 11; + + return Signed ? 
13 : 12; } bool SIInstrInfo::isLegalFLATOffset(int64_t Offset, unsigned AddrSpace, @@ -6254,7 +6418,7 @@ static bool followSubRegDef(MachineInstr &MI, MachineInstr *llvm::getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P, MachineRegisterInfo &MRI) { assert(MRI.isSSA()); - if (!TargetRegisterInfo::isVirtualRegister(P.Reg)) + if (!Register::isVirtualRegister(P.Reg)) return nullptr; auto RSR = P; @@ -6265,8 +6429,7 @@ MachineInstr *llvm::getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P, case AMDGPU::COPY: case AMDGPU::V_MOV_B32_e32: { auto &Op1 = MI->getOperand(1); - if (Op1.isReg() && - TargetRegisterInfo::isVirtualRegister(Op1.getReg())) { + if (Op1.isReg() && Register::isVirtualRegister(Op1.getReg())) { if (Op1.isUndef()) return nullptr; RSR = getRegSubRegPair(Op1); @@ -6360,3 +6523,40 @@ bool llvm::execMayBeModifiedBeforeAnyUse(const MachineRegisterInfo &MRI, return true; } } + +MachineInstr *SIInstrInfo::createPHIDestinationCopy( + MachineBasicBlock &MBB, MachineBasicBlock::iterator LastPHIIt, + const DebugLoc &DL, Register Src, Register Dst) const { + auto Cur = MBB.begin(); + if (Cur != MBB.end()) + do { + if (!Cur->isPHI() && Cur->readsRegister(Dst)) + return BuildMI(MBB, Cur, DL, get(TargetOpcode::COPY), Dst).addReg(Src); + ++Cur; + } while (Cur != MBB.end() && Cur != LastPHIIt); + + return TargetInstrInfo::createPHIDestinationCopy(MBB, LastPHIIt, DL, Src, + Dst); +} + +MachineInstr *SIInstrInfo::createPHISourceCopy( + MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, + const DebugLoc &DL, Register Src, Register SrcSubReg, Register Dst) const { + if (InsPt != MBB.end() && + (InsPt->getOpcode() == AMDGPU::SI_IF || + InsPt->getOpcode() == AMDGPU::SI_ELSE || + InsPt->getOpcode() == AMDGPU::SI_IF_BREAK) && + InsPt->definesRegister(Src)) { + InsPt++; + return BuildMI(MBB, InsPt, InsPt->getDebugLoc(), + get(ST.isWave32() ? AMDGPU::S_MOV_B32_term + : AMDGPU::S_MOV_B64_term), + Dst) + .addReg(Src, 0, SrcSubReg) + .addReg(AMDGPU::EXEC, RegState::Implicit); + } + return TargetInstrInfo::createPHISourceCopy(MBB, InsPt, DL, Src, SrcSubReg, + Dst); +} + +bool llvm::SIInstrInfo::isWave32() const { return ST.isWave32(); } diff --git a/lib/Target/AMDGPU/SIInstrInfo.h b/lib/Target/AMDGPU/SIInstrInfo.h index 3ff35da0b963..be463442c888 100644 --- a/lib/Target/AMDGPU/SIInstrInfo.h +++ b/lib/Target/AMDGPU/SIInstrInfo.h @@ -173,7 +173,7 @@ public: } bool isReallyTriviallyReMaterializable(const MachineInstr &MI, - AliasAnalysis *AA) const override; + AAResults *AA) const override; bool areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2, int64_t &Offset1, @@ -229,6 +229,14 @@ public: bool expandPostRAPseudo(MachineInstr &MI) const override; + // Splits a V_MOV_B64_DPP_PSEUDO opcode into a pair of v_mov_b32_dpp + // instructions. Returns a pair of generated instructions. + // Can split either post-RA with physical registers or pre-RA with + // virtual registers. In latter case IR needs to be in SSA form and + // and a REG_SEQUENCE is produced to define original register. + std::pair + expandMovDPP64(MachineInstr &MI) const; + // Returns an opcode that can be used to move a value to a \p DstRC // register. If there is no hardware instruction that can store to \p // DstRC, then AMDGPU::COPY is returned. 
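// Illustrative sketch, not part of the patch: the change from "unsigned Reg"
// to "Register Reg" that recurs throughout the hunks above swaps the static
// TargetRegisterInfo queries for the llvm::Register wrapper; the spellings
// used below (Register::isVirtualRegister, Reg.isValid(), Reg.isPhysical())
// all appear in the patch itself. classifyOperand is a hypothetical helper.
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/Register.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"

static const llvm::TargetRegisterClass *
classifyOperand(const llvm::MachineOperand &MO,
                const llvm::MachineRegisterInfo &MRI,
                const llvm::TargetRegisterInfo &TRI) {
  if (!MO.isReg())
    return nullptr;
  llvm::Register Reg = MO.getReg();            // was: unsigned Reg = MO.getReg();
  if (!Reg.isValid())
    return nullptr;                            // e.g. AMDGPU::NoRegister
  if (llvm::Register::isVirtualRegister(Reg))  // was: TargetRegisterInfo::isVirtualRegister(Reg)
    return MRI.getRegClass(Reg);
  return TRI.getMinimalPhysRegClass(Reg);      // physical: derive a class from TRI
}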
@@ -242,7 +250,7 @@ public: return commuteOpcode(MI.getOpcode()); } - bool findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx1, + bool findCommutedOpIndices(const MachineInstr &MI, unsigned &SrcOpIdx1, unsigned &SrcOpIdx2) const override; bool findCommutedOpIndices(MCInstrDesc Desc, unsigned & SrcOpIdx0, @@ -303,8 +311,7 @@ public: bool areMemAccessesTriviallyDisjoint(const MachineInstr &MIa, - const MachineInstr &MIb, - AliasAnalysis *AA = nullptr) const override; + const MachineInstr &MIb) const override; bool isFoldableCopy(const MachineInstr &MI) const; @@ -578,6 +585,14 @@ public: return get(Opcode).TSFlags & SIInstrFlags::IsMAI; } + static bool isDOT(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::IsDOT; + } + + bool isDOT(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::IsDOT; + } + static bool isScalarUnit(const MachineInstr &MI) { return MI.getDesc().TSFlags & (SIInstrFlags::SALU | SIInstrFlags::SMRD); } @@ -954,6 +969,19 @@ public: bool isBasicBlockPrologue(const MachineInstr &MI) const override; + MachineInstr *createPHIDestinationCopy(MachineBasicBlock &MBB, + MachineBasicBlock::iterator InsPt, + const DebugLoc &DL, Register Src, + Register Dst) const override; + + MachineInstr *createPHISourceCopy(MachineBasicBlock &MBB, + MachineBasicBlock::iterator InsPt, + const DebugLoc &DL, Register Src, + Register SrcSubReg, + Register Dst) const override; + + bool isWave32() const; + /// Return a partially built integer add instruction without carry. /// Caller must add source operands. /// For pre-GFX9 it will generate unused carry destination operand. @@ -963,6 +991,12 @@ public: const DebugLoc &DL, unsigned DestReg) const; + MachineInstrBuilder getAddNoCarry(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + const DebugLoc &DL, + Register DestReg, + RegScavenger &RS) const; + static bool isKillTerminator(unsigned Opcode); const MCInstrDesc &getKillTerminatorFromPseudo(unsigned Opcode) const; @@ -970,6 +1004,8 @@ public: return isUInt<12>(Imm); } + unsigned getNumFlatOffsetBits(unsigned AddrSpace, bool Signed) const; + /// Returns if \p Offset is legal for the subtarget as the offset to a FLAT /// encoded instruction. If \p Signed, this is for an instruction that /// interprets the offset as signed. 
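// Illustrative note, not part of the patch: getNumFlatOffsetBits() added above
// returns 13 (signed) / 12 (unsigned) bits before GFX10 and 12 / 11 bits on
// GFX10+, i.e. immediate FLAT offsets of [-4096, 4095] or [0, 4095] pre-GFX10
// and [-2048, 2047] or [0, 2047] on GFX10+, and 0 bits when FLAT offsets are
// unavailable or the flat-segment-offset bug applies to the address space.
// A small range check matching that reading; fitsFlatOffset is a hypothetical
// helper, not an LLVM API.
#include <cstdint>

static bool fitsFlatOffset(int64_t Offset, unsigned NumBits, bool Signed) {
  if (NumBits == 0)
    return Offset == 0; // no usable immediate offset field
  if (Signed)           // two's-complement range of a NumBits-wide field
    return Offset >= -(int64_t(1) << (NumBits - 1)) &&
           Offset < (int64_t(1) << (NumBits - 1));
  return Offset >= 0 && Offset < (int64_t(1) << NumBits);
}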
diff --git a/lib/Target/AMDGPU/SIInstrInfo.td b/lib/Target/AMDGPU/SIInstrInfo.td index c382c816e0b4..1eecbf555613 100644 --- a/lib/Target/AMDGPU/SIInstrInfo.td +++ b/lib/Target/AMDGPU/SIInstrInfo.td @@ -84,7 +84,7 @@ def SDTtbuffer_load : SDTypeProfile<1, 8, SDTCisVT<4, i32>, // soffset(SGPR) SDTCisVT<5, i32>, // offset(imm) SDTCisVT<6, i32>, // format(imm) - SDTCisVT<7, i32>, // cachecontrol(imm) + SDTCisVT<7, i32>, // cachepolicy, swizzled buffer(imm) SDTCisVT<8, i1> // idxen(imm) ]>; @@ -102,7 +102,7 @@ def SDTtbuffer_store : SDTypeProfile<0, 9, SDTCisVT<4, i32>, // soffset(SGPR) SDTCisVT<5, i32>, // offset(imm) SDTCisVT<6, i32>, // format(imm) - SDTCisVT<7, i32>, // cachecontrol(imm) + SDTCisVT<7, i32>, // cachepolicy, swizzled buffer(imm) SDTCisVT<8, i1> // idxen(imm) ]>; @@ -119,7 +119,7 @@ def SDTBufferLoad : SDTypeProfile<1, 7, SDTCisVT<3, i32>, // voffset(VGPR) SDTCisVT<4, i32>, // soffset(SGPR) SDTCisVT<5, i32>, // offset(imm) - SDTCisVT<6, i32>, // cachepolicy(imm) + SDTCisVT<6, i32>, // cachepolicy, swizzled buffer(imm) SDTCisVT<7, i1>]>; // idxen(imm) def SIbuffer_load : SDNode <"AMDGPUISD::BUFFER_LOAD", SDTBufferLoad, @@ -145,7 +145,7 @@ def SDTBufferStore : SDTypeProfile<0, 8, SDTCisVT<3, i32>, // voffset(VGPR) SDTCisVT<4, i32>, // soffset(SGPR) SDTCisVT<5, i32>, // offset(imm) - SDTCisVT<6, i32>, // cachepolicy(imm) + SDTCisVT<6, i32>, // cachepolicy, swizzled buffer(imm) SDTCisVT<7, i1>]>; // idxen(imm) def SIbuffer_store : SDNode <"AMDGPUISD::BUFFER_STORE", SDTBufferStore, @@ -198,6 +198,8 @@ def SIbuffer_atomic_umax : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_UMAX">; def SIbuffer_atomic_and : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_AND">; def SIbuffer_atomic_or : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_OR">; def SIbuffer_atomic_xor : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_XOR">; +def SIbuffer_atomic_inc : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_INC">; +def SIbuffer_atomic_dec : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_DEC">; def SIbuffer_atomic_fadd : SDBufferAtomicNoRtn <"AMDGPUISD::BUFFER_ATOMIC_FADD", f32>; def SIbuffer_atomic_pk_fadd : SDBufferAtomicNoRtn <"AMDGPUISD::BUFFER_ATOMIC_PK_FADD", v2f16>; @@ -264,6 +266,11 @@ def SIload_d16_hi_i8 : SDNode<"AMDGPUISD::LOAD_D16_HI_I8", [SDNPMayLoad, SDNPMemOperand, SDNPHasChain] >; +def SIdenorm_mode : SDNode<"AMDGPUISD::DENORM_MODE", + SDTypeProfile<0 ,1, [SDTCisInt<0>]>, + [SDNPHasChain, SDNPSideEffect, SDNPOptInGlue, SDNPOutGlue] +>; + //===----------------------------------------------------------------------===// // ValueType helpers //===----------------------------------------------------------------------===// @@ -277,7 +284,9 @@ class isFloatType { !if(!eq(SrcVT.Value, f64.Value), 1, !if(!eq(SrcVT.Value, v2f16.Value), 1, !if(!eq(SrcVT.Value, v4f16.Value), 1, - 0))))); + !if(!eq(SrcVT.Value, v2f32.Value), 1, + !if(!eq(SrcVT.Value, v2f64.Value), 1, + 0))))))); } class isIntType { @@ -300,14 +309,36 @@ class isPackedType { // PatFrags for global memory operations //===----------------------------------------------------------------------===// -defm atomic_inc_global : global_binary_atomic_op; -defm atomic_dec_global : global_binary_atomic_op; +foreach as = [ "global", "flat", "constant", "local", "private", "region" ] in { +let AddressSpaces = !cast("LoadAddress_"#as).AddrSpaces in { + -def atomic_inc_local : local_binary_atomic_op; -def atomic_dec_local : local_binary_atomic_op; -def atomic_load_fadd_local : local_binary_atomic_op; -def atomic_load_fmin_local : local_binary_atomic_op; -def 
atomic_load_fmax_local : local_binary_atomic_op; +defm atomic_inc_#as : binary_atomic_op; +defm atomic_dec_#as : binary_atomic_op; +defm atomic_load_fmin_#as : binary_atomic_op; +defm atomic_load_fmax_#as : binary_atomic_op; + + +} // End let AddressSpaces = ... +} // End foreach AddrSpace + +def atomic_fadd_global_noret : PatFrag< + (ops node:$ptr, node:$value), + (SIglobal_atomic_fadd node:$ptr, node:$value)> { + // FIXME: Move this + let MemoryVT = f32; + let IsAtomic = 1; + let AddressSpaces = StoreAddress_global.AddrSpaces; +} + +def atomic_pk_fadd_global_noret : PatFrag< + (ops node:$ptr, node:$value), + (SIglobal_atomic_pk_fadd node:$ptr, node:$value)> { + // FIXME: Move this + let MemoryVT = v2f16; + let IsAtomic = 1; + let AddressSpaces = StoreAddress_global.AddrSpaces; +} //===----------------------------------------------------------------------===// // SDNodes PatFrags for loads/stores with a glue input. @@ -328,10 +359,12 @@ def AMDGPUatomic_ld_glue : SDNode <"ISD::ATOMIC_LOAD", SDTAtomicLoad, >; def unindexedload_glue : PatFrag <(ops node:$ptr), (AMDGPUld_glue node:$ptr)> { + let IsLoad = 1; let IsUnindexed = 1; } def load_glue : PatFrag <(ops node:$ptr), (unindexedload_glue node:$ptr)> { + let IsLoad = 1; let IsNonExtLoad = 1; } @@ -347,14 +380,15 @@ def atomic_load_64_glue : PatFrag<(ops node:$ptr), let MemoryVT = i64; } -def extload_glue : PatFrag<(ops node:$ptr), (load_glue node:$ptr)> { +def extload_glue : PatFrag<(ops node:$ptr), (unindexedload_glue node:$ptr)> { let IsLoad = 1; let IsAnyExtLoad = 1; } -def sextload_glue : PatFrag<(ops node:$ptr), (unindexedload_glue node:$ptr), [{ - return cast(N)->getExtensionType() == ISD::SEXTLOAD; -}]>; +def sextload_glue : PatFrag<(ops node:$ptr), (unindexedload_glue node:$ptr)> { + let IsLoad = 1; + let IsSignExtLoad = 1; +} def zextload_glue : PatFrag<(ops node:$ptr), (unindexedload_glue node:$ptr)> { let IsLoad = 1; @@ -391,25 +425,50 @@ def sextloadi16_glue : PatFrag<(ops node:$ptr), (sextload_glue node:$ptr)> { let MemoryVT = i16; } -def load_glue_align8 : Aligned8Bytes < - (ops node:$ptr), (load_glue node:$ptr) ->; -def load_glue_align16 : Aligned16Bytes < - (ops node:$ptr), (load_glue node:$ptr) ->; +let IsLoad = 1, AddressSpaces = LoadAddress_local.AddrSpaces in { +def load_local_m0 : PatFrag<(ops node:$ptr), (load_glue node:$ptr)> { + let IsNonExtLoad = 1; +} -def load_local_m0 : LoadFrag, LocalAddress; -def sextloadi8_local_m0 : LoadFrag, LocalAddress; -def sextloadi16_local_m0 : LoadFrag, LocalAddress; -def extloadi8_local_m0 : LoadFrag, LocalAddress; -def zextloadi8_local_m0 : LoadFrag, LocalAddress; -def extloadi16_local_m0 : LoadFrag, LocalAddress; -def zextloadi16_local_m0 : LoadFrag, LocalAddress; -def load_align8_local_m0 : LoadFrag , LocalAddress; -def load_align16_local_m0 : LoadFrag , LocalAddress; -def atomic_load_32_local_m0 : LoadFrag, LocalAddress; -def atomic_load_64_local_m0 : LoadFrag, LocalAddress; +let MemoryVT = i8 in { +def extloadi8_local_m0 : PatFrag<(ops node:$ptr), (extloadi8_glue node:$ptr)>; +def sextloadi8_local_m0 : PatFrag<(ops node:$ptr), (sextloadi8_glue node:$ptr)>; +def zextloadi8_local_m0 : PatFrag<(ops node:$ptr), (zextloadi8_glue node:$ptr)>; +} + +let MemoryVT = i16 in { +def extloadi16_local_m0 : PatFrag<(ops node:$ptr), (extloadi16_glue node:$ptr)>; +def sextloadi16_local_m0 : PatFrag<(ops node:$ptr), (sextloadi16_glue node:$ptr)>; +def zextloadi16_local_m0 : PatFrag<(ops node:$ptr), (zextloadi16_glue node:$ptr)>; +} + +def load_align8_local_m0 : PatFrag<(ops node:$ptr), + 
(load_local_m0 node:$ptr)> { + let IsLoad = 1; + let IsNonExtLoad = 1; + let MinAlignment = 8; +} +def load_align16_local_m0 : PatFrag<(ops node:$ptr), + (load_local_m0 node:$ptr)> { + let IsLoad = 1; + let IsNonExtLoad = 1; + let MinAlignment = 16; +} + +} // End IsLoad = 1 + +let IsAtomic = 1, AddressSpaces = LoadAddress_local.AddrSpaces in { +def atomic_load_32_local_m0 : PatFrag<(ops node:$ptr), + (atomic_load_32_glue node:$ptr)> { + let MemoryVT = i32; +} +def atomic_load_64_local_m0 : PatFrag<(ops node:$ptr), + (atomic_load_64_glue node:$ptr)> { + let MemoryVT = i64; +} + +} // End let AddressSpaces = LoadAddress_local.AddrSpaces def AMDGPUst_glue : SDNode <"ISD::STORE", SDTStore, @@ -420,50 +479,88 @@ def AMDGPUatomic_st_glue : SDNode <"ISD::ATOMIC_STORE", SDTAtomicStore, [SDNPHasChain, SDNPMayStore, SDNPMemOperand, SDNPInGlue] >; -def atomic_store_glue : PatFrag<(ops node:$ptr, node:$val), - (AMDGPUatomic_st_glue node:$ptr, node:$val)> { -} - def unindexedstore_glue : PatFrag<(ops node:$val, node:$ptr), - (AMDGPUst_glue node:$val, node:$ptr), [{ - return cast(N)->getAddressingMode() == ISD::UNINDEXED; -}]>; + (AMDGPUst_glue node:$val, node:$ptr)> { + let IsStore = 1; + let IsUnindexed = 1; +} def store_glue : PatFrag<(ops node:$val, node:$ptr), - (unindexedstore_glue node:$val, node:$ptr), [{ - return !cast(N)->isTruncatingStore(); -}]>; + (unindexedstore_glue node:$val, node:$ptr)> { + let IsStore = 1; + let IsTruncStore = 0; +} def truncstore_glue : PatFrag<(ops node:$val, node:$ptr), - (unindexedstore_glue node:$val, node:$ptr), [{ - return cast(N)->isTruncatingStore(); -}]>; + (unindexedstore_glue node:$val, node:$ptr)> { + let IsStore = 1; + let IsTruncStore = 1; +} def truncstorei8_glue : PatFrag<(ops node:$val, node:$ptr), - (truncstore_glue node:$val, node:$ptr), [{ - return cast(N)->getMemoryVT() == MVT::i8; -}]>; + (truncstore_glue node:$val, node:$ptr)> { + let IsStore = 1; + let MemoryVT = i8; +} def truncstorei16_glue : PatFrag<(ops node:$val, node:$ptr), - (truncstore_glue node:$val, node:$ptr), [{ - return cast(N)->getMemoryVT() == MVT::i16; -}]>; + (truncstore_glue node:$val, node:$ptr)> { + let IsStore = 1; + let MemoryVT = i16; +} -def store_glue_align8 : Aligned8Bytes < - (ops node:$value, node:$ptr), (store_glue node:$value, node:$ptr) ->; +let IsStore = 1, AddressSpaces = StoreAddress_local.AddrSpaces in { +def store_local_m0 : PatFrag<(ops node:$val, node:$ptr), + (store_glue node:$val, node:$ptr)> { + let IsStore = 1; + let IsTruncStore = 0; +} -def store_glue_align16 : Aligned16Bytes < - (ops node:$value, node:$ptr), (store_glue node:$value, node:$ptr) ->; +def truncstorei8_local_m0 : PatFrag<(ops node:$val, node:$ptr), + (unindexedstore_glue node:$val, node:$ptr)> { + let IsStore = 1; + let MemoryVT = i8; +} + +def truncstorei16_local_m0 : PatFrag<(ops node:$val, node:$ptr), + (unindexedstore_glue node:$val, node:$ptr)> { + let IsStore = 1; + let MemoryVT = i16; +} +} + +def store_align16_local_m0 : PatFrag < + (ops node:$value, node:$ptr), + (store_local_m0 node:$value, node:$ptr)> { + let IsStore = 1; + let IsTruncStore = 0; + let MinAlignment = 16; +} -def store_local_m0 : StoreFrag, LocalAddress; -def truncstorei8_local_m0 : StoreFrag, LocalAddress; -def truncstorei16_local_m0 : StoreFrag, LocalAddress; -def atomic_store_local_m0 : StoreFrag, LocalAddress; +def store_align8_local_m0 : PatFrag < + (ops node:$value, node:$ptr), + (store_local_m0 node:$value, node:$ptr)> { + let IsStore = 1; + let IsTruncStore = 0; + let MinAlignment = 8; +} + +let 
AddressSpaces = StoreAddress_local.AddrSpaces in { + +def atomic_store_local_32_m0 : PatFrag < + (ops node:$value, node:$ptr), + (AMDGPUatomic_st_glue node:$value, node:$ptr)> { + let IsAtomic = 1; + let MemoryVT = i32; +} +def atomic_store_local_64_m0 : PatFrag < + (ops node:$value, node:$ptr), + (AMDGPUatomic_st_glue node:$value, node:$ptr)> { + let IsAtomic = 1; + let MemoryVT = i64; +} +} // End let AddressSpaces = StoreAddress_local.AddrSpaces -def store_align8_local_m0 : StoreFrag, LocalAddress; -def store_align16_local_m0 : StoreFrag, LocalAddress; def si_setcc_uniform : PatFrag < (ops node:$lhs, node:$rhs, node:$cond), @@ -539,16 +636,27 @@ def lshl_rev : PatFrag < (shl $src0, $src1) >; +def add_ctpop : PatFrag < + (ops node:$src0, node:$src1), + (add (ctpop $src0), $src1) +>; + multiclass SIAtomicM0Glue2 { + SDTypeProfile tc = SDTAtomic2, + bit IsInt = 1> { def _glue : SDNode < !if(is_amdgpu, "AMDGPUISD", "ISD")#"::ATOMIC_"#op_name, tc, [SDNPHasChain, SDNPMayStore, SDNPMayLoad, SDNPMemOperand, SDNPInGlue] >; - def _local_m0 : local_binary_atomic_op (NAME#"_glue")>; - def _region_m0 : region_binary_atomic_op (NAME#"_glue")>; + let AddressSpaces = StoreAddress_local.AddrSpaces in { + defm _local_m0 : binary_atomic_op (NAME#"_glue"), IsInt>; + } + + let AddressSpaces = StoreAddress_region.AddrSpaces in { + defm _region_m0 : binary_atomic_op (NAME#"_glue"), IsInt>; + } } defm atomic_load_add : SIAtomicM0Glue2 <"LOAD_ADD">; @@ -563,17 +671,9 @@ defm atomic_load_xor : SIAtomicM0Glue2 <"LOAD_XOR">; defm atomic_load_umin : SIAtomicM0Glue2 <"LOAD_UMIN">; defm atomic_load_umax : SIAtomicM0Glue2 <"LOAD_UMAX">; defm atomic_swap : SIAtomicM0Glue2 <"SWAP">; -defm atomic_load_fadd : SIAtomicM0Glue2 <"LOAD_FADD", 0, SDTAtomic2_f32>; -defm atomic_load_fmin : SIAtomicM0Glue2 <"LOAD_FMIN", 1, SDTAtomic2_f32>; -defm atomic_load_fmax : SIAtomicM0Glue2 <"LOAD_FMAX", 1, SDTAtomic2_f32>; - -def atomic_cmp_swap_glue : SDNode <"ISD::ATOMIC_CMP_SWAP", SDTAtomic3, - [SDNPHasChain, SDNPMayStore, SDNPMayLoad, SDNPMemOperand, SDNPInGlue] ->; - -def atomic_cmp_swap_local_m0 : AtomicCmpSwapLocal; -def atomic_cmp_swap_region_m0 : AtomicCmpSwapRegion; - +defm atomic_load_fadd : SIAtomicM0Glue2 <"LOAD_FADD", 0, SDTAtomic2_f32, 0>; +defm atomic_load_fmin : SIAtomicM0Glue2 <"LOAD_FMIN", 1, SDTAtomic2_f32, 0>; +defm atomic_load_fmax : SIAtomicM0Glue2 <"LOAD_FMAX", 1, SDTAtomic2_f32, 0>; def as_i1imm : SDNodeXFormgetTargetConstant(N->getZExtValue(), SDLoc(N), MVT::i1); @@ -591,6 +691,10 @@ def as_i32imm: SDNodeXFormgetTargetConstant(N->getSExtValue(), SDLoc(N), MVT::i32); }]>; +def as_i32timm: SDNodeXFormgetTargetConstant(N->getSExtValue(), SDLoc(N), MVT::i32); +}]>; + def as_i64imm: SDNodeXFormgetTargetConstant(N->getSExtValue(), SDLoc(N), MVT::i64); }]>; @@ -627,9 +731,13 @@ def SIMM16bit : ImmLeaf ; def UIMM16bit : ImmLeaf (Imm); }] + [{return isUInt<16>(Imm);}] >; +def i64imm_32bit : ImmLeaf(Imm); +}]>; + class InlineImm : PatLeaf <(vt imm), [{ return isInlineImmediate(N); }]>; @@ -763,6 +871,18 @@ def ExpTgtMatchClass : AsmOperandClass { let RenderMethod = "printExpTgt"; } +def SWaitMatchClass : AsmOperandClass { + let Name = "SWaitCnt"; + let RenderMethod = "addImmOperands"; + let ParserMethod = "parseSWaitCntOps"; +} + +def VReg32OrOffClass : AsmOperandClass { + let Name = "VReg32OrOff"; + let ParserMethod = "parseVReg32OrOff"; +} + +let OperandType = "OPERAND_IMMEDIATE" in { def SendMsgImm : Operand { let PrintMethod = "printSendMsg"; let ParserMatchClass = SendMsgMatchClass; @@ -778,22 +898,11 @@ def EndpgmImm 
: Operand { let ParserMatchClass = EndpgmMatchClass; } -def SWaitMatchClass : AsmOperandClass { - let Name = "SWaitCnt"; - let RenderMethod = "addImmOperands"; - let ParserMethod = "parseSWaitCntOps"; -} - -def VReg32OrOffClass : AsmOperandClass { - let Name = "VReg32OrOff"; - let ParserMethod = "parseVReg32OrOff"; -} - def WAIT_FLAG : Operand { let ParserMatchClass = SWaitMatchClass; let PrintMethod = "printWaitFlag"; - let OperandType = "OPERAND_IMMEDIATE"; } +} // End OperandType = "OPERAND_IMMEDIATE" include "SIInstrFormats.td" include "VIInstrFormats.td" @@ -929,6 +1038,7 @@ def DLC : NamedOperandBit<"DLC", NamedMatchClass<"DLC">>; def GLC : NamedOperandBit<"GLC", NamedMatchClass<"GLC">>; def SLC : NamedOperandBit<"SLC", NamedMatchClass<"SLC">>; def TFE : NamedOperandBit<"TFE", NamedMatchClass<"TFE">>; +def SWZ : NamedOperandBit<"SWZ", NamedMatchClass<"SWZ">>; def UNorm : NamedOperandBit<"UNorm", NamedMatchClass<"UNorm">>; def DA : NamedOperandBit<"DA", NamedMatchClass<"DA">>; def R128A16 : NamedOperandBit<"R128A16", NamedMatchClass<"R128A16">>; @@ -1317,18 +1427,6 @@ class getVALUDstForVT { VOPDstS64orS32)))); // else VT == i1 } -// Returns true if VT is floating point. -class getIsFP { - bit ret = !if(!eq(VT.Value, f16.Value), 1, - !if(!eq(VT.Value, v2f16.Value), 1, - !if(!eq(VT.Value, v4f16.Value), 1, - !if(!eq(VT.Value, f32.Value), 1, - !if(!eq(VT.Value, v2f32.Value), 1, - !if(!eq(VT.Value, f64.Value), 1, - !if(!eq(VT.Value, v2f64.Value), 1, - 0))))))); -} - // Returns the register class to use for the destination of VOP[12C] // instructions with SDWA extension class getSDWADstForVT { @@ -1340,7 +1438,7 @@ class getSDWADstForVT { // Returns the register class to use for source 0 of VOP[12C] // instructions for the given VT. class getVOPSrc0ForVT { - bit isFP = getIsFP.ret; + bit isFP = isFloatType.ret; RegisterOperand ret = !if(isFP, @@ -1373,11 +1471,14 @@ class getVOPSrc0ForVT { // Returns the vreg register class to use for source operand given VT class getVregSrcForVT { RegisterClass ret = !if(!eq(VT.Size, 128), VReg_128, - !if(!eq(VT.Size, 64), VReg_64, VGPR_32)); + !if(!eq(VT.Size, 96), VReg_96, + !if(!eq(VT.Size, 64), VReg_64, + !if(!eq(VT.Size, 48), VReg_64, + VGPR_32)))); } class getSDWASrcForVT { - bit isFP = getIsFP.ret; + bit isFP = isFloatType.ret; RegisterOperand retFlt = !if(!eq(VT.Size, 16), SDWASrc_f16, SDWASrc_f32); RegisterOperand retInt = !if(!eq(VT.Size, 16), SDWASrc_i16, SDWASrc_i32); RegisterOperand ret = !if(isFP, retFlt, retInt); @@ -1386,7 +1487,7 @@ class getSDWASrcForVT { // Returns the register class to use for sources of VOP3 instructions for the // given VT. 
class getVOP3SrcForVT { - bit isFP = getIsFP.ret; + bit isFP = isFloatType.ret; RegisterOperand ret = !if(!eq(VT.Size, 128), VSrc_128, @@ -1433,7 +1534,7 @@ class isModifierType { // Return type of input modifiers operand for specified input operand class getSrcMod { - bit isFP = getIsFP.ret; + bit isFP = isFloatType.ret; bit isPacked = isPackedType.ret; Operand ret = !if(!eq(VT.Size, 64), !if(isFP, FP64InputMods, Int64InputMods), @@ -1452,7 +1553,7 @@ class getOpSelMod { // Return type of input modifiers operand specified input operand for DPP class getSrcModExt { - bit isFP = getIsFP.ret; + bit isFP = isFloatType.ret; Operand ret = !if(isFP, FPVRegInputMods, IntVRegInputMods); } @@ -2038,6 +2139,7 @@ class VOPProfile _ArgVT, bit _EnableF32SrcMods = 0, field int NeedPatGen = PatGenMode.NoPattern; field bit IsMAI = 0; + field bit IsDOT = 0; field Operand Src0PackedMod = !if(HasSrc0FloatMods, PackedF16InputMods, PackedI16InputMods); field Operand Src1PackedMod = !if(HasSrc1FloatMods, PackedF16InputMods, PackedI16InputMods); diff --git a/lib/Target/AMDGPU/SIInstructions.td b/lib/Target/AMDGPU/SIInstructions.td index 70f20bb69370..21984c6ad910 100644 --- a/lib/Target/AMDGPU/SIInstructions.td +++ b/lib/Target/AMDGPU/SIInstructions.td @@ -43,8 +43,8 @@ multiclass V_INTERP_P1_F32_m : VINTRP_m < (outs VINTRPDst:$vdst), (ins VGPR_32:$vsrc, Attr:$attr, AttrChan:$attrchan), "v_interp_p1_f32$vdst, $vsrc, $attr$attrchan", - [(set f32:$vdst, (AMDGPUinterp_p1 f32:$vsrc, (i32 imm:$attrchan), - (i32 imm:$attr)))] + [(set f32:$vdst, (int_amdgcn_interp_p1 f32:$vsrc, + (i32 timm:$attrchan), (i32 timm:$attr), M0))] >; let OtherPredicates = [has32BankLDS] in { @@ -66,8 +66,8 @@ defm V_INTERP_P2_F32 : VINTRP_m < (outs VINTRPDst:$vdst), (ins VGPR_32:$src0, VGPR_32:$vsrc, Attr:$attr, AttrChan:$attrchan), "v_interp_p2_f32$vdst, $vsrc, $attr$attrchan", - [(set f32:$vdst, (AMDGPUinterp_p2 f32:$src0, f32:$vsrc, (i32 imm:$attrchan), - (i32 imm:$attr)))]>; + [(set f32:$vdst, (int_amdgcn_interp_p2 f32:$src0, f32:$vsrc, + (i32 timm:$attrchan), (i32 timm:$attr), M0))]>; } // End DisableEncoding = "$src0", Constraints = "$src0 = $vdst" @@ -76,8 +76,8 @@ defm V_INTERP_MOV_F32 : VINTRP_m < (outs VINTRPDst:$vdst), (ins InterpSlot:$vsrc, Attr:$attr, AttrChan:$attrchan), "v_interp_mov_f32$vdst, $vsrc, $attr$attrchan", - [(set f32:$vdst, (AMDGPUinterp_mov (i32 imm:$vsrc), (i32 imm:$attrchan), - (i32 imm:$attr)))]>; + [(set f32:$vdst, (int_amdgcn_interp_mov (i32 imm:$vsrc), + (i32 timm:$attrchan), (i32 timm:$attr), M0))]>; } // End Uses = [M0, EXEC] @@ -92,6 +92,11 @@ def ATOMIC_FENCE : SPseudoInstSI< let maybeAtomic = 1; } +def VOP_I64_I64_DPP : VOPProfile <[i64, i64, untyped, untyped]> { + let HasExt = 1; + let HasExtDPP = 1; +} + let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] in { // For use in patterns @@ -107,10 +112,19 @@ def V_CNDMASK_B64_PSEUDO : VOP3Common <(outs VReg_64:$vdst), def V_MOV_B64_PSEUDO : VPseudoInstSI <(outs VReg_64:$vdst), (ins VSrc_b64:$src0)>; +// 64-bit vector move with dpp. Expanded post-RA. +def V_MOV_B64_DPP_PSEUDO : VOP_DPP_Pseudo <"v_mov_b64_dpp", VOP_I64_I64_DPP> { + let Size = 16; // Requires two 8-byte v_mov_b32_dpp to complete. +} + // Pseudoinstruction for @llvm.amdgcn.wqm. It is turned into a copy after the // WQM pass processes it. def WQM : PseudoInstSI <(outs unknown:$vdst), (ins unknown:$src0)>; +// Pseudoinstruction for @llvm.amdgcn.softwqm. Like @llvm.amdgcn.wqm it is +// turned into a copy by WQM pass, but does not seed WQM requirements. 
+def SOFT_WQM : PseudoInstSI <(outs unknown:$vdst), (ins unknown:$src0)>; + // Pseudoinstruction for @llvm.amdgcn.wwm. It is turned into a copy post-RA, so // that the @earlyclobber is respected. The @earlyclobber is to make sure that // the instruction that defines $src0 (which is run in WWM) doesn't @@ -345,13 +359,15 @@ def SI_INIT_M0 : SPseudoInstSI <(outs), (ins SSrc_b32:$src)> { } def SI_INIT_EXEC : SPseudoInstSI < - (outs), (ins i64imm:$src), []> { + (outs), (ins i64imm:$src), + [(int_amdgcn_init_exec (i64 timm:$src))]> { let Defs = [EXEC]; let usesCustomInserter = 1; let isAsCheapAsAMove = 1; let WaveSizePredicate = isWave64; } +// FIXME: Intrinsic should be mangled for wave size. def SI_INIT_EXEC_LO : SPseudoInstSI < (outs), (ins i32imm:$src), []> { let Defs = [EXEC_LO]; @@ -360,12 +376,20 @@ def SI_INIT_EXEC_LO : SPseudoInstSI < let WaveSizePredicate = isWave32; } +// FIXME: Wave32 version def SI_INIT_EXEC_FROM_INPUT : SPseudoInstSI < - (outs), (ins SSrc_b32:$input, i32imm:$shift), []> { + (outs), (ins SSrc_b32:$input, i32imm:$shift), + [(int_amdgcn_init_exec_from_input i32:$input, (i32 timm:$shift))]> { let Defs = [EXEC]; let usesCustomInserter = 1; } +def : GCNPat < + (int_amdgcn_init_exec timm:$src), + (SI_INIT_EXEC_LO (as_i32imm imm:$src))> { + let WaveSizePredicate = isWave32; +} + // Return for returning shaders to a shader variant epilog. def SI_RETURN_TO_EPILOG : SPseudoInstSI < (outs), (ins variable_ops), [(AMDGPUreturn_to_epilog)]> { @@ -604,25 +628,6 @@ def : GCNPat < (SI_PC_ADD_REL_OFFSET $ptr_lo, (i32 0)) >; -def : GCNPat < - (AMDGPUinit_exec i64:$src), - (SI_INIT_EXEC (as_i64imm $src)) -> { - let WaveSizePredicate = isWave64; -} - -def : GCNPat < - (AMDGPUinit_exec i64:$src), - (SI_INIT_EXEC_LO (as_i32imm $src)) -> { - let WaveSizePredicate = isWave32; -} - -def : GCNPat < - (AMDGPUinit_exec_from_input i32:$input, i32:$shift), - (SI_INIT_EXEC_FROM_INPUT (i32 $input), (as_i32imm $shift)) ->; - def : GCNPat< (AMDGPUtrap timm:$trapid), (S_TRAP $trapid) @@ -740,22 +745,22 @@ def : GCNPat < def : GCNPat < (i32 (fp_to_sint f16:$src)), - (V_CVT_I32_F32_e32 (V_CVT_F32_F16_e32 $src)) + (V_CVT_I32_F32_e32 (V_CVT_F32_F16_e32 VSrc_b32:$src)) >; def : GCNPat < (i32 (fp_to_uint f16:$src)), - (V_CVT_U32_F32_e32 (V_CVT_F32_F16_e32 $src)) + (V_CVT_U32_F32_e32 (V_CVT_F32_F16_e32 VSrc_b32:$src)) >; def : GCNPat < (f16 (sint_to_fp i32:$src)), - (V_CVT_F16_F32_e32 (V_CVT_F32_I32_e32 $src)) + (V_CVT_F16_F32_e32 (V_CVT_F32_I32_e32 VSrc_b32:$src)) >; def : GCNPat < (f16 (uint_to_fp i32:$src)), - (V_CVT_F16_F32_e32 (V_CVT_F32_U32_e32 $src)) + (V_CVT_F16_F32_e32 (V_CVT_F32_U32_e32 VSrc_b32:$src)) >; //===----------------------------------------------------------------------===// @@ -808,8 +813,14 @@ def : GCNPat < (V_BCNT_U32_B32_e64 $popcnt, $val) >; } + def : GCNPat < - (i16 (add (i16 (trunc (getDivergentFrag.ret i32:$popcnt))), i16:$val)), + (i32 (ctpop i32:$popcnt)), + (V_BCNT_U32_B32_e64 VSrc_b32:$popcnt, (i32 0)) +>; + +def : GCNPat < + (i16 (add (i16 (trunc (i32 (getDivergentFrag.ret i32:$popcnt)))), i16:$val)), (V_BCNT_U32_B32_e64 $popcnt, $val) >; @@ -1076,53 +1087,158 @@ def : GCNPat < /********** ================================ **********/ // Prevent expanding both fneg and fabs. +// TODO: Add IgnoredBySelectionDAG bit? 
+let AddedComplexity = 1 in { // Prefer SALU to VALU patterns for DAG def : GCNPat < - (fneg (fabs f32:$src)), - (S_OR_B32 $src, (S_MOV_B32(i32 0x80000000))) // Set sign bit + (fneg (fabs (f32 SReg_32:$src))), + (S_OR_B32 SReg_32:$src, (S_MOV_B32 (i32 0x80000000))) // Set sign bit >; -// FIXME: Should use S_OR_B32 def : GCNPat < - (fneg (fabs f64:$src)), - (REG_SEQUENCE VReg_64, - (i32 (EXTRACT_SUBREG f64:$src, sub0)), - sub0, - (V_OR_B32_e32 (i32 (EXTRACT_SUBREG f64:$src, sub1)), - (V_MOV_B32_e32 (i32 0x80000000))), // Set sign bit. - sub1) + (fabs (f32 SReg_32:$src)), + (S_AND_B32 SReg_32:$src, (S_MOV_B32 (i32 0x7fffffff))) +>; + +def : GCNPat < + (fneg (f32 SReg_32:$src)), + (S_XOR_B32 SReg_32:$src, (S_MOV_B32 (i32 0x80000000))) +>; + +def : GCNPat < + (fneg (f16 SReg_32:$src)), + (S_XOR_B32 SReg_32:$src, (S_MOV_B32 (i32 0x00008000))) +>; + +def : GCNPat < + (fneg (f16 VGPR_32:$src)), + (V_XOR_B32_e32 (S_MOV_B32 (i32 0x00008000)), VGPR_32:$src) +>; + +def : GCNPat < + (fabs (f16 SReg_32:$src)), + (S_AND_B32 SReg_32:$src, (S_MOV_B32 (i32 0x00007fff))) +>; + +def : GCNPat < + (fneg (fabs (f16 SReg_32:$src))), + (S_OR_B32 SReg_32:$src, (S_MOV_B32 (i32 0x00008000))) // Set sign bit +>; + +def : GCNPat < + (fneg (fabs (f16 VGPR_32:$src))), + (V_OR_B32_e32 (S_MOV_B32 (i32 0x00008000)), VGPR_32:$src) // Set sign bit +>; + +def : GCNPat < + (fneg (v2f16 SReg_32:$src)), + (S_XOR_B32 SReg_32:$src, (S_MOV_B32 (i32 0x80008000))) +>; + +def : GCNPat < + (fabs (v2f16 SReg_32:$src)), + (S_AND_B32 SReg_32:$src, (S_MOV_B32 (i32 0x7fff7fff))) +>; + +// This is really (fneg (fabs v2f16:$src)) +// +// fabs is not reported as free because there is modifier for it in +// VOP3P instructions, so it is turned into the bit op. +def : GCNPat < + (fneg (v2f16 (bitconvert (and_oneuse (i32 SReg_32:$src), 0x7fff7fff)))), + (S_OR_B32 SReg_32:$src, (S_MOV_B32 (i32 0x80008000))) // Set sign bit >; def : GCNPat < - (fabs f32:$src), - (S_AND_B32 $src, (S_MOV_B32 (i32 0x7fffffff))) + (fneg (v2f16 (fabs SReg_32:$src))), + (S_OR_B32 SReg_32:$src, (S_MOV_B32 (i32 0x80008000))) // Set sign bit +>; + +// FIXME: The implicit-def of scc from S_[X]OR_B32 is mishandled + // def : GCNPat < +// (fneg (f64 SReg_64:$src)), +// (REG_SEQUENCE SReg_64, +// (i32 (EXTRACT_SUBREG SReg_64:$src, sub0)), +// sub0, +// (S_XOR_B32 (i32 (EXTRACT_SUBREG SReg_64:$src, sub1)), +// (i32 (S_MOV_B32 (i32 0x80000000)))), +// sub1) +// >; + +// def : GCNPat < +// (fneg (fabs (f64 SReg_64:$src))), +// (REG_SEQUENCE SReg_64, +// (i32 (EXTRACT_SUBREG SReg_64:$src, sub0)), +// sub0, +// (S_OR_B32 (i32 (EXTRACT_SUBREG SReg_64:$src, sub1)), +// (S_MOV_B32 (i32 0x80000000))), // Set sign bit. 
+// sub1) +// >; + +} // End let AddedComplexity = 1 + +def : GCNPat < + (fabs (f32 VGPR_32:$src)), + (V_AND_B32_e32 (S_MOV_B32 (i32 0x7fffffff)), VGPR_32:$src) +>; + +def : GCNPat < + (fneg (f32 VGPR_32:$src)), + (V_XOR_B32_e32 (S_MOV_B32 (i32 0x80000000)), VGPR_32:$src) +>; + +def : GCNPat < + (fabs (f16 VGPR_32:$src)), + (V_AND_B32_e32 (S_MOV_B32 (i32 0x00007fff)), VGPR_32:$src) >; def : GCNPat < - (fneg f32:$src), - (V_XOR_B32_e32 $src, (V_MOV_B32_e32 (i32 0x80000000))) + (fneg (v2f16 VGPR_32:$src)), + (V_XOR_B32_e32 (S_MOV_B32 (i32 0x80008000)), VGPR_32:$src) >; def : GCNPat < - (fabs f64:$src), + (fabs (v2f16 VGPR_32:$src)), + (V_AND_B32_e32 (S_MOV_B32 (i32 0x7fff7fff)), VGPR_32:$src) +>; + +def : GCNPat < + (fneg (v2f16 (fabs VGPR_32:$src))), + (V_OR_B32_e32 (S_MOV_B32 (i32 0x80008000)), VGPR_32:$src) // Set sign bit +>; + +def : GCNPat < + (fabs (f64 VReg_64:$src)), (REG_SEQUENCE VReg_64, - (i32 (EXTRACT_SUBREG f64:$src, sub0)), + (i32 (EXTRACT_SUBREG VReg_64:$src, sub0)), sub0, - (V_AND_B32_e64 (i32 (EXTRACT_SUBREG f64:$src, sub1)), + (V_AND_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$src, sub1)), (V_MOV_B32_e32 (i32 0x7fffffff))), // Set sign bit. sub1) >; +// TODO: Use SGPR for constant def : GCNPat < - (fneg f64:$src), + (fneg (f64 VReg_64:$src)), (REG_SEQUENCE VReg_64, - (i32 (EXTRACT_SUBREG f64:$src, sub0)), + (i32 (EXTRACT_SUBREG VReg_64:$src, sub0)), sub0, - (V_XOR_B32_e32 (i32 (EXTRACT_SUBREG f64:$src, sub1)), + (V_XOR_B32_e32 (i32 (EXTRACT_SUBREG VReg_64:$src, sub1)), (i32 (V_MOV_B32_e32 (i32 0x80000000)))), sub1) >; +// TODO: Use SGPR for constant +def : GCNPat < + (fneg (fabs (f64 VReg_64:$src))), + (REG_SEQUENCE VReg_64, + (i32 (EXTRACT_SUBREG VReg_64:$src, sub0)), + sub0, + (V_OR_B32_e32 (i32 (EXTRACT_SUBREG VReg_64:$src, sub1)), + (V_MOV_B32_e32 (i32 0x80000000))), // Set sign bit. + sub1) +>; + def : GCNPat < (fcopysign f16:$src0, f16:$src1), (V_BFI_B32 (S_MOV_B32 (i32 0x00007fff)), $src0, $src1) @@ -1154,45 +1270,6 @@ def : GCNPat < (V_LSHRREV_B32_e64 (i32 16), (EXTRACT_SUBREG $src1, sub1))) >; -def : GCNPat < - (fneg f16:$src), - (S_XOR_B32 $src, (S_MOV_B32 (i32 0x00008000))) ->; - -def : GCNPat < - (fabs f16:$src), - (S_AND_B32 $src, (S_MOV_B32 (i32 0x00007fff))) ->; - -def : GCNPat < - (fneg (fabs f16:$src)), - (S_OR_B32 $src, (S_MOV_B32 (i32 0x00008000))) // Set sign bit ->; - -def : GCNPat < - (fneg v2f16:$src), - (S_XOR_B32 $src, (S_MOV_B32 (i32 0x80008000))) ->; - -def : GCNPat < - (fabs v2f16:$src), - (S_AND_B32 $src, (S_MOV_B32 (i32 0x7fff7fff))) ->; - -// This is really (fneg (fabs v2f16:$src)) -// -// fabs is not reported as free because there is modifier for it in -// VOP3P instructions, so it is turned into the bit op. 
-def : GCNPat < - (fneg (v2f16 (bitconvert (and_oneuse i32:$src, 0x7fff7fff)))), - (S_OR_B32 $src, (S_MOV_B32 (i32 0x80008000))) // Set sign bit ->; - -def : GCNPat < - (fneg (v2f16 (fabs v2f16:$src))), - (S_OR_B32 $src, (S_MOV_B32 (i32 0x80008000))) // Set sign bit ->; - /********** ================== **********/ /********** Immediate Patterns **********/ /********** ================== **********/ @@ -1544,7 +1621,7 @@ def : GCNPat < (V_CVT_F16_F32_e32 ( V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0), /*src1mod*/(i32 0), /*src1*/(i32 CONST.FP32_NEG_ONE), - $src)) + SSrc_i1:$src)) >; def : GCNPat < @@ -1552,35 +1629,35 @@ def : GCNPat < (V_CVT_F16_F32_e32 ( V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0), /*src1mod*/(i32 0), /*src1*/(i32 CONST.FP32_ONE), - $src)) + SSrc_i1:$src)) >; def : GCNPat < (f32 (sint_to_fp i1:$src)), (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0), /*src1mod*/(i32 0), /*src1*/(i32 CONST.FP32_NEG_ONE), - $src) + SSrc_i1:$src) >; def : GCNPat < (f32 (uint_to_fp i1:$src)), (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0), /*src1mod*/(i32 0), /*src1*/(i32 CONST.FP32_ONE), - $src) + SSrc_i1:$src) >; def : GCNPat < (f64 (sint_to_fp i1:$src)), (V_CVT_F64_I32_e32 (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0), /*src1mod*/(i32 0), /*src1*/(i32 -1), - $src)) + SSrc_i1:$src)) >; def : GCNPat < (f64 (uint_to_fp i1:$src)), (V_CVT_F64_U32_e32 (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0), /*src1mod*/(i32 0), /*src1*/(i32 1), - $src)) + SSrc_i1:$src)) >; //===----------------------------------------------------------------------===// @@ -1788,6 +1865,22 @@ def : GCNPat < (INSERT_SUBREG (IMPLICIT_DEF), $src0, sub0) >; +def : GCNPat < + (i64 (int_amdgcn_mov_dpp i64:$src, timm:$dpp_ctrl, timm:$row_mask, timm:$bank_mask, + timm:$bound_ctrl)), + (V_MOV_B64_DPP_PSEUDO $src, $src, (as_i32imm $dpp_ctrl), + (as_i32imm $row_mask), (as_i32imm $bank_mask), + (as_i1imm $bound_ctrl)) +>; + +def : GCNPat < + (i64 (int_amdgcn_update_dpp i64:$old, i64:$src, timm:$dpp_ctrl, timm:$row_mask, + timm:$bank_mask, timm:$bound_ctrl)), + (V_MOV_B64_DPP_PSEUDO $old, $src, (as_i32imm $dpp_ctrl), + (as_i32imm $row_mask), (as_i32imm $bank_mask), + (as_i1imm $bound_ctrl)) +>; + //===----------------------------------------------------------------------===// // Fract Patterns //===----------------------------------------------------------------------===// @@ -1915,3 +2008,13 @@ def : FP16Med3Pat; defm : Int16Med3Pat; defm : Int16Med3Pat; } // End Predicates = [isGFX9Plus] + +class AMDGPUGenericInstruction : GenericInstruction { + let Namespace = "AMDGPU"; +} + +def G_AMDGPU_FFBH_U32 : AMDGPUGenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type1:$src); + let hasSideEffects = 0; +} diff --git a/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp index ae8b967893a2..20db1c37f354 100644 --- a/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp +++ b/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp @@ -42,10 +42,7 @@ // // Future improvements: // -// - This currently relies on the scheduler to place loads and stores next to -// each other, and then only merges adjacent pairs of instructions. It would -// be good to be more flexible with interleaved instructions, and possibly run -// before scheduling. 
It currently missing stores of constants because loading +// - This is currently missing stores of constants because loading // the constant into the data register is placed between the stores, although // this is arguably a scheduling problem. // @@ -98,14 +95,9 @@ enum InstClassEnum { DS_READ, DS_WRITE, S_BUFFER_LOAD_IMM, - BUFFER_LOAD_OFFEN = AMDGPU::BUFFER_LOAD_DWORD_OFFEN, - BUFFER_LOAD_OFFSET = AMDGPU::BUFFER_LOAD_DWORD_OFFSET, - BUFFER_STORE_OFFEN = AMDGPU::BUFFER_STORE_DWORD_OFFEN, - BUFFER_STORE_OFFSET = AMDGPU::BUFFER_STORE_DWORD_OFFSET, - BUFFER_LOAD_OFFEN_exact = AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact, - BUFFER_LOAD_OFFSET_exact = AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact, - BUFFER_STORE_OFFEN_exact = AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact, - BUFFER_STORE_OFFSET_exact = AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact, + BUFFER_LOAD, + BUFFER_STORE, + MIMG, }; enum RegisterEnum { @@ -114,6 +106,7 @@ enum RegisterEnum { SOFFSET = 0x4, VADDR = 0x8, ADDR = 0x10, + SSAMP = 0x20, }; class SILoadStoreOptimizer : public MachineFunctionPass { @@ -126,6 +119,8 @@ class SILoadStoreOptimizer : public MachineFunctionPass { unsigned Width0; unsigned Width1; unsigned BaseOff; + unsigned DMask0; + unsigned DMask1; InstClassEnum InstClass; bool GLC0; bool GLC1; @@ -135,6 +130,60 @@ class SILoadStoreOptimizer : public MachineFunctionPass { bool DLC1; bool UseST64; SmallVector InstsToMove; + int AddrIdx[5]; + const MachineOperand *AddrReg[5]; + unsigned NumAddresses; + + bool hasSameBaseAddress(const MachineInstr &MI) { + for (unsigned i = 0; i < NumAddresses; i++) { + const MachineOperand &AddrRegNext = MI.getOperand(AddrIdx[i]); + + if (AddrReg[i]->isImm() || AddrRegNext.isImm()) { + if (AddrReg[i]->isImm() != AddrRegNext.isImm() || + AddrReg[i]->getImm() != AddrRegNext.getImm()) { + return false; + } + continue; + } + + // Check same base pointer. Be careful of subregisters, which can occur + // with vectors of pointers. + if (AddrReg[i]->getReg() != AddrRegNext.getReg() || + AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) { + return false; + } + } + return true; + } + + bool hasMergeableAddress(const MachineRegisterInfo &MRI) { + for (unsigned i = 0; i < NumAddresses; ++i) { + const MachineOperand *AddrOp = AddrReg[i]; + // Immediates are always OK. + if (AddrOp->isImm()) + continue; + + // Don't try to merge addresses that aren't either immediates or registers. + // TODO: Should be possible to merge FrameIndexes and maybe some other + // non-register operands. + if (!AddrOp->isReg()) + return false; + + // TODO: We should be able to merge physical reg addresses. + if (Register::isPhysicalRegister(AddrOp->getReg())) + return false; + + // If an address has only one use then there will be no other + // instructions with the same address, so we can't merge this one.
+ if (MRI.hasOneNonDBGUse(AddrOp->getReg())) + return false; + } + return true; + } + + void setMI(MachineBasicBlock::iterator MI, const SIInstrInfo &TII, + const GCNSubtarget &STM); + void setPaired(MachineBasicBlock::iterator MI, const SIInstrInfo &TII); }; struct BaseRegisters { @@ -160,14 +209,12 @@ private: AliasAnalysis *AA = nullptr; bool OptimizeAgain; + static bool dmasksCanBeCombined(const CombineInfo &CI, const SIInstrInfo &TII); static bool offsetsCanBeCombined(CombineInfo &CI); static bool widthsFit(const GCNSubtarget &STM, const CombineInfo &CI); static unsigned getNewOpcode(const CombineInfo &CI); static std::pair getSubRegIdxs(const CombineInfo &CI); const TargetRegisterClass *getTargetRegisterClass(const CombineInfo &CI); - unsigned getOpcodeWidth(const MachineInstr &MI); - InstClassEnum getInstClass(unsigned Opc); - unsigned getRegs(unsigned Opc); bool findMatchingInst(CombineInfo &CI); @@ -178,22 +225,27 @@ private: unsigned write2Opcode(unsigned EltSize) const; unsigned write2ST64Opcode(unsigned EltSize) const; MachineBasicBlock::iterator mergeWrite2Pair(CombineInfo &CI); + MachineBasicBlock::iterator mergeImagePair(CombineInfo &CI); MachineBasicBlock::iterator mergeSBufferLoadImmPair(CombineInfo &CI); MachineBasicBlock::iterator mergeBufferLoadPair(CombineInfo &CI); MachineBasicBlock::iterator mergeBufferStorePair(CombineInfo &CI); void updateBaseAndOffset(MachineInstr &I, unsigned NewBase, - int32_t NewOffset); - unsigned computeBase(MachineInstr &MI, const MemAddress &Addr); - MachineOperand createRegOrImm(int32_t Val, MachineInstr &MI); - Optional extractConstOffset(const MachineOperand &Op); - void processBaseWithConstOffset(const MachineOperand &Base, MemAddress &Addr); + int32_t NewOffset) const; + unsigned computeBase(MachineInstr &MI, const MemAddress &Addr) const; + MachineOperand createRegOrImm(int32_t Val, MachineInstr &MI) const; + Optional extractConstOffset(const MachineOperand &Op) const; + void processBaseWithConstOffset(const MachineOperand &Base, MemAddress &Addr) const; /// Promotes constant offset to the immediate by adjusting the base. It /// tries to use a base from the nearby instructions that allows it to have /// a 13bit constant offset which gets promoted to the immediate. 
bool promoteConstantOffsetToImm(MachineInstr &CI, MemInfoMap &Visited, - SmallPtrSet &Promoted); + SmallPtrSet &Promoted) const; + void addInstToMergeableList(const CombineInfo &CI, + std::list > &MergeableInsts) const; + bool collectMergeableInsts(MachineBasicBlock &MBB, + std::list > &MergeableInsts) const; public: static char ID; @@ -202,7 +254,11 @@ public: initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry()); } - bool optimizeBlock(MachineBasicBlock &MBB); + void removeCombinedInst(std::list &MergeList, + const MachineInstr &MI); + bool optimizeInstsWithSameBaseAddr(std::list &MergeList, + bool &OptimizeListAgain); + bool optimizeBlock(std::list > &MergeableInsts); bool runOnMachineFunction(MachineFunction &MF) override; @@ -216,6 +272,264 @@ public: } }; +static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) { + const unsigned Opc = MI.getOpcode(); + + if (TII.isMUBUF(Opc)) { + // FIXME: Handle d16 correctly + return AMDGPU::getMUBUFElements(Opc); + } + if (TII.isMIMG(MI)) { + uint64_t DMaskImm = + TII.getNamedOperand(MI, AMDGPU::OpName::dmask)->getImm(); + return countPopulation(DMaskImm); + } + + switch (Opc) { + case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: + return 1; + case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: + return 2; + case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: + return 4; + default: + return 0; + } +} + +/// Maps instruction opcode to enum InstClassEnum. +static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) { + switch (Opc) { + default: + if (TII.isMUBUF(Opc)) { + switch (AMDGPU::getMUBUFBaseOpcode(Opc)) { + default: + return UNKNOWN; + case AMDGPU::BUFFER_LOAD_DWORD_OFFEN: + case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact: + case AMDGPU::BUFFER_LOAD_DWORD_OFFSET: + case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact: + return BUFFER_LOAD; + case AMDGPU::BUFFER_STORE_DWORD_OFFEN: + case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact: + case AMDGPU::BUFFER_STORE_DWORD_OFFSET: + case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact: + return BUFFER_STORE; + } + } + if (TII.isMIMG(Opc)) { + // Ignore instructions encoded without vaddr. + if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr) == -1) + return UNKNOWN; + // TODO: Support IMAGE_GET_RESINFO and IMAGE_GET_LOD. + if (TII.get(Opc).mayStore() || !TII.get(Opc).mayLoad() || TII.isGather4(Opc)) + return UNKNOWN; + return MIMG; + } + return UNKNOWN; + case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: + case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: + case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: + return S_BUFFER_LOAD_IMM; + case AMDGPU::DS_READ_B32: + case AMDGPU::DS_READ_B32_gfx9: + case AMDGPU::DS_READ_B64: + case AMDGPU::DS_READ_B64_gfx9: + return DS_READ; + case AMDGPU::DS_WRITE_B32: + case AMDGPU::DS_WRITE_B32_gfx9: + case AMDGPU::DS_WRITE_B64: + case AMDGPU::DS_WRITE_B64_gfx9: + return DS_WRITE; + } +} + +/// Determines instruction subclass from opcode. Only instructions +/// of the same subclass can be merged together. 
+static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) { + switch (Opc) { + default: + if (TII.isMUBUF(Opc)) + return AMDGPU::getMUBUFBaseOpcode(Opc); + if (TII.isMIMG(Opc)) { + const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc); + assert(Info); + return Info->BaseOpcode; + } + return -1; + case AMDGPU::DS_READ_B32: + case AMDGPU::DS_READ_B32_gfx9: + case AMDGPU::DS_READ_B64: + case AMDGPU::DS_READ_B64_gfx9: + case AMDGPU::DS_WRITE_B32: + case AMDGPU::DS_WRITE_B32_gfx9: + case AMDGPU::DS_WRITE_B64: + case AMDGPU::DS_WRITE_B64_gfx9: + return Opc; + case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: + case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: + case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: + return AMDGPU::S_BUFFER_LOAD_DWORD_IMM; + } +} + +static unsigned getRegs(unsigned Opc, const SIInstrInfo &TII) { + if (TII.isMUBUF(Opc)) { + unsigned result = 0; + + if (AMDGPU::getMUBUFHasVAddr(Opc)) { + result |= VADDR; + } + + if (AMDGPU::getMUBUFHasSrsrc(Opc)) { + result |= SRSRC; + } + + if (AMDGPU::getMUBUFHasSoffset(Opc)) { + result |= SOFFSET; + } + + return result; + } + + if (TII.isMIMG(Opc)) { + unsigned result = VADDR | SRSRC; + const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc); + if (Info && AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode)->Sampler) + result |= SSAMP; + return result; + } + + switch (Opc) { + default: + return 0; + case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: + case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: + case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: + return SBASE; + case AMDGPU::DS_READ_B32: + case AMDGPU::DS_READ_B64: + case AMDGPU::DS_READ_B32_gfx9: + case AMDGPU::DS_READ_B64_gfx9: + case AMDGPU::DS_WRITE_B32: + case AMDGPU::DS_WRITE_B64: + case AMDGPU::DS_WRITE_B32_gfx9: + case AMDGPU::DS_WRITE_B64_gfx9: + return ADDR; + } +} + + +void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI, + const SIInstrInfo &TII, + const GCNSubtarget &STM) { + I = MI; + unsigned Opc = MI->getOpcode(); + InstClass = getInstClass(Opc, TII); + + if (InstClass == UNKNOWN) + return; + + switch (InstClass) { + case DS_READ: + EltSize = + (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8 + : 4; + break; + case DS_WRITE: + EltSize = + (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 
8 + : 4; + break; + case S_BUFFER_LOAD_IMM: + EltSize = AMDGPU::getSMRDEncodedOffset(STM, 4); + break; + default: + EltSize = 4; + break; + } + + if (InstClass == MIMG) { + DMask0 = TII.getNamedOperand(*I, AMDGPU::OpName::dmask)->getImm(); + } else { + int OffsetIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::offset); + Offset0 = I->getOperand(OffsetIdx).getImm(); + } + + Width0 = getOpcodeWidth(*I, TII); + + if ((InstClass == DS_READ) || (InstClass == DS_WRITE)) { + Offset0 &= 0xffff; + } else if (InstClass != MIMG) { + GLC0 = TII.getNamedOperand(*I, AMDGPU::OpName::glc)->getImm(); + if (InstClass != S_BUFFER_LOAD_IMM) { + SLC0 = TII.getNamedOperand(*I, AMDGPU::OpName::slc)->getImm(); + } + DLC0 = TII.getNamedOperand(*I, AMDGPU::OpName::dlc)->getImm(); + } + + unsigned AddrOpName[5] = {0}; + NumAddresses = 0; + const unsigned Regs = getRegs(I->getOpcode(), TII); + + if (Regs & ADDR) { + AddrOpName[NumAddresses++] = AMDGPU::OpName::addr; + } + + if (Regs & SBASE) { + AddrOpName[NumAddresses++] = AMDGPU::OpName::sbase; + } + + if (Regs & SRSRC) { + AddrOpName[NumAddresses++] = AMDGPU::OpName::srsrc; + } + + if (Regs & SOFFSET) { + AddrOpName[NumAddresses++] = AMDGPU::OpName::soffset; + } + + if (Regs & VADDR) { + AddrOpName[NumAddresses++] = AMDGPU::OpName::vaddr; + } + + if (Regs & SSAMP) { + AddrOpName[NumAddresses++] = AMDGPU::OpName::ssamp; + } + + for (unsigned i = 0; i < NumAddresses; i++) { + AddrIdx[i] = AMDGPU::getNamedOperandIdx(I->getOpcode(), AddrOpName[i]); + AddrReg[i] = &I->getOperand(AddrIdx[i]); + } + + InstsToMove.clear(); +} + +void SILoadStoreOptimizer::CombineInfo::setPaired(MachineBasicBlock::iterator MI, + const SIInstrInfo &TII) { + Paired = MI; + assert(InstClass == getInstClass(Paired->getOpcode(), TII)); + + if (InstClass == MIMG) { + DMask1 = TII.getNamedOperand(*Paired, AMDGPU::OpName::dmask)->getImm(); + } else { + int OffsetIdx = + AMDGPU::getNamedOperandIdx(I->getOpcode(), AMDGPU::OpName::offset); + Offset1 = Paired->getOperand(OffsetIdx).getImm(); + } + + Width1 = getOpcodeWidth(*Paired, TII); + if ((InstClass == DS_READ) || (InstClass == DS_WRITE)) { + Offset1 &= 0xffff; + } else if (InstClass != MIMG) { + GLC1 = TII.getNamedOperand(*Paired, AMDGPU::OpName::glc)->getImm(); + if (InstClass != S_BUFFER_LOAD_IMM) { + SLC1 = TII.getNamedOperand(*Paired, AMDGPU::OpName::slc)->getImm(); + } + DLC1 = TII.getNamedOperand(*Paired, AMDGPU::OpName::dlc)->getImm(); + } +} + + } // end anonymous namespace. INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE, @@ -249,8 +563,7 @@ static void addDefsUsesToList(const MachineInstr &MI, if (Op.isReg()) { if (Op.isDef()) RegDefs.insert(Op.getReg()); - else if (Op.readsReg() && - TargetRegisterInfo::isPhysicalRegister(Op.getReg())) + else if (Op.readsReg() && Register::isPhysicalRegister(Op.getReg())) PhysRegUses.insert(Op.getReg()); } } @@ -282,7 +595,7 @@ static bool addToListsIfDependent(MachineInstr &MI, DenseSet &RegDefs, if (Use.isReg() && ((Use.readsReg() && RegDefs.count(Use.getReg())) || (Use.isDef() && RegDefs.count(Use.getReg())) || - (Use.isDef() && TargetRegisterInfo::isPhysicalRegister(Use.getReg()) && + (Use.isDef() && Register::isPhysicalRegister(Use.getReg()) && PhysRegUses.count(Use.getReg())))) { Insts.push_back(&MI); addDefsUsesToList(MI, RegDefs, PhysRegUses); @@ -307,7 +620,59 @@ static bool canMoveInstsAcrossMemOp(MachineInstr &MemOp, return true; } +// This function assumes that \p A and \p B are identical except for +// size and offset, and they reference adjacent memory.
+static MachineMemOperand *combineKnownAdjacentMMOs(MachineFunction &MF, + const MachineMemOperand *A, + const MachineMemOperand *B) { + unsigned MinOffset = std::min(A->getOffset(), B->getOffset()); + unsigned Size = A->getSize() + B->getSize(); + // This function adds the offset parameter to the existing offset for A, + // so we pass 0 here as the offset and then manually set it to the correct + // value after the call. + MachineMemOperand *MMO = MF.getMachineMemOperand(A, 0, Size); + MMO->setOffset(MinOffset); + return MMO; +} + +bool SILoadStoreOptimizer::dmasksCanBeCombined(const CombineInfo &CI, const SIInstrInfo &TII) { + assert(CI.InstClass == MIMG); + + // Ignore instructions with tfe/lwe set. + const auto *TFEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::tfe); + const auto *LWEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::lwe); + + if ((TFEOp && TFEOp->getImm()) || (LWEOp && LWEOp->getImm())) + return false; + + // Check other optional immediate operands for equality. + unsigned OperandsToMatch[] = {AMDGPU::OpName::glc, AMDGPU::OpName::slc, + AMDGPU::OpName::d16, AMDGPU::OpName::unorm, + AMDGPU::OpName::da, AMDGPU::OpName::r128}; + + for (auto op : OperandsToMatch) { + int Idx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), op); + if (AMDGPU::getNamedOperandIdx(CI.Paired->getOpcode(), op) != Idx) + return false; + if (Idx != -1 && + CI.I->getOperand(Idx).getImm() != CI.Paired->getOperand(Idx).getImm()) + return false; + } + + // Check DMask for overlaps. + unsigned MaxMask = std::max(CI.DMask0, CI.DMask1); + unsigned MinMask = std::min(CI.DMask0, CI.DMask1); + + unsigned AllowedBitsForMin = llvm::countTrailingZeros(MaxMask); + if ((1u << AllowedBitsForMin) <= MinMask) + return false; + + return true; +} + bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI) { + assert(CI.InstClass != MIMG); + // XXX - Would the same offset be OK? Is there any reason this would happen or // be useful? if (CI.Offset0 == CI.Offset1) @@ -384,164 +749,24 @@ bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM, } } -unsigned SILoadStoreOptimizer::getOpcodeWidth(const MachineInstr &MI) { - const unsigned Opc = MI.getOpcode(); - - if (TII->isMUBUF(MI)) { - return AMDGPU::getMUBUFDwords(Opc); - } - - switch (Opc) { - default: - return 0; - case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: - return 1; - case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: - return 2; - case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: - return 4; - } -} - -InstClassEnum SILoadStoreOptimizer::getInstClass(unsigned Opc) { - if (TII->isMUBUF(Opc)) { - const int baseOpcode = AMDGPU::getMUBUFBaseOpcode(Opc); - - // If we couldn't identify the opcode, bail out. 
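In dmasksCanBeCombined above, the final check only accepts MIMG pairs whose dmasks occupy disjoint, non-interleaved channel ranges: every bit of the smaller mask has to sit below the lowest set bit of the larger one. A standalone sketch of that test with a couple of worked values; dmasksMergeable and ctz32 are invented names, and ctz32 stands in for llvm::countTrailingZeros:

    #include <algorithm>
    #include <cassert>
    #include <cstdint>

    static unsigned ctz32(uint32_t V) { // index of the lowest set bit, V != 0
      unsigned N = 0;
      while ((V & 1u) == 0) { V >>= 1; ++N; }
      return N;
    }

    // Mirrors the countTrailingZeros check: the smaller dmask must fit
    // entirely below the lowest set bit of the larger dmask.
    static bool dmasksMergeable(uint32_t DMask0, uint32_t DMask1) {
      uint32_t MaxMask = std::max(DMask0, DMask1);
      uint32_t MinMask = std::min(DMask0, DMask1);
      return (1u << ctz32(MaxMask)) > MinMask;
    }

    int main() {
      assert(dmasksMergeable(0x3, 0xC));  // xy + zw channels -> merged dmask 0xF
      assert(!dmasksMergeable(0x5, 0x2)); // interleaved channels -> rejected
    }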
- if (baseOpcode == -1) { - return UNKNOWN; - } - - switch (baseOpcode) { - default: - return UNKNOWN; - case AMDGPU::BUFFER_LOAD_DWORD_OFFEN: - return BUFFER_LOAD_OFFEN; - case AMDGPU::BUFFER_LOAD_DWORD_OFFSET: - return BUFFER_LOAD_OFFSET; - case AMDGPU::BUFFER_STORE_DWORD_OFFEN: - return BUFFER_STORE_OFFEN; - case AMDGPU::BUFFER_STORE_DWORD_OFFSET: - return BUFFER_STORE_OFFSET; - case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact: - return BUFFER_LOAD_OFFEN_exact; - case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact: - return BUFFER_LOAD_OFFSET_exact; - case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact: - return BUFFER_STORE_OFFEN_exact; - case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact: - return BUFFER_STORE_OFFSET_exact; - } - } - - switch (Opc) { - default: - return UNKNOWN; - case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: - case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: - case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: - return S_BUFFER_LOAD_IMM; - case AMDGPU::DS_READ_B32: - case AMDGPU::DS_READ_B64: - case AMDGPU::DS_READ_B32_gfx9: - case AMDGPU::DS_READ_B64_gfx9: - return DS_READ; - case AMDGPU::DS_WRITE_B32: - case AMDGPU::DS_WRITE_B64: - case AMDGPU::DS_WRITE_B32_gfx9: - case AMDGPU::DS_WRITE_B64_gfx9: - return DS_WRITE; - } -} - -unsigned SILoadStoreOptimizer::getRegs(unsigned Opc) { - if (TII->isMUBUF(Opc)) { - unsigned result = 0; - - if (AMDGPU::getMUBUFHasVAddr(Opc)) { - result |= VADDR; - } - - if (AMDGPU::getMUBUFHasSrsrc(Opc)) { - result |= SRSRC; - } - - if (AMDGPU::getMUBUFHasSoffset(Opc)) { - result |= SOFFSET; - } - - return result; - } - - switch (Opc) { - default: - return 0; - case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: - case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: - case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: - return SBASE; - case AMDGPU::DS_READ_B32: - case AMDGPU::DS_READ_B64: - case AMDGPU::DS_READ_B32_gfx9: - case AMDGPU::DS_READ_B64_gfx9: - case AMDGPU::DS_WRITE_B32: - case AMDGPU::DS_WRITE_B64: - case AMDGPU::DS_WRITE_B32_gfx9: - case AMDGPU::DS_WRITE_B64_gfx9: - return ADDR; - } -} - bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) { MachineBasicBlock *MBB = CI.I->getParent(); MachineBasicBlock::iterator E = MBB->end(); MachineBasicBlock::iterator MBBI = CI.I; const unsigned Opc = CI.I->getOpcode(); - const InstClassEnum InstClass = getInstClass(Opc); + const InstClassEnum InstClass = getInstClass(Opc, *TII); if (InstClass == UNKNOWN) { return false; } + const unsigned InstSubclass = getInstSubclass(Opc, *TII); - const unsigned Regs = getRegs(Opc); - - unsigned AddrOpName[5] = {0}; - int AddrIdx[5]; - const MachineOperand *AddrReg[5]; - unsigned NumAddresses = 0; - - if (Regs & ADDR) { - AddrOpName[NumAddresses++] = AMDGPU::OpName::addr; - } - - if (Regs & SBASE) { - AddrOpName[NumAddresses++] = AMDGPU::OpName::sbase; - } - - if (Regs & SRSRC) { - AddrOpName[NumAddresses++] = AMDGPU::OpName::srsrc; - } - - if (Regs & SOFFSET) { - AddrOpName[NumAddresses++] = AMDGPU::OpName::soffset; - } - - if (Regs & VADDR) { - AddrOpName[NumAddresses++] = AMDGPU::OpName::vaddr; - } - - for (unsigned i = 0; i < NumAddresses; i++) { - AddrIdx[i] = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AddrOpName[i]); - AddrReg[i] = &CI.I->getOperand(AddrIdx[i]); - - // We only ever merge operations with the same base address register, so - // don't bother scanning forward if there are no other uses. - if (AddrReg[i]->isReg() && - (TargetRegisterInfo::isPhysicalRegister(AddrReg[i]->getReg()) || - MRI->hasOneNonDBGUse(AddrReg[i]->getReg()))) - return false; - } + // Do not merge VMEM buffer instructions with "swizzled" bit set. 
+ int Swizzled = + AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::swz); + if (Swizzled != -1 && CI.I->getOperand(Swizzled).getImm()) + return false; ++MBBI; @@ -550,11 +775,10 @@ bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) { addDefsUsesToList(*CI.I, RegDefsToMove, PhysRegUsesToMove); for (; MBBI != E; ++MBBI) { - const bool IsDS = (InstClass == DS_READ) || (InstClass == DS_WRITE); - if ((getInstClass(MBBI->getOpcode()) != InstClass) || - (IsDS && (MBBI->getOpcode() != Opc))) { - // This is not a matching DS instruction, but we can keep looking as + if ((getInstClass(MBBI->getOpcode(), *TII) != InstClass) || + (getInstSubclass(MBBI->getOpcode(), *TII) != InstSubclass)) { + // This is not a matching instruction, but we can keep looking as // long as one of these conditions are met: // 1. It is safe to move I down past MBBI. // 2. It is safe to move MBBI down past the instruction that I will @@ -599,58 +823,23 @@ bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) { CI.InstsToMove)) continue; - bool Match = true; - for (unsigned i = 0; i < NumAddresses; i++) { - const MachineOperand &AddrRegNext = MBBI->getOperand(AddrIdx[i]); - - if (AddrReg[i]->isImm() || AddrRegNext.isImm()) { - if (AddrReg[i]->isImm() != AddrRegNext.isImm() || - AddrReg[i]->getImm() != AddrRegNext.getImm()) { - Match = false; - break; - } - continue; - } - - // Check same base pointer. Be careful of subregisters, which can occur - // with vectors of pointers. - if (AddrReg[i]->getReg() != AddrRegNext.getReg() || - AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) { - Match = false; - break; - } - } + bool Match = CI.hasSameBaseAddress(*MBBI); if (Match) { - int OffsetIdx = - AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::offset); - CI.Offset0 = CI.I->getOperand(OffsetIdx).getImm(); - CI.Width0 = getOpcodeWidth(*CI.I); - CI.Offset1 = MBBI->getOperand(OffsetIdx).getImm(); - CI.Width1 = getOpcodeWidth(*MBBI); - CI.Paired = MBBI; - - if ((CI.InstClass == DS_READ) || (CI.InstClass == DS_WRITE)) { - CI.Offset0 &= 0xffff; - CI.Offset1 &= 0xffff; - } else { - CI.GLC0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::glc)->getImm(); - CI.GLC1 = TII->getNamedOperand(*MBBI, AMDGPU::OpName::glc)->getImm(); - if (CI.InstClass != S_BUFFER_LOAD_IMM) { - CI.SLC0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::slc)->getImm(); - CI.SLC1 = TII->getNamedOperand(*MBBI, AMDGPU::OpName::slc)->getImm(); - } - CI.DLC0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::dlc)->getImm(); - CI.DLC1 = TII->getNamedOperand(*MBBI, AMDGPU::OpName::dlc)->getImm(); - } + CI.setPaired(MBBI, *TII); + + // Check both offsets (or masks for MIMG) can be combined and fit in the + // reduced range. + bool canBeCombined = + CI.InstClass == MIMG + ? dmasksCanBeCombined(CI, *TII) + : widthsFit(*STM, CI) && offsetsCanBeCombined(CI); - // Check both offsets fit in the reduced range. // We also need to go through the list of instructions that we plan to // move and make sure they are all safe to move down past the merged // instruction. - if (widthsFit(*STM, CI) && offsetsCanBeCombined(CI)) - if (canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, AA)) - return true; + if (canBeCombined && canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, AA)) + return true; } // We've found a load/store that we couldn't merge for some reason. @@ -711,15 +900,15 @@ SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI) { const TargetRegisterClass *SuperRC = (CI.EltSize == 4) ? 
&AMDGPU::VReg_64RegClass : &AMDGPU::VReg_128RegClass; - unsigned DestReg = MRI->createVirtualRegister(SuperRC); + Register DestReg = MRI->createVirtualRegister(SuperRC); DebugLoc DL = CI.I->getDebugLoc(); - unsigned BaseReg = AddrReg->getReg(); + Register BaseReg = AddrReg->getReg(); unsigned BaseSubReg = AddrReg->getSubReg(); unsigned BaseRegFlags = 0; if (CI.BaseOff) { - unsigned ImmReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass); + Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg) .addImm(CI.BaseOff); @@ -755,12 +944,11 @@ SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI) { moveInstsAfter(Copy1, CI.InstsToMove); - MachineBasicBlock::iterator Next = std::next(CI.I); CI.I->eraseFromParent(); CI.Paired->eraseFromParent(); LLVM_DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n'); - return Next; + return Read2; } unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const { @@ -809,11 +997,11 @@ SILoadStoreOptimizer::mergeWrite2Pair(CombineInfo &CI) { const MCInstrDesc &Write2Desc = TII->get(Opc); DebugLoc DL = CI.I->getDebugLoc(); - unsigned BaseReg = AddrReg->getReg(); + Register BaseReg = AddrReg->getReg(); unsigned BaseSubReg = AddrReg->getSubReg(); unsigned BaseRegFlags = 0; if (CI.BaseOff) { - unsigned ImmReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass); + Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg) .addImm(CI.BaseOff); @@ -839,12 +1027,65 @@ SILoadStoreOptimizer::mergeWrite2Pair(CombineInfo &CI) { moveInstsAfter(Write2, CI.InstsToMove); - MachineBasicBlock::iterator Next = std::next(CI.I); CI.I->eraseFromParent(); CI.Paired->eraseFromParent(); LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n'); - return Next; + return Write2; +} + +MachineBasicBlock::iterator +SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI) { + MachineBasicBlock *MBB = CI.I->getParent(); + DebugLoc DL = CI.I->getDebugLoc(); + const unsigned Opcode = getNewOpcode(CI); + + const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI); + + Register DestReg = MRI->createVirtualRegister(SuperRC); + unsigned MergedDMask = CI.DMask0 | CI.DMask1; + unsigned DMaskIdx = + AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::dmask); + + auto MIB = BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg); + for (unsigned I = 1, E = (*CI.I).getNumOperands(); I != E; ++I) { + if (I == DMaskIdx) + MIB.addImm(MergedDMask); + else + MIB.add((*CI.I).getOperand(I)); + } + + // It shouldn't be possible to get this far if the two instructions + // don't have a single memoperand, because MachineInstr::mayAlias() + // will return true if this is the case. + assert(CI.I->hasOneMemOperand() && CI.Paired->hasOneMemOperand()); + + const MachineMemOperand *MMOa = *CI.I->memoperands_begin(); + const MachineMemOperand *MMOb = *CI.Paired->memoperands_begin(); + + MachineInstr *New = MIB.addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb)); + + std::pair SubRegIdx = getSubRegIdxs(CI); + const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); + const unsigned SubRegIdx1 = std::get<1>(SubRegIdx); + + // Copy to the old destination registers. 
+ const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); + const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); + const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdata); + + BuildMI(*MBB, CI.Paired, DL, CopyDesc) + .add(*Dest0) // Copy to same destination including flags and sub reg. + .addReg(DestReg, 0, SubRegIdx0); + MachineInstr *Copy1 = BuildMI(*MBB, CI.Paired, DL, CopyDesc) + .add(*Dest1) + .addReg(DestReg, RegState::Kill, SubRegIdx1); + + moveInstsAfter(Copy1, CI.InstsToMove); + + CI.I->eraseFromParent(); + CI.Paired->eraseFromParent(); + return New; } MachineBasicBlock::iterator @@ -855,15 +1096,24 @@ SILoadStoreOptimizer::mergeSBufferLoadImmPair(CombineInfo &CI) { const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI); - unsigned DestReg = MRI->createVirtualRegister(SuperRC); + Register DestReg = MRI->createVirtualRegister(SuperRC); unsigned MergedOffset = std::min(CI.Offset0, CI.Offset1); - BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg) - .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase)) - .addImm(MergedOffset) // offset - .addImm(CI.GLC0) // glc - .addImm(CI.DLC0) // dlc - .cloneMergedMemRefs({&*CI.I, &*CI.Paired}); + // It shouldn't be possible to get this far if the two instructions + // don't have a single memoperand, because MachineInstr::mayAlias() + // will return true if this is the case. + assert(CI.I->hasOneMemOperand() && CI.Paired->hasOneMemOperand()); + + const MachineMemOperand *MMOa = *CI.I->memoperands_begin(); + const MachineMemOperand *MMOb = *CI.Paired->memoperands_begin(); + + MachineInstr *New = + BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg) + .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase)) + .addImm(MergedOffset) // offset + .addImm(CI.GLC0) // glc + .addImm(CI.DLC0) // dlc + .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb)); std::pair SubRegIdx = getSubRegIdxs(CI); const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); @@ -883,10 +1133,9 @@ SILoadStoreOptimizer::mergeSBufferLoadImmPair(CombineInfo &CI) { moveInstsAfter(Copy1, CI.InstsToMove); - MachineBasicBlock::iterator Next = std::next(CI.I); CI.I->eraseFromParent(); CI.Paired->eraseFromParent(); - return Next; + return New; } MachineBasicBlock::iterator @@ -899,24 +1148,34 @@ SILoadStoreOptimizer::mergeBufferLoadPair(CombineInfo &CI) { const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI); // Copy to the new source register. - unsigned DestReg = MRI->createVirtualRegister(SuperRC); + Register DestReg = MRI->createVirtualRegister(SuperRC); unsigned MergedOffset = std::min(CI.Offset0, CI.Offset1); auto MIB = BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg); - const unsigned Regs = getRegs(Opcode); + const unsigned Regs = getRegs(Opcode, *TII); if (Regs & VADDR) MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)); - MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc)) - .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)) - .addImm(MergedOffset) // offset - .addImm(CI.GLC0) // glc - .addImm(CI.SLC0) // slc - .addImm(0) // tfe - .addImm(CI.DLC0) // dlc - .cloneMergedMemRefs({&*CI.I, &*CI.Paired}); + // It shouldn't be possible to get this far if the two instructions + // don't have a single memoperand, because MachineInstr::mayAlias() + // will return true if this is the case. 
+ assert(CI.I->hasOneMemOperand() && CI.Paired->hasOneMemOperand()); + + const MachineMemOperand *MMOa = *CI.I->memoperands_begin(); + const MachineMemOperand *MMOb = *CI.Paired->memoperands_begin(); + + MachineInstr *New = + MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc)) + .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)) + .addImm(MergedOffset) // offset + .addImm(CI.GLC0) // glc + .addImm(CI.SLC0) // slc + .addImm(0) // tfe + .addImm(CI.DLC0) // dlc + .addImm(0) // swz + .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb)); std::pair SubRegIdx = getSubRegIdxs(CI); const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); @@ -936,10 +1195,9 @@ SILoadStoreOptimizer::mergeBufferLoadPair(CombineInfo &CI) { moveInstsAfter(Copy1, CI.InstsToMove); - MachineBasicBlock::iterator Next = std::next(CI.I); CI.I->eraseFromParent(); CI.Paired->eraseFromParent(); - return Next; + return New; } unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI) { @@ -947,7 +1205,10 @@ unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI) { switch (CI.InstClass) { default: - return AMDGPU::getMUBUFOpcode(CI.InstClass, Width); + assert(CI.InstClass == BUFFER_LOAD || CI.InstClass == BUFFER_STORE); + // FIXME: Handle d16 correctly + return AMDGPU::getMUBUFOpcode(AMDGPU::getMUBUFBaseOpcode(CI.I->getOpcode()), + Width); case UNKNOWN: llvm_unreachable("Unknown instruction class"); case S_BUFFER_LOAD_IMM: @@ -959,76 +1220,47 @@ unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI) { case 4: return AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM; } + case MIMG: + assert("No overlaps" && (countPopulation(CI.DMask0 | CI.DMask1) == Width)); + return AMDGPU::getMaskedMIMGOp(CI.I->getOpcode(), Width); } } std::pair SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI) { - if (CI.Offset0 > CI.Offset1) { - switch (CI.Width0) { - default: - return std::make_pair(0, 0); - case 1: - switch (CI.Width1) { - default: - return std::make_pair(0, 0); - case 1: - return std::make_pair(AMDGPU::sub1, AMDGPU::sub0); - case 2: - return std::make_pair(AMDGPU::sub2, AMDGPU::sub0_sub1); - case 3: - return std::make_pair(AMDGPU::sub3, AMDGPU::sub0_sub1_sub2); - } - case 2: - switch (CI.Width1) { - default: - return std::make_pair(0, 0); - case 1: - return std::make_pair(AMDGPU::sub1_sub2, AMDGPU::sub0); - case 2: - return std::make_pair(AMDGPU::sub2_sub3, AMDGPU::sub0_sub1); - } - case 3: - switch (CI.Width1) { - default: - return std::make_pair(0, 0); - case 1: - return std::make_pair(AMDGPU::sub1_sub2_sub3, AMDGPU::sub0); - } - } + + if (CI.Width0 == 0 || CI.Width0 == 0 || CI.Width0 + CI.Width1 > 4) + return std::make_pair(0, 0); + + bool ReverseOrder; + if (CI.InstClass == MIMG) { + assert((countPopulation(CI.DMask0 | CI.DMask1) == CI.Width0 + CI.Width1) && + "No overlaps"); + ReverseOrder = CI.DMask0 > CI.DMask1; + } else + ReverseOrder = CI.Offset0 > CI.Offset1; + + static const unsigned Idxs[4][4] = { + {AMDGPU::sub0, AMDGPU::sub0_sub1, AMDGPU::sub0_sub1_sub2, AMDGPU::sub0_sub1_sub2_sub3}, + {AMDGPU::sub1, AMDGPU::sub1_sub2, AMDGPU::sub1_sub2_sub3, 0}, + {AMDGPU::sub2, AMDGPU::sub2_sub3, 0, 0}, + {AMDGPU::sub3, 0, 0, 0}, + }; + unsigned Idx0; + unsigned Idx1; + + assert(CI.Width0 >= 1 && CI.Width0 <= 3); + assert(CI.Width1 >= 1 && CI.Width1 <= 3); + + if (ReverseOrder) { + Idx1 = Idxs[0][CI.Width1 - 1]; + Idx0 = Idxs[CI.Width1][CI.Width0 - 1]; } else { - switch (CI.Width0) { - default: - return std::make_pair(0, 0); - case 1: - switch (CI.Width1) { - default: - return 
std::make_pair(0, 0); - case 1: - return std::make_pair(AMDGPU::sub0, AMDGPU::sub1); - case 2: - return std::make_pair(AMDGPU::sub0, AMDGPU::sub1_sub2); - case 3: - return std::make_pair(AMDGPU::sub0, AMDGPU::sub1_sub2_sub3); - } - case 2: - switch (CI.Width1) { - default: - return std::make_pair(0, 0); - case 1: - return std::make_pair(AMDGPU::sub0_sub1, AMDGPU::sub2); - case 2: - return std::make_pair(AMDGPU::sub0_sub1, AMDGPU::sub2_sub3); - } - case 3: - switch (CI.Width1) { - default: - return std::make_pair(0, 0); - case 1: - return std::make_pair(AMDGPU::sub0_sub1_sub2, AMDGPU::sub3); - } - } + Idx0 = Idxs[0][CI.Width0 - 1]; + Idx1 = Idxs[CI.Width0][CI.Width1 - 1]; } + + return std::make_pair(Idx0, Idx1); } const TargetRegisterClass * @@ -1040,7 +1272,7 @@ SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI) { case 2: return &AMDGPU::SReg_64_XEXECRegClass; case 4: - return &AMDGPU::SReg_128RegClass; + return &AMDGPU::SGPR_128RegClass; case 8: return &AMDGPU::SReg_256RegClass; case 16: @@ -1073,7 +1305,7 @@ SILoadStoreOptimizer::mergeBufferStorePair(CombineInfo &CI) { // Copy to the new source register. const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI); - unsigned SrcReg = MRI->createVirtualRegister(SuperRC); + Register SrcReg = MRI->createVirtualRegister(SuperRC); const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); const auto *Src1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdata); @@ -1087,35 +1319,45 @@ SILoadStoreOptimizer::mergeBufferStorePair(CombineInfo &CI) { auto MIB = BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode)) .addReg(SrcReg, RegState::Kill); - const unsigned Regs = getRegs(Opcode); + const unsigned Regs = getRegs(Opcode, *TII); if (Regs & VADDR) MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)); - MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc)) - .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)) - .addImm(std::min(CI.Offset0, CI.Offset1)) // offset - .addImm(CI.GLC0) // glc - .addImm(CI.SLC0) // slc - .addImm(0) // tfe - .addImm(CI.DLC0) // dlc - .cloneMergedMemRefs({&*CI.I, &*CI.Paired}); + + // It shouldn't be possible to get this far if the two instructions + // don't have a single memoperand, because MachineInstr::mayAlias() + // will return true if this is the case. 
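The Idxs table in getSubRegIdxs above picks one subregister index per merged access from the slice's starting dword and its width. A standalone worked example, with strings standing in for the AMDGPU::sub* enumerators and subRegIdxs as an invented name; widths are assumed to be 1-3 dwords, as in the asserts above:

    #include <cassert>
    #include <string>
    #include <utility>

    // Row = first dword of the slice, column = width - 1.
    static std::pair<std::string, std::string>
    subRegIdxs(unsigned Width0, unsigned Width1, bool ReverseOrder) {
      static const char *Idxs[4][4] = {
          {"sub0", "sub0_sub1", "sub0_sub1_sub2", "sub0_sub1_sub2_sub3"},
          {"sub1", "sub1_sub2", "sub1_sub2_sub3", ""},
          {"sub2", "sub2_sub3", "", ""},
          {"sub3", "", "", ""},
      };
      if (ReverseOrder) // the second access holds the low dwords
        return {Idxs[Width1][Width0 - 1], Idxs[0][Width1 - 1]};
      return {Idxs[0][Width0 - 1], Idxs[Width0][Width1 - 1]};
    }

    int main() {
      auto P = subRegIdxs(1, 2, false); // dword + dwordx2, in offset order
      assert(P.first == "sub0" && P.second == "sub1_sub2");
      auto Q = subRegIdxs(1, 2, true);  // same pair, offsets reversed
      assert(Q.first == "sub2" && Q.second == "sub0_sub1");
    }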
+ assert(CI.I->hasOneMemOperand() && CI.Paired->hasOneMemOperand()); + + const MachineMemOperand *MMOa = *CI.I->memoperands_begin(); + const MachineMemOperand *MMOb = *CI.Paired->memoperands_begin(); + + MachineInstr *New = + MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc)) + .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)) + .addImm(std::min(CI.Offset0, CI.Offset1)) // offset + .addImm(CI.GLC0) // glc + .addImm(CI.SLC0) // slc + .addImm(0) // tfe + .addImm(CI.DLC0) // dlc + .addImm(0) // swz + .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb)); moveInstsAfter(MIB, CI.InstsToMove); - MachineBasicBlock::iterator Next = std::next(CI.I); CI.I->eraseFromParent(); CI.Paired->eraseFromParent(); - return Next; + return New; } MachineOperand -SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &MI) { +SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &MI) const { APInt V(32, Val, true); if (TII->isInlineConstant(V)) return MachineOperand::CreateImm(Val); - unsigned Reg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); + Register Reg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); MachineInstr *Mov = BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(), TII->get(AMDGPU::S_MOV_B32), Reg) @@ -1127,7 +1369,7 @@ SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &MI) { // Compute base address using Addr and return the final register. unsigned SILoadStoreOptimizer::computeBase(MachineInstr &MI, - const MemAddress &Addr) { + const MemAddress &Addr) const { MachineBasicBlock *MBB = MI.getParent(); MachineBasicBlock::iterator MBBI = MI.getIterator(); DebugLoc DL = MI.getDebugLoc(); @@ -1146,11 +1388,11 @@ unsigned SILoadStoreOptimizer::computeBase(MachineInstr &MI, createRegOrImm(static_cast(Addr.Offset >> 32), MI); const auto *CarryRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID); - unsigned CarryReg = MRI->createVirtualRegister(CarryRC); - unsigned DeadCarryReg = MRI->createVirtualRegister(CarryRC); + Register CarryReg = MRI->createVirtualRegister(CarryRC); + Register DeadCarryReg = MRI->createVirtualRegister(CarryRC); - unsigned DestSub0 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); - unsigned DestSub1 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); + Register DestSub0 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); + Register DestSub1 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); MachineInstr *LoHalf = BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADD_I32_e64), DestSub0) .addReg(CarryReg, RegState::Define) @@ -1170,7 +1412,7 @@ unsigned SILoadStoreOptimizer::computeBase(MachineInstr &MI, (void)HiHalf; LLVM_DEBUG(dbgs() << " "; HiHalf->dump();); - unsigned FullDestReg = MRI->createVirtualRegister(&AMDGPU::VReg_64RegClass); + Register FullDestReg = MRI->createVirtualRegister(&AMDGPU::VReg_64RegClass); MachineInstr *FullBase = BuildMI(*MBB, MBBI, DL, TII->get(TargetOpcode::REG_SEQUENCE), FullDestReg) .addReg(DestSub0) @@ -1186,13 +1428,13 @@ unsigned SILoadStoreOptimizer::computeBase(MachineInstr &MI, // Update base and offset with the NewBase and NewOffset in MI. 
void SILoadStoreOptimizer::updateBaseAndOffset(MachineInstr &MI, unsigned NewBase, - int32_t NewOffset) { + int32_t NewOffset) const { TII->getNamedOperand(MI, AMDGPU::OpName::vaddr)->setReg(NewBase); TII->getNamedOperand(MI, AMDGPU::OpName::offset)->setImm(NewOffset); } Optional -SILoadStoreOptimizer::extractConstOffset(const MachineOperand &Op) { +SILoadStoreOptimizer::extractConstOffset(const MachineOperand &Op) const { if (Op.isImm()) return Op.getImm(); @@ -1218,7 +1460,7 @@ SILoadStoreOptimizer::extractConstOffset(const MachineOperand &Op) { // %Base:vreg_64 = // REG_SEQUENCE %LO:vgpr_32, %subreg.sub0, %HI:vgpr_32, %subreg.sub1 void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base, - MemAddress &Addr) { + MemAddress &Addr) const { if (!Base.isReg()) return; @@ -1273,15 +1515,16 @@ void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base bool SILoadStoreOptimizer::promoteConstantOffsetToImm( MachineInstr &MI, MemInfoMap &Visited, - SmallPtrSet &AnchorList) { + SmallPtrSet &AnchorList) const { + + if (!(MI.mayLoad() ^ MI.mayStore())) + return false; // TODO: Support flat and scratch. - if (AMDGPU::getGlobalSaddrOp(MI.getOpcode()) < 0 || - TII->getNamedOperand(MI, AMDGPU::OpName::vdata) != NULL) + if (AMDGPU::getGlobalSaddrOp(MI.getOpcode()) < 0) return false; - // TODO: Support Store. - if (!MI.mayLoad()) + if (MI.mayLoad() && TII->getNamedOperand(MI, AMDGPU::OpName::vdata) != NULL) return false; if (AnchorList.count(&MI)) @@ -1418,100 +1661,166 @@ bool SILoadStoreOptimizer::promoteConstantOffsetToImm( return false; } -// Scan through looking for adjacent LDS operations with constant offsets from -// the same base register. We rely on the scheduler to do the hard work of -// clustering nearby loads, and assume these are all adjacent. -bool SILoadStoreOptimizer::optimizeBlock(MachineBasicBlock &MBB) { - bool Modified = false; +void SILoadStoreOptimizer::addInstToMergeableList(const CombineInfo &CI, + std::list > &MergeableInsts) const { + for (std::list &AddrList : MergeableInsts) { + if (AddrList.front().hasSameBaseAddress(*CI.I) && + AddrList.front().InstClass == CI.InstClass) { + AddrList.emplace_back(CI); + return; + } + } + + // Base address not found, so add a new list. + MergeableInsts.emplace_back(1, CI); +} +bool SILoadStoreOptimizer::collectMergeableInsts(MachineBasicBlock &MBB, + std::list > &MergeableInsts) const { + bool Modified = false; // Contain the list MemInfoMap Visited; // Contains the list of instructions for which constant offsets are being // promoted to the IMM. SmallPtrSet AnchorList; - for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;) { - MachineInstr &MI = *I; - + // Sort potential mergeable instructions into lists. One list per base address. + for (MachineInstr &MI : MBB.instrs()) { + // We run this before checking if an address is mergeable, because it can produce + // better code even if the instructions aren't mergeable. if (promoteConstantOffsetToImm(MI, Visited, AnchorList)) Modified = true; + const InstClassEnum InstClass = getInstClass(MI.getOpcode(), *TII); + if (InstClass == UNKNOWN) + continue; + // Don't combine if volatile. 
- if (MI.hasOrderedMemoryRef()) { - ++I; + if (MI.hasOrderedMemoryRef()) + continue; + + CombineInfo CI; + CI.setMI(MI, *TII, *STM); + + if (!CI.hasMergeableAddress(*MRI)) + continue; + + addInstToMergeableList(CI, MergeableInsts); + } + return Modified; +} + +// Scan through looking for adjacent LDS operations with constant offsets from +// the same base register. We rely on the scheduler to do the hard work of +// clustering nearby loads, and assume these are all adjacent. +bool SILoadStoreOptimizer::optimizeBlock( + std::list > &MergeableInsts) { + bool Modified = false; + + for (std::list &MergeList : MergeableInsts) { + if (MergeList.size() < 2) + continue; + + bool OptimizeListAgain = false; + if (!optimizeInstsWithSameBaseAddr(MergeList, OptimizeListAgain)) { + // We weren't able to make any changes, so clear the list so we don't + // process the same instructions the next time we try to optimize this + // block. + MergeList.clear(); continue; } - const unsigned Opc = MI.getOpcode(); + // We made changes, but also determined that there were no more optimization + // opportunities, so we don't need to reprocess the list + if (!OptimizeListAgain) + MergeList.clear(); - CombineInfo CI; - CI.I = I; - CI.InstClass = getInstClass(Opc); + OptimizeAgain |= OptimizeListAgain; + Modified = true; + } + return Modified; +} + +void +SILoadStoreOptimizer::removeCombinedInst(std::list &MergeList, + const MachineInstr &MI) { + + for (auto CI = MergeList.begin(), E = MergeList.end(); CI != E; ++CI) { + if (&*CI->I == &MI) { + MergeList.erase(CI); + return; + } + } +} + +bool +SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr( + std::list &MergeList, + bool &OptimizeListAgain) { + bool Modified = false; + for (auto I = MergeList.begin(); I != MergeList.end(); ++I) { + CombineInfo &CI = *I; switch (CI.InstClass) { default: break; case DS_READ: - CI.EltSize = - (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8 - : 4; if (findMatchingInst(CI)) { Modified = true; - I = mergeRead2Pair(CI); - } else { - ++I; + removeCombinedInst(MergeList, *CI.Paired); + MachineBasicBlock::iterator NewMI = mergeRead2Pair(CI); + CI.setMI(NewMI, *TII, *STM); } - continue; + break; case DS_WRITE: - CI.EltSize = - (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 
8 - : 4; if (findMatchingInst(CI)) { Modified = true; - I = mergeWrite2Pair(CI); - } else { - ++I; + removeCombinedInst(MergeList, *CI.Paired); + MachineBasicBlock::iterator NewMI = mergeWrite2Pair(CI); + CI.setMI(NewMI, *TII, *STM); } - continue; + break; case S_BUFFER_LOAD_IMM: - CI.EltSize = AMDGPU::getSMRDEncodedOffset(*STM, 4); if (findMatchingInst(CI)) { Modified = true; - I = mergeSBufferLoadImmPair(CI); - OptimizeAgain |= (CI.Width0 + CI.Width1) < 16; - } else { - ++I; + removeCombinedInst(MergeList, *CI.Paired); + MachineBasicBlock::iterator NewMI = mergeSBufferLoadImmPair(CI); + CI.setMI(NewMI, *TII, *STM); + OptimizeListAgain |= (CI.Width0 + CI.Width1) < 16; } - continue; - case BUFFER_LOAD_OFFEN: - case BUFFER_LOAD_OFFSET: - case BUFFER_LOAD_OFFEN_exact: - case BUFFER_LOAD_OFFSET_exact: - CI.EltSize = 4; + break; + case BUFFER_LOAD: if (findMatchingInst(CI)) { Modified = true; - I = mergeBufferLoadPair(CI); - OptimizeAgain |= (CI.Width0 + CI.Width1) < 4; - } else { - ++I; + removeCombinedInst(MergeList, *CI.Paired); + MachineBasicBlock::iterator NewMI = mergeBufferLoadPair(CI); + CI.setMI(NewMI, *TII, *STM); + OptimizeListAgain |= (CI.Width0 + CI.Width1) < 4; } - continue; - case BUFFER_STORE_OFFEN: - case BUFFER_STORE_OFFSET: - case BUFFER_STORE_OFFEN_exact: - case BUFFER_STORE_OFFSET_exact: - CI.EltSize = 4; + break; + case BUFFER_STORE: if (findMatchingInst(CI)) { Modified = true; - I = mergeBufferStorePair(CI); - OptimizeAgain |= (CI.Width0 + CI.Width1) < 4; - } else { - ++I; + removeCombinedInst(MergeList, *CI.Paired); + MachineBasicBlock::iterator NewMI = mergeBufferStorePair(CI); + CI.setMI(NewMI, *TII, *STM); + OptimizeListAgain |= (CI.Width0 + CI.Width1) < 4; } - continue; + break; + case MIMG: + if (findMatchingInst(CI)) { + Modified = true; + removeCombinedInst(MergeList, *CI.Paired); + MachineBasicBlock::iterator NewMI = mergeImagePair(CI); + CI.setMI(NewMI, *TII, *STM); + OptimizeListAgain |= (CI.Width0 + CI.Width1) < 4; + } + break; } - - ++I; + // Clear the InstsToMove after we have finished searching so we don't have + // stale values left over if we search for this CI again in another pass + // over the block. + CI.InstsToMove.clear(); } return Modified; @@ -1537,10 +1846,14 @@ bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) { bool Modified = false; + for (MachineBasicBlock &MBB : MF) { + std::list > MergeableInsts; + // First pass: Collect list of all instructions we know how to merge. 
+ Modified |= collectMergeableInsts(MBB, MergeableInsts); do { OptimizeAgain = false; - Modified |= optimizeBlock(MBB); + Modified |= optimizeBlock(MergeableInsts); } while (OptimizeAgain); } diff --git a/lib/Target/AMDGPU/SILowerControlFlow.cpp b/lib/Target/AMDGPU/SILowerControlFlow.cpp index 78f409cd9555..6f9abd3a8d9b 100644 --- a/lib/Target/AMDGPU/SILowerControlFlow.cpp +++ b/lib/Target/AMDGPU/SILowerControlFlow.cpp @@ -98,6 +98,8 @@ private: void emitLoop(MachineInstr &MI); void emitEndCf(MachineInstr &MI); + Register getSaveExec(MachineInstr* MI); + void findMaskOperands(MachineInstr &MI, unsigned OpNo, SmallVectorImpl &Src) const; @@ -144,7 +146,7 @@ char &llvm::SILowerControlFlowID = SILowerControlFlow::ID; static bool isSimpleIf(const MachineInstr &MI, const MachineRegisterInfo *MRI, const SIInstrInfo *TII) { - unsigned SaveExecReg = MI.getOperand(0).getReg(); + Register SaveExecReg = MI.getOperand(0).getReg(); auto U = MRI->use_instr_nodbg_begin(SaveExecReg); if (U == MRI->use_instr_nodbg_end() || @@ -175,17 +177,31 @@ static bool isSimpleIf(const MachineInstr &MI, const MachineRegisterInfo *MRI, return true; } +Register SILowerControlFlow::getSaveExec(MachineInstr *MI) { + MachineBasicBlock *MBB = MI->getParent(); + MachineOperand &SaveExec = MI->getOperand(0); + assert(SaveExec.getSubReg() == AMDGPU::NoSubRegister); + + Register SaveExecReg = SaveExec.getReg(); + unsigned FalseTermOpc = + TII->isWave32() ? AMDGPU::S_MOV_B32_term : AMDGPU::S_MOV_B64_term; + MachineBasicBlock::iterator I = (MI); + MachineBasicBlock::iterator J = std::next(I); + if (J != MBB->end() && J->getOpcode() == FalseTermOpc && + J->getOperand(1).isReg() && J->getOperand(1).getReg() == SaveExecReg) { + SaveExecReg = J->getOperand(0).getReg(); + J->eraseFromParent(); + } + return SaveExecReg; +} + void SILowerControlFlow::emitIf(MachineInstr &MI) { MachineBasicBlock &MBB = *MI.getParent(); const DebugLoc &DL = MI.getDebugLoc(); MachineBasicBlock::iterator I(&MI); - - MachineOperand &SaveExec = MI.getOperand(0); - MachineOperand &Cond = MI.getOperand(1); - assert(SaveExec.getSubReg() == AMDGPU::NoSubRegister && - Cond.getSubReg() == AMDGPU::NoSubRegister); - - Register SaveExecReg = SaveExec.getReg(); + Register SaveExecReg = getSaveExec(&MI); + MachineOperand& Cond = MI.getOperand(1); + assert(Cond.getSubReg() == AMDGPU::NoSubRegister); MachineOperand &ImpDefSCC = MI.getOperand(4); assert(ImpDefSCC.getReg() == AMDGPU::SCC && ImpDefSCC.isDef()); @@ -204,7 +220,7 @@ void SILowerControlFlow::emitIf(MachineInstr &MI) { .addReg(Exec) .addReg(Exec, RegState::ImplicitDefine); - unsigned Tmp = MRI->createVirtualRegister(BoolRC); + Register Tmp = MRI->createVirtualRegister(BoolRC); MachineInstr *And = BuildMI(MBB, I, DL, TII->get(AndOpc), Tmp) @@ -266,8 +282,7 @@ void SILowerControlFlow::emitElse(MachineInstr &MI) { MachineBasicBlock &MBB = *MI.getParent(); const DebugLoc &DL = MI.getDebugLoc(); - Register DstReg = MI.getOperand(0).getReg(); - assert(MI.getOperand(0).getSubReg() == AMDGPU::NoSubRegister); + Register DstReg = getSaveExec(&MI); bool ExecModified = MI.getOperand(3).getImm() != 0; MachineBasicBlock::iterator Start = MBB.begin(); @@ -339,7 +354,7 @@ void SILowerControlFlow::emitElse(MachineInstr &MI) { void SILowerControlFlow::emitIfBreak(MachineInstr &MI) { MachineBasicBlock &MBB = *MI.getParent(); const DebugLoc &DL = MI.getDebugLoc(); - auto Dst = MI.getOperand(0).getReg(); + auto Dst = getSaveExec(&MI); // Skip ANDing with exec if the break condition is already masked by exec // because it is a V_CMP 
in the same basic block. (We know the break @@ -400,13 +415,17 @@ void SILowerControlFlow::emitLoop(MachineInstr &MI) { void SILowerControlFlow::emitEndCf(MachineInstr &MI) { MachineBasicBlock &MBB = *MI.getParent(); + MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + unsigned CFMask = MI.getOperand(0).getReg(); + MachineInstr *Def = MRI.getUniqueVRegDef(CFMask); const DebugLoc &DL = MI.getDebugLoc(); - MachineBasicBlock::iterator InsPt = MBB.begin(); - MachineInstr *NewMI = - BuildMI(MBB, InsPt, DL, TII->get(OrOpc), Exec) - .addReg(Exec) - .add(MI.getOperand(0)); + MachineBasicBlock::iterator InsPt = + Def && Def->getParent() == &MBB ? std::next(MachineBasicBlock::iterator(Def)) + : MBB.begin(); + MachineInstr *NewMI = BuildMI(MBB, InsPt, DL, TII->get(OrOpc), Exec) + .addReg(Exec) + .add(MI.getOperand(0)); if (LIS) LIS->ReplaceMachineInstrInMaps(MI, *NewMI); @@ -422,7 +441,7 @@ void SILowerControlFlow::emitEndCf(MachineInstr &MI) { void SILowerControlFlow::findMaskOperands(MachineInstr &MI, unsigned OpNo, SmallVectorImpl &Src) const { MachineOperand &Op = MI.getOperand(OpNo); - if (!Op.isReg() || !TargetRegisterInfo::isVirtualRegister(Op.getReg())) { + if (!Op.isReg() || !Register::isVirtualRegister(Op.getReg())) { Src.push_back(Op); return; } @@ -442,8 +461,7 @@ void SILowerControlFlow::findMaskOperands(MachineInstr &MI, unsigned OpNo, for (const auto &SrcOp : Def->explicit_operands()) if (SrcOp.isReg() && SrcOp.isUse() && - (TargetRegisterInfo::isVirtualRegister(SrcOp.getReg()) || - SrcOp.getReg() == Exec)) + (Register::isVirtualRegister(SrcOp.getReg()) || SrcOp.getReg() == Exec)) Src.push_back(SrcOp); } @@ -466,7 +484,7 @@ void SILowerControlFlow::combineMasks(MachineInstr &MI) { else if (Ops[1].isIdenticalTo(Ops[2])) UniqueOpndIdx = 1; else return; - unsigned Reg = MI.getOperand(OpToReplace).getReg(); + Register Reg = MI.getOperand(OpToReplace).getReg(); MI.RemoveOperand(OpToReplace); MI.addOperand(Ops[UniqueOpndIdx]); if (MRI->use_empty(Reg)) diff --git a/lib/Target/AMDGPU/SILowerI1Copies.cpp b/lib/Target/AMDGPU/SILowerI1Copies.cpp index 1c0f836f07e6..b45412536356 100644 --- a/lib/Target/AMDGPU/SILowerI1Copies.cpp +++ b/lib/Target/AMDGPU/SILowerI1Copies.cpp @@ -96,7 +96,7 @@ private: getSaluInsertionAtEnd(MachineBasicBlock &MBB) const; bool isVreg1(unsigned Reg) const { - return TargetRegisterInfo::isVirtualRegister(Reg) && + return Register::isVirtualRegister(Reg) && MRI->getRegClass(Reg) == &AMDGPU::VReg_1RegClass; } @@ -489,6 +489,15 @@ bool SILowerI1Copies::runOnMachineFunction(MachineFunction &TheMF) { return true; } +#ifndef NDEBUG +static bool isVRegCompatibleReg(const SIRegisterInfo &TRI, + const MachineRegisterInfo &MRI, + Register Reg) { + unsigned Size = TRI.getRegSizeInBits(Reg, MRI); + return Size == 1 || Size == 32; +} +#endif + void SILowerI1Copies::lowerCopiesFromI1() { SmallVector DeadCopies; @@ -497,8 +506,8 @@ void SILowerI1Copies::lowerCopiesFromI1() { if (MI.getOpcode() != AMDGPU::COPY) continue; - unsigned DstReg = MI.getOperand(0).getReg(); - unsigned SrcReg = MI.getOperand(1).getReg(); + Register DstReg = MI.getOperand(0).getReg(); + Register SrcReg = MI.getOperand(1).getReg(); if (!isVreg1(SrcReg)) continue; @@ -509,7 +518,7 @@ void SILowerI1Copies::lowerCopiesFromI1() { LLVM_DEBUG(dbgs() << "Lower copy from i1: " << MI); DebugLoc DL = MI.getDebugLoc(); - assert(TII->getRegisterInfo().getRegSizeInBits(DstReg, *MRI) == 32); + assert(isVRegCompatibleReg(TII->getRegisterInfo(), *MRI, DstReg)); assert(!MI.getOperand(0).getSubReg()); 
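// [Editorial sketch: not part of the vendored patch] The emitEndCf() hunk above now inserts
// the exec-restoring s_or immediately after the unique def of the control-flow mask when that
// def lives in the same block, and only falls back to MBB.begin() otherwise. Condensed to the
// calls that appear in the hunk:
  MachineInstr *Def = MRI.getUniqueVRegDef(CFMask);
  MachineBasicBlock::iterator InsPt =
      (Def && Def->getParent() == &MBB) ? std::next(MachineBasicBlock::iterator(Def))
                                        : MBB.begin();
  BuildMI(MBB, InsPt, DL, TII->get(OrOpc), Exec)
      .addReg(Exec)
      .add(MI.getOperand(0)); // the saved mask operand being ORed back into exec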
ConstrainRegs.insert(SrcReg); @@ -544,7 +553,7 @@ void SILowerI1Copies::lowerPhis() { LF.initialize(MBB); for (MachineInstr &MI : MBB.phis()) { - unsigned DstReg = MI.getOperand(0).getReg(); + Register DstReg = MI.getOperand(0).getReg(); if (!isVreg1(DstReg)) continue; @@ -556,7 +565,7 @@ void SILowerI1Copies::lowerPhis() { // Collect incoming values. for (unsigned i = 1; i < MI.getNumOperands(); i += 2) { assert(i + 1 < MI.getNumOperands()); - unsigned IncomingReg = MI.getOperand(i).getReg(); + Register IncomingReg = MI.getOperand(i).getReg(); MachineBasicBlock *IncomingMBB = MI.getOperand(i + 1).getMBB(); MachineInstr *IncomingDef = MRI->getUniqueVRegDef(IncomingReg); @@ -580,12 +589,12 @@ void SILowerI1Copies::lowerPhis() { // Phis in a loop that are observed outside the loop receive a simple but // conservatively correct treatment. - MachineBasicBlock *PostDomBound = &MBB; - for (MachineInstr &Use : MRI->use_instructions(DstReg)) { - PostDomBound = - PDT->findNearestCommonDominator(PostDomBound, Use.getParent()); - } + std::vector DomBlocks = {&MBB}; + for (MachineInstr &Use : MRI->use_instructions(DstReg)) + DomBlocks.push_back(Use.getParent()); + MachineBasicBlock *PostDomBound = + PDT->findNearestCommonDominator(DomBlocks); unsigned FoundLoopLevel = LF.findLoop(PostDomBound); SSAUpdater.Initialize(DstReg); @@ -669,7 +678,7 @@ void SILowerI1Copies::lowerCopiesToI1() { MI.getOpcode() != AMDGPU::COPY) continue; - unsigned DstReg = MI.getOperand(0).getReg(); + Register DstReg = MI.getOperand(0).getReg(); if (!isVreg1(DstReg)) continue; @@ -686,10 +695,10 @@ void SILowerI1Copies::lowerCopiesToI1() { continue; DebugLoc DL = MI.getDebugLoc(); - unsigned SrcReg = MI.getOperand(1).getReg(); + Register SrcReg = MI.getOperand(1).getReg(); assert(!MI.getOperand(1).getSubReg()); - if (!TargetRegisterInfo::isVirtualRegister(SrcReg) || + if (!Register::isVirtualRegister(SrcReg) || (!isLaneMaskReg(SrcReg) && !isVreg1(SrcReg))) { assert(TII->getRegisterInfo().getRegSizeInBits(SrcReg, *MRI) == 32); unsigned TmpReg = createLaneMaskReg(*MF); @@ -702,12 +711,12 @@ void SILowerI1Copies::lowerCopiesToI1() { // Defs in a loop that are observed outside the loop must be transformed // into appropriate bit manipulation. 
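// [Editorial sketch: not part of the vendored patch] lowerPhis() above and lowerCopiesToI1()
// just below both drop the old loop that narrowed a single PostDomBound one use at a time, and
// instead collect every block containing a use and ask the post-dominator tree for a common
// bound in a single call. The shape of that pattern, using only names from these hunks:
  std::vector<MachineBasicBlock *> DomBlocks = {&MBB};
  for (MachineInstr &Use : MRI->use_instructions(DstReg))
    DomBlocks.push_back(Use.getParent());
  MachineBasicBlock *PostDomBound =
      PDT->findNearestCommonDominator(DomBlocks); // one query over all use blocks
  unsigned FoundLoopLevel = LF.findLoop(PostDomBound);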
- MachineBasicBlock *PostDomBound = &MBB; - for (MachineInstr &Use : MRI->use_instructions(DstReg)) { - PostDomBound = - PDT->findNearestCommonDominator(PostDomBound, Use.getParent()); - } + std::vector DomBlocks = {&MBB}; + for (MachineInstr &Use : MRI->use_instructions(DstReg)) + DomBlocks.push_back(Use.getParent()); + MachineBasicBlock *PostDomBound = + PDT->findNearestCommonDominator(DomBlocks); unsigned FoundLoopLevel = LF.findLoop(PostDomBound); if (FoundLoopLevel) { SSAUpdater.Initialize(DstReg); @@ -734,7 +743,7 @@ bool SILowerI1Copies::isConstantLaneMask(unsigned Reg, bool &Val) const { break; Reg = MI->getOperand(1).getReg(); - if (!TargetRegisterInfo::isVirtualRegister(Reg)) + if (!Register::isVirtualRegister(Reg)) return false; if (!isLaneMaskReg(Reg)) return false; diff --git a/lib/Target/AMDGPU/SILowerSGPRSpills.cpp b/lib/Target/AMDGPU/SILowerSGPRSpills.cpp index a82047473370..714d403a3e8f 100644 --- a/lib/Target/AMDGPU/SILowerSGPRSpills.cpp +++ b/lib/Target/AMDGPU/SILowerSGPRSpills.cpp @@ -278,8 +278,8 @@ bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) { unsigned FIOp = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vaddr); int FI = MI.getOperand(FIOp).getIndex(); - unsigned VReg = TII->getNamedOperand(MI, AMDGPU::OpName::vdata) - ->getReg(); + Register VReg = + TII->getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg(); if (FuncInfo->allocateVGPRSpillToAGPR(MF, FI, TRI->isAGPR(MRI, VReg))) { TRI->eliminateFrameIndex(MI, 0, FIOp, nullptr); diff --git a/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp index 46da974a2f45..7dd0f11c95de 100644 --- a/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ b/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -53,8 +53,7 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) FlatWorkGroupSizes = ST.getFlatWorkGroupSizes(F); WavesPerEU = ST.getWavesPerEU(F); - Occupancy = getMaxWavesPerEU(); - limitOccupancy(MF); + Occupancy = ST.computeOccupancy(MF, getLDSSize()); CallingConv::ID CC = F.getCallingConv(); if (CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL) { @@ -190,7 +189,7 @@ unsigned SIMachineFunctionInfo::addPrivateSegmentBuffer( const SIRegisterInfo &TRI) { ArgInfo.PrivateSegmentBuffer = ArgDescriptor::createRegister(TRI.getMatchingSuperReg( - getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_128RegClass)); + getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SGPR_128RegClass)); NumUserSGPRs += 4; return ArgInfo.PrivateSegmentBuffer.getRegister(); } @@ -487,6 +486,7 @@ yaml::SIMachineFunctionInfo::SIMachineFunctionInfo( NoSignedZerosFPMath(MFI.hasNoSignedZerosFPMath()), MemoryBound(MFI.isMemoryBound()), WaveLimiter(MFI.needsWaveLimiter()), + HighBitsOf32BitAddress(MFI.get32BitAddressHighBits()), ScratchRSrcReg(regToString(MFI.getScratchRSrcReg(), TRI)), ScratchWaveOffsetReg(regToString(MFI.getScratchWaveOffsetReg(), TRI)), FrameOffsetReg(regToString(MFI.getFrameOffsetReg(), TRI)), @@ -501,8 +501,9 @@ void yaml::SIMachineFunctionInfo::mappingImpl(yaml::IO &YamlIO) { bool SIMachineFunctionInfo::initializeBaseYamlFields( const yaml::SIMachineFunctionInfo &YamlMFI) { ExplicitKernArgSize = YamlMFI.ExplicitKernArgSize; - MaxKernArgAlign = YamlMFI.MaxKernArgAlign; + MaxKernArgAlign = assumeAligned(YamlMFI.MaxKernArgAlign); LDSSize = YamlMFI.LDSSize; + HighBitsOf32BitAddress = YamlMFI.HighBitsOf32BitAddress; IsEntryFunction = YamlMFI.IsEntryFunction; NoSignedZerosFPMath = YamlMFI.NoSignedZerosFPMath; MemoryBound = YamlMFI.MemoryBound; diff --git 
a/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/lib/Target/AMDGPU/SIMachineFunctionInfo.h index f19b20ceb5da..7d70c786b594 100644 --- a/lib/Target/AMDGPU/SIMachineFunctionInfo.h +++ b/lib/Target/AMDGPU/SIMachineFunctionInfo.h @@ -265,6 +265,7 @@ struct SIMachineFunctionInfo final : public yaml::MachineFunctionInfo { bool NoSignedZerosFPMath = false; bool MemoryBound = false; bool WaveLimiter = false; + uint32_t HighBitsOf32BitAddress = 0; StringValue ScratchRSrcReg = "$private_rsrc_reg"; StringValue ScratchWaveOffsetReg = "$scratch_wave_offset_reg"; @@ -302,6 +303,8 @@ template <> struct MappingTraits { StringValue("$sp_reg")); YamlIO.mapOptional("argumentInfo", MFI.ArgInfo); YamlIO.mapOptional("mode", MFI.Mode, SIMode()); + YamlIO.mapOptional("highBitsOf32BitAddress", + MFI.HighBitsOf32BitAddress, 0u); } }; @@ -670,7 +673,7 @@ public: return GITPtrHigh; } - unsigned get32BitAddressHighBits() const { + uint32_t get32BitAddressHighBits() const { return HighBitsOf32BitAddress; } @@ -873,7 +876,7 @@ public: assert(BufferRsrc); auto PSV = BufferPSVs.try_emplace( BufferRsrc, - llvm::make_unique(TII)); + std::make_unique(TII)); return PSV.first->second.get(); } @@ -882,14 +885,14 @@ public: assert(ImgRsrc); auto PSV = ImagePSVs.try_emplace( ImgRsrc, - llvm::make_unique(TII)); + std::make_unique(TII)); return PSV.first->second.get(); } const AMDGPUGWSResourcePseudoSourceValue *getGWSPSV(const SIInstrInfo &TII) { if (!GWSResourcePSV) { GWSResourcePSV = - llvm::make_unique(TII); + std::make_unique(TII); } return GWSResourcePSV.get(); diff --git a/lib/Target/AMDGPU/SIMachineScheduler.cpp b/lib/Target/AMDGPU/SIMachineScheduler.cpp index ebbdf80f9567..c072ba6b2d1c 100644 --- a/lib/Target/AMDGPU/SIMachineScheduler.cpp +++ b/lib/Target/AMDGPU/SIMachineScheduler.cpp @@ -348,7 +348,7 @@ void SIScheduleBlock::initRegPressure(MachineBasicBlock::iterator BeginBlock, // Do not Track Physical Registers, because it messes up. for (const auto &RegMaskPair : RPTracker.getPressure().LiveInRegs) { - if (TargetRegisterInfo::isVirtualRegister(RegMaskPair.RegUnit)) + if (Register::isVirtualRegister(RegMaskPair.RegUnit)) LiveInRegs.insert(RegMaskPair.RegUnit); } LiveOutRegs.clear(); @@ -376,7 +376,7 @@ void SIScheduleBlock::initRegPressure(MachineBasicBlock::iterator BeginBlock, // The use of findDefBetween removes the case 4. for (const auto &RegMaskPair : RPTracker.getPressure().LiveOutRegs) { unsigned Reg = RegMaskPair.RegUnit; - if (TargetRegisterInfo::isVirtualRegister(Reg) && + if (Register::isVirtualRegister(Reg) && isDefBetween(Reg, LIS->getInstructionIndex(*BeginBlock).getRegSlot(), LIS->getInstructionIndex(*EndBlock).getRegSlot(), MRI, LIS)) { @@ -1228,7 +1228,7 @@ void SIScheduleBlockCreator::createBlocksForVariant(SISchedulerBlockCreatorVaria unsigned Color = CurrentColoring[SU->NodeNum]; if (RealID.find(Color) == RealID.end()) { int ID = CurrentBlocks.size(); - BlockPtrs.push_back(llvm::make_unique(DAG, this, ID)); + BlockPtrs.push_back(std::make_unique(DAG, this, ID)); CurrentBlocks.push_back(BlockPtrs.rbegin()->get()); RealID[Color] = ID; } @@ -1690,7 +1690,7 @@ SIScheduleBlock *SIScheduleBlockScheduler::pickBlock() { void SIScheduleBlockScheduler::addLiveRegs(std::set &Regs) { for (unsigned Reg : Regs) { // For now only track virtual registers. - if (!TargetRegisterInfo::isVirtualRegister(Reg)) + if (!Register::isVirtualRegister(Reg)) continue; // If not already in the live set, then add it. 
(void) LiveRegs.insert(Reg); @@ -1750,7 +1750,7 @@ SIScheduleBlockScheduler::checkRegUsageImpact(std::set &InRegs, for (unsigned Reg : InRegs) { // For now only track virtual registers. - if (!TargetRegisterInfo::isVirtualRegister(Reg)) + if (!Register::isVirtualRegister(Reg)) continue; if (LiveRegsConsumers[Reg] > 1) continue; @@ -1762,7 +1762,7 @@ SIScheduleBlockScheduler::checkRegUsageImpact(std::set &InRegs, for (unsigned Reg : OutRegs) { // For now only track virtual registers. - if (!TargetRegisterInfo::isVirtualRegister(Reg)) + if (!Register::isVirtualRegister(Reg)) continue; PSetIterator PSetI = DAG->getMRI()->getPressureSets(Reg); for (; PSetI.isValid(); ++PSetI) { @@ -1801,7 +1801,7 @@ SIScheduler::scheduleVariant(SISchedulerBlockCreatorVariant BlockVariant, // SIScheduleDAGMI // SIScheduleDAGMI::SIScheduleDAGMI(MachineSchedContext *C) : - ScheduleDAGMILive(C, llvm::make_unique(C)) { + ScheduleDAGMILive(C, std::make_unique(C)) { SITII = static_cast(TII); SITRI = static_cast(TRI); @@ -1913,7 +1913,7 @@ SIScheduleDAGMI::fillVgprSgprCost(_Iterator First, _Iterator End, for (_Iterator RegI = First; RegI != End; ++RegI) { unsigned Reg = *RegI; // For now only track virtual registers - if (!TargetRegisterInfo::isVirtualRegister(Reg)) + if (!Register::isVirtualRegister(Reg)) continue; PSetIterator PSetI = MRI.getPressureSets(Reg); for (; PSetI.isValid(); ++PSetI) { diff --git a/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/lib/Target/AMDGPU/SIMemoryLegalizer.cpp index 4320e6c957a0..e914573306ae 100644 --- a/lib/Target/AMDGPU/SIMemoryLegalizer.cpp +++ b/lib/Target/AMDGPU/SIMemoryLegalizer.cpp @@ -656,10 +656,10 @@ SICacheControl::SICacheControl(const GCNSubtarget &ST) { std::unique_ptr SICacheControl::create(const GCNSubtarget &ST) { GCNSubtarget::Generation Generation = ST.getGeneration(); if (Generation <= AMDGPUSubtarget::SOUTHERN_ISLANDS) - return make_unique(ST); + return std::make_unique(ST); if (Generation < AMDGPUSubtarget::GFX10) - return make_unique(ST); - return make_unique(ST, ST.isCuModeEnabled()); + return std::make_unique(ST); + return std::make_unique(ST, ST.isCuModeEnabled()); } bool SIGfx6CacheControl::enableLoadCacheBypass( diff --git a/lib/Target/AMDGPU/SIModeRegister.cpp b/lib/Target/AMDGPU/SIModeRegister.cpp index a5edd7b3554a..52989a280e80 100644 --- a/lib/Target/AMDGPU/SIModeRegister.cpp +++ b/lib/Target/AMDGPU/SIModeRegister.cpp @@ -226,7 +226,7 @@ void SIModeRegister::insertSetreg(MachineBasicBlock &MBB, MachineInstr *MI, // - on exit we have set the Require, Change, and initial Exit modes. 
void SIModeRegister::processBlockPhase1(MachineBasicBlock &MBB, const SIInstrInfo *TII) { - auto NewInfo = llvm::make_unique(); + auto NewInfo = std::make_unique(); MachineInstr *InsertionPoint = nullptr; // RequirePending is used to indicate whether we are collecting the initial // requirements for the block, and need to defer the first InsertionPoint to diff --git a/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp b/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp index 3227bff20513..cc9b46a75582 100644 --- a/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp +++ b/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp @@ -322,7 +322,7 @@ bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) { continue; } - unsigned CopyFromExec = CopyFromExecInst->getOperand(0).getReg(); + Register CopyFromExec = CopyFromExecInst->getOperand(0).getReg(); MachineInstr *SaveExecInst = nullptr; SmallVector OtherUseInsts; diff --git a/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp b/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp index 7e10316eab92..fdd30db6a7cb 100644 --- a/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp +++ b/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp @@ -211,7 +211,7 @@ static unsigned optimizeVcndVcmpPair(MachineBasicBlock &MBB, return AMDGPU::NoRegister; MachineOperand *AndCC = &And->getOperand(1); - unsigned CmpReg = AndCC->getReg(); + Register CmpReg = AndCC->getReg(); unsigned CmpSubReg = AndCC->getSubReg(); if (CmpReg == ExecReg) { AndCC = &And->getOperand(2); @@ -234,7 +234,7 @@ static unsigned optimizeVcndVcmpPair(MachineBasicBlock &MBB, if (!Op1->isReg() || !Op2->isImm() || Op2->getImm() != 1) return AMDGPU::NoRegister; - unsigned SelReg = Op1->getReg(); + Register SelReg = Op1->getReg(); auto *Sel = TRI->findReachingDef(SelReg, Op1->getSubReg(), *Cmp, MRI, LIS); if (!Sel || Sel->getOpcode() != AMDGPU::V_CNDMASK_B32_e64) return AMDGPU::NoRegister; @@ -250,15 +250,16 @@ static unsigned optimizeVcndVcmpPair(MachineBasicBlock &MBB, Op1->getImm() != 0 || Op2->getImm() != 1) return AMDGPU::NoRegister; - LLVM_DEBUG(dbgs() << "Folding sequence:\n\t" << *Sel << '\t' - << *Cmp << '\t' << *And); + LLVM_DEBUG(dbgs() << "Folding sequence:\n\t" << *Sel << '\t' << *Cmp << '\t' + << *And); - unsigned CCReg = CC->getReg(); + Register CCReg = CC->getReg(); LIS->RemoveMachineInstrFromMaps(*And); - MachineInstr *Andn2 = BuildMI(MBB, *And, And->getDebugLoc(), - TII->get(Andn2Opc), And->getOperand(0).getReg()) - .addReg(ExecReg) - .addReg(CCReg, 0, CC->getSubReg()); + MachineInstr *Andn2 = + BuildMI(MBB, *And, And->getDebugLoc(), TII->get(Andn2Opc), + And->getOperand(0).getReg()) + .addReg(ExecReg) + .addReg(CCReg, getUndefRegState(CC->isUndef()), CC->getSubReg()); And->eraseFromParent(); LIS->InsertMachineInstrInMaps(*Andn2); @@ -266,20 +267,19 @@ static unsigned optimizeVcndVcmpPair(MachineBasicBlock &MBB, // Try to remove compare. Cmp value should not used in between of cmp // and s_and_b64 if VCC or just unused if any other register. - if ((TargetRegisterInfo::isVirtualRegister(CmpReg) && - MRI.use_nodbg_empty(CmpReg)) || + if ((Register::isVirtualRegister(CmpReg) && MRI.use_nodbg_empty(CmpReg)) || (CmpReg == CondReg && std::none_of(std::next(Cmp->getIterator()), Andn2->getIterator(), [&](const MachineInstr &MI) { - return MI.readsRegister(CondReg, TRI); }))) { + return MI.readsRegister(CondReg, TRI); + }))) { LLVM_DEBUG(dbgs() << "Erasing: " << *Cmp << '\n'); LIS->RemoveMachineInstrFromMaps(*Cmp); Cmp->eraseFromParent(); // Try to remove v_cndmask_b32. 
- if (TargetRegisterInfo::isVirtualRegister(SelReg) && - MRI.use_nodbg_empty(SelReg)) { + if (Register::isVirtualRegister(SelReg) && MRI.use_nodbg_empty(SelReg)) { LLVM_DEBUG(dbgs() << "Erasing: " << *Sel << '\n'); LIS->RemoveMachineInstrFromMaps(*Sel); @@ -413,7 +413,7 @@ bool SIOptimizeExecMaskingPreRA::runOnMachineFunction(MachineFunction &MF) { if (!SaveExec || !SaveExec->isFullCopy()) continue; - unsigned SavedExec = SaveExec->getOperand(0).getReg(); + Register SavedExec = SaveExec->getOperand(0).getReg(); bool SafeToReplace = true; for (auto& U : MRI.use_nodbg_instructions(SavedExec)) { if (U.getParent() != SaveExec->getParent()) { @@ -434,7 +434,7 @@ bool SIOptimizeExecMaskingPreRA::runOnMachineFunction(MachineFunction &MF) { if (Changed) { for (auto Reg : RecalcRegs) { - if (TargetRegisterInfo::isVirtualRegister(Reg)) { + if (Register::isVirtualRegister(Reg)) { LIS->removeInterval(Reg); if (!MRI.reg_empty(Reg)) LIS->createAndComputeVirtRegInterval(Reg); diff --git a/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/lib/Target/AMDGPU/SIPeepholeSDWA.cpp index 2d71abc0612a..9b3b2436475c 100644 --- a/lib/Target/AMDGPU/SIPeepholeSDWA.cpp +++ b/lib/Target/AMDGPU/SIPeepholeSDWA.cpp @@ -574,16 +574,16 @@ SIPeepholeSDWA::matchSDWAOperand(MachineInstr &MI) { MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1); MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst); - if (TRI->isPhysicalRegister(Src1->getReg()) || - TRI->isPhysicalRegister(Dst->getReg())) + if (Register::isPhysicalRegister(Src1->getReg()) || + Register::isPhysicalRegister(Dst->getReg())) break; if (Opcode == AMDGPU::V_LSHLREV_B32_e32 || Opcode == AMDGPU::V_LSHLREV_B32_e64) { - return make_unique( + return std::make_unique( Dst, Src1, *Imm == 16 ? WORD_1 : BYTE_3, UNUSED_PAD); } else { - return make_unique( + return std::make_unique( Src1, Dst, *Imm == 16 ? 
WORD_1 : BYTE_3, false, false, Opcode != AMDGPU::V_LSHRREV_B32_e32 && Opcode != AMDGPU::V_LSHRREV_B32_e64); @@ -613,15 +613,15 @@ SIPeepholeSDWA::matchSDWAOperand(MachineInstr &MI) { MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1); MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst); - if (TRI->isPhysicalRegister(Src1->getReg()) || - TRI->isPhysicalRegister(Dst->getReg())) + if (Register::isPhysicalRegister(Src1->getReg()) || + Register::isPhysicalRegister(Dst->getReg())) break; if (Opcode == AMDGPU::V_LSHLREV_B16_e32 || Opcode == AMDGPU::V_LSHLREV_B16_e64) { - return make_unique(Dst, Src1, BYTE_1, UNUSED_PAD); + return std::make_unique(Dst, Src1, BYTE_1, UNUSED_PAD); } else { - return make_unique( + return std::make_unique( Src1, Dst, BYTE_1, false, false, Opcode != AMDGPU::V_LSHRREV_B16_e32 && Opcode != AMDGPU::V_LSHRREV_B16_e64); @@ -677,11 +677,11 @@ SIPeepholeSDWA::matchSDWAOperand(MachineInstr &MI) { MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0); MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst); - if (TRI->isPhysicalRegister(Src0->getReg()) || - TRI->isPhysicalRegister(Dst->getReg())) + if (Register::isPhysicalRegister(Src0->getReg()) || + Register::isPhysicalRegister(Dst->getReg())) break; - return make_unique( + return std::make_unique( Src0, Dst, SrcSel, false, false, Opcode != AMDGPU::V_BFE_U32); } @@ -706,11 +706,11 @@ SIPeepholeSDWA::matchSDWAOperand(MachineInstr &MI) { MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst); - if (TRI->isPhysicalRegister(ValSrc->getReg()) || - TRI->isPhysicalRegister(Dst->getReg())) + if (Register::isPhysicalRegister(ValSrc->getReg()) || + Register::isPhysicalRegister(Dst->getReg())) break; - return make_unique( + return std::make_unique( ValSrc, Dst, *Imm == 0x0000ffff ? 
WORD_0 : BYTE_0); } @@ -840,7 +840,7 @@ SIPeepholeSDWA::matchSDWAOperand(MachineInstr &MI) { MachineOperand *OrDst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst); assert(OrDst && OrDst->isReg()); - return make_unique( + return std::make_unique( OrDst, OrSDWADef, OrOtherDef, DstSel); } @@ -1189,7 +1189,7 @@ void SIPeepholeSDWA::legalizeScalarOperands(MachineInstr &MI, continue; } - unsigned VGPR = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); + Register VGPR = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); auto Copy = BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(), TII->get(AMDGPU::V_MOV_B32_e32), VGPR); if (Op.isImm()) diff --git a/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp b/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp index f9bfe96f65cb..6cdd12d0e7bd 100644 --- a/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp +++ b/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp @@ -90,12 +90,12 @@ bool SIPreAllocateWWMRegs::processDef(MachineOperand &MO) { if (!MO.isReg()) return false; - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); if (!TRI->isVGPR(*MRI, Reg)) return false; - if (TRI->isPhysicalRegister(Reg)) + if (Register::isPhysicalRegister(Reg)) return false; if (VRM->hasPhys(Reg)) @@ -124,14 +124,14 @@ void SIPreAllocateWWMRegs::rewriteRegs(MachineFunction &MF) { if (!MO.isReg()) continue; - const unsigned VirtReg = MO.getReg(); - if (TRI->isPhysicalRegister(VirtReg)) + const Register VirtReg = MO.getReg(); + if (Register::isPhysicalRegister(VirtReg)) continue; if (!VRM->hasPhys(VirtReg)) continue; - unsigned PhysReg = VRM->getPhys(VirtReg); + Register PhysReg = VRM->getPhys(VirtReg); const unsigned SubReg = MO.getSubReg(); if (SubReg != 0) { PhysReg = TRI->getSubReg(PhysReg, SubReg); @@ -149,7 +149,7 @@ void SIPreAllocateWWMRegs::rewriteRegs(MachineFunction &MF) { for (unsigned Reg : RegsToRewrite) { LIS->removeInterval(Reg); - const unsigned PhysReg = VRM->getPhys(Reg); + const Register PhysReg = VRM->getPhys(Reg); assert(PhysReg != 0); MFI->ReserveWWMRegister(PhysReg); } diff --git a/lib/Target/AMDGPU/SIProgramInfo.h b/lib/Target/AMDGPU/SIProgramInfo.h index 168f05f8fdd6..7c039a54b57f 100644 --- a/lib/Target/AMDGPU/SIProgramInfo.h +++ b/lib/Target/AMDGPU/SIProgramInfo.h @@ -41,6 +41,8 @@ struct SIProgramInfo { uint64_t ComputePGMRSrc2 = 0; uint32_t NumVGPR = 0; + uint32_t NumArchVGPR = 0; + uint32_t NumAccVGPR = 0; uint32_t NumSGPR = 0; uint32_t LDSSize = 0; bool FlatUsed = false; @@ -51,6 +53,9 @@ struct SIProgramInfo { // Number of VGPRs that meets number of waves per execution unit request. uint32_t NumVGPRsForWavesPerEU = 0; + // Final occupancy. + uint32_t Occupancy = 0; + // Whether there is recursion, dynamic allocas, indirect calls or some other // reason there may be statically unknown stack usage. 
bool DynamicCallStack = false; diff --git a/lib/Target/AMDGPU/SIRegisterInfo.cpp b/lib/Target/AMDGPU/SIRegisterInfo.cpp index f152deb28004..f58bc3060c42 100644 --- a/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -48,11 +48,6 @@ void SIRegisterInfo::classifyPressureSet(unsigned PSetID, unsigned Reg, } } -static cl::opt EnableSpillSGPRToSMEM( - "amdgpu-spill-sgpr-to-smem", - cl::desc("Use scalar stores to spill SGPRs if supported by subtarget"), - cl::init(false)); - static cl::opt EnableSpillSGPRToVGPR( "amdgpu-spill-sgpr-to-vgpr", cl::desc("Enable spilling VGPRs to SGPRs"), @@ -61,17 +56,12 @@ static cl::opt EnableSpillSGPRToVGPR( SIRegisterInfo::SIRegisterInfo(const GCNSubtarget &ST) : AMDGPURegisterInfo(), + ST(ST), SGPRPressureSets(getNumRegPressureSets()), VGPRPressureSets(getNumRegPressureSets()), AGPRPressureSets(getNumRegPressureSets()), - SpillSGPRToVGPR(false), - SpillSGPRToSMEM(false), + SpillSGPRToVGPR(EnableSpillSGPRToVGPR), isWave32(ST.isWave32()) { - if (EnableSpillSGPRToSMEM && ST.hasScalarStores()) - SpillSGPRToSMEM = true; - else if (EnableSpillSGPRToVGPR) - SpillSGPRToVGPR = true; - unsigned NumRegPressureSets = getNumRegPressureSets(); SGPRSetID = NumRegPressureSets; @@ -118,11 +108,9 @@ SIRegisterInfo::SIRegisterInfo(const GCNSubtarget &ST) : unsigned SIRegisterInfo::reservedPrivateSegmentBufferReg( const MachineFunction &MF) const { - - const GCNSubtarget &ST = MF.getSubtarget(); unsigned BaseIdx = alignDown(ST.getMaxNumSGPRs(MF), 4) - 4; unsigned BaseReg(AMDGPU::SGPR_32RegClass.getRegister(BaseIdx)); - return getMatchingSuperReg(BaseReg, AMDGPU::sub0, &AMDGPU::SReg_128RegClass); + return getMatchingSuperReg(BaseReg, AMDGPU::sub0, &AMDGPU::SGPR_128RegClass); } static unsigned findPrivateSegmentWaveByteOffsetRegIndex(unsigned RegCount) { @@ -144,7 +132,6 @@ static unsigned findPrivateSegmentWaveByteOffsetRegIndex(unsigned RegCount) { unsigned SIRegisterInfo::reservedPrivateSegmentWaveByteOffsetReg( const MachineFunction &MF) const { - const GCNSubtarget &ST = MF.getSubtarget(); unsigned Reg = findPrivateSegmentWaveByteOffsetRegIndex(ST.getMaxNumSGPRs(MF)); return AMDGPU::SGPR_32RegClass.getRegister(Reg); } @@ -202,8 +189,6 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { Reserved.set(AMDGPU::VCC_HI); } - const GCNSubtarget &ST = MF.getSubtarget(); - unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF); unsigned TotalNumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs(); for (unsigned i = MaxNumSGPRs; i < TotalNumSGPRs; ++i) { @@ -220,6 +205,14 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { reserveRegisterTuples(Reserved, Reg); } + // Reserve all the rest AGPRs if there are no instructions to use it. + if (!ST.hasMAIInsts()) { + for (unsigned i = 0; i < MaxNumVGPRs; ++i) { + unsigned Reg = AMDGPU::AGPR_32RegClass.getRegister(i); + reserveRegisterTuples(Reserved, Reg); + } + } + const SIMachineFunctionInfo *MFI = MF.getInfo(); unsigned ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg(); @@ -293,32 +286,17 @@ bool SIRegisterInfo::requiresRegisterScavenging(const MachineFunction &Fn) const bool SIRegisterInfo::requiresFrameIndexScavenging( const MachineFunction &MF) const { - const MachineFrameInfo &MFI = MF.getFrameInfo(); - if (MFI.hasStackObjects()) - return true; - - // May need to deal with callee saved registers. - const SIMachineFunctionInfo *Info = MF.getInfo(); - return !Info->isEntryFunction(); + // Do not use frame virtual registers. 
They used to be used for SGPRs, but + // once we reach PrologEpilogInserter, we can no longer spill SGPRs. If the + // scavenger fails, we can increment/decrement the necessary SGPRs to avoid a + // spill. + return false; } bool SIRegisterInfo::requiresFrameIndexReplacementScavenging( const MachineFunction &MF) const { const MachineFrameInfo &MFI = MF.getFrameInfo(); - if (!MFI.hasStackObjects()) - return false; - - // The scavenger is used for large frames which may require finding a free - // register for large offsets. - if (!isUInt<12>(MFI.getStackSize())) - return true; - - // If using scalar stores, for spills, m0 is needed for the scalar store - // offset (pre-GFX9). m0 is unallocatable, so we can't create a virtual - // register for it during frame index elimination, so the scavenger is - // directly needed. - return MF.getSubtarget().hasScalarStores() && - MF.getInfo()->hasSpilledSGPRs(); + return MFI.hasStackObjects(); } bool SIRegisterInfo::requiresVirtualBaseRegisters( @@ -372,8 +350,7 @@ void SIRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB, DL = Ins->getDebugLoc(); MachineFunction *MF = MBB->getParent(); - const GCNSubtarget &Subtarget = MF->getSubtarget(); - const SIInstrInfo *TII = Subtarget.getInstrInfo(); + const SIInstrInfo *TII = ST.getInstrInfo(); if (Offset == 0) { BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::V_MOV_B32_e32), BaseReg) @@ -382,9 +359,9 @@ void SIRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB, } MachineRegisterInfo &MRI = MF->getRegInfo(); - unsigned OffsetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); + Register OffsetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); - unsigned FIReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + Register FIReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg) .addImm(Offset); @@ -399,11 +376,7 @@ void SIRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB, void SIRegisterInfo::resolveFrameIndex(MachineInstr &MI, unsigned BaseReg, int64_t Offset) const { - - MachineBasicBlock *MBB = MI.getParent(); - MachineFunction *MF = MBB->getParent(); - const GCNSubtarget &Subtarget = MF->getSubtarget(); - const SIInstrInfo *TII = Subtarget.getInstrInfo(); + const SIInstrInfo *TII = ST.getInstrInfo(); #ifndef NDEBUG // FIXME: Is it possible to be storing a frame index to itself? 
@@ -419,12 +392,15 @@ void SIRegisterInfo::resolveFrameIndex(MachineInstr &MI, unsigned BaseReg, #endif MachineOperand *FIOp = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr); +#ifndef NDEBUG + MachineBasicBlock *MBB = MI.getParent(); + MachineFunction *MF = MBB->getParent(); +#endif assert(FIOp && FIOp->isFI() && "frame index must be address operand"); assert(TII->isMUBUF(MI)); assert(TII->getNamedOperand(MI, AMDGPU::OpName::soffset)->getReg() == - MF->getInfo()->getFrameOffsetReg() && - "should only be seeing frame offset relative FrameIndex"); - + MF->getInfo()->getStackPtrOffsetReg() && + "should only be seeing stack pointer offset relative FrameIndex"); MachineOperand *OffsetOp = TII->getNamedOperand(MI, AMDGPU::OpName::offset); int64_t NewOffset = OffsetOp->getImm() + Offset; @@ -564,7 +540,8 @@ static int getOffsetMUBUFLoad(unsigned Opc) { } } -static MachineInstrBuilder spillVGPRtoAGPR(MachineBasicBlock::iterator MI, +static MachineInstrBuilder spillVGPRtoAGPR(const GCNSubtarget &ST, + MachineBasicBlock::iterator MI, int Index, unsigned Lane, unsigned ValueReg, @@ -572,7 +549,6 @@ static MachineInstrBuilder spillVGPRtoAGPR(MachineBasicBlock::iterator MI, MachineBasicBlock *MBB = MI->getParent(); MachineFunction *MF = MI->getParent()->getParent(); SIMachineFunctionInfo *MFI = MF->getInfo(); - const GCNSubtarget &ST = MF->getSubtarget(); const SIInstrInfo *TII = ST.getInstrInfo(); MCPhysReg Reg = MFI->getVGPRToAGPRSpill(Index, Lane); @@ -595,11 +571,12 @@ static MachineInstrBuilder spillVGPRtoAGPR(MachineBasicBlock::iterator MI, // This differs from buildSpillLoadStore by only scavenging a VGPR. It does not // need to handle the case where an SGPR may need to be spilled while spilling. -static bool buildMUBUFOffsetLoadStore(const SIInstrInfo *TII, +static bool buildMUBUFOffsetLoadStore(const GCNSubtarget &ST, MachineFrameInfo &MFI, MachineBasicBlock::iterator MI, int Index, int64_t Offset) { + const SIInstrInfo *TII = ST.getInstrInfo(); MachineBasicBlock *MBB = MI->getParent(); const DebugLoc &DL = MI->getDebugLoc(); bool IsStore = MI->mayStore(); @@ -611,7 +588,7 @@ static bool buildMUBUFOffsetLoadStore(const SIInstrInfo *TII, return false; const MachineOperand *Reg = TII->getNamedOperand(*MI, AMDGPU::OpName::vdata); - if (spillVGPRtoAGPR(MI, Index, 0, Reg->getReg(), false).getInstr()) + if (spillVGPRtoAGPR(ST, MI, Index, 0, Reg->getReg(), false).getInstr()) return true; MachineInstrBuilder NewMI = @@ -624,6 +601,7 @@ static bool buildMUBUFOffsetLoadStore(const SIInstrInfo *TII, .addImm(0) // slc .addImm(0) // tfe .addImm(0) // dlc + .addImm(0) // swz .cloneMemRefs(*MI); const MachineOperand *VDataIn = TII->getNamedOperand(*MI, @@ -645,7 +623,6 @@ void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI, RegScavenger *RS) const { MachineBasicBlock *MBB = MI->getParent(); MachineFunction *MF = MI->getParent()->getParent(); - const GCNSubtarget &ST = MF->getSubtarget(); const SIInstrInfo *TII = ST.getInstrInfo(); const MachineFrameInfo &MFI = MF->getFrameInfo(); @@ -707,8 +684,9 @@ void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI, } for (unsigned i = 0, e = NumSubRegs; i != e; ++i, Offset += EltSize) { - unsigned SubReg = NumSubRegs == 1 ? - ValueReg : getSubReg(ValueReg, getSubRegFromChannel(i)); + Register SubReg = NumSubRegs == 1 + ? 
Register(ValueReg) + : getSubReg(ValueReg, getSubRegFromChannel(i)); unsigned SOffsetRegState = 0; unsigned SrcDstRegState = getDefRegState(!IsStore); @@ -718,7 +696,7 @@ void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI, SrcDstRegState |= getKillRegState(IsKill); } - auto MIB = spillVGPRtoAGPR(MI, Index, i, SubReg, IsKill); + auto MIB = spillVGPRtoAGPR(ST, MI, Index, i, SubReg, IsKill); if (!MIB.getInstr()) { unsigned FinalReg = SubReg; @@ -743,6 +721,7 @@ void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI, .addImm(0) // slc .addImm(0) // tfe .addImm(0) // dlc + .addImm(0) // swz .addMemOperand(NewMMO); if (!IsStore && TmpReg != AMDGPU::NoRegister) @@ -763,22 +742,6 @@ void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI, } } -static std::pair getSpillEltSize(unsigned SuperRegSize, - bool Store) { - if (SuperRegSize % 16 == 0) { - return { 16, Store ? AMDGPU::S_BUFFER_STORE_DWORDX4_SGPR : - AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR }; - } - - if (SuperRegSize % 8 == 0) { - return { 8, Store ? AMDGPU::S_BUFFER_STORE_DWORDX2_SGPR : - AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR }; - } - - return { 4, Store ? AMDGPU::S_BUFFER_STORE_DWORD_SGPR : - AMDGPU::S_BUFFER_LOAD_DWORD_SGPR}; -} - bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI, int Index, RegScavenger *RS, @@ -794,98 +757,37 @@ bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI, if (OnlyToVGPR && !SpillToVGPR) return false; - MachineRegisterInfo &MRI = MF->getRegInfo(); - const GCNSubtarget &ST = MF->getSubtarget(); const SIInstrInfo *TII = ST.getInstrInfo(); - unsigned SuperReg = MI->getOperand(0).getReg(); + Register SuperReg = MI->getOperand(0).getReg(); bool IsKill = MI->getOperand(0).isKill(); const DebugLoc &DL = MI->getDebugLoc(); MachineFrameInfo &FrameInfo = MF->getFrameInfo(); - bool SpillToSMEM = spillSGPRToSMEM(); - if (SpillToSMEM && OnlyToVGPR) - return false; - - Register FrameReg = getFrameRegister(*MF); - assert(SpillToVGPR || (SuperReg != MFI->getStackPtrOffsetReg() && SuperReg != MFI->getFrameOffsetReg() && SuperReg != MFI->getScratchWaveOffsetReg())); assert(SuperReg != AMDGPU::M0 && "m0 should never spill"); - unsigned OffsetReg = AMDGPU::M0; unsigned M0CopyReg = AMDGPU::NoRegister; - if (SpillToSMEM) { - if (RS->isRegUsed(AMDGPU::M0)) { - M0CopyReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); - BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), M0CopyReg) - .addReg(AMDGPU::M0); - } - } - - unsigned ScalarStoreOp; unsigned EltSize = 4; const TargetRegisterClass *RC = getPhysRegClass(SuperReg); - if (SpillToSMEM && isSGPRClass(RC)) { - // XXX - if private_element_size is larger than 4 it might be useful to be - // able to spill wider vmem spills. - std::tie(EltSize, ScalarStoreOp) = - getSpillEltSize(getRegSizeInBits(*RC) / 8, true); - } ArrayRef SplitParts = getRegSplitParts(RC, EltSize); unsigned NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size(); + // Scavenged temporary VGPR to use. It must be scavenged once for any number + // of spilled subregs. + Register TmpVGPR; + // SubReg carries the "Kill" flag when SubReg == SuperReg. unsigned SubKillState = getKillRegState((NumSubRegs == 1) && IsKill); for (unsigned i = 0, e = NumSubRegs; i < e; ++i) { - unsigned SubReg = NumSubRegs == 1 ? - SuperReg : getSubReg(SuperReg, SplitParts[i]); - - if (SpillToSMEM) { - int64_t FrOffset = FrameInfo.getObjectOffset(Index); - - // The allocated memory size is really the wavefront size * the frame - // index size. 
The widest register class is 64 bytes, so a 4-byte scratch - // allocation is enough to spill this in a single stack object. - // - // FIXME: Frame size/offsets are computed earlier than this, so the extra - // space is still unnecessarily allocated. - - unsigned Align = FrameInfo.getObjectAlignment(Index); - MachinePointerInfo PtrInfo - = MachinePointerInfo::getFixedStack(*MF, Index, EltSize * i); - MachineMemOperand *MMO - = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore, - EltSize, MinAlign(Align, EltSize * i)); - - // SMEM instructions only support a single offset, so increment the wave - // offset. - - int64_t Offset = (ST.getWavefrontSize() * FrOffset) + (EltSize * i); - if (Offset != 0) { - BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), OffsetReg) - .addReg(FrameReg) - .addImm(Offset); - } else { - BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg) - .addReg(FrameReg); - } - - BuildMI(*MBB, MI, DL, TII->get(ScalarStoreOp)) - .addReg(SubReg, getKillRegState(IsKill)) // sdata - .addReg(MFI->getScratchRSrcReg()) // sbase - .addReg(OffsetReg, RegState::Kill) // soff - .addImm(0) // glc - .addImm(0) // dlc - .addMemOperand(MMO); - - continue; - } + Register SubReg = + NumSubRegs == 1 ? SuperReg : getSubReg(SuperReg, SplitParts[i]); if (SpillToVGPR) { SIMachineFunctionInfo::SpilledReg Spill = VGPRSpills[i]; @@ -915,15 +817,13 @@ bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI, return false; // Spill SGPR to a frame index. - // TODO: Should VI try to spill to VGPR and then spill to SMEM? - unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); - // TODO: Should VI try to spill to VGPR and then spill to SMEM? + if (!TmpVGPR.isValid()) + TmpVGPR = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0); MachineInstrBuilder Mov - = BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg) + = BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR) .addReg(SubReg, SubKillState); - // There could be undef components of a spilled super register. // TODO: Can we detect this and skip the spill? 
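// [Editorial sketch: not part of the vendored patch] With the SGPR-to-SMEM path deleted, the
// memory fallback in spillSGPR() above (and restoreSGPR() below) no longer creates a fresh
// virtual VGPR per spilled dword; it lazily scavenges a single temporary VGPR and reuses it
// for every subregister, matching the "do not use frame virtual registers" note earlier in
// this file:
  Register TmpVGPR; // stays invalid until a memory spill is actually needed
  for (unsigned i = 0, e = NumSubRegs; i < e; ++i) {
    // ... try the SGPR-to-VGPR lane spill first ...
    if (!TmpVGPR.isValid())
      TmpVGPR = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0);
    BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR)
        .addReg(SubReg, SubKillState);
    // ... then store TmpVGPR through SI_SPILL_V32_SAVE as in the hunk below ...
  }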
if (NumSubRegs > 1) { @@ -941,7 +841,7 @@ bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI, = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore, EltSize, MinAlign(Align, EltSize * i)); BuildMI(*MBB, MI, DL, TII->get(AMDGPU::SI_SPILL_V32_SAVE)) - .addReg(TmpReg, RegState::Kill) // src + .addReg(TmpVGPR, RegState::Kill) // src .addFrameIndex(Index) // vaddr .addReg(MFI->getScratchRSrcReg()) // srrsrc .addReg(MFI->getStackPtrOffsetReg()) // soffset @@ -965,7 +865,6 @@ bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI, RegScavenger *RS, bool OnlyToVGPR) const { MachineFunction *MF = MI->getParent()->getParent(); - MachineRegisterInfo &MRI = MF->getRegInfo(); MachineBasicBlock *MBB = MI->getParent(); SIMachineFunctionInfo *MFI = MF->getInfo(); @@ -976,84 +875,27 @@ bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI, return false; MachineFrameInfo &FrameInfo = MF->getFrameInfo(); - const GCNSubtarget &ST = MF->getSubtarget(); const SIInstrInfo *TII = ST.getInstrInfo(); const DebugLoc &DL = MI->getDebugLoc(); - unsigned SuperReg = MI->getOperand(0).getReg(); - bool SpillToSMEM = spillSGPRToSMEM(); - if (SpillToSMEM && OnlyToVGPR) - return false; + Register SuperReg = MI->getOperand(0).getReg(); assert(SuperReg != AMDGPU::M0 && "m0 should never spill"); - unsigned OffsetReg = AMDGPU::M0; unsigned M0CopyReg = AMDGPU::NoRegister; - if (SpillToSMEM) { - if (RS->isRegUsed(AMDGPU::M0)) { - M0CopyReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); - BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), M0CopyReg) - .addReg(AMDGPU::M0); - } - } - unsigned EltSize = 4; - unsigned ScalarLoadOp; - - Register FrameReg = getFrameRegister(*MF); const TargetRegisterClass *RC = getPhysRegClass(SuperReg); - if (SpillToSMEM && isSGPRClass(RC)) { - // XXX - if private_element_size is larger than 4 it might be useful to be - // able to spill wider vmem spills. - std::tie(EltSize, ScalarLoadOp) = - getSpillEltSize(getRegSizeInBits(*RC) / 8, false); - } ArrayRef SplitParts = getRegSplitParts(RC, EltSize); unsigned NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size(); - // SubReg carries the "Kill" flag when SubReg == SuperReg. - int64_t FrOffset = FrameInfo.getObjectOffset(Index); + Register TmpVGPR; for (unsigned i = 0, e = NumSubRegs; i < e; ++i) { - unsigned SubReg = NumSubRegs == 1 ? - SuperReg : getSubReg(SuperReg, SplitParts[i]); - - if (SpillToSMEM) { - // FIXME: Size may be > 4 but extra bytes wasted. - unsigned Align = FrameInfo.getObjectAlignment(Index); - MachinePointerInfo PtrInfo - = MachinePointerInfo::getFixedStack(*MF, Index, EltSize * i); - MachineMemOperand *MMO - = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOLoad, - EltSize, MinAlign(Align, EltSize * i)); - - // Add i * 4 offset - int64_t Offset = (ST.getWavefrontSize() * FrOffset) + (EltSize * i); - if (Offset != 0) { - BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), OffsetReg) - .addReg(FrameReg) - .addImm(Offset); - } else { - BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg) - .addReg(FrameReg); - } - - auto MIB = - BuildMI(*MBB, MI, DL, TII->get(ScalarLoadOp), SubReg) - .addReg(MFI->getScratchRSrcReg()) // sbase - .addReg(OffsetReg, RegState::Kill) // soff - .addImm(0) // glc - .addImm(0) // dlc - .addMemOperand(MMO); - - if (NumSubRegs > 1 && i == 0) - MIB.addReg(SuperReg, RegState::ImplicitDefine); - - continue; - } + Register SubReg = + NumSubRegs == 1 ? 
SuperReg : getSubReg(SuperReg, SplitParts[i]); if (SpillToVGPR) { SIMachineFunctionInfo::SpilledReg Spill = VGPRSpills[i]; @@ -1071,7 +913,8 @@ bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI, // Restore SGPR from a stack slot. // FIXME: We should use S_LOAD_DWORD here for VI. - unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + if (!TmpVGPR.isValid()) + TmpVGPR = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0); unsigned Align = FrameInfo.getObjectAlignment(Index); MachinePointerInfo PtrInfo @@ -1081,7 +924,7 @@ bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI, MachineMemOperand::MOLoad, EltSize, MinAlign(Align, EltSize * i)); - BuildMI(*MBB, MI, DL, TII->get(AMDGPU::SI_SPILL_V32_RESTORE), TmpReg) + BuildMI(*MBB, MI, DL, TII->get(AMDGPU::SI_SPILL_V32_RESTORE), TmpVGPR) .addFrameIndex(Index) // vaddr .addReg(MFI->getScratchRSrcReg()) // srsrc .addReg(MFI->getStackPtrOffsetReg()) // soffset @@ -1090,7 +933,7 @@ bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI, auto MIB = BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), SubReg) - .addReg(TmpReg, RegState::Kill); + .addReg(TmpVGPR, RegState::Kill); if (NumSubRegs > 1) MIB.addReg(MI->getOperand(0).getReg(), RegState::ImplicitDefine); @@ -1141,11 +984,9 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj, unsigned FIOperandNum, RegScavenger *RS) const { MachineFunction *MF = MI->getParent()->getParent(); - MachineRegisterInfo &MRI = MF->getRegInfo(); MachineBasicBlock *MBB = MI->getParent(); SIMachineFunctionInfo *MFI = MF->getInfo(); MachineFrameInfo &FrameInfo = MF->getFrameInfo(); - const GCNSubtarget &ST = MF->getSubtarget(); const SIInstrInfo *TII = ST.getInstrInfo(); DebugLoc DL = MI->getDebugLoc(); @@ -1255,13 +1096,16 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, // In an entry function/kernel the offset is already the absolute // address relative to the frame register. - unsigned DiffReg - = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); + Register TmpDiffReg = + RS->scavengeRegister(&AMDGPU::SReg_32_XM0RegClass, MI, 0, false); + + // If there's no free SGPR, in-place modify the FP + Register DiffReg = TmpDiffReg.isValid() ? TmpDiffReg : FrameReg; bool IsCopy = MI->getOpcode() == AMDGPU::V_MOV_B32_e32; Register ResultReg = IsCopy ? MI->getOperand(0).getReg() : - MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0); BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_SUB_U32), DiffReg) .addReg(FrameReg) @@ -1271,35 +1115,80 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, if (Offset == 0) { // XXX - This never happens because of emergency scavenging slot at 0? BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64), ResultReg) - .addImm(Log2_32(ST.getWavefrontSize())) + .addImm(ST.getWavefrontSizeLog2()) .addReg(DiffReg); } else { - unsigned ScaledReg - = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); - - BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64), ScaledReg) - .addImm(Log2_32(ST.getWavefrontSize())) - .addReg(DiffReg, RegState::Kill); - - // TODO: Fold if use instruction is another add of a constant. 
- if (AMDGPU::isInlinableLiteral32(Offset, ST.hasInv2PiInlineImm())) { - TII->getAddNoCarry(*MBB, MI, DL, ResultReg) - .addImm(Offset) - .addReg(ScaledReg, RegState::Kill) - .addImm(0); // clamp bit + if (auto MIB = TII->getAddNoCarry(*MBB, MI, DL, ResultReg, *RS)) { + Register ScaledReg = + RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MIB, 0); + + BuildMI(*MBB, *MIB, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64), + ScaledReg) + .addImm(ST.getWavefrontSizeLog2()) + .addReg(DiffReg, RegState::Kill); + + const bool IsVOP2 = MIB->getOpcode() == AMDGPU::V_ADD_U32_e32; + + // TODO: Fold if use instruction is another add of a constant. + if (IsVOP2 || AMDGPU::isInlinableLiteral32(Offset, ST.hasInv2PiInlineImm())) { + // FIXME: This can fail + MIB.addImm(Offset); + MIB.addReg(ScaledReg, RegState::Kill); + if (!IsVOP2) + MIB.addImm(0); // clamp bit + } else { + Register ConstOffsetReg = + RS->scavengeRegister(&AMDGPU::SReg_32_XM0RegClass, MIB, 0, false); + + // This should always be able to use the unused carry out. + assert(ConstOffsetReg && "this scavenge should not be able to fail"); + + BuildMI(*MBB, *MIB, DL, TII->get(AMDGPU::S_MOV_B32), ConstOffsetReg) + .addImm(Offset); + MIB.addReg(ConstOffsetReg, RegState::Kill); + MIB.addReg(ScaledReg, RegState::Kill); + MIB.addImm(0); // clamp bit + } } else { - unsigned ConstOffsetReg - = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); - - BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), ConstOffsetReg) - .addImm(Offset); - TII->getAddNoCarry(*MBB, MI, DL, ResultReg) - .addReg(ConstOffsetReg, RegState::Kill) + // We have to produce a carry out, and there isn't a free SGPR + // pair for it. We can keep the whole computation on the SALU to + // avoid clobbering an additional register at the cost of an extra + // mov. + + // We may have 1 free scratch SGPR even though a carry out is + // unavailable. Only one additional mov is needed. + Register TmpScaledReg = + RS->scavengeRegister(&AMDGPU::SReg_32_XM0RegClass, MI, 0, false); + Register ScaledReg = TmpScaledReg.isValid() ? TmpScaledReg : DiffReg; + + BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHR_B32), ScaledReg) + .addReg(DiffReg, RegState::Kill) + .addImm(ST.getWavefrontSizeLog2()); + BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), ScaledReg) .addReg(ScaledReg, RegState::Kill) - .addImm(0); // clamp bit + .addImm(Offset); + BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), ResultReg) + .addReg(ScaledReg, RegState::Kill); + + // If there were truly no free SGPRs, we need to undo everything. + if (!TmpScaledReg.isValid()) { + BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_SUB_U32), ScaledReg) + .addReg(ScaledReg, RegState::Kill) + .addImm(Offset); + BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHL_B32), ScaledReg) + .addReg(DiffReg, RegState::Kill) + .addImm(ST.getWavefrontSizeLog2()); + } } } + if (!TmpDiffReg.isValid()) { + // Restore the FP. + BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), FrameReg) + .addReg(FrameReg) + .addReg(MFI->getScratchWaveOffsetReg()); + } + // Don't introduce an extra copy if we're just materializing in a mov.
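// [Editorial sketch: not part of the vendored patch] The eliminateFrameIndex() changes above
// avoid creating virtual registers altogether: temporaries are scavenged, and if no spare SGPR
// exists the frame pointer itself is used as scratch and repaired afterwards. Skeleton of that
// fallback, using only calls visible in these hunks:
  Register TmpDiffReg =
      RS->scavengeRegister(&AMDGPU::SReg_32_XM0RegClass, MI, 0, false);
  // If there's no free SGPR, modify the FP in place.
  Register DiffReg = TmpDiffReg.isValid() ? TmpDiffReg : FrameReg;
  // ... compute the scaled, offset-adjusted address out of DiffReg ...
  if (!TmpDiffReg.isValid()) {
    // DiffReg aliased the FP, so undo the in-place subtraction to restore it.
    BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), FrameReg)
        .addReg(FrameReg)
        .addReg(MFI->getScratchWaveOffsetReg());
  }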
if (IsCopy) MI->eraseFromParent(); @@ -1325,7 +1214,7 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, int64_t NewOffset = OldImm + Offset; if (isUInt<12>(NewOffset) && - buildMUBUFOffsetLoadStore(TII, FrameInfo, MI, Index, NewOffset)) { + buildMUBUFOffsetLoadStore(ST, FrameInfo, MI, Index, NewOffset)) { MI->eraseFromParent(); return; } @@ -1337,7 +1226,7 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, int64_t Offset = FrameInfo.getObjectOffset(Index); FIOp.ChangeToImmediate(Offset); if (!TII->isImmOperandLegal(*MI, FIOperandNum, FIOp)) { - unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + Register TmpReg = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0); BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg) .addImm(Offset); FIOp.ChangeToRegister(TmpReg, false, false, true); @@ -1347,27 +1236,13 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, } StringRef SIRegisterInfo::getRegAsmName(unsigned Reg) const { - const TargetRegisterClass *RC = getMinimalPhysRegClass(Reg); - unsigned Size = getRegSizeInBits(*RC); - unsigned AltName = AMDGPU::NoRegAltName; - - switch (Size) { - case 32: AltName = AMDGPU::Reg32; break; - case 64: AltName = AMDGPU::Reg64; break; - case 96: AltName = AMDGPU::Reg96; break; - case 128: AltName = AMDGPU::Reg128; break; - case 160: AltName = AMDGPU::Reg160; break; - case 256: AltName = AMDGPU::Reg256; break; - case 512: AltName = AMDGPU::Reg512; break; - case 1024: AltName = AMDGPU::Reg1024; break; - } - return AMDGPUInstPrinter::getRegisterName(Reg, AltName); + return AMDGPUInstPrinter::getRegisterName(Reg); } // FIXME: This is very slow. It might be worth creating a map from physreg to // register class. const TargetRegisterClass *SIRegisterInfo::getPhysRegClass(unsigned Reg) const { - assert(!TargetRegisterInfo::isVirtualRegister(Reg)); + assert(!Register::isVirtualRegister(Reg)); static const TargetRegisterClass *const BaseClasses[] = { &AMDGPU::VGPR_32RegClass, @@ -1408,8 +1283,6 @@ const TargetRegisterClass *SIRegisterInfo::getPhysRegClass(unsigned Reg) const { // TargetRegisterClass to mark which classes are VGPRs to make this trivial. 
bool SIRegisterInfo::hasVGPRs(const TargetRegisterClass *RC) const { unsigned Size = getRegSizeInBits(*RC); - if (Size < 32) - return false; switch (Size) { case 32: return getCommonSubClass(&AMDGPU::VGPR_32RegClass, RC) != nullptr; @@ -1427,8 +1300,11 @@ bool SIRegisterInfo::hasVGPRs(const TargetRegisterClass *RC) const { return getCommonSubClass(&AMDGPU::VReg_512RegClass, RC) != nullptr; case 1024: return getCommonSubClass(&AMDGPU::VReg_1024RegClass, RC) != nullptr; + case 1: + return getCommonSubClass(&AMDGPU::VReg_1RegClass, RC) != nullptr; default: - llvm_unreachable("Invalid register class size"); + assert(Size < 32 && "Invalid register class size"); + return false; } } @@ -1476,6 +1352,8 @@ const TargetRegisterClass *SIRegisterInfo::getEquivalentVGPRClass( return &AMDGPU::VReg_512RegClass; case 1024: return &AMDGPU::VReg_1024RegClass; + case 1: + return &AMDGPU::VReg_1RegClass; default: llvm_unreachable("Invalid register class size"); } @@ -1509,7 +1387,7 @@ const TargetRegisterClass *SIRegisterInfo::getEquivalentSGPRClass( case 96: return &AMDGPU::SReg_96RegClass; case 128: - return &AMDGPU::SReg_128RegClass; + return &AMDGPU::SGPR_128RegClass; case 160: return &AMDGPU::SReg_160RegClass; case 256: @@ -1539,7 +1417,7 @@ const TargetRegisterClass *SIRegisterInfo::getSubRegClass( case 3: return &AMDGPU::SReg_96RegClass; case 4: - return &AMDGPU::SReg_128RegClass; + return &AMDGPU::SGPR_128RegClass; case 5: return &AMDGPU::SReg_160RegClass; case 8: @@ -1587,6 +1465,15 @@ const TargetRegisterClass *SIRegisterInfo::getSubRegClass( } } +bool SIRegisterInfo::opCanUseInlineConstant(unsigned OpType) const { + if (OpType >= AMDGPU::OPERAND_REG_INLINE_AC_FIRST && + OpType <= AMDGPU::OPERAND_REG_INLINE_AC_LAST) + return !ST.hasMFMAInlineLiteralBug(); + + return OpType >= AMDGPU::OPERAND_SRC_FIRST && + OpType <= AMDGPU::OPERAND_SRC_LAST; +} + bool SIRegisterInfo::shouldRewriteCopySrc( const TargetRegisterClass *DefRC, unsigned DefSubReg, @@ -1802,7 +1689,7 @@ ArrayRef SIRegisterInfo::getRegSplitParts(const TargetRegisterClass *RC const TargetRegisterClass* SIRegisterInfo::getRegClassForReg(const MachineRegisterInfo &MRI, unsigned Reg) const { - if (TargetRegisterInfo::isVirtualRegister(Reg)) + if (Register::isVirtualRegister(Reg)) return MRI.getRegClass(Reg); return getPhysRegClass(Reg); @@ -1845,8 +1732,6 @@ bool SIRegisterInfo::shouldCoalesce(MachineInstr *MI, unsigned SIRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC, MachineFunction &MF) const { - - const GCNSubtarget &ST = MF.getSubtarget(); const SIMachineFunctionInfo *MFI = MF.getInfo(); unsigned Occupancy = ST.getOccupancyWithLocalMemSize(MFI->getLDSSize(), @@ -1900,18 +1785,22 @@ SIRegisterInfo::getRegClassForSizeOnBank(unsigned Size, return isWave32 ? &AMDGPU::SReg_32_XM0_XEXECRegClass : &AMDGPU::SReg_64_XEXECRegClass; case AMDGPU::SGPRRegBankID: - return &AMDGPU::SReg_32_XM0RegClass; + return &AMDGPU::SReg_32RegClass; case AMDGPU::SCCRegBankID: // This needs to return an allocatable class, so don't bother returning // the dummy SCC class. - return &AMDGPU::SReg_32_XM0RegClass; + // + // FIXME: This is a grotesque hack. We use SGPR_32 as an indication this + // was not an VCC bank value since we use the larger class SReg_32 for + // other values. These should all use SReg_32. + return &AMDGPU::SGPR_32RegClass; default: llvm_unreachable("unknown register bank"); } } case 32: return RB.getID() == AMDGPU::VGPRRegBankID ? 
&AMDGPU::VGPR_32RegClass : - &AMDGPU::SReg_32_XM0RegClass; + &AMDGPU::SReg_32RegClass; case 64: return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_64RegClass : &AMDGPU::SReg_64_XEXECRegClass; @@ -1920,7 +1809,7 @@ SIRegisterInfo::getRegClassForSizeOnBank(unsigned Size, &AMDGPU::SReg_96RegClass; case 128: return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_128RegClass : - &AMDGPU::SReg_128RegClass; + &AMDGPU::SGPR_128RegClass; case 160: return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_160RegClass : &AMDGPU::SReg_160RegClass; @@ -1930,10 +1819,13 @@ SIRegisterInfo::getRegClassForSizeOnBank(unsigned Size, case 512: return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_512RegClass : &AMDGPU::SReg_512RegClass; + case 1024: + return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_1024RegClass : + &AMDGPU::SReg_1024RegClass; default: if (Size < 32) return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VGPR_32RegClass : - &AMDGPU::SReg_32_XM0RegClass; + &AMDGPU::SReg_32RegClass; return nullptr; } } @@ -1941,9 +1833,12 @@ SIRegisterInfo::getRegClassForSizeOnBank(unsigned Size, const TargetRegisterClass * SIRegisterInfo::getConstrainedRegClassForOperand(const MachineOperand &MO, const MachineRegisterInfo &MRI) const { - if (const RegisterBank *RB = MRI.getRegBankOrNull(MO.getReg())) + const RegClassOrRegBank &RCOrRB = MRI.getRegClassOrRegBank(MO.getReg()); + if (const RegisterBank *RB = RCOrRB.dyn_cast()) return getRegClassForTypeOnBank(MRI.getType(MO.getReg()), *RB, MRI); - return nullptr; + + const TargetRegisterClass *RC = RCOrRB.get(); + return getAllocatableClass(RC); } unsigned SIRegisterInfo::getVCC() const { @@ -1974,7 +1869,7 @@ MachineInstr *SIRegisterInfo::findReachingDef(unsigned Reg, unsigned SubReg, SlotIndex UseIdx = LIS->getInstructionIndex(Use); SlotIndex DefIdx; - if (TargetRegisterInfo::isVirtualRegister(Reg)) { + if (Register::isVirtualRegister(Reg)) { if (!LIS->hasInterval(Reg)) return nullptr; LiveInterval &LI = LIS->getInterval(Reg); diff --git a/lib/Target/AMDGPU/SIRegisterInfo.h b/lib/Target/AMDGPU/SIRegisterInfo.h index 34487c96e72e..ac3dea1a1a28 100644 --- a/lib/Target/AMDGPU/SIRegisterInfo.h +++ b/lib/Target/AMDGPU/SIRegisterInfo.h @@ -27,6 +27,7 @@ class SIMachineFunctionInfo; class SIRegisterInfo final : public AMDGPURegisterInfo { private: + const GCNSubtarget &ST; unsigned SGPRSetID; unsigned VGPRSetID; unsigned AGPRSetID; @@ -34,7 +35,6 @@ private: BitVector VGPRPressureSets; BitVector AGPRPressureSets; bool SpillSGPRToVGPR; - bool SpillSGPRToSMEM; bool isWave32; void classifyPressureSet(unsigned PSetID, unsigned Reg, @@ -46,10 +46,6 @@ public: return SpillSGPRToVGPR; } - bool spillSGPRToSMEM() const { - return SpillSGPRToSMEM; - } - /// Return the end register initially reserved for the scratch buffer in case /// spilling is needed. unsigned reservedPrivateSegmentBufferReg(const MachineFunction &MF) const; @@ -141,7 +137,7 @@ public: bool isSGPRReg(const MachineRegisterInfo &MRI, unsigned Reg) const { const TargetRegisterClass *RC; - if (TargetRegisterInfo::isVirtualRegister(Reg)) + if (Register::isVirtualRegister(Reg)) RC = MRI.getRegClass(Reg); else RC = getPhysRegClass(Reg); @@ -193,10 +189,7 @@ public: /// \returns True if operands defined with this operand type can accept /// an inline constant. i.e. An integer value in the range (-16, 64) or /// -4.0f, -2.0f, -1.0f, -0.5f, 0.0f, 0.5f, 1.0f, 2.0f, 4.0f. 
- bool opCanUseInlineConstant(unsigned OpType) const { - return OpType >= AMDGPU::OPERAND_SRC_FIRST && - OpType <= AMDGPU::OPERAND_SRC_LAST; - } + bool opCanUseInlineConstant(unsigned OpType) const; unsigned findUnusedRegister(const MachineRegisterInfo &MRI, const TargetRegisterClass *RC, @@ -270,7 +263,7 @@ public: const MachineRegisterInfo &MRI) const override; const TargetRegisterClass *getBoolRC() const { - return isWave32 ? &AMDGPU::SReg_32_XM0RegClass + return isWave32 ? &AMDGPU::SReg_32RegClass : &AMDGPU::SReg_64RegClass; } diff --git a/lib/Target/AMDGPU/SIRegisterInfo.td b/lib/Target/AMDGPU/SIRegisterInfo.td index d5948a7862cc..82219cbdf3b2 100644 --- a/lib/Target/AMDGPU/SIRegisterInfo.td +++ b/lib/Target/AMDGPU/SIRegisterInfo.td @@ -37,50 +37,52 @@ class getSubRegs { !if(!eq(size, 16), ret16, ret32)))))); } -let Namespace = "AMDGPU" in { -defset list AllRegAltNameIndices = { - def Reg32 : RegAltNameIndex; - def Reg64 : RegAltNameIndex; - def Reg96 : RegAltNameIndex; - def Reg128 : RegAltNameIndex; - def Reg160 : RegAltNameIndex; - def Reg256 : RegAltNameIndex; - def Reg512 : RegAltNameIndex; - def Reg1024 : RegAltNameIndex; -} -} +// Generates list of sequential register tuple names. +// E.g. RegSeq<3,2,2,"s">.ret -> [ "s[0:1]", "s[2:3]" ] +class RegSeqNames { + int next = !add(start, stride); + int end_reg = !add(!add(start, size), -1); + list ret = + !if(!le(end_reg, last_reg), + !listconcat([prefix # "[" # start # ":" # end_reg # "]"], + RegSeqNames.ret), + []); +} + +// Generates list of dags for register tupless. +class RegSeqDags { + dag trunc_rc = (trunc RC, + !if(!and(!eq(stride, 1), !eq(start, 0)), + !add(!add(last_reg, 2), !mul(size, -1)), + !add(last_reg, 1))); + list ret = + !if(!lt(start, size), + !listconcat([(add (decimate (shl trunc_rc, start), stride))], + RegSeqDags.ret), + []); +} + +class SIRegisterTuples Indices, RegisterClass RC, + int last_reg, int stride, int size, string prefix> : + RegisterTuples.ret, + RegSeqNames.ret>; //===----------------------------------------------------------------------===// // Declarations that describe the SI registers //===----------------------------------------------------------------------===// -class SIReg regIdx = 0, string prefix = "", - int regNo = !cast(regIdx)> : - Register, +class SIReg regIdx = 0> : + Register, DwarfRegNum<[!cast(HWEncoding)]> { let Namespace = "AMDGPU"; - let RegAltNameIndices = AllRegAltNameIndices; // This is the not yet the complete register encoding. An additional // bit is set for VGPRs. 
let HWEncoding = regIdx; } -class SIRegisterWithSubRegs subregs> : - RegisterWithSubRegs { - let RegAltNameIndices = AllRegAltNameIndices; - let AltNames = [ n, n, n, n, n, n, n, n ]; -} - // Special Registers def VCC_LO : SIReg<"vcc_lo", 106>; def VCC_HI : SIReg<"vcc_hi", 107>; @@ -93,7 +95,7 @@ def SP_REG : SIReg<"sp", 0>; def SCRATCH_WAVE_OFFSET_REG : SIReg<"scratch_wave_offset", 0>; // VCC for 64-bit instructions -def VCC : SIRegisterWithSubRegs<"vcc", [VCC_LO, VCC_HI]>, +def VCC : RegisterWithSubRegs<"vcc", [VCC_LO, VCC_HI]>, DwarfRegAlias { let Namespace = "AMDGPU"; let SubRegIndices = [sub0, sub1]; @@ -103,7 +105,7 @@ def VCC : SIRegisterWithSubRegs<"vcc", [VCC_LO, VCC_HI]>, def EXEC_LO : SIReg<"exec_lo", 126>; def EXEC_HI : SIReg<"exec_hi", 127>; -def EXEC : SIRegisterWithSubRegs<"exec", [EXEC_LO, EXEC_HI]>, +def EXEC : RegisterWithSubRegs<"exec", [EXEC_LO, EXEC_HI]>, DwarfRegAlias { let Namespace = "AMDGPU"; let SubRegIndices = [sub0, sub1]; @@ -134,7 +136,7 @@ def LDS_DIRECT : SIReg <"src_lds_direct", 254>; def XNACK_MASK_LO : SIReg<"xnack_mask_lo", 104>; def XNACK_MASK_HI : SIReg<"xnack_mask_hi", 105>; -def XNACK_MASK : SIRegisterWithSubRegs<"xnack_mask", [XNACK_MASK_LO, XNACK_MASK_HI]>, +def XNACK_MASK : RegisterWithSubRegs<"xnack_mask", [XNACK_MASK_LO, XNACK_MASK_HI]>, DwarfRegAlias { let Namespace = "AMDGPU"; let SubRegIndices = [sub0, sub1]; @@ -145,7 +147,7 @@ def XNACK_MASK : SIRegisterWithSubRegs<"xnack_mask", [XNACK_MASK_LO, XNACK_MASK_ def TBA_LO : SIReg<"tba_lo", 108>; def TBA_HI : SIReg<"tba_hi", 109>; -def TBA : SIRegisterWithSubRegs<"tba", [TBA_LO, TBA_HI]>, +def TBA : RegisterWithSubRegs<"tba", [TBA_LO, TBA_HI]>, DwarfRegAlias { let Namespace = "AMDGPU"; let SubRegIndices = [sub0, sub1]; @@ -155,7 +157,7 @@ def TBA : SIRegisterWithSubRegs<"tba", [TBA_LO, TBA_HI]>, def TMA_LO : SIReg<"tma_lo", 110>; def TMA_HI : SIReg<"tma_hi", 111>; -def TMA : SIRegisterWithSubRegs<"tma", [TMA_LO, TMA_HI]>, +def TMA : RegisterWithSubRegs<"tma", [TMA_LO, TMA_HI]>, DwarfRegAlias { let Namespace = "AMDGPU"; let SubRegIndices = [sub0, sub1]; @@ -175,7 +177,7 @@ multiclass FLAT_SCR_LOHI_m ci_e, bits<16> vi_e> { } class FlatReg encoding> : - SIRegisterWithSubRegs<"flat_scratch", [lo, hi]>, + RegisterWithSubRegs<"flat_scratch", [lo, hi]>, DwarfRegAlias { let Namespace = "AMDGPU"; let SubRegIndices = [sub0, sub1]; @@ -191,19 +193,19 @@ def FLAT_SCR : FlatReg; // SGPR registers foreach Index = 0-105 in { - def SGPR#Index : SIReg <"SGPR"#Index, Index, "s">; + def SGPR#Index : SIReg <"s"#Index, Index>; } // VGPR registers foreach Index = 0-255 in { - def VGPR#Index : SIReg <"VGPR"#Index, Index, "v"> { + def VGPR#Index : SIReg <"v"#Index, Index> { let HWEncoding{8} = 1; } } // AccVGPR registers foreach Index = 0-255 in { - def AGPR#Index : SIReg <"AGPR"#Index, Index, "a"> { + def AGPR#Index : SIReg <"a"#Index, Index> { let HWEncoding{8} = 1; } } @@ -226,102 +228,32 @@ def M0_CLASS : RegisterClass<"AMDGPU", [i32], 32, (add M0)> { // SGPR 32-bit registers def SGPR_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32, - (add (sequence "SGPR%u", 0, 105)), Reg32> { + (add (sequence "SGPR%u", 0, 105))> { // Give all SGPR classes higher priority than VGPR classes, because // we want to spill SGPRs to VGPRs. let AllocationPriority = 9; } // SGPR 64-bit registers -def SGPR_64Regs : RegisterTuples.ret, - [(add (decimate SGPR_32, 2)), - (add (decimate (shl SGPR_32, 1), 2))]>; +def SGPR_64Regs : SIRegisterTuples.ret, SGPR_32, 105, 2, 2, "s">; // SGPR 96-bit registers. 
No operations use these, but for symmetry with 96-bit VGPRs. -def SGPR_96Regs : RegisterTuples.ret, - [(add (decimate SGPR_32, 3)), - (add (decimate (shl SGPR_32, 1), 3)), - (add (decimate (shl SGPR_32, 2), 3))]>; +def SGPR_96Regs : SIRegisterTuples.ret, SGPR_32, 105, 3, 3, "s">; // SGPR 128-bit registers -def SGPR_128Regs : RegisterTuples.ret, - [(add (decimate SGPR_32, 4)), - (add (decimate (shl SGPR_32, 1), 4)), - (add (decimate (shl SGPR_32, 2), 4)), - (add (decimate (shl SGPR_32, 3), 4))]>; +def SGPR_128Regs : SIRegisterTuples.ret, SGPR_32, 105, 4, 4, "s">; // SGPR 160-bit registers. No operations use these, but for symmetry with 160-bit VGPRs. -def SGPR_160Regs : RegisterTuples.ret, - [(add (decimate SGPR_32, 4)), - (add (decimate (shl SGPR_32, 1), 4)), - (add (decimate (shl SGPR_32, 2), 4)), - (add (decimate (shl SGPR_32, 3), 4)), - (add (decimate (shl SGPR_32, 4), 4))]>; +def SGPR_160Regs : SIRegisterTuples.ret, SGPR_32, 105, 4, 5, "s">; // SGPR 256-bit registers -def SGPR_256Regs : RegisterTuples.ret, - [(add (decimate SGPR_32, 4)), - (add (decimate (shl SGPR_32, 1), 4)), - (add (decimate (shl SGPR_32, 2), 4)), - (add (decimate (shl SGPR_32, 3), 4)), - (add (decimate (shl SGPR_32, 4), 4)), - (add (decimate (shl SGPR_32, 5), 4)), - (add (decimate (shl SGPR_32, 6), 4)), - (add (decimate (shl SGPR_32, 7), 4))]>; +def SGPR_256Regs : SIRegisterTuples.ret, SGPR_32, 105, 4, 8, "s">; // SGPR 512-bit registers -def SGPR_512Regs : RegisterTuples.ret, - [(add (decimate SGPR_32, 4)), - (add (decimate (shl SGPR_32, 1), 4)), - (add (decimate (shl SGPR_32, 2), 4)), - (add (decimate (shl SGPR_32, 3), 4)), - (add (decimate (shl SGPR_32, 4), 4)), - (add (decimate (shl SGPR_32, 5), 4)), - (add (decimate (shl SGPR_32, 6), 4)), - (add (decimate (shl SGPR_32, 7), 4)), - (add (decimate (shl SGPR_32, 8), 4)), - (add (decimate (shl SGPR_32, 9), 4)), - (add (decimate (shl SGPR_32, 10), 4)), - (add (decimate (shl SGPR_32, 11), 4)), - (add (decimate (shl SGPR_32, 12), 4)), - (add (decimate (shl SGPR_32, 13), 4)), - (add (decimate (shl SGPR_32, 14), 4)), - (add (decimate (shl SGPR_32, 15), 4))]>; +def SGPR_512Regs : SIRegisterTuples.ret, SGPR_32, 105, 4, 16, "s">; // SGPR 1024-bit registers -def SGPR_1024Regs : RegisterTuples.ret, - [(add (decimate SGPR_32, 4)), - (add (decimate (shl SGPR_32, 1), 4)), - (add (decimate (shl SGPR_32, 2), 4)), - (add (decimate (shl SGPR_32, 3), 4)), - (add (decimate (shl SGPR_32, 4), 4)), - (add (decimate (shl SGPR_32, 5), 4)), - (add (decimate (shl SGPR_32, 6), 4)), - (add (decimate (shl SGPR_32, 7), 4)), - (add (decimate (shl SGPR_32, 8), 4)), - (add (decimate (shl SGPR_32, 9), 4)), - (add (decimate (shl SGPR_32, 10), 4)), - (add (decimate (shl SGPR_32, 11), 4)), - (add (decimate (shl SGPR_32, 12), 4)), - (add (decimate (shl SGPR_32, 13), 4)), - (add (decimate (shl SGPR_32, 14), 4)), - (add (decimate (shl SGPR_32, 15), 4)), - (add (decimate (shl SGPR_32, 16), 4)), - (add (decimate (shl SGPR_32, 17), 4)), - (add (decimate (shl SGPR_32, 18), 4)), - (add (decimate (shl SGPR_32, 19), 4)), - (add (decimate (shl SGPR_32, 20), 4)), - (add (decimate (shl SGPR_32, 21), 4)), - (add (decimate (shl SGPR_32, 22), 4)), - (add (decimate (shl SGPR_32, 23), 4)), - (add (decimate (shl SGPR_32, 24), 4)), - (add (decimate (shl SGPR_32, 25), 4)), - (add (decimate (shl SGPR_32, 26), 4)), - (add (decimate (shl SGPR_32, 27), 4)), - (add (decimate (shl SGPR_32, 28), 4)), - (add (decimate (shl SGPR_32, 29), 4)), - (add (decimate (shl SGPR_32, 30), 4)), - (add (decimate (shl SGPR_32, 31), 4))]>; +def 
SGPR_1024Regs : SIRegisterTuples.ret, SGPR_32, 105, 4, 32, "s">; // Trap handler TMP 32-bit registers def TTMP_32 : RegisterClass<"AMDGPU", [i32, f32, v2i16, v2f16], 32, @@ -330,51 +262,21 @@ def TTMP_32 : RegisterClass<"AMDGPU", [i32, f32, v2i16, v2f16], 32, } // Trap handler TMP 64-bit registers -def TTMP_64Regs : RegisterTuples.ret, - [(add (decimate TTMP_32, 2)), - (add (decimate (shl TTMP_32, 1), 2))]>; +def TTMP_64Regs : SIRegisterTuples.ret, TTMP_32, 15, 2, 2, "ttmp">; // Trap handler TMP 128-bit registers -def TTMP_128Regs : RegisterTuples.ret, - [(add (decimate TTMP_32, 4)), - (add (decimate (shl TTMP_32, 1), 4)), - (add (decimate (shl TTMP_32, 2), 4)), - (add (decimate (shl TTMP_32, 3), 4))]>; - -def TTMP_256Regs : RegisterTuples.ret, - [(add (decimate TTMP_32, 4)), - (add (decimate (shl TTMP_32, 1), 4)), - (add (decimate (shl TTMP_32, 2), 4)), - (add (decimate (shl TTMP_32, 3), 4)), - (add (decimate (shl TTMP_32, 4), 4)), - (add (decimate (shl TTMP_32, 5), 4)), - (add (decimate (shl TTMP_32, 6), 4)), - (add (decimate (shl TTMP_32, 7), 4))]>; - -def TTMP_512Regs : RegisterTuples.ret, - [(add (decimate TTMP_32, 4)), - (add (decimate (shl TTMP_32, 1), 4)), - (add (decimate (shl TTMP_32, 2), 4)), - (add (decimate (shl TTMP_32, 3), 4)), - (add (decimate (shl TTMP_32, 4), 4)), - (add (decimate (shl TTMP_32, 5), 4)), - (add (decimate (shl TTMP_32, 6), 4)), - (add (decimate (shl TTMP_32, 7), 4)), - (add (decimate (shl TTMP_32, 8), 4)), - (add (decimate (shl TTMP_32, 9), 4)), - (add (decimate (shl TTMP_32, 10), 4)), - (add (decimate (shl TTMP_32, 11), 4)), - (add (decimate (shl TTMP_32, 12), 4)), - (add (decimate (shl TTMP_32, 13), 4)), - (add (decimate (shl TTMP_32, 14), 4)), - (add (decimate (shl TTMP_32, 15), 4))]>; +def TTMP_128Regs : SIRegisterTuples.ret, TTMP_32, 15, 4, 4, "ttmp">; + +def TTMP_256Regs : SIRegisterTuples.ret, TTMP_32, 15, 4, 8, "ttmp">; + +def TTMP_512Regs : SIRegisterTuples.ret, TTMP_32, 15, 4, 16, "ttmp">; class TmpRegTuplesBase subRegs, list indices = getSubRegs.ret, int index1 = !add(index, !add(size, -1)), string name = "ttmp["#index#":"#index1#"]"> : - SIRegisterWithSubRegs { + RegisterWithSubRegs { let HWEncoding = subRegs[0].HWEncoding; let SubRegIndices = indices; } @@ -448,196 +350,80 @@ def TTMP0_TTMP1_TTMP2_TTMP3_TTMP4_TTMP5_TTMP6_TTMP7_TTMP8_TTMP9_TTMP10_TTMP11_TT TTMP8_gfx9_gfx10, TTMP9_gfx9_gfx10, TTMP10_gfx9_gfx10, TTMP11_gfx9_gfx10, TTMP12_gfx9_gfx10, TTMP13_gfx9_gfx10, TTMP14_gfx9_gfx10, TTMP15_gfx9_gfx10]>; +class RegisterTypes reg_types> { + list types = reg_types; +} + +def Reg16Types : RegisterTypes<[i16, f16]>; +def Reg32Types : RegisterTypes<[i32, f32, v2i16, v2f16, p2, p3, p5, p6]>; + + // VGPR 32-bit registers // i16/f16 only on VI+ -def VGPR_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32, - (add (sequence "VGPR%u", 0, 255)), Reg32> { +def VGPR_32 : RegisterClass<"AMDGPU", !listconcat(Reg32Types.types, Reg16Types.types), 32, + (add (sequence "VGPR%u", 0, 255))> { let AllocationPriority = 1; let Size = 32; } // VGPR 64-bit registers -def VGPR_64 : RegisterTuples.ret, - [(add (trunc VGPR_32, 255)), - (add (shl VGPR_32, 1))]>; +def VGPR_64 : SIRegisterTuples.ret, VGPR_32, 255, 1, 2, "v">; // VGPR 96-bit registers -def VGPR_96 : RegisterTuples.ret, - [(add (trunc VGPR_32, 254)), - (add (shl VGPR_32, 1)), - (add (shl VGPR_32, 2))]>; +def VGPR_96 : SIRegisterTuples.ret, VGPR_32, 255, 1, 3, "v">; // VGPR 128-bit registers -def VGPR_128 : RegisterTuples.ret, - [(add (trunc VGPR_32, 253)), - (add (shl VGPR_32, 1)), - (add (shl 
VGPR_32, 2)), - (add (shl VGPR_32, 3))]>; +def VGPR_128 : SIRegisterTuples.ret, VGPR_32, 255, 1, 4, "v">; // VGPR 160-bit registers -def VGPR_160 : RegisterTuples.ret, - [(add (trunc VGPR_32, 252)), - (add (shl VGPR_32, 1)), - (add (shl VGPR_32, 2)), - (add (shl VGPR_32, 3)), - (add (shl VGPR_32, 4))]>; +def VGPR_160 : SIRegisterTuples.ret, VGPR_32, 255, 1, 5, "v">; // VGPR 256-bit registers -def VGPR_256 : RegisterTuples.ret, - [(add (trunc VGPR_32, 249)), - (add (shl VGPR_32, 1)), - (add (shl VGPR_32, 2)), - (add (shl VGPR_32, 3)), - (add (shl VGPR_32, 4)), - (add (shl VGPR_32, 5)), - (add (shl VGPR_32, 6)), - (add (shl VGPR_32, 7))]>; +def VGPR_256 : SIRegisterTuples.ret, VGPR_32, 255, 1, 8, "v">; // VGPR 512-bit registers -def VGPR_512 : RegisterTuples.ret, - [(add (trunc VGPR_32, 241)), - (add (shl VGPR_32, 1)), - (add (shl VGPR_32, 2)), - (add (shl VGPR_32, 3)), - (add (shl VGPR_32, 4)), - (add (shl VGPR_32, 5)), - (add (shl VGPR_32, 6)), - (add (shl VGPR_32, 7)), - (add (shl VGPR_32, 8)), - (add (shl VGPR_32, 9)), - (add (shl VGPR_32, 10)), - (add (shl VGPR_32, 11)), - (add (shl VGPR_32, 12)), - (add (shl VGPR_32, 13)), - (add (shl VGPR_32, 14)), - (add (shl VGPR_32, 15))]>; +def VGPR_512 : SIRegisterTuples.ret, VGPR_32, 255, 1, 16, "v">; // VGPR 1024-bit registers -def VGPR_1024 : RegisterTuples.ret, - [(add (trunc VGPR_32, 225)), - (add (shl VGPR_32, 1)), - (add (shl VGPR_32, 2)), - (add (shl VGPR_32, 3)), - (add (shl VGPR_32, 4)), - (add (shl VGPR_32, 5)), - (add (shl VGPR_32, 6)), - (add (shl VGPR_32, 7)), - (add (shl VGPR_32, 8)), - (add (shl VGPR_32, 9)), - (add (shl VGPR_32, 10)), - (add (shl VGPR_32, 11)), - (add (shl VGPR_32, 12)), - (add (shl VGPR_32, 13)), - (add (shl VGPR_32, 14)), - (add (shl VGPR_32, 15)), - (add (shl VGPR_32, 16)), - (add (shl VGPR_32, 17)), - (add (shl VGPR_32, 18)), - (add (shl VGPR_32, 19)), - (add (shl VGPR_32, 20)), - (add (shl VGPR_32, 21)), - (add (shl VGPR_32, 22)), - (add (shl VGPR_32, 23)), - (add (shl VGPR_32, 24)), - (add (shl VGPR_32, 25)), - (add (shl VGPR_32, 26)), - (add (shl VGPR_32, 27)), - (add (shl VGPR_32, 28)), - (add (shl VGPR_32, 29)), - (add (shl VGPR_32, 30)), - (add (shl VGPR_32, 31))]>; +def VGPR_1024 : SIRegisterTuples.ret, VGPR_32, 255, 1, 32, "v">; // AccVGPR 32-bit registers def AGPR_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32, - (add (sequence "AGPR%u", 0, 255)), Reg32> { + (add (sequence "AGPR%u", 0, 255))> { let AllocationPriority = 1; let Size = 32; } // AGPR 64-bit registers -def AGPR_64 : RegisterTuples.ret, - [(add (trunc AGPR_32, 255)), - (add (shl AGPR_32, 1))]>; +def AGPR_64 : SIRegisterTuples.ret, AGPR_32, 255, 1, 2, "a">; // AGPR 128-bit registers -def AGPR_128 : RegisterTuples.ret, - [(add (trunc AGPR_32, 253)), - (add (shl AGPR_32, 1)), - (add (shl AGPR_32, 2)), - (add (shl AGPR_32, 3))]>; +def AGPR_128 : SIRegisterTuples.ret, AGPR_32, 255, 1, 4, "a">; // AGPR 512-bit registers -def AGPR_512 : RegisterTuples.ret, - [(add (trunc AGPR_32, 241)), - (add (shl AGPR_32, 1)), - (add (shl AGPR_32, 2)), - (add (shl AGPR_32, 3)), - (add (shl AGPR_32, 4)), - (add (shl AGPR_32, 5)), - (add (shl AGPR_32, 6)), - (add (shl AGPR_32, 7)), - (add (shl AGPR_32, 8)), - (add (shl AGPR_32, 9)), - (add (shl AGPR_32, 10)), - (add (shl AGPR_32, 11)), - (add (shl AGPR_32, 12)), - (add (shl AGPR_32, 13)), - (add (shl AGPR_32, 14)), - (add (shl AGPR_32, 15))]>; +def AGPR_512 : SIRegisterTuples.ret, AGPR_32, 255, 1, 16, "a">; // AGPR 1024-bit registers -def AGPR_1024 : RegisterTuples.ret, - [(add (trunc 
AGPR_32, 225)), - (add (shl AGPR_32, 1)), - (add (shl AGPR_32, 2)), - (add (shl AGPR_32, 3)), - (add (shl AGPR_32, 4)), - (add (shl AGPR_32, 5)), - (add (shl AGPR_32, 6)), - (add (shl AGPR_32, 7)), - (add (shl AGPR_32, 8)), - (add (shl AGPR_32, 9)), - (add (shl AGPR_32, 10)), - (add (shl AGPR_32, 11)), - (add (shl AGPR_32, 12)), - (add (shl AGPR_32, 13)), - (add (shl AGPR_32, 14)), - (add (shl AGPR_32, 15)), - (add (shl AGPR_32, 16)), - (add (shl AGPR_32, 17)), - (add (shl AGPR_32, 18)), - (add (shl AGPR_32, 19)), - (add (shl AGPR_32, 20)), - (add (shl AGPR_32, 21)), - (add (shl AGPR_32, 22)), - (add (shl AGPR_32, 23)), - (add (shl AGPR_32, 24)), - (add (shl AGPR_32, 25)), - (add (shl AGPR_32, 26)), - (add (shl AGPR_32, 27)), - (add (shl AGPR_32, 28)), - (add (shl AGPR_32, 29)), - (add (shl AGPR_32, 30)), - (add (shl AGPR_32, 31))]>; +def AGPR_1024 : SIRegisterTuples.ret, AGPR_32, 255, 1, 32, "a">; //===----------------------------------------------------------------------===// // Register classes used as source and destination //===----------------------------------------------------------------------===// def Pseudo_SReg_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32, - (add FP_REG, SP_REG, SCRATCH_WAVE_OFFSET_REG), Reg32> { + (add FP_REG, SP_REG, SCRATCH_WAVE_OFFSET_REG)> { let isAllocatable = 0; let CopyCost = -1; } def Pseudo_SReg_128 : RegisterClass<"AMDGPU", [v4i32, v2i64, v2f64], 32, - (add PRIVATE_RSRC_REG), Reg128> { + (add PRIVATE_RSRC_REG)> { let isAllocatable = 0; let CopyCost = -1; } def LDS_DIRECT_CLASS : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32, - (add LDS_DIRECT), Reg32> { + (add LDS_DIRECT)> { let isAllocatable = 0; let CopyCost = -1; } @@ -648,41 +434,40 @@ def SReg_32_XM0_XEXEC : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f1 (add SGPR_32, VCC_LO, VCC_HI, FLAT_SCR_LO, FLAT_SCR_HI, XNACK_MASK_LO, XNACK_MASK_HI, SGPR_NULL, TTMP_32, TMA_LO, TMA_HI, TBA_LO, TBA_HI, SRC_SHARED_BASE, SRC_SHARED_LIMIT, SRC_PRIVATE_BASE, SRC_PRIVATE_LIMIT, SRC_POPS_EXITING_WAVE_ID, - SRC_VCCZ, SRC_EXECZ, SRC_SCC), Reg32> { + SRC_VCCZ, SRC_EXECZ, SRC_SCC)> { let AllocationPriority = 10; } def SReg_32_XEXEC_HI : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, i1], 32, - (add SReg_32_XM0_XEXEC, EXEC_LO, M0_CLASS), Reg32> { + (add SReg_32_XM0_XEXEC, EXEC_LO, M0_CLASS)> { let AllocationPriority = 10; } def SReg_32_XM0 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, i1], 32, - (add SReg_32_XM0_XEXEC, EXEC_LO, EXEC_HI), Reg32> { + (add SReg_32_XM0_XEXEC, EXEC_LO, EXEC_HI)> { let AllocationPriority = 10; } // Register class for all scalar registers (SGPRs + Special Registers) def SReg_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, i1], 32, - (add SReg_32_XM0, M0_CLASS, EXEC_LO, EXEC_HI, SReg_32_XEXEC_HI), Reg32> { + (add SReg_32_XM0, M0_CLASS, EXEC_LO, EXEC_HI, SReg_32_XEXEC_HI)> { let AllocationPriority = 10; } def SRegOrLds_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, i1], 32, - (add SReg_32_XM0, M0_CLASS, EXEC_LO, EXEC_HI, SReg_32_XEXEC_HI, LDS_DIRECT_CLASS), - Reg32> { + (add SReg_32_XM0, M0_CLASS, EXEC_LO, EXEC_HI, SReg_32_XEXEC_HI, LDS_DIRECT_CLASS)> { let isAllocatable = 0; } def SGPR_64 : RegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, v4i16, v4f16], 32, - (add SGPR_64Regs), Reg64> { + (add SGPR_64Regs)> { let CopyCost = 1; let AllocationPriority = 11; } // CCR (call clobbered registers) SGPR 64-bit registers def CCR_SGPR_64 : RegisterClass<"AMDGPU", SGPR_64.RegTypes, 32, - (add 
(trunc SGPR_64, 16)), Reg64> { + (add (trunc SGPR_64, 16))> { let CopyCost = SGPR_64.CopyCost; let AllocationPriority = SGPR_64.AllocationPriority; } @@ -693,13 +478,13 @@ def TTMP_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64, v4i16, v4f16], 32, } def SReg_64_XEXEC : RegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, i1, v4i16, v4f16], 32, - (add SGPR_64, VCC, FLAT_SCR, XNACK_MASK, TTMP_64, TBA, TMA), Reg64> { + (add SGPR_64, VCC, FLAT_SCR, XNACK_MASK, TTMP_64, TBA, TMA)> { let CopyCost = 1; let AllocationPriority = 13; } def SReg_64 : RegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, i1, v4i16, v4f16], 32, - (add SReg_64_XEXEC, EXEC), Reg64> { + (add SReg_64_XEXEC, EXEC)> { let CopyCost = 1; let AllocationPriority = 13; } @@ -722,17 +507,17 @@ let CopyCost = 2 in { // There are no 3-component scalar instructions, but this is needed // for symmetry with VGPRs. def SGPR_96 : RegisterClass<"AMDGPU", [v3i32, v3f32], 32, - (add SGPR_96Regs), Reg96> { + (add SGPR_96Regs)> { let AllocationPriority = 14; } def SReg_96 : RegisterClass<"AMDGPU", [v3i32, v3f32], 32, - (add SGPR_96), Reg96> { + (add SGPR_96)> { let AllocationPriority = 14; } def SGPR_128 : RegisterClass<"AMDGPU", [v4i32, v4f32, v2i64], 32, - (add SGPR_128Regs), Reg128> { + (add SGPR_128Regs)> { let AllocationPriority = 15; } @@ -742,8 +527,9 @@ def TTMP_128 : RegisterClass<"AMDGPU", [v4i32, v4f32, v2i64], 32, } def SReg_128 : RegisterClass<"AMDGPU", [v4i32, v4f32, v2i64, v2f64], 32, - (add SGPR_128, TTMP_128), Reg128> { + (add SGPR_128, TTMP_128)> { let AllocationPriority = 15; + let isAllocatable = 0; } } // End CopyCost = 2 @@ -751,17 +537,16 @@ def SReg_128 : RegisterClass<"AMDGPU", [v4i32, v4f32, v2i64, v2f64], 32, // There are no 5-component scalar instructions, but this is needed // for symmetry with VGPRs. 
def SGPR_160 : RegisterClass<"AMDGPU", [v5i32, v5f32], 32, - (add SGPR_160Regs), Reg160> { + (add SGPR_160Regs)> { let AllocationPriority = 16; } def SReg_160 : RegisterClass<"AMDGPU", [v5i32, v5f32], 32, - (add SGPR_160), Reg160> { + (add SGPR_160)> { let AllocationPriority = 16; } -def SGPR_256 : RegisterClass<"AMDGPU", [v8i32, v8f32], 32, (add SGPR_256Regs), - Reg256> { +def SGPR_256 : RegisterClass<"AMDGPU", [v8i32, v8f32], 32, (add SGPR_256Regs)> { let AllocationPriority = 17; } @@ -770,14 +555,14 @@ def TTMP_256 : RegisterClass<"AMDGPU", [v8i32, v8f32], 32, (add TTMP_256Regs)> { } def SReg_256 : RegisterClass<"AMDGPU", [v8i32, v8f32], 32, - (add SGPR_256, TTMP_256), Reg256> { + (add SGPR_256, TTMP_256)> { // Requires 4 s_mov_b64 to copy let CopyCost = 4; let AllocationPriority = 17; } def SGPR_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 32, - (add SGPR_512Regs), Reg512> { + (add SGPR_512Regs)> { let AllocationPriority = 18; } @@ -787,31 +572,31 @@ def TTMP_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 32, } def SReg_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 32, - (add SGPR_512, TTMP_512), Reg512> { + (add SGPR_512, TTMP_512)> { // Requires 8 s_mov_b64 to copy let CopyCost = 8; let AllocationPriority = 18; } def VRegOrLds_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32, - (add VGPR_32, LDS_DIRECT_CLASS), Reg32> { + (add VGPR_32, LDS_DIRECT_CLASS)> { let isAllocatable = 0; } def SGPR_1024 : RegisterClass<"AMDGPU", [v32i32, v32f32], 32, - (add SGPR_1024Regs), Reg1024> { + (add SGPR_1024Regs)> { let AllocationPriority = 19; } def SReg_1024 : RegisterClass<"AMDGPU", [v32i32, v32f32], 32, - (add SGPR_1024), Reg1024> { + (add SGPR_1024)> { let CopyCost = 16; let AllocationPriority = 19; } // Register class for all vector registers (VGPRs + Interploation Registers) -def VReg_64 : RegisterClass<"AMDGPU", [i64, f64, v2i32, v2f32, v4f16, v4i16], 32, - (add VGPR_64), Reg64> { +def VReg_64 : RegisterClass<"AMDGPU", [i64, f64, v2i32, v2f32, v4f16, v4i16, p0, p1, p4], 32, + (add VGPR_64)> { let Size = 64; // Requires 2 v_mov_b32 to copy @@ -819,7 +604,7 @@ def VReg_64 : RegisterClass<"AMDGPU", [i64, f64, v2i32, v2f32, v4f16, v4i16], 32 let AllocationPriority = 2; } -def VReg_96 : RegisterClass<"AMDGPU", [v3i32, v3f32], 32, (add VGPR_96), Reg96> { +def VReg_96 : RegisterClass<"AMDGPU", [v3i32, v3f32], 32, (add VGPR_96)> { let Size = 96; // Requires 3 v_mov_b32 to copy @@ -828,7 +613,7 @@ def VReg_96 : RegisterClass<"AMDGPU", [v3i32, v3f32], 32, (add VGPR_96), Reg96> } def VReg_128 : RegisterClass<"AMDGPU", [v4i32, v4f32, v2i64, v2f64], 32, - (add VGPR_128), Reg128> { + (add VGPR_128)> { let Size = 128; // Requires 4 v_mov_b32 to copy @@ -837,7 +622,7 @@ def VReg_128 : RegisterClass<"AMDGPU", [v4i32, v4f32, v2i64, v2f64], 32, } def VReg_160 : RegisterClass<"AMDGPU", [v5i32, v5f32], 32, - (add VGPR_160), Reg160> { + (add VGPR_160)> { let Size = 160; // Requires 5 v_mov_b32 to copy @@ -846,28 +631,28 @@ def VReg_160 : RegisterClass<"AMDGPU", [v5i32, v5f32], 32, } def VReg_256 : RegisterClass<"AMDGPU", [v8i32, v8f32], 32, - (add VGPR_256), Reg256> { + (add VGPR_256)> { let Size = 256; let CopyCost = 8; let AllocationPriority = 6; } def VReg_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 32, - (add VGPR_512), Reg512> { + (add VGPR_512)> { let Size = 512; let CopyCost = 16; let AllocationPriority = 7; } def VReg_1024 : RegisterClass<"AMDGPU", [v32i32, v32f32], 32, - (add VGPR_1024), Reg1024> { + (add VGPR_1024)> { let Size = 1024; let CopyCost = 32; let AllocationPriority 
= 8; } def AReg_64 : RegisterClass<"AMDGPU", [i64, f64, v2i32, v2f32, v4f16, v4i16], 32, - (add AGPR_64), Reg64> { + (add AGPR_64)> { let Size = 64; let CopyCost = 5; @@ -875,7 +660,7 @@ def AReg_64 : RegisterClass<"AMDGPU", [i64, f64, v2i32, v2f32, v4f16, v4i16], 32 } def AReg_128 : RegisterClass<"AMDGPU", [v4i32, v4f32, v2i64, v2f64], 32, - (add AGPR_128), Reg128> { + (add AGPR_128)> { let Size = 128; // Requires 4 v_accvgpr_write and 4 v_accvgpr_read to copy + burn 1 vgpr @@ -884,40 +669,39 @@ def AReg_128 : RegisterClass<"AMDGPU", [v4i32, v4f32, v2i64, v2f64], 32, } def AReg_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 32, - (add AGPR_512), Reg512> { + (add AGPR_512)> { let Size = 512; let CopyCost = 33; let AllocationPriority = 7; } def AReg_1024 : RegisterClass<"AMDGPU", [v32i32, v32f32], 32, - (add AGPR_1024), Reg1024> { + (add AGPR_1024)> { let Size = 1024; let CopyCost = 65; let AllocationPriority = 8; } -def VReg_1 : RegisterClass<"AMDGPU", [i1], 32, (add VGPR_32), Reg32> { - let Size = 32; +def VReg_1 : RegisterClass<"AMDGPU", [i1], 32, (add VGPR_32)> { + let Size = 1; } def VS_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32, - (add VGPR_32, SReg_32, LDS_DIRECT_CLASS), Reg32> { + (add VGPR_32, SReg_32, LDS_DIRECT_CLASS)> { let isAllocatable = 0; } -def VS_64 : RegisterClass<"AMDGPU", [i64, f64], 32, (add VReg_64, SReg_64), - Reg64> { +def VS_64 : RegisterClass<"AMDGPU", [i64, f64], 32, (add VReg_64, SReg_64)> { let isAllocatable = 0; } def AV_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32, - (add AGPR_32, VGPR_32), Reg32> { + (add AGPR_32, VGPR_32)> { let isAllocatable = 0; } def AV_64 : RegisterClass<"AMDGPU", [i64, f64, v4f16], 32, - (add AReg_64, VReg_64), Reg64> { + (add AReg_64, VReg_64)> { let isAllocatable = 0; } diff --git a/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/lib/Target/AMDGPU/SIShrinkInstructions.cpp index 7ee178149c7a..8afca2cdc325 100644 --- a/lib/Target/AMDGPU/SIShrinkInstructions.cpp +++ b/lib/Target/AMDGPU/SIShrinkInstructions.cpp @@ -77,8 +77,8 @@ static bool foldImmediates(MachineInstr &MI, const SIInstrInfo *TII, // Try to fold Src0 MachineOperand &Src0 = MI.getOperand(Src0Idx); if (Src0.isReg()) { - unsigned Reg = Src0.getReg(); - if (TargetRegisterInfo::isVirtualRegister(Reg) && MRI.hasOneUse(Reg)) { + Register Reg = Src0.getReg(); + if (Register::isVirtualRegister(Reg) && MRI.hasOneUse(Reg)) { MachineInstr *Def = MRI.getUniqueVRegDef(Reg); if (Def && Def->isMoveImmediate()) { MachineOperand &MovSrc = Def->getOperand(1); @@ -360,8 +360,7 @@ static bool shrinkScalarLogicOp(const GCNSubtarget &ST, } if (NewImm != 0) { - if (TargetRegisterInfo::isVirtualRegister(Dest->getReg()) && - SrcReg->isReg()) { + if (Register::isVirtualRegister(Dest->getReg()) && SrcReg->isReg()) { MRI.setRegAllocationHint(Dest->getReg(), 0, SrcReg->getReg()); MRI.setRegAllocationHint(SrcReg->getReg(), 0, Dest->getReg()); return true; @@ -394,12 +393,11 @@ static bool instAccessReg(iterator_range &&R, if (!MO.isReg()) continue; - if (TargetRegisterInfo::isPhysicalRegister(Reg) && - TargetRegisterInfo::isPhysicalRegister(MO.getReg())) { + if (Register::isPhysicalRegister(Reg) && + Register::isPhysicalRegister(MO.getReg())) { if (TRI.regsOverlap(Reg, MO.getReg())) return true; - } else if (MO.getReg() == Reg && - TargetRegisterInfo::isVirtualRegister(Reg)) { + } else if (MO.getReg() == Reg && Register::isVirtualRegister(Reg)) { LaneBitmask Overlap = TRI.getSubRegIndexLaneMask(SubReg) & TRI.getSubRegIndexLaneMask(MO.getSubReg()); if 
(Overlap.any()) @@ -425,7 +423,7 @@ static TargetInstrInfo::RegSubRegPair getSubRegForIndex(unsigned Reg, unsigned Sub, unsigned I, const SIRegisterInfo &TRI, const MachineRegisterInfo &MRI) { if (TRI.getRegSizeInBits(Reg, MRI) != 32) { - if (TargetRegisterInfo::isPhysicalRegister(Reg)) { + if (Register::isPhysicalRegister(Reg)) { Reg = TRI.getSubReg(Reg, TRI.getSubRegFromChannel(I)); } else { LaneBitmask LM = TRI.getSubRegIndexLaneMask(Sub); @@ -459,13 +457,13 @@ static MachineInstr* matchSwap(MachineInstr &MovT, MachineRegisterInfo &MRI, assert(MovT.getOpcode() == AMDGPU::V_MOV_B32_e32 || MovT.getOpcode() == AMDGPU::COPY); - unsigned T = MovT.getOperand(0).getReg(); + Register T = MovT.getOperand(0).getReg(); unsigned Tsub = MovT.getOperand(0).getSubReg(); MachineOperand &Xop = MovT.getOperand(1); if (!Xop.isReg()) return nullptr; - unsigned X = Xop.getReg(); + Register X = Xop.getReg(); unsigned Xsub = Xop.getSubReg(); unsigned Size = TII->getOpSize(MovT, 0) / 4; @@ -484,7 +482,7 @@ static MachineInstr* matchSwap(MachineInstr &MovT, MachineRegisterInfo &MRI, MovY.getOperand(1).getSubReg() != Tsub) continue; - unsigned Y = MovY.getOperand(0).getReg(); + Register Y = MovY.getOperand(0).getReg(); unsigned Ysub = MovY.getOperand(0).getSubReg(); if (!TRI.isVGPR(MRI, Y) || MovT.getParent() != MovY.getParent()) @@ -579,7 +577,7 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) { // XXX - not exactly a check for post-regalloc run. MachineOperand &Src = MI.getOperand(1); if (Src.isImm() && - TargetRegisterInfo::isPhysicalRegister(MI.getOperand(0).getReg())) { + Register::isPhysicalRegister(MI.getOperand(0).getReg())) { int32_t ReverseImm; if (isReverseInlineImm(TII, Src, ReverseImm)) { MI.setDesc(TII->get(AMDGPU::V_BFREV_B32_e32)); @@ -643,8 +641,7 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) { // FIXME: This could work better if hints worked with subregisters. If // we have a vector add of a constant, we usually don't get the correct // allocation due to the subregister usage. - if (TargetRegisterInfo::isVirtualRegister(Dest->getReg()) && - Src0->isReg()) { + if (Register::isVirtualRegister(Dest->getReg()) && Src0->isReg()) { MRI.setRegAllocationHint(Dest->getReg(), 0, Src0->getReg()); MRI.setRegAllocationHint(Src0->getReg(), 0, Dest->getReg()); continue; @@ -672,8 +669,7 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) { const MachineOperand &Dst = MI.getOperand(0); MachineOperand &Src = MI.getOperand(1); - if (Src.isImm() && - TargetRegisterInfo::isPhysicalRegister(Dst.getReg())) { + if (Src.isImm() && Register::isPhysicalRegister(Dst.getReg())) { int32_t ReverseImm; if (isKImmOperand(TII, Src)) MI.setDesc(TII->get(AMDGPU::S_MOVK_I32)); @@ -721,8 +717,8 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) { int Op32 = AMDGPU::getVOPe32(MI.getOpcode()); if (TII->isVOPC(Op32)) { - unsigned DstReg = MI.getOperand(0).getReg(); - if (TargetRegisterInfo::isVirtualRegister(DstReg)) { + Register DstReg = MI.getOperand(0).getReg(); + if (Register::isVirtualRegister(DstReg)) { // VOPC instructions can only write to the VCC register. 
We can't // force them to use VCC here, because this is only one register and // cannot deal with sequences which would require multiple copies of @@ -745,8 +741,8 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) { TII->getNamedOperand(MI, AMDGPU::OpName::src2); if (!Src2->isReg()) continue; - unsigned SReg = Src2->getReg(); - if (TargetRegisterInfo::isVirtualRegister(SReg)) { + Register SReg = Src2->getReg(); + if (Register::isVirtualRegister(SReg)) { MRI.setRegAllocationHint(SReg, 0, VCCReg); continue; } @@ -766,7 +762,7 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) { bool Next = false; if (SDst->getReg() != VCCReg) { - if (TargetRegisterInfo::isVirtualRegister(SDst->getReg())) + if (Register::isVirtualRegister(SDst->getReg())) MRI.setRegAllocationHint(SDst->getReg(), 0, VCCReg); Next = true; } @@ -774,7 +770,7 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) { // All of the instructions with carry outs also have an SGPR input in // src2. if (Src2 && Src2->getReg() != VCCReg) { - if (TargetRegisterInfo::isVirtualRegister(Src2->getReg())) + if (Register::isVirtualRegister(Src2->getReg())) MRI.setRegAllocationHint(Src2->getReg(), 0, VCCReg); Next = true; } diff --git a/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/lib/Target/AMDGPU/SIWholeQuadMode.cpp index 4e07efff55d8..cb4cf68d709a 100644 --- a/lib/Target/AMDGPU/SIWholeQuadMode.cpp +++ b/lib/Target/AMDGPU/SIWholeQuadMode.cpp @@ -273,12 +273,12 @@ void SIWholeQuadMode::markInstructionUses(const MachineInstr &MI, char Flag, if (!Use.isReg() || !Use.isUse()) continue; - unsigned Reg = Use.getReg(); + Register Reg = Use.getReg(); // Handle physical registers that we need to track; this is mostly relevant // for VCC, which can appear as the (implicit) input of a uniform branch, // e.g. when a loop counter is stored in a VGPR. - if (!TargetRegisterInfo::isVirtualRegister(Reg)) { + if (!Register::isVirtualRegister(Reg)) { if (Reg == AMDGPU::EXEC || Reg == AMDGPU::EXEC_LO) continue; @@ -312,6 +312,7 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF, char GlobalFlags = 0; bool WQMOutputs = MF.getFunction().hasFnAttribute("amdgpu-ps-wqm-outputs"); SmallVector SetInactiveInstrs; + SmallVector SoftWQMInstrs; // We need to visit the basic blocks in reverse post-order so that we visit // defs before uses, in particular so that we don't accidentally mark an @@ -340,6 +341,10 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF, // correct, so we need it to be in WQM. 
Flags = StateWQM; LowerToCopyInstrs.push_back(&MI); + } else if (Opcode == AMDGPU::SOFT_WQM) { + LowerToCopyInstrs.push_back(&MI); + SoftWQMInstrs.push_back(&MI); + continue; } else if (Opcode == AMDGPU::WWM) { // The WWM intrinsic doesn't make the same guarantee, and plus it needs // to be executed in WQM or Exact so that its copy doesn't clobber @@ -356,8 +361,8 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF, if (Inactive.isUndef()) { LowerToCopyInstrs.push_back(&MI); } else { - unsigned Reg = Inactive.getReg(); - if (TargetRegisterInfo::isVirtualRegister(Reg)) { + Register Reg = Inactive.getReg(); + if (Register::isVirtualRegister(Reg)) { for (MachineInstr &DefMI : MRI->def_instructions(Reg)) markInstruction(DefMI, StateWWM, Worklist); } @@ -385,9 +390,9 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF, if (!MO.isReg()) continue; - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); - if (!TRI->isVirtualRegister(Reg) && + if (!Register::isVirtualRegister(Reg) && TRI->hasVectorRegisters(TRI->getPhysRegClass(Reg))) { Flags = StateWQM; break; @@ -407,9 +412,12 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF, // Mark sure that any SET_INACTIVE instructions are computed in WQM if WQM is // ever used anywhere in the function. This implements the corresponding // semantics of @llvm.amdgcn.set.inactive. + // Similarly for SOFT_WQM instructions, implementing @llvm.amdgcn.softwqm. if (GlobalFlags & StateWQM) { for (MachineInstr *MI : SetInactiveInstrs) markInstruction(*MI, StateWQM, Worklist); + for (MachineInstr *MI : SoftWQMInstrs) + markInstruction(*MI, StateWQM, Worklist); } return GlobalFlags; @@ -548,7 +556,7 @@ bool SIWholeQuadMode::requiresCorrectState(const MachineInstr &MI) const { MachineBasicBlock::iterator SIWholeQuadMode::saveSCC(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before) { - unsigned SaveReg = MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); + Register SaveReg = MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); MachineInstr *Save = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), SaveReg) @@ -832,7 +840,7 @@ void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg, void SIWholeQuadMode::lowerLiveMaskQueries(unsigned LiveMaskReg) { for (MachineInstr *MI : LiveMaskQueries) { const DebugLoc &DL = MI->getDebugLoc(); - unsigned Dest = MI->getOperand(0).getReg(); + Register Dest = MI->getOperand(0).getReg(); MachineInstr *Copy = BuildMI(*MI->getParent(), MI, DL, TII->get(AMDGPU::COPY), Dest) .addReg(LiveMaskReg); @@ -847,13 +855,12 @@ void SIWholeQuadMode::lowerCopyInstrs() { for (unsigned i = MI->getNumExplicitOperands() - 1; i > 1; i--) MI->RemoveOperand(i); - const unsigned Reg = MI->getOperand(0).getReg(); + const Register Reg = MI->getOperand(0).getReg(); if (TRI->isVGPR(*MRI, Reg)) { - const TargetRegisterClass *regClass = - TargetRegisterInfo::isVirtualRegister(Reg) - ? MRI->getRegClass(Reg) - : TRI->getPhysRegClass(Reg); + const TargetRegisterClass *regClass = Register::isVirtualRegister(Reg) + ? MRI->getRegClass(Reg) + : TRI->getPhysRegClass(Reg); const unsigned MovOp = TII->getMovOpcode(regClass); MI->setDesc(TII->get(MovOp)); @@ -885,7 +892,7 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) { unsigned Exec = ST->isWave32() ? 
AMDGPU::EXEC_LO : AMDGPU::EXEC; if (!(GlobalFlags & StateWQM)) { lowerLiveMaskQueries(Exec); - if (!(GlobalFlags & StateWWM)) + if (!(GlobalFlags & StateWWM) && LowerToCopyInstrs.empty()) return !LiveMaskQueries.empty(); } else { // Store a copy of the original live mask when required diff --git a/lib/Target/AMDGPU/SMInstructions.td b/lib/Target/AMDGPU/SMInstructions.td index 1b410b6b5912..1a74ebbf8165 100644 --- a/lib/Target/AMDGPU/SMInstructions.td +++ b/lib/Target/AMDGPU/SMInstructions.td @@ -793,9 +793,18 @@ multiclass SMLoad_Pattern { // selector to prefer those. let AddedComplexity = 100 in { -defm : SMRD_Pattern <"S_LOAD_DWORD", i32>; -defm : SMRD_Pattern <"S_LOAD_DWORDX2", v2i32>; -defm : SMRD_Pattern <"S_LOAD_DWORDX4", v4i32>; +foreach vt = Reg32Types.types in { +defm : SMRD_Pattern <"S_LOAD_DWORD", vt>; +} + +foreach vt = SReg_64.RegTypes in { +defm : SMRD_Pattern <"S_LOAD_DWORDX2", vt>; +} + +foreach vt = SReg_128.RegTypes in { +defm : SMRD_Pattern <"S_LOAD_DWORDX4", vt>; +} + defm : SMRD_Pattern <"S_LOAD_DWORDX8", v8i32>; defm : SMRD_Pattern <"S_LOAD_DWORDX16", v16i32>; diff --git a/lib/Target/AMDGPU/SOPInstructions.td b/lib/Target/AMDGPU/SOPInstructions.td index dfafdccc05a3..d31a49f428ee 100644 --- a/lib/Target/AMDGPU/SOPInstructions.td +++ b/lib/Target/AMDGPU/SOPInstructions.td @@ -181,7 +181,9 @@ def S_BCNT0_I32_B64 : SOP1_32_64 <"s_bcnt0_i32_b64">; def S_BCNT1_I32_B32 : SOP1_32 <"s_bcnt1_i32_b32", [(set i32:$sdst, (ctpop i32:$src0))] >; -def S_BCNT1_I32_B64 : SOP1_32_64 <"s_bcnt1_i32_b64">; +def S_BCNT1_I32_B64 : SOP1_32_64 <"s_bcnt1_i32_b64", + [(set i32:$sdst, (ctpop i64:$src0))] +>; } // End Defs = [SCC] def S_FF0_I32_B32 : SOP1_32 <"s_ff0_i32_b32">; @@ -417,16 +419,16 @@ def S_SUBB_U32 : SOP2_32 <"s_subb_u32", let isCommutable = 1 in { def S_MIN_I32 : SOP2_32 <"s_min_i32", - [(set i32:$sdst, (UniformBinFrag i32:$src0, i32:$src1))] + [(set i32:$sdst, (smin i32:$src0, i32:$src1))] >; def S_MIN_U32 : SOP2_32 <"s_min_u32", - [(set i32:$sdst, (UniformBinFrag i32:$src0, i32:$src1))] + [(set i32:$sdst, (umin i32:$src0, i32:$src1))] >; def S_MAX_I32 : SOP2_32 <"s_max_i32", - [(set i32:$sdst, (UniformBinFrag i32:$src0, i32:$src1))] + [(set i32:$sdst, (smax i32:$src0, i32:$src1))] >; def S_MAX_U32 : SOP2_32 <"s_max_u32", - [(set i32:$sdst, (UniformBinFrag i32:$src0, i32:$src1))] + [(set i32:$sdst, (umax i32:$src0, i32:$src1))] >; } // End isCommutable = 1 } // End Defs = [SCC] @@ -853,13 +855,13 @@ class SOPC_Base op, RegisterOperand rc0, RegisterOperand rc1, let Defs = [SCC]; } class SOPC_Helper op, RegisterOperand rc, ValueType vt, - string opName, PatLeaf cond> : SOPC_Base < + string opName, SDPatternOperator cond> : SOPC_Base < op, rc, rc, opName, [(set SCC, (si_setcc_uniform vt:$src0, vt:$src1, cond))] > { } class SOPC_CMP_32 op, string opName, - PatLeaf cond = COND_NULL, string revOp = opName> + SDPatternOperator cond = COND_NULL, string revOp = opName> : SOPC_Helper, Commutable_REV, SOPKInstTable<0, opName> { @@ -868,7 +870,7 @@ class SOPC_CMP_32 op, string opName, } class SOPC_CMP_64 op, string opName, - PatLeaf cond = COND_NULL, string revOp = opName> + SDPatternOperator cond = COND_NULL, string revOp = opName> : SOPC_Helper, Commutable_REV { let isCompare = 1; @@ -1076,8 +1078,6 @@ def S_BARRIER : SOPP <0x0000000a, (ins), "s_barrier", [(int_amdgcn_s_barrier)]> { let SchedRW = [WriteBarrier]; let simm16 = 0; - let mayLoad = 1; - let mayStore = 1; let isConvergent = 1; } @@ -1090,7 +1090,7 @@ def S_WAKEUP : SOPP <0x00000003, (ins), "s_wakeup"> { let mayLoad = 1, mayStore 
= 1, hasSideEffects = 1 in def S_WAITCNT : SOPP <0x0000000c, (ins WAIT_FLAG:$simm16), "s_waitcnt $simm16", - [(int_amdgcn_s_waitcnt UIMM16bit:$simm16)]>; + [(int_amdgcn_s_waitcnt timm:$simm16)]>; def S_SETHALT : SOPP <0x0000000d, (ins i16imm:$simm16), "s_sethalt $simm16">; def S_SETKILL : SOPP <0x0000000b, (ins i16imm:$simm16), "s_setkill $simm16">; @@ -1099,7 +1099,7 @@ def S_SETKILL : SOPP <0x0000000b, (ins i16imm:$simm16), "s_setkill $simm16">; // maximum reported is 960 cycles, so 960 / 64 = 15 max, so is the // maximum really 15 on VI? def S_SLEEP : SOPP <0x0000000e, (ins i32imm:$simm16), - "s_sleep $simm16", [(int_amdgcn_s_sleep SIMM16bit:$simm16)]> { + "s_sleep $simm16", [(int_amdgcn_s_sleep timm:$simm16)]> { let hasSideEffects = 1; let mayLoad = 1; let mayStore = 1; @@ -1110,12 +1110,11 @@ def S_SETPRIO : SOPP <0x0000000f, (ins i16imm:$simm16), "s_setprio $simm16">; let Uses = [EXEC, M0] in { // FIXME: Should this be mayLoad+mayStore? def S_SENDMSG : SOPP <0x00000010, (ins SendMsgImm:$simm16), "s_sendmsg $simm16", - [(AMDGPUsendmsg (i32 imm:$simm16))] ->; + [(int_amdgcn_s_sendmsg (i32 timm:$simm16), M0)]>; def S_SENDMSGHALT : SOPP <0x00000011, (ins SendMsgImm:$simm16), "s_sendmsghalt $simm16", - [(AMDGPUsendmsghalt (i32 imm:$simm16))] ->; + [(int_amdgcn_s_sendmsghalt (i32 timm:$simm16), M0)]>; + } // End Uses = [EXEC, M0] def S_TRAP : SOPP <0x00000012, (ins i16imm:$simm16), "s_trap $simm16"> { @@ -1126,13 +1125,13 @@ def S_ICACHE_INV : SOPP <0x00000013, (ins), "s_icache_inv"> { let simm16 = 0; } def S_INCPERFLEVEL : SOPP <0x00000014, (ins i32imm:$simm16), "s_incperflevel $simm16", - [(int_amdgcn_s_incperflevel SIMM16bit:$simm16)]> { + [(int_amdgcn_s_incperflevel timm:$simm16)]> { let hasSideEffects = 1; let mayLoad = 1; let mayStore = 1; } def S_DECPERFLEVEL : SOPP <0x00000015, (ins i32imm:$simm16), "s_decperflevel $simm16", - [(int_amdgcn_s_decperflevel SIMM16bit:$simm16)]> { + [(int_amdgcn_s_decperflevel timm:$simm16)]> { let hasSideEffects = 1; let mayLoad = 1; let mayStore = 1; @@ -1169,7 +1168,10 @@ let SubtargetPredicate = isGFX10Plus in { def S_ROUND_MODE : SOPP<0x024, (ins s16imm:$simm16), "s_round_mode $simm16">; def S_DENORM_MODE : - SOPP<0x025, (ins s16imm:$simm16), "s_denorm_mode $simm16">; + SOPP<0x025, (ins i32imm:$simm16), "s_denorm_mode $simm16", + [(SIdenorm_mode (i32 timm:$simm16))]> { + let hasSideEffects = 1; + } def S_TTRACEDATA_IMM : SOPP<0x028, (ins s16imm:$simm16), "s_ttracedata_imm $simm16">; } // End SubtargetPredicate = isGFX10Plus @@ -1178,7 +1180,7 @@ let SubtargetPredicate = isGFX10Plus in { // S_GETREG_B32 Intrinsic Pattern. 
//===----------------------------------------------------------------------===// def : GCNPat < - (int_amdgcn_s_getreg imm:$simm16), + (int_amdgcn_s_getreg timm:$simm16), (S_GETREG_B32 (as_i16imm $simm16)) >; diff --git a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index e90f40e6abea..afb2fd987afd 100644 --- a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -131,29 +131,70 @@ int getMaskedMIMGOp(unsigned Opc, unsigned NewChannels) { struct MUBUFInfo { uint16_t Opcode; uint16_t BaseOpcode; - uint8_t dwords; + uint8_t elements; bool has_vaddr; bool has_srsrc; bool has_soffset; }; +struct MTBUFInfo { + uint16_t Opcode; + uint16_t BaseOpcode; + uint8_t elements; + bool has_vaddr; + bool has_srsrc; + bool has_soffset; +}; + +#define GET_MTBUFInfoTable_DECL +#define GET_MTBUFInfoTable_IMPL #define GET_MUBUFInfoTable_DECL #define GET_MUBUFInfoTable_IMPL #include "AMDGPUGenSearchableTables.inc" +int getMTBUFBaseOpcode(unsigned Opc) { + const MTBUFInfo *Info = getMTBUFInfoFromOpcode(Opc); + return Info ? Info->BaseOpcode : -1; +} + +int getMTBUFOpcode(unsigned BaseOpc, unsigned Elements) { + const MTBUFInfo *Info = getMTBUFInfoFromBaseOpcodeAndElements(BaseOpc, Elements); + return Info ? Info->Opcode : -1; +} + +int getMTBUFElements(unsigned Opc) { + const MTBUFInfo *Info = getMTBUFOpcodeHelper(Opc); + return Info ? Info->elements : 0; +} + +bool getMTBUFHasVAddr(unsigned Opc) { + const MTBUFInfo *Info = getMTBUFOpcodeHelper(Opc); + return Info ? Info->has_vaddr : false; +} + +bool getMTBUFHasSrsrc(unsigned Opc) { + const MTBUFInfo *Info = getMTBUFOpcodeHelper(Opc); + return Info ? Info->has_srsrc : false; +} + +bool getMTBUFHasSoffset(unsigned Opc) { + const MTBUFInfo *Info = getMTBUFOpcodeHelper(Opc); + return Info ? Info->has_soffset : false; +} + int getMUBUFBaseOpcode(unsigned Opc) { const MUBUFInfo *Info = getMUBUFInfoFromOpcode(Opc); return Info ? Info->BaseOpcode : -1; } -int getMUBUFOpcode(unsigned BaseOpc, unsigned Dwords) { - const MUBUFInfo *Info = getMUBUFInfoFromBaseOpcodeAndDwords(BaseOpc, Dwords); +int getMUBUFOpcode(unsigned BaseOpc, unsigned Elements) { + const MUBUFInfo *Info = getMUBUFInfoFromBaseOpcodeAndElements(BaseOpc, Elements); return Info ? Info->Opcode : -1; } -int getMUBUFDwords(unsigned Opc) { +int getMUBUFElements(unsigned Opc) { const MUBUFInfo *Info = getMUBUFOpcodeHelper(Opc); - return Info ? Info->dwords : 0; + return Info ? Info->elements : 0; } bool getMUBUFHasVAddr(unsigned Opc) { @@ -241,7 +282,7 @@ unsigned getMaxWorkGroupsPerCU(const MCSubtargetInfo *STI, } unsigned getMaxWavesPerCU(const MCSubtargetInfo *STI) { - return getMaxWavesPerEU() * getEUsPerCU(STI); + return getMaxWavesPerEU(STI) * getEUsPerCU(STI); } unsigned getMaxWavesPerCU(const MCSubtargetInfo *STI, @@ -253,9 +294,11 @@ unsigned getMinWavesPerEU(const MCSubtargetInfo *STI) { return 1; } -unsigned getMaxWavesPerEU() { +unsigned getMaxWavesPerEU(const MCSubtargetInfo *STI) { // FIXME: Need to take scratch memory into account. 
- return 10; + if (!isGFX10(*STI)) + return 10; + return 20; } unsigned getMaxWavesPerEU(const MCSubtargetInfo *STI, @@ -317,7 +360,7 @@ unsigned getMinNumSGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU) { if (Version.Major >= 10) return 0; - if (WavesPerEU >= getMaxWavesPerEU()) + if (WavesPerEU >= getMaxWavesPerEU(STI)) return 0; unsigned MinNumSGPRs = getTotalNumSGPRs(STI) / (WavesPerEU + 1); @@ -394,17 +437,19 @@ unsigned getVGPREncodingGranule(const MCSubtargetInfo *STI, } unsigned getTotalNumVGPRs(const MCSubtargetInfo *STI) { - return 256; + if (!isGFX10(*STI)) + return 256; + return STI->getFeatureBits().test(FeatureWavefrontSize32) ? 1024 : 512; } unsigned getAddressableNumVGPRs(const MCSubtargetInfo *STI) { - return getTotalNumVGPRs(STI); + return 256; } unsigned getMinNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU) { assert(WavesPerEU != 0); - if (WavesPerEU >= getMaxWavesPerEU()) + if (WavesPerEU >= getMaxWavesPerEU(STI)) return 0; unsigned MinNumVGPRs = alignDown(getTotalNumVGPRs(STI) / (WavesPerEU + 1), @@ -510,7 +555,7 @@ bool isReadOnlySegment(const GlobalValue *GV) { } bool shouldEmitConstantsToTextSection(const Triple &TT) { - return TT.getOS() != Triple::AMDHSA; + return TT.getOS() == Triple::AMDPAL; } int getIntegerAttribute(const Function &F, StringRef Name, int Default) { diff --git a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index 209ef7eef749..f78dadd447ff 100644 --- a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -94,7 +94,7 @@ unsigned getMinWavesPerEU(const MCSubtargetInfo *STI); /// \returns Maximum number of waves per execution unit for given subtarget \p /// STI without any kind of limitation. -unsigned getMaxWavesPerEU(); +unsigned getMaxWavesPerEU(const MCSubtargetInfo *STI); /// \returns Maximum number of waves per execution unit for given subtarget \p /// STI and limited by given \p FlatWorkGroupSize. 
@@ -263,14 +263,32 @@ struct MIMGInfo { LLVM_READONLY const MIMGInfo *getMIMGInfo(unsigned Opc); +LLVM_READONLY +int getMTBUFBaseOpcode(unsigned Opc); + +LLVM_READONLY +int getMTBUFOpcode(unsigned BaseOpc, unsigned Elements); + +LLVM_READONLY +int getMTBUFElements(unsigned Opc); + +LLVM_READONLY +bool getMTBUFHasVAddr(unsigned Opc); + +LLVM_READONLY +bool getMTBUFHasSrsrc(unsigned Opc); + +LLVM_READONLY +bool getMTBUFHasSoffset(unsigned Opc); + LLVM_READONLY int getMUBUFBaseOpcode(unsigned Opc); LLVM_READONLY -int getMUBUFOpcode(unsigned BaseOpc, unsigned Dwords); +int getMUBUFOpcode(unsigned BaseOpc, unsigned Elements); LLVM_READONLY -int getMUBUFDwords(unsigned Opc); +int getMUBUFElements(unsigned Opc); LLVM_READONLY bool getMUBUFHasVAddr(unsigned Opc); diff --git a/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp b/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp index db20d5ccf5f9..207e4232e829 100644 --- a/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp +++ b/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp @@ -21,6 +21,8 @@ #include "SIDefines.h" #include "llvm/BinaryFormat/ELF.h" #include "llvm/IR/CallingConv.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Module.h" #include "llvm/Support/AMDGPUMetadata.h" #include "llvm/Support/EndianStream.h" diff --git a/lib/Target/AMDGPU/VOP1Instructions.td b/lib/Target/AMDGPU/VOP1Instructions.td index 6bc416ed7d4b..f1cdc3097dc0 100644 --- a/lib/Target/AMDGPU/VOP1Instructions.td +++ b/lib/Target/AMDGPU/VOP1Instructions.td @@ -104,9 +104,21 @@ multiclass VOP1Inst { def _e32 : VOP1_Pseudo ; def _e64 : VOP3_Pseudo .ret>; - def _sdwa : VOP1_SDWA_Pseudo ; + + foreach _ = BoolToList.ret in + def _sdwa : VOP1_SDWA_Pseudo ; + foreach _ = BoolToList.ret in def _dpp : VOP1_DPP_Pseudo ; + + def : MnemonicAlias, LetDummies; + def : MnemonicAlias, LetDummies; + + foreach _ = BoolToList.ret in + def : MnemonicAlias, LetDummies; + + foreach _ = BoolToList.ret in + def : MnemonicAlias, LetDummies; } // Special profile for instructions which have clamp @@ -227,10 +239,10 @@ defm V_COS_F32 : VOP1Inst <"v_cos_f32", VOP_F32_F32, AMDGPUcos>; } // End SchedRW = [WriteQuarterRate32] defm V_NOT_B32 : VOP1Inst <"v_not_b32", VOP_I32_I32>; -defm V_BFREV_B32 : VOP1Inst <"v_bfrev_b32", VOP_I32_I32>; -defm V_FFBH_U32 : VOP1Inst <"v_ffbh_u32", VOP_I32_I32>; +defm V_BFREV_B32 : VOP1Inst <"v_bfrev_b32", VOP_I32_I32, bitreverse>; +defm V_FFBH_U32 : VOP1Inst <"v_ffbh_u32", VOP_I32_I32, AMDGPUffbh_u32>; defm V_FFBL_B32 : VOP1Inst <"v_ffbl_b32", VOP_I32_I32>; -defm V_FFBH_I32 : VOP1Inst <"v_ffbh_i32", VOP_I32_I32>; +defm V_FFBH_I32 : VOP1Inst <"v_ffbh_i32", VOP_I32_I32, AMDGPUffbh_i32>; let SchedRW = [WriteDoubleAdd] in { defm V_FREXP_EXP_I32_F64 : VOP1Inst <"v_frexp_exp_i32_f64", VOP_I32_F64, int_amdgcn_frexp_exp>; @@ -434,7 +446,7 @@ let SubtargetPredicate = isGFX10Plus in { // Target-specific instruction encodings. 
//===----------------------------------------------------------------------===// -class VOP1_DPP op, VOP1_Pseudo ps, VOPProfile p = ps.Pfl, bit isDPP16 = 0> : +class VOP1_DPP op, VOP1_DPP_Pseudo ps, VOPProfile p = ps.Pfl, bit isDPP16 = 0> : VOP_DPP { let hasSideEffects = ps.hasSideEffects; let Defs = ps.Defs; @@ -448,8 +460,9 @@ class VOP1_DPP op, VOP1_Pseudo ps, VOPProfile p = ps.Pfl, bit isDPP16 = let Inst{31-25} = 0x3f; } -class VOP1_DPP16 op, VOP1_Pseudo ps, VOPProfile p = ps.Pfl> : - VOP1_DPP { +class VOP1_DPP16 op, VOP1_DPP_Pseudo ps, VOPProfile p = ps.Pfl> : + VOP1_DPP, + SIMCInstr { let AssemblerPredicate = !if(p.HasExt, HasDPP16, DisableInst); let SubtargetPredicate = HasDPP16; } @@ -492,6 +505,7 @@ let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in { VOP3e_gfx10<{0, 1, 1, op{6-0}}, !cast(NAME#"_e64").Pfl>; } multiclass VOP1_Real_sdwa_gfx10 op> { + foreach _ = BoolToList(NAME#"_e32").Pfl.HasExtSDWA9>.ret in def _sdwa_gfx10 : VOP_SDWA10_Real(NAME#"_sdwa")>, VOP1_SDWA9Ae(NAME#"_sdwa").Pfl> { @@ -499,11 +513,13 @@ let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in { } } multiclass VOP1_Real_dpp_gfx10 op> { - def _dpp_gfx10 : VOP1_DPP16(NAME#"_e32")> { + foreach _ = BoolToList(NAME#"_e32").Pfl.HasExtDPP>.ret in + def _dpp_gfx10 : VOP1_DPP16(NAME#"_dpp")> { let DecoderNamespace = "SDWA10"; } } multiclass VOP1_Real_dpp8_gfx10 op> { + foreach _ = BoolToList(NAME#"_e32").Pfl.HasExtDPP>.ret in def _dpp8_gfx10 : VOP1_DPP8(NAME#"_e32")> { let DecoderNamespace = "DPP8"; } @@ -704,10 +720,12 @@ multiclass VOP1_Real_e32e64_vi op> { multiclass VOP1_Real_vi op> { defm NAME : VOP1_Real_e32e64_vi ; + foreach _ = BoolToList(NAME#"_e32").Pfl.HasExtSDWA>.ret in def _sdwa_vi : VOP_SDWA_Real (NAME#"_sdwa")>, VOP1_SDWAe (NAME#"_sdwa").Pfl>; + foreach _ = BoolToList(NAME#"_e32").Pfl.HasExtSDWA9>.ret in def _sdwa_gfx9 : VOP_SDWA9_Real (NAME#"_sdwa")>, VOP1_SDWA9Ae (NAME#"_sdwa").Pfl>; @@ -831,25 +849,25 @@ def V_MOVRELD_B32_V4 : V_MOVRELD_B32_pseudo; def V_MOVRELD_B32_V8 : V_MOVRELD_B32_pseudo; def V_MOVRELD_B32_V16 : V_MOVRELD_B32_pseudo; -let OtherPredicates = [isGFX8GFX9] in { +let OtherPredicates = [isGFX8Plus] in { def : GCNPat < - (i32 (int_amdgcn_mov_dpp i32:$src, imm:$dpp_ctrl, imm:$row_mask, imm:$bank_mask, - imm:$bound_ctrl)), + (i32 (int_amdgcn_mov_dpp i32:$src, timm:$dpp_ctrl, timm:$row_mask, timm:$bank_mask, + timm:$bound_ctrl)), (V_MOV_B32_dpp $src, $src, (as_i32imm $dpp_ctrl), (as_i32imm $row_mask), (as_i32imm $bank_mask), (as_i1imm $bound_ctrl)) >; def : GCNPat < - (i32 (int_amdgcn_update_dpp i32:$old, i32:$src, imm:$dpp_ctrl, imm:$row_mask, - imm:$bank_mask, imm:$bound_ctrl)), + (i32 (int_amdgcn_update_dpp i32:$old, i32:$src, timm:$dpp_ctrl, timm:$row_mask, + timm:$bank_mask, timm:$bound_ctrl)), (V_MOV_B32_dpp $old, $src, (as_i32imm $dpp_ctrl), (as_i32imm $row_mask), (as_i32imm $bank_mask), (as_i1imm $bound_ctrl)) >; -} // End OtherPredicates = [isGFX8GFX9] +} // End OtherPredicates = [isGFX8Plus] let OtherPredicates = [isGFX8Plus] in { def : GCNPat< @@ -885,6 +903,7 @@ multiclass VOP1_Real_gfx9 op> { defm NAME : VOP1_Real_e32e64_vi ; } + foreach _ = BoolToList(NAME#"_e32").Pfl.HasExtSDWA9>.ret in def _sdwa_gfx9 : VOP_SDWA9_Real (NAME#"_sdwa")>, VOP1_SDWA9Ae (NAME#"_sdwa").Pfl>; @@ -904,23 +923,7 @@ defm V_SCREEN_PARTITION_4SE_B32 : VOP1_Real_gfx9 <0x37>; let OtherPredicates = [isGFX10Plus] in { def : GCNPat < - (i32 (int_amdgcn_mov_dpp8 i32:$src, imm:$dpp8)), + (i32 (int_amdgcn_mov_dpp8 i32:$src, timm:$dpp8)), (V_MOV_B32_dpp8_gfx10 $src, $src, (as_i32imm 
$dpp8), (i32 DPP8Mode.FI_0)) >; - -def : GCNPat < - (i32 (int_amdgcn_mov_dpp i32:$src, imm:$dpp_ctrl, imm:$row_mask, imm:$bank_mask, - imm:$bound_ctrl)), - (V_MOV_B32_dpp_gfx10 $src, $src, (as_i32imm $dpp_ctrl), - (as_i32imm $row_mask), (as_i32imm $bank_mask), - (as_i1imm $bound_ctrl), (i32 0)) ->; - -def : GCNPat < - (i32 (int_amdgcn_update_dpp i32:$old, i32:$src, imm:$dpp_ctrl, imm:$row_mask, - imm:$bank_mask, imm:$bound_ctrl)), - (V_MOV_B32_dpp_gfx10 $old, $src, (as_i32imm $dpp_ctrl), - (as_i32imm $row_mask), (as_i32imm $bank_mask), - (as_i1imm $bound_ctrl), (i32 0)) ->; } // End OtherPredicates = [isGFX10Plus] diff --git a/lib/Target/AMDGPU/VOP2Instructions.td b/lib/Target/AMDGPU/VOP2Instructions.td index 1b30cd2ed516..1ab0fc1ab58d 100644 --- a/lib/Target/AMDGPU/VOP2Instructions.td +++ b/lib/Target/AMDGPU/VOP2Instructions.td @@ -147,7 +147,8 @@ multiclass VOP2Inst_sdwa { let renamedInGFX9 = GFX9Renamed in { - def _sdwa : VOP2_SDWA_Pseudo ; + foreach _ = BoolToList.ret in + def _sdwa : VOP2_SDWA_Pseudo ; } // End renamedInGFX9 = GFX9Renamed } @@ -179,9 +180,10 @@ multiclass VOP2bInst { - let AsmMatchConverter = "cvtSdwaVOP2b"; - } + foreach _ = BoolToList.ret in + def _sdwa : VOP2_SDWA_Pseudo { + let AsmMatchConverter = "cvtSdwaVOP2b"; + } foreach _ = BoolToList.ret in def _dpp : VOP2_DPP_Pseudo ; } @@ -220,9 +222,10 @@ multiclass VOP2eInst , Commutable_REV; - def _sdwa : VOP2_SDWA_Pseudo { - let AsmMatchConverter = "cvtSdwaVOP2b"; - } + foreach _ = BoolToList.ret in + def _sdwa : VOP2_SDWA_Pseudo { + let AsmMatchConverter = "cvtSdwaVOP2e"; + } foreach _ = BoolToList.ret in def _dpp : VOP2_DPP_Pseudo ; @@ -251,7 +254,9 @@ multiclass VOP2eInstAliases { class VOP_MADAK : VOPProfile <[vt, vt, vt, vt]> { field Operand ImmOpType = !if(!eq(vt.Size, 32), f32kimm, f16kimm); - field dag Ins32 = (ins VCSrc_f32:$src0, VGPR_32:$src1, ImmOpType:$imm); + field dag Ins32 = !if(!eq(vt.Size, 32), + (ins VCSrc_f32:$src0, VGPR_32:$src1, ImmOpType:$imm), + (ins VCSrc_f16:$src0, VGPR_32:$src1, ImmOpType:$imm)); field bit HasExt = 0; // Hack to stop printing _e64 @@ -519,7 +524,7 @@ def V_WRITELANE_B32 : VOP2_Pseudo<"v_writelane_b32", VOP_WRITELANE, } // End isConvergent = 1 defm V_BFM_B32 : VOP2Inst <"v_bfm_b32", VOP_NO_EXT>; -defm V_BCNT_U32_B32 : VOP2Inst <"v_bcnt_u32_b32", VOP_NO_EXT>; +defm V_BCNT_U32_B32 : VOP2Inst <"v_bcnt_u32_b32", VOP_NO_EXT, add_ctpop>; defm V_MBCNT_LO_U32_B32 : VOP2Inst <"v_mbcnt_lo_u32_b32", VOP_NO_EXT, int_amdgcn_mbcnt_lo>; defm V_MBCNT_HI_U32_B32 : VOP2Inst <"v_mbcnt_hi_u32_b32", VOP_NO_EXT, int_amdgcn_mbcnt_hi>; defm V_LDEXP_F32 : VOP2Inst <"v_ldexp_f32", VOP_NO_EXT, AMDGPUldexp>; @@ -539,9 +544,9 @@ defm V_MAX_LEGACY_F32 : VOP2Inst <"v_max_legacy_f32", VOP_F32_F32_F32, AMDGPUfma let SubtargetPredicate = isGFX6GFX7GFX10 in { let isCommutable = 1 in { defm V_MAC_LEGACY_F32 : VOP2Inst <"v_mac_legacy_f32", VOP_F32_F32_F32>; -defm V_LSHR_B32 : VOP2Inst <"v_lshr_b32", VOP_I32_I32_I32>; -defm V_ASHR_I32 : VOP2Inst <"v_ashr_i32", VOP_I32_I32_I32>; -defm V_LSHL_B32 : VOP2Inst <"v_lshl_b32", VOP_I32_I32_I32>; +defm V_LSHR_B32 : VOP2Inst <"v_lshr_b32", VOP_I32_I32_I32, srl>; +defm V_ASHR_I32 : VOP2Inst <"v_ashr_i32", VOP_I32_I32_I32, sra>; +defm V_LSHL_B32 : VOP2Inst <"v_lshl_b32", VOP_I32_I32_I32, shl>; } // End isCommutable = 1 } // End SubtargetPredicate = isGFX6GFX7GFX10 @@ -606,9 +611,9 @@ def V_MADMK_F16 : VOP2_Pseudo <"v_madmk_f16", VOP_MADMK_F16, [], "">; defm V_LDEXP_F16 : VOP2Inst <"v_ldexp_f16", VOP_F16_F16_I32, AMDGPUldexp>; } // End FPDPRounding = 1 -defm V_LSHLREV_B16 : 
VOP2Inst <"v_lshlrev_b16", VOP_I16_I16_I16>; -defm V_LSHRREV_B16 : VOP2Inst <"v_lshrrev_b16", VOP_I16_I16_I16>; -defm V_ASHRREV_I16 : VOP2Inst <"v_ashrrev_i16", VOP_I16_I16_I16>; +defm V_LSHLREV_B16 : VOP2Inst <"v_lshlrev_b16", VOP_I16_I16_I16, lshl_rev>; +defm V_LSHRREV_B16 : VOP2Inst <"v_lshrrev_b16", VOP_I16_I16_I16, lshr_rev>; +defm V_ASHRREV_I16 : VOP2Inst <"v_ashrrev_i16", VOP_I16_I16_I16, ashr_rev>; let isCommutable = 1 in { let FPDPRounding = 1 in { @@ -618,16 +623,16 @@ defm V_SUBREV_F16 : VOP2Inst <"v_subrev_f16", VOP_F16_F16_F16, null_frag, "v_sub defm V_MUL_F16 : VOP2Inst <"v_mul_f16", VOP_F16_F16_F16, fmul>; def V_MADAK_F16 : VOP2_Pseudo <"v_madak_f16", VOP_MADAK_F16, [], "">; } // End FPDPRounding = 1 -defm V_ADD_U16 : VOP2Inst <"v_add_u16", VOP_I16_I16_I16>; -defm V_SUB_U16 : VOP2Inst <"v_sub_u16" , VOP_I16_I16_I16>; +defm V_ADD_U16 : VOP2Inst <"v_add_u16", VOP_I16_I16_I16, add>; +defm V_SUB_U16 : VOP2Inst <"v_sub_u16" , VOP_I16_I16_I16, sub>; defm V_SUBREV_U16 : VOP2Inst <"v_subrev_u16", VOP_I16_I16_I16, null_frag, "v_sub_u16">; -defm V_MUL_LO_U16 : VOP2Inst <"v_mul_lo_u16", VOP_I16_I16_I16>; +defm V_MUL_LO_U16 : VOP2Inst <"v_mul_lo_u16", VOP_I16_I16_I16, mul>; defm V_MAX_F16 : VOP2Inst <"v_max_f16", VOP_F16_F16_F16, fmaxnum_like>; defm V_MIN_F16 : VOP2Inst <"v_min_f16", VOP_F16_F16_F16, fminnum_like>; -defm V_MAX_U16 : VOP2Inst <"v_max_u16", VOP_I16_I16_I16>; -defm V_MAX_I16 : VOP2Inst <"v_max_i16", VOP_I16_I16_I16>; -defm V_MIN_U16 : VOP2Inst <"v_min_u16", VOP_I16_I16_I16>; -defm V_MIN_I16 : VOP2Inst <"v_min_i16", VOP_I16_I16_I16>; +defm V_MAX_U16 : VOP2Inst <"v_max_u16", VOP_I16_I16_I16, umax>; +defm V_MAX_I16 : VOP2Inst <"v_max_i16", VOP_I16_I16_I16, smax>; +defm V_MIN_U16 : VOP2Inst <"v_min_u16", VOP_I16_I16_I16, umin>; +defm V_MIN_I16 : VOP2Inst <"v_min_i16", VOP_I16_I16_I16, smin>; let Constraints = "$vdst = $src2", DisableEncoding="$src2", isConvertibleToThreeAddress = 1 in { @@ -653,16 +658,17 @@ defm V_FMAC_F32 : VOP2Inst <"v_fmac_f32", VOP_MAC_F32>; let Constraints = "$vdst = $src2", DisableEncoding="$src2", isConvertibleToThreeAddress = 1, - isCommutable = 1 in { + isCommutable = 1, + IsDOT = 1 in { let SubtargetPredicate = HasDot5Insts in - defm V_DOT2C_F32_F16 : VOP2Inst_e32<"v_dot2c_f32_f16", VOP_DOT_ACC_F32_V2F16>; + defm V_DOT2C_F32_F16 : VOP2Inst<"v_dot2c_f32_f16", VOP_DOT_ACC_F32_V2F16>; let SubtargetPredicate = HasDot6Insts in - defm V_DOT4C_I32_I8 : VOP2Inst_e32<"v_dot4c_i32_i8", VOP_DOT_ACC_I32_I32>; + defm V_DOT4C_I32_I8 : VOP2Inst<"v_dot4c_i32_i8", VOP_DOT_ACC_I32_I32>; let SubtargetPredicate = HasDot4Insts in - defm V_DOT2C_I32_I16 : VOP2Inst_e32<"v_dot2c_i32_i16", VOP_DOT_ACC_I32_I32>; + defm V_DOT2C_I32_I16 : VOP2Inst<"v_dot2c_i32_i16", VOP_DOT_ACC_I32_I32>; let SubtargetPredicate = HasDot3Insts in - defm V_DOT8C_I32_I4 : VOP2Inst_e32<"v_dot8c_i32_i4", VOP_DOT_ACC_I32_I32>; + defm V_DOT8C_I32_I4 : VOP2Inst<"v_dot8c_i32_i4", VOP_DOT_ACC_I32_I32>; } let AddedComplexity = 30 in { @@ -719,50 +725,17 @@ defm V_PK_FMAC_F16 : VOP2Inst<"v_pk_fmac_f16", VOP_V2F16_V2F16_V2F16>; // Note: 16-bit instructions produce a 0 result in the high 16-bits // on GFX8 and GFX9 and preserve high 16 bits on GFX10+ -def ClearHI16 : OutPatFrag<(ops node:$op), - (V_AND_B32_e64 $op, (V_MOV_B32_e32 (i32 0xffff)))>; - -multiclass Arithmetic_i16_Pats { - -def : GCNPat< - (op i16:$src0, i16:$src1), - !if(!eq(PreservesHI16,1), (ClearHI16 (inst $src0, $src1)), (inst $src0, $src1)) ->; - -def : GCNPat< - (i32 (zext (op i16:$src0, i16:$src1))), - !if(!eq(PreservesHI16,1), (ClearHI16 
(inst $src0, $src1)), (inst $src0, $src1)) ->; - -def : GCNPat< - (i64 (zext (op i16:$src0, i16:$src1))), - (REG_SEQUENCE VReg_64, - !if(!eq(PreservesHI16,1), (ClearHI16 (inst $src0, $src1)), (inst $src0, $src1)), - sub0, - (V_MOV_B32_e32 (i32 0)), sub1) ->; -} - -multiclass Bits_OpsRev_i16_Pats { - -def : GCNPat< - (op i16:$src0, i16:$src1), - !if(!eq(PreservesHI16,1), (ClearHI16 (inst $src1, $src0)), (inst $src1, $src0)) ->; +multiclass Arithmetic_i16_0Hi_Pats { def : GCNPat< (i32 (zext (op i16:$src0, i16:$src1))), - !if(!eq(PreservesHI16,1), (ClearHI16 (inst $src1, $src0)), (inst $src1, $src0)) + (inst $src0, $src1) >; - def : GCNPat< (i64 (zext (op i16:$src0, i16:$src1))), (REG_SEQUENCE VReg_64, - !if(!eq(PreservesHI16,1), (ClearHI16 (inst $src1, $src0)), (inst $src1, $src0)), - sub0, + (inst $src0, $src1), sub0, (V_MOV_B32_e32 (i32 0)), sub1) >; } @@ -774,53 +747,36 @@ class ZExt_i16_i1_Pat : GCNPat < $src) >; -let Predicates = [Has16BitInsts] in { - -let Predicates = [Has16BitInsts, isGFX7GFX8GFX9] in { -defm : Arithmetic_i16_Pats; -defm : Arithmetic_i16_Pats; -defm : Arithmetic_i16_Pats; -defm : Arithmetic_i16_Pats; -defm : Arithmetic_i16_Pats; -defm : Arithmetic_i16_Pats; -defm : Arithmetic_i16_Pats; -} - -let Predicates = [Has16BitInsts, isGFX10Plus] in { -defm : Arithmetic_i16_Pats; -defm : Arithmetic_i16_Pats; -defm : Arithmetic_i16_Pats; -defm : Arithmetic_i16_Pats; -defm : Arithmetic_i16_Pats; -defm : Arithmetic_i16_Pats; -defm : Arithmetic_i16_Pats; -} - +foreach vt = [i16, v2i16] in { def : GCNPat < - (and i16:$src0, i16:$src1), - (V_AND_B32_e64 $src0, $src1) + (and vt:$src0, vt:$src1), + (V_AND_B32_e64 VSrc_b32:$src0, VSrc_b32:$src1) >; def : GCNPat < - (or i16:$src0, i16:$src1), - (V_OR_B32_e64 $src0, $src1) + (or vt:$src0, vt:$src1), + (V_OR_B32_e64 VSrc_b32:$src0, VSrc_b32:$src1) >; def : GCNPat < - (xor i16:$src0, i16:$src1), - (V_XOR_B32_e64 $src0, $src1) + (xor vt:$src0, vt:$src1), + (V_XOR_B32_e64 VSrc_b32:$src0, VSrc_b32:$src1) >; - -let Predicates = [Has16BitInsts, isGFX7GFX8GFX9] in { -defm : Bits_OpsRev_i16_Pats; -defm : Bits_OpsRev_i16_Pats; -defm : Bits_OpsRev_i16_Pats; } -let Predicates = [Has16BitInsts, isGFX10Plus] in { -defm : Bits_OpsRev_i16_Pats; -defm : Bits_OpsRev_i16_Pats; -defm : Bits_OpsRev_i16_Pats; +let Predicates = [Has16BitInsts] in { + +let Predicates = [Has16BitInsts, isGFX7GFX8GFX9] in { +defm : Arithmetic_i16_0Hi_Pats; +defm : Arithmetic_i16_0Hi_Pats; +defm : Arithmetic_i16_0Hi_Pats; +defm : Arithmetic_i16_0Hi_Pats; +defm : Arithmetic_i16_0Hi_Pats; +defm : Arithmetic_i16_0Hi_Pats; +defm : Arithmetic_i16_0Hi_Pats; +defm : Arithmetic_i16_0Hi_Pats; +defm : Arithmetic_i16_0Hi_Pats; +defm : Arithmetic_i16_0Hi_Pats; } def : ZExt_i16_i1_Pat; @@ -847,7 +803,7 @@ def : GCNPat< // Target-specific instruction encodings. 
//===----------------------------------------------------------------------===// -class VOP2_DPP op, VOP2_Pseudo ps, +class VOP2_DPP op, VOP2_DPP_Pseudo ps, string opName = ps.OpName, VOPProfile p = ps.Pfl, bit IsDPP16 = 0> : VOP_DPP { @@ -865,13 +821,18 @@ class VOP2_DPP op, VOP2_Pseudo ps, let Inst{31} = 0x0; } -class VOP2_DPP16 op, VOP2_Pseudo ps, +class Base_VOP2_DPP16 op, VOP2_DPP_Pseudo ps, string opName = ps.OpName, VOPProfile p = ps.Pfl> : VOP2_DPP { let AssemblerPredicate = !if(p.HasExt, HasDPP16, DisableInst); let SubtargetPredicate = HasDPP16; } +class VOP2_DPP16 op, VOP2_DPP_Pseudo ps, + string opName = ps.OpName, VOPProfile p = ps.Pfl> : + Base_VOP2_DPP16, + SIMCInstr ; + class VOP2_DPP8 op, VOP2_Pseudo ps, string opName = ps.OpName, VOPProfile p = ps.Pfl> : VOP_DPP8 { @@ -924,6 +885,7 @@ let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in { VOP3e_gfx10<{0, 1, 0, 0, op{5-0}}, !cast(NAME#"_e64").Pfl>; } multiclass VOP2_Real_sdwa_gfx10 op> { + foreach _ = BoolToList(NAME#"_e32").Pfl.HasExtSDWA9>.ret in def _sdwa_gfx10 : VOP_SDWA10_Real(NAME#"_sdwa")>, VOP2_SDWA9Ae(NAME#"_sdwa").Pfl> { @@ -931,11 +893,13 @@ let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in { } } multiclass VOP2_Real_dpp_gfx10 op> { - def _dpp_gfx10 : VOP2_DPP16(NAME#"_e32")> { + foreach _ = BoolToList(NAME#"_e32").Pfl.HasExtDPP>.ret in + def _dpp_gfx10 : VOP2_DPP16(NAME#"_dpp")> { let DecoderNamespace = "SDWA10"; } } multiclass VOP2_Real_dpp8_gfx10 op> { + foreach _ = BoolToList(NAME#"_e32").Pfl.HasExtDPP>.ret in def _dpp8_gfx10 : VOP2_DPP8(NAME#"_e32")> { let DecoderNamespace = "DPP8"; } @@ -964,6 +928,7 @@ let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in { let DecoderNamespace = "SDWA10" in { multiclass VOP2_Real_sdwa_gfx10_with_name op, string opName, string asmName> { + foreach _ = BoolToList(opName#"_e32").Pfl.HasExtSDWA9>.ret in def _sdwa_gfx10 : VOP_SDWA10_Real(opName#"_sdwa")>, VOP2_SDWA9Ae(opName#"_sdwa").Pfl> { @@ -973,13 +938,15 @@ let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in { } multiclass VOP2_Real_dpp_gfx10_with_name op, string opName, string asmName> { - def _dpp_gfx10 : VOP2_DPP16(opName#"_e32")> { + foreach _ = BoolToList(opName#"_e32").Pfl.HasExtDPP>.ret in + def _dpp_gfx10 : VOP2_DPP16(opName#"_dpp")> { VOP2_Pseudo ps = !cast(opName#"_e32"); let AsmString = asmName # ps.Pfl.AsmDPP16; } } multiclass VOP2_Real_dpp8_gfx10_with_name op, string opName, string asmName> { + foreach _ = BoolToList(opName#"_e32").Pfl.HasExtDPP>.ret in def _dpp8_gfx10 : VOP2_DPP8(opName#"_e32")> { VOP2_Pseudo ps = !cast(opName#"_e32"); let AsmString = asmName # ps.Pfl.AsmDPP8; @@ -989,13 +956,15 @@ let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in { } // End DecoderNamespace = "SDWA10" //===------------------------------ VOP2be ------------------------------===// - multiclass VOP2be_Real_gfx10 op, string opName, string asmName> { + multiclass VOP2be_Real_e32_gfx10 op, string opName, string asmName> { def _e32_gfx10 : VOP2_Real(opName#"_e32"), SIEncodingFamily.GFX10>, VOP2e(opName#"_e32").Pfl> { VOP2_Pseudo Ps = !cast(opName#"_e32"); let AsmString = asmName # !subst(", vcc", "", Ps.AsmOperands); } + } + multiclass VOP2be_Real_e64_gfx10 op, string opName, string asmName> { def _e64_gfx10 : VOP3_Real(opName#"_e64"), SIEncodingFamily.GFX10>, VOP3be_gfx10<{0, 1, 0, 0, op{5-0}}, @@ -1003,6 +972,9 @@ let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in { VOP3_Pseudo Ps = !cast(opName#"_e64"); let AsmString = asmName # 
Ps.AsmOperands; } + } + multiclass VOP2be_Real_sdwa_gfx10 op, string opName, string asmName> { + foreach _ = BoolToList(opName#"_e32").Pfl.HasExtSDWA9>.ret in def _sdwa_gfx10 : VOP_SDWA10_Real(opName#"_sdwa")>, VOP2_SDWA9Ae(opName#"_sdwa").Pfl> { @@ -1010,64 +982,76 @@ let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in { let AsmString = asmName # !subst(", vcc", "", Ps.AsmOperands); let DecoderNamespace = "SDWA10"; } + foreach _ = BoolToList(opName#"_e32").Pfl.HasExtSDWA9>.ret in + def _sdwa_w32_gfx10 : + Base_VOP_SDWA10_Real(opName#"_sdwa")>, + VOP2_SDWA9Ae(opName#"_sdwa").Pfl> { + VOP2_SDWA_Pseudo Ps = !cast(opName#"_sdwa"); + let AsmString = asmName # !subst("vcc", "vcc_lo", Ps.AsmOperands); + let isAsmParserOnly = 1; + let DecoderNamespace = "SDWA10"; + let WaveSizePredicate = isWave32; + } + foreach _ = BoolToList(opName#"_e32").Pfl.HasExtSDWA9>.ret in + def _sdwa_w64_gfx10 : + Base_VOP_SDWA10_Real(opName#"_sdwa")>, + VOP2_SDWA9Ae(opName#"_sdwa").Pfl> { + VOP2_SDWA_Pseudo Ps = !cast(opName#"_sdwa"); + let AsmString = asmName # Ps.AsmOperands; + let isAsmParserOnly = 1; + let DecoderNamespace = "SDWA10"; + let WaveSizePredicate = isWave64; + } + } + multiclass VOP2be_Real_dpp_gfx10 op, string opName, string asmName> { + foreach _ = BoolToList(opName#"_e32").Pfl.HasExtDPP>.ret in def _dpp_gfx10 : - VOP2_DPP16(opName#"_e32"), asmName> { + VOP2_DPP16(opName#"_dpp"), asmName> { string AsmDPP = !cast(opName#"_e32").Pfl.AsmDPP16; let AsmString = asmName # !subst(", vcc", "", AsmDPP); let DecoderNamespace = "SDWA10"; } + foreach _ = BoolToList(opName#"_e32").Pfl.HasExtDPP>.ret in + def _dpp_w32_gfx10 : + Base_VOP2_DPP16(opName#"_dpp"), asmName> { + string AsmDPP = !cast(opName#"_e32").Pfl.AsmDPP16; + let AsmString = asmName # !subst("vcc", "vcc_lo", AsmDPP); + let isAsmParserOnly = 1; + let WaveSizePredicate = isWave32; + } + foreach _ = BoolToList(opName#"_e32").Pfl.HasExtDPP>.ret in + def _dpp_w64_gfx10 : + Base_VOP2_DPP16(opName#"_dpp"), asmName> { + string AsmDPP = !cast(opName#"_e32").Pfl.AsmDPP16; + let AsmString = asmName # AsmDPP; + let isAsmParserOnly = 1; + let WaveSizePredicate = isWave64; + } + } + multiclass VOP2be_Real_dpp8_gfx10 op, string opName, string asmName> { + foreach _ = BoolToList(opName#"_e32").Pfl.HasExtDPP>.ret in def _dpp8_gfx10 : VOP2_DPP8(opName#"_e32"), asmName> { string AsmDPP8 = !cast(opName#"_e32").Pfl.AsmDPP8; let AsmString = asmName # !subst(", vcc", "", AsmDPP8); let DecoderNamespace = "DPP8"; } - - let WaveSizePredicate = isWave32 in { - def _sdwa_w32_gfx10 : - Base_VOP_SDWA10_Real(opName#"_sdwa")>, - VOP2_SDWA9Ae(opName#"_sdwa").Pfl> { - VOP2_SDWA_Pseudo Ps = !cast(opName#"_sdwa"); - let AsmString = asmName # !subst("vcc", "vcc_lo", Ps.AsmOperands); - let isAsmParserOnly = 1; - let DecoderNamespace = "SDWA10"; - } - def _dpp_w32_gfx10 : - VOP2_DPP16(opName#"_e32"), asmName> { - string AsmDPP = !cast(opName#"_e32").Pfl.AsmDPP16; - let AsmString = asmName # !subst("vcc", "vcc_lo", AsmDPP); - let isAsmParserOnly = 1; - } - def _dpp8_w32_gfx10 : - VOP2_DPP8(opName#"_e32"), asmName> { - string AsmDPP8 = !cast(opName#"_e32").Pfl.AsmDPP8; - let AsmString = asmName # !subst("vcc", "vcc_lo", AsmDPP8); - let isAsmParserOnly = 1; - } - } // End WaveSizePredicate = isWave32 - - let WaveSizePredicate = isWave64 in { - def _sdwa_w64_gfx10 : - Base_VOP_SDWA10_Real(opName#"_sdwa")>, - VOP2_SDWA9Ae(opName#"_sdwa").Pfl> { - VOP2_SDWA_Pseudo Ps = !cast(opName#"_sdwa"); - let AsmString = asmName # Ps.AsmOperands; - let isAsmParserOnly = 1; - let 
DecoderNamespace = "SDWA10"; - } - def _dpp_w64_gfx10 : - VOP2_DPP16(opName#"_e32"), asmName> { - string AsmDPP = !cast(opName#"_e32").Pfl.AsmDPP16; - let AsmString = asmName # AsmDPP; - let isAsmParserOnly = 1; - } - def _dpp8_w64_gfx10 : - VOP2_DPP8(opName#"_e32"), asmName> { - string AsmDPP8 = !cast(opName#"_e32").Pfl.AsmDPP8; - let AsmString = asmName # AsmDPP8; - let isAsmParserOnly = 1; - } - } // End WaveSizePredicate = isWave64 + foreach _ = BoolToList(opName#"_e32").Pfl.HasExtDPP>.ret in + def _dpp8_w32_gfx10 : + VOP2_DPP8(opName#"_e32"), asmName> { + string AsmDPP8 = !cast(opName#"_e32").Pfl.AsmDPP8; + let AsmString = asmName # !subst("vcc", "vcc_lo", AsmDPP8); + let isAsmParserOnly = 1; + let WaveSizePredicate = isWave32; + } + foreach _ = BoolToList(opName#"_e32").Pfl.HasExtDPP>.ret in + def _dpp8_w64_gfx10 : + VOP2_DPP8(opName#"_e32"), asmName> { + string AsmDPP8 = !cast(opName#"_e32").Pfl.AsmDPP8; + let AsmString = asmName # AsmDPP8; + let isAsmParserOnly = 1; + let WaveSizePredicate = isWave64; + } } //===----------------------------- VOP3Only -----------------------------===// @@ -1088,8 +1072,19 @@ let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in { } } // End AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" -multiclass Base_VOP2_Real_gfx10 op> : - VOP2_Real_e32_gfx10, VOP2_Real_e64_gfx10; +multiclass VOP2be_Real_gfx10 op, string opName, string asmName> : + VOP2be_Real_e32_gfx10, + VOP2be_Real_e64_gfx10, + VOP2be_Real_sdwa_gfx10, + VOP2be_Real_dpp_gfx10, + VOP2be_Real_dpp8_gfx10; + +multiclass VOP2e_Real_gfx10 op, string opName, string asmName> : + VOP2_Real_e32_gfx10, + VOP2_Real_e64_gfx10, + VOP2be_Real_sdwa_gfx10, + VOP2be_Real_dpp_gfx10, + VOP2be_Real_dpp8_gfx10; multiclass VOP2_Real_gfx10 op> : VOP2_Real_e32_gfx10, VOP2_Real_e64_gfx10, @@ -1103,7 +1098,6 @@ multiclass VOP2_Real_gfx10_with_name op, string opName, VOP2_Real_dpp_gfx10_with_name, VOP2_Real_dpp8_gfx10_with_name; -defm V_CNDMASK_B32 : Base_VOP2_Real_gfx10<0x001>; defm V_XNOR_B32 : VOP2_Real_gfx10<0x01e>; defm V_FMAC_F32 : VOP2_Real_gfx10<0x02b>; defm V_FMAMK_F32 : VOP2Only_Real_MADK_gfx10<0x02c>; @@ -1136,6 +1130,9 @@ defm V_SUB_CO_CI_U32 : defm V_SUBREV_CO_CI_U32 : VOP2be_Real_gfx10<0x02a, "V_SUBBREV_U32", "v_subrev_co_ci_u32">; +defm V_CNDMASK_B32 : + VOP2e_Real_gfx10<0x001, "V_CNDMASK_B32", "v_cndmask_b32">; + // VOP3 only. 
defm V_BFM_B32 : VOP3Only_Real_gfx10<0x363>; defm V_BCNT_U32_B32 : VOP3Only_Real_gfx10<0x364>; @@ -1322,12 +1319,14 @@ multiclass Base_VOP2_Real_e32e64_vi op> : } // End AssemblerPredicates = [isGFX8GFX9], DecoderNamespace = "GFX8" multiclass VOP2_SDWA_Real op> { + foreach _ = BoolToList(NAME#"_e32").Pfl.HasExtSDWA>.ret in def _sdwa_vi : VOP_SDWA_Real (NAME#"_sdwa")>, VOP2_SDWAe (NAME#"_sdwa").Pfl>; } multiclass VOP2_SDWA9_Real op> { + foreach _ = BoolToList(NAME#"_e32").Pfl.HasExtSDWA9>.ret in def _sdwa_gfx9 : VOP_SDWA9_Real (NAME#"_sdwa")>, VOP2_SDWA9Ae (NAME#"_sdwa").Pfl>; @@ -1350,12 +1349,13 @@ multiclass VOP2be_Real_e32e64_vi_only op, string OpName, string AsmName let AsmString = AsmName # ps.AsmOperands; let DecoderNamespace = "GFX8"; } - def _sdwa_vi : - VOP_SDWA_Real (OpName#"_sdwa")>, - VOP2_SDWAe (OpName#"_sdwa").Pfl> { - VOP2_SDWA_Pseudo ps = !cast(OpName#"_sdwa"); - let AsmString = AsmName # ps.AsmOperands; - } + foreach _ = BoolToList(OpName#"_e32").Pfl.HasExtSDWA>.ret in + def _sdwa_vi : + VOP_SDWA_Real (OpName#"_sdwa")>, + VOP2_SDWAe (OpName#"_sdwa").Pfl> { + VOP2_SDWA_Pseudo ps = !cast(OpName#"_sdwa"); + let AsmString = AsmName # ps.AsmOperands; + } foreach _ = BoolToList(OpName#"_e32").Pfl.HasExtDPP>.ret in def _dpp_vi : VOP_DPP_Real(OpName#"_dpp"), SIEncodingFamily.VI>, @@ -1383,12 +1383,13 @@ multiclass VOP2be_Real_e32e64_gfx9 op, string OpName, string AsmName> { let AsmString = AsmName # ps.AsmOperands; let DecoderNamespace = "GFX9"; } - def _sdwa_gfx9 : - VOP_SDWA9_Real (OpName#"_sdwa")>, - VOP2_SDWA9Ae (OpName#"_sdwa").Pfl> { - VOP2_SDWA_Pseudo ps = !cast(OpName#"_sdwa"); - let AsmString = AsmName # ps.AsmOperands; - } + foreach _ = BoolToList(OpName#"_e32").Pfl.HasExtSDWA9>.ret in + def _sdwa_gfx9 : + VOP_SDWA9_Real (OpName#"_sdwa")>, + VOP2_SDWA9Ae (OpName#"_sdwa").Pfl> { + VOP2_SDWA_Pseudo ps = !cast(OpName#"_sdwa"); + let AsmString = AsmName # ps.AsmOperands; + } foreach _ = BoolToList(OpName#"_e32").Pfl.HasExtDPP>.ret in def _dpp_gfx9 : VOP_DPP_Real(OpName#"_dpp"), SIEncodingFamily.GFX9>, @@ -1410,10 +1411,11 @@ multiclass VOP2_Real_e32e64_gfx9 op> { VOP3e_vi <{0, 1, 0, 0, op{5-0}}, !cast(NAME#"_e64").Pfl> { let DecoderNamespace = "GFX9"; } - def _sdwa_gfx9 : - VOP_SDWA9_Real (NAME#"_sdwa")>, - VOP2_SDWA9Ae (NAME#"_sdwa").Pfl> { - } + foreach _ = BoolToList(NAME#"_e32").Pfl.HasExtSDWA9>.ret in + def _sdwa_gfx9 : + VOP_SDWA9_Real (NAME#"_sdwa")>, + VOP2_SDWA9Ae (NAME#"_sdwa").Pfl> { + } foreach _ = BoolToList(NAME#"_e32").Pfl.HasExtDPP>.ret in def _dpp_gfx9 : VOP_DPP_Real(NAME#"_dpp"), SIEncodingFamily.GFX9>, @@ -1554,7 +1556,7 @@ defm V_XNOR_B32 : VOP2_Real_e32e64_vi <0x3d>; } // End SubtargetPredicate = HasDLInsts multiclass VOP2_Real_DOT_ACC_gfx9 op> : VOP2_Real_e32_vi { - def _dpp : VOP2_DPP(NAME#"_e32")>; + def _dpp_vi : VOP2_DPP(NAME#"_dpp")>; } multiclass VOP2_Real_DOT_ACC_gfx10 op> : diff --git a/lib/Target/AMDGPU/VOP3Instructions.td b/lib/Target/AMDGPU/VOP3Instructions.td index 21dbef9240e1..605425972b1c 100644 --- a/lib/Target/AMDGPU/VOP3Instructions.td +++ b/lib/Target/AMDGPU/VOP3Instructions.td @@ -112,7 +112,7 @@ class getVOP3ClampPat { class getVOP3MAIPat { list ret = [(set P.DstVT:$vdst, (node P.Src0VT:$src0, P.Src1VT:$src1, P.Src2VT:$src2, - imm:$cbsz, imm:$abid, imm:$blgp))]; + timm:$cbsz, timm:$abid, timm:$blgp))]; } class VOP3Inst : @@ -385,12 +385,12 @@ def V_TRIG_PREOP_F64 : VOP3Inst <"v_trig_preop_f64", VOP3_Profile>, shl>; -def V_LSHR_B64 : VOP3Inst <"v_lshr_b64", VOP3_Profile>, srl>; -def V_ASHR_I64 : VOP3Inst <"v_ashr_i64", VOP3_Profile>, 
sra>; +let SubtargetPredicate = isGFX6GFX7GFX10 in { +def V_LSHL_B64 : VOP3Inst <"v_lshl_b64", VOP3_Profile, shl>; +def V_LSHR_B64 : VOP3Inst <"v_lshr_b64", VOP3_Profile, srl>; +def V_ASHR_I64 : VOP3Inst <"v_ashr_i64", VOP3_Profile, sra>; def V_MULLIT_F32 : VOP3Inst <"v_mullit_f32", VOP3_Profile>; -} // End SubtargetPredicate = isGFX6GFX7GFX10, Predicates = [isGFX6GFX7GFX10] +} // End SubtargetPredicate = isGFX6GFX7GFX10 let SubtargetPredicate = isGFX8Plus in { def V_LSHLREV_B64 : VOP3Inst <"v_lshlrev_b64", VOP3_Profile, lshl_rev>; @@ -399,21 +399,6 @@ def V_ASHRREV_I64 : VOP3Inst <"v_ashrrev_i64", VOP3_Profile, as } // End SubtargetPredicate = isGFX8Plus } // End SchedRW = [Write64Bit] -let Predicates = [isGFX8Plus] in { -def : GCNPat < - (getDivergentFrag.ret i64:$x, i32:$y), - (V_LSHLREV_B64 $y, $x) ->; -def : AMDGPUPat < - (getDivergentFrag.ret i64:$x, i32:$y), - (V_LSHRREV_B64 $y, $x) ->; -def : AMDGPUPat < - (getDivergentFrag.ret i64:$x, i32:$y), - (V_ASHRREV_I64 $y, $x) ->; -} - let SchedRW = [Write32Bit] in { let SubtargetPredicate = isGFX8Plus in { @@ -468,13 +453,13 @@ let FPDPRounding = 1 in { def V_MAD_F16 : VOP3Inst <"v_mad_f16", VOP3_Profile, fmad>; let Uses = [M0, EXEC] in { def V_INTERP_P2_F16 : VOP3Interp <"v_interp_p2_f16", VOP3_INTERP16<[f16, f32, i32, f32]>, - [(set f16:$vdst, (AMDGPUinterp_p2_f16 f32:$src0, (i32 imm:$attrchan), - (i32 imm:$attr), - (i32 imm:$src0_modifiers), + [(set f16:$vdst, (AMDGPUinterp_p2_f16 f32:$src0, (i32 timm:$attrchan), + (i32 timm:$attr), + (i32 timm:$src0_modifiers), (f32 VRegSrc_32:$src2), - (i32 imm:$src2_modifiers), - (i1 imm:$high), - (i1 imm:$clamp)))]>; + (i32 timm:$src2_modifiers), + (i1 timm:$high), + (i1 timm:$clamp)))]>; } // End Uses = [M0, EXEC] } // End FPDPRounding = 1 } // End renamedInGFX9 = 1 @@ -493,21 +478,21 @@ def V_INTERP_P2_F16_gfx9 : VOP3Interp <"v_interp_p2_f16_gfx9", VOP3_INTERP16<[f1 let Uses = [M0, EXEC], FPDPRounding = 1 in { def V_INTERP_P1LL_F16 : VOP3Interp <"v_interp_p1ll_f16", VOP3_INTERP16<[f32, f32, i32, untyped]>, - [(set f32:$vdst, (AMDGPUinterp_p1ll_f16 f32:$src0, (i32 imm:$attrchan), - (i32 imm:$attr), - (i32 imm:$src0_modifiers), - (i1 imm:$high), - (i1 imm:$clamp), - (i32 imm:$omod)))]>; + [(set f32:$vdst, (AMDGPUinterp_p1ll_f16 f32:$src0, (i32 timm:$attrchan), + (i32 timm:$attr), + (i32 timm:$src0_modifiers), + (i1 timm:$high), + (i1 timm:$clamp), + (i32 timm:$omod)))]>; def V_INTERP_P1LV_F16 : VOP3Interp <"v_interp_p1lv_f16", VOP3_INTERP16<[f32, f32, i32, f16]>, - [(set f32:$vdst, (AMDGPUinterp_p1lv_f16 f32:$src0, (i32 imm:$attrchan), - (i32 imm:$attr), - (i32 imm:$src0_modifiers), + [(set f32:$vdst, (AMDGPUinterp_p1lv_f16 f32:$src0, (i32 timm:$attrchan), + (i32 timm:$attr), + (i32 timm:$src0_modifiers), (f32 VRegSrc_32:$src2), - (i32 imm:$src2_modifiers), - (i1 imm:$high), - (i1 imm:$clamp), - (i32 imm:$omod)))]>; + (i32 timm:$src2_modifiers), + (i1 timm:$high), + (i1 timm:$clamp), + (i32 timm:$omod)))]>; } // End Uses = [M0, EXEC], FPDPRounding = 1 } // End SubtargetPredicate = Has16BitInsts, isCommutable = 1 @@ -657,11 +642,11 @@ let SubtargetPredicate = isGFX10Plus in { } // End $vdst = $vdst_in, DisableEncoding $vdst_in def : GCNPat< - (int_amdgcn_permlane16 i32:$vdst_in, i32:$src0, i32:$src1, i32:$src2, imm:$fi, imm:$bc), + (int_amdgcn_permlane16 i32:$vdst_in, i32:$src0, i32:$src1, i32:$src2, timm:$fi, timm:$bc), (V_PERMLANE16_B32 (as_i1imm $fi), $src0, (as_i1imm $bc), $src1, 0, $src2, $vdst_in) >; def : GCNPat< - (int_amdgcn_permlanex16 i32:$vdst_in, i32:$src0, i32:$src1, i32:$src2, 
imm:$fi, imm:$bc), + (int_amdgcn_permlanex16 i32:$vdst_in, i32:$src0, i32:$src1, i32:$src2, timm:$fi, timm:$bc), (V_PERMLANEX16_B32 (as_i1imm $fi), $src0, (as_i1imm $bc), $src1, 0, $src2, $vdst_in) >; } // End SubtargetPredicate = isGFX10Plus diff --git a/lib/Target/AMDGPU/VOP3PInstructions.td b/lib/Target/AMDGPU/VOP3PInstructions.td index 55ee5f6577cf..0c13f39fec02 100644 --- a/lib/Target/AMDGPU/VOP3PInstructions.td +++ b/lib/Target/AMDGPU/VOP3PInstructions.td @@ -261,6 +261,7 @@ class SDot2Pat : GCNPat < let SubtargetPredicate = !cast(Inst).SubtargetPredicate; } +let IsDOT = 1 in { let SubtargetPredicate = HasDot2Insts in { def V_DOT2_F32_F16 : VOP3PInst<"v_dot2_f32_f16", VOP3_Profile>; @@ -277,6 +278,7 @@ def V_DOT4_I32_I8 : VOP3PInst<"v_dot4_i32_i8", VOP3_Profile>; } // End SubtargetPredicate = HasDot1Insts +} // End let IsDOT = 1 multiclass DotPats { diff --git a/lib/Target/AMDGPU/VOPCInstructions.td b/lib/Target/AMDGPU/VOPCInstructions.td index b3513e383d10..8ef0ec7b71f4 100644 --- a/lib/Target/AMDGPU/VOPCInstructions.td +++ b/lib/Target/AMDGPU/VOPCInstructions.td @@ -183,7 +183,7 @@ multiclass VOPCXInstAliases { } -class getVOPCPat64 : LetDummies { +class getVOPCPat64 : LetDummies { list ret = !if(P.HasModifiers, [(set i1:$sdst, (setcc (P.Src0VT @@ -202,7 +202,7 @@ class VCMPXNoSDstTable { multiclass VOPC_Pseudos { @@ -225,6 +225,7 @@ multiclass VOPC_Pseudos .ret in def _sdwa : VOPC_SDWA_Pseudo { let Defs = !if(DefExec, [VCC, EXEC], [VCC]); let SchedRW = P.Schedule; @@ -236,7 +237,7 @@ multiclass VOPC_Pseudos : VOPC_Pseudos { @@ -261,6 +262,7 @@ multiclass VOPCX_Pseudos .ret in def _nosdst_sdwa : VOPC_SDWA_Pseudo { let Defs = [EXEC]; let SchedRW = P_NoSDst.Schedule; @@ -285,22 +287,23 @@ def VOPC_I16_I16 : VOPC_NoSdst_Profile<[Write32Bit], i16>; def VOPC_I32_I32 : VOPC_NoSdst_Profile<[Write32Bit], i32>; def VOPC_I64_I64 : VOPC_NoSdst_Profile<[Write64Bit], i64>; -multiclass VOPC_F16 : +multiclass VOPC_F16 : VOPC_Pseudos ; -multiclass VOPC_F32 : +multiclass VOPC_F32 : VOPC_Pseudos ; -multiclass VOPC_F64 : +multiclass VOPC_F64 : VOPC_Pseudos ; -multiclass VOPC_I16 : +multiclass VOPC_I16 : VOPC_Pseudos ; -multiclass VOPC_I32 : +multiclass VOPC_I32 : VOPC_Pseudos ; -multiclass VOPC_I64 : +multiclass VOPC_I64 : VOPC_Pseudos ; multiclass VOPCX_F16 : @@ -669,6 +672,7 @@ multiclass VOPC_Class_Pseudos .ret in def _sdwa : VOPC_SDWA_Pseudo { let Defs = !if(DefExec, !if(DefVcc, [VCC, EXEC], [EXEC]), !if(DefVcc, [VCC], [])); @@ -698,6 +702,7 @@ multiclass VOPCX_Class_Pseudos .ret in def _nosdst_sdwa : VOPC_SDWA_Pseudo { let Defs = [EXEC]; let SchedRW = P_NoSDst.Schedule; @@ -737,8 +742,11 @@ defm V_CMP_CLASS_F32 : VOPC_CLASS_F32 <"v_cmp_class_f32">; defm V_CMPX_CLASS_F32 : VOPCX_CLASS_F32 <"v_cmpx_class_f32">; defm V_CMP_CLASS_F64 : VOPC_CLASS_F64 <"v_cmp_class_f64">; defm V_CMPX_CLASS_F64 : VOPCX_CLASS_F64 <"v_cmpx_class_f64">; + +let SubtargetPredicate = Has16BitInsts in { defm V_CMP_CLASS_F16 : VOPC_CLASS_F16 <"v_cmp_class_f16">; defm V_CMPX_CLASS_F16 : VOPCX_CLASS_F16 <"v_cmpx_class_f16">; +} //===----------------------------------------------------------------------===// // V_ICMPIntrinsic Pattern. 
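The hunks above and below keep wrapping real-encoding defs in `foreach _ = BoolToList<...HasExtSDWA9>.ret in` (and the matching `HasExtDPP` form), so an SDWA/DPP real instruction is only instantiated when the pseudo's profile actually provides that extension. As a rough C++ analogue of that zero-or-one expansion (illustrative sketch only; the struct and helper below are invented for the example and are not LLVM API):

#include <string>
#include <vector>

// Hypothetical stand-in for a "real" instruction record; not an LLVM type.
struct RealInst { std::string Name; };

// BoolToList<b>.ret is an empty list when b is 0 and a one-element list when
// b is 1, so the TableGen foreach body expands zero or one times. The same
// conditional-definition effect, expressed directly in C++:
std::vector<RealInst> defineSdwaReal(const std::string &PseudoName,
                                     bool HasExtSDWA9) {
  std::vector<RealInst> Defs;
  if (HasExtSDWA9) // no SDWA9 extension in the profile -> no _sdwa_gfx10 real
    Defs.push_back({PseudoName + "_sdwa_gfx10"});
  return Defs;
}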
@@ -878,6 +886,7 @@ let AssemblerPredicate = isGFX10Plus in { } } // End DecoderNamespace = "GFX10" + foreach _ = BoolToList(NAME#"_e32").Pfl.HasExtSDWA9>.ret in def _sdwa_gfx10 : VOP_SDWA10_Real(NAME#"_sdwa")>, VOPC_SDWA9e(NAME#"_sdwa").Pfl>; @@ -903,6 +912,7 @@ let AssemblerPredicate = isGFX10Plus in { } } // End DecoderNamespace = "GFX10" + foreach _ = BoolToList(NAME#"_nosdst_e32").Pfl.HasExtSDWA9>.ret in def _sdwa_gfx10 : VOP_SDWA10_Real(NAME#"_nosdst_sdwa")>, VOPC_SDWA9e(NAME#"_nosdst_sdwa").Pfl> { @@ -1223,10 +1233,12 @@ multiclass VOPC_Real_vi op> { } } + foreach _ = BoolToList(NAME#"_e32").Pfl.HasExtSDWA>.ret in def _sdwa_vi : VOP_SDWA_Real (NAME#"_sdwa")>, VOPC_SDWAe (NAME#"_sdwa").Pfl>; + foreach _ = BoolToList(NAME#"_e32").Pfl.HasExtSDWA9>.ret in def _sdwa_gfx9 : VOP_SDWA9_Real (NAME#"_sdwa")>, VOPC_SDWA9e (NAME#"_sdwa").Pfl>; diff --git a/lib/Target/AMDGPU/VOPInstructions.td b/lib/Target/AMDGPU/VOPInstructions.td index 677095a354be..f208a1134a5a 100644 --- a/lib/Target/AMDGPU/VOPInstructions.td +++ b/lib/Target/AMDGPU/VOPInstructions.td @@ -14,6 +14,7 @@ class LetDummies { bit isReMaterializable; bit isAsCheapAsAMove; bit VOPAsmPrefer32Bit; + bit FPDPRounding; Predicate SubtargetPredicate; string Constraints; string DisableEncoding; @@ -41,9 +42,7 @@ class VOP_Pseudo pattern> : InstSI , VOP , - SIMCInstr , - MnemonicAlias { - + SIMCInstr { let isPseudo = 1; let isCodeGenOnly = 1; let UseNamedOperandTable = 1; @@ -148,6 +147,7 @@ class VOP3_Real : // copy relevant pseudo op flags let SubtargetPredicate = ps.SubtargetPredicate; + let OtherPredicates = ps.OtherPredicates; let AsmMatchConverter = ps.AsmMatchConverter; let AsmVariantName = ps.AsmVariantName; let Constraints = ps.Constraints; @@ -473,8 +473,7 @@ class VOP_SDWA9Be : VOP_SDWA9e

{ class VOP_SDWA_Pseudo pattern=[]> : InstSI , VOP , - SIMCInstr , - MnemonicAlias { + SIMCInstr { let isPseudo = 1; let isCodeGenOnly = 1; @@ -595,8 +594,7 @@ class VOP_DPPe : Enc64 { class VOP_DPP_Pseudo pattern=[]> : InstSI , VOP , - SIMCInstr , - MnemonicAlias { + SIMCInstr { let isPseudo = 1; let isCodeGenOnly = 1; diff --git a/lib/Target/ARC/ARCFrameLowering.h b/lib/Target/ARC/ARCFrameLowering.h index 41b559d16761..9242400fb28d 100644 --- a/lib/Target/ARC/ARCFrameLowering.h +++ b/lib/Target/ARC/ARCFrameLowering.h @@ -27,8 +27,8 @@ class ARCInstrInfo; class ARCFrameLowering : public TargetFrameLowering { public: ARCFrameLowering(const ARCSubtarget &st) - : TargetFrameLowering(TargetFrameLowering::StackGrowsDown, 4, 0), ST(st) { - } + : TargetFrameLowering(TargetFrameLowering::StackGrowsDown, Align(4), 0), + ST(st) {} /// Insert Prologue into the function. void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override; diff --git a/lib/Target/ARC/ARCISelLowering.cpp b/lib/Target/ARC/ARCISelLowering.cpp index 847d23f0abdb..751fd567bae8 100644 --- a/lib/Target/ARC/ARCISelLowering.cpp +++ b/lib/Target/ARC/ARCISelLowering.cpp @@ -716,7 +716,7 @@ SDValue ARCTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); assert(cast(Op.getOperand(0))->getZExtValue() == 0 && "Only support lowering frame addr of current frame."); - unsigned FrameReg = ARI.getFrameRegister(MF); + Register FrameReg = ARI.getFrameRegister(MF); return DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT); } diff --git a/lib/Target/ARC/ARCMachineFunctionInfo.h b/lib/Target/ARC/ARCMachineFunctionInfo.h index 31aa5b93246c..d4dcf9bf285c 100644 --- a/lib/Target/ARC/ARCMachineFunctionInfo.h +++ b/lib/Target/ARC/ARCMachineFunctionInfo.h @@ -34,8 +34,8 @@ public: explicit ARCFunctionInfo(MachineFunction &MF) : ReturnStackOffsetSet(false), VarArgsFrameIndex(0), ReturnStackOffset(-1U), MaxCallStackReq(0) { - // Functions are 4-byte (2**2) aligned. - MF.setAlignment(2); + // Functions are 4-byte aligned. 
+ MF.setAlignment(Align(4)); } ~ARCFunctionInfo() {} diff --git a/lib/Target/ARC/ARCOptAddrMode.cpp b/lib/Target/ARC/ARCOptAddrMode.cpp index c922b99c57b0..22a3b9111c8e 100644 --- a/lib/Target/ARC/ARCOptAddrMode.cpp +++ b/lib/Target/ARC/ARCOptAddrMode.cpp @@ -139,8 +139,7 @@ static bool dominatesAllUsesOf(const MachineInstr *MI, unsigned VReg, MachineDominatorTree *MDT, MachineRegisterInfo *MRI) { - assert(TargetRegisterInfo::isVirtualRegister(VReg) && - "Expected virtual register!"); + assert(Register::isVirtualRegister(VReg) && "Expected virtual register!"); for (auto it = MRI->use_nodbg_begin(VReg), end = MRI->use_nodbg_end(); it != end; ++it) { @@ -181,7 +180,7 @@ static bool isLoadStoreThatCanHandleDisplacement(const TargetInstrInfo *TII, bool ARCOptAddrMode::noUseOfAddBeforeLoadOrStore(const MachineInstr *Add, const MachineInstr *Ldst) { - unsigned R = Add->getOperand(0).getReg(); + Register R = Add->getOperand(0).getReg(); return dominatesAllUsesOf(Ldst, R, MDT, MRI); } @@ -205,9 +204,8 @@ MachineInstr *ARCOptAddrMode::tryToCombine(MachineInstr &Ldst) { return nullptr; } - unsigned B = Base.getReg(); - if (TargetRegisterInfo::isStackSlot(B) || - !TargetRegisterInfo::isVirtualRegister(B)) { + Register B = Base.getReg(); + if (Register::isStackSlot(B) || !Register::isVirtualRegister(B)) { LLVM_DEBUG(dbgs() << "[ABAW] Base is not VReg\n"); return nullptr; } @@ -285,7 +283,7 @@ ARCOptAddrMode::canJoinInstructions(MachineInstr *Ldst, MachineInstr *Add, return nullptr; } - unsigned BaseReg = Ldst->getOperand(BasePos).getReg(); + Register BaseReg = Ldst->getOperand(BasePos).getReg(); // prohibit this: // v1 = add v0, c @@ -294,7 +292,7 @@ ARCOptAddrMode::canJoinInstructions(MachineInstr *Ldst, MachineInstr *Add, // st v0, [v0, 0] // v1 = add v0, c if (Ldst->mayStore() && Ldst->getOperand(0).isReg()) { - unsigned StReg = Ldst->getOperand(0).getReg(); + Register StReg = Ldst->getOperand(0).getReg(); if (Add->getOperand(0).getReg() == StReg || BaseReg == StReg) { LLVM_DEBUG(dbgs() << "[canJoinInstructions] Store uses result of Add\n"); return nullptr; @@ -447,7 +445,7 @@ void ARCOptAddrMode::changeToAddrMode(MachineInstr &Ldst, unsigned NewOpcode, MachineOperand Src = MachineOperand::CreateImm(0xDEADBEEF); AII->getBaseAndOffsetPosition(Ldst, BasePos, OffPos); - unsigned BaseReg = Ldst.getOperand(BasePos).getReg(); + Register BaseReg = Ldst.getOperand(BasePos).getReg(); Ldst.RemoveOperand(OffPos); Ldst.RemoveOperand(BasePos); diff --git a/lib/Target/ARC/ARCRegisterInfo.cpp b/lib/Target/ARC/ARCRegisterInfo.cpp index 9c8340ac8f81..a7f89b385ffe 100644 --- a/lib/Target/ARC/ARCRegisterInfo.cpp +++ b/lib/Target/ARC/ARCRegisterInfo.cpp @@ -206,7 +206,7 @@ void ARCRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, LLVM_DEBUG(dbgs() << "Offset : " << Offset << "\n" << "<--------->\n"); - unsigned Reg = MI.getOperand(0).getReg(); + Register Reg = MI.getOperand(0).getReg(); assert(ARC::GPR32RegClass.contains(Reg) && "Unexpected register operand"); if (!TFI->hasFP(MF)) { diff --git a/lib/Target/ARC/ARCTargetMachine.cpp b/lib/Target/ARC/ARCTargetMachine.cpp index 9fb45d686c26..34700dc22c54 100644 --- a/lib/Target/ARC/ARCTargetMachine.cpp +++ b/lib/Target/ARC/ARCTargetMachine.cpp @@ -38,7 +38,7 @@ ARCTargetMachine::ARCTargetMachine(const Target &T, const Triple &TT, "f32:32:32-i64:32-f64:32-a:0:32-n32", TT, CPU, FS, Options, getRelocModel(RM), getEffectiveCodeModel(CM, CodeModel::Small), OL), - TLOF(make_unique()), + TLOF(std::make_unique()), Subtarget(TT, CPU, FS, *this) { initAsmInfo(); } 
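The ARC hunks above (and the ARM ones that follow) carry two recurring mechanical substitutions: `unsigned Reg` becomes the typed `Register` wrapper with `Register::isVirtualRegister`, and log2-encoded alignments such as `setAlignment(2)` and `EmitAlignment(1)` become byte-based `Align` values, `Align(4)` and `Align(2)`. A minimal sketch of the log2-to-bytes relationship behind those alignment changes, using plain integers rather than the LLVM `Align` type (the helper name is invented for illustration):

#include <cassert>

// Hypothetical helper: byte alignment corresponding to a log2 encoding.
constexpr unsigned alignBytesFromLog2(unsigned Log2Value) {
  return 1u << Log2Value;
}

int main() {
  assert(alignBytesFromLog2(2) == 4); // setAlignment(2)  -> setAlignment(Align(4))
  assert(alignBytesFromLog2(1) == 2); // EmitAlignment(1) -> EmitAlignment(Align(2))
  return 0;
}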
diff --git a/lib/Target/ARM/A15SDOptimizer.cpp b/lib/Target/ARM/A15SDOptimizer.cpp index fb238bfc9cbc..30b9c8071ba2 100644 --- a/lib/Target/ARM/A15SDOptimizer.cpp +++ b/lib/Target/ARM/A15SDOptimizer.cpp @@ -133,9 +133,9 @@ bool A15SDOptimizer::usesRegClass(MachineOperand &MO, const TargetRegisterClass *TRC) { if (!MO.isReg()) return false; - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); - if (TargetRegisterInfo::isVirtualRegister(Reg)) + if (Register::isVirtualRegister(Reg)) return MRI->getRegClass(Reg)->hasSuperClassEq(TRC); else return TRC->contains(Reg); @@ -151,7 +151,7 @@ unsigned A15SDOptimizer::getDPRLaneFromSPR(unsigned SReg) { // Get the subreg type that is most likely to be coalesced // for an SPR register that will be used in VDUP32d pseudo. unsigned A15SDOptimizer::getPrefSPRLane(unsigned SReg) { - if (!TRI->isVirtualRegister(SReg)) + if (!Register::isVirtualRegister(SReg)) return getDPRLaneFromSPR(SReg); MachineInstr *MI = MRI->getVRegDef(SReg); @@ -166,7 +166,7 @@ unsigned A15SDOptimizer::getPrefSPRLane(unsigned SReg) { SReg = MI->getOperand(1).getReg(); } - if (TargetRegisterInfo::isVirtualRegister(SReg)) { + if (Register::isVirtualRegister(SReg)) { if (MO->getSubReg() == ARM::ssub_1) return ARM::ssub_1; return ARM::ssub_0; } @@ -191,8 +191,8 @@ void A15SDOptimizer::eraseInstrWithNoUses(MachineInstr *MI) { for (MachineOperand &MO : MI->operands()) { if ((!MO.isReg()) || (!MO.isUse())) continue; - unsigned Reg = MO.getReg(); - if (!TRI->isVirtualRegister(Reg)) + Register Reg = MO.getReg(); + if (!Register::isVirtualRegister(Reg)) continue; MachineOperand *Op = MI->findRegisterDefOperand(Reg); @@ -213,8 +213,8 @@ void A15SDOptimizer::eraseInstrWithNoUses(MachineInstr *MI) { for (MachineOperand &MODef : Def->operands()) { if ((!MODef.isReg()) || (!MODef.isDef())) continue; - unsigned DefReg = MODef.getReg(); - if (!TRI->isVirtualRegister(DefReg)) { + Register DefReg = MODef.getReg(); + if (!Register::isVirtualRegister(DefReg)) { IsDead = false; break; } @@ -245,10 +245,10 @@ unsigned A15SDOptimizer::optimizeSDPattern(MachineInstr *MI) { } if (MI->isInsertSubreg()) { - unsigned DPRReg = MI->getOperand(1).getReg(); - unsigned SPRReg = MI->getOperand(2).getReg(); + Register DPRReg = MI->getOperand(1).getReg(); + Register SPRReg = MI->getOperand(2).getReg(); - if (TRI->isVirtualRegister(DPRReg) && TRI->isVirtualRegister(SPRReg)) { + if (Register::isVirtualRegister(DPRReg) && Register::isVirtualRegister(SPRReg)) { MachineInstr *DPRMI = MRI->getVRegDef(MI->getOperand(1).getReg()); MachineInstr *SPRMI = MRI->getVRegDef(MI->getOperand(2).getReg()); @@ -267,7 +267,7 @@ unsigned A15SDOptimizer::optimizeSDPattern(MachineInstr *MI) { // Find the thing we're subreg copying out of - is it of the same // regclass as DPRMI? (i.e. a DPR or QPR). 
- unsigned FullReg = SPRMI->getOperand(1).getReg(); + Register FullReg = SPRMI->getOperand(1).getReg(); const TargetRegisterClass *TRC = MRI->getRegClass(MI->getOperand(1).getReg()); if (TRC->hasSuperClassEq(MRI->getRegClass(FullReg))) { @@ -296,9 +296,9 @@ unsigned A15SDOptimizer::optimizeSDPattern(MachineInstr *MI) { if (!MI->getOperand(I).isReg()) continue; ++NumTotal; - unsigned OpReg = MI->getOperand(I).getReg(); + Register OpReg = MI->getOperand(I).getReg(); - if (!TRI->isVirtualRegister(OpReg)) + if (!Register::isVirtualRegister(OpReg)) break; MachineInstr *Def = MRI->getVRegDef(OpReg); @@ -342,7 +342,7 @@ bool A15SDOptimizer::hasPartialWrite(MachineInstr *MI) { MachineInstr *A15SDOptimizer::elideCopies(MachineInstr *MI) { if (!MI->isFullCopy()) return MI; - if (!TRI->isVirtualRegister(MI->getOperand(1).getReg())) + if (!Register::isVirtualRegister(MI->getOperand(1).getReg())) return nullptr; MachineInstr *Def = MRI->getVRegDef(MI->getOperand(1).getReg()); if (!Def) @@ -369,8 +369,8 @@ void A15SDOptimizer::elideCopiesAndPHIs(MachineInstr *MI, Reached.insert(MI); if (MI->isPHI()) { for (unsigned I = 1, E = MI->getNumOperands(); I != E; I += 2) { - unsigned Reg = MI->getOperand(I).getReg(); - if (!TRI->isVirtualRegister(Reg)) { + Register Reg = MI->getOperand(I).getReg(); + if (!Register::isVirtualRegister(Reg)) { continue; } MachineInstr *NewMI = MRI->getVRegDef(Reg); @@ -379,7 +379,7 @@ void A15SDOptimizer::elideCopiesAndPHIs(MachineInstr *MI, Front.push_back(NewMI); } } else if (MI->isFullCopy()) { - if (!TRI->isVirtualRegister(MI->getOperand(1).getReg())) + if (!Register::isVirtualRegister(MI->getOperand(1).getReg())) continue; MachineInstr *NewMI = MRI->getVRegDef(MI->getOperand(1).getReg()); if (!NewMI) @@ -418,8 +418,8 @@ unsigned A15SDOptimizer::createDupLane(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore, const DebugLoc &DL, unsigned Reg, unsigned Lane, bool QPR) { - unsigned Out = MRI->createVirtualRegister(QPR ? &ARM::QPRRegClass : - &ARM::DPRRegClass); + Register Out = + MRI->createVirtualRegister(QPR ? &ARM::QPRRegClass : &ARM::DPRRegClass); BuildMI(MBB, InsertBefore, DL, TII->get(QPR ? 
ARM::VDUPLN32q : ARM::VDUPLN32d), Out) .addReg(Reg) @@ -434,7 +434,7 @@ unsigned A15SDOptimizer::createExtractSubreg( MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore, const DebugLoc &DL, unsigned DReg, unsigned Lane, const TargetRegisterClass *TRC) { - unsigned Out = MRI->createVirtualRegister(TRC); + Register Out = MRI->createVirtualRegister(TRC); BuildMI(MBB, InsertBefore, DL, @@ -448,7 +448,7 @@ unsigned A15SDOptimizer::createExtractSubreg( unsigned A15SDOptimizer::createRegSequence( MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore, const DebugLoc &DL, unsigned Reg1, unsigned Reg2) { - unsigned Out = MRI->createVirtualRegister(&ARM::QPRRegClass); + Register Out = MRI->createVirtualRegister(&ARM::QPRRegClass); BuildMI(MBB, InsertBefore, DL, @@ -466,7 +466,7 @@ unsigned A15SDOptimizer::createVExt(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore, const DebugLoc &DL, unsigned Ssub0, unsigned Ssub1) { - unsigned Out = MRI->createVirtualRegister(&ARM::DPRRegClass); + Register Out = MRI->createVirtualRegister(&ARM::DPRRegClass); BuildMI(MBB, InsertBefore, DL, TII->get(ARM::VEXTd32), Out) .addReg(Ssub0) .addReg(Ssub1) @@ -478,7 +478,7 @@ unsigned A15SDOptimizer::createVExt(MachineBasicBlock &MBB, unsigned A15SDOptimizer::createInsertSubreg( MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore, const DebugLoc &DL, unsigned DReg, unsigned Lane, unsigned ToInsert) { - unsigned Out = MRI->createVirtualRegister(&ARM::DPR_VFP2RegClass); + Register Out = MRI->createVirtualRegister(&ARM::DPR_VFP2RegClass); BuildMI(MBB, InsertBefore, DL, @@ -494,7 +494,7 @@ unsigned A15SDOptimizer::createImplicitDef(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore, const DebugLoc &DL) { - unsigned Out = MRI->createVirtualRegister(&ARM::DPRRegClass); + Register Out = MRI->createVirtualRegister(&ARM::DPRRegClass); BuildMI(MBB, InsertBefore, DL, @@ -602,7 +602,7 @@ bool A15SDOptimizer::runOnInstruction(MachineInstr *MI) { // we can end up with multiple defs of this DPR. SmallVector DefSrcs; - if (!TRI->isVirtualRegister(*I)) + if (!Register::isVirtualRegister(*I)) continue; MachineInstr *Def = MRI->getVRegDef(*I); if (!Def) @@ -622,7 +622,7 @@ bool A15SDOptimizer::runOnInstruction(MachineInstr *MI) { // Collect all the uses of this MI's DPR def for updating later. 
SmallVector Uses; - unsigned DPRDefReg = MI->getOperand(0).getReg(); + Register DPRDefReg = MI->getOperand(0).getReg(); for (MachineRegisterInfo::use_iterator I = MRI->use_begin(DPRDefReg), E = MRI->use_end(); I != E; ++I) Uses.push_back(&*I); diff --git a/lib/Target/ARM/ARM.h b/lib/Target/ARM/ARM.h index bf8ed6562fe7..2e6f756d522c 100644 --- a/lib/Target/ARM/ARM.h +++ b/lib/Target/ARM/ARM.h @@ -35,6 +35,7 @@ class MachineInstr; class MCInst; class PassRegistry; +Pass *createMVETailPredicationPass(); FunctionPass *createARMLowOverheadLoopsPass(); Pass *createARMParallelDSPPass(); FunctionPass *createARMISelDag(ARMBaseTargetMachine &TM, @@ -67,6 +68,7 @@ void initializeThumb2SizeReducePass(PassRegistry &); void initializeThumb2ITBlockPass(PassRegistry &); void initializeMVEVPTBlockPass(PassRegistry &); void initializeARMLowOverheadLoopsPass(PassRegistry &); +void initializeMVETailPredicationPass(PassRegistry &); } // end namespace llvm diff --git a/lib/Target/ARM/ARM.td b/lib/Target/ARM/ARM.td index b687db12eaf5..fed4cb2b9316 100644 --- a/lib/Target/ARM/ARM.td +++ b/lib/Target/ARM/ARM.td @@ -57,12 +57,15 @@ def FeatureD32 : SubtargetFeature<"d32", "HasD32", "true", "Extend FP to 32 double registers">; multiclass VFPver prev = [], - list otherimplies = []> { + list prev, + list otherimplies, + list vfp2prev = []> { def _D16_SP: SubtargetFeature< name#"d16sp", query#"D16SP", "true", description#" with only 16 d-registers and no double precision", - !foreach(v, prev, !cast(v # "_D16_SP")) # otherimplies>; + !foreach(v, prev, !cast(v # "_D16_SP")) # + !foreach(v, vfp2prev, !cast(v # "_SP")) # + otherimplies>; def _SP: SubtargetFeature< name#"sp", query#"SP", "true", description#" with no double precision", @@ -72,6 +75,7 @@ multiclass VFPver(v # "_D16")) # + vfp2prev # otherimplies # [FeatureFP64, !cast(NAME # "_D16_SP")]>; def "": SubtargetFeature< name, query, "true", description, @@ -80,11 +84,17 @@ multiclass VFPver(NAME # "_SP")]>; } -defm FeatureVFP2: VFPver<"vfp2", "HasVFPv2", "Enable VFP2 instructions", - [], [FeatureFPRegs]>; +def FeatureVFP2_SP : SubtargetFeature<"vfp2sp", "HasVFPv2SP", "true", + "Enable VFP2 instructions with " + "no double precision", + [FeatureFPRegs]>; + +def FeatureVFP2 : SubtargetFeature<"vfp2", "HasVFPv2", "true", + "Enable VFP2 instructions", + [FeatureFP64, FeatureVFP2_SP]>; defm FeatureVFP3: VFPver<"vfp3", "HasVFPv3", "Enable VFP3 instructions", - [FeatureVFP2]>; + [], [], [FeatureVFP2]>; def FeatureNEON : SubtargetFeature<"neon", "HasNEON", "true", "Enable NEON instructions", @@ -98,7 +108,7 @@ defm FeatureVFP4: VFPver<"vfp4", "HasVFPv4", "Enable VFP4 instructions", [FeatureVFP3], [FeatureFP16]>; defm FeatureFPARMv8: VFPver<"fp-armv8", "HasFPARMv8", "Enable ARMv8 FP", - [FeatureVFP4]>; + [FeatureVFP4], []>; def FeatureFullFP16 : SubtargetFeature<"fullfp16", "HasFullFP16", "true", "Enable full half-precision " @@ -302,9 +312,18 @@ def FeatureVMLxForwarding : SubtargetFeature<"vmlx-forwarding", def FeaturePref32BitThumb : SubtargetFeature<"32bit", "Pref32BitThumb", "true", "Prefer 32-bit Thumb instrs">; -def FeaturePrefLoopAlign32 : SubtargetFeature<"loop-align", "PrefLoopAlignment","2", +def FeaturePrefLoopAlign32 : SubtargetFeature<"loop-align", "PrefLoopLogAlignment","2", "Prefer 32-bit alignment for loops">; +def FeatureMVEVectorCostFactor1 : SubtargetFeature<"mve1beat", "MVEVectorCostFactor", "1", + "Model MVE instructions as a 1 beat per tick architecture">; + +def FeatureMVEVectorCostFactor2 : SubtargetFeature<"mve2beat", "MVEVectorCostFactor", "2", + 
"Model MVE instructions as a 2 beats per tick architecture">; + +def FeatureMVEVectorCostFactor4 : SubtargetFeature<"mve4beat", "MVEVectorCostFactor", "4", + "Model MVE instructions as a 4 beats per tick architecture">; + /// Some instructions update CPSR partially, which can add false dependency for /// out-of-order implementation, e.g. Cortex-A9, unless each individual bit is /// mapped to a separate physical register. Avoid partial CPSR update for these @@ -1156,6 +1175,13 @@ def : ProcNoItin<"cortex-a76ae", [ARMv82a, ProcA76, FeatureFullFP16, FeatureDotProd]>; +def : ProcNoItin<"neoverse-n1", [ARMv82a, + FeatureHWDivThumb, + FeatureHWDivARM, + FeatureCrypto, + FeatureCRC, + FeatureDotProd]>; + def : ProcessorModel<"cyclone", SwiftModel, [ARMv8a, ProcSwift, FeatureHasRetAddrStack, FeatureNEONForFP, diff --git a/lib/Target/ARM/ARMAsmPrinter.cpp b/lib/Target/ARM/ARMAsmPrinter.cpp index e29077266fcd..c8c91e53c44e 100644 --- a/lib/Target/ARM/ARMAsmPrinter.cpp +++ b/lib/Target/ARM/ARMAsmPrinter.cpp @@ -168,7 +168,7 @@ bool ARMAsmPrinter::runOnMachineFunction(MachineFunction &MF) { // relatively easy to exceed the thumb branch range within a TU. if (! ThumbIndirectPads.empty()) { OutStreamer->EmitAssemblerFlag(MCAF_Code16); - EmitAlignment(1); + EmitAlignment(Align(2)); for (std::pair &TIP : ThumbIndirectPads) { OutStreamer->EmitLabel(TIP.second); EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::tBX) @@ -203,8 +203,8 @@ void ARMAsmPrinter::printOperand(const MachineInstr *MI, int OpNum, switch (MO.getType()) { default: llvm_unreachable(""); case MachineOperand::MO_Register: { - unsigned Reg = MO.getReg(); - assert(TargetRegisterInfo::isPhysicalRegister(Reg)); + Register Reg = MO.getReg(); + assert(Register::isPhysicalRegister(Reg)); assert(!MO.getSubReg() && "Subregs should be eliminated!"); if(ARM::GPRPairRegClass.contains(Reg)) { const MachineFunction &MF = *MI->getParent()->getParent(); @@ -275,7 +275,7 @@ bool ARMAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum, return false; case 'y': // Print a VFP single precision register as indexed double. if (MI->getOperand(OpNum).isReg()) { - unsigned Reg = MI->getOperand(OpNum).getReg(); + Register Reg = MI->getOperand(OpNum).getReg(); const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo(); // Find the 'd' register that has this 's' register as a sub-register, // and determine the lane number. @@ -302,14 +302,14 @@ bool ARMAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum, if (!MI->getOperand(OpNum).isReg()) return true; const MachineOperand &MO = MI->getOperand(OpNum); - unsigned RegBegin = MO.getReg(); + Register RegBegin = MO.getReg(); // This takes advantage of the 2 operand-ness of ldm/stm and that we've // already got the operands in registers that are operands to the // inline asm statement. O << "{"; if (ARM::GPRPairRegClass.contains(RegBegin)) { const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo(); - unsigned Reg0 = TRI->getSubReg(RegBegin, ARM::gsub_0); + Register Reg0 = TRI->getSubReg(RegBegin, ARM::gsub_0); O << ARMInstPrinter::getRegisterName(Reg0) << ", "; RegBegin = TRI->getSubReg(RegBegin, ARM::gsub_1); } @@ -378,8 +378,8 @@ bool ARMAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum, if (!MO.isReg()) return true; const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo(); - unsigned Reg = TRI->getSubReg(MO.getReg(), FirstHalf ? - ARM::gsub_0 : ARM::gsub_1); + Register Reg = + TRI->getSubReg(MO.getReg(), FirstHalf ? 
ARM::gsub_0 : ARM::gsub_1); O << ARMInstPrinter::getRegisterName(Reg); return false; } @@ -391,7 +391,7 @@ bool ARMAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum, const MachineOperand &MO = MI->getOperand(RegOp); if (!MO.isReg()) return true; - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); O << ARMInstPrinter::getRegisterName(Reg); return false; } @@ -400,12 +400,12 @@ bool ARMAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum, case 'f': { // The high doubleword register of a NEON quad register. if (!MI->getOperand(OpNum).isReg()) return true; - unsigned Reg = MI->getOperand(OpNum).getReg(); + Register Reg = MI->getOperand(OpNum).getReg(); if (!ARM::QPRRegClass.contains(Reg)) return true; const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo(); - unsigned SubReg = TRI->getSubReg(Reg, ExtraCode[0] == 'e' ? - ARM::dsub_0 : ARM::dsub_1); + Register SubReg = + TRI->getSubReg(Reg, ExtraCode[0] == 'e' ? ARM::dsub_0 : ARM::dsub_1); O << ARMInstPrinter::getRegisterName(SubReg); return false; } @@ -419,7 +419,7 @@ bool ARMAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum, return true; const MachineFunction &MF = *MI->getParent()->getParent(); const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); if(!ARM::GPRPairRegClass.contains(Reg)) return false; Reg = TRI->getSubReg(Reg, ARM::gsub_1); @@ -526,7 +526,7 @@ void ARMAsmPrinter::EmitEndOfAsmFile(Module &M) { if (!Stubs.empty()) { // Switch with ".non_lazy_symbol_pointer" directive. OutStreamer->SwitchSection(TLOFMacho.getNonLazySymbolPointerSection()); - EmitAlignment(2); + EmitAlignment(Align(4)); for (auto &Stub : Stubs) emitNonLazySymbolPointer(*OutStreamer, Stub.first, Stub.second); @@ -539,7 +539,7 @@ void ARMAsmPrinter::EmitEndOfAsmFile(Module &M) { if (!Stubs.empty()) { // Switch with ".non_lazy_symbol_pointer" directive. OutStreamer->SwitchSection(TLOFMacho.getThreadLocalPointerSection()); - EmitAlignment(2); + EmitAlignment(Align(4)); for (auto &Stub : Stubs) emitNonLazySymbolPointer(*OutStreamer, Stub.first, Stub.second); @@ -940,7 +940,7 @@ void ARMAsmPrinter::EmitJumpTableAddrs(const MachineInstr *MI) { // Make sure the Thumb jump table is 4-byte aligned. This will be a nop for // ARM mode tables. - EmitAlignment(2); + EmitAlignment(Align(4)); // Emit a label for the jump table. MCSymbol *JTISymbol = GetARMJTIPICJumpTableLabel(JTI); @@ -986,7 +986,7 @@ void ARMAsmPrinter::EmitJumpTableInsts(const MachineInstr *MI) { // Make sure the Thumb jump table is 4-byte aligned. This will be a nop for // ARM mode tables. - EmitAlignment(2); + EmitAlignment(Align(4)); // Emit a label for the jump table. MCSymbol *JTISymbol = GetARMJTIPICJumpTableLabel(JTI); @@ -1015,7 +1015,7 @@ void ARMAsmPrinter::EmitJumpTableTBInst(const MachineInstr *MI, unsigned JTI = MO1.getIndex(); if (Subtarget->isThumb1Only()) - EmitAlignment(2); + EmitAlignment(Align(4)); MCSymbol *JTISymbol = GetARMJTIPICJumpTableLabel(JTI); OutStreamer->EmitLabel(JTISymbol); @@ -1058,7 +1058,7 @@ void ARMAsmPrinter::EmitJumpTableTBInst(const MachineInstr *MI, OutStreamer->EmitDataRegion(MCDR_DataRegionEnd); // Make sure the next instruction is 2-byte aligned. 
- EmitAlignment(1); + EmitAlignment(Align(2)); } void ARMAsmPrinter::EmitUnwindingInstruction(const MachineInstr *MI) { @@ -1072,7 +1072,7 @@ void ARMAsmPrinter::EmitUnwindingInstruction(const MachineInstr *MI) { MF.getSubtarget().getRegisterInfo(); const MachineRegisterInfo &MachineRegInfo = MF.getRegInfo(); - unsigned FramePtr = TargetRegInfo->getFrameRegister(MF); + Register FramePtr = TargetRegInfo->getFrameRegister(MF); unsigned Opc = MI->getOpcode(); unsigned SrcReg, DstReg; @@ -1136,7 +1136,7 @@ void ARMAsmPrinter::EmitUnwindingInstruction(const MachineInstr *MI) { } // Check for registers that are remapped (for a Thumb1 prologue that // saves high registers). - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); if (unsigned RemappedReg = AFI->EHPrologueRemappedRegs.lookup(Reg)) Reg = RemappedReg; RegList.push_back(Reg); @@ -1326,7 +1326,7 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) { // So here we generate a bl to a small jump pad that does bx rN. // The jump pads are emitted after the function body. - unsigned TReg = MI->getOperand(0).getReg(); + Register TReg = MI->getOperand(0).getReg(); MCSymbol *TRegSym = nullptr; for (std::pair &TIP : ThumbIndirectPads) { if (TIP.first == TReg) { @@ -1663,8 +1663,8 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) { case ARM::tTBH_JT: { bool Is8Bit = MI->getOpcode() == ARM::tTBB_JT; - unsigned Base = MI->getOperand(0).getReg(); - unsigned Idx = MI->getOperand(1).getReg(); + Register Base = MI->getOperand(0).getReg(); + Register Idx = MI->getOperand(1).getReg(); assert(MI->getOperand(1).isKill() && "We need the index register as scratch!"); // Multiply up idx if necessary. @@ -1844,8 +1844,8 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) { // b LSJLJEH // movs r0, #1 // LSJLJEH: - unsigned SrcReg = MI->getOperand(0).getReg(); - unsigned ValReg = MI->getOperand(1).getReg(); + Register SrcReg = MI->getOperand(0).getReg(); + Register ValReg = MI->getOperand(1).getReg(); MCSymbol *Label = OutContext.createTempSymbol("SJLJEH", false, true); OutStreamer->AddComment("eh_setjmp begin"); EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::tMOVr) @@ -1910,8 +1910,8 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) { // mov r0, #0 // add pc, pc, #0 // mov r0, #1 - unsigned SrcReg = MI->getOperand(0).getReg(); - unsigned ValReg = MI->getOperand(1).getReg(); + Register SrcReg = MI->getOperand(0).getReg(); + Register ValReg = MI->getOperand(1).getReg(); OutStreamer->AddComment("eh_setjmp begin"); EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::ADDri) @@ -1967,8 +1967,8 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) { // ldr $scratch, [$src, #4] // ldr r7, [$src] // bx $scratch - unsigned SrcReg = MI->getOperand(0).getReg(); - unsigned ScratchReg = MI->getOperand(1).getReg(); + Register SrcReg = MI->getOperand(0).getReg(); + Register ScratchReg = MI->getOperand(1).getReg(); EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::LDRi12) .addReg(ARM::SP) .addReg(SrcReg) @@ -2027,8 +2027,8 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) { // ldr $scratch, [$src, #4] // ldr r7, [$src] // bx $scratch - unsigned SrcReg = MI->getOperand(0).getReg(); - unsigned ScratchReg = MI->getOperand(1).getReg(); + Register SrcReg = MI->getOperand(0).getReg(); + Register ScratchReg = MI->getOperand(1).getReg(); EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::tLDRi) .addReg(ScratchReg) @@ -2095,7 +2095,7 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) { // ldr.w sp, [$src, #8] 
// ldr.w pc, [$src, #4] - unsigned SrcReg = MI->getOperand(0).getReg(); + Register SrcReg = MI->getOperand(0).getReg(); EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::t2LDRi12) .addReg(ARM::R11) diff --git a/lib/Target/ARM/ARMBaseInstrInfo.cpp b/lib/Target/ARM/ARMBaseInstrInfo.cpp index 222aa85856a2..684cd1def977 100644 --- a/lib/Target/ARM/ARMBaseInstrInfo.cpp +++ b/lib/Target/ARM/ARMBaseInstrInfo.cpp @@ -172,9 +172,9 @@ MachineInstr *ARMBaseInstrInfo::convertToThreeAddress( const MachineOperand &WB = isLoad ? MI.getOperand(1) : MI.getOperand(0); const MachineOperand &Base = MI.getOperand(2); const MachineOperand &Offset = MI.getOperand(NumOps - 3); - unsigned WBReg = WB.getReg(); - unsigned BaseReg = Base.getReg(); - unsigned OffReg = Offset.getReg(); + Register WBReg = WB.getReg(); + Register BaseReg = Base.getReg(); + Register OffReg = Offset.getReg(); unsigned OffImm = MI.getOperand(NumOps - 2).getImm(); ARMCC::CondCodes Pred = (ARMCC::CondCodes)MI.getOperand(NumOps - 1).getImm(); switch (AddrMode) { @@ -276,8 +276,8 @@ MachineInstr *ARMBaseInstrInfo::convertToThreeAddress( if (LV) { for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { MachineOperand &MO = MI.getOperand(i); - if (MO.isReg() && TargetRegisterInfo::isVirtualRegister(MO.getReg())) { - unsigned Reg = MO.getReg(); + if (MO.isReg() && Register::isVirtualRegister(MO.getReg())) { + Register Reg = MO.getReg(); LiveVariables::VarInfo &VI = LV->getVarInfo(Reg); if (MO.isDef()) { @@ -966,8 +966,8 @@ void ARMBaseInstrInfo::copyPhysReg(MachineBasicBlock &MBB, SmallSet DstRegs; #endif for (unsigned i = 0; i != SubRegs; ++i) { - unsigned Dst = TRI->getSubReg(DestReg, BeginIdx + i * Spacing); - unsigned Src = TRI->getSubReg(SrcReg, BeginIdx + i * Spacing); + Register Dst = TRI->getSubReg(DestReg, BeginIdx + i * Spacing); + Register Src = TRI->getSubReg(SrcReg, BeginIdx + i * Spacing); assert(Dst && Src && "Bad sub-register"); #ifndef NDEBUG assert(!DstRegs.count(Src) && "destructive vector copy"); @@ -1019,7 +1019,7 @@ ARMBaseInstrInfo::AddDReg(MachineInstrBuilder &MIB, unsigned Reg, if (!SubIdx) return MIB.addReg(Reg, State); - if (TargetRegisterInfo::isPhysicalRegister(Reg)) + if (Register::isPhysicalRegister(Reg)) return MIB.addReg(TRI->getSubReg(Reg, SubIdx), State); return MIB.addReg(Reg, State, SubIdx); } @@ -1133,7 +1133,8 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, case 24: if (ARM::DTripleRegClass.hasSubClassEq(RC)) { // Use aligned spills if the stack can be realigned. - if (Align >= 16 && getRegisterInfo().canRealignStack(MF)) { + if (Align >= 16 && getRegisterInfo().canRealignStack(MF) && + Subtarget.hasNEON()) { BuildMI(MBB, I, DebugLoc(), get(ARM::VST1d64TPseudo)) .addFrameIndex(FI) .addImm(16) @@ -1155,7 +1156,8 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, break; case 32: if (ARM::QQPRRegClass.hasSubClassEq(RC) || ARM::DQuadRegClass.hasSubClassEq(RC)) { - if (Align >= 16 && getRegisterInfo().canRealignStack(MF)) { + if (Align >= 16 && getRegisterInfo().canRealignStack(MF) && + Subtarget.hasNEON()) { // FIXME: It's possible to only store part of the QQ register if the // spilled def has a sub-register index. 
BuildMI(MBB, I, DebugLoc(), get(ARM::VST1d64QPseudo)) @@ -1337,7 +1339,7 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, MIB = AddDReg(MIB, DestReg, ARM::gsub_1, RegState::DefineNoRead, TRI); } - if (TargetRegisterInfo::isPhysicalRegister(DestReg)) + if (Register::isPhysicalRegister(DestReg)) MIB.addReg(DestReg, RegState::ImplicitDefine); } else llvm_unreachable("Unknown reg class!"); @@ -1368,7 +1370,8 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, break; case 24: if (ARM::DTripleRegClass.hasSubClassEq(RC)) { - if (Align >= 16 && getRegisterInfo().canRealignStack(MF)) { + if (Align >= 16 && getRegisterInfo().canRealignStack(MF) && + Subtarget.hasNEON()) { BuildMI(MBB, I, DL, get(ARM::VLD1d64TPseudo), DestReg) .addFrameIndex(FI) .addImm(16) @@ -1382,7 +1385,7 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, MIB = AddDReg(MIB, DestReg, ARM::dsub_0, RegState::DefineNoRead, TRI); MIB = AddDReg(MIB, DestReg, ARM::dsub_1, RegState::DefineNoRead, TRI); MIB = AddDReg(MIB, DestReg, ARM::dsub_2, RegState::DefineNoRead, TRI); - if (TargetRegisterInfo::isPhysicalRegister(DestReg)) + if (Register::isPhysicalRegister(DestReg)) MIB.addReg(DestReg, RegState::ImplicitDefine); } } else @@ -1390,7 +1393,8 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, break; case 32: if (ARM::QQPRRegClass.hasSubClassEq(RC) || ARM::DQuadRegClass.hasSubClassEq(RC)) { - if (Align >= 16 && getRegisterInfo().canRealignStack(MF)) { + if (Align >= 16 && getRegisterInfo().canRealignStack(MF) && + Subtarget.hasNEON()) { BuildMI(MBB, I, DL, get(ARM::VLD1d64QPseudo), DestReg) .addFrameIndex(FI) .addImm(16) @@ -1405,7 +1409,7 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, MIB = AddDReg(MIB, DestReg, ARM::dsub_1, RegState::DefineNoRead, TRI); MIB = AddDReg(MIB, DestReg, ARM::dsub_2, RegState::DefineNoRead, TRI); MIB = AddDReg(MIB, DestReg, ARM::dsub_3, RegState::DefineNoRead, TRI); - if (TargetRegisterInfo::isPhysicalRegister(DestReg)) + if (Register::isPhysicalRegister(DestReg)) MIB.addReg(DestReg, RegState::ImplicitDefine); } } else @@ -1425,7 +1429,7 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, MIB = AddDReg(MIB, DestReg, ARM::dsub_5, RegState::DefineNoRead, TRI); MIB = AddDReg(MIB, DestReg, ARM::dsub_6, RegState::DefineNoRead, TRI); MIB = AddDReg(MIB, DestReg, ARM::dsub_7, RegState::DefineNoRead, TRI); - if (TargetRegisterInfo::isPhysicalRegister(DestReg)) + if (Register::isPhysicalRegister(DestReg)) MIB.addReg(DestReg, RegState::ImplicitDefine); } else llvm_unreachable("Unknown reg class!"); @@ -1583,8 +1587,8 @@ bool ARMBaseInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { // Look for a copy between even S-registers. That is where we keep floats // when using NEON v2f32 instructions for f32 arithmetic. 
- unsigned DstRegS = MI.getOperand(0).getReg(); - unsigned SrcRegS = MI.getOperand(1).getReg(); + Register DstRegS = MI.getOperand(0).getReg(); + Register SrcRegS = MI.getOperand(1).getReg(); if (!ARM::SPRRegClass.contains(DstRegS, SrcRegS)) return false; @@ -1794,12 +1798,11 @@ bool ARMBaseInstrInfo::produceSameValue(const MachineInstr &MI0, if (MI0.getNumOperands() != MI1.getNumOperands()) return false; - unsigned Addr0 = MI0.getOperand(1).getReg(); - unsigned Addr1 = MI1.getOperand(1).getReg(); + Register Addr0 = MI0.getOperand(1).getReg(); + Register Addr1 = MI1.getOperand(1).getReg(); if (Addr0 != Addr1) { - if (!MRI || - !TargetRegisterInfo::isVirtualRegister(Addr0) || - !TargetRegisterInfo::isVirtualRegister(Addr1)) + if (!MRI || !Register::isVirtualRegister(Addr0) || + !Register::isVirtualRegister(Addr1)) return false; // This assumes SSA form. @@ -2076,6 +2079,38 @@ isProfitableToIfCvt(MachineBasicBlock &TBB, return PredCost <= UnpredCost; } +unsigned +ARMBaseInstrInfo::extraSizeToPredicateInstructions(const MachineFunction &MF, + unsigned NumInsts) const { + // Thumb2 needs a 2-byte IT instruction to predicate up to 4 instructions. + // ARM has a condition code field in every predicable instruction, using it + // doesn't change code size. + return Subtarget.isThumb2() ? divideCeil(NumInsts, 4) * 2 : 0; +} + +unsigned +ARMBaseInstrInfo::predictBranchSizeForIfCvt(MachineInstr &MI) const { + // If this branch is likely to be folded into the comparison to form a + // CB(N)Z, then removing it won't reduce code size at all, because that will + // just replace the CB(N)Z with a CMP. + if (MI.getOpcode() == ARM::t2Bcc && + findCMPToFoldIntoCBZ(&MI, &getRegisterInfo())) + return 0; + + unsigned Size = getInstSizeInBytes(MI); + + // For Thumb2, all branches are 32-bit instructions during the if conversion + // pass, but may be replaced with 16-bit instructions during size reduction. + // Since the branches considered by if conversion tend to be forward branches + // over small basic blocks, they are very likely to be in range for the + // narrow instructions, so we assume the final code size will be half what it + // currently is. + if (Subtarget.isThumb2()) + Size /= 2; + + return Size; +} + bool ARMBaseInstrInfo::isProfitableToUnpredicate(MachineBasicBlock &TMBB, MachineBasicBlock &FMBB) const { @@ -2141,7 +2176,7 @@ MachineInstr *ARMBaseInstrInfo::commuteInstructionImpl(MachineInstr &MI, MachineInstr * ARMBaseInstrInfo::canFoldIntoMOVCC(unsigned Reg, const MachineRegisterInfo &MRI, const TargetInstrInfo *TII) const { - if (!TargetRegisterInfo::isVirtualRegister(Reg)) + if (!Register::isVirtualRegister(Reg)) return nullptr; if (!MRI.hasOneNonDBGUse(Reg)) return nullptr; @@ -2163,7 +2198,7 @@ ARMBaseInstrInfo::canFoldIntoMOVCC(unsigned Reg, const MachineRegisterInfo &MRI, // MI can't have any tied operands, that would conflict with predication. if (MO.isTied()) return nullptr; - if (TargetRegisterInfo::isPhysicalRegister(MO.getReg())) + if (Register::isPhysicalRegister(MO.getReg())) return nullptr; if (MO.isDef() && !MO.isDead()) return nullptr; @@ -2211,7 +2246,7 @@ ARMBaseInstrInfo::optimizeSelect(MachineInstr &MI, // Find new register class to use. MachineOperand FalseReg = MI.getOperand(Invert ? 
2 : 1); - unsigned DestReg = MI.getOperand(0).getReg(); + Register DestReg = MI.getOperand(0).getReg(); const TargetRegisterClass *PreviousClass = MRI.getRegClass(FalseReg.getReg()); if (!MRI.constrainRegClass(DestReg, PreviousClass)) return nullptr; @@ -2298,6 +2333,7 @@ static const AddSubFlagsOpcodePair AddSubFlagsOpcodeMap[] = { {ARM::tSUBSrr, ARM::tSUBrr}, {ARM::tSBCS, ARM::tSBC}, {ARM::tRSBS, ARM::tRSB}, + {ARM::tLSLSri, ARM::tLSLri}, {ARM::t2ADDSri, ARM::t2ADDri}, {ARM::t2ADDSrr, ARM::t2ADDrr}, @@ -2420,7 +2456,8 @@ bool llvm::tryFoldSPUpdateIntoPushPop(const ARMSubtarget &Subtarget, MachineOperand &MO = MI->getOperand(i); RegList.push_back(MO); - if (MO.isReg() && TRI->getEncodingValue(MO.getReg()) < FirstRegEnc) + if (MO.isReg() && !MO.isImplicit() && + TRI->getEncodingValue(MO.getReg()) < FirstRegEnc) FirstRegEnc = TRI->getEncodingValue(MO.getReg()); } @@ -2430,7 +2467,7 @@ bool llvm::tryFoldSPUpdateIntoPushPop(const ARMSubtarget &Subtarget, for (int CurRegEnc = FirstRegEnc - 1; CurRegEnc >= 0 && RegsNeeded; --CurRegEnc) { unsigned CurReg = RegClass->getRegister(CurRegEnc); - if (IsT1PushPop && CurReg > ARM::R7) + if (IsT1PushPop && CurRegEnc > TRI->getEncodingValue(ARM::R7)) continue; if (!IsPop) { // Pushing any register is completely harmless, mark the register involved @@ -3039,18 +3076,22 @@ bool ARMBaseInstrInfo::optimizeCompareInstr( break; case ARM::VSELEQD: case ARM::VSELEQS: + case ARM::VSELEQH: CC = ARMCC::EQ; break; case ARM::VSELGTD: case ARM::VSELGTS: + case ARM::VSELGTH: CC = ARMCC::GT; break; case ARM::VSELGED: case ARM::VSELGES: + case ARM::VSELGEH: CC = ARMCC::GE; break; - case ARM::VSELVSS: case ARM::VSELVSD: + case ARM::VSELVSS: + case ARM::VSELVSH: CC = ARMCC::VS; break; } @@ -3271,9 +3312,9 @@ bool ARMBaseInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, } unsigned OpIdx = Commute ? 2 : 1; - unsigned Reg1 = UseMI.getOperand(OpIdx).getReg(); + Register Reg1 = UseMI.getOperand(OpIdx).getReg(); bool isKill = UseMI.getOperand(OpIdx).isKill(); - unsigned NewReg = MRI->createVirtualRegister(MRI->getRegClass(Reg)); + Register NewReg = MRI->createVirtualRegister(MRI->getRegClass(Reg)); BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(), get(NewUseOpc), NewReg) .addReg(Reg1, getKillRegState(isKill)) @@ -3335,15 +3376,15 @@ static unsigned getNumMicroOpsSwiftLdSt(const InstrItineraryData *ItinData, case ARM::LDRSB_POST: case ARM::LDRSH_POST: { - unsigned Rt = MI.getOperand(0).getReg(); - unsigned Rm = MI.getOperand(3).getReg(); + Register Rt = MI.getOperand(0).getReg(); + Register Rm = MI.getOperand(3).getReg(); return (Rt == Rm) ? 
4 : 3; } case ARM::LDR_PRE_REG: case ARM::LDRB_PRE_REG: { - unsigned Rt = MI.getOperand(0).getReg(); - unsigned Rm = MI.getOperand(3).getReg(); + Register Rt = MI.getOperand(0).getReg(); + Register Rm = MI.getOperand(3).getReg(); if (Rt == Rm) return 3; unsigned ShOpVal = MI.getOperand(4).getImm(); @@ -3372,8 +3413,8 @@ static unsigned getNumMicroOpsSwiftLdSt(const InstrItineraryData *ItinData, case ARM::LDRH_PRE: case ARM::STRH_PRE: { - unsigned Rt = MI.getOperand(0).getReg(); - unsigned Rm = MI.getOperand(3).getReg(); + Register Rt = MI.getOperand(0).getReg(); + Register Rm = MI.getOperand(3).getReg(); if (!Rm) return 2; if (Rt == Rm) @@ -3384,8 +3425,8 @@ static unsigned getNumMicroOpsSwiftLdSt(const InstrItineraryData *ItinData, case ARM::LDR_POST_REG: case ARM::LDRB_POST_REG: case ARM::LDRH_POST: { - unsigned Rt = MI.getOperand(0).getReg(); - unsigned Rm = MI.getOperand(3).getReg(); + Register Rt = MI.getOperand(0).getReg(); + Register Rm = MI.getOperand(3).getReg(); return (Rt == Rm) ? 3 : 2; } @@ -3404,10 +3445,10 @@ static unsigned getNumMicroOpsSwiftLdSt(const InstrItineraryData *ItinData, case ARM::LDRSB_PRE: case ARM::LDRSH_PRE: { - unsigned Rm = MI.getOperand(3).getReg(); + Register Rm = MI.getOperand(3).getReg(); if (Rm == 0) return 3; - unsigned Rt = MI.getOperand(0).getReg(); + Register Rt = MI.getOperand(0).getReg(); if (Rt == Rm) return 4; unsigned ShOpVal = MI.getOperand(4).getImm(); @@ -3422,9 +3463,9 @@ static unsigned getNumMicroOpsSwiftLdSt(const InstrItineraryData *ItinData, } case ARM::LDRD: { - unsigned Rt = MI.getOperand(0).getReg(); - unsigned Rn = MI.getOperand(2).getReg(); - unsigned Rm = MI.getOperand(3).getReg(); + Register Rt = MI.getOperand(0).getReg(); + Register Rn = MI.getOperand(2).getReg(); + Register Rm = MI.getOperand(3).getReg(); if (Rm) return (ARM_AM::getAM3Op(MI.getOperand(4).getImm()) == ARM_AM::sub) ? 4 : 3; @@ -3432,7 +3473,7 @@ static unsigned getNumMicroOpsSwiftLdSt(const InstrItineraryData *ItinData, } case ARM::STRD: { - unsigned Rm = MI.getOperand(3).getReg(); + Register Rm = MI.getOperand(3).getReg(); if (Rm) return (ARM_AM::getAM3Op(MI.getOperand(4).getImm()) == ARM_AM::sub) ? 4 : 3; @@ -3448,9 +3489,9 @@ static unsigned getNumMicroOpsSwiftLdSt(const InstrItineraryData *ItinData, return 4; case ARM::LDRD_PRE: { - unsigned Rt = MI.getOperand(0).getReg(); - unsigned Rn = MI.getOperand(3).getReg(); - unsigned Rm = MI.getOperand(4).getReg(); + Register Rt = MI.getOperand(0).getReg(); + Register Rn = MI.getOperand(3).getReg(); + Register Rm = MI.getOperand(4).getReg(); if (Rm) return (ARM_AM::getAM3Op(MI.getOperand(5).getImm()) == ARM_AM::sub) ? 5 : 4; @@ -3458,13 +3499,13 @@ static unsigned getNumMicroOpsSwiftLdSt(const InstrItineraryData *ItinData, } case ARM::t2LDRD_PRE: { - unsigned Rt = MI.getOperand(0).getReg(); - unsigned Rn = MI.getOperand(3).getReg(); + Register Rt = MI.getOperand(0).getReg(); + Register Rn = MI.getOperand(3).getReg(); return (Rt == Rn) ? 4 : 3; } case ARM::STRD_PRE: { - unsigned Rm = MI.getOperand(4).getReg(); + Register Rm = MI.getOperand(4).getReg(); if (Rm) return (ARM_AM::getAM3Op(MI.getOperand(5).getImm()) == ARM_AM::sub) ? 5 : 4; @@ -3495,8 +3536,8 @@ static unsigned getNumMicroOpsSwiftLdSt(const InstrItineraryData *ItinData, return 2; case ARM::t2LDRDi8: { - unsigned Rt = MI.getOperand(0).getReg(); - unsigned Rn = MI.getOperand(2).getReg(); + Register Rt = MI.getOperand(0).getReg(); + Register Rn = MI.getOperand(2).getReg(); return (Rt == Rn) ? 
3 : 2; } @@ -3745,7 +3786,7 @@ ARMBaseInstrInfo::getVLDMDefCycle(const InstrItineraryData *ItinData, } bool ARMBaseInstrInfo::isLDMBaseRegInList(const MachineInstr &MI) const { - unsigned BaseReg = MI.getOperand(0).getReg(); + Register BaseReg = MI.getOperand(0).getReg(); for (unsigned i = 1, sz = MI.getNumOperands(); i < sz; ++i) { const auto &Op = MI.getOperand(i); if (Op.isReg() && Op.getReg() == BaseReg) @@ -4219,7 +4260,7 @@ int ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData, return -1; const MachineOperand &DefMO = DefMI.getOperand(DefIdx); - unsigned Reg = DefMO.getReg(); + Register Reg = DefMO.getReg(); const MachineInstr *ResolvedDefMI = &DefMI; unsigned DefAdj = 0; @@ -4328,10 +4369,10 @@ ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData, } const MCInstrDesc &UseMCID = get(UseNode->getMachineOpcode()); - const MachineSDNode *DefMN = dyn_cast(DefNode); + auto *DefMN = cast(DefNode); unsigned DefAlign = !DefMN->memoperands_empty() ? (*DefMN->memoperands_begin())->getAlignment() : 0; - const MachineSDNode *UseMN = dyn_cast(UseNode); + auto *UseMN = cast(UseNode); unsigned UseAlign = !UseMN->memoperands_empty() ? (*UseMN->memoperands_begin())->getAlignment() : 0; int Latency = getOperandLatency(ItinData, DefMCID, DefIdx, DefAlign, @@ -4708,7 +4749,7 @@ bool ARMBaseInstrInfo::verifyInstruction(const MachineInstr &MI, if (MI.getOperand(i).isImplicit() || !MI.getOperand(i).isReg()) continue; - unsigned Reg = MI.getOperand(i).getReg(); + Register Reg = MI.getOperand(i).getReg(); if (Reg < ARM::R0 || Reg > ARM::R7) { if (!(MI.getOpcode() == ARM::tPUSH && Reg == ARM::LR) && !(MI.getOpcode() == ARM::tPOP_RET && Reg == ARM::PC)) { @@ -4731,7 +4772,7 @@ void ARMBaseInstrInfo::expandLoadStackGuardBase(MachineBasicBlock::iterator MI, MachineBasicBlock &MBB = *MI->getParent(); DebugLoc DL = MI->getDebugLoc(); - unsigned Reg = MI->getOperand(0).getReg(); + Register Reg = MI->getOperand(0).getReg(); const GlobalValue *GV = cast((*MI->memoperands_begin())->getValue()); MachineInstrBuilder MIB; @@ -5104,7 +5145,7 @@ unsigned ARMBaseInstrInfo::getPartialRegUpdateClearance( const MachineOperand &MO = MI.getOperand(OpNum); if (MO.readsReg()) return 0; - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); int UseOp = -1; switch (MI.getOpcode()) { @@ -5134,7 +5175,7 @@ unsigned ARMBaseInstrInfo::getPartialRegUpdateClearance( return 0; // We must be able to clobber the whole D-reg. - if (TargetRegisterInfo::isVirtualRegister(Reg)) { + if (Register::isVirtualRegister(Reg)) { // Virtual register must be a def undef foo:ssub_0 operand. if (!MO.getSubReg() || MI.readsVirtualRegister(Reg)) return 0; @@ -5159,8 +5200,8 @@ void ARMBaseInstrInfo::breakPartialRegDependency( assert(TRI && "Need TRI instance"); const MachineOperand &MO = MI.getOperand(OpNum); - unsigned Reg = MO.getReg(); - assert(TargetRegisterInfo::isPhysicalRegister(Reg) && + Register Reg = MO.getReg(); + assert(Register::isPhysicalRegister(Reg) && "Can't break virtual register dependencies."); unsigned DReg = Reg; @@ -5337,7 +5378,7 @@ MachineInstr *llvm::findCMPToFoldIntoCBZ(MachineInstr *Br, // is not redefined between the cmp and the br. 
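The if-conversion size hooks added earlier in this file (extraSizeToPredicateInstructions / predictBranchSizeForIfCvt) model Thumb2's IT overhead: one 2-byte IT instruction predicates up to four instructions (plain ARM encodes the condition in every instruction, so predication costs nothing extra), and a 32-bit Thumb2 branch is assumed to shrink to 16 bits later unless it would fold into a CB(N)Z. A standalone sketch of that arithmetic; the function names here are illustrative, not the TargetInstrInfo hooks:

  #include <cassert>

  // Extra bytes needed to predicate NumInsts instructions.
  static unsigned extraBytesToPredicate(unsigned NumInsts, bool IsThumb2) {
    if (!IsThumb2)
      return 0;                      // ARM mode: condition field is free
    return (NumInsts + 3) / 4 * 2;   // divideCeil(NumInsts, 4) IT insts, 2 bytes each
  }

  // Predicted size of a conditional branch after later size reduction.
  static unsigned predictedBranchSize(unsigned EncodedSize, bool IsThumb2,
                                      bool FoldsIntoCBZ) {
    if (FoldsIntoCBZ)
      return 0;                      // removing it just turns a CB(N)Z into a CMP
    return IsThumb2 ? EncodedSize / 2 : EncodedSize;
  }

  int main() {
    assert(extraBytesToPredicate(5, /*IsThumb2=*/true) == 4);  // two IT blocks
    assert(extraBytesToPredicate(5, /*IsThumb2=*/false) == 0); // ARM mode
    assert(predictedBranchSize(4, true, false) == 2);
    return 0;
  }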
if (CmpMI->getOpcode() != ARM::tCMPi8 && CmpMI->getOpcode() != ARM::t2CMPri) return nullptr; - unsigned Reg = CmpMI->getOperand(0).getReg(); + Register Reg = CmpMI->getOperand(0).getReg(); unsigned PredReg = 0; ARMCC::CondCodes Pred = getInstrPredicate(*CmpMI, PredReg); if (Pred != ARMCC::AL || CmpMI->getOperand(1).getImm() != 0) @@ -5349,3 +5390,50 @@ MachineInstr *llvm::findCMPToFoldIntoCBZ(MachineInstr *Br, return &*CmpMI; } + +unsigned llvm::ConstantMaterializationCost(unsigned Val, + const ARMSubtarget *Subtarget, + bool ForCodesize) { + if (Subtarget->isThumb()) { + if (Val <= 255) // MOV + return ForCodesize ? 2 : 1; + if (Subtarget->hasV6T2Ops() && (Val <= 0xffff || // MOV + ARM_AM::getT2SOImmVal(Val) != -1 || // MOVW + ARM_AM::getT2SOImmVal(~Val) != -1)) // MVN + return ForCodesize ? 4 : 1; + if (Val <= 510) // MOV + ADDi8 + return ForCodesize ? 4 : 2; + if (~Val <= 255) // MOV + MVN + return ForCodesize ? 4 : 2; + if (ARM_AM::isThumbImmShiftedVal(Val)) // MOV + LSL + return ForCodesize ? 4 : 2; + } else { + if (ARM_AM::getSOImmVal(Val) != -1) // MOV + return ForCodesize ? 4 : 1; + if (ARM_AM::getSOImmVal(~Val) != -1) // MVN + return ForCodesize ? 4 : 1; + if (Subtarget->hasV6T2Ops() && Val <= 0xffff) // MOVW + return ForCodesize ? 4 : 1; + if (ARM_AM::isSOImmTwoPartVal(Val)) // two instrs + return ForCodesize ? 8 : 2; + } + if (Subtarget->useMovt()) // MOVW + MOVT + return ForCodesize ? 8 : 2; + return ForCodesize ? 8 : 3; // Literal pool load +} + +bool llvm::HasLowerConstantMaterializationCost(unsigned Val1, unsigned Val2, + const ARMSubtarget *Subtarget, + bool ForCodesize) { + // Check with ForCodesize + unsigned Cost1 = ConstantMaterializationCost(Val1, Subtarget, ForCodesize); + unsigned Cost2 = ConstantMaterializationCost(Val2, Subtarget, ForCodesize); + if (Cost1 < Cost2) + return true; + if (Cost1 > Cost2) + return false; + + // If they are equal, try with !ForCodesize + return ConstantMaterializationCost(Val1, Subtarget, !ForCodesize) < + ConstantMaterializationCost(Val2, Subtarget, !ForCodesize); +} diff --git a/lib/Target/ARM/ARMBaseInstrInfo.h b/lib/Target/ARM/ARMBaseInstrInfo.h index c28983fcc15c..c232b6f0b45d 100644 --- a/lib/Target/ARM/ARMBaseInstrInfo.h +++ b/lib/Target/ARM/ARMBaseInstrInfo.h @@ -276,6 +276,10 @@ public: return NumCycles == 1; } + unsigned extraSizeToPredicateInstructions(const MachineFunction &MF, + unsigned NumInsts) const override; + unsigned predictBranchSizeForIfCvt(MachineInstr &MI) const override; + bool isProfitableToUnpredicate(MachineBasicBlock &TMBB, MachineBasicBlock &FMBB) const override; @@ -601,7 +605,8 @@ bool rewriteARMFrameIndex(MachineInstr &MI, unsigned FrameRegIdx, bool rewriteT2FrameIndex(MachineInstr &MI, unsigned FrameRegIdx, unsigned FrameReg, int &Offset, - const ARMBaseInstrInfo &TII); + const ARMBaseInstrInfo &TII, + const TargetRegisterInfo *TRI); /// Return true if Reg is defd between From and To bool registerDefinedBetween(unsigned Reg, MachineBasicBlock::iterator From, @@ -620,6 +625,20 @@ void addPredicatedMveVpredNOp(MachineInstrBuilder &MIB, unsigned Cond); void addPredicatedMveVpredROp(MachineInstrBuilder &MIB, unsigned Cond, unsigned Inactive); +/// Returns the number of instructions required to materialize the given +/// constant in a register, or 3 if a literal pool load is needed. +/// If ForCodesize is specified, an approximate cost in bytes is returned. 
+unsigned ConstantMaterializationCost(unsigned Val, + const ARMSubtarget *Subtarget, + bool ForCodesize = false); + +/// Returns true if Val1 has a lower Constant Materialization Cost than Val2. +/// Uses the cost from ConstantMaterializationCost, first with ForCodesize as +/// specified. If the scores are equal, return the comparison for !ForCodesize. +bool HasLowerConstantMaterializationCost(unsigned Val1, unsigned Val2, + const ARMSubtarget *Subtarget, + bool ForCodesize = false); + } // end namespace llvm #endif // LLVM_LIB_TARGET_ARM_ARMBASEINSTRINFO_H diff --git a/lib/Target/ARM/ARMBaseRegisterInfo.cpp b/lib/Target/ARM/ARMBaseRegisterInfo.cpp index dc99b37742da..1eaf871867e0 100644 --- a/lib/Target/ARM/ARMBaseRegisterInfo.cpp +++ b/lib/Target/ARM/ARMBaseRegisterInfo.cpp @@ -174,6 +174,12 @@ ARMBaseRegisterInfo::getThisReturnPreservedMask(const MachineFunction &MF, : CSR_AAPCS_ThisReturn_RegMask; } +ArrayRef ARMBaseRegisterInfo::getIntraCallClobberedRegs( + const MachineFunction *MF) const { + static const MCPhysReg IntraCallClobberedRegs[] = {ARM::R12}; + return ArrayRef(IntraCallClobberedRegs); +} + BitVector ARMBaseRegisterInfo:: getReservedRegs(const MachineFunction &MF) const { const ARMSubtarget &STI = MF.getSubtarget(); @@ -185,7 +191,7 @@ getReservedRegs(const MachineFunction &MF) const { markSuperRegs(Reserved, ARM::PC); markSuperRegs(Reserved, ARM::FPSCR); markSuperRegs(Reserved, ARM::APSR_NZCV); - if (TFI->hasFP(MF)) + if (TFI->hasFP(MF) || STI.isTargetDarwin()) markSuperRegs(Reserved, getFramePointerReg(STI)); if (hasBasePointer(MF)) markSuperRegs(Reserved, BasePtr); @@ -217,7 +223,7 @@ isAsmClobberable(const MachineFunction &MF, unsigned PhysReg) const { const TargetRegisterClass * ARMBaseRegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC, - const MachineFunction &) const { + const MachineFunction &MF) const { const TargetRegisterClass *Super = RC; TargetRegisterClass::sc_iterator I = RC->getSuperClasses(); do { @@ -225,11 +231,13 @@ ARMBaseRegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC, case ARM::GPRRegClassID: case ARM::SPRRegClassID: case ARM::DPRRegClassID: + case ARM::GPRPairRegClassID: + return Super; case ARM::QPRRegClassID: case ARM::QQPRRegClassID: case ARM::QQQQPRRegClassID: - case ARM::GPRPairRegClassID: - return Super; + if (MF.getSubtarget().hasNEON()) + return Super; } Super = *I++; } while (Super); @@ -317,7 +325,7 @@ ARMBaseRegisterInfo::getRegAllocationHints(unsigned VirtReg, return false; unsigned PairedPhys = 0; - if (TargetRegisterInfo::isPhysicalRegister(Paired)) { + if (Register::isPhysicalRegister(Paired)) { PairedPhys = Paired; } else if (VRM && VRM->hasPhys(Paired)) { PairedPhys = getPairedGPR(VRM->getPhys(Paired), Odd, this); @@ -347,7 +355,7 @@ ARMBaseRegisterInfo::updateRegAllocHint(unsigned Reg, unsigned NewReg, std::pair Hint = MRI->getRegAllocationHint(Reg); if ((Hint.first == (unsigned)ARMRI::RegPairOdd || Hint.first == (unsigned)ARMRI::RegPairEven) && - TargetRegisterInfo::isVirtualRegister(Hint.second)) { + Register::isVirtualRegister(Hint.second)) { // If 'Reg' is one of the even / odd register pair and it's now changed // (e.g. coalesced) into a different register. The other register of the // pair allocation hint must be updated to reflect the relationship @@ -357,7 +365,7 @@ ARMBaseRegisterInfo::updateRegAllocHint(unsigned Reg, unsigned NewReg, // Make sure the pair has not already divorced. 
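ConstantMaterializationCost / HasLowerConstantMaterializationCost, added in the ARMBaseInstrInfo hunks above, score an immediate either in instructions or (with ForCodesize) in approximate bytes, and break ties using the other metric. A reduced standalone sketch covering only a few of the Thumb cases shown above; the real encodability checks live in ARM_AM and are omitted here:

  #include <cassert>

  // Simplified cost model: bytes when ForCodesize is true, otherwise an
  // instruction count. Everything not modelled falls back to a pool load.
  static unsigned thumbImmCostSketch(unsigned Val, bool ForCodesize) {
    if (Val <= 255)                  // single 16-bit MOV
      return ForCodesize ? 2 : 1;
    if (Val <= 510)                  // MOV + ADDS #imm8
      return ForCodesize ? 4 : 2;
    if (~Val <= 255)                 // MOV + MVN
      return ForCodesize ? 4 : 2;
    return ForCodesize ? 8 : 3;      // literal-pool load (worst case here)
  }

  // Mirrors the tie-breaking in HasLowerConstantMaterializationCost: compare
  // with the requested metric first, then with the other one.
  static bool cheaperToMaterialize(unsigned A, unsigned B, bool ForCodesize) {
    unsigned CA = thumbImmCostSketch(A, ForCodesize);
    unsigned CB = thumbImmCostSketch(B, ForCodesize);
    if (CA != CB)
      return CA < CB;
    return thumbImmCostSketch(A, !ForCodesize) < thumbImmCostSketch(B, !ForCodesize);
  }

  int main() {
    assert(thumbImmCostSketch(200, /*ForCodesize=*/true) == 2);
    assert(thumbImmCostSketch(400, /*ForCodesize=*/false) == 2);
    assert(cheaperToMaterialize(100, 400, /*ForCodesize=*/true));
    return 0;
  }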
if (Hint.second == Reg) { MRI->setRegAllocationHint(OtherReg, Hint.first, NewReg); - if (TargetRegisterInfo::isVirtualRegister(NewReg)) + if (Register::isVirtualRegister(NewReg)) MRI->setRegAllocationHint(NewReg, Hint.first == (unsigned)ARMRI::RegPairOdd ? ARMRI::RegPairEven : ARMRI::RegPairOdd, OtherReg); @@ -663,7 +671,7 @@ void ARMBaseRegisterInfo::resolveFrameIndex(MachineInstr &MI, unsigned BaseReg, Done = rewriteARMFrameIndex(MI, i, BaseReg, Off, TII); else { assert(AFI->isThumb2Function()); - Done = rewriteT2FrameIndex(MI, i, BaseReg, Off, TII); + Done = rewriteT2FrameIndex(MI, i, BaseReg, Off, TII, this); } assert(Done && "Unable to resolve frame index!"); (void)Done; @@ -775,7 +783,7 @@ ARMBaseRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, Done = rewriteARMFrameIndex(MI, FIOperandNum, FrameReg, Offset, TII); else { assert(AFI->isThumb2Function()); - Done = rewriteT2FrameIndex(MI, FIOperandNum, FrameReg, Offset, TII); + Done = rewriteT2FrameIndex(MI, FIOperandNum, FrameReg, Offset, TII, this); } if (Done) return; @@ -783,21 +791,32 @@ ARMBaseRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, // If we get here, the immediate doesn't fit into the instruction. We folded // as much as possible above, handle the rest, providing a register that is // SP+LargeImm. - assert((Offset || - (MI.getDesc().TSFlags & ARMII::AddrModeMask) == ARMII::AddrMode4 || - (MI.getDesc().TSFlags & ARMII::AddrModeMask) == ARMII::AddrMode6) && - "This code isn't needed if offset already handled!"); + assert( + (Offset || + (MI.getDesc().TSFlags & ARMII::AddrModeMask) == ARMII::AddrMode4 || + (MI.getDesc().TSFlags & ARMII::AddrModeMask) == ARMII::AddrMode6 || + (MI.getDesc().TSFlags & ARMII::AddrModeMask) == ARMII::AddrModeT2_i7 || + (MI.getDesc().TSFlags & ARMII::AddrModeMask) == ARMII::AddrModeT2_i7s2 || + (MI.getDesc().TSFlags & ARMII::AddrModeMask) == + ARMII::AddrModeT2_i7s4) && + "This code isn't needed if offset already handled!"); unsigned ScratchReg = 0; int PIdx = MI.findFirstPredOperandIdx(); ARMCC::CondCodes Pred = (PIdx == -1) ? ARMCC::AL : (ARMCC::CondCodes)MI.getOperand(PIdx).getImm(); Register PredReg = (PIdx == -1) ? Register() : MI.getOperand(PIdx+1).getReg(); - if (Offset == 0) + + const MCInstrDesc &MCID = MI.getDesc(); + const TargetRegisterClass *RegClass = + TII.getRegClass(MCID, FIOperandNum, this, *MI.getParent()->getParent()); + + if (Offset == 0 && + (Register::isVirtualRegister(FrameReg) || RegClass->contains(FrameReg))) // Must be addrmode4/6. 
MI.getOperand(FIOperandNum).ChangeToRegister(FrameReg, false, false, false); else { - ScratchReg = MF.getRegInfo().createVirtualRegister(&ARM::GPRRegClass); + ScratchReg = MF.getRegInfo().createVirtualRegister(RegClass); if (!AFI->isThumbFunction()) emitARMRegPlusImmediate(MBB, II, MI.getDebugLoc(), ScratchReg, FrameReg, Offset, Pred, PredReg, TII); diff --git a/lib/Target/ARM/ARMBaseRegisterInfo.h b/lib/Target/ARM/ARMBaseRegisterInfo.h index 7e2c72b4d712..477f3ad0a9a7 100644 --- a/lib/Target/ARM/ARMBaseRegisterInfo.h +++ b/lib/Target/ARM/ARMBaseRegisterInfo.h @@ -129,6 +129,9 @@ public: const uint32_t *getThisReturnPreservedMask(const MachineFunction &MF, CallingConv::ID) const; + ArrayRef + getIntraCallClobberedRegs(const MachineFunction *MF) const override; + BitVector getReservedRegs(const MachineFunction &MF) const override; bool isAsmClobberable(const MachineFunction &MF, unsigned PhysReg) const override; @@ -176,8 +179,6 @@ public: Register getFrameRegister(const MachineFunction &MF) const override; unsigned getBaseRegister() const { return BasePtr; } - bool isLowRegister(unsigned Reg) const; - /// emitLoadConstPool - Emits a load from constpool to materialize the /// specified immediate. diff --git a/lib/Target/ARM/ARMBasicBlockInfo.cpp b/lib/Target/ARM/ARMBasicBlockInfo.cpp index 2de90e816b33..00a2231f59e3 100644 --- a/lib/Target/ARM/ARMBasicBlockInfo.cpp +++ b/lib/Target/ARM/ARMBasicBlockInfo.cpp @@ -6,14 +6,16 @@ // //===----------------------------------------------------------------------===// +#include "ARMBasicBlockInfo.h" #include "ARM.h" #include "ARMBaseInstrInfo.h" -#include "ARMBasicBlockInfo.h" #include "ARMMachineFunctionInfo.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" +#include "llvm/IR/GlobalVariable.h" +#include "llvm/Support/Debug.h" #include #define DEBUG_TYPE "arm-bb-utils" @@ -47,7 +49,7 @@ void ARMBasicBlockUtils::computeBlockSize(MachineBasicBlock *MBB) { BasicBlockInfo &BBI = BBInfo[MBB->getNumber()]; BBI.Size = 0; BBI.Unalign = 0; - BBI.PostAlign = 0; + BBI.PostAlign = Align::None(); for (MachineInstr &I : *MBB) { BBI.Size += TII->getInstSizeInBytes(I); @@ -62,8 +64,8 @@ void ARMBasicBlockUtils::computeBlockSize(MachineBasicBlock *MBB) { // tBR_JTr contains a .align 2 directive. if (!MBB->empty() && MBB->back().getOpcode() == ARM::tBR_JTr) { - BBI.PostAlign = 2; - MBB->getParent()->ensureAlignment(2); + BBI.PostAlign = Align(4); + MBB->getParent()->ensureAlignment(Align(4)); } } @@ -126,9 +128,9 @@ void ARMBasicBlockUtils::adjustBBOffsetsAfter(MachineBasicBlock *BB) { for(unsigned i = BBNum + 1, e = MF.getNumBlockIDs(); i < e; ++i) { // Get the offset and known bits at the end of the layout predecessor. // Include the alignment of the current block. - unsigned LogAlign = MF.getBlockNumbered(i)->getAlignment(); - unsigned Offset = BBInfo[i - 1].postOffset(LogAlign); - unsigned KnownBits = BBInfo[i - 1].postKnownBits(LogAlign); + const Align Align = MF.getBlockNumbered(i)->getAlignment(); + const unsigned Offset = BBInfo[i - 1].postOffset(Align); + const unsigned KnownBits = BBInfo[i - 1].postKnownBits(Align); // This is where block i begins. Stop if the offset is already correct, // and we have updated 2 blocks. 
This is the maximum number of blocks diff --git a/lib/Target/ARM/ARMBasicBlockInfo.h b/lib/Target/ARM/ARMBasicBlockInfo.h index 400bba351cec..13df399ed995 100644 --- a/lib/Target/ARM/ARMBasicBlockInfo.h +++ b/lib/Target/ARM/ARMBasicBlockInfo.h @@ -21,17 +21,18 @@ namespace llvm { +struct BasicBlockInfo; using BBInfoVector = SmallVectorImpl; /// UnknownPadding - Return the worst case padding that could result from /// unknown offset bits. This does not include alignment padding caused by /// known offset bits. /// -/// @param LogAlign log2(alignment) +/// @param Alignment alignment /// @param KnownBits Number of known low offset bits. -inline unsigned UnknownPadding(unsigned LogAlign, unsigned KnownBits) { - if (KnownBits < LogAlign) - return (1u << LogAlign) - (1u << KnownBits); +inline unsigned UnknownPadding(Align Alignment, unsigned KnownBits) { + if (KnownBits < Log2(Alignment)) + return Alignment.value() - (1ull << KnownBits); return 0; } @@ -65,10 +66,9 @@ struct BasicBlockInfo { /// multiple of 1 << Unalign. uint8_t Unalign = 0; - /// PostAlign - When non-zero, the block terminator contains a .align - /// directive, so the end of the block is aligned to 1 << PostAlign - /// bytes. - uint8_t PostAlign = 0; + /// PostAlign - When > 1, the block terminator contains a .align + /// directive, so the end of the block is aligned to PostAlign bytes. + Align PostAlign; BasicBlockInfo() = default; @@ -84,16 +84,16 @@ struct BasicBlockInfo { return Bits; } - /// Compute the offset immediately following this block. If LogAlign is + /// Compute the offset immediately following this block. If Align is /// specified, return the offset the successor block will get if it has /// this alignment. - unsigned postOffset(unsigned LogAlign = 0) const { + unsigned postOffset(Align Alignment = Align::None()) const { unsigned PO = Offset + Size; - unsigned LA = std::max(unsigned(PostAlign), LogAlign); - if (!LA) + const Align PA = std::max(PostAlign, Alignment); + if (PA == Align::None()) return PO; // Add alignment padding from the terminator. - return PO + UnknownPadding(LA, internalKnownBits()); + return PO + UnknownPadding(PA, internalKnownBits()); } /// Compute the number of known low bits of postOffset. If this block @@ -101,9 +101,8 @@ struct BasicBlockInfo { /// instruction alignment. An aligned terminator may increase the number /// of know bits. /// If LogAlign is given, also consider the alignment of the next block. 
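The ARMBasicBlockInfo changes above move from log2-encoded alignments to the Align type; the worst-case padding in front of a block is its byte alignment minus whatever the known low offset bits already guarantee. A small standalone version of that arithmetic, with plain integers standing in for llvm::Align:

  #include <cassert>
  #include <cstdint>

  // Worst-case padding before a block aligned to AlignBytes (a power of two)
  // when only the low KnownBits bits of the current offset are known to be
  // zero. Mirrors the UnknownPadding() computation above.
  static unsigned worstCasePadding(uint64_t AlignBytes, unsigned KnownBits) {
    unsigned LogAlign = 0;
    while ((1ull << LogAlign) < AlignBytes)
      ++LogAlign;                        // log2 of the alignment
    if (KnownBits < LogAlign)
      return AlignBytes - (1ull << KnownBits);
    return 0;                            // offset already known to be aligned
  }

  int main() {
    // 4-byte aligned block, one low bit known zero: up to 4 - 2 = 2 bytes pad.
    assert(worstCasePadding(4, 1) == 2);
    // Offset already known to be 4-byte aligned: no padding possible.
    assert(worstCasePadding(4, 2) == 0);
    return 0;
  }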
- unsigned postKnownBits(unsigned LogAlign = 0) const { - return std::max(std::max(unsigned(PostAlign), LogAlign), - internalKnownBits()); + unsigned postKnownBits(Align Align = Align::None()) const { + return std::max(Log2(std::max(PostAlign, Align)), internalKnownBits()); } }; diff --git a/lib/Target/ARM/ARMCallLowering.cpp b/lib/Target/ARM/ARMCallLowering.cpp index 0cbe6e1871e4..d3b595ce8323 100644 --- a/lib/Target/ARM/ARMCallLowering.cpp +++ b/lib/Target/ARM/ARMCallLowering.cpp @@ -90,6 +90,8 @@ struct OutgoingValueHandler : public CallLowering::ValueHandler { MachineInstrBuilder &MIB, CCAssignFn *AssignFn) : ValueHandler(MIRBuilder, MRI, AssignFn), MIB(MIB) {} + bool isIncomingArgumentHandler() const override { return false; } + Register getStackAddress(uint64_t Size, int64_t Offset, MachinePointerInfo &MPO) override { assert((Size == 1 || Size == 2 || Size == 4 || Size == 8) && @@ -169,8 +171,9 @@ struct OutgoingValueHandler : public CallLowering::ValueHandler { bool assignArg(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, - const CallLowering::ArgInfo &Info, CCState &State) override { - if (AssignFn(ValNo, ValVT, LocVT, LocInfo, Info.Flags, State)) + const CallLowering::ArgInfo &Info, ISD::ArgFlagsTy Flags, + CCState &State) override { + if (AssignFn(ValNo, ValVT, LocVT, LocInfo, Flags, State)) return true; StackSize = @@ -199,9 +202,8 @@ void ARMCallLowering::splitToValueTypes(const ArgInfo &OrigArg, if (SplitVTs.size() == 1) { // Even if there is no splitting to do, we still want to replace the // original type (e.g. pointer type -> integer). - auto Flags = OrigArg.Flags; - unsigned OriginalAlignment = DL.getABITypeAlignment(OrigArg.Ty); - Flags.setOrigAlign(OriginalAlignment); + auto Flags = OrigArg.Flags[0]; + Flags.setOrigAlign(Align(DL.getABITypeAlignment(OrigArg.Ty))); SplitArgs.emplace_back(OrigArg.Regs[0], SplitVTs[0].getTypeForEVT(Ctx), Flags, OrigArg.IsFixed); return; @@ -211,10 +213,9 @@ void ARMCallLowering::splitToValueTypes(const ArgInfo &OrigArg, for (unsigned i = 0, e = SplitVTs.size(); i != e; ++i) { EVT SplitVT = SplitVTs[i]; Type *SplitTy = SplitVT.getTypeForEVT(Ctx); - auto Flags = OrigArg.Flags; + auto Flags = OrigArg.Flags[0]; - unsigned OriginalAlignment = DL.getABITypeAlignment(SplitTy); - Flags.setOrigAlign(OriginalAlignment); + Flags.setOrigAlign(Align(DL.getABITypeAlignment(SplitTy))); bool NeedsConsecutiveRegisters = TLI.functionArgumentNeedsConsecutiveRegisters( @@ -286,7 +287,7 @@ struct IncomingValueHandler : public CallLowering::ValueHandler { CCAssignFn AssignFn) : ValueHandler(MIRBuilder, MRI, AssignFn) {} - bool isArgumentHandler() const override { return true; } + bool isIncomingArgumentHandler() const override { return true; } Register getStackAddress(uint64_t Size, int64_t Offset, MachinePointerInfo &MPO) override { @@ -298,7 +299,7 @@ struct IncomingValueHandler : public CallLowering::ValueHandler { int FI = MFI.CreateFixedObject(Size, Offset, true); MPO = MachinePointerInfo::getFixedStack(MIRBuilder.getMF(), FI); - unsigned AddrReg = + Register AddrReg = MRI.createGenericVirtualRegister(LLT::pointer(MPO.getAddrSpace(), 32)); MIRBuilder.buildFrameIndex(AddrReg, FI); @@ -405,6 +406,7 @@ struct FormalArgHandler : public IncomingValueHandler { : IncomingValueHandler(MIRBuilder, MRI, AssignFn) {} void markPhysRegUsed(unsigned PhysReg) override { + MIRBuilder.getMRI()->addLiveIn(PhysReg); MIRBuilder.getMBB().addLiveIn(PhysReg); } }; @@ -498,11 +500,7 @@ unsigned getCallOpcode(const ARMSubtarget &STI, bool isDirect) { } } // end 
anonymous namespace -bool ARMCallLowering::lowerCall(MachineIRBuilder &MIRBuilder, - CallingConv::ID CallConv, - const MachineOperand &Callee, - const ArgInfo &OrigRet, - ArrayRef OrigArgs) const { +bool ARMCallLowering::lowerCall(MachineIRBuilder &MIRBuilder, CallLoweringInfo &Info) const { MachineFunction &MF = MIRBuilder.getMF(); const auto &TLI = *getTLI(); const auto &DL = MF.getDataLayout(); @@ -520,7 +518,7 @@ bool ARMCallLowering::lowerCall(MachineIRBuilder &MIRBuilder, // Create the call instruction so we can add the implicit uses of arg // registers, but don't insert it yet. - bool IsDirect = !Callee.isReg(); + bool IsDirect = !Info.Callee.isReg(); auto CallOpcode = getCallOpcode(STI, IsDirect); auto MIB = MIRBuilder.buildInstrNoInsert(CallOpcode); @@ -528,35 +526,35 @@ bool ARMCallLowering::lowerCall(MachineIRBuilder &MIRBuilder, if (IsThumb) MIB.add(predOps(ARMCC::AL)); - MIB.add(Callee); + MIB.add(Info.Callee); if (!IsDirect) { - auto CalleeReg = Callee.getReg(); - if (CalleeReg && !TRI->isPhysicalRegister(CalleeReg)) { + auto CalleeReg = Info.Callee.getReg(); + if (CalleeReg && !Register::isPhysicalRegister(CalleeReg)) { unsigned CalleeIdx = IsThumb ? 2 : 0; MIB->getOperand(CalleeIdx).setReg(constrainOperandRegClass( MF, *TRI, MRI, *STI.getInstrInfo(), *STI.getRegBankInfo(), - *MIB.getInstr(), MIB->getDesc(), Callee, CalleeIdx)); + *MIB.getInstr(), MIB->getDesc(), Info.Callee, CalleeIdx)); } } - MIB.addRegMask(TRI->getCallPreservedMask(MF, CallConv)); + MIB.addRegMask(TRI->getCallPreservedMask(MF, Info.CallConv)); bool IsVarArg = false; SmallVector ArgInfos; - for (auto Arg : OrigArgs) { + for (auto Arg : Info.OrigArgs) { if (!isSupportedType(DL, TLI, Arg.Ty)) return false; if (!Arg.IsFixed) IsVarArg = true; - if (Arg.Flags.isByVal()) + if (Arg.Flags[0].isByVal()) return false; splitToValueTypes(Arg, ArgInfos, MF); } - auto ArgAssignFn = TLI.CCAssignFnForCall(CallConv, IsVarArg); + auto ArgAssignFn = TLI.CCAssignFnForCall(Info.CallConv, IsVarArg); OutgoingValueHandler ArgHandler(MIRBuilder, MRI, MIB, ArgAssignFn); if (!handleAssignments(MIRBuilder, ArgInfos, ArgHandler)) return false; @@ -564,13 +562,13 @@ bool ARMCallLowering::lowerCall(MachineIRBuilder &MIRBuilder, // Now we can add the actual call instruction to the correct basic block. 
MIRBuilder.insertInstr(MIB); - if (!OrigRet.Ty->isVoidTy()) { - if (!isSupportedType(DL, TLI, OrigRet.Ty)) + if (!Info.OrigRet.Ty->isVoidTy()) { + if (!isSupportedType(DL, TLI, Info.OrigRet.Ty)) return false; ArgInfos.clear(); - splitToValueTypes(OrigRet, ArgInfos, MF); - auto RetAssignFn = TLI.CCAssignFnForReturn(CallConv, IsVarArg); + splitToValueTypes(Info.OrigRet, ArgInfos, MF); + auto RetAssignFn = TLI.CCAssignFnForReturn(Info.CallConv, IsVarArg); CallReturnHandler RetHandler(MIRBuilder, MRI, MIB, RetAssignFn); if (!handleAssignments(MIRBuilder, ArgInfos, RetHandler)) return false; diff --git a/lib/Target/ARM/ARMCallLowering.h b/lib/Target/ARM/ARMCallLowering.h index 794127b5ebc7..ddbc9feb90e2 100644 --- a/lib/Target/ARM/ARMCallLowering.h +++ b/lib/Target/ARM/ARMCallLowering.h @@ -38,9 +38,8 @@ public: bool lowerFormalArguments(MachineIRBuilder &MIRBuilder, const Function &F, ArrayRef> VRegs) const override; - bool lowerCall(MachineIRBuilder &MIRBuilder, CallingConv::ID CallConv, - const MachineOperand &Callee, const ArgInfo &OrigRet, - ArrayRef OrigArgs) const override; + bool lowerCall(MachineIRBuilder &MIRBuilder, + CallLoweringInfo &Info) const override; private: bool lowerReturnVal(MachineIRBuilder &MIRBuilder, const Value *Val, diff --git a/lib/Target/ARM/ARMCallingConv.cpp b/lib/Target/ARM/ARMCallingConv.cpp index 5ede7c67f7c2..92ebc542b423 100644 --- a/lib/Target/ARM/ARMCallingConv.cpp +++ b/lib/Target/ARM/ARMCallingConv.cpp @@ -193,7 +193,7 @@ static bool CC_ARM_AAPCS_Custom_Aggregate(unsigned &ValNo, MVT &ValVT, // Try to allocate a contiguous block of registers, each of the correct // size to hold one member. auto &DL = State.getMachineFunction().getDataLayout(); - unsigned StackAlign = DL.getStackAlignment(); + unsigned StackAlign = DL.getStackAlignment().value(); unsigned Align = std::min(PendingMembers[0].getExtraInfo(), StackAlign); ArrayRef RegList; diff --git a/lib/Target/ARM/ARMCodeGenPrepare.cpp b/lib/Target/ARM/ARMCodeGenPrepare.cpp index 2fc5f4aaab50..1c2c8aef55bb 100644 --- a/lib/Target/ARM/ARMCodeGenPrepare.cpp +++ b/lib/Target/ARM/ARMCodeGenPrepare.cpp @@ -179,16 +179,12 @@ public: } static bool GenerateSignBits(Value *V) { - if (auto *Arg = dyn_cast(V)) - return Arg->hasSExtAttr(); - if (!isa(V)) return false; unsigned Opc = cast(V)->getOpcode(); return Opc == Instruction::AShr || Opc == Instruction::SDiv || - Opc == Instruction::SRem || Opc == Instruction::SExt || - Opc == Instruction::SIToFP; + Opc == Instruction::SRem || Opc == Instruction::SExt; } static bool EqualTypeSize(Value *V) { @@ -806,54 +802,48 @@ void IRPromoter::Mutate(Type *OrigTy, /// return value is zeroext. We don't allow opcodes that can introduce sign /// bits. bool ARMCodeGenPrepare::isSupportedValue(Value *V) { - if (auto *I = dyn_cast(V)) { - // Now that we allow small types than TypeSize, only allow icmp of - // TypeSize because they will require a trunc to be legalised. - // TODO: Allow icmp of smaller types, and calculate at the end - // whether the transform would be beneficial. 
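The ARMCallLowering change above collapses lowerCall's separate callee, return and argument parameters into a single CallLoweringInfo descriptor. A generic sketch of that parameter-object shape; the types and field names are purely illustrative, not GlobalISel's:

  #include <string>
  #include <vector>

  // One descriptor instead of several positional parameters: easy to extend
  // (e.g. with tail-call flags) without touching every target override.
  struct ArgSketch {
    std::string Name;
    unsigned SizeInBits = 0;
    bool IsFixed = true;        // false for varargs
  };

  struct CallDescriptorSketch {
    unsigned CallConv = 0;
    std::string Callee;         // direct symbol or a register name
    ArgSketch ReturnValue;
    std::vector<ArgSketch> Args;
  };

  static bool lowerCallSketch(const CallDescriptorSketch &Info) {
    for (const ArgSketch &A : Info.Args)
      if (A.SizeInBits > 64)
        return false;           // pretend wide by-value args are unsupported
    return !Info.Callee.empty();
  }

  int main() {
    CallDescriptorSketch Info;
    Info.Callee = "memcpy";
    Info.Args = {{"dst", 32}, {"src", 32}, {"n", 32}};
    return lowerCallSketch(Info) ? 0 : 1;
  }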
- if (isa(I->getOperand(0)->getType())) + if (auto *I = dyn_cast(V)) { + switch (I->getOpcode()) { + default: + return isa(I) && isSupportedType(I) && + !GenerateSignBits(I); + case Instruction::GetElementPtr: + case Instruction::Store: + case Instruction::Br: + case Instruction::Switch: return true; - return EqualTypeSize(I->getOperand(0)); - } - - if (GenerateSignBits(V)) { - LLVM_DEBUG(dbgs() << "ARM CGP: No, instruction can generate sign bits.\n"); - return false; - } - - // Memory instructions - if (isa(V) || isa(V)) - return true; - - // Branches and targets. - if( isa(V) || isa(V) || isa(V)) - return true; - - // Non-instruction values that we can handle. - if ((isa(V) && !isa(V)) || isa(V)) + case Instruction::PHI: + case Instruction::Select: + case Instruction::Ret: + case Instruction::Load: + case Instruction::Trunc: + case Instruction::BitCast: + return isSupportedType(I); + case Instruction::ZExt: + return isSupportedType(I->getOperand(0)); + case Instruction::ICmp: + // Now that we allow small types than TypeSize, only allow icmp of + // TypeSize because they will require a trunc to be legalised. + // TODO: Allow icmp of smaller types, and calculate at the end + // whether the transform would be beneficial. + if (isa(I->getOperand(0)->getType())) + return true; + return EqualTypeSize(I->getOperand(0)); + case Instruction::Call: { + // Special cases for calls as we need to check for zeroext + // TODO We should accept calls even if they don't have zeroext, as they + // can still be sinks. + auto *Call = cast(I); + return isSupportedType(Call) && + Call->hasRetAttr(Attribute::AttrKind::ZExt); + } + } + } else if (isa(V) && !isa(V)) { return isSupportedType(V); - - if (isa(V) || isa(V) || isa(V) || - isa(V)) + } else if (isa(V)) return isSupportedType(V); - if (auto *Cast = dyn_cast(V)) - return isSupportedType(Cast) || isSupportedType(Cast->getOperand(0)); - - // Special cases for calls as we need to check for zeroext - // TODO We should accept calls even if they don't have zeroext, as they can - // still be sinks. 
- if (auto *Call = dyn_cast(V)) - return isSupportedType(Call) && - Call->hasRetAttr(Attribute::AttrKind::ZExt); - - if (!isa(V)) - return false; - - if (!isSupportedType(V)) - return false; - - return true; + return isa(V); } /// Check that the type of V would be promoted and that the original type is diff --git a/lib/Target/ARM/ARMConstantIslandPass.cpp b/lib/Target/ARM/ARMConstantIslandPass.cpp index 60e5d7bf6098..24ca25f73e96 100644 --- a/lib/Target/ARM/ARMConstantIslandPass.cpp +++ b/lib/Target/ARM/ARMConstantIslandPass.cpp @@ -26,8 +26,10 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringRef.h" +#include "llvm/CodeGen/LivePhysRegs.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineConstantPool.h" +#include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstr.h" @@ -69,6 +71,7 @@ STATISTIC(NumT2BrShrunk, "Number of Thumb2 immediate branches shrunk"); STATISTIC(NumCBZ, "Number of CBZ / CBNZ formed"); STATISTIC(NumJTMoved, "Number of jump table destination blocks moved"); STATISTIC(NumJTInserted, "Number of jump table intermediate blocks inserted"); +STATISTIC(NumLEInserted, "Number of LE backwards branches inserted"); static cl::opt AdjustJumpTableBlocks("arm-adjust-jump-tables", cl::Hidden, cl::init(true), @@ -212,6 +215,7 @@ namespace { const ARMBaseInstrInfo *TII; const ARMSubtarget *STI; ARMFunctionInfo *AFI; + MachineDominatorTree *DT = nullptr; bool isThumb; bool isThumb1; bool isThumb2; @@ -224,6 +228,11 @@ namespace { bool runOnMachineFunction(MachineFunction &MF) override; + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired(); + MachineFunctionPass::getAnalysisUsage(AU); + } + MachineFunctionProperties getRequiredProperties() const override { return MachineFunctionProperties().set( MachineFunctionProperties::Property::NoVRegs); @@ -238,7 +247,7 @@ namespace { void doInitialJumpTablePlacement(std::vector &CPEMIs); bool BBHasFallthrough(MachineBasicBlock *MBB); CPEntry *findConstPoolEntry(unsigned CPI, const MachineInstr *CPEMI); - unsigned getCPELogAlign(const MachineInstr *CPEMI); + Align getCPEAlign(const MachineInstr *CPEMI); void scanFunctionJumpTables(); void initializeFunctionInfo(const std::vector &CPEMIs); MachineBasicBlock *splitBlockBeforeInstr(MachineInstr *MI); @@ -327,8 +336,7 @@ LLVM_DUMP_METHOD void ARMConstantIslands::dumpBBs() { const BasicBlockInfo &BBI = BBInfo[J]; dbgs() << format("%08x %bb.%u\t", BBI.Offset, J) << " kb=" << unsigned(BBI.KnownBits) - << " ua=" << unsigned(BBI.Unalign) - << " pa=" << unsigned(BBI.PostAlign) + << " ua=" << unsigned(BBI.Unalign) << " pa=" << Log2(BBI.PostAlign) << format(" size=%#x\n", BBInfo[J].Size); } }); @@ -349,6 +357,7 @@ bool ARMConstantIslands::runOnMachineFunction(MachineFunction &mf) { isPositionIndependentOrROPI = STI->getTargetLowering()->isPositionIndependent() || STI->isROPI(); AFI = MF->getInfo(); + DT = &getAnalysis(); isThumb = AFI->isThumbFunction(); isThumb1 = AFI->isThumb1OnlyFunction(); @@ -357,9 +366,6 @@ bool ARMConstantIslands::runOnMachineFunction(MachineFunction &mf) { HasFarJump = false; bool GenerateTBB = isThumb2 || (isThumb1 && SynthesizeThumb1TBB); - // This pass invalidates liveness information when it splits basic blocks. 
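The ARMCodeGenPrepare::isSupportedValue rewrite above replaces a chain of isa<> checks with one switch over the opcode, keeping the rule that anything which may generate sign bits (ashr, sdiv, srem, sext) is rejected. A compressed standalone sketch of that classification style, with the opcode set and rules heavily simplified:

  #include <cassert>

  enum class OpKind { Add, AShr, SDiv, Load, Store, Branch, ICmp, Call, Other };

  // Operations that can set high bits and so defeat zero-extension-based
  // promotion.
  static bool mayGenerateSignBitsSketch(OpKind K) {
    switch (K) {
    case OpKind::AShr:
    case OpKind::SDiv:
      return true;
    default:
      return false;
    }
  }

  static bool isSupportedValueSketch(OpKind K) {
    switch (K) {
    case OpKind::Store:
    case OpKind::Branch:
      return true;                      // sinks and control flow: always fine
    case OpKind::Load:
    case OpKind::ICmp:
    case OpKind::Call:
      return true;                      // fine here; real code also checks types
    default:
      return !mayGenerateSignBitsSketch(K);
    }
  }

  int main() {
    assert(isSupportedValueSketch(OpKind::Store));
    assert(!isSupportedValueSketch(OpKind::AShr));
    return 0;
  }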
- MF->getRegInfo().invalidateLiveness(); - // Renumber all of the machine basic blocks in the function, guaranteeing that // the numbers agree with the position of the block in the function. MF->RenumberBlocks(); @@ -398,7 +404,7 @@ bool ARMConstantIslands::runOnMachineFunction(MachineFunction &mf) { // Functions with jump tables need an alignment of 4 because they use the ADR // instruction, which aligns the PC to 4 bytes before adding an offset. if (!T2JumpTables.empty()) - MF->ensureAlignment(2); + MF->ensureAlignment(Align(4)); /// Remove dead constant pool entries. MadeChange |= removeUnusedCPEntries(); @@ -487,8 +493,9 @@ ARMConstantIslands::doInitialConstPlacement(std::vector &CPEMIs) MachineBasicBlock *BB = MF->CreateMachineBasicBlock(); MF->push_back(BB); - // MachineConstantPool measures alignment in bytes. We measure in log2(bytes). - unsigned MaxAlign = Log2_32(MCP->getConstantPoolAlignment()); + // MachineConstantPool measures alignment in bytes. + const Align MaxAlign(MCP->getConstantPoolAlignment()); + const unsigned MaxLogAlign = Log2(MaxAlign); // Mark the basic block as required by the const-pool. BB->setAlignment(MaxAlign); @@ -501,7 +508,8 @@ ARMConstantIslands::doInitialConstPlacement(std::vector &CPEMIs) // alignment of all entries as long as BB is sufficiently aligned. Keep // track of the insertion point for each alignment. We are going to bucket // sort the entries as they are created. - SmallVector InsPoint(MaxAlign + 1, BB->end()); + SmallVector InsPoint(MaxLogAlign + 1, + BB->end()); // Add all of the constants from the constant pool to the end block, use an // identity mapping of CPI's to CPE's. @@ -526,7 +534,7 @@ ARMConstantIslands::doInitialConstPlacement(std::vector &CPEMIs) // Ensure that future entries with higher alignment get inserted before // CPEMI. This is bucket sort with iterators. - for (unsigned a = LogAlign + 1; a <= MaxAlign; ++a) + for (unsigned a = LogAlign + 1; a <= MaxLogAlign; ++a) if (InsPoint[a] == InsAt) InsPoint[a] = CPEMI; @@ -640,29 +648,27 @@ ARMConstantIslands::findConstPoolEntry(unsigned CPI, return nullptr; } -/// getCPELogAlign - Returns the required alignment of the constant pool entry -/// represented by CPEMI. Alignment is measured in log2(bytes) units. -unsigned ARMConstantIslands::getCPELogAlign(const MachineInstr *CPEMI) { +/// getCPEAlign - Returns the required alignment of the constant pool entry +/// represented by CPEMI. +Align ARMConstantIslands::getCPEAlign(const MachineInstr *CPEMI) { switch (CPEMI->getOpcode()) { case ARM::CONSTPOOL_ENTRY: break; case ARM::JUMPTABLE_TBB: - return isThumb1 ? 2 : 0; + return isThumb1 ? Align(4) : Align(1); case ARM::JUMPTABLE_TBH: - return isThumb1 ? 2 : 1; + return isThumb1 ? Align(4) : Align(2); case ARM::JUMPTABLE_INSTS: - return 1; + return Align(2); case ARM::JUMPTABLE_ADDRS: - return 2; + return Align(4); default: llvm_unreachable("unknown constpool entry kind"); } unsigned CPI = getCombinedIndex(CPEMI); assert(CPI < MCP->getConstants().size() && "Invalid constant pool index."); - unsigned Align = MCP->getConstants()[CPI].getAlignment(); - assert(isPowerOf2_32(Align) && "Invalid CPE alignment"); - return Log2_32(Align); + return Align(MCP->getConstants()[CPI].getAlignment()); } /// scanFunctionJumpTables - Do a scan of the function, building up @@ -687,7 +693,7 @@ initializeFunctionInfo(const std::vector &CPEMIs) { BBInfoVector &BBInfo = BBUtils->getBBInfo(); // The known bits of the entry block offset are determined by the function // alignment. 
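doInitialConstPlacement above keeps one insertion iterator per log2(alignment) so that higher-aligned constant-pool entries always land in front of lower-aligned ones ("bucket sort with iterators"). A self-contained sketch of that placement scheme over a plain std::list:

  #include <cassert>
  #include <list>
  #include <vector>

  struct Entry {
    unsigned LogAlign;   // log2 of the required alignment
    int Value;
  };

  // Append entries so the final list is ordered by descending alignment, by
  // keeping one insertion point per alignment bucket.
  static std::list<Entry> placeByAlignment(const std::vector<Entry> &Input,
                                           unsigned MaxLogAlign) {
    std::list<Entry> Block;
    std::vector<std::list<Entry>::iterator> InsPoint(MaxLogAlign + 1, Block.end());
    for (const Entry &E : Input) {
      auto At = InsPoint[E.LogAlign];
      auto It = Block.insert(At, E);
      // Anything that later needs a higher alignment must go before this entry.
      for (unsigned A = E.LogAlign + 1; A <= MaxLogAlign; ++A)
        if (InsPoint[A] == At)
          InsPoint[A] = It;
    }
    return Block;
  }

  int main() {
    std::list<Entry> L = placeByAlignment({{0, 1}, {2, 2}, {1, 3}, {2, 4}}, 2);
    // 4-byte entries (log2 == 2) come first, then 2-byte, then 1-byte.
    unsigned Prev = ~0u;
    for (const Entry &E : L) {
      assert(E.LogAlign <= Prev);
      Prev = E.LogAlign;
    }
    return 0;
  }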
- BBInfo.front().KnownBits = MF->getAlignment(); + BBInfo.front().KnownBits = Log2(MF->getAlignment()); // Compute block offsets and known bits. BBUtils->adjustBBOffsetsAfter(&MF->front()); @@ -824,11 +830,6 @@ initializeFunctionInfo(const std::vector &CPEMIs) { Scale = 2; // +-(offset_8*2) NegOk = true; break; - - case ARM::tLDRHi: - Bits = 5; - Scale = 2; // +(offset_5*2) - break; } // Remember that this is a user of a CP entry. @@ -885,6 +886,13 @@ void ARMConstantIslands::updateForInsertedWaterBlock(MachineBasicBlock *NewBB) { MachineBasicBlock *ARMConstantIslands::splitBlockBeforeInstr(MachineInstr *MI) { MachineBasicBlock *OrigBB = MI->getParent(); + // Collect liveness information at MI. + LivePhysRegs LRs(*MF->getSubtarget().getRegisterInfo()); + LRs.addLiveOuts(*OrigBB); + auto LivenessEnd = ++MachineBasicBlock::iterator(MI).getReverse(); + for (MachineInstr &LiveMI : make_range(OrigBB->rbegin(), LivenessEnd)) + LRs.stepBackward(LiveMI); + // Create a new MBB for the code after the OrigBB. MachineBasicBlock *NewBB = MF->CreateMachineBasicBlock(OrigBB->getBasicBlock()); @@ -913,6 +921,12 @@ MachineBasicBlock *ARMConstantIslands::splitBlockBeforeInstr(MachineInstr *MI) { // OrigBB branches to NewBB. OrigBB->addSuccessor(NewBB); + // Update live-in information in the new block. + MachineRegisterInfo &MRI = MF->getRegInfo(); + for (MCPhysReg L : LRs) + if (!MRI.isReserved(L)) + NewBB->addLiveIn(L); + // Update internal data structures to account for the newly inserted MBB. // This is almost the same as updateForInsertedWaterBlock, except that // the Water goes after OrigBB, not NewBB. @@ -1007,13 +1021,13 @@ bool ARMConstantIslands::isWaterInRange(unsigned UserOffset, MachineBasicBlock* Water, CPUser &U, unsigned &Growth) { BBInfoVector &BBInfo = BBUtils->getBBInfo(); - unsigned CPELogAlign = getCPELogAlign(U.CPEMI); - unsigned CPEOffset = BBInfo[Water->getNumber()].postOffset(CPELogAlign); - unsigned NextBlockOffset, NextBlockAlignment; + const Align CPEAlign = getCPEAlign(U.CPEMI); + const unsigned CPEOffset = BBInfo[Water->getNumber()].postOffset(CPEAlign); + unsigned NextBlockOffset; + Align NextBlockAlignment; MachineFunction::const_iterator NextBlock = Water->getIterator(); if (++NextBlock == MF->end()) { NextBlockOffset = BBInfo[Water->getNumber()].postOffset(); - NextBlockAlignment = 0; } else { NextBlockOffset = BBInfo[NextBlock->getNumber()].Offset; NextBlockAlignment = NextBlock->getAlignment(); @@ -1028,13 +1042,13 @@ bool ARMConstantIslands::isWaterInRange(unsigned UserOffset, Growth = CPEEnd - NextBlockOffset; // Compute the padding that would go at the end of the CPE to align the next // block. - Growth += OffsetToAlignment(CPEEnd, 1ULL << NextBlockAlignment); + Growth += offsetToAlignment(CPEEnd, NextBlockAlignment); // If the CPE is to be inserted before the instruction, that will raise // the offset of the instruction. Also account for unknown alignment padding // in blocks between CPE and the user. if (CPEOffset < UserOffset) - UserOffset += Growth + UnknownPadding(MF->getAlignment(), CPELogAlign); + UserOffset += Growth + UnknownPadding(MF->getAlignment(), Log2(CPEAlign)); } else // CPE fits in existing padding. Growth = 0; @@ -1200,8 +1214,8 @@ bool ARMConstantIslands::findAvailableWater(CPUser &U, unsigned UserOffset, // inserting islands between BB0 and BB1 makes other accesses out of range. 
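splitBlockBeforeInstr above now preserves liveness instead of discarding it: starting from the original block's live-outs, it steps backwards over the instructions that will move into the new block, and whatever is still live at the split point becomes the new block's live-in set. A minimal sketch of that backwards walk, using sets of plain register ids (the real code uses LivePhysRegs and also skips reserved registers):

  #include <cassert>
  #include <set>
  #include <vector>

  struct InstrSketch {
    std::vector<unsigned> Defs; // registers written
    std::vector<unsigned> Uses; // registers read
  };

  static std::set<unsigned>
  liveInsAtSplit(const std::vector<InstrSketch> &Block, unsigned SplitIndex,
                 std::set<unsigned> LiveOuts) {
    // Walk backwards from the end of the block down to the split point, i.e.
    // over the instructions that move into the new block.
    for (size_t I = Block.size(); I > SplitIndex; --I) {
      const InstrSketch &MI = Block[I - 1];
      for (unsigned D : MI.Defs)
        LiveOuts.erase(D);   // a def kills liveness above the instruction
      for (unsigned U : MI.Uses)
        LiveOuts.insert(U);  // a use makes the register live above it
    }
    return LiveOuts;         // live at the split point == new block's live-ins
  }

  int main() {
    // r1 = ...; r2 = r1 + r3   -- split before the second instruction.
    std::vector<InstrSketch> Block = {{{1}, {}}, {{2}, {1, 3}}};
    std::set<unsigned> LiveIns = liveInsAtSplit(Block, 1, {2});
    assert(LiveIns == std::set<unsigned>({1, 3}));
    return 0;
  }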
MachineBasicBlock *UserBB = U.MI->getParent(); BBInfoVector &BBInfo = BBUtils->getBBInfo(); - unsigned MinNoSplitDisp = - BBInfo[UserBB->getNumber()].postOffset(getCPELogAlign(U.CPEMI)); + const Align CPEAlign = getCPEAlign(U.CPEMI); + unsigned MinNoSplitDisp = BBInfo[UserBB->getNumber()].postOffset(CPEAlign); if (CloserWater && MinNoSplitDisp > U.getMaxDisp() / 2) return false; for (water_iterator IP = std::prev(WaterList.end()), B = WaterList.begin();; @@ -1254,7 +1268,7 @@ void ARMConstantIslands::createNewWater(unsigned CPUserIndex, CPUser &U = CPUsers[CPUserIndex]; MachineInstr *UserMI = U.MI; MachineInstr *CPEMI = U.CPEMI; - unsigned CPELogAlign = getCPELogAlign(CPEMI); + const Align CPEAlign = getCPEAlign(CPEMI); MachineBasicBlock *UserMBB = UserMI->getParent(); BBInfoVector &BBInfo = BBUtils->getBBInfo(); const BasicBlockInfo &UserBBI = BBInfo[UserMBB->getNumber()]; @@ -1267,7 +1281,7 @@ void ARMConstantIslands::createNewWater(unsigned CPUserIndex, // Size of branch to insert. unsigned Delta = isThumb1 ? 2 : 4; // Compute the offset where the CPE will begin. - unsigned CPEOffset = UserBBI.postOffset(CPELogAlign) + Delta; + unsigned CPEOffset = UserBBI.postOffset(CPEAlign) + Delta; if (isOffsetInRange(UserOffset, CPEOffset, U)) { LLVM_DEBUG(dbgs() << "Split at end of " << printMBBReference(*UserMBB) @@ -1308,11 +1322,11 @@ void ARMConstantIslands::createNewWater(unsigned CPUserIndex, // Try to split the block so it's fully aligned. Compute the latest split // point where we can add a 4-byte branch instruction, and then align to - // LogAlign which is the largest possible alignment in the function. - unsigned LogAlign = MF->getAlignment(); - assert(LogAlign >= CPELogAlign && "Over-aligned constant pool entry"); + // Align which is the largest possible alignment in the function. + const Align Align = MF->getAlignment(); + assert(Align >= CPEAlign && "Over-aligned constant pool entry"); unsigned KnownBits = UserBBI.internalKnownBits(); - unsigned UPad = UnknownPadding(LogAlign, KnownBits); + unsigned UPad = UnknownPadding(Align, KnownBits); unsigned BaseInsertOffset = UserOffset + U.getMaxDisp() - UPad; LLVM_DEBUG(dbgs() << format("Split in middle of big block before %#x", BaseInsertOffset)); @@ -1323,7 +1337,7 @@ void ARMConstantIslands::createNewWater(unsigned CPUserIndex, BaseInsertOffset -= 4; LLVM_DEBUG(dbgs() << format(", adjusted to %#x", BaseInsertOffset) - << " la=" << LogAlign << " kb=" << KnownBits + << " la=" << Log2(Align) << " kb=" << KnownBits << " up=" << UPad << '\n'); // This could point off the end of the block if we've already got constant @@ -1337,6 +1351,28 @@ void ARMConstantIslands::createNewWater(unsigned CPUserIndex, BaseInsertOffset = std::max(UserBBI.postOffset() - UPad - 8, UserOffset + TII->getInstSizeInBytes(*UserMI) + 1); + // If the CP reference (i.e., UserOffset) is in the first four instructions + // after IT, this recalculated BaseInsertOffset could be in the middle of + // an IT block. If it is, change the BaseInsertOffset to just after the + // IT block. This still keeps the CP entry in range because of the + // following reasons. + // 1. The initial BaseInsertOffset calculated is (UserOffset + + // U.getMaxDisp() - UPad). + // 2. An IT block is only at most 4 instructions plus the "it" itself (18 + // bytes). + // 3. All the relevant instructions support a much larger maximum + // displacement.
+ MachineBasicBlock::iterator I = UserMI; + ++I; + for (unsigned Offset = UserOffset + TII->getInstSizeInBytes(*UserMI), + PredReg = 0; + I->getOpcode() != ARM::t2IT && + getITInstrPredicate(*I, PredReg) != ARMCC::AL; + Offset += TII->getInstSizeInBytes(*I), I = std::next(I)) { + BaseInsertOffset = + std::max(BaseInsertOffset, Offset + TII->getInstSizeInBytes(*I) + 1); + assert(I != UserMBB->end() && "Fell off end of block"); + } LLVM_DEBUG(dbgs() << format("Move inside block: %#x\n", BaseInsertOffset)); } unsigned EndInsertOffset = BaseInsertOffset + 4 + UPad + @@ -1354,8 +1390,8 @@ void ARMConstantIslands::createNewWater(unsigned CPUserIndex, CPUser &U = CPUsers[CPUIndex]; if (!isOffsetInRange(Offset, EndInsertOffset, U)) { // Shift intertion point by one unit of alignment so it is within reach. - BaseInsertOffset -= 1u << LogAlign; - EndInsertOffset -= 1u << LogAlign; + BaseInsertOffset -= Align.value(); + EndInsertOffset -= Align.value(); } // This is overly conservative, as we don't account for CPEMIs being // reused within the block, but it doesn't matter much. Also assume CPEs @@ -1397,9 +1433,10 @@ void ARMConstantIslands::createNewWater(unsigned CPUserIndex, } // We really must not split an IT block. - LLVM_DEBUG(unsigned PredReg; assert( - !isThumb || getITInstrPredicate(*MI, PredReg) == ARMCC::AL)); - +#ifndef NDEBUG + unsigned PredReg; + assert(!isThumb || getITInstrPredicate(*MI, PredReg) == ARMCC::AL); +#endif NewMBB = splitBlockBeforeInstr(&*MI); } @@ -1464,9 +1501,9 @@ bool ARMConstantIslands::handleConstantPoolUser(unsigned CPUserIndex, // Always align the new block because CP entries can be smaller than 4 // bytes. Be careful not to decrease the existing alignment, e.g. NewMBB may // be an already aligned constant pool block. - const unsigned Align = isThumb ? 1 : 2; - if (NewMBB->getAlignment() < Align) - NewMBB->setAlignment(Align); + const Align Alignment = isThumb ? Align(2) : Align(4); + if (NewMBB->getAlignment() < Alignment) + NewMBB->setAlignment(Alignment); // Remove the original WaterList entry; we want subsequent insertions in // this vicinity to go after the one we're about to insert. This @@ -1495,7 +1532,7 @@ bool ARMConstantIslands::handleConstantPoolUser(unsigned CPUserIndex, decrementCPEReferenceCount(CPI, CPEMI); // Mark the basic block as aligned as required by the const-pool entry. - NewIsland->setAlignment(getCPELogAlign(U.CPEMI)); + NewIsland->setAlignment(getCPEAlign(U.CPEMI)); // Increase the size of the island block to account for the new entry. BBUtils->adjustBBSize(NewIsland, Size); @@ -1529,10 +1566,11 @@ void ARMConstantIslands::removeDeadCPEMI(MachineInstr *CPEMI) { BBInfo[CPEBB->getNumber()].Size = 0; // This block no longer needs to be aligned. - CPEBB->setAlignment(0); - } else + CPEBB->setAlignment(Align::None()); + } else { // Entries are sorted by descending alignment, so realign from the front. - CPEBB->setAlignment(getCPELogAlign(&*CPEBB->begin())); + CPEBB->setAlignment(getCPEAlign(&*CPEBB->begin())); + } BBUtils->adjustBBOffsetsAfter(CPEBB); // An island has only one predecessor BB and one successor BB. 
Check if @@ -1620,7 +1658,7 @@ ARMConstantIslands::fixupConditionalBr(ImmBranch &Br) { // L2: ARMCC::CondCodes CC = (ARMCC::CondCodes)MI->getOperand(1).getImm(); CC = ARMCC::getOppositeCondition(CC); - unsigned CCReg = MI->getOperand(2).getReg(); + Register CCReg = MI->getOperand(2).getReg(); // If the branch is at the end of its MBB and that has a fall-through block, // direct the updated conditional branch to the fall-through block. Otherwise, @@ -1778,16 +1816,10 @@ bool ARMConstantIslands::optimizeThumb2Instructions() { return MadeChange; } + bool ARMConstantIslands::optimizeThumb2Branches() { - bool MadeChange = false; - // The order in which branches appear in ImmBranches is approximately their - // order within the function body. By visiting later branches first, we reduce - // the distance between earlier forward branches and their targets, making it - // more likely that the cbn?z optimization, which can only apply to forward - // branches, will succeed. - for (unsigned i = ImmBranches.size(); i != 0; --i) { - ImmBranch &Br = ImmBranches[i-1]; + auto TryShrinkBranch = [this](ImmBranch &Br) { unsigned Opcode = Br.MI->getOpcode(); unsigned NewOpc = 0; unsigned Scale = 1; @@ -1815,47 +1847,115 @@ bool ARMConstantIslands::optimizeThumb2Branches() { BBUtils->adjustBBSize(MBB, -2); BBUtils->adjustBBOffsetsAfter(MBB); ++NumT2BrShrunk; - MadeChange = true; + return true; } } + return false; + }; - Opcode = Br.MI->getOpcode(); - if (Opcode != ARM::tBcc) - continue; + struct ImmCompare { + MachineInstr* MI = nullptr; + unsigned NewOpc = 0; + }; + + auto FindCmpForCBZ = [this](ImmBranch &Br, ImmCompare &ImmCmp, + MachineBasicBlock *DestBB) { + ImmCmp.MI = nullptr; + ImmCmp.NewOpc = 0; // If the conditional branch doesn't kill CPSR, then CPSR can be liveout // so this transformation is not safe. if (!Br.MI->killsRegister(ARM::CPSR)) - continue; + return false; - NewOpc = 0; unsigned PredReg = 0; + unsigned NewOpc = 0; ARMCC::CondCodes Pred = getInstrPredicate(*Br.MI, PredReg); if (Pred == ARMCC::EQ) NewOpc = ARM::tCBZ; else if (Pred == ARMCC::NE) NewOpc = ARM::tCBNZ; - if (!NewOpc) - continue; - MachineBasicBlock *DestBB = Br.MI->getOperand(0).getMBB(); + else + return false; + // Check if the distance is within 126. Subtract starting offset by 2 // because the cmp will be eliminated. unsigned BrOffset = BBUtils->getOffsetOf(Br.MI) + 4 - 2; BBInfoVector &BBInfo = BBUtils->getBBInfo(); unsigned DestOffset = BBInfo[DestBB->getNumber()].Offset; if (BrOffset >= DestOffset || (DestOffset - BrOffset) > 126) - continue; + return false; // Search backwards to find a tCMPi8 auto *TRI = STI->getRegisterInfo(); MachineInstr *CmpMI = findCMPToFoldIntoCBZ(Br.MI, TRI); if (!CmpMI || CmpMI->getOpcode() != ARM::tCMPi8) + return false; + + ImmCmp.MI = CmpMI; + ImmCmp.NewOpc = NewOpc; + return true; + }; + + auto TryConvertToLE = [this](ImmBranch &Br, ImmCompare &Cmp) { + if (Br.MI->getOpcode() != ARM::t2Bcc || !STI->hasLOB() || + STI->hasMinSize()) + return false; + + MachineBasicBlock *MBB = Br.MI->getParent(); + MachineBasicBlock *DestBB = Br.MI->getOperand(0).getMBB(); + if (BBUtils->getOffsetOf(MBB) < BBUtils->getOffsetOf(DestBB) || + !BBUtils->isBBInRange(Br.MI, DestBB, 4094)) + return false; + + if (!DT->dominates(DestBB, MBB)) + return false; + + // We queried for the CBN?Z opcode based upon the 'ExitBB', the opposite + // target of Br. So now we need to reverse the condition. + Cmp.NewOpc = Cmp.NewOpc == ARM::tCBZ ? 
ARM::tCBNZ : ARM::tCBZ; + + MachineInstrBuilder MIB = BuildMI(*MBB, Br.MI, Br.MI->getDebugLoc(), + TII->get(ARM::t2LE)); + MIB.add(Br.MI->getOperand(0)); + Br.MI->eraseFromParent(); + Br.MI = MIB; + ++NumLEInserted; + return true; + }; + + bool MadeChange = false; + + // The order in which branches appear in ImmBranches is approximately their + // order within the function body. By visiting later branches first, we reduce + // the distance between earlier forward branches and their targets, making it + // more likely that the cbn?z optimization, which can only apply to forward + // branches, will succeed. + for (ImmBranch &Br : reverse(ImmBranches)) { + MachineBasicBlock *DestBB = Br.MI->getOperand(0).getMBB(); + MachineBasicBlock *MBB = Br.MI->getParent(); + MachineBasicBlock *ExitBB = &MBB->back() == Br.MI ? + MBB->getFallThrough() : + MBB->back().getOperand(0).getMBB(); + + ImmCompare Cmp; + if (FindCmpForCBZ(Br, Cmp, ExitBB) && TryConvertToLE(Br, Cmp)) { + DestBB = ExitBB; + MadeChange = true; + } else { + FindCmpForCBZ(Br, Cmp, DestBB); + MadeChange |= TryShrinkBranch(Br); + } + + unsigned Opcode = Br.MI->getOpcode(); + if ((Opcode != ARM::tBcc && Opcode != ARM::t2LE) || !Cmp.NewOpc) continue; - unsigned Reg = CmpMI->getOperand(0).getReg(); + Register Reg = Cmp.MI->getOperand(0).getReg(); // Check for Kill flags on Reg. If they are present remove them and set kill // on the new CBZ. + auto *TRI = STI->getRegisterInfo(); MachineBasicBlock::iterator KillMI = Br.MI; bool RegKilled = false; do { @@ -1865,19 +1965,32 @@ bool ARMConstantIslands::optimizeThumb2Branches() { RegKilled = true; break; } - } while (KillMI != CmpMI); + } while (KillMI != Cmp.MI); // Create the new CBZ/CBNZ - MachineBasicBlock *MBB = Br.MI->getParent(); - LLVM_DEBUG(dbgs() << "Fold: " << *CmpMI << " and: " << *Br.MI); + LLVM_DEBUG(dbgs() << "Fold: " << *Cmp.MI << " and: " << *Br.MI); MachineInstr *NewBR = - BuildMI(*MBB, Br.MI, Br.MI->getDebugLoc(), TII->get(NewOpc)) + BuildMI(*MBB, Br.MI, Br.MI->getDebugLoc(), TII->get(Cmp.NewOpc)) .addReg(Reg, getKillRegState(RegKilled)) .addMBB(DestBB, Br.MI->getOperand(0).getTargetFlags()); - CmpMI->eraseFromParent(); - Br.MI->eraseFromParent(); - Br.MI = NewBR; + + Cmp.MI->eraseFromParent(); + BBInfoVector &BBInfo = BBUtils->getBBInfo(); BBInfo[MBB->getNumber()].Size -= 2; + + if (Br.MI->getOpcode() == ARM::tBcc) { + Br.MI->eraseFromParent(); + Br.MI = NewBR; + } else if (&MBB->back() != Br.MI) { + // We've generated an LE and already erased the original conditional + // branch. The CBN?Z is now used to branch to the other successor, so an + // unconditional branch terminator is now redundant. + MachineInstr *LastMI = &MBB->back(); + if (LastMI != Br.MI) { + BBInfo[MBB->getNumber()].Size -= LastMI->getDesc().getSize(); + LastMI->eraseFromParent(); + } + } BBUtils->adjustBBOffsetsAfter(MBB); ++NumCBZ; MadeChange = true; @@ -1931,8 +2044,8 @@ bool ARMConstantIslands::preserveBaseRegister(MachineInstr *JumpMI, // of BaseReg, but only if the t2ADDrs can be removed. // + Some instruction other than t2ADDrs computing the entry. Not seen in // the wild, but we should be careful. - unsigned EntryReg = JumpMI->getOperand(0).getReg(); - unsigned BaseReg = LEAMI->getOperand(0).getReg(); + Register EntryReg = JumpMI->getOperand(0).getReg(); + Register BaseReg = LEAMI->getOperand(0).getReg(); CanDeleteLEA = true; BaseRegKill = false; @@ -2009,7 +2122,7 @@ static void RemoveDeadAddBetweenLEAAndJT(MachineInstr *LEAMI, // but the JT now uses PC. 
Finds the last ADD (if any) that def's EntryReg // and is not clobbered / used. MachineInstr *RemovableAdd = nullptr; - unsigned EntryReg = JumpMI->getOperand(0).getReg(); + Register EntryReg = JumpMI->getOperand(0).getReg(); // Find the last ADD to set EntryReg MachineBasicBlock::iterator I(LEAMI); @@ -2106,7 +2219,7 @@ bool ARMConstantIslands::optimizeThumb2JumpTables() { // %idx = tLSLri %idx, 2 // %base = tLEApcrelJT // %t = tLDRr %base, %idx - unsigned BaseReg = User.MI->getOperand(0).getReg(); + Register BaseReg = User.MI->getOperand(0).getReg(); if (User.MI->getIterator() == User.MI->getParent()->begin()) continue; @@ -2116,7 +2229,7 @@ bool ARMConstantIslands::optimizeThumb2JumpTables() { !Shift->getOperand(2).isKill()) continue; IdxReg = Shift->getOperand(2).getReg(); - unsigned ShiftedIdxReg = Shift->getOperand(0).getReg(); + Register ShiftedIdxReg = Shift->getOperand(0).getReg(); // It's important that IdxReg is live until the actual TBB/TBH. Most of // the range is checked later, but the LEA might still clobber it and not @@ -2313,6 +2426,10 @@ adjustJTTargetBlockForward(MachineBasicBlock *BB, MachineBasicBlock *JTBB) { MachineFunction::iterator MBBI = ++JTBB->getIterator(); MF->insert(MBBI, NewBB); + // Copy live-in information to new block. + for (const MachineBasicBlock::RegisterMaskPair &RegMaskPair : BB->liveins()) + NewBB->addLiveIn(RegMaskPair); + // Add an unconditional branch from NewBB to BB. // There doesn't seem to be meaningful DebugInfo available; this doesn't // correspond directly to anything in the source. diff --git a/lib/Target/ARM/ARMConstantPoolValue.cpp b/lib/Target/ARM/ARMConstantPoolValue.cpp index 3bdb0e1ef62d..72c95f441265 100644 --- a/lib/Target/ARM/ARMConstantPoolValue.cpp +++ b/lib/Target/ARM/ARMConstantPoolValue.cpp @@ -17,6 +17,7 @@ #include "llvm/IR/Constant.h" #include "llvm/IR/Constants.h" #include "llvm/IR/GlobalValue.h" +#include "llvm/IR/GlobalVariable.h" #include "llvm/IR/Type.h" #include "llvm/Support/Casting.h" #include "llvm/Support/Compiler.h" diff --git a/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/lib/Target/ARM/ARMExpandPseudoInsts.cpp index b32ba3eeea18..563fdda56104 100644 --- a/lib/Target/ARM/ARMExpandPseudoInsts.cpp +++ b/lib/Target/ARM/ARMExpandPseudoInsts.cpp @@ -481,7 +481,7 @@ void ARMExpandPseudo::ExpandVLD(MachineBasicBlock::iterator &MBBI) { unsigned OpIdx = 0; bool DstIsDead = MI.getOperand(OpIdx).isDead(); - unsigned DstReg = MI.getOperand(OpIdx++).getReg(); + Register DstReg = MI.getOperand(OpIdx++).getReg(); if(TableEntry->RealOpc == ARM::VLD2DUPd8x2 || TableEntry->RealOpc == ARM::VLD2DUPd16x2 || TableEntry->RealOpc == ARM::VLD2DUPd32x2) { @@ -492,7 +492,7 @@ void ARMExpandPseudo::ExpandVLD(MachineBasicBlock::iterator &MBBI) { assert(RegSpc == OddDblSpc && "Unexpected spacing!"); SubRegIndex = ARM::dsub_1; } - unsigned SubReg = TRI->getSubReg(DstReg, SubRegIndex); + Register SubReg = TRI->getSubReg(DstReg, SubRegIndex); unsigned DstRegPair = TRI->getMatchingSuperReg(SubReg, ARM::dsub_0, &ARM::DPairSpcRegClass); MIB.addReg(DstRegPair, RegState::Define | getDeadRegState(DstIsDead)); @@ -624,7 +624,7 @@ void ARMExpandPseudo::ExpandVST(MachineBasicBlock::iterator &MBBI) { bool SrcIsKill = MI.getOperand(OpIdx).isKill(); bool SrcIsUndef = MI.getOperand(OpIdx).isUndef(); - unsigned SrcReg = MI.getOperand(OpIdx++).getReg(); + Register SrcReg = MI.getOperand(OpIdx++).getReg(); unsigned D0, D1, D2, D3; GetDSubRegs(SrcReg, RegSpc, TRI, D0, D1, D2, D3); MIB.addReg(D0, getUndefRegState(SrcIsUndef)); @@ -760,7 +760,7 @@ void 
ARMExpandPseudo::ExpandVTBL(MachineBasicBlock::iterator &MBBI, } bool SrcIsKill = MI.getOperand(OpIdx).isKill(); - unsigned SrcReg = MI.getOperand(OpIdx++).getReg(); + Register SrcReg = MI.getOperand(OpIdx++).getReg(); unsigned D0, D1, D2, D3; GetDSubRegs(SrcReg, SingleSpc, TRI, D0, D1, D2, D3); MIB.addReg(D0); @@ -789,6 +789,7 @@ static bool IsAnAddressOperand(const MachineOperand &MO) { case MachineOperand::MO_Immediate: case MachineOperand::MO_CImmediate: case MachineOperand::MO_FPImmediate: + case MachineOperand::MO_ShuffleMask: return false; case MachineOperand::MO_MachineBasicBlock: return true; @@ -828,7 +829,7 @@ void ARMExpandPseudo::ExpandMOV32BitImm(MachineBasicBlock &MBB, unsigned Opcode = MI.getOpcode(); unsigned PredReg = 0; ARMCC::CondCodes Pred = getInstrPredicate(MI, PredReg); - unsigned DstReg = MI.getOperand(0).getReg(); + Register DstReg = MI.getOperand(0).getReg(); bool DstIsDead = MI.getOperand(0).isDead(); bool isCC = Opcode == ARM::MOVCCi32imm || Opcode == ARM::t2MOVCCi32imm; const MachineOperand &MO = MI.getOperand(isCC ? 2 : 1); @@ -932,13 +933,13 @@ bool ARMExpandPseudo::ExpandCMP_SWAP(MachineBasicBlock &MBB, MachineInstr &MI = *MBBI; DebugLoc DL = MI.getDebugLoc(); const MachineOperand &Dest = MI.getOperand(0); - unsigned TempReg = MI.getOperand(1).getReg(); + Register TempReg = MI.getOperand(1).getReg(); // Duplicating undef operands into 2 instructions does not guarantee the same // value on both; However undef should be replaced by xzr anyway. assert(!MI.getOperand(2).isUndef() && "cannot handle undef"); - unsigned AddrReg = MI.getOperand(2).getReg(); - unsigned DesiredReg = MI.getOperand(3).getReg(); - unsigned NewReg = MI.getOperand(4).getReg(); + Register AddrReg = MI.getOperand(2).getReg(); + Register DesiredReg = MI.getOperand(3).getReg(); + Register NewReg = MI.getOperand(4).getReg(); MachineFunction *MF = MBB.getParent(); auto LoadCmpBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock()); @@ -1035,8 +1036,8 @@ static void addExclusiveRegPair(MachineInstrBuilder &MIB, MachineOperand &Reg, unsigned Flags, bool IsThumb, const TargetRegisterInfo *TRI) { if (IsThumb) { - unsigned RegLo = TRI->getSubReg(Reg.getReg(), ARM::gsub_0); - unsigned RegHi = TRI->getSubReg(Reg.getReg(), ARM::gsub_1); + Register RegLo = TRI->getSubReg(Reg.getReg(), ARM::gsub_0); + Register RegHi = TRI->getSubReg(Reg.getReg(), ARM::gsub_1); MIB.addReg(RegLo, Flags); MIB.addReg(RegHi, Flags); } else @@ -1051,19 +1052,19 @@ bool ARMExpandPseudo::ExpandCMP_SWAP_64(MachineBasicBlock &MBB, MachineInstr &MI = *MBBI; DebugLoc DL = MI.getDebugLoc(); MachineOperand &Dest = MI.getOperand(0); - unsigned TempReg = MI.getOperand(1).getReg(); + Register TempReg = MI.getOperand(1).getReg(); // Duplicating undef operands into 2 instructions does not guarantee the same // value on both; However undef should be replaced by xzr anyway. 
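Aside: the recurring change in these hunks from "unsigned Foo = ...getReg()" to "Register Foo = ...getReg()" is mechanical; llvm::Register wraps the raw register number while still converting back to unsigned, so existing comparisons keep working. A minimal sketch of that idea; ToyRegister is a simplified stand-in, not the real class:

  #include <cassert>

  // Wraps the raw register id (0 means "no register", as in LLVM) but still
  // converts back to unsigned, while adding self-describing queries.
  class ToyRegister {
    unsigned Id = 0;
  public:
    ToyRegister() = default;
    ToyRegister(unsigned Id) : Id(Id) {}
    operator unsigned() const { return Id; }
    bool isValid() const { return Id != 0; }
  };

  unsigned getRegSomehow() { return 42; } // placeholder for getReg()

  int main() {
    ToyRegister Reg = getRegSomehow(); // drop-in for: unsigned Reg = ...
    assert(Reg == 42u);                // implicit conversion keeps old checks working
    assert(Reg.isValid());             // the style used later in the ARMFastISel hunks
    ToyRegister None;
    assert(!None.isValid());
  }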
assert(!MI.getOperand(2).isUndef() && "cannot handle undef"); - unsigned AddrReg = MI.getOperand(2).getReg(); - unsigned DesiredReg = MI.getOperand(3).getReg(); + Register AddrReg = MI.getOperand(2).getReg(); + Register DesiredReg = MI.getOperand(3).getReg(); MachineOperand New = MI.getOperand(4); New.setIsKill(false); - unsigned DestLo = TRI->getSubReg(Dest.getReg(), ARM::gsub_0); - unsigned DestHi = TRI->getSubReg(Dest.getReg(), ARM::gsub_1); - unsigned DesiredLo = TRI->getSubReg(DesiredReg, ARM::gsub_0); - unsigned DesiredHi = TRI->getSubReg(DesiredReg, ARM::gsub_1); + Register DestLo = TRI->getSubReg(Dest.getReg(), ARM::gsub_0); + Register DestHi = TRI->getSubReg(Dest.getReg(), ARM::gsub_1); + Register DesiredLo = TRI->getSubReg(DesiredReg, ARM::gsub_0); + Register DesiredHi = TRI->getSubReg(DesiredReg, ARM::gsub_1); MachineFunction *MF = MBB.getParent(); auto LoadCmpBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock()); @@ -1204,8 +1205,11 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB, for (unsigned i = 1, e = MBBI->getNumOperands(); i != e; ++i) NewMI->addOperand(MBBI->getOperand(i)); - // Delete the pseudo instruction TCRETURN. + + // Update call site info and delete the pseudo instruction TCRETURN. + MBB.getParent()->moveCallSiteInfo(&MI, &*NewMI); MBB.erase(MBBI); + MBBI = NewMI; return true; } @@ -1336,7 +1340,7 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB, // for us. Otherwise, expand to nothing. if (RI.hasBasePointer(MF)) { int32_t NumBytes = AFI->getFramePtrSpillOffset(); - unsigned FramePtr = RI.getFrameRegister(MF); + Register FramePtr = RI.getFrameRegister(MF); assert(MF.getSubtarget().getFrameLowering()->hasFP(MF) && "base pointer without frame pointer?"); @@ -1412,7 +1416,7 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB, MachineConstantPoolValue *CPV = ARMConstantPoolSymbol::Create(MF->getFunction().getContext(), "__aeabi_read_tp", PCLabelID, 0); - unsigned Reg = MI.getOperand(0).getReg(); + Register Reg = MI.getOperand(0).getReg(); MIB = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Thumb ? ARM::tLDRpci : ARM::LDRi12), Reg) .addConstantPoolIndex(MCP->getConstantPoolIndex(CPV, 4)); @@ -1435,6 +1439,7 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB, MIB.cloneMemRefs(MI); TransferImpOps(MI, MIB, MIB); + MI.getMF()->moveCallSiteInfo(&MI, &*MIB); MI.eraseFromParent(); return true; } @@ -1442,7 +1447,7 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB, case ARM::t2LDRpci_pic: { unsigned NewLdOpc = (Opcode == ARM::tLDRpci_pic) ? ARM::tLDRpci : ARM::t2LDRpci; - unsigned DstReg = MI.getOperand(0).getReg(); + Register DstReg = MI.getOperand(0).getReg(); bool DstIsDead = MI.getOperand(0).isDead(); MachineInstrBuilder MIB1 = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(NewLdOpc), DstReg) @@ -1464,7 +1469,7 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB, case ARM::LDRLIT_ga_pcrel_ldr: case ARM::tLDRLIT_ga_abs: case ARM::tLDRLIT_ga_pcrel: { - unsigned DstReg = MI.getOperand(0).getReg(); + Register DstReg = MI.getOperand(0).getReg(); bool DstIsDead = MI.getOperand(0).isDead(); const MachineOperand &MO1 = MI.getOperand(1); auto Flags = MO1.getTargetFlags(); @@ -1522,7 +1527,7 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB, case ARM::t2MOV_ga_pcrel: { // Expand into movw + movw. Also "add pc" / ldr [pc] in PIC mode. 
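Note: several of the expansions around here (ExpandMOV32BitImm and the *_ga_pcrel pseudos) materialize a 32-bit value in two halves, roughly movw for bits 15:0 followed by movt for bits 31:16. A plain-integer model of that sequence; the real instructions of course operate on registers under predication:

  #include <cassert>
  #include <cstdint>

  uint32_t movw(uint16_t Lo) { return Lo; }            // writes low half, clears 31:16
  uint32_t movt(uint32_t Reg, uint16_t Hi) {
    return (Reg & 0xFFFFu) | (uint32_t(Hi) << 16);     // writes high half, keeps 15:0
  }

  int main() {
    uint32_t Imm = 0x12345678;
    uint32_t Reg = movw(uint16_t(Imm & 0xFFFF));
    Reg = movt(Reg, uint16_t(Imm >> 16));
    assert(Reg == Imm);
  }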
unsigned LabelId = AFI->createPICLabelUId(); - unsigned DstReg = MI.getOperand(0).getReg(); + Register DstReg = MI.getOperand(0).getReg(); bool DstIsDead = MI.getOperand(0).isDead(); const MachineOperand &MO1 = MI.getOperand(1); const GlobalValue *GV = MO1.getGlobal(); @@ -1586,7 +1591,7 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB, // Grab the Q register destination. bool DstIsDead = MI.getOperand(OpIdx).isDead(); - unsigned DstReg = MI.getOperand(OpIdx++).getReg(); + Register DstReg = MI.getOperand(OpIdx++).getReg(); // Copy the source register. MIB.add(MI.getOperand(OpIdx++)); @@ -1596,8 +1601,8 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB, MIB.add(MI.getOperand(OpIdx++)); // Add the destination operands (D subregs). - unsigned D0 = TRI->getSubReg(DstReg, ARM::dsub_0); - unsigned D1 = TRI->getSubReg(DstReg, ARM::dsub_1); + Register D0 = TRI->getSubReg(DstReg, ARM::dsub_0); + Register D1 = TRI->getSubReg(DstReg, ARM::dsub_1); MIB.addReg(D0, RegState::Define | getDeadRegState(DstIsDead)) .addReg(D1, RegState::Define | getDeadRegState(DstIsDead)); @@ -1617,7 +1622,7 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB, // Grab the Q register source. bool SrcIsKill = MI.getOperand(OpIdx).isKill(); - unsigned SrcReg = MI.getOperand(OpIdx++).getReg(); + Register SrcReg = MI.getOperand(OpIdx++).getReg(); // Copy the destination register. MachineOperand Dst(MI.getOperand(OpIdx++)); @@ -1628,8 +1633,8 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB, MIB.add(MI.getOperand(OpIdx++)); // Add the source operands (D subregs). - unsigned D0 = TRI->getSubReg(SrcReg, ARM::dsub_0); - unsigned D1 = TRI->getSubReg(SrcReg, ARM::dsub_1); + Register D0 = TRI->getSubReg(SrcReg, ARM::dsub_0); + Register D1 = TRI->getSubReg(SrcReg, ARM::dsub_1); MIB.addReg(D0, SrcIsKill ? RegState::Kill : 0) .addReg(D1, SrcIsKill ? 
RegState::Kill : 0); @@ -1915,6 +1920,37 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB, case ARM::CMP_SWAP_64: return ExpandCMP_SWAP_64(MBB, MBBI, NextMBBI); + + case ARM::tBL_PUSHLR: + case ARM::BL_PUSHLR: { + const bool Thumb = Opcode == ARM::tBL_PUSHLR; + Register Reg = MI.getOperand(0).getReg(); + assert(Reg == ARM::LR && "expect LR register!"); + MachineInstrBuilder MIB; + if (Thumb) { + // push {lr} + BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::tPUSH)) + .add(predOps(ARMCC::AL)) + .addReg(Reg); + + // bl __gnu_mcount_nc + MIB = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::tBL)); + } else { + // stmdb sp!, {lr} + BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::STMDB_UPD)) + .addReg(ARM::SP, RegState::Define) + .addReg(ARM::SP) + .add(predOps(ARMCC::AL)) + .addReg(Reg); + + // bl __gnu_mcount_nc + MIB = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::BL)); + } + MIB.cloneMemRefs(MI); + for (unsigned i = 1; i < MI.getNumOperands(); ++i) MIB.add(MI.getOperand(i)); + MI.eraseFromParent(); + return true; + } } } diff --git a/lib/Target/ARM/ARMFastISel.cpp b/lib/Target/ARM/ARMFastISel.cpp index 6e274d269bf2..1fc5ff6921c6 100644 --- a/lib/Target/ARM/ARMFastISel.cpp +++ b/lib/Target/ARM/ARMFastISel.cpp @@ -191,8 +191,8 @@ class ARMFastISel final : public FastISel { bool isTypeLegal(Type *Ty, MVT &VT); bool isLoadTypeLegal(Type *Ty, MVT &VT); bool ARMEmitCmp(const Value *Src1Value, const Value *Src2Value, - bool isZExt, bool isEquality); - bool ARMEmitLoad(MVT VT, unsigned &ResultReg, Address &Addr, + bool isZExt); + bool ARMEmitLoad(MVT VT, Register &ResultReg, Address &Addr, unsigned Alignment = 0, bool isZExt = true, bool allocReg = true); bool ARMEmitStore(MVT VT, unsigned SrcReg, Address &Addr, @@ -219,15 +219,15 @@ class ARMFastISel final : public FastISel { bool Return, bool isVarArg); bool ProcessCallArgs(SmallVectorImpl &Args, - SmallVectorImpl &ArgRegs, + SmallVectorImpl &ArgRegs, SmallVectorImpl &ArgVTs, SmallVectorImpl &ArgFlags, - SmallVectorImpl &RegArgs, + SmallVectorImpl &RegArgs, CallingConv::ID CC, unsigned &NumBytes, bool isVarArg); unsigned getLibcallReg(const Twine &Name); - bool FinishCall(MVT RetVT, SmallVectorImpl &UsedRegs, + bool FinishCall(MVT RetVT, SmallVectorImpl &UsedRegs, const Instruction *I, CallingConv::ID CC, unsigned &NumBytes, bool isVarArg); bool ARMEmitLibcall(const Instruction *I, RTLIB::Libcall Call); @@ -301,7 +301,7 @@ ARMFastISel::AddOptionalDefs(const MachineInstrBuilder &MIB) { unsigned ARMFastISel::fastEmitInst_r(unsigned MachineInstOpcode, const TargetRegisterClass *RC, unsigned Op0, bool Op0IsKill) { - unsigned ResultReg = createResultReg(RC); + Register ResultReg = createResultReg(RC); const MCInstrDesc &II = TII.get(MachineInstOpcode); // Make sure the input operand is sufficiently constrained to be legal @@ -913,7 +913,7 @@ void ARMFastISel::AddLoadStoreOperands(MVT VT, Address &Addr, AddOptionalDefs(MIB); } -bool ARMFastISel::ARMEmitLoad(MVT VT, unsigned &ResultReg, Address &Addr, +bool ARMFastISel::ARMEmitLoad(MVT VT, Register &ResultReg, Address &Addr, unsigned Alignment, bool isZExt, bool allocReg) { unsigned Opc; bool useAM3 = false; @@ -1045,7 +1045,7 @@ bool ARMFastISel::SelectLoad(const Instruction *I) { Address Addr; if (!ARMComputeAddress(I->getOperand(0), Addr)) return false; - unsigned ResultReg; + Register ResultReg; if (!ARMEmitLoad(VT, ResultReg, Addr, cast(I)->getAlignment())) return false; updateValueMap(I, ResultReg); @@ -1259,8 +1259,7 @@ bool ARMFastISel::SelectBranch(const Instruction *I) { 
if (ARMPred == ARMCC::AL) return false; // Emit the compare. - if (!ARMEmitCmp(CI->getOperand(0), CI->getOperand(1), CI->isUnsigned(), - CI->isEquality())) + if (!ARMEmitCmp(CI->getOperand(0), CI->getOperand(1), CI->isUnsigned())) return false; unsigned BrOpc = isThumb2 ? ARM::t2Bcc : ARM::Bcc; @@ -1349,7 +1348,7 @@ bool ARMFastISel::SelectIndirectBr(const Instruction *I) { } bool ARMFastISel::ARMEmitCmp(const Value *Src1Value, const Value *Src2Value, - bool isZExt, bool isEquality) { + bool isZExt) { Type *Ty = Src1Value->getType(); EVT SrcEVT = TLI.getValueType(DL, Ty, true); if (!SrcEVT.isSimple()) return false; @@ -1397,19 +1396,11 @@ bool ARMFastISel::ARMEmitCmp(const Value *Src1Value, const Value *Src2Value, // TODO: Verify compares. case MVT::f32: isICmp = false; - // Equality comparisons shouldn't raise Invalid on uordered inputs. - if (isEquality) - CmpOpc = UseImm ? ARM::VCMPZS : ARM::VCMPS; - else - CmpOpc = UseImm ? ARM::VCMPEZS : ARM::VCMPES; + CmpOpc = UseImm ? ARM::VCMPZS : ARM::VCMPS; break; case MVT::f64: isICmp = false; - // Equality comparisons shouldn't raise Invalid on uordered inputs. - if (isEquality) - CmpOpc = UseImm ? ARM::VCMPZD : ARM::VCMPD; - else - CmpOpc = UseImm ? ARM::VCMPEZD : ARM::VCMPED; + CmpOpc = UseImm ? ARM::VCMPZD : ARM::VCMPD; break; case MVT::i1: case MVT::i8: @@ -1485,8 +1476,7 @@ bool ARMFastISel::SelectCmp(const Instruction *I) { if (ARMPred == ARMCC::AL) return false; // Emit the compare. - if (!ARMEmitCmp(CI->getOperand(0), CI->getOperand(1), CI->isUnsigned(), - CI->isEquality())) + if (!ARMEmitCmp(CI->getOperand(0), CI->getOperand(1), CI->isUnsigned())) return false; // Now set a register based on the comparison. Explicitly set the predicates @@ -1893,10 +1883,10 @@ CCAssignFn *ARMFastISel::CCAssignFnForCall(CallingConv::ID CC, } bool ARMFastISel::ProcessCallArgs(SmallVectorImpl &Args, - SmallVectorImpl &ArgRegs, + SmallVectorImpl &ArgRegs, SmallVectorImpl &ArgVTs, SmallVectorImpl &ArgFlags, - SmallVectorImpl &RegArgs, + SmallVectorImpl &RegArgs, CallingConv::ID CC, unsigned &NumBytes, bool isVarArg) { @@ -1960,7 +1950,7 @@ bool ARMFastISel::ProcessCallArgs(SmallVectorImpl &Args, for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { CCValAssign &VA = ArgLocs[i]; const Value *ArgVal = Args[VA.getValNo()]; - unsigned Arg = ArgRegs[VA.getValNo()]; + Register Arg = ArgRegs[VA.getValNo()]; MVT ArgVT = ArgVTs[VA.getValNo()]; assert((!ArgVT.isVector() && ArgVT.getSizeInBits() <= 64) && @@ -2039,7 +2029,7 @@ bool ARMFastISel::ProcessCallArgs(SmallVectorImpl &Args, return true; } -bool ARMFastISel::FinishCall(MVT RetVT, SmallVectorImpl &UsedRegs, +bool ARMFastISel::FinishCall(MVT RetVT, SmallVectorImpl &UsedRegs, const Instruction *I, CallingConv::ID CC, unsigned &NumBytes, bool isVarArg) { // Issue CALLSEQ_END @@ -2060,7 +2050,7 @@ bool ARMFastISel::FinishCall(MVT RetVT, SmallVectorImpl &UsedRegs, // double fp reg we want. 
MVT DestVT = RVLocs[0].getValVT(); const TargetRegisterClass* DstRC = TLI.getRegClassFor(DestVT); - unsigned ResultReg = createResultReg(DstRC); + Register ResultReg = createResultReg(DstRC); AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM::VMOVDRR), ResultReg) .addReg(RVLocs[0].getLocReg()) @@ -2081,7 +2071,7 @@ bool ARMFastISel::FinishCall(MVT RetVT, SmallVectorImpl &UsedRegs, const TargetRegisterClass* DstRC = TLI.getRegClassFor(CopyVT); - unsigned ResultReg = createResultReg(DstRC); + Register ResultReg = createResultReg(DstRC); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::COPY), ResultReg).addReg(RVLocs[0].getLocReg()); @@ -2162,7 +2152,7 @@ bool ARMFastISel::SelectRet(const Instruction *I) { } // Make the copy. - unsigned DstReg = VA.getLocReg(); + Register DstReg = VA.getLocReg(); const TargetRegisterClass* SrcRC = MRI.getRegClass(SrcReg); // Avoid a cross-class copy. This is very unlikely. if (!SrcRC->contains(DstReg)) @@ -2231,7 +2221,7 @@ bool ARMFastISel::ARMEmitLibcall(const Instruction *I, RTLIB::Libcall Call) { // Set up the argument vectors. SmallVector Args; - SmallVector ArgRegs; + SmallVector ArgRegs; SmallVector ArgVTs; SmallVector ArgFlags; Args.reserve(I->getNumOperands()); @@ -2247,8 +2237,7 @@ bool ARMFastISel::ARMEmitLibcall(const Instruction *I, RTLIB::Libcall Call) { if (!isTypeLegal(ArgTy, ArgVT)) return false; ISD::ArgFlagsTy Flags; - unsigned OriginalAlignment = DL.getABITypeAlignment(ArgTy); - Flags.setOrigAlign(OriginalAlignment); + Flags.setOrigAlign(Align(DL.getABITypeAlignment(ArgTy))); Args.push_back(Op); ArgRegs.push_back(Arg); @@ -2257,13 +2246,13 @@ bool ARMFastISel::ARMEmitLibcall(const Instruction *I, RTLIB::Libcall Call) { } // Handle the arguments now that we've gotten them. - SmallVector RegArgs; + SmallVector RegArgs; unsigned NumBytes; if (!ProcessCallArgs(Args, ArgRegs, ArgVTs, ArgFlags, RegArgs, CC, NumBytes, false)) return false; - unsigned CalleeReg = 0; + Register CalleeReg; if (Subtarget->genLongCalls()) { CalleeReg = getLibcallReg(TLI.getLibcallName(Call)); if (CalleeReg == 0) return false; @@ -2282,7 +2271,7 @@ bool ARMFastISel::ARMEmitLibcall(const Instruction *I, RTLIB::Libcall Call) { MIB.addExternalSymbol(TLI.getLibcallName(Call)); // Add implicit physical register uses to the call. - for (unsigned R : RegArgs) + for (Register R : RegArgs) MIB.addReg(R, RegState::Implicit); // Add a register mask with the call-preserved registers. @@ -2290,7 +2279,7 @@ bool ARMFastISel::ARMEmitLibcall(const Instruction *I, RTLIB::Libcall Call) { MIB.addRegMask(TRI.getCallPreservedMask(*FuncInfo.MF, CC)); // Finish off the call including any return values. - SmallVector UsedRegs; + SmallVector UsedRegs; if (!FinishCall(RetVT, UsedRegs, I, CC, NumBytes, false)) return false; // Set all unused physreg defs as dead. @@ -2340,7 +2329,7 @@ bool ARMFastISel::SelectCall(const Instruction *I, // Set up the argument vectors. 
SmallVector Args; - SmallVector ArgRegs; + SmallVector ArgRegs; SmallVector ArgVTs; SmallVector ArgFlags; unsigned arg_size = CS.arg_size(); @@ -2377,12 +2366,11 @@ bool ARMFastISel::SelectCall(const Instruction *I, ArgVT != MVT::i1) return false; - unsigned Arg = getRegForValue(*i); - if (Arg == 0) + Register Arg = getRegForValue(*i); + if (!Arg.isValid()) return false; - unsigned OriginalAlignment = DL.getABITypeAlignment(ArgTy); - Flags.setOrigAlign(OriginalAlignment); + Flags.setOrigAlign(Align(DL.getABITypeAlignment(ArgTy))); Args.push_back(*i); ArgRegs.push_back(Arg); @@ -2391,7 +2379,7 @@ bool ARMFastISel::SelectCall(const Instruction *I, } // Handle the arguments now that we've gotten them. - SmallVector RegArgs; + SmallVector RegArgs; unsigned NumBytes; if (!ProcessCallArgs(Args, ArgRegs, ArgVTs, ArgFlags, RegArgs, CC, NumBytes, isVarArg)) @@ -2401,7 +2389,7 @@ bool ARMFastISel::SelectCall(const Instruction *I, const GlobalValue *GV = dyn_cast(Callee); if (!GV || Subtarget->genLongCalls()) UseReg = true; - unsigned CalleeReg = 0; + Register CalleeReg; if (UseReg) { if (IntrMemName) CalleeReg = getLibcallReg(IntrMemName); @@ -2427,7 +2415,7 @@ bool ARMFastISel::SelectCall(const Instruction *I, MIB.addExternalSymbol(IntrMemName, 0); // Add implicit physical register uses to the call. - for (unsigned R : RegArgs) + for (Register R : RegArgs) MIB.addReg(R, RegState::Implicit); // Add a register mask with the call-preserved registers. @@ -2435,7 +2423,7 @@ bool ARMFastISel::SelectCall(const Instruction *I, MIB.addRegMask(TRI.getCallPreservedMask(*FuncInfo.MF, CC)); // Finish off the call including any return values. - SmallVector UsedRegs; + SmallVector UsedRegs; if (!FinishCall(RetVT, UsedRegs, I, CC, NumBytes, isVarArg)) return false; @@ -2476,7 +2464,7 @@ bool ARMFastISel::ARMTryEmitSmallMemCpy(Address Dest, Address Src, } bool RV; - unsigned ResultReg; + Register ResultReg; RV = ARMEmitLoad(VT, ResultReg, Src); assert(RV && "Should be able to handle this load."); RV = ARMEmitStore(VT, ResultReg, Dest); @@ -2506,7 +2494,7 @@ bool ARMFastISel::SelectIntrinsicCall(const IntrinsicInst &I) { const ARMBaseRegisterInfo *RegInfo = static_cast(Subtarget->getRegisterInfo()); - unsigned FramePtr = RegInfo->getFrameRegister(*(FuncInfo.MF)); + Register FramePtr = RegInfo->getFrameRegister(*(FuncInfo.MF)); unsigned SrcReg = FramePtr; // Recursively load frame address @@ -2947,7 +2935,7 @@ bool ARMFastISel::tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo, Address Addr; if (!ARMComputeAddress(LI->getOperand(0), Addr)) return false; - unsigned ResultReg = MI->getOperand(0).getReg(); + Register ResultReg = MI->getOperand(0).getReg(); if (!ARMEmitLoad(VT, ResultReg, Addr, LI->getAlignment(), isZExt, false)) return false; MachineBasicBlock::iterator I(MI); @@ -2974,7 +2962,7 @@ unsigned ARMFastISel::ARMLowerPICELF(const GlobalValue *GV, MF->getMachineMemOperand(MachinePointerInfo::getConstantPool(*MF), MachineMemOperand::MOLoad, 4, 4); - unsigned TempReg = MF->getRegInfo().createVirtualRegister(&ARM::rGPRRegClass); + Register TempReg = MF->getRegInfo().createVirtualRegister(&ARM::rGPRRegClass); unsigned Opc = isThumb2 ? 
ARM::t2LDRpci : ARM::LDRcp; MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), TempReg) diff --git a/lib/Target/ARM/ARMFrameLowering.cpp b/lib/Target/ARM/ARMFrameLowering.cpp index bedb779bcba0..01ae93086dcb 100644 --- a/lib/Target/ARM/ARMFrameLowering.cpp +++ b/lib/Target/ARM/ARMFrameLowering.cpp @@ -76,7 +76,7 @@ skipAlignedDPRCS2Spills(MachineBasicBlock::iterator MI, unsigned NumAlignedDPRCS2Regs); ARMFrameLowering::ARMFrameLowering(const ARMSubtarget &sti) - : TargetFrameLowering(StackGrowsDown, sti.getStackAlignment(), 0, 4), + : TargetFrameLowering(StackGrowsDown, sti.getStackAlignment(), 0, Align(4)), STI(sti) {} bool ARMFrameLowering::keepFramePointer(const MachineFunction &MF) const { @@ -376,7 +376,7 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF, // to determine the end of the prologue. DebugLoc dl; - unsigned FramePtr = RegInfo->getFrameRegister(MF); + Register FramePtr = RegInfo->getFrameRegister(MF); // Determine the sizes of each callee-save spill areas and record which frame // belongs to which callee-save spill areas. @@ -780,7 +780,7 @@ void ARMFrameLowering::emitEpilogue(MachineFunction &MF, unsigned ArgRegsSaveSize = AFI->getArgRegsSaveSize(); int NumBytes = (int)MFI.getStackSize(); - unsigned FramePtr = RegInfo->getFrameRegister(MF); + Register FramePtr = RegInfo->getFrameRegister(MF); // All calls are tail calls in GHC calling conv, and functions have no // prologue/epilogue. @@ -1503,11 +1503,17 @@ static unsigned EstimateFunctionSizeInBytes(const MachineFunction &MF, /// instructions will require a scratch register during their expansion later. // FIXME: Move to TII? static unsigned estimateRSStackSizeLimit(MachineFunction &MF, - const TargetFrameLowering *TFI) { + const TargetFrameLowering *TFI, + bool &HasNonSPFrameIndex) { const ARMFunctionInfo *AFI = MF.getInfo(); + const ARMBaseInstrInfo &TII = + *static_cast(MF.getSubtarget().getInstrInfo()); + const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); unsigned Limit = (1 << 12) - 1; for (auto &MBB : MF) { for (auto &MI : MBB) { + if (MI.isDebugInstr()) + continue; for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { if (!MI.getOperand(i).isFI()) continue; @@ -1518,13 +1524,29 @@ static unsigned estimateRSStackSizeLimit(MachineFunction &MF, Limit = std::min(Limit, (1U << 8) - 1); break; } + // t2ADDri will not require an extra register, it can reuse the + // destination. + if (MI.getOpcode() == ARM::t2ADDri || MI.getOpcode() == ARM::t2ADDri12) + break; + + const MCInstrDesc &MCID = MI.getDesc(); + const TargetRegisterClass *RegClass = TII.getRegClass(MCID, i, TRI, MF); + if (RegClass && !RegClass->contains(ARM::SP)) + HasNonSPFrameIndex = true; // Otherwise check the addressing mode. switch (MI.getDesc().TSFlags & ARMII::AddrModeMask) { + case ARMII::AddrMode_i12: + case ARMII::AddrMode2: + // Default 12 bit limit. + break; case ARMII::AddrMode3: case ARMII::AddrModeT2_i8: Limit = std::min(Limit, (1U << 8) - 1); break; + case ARMII::AddrMode5FP16: + Limit = std::min(Limit, ((1U << 8) - 1) * 2); + break; case ARMII::AddrMode5: case ARMII::AddrModeT2_i8s4: case ARMII::AddrModeT2_ldrex: @@ -1541,8 +1563,17 @@ static unsigned estimateRSStackSizeLimit(MachineFunction &MF, // Addressing modes 4 & 6 (load/store) instructions can't encode an // immediate offset for stack references. 
return 0; - default: + case ARMII::AddrModeT2_i7: + Limit = std::min(Limit, ((1U << 7) - 1) * 1); + break; + case ARMII::AddrModeT2_i7s2: + Limit = std::min(Limit, ((1U << 7) - 1) * 2); break; + case ARMII::AddrModeT2_i7s4: + Limit = std::min(Limit, ((1U << 7) - 1) * 4); + break; + default: + llvm_unreachable("Unhandled addressing mode in stack size limit calculation"); } break; // At most one FI per instruction } @@ -1623,7 +1654,7 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF, MachineRegisterInfo &MRI = MF.getRegInfo(); const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); (void)TRI; // Silence unused warning in non-assert builds. - unsigned FramePtr = RegInfo->getFrameRegister(MF); + Register FramePtr = RegInfo->getFrameRegister(MF); // Spill R4 if Thumb2 function requires stack realignment - it will be used as // scratch register. Also spill R4 if Thumb2 function has varsized objects, @@ -1784,6 +1815,7 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF, EstimatedStackSize += 16; // For possible paddings. unsigned EstimatedRSStackSizeLimit, EstimatedRSFixedSizeLimit; + bool HasNonSPFrameIndex = false; if (AFI->isThumb1OnlyFunction()) { // For Thumb1, don't bother to iterate over the function. The only // instruction that requires an emergency spill slot is a store to a @@ -1804,7 +1836,8 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF, EstimatedRSStackSizeLimit = (1U << 8) * 4; EstimatedRSFixedSizeLimit = (1U << 5) * 4; } else { - EstimatedRSStackSizeLimit = estimateRSStackSizeLimit(MF, this); + EstimatedRSStackSizeLimit = + estimateRSStackSizeLimit(MF, this, HasNonSPFrameIndex); EstimatedRSFixedSizeLimit = EstimatedRSStackSizeLimit; } // Final estimate of whether sp or bp-relative accesses might require @@ -1830,12 +1863,11 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF, HasFP && (MaxFixedOffset - MaxFPOffset) > (int)EstimatedRSFixedSizeLimit; bool BigFrameOffsets = HasLargeStack || !HasBPOrFixedSP || - HasLargeArgumentList; + HasLargeArgumentList || HasNonSPFrameIndex; LLVM_DEBUG(dbgs() << "EstimatedLimit: " << EstimatedRSStackSizeLimit - << "; EstimatedStack" << EstimatedStackSize - << "; EstimatedFPStack" << MaxFixedOffset - MaxFPOffset - << "; BigFrameOffsets: " << BigFrameOffsets - << "\n"); + << "; EstimatedStack: " << EstimatedStackSize + << "; EstimatedFPStack: " << MaxFixedOffset - MaxFPOffset + << "; BigFrameOffsets: " << BigFrameOffsets << "\n"); if (BigFrameOffsets || !CanEliminateFrame || RegInfo->cannotEliminateFrame(MF)) { AFI->setHasStackFrame(true); @@ -2080,9 +2112,8 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF, ExtraCSSpill = true; } } - if (!ExtraCSSpill) { + if (!ExtraCSSpill && RS) { // Reserve a slot closest to SP or frame pointer. - assert(RS && "Register scavenging not provided"); LLVM_DEBUG(dbgs() << "Reserving emergency spill slot\n"); const TargetRegisterClass &RC = ARM::GPRRegClass; unsigned Size = TRI->getSpillSize(RC); @@ -2097,6 +2128,12 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF, AFI->setLRIsSpilledForFarJump(true); } AFI->setLRIsSpilled(SavedRegs.test(ARM::LR)); + + // If we have the "returned" parameter attribute which guarantees that we + // return the value which was passed in r0 unmodified (e.g. C++ 'structors), + // record that fact for IPRA. 
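Aside: the estimateRSStackSizeLimit switch above sizes each addressing mode's reach as (2^bits - 1) scaled by the access granularity, which is where figures such as ((1U << 7) - 1) * 4 come from. A small standalone check of that mapping against the values visible in the hunk; the helper name is illustrative:

  #include <cassert>

  // Reach of an immediate-offset addressing mode: an unsigned field of
  // Bits bits, scaled by Scale bytes per unit.
  constexpr unsigned maxOffsetBytes(unsigned Bits, unsigned Scale) {
    return ((1u << Bits) - 1) * Scale;
  }

  int main() {
    assert(maxOffsetBytes(12, 1) == 4095); // AddrMode_i12 / AddrMode2
    assert(maxOffsetBytes(8, 1) == 255);   // AddrMode3 / AddrModeT2_i8
    assert(maxOffsetBytes(8, 2) == 510);   // AddrMode5FP16
    assert(maxOffsetBytes(7, 1) == 127);   // AddrModeT2_i7
    assert(maxOffsetBytes(7, 2) == 254);   // AddrModeT2_i7s2
    assert(maxOffsetBytes(7, 4) == 508);   // AddrModeT2_i7s4
  }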
+ if (AFI->getPreservesR0()) + SavedRegs.set(ARM::R0); } MachineBasicBlock::iterator ARMFrameLowering::eliminateCallFramePseudoInstr( diff --git a/lib/Target/ARM/ARMFrameLowering.h b/lib/Target/ARM/ARMFrameLowering.h index 7544ca3c38d6..6d8aee597945 100644 --- a/lib/Target/ARM/ARMFrameLowering.h +++ b/lib/Target/ARM/ARMFrameLowering.h @@ -63,6 +63,11 @@ public: bool enableShrinkWrapping(const MachineFunction &MF) const override { return true; } + bool isProfitableForNoCSROpt(const Function &F) const override { + // The no-CSR optimisation is bad for code size on ARM, because we can save + // many registers with a single PUSH/POP pair. + return false; + } private: void emitPushInst(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, diff --git a/lib/Target/ARM/ARMISelDAGToDAG.cpp b/lib/Target/ARM/ARMISelDAGToDAG.cpp index b349627b67b1..8f6515c423eb 100644 --- a/lib/Target/ARM/ARMISelDAGToDAG.cpp +++ b/lib/Target/ARM/ARMISelDAGToDAG.cpp @@ -139,6 +139,8 @@ public: bool SelectThumbAddrModeImm5S4(SDValue N, SDValue &Base, SDValue &OffImm); bool SelectThumbAddrModeSP(SDValue N, SDValue &Base, SDValue &OffImm); + template + bool SelectTAddrModeImm7(SDValue N, SDValue &Base, SDValue &OffImm); // Thumb 2 Addressing Modes: bool SelectT2AddrModeImm12(SDValue N, SDValue &Base, SDValue &OffImm); @@ -146,9 +148,12 @@ public: SDValue &OffImm); bool SelectT2AddrModeImm8Offset(SDNode *Op, SDValue N, SDValue &OffImm); - template - bool SelectT2AddrModeImm7(SDValue N, SDValue &Base, - SDValue &OffImm); + template + bool SelectT2AddrModeImm7Offset(SDNode *Op, SDValue N, SDValue &OffImm); + bool SelectT2AddrModeImm7Offset(SDNode *Op, SDValue N, SDValue &OffImm, + unsigned Shift); + template + bool SelectT2AddrModeImm7(SDValue N, SDValue &Base, SDValue &OffImm); bool SelectT2AddrModeSoReg(SDValue N, SDValue &Base, SDValue &OffReg, SDValue &ShImm); bool SelectT2AddrModeExclusive(SDValue N, SDValue &Base, SDValue &OffImm); @@ -179,6 +184,7 @@ private: bool tryARMIndexedLoad(SDNode *N); bool tryT1IndexedLoad(SDNode *N); bool tryT2IndexedLoad(SDNode *N); + bool tryMVEIndexedLoad(SDNode *N); /// SelectVLD - Select NEON load intrinsics. NumVecs should be /// 1, 2, 3 or 4. The opcode arrays specify the instructions used for @@ -246,10 +252,6 @@ private: SDValue GetVLDSTAlign(SDValue Align, const SDLoc &dl, unsigned NumVecs, bool is64BitVector); - /// Returns the number of instructions required to materialize the given - /// constant in a register, or 3 if a literal pool load is needed. - unsigned ConstantMaterializationCost(unsigned Val) const; - /// Checks if N is a multiplication by a constant where we can extract out a /// power of two from the constant so that it can be used in a shift, but only /// if it simplifies the materialization of the constant. 
Returns true if it @@ -450,27 +452,6 @@ bool ARMDAGToDAGISel::isShifterOpProfitable(const SDValue &Shift, (ShAmt == 2 || (Subtarget->isSwift() && ShAmt == 1)); } -unsigned ARMDAGToDAGISel::ConstantMaterializationCost(unsigned Val) const { - if (Subtarget->isThumb()) { - if (Val <= 255) return 1; // MOV - if (Subtarget->hasV6T2Ops() && - (Val <= 0xffff || // MOV - ARM_AM::getT2SOImmVal(Val) != -1 || // MOVW - ARM_AM::getT2SOImmVal(~Val) != -1)) // MVN - return 1; - if (Val <= 510) return 2; // MOV + ADDi8 - if (~Val <= 255) return 2; // MOV + MVN - if (ARM_AM::isThumbImmShiftedVal(Val)) return 2; // MOV + LSL - } else { - if (ARM_AM::getSOImmVal(Val) != -1) return 1; // MOV - if (ARM_AM::getSOImmVal(~Val) != -1) return 1; // MVN - if (Subtarget->hasV6T2Ops() && Val <= 0xffff) return 1; // MOVW - if (ARM_AM::isSOImmTwoPartVal(Val)) return 2; // two instrs - } - if (Subtarget->useMovt()) return 2; // MOVW + MOVT - return 3; // Literal pool load -} - bool ARMDAGToDAGISel::canExtractShiftFromMul(const SDValue &N, unsigned MaxShift, unsigned &PowerOfTwo, @@ -500,8 +481,8 @@ bool ARMDAGToDAGISel::canExtractShiftFromMul(const SDValue &N, // Only optimise if the new cost is better unsigned NewMulConstVal = MulConstVal / (1 << PowerOfTwo); NewMulConst = CurDAG->getConstant(NewMulConstVal, SDLoc(N), MVT::i32); - unsigned OldCost = ConstantMaterializationCost(MulConstVal); - unsigned NewCost = ConstantMaterializationCost(NewMulConstVal); + unsigned OldCost = ConstantMaterializationCost(MulConstVal, Subtarget); + unsigned NewCost = ConstantMaterializationCost(NewMulConstVal, Subtarget); return NewCost < OldCost; } @@ -1172,6 +1153,28 @@ bool ARMDAGToDAGISel::SelectThumbAddrModeSP(SDValue N, return false; } +template +bool ARMDAGToDAGISel::SelectTAddrModeImm7(SDValue N, SDValue &Base, + SDValue &OffImm) { + if (N.getOpcode() == ISD::SUB || CurDAG->isBaseWithConstantOffset(N)) { + int RHSC; + if (isScaledConstantInRange(N.getOperand(1), 1 << Shift, -0x7f, 0x80, + RHSC)) { + Base = N.getOperand(0); + if (N.getOpcode() == ISD::SUB) + RHSC = -RHSC; + OffImm = + CurDAG->getTargetConstant(RHSC * (1 << Shift), SDLoc(N), MVT::i32); + return true; + } + } + + // Base only. 
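Aside: the new imm7 address-mode selectors in the surrounding hunks all reduce to one test: the byte offset must be an exact multiple of the access size, and the scaled result must lie in the window passed to isScaledConstantInRange (-0x7f to 0x80 here). A standalone sketch of that predicate under those assumptions; the function below is illustrative, not the LLVM helper:

  #include <cassert>

  // Accepts a byte offset if it is a multiple of Scale and the scaled value
  // lies in [-0x7f, 0x80), mirroring the isScaledConstantInRange calls.
  bool isScaledImm7(int ByteOffset, int Scale, int &Scaled) {
    if (ByteOffset % Scale != 0)
      return false;
    Scaled = ByteOffset / Scale;
    return Scaled >= -0x7f && Scaled < 0x80;
  }

  int main() {
    int S;
    assert(isScaledImm7(508, 4, S) && S == 127);   // max positive word offset
    assert(!isScaledImm7(510, 4, S));              // not a multiple of 4
    assert(isScaledImm7(-254, 2, S) && S == -127); // negative offsets allowed
    assert(!isScaledImm7(512, 4, S));              // 128 is out of range
  }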
+ Base = N; + OffImm = CurDAG->getTargetConstant(0, SDLoc(N), MVT::i32); + return true; +} + //===----------------------------------------------------------------------===// // Thumb 2 Addressing Modes @@ -1278,35 +1281,59 @@ bool ARMDAGToDAGISel::SelectT2AddrModeImm8Offset(SDNode *Op, SDValue N, return false; } -template -bool ARMDAGToDAGISel::SelectT2AddrModeImm7(SDValue N, - SDValue &Base, SDValue &OffImm) { - if (N.getOpcode() == ISD::SUB || - CurDAG->isBaseWithConstantOffset(N)) { - if (auto RHS = dyn_cast(N.getOperand(1))) { - int RHSC = (int)RHS->getZExtValue(); - if (N.getOpcode() == ISD::SUB) - RHSC = -RHSC; - - if (isShiftedInt<7, Shift>(RHSC)) { - Base = N.getOperand(0); - if (Base.getOpcode() == ISD::FrameIndex) { - int FI = cast(Base)->getIndex(); - Base = CurDAG->getTargetFrameIndex( +template +bool ARMDAGToDAGISel::SelectT2AddrModeImm7(SDValue N, SDValue &Base, + SDValue &OffImm) { + if (N.getOpcode() == ISD::SUB || CurDAG->isBaseWithConstantOffset(N)) { + int RHSC; + if (isScaledConstantInRange(N.getOperand(1), 1 << Shift, -0x7f, 0x80, + RHSC)) { + Base = N.getOperand(0); + if (Base.getOpcode() == ISD::FrameIndex) { + int FI = cast(Base)->getIndex(); + Base = CurDAG->getTargetFrameIndex( FI, TLI->getPointerTy(CurDAG->getDataLayout())); - } - OffImm = CurDAG->getTargetConstant(RHSC, SDLoc(N), MVT::i32); - return true; } + + if (N.getOpcode() == ISD::SUB) + RHSC = -RHSC; + OffImm = + CurDAG->getTargetConstant(RHSC * (1 << Shift), SDLoc(N), MVT::i32); + return true; } } // Base only. Base = N; - OffImm = CurDAG->getTargetConstant(0, SDLoc(N), MVT::i32); + OffImm = CurDAG->getTargetConstant(0, SDLoc(N), MVT::i32); return true; } +template +bool ARMDAGToDAGISel::SelectT2AddrModeImm7Offset(SDNode *Op, SDValue N, + SDValue &OffImm) { + return SelectT2AddrModeImm7Offset(Op, N, OffImm, Shift); +} + +bool ARMDAGToDAGISel::SelectT2AddrModeImm7Offset(SDNode *Op, SDValue N, + SDValue &OffImm, + unsigned Shift) { + unsigned Opcode = Op->getOpcode(); + ISD::MemIndexedMode AM = (Opcode == ISD::LOAD) + ? cast(Op)->getAddressingMode() + : cast(Op)->getAddressingMode(); + int RHSC; + if (isScaledConstantInRange(N, 1 << Shift, 0, 0x80, RHSC)) { // 7 bits. + OffImm = + ((AM == ISD::PRE_INC) || (AM == ISD::POST_INC)) + ? CurDAG->getTargetConstant(RHSC * (1 << Shift), SDLoc(N), MVT::i32) + : CurDAG->getTargetConstant(-RHSC * (1 << Shift), SDLoc(N), + MVT::i32); + return true; + } + return false; +} + bool ARMDAGToDAGISel::SelectT2AddrModeSoReg(SDValue N, SDValue &Base, SDValue &OffReg, SDValue &ShImm) { @@ -1565,6 +1592,68 @@ bool ARMDAGToDAGISel::tryT2IndexedLoad(SDNode *N) { return false; } +bool ARMDAGToDAGISel::tryMVEIndexedLoad(SDNode *N) { + LoadSDNode *LD = cast(N); + ISD::MemIndexedMode AM = LD->getAddressingMode(); + if (AM == ISD::UNINDEXED) + return false; + EVT LoadedVT = LD->getMemoryVT(); + if (!LoadedVT.isVector()) + return false; + bool isSExtLd = LD->getExtensionType() == ISD::SEXTLOAD; + SDValue Offset; + bool isPre = (AM == ISD::PRE_INC) || (AM == ISD::PRE_DEC); + unsigned Opcode = 0; + unsigned Align = LD->getAlignment(); + bool IsLE = Subtarget->isLittle(); + + if (Align >= 2 && LoadedVT == MVT::v4i16 && + SelectT2AddrModeImm7Offset(N, LD->getOffset(), Offset, 1)) { + if (isSExtLd) + Opcode = isPre ? ARM::MVE_VLDRHS32_pre : ARM::MVE_VLDRHS32_post; + else + Opcode = isPre ? ARM::MVE_VLDRHU32_pre : ARM::MVE_VLDRHU32_post; + } else if (LoadedVT == MVT::v8i8 && + SelectT2AddrModeImm7Offset(N, LD->getOffset(), Offset, 0)) { + if (isSExtLd) + Opcode = isPre ? 
ARM::MVE_VLDRBS16_pre : ARM::MVE_VLDRBS16_post; + else + Opcode = isPre ? ARM::MVE_VLDRBU16_pre : ARM::MVE_VLDRBU16_post; + } else if (LoadedVT == MVT::v4i8 && + SelectT2AddrModeImm7Offset(N, LD->getOffset(), Offset, 0)) { + if (isSExtLd) + Opcode = isPre ? ARM::MVE_VLDRBS32_pre : ARM::MVE_VLDRBS32_post; + else + Opcode = isPre ? ARM::MVE_VLDRBU32_pre : ARM::MVE_VLDRBU32_post; + } else if (Align >= 4 && + (IsLE || LoadedVT == MVT::v4i32 || LoadedVT == MVT::v4f32) && + SelectT2AddrModeImm7Offset(N, LD->getOffset(), Offset, 2)) + Opcode = isPre ? ARM::MVE_VLDRWU32_pre : ARM::MVE_VLDRWU32_post; + else if (Align >= 2 && + (IsLE || LoadedVT == MVT::v8i16 || LoadedVT == MVT::v8f16) && + SelectT2AddrModeImm7Offset(N, LD->getOffset(), Offset, 1)) + Opcode = isPre ? ARM::MVE_VLDRHU16_pre : ARM::MVE_VLDRHU16_post; + else if ((IsLE || LoadedVT == MVT::v16i8) && + SelectT2AddrModeImm7Offset(N, LD->getOffset(), Offset, 0)) + Opcode = isPre ? ARM::MVE_VLDRBU8_pre : ARM::MVE_VLDRBU8_post; + else + return false; + + SDValue Chain = LD->getChain(); + SDValue Base = LD->getBasePtr(); + SDValue Ops[] = {Base, Offset, + CurDAG->getTargetConstant(ARMVCC::None, SDLoc(N), MVT::i32), + CurDAG->getRegister(0, MVT::i32), Chain}; + SDNode *New = CurDAG->getMachineNode(Opcode, SDLoc(N), LD->getValueType(0), + MVT::i32, MVT::Other, Ops); + transferMemOperands(N, New); + ReplaceUses(SDValue(N, 0), SDValue(New, 1)); + ReplaceUses(SDValue(N, 1), SDValue(New, 0)); + ReplaceUses(SDValue(N, 2), SDValue(New, 2)); + CurDAG->RemoveDeadNode(N); + return true; +} + /// Form a GPRPair pseudo register from a pair of GPR regs. SDNode *ARMDAGToDAGISel::createGPRPairNode(EVT VT, SDValue V0, SDValue V1) { SDLoc dl(V0.getNode()); @@ -2701,7 +2790,7 @@ void ARMDAGToDAGISel::Select(SDNode *N) { case ISD::Constant: { unsigned Val = cast(N)->getZExtValue(); // If we can't materialize the constant we need to use a literal pool - if (ConstantMaterializationCost(Val) > 2) { + if (ConstantMaterializationCost(Val, Subtarget) > 2) { SDValue CPIdx = CurDAG->getTargetConstantPool( ConstantInt::get(Type::getInt32Ty(*CurDAG->getContext()), Val), TLI->getPointerTy(CurDAG->getDataLayout())); @@ -2842,8 +2931,8 @@ void ARMDAGToDAGISel::Select(SDNode *N) { bool PreferImmediateEncoding = Subtarget->hasThumb2() && (is_t2_so_imm(Imm) || is_t2_so_imm_not(Imm)); if (!PreferImmediateEncoding && - ConstantMaterializationCost(Imm) > - ConstantMaterializationCost(~Imm)) { + ConstantMaterializationCost(Imm, Subtarget) > + ConstantMaterializationCost(~Imm, Subtarget)) { // The current immediate costs more to materialize than a negated // immediate, so negate the immediate and use a BIC. SDValue NewImm = @@ -2987,6 +3076,8 @@ void ARMDAGToDAGISel::Select(SDNode *N) { return; } case ISD::LOAD: { + if (Subtarget->hasMVEIntegerOps() && tryMVEIndexedLoad(N)) + return; if (Subtarget->isThumb() && Subtarget->hasThumb2()) { if (tryT2IndexedLoad(N)) return; @@ -2998,13 +3089,26 @@ void ARMDAGToDAGISel::Select(SDNode *N) { // Other cases are autogenerated. break; } - case ARMISD::WLS: { - SDValue Ops[] = { N->getOperand(1), // Loop count - N->getOperand(2), // Exit target + case ARMISD::WLS: + case ARMISD::LE: { + SDValue Ops[] = { N->getOperand(1), + N->getOperand(2), + N->getOperand(0) }; + unsigned Opc = N->getOpcode() == ARMISD::WLS ? 
+ ARM::t2WhileLoopStart : ARM::t2LoopEnd; + SDNode *New = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops); + ReplaceUses(N, New); + CurDAG->RemoveDeadNode(N); + return; + } + case ARMISD::LOOP_DEC: { + SDValue Ops[] = { N->getOperand(1), + N->getOperand(2), N->getOperand(0) }; - SDNode *LoopStart = - CurDAG->getMachineNode(ARM::t2WhileLoopStart, dl, MVT::Other, Ops); - ReplaceUses(N, LoopStart); + SDNode *Dec = + CurDAG->getMachineNode(ARM::t2LoopDec, dl, + CurDAG->getVTList(MVT::i32, MVT::Other), Ops); + ReplaceUses(N, Dec); CurDAG->RemoveDeadNode(N); return; } @@ -4365,7 +4469,7 @@ bool ARMDAGToDAGISel::tryInlineAsm(SDNode *N){ // Replace the two GPRs with 1 GPRPair and copy values from GPRPair to // the original GPRs. - unsigned GPVR = MRI.createVirtualRegister(&ARM::GPRPairRegClass); + Register GPVR = MRI.createVirtualRegister(&ARM::GPRPairRegClass); PairedReg = CurDAG->getRegister(GPVR, MVT::Untyped); SDValue Chain = SDValue(N,0); @@ -4401,7 +4505,7 @@ bool ARMDAGToDAGISel::tryInlineAsm(SDNode *N){ // Copy REG_SEQ into a GPRPair-typed VR and replace the original two // i32 VRs of inline asm with it. - unsigned GPVR = MRI.createVirtualRegister(&ARM::GPRPairRegClass); + Register GPVR = MRI.createVirtualRegister(&ARM::GPRPairRegClass); PairedReg = CurDAG->getRegister(GPVR, MVT::Untyped); Chain = CurDAG->getCopyToReg(T1, dl, GPVR, Pair, T1.getValue(1)); diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp index 18bb9bf3eccc..db26feb57010 100644 --- a/lib/Target/ARM/ARMISelLowering.cpp +++ b/lib/Target/ARM/ARMISelLowering.cpp @@ -245,7 +245,7 @@ void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) { const MVT IntTypes[] = { MVT::v16i8, MVT::v8i16, MVT::v4i32 }; for (auto VT : IntTypes) { - addRegisterClass(VT, &ARM::QPRRegClass); + addRegisterClass(VT, &ARM::MQPRRegClass); setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); @@ -258,12 +258,31 @@ void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) { setOperationAction(ISD::UMIN, VT, Legal); setOperationAction(ISD::UMAX, VT, Legal); setOperationAction(ISD::ABS, VT, Legal); + setOperationAction(ISD::SETCC, VT, Custom); + setOperationAction(ISD::MLOAD, VT, Custom); + setOperationAction(ISD::MSTORE, VT, Legal); + setOperationAction(ISD::CTLZ, VT, Legal); + setOperationAction(ISD::CTTZ, VT, Custom); + setOperationAction(ISD::BITREVERSE, VT, Legal); + setOperationAction(ISD::BSWAP, VT, Legal); + setOperationAction(ISD::SADDSAT, VT, Legal); + setOperationAction(ISD::UADDSAT, VT, Legal); + setOperationAction(ISD::SSUBSAT, VT, Legal); + setOperationAction(ISD::USUBSAT, VT, Legal); // No native support for these. 
setOperationAction(ISD::UDIV, VT, Expand); setOperationAction(ISD::SDIV, VT, Expand); setOperationAction(ISD::UREM, VT, Expand); setOperationAction(ISD::SREM, VT, Expand); + setOperationAction(ISD::CTPOP, VT, Expand); + + // Vector reductions + setOperationAction(ISD::VECREDUCE_ADD, VT, Legal); + setOperationAction(ISD::VECREDUCE_SMAX, VT, Legal); + setOperationAction(ISD::VECREDUCE_UMAX, VT, Legal); + setOperationAction(ISD::VECREDUCE_SMIN, VT, Legal); + setOperationAction(ISD::VECREDUCE_UMIN, VT, Legal); if (!HasMVEFP) { setOperationAction(ISD::SINT_TO_FP, VT, Expand); @@ -271,11 +290,18 @@ void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) { setOperationAction(ISD::FP_TO_SINT, VT, Expand); setOperationAction(ISD::FP_TO_UINT, VT, Expand); } + + // Pre and Post inc are supported on loads and stores + for (unsigned im = (unsigned)ISD::PRE_INC; + im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) { + setIndexedLoadAction(im, VT, Legal); + setIndexedStoreAction(im, VT, Legal); + } } const MVT FloatTypes[] = { MVT::v8f16, MVT::v4f32 }; for (auto VT : FloatTypes) { - addRegisterClass(VT, &ARM::QPRRegClass); + addRegisterClass(VT, &ARM::MQPRRegClass); if (!HasMVEFP) setAllExpand(VT); @@ -287,6 +313,16 @@ void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) { setOperationAction(ISD::BUILD_VECTOR, VT, Custom); setOperationAction(ISD::BUILD_VECTOR, VT.getVectorElementType(), Custom); setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Legal); + setOperationAction(ISD::SETCC, VT, Custom); + setOperationAction(ISD::MLOAD, VT, Custom); + setOperationAction(ISD::MSTORE, VT, Legal); + + // Pre and Post inc are supported on loads and stores + for (unsigned im = (unsigned)ISD::PRE_INC; + im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) { + setIndexedLoadAction(im, VT, Legal); + setIndexedStoreAction(im, VT, Legal); + } if (HasMVEFP) { setOperationAction(ISD::FMINNUM, VT, Legal); @@ -314,7 +350,7 @@ void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) { // vector types is inhibited at integer-only level. 
const MVT LongTypes[] = { MVT::v2i64, MVT::v2f64 }; for (auto VT : LongTypes) { - addRegisterClass(VT, &ARM::QPRRegClass); + addRegisterClass(VT, &ARM::MQPRRegClass); setAllExpand(VT); setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); @@ -334,6 +370,33 @@ void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) { setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal); setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal); setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal); + + // Pre and Post inc on these are legal, given the correct extends + for (unsigned im = (unsigned)ISD::PRE_INC; + im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) { + setIndexedLoadAction(im, MVT::v8i8, Legal); + setIndexedStoreAction(im, MVT::v8i8, Legal); + setIndexedLoadAction(im, MVT::v4i8, Legal); + setIndexedStoreAction(im, MVT::v4i8, Legal); + setIndexedLoadAction(im, MVT::v4i16, Legal); + setIndexedStoreAction(im, MVT::v4i16, Legal); + } + + // Predicate types + const MVT pTypes[] = {MVT::v16i1, MVT::v8i1, MVT::v4i1}; + for (auto VT : pTypes) { + addRegisterClass(VT, &ARM::VCCRRegClass); + setOperationAction(ISD::BUILD_VECTOR, VT, Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); + setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom); + setOperationAction(ISD::CONCAT_VECTORS, VT, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); + setOperationAction(ISD::SETCC, VT, Custom); + setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Expand); + setOperationAction(ISD::LOAD, VT, Custom); + setOperationAction(ISD::STORE, VT, Custom); + } } ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, @@ -645,8 +708,8 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, setOperationAction(ISD::FMAXNUM, MVT::f16, Legal); } - for (MVT VT : MVT::vector_valuetypes()) { - for (MVT InnerVT : MVT::vector_valuetypes()) { + for (MVT VT : MVT::fixedlen_vector_valuetypes()) { + for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) { setTruncStoreAction(VT, InnerVT, Expand); addAllExtLoads(VT, InnerVT, Expand); } @@ -669,8 +732,10 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, addMVEVectorTypes(Subtarget->hasMVEFloatOps()); // Combine low-overhead loop intrinsics so that we can lower i1 types. - if (Subtarget->hasLOB()) + if (Subtarget->hasLOB()) { setTargetDAGCombine(ISD::BRCOND); + setTargetDAGCombine(ISD::BR_CC); + } if (Subtarget->hasNEON()) { addDRTypeForNEON(MVT::v2f32); @@ -837,10 +902,6 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, setTargetDAGCombine(ISD::SHL); setTargetDAGCombine(ISD::SRL); setTargetDAGCombine(ISD::SRA); - setTargetDAGCombine(ISD::SIGN_EXTEND); - setTargetDAGCombine(ISD::ZERO_EXTEND); - setTargetDAGCombine(ISD::ANY_EXTEND); - setTargetDAGCombine(ISD::STORE); setTargetDAGCombine(ISD::FP_TO_SINT); setTargetDAGCombine(ISD::FP_TO_UINT); setTargetDAGCombine(ISD::FDIV); @@ -849,7 +910,7 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, // It is legal to extload from v4i8 to v4i16 or v4i32. 
for (MVT Ty : {MVT::v8i8, MVT::v4i8, MVT::v2i8, MVT::v4i16, MVT::v2i16, MVT::v2i32}) { - for (MVT VT : MVT::integer_vector_valuetypes()) { + for (MVT VT : MVT::integer_fixedlen_vector_valuetypes()) { setLoadExtAction(ISD::EXTLOAD, VT, Ty, Legal); setLoadExtAction(ISD::ZEXTLOAD, VT, Ty, Legal); setLoadExtAction(ISD::SEXTLOAD, VT, Ty, Legal); @@ -861,6 +922,10 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, setTargetDAGCombine(ISD::BUILD_VECTOR); setTargetDAGCombine(ISD::VECTOR_SHUFFLE); setTargetDAGCombine(ISD::INSERT_VECTOR_ELT); + setTargetDAGCombine(ISD::STORE); + setTargetDAGCombine(ISD::SIGN_EXTEND); + setTargetDAGCombine(ISD::ZERO_EXTEND); + setTargetDAGCombine(ISD::ANY_EXTEND); } if (!Subtarget->hasFP64()) { @@ -901,9 +966,10 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, setOperationAction(ISD::FP_ROUND, MVT::f32, Custom); } - if (!Subtarget->hasFP64() || !Subtarget->hasFPARMv8Base()){ + if (!Subtarget->hasFP64() || !Subtarget->hasFPARMv8Base()) { setOperationAction(ISD::FP_EXTEND, MVT::f64, Custom); - setOperationAction(ISD::FP_ROUND, MVT::f16, Custom); + if (Subtarget->hasFullFP16()) + setOperationAction(ISD::FP_ROUND, MVT::f16, Custom); } if (!Subtarget->hasFP16()) @@ -955,6 +1021,16 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, setOperationAction(ISD::ADDCARRY, MVT::i32, Custom); setOperationAction(ISD::SUBCARRY, MVT::i32, Custom); + if (Subtarget->hasDSP()) { + setOperationAction(ISD::SADDSAT, MVT::i8, Custom); + setOperationAction(ISD::SSUBSAT, MVT::i8, Custom); + setOperationAction(ISD::SADDSAT, MVT::i16, Custom); + setOperationAction(ISD::SSUBSAT, MVT::i16, Custom); + } + if (Subtarget->hasBaseDSP()) { + setOperationAction(ISD::SADDSAT, MVT::i32, Legal); + setOperationAction(ISD::SSUBSAT, MVT::i32, Legal); + } // i64 operation support. setOperationAction(ISD::MUL, MVT::i64, Expand); @@ -972,6 +1048,7 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom); setOperationAction(ISD::SRL, MVT::i64, Custom); setOperationAction(ISD::SRA, MVT::i64, Custom); + setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom); setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i64, Custom); // MVE lowers 64 bit shifts to lsll and lsrl @@ -991,7 +1068,7 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, // ARM does not have ROTL. setOperationAction(ISD::ROTL, MVT::i32, Expand); - for (MVT VT : MVT::vector_valuetypes()) { + for (MVT VT : MVT::fixedlen_vector_valuetypes()) { setOperationAction(ISD::ROTL, VT, Expand); setOperationAction(ISD::ROTR, VT, Expand); } @@ -1365,14 +1442,14 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, // On ARM arguments smaller than 4 bytes are extended, so all arguments // are at least 4 bytes aligned. - setMinStackArgumentAlignment(4); + setMinStackArgumentAlignment(Align(4)); // Prefer likely predicted branches to selects on out-of-order cores. PredictableSelectIsExpensive = Subtarget->getSchedModel().isOutOfOrder(); - setPrefLoopAlignment(Subtarget->getPrefLoopAlignment()); + setPrefLoopAlignment(Align(1ULL << Subtarget->getPrefLoopLogAlignment())); - setMinFunctionAlignment(Subtarget->isThumb() ? 1 : 2); + setMinFunctionAlignment(Subtarget->isThumb() ? 
Align(2) : Align(4)); if (Subtarget->isThumb() || Subtarget->isThumb2()) setTargetDAGCombine(ISD::ABS); @@ -1472,6 +1549,7 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const { case ARMISD::ADDE: return "ARMISD::ADDE"; case ARMISD::SUBC: return "ARMISD::SUBC"; case ARMISD::SUBE: return "ARMISD::SUBE"; + case ARMISD::LSLS: return "ARMISD::LSLS"; case ARMISD::VMOVRRD: return "ARMISD::VMOVRRD"; case ARMISD::VMOVDRR: return "ARMISD::VMOVDRR"; @@ -1496,16 +1574,9 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const { case ARMISD::WIN__CHKSTK: return "ARMISD::WIN__CHKSTK"; case ARMISD::WIN__DBZCHK: return "ARMISD::WIN__DBZCHK"; - case ARMISD::VCEQ: return "ARMISD::VCEQ"; - case ARMISD::VCEQZ: return "ARMISD::VCEQZ"; - case ARMISD::VCGE: return "ARMISD::VCGE"; - case ARMISD::VCGEZ: return "ARMISD::VCGEZ"; - case ARMISD::VCLEZ: return "ARMISD::VCLEZ"; - case ARMISD::VCGEU: return "ARMISD::VCGEU"; - case ARMISD::VCGT: return "ARMISD::VCGT"; - case ARMISD::VCGTZ: return "ARMISD::VCGTZ"; - case ARMISD::VCLTZ: return "ARMISD::VCLTZ"; - case ARMISD::VCGTU: return "ARMISD::VCGTU"; + case ARMISD::PREDICATE_CAST: return "ARMISD::PREDICATE_CAST"; + case ARMISD::VCMP: return "ARMISD::VCMP"; + case ARMISD::VCMPZ: return "ARMISD::VCMPZ"; case ARMISD::VTST: return "ARMISD::VTST"; case ARMISD::VSHLs: return "ARMISD::VSHLs"; @@ -1543,6 +1614,7 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const { case ARMISD::VTRN: return "ARMISD::VTRN"; case ARMISD::VTBL1: return "ARMISD::VTBL1"; case ARMISD::VTBL2: return "ARMISD::VTBL2"; + case ARMISD::VMOVN: return "ARMISD::VMOVN"; case ARMISD::VMULLs: return "ARMISD::VMULLs"; case ARMISD::VMULLu: return "ARMISD::VMULLu"; case ARMISD::UMAAL: return "ARMISD::UMAAL"; @@ -1560,6 +1632,10 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const { case ARMISD::SMLSLDX: return "ARMISD::SMLSLDX"; case ARMISD::SMMLAR: return "ARMISD::SMMLAR"; case ARMISD::SMMLSR: return "ARMISD::SMMLSR"; + case ARMISD::QADD16b: return "ARMISD::QADD16b"; + case ARMISD::QSUB16b: return "ARMISD::QSUB16b"; + case ARMISD::QADD8b: return "ARMISD::QADD8b"; + case ARMISD::QSUB8b: return "ARMISD::QSUB8b"; case ARMISD::BUILD_VECTOR: return "ARMISD::BUILD_VECTOR"; case ARMISD::BFI: return "ARMISD::BFI"; case ARMISD::VORRIMM: return "ARMISD::VORRIMM"; @@ -1589,6 +1665,11 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const { case ARMISD::VST3LN_UPD: return "ARMISD::VST3LN_UPD"; case ARMISD::VST4LN_UPD: return "ARMISD::VST4LN_UPD"; case ARMISD::WLS: return "ARMISD::WLS"; + case ARMISD::LE: return "ARMISD::LE"; + case ARMISD::LOOP_DEC: return "ARMISD::LOOP_DEC"; + case ARMISD::CSINV: return "ARMISD::CSINV"; + case ARMISD::CSNEG: return "ARMISD::CSNEG"; + case ARMISD::CSINC: return "ARMISD::CSINC"; } return nullptr; } @@ -1597,6 +1678,11 @@ EVT ARMTargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &, EVT VT) const { if (!VT.isVector()) return getPointerTy(DL); + + // MVE has a predicate register. + if (Subtarget->hasMVEIntegerOps() && + (VT == MVT::v4i32 || VT == MVT::v8i16 || VT == MVT::v16i8)) + return MVT::getVectorVT(MVT::i1, VT.getVectorElementCount()); return VT.changeVectorElementTypeToInteger(); } @@ -1726,34 +1812,22 @@ static ARMCC::CondCodes IntCCToARMCC(ISD::CondCode CC) { /// FPCCToARMCC - Convert a DAG fp condition code to an ARM CC. 
static void FPCCToARMCC(ISD::CondCode CC, ARMCC::CondCodes &CondCode, - ARMCC::CondCodes &CondCode2, bool &InvalidOnQNaN) { + ARMCC::CondCodes &CondCode2) { CondCode2 = ARMCC::AL; - InvalidOnQNaN = true; switch (CC) { default: llvm_unreachable("Unknown FP condition!"); case ISD::SETEQ: - case ISD::SETOEQ: - CondCode = ARMCC::EQ; - InvalidOnQNaN = false; - break; + case ISD::SETOEQ: CondCode = ARMCC::EQ; break; case ISD::SETGT: case ISD::SETOGT: CondCode = ARMCC::GT; break; case ISD::SETGE: case ISD::SETOGE: CondCode = ARMCC::GE; break; case ISD::SETOLT: CondCode = ARMCC::MI; break; case ISD::SETOLE: CondCode = ARMCC::LS; break; - case ISD::SETONE: - CondCode = ARMCC::MI; - CondCode2 = ARMCC::GT; - InvalidOnQNaN = false; - break; + case ISD::SETONE: CondCode = ARMCC::MI; CondCode2 = ARMCC::GT; break; case ISD::SETO: CondCode = ARMCC::VC; break; case ISD::SETUO: CondCode = ARMCC::VS; break; - case ISD::SETUEQ: - CondCode = ARMCC::EQ; - CondCode2 = ARMCC::VS; - InvalidOnQNaN = false; - break; + case ISD::SETUEQ: CondCode = ARMCC::EQ; CondCode2 = ARMCC::VS; break; case ISD::SETUGT: CondCode = ARMCC::HI; break; case ISD::SETUGE: CondCode = ARMCC::PL; break; case ISD::SETLT: @@ -1761,10 +1835,7 @@ static void FPCCToARMCC(ISD::CondCode CC, ARMCC::CondCodes &CondCode, case ISD::SETLE: case ISD::SETULE: CondCode = ARMCC::LE; break; case ISD::SETNE: - case ISD::SETUNE: - CondCode = ARMCC::NE; - InvalidOnQNaN = false; - break; + case ISD::SETUNE: CondCode = ARMCC::NE; break; } } @@ -1988,6 +2059,7 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, bool isVarArg = CLI.IsVarArg; MachineFunction &MF = DAG.getMachineFunction(); + MachineFunction::CallSiteInfo CSInfo; bool isStructRet = (Outs.empty()) ? false : Outs[0].Flags.isSRet(); bool isThisReturn = false; auto Attr = MF.getFunction().getFnAttribute("disable-tail-calls"); @@ -2112,6 +2184,9 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, "unexpected use of 'returned'"); isThisReturn = true; } + const TargetOptions &Options = DAG.getTarget().Options; + if (Options.EnableDebugEntryValues) + CSInfo.emplace_back(VA.getLocReg(), i); RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); } else if (isByVal) { assert(VA.isMemLoc()); @@ -2347,12 +2422,15 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); if (isTailCall) { MF.getFrameInfo().setHasTailCall(); - return DAG.getNode(ARMISD::TC_RETURN, dl, NodeTys, Ops); + SDValue Ret = DAG.getNode(ARMISD::TC_RETURN, dl, NodeTys, Ops); + DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo)); + return Ret; } // Returns a chain and a flag for retval copy to use. 
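A scalar model of the SETONE case handled above, where there is no single ARM condition code: "ordered and not equal" is tested as MI (less than) or GT (greater than) on the result of the floating-point compare, and both halves are false when either operand is a NaN, which gives the required ordered behaviour. The helper name is illustrative only.

// Both comparisons are false on NaN and when a == b, so the disjunction is
// exactly "ordered and not equal" (ISD::SETONE -> ARMCC::MI, ARMCC::GT).
static bool setONE(double a, double b) { return (a < b) || (a > b); }

SETUEQ is the mirror image: "equal or unordered" is checked as EQ or VS over the same compare.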
Chain = DAG.getNode(CallOpc, dl, NodeTys, Ops); InFlag = Chain.getValue(1); + DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo)); Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, dl, true), DAG.getIntPtrConstant(0, dl, true), InFlag, dl); @@ -2431,7 +2509,7 @@ bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags, int FI = std::numeric_limits::max(); if (Arg.getOpcode() == ISD::CopyFromReg) { unsigned VR = cast(Arg.getOperand(1))->getReg(); - if (!TargetRegisterInfo::isVirtualRegister(VR)) + if (!Register::isVirtualRegister(VR)) return false; MachineInstr *Def = MRI->getVRegDef(VR); if (!Def) @@ -3047,12 +3125,12 @@ ARMTargetLowering::LowerGlobalTLSAddressWindows(SDValue Op, // Load the current TEB (thread environment block) SDValue Ops[] = {Chain, - DAG.getConstant(Intrinsic::arm_mrc, DL, MVT::i32), - DAG.getConstant(15, DL, MVT::i32), - DAG.getConstant(0, DL, MVT::i32), - DAG.getConstant(13, DL, MVT::i32), - DAG.getConstant(0, DL, MVT::i32), - DAG.getConstant(2, DL, MVT::i32)}; + DAG.getTargetConstant(Intrinsic::arm_mrc, DL, MVT::i32), + DAG.getTargetConstant(15, DL, MVT::i32), + DAG.getTargetConstant(0, DL, MVT::i32), + DAG.getTargetConstant(13, DL, MVT::i32), + DAG.getTargetConstant(0, DL, MVT::i32), + DAG.getTargetConstant(2, DL, MVT::i32)}; SDValue CurrentTEB = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, DAG.getVTList(MVT::i32, MVT::Other), Ops); @@ -3498,6 +3576,48 @@ SDValue ARMTargetLowering::LowerEH_SJLJ_SETUP_DISPATCH(SDValue Op, Op.getOperand(0)); } +SDValue ARMTargetLowering::LowerINTRINSIC_VOID( + SDValue Op, SelectionDAG &DAG, const ARMSubtarget *Subtarget) const { + unsigned IntNo = + cast( + Op.getOperand(Op.getOperand(0).getValueType() == MVT::Other)) + ->getZExtValue(); + switch (IntNo) { + default: + return SDValue(); // Don't custom lower most intrinsics. + case Intrinsic::arm_gnu_eabi_mcount: { + MachineFunction &MF = DAG.getMachineFunction(); + EVT PtrVT = getPointerTy(DAG.getDataLayout()); + SDLoc dl(Op); + SDValue Chain = Op.getOperand(0); + // call "\01__gnu_mcount_nc" + const ARMBaseRegisterInfo *ARI = Subtarget->getRegisterInfo(); + const uint32_t *Mask = + ARI->getCallPreservedMask(DAG.getMachineFunction(), CallingConv::C); + assert(Mask && "Missing call preserved mask for calling convention"); + // Mark LR an implicit live-in. + unsigned Reg = MF.addLiveIn(ARM::LR, getRegClassFor(MVT::i32)); + SDValue ReturnAddress = + DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, PtrVT); + std::vector ResultTys = {MVT::Other, MVT::Glue}; + SDValue Callee = + DAG.getTargetExternalSymbol("\01__gnu_mcount_nc", PtrVT, 0); + SDValue RegisterMask = DAG.getRegisterMask(Mask); + if (Subtarget->isThumb()) + return SDValue( + DAG.getMachineNode( + ARM::tBL_PUSHLR, dl, ResultTys, + {ReturnAddress, DAG.getTargetConstant(ARMCC::AL, dl, PtrVT), + DAG.getRegister(0, PtrVT), Callee, RegisterMask, Chain}), + 0); + return SDValue( + DAG.getMachineNode(ARM::BL_PUSHLR, dl, ResultTys, + {ReturnAddress, Callee, RegisterMask, Chain}), + 0); + } + } +} + SDValue ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *Subtarget) const { @@ -3898,6 +4018,12 @@ SDValue ARMTargetLowering::LowerFormalArguments( // Transform the arguments in physical registers into virtual ones. unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT); + + // If this value is passed in r0 and has the returned attribute (e.g. + // C++ 'structors), record this fact for later use. 
+ if (VA.getLocReg() == ARM::R0 && Ins[VA.getValNo()].Flags.isReturned()) { + AFI->setPreservesR0(); + } } // If this is an 8 or 16-bit value, it is really passed promoted @@ -4049,6 +4175,67 @@ SDValue ARMTargetLowering::getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, std::swap(LHS, RHS); } + // Thumb1 has very limited immediate modes, so turning an "and" into a + // shift can save multiple instructions. + // + // If we have (x & C1), and C1 is an appropriate mask, we can transform it + // into "((x << n) >> n)". But that isn't necessarily profitable on its + // own. If it's the operand to an unsigned comparison with an immediate, + // we can eliminate one of the shifts: we transform + // "((x << n) >> n) == C2" to "(x << n) == (C2 << n)". + // + // We avoid transforming cases which aren't profitable due to encoding + // details: + // + // 1. C2 fits into the immediate field of a cmp, and the transformed version + // would not; in that case, we're essentially trading one immediate load for + // another. + // 2. C1 is 255 or 65535, so we can use uxtb or uxth. + // 3. C2 is zero; we have other code for this special case. + // + // FIXME: Figure out profitability for Thumb2; we usually can't save an + // instruction, since the AND is always one instruction anyway, but we could + // use narrow instructions in some cases. + if (Subtarget->isThumb1Only() && LHS->getOpcode() == ISD::AND && + LHS->hasOneUse() && isa(LHS.getOperand(1)) && + LHS.getValueType() == MVT::i32 && isa(RHS) && + !isSignedIntSetCC(CC)) { + unsigned Mask = cast(LHS.getOperand(1))->getZExtValue(); + auto *RHSC = cast(RHS.getNode()); + uint64_t RHSV = RHSC->getZExtValue(); + if (isMask_32(Mask) && (RHSV & ~Mask) == 0 && Mask != 255 && Mask != 65535) { + unsigned ShiftBits = countLeadingZeros(Mask); + if (RHSV && (RHSV > 255 || (RHSV << ShiftBits) <= 255)) { + SDValue ShiftAmt = DAG.getConstant(ShiftBits, dl, MVT::i32); + LHS = DAG.getNode(ISD::SHL, dl, MVT::i32, LHS.getOperand(0), ShiftAmt); + RHS = DAG.getConstant(RHSV << ShiftBits, dl, MVT::i32); + } + } + } + + // The specific comparison "(x< 0x80000000U" can be optimized to a + // single "lsls x, c+1". The shift sets the "C" and "Z" flags the same + // way a cmp would. + // FIXME: Add support for ARM/Thumb2; this would need isel patterns, and + // some tweaks to the heuristics for the previous and->shift transform. + // FIXME: Optimize cases where the LHS isn't a shift. + if (Subtarget->isThumb1Only() && LHS->getOpcode() == ISD::SHL && + isa(RHS) && + cast(RHS)->getZExtValue() == 0x80000000U && + CC == ISD::SETUGT && isa(LHS.getOperand(1)) && + cast(LHS.getOperand(1))->getZExtValue() < 31) { + unsigned ShiftAmt = + cast(LHS.getOperand(1))->getZExtValue() + 1; + SDValue Shift = DAG.getNode(ARMISD::LSLS, dl, + DAG.getVTList(MVT::i32, MVT::i32), + LHS.getOperand(0), + DAG.getConstant(ShiftAmt, dl, MVT::i32)); + SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, ARM::CPSR, + Shift.getValue(1), SDValue()); + ARMcc = DAG.getConstant(ARMCC::HI, dl, MVT::i32); + return Chain.getValue(1); + } + ARMCC::CondCodes CondCode = IntCCToARMCC(CC); // If the RHS is a constant zero then the V (overflow) flag will never be @@ -4083,15 +4270,13 @@ SDValue ARMTargetLowering::getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, /// Returns a appropriate VFP CMP (fcmp{s|d}+fmstat) for the given operands. 
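A small self-contained check of the Thumb1 rewrite described in the comment above, "(x & C1) == C2" into "(x << n) == (C2 << n)" for a contiguous low mask C1 with n leading zero bits and C2 contained in C1. The helper names are illustrative, not from the patch.

#include <cassert>
#include <cstdint>

// Portable stand-in for countLeadingZeros on a 32-bit value.
static unsigned countLeadingZeros32(uint32_t v) {
  unsigned n = 0;
  for (uint32_t bit = 0x80000000u; bit && !(v & bit); bit >>= 1)
    ++n;
  return n;
}

// The rewritten comparison: shifting both sides left by n discards exactly
// the bits that the mask would have cleared.
static bool cmpViaShift(uint32_t x, uint32_t c1, uint32_t c2) {
  unsigned n = countLeadingZeros32(c1);
  return (x << n) == (c2 << n);
}

int main() {
  const uint32_t xs[] = {0u, 1u, 0x1234u, 0xffffffffu};
  for (uint32_t x : xs) {
    const uint32_t c1 = 0x3ff, c2 = 0x123; // c2 has no bits outside c1
    assert(((x & c1) == c2) == cmpViaShift(x, c1, c2));
  }
  return 0;
}

The same reasoning underlies the "(x << c) < 0x80000000U" to "lsls x, c+1" trick: the extra shift moves the bit being tested into the carry flag, so the compare disappears entirely.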
SDValue ARMTargetLowering::getVFPCmp(SDValue LHS, SDValue RHS, - SelectionDAG &DAG, const SDLoc &dl, - bool InvalidOnQNaN) const { + SelectionDAG &DAG, const SDLoc &dl) const { assert(Subtarget->hasFP64() || RHS.getValueType() != MVT::f64); SDValue Cmp; - SDValue C = DAG.getConstant(InvalidOnQNaN, dl, MVT::i32); if (!isFloatingPointZero(RHS)) - Cmp = DAG.getNode(ARMISD::CMPFP, dl, MVT::Glue, LHS, RHS, C); + Cmp = DAG.getNode(ARMISD::CMPFP, dl, MVT::Glue, LHS, RHS); else - Cmp = DAG.getNode(ARMISD::CMPFPw0, dl, MVT::Glue, LHS, C); + Cmp = DAG.getNode(ARMISD::CMPFPw0, dl, MVT::Glue, LHS); return DAG.getNode(ARMISD::FMSTAT, dl, MVT::Glue, Cmp); } @@ -4108,12 +4293,10 @@ ARMTargetLowering::duplicateCmp(SDValue Cmp, SelectionDAG &DAG) const { Cmp = Cmp.getOperand(0); Opc = Cmp.getOpcode(); if (Opc == ARMISD::CMPFP) - Cmp = DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0), - Cmp.getOperand(1), Cmp.getOperand(2)); + Cmp = DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0),Cmp.getOperand(1)); else { assert(Opc == ARMISD::CMPFPw0 && "unexpected operand of FMSTAT"); - Cmp = DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0), - Cmp.getOperand(1)); + Cmp = DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0)); } return DAG.getNode(ARMISD::FMSTAT, DL, MVT::Glue, Cmp); } @@ -4276,6 +4459,35 @@ SDValue ARMTargetLowering::LowerUnsignedALUO(SDValue Op, return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow); } +static SDValue LowerSADDSUBSAT(SDValue Op, SelectionDAG &DAG, + const ARMSubtarget *Subtarget) { + EVT VT = Op.getValueType(); + if (!Subtarget->hasDSP()) + return SDValue(); + if (!VT.isSimple()) + return SDValue(); + + unsigned NewOpcode; + bool IsAdd = Op->getOpcode() == ISD::SADDSAT; + switch (VT.getSimpleVT().SimpleTy) { + default: + return SDValue(); + case MVT::i8: + NewOpcode = IsAdd ? ARMISD::QADD8b : ARMISD::QSUB8b; + break; + case MVT::i16: + NewOpcode = IsAdd ? ARMISD::QADD16b : ARMISD::QSUB16b; + break; + } + + SDLoc dl(Op); + SDValue Add = + DAG.getNode(NewOpcode, dl, MVT::i32, + DAG.getSExtOrTrunc(Op->getOperand(0), dl, MVT::i32), + DAG.getSExtOrTrunc(Op->getOperand(1), dl, MVT::i32)); + return DAG.getNode(ISD::TRUNCATE, dl, VT, Add); +} + SDValue ARMTargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { SDValue Cond = Op.getOperand(0); SDValue SelectTrue = Op.getOperand(1); @@ -4656,10 +4868,62 @@ SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { ISD::CondCode CC = cast(Op.getOperand(4))->get(); SDValue TrueVal = Op.getOperand(2); SDValue FalseVal = Op.getOperand(3); + ConstantSDNode *CFVal = dyn_cast(FalseVal); + ConstantSDNode *CTVal = dyn_cast(TrueVal); + + if (Subtarget->hasV8_1MMainlineOps() && CFVal && CTVal && + LHS.getValueType() == MVT::i32 && RHS.getValueType() == MVT::i32) { + unsigned TVal = CTVal->getZExtValue(); + unsigned FVal = CFVal->getZExtValue(); + unsigned Opcode = 0; + + if (TVal == ~FVal) { + Opcode = ARMISD::CSINV; + } else if (TVal == ~FVal + 1) { + Opcode = ARMISD::CSNEG; + } else if (TVal + 1 == FVal) { + Opcode = ARMISD::CSINC; + } else if (TVal == FVal + 1) { + Opcode = ARMISD::CSINC; + std::swap(TrueVal, FalseVal); + std::swap(TVal, FVal); + CC = ISD::getSetCCInverse(CC, true); + } + + if (Opcode) { + // If one of the constants is cheaper than another, materialise the + // cheaper one and let the csel generate the other. 
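Reference semantics of what the LowerSADDSUBSAT path above has to produce for i8: the value that QADD8b must leave in the low byte of its i32 result before the final TRUNCATE. This is a sketch under that reading, with an illustrative function name; SSUBSAT is the analogous clamp of the difference.

#include <algorithm>
#include <cstdint>

static int8_t saddsat_i8(int8_t a, int8_t b) {
  int32_t s = int32_t(a) + int32_t(b);                  // widen, like the SExt above
  s = std::min<int32_t>(127, std::max<int32_t>(-128, s)); // saturate to [-128, 127]
  return int8_t(s);                                     // like the final TRUNCATE
}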
+ if (Opcode != ARMISD::CSINC && + HasLowerConstantMaterializationCost(FVal, TVal, Subtarget)) { + std::swap(TrueVal, FalseVal); + std::swap(TVal, FVal); + CC = ISD::getSetCCInverse(CC, true); + } + + // Attempt to use ZR checking TVal is 0, possibly inverting the condition + // to get there. CSINC not is invertable like the other two (~(~a) == a, + // -(-a) == a, but (a+1)+1 != a). + if (FVal == 0 && Opcode != ARMISD::CSINC) { + std::swap(TrueVal, FalseVal); + std::swap(TVal, FVal); + CC = ISD::getSetCCInverse(CC, true); + } + if (TVal == 0) + TrueVal = DAG.getRegister(ARM::ZR, MVT::i32); + + // Drops F's value because we can get it by inverting/negating TVal. + FalseVal = TrueVal; + + SDValue ARMcc; + SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl); + EVT VT = TrueVal.getValueType(); + return DAG.getNode(Opcode, dl, VT, TrueVal, FalseVal, ARMcc, Cmp); + } + } if (isUnsupportedFloatingType(LHS.getValueType())) { DAG.getTargetLoweringInfo().softenSetCCOperands( - DAG, LHS.getValueType(), LHS, RHS, CC, dl); + DAG, LHS.getValueType(), LHS, RHS, CC, dl, LHS, RHS); // If softenSetCCOperands only returned one value, we should compare it to // zero. @@ -4701,8 +4965,7 @@ SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { } ARMCC::CondCodes CondCode, CondCode2; - bool InvalidOnQNaN; - FPCCToARMCC(CC, CondCode, CondCode2, InvalidOnQNaN); + FPCCToARMCC(CC, CondCode, CondCode2); // Normalize the fp compare. If RHS is zero we prefer to keep it there so we // match CMPFPw0 instead of CMPFP, though we don't do this for f16 because we @@ -4727,13 +4990,13 @@ SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { } SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32); - SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl, InvalidOnQNaN); + SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl); SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); SDValue Result = getCMOV(dl, VT, FalseVal, TrueVal, ARMcc, CCR, Cmp, DAG); if (CondCode2 != ARMCC::AL) { SDValue ARMcc2 = DAG.getConstant(CondCode2, dl, MVT::i32); // FIXME: Needs another CMP because flag can have but one use. - SDValue Cmp2 = getVFPCmp(LHS, RHS, DAG, dl, InvalidOnQNaN); + SDValue Cmp2 = getVFPCmp(LHS, RHS, DAG, dl); Result = getCMOV(dl, VT, Result, TrueVal, ARMcc2, CCR, Cmp2, DAG); } return Result; @@ -4903,7 +5166,7 @@ SDValue ARMTargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const { if (isUnsupportedFloatingType(LHS.getValueType())) { DAG.getTargetLoweringInfo().softenSetCCOperands( - DAG, LHS.getValueType(), LHS, RHS, CC, dl); + DAG, LHS.getValueType(), LHS, RHS, CC, dl, LHS, RHS); // If softenSetCCOperands only returned one value, we should compare it to // zero. 
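A scalar sketch of the conditional-select nodes chosen in the SELECT_CC lowering above, matching the usual reading of the v8.1-M CSINC/CSINV/CSNEG instructions: when the condition holds the first operand is returned unchanged, otherwise the second operand is incremented, inverted or negated. That is why a constant pair with FVal == TVal + 1, FVal == ~TVal or FVal == -TVal only needs one of the two values materialised. Function names are illustrative.

#include <cstdint>

static uint32_t csinc(bool cond, uint32_t a, uint32_t b) { return cond ? a : b + 1; }
static uint32_t csinv(bool cond, uint32_t a, uint32_t b) { return cond ? a : ~b; }
static uint32_t csneg(bool cond, uint32_t a, uint32_t b) { return cond ? a : 0u - b; }

As the comment above notes, only CSINV and CSNEG are their own inverses (~~a == a, -(-a) == a), so CSINC cannot be flipped the same way and is handled by swapping the operands and inverting the condition instead.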
@@ -4960,11 +5223,10 @@ SDValue ARMTargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const { } ARMCC::CondCodes CondCode, CondCode2; - bool InvalidOnQNaN; - FPCCToARMCC(CC, CondCode, CondCode2, InvalidOnQNaN); + FPCCToARMCC(CC, CondCode, CondCode2); SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32); - SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl, InvalidOnQNaN); + SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl); SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); SDVTList VTList = DAG.getVTList(MVT::Other, MVT::Glue); SDValue Ops[] = { Chain, Dest, ARMcc, CCR, Cmp }; @@ -5056,8 +5318,9 @@ SDValue ARMTargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const { else LC = RTLIB::getFPTOUINT(Op.getOperand(0).getValueType(), Op.getValueType()); + MakeLibCallOptions CallOptions; return makeLibCall(DAG, LC, Op.getValueType(), Op.getOperand(0), - /*isSigned*/ false, SDLoc(Op)).first; + CallOptions, SDLoc(Op)).first; } return Op; @@ -5120,8 +5383,9 @@ SDValue ARMTargetLowering::LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const { else LC = RTLIB::getUINTTOFP(Op.getOperand(0).getValueType(), Op.getValueType()); + MakeLibCallOptions CallOptions; return makeLibCall(DAG, LC, Op.getValueType(), Op.getOperand(0), - /*isSigned*/ false, SDLoc(Op)).first; + CallOptions, SDLoc(Op)).first; } return Op; @@ -5140,7 +5404,7 @@ SDValue ARMTargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const { if (UseNEON) { // Use VBSL to copy the sign bit. - unsigned EncodedVal = ARM_AM::createNEONModImm(0x6, 0x80); + unsigned EncodedVal = ARM_AM::createVMOVModImm(0x6, 0x80); SDValue Mask = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v2i32, DAG.getTargetConstant(EncodedVal, dl, MVT::i32)); EVT OpVT = (VT == MVT::f32) ? MVT::v2i32 : MVT::v1i64; @@ -5163,7 +5427,7 @@ SDValue ARMTargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const { Tmp0 = DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp0); Tmp1 = DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp1); - SDValue AllOnes = DAG.getTargetConstant(ARM_AM::createNEONModImm(0xe, 0xff), + SDValue AllOnes = DAG.getTargetConstant(ARM_AM::createVMOVModImm(0xe, 0xff), dl, MVT::i32); AllOnes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v8i8, AllOnes); SDValue MaskNot = DAG.getNode(ISD::XOR, dl, OpVT, Mask, @@ -5243,7 +5507,7 @@ SDValue ARMTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); SDLoc dl(Op); // FIXME probably not meaningful unsigned Depth = cast(Op.getOperand(0))->getZExtValue(); - unsigned FrameReg = ARI.getFrameRegister(MF); + Register FrameReg = ARI.getFrameRegister(MF); SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT); while (Depth--) FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr, @@ -5253,9 +5517,9 @@ SDValue ARMTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { // FIXME? Maybe this could be a TableGen attribute on some registers and // this table could be generated automatically from RegInfo. 
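A scalar model of the VBSL-based FCOPYSIGN above: with a mask that has only the sign bit set (the createVMOVModImm(0x6, 0x80) immediate, i.e. 0x80000000 per lane), the bit-select takes the sign bit from the second operand and everything else from the first. Names are illustrative and this is a per-lane sketch, not the vector code itself.

#include <cstdint>
#include <cstring>

static float copysignViaBitSelect(float mag, float sgn) {
  uint32_t m, s;
  const uint32_t mask = 0x80000000u;      // only the sign bit
  std::memcpy(&m, &mag, sizeof m);
  std::memcpy(&s, &sgn, sizeof s);
  uint32_t r = (s & mask) | (m & ~mask);  // VBSL: select sign from sgn, rest from mag
  float out;
  std::memcpy(&out, &r, sizeof out);
  return out;
}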
-unsigned ARMTargetLowering::getRegisterByName(const char* RegName, EVT VT, - SelectionDAG &DAG) const { - unsigned Reg = StringSwitch(RegName) +Register ARMTargetLowering::getRegisterByName(const char* RegName, EVT VT, + const MachineFunction &MF) const { + Register Reg = StringSwitch(RegName) .Case("sp", ARM::SP) .Default(0); if (Reg) @@ -5576,8 +5840,7 @@ static SDValue LowerCTTZ(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST) { SDLoc dl(N); EVT VT = N->getValueType(0); - if (VT.isVector()) { - assert(ST->hasNEON()); + if (VT.isVector() && ST->hasNEON()) { // Compute the least significant set bit: LSB = X & -X SDValue X = N->getOperand(0); @@ -5777,14 +6040,15 @@ static SDValue Expand64BitShift(SDNode *N, SelectionDAG &DAG, unsigned ShPartsOpc = ARMISD::LSLL; ConstantSDNode *Con = dyn_cast(ShAmt); - // If the shift amount is greater than 32 then do the default optimisation - if (Con && Con->getZExtValue() > 32) + // If the shift amount is greater than 32 or has a greater bitwidth than 64 + // then do the default optimisation + if (ShAmt->getValueType(0).getSizeInBits() > 64 || + (Con && (Con->getZExtValue() == 0 || Con->getZExtValue() >= 32))) return SDValue(); - // Extract the lower 32 bits of the shift amount if it's an i64 - if (ShAmt->getValueType(0) == MVT::i64) - ShAmt = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, ShAmt, - DAG.getConstant(0, dl, MVT::i32)); + // Extract the lower 32 bits of the shift amount if it's not an i32 + if (ShAmt->getValueType(0) != MVT::i32) + ShAmt = DAG.getZExtOrTrunc(ShAmt, dl, MVT::i32); if (ShOpc == ISD::SRL) { if (!Con) @@ -5839,20 +6103,37 @@ static SDValue Expand64BitShift(SDNode *N, SelectionDAG &DAG, return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi); } -static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) { - SDValue TmpOp0, TmpOp1; +static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG, + const ARMSubtarget *ST) { bool Invert = false; bool Swap = false; - unsigned Opc = 0; + unsigned Opc = ARMCC::AL; SDValue Op0 = Op.getOperand(0); SDValue Op1 = Op.getOperand(1); SDValue CC = Op.getOperand(2); - EVT CmpVT = Op0.getValueType().changeVectorElementTypeToInteger(); EVT VT = Op.getValueType(); ISD::CondCode SetCCOpcode = cast(CC)->get(); SDLoc dl(Op); + EVT CmpVT; + if (ST->hasNEON()) + CmpVT = Op0.getValueType().changeVectorElementTypeToInteger(); + else { + assert(ST->hasMVEIntegerOps() && + "No hardware support for integer vector comparison!"); + + if (Op.getValueType().getVectorElementType() != MVT::i1) + return SDValue(); + + // Make sure we expand floating point setcc to scalar if we do not have + // mve.fp, so that we can handle them from there. + if (Op0.getValueType().isFloatingPoint() && !ST->hasMVEFloatOps()) + return SDValue(); + + CmpVT = VT; + } + if (Op0.getValueType().getVectorElementType() == MVT::i64 && (SetCCOpcode == ISD::SETEQ || SetCCOpcode == ISD::SETNE)) { // Special-case integer 64-bit equality comparisons. 
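A sketch of the result the long-shift expansion above has to produce for a constant amount: a 64-bit value held as a lo/hi register pair and shifted left, for amounts in the 1..31 range that the code keeps after rejecting 0 and anything >= 32. The helper name mirrors the lsll mnemonic but the function itself is illustrative.

#include <cstdint>
#include <utility>

// amt is assumed to be in [1, 31]; bits shifted out of the low word are
// carried into the high word.
static std::pair<uint32_t, uint32_t> lsll(uint32_t lo, uint32_t hi, unsigned amt) {
  uint32_t newHi = (hi << amt) | (lo >> (32 - amt));
  uint32_t newLo = lo << amt;
  return {newLo, newHi}; // {low word, high word}
}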
They aren't legal, @@ -5880,60 +6161,74 @@ static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) { switch (SetCCOpcode) { default: llvm_unreachable("Illegal FP comparison"); case ISD::SETUNE: - case ISD::SETNE: Invert = true; LLVM_FALLTHROUGH; + case ISD::SETNE: + if (ST->hasMVEFloatOps()) { + Opc = ARMCC::NE; break; + } else { + Invert = true; LLVM_FALLTHROUGH; + } case ISD::SETOEQ: - case ISD::SETEQ: Opc = ARMISD::VCEQ; break; + case ISD::SETEQ: Opc = ARMCC::EQ; break; case ISD::SETOLT: case ISD::SETLT: Swap = true; LLVM_FALLTHROUGH; case ISD::SETOGT: - case ISD::SETGT: Opc = ARMISD::VCGT; break; + case ISD::SETGT: Opc = ARMCC::GT; break; case ISD::SETOLE: case ISD::SETLE: Swap = true; LLVM_FALLTHROUGH; case ISD::SETOGE: - case ISD::SETGE: Opc = ARMISD::VCGE; break; + case ISD::SETGE: Opc = ARMCC::GE; break; case ISD::SETUGE: Swap = true; LLVM_FALLTHROUGH; - case ISD::SETULE: Invert = true; Opc = ARMISD::VCGT; break; + case ISD::SETULE: Invert = true; Opc = ARMCC::GT; break; case ISD::SETUGT: Swap = true; LLVM_FALLTHROUGH; - case ISD::SETULT: Invert = true; Opc = ARMISD::VCGE; break; + case ISD::SETULT: Invert = true; Opc = ARMCC::GE; break; case ISD::SETUEQ: Invert = true; LLVM_FALLTHROUGH; - case ISD::SETONE: + case ISD::SETONE: { // Expand this to (OLT | OGT). - TmpOp0 = Op0; - TmpOp1 = Op1; - Opc = ISD::OR; - Op0 = DAG.getNode(ARMISD::VCGT, dl, CmpVT, TmpOp1, TmpOp0); - Op1 = DAG.getNode(ARMISD::VCGT, dl, CmpVT, TmpOp0, TmpOp1); - break; - case ISD::SETUO: - Invert = true; - LLVM_FALLTHROUGH; - case ISD::SETO: + SDValue TmpOp0 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op1, Op0, + DAG.getConstant(ARMCC::GT, dl, MVT::i32)); + SDValue TmpOp1 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op0, Op1, + DAG.getConstant(ARMCC::GT, dl, MVT::i32)); + SDValue Result = DAG.getNode(ISD::OR, dl, CmpVT, TmpOp0, TmpOp1); + if (Invert) + Result = DAG.getNOT(dl, Result, VT); + return Result; + } + case ISD::SETUO: Invert = true; LLVM_FALLTHROUGH; + case ISD::SETO: { // Expand this to (OLT | OGE). - TmpOp0 = Op0; - TmpOp1 = Op1; - Opc = ISD::OR; - Op0 = DAG.getNode(ARMISD::VCGT, dl, CmpVT, TmpOp1, TmpOp0); - Op1 = DAG.getNode(ARMISD::VCGE, dl, CmpVT, TmpOp0, TmpOp1); - break; + SDValue TmpOp0 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op1, Op0, + DAG.getConstant(ARMCC::GT, dl, MVT::i32)); + SDValue TmpOp1 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op0, Op1, + DAG.getConstant(ARMCC::GE, dl, MVT::i32)); + SDValue Result = DAG.getNode(ISD::OR, dl, CmpVT, TmpOp0, TmpOp1); + if (Invert) + Result = DAG.getNOT(dl, Result, VT); + return Result; + } } } else { // Integer comparisons. 
switch (SetCCOpcode) { default: llvm_unreachable("Illegal integer comparison"); - case ISD::SETNE: Invert = true; LLVM_FALLTHROUGH; - case ISD::SETEQ: Opc = ARMISD::VCEQ; break; + case ISD::SETNE: + if (ST->hasMVEIntegerOps()) { + Opc = ARMCC::NE; break; + } else { + Invert = true; LLVM_FALLTHROUGH; + } + case ISD::SETEQ: Opc = ARMCC::EQ; break; case ISD::SETLT: Swap = true; LLVM_FALLTHROUGH; - case ISD::SETGT: Opc = ARMISD::VCGT; break; + case ISD::SETGT: Opc = ARMCC::GT; break; case ISD::SETLE: Swap = true; LLVM_FALLTHROUGH; - case ISD::SETGE: Opc = ARMISD::VCGE; break; + case ISD::SETGE: Opc = ARMCC::GE; break; case ISD::SETULT: Swap = true; LLVM_FALLTHROUGH; - case ISD::SETUGT: Opc = ARMISD::VCGTU; break; + case ISD::SETUGT: Opc = ARMCC::HI; break; case ISD::SETULE: Swap = true; LLVM_FALLTHROUGH; - case ISD::SETUGE: Opc = ARMISD::VCGEU; break; + case ISD::SETUGE: Opc = ARMCC::HS; break; } // Detect VTST (Vector Test Bits) = icmp ne (and (op0, op1), zero). - if (Opc == ARMISD::VCEQ) { + if (ST->hasNEON() && Opc == ARMCC::EQ) { SDValue AndOp; if (ISD::isBuildVectorAllZeros(Op1.getNode())) AndOp = Op0; @@ -5945,10 +6240,12 @@ static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) { AndOp = AndOp.getOperand(0); if (AndOp.getNode() && AndOp.getOpcode() == ISD::AND) { - Opc = ARMISD::VTST; Op0 = DAG.getNode(ISD::BITCAST, dl, CmpVT, AndOp.getOperand(0)); Op1 = DAG.getNode(ISD::BITCAST, dl, CmpVT, AndOp.getOperand(1)); - Invert = !Invert; + SDValue Result = DAG.getNode(ARMISD::VTST, dl, CmpVT, Op0, Op1); + if (!Invert) + Result = DAG.getNOT(dl, Result, VT); + return Result; } } } @@ -5962,31 +6259,20 @@ static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) { if (ISD::isBuildVectorAllZeros(Op1.getNode())) SingleOp = Op0; else if (ISD::isBuildVectorAllZeros(Op0.getNode())) { - if (Opc == ARMISD::VCGE) - Opc = ARMISD::VCLEZ; - else if (Opc == ARMISD::VCGT) - Opc = ARMISD::VCLTZ; + if (Opc == ARMCC::GE) + Opc = ARMCC::LE; + else if (Opc == ARMCC::GT) + Opc = ARMCC::LT; SingleOp = Op1; } SDValue Result; if (SingleOp.getNode()) { - switch (Opc) { - case ARMISD::VCEQ: - Result = DAG.getNode(ARMISD::VCEQZ, dl, CmpVT, SingleOp); break; - case ARMISD::VCGE: - Result = DAG.getNode(ARMISD::VCGEZ, dl, CmpVT, SingleOp); break; - case ARMISD::VCLEZ: - Result = DAG.getNode(ARMISD::VCLEZ, dl, CmpVT, SingleOp); break; - case ARMISD::VCGT: - Result = DAG.getNode(ARMISD::VCGTZ, dl, CmpVT, SingleOp); break; - case ARMISD::VCLTZ: - Result = DAG.getNode(ARMISD::VCLTZ, dl, CmpVT, SingleOp); break; - default: - Result = DAG.getNode(Opc, dl, CmpVT, Op0, Op1); - } + Result = DAG.getNode(ARMISD::VCMPZ, dl, CmpVT, SingleOp, + DAG.getConstant(Opc, dl, MVT::i32)); } else { - Result = DAG.getNode(Opc, dl, CmpVT, Op0, Op1); + Result = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op0, Op1, + DAG.getConstant(Opc, dl, MVT::i32)); } Result = DAG.getSExtOrTrunc(Result, dl, VT); @@ -6027,13 +6313,13 @@ static SDValue LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) { CCR, Chain.getValue(1)); } -/// isNEONModifiedImm - Check if the specified splat value corresponds to a -/// valid vector constant for a NEON or MVE instruction with a "modified immediate" -/// operand (e.g., VMOV). If so, return the encoded value. -static SDValue isNEONModifiedImm(uint64_t SplatBits, uint64_t SplatUndef, +/// isVMOVModifiedImm - Check if the specified splat value corresponds to a +/// valid vector constant for a NEON or MVE instruction with a "modified +/// immediate" operand (e.g., VMOV). If so, return the encoded value. 
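The two rewrites used repeatedly in the integer SETCC mapping above, stated on scalars: an unsigned "<" is handled by swapping the operands of a ">" (HI) compare, and "<=" by inverting one, so only EQ/NE/GT/GE/HI/HS need native condition codes. A minimal self-check, with illustrative values:

#include <cassert>
#include <cstdint>

int main() {
  const uint32_t vals[] = {0u, 1u, 7u, 0xffffffffu};
  for (uint32_t a : vals)
    for (uint32_t b : vals) {
      assert((a < b) == (b > a));   // SETULT: swap operands, then HI
      assert((a <= b) == !(a > b)); // SETULE: invert an HI compare
    }
  return 0;
}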
+static SDValue isVMOVModifiedImm(uint64_t SplatBits, uint64_t SplatUndef, unsigned SplatBitSize, SelectionDAG &DAG, const SDLoc &dl, EVT &VT, bool is128Bits, - NEONModImmType type) { + VMOVModImmType type) { unsigned OpCmode, Imm; // SplatBitSize is set to the smallest size that splats the vector, so a @@ -6163,10 +6449,10 @@ static SDValue isNEONModifiedImm(uint64_t SplatBits, uint64_t SplatUndef, } default: - llvm_unreachable("unexpected size for isNEONModifiedImm"); + llvm_unreachable("unexpected size for isVMOVModifiedImm"); } - unsigned EncodedVal = ARM_AM::createNEONModImm(OpCmode, Imm); + unsigned EncodedVal = ARM_AM::createVMOVModImm(OpCmode, Imm); return DAG.getTargetConstant(EncodedVal, dl, MVT::i32); } @@ -6246,7 +6532,7 @@ SDValue ARMTargetLowering::LowerConstantFP(SDValue Op, SelectionDAG &DAG, return SDValue(); // Try a VMOV.i32 (FIXME: i8, i16, or i64 could work too). - SDValue NewVal = isNEONModifiedImm(iVal & 0xffffffffU, 0, 32, DAG, SDLoc(Op), + SDValue NewVal = isVMOVModifiedImm(iVal & 0xffffffffU, 0, 32, DAG, SDLoc(Op), VMovVT, false, VMOVModImm); if (NewVal != SDValue()) { SDLoc DL(Op); @@ -6263,7 +6549,7 @@ SDValue ARMTargetLowering::LowerConstantFP(SDValue Op, SelectionDAG &DAG, } // Finally, try a VMVN.i32 - NewVal = isNEONModifiedImm(~iVal & 0xffffffffU, 0, 32, DAG, SDLoc(Op), VMovVT, + NewVal = isVMOVModifiedImm(~iVal & 0xffffffffU, 0, 32, DAG, SDLoc(Op), VMovVT, false, VMVNModImm); if (NewVal != SDValue()) { SDLoc DL(Op); @@ -6649,6 +6935,29 @@ static bool isReverseMask(ArrayRef M, EVT VT) { return true; } +static bool isVMOVNMask(ArrayRef M, EVT VT, bool Top) { + unsigned NumElts = VT.getVectorNumElements(); + // Make sure the mask has the right size. + if (NumElts != M.size() || (VT != MVT::v8i16 && VT != MVT::v16i8)) + return false; + + // If Top + // Look for <0, N, 2, N+2, 4, N+4, ..>. + // This inserts Input2 into Input1 + // else if not Top + // Look for <0, N+1, 2, N+3, 4, N+5, ..> + // This inserts Input1 into Input2 + unsigned Offset = Top ? 0 : 1; + for (unsigned i = 0; i < NumElts; i+=2) { + if (M[i] >= 0 && M[i] != (int)i) + return false; + if (M[i+1] >= 0 && M[i+1] != (int)(NumElts + i + Offset)) + return false; + } + + return true; +} + // If N is an integer constant that can be moved into a register in one // instruction, return an SDValue of such a constant (will become a MOV // instruction). Otherwise return null. 
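A concrete instance of the shuffle mask recognised by isVMOVNMask above, for v8i16 with Top == true: the mask is <0, 8, 2, 10, 4, 12, 6, 14>, so even result lanes keep the even lanes of the first input and odd result lanes take the even lanes of the second, matching the "inserts Input2 into Input1" comment. The function below is an illustrative restatement on plain arrays, not part of the patch.

#include <array>
#include <cstdint>

static std::array<uint16_t, 8> vmovnTop(const std::array<uint16_t, 8> &in1,
                                        const std::array<uint16_t, 8> &in2) {
  std::array<uint16_t, 8> out{};
  for (int i = 0; i < 8; i += 2) {
    out[i] = in1[i];     // mask entry i:     lane i of the concatenation
    out[i + 1] = in2[i]; // mask entry i + 1: lane N + i, i.e. the second input
  }
  return out;
}

With Top == false the odd mask entries become N+1, N+3, ..., picking the odd lanes of the second input instead.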
@@ -6669,6 +6978,66 @@ static SDValue IsSingleInstrConstant(SDValue N, SelectionDAG &DAG, return SDValue(); } +static SDValue LowerBUILD_VECTOR_i1(SDValue Op, SelectionDAG &DAG, + const ARMSubtarget *ST) { + SDLoc dl(Op); + EVT VT = Op.getValueType(); + + assert(ST->hasMVEIntegerOps() && "LowerBUILD_VECTOR_i1 called without MVE!"); + + unsigned NumElts = VT.getVectorNumElements(); + unsigned BoolMask; + unsigned BitsPerBool; + if (NumElts == 4) { + BitsPerBool = 4; + BoolMask = 0xf; + } else if (NumElts == 8) { + BitsPerBool = 2; + BoolMask = 0x3; + } else if (NumElts == 16) { + BitsPerBool = 1; + BoolMask = 0x1; + } else + return SDValue(); + + // If this is a single value copied into all lanes (a splat), we can just sign + // extend that single value + SDValue FirstOp = Op.getOperand(0); + if (!isa(FirstOp) && + std::all_of(std::next(Op->op_begin()), Op->op_end(), + [&FirstOp](SDUse &U) { + return U.get().isUndef() || U.get() == FirstOp; + })) { + SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::i32, FirstOp, + DAG.getValueType(MVT::i1)); + return DAG.getNode(ARMISD::PREDICATE_CAST, dl, Op.getValueType(), Ext); + } + + // First create base with bits set where known + unsigned Bits32 = 0; + for (unsigned i = 0; i < NumElts; ++i) { + SDValue V = Op.getOperand(i); + if (!isa(V) && !V.isUndef()) + continue; + bool BitSet = V.isUndef() ? false : cast(V)->getZExtValue(); + if (BitSet) + Bits32 |= BoolMask << (i * BitsPerBool); + } + + // Add in unknown nodes + SDValue Base = DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT, + DAG.getConstant(Bits32, dl, MVT::i32)); + for (unsigned i = 0; i < NumElts; ++i) { + SDValue V = Op.getOperand(i); + if (isa(V) || V.isUndef()) + continue; + Base = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Base, V, + DAG.getConstant(i, dl, MVT::i32)); + } + + return Base; +} + // If this is a case we can't handle, return null and let the default // expansion code take care of it. SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, @@ -6677,6 +7046,9 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, SDLoc dl(Op); EVT VT = Op.getValueType(); + if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == 1) + return LowerBUILD_VECTOR_i1(Op, DAG, ST); + APInt SplatBits, SplatUndef; unsigned SplatBitSize; bool HasAnyUndefs; @@ -6688,7 +7060,7 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, (ST->hasMVEIntegerOps() && SplatBitSize <= 32)) { // Check if an immediate VMOV works. EVT VmovVT; - SDValue Val = isNEONModifiedImm(SplatBits.getZExtValue(), + SDValue Val = isVMOVModifiedImm(SplatBits.getZExtValue(), SplatUndef.getZExtValue(), SplatBitSize, DAG, dl, VmovVT, VT.is128BitVector(), VMOVModImm); @@ -6700,7 +7072,7 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, // Try an immediate VMVN. uint64_t NegatedImm = (~SplatBits).getZExtValue(); - Val = isNEONModifiedImm( + Val = isVMOVModifiedImm( NegatedImm, SplatUndef.getZExtValue(), SplatBitSize, DAG, dl, VmovVT, VT.is128BitVector(), ST->hasMVEIntegerOps() ? MVEVMVNModImm : VMVNModImm); @@ -7088,9 +7460,6 @@ SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op, LaneMask[j] = ExtractBase + j; } - // Final check before we try to produce nonsense... - if (!isShuffleMaskLegal(Mask, ShuffleVT)) - return SDValue(); // We can't handle more than two sources. This should have already // been checked before this point. 
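A standalone sketch of the packing scheme LowerBUILD_VECTOR_i1 uses above: an MVE predicate is a 16-bit value, so a v4i1 lane occupies 4 bits (BoolMask 0xf), a v8i1 lane 2 bits (0x3) and a v16i1 lane 1 bit (0x1), and all known-constant lanes are folded into one 32-bit immediate. The helper name is illustrative.

#include <cstdint>

static uint16_t packPredicate(const bool *lanes, unsigned numElts) {
  unsigned bitsPerBool = 16 / numElts;                 // 4, 2 or 1
  uint16_t boolMask = uint16_t((1u << bitsPerBool) - 1);
  uint16_t bits = 0;
  for (unsigned i = 0; i < numElts; ++i)
    if (lanes[i])
      bits |= uint16_t(boolMask << (i * bitsPerBool)); // set the whole lane field
  return bits;
}

Lanes that are not compile-time constants are then patched in one at a time with INSERT_VECTOR_ELT, exactly as the second loop in the lowering does.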
@@ -7100,8 +7469,10 @@ SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op, for (unsigned i = 0; i < Sources.size(); ++i) ShuffleOps[i] = Sources[i].ShuffleVec; - SDValue Shuffle = DAG.getVectorShuffle(ShuffleVT, dl, ShuffleOps[0], - ShuffleOps[1], Mask); + SDValue Shuffle = buildLegalVectorShuffle(ShuffleVT, dl, ShuffleOps[0], + ShuffleOps[1], Mask, DAG); + if (!Shuffle) + return SDValue(); return DAG.getNode(ISD::BITCAST, dl, VT, Shuffle); } @@ -7168,6 +7539,7 @@ bool ARMTargetLowering::isShuffleMaskLegal(ArrayRef M, EVT VT) const { unsigned EltSize = VT.getScalarSizeInBits(); if (EltSize >= 32 || ShuffleVectorSDNode::isSplatMask(&M[0], VT) || + ShuffleVectorInst::isIdentityMask(M) || isVREVMask(M, VT, 64) || isVREVMask(M, VT, 32) || isVREVMask(M, VT, 16)) @@ -7180,6 +7552,9 @@ bool ARMTargetLowering::isShuffleMaskLegal(ArrayRef M, EVT VT) const { else if (Subtarget->hasNEON() && (VT == MVT::v8i16 || VT == MVT::v16i8) && isReverseMask(M, VT)) return true; + else if (Subtarget->hasMVEIntegerOps() && + (isVMOVNMask(M, VT, 0) || isVMOVNMask(M, VT, 1))) + return true; else return false; } @@ -7282,6 +7657,94 @@ static SDValue LowerReverse_VECTOR_SHUFFLEv16i8_v8i16(SDValue Op, DAG.getConstant(ExtractNum, DL, MVT::i32)); } +static EVT getVectorTyFromPredicateVector(EVT VT) { + switch (VT.getSimpleVT().SimpleTy) { + case MVT::v4i1: + return MVT::v4i32; + case MVT::v8i1: + return MVT::v8i16; + case MVT::v16i1: + return MVT::v16i8; + default: + llvm_unreachable("Unexpected vector predicate type"); + } +} + +static SDValue PromoteMVEPredVector(SDLoc dl, SDValue Pred, EVT VT, + SelectionDAG &DAG) { + // Converting from boolean predicates to integers involves creating a vector + // of all ones or all zeroes and selecting the lanes based upon the real + // predicate. + SDValue AllOnes = + DAG.getTargetConstant(ARM_AM::createVMOVModImm(0xe, 0xff), dl, MVT::i32); + AllOnes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v16i8, AllOnes); + + SDValue AllZeroes = + DAG.getTargetConstant(ARM_AM::createVMOVModImm(0xe, 0x0), dl, MVT::i32); + AllZeroes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v16i8, AllZeroes); + + // Get full vector type from predicate type + EVT NewVT = getVectorTyFromPredicateVector(VT); + + SDValue RecastV1; + // If the real predicate is an v8i1 or v4i1 (not v16i1) then we need to recast + // this to a v16i1. This cannot be done with an ordinary bitcast because the + // sizes are not the same. We have to use a MVE specific PREDICATE_CAST node, + // since we know in hardware the sizes are really the same. + if (VT != MVT::v16i1) + RecastV1 = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::v16i1, Pred); + else + RecastV1 = Pred; + + // Select either all ones or zeroes depending upon the real predicate bits. + SDValue PredAsVector = + DAG.getNode(ISD::VSELECT, dl, MVT::v16i8, RecastV1, AllOnes, AllZeroes); + + // Recast our new predicate-as-integer v16i8 vector into something + // appropriate for the shuffle, i.e. v4i32 for a real v4i1 predicate. 
+ return DAG.getNode(ISD::BITCAST, dl, NewVT, PredAsVector); +} + +static SDValue LowerVECTOR_SHUFFLE_i1(SDValue Op, SelectionDAG &DAG, + const ARMSubtarget *ST) { + EVT VT = Op.getValueType(); + ShuffleVectorSDNode *SVN = cast(Op.getNode()); + ArrayRef ShuffleMask = SVN->getMask(); + + assert(ST->hasMVEIntegerOps() && + "No support for vector shuffle of boolean predicates"); + + SDValue V1 = Op.getOperand(0); + SDLoc dl(Op); + if (isReverseMask(ShuffleMask, VT)) { + SDValue cast = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, V1); + SDValue rbit = DAG.getNode(ISD::BITREVERSE, dl, MVT::i32, cast); + SDValue srl = DAG.getNode(ISD::SRL, dl, MVT::i32, rbit, + DAG.getConstant(16, dl, MVT::i32)); + return DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT, srl); + } + + // Until we can come up with optimised cases for every single vector + // shuffle in existence we have chosen the least painful strategy. This is + // to essentially promote the boolean predicate to a 8-bit integer, where + // each predicate represents a byte. Then we fall back on a normal integer + // vector shuffle and convert the result back into a predicate vector. In + // many cases the generated code might be even better than scalar code + // operating on bits. Just imagine trying to shuffle 8 arbitrary 2-bit + // fields in a register into 8 other arbitrary 2-bit fields! + SDValue PredAsVector = PromoteMVEPredVector(dl, V1, VT, DAG); + EVT NewVT = PredAsVector.getValueType(); + + // Do the shuffle! + SDValue Shuffled = DAG.getVectorShuffle(NewVT, dl, PredAsVector, + DAG.getUNDEF(NewVT), ShuffleMask); + + // Now return the result of comparing the shuffled vector with zero, + // which will generate a real predicate, i.e. v4i1, v8i1 or v16i1. + return DAG.getNode(ARMISD::VCMPZ, dl, VT, Shuffled, + DAG.getConstant(ARMCC::NE, dl, MVT::i32)); +} + static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST) { SDValue V1 = Op.getOperand(0); @@ -7289,6 +7752,10 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG, SDLoc dl(Op); EVT VT = Op.getValueType(); ShuffleVectorSDNode *SVN = cast(Op.getNode()); + unsigned EltSize = VT.getScalarSizeInBits(); + + if (ST->hasMVEIntegerOps() && EltSize == 1) + return LowerVECTOR_SHUFFLE_i1(Op, DAG, ST); // Convert shuffles that are directly supported on NEON to target-specific // DAG nodes, instead of keeping them as shuffles and matching them again @@ -7298,7 +7765,6 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG, // of the same time so that they get CSEd properly. 
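A scalar model of the shuffle strategy above for a v4i1 predicate: each i1 lane is widened to an all-ones/all-zeroes integer lane (the VSELECT over PromoteMVEPredVector's constants), the lanes are permuted as ordinary integers, and the result is turned back into a predicate by comparing with zero (VCMPZ with ARMCC::NE). Names and the fixed width of four lanes are illustrative only.

#include <array>
#include <cstdint>

static std::array<bool, 4> shufflePred(const std::array<bool, 4> &p,
                                       const std::array<int, 4> &mask) {
  std::array<int32_t, 4> wide{}, shuffled{};
  for (int i = 0; i < 4; ++i)
    wide[i] = p[i] ? -1 : 0;                       // select all-ones / all-zeroes
  for (int i = 0; i < 4; ++i)
    shuffled[i] = mask[i] < 0 ? 0 : wide[mask[i]]; // ordinary vector shuffle (undef -> 0)
  std::array<bool, 4> out{};
  for (int i = 0; i < 4; ++i)
    out[i] = shuffled[i] != 0;                     // compare-with-zero -> predicate again
  return out;
}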
ArrayRef ShuffleMask = SVN->getMask(); - unsigned EltSize = VT.getScalarSizeInBits(); if (EltSize <= 32) { if (SVN->isSplat()) { int Lane = SVN->getSplatIndex(); @@ -7364,6 +7830,14 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG, .getValue(WhichResult); } } + if (ST->hasMVEIntegerOps()) { + if (isVMOVNMask(ShuffleMask, VT, 0)) + return DAG.getNode(ARMISD::VMOVN, dl, VT, V2, V1, + DAG.getConstant(0, dl, MVT::i32)); + if (isVMOVNMask(ShuffleMask, VT, 1)) + return DAG.getNode(ARMISD::VMOVN, dl, VT, V1, V2, + DAG.getConstant(1, dl, MVT::i32)); + } // Also check for these shuffles through CONCAT_VECTORS: we canonicalize // shuffles that produce a result larger than their operands with: @@ -7468,8 +7942,29 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG, return SDValue(); } -SDValue ARMTargetLowering:: -LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const { +static SDValue LowerINSERT_VECTOR_ELT_i1(SDValue Op, SelectionDAG &DAG, + const ARMSubtarget *ST) { + EVT VecVT = Op.getOperand(0).getValueType(); + SDLoc dl(Op); + + assert(ST->hasMVEIntegerOps() && + "LowerINSERT_VECTOR_ELT_i1 called without MVE!"); + + SDValue Conv = + DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, Op->getOperand(0)); + unsigned Lane = cast(Op.getOperand(2))->getZExtValue(); + unsigned LaneWidth = + getVectorTyFromPredicateVector(VecVT).getScalarSizeInBits() / 8; + unsigned Mask = ((1 << LaneWidth) - 1) << Lane * LaneWidth; + SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::i32, + Op.getOperand(1), DAG.getValueType(MVT::i1)); + SDValue BFI = DAG.getNode(ARMISD::BFI, dl, MVT::i32, Conv, Ext, + DAG.getConstant(~Mask, dl, MVT::i32)); + return DAG.getNode(ARMISD::PREDICATE_CAST, dl, Op.getValueType(), BFI); +} + +SDValue ARMTargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, + SelectionDAG &DAG) const { // INSERT_VECTOR_ELT is legal only for immediate indexes. SDValue Lane = Op.getOperand(2); if (!isa(Lane)) @@ -7477,6 +7972,11 @@ LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const { SDValue Elt = Op.getOperand(1); EVT EltVT = Elt.getValueType(); + + if (Subtarget->hasMVEIntegerOps() && + Op.getValueType().getScalarSizeInBits() == 1) + return LowerINSERT_VECTOR_ELT_i1(Op, DAG, Subtarget); + if (getTypeAction(*DAG.getContext(), EltVT) == TargetLowering::TypePromoteFloat) { // INSERT_VECTOR_ELT doesn't want f16 operands promoting to f32, @@ -7505,13 +8005,37 @@ LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const { return Op; } -static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) { +static SDValue LowerEXTRACT_VECTOR_ELT_i1(SDValue Op, SelectionDAG &DAG, + const ARMSubtarget *ST) { + EVT VecVT = Op.getOperand(0).getValueType(); + SDLoc dl(Op); + + assert(ST->hasMVEIntegerOps() && + "LowerINSERT_VECTOR_ELT_i1 called without MVE!"); + + SDValue Conv = + DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, Op->getOperand(0)); + unsigned Lane = cast(Op.getOperand(1))->getZExtValue(); + unsigned LaneWidth = + getVectorTyFromPredicateVector(VecVT).getScalarSizeInBits() / 8; + SDValue Shift = DAG.getNode(ISD::SRL, dl, MVT::i32, Conv, + DAG.getConstant(Lane * LaneWidth, dl, MVT::i32)); + return Shift; +} + +static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG, + const ARMSubtarget *ST) { // EXTRACT_VECTOR_ELT is legal only for immediate indexes. 
SDValue Lane = Op.getOperand(1); if (!isa(Lane)) return SDValue(); SDValue Vec = Op.getOperand(0); + EVT VT = Vec.getValueType(); + + if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == 1) + return LowerEXTRACT_VECTOR_ELT_i1(Op, DAG, ST); + if (Op.getValueType() == MVT::i32 && Vec.getScalarValueSizeInBits() < 32) { SDLoc dl(Op); return DAG.getNode(ARMISD::VGETLANEu, dl, MVT::i32, Vec, Lane); @@ -7520,7 +8044,64 @@ static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) { return Op; } -static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) { +static SDValue LowerCONCAT_VECTORS_i1(SDValue Op, SelectionDAG &DAG, + const ARMSubtarget *ST) { + SDValue V1 = Op.getOperand(0); + SDValue V2 = Op.getOperand(1); + SDLoc dl(Op); + EVT VT = Op.getValueType(); + EVT Op1VT = V1.getValueType(); + EVT Op2VT = V2.getValueType(); + unsigned NumElts = VT.getVectorNumElements(); + + assert(Op1VT == Op2VT && "Operand types don't match!"); + assert(VT.getScalarSizeInBits() == 1 && + "Unexpected custom CONCAT_VECTORS lowering"); + assert(ST->hasMVEIntegerOps() && + "CONCAT_VECTORS lowering only supported for MVE"); + + SDValue NewV1 = PromoteMVEPredVector(dl, V1, Op1VT, DAG); + SDValue NewV2 = PromoteMVEPredVector(dl, V2, Op2VT, DAG); + + // We now have Op1 + Op2 promoted to vectors of integers, where v8i1 gets + // promoted to v8i16, etc. + + MVT ElType = getVectorTyFromPredicateVector(VT).getScalarType().getSimpleVT(); + + // Extract the vector elements from Op1 and Op2 one by one and truncate them + // to be the right size for the destination. For example, if Op1 is v4i1 then + // the promoted vector is v4i32. The result of concatentation gives a v8i1, + // which when promoted is v8i16. That means each i32 element from Op1 needs + // truncating to i16 and inserting in the result. + EVT ConcatVT = MVT::getVectorVT(ElType, NumElts); + SDValue ConVec = DAG.getNode(ISD::UNDEF, dl, ConcatVT); + auto ExractInto = [&DAG, &dl](SDValue NewV, SDValue ConVec, unsigned &j) { + EVT NewVT = NewV.getValueType(); + EVT ConcatVT = ConVec.getValueType(); + for (unsigned i = 0, e = NewVT.getVectorNumElements(); i < e; i++, j++) { + SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, NewV, + DAG.getIntPtrConstant(i, dl)); + ConVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ConcatVT, ConVec, Elt, + DAG.getConstant(j, dl, MVT::i32)); + } + return ConVec; + }; + unsigned j = 0; + ConVec = ExractInto(NewV1, ConVec, j); + ConVec = ExractInto(NewV2, ConVec, j); + + // Now return the result of comparing the subvector with zero, + // which will generate a real predicate, i.e. v4i1, v8i1 or v16i1. + return DAG.getNode(ARMISD::VCMPZ, dl, VT, ConVec, + DAG.getConstant(ARMCC::NE, dl, MVT::i32)); +} + +static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG, + const ARMSubtarget *ST) { + EVT VT = Op->getValueType(0); + if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == 1) + return LowerCONCAT_VECTORS_i1(Op, DAG, ST); + // The only time a CONCAT_VECTORS operation can have legal types is when // two 64-bit vectors are concatenated to a 128-bit vector. 
assert(Op.getValueType().is128BitVector() && Op.getNumOperands() == 2 && @@ -7540,6 +8121,43 @@ static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) { return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Val); } +static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG, + const ARMSubtarget *ST) { + SDValue V1 = Op.getOperand(0); + SDValue V2 = Op.getOperand(1); + SDLoc dl(Op); + EVT VT = Op.getValueType(); + EVT Op1VT = V1.getValueType(); + unsigned NumElts = VT.getVectorNumElements(); + unsigned Index = cast(V2)->getZExtValue(); + + assert(VT.getScalarSizeInBits() == 1 && + "Unexpected custom EXTRACT_SUBVECTOR lowering"); + assert(ST->hasMVEIntegerOps() && + "EXTRACT_SUBVECTOR lowering only supported for MVE"); + + SDValue NewV1 = PromoteMVEPredVector(dl, V1, Op1VT, DAG); + + // We now have Op1 promoted to a vector of integers, where v8i1 gets + // promoted to v8i16, etc. + + MVT ElType = getVectorTyFromPredicateVector(VT).getScalarType().getSimpleVT(); + + EVT SubVT = MVT::getVectorVT(ElType, NumElts); + SDValue SubVec = DAG.getNode(ISD::UNDEF, dl, SubVT); + for (unsigned i = Index, j = 0; i < (Index + NumElts); i++, j++) { + SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, NewV1, + DAG.getIntPtrConstant(i, dl)); + SubVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, SubVT, SubVec, Elt, + DAG.getConstant(j, dl, MVT::i32)); + } + + // Now return the result of comparing the subvector with zero, + // which will generate a real predicate, i.e. v4i1, v8i1 or v16i1. + return DAG.getNode(ARMISD::VCMPZ, dl, VT, SubVec, + DAG.getConstant(ARMCC::NE, dl, MVT::i32)); +} + /// isExtendedBUILD_VECTOR - Check if N is a constant BUILD_VECTOR where each /// element has been zero/sign-extended, depending on the isSigned parameter, /// from an integer type half its size. @@ -7897,7 +8515,8 @@ static SDValue LowerSDIV_v4i16(SDValue N0, SDValue N1, const SDLoc &dl, return N0; } -static SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG) { +static SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG, + const ARMSubtarget *ST) { EVT VT = Op.getValueType(); assert((VT == MVT::v4i16 || VT == MVT::v8i8) && "unexpected type for custom-lowering ISD::SDIV"); @@ -7924,7 +8543,7 @@ static SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG) { N2 = LowerSDIV_v4i8(N2, N3, dl, DAG); // v4i16 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, N0, N2); - N0 = LowerCONCAT_VECTORS(N0, DAG); + N0 = LowerCONCAT_VECTORS(N0, DAG, ST); N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v8i8, N0); return N0; @@ -7932,7 +8551,8 @@ static SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG) { return LowerSDIV_v4i16(N0, N1, dl, DAG); } -static SDValue LowerUDIV(SDValue Op, SelectionDAG &DAG) { +static SDValue LowerUDIV(SDValue Op, SelectionDAG &DAG, + const ARMSubtarget *ST) { // TODO: Should this propagate fast-math-flags? 
EVT VT = Op.getValueType(); assert((VT == MVT::v4i16 || VT == MVT::v8i8) && @@ -7960,7 +8580,7 @@ static SDValue LowerUDIV(SDValue Op, SelectionDAG &DAG) { N2 = LowerSDIV_v4i16(N2, N3, dl, DAG); // v4i16 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, N0, N2); - N0 = LowerCONCAT_VECTORS(N0, DAG); + N0 = LowerCONCAT_VECTORS(N0, DAG, ST); N0 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v8i8, DAG.getConstant(Intrinsic::arm_neon_vqmovnsu, dl, @@ -8255,6 +8875,96 @@ void ARMTargetLowering::ExpandDIV_Windows( Results.push_back(Upper); } +static SDValue LowerPredicateLoad(SDValue Op, SelectionDAG &DAG) { + LoadSDNode *LD = cast(Op.getNode()); + EVT MemVT = LD->getMemoryVT(); + assert((MemVT == MVT::v4i1 || MemVT == MVT::v8i1 || MemVT == MVT::v16i1) && + "Expected a predicate type!"); + assert(MemVT == Op.getValueType()); + assert(LD->getExtensionType() == ISD::NON_EXTLOAD && + "Expected a non-extending load"); + assert(LD->isUnindexed() && "Expected a unindexed load"); + + // The basic MVE VLDR on a v4i1/v8i1 actually loads the entire 16bit + // predicate, with the "v4i1" bits spread out over the 16 bits loaded. We + // need to make sure that 8/4 bits are actually loaded into the correct + // place, which means loading the value and then shuffling the values into + // the bottom bits of the predicate. + // Equally, VLDR for an v16i1 will actually load 32bits (so will be incorrect + // for BE). + + SDLoc dl(Op); + SDValue Load = DAG.getExtLoad( + ISD::EXTLOAD, dl, MVT::i32, LD->getChain(), LD->getBasePtr(), + EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits()), + LD->getMemOperand()); + SDValue Pred = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::v16i1, Load); + if (MemVT != MVT::v16i1) + Pred = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MemVT, Pred, + DAG.getConstant(0, dl, MVT::i32)); + return DAG.getMergeValues({Pred, Load.getValue(1)}, dl); +} + +static SDValue LowerPredicateStore(SDValue Op, SelectionDAG &DAG) { + StoreSDNode *ST = cast(Op.getNode()); + EVT MemVT = ST->getMemoryVT(); + assert((MemVT == MVT::v4i1 || MemVT == MVT::v8i1 || MemVT == MVT::v16i1) && + "Expected a predicate type!"); + assert(MemVT == ST->getValue().getValueType()); + assert(!ST->isTruncatingStore() && "Expected a non-extending store"); + assert(ST->isUnindexed() && "Expected a unindexed store"); + + // Only store the v4i1 or v8i1 worth of bits, via a buildvector with top bits + // unset and a scalar store. 
+ SDLoc dl(Op); + SDValue Build = ST->getValue(); + if (MemVT != MVT::v16i1) { + SmallVector Ops; + for (unsigned I = 0; I < MemVT.getVectorNumElements(); I++) + Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, Build, + DAG.getConstant(I, dl, MVT::i32))); + for (unsigned I = MemVT.getVectorNumElements(); I < 16; I++) + Ops.push_back(DAG.getUNDEF(MVT::i32)); + Build = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i1, Ops); + } + SDValue GRP = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, Build); + return DAG.getTruncStore( + ST->getChain(), dl, GRP, ST->getBasePtr(), + EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits()), + ST->getMemOperand()); +} + +static SDValue LowerMLOAD(SDValue Op, SelectionDAG &DAG) { + MaskedLoadSDNode *N = cast(Op.getNode()); + MVT VT = Op.getSimpleValueType(); + SDValue Mask = N->getMask(); + SDValue PassThru = N->getPassThru(); + SDLoc dl(Op); + + auto IsZero = [](SDValue PassThru) { + return (ISD::isBuildVectorAllZeros(PassThru.getNode()) || + (PassThru->getOpcode() == ARMISD::VMOVIMM && + isNullConstant(PassThru->getOperand(0)))); + }; + + if (IsZero(PassThru)) + return Op; + + // MVE Masked loads use zero as the passthru value. Here we convert undef to + // zero too, and other values are lowered to a select. + SDValue ZeroVec = DAG.getNode(ARMISD::VMOVIMM, dl, VT, + DAG.getTargetConstant(0, dl, MVT::i32)); + SDValue NewLoad = DAG.getMaskedLoad( + VT, dl, N->getChain(), N->getBasePtr(), Mask, ZeroVec, N->getMemoryVT(), + N->getMemOperand(), N->getExtensionType(), N->isExpandingLoad()); + SDValue Combo = NewLoad; + if (!PassThru.isUndef() && + (PassThru.getOpcode() != ISD::BITCAST || + !IsZero(PassThru->getOperand(0)))) + Combo = DAG.getNode(ISD::VSELECT, dl, VT, Mask, NewLoad, PassThru); + return DAG.getMergeValues({Combo, NewLoad.getValue(1)}, dl); +} + static SDValue LowerAtomicLoadStore(SDValue Op, SelectionDAG &DAG) { if (isStrongerThanMonotonic(cast(Op)->getOrdering())) // Acquire/Release load/store is not legal for targets without a dmb or @@ -8273,12 +8983,12 @@ static void ReplaceREADCYCLECOUNTER(SDNode *N, // Under Power Management extensions, the cycle-count is: // mrc p15, #0, , c9, c13, #0 SDValue Ops[] = { N->getOperand(0), // Chain - DAG.getConstant(Intrinsic::arm_mrc, DL, MVT::i32), - DAG.getConstant(15, DL, MVT::i32), - DAG.getConstant(0, DL, MVT::i32), - DAG.getConstant(9, DL, MVT::i32), - DAG.getConstant(13, DL, MVT::i32), - DAG.getConstant(0, DL, MVT::i32) + DAG.getTargetConstant(Intrinsic::arm_mrc, DL, MVT::i32), + DAG.getTargetConstant(15, DL, MVT::i32), + DAG.getTargetConstant(0, DL, MVT::i32), + DAG.getTargetConstant(9, DL, MVT::i32), + DAG.getTargetConstant(13, DL, MVT::i32), + DAG.getTargetConstant(0, DL, MVT::i32) }; SDValue Cycles32 = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, @@ -8412,6 +9122,7 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::EH_SJLJ_SETJMP: return LowerEH_SJLJ_SETJMP(Op, DAG); case ISD::EH_SJLJ_LONGJMP: return LowerEH_SJLJ_LONGJMP(Op, DAG); case ISD::EH_SJLJ_SETUP_DISPATCH: return LowerEH_SJLJ_SETUP_DISPATCH(Op, DAG); + case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG, Subtarget); case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG, Subtarget); case ISD::BITCAST: return ExpandBITCAST(Op.getNode(), DAG, Subtarget); @@ -8426,24 +9137,25 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::CTTZ: case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op.getNode(), DAG, Subtarget); case 
ISD::CTPOP: return LowerCTPOP(Op.getNode(), DAG, Subtarget); - case ISD::SETCC: return LowerVSETCC(Op, DAG); + case ISD::SETCC: return LowerVSETCC(Op, DAG, Subtarget); case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG); case ISD::ConstantFP: return LowerConstantFP(Op, DAG, Subtarget); case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG, Subtarget); case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG, Subtarget); + case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG, Subtarget); case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG); - case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG); - case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG); + case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG, Subtarget); + case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG, Subtarget); case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG); case ISD::MUL: return LowerMUL(Op, DAG); case ISD::SDIV: if (Subtarget->isTargetWindows() && !Op.getValueType().isVector()) return LowerDIV_Windows(Op, DAG, /* Signed */ true); - return LowerSDIV(Op, DAG); + return LowerSDIV(Op, DAG, Subtarget); case ISD::UDIV: if (Subtarget->isTargetWindows() && !Op.getValueType().isVector()) return LowerDIV_Windows(Op, DAG, /* Signed */ false); - return LowerUDIV(Op, DAG); + return LowerUDIV(Op, DAG, Subtarget); case ISD::ADDCARRY: case ISD::SUBCARRY: return LowerADDSUBCARRY(Op, DAG); case ISD::SADDO: @@ -8452,6 +9164,15 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::UADDO: case ISD::USUBO: return LowerUnsignedALUO(Op, DAG); + case ISD::SADDSAT: + case ISD::SSUBSAT: + return LowerSADDSUBSAT(Op, DAG, Subtarget); + case ISD::LOAD: + return LowerPredicateLoad(Op, DAG); + case ISD::STORE: + return LowerPredicateStore(Op, DAG); + case ISD::MLOAD: + return LowerMLOAD(Op, DAG); case ISD::ATOMIC_LOAD: case ISD::ATOMIC_STORE: return LowerAtomicLoadStore(Op, DAG); case ISD::FSINCOS: return LowerFSINCOS(Op, DAG); @@ -8530,6 +9251,10 @@ void ARMTargetLowering::ReplaceNodeResults(SDNode *N, Results.push_back(Res.getValue(0)); Results.push_back(Res.getValue(1)); return; + case ISD::SADDSAT: + case ISD::SSUBSAT: + Res = LowerSADDSUBSAT(SDValue(N, 0), DAG, Subtarget); + break; case ISD::READCYCLECOUNTER: ReplaceREADCYCLECOUNTER(N, Results, DAG, Subtarget); return; @@ -8600,19 +9325,19 @@ void ARMTargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI, // orr r5, r5, #1 // add r5, pc // str r5, [$jbuf, #+4] ; &jbuf[1] - unsigned NewVReg1 = MRI->createVirtualRegister(TRC); + Register NewVReg1 = MRI->createVirtualRegister(TRC); BuildMI(*MBB, MI, dl, TII->get(ARM::t2LDRpci), NewVReg1) .addConstantPoolIndex(CPI) .addMemOperand(CPMMO) .add(predOps(ARMCC::AL)); // Set the low bit because of thumb mode. 
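This hunk, and most of the SjLj and struct-byval code below, mechanically replaces unsigned virtual-register numbers with the Register wrapper type; the surrounding BuildMI calls are untouched because the wrapper converts implicitly to and from unsigned. A minimal sketch of such a wrapper follows; it is an illustration of the pattern, not LLVM's actual Register class, and the high-bit virtual-register convention in it is an assumption:

// Minimal sketch of a strongly-typed register id that still converts to and
// from unsigned, which is why swapping "unsigned Reg" for a wrapper needs no
// other source changes. Illustrative only.
#include <cassert>

class RegisterId {
  unsigned Id = 0;
public:
  constexpr RegisterId(unsigned Val = 0) : Id(Val) {} // from unsigned
  constexpr operator unsigned() const { return Id; }  // back to unsigned
  // High-bit-marks-virtual is an assumption for this sketch.
  constexpr bool isVirtual() const { return (Id & 0x80000000u) != 0; }
};

static unsigned takesUnsigned(unsigned R) { return R; }
static RegisterId takesRegister(RegisterId R) { return R; }

int main() {
  RegisterId R(5);
  assert(takesUnsigned(R) == 5);   // implicit RegisterId -> unsigned
  assert(takesRegister(7u) == 7u); // implicit unsigned -> RegisterId
  return 0;
}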
- unsigned NewVReg2 = MRI->createVirtualRegister(TRC); + Register NewVReg2 = MRI->createVirtualRegister(TRC); BuildMI(*MBB, MI, dl, TII->get(ARM::t2ORRri), NewVReg2) .addReg(NewVReg1, RegState::Kill) .addImm(0x01) .add(predOps(ARMCC::AL)) .add(condCodeOp()); - unsigned NewVReg3 = MRI->createVirtualRegister(TRC); + Register NewVReg3 = MRI->createVirtualRegister(TRC); BuildMI(*MBB, MI, dl, TII->get(ARM::tPICADD), NewVReg3) .addReg(NewVReg2, RegState::Kill) .addImm(PCLabelId); @@ -8630,28 +9355,28 @@ void ARMTargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI, // orrs r1, r2 // add r2, $jbuf, #+4 ; &jbuf[1] // str r1, [r2] - unsigned NewVReg1 = MRI->createVirtualRegister(TRC); + Register NewVReg1 = MRI->createVirtualRegister(TRC); BuildMI(*MBB, MI, dl, TII->get(ARM::tLDRpci), NewVReg1) .addConstantPoolIndex(CPI) .addMemOperand(CPMMO) .add(predOps(ARMCC::AL)); - unsigned NewVReg2 = MRI->createVirtualRegister(TRC); + Register NewVReg2 = MRI->createVirtualRegister(TRC); BuildMI(*MBB, MI, dl, TII->get(ARM::tPICADD), NewVReg2) .addReg(NewVReg1, RegState::Kill) .addImm(PCLabelId); // Set the low bit because of thumb mode. - unsigned NewVReg3 = MRI->createVirtualRegister(TRC); + Register NewVReg3 = MRI->createVirtualRegister(TRC); BuildMI(*MBB, MI, dl, TII->get(ARM::tMOVi8), NewVReg3) .addReg(ARM::CPSR, RegState::Define) .addImm(1) .add(predOps(ARMCC::AL)); - unsigned NewVReg4 = MRI->createVirtualRegister(TRC); + Register NewVReg4 = MRI->createVirtualRegister(TRC); BuildMI(*MBB, MI, dl, TII->get(ARM::tORR), NewVReg4) .addReg(ARM::CPSR, RegState::Define) .addReg(NewVReg2, RegState::Kill) .addReg(NewVReg3, RegState::Kill) .add(predOps(ARMCC::AL)); - unsigned NewVReg5 = MRI->createVirtualRegister(TRC); + Register NewVReg5 = MRI->createVirtualRegister(TRC); BuildMI(*MBB, MI, dl, TII->get(ARM::tADDframe), NewVReg5) .addFrameIndex(FI) .addImm(36); // &jbuf[1] :: pc @@ -8666,13 +9391,13 @@ void ARMTargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI, // ldr r1, LCPI1_1 // add r1, pc, r1 // str r1, [$jbuf, #+4] ; &jbuf[1] - unsigned NewVReg1 = MRI->createVirtualRegister(TRC); + Register NewVReg1 = MRI->createVirtualRegister(TRC); BuildMI(*MBB, MI, dl, TII->get(ARM::LDRi12), NewVReg1) .addConstantPoolIndex(CPI) .addImm(0) .addMemOperand(CPMMO) .add(predOps(ARMCC::AL)); - unsigned NewVReg2 = MRI->createVirtualRegister(TRC); + Register NewVReg2 = MRI->createVirtualRegister(TRC); BuildMI(*MBB, MI, dl, TII->get(ARM::PICADD), NewVReg2) .addReg(NewVReg1, RegState::Kill) .addImm(PCLabelId) @@ -8794,7 +9519,7 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI, bool IsPositionIndependent = isPositionIndependent(); unsigned NumLPads = LPadList.size(); if (Subtarget->isThumb2()) { - unsigned NewVReg1 = MRI->createVirtualRegister(TRC); + Register NewVReg1 = MRI->createVirtualRegister(TRC); BuildMI(DispatchBB, dl, TII->get(ARM::t2LDRi12), NewVReg1) .addFrameIndex(FI) .addImm(4) @@ -8807,7 +9532,7 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI, .addImm(LPadList.size()) .add(predOps(ARMCC::AL)); } else { - unsigned VReg1 = MRI->createVirtualRegister(TRC); + Register VReg1 = MRI->createVirtualRegister(TRC); BuildMI(DispatchBB, dl, TII->get(ARM::t2MOVi16), VReg1) .addImm(NumLPads & 0xFFFF) .add(predOps(ARMCC::AL)); @@ -8832,12 +9557,12 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI, .addImm(ARMCC::HI) .addReg(ARM::CPSR); - unsigned NewVReg3 = MRI->createVirtualRegister(TRC); + Register NewVReg3 = MRI->createVirtualRegister(TRC); BuildMI(DispContBB, dl, 
TII->get(ARM::t2LEApcrelJT), NewVReg3) .addJumpTableIndex(MJTI) .add(predOps(ARMCC::AL)); - unsigned NewVReg4 = MRI->createVirtualRegister(TRC); + Register NewVReg4 = MRI->createVirtualRegister(TRC); BuildMI(DispContBB, dl, TII->get(ARM::t2ADDrs), NewVReg4) .addReg(NewVReg3, RegState::Kill) .addReg(NewVReg1) @@ -8850,7 +9575,7 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI, .addReg(NewVReg1) .addJumpTableIndex(MJTI); } else if (Subtarget->isThumb()) { - unsigned NewVReg1 = MRI->createVirtualRegister(TRC); + Register NewVReg1 = MRI->createVirtualRegister(TRC); BuildMI(DispatchBB, dl, TII->get(ARM::tLDRspi), NewVReg1) .addFrameIndex(FI) .addImm(1) @@ -8873,7 +9598,7 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI, Align = MF->getDataLayout().getTypeAllocSize(C->getType()); unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align); - unsigned VReg1 = MRI->createVirtualRegister(TRC); + Register VReg1 = MRI->createVirtualRegister(TRC); BuildMI(DispatchBB, dl, TII->get(ARM::tLDRpci)) .addReg(VReg1, RegState::Define) .addConstantPoolIndex(Idx) @@ -8889,19 +9614,19 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI, .addImm(ARMCC::HI) .addReg(ARM::CPSR); - unsigned NewVReg2 = MRI->createVirtualRegister(TRC); + Register NewVReg2 = MRI->createVirtualRegister(TRC); BuildMI(DispContBB, dl, TII->get(ARM::tLSLri), NewVReg2) .addReg(ARM::CPSR, RegState::Define) .addReg(NewVReg1) .addImm(2) .add(predOps(ARMCC::AL)); - unsigned NewVReg3 = MRI->createVirtualRegister(TRC); + Register NewVReg3 = MRI->createVirtualRegister(TRC); BuildMI(DispContBB, dl, TII->get(ARM::tLEApcrelJT), NewVReg3) .addJumpTableIndex(MJTI) .add(predOps(ARMCC::AL)); - unsigned NewVReg4 = MRI->createVirtualRegister(TRC); + Register NewVReg4 = MRI->createVirtualRegister(TRC); BuildMI(DispContBB, dl, TII->get(ARM::tADDrr), NewVReg4) .addReg(ARM::CPSR, RegState::Define) .addReg(NewVReg2, RegState::Kill) @@ -8911,7 +9636,7 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI, MachineMemOperand *JTMMOLd = MF->getMachineMemOperand( MachinePointerInfo::getJumpTable(*MF), MachineMemOperand::MOLoad, 4, 4); - unsigned NewVReg5 = MRI->createVirtualRegister(TRC); + Register NewVReg5 = MRI->createVirtualRegister(TRC); BuildMI(DispContBB, dl, TII->get(ARM::tLDRi), NewVReg5) .addReg(NewVReg4, RegState::Kill) .addImm(0) @@ -8932,7 +9657,7 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI, .addReg(NewVReg6, RegState::Kill) .addJumpTableIndex(MJTI); } else { - unsigned NewVReg1 = MRI->createVirtualRegister(TRC); + Register NewVReg1 = MRI->createVirtualRegister(TRC); BuildMI(DispatchBB, dl, TII->get(ARM::LDRi12), NewVReg1) .addFrameIndex(FI) .addImm(4) @@ -8945,7 +9670,7 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI, .addImm(NumLPads) .add(predOps(ARMCC::AL)); } else if (Subtarget->hasV6T2Ops() && isUInt<16>(NumLPads)) { - unsigned VReg1 = MRI->createVirtualRegister(TRC); + Register VReg1 = MRI->createVirtualRegister(TRC); BuildMI(DispatchBB, dl, TII->get(ARM::MOVi16), VReg1) .addImm(NumLPads & 0xFFFF) .add(predOps(ARMCC::AL)); @@ -8974,7 +9699,7 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI, Align = MF->getDataLayout().getTypeAllocSize(C->getType()); unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align); - unsigned VReg1 = MRI->createVirtualRegister(TRC); + Register VReg1 = MRI->createVirtualRegister(TRC); BuildMI(DispatchBB, dl, TII->get(ARM::LDRcp)) .addReg(VReg1, RegState::Define) .addConstantPoolIndex(Idx) @@ -8991,20 
+9716,20 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI, .addImm(ARMCC::HI) .addReg(ARM::CPSR); - unsigned NewVReg3 = MRI->createVirtualRegister(TRC); + Register NewVReg3 = MRI->createVirtualRegister(TRC); BuildMI(DispContBB, dl, TII->get(ARM::MOVsi), NewVReg3) .addReg(NewVReg1) .addImm(ARM_AM::getSORegOpc(ARM_AM::lsl, 2)) .add(predOps(ARMCC::AL)) .add(condCodeOp()); - unsigned NewVReg4 = MRI->createVirtualRegister(TRC); + Register NewVReg4 = MRI->createVirtualRegister(TRC); BuildMI(DispContBB, dl, TII->get(ARM::LEApcrelJT), NewVReg4) .addJumpTableIndex(MJTI) .add(predOps(ARMCC::AL)); MachineMemOperand *JTMMOLd = MF->getMachineMemOperand( MachinePointerInfo::getJumpTable(*MF), MachineMemOperand::MOLoad, 4, 4); - unsigned NewVReg5 = MRI->createVirtualRegister(TRC); + Register NewVReg5 = MRI->createVirtualRegister(TRC); BuildMI(DispContBB, dl, TII->get(ARM::LDRrs), NewVReg5) .addReg(NewVReg3, RegState::Kill) .addReg(NewVReg4) @@ -9239,8 +9964,8 @@ ARMTargetLowering::EmitStructByval(MachineInstr &MI, const BasicBlock *LLVM_BB = BB->getBasicBlock(); MachineFunction::iterator It = ++BB->getIterator(); - unsigned dest = MI.getOperand(0).getReg(); - unsigned src = MI.getOperand(1).getReg(); + Register dest = MI.getOperand(0).getReg(); + Register src = MI.getOperand(1).getReg(); unsigned SizeVal = MI.getOperand(2).getImm(); unsigned Align = MI.getOperand(3).getImm(); DebugLoc dl = MI.getDebugLoc(); @@ -9291,9 +10016,9 @@ ARMTargetLowering::EmitStructByval(MachineInstr &MI, unsigned srcIn = src; unsigned destIn = dest; for (unsigned i = 0; i < LoopSize; i+=UnitSize) { - unsigned srcOut = MRI.createVirtualRegister(TRC); - unsigned destOut = MRI.createVirtualRegister(TRC); - unsigned scratch = MRI.createVirtualRegister(IsNeon ? VecTRC : TRC); + Register srcOut = MRI.createVirtualRegister(TRC); + Register destOut = MRI.createVirtualRegister(TRC); + Register scratch = MRI.createVirtualRegister(IsNeon ? VecTRC : TRC); emitPostLd(BB, MI, TII, dl, UnitSize, scratch, srcIn, srcOut, IsThumb1, IsThumb2); emitPostSt(BB, MI, TII, dl, UnitSize, scratch, destIn, destOut, @@ -9306,9 +10031,9 @@ ARMTargetLowering::EmitStructByval(MachineInstr &MI, // [scratch, srcOut] = LDRB_POST(srcIn, 1) // [destOut] = STRB_POST(scratch, destIn, 1) for (unsigned i = 0; i < BytesLeft; i++) { - unsigned srcOut = MRI.createVirtualRegister(TRC); - unsigned destOut = MRI.createVirtualRegister(TRC); - unsigned scratch = MRI.createVirtualRegister(TRC); + Register srcOut = MRI.createVirtualRegister(TRC); + Register destOut = MRI.createVirtualRegister(TRC); + Register scratch = MRI.createVirtualRegister(TRC); emitPostLd(BB, MI, TII, dl, 1, scratch, srcIn, srcOut, IsThumb1, IsThumb2); emitPostSt(BB, MI, TII, dl, 1, scratch, destIn, destOut, @@ -9351,7 +10076,7 @@ ARMTargetLowering::EmitStructByval(MachineInstr &MI, exitMBB->transferSuccessorsAndUpdatePHIs(BB); // Load an immediate to varEnd. 
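EmitStructByval, both in the unrolled form above and in the counted loop built below, always copies in two phases: UnitSize-wide post-incrementing load/store pairs for the bulk, then single-byte copies for whatever is left. A plain C++ sketch of that shape, using memcpy for the chunk step instead of the emitted post-indexed instructions (the names here are illustrative):

// Standalone model of the two-phase byval copy: bulk copy in UnitSize chunks
// (vector or word sized), then a byte tail for what is left over.
#include <cassert>
#include <cstdint>
#include <cstring>
#include <vector>

static void copyByval(uint8_t *Dst, const uint8_t *Src, unsigned SizeVal,
                      unsigned UnitSize) {
  unsigned LoopSize = SizeVal - (SizeVal % UnitSize);
  unsigned BytesLeft = SizeVal - LoopSize;
  // Phase 1: models the emitPostLd/emitPostSt pairs advancing src and dst by
  // UnitSize each iteration.
  for (unsigned I = 0; I < LoopSize; I += UnitSize) {
    std::memcpy(Dst, Src, UnitSize);
    Src += UnitSize;
    Dst += UnitSize;
  }
  // Phase 2: models the LDRB_POST/STRB_POST byte tail.
  for (unsigned I = 0; I < BytesLeft; ++I)
    *Dst++ = *Src++;
}

int main() {
  std::vector<uint8_t> Src(37), Dst(37, 0);
  for (unsigned I = 0; I < Src.size(); ++I)
    Src[I] = uint8_t(I);
  copyByval(Dst.data(), Src.data(), 37, 16); // 16-byte units plus a 5-byte tail
  assert(std::memcmp(Src.data(), Dst.data(), 37) == 0);
  return 0;
}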
- unsigned varEnd = MRI.createVirtualRegister(TRC); + Register varEnd = MRI.createVirtualRegister(TRC); if (Subtarget->useMovt()) { unsigned Vtmp = varEnd; if ((LoopSize & 0xFFFF0000) != 0) @@ -9401,12 +10126,12 @@ ARMTargetLowering::EmitStructByval(MachineInstr &MI, // destPhi = PHI(destLoop, dst) MachineBasicBlock *entryBB = BB; BB = loopMBB; - unsigned varLoop = MRI.createVirtualRegister(TRC); - unsigned varPhi = MRI.createVirtualRegister(TRC); - unsigned srcLoop = MRI.createVirtualRegister(TRC); - unsigned srcPhi = MRI.createVirtualRegister(TRC); - unsigned destLoop = MRI.createVirtualRegister(TRC); - unsigned destPhi = MRI.createVirtualRegister(TRC); + Register varLoop = MRI.createVirtualRegister(TRC); + Register varPhi = MRI.createVirtualRegister(TRC); + Register srcLoop = MRI.createVirtualRegister(TRC); + Register srcPhi = MRI.createVirtualRegister(TRC); + Register destLoop = MRI.createVirtualRegister(TRC); + Register destPhi = MRI.createVirtualRegister(TRC); BuildMI(*BB, BB->begin(), dl, TII->get(ARM::PHI), varPhi) .addReg(varLoop).addMBB(loopMBB) @@ -9420,7 +10145,7 @@ ARMTargetLowering::EmitStructByval(MachineInstr &MI, // [scratch, srcLoop] = LDR_POST(srcPhi, UnitSize) // [destLoop] = STR_POST(scratch, destPhi, UnitSiz) - unsigned scratch = MRI.createVirtualRegister(IsNeon ? VecTRC : TRC); + Register scratch = MRI.createVirtualRegister(IsNeon ? VecTRC : TRC); emitPostLd(BB, BB->end(), TII, dl, UnitSize, scratch, srcPhi, srcLoop, IsThumb1, IsThumb2); emitPostSt(BB, BB->end(), TII, dl, UnitSize, scratch, destPhi, destLoop, @@ -9461,9 +10186,9 @@ ARMTargetLowering::EmitStructByval(MachineInstr &MI, unsigned srcIn = srcLoop; unsigned destIn = destLoop; for (unsigned i = 0; i < BytesLeft; i++) { - unsigned srcOut = MRI.createVirtualRegister(TRC); - unsigned destOut = MRI.createVirtualRegister(TRC); - unsigned scratch = MRI.createVirtualRegister(TRC); + Register srcOut = MRI.createVirtualRegister(TRC); + Register destOut = MRI.createVirtualRegister(TRC); + Register scratch = MRI.createVirtualRegister(TRC); emitPostLd(BB, StartOfExit, TII, dl, 1, scratch, srcIn, srcOut, IsThumb1, IsThumb2); emitPostSt(BB, StartOfExit, TII, dl, 1, scratch, destIn, destOut, @@ -9523,7 +10248,7 @@ ARMTargetLowering::EmitLowered__chkstk(MachineInstr &MI, break; case CodeModel::Large: { MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); - unsigned Reg = MRI.createVirtualRegister(&ARM::rGPRRegClass); + Register Reg = MRI.createVirtualRegister(&ARM::rGPRRegClass); BuildMI(*MBB, MI, DL, TII.get(ARM::t2MOVi32imm), Reg) .addExternalSymbol("__chkstk"); @@ -9771,8 +10496,8 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, // equality. bool RHSisZero = MI.getOpcode() == ARM::BCCZi64; - unsigned LHS1 = MI.getOperand(1).getReg(); - unsigned LHS2 = MI.getOperand(2).getReg(); + Register LHS1 = MI.getOperand(1).getReg(); + Register LHS2 = MI.getOperand(2).getReg(); if (RHSisZero) { BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri)) .addReg(LHS1) @@ -9782,8 +10507,8 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, .addReg(LHS2).addImm(0) .addImm(ARMCC::EQ).addReg(ARM::CPSR); } else { - unsigned RHS1 = MI.getOperand(3).getReg(); - unsigned RHS2 = MI.getOperand(4).getReg(); + Register RHS1 = MI.getOperand(3).getReg(); + Register RHS2 = MI.getOperand(4).getReg(); BuildMI(BB, dl, TII->get(isThumb2 ? 
ARM::t2CMPrr : ARM::CMPrr)) .addReg(LHS1) .addReg(RHS1) @@ -9844,15 +10569,15 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, Fn->insert(BBI, RSBBB); Fn->insert(BBI, SinkBB); - unsigned int ABSSrcReg = MI.getOperand(1).getReg(); - unsigned int ABSDstReg = MI.getOperand(0).getReg(); + Register ABSSrcReg = MI.getOperand(1).getReg(); + Register ABSDstReg = MI.getOperand(0).getReg(); bool ABSSrcKIll = MI.getOperand(1).isKill(); bool isThumb2 = Subtarget->isThumb2(); MachineRegisterInfo &MRI = Fn->getRegInfo(); // In Thumb mode S must not be specified if source register is the SP or // PC and if destination register is the SP, so restrict register class - unsigned NewRsbDstReg = - MRI.createVirtualRegister(isThumb2 ? &ARM::rGPRRegClass : &ARM::GPRRegClass); + Register NewRsbDstReg = MRI.createVirtualRegister( + isThumb2 ? &ARM::rGPRRegClass : &ARM::GPRRegClass); // Transfer the remainder of BB and its successor edges to sinkMBB. SinkBB->splice(SinkBB->begin(), BB, @@ -9931,7 +10656,7 @@ static void attachMEMCPYScratchRegs(const ARMSubtarget *Subtarget, // The MEMCPY both defines and kills the scratch registers. for (unsigned I = 0; I != MI.getOperand(4).getImm(); ++I) { - unsigned TmpReg = MRI.createVirtualRegister(isThumb1 ? &ARM::tGPRRegClass + Register TmpReg = MRI.createVirtualRegister(isThumb1 ? &ARM::tGPRRegClass : &ARM::GPRRegClass); MIB.addReg(TmpReg, RegState::Define|RegState::Dead); } @@ -10369,10 +11094,7 @@ static SDValue findMUL_LOHI(SDValue V) { static SDValue AddCombineTo64BitSMLAL16(SDNode *AddcNode, SDNode *AddeNode, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget) { - if (Subtarget->isThumb()) { - if (!Subtarget->hasDSP()) - return SDValue(); - } else if (!Subtarget->hasV5TEOps()) + if (!Subtarget->hasBaseDSP()) return SDValue(); // SMLALBB, SMLALBT, SMLALTB, SMLALTT multiply two 16-bit values and @@ -11253,7 +11975,7 @@ static SDValue PerformANDCombine(SDNode *N, BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) { if (SplatBitSize <= 64) { EVT VbicVT; - SDValue Val = isNEONModifiedImm((~SplatBits).getZExtValue(), + SDValue Val = isVMOVModifiedImm((~SplatBits).getZExtValue(), SplatUndef.getZExtValue(), SplatBitSize, DAG, dl, VbicVT, VT.is128BitVector(), OtherModImm); @@ -11469,6 +12191,77 @@ static SDValue PerformORCombineToBFI(SDNode *N, return SDValue(); } +static bool isValidMVECond(unsigned CC, bool IsFloat) { + switch (CC) { + case ARMCC::EQ: + case ARMCC::NE: + case ARMCC::LE: + case ARMCC::GT: + case ARMCC::GE: + case ARMCC::LT: + return true; + case ARMCC::HS: + case ARMCC::HI: + return !IsFloat; + default: + return false; + }; +} + +static SDValue PerformORCombine_i1(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, + const ARMSubtarget *Subtarget) { + // Try to invert "or A, B" -> "and ~A, ~B", as the "and" is easier to chain + // together with predicates + EVT VT = N->getValueType(0); + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + + ARMCC::CondCodes CondCode0 = ARMCC::AL; + ARMCC::CondCodes CondCode1 = ARMCC::AL; + if (N0->getOpcode() == ARMISD::VCMP) + CondCode0 = (ARMCC::CondCodes)cast(N0->getOperand(2)) + ->getZExtValue(); + else if (N0->getOpcode() == ARMISD::VCMPZ) + CondCode0 = (ARMCC::CondCodes)cast(N0->getOperand(1)) + ->getZExtValue(); + if (N1->getOpcode() == ARMISD::VCMP) + CondCode1 = (ARMCC::CondCodes)cast(N1->getOperand(2)) + ->getZExtValue(); + else if (N1->getOpcode() == ARMISD::VCMPZ) + CondCode1 = (ARMCC::CondCodes)cast(N1->getOperand(1)) + ->getZExtValue(); + + if 
(CondCode0 == ARMCC::AL || CondCode1 == ARMCC::AL) + return SDValue(); + + unsigned Opposite0 = ARMCC::getOppositeCondition(CondCode0); + unsigned Opposite1 = ARMCC::getOppositeCondition(CondCode1); + + if (!isValidMVECond(Opposite0, + N0->getOperand(0)->getValueType(0).isFloatingPoint()) || + !isValidMVECond(Opposite1, + N1->getOperand(0)->getValueType(0).isFloatingPoint())) + return SDValue(); + + SmallVector Ops0; + Ops0.push_back(N0->getOperand(0)); + if (N0->getOpcode() == ARMISD::VCMP) + Ops0.push_back(N0->getOperand(1)); + Ops0.push_back(DCI.DAG.getConstant(Opposite0, SDLoc(N0), MVT::i32)); + SmallVector Ops1; + Ops1.push_back(N1->getOperand(0)); + if (N1->getOpcode() == ARMISD::VCMP) + Ops1.push_back(N1->getOperand(1)); + Ops1.push_back(DCI.DAG.getConstant(Opposite1, SDLoc(N1), MVT::i32)); + + SDValue NewN0 = DCI.DAG.getNode(N0->getOpcode(), SDLoc(N0), VT, Ops0); + SDValue NewN1 = DCI.DAG.getNode(N1->getOpcode(), SDLoc(N1), VT, Ops1); + SDValue And = DCI.DAG.getNode(ISD::AND, SDLoc(N), VT, NewN0, NewN1); + return DCI.DAG.getNode(ISD::XOR, SDLoc(N), VT, And, + DCI.DAG.getAllOnesConstant(SDLoc(N), VT)); +} + /// PerformORCombine - Target-specific dag combine xforms for ISD::OR static SDValue PerformORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, @@ -11489,7 +12282,7 @@ static SDValue PerformORCombine(SDNode *N, BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) { if (SplatBitSize <= 64) { EVT VorrVT; - SDValue Val = isNEONModifiedImm(SplatBits.getZExtValue(), + SDValue Val = isVMOVModifiedImm(SplatBits.getZExtValue(), SplatUndef.getZExtValue(), SplatBitSize, DAG, dl, VorrVT, VT.is128BitVector(), OtherModImm); @@ -11553,6 +12346,10 @@ static SDValue PerformORCombine(SDNode *N, } } + if (Subtarget->hasMVEIntegerOps() && + (VT == MVT::v4i1 || VT == MVT::v8i1 || VT == MVT::v16i1)) + return PerformORCombine_i1(N, DCI, Subtarget); + // Try to use the ARM/Thumb2 BFI (bitfield insert) instruction when // reasonable. if (N0.getOpcode() == ISD::AND && N0.hasOneUse()) { @@ -11921,6 +12718,24 @@ PerformARMBUILD_VECTORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { return Vec; } +static SDValue +PerformPREDICATE_CASTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { + EVT VT = N->getValueType(0); + SDValue Op = N->getOperand(0); + SDLoc dl(N); + + // PREDICATE_CAST(PREDICATE_CAST(x)) == PREDICATE_CAST(x) + if (Op->getOpcode() == ARMISD::PREDICATE_CAST) { + // If the valuetypes are the same, we can remove the cast entirely. + if (Op->getOperand(0).getValueType() == VT) + return Op->getOperand(0); + return DCI.DAG.getNode(ARMISD::PREDICATE_CAST, dl, + Op->getOperand(0).getValueType(), Op->getOperand(0)); + } + + return SDValue(); +} + /// PerformInsertEltCombine - Target-specific dag combine xforms for /// ISD::INSERT_VECTOR_ELT. static SDValue PerformInsertEltCombine(SDNode *N, @@ -12332,7 +13147,7 @@ static SDValue PerformVDUPLANECombine(SDNode *N, // The canonical VMOV for a zero vector uses a 32-bit element size. unsigned Imm = cast(Op.getOperand(0))->getZExtValue(); unsigned EltBits; - if (ARM_AM::decodeNEONModImm(Imm, EltBits) == 0) + if (ARM_AM::decodeVMOVModImm(Imm, EltBits) == 0) EltSize = 8; EVT VT = N->getValueType(0); if (EltSize > VT.getScalarSizeInBits()) @@ -12382,95 +13197,163 @@ static SDValue PerformLOADCombine(SDNode *N, return SDValue(); } -/// PerformSTORECombine - Target-specific dag combine xforms for -/// ISD::STORE. 
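PerformORCombine_i1 above applies De Morgan's law: an OR of two predicate compares becomes the complement of an AND of the inverted compares, since ANDed predicates are easier to chain in MVE. A standalone check of the identity on plain bitmasks (the VCMP condition inversion is modelled simply by complementing the mask):

// Standalone check of the rewrite or(A, B) == xor(and(~A, ~B), all-ones) used
// by PerformORCombine_i1, with predicates modelled as 16-bit masks.
#include <cassert>
#include <cstdint>

static uint16_t orViaInvertedAnd(uint16_t A, uint16_t B) {
  uint16_t InvA = uint16_t(~A); // stands in for the VCMP with opposite cond
  uint16_t InvB = uint16_t(~B);
  return uint16_t(~(InvA & InvB)); // AND, then XOR with all-ones
}

int main() {
  const uint16_t Tests[][2] = {{0x0000, 0xFFFF}, {0x00FF, 0x0F0F},
                               {0x1234, 0x8421}, {0xAAAA, 0x5555}};
  for (auto &T : Tests)
    assert(orViaInvertedAnd(T[0], T[1]) == uint16_t(T[0] | T[1]));
  return 0;
}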
-static SDValue PerformSTORECombine(SDNode *N, - TargetLowering::DAGCombinerInfo &DCI) { - StoreSDNode *St = cast(N); - if (St->isVolatile()) - return SDValue(); - - // Optimize trunc store (of multiple scalars) to shuffle and store. First, - // pack all of the elements in one place. Next, store to memory in fewer - // chunks. +// Optimize trunc store (of multiple scalars) to shuffle and store. First, +// pack all of the elements in one place. Next, store to memory in fewer +// chunks. +static SDValue PerformTruncatingStoreCombine(StoreSDNode *St, + SelectionDAG &DAG) { SDValue StVal = St->getValue(); EVT VT = StVal.getValueType(); - if (St->isTruncatingStore() && VT.isVector()) { - SelectionDAG &DAG = DCI.DAG; - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - EVT StVT = St->getMemoryVT(); - unsigned NumElems = VT.getVectorNumElements(); - assert(StVT != VT && "Cannot truncate to the same type"); - unsigned FromEltSz = VT.getScalarSizeInBits(); - unsigned ToEltSz = StVT.getScalarSizeInBits(); + if (!St->isTruncatingStore() || !VT.isVector()) + return SDValue(); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + EVT StVT = St->getMemoryVT(); + unsigned NumElems = VT.getVectorNumElements(); + assert(StVT != VT && "Cannot truncate to the same type"); + unsigned FromEltSz = VT.getScalarSizeInBits(); + unsigned ToEltSz = StVT.getScalarSizeInBits(); + + // From, To sizes and ElemCount must be pow of two + if (!isPowerOf2_32(NumElems * FromEltSz * ToEltSz)) + return SDValue(); - // From, To sizes and ElemCount must be pow of two - if (!isPowerOf2_32(NumElems * FromEltSz * ToEltSz)) return SDValue(); + // We are going to use the original vector elt for storing. + // Accumulated smaller vector elements must be a multiple of the store size. + if (0 != (NumElems * FromEltSz) % ToEltSz) + return SDValue(); - // We are going to use the original vector elt for storing. - // Accumulated smaller vector elements must be a multiple of the store size. - if (0 != (NumElems * FromEltSz) % ToEltSz) return SDValue(); + unsigned SizeRatio = FromEltSz / ToEltSz; + assert(SizeRatio * NumElems * ToEltSz == VT.getSizeInBits()); - unsigned SizeRatio = FromEltSz / ToEltSz; - assert(SizeRatio * NumElems * ToEltSz == VT.getSizeInBits()); + // Create a type on which we perform the shuffle. + EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), StVT.getScalarType(), + NumElems * SizeRatio); + assert(WideVecVT.getSizeInBits() == VT.getSizeInBits()); - // Create a type on which we perform the shuffle. - EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), StVT.getScalarType(), - NumElems*SizeRatio); - assert(WideVecVT.getSizeInBits() == VT.getSizeInBits()); + SDLoc DL(St); + SDValue WideVec = DAG.getNode(ISD::BITCAST, DL, WideVecVT, StVal); + SmallVector ShuffleVec(NumElems * SizeRatio, -1); + for (unsigned i = 0; i < NumElems; ++i) + ShuffleVec[i] = DAG.getDataLayout().isBigEndian() ? (i + 1) * SizeRatio - 1 + : i * SizeRatio; - SDLoc DL(St); - SDValue WideVec = DAG.getNode(ISD::BITCAST, DL, WideVecVT, StVal); - SmallVector ShuffleVec(NumElems * SizeRatio, -1); - for (unsigned i = 0; i < NumElems; ++i) - ShuffleVec[i] = DAG.getDataLayout().isBigEndian() - ? (i + 1) * SizeRatio - 1 - : i * SizeRatio; - - // Can't shuffle using an illegal type. - if (!TLI.isTypeLegal(WideVecVT)) return SDValue(); - - SDValue Shuff = DAG.getVectorShuffle(WideVecVT, DL, WideVec, - DAG.getUNDEF(WideVec.getValueType()), - ShuffleVec); - // At this point all of the data is stored at the bottom of the - // register. 
We now need to save it to mem. - - // Find the largest store unit - MVT StoreType = MVT::i8; - for (MVT Tp : MVT::integer_valuetypes()) { - if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToEltSz) - StoreType = Tp; - } - // Didn't find a legal store type. - if (!TLI.isTypeLegal(StoreType)) - return SDValue(); + // Can't shuffle using an illegal type. + if (!TLI.isTypeLegal(WideVecVT)) + return SDValue(); - // Bitcast the original vector into a vector of store-size units - EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(), - StoreType, VT.getSizeInBits()/EVT(StoreType).getSizeInBits()); - assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits()); - SDValue ShuffWide = DAG.getNode(ISD::BITCAST, DL, StoreVecVT, Shuff); - SmallVector Chains; - SDValue Increment = DAG.getConstant(StoreType.getSizeInBits() / 8, DL, - TLI.getPointerTy(DAG.getDataLayout())); - SDValue BasePtr = St->getBasePtr(); + SDValue Shuff = DAG.getVectorShuffle( + WideVecVT, DL, WideVec, DAG.getUNDEF(WideVec.getValueType()), ShuffleVec); + // At this point all of the data is stored at the bottom of the + // register. We now need to save it to mem. - // Perform one or more big stores into memory. - unsigned E = (ToEltSz*NumElems)/StoreType.getSizeInBits(); - for (unsigned I = 0; I < E; I++) { - SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, - StoreType, ShuffWide, - DAG.getIntPtrConstant(I, DL)); - SDValue Ch = DAG.getStore(St->getChain(), DL, SubVec, BasePtr, - St->getPointerInfo(), St->getAlignment(), - St->getMemOperand()->getFlags()); - BasePtr = DAG.getNode(ISD::ADD, DL, BasePtr.getValueType(), BasePtr, - Increment); - Chains.push_back(Ch); - } - return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains); + // Find the largest store unit + MVT StoreType = MVT::i8; + for (MVT Tp : MVT::integer_valuetypes()) { + if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToEltSz) + StoreType = Tp; } + // Didn't find a legal store type. + if (!TLI.isTypeLegal(StoreType)) + return SDValue(); + + // Bitcast the original vector into a vector of store-size units + EVT StoreVecVT = + EVT::getVectorVT(*DAG.getContext(), StoreType, + VT.getSizeInBits() / EVT(StoreType).getSizeInBits()); + assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits()); + SDValue ShuffWide = DAG.getNode(ISD::BITCAST, DL, StoreVecVT, Shuff); + SmallVector Chains; + SDValue Increment = DAG.getConstant(StoreType.getSizeInBits() / 8, DL, + TLI.getPointerTy(DAG.getDataLayout())); + SDValue BasePtr = St->getBasePtr(); + + // Perform one or more big stores into memory. + unsigned E = (ToEltSz * NumElems) / StoreType.getSizeInBits(); + for (unsigned I = 0; I < E; I++) { + SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, StoreType, + ShuffWide, DAG.getIntPtrConstant(I, DL)); + SDValue Ch = + DAG.getStore(St->getChain(), DL, SubVec, BasePtr, St->getPointerInfo(), + St->getAlignment(), St->getMemOperand()->getFlags()); + BasePtr = + DAG.getNode(ISD::ADD, DL, BasePtr.getValueType(), BasePtr, Increment); + Chains.push_back(Ch); + } + return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains); +} + +// Try taking a single vector store from an truncate (which would otherwise turn +// into an expensive buildvector) and splitting it into a series of narrowing +// stores. 
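PerformTruncatingStoreCombine above packs the narrow elements of a truncating vector store to the front of the register with a shuffle whose indices are i*SizeRatio (little-endian) or (i+1)*SizeRatio-1 (big-endian), and then stores that low part in as few wide chunks as possible. A standalone sketch of just the index computation:

// Standalone model of the pack-then-store shuffle: view a vector of wide
// elements as SizeRatio narrow elements each, and gather the narrow element
// holding the truncated value of every wide lane to the front of the vector.
#include <cassert>
#include <vector>

static std::vector<int> packIndices(unsigned NumElems, unsigned SizeRatio,
                                    bool BigEndian) {
  std::vector<int> Idx(NumElems * SizeRatio, -1); // -1 == undef lane
  for (unsigned I = 0; I < NumElems; ++I)
    Idx[I] = BigEndian ? int((I + 1) * SizeRatio - 1) : int(I * SizeRatio);
  return Idx;
}

int main() {
  // v4i32 truncated to v4i16: SizeRatio = 2, so the wide vector is viewed as
  // v8i16 and lanes {0,2,4,6} hold the truncated values in little-endian.
  std::vector<int> LE = packIndices(4, 2, /*BigEndian=*/false);
  assert(LE[0] == 0 && LE[1] == 2 && LE[2] == 4 && LE[3] == 6 && LE[4] == -1);
  std::vector<int> BE = packIndices(4, 2, /*BigEndian=*/true);
  assert(BE[0] == 1 && BE[1] == 3 && BE[2] == 5 && BE[3] == 7);
  return 0;
}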
+static SDValue PerformSplittingToNarrowingStores(StoreSDNode *St, + SelectionDAG &DAG) { + if (!St->isSimple() || St->isTruncatingStore() || !St->isUnindexed()) + return SDValue(); + SDValue Trunc = St->getValue(); + if (Trunc->getOpcode() != ISD::TRUNCATE) + return SDValue(); + EVT FromVT = Trunc->getOperand(0).getValueType(); + EVT ToVT = Trunc.getValueType(); + if (!ToVT.isVector()) + return SDValue(); + assert(FromVT.getVectorNumElements() == ToVT.getVectorNumElements()); + EVT ToEltVT = ToVT.getVectorElementType(); + EVT FromEltVT = FromVT.getVectorElementType(); + + unsigned NumElements = 0; + if (FromEltVT == MVT::i32 && (ToEltVT == MVT::i16 || ToEltVT == MVT::i8)) + NumElements = 4; + if (FromEltVT == MVT::i16 && ToEltVT == MVT::i8) + NumElements = 8; + if (NumElements == 0 || FromVT.getVectorNumElements() == NumElements || + FromVT.getVectorNumElements() % NumElements != 0) + return SDValue(); + + SDLoc DL(St); + // Details about the old store + SDValue Ch = St->getChain(); + SDValue BasePtr = St->getBasePtr(); + unsigned Alignment = St->getOriginalAlignment(); + MachineMemOperand::Flags MMOFlags = St->getMemOperand()->getFlags(); + AAMDNodes AAInfo = St->getAAInfo(); + + EVT NewFromVT = EVT::getVectorVT(*DAG.getContext(), FromEltVT, NumElements); + EVT NewToVT = EVT::getVectorVT(*DAG.getContext(), ToEltVT, NumElements); + + SmallVector Stores; + for (unsigned i = 0; i < FromVT.getVectorNumElements() / NumElements; i++) { + unsigned NewOffset = i * NumElements * ToEltVT.getSizeInBits() / 8; + SDValue NewPtr = DAG.getObjectPtrOffset(DL, BasePtr, NewOffset); + + SDValue Extract = + DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NewFromVT, Trunc.getOperand(0), + DAG.getConstant(i * NumElements, DL, MVT::i32)); + SDValue Store = DAG.getTruncStore( + Ch, DL, Extract, NewPtr, St->getPointerInfo().getWithOffset(NewOffset), + NewToVT, Alignment, MMOFlags, AAInfo); + Stores.push_back(Store); + } + return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores); +} + +/// PerformSTORECombine - Target-specific dag combine xforms for +/// ISD::STORE. +static SDValue PerformSTORECombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, + const ARMSubtarget *Subtarget) { + StoreSDNode *St = cast(N); + if (St->isVolatile()) + return SDValue(); + SDValue StVal = St->getValue(); + EVT VT = StVal.getValueType(); + + if (Subtarget->hasNEON()) + if (SDValue Store = PerformTruncatingStoreCombine(St, DCI.DAG)) + return Store; + + if (Subtarget->hasMVEIntegerOps()) + if (SDValue NewToken = PerformSplittingToNarrowingStores(St, DCI.DAG)) + return NewToken; if (!ISD::isNormalStore(St)) return SDValue(); @@ -12522,7 +13405,7 @@ static SDValue PerformSTORECombine(SDNode *N, } // If this is a legal vector store, try to combine it into a VST1_UPD. - if (ISD::isNormalStore(N) && VT.isVector() && + if (Subtarget->hasNEON() && ISD::isNormalStore(N) && VT.isVector() && DCI.DAG.getTargetLoweringInfo().isTypeLegal(VT)) return CombineBaseUpdate(N, DCI); @@ -12890,6 +13773,71 @@ static SDValue PerformShiftCombine(SDNode *N, return SDValue(); } +// Look for a sign/zero extend of a larger than legal load. This can be split +// into two extending loads, which are simpler to deal with than an arbitrary +// sign extend. 
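PerformSplittingToNarrowingStores above takes a store of a vector truncate that is too wide for a single MVE store (for example v8i32 truncated to v8i16) and emits one narrowing store per 4- or 8-lane piece, advancing the pointer by the narrow size of each piece. A standalone scalar model of the slicing and byte offsets:

// Standalone model: store Trunc(Src), where Src has 32-bit elements and the
// destination holds 16-bit elements, in pieces of NumPerStore lanes. The
// NewOffsetBytes computation mirrors i * NumElements * ToEltBits / 8 above.
#include <cassert>
#include <cstdint>
#include <vector>

static void storeNarrowed(const std::vector<uint32_t> &Src, uint16_t *Dst,
                          unsigned NumPerStore) {
  unsigned FromElts = unsigned(Src.size());
  assert(FromElts % NumPerStore == 0);
  for (unsigned I = 0; I < FromElts / NumPerStore; ++I) {
    unsigned NewOffsetBytes = I * NumPerStore * unsigned(sizeof(uint16_t));
    uint16_t *Piece = reinterpret_cast<uint16_t *>(
        reinterpret_cast<char *>(Dst) + NewOffsetBytes);
    for (unsigned J = 0; J < NumPerStore; ++J)       // one narrowing store
      Piece[J] = uint16_t(Src[I * NumPerStore + J]); // truncate the lane
  }
}

int main() {
  std::vector<uint32_t> Src = {0x11111, 0x22222, 0x33333, 0x44444,
                               0x55555, 0x66666, 0x77777, 0x88888};
  uint16_t Dst[8] = {};
  storeNarrowed(Src, Dst, 4); // two v4i32 -> v4i16 pieces
  assert(Dst[0] == 0x1111 && Dst[4] == 0x5555 && Dst[7] == 0x8888);
  return 0;
}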
+static SDValue PerformSplittingToWideningLoad(SDNode *N, SelectionDAG &DAG) { + SDValue N0 = N->getOperand(0); + if (N0.getOpcode() != ISD::LOAD) + return SDValue(); + LoadSDNode *LD = cast(N0.getNode()); + if (!LD->isSimple() || !N0.hasOneUse() || LD->isIndexed() || + LD->getExtensionType() != ISD::NON_EXTLOAD) + return SDValue(); + EVT FromVT = LD->getValueType(0); + EVT ToVT = N->getValueType(0); + if (!ToVT.isVector()) + return SDValue(); + assert(FromVT.getVectorNumElements() == ToVT.getVectorNumElements()); + EVT ToEltVT = ToVT.getVectorElementType(); + EVT FromEltVT = FromVT.getVectorElementType(); + + unsigned NumElements = 0; + if (ToEltVT == MVT::i32 && (FromEltVT == MVT::i16 || FromEltVT == MVT::i8)) + NumElements = 4; + if (ToEltVT == MVT::i16 && FromEltVT == MVT::i8) + NumElements = 8; + if (NumElements == 0 || + FromVT.getVectorNumElements() == NumElements || + FromVT.getVectorNumElements() % NumElements != 0 || + !isPowerOf2_32(NumElements)) + return SDValue(); + + SDLoc DL(LD); + // Details about the old load + SDValue Ch = LD->getChain(); + SDValue BasePtr = LD->getBasePtr(); + unsigned Alignment = LD->getOriginalAlignment(); + MachineMemOperand::Flags MMOFlags = LD->getMemOperand()->getFlags(); + AAMDNodes AAInfo = LD->getAAInfo(); + + ISD::LoadExtType NewExtType = + N->getOpcode() == ISD::SIGN_EXTEND ? ISD::SEXTLOAD : ISD::ZEXTLOAD; + SDValue Offset = DAG.getUNDEF(BasePtr.getValueType()); + EVT NewFromVT = FromVT.getHalfNumVectorElementsVT(*DAG.getContext()); + EVT NewToVT = ToVT.getHalfNumVectorElementsVT(*DAG.getContext()); + unsigned NewOffset = NewFromVT.getSizeInBits() / 8; + SDValue NewPtr = DAG.getObjectPtrOffset(DL, BasePtr, NewOffset); + + // Split the load in half, each side of which is extended separately. This + // is good enough, as legalisation will take it from there. They are either + // already legal or they will be split further into something that is + // legal. + SDValue NewLoad1 = + DAG.getLoad(ISD::UNINDEXED, NewExtType, NewToVT, DL, Ch, BasePtr, Offset, + LD->getPointerInfo(), NewFromVT, Alignment, MMOFlags, AAInfo); + SDValue NewLoad2 = + DAG.getLoad(ISD::UNINDEXED, NewExtType, NewToVT, DL, Ch, NewPtr, Offset, + LD->getPointerInfo().getWithOffset(NewOffset), NewFromVT, + Alignment, MMOFlags, AAInfo); + + SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, + SDValue(NewLoad1.getNode(), 1), + SDValue(NewLoad2.getNode(), 1)); + DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewChain); + return DAG.getNode(ISD::CONCAT_VECTORS, DL, ToVT, NewLoad1, NewLoad2); +} + /// PerformExtendCombine - Target-specific DAG combining for ISD::SIGN_EXTEND, /// ISD::ZERO_EXTEND, and ISD::ANY_EXTEND. static SDValue PerformExtendCombine(SDNode *N, SelectionDAG &DAG, @@ -12927,6 +13875,10 @@ static SDValue PerformExtendCombine(SDNode *N, SelectionDAG &DAG, } } + if (ST->hasMVEIntegerOps()) + if (SDValue NewLoad = PerformSplittingToWideningLoad(N, DAG)) + return NewLoad; + return SDValue(); } @@ -13028,43 +13980,169 @@ SDValue ARMTargetLowering::PerformCMOVToBFICombine(SDNode *CMOV, SelectionDAG &D return V; } +// Given N, the value controlling the conditional branch, search for the loop +// intrinsic, returning it, along with how the value is used. 
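PerformSplittingToWideningLoad above splits a sign/zero extend of a larger-than-legal load into two half-width extending loads, the second at a byte offset of half the original access, and concatenates the results. A standalone scalar model of that split, with the extending loads reduced to plain widening copies:

// Standalone model: zero-extend a load of N 8-bit elements to 32-bit elements
// by doing it as two half-sized "extending loads", as the combine above does,
// with NewOffset covering the bytes read by the first half.
#include <cassert>
#include <cstdint>
#include <vector>

static std::vector<uint32_t> extLoadSplit(const uint8_t *Ptr, unsigned N) {
  assert(N % 2 == 0);
  unsigned Half = N / 2;
  unsigned NewOffset = Half * unsigned(sizeof(uint8_t));
  std::vector<uint32_t> Out(N);
  for (unsigned I = 0; I < Half; ++I) // first extending load
    Out[I] = Ptr[I];
  for (unsigned I = 0; I < Half; ++I) // second one, at Ptr + NewOffset
    Out[Half + I] = (Ptr + NewOffset)[I];
  return Out;                         // CONCAT_VECTORS of the two halves
}

int main() {
  uint8_t Mem[8] = {1, 2, 3, 4, 5, 6, 7, 250};
  std::vector<uint32_t> V = extLoadSplit(Mem, 8); // models zext v8i8 -> v8i32
  assert(V[0] == 1 && V[3] == 4 && V[4] == 5 && V[7] == 250);
  return 0;
}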
We need to handle +// patterns such as the following: +// (brcond (xor (setcc (loop.decrement), 0, ne), 1), exit) +// (brcond (setcc (loop.decrement), 0, eq), exit) +// (brcond (setcc (loop.decrement), 0, ne), header) +static SDValue SearchLoopIntrinsic(SDValue N, ISD::CondCode &CC, int &Imm, + bool &Negate) { + switch (N->getOpcode()) { + default: + break; + case ISD::XOR: { + if (!isa(N.getOperand(1))) + return SDValue(); + if (!cast(N.getOperand(1))->isOne()) + return SDValue(); + Negate = !Negate; + return SearchLoopIntrinsic(N.getOperand(0), CC, Imm, Negate); + } + case ISD::SETCC: { + auto *Const = dyn_cast(N.getOperand(1)); + if (!Const) + return SDValue(); + if (Const->isNullValue()) + Imm = 0; + else if (Const->isOne()) + Imm = 1; + else + return SDValue(); + CC = cast(N.getOperand(2))->get(); + return SearchLoopIntrinsic(N->getOperand(0), CC, Imm, Negate); + } + case ISD::INTRINSIC_W_CHAIN: { + unsigned IntOp = cast(N.getOperand(1))->getZExtValue(); + if (IntOp != Intrinsic::test_set_loop_iterations && + IntOp != Intrinsic::loop_decrement_reg) + return SDValue(); + return N; + } + } + return SDValue(); +} + static SDValue PerformHWLoopCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *ST) { - // Look for (brcond (xor test.set.loop.iterations, -1) - SDValue CC = N->getOperand(1); - unsigned Opc = CC->getOpcode(); - SDValue Int; - if ((Opc == ISD::XOR || Opc == ISD::SETCC) && - (CC->getOperand(0)->getOpcode() == ISD::INTRINSIC_W_CHAIN)) { + // The hwloop intrinsics that we're interested are used for control-flow, + // either for entering or exiting the loop: + // - test.set.loop.iterations will test whether its operand is zero. If it + // is zero, the proceeding branch should not enter the loop. + // - loop.decrement.reg also tests whether its operand is zero. If it is + // zero, the proceeding branch should not branch back to the beginning of + // the loop. + // So here, we need to check that how the brcond is using the result of each + // of the intrinsics to ensure that we're branching to the right place at the + // right time. 
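SearchLoopIntrinsic above peels XOR-with-1 (recording a negation) and SETCC-against-0/1 (recording the immediate and condition) off the branch condition until it reaches the hardware-loop intrinsic. A standalone sketch of the same recursion over a toy expression type; the node kinds and fields below are invented for the illustration:

// Standalone model of the recursive search: strip XOR(x, 1) and CMP(x, 0/1)
// wrappers off a condition until the "loop intrinsic" node is found, tracking
// how the wrappers use its value.
#include <cassert>

enum class Kind { LoopDec, CmpEq, CmpNe, Xor1 };

struct Node {
  Kind K;
  int Imm = 0;              // for CmpEq/CmpNe: the constant compared against
  const Node *Op = nullptr; // single operand of the wrapper nodes
};

static const Node *searchLoopIntrinsic(const Node *N, bool &Negate, int &Imm,
                                       bool &IsEq) {
  switch (N->K) {
  case Kind::Xor1:
    Negate = !Negate;
    return searchLoopIntrinsic(N->Op, Negate, Imm, IsEq);
  case Kind::CmpEq:
  case Kind::CmpNe:
    if (N->Imm != 0 && N->Imm != 1)
      return nullptr; // only comparisons against 0 or 1 are handled
    Imm = N->Imm;
    IsEq = (N->K == Kind::CmpEq);
    return searchLoopIntrinsic(N->Op, Negate, Imm, IsEq);
  case Kind::LoopDec:
    return N; // found the intrinsic
  }
  return nullptr;
}

int main() {
  // Models (xor (setcc (loop.decrement), 0, ne), 1).
  Node Dec{Kind::LoopDec};
  Node Cmp{Kind::CmpNe, 0, &Dec};
  Node Xor{Kind::Xor1, 0, &Cmp};
  bool Negate = false, IsEq = false;
  int Imm = 1;
  const Node *Found = searchLoopIntrinsic(&Xor, Negate, Imm, IsEq);
  assert(Found == &Dec && Negate && Imm == 0 && !IsEq);
  return 0;
}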
+ + ISD::CondCode CC; + SDValue Cond; + int Imm = 1; + bool Negate = false; + SDValue Chain = N->getOperand(0); + SDValue Dest; - assert((isa(CC->getOperand(1)) && - cast(CC->getOperand(1))->isOne()) && - "Expected to compare against 1"); + if (N->getOpcode() == ISD::BRCOND) { + CC = ISD::SETEQ; + Cond = N->getOperand(1); + Dest = N->getOperand(2); + } else { + assert(N->getOpcode() == ISD::BR_CC && "Expected BRCOND or BR_CC!"); + CC = cast(N->getOperand(1))->get(); + Cond = N->getOperand(2); + Dest = N->getOperand(4); + if (auto *Const = dyn_cast(N->getOperand(3))) { + if (!Const->isOne() && !Const->isNullValue()) + return SDValue(); + Imm = Const->getZExtValue(); + } else + return SDValue(); + } - Int = CC->getOperand(0); - } else if (CC->getOpcode() == ISD::INTRINSIC_W_CHAIN) - Int = CC; - else + SDValue Int = SearchLoopIntrinsic(Cond, CC, Imm, Negate); + if (!Int) return SDValue(); - unsigned IntOp = cast(Int.getOperand(1))->getZExtValue(); - if (IntOp != Intrinsic::test_set_loop_iterations) - return SDValue(); + if (Negate) + CC = ISD::getSetCCInverse(CC, true); + + auto IsTrueIfZero = [](ISD::CondCode CC, int Imm) { + return (CC == ISD::SETEQ && Imm == 0) || + (CC == ISD::SETNE && Imm == 1) || + (CC == ISD::SETLT && Imm == 1) || + (CC == ISD::SETULT && Imm == 1); + }; + + auto IsFalseIfZero = [](ISD::CondCode CC, int Imm) { + return (CC == ISD::SETEQ && Imm == 1) || + (CC == ISD::SETNE && Imm == 0) || + (CC == ISD::SETGT && Imm == 0) || + (CC == ISD::SETUGT && Imm == 0) || + (CC == ISD::SETGE && Imm == 1) || + (CC == ISD::SETUGE && Imm == 1); + }; + + assert((IsTrueIfZero(CC, Imm) || IsFalseIfZero(CC, Imm)) && + "unsupported condition"); SDLoc dl(Int); - SDValue Chain = N->getOperand(0); + SelectionDAG &DAG = DCI.DAG; SDValue Elements = Int.getOperand(2); - SDValue ExitBlock = N->getOperand(2); + unsigned IntOp = cast(Int->getOperand(1))->getZExtValue(); + assert((N->hasOneUse() && N->use_begin()->getOpcode() == ISD::BR) + && "expected single br user"); + SDNode *Br = *N->use_begin(); + SDValue OtherTarget = Br->getOperand(1); + + // Update the unconditional branch to branch to the given Dest. + auto UpdateUncondBr = [](SDNode *Br, SDValue Dest, SelectionDAG &DAG) { + SDValue NewBrOps[] = { Br->getOperand(0), Dest }; + SDValue NewBr = DAG.getNode(ISD::BR, SDLoc(Br), MVT::Other, NewBrOps); + DAG.ReplaceAllUsesOfValueWith(SDValue(Br, 0), NewBr); + }; - // TODO: Once we start supporting tail predication, we can add another - // operand to WLS for the number of elements processed in a vector loop. + if (IntOp == Intrinsic::test_set_loop_iterations) { + SDValue Res; + // We expect this 'instruction' to branch when the counter is zero. + if (IsTrueIfZero(CC, Imm)) { + SDValue Ops[] = { Chain, Elements, Dest }; + Res = DAG.getNode(ARMISD::WLS, dl, MVT::Other, Ops); + } else { + // The logic is the reverse of what we need for WLS, so find the other + // basic block target: the target of the proceeding br. 
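The IsTrueIfZero and IsFalseIfZero lambdas above classify the (condition, immediate) pairs: the first set takes the branch when the counter is zero (so WLS should target the exit), the second branches only while the counter is non-zero (the LE back-edge). A standalone restatement of both tables, checked against a direct evaluation of the comparison with the counter at zero:

// Standalone restatement of the IsTrueIfZero / IsFalseIfZero tables, checked
// against a direct evaluation of "counter <cc> imm" with the counter at zero.
#include <cassert>
#include <cstdint>

enum CC { EQ, NE, LT, ULT, GT, UGT, GE, UGE };

static bool evalAtZero(CC C, int Imm) {
  uint32_t Cnt = 0, U = uint32_t(Imm);
  switch (C) {
  case EQ:  return 0 == Imm;
  case NE:  return 0 != Imm;
  case LT:  return 0 < Imm;
  case ULT: return Cnt < U;
  case GT:  return 0 > Imm;
  case UGT: return Cnt > U;
  case GE:  return 0 >= Imm;
  case UGE: return Cnt >= U;
  }
  return false;
}

int main() {
  // Pairs that branch when the counter is zero (WLS skips the loop).
  const struct { CC C; int Imm; } TrueIfZero[] = {
      {EQ, 0}, {NE, 1}, {LT, 1}, {ULT, 1}};
  // Pairs that branch while the counter is non-zero (LE takes the back-edge).
  const struct { CC C; int Imm; } FalseIfZero[] = {
      {EQ, 1}, {NE, 0}, {GT, 0}, {UGT, 0}, {GE, 1}, {UGE, 1}};
  for (auto &P : TrueIfZero)
    assert(evalAtZero(P.C, P.Imm));
  for (auto &P : FalseIfZero)
    assert(!evalAtZero(P.C, P.Imm));
  return 0;
}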
+ UpdateUncondBr(Br, Dest, DAG); - SDValue Ops[] = { Chain, Elements, ExitBlock }; - SDValue Res = DCI.DAG.getNode(ARMISD::WLS, dl, MVT::Other, Ops); - DCI.DAG.ReplaceAllUsesOfValueWith(Int.getValue(1), Int.getOperand(0)); - return Res; + SDValue Ops[] = { Chain, Elements, OtherTarget }; + Res = DAG.getNode(ARMISD::WLS, dl, MVT::Other, Ops); + } + DAG.ReplaceAllUsesOfValueWith(Int.getValue(1), Int.getOperand(0)); + return Res; + } else { + SDValue Size = DAG.getTargetConstant( + cast(Int.getOperand(3))->getZExtValue(), dl, MVT::i32); + SDValue Args[] = { Int.getOperand(0), Elements, Size, }; + SDValue LoopDec = DAG.getNode(ARMISD::LOOP_DEC, dl, + DAG.getVTList(MVT::i32, MVT::Other), Args); + DAG.ReplaceAllUsesWith(Int.getNode(), LoopDec.getNode()); + + // We expect this instruction to branch when the count is not zero. + SDValue Target = IsFalseIfZero(CC, Imm) ? Dest : OtherTarget; + + // Update the unconditional branch to target the loop preheader if we've + // found the condition has been reversed. + if (Target == OtherTarget) + UpdateUncondBr(Br, Dest, DAG); + + Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, + SDValue(LoopDec.getNode(), 1), Chain); + + SDValue EndArgs[] = { Chain, SDValue(LoopDec.getNode(), 0), Target }; + return DAG.getNode(ARMISD::LE, dl, MVT::Other, EndArgs); + } + return SDValue(); } /// PerformBRCONDCombine - Target-specific DAG combining for ARMISD::BRCOND. @@ -13298,14 +14376,15 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N, case ISD::OR: return PerformORCombine(N, DCI, Subtarget); case ISD::XOR: return PerformXORCombine(N, DCI, Subtarget); case ISD::AND: return PerformANDCombine(N, DCI, Subtarget); - case ISD::BRCOND: return PerformHWLoopCombine(N, DCI, Subtarget); + case ISD::BRCOND: + case ISD::BR_CC: return PerformHWLoopCombine(N, DCI, Subtarget); case ARMISD::ADDC: case ARMISD::SUBC: return PerformAddcSubcCombine(N, DCI, Subtarget); case ARMISD::SUBE: return PerformAddeSubeCombine(N, DCI, Subtarget); case ARMISD::BFI: return PerformBFICombine(N, DCI); case ARMISD::VMOVRRD: return PerformVMOVRRDCombine(N, DCI, Subtarget); case ARMISD::VMOVDRR: return PerformVMOVDRRCombine(N, DCI.DAG); - case ISD::STORE: return PerformSTORECombine(N, DCI); + case ISD::STORE: return PerformSTORECombine(N, DCI, Subtarget); case ISD::BUILD_VECTOR: return PerformBUILD_VECTORCombine(N, DCI, Subtarget); case ISD::INSERT_VECTOR_ELT: return PerformInsertEltCombine(N, DCI); case ISD::VECTOR_SHUFFLE: return PerformVECTOR_SHUFFLECombine(N, DCI.DAG); @@ -13334,6 +14413,8 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N, return PerformVLDCombine(N, DCI); case ARMISD::BUILD_VECTOR: return PerformARMBUILD_VECTORCombine(N, DCI); + case ARMISD::PREDICATE_CAST: + return PerformPREDICATE_CASTCombine(N, DCI); case ARMISD::SMULWB: { unsigned BitWidth = N->getValueType(0).getSizeInBits(); APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 16); @@ -13348,7 +14429,9 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N, return SDValue(); break; } - case ARMISD::SMLALBB: { + case ARMISD::SMLALBB: + case ARMISD::QADD16b: + case ARMISD::QSUB16b: { unsigned BitWidth = N->getValueType(0).getSizeInBits(); APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 16); if ((SimplifyDemandedBits(N->getOperand(0), DemandedMask, DCI)) || @@ -13384,6 +14467,15 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N, return SDValue(); break; } + case ARMISD::QADD8b: + case ARMISD::QSUB8b: { + unsigned BitWidth = N->getValueType(0).getSizeInBits(); + APInt DemandedMask = 
APInt::getLowBitsSet(BitWidth, 8); + if ((SimplifyDemandedBits(N->getOperand(0), DemandedMask, DCI)) || + (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI))) + return SDValue(); + break; + } case ISD::INTRINSIC_VOID: case ISD::INTRINSIC_W_CHAIN: switch (cast(N->getOperand(1))->getZExtValue()) { @@ -13457,47 +14549,38 @@ bool ARMTargetLowering::allowsMisalignedMemoryAccesses(EVT VT, unsigned, if (!Subtarget->hasMVEIntegerOps()) return false; - if (Ty != MVT::v16i8 && Ty != MVT::v8i16 && Ty != MVT::v8f16 && - Ty != MVT::v4i32 && Ty != MVT::v4f32 && Ty != MVT::v2i64 && - Ty != MVT::v2f64 && - // These are for truncated stores - Ty != MVT::v4i8 && Ty != MVT::v8i8 && Ty != MVT::v4i16) - return false; - if (Subtarget->isLittle()) { - // In little-endian MVE, the store instructions VSTRB.U8, - // VSTRH.U16 and VSTRW.U32 all store the vector register in - // exactly the same format, and differ only in the range of - // their immediate offset field and the required alignment. - // - // In particular, VSTRB.U8 can store a vector at byte alignment. - // So at this stage we can simply say that loads/stores of all - // 128-bit wide vector types are permitted at any alignment, - // because we know at least _one_ instruction can manage that. - // - // Later on we might find that some of those loads are better - // generated as VLDRW.U32 if alignment permits, to take - // advantage of the larger immediate range. But for the moment, - // all that matters is that if we don't lower the load then - // _some_ instruction can handle it. + // These are for predicates + if ((Ty == MVT::v16i1 || Ty == MVT::v8i1 || Ty == MVT::v4i1)) { + if (Fast) + *Fast = true; + return true; + } + + // These are for truncated stores/narrowing loads. They are fine so long as + // the alignment is at least the size of the item being loaded + if ((Ty == MVT::v4i8 || Ty == MVT::v8i8 || Ty == MVT::v4i16) && + Alignment >= VT.getScalarSizeInBits() / 8) { + if (Fast) + *Fast = true; + return true; + } + + // In little-endian MVE, the store instructions VSTRB.U8, VSTRH.U16 and + // VSTRW.U32 all store the vector register in exactly the same format, and + // differ only in the range of their immediate offset field and the required + // alignment. So there is always a store that can be used, regardless of + // actual type. + // + // For big endian, that is not the case. But can still emit a (VSTRB.U8; + // VREV64.8) pair and get the same effect. This will likely be better than + // aligning the vector through the stack. + if (Ty == MVT::v16i8 || Ty == MVT::v8i16 || Ty == MVT::v8f16 || + Ty == MVT::v4i32 || Ty == MVT::v4f32 || Ty == MVT::v2i64 || + Ty == MVT::v2f64) { if (Fast) *Fast = true; return true; - } else { - // In big-endian MVE, those instructions aren't so similar - // after all, because they reorder the bytes of the vector - // differently. So this time we can only store a particular - // kind of vector if its alignment is at least the element - // type. And we can't store vectors of i64 or f64 at all - // without having to do some postprocessing, because there's - // no VSTRD.U64. - if (Ty == MVT::v16i8 || - ((Ty == MVT::v8i16 || Ty == MVT::v8f16) && Alignment >= 2) || - ((Ty == MVT::v4i32 || Ty == MVT::v4f32) && Alignment >= 4)) { - if (Fast) - *Fast = true; - return true; - } } return false; @@ -13617,22 +14700,60 @@ static bool areExtractExts(Value *Ext1, Value *Ext2) { /// sext/zext can be folded into vsubl. 
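For MVE, allowsMisalignedMemoryAccesses above now reduces to three cases: predicate types are always fine, the truncating/narrowing types (v4i8, v8i8, v4i16) need alignment of at least the element size, and the full 128-bit vector types are accepted at any alignment (directly in little-endian, via a VSTRB/VREV pair in big-endian). A standalone restatement of that table, with vector types reduced to an element-count/element-size pair for the illustration:

// Standalone restatement of the MVE misaligned-access rules, using a small
// descriptor instead of MVT. Types are lumped by lane count and bit width.
#include <cassert>

struct VecTy {
  unsigned NumElts;
  unsigned EltBits; // element size in bits; 1 marks a predicate type
  bool operator==(const VecTy &O) const {
    return NumElts == O.NumElts && EltBits == O.EltBits;
  }
};

static bool allowsMisalignedMVEAccess(VecTy Ty, unsigned AlignBytes) {
  // Predicates (v16i1, v8i1, v4i1) go through the 16-bit VPR: always fine.
  if (Ty.EltBits == 1)
    return true;
  // Narrowing/truncating types need alignment >= element size.
  const VecTy Narrow[] = {{4, 8}, {8, 8}, {4, 16}};
  for (const VecTy &N : Narrow)
    if (Ty == N)
      return AlignBytes >= Ty.EltBits / 8;
  // Full 128-bit vectors: some VSTRB/VSTRH/VSTRW form always exists.
  return Ty.NumElts * Ty.EltBits == 128;
}

int main() {
  assert(allowsMisalignedMVEAccess({8, 1}, 1));   // v8i1 predicate
  assert(!allowsMisalignedMVEAccess({4, 16}, 1)); // v4i16 needs align >= 2
  assert(allowsMisalignedMVEAccess({4, 16}, 2));
  assert(allowsMisalignedMVEAccess({16, 8}, 1));  // v16i8 at byte alignment
  assert(allowsMisalignedMVEAccess({4, 32}, 1));  // v4i32, any alignment
  assert(!allowsMisalignedMVEAccess({2, 32}, 8)); // 64-bit vectors not handled
  return 0;
}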
bool ARMTargetLowering::shouldSinkOperands(Instruction *I, SmallVectorImpl &Ops) const { - if (!Subtarget->hasNEON() || !I->getType()->isVectorTy()) + if (!I->getType()->isVectorTy()) return false; - switch (I->getOpcode()) { - case Instruction::Sub: - case Instruction::Add: { - if (!areExtractExts(I->getOperand(0), I->getOperand(1))) + if (Subtarget->hasNEON()) { + switch (I->getOpcode()) { + case Instruction::Sub: + case Instruction::Add: { + if (!areExtractExts(I->getOperand(0), I->getOperand(1))) + return false; + Ops.push_back(&I->getOperandUse(0)); + Ops.push_back(&I->getOperandUse(1)); + return true; + } + default: return false; - Ops.push_back(&I->getOperandUse(0)); - Ops.push_back(&I->getOperandUse(1)); - return true; + } } - default: + + if (!Subtarget->hasMVEIntegerOps()) + return false; + + auto IsSinker = [](Instruction *I, int Operand) { + switch (I->getOpcode()) { + case Instruction::Add: + case Instruction::Mul: + return true; + case Instruction::Sub: + return Operand == 1; + default: + return false; + } + }; + + int Op = 0; + if (!isa(I->getOperand(Op))) + Op = 1; + if (!IsSinker(I, Op)) + return false; + if (!match(I->getOperand(Op), + m_ShuffleVector(m_InsertElement(m_Undef(), m_Value(), m_ZeroInt()), + m_Undef(), m_Zero()))) { return false; } - return false; + Instruction *Shuffle = cast(I->getOperand(Op)); + // All uses of the shuffle should be sunk to avoid duplicating it across gpr + // and vector registers + for (Use &U : Shuffle->uses()) { + Instruction *Insn = cast(U.getUser()); + if (!IsSinker(Insn, U.getOperandNo())) + return false; + } + Ops.push_back(&Shuffle->getOperandUse(0)); + Ops.push_back(&I->getOperandUse(Op)); + return true; } bool ARMTargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const { @@ -13641,6 +14762,11 @@ bool ARMTargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const { if (!isTypeLegal(VT)) return false; + if (auto *Ld = dyn_cast(ExtVal.getOperand(0))) { + if (Ld->isExpandingLoad()) + return false; + } + // Don't create a loadext if we can fold the extension into a wide/long // instruction. // If there's more than one user instruction, the loadext is desirable no @@ -14028,6 +15154,52 @@ static bool getT2IndexedAddressParts(SDNode *Ptr, EVT VT, return false; } +static bool getMVEIndexedAddressParts(SDNode *Ptr, EVT VT, unsigned Align, + bool isSEXTLoad, bool isLE, SDValue &Base, + SDValue &Offset, bool &isInc, + SelectionDAG &DAG) { + if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB) + return false; + if (!isa(Ptr->getOperand(1))) + return false; + + ConstantSDNode *RHS = cast(Ptr->getOperand(1)); + int RHSC = (int)RHS->getZExtValue(); + + auto IsInRange = [&](int RHSC, int Limit, int Scale) { + if (RHSC < 0 && RHSC > -Limit * Scale && RHSC % Scale == 0) { + assert(Ptr->getOpcode() == ISD::ADD); + isInc = false; + Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0)); + return true; + } else if (RHSC > 0 && RHSC < Limit * Scale && RHSC % Scale == 0) { + isInc = Ptr->getOpcode() == ISD::ADD; + Offset = DAG.getConstant(RHSC, SDLoc(Ptr), RHS->getValueType(0)); + return true; + } + return false; + }; + + // Try to find a matching instruction based on s/zext, Alignment, Offset and + // (in BE) type. 
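getMVEIndexedAddressParts accepts a pre/post-increment offset only when it is a non-zero multiple of the access scale and its magnitude stays below 0x80 times that scale, i.e. it fits the signed 7-bit scaled immediate of the MVE load/store forms. A standalone version of that IsInRange test:

// Standalone model of the IsInRange check used for MVE pre/post-indexed
// addressing: the offset must be a non-zero multiple of the access scale and
// fit the signed 7-bit scaled immediate (|offset| < 0x80 * scale).
#include <cassert>

static bool isLegalMVEIndexedOffset(int Offset, int Scale) {
  const int Limit = 0x80;
  if (Offset % Scale != 0)
    return false;
  if (Offset < 0)
    return Offset > -Limit * Scale;
  if (Offset > 0)
    return Offset < Limit * Scale;
  return false; // a zero offset is not an increment at all
}

int main() {
  assert(isLegalMVEIndexedOffset(4, 4));    // e.g. a word increment of 4
  assert(isLegalMVEIndexedOffset(-508, 4)); // largest negative word offset
  assert(!isLegalMVEIndexedOffset(512, 4)); // 0x80 * 4 is out of range
  assert(!isLegalMVEIndexedOffset(2, 4));   // not a multiple of the scale
  assert(isLegalMVEIndexedOffset(127, 1));  // byte-scaled forms
  assert(!isLegalMVEIndexedOffset(128, 1));
  return 0;
}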
+ Base = Ptr->getOperand(0); + if (VT == MVT::v4i16) { + if (Align >= 2 && IsInRange(RHSC, 0x80, 2)) + return true; + } else if (VT == MVT::v4i8 || VT == MVT::v8i8) { + if (IsInRange(RHSC, 0x80, 1)) + return true; + } else if (Align >= 4 && (isLE || VT == MVT::v4i32 || VT == MVT::v4f32) && + IsInRange(RHSC, 0x80, 4)) + return true; + else if (Align >= 2 && (isLE || VT == MVT::v8i16 || VT == MVT::v8f16) && + IsInRange(RHSC, 0x80, 2)) + return true; + else if ((isLE || VT == MVT::v16i8) && IsInRange(RHSC, 0x80, 1)) + return true; + return false; +} + /// getPreIndexedAddressParts - returns true by value, base pointer and /// offset pointer and addressing mode by reference if the node's address /// can be legally represented as pre-indexed load / store address. @@ -14041,25 +15213,35 @@ ARMTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base, EVT VT; SDValue Ptr; + unsigned Align; bool isSEXTLoad = false; if (LoadSDNode *LD = dyn_cast(N)) { Ptr = LD->getBasePtr(); - VT = LD->getMemoryVT(); + VT = LD->getMemoryVT(); + Align = LD->getAlignment(); isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD; } else if (StoreSDNode *ST = dyn_cast(N)) { Ptr = ST->getBasePtr(); - VT = ST->getMemoryVT(); + VT = ST->getMemoryVT(); + Align = ST->getAlignment(); } else return false; bool isInc; bool isLegal = false; - if (Subtarget->isThumb2()) - isLegal = getT2IndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base, - Offset, isInc, DAG); - else - isLegal = getARMIndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base, - Offset, isInc, DAG); + if (VT.isVector()) + isLegal = Subtarget->hasMVEIntegerOps() && + getMVEIndexedAddressParts(Ptr.getNode(), VT, Align, isSEXTLoad, + Subtarget->isLittle(), Base, Offset, + isInc, DAG); + else { + if (Subtarget->isThumb2()) + isLegal = getT2IndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base, + Offset, isInc, DAG); + else + isLegal = getARMIndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base, + Offset, isInc, DAG); + } if (!isLegal) return false; @@ -14077,15 +15259,18 @@ bool ARMTargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op, SelectionDAG &DAG) const { EVT VT; SDValue Ptr; + unsigned Align; bool isSEXTLoad = false, isNonExt; if (LoadSDNode *LD = dyn_cast(N)) { - VT = LD->getMemoryVT(); + VT = LD->getMemoryVT(); Ptr = LD->getBasePtr(); + Align = LD->getAlignment(); isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD; isNonExt = LD->getExtensionType() == ISD::NON_EXTLOAD; } else if (StoreSDNode *ST = dyn_cast(N)) { - VT = ST->getMemoryVT(); + VT = ST->getMemoryVT(); Ptr = ST->getBasePtr(); + Align = ST->getAlignment(); isNonExt = !ST->isTruncatingStore(); } else return false; @@ -14108,12 +15293,19 @@ bool ARMTargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op, bool isInc; bool isLegal = false; - if (Subtarget->isThumb2()) - isLegal = getT2IndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset, - isInc, DAG); - else - isLegal = getARMIndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset, + if (VT.isVector()) + isLegal = Subtarget->hasMVEIntegerOps() && + getMVEIndexedAddressParts(Op, VT, Align, isSEXTLoad, + Subtarget->isLittle(), Base, Offset, isInc, DAG); + else { + if (Subtarget->isThumb2()) + isLegal = getT2IndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset, + isInc, DAG); + else + isLegal = getARMIndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset, + isInc, DAG); + } if (!isLegal) return false; @@ -14369,7 +15561,8 @@ const char *ARMTargetLowering::LowerXConstraint(EVT ConstraintVT) const { /// constraint it is for 
 ARMTargetLowering::ConstraintType
 ARMTargetLowering::getConstraintType(StringRef Constraint) const {
-  if (Constraint.size() == 1) {
+  unsigned S = Constraint.size();
+  if (S == 1) {
     switch (Constraint[0]) {
     default: break;
     case 'l': return C_RegisterClass;
@@ -14377,12 +15570,12 @@ ARMTargetLowering::getConstraintType(StringRef Constraint) const {
     case 'h': return C_RegisterClass;
     case 'x': return C_RegisterClass;
     case 't': return C_RegisterClass;
-    case 'j': return C_Other; // Constant for movw.
-    // An address with a single base register. Due to the way we
-    // currently handle addresses it is the same as an 'r' memory constraint.
+    case 'j': return C_Immediate; // Constant for movw.
+      // An address with a single base register. Due to the way we
+      // currently handle addresses it is the same as an 'r' memory constraint.
     case 'Q': return C_Memory;
     }
-  } else if (Constraint.size() == 2) {
+  } else if (S == 2) {
     switch (Constraint[0]) {
     default: break;
     case 'T': return C_RegisterClass;
@@ -14535,7 +15728,7 @@ void ARMTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
       case 'j':
         // Constant suitable for movw, must be between 0 and
        // 65535.
-        if (Subtarget->hasV6T2Ops())
+        if (Subtarget->hasV6T2Ops() || (Subtarget->hasV8MBaselineOps()))
          if (CVal >= 0 && CVal <= 65535)
            break;
        return;
@@ -14643,7 +15836,7 @@ void ARMTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
        return;
 
      case 'N':
-        if (Subtarget->isThumb()) {  // FIXME thumb2
+        if (Subtarget->isThumb1Only()) {
          // This must be a constant between 0 and 31, for shift amounts.
          if (CVal >= 0 && CVal <= 31)
            break;
@@ -14651,7 +15844,7 @@ void ARMTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
        return;
 
      case 'O':
-        if (Subtarget->isThumb()) {  // FIXME thumb2
+        if (Subtarget->isThumb1Only()) {
          // This must be a multiple of 4 between -508 and 508, for
          // ADD/SUB sp = sp + immediate.
          if ((CVal >= -508 && CVal <= 508) && ((CVal & 3) == 0))
@@ -14874,6 +16067,7 @@ SDValue ARMTargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
   // without FP16. So we must do a function call.
   SDLoc Loc(Op);
   RTLIB::Libcall LC;
+  MakeLibCallOptions CallOptions;
   if (SrcSz == 16) {
     // Instruction from 16 -> 32
     if (Subtarget->hasFP16())
@@ -14884,7 +16078,7 @@ SDValue ARMTargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
       assert(LC != RTLIB::UNKNOWN_LIBCALL &&
              "Unexpected type for custom-lowering FP_EXTEND");
       SrcVal =
-        makeLibCall(DAG, LC, MVT::f32, SrcVal, /*isSigned*/ false, Loc).first;
+        makeLibCall(DAG, LC, MVT::f32, SrcVal, CallOptions, Loc).first;
     }
   }
 
@@ -14897,7 +16091,7 @@ SDValue ARMTargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
   LC = RTLIB::getFPEXT(MVT::f32, MVT::f64);
   assert(LC != RTLIB::UNKNOWN_LIBCALL &&
          "Unexpected type for custom-lowering FP_EXTEND");
-  return makeLibCall(DAG, LC, MVT::f64, SrcVal, /*isSigned*/ false, Loc).first;
+  return makeLibCall(DAG, LC, MVT::f64, SrcVal, CallOptions, Loc).first;
 }
 
 SDValue ARMTargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
@@ -14923,7 +16117,8 @@ SDValue ARMTargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
   RTLIB::Libcall LC = RTLIB::getFPROUND(SrcVT, DstVT);
   assert(LC != RTLIB::UNKNOWN_LIBCALL &&
          "Unexpected type for custom-lowering FP_ROUND");
-  return makeLibCall(DAG, LC, DstVT, SrcVal, /*isSigned*/ false, Loc).first;
+  MakeLibCallOptions CallOptions;
+  return makeLibCall(DAG, LC, DstVT, SrcVal, CallOptions, Loc).first;
 }
 
 void ARMTargetLowering::lowerABS(SDNode *N, SmallVectorImpl<SDValue> &Results,
@@ -15015,7 +16210,7 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
     Info.ptrVal = I.getArgOperand(0);
     Info.offset = 0;
     Value *AlignArg = I.getArgOperand(I.getNumArgOperands() - 1);
-    Info.align = cast<ConstantInt>(AlignArg)->getZExtValue();
+    Info.align = MaybeAlign(cast<ConstantInt>(AlignArg)->getZExtValue());
     // volatile loads with NEON intrinsics not supported
     Info.flags = MachineMemOperand::MOLoad;
     return true;
@@ -15030,7 +16225,7 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
     Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
     Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1);
     Info.offset = 0;
-    Info.align = 0;
+    Info.align.reset();
     // volatile loads with NEON intrinsics not supported
     Info.flags = MachineMemOperand::MOLoad;
     return true;
@@ -15056,7 +16251,7 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
     Info.ptrVal = I.getArgOperand(0);
     Info.offset = 0;
     Value *AlignArg = I.getArgOperand(I.getNumArgOperands() - 1);
-    Info.align = cast<ConstantInt>(AlignArg)->getZExtValue();
+    Info.align = MaybeAlign(cast<ConstantInt>(AlignArg)->getZExtValue());
     // volatile stores with NEON intrinsics not supported
     Info.flags = MachineMemOperand::MOStore;
     return true;
@@ -15077,7 +16272,7 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
     Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
     Info.ptrVal = I.getArgOperand(0);
     Info.offset = 0;
-    Info.align = 0;
+    Info.align.reset();
     // volatile stores with NEON intrinsics not supported
     Info.flags = MachineMemOperand::MOStore;
     return true;
@@ -15090,7 +16285,7 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
     Info.memVT = MVT::getVT(PtrTy->getElementType());
     Info.ptrVal = I.getArgOperand(0);
     Info.offset = 0;
-    Info.align = DL.getABITypeAlignment(PtrTy->getElementType());
+    Info.align = MaybeAlign(DL.getABITypeAlignment(PtrTy->getElementType()));
     Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
     return true;
   }
@@ -15102,7 +16297,7 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
     Info.memVT = MVT::getVT(PtrTy->getElementType());
     Info.ptrVal = I.getArgOperand(1);
     Info.offset = 0;
-    Info.align = DL.getABITypeAlignment(PtrTy->getElementType());
+    Info.align = MaybeAlign(DL.getABITypeAlignment(PtrTy->getElementType()));
     Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
     return true;
   }
@@ -15112,7 +16307,7 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
     Info.memVT = MVT::i64;
     Info.ptrVal = I.getArgOperand(2);
     Info.offset = 0;
-    Info.align = 8;
+    Info.align = Align(8);
     Info.flags =
         MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
     return true;
@@ -15122,7 +16317,7 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
     Info.memVT = MVT::i64;
     Info.ptrVal = I.getArgOperand(0);
     Info.offset = 0;
-    Info.align = 8;
+    Info.align = Align(8);
     Info.flags =
         MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
     return true;
@@ -15473,6 +16668,12 @@ bool ARMTargetLowering::isLegalInterleavedAccessType(
   return VecSize == 64 || VecSize % 128 == 0;
 }
 
+unsigned ARMTargetLowering::getMaxSupportedInterleaveFactor() const {
+  if (Subtarget->hasNEON())
+    return 4;
+  return TargetLoweringBase::getMaxSupportedInterleaveFactor();
+}
+
 /// Lower an interleaved load into a vldN intrinsic.
 ///
 /// E.g. Lower an interleaved load (Factor = 2):
@@ -15792,15 +16993,15 @@ static bool isHomogeneousAggregate(Type *Ty, HABaseType &Base,
 }
 
 /// Return the correct alignment for the current calling convention.
-unsigned
-ARMTargetLowering::getABIAlignmentForCallingConv(Type *ArgTy,
-                                                 DataLayout DL) const {
+Align ARMTargetLowering::getABIAlignmentForCallingConv(Type *ArgTy,
+                                                       DataLayout DL) const {
+  const Align ABITypeAlign(DL.getABITypeAlignment(ArgTy));
   if (!ArgTy->isVectorTy())
-    return DL.getABITypeAlignment(ArgTy);
+    return ABITypeAlign;
 
   // Avoid over-aligning vector parameters. It would require realigning the
   // stack and waste space for no real benefit.
-  return std::min(DL.getABITypeAlignment(ArgTy), DL.getStackAlignment());
+  return std::min(ABITypeAlign, DL.getStackAlignment());
 }
 
 /// Return true if a type is an AAPCS-VFP homogeneous aggregate or one of
@@ -15861,7 +17062,7 @@ void ARMTargetLowering::insertCopiesSplitCSR(
     else
       llvm_unreachable("Unexpected register class in CSRsViaCopy!");
 
-    unsigned NewVR = MRI->createVirtualRegister(RC);
+    Register NewVR = MRI->createVirtualRegister(RC);
     // Create copy from CSR to a virtual register.
     // FIXME: this currently does not emit CFI pseudo-instructions, it works
     // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
diff --git a/lib/Target/ARM/ARMISelLowering.h b/lib/Target/ARM/ARMISelLowering.h
index 1675ec59a354..53813fad5afd 100644
--- a/lib/Target/ARM/ARMISelLowering.h
+++ b/lib/Target/ARM/ARMISelLowering.h
@@ -103,6 +103,7 @@ class VectorType;
       ADDE,        // Add using carry
       SUBC,        // Sub with carry
       SUBE,        // Sub using carry
+      LSLS,        // Shift left producing carry
 
       VMOVRRD,     // double to two gprs.
       VMOVDRR,     // Two gprs to double.
@@ -126,17 +127,13 @@ class VectorType;
       WIN__DBZCHK, // Windows' divide by zero check
 
       WLS,         // Low-overhead loops, While Loop Start
+      LOOP_DEC,    // Really a part of LE, performs the sub
+      LE,          // Low-overhead loops, Loop End
 
-      VCEQ,        // Vector compare equal.
-      VCEQZ,       // Vector compare equal to zero.
-      VCGE,        // Vector compare greater than or equal.
-      VCGEZ,       // Vector compare greater than or equal to zero.
-      VCLEZ,       // Vector compare less than or equal to zero.
-      VCGEU,       // Vector compare unsigned greater than or equal.
-      VCGT,        // Vector compare greater than.
-      VCGTZ,       // Vector compare greater than zero.
-      VCLTZ,       // Vector compare less than zero.
-      VCGTU,       // Vector compare unsigned greater than.
+      PREDICATE_CAST, // Predicate cast for MVE i1 types
+
+      VCMP,        // Vector compare.
+      VCMPZ,       // Vector compare to zero.
       VTST,        // Vector test bits.
 
       // Vector shift by vector
@@ -200,6 +197,7 @@ class VectorType;
       VTRN,         // transpose
       VTBL1,        // 1-register shuffle with mask
       VTBL2,        // 2-register shuffle with mask
+      VMOVN,        // MVE vmovn
 
       // Vector multiply long:
       VMULLs,       // ...signed
@@ -221,6 +219,12 @@ class VectorType;
       SMMLAR,       // Signed multiply long, round and add
       SMMLSR,       // Signed multiply long, subtract and round
 
+      // Single Lane QADD8 and QADD16. Only the bottom lane. That's what the b stands for.
+      QADD8b,
+      QSUB8b,
+      QADD16b,
+      QSUB16b,
+
       // Operands of the standard BUILD_VECTOR node are not legalized, which
       // is fine if BUILD_VECTORs are always lowered to shuffles or other
       // operations, but for ARM some BUILD_VECTORs are legal as-is and their
@@ -243,6 +247,11 @@ class VectorType;
       // instructions.
       MEMCPY,
 
+      // V8.1MMainline condition select
+      CSINV, // Conditional select invert.
+      CSNEG, // Conditional select negate.
+      CSINC, // Conditional select increment.
+
       // Vector load N-element structure to all lanes:
       VLD1DUP = ISD::FIRST_TARGET_MEMORY_OPCODE,
       VLD2DUP,
@@ -539,7 +548,7 @@ class VectorType;
     Instruction *emitTrailingFence(IRBuilder<> &Builder, Instruction *Inst,
                                    AtomicOrdering Ord) const override;
 
-    unsigned getMaxSupportedInterleaveFactor() const override { return 4; }
+    unsigned getMaxSupportedInterleaveFactor() const override;
 
     bool lowerInterleavedLoad(LoadInst *LI,
                               ArrayRef<ShuffleVectorInst *> Shuffles,
@@ -608,8 +617,8 @@ class VectorType;
     void finalizeLowering(MachineFunction &MF) const override;
 
     /// Return the correct alignment for the current calling convention.
- unsigned getABIAlignmentForCallingConv(Type *ArgTy, - DataLayout DL) const override; + Align getABIAlignmentForCallingConv(Type *ArgTy, + DataLayout DL) const override; bool isDesirableToCommuteWithShift(const SDNode *N, CombineLevel Level) const override; @@ -670,6 +679,8 @@ class VectorType; SDValue LowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const; SDValue LowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const; SDValue LowerEH_SJLJ_SETUP_DISPATCH(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerINTRINSIC_VOID(SDValue Op, SelectionDAG &DAG, + const ARMSubtarget *Subtarget) const; SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *Subtarget) const; SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const; @@ -721,8 +732,8 @@ class VectorType; void lowerABS(SDNode *N, SmallVectorImpl &Results, SelectionDAG &DAG) const; - unsigned getRegisterByName(const char* RegName, EVT VT, - SelectionDAG &DAG) const override; + Register getRegisterByName(const char* RegName, EVT VT, + const MachineFunction &MF) const override; SDValue BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG, SmallVectorImpl &Created) const override; @@ -814,7 +825,7 @@ class VectorType; SDValue getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, SDValue &ARMcc, SelectionDAG &DAG, const SDLoc &dl) const; SDValue getVFPCmp(SDValue LHS, SDValue RHS, SelectionDAG &DAG, - const SDLoc &dl, bool InvalidOnQNaN) const; + const SDLoc &dl) const; SDValue duplicateCmp(SDValue Cmp, SelectionDAG &DAG) const; SDValue OptimizeVFPBrcond(SDValue Op, SelectionDAG &DAG) const; @@ -838,7 +849,7 @@ class VectorType; void setAllExpand(MVT VT); }; - enum NEONModImmType { + enum VMOVModImmType { VMOVModImm, VMVNModImm, MVEVMVNModImm, diff --git a/lib/Target/ARM/ARMInstrFormats.td b/lib/Target/ARM/ARMInstrFormats.td index bc93a058720c..1da32ad2af6c 100644 --- a/lib/Target/ARM/ARMInstrFormats.td +++ b/lib/Target/ARM/ARMInstrFormats.td @@ -188,6 +188,13 @@ def s_cc_out : OptionalDefOperand { let DecoderMethod = "DecodeCCOutOperand"; } +// Transform to generate the inverse of a condition code during ISel +def inv_cond_XFORM : SDNodeXForm(N->getZExtValue()); + return CurDAG->getTargetConstant(ARMCC::getOppositeCondition(CC), SDLoc(N), + MVT::i32); +}]>; + // VPT predicate def VPTPredNOperand : AsmOperandClass { @@ -401,6 +408,8 @@ class InstTemplate(f), "Pseudo"); @@ -412,6 +421,7 @@ class InstTemplate let isCodeGenOnly = 0; // So we get asm matcher for it. let AsmString = asm; let isPseudo = 1; + let hasNoSchedulingInfo = 1; } class ARMAsmPseudo @@ -2282,7 +2293,7 @@ class N1ModImm op21_19, bits<4> op11_8, bit op7, bit op6, let Inst{24} = SIMM{7}; let Inst{18-16} = SIMM{6-4}; let Inst{3-0} = SIMM{3-0}; - let DecoderMethod = "DecodeNEONModImmInstruction"; + let DecoderMethod = "DecodeVMOVModImmInstruction"; } // NEON 2 vector register format. @@ -2724,6 +2735,16 @@ def complexrotateopodd : Operand { let PrintMethod = "printComplexRotationOp<180, 90>"; } +def MveSaturateOperand : AsmOperandClass { + let PredicateMethod = "isMveSaturateOp"; + let DiagnosticString = "saturate operand must be 48 or 64"; + let Name = "MveSaturate"; +} +def saturateop : Operand { + let ParserMatchClass = MveSaturateOperand; + let PrintMethod = "printMveSaturateOp"; +} + // Data type suffix token aliases. Implements Table A7-3 in the ARM ARM. 
def : TokenAlias<".s8", ".i8">; def : TokenAlias<".u8", ".i8">; diff --git a/lib/Target/ARM/ARMInstrInfo.cpp b/lib/Target/ARM/ARMInstrInfo.cpp index 388c889349b7..a802d5a06f07 100644 --- a/lib/Target/ARM/ARMInstrInfo.cpp +++ b/lib/Target/ARM/ARMInstrInfo.cpp @@ -117,7 +117,7 @@ void ARMInstrInfo::expandLoadStackGuard(MachineBasicBlock::iterator MI) const { MachineBasicBlock &MBB = *MI->getParent(); DebugLoc DL = MI->getDebugLoc(); - unsigned Reg = MI->getOperand(0).getReg(); + Register Reg = MI->getOperand(0).getReg(); MachineInstrBuilder MIB; MIB = BuildMI(MBB, MI, DL, get(ARM::MOV_ga_pcrel_ldr), Reg) diff --git a/lib/Target/ARM/ARMInstrInfo.td b/lib/Target/ARM/ARMInstrInfo.td index e35145463852..fe696222ec70 100644 --- a/lib/Target/ARM/ARMInstrInfo.td +++ b/lib/Target/ARM/ARMInstrInfo.td @@ -51,8 +51,6 @@ def SDT_ARMAnd : SDTypeProfile<1, 2, SDTCisVT<2, i32>]>; def SDT_ARMCmp : SDTypeProfile<0, 2, [SDTCisSameAs<0, 1>]>; -def SDT_ARMFCmp : SDTypeProfile<0, 3, [SDTCisSameAs<0, 1>, - SDTCisVT<2, i32>]>; def SDT_ARMPICAdd : SDTypeProfile<1, 2, [SDTCisSameAs<0, 1>, SDTCisPtrTy<1>, SDTCisVT<2, i32>]>; @@ -108,14 +106,24 @@ def SDT_ARMIntShiftParts : SDTypeProfile<2, 3, [SDTCisSameAs<0, 1>, // TODO Add another operand for 'Size' so that we can re-use this node when we // start supporting *TP versions. -def SDT_ARMWhileLoop : SDTypeProfile<0, 2, [SDTCisVT<0, i32>, - SDTCisVT<1, OtherVT>]>; +def SDT_ARMLoLoop : SDTypeProfile<0, 2, [SDTCisVT<0, i32>, + SDTCisVT<1, OtherVT>]>; def ARMSmlald : SDNode<"ARMISD::SMLALD", SDT_LongMac>; def ARMSmlaldx : SDNode<"ARMISD::SMLALDX", SDT_LongMac>; def ARMSmlsld : SDNode<"ARMISD::SMLSLD", SDT_LongMac>; def ARMSmlsldx : SDNode<"ARMISD::SMLSLDX", SDT_LongMac>; +def SDT_ARMCSel : SDTypeProfile<1, 3, + [SDTCisSameAs<0, 1>, + SDTCisSameAs<0, 2>, + SDTCisInt<3>, + SDTCisVT<3, i32>]>; + +def ARMcsinv : SDNode<"ARMISD::CSINV", SDT_ARMCSel, [SDNPOptInGlue]>; +def ARMcsneg : SDNode<"ARMISD::CSNEG", SDT_ARMCSel, [SDNPOptInGlue]>; +def ARMcsinc : SDNode<"ARMISD::CSINC", SDT_ARMCSel, [SDNPOptInGlue]>; + def SDT_MulHSR : SDTypeProfile<1, 3, [SDTCisVT<0,i32>, SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, @@ -194,6 +202,7 @@ def ARMrrx : SDNode<"ARMISD::RRX" , SDTIntUnaryOp, [SDNPInGlue ]>; def ARMaddc : SDNode<"ARMISD::ADDC", SDTBinaryArithWithFlags, [SDNPCommutative]>; def ARMsubc : SDNode<"ARMISD::SUBC", SDTBinaryArithWithFlags>; +def ARMlsls : SDNode<"ARMISD::LSLS", SDTBinaryArithWithFlags>; def ARMadde : SDNode<"ARMISD::ADDE", SDTBinaryArithWithFlagsInOut>; def ARMsube : SDNode<"ARMISD::SUBE", SDTBinaryArithWithFlagsInOut>; @@ -229,6 +238,11 @@ def ARMsmlalbt : SDNode<"ARMISD::SMLALBT", SDT_LongMac, []>; def ARMsmlaltb : SDNode<"ARMISD::SMLALTB", SDT_LongMac, []>; def ARMsmlaltt : SDNode<"ARMISD::SMLALTT", SDT_LongMac, []>; +def ARMqadd8b : SDNode<"ARMISD::QADD8b", SDT_ARMAnd, []>; +def ARMqsub8b : SDNode<"ARMISD::QSUB8b", SDT_ARMAnd, []>; +def ARMqadd16b : SDNode<"ARMISD::QADD16b", SDT_ARMAnd, []>; +def ARMqsub16b : SDNode<"ARMISD::QSUB16b", SDT_ARMAnd, []>; + // Vector operations shared between NEON and MVE def ARMvdup : SDNode<"ARMISD::VDUP", SDTypeProfile<1, 1, [SDTCisVec<0>]>>; @@ -265,8 +279,16 @@ def ARMvshruImm : SDNode<"ARMISD::VSHRuIMM", SDTARMVSHIMM>; def ARMvshls : SDNode<"ARMISD::VSHLs", SDTARMVSH>; def ARMvshlu : SDNode<"ARMISD::VSHLu", SDTARMVSH>; -def ARMWLS : SDNode<"ARMISD::WLS", SDT_ARMWhileLoop, - [SDNPHasChain]>; +def SDTARMVCMP : SDTypeProfile<1, 3, [SDTCisInt<0>, SDTCisSameAs<1, 2>, + SDTCisInt<3>]>; +def SDTARMVCMPZ : SDTypeProfile<1, 2, 
[SDTCisInt<2>]>; + +def ARMvcmp : SDNode<"ARMISD::VCMP", SDTARMVCMP>; +def ARMvcmpz : SDNode<"ARMISD::VCMPZ", SDTARMVCMPZ>; + +def ARMWLS : SDNode<"ARMISD::WLS", SDT_ARMLoLoop, [SDNPHasChain]>; +def ARMLE : SDNode<"ARMISD::LE", SDT_ARMLoLoop, [SDNPHasChain]>; +def ARMLoopDec : SDNode<"ARMISD::LOOP_DEC", SDTIntBinOp, [SDNPHasChain]>; //===----------------------------------------------------------------------===// // ARM Flag Definitions. @@ -1948,7 +1970,7 @@ multiclass AI_str1nopc; @@ -2361,6 +2383,12 @@ let isCall = 1, def BMOVPCB_CALL : ARMPseudoInst<(outs), (ins arm_bl_target:$func), 8, IIC_Br, [(ARMcall_nolink tglobaladdr:$func)]>, Requires<[IsARM]>, Sched<[WriteBr]>; + + // push lr before the call + def BL_PUSHLR : ARMPseudoInst<(outs), (ins GPRlr:$ra, arm_bl_target:$func), + 4, IIC_Br, + []>, + Requires<[IsARM]>, Sched<[WriteBr]>; } let isBranch = 1, isTerminator = 1 in { @@ -3727,6 +3755,23 @@ let DecoderMethod = "DecodeQADDInstruction" in [(set GPRnopc:$Rd, (int_arm_qadd GPRnopc:$Rm, GPRnopc:$Rn))]>; } +def : ARMV5TEPat<(saddsat GPR:$a, GPR:$b), + (QADD GPR:$a, GPR:$b)>; +def : ARMV5TEPat<(ssubsat GPR:$a, GPR:$b), + (QSUB GPR:$a, GPR:$b)>; +def : ARMV5TEPat<(saddsat(saddsat rGPR:$Rm, rGPR:$Rm), rGPR:$Rn), + (QDADD rGPR:$Rm, rGPR:$Rn)>; +def : ARMV5TEPat<(ssubsat rGPR:$Rm, (saddsat rGPR:$Rn, rGPR:$Rn)), + (QDSUB rGPR:$Rm, rGPR:$Rn)>; +def : ARMV6Pat<(ARMqadd8b rGPR:$Rm, rGPR:$Rn), + (QADD8 rGPR:$Rm, rGPR:$Rn)>; +def : ARMV6Pat<(ARMqsub8b rGPR:$Rm, rGPR:$Rn), + (QSUB8 rGPR:$Rm, rGPR:$Rn)>; +def : ARMV6Pat<(ARMqadd16b rGPR:$Rm, rGPR:$Rn), + (QADD16 rGPR:$Rm, rGPR:$Rn)>; +def : ARMV6Pat<(ARMqsub16b rGPR:$Rm, rGPR:$Rn), + (QSUB16 rGPR:$Rm, rGPR:$Rn)>; + def UQADD16 : AAIIntrinsic<0b01100110, 0b11110001, "uqadd16", int_arm_uqadd16>; def UQADD8 : AAIIntrinsic<0b01100110, 0b11111001, "uqadd8", int_arm_uqadd8>; def UQSUB16 : AAIIntrinsic<0b01100110, 0b11110111, "uqsub16", int_arm_uqsub16>; @@ -4870,14 +4915,13 @@ def SB : AInoP<(outs), (ins), MiscFrm, NoItinerary, "sb", "", []>, let hasSideEffects = 1; } -let usesCustomInserter = 1, Defs = [CPSR] in { - -// Pseudo instruction that combines movs + predicated rsbmi -// to implement integer ABS +let usesCustomInserter = 1, Defs = [CPSR], hasNoSchedulingInfo = 1 in { + // Pseudo instruction that combines movs + predicated rsbmi + // to implement integer ABS def ABS : ARMPseudoInst<(outs GPR:$dst), (ins GPR:$src), 8, NoItinerary, []>; } -let usesCustomInserter = 1, Defs = [CPSR] in { +let usesCustomInserter = 1, Defs = [CPSR], hasNoSchedulingInfo = 1 in { def COPY_STRUCT_BYVAL_I32 : PseudoInst< (outs), (ins GPR:$dst, GPR:$src, i32imm:$size, i32imm:$alignment), NoItinerary, @@ -5085,8 +5129,8 @@ def SWPB: AIswp<1, (outs GPRnopc:$Rt), def CDP : ABI<0b1110, (outs), (ins p_imm:$cop, imm0_15:$opc1, c_imm:$CRd, c_imm:$CRn, c_imm:$CRm, imm0_7:$opc2), NoItinerary, "cdp", "\t$cop, $opc1, $CRd, $CRn, $CRm, $opc2", - [(int_arm_cdp imm:$cop, imm:$opc1, imm:$CRd, imm:$CRn, - imm:$CRm, imm:$opc2)]>, + [(int_arm_cdp timm:$cop, timm:$opc1, timm:$CRd, timm:$CRn, + timm:$CRm, timm:$opc2)]>, Requires<[IsARM,PreV8]> { bits<4> opc1; bits<4> CRn; @@ -5109,8 +5153,8 @@ def CDP : ABI<0b1110, (outs), (ins p_imm:$cop, imm0_15:$opc1, def CDP2 : ABXI<0b1110, (outs), (ins p_imm:$cop, imm0_15:$opc1, c_imm:$CRd, c_imm:$CRn, c_imm:$CRm, imm0_7:$opc2), NoItinerary, "cdp2\t$cop, $opc1, $CRd, $CRn, $CRm, $opc2", - [(int_arm_cdp2 imm:$cop, imm:$opc1, imm:$CRd, imm:$CRn, - imm:$CRm, imm:$opc2)]>, + [(int_arm_cdp2 timm:$cop, timm:$opc1, timm:$CRd, timm:$CRn, + timm:$CRm, 
timm:$opc2)]>, Requires<[IsARM,PreV8]> { let Inst{31-28} = 0b1111; bits<4> opc1; @@ -5289,15 +5333,15 @@ multiclass LdSt2Cop pattern> { } } -defm LDC : LdStCop <1, 0, "ldc", [(int_arm_ldc imm:$cop, imm:$CRd, addrmode5:$addr)]>; -defm LDCL : LdStCop <1, 1, "ldcl", [(int_arm_ldcl imm:$cop, imm:$CRd, addrmode5:$addr)]>; -defm LDC2 : LdSt2Cop<1, 0, "ldc2", [(int_arm_ldc2 imm:$cop, imm:$CRd, addrmode5:$addr)]>, Requires<[IsARM,PreV8]>; -defm LDC2L : LdSt2Cop<1, 1, "ldc2l", [(int_arm_ldc2l imm:$cop, imm:$CRd, addrmode5:$addr)]>, Requires<[IsARM,PreV8]>; +defm LDC : LdStCop <1, 0, "ldc", [(int_arm_ldc timm:$cop, timm:$CRd, addrmode5:$addr)]>; +defm LDCL : LdStCop <1, 1, "ldcl", [(int_arm_ldcl timm:$cop, timm:$CRd, addrmode5:$addr)]>; +defm LDC2 : LdSt2Cop<1, 0, "ldc2", [(int_arm_ldc2 timm:$cop, timm:$CRd, addrmode5:$addr)]>, Requires<[IsARM,PreV8]>; +defm LDC2L : LdSt2Cop<1, 1, "ldc2l", [(int_arm_ldc2l timm:$cop, timm:$CRd, addrmode5:$addr)]>, Requires<[IsARM,PreV8]>; -defm STC : LdStCop <0, 0, "stc", [(int_arm_stc imm:$cop, imm:$CRd, addrmode5:$addr)]>; -defm STCL : LdStCop <0, 1, "stcl", [(int_arm_stcl imm:$cop, imm:$CRd, addrmode5:$addr)]>; -defm STC2 : LdSt2Cop<0, 0, "stc2", [(int_arm_stc2 imm:$cop, imm:$CRd, addrmode5:$addr)]>, Requires<[IsARM,PreV8]>; -defm STC2L : LdSt2Cop<0, 1, "stc2l", [(int_arm_stc2l imm:$cop, imm:$CRd, addrmode5:$addr)]>, Requires<[IsARM,PreV8]>; +defm STC : LdStCop <0, 0, "stc", [(int_arm_stc timm:$cop, timm:$CRd, addrmode5:$addr)]>; +defm STCL : LdStCop <0, 1, "stcl", [(int_arm_stcl timm:$cop, timm:$CRd, addrmode5:$addr)]>; +defm STC2 : LdSt2Cop<0, 0, "stc2", [(int_arm_stc2 timm:$cop, timm:$CRd, addrmode5:$addr)]>, Requires<[IsARM,PreV8]>; +defm STC2L : LdSt2Cop<0, 1, "stc2l", [(int_arm_stc2l timm:$cop, timm:$CRd, addrmode5:$addr)]>, Requires<[IsARM,PreV8]>; } // DecoderNamespace = "CoProc" @@ -5333,8 +5377,8 @@ def MCR : MovRCopro<"mcr", 0 /* from ARM core register to coprocessor */, (outs), (ins p_imm:$cop, imm0_7:$opc1, GPR:$Rt, c_imm:$CRn, c_imm:$CRm, imm0_7:$opc2), - [(int_arm_mcr imm:$cop, imm:$opc1, GPR:$Rt, imm:$CRn, - imm:$CRm, imm:$opc2)]>, + [(int_arm_mcr timm:$cop, timm:$opc1, GPR:$Rt, timm:$CRn, + timm:$CRm, timm:$opc2)]>, ComplexDeprecationPredicate<"MCR">; def : ARMInstAlias<"mcr${p} $cop, $opc1, $Rt, $CRn, $CRm", (MCR p_imm:$cop, imm0_7:$opc1, GPR:$Rt, c_imm:$CRn, @@ -5347,8 +5391,8 @@ def : ARMInstAlias<"mrc${p} $cop, $opc1, $Rt, $CRn, $CRm", (MRC GPRwithAPSR:$Rt, p_imm:$cop, imm0_7:$opc1, c_imm:$CRn, c_imm:$CRm, 0, pred:$p)>; -def : ARMPat<(int_arm_mrc imm:$cop, imm:$opc1, imm:$CRn, imm:$CRm, imm:$opc2), - (MRC imm:$cop, imm:$opc1, imm:$CRn, imm:$CRm, imm:$opc2)>; +def : ARMPat<(int_arm_mrc timm:$cop, timm:$opc1, timm:$CRn, timm:$CRm, timm:$opc2), + (MRC p_imm:$cop, imm0_7:$opc1, c_imm:$CRn, c_imm:$CRm, imm0_7:$opc2)>; class MovRCopro2 pattern> @@ -5379,8 +5423,8 @@ def MCR2 : MovRCopro2<"mcr2", 0 /* from ARM core register to coprocessor */, (outs), (ins p_imm:$cop, imm0_7:$opc1, GPR:$Rt, c_imm:$CRn, c_imm:$CRm, imm0_7:$opc2), - [(int_arm_mcr2 imm:$cop, imm:$opc1, GPR:$Rt, imm:$CRn, - imm:$CRm, imm:$opc2)]>, + [(int_arm_mcr2 timm:$cop, timm:$opc1, GPR:$Rt, timm:$CRn, + timm:$CRm, timm:$opc2)]>, Requires<[IsARM,PreV8]>; def : ARMInstAlias<"mcr2 $cop, $opc1, $Rt, $CRn, $CRm", (MCR2 p_imm:$cop, imm0_7:$opc1, GPR:$Rt, c_imm:$CRn, @@ -5394,9 +5438,9 @@ def : ARMInstAlias<"mrc2 $cop, $opc1, $Rt, $CRn, $CRm", (MRC2 GPRwithAPSR:$Rt, p_imm:$cop, imm0_7:$opc1, c_imm:$CRn, c_imm:$CRm, 0)>; -def : ARMV5TPat<(int_arm_mrc2 imm:$cop, imm:$opc1, imm:$CRn, - 
imm:$CRm, imm:$opc2), - (MRC2 imm:$cop, imm:$opc1, imm:$CRn, imm:$CRm, imm:$opc2)>; +def : ARMV5TPat<(int_arm_mrc2 timm:$cop, timm:$opc1, timm:$CRn, + timm:$CRm, timm:$opc2), + (MRC2 p_imm:$cop, imm0_7:$opc1, c_imm:$CRn, c_imm:$CRm, imm0_7:$opc2)>; class MovRRCopro pattern = []> @@ -5422,8 +5466,8 @@ class MovRRCopro def MCRR : MovRRCopro<"mcrr", 0 /* from ARM core register to coprocessor */, (outs), (ins p_imm:$cop, imm0_15:$opc1, GPRnopc:$Rt, GPRnopc:$Rt2, c_imm:$CRm), - [(int_arm_mcrr imm:$cop, imm:$opc1, GPRnopc:$Rt, - GPRnopc:$Rt2, imm:$CRm)]>; + [(int_arm_mcrr timm:$cop, timm:$opc1, GPRnopc:$Rt, + GPRnopc:$Rt2, timm:$CRm)]>; def MRRC : MovRRCopro<"mrrc", 1 /* from coprocessor to ARM core register */, (outs GPRnopc:$Rt, GPRnopc:$Rt2), (ins p_imm:$cop, imm0_15:$opc1, c_imm:$CRm), []>; @@ -5455,8 +5499,8 @@ class MovRRCopro2; + [(int_arm_mcrr2 timm:$cop, timm:$opc1, GPRnopc:$Rt, + GPRnopc:$Rt2, timm:$CRm)]>; def MRRC2 : MovRRCopro2<"mrrc2", 1 /* from coprocessor to ARM core register */, (outs GPRnopc:$Rt, GPRnopc:$Rt2), @@ -5579,12 +5623,12 @@ def MSRbanked : ABI<0b0001, (outs), (ins banked_reg:$banked, GPRnopc:$Rn), def win__chkstk : SDNode<"ARMISD::WIN__CHKSTK", SDTNone, [SDNPHasChain, SDNPSideEffect]>; -let usesCustomInserter = 1, Uses = [R4], Defs = [R4, SP] in +let usesCustomInserter = 1, Uses = [R4], Defs = [R4, SP], hasNoSchedulingInfo = 1 in def WIN__CHKSTK : PseudoInst<(outs), (ins), NoItinerary, [(win__chkstk)]>; def win__dbzchk : SDNode<"ARMISD::WIN__DBZCHK", SDT_WIN__DBZCHK, [SDNPHasChain, SDNPSideEffect, SDNPOutGlue]>; -let usesCustomInserter = 1, Defs = [CPSR] in +let usesCustomInserter = 1, Defs = [CPSR], hasNoSchedulingInfo = 1 in def WIN__DBZCHK : PseudoInst<(outs), (ins tGPR:$divisor), NoItinerary, [(win__dbzchk tGPR:$divisor)]>; @@ -6131,10 +6175,10 @@ def : InstAlias<"umull${s}${p} $RdLo, $RdHi, $Rn, $Rm", def ITasm : ARMAsmPseudo<"it$mask $cc", (ins it_pred:$cc, it_mask:$mask)>, ComplexDeprecationPredicate<"IT">; -let mayLoad = 1, mayStore =1, hasSideEffects = 1 in +let mayLoad = 1, mayStore =1, hasSideEffects = 1, hasNoSchedulingInfo = 1 in def SPACE : PseudoInst<(outs GPR:$Rd), (ins i32imm:$size, GPR:$Rn), NoItinerary, - [(set GPR:$Rd, (int_arm_space imm:$size, GPR:$Rn))]>; + [(set GPR:$Rd, (int_arm_space timm:$size, GPR:$Rn))]>; //===---------------------------------- // Atomic cmpxchg for -O0 @@ -6174,4 +6218,5 @@ def CompilerBarrier : PseudoInst<(outs), (ins i32imm:$ordering), NoItinerary, let hasSideEffects = 1; let Size = 0; let AsmString = "@ COMPILER BARRIER"; + let hasNoSchedulingInfo = 1; } diff --git a/lib/Target/ARM/ARMInstrMVE.td b/lib/Target/ARM/ARMInstrMVE.td index 3e7ae55c7fc8..4f67cd6e47cc 100644 --- a/lib/Target/ARM/ARMInstrMVE.td +++ b/lib/Target/ARM/ARMInstrMVE.td @@ -160,7 +160,8 @@ class TMemImm7ShiftOffsetAsmOperand : AsmOperandClass { let RenderMethod = "addMemImmOffsetOperands"; } -class taddrmode_imm7 : MemOperand { +class taddrmode_imm7 : MemOperand, + ComplexPattern", []> { let ParserMatchClass = TMemImm7ShiftOffsetAsmOperand; // They are printed the same way as the T2 imm8 version let PrintMethod = "printT2AddrModeImm8Operand"; @@ -221,7 +222,9 @@ def t2am_imm7shift0OffsetAsmOperand : t2am_imm7shiftOffsetAsmOperand<0>; def t2am_imm7shift1OffsetAsmOperand : t2am_imm7shiftOffsetAsmOperand<1>; def t2am_imm7shift2OffsetAsmOperand : t2am_imm7shiftOffsetAsmOperand<2>; -class t2am_imm7_offset : MemOperand { +class t2am_imm7_offset : MemOperand, + ComplexPattern", + [], [SDNPWantRoot]> { // They are printed the same way as the imm8 version let 
PrintMethod = "printT2AddrModeImm8OffsetOperand"; let ParserMatchClass = @@ -371,6 +374,8 @@ class MVE_ScalarShiftSRegReg op5_4, list pattern=[]> let Inst{7-6} = 0b00; let Inst{5-4} = op5_4{1-0}; let Inst{3-0} = 0b1101; + + let Unpredictable{8-6} = 0b111; } def MVE_SQRSHR : MVE_ScalarShiftSRegReg<"sqrshr", 0b10>; @@ -403,18 +408,17 @@ class MVE_ScalarShiftDRegImm op5_4, bit op16, let Inst{3-0} = 0b1111; } -class MVE_ScalarShiftDRegReg pattern=[]> +class MVE_ScalarShiftDRegRegBase pattern=[]> : MVE_ScalarShiftDoubleReg< - iname, (ins tGPREven:$RdaLo_src, tGPROdd:$RdaHi_src, rGPR:$Rm), - "$RdaLo, $RdaHi, $Rm", "@earlyclobber $RdaHi,@earlyclobber $RdaLo," - "$RdaLo = $RdaLo_src,$RdaHi = $RdaHi_src", + iname, iops, asm, "@earlyclobber $RdaHi,@earlyclobber $RdaLo," + "$RdaLo = $RdaLo_src,$RdaHi = $RdaHi_src", pattern> { bits<4> Rm; let Inst{16} = op16; let Inst{15-12} = Rm{3-0}; - let Inst{7-6} = 0b00; + let Inst{6} = 0b0; let Inst{5} = op5; let Inst{4} = 0b0; let Inst{3-0} = 0b1101; @@ -427,27 +431,44 @@ class MVE_ScalarShiftDRegReg pattern=[]> + : MVE_ScalarShiftDRegRegBase< + iname, (ins tGPREven:$RdaLo_src, tGPROdd:$RdaHi_src, rGPR:$Rm), + "$RdaLo, $RdaHi, $Rm", op5, 0b0, pattern> { + + let Inst{7} = 0b0; +} + +class MVE_ScalarShiftDRegRegWithSat pattern=[]> + : MVE_ScalarShiftDRegRegBase< + iname, (ins tGPREven:$RdaLo_src, tGPROdd:$RdaHi_src, rGPR:$Rm, saturateop:$sat), + "$RdaLo, $RdaHi, $sat, $Rm", op5, 0b1, pattern> { + bit sat; + + let Inst{7} = sat; +} + +def MVE_ASRLr : MVE_ScalarShiftDRegReg<"asrl", 0b1, [(set tGPREven:$RdaLo, tGPROdd:$RdaHi, (ARMasrl tGPREven:$RdaLo_src, tGPROdd:$RdaHi_src, rGPR:$Rm))]>; def MVE_ASRLi : MVE_ScalarShiftDRegImm<"asrl", 0b10, ?, [(set tGPREven:$RdaLo, tGPROdd:$RdaHi, (ARMasrl tGPREven:$RdaLo_src, - tGPROdd:$RdaHi_src, (i32 imm:$imm)))]>; -def MVE_LSLLr : MVE_ScalarShiftDRegReg<"lsll", 0b0, 0b0, [(set tGPREven:$RdaLo, tGPROdd:$RdaHi, + tGPROdd:$RdaHi_src, (i32 long_shift:$imm)))]>; +def MVE_LSLLr : MVE_ScalarShiftDRegReg<"lsll", 0b0, [(set tGPREven:$RdaLo, tGPROdd:$RdaHi, (ARMlsll tGPREven:$RdaLo_src, tGPROdd:$RdaHi_src, rGPR:$Rm))]>; def MVE_LSLLi : MVE_ScalarShiftDRegImm<"lsll", 0b00, ?, [(set tGPREven:$RdaLo, tGPROdd:$RdaHi, (ARMlsll tGPREven:$RdaLo_src, - tGPROdd:$RdaHi_src, (i32 imm:$imm)))]>; + tGPROdd:$RdaHi_src, (i32 long_shift:$imm)))]>; def MVE_LSRL : MVE_ScalarShiftDRegImm<"lsrl", 0b01, ?, [(set tGPREven:$RdaLo, tGPROdd:$RdaHi, (ARMlsrl tGPREven:$RdaLo_src, - tGPROdd:$RdaHi_src, (i32 imm:$imm)))]>; + tGPROdd:$RdaHi_src, (i32 long_shift:$imm)))]>; -def MVE_SQRSHRL : MVE_ScalarShiftDRegReg<"sqrshrl", 0b1, 0b1>; +def MVE_SQRSHRL : MVE_ScalarShiftDRegRegWithSat<"sqrshrl", 0b1>; def MVE_SQSHLL : MVE_ScalarShiftDRegImm<"sqshll", 0b11, 0b1>; def MVE_SRSHRL : MVE_ScalarShiftDRegImm<"srshrl", 0b10, 0b1>; -def MVE_UQRSHLL : MVE_ScalarShiftDRegReg<"uqrshll", 0b0, 0b1>; +def MVE_UQRSHLL : MVE_ScalarShiftDRegRegWithSat<"uqrshll", 0b0>; def MVE_UQSHLL : MVE_ScalarShiftDRegImm<"uqshll", 0b00, 0b1>; def MVE_URSHRL : MVE_ScalarShiftDRegImm<"urshrl", 0b01, 0b1>; @@ -531,6 +552,19 @@ defm MVE_VADDVu8 : MVE_VADDV_A<"u8", 0b1, 0b00>; defm MVE_VADDVu16 : MVE_VADDV_A<"u16", 0b1, 0b01>; defm MVE_VADDVu32 : MVE_VADDV_A<"u32", 0b1, 0b10>; +let Predicates = [HasMVEInt] in { + def : Pat<(i32 (vecreduce_add (v4i32 MQPR:$src))), (i32 (MVE_VADDVu32no_acc $src))>; + def : Pat<(i32 (vecreduce_add (v8i16 MQPR:$src))), (i32 (MVE_VADDVu16no_acc $src))>; + def : Pat<(i32 (vecreduce_add (v16i8 MQPR:$src))), (i32 (MVE_VADDVu8no_acc $src))>; + def : Pat<(i32 (add (i32 
(vecreduce_add (v4i32 MQPR:$src1))), (i32 tGPR:$src2))), + (i32 (MVE_VADDVu32acc $src2, $src1))>; + def : Pat<(i32 (add (i32 (vecreduce_add (v8i16 MQPR:$src1))), (i32 tGPR:$src2))), + (i32 (MVE_VADDVu16acc $src2, $src1))>; + def : Pat<(i32 (add (i32 (vecreduce_add (v16i8 MQPR:$src1))), (i32 tGPR:$src2))), + (i32 (MVE_VADDVu8acc $src2, $src1))>; + +} + class MVE_VADDLV pattern=[]> : MVE_rDest<(outs tGPREven:$RdaLo, tGPROdd:$RdaHi), iops, NoItinerary, iname, @@ -636,6 +670,35 @@ multiclass MVE_VMINMAXV_ty pattern=[]> { defm MVE_VMINV : MVE_VMINMAXV_ty<"vminv", 0b1>; defm MVE_VMAXV : MVE_VMINMAXV_ty<"vmaxv", 0b0>; +let Predicates = [HasMVEInt] in { + def : Pat<(i32 (vecreduce_smax (v16i8 MQPR:$src))), + (i32 (MVE_VMAXVs8 (t2MVNi (i32 127)), $src))>; + def : Pat<(i32 (vecreduce_smax (v8i16 MQPR:$src))), + (i32 (MVE_VMAXVs16 (t2MOVi32imm (i32 -32768)), $src))>; + def : Pat<(i32 (vecreduce_smax (v4i32 MQPR:$src))), + (i32 (MVE_VMAXVs32 (t2MOVi (i32 -2147483648)), $src))>; + def : Pat<(i32 (vecreduce_umax (v16i8 MQPR:$src))), + (i32 (MVE_VMAXVu8 (t2MOVi (i32 0)), $src))>; + def : Pat<(i32 (vecreduce_umax (v8i16 MQPR:$src))), + (i32 (MVE_VMAXVu16 (t2MOVi (i32 0)), $src))>; + def : Pat<(i32 (vecreduce_umax (v4i32 MQPR:$src))), + (i32 (MVE_VMAXVu32 (t2MOVi (i32 0)), $src))>; + + def : Pat<(i32 (vecreduce_smin (v16i8 MQPR:$src))), + (i32 (MVE_VMINVs8 (t2MOVi (i32 127)), $src))>; + def : Pat<(i32 (vecreduce_smin (v8i16 MQPR:$src))), + (i32 (MVE_VMINVs16 (t2MOVi16 (i32 32767)), $src))>; + def : Pat<(i32 (vecreduce_smin (v4i32 MQPR:$src))), + (i32 (MVE_VMINVs32 (t2MVNi (i32 -2147483648)), $src))>; + def : Pat<(i32 (vecreduce_umin (v16i8 MQPR:$src))), + (i32 (MVE_VMINVu8 (t2MOVi (i32 255)), $src))>; + def : Pat<(i32 (vecreduce_umin (v8i16 MQPR:$src))), + (i32 (MVE_VMINVu16 (t2MOVi16 (i32 65535)), $src))>; + def : Pat<(i32 (vecreduce_umin (v4i32 MQPR:$src))), + (i32 (MVE_VMINVu32 (t2MOVi (i32 4294967295)), $src))>; + +} + multiclass MVE_VMINMAXAV_ty pattern=[]> { def s8 : MVE_VMINMAXV; def s16 : MVE_VMINMAXV; @@ -667,57 +730,57 @@ class MVE_VMLAMLSDAV pattern=[]> { - def _noexch : MVE_VMLAMLSDAV; - def _exch : MVE_VMLAMLSDAV; +multiclass MVE_VMLAMLSDAV_A pattern=[]> { + def ""#x#suffix : MVE_VMLAMLSDAV; + def "a"#x#suffix : MVE_VMLAMLSDAV; +} + +multiclass MVE_VMLAMLSDAV_AX pattern=[]> { + defm "" : MVE_VMLAMLSDAV_A; + defm "" : MVE_VMLAMLSDAV_A; } -multiclass MVE_VMLAMLSDAV_XA pattern=[]> { - defm _noacc : MVE_VMLAMLSDAV_X; - defm _acc : MVE_VMLAMLSDAV_X; +multiclass MVE_VMLADAV_multi pattern=[]> { + defm "" : MVE_VMLAMLSDAV_AX<"vmladav", "s"#suffix, + sz, 0b0, bit_8, 0b0, pattern>; + defm "" : MVE_VMLAMLSDAV_A<"vmladav", "", "u"#suffix, + sz, 0b1, 0b0, bit_8, 0b0, pattern>; } -multiclass MVE_VMLADAV_multi pattern=[]> { - defm "" : MVE_VMLAMLSDAV_XA<"vmladav", suffix, sz, U, bit_8, 0b0, pattern>; +multiclass MVE_VMLSDAV_multi pattern=[]> { + defm "" : MVE_VMLAMLSDAV_AX<"vmlsdav", "s"#suffix, + sz, bit_28, 0b0, 0b1, pattern>; } -defm MVE_VMLADAVs16 : MVE_VMLADAV_multi<"s16", 0b0, 0b0, 0b0>; -defm MVE_VMLADAVs32 : MVE_VMLADAV_multi<"s32", 0b1, 0b0, 0b0>; -defm MVE_VMLADAVu16 : MVE_VMLADAV_multi<"u16", 0b0, 0b1, 0b0>; -defm MVE_VMLADAVu32 : MVE_VMLADAV_multi<"u32", 0b1, 0b1, 0b0>; +defm MVE_VMLADAV : MVE_VMLADAV_multi< "8", 0b0, 0b1>; +defm MVE_VMLADAV : MVE_VMLADAV_multi<"16", 0b0, 0b0>; +defm MVE_VMLADAV : MVE_VMLADAV_multi<"32", 0b1, 0b0>; -defm MVE_VMLADAVs8 : MVE_VMLADAV_multi<"s8", 0b0, 0b0, 0b1>; -defm MVE_VMLADAVu8 : MVE_VMLADAV_multi<"u8", 0b0, 0b1, 0b1>; +defm MVE_VMLSDAV : MVE_VMLSDAV_multi< "8", 
0b0, 0b1>; +defm MVE_VMLSDAV : MVE_VMLSDAV_multi<"16", 0b0, 0b0>; +defm MVE_VMLSDAV : MVE_VMLSDAV_multi<"32", 0b1, 0b0>; // vmlav aliases vmladav -foreach acc = ["_acc", "_noacc"] in { +foreach acc = ["", "a"] in { foreach suffix = ["s8", "s16", "s32", "u8", "u16", "u32"] in { - def : MVEInstAlias("MVE_VMLADAV"#suffix#acc#"_noexch") + def : MVEInstAlias<"vmlav"#acc#"${vp}."#suffix#"\t$RdaDest, $Qn, $Qm", + (!cast("MVE_VMLADAV"#acc#suffix) tGPREven:$RdaDest, MQPR:$Qn, MQPR:$Qm, vpred_n:$vp)>; } } -multiclass MVE_VMLSDAV_multi pattern=[]> { - defm "" : MVE_VMLAMLSDAV_XA<"vmlsdav", suffix, sz, bit_28, 0b0, 0b1, pattern>; -} - -defm MVE_VMLSDAVs8 : MVE_VMLSDAV_multi<"s8", 0, 0b1>; -defm MVE_VMLSDAVs16 : MVE_VMLSDAV_multi<"s16", 0, 0b0>; -defm MVE_VMLSDAVs32 : MVE_VMLSDAV_multi<"s32", 1, 0b0>; - // Base class for VMLALDAV and VMLSLDAV, VRMLALDAVH, VRMLSLDAVH class MVE_VMLALDAVBase pattern=[]> { - def _noexch : MVE_VMLALDAVBase; - def _exch : MVE_VMLALDAVBase; +multiclass MVE_VMLALDAVBase_A pattern=[]> { + def ""#x#suffix : MVE_VMLALDAVBase< + iname # x, suffix, (ins MQPR:$Qn, MQPR:$Qm), "", + sz, bit_28, 0b0, X, bit_8, bit_0, pattern>; + def "a"#x#suffix : MVE_VMLALDAVBase< + iname # "a" # x, suffix, + (ins tGPREven:$RdaLoSrc, tGPROdd:$RdaHiSrc, MQPR:$Qn, MQPR:$Qm), + "$RdaLoDest = $RdaLoSrc,$RdaHiDest = $RdaHiSrc", + sz, bit_28, 0b1, X, bit_8, bit_0, pattern>; } -multiclass MVE_VMLALDAVBase_XA pattern=[]> { - defm _noacc : MVE_VMLALDAVBase_X< - iname, suffix, (ins MQPR:$Qn, MQPR:$Qm), "", - sz, bit_28, 0b0, bit_8, bit_0, pattern>; - defm _acc : MVE_VMLALDAVBase_X< - iname # "a", suffix, (ins tGPREven:$RdaLoSrc, tGPROdd:$RdaHiSrc, - MQPR:$Qn, MQPR:$Qm), - "$RdaLoDest = $RdaLoSrc,$RdaHiDest = $RdaHiSrc", - sz, bit_28, 0b1, bit_8, bit_0, pattern>; + +multiclass MVE_VMLALDAVBase_AX pattern=[]> { + defm "" : MVE_VMLALDAVBase_A; + defm "" : MVE_VMLALDAVBase_A; } -multiclass MVE_VRMLALDAVH_multi pattern=[]> { - defm "" : MVE_VMLALDAVBase_XA< - "vrmlaldavh", suffix, 0b0, U, 0b1, 0b0, pattern>; +multiclass MVE_VRMLALDAVH_multi pattern=[]> { + defm "" : MVE_VMLALDAVBase_AX<"vrmlaldavh", "s"#suffix, + 0b0, 0b0, 0b1, 0b0, pattern>; + defm "" : MVE_VMLALDAVBase_A<"vrmlaldavh", "", "u"#suffix, + 0b0, 0b1, 0b0, 0b1, 0b0, pattern>; } -defm MVE_VRMLALDAVHs32 : MVE_VRMLALDAVH_multi<"s32", 0>; -defm MVE_VRMLALDAVHu32 : MVE_VRMLALDAVH_multi<"u32", 1>; +defm MVE_VRMLALDAVH : MVE_VRMLALDAVH_multi<"32">; // vrmlalvh aliases for vrmlaldavh def : MVEInstAlias<"vrmlalvh${vp}.s32\t$RdaLo, $RdaHi, $Qn, $Qm", - (MVE_VRMLALDAVHs32_noacc_noexch + (MVE_VRMLALDAVHs32 tGPREven:$RdaLo, tGPROdd:$RdaHi, MQPR:$Qn, MQPR:$Qm, vpred_n:$vp)>; def : MVEInstAlias<"vrmlalvha${vp}.s32\t$RdaLo, $RdaHi, $Qn, $Qm", - (MVE_VRMLALDAVHs32_acc_noexch + (MVE_VRMLALDAVHas32 tGPREven:$RdaLo, tGPROdd:$RdaHi, MQPR:$Qn, MQPR:$Qm, vpred_n:$vp)>; def : MVEInstAlias<"vrmlalvh${vp}.u32\t$RdaLo, $RdaHi, $Qn, $Qm", - (MVE_VRMLALDAVHu32_noacc_noexch + (MVE_VRMLALDAVHu32 tGPREven:$RdaLo, tGPROdd:$RdaHi, MQPR:$Qn, MQPR:$Qm, vpred_n:$vp)>; def : MVEInstAlias<"vrmlalvha${vp}.u32\t$RdaLo, $RdaHi, $Qn, $Qm", - (MVE_VRMLALDAVHu32_acc_noexch + (MVE_VRMLALDAVHau32 tGPREven:$RdaLo, tGPROdd:$RdaHi, MQPR:$Qn, MQPR:$Qm, vpred_n:$vp)>; -multiclass MVE_VMLALDAV_multi pattern=[]> { - defm "" : MVE_VMLALDAVBase_XA<"vmlaldav", suffix, sz, U, 0b0, 0b0, pattern>; +multiclass MVE_VMLALDAV_multi pattern=[]> { + defm "" : MVE_VMLALDAVBase_AX<"vmlaldav", "s"#suffix, sz, 0b0, 0b0, 0b0, pattern>; + defm "" : MVE_VMLALDAVBase_A<"vmlaldav", "", "u"#suffix, + sz, 0b1, 0b0, 0b0, 0b0, 
pattern>; } -defm MVE_VMLALDAVs16 : MVE_VMLALDAV_multi<"s16", 0b0, 0b0>; -defm MVE_VMLALDAVs32 : MVE_VMLALDAV_multi<"s32", 0b1, 0b0>; -defm MVE_VMLALDAVu16 : MVE_VMLALDAV_multi<"u16", 0b0, 0b1>; -defm MVE_VMLALDAVu32 : MVE_VMLALDAV_multi<"u32", 0b1, 0b1>; +defm MVE_VMLALDAV : MVE_VMLALDAV_multi<"16", 0b0>; +defm MVE_VMLALDAV : MVE_VMLALDAV_multi<"32", 0b1>; // vmlalv aliases vmlaldav -foreach acc = ["_acc", "_noacc"] in { +foreach acc = ["", "a"] in { foreach suffix = ["s16", "s32", "u16", "u32"] in { - def : MVEInstAlias("MVE_VMLALDAV"#suffix#acc#"_noexch") + def : MVEInstAlias<"vmlalv" # acc # "${vp}." # suffix # + "\t$RdaLoDest, $RdaHiDest, $Qn, $Qm", + (!cast("MVE_VMLALDAV"#acc#suffix) tGPREven:$RdaLoDest, tGPROdd:$RdaHiDest, MQPR:$Qn, MQPR:$Qm, vpred_n:$vp)>; } } multiclass MVE_VMLSLDAV_multi pattern=[]> { - defm "" : MVE_VMLALDAVBase_XA; + bit bit_28, list pattern=[]> { + defm "" : MVE_VMLALDAVBase_AX; } -defm MVE_VMLSLDAVs16 : MVE_VMLSLDAV_multi<"vmlsldav", "s16", 0b0, 0b0>; -defm MVE_VMLSLDAVs32 : MVE_VMLSLDAV_multi<"vmlsldav", "s32", 0b1, 0b0>; -defm MVE_VRMLSLDAVHs32 : MVE_VMLSLDAV_multi<"vrmlsldavh", "s32", 0b0, 0b1>; +defm MVE_VMLSLDAV : MVE_VMLSLDAV_multi<"vmlsldav", "s16", 0b0, 0b0>; +defm MVE_VMLSLDAV : MVE_VMLSLDAV_multi<"vmlsldav", "s32", 0b1, 0b0>; +defm MVE_VRMLSLDAVH : MVE_VMLSLDAV_multi<"vrmlsldavh", "s32", 0b0, 0b1>; // end of mve_rDest instructions @@ -967,11 +1031,12 @@ def MVE_VBIC : MVE_bit_arith<(outs MQPR:$Qd), (ins MQPR:$Qn, MQPR:$Qm), let Inst{6} = 0b1; let Inst{4} = 0b1; let Inst{0} = 0b0; + let validForTailPredication = 1; } -class MVE_VREV size, bits<2> bit_8_7> +class MVE_VREV size, bits<2> bit_8_7, string cstr=""> : MVE_bit_arith<(outs MQPR:$Qd), (ins MQPR:$Qm), iname, - suffix, "$Qd, $Qm", ""> { + suffix, "$Qd, $Qm", cstr> { let Inst{28} = 0b1; let Inst{25-23} = 0b111; @@ -985,15 +1050,22 @@ class MVE_VREV size, bits<2> bit_8_7> let Inst{0} = 0b0; } -def MVE_VREV64_8 : MVE_VREV<"vrev64", "8", 0b00, 0b00>; -def MVE_VREV64_16 : MVE_VREV<"vrev64", "16", 0b01, 0b00>; -def MVE_VREV64_32 : MVE_VREV<"vrev64", "32", 0b10, 0b00>; +def MVE_VREV64_8 : MVE_VREV<"vrev64", "8", 0b00, 0b00, "@earlyclobber $Qd">; +def MVE_VREV64_16 : MVE_VREV<"vrev64", "16", 0b01, 0b00, "@earlyclobber $Qd">; +def MVE_VREV64_32 : MVE_VREV<"vrev64", "32", 0b10, 0b00, "@earlyclobber $Qd">; def MVE_VREV32_8 : MVE_VREV<"vrev32", "8", 0b00, 0b01>; def MVE_VREV32_16 : MVE_VREV<"vrev32", "16", 0b01, 0b01>; def MVE_VREV16_8 : MVE_VREV<"vrev16", "8", 0b00, 0b10>; +let Predicates = [HasMVEInt] in { + def : Pat<(v8i16 (bswap (v8i16 MQPR:$src))), + (v8i16 (MVE_VREV16_8 (v8i16 MQPR:$src)))>; + def : Pat<(v4i32 (bswap (v4i32 MQPR:$src))), + (v4i32 (MVE_VREV32_8 (v4i32 MQPR:$src)))>; +} + let Predicates = [HasMVEInt] in { def : Pat<(v4i32 (ARMvrev64 (v4i32 MQPR:$src))), (v4i32 (MVE_VREV64_32 (v4i32 MQPR:$src)))>; @@ -1026,6 +1098,7 @@ def MVE_VMVN : MVE_bit_arith<(outs MQPR:$Qd), (ins MQPR:$Qm), let Inst{12-6} = 0b0010111; let Inst{4} = 0b0; let Inst{0} = 0b0; + let validForTailPredication = 1; } let Predicates = [HasMVEInt] in { @@ -1054,6 +1127,7 @@ class MVE_bit_ops bit_21_20, bit bit_28> let Inst{6} = 0b1; let Inst{4} = 0b1; let Inst{0} = 0b0; + let validForTailPredication = 1; } def MVE_VEOR : MVE_bit_ops<"veor", 0b00, 0b1>; @@ -1145,6 +1219,7 @@ class MVE_bit_cmode cmode, dag inOps> class MVE_VORR cmode, ExpandImm imm_type> : MVE_bit_cmode<"vorr", suffix, cmode, (ins MQPR:$Qd_src, imm_type:$imm)> { let Inst{5} = 0b0; + let validForTailPredication = 1; } def MVE_VORRIZ0v4i32 : MVE_VORR<"i32", 
0b0001, expzero00>; @@ -1173,6 +1248,7 @@ def MVE_VMOV : MVEInstAlias<"vmov${vp}\t$Qd, $Qm", class MVE_VBIC cmode, ExpandImm imm_type> : MVE_bit_cmode<"vbic", suffix, cmode, (ins MQPR:$Qd_src, imm_type:$imm)> { let Inst{5} = 0b1; + let validForTailPredication = 1; } def MVE_VBICIZ0v4i32 : MVE_VBIC<"i32", 0b0001, expzero00>; @@ -1315,8 +1391,12 @@ let Predicates = [HasMVEInt] in { def : Pat<(insertelt (v8f16 MQPR:$src1), HPR:$src2, imm:$lane), (MVE_VMOV_to_lane_16 MQPR:$src1, (COPY_TO_REGCLASS HPR:$src2, rGPR), imm:$lane)>; - def : Pat<(extractelt (v8f16 MQPR:$src), imm:$lane), - (COPY_TO_REGCLASS (MVE_VMOV_from_lane_u16 MQPR:$src, imm:$lane), HPR)>; + def : Pat<(extractelt (v8f16 MQPR:$src), imm_even:$lane), + (EXTRACT_SUBREG MQPR:$src, (SSubReg_f16_reg imm_even:$lane))>; + def : Pat<(extractelt (v8f16 MQPR:$src), imm_odd:$lane), + (COPY_TO_REGCLASS + (VMOVH (EXTRACT_SUBREG MQPR:$src, (SSubReg_f16_reg imm_odd:$lane))), + HPR)>; def : Pat<(v4f32 (scalar_to_vector SPR:$src)), (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), SPR:$src, ssub_0)>; @@ -1408,6 +1488,7 @@ class MVE_VADDSUB size, bit subtract, let Inst{12-8} = 0b01000; let Inst{4} = 0b0; let Inst{0} = 0b0; + let validForTailPredication = 1; } class MVE_VADD size, list pattern=[]> @@ -1442,8 +1523,8 @@ let Predicates = [HasMVEInt] in { } class MVE_VQADDSUB size, list pattern=[]> - : MVE_int { + bits<2> size, ValueType vt> + : MVE_int { let Inst{28} = U; let Inst{25-23} = 0b110; @@ -1453,26 +1534,49 @@ class MVE_VQADDSUB size, list pattern=[]> - : MVE_VQADDSUB<"vqadd", suffix, U, 0b0, size, pattern>; -class MVE_VQSUB size, list pattern=[]> - : MVE_VQADDSUB<"vqsub", suffix, U, 0b1, size, pattern>; +class MVE_VQADD size, ValueType VT> + : MVE_VQADDSUB<"vqadd", suffix, U, 0b0, size, VT>; +class MVE_VQSUB size, ValueType VT> + : MVE_VQADDSUB<"vqsub", suffix, U, 0b1, size, VT>; -def MVE_VQADDs8 : MVE_VQADD<"s8", 0b0, 0b00>; -def MVE_VQADDs16 : MVE_VQADD<"s16", 0b0, 0b01>; -def MVE_VQADDs32 : MVE_VQADD<"s32", 0b0, 0b10>; -def MVE_VQADDu8 : MVE_VQADD<"u8", 0b1, 0b00>; -def MVE_VQADDu16 : MVE_VQADD<"u16", 0b1, 0b01>; -def MVE_VQADDu32 : MVE_VQADD<"u32", 0b1, 0b10>; +def MVE_VQADDs8 : MVE_VQADD<"s8", 0b0, 0b00, v16i8>; +def MVE_VQADDs16 : MVE_VQADD<"s16", 0b0, 0b01, v8i16>; +def MVE_VQADDs32 : MVE_VQADD<"s32", 0b0, 0b10, v4i32>; +def MVE_VQADDu8 : MVE_VQADD<"u8", 0b1, 0b00, v16i8>; +def MVE_VQADDu16 : MVE_VQADD<"u16", 0b1, 0b01, v8i16>; +def MVE_VQADDu32 : MVE_VQADD<"u32", 0b1, 0b10, v4i32>; + +def MVE_VQSUBs8 : MVE_VQSUB<"s8", 0b0, 0b00, v16i8>; +def MVE_VQSUBs16 : MVE_VQSUB<"s16", 0b0, 0b01, v8i16>; +def MVE_VQSUBs32 : MVE_VQSUB<"s32", 0b0, 0b10, v4i32>; +def MVE_VQSUBu8 : MVE_VQSUB<"u8", 0b1, 0b00, v16i8>; +def MVE_VQSUBu16 : MVE_VQSUB<"u16", 0b1, 0b01, v8i16>; +def MVE_VQSUBu32 : MVE_VQSUB<"u32", 0b1, 0b10, v4i32>; + +let Predicates = [HasMVEInt] in { + foreach instr = [MVE_VQADDu8, MVE_VQADDu16, MVE_VQADDu32] in + foreach VT = [instr.VT] in + def : Pat<(VT (uaddsat (VT MQPR:$Qm), (VT MQPR:$Qn))), + (VT (instr (VT MQPR:$Qm), (VT MQPR:$Qn)))>; + foreach instr = [MVE_VQADDs8, MVE_VQADDs16, MVE_VQADDs32] in + foreach VT = [instr.VT] in + def : Pat<(VT (saddsat (VT MQPR:$Qm), (VT MQPR:$Qn))), + (VT (instr (VT MQPR:$Qm), (VT MQPR:$Qn)))>; + foreach instr = [MVE_VQSUBu8, MVE_VQSUBu16, MVE_VQSUBu32] in + foreach VT = [instr.VT] in + def : Pat<(VT (usubsat (VT MQPR:$Qm), (VT MQPR:$Qn))), + (VT (instr (VT MQPR:$Qm), (VT MQPR:$Qn)))>; + foreach instr = [MVE_VQSUBs8, MVE_VQSUBs16, MVE_VQSUBs32] in + foreach VT = [instr.VT] in + def : Pat<(VT (ssubsat (VT 
MQPR:$Qm), (VT MQPR:$Qn))), + (VT (instr (VT MQPR:$Qm), (VT MQPR:$Qn)))>; +} -def MVE_VQSUBs8 : MVE_VQSUB<"s8", 0b0, 0b00>; -def MVE_VQSUBs16 : MVE_VQSUB<"s16", 0b0, 0b01>; -def MVE_VQSUBs32 : MVE_VQSUB<"s32", 0b0, 0b10>; -def MVE_VQSUBu8 : MVE_VQSUB<"u8", 0b1, 0b00>; -def MVE_VQSUBu16 : MVE_VQSUB<"u16", 0b1, 0b01>; -def MVE_VQSUBu32 : MVE_VQSUB<"u32", 0b1, 0b10>; class MVE_VABD_int size, list pattern=[]> : MVE_int<"vabd", suffix, size, pattern> { @@ -1483,6 +1587,7 @@ class MVE_VABD_int size, list pattern=[]> let Inst{12-8} = 0b00111; let Inst{4} = 0b0; let Inst{0} = 0b0; + let validForTailPredication = 1; } def MVE_VABDs8 : MVE_VABD_int<"s8", 0b0, 0b00>; @@ -1501,6 +1606,7 @@ class MVE_VRHADD size, list pattern=[]> let Inst{12-8} = 0b00001; let Inst{4} = 0b0; let Inst{0} = 0b0; + let validForTailPredication = 1; } def MVE_VRHADDs8 : MVE_VRHADD<"s8", 0b0, 0b00>; @@ -1522,6 +1628,7 @@ class MVE_VHADDSUB size, @@ -1545,6 +1652,60 @@ def MVE_VHSUBu8 : MVE_VHSUB<"u8", 0b1, 0b00>; def MVE_VHSUBu16 : MVE_VHSUB<"u16", 0b1, 0b01>; def MVE_VHSUBu32 : MVE_VHSUB<"u32", 0b1, 0b10>; +let Predicates = [HasMVEInt] in { + def : Pat<(v16i8 (ARMvshrsImm + (v16i8 (add (v16i8 MQPR:$v1), (v16i8 MQPR:$v2))), 1)), + (v16i8 (MVE_VHADDs8 + (v16i8 MQPR:$v1), (v16i8 MQPR:$v2)))>; + def : Pat<(v8i16 (ARMvshrsImm + (v8i16 (add (v8i16 MQPR:$v1), (v8i16 MQPR:$v2))), 1)), + (v8i16 (MVE_VHADDs16 + (v8i16 MQPR:$v1), (v8i16 MQPR:$v2)))>; + def : Pat<(v4i32 (ARMvshrsImm + (v4i32 (add (v4i32 MQPR:$v1), (v4i32 MQPR:$v2))), 1)), + (v4i32 (MVE_VHADDs32 + (v4i32 MQPR:$v1), (v4i32 MQPR:$v2)))>; + + def : Pat<(v16i8 (ARMvshruImm + (v16i8 (add (v16i8 MQPR:$v1), (v16i8 MQPR:$v2))), 1)), + (v16i8 (MVE_VHADDu8 + (v16i8 MQPR:$v1), (v16i8 MQPR:$v2)))>; + def : Pat<(v8i16 (ARMvshruImm + (v8i16 (add (v8i16 MQPR:$v1), (v8i16 MQPR:$v2))), 1)), + (v8i16 (MVE_VHADDu16 + (v8i16 MQPR:$v1), (v8i16 MQPR:$v2)))>; + def : Pat<(v4i32 (ARMvshruImm + (v4i32 (add (v4i32 MQPR:$v1), (v4i32 MQPR:$v2))), 1)), + (v4i32 (MVE_VHADDu32 + (v4i32 MQPR:$v1), (v4i32 MQPR:$v2)))>; + + def : Pat<(v16i8 (ARMvshrsImm + (v16i8 (sub (v16i8 MQPR:$v1), (v16i8 MQPR:$v2))), 1)), + (v16i8 (MVE_VHSUBs8 + (v16i8 MQPR:$v1), (v16i8 MQPR:$v2)))>; + def : Pat<(v8i16 (ARMvshrsImm + (v8i16 (sub (v8i16 MQPR:$v1), (v8i16 MQPR:$v2))), 1)), + (v8i16 (MVE_VHSUBs16 + (v8i16 MQPR:$v1), (v8i16 MQPR:$v2)))>; + def : Pat<(v4i32 (ARMvshrsImm + (v4i32 (sub (v4i32 MQPR:$v1), (v4i32 MQPR:$v2))), 1)), + (v4i32 (MVE_VHSUBs32 + (v4i32 MQPR:$v1), (v4i32 MQPR:$v2)))>; + + def : Pat<(v16i8 (ARMvshruImm + (v16i8 (sub (v16i8 MQPR:$v1), (v16i8 MQPR:$v2))), 1)), + (v16i8 (MVE_VHSUBu8 + (v16i8 MQPR:$v1), (v16i8 MQPR:$v2)))>; + def : Pat<(v8i16 (ARMvshruImm + (v8i16 (sub (v8i16 MQPR:$v1), (v8i16 MQPR:$v2))), 1)), + (v8i16 (MVE_VHSUBu16 + (v8i16 MQPR:$v1), (v8i16 MQPR:$v2)))>; + def : Pat<(v4i32 (ARMvshruImm + (v4i32 (sub (v4i32 MQPR:$v1), (v4i32 MQPR:$v2))), 1)), + (v4i32 (MVE_VHSUBu32 + (v4i32 MQPR:$v1), (v4i32 MQPR:$v2)))>; +} + class MVE_VDUP pattern=[]> : MVE_p<(outs MQPR:$Qd), (ins rGPR:$Rt), NoItinerary, "vdup", suffix, "$Qd, $Rt", vpred_r, "", pattern> { @@ -1563,6 +1724,7 @@ class MVE_VDUP pattern=[]> let Inst{6} = 0b0; let Inst{5} = E; let Inst{4-0} = 0b10000; + let validForTailPredication = 1; } def MVE_VDUP32 : MVE_VDUP<"32", 0b0, 0b0>; @@ -1625,6 +1787,7 @@ class MVE_VCLSCLZ size, let Inst{6} = 0b1; let Inst{4} = 0b0; let Inst{0} = 0b0; + let validForTailPredication = 1; } def MVE_VCLSs8 : MVE_VCLSCLZ<"vcls", "s8", 0b00, 0b0>; @@ -1635,6 +1798,15 @@ def MVE_VCLZs8 : MVE_VCLSCLZ<"vclz", "i8", 
0b00, 0b1>; def MVE_VCLZs16 : MVE_VCLSCLZ<"vclz", "i16", 0b01, 0b1>; def MVE_VCLZs32 : MVE_VCLSCLZ<"vclz", "i32", 0b10, 0b1>; +let Predicates = [HasMVEInt] in { + def : Pat<(v16i8 ( ctlz (v16i8 MQPR:$val1))), + (v16i8 ( MVE_VCLZs8 (v16i8 MQPR:$val1)))>; + def : Pat<(v4i32 ( ctlz (v4i32 MQPR:$val1))), + (v4i32 ( MVE_VCLZs32 (v4i32 MQPR:$val1)))>; + def : Pat<(v8i16 ( ctlz (v8i16 MQPR:$val1))), + (v8i16 ( MVE_VCLZs16 (v8i16 MQPR:$val1)))>; +} + class MVE_VABSNEG_int size, bit negate, list pattern=[]> : MVEIntSingleSrc { @@ -1648,6 +1820,7 @@ class MVE_VABSNEG_int size, bit negate, let Inst{6} = 0b1; let Inst{4} = 0b0; let Inst{0} = 0b0; + let validForTailPredication = 1; } def MVE_VABSs8 : MVE_VABSNEG_int<"vabs", "s8", 0b00, 0b0>; @@ -1689,6 +1862,7 @@ class MVE_VQABSNEG size, let Inst{6} = 0b1; let Inst{4} = 0b0; let Inst{0} = 0b0; + let validForTailPredication = 1; } def MVE_VQABSs8 : MVE_VQABSNEG<"vqabs", "s8", 0b00, 0b0>; @@ -1720,6 +1894,7 @@ class MVE_mod_imm cmode, bit op, let Inst{3-0} = imm{3-0}; let DecoderMethod = "DecodeMVEModImmInstruction"; + let validForTailPredication = 1; } let isReMaterializable = 1 in { @@ -2115,6 +2290,7 @@ class MVE_shift_by_vec { @@ -2163,6 +2339,7 @@ class MVE_shift_with_imm @@ -2175,6 +2352,7 @@ class MVE_VSxI_imm let Inst{21-16} = imm; let Inst{10-9} = 0b10; let Inst{8} = bit_8; + let validForTailPredication = 1; } def MVE_VSRIimm8 : MVE_VSxI_imm<"vsri", "8", 0b0, (ins shr_imm8:$imm)> { @@ -2427,6 +2605,7 @@ class MVE_VRINT op, string suffix, bits<2> size, let Inst{11-10} = 0b01; let Inst{9-7} = op{2-0}; let Inst{4} = 0b0; + let validForTailPredication = 1; } @@ -2489,6 +2668,7 @@ class MVE_VMUL_fp pattern=[]> let Inst{12-8} = 0b01101; let Inst{7} = Qn{3}; let Inst{4} = 0b1; + let validForTailPredication = 1; } def MVE_VMULf32 : MVE_VMUL_fp<"f32", 0b0>; @@ -2556,8 +2736,38 @@ def MVE_VFMSf32 : MVE_VADDSUBFMA_fp<"vfms", "f32", 0b0, 0b1, 0b0, 0b1, def MVE_VFMSf16 : MVE_VADDSUBFMA_fp<"vfms", "f16", 0b1, 0b1, 0b0, 0b1, (ins MQPR:$Qd_src), vpred_n, "$Qd = $Qd_src">; -def MVE_VADDf32 : MVE_VADDSUBFMA_fp<"vadd", "f32", 0b0, 0b0, 0b1, 0b0>; -def MVE_VADDf16 : MVE_VADDSUBFMA_fp<"vadd", "f16", 0b1, 0b0, 0b1, 0b0>; +let Predicates = [HasMVEFloat, UseFusedMAC] in { + def : Pat<(v8f16 (fadd (v8f16 MQPR:$src1), + (fmul (v8f16 MQPR:$src2), + (v8f16 MQPR:$src3)))), + (v8f16 (MVE_VFMAf16 $src1, $src2, $src3))>; + def : Pat<(v4f32 (fadd (v4f32 MQPR:$src1), + (fmul (v4f32 MQPR:$src2), + (v4f32 MQPR:$src3)))), + (v4f32 (MVE_VFMAf32 $src1, $src2, $src3))>; + + def : Pat<(v8f16 (fsub (v8f16 MQPR:$src1), + (fmul (v8f16 MQPR:$src2), + (v8f16 MQPR:$src3)))), + (v8f16 (MVE_VFMSf16 $src1, $src2, $src3))>; + def : Pat<(v4f32 (fsub (v4f32 MQPR:$src1), + (fmul (v4f32 MQPR:$src2), + (v4f32 MQPR:$src3)))), + (v4f32 (MVE_VFMSf32 $src1, $src2, $src3))>; +} + +let Predicates = [HasMVEFloat] in { + def : Pat<(v8f16 (fma (v8f16 MQPR:$src1), (v8f16 MQPR:$src2), (v8f16 MQPR:$src3))), + (v8f16 (MVE_VFMAf16 $src3, $src1, $src2))>; + def : Pat<(v4f32 (fma (v4f32 MQPR:$src1), (v4f32 MQPR:$src2), (v4f32 MQPR:$src3))), + (v4f32 (MVE_VFMAf32 $src3, $src1, $src2))>; +} + + +let validForTailPredication = 1 in { + def MVE_VADDf32 : MVE_VADDSUBFMA_fp<"vadd", "f32", 0b0, 0b0, 0b1, 0b0>; + def MVE_VADDf16 : MVE_VADDSUBFMA_fp<"vadd", "f16", 0b1, 0b0, 0b1, 0b0>; +} let Predicates = [HasMVEFloat] in { def : Pat<(v4f32 (fadd (v4f32 MQPR:$val1), (v4f32 MQPR:$val2))), @@ -2566,8 +2776,11 @@ let Predicates = [HasMVEFloat] in { (v8f16 (MVE_VADDf16 (v8f16 MQPR:$val1), (v8f16 MQPR:$val2)))>; } -def MVE_VSUBf32 
: MVE_VADDSUBFMA_fp<"vsub", "f32", 0b0, 0b0, 0b1, 0b1>; -def MVE_VSUBf16 : MVE_VADDSUBFMA_fp<"vsub", "f16", 0b1, 0b0, 0b1, 0b1>; + +let validForTailPredication = 1 in { + def MVE_VSUBf32 : MVE_VADDSUBFMA_fp<"vsub", "f32", 0b0, 0b0, 0b1, 0b1>; + def MVE_VSUBf16 : MVE_VADDSUBFMA_fp<"vsub", "f16", 0b1, 0b0, 0b1, 0b1>; +} let Predicates = [HasMVEFloat] in { def : Pat<(v4f32 (fsub (v4f32 MQPR:$val1), (v4f32 MQPR:$val2))), @@ -2576,10 +2789,10 @@ let Predicates = [HasMVEFloat] in { (v8f16 (MVE_VSUBf16 (v8f16 MQPR:$val1), (v8f16 MQPR:$val2)))>; } -class MVE_VCADD pattern=[]> +class MVE_VCADD pattern=[]> : MVEFloatArithNeon<"vcadd", suffix, size, (outs MQPR:$Qd), (ins MQPR:$Qn, MQPR:$Qm, complexrotateopodd:$rot), - "$Qd, $Qn, $Qm, $rot", vpred_r, "", pattern> { + "$Qd, $Qn, $Qm, $rot", vpred_r, cstr, pattern> { bits<4> Qd; bits<4> Qn; bit rot; @@ -2598,7 +2811,7 @@ class MVE_VCADD pattern=[]> } def MVE_VCADDf16 : MVE_VCADD<"f16", 0b0>; -def MVE_VCADDf32 : MVE_VCADD<"f32", 0b1>; +def MVE_VCADDf32 : MVE_VCADD<"f32", 0b1, "@earlyclobber $Qd">; class MVE_VABD_fp : MVE_float<"vabd", suffix, (outs MQPR:$Qd), (ins MQPR:$Qn, MQPR:$Qm), @@ -2617,6 +2830,7 @@ class MVE_VABD_fp let Inst{11-8} = 0b1101; let Inst{7} = Qn{3}; let Inst{4} = 0b0; + let validForTailPredication = 1; } def MVE_VABDf32 : MVE_VABD_fp<"f32", 0b0>; @@ -2643,6 +2857,7 @@ class MVE_VCVT_fix : AsmOperandClass { @@ -2693,6 +2908,7 @@ class MVE_VCVT_fp_int_anpm size, bit op, string anpm, let Inst{9-8} = rm; let Inst{7} = op; let Inst{4} = 0b0; + let validForTailPredication = 1; } multiclass MVE_VCVT_fp_int_anpm_multi size, bit op, @@ -2727,6 +2943,7 @@ class MVE_VCVT_fp_int size, bits<2> op, let Inst{12-9} = 0b0011; let Inst{8-7} = op; let Inst{4} = 0b0; + let validForTailPredication = 1; } // The unsuffixed VCVT for float->int implicitly rounds toward zero, @@ -2776,6 +2993,7 @@ class MVE_VABSNEG_fp size, bit negate, let Inst{11-8} = 0b0111; let Inst{7} = negate; let Inst{4} = 0b0; + let validForTailPredication = 1; } def MVE_VABSf16 : MVE_VABSNEG_fp<"vabs", "f16", 0b01, 0b0>; @@ -2863,6 +3081,7 @@ class MVE_VCMPqq bits_21_20, // decoder to emit an operand that isn't affected by any instruction // bit. 
let DecoderMethod = "DecodeMVEVCMP"; + let validForTailPredication = 1; } class MVE_VCMPqqf @@ -2927,6 +3146,7 @@ class MVE_VCMPqr bits_21_20, let Constraints = ""; // Custom decoder method, for the same reason as MVE_VCMPqq let DecoderMethod = "DecodeMVEVCMP"; + let validForTailPredication = 1; } class MVE_VCMPqrf @@ -2966,6 +3186,168 @@ def MVE_VCMPs8r : MVE_VCMPqrs<"s8", 0b00>; def MVE_VCMPs16r : MVE_VCMPqrs<"s16", 0b01>; def MVE_VCMPs32r : MVE_VCMPqrs<"s32", 0b10>; +multiclass unpred_vcmp_z { + def i8 : Pat<(v16i1 (ARMvcmpz (v16i8 MQPR:$v1), (i32 fc))), + (v16i1 (!cast("MVE_VCMP"#suffix#"8r") (v16i8 MQPR:$v1), ZR, fc))>; + def i16 : Pat<(v8i1 (ARMvcmpz (v8i16 MQPR:$v1), (i32 fc))), + (v8i1 (!cast("MVE_VCMP"#suffix#"16r") (v8i16 MQPR:$v1), ZR, fc))>; + def i32 : Pat<(v4i1 (ARMvcmpz (v4i32 MQPR:$v1), (i32 fc))), + (v4i1 (!cast("MVE_VCMP"#suffix#"32r") (v4i32 MQPR:$v1), ZR, fc))>; + + def : Pat<(v16i1 (and (v16i1 VCCR:$p1), (v16i1 (ARMvcmpz (v16i8 MQPR:$v1), (i32 fc))))), + (v16i1 (!cast("MVE_VCMP"#suffix#"8r") (v16i8 MQPR:$v1), ZR, fc, 1, VCCR:$p1))>; + def : Pat<(v8i1 (and (v8i1 VCCR:$p1), (v8i1 (ARMvcmpz (v8i16 MQPR:$v1), (i32 fc))))), + (v8i1 (!cast("MVE_VCMP"#suffix#"16r") (v8i16 MQPR:$v1), ZR, fc, 1, VCCR:$p1))>; + def : Pat<(v4i1 (and (v4i1 VCCR:$p1), (v4i1 (ARMvcmpz (v4i32 MQPR:$v1), (i32 fc))))), + (v4i1 (!cast("MVE_VCMP"#suffix#"32r") (v4i32 MQPR:$v1), ZR, fc, 1, VCCR:$p1))>; +} + +multiclass unpred_vcmp_r { + def i8 : Pat<(v16i1 (ARMvcmp (v16i8 MQPR:$v1), (v16i8 MQPR:$v2), (i32 fc))), + (v16i1 (!cast("MVE_VCMP"#suffix#"8") (v16i8 MQPR:$v1), (v16i8 MQPR:$v2), fc))>; + def i16 : Pat<(v8i1 (ARMvcmp (v8i16 MQPR:$v1), (v8i16 MQPR:$v2), (i32 fc))), + (v8i1 (!cast("MVE_VCMP"#suffix#"16") (v8i16 MQPR:$v1), (v8i16 MQPR:$v2), fc))>; + def i32 : Pat<(v4i1 (ARMvcmp (v4i32 MQPR:$v1), (v4i32 MQPR:$v2), (i32 fc))), + (v4i1 (!cast("MVE_VCMP"#suffix#"32") (v4i32 MQPR:$v1), (v4i32 MQPR:$v2), fc))>; + + def i8r : Pat<(v16i1 (ARMvcmp (v16i8 MQPR:$v1), (v16i8 (ARMvdup GPR:$v2)), (i32 fc))), + (v16i1 (!cast("MVE_VCMP"#suffix#"8r") (v16i8 MQPR:$v1), (i32 GPR:$v2), fc))>; + def i16r : Pat<(v8i1 (ARMvcmp (v8i16 MQPR:$v1), (v8i16 (ARMvdup GPR:$v2)), (i32 fc))), + (v8i1 (!cast("MVE_VCMP"#suffix#"16r") (v8i16 MQPR:$v1), (i32 GPR:$v2), fc))>; + def i32r : Pat<(v4i1 (ARMvcmp (v4i32 MQPR:$v1), (v4i32 (ARMvdup GPR:$v2)), (i32 fc))), + (v4i1 (!cast("MVE_VCMP"#suffix#"32r") (v4i32 MQPR:$v1), (i32 GPR:$v2), fc))>; + + def : Pat<(v16i1 (and (v16i1 VCCR:$p1), (v16i1 (ARMvcmp (v16i8 MQPR:$v1), (v16i8 MQPR:$v2), (i32 fc))))), + (v16i1 (!cast("MVE_VCMP"#suffix#"8") (v16i8 MQPR:$v1), (v16i8 MQPR:$v2), fc, 1, VCCR:$p1))>; + def : Pat<(v8i1 (and (v8i1 VCCR:$p1), (v8i1 (ARMvcmp (v8i16 MQPR:$v1), (v8i16 MQPR:$v2), (i32 fc))))), + (v8i1 (!cast("MVE_VCMP"#suffix#"16") (v8i16 MQPR:$v1), (v8i16 MQPR:$v2), fc, 1, VCCR:$p1))>; + def : Pat<(v4i1 (and (v4i1 VCCR:$p1), (v4i1 (ARMvcmp (v4i32 MQPR:$v1), (v4i32 MQPR:$v2), (i32 fc))))), + (v4i1 (!cast("MVE_VCMP"#suffix#"32") (v4i32 MQPR:$v1), (v4i32 MQPR:$v2), fc, 1, VCCR:$p1))>; + + def : Pat<(v16i1 (and (v16i1 VCCR:$p1), (v16i1 (ARMvcmp (v16i8 MQPR:$v1), (v16i8 (ARMvdup GPR:$v2)), (i32 fc))))), + (v16i1 (!cast("MVE_VCMP"#suffix#"8r") (v16i8 MQPR:$v1), (i32 GPR:$v2), fc, 1, VCCR:$p1))>; + def : Pat<(v8i1 (and (v8i1 VCCR:$p1), (v8i1 (ARMvcmp (v8i16 MQPR:$v1), (v8i16 (ARMvdup GPR:$v2)), (i32 fc))))), + (v8i1 (!cast("MVE_VCMP"#suffix#"16r") (v8i16 MQPR:$v1), (i32 GPR:$v2), fc, 1, VCCR:$p1))>; + def : Pat<(v4i1 (and (v4i1 VCCR:$p1), (v4i1 (ARMvcmp (v4i32 MQPR:$v1), (v4i32 (ARMvdup 
GPR:$v2)), (i32 fc))))), + (v4i1 (!cast("MVE_VCMP"#suffix#"32r") (v4i32 MQPR:$v1), (i32 GPR:$v2), fc, 1, VCCR:$p1))>; +} + +multiclass unpred_vcmpf_z { + def f16 : Pat<(v8i1 (ARMvcmpz (v8f16 MQPR:$v1), (i32 fc))), + (v8i1 (MVE_VCMPf16r (v8f16 MQPR:$v1), ZR, fc))>; + def f32 : Pat<(v4i1 (ARMvcmpz (v4f32 MQPR:$v1), (i32 fc))), + (v4i1 (MVE_VCMPf32r (v4f32 MQPR:$v1), ZR, fc))>; + + def : Pat<(v8i1 (and (v8i1 VCCR:$p1), (v8i1 (ARMvcmpz (v8f16 MQPR:$v1), (i32 fc))))), + (v8i1 (MVE_VCMPf32r (v8f16 MQPR:$v1), ZR, fc, 1, VCCR:$p1))>; + def : Pat<(v4i1 (and (v4i1 VCCR:$p1), (v4i1 (ARMvcmpz (v4f32 MQPR:$v1), (i32 fc))))), + (v4i1 (MVE_VCMPf32r (v4f32 MQPR:$v1), ZR, fc, 1, VCCR:$p1))>; +} + +multiclass unpred_vcmpf_r { + def f16 : Pat<(v8i1 (ARMvcmp (v8f16 MQPR:$v1), (v8f16 MQPR:$v2), (i32 fc))), + (v8i1 (MVE_VCMPf16 (v8f16 MQPR:$v1), (v8f16 MQPR:$v2), fc))>; + def f32 : Pat<(v4i1 (ARMvcmp (v4f32 MQPR:$v1), (v4f32 MQPR:$v2), (i32 fc))), + (v4i1 (MVE_VCMPf32 (v4f32 MQPR:$v1), (v4f32 MQPR:$v2), fc))>; + + def f16r : Pat<(v8i1 (ARMvcmp (v8f16 MQPR:$v1), (v8f16 (ARMvdup HPR:$v2)), (i32 fc))), + (v8i1 (MVE_VCMPf16r (v8f16 MQPR:$v1), (i32 (COPY_TO_REGCLASS (f16 HPR:$v2), rGPR)), fc))>; + def f32r : Pat<(v4i1 (ARMvcmp (v4f32 MQPR:$v1), (v4f32 (ARMvdup SPR:$v2)), (i32 fc))), + (v4i1 (MVE_VCMPf32r (v4f32 MQPR:$v1), (i32 (COPY_TO_REGCLASS (f32 SPR:$v2), rGPR)), fc))>; + + def : Pat<(v8i1 (and (v8i1 VCCR:$p1), (v8i1 (ARMvcmp (v8f16 MQPR:$v1), (v8f16 MQPR:$v2), (i32 fc))))), + (v8i1 (MVE_VCMPf16 (v8f16 MQPR:$v1), (v8f16 MQPR:$v2), fc, 1, VCCR:$p1))>; + def : Pat<(v4i1 (and (v4i1 VCCR:$p1), (v4i1 (ARMvcmp (v4f32 MQPR:$v1), (v4f32 MQPR:$v2), (i32 fc))))), + (v4i1 (MVE_VCMPf32 (v4f32 MQPR:$v1), (v4f32 MQPR:$v2), fc, 1, VCCR:$p1))>; + + def : Pat<(v8i1 (and (v8i1 VCCR:$p1), (v8i1 (ARMvcmp (v8f16 MQPR:$v1), (v8f16 (ARMvdup HPR:$v2)), (i32 fc))))), + (v8i1 (MVE_VCMPf16r (v8f16 MQPR:$v1), (i32 (COPY_TO_REGCLASS (f16 HPR:$v2), rGPR)), fc, 1, VCCR:$p1))>; + def : Pat<(v4i1 (and (v4i1 VCCR:$p1), (v4i1 (ARMvcmp (v4f32 MQPR:$v1), (v4f32 (ARMvdup SPR:$v2)), (i32 fc))))), + (v4i1 (MVE_VCMPf32r (v4f32 MQPR:$v1), (i32 (COPY_TO_REGCLASS (f32 SPR:$v2), rGPR)), fc, 1, VCCR:$p1))>; +} + +let Predicates = [HasMVEInt] in { + defm MVE_VCEQZ : unpred_vcmp_z<"i", 0>; + defm MVE_VCNEZ : unpred_vcmp_z<"i", 1>; + defm MVE_VCGEZ : unpred_vcmp_z<"s", 10>; + defm MVE_VCLTZ : unpred_vcmp_z<"s", 11>; + defm MVE_VCGTZ : unpred_vcmp_z<"s", 12>; + defm MVE_VCLEZ : unpred_vcmp_z<"s", 13>; + defm MVE_VCGTUZ : unpred_vcmp_z<"u", 8>; + defm MVE_VCGEUZ : unpred_vcmp_z<"u", 2>; + + defm MVE_VCEQ : unpred_vcmp_r<"i", 0>; + defm MVE_VCNE : unpred_vcmp_r<"i", 1>; + defm MVE_VCGE : unpred_vcmp_r<"s", 10>; + defm MVE_VCLT : unpred_vcmp_r<"s", 11>; + defm MVE_VCGT : unpred_vcmp_r<"s", 12>; + defm MVE_VCLE : unpred_vcmp_r<"s", 13>; + defm MVE_VCGTU : unpred_vcmp_r<"u", 8>; + defm MVE_VCGEU : unpred_vcmp_r<"u", 2>; +} + +let Predicates = [HasMVEFloat] in { + defm MVE_VFCEQZ : unpred_vcmpf_z<0>; + defm MVE_VFCNEZ : unpred_vcmpf_z<1>; + defm MVE_VFCGEZ : unpred_vcmpf_z<10>; + defm MVE_VFCLTZ : unpred_vcmpf_z<11>; + defm MVE_VFCGTZ : unpred_vcmpf_z<12>; + defm MVE_VFCLEZ : unpred_vcmpf_z<13>; + + defm MVE_VFCEQ : unpred_vcmpf_r<0>; + defm MVE_VFCNE : unpred_vcmpf_r<1>; + defm MVE_VFCGE : unpred_vcmpf_r<10>; + defm MVE_VFCLT : unpred_vcmpf_r<11>; + defm MVE_VFCGT : unpred_vcmpf_r<12>; + defm MVE_VFCLE : unpred_vcmpf_r<13>; +} + + +// Extra "worst case" and/or/xor patterns, going into and out of GPR +multiclass two_predops { + def v16i1 : Pat<(v16i1 (opnode (v16i1
VCCR:$p1), (v16i1 VCCR:$p2))), + (v16i1 (COPY_TO_REGCLASS + (insn (i32 (COPY_TO_REGCLASS (v16i1 VCCR:$p1), rGPR)), + (i32 (COPY_TO_REGCLASS (v16i1 VCCR:$p2), rGPR))), + VCCR))>; + def v8i1 : Pat<(v8i1 (opnode (v8i1 VCCR:$p1), (v8i1 VCCR:$p2))), + (v8i1 (COPY_TO_REGCLASS + (insn (i32 (COPY_TO_REGCLASS (v8i1 VCCR:$p1), rGPR)), + (i32 (COPY_TO_REGCLASS (v8i1 VCCR:$p2), rGPR))), + VCCR))>; + def v4i1 : Pat<(v4i1 (opnode (v4i1 VCCR:$p1), (v4i1 VCCR:$p2))), + (v4i1 (COPY_TO_REGCLASS + (insn (i32 (COPY_TO_REGCLASS (v4i1 VCCR:$p1), rGPR)), + (i32 (COPY_TO_REGCLASS (v4i1 VCCR:$p2), rGPR))), + VCCR))>; +} + +let Predicates = [HasMVEInt] in { + defm POR : two_predops; + defm PAND : two_predops; + defm PEOR : two_predops; +} + +// Occasionally we need to cast between a i32 and a boolean vector, for +// example when moving between rGPR and VPR.P0 as part of predicate vector +// shuffles. We also sometimes need to cast between different predicate +// vector types (v4i1<>v8i1, etc.) also as part of lowering vector shuffles. + +def predicate_cast : SDNode<"ARMISD::PREDICATE_CAST", SDTUnaryOp>; + +let Predicates = [HasMVEInt] in { + foreach VT = [ v4i1, v8i1, v16i1 ] in { + def : Pat<(i32 (predicate_cast (VT VCCR:$src))), + (i32 (COPY_TO_REGCLASS (VT VCCR:$src), VCCR))>; + def : Pat<(VT (predicate_cast (i32 VCCR:$src))), + (VT (COPY_TO_REGCLASS (i32 VCCR:$src), VCCR))>; + + foreach VT2 = [ v4i1, v8i1, v16i1 ] in + def : Pat<(VT (predicate_cast (VT2 VCCR:$src))), + (VT (COPY_TO_REGCLASS (VT2 VCCR:$src), VCCR))>; + } +} + // end of MVE compares // start of MVE_qDest_qSrc @@ -2989,10 +3371,10 @@ class MVE_qDest_qSrc size, list pattern=[]> + string suffix, bits<2> size, string cstr="", list pattern=[]> : MVE_qDest_qSrc { + vpred_n, "$Qd = $Qd_src"#cstr, pattern> { bits<4> Qn; let Inst{28} = subtract; @@ -3009,7 +3391,7 @@ multiclass MVE_VQxDMLxDH_multi { def s8 : MVE_VQxDMLxDH; def s16 : MVE_VQxDMLxDH; - def s32 : MVE_VQxDMLxDH; + def s32 : MVE_VQxDMLxDH; } defm MVE_VQDMLADH : MVE_VQxDMLxDH_multi<"vqdmladh", 0b0, 0b0, 0b0>; @@ -3021,10 +3403,10 @@ defm MVE_VQDMLSDHX : MVE_VQxDMLxDH_multi<"vqdmlsdhx", 0b1, 0b0, 0b1>; defm MVE_VQRDMLSDH : MVE_VQxDMLxDH_multi<"vqrdmlsdh", 0b0, 0b1, 0b1>; defm MVE_VQRDMLSDHX : MVE_VQxDMLxDH_multi<"vqrdmlsdhx", 0b1, 0b1, 0b1>; -class MVE_VCMUL pattern=[]> +class MVE_VCMUL pattern=[]> : MVE_qDest_qSrc { + "$Qd, $Qn, $Qm, $rot", vpred_r, cstr, pattern> { bits<4> Qn; bits<2> rot; @@ -3041,13 +3423,13 @@ class MVE_VCMUL pattern=[]> } def MVE_VCMULf16 : MVE_VCMUL<"vcmul", "f16", 0b0>; -def MVE_VCMULf32 : MVE_VCMUL<"vcmul", "f32", 0b1>; +def MVE_VCMULf32 : MVE_VCMUL<"vcmul", "f32", 0b1, "@earlyclobber $Qd">; class MVE_VMULL bits_21_20, - bit T, list pattern=[]> + bit T, string cstr, list pattern=[]> : MVE_qDest_qSrc { + vpred_r, cstr, pattern> { bits<4> Qd; bits<4> Qn; bits<4> Qm; @@ -3063,9 +3445,9 @@ class MVE_VMULL bits_21_20, } multiclass MVE_VMULL_multi bits_21_20> { - def bh : MVE_VMULL; - def th : MVE_VMULL; + bit bit_28, bits<2> bits_21_20, string cstr=""> { + def bh : MVE_VMULL; + def th : MVE_VMULL; } // For integer multiplies, bits 21:20 encode size, and bit 28 signedness. 
@@ -3074,10 +3456,10 @@ multiclass MVE_VMULL_multi; defm MVE_VMULLs16 : MVE_VMULL_multi<"vmull", "s16", 0b0, 0b01>; -defm MVE_VMULLs32 : MVE_VMULL_multi<"vmull", "s32", 0b0, 0b10>; +defm MVE_VMULLs32 : MVE_VMULL_multi<"vmull", "s32", 0b0, 0b10, "@earlyclobber $Qd">; defm MVE_VMULLu8 : MVE_VMULL_multi<"vmull", "u8", 0b1, 0b00>; defm MVE_VMULLu16 : MVE_VMULL_multi<"vmull", "u16", 0b1, 0b01>; -defm MVE_VMULLu32 : MVE_VMULL_multi<"vmull", "u32", 0b1, 0b10>; +defm MVE_VMULLu32 : MVE_VMULL_multi<"vmull", "u32", 0b1, 0b10, "@earlyclobber $Qd">; defm MVE_VMULLp8 : MVE_VMULL_multi<"vmull", "p8", 0b0, 0b11>; defm MVE_VMULLp16 : MVE_VMULL_multi<"vmull", "p16", 0b1, 0b11>; @@ -3144,6 +3526,18 @@ defm MVE_VQMOVNu32 : MVE_VxMOVxN_halves<"vqmovn", "u32", 0b1, 0b1, 0b01>; defm MVE_VQMOVUNs16 : MVE_VxMOVxN_halves<"vqmovun", "s16", 0b0, 0b0, 0b00>; defm MVE_VQMOVUNs32 : MVE_VxMOVxN_halves<"vqmovun", "s32", 0b0, 0b0, 0b01>; +def MVEvmovn : SDNode<"ARMISD::VMOVN", SDTARMVEXT>; +let Predicates = [HasMVEInt] in { + def : Pat<(v8i16 (MVEvmovn (v8i16 MQPR:$Qd_src), (v8i16 MQPR:$Qm), (i32 0))), + (v8i16 (MVE_VMOVNi32bh (v8i16 MQPR:$Qd_src), (v8i16 MQPR:$Qm)))>; + def : Pat<(v8i16 (MVEvmovn (v8i16 MQPR:$Qd_src), (v8i16 MQPR:$Qm), (i32 1))), + (v8i16 (MVE_VMOVNi32th (v8i16 MQPR:$Qd_src), (v8i16 MQPR:$Qm)))>; + def : Pat<(v16i8 (MVEvmovn (v16i8 MQPR:$Qd_src), (v16i8 MQPR:$Qm), (i32 0))), + (v16i8 (MVE_VMOVNi16bh (v16i8 MQPR:$Qd_src), (v16i8 MQPR:$Qm)))>; + def : Pat<(v16i8 (MVEvmovn (v16i8 MQPR:$Qd_src), (v16i8 MQPR:$Qm), (i32 1))), + (v16i8 (MVE_VMOVNi16th (v16i8 MQPR:$Qd_src), (v16i8 MQPR:$Qm)))>; +} + class MVE_VCVT_ff pattern=[]> : MVE_qDest_qSrc; defm MVE_VCVTf32f16 : MVE_VCVT_ff_halves<"f32.f16", 0b1>; class MVE_VxCADD size, bit halve, - list pattern=[]> + string cstr="", list pattern=[]> : MVE_qDest_qSrc { + "$Qd, $Qn, $Qm, $rot", vpred_r, cstr, pattern> { bits<4> Qn; bit rot; @@ -3186,11 +3579,11 @@ class MVE_VxCADD size, bit halve, def MVE_VCADDi8 : MVE_VxCADD<"vcadd", "i8", 0b00, 0b1>; def MVE_VCADDi16 : MVE_VxCADD<"vcadd", "i16", 0b01, 0b1>; -def MVE_VCADDi32 : MVE_VxCADD<"vcadd", "i32", 0b10, 0b1>; +def MVE_VCADDi32 : MVE_VxCADD<"vcadd", "i32", 0b10, 0b1, "@earlyclobber $Qd">; def MVE_VHCADDs8 : MVE_VxCADD<"vhcadd", "s8", 0b00, 0b0>; def MVE_VHCADDs16 : MVE_VxCADD<"vhcadd", "s16", 0b01, 0b0>; -def MVE_VHCADDs32 : MVE_VxCADD<"vhcadd", "s32", 0b10, 0b0>; +def MVE_VHCADDs32 : MVE_VxCADD<"vhcadd", "s32", 0b10, 0b0, "@earlyclobber $Qd">; class MVE_VADCSBC pattern=[]> @@ -3220,10 +3613,10 @@ def MVE_VSBC : MVE_VADCSBC<"vsbc", 0b0, 0b1, (ins cl_FPSCR_NZCV:$carryin)>; def MVE_VSBCI : MVE_VADCSBC<"vsbci", 0b1, 0b1, (ins)>; class MVE_VQDMULL pattern=[]> + string cstr="", list pattern=[]> : MVE_qDest_qSrc { + vpred_r, cstr, pattern> { bits<4> Qn; let Inst{28} = size; @@ -3236,13 +3629,13 @@ class MVE_VQDMULL { - def bh : MVE_VQDMULL<"vqdmullb", suffix, size, 0b0>; - def th : MVE_VQDMULL<"vqdmullt", suffix, size, 0b1>; +multiclass MVE_VQDMULL_halves { + def bh : MVE_VQDMULL<"vqdmullb", suffix, size, 0b0, cstr>; + def th : MVE_VQDMULL<"vqdmullt", suffix, size, 0b1, cstr>; } defm MVE_VQDMULLs16 : MVE_VQDMULL_halves<"s16", 0b0>; -defm MVE_VQDMULLs32 : MVE_VQDMULL_halves<"s32", 0b1>; +defm MVE_VQDMULLs32 : MVE_VQDMULL_halves<"s32", 0b1, "@earlyclobber $Qd">; // end of mve_qDest_qSrc @@ -3267,9 +3660,9 @@ class MVE_qr_base pattern=[]> +class MVE_qDest_rSrc pattern=[]> : MVE_qr_base<(outs MQPR:$Qd), (ins MQPR:$Qn, rGPR:$Rm), - NoItinerary, iname, suffix, "$Qd, $Qn, $Rm", vpred_r, "", + NoItinerary, iname, suffix, "$Qd, 
$Qn, $Rm", vpred_r, cstr, pattern>; class MVE_qDestSrc_rSrc pattern=[]> @@ -3291,7 +3684,7 @@ class MVE_qDest_single_rSrc pattern=[]> class MVE_VADDSUB_qr size, bit bit_5, bit bit_12, bit bit_16, bit bit_28, list pattern=[]> - : MVE_qDest_rSrc { + : MVE_qDest_rSrc { let Inst{28} = bit_28; let Inst{21-20} = size; @@ -3299,6 +3692,7 @@ class MVE_VADDSUB_qr size, let Inst{12} = bit_12; let Inst{8} = 0b1; let Inst{5} = bit_5; + let validForTailPredication = 1; } multiclass MVE_VADDSUB_qr_sizes; defm MVE_VQSUB_qr_s : MVE_VADDSUB_qr_sizes<"vqsub", "s", 0b1, 0b1, 0b0, 0b0>; defm MVE_VQSUB_qr_u : MVE_VADDSUB_qr_sizes<"vqsub", "u", 0b1, 0b1, 0b0, 0b1>; +let Predicates = [HasMVEInt] in { + def : Pat<(v16i8 (add (v16i8 MQPR:$val1), (v16i8 (ARMvdup GPR:$val2)))), + (v16i8 (MVE_VADD_qr_i8 (v16i8 MQPR:$val1), (i32 GPR:$val2)))>; + def : Pat<(v8i16 (add (v8i16 MQPR:$val1), (v8i16 (ARMvdup GPR:$val2)))), + (v8i16 (MVE_VADD_qr_i16 (v8i16 MQPR:$val1), (i32 GPR:$val2)))>; + def : Pat<(v4i32 (add (v4i32 MQPR:$val1), (v4i32 (ARMvdup GPR:$val2)))), + (v4i32 (MVE_VADD_qr_i32 (v4i32 MQPR:$val1), (i32 GPR:$val2)))>; +} + +let Predicates = [HasMVEInt] in { + def : Pat<(v16i8 (sub (v16i8 MQPR:$val1), (v16i8 (ARMvdup GPR:$val2)))), + (v16i8 (MVE_VSUB_qr_i8 (v16i8 MQPR:$val1), (i32 GPR:$val2)))>; + def : Pat<(v8i16 (sub (v8i16 MQPR:$val1), (v8i16 (ARMvdup GPR:$val2)))), + (v8i16 (MVE_VSUB_qr_i16 (v8i16 MQPR:$val1), (i32 GPR:$val2)))>; + def : Pat<(v4i32 (sub (v4i32 MQPR:$val1), (v4i32 (ARMvdup GPR:$val2)))), + (v4i32 (MVE_VSUB_qr_i32 (v4i32 MQPR:$val1), (i32 GPR:$val2)))>; +} + class MVE_VQDMULL_qr pattern=[]> - : MVE_qDest_rSrc { + bit T, string cstr="", list pattern=[]> + : MVE_qDest_rSrc { let Inst{28} = size; let Inst{21-20} = 0b11; @@ -3332,18 +3744,18 @@ class MVE_VQDMULL_qr { - def bh : MVE_VQDMULL_qr<"vqdmullb", suffix, size, 0b0>; - def th : MVE_VQDMULL_qr<"vqdmullt", suffix, size, 0b1>; +multiclass MVE_VQDMULL_qr_halves { + def bh : MVE_VQDMULL_qr<"vqdmullb", suffix, size, 0b0, cstr>; + def th : MVE_VQDMULL_qr<"vqdmullt", suffix, size, 0b1, cstr>; } defm MVE_VQDMULL_qr_s16 : MVE_VQDMULL_qr_halves<"s16", 0b0>; -defm MVE_VQDMULL_qr_s32 : MVE_VQDMULL_qr_halves<"s32", 0b1>; +defm MVE_VQDMULL_qr_s32 : MVE_VQDMULL_qr_halves<"s32", 0b1, "@earlyclobber $Qd">; class MVE_VxADDSUB_qr bits_21_20, bit subtract, list pattern=[]> - : MVE_qDest_rSrc { + : MVE_qDest_rSrc { let Inst{28} = bit_28; let Inst{21-20} = bits_21_20; @@ -3351,6 +3763,7 @@ class MVE_VxADDSUB_qr; @@ -3388,6 +3801,7 @@ class MVE_VxSHL_qr size, let Inst{12-8} = 0b11110; let Inst{7} = bit_7; let Inst{6-4} = 0b110; + let validForTailPredication = 1; } multiclass MVE_VxSHL_qr_types { @@ -3421,7 +3835,7 @@ let Predicates = [HasMVEInt] in { } class MVE_VBRSR size, list pattern=[]> - : MVE_qDest_rSrc { + : MVE_qDest_rSrc { let Inst{28} = 0b1; let Inst{21-20} = size; @@ -3429,15 +3843,27 @@ class MVE_VBRSR size, list pattern=[]> let Inst{12} = 0b1; let Inst{8} = 0b0; let Inst{5} = 0b1; + let validForTailPredication = 1; } def MVE_VBRSR8 : MVE_VBRSR<"vbrsr", "8", 0b00>; def MVE_VBRSR16 : MVE_VBRSR<"vbrsr", "16", 0b01>; def MVE_VBRSR32 : MVE_VBRSR<"vbrsr", "32", 0b10>; +let Predicates = [HasMVEInt] in { + def : Pat<(v16i8 ( bitreverse (v16i8 MQPR:$val1))), + (v16i8 ( MVE_VBRSR8 (v16i8 MQPR:$val1), (t2MOVi (i32 8)) ))>; + + def : Pat<(v4i32 ( bitreverse (v4i32 MQPR:$val1))), + (v4i32 ( MVE_VBRSR32 (v4i32 MQPR:$val1), (t2MOVi (i32 32)) ))>; + + def : Pat<(v8i16 ( bitreverse (v8i16 MQPR:$val1))), + (v8i16 ( MVE_VBRSR16 (v8i16 MQPR:$val1), (t2MOVi (i32 16)) ))>; +} + 
class MVE_VMUL_qr_int size, list pattern=[]> - : MVE_qDest_rSrc { + : MVE_qDest_rSrc { let Inst{28} = 0b0; let Inst{21-20} = size; @@ -3445,15 +3871,25 @@ class MVE_VMUL_qr_int; def MVE_VMUL_qr_i16 : MVE_VMUL_qr_int<"vmul", "i16", 0b01>; def MVE_VMUL_qr_i32 : MVE_VMUL_qr_int<"vmul", "i32", 0b10>; +let Predicates = [HasMVEInt] in { + def : Pat<(v16i8 (mul (v16i8 MQPR:$val1), (v16i8 (ARMvdup GPR:$val2)))), + (v16i8 (MVE_VMUL_qr_i8 (v16i8 MQPR:$val1), (i32 GPR:$val2)))>; + def : Pat<(v8i16 (mul (v8i16 MQPR:$val1), (v8i16 (ARMvdup GPR:$val2)))), + (v8i16 (MVE_VMUL_qr_i16 (v8i16 MQPR:$val1), (i32 GPR:$val2)))>; + def : Pat<(v4i32 (mul (v4i32 MQPR:$val1), (v4i32 (ARMvdup GPR:$val2)))), + (v4i32 (MVE_VMUL_qr_i32 (v4i32 MQPR:$val1), (i32 GPR:$val2)))>; +} + class MVE_VxxMUL_qr bits_21_20, list pattern=[]> - : MVE_qDest_rSrc { + : MVE_qDest_rSrc { let Inst{28} = bit_28; let Inst{21-20} = bits_21_20; @@ -3471,14 +3907,14 @@ def MVE_VQRDMULH_qr_s8 : MVE_VxxMUL_qr<"vqrdmulh", "s8", 0b1, 0b00>; def MVE_VQRDMULH_qr_s16 : MVE_VxxMUL_qr<"vqrdmulh", "s16", 0b1, 0b01>; def MVE_VQRDMULH_qr_s32 : MVE_VxxMUL_qr<"vqrdmulh", "s32", 0b1, 0b10>; -let Predicates = [HasMVEFloat] in { +let Predicates = [HasMVEFloat], validForTailPredication = 1 in { def MVE_VMUL_qr_f16 : MVE_VxxMUL_qr<"vmul", "f16", 0b1, 0b11>; def MVE_VMUL_qr_f32 : MVE_VxxMUL_qr<"vmul", "f32", 0b0, 0b11>; } class MVE_VFMAMLA_qr bits_21_20, bit S, - list pattern=[]> + bit bit_28, bits<2> bits_21_20, bit S, + list pattern=[]> : MVE_qDestSrc_rSrc { let Inst{28} = bit_28; @@ -3487,6 +3923,7 @@ class MVE_VFMAMLA_qr; @@ -3503,6 +3940,21 @@ def MVE_VMLAS_qr_u8 : MVE_VFMAMLA_qr<"vmlas", "u8", 0b1, 0b00, 0b1>; def MVE_VMLAS_qr_u16 : MVE_VFMAMLA_qr<"vmlas", "u16", 0b1, 0b01, 0b1>; def MVE_VMLAS_qr_u32 : MVE_VFMAMLA_qr<"vmlas", "u32", 0b1, 0b10, 0b1>; +let Predicates = [HasMVEInt] in { + def : Pat<(v4i32 (add (v4i32 MQPR:$src1), + (v4i32 (mul (v4i32 MQPR:$src2), + (v4i32 (ARMvdup (i32 rGPR:$x))))))), + (v4i32 (MVE_VMLA_qr_u32 $src1, $src2, $x))>; + def : Pat<(v8i16 (add (v8i16 MQPR:$src1), + (v8i16 (mul (v8i16 MQPR:$src2), + (v8i16 (ARMvdup (i32 rGPR:$x))))))), + (v8i16 (MVE_VMLA_qr_u16 $src1, $src2, $x))>; + def : Pat<(v16i8 (add (v16i8 MQPR:$src1), + (v16i8 (mul (v16i8 MQPR:$src2), + (v16i8 (ARMvdup (i32 rGPR:$x))))))), + (v16i8 (MVE_VMLA_qr_u8 $src1, $src2, $x))>; +} + let Predicates = [HasMVEFloat] in { def MVE_VFMA_qr_f16 : MVE_VFMAMLA_qr<"vfma", "f16", 0b1, 0b11, 0b0>; def MVE_VFMA_qr_f32 : MVE_VFMAMLA_qr<"vfma", "f32", 0b0, 0b11, 0b0>; @@ -3555,6 +4007,7 @@ class MVE_VxDUP size, bit bit_12, let Inst{7} = imm{1}; let Inst{6-1} = 0b110111; let Inst{0} = imm{0}; + let validForTailPredication = 1; } def MVE_VIDUPu8 : MVE_VxDUP<"vidup", "u8", 0b00, 0b0>; @@ -3589,6 +4042,7 @@ class MVE_VxWDUP size, bit bit_12, let Inst{6-4} = 0b110; let Inst{3-1} = Rm{3-1}; let Inst{0} = imm{0}; + let validForTailPredication = 1; } def MVE_VIWDUPu8 : MVE_VxWDUP<"viwdup", "u8", 0b00, 0b0>; @@ -3599,6 +4053,7 @@ def MVE_VDWDUPu8 : MVE_VxWDUP<"vdwdup", "u8", 0b00, 0b1>; def MVE_VDWDUPu16 : MVE_VxWDUP<"vdwdup", "u16", 0b01, 0b1>; def MVE_VDWDUPu32 : MVE_VxWDUP<"vdwdup", "u32", 0b10, 0b1>; +let hasSideEffects = 1 in class MVE_VCTP size, list pattern=[]> : MVE_p<(outs VCCR:$P0), (ins rGPR:$Rn), NoItinerary, "vctp", suffix, "$Rn", vpred_n, "", pattern> { @@ -3614,6 +4069,7 @@ class MVE_VCTP size, list pattern=[]> let Constraints = ""; let DecoderMethod = "DecodeMveVCTP"; + let validForTailPredication = 1; } def MVE_VCTP8 : MVE_VCTP<"8", 0b00>; @@ -3621,6 +4077,15 @@ def MVE_VCTP16 
: MVE_VCTP<"16", 0b01>; def MVE_VCTP32 : MVE_VCTP<"32", 0b10>; def MVE_VCTP64 : MVE_VCTP<"64", 0b11>; +let Predicates = [HasMVEInt] in { + def : Pat<(int_arm_vctp8 rGPR:$Rn), + (v16i1 (MVE_VCTP8 rGPR:$Rn))>; + def : Pat<(int_arm_vctp16 rGPR:$Rn), + (v8i1 (MVE_VCTP16 rGPR:$Rn))>; + def : Pat<(int_arm_vctp32 rGPR:$Rn), + (v4i1 (MVE_VCTP32 rGPR:$Rn))>; +} + // end of mve_qDest_rSrc // start of coproc mov @@ -3863,6 +4328,7 @@ class MVE_VLDRSTR_base size, dag iops, string asm, list patte let Inst{7} = fc{0}; let Inst{4} = 0b0; - let Defs = [VPR, P0]; + let Defs = [VPR]; + let validForTailPredication = 1; } class MVE_VPTt1 size, dag iops> @@ -4177,11 +4644,12 @@ class MVE_VPTt1 size, dag iops> let Inst{5} = Qm{3}; let Inst{3-1} = Qm{2-0}; let Inst{0} = fc{1}; + let validForTailPredication = 1; } class MVE_VPTt1i size> : MVE_VPTt1 { + (ins vpt_mask:$Mk, MQPR:$Qn, MQPR:$Qm, pred_basic_i:$fc)> { let Inst{12} = 0b0; let Inst{0} = 0b0; } @@ -4192,7 +4660,7 @@ def MVE_VPTv16i8 : MVE_VPTt1i<"i8", 0b00>; class MVE_VPTt1u size> : MVE_VPTt1 { + (ins vpt_mask:$Mk, MQPR:$Qn, MQPR:$Qm, pred_basic_u:$fc)> { let Inst{12} = 0b0; let Inst{0} = 0b1; } @@ -4203,7 +4671,7 @@ def MVE_VPTv16u8 : MVE_VPTt1u<"u8", 0b00>; class MVE_VPTt1s size> : MVE_VPTt1 { + (ins vpt_mask:$Mk, MQPR:$Qn, MQPR:$Qm, pred_basic_s:$fc)> { let Inst{12} = 0b1; } @@ -4225,7 +4693,7 @@ class MVE_VPTt2 size, dag iops> class MVE_VPTt2i size> : MVE_VPTt2 { + (ins vpt_mask:$Mk, MQPR:$Qn, GPRwithZR:$Rm, pred_basic_i:$fc)> { let Inst{12} = 0b0; let Inst{5} = 0b0; } @@ -4236,7 +4704,7 @@ def MVE_VPTv16i8r : MVE_VPTt2i<"i8", 0b00>; class MVE_VPTt2u size> : MVE_VPTt2 { + (ins vpt_mask:$Mk, MQPR:$Qn, GPRwithZR:$Rm, pred_basic_u:$fc)> { let Inst{12} = 0b0; let Inst{5} = 0b1; } @@ -4247,7 +4715,7 @@ def MVE_VPTv16u8r : MVE_VPTt2u<"u8", 0b00>; class MVE_VPTt2s size> : MVE_VPTt2 { + (ins vpt_mask:$Mk, MQPR:$Qn, GPRwithZR:$Rm, pred_basic_s:$fc)> { let Inst{12} = 0b1; } @@ -4276,12 +4744,13 @@ class MVE_VPTf pattern= let Inst{7} = fc{0}; let Inst{4} = 0b0; - let Defs = [P0]; + let Defs = [VPR]; let Predicates = [HasMVEFloat]; + let validForTailPredication = 1; } class MVE_VPTft1 - : MVE_VPTf { bits<3> fc; bits<4> Qm; @@ -4296,7 +4765,7 @@ def MVE_VPTv4f32 : MVE_VPTft1<"f32", 0b0>; def MVE_VPTv8f16 : MVE_VPTft1<"f16", 0b1>; class MVE_VPTft2 - : MVE_VPTf { bits<3> fc; bits<4> Rm; @@ -4322,7 +4791,8 @@ def MVE_VPST : MVE_MI<(outs ), (ins vpt_mask:$Mk), NoItinerary, let Unpredictable{7} = 0b1; let Unpredictable{5} = 0b1; - let Defs = [P0]; + let Uses = [VPR]; + let validForTailPredication = 1; } def MVE_VPSEL : MVE_p<(outs MQPR:$Qd), (ins MQPR:$Qn, MQPR:$Qm), NoItinerary, @@ -4346,6 +4816,7 @@ def MVE_VPSEL : MVE_p<(outs MQPR:$Qd), (ins MQPR:$Qn, MQPR:$Qm), NoItinerary, let Inst{4} = 0b0; let Inst{3-1} = Qm{2-0}; let Inst{0} = 0b1; + let validForTailPredication = 1; } foreach suffix = ["s8", "s16", "s32", "u8", "u16", "u32", @@ -4353,19 +4824,113 @@ foreach suffix = ["s8", "s16", "s32", "u8", "u16", "u32", def : MVEInstAlias<"vpsel${vp}." 
# suffix # "\t$Qd, $Qn, $Qm", (MVE_VPSEL MQPR:$Qd, MQPR:$Qn, MQPR:$Qm, vpred_n:$vp)>; -def MVE_VPNOT : MVE_p<(outs), (ins), NoItinerary, +let Predicates = [HasMVEInt] in { + def : Pat<(v16i8 (vselect (v16i1 VCCR:$pred), (v16i8 MQPR:$v1), (v16i8 MQPR:$v2))), + (v16i8 (MVE_VPSEL MQPR:$v1, MQPR:$v2, 0, VCCR:$pred))>; + def : Pat<(v8i16 (vselect (v8i1 VCCR:$pred), (v8i16 MQPR:$v1), (v8i16 MQPR:$v2))), + (v8i16 (MVE_VPSEL MQPR:$v1, MQPR:$v2, 0, VCCR:$pred))>; + def : Pat<(v4i32 (vselect (v4i1 VCCR:$pred), (v4i32 MQPR:$v1), (v4i32 MQPR:$v2))), + (v4i32 (MVE_VPSEL MQPR:$v1, MQPR:$v2, 0, VCCR:$pred))>; + + def : Pat<(v8f16 (vselect (v8i1 VCCR:$pred), (v8f16 MQPR:$v1), (v8f16 MQPR:$v2))), + (v8f16 (MVE_VPSEL MQPR:$v1, MQPR:$v2, 0, VCCR:$pred))>; + def : Pat<(v4f32 (vselect (v4i1 VCCR:$pred), (v4f32 MQPR:$v1), (v4f32 MQPR:$v2))), + (v4f32 (MVE_VPSEL MQPR:$v1, MQPR:$v2, 0, VCCR:$pred))>; + + def : Pat<(v16i8 (vselect (v16i8 MQPR:$pred), (v16i8 MQPR:$v1), (v16i8 MQPR:$v2))), + (v16i8 (MVE_VPSEL MQPR:$v1, MQPR:$v2, 0, + (MVE_VCMPi8 (v16i8 MQPR:$pred), (MVE_VMOVimmi8 0), 1)))>; + def : Pat<(v8i16 (vselect (v8i16 MQPR:$pred), (v8i16 MQPR:$v1), (v8i16 MQPR:$v2))), + (v8i16 (MVE_VPSEL MQPR:$v1, MQPR:$v2, 0, + (MVE_VCMPi16 (v8i16 MQPR:$pred), (MVE_VMOVimmi16 0), 1)))>; + def : Pat<(v4i32 (vselect (v4i32 MQPR:$pred), (v4i32 MQPR:$v1), (v4i32 MQPR:$v2))), + (v4i32 (MVE_VPSEL MQPR:$v1, MQPR:$v2, 0, + (MVE_VCMPi32 (v4i32 MQPR:$pred), (MVE_VMOVimmi32 0), 1)))>; + + def : Pat<(v8f16 (vselect (v8i16 MQPR:$pred), (v8f16 MQPR:$v1), (v8f16 MQPR:$v2))), + (v8f16 (MVE_VPSEL MQPR:$v1, MQPR:$v2, 0, + (MVE_VCMPi16 (v8i16 MQPR:$pred), (MVE_VMOVimmi16 0), 1)))>; + def : Pat<(v4f32 (vselect (v4i32 MQPR:$pred), (v4f32 MQPR:$v1), (v4f32 MQPR:$v2))), + (v4f32 (MVE_VPSEL MQPR:$v1, MQPR:$v2, 0, + (MVE_VCMPi32 (v4i32 MQPR:$pred), (MVE_VMOVimmi32 0), 1)))>; + + // Pred <-> Int + def : Pat<(v16i8 (zext (v16i1 VCCR:$pred))), + (v16i8 (MVE_VPSEL (MVE_VMOVimmi8 1), (MVE_VMOVimmi8 0), 0, VCCR:$pred))>; + def : Pat<(v8i16 (zext (v8i1 VCCR:$pred))), + (v8i16 (MVE_VPSEL (MVE_VMOVimmi16 1), (MVE_VMOVimmi16 0), 0, VCCR:$pred))>; + def : Pat<(v4i32 (zext (v4i1 VCCR:$pred))), + (v4i32 (MVE_VPSEL (MVE_VMOVimmi32 1), (MVE_VMOVimmi32 0), 0, VCCR:$pred))>; + + def : Pat<(v16i8 (sext (v16i1 VCCR:$pred))), + (v16i8 (MVE_VPSEL (MVE_VMOVimmi8 255), (MVE_VMOVimmi8 0), 0, VCCR:$pred))>; + def : Pat<(v8i16 (sext (v8i1 VCCR:$pred))), + (v8i16 (MVE_VPSEL (MVE_VMOVimmi8 255), (MVE_VMOVimmi16 0), 0, VCCR:$pred))>; + def : Pat<(v4i32 (sext (v4i1 VCCR:$pred))), + (v4i32 (MVE_VPSEL (MVE_VMOVimmi8 255), (MVE_VMOVimmi32 0), 0, VCCR:$pred))>; + + def : Pat<(v16i8 (anyext (v16i1 VCCR:$pred))), + (v16i8 (MVE_VPSEL (MVE_VMOVimmi8 1), (MVE_VMOVimmi8 0), 0, VCCR:$pred))>; + def : Pat<(v8i16 (anyext (v8i1 VCCR:$pred))), + (v8i16 (MVE_VPSEL (MVE_VMOVimmi16 1), (MVE_VMOVimmi16 0), 0, VCCR:$pred))>; + def : Pat<(v4i32 (anyext (v4i1 VCCR:$pred))), + (v4i32 (MVE_VPSEL (MVE_VMOVimmi32 1), (MVE_VMOVimmi32 0), 0, VCCR:$pred))>; + + def : Pat<(v16i1 (trunc (v16i8 MQPR:$v1))), + (v16i1 (MVE_VCMPi32r (v16i8 MQPR:$v1), ZR, 1))>; + def : Pat<(v8i1 (trunc (v8i16 MQPR:$v1))), + (v8i1 (MVE_VCMPi32r (v8i16 MQPR:$v1), ZR, 1))>; + def : Pat<(v4i1 (trunc (v4i32 MQPR:$v1))), + (v4i1 (MVE_VCMPi32r (v4i32 MQPR:$v1), ZR, 1))>; +} + +let Predicates = [HasMVEFloat] in { + // Pred <-> Float + // 112 is 1.0 in float + def : Pat<(v4f32 (uint_to_fp (v4i1 VCCR:$pred))), + (v4f32 (MVE_VPSEL (v4f32 (MVE_VMOVimmf32 112)), (v4f32 (MVE_VMOVimmi32 0)), 0, VCCR:$pred))>; + // 2620 is 1.0 in half + def :
Pat<(v8f16 (uint_to_fp (v8i1 VCCR:$pred))), + (v8f16 (MVE_VPSEL (v8f16 (MVE_VMOVimmi16 2620)), (v8f16 (MVE_VMOVimmi16 0)), 0, VCCR:$pred))>; + // 240 is -1.0 in float + def : Pat<(v4f32 (sint_to_fp (v4i1 VCCR:$pred))), + (v4f32 (MVE_VPSEL (v4f32 (MVE_VMOVimmf32 240)), (v4f32 (MVE_VMOVimmi32 0)), 0, VCCR:$pred))>; + // 2748 is -1.0 in half + def : Pat<(v8f16 (sint_to_fp (v8i1 VCCR:$pred))), + (v8f16 (MVE_VPSEL (v8f16 (MVE_VMOVimmi16 2748)), (v8f16 (MVE_VMOVimmi16 0)), 0, VCCR:$pred))>; + + def : Pat<(v4i1 (fp_to_uint (v4f32 MQPR:$v1))), + (v4i1 (MVE_VCMPf32r (v4f32 MQPR:$v1), ZR, 1))>; + def : Pat<(v8i1 (fp_to_uint (v8f16 MQPR:$v1))), + (v8i1 (MVE_VCMPf16r (v8f16 MQPR:$v1), ZR, 1))>; + def : Pat<(v4i1 (fp_to_sint (v4f32 MQPR:$v1))), + (v4i1 (MVE_VCMPf32r (v4f32 MQPR:$v1), ZR, 1))>; + def : Pat<(v8i1 (fp_to_sint (v8f16 MQPR:$v1))), + (v8i1 (MVE_VCMPf16r (v8f16 MQPR:$v1), ZR, 1))>; +} + +def MVE_VPNOT : MVE_p<(outs VCCR:$P0), (ins VCCR:$P0_in), NoItinerary, "vpnot", "", "", vpred_n, "", []> { let Inst{31-0} = 0b11111110001100010000111101001101; let Unpredictable{19-17} = 0b111; let Unpredictable{12} = 0b1; let Unpredictable{7} = 0b1; let Unpredictable{5} = 0b1; - let Defs = [P0]; - let Uses = [P0]; let Constraints = ""; + let DecoderMethod = "DecodeMVEVPNOT"; } +let Predicates = [HasMVEInt] in { + def : Pat<(v4i1 (xor (v4i1 VCCR:$pred), (v4i1 (predicate_cast (i32 65535))))), + (v4i1 (MVE_VPNOT (v4i1 VCCR:$pred)))>; + def : Pat<(v8i1 (xor (v8i1 VCCR:$pred), (v8i1 (predicate_cast (i32 65535))))), + (v8i1 (MVE_VPNOT (v8i1 VCCR:$pred)))>; + def : Pat<(v16i1 (xor (v16i1 VCCR:$pred), (v16i1 (predicate_cast (i32 65535))))), + (v16i1 (MVE_VPNOT (v16i1 VCCR:$pred)))>; +} + + class MVE_loltp_start size> : t2LOL<(outs GPRlr:$LR), iops, asm, ops> { bits<4> Rn; @@ -4433,159 +4998,440 @@ def MVE_LCTP : MVE_loltp_end<(outs), (ins pred:$p), "lctp${p}", ""> { // Patterns //===----------------------------------------------------------------------===// -class MVE_unpred_vector_store_typed + : Pat<(StoreKind (Ty MQPR:$val), t2addrmode_imm7:$addr), + (RegImmInst (Ty MQPR:$val), t2addrmode_imm7:$addr)>; +class MVE_vector_maskedstore_typed + : Pat<(StoreKind (Ty MQPR:$val), t2addrmode_imm7:$addr, VCCR:$pred), + (RegImmInst (Ty MQPR:$val), t2addrmode_imm7:$addr, (i32 1), VCCR:$pred)>; + +multiclass MVE_vector_store { + def : MVE_vector_store_typed; + def : MVE_vector_store_typed; + def : MVE_vector_store_typed; + def : MVE_vector_store_typed; + def : MVE_vector_store_typed; + def : MVE_vector_store_typed; + def : MVE_vector_store_typed; +} + +class MVE_vector_load_typed + : Pat<(Ty (LoadKind t2addrmode_imm7:$addr)), + (Ty (RegImmInst t2addrmode_imm7:$addr))>; +class MVE_vector_maskedload_typed + : Pat<(Ty (LoadKind t2addrmode_imm7:$addr, VCCR:$pred, (Ty NEONimmAllZerosV))), + (Ty (RegImmInst t2addrmode_imm7:$addr, (i32 1), VCCR:$pred))>; + +multiclass MVE_vector_load { + def : MVE_vector_load_typed; + def : MVE_vector_load_typed; + def : MVE_vector_load_typed; + def : MVE_vector_load_typed; + def : MVE_vector_load_typed; + def : MVE_vector_load_typed; + def : MVE_vector_load_typed; +} + +class MVE_vector_offset_store_typed - : Pat<(StoreKind (Ty MQPR:$val), t2addrmode_imm7:$addr), - (RegImmInst (Ty MQPR:$val), t2addrmode_imm7:$addr)>; + : Pat<(StoreKind (Ty MQPR:$Rt), tGPR:$Rn, t2am_imm7_offset:$addr), + (Opcode MQPR:$Rt, tGPR:$Rn, t2am_imm7_offset:$addr)>; -multiclass MVE_unpred_vector_store { - def : MVE_unpred_vector_store_typed; - def : MVE_unpred_vector_store_typed; - def : MVE_unpred_vector_store_typed; - def 
: MVE_unpred_vector_store_typed; - def : MVE_unpred_vector_store_typed; - def : MVE_unpred_vector_store_typed; - def : MVE_unpred_vector_store_typed; -} - -class MVE_unpred_vector_load_typed - : Pat<(Ty (LoadKind t2addrmode_imm7:$addr)), - (Ty (RegImmInst t2addrmode_imm7:$addr))>; - -multiclass MVE_unpred_vector_load { - def : MVE_unpred_vector_load_typed; - def : MVE_unpred_vector_load_typed; - def : MVE_unpred_vector_load_typed; - def : MVE_unpred_vector_load_typed; - def : MVE_unpred_vector_load_typed; - def : MVE_unpred_vector_load_typed; - def : MVE_unpred_vector_load_typed; -} + def : MVE_vector_offset_store_typed; + def : MVE_vector_offset_store_typed; + def : MVE_vector_offset_store_typed; + def : MVE_vector_offset_store_typed; + def : MVE_vector_offset_store_typed; + def : MVE_vector_offset_store_typed; + def : MVE_vector_offset_store_typed; +} + +def aligned32_pre_store : PatFrag<(ops node:$val, node:$ptr, node:$offset), + (pre_store node:$val, node:$ptr, node:$offset), [{ + return cast(N)->getAlignment() >= 4; +}]>; +def aligned32_post_store : PatFrag<(ops node:$val, node:$ptr, node:$offset), + (post_store node:$val, node:$ptr, node:$offset), [{ + return cast(N)->getAlignment() >= 4; +}]>; +def aligned16_pre_store : PatFrag<(ops node:$val, node:$ptr, node:$offset), + (pre_store node:$val, node:$ptr, node:$offset), [{ + return cast(N)->getAlignment() >= 2; +}]>; +def aligned16_post_store : PatFrag<(ops node:$val, node:$ptr, node:$offset), + (post_store node:$val, node:$ptr, node:$offset), [{ + return cast(N)->getAlignment() >= 2; +}]>; + + +def maskedload8 : PatFrag<(ops node:$ptr, node:$pred, node:$passthru), + (masked_ld node:$ptr, node:$pred, node:$passthru), [{ + auto *Ld = cast(N); + return Ld->getMemoryVT().getScalarType() == MVT::i8; +}]>; +def sextmaskedload8 : PatFrag<(ops node:$ptr, node:$pred, node:$passthru), + (maskedload8 node:$ptr, node:$pred, node:$passthru), [{ + return cast(N)->getExtensionType() == ISD::SEXTLOAD; +}]>; +def zextmaskedload8 : PatFrag<(ops node:$ptr, node:$pred, node:$passthru), + (maskedload8 node:$ptr, node:$pred, node:$passthru), [{ + return cast(N)->getExtensionType() == ISD::ZEXTLOAD; +}]>; +def extmaskedload8 : PatFrag<(ops node:$ptr, node:$pred, node:$passthru), + (maskedload8 node:$ptr, node:$pred, node:$passthru), [{ + auto *Ld = cast(N); + EVT ScalarVT = Ld->getMemoryVT().getScalarType(); + return ScalarVT.isInteger() && Ld->getExtensionType() == ISD::EXTLOAD; +}]>; +def alignedmaskedload16: PatFrag<(ops node:$ptr, node:$pred, node:$passthru), + (masked_ld node:$ptr, node:$pred, node:$passthru), [{ + auto *Ld = cast(N); + EVT ScalarVT = Ld->getMemoryVT().getScalarType(); + return (ScalarVT == MVT::i16 || ScalarVT == MVT::f16) && Ld->getAlignment() >= 2; +}]>; +def sextmaskedload16 : PatFrag<(ops node:$ptr, node:$pred, node:$passthru), + (alignedmaskedload16 node:$ptr, node:$pred, node:$passthru), [{ + return cast(N)->getExtensionType() == ISD::SEXTLOAD; +}]>; +def zextmaskedload16 : PatFrag<(ops node:$ptr, node:$pred, node:$passthru), + (alignedmaskedload16 node:$ptr, node:$pred, node:$passthru), [{ + return cast(N)->getExtensionType() == ISD::ZEXTLOAD; +}]>; +def extmaskedload16 : PatFrag<(ops node:$ptr, node:$pred, node:$passthru), + (alignedmaskedload16 node:$ptr, node:$pred, node:$passthru), [{ + auto *Ld = cast(N); + EVT ScalarVT = Ld->getMemoryVT().getScalarType(); + return ScalarVT.isInteger() && Ld->getExtensionType() == ISD::EXTLOAD; +}]>; +def alignedmaskedload32: PatFrag<(ops node:$ptr, node:$pred, node:$passthru), + 
(masked_ld node:$ptr, node:$pred, node:$passthru), [{ + auto *Ld = cast(N); + EVT ScalarVT = Ld->getMemoryVT().getScalarType(); + return (ScalarVT == MVT::i32 || ScalarVT == MVT::f32) && Ld->getAlignment() >= 4; +}]>; + +def maskedstore8 : PatFrag<(ops node:$val, node:$ptr, node:$pred), + (masked_st node:$val, node:$ptr, node:$pred), [{ + return cast(N)->getMemoryVT().getScalarType() == MVT::i8; +}]>; +def truncatingmaskedstore8 : PatFrag<(ops node:$val, node:$ptr, node:$pred), + (maskedstore8 node:$val, node:$ptr, node:$pred), [{ + return cast(N)->isTruncatingStore(); +}]>; +def maskedstore16 : PatFrag<(ops node:$val, node:$ptr, node:$pred), + (masked_st node:$val, node:$ptr, node:$pred), [{ + auto *St = cast(N); + EVT ScalarVT = St->getMemoryVT().getScalarType(); + return (ScalarVT == MVT::i16 || ScalarVT == MVT::f16) && St->getAlignment() >= 2; +}]>; + +def truncatingmaskedstore16 : PatFrag<(ops node:$val, node:$ptr, node:$pred), + (maskedstore16 node:$val, node:$ptr, node:$pred), [{ + return cast(N)->isTruncatingStore(); +}]>; +def maskedstore32 : PatFrag<(ops node:$val, node:$ptr, node:$pred), + (masked_st node:$val, node:$ptr, node:$pred), [{ + auto *St = cast(N); + EVT ScalarVT = St->getMemoryVT().getScalarType(); + return (ScalarVT == MVT::i32 || ScalarVT == MVT::f32) && St->getAlignment() >= 4; +}]>; let Predicates = [HasMVEInt, IsLE] in { - defm : MVE_unpred_vector_store; - defm : MVE_unpred_vector_store; - defm : MVE_unpred_vector_store; + // Stores + defm : MVE_vector_store; + defm : MVE_vector_store; + defm : MVE_vector_store; - defm : MVE_unpred_vector_load; - defm : MVE_unpred_vector_load; - defm : MVE_unpred_vector_load; + // Loads + defm : MVE_vector_load; + defm : MVE_vector_load; + defm : MVE_vector_load; - def : Pat<(v16i1 (load t2addrmode_imm7<2>:$addr)), - (v16i1 (VLDR_P0_off t2addrmode_imm7<2>:$addr))>; - def : Pat<(v8i1 (load t2addrmode_imm7<2>:$addr)), - (v8i1 (VLDR_P0_off t2addrmode_imm7<2>:$addr))>; - def : Pat<(v4i1 (load t2addrmode_imm7<2>:$addr)), - (v4i1 (VLDR_P0_off t2addrmode_imm7<2>:$addr))>; + // Pre/post inc stores + defm : MVE_vector_offset_store; + defm : MVE_vector_offset_store; + defm : MVE_vector_offset_store; + defm : MVE_vector_offset_store; + defm : MVE_vector_offset_store; + defm : MVE_vector_offset_store; } let Predicates = [HasMVEInt, IsBE] in { - def : MVE_unpred_vector_store_typed; - def : MVE_unpred_vector_store_typed; - def : MVE_unpred_vector_store_typed; - def : MVE_unpred_vector_store_typed; - def : MVE_unpred_vector_store_typed; - - def : MVE_unpred_vector_load_typed; - def : MVE_unpred_vector_load_typed; - def : MVE_unpred_vector_load_typed; - def : MVE_unpred_vector_load_typed; - def : MVE_unpred_vector_load_typed; + // Aligned Stores + def : MVE_vector_store_typed; + def : MVE_vector_store_typed; + def : MVE_vector_store_typed; + def : MVE_vector_store_typed; + def : MVE_vector_store_typed; + + // Aligned Loads + def : MVE_vector_load_typed; + def : MVE_vector_load_typed; + def : MVE_vector_load_typed; + def : MVE_vector_load_typed; + def : MVE_vector_load_typed; + + // Other unaligned loads/stores need to go through a VREV + def : Pat<(v2f64 (load t2addrmode_imm7<0>:$addr)), + (v2f64 (MVE_VREV64_8 (MVE_VLDRBU8 t2addrmode_imm7<0>:$addr)))>; + def : Pat<(v2i64 (load t2addrmode_imm7<0>:$addr)), + (v2i64 (MVE_VREV64_8 (MVE_VLDRBU8 t2addrmode_imm7<0>:$addr)))>; + def : Pat<(v4i32 (load t2addrmode_imm7<0>:$addr)), + (v4i32 (MVE_VREV32_8 (MVE_VLDRBU8 t2addrmode_imm7<0>:$addr)))>; + def : Pat<(v4f32 (load t2addrmode_imm7<0>:$addr)), +
(v4f32 (MVE_VREV32_8 (MVE_VLDRBU8 t2addrmode_imm7<0>:$addr)))>; + def : Pat<(v8i16 (load t2addrmode_imm7<0>:$addr)), + (v8i16 (MVE_VREV16_8 (MVE_VLDRBU8 t2addrmode_imm7<0>:$addr)))>; + def : Pat<(v8f16 (load t2addrmode_imm7<0>:$addr)), + (v8f16 (MVE_VREV16_8 (MVE_VLDRBU8 t2addrmode_imm7<0>:$addr)))>; + def : Pat<(store (v2f64 MQPR:$val), t2addrmode_imm7<0>:$addr), + (MVE_VSTRBU8 (MVE_VREV64_8 MQPR:$val), t2addrmode_imm7<0>:$addr)>; + def : Pat<(store (v2i64 MQPR:$val), t2addrmode_imm7<0>:$addr), + (MVE_VSTRBU8 (MVE_VREV64_8 MQPR:$val), t2addrmode_imm7<0>:$addr)>; + def : Pat<(store (v4i32 MQPR:$val), t2addrmode_imm7<0>:$addr), + (MVE_VSTRBU8 (MVE_VREV32_8 MQPR:$val), t2addrmode_imm7<0>:$addr)>; + def : Pat<(store (v4f32 MQPR:$val), t2addrmode_imm7<0>:$addr), + (MVE_VSTRBU8 (MVE_VREV32_8 MQPR:$val), t2addrmode_imm7<0>:$addr)>; + def : Pat<(store (v8i16 MQPR:$val), t2addrmode_imm7<0>:$addr), + (MVE_VSTRBU8 (MVE_VREV16_8 MQPR:$val), t2addrmode_imm7<0>:$addr)>; + def : Pat<(store (v8f16 MQPR:$val), t2addrmode_imm7<0>:$addr), + (MVE_VSTRBU8 (MVE_VREV16_8 MQPR:$val), t2addrmode_imm7<0>:$addr)>; + + // Pre/Post inc stores + def : MVE_vector_offset_store_typed; + def : MVE_vector_offset_store_typed; + def : MVE_vector_offset_store_typed; + def : MVE_vector_offset_store_typed; + def : MVE_vector_offset_store_typed; + def : MVE_vector_offset_store_typed; + def : MVE_vector_offset_store_typed; + def : MVE_vector_offset_store_typed; + def : MVE_vector_offset_store_typed; + def : MVE_vector_offset_store_typed; } +let Predicates = [HasMVEInt] in { + // Aligned masked store, shared between LE and BE + def : MVE_vector_maskedstore_typed; + def : MVE_vector_maskedstore_typed; + def : MVE_vector_maskedstore_typed; + def : MVE_vector_maskedstore_typed; + def : MVE_vector_maskedstore_typed; + // Truncating stores + def : Pat<(truncatingmaskedstore8 (v8i16 MQPR:$val), t2addrmode_imm7<0>:$addr, VCCR:$pred), + (MVE_VSTRB16 MQPR:$val, t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred)>; + def : Pat<(truncatingmaskedstore8 (v4i32 MQPR:$val), t2addrmode_imm7<0>:$addr, VCCR:$pred), + (MVE_VSTRB32 MQPR:$val, t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred)>; + def : Pat<(truncatingmaskedstore16 (v4i32 MQPR:$val), t2addrmode_imm7<1>:$addr, VCCR:$pred), + (MVE_VSTRH32 MQPR:$val, t2addrmode_imm7<1>:$addr, (i32 1), VCCR:$pred)>; + // Aligned masked loads + def : MVE_vector_maskedload_typed; + def : MVE_vector_maskedload_typed; + def : MVE_vector_maskedload_typed; + def : MVE_vector_maskedload_typed; + def : MVE_vector_maskedload_typed; + // Extending masked loads. 
+ def : Pat<(v8i16 (sextmaskedload8 t2addrmode_imm7<0>:$addr, VCCR:$pred, + (v8i16 NEONimmAllZerosV))), + (v8i16 (MVE_VLDRBS16 t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred))>; + def : Pat<(v4i32 (sextmaskedload8 t2addrmode_imm7<0>:$addr, VCCR:$pred, + (v4i32 NEONimmAllZerosV))), + (v4i32 (MVE_VLDRBS32 t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred))>; + def : Pat<(v8i16 (zextmaskedload8 t2addrmode_imm7<0>:$addr, VCCR:$pred, + (v8i16 NEONimmAllZerosV))), + (v8i16 (MVE_VLDRBU16 t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred))>; + def : Pat<(v4i32 (zextmaskedload8 t2addrmode_imm7<0>:$addr, VCCR:$pred, + (v4i32 NEONimmAllZerosV))), + (v4i32 (MVE_VLDRBU32 t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred))>; + def : Pat<(v8i16 (extmaskedload8 t2addrmode_imm7<0>:$addr, VCCR:$pred, + (v8i16 NEONimmAllZerosV))), + (v8i16 (MVE_VLDRBU16 t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred))>; + def : Pat<(v4i32 (extmaskedload8 t2addrmode_imm7<0>:$addr, VCCR:$pred, + (v4i32 NEONimmAllZerosV))), + (v4i32 (MVE_VLDRBU32 t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred))>; + def : Pat<(v4i32 (sextmaskedload16 t2addrmode_imm7<1>:$addr, VCCR:$pred, + (v4i32 NEONimmAllZerosV))), + (v4i32 (MVE_VLDRHS32 t2addrmode_imm7<1>:$addr, (i32 1), VCCR:$pred))>; + def : Pat<(v4i32 (zextmaskedload16 t2addrmode_imm7<1>:$addr, VCCR:$pred, + (v4i32 NEONimmAllZerosV))), + (v4i32 (MVE_VLDRHU32 t2addrmode_imm7<1>:$addr, (i32 1), VCCR:$pred))>; + def : Pat<(v4i32 (extmaskedload16 t2addrmode_imm7<1>:$addr, VCCR:$pred, + (v4i32 NEONimmAllZerosV))), + (v4i32 (MVE_VLDRHU32 t2addrmode_imm7<1>:$addr, (i32 1), VCCR:$pred))>; +} // Widening/Narrowing Loads/Stores +let MinAlignment = 2 in { + def truncstorevi16_align2 : PatFrag<(ops node:$val, node:$ptr), + (truncstorevi16 node:$val, node:$ptr)>; + def post_truncstvi16_align2 : PatFrag<(ops node:$val, node:$base, node:$offset), + (post_truncstvi16 node:$val, node:$base, node:$offset)>; + def pre_truncstvi16_align2 : PatFrag<(ops node:$val, node:$base, node:$offset), + (pre_truncstvi16 node:$val, node:$base, node:$offset)>; +} + let Predicates = [HasMVEInt] in { - def : Pat<(truncstorevi8 (v8i16 MQPR:$val), t2addrmode_imm7<1>:$addr), - (MVE_VSTRB16 MQPR:$val, t2addrmode_imm7<1>:$addr)>; - def : Pat<(truncstorevi8 (v4i32 MQPR:$val), t2addrmode_imm7<1>:$addr), - (MVE_VSTRB32 MQPR:$val, t2addrmode_imm7<1>:$addr)>; - def : Pat<(truncstorevi16 (v4i32 MQPR:$val), t2addrmode_imm7<2>:$addr), - (MVE_VSTRH32 MQPR:$val, t2addrmode_imm7<2>:$addr)>; + def : Pat<(truncstorevi8 (v8i16 MQPR:$val), taddrmode_imm7<0>:$addr), + (MVE_VSTRB16 MQPR:$val, taddrmode_imm7<0>:$addr)>; + def : Pat<(truncstorevi8 (v4i32 MQPR:$val), taddrmode_imm7<0>:$addr), + (MVE_VSTRB32 MQPR:$val, taddrmode_imm7<0>:$addr)>; + def : Pat<(truncstorevi16_align2 (v4i32 MQPR:$val), taddrmode_imm7<1>:$addr), + (MVE_VSTRH32 MQPR:$val, taddrmode_imm7<1>:$addr)>; + + def : Pat<(post_truncstvi8 (v8i16 MQPR:$Rt), tGPR:$Rn, t2am_imm7_offset<0>:$addr), + (MVE_VSTRB16_post MQPR:$Rt, tGPR:$Rn, t2am_imm7_offset<0>:$addr)>; + def : Pat<(post_truncstvi8 (v4i32 MQPR:$Rt), tGPR:$Rn, t2am_imm7_offset<0>:$addr), + (MVE_VSTRB32_post MQPR:$Rt, tGPR:$Rn, t2am_imm7_offset<0>:$addr)>; + def : Pat<(post_truncstvi16_align2 (v4i32 MQPR:$Rt), tGPR:$Rn, t2am_imm7_offset<1>:$addr), + (MVE_VSTRH32_post MQPR:$Rt, tGPR:$Rn, t2am_imm7_offset<1>:$addr)>; + + def : Pat<(pre_truncstvi8 (v8i16 MQPR:$Rt), tGPR:$Rn, t2am_imm7_offset<0>:$addr), + (MVE_VSTRB16_pre MQPR:$Rt, tGPR:$Rn, t2am_imm7_offset<0>:$addr)>; + def : Pat<(pre_truncstvi8 (v4i32 MQPR:$Rt), tGPR:$Rn, 
t2am_imm7_offset<0>:$addr), + (MVE_VSTRB32_pre MQPR:$Rt, tGPR:$Rn, t2am_imm7_offset<0>:$addr)>; + def : Pat<(pre_truncstvi16_align2 (v4i32 MQPR:$Rt), tGPR:$Rn, t2am_imm7_offset<1>:$addr), + (MVE_VSTRH32_pre MQPR:$Rt, tGPR:$Rn, t2am_imm7_offset<1>:$addr)>; +} + + +let MinAlignment = 2 in { + def extloadvi16_align2 : PatFrag<(ops node:$ptr), (extloadvi16 node:$ptr)>; + def sextloadvi16_align2 : PatFrag<(ops node:$ptr), (sextloadvi16 node:$ptr)>; + def zextloadvi16_align2 : PatFrag<(ops node:$ptr), (zextloadvi16 node:$ptr)>; } multiclass MVEExtLoad { + string Align, Operand am> { def _Any : Pat<(!cast("v" # DestLanes # "i" # DestElemBits) - (!cast("extloadvi" # SrcElemBits) am:$addr)), + (!cast("extloadvi" # SrcElemBits # Align) am:$addr)), (!cast("MVE_VLDR" # SrcElemType # "U" # DestElemBits) am:$addr)>; def _Z : Pat<(!cast("v" # DestLanes # "i" # DestElemBits) - (!cast("zextloadvi" # SrcElemBits) am:$addr)), + (!cast("zextloadvi" # SrcElemBits # Align) am:$addr)), (!cast("MVE_VLDR" # SrcElemType # "U" # DestElemBits) am:$addr)>; def _S : Pat<(!cast("v" # DestLanes # "i" # DestElemBits) - (!cast("sextloadvi" # SrcElemBits) am:$addr)), + (!cast("sextloadvi" # SrcElemBits # Align) am:$addr)), (!cast("MVE_VLDR" # SrcElemType # "S" # DestElemBits) am:$addr)>; } let Predicates = [HasMVEInt] in { - defm : MVEExtLoad<"4", "32", "8", "B", t2addrmode_imm7<1>>; - defm : MVEExtLoad<"8", "16", "8", "B", t2addrmode_imm7<1>>; - defm : MVEExtLoad<"4", "32", "16", "H", t2addrmode_imm7<2>>; + defm : MVEExtLoad<"4", "32", "8", "B", "", taddrmode_imm7<0>>; + defm : MVEExtLoad<"8", "16", "8", "B", "", taddrmode_imm7<0>>; + defm : MVEExtLoad<"4", "32", "16", "H", "_align2", taddrmode_imm7<1>>; } // Bit convert patterns let Predicates = [HasMVEInt] in { - def : Pat<(v2f64 (bitconvert (v2i64 QPR:$src))), (v2f64 QPR:$src)>; - def : Pat<(v2i64 (bitconvert (v2f64 QPR:$src))), (v2i64 QPR:$src)>; + def : Pat<(v2f64 (bitconvert (v2i64 MQPR:$src))), (v2f64 MQPR:$src)>; + def : Pat<(v2i64 (bitconvert (v2f64 MQPR:$src))), (v2i64 MQPR:$src)>; - def : Pat<(v4i32 (bitconvert (v4f32 QPR:$src))), (v4i32 QPR:$src)>; - def : Pat<(v4f32 (bitconvert (v4i32 QPR:$src))), (v4f32 QPR:$src)>; + def : Pat<(v4i32 (bitconvert (v4f32 MQPR:$src))), (v4i32 MQPR:$src)>; + def : Pat<(v4f32 (bitconvert (v4i32 MQPR:$src))), (v4f32 MQPR:$src)>; - def : Pat<(v8i16 (bitconvert (v8f16 QPR:$src))), (v8i16 QPR:$src)>; - def : Pat<(v8f16 (bitconvert (v8i16 QPR:$src))), (v8f16 QPR:$src)>; + def : Pat<(v8i16 (bitconvert (v8f16 MQPR:$src))), (v8i16 MQPR:$src)>; + def : Pat<(v8f16 (bitconvert (v8i16 MQPR:$src))), (v8f16 MQPR:$src)>; } let Predicates = [IsLE,HasMVEInt] in { - def : Pat<(v2f64 (bitconvert (v4f32 QPR:$src))), (v2f64 QPR:$src)>; - def : Pat<(v2f64 (bitconvert (v4i32 QPR:$src))), (v2f64 QPR:$src)>; - def : Pat<(v2f64 (bitconvert (v8f16 QPR:$src))), (v2f64 QPR:$src)>; - def : Pat<(v2f64 (bitconvert (v8i16 QPR:$src))), (v2f64 QPR:$src)>; - def : Pat<(v2f64 (bitconvert (v16i8 QPR:$src))), (v2f64 QPR:$src)>; - - def : Pat<(v2i64 (bitconvert (v4f32 QPR:$src))), (v2i64 QPR:$src)>; - def : Pat<(v2i64 (bitconvert (v4i32 QPR:$src))), (v2i64 QPR:$src)>; - def : Pat<(v2i64 (bitconvert (v8f16 QPR:$src))), (v2i64 QPR:$src)>; - def : Pat<(v2i64 (bitconvert (v8i16 QPR:$src))), (v2i64 QPR:$src)>; - def : Pat<(v2i64 (bitconvert (v16i8 QPR:$src))), (v2i64 QPR:$src)>; - - def : Pat<(v4f32 (bitconvert (v2f64 QPR:$src))), (v4f32 QPR:$src)>; - def : Pat<(v4f32 (bitconvert (v2i64 QPR:$src))), (v4f32 QPR:$src)>; - def : Pat<(v4f32 (bitconvert (v8f16 QPR:$src))), (v4f32 
QPR:$src)>; - def : Pat<(v4f32 (bitconvert (v8i16 QPR:$src))), (v4f32 QPR:$src)>; - def : Pat<(v4f32 (bitconvert (v16i8 QPR:$src))), (v4f32 QPR:$src)>; - - def : Pat<(v4i32 (bitconvert (v2f64 QPR:$src))), (v4i32 QPR:$src)>; - def : Pat<(v4i32 (bitconvert (v2i64 QPR:$src))), (v4i32 QPR:$src)>; - def : Pat<(v4i32 (bitconvert (v8f16 QPR:$src))), (v4i32 QPR:$src)>; - def : Pat<(v4i32 (bitconvert (v8i16 QPR:$src))), (v4i32 QPR:$src)>; - def : Pat<(v4i32 (bitconvert (v16i8 QPR:$src))), (v4i32 QPR:$src)>; - - def : Pat<(v8f16 (bitconvert (v2f64 QPR:$src))), (v8f16 QPR:$src)>; - def : Pat<(v8f16 (bitconvert (v2i64 QPR:$src))), (v8f16 QPR:$src)>; - def : Pat<(v8f16 (bitconvert (v4f32 QPR:$src))), (v8f16 QPR:$src)>; - def : Pat<(v8f16 (bitconvert (v4i32 QPR:$src))), (v8f16 QPR:$src)>; - def : Pat<(v8f16 (bitconvert (v16i8 QPR:$src))), (v8f16 QPR:$src)>; - - def : Pat<(v8i16 (bitconvert (v2f64 QPR:$src))), (v8i16 QPR:$src)>; - def : Pat<(v8i16 (bitconvert (v2i64 QPR:$src))), (v8i16 QPR:$src)>; - def : Pat<(v8i16 (bitconvert (v4f32 QPR:$src))), (v8i16 QPR:$src)>; - def : Pat<(v8i16 (bitconvert (v4i32 QPR:$src))), (v8i16 QPR:$src)>; - def : Pat<(v8i16 (bitconvert (v16i8 QPR:$src))), (v8i16 QPR:$src)>; - - def : Pat<(v16i8 (bitconvert (v2f64 QPR:$src))), (v16i8 QPR:$src)>; - def : Pat<(v16i8 (bitconvert (v2i64 QPR:$src))), (v16i8 QPR:$src)>; - def : Pat<(v16i8 (bitconvert (v4f32 QPR:$src))), (v16i8 QPR:$src)>; - def : Pat<(v16i8 (bitconvert (v4i32 QPR:$src))), (v16i8 QPR:$src)>; - def : Pat<(v16i8 (bitconvert (v8f16 QPR:$src))), (v16i8 QPR:$src)>; - def : Pat<(v16i8 (bitconvert (v8i16 QPR:$src))), (v16i8 QPR:$src)>; + def : Pat<(v2f64 (bitconvert (v4f32 MQPR:$src))), (v2f64 MQPR:$src)>; + def : Pat<(v2f64 (bitconvert (v4i32 MQPR:$src))), (v2f64 MQPR:$src)>; + def : Pat<(v2f64 (bitconvert (v8f16 MQPR:$src))), (v2f64 MQPR:$src)>; + def : Pat<(v2f64 (bitconvert (v8i16 MQPR:$src))), (v2f64 MQPR:$src)>; + def : Pat<(v2f64 (bitconvert (v16i8 MQPR:$src))), (v2f64 MQPR:$src)>; + + def : Pat<(v2i64 (bitconvert (v4f32 MQPR:$src))), (v2i64 MQPR:$src)>; + def : Pat<(v2i64 (bitconvert (v4i32 MQPR:$src))), (v2i64 MQPR:$src)>; + def : Pat<(v2i64 (bitconvert (v8f16 MQPR:$src))), (v2i64 MQPR:$src)>; + def : Pat<(v2i64 (bitconvert (v8i16 MQPR:$src))), (v2i64 MQPR:$src)>; + def : Pat<(v2i64 (bitconvert (v16i8 MQPR:$src))), (v2i64 MQPR:$src)>; + + def : Pat<(v4f32 (bitconvert (v2f64 MQPR:$src))), (v4f32 MQPR:$src)>; + def : Pat<(v4f32 (bitconvert (v2i64 MQPR:$src))), (v4f32 MQPR:$src)>; + def : Pat<(v4f32 (bitconvert (v8f16 MQPR:$src))), (v4f32 MQPR:$src)>; + def : Pat<(v4f32 (bitconvert (v8i16 MQPR:$src))), (v4f32 MQPR:$src)>; + def : Pat<(v4f32 (bitconvert (v16i8 MQPR:$src))), (v4f32 MQPR:$src)>; + + def : Pat<(v4i32 (bitconvert (v2f64 MQPR:$src))), (v4i32 MQPR:$src)>; + def : Pat<(v4i32 (bitconvert (v2i64 MQPR:$src))), (v4i32 MQPR:$src)>; + def : Pat<(v4i32 (bitconvert (v8f16 MQPR:$src))), (v4i32 MQPR:$src)>; + def : Pat<(v4i32 (bitconvert (v8i16 MQPR:$src))), (v4i32 MQPR:$src)>; + def : Pat<(v4i32 (bitconvert (v16i8 MQPR:$src))), (v4i32 MQPR:$src)>; + + def : Pat<(v8f16 (bitconvert (v2f64 MQPR:$src))), (v8f16 MQPR:$src)>; + def : Pat<(v8f16 (bitconvert (v2i64 MQPR:$src))), (v8f16 MQPR:$src)>; + def : Pat<(v8f16 (bitconvert (v4f32 MQPR:$src))), (v8f16 MQPR:$src)>; + def : Pat<(v8f16 (bitconvert (v4i32 MQPR:$src))), (v8f16 MQPR:$src)>; + def : Pat<(v8f16 (bitconvert (v16i8 MQPR:$src))), (v8f16 MQPR:$src)>; + + def : Pat<(v8i16 (bitconvert (v2f64 MQPR:$src))), (v8i16 MQPR:$src)>; + def : Pat<(v8i16 (bitconvert (v2i64 
MQPR:$src))), (v8i16 MQPR:$src)>; + def : Pat<(v8i16 (bitconvert (v4f32 MQPR:$src))), (v8i16 MQPR:$src)>; + def : Pat<(v8i16 (bitconvert (v4i32 MQPR:$src))), (v8i16 MQPR:$src)>; + def : Pat<(v8i16 (bitconvert (v16i8 MQPR:$src))), (v8i16 MQPR:$src)>; + + def : Pat<(v16i8 (bitconvert (v2f64 MQPR:$src))), (v16i8 MQPR:$src)>; + def : Pat<(v16i8 (bitconvert (v2i64 MQPR:$src))), (v16i8 MQPR:$src)>; + def : Pat<(v16i8 (bitconvert (v4f32 MQPR:$src))), (v16i8 MQPR:$src)>; + def : Pat<(v16i8 (bitconvert (v4i32 MQPR:$src))), (v16i8 MQPR:$src)>; + def : Pat<(v16i8 (bitconvert (v8f16 MQPR:$src))), (v16i8 MQPR:$src)>; + def : Pat<(v16i8 (bitconvert (v8i16 MQPR:$src))), (v16i8 MQPR:$src)>; +} + +let Predicates = [IsBE,HasMVEInt] in { + def : Pat<(v2f64 (bitconvert (v4f32 MQPR:$src))), (v2f64 (MVE_VREV64_32 MQPR:$src))>; + def : Pat<(v2f64 (bitconvert (v4i32 MQPR:$src))), (v2f64 (MVE_VREV64_32 MQPR:$src))>; + def : Pat<(v2f64 (bitconvert (v8f16 MQPR:$src))), (v2f64 (MVE_VREV64_16 MQPR:$src))>; + def : Pat<(v2f64 (bitconvert (v8i16 MQPR:$src))), (v2f64 (MVE_VREV64_16 MQPR:$src))>; + def : Pat<(v2f64 (bitconvert (v16i8 MQPR:$src))), (v2f64 (MVE_VREV64_8 MQPR:$src))>; + + def : Pat<(v2i64 (bitconvert (v4f32 MQPR:$src))), (v2i64 (MVE_VREV64_32 MQPR:$src))>; + def : Pat<(v2i64 (bitconvert (v4i32 MQPR:$src))), (v2i64 (MVE_VREV64_32 MQPR:$src))>; + def : Pat<(v2i64 (bitconvert (v8f16 MQPR:$src))), (v2i64 (MVE_VREV64_16 MQPR:$src))>; + def : Pat<(v2i64 (bitconvert (v8i16 MQPR:$src))), (v2i64 (MVE_VREV64_16 MQPR:$src))>; + def : Pat<(v2i64 (bitconvert (v16i8 MQPR:$src))), (v2i64 (MVE_VREV64_8 MQPR:$src))>; + + def : Pat<(v4f32 (bitconvert (v2f64 MQPR:$src))), (v4f32 (MVE_VREV64_32 MQPR:$src))>; + def : Pat<(v4f32 (bitconvert (v2i64 MQPR:$src))), (v4f32 (MVE_VREV64_32 MQPR:$src))>; + def : Pat<(v4f32 (bitconvert (v8f16 MQPR:$src))), (v4f32 (MVE_VREV32_16 MQPR:$src))>; + def : Pat<(v4f32 (bitconvert (v8i16 MQPR:$src))), (v4f32 (MVE_VREV32_16 MQPR:$src))>; + def : Pat<(v4f32 (bitconvert (v16i8 MQPR:$src))), (v4f32 (MVE_VREV32_8 MQPR:$src))>; + + def : Pat<(v4i32 (bitconvert (v2f64 MQPR:$src))), (v4i32 (MVE_VREV64_32 MQPR:$src))>; + def : Pat<(v4i32 (bitconvert (v2i64 MQPR:$src))), (v4i32 (MVE_VREV64_32 MQPR:$src))>; + def : Pat<(v4i32 (bitconvert (v8f16 MQPR:$src))), (v4i32 (MVE_VREV32_16 MQPR:$src))>; + def : Pat<(v4i32 (bitconvert (v8i16 MQPR:$src))), (v4i32 (MVE_VREV32_16 MQPR:$src))>; + def : Pat<(v4i32 (bitconvert (v16i8 MQPR:$src))), (v4i32 (MVE_VREV32_8 MQPR:$src))>; + + def : Pat<(v8f16 (bitconvert (v2f64 MQPR:$src))), (v8f16 (MVE_VREV64_16 MQPR:$src))>; + def : Pat<(v8f16 (bitconvert (v2i64 MQPR:$src))), (v8f16 (MVE_VREV64_16 MQPR:$src))>; + def : Pat<(v8f16 (bitconvert (v4f32 MQPR:$src))), (v8f16 (MVE_VREV32_16 MQPR:$src))>; + def : Pat<(v8f16 (bitconvert (v4i32 MQPR:$src))), (v8f16 (MVE_VREV32_16 MQPR:$src))>; + def : Pat<(v8f16 (bitconvert (v16i8 MQPR:$src))), (v8f16 (MVE_VREV16_8 MQPR:$src))>; + + def : Pat<(v8i16 (bitconvert (v2f64 MQPR:$src))), (v8i16 (MVE_VREV64_16 MQPR:$src))>; + def : Pat<(v8i16 (bitconvert (v2i64 MQPR:$src))), (v8i16 (MVE_VREV64_16 MQPR:$src))>; + def : Pat<(v8i16 (bitconvert (v4f32 MQPR:$src))), (v8i16 (MVE_VREV32_16 MQPR:$src))>; + def : Pat<(v8i16 (bitconvert (v4i32 MQPR:$src))), (v8i16 (MVE_VREV32_16 MQPR:$src))>; + def : Pat<(v8i16 (bitconvert (v16i8 MQPR:$src))), (v8i16 (MVE_VREV16_8 MQPR:$src))>; + + def : Pat<(v16i8 (bitconvert (v2f64 MQPR:$src))), (v16i8 (MVE_VREV64_8 MQPR:$src))>; + def : Pat<(v16i8 (bitconvert (v2i64 MQPR:$src))), (v16i8 (MVE_VREV64_8 MQPR:$src))>; + def 
: Pat<(v16i8 (bitconvert (v4f32 MQPR:$src))), (v16i8 (MVE_VREV32_8 MQPR:$src))>; + def : Pat<(v16i8 (bitconvert (v4i32 MQPR:$src))), (v16i8 (MVE_VREV32_8 MQPR:$src))>; + def : Pat<(v16i8 (bitconvert (v8f16 MQPR:$src))), (v16i8 (MVE_VREV16_8 MQPR:$src))>; + def : Pat<(v16i8 (bitconvert (v8i16 MQPR:$src))), (v16i8 (MVE_VREV16_8 MQPR:$src))>; } diff --git a/lib/Target/ARM/ARMInstrNEON.td b/lib/Target/ARM/ARMInstrNEON.td index 806681df102c..60ca92e58041 100644 --- a/lib/Target/ARM/ARMInstrNEON.td +++ b/lib/Target/ARM/ARMInstrNEON.td @@ -15,22 +15,22 @@ // NEON-specific Operands. //===----------------------------------------------------------------------===// def nModImm : Operand { - let PrintMethod = "printNEONModImmOperand"; + let PrintMethod = "printVMOVModImmOperand"; } def nImmSplatI8AsmOperand : AsmOperandClass { let Name = "NEONi8splat"; } def nImmSplatI8 : Operand { - let PrintMethod = "printNEONModImmOperand"; + let PrintMethod = "printVMOVModImmOperand"; let ParserMatchClass = nImmSplatI8AsmOperand; } def nImmSplatI16AsmOperand : AsmOperandClass { let Name = "NEONi16splat"; } def nImmSplatI16 : Operand { - let PrintMethod = "printNEONModImmOperand"; + let PrintMethod = "printVMOVModImmOperand"; let ParserMatchClass = nImmSplatI16AsmOperand; } def nImmSplatI32AsmOperand : AsmOperandClass { let Name = "NEONi32splat"; } def nImmSplatI32 : Operand { - let PrintMethod = "printNEONModImmOperand"; + let PrintMethod = "printVMOVModImmOperand"; let ParserMatchClass = nImmSplatI32AsmOperand; } def nImmSplatNotI16AsmOperand : AsmOperandClass { let Name = "NEONi16splatNot"; } @@ -43,7 +43,7 @@ def nImmSplatNotI32 : Operand { } def nImmVMOVI32AsmOperand : AsmOperandClass { let Name = "NEONi32vmov"; } def nImmVMOVI32 : Operand { - let PrintMethod = "printNEONModImmOperand"; + let PrintMethod = "printVMOVModImmOperand"; let ParserMatchClass = nImmVMOVI32AsmOperand; } @@ -62,18 +62,18 @@ class nImmVINVIAsmOperandReplicate } class nImmVMOVIReplicate : Operand { - let PrintMethod = "printNEONModImmOperand"; + let PrintMethod = "printVMOVModImmOperand"; let ParserMatchClass = nImmVMOVIAsmOperandReplicate; } class nImmVINVIReplicate : Operand { - let PrintMethod = "printNEONModImmOperand"; + let PrintMethod = "printVMOVModImmOperand"; let ParserMatchClass = nImmVINVIAsmOperandReplicate; } def nImmVMOVI32NegAsmOperand : AsmOperandClass { let Name = "NEONi32vmovNeg"; } def nImmVMOVI32Neg : Operand { - let PrintMethod = "printNEONModImmOperand"; + let PrintMethod = "printVMOVModImmOperand"; let ParserMatchClass = nImmVMOVI32NegAsmOperand; } def nImmVMOVF32 : Operand { @@ -82,7 +82,7 @@ def nImmVMOVF32 : Operand { } def nImmSplatI64AsmOperand : AsmOperandClass { let Name = "NEONi64splat"; } def nImmSplatI64 : Operand { - let PrintMethod = "printNEONModImmOperand"; + let PrintMethod = "printVMOVModImmOperand"; let ParserMatchClass = nImmSplatI64AsmOperand; } @@ -478,20 +478,8 @@ def non_word_alignedstore : PatFrag<(ops node:$val, node:$ptr), // NEON-specific DAG Nodes. 
//===----------------------------------------------------------------------===// -def SDTARMVCMP : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisSameAs<1, 2>]>; -def SDTARMVCMPZ : SDTypeProfile<1, 1, []>; - -def NEONvceq : SDNode<"ARMISD::VCEQ", SDTARMVCMP>; -def NEONvceqz : SDNode<"ARMISD::VCEQZ", SDTARMVCMPZ>; -def NEONvcge : SDNode<"ARMISD::VCGE", SDTARMVCMP>; -def NEONvcgez : SDNode<"ARMISD::VCGEZ", SDTARMVCMPZ>; -def NEONvclez : SDNode<"ARMISD::VCLEZ", SDTARMVCMPZ>; -def NEONvcgeu : SDNode<"ARMISD::VCGEU", SDTARMVCMP>; -def NEONvcgt : SDNode<"ARMISD::VCGT", SDTARMVCMP>; -def NEONvcgtz : SDNode<"ARMISD::VCGTZ", SDTARMVCMPZ>; -def NEONvcltz : SDNode<"ARMISD::VCLTZ", SDTARMVCMPZ>; -def NEONvcgtu : SDNode<"ARMISD::VCGTU", SDTARMVCMP>; -def NEONvtst : SDNode<"ARMISD::VTST", SDTARMVCMP>; +def SDTARMVTST : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisSameAs<1, 2>]>; +def NEONvtst : SDNode<"ARMISD::VTST", SDTARMVTST>; // Types for vector shift by immediates. The "SHX" version is for long and // narrow operations where the source and destination vectors have different @@ -559,14 +547,14 @@ def NEONvtbl2 : SDNode<"ARMISD::VTBL2", SDTARMVTBL2>; def NEONimmAllZerosV: PatLeaf<(ARMvmovImm (i32 timm)), [{ ConstantSDNode *ConstVal = cast(N->getOperand(0)); unsigned EltBits = 0; - uint64_t EltVal = ARM_AM::decodeNEONModImm(ConstVal->getZExtValue(), EltBits); + uint64_t EltVal = ARM_AM::decodeVMOVModImm(ConstVal->getZExtValue(), EltBits); return (EltBits == 32 && EltVal == 0); }]>; def NEONimmAllOnesV: PatLeaf<(ARMvmovImm (i32 timm)), [{ ConstantSDNode *ConstVal = cast(N->getOperand(0)); unsigned EltBits = 0; - uint64_t EltVal = ARM_AM::decodeNEONModImm(ConstVal->getZExtValue(), EltBits); + uint64_t EltVal = ARM_AM::decodeVMOVModImm(ConstVal->getZExtValue(), EltBits); return (EltBits == 8 && EltVal == 0xff); }]>; @@ -3326,30 +3314,30 @@ class N2VCvtQ op11_8, bit op7, bit op4, // source operand element sizes of 8, 16 and 32 bits: multiclass N2V_QHS_cmp op24_23, bits<2> op21_20, bits<2> op17_16, bits<5> op11_7, bit op4, string opc, string Dt, - string asm, SDNode OpNode> { + string asm, int fc> { // 64-bit vector types. def v8i8 : N2V; + [(set DPR:$Vd, (v8i8 (ARMvcmpz (v8i8 DPR:$Vm), (i32 fc))))]>; def v4i16 : N2V; + [(set DPR:$Vd, (v4i16 (ARMvcmpz (v4i16 DPR:$Vm), (i32 fc))))]>; def v2i32 : N2V; + [(set DPR:$Vd, (v2i32 (ARMvcmpz (v2i32 DPR:$Vm), (i32 fc))))]>; def v2f32 : N2V { + [(set DPR:$Vd, (v2i32 (ARMvcmpz (v2f32 DPR:$Vm), (i32 fc))))]> { let Inst{10} = 1; // overwrite F = 1 } def v4f16 : N2V, + [(set DPR:$Vd, (v4i16 (ARMvcmpz (v4f16 DPR:$Vm), (i32 fc))))]>, Requires<[HasNEON,HasFullFP16]> { let Inst{10} = 1; // overwrite F = 1 } @@ -3358,30 +3346,83 @@ multiclass N2V_QHS_cmp op24_23, bits<2> op21_20, bits<2> op17_16, def v16i8 : N2V; + [(set QPR:$Vd, (v16i8 (ARMvcmpz (v16i8 QPR:$Vm), (i32 fc))))]>; def v8i16 : N2V; + [(set QPR:$Vd, (v8i16 (ARMvcmpz (v8i16 QPR:$Vm), (i32 fc))))]>; def v4i32 : N2V; + [(set QPR:$Vd, (v4i32 (ARMvcmpz (v4i32 QPR:$Vm), (i32 fc))))]>; def v4f32 : N2V { + [(set QPR:$Vd, (v4i32 (ARMvcmpz (v4f32 QPR:$Vm), (i32 fc))))]> { let Inst{10} = 1; // overwrite F = 1 } def v8f16 : N2V, + [(set QPR:$Vd, (v8i16 (ARMvcmpz (v8f16 QPR:$Vm), (i32 fc))))]>, Requires<[HasNEON,HasFullFP16]> { let Inst{10} = 1; // overwrite F = 1 } } +// Neon 3-register comparisons. +class N3VQ_cmp op21_20, bits<4> op11_8, bit op4, + InstrItinClass itin, string OpcodeStr, string Dt, + ValueType ResTy, ValueType OpTy, int fc, bit Commutable> + : N3V { + // All of these have a two-operand InstAlias. 
+ let TwoOperandAliasConstraint = "$Vn = $Vd"; + let isCommutable = Commutable; +} + +class N3VD_cmp op21_20, bits<4> op11_8, bit op4, + InstrItinClass itin, string OpcodeStr, string Dt, + ValueType ResTy, ValueType OpTy, int fc, bit Commutable> + : N3V { + // All of these have a two-operand InstAlias. + let TwoOperandAliasConstraint = "$Vn = $Vd"; + let isCommutable = Commutable; +} + +multiclass N3V_QHS_cmp op11_8, bit op4, + InstrItinClass itinD16, InstrItinClass itinD32, + InstrItinClass itinQ16, InstrItinClass itinQ32, + string OpcodeStr, string Dt, + int fc, bit Commutable = 0> { + // 64-bit vector types. + def v8i8 : N3VD_cmp; + def v4i16 : N3VD_cmp; + def v2i32 : N3VD_cmp; + + // 128-bit vector types. + def v16i8 : N3VQ_cmp; + def v8i16 : N3VQ_cmp; + def v4i32 : N3VQ_cmp; +} + // Neon 2-register vector intrinsics, // element sizes of 8, 16 and 32 bits: @@ -5026,67 +5067,67 @@ def : Pat<(v2i32 (trunc (ARMvshruImm (sub (v2i64 QPR:$Vn), QPR:$Vm), 32))), // Vector Comparisons. // VCEQ : Vector Compare Equal -defm VCEQ : N3V_QHS<1, 0, 0b1000, 1, IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, - IIC_VSUBi4Q, "vceq", "i", NEONvceq, 1>; -def VCEQfd : N3VD<0,0,0b00,0b1110,0, IIC_VBIND, "vceq", "f32", v2i32, v2f32, - NEONvceq, 1>; -def VCEQfq : N3VQ<0,0,0b00,0b1110,0, IIC_VBINQ, "vceq", "f32", v4i32, v4f32, - NEONvceq, 1>; -def VCEQhd : N3VD<0,0,0b01,0b1110,0, IIC_VBIND, "vceq", "f16", v4i16, v4f16, - NEONvceq, 1>, +defm VCEQ : N3V_QHS_cmp<1, 0, 0b1000, 1, IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, + IIC_VSUBi4Q, "vceq", "i", 0, 1>; +def VCEQfd : N3VD_cmp<0,0,0b00,0b1110,0, IIC_VBIND, "vceq", "f32", v2i32, v2f32, + 0, 1>; +def VCEQfq : N3VQ_cmp<0,0,0b00,0b1110,0, IIC_VBINQ, "vceq", "f32", v4i32, v4f32, + 0, 1>; +def VCEQhd : N3VD_cmp<0,0,0b01,0b1110,0, IIC_VBIND, "vceq", "f16", v4i16, v4f16, + 0, 1>, Requires<[HasNEON, HasFullFP16]>; -def VCEQhq : N3VQ<0,0,0b01,0b1110,0, IIC_VBINQ, "vceq", "f16", v8i16, v8f16, - NEONvceq, 1>, +def VCEQhq : N3VQ_cmp<0,0,0b01,0b1110,0, IIC_VBINQ, "vceq", "f16", v8i16, v8f16, + 0, 1>, Requires<[HasNEON, HasFullFP16]>; let TwoOperandAliasConstraint = "$Vm = $Vd" in defm VCEQz : N2V_QHS_cmp<0b11, 0b11, 0b01, 0b00010, 0, "vceq", "i", - "$Vd, $Vm, #0", NEONvceqz>; + "$Vd, $Vm, #0", 0>; // VCGE : Vector Compare Greater Than or Equal -defm VCGEs : N3V_QHS<0, 0, 0b0011, 1, IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, - IIC_VSUBi4Q, "vcge", "s", NEONvcge, 0>; -defm VCGEu : N3V_QHS<1, 0, 0b0011, 1, IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, - IIC_VSUBi4Q, "vcge", "u", NEONvcgeu, 0>; -def VCGEfd : N3VD<1,0,0b00,0b1110,0, IIC_VBIND, "vcge", "f32", v2i32, v2f32, - NEONvcge, 0>; -def VCGEfq : N3VQ<1,0,0b00,0b1110,0, IIC_VBINQ, "vcge", "f32", v4i32, v4f32, - NEONvcge, 0>; -def VCGEhd : N3VD<1,0,0b01,0b1110,0, IIC_VBIND, "vcge", "f16", v4i16, v4f16, - NEONvcge, 0>, +defm VCGEs : N3V_QHS_cmp<0, 0, 0b0011, 1, IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, + IIC_VSUBi4Q, "vcge", "s", 10, 0>; +defm VCGEu : N3V_QHS_cmp<1, 0, 0b0011, 1, IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, + IIC_VSUBi4Q, "vcge", "u", 2, 0>; +def VCGEfd : N3VD_cmp<1,0,0b00,0b1110,0, IIC_VBIND, "vcge", "f32", v2i32, v2f32, + 10, 0>; +def VCGEfq : N3VQ_cmp<1,0,0b00,0b1110,0, IIC_VBINQ, "vcge", "f32", v4i32, v4f32, + 10, 0>; +def VCGEhd : N3VD_cmp<1,0,0b01,0b1110,0, IIC_VBIND, "vcge", "f16", v4i16, v4f16, + 10, 0>, Requires<[HasNEON, HasFullFP16]>; -def VCGEhq : N3VQ<1,0,0b01,0b1110,0, IIC_VBINQ, "vcge", "f16", v8i16, v8f16, - NEONvcge, 0>, +def VCGEhq : N3VQ_cmp<1,0,0b01,0b1110,0, IIC_VBINQ, "vcge", "f16", v8i16, v8f16, + 10, 0>, 
Requires<[HasNEON, HasFullFP16]>; let TwoOperandAliasConstraint = "$Vm = $Vd" in { defm VCGEz : N2V_QHS_cmp<0b11, 0b11, 0b01, 0b00001, 0, "vcge", "s", - "$Vd, $Vm, #0", NEONvcgez>; + "$Vd, $Vm, #0", 10>; defm VCLEz : N2V_QHS_cmp<0b11, 0b11, 0b01, 0b00011, 0, "vcle", "s", - "$Vd, $Vm, #0", NEONvclez>; + "$Vd, $Vm, #0", 13>; } // VCGT : Vector Compare Greater Than -defm VCGTs : N3V_QHS<0, 0, 0b0011, 0, IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, - IIC_VSUBi4Q, "vcgt", "s", NEONvcgt, 0>; -defm VCGTu : N3V_QHS<1, 0, 0b0011, 0, IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, - IIC_VSUBi4Q, "vcgt", "u", NEONvcgtu, 0>; -def VCGTfd : N3VD<1,0,0b10,0b1110,0, IIC_VBIND, "vcgt", "f32", v2i32, v2f32, - NEONvcgt, 0>; -def VCGTfq : N3VQ<1,0,0b10,0b1110,0, IIC_VBINQ, "vcgt", "f32", v4i32, v4f32, - NEONvcgt, 0>; -def VCGThd : N3VD<1,0,0b11,0b1110,0, IIC_VBIND, "vcgt", "f16", v4i16, v4f16, - NEONvcgt, 0>, +defm VCGTs : N3V_QHS_cmp<0, 0, 0b0011, 0, IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, + IIC_VSUBi4Q, "vcgt", "s", 12, 0>; +defm VCGTu : N3V_QHS_cmp<1, 0, 0b0011, 0, IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, + IIC_VSUBi4Q, "vcgt", "u", 8, 0>; +def VCGTfd : N3VD_cmp<1,0,0b10,0b1110,0, IIC_VBIND, "vcgt", "f32", v2i32, v2f32, + 12, 0>; +def VCGTfq : N3VQ_cmp<1,0,0b10,0b1110,0, IIC_VBINQ, "vcgt", "f32", v4i32, v4f32, + 12, 0>; +def VCGThd : N3VD_cmp<1,0,0b11,0b1110,0, IIC_VBIND, "vcgt", "f16", v4i16, v4f16, + 12, 0>, Requires<[HasNEON, HasFullFP16]>; -def VCGThq : N3VQ<1,0,0b11,0b1110,0, IIC_VBINQ, "vcgt", "f16", v8i16, v8f16, - NEONvcgt, 0>, +def VCGThq : N3VQ_cmp<1,0,0b11,0b1110,0, IIC_VBINQ, "vcgt", "f16", v8i16, v8f16, + 12, 0>, Requires<[HasNEON, HasFullFP16]>; let TwoOperandAliasConstraint = "$Vm = $Vd" in { defm VCGTz : N2V_QHS_cmp<0b11, 0b11, 0b01, 0b00000, 0, "vcgt", "s", - "$Vd, $Vm, #0", NEONvcgtz>; + "$Vd, $Vm, #0", 12>; defm VCLTz : N2V_QHS_cmp<0b11, 0b11, 0b01, 0b00100, 0, "vclt", "s", - "$Vd, $Vm, #0", NEONvcltz>; + "$Vd, $Vm, #0", 11>; } // VACGE : Vector Absolute Compare Greater Than or Equal (aka VCAGE) diff --git a/lib/Target/ARM/ARMInstrThumb.td b/lib/Target/ARM/ARMInstrThumb.td index cfeb13c6acb6..18bcbda44580 100644 --- a/lib/Target/ARM/ARMInstrThumb.td +++ b/lib/Target/ARM/ARMInstrThumb.td @@ -565,6 +565,13 @@ let isCall = 1, 4, IIC_Br, [(ARMcall_nolink tGPR:$func)]>, Requires<[IsThumb, IsThumb1Only]>, Sched<[WriteBr]>; + + // Also used for Thumb2 + // push lr before the call + def tBL_PUSHLR : tPseudoInst<(outs), (ins GPRlr:$ra, pred:$p, thumb_bl_target:$func), + 4, IIC_Br, + []>, + Requires<[IsThumb]>, Sched<[WriteBr]>; } let isBranch = 1, isTerminator = 1, isBarrier = 1 in { @@ -592,6 +599,7 @@ let isBranch = 1, isTerminator = 1, isBarrier = 1 in { [(ARMbrjt tGPR:$target, tjumptable:$jt)]>, Sched<[WriteBrTbl]> { let Size = 2; + let isNotDuplicable = 1; list Predicates = [IsThumb, IsThumb1Only]; } } @@ -1362,6 +1370,12 @@ let hasPostISelHook = 1, Defs = [CPSR] in { [(set tGPR:$Rd, CPSR, (ARMsubc 0, tGPR:$Rn))]>, Requires<[IsThumb1Only]>, Sched<[WriteALU]>; + + def tLSLSri : tPseudoInst<(outs tGPR:$Rd), (ins tGPR:$Rn, imm0_31:$imm5), + 2, IIC_iALUr, + [(set tGPR:$Rd, CPSR, (ARMlsls tGPR:$Rn, imm0_31:$imm5))]>, + Requires<[IsThumb1Only]>, + Sched<[WriteALU]>; } @@ -1465,7 +1479,7 @@ def tLEApcrelJT : tPseudoInst<(outs tGPR:$Rd), // Thumb-1 doesn't have the TBB or TBH instructions, but we can synthesize them // and make use of the same compressed jump table format as Thumb-2. 
let Size = 2, isBranch = 1, isTerminator = 1, isBarrier = 1, - isIndirectBranch = 1 in { + isIndirectBranch = 1, isNotDuplicable = 1 in { def tTBB_JT : tPseudoInst<(outs), (ins tGPRwithpc:$base, tGPR:$index, i32imm:$jt, i32imm:$pclbl), 0, IIC_Br, []>, Sched<[WriteBr]>; diff --git a/lib/Target/ARM/ARMInstrThumb2.td b/lib/Target/ARM/ARMInstrThumb2.td index 7cbfaba7a8eb..25a45b39fa0c 100644 --- a/lib/Target/ARM/ARMInstrThumb2.td +++ b/lib/Target/ARM/ARMInstrThumb2.td @@ -45,7 +45,8 @@ def mve_shift_imm : AsmOperandClass { let RenderMethod = "addImmOperands"; let DiagnosticString = "operand must be an immediate in the range [1,32]"; } -def long_shift : Operand { +def long_shift : Operand, + ImmLeaf 0 && Imm <= 32; }]> { let ParserMatchClass = mve_shift_imm; let DecoderMethod = "DecodeLongShiftOperand"; } @@ -2394,6 +2395,23 @@ def : Thumb2DSPPat<(int_arm_qadd(int_arm_qadd rGPR:$Rm, rGPR:$Rm), rGPR:$Rn), def : Thumb2DSPPat<(int_arm_qsub rGPR:$Rm, (int_arm_qadd rGPR:$Rn, rGPR:$Rn)), (t2QDSUB rGPR:$Rm, rGPR:$Rn)>; +def : Thumb2DSPPat<(saddsat rGPR:$Rm, rGPR:$Rn), + (t2QADD rGPR:$Rm, rGPR:$Rn)>; +def : Thumb2DSPPat<(ssubsat rGPR:$Rm, rGPR:$Rn), + (t2QSUB rGPR:$Rm, rGPR:$Rn)>; +def : Thumb2DSPPat<(saddsat(saddsat rGPR:$Rm, rGPR:$Rm), rGPR:$Rn), + (t2QDADD rGPR:$Rm, rGPR:$Rn)>; +def : Thumb2DSPPat<(ssubsat rGPR:$Rm, (saddsat rGPR:$Rn, rGPR:$Rn)), + (t2QDSUB rGPR:$Rm, rGPR:$Rn)>; +def : Thumb2DSPPat<(ARMqadd8b rGPR:$Rm, rGPR:$Rn), + (t2QADD8 rGPR:$Rm, rGPR:$Rn)>; +def : Thumb2DSPPat<(ARMqsub8b rGPR:$Rm, rGPR:$Rn), + (t2QSUB8 rGPR:$Rm, rGPR:$Rn)>; +def : Thumb2DSPPat<(ARMqadd16b rGPR:$Rm, rGPR:$Rn), + (t2QADD16 rGPR:$Rm, rGPR:$Rn)>; +def : Thumb2DSPPat<(ARMqsub16b rGPR:$Rm, rGPR:$Rn), + (t2QSUB16 rGPR:$Rm, rGPR:$Rn)>; + // Signed/Unsigned add/subtract def t2SASX : T2I_pam_intrinsics<0b010, 0b0000, "sasx", int_arm_sasx>; @@ -4085,7 +4103,7 @@ def t2LDRpci_pic : PseudoInst<(outs rGPR:$dst), (ins i32imm:$addr, pclabel:$cp), // Pseudo isntruction that combines movs + predicated rsbmi // to implement integer ABS -let usesCustomInserter = 1, Defs = [CPSR] in { +let usesCustomInserter = 1, Defs = [CPSR], hasNoSchedulingInfo = 1 in { def t2ABS : PseudoInst<(outs rGPR:$dst), (ins rGPR:$src), NoItinerary, []>, Requires<[IsThumb2]>; } @@ -4175,15 +4193,15 @@ multiclass t2LdStCop op31_28, bit load, bit Dbit, string asm, list } let DecoderNamespace = "Thumb2CoProc" in { -defm t2LDC : t2LdStCop<0b1110, 1, 0, "ldc", [(int_arm_ldc imm:$cop, imm:$CRd, addrmode5:$addr)]>; -defm t2LDCL : t2LdStCop<0b1110, 1, 1, "ldcl", [(int_arm_ldcl imm:$cop, imm:$CRd, addrmode5:$addr)]>; -defm t2LDC2 : t2LdStCop<0b1111, 1, 0, "ldc2", [(int_arm_ldc2 imm:$cop, imm:$CRd, addrmode5:$addr)]>, Requires<[PreV8,IsThumb2]>; -defm t2LDC2L : t2LdStCop<0b1111, 1, 1, "ldc2l", [(int_arm_ldc2l imm:$cop, imm:$CRd, addrmode5:$addr)]>, Requires<[PreV8,IsThumb2]>; +defm t2LDC : t2LdStCop<0b1110, 1, 0, "ldc", [(int_arm_ldc timm:$cop, timm:$CRd, addrmode5:$addr)]>; +defm t2LDCL : t2LdStCop<0b1110, 1, 1, "ldcl", [(int_arm_ldcl timm:$cop, timm:$CRd, addrmode5:$addr)]>; +defm t2LDC2 : t2LdStCop<0b1111, 1, 0, "ldc2", [(int_arm_ldc2 timm:$cop, timm:$CRd, addrmode5:$addr)]>, Requires<[PreV8,IsThumb2]>; +defm t2LDC2L : t2LdStCop<0b1111, 1, 1, "ldc2l", [(int_arm_ldc2l timm:$cop, timm:$CRd, addrmode5:$addr)]>, Requires<[PreV8,IsThumb2]>; -defm t2STC : t2LdStCop<0b1110, 0, 0, "stc", [(int_arm_stc imm:$cop, imm:$CRd, addrmode5:$addr)]>; -defm t2STCL : t2LdStCop<0b1110, 0, 1, "stcl", [(int_arm_stcl imm:$cop, imm:$CRd, addrmode5:$addr)]>; -defm t2STC2 : 
t2LdStCop<0b1111, 0, 0, "stc2", [(int_arm_stc2 imm:$cop, imm:$CRd, addrmode5:$addr)]>, Requires<[PreV8,IsThumb2]>; -defm t2STC2L : t2LdStCop<0b1111, 0, 1, "stc2l", [(int_arm_stc2l imm:$cop, imm:$CRd, addrmode5:$addr)]>, Requires<[PreV8,IsThumb2]>; +defm t2STC : t2LdStCop<0b1110, 0, 0, "stc", [(int_arm_stc timm:$cop, timm:$CRd, addrmode5:$addr)]>; +defm t2STCL : t2LdStCop<0b1110, 0, 1, "stcl", [(int_arm_stcl timm:$cop, timm:$CRd, addrmode5:$addr)]>; +defm t2STC2 : t2LdStCop<0b1111, 0, 0, "stc2", [(int_arm_stc2 timm:$cop, timm:$CRd, addrmode5:$addr)]>, Requires<[PreV8,IsThumb2]>; +defm t2STC2L : t2LdStCop<0b1111, 0, 1, "stc2l", [(int_arm_stc2l timm:$cop, timm:$CRd, addrmode5:$addr)]>, Requires<[PreV8,IsThumb2]>; } @@ -4368,8 +4386,8 @@ def t2MCR : t2MovRCopro<0b1110, "mcr", 0, (outs), (ins p_imm:$cop, imm0_7:$opc1, GPR:$Rt, c_imm:$CRn, c_imm:$CRm, imm0_7:$opc2), - [(int_arm_mcr imm:$cop, imm:$opc1, GPR:$Rt, imm:$CRn, - imm:$CRm, imm:$opc2)]>, + [(int_arm_mcr timm:$cop, timm:$opc1, GPR:$Rt, timm:$CRn, + timm:$CRm, timm:$opc2)]>, ComplexDeprecationPredicate<"MCR">; def : t2InstAlias<"mcr${p} $cop, $opc1, $Rt, $CRn, $CRm", (t2MCR p_imm:$cop, imm0_7:$opc1, GPR:$Rt, c_imm:$CRn, @@ -4377,8 +4395,8 @@ def : t2InstAlias<"mcr${p} $cop, $opc1, $Rt, $CRn, $CRm", def t2MCR2 : t2MovRCopro<0b1111, "mcr2", 0, (outs), (ins p_imm:$cop, imm0_7:$opc1, GPR:$Rt, c_imm:$CRn, c_imm:$CRm, imm0_7:$opc2), - [(int_arm_mcr2 imm:$cop, imm:$opc1, GPR:$Rt, imm:$CRn, - imm:$CRm, imm:$opc2)]> { + [(int_arm_mcr2 timm:$cop, timm:$opc1, GPR:$Rt, timm:$CRn, + timm:$CRm, timm:$opc2)]> { let Predicates = [IsThumb2, PreV8]; } def : t2InstAlias<"mcr2${p} $cop, $opc1, $Rt, $CRn, $CRm", @@ -4402,24 +4420,24 @@ def : t2InstAlias<"mrc2${p} $cop, $opc1, $Rt, $CRn, $CRm", (t2MRC2 GPRwithAPSR:$Rt, p_imm:$cop, imm0_7:$opc1, c_imm:$CRn, c_imm:$CRm, 0, pred:$p)>; -def : T2v6Pat<(int_arm_mrc imm:$cop, imm:$opc1, imm:$CRn, imm:$CRm, imm:$opc2), - (t2MRC imm:$cop, imm:$opc1, imm:$CRn, imm:$CRm, imm:$opc2)>; +def : T2v6Pat<(int_arm_mrc timm:$cop, timm:$opc1, timm:$CRn, timm:$CRm, timm:$opc2), + (t2MRC p_imm:$cop, imm0_7:$opc1, c_imm:$CRn, c_imm:$CRm, imm0_7:$opc2)>; -def : T2v6Pat<(int_arm_mrc2 imm:$cop, imm:$opc1, imm:$CRn, imm:$CRm, imm:$opc2), - (t2MRC2 imm:$cop, imm:$opc1, imm:$CRn, imm:$CRm, imm:$opc2)>; +def : T2v6Pat<(int_arm_mrc2 timm:$cop, timm:$opc1, timm:$CRn, timm:$CRm, timm:$opc2), + (t2MRC2 p_imm:$cop, imm0_7:$opc1, c_imm:$CRn, c_imm:$CRm, imm0_7:$opc2)>; /* from ARM core register to coprocessor */ def t2MCRR : t2MovRRCopro<0b1110, "mcrr", 0, (outs), (ins p_imm:$cop, imm0_15:$opc1, GPR:$Rt, GPR:$Rt2, c_imm:$CRm), - [(int_arm_mcrr imm:$cop, imm:$opc1, GPR:$Rt, GPR:$Rt2, - imm:$CRm)]>; + [(int_arm_mcrr timm:$cop, timm:$opc1, GPR:$Rt, GPR:$Rt2, + timm:$CRm)]>; def t2MCRR2 : t2MovRRCopro<0b1111, "mcrr2", 0, (outs), (ins p_imm:$cop, imm0_15:$opc1, GPR:$Rt, GPR:$Rt2, c_imm:$CRm), - [(int_arm_mcrr2 imm:$cop, imm:$opc1, GPR:$Rt, - GPR:$Rt2, imm:$CRm)]> { + [(int_arm_mcrr2 timm:$cop, timm:$opc1, GPR:$Rt, + GPR:$Rt2, timm:$CRm)]> { let Predicates = [IsThumb2, PreV8]; } @@ -4439,8 +4457,8 @@ def t2MRRC2 : t2MovRRCopro<0b1111, "mrrc2", 1, (outs GPR:$Rt, GPR:$Rt2), def t2CDP : T2Cop<0b1110, (outs), (ins p_imm:$cop, imm0_15:$opc1, c_imm:$CRd, c_imm:$CRn, c_imm:$CRm, imm0_7:$opc2), "cdp", "\t$cop, $opc1, $CRd, $CRn, $CRm, $opc2", - [(int_arm_cdp imm:$cop, imm:$opc1, imm:$CRd, imm:$CRn, - imm:$CRm, imm:$opc2)]> { + [(int_arm_cdp timm:$cop, timm:$opc1, timm:$CRd, timm:$CRn, + timm:$CRm, timm:$opc2)]> { let Inst{27-24} = 0b1110; bits<4> opc1; @@ 
-4465,8 +4483,8 @@ def t2CDP : T2Cop<0b1110, (outs), (ins p_imm:$cop, imm0_15:$opc1, def t2CDP2 : T2Cop<0b1111, (outs), (ins p_imm:$cop, imm0_15:$opc1, c_imm:$CRd, c_imm:$CRn, c_imm:$CRm, imm0_7:$opc2), "cdp2", "\t$cop, $opc1, $CRd, $CRn, $CRm, $opc2", - [(int_arm_cdp2 imm:$cop, imm:$opc1, imm:$CRd, imm:$CRn, - imm:$CRm, imm:$opc2)]> { + [(int_arm_cdp2 timm:$cop, timm:$opc1, timm:$CRd, timm:$CRn, + timm:$CRm, timm:$opc2)]> { let Inst{27-24} = 0b1110; bits<4> opc1; @@ -5087,6 +5105,7 @@ def t2BF_LabelPseudo : t2PseudoInst<(outs ), (ins pclabel:$cp), 0, NoItinerary, []> { let isTerminator = 1; let Predicates = [IsThumb2, HasV8_1MMainline, HasLOB]; + let hasNoSchedulingInfo = 1; } def t2BFi : t2BF<(ins bflabel_u4:$b_label, bflabel_s16:$label, pred:$p), @@ -5217,11 +5236,13 @@ def t2LoopDec : t2PseudoInst<(outs GPRlr:$Rm), (ins GPRlr:$Rn, imm0_7:$size), 4, IIC_Br, []>, Sched<[WriteBr]>; -let isBranch = 1, isTerminator = 1, hasSideEffects = 1 in { +let isBranch = 1, isTerminator = 1, hasSideEffects = 1, Defs = [CPSR] in { +// Set WhileLoopStart and LoopEnd to occupy 8 bytes because they may +// get converted into t2CMP and t2Bcc. def t2WhileLoopStart : t2PseudoInst<(outs), (ins rGPR:$elts, brtarget:$target), - 4, IIC_Br, []>, + 8, IIC_Br, []>, Sched<[WriteBr]>; def t2LoopEnd : @@ -5233,7 +5254,7 @@ def t2LoopEnd : } // end isNotDuplicable class CS opcode, list pattern=[]> - : V8_1MI<(outs rGPR:$Rd), (ins GPRwithZR:$Rn, GPRwithZRnosp:$Rm, pred_noal:$fcond), + : V8_1MI<(outs rGPR:$Rd), (ins GPRwithZRnosp:$Rn, GPRwithZRnosp:$Rm, pred_noal:$fcond), AddrModeNone, NoItinerary, iname, "$Rd, $Rn, $Rm, $fcond", "", pattern> { bits<4> Rd; bits<4> Rm; @@ -5255,6 +5276,25 @@ def t2CSINC : CS<"csinc", 0b1001>; def t2CSINV : CS<"csinv", 0b1010>; def t2CSNEG : CS<"csneg", 0b1011>; +let Predicates = [HasV8_1MMainline] in { + def : T2Pat<(ARMcsinc GPRwithZR:$tval, GPRwithZR:$fval, imm0_31:$imm), + (t2CSINC GPRwithZR:$tval, GPRwithZR:$fval, imm0_31:$imm)>; + def : T2Pat<(ARMcsinv GPRwithZR:$tval, GPRwithZR:$fval, imm0_31:$imm), + (t2CSINV GPRwithZR:$tval, GPRwithZR:$fval, imm0_31:$imm)>; + def : T2Pat<(ARMcsneg GPRwithZR:$tval, GPRwithZR:$fval, imm0_31:$imm), + (t2CSNEG GPRwithZR:$tval, GPRwithZR:$fval, imm0_31:$imm)>; + + multiclass ModifiedV8_1CSEL { + def : T2Pat<(ARMcmov modvalue, GPRwithZR:$tval, cmovpred:$imm), + (Insn GPRwithZR:$tval, GPRwithZR:$fval, imm0_31:$imm)>; + def : T2Pat<(ARMcmov GPRwithZR:$tval, modvalue, cmovpred:$imm), + (Insn GPRwithZR:$tval, GPRwithZR:$fval, + (i32 (inv_cond_XFORM imm:$imm)))>; + } + defm : ModifiedV8_1CSEL; + defm : ModifiedV8_1CSEL; + defm : ModifiedV8_1CSEL; +} // CS aliases. 
let Predicates = [HasV8_1MMainline] in { diff --git a/lib/Target/ARM/ARMInstrVFP.td b/lib/Target/ARM/ARMInstrVFP.td index a0dd25de07ee..fdd961bfbb2f 100644 --- a/lib/Target/ARM/ARMInstrVFP.td +++ b/lib/Target/ARM/ARMInstrVFP.td @@ -10,7 +10,7 @@ // //===----------------------------------------------------------------------===// -def SDT_CMPFP0 : SDTypeProfile<0, 2, [SDTCisFP<0>, SDTCisVT<1, i32>]>; +def SDT_CMPFP0 : SDTypeProfile<0, 1, [SDTCisFP<0>]>; def SDT_VMOVDRR : SDTypeProfile<1, 2, [SDTCisVT<0, f64>, SDTCisVT<1, i32>, SDTCisSameAs<1, 2>]>; def SDT_VMOVRRD : SDTypeProfile<2, 1, [SDTCisVT<0, i32>, SDTCisSameAs<0, 1>, @@ -19,7 +19,7 @@ def SDT_VMOVRRD : SDTypeProfile<2, 1, [SDTCisVT<0, i32>, SDTCisSameAs<0, 1>, def SDT_VMOVSR : SDTypeProfile<1, 1, [SDTCisVT<0, f32>, SDTCisVT<1, i32>]>; def arm_fmstat : SDNode<"ARMISD::FMSTAT", SDTNone, [SDNPInGlue, SDNPOutGlue]>; -def arm_cmpfp : SDNode<"ARMISD::CMPFP", SDT_ARMFCmp, [SDNPOutGlue]>; +def arm_cmpfp : SDNode<"ARMISD::CMPFP", SDT_ARMCmp, [SDNPOutGlue]>; def arm_cmpfp0 : SDNode<"ARMISD::CMPFPw0", SDT_CMPFP0, [SDNPOutGlue]>; def arm_fmdrr : SDNode<"ARMISD::VMOVDRR", SDT_VMOVDRR>; def arm_fmrrd : SDNode<"ARMISD::VMOVRRD", SDT_VMOVRRD>; @@ -324,7 +324,7 @@ defm : VFPDTAnyInstAlias<"vpop${p}", "$r", // However, there is no UAL syntax for them, so we keep them around for // (dis)assembly only. multiclass vfp_ldstx_mult { - let Predicates = [HasFPRegs] in { + let Predicates = [HasFPRegs], hasNoSchedulingInfo = 1 in { // Unknown precision def XIA : AXXI4<(outs), (ins GPR:$Rn, pred:$p, dpr_reglist:$regs, variable_ops), @@ -548,12 +548,12 @@ let Defs = [FPSCR_NZCV] in { def VCMPED : ADuI<0b11101, 0b11, 0b0100, 0b11, 0, (outs), (ins DPR:$Dd, DPR:$Dm), IIC_fpCMP64, "vcmpe", ".f64\t$Dd, $Dm", - [(arm_cmpfp DPR:$Dd, (f64 DPR:$Dm), (i32 1))]>; + [/* For disassembly only; pattern left blank */]>; def VCMPES : ASuI<0b11101, 0b11, 0b0100, 0b11, 0, (outs), (ins SPR:$Sd, SPR:$Sm), IIC_fpCMP32, "vcmpe", ".f32\t$Sd, $Sm", - [(arm_cmpfp SPR:$Sd, SPR:$Sm, (i32 1))]> { + [/* For disassembly only; pattern left blank */]> { // Some single precision VFP instructions may be executed on both NEON and // VFP pipelines on A8. let D = VFPNeonA8Domain; @@ -562,17 +562,17 @@ def VCMPES : ASuI<0b11101, 0b11, 0b0100, 0b11, 0, def VCMPEH : AHuI<0b11101, 0b11, 0b0100, 0b11, 0, (outs), (ins HPR:$Sd, HPR:$Sm), IIC_fpCMP16, "vcmpe", ".f16\t$Sd, $Sm", - [(arm_cmpfp HPR:$Sd, HPR:$Sm, (i32 1))]>; + [/* For disassembly only; pattern left blank */]>; def VCMPD : ADuI<0b11101, 0b11, 0b0100, 0b01, 0, (outs), (ins DPR:$Dd, DPR:$Dm), IIC_fpCMP64, "vcmp", ".f64\t$Dd, $Dm", - [(arm_cmpfp DPR:$Dd, (f64 DPR:$Dm), (i32 0))]>; + [(arm_cmpfp DPR:$Dd, (f64 DPR:$Dm))]>; def VCMPS : ASuI<0b11101, 0b11, 0b0100, 0b01, 0, (outs), (ins SPR:$Sd, SPR:$Sm), IIC_fpCMP32, "vcmp", ".f32\t$Sd, $Sm", - [(arm_cmpfp SPR:$Sd, SPR:$Sm, (i32 0))]> { + [(arm_cmpfp SPR:$Sd, SPR:$Sm)]> { // Some single precision VFP instructions may be executed on both NEON and // VFP pipelines on A8. 
let D = VFPNeonA8Domain; @@ -581,7 +581,7 @@ def VCMPS : ASuI<0b11101, 0b11, 0b0100, 0b01, 0, def VCMPH : AHuI<0b11101, 0b11, 0b0100, 0b01, 0, (outs), (ins HPR:$Sd, HPR:$Sm), IIC_fpCMP16, "vcmp", ".f16\t$Sd, $Sm", - [(arm_cmpfp HPR:$Sd, HPR:$Sm, (i32 0))]>; + [(arm_cmpfp HPR:$Sd, HPR:$Sm)]>; } // Defs = [FPSCR_NZCV] //===----------------------------------------------------------------------===// @@ -611,7 +611,7 @@ let Defs = [FPSCR_NZCV] in { def VCMPEZD : ADuI<0b11101, 0b11, 0b0101, 0b11, 0, (outs), (ins DPR:$Dd), IIC_fpCMP64, "vcmpe", ".f64\t$Dd, #0", - [(arm_cmpfp0 (f64 DPR:$Dd), (i32 1))]> { + [/* For disassembly only; pattern left blank */]> { let Inst{3-0} = 0b0000; let Inst{5} = 0; } @@ -619,7 +619,7 @@ def VCMPEZD : ADuI<0b11101, 0b11, 0b0101, 0b11, 0, def VCMPEZS : ASuI<0b11101, 0b11, 0b0101, 0b11, 0, (outs), (ins SPR:$Sd), IIC_fpCMP32, "vcmpe", ".f32\t$Sd, #0", - [(arm_cmpfp0 SPR:$Sd, (i32 1))]> { + [/* For disassembly only; pattern left blank */]> { let Inst{3-0} = 0b0000; let Inst{5} = 0; @@ -631,7 +631,7 @@ def VCMPEZS : ASuI<0b11101, 0b11, 0b0101, 0b11, 0, def VCMPEZH : AHuI<0b11101, 0b11, 0b0101, 0b11, 0, (outs), (ins HPR:$Sd), IIC_fpCMP16, "vcmpe", ".f16\t$Sd, #0", - [(arm_cmpfp0 HPR:$Sd, (i32 1))]> { + [/* For disassembly only; pattern left blank */]> { let Inst{3-0} = 0b0000; let Inst{5} = 0; } @@ -639,7 +639,7 @@ def VCMPEZH : AHuI<0b11101, 0b11, 0b0101, 0b11, 0, def VCMPZD : ADuI<0b11101, 0b11, 0b0101, 0b01, 0, (outs), (ins DPR:$Dd), IIC_fpCMP64, "vcmp", ".f64\t$Dd, #0", - [(arm_cmpfp0 (f64 DPR:$Dd), (i32 0))]> { + [(arm_cmpfp0 (f64 DPR:$Dd))]> { let Inst{3-0} = 0b0000; let Inst{5} = 0; } @@ -647,7 +647,7 @@ def VCMPZD : ADuI<0b11101, 0b11, 0b0101, 0b01, 0, def VCMPZS : ASuI<0b11101, 0b11, 0b0101, 0b01, 0, (outs), (ins SPR:$Sd), IIC_fpCMP32, "vcmp", ".f32\t$Sd, #0", - [(arm_cmpfp0 SPR:$Sd, (i32 0))]> { + [(arm_cmpfp0 SPR:$Sd)]> { let Inst{3-0} = 0b0000; let Inst{5} = 0; @@ -659,7 +659,7 @@ def VCMPZS : ASuI<0b11101, 0b11, 0b0101, 0b01, 0, def VCMPZH : AHuI<0b11101, 0b11, 0b0101, 0b01, 0, (outs), (ins HPR:$Sd), IIC_fpCMP16, "vcmp", ".f16\t$Sd, #0", - [(arm_cmpfp0 HPR:$Sd, (i32 0))]> { + [(arm_cmpfp0 HPR:$Sd)]> { let Inst{3-0} = 0b0000; let Inst{5} = 0; } @@ -1732,7 +1732,8 @@ def VTOSHS : AVConv1XInsS_Encode<0b11101, 0b11, 0b1110, 0b1010, 0, def VTOUHS : AVConv1XInsS_Encode<0b11101, 0b11, 0b1111, 0b1010, 0, (outs SPR:$dst), (ins SPR:$a, fbits16:$fbits), - IIC_fpCVTSI, "vcvt", ".u16.f32\t$dst, $a, $fbits", []> { + IIC_fpCVTSI, "vcvt", ".u16.f32\t$dst, $a, $fbits", []>, + Sched<[WriteFPCVT]> { // Some single precision VFP instructions may be executed on both NEON and // VFP pipelines on A8. let D = VFPNeonA8Domain; @@ -1740,7 +1741,8 @@ def VTOUHS : AVConv1XInsS_Encode<0b11101, 0b11, 0b1111, 0b1010, 0, def VTOSLS : AVConv1XInsS_Encode<0b11101, 0b11, 0b1110, 0b1010, 1, (outs SPR:$dst), (ins SPR:$a, fbits32:$fbits), - IIC_fpCVTSI, "vcvt", ".s32.f32\t$dst, $a, $fbits", []> { + IIC_fpCVTSI, "vcvt", ".s32.f32\t$dst, $a, $fbits", []>, + Sched<[WriteFPCVT]> { // Some single precision VFP instructions may be executed on both NEON and // VFP pipelines on A8. 
let D = VFPNeonA8Domain; @@ -1748,7 +1750,8 @@ def VTOSLS : AVConv1XInsS_Encode<0b11101, 0b11, 0b1110, 0b1010, 1, def VTOULS : AVConv1XInsS_Encode<0b11101, 0b11, 0b1111, 0b1010, 1, (outs SPR:$dst), (ins SPR:$a, fbits32:$fbits), - IIC_fpCVTSI, "vcvt", ".u32.f32\t$dst, $a, $fbits", []> { + IIC_fpCVTSI, "vcvt", ".u32.f32\t$dst, $a, $fbits", []>, + Sched<[WriteFPCVT]> { // Some single precision VFP instructions may be executed on both NEON and // VFP pipelines on A8. let D = VFPNeonA8Domain; @@ -2297,6 +2300,8 @@ class MovFromVFP opc19_16, dag oops, dag iops, string opc, string asm, let Inst{6-5} = 0b00; let Inst{4} = 1; let Inst{3-0} = 0b0000; + let Unpredictable{7-5} = 0b111; + let Unpredictable{3-0} = 0b1111; } let DecoderMethod = "DecodeForVMRSandVMSR" in { @@ -2370,63 +2375,65 @@ class MovToVFP opc19_16, dag oops, dag iops, string opc, string asm, VFPAI { // Instruction operand. - bits<4> src; - - // Encode instruction operand. - let Inst{15-12} = src; + bits<4> Rt; let Inst{27-20} = 0b11101110; let Inst{19-16} = opc19_16; + let Inst{15-12} = Rt; let Inst{11-8} = 0b1010; let Inst{7} = 0; + let Inst{6-5} = 0b00; let Inst{4} = 1; + let Inst{3-0} = 0b0000; let Predicates = [HasVFP2]; + let Unpredictable{7-5} = 0b111; + let Unpredictable{3-0} = 0b1111; } let DecoderMethod = "DecodeForVMRSandVMSR" in { let Defs = [FPSCR] in { let Predicates = [HasFPRegs] in // Application level GPR -> FPSCR - def VMSR : MovToVFP<0b0001 /* fpscr */, (outs), (ins GPRnopc:$src), - "vmsr", "\tfpscr, $src", - [(int_arm_set_fpscr GPRnopc:$src)]>; + def VMSR : MovToVFP<0b0001 /* fpscr */, (outs), (ins GPRnopc:$Rt), + "vmsr", "\tfpscr, $Rt", + [(int_arm_set_fpscr GPRnopc:$Rt)]>; // System level GPR -> FPEXC - def VMSR_FPEXC : MovToVFP<0b1000 /* fpexc */, (outs), (ins GPRnopc:$src), - "vmsr", "\tfpexc, $src", []>; + def VMSR_FPEXC : MovToVFP<0b1000 /* fpexc */, (outs), (ins GPRnopc:$Rt), + "vmsr", "\tfpexc, $Rt", []>; // System level GPR -> FPSID - def VMSR_FPSID : MovToVFP<0b0000 /* fpsid */, (outs), (ins GPRnopc:$src), - "vmsr", "\tfpsid, $src", []>; - def VMSR_FPINST : MovToVFP<0b1001 /* fpinst */, (outs), (ins GPRnopc:$src), - "vmsr", "\tfpinst, $src", []>; - def VMSR_FPINST2 : MovToVFP<0b1010 /* fpinst2 */, (outs), (ins GPRnopc:$src), - "vmsr", "\tfpinst2, $src", []>; + def VMSR_FPSID : MovToVFP<0b0000 /* fpsid */, (outs), (ins GPRnopc:$Rt), + "vmsr", "\tfpsid, $Rt", []>; + def VMSR_FPINST : MovToVFP<0b1001 /* fpinst */, (outs), (ins GPRnopc:$Rt), + "vmsr", "\tfpinst, $Rt", []>; + def VMSR_FPINST2 : MovToVFP<0b1010 /* fpinst2 */, (outs), (ins GPRnopc:$Rt), + "vmsr", "\tfpinst2, $Rt", []>; } let Predicates = [HasV8_1MMainline, Has8MSecExt] in { // System level GPR -> FPSCR with context saving for security extensions - def VMSR_FPCXTNS : MovToVFP<0b1110 /* fpcxtns */, (outs), (ins GPR:$src), - "vmsr", "\tfpcxtns, $src", []>; + def VMSR_FPCXTNS : MovToVFP<0b1110 /* fpcxtns */, (outs), (ins GPR:$Rt), + "vmsr", "\tfpcxtns, $Rt", []>; } let Predicates = [HasV8_1MMainline, Has8MSecExt] in { // System level GPR -> FPSCR with context saving for security extensions - def VMSR_FPCXTS : MovToVFP<0b1111 /* fpcxts */, (outs), (ins GPR:$src), - "vmsr", "\tfpcxts, $src", []>; + def VMSR_FPCXTS : MovToVFP<0b1111 /* fpcxts */, (outs), (ins GPR:$Rt), + "vmsr", "\tfpcxts, $Rt", []>; } let Predicates = [HasV8_1MMainline, HasFPRegs] in { // System level GPR -> FPSCR_NZCVQC def VMSR_FPSCR_NZCVQC : MovToVFP<0b0010 /* fpscr_nzcvqc */, - (outs cl_FPSCR_NZCV:$fpscr_out), (ins GPR:$src), - "vmsr", "\tfpscr_nzcvqc, $src", []>; + (outs 
cl_FPSCR_NZCV:$fpscr_out), (ins GPR:$Rt), + "vmsr", "\tfpscr_nzcvqc, $Rt", []>; } let Predicates = [HasV8_1MMainline, HasMVEInt] in { // System level GPR -> VPR/P0 let Defs = [VPR] in - def VMSR_VPR : MovToVFP<0b1100 /* vpr */, (outs), (ins GPR:$src), - "vmsr", "\tvpr, $src", []>; + def VMSR_VPR : MovToVFP<0b1100 /* vpr */, (outs), (ins GPR:$Rt), + "vmsr", "\tvpr, $Rt", []>; - def VMSR_P0 : MovToVFP<0b1101 /* p0 */, (outs VCCR:$cond), (ins GPR:$src), - "vmsr", "\tp0, $src", []>; + def VMSR_P0 : MovToVFP<0b1101 /* p0 */, (outs VCCR:$cond), (ins GPR:$Rt), + "vmsr", "\tp0, $Rt", []>; } } @@ -2614,7 +2621,8 @@ def VSCCLRMD : VFPXI<(outs), (ins pred:$p, fp_dreglist_with_vpr:$regs, variable_ let Inst{21-16} = 0b011111; let Inst{15-12} = regs{11-8}; let Inst{11-8} = 0b1011; - let Inst{7-0} = regs{7-0}; + let Inst{7-1} = regs{7-1}; + let Inst{0} = 0; let DecoderMethod = "DecodeVSCCLRM"; diff --git a/lib/Target/ARM/ARMInstructionSelector.cpp b/lib/Target/ARM/ARMInstructionSelector.cpp index 4485a474a6df..8e5e474c0f59 100644 --- a/lib/Target/ARM/ARMInstructionSelector.cpp +++ b/lib/Target/ARM/ARMInstructionSelector.cpp @@ -34,7 +34,7 @@ public: ARMInstructionSelector(const ARMBaseTargetMachine &TM, const ARMSubtarget &STI, const ARMRegisterBankInfo &RBI); - bool select(MachineInstr &I, CodeGenCoverage &CoverageInfo) const override; + bool select(MachineInstr &I) override; static const char *getName() { return DEBUG_TYPE; } private: @@ -210,8 +210,8 @@ static const TargetRegisterClass *guessRegClass(unsigned Reg, static bool selectCopy(MachineInstr &I, const TargetInstrInfo &TII, MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI, const RegisterBankInfo &RBI) { - unsigned DstReg = I.getOperand(0).getReg(); - if (TargetRegisterInfo::isPhysicalRegister(DstReg)) + Register DstReg = I.getOperand(0).getReg(); + if (Register::isPhysicalRegister(DstReg)) return true; const TargetRegisterClass *RC = guessRegClass(DstReg, MRI, TRI, RBI); @@ -236,17 +236,17 @@ static bool selectMergeValues(MachineInstrBuilder &MIB, // We only support G_MERGE_VALUES as a way to stick together two scalar GPRs // into one DPR. - unsigned VReg0 = MIB->getOperand(0).getReg(); + Register VReg0 = MIB->getOperand(0).getReg(); (void)VReg0; assert(MRI.getType(VReg0).getSizeInBits() == 64 && RBI.getRegBank(VReg0, MRI, TRI)->getID() == ARM::FPRRegBankID && "Unsupported operand for G_MERGE_VALUES"); - unsigned VReg1 = MIB->getOperand(1).getReg(); + Register VReg1 = MIB->getOperand(1).getReg(); (void)VReg1; assert(MRI.getType(VReg1).getSizeInBits() == 32 && RBI.getRegBank(VReg1, MRI, TRI)->getID() == ARM::GPRRegBankID && "Unsupported operand for G_MERGE_VALUES"); - unsigned VReg2 = MIB->getOperand(2).getReg(); + Register VReg2 = MIB->getOperand(2).getReg(); (void)VReg2; assert(MRI.getType(VReg2).getSizeInBits() == 32 && RBI.getRegBank(VReg2, MRI, TRI)->getID() == ARM::GPRRegBankID && @@ -268,17 +268,17 @@ static bool selectUnmergeValues(MachineInstrBuilder &MIB, // We only support G_UNMERGE_VALUES as a way to break up one DPR into two // GPRs. 
- unsigned VReg0 = MIB->getOperand(0).getReg(); + Register VReg0 = MIB->getOperand(0).getReg(); (void)VReg0; assert(MRI.getType(VReg0).getSizeInBits() == 32 && RBI.getRegBank(VReg0, MRI, TRI)->getID() == ARM::GPRRegBankID && "Unsupported operand for G_UNMERGE_VALUES"); - unsigned VReg1 = MIB->getOperand(1).getReg(); + Register VReg1 = MIB->getOperand(1).getReg(); (void)VReg1; assert(MRI.getType(VReg1).getSizeInBits() == 32 && RBI.getRegBank(VReg1, MRI, TRI)->getID() == ARM::GPRRegBankID && "Unsupported operand for G_UNMERGE_VALUES"); - unsigned VReg2 = MIB->getOperand(2).getReg(); + Register VReg2 = MIB->getOperand(2).getReg(); (void)VReg2; assert(MRI.getType(VReg2).getSizeInBits() == 64 && RBI.getRegBank(VReg2, MRI, TRI)->getID() == ARM::FPRRegBankID && @@ -833,8 +833,7 @@ void ARMInstructionSelector::renderVFPF64Imm( NewInstBuilder.addImm(FPImmEncoding); } -bool ARMInstructionSelector::select(MachineInstr &I, - CodeGenCoverage &CoverageInfo) const { +bool ARMInstructionSelector::select(MachineInstr &I) { assert(I.getParent() && "Instruction should be in a basic block!"); assert(I.getParent()->getParent() && "Instruction should be in a function!"); @@ -851,7 +850,7 @@ bool ARMInstructionSelector::select(MachineInstr &I, using namespace TargetOpcode; - if (selectImpl(I, CoverageInfo)) + if (selectImpl(I, *CoverageInfo)) return true; MachineInstrBuilder MIB{MF, I}; @@ -874,10 +873,10 @@ bool ARMInstructionSelector::select(MachineInstr &I, MIB.addImm(1).add(predOps(ARMCC::AL)).add(condCodeOp()); if (isSExt) { - unsigned SExtResult = I.getOperand(0).getReg(); + Register SExtResult = I.getOperand(0).getReg(); // Use a new virtual register for the result of the AND - unsigned AndResult = MRI.createVirtualRegister(&ARM::GPRRegClass); + Register AndResult = MRI.createVirtualRegister(&ARM::GPRRegClass); I.getOperand(0).setReg(AndResult); auto InsertBefore = std::next(I.getIterator()); @@ -928,7 +927,7 @@ bool ARMInstructionSelector::select(MachineInstr &I, assert(MRI.getType(SrcReg).getSizeInBits() == 64 && "Unsupported size"); assert(MRI.getType(DstReg).getSizeInBits() <= 32 && "Unsupported size"); - unsigned IgnoredBits = MRI.createVirtualRegister(&ARM::GPRRegClass); + Register IgnoredBits = MRI.createVirtualRegister(&ARM::GPRRegClass); auto InsertBefore = std::next(I.getIterator()); auto MovI = BuildMI(MBB, InsertBefore, I.getDebugLoc(), TII.get(ARM::VMOVRRD)) @@ -1039,7 +1038,7 @@ bool ARMInstructionSelector::select(MachineInstr &I, case G_FCMP: { assert(STI.hasVFP2Base() && "Can't select fcmp without VFP"); - unsigned OpReg = I.getOperand(2).getReg(); + Register OpReg = I.getOperand(2).getReg(); unsigned Size = MRI.getType(OpReg).getSizeInBits(); if (Size == 64 && !STI.hasFP64()) { @@ -1077,12 +1076,12 @@ bool ARMInstructionSelector::select(MachineInstr &I, case G_STORE: case G_LOAD: { const auto &MemOp = **I.memoperands_begin(); - if (MemOp.getOrdering() != AtomicOrdering::NotAtomic) { + if (MemOp.isAtomic()) { LLVM_DEBUG(dbgs() << "Atomic load/store not supported yet\n"); return false; } - unsigned Reg = I.getOperand(0).getReg(); + Register Reg = I.getOperand(0).getReg(); unsigned RegBank = RBI.getRegBank(Reg, MRI, TRI)->getID(); LLT ValTy = MRI.getType(Reg); @@ -1097,9 +1096,9 @@ bool ARMInstructionSelector::select(MachineInstr &I, if (ValSize == 1 && NewOpc == Opcodes.STORE8) { // Before storing a 1-bit value, make sure to clear out any unneeded bits. 
- unsigned OriginalValue = I.getOperand(0).getReg(); + Register OriginalValue = I.getOperand(0).getReg(); - unsigned ValueToStore = MRI.createVirtualRegister(&ARM::GPRRegClass); + Register ValueToStore = MRI.createVirtualRegister(&ARM::GPRRegClass); I.getOperand(0).setReg(ValueToStore); auto InsertBefore = I.getIterator(); @@ -1159,7 +1158,7 @@ bool ARMInstructionSelector::select(MachineInstr &I, case G_PHI: { I.setDesc(TII.get(PHI)); - unsigned DstReg = I.getOperand(0).getReg(); + Register DstReg = I.getOperand(0).getReg(); const TargetRegisterClass *RC = guessRegClass(DstReg, MRI, TRI, RBI); if (!RBI.constrainGenericRegister(DstReg, *RC, MRI)) { break; diff --git a/lib/Target/ARM/ARMLegalizerInfo.cpp b/lib/Target/ARM/ARMLegalizerInfo.cpp index 73a57b297ad6..81414e6d76fe 100644 --- a/lib/Target/ARM/ARMLegalizerInfo.cpp +++ b/lib/Target/ARM/ARMLegalizerInfo.cpp @@ -84,6 +84,8 @@ ARMLegalizerInfo::ARMLegalizerInfo(const ARMSubtarget &ST) { getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT}) .legalForCartesianProduct({s8, s16, s32}, {s1, s8, s16}); + getActionDefinitionsBuilder(G_SEXT_INREG).lower(); + getActionDefinitionsBuilder({G_MUL, G_AND, G_OR, G_XOR}) .legalFor({s32}) .minScalar(0, s32); diff --git a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp index 90a1ce238c3f..4a193fed04a3 100644 --- a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp +++ b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp @@ -509,7 +509,7 @@ void ARMLoadStoreOpt::UpdateBaseRegUses(MachineBasicBlock &MBB, Offset = MO.getImm() - WordOffset * getImmScale(Opc); // If storing the base register, it needs to be reset first. - unsigned InstrSrcReg = getLoadStoreRegOp(*MBBI).getReg(); + Register InstrSrcReg = getLoadStoreRegOp(*MBBI).getReg(); if (Offset >= 0 && !(IsStore && InstrSrcReg == Base)) MO.setImm(Offset); @@ -859,7 +859,7 @@ MachineInstr *ARMLoadStoreOpt::MergeOpsUpdate(const MergeCandidate &Cand) { // Determine list of registers and list of implicit super-register defs. for (const MachineInstr *MI : Cand.Instrs) { const MachineOperand &MO = getLoadStoreRegOp(*MI); - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); bool IsKill = MO.isKill(); if (IsKill) KilledRegs.insert(Reg); @@ -874,7 +874,7 @@ MachineInstr *ARMLoadStoreOpt::MergeOpsUpdate(const MergeCandidate &Cand) { if (!MO.isReg() || !MO.isDef() || MO.isDead()) continue; assert(MO.isImplicit()); - unsigned DefReg = MO.getReg(); + Register DefReg = MO.getReg(); if (is_contained(ImpDefs, DefReg)) continue; @@ -893,7 +893,7 @@ MachineInstr *ARMLoadStoreOpt::MergeOpsUpdate(const MergeCandidate &Cand) { iterator InsertBefore = std::next(iterator(LatestMI)); MachineBasicBlock &MBB = *LatestMI->getParent(); unsigned Offset = getMemoryOpOffset(*First); - unsigned Base = getLoadStoreBaseOp(*First).getReg(); + Register Base = getLoadStoreBaseOp(*First).getReg(); bool BaseKill = LatestMI->killsRegister(Base); unsigned PredReg = 0; ARMCC::CondCodes Pred = getInstrPredicate(*First, PredReg); @@ -1005,7 +1005,7 @@ void ARMLoadStoreOpt::FormCandidates(const MemOpQueue &MemOps) { const MachineInstr *MI = MemOps[SIndex].MI; int Offset = MemOps[SIndex].Offset; const MachineOperand &PMO = getLoadStoreRegOp(*MI); - unsigned PReg = PMO.getReg(); + Register PReg = PMO.getReg(); unsigned PRegNum = PMO.isUndef() ? 
std::numeric_limits::max() : TRI->getEncodingValue(PReg); unsigned Latest = SIndex; @@ -1052,7 +1052,7 @@ void ARMLoadStoreOpt::FormCandidates(const MemOpQueue &MemOps) { if (NewOffset != Offset + (int)Size) break; const MachineOperand &MO = getLoadStoreRegOp(*MemOps[I].MI); - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); if (Reg == ARM::SP || Reg == ARM::PC) break; if (Count == Limit) @@ -1261,7 +1261,7 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLSMultiple(MachineInstr *MI) { if (isThumb1) return false; const MachineOperand &BaseOP = MI->getOperand(0); - unsigned Base = BaseOP.getReg(); + Register Base = BaseOP.getReg(); bool BaseKill = BaseOP.isKill(); unsigned PredReg = 0; ARMCC::CondCodes Pred = getInstrPredicate(*MI, PredReg); @@ -1387,7 +1387,7 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineInstr *MI) { // FIXME: Use LDM/STM with single register instead. if (isThumb1) return false; - unsigned Base = getLoadStoreBaseOp(*MI).getReg(); + Register Base = getLoadStoreBaseOp(*MI).getReg(); bool BaseKill = getLoadStoreBaseOp(*MI).isKill(); unsigned Opcode = MI->getOpcode(); DebugLoc DL = MI->getDebugLoc(); @@ -1512,7 +1512,7 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLSDouble(MachineInstr &MI) const { // Behaviour for writeback is undefined if base register is the same as one // of the others. const MachineOperand &BaseOp = MI.getOperand(2); - unsigned Base = BaseOp.getReg(); + Register Base = BaseOp.getReg(); const MachineOperand &Reg0Op = MI.getOperand(0); const MachineOperand &Reg1Op = MI.getOperand(1); if (Reg0Op.getReg() == Base || Reg1Op.getReg() == Base) @@ -1655,9 +1655,9 @@ bool ARMLoadStoreOpt::FixInvalidRegPairOp(MachineBasicBlock &MBB, return false; const MachineOperand &BaseOp = MI->getOperand(2); - unsigned BaseReg = BaseOp.getReg(); - unsigned EvenReg = MI->getOperand(0).getReg(); - unsigned OddReg = MI->getOperand(1).getReg(); + Register BaseReg = BaseOp.getReg(); + Register EvenReg = MI->getOperand(0).getReg(); + Register OddReg = MI->getOperand(1).getReg(); unsigned EvenRegNum = TRI->getDwarfRegNum(EvenReg, false); unsigned OddRegNum = TRI->getDwarfRegNum(OddReg, false); @@ -1783,8 +1783,8 @@ bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) { if (isMemoryOp(*MBBI)) { unsigned Opcode = MBBI->getOpcode(); const MachineOperand &MO = MBBI->getOperand(0); - unsigned Reg = MO.getReg(); - unsigned Base = getLoadStoreBaseOp(*MBBI).getReg(); + Register Reg = MO.getReg(); + Register Base = getLoadStoreBaseOp(*MBBI).getReg(); unsigned PredReg = 0; ARMCC::CondCodes Pred = getInstrPredicate(*MBBI, PredReg); int Offset = getMemoryOpOffset(*MBBI); @@ -2121,7 +2121,7 @@ static bool IsSafeAndProfitableToMove(bool isLd, unsigned Base, MachineOperand &MO = I->getOperand(j); if (!MO.isReg()) continue; - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); if (MO.isDef() && TRI->regsOverlap(Reg, Base)) return false; if (Reg != Base && !MemRegs.count(Reg)) @@ -2415,7 +2415,7 @@ ARMPreAllocLoadStoreOpt::RescheduleLoadStoreInstrs(MachineBasicBlock *MBB) { int Opc = MI.getOpcode(); bool isLd = isLoadSingle(Opc); - unsigned Base = MI.getOperand(1).getReg(); + Register Base = MI.getOperand(1).getReg(); int Offset = getMemoryOpOffset(MI); bool StopHere = false; auto FindBases = [&] (Base2InstMap &Base2Ops, BaseVec &Bases) { diff --git a/lib/Target/ARM/ARMLowOverheadLoops.cpp b/lib/Target/ARM/ARMLowOverheadLoops.cpp index cedf3bd3c74e..e1c5a9c3e223 100644 --- a/lib/Target/ARM/ARMLowOverheadLoops.cpp +++ b/lib/Target/ARM/ARMLowOverheadLoops.cpp @@ -11,8 +11,7 @@ 
/// The expectation is that the loop contains three pseudo instructions: /// - t2*LoopStart - placed in the preheader or pre-preheader. The do-loop /// form should be in the preheader, whereas the while form should be in the -/// preheaders only predecessor. TODO: Could DoLoopStart get moved into the -/// pre-preheader? +/// preheaders only predecessor. /// - t2LoopDec - placed within in the loop body. /// - t2LoopEnd - the loop latch terminator. /// @@ -35,6 +34,7 @@ using namespace llvm; namespace { class ARMLowOverheadLoops : public MachineFunctionPass { + MachineFunction *MF = nullptr; const ARMBaseInstrInfo *TII = nullptr; MachineRegisterInfo *MRI = nullptr; std::unique_ptr BBUtils = nullptr; @@ -52,17 +52,6 @@ namespace { bool runOnMachineFunction(MachineFunction &MF) override; - bool ProcessLoop(MachineLoop *ML); - - void RevertWhile(MachineInstr *MI) const; - - void RevertLoopDec(MachineInstr *MI) const; - - void RevertLoopEnd(MachineInstr *MI) const; - - void Expand(MachineLoop *ML, MachineInstr *Start, - MachineInstr *Dec, MachineInstr *End, bool Revert); - MachineFunctionProperties getRequiredProperties() const override { return MachineFunctionProperties().set( MachineFunctionProperties::Property::NoVRegs); @@ -71,36 +60,156 @@ namespace { StringRef getPassName() const override { return ARM_LOW_OVERHEAD_LOOPS_NAME; } + + private: + bool ProcessLoop(MachineLoop *ML); + + MachineInstr * IsSafeToDefineLR(MachineInstr *MI); + + bool RevertNonLoops(); + + void RevertWhile(MachineInstr *MI) const; + + bool RevertLoopDec(MachineInstr *MI, bool AllowFlags = false) const; + + void RevertLoopEnd(MachineInstr *MI, bool SkipCmp = false) const; + + void Expand(MachineLoop *ML, MachineInstr *Start, + MachineInstr *InsertPt, MachineInstr *Dec, + MachineInstr *End, bool Revert); + }; } - + char ARMLowOverheadLoops::ID = 0; INITIALIZE_PASS(ARMLowOverheadLoops, DEBUG_TYPE, ARM_LOW_OVERHEAD_LOOPS_NAME, false, false) -bool ARMLowOverheadLoops::runOnMachineFunction(MachineFunction &MF) { - if (!static_cast(MF.getSubtarget()).hasLOB()) +bool ARMLowOverheadLoops::runOnMachineFunction(MachineFunction &mf) { + const ARMSubtarget &ST = static_cast(mf.getSubtarget()); + if (!ST.hasLOB()) return false; - LLVM_DEBUG(dbgs() << "ARM Loops on " << MF.getName() << " ------------- \n"); + MF = &mf; + LLVM_DEBUG(dbgs() << "ARM Loops on " << MF->getName() << " ------------- \n"); auto &MLI = getAnalysis(); - MRI = &MF.getRegInfo(); - TII = static_cast( - MF.getSubtarget().getInstrInfo()); - BBUtils = std::unique_ptr(new ARMBasicBlockUtils(MF)); + MF->getProperties().set(MachineFunctionProperties::Property::TracksLiveness); + MRI = &MF->getRegInfo(); + TII = static_cast(ST.getInstrInfo()); + BBUtils = std::unique_ptr(new ARMBasicBlockUtils(*MF)); BBUtils->computeAllBlockSizes(); - BBUtils->adjustBBOffsetsAfter(&MF.front()); + BBUtils->adjustBBOffsetsAfter(&MF->front()); bool Changed = false; for (auto ML : MLI) { if (!ML->getParentLoop()) Changed |= ProcessLoop(ML); } + Changed |= RevertNonLoops(); return Changed; } +static bool IsLoopStart(MachineInstr &MI) { + return MI.getOpcode() == ARM::t2DoLoopStart || + MI.getOpcode() == ARM::t2WhileLoopStart; +} + +template +static MachineInstr* SearchForDef(MachineInstr *Begin, T End, unsigned Reg) { + for(auto &MI : make_range(T(Begin), End)) { + for (auto &MO : MI.operands()) { + if (!MO.isReg() || !MO.isDef() || MO.getReg() != Reg) + continue; + return &MI; + } + } + return nullptr; +} + +static MachineInstr* SearchForUse(MachineInstr *Begin, + 
MachineBasicBlock::iterator End, + unsigned Reg) { + for(auto &MI : make_range(MachineBasicBlock::iterator(Begin), End)) { + for (auto &MO : MI.operands()) { + if (!MO.isReg() || !MO.isUse() || MO.getReg() != Reg) + continue; + return &MI; + } + } + return nullptr; +} + +// Is it safe to define LR with DLS/WLS? +// LR can defined if it is the operand to start, because it's the same value, +// or if it's going to be equivalent to the operand to Start. +MachineInstr *ARMLowOverheadLoops::IsSafeToDefineLR(MachineInstr *Start) { + + auto IsMoveLR = [](MachineInstr *MI, unsigned Reg) { + return MI->getOpcode() == ARM::tMOVr && + MI->getOperand(0).getReg() == ARM::LR && + MI->getOperand(1).getReg() == Reg && + MI->getOperand(2).getImm() == ARMCC::AL; + }; + + MachineBasicBlock *MBB = Start->getParent(); + unsigned CountReg = Start->getOperand(0).getReg(); + // Walk forward and backward in the block to find the closest instructions + // that define LR. Then also filter them out if they're not a mov lr. + MachineInstr *PredLRDef = SearchForDef(Start, MBB->rend(), ARM::LR); + if (PredLRDef && !IsMoveLR(PredLRDef, CountReg)) + PredLRDef = nullptr; + + MachineInstr *SuccLRDef = SearchForDef(Start, MBB->end(), ARM::LR); + if (SuccLRDef && !IsMoveLR(SuccLRDef, CountReg)) + SuccLRDef = nullptr; + + // We've either found one, two or none mov lr instructions... Now figure out + // if they are performing the equilvant mov that the Start instruction will. + // Do this by scanning forward and backward to see if there's a def of the + // register holding the count value. If we find a suitable def, return it as + // the insert point. Later, if InsertPt != Start, then we can remove the + // redundant instruction. + if (SuccLRDef) { + MachineBasicBlock::iterator End(SuccLRDef); + if (!SearchForDef(Start, End, CountReg)) { + return SuccLRDef; + } else + SuccLRDef = nullptr; + } + if (PredLRDef) { + MachineBasicBlock::reverse_iterator End(PredLRDef); + if (!SearchForDef(Start, End, CountReg)) { + return PredLRDef; + } else + PredLRDef = nullptr; + } + + // We can define LR because LR already contains the same value. + if (Start->getOperand(0).getReg() == ARM::LR) + return Start; + + // We've found no suitable LR def and Start doesn't use LR directly. Can we + // just define LR anyway? + const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo(); + LivePhysRegs LiveRegs(*TRI); + LiveRegs.addLiveOuts(*MBB); + + // Not if we've haven't found a suitable mov and LR is live out. + if (LiveRegs.contains(ARM::LR)) + return nullptr; + + // If LR is not live out, we can insert the instruction if nothing else + // uses LR after it. + if (!SearchForUse(Start, MBB->end(), ARM::LR)) + return Start; + + LLVM_DEBUG(dbgs() << "ARM Loops: Failed to find suitable insertion point for" + << " LR\n"); + return nullptr; +} + bool ARMLowOverheadLoops::ProcessLoop(MachineLoop *ML) { bool Changed = false; @@ -111,15 +220,10 @@ bool ARMLowOverheadLoops::ProcessLoop(MachineLoop *ML) { LLVM_DEBUG(dbgs() << "ARM Loops: Processing " << *ML); - auto IsLoopStart = [](MachineInstr &MI) { - return MI.getOpcode() == ARM::t2DoLoopStart || - MI.getOpcode() == ARM::t2WhileLoopStart; - }; - // Search the given block for a loop start instruction. If one isn't found, // and there's only one predecessor block, search that one too. 
std::function SearchForStart = - [&IsLoopStart, &SearchForStart](MachineBasicBlock *MBB) -> MachineInstr* { + [&SearchForStart](MachineBasicBlock *MBB) -> MachineInstr* { for (auto &MI : *MBB) { if (IsLoopStart(MI)) return &MI; @@ -165,41 +269,62 @@ bool ARMLowOverheadLoops::ProcessLoop(MachineLoop *ML) { Dec = &MI; else if (MI.getOpcode() == ARM::t2LoopEnd) End = &MI; - else if (MI.getDesc().isCall()) + else if (IsLoopStart(MI)) + Start = &MI; + else if (MI.getDesc().isCall()) { // TODO: Though the call will require LE to execute again, does this // mean we should revert? Always executing LE hopefully should be // faster than performing a sub,cmp,br or even subs,br. Revert = true; + LLVM_DEBUG(dbgs() << "ARM Loops: Found call.\n"); + } - if (!Dec) + if (!Dec || End) continue; - // If we find that we load/store LR between LoopDec and LoopEnd, expect - // that the decremented value has been spilled to the stack. Because - // this value isn't actually going to be produced until the latch, by LE, - // we would need to generate a real sub. The value is also likely to be - // reloaded for use of LoopEnd - in which in case we'd need to perform - // an add because it gets negated again by LE! The other option is to - // then generate the other form of LE which doesn't perform the sub. - if (MI.mayLoad() || MI.mayStore()) - Revert = - MI.getOperand(0).isReg() && MI.getOperand(0).getReg() == ARM::LR; + // If we find that LR has been written or read between LoopDec and + // LoopEnd, expect that the decremented value is being used else where. + // Because this value isn't actually going to be produced until the + // latch, by LE, we would need to generate a real sub. The value is also + // likely to be copied/reloaded for use of LoopEnd - in which in case + // we'd need to perform an add because it gets subtracted again by LE! + // The other option is to then generate the other form of LE which doesn't + // perform the sub. + for (auto &MO : MI.operands()) { + if (MI.getOpcode() != ARM::t2LoopDec && MO.isReg() && + MO.getReg() == ARM::LR) { + LLVM_DEBUG(dbgs() << "ARM Loops: Found LR Use/Def: " << MI); + Revert = true; + break; + } + } } if (Dec && End && Revert) break; } + LLVM_DEBUG(if (Start) dbgs() << "ARM Loops: Found Loop Start: " << *Start; + if (Dec) dbgs() << "ARM Loops: Found Loop Dec: " << *Dec; + if (End) dbgs() << "ARM Loops: Found Loop End: " << *End;); + if (!Start && !Dec && !End) { LLVM_DEBUG(dbgs() << "ARM Loops: Not a low-overhead loop.\n"); return Changed; - } if (!(Start && Dec && End)) { - report_fatal_error("Failed to find all loop components"); + } else if (!(Start && Dec && End)) { + LLVM_DEBUG(dbgs() << "ARM Loops: Failed to find all loop components.\n"); + return false; } - if (!End->getOperand(1).isMBB() || - End->getOperand(1).getMBB() != ML->getHeader()) - report_fatal_error("Expected LoopEnd to target Loop Header"); + if (!End->getOperand(1).isMBB()) + report_fatal_error("Expected LoopEnd to target basic block"); + + // TODO Maybe there's cases where the target doesn't have to be the header, + // but for now be safe and revert. + if (End->getOperand(1).getMBB() != ML->getHeader()) { + LLVM_DEBUG(dbgs() << "ARM Loops: LoopEnd is not targetting header.\n"); + Revert = true; + } // The WLS and LE instructions have 12-bits for the label offset. WLS // requires a positive offset, while LE uses negative. 
@@ -216,41 +341,57 @@ bool ARMLowOverheadLoops::ProcessLoop(MachineLoop *ML) { Revert = true; } - LLVM_DEBUG(dbgs() << "ARM Loops:\n - Found Loop Start: " << *Start - << " - Found Loop Dec: " << *Dec - << " - Found Loop End: " << *End); + MachineInstr *InsertPt = Revert ? nullptr : IsSafeToDefineLR(Start); + if (!InsertPt) { + LLVM_DEBUG(dbgs() << "ARM Loops: Unable to find safe insertion point.\n"); + Revert = true; + } else + LLVM_DEBUG(dbgs() << "ARM Loops: Start insertion point: " << *InsertPt); - Expand(ML, Start, Dec, End, Revert); + Expand(ML, Start, InsertPt, Dec, End, Revert); return true; } // WhileLoopStart holds the exit block, so produce a cmp lr, 0 and then a // beq that branches to the exit branch. -// FIXME: Need to check that we're not trashing the CPSR when generating the -// cmp. We could also try to generate a cbz if the value in LR is also in +// TODO: We could also try to generate a cbz if the value in LR is also in // another low register. void ARMLowOverheadLoops::RevertWhile(MachineInstr *MI) const { LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to cmp: " << *MI); MachineBasicBlock *MBB = MI->getParent(); MachineInstrBuilder MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(ARM::t2CMPri)); - MIB.addReg(ARM::LR); + MIB.add(MI->getOperand(0)); MIB.addImm(0); MIB.addImm(ARMCC::AL); - MIB.addReg(ARM::CPSR); + MIB.addReg(ARM::NoRegister); + + MachineBasicBlock *DestBB = MI->getOperand(1).getMBB(); + unsigned BrOpc = BBUtils->isBBInRange(MI, DestBB, 254) ? + ARM::tBcc : ARM::t2Bcc; - // TODO: Try to use tBcc instead - MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(ARM::t2Bcc)); + MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(BrOpc)); MIB.add(MI->getOperand(1)); // branch target MIB.addImm(ARMCC::EQ); // condition code MIB.addReg(ARM::CPSR); MI->eraseFromParent(); } -// TODO: Check flags so that we can possibly generate a tSubs or tSub. -void ARMLowOverheadLoops::RevertLoopDec(MachineInstr *MI) const { +bool ARMLowOverheadLoops::RevertLoopDec(MachineInstr *MI, + bool AllowFlags) const { LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to sub: " << *MI); MachineBasicBlock *MBB = MI->getParent(); + + // If nothing uses or defines CPSR between LoopDec and LoopEnd, use a t2SUBS. + bool SetFlags = false; + if (AllowFlags) { + if (auto *Def = SearchForDef(MI, MBB->end(), ARM::CPSR)) { + if (!SearchForUse(MI, MBB->end(), ARM::CPSR) && + Def->getOpcode() == ARM::t2LoopEnd) + SetFlags = true; + } + } + MachineInstrBuilder MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(ARM::t2SUBri)); MIB.addDef(ARM::LR); @@ -258,28 +399,39 @@ void ARMLowOverheadLoops::RevertLoopDec(MachineInstr *MI) const { MIB.add(MI->getOperand(2)); MIB.addImm(ARMCC::AL); MIB.addReg(0); - MIB.addReg(0); + + if (SetFlags) { + MIB.addReg(ARM::CPSR); + MIB->getOperand(5).setIsDef(true); + } else + MIB.addReg(0); + MI->eraseFromParent(); + return SetFlags; } // Generate a subs, or sub and cmp, and a branch instead of an LE. -// FIXME: Need to check that we're not trashing the CPSR when generating -// the cmp. 
-void ARMLowOverheadLoops::RevertLoopEnd(MachineInstr *MI) const { +void ARMLowOverheadLoops::RevertLoopEnd(MachineInstr *MI, bool SkipCmp) const { LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to cmp, br: " << *MI); - // Create cmp MachineBasicBlock *MBB = MI->getParent(); - MachineInstrBuilder MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), - TII->get(ARM::t2CMPri)); - MIB.addReg(ARM::LR); - MIB.addImm(0); - MIB.addImm(ARMCC::AL); - MIB.addReg(ARM::CPSR); + // Create cmp + if (!SkipCmp) { + MachineInstrBuilder MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), + TII->get(ARM::t2CMPri)); + MIB.addReg(ARM::LR); + MIB.addImm(0); + MIB.addImm(ARMCC::AL); + MIB.addReg(ARM::NoRegister); + } + + MachineBasicBlock *DestBB = MI->getOperand(1).getMBB(); + unsigned BrOpc = BBUtils->isBBInRange(MI, DestBB, 254) ? + ARM::tBcc : ARM::t2Bcc; - // TODO Try to use tBcc instead. // Create bne - MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(ARM::t2Bcc)); + MachineInstrBuilder MIB = + BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(BrOpc)); MIB.add(MI->getOperand(1)); // branch target MIB.addImm(ARMCC::NE); // condition code MIB.addReg(ARM::CPSR); @@ -287,33 +439,13 @@ void ARMLowOverheadLoops::RevertLoopEnd(MachineInstr *MI) const { } void ARMLowOverheadLoops::Expand(MachineLoop *ML, MachineInstr *Start, + MachineInstr *InsertPt, MachineInstr *Dec, MachineInstr *End, bool Revert) { - auto ExpandLoopStart = [this](MachineLoop *ML, MachineInstr *Start) { - // The trip count should already been held in LR since the instructions - // within the loop can only read and write to LR. So, there should be a - // mov to setup the count. WLS/DLS perform this move, so find the original - // and delete it - inserting WLS/DLS in its place. - MachineBasicBlock *MBB = Start->getParent(); - MachineInstr *InsertPt = Start; - for (auto &I : MRI->def_instructions(ARM::LR)) { - if (I.getParent() != MBB) - continue; - - // Always execute. - if (!I.getOperand(2).isImm() || I.getOperand(2).getImm() != ARMCC::AL) - continue; - - // Only handle move reg, if the trip count it will need moving into a reg - // before the setup instruction anyway. - if (!I.getDesc().isMoveReg() || - !I.getOperand(1).isIdenticalTo(Start->getOperand(0))) - continue; - InsertPt = &I; - break; - } - + auto ExpandLoopStart = [this](MachineLoop *ML, MachineInstr *Start, + MachineInstr *InsertPt) { + MachineBasicBlock *MBB = InsertPt->getParent(); unsigned Opc = Start->getOpcode() == ARM::t2DoLoopStart ? 
ARM::t2DLS : ARM::t2WLS; MachineInstrBuilder MIB = @@ -369,16 +501,54 @@ void ARMLowOverheadLoops::Expand(MachineLoop *ML, MachineInstr *Start, RevertWhile(Start); else Start->eraseFromParent(); - RevertLoopDec(Dec); - RevertLoopEnd(End); + bool FlagsAlreadySet = RevertLoopDec(Dec, true); + RevertLoopEnd(End, FlagsAlreadySet); } else { - Start = ExpandLoopStart(ML, Start); + Start = ExpandLoopStart(ML, Start, InsertPt); RemoveDeadBranch(Start); End = ExpandLoopEnd(ML, Dec, End); RemoveDeadBranch(End); } } +bool ARMLowOverheadLoops::RevertNonLoops() { + LLVM_DEBUG(dbgs() << "ARM Loops: Reverting any remaining pseudos...\n"); + bool Changed = false; + + for (auto &MBB : *MF) { + SmallVector Starts; + SmallVector Decs; + SmallVector Ends; + + for (auto &I : MBB) { + if (IsLoopStart(I)) + Starts.push_back(&I); + else if (I.getOpcode() == ARM::t2LoopDec) + Decs.push_back(&I); + else if (I.getOpcode() == ARM::t2LoopEnd) + Ends.push_back(&I); + } + + if (Starts.empty() && Decs.empty() && Ends.empty()) + continue; + + Changed = true; + + for (auto *Start : Starts) { + if (Start->getOpcode() == ARM::t2WhileLoopStart) + RevertWhile(Start); + else + Start->eraseFromParent(); + } + for (auto *Dec : Decs) + RevertLoopDec(Dec); + + for (auto *End : Ends) + RevertLoopEnd(End); + } + return Changed; +} + FunctionPass *llvm::createARMLowOverheadLoopsPass() { return new ARMLowOverheadLoops(); } diff --git a/lib/Target/ARM/ARMMCInstLower.cpp b/lib/Target/ARM/ARMMCInstLower.cpp index 90c5ad025e56..c92689f4942e 100644 --- a/lib/Target/ARM/ARMMCInstLower.cpp +++ b/lib/Target/ARM/ARMMCInstLower.cpp @@ -74,8 +74,8 @@ bool ARMAsmPrinter::lowerOperand(const MachineOperand &MO, switch (MO.getType()) { default: llvm_unreachable("unknown operand type"); case MachineOperand::MO_Register: - // Ignore all non-CPSR implicit register operands. - if (MO.isImplicit() && MO.getReg() != ARM::CPSR) + // Ignore all implicit register operands. + if (MO.isImplicit()) return false; assert(!MO.getSubReg() && "Subregs should be eliminated!"); MCOp = MCOperand::createReg(MO.getReg()); diff --git a/lib/Target/ARM/ARMMachineFunctionInfo.h b/lib/Target/ARM/ARMMachineFunctionInfo.h index 90d794cd27b1..bb136e92329b 100644 --- a/lib/Target/ARM/ARMMachineFunctionInfo.h +++ b/lib/Target/ARM/ARMMachineFunctionInfo.h @@ -16,6 +16,7 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/CodeGen/MachineFunction.h" +#include "llvm/IR/GlobalVariable.h" #include "llvm/Support/ErrorHandling.h" #include @@ -130,6 +131,10 @@ class ARMFunctionInfo : public MachineFunctionInfo { /// The amount the literal pool has been increasedby due to promoted globals. int PromotedGlobalsIncrease = 0; + /// True if r0 will be preserved by a call to this function (e.g. C++ + /// con/destructors). + bool PreservesR0 = false; + public: ARMFunctionInfo() = default; @@ -247,6 +252,9 @@ public: } DenseMap EHPrologueRemappedRegs; + + void setPreservesR0() { PreservesR0 = true; } + bool getPreservesR0() const { return PreservesR0; } }; } // end namespace llvm diff --git a/lib/Target/ARM/ARMParallelDSP.cpp b/lib/Target/ARM/ARMParallelDSP.cpp index 5389d09bf7d7..ae5657a0a2c1 100644 --- a/lib/Target/ARM/ARMParallelDSP.cpp +++ b/lib/Target/ARM/ARMParallelDSP.cpp @@ -1,4 +1,4 @@ -//===- ParallelDSP.cpp - Parallel DSP Pass --------------------------------===// +//===- ARMParallelDSP.cpp - Parallel DSP Pass -----------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
// See https://llvm.org/LICENSE.txt for license information. @@ -18,13 +18,11 @@ #include "llvm/ADT/SmallPtrSet.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/LoopAccessAnalysis.h" -#include "llvm/Analysis/LoopPass.h" -#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/OrderedBasicBlock.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/NoFolder.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" -#include "llvm/Transforms/Utils/LoopUtils.h" #include "llvm/Pass.h" #include "llvm/PassRegistry.h" #include "llvm/PassSupport.h" @@ -45,54 +43,39 @@ static cl::opt DisableParallelDSP("disable-arm-parallel-dsp", cl::Hidden, cl::init(false), cl::desc("Disable the ARM Parallel DSP pass")); +static cl::opt +NumLoadLimit("arm-parallel-dsp-load-limit", cl::Hidden, cl::init(16), + cl::desc("Limit the number of loads analysed")); + namespace { - struct OpChain; - struct BinOpChain; + struct MulCandidate; class Reduction; - using OpChainList = SmallVector, 8>; - using ReductionList = SmallVector; - using ValueList = SmallVector; - using MemInstList = SmallVector; - using PMACPair = std::pair; - using PMACPairList = SmallVector; - using Instructions = SmallVector; - using MemLocList = SmallVector; + using MulCandList = SmallVector, 8>; + using MemInstList = SmallVectorImpl; + using MulPairList = SmallVector, 8>; - struct OpChain { + // 'MulCandidate' holds the multiplication instructions that are candidates + // for parallel execution. + struct MulCandidate { Instruction *Root; - ValueList AllValues; - MemInstList VecLd; // List of all load instructions. - MemInstList Loads; + Value* LHS; + Value* RHS; + bool Exchange = false; bool ReadOnly = true; + bool Paired = false; + SmallVector VecLd; // Container for loads to widen. - OpChain(Instruction *I, ValueList &vl) : Root(I), AllValues(vl) { } - virtual ~OpChain() = default; + MulCandidate(Instruction *I, Value *lhs, Value *rhs) : + Root(I), LHS(lhs), RHS(rhs) { } - void PopulateLoads() { - for (auto *V : AllValues) { - if (auto *Ld = dyn_cast(V)) - Loads.push_back(Ld); - } + bool HasTwoLoadInputs() const { + return isa(LHS) && isa(RHS); } - unsigned size() const { return AllValues.size(); } - }; - - // 'BinOpChain' holds the multiplication instructions that are candidates - // for parallel execution. - struct BinOpChain : public OpChain { - ValueList LHS; // List of all (narrow) left hand operands. - ValueList RHS; // List of all (narrow) right hand operands. - bool Exchange = false; - - BinOpChain(Instruction *I, ValueList &lhs, ValueList &rhs) : - OpChain(I, lhs), LHS(lhs), RHS(rhs) { - for (auto *V : RHS) - AllValues.push_back(V); - } - - bool AreSymmetrical(BinOpChain *Other); + LoadInst *getBaseLoad() const { + return VecLd.front(); + } }; /// Represent a sequence of multiply-accumulate operations with the aim to @@ -100,9 +83,9 @@ namespace { class Reduction { Instruction *Root = nullptr; Value *Acc = nullptr; - OpChainList Muls; - PMACPairList MulPairs; - SmallPtrSet Adds; + MulCandList Muls; + MulPairList MulPairs; + SetVector Adds; public: Reduction() = delete; @@ -112,10 +95,35 @@ namespace { /// Record an Add instruction that is a part of the this reduction. void InsertAdd(Instruction *I) { Adds.insert(I); } - /// Record a BinOpChain, rooted at a Mul instruction, that is a part of - /// this reduction. 
- void InsertMul(Instruction *I, ValueList &LHS, ValueList &RHS) { - Muls.push_back(make_unique(I, LHS, RHS)); + /// Create MulCandidates, each rooted at a Mul instruction, that is a part + /// of this reduction. + void InsertMuls() { + auto GetMulOperand = [](Value *V) -> Instruction* { + if (auto *SExt = dyn_cast(V)) { + if (auto *I = dyn_cast(SExt->getOperand(0))) + if (I->getOpcode() == Instruction::Mul) + return I; + } else if (auto *I = dyn_cast(V)) { + if (I->getOpcode() == Instruction::Mul) + return I; + } + return nullptr; + }; + + auto InsertMul = [this](Instruction *I) { + Value *LHS = cast(I->getOperand(0))->getOperand(0); + Value *RHS = cast(I->getOperand(1))->getOperand(0); + Muls.push_back(std::make_unique(I, LHS, RHS)); + }; + + for (auto *Add : Adds) { + if (Add == Acc) + continue; + if (auto *Mul = GetMulOperand(Add->getOperand(0))) + InsertMul(Mul); + if (auto *Mul = GetMulOperand(Add->getOperand(1))) + InsertMul(Mul); + } } /// Add the incoming accumulator value, returns true if a value had not @@ -128,9 +136,17 @@ namespace { return true; } - /// Set two BinOpChains, rooted at muls, that can be executed as a single + /// Set two MulCandidates, rooted at muls, that can be executed as a single /// parallel operation. - void AddMulPair(BinOpChain *Mul0, BinOpChain *Mul1) { + void AddMulPair(MulCandidate *Mul0, MulCandidate *Mul1, + bool Exchange = false) { + LLVM_DEBUG(dbgs() << "Pairing:\n" + << *Mul0->Root << "\n" + << *Mul1->Root << "\n"); + Mul0->Paired = true; + Mul1->Paired = true; + if (Exchange) + Mul1->Exchange = true; MulPairs.push_back(std::make_pair(Mul0, Mul1)); } @@ -141,24 +157,40 @@ namespace { /// Return the add instruction which is the root of the reduction. Instruction *getRoot() { return Root; } + bool is64Bit() const { return Root->getType()->isIntegerTy(64); } + + Type *getType() const { return Root->getType(); } + /// Return the incoming value to be accumulated. This maybe null. Value *getAccumulator() { return Acc; } /// Return the set of adds that comprise the reduction. - SmallPtrSetImpl &getAdds() { return Adds; } + SetVector &getAdds() { return Adds; } - /// Return the BinOpChain, rooted at mul instruction, that comprise the + /// Return the MulCandidate, rooted at mul instruction, that comprise the /// the reduction. - OpChainList &getMuls() { return Muls; } + MulCandList &getMuls() { return Muls; } - /// Return the BinOpChain, rooted at mul instructions, that have been + /// Return the MulCandidate, rooted at mul instructions, that have been /// paired for parallel execution. - PMACPairList &getMulPairs() { return MulPairs; } + MulPairList &getMulPairs() { return MulPairs; } /// To finalise, replace the uses of the root with the intrinsic call. 
void UpdateRoot(Instruction *SMLAD) { Root->replaceAllUsesWith(SMLAD); } + + void dump() { + LLVM_DEBUG(dbgs() << "Reduction:\n"; + for (auto *Add : Adds) + LLVM_DEBUG(dbgs() << *Add << "\n"); + for (auto &Mul : Muls) + LLVM_DEBUG(dbgs() << *Mul->Root << "\n" + << " " << *Mul->LHS << "\n" + << " " << *Mul->RHS << "\n"); + LLVM_DEBUG(if (Acc) dbgs() << "Acc in: " << *Acc << "\n") + ); + } }; class WidenedLoad { @@ -176,13 +208,11 @@ namespace { } }; - class ARMParallelDSP : public LoopPass { + class ARMParallelDSP : public FunctionPass { ScalarEvolution *SE; AliasAnalysis *AA; TargetLibraryInfo *TLI; DominatorTree *DT; - LoopInfo *LI; - Loop *L; const DataLayout *DL; Module *M; std::map LoadPairs; @@ -190,13 +220,12 @@ namespace { std::map> WideLoads; template - bool IsNarrowSequence(Value *V, ValueList &VL); - + bool IsNarrowSequence(Value *V); + bool Search(Value *V, BasicBlock *BB, Reduction &R); bool RecordMemoryOps(BasicBlock *BB); void InsertParallelMACs(Reduction &Reduction); bool AreSequentialLoads(LoadInst *Ld0, LoadInst *Ld1, MemInstList &VecMem); - LoadInst* CreateWideLoad(SmallVectorImpl &Loads, - IntegerType *LoadTy); + LoadInst* CreateWideLoad(MemInstList &Loads, IntegerType *LoadTy); bool CreateParallelPairs(Reduction &R); /// Try to match and generate: SMLAD, SMLADX - Signed Multiply Accumulate @@ -204,60 +233,38 @@ namespace { /// products to a 32-bit accumulate operand. Optionally, the instruction can /// exchange the halfwords of the second operand before performing the /// arithmetic. - bool MatchSMLAD(Loop *L); + bool MatchSMLAD(Function &F); public: static char ID; - ARMParallelDSP() : LoopPass(ID) { } - - bool doInitialization(Loop *L, LPPassManager &LPM) override { - LoadPairs.clear(); - WideLoads.clear(); - return true; - } + ARMParallelDSP() : FunctionPass(ID) { } void getAnalysisUsage(AnalysisUsage &AU) const override { - LoopPass::getAnalysisUsage(AU); + FunctionPass::getAnalysisUsage(AU); AU.addRequired(); AU.addRequired(); AU.addRequired(); AU.addRequired(); - AU.addRequired(); AU.addRequired(); AU.addRequired(); - AU.addPreserved(); + AU.addPreserved(); + AU.addPreserved(); AU.setPreservesCFG(); } - bool runOnLoop(Loop *TheLoop, LPPassManager &) override { + bool runOnFunction(Function &F) override { if (DisableParallelDSP) return false; - L = TheLoop; + if (skipFunction(F)) + return false; + SE = &getAnalysis().getSE(); AA = &getAnalysis().getAAResults(); - TLI = &getAnalysis().getTLI(); + TLI = &getAnalysis().getTLI(F); DT = &getAnalysis().getDomTree(); - LI = &getAnalysis().getLoopInfo(); auto &TPC = getAnalysis(); - BasicBlock *Header = TheLoop->getHeader(); - if (!Header) - return false; - - // TODO: We assume the loop header and latch to be the same block. - // This is not a fundamental restriction, but lifting this would just - // require more work to do the transformation and then patch up the CFG. 
- if (Header != TheLoop->getLoopLatch()) { - LLVM_DEBUG(dbgs() << "The loop header is not the loop latch: not " - "running pass ARMParallelDSP\n"); - return false; - } - - if (!TheLoop->getLoopPreheader()) - InsertPreheaderForLoop(L, DT, LI, nullptr, true); - - Function &F = *Header->getParent(); M = F.getParent(); DL = &M->getDataLayout(); @@ -282,17 +289,10 @@ namespace { return false; } - LoopAccessInfo LAI(L, SE, TLI, AA, DT, LI); - LLVM_DEBUG(dbgs() << "\n== Parallel DSP pass ==\n"); LLVM_DEBUG(dbgs() << " - " << F.getName() << "\n\n"); - if (!RecordMemoryOps(Header)) { - LLVM_DEBUG(dbgs() << " - No sequential loads found.\n"); - return false; - } - - bool Changes = MatchSMLAD(L); + bool Changes = MatchSMLAD(F); return Changes; } }; @@ -331,40 +331,14 @@ bool ARMParallelDSP::AreSequentialLoads(LoadInst *Ld0, LoadInst *Ld1, // TODO: we currently only collect i16, and will support i8 later, so that's // why we check that types are equal to MaxBitWidth, and not <= MaxBitWidth. template -bool ARMParallelDSP::IsNarrowSequence(Value *V, ValueList &VL) { - ConstantInt *CInt; - - if (match(V, m_ConstantInt(CInt))) { - // TODO: if a constant is used, it needs to fit within the bit width. - return false; - } - - auto *I = dyn_cast(V); - if (!I) - return false; - - Value *Val, *LHS, *RHS; - if (match(V, m_Trunc(m_Value(Val)))) { - if (cast(I)->getDestTy()->getIntegerBitWidth() == MaxBitWidth) - return IsNarrowSequence(Val, VL); - } else if (match(V, m_Add(m_Value(LHS), m_Value(RHS)))) { - // TODO: we need to implement sadd16/sadd8 for this, which enables to - // also do the rewrite for smlad8.ll, but it is unsupported for now. - return false; - } else if (match(V, m_ZExtOrSExt(m_Value(Val)))) { - if (cast(I)->getSrcTy()->getIntegerBitWidth() != MaxBitWidth) +bool ARMParallelDSP::IsNarrowSequence(Value *V) { + if (auto *SExt = dyn_cast(V)) { + if (SExt->getSrcTy()->getIntegerBitWidth() != MaxBitWidth) return false; - if (match(Val, m_Load(m_Value()))) { - auto *Ld = cast(Val); - - // Check that these load could be paired. - if (!LoadPairs.count(Ld) && !OffsetLoads.count(Ld)) - return false; - - VL.push_back(Val); - VL.push_back(I); - return true; + if (auto *Ld = dyn_cast(SExt->getOperand(0))) { + // Check that this load could be paired. + return LoadPairs.count(Ld) || OffsetLoads.count(Ld); } } return false; @@ -375,6 +349,9 @@ bool ARMParallelDSP::IsNarrowSequence(Value *V, ValueList &VL) { bool ARMParallelDSP::RecordMemoryOps(BasicBlock *BB) { SmallVector Loads; SmallVector Writes; + LoadPairs.clear(); + WideLoads.clear(); + OrderedBasicBlock OrderedBB(BB); // Collect loads and instruction that may write to memory. For now we only // record loads which are simple, sign-extended and have a single user. @@ -389,21 +366,24 @@ bool ARMParallelDSP::RecordMemoryOps(BasicBlock *BB) { Loads.push_back(Ld); } + if (Loads.empty() || Loads.size() > NumLoadLimit) + return false; + using InstSet = std::set; using DepMap = std::map; DepMap RAWDeps; // Record any writes that may alias a load. 
const auto Size = LocationSize::unknown(); - for (auto Read : Loads) { - for (auto Write : Writes) { + for (auto Write : Writes) { + for (auto Read : Loads) { MemoryLocation ReadLoc = MemoryLocation(Read->getPointerOperand(), Size); if (!isModOrRefSet(intersectModRef(AA->getModRefInfo(Write, ReadLoc), ModRefInfo::ModRef))) continue; - if (DT->dominates(Write, Read)) + if (OrderedBB.dominates(Write, Read)) RAWDeps[Read].insert(Write); } } @@ -411,17 +391,16 @@ // Check whether there's not a write between the two loads which would // prevent them from being safely merged. auto SafeToPair = [&](LoadInst *Base, LoadInst *Offset) { - LoadInst *Dominator = DT->dominates(Base, Offset) ? Base : Offset; - LoadInst *Dominated = DT->dominates(Base, Offset) ? Offset : Base; + LoadInst *Dominator = OrderedBB.dominates(Base, Offset) ? Base : Offset; + LoadInst *Dominated = OrderedBB.dominates(Base, Offset) ? Offset : Base; if (RAWDeps.count(Dominated)) { InstSet &WritesBefore = RAWDeps[Dominated]; for (auto Before : WritesBefore) { - // We can't move the second load backward, past a write, to merge // with the first load. - if (DT->dominates(Dominator, Before)) + if (OrderedBB.dominates(Dominator, Before)) return false; } } @@ -431,7 +410,7 @@ // Record base, offset load pairs. for (auto *Base : Loads) { for (auto *Offset : Loads) { - if (Base == Offset) + if (Base == Offset || OffsetLoads.count(Offset)) continue; if (AreSequentialAccesses(Base, Offset, *DL, *SE) && @@ -453,7 +432,54 @@ return LoadPairs.size() > 1; } -// Loop Pass that needs to identify integer add/sub reductions of 16-bit vector +// Search recursively back through the operands to find a tree of values that +// form a multiply-accumulate chain. The search records the Add and Mul +// instructions that form the reduction and allows us to find a single value +// to be used as the initial input to the accumulator. +bool ARMParallelDSP::Search(Value *V, BasicBlock *BB, Reduction &R) { + // If we find a non-instruction, try to use it as the initial accumulator + // value. This may have already been found during the search in which case + // this function will return false, signaling a search fail. + auto *I = dyn_cast(V); + if (!I) + return R.InsertAcc(V); + + if (I->getParent() != BB) + return false; + + switch (I->getOpcode()) { + default: + break; + case Instruction::PHI: + // Could be the accumulator value. + return R.InsertAcc(V); + case Instruction::Add: { + // Adds should be adding together two muls, or another add and a mul to + // be within the mac chain. One of the operands may also be the + // accumulator value at which point we should stop searching. + R.InsertAdd(I); + Value *LHS = I->getOperand(0); + Value *RHS = I->getOperand(1); + bool ValidLHS = Search(LHS, BB, R); + bool ValidRHS = Search(RHS, BB, R); + + if (ValidLHS && ValidRHS) + return true; + + return R.InsertAcc(I); + } + case Instruction::Mul: { + Value *MulOp0 = I->getOperand(0); + Value *MulOp1 = I->getOperand(1); + return IsNarrowSequence<16>(MulOp0) && IsNarrowSequence<16>(MulOp1); + } + case Instruction::SExt: + return Search(I->getOperand(0), BB, R); + } + return false; +} + +// The pass needs to identify integer add/sub reductions of 16-bit vector
// To use SMLAD: // 1) we first need to find integer add then look for this pattern: @@ -484,88 +510,39 @@ bool ARMParallelDSP::RecordMemoryOps(BasicBlock *BB) { // If loop invariants are used instead of loads, these need to be packed // before the loop begins. // -bool ARMParallelDSP::MatchSMLAD(Loop *L) { - // Search recursively back through the operands to find a tree of values that - // form a multiply-accumulate chain. The search records the Add and Mul - // instructions that form the reduction and allows us to find a single value - // to be used as the initial input to the accumlator. - std::function Search = [&] - (Value *V, Reduction &R) -> bool { - - // If we find a non-instruction, try to use it as the initial accumulator - // value. This may have already been found during the search in which case - // this function will return false, signaling a search fail. - auto *I = dyn_cast(V); - if (!I) - return R.InsertAcc(V); - - switch (I->getOpcode()) { - default: - break; - case Instruction::PHI: - // Could be the accumulator value. - return R.InsertAcc(V); - case Instruction::Add: { - // Adds should be adding together two muls, or another add and a mul to - // be within the mac chain. One of the operands may also be the - // accumulator value at which point we should stop searching. - bool ValidLHS = Search(I->getOperand(0), R); - bool ValidRHS = Search(I->getOperand(1), R); - if (!ValidLHS && !ValidLHS) - return false; - else if (ValidLHS && ValidRHS) { - R.InsertAdd(I); - return true; - } else { - R.InsertAdd(I); - return R.InsertAcc(I); - } - } - case Instruction::Mul: { - Value *MulOp0 = I->getOperand(0); - Value *MulOp1 = I->getOperand(1); - if (isa(MulOp0) && isa(MulOp1)) { - ValueList LHS; - ValueList RHS; - if (IsNarrowSequence<16>(MulOp0, LHS) && - IsNarrowSequence<16>(MulOp1, RHS)) { - R.InsertMul(I, LHS, RHS); - return true; - } - } - return false; - } - case Instruction::SExt: - return Search(I->getOperand(0), R); - } - return false; - }; - +bool ARMParallelDSP::MatchSMLAD(Function &F) { bool Changed = false; - SmallPtrSet AllAdds; - BasicBlock *Latch = L->getLoopLatch(); - for (Instruction &I : reverse(*Latch)) { - if (I.getOpcode() != Instruction::Add) + for (auto &BB : F) { + SmallPtrSet AllAdds; + if (!RecordMemoryOps(&BB)) continue; - if (AllAdds.count(&I)) - continue; + for (Instruction &I : reverse(BB)) { + if (I.getOpcode() != Instruction::Add) + continue; - const auto *Ty = I.getType(); - if (!Ty->isIntegerTy(32) && !Ty->isIntegerTy(64)) - continue; + if (AllAdds.count(&I)) + continue; - Reduction R(&I); - if (!Search(&I, R)) - continue; + const auto *Ty = I.getType(); + if (!Ty->isIntegerTy(32) && !Ty->isIntegerTy(64)) + continue; - if (!CreateParallelPairs(R)) - continue; + Reduction R(&I); + if (!Search(&I, &BB, R)) + continue; - InsertParallelMACs(R); - Changed = true; - AllAdds.insert(R.getAdds().begin(), R.getAdds().end()); + R.InsertMuls(); + LLVM_DEBUG(dbgs() << "After search, Reduction:\n"; R.dump()); + + if (!CreateParallelPairs(R)) + continue; + + InsertParallelMACs(R); + Changed = true; + AllAdds.insert(R.getAdds().begin(), R.getAdds().end()); + } } return Changed; @@ -578,87 +555,57 @@ bool ARMParallelDSP::CreateParallelPairs(Reduction &R) { return false; // Check that the muls operate directly upon sign extended loads. - for (auto &MulChain : R.getMuls()) { - // A mul has 2 operands, and a narrow op consist of sext and a load; thus - // we expect at least 4 items in this operand value list. 
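// [Editor's note: illustrative sketch only, not part of this patch. The
// function below is invented for the example; it shows the reduction shape
// that the rewritten Search()/MatchSMLAD() walk: sign-extended 16-bit loads
// feeding multiplies that accumulate into a 32-bit sum. The sequential
// a[i]/a[i+1] and b[i]/b[i+1] loads are what RecordMemoryOps pairs up, so the
// two multiply-adds can be rewritten as one widened load per array plus an
// smlad intrinsic call.]
static int dot_product_sketch(const short *a, const short *b, int acc, int n) {
  for (int i = 0; i < n; i += 2) {
    acc += a[i] * b[i];         // becomes one MulCandidate
    acc += a[i + 1] * b[i + 1]; // a second, pairable MulCandidate
  }
  return acc;
}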
- if (MulChain->size() < 4) { - LLVM_DEBUG(dbgs() << "Operand list too short.\n"); + for (auto &MulCand : R.getMuls()) { + if (!MulCand->HasTwoLoadInputs()) return false; - } - MulChain->PopulateLoads(); - ValueList &LHS = static_cast(MulChain.get())->LHS; - ValueList &RHS = static_cast(MulChain.get())->RHS; - - // Use +=2 to skip over the expected extend instructions. - for (unsigned i = 0, e = LHS.size(); i < e; i += 2) { - if (!isa(LHS[i]) || !isa(RHS[i])) - return false; - } } - auto CanPair = [&](Reduction &R, BinOpChain *PMul0, BinOpChain *PMul1) { - if (!PMul0->AreSymmetrical(PMul1)) - return false; - + auto CanPair = [&](Reduction &R, MulCandidate *PMul0, MulCandidate *PMul1) { // The first elements of each vector should be loads with sexts. If we // find that its two pairs of consecutive loads, then these can be // transformed into two wider loads and the users can be replaced with // DSP intrinsics. - for (unsigned x = 0; x < PMul0->LHS.size(); x += 2) { - auto *Ld0 = dyn_cast(PMul0->LHS[x]); - auto *Ld1 = dyn_cast(PMul1->LHS[x]); - auto *Ld2 = dyn_cast(PMul0->RHS[x]); - auto *Ld3 = dyn_cast(PMul1->RHS[x]); - - if (!Ld0 || !Ld1 || !Ld2 || !Ld3) - return false; + auto Ld0 = static_cast(PMul0->LHS); + auto Ld1 = static_cast(PMul1->LHS); + auto Ld2 = static_cast(PMul0->RHS); + auto Ld3 = static_cast(PMul1->RHS); - LLVM_DEBUG(dbgs() << "Loads:\n" - << " - " << *Ld0 << "\n" - << " - " << *Ld1 << "\n" - << " - " << *Ld2 << "\n" - << " - " << *Ld3 << "\n"); - - if (AreSequentialLoads(Ld0, Ld1, PMul0->VecLd)) { - if (AreSequentialLoads(Ld2, Ld3, PMul1->VecLd)) { - LLVM_DEBUG(dbgs() << "OK: found two pairs of parallel loads!\n"); - R.AddMulPair(PMul0, PMul1); - return true; - } else if (AreSequentialLoads(Ld3, Ld2, PMul1->VecLd)) { - LLVM_DEBUG(dbgs() << "OK: found two pairs of parallel loads!\n"); - LLVM_DEBUG(dbgs() << " exchanging Ld2 and Ld3\n"); - PMul1->Exchange = true; - R.AddMulPair(PMul0, PMul1); - return true; - } - } else if (AreSequentialLoads(Ld1, Ld0, PMul0->VecLd) && - AreSequentialLoads(Ld2, Ld3, PMul1->VecLd)) { + if (AreSequentialLoads(Ld0, Ld1, PMul0->VecLd)) { + if (AreSequentialLoads(Ld2, Ld3, PMul1->VecLd)) { LLVM_DEBUG(dbgs() << "OK: found two pairs of parallel loads!\n"); - LLVM_DEBUG(dbgs() << " exchanging Ld0 and Ld1\n"); - LLVM_DEBUG(dbgs() << " and swapping muls\n"); - PMul0->Exchange = true; - // Only the second operand can be exchanged, so swap the muls. - R.AddMulPair(PMul1, PMul0); + R.AddMulPair(PMul0, PMul1); + return true; + } else if (AreSequentialLoads(Ld3, Ld2, PMul1->VecLd)) { + LLVM_DEBUG(dbgs() << "OK: found two pairs of parallel loads!\n"); + LLVM_DEBUG(dbgs() << " exchanging Ld2 and Ld3\n"); + R.AddMulPair(PMul0, PMul1, true); return true; } + } else if (AreSequentialLoads(Ld1, Ld0, PMul0->VecLd) && + AreSequentialLoads(Ld2, Ld3, PMul1->VecLd)) { + LLVM_DEBUG(dbgs() << "OK: found two pairs of parallel loads!\n"); + LLVM_DEBUG(dbgs() << " exchanging Ld0 and Ld1\n"); + LLVM_DEBUG(dbgs() << " and swapping muls\n"); + // Only the second operand can be exchanged, so swap the muls. 
+ R.AddMulPair(PMul1, PMul0, true); + return true; } return false; }; - OpChainList &Muls = R.getMuls(); + MulCandList &Muls = R.getMuls(); const unsigned Elems = Muls.size(); - SmallPtrSet Paired; for (unsigned i = 0; i < Elems; ++i) { - BinOpChain *PMul0 = static_cast(Muls[i].get()); - if (Paired.count(PMul0->Root)) + MulCandidate *PMul0 = static_cast(Muls[i].get()); + if (PMul0->Paired) continue; for (unsigned j = 0; j < Elems; ++j) { if (i == j) continue; - BinOpChain *PMul1 = static_cast(Muls[j].get()); - if (Paired.count(PMul1->Root)) + MulCandidate *PMul1 = static_cast(Muls[j].get()); + if (PMul1->Paired) continue; const Instruction *Mul0 = PMul0->Root; @@ -668,29 +615,19 @@ bool ARMParallelDSP::CreateParallelPairs(Reduction &R) { assert(PMul0 != PMul1 && "expected different chains"); - if (CanPair(R, PMul0, PMul1)) { - Paired.insert(Mul0); - Paired.insert(Mul1); + if (CanPair(R, PMul0, PMul1)) break; - } } } return !R.getMulPairs().empty(); } - void ARMParallelDSP::InsertParallelMACs(Reduction &R) { - auto CreateSMLADCall = [&](SmallVectorImpl &VecLd0, - SmallVectorImpl &VecLd1, - Value *Acc, bool Exchange, - Instruction *InsertAfter) { + auto CreateSMLAD = [&](LoadInst* WideLd0, LoadInst *WideLd1, + Value *Acc, bool Exchange, + Instruction *InsertAfter) { // Replace the reduction chain with an intrinsic call - IntegerType *Ty = IntegerType::get(M->getContext(), 32); - LoadInst *WideLd0 = WideLoads.count(VecLd0[0]) ? - WideLoads[VecLd0[0]]->getLoad() : CreateWideLoad(VecLd0, Ty); - LoadInst *WideLd1 = WideLoads.count(VecLd1[0]) ? - WideLoads[VecLd1[0]]->getLoad() : CreateWideLoad(VecLd1, Ty); Value* Args[] = { WideLd0, WideLd1, Acc }; Function *SMLAD = nullptr; @@ -704,34 +641,95 @@ void ARMParallelDSP::InsertParallelMACs(Reduction &R) { Intrinsic::getDeclaration(M, Intrinsic::arm_smlald); IRBuilder Builder(InsertAfter->getParent(), - ++BasicBlock::iterator(InsertAfter)); + BasicBlock::iterator(InsertAfter)); Instruction *Call = Builder.CreateCall(SMLAD, Args); NumSMLAD++; return Call; }; - Instruction *InsertAfter = R.getRoot(); + // Return the instruction after the dominated instruction. + auto GetInsertPoint = [this](Value *A, Value *B) { + assert((isa(A) || isa(B)) && + "expected at least one instruction"); + + Value *V = nullptr; + if (!isa(A)) + V = B; + else if (!isa(B)) + V = A; + else + V = DT->dominates(cast(A), cast(B)) ? B : A; + + return &*++BasicBlock::iterator(cast(V)); + }; + Value *Acc = R.getAccumulator(); - if (!Acc) - Acc = ConstantInt::get(IntegerType::get(M->getContext(), 32), 0); - LLVM_DEBUG(dbgs() << "Root: " << *InsertAfter << "\n" - << "Acc: " << *Acc << "\n"); + // For any muls that were discovered but not paired, accumulate their values + // as before. + IRBuilder Builder(R.getRoot()->getParent()); + MulCandList &MulCands = R.getMuls(); + for (auto &MulCand : MulCands) { + if (MulCand->Paired) + continue; + + Instruction *Mul = cast(MulCand->Root); + LLVM_DEBUG(dbgs() << "Accumulating unpaired mul: " << *Mul << "\n"); + + if (R.getType() != Mul->getType()) { + assert(R.is64Bit() && "expected 64-bit result"); + Builder.SetInsertPoint(&*++BasicBlock::iterator(Mul)); + Mul = cast(Builder.CreateSExt(Mul, R.getRoot()->getType())); + } + + if (!Acc) { + Acc = Mul; + continue; + } + + // If Acc is the original incoming value to the reduction, it could be a + // phi. But the phi will dominate Mul, meaning that Mul will be the + // insertion point. 
+ Builder.SetInsertPoint(GetInsertPoint(Mul, Acc)); + Acc = Builder.CreateAdd(Mul, Acc); + } + + if (!Acc) { + Acc = R.is64Bit() ? + ConstantInt::get(IntegerType::get(M->getContext(), 64), 0) : + ConstantInt::get(IntegerType::get(M->getContext(), 32), 0); + } else if (Acc->getType() != R.getType()) { + Builder.SetInsertPoint(R.getRoot()); + Acc = Builder.CreateSExt(Acc, R.getType()); + } + + // Roughly sort the mul pairs in their program order. + OrderedBasicBlock OrderedBB(R.getRoot()->getParent()); + llvm::sort(R.getMulPairs(), [&OrderedBB](auto &PairA, auto &PairB) { + const Instruction *A = PairA.first->Root; + const Instruction *B = PairB.first->Root; + return OrderedBB.dominates(A, B); + }); + + IntegerType *Ty = IntegerType::get(M->getContext(), 32); for (auto &Pair : R.getMulPairs()) { - BinOpChain *PMul0 = Pair.first; - BinOpChain *PMul1 = Pair.second; - LLVM_DEBUG(dbgs() << "Muls:\n" - << "- " << *PMul0->Root << "\n" - << "- " << *PMul1->Root << "\n"); - - Acc = CreateSMLADCall(PMul0->VecLd, PMul1->VecLd, Acc, PMul1->Exchange, - InsertAfter); - InsertAfter = cast(Acc); + MulCandidate *LHSMul = Pair.first; + MulCandidate *RHSMul = Pair.second; + LoadInst *BaseLHS = LHSMul->getBaseLoad(); + LoadInst *BaseRHS = RHSMul->getBaseLoad(); + LoadInst *WideLHS = WideLoads.count(BaseLHS) ? + WideLoads[BaseLHS]->getLoad() : CreateWideLoad(LHSMul->VecLd, Ty); + LoadInst *WideRHS = WideLoads.count(BaseRHS) ? + WideLoads[BaseRHS]->getLoad() : CreateWideLoad(RHSMul->VecLd, Ty); + + Instruction *InsertAfter = GetInsertPoint(WideLHS, WideRHS); + InsertAfter = GetInsertPoint(InsertAfter, Acc); + Acc = CreateSMLAD(WideLHS, WideRHS, Acc, RHSMul->Exchange, InsertAfter); } R.UpdateRoot(cast(Acc)); } -LoadInst* ARMParallelDSP::CreateWideLoad(SmallVectorImpl &Loads, +LoadInst* ARMParallelDSP::CreateWideLoad(MemInstList &Loads, IntegerType *LoadTy) { assert(Loads.size() == 2 && "currently only support widening two loads"); @@ -758,8 +756,8 @@ LoadInst* ARMParallelDSP::CreateWideLoad(SmallVectorImpl &Loads, return; Source->moveBefore(Sink); - for (auto &U : Source->uses()) - MoveBefore(Source, U.getUser()); + for (auto &Op : Source->operands()) + MoveBefore(Op, Source); }; // Insert the load at the point of the original dominating load. @@ -784,57 +782,30 @@ LoadInst* ARMParallelDSP::CreateWideLoad(SmallVectorImpl &Loads, // Loads[0] needs trunc while Loads[1] needs a lshr and trunc. // TODO: Support big-endian as well. Value *Bottom = IRB.CreateTrunc(WideLoad, Base->getType()); - BaseSExt->setOperand(0, Bottom); + Value *NewBaseSExt = IRB.CreateSExt(Bottom, BaseSExt->getType()); + BaseSExt->replaceAllUsesWith(NewBaseSExt); IntegerType *OffsetTy = cast(Offset->getType()); Value *ShiftVal = ConstantInt::get(LoadTy, OffsetTy->getBitWidth()); Value *Top = IRB.CreateLShr(WideLoad, ShiftVal); Value *Trunc = IRB.CreateTrunc(Top, OffsetTy); - OffsetSExt->setOperand(0, Trunc); - + Value *NewOffsetSExt = IRB.CreateSExt(Trunc, OffsetSExt->getType()); + OffsetSExt->replaceAllUsesWith(NewOffsetSExt); + + LLVM_DEBUG(dbgs() << "From Base and Offset:\n" + << *Base << "\n" << *Offset << "\n" + << "Created Wide Load:\n" + << *WideLoad << "\n" + << *Bottom << "\n" + << *NewBaseSExt << "\n" + << *Top << "\n" + << *Trunc << "\n" + << *NewOffsetSExt << "\n"); WideLoads.emplace(std::make_pair(Base, - make_unique(Loads, WideLoad))); + std::make_unique(Loads, WideLoad))); return WideLoad; } -// Compare the value lists in Other to this chain. 
-bool BinOpChain::AreSymmetrical(BinOpChain *Other) { - // Element-by-element comparison of Value lists returning true if they are - // instructions with the same opcode or constants with the same value. - auto CompareValueList = [](const ValueList &VL0, - const ValueList &VL1) { - if (VL0.size() != VL1.size()) { - LLVM_DEBUG(dbgs() << "Muls are mismatching operand list lengths: " - << VL0.size() << " != " << VL1.size() << "\n"); - return false; - } - - const unsigned Pairs = VL0.size(); - - for (unsigned i = 0; i < Pairs; ++i) { - const Value *V0 = VL0[i]; - const Value *V1 = VL1[i]; - const auto *Inst0 = dyn_cast(V0); - const auto *Inst1 = dyn_cast(V1); - - if (!Inst0 || !Inst1) - return false; - - if (Inst0->isSameOperationAs(Inst1)) - continue; - - const APInt *C0, *C1; - if (!(match(V0, m_APInt(C0)) && match(V1, m_APInt(C1)) && C0 == C1)) - return false; - } - - return true; - }; - - return CompareValueList(LHS, Other->LHS) && - CompareValueList(RHS, Other->RHS); -} - Pass *llvm::createARMParallelDSPPass() { return new ARMParallelDSP(); } @@ -842,6 +813,6 @@ Pass *llvm::createARMParallelDSPPass() { char ARMParallelDSP::ID = 0; INITIALIZE_PASS_BEGIN(ARMParallelDSP, "arm-parallel-dsp", - "Transform loops to use DSP intrinsics", false, false) + "Transform functions to use DSP intrinsics", false, false) INITIALIZE_PASS_END(ARMParallelDSP, "arm-parallel-dsp", - "Transform loops to use DSP intrinsics", false, false) + "Transform functions to use DSP intrinsics", false, false) diff --git a/lib/Target/ARM/ARMPredicates.td b/lib/Target/ARM/ARMPredicates.td index 0b6b40de80dd..b008d3e2e296 100644 --- a/lib/Target/ARM/ARMPredicates.td +++ b/lib/Target/ARM/ARMPredicates.td @@ -71,7 +71,7 @@ def HasV8_5a : Predicate<"Subtarget->hasV8_5aOps()">, AssemblerPredicate<"HasV8_5aOps", "armv8.5a">; def NoVFP : Predicate<"!Subtarget->hasVFP2Base()">; def HasVFP2 : Predicate<"Subtarget->hasVFP2Base()">, - AssemblerPredicate<"FeatureVFP2_D16_SP", "VFP2">; + AssemblerPredicate<"FeatureVFP2_SP", "VFP2">; def HasVFP3 : Predicate<"Subtarget->hasVFP3Base()">, AssemblerPredicate<"FeatureVFP3_D16_SP", "VFP3">; def HasVFP4 : Predicate<"Subtarget->hasVFP4Base()">, diff --git a/lib/Target/ARM/ARMRegisterInfo.td b/lib/Target/ARM/ARMRegisterInfo.td index 92ae26b3729d..56055a15483a 100644 --- a/lib/Target/ARM/ARMRegisterInfo.td +++ b/lib/Target/ARM/ARMRegisterInfo.td @@ -180,7 +180,7 @@ def Q15 : ARMReg<15, "q15", [D30, D31]>; // models the APSR when it's accessed by some special instructions. In such cases // it has the same encoding as PC. def CPSR : ARMReg<0, "cpsr">; -def APSR : ARMReg<1, "apsr">; +def APSR : ARMReg<15, "apsr">; def APSR_NZCV : ARMReg<15, "apsr_nzcv">; def SPSR : ARMReg<2, "spsr">; def FPSCR : ARMReg<3, "fpscr">; @@ -486,12 +486,20 @@ def DPair : RegisterClass<"ARM", [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], // Pseudo-registers representing even-odd pairs of GPRs from R1 to R13/SP. // These are needed by instructions (e.g. ldrexd/strexd) requiring even-odd GPRs. -def Tuples2R : RegisterTuples<[gsub_0, gsub_1], - [(add R0, R2, R4, R6, R8, R10, R12), - (add R1, R3, R5, R7, R9, R11, SP)]>; +def Tuples2Rnosp : RegisterTuples<[gsub_0, gsub_1], + [(add R0, R2, R4, R6, R8, R10), + (add R1, R3, R5, R7, R9, R11)]>; + +def Tuples2Rsp : RegisterTuples<[gsub_0, gsub_1], + [(add R12), (add SP)]>; // Register class representing a pair of even-odd GPRs. 
-def GPRPair : RegisterClass<"ARM", [untyped], 64, (add Tuples2R)> { +def GPRPair : RegisterClass<"ARM", [untyped], 64, (add Tuples2Rnosp, Tuples2Rsp)> { + let Size = 64; // 2 x 32 bits, we have no predefined type of that size. +} + +// Register class representing a pair of even-odd GPRs, except (R12, SP). +def GPRPairnosp : RegisterClass<"ARM", [untyped], 64, (add Tuples2Rnosp)> { let Size = 64; // 2 x 32 bits, we have no predefined type of that size. } diff --git a/lib/Target/ARM/ARMScheduleA9.td b/lib/Target/ARM/ARMScheduleA9.td index 21d32bde4710..3f0b71afd977 100644 --- a/lib/Target/ARM/ARMScheduleA9.td +++ b/lib/Target/ARM/ARMScheduleA9.td @@ -2239,9 +2239,9 @@ def A9WriteLMfpPostRA : SchedWriteVariant<[ // Distinguish between our multiple MI-level forms of the same // VLDM/VSTM instructions. def A9PreRA : SchedPredicate< - "TargetRegisterInfo::isVirtualRegister(MI->getOperand(0).getReg())">; + "Register::isVirtualRegister(MI->getOperand(0).getReg())">; def A9PostRA : SchedPredicate< - "TargetRegisterInfo::isPhysicalRegister(MI->getOperand(0).getReg())">; + "Register::isPhysicalRegister(MI->getOperand(0).getReg())">; // VLDM represents all destination registers as a single register // tuple, unlike LDM. So the number of write operands is not variadic. diff --git a/lib/Target/ARM/ARMScheduleM4.td b/lib/Target/ARM/ARMScheduleM4.td index 38c8ea2b4f35..bfa5fc0d7131 100644 --- a/lib/Target/ARM/ARMScheduleM4.td +++ b/lib/Target/ARM/ARMScheduleM4.td @@ -18,6 +18,9 @@ def CortexM4Model : SchedMachineModel { let PostRAScheduler = 1; let CompleteModel = 0; + let UnsupportedFeatures = [IsARM, HasNEON, HasDotProd, HasZCZ, HasMVEInt, + IsNotMClass, HasDPVFP, HasFPARMv8, HasFullFP16, Has8MSecExt, HasV8, + HasV8_3a, HasTrustZone, HasDFB, IsWindows]; } @@ -50,6 +53,7 @@ def : M4UnitL2; def : M4UnitL2; def : M4UnitL2I<(instregex "(t|t2)LDM")>; +def : M4UnitL2I<(instregex "(t|t2)LDR")>; // Stores we use a latency of 1 as they have no outputs @@ -78,9 +82,20 @@ def : M4UnitL1; def : M4UnitL1; def : M4UnitL1I<(instregex "(t|t2)MOV")>; def : M4UnitL1I<(instrs COPY)>; -def : M4UnitL1I<(instregex "t2IT")>; -def : M4UnitL1I<(instregex "t2SEL", "t2USAD8", - "t2(S|Q|SH|U|UQ|UH)(ADD16|ASX|SAX|SUB16|ADD8|SUB8)", "t2USADA8", "(t|t2)REV")>; +def : M4UnitL1I<(instregex "t2IT", "t2MSR", "t2MRS")>; +def : M4UnitL1I<(instregex "t2CLREX")>; +def : M4UnitL1I<(instregex "t2SEL", "t2USAD8", "t2SML[AS]", + "t2(S|Q|SH|U|UQ|UH|QD)(ADD|ASX|SAX|SUB)", "t2USADA8", "(t|t2)REV")>; + +// These instructions are not of much interest to scheduling as they will not +// be generated or it is not very useful to schedule them. They are here to make +// the model more complete. 
+def : M4UnitL1I<(instregex "t2CDP", "t2LDC", "t2MCR", "t2MRC", "t2MRRC", "t2STC")>; +def : M4UnitL1I<(instregex "tCPS", "t2ISB", "t2DSB", "t2DMB", "t2?HINT$")>; +def : M4UnitL1I<(instregex "t2?UDF$", "tBKPT", "t2DBG")>; +def : M4UnitL1I<(instregex "t?2?Int_eh_sjlj_", "tADDframe", "t?ADJCALL")>; +def : M4UnitL1I<(instregex "CMP_SWAP", "JUMPTABLE", "MEMCPY")>; +def : M4UnitL1I<(instregex "VSETLNi32", "VGETLNi32")>; def : ReadAdvance; def : ReadAdvance; @@ -112,6 +127,9 @@ def : M4UnitL1; def : M4UnitL1; def : M4UnitL1; def : M4UnitL1; +def : M4UnitL1I<(instregex "VMOVS", "FCONSTS", "VCMP", "VNEG", "VABS")>; +def : M4UnitL2I<(instregex "VMOVD")>; +def : M4UnitL1I<(instregex "VMRS", "VMSR", "FMSTAT")>; def : ReadAdvance; def : ReadAdvance; diff --git a/lib/Target/ARM/ARMSubtarget.cpp b/lib/Target/ARM/ARMSubtarget.cpp index 978faed776b0..09603057b2c8 100644 --- a/lib/Target/ARM/ARMSubtarget.cpp +++ b/lib/Target/ARM/ARMSubtarget.cpp @@ -125,7 +125,7 @@ const CallLowering *ARMSubtarget::getCallLowering() const { return CallLoweringInfo.get(); } -const InstructionSelector *ARMSubtarget::getInstructionSelector() const { +InstructionSelector *ARMSubtarget::getInstructionSelector() const { return InstSelector.get(); } @@ -205,9 +205,9 @@ void ARMSubtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) { NoARM = true; if (isAAPCS_ABI()) - stackAlignment = 8; + stackAlignment = Align(8); if (isTargetNaCl() || isAAPCS16_ABI()) - stackAlignment = 16; + stackAlignment = Align(16); // FIXME: Completely disable sibcall for Thumb1 since ThumbRegisterInfo:: // emitEpilogue is not ready for them. Thumb tail calls also use t2B, as @@ -253,6 +253,10 @@ void ARMSubtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) { if (isRWPI()) ReserveR9 = true; + // If MVEVectorCostFactor is still 0 (has not been set to anything else), default it to 2 + if (MVEVectorCostFactor == 0) + MVEVectorCostFactor = 2; + // FIXME: Teach TableGen to deal with these instead of doing it manually here. switch (ARMProcFamily) { case Others: @@ -296,13 +300,15 @@ void ARMSubtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) { LdStMultipleTiming = SingleIssuePlusExtras; MaxInterleaveFactor = 4; if (!isThumb()) - PrefLoopAlignment = 3; + PrefLoopLogAlignment = 3; break; case Kryo: break; case Krait: PreISelOperandLatencyAdjustment = 1; break; + case NeoverseN1: + break; case Swift: MaxInterleaveFactor = 2; LdStMultipleTiming = SingleIssuePlusExtras; diff --git a/lib/Target/ARM/ARMSubtarget.h b/lib/Target/ARM/ARMSubtarget.h index c2b0f052b843..ef460342a69e 100644 --- a/lib/Target/ARM/ARMSubtarget.h +++ b/lib/Target/ARM/ARMSubtarget.h @@ -71,6 +71,7 @@ protected: Exynos, Krait, Kryo, + NeoverseN1, Swift }; enum ARMProcClassEnum { @@ -179,11 +180,9 @@ protected: bool HasVFPv3SP = false; bool HasVFPv4SP = false; bool HasFPARMv8SP = false; - bool HasVFPv2D16 = false; bool HasVFPv3D16 = false; bool HasVFPv4D16 = false; bool HasFPARMv8D16 = false; - bool HasVFPv2D16SP = false; bool HasVFPv3D16SP = false; bool HasVFPv4D16SP = false; bool HasFPARMv8D16SP = false; @@ -450,7 +449,7 @@ protected: /// stackAlignment - The minimum alignment known to hold of the stack frame on /// entry to the function and which must be maintained by every function. - unsigned stackAlignment = 4; + Align stackAlignment = Align(4); /// CPUString - String name of used CPU. std::string CPUString; @@ -469,7 +468,12 @@ protected: int PreISelOperandLatencyAdjustment = 2; /// What alignment is preferred for loop bodies, in log2(bytes). 
- unsigned PrefLoopAlignment = 0; + unsigned PrefLoopLogAlignment = 0; + + /// The cost factor for MVE instructions, representing the multiple beats an + // instruction can take. The default is 2, (set in initSubtargetFeatures so + // that we can use subtarget features less than 2). + unsigned MVEVectorCostFactor = 0; /// OptMinSize - True if we're optimising for minimum code size, equal to /// the function attribute. @@ -535,7 +539,7 @@ public: } const CallLowering *getCallLowering() const override; - const InstructionSelector *getInstructionSelector() const override; + InstructionSelector *getInstructionSelector() const override; const LegalizerInfo *getLegalizerInfo() const override; const RegisterBankInfo *getRegBankInfo() const override; @@ -600,7 +604,7 @@ public: bool hasARMOps() const { return !NoARM; } - bool hasVFP2Base() const { return HasVFPv2D16SP; } + bool hasVFP2Base() const { return HasVFPv2SP; } bool hasVFP3Base() const { return HasVFPv3D16SP; } bool hasVFP4Base() const { return HasVFPv4D16SP; } bool hasFPARMv8Base() const { return HasFPARMv8D16SP; } @@ -668,6 +672,12 @@ public: bool hasSB() const { return HasSB; } bool genLongCalls() const { return GenLongCalls; } bool genExecuteOnly() const { return GenExecuteOnly; } + bool hasBaseDSP() const { + if (isThumb()) + return hasDSP(); + else + return hasV5TEOps(); + } bool hasFP16() const { return HasFP16; } bool hasD32() const { return HasD32; } @@ -812,7 +822,7 @@ public: /// getStackAlignment - Returns the minimum alignment known to hold of the /// stack frame on entry to the function and which must be maintained by every /// function for this subtarget. - unsigned getStackAlignment() const { return stackAlignment; } + Align getStackAlignment() const { return stackAlignment; } unsigned getMaxInterleaveFactor() const { return MaxInterleaveFactor; } @@ -853,9 +863,9 @@ public: return isROPI() || !isTargetELF(); } - unsigned getPrefLoopAlignment() const { - return PrefLoopAlignment; - } + unsigned getPrefLoopLogAlignment() const { return PrefLoopLogAlignment; } + + unsigned getMVEVectorCostFactor() const { return MVEVectorCostFactor; } bool ignoreCSRForAllocationOrder(const MachineFunction &MF, unsigned PhysReg) const override; diff --git a/lib/Target/ARM/ARMTargetMachine.cpp b/lib/Target/ARM/ARMTargetMachine.cpp index 7f0aae1739b3..5c8007f101d9 100644 --- a/lib/Target/ARM/ARMTargetMachine.cpp +++ b/lib/Target/ARM/ARMTargetMachine.cpp @@ -96,15 +96,16 @@ extern "C" void LLVMInitializeARMTarget() { initializeARMExpandPseudoPass(Registry); initializeThumb2SizeReducePass(Registry); initializeMVEVPTBlockPass(Registry); + initializeMVETailPredicationPass(Registry); initializeARMLowOverheadLoopsPass(Registry); } static std::unique_ptr createTLOF(const Triple &TT) { if (TT.isOSBinFormatMachO()) - return llvm::make_unique(); + return std::make_unique(); if (TT.isOSWindows()) - return llvm::make_unique(); - return llvm::make_unique(); + return std::make_unique(); + return std::make_unique(); } static ARMBaseTargetMachine::ARMABI @@ -282,7 +283,7 @@ ARMBaseTargetMachine::getSubtargetImpl(const Function &F) const { // creation will depend on the TM and the code generation flags on the // function that reside in TargetOptions. 
resetTargetOptions(F); - I = llvm::make_unique(TargetTriple, CPU, FS, *this, isLittle, + I = std::make_unique(TargetTriple, CPU, FS, *this, isLittle, F.hasMinSize()); if (!I->isThumb() && !I->hasARMOps()) @@ -447,8 +448,10 @@ bool ARMPassConfig::addPreISel() { MergeExternalByDefault)); } - if (TM->getOptLevel() != CodeGenOpt::None) + if (TM->getOptLevel() != CodeGenOpt::None) { addPass(createHardwareLoopsPass()); + addPass(createMVETailPredicationPass()); + } return false; } diff --git a/lib/Target/ARM/ARMTargetTransformInfo.cpp b/lib/Target/ARM/ARMTargetTransformInfo.cpp index 2a8ec734a05f..86c8684d14dc 100644 --- a/lib/Target/ARM/ARMTargetTransformInfo.cpp +++ b/lib/Target/ARM/ARMTargetTransformInfo.cpp @@ -36,8 +36,12 @@ using namespace llvm; #define DEBUG_TYPE "armtti" +static cl::opt EnableMaskedLoadStores( + "enable-arm-maskedldst", cl::Hidden, cl::init(false), + cl::desc("Enable the generation of masked loads and stores")); + static cl::opt DisableLowOverheadLoops( - "disable-arm-loloops", cl::Hidden, cl::init(true), + "disable-arm-loloops", cl::Hidden, cl::init(false), cl::desc("Disable the generation of low-overhead loops")); bool ARMTTIImpl::areInlineCompatible(const Function *Caller, @@ -167,6 +171,42 @@ int ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, if (!SrcTy.isSimple() || !DstTy.isSimple()) return BaseT::getCastInstrCost(Opcode, Dst, Src); + // The extend of a load is free + if (I && isa(I->getOperand(0))) { + static const TypeConversionCostTblEntry LoadConversionTbl[] = { + {ISD::SIGN_EXTEND, MVT::i32, MVT::i16, 0}, + {ISD::ZERO_EXTEND, MVT::i32, MVT::i16, 0}, + {ISD::SIGN_EXTEND, MVT::i32, MVT::i8, 0}, + {ISD::ZERO_EXTEND, MVT::i32, MVT::i8, 0}, + {ISD::SIGN_EXTEND, MVT::i16, MVT::i8, 0}, + {ISD::ZERO_EXTEND, MVT::i16, MVT::i8, 0}, + {ISD::SIGN_EXTEND, MVT::i64, MVT::i32, 1}, + {ISD::ZERO_EXTEND, MVT::i64, MVT::i32, 1}, + {ISD::SIGN_EXTEND, MVT::i64, MVT::i16, 1}, + {ISD::ZERO_EXTEND, MVT::i64, MVT::i16, 1}, + {ISD::SIGN_EXTEND, MVT::i64, MVT::i8, 1}, + {ISD::ZERO_EXTEND, MVT::i64, MVT::i8, 1}, + }; + if (const auto *Entry = ConvertCostTableLookup( + LoadConversionTbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT())) + return Entry->Cost; + + static const TypeConversionCostTblEntry MVELoadConversionTbl[] = { + {ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 0}, + {ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 0}, + {ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 0}, + {ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 0}, + {ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 0}, + {ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 0}, + }; + if (SrcTy.isVector() && ST->hasMVEIntegerOps()) { + if (const auto *Entry = + ConvertCostTableLookup(MVELoadConversionTbl, ISD, + DstTy.getSimpleVT(), SrcTy.getSimpleVT())) + return Entry->Cost; + } + } + // Some arithmetic, load and store operations have specific instructions // to cast up/down their types automatically at no extra cost. // TODO: Get these tables to know at least what the related operations are. @@ -313,6 +353,31 @@ int ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, return Entry->Cost; } + // MVE extend costs, taken from codegen tests. i8->i16 or i16->i32 is one + // instruction, i8->i32 is two. i64 zexts are an VAND with a constant, sext + // are linearised so take more. 
+ static const TypeConversionCostTblEntry MVEVectorConversionTbl[] = { + { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 1 }, + { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 1 }, + { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 2 }, + { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 2 }, + { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i8, 10 }, + { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i8, 2 }, + { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 1 }, + { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1 }, + { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i16, 10 }, + { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i16, 2 }, + { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i32, 8 }, + { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i32, 2 }, + }; + + if (SrcTy.isVector() && ST->hasMVEIntegerOps()) { + if (const auto *Entry = ConvertCostTableLookup(MVEVectorConversionTbl, + ISD, DstTy.getSimpleVT(), + SrcTy.getSimpleVT())) + return Entry->Cost * ST->getMVEVectorCostFactor(); + } + // Scalar integer conversion costs. static const TypeConversionCostTblEntry ARMIntegerConversionTbl[] = { // i16 -> i64 requires two dependent operations. @@ -332,7 +397,10 @@ int ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, return Entry->Cost; } - return BaseT::getCastInstrCost(Opcode, Dst, Src); + int BaseCost = ST->hasMVEIntegerOps() && Src->isVectorTy() + ? ST->getMVEVectorCostFactor() + : 1; + return BaseCost * BaseT::getCastInstrCost(Opcode, Dst, Src); } int ARMTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy, @@ -343,8 +411,8 @@ int ARMTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy, ValTy->isVectorTy() && ValTy->getScalarSizeInBits() <= 32) return 3; - if ((Opcode == Instruction::InsertElement || - Opcode == Instruction::ExtractElement)) { + if (ST->hasNEON() && (Opcode == Instruction::InsertElement || + Opcode == Instruction::ExtractElement)) { // Cross-class copies are expensive on many microarchitectures, // so assume they are expensive by default. if (ValTy->getVectorElementType()->isIntegerTy()) @@ -357,6 +425,17 @@ int ARMTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy, return std::max(BaseT::getVectorInstrCost(Opcode, ValTy, Index), 2U); } + if (ST->hasMVEIntegerOps() && (Opcode == Instruction::InsertElement || + Opcode == Instruction::ExtractElement)) { + // We say MVE moves costs at least the MVEVectorCostFactor, even though + // they are scalar instructions. This helps prevent mixing scalar and + // vector, to prevent vectorising where we end up just scalarising the + // result anyway. + return std::max(BaseT::getVectorInstrCost(Opcode, ValTy, Index), + ST->getMVEVectorCostFactor()) * + ValTy->getVectorNumElements() / 2; + } + return BaseT::getVectorInstrCost(Opcode, ValTy, Index); } @@ -385,7 +464,10 @@ int ARMTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, return LT.first; } - return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, I); + int BaseCost = ST->hasMVEIntegerOps() && ValTy->isVectorTy() + ? 
ST->getMVEVectorCostFactor() + : 1; + return BaseCost * BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, I); } int ARMTTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE, @@ -397,13 +479,37 @@ int ARMTTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE, unsigned NumVectorInstToHideOverhead = 10; int MaxMergeDistance = 64; - if (Ty->isVectorTy() && SE && - !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1)) - return NumVectorInstToHideOverhead; + if (ST->hasNEON()) { + if (Ty->isVectorTy() && SE && + !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1)) + return NumVectorInstToHideOverhead; - // In many cases the address computation is not merged into the instruction - // addressing mode. - return 1; + // In many cases the address computation is not merged into the instruction + // addressing mode. + return 1; + } + return BaseT::getAddressComputationCost(Ty, SE, Ptr); +} + +bool ARMTTIImpl::isLegalMaskedLoad(Type *DataTy, MaybeAlign Alignment) { + if (!EnableMaskedLoadStores || !ST->hasMVEIntegerOps()) + return false; + + if (auto *VecTy = dyn_cast(DataTy)) { + // Don't support v2i1 yet. + if (VecTy->getNumElements() == 2) + return false; + + // We don't support extending fp types. + unsigned VecWidth = DataTy->getPrimitiveSizeInBits(); + if (VecWidth != 128 && VecTy->getElementType()->isFloatingPointTy()) + return false; + } + + unsigned EltWidth = DataTy->getScalarSizeInBits(); + return (EltWidth == 32 && (!Alignment || Alignment >= 4)) || + (EltWidth == 16 && (!Alignment || Alignment >= 2)) || + (EltWidth == 8); } int ARMTTIImpl::getMemcpyCost(const Instruction *I) { @@ -442,78 +548,96 @@ int ARMTTIImpl::getMemcpyCost(const Instruction *I) { int ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp) { - if (Kind == TTI::SK_Broadcast) { - static const CostTblEntry NEONDupTbl[] = { - // VDUP handles these cases. - {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1}, - {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1}, - {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1}, - {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1}, - {ISD::VECTOR_SHUFFLE, MVT::v4i16, 1}, - {ISD::VECTOR_SHUFFLE, MVT::v8i8, 1}, - - {ISD::VECTOR_SHUFFLE, MVT::v4i32, 1}, - {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1}, - {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1}, - {ISD::VECTOR_SHUFFLE, MVT::v16i8, 1}}; - - std::pair LT = TLI->getTypeLegalizationCost(DL, Tp); - - if (const auto *Entry = CostTableLookup(NEONDupTbl, ISD::VECTOR_SHUFFLE, - LT.second)) - return LT.first * Entry->Cost; - - return BaseT::getShuffleCost(Kind, Tp, Index, SubTp); - } - if (Kind == TTI::SK_Reverse) { - static const CostTblEntry NEONShuffleTbl[] = { - // Reverse shuffle cost one instruction if we are shuffling within a - // double word (vrev) or two if we shuffle a quad word (vrev, vext). 
- {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1}, - {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1}, - {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1}, - {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1}, - {ISD::VECTOR_SHUFFLE, MVT::v4i16, 1}, - {ISD::VECTOR_SHUFFLE, MVT::v8i8, 1}, - - {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2}, - {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2}, - {ISD::VECTOR_SHUFFLE, MVT::v8i16, 2}, - {ISD::VECTOR_SHUFFLE, MVT::v16i8, 2}}; - - std::pair LT = TLI->getTypeLegalizationCost(DL, Tp); - - if (const auto *Entry = CostTableLookup(NEONShuffleTbl, ISD::VECTOR_SHUFFLE, - LT.second)) - return LT.first * Entry->Cost; - - return BaseT::getShuffleCost(Kind, Tp, Index, SubTp); - } - if (Kind == TTI::SK_Select) { - static const CostTblEntry NEONSelShuffleTbl[] = { - // Select shuffle cost table for ARM. Cost is the number of instructions - // required to create the shuffled vector. + if (ST->hasNEON()) { + if (Kind == TTI::SK_Broadcast) { + static const CostTblEntry NEONDupTbl[] = { + // VDUP handles these cases. + {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1}, + {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1}, + {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1}, + {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1}, + {ISD::VECTOR_SHUFFLE, MVT::v4i16, 1}, + {ISD::VECTOR_SHUFFLE, MVT::v8i8, 1}, + + {ISD::VECTOR_SHUFFLE, MVT::v4i32, 1}, + {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1}, + {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1}, + {ISD::VECTOR_SHUFFLE, MVT::v16i8, 1}}; + + std::pair LT = TLI->getTypeLegalizationCost(DL, Tp); + + if (const auto *Entry = + CostTableLookup(NEONDupTbl, ISD::VECTOR_SHUFFLE, LT.second)) + return LT.first * Entry->Cost; + } + if (Kind == TTI::SK_Reverse) { + static const CostTblEntry NEONShuffleTbl[] = { + // Reverse shuffle cost one instruction if we are shuffling within a + // double word (vrev) or two if we shuffle a quad word (vrev, vext). + {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1}, + {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1}, + {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1}, + {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1}, + {ISD::VECTOR_SHUFFLE, MVT::v4i16, 1}, + {ISD::VECTOR_SHUFFLE, MVT::v8i8, 1}, + + {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2}, + {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2}, + {ISD::VECTOR_SHUFFLE, MVT::v8i16, 2}, + {ISD::VECTOR_SHUFFLE, MVT::v16i8, 2}}; + + std::pair LT = TLI->getTypeLegalizationCost(DL, Tp); + + if (const auto *Entry = + CostTableLookup(NEONShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second)) + return LT.first * Entry->Cost; + } + if (Kind == TTI::SK_Select) { + static const CostTblEntry NEONSelShuffleTbl[] = { + // Select shuffle cost table for ARM. Cost is the number of + // instructions + // required to create the shuffled vector. 
- {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1}, - {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1}, - {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1}, - {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1}, + {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1}, + {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1}, + {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1}, + {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1}, - {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2}, - {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2}, - {ISD::VECTOR_SHUFFLE, MVT::v4i16, 2}, + {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2}, + {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2}, + {ISD::VECTOR_SHUFFLE, MVT::v4i16, 2}, - {ISD::VECTOR_SHUFFLE, MVT::v8i16, 16}, + {ISD::VECTOR_SHUFFLE, MVT::v8i16, 16}, - {ISD::VECTOR_SHUFFLE, MVT::v16i8, 32}}; + {ISD::VECTOR_SHUFFLE, MVT::v16i8, 32}}; - std::pair LT = TLI->getTypeLegalizationCost(DL, Tp); - if (const auto *Entry = CostTableLookup(NEONSelShuffleTbl, - ISD::VECTOR_SHUFFLE, LT.second)) - return LT.first * Entry->Cost; - return BaseT::getShuffleCost(Kind, Tp, Index, SubTp); + std::pair LT = TLI->getTypeLegalizationCost(DL, Tp); + if (const auto *Entry = CostTableLookup(NEONSelShuffleTbl, + ISD::VECTOR_SHUFFLE, LT.second)) + return LT.first * Entry->Cost; + } + } + if (ST->hasMVEIntegerOps()) { + if (Kind == TTI::SK_Broadcast) { + static const CostTblEntry MVEDupTbl[] = { + // VDUP handles these cases. + {ISD::VECTOR_SHUFFLE, MVT::v4i32, 1}, + {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1}, + {ISD::VECTOR_SHUFFLE, MVT::v16i8, 1}, + {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1}, + {ISD::VECTOR_SHUFFLE, MVT::v8f16, 1}}; + + std::pair LT = TLI->getTypeLegalizationCost(DL, Tp); + + if (const auto *Entry = CostTableLookup(MVEDupTbl, ISD::VECTOR_SHUFFLE, + LT.second)) + return LT.first * Entry->Cost * ST->getMVEVectorCostFactor(); + } } - return BaseT::getShuffleCost(Kind, Tp, Index, SubTp); + int BaseCost = ST->hasMVEIntegerOps() && Tp->isVectorTy() + ? ST->getMVEVectorCostFactor() + : 1; + return BaseCost * BaseT::getShuffleCost(Kind, Tp, Index, SubTp); } int ARMTTIImpl::getArithmeticInstrCost( @@ -567,38 +691,64 @@ int ARMTTIImpl::getArithmeticInstrCost( // Multiplication. }; - if (ST->hasNEON()) + if (ST->hasNEON()) { if (const auto *Entry = CostTableLookup(CostTbl, ISDOpcode, LT.second)) return LT.first * Entry->Cost; - int Cost = BaseT::getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info, - Opd1PropInfo, Opd2PropInfo); - - // This is somewhat of a hack. The problem that we are facing is that SROA - // creates a sequence of shift, and, or instructions to construct values. - // These sequences are recognized by the ISel and have zero-cost. Not so for - // the vectorized code. Because we have support for v2i64 but not i64 those - // sequences look particularly beneficial to vectorize. - // To work around this we increase the cost of v2i64 operations to make them - // seem less beneficial. - if (LT.second == MVT::v2i64 && - Op2Info == TargetTransformInfo::OK_UniformConstantValue) - Cost += 4; - - return Cost; + int Cost = BaseT::getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info, + Opd1PropInfo, Opd2PropInfo); + + // This is somewhat of a hack. The problem that we are facing is that SROA + // creates a sequence of shift, and, or instructions to construct values. + // These sequences are recognized by the ISel and have zero-cost. Not so for + // the vectorized code. Because we have support for v2i64 but not i64 those + // sequences look particularly beneficial to vectorize. + // To work around this we increase the cost of v2i64 operations to make them + // seem less beneficial. 
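// [Editor's note: illustrative only, not part of this patch. The helper name
// is invented for the sketch. It shows the kind of shift/or (plus masking
// 'and') sequence SROA emits to rebuild a 64-bit value from 32-bit pieces;
// ISel matches such scalar sequences for free, but the equivalent v2i64
// vector code is not free, which is why the surrounding code adds 4 to the
// cost of v2i64 operations with a uniform constant operand.]
#include <cstdint>
static uint64_t rebuild_u64_sketch(uint32_t lo, uint32_t hi) {
  return (static_cast<uint64_t>(hi) << 32) | lo;
}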
+ if (LT.second == MVT::v2i64 && + Op2Info == TargetTransformInfo::OK_UniformConstantValue) + Cost += 4; + + return Cost; + } + + int BaseCost = ST->hasMVEIntegerOps() && Ty->isVectorTy() + ? ST->getMVEVectorCostFactor() + : 1; + + // The rest of this mostly follows what is done in BaseT::getArithmeticInstrCost, + // without treating floats as more expensive that scalars or increasing the + // costs for custom operations. The results is also multiplied by the + // MVEVectorCostFactor where appropriate. + if (TLI->isOperationLegalOrCustomOrPromote(ISDOpcode, LT.second)) + return LT.first * BaseCost; + + // Else this is expand, assume that we need to scalarize this op. + if (Ty->isVectorTy()) { + unsigned Num = Ty->getVectorNumElements(); + unsigned Cost = getArithmeticInstrCost(Opcode, Ty->getScalarType()); + // Return the cost of multiple scalar invocation plus the cost of + // inserting and extracting the values. + return BaseT::getScalarizationOverhead(Ty, Args) + Num * Cost; + } + + return BaseCost; } int ARMTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, unsigned AddressSpace, const Instruction *I) { std::pair LT = TLI->getTypeLegalizationCost(DL, Src); - if (Src->isVectorTy() && Alignment != 16 && + if (ST->hasNEON() && Src->isVectorTy() && Alignment != 16 && Src->getVectorElementType()->isDoubleTy()) { // Unaligned loads/stores are extremely inefficient. // We need 4 uops for vst.1/vld.1 vs 1uop for vldr/vstr. return LT.first * 4; } - return LT.first; + int BaseCost = ST->hasMVEIntegerOps() && Src->isVectorTy() + ? ST->getMVEVectorCostFactor() + : 1; + return BaseCost * LT.first; } int ARMTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, @@ -893,6 +1043,11 @@ void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE, } return; } + // Don't unroll vectorised loop. MVE does not benefit from it as much as + // scalar code. + if (I.getType()->isVectorTy()) + return; + SmallVector Operands(I.value_op_begin(), I.value_op_end()); Cost += getUserCost(&I, Operands); @@ -914,3 +1069,28 @@ void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE, if (Cost < 12) UP.Force = true; } + +bool ARMTTIImpl::useReductionIntrinsic(unsigned Opcode, Type *Ty, + TTI::ReductionFlags Flags) const { + assert(isa(Ty) && "Expected Ty to be a vector type"); + unsigned ScalarBits = Ty->getScalarSizeInBits(); + if (!ST->hasMVEIntegerOps()) + return false; + + switch (Opcode) { + case Instruction::FAdd: + case Instruction::FMul: + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: + case Instruction::Mul: + case Instruction::FCmp: + return false; + case Instruction::ICmp: + case Instruction::Add: + return ScalarBits < 64 && ScalarBits * Ty->getVectorNumElements() == 128; + default: + llvm_unreachable("Unhandled reduction opcode"); + } + return false; +} diff --git a/lib/Target/ARM/ARMTargetTransformInfo.h b/lib/Target/ARM/ARMTargetTransformInfo.h index 52f6ea4a6e2f..a878fdcfe3c7 100644 --- a/lib/Target/ARM/ARMTargetTransformInfo.h +++ b/lib/Target/ARM/ARMTargetTransformInfo.h @@ -101,9 +101,9 @@ public: /// Floating-point computation using ARMv8 AArch32 Advanced /// SIMD instructions remains unchanged from ARMv7. Only AArch64 SIMD - /// is IEEE-754 compliant, but it's not covered in this target. + /// and Arm MVE are IEEE-754 compliant. 
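The new useReductionIntrinsic hook in this hunk reports only integer add and icmp reductions as suitable for MVE, and only when the scalar elements fill a full 128-bit vector. A rough illustration of the condition it checks (plain arithmetic, not LLVM API):

  // ScalarBits < 64 && ScalarBits * NumElements == 128
  bool fitsMVEReduction(unsigned ScalarBits, unsigned NumElements) {
    return ScalarBits < 64 && ScalarBits * NumElements == 128;
  }
  // fitsMVEReduction(32, 4)  -> true  (v4i32)
  // fitsMVEReduction(8, 16)  -> true  (v16i8)
  // fitsMVEReduction(64, 2)  -> false (v2i64 keeps the default lowering)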
bool isFPVectorizationPotentiallyUnsafe() { - return !ST->isTargetDarwin(); + return !ST->isTargetDarwin() && !ST->hasMVEFloatOps(); } /// \name Scalar TTI Implementations @@ -122,10 +122,13 @@ public: /// \name Vector TTI Implementations /// @{ - unsigned getNumberOfRegisters(bool Vector) { + unsigned getNumberOfRegisters(unsigned ClassID) const { + bool Vector = (ClassID == 1); if (Vector) { if (ST->hasNEON()) return 16; + if (ST->hasMVEIntegerOps()) + return 8; return 0; } @@ -138,6 +141,8 @@ public: if (Vector) { if (ST->hasNEON()) return 128; + if (ST->hasMVEIntegerOps()) + return 128; return 0; } @@ -148,10 +153,23 @@ public: return ST->getMaxInterleaveFactor(); } + bool isLegalMaskedLoad(Type *DataTy, MaybeAlign Alignment); + + bool isLegalMaskedStore(Type *DataTy, MaybeAlign Alignment) { + return isLegalMaskedLoad(DataTy, Alignment); + } + int getMemcpyCost(const Instruction *I); int getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp); + bool useReductionIntrinsic(unsigned Opcode, Type *Ty, + TTI::ReductionFlags Flags) const; + + bool shouldExpandReduction(const IntrinsicInst *II) const { + return false; + } + int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, const Instruction *I = nullptr); diff --git a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp index 1da9452f1d22..d2c355c1da75 100644 --- a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp +++ b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp @@ -2275,6 +2275,14 @@ public: return Value >= 1 && Value <= 32; } + bool isMveSaturateOp() const { + if (!isImm()) return false; + const MCConstantExpr *CE = dyn_cast(getImm()); + if (!CE) return false; + uint64_t Value = CE->getValue(); + return Value == 48 || Value == 64; + } + bool isITCondCodeNoAL() const { if (!isITCondCode()) return false; ARMCC::CondCodes CC = getCondCode(); @@ -2479,28 +2487,28 @@ public: void addModImmNotOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); - const MCConstantExpr *CE = dyn_cast(getImm()); + const MCConstantExpr *CE = cast(getImm()); uint32_t Enc = ARM_AM::getSOImmVal(~CE->getValue()); Inst.addOperand(MCOperand::createImm(Enc)); } void addModImmNegOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); - const MCConstantExpr *CE = dyn_cast(getImm()); + const MCConstantExpr *CE = cast(getImm()); uint32_t Enc = ARM_AM::getSOImmVal(-CE->getValue()); Inst.addOperand(MCOperand::createImm(Enc)); } void addThumbModImmNeg8_255Operands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); - const MCConstantExpr *CE = dyn_cast(getImm()); + const MCConstantExpr *CE = cast(getImm()); uint32_t Val = -CE->getValue(); Inst.addOperand(MCOperand::createImm(Val)); } void addThumbModImmNeg1_7Operands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); - const MCConstantExpr *CE = dyn_cast(getImm()); + const MCConstantExpr *CE = cast(getImm()); uint32_t Val = -CE->getValue(); Inst.addOperand(MCOperand::createImm(Val)); } @@ -2523,19 +2531,19 @@ public: void addFBits16Operands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); - const MCConstantExpr *CE = dyn_cast(getImm()); + const MCConstantExpr *CE = cast(getImm()); Inst.addOperand(MCOperand::createImm(16 - CE->getValue())); } void addFBits32Operands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); - const MCConstantExpr *CE = dyn_cast(getImm()); + const 
MCConstantExpr *CE = cast(getImm()); Inst.addOperand(MCOperand::createImm(32 - CE->getValue())); } void addFPImmOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); - const MCConstantExpr *CE = dyn_cast(getImm()); + const MCConstantExpr *CE = cast(getImm()); int Val = ARM_AM::getFP32Imm(APInt(32, CE->getValue())); Inst.addOperand(MCOperand::createImm(Val)); } @@ -2544,7 +2552,7 @@ public: assert(N == 1 && "Invalid number of operands!"); // FIXME: We really want to scale the value here, but the LDRD/STRD // instruction don't encode operands that way yet. - const MCConstantExpr *CE = dyn_cast(getImm()); + const MCConstantExpr *CE = cast(getImm()); Inst.addOperand(MCOperand::createImm(CE->getValue())); } @@ -2552,35 +2560,31 @@ public: assert(N == 1 && "Invalid number of operands!"); // FIXME: We really want to scale the value here, but the VSTR/VLDR_VSYSR // instruction don't encode operands that way yet. - const MCConstantExpr *CE = dyn_cast(getImm()); + const MCConstantExpr *CE = cast(getImm()); Inst.addOperand(MCOperand::createImm(CE->getValue())); } void addImm7Shift0Operands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); - const MCConstantExpr *CE = dyn_cast(getImm()); - assert(CE != nullptr && "Invalid operand type!"); + const MCConstantExpr *CE = cast(getImm()); Inst.addOperand(MCOperand::createImm(CE->getValue())); } void addImm7Shift1Operands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); - const MCConstantExpr *CE = dyn_cast(getImm()); - assert(CE != nullptr && "Invalid operand type!"); + const MCConstantExpr *CE = cast(getImm()); Inst.addOperand(MCOperand::createImm(CE->getValue())); } void addImm7Shift2Operands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); - const MCConstantExpr *CE = dyn_cast(getImm()); - assert(CE != nullptr && "Invalid operand type!"); + const MCConstantExpr *CE = cast(getImm()); Inst.addOperand(MCOperand::createImm(CE->getValue())); } void addImm7Operands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); - const MCConstantExpr *CE = dyn_cast(getImm()); - assert(CE != nullptr && "Invalid operand type!"); + const MCConstantExpr *CE = cast(getImm()); Inst.addOperand(MCOperand::createImm(CE->getValue())); } @@ -2588,7 +2592,7 @@ public: assert(N == 1 && "Invalid number of operands!"); // The immediate is scaled by four in the encoding and is stored // in the MCInst as such. Lop off the low two bits here. - const MCConstantExpr *CE = dyn_cast(getImm()); + const MCConstantExpr *CE = cast(getImm()); Inst.addOperand(MCOperand::createImm(CE->getValue() / 4)); } @@ -2596,7 +2600,7 @@ public: assert(N == 1 && "Invalid number of operands!"); // The immediate is scaled by four in the encoding and is stored // in the MCInst as such. Lop off the low two bits here. - const MCConstantExpr *CE = dyn_cast(getImm()); + const MCConstantExpr *CE = cast(getImm()); Inst.addOperand(MCOperand::createImm(-(CE->getValue() / 4))); } @@ -2604,7 +2608,7 @@ public: assert(N == 1 && "Invalid number of operands!"); // The immediate is scaled by four in the encoding and is stored // in the MCInst as such. Lop off the low two bits here. 
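The change repeated throughout this hunk replaces dyn_cast with cast for operands whose kind the matching is*() predicate has already validated, so a null check adds nothing. A small standalone sketch of the difference between the two LLVM casting helpers:

  #include "llvm/MC/MCExpr.h"
  #include "llvm/Support/Casting.h"
  using namespace llvm;

  int64_t getConstantOrZero(const MCExpr *E) {
    // dyn_cast returns nullptr when E is not an MCConstantExpr, so it must be checked.
    if (const auto *CE = dyn_cast<MCConstantExpr>(E))
      return CE->getValue();
    return 0;
  }

  int64_t getConstant(const MCExpr *E) {
    // cast asserts (in asserts builds) that the conversion is valid and never
    // returns null, which is what the add*Operands methods rely on here.
    return cast<MCConstantExpr>(E)->getValue();
  }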
- const MCConstantExpr *CE = dyn_cast(getImm()); + const MCConstantExpr *CE = cast(getImm()); Inst.addOperand(MCOperand::createImm(CE->getValue() / 4)); } @@ -2612,7 +2616,7 @@ public: assert(N == 1 && "Invalid number of operands!"); // The constant encodes as the immediate-1, and we store in the instruction // the bits as encoded, so subtract off one here. - const MCConstantExpr *CE = dyn_cast(getImm()); + const MCConstantExpr *CE = cast(getImm()); Inst.addOperand(MCOperand::createImm(CE->getValue() - 1)); } @@ -2620,7 +2624,7 @@ public: assert(N == 1 && "Invalid number of operands!"); // The constant encodes as the immediate-1, and we store in the instruction // the bits as encoded, so subtract off one here. - const MCConstantExpr *CE = dyn_cast(getImm()); + const MCConstantExpr *CE = cast(getImm()); Inst.addOperand(MCOperand::createImm(CE->getValue() - 1)); } @@ -2628,7 +2632,7 @@ public: assert(N == 1 && "Invalid number of operands!"); // The constant encodes as the immediate, except for 32, which encodes as // zero. - const MCConstantExpr *CE = dyn_cast(getImm()); + const MCConstantExpr *CE = cast(getImm()); unsigned Imm = CE->getValue(); Inst.addOperand(MCOperand::createImm((Imm == 32 ? 0 : Imm))); } @@ -2637,7 +2641,7 @@ public: assert(N == 1 && "Invalid number of operands!"); // An ASR value of 32 encodes as 0, so that's how we want to add it to // the instruction as well. - const MCConstantExpr *CE = dyn_cast(getImm()); + const MCConstantExpr *CE = cast(getImm()); int Val = CE->getValue(); Inst.addOperand(MCOperand::createImm(Val == 32 ? 0 : Val)); } @@ -2646,7 +2650,7 @@ public: assert(N == 1 && "Invalid number of operands!"); // The operand is actually a t2_so_imm, but we have its bitwise // negation in the assembly source, so twiddle it here. - const MCConstantExpr *CE = dyn_cast(getImm()); + const MCConstantExpr *CE = cast(getImm()); Inst.addOperand(MCOperand::createImm(~(uint32_t)CE->getValue())); } @@ -2654,7 +2658,7 @@ public: assert(N == 1 && "Invalid number of operands!"); // The operand is actually a t2_so_imm, but we have its // negation in the assembly source, so twiddle it here. - const MCConstantExpr *CE = dyn_cast(getImm()); + const MCConstantExpr *CE = cast(getImm()); Inst.addOperand(MCOperand::createImm(-(uint32_t)CE->getValue())); } @@ -2662,7 +2666,7 @@ public: assert(N == 1 && "Invalid number of operands!"); // The operand is actually an imm0_4095, but we have its // negation in the assembly source, so twiddle it here. 
- const MCConstantExpr *CE = dyn_cast(getImm()); + const MCConstantExpr *CE = cast(getImm()); Inst.addOperand(MCOperand::createImm(-(uint32_t)CE->getValue())); } @@ -2671,9 +2675,7 @@ public: Inst.addOperand(MCOperand::createImm(CE->getValue() >> 2)); return; } - - const MCSymbolRefExpr *SR = dyn_cast(Imm.Val); - assert(SR && "Unknown value type!"); + const MCSymbolRefExpr *SR = cast(Imm.Val); Inst.addOperand(MCOperand::createExpr(SR)); } @@ -2685,10 +2687,7 @@ public: Inst.addOperand(MCOperand::createImm(CE->getValue())); return; } - - const MCSymbolRefExpr *SR = dyn_cast(Imm.Val); - - assert(SR && "Unknown value type!"); + const MCSymbolRefExpr *SR = cast(Imm.Val); Inst.addOperand(MCOperand::createExpr(SR)); return; } @@ -2750,7 +2749,7 @@ public: return; } - const MCConstantExpr *CE = dyn_cast(getImm()); + const MCConstantExpr *CE = cast(getImm()); int Val = CE->getValue(); Inst.addOperand(MCOperand::createImm(Val)); } @@ -3130,7 +3129,7 @@ public: void addPowerTwoOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); - const MCConstantExpr *CE = dyn_cast(getImm()); + const MCConstantExpr *CE = cast(getImm()); Inst.addOperand(MCOperand::createImm(CE->getValue())); } @@ -3225,14 +3224,14 @@ public: assert(N == 1 && "Invalid number of operands!"); // The immediate encodes the type of constant as well as the value. // Mask in that this is an i8 splat. - const MCConstantExpr *CE = dyn_cast(getImm()); + const MCConstantExpr *CE = cast(getImm()); Inst.addOperand(MCOperand::createImm(CE->getValue() | 0xe00)); } void addNEONi16splatOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); // The immediate encodes the type of constant as well as the value. - const MCConstantExpr *CE = dyn_cast(getImm()); + const MCConstantExpr *CE = cast(getImm()); unsigned Value = CE->getValue(); Value = ARM_AM::encodeNEONi16splat(Value); Inst.addOperand(MCOperand::createImm(Value)); @@ -3241,7 +3240,7 @@ public: void addNEONi16splatNotOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); // The immediate encodes the type of constant as well as the value. - const MCConstantExpr *CE = dyn_cast(getImm()); + const MCConstantExpr *CE = cast(getImm()); unsigned Value = CE->getValue(); Value = ARM_AM::encodeNEONi16splat(~Value & 0xffff); Inst.addOperand(MCOperand::createImm(Value)); @@ -3250,7 +3249,7 @@ public: void addNEONi32splatOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); // The immediate encodes the type of constant as well as the value. - const MCConstantExpr *CE = dyn_cast(getImm()); + const MCConstantExpr *CE = cast(getImm()); unsigned Value = CE->getValue(); Value = ARM_AM::encodeNEONi32splat(Value); Inst.addOperand(MCOperand::createImm(Value)); @@ -3259,7 +3258,7 @@ public: void addNEONi32splatNotOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); // The immediate encodes the type of constant as well as the value. - const MCConstantExpr *CE = dyn_cast(getImm()); + const MCConstantExpr *CE = cast(getImm()); unsigned Value = CE->getValue(); Value = ARM_AM::encodeNEONi32splat(~Value); Inst.addOperand(MCOperand::createImm(Value)); @@ -3267,7 +3266,7 @@ public: void addNEONi8ReplicateOperands(MCInst &Inst, bool Inv) const { // The immediate encodes the type of constant as well as the value. 
- const MCConstantExpr *CE = dyn_cast(getImm()); + const MCConstantExpr *CE = cast(getImm()); assert((Inst.getOpcode() == ARM::VMOVv8i8 || Inst.getOpcode() == ARM::VMOVv16i8) && "All instructions that wants to replicate non-zero byte " @@ -3298,7 +3297,7 @@ public: void addNEONi32vmovOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); // The immediate encodes the type of constant as well as the value. - const MCConstantExpr *CE = dyn_cast(getImm()); + const MCConstantExpr *CE = cast(getImm()); unsigned Value = encodeNeonVMOVImmediate(CE->getValue()); Inst.addOperand(MCOperand::createImm(Value)); } @@ -3310,7 +3309,7 @@ public: void addNEONvmovi16ReplicateOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); - const MCConstantExpr *CE = dyn_cast(getImm()); + const MCConstantExpr *CE = cast(getImm()); assert((Inst.getOpcode() == ARM::VMOVv4i16 || Inst.getOpcode() == ARM::VMOVv8i16 || Inst.getOpcode() == ARM::VMVNv4i16 || @@ -3327,14 +3326,14 @@ public: void addNEONi32vmovNegOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); // The immediate encodes the type of constant as well as the value. - const MCConstantExpr *CE = dyn_cast(getImm()); + const MCConstantExpr *CE = cast(getImm()); unsigned Value = encodeNeonVMOVImmediate(~CE->getValue()); Inst.addOperand(MCOperand::createImm(Value)); } void addNEONvmovi32ReplicateOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); - const MCConstantExpr *CE = dyn_cast(getImm()); + const MCConstantExpr *CE = cast(getImm()); assert((Inst.getOpcode() == ARM::VMOVv2i32 || Inst.getOpcode() == ARM::VMOVv4i32 || Inst.getOpcode() == ARM::VMVNv2i32 || @@ -3349,7 +3348,7 @@ public: void addNEONi64splatOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); // The immediate encodes the type of constant as well as the value. - const MCConstantExpr *CE = dyn_cast(getImm()); + const MCConstantExpr *CE = cast(getImm()); uint64_t Value = CE->getValue(); unsigned Imm = 0; for (unsigned i = 0; i < 8; ++i, Value >>= 8) { @@ -3360,20 +3359,28 @@ public: void addComplexRotationEvenOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); - const MCConstantExpr *CE = dyn_cast(getImm()); + const MCConstantExpr *CE = cast(getImm()); Inst.addOperand(MCOperand::createImm(CE->getValue() / 90)); } void addComplexRotationOddOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); - const MCConstantExpr *CE = dyn_cast(getImm()); + const MCConstantExpr *CE = cast(getImm()); Inst.addOperand(MCOperand::createImm((CE->getValue() - 90) / 180)); } + void addMveSaturateOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *CE = cast(getImm()); + unsigned Imm = CE->getValue(); + assert((Imm == 48 || Imm == 64) && "Invalid saturate operand"); + Inst.addOperand(MCOperand::createImm(Imm == 48 ? 
1 : 0)); + } + void print(raw_ostream &OS) const override; static std::unique_ptr CreateITMask(unsigned Mask, SMLoc S) { - auto Op = make_unique(k_ITCondMask); + auto Op = std::make_unique(k_ITCondMask); Op->ITMask.Mask = Mask; Op->StartLoc = S; Op->EndLoc = S; @@ -3382,7 +3389,7 @@ public: static std::unique_ptr CreateCondCode(ARMCC::CondCodes CC, SMLoc S) { - auto Op = make_unique(k_CondCode); + auto Op = std::make_unique(k_CondCode); Op->CC.Val = CC; Op->StartLoc = S; Op->EndLoc = S; @@ -3391,7 +3398,7 @@ public: static std::unique_ptr CreateVPTPred(ARMVCC::VPTCodes CC, SMLoc S) { - auto Op = make_unique(k_VPTPred); + auto Op = std::make_unique(k_VPTPred); Op->VCC.Val = CC; Op->StartLoc = S; Op->EndLoc = S; @@ -3399,7 +3406,7 @@ public: } static std::unique_ptr CreateCoprocNum(unsigned CopVal, SMLoc S) { - auto Op = make_unique(k_CoprocNum); + auto Op = std::make_unique(k_CoprocNum); Op->Cop.Val = CopVal; Op->StartLoc = S; Op->EndLoc = S; @@ -3407,7 +3414,7 @@ public: } static std::unique_ptr CreateCoprocReg(unsigned CopVal, SMLoc S) { - auto Op = make_unique(k_CoprocReg); + auto Op = std::make_unique(k_CoprocReg); Op->Cop.Val = CopVal; Op->StartLoc = S; Op->EndLoc = S; @@ -3416,7 +3423,7 @@ public: static std::unique_ptr CreateCoprocOption(unsigned Val, SMLoc S, SMLoc E) { - auto Op = make_unique(k_CoprocOption); + auto Op = std::make_unique(k_CoprocOption); Op->Cop.Val = Val; Op->StartLoc = S; Op->EndLoc = E; @@ -3424,7 +3431,7 @@ public: } static std::unique_ptr CreateCCOut(unsigned RegNum, SMLoc S) { - auto Op = make_unique(k_CCOut); + auto Op = std::make_unique(k_CCOut); Op->Reg.RegNum = RegNum; Op->StartLoc = S; Op->EndLoc = S; @@ -3432,7 +3439,7 @@ public: } static std::unique_ptr CreateToken(StringRef Str, SMLoc S) { - auto Op = make_unique(k_Token); + auto Op = std::make_unique(k_Token); Op->Tok.Data = Str.data(); Op->Tok.Length = Str.size(); Op->StartLoc = S; @@ -3442,7 +3449,7 @@ public: static std::unique_ptr CreateReg(unsigned RegNum, SMLoc S, SMLoc E) { - auto Op = make_unique(k_Register); + auto Op = std::make_unique(k_Register); Op->Reg.RegNum = RegNum; Op->StartLoc = S; Op->EndLoc = E; @@ -3453,7 +3460,7 @@ public: CreateShiftedRegister(ARM_AM::ShiftOpc ShTy, unsigned SrcReg, unsigned ShiftReg, unsigned ShiftImm, SMLoc S, SMLoc E) { - auto Op = make_unique(k_ShiftedRegister); + auto Op = std::make_unique(k_ShiftedRegister); Op->RegShiftedReg.ShiftTy = ShTy; Op->RegShiftedReg.SrcReg = SrcReg; Op->RegShiftedReg.ShiftReg = ShiftReg; @@ -3466,7 +3473,7 @@ public: static std::unique_ptr CreateShiftedImmediate(ARM_AM::ShiftOpc ShTy, unsigned SrcReg, unsigned ShiftImm, SMLoc S, SMLoc E) { - auto Op = make_unique(k_ShiftedImmediate); + auto Op = std::make_unique(k_ShiftedImmediate); Op->RegShiftedImm.ShiftTy = ShTy; Op->RegShiftedImm.SrcReg = SrcReg; Op->RegShiftedImm.ShiftImm = ShiftImm; @@ -3477,7 +3484,7 @@ public: static std::unique_ptr CreateShifterImm(bool isASR, unsigned Imm, SMLoc S, SMLoc E) { - auto Op = make_unique(k_ShifterImmediate); + auto Op = std::make_unique(k_ShifterImmediate); Op->ShifterImm.isASR = isASR; Op->ShifterImm.Imm = Imm; Op->StartLoc = S; @@ -3487,7 +3494,7 @@ public: static std::unique_ptr CreateRotImm(unsigned Imm, SMLoc S, SMLoc E) { - auto Op = make_unique(k_RotateImmediate); + auto Op = std::make_unique(k_RotateImmediate); Op->RotImm.Imm = Imm; Op->StartLoc = S; Op->EndLoc = E; @@ -3496,7 +3503,7 @@ public: static std::unique_ptr CreateModImm(unsigned Bits, unsigned Rot, SMLoc S, SMLoc E) { - auto Op = make_unique(k_ModifiedImmediate); + auto 
Op = std::make_unique(k_ModifiedImmediate); Op->ModImm.Bits = Bits; Op->ModImm.Rot = Rot; Op->StartLoc = S; @@ -3506,7 +3513,7 @@ public: static std::unique_ptr CreateConstantPoolImm(const MCExpr *Val, SMLoc S, SMLoc E) { - auto Op = make_unique(k_ConstantPoolImmediate); + auto Op = std::make_unique(k_ConstantPoolImmediate); Op->Imm.Val = Val; Op->StartLoc = S; Op->EndLoc = E; @@ -3515,7 +3522,7 @@ public: static std::unique_ptr CreateBitfield(unsigned LSB, unsigned Width, SMLoc S, SMLoc E) { - auto Op = make_unique(k_BitfieldDescriptor); + auto Op = std::make_unique(k_BitfieldDescriptor); Op->Bitfield.LSB = LSB; Op->Bitfield.Width = Width; Op->StartLoc = S; @@ -3543,16 +3550,15 @@ public: Kind = k_SPRRegisterList; } - // Sort based on the register encoding values. - array_pod_sort(Regs.begin(), Regs.end()); - if (Kind == k_RegisterList && Regs.back().second == ARM::APSR) Kind = k_RegisterListWithAPSR; - auto Op = make_unique(Kind); - for (SmallVectorImpl>::const_iterator - I = Regs.begin(), E = Regs.end(); I != E; ++I) - Op->Registers.push_back(I->second); + assert(std::is_sorted(Regs.begin(), Regs.end()) && + "Register list must be sorted by encoding"); + + auto Op = std::make_unique(Kind); + for (const auto &P : Regs) + Op->Registers.push_back(P.second); Op->StartLoc = StartLoc; Op->EndLoc = EndLoc; @@ -3563,7 +3569,7 @@ public: unsigned Count, bool isDoubleSpaced, SMLoc S, SMLoc E) { - auto Op = make_unique(k_VectorList); + auto Op = std::make_unique(k_VectorList); Op->VectorList.RegNum = RegNum; Op->VectorList.Count = Count; Op->VectorList.isDoubleSpaced = isDoubleSpaced; @@ -3575,7 +3581,7 @@ public: static std::unique_ptr CreateVectorListAllLanes(unsigned RegNum, unsigned Count, bool isDoubleSpaced, SMLoc S, SMLoc E) { - auto Op = make_unique(k_VectorListAllLanes); + auto Op = std::make_unique(k_VectorListAllLanes); Op->VectorList.RegNum = RegNum; Op->VectorList.Count = Count; Op->VectorList.isDoubleSpaced = isDoubleSpaced; @@ -3587,7 +3593,7 @@ public: static std::unique_ptr CreateVectorListIndexed(unsigned RegNum, unsigned Count, unsigned Index, bool isDoubleSpaced, SMLoc S, SMLoc E) { - auto Op = make_unique(k_VectorListIndexed); + auto Op = std::make_unique(k_VectorListIndexed); Op->VectorList.RegNum = RegNum; Op->VectorList.Count = Count; Op->VectorList.LaneIndex = Index; @@ -3599,7 +3605,7 @@ public: static std::unique_ptr CreateVectorIndex(unsigned Idx, SMLoc S, SMLoc E, MCContext &Ctx) { - auto Op = make_unique(k_VectorIndex); + auto Op = std::make_unique(k_VectorIndex); Op->VectorIndex.Val = Idx; Op->StartLoc = S; Op->EndLoc = E; @@ -3608,7 +3614,7 @@ public: static std::unique_ptr CreateImm(const MCExpr *Val, SMLoc S, SMLoc E) { - auto Op = make_unique(k_Immediate); + auto Op = std::make_unique(k_Immediate); Op->Imm.Val = Val; Op->StartLoc = S; Op->EndLoc = E; @@ -3620,7 +3626,7 @@ public: unsigned OffsetRegNum, ARM_AM::ShiftOpc ShiftType, unsigned ShiftImm, unsigned Alignment, bool isNegative, SMLoc S, SMLoc E, SMLoc AlignmentLoc = SMLoc()) { - auto Op = make_unique(k_Memory); + auto Op = std::make_unique(k_Memory); Op->Memory.BaseRegNum = BaseRegNum; Op->Memory.OffsetImm = OffsetImm; Op->Memory.OffsetRegNum = OffsetRegNum; @@ -3637,7 +3643,7 @@ public: static std::unique_ptr CreatePostIdxReg(unsigned RegNum, bool isAdd, ARM_AM::ShiftOpc ShiftTy, unsigned ShiftImm, SMLoc S, SMLoc E) { - auto Op = make_unique(k_PostIndexRegister); + auto Op = std::make_unique(k_PostIndexRegister); Op->PostIdxReg.RegNum = RegNum; Op->PostIdxReg.isAdd = isAdd; Op->PostIdxReg.ShiftTy = 
ShiftTy; @@ -3649,7 +3655,7 @@ public: static std::unique_ptr CreateMemBarrierOpt(ARM_MB::MemBOpt Opt, SMLoc S) { - auto Op = make_unique(k_MemBarrierOpt); + auto Op = std::make_unique(k_MemBarrierOpt); Op->MBOpt.Val = Opt; Op->StartLoc = S; Op->EndLoc = S; @@ -3658,7 +3664,7 @@ public: static std::unique_ptr CreateInstSyncBarrierOpt(ARM_ISB::InstSyncBOpt Opt, SMLoc S) { - auto Op = make_unique(k_InstSyncBarrierOpt); + auto Op = std::make_unique(k_InstSyncBarrierOpt); Op->ISBOpt.Val = Opt; Op->StartLoc = S; Op->EndLoc = S; @@ -3667,7 +3673,7 @@ public: static std::unique_ptr CreateTraceSyncBarrierOpt(ARM_TSB::TraceSyncBOpt Opt, SMLoc S) { - auto Op = make_unique(k_TraceSyncBarrierOpt); + auto Op = std::make_unique(k_TraceSyncBarrierOpt); Op->TSBOpt.Val = Opt; Op->StartLoc = S; Op->EndLoc = S; @@ -3676,7 +3682,7 @@ public: static std::unique_ptr CreateProcIFlags(ARM_PROC::IFlags IFlags, SMLoc S) { - auto Op = make_unique(k_ProcIFlags); + auto Op = std::make_unique(k_ProcIFlags); Op->IFlags.Val = IFlags; Op->StartLoc = S; Op->EndLoc = S; @@ -3684,7 +3690,7 @@ public: } static std::unique_ptr CreateMSRMask(unsigned MMask, SMLoc S) { - auto Op = make_unique(k_MSRMask); + auto Op = std::make_unique(k_MSRMask); Op->MMask.Val = MMask; Op->StartLoc = S; Op->EndLoc = S; @@ -3692,7 +3698,7 @@ public: } static std::unique_ptr CreateBankedReg(unsigned Reg, SMLoc S) { - auto Op = make_unique(k_BankedReg); + auto Op = std::make_unique(k_BankedReg); Op->BankedReg.Val = Reg; Op->StartLoc = S; Op->EndLoc = S; @@ -4253,6 +4259,24 @@ static unsigned getNextRegister(unsigned Reg) { } } +// Insert an pair in an ordered vector. Return true on +// success, or false, if duplicate encoding found. +static bool +insertNoDuplicates(SmallVectorImpl> &Regs, + unsigned Enc, unsigned Reg) { + Regs.emplace_back(Enc, Reg); + for (auto I = Regs.rbegin(), J = I + 1, E = Regs.rend(); J != E; ++I, ++J) { + if (J->first == Enc) { + Regs.erase(J.base()); + return false; + } + if (J->first < Enc) + break; + std::swap(*I, *J); + } + return true; +} + /// Parse a register list. bool ARMAsmParser::parseRegisterList(OperandVector &Operands, bool EnforceOrder) { @@ -4278,7 +4302,7 @@ bool ARMAsmParser::parseRegisterList(OperandVector &Operands, if (ARMMCRegisterClasses[ARM::QPRRegClassID].contains(Reg)) { Reg = getDRegFromQReg(Reg); EReg = MRI->getEncodingValue(Reg); - Registers.push_back(std::pair(EReg, Reg)); + Registers.emplace_back(EReg, Reg); ++Reg; } const MCRegisterClass *RC; @@ -4295,7 +4319,7 @@ bool ARMAsmParser::parseRegisterList(OperandVector &Operands, // Store the register. EReg = MRI->getEncodingValue(Reg); - Registers.push_back(std::pair(EReg, Reg)); + Registers.emplace_back(EReg, Reg); // This starts immediately after the first register token in the list, // so we can see either a comma or a minus (range separator) as a legal @@ -4326,7 +4350,11 @@ bool ARMAsmParser::parseRegisterList(OperandVector &Operands, while (Reg != EndReg) { Reg = getNextRegister(Reg); EReg = MRI->getEncodingValue(Reg); - Registers.push_back(std::pair(EReg, Reg)); + if (!insertNoDuplicates(Registers, EReg, Reg)) { + Warning(AfterMinusLoc, StringRef("duplicated register (") + + ARMInstPrinter::getRegisterName(Reg) + + ") in register list"); + } } continue; } @@ -4350,11 +4378,16 @@ bool ARMAsmParser::parseRegisterList(OperandVector &Operands, // subset of GPRRegClassId except it contains APSR as well. 
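insertNoDuplicates above performs a single insertion-sort step: it appends the (encoding, register) pair, bubbles it down until the list is ordered by encoding, and backs the insertion out when the encoding is already present so the caller can emit its duplicate-register warning. For illustration (register choices arbitrary):

  SmallVector<std::pair<unsigned, unsigned>, 8> Regs;
  insertNoDuplicates(Regs, /*Enc=*/0, ARM::R0); // Regs = {(0,R0)}
  insertNoDuplicates(Regs, /*Enc=*/3, ARM::R3); // Regs = {(0,R0), (3,R3)}
  insertNoDuplicates(Regs, /*Enc=*/1, ARM::R1); // Regs = {(0,R0), (1,R1), (3,R3)}
  bool Ok = insertNoDuplicates(Regs, /*Enc=*/3, ARM::R3);
  // Ok == false and Regs is unchanged; parseRegisterList then warns about the
  // duplicated register.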
RC = &ARMMCRegisterClasses[ARM::GPRwithAPSRnospRegClassID]; } - if (Reg == ARM::VPR && (RC == &ARMMCRegisterClasses[ARM::SPRRegClassID] || - RC == &ARMMCRegisterClasses[ARM::DPRRegClassID])) { + if (Reg == ARM::VPR && + (RC == &ARMMCRegisterClasses[ARM::SPRRegClassID] || + RC == &ARMMCRegisterClasses[ARM::DPRRegClassID] || + RC == &ARMMCRegisterClasses[ARM::FPWithVPRRegClassID])) { RC = &ARMMCRegisterClasses[ARM::FPWithVPRRegClassID]; EReg = MRI->getEncodingValue(Reg); - Registers.push_back(std::pair(EReg, Reg)); + if (!insertNoDuplicates(Registers, EReg, Reg)) { + Warning(RegLoc, "duplicated register (" + RegTok.getString() + + ") in register list"); + } continue; } // The register must be in the same register class as the first. @@ -4371,21 +4404,19 @@ bool ARMAsmParser::parseRegisterList(OperandVector &Operands, else if (!ARMMCRegisterClasses[ARM::GPRwithAPSRnospRegClassID].contains(Reg)) return Error(RegLoc, "register list not in ascending order"); } - if (MRI->getEncodingValue(Reg) == MRI->getEncodingValue(OldReg)) { - Warning(RegLoc, "duplicated register (" + RegTok.getString() + - ") in register list"); - continue; - } // VFP register lists must also be contiguous. if (RC != &ARMMCRegisterClasses[ARM::GPRRegClassID] && RC != &ARMMCRegisterClasses[ARM::GPRwithAPSRnospRegClassID] && Reg != OldReg + 1) return Error(RegLoc, "non-contiguous register range"); EReg = MRI->getEncodingValue(Reg); - Registers.push_back(std::pair(EReg, Reg)); + if (!insertNoDuplicates(Registers, EReg, Reg)) { + Warning(RegLoc, "duplicated register (" + RegTok.getString() + + ") in register list"); + } if (isQReg) { EReg = MRI->getEncodingValue(++Reg); - Registers.push_back(std::pair(EReg, Reg)); + Registers.emplace_back(EReg, Reg); } } @@ -5702,14 +5733,16 @@ bool ARMAsmParser::parseMemory(OperandVector &Operands) { return false; } - // If we have a '#', it's an immediate offset, else assume it's a register - // offset. Be friendly and also accept a plain integer (without a leading - // hash) for gas compatibility. + // If we have a '#' or '$', it's an immediate offset, else assume it's a + // register offset. Be friendly and also accept a plain integer or expression + // (without a leading hash) for gas compatibility. if (Parser.getTok().is(AsmToken::Hash) || Parser.getTok().is(AsmToken::Dollar) || + Parser.getTok().is(AsmToken::LParen) || Parser.getTok().is(AsmToken::Integer)) { - if (Parser.getTok().isNot(AsmToken::Integer)) - Parser.Lex(); // Eat '#' or '$'. 
+ if (Parser.getTok().is(AsmToken::Hash) || + Parser.getTok().is(AsmToken::Dollar)) + Parser.Lex(); // Eat '#' or '$' E = Parser.getTok().getLoc(); bool isNegative = getParser().getTok().is(AsmToken::Minus); @@ -11308,7 +11341,7 @@ bool ARMAsmParser::parseDirectiveUnwindRaw(SMLoc L) { SmallVector Opcodes; auto parseOne = [&]() -> bool { - const MCExpr *OE; + const MCExpr *OE = nullptr; SMLoc OpcodeLoc = getLexer().getLoc(); if (check(getLexer().is(AsmToken::EndOfStatement) || Parser.parseExpression(OE), @@ -11694,14 +11727,14 @@ bool ARMAsmParser::parseDirectiveArchExtension(SMLoc L) { { ARM::AEK_CRYPTO, {Feature_HasV8Bit}, {ARM::FeatureCrypto, ARM::FeatureNEON, ARM::FeatureFPARMv8} }, { ARM::AEK_FP, {Feature_HasV8Bit}, - {ARM::FeatureVFP2_D16_SP, ARM::FeatureFPARMv8} }, + {ARM::FeatureVFP2_SP, ARM::FeatureFPARMv8} }, { (ARM::AEK_HWDIVTHUMB | ARM::AEK_HWDIVARM), {Feature_HasV7Bit, Feature_IsNotMClassBit}, {ARM::FeatureHWDivThumb, ARM::FeatureHWDivARM} }, { ARM::AEK_MP, {Feature_HasV7Bit, Feature_IsNotMClassBit}, {ARM::FeatureMP} }, { ARM::AEK_SIMD, {Feature_HasV8Bit}, - {ARM::FeatureNEON, ARM::FeatureVFP2_D16_SP, ARM::FeatureFPARMv8} }, + {ARM::FeatureNEON, ARM::FeatureVFP2_SP, ARM::FeatureFPARMv8} }, { ARM::AEK_SEC, {Feature_HasV6KBit}, {ARM::FeatureTrustZone} }, // FIXME: Only available in A-class, isel not predicated { ARM::AEK_VIRT, {Feature_HasV7Bit}, {ARM::FeatureVirtualization} }, @@ -11775,19 +11808,19 @@ unsigned ARMAsmParser::validateTargetOperandClass(MCParsedAsmOperand &AsmOp, // immediate in the syntax. switch (Kind) { default: break; - case MCK__35_0: + case MCK__HASH_0: if (Op.isImm()) if (const MCConstantExpr *CE = dyn_cast(Op.getImm())) if (CE->getValue() == 0) return Match_Success; break; - case MCK__35_8: + case MCK__HASH_8: if (Op.isImm()) if (const MCConstantExpr *CE = dyn_cast(Op.getImm())) if (CE->getValue() == 8) return Match_Success; break; - case MCK__35_16: + case MCK__HASH_16: if (Op.isImm()) if (const MCConstantExpr *CE = dyn_cast(Op.getImm())) if (CE->getValue() == 16) diff --git a/lib/Target/ARM/Disassembler/ARMDisassembler.cpp b/lib/Target/ARM/Disassembler/ARMDisassembler.cpp index 673691ebd93e..eabc26d05f47 100644 --- a/lib/Target/ARM/Disassembler/ARMDisassembler.cpp +++ b/lib/Target/ARM/Disassembler/ARMDisassembler.cpp @@ -314,7 +314,7 @@ static DecodeStatus DecodeVLD3DupInstruction(MCInst &Inst, unsigned Val, uint64_t Address, const void *Decoder); static DecodeStatus DecodeVLD4DupInstruction(MCInst &Inst, unsigned Val, uint64_t Address, const void *Decoder); -static DecodeStatus DecodeNEONModImmInstruction(MCInst &Inst,unsigned Val, +static DecodeStatus DecodeVMOVModImmInstruction(MCInst &Inst,unsigned Val, uint64_t Address, const void *Decoder); static DecodeStatus DecodeMVEModImmInstruction(MCInst &Inst,unsigned Val, uint64_t Address, const void *Decoder); @@ -561,6 +561,8 @@ static DecodeStatus DecodeMVEVCMP(MCInst &Inst, unsigned Insn, uint64_t Address, const void *Decoder); static DecodeStatus DecodeMveVCTP(MCInst &Inst, unsigned Insn, uint64_t Address, const void *Decoder); +static DecodeStatus DecodeMVEVPNOT(MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder); static DecodeStatus DecodeMVEOverlappingLongShift(MCInst &Inst, unsigned Insn, uint64_t Address, const void *Decoder); @@ -3445,7 +3447,7 @@ static DecodeStatus DecodeVLD4DupInstruction(MCInst &Inst, unsigned Insn, } static DecodeStatus -DecodeNEONModImmInstruction(MCInst &Inst, unsigned Insn, +DecodeVMOVModImmInstruction(MCInst &Inst, unsigned Insn, uint64_t Address, const 
void *Decoder) { DecodeStatus S = MCDisassembler::Success; @@ -5679,7 +5681,7 @@ static DecodeStatus DecodeVCVTD(MCInst &Inst, unsigned Insn, } } } - return DecodeNEONModImmInstruction(Inst, Insn, Address, Decoder); + return DecodeVMOVModImmInstruction(Inst, Insn, Address, Decoder); } if (!(imm & 0x20)) return MCDisassembler::Fail; @@ -5738,7 +5740,7 @@ static DecodeStatus DecodeVCVTQ(MCInst &Inst, unsigned Insn, } } } - return DecodeNEONModImmInstruction(Inst, Insn, Address, Decoder); + return DecodeVMOVModImmInstruction(Inst, Insn, Address, Decoder); } if (!(imm & 0x20)) return MCDisassembler::Fail; @@ -6481,6 +6483,12 @@ static DecodeStatus DecodeMVEOverlappingLongShift( if (!Check(S, DecoderGPRRegisterClass(Inst, Rm, Address, Decoder))) return MCDisassembler::Fail; + if (fieldFromInstruction (Insn, 6, 3) != 4) + return MCDisassembler::SoftFail; + + if (Rda == Rm) + return MCDisassembler::SoftFail; + return S; } @@ -6503,6 +6511,13 @@ static DecodeStatus DecodeMVEOverlappingLongShift( if (!Check(S, DecoderGPRRegisterClass(Inst, Rm, Address, Decoder))) return MCDisassembler::Fail; + if (Inst.getOpcode() == ARM::MVE_SQRSHRL || + Inst.getOpcode() == ARM::MVE_UQRSHLL) { + unsigned Saturate = fieldFromInstruction(Insn, 7, 1); + // Saturate, the bit position for saturation + Inst.addOperand(MCOperand::createImm(Saturate)); + } + return S; } @@ -6572,3 +6587,11 @@ static DecodeStatus DecodeMveVCTP(MCInst &Inst, unsigned Insn, uint64_t Address, return MCDisassembler::Fail; return S; } + +static DecodeStatus DecodeMVEVPNOT(MCInst &Inst, unsigned Insn, uint64_t Address, + const void *Decoder) { + DecodeStatus S = MCDisassembler::Success; + Inst.addOperand(MCOperand::createReg(ARM::VPR)); + Inst.addOperand(MCOperand::createReg(ARM::VPR)); + return S; +} diff --git a/lib/Target/ARM/MCTargetDesc/ARMAddressingModes.h b/lib/Target/ARM/MCTargetDesc/ARMAddressingModes.h index 7732a6485a85..24a9fabf0979 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMAddressingModes.h +++ b/lib/Target/ARM/MCTargetDesc/ARMAddressingModes.h @@ -518,10 +518,10 @@ namespace ARM_AM { // Valid alignments depend on the specific instruction. //===--------------------------------------------------------------------===// - // NEON Modified Immediates + // NEON/MVE Modified Immediates //===--------------------------------------------------------------------===// // - // Several NEON instructions (e.g., VMOV) take a "modified immediate" + // Several NEON and MVE instructions (e.g., VMOV) take a "modified immediate" // vector operand, where a small immediate encoded in the instruction // specifies a full NEON vector value. These modified immediates are // represented here as encoded integers. The low 8 bits hold the immediate @@ -529,20 +529,20 @@ namespace ARM_AM { // the "Cmode" field of the instruction. The interfaces below treat the // Op and Cmode values as a single 5-bit value. 
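The encoding described here packs the 8-bit immediate into the low byte and the combined Op/Cmode field into bits 8-12, as the renamed helpers below implement. A round-trip example with an arbitrary 8-bit splat value:

  unsigned Enc   = ARM_AM::createVMOVModImm(/*OpCmode=*/0xe, /*Val=*/0xab); // 0xeab
  unsigned Cmode = ARM_AM::getVMOVModImmOpCmode(Enc); // 0xe (8-bit element splat)
  unsigned Imm8  = ARM_AM::getVMOVModImmVal(Enc);     // 0xab
  unsigned EltBits;
  uint64_t Val   = ARM_AM::decodeVMOVModImm(Enc, EltBits); // Val == 0xab, EltBits == 8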
- inline unsigned createNEONModImm(unsigned OpCmode, unsigned Val) { + inline unsigned createVMOVModImm(unsigned OpCmode, unsigned Val) { return (OpCmode << 8) | Val; } - inline unsigned getNEONModImmOpCmode(unsigned ModImm) { + inline unsigned getVMOVModImmOpCmode(unsigned ModImm) { return (ModImm >> 8) & 0x1f; } - inline unsigned getNEONModImmVal(unsigned ModImm) { return ModImm & 0xff; } + inline unsigned getVMOVModImmVal(unsigned ModImm) { return ModImm & 0xff; } - /// decodeNEONModImm - Decode a NEON modified immediate value into the + /// decodeVMOVModImm - Decode a NEON/MVE modified immediate value into the /// element value and the element size in bits. (If the element size is /// smaller than the vector, it is splatted into all the elements.) - inline uint64_t decodeNEONModImm(unsigned ModImm, unsigned &EltBits) { - unsigned OpCmode = getNEONModImmOpCmode(ModImm); - unsigned Imm8 = getNEONModImmVal(ModImm); + inline uint64_t decodeVMOVModImm(unsigned ModImm, unsigned &EltBits) { + unsigned OpCmode = getVMOVModImmOpCmode(ModImm); + unsigned Imm8 = getVMOVModImmVal(ModImm); uint64_t Val = 0; if (OpCmode == 0xe) { @@ -572,7 +572,7 @@ namespace ARM_AM { } EltBits = 64; } else { - llvm_unreachable("Unsupported NEON immediate"); + llvm_unreachable("Unsupported VMOV immediate"); } return Val; } diff --git a/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp b/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp index aeab5be78ab4..6196881a9b8f 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp +++ b/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp @@ -233,7 +233,7 @@ static const char *checkPCRelOffset(uint64_t Value, int64_t Min, int64_t Max) { const char *ARMAsmBackend::reasonForFixupRelaxation(const MCFixup &Fixup, uint64_t Value) const { - switch ((unsigned)Fixup.getKind()) { + switch (Fixup.getTargetKind()) { case ARM::fixup_arm_thumb_br: { // Relaxing tB to t2B. tB has a signed 12-bit displacement with the // low bit being an implied zero. There's an implied +4 offset for the @@ -870,7 +870,7 @@ bool ARMAsmBackend::shouldForceRelocation(const MCAssembler &Asm, const MCValue &Target) { const MCSymbolRefExpr *A = Target.getSymA(); const MCSymbol *Sym = A ? &A->getSymbol() : nullptr; - const unsigned FixupKind = Fixup.getKind() ; + const unsigned FixupKind = Fixup.getKind(); if (FixupKind == FK_NONE) return true; if (FixupKind == ARM::fixup_arm_thumb_bl) { @@ -1105,28 +1105,28 @@ uint32_t ARMAsmBackendDarwin::generateCompactUnwindEncoding( if (Instrs.empty()) return 0; // Start off assuming CFA is at SP+0. - int CFARegister = ARM::SP; + unsigned CFARegister = ARM::SP; int CFARegisterOffset = 0; // Mark savable registers as initially unsaved DenseMap RegOffsets; int FloatRegCount = 0; // Process each .cfi directive and build up compact unwind info. 
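In the loop that follows, the calls to MRI.getLLVMRegNum now dereference the result directly, reflecting its change to return an Optional when no mapping exists. A more defensive sketch of the same lookup, assuming the Optional<unsigned>-returning signature:

  if (Optional<unsigned> LLVMReg =
          MRI.getLLVMRegNum(Inst.getRegister(), /*isEH=*/true))
    CFARegister = *LLVMReg;
  // Otherwise no LLVM register maps to this DWARF number; leave CFARegister as-is.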
for (size_t i = 0, e = Instrs.size(); i != e; ++i) { - int Reg; + unsigned Reg; const MCCFIInstruction &Inst = Instrs[i]; switch (Inst.getOperation()) { case MCCFIInstruction::OpDefCfa: // DW_CFA_def_cfa CFARegisterOffset = -Inst.getOffset(); - CFARegister = MRI.getLLVMRegNum(Inst.getRegister(), true); + CFARegister = *MRI.getLLVMRegNum(Inst.getRegister(), true); break; case MCCFIInstruction::OpDefCfaOffset: // DW_CFA_def_cfa_offset CFARegisterOffset = -Inst.getOffset(); break; case MCCFIInstruction::OpDefCfaRegister: // DW_CFA_def_cfa_register - CFARegister = MRI.getLLVMRegNum(Inst.getRegister(), true); + CFARegister = *MRI.getLLVMRegNum(Inst.getRegister(), true); break; case MCCFIInstruction::OpOffset: // DW_CFA_offset - Reg = MRI.getLLVMRegNum(Inst.getRegister(), true); + Reg = *MRI.getLLVMRegNum(Inst.getRegister(), true); if (ARMMCRegisterClasses[ARM::GPRRegClassID].contains(Reg)) RegOffsets[Reg] = Inst.getOffset(); else if (ARMMCRegisterClasses[ARM::DPRRegClassID].contains(Reg)) { diff --git a/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h b/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h index c4daafe8ee97..6293a2462306 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h +++ b/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h @@ -393,6 +393,9 @@ namespace ARMII { // in an IT block). ThumbArithFlagSetting = 1 << 19, + // Whether an instruction can be included in an MVE tail-predicated loop. + ValidForTailPredication = 1 << 20, + //===------------------------------------------------------------------===// // Code domain. DomainShift = 15, diff --git a/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp b/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp index fda19eea1de6..1fee38821a49 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp +++ b/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp @@ -82,7 +82,7 @@ unsigned ARMELFObjectWriter::GetRelocTypeInner(const MCValue &Target, MCSymbolRefExpr::VariantKind Modifier = Target.getAccessVariant(); if (IsPCRel) { - switch ((unsigned)Fixup.getKind()) { + switch (Fixup.getTargetKind()) { default: Ctx.reportFatalError(Fixup.getLoc(), "unsupported relocation on symbol"); return ELF::R_ARM_NONE; @@ -145,7 +145,7 @@ unsigned ARMELFObjectWriter::GetRelocTypeInner(const MCValue &Target, return ELF::R_ARM_THM_BF18; } } - switch ((unsigned)Fixup.getKind()) { + switch (Fixup.getTargetKind()) { default: Ctx.reportFatalError(Fixup.getLoc(), "unsupported relocation on symbol"); return ELF::R_ARM_NONE; @@ -263,5 +263,5 @@ void ARMELFObjectWriter::addTargetSectionFlags(MCContext &Ctx, std::unique_ptr llvm::createARMELFObjectWriter(uint8_t OSABI) { - return llvm::make_unique(OSABI); + return std::make_unique(OSABI); } diff --git a/lib/Target/ARM/MCTargetDesc/ARMInstPrinter.cpp b/lib/Target/ARM/MCTargetDesc/ARMInstPrinter.cpp index 45be1ee96342..a1def61b58d9 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMInstPrinter.cpp +++ b/lib/Target/ARM/MCTargetDesc/ARMInstPrinter.cpp @@ -1334,12 +1334,12 @@ void ARMInstPrinter::printFPImmOperand(const MCInst *MI, unsigned OpNum, << markup(">"); } -void ARMInstPrinter::printNEONModImmOperand(const MCInst *MI, unsigned OpNum, +void ARMInstPrinter::printVMOVModImmOperand(const MCInst *MI, unsigned OpNum, const MCSubtargetInfo &STI, raw_ostream &O) { unsigned EncodedImm = MI->getOperand(OpNum).getImm(); unsigned EltBits; - uint64_t Val = ARM_AM::decodeNEONModImm(EncodedImm, EltBits); + uint64_t Val = ARM_AM::decodeVMOVModImm(EncodedImm, EltBits); O << markup(""); @@ -1676,3 +1676,11 @@ void 
ARMInstPrinter::printExpandedImmOperand(const MCInst *MI, unsigned OpNum, O.write_hex(Val); O << markup(">"); } + +void ARMInstPrinter::printMveSaturateOp(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, + raw_ostream &O) { + uint32_t Val = MI->getOperand(OpNum).getImm(); + assert(Val <= 1 && "Invalid MVE saturate operand"); + O << "#" << (Val == 1 ? 48 : 64); +} diff --git a/lib/Target/ARM/MCTargetDesc/ARMInstPrinter.h b/lib/Target/ARM/MCTargetDesc/ARMInstPrinter.h index 69026956b60e..eeb811e216fc 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMInstPrinter.h +++ b/lib/Target/ARM/MCTargetDesc/ARMInstPrinter.h @@ -191,7 +191,7 @@ public: const MCSubtargetInfo &STI, raw_ostream &O); void printFPImmOperand(const MCInst *MI, unsigned OpNum, const MCSubtargetInfo &STI, raw_ostream &O); - void printNEONModImmOperand(const MCInst *MI, unsigned OpNum, + void printVMOVModImmOperand(const MCInst *MI, unsigned OpNum, const MCSubtargetInfo &STI, raw_ostream &O); void printImmPlusOneOperand(const MCInst *MI, unsigned OpNum, const MCSubtargetInfo &STI, raw_ostream &O); @@ -262,7 +262,8 @@ public: const MCSubtargetInfo &STI, raw_ostream &O); void printExpandedImmOperand(const MCInst *MI, unsigned OpNum, const MCSubtargetInfo &STI, raw_ostream &O); - + void printMveSaturateOp(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); private: unsigned DefaultAltIdx = ARM::NoRegAltName; }; diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp b/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp index dca6fe37d49a..268fe7efd9ce 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp +++ b/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp @@ -1720,7 +1720,6 @@ getRegisterListOpValue(const MCInst &MI, unsigned Op, unsigned Reg = MI.getOperand(Op).getReg(); bool SPRRegs = ARMMCRegisterClasses[ARM::SPRRegClassID].contains(Reg); bool DPRRegs = ARMMCRegisterClasses[ARM::DPRRegClassID].contains(Reg); - bool CLRMRegs = MI.getOpcode() == ARM::t2CLRM; unsigned Binary = 0; @@ -1739,21 +1738,13 @@ getRegisterListOpValue(const MCInst &MI, unsigned Op, Binary |= NumRegs * 2; } else { const MCRegisterInfo &MRI = *CTX.getRegisterInfo(); - if (!CLRMRegs) { - assert(std::is_sorted(MI.begin() + Op, MI.end(), - [&](const MCOperand &LHS, const MCOperand &RHS) { - return MRI.getEncodingValue(LHS.getReg()) < - MRI.getEncodingValue(RHS.getReg()); - })); - } - + assert(std::is_sorted(MI.begin() + Op, MI.end(), + [&](const MCOperand &LHS, const MCOperand &RHS) { + return MRI.getEncodingValue(LHS.getReg()) < + MRI.getEncodingValue(RHS.getReg()); + })); for (unsigned I = Op, E = MI.getNumOperands(); I < E; ++I) { - unsigned RegNo; - if (CLRMRegs && MI.getOperand(I).getReg() == ARM::APSR) { - RegNo = 15; - } else { - RegNo = MRI.getEncodingValue(MI.getOperand(I).getReg()); - } + unsigned RegNo = MRI.getEncodingValue(MI.getOperand(I).getReg()); Binary |= 1 << RegNo; } } diff --git a/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp b/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp index c49885023cb2..ed4000c7e5be 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp +++ b/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp @@ -204,7 +204,7 @@ RecordARMScatteredHalfRelocation(MachObjectWriter *Writer, // relocation entry in the low 16 bits of r_address field. 
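For the general-purpose register list path in getRegisterListOpValue above, the operands are now required to arrive pre-sorted by encoding (hence the assert), and the emitted field is simply a bitmask with one bit per register encoding. A hypothetical example of that bitmask form:

  // e.g. a list containing r0, r4, r5 and lr (encodings 0, 4, 5 and 14):
  unsigned Binary = 0;
  for (unsigned RegNo : {0u, 4u, 5u, 14u})
    Binary |= 1u << RegNo;
  // Binary == 0x4031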
unsigned ThumbBit = 0; unsigned MovtBit = 0; - switch ((unsigned)Fixup.getKind()) { + switch (Fixup.getTargetKind()) { default: break; case ARM::fixup_arm_movt_hi16: MovtBit = 1; @@ -480,7 +480,7 @@ void ARMMachObjectWriter::recordRelocation(MachObjectWriter *Writer, // PAIR. I.e. it's correct that we insert the high bits of the addend in the // MOVW case here. relocation entries. uint32_t Value = 0; - switch ((unsigned)Fixup.getKind()) { + switch (Fixup.getTargetKind()) { default: break; case ARM::fixup_arm_movw_lo16: case ARM::fixup_t2_movw_lo16: @@ -506,5 +506,5 @@ void ARMMachObjectWriter::recordRelocation(MachObjectWriter *Writer, std::unique_ptr llvm::createARMMachObjectWriter(bool Is64Bit, uint32_t CPUType, uint32_t CPUSubtype) { - return llvm::make_unique(Is64Bit, CPUType, CPUSubtype); + return std::make_unique(Is64Bit, CPUType, CPUSubtype); } diff --git a/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp b/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp index b863517c0cca..7b30a61e8ccb 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp +++ b/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp @@ -249,12 +249,12 @@ void ARMTargetStreamer::emitTargetAttributes(const MCSubtargetInfo &STI) { : ARM::FK_VFPV3_D16) : (STI.hasFeature(ARM::FeatureFP16) ? ARM::FK_VFPV3XD_FP16 : ARM::FK_VFPV3XD))); - else if (STI.hasFeature(ARM::FeatureVFP2_D16_SP)) + else if (STI.hasFeature(ARM::FeatureVFP2_SP)) emitFPU(ARM::FK_VFPV2); } // ABI_HardFP_use attribute to indicate single precision FP. - if (STI.hasFeature(ARM::FeatureVFP2_D16_SP) && !STI.hasFeature(ARM::FeatureFP64)) + if (STI.hasFeature(ARM::FeatureVFP2_SP) && !STI.hasFeature(ARM::FeatureFP64)) emitAttribute(ARMBuildAttrs::ABI_HardFP_use, ARMBuildAttrs::HardFPSinglePrecision); diff --git a/lib/Target/ARM/MCTargetDesc/ARMWinCOFFObjectWriter.cpp b/lib/Target/ARM/MCTargetDesc/ARMWinCOFFObjectWriter.cpp index 054a95dd1e12..900c5fe30364 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMWinCOFFObjectWriter.cpp +++ b/lib/Target/ARM/MCTargetDesc/ARMWinCOFFObjectWriter.cpp @@ -92,7 +92,7 @@ namespace llvm { std::unique_ptr createARMWinCOFFObjectWriter(bool Is64Bit) { - return llvm::make_unique(Is64Bit); + return std::make_unique(Is64Bit); } } // end namespace llvm diff --git a/lib/Target/ARM/MCTargetDesc/ARMWinCOFFStreamer.cpp b/lib/Target/ARM/MCTargetDesc/ARMWinCOFFStreamer.cpp index 2e816bea5e91..b3c8146a9bde 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMWinCOFFStreamer.cpp +++ b/lib/Target/ARM/MCTargetDesc/ARMWinCOFFStreamer.cpp @@ -22,20 +22,10 @@ public: std::unique_ptr OW) : MCWinCOFFStreamer(C, std::move(AB), std::move(CE), std::move(OW)) {} - void EmitAssemblerFlag(MCAssemblerFlag Flag) override; void EmitThumbFunc(MCSymbol *Symbol) override; void FinishImpl() override; }; -void ARMWinCOFFStreamer::EmitAssemblerFlag(MCAssemblerFlag Flag) { - switch (Flag) { - default: llvm_unreachable("not implemented"); - case MCAF_SyntaxUnified: - case MCAF_Code16: - break; - } -} - void ARMWinCOFFStreamer::EmitThumbFunc(MCSymbol *Symbol) { getAssembler().setIsThumbFunc(Symbol); } diff --git a/lib/Target/ARM/MLxExpansionPass.cpp b/lib/Target/ARM/MLxExpansionPass.cpp index 4b25986b90a7..cc31929899b4 100644 --- a/lib/Target/ARM/MLxExpansionPass.cpp +++ b/lib/Target/ARM/MLxExpansionPass.cpp @@ -86,8 +86,8 @@ void MLxExpansion::pushStack(MachineInstr *MI) { MachineInstr *MLxExpansion::getAccDefMI(MachineInstr *MI) const { // Look past COPY and INSERT_SUBREG instructions to find the // real definition MI. This is important for _sfp instructions. 
- unsigned Reg = MI->getOperand(1).getReg(); - if (TargetRegisterInfo::isPhysicalRegister(Reg)) + Register Reg = MI->getOperand(1).getReg(); + if (Register::isPhysicalRegister(Reg)) return nullptr; MachineBasicBlock *MBB = MI->getParent(); @@ -97,13 +97,13 @@ MachineInstr *MLxExpansion::getAccDefMI(MachineInstr *MI) const { break; if (DefMI->isCopyLike()) { Reg = DefMI->getOperand(1).getReg(); - if (TargetRegisterInfo::isVirtualRegister(Reg)) { + if (Register::isVirtualRegister(Reg)) { DefMI = MRI->getVRegDef(Reg); continue; } } else if (DefMI->isInsertSubreg()) { Reg = DefMI->getOperand(2).getReg(); - if (TargetRegisterInfo::isVirtualRegister(Reg)) { + if (Register::isVirtualRegister(Reg)) { DefMI = MRI->getVRegDef(Reg); continue; } @@ -114,9 +114,8 @@ MachineInstr *MLxExpansion::getAccDefMI(MachineInstr *MI) const { } unsigned MLxExpansion::getDefReg(MachineInstr *MI) const { - unsigned Reg = MI->getOperand(0).getReg(); - if (TargetRegisterInfo::isPhysicalRegister(Reg) || - !MRI->hasOneNonDBGUse(Reg)) + Register Reg = MI->getOperand(0).getReg(); + if (Register::isPhysicalRegister(Reg) || !MRI->hasOneNonDBGUse(Reg)) return Reg; MachineBasicBlock *MBB = MI->getParent(); @@ -126,8 +125,7 @@ unsigned MLxExpansion::getDefReg(MachineInstr *MI) const { while (UseMI->isCopy() || UseMI->isInsertSubreg()) { Reg = UseMI->getOperand(0).getReg(); - if (TargetRegisterInfo::isPhysicalRegister(Reg) || - !MRI->hasOneNonDBGUse(Reg)) + if (Register::isPhysicalRegister(Reg) || !MRI->hasOneNonDBGUse(Reg)) return Reg; UseMI = &*MRI->use_instr_nodbg_begin(Reg); if (UseMI->getParent() != MBB) @@ -140,8 +138,8 @@ unsigned MLxExpansion::getDefReg(MachineInstr *MI) const { /// hasLoopHazard - Check whether an MLx instruction is chained to itself across /// a single-MBB loop. 
bool MLxExpansion::hasLoopHazard(MachineInstr *MI) const { - unsigned Reg = MI->getOperand(1).getReg(); - if (TargetRegisterInfo::isPhysicalRegister(Reg)) + Register Reg = MI->getOperand(1).getReg(); + if (Register::isPhysicalRegister(Reg)) return false; MachineBasicBlock *MBB = MI->getParent(); @@ -154,8 +152,8 @@ outer_continue: if (DefMI->isPHI()) { for (unsigned i = 1, e = DefMI->getNumOperands(); i < e; i += 2) { if (DefMI->getOperand(i + 1).getMBB() == MBB) { - unsigned SrcReg = DefMI->getOperand(i).getReg(); - if (TargetRegisterInfo::isVirtualRegister(SrcReg)) { + Register SrcReg = DefMI->getOperand(i).getReg(); + if (Register::isVirtualRegister(SrcReg)) { DefMI = MRI->getVRegDef(SrcReg); goto outer_continue; } @@ -163,13 +161,13 @@ outer_continue: } } else if (DefMI->isCopyLike()) { Reg = DefMI->getOperand(1).getReg(); - if (TargetRegisterInfo::isVirtualRegister(Reg)) { + if (Register::isVirtualRegister(Reg)) { DefMI = MRI->getVRegDef(Reg); continue; } } else if (DefMI->isInsertSubreg()) { Reg = DefMI->getOperand(2).getReg(); - if (TargetRegisterInfo::isVirtualRegister(Reg)) { + if (Register::isVirtualRegister(Reg)) { DefMI = MRI->getVRegDef(Reg); continue; } @@ -271,23 +269,23 @@ void MLxExpansion::ExpandFPMLxInstruction(MachineBasicBlock &MBB, MachineInstr *MI, unsigned MulOpc, unsigned AddSubOpc, bool NegAcc, bool HasLane) { - unsigned DstReg = MI->getOperand(0).getReg(); + Register DstReg = MI->getOperand(0).getReg(); bool DstDead = MI->getOperand(0).isDead(); - unsigned AccReg = MI->getOperand(1).getReg(); - unsigned Src1Reg = MI->getOperand(2).getReg(); - unsigned Src2Reg = MI->getOperand(3).getReg(); + Register AccReg = MI->getOperand(1).getReg(); + Register Src1Reg = MI->getOperand(2).getReg(); + Register Src2Reg = MI->getOperand(3).getReg(); bool Src1Kill = MI->getOperand(2).isKill(); bool Src2Kill = MI->getOperand(3).isKill(); unsigned LaneImm = HasLane ? MI->getOperand(4).getImm() : 0; unsigned NextOp = HasLane ? 5 : 4; ARMCC::CondCodes Pred = (ARMCC::CondCodes)MI->getOperand(NextOp).getImm(); - unsigned PredReg = MI->getOperand(++NextOp).getReg(); + Register PredReg = MI->getOperand(++NextOp).getReg(); const MCInstrDesc &MCID1 = TII->get(MulOpc); const MCInstrDesc &MCID2 = TII->get(AddSubOpc); const MachineFunction &MF = *MI->getParent()->getParent(); - unsigned TmpReg = MRI->createVirtualRegister( - TII->getRegClass(MCID1, 0, TRI, MF)); + Register TmpReg = + MRI->createVirtualRegister(TII->getRegClass(MCID1, 0, TRI, MF)); MachineInstrBuilder MIB = BuildMI(MBB, MI, MI->getDebugLoc(), MCID1, TmpReg) .addReg(Src1Reg, getKillRegState(Src1Kill)) diff --git a/lib/Target/ARM/MVETailPredication.cpp b/lib/Target/ARM/MVETailPredication.cpp new file mode 100644 index 000000000000..4db8ab17c49b --- /dev/null +++ b/lib/Target/ARM/MVETailPredication.cpp @@ -0,0 +1,519 @@ +//===- MVETailPredication.cpp - MVE Tail Predication ----------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// \file +/// Armv8.1m introduced MVE, M-Profile Vector Extension, and low-overhead +/// branches to help accelerate DSP applications. These two extensions can be +/// combined to provide implicit vector predication within a low-overhead loop. 
+/// The HardwareLoops pass inserts intrinsics identifying loops that the +/// backend will attempt to convert into a low-overhead loop. The vectorizer is +/// responsible for generating a vectorized loop in which the lanes are +/// predicated upon the iteration counter. This pass looks at these predicated +/// vector loops, that are targets for low-overhead loops, and prepares it for +/// code generation. Once the vectorizer has produced a masked loop, there's a +/// couple of final forms: +/// - A tail-predicated loop, with implicit predication. +/// - A loop containing multiple VCPT instructions, predicating multiple VPT +/// blocks of instructions operating on different vector types. + +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/LoopPass.h" +#include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/ScalarEvolutionExpander.h" +#include "llvm/Analysis/ScalarEvolutionExpressions.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/PatternMatch.h" +#include "llvm/Support/Debug.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "ARM.h" +#include "ARMSubtarget.h" + +using namespace llvm; + +#define DEBUG_TYPE "mve-tail-predication" +#define DESC "Transform predicated vector loops to use MVE tail predication" + +static cl::opt +DisableTailPredication("disable-mve-tail-predication", cl::Hidden, + cl::init(true), + cl::desc("Disable MVE Tail Predication")); +namespace { + +class MVETailPredication : public LoopPass { + SmallVector MaskedInsts; + Loop *L = nullptr; + ScalarEvolution *SE = nullptr; + TargetTransformInfo *TTI = nullptr; + +public: + static char ID; + + MVETailPredication() : LoopPass(ID) { } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + AU.addPreserved(); + AU.setPreservesCFG(); + } + + bool runOnLoop(Loop *L, LPPassManager&) override; + +private: + + /// Perform the relevant checks on the loop and convert if possible. + bool TryConvert(Value *TripCount); + + /// Return whether this is a vectorized loop, that contains masked + /// load/stores. + bool IsPredicatedVectorLoop(); + + /// Compute a value for the total number of elements that the predicated + /// loop will process. + Value *ComputeElements(Value *TripCount, VectorType *VecTy); + + /// Is the icmp that generates an i1 vector, based upon a loop counter + /// and a limit that is defined outside the loop. + bool isTailPredicate(Instruction *Predicate, Value *NumElements); +}; + +} // end namespace + +static bool IsDecrement(Instruction &I) { + auto *Call = dyn_cast(&I); + if (!Call) + return false; + + Intrinsic::ID ID = Call->getIntrinsicID(); + return ID == Intrinsic::loop_decrement_reg; +} + +static bool IsMasked(Instruction *I) { + auto *Call = dyn_cast(I); + if (!Call) + return false; + + Intrinsic::ID ID = Call->getIntrinsicID(); + // TODO: Support gather/scatter expand/compress operations. 
+ return ID == Intrinsic::masked_store || ID == Intrinsic::masked_load; +} + +bool MVETailPredication::runOnLoop(Loop *L, LPPassManager&) { + if (skipLoop(L) || DisableTailPredication) + return false; + + Function &F = *L->getHeader()->getParent(); + auto &TPC = getAnalysis(); + auto &TM = TPC.getTM(); + auto *ST = &TM.getSubtarget(F); + TTI = &getAnalysis().getTTI(F); + SE = &getAnalysis().getSE(); + this->L = L; + + // The MVE and LOB extensions are combined to enable tail-predication, but + // there's nothing preventing us from generating VCTP instructions for v8.1m. + if (!ST->hasMVEIntegerOps() || !ST->hasV8_1MMainlineOps()) { + LLVM_DEBUG(dbgs() << "TP: Not a v8.1m.main+mve target.\n"); + return false; + } + + BasicBlock *Preheader = L->getLoopPreheader(); + if (!Preheader) + return false; + + auto FindLoopIterations = [](BasicBlock *BB) -> IntrinsicInst* { + for (auto &I : *BB) { + auto *Call = dyn_cast(&I); + if (!Call) + continue; + + Intrinsic::ID ID = Call->getIntrinsicID(); + if (ID == Intrinsic::set_loop_iterations || + ID == Intrinsic::test_set_loop_iterations) + return cast(&I); + } + return nullptr; + }; + + // Look for the hardware loop intrinsic that sets the iteration count. + IntrinsicInst *Setup = FindLoopIterations(Preheader); + + // The test.set iteration could live in the pre- preheader. + if (!Setup) { + if (!Preheader->getSinglePredecessor()) + return false; + Setup = FindLoopIterations(Preheader->getSinglePredecessor()); + if (!Setup) + return false; + } + + // Search for the hardware loop intrinic that decrements the loop counter. + IntrinsicInst *Decrement = nullptr; + for (auto *BB : L->getBlocks()) { + for (auto &I : *BB) { + if (IsDecrement(I)) { + Decrement = cast(&I); + break; + } + } + } + + if (!Decrement) + return false; + + LLVM_DEBUG(dbgs() << "TP: Running on Loop: " << *L + << *Setup << "\n" + << *Decrement << "\n"); + bool Changed = TryConvert(Setup->getArgOperand(0)); + return Changed; +} + +bool MVETailPredication::isTailPredicate(Instruction *I, Value *NumElements) { + // Look for the following: + + // %trip.count.minus.1 = add i32 %N, -1 + // %broadcast.splatinsert10 = insertelement <4 x i32> undef, + // i32 %trip.count.minus.1, i32 0 + // %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, + // <4 x i32> undef, + // <4 x i32> zeroinitializer + // ... + // ... + // %index = phi i32 + // %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 + // %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, + // <4 x i32> undef, + // <4 x i32> zeroinitializer + // %induction = add <4 x i32> %broadcast.splat, + // %pred = icmp ule <4 x i32> %induction, %broadcast.splat11 + + // And return whether V == %pred. + + using namespace PatternMatch; + + CmpInst::Predicate Pred; + Instruction *Shuffle = nullptr; + Instruction *Induction = nullptr; + + // The vector icmp + if (!match(I, m_ICmp(Pred, m_Instruction(Induction), + m_Instruction(Shuffle))) || + Pred != ICmpInst::ICMP_ULE || !L->isLoopInvariant(Shuffle)) + return false; + + // First find the stuff outside the loop which is setting up the limit + // vector.... + // The invariant shuffle that broadcast the limit into a vector. + Instruction *Insert = nullptr; + if (!match(Shuffle, m_ShuffleVector(m_Instruction(Insert), m_Undef(), + m_Zero()))) + return false; + + // Insert the limit into a vector. 
+ Instruction *BECount = nullptr; + if (!match(Insert, m_InsertElement(m_Undef(), m_Instruction(BECount), + m_Zero()))) + return false; + + // The limit calculation, backedge count. + Value *TripCount = nullptr; + if (!match(BECount, m_Add(m_Value(TripCount), m_AllOnes()))) + return false; + + if (TripCount != NumElements) + return false; + + // Now back to searching inside the loop body... + // Find the add with takes the index iv and adds a constant vector to it. + Instruction *BroadcastSplat = nullptr; + Constant *Const = nullptr; + if (!match(Induction, m_Add(m_Instruction(BroadcastSplat), + m_Constant(Const)))) + return false; + + // Check that we're adding <0, 1, 2, 3... + if (auto *CDS = dyn_cast(Const)) { + for (unsigned i = 0; i < CDS->getNumElements(); ++i) { + if (CDS->getElementAsInteger(i) != i) + return false; + } + } else + return false; + + // The shuffle which broadcasts the index iv into a vector. + if (!match(BroadcastSplat, m_ShuffleVector(m_Instruction(Insert), m_Undef(), + m_Zero()))) + return false; + + // The insert element which initialises a vector with the index iv. + Instruction *IV = nullptr; + if (!match(Insert, m_InsertElement(m_Undef(), m_Instruction(IV), m_Zero()))) + return false; + + // The index iv. + auto *Phi = dyn_cast(IV); + if (!Phi) + return false; + + // TODO: Don't think we need to check the entry value. + Value *OnEntry = Phi->getIncomingValueForBlock(L->getLoopPreheader()); + if (!match(OnEntry, m_Zero())) + return false; + + Value *InLoop = Phi->getIncomingValueForBlock(L->getLoopLatch()); + unsigned Lanes = cast(Insert->getType())->getNumElements(); + + Instruction *LHS = nullptr; + if (!match(InLoop, m_Add(m_Instruction(LHS), m_SpecificInt(Lanes)))) + return false; + + return LHS == Phi; +} + +static VectorType* getVectorType(IntrinsicInst *I) { + unsigned TypeOp = I->getIntrinsicID() == Intrinsic::masked_load ? 0 : 1; + auto *PtrTy = cast(I->getOperand(TypeOp)->getType()); + return cast(PtrTy->getElementType()); +} + +bool MVETailPredication::IsPredicatedVectorLoop() { + // Check that the loop contains at least one masked load/store intrinsic. + // We only support 'normal' vector instructions - other than masked + // load/stores. + for (auto *BB : L->getBlocks()) { + for (auto &I : *BB) { + if (IsMasked(&I)) { + VectorType *VecTy = getVectorType(cast(&I)); + unsigned Lanes = VecTy->getNumElements(); + unsigned ElementWidth = VecTy->getScalarSizeInBits(); + // MVE vectors are 128-bit, but don't support 128 x i1. + // TODO: Can we support vectors larger than 128-bits? + unsigned MaxWidth = TTI->getRegisterBitWidth(true); + if (Lanes * ElementWidth != MaxWidth || Lanes == MaxWidth) + return false; + MaskedInsts.push_back(cast(&I)); + } else if (auto *Int = dyn_cast(&I)) { + for (auto &U : Int->args()) { + if (isa(U->getType())) + return false; + } + } + } + } + + return !MaskedInsts.empty(); +} + +Value* MVETailPredication::ComputeElements(Value *TripCount, + VectorType *VecTy) { + const SCEV *TripCountSE = SE->getSCEV(TripCount); + ConstantInt *VF = ConstantInt::get(cast(TripCount->getType()), + VecTy->getNumElements()); + + if (VF->equalsInt(1)) + return nullptr; + + // TODO: Support constant trip counts. 
+ auto VisitAdd = [&](const SCEVAddExpr *S) -> const SCEVMulExpr* { + if (auto *Const = dyn_cast(S->getOperand(0))) { + if (Const->getAPInt() != -VF->getValue()) + return nullptr; + } else + return nullptr; + return dyn_cast(S->getOperand(1)); + }; + + auto VisitMul = [&](const SCEVMulExpr *S) -> const SCEVUDivExpr* { + if (auto *Const = dyn_cast(S->getOperand(0))) { + if (Const->getValue() != VF) + return nullptr; + } else + return nullptr; + return dyn_cast(S->getOperand(1)); + }; + + auto VisitDiv = [&](const SCEVUDivExpr *S) -> const SCEV* { + if (auto *Const = dyn_cast(S->getRHS())) { + if (Const->getValue() != VF) + return nullptr; + } else + return nullptr; + + if (auto *RoundUp = dyn_cast(S->getLHS())) { + if (auto *Const = dyn_cast(RoundUp->getOperand(0))) { + if (Const->getAPInt() != (VF->getValue() - 1)) + return nullptr; + } else + return nullptr; + + return RoundUp->getOperand(1); + } + return nullptr; + }; + + // TODO: Can we use SCEV helpers, such as findArrayDimensions, and friends to + // determine the numbers of elements instead? Looks like this is what is used + // for delinearization, but I'm not sure if it can be applied to the + // vectorized form - at least not without a bit more work than I feel + // comfortable with. + + // Search for Elems in the following SCEV: + // (1 + ((-VF + (VF * (((VF - 1) + %Elems) /u VF))) /u VF)) + const SCEV *Elems = nullptr; + if (auto *TC = dyn_cast(TripCountSE)) + if (auto *Div = dyn_cast(TC->getOperand(1))) + if (auto *Add = dyn_cast(Div->getLHS())) + if (auto *Mul = VisitAdd(Add)) + if (auto *Div = VisitMul(Mul)) + if (auto *Res = VisitDiv(Div)) + Elems = Res; + + if (!Elems) + return nullptr; + + Instruction *InsertPt = L->getLoopPreheader()->getTerminator(); + if (!isSafeToExpandAt(Elems, InsertPt, *SE)) + return nullptr; + + auto DL = L->getHeader()->getModule()->getDataLayout(); + SCEVExpander Expander(*SE, DL, "elements"); + return Expander.expandCodeFor(Elems, Elems->getType(), InsertPt); +} + +// Look through the exit block to see whether there's a duplicate predicate +// instruction. This can happen when we need to perform a select on values +// from the last and previous iteration. Instead of doing a straight +// replacement of that predicate with the vctp, clone the vctp and place it +// in the block. This means that the VPR doesn't have to be live into the +// exit block which should make it easier to convert this loop into a proper +// tail predicated loop. +static void Cleanup(DenseMap &NewPredicates, + SetVector &MaybeDead, Loop *L) { + if (BasicBlock *Exit = L->getUniqueExitBlock()) { + for (auto &Pair : NewPredicates) { + Instruction *OldPred = Pair.first; + Instruction *NewPred = Pair.second; + + for (auto &I : *Exit) { + if (I.isSameOperationAs(OldPred)) { + Instruction *PredClone = NewPred->clone(); + PredClone->insertBefore(&I); + I.replaceAllUsesWith(PredClone); + MaybeDead.insert(&I); + break; + } + } + } + } + + // Drop references and add operands to check for dead. 
+ SmallPtrSet Dead; + while (!MaybeDead.empty()) { + auto *I = MaybeDead.front(); + MaybeDead.remove(I); + if (I->hasNUsesOrMore(1)) + continue; + + for (auto &U : I->operands()) { + if (auto *OpI = dyn_cast(U)) + MaybeDead.insert(OpI); + } + I->dropAllReferences(); + Dead.insert(I); + } + + for (auto *I : Dead) + I->eraseFromParent(); + + for (auto I : L->blocks()) + DeleteDeadPHIs(I); +} + +bool MVETailPredication::TryConvert(Value *TripCount) { + if (!IsPredicatedVectorLoop()) + return false; + + LLVM_DEBUG(dbgs() << "TP: Found predicated vector loop.\n"); + + // Walk through the masked intrinsics and try to find whether the predicate + // operand is generated from an induction variable. + Module *M = L->getHeader()->getModule(); + Type *Ty = IntegerType::get(M->getContext(), 32); + SetVector Predicates; + DenseMap NewPredicates; + + for (auto *I : MaskedInsts) { + Intrinsic::ID ID = I->getIntrinsicID(); + unsigned PredOp = ID == Intrinsic::masked_load ? 2 : 3; + auto *Predicate = dyn_cast(I->getArgOperand(PredOp)); + if (!Predicate || Predicates.count(Predicate)) + continue; + + VectorType *VecTy = getVectorType(I); + Value *NumElements = ComputeElements(TripCount, VecTy); + if (!NumElements) + continue; + + if (!isTailPredicate(Predicate, NumElements)) { + LLVM_DEBUG(dbgs() << "TP: Not tail predicate: " << *Predicate << "\n"); + continue; + } + + LLVM_DEBUG(dbgs() << "TP: Found tail predicate: " << *Predicate << "\n"); + Predicates.insert(Predicate); + + // Insert a phi to count the number of elements processed by the loop. + IRBuilder<> Builder(L->getHeader()->getFirstNonPHI()); + PHINode *Processed = Builder.CreatePHI(Ty, 2); + Processed->addIncoming(NumElements, L->getLoopPreheader()); + + // Insert the intrinsic to represent the effect of tail predication. + Builder.SetInsertPoint(cast(Predicate)); + ConstantInt *Factor = + ConstantInt::get(cast(Ty), VecTy->getNumElements()); + Intrinsic::ID VCTPID; + switch (VecTy->getNumElements()) { + default: + llvm_unreachable("unexpected number of lanes"); + case 2: VCTPID = Intrinsic::arm_vctp64; break; + case 4: VCTPID = Intrinsic::arm_vctp32; break; + case 8: VCTPID = Intrinsic::arm_vctp16; break; + case 16: VCTPID = Intrinsic::arm_vctp8; break; + } + Function *VCTP = Intrinsic::getDeclaration(M, VCTPID); + Value *TailPredicate = Builder.CreateCall(VCTP, Processed); + Predicate->replaceAllUsesWith(TailPredicate); + NewPredicates[Predicate] = cast(TailPredicate); + + // Add the incoming value to the new phi. + // TODO: This add likely already exists in the loop. + Value *Remaining = Builder.CreateSub(Processed, Factor); + Processed->addIncoming(Remaining, L->getLoopLatch()); + LLVM_DEBUG(dbgs() << "TP: Insert processed elements phi: " + << *Processed << "\n" + << "TP: Inserted VCTP: " << *TailPredicate << "\n"); + } + + // Now clean up. + Cleanup(NewPredicates, Predicates, L); + return true; +} + +Pass *llvm::createMVETailPredicationPass() { + return new MVETailPredication(); +} + +char MVETailPredication::ID = 0; + +INITIALIZE_PASS_BEGIN(MVETailPredication, DEBUG_TYPE, DESC, false, false) +INITIALIZE_PASS_END(MVETailPredication, DEBUG_TYPE, DESC, false, false) diff --git a/lib/Target/ARM/MVEVPTBlockPass.cpp b/lib/Target/ARM/MVEVPTBlockPass.cpp new file mode 100644 index 000000000000..bc0a80b177ed --- /dev/null +++ b/lib/Target/ARM/MVEVPTBlockPass.cpp @@ -0,0 +1,278 @@ +//===-- MVEVPTBlockPass.cpp - Insert MVE VPT blocks -----------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "ARM.h" +#include "ARMMachineFunctionInfo.h" +#include "ARMSubtarget.h" +#include "MCTargetDesc/ARMBaseInfo.h" +#include "Thumb2InstrInfo.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineInstrBundle.h" +#include "llvm/CodeGen/MachineOperand.h" +#include "llvm/IR/DebugLoc.h" +#include "llvm/MC/MCInstrDesc.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/Support/Debug.h" +#include +#include + +using namespace llvm; + +#define DEBUG_TYPE "arm-mve-vpt" + +namespace { + class MVEVPTBlock : public MachineFunctionPass { + public: + static char ID; + const Thumb2InstrInfo *TII; + const TargetRegisterInfo *TRI; + + MVEVPTBlock() : MachineFunctionPass(ID) {} + + bool runOnMachineFunction(MachineFunction &Fn) override; + + MachineFunctionProperties getRequiredProperties() const override { + return MachineFunctionProperties().set( + MachineFunctionProperties::Property::NoVRegs); + } + + StringRef getPassName() const override { + return "MVE VPT block insertion pass"; + } + + private: + bool InsertVPTBlocks(MachineBasicBlock &MBB); + }; + + char MVEVPTBlock::ID = 0; + +} // end anonymous namespace + +INITIALIZE_PASS(MVEVPTBlock, DEBUG_TYPE, "ARM MVE VPT block pass", false, false) + +enum VPTMaskValue { + T = 8, // 0b1000 + TT = 4, // 0b0100 + TE = 12, // 0b1100 + TTT = 2, // 0b0010 + TTE = 6, // 0b0110 + TEE = 10, // 0b1010 + TET = 14, // 0b1110 + TTTT = 1, // 0b0001 + TTTE = 3, // 0b0011 + TTEE = 5, // 0b0101 + TTET = 7, // 0b0111 + TEEE = 9, // 0b1001 + TEET = 11, // 0b1011 + TETT = 13, // 0b1101 + TETE = 15 // 0b1111 +}; + +static unsigned VCMPOpcodeToVPT(unsigned Opcode) { + switch (Opcode) { + case ARM::MVE_VCMPf32: + return ARM::MVE_VPTv4f32; + case ARM::MVE_VCMPf16: + return ARM::MVE_VPTv8f16; + case ARM::MVE_VCMPi8: + return ARM::MVE_VPTv16i8; + case ARM::MVE_VCMPi16: + return ARM::MVE_VPTv8i16; + case ARM::MVE_VCMPi32: + return ARM::MVE_VPTv4i32; + case ARM::MVE_VCMPu8: + return ARM::MVE_VPTv16u8; + case ARM::MVE_VCMPu16: + return ARM::MVE_VPTv8u16; + case ARM::MVE_VCMPu32: + return ARM::MVE_VPTv4u32; + case ARM::MVE_VCMPs8: + return ARM::MVE_VPTv16s8; + case ARM::MVE_VCMPs16: + return ARM::MVE_VPTv8s16; + case ARM::MVE_VCMPs32: + return ARM::MVE_VPTv4s32; + + case ARM::MVE_VCMPf32r: + return ARM::MVE_VPTv4f32r; + case ARM::MVE_VCMPf16r: + return ARM::MVE_VPTv8f16r; + case ARM::MVE_VCMPi8r: + return ARM::MVE_VPTv16i8r; + case ARM::MVE_VCMPi16r: + return ARM::MVE_VPTv8i16r; + case ARM::MVE_VCMPi32r: + return ARM::MVE_VPTv4i32r; + case ARM::MVE_VCMPu8r: + return ARM::MVE_VPTv16u8r; + case ARM::MVE_VCMPu16r: + return ARM::MVE_VPTv8u16r; + case ARM::MVE_VCMPu32r: + return ARM::MVE_VPTv4u32r; + case ARM::MVE_VCMPs8r: + return ARM::MVE_VPTv16s8r; + case ARM::MVE_VCMPs16r: + return ARM::MVE_VPTv8s16r; + case ARM::MVE_VCMPs32r: + return ARM::MVE_VPTv4s32r; + + default: + return 0; + } +} + +static MachineInstr *findVCMPToFoldIntoVPST(MachineBasicBlock::iterator MI, + const TargetRegisterInfo *TRI, + unsigned &NewOpcode) { + // Search backwards 
to the instruction that defines VPR. This may or not + // be a VCMP, we check that after this loop. If we find another instruction + // that reads cpsr, we return nullptr. + MachineBasicBlock::iterator CmpMI = MI; + while (CmpMI != MI->getParent()->begin()) { + --CmpMI; + if (CmpMI->modifiesRegister(ARM::VPR, TRI)) + break; + if (CmpMI->readsRegister(ARM::VPR, TRI)) + break; + } + + if (CmpMI == MI) + return nullptr; + NewOpcode = VCMPOpcodeToVPT(CmpMI->getOpcode()); + if (NewOpcode == 0) + return nullptr; + + // Search forward from CmpMI to MI, checking if either register was def'd + if (registerDefinedBetween(CmpMI->getOperand(1).getReg(), std::next(CmpMI), + MI, TRI)) + return nullptr; + if (registerDefinedBetween(CmpMI->getOperand(2).getReg(), std::next(CmpMI), + MI, TRI)) + return nullptr; + return &*CmpMI; +} + +bool MVEVPTBlock::InsertVPTBlocks(MachineBasicBlock &Block) { + bool Modified = false; + MachineBasicBlock::instr_iterator MBIter = Block.instr_begin(); + MachineBasicBlock::instr_iterator EndIter = Block.instr_end(); + + while (MBIter != EndIter) { + MachineInstr *MI = &*MBIter; + unsigned PredReg = 0; + DebugLoc dl = MI->getDebugLoc(); + + ARMVCC::VPTCodes Pred = getVPTInstrPredicate(*MI, PredReg); + + // The idea of the predicate is that None, Then and Else are for use when + // handling assembly language: they correspond to the three possible + // suffixes "", "t" and "e" on the mnemonic. So when instructions are read + // from assembly source or disassembled from object code, you expect to see + // a mixture whenever there's a long VPT block. But in code generation, we + // hope we'll never generate an Else as input to this pass. + assert(Pred != ARMVCC::Else && "VPT block pass does not expect Else preds"); + + if (Pred == ARMVCC::None) { + ++MBIter; + continue; + } + + LLVM_DEBUG(dbgs() << "VPT block created for: "; MI->dump()); + int VPTInstCnt = 1; + ARMVCC::VPTCodes NextPred; + + // Look at subsequent instructions, checking if they can be in the same VPT + // block. 
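The loop that follows caps a block at four predicated instructions and maps the count onto an all-Then mask; only all-Then masks can occur here because, as the assert above notes, codegen never feeds this pass Else-predicated instructions. A sketch of that mapping (hypothetical helper, values taken from the VPTMaskValue enum above):

    #include "llvm/Support/ErrorHandling.h"

    static unsigned vptBlockMaskForSize(unsigned NumPredicatedInsts) {
      switch (NumPredicatedInsts) {
      case 1: return 8; // T    (0b1000)
      case 2: return 4; // TT   (0b0100)
      case 3: return 2; // TTT  (0b0010)
      case 4: return 1; // TTTT (0b0001)
      default:
        llvm_unreachable("a VPT block holds at most four instructions");
      }
    }

If findVCMPToFoldIntoVPST located the VCMP that defines VPR, the mask goes on a VPT that absorbs the compare; otherwise a bare VPST with the same mask is emitted, and either way the block is finalized into a bundle.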
+ ++MBIter; + while (MBIter != EndIter && VPTInstCnt < 4) { + NextPred = getVPTInstrPredicate(*MBIter, PredReg); + assert(NextPred != ARMVCC::Else && + "VPT block pass does not expect Else preds"); + if (NextPred != Pred) + break; + LLVM_DEBUG(dbgs() << " adding : "; MBIter->dump()); + ++VPTInstCnt; + ++MBIter; + }; + + unsigned BlockMask = 0; + switch (VPTInstCnt) { + case 1: + BlockMask = VPTMaskValue::T; + break; + case 2: + BlockMask = VPTMaskValue::TT; + break; + case 3: + BlockMask = VPTMaskValue::TTT; + break; + case 4: + BlockMask = VPTMaskValue::TTTT; + break; + default: + llvm_unreachable("Unexpected number of instruction in a VPT block"); + }; + + // Search back for a VCMP that can be folded to create a VPT, or else create + // a VPST directly + MachineInstrBuilder MIBuilder; + unsigned NewOpcode; + MachineInstr *VCMP = findVCMPToFoldIntoVPST(MI, TRI, NewOpcode); + if (VCMP) { + LLVM_DEBUG(dbgs() << " folding VCMP into VPST: "; VCMP->dump()); + MIBuilder = BuildMI(Block, MI, dl, TII->get(NewOpcode)); + MIBuilder.addImm(BlockMask); + MIBuilder.add(VCMP->getOperand(1)); + MIBuilder.add(VCMP->getOperand(2)); + MIBuilder.add(VCMP->getOperand(3)); + VCMP->eraseFromParent(); + } else { + MIBuilder = BuildMI(Block, MI, dl, TII->get(ARM::MVE_VPST)); + MIBuilder.addImm(BlockMask); + } + + finalizeBundle( + Block, MachineBasicBlock::instr_iterator(MIBuilder.getInstr()), MBIter); + + Modified = true; + } + return Modified; +} + +bool MVEVPTBlock::runOnMachineFunction(MachineFunction &Fn) { + const ARMSubtarget &STI = + static_cast(Fn.getSubtarget()); + + if (!STI.isThumb2() || !STI.hasMVEIntegerOps()) + return false; + + TII = static_cast(STI.getInstrInfo()); + TRI = STI.getRegisterInfo(); + + LLVM_DEBUG(dbgs() << "********** ARM MVE VPT BLOCKS **********\n" + << "********** Function: " << Fn.getName() << '\n'); + + bool Modified = false; + for (MachineBasicBlock &MBB : Fn) + Modified |= InsertVPTBlocks(MBB); + + LLVM_DEBUG(dbgs() << "**************************************\n"); + return Modified; +} + +/// createMVEVPTBlock - Returns an instance of the MVE VPT block +/// insertion pass. +FunctionPass *llvm::createMVEVPTBlockPass() { return new MVEVPTBlock(); } diff --git a/lib/Target/ARM/Thumb1FrameLowering.cpp b/lib/Target/ARM/Thumb1FrameLowering.cpp index 426e9a0ed9b8..956d474f1d79 100644 --- a/lib/Target/ARM/Thumb1FrameLowering.cpp +++ b/lib/Target/ARM/Thumb1FrameLowering.cpp @@ -164,7 +164,7 @@ void Thumb1FrameLowering::emitPrologue(MachineFunction &MF, // to determine the end of the prologue. 
DebugLoc dl; - unsigned FramePtr = RegInfo->getFrameRegister(MF); + Register FramePtr = RegInfo->getFrameRegister(MF); unsigned BasePtr = RegInfo->getBaseRegister(); int CFAOffset = 0; @@ -459,8 +459,8 @@ static bool isCSRestore(MachineInstr &MI, const MCPhysReg *CSRegs) { else if (MI.getOpcode() == ARM::tPOP) { return true; } else if (MI.getOpcode() == ARM::tMOVr) { - unsigned Dst = MI.getOperand(0).getReg(); - unsigned Src = MI.getOperand(1).getReg(); + Register Dst = MI.getOperand(0).getReg(); + Register Src = MI.getOperand(1).getReg(); return ((ARM::tGPRRegClass.contains(Src) || Src == ARM::LR) && ARM::hGPRRegClass.contains(Dst)); } @@ -483,7 +483,7 @@ void Thumb1FrameLowering::emitEpilogue(MachineFunction &MF, assert((unsigned)NumBytes >= ArgRegsSaveSize && "ArgRegsSaveSize is included in NumBytes"); const MCPhysReg *CSRegs = RegInfo->getCalleeSavedRegs(&MF); - unsigned FramePtr = RegInfo->getFrameRegister(MF); + Register FramePtr = RegInfo->getFrameRegister(MF); if (!AFI->hasStackFrame()) { if (NumBytes - ArgRegsSaveSize != 0) diff --git a/lib/Target/ARM/Thumb1InstrInfo.cpp b/lib/Target/ARM/Thumb1InstrInfo.cpp index f57d93a2e83d..fccaa4c9cc8a 100644 --- a/lib/Target/ARM/Thumb1InstrInfo.cpp +++ b/lib/Target/ARM/Thumb1InstrInfo.cpp @@ -80,12 +80,11 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const { assert((RC == &ARM::tGPRRegClass || - (TargetRegisterInfo::isPhysicalRegister(SrcReg) && - isARMLowRegister(SrcReg))) && "Unknown regclass!"); + (Register::isPhysicalRegister(SrcReg) && isARMLowRegister(SrcReg))) && + "Unknown regclass!"); if (RC == &ARM::tGPRRegClass || - (TargetRegisterInfo::isPhysicalRegister(SrcReg) && - isARMLowRegister(SrcReg))) { + (Register::isPhysicalRegister(SrcReg) && isARMLowRegister(SrcReg))) { DebugLoc DL; if (I != MBB.end()) DL = I->getDebugLoc(); @@ -108,13 +107,13 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, unsigned DestReg, int FI, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const { - assert((RC->hasSuperClassEq(&ARM::tGPRRegClass) || - (TargetRegisterInfo::isPhysicalRegister(DestReg) && - isARMLowRegister(DestReg))) && "Unknown regclass!"); + assert( + (RC->hasSuperClassEq(&ARM::tGPRRegClass) || + (Register::isPhysicalRegister(DestReg) && isARMLowRegister(DestReg))) && + "Unknown regclass!"); if (RC->hasSuperClassEq(&ARM::tGPRRegClass) || - (TargetRegisterInfo::isPhysicalRegister(DestReg) && - isARMLowRegister(DestReg))) { + (Register::isPhysicalRegister(DestReg) && isARMLowRegister(DestReg))) { DebugLoc DL; if (I != MBB.end()) DL = I->getDebugLoc(); diff --git a/lib/Target/ARM/Thumb2ITBlockPass.cpp b/lib/Target/ARM/Thumb2ITBlockPass.cpp index 3143eb9840ed..786fc78d0233 100644 --- a/lib/Target/ARM/Thumb2ITBlockPass.cpp +++ b/lib/Target/ARM/Thumb2ITBlockPass.cpp @@ -87,7 +87,7 @@ static void TrackDefUses(MachineInstr *MI, RegisterSet &Defs, RegisterSet &Uses, for (auto &MO : MI->operands()) { if (!MO.isReg()) continue; - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); if (!Reg || Reg == ARM::ITSTATE || Reg == ARM::SP) continue; if (MO.isUse()) @@ -145,8 +145,8 @@ Thumb2ITBlock::MoveCopyOutOfITBlock(MachineInstr *MI, MI->getOperand(1).getSubReg() == 0 && "Sub-register indices still around?"); - unsigned DstReg = MI->getOperand(0).getReg(); - unsigned SrcReg = MI->getOperand(1).getReg(); + Register DstReg = MI->getOperand(0).getReg(); + Register SrcReg = MI->getOperand(1).getReg(); // First check if 
it's safe to move it. if (Uses.count(DstReg) || Defs.count(SrcReg)) @@ -308,131 +308,3 @@ bool Thumb2ITBlock::runOnMachineFunction(MachineFunction &Fn) { /// createThumb2ITBlockPass - Returns an instance of the Thumb2 IT blocks /// insertion pass. FunctionPass *llvm::createThumb2ITBlockPass() { return new Thumb2ITBlock(); } - -#undef DEBUG_TYPE -#define DEBUG_TYPE "arm-mve-vpt" - -namespace { - class MVEVPTBlock : public MachineFunctionPass { - public: - static char ID; - const Thumb2InstrInfo *TII; - const TargetRegisterInfo *TRI; - - MVEVPTBlock() : MachineFunctionPass(ID) {} - - bool runOnMachineFunction(MachineFunction &Fn) override; - - MachineFunctionProperties getRequiredProperties() const override { - return MachineFunctionProperties().set( - MachineFunctionProperties::Property::NoVRegs); - } - - StringRef getPassName() const override { - return "MVE VPT block insertion pass"; - } - - private: - bool InsertVPTBlocks(MachineBasicBlock &MBB); - }; - - char MVEVPTBlock::ID = 0; - -} // end anonymous namespace - -INITIALIZE_PASS(MVEVPTBlock, DEBUG_TYPE, "ARM MVE VPT block pass", false, false) - -enum VPTMaskValue { - T = 8, // 0b1000 - TT = 4, // 0b0100 - TE = 12, // 0b1100 - TTT = 2, // 0b0010 - TTE = 6, // 0b0110 - TEE = 10, // 0b1010 - TET = 14, // 0b1110 - TTTT = 1, // 0b0001 - TTTE = 3, // 0b0011 - TTEE = 5, // 0b0101 - TTET = 7, // 0b0111 - TEEE = 9, // 0b1001 - TEET = 11, // 0b1011 - TETT = 13, // 0b1101 - TETE = 15 // 0b1111 -}; - -bool MVEVPTBlock::InsertVPTBlocks(MachineBasicBlock &Block) { - bool Modified = false; - MachineBasicBlock::iterator MBIter = Block.begin(); - MachineBasicBlock::iterator EndIter = Block.end(); - - while (MBIter != EndIter) { - MachineInstr *MI = &*MBIter; - unsigned PredReg = 0; - DebugLoc dl = MI->getDebugLoc(); - - ARMVCC::VPTCodes Pred = getVPTInstrPredicate(*MI, PredReg); - - // The idea of the predicate is that None, Then and Else are for use when - // handling assembly language: they correspond to the three possible - // suffixes "", "t" and "e" on the mnemonic. So when instructions are read - // from assembly source or disassembled from object code, you expect to see - // a mixture whenever there's a long VPT block. But in code generation, we - // hope we'll never generate an Else as input to this pass. 
- - assert(Pred != ARMVCC::Else && "VPT block pass does not expect Else preds"); - - if (Pred == ARMVCC::None) { - ++MBIter; - continue; - } - - MachineInstrBuilder MIBuilder = - BuildMI(Block, MBIter, dl, TII->get(ARM::MVE_VPST)); - // The mask value for the VPST instruction is T = 0b1000 = 8 - MIBuilder.addImm(VPTMaskValue::T); - - MachineBasicBlock::iterator VPSTInsertPos = MIBuilder.getInstr(); - int VPTInstCnt = 1; - ARMVCC::VPTCodes NextPred; - - do { - ++MBIter; - NextPred = getVPTInstrPredicate(*MBIter, PredReg); - } while (NextPred != ARMVCC::None && NextPred == Pred && ++VPTInstCnt < 4); - - MachineInstr *LastMI = &*MBIter; - finalizeBundle(Block, VPSTInsertPos.getInstrIterator(), - ++LastMI->getIterator()); - - Modified = true; - LLVM_DEBUG(dbgs() << "VPT block created for: "; MI->dump();); - - ++MBIter; - } - return Modified; -} - -bool MVEVPTBlock::runOnMachineFunction(MachineFunction &Fn) { - const ARMSubtarget &STI = - static_cast(Fn.getSubtarget()); - - if (!STI.isThumb2() || !STI.hasMVEIntegerOps()) - return false; - - TII = static_cast(STI.getInstrInfo()); - TRI = STI.getRegisterInfo(); - - LLVM_DEBUG(dbgs() << "********** ARM MVE VPT BLOCKS **********\n" - << "********** Function: " << Fn.getName() << '\n'); - - bool Modified = false; - for (MachineBasicBlock &MBB : Fn) - Modified |= InsertVPTBlocks(MBB); - - LLVM_DEBUG(dbgs() << "**************************************\n"); - return Modified; -} - -/// createMVEVPTBlock - Returns an instance of the MVE VPT block -/// insertion pass. -FunctionPass *llvm::createMVEVPTBlockPass() { return new MVEVPTBlock(); } diff --git a/lib/Target/ARM/Thumb2InstrInfo.cpp b/lib/Target/ARM/Thumb2InstrInfo.cpp index 5a965f7a6b9b..af1f0aeb27ba 100644 --- a/lib/Target/ARM/Thumb2InstrInfo.cpp +++ b/lib/Target/ARM/Thumb2InstrInfo.cpp @@ -159,9 +159,9 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, // Thumb2 STRD expects its dest-registers to be in rGPR. Not a problem for // gsub_0, but needs an extra constraint for gsub_1 (which could be sp // otherwise). - if (TargetRegisterInfo::isVirtualRegister(SrcReg)) { + if (Register::isVirtualRegister(SrcReg)) { MachineRegisterInfo *MRI = &MF.getRegInfo(); - MRI->constrainRegClass(SrcReg, &ARM::GPRPair_with_gsub_1_in_GPRwithAPSRnospRegClass); + MRI->constrainRegClass(SrcReg, &ARM::GPRPairnospRegClass); } MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(ARM::t2STRDi8)); @@ -200,10 +200,9 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, // Thumb2 LDRD expects its dest-registers to be in rGPR. Not a problem for // gsub_0, but needs an extra constraint for gsub_1 (which could be sp // otherwise). 
- if (TargetRegisterInfo::isVirtualRegister(DestReg)) { + if (Register::isVirtualRegister(DestReg)) { MachineRegisterInfo *MRI = &MF.getRegInfo(); - MRI->constrainRegClass(DestReg, - &ARM::GPRPair_with_gsub_1_in_GPRwithAPSRnospRegClass); + MRI->constrainRegClass(DestReg, &ARM::GPRPairnospRegClass); } MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(ARM::t2LDRDi8)); @@ -211,7 +210,7 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, AddDReg(MIB, DestReg, ARM::gsub_1, RegState::DefineNoRead, TRI); MIB.addFrameIndex(FI).addImm(0).addMemOperand(MMO).add(predOps(ARMCC::AL)); - if (TargetRegisterInfo::isPhysicalRegister(DestReg)) + if (Register::isPhysicalRegister(DestReg)) MIB.addReg(DestReg, RegState::ImplicitDefine); return; } @@ -470,12 +469,17 @@ immediateOffsetOpcode(unsigned opcode) bool llvm::rewriteT2FrameIndex(MachineInstr &MI, unsigned FrameRegIdx, unsigned FrameReg, int &Offset, - const ARMBaseInstrInfo &TII) { + const ARMBaseInstrInfo &TII, + const TargetRegisterInfo *TRI) { unsigned Opcode = MI.getOpcode(); const MCInstrDesc &Desc = MI.getDesc(); unsigned AddrMode = (Desc.TSFlags & ARMII::AddrModeMask); bool isSub = false; + MachineFunction &MF = *MI.getParent()->getParent(); + const TargetRegisterClass *RegClass = + TII.getRegClass(Desc, FrameRegIdx, TRI, MF); + // Memory operands in inline assembly always use AddrModeT2_i12. if (Opcode == ARM::INLINEASM || Opcode == ARM::INLINEASM_BR) AddrMode = ARMII::AddrModeT2_i12; // FIXME. mode for thumb2? @@ -554,7 +558,7 @@ bool llvm::rewriteT2FrameIndex(MachineInstr &MI, unsigned FrameRegIdx, // register then we change to an immediate version. unsigned NewOpc = Opcode; if (AddrMode == ARMII::AddrModeT2_so) { - unsigned OffsetReg = MI.getOperand(FrameRegIdx+1).getReg(); + Register OffsetReg = MI.getOperand(FrameRegIdx + 1).getReg(); if (OffsetReg != 0) { MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false); return Offset == 0; @@ -645,10 +649,21 @@ bool llvm::rewriteT2FrameIndex(MachineInstr &MI, unsigned FrameRegIdx, MachineOperand &ImmOp = MI.getOperand(FrameRegIdx+1); // Attempt to fold address computation - // Common case: small offset, fits into instruction. + // Common case: small offset, fits into instruction. We need to make sure + // the register class is correct too, for instructions like the MVE + // VLDRH.32, which only accepts low tGPR registers. int ImmedOffset = Offset / Scale; unsigned Mask = (1 << NumBits) - 1; - if ((unsigned)Offset <= Mask * Scale) { + if ((unsigned)Offset <= Mask * Scale && + (Register::isVirtualRegister(FrameReg) || + RegClass->contains(FrameReg))) { + if (Register::isVirtualRegister(FrameReg)) { + // Make sure the register class for the virtual register is correct + MachineRegisterInfo *MRI = &MF.getRegInfo(); + if (!MRI->constrainRegClass(FrameReg, RegClass)) + llvm_unreachable("Unable to constrain virtual register class."); + } + // Replace the FrameIndex with fp/sp MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false); if (isSub) { @@ -681,7 +696,8 @@ bool llvm::rewriteT2FrameIndex(MachineInstr &MI, unsigned FrameRegIdx, } Offset = (isSub) ? 
-Offset : Offset; - return Offset == 0; + return Offset == 0 && (Register::isVirtualRegister(FrameReg) || + RegClass->contains(FrameReg)); } ARMCC::CondCodes llvm::getITInstrPredicate(const MachineInstr &MI, diff --git a/lib/Target/ARM/Thumb2SizeReduction.cpp b/lib/Target/ARM/Thumb2SizeReduction.cpp index 37a85fa38417..c5a62aa33990 100644 --- a/lib/Target/ARM/Thumb2SizeReduction.cpp +++ b/lib/Target/ARM/Thumb2SizeReduction.cpp @@ -300,7 +300,7 @@ Thumb2SizeReduce::canAddPseudoFlagDep(MachineInstr *Use, bool FirstInSelfLoop) { for (const MachineOperand &MO : CPSRDef->operands()) { if (!MO.isReg() || MO.isUndef() || MO.isUse()) continue; - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); if (Reg == 0 || Reg == ARM::CPSR) continue; Defs.insert(Reg); @@ -309,7 +309,7 @@ Thumb2SizeReduce::canAddPseudoFlagDep(MachineInstr *Use, bool FirstInSelfLoop) { for (const MachineOperand &MO : Use->operands()) { if (!MO.isReg() || MO.isUndef() || MO.isDef()) continue; - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); if (Defs.count(Reg)) return false; } @@ -380,7 +380,7 @@ static bool VerifyLowRegs(MachineInstr *MI) { const MachineOperand &MO = MI->getOperand(i); if (!MO.isReg() || MO.isImplicit()) continue; - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); if (Reg == 0 || Reg == ARM::CPSR) continue; if (isPCOk && Reg == ARM::PC) @@ -464,11 +464,11 @@ Thumb2SizeReduce::ReduceLoadStore(MachineBasicBlock &MBB, MachineInstr *MI, // For this reason we can't reuse the logic at the end of this function; we // have to implement the MI building here. bool IsStore = Entry.WideOpc == ARM::t2STR_POST; - unsigned Rt = MI->getOperand(IsStore ? 1 : 0).getReg(); - unsigned Rn = MI->getOperand(IsStore ? 0 : 1).getReg(); + Register Rt = MI->getOperand(IsStore ? 1 : 0).getReg(); + Register Rn = MI->getOperand(IsStore ? 0 : 1).getReg(); unsigned Offset = MI->getOperand(3).getImm(); unsigned PredImm = MI->getOperand(4).getImm(); - unsigned PredReg = MI->getOperand(5).getReg(); + Register PredReg = MI->getOperand(5).getReg(); assert(isARMLowRegister(Rt)); assert(isARMLowRegister(Rn)); @@ -496,7 +496,7 @@ Thumb2SizeReduce::ReduceLoadStore(MachineBasicBlock &MBB, MachineInstr *MI, return true; } case ARM::t2LDMIA: { - unsigned BaseReg = MI->getOperand(0).getReg(); + Register BaseReg = MI->getOperand(0).getReg(); assert(isARMLowRegister(BaseReg)); // For the non-writeback version (this one), the base register must be @@ -524,7 +524,7 @@ Thumb2SizeReduce::ReduceLoadStore(MachineBasicBlock &MBB, MachineInstr *MI, break; case ARM::t2LDMIA_RET: { - unsigned BaseReg = MI->getOperand(1).getReg(); + Register BaseReg = MI->getOperand(1).getReg(); if (BaseReg != ARM::SP) return false; Opc = Entry.NarrowOpc2; // tPOP_RET @@ -537,7 +537,7 @@ Thumb2SizeReduce::ReduceLoadStore(MachineBasicBlock &MBB, MachineInstr *MI, case ARM::t2STMDB_UPD: { OpNum = 0; - unsigned BaseReg = MI->getOperand(1).getReg(); + Register BaseReg = MI->getOperand(1).getReg(); if (BaseReg == ARM::SP && (Entry.WideOpc == ARM::t2LDMIA_UPD || Entry.WideOpc == ARM::t2STMDB_UPD)) { @@ -743,11 +743,11 @@ Thumb2SizeReduce::ReduceTo2Addr(MachineBasicBlock &MBB, MachineInstr *MI, // are optimizing for size. return false; - unsigned Reg0 = MI->getOperand(0).getReg(); - unsigned Reg1 = MI->getOperand(1).getReg(); + Register Reg0 = MI->getOperand(0).getReg(); + Register Reg1 = MI->getOperand(1).getReg(); // t2MUL is "special". The tied source operand is second, not first. 
if (MI->getOpcode() == ARM::t2MUL) { - unsigned Reg2 = MI->getOperand(2).getReg(); + Register Reg2 = MI->getOperand(2).getReg(); // Early exit if the regs aren't all low regs. if (!isARMLowRegister(Reg0) || !isARMLowRegister(Reg1) || !isARMLowRegister(Reg2)) @@ -782,7 +782,7 @@ Thumb2SizeReduce::ReduceTo2Addr(MachineBasicBlock &MBB, MachineInstr *MI, if (Imm > Limit) return false; } else { - unsigned Reg2 = MI->getOperand(2).getReg(); + Register Reg2 = MI->getOperand(2).getReg(); if (Entry.LowRegs2 && !isARMLowRegister(Reg2)) return false; } @@ -868,7 +868,7 @@ Thumb2SizeReduce::ReduceToNarrow(MachineBasicBlock &MBB, MachineInstr *MI, continue; const MachineOperand &MO = MI->getOperand(i); if (MO.isReg()) { - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); if (!Reg || Reg == ARM::CPSR) continue; if (Entry.LowRegs1 && !isARMLowRegister(Reg)) diff --git a/lib/Target/ARM/ThumbRegisterInfo.cpp b/lib/Target/ARM/ThumbRegisterInfo.cpp index a96417ffce4d..b0ba58d8dc4a 100644 --- a/lib/Target/ARM/ThumbRegisterInfo.cpp +++ b/lib/Target/ARM/ThumbRegisterInfo.cpp @@ -107,8 +107,9 @@ void ThumbRegisterInfo::emitLoadConstPool( MachineFunction &MF = *MBB.getParent(); const ARMSubtarget &STI = MF.getSubtarget(); if (STI.isThumb1Only()) { - assert((isARMLowRegister(DestReg) || isVirtualRegister(DestReg)) && - "Thumb1 does not have ldr to high register"); + assert( + (isARMLowRegister(DestReg) || Register::isVirtualRegister(DestReg)) && + "Thumb1 does not have ldr to high register"); return emitThumb1LoadConstPool(MBB, MBBI, dl, DestReg, SubIdx, Val, Pred, PredReg, MIFlags); } @@ -141,7 +142,7 @@ static void emitThumbRegPlusImmInReg( unsigned LdReg = DestReg; if (DestReg == ARM::SP) assert(BaseReg == ARM::SP && "Unexpected!"); - if (!isARMLowRegister(DestReg) && !MRI.isVirtualRegister(DestReg)) + if (!isARMLowRegister(DestReg) && !Register::isVirtualRegister(DestReg)) LdReg = MF.getRegInfo().createVirtualRegister(&ARM::tGPRRegClass); if (NumBytes <= 255 && NumBytes >= 0 && CanChangeCC) { @@ -371,7 +372,7 @@ bool ThumbRegisterInfo::rewriteFrameIndex(MachineBasicBlock::iterator II, if (Opcode == ARM::tADDframe) { Offset += MI.getOperand(FrameRegIdx+1).getImm(); - unsigned DestReg = MI.getOperand(0).getReg(); + Register DestReg = MI.getOperand(0).getReg(); emitThumbRegPlusImmediate(MBB, II, dl, DestReg, FrameReg, Offset, TII, *this); @@ -509,7 +510,7 @@ void ThumbRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, if (MI.mayLoad()) { // Use the destination register to materialize sp + offset. 
- unsigned TmpReg = MI.getOperand(0).getReg(); + Register TmpReg = MI.getOperand(0).getReg(); bool UseRR = false; if (Opcode == ARM::tLDRspi) { if (FrameReg == ARM::SP || STI.genExecuteOnly()) diff --git a/lib/Target/AVR/AVRAsmPrinter.cpp b/lib/Target/AVR/AVRAsmPrinter.cpp index 7586bd7b78fc..1db6b2236b4f 100644 --- a/lib/Target/AVR/AVRAsmPrinter.cpp +++ b/lib/Target/AVR/AVRAsmPrinter.cpp @@ -97,7 +97,7 @@ bool AVRAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum, assert(RegOp.isReg() && "Operand must be a register when you're" "using 'A'..'Z' operand extracodes."); - unsigned Reg = RegOp.getReg(); + Register Reg = RegOp.getReg(); unsigned ByteNumber = ExtraCode[0] - 'A'; diff --git a/lib/Target/AVR/AVRExpandPseudoInsts.cpp b/lib/Target/AVR/AVRExpandPseudoInsts.cpp index c45b2d0e39c1..83d0f6845332 100644 --- a/lib/Target/AVR/AVRExpandPseudoInsts.cpp +++ b/lib/Target/AVR/AVRExpandPseudoInsts.cpp @@ -140,8 +140,8 @@ bool AVRExpandPseudo:: expandArith(unsigned OpLo, unsigned OpHi, Block &MBB, BlockIt MBBI) { MachineInstr &MI = *MBBI; unsigned SrcLoReg, SrcHiReg, DstLoReg, DstHiReg; - unsigned DstReg = MI.getOperand(0).getReg(); - unsigned SrcReg = MI.getOperand(2).getReg(); + Register DstReg = MI.getOperand(0).getReg(); + Register SrcReg = MI.getOperand(2).getReg(); bool DstIsDead = MI.getOperand(0).isDead(); bool DstIsKill = MI.getOperand(1).isKill(); bool SrcIsKill = MI.getOperand(2).isKill(); @@ -173,8 +173,8 @@ bool AVRExpandPseudo:: expandLogic(unsigned Op, Block &MBB, BlockIt MBBI) { MachineInstr &MI = *MBBI; unsigned SrcLoReg, SrcHiReg, DstLoReg, DstHiReg; - unsigned DstReg = MI.getOperand(0).getReg(); - unsigned SrcReg = MI.getOperand(2).getReg(); + Register DstReg = MI.getOperand(0).getReg(); + Register SrcReg = MI.getOperand(2).getReg(); bool DstIsDead = MI.getOperand(0).isDead(); bool DstIsKill = MI.getOperand(1).isKill(); bool SrcIsKill = MI.getOperand(2).isKill(); @@ -220,7 +220,7 @@ bool AVRExpandPseudo:: expandLogicImm(unsigned Op, Block &MBB, BlockIt MBBI) { MachineInstr &MI = *MBBI; unsigned DstLoReg, DstHiReg; - unsigned DstReg = MI.getOperand(0).getReg(); + Register DstReg = MI.getOperand(0).getReg(); bool DstIsDead = MI.getOperand(0).isDead(); bool SrcIsKill = MI.getOperand(1).isKill(); bool ImpIsDead = MI.getOperand(3).isDead(); @@ -874,7 +874,7 @@ unsigned AVRExpandPseudo::scavengeGPR8(MachineInstr &MI) { // Exclude all the registers being used by the instruction. for (MachineOperand &MO : MI.operands()) { if (MO.isReg() && MO.getReg() != 0 && !MO.isDef() && - !TargetRegisterInfo::isVirtualRegister(MO.getReg())) + !Register::isVirtualRegister(MO.getReg())) Candidates.reset(MO.getReg()); } diff --git a/lib/Target/AVR/AVRFrameLowering.cpp b/lib/Target/AVR/AVRFrameLowering.cpp index 5e91bb8632c1..e6c48de5a782 100644 --- a/lib/Target/AVR/AVRFrameLowering.cpp +++ b/lib/Target/AVR/AVRFrameLowering.cpp @@ -30,7 +30,8 @@ namespace llvm { AVRFrameLowering::AVRFrameLowering() - : TargetFrameLowering(TargetFrameLowering::StackGrowsDown, 1, -2) {} + : TargetFrameLowering(TargetFrameLowering::StackGrowsDown, Align::None(), + -2) {} bool AVRFrameLowering::canSimplifyCallFramePseudos( const MachineFunction &MF) const { @@ -323,7 +324,7 @@ static void fixStackStores(MachineBasicBlock &MBB, "Invalid register, should be SP!"); if (insertPushes) { // Replace this instruction with a push. 
- unsigned SrcReg = MI.getOperand(2).getReg(); + Register SrcReg = MI.getOperand(2).getReg(); bool SrcIsKill = MI.getOperand(2).isKill(); // We can't use PUSHWRr here because when expanded the order of the new diff --git a/lib/Target/AVR/AVRISelDAGToDAG.cpp b/lib/Target/AVR/AVRISelDAGToDAG.cpp index 5cb4441c4380..4c4f4faa0508 100644 --- a/lib/Target/AVR/AVRISelDAGToDAG.cpp +++ b/lib/Target/AVR/AVRISelDAGToDAG.cpp @@ -251,7 +251,7 @@ bool AVRDAGToDAGISel::SelectInlineAsmMemoryOperand(const SDValue &Op, RegisterSDNode *RegNode = cast(CopyFromRegOp->getOperand(1)); Reg = RegNode->getReg(); - CanHandleRegImmOpt &= (TargetRegisterInfo::isVirtualRegister(Reg) || + CanHandleRegImmOpt &= (Register::isVirtualRegister(Reg) || AVR::PTRDISPREGSRegClass.contains(Reg)); } else { CanHandleRegImmOpt = false; diff --git a/lib/Target/AVR/AVRISelLowering.cpp b/lib/Target/AVR/AVRISelLowering.cpp index b6ba5f22fafb..f12c59b7d8c3 100644 --- a/lib/Target/AVR/AVRISelLowering.cpp +++ b/lib/Target/AVR/AVRISelLowering.cpp @@ -236,7 +236,7 @@ AVRTargetLowering::AVRTargetLowering(const AVRTargetMachine &TM, setLibcallName(RTLIB::SIN_F32, "sin"); setLibcallName(RTLIB::COS_F32, "cos"); - setMinFunctionAlignment(1); + setMinFunctionAlignment(Align(2)); setMinimumJumpTableEntries(UINT_MAX); } @@ -1517,11 +1517,11 @@ MachineBasicBlock *AVRTargetLowering::insertShift(MachineInstr &MI, unsigned ShiftAmtReg = RI.createVirtualRegister(&AVR::LD8RegClass); unsigned ShiftAmtReg2 = RI.createVirtualRegister(&AVR::LD8RegClass); - unsigned ShiftReg = RI.createVirtualRegister(RC); - unsigned ShiftReg2 = RI.createVirtualRegister(RC); - unsigned ShiftAmtSrcReg = MI.getOperand(2).getReg(); - unsigned SrcReg = MI.getOperand(1).getReg(); - unsigned DstReg = MI.getOperand(0).getReg(); + Register ShiftReg = RI.createVirtualRegister(RC); + Register ShiftReg2 = RI.createVirtualRegister(RC); + Register ShiftAmtSrcReg = MI.getOperand(2).getReg(); + Register SrcReg = MI.getOperand(1).getReg(); + Register DstReg = MI.getOperand(0).getReg(); // BB: // cpi N, 0 @@ -1568,7 +1568,7 @@ MachineBasicBlock *AVRTargetLowering::insertShift(MachineInstr &MI, static bool isCopyMulResult(MachineBasicBlock::iterator const &I) { if (I->getOpcode() == AVR::COPY) { - unsigned SrcReg = I->getOperand(1).getReg(); + Register SrcReg = I->getOperand(1).getReg(); return (SrcReg == AVR::R0 || SrcReg == AVR::R1); } @@ -1689,6 +1689,8 @@ AVRTargetLowering::getConstraintType(StringRef Constraint) const { if (Constraint.size() == 1) { // See http://www.nongnu.org/avr-libc/user-manual/inline_asm.html switch (Constraint[0]) { + default: + break; case 'a': // Simple upper registers case 'b': // Base pointer registers pairs case 'd': // Upper register @@ -1715,9 +1717,7 @@ AVRTargetLowering::getConstraintType(StringRef Constraint) const { case 'O': // Integer constant (Range: 8, 16, 24) case 'P': // Integer constant (Range: 1) case 'R': // Integer constant (Range: -6 to 5)x - return C_Other; - default: - break; + return C_Immediate; } } @@ -2006,10 +2006,9 @@ void AVRTargetLowering::LowerAsmOperandForConstraint(SDValue Op, return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG); } -unsigned AVRTargetLowering::getRegisterByName(const char *RegName, - EVT VT, - SelectionDAG &DAG) const { - unsigned Reg; +Register AVRTargetLowering::getRegisterByName(const char *RegName, EVT VT, + const MachineFunction &MF) const { + Register Reg; if (VT == MVT::i8) { Reg = StringSwitch(RegName) diff --git a/lib/Target/AVR/AVRISelLowering.h b/lib/Target/AVR/AVRISelLowering.h 
index ed2d0835903c..6c722fa5414b 100644 --- a/lib/Target/AVR/AVRISelLowering.h +++ b/lib/Target/AVR/AVRISelLowering.h @@ -125,8 +125,8 @@ public: std::vector &Ops, SelectionDAG &DAG) const override; - unsigned getRegisterByName(const char* RegName, EVT VT, - SelectionDAG &DAG) const override; + Register getRegisterByName(const char* RegName, EVT VT, + const MachineFunction &MF) const override; bool shouldSplitFunctionArgumentsAsLittleEndian(const DataLayout &DL) const override { diff --git a/lib/Target/AVR/AVRRegisterInfo.cpp b/lib/Target/AVR/AVRRegisterInfo.cpp index a6b36f80485d..8fce05c933bc 100644 --- a/lib/Target/AVR/AVRRegisterInfo.cpp +++ b/lib/Target/AVR/AVRRegisterInfo.cpp @@ -158,7 +158,7 @@ void AVRRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, // We need to materialize the offset via an add instruction. unsigned Opcode; - unsigned DstReg = MI.getOperand(0).getReg(); + Register DstReg = MI.getOperand(0).getReg(); assert(DstReg != AVR::R29R28 && "Dest reg cannot be the frame pointer"); II++; // Skip over the FRMIDX (and now MOVW) instruction. diff --git a/lib/Target/AVR/AVRTargetMachine.cpp b/lib/Target/AVR/AVRTargetMachine.cpp index a36c8b0f9649..25304280d002 100644 --- a/lib/Target/AVR/AVRTargetMachine.cpp +++ b/lib/Target/AVR/AVRTargetMachine.cpp @@ -50,7 +50,7 @@ AVRTargetMachine::AVRTargetMachine(const Target &T, const Triple &TT, getEffectiveRelocModel(RM), getEffectiveCodeModel(CM, CodeModel::Small), OL), SubTarget(TT, getCPU(CPU), FS, *this) { - this->TLOF = make_unique(); + this->TLOF = std::make_unique(); initAsmInfo(); } diff --git a/lib/Target/AVR/AsmParser/AVRAsmParser.cpp b/lib/Target/AVR/AsmParser/AVRAsmParser.cpp index aac5644711e2..af60bc4fdc90 100644 --- a/lib/Target/AVR/AsmParser/AVRAsmParser.cpp +++ b/lib/Target/AVR/AsmParser/AVRAsmParser.cpp @@ -199,22 +199,22 @@ public: } static std::unique_ptr CreateToken(StringRef Str, SMLoc S) { - return make_unique(Str, S); + return std::make_unique(Str, S); } static std::unique_ptr CreateReg(unsigned RegNum, SMLoc S, SMLoc E) { - return make_unique(RegNum, S, E); + return std::make_unique(RegNum, S, E); } static std::unique_ptr CreateImm(const MCExpr *Val, SMLoc S, SMLoc E) { - return make_unique(Val, S, E); + return std::make_unique(Val, S, E); } static std::unique_ptr CreateMemri(unsigned RegNum, const MCExpr *Val, SMLoc S, SMLoc E) { - return make_unique(RegNum, Val, S, E); + return std::make_unique(RegNum, Val, S, E); } void makeToken(StringRef Token) { diff --git a/lib/Target/AVR/MCTargetDesc/AVRELFObjectWriter.cpp b/lib/Target/AVR/MCTargetDesc/AVRELFObjectWriter.cpp index 6025e4b2437c..1c69fea5962d 100644 --- a/lib/Target/AVR/MCTargetDesc/AVRELFObjectWriter.cpp +++ b/lib/Target/AVR/MCTargetDesc/AVRELFObjectWriter.cpp @@ -152,7 +152,7 @@ unsigned AVRELFObjectWriter::getRelocType(MCContext &Ctx, } std::unique_ptr createAVRELFObjectWriter(uint8_t OSABI) { - return make_unique(OSABI); + return std::make_unique(OSABI); } } // end of namespace llvm diff --git a/lib/Target/BPF/AsmParser/BPFAsmParser.cpp b/lib/Target/BPF/AsmParser/BPFAsmParser.cpp index 75885fd058a7..ce1d2ecd9d26 100644 --- a/lib/Target/BPF/AsmParser/BPFAsmParser.cpp +++ b/lib/Target/BPF/AsmParser/BPFAsmParser.cpp @@ -194,7 +194,7 @@ public: } static std::unique_ptr createToken(StringRef Str, SMLoc S) { - auto Op = make_unique(Token); + auto Op = std::make_unique(Token); Op->Tok = Str; Op->StartLoc = S; Op->EndLoc = S; @@ -203,7 +203,7 @@ public: static std::unique_ptr createReg(unsigned RegNo, SMLoc S, SMLoc E) { - auto Op = 
make_unique(Register); + auto Op = std::make_unique(Register); Op->Reg.RegNum = RegNo; Op->StartLoc = S; Op->EndLoc = E; @@ -212,7 +212,7 @@ public: static std::unique_ptr createImm(const MCExpr *Val, SMLoc S, SMLoc E) { - auto Op = make_unique(Immediate); + auto Op = std::make_unique(Immediate); Op->Imm.Val = Val; Op->StartLoc = S; Op->EndLoc = E; diff --git a/lib/Target/BPF/BPF.h b/lib/Target/BPF/BPF.h index d311fc154094..6e4f35f4c5d7 100644 --- a/lib/Target/BPF/BPF.h +++ b/lib/Target/BPF/BPF.h @@ -15,17 +15,19 @@ namespace llvm { class BPFTargetMachine; -ModulePass *createBPFAbstractMemberAccess(); +ModulePass *createBPFAbstractMemberAccess(BPFTargetMachine *TM); FunctionPass *createBPFISelDag(BPFTargetMachine &TM); FunctionPass *createBPFMISimplifyPatchablePass(); FunctionPass *createBPFMIPeepholePass(); +FunctionPass *createBPFMIPeepholeTruncElimPass(); FunctionPass *createBPFMIPreEmitPeepholePass(); FunctionPass *createBPFMIPreEmitCheckingPass(); void initializeBPFAbstractMemberAccessPass(PassRegistry&); void initializeBPFMISimplifyPatchablePass(PassRegistry&); void initializeBPFMIPeepholePass(PassRegistry&); +void initializeBPFMIPeepholeTruncElimPass(PassRegistry&); void initializeBPFMIPreEmitPeepholePass(PassRegistry&); void initializeBPFMIPreEmitCheckingPass(PassRegistry&); } diff --git a/lib/Target/BPF/BPFAbstractMemberAccess.cpp b/lib/Target/BPF/BPFAbstractMemberAccess.cpp index 51d4cbc8a429..400701c4e5c2 100644 --- a/lib/Target/BPF/BPFAbstractMemberAccess.cpp +++ b/lib/Target/BPF/BPFAbstractMemberAccess.cpp @@ -50,6 +50,28 @@ // addr = preserve_struct_access_index(base, gep_index, di_index) // !llvm.preserve.access.index // +// Bitfield member access needs special attention. User cannot take the +// address of a bitfield acceess. To facilitate kernel verifier +// for easy bitfield code optimization, a new clang intrinsic is introduced: +// uint32_t __builtin_preserve_field_info(member_access, info_kind) +// In IR, a chain with two (or more) intrinsic calls will be generated: +// ... +// addr = preserve_struct_access_index(base, 1, 1) !struct s +// uint32_t result = bpf_preserve_field_info(addr, info_kind) +// +// Suppose the info_kind is FIELD_SIGNEDNESS, +// The above two IR intrinsics will be replaced with +// a relocatable insn: +// signness = /* signness of member_access */ +// and signness can be changed by bpf loader based on the +// types on the host. +// +// User can also test whether a field exists or not with +// uint32_t result = bpf_preserve_field_info(member_access, FIELD_EXISTENCE) +// The field will be always available (result = 1) during initial +// compilation, but bpf loader can patch with the correct value +// on the target host where the member_access may or may not be available +// //===----------------------------------------------------------------------===// #include "BPF.h" @@ -65,13 +87,12 @@ #include "llvm/IR/Value.h" #include "llvm/Pass.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include #define DEBUG_TYPE "bpf-abstract-member-access" namespace llvm { const std::string BPFCoreSharedInfo::AmaAttr = "btf_ama"; -const std::string BPFCoreSharedInfo::PatchableExtSecName = - ".BPF.patchable_externs"; } // namespace llvm using namespace llvm; @@ -87,40 +108,62 @@ class BPFAbstractMemberAccess final : public ModulePass { public: static char ID; - BPFAbstractMemberAccess() : ModulePass(ID) {} + TargetMachine *TM; + // Add optional BPFTargetMachine parameter so that BPF backend can add the phase + // with target machine to find out the endianness. 
The default constructor (without + // parameters) is used by the pass manager for managing purposes. + BPFAbstractMemberAccess(BPFTargetMachine *TM = nullptr) : ModulePass(ID), TM(TM) {} + + struct CallInfo { + uint32_t Kind; + uint32_t AccessIndex; + MDNode *Metadata; + Value *Base; + }; + typedef std::stack> CallInfoStack; private: enum : uint32_t { BPFPreserveArrayAI = 1, BPFPreserveUnionAI = 2, BPFPreserveStructAI = 3, + BPFPreserveFieldInfoAI = 4, }; std::map GEPGlobals; // A map to link preserve_*_access_index instrinsic calls. - std::map> AIChain; + std::map> AIChain; // A map to hold all the base preserve_*_access_index instrinsic calls. - // The base call is not an input of any other preserve_*_access_index + // The base call is not an input of any other preserve_* // intrinsics. - std::map BaseAICalls; + std::map BaseAICalls; bool doTransformation(Module &M); - void traceAICall(CallInst *Call, uint32_t Kind); - void traceBitCast(BitCastInst *BitCast, CallInst *Parent, uint32_t Kind); - void traceGEP(GetElementPtrInst *GEP, CallInst *Parent, uint32_t Kind); + void traceAICall(CallInst *Call, CallInfo &ParentInfo); + void traceBitCast(BitCastInst *BitCast, CallInst *Parent, + CallInfo &ParentInfo); + void traceGEP(GetElementPtrInst *GEP, CallInst *Parent, + CallInfo &ParentInfo); void collectAICallChains(Module &M, Function &F); - bool IsPreserveDIAccessIndexCall(const CallInst *Call, uint32_t &Kind); + bool IsPreserveDIAccessIndexCall(const CallInst *Call, CallInfo &Cinfo); + bool IsValidAIChain(const MDNode *ParentMeta, uint32_t ParentAI, + const MDNode *ChildMeta); bool removePreserveAccessIndexIntrinsic(Module &M); void replaceWithGEP(std::vector &CallList, uint32_t NumOfZerosIndex, uint32_t DIIndex); - - Value *computeBaseAndAccessStr(CallInst *Call, std::string &AccessStr, - std::string &AccessKey, uint32_t Kind, - MDNode *&TypeMeta); - bool getAccessIndex(const Value *IndexValue, uint64_t &AccessIndex); - bool transformGEPChain(Module &M, CallInst *Call, uint32_t Kind); + bool HasPreserveFieldInfoCall(CallInfoStack &CallStack); + void GetStorageBitRange(DICompositeType *CTy, DIDerivedType *MemberTy, + uint32_t AccessIndex, uint32_t &StartBitOffset, + uint32_t &EndBitOffset); + uint32_t GetFieldInfo(uint32_t InfoKind, DICompositeType *CTy, + uint32_t AccessIndex, uint32_t PatchImm); + + Value *computeBaseAndAccessKey(CallInst *Call, CallInfo &CInfo, + std::string &AccessKey, MDNode *&BaseMeta); + uint64_t getConstant(const Value *IndexValue); + bool transformGEPChain(Module &M, CallInst *Call, CallInfo &CInfo); }; } // End anonymous namespace @@ -128,23 +171,65 @@ char BPFAbstractMemberAccess::ID = 0; INITIALIZE_PASS(BPFAbstractMemberAccess, DEBUG_TYPE, "abstracting struct/union member accessees", false, false) -ModulePass *llvm::createBPFAbstractMemberAccess() { - return new BPFAbstractMemberAccess(); +ModulePass *llvm::createBPFAbstractMemberAccess(BPFTargetMachine *TM) { + return new BPFAbstractMemberAccess(TM); } bool BPFAbstractMemberAccess::runOnModule(Module &M) { LLVM_DEBUG(dbgs() << "********** Abstract Member Accesses **********\n"); // Bail out if no debug info. 
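The long comment near the top of this file describes the intrinsic chain clang emits for __builtin_preserve_field_info. For orientation, a rough sketch of the C-for-BPF source that gives rise to such a chain; the struct, field name, and the numeric info_kind (assumed here to be the field-existence kind) are illustrative only, and the authoritative kind numbering lives in BPFCoreSharedInfo:

    /* Requires clang targeting BPF. The attribute makes member accesses
     * relocatable, so skb->len is emitted through a
     * llvm.preserve.struct.access.index call and the builtin becomes
     * llvm.bpf.preserve.field.info, as described above. */
    struct sk_buff___sketch {
      int len;
    } __attribute__((preserve_access_index));

    unsigned field_exists(struct sk_buff___sketch *skb) {
      /* Evaluates to 1 at compile time; the BPF loader may patch it to 0
       * on a kernel where the member is absent. */
      return __builtin_preserve_field_info(skb->len,
                                           2 /* assumed: FIELD_EXISTENCE */);
    }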
- if (empty(M.debug_compile_units())) + if (M.debug_compile_units().empty()) return false; return doTransformation(M); } +static bool SkipDIDerivedTag(unsigned Tag) { + if (Tag != dwarf::DW_TAG_typedef && Tag != dwarf::DW_TAG_const_type && + Tag != dwarf::DW_TAG_volatile_type && + Tag != dwarf::DW_TAG_restrict_type && + Tag != dwarf::DW_TAG_member) + return false; + return true; +} + +static DIType * stripQualifiers(DIType *Ty) { + while (auto *DTy = dyn_cast(Ty)) { + if (!SkipDIDerivedTag(DTy->getTag())) + break; + Ty = DTy->getBaseType(); + } + return Ty; +} + +static const DIType * stripQualifiers(const DIType *Ty) { + while (auto *DTy = dyn_cast(Ty)) { + if (!SkipDIDerivedTag(DTy->getTag())) + break; + Ty = DTy->getBaseType(); + } + return Ty; +} + +static uint32_t calcArraySize(const DICompositeType *CTy, uint32_t StartDim) { + DINodeArray Elements = CTy->getElements(); + uint32_t DimSize = 1; + for (uint32_t I = StartDim; I < Elements.size(); ++I) { + if (auto *Element = dyn_cast_or_null(Elements[I])) + if (Element->getTag() == dwarf::DW_TAG_subrange_type) { + const DISubrange *SR = cast(Element); + auto *CI = SR->getCount().dyn_cast(); + DimSize *= CI->getSExtValue(); + } + } + + return DimSize; +} + /// Check whether a call is a preserve_*_access_index intrinsic call or not. bool BPFAbstractMemberAccess::IsPreserveDIAccessIndexCall(const CallInst *Call, - uint32_t &Kind) { + CallInfo &CInfo) { if (!Call) return false; @@ -152,15 +237,40 @@ bool BPFAbstractMemberAccess::IsPreserveDIAccessIndexCall(const CallInst *Call, if (!GV) return false; if (GV->getName().startswith("llvm.preserve.array.access.index")) { - Kind = BPFPreserveArrayAI; + CInfo.Kind = BPFPreserveArrayAI; + CInfo.Metadata = Call->getMetadata(LLVMContext::MD_preserve_access_index); + if (!CInfo.Metadata) + report_fatal_error("Missing metadata for llvm.preserve.array.access.index intrinsic"); + CInfo.AccessIndex = getConstant(Call->getArgOperand(2)); + CInfo.Base = Call->getArgOperand(0); return true; } if (GV->getName().startswith("llvm.preserve.union.access.index")) { - Kind = BPFPreserveUnionAI; + CInfo.Kind = BPFPreserveUnionAI; + CInfo.Metadata = Call->getMetadata(LLVMContext::MD_preserve_access_index); + if (!CInfo.Metadata) + report_fatal_error("Missing metadata for llvm.preserve.union.access.index intrinsic"); + CInfo.AccessIndex = getConstant(Call->getArgOperand(1)); + CInfo.Base = Call->getArgOperand(0); return true; } if (GV->getName().startswith("llvm.preserve.struct.access.index")) { - Kind = BPFPreserveStructAI; + CInfo.Kind = BPFPreserveStructAI; + CInfo.Metadata = Call->getMetadata(LLVMContext::MD_preserve_access_index); + if (!CInfo.Metadata) + report_fatal_error("Missing metadata for llvm.preserve.struct.access.index intrinsic"); + CInfo.AccessIndex = getConstant(Call->getArgOperand(2)); + CInfo.Base = Call->getArgOperand(0); + return true; + } + if (GV->getName().startswith("llvm.bpf.preserve.field.info")) { + CInfo.Kind = BPFPreserveFieldInfoAI; + CInfo.Metadata = nullptr; + // Check validity of info_kind as clang did not check this. 
+ uint64_t InfoKind = getConstant(Call->getArgOperand(1)); + if (InfoKind >= BPFCoreSharedInfo::MAX_FIELD_RELOC_KIND) + report_fatal_error("Incorrect info_kind for llvm.bpf.preserve.field.info intrinsic"); + CInfo.AccessIndex = InfoKind; return true; } @@ -173,8 +283,7 @@ void BPFAbstractMemberAccess::replaceWithGEP(std::vector &CallList, for (auto Call : CallList) { uint32_t Dimension = 1; if (DimensionIndex > 0) - Dimension = cast(Call->getArgOperand(DimensionIndex)) - ->getZExtValue(); + Dimension = getConstant(Call->getArgOperand(DimensionIndex)); Constant *Zero = ConstantInt::get(Type::getInt32Ty(Call->getParent()->getContext()), 0); @@ -200,14 +309,14 @@ bool BPFAbstractMemberAccess::removePreserveAccessIndexIntrinsic(Module &M) { for (auto &BB : F) for (auto &I : BB) { auto *Call = dyn_cast(&I); - uint32_t Kind; - if (!IsPreserveDIAccessIndexCall(Call, Kind)) + CallInfo CInfo; + if (!IsPreserveDIAccessIndexCall(Call, CInfo)) continue; Found = true; - if (Kind == BPFPreserveArrayAI) + if (CInfo.Kind == BPFPreserveArrayAI) PreserveArrayIndexCalls.push_back(Call); - else if (Kind == BPFPreserveUnionAI) + else if (CInfo.Kind == BPFPreserveUnionAI) PreserveUnionIndexCalls.push_back(Call); else PreserveStructIndexCalls.push_back(Call); @@ -233,79 +342,146 @@ bool BPFAbstractMemberAccess::removePreserveAccessIndexIntrinsic(Module &M) { return Found; } -void BPFAbstractMemberAccess::traceAICall(CallInst *Call, uint32_t Kind) { +/// Check whether the access index chain is valid. We check +/// here because there may be type casts between two +/// access indexes. We want to ensure memory access still valid. +bool BPFAbstractMemberAccess::IsValidAIChain(const MDNode *ParentType, + uint32_t ParentAI, + const MDNode *ChildType) { + if (!ChildType) + return true; // preserve_field_info, no type comparison needed. + + const DIType *PType = stripQualifiers(cast(ParentType)); + const DIType *CType = stripQualifiers(cast(ChildType)); + + // Child is a derived/pointer type, which is due to type casting. + // Pointer type cannot be in the middle of chain. + if (isa(CType)) + return false; + + // Parent is a pointer type. 
+ if (const auto *PtrTy = dyn_cast(PType)) { + if (PtrTy->getTag() != dwarf::DW_TAG_pointer_type) + return false; + return stripQualifiers(PtrTy->getBaseType()) == CType; + } + + // Otherwise, struct/union/array types + const auto *PTy = dyn_cast(PType); + const auto *CTy = dyn_cast(CType); + assert(PTy && CTy && "ParentType or ChildType is null or not composite"); + + uint32_t PTyTag = PTy->getTag(); + assert(PTyTag == dwarf::DW_TAG_array_type || + PTyTag == dwarf::DW_TAG_structure_type || + PTyTag == dwarf::DW_TAG_union_type); + + uint32_t CTyTag = CTy->getTag(); + assert(CTyTag == dwarf::DW_TAG_array_type || + CTyTag == dwarf::DW_TAG_structure_type || + CTyTag == dwarf::DW_TAG_union_type); + + // Multi dimensional arrays, base element should be the same + if (PTyTag == dwarf::DW_TAG_array_type && PTyTag == CTyTag) + return PTy->getBaseType() == CTy->getBaseType(); + + DIType *Ty; + if (PTyTag == dwarf::DW_TAG_array_type) + Ty = PTy->getBaseType(); + else + Ty = dyn_cast(PTy->getElements()[ParentAI]); + + return dyn_cast(stripQualifiers(Ty)) == CTy; +} + +void BPFAbstractMemberAccess::traceAICall(CallInst *Call, + CallInfo &ParentInfo) { for (User *U : Call->users()) { Instruction *Inst = dyn_cast(U); if (!Inst) continue; if (auto *BI = dyn_cast(Inst)) { - traceBitCast(BI, Call, Kind); + traceBitCast(BI, Call, ParentInfo); } else if (auto *CI = dyn_cast(Inst)) { - uint32_t CIKind; - if (IsPreserveDIAccessIndexCall(CI, CIKind)) { - AIChain[CI] = std::make_pair(Call, Kind); - traceAICall(CI, CIKind); + CallInfo ChildInfo; + + if (IsPreserveDIAccessIndexCall(CI, ChildInfo) && + IsValidAIChain(ParentInfo.Metadata, ParentInfo.AccessIndex, + ChildInfo.Metadata)) { + AIChain[CI] = std::make_pair(Call, ParentInfo); + traceAICall(CI, ChildInfo); } else { - BaseAICalls[Call] = Kind; + BaseAICalls[Call] = ParentInfo; } } else if (auto *GI = dyn_cast(Inst)) { if (GI->hasAllZeroIndices()) - traceGEP(GI, Call, Kind); + traceGEP(GI, Call, ParentInfo); else - BaseAICalls[Call] = Kind; + BaseAICalls[Call] = ParentInfo; + } else { + BaseAICalls[Call] = ParentInfo; } } } void BPFAbstractMemberAccess::traceBitCast(BitCastInst *BitCast, - CallInst *Parent, uint32_t Kind) { + CallInst *Parent, + CallInfo &ParentInfo) { for (User *U : BitCast->users()) { Instruction *Inst = dyn_cast(U); if (!Inst) continue; if (auto *BI = dyn_cast(Inst)) { - traceBitCast(BI, Parent, Kind); + traceBitCast(BI, Parent, ParentInfo); } else if (auto *CI = dyn_cast(Inst)) { - uint32_t CIKind; - if (IsPreserveDIAccessIndexCall(CI, CIKind)) { - AIChain[CI] = std::make_pair(Parent, Kind); - traceAICall(CI, CIKind); + CallInfo ChildInfo; + if (IsPreserveDIAccessIndexCall(CI, ChildInfo) && + IsValidAIChain(ParentInfo.Metadata, ParentInfo.AccessIndex, + ChildInfo.Metadata)) { + AIChain[CI] = std::make_pair(Parent, ParentInfo); + traceAICall(CI, ChildInfo); } else { - BaseAICalls[Parent] = Kind; + BaseAICalls[Parent] = ParentInfo; } } else if (auto *GI = dyn_cast(Inst)) { if (GI->hasAllZeroIndices()) - traceGEP(GI, Parent, Kind); + traceGEP(GI, Parent, ParentInfo); else - BaseAICalls[Parent] = Kind; + BaseAICalls[Parent] = ParentInfo; + } else { + BaseAICalls[Parent] = ParentInfo; } } } void BPFAbstractMemberAccess::traceGEP(GetElementPtrInst *GEP, CallInst *Parent, - uint32_t Kind) { + CallInfo &ParentInfo) { for (User *U : GEP->users()) { Instruction *Inst = dyn_cast(U); if (!Inst) continue; if (auto *BI = dyn_cast(Inst)) { - traceBitCast(BI, Parent, Kind); + traceBitCast(BI, Parent, ParentInfo); } else if (auto *CI = dyn_cast(Inst)) { - 
uint32_t CIKind; - if (IsPreserveDIAccessIndexCall(CI, CIKind)) { - AIChain[CI] = std::make_pair(Parent, Kind); - traceAICall(CI, CIKind); + CallInfo ChildInfo; + if (IsPreserveDIAccessIndexCall(CI, ChildInfo) && + IsValidAIChain(ParentInfo.Metadata, ParentInfo.AccessIndex, + ChildInfo.Metadata)) { + AIChain[CI] = std::make_pair(Parent, ParentInfo); + traceAICall(CI, ChildInfo); } else { - BaseAICalls[Parent] = Kind; + BaseAICalls[Parent] = ParentInfo; } } else if (auto *GI = dyn_cast(Inst)) { if (GI->hasAllZeroIndices()) - traceGEP(GI, Parent, Kind); + traceGEP(GI, Parent, ParentInfo); else - BaseAICalls[Parent] = Kind; + BaseAICalls[Parent] = ParentInfo; + } else { + BaseAICalls[Parent] = ParentInfo; } } } @@ -316,92 +492,345 @@ void BPFAbstractMemberAccess::collectAICallChains(Module &M, Function &F) { for (auto &BB : F) for (auto &I : BB) { - uint32_t Kind; + CallInfo CInfo; auto *Call = dyn_cast(&I); - if (!IsPreserveDIAccessIndexCall(Call, Kind) || + if (!IsPreserveDIAccessIndexCall(Call, CInfo) || AIChain.find(Call) != AIChain.end()) continue; - traceAICall(Call, Kind); + traceAICall(Call, CInfo); } } -/// Get access index from the preserve_*_access_index intrinsic calls. -bool BPFAbstractMemberAccess::getAccessIndex(const Value *IndexValue, - uint64_t &AccessIndex) { +uint64_t BPFAbstractMemberAccess::getConstant(const Value *IndexValue) { const ConstantInt *CV = dyn_cast(IndexValue); - if (!CV) - return false; + assert(CV); + return CV->getValue().getZExtValue(); +} - AccessIndex = CV->getValue().getZExtValue(); - return true; +/// Get the start and the end of storage offset for \p MemberTy. +/// The storage bits are corresponding to the LLVM internal types, +/// and the storage bits for the member determines what load width +/// to use in order to extract the bitfield value. 
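A minimal sketch, assuming a standalone helper, of how the storage window computed by GetStorageBitRange() maps to a load width; it mirrors the power-of-two check used by the FIELD_BYTE_SIZE handling further down, and the helper name is illustrative:

#include <cstdint>
#include <stdexcept>

// Width in bytes of the load covering a bitfield whose storage occupies
// bits [StartBitOffset, EndBitOffset) of the enclosing struct.
static uint32_t loadByteSize(uint32_t StartBitOffset, uint32_t EndBitOffset) {
  uint32_t SizeInBits = EndBitOffset - StartBitOffset;
  if (SizeInBits & (SizeInBits - 1))
    throw std::runtime_error("storage window is not a power-of-two width");
  return SizeInBits >> 3;   // e.g. a window of bits [32, 64) loads as 4 bytes
}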
+void BPFAbstractMemberAccess::GetStorageBitRange(DICompositeType *CTy, + DIDerivedType *MemberTy, + uint32_t AccessIndex, + uint32_t &StartBitOffset, + uint32_t &EndBitOffset) { + auto SOff = dyn_cast(MemberTy->getStorageOffsetInBits()); + assert(SOff); + StartBitOffset = SOff->getZExtValue(); + + EndBitOffset = CTy->getSizeInBits(); + uint32_t Index = AccessIndex + 1; + for (; Index < CTy->getElements().size(); ++Index) { + auto Member = cast(CTy->getElements()[Index]); + if (!Member->getStorageOffsetInBits()) { + EndBitOffset = Member->getOffsetInBits(); + break; + } + SOff = dyn_cast(Member->getStorageOffsetInBits()); + assert(SOff); + unsigned BitOffset = SOff->getZExtValue(); + if (BitOffset != StartBitOffset) { + EndBitOffset = BitOffset; + break; + } + } +} + +uint32_t BPFAbstractMemberAccess::GetFieldInfo(uint32_t InfoKind, + DICompositeType *CTy, + uint32_t AccessIndex, + uint32_t PatchImm) { + if (InfoKind == BPFCoreSharedInfo::FIELD_EXISTENCE) + return 1; + + uint32_t Tag = CTy->getTag(); + if (InfoKind == BPFCoreSharedInfo::FIELD_BYTE_OFFSET) { + if (Tag == dwarf::DW_TAG_array_type) { + auto *EltTy = stripQualifiers(CTy->getBaseType()); + PatchImm += AccessIndex * calcArraySize(CTy, 1) * + (EltTy->getSizeInBits() >> 3); + } else if (Tag == dwarf::DW_TAG_structure_type) { + auto *MemberTy = cast(CTy->getElements()[AccessIndex]); + if (!MemberTy->isBitField()) { + PatchImm += MemberTy->getOffsetInBits() >> 3; + } else { + auto SOffset = dyn_cast(MemberTy->getStorageOffsetInBits()); + assert(SOffset); + PatchImm += SOffset->getZExtValue() >> 3; + } + } + return PatchImm; + } + + if (InfoKind == BPFCoreSharedInfo::FIELD_BYTE_SIZE) { + if (Tag == dwarf::DW_TAG_array_type) { + auto *EltTy = stripQualifiers(CTy->getBaseType()); + return calcArraySize(CTy, 1) * (EltTy->getSizeInBits() >> 3); + } else { + auto *MemberTy = cast(CTy->getElements()[AccessIndex]); + uint32_t SizeInBits = MemberTy->getSizeInBits(); + if (!MemberTy->isBitField()) + return SizeInBits >> 3; + + unsigned SBitOffset, NextSBitOffset; + GetStorageBitRange(CTy, MemberTy, AccessIndex, SBitOffset, NextSBitOffset); + SizeInBits = NextSBitOffset - SBitOffset; + if (SizeInBits & (SizeInBits - 1)) + report_fatal_error("Unsupported field expression for llvm.bpf.preserve.field.info"); + return SizeInBits >> 3; + } + } + + if (InfoKind == BPFCoreSharedInfo::FIELD_SIGNEDNESS) { + const DIType *BaseTy; + if (Tag == dwarf::DW_TAG_array_type) { + // Signedness only checked when final array elements are accessed. + if (CTy->getElements().size() != 1) + report_fatal_error("Invalid array expression for llvm.bpf.preserve.field.info"); + BaseTy = stripQualifiers(CTy->getBaseType()); + } else { + auto *MemberTy = cast(CTy->getElements()[AccessIndex]); + BaseTy = stripQualifiers(MemberTy->getBaseType()); + } + + // Only basic types and enum types have signedness. + const auto *BTy = dyn_cast(BaseTy); + while (!BTy) { + const auto *CompTy = dyn_cast(BaseTy); + // Report an error if the field expression does not have signedness. 
+ if (!CompTy || CompTy->getTag() != dwarf::DW_TAG_enumeration_type) + report_fatal_error("Invalid field expression for llvm.bpf.preserve.field.info"); + BaseTy = stripQualifiers(CompTy->getBaseType()); + BTy = dyn_cast(BaseTy); + } + uint32_t Encoding = BTy->getEncoding(); + return (Encoding == dwarf::DW_ATE_signed || Encoding == dwarf::DW_ATE_signed_char); + } + + if (InfoKind == BPFCoreSharedInfo::FIELD_LSHIFT_U64) { + // The value is loaded into a value with FIELD_BYTE_SIZE size, + // and then zero or sign extended to U64. + // FIELD_LSHIFT_U64 and FIELD_RSHIFT_U64 are operations + // to extract the original value. + const Triple &Triple = TM->getTargetTriple(); + DIDerivedType *MemberTy = nullptr; + bool IsBitField = false; + uint32_t SizeInBits; + + if (Tag == dwarf::DW_TAG_array_type) { + auto *EltTy = stripQualifiers(CTy->getBaseType()); + SizeInBits = calcArraySize(CTy, 1) * EltTy->getSizeInBits(); + } else { + MemberTy = cast(CTy->getElements()[AccessIndex]); + SizeInBits = MemberTy->getSizeInBits(); + IsBitField = MemberTy->isBitField(); + } + + if (!IsBitField) { + if (SizeInBits > 64) + report_fatal_error("too big field size for llvm.bpf.preserve.field.info"); + return 64 - SizeInBits; + } + + unsigned SBitOffset, NextSBitOffset; + GetStorageBitRange(CTy, MemberTy, AccessIndex, SBitOffset, NextSBitOffset); + if (NextSBitOffset - SBitOffset > 64) + report_fatal_error("too big field size for llvm.bpf.preserve.field.info"); + + unsigned OffsetInBits = MemberTy->getOffsetInBits(); + if (Triple.getArch() == Triple::bpfel) + return SBitOffset + 64 - OffsetInBits - SizeInBits; + else + return OffsetInBits + 64 - NextSBitOffset; + } + + if (InfoKind == BPFCoreSharedInfo::FIELD_RSHIFT_U64) { + DIDerivedType *MemberTy = nullptr; + bool IsBitField = false; + uint32_t SizeInBits; + if (Tag == dwarf::DW_TAG_array_type) { + auto *EltTy = stripQualifiers(CTy->getBaseType()); + SizeInBits = calcArraySize(CTy, 1) * EltTy->getSizeInBits(); + } else { + MemberTy = cast(CTy->getElements()[AccessIndex]); + SizeInBits = MemberTy->getSizeInBits(); + IsBitField = MemberTy->isBitField(); + } + + if (!IsBitField) { + if (SizeInBits > 64) + report_fatal_error("too big field size for llvm.bpf.preserve.field.info"); + return 64 - SizeInBits; + } + + unsigned SBitOffset, NextSBitOffset; + GetStorageBitRange(CTy, MemberTy, AccessIndex, SBitOffset, NextSBitOffset); + if (NextSBitOffset - SBitOffset > 64) + report_fatal_error("too big field size for llvm.bpf.preserve.field.info"); + + return 64 - SizeInBits; + } + + llvm_unreachable("Unknown llvm.bpf.preserve.field.info info kind"); } -/// Compute the base of the whole preserve_*_access_index chains, i.e., the base +bool BPFAbstractMemberAccess::HasPreserveFieldInfoCall(CallInfoStack &CallStack) { + // This is called in error return path, no need to maintain CallStack. + while (CallStack.size()) { + auto StackElem = CallStack.top(); + if (StackElem.second.Kind == BPFPreserveFieldInfoAI) + return true; + CallStack.pop(); + } + return false; +} + +/// Compute the base of the whole preserve_* intrinsics chains, i.e., the base /// pointer of the first preserve_*_access_index call, and construct the access /// string, which will be the name of a global variable. 
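A minimal sketch of the access-key shape that computeBaseAndAccessKey() below assembles: type name, relocation kind, patched immediate, then the access-index chain. The helper name and example values are illustrative:

#include <cstdint>
#include <string>

static std::string buildAccessKey(const std::string &TypeName, uint32_t InfoKind,
                                  uint32_t PatchImm, const std::string &Indices) {
  // e.g. buildAccessKey("sk_buff", 0, 64, "0:2:0") yields "sk_buff:0:64$0:2:0"
  return TypeName + ":" + std::to_string(InfoKind) + ":" +
         std::to_string(PatchImm) + "$" + Indices;
}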
-Value *BPFAbstractMemberAccess::computeBaseAndAccessStr(CallInst *Call, - std::string &AccessStr, +Value *BPFAbstractMemberAccess::computeBaseAndAccessKey(CallInst *Call, + CallInfo &CInfo, std::string &AccessKey, - uint32_t Kind, MDNode *&TypeMeta) { Value *Base = nullptr; - std::vector AccessIndices; - uint64_t TypeNameIndex = 0; - std::string LastTypeName; + std::string TypeName; + CallInfoStack CallStack; + // Put the access chain into a stack with the top as the head of the chain. while (Call) { - // Base of original corresponding GEP - Base = Call->getArgOperand(0); - - // Type Name - std::string TypeName; - MDNode *MDN; - if (Kind == BPFPreserveUnionAI || Kind == BPFPreserveStructAI) { - MDN = Call->getMetadata(LLVMContext::MD_preserve_access_index); - if (!MDN) - return nullptr; - - DIType *Ty = dyn_cast(MDN); - if (!Ty) - return nullptr; + CallStack.push(std::make_pair(Call, CInfo)); + CInfo = AIChain[Call].second; + Call = AIChain[Call].first; + } + // The access offset from the base of the head of chain is also + // calculated here as all debuginfo types are available. + + // Get type name and calculate the first index. + // We only want to get type name from structure or union. + // If user wants a relocation like + // int *p; ... __builtin_preserve_access_index(&p[4]) ... + // or + // int a[10][20]; ... __builtin_preserve_access_index(&a[2][3]) ... + // we will skip them. + uint32_t FirstIndex = 0; + uint32_t PatchImm = 0; // AccessOffset or the requested field info + uint32_t InfoKind = BPFCoreSharedInfo::FIELD_BYTE_OFFSET; + while (CallStack.size()) { + auto StackElem = CallStack.top(); + Call = StackElem.first; + CInfo = StackElem.second; + + if (!Base) + Base = CInfo.Base; + + DIType *Ty = stripQualifiers(cast(CInfo.Metadata)); + if (CInfo.Kind == BPFPreserveUnionAI || + CInfo.Kind == BPFPreserveStructAI) { + // struct or union type TypeName = Ty->getName(); + TypeMeta = Ty; + PatchImm += FirstIndex * (Ty->getSizeInBits() >> 3); + break; } - // Access Index - uint64_t AccessIndex; - uint32_t ArgIndex = (Kind == BPFPreserveUnionAI) ? 1 : 2; - if (!getAccessIndex(Call->getArgOperand(ArgIndex), AccessIndex)) - return nullptr; - - AccessIndices.push_back(AccessIndex); - if (TypeName.size()) { - TypeNameIndex = AccessIndices.size() - 1; - LastTypeName = TypeName; - TypeMeta = MDN; + assert(CInfo.Kind == BPFPreserveArrayAI); + + // Array entries will always be consumed for accumulative initial index. 
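A worked sketch of the dimension folding that the FirstIndex accumulation below performs with calcArraySize(): a multi-dimensional access contributes one linear element index. The array shape and indices are illustrative:

#include <cstdint>

static uint32_t linearElementIndex() {
  // int a[10][20]; ... __builtin_preserve_access_index(&a[2][3]) ...
  const uint32_t InnerDim = 20;          // product of the remaining dimensions
  const uint32_t Outer = 2, Inner = 3;   // the two access indices
  return Outer * InnerDim + Inner;       // 43 'int' elements from &a[0][0]
}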
+ CallStack.pop(); + + // BPFPreserveArrayAI + uint64_t AccessIndex = CInfo.AccessIndex; + + DIType *BaseTy = nullptr; + bool CheckElemType = false; + if (const auto *CTy = dyn_cast(Ty)) { + // array type + assert(CTy->getTag() == dwarf::DW_TAG_array_type); + + + FirstIndex += AccessIndex * calcArraySize(CTy, 1); + BaseTy = stripQualifiers(CTy->getBaseType()); + CheckElemType = CTy->getElements().size() == 1; + } else { + // pointer type + auto *DTy = cast(Ty); + assert(DTy->getTag() == dwarf::DW_TAG_pointer_type); + + BaseTy = stripQualifiers(DTy->getBaseType()); + CTy = dyn_cast(BaseTy); + if (!CTy) { + CheckElemType = true; + } else if (CTy->getTag() != dwarf::DW_TAG_array_type) { + FirstIndex += AccessIndex; + CheckElemType = true; + } else { + FirstIndex += AccessIndex * calcArraySize(CTy, 0); + } } - Kind = AIChain[Call].second; - Call = AIChain[Call].first; - } + if (CheckElemType) { + auto *CTy = dyn_cast(BaseTy); + if (!CTy) { + if (HasPreserveFieldInfoCall(CallStack)) + report_fatal_error("Invalid field access for llvm.preserve.field.info intrinsic"); + return nullptr; + } - // The intial type name is required. - // FIXME: if the initial type access is an array index, e.g., - // &a[3].b.c, only one dimentional array is supported. - if (!LastTypeName.size() || AccessIndices.size() > TypeNameIndex + 2) - return nullptr; + unsigned CTag = CTy->getTag(); + if (CTag == dwarf::DW_TAG_structure_type || CTag == dwarf::DW_TAG_union_type) { + TypeName = CTy->getName(); + } else { + if (HasPreserveFieldInfoCall(CallStack)) + report_fatal_error("Invalid field access for llvm.preserve.field.info intrinsic"); + return nullptr; + } + TypeMeta = CTy; + PatchImm += FirstIndex * (CTy->getSizeInBits() >> 3); + break; + } + } + assert(TypeName.size()); + AccessKey += std::to_string(FirstIndex); + + // Traverse the rest of access chain to complete offset calculation + // and access key construction. + while (CallStack.size()) { + auto StackElem = CallStack.top(); + CInfo = StackElem.second; + CallStack.pop(); + + if (CInfo.Kind == BPFPreserveFieldInfoAI) + break; + + // If the next Call (the top of the stack) is a BPFPreserveFieldInfoAI, + // the action will be extracting field info. + if (CallStack.size()) { + auto StackElem2 = CallStack.top(); + CallInfo CInfo2 = StackElem2.second; + if (CInfo2.Kind == BPFPreserveFieldInfoAI) { + InfoKind = CInfo2.AccessIndex; + assert(CallStack.size() == 1); + } + } - // Construct the type string AccessStr. - for (unsigned I = 0; I < AccessIndices.size(); ++I) - AccessStr = std::to_string(AccessIndices[I]) + ":" + AccessStr; + // Access Index + uint64_t AccessIndex = CInfo.AccessIndex; + AccessKey += ":" + std::to_string(AccessIndex); - if (TypeNameIndex == AccessIndices.size() - 1) - AccessStr = "0:" + AccessStr; + MDNode *MDN = CInfo.Metadata; + // At this stage, it cannot be pointer type. + auto *CTy = cast(stripQualifiers(cast(MDN))); + PatchImm = GetFieldInfo(InfoKind, CTy, AccessIndex, PatchImm); + } - // Access key is the type name + access string, uniquely identifying - // one kernel memory access. - AccessKey = LastTypeName + ":" + AccessStr; + // Access key is the type name + reloc type + patched imm + access string, + // uniquely identifying one relocation. + AccessKey = TypeName + ":" + std::to_string(InfoKind) + ":" + + std::to_string(PatchImm) + "$" + AccessKey; return Base; } @@ -409,39 +838,52 @@ Value *BPFAbstractMemberAccess::computeBaseAndAccessStr(CallInst *Call, /// Call/Kind is the base preserve_*_access_index() call. 
Attempts to do /// transformation to a chain of relocable GEPs. bool BPFAbstractMemberAccess::transformGEPChain(Module &M, CallInst *Call, - uint32_t Kind) { - std::string AccessStr, AccessKey; - MDNode *TypeMeta = nullptr; + CallInfo &CInfo) { + std::string AccessKey; + MDNode *TypeMeta; Value *Base = - computeBaseAndAccessStr(Call, AccessStr, AccessKey, Kind, TypeMeta); + computeBaseAndAccessKey(Call, CInfo, AccessKey, TypeMeta); if (!Base) return false; - // Do the transformation - // For any original GEP Call and Base %2 like - // %4 = bitcast %struct.net_device** %dev1 to i64* - // it is transformed to: - // %6 = load __BTF_0:sk_buff:0:0:2:0: - // %7 = bitcast %struct.sk_buff* %2 to i8* - // %8 = getelementptr i8, i8* %7, %6 - // %9 = bitcast i8* %8 to i64* - // using %9 instead of %4 - // The original Call inst is removed. BasicBlock *BB = Call->getParent(); GlobalVariable *GV; if (GEPGlobals.find(AccessKey) == GEPGlobals.end()) { - GV = new GlobalVariable(M, Type::getInt64Ty(BB->getContext()), false, - GlobalVariable::ExternalLinkage, NULL, AccessStr); + IntegerType *VarType; + if (CInfo.Kind == BPFPreserveFieldInfoAI) + VarType = Type::getInt32Ty(BB->getContext()); // 32bit return value + else + VarType = Type::getInt64Ty(BB->getContext()); // 64bit ptr arith + + GV = new GlobalVariable(M, VarType, false, GlobalVariable::ExternalLinkage, + NULL, AccessKey); GV->addAttribute(BPFCoreSharedInfo::AmaAttr); - // Set the metadata (debuginfo types) for the global. - if (TypeMeta) - GV->setMetadata(LLVMContext::MD_preserve_access_index, TypeMeta); + GV->setMetadata(LLVMContext::MD_preserve_access_index, TypeMeta); GEPGlobals[AccessKey] = GV; } else { GV = GEPGlobals[AccessKey]; } + if (CInfo.Kind == BPFPreserveFieldInfoAI) { + // Load the global variable which represents the returned field info. + auto *LDInst = new LoadInst(Type::getInt32Ty(BB->getContext()), GV); + BB->getInstList().insert(Call->getIterator(), LDInst); + Call->replaceAllUsesWith(LDInst); + Call->eraseFromParent(); + return true; + } + + // For any original GEP Call and Base %2 like + // %4 = bitcast %struct.net_device** %dev1 to i64* + // it is transformed to: + // %6 = load sk_buff:50:$0:0:0:2:0 + // %7 = bitcast %struct.sk_buff* %2 to i8* + // %8 = getelementptr i8, i8* %7, %6 + // %9 = bitcast i8* %8 to i64* + // using %9 instead of %4 + // The original Call inst is removed. + // Load the global variable. auto *LDInst = new LoadInst(Type::getInt64Ty(BB->getContext()), GV); BB->getInstList().insert(Call->getIterator(), LDInst); diff --git a/lib/Target/BPF/BPFAsmPrinter.cpp b/lib/Target/BPF/BPFAsmPrinter.cpp index e61e73468057..218b0302927c 100644 --- a/lib/Target/BPF/BPFAsmPrinter.cpp +++ b/lib/Target/BPF/BPFAsmPrinter.cpp @@ -59,7 +59,7 @@ bool BPFAsmPrinter::doInitialization(Module &M) { AsmPrinter::doInitialization(M); // Only emit BTF when debuginfo available. 
- if (MAI->doesSupportDebugInformation() && !empty(M.debug_compile_units())) { + if (MAI->doesSupportDebugInformation() && !M.debug_compile_units().empty()) { BTF = new BTFDebug(this); Handlers.push_back(HandlerInfo(std::unique_ptr(BTF), "emit", "Debug Info Emission", "BTF", diff --git a/lib/Target/BPF/BPFCORE.h b/lib/Target/BPF/BPFCORE.h index e0950d95f8d7..ed4778353e52 100644 --- a/lib/Target/BPF/BPFCORE.h +++ b/lib/Target/BPF/BPFCORE.h @@ -13,10 +13,18 @@ namespace llvm { class BPFCoreSharedInfo { public: - /// The attribute attached to globals representing a member offset + enum OffsetRelocKind : uint32_t { + FIELD_BYTE_OFFSET = 0, + FIELD_BYTE_SIZE, + FIELD_EXISTENCE, + FIELD_SIGNEDNESS, + FIELD_LSHIFT_U64, + FIELD_RSHIFT_U64, + + MAX_FIELD_RELOC_KIND, + }; + /// The attribute attached to globals representing a field access static const std::string AmaAttr; - /// The section name to identify a patchable external global - static const std::string PatchableExtSecName; }; } // namespace llvm diff --git a/lib/Target/BPF/BPFFrameLowering.h b/lib/Target/BPF/BPFFrameLowering.h index 2dc6277d2244..a546351ec6cb 100644 --- a/lib/Target/BPF/BPFFrameLowering.h +++ b/lib/Target/BPF/BPFFrameLowering.h @@ -21,7 +21,7 @@ class BPFSubtarget; class BPFFrameLowering : public TargetFrameLowering { public: explicit BPFFrameLowering(const BPFSubtarget &sti) - : TargetFrameLowering(TargetFrameLowering::StackGrowsDown, 8, 0) {} + : TargetFrameLowering(TargetFrameLowering::StackGrowsDown, Align(8), 0) {} void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override; void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override; diff --git a/lib/Target/BPF/BPFISelDAGToDAG.cpp b/lib/Target/BPF/BPFISelDAGToDAG.cpp index 1bd705c55188..f2be0ff070d2 100644 --- a/lib/Target/BPF/BPFISelDAGToDAG.cpp +++ b/lib/Target/BPF/BPFISelDAGToDAG.cpp @@ -45,9 +45,7 @@ class BPFDAGToDAGISel : public SelectionDAGISel { public: explicit BPFDAGToDAGISel(BPFTargetMachine &TM) - : SelectionDAGISel(TM), Subtarget(nullptr) { - curr_func_ = nullptr; - } + : SelectionDAGISel(TM), Subtarget(nullptr) {} StringRef getPassName() const override { return "BPF DAG->DAG Pattern Instruction Selection"; @@ -92,14 +90,8 @@ private: val_vec_type &Vals, int Offset); bool getConstantFieldValue(const GlobalAddressSDNode *Node, uint64_t Offset, uint64_t Size, unsigned char *ByteSeq); - bool checkLoadDef(unsigned DefReg, unsigned match_load_op); - // Mapping from ConstantStruct global value to corresponding byte-list values std::map cs_vals_; - // Mapping from vreg to load memory opcode - std::map load_to_vreg_; - // Current function - const Function *curr_func_; }; } // namespace @@ -325,32 +317,13 @@ void BPFDAGToDAGISel::PreprocessLoad(SDNode *Node, } void BPFDAGToDAGISel::PreprocessISelDAG() { - // Iterate through all nodes, interested in the following cases: + // Iterate through all nodes, interested in the following case: // // . loads from ConstantStruct or ConstantArray of constructs // which can be turns into constant itself, with this we can // avoid reading from read-only section at runtime. // - // . reg truncating is often the result of 8/16/32bit->64bit or - // 8/16bit->32bit conversion. If the reg value is loaded with - // masked byte width, the AND operation can be removed since - // BPF LOAD already has zero extension. - // - // This also solved a correctness issue. 
- // In BPF socket-related program, e.g., __sk_buff->{data, data_end} - // are 32-bit registers, but later on, kernel verifier will rewrite - // it with 64-bit value. Therefore, truncating the value after the - // load will result in incorrect code. - - // clear the load_to_vreg_ map so that we have a clean start - // for this function. - if (!curr_func_) { - curr_func_ = FuncInfo->Fn; - } else if (curr_func_ != FuncInfo->Fn) { - load_to_vreg_.clear(); - curr_func_ = FuncInfo->Fn; - } - + // . Removing redundant AND for intrinsic narrow loads. for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(), E = CurDAG->allnodes_end(); I != E;) { @@ -358,8 +331,6 @@ void BPFDAGToDAGISel::PreprocessISelDAG() { unsigned Opcode = Node->getOpcode(); if (Opcode == ISD::LOAD) PreprocessLoad(Node, I); - else if (Opcode == ISD::CopyToReg) - PreprocessCopyToReg(Node); else if (Opcode == ISD::AND) PreprocessTrunc(Node, I); } @@ -491,37 +462,6 @@ bool BPFDAGToDAGISel::fillConstantStruct(const DataLayout &DL, return true; } -void BPFDAGToDAGISel::PreprocessCopyToReg(SDNode *Node) { - const RegisterSDNode *RegN = dyn_cast(Node->getOperand(1)); - if (!RegN || !TargetRegisterInfo::isVirtualRegister(RegN->getReg())) - return; - - const LoadSDNode *LD = dyn_cast(Node->getOperand(2)); - if (!LD) - return; - - // Assign a load value to a virtual register. record its load width - unsigned mem_load_op = 0; - switch (LD->getMemOperand()->getSize()) { - default: - return; - case 4: - mem_load_op = BPF::LDW; - break; - case 2: - mem_load_op = BPF::LDH; - break; - case 1: - mem_load_op = BPF::LDB; - break; - } - - LLVM_DEBUG(dbgs() << "Find Load Value to VReg " - << TargetRegisterInfo::virtReg2Index(RegN->getReg()) - << '\n'); - load_to_vreg_[RegN->getReg()] = mem_load_op; -} - void BPFDAGToDAGISel::PreprocessTrunc(SDNode *Node, SelectionDAG::allnodes_iterator &I) { ConstantSDNode *MaskN = dyn_cast(Node->getOperand(1)); @@ -535,112 +475,26 @@ void BPFDAGToDAGISel::PreprocessTrunc(SDNode *Node, // which the generic optimizer doesn't understand their results are // zero extended. SDValue BaseV = Node->getOperand(0); - if (BaseV.getOpcode() == ISD::INTRINSIC_W_CHAIN) { - unsigned IntNo = cast(BaseV->getOperand(1))->getZExtValue(); - uint64_t MaskV = MaskN->getZExtValue(); - - if (!((IntNo == Intrinsic::bpf_load_byte && MaskV == 0xFF) || - (IntNo == Intrinsic::bpf_load_half && MaskV == 0xFFFF) || - (IntNo == Intrinsic::bpf_load_word && MaskV == 0xFFFFFFFF))) - return; - - LLVM_DEBUG(dbgs() << "Remove the redundant AND operation in: "; - Node->dump(); dbgs() << '\n'); - - I--; - CurDAG->ReplaceAllUsesWith(SDValue(Node, 0), BaseV); - I++; - CurDAG->DeleteNode(Node); - - return; - } - - // Multiple basic blocks case. 
- if (BaseV.getOpcode() != ISD::CopyFromReg) + if (BaseV.getOpcode() != ISD::INTRINSIC_W_CHAIN) return; - unsigned match_load_op = 0; - switch (MaskN->getZExtValue()) { - default: - return; - case 0xFFFFFFFF: - match_load_op = BPF::LDW; - break; - case 0xFFFF: - match_load_op = BPF::LDH; - break; - case 0xFF: - match_load_op = BPF::LDB; - break; - } + unsigned IntNo = cast(BaseV->getOperand(1))->getZExtValue(); + uint64_t MaskV = MaskN->getZExtValue(); - const RegisterSDNode *RegN = - dyn_cast(BaseV.getNode()->getOperand(1)); - if (!RegN || !TargetRegisterInfo::isVirtualRegister(RegN->getReg())) + if (!((IntNo == Intrinsic::bpf_load_byte && MaskV == 0xFF) || + (IntNo == Intrinsic::bpf_load_half && MaskV == 0xFFFF) || + (IntNo == Intrinsic::bpf_load_word && MaskV == 0xFFFFFFFF))) return; - unsigned AndOpReg = RegN->getReg(); - LLVM_DEBUG(dbgs() << "Examine " << printReg(AndOpReg) << '\n'); - - // Examine the PHI insns in the MachineBasicBlock to found out the - // definitions of this virtual register. At this stage (DAG2DAG - // transformation), only PHI machine insns are available in the machine basic - // block. - MachineBasicBlock *MBB = FuncInfo->MBB; - MachineInstr *MII = nullptr; - for (auto &MI : *MBB) { - for (unsigned i = 0; i < MI.getNumOperands(); ++i) { - const MachineOperand &MOP = MI.getOperand(i); - if (!MOP.isReg() || !MOP.isDef()) - continue; - unsigned Reg = MOP.getReg(); - if (TargetRegisterInfo::isVirtualRegister(Reg) && Reg == AndOpReg) { - MII = &MI; - break; - } - } - } - - if (MII == nullptr) { - // No phi definition in this block. - if (!checkLoadDef(AndOpReg, match_load_op)) - return; - } else { - // The PHI node looks like: - // %2 = PHI %0, <%bb.1>, %1, <%bb.3> - // Trace each incoming definition, e.g., (%0, %bb.1) and (%1, %bb.3) - // The AND operation can be removed if both %0 in %bb.1 and %1 in - // %bb.3 are defined with a load matching the MaskN. - LLVM_DEBUG(dbgs() << "Check PHI Insn: "; MII->dump(); dbgs() << '\n'); - unsigned PrevReg = -1; - for (unsigned i = 0; i < MII->getNumOperands(); ++i) { - const MachineOperand &MOP = MII->getOperand(i); - if (MOP.isReg()) { - if (MOP.isDef()) - continue; - PrevReg = MOP.getReg(); - if (!TargetRegisterInfo::isVirtualRegister(PrevReg)) - return; - if (!checkLoadDef(PrevReg, match_load_op)) - return; - } - } - } - LLVM_DEBUG(dbgs() << "Remove the redundant AND operation in: "; Node->dump(); - dbgs() << '\n'); + LLVM_DEBUG(dbgs() << "Remove the redundant AND operation in: "; + Node->dump(); dbgs() << '\n'); I--; CurDAG->ReplaceAllUsesWith(SDValue(Node, 0), BaseV); I++; CurDAG->DeleteNode(Node); -} - -bool BPFDAGToDAGISel::checkLoadDef(unsigned DefReg, unsigned match_load_op) { - auto it = load_to_vreg_.find(DefReg); - if (it == load_to_vreg_.end()) - return false; // The definition of register is not exported yet. 
- return it->second == match_load_op; + return; } FunctionPass *llvm::createBPFISelDag(BPFTargetMachine &TM) { diff --git a/lib/Target/BPF/BPFISelLowering.cpp b/lib/Target/BPF/BPFISelLowering.cpp index ff69941d26fb..56e0288f26c9 100644 --- a/lib/Target/BPF/BPFISelLowering.cpp +++ b/lib/Target/BPF/BPFISelLowering.cpp @@ -132,9 +132,9 @@ BPFTargetLowering::BPFTargetLowering(const TargetMachine &TM, setBooleanContents(ZeroOrOneBooleanContent); - // Function alignments (log2) - setMinFunctionAlignment(3); - setPrefFunctionAlignment(3); + // Function alignments + setMinFunctionAlignment(Align(8)); + setPrefFunctionAlignment(Align(8)); if (BPFExpandMemcpyInOrder) { // LLVM generic code will try to expand memcpy into load/store pairs at this @@ -236,9 +236,8 @@ SDValue BPFTargetLowering::LowerFormalArguments( } case MVT::i32: case MVT::i64: - unsigned VReg = RegInfo.createVirtualRegister(SimpleTy == MVT::i64 ? - &BPF::GPRRegClass : - &BPF::GPR32RegClass); + Register VReg = RegInfo.createVirtualRegister( + SimpleTy == MVT::i64 ? &BPF::GPRRegClass : &BPF::GPR32RegClass); RegInfo.addLiveIn(VA.getLocReg(), VReg); SDValue ArgValue = DAG.getCopyFromReg(Chain, DL, VReg, RegVT); @@ -571,9 +570,9 @@ BPFTargetLowering::EmitSubregExt(MachineInstr &MI, MachineBasicBlock *BB, DebugLoc DL = MI.getDebugLoc(); MachineRegisterInfo &RegInfo = F->getRegInfo(); - unsigned PromotedReg0 = RegInfo.createVirtualRegister(RC); - unsigned PromotedReg1 = RegInfo.createVirtualRegister(RC); - unsigned PromotedReg2 = RegInfo.createVirtualRegister(RC); + Register PromotedReg0 = RegInfo.createVirtualRegister(RC); + Register PromotedReg1 = RegInfo.createVirtualRegister(RC); + Register PromotedReg2 = RegInfo.createVirtualRegister(RC); BuildMI(BB, DL, TII.get(BPF::MOV_32_64), PromotedReg0).addReg(Reg); BuildMI(BB, DL, TII.get(BPF::SLL_ri), PromotedReg1) .addReg(PromotedReg0).addImm(32); @@ -699,7 +698,7 @@ BPFTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, report_fatal_error("unimplemented select CondCode " + Twine(CC)); } - unsigned LHS = MI.getOperand(1).getReg(); + Register LHS = MI.getOperand(1).getReg(); bool isSignedCmp = (CC == ISD::SETGT || CC == ISD::SETGE || CC == ISD::SETLT || @@ -716,7 +715,7 @@ BPFTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, LHS = EmitSubregExt(MI, BB, LHS, isSignedCmp); if (isSelectRROp) { - unsigned RHS = MI.getOperand(2).getReg(); + Register RHS = MI.getOperand(2).getReg(); if (is32BitCmp && !HasJmp32) RHS = EmitSubregExt(MI, BB, RHS, isSignedCmp); diff --git a/lib/Target/BPF/BPFInstrInfo.cpp b/lib/Target/BPF/BPFInstrInfo.cpp index 932f718d5490..6de3a4084d3d 100644 --- a/lib/Target/BPF/BPFInstrInfo.cpp +++ b/lib/Target/BPF/BPFInstrInfo.cpp @@ -43,11 +43,11 @@ void BPFInstrInfo::copyPhysReg(MachineBasicBlock &MBB, } void BPFInstrInfo::expandMEMCPY(MachineBasicBlock::iterator MI) const { - unsigned DstReg = MI->getOperand(0).getReg(); - unsigned SrcReg = MI->getOperand(1).getReg(); + Register DstReg = MI->getOperand(0).getReg(); + Register SrcReg = MI->getOperand(1).getReg(); uint64_t CopyLen = MI->getOperand(2).getImm(); uint64_t Alignment = MI->getOperand(3).getImm(); - unsigned ScratchReg = MI->getOperand(4).getReg(); + Register ScratchReg = MI->getOperand(4).getReg(); MachineBasicBlock *BB = MI->getParent(); DebugLoc dl = MI->getDebugLoc(); unsigned LdOpc, StOpc; diff --git a/lib/Target/BPF/BPFInstrInfo.td b/lib/Target/BPF/BPFInstrInfo.td index c44702a78ec8..ae5a82a99303 100644 --- a/lib/Target/BPF/BPFInstrInfo.td +++ b/lib/Target/BPF/BPFInstrInfo.td @@ -473,7 +473,7 
@@ class CALL class CALLX : TYPE_ALU_JMP { bits<32> BrDst; diff --git a/lib/Target/BPF/BPFMIChecking.cpp b/lib/Target/BPF/BPFMIChecking.cpp index 4c46289656b4..f82f166eda4d 100644 --- a/lib/Target/BPF/BPFMIChecking.cpp +++ b/lib/Target/BPF/BPFMIChecking.cpp @@ -19,6 +19,7 @@ #include "BPFTargetMachine.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/Support/Debug.h" using namespace llvm; diff --git a/lib/Target/BPF/BPFMIPeephole.cpp b/lib/Target/BPF/BPFMIPeephole.cpp index 156ba793e359..e9eecc55c3c3 100644 --- a/lib/Target/BPF/BPFMIPeephole.cpp +++ b/lib/Target/BPF/BPFMIPeephole.cpp @@ -26,6 +26,7 @@ #include "llvm/ADT/Statistic.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/Support/Debug.h" using namespace llvm; @@ -71,7 +72,7 @@ void BPFMIPeephole::initialize(MachineFunction &MFParm) { MF = &MFParm; MRI = &MF->getRegInfo(); TII = MF->getSubtarget().getInstrInfo(); - LLVM_DEBUG(dbgs() << "*** BPF MachineSSA peephole pass ***\n\n"); + LLVM_DEBUG(dbgs() << "*** BPF MachineSSA ZEXT Elim peephole pass ***\n\n"); } bool BPFMIPeephole::isMovFrom32Def(MachineInstr *MovMI) @@ -104,10 +105,10 @@ bool BPFMIPeephole::isMovFrom32Def(MachineInstr *MovMI) if (!opnd.isReg()) return false; - unsigned Reg = opnd.getReg(); - if ((TargetRegisterInfo::isVirtualRegister(Reg) && + Register Reg = opnd.getReg(); + if ((Register::isVirtualRegister(Reg) && MRI->getRegClass(Reg) == &BPF::GPRRegClass)) - return false; + return false; } LLVM_DEBUG(dbgs() << " One ZExt elim sequence identified.\n"); @@ -134,8 +135,8 @@ bool BPFMIPeephole::eliminateZExtSeq(void) { // SRL_ri rB, rB, 32 if (MI.getOpcode() == BPF::SRL_ri && MI.getOperand(2).getImm() == 32) { - unsigned DstReg = MI.getOperand(0).getReg(); - unsigned ShfReg = MI.getOperand(1).getReg(); + Register DstReg = MI.getOperand(0).getReg(); + Register ShfReg = MI.getOperand(1).getReg(); MachineInstr *SllMI = MRI->getVRegDef(ShfReg); LLVM_DEBUG(dbgs() << "Starting SRL found:"); @@ -159,7 +160,7 @@ bool BPFMIPeephole::eliminateZExtSeq(void) { LLVM_DEBUG(dbgs() << " Type cast Mov found:"); LLVM_DEBUG(MovMI->dump()); - unsigned SubReg = MovMI->getOperand(1).getReg(); + Register SubReg = MovMI->getOperand(1).getReg(); if (!isMovFrom32Def(MovMI)) { LLVM_DEBUG(dbgs() << " One ZExt elim sequence failed qualifying elim.\n"); @@ -186,7 +187,8 @@ bool BPFMIPeephole::eliminateZExtSeq(void) { } // end default namespace INITIALIZE_PASS(BPFMIPeephole, DEBUG_TYPE, - "BPF MachineSSA Peephole Optimization", false, false) + "BPF MachineSSA Peephole Optimization For ZEXT Eliminate", + false, false) char BPFMIPeephole::ID = 0; FunctionPass* llvm::createBPFMIPeepholePass() { return new BPFMIPeephole(); } @@ -253,12 +255,16 @@ bool BPFMIPreEmitPeephole::eliminateRedundantMov(void) { // enabled. The special type cast insn MOV_32_64 involves different // register class on src (i32) and dst (i64), RA could generate useless // instruction due to this. 
- if (MI.getOpcode() == BPF::MOV_32_64) { - unsigned dst = MI.getOperand(0).getReg(); - unsigned dst_sub = TRI->getSubReg(dst, BPF::sub_32); - unsigned src = MI.getOperand(1).getReg(); + unsigned Opcode = MI.getOpcode(); + if (Opcode == BPF::MOV_32_64 || + Opcode == BPF::MOV_rr || Opcode == BPF::MOV_rr_32) { + Register dst = MI.getOperand(0).getReg(); + Register src = MI.getOperand(1).getReg(); + + if (Opcode == BPF::MOV_32_64) + dst = TRI->getSubReg(dst, BPF::sub_32); - if (dst_sub != src) + if (dst != src) continue; ToErase = &MI; @@ -281,3 +287,177 @@ FunctionPass* llvm::createBPFMIPreEmitPeepholePass() { return new BPFMIPreEmitPeephole(); } + +STATISTIC(TruncElemNum, "Number of truncation eliminated"); + +namespace { + +struct BPFMIPeepholeTruncElim : public MachineFunctionPass { + + static char ID; + const BPFInstrInfo *TII; + MachineFunction *MF; + MachineRegisterInfo *MRI; + + BPFMIPeepholeTruncElim() : MachineFunctionPass(ID) { + initializeBPFMIPeepholeTruncElimPass(*PassRegistry::getPassRegistry()); + } + +private: + // Initialize class variables. + void initialize(MachineFunction &MFParm); + + bool eliminateTruncSeq(void); + +public: + + // Main entry point for this pass. + bool runOnMachineFunction(MachineFunction &MF) override { + if (skipFunction(MF.getFunction())) + return false; + + initialize(MF); + + return eliminateTruncSeq(); + } +}; + +static bool TruncSizeCompatible(int TruncSize, unsigned opcode) +{ + if (TruncSize == 1) + return opcode == BPF::LDB || opcode == BPF::LDB32; + + if (TruncSize == 2) + return opcode == BPF::LDH || opcode == BPF::LDH32; + + if (TruncSize == 4) + return opcode == BPF::LDW || opcode == BPF::LDW32; + + return false; +} + +// Initialize class variables. +void BPFMIPeepholeTruncElim::initialize(MachineFunction &MFParm) { + MF = &MFParm; + MRI = &MF->getRegInfo(); + TII = MF->getSubtarget().getInstrInfo(); + LLVM_DEBUG(dbgs() << "*** BPF MachineSSA TRUNC Elim peephole pass ***\n\n"); +} + +// Reg truncating is often the result of 8/16/32bit->64bit or +// 8/16bit->32bit conversion. If the reg value is loaded with +// masked byte width, the AND operation can be removed since +// BPF LOAD already has zero extension. +// +// This also solved a correctness issue. +// In BPF socket-related program, e.g., __sk_buff->{data, data_end} +// are 32-bit registers, but later on, kernel verifier will rewrite +// it with 64-bit value. Therefore, truncating the value after the +// load will result in incorrect code. +bool BPFMIPeepholeTruncElim::eliminateTruncSeq(void) { + MachineInstr* ToErase = nullptr; + bool Eliminated = false; + + for (MachineBasicBlock &MBB : *MF) { + for (MachineInstr &MI : MBB) { + // The second insn to remove if the eliminate candidate is a pair. + MachineInstr *MI2 = nullptr; + Register DstReg, SrcReg; + MachineInstr *DefMI; + int TruncSize = -1; + + // If the previous instruction was marked for elimination, remove it now. + if (ToErase) { + ToErase->eraseFromParent(); + ToErase = nullptr; + } + + // AND A, 0xFFFFFFFF will be turned into SLL/SRL pair due to immediate + // for BPF ANDI is i32, and this case only happens on ALU64. + if (MI.getOpcode() == BPF::SRL_ri && + MI.getOperand(2).getImm() == 32) { + SrcReg = MI.getOperand(1).getReg(); + MI2 = MRI->getVRegDef(SrcReg); + DstReg = MI.getOperand(0).getReg(); + + if (!MI2 || + MI2->getOpcode() != BPF::SLL_ri || + MI2->getOperand(2).getImm() != 32) + continue; + + // Update SrcReg. 
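A minimal sketch of the mask-to-width classification eliminateTruncSeq() applies: the 0xffffffff case cannot appear as an AND because the BPF ANDI immediate is i32, so it arrives as the SLL/SRL-by-32 pair matched just above, while the byte and halfword masks are checked just below. The helper name is illustrative:

#include <cstdint>

static int truncSizeForMask(int64_t Mask) {
  if (Mask == 0xff)
    return 1;   // already guaranteed by the zero extension of LDB/LDB32
  if (Mask == 0xffff)
    return 2;   // already guaranteed by the zero extension of LDH/LDH32
  return -1;    // not a truncation this peephole recognizes
}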
+ SrcReg = MI2->getOperand(1).getReg(); + DefMI = MRI->getVRegDef(SrcReg); + if (DefMI) + TruncSize = 4; + } else if (MI.getOpcode() == BPF::AND_ri || + MI.getOpcode() == BPF::AND_ri_32) { + SrcReg = MI.getOperand(1).getReg(); + DstReg = MI.getOperand(0).getReg(); + DefMI = MRI->getVRegDef(SrcReg); + + if (!DefMI) + continue; + + int64_t imm = MI.getOperand(2).getImm(); + if (imm == 0xff) + TruncSize = 1; + else if (imm == 0xffff) + TruncSize = 2; + } + + if (TruncSize == -1) + continue; + + // The definition is PHI node, check all inputs. + if (DefMI->isPHI()) { + bool CheckFail = false; + + for (unsigned i = 1, e = DefMI->getNumOperands(); i < e; i += 2) { + MachineOperand &opnd = DefMI->getOperand(i); + if (!opnd.isReg()) { + CheckFail = true; + break; + } + + MachineInstr *PhiDef = MRI->getVRegDef(opnd.getReg()); + if (!PhiDef || PhiDef->isPHI() || + !TruncSizeCompatible(TruncSize, PhiDef->getOpcode())) { + CheckFail = true; + break; + } + } + + if (CheckFail) + continue; + } else if (!TruncSizeCompatible(TruncSize, DefMI->getOpcode())) { + continue; + } + + BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(BPF::MOV_rr), DstReg) + .addReg(SrcReg); + + if (MI2) + MI2->eraseFromParent(); + + // Mark it to ToErase, and erase in the next iteration. + ToErase = &MI; + TruncElemNum++; + Eliminated = true; + } + } + + return Eliminated; +} + +} // end default namespace + +INITIALIZE_PASS(BPFMIPeepholeTruncElim, "bpf-mi-trunc-elim", + "BPF MachineSSA Peephole Optimization For TRUNC Eliminate", + false, false) + +char BPFMIPeepholeTruncElim::ID = 0; +FunctionPass* llvm::createBPFMIPeepholeTruncElimPass() +{ + return new BPFMIPeepholeTruncElim(); +} diff --git a/lib/Target/BPF/BPFMISimplifyPatchable.cpp b/lib/Target/BPF/BPFMISimplifyPatchable.cpp index e9114d7187e3..9c689aed6417 100644 --- a/lib/Target/BPF/BPFMISimplifyPatchable.cpp +++ b/lib/Target/BPF/BPFMISimplifyPatchable.cpp @@ -11,19 +11,15 @@ // ldd r2, r1, 0 // add r3, struct_base_reg, r2 // -// Here @global should either present a AMA (abstruct member access) or -// a patchable extern variable. And these two kinds of accesses -// are subject to bpf load time patching. After this pass, the +// Here @global should represent an AMA (abstruct member access). +// Such an access is subject to bpf load time patching. After this pass, the // code becomes // ld_imm64 r1, @global // add r3, struct_base_reg, r1 // // Eventually, at BTF output stage, a relocation record will be generated // for ld_imm64 which should be replaced later by bpf loader: -// r1 = or -// add r3, struct_base_reg, r1 -// or -// ld_imm64 r1, +// r1 = // add r3, struct_base_reg, r1 // //===----------------------------------------------------------------------===// @@ -34,6 +30,7 @@ #include "BPFTargetMachine.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/Support/Debug.h" using namespace llvm; @@ -100,9 +97,8 @@ bool BPFMISimplifyPatchable::removeLD() { if (!MI.getOperand(2).isImm() || MI.getOperand(2).getImm()) continue; - unsigned DstReg = MI.getOperand(0).getReg(); - unsigned SrcReg = MI.getOperand(1).getReg(); - int64_t ImmVal = MI.getOperand(2).getImm(); + Register DstReg = MI.getOperand(0).getReg(); + Register SrcReg = MI.getOperand(1).getReg(); MachineInstr *DefInst = MRI->getUniqueVRegDef(SrcReg); if (!DefInst) @@ -118,17 +114,8 @@ bool BPFMISimplifyPatchable::removeLD() { // Global variables representing structure offset or // patchable extern globals. 
if (GVar->hasAttribute(BPFCoreSharedInfo::AmaAttr)) { - assert(ImmVal == 0); + assert(MI.getOperand(2).getImm() == 0); IsCandidate = true; - } else if (!GVar->hasInitializer() && GVar->hasExternalLinkage() && - GVar->getSection() == - BPFCoreSharedInfo::PatchableExtSecName) { - if (ImmVal == 0) - IsCandidate = true; - else - errs() << "WARNING: unhandled patchable extern " - << GVar->getName() << " with load offset " << ImmVal - << "\n"; } } } diff --git a/lib/Target/BPF/BPFRegisterInfo.cpp b/lib/Target/BPF/BPFRegisterInfo.cpp index 714af06e11d9..8de81a469b84 100644 --- a/lib/Target/BPF/BPFRegisterInfo.cpp +++ b/lib/Target/BPF/BPFRegisterInfo.cpp @@ -77,7 +77,7 @@ void BPFRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, assert(i < MI.getNumOperands() && "Instr doesn't have FrameIndex operand!"); } - unsigned FrameReg = getFrameRegister(MF); + Register FrameReg = getFrameRegister(MF); int FrameIndex = MI.getOperand(i).getIndex(); const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); @@ -86,7 +86,7 @@ void BPFRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, WarnSize(Offset, MF, DL); MI.getOperand(i).ChangeToRegister(FrameReg, false); - unsigned reg = MI.getOperand(i - 1).getReg(); + Register reg = MI.getOperand(i - 1).getReg(); BuildMI(MBB, ++II, DL, TII.get(BPF::ADD_ri), reg) .addReg(reg) .addImm(Offset); @@ -105,7 +105,7 @@ void BPFRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, // architecture does not really support FI_ri, replace it with // MOV_rr , frame_reg // ADD_ri , imm - unsigned reg = MI.getOperand(i - 1).getReg(); + Register reg = MI.getOperand(i - 1).getReg(); BuildMI(MBB, ++II, DL, TII.get(BPF::MOV_rr), reg) .addReg(FrameReg); diff --git a/lib/Target/BPF/BPFTargetMachine.cpp b/lib/Target/BPF/BPFTargetMachine.cpp index 24c0ff0f7f15..0c4f2c74e7a4 100644 --- a/lib/Target/BPF/BPFTargetMachine.cpp +++ b/lib/Target/BPF/BPFTargetMachine.cpp @@ -36,6 +36,7 @@ extern "C" void LLVMInitializeBPFTarget() { PassRegistry &PR = *PassRegistry::getPassRegistry(); initializeBPFAbstractMemberAccessPass(PR); initializeBPFMIPeepholePass(PR); + initializeBPFMIPeepholeTruncElimPass(PR); } // DataLayout: little or big endian @@ -61,7 +62,7 @@ BPFTargetMachine::BPFTargetMachine(const Target &T, const Triple &TT, : LLVMTargetMachine(T, computeDataLayout(TT), TT, CPU, FS, Options, getEffectiveRelocModel(RM), getEffectiveCodeModel(CM, CodeModel::Small), OL), - TLOF(make_unique()), + TLOF(std::make_unique()), Subtarget(TT, CPU, FS, *this) { initAsmInfo(); @@ -94,7 +95,7 @@ TargetPassConfig *BPFTargetMachine::createPassConfig(PassManagerBase &PM) { void BPFPassConfig::addIRPasses() { - addPass(createBPFAbstractMemberAccess()); + addPass(createBPFAbstractMemberAccess(&getBPFTargetMachine())); TargetPassConfig::addIRPasses(); } @@ -115,15 +116,16 @@ void BPFPassConfig::addMachineSSAOptimization() { TargetPassConfig::addMachineSSAOptimization(); const BPFSubtarget *Subtarget = getBPFTargetMachine().getSubtargetImpl(); - if (Subtarget->getHasAlu32() && !DisableMIPeephole) - addPass(createBPFMIPeepholePass()); + if (!DisableMIPeephole) { + if (Subtarget->getHasAlu32()) + addPass(createBPFMIPeepholePass()); + addPass(createBPFMIPeepholeTruncElimPass()); + } } void BPFPassConfig::addPreEmitPass() { - const BPFSubtarget *Subtarget = getBPFTargetMachine().getSubtargetImpl(); - addPass(createBPFMIPreEmitCheckingPass()); if (getOptLevel() != CodeGenOpt::None) - if (Subtarget->getHasAlu32() && !DisableMIPeephole) + if (!DisableMIPeephole) 
addPass(createBPFMIPreEmitPeepholePass()); } diff --git a/lib/Target/BPF/BTF.h b/lib/Target/BPF/BTF.h index ad56716710a6..a13c862bf840 100644 --- a/lib/Target/BPF/BTF.h +++ b/lib/Target/BPF/BTF.h @@ -17,7 +17,7 @@ /// /// The binary layout for .BTF.ext section: /// struct ExtHeader -/// FuncInfo, LineInfo, OffsetReloc and ExternReloc subsections +/// FuncInfo, LineInfo, FieldReloc and ExternReloc subsections /// The FuncInfo subsection is defined as below: /// BTFFuncInfo Size /// struct SecFuncInfo for ELF section #1 @@ -32,19 +32,12 @@ /// struct SecLineInfo for ELF section #2 /// A number of struct BPFLineInfo for ELF section #2 /// ... -/// The OffsetReloc subsection is defined as below: -/// BPFOffsetReloc Size -/// struct SecOffsetReloc for ELF section #1 -/// A number of struct BPFOffsetReloc for ELF section #1 -/// struct SecOffsetReloc for ELF section #2 -/// A number of struct BPFOffsetReloc for ELF section #2 -/// ... -/// The ExternReloc subsection is defined as below: -/// BPFExternReloc Size -/// struct SecExternReloc for ELF section #1 -/// A number of struct BPFExternReloc for ELF section #1 -/// struct SecExternReloc for ELF section #2 -/// A number of struct BPFExternReloc for ELF section #2 +/// The FieldReloc subsection is defined as below: +/// BPFFieldReloc Size +/// struct SecFieldReloc for ELF section #1 +/// A number of struct BPFFieldReloc for ELF section #1 +/// struct SecFieldReloc for ELF section #2 +/// A number of struct BPFFieldReloc for ELF section #2 /// ... /// /// The section formats are also defined at @@ -63,7 +56,7 @@ enum : uint32_t { MAGIC = 0xeB9F, VERSION = 1 }; /// Sizes in bytes of various things in the BTF format. enum { HeaderSize = 24, - ExtHeaderSize = 40, + ExtHeaderSize = 32, CommonTypeSize = 12, BTFArraySize = 12, BTFEnumSize = 8, @@ -72,12 +65,10 @@ enum { BTFDataSecVarSize = 12, SecFuncInfoSize = 8, SecLineInfoSize = 8, - SecOffsetRelocSize = 8, - SecExternRelocSize = 8, + SecFieldRelocSize = 8, BPFFuncInfoSize = 8, BPFLineInfoSize = 16, - BPFOffsetRelocSize = 12, - BPFExternRelocSize = 8, + BPFFieldRelocSize = 16, }; /// The .BTF section header definition. @@ -213,10 +204,8 @@ struct ExtHeader { uint32_t FuncInfoLen; ///< Length of func info section uint32_t LineInfoOff; ///< Offset of line info section uint32_t LineInfoLen; ///< Length of line info section - uint32_t OffsetRelocOff; ///< Offset of offset reloc section - uint32_t OffsetRelocLen; ///< Length of offset reloc section - uint32_t ExternRelocOff; ///< Offset of extern reloc section - uint32_t ExternRelocLen; ///< Length of extern reloc section + uint32_t FieldRelocOff; ///< Offset of offset reloc section + uint32_t FieldRelocLen; ///< Length of offset reloc section }; /// Specifying one function info. @@ -247,28 +236,17 @@ struct SecLineInfo { }; /// Specifying one offset relocation. -struct BPFOffsetReloc { +struct BPFFieldReloc { uint32_t InsnOffset; ///< Byte offset in this section uint32_t TypeID; ///< TypeID for the relocation uint32_t OffsetNameOff; ///< The string to traverse types + uint32_t RelocKind; ///< What to patch the instruction }; /// Specifying offset relocation's in one section. -struct SecOffsetReloc { - uint32_t SecNameOff; ///< Section name index in the .BTF string table - uint32_t NumOffsetReloc; ///< Number of offset reloc's in this section -}; - -/// Specifying one offset relocation. 
-struct BPFExternReloc { - uint32_t InsnOffset; ///< Byte offset in this section - uint32_t ExternNameOff; ///< The string for external variable -}; - -/// Specifying extern relocation's in one section. -struct SecExternReloc { +struct SecFieldReloc { uint32_t SecNameOff; ///< Section name index in the .BTF string table - uint32_t NumExternReloc; ///< Number of extern reloc's in this section + uint32_t NumFieldReloc; ///< Number of offset reloc's in this section }; } // End namespace BTF. diff --git a/lib/Target/BPF/BTFDebug.cpp b/lib/Target/BPF/BTFDebug.cpp index fa35c6619e21..db551e739bd7 100644 --- a/lib/Target/BPF/BTFDebug.cpp +++ b/lib/Target/BPF/BTFDebug.cpp @@ -184,9 +184,7 @@ void BTFTypeEnum::emitType(MCStreamer &OS) { } } -BTFTypeArray::BTFTypeArray(uint32_t ElemTypeId, uint32_t ElemSize, - uint32_t NumElems) - : ElemSize(ElemSize) { +BTFTypeArray::BTFTypeArray(uint32_t ElemTypeId, uint32_t NumElems) { Kind = BTF::BTF_KIND_ARRAY; BTFType.NameOff = 0; BTFType.Info = Kind << 24; @@ -216,12 +214,6 @@ void BTFTypeArray::emitType(MCStreamer &OS) { OS.EmitIntValue(ArrayInfo.Nelems, 4); } -void BTFTypeArray::getLocInfo(uint32_t Loc, uint32_t &LocOffset, - uint32_t &ElementTypeId) { - ElementTypeId = ArrayInfo.ElemType; - LocOffset = Loc * ElemSize; -} - /// Represent either a struct or a union. BTFTypeStruct::BTFTypeStruct(const DICompositeType *STy, bool IsStruct, bool HasBitField, uint32_t Vlen) @@ -251,7 +243,8 @@ void BTFTypeStruct::completeType(BTFDebug &BDebug) { } else { BTFMember.Offset = DDTy->getOffsetInBits(); } - BTFMember.Type = BDebug.getTypeId(DDTy->getBaseType()); + const auto *BaseTy = DDTy->getBaseType(); + BTFMember.Type = BDebug.getTypeId(BaseTy); Members.push_back(BTFMember); } } @@ -268,15 +261,6 @@ void BTFTypeStruct::emitType(MCStreamer &OS) { std::string BTFTypeStruct::getName() { return STy->getName(); } -void BTFTypeStruct::getMemberInfo(uint32_t Loc, uint32_t &MemberOffset, - uint32_t &MemberType) { - MemberType = Members[Loc].Type; - MemberOffset = - HasBitField ? Members[Loc].Offset & 0xffffff : Members[Loc].Offset; -} - -uint32_t BTFTypeStruct::getStructSize() { return STy->getSizeInBits() >> 3; } - /// The Func kind represents both subprogram and pointee of function /// pointers. If the FuncName is empty, it represents a pointee of function /// pointer. Otherwise, it represents a subprogram. The func arg names @@ -428,7 +412,7 @@ void BTFDebug::visitBasicType(const DIBasicType *BTy, uint32_t &TypeId) { // Create a BTF type instance for this DIBasicType and put it into // DIToIdMap for cross-type reference check. - auto TypeEntry = llvm::make_unique( + auto TypeEntry = std::make_unique( Encoding, BTy->getSizeInBits(), BTy->getOffsetInBits(), BTy->getName()); TypeId = addType(std::move(TypeEntry), BTy); } @@ -447,7 +431,7 @@ void BTFDebug::visitSubroutineType( // a function pointer has an empty name. The subprogram type will // not be added to DIToIdMap as it should not be referenced by // any other types. 
- auto TypeEntry = llvm::make_unique(STy, VLen, FuncArgNames); + auto TypeEntry = std::make_unique(STy, VLen, FuncArgNames); if (ForSubprog) TypeId = addType(std::move(TypeEntry)); // For subprogram else @@ -478,7 +462,7 @@ void BTFDebug::visitStructType(const DICompositeType *CTy, bool IsStruct, } auto TypeEntry = - llvm::make_unique(CTy, IsStruct, HasBitField, VLen); + std::make_unique(CTy, IsStruct, HasBitField, VLen); StructTypes.push_back(TypeEntry.get()); TypeId = addType(std::move(TypeEntry), CTy); @@ -489,35 +473,29 @@ void BTFDebug::visitStructType(const DICompositeType *CTy, bool IsStruct, void BTFDebug::visitArrayType(const DICompositeType *CTy, uint32_t &TypeId) { // Visit array element type. - uint32_t ElemTypeId, ElemSize; + uint32_t ElemTypeId; const DIType *ElemType = CTy->getBaseType(); visitTypeEntry(ElemType, ElemTypeId, false, false); - ElemSize = ElemType->getSizeInBits() >> 3; - if (!CTy->getSizeInBits()) { - auto TypeEntry = llvm::make_unique(ElemTypeId, 0, 0); - ArrayTypes.push_back(TypeEntry.get()); - ElemTypeId = addType(std::move(TypeEntry), CTy); - } else { - // Visit array dimensions. - DINodeArray Elements = CTy->getElements(); - for (int I = Elements.size() - 1; I >= 0; --I) { - if (auto *Element = dyn_cast_or_null(Elements[I])) - if (Element->getTag() == dwarf::DW_TAG_subrange_type) { - const DISubrange *SR = cast(Element); - auto *CI = SR->getCount().dyn_cast(); - int64_t Count = CI->getSExtValue(); - - auto TypeEntry = - llvm::make_unique(ElemTypeId, ElemSize, Count); - ArrayTypes.push_back(TypeEntry.get()); - if (I == 0) - ElemTypeId = addType(std::move(TypeEntry), CTy); - else - ElemTypeId = addType(std::move(TypeEntry)); - ElemSize = ElemSize * Count; - } - } + // Visit array dimensions. + DINodeArray Elements = CTy->getElements(); + for (int I = Elements.size() - 1; I >= 0; --I) { + if (auto *Element = dyn_cast_or_null(Elements[I])) + if (Element->getTag() == dwarf::DW_TAG_subrange_type) { + const DISubrange *SR = cast(Element); + auto *CI = SR->getCount().dyn_cast(); + int64_t Count = CI->getSExtValue(); + + // For struct s { int b; char c[]; }, the c[] will be represented + // as an array with Count = -1. + auto TypeEntry = + std::make_unique(ElemTypeId, + Count >= 0 ? Count : 0); + if (I == 0) + ElemTypeId = addType(std::move(TypeEntry), CTy); + else + ElemTypeId = addType(std::move(TypeEntry)); + } } // The array TypeId is the type id of the outermost dimension. @@ -526,7 +504,7 @@ void BTFDebug::visitArrayType(const DICompositeType *CTy, uint32_t &TypeId) { // The IR does not have a type for array index while BTF wants one. // So create an array index type if there is none. if (!ArrayIndexTypeId) { - auto TypeEntry = llvm::make_unique(dwarf::DW_ATE_unsigned, 32, + auto TypeEntry = std::make_unique(dwarf::DW_ATE_unsigned, 32, 0, "__ARRAY_SIZE_TYPE__"); ArrayIndexTypeId = addType(std::move(TypeEntry)); } @@ -538,7 +516,7 @@ void BTFDebug::visitEnumType(const DICompositeType *CTy, uint32_t &TypeId) { if (VLen > BTF::MAX_VLEN) return; - auto TypeEntry = llvm::make_unique(CTy, VLen); + auto TypeEntry = std::make_unique(CTy, VLen); TypeId = addType(std::move(TypeEntry), CTy); // No need to visit base type as BTF does not encode it. } @@ -546,7 +524,7 @@ void BTFDebug::visitEnumType(const DICompositeType *CTy, uint32_t &TypeId) { /// Handle structure/union forward declarations. 
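The rewritten visitArrayType above chains one BTF array entry per dimension, innermost dimension first, each entry wrapping the previous type id, and clamps the Count = -1 produced by a flexible array member ("char c[]") to zero elements. A standalone sketch of that chaining, with hypothetical 1-based ids standing in for what addType() would assign:

    #include <cstdint>
    #include <vector>

    struct FakeArrayType { uint32_t ElemTypeId; uint32_t NumElems; };

    // Dims is given in declaration order, e.g. {3, 4} for "int a[3][4]".
    uint32_t buildArrayChain(uint32_t ElemTypeId, uint32_t NextId,
                             const std::vector<int64_t> &Dims,
                             std::vector<FakeArrayType> &Table) {
      uint32_t CurId = ElemTypeId;
      for (auto It = Dims.rbegin(); It != Dims.rend(); ++It) {   // innermost first
        int64_t Count = *It;
        Table.push_back({CurId, Count >= 0 ? uint32_t(Count) : 0u});
        CurId = NextId++;                    // id assigned to the entry just added
      }
      return CurId;                          // id of the outermost dimension
    }

For int a[3][4] with element id 1 and next free id 2, this records {1, 4} as id 2 and {2, 3} as id 3, and id 3 becomes the id of the whole array, matching the way the loop above registers only the I == 0 entry against CTy.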
void BTFDebug::visitFwdDeclType(const DICompositeType *CTy, bool IsUnion, uint32_t &TypeId) { - auto TypeEntry = llvm::make_unique(CTy->getName(), IsUnion); + auto TypeEntry = std::make_unique(CTy->getName(), IsUnion); TypeId = addType(std::move(TypeEntry), CTy); } @@ -588,7 +566,7 @@ void BTFDebug::visitDerivedType(const DIDerivedType *DTy, uint32_t &TypeId, /// Find a candidate, generate a fixup. Later on the struct/union /// pointee type will be replaced with either a real type or /// a forward declaration. - auto TypeEntry = llvm::make_unique(DTy, Tag, true); + auto TypeEntry = std::make_unique(DTy, Tag, true); auto &Fixup = FixupDerivedTypes[CTy->getName()]; Fixup.first = CTag == dwarf::DW_TAG_union_type; Fixup.second.push_back(TypeEntry.get()); @@ -602,7 +580,7 @@ void BTFDebug::visitDerivedType(const DIDerivedType *DTy, uint32_t &TypeId, if (Tag == dwarf::DW_TAG_pointer_type || Tag == dwarf::DW_TAG_typedef || Tag == dwarf::DW_TAG_const_type || Tag == dwarf::DW_TAG_volatile_type || Tag == dwarf::DW_TAG_restrict_type) { - auto TypeEntry = llvm::make_unique(DTy, Tag, false); + auto TypeEntry = std::make_unique(DTy, Tag, false); TypeId = addType(std::move(TypeEntry), DTy); } else if (Tag != dwarf::DW_TAG_member) { return; @@ -669,7 +647,7 @@ void BTFDebug::visitMapDefType(const DIType *Ty, uint32_t &TypeId) { } auto TypeEntry = - llvm::make_unique(CTy, true, HasBitField, Elements.size()); + std::make_unique(CTy, true, HasBitField, Elements.size()); StructTypes.push_back(TypeEntry.get()); TypeId = addType(std::move(TypeEntry), CTy); @@ -774,9 +752,10 @@ void BTFDebug::emitBTFSection() { } void BTFDebug::emitBTFExtSection() { - // Do not emit section if empty FuncInfoTable and LineInfoTable. + // Do not emit section if empty FuncInfoTable and LineInfoTable + // and FieldRelocTable. if (!FuncInfoTable.size() && !LineInfoTable.size() && - !OffsetRelocTable.size() && !ExternRelocTable.size()) + !FieldRelocTable.size()) return; MCContext &Ctx = OS.getContext(); @@ -788,8 +767,8 @@ void BTFDebug::emitBTFExtSection() { // Account for FuncInfo/LineInfo record size as well. uint32_t FuncLen = 4, LineLen = 4; - // Do not account for optional OffsetReloc/ExternReloc. - uint32_t OffsetRelocLen = 0, ExternRelocLen = 0; + // Do not account for optional FieldReloc. 
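As a concrete example of the per-section length accounting done in the loops that follow, assume one ELF section with two functions, five line records and three field relocations (numbers chosen only for illustration); the constants mirror the BTF.h sizes above.

    constexpr unsigned SecFuncInfoSize = 8,   BPFFuncInfoSize = 8;
    constexpr unsigned SecLineInfoSize = 8,   BPFLineInfoSize = 16;
    constexpr unsigned SecFieldRelocSize = 8, BPFFieldRelocSize = 16;

    // Each non-empty subsection starts with a 4-byte record-size word.
    constexpr unsigned FuncLen       = 4 + SecFuncInfoSize   + 2 * BPFFuncInfoSize;   // 28
    constexpr unsigned LineLen       = 4 + SecLineInfoSize   + 5 * BPFLineInfoSize;   // 92
    constexpr unsigned FieldRelocLen = 4 + SecFieldRelocSize + 3 * BPFFieldRelocSize; // 60

    // The 32-byte ExtHeader then records FuncInfoOff = 0, FuncInfoLen = 28,
    // LineInfoOff = 28, LineInfoLen = 92, FieldRelocOff = 120, FieldRelocLen = 60,
    // in the order the EmitIntValue calls below write them.
    static_assert(FuncLen == 28 && LineLen == 92 && FieldRelocLen == 60, "");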
+ uint32_t FieldRelocLen = 0; for (const auto &FuncSec : FuncInfoTable) { FuncLen += BTF::SecFuncInfoSize; FuncLen += FuncSec.second.size() * BTF::BPFFuncInfoSize; @@ -798,28 +777,20 @@ void BTFDebug::emitBTFExtSection() { LineLen += BTF::SecLineInfoSize; LineLen += LineSec.second.size() * BTF::BPFLineInfoSize; } - for (const auto &OffsetRelocSec : OffsetRelocTable) { - OffsetRelocLen += BTF::SecOffsetRelocSize; - OffsetRelocLen += OffsetRelocSec.second.size() * BTF::BPFOffsetRelocSize; - } - for (const auto &ExternRelocSec : ExternRelocTable) { - ExternRelocLen += BTF::SecExternRelocSize; - ExternRelocLen += ExternRelocSec.second.size() * BTF::BPFExternRelocSize; + for (const auto &FieldRelocSec : FieldRelocTable) { + FieldRelocLen += BTF::SecFieldRelocSize; + FieldRelocLen += FieldRelocSec.second.size() * BTF::BPFFieldRelocSize; } - if (OffsetRelocLen) - OffsetRelocLen += 4; - if (ExternRelocLen) - ExternRelocLen += 4; + if (FieldRelocLen) + FieldRelocLen += 4; OS.EmitIntValue(0, 4); OS.EmitIntValue(FuncLen, 4); OS.EmitIntValue(FuncLen, 4); OS.EmitIntValue(LineLen, 4); OS.EmitIntValue(FuncLen + LineLen, 4); - OS.EmitIntValue(OffsetRelocLen, 4); - OS.EmitIntValue(FuncLen + LineLen + OffsetRelocLen, 4); - OS.EmitIntValue(ExternRelocLen, 4); + OS.EmitIntValue(FieldRelocLen, 4); // Emit func_info table. OS.AddComment("FuncInfo"); @@ -853,35 +824,20 @@ void BTFDebug::emitBTFExtSection() { } } - // Emit offset reloc table. - if (OffsetRelocLen) { - OS.AddComment("OffsetReloc"); - OS.EmitIntValue(BTF::BPFOffsetRelocSize, 4); - for (const auto &OffsetRelocSec : OffsetRelocTable) { - OS.AddComment("Offset reloc section string offset=" + - std::to_string(OffsetRelocSec.first)); - OS.EmitIntValue(OffsetRelocSec.first, 4); - OS.EmitIntValue(OffsetRelocSec.second.size(), 4); - for (const auto &OffsetRelocInfo : OffsetRelocSec.second) { - Asm->EmitLabelReference(OffsetRelocInfo.Label, 4); - OS.EmitIntValue(OffsetRelocInfo.TypeID, 4); - OS.EmitIntValue(OffsetRelocInfo.OffsetNameOff, 4); - } - } - } - - // Emit extern reloc table. - if (ExternRelocLen) { - OS.AddComment("ExternReloc"); - OS.EmitIntValue(BTF::BPFExternRelocSize, 4); - for (const auto &ExternRelocSec : ExternRelocTable) { - OS.AddComment("Extern reloc section string offset=" + - std::to_string(ExternRelocSec.first)); - OS.EmitIntValue(ExternRelocSec.first, 4); - OS.EmitIntValue(ExternRelocSec.second.size(), 4); - for (const auto &ExternRelocInfo : ExternRelocSec.second) { - Asm->EmitLabelReference(ExternRelocInfo.Label, 4); - OS.EmitIntValue(ExternRelocInfo.ExternNameOff, 4); + // Emit field reloc table. 
+ if (FieldRelocLen) { + OS.AddComment("FieldReloc"); + OS.EmitIntValue(BTF::BPFFieldRelocSize, 4); + for (const auto &FieldRelocSec : FieldRelocTable) { + OS.AddComment("Field reloc section string offset=" + + std::to_string(FieldRelocSec.first)); + OS.EmitIntValue(FieldRelocSec.first, 4); + OS.EmitIntValue(FieldRelocSec.second.size(), 4); + for (const auto &FieldRelocInfo : FieldRelocSec.second) { + Asm->EmitLabelReference(FieldRelocInfo.Label, 4); + OS.EmitIntValue(FieldRelocInfo.TypeID, 4); + OS.EmitIntValue(FieldRelocInfo.OffsetNameOff, 4); + OS.EmitIntValue(FieldRelocInfo.RelocKind, 4); } } } @@ -942,7 +898,7 @@ void BTFDebug::beginFunctionImpl(const MachineFunction *MF) { // Construct subprogram func type auto FuncTypeEntry = - llvm::make_unique(SP->getName(), ProtoTypeId); + std::make_unique(SP->getName(), ProtoTypeId); uint32_t FuncTypeId = addType(std::move(FuncTypeEntry)); for (const auto &TypeEntry : TypeEntries) @@ -980,71 +936,27 @@ unsigned BTFDebug::populateStructType(const DIType *Ty) { return Id; } -// Find struct/array debuginfo types given a type id. -void BTFDebug::setTypeFromId(uint32_t TypeId, BTFTypeStruct **PrevStructType, - BTFTypeArray **PrevArrayType) { - for (const auto &StructType : StructTypes) { - if (StructType->getId() == TypeId) { - *PrevStructType = StructType; - return; - } - } - for (const auto &ArrayType : ArrayTypes) { - if (ArrayType->getId() == TypeId) { - *PrevArrayType = ArrayType; - return; - } - } -} - -/// Generate a struct member offset relocation. -void BTFDebug::generateOffsetReloc(const MachineInstr *MI, +/// Generate a struct member field relocation. +void BTFDebug::generateFieldReloc(const MachineInstr *MI, const MCSymbol *ORSym, DIType *RootTy, StringRef AccessPattern) { - BTFTypeStruct *PrevStructType = nullptr; - BTFTypeArray *PrevArrayType = nullptr; unsigned RootId = populateStructType(RootTy); - setTypeFromId(RootId, &PrevStructType, &PrevArrayType); - unsigned RootTySize = PrevStructType->getStructSize(); - - BTFOffsetReloc OffsetReloc; - OffsetReloc.Label = ORSym; - OffsetReloc.OffsetNameOff = addString(AccessPattern.drop_back()); - OffsetReloc.TypeID = RootId; - - uint32_t Start = 0, End = 0, Offset = 0; - bool FirstAccess = true; - for (auto C : AccessPattern) { - if (C != ':') { - End++; - } else { - std::string SubStr = AccessPattern.substr(Start, End - Start); - int Loc = std::stoi(SubStr); - - if (FirstAccess) { - Offset = Loc * RootTySize; - FirstAccess = false; - } else if (PrevStructType) { - uint32_t MemberOffset, MemberTypeId; - PrevStructType->getMemberInfo(Loc, MemberOffset, MemberTypeId); - - Offset += MemberOffset >> 3; - PrevStructType = nullptr; - setTypeFromId(MemberTypeId, &PrevStructType, &PrevArrayType); - } else if (PrevArrayType) { - uint32_t LocOffset, ElementTypeId; - PrevArrayType->getLocInfo(Loc, LocOffset, ElementTypeId); - - Offset += LocOffset; - PrevArrayType = nullptr; - setTypeFromId(ElementTypeId, &PrevStructType, &PrevArrayType); - } - Start = End + 1; - End = Start; - } - } - AccessOffsets[RootTy->getName().str() + ":" + AccessPattern.str()] = Offset; - OffsetRelocTable[SecNameOff].push_back(OffsetReloc); + size_t FirstDollar = AccessPattern.find_first_of('$'); + size_t FirstColon = AccessPattern.find_first_of(':'); + size_t SecondColon = AccessPattern.find_first_of(':', FirstColon + 1); + StringRef IndexPattern = AccessPattern.substr(FirstDollar + 1); + StringRef RelocKindStr = AccessPattern.substr(FirstColon + 1, + SecondColon - FirstColon); + StringRef PatchImmStr = 
AccessPattern.substr(SecondColon + 1, + FirstDollar - SecondColon); + + BTFFieldReloc FieldReloc; + FieldReloc.Label = ORSym; + FieldReloc.OffsetNameOff = addString(IndexPattern); + FieldReloc.TypeID = RootId; + FieldReloc.RelocKind = std::stoull(RelocKindStr); + PatchImms[AccessPattern.str()] = std::stoul(PatchImmStr); + FieldRelocTable[SecNameOff].push_back(FieldReloc); } void BTFDebug::processLDimm64(const MachineInstr *MI) { @@ -1052,7 +964,7 @@ void BTFDebug::processLDimm64(const MachineInstr *MI) { // will generate an .BTF.ext record. // // If the insn is "r2 = LD_imm64 @__BTF_...", - // add this insn into the .BTF.ext OffsetReloc subsection. + // add this insn into the .BTF.ext FieldReloc subsection. // Relocation looks like: // . SecName: // . InstOffset @@ -1083,16 +995,7 @@ void BTFDebug::processLDimm64(const MachineInstr *MI) { MDNode *MDN = GVar->getMetadata(LLVMContext::MD_preserve_access_index); DIType *Ty = dyn_cast(MDN); - generateOffsetReloc(MI, ORSym, Ty, GVar->getName()); - } else if (GVar && !GVar->hasInitializer() && GVar->hasExternalLinkage() && - GVar->getSection() == BPFCoreSharedInfo::PatchableExtSecName) { - MCSymbol *ORSym = OS.getContext().createTempSymbol(); - OS.EmitLabel(ORSym); - - BTFExternReloc ExternReloc; - ExternReloc.Label = ORSym; - ExternReloc.ExternNameOff = addString(GVar->getName()); - ExternRelocTable[SecNameOff].push_back(ExternReloc); + generateFieldReloc(MI, ORSym, Ty, GVar->getName()); } } } @@ -1200,12 +1103,12 @@ void BTFDebug::processGlobals(bool ProcessingMapDef) { ? BTF::VAR_GLOBAL_ALLOCATED : BTF::VAR_STATIC; auto VarEntry = - llvm::make_unique(Global.getName(), GVTypeId, GVarInfo); + std::make_unique(Global.getName(), GVTypeId, GVarInfo); uint32_t VarId = addType(std::move(VarEntry)); // Find or create a DataSec if (DataSecEntries.find(SecName) == DataSecEntries.end()) { - DataSecEntries[SecName] = llvm::make_unique(Asm, SecName); + DataSecEntries[SecName] = std::make_unique(Asm, SecName); } // Calculate symbol size @@ -1224,30 +1127,12 @@ bool BTFDebug::InstLower(const MachineInstr *MI, MCInst &OutMI) { const GlobalValue *GVal = MO.getGlobal(); auto *GVar = dyn_cast(GVal); if (GVar && GVar->hasAttribute(BPFCoreSharedInfo::AmaAttr)) { - MDNode *MDN = GVar->getMetadata(LLVMContext::MD_preserve_access_index); - DIType *Ty = dyn_cast(MDN); - std::string TypeName = Ty->getName(); - int64_t Imm = AccessOffsets[TypeName + ":" + GVar->getName().str()]; - - // Emit "mov ri, " for abstract member accesses. + // Emit "mov ri, " for patched immediate. + uint32_t Imm = PatchImms[GVar->getName().str()]; OutMI.setOpcode(BPF::MOV_ri); OutMI.addOperand(MCOperand::createReg(MI->getOperand(0).getReg())); OutMI.addOperand(MCOperand::createImm(Imm)); return true; - } else if (GVar && !GVar->hasInitializer() && - GVar->hasExternalLinkage() && - GVar->getSection() == BPFCoreSharedInfo::PatchableExtSecName) { - const IntegerType *IntTy = dyn_cast(GVar->getValueType()); - assert(IntTy); - // For patchable externals, emit "LD_imm64, ri, 0" if the external - // variable is 64bit width, emit "mov ri, 0" otherwise. 
- if (IntTy->getBitWidth() == 64) - OutMI.setOpcode(BPF::LD_imm64); - else - OutMI.setOpcode(BPF::MOV_ri); - OutMI.addOperand(MCOperand::createReg(MI->getOperand(0).getReg())); - OutMI.addOperand(MCOperand::createImm(0)); - return true; } } } @@ -1281,7 +1166,7 @@ void BTFDebug::endModule() { } if (StructTypeId == 0) { - auto FwdTypeEntry = llvm::make_unique(TypeName, IsUnion); + auto FwdTypeEntry = std::make_unique(TypeName, IsUnion); StructTypeId = addType(std::move(FwdTypeEntry)); } diff --git a/lib/Target/BPF/BTFDebug.h b/lib/Target/BPF/BTFDebug.h index 6c0cdde17d9b..c01e0d1d1612 100644 --- a/lib/Target/BPF/BTFDebug.h +++ b/lib/Target/BPF/BTFDebug.h @@ -104,15 +104,13 @@ public: /// Handle array type. class BTFTypeArray : public BTFTypeBase { - uint32_t ElemSize; struct BTF::BTFArray ArrayInfo; public: - BTFTypeArray(uint32_t ElemTypeId, uint32_t ElemSize, uint32_t NumElems); + BTFTypeArray(uint32_t ElemTypeId, uint32_t NumElems); uint32_t getSize() { return BTFTypeBase::getSize() + BTF::BTFArraySize; } void completeType(BTFDebug &BDebug); void emitType(MCStreamer &OS); - void getLocInfo(uint32_t Loc, uint32_t &LocOffset, uint32_t &ElementTypeId); }; /// Handle struct/union type. @@ -130,8 +128,6 @@ public: void completeType(BTFDebug &BDebug); void emitType(MCStreamer &OS); std::string getName(); - void getMemberInfo(uint32_t Loc, uint32_t &Offset, uint32_t &MemberType); - uint32_t getStructSize(); }; /// Handle function pointer. @@ -199,7 +195,7 @@ class BTFStringTable { /// A mapping from string table offset to the index /// of the Table. It is used to avoid putting /// duplicated strings in the table. - std::unordered_map OffsetToIdMap; + std::map OffsetToIdMap; /// A vector of strings to represent the string table. std::vector Table; @@ -228,16 +224,11 @@ struct BTFLineInfo { }; /// Represent one offset relocation. -struct BTFOffsetReloc { +struct BTFFieldReloc { const MCSymbol *Label; ///< MCSymbol identifying insn for the reloc uint32_t TypeID; ///< Type ID uint32_t OffsetNameOff; ///< The string to traverse types -}; - -/// Represent one extern relocation. -struct BTFExternReloc { - const MCSymbol *Label; ///< MCSymbol identifying insn for the reloc - uint32_t ExternNameOff; ///< The extern variable name + uint32_t RelocKind; ///< What to patch the instruction }; /// Collect and emit BTF information. @@ -253,13 +244,11 @@ class BTFDebug : public DebugHandlerBase { std::unordered_map DIToIdMap; std::map> FuncInfoTable; std::map> LineInfoTable; - std::map> OffsetRelocTable; - std::map> ExternRelocTable; + std::map> FieldRelocTable; StringMap> FileContent; std::map> DataSecEntries; std::vector StructTypes; - std::vector ArrayTypes; - std::map AccessOffsets; + std::map PatchImms; std::map>> FixupDerivedTypes; @@ -305,13 +294,9 @@ class BTFDebug : public DebugHandlerBase { void processGlobals(bool ProcessingMapDef); /// Generate one offset relocation record. - void generateOffsetReloc(const MachineInstr *MI, const MCSymbol *ORSym, + void generateFieldReloc(const MachineInstr *MI, const MCSymbol *ORSym, DIType *RootTy, StringRef AccessPattern); - /// Set the to-be-traversed Struct/Array Type based on TypeId. - void setTypeFromId(uint32_t TypeId, BTFTypeStruct **PrevStructType, - BTFTypeArray **PrevArrayType); - /// Populating unprocessed struct type. 
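The string handed to generateFieldReloc (the global's name, as passed in processLDimm64 above) is decoded purely by position: everything after the first '$' is the type-index pattern stored in the string table, and the two ':'-separated fields before it are the relocation kind and the immediate that InstLower later patches into a MOV_ri. A standalone restatement of that decoding, with a made-up input; the component before the first ':' is not consumed by this routine, and the root type id comes from RootTy instead.

    #include <cstdint>
    #include <string>

    struct ParsedAccess {
      uint32_t RelocKind;        // what to patch in the instruction
      uint32_t PatchImm;         // immediate later substituted via MOV_ri
      std::string IndexPattern;  // e.g. "0:2", recorded at OffsetNameOff
    };

    ParsedAccess parseAccessPattern(const std::string &S) {
      size_t C1 = S.find(':');
      size_t C2 = S.find(':', C1 + 1);
      size_t D  = S.find('$');
      ParsedAccess P;
      P.RelocKind = static_cast<uint32_t>(std::stoul(S.substr(C1 + 1, C2 - C1 - 1)));
      P.PatchImm  = static_cast<uint32_t>(std::stoul(S.substr(C2 + 1, D - C2 - 1)));
      P.IndexPattern = S.substr(D + 1);
      return P;
    }

    // parseAccessPattern("s:0:8$0:2") -> RelocKind 0, PatchImm 8, IndexPattern "0:2"
    // (the input here is hypothetical, chosen only to exercise the separators).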
unsigned populateStructType(const DIType *Ty); diff --git a/lib/Target/BPF/MCTargetDesc/BPFELFObjectWriter.cpp b/lib/Target/BPF/MCTargetDesc/BPFELFObjectWriter.cpp index 057bbf5c3b06..ef4e324c3bdd 100644 --- a/lib/Target/BPF/MCTargetDesc/BPFELFObjectWriter.cpp +++ b/lib/Target/BPF/MCTargetDesc/BPFELFObjectWriter.cpp @@ -39,7 +39,7 @@ unsigned BPFELFObjectWriter::getRelocType(MCContext &Ctx, const MCValue &Target, const MCFixup &Fixup, bool IsPCRel) const { // determine the type of the relocation - switch ((unsigned)Fixup.getKind()) { + switch (Fixup.getKind()) { default: llvm_unreachable("invalid fixup kind!"); case FK_SecRel_8: @@ -85,5 +85,5 @@ unsigned BPFELFObjectWriter::getRelocType(MCContext &Ctx, const MCValue &Target, std::unique_ptr llvm::createBPFELFObjectWriter(uint8_t OSABI) { - return llvm::make_unique(OSABI); + return std::make_unique(OSABI); } diff --git a/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp b/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp index 0881bf841f90..590c4a2eb69d 100644 --- a/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp +++ b/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp @@ -702,7 +702,7 @@ bool HexagonAsmParser::ParseDirectiveFalign(unsigned Size, SMLoc L) { // Make sure we have a number (false is returned if expression is a number) if (!getParser().parseExpression(Value)) { // Make sure this is a number that is in range - const MCConstantExpr *MCE = dyn_cast(Value); + auto *MCE = cast(Value); uint64_t IntValue = MCE->getValue(); if (!isUIntN(Size, IntValue) && !isIntN(Size, IntValue)) return Error(ExprLoc, "literal value out of range (256) for falign"); diff --git a/lib/Target/Hexagon/BitTracker.cpp b/lib/Target/Hexagon/BitTracker.cpp index b7e95caf24fb..efd5ed915127 100644 --- a/lib/Target/Hexagon/BitTracker.cpp +++ b/lib/Target/Hexagon/BitTracker.cpp @@ -84,7 +84,7 @@ namespace { raw_ostream &operator<< (raw_ostream &OS, const printv &PV) { if (PV.R) - OS << 'v' << TargetRegisterInfo::virtReg2Index(PV.R); + OS << 'v' << Register::virtReg2Index(PV.R); else OS << 's'; return OS; @@ -201,7 +201,7 @@ BitTracker::~BitTracker() { bool BT::RegisterCell::meet(const RegisterCell &RC, unsigned SelfR) { // An example when "meet" can be invoked with SelfR == 0 is a phi node // with a physical register as an operand. - assert(SelfR == 0 || TargetRegisterInfo::isVirtualRegister(SelfR)); + assert(SelfR == 0 || Register::isVirtualRegister(SelfR)); bool Changed = false; for (uint16_t i = 0, n = Bits.size(); i < n; ++i) { const BitValue &RCV = RC[i]; @@ -335,12 +335,13 @@ uint16_t BT::MachineEvaluator::getRegBitWidth(const RegisterRef &RR) const { // 1. find a physical register PhysR from the same class as RR.Reg, // 2. find a physical register PhysS that corresponds to PhysR:RR.Sub, // 3. find a register class that contains PhysS. - if (TargetRegisterInfo::isVirtualRegister(RR.Reg)) { + if (Register::isVirtualRegister(RR.Reg)) { const auto &VC = composeWithSubRegIndex(*MRI.getRegClass(RR.Reg), RR.Sub); return TRI.getRegSizeInBits(VC); } - assert(TargetRegisterInfo::isPhysicalRegister(RR.Reg)); - unsigned PhysR = (RR.Sub == 0) ? RR.Reg : TRI.getSubReg(RR.Reg, RR.Sub); + assert(Register::isPhysicalRegister(RR.Reg)); + Register PhysR = + (RR.Sub == 0) ? Register(RR.Reg) : TRI.getSubReg(RR.Reg, RR.Sub); return getPhysRegBitWidth(PhysR); } @@ -350,10 +351,10 @@ BT::RegisterCell BT::MachineEvaluator::getCell(const RegisterRef &RR, // Physical registers are assumed to be present in the map with an unknown // value. 
Don't actually insert anything in the map, just return the cell. - if (TargetRegisterInfo::isPhysicalRegister(RR.Reg)) + if (Register::isPhysicalRegister(RR.Reg)) return RegisterCell::self(0, BW); - assert(TargetRegisterInfo::isVirtualRegister(RR.Reg)); + assert(Register::isVirtualRegister(RR.Reg)); // For virtual registers that belong to a class that is not tracked, // generate an "unknown" value as well. const TargetRegisterClass *C = MRI.getRegClass(RR.Reg); @@ -376,7 +377,7 @@ void BT::MachineEvaluator::putCell(const RegisterRef &RR, RegisterCell RC, // While updating the cell map can be done in a meaningful way for // a part of a register, it makes little sense to implement it as the // SSA representation would never contain such "partial definitions". - if (!TargetRegisterInfo::isVirtualRegister(RR.Reg)) + if (!Register::isVirtualRegister(RR.Reg)) return; assert(RR.Sub == 0 && "Unexpected sub-register in definition"); // Eliminate all ref-to-reg-0 bit values: replace them with "self". @@ -711,7 +712,7 @@ BT::BitMask BT::MachineEvaluator::mask(unsigned Reg, unsigned Sub) const { } uint16_t BT::MachineEvaluator::getPhysRegBitWidth(unsigned Reg) const { - assert(TargetRegisterInfo::isPhysicalRegister(Reg)); + assert(Register::isPhysicalRegister(Reg)); const TargetRegisterClass &PC = *TRI.getMinimalPhysRegClass(Reg); return TRI.getRegSizeInBits(PC); } @@ -874,7 +875,7 @@ void BT::visitNonBranch(const MachineInstr &MI) { continue; RegisterRef RD(MO); assert(RD.Sub == 0 && "Unexpected sub-register in definition"); - if (!TargetRegisterInfo::isVirtualRegister(RD.Reg)) + if (!Register::isVirtualRegister(RD.Reg)) continue; bool Changed = false; diff --git a/lib/Target/Hexagon/HexagonAsmPrinter.cpp b/lib/Target/Hexagon/HexagonAsmPrinter.cpp index b07d15609ede..3d771d388e28 100644 --- a/lib/Target/Hexagon/HexagonAsmPrinter.cpp +++ b/lib/Target/Hexagon/HexagonAsmPrinter.cpp @@ -130,7 +130,7 @@ bool HexagonAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); if (!MO.isReg()) return true; - unsigned RegNumber = MO.getReg(); + Register RegNumber = MO.getReg(); // This should be an assert in the frontend. if (Hexagon::DoubleRegsRegClass.contains(RegNumber)) RegNumber = TRI->getSubReg(RegNumber, ExtraCode[0] == 'L' ? 
diff --git a/lib/Target/Hexagon/HexagonBitSimplify.cpp b/lib/Target/Hexagon/HexagonBitSimplify.cpp index 7b75d251ccd3..3068fb6f9629 100644 --- a/lib/Target/Hexagon/HexagonBitSimplify.cpp +++ b/lib/Target/Hexagon/HexagonBitSimplify.cpp @@ -147,11 +147,11 @@ namespace { } static inline unsigned v2x(unsigned v) { - return TargetRegisterInfo::virtReg2Index(v); + return Register::virtReg2Index(v); } static inline unsigned x2v(unsigned x) { - return TargetRegisterInfo::index2VirtReg(x); + return Register::index2VirtReg(x); } }; @@ -290,8 +290,8 @@ void HexagonBitSimplify::getInstrDefs(const MachineInstr &MI, for (auto &Op : MI.operands()) { if (!Op.isReg() || !Op.isDef()) continue; - unsigned R = Op.getReg(); - if (!TargetRegisterInfo::isVirtualRegister(R)) + Register R = Op.getReg(); + if (!Register::isVirtualRegister(R)) continue; Defs.insert(R); } @@ -302,8 +302,8 @@ void HexagonBitSimplify::getInstrUses(const MachineInstr &MI, for (auto &Op : MI.operands()) { if (!Op.isReg() || !Op.isUse()) continue; - unsigned R = Op.getReg(); - if (!TargetRegisterInfo::isVirtualRegister(R)) + Register R = Op.getReg(); + if (!Register::isVirtualRegister(R)) continue; Uses.insert(R); } @@ -353,8 +353,7 @@ bool HexagonBitSimplify::getConst(const BitTracker::RegisterCell &RC, bool HexagonBitSimplify::replaceReg(unsigned OldR, unsigned NewR, MachineRegisterInfo &MRI) { - if (!TargetRegisterInfo::isVirtualRegister(OldR) || - !TargetRegisterInfo::isVirtualRegister(NewR)) + if (!Register::isVirtualRegister(OldR) || !Register::isVirtualRegister(NewR)) return false; auto Begin = MRI.use_begin(OldR), End = MRI.use_end(); decltype(End) NextI; @@ -367,8 +366,7 @@ bool HexagonBitSimplify::replaceReg(unsigned OldR, unsigned NewR, bool HexagonBitSimplify::replaceRegWithSub(unsigned OldR, unsigned NewR, unsigned NewSR, MachineRegisterInfo &MRI) { - if (!TargetRegisterInfo::isVirtualRegister(OldR) || - !TargetRegisterInfo::isVirtualRegister(NewR)) + if (!Register::isVirtualRegister(OldR) || !Register::isVirtualRegister(NewR)) return false; if (hasTiedUse(OldR, MRI, NewSR)) return false; @@ -384,8 +382,7 @@ bool HexagonBitSimplify::replaceRegWithSub(unsigned OldR, unsigned NewR, bool HexagonBitSimplify::replaceSubWithSub(unsigned OldR, unsigned OldSR, unsigned NewR, unsigned NewSR, MachineRegisterInfo &MRI) { - if (!TargetRegisterInfo::isVirtualRegister(OldR) || - !TargetRegisterInfo::isVirtualRegister(NewR)) + if (!Register::isVirtualRegister(OldR) || !Register::isVirtualRegister(NewR)) return false; if (OldSR != NewSR && hasTiedUse(OldR, MRI, NewSR)) return false; @@ -896,7 +893,7 @@ bool HexagonBitSimplify::getUsedBits(unsigned Opc, unsigned OpN, // register class. const TargetRegisterClass *HexagonBitSimplify::getFinalVRegClass( const BitTracker::RegisterRef &RR, MachineRegisterInfo &MRI) { - if (!TargetRegisterInfo::isVirtualRegister(RR.Reg)) + if (!Register::isVirtualRegister(RR.Reg)) return nullptr; auto *RC = MRI.getRegClass(RR.Reg); if (RR.Sub == 0) @@ -927,8 +924,8 @@ const TargetRegisterClass *HexagonBitSimplify::getFinalVRegClass( // with a 32-bit register. bool HexagonBitSimplify::isTransparentCopy(const BitTracker::RegisterRef &RD, const BitTracker::RegisterRef &RS, MachineRegisterInfo &MRI) { - if (!TargetRegisterInfo::isVirtualRegister(RD.Reg) || - !TargetRegisterInfo::isVirtualRegister(RS.Reg)) + if (!Register::isVirtualRegister(RD.Reg) || + !Register::isVirtualRegister(RS.Reg)) return false; // Return false if one (or both) classes are nullptr. 
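Most of the Hexagon churn in this import is the same mechanical migration: register identities move from raw unsigned to llvm::Register, and the static predicates move from TargetRegisterInfo to Register. A sketch of the resulting idiom, using only helpers that appear in the hunks above; it is written against this revision's headers and is not a complete pass.

    #include "llvm/ADT/SmallVector.h"
    #include "llvm/CodeGen/MachineInstr.h"
    #include "llvm/CodeGen/MachineOperand.h"
    #include "llvm/CodeGen/Register.h"

    using namespace llvm;

    // Collect the virtual registers defined by MI.
    static void collectVirtualDefs(const MachineInstr &MI,
                                   SmallVectorImpl<Register> &Defs) {
      for (const MachineOperand &Op : MI.operands()) {
        if (!Op.isReg() || !Op.isDef())
          continue;
        Register R = Op.getReg();             // was: unsigned R = Op.getReg();
        if (!Register::isVirtualRegister(R))  // was: TargetRegisterInfo::isVirtualRegister(R)
          continue;
        Defs.push_back(R);
      }
    }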
auto *DRC = getFinalVRegClass(RD, MRI); @@ -979,7 +976,7 @@ bool DeadCodeElimination::isDead(unsigned R) const { continue; if (UseI->isPHI()) { assert(!UseI->getOperand(0).getSubReg()); - unsigned DR = UseI->getOperand(0).getReg(); + Register DR = UseI->getOperand(0).getReg(); if (DR == R) continue; } @@ -1018,8 +1015,8 @@ bool DeadCodeElimination::runOnNode(MachineDomTreeNode *N) { for (auto &Op : MI->operands()) { if (!Op.isReg() || !Op.isDef()) continue; - unsigned R = Op.getReg(); - if (!TargetRegisterInfo::isVirtualRegister(R) || !isDead(R)) { + Register R = Op.getReg(); + if (!Register::isVirtualRegister(R) || !isDead(R)) { AllDead = false; break; } @@ -1220,8 +1217,8 @@ bool RedundantInstrElimination::computeUsedBits(unsigned Reg, BitVector &Bits) { return false; MachineInstr &UseI = *I->getParent(); if (UseI.isPHI() || UseI.isCopy()) { - unsigned DefR = UseI.getOperand(0).getReg(); - if (!TargetRegisterInfo::isVirtualRegister(DefR)) + Register DefR = UseI.getOperand(0).getReg(); + if (!Register::isVirtualRegister(DefR)) return false; Pending.push_back(DefR); } else { @@ -1345,7 +1342,7 @@ bool RedundantInstrElimination::processBlock(MachineBasicBlock &B, // If found, replace the instruction with a COPY. const DebugLoc &DL = MI->getDebugLoc(); const TargetRegisterClass *FRC = HBS::getFinalVRegClass(RD, MRI); - unsigned NewR = MRI.createVirtualRegister(FRC); + Register NewR = MRI.createVirtualRegister(FRC); MachineInstr *CopyI = BuildMI(B, At, DL, HII.get(TargetOpcode::COPY), NewR) .addReg(RS.Reg, 0, RS.Sub); @@ -1412,7 +1409,7 @@ bool ConstGeneration::isTfrConst(const MachineInstr &MI) { // register class and the actual value being transferred. unsigned ConstGeneration::genTfrConst(const TargetRegisterClass *RC, int64_t C, MachineBasicBlock &B, MachineBasicBlock::iterator At, DebugLoc &DL) { - unsigned Reg = MRI.createVirtualRegister(RC); + Register Reg = MRI.createVirtualRegister(RC); if (RC == &Hexagon::IntRegsRegClass) { BuildMI(B, At, DL, HII.get(Hexagon::A2_tfrsi), Reg) .addImm(int32_t(C)); @@ -1470,7 +1467,7 @@ bool ConstGeneration::processBlock(MachineBasicBlock &B, const RegisterSet&) { if (Defs.count() != 1) continue; unsigned DR = Defs.find_first(); - if (!TargetRegisterInfo::isVirtualRegister(DR)) + if (!Register::isVirtualRegister(DR)) continue; uint64_t U; const BitTracker::RegisterCell &DRC = BT.lookup(DR); @@ -1609,7 +1606,7 @@ bool CopyGeneration::processBlock(MachineBasicBlock &B, auto *FRC = HBS::getFinalVRegClass(R, MRI); if (findMatch(R, MR, AVB)) { - unsigned NewR = MRI.createVirtualRegister(FRC); + Register NewR = MRI.createVirtualRegister(FRC); BuildMI(B, At, DL, HII.get(TargetOpcode::COPY), NewR) .addReg(MR.Reg, 0, MR.Sub); BT.put(BitTracker::RegisterRef(NewR), BT.get(MR)); @@ -1628,7 +1625,7 @@ bool CopyGeneration::processBlock(MachineBasicBlock &B, BitTracker::RegisterRef ML, MH; if (findMatch(TL, ML, AVB) && findMatch(TH, MH, AVB)) { auto *FRC = HBS::getFinalVRegClass(R, MRI); - unsigned NewR = MRI.createVirtualRegister(FRC); + Register NewR = MRI.createVirtualRegister(FRC); BuildMI(B, At, DL, HII.get(TargetOpcode::REG_SEQUENCE), NewR) .addReg(ML.Reg, 0, ML.Sub) .addImm(SubLo) @@ -1819,7 +1816,7 @@ bool BitSimplification::matchHalf(unsigned SelfR, if (Reg == 0 || Reg == SelfR) // Don't match "self". 
return false; - if (!TargetRegisterInfo::isVirtualRegister(Reg)) + if (!Register::isVirtualRegister(Reg)) return false; if (!BT.has(Reg)) return false; @@ -2025,7 +2022,7 @@ bool BitSimplification::genPackhl(MachineInstr *MI, return false; MachineBasicBlock &B = *MI->getParent(); - unsigned NewR = MRI.createVirtualRegister(&Hexagon::DoubleRegsRegClass); + Register NewR = MRI.createVirtualRegister(&Hexagon::DoubleRegsRegClass); DebugLoc DL = MI->getDebugLoc(); auto At = MI->isPHI() ? B.getFirstNonPHI() : MachineBasicBlock::iterator(MI); @@ -2097,7 +2094,7 @@ bool BitSimplification::genCombineHalf(MachineInstr *MI, MachineBasicBlock &B = *MI->getParent(); DebugLoc DL = MI->getDebugLoc(); - unsigned NewR = MRI.createVirtualRegister(&Hexagon::IntRegsRegClass); + Register NewR = MRI.createVirtualRegister(&Hexagon::IntRegsRegClass); auto At = MI->isPHI() ? B.getFirstNonPHI() : MachineBasicBlock::iterator(MI); BuildMI(B, At, DL, HII.get(COpc), NewR) @@ -2154,7 +2151,7 @@ bool BitSimplification::genExtractLow(MachineInstr *MI, if (!validateReg(RS, NewOpc, 1)) continue; - unsigned NewR = MRI.createVirtualRegister(&Hexagon::IntRegsRegClass); + Register NewR = MRI.createVirtualRegister(&Hexagon::IntRegsRegClass); auto At = MI->isPHI() ? B.getFirstNonPHI() : MachineBasicBlock::iterator(MI); auto MIB = BuildMI(B, At, DL, HII.get(NewOpc), NewR) @@ -2368,7 +2365,7 @@ bool BitSimplification::simplifyTstbit(MachineInstr *MI, return true; } } else if (V.is(0) || V.is(1)) { - unsigned NewR = MRI.createVirtualRegister(&Hexagon::PredRegsRegClass); + Register NewR = MRI.createVirtualRegister(&Hexagon::PredRegsRegClass); unsigned NewOpc = V.is(0) ? Hexagon::PS_false : Hexagon::PS_true; BuildMI(B, At, DL, HII.get(NewOpc), NewR); HBS::replaceReg(RD.Reg, NewR, MRI); @@ -2541,7 +2538,7 @@ bool BitSimplification::simplifyExtractLow(MachineInstr *MI, DebugLoc DL = MI->getDebugLoc(); MachineBasicBlock &B = *MI->getParent(); - unsigned NewR = MRI.createVirtualRegister(FRC); + Register NewR = MRI.createVirtualRegister(FRC); auto At = MI->isPHI() ? B.getFirstNonPHI() : MachineBasicBlock::iterator(MI); auto MIB = BuildMI(B, At, DL, HII.get(ExtOpc), NewR) @@ -2612,8 +2609,8 @@ bool BitSimplification::simplifyRCmp0(MachineInstr *MI, KnownNZ = true; } - auto ReplaceWithConst = [&] (int C) { - unsigned NewR = MRI.createVirtualRegister(FRC); + auto ReplaceWithConst = [&](int C) { + Register NewR = MRI.createVirtualRegister(FRC); BuildMI(B, At, DL, HII.get(Hexagon::A2_tfrsi), NewR) .addImm(C); HBS::replaceReg(RD.Reg, NewR, MRI); @@ -2678,7 +2675,7 @@ bool BitSimplification::simplifyRCmp0(MachineInstr *MI, // replace the comparison with a C2_muxii, using the same predicate // register, but with operands substituted with 0/1 accordingly. 
if ((KnownZ1 || KnownNZ1) && (KnownZ2 || KnownNZ2)) { - unsigned NewR = MRI.createVirtualRegister(FRC); + Register NewR = MRI.createVirtualRegister(FRC); BuildMI(B, At, DL, HII.get(Hexagon::C2_muxii), NewR) .addReg(InpDef->getOperand(1).getReg()) .addImm(KnownZ1 == (Opc == Hexagon::A4_rcmpeqi)) @@ -3071,7 +3068,7 @@ void HexagonLoopRescheduling::moveGroup(InstrGroup &G, MachineBasicBlock &LB, DenseMap RegMap; const TargetRegisterClass *PhiRC = MRI->getRegClass(NewPredR); - unsigned PhiR = MRI->createVirtualRegister(PhiRC); + Register PhiR = MRI->createVirtualRegister(PhiRC); BuildMI(LB, At, At->getDebugLoc(), HII->get(TargetOpcode::PHI), PhiR) .addReg(NewPredR) .addMBB(&PB) @@ -3083,7 +3080,7 @@ void HexagonLoopRescheduling::moveGroup(InstrGroup &G, MachineBasicBlock &LB, const MachineInstr *SI = G.Ins[i-1]; unsigned DR = getDefReg(SI); const TargetRegisterClass *RC = MRI->getRegClass(DR); - unsigned NewDR = MRI->createVirtualRegister(RC); + Register NewDR = MRI->createVirtualRegister(RC); DebugLoc DL = SI->getDebugLoc(); auto MIB = BuildMI(LB, At, DL, HII->get(SI->getOpcode()), NewDR); @@ -3162,7 +3159,7 @@ bool HexagonLoopRescheduling::processLoop(LoopCand &C) { if (Defs.count() != 1) continue; unsigned DefR = Defs.find_first(); - if (!TargetRegisterInfo::isVirtualRegister(DefR)) + if (!Register::isVirtualRegister(DefR)) continue; if (!isBitShuffle(&*I, DefR)) continue; diff --git a/lib/Target/Hexagon/HexagonBitTracker.cpp b/lib/Target/Hexagon/HexagonBitTracker.cpp index ba50faac2cf9..ebd060ce503e 100644 --- a/lib/Target/Hexagon/HexagonBitTracker.cpp +++ b/lib/Target/Hexagon/HexagonBitTracker.cpp @@ -111,7 +111,7 @@ BT::BitMask HexagonEvaluator::mask(unsigned Reg, unsigned Sub) const { } uint16_t HexagonEvaluator::getPhysRegBitWidth(unsigned Reg) const { - assert(TargetRegisterInfo::isPhysicalRegister(Reg)); + assert(Register::isPhysicalRegister(Reg)); using namespace Hexagon; const auto &HST = MF.getSubtarget(); @@ -1042,8 +1042,8 @@ unsigned HexagonEvaluator::getUniqueDefVReg(const MachineInstr &MI) const { for (const MachineOperand &Op : MI.operands()) { if (!Op.isReg() || !Op.isDef()) continue; - unsigned R = Op.getReg(); - if (!TargetRegisterInfo::isVirtualRegister(R)) + Register R = Op.getReg(); + if (!Register::isVirtualRegister(R)) continue; if (DefReg != 0) return 0; @@ -1220,7 +1220,7 @@ bool HexagonEvaluator::evaluateFormalCopy(const MachineInstr &MI, RegisterRef RD = MI.getOperand(0); RegisterRef RS = MI.getOperand(1); assert(RD.Sub == 0); - if (!TargetRegisterInfo::isPhysicalRegister(RS.Reg)) + if (!Register::isPhysicalRegister(RS.Reg)) return false; RegExtMap::const_iterator F = VRX.find(RD.Reg); if (F == VRX.end()) diff --git a/lib/Target/Hexagon/HexagonBlockRanges.cpp b/lib/Target/Hexagon/HexagonBlockRanges.cpp index 999150fc8c6e..d1d1b8ee7d41 100644 --- a/lib/Target/Hexagon/HexagonBlockRanges.cpp +++ b/lib/Target/Hexagon/HexagonBlockRanges.cpp @@ -268,14 +268,14 @@ HexagonBlockRanges::RegisterSet HexagonBlockRanges::expandToSubRegs( return SRs; } - if (TargetRegisterInfo::isPhysicalRegister(R.Reg)) { + if (Register::isPhysicalRegister(R.Reg)) { MCSubRegIterator I(R.Reg, &TRI); if (!I.isValid()) SRs.insert({R.Reg, 0}); for (; I.isValid(); ++I) SRs.insert({*I, 0}); } else { - assert(TargetRegisterInfo::isVirtualRegister(R.Reg)); + assert(Register::isVirtualRegister(R.Reg)); auto &RC = *MRI.getRegClass(R.Reg); unsigned PReg = *RC.begin(); MCSubRegIndexIterator I(PReg, &TRI); @@ -321,7 +321,7 @@ void HexagonBlockRanges::computeInitialLiveRanges(InstrIndexMap &IndexMap, if 
(!Op.isReg() || !Op.isUse() || Op.isUndef()) continue; RegisterRef R = { Op.getReg(), Op.getSubReg() }; - if (TargetRegisterInfo::isPhysicalRegister(R.Reg) && Reserved[R.Reg]) + if (Register::isPhysicalRegister(R.Reg) && Reserved[R.Reg]) continue; bool IsKill = Op.isKill(); for (auto S : expandToSubRegs(R, MRI, TRI)) { @@ -338,7 +338,7 @@ void HexagonBlockRanges::computeInitialLiveRanges(InstrIndexMap &IndexMap, continue; RegisterRef R = { Op.getReg(), Op.getSubReg() }; for (auto S : expandToSubRegs(R, MRI, TRI)) { - if (TargetRegisterInfo::isPhysicalRegister(S.Reg) && Reserved[S.Reg]) + if (Register::isPhysicalRegister(S.Reg) && Reserved[S.Reg]) continue; if (Op.isDead()) Clobbers.insert(S); @@ -374,7 +374,7 @@ void HexagonBlockRanges::computeInitialLiveRanges(InstrIndexMap &IndexMap, // Update maps for defs. for (RegisterRef S : Defs) { // Defs should already be expanded into subregs. - assert(!TargetRegisterInfo::isPhysicalRegister(S.Reg) || + assert(!Register::isPhysicalRegister(S.Reg) || !MCSubRegIterator(S.Reg, &TRI, false).isValid()); if (LastDef[S] != IndexType::None || LastUse[S] != IndexType::None) closeRange(S); @@ -383,7 +383,7 @@ void HexagonBlockRanges::computeInitialLiveRanges(InstrIndexMap &IndexMap, // Update maps for clobbers. for (RegisterRef S : Clobbers) { // Clobbers should already be expanded into subregs. - assert(!TargetRegisterInfo::isPhysicalRegister(S.Reg) || + assert(!Register::isPhysicalRegister(S.Reg) || !MCSubRegIterator(S.Reg, &TRI, false).isValid()); if (LastDef[S] != IndexType::None || LastUse[S] != IndexType::None) closeRange(S); @@ -482,7 +482,7 @@ HexagonBlockRanges::RegToRangeMap HexagonBlockRanges::computeDeadMap( } } for (auto &P : LiveMap) - if (TargetRegisterInfo::isVirtualRegister(P.first.Reg)) + if (Register::isVirtualRegister(P.first.Reg)) addDeadRanges(P.first); LLVM_DEBUG(dbgs() << __func__ << ": dead map\n" diff --git a/lib/Target/Hexagon/HexagonBranchRelaxation.cpp b/lib/Target/Hexagon/HexagonBranchRelaxation.cpp index ee93739b2c7b..08f740806879 100644 --- a/lib/Target/Hexagon/HexagonBranchRelaxation.cpp +++ b/lib/Target/Hexagon/HexagonBranchRelaxation.cpp @@ -105,12 +105,11 @@ void HexagonBranchRelaxation::computeOffset(MachineFunction &MF, // offset of the current instruction from the start. unsigned InstOffset = 0; for (auto &B : MF) { - if (B.getAlignment()) { + if (B.getAlignment() != Align::None()) { // Although we don't know the exact layout of the final code, we need // to account for alignment padding somehow. This heuristic pads each // aligned basic block according to the alignment value. 
- int ByteAlign = (1u << B.getAlignment()) - 1; - InstOffset = (InstOffset + ByteAlign) & ~(ByteAlign); + InstOffset = alignTo(InstOffset, B.getAlignment()); } OffsetMap[&B] = InstOffset; for (auto &MI : B.instrs()) { diff --git a/lib/Target/Hexagon/HexagonConstExtenders.cpp b/lib/Target/Hexagon/HexagonConstExtenders.cpp index cfed0ecef272..ddc9b847ef1c 100644 --- a/lib/Target/Hexagon/HexagonConstExtenders.cpp +++ b/lib/Target/Hexagon/HexagonConstExtenders.cpp @@ -14,9 +14,10 @@ #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/Register.h" +#include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Pass.h" #include #include #include @@ -235,24 +236,24 @@ namespace { Reg = Op.getReg(); Sub = Op.getSubReg(); } else if (Op.isFI()) { - Reg = TargetRegisterInfo::index2StackSlot(Op.getIndex()); + Reg = llvm::Register::index2StackSlot(Op.getIndex()); } return *this; } bool isVReg() const { - return Reg != 0 && !TargetRegisterInfo::isStackSlot(Reg) && - TargetRegisterInfo::isVirtualRegister(Reg); + return Reg != 0 && !llvm::Register::isStackSlot(Reg) && + llvm::Register::isVirtualRegister(Reg); } bool isSlot() const { - return Reg != 0 && TargetRegisterInfo::isStackSlot(Reg); + return Reg != 0 && llvm::Register::isStackSlot(Reg); } operator MachineOperand() const { if (isVReg()) return MachineOperand::CreateReg(Reg, /*Def*/false, /*Imp*/false, /*Kill*/false, /*Dead*/false, /*Undef*/false, /*EarlyClobber*/false, Sub); - if (TargetRegisterInfo::isStackSlot(Reg)) { - int FI = TargetRegisterInfo::stackSlot2Index(Reg); + if (llvm::Register::isStackSlot(Reg)) { + int FI = llvm::Register::stackSlot2Index(Reg); return MachineOperand::CreateFI(FI); } llvm_unreachable("Cannot create MachineOperand"); @@ -1524,7 +1525,7 @@ void HCE::calculatePlacement(const ExtenderInit &ExtI, const IndexList &Refs, } HCE::Register HCE::insertInitializer(Loc DefL, const ExtenderInit &ExtI) { - unsigned DefR = MRI->createVirtualRegister(&Hexagon::IntRegsRegClass); + llvm::Register DefR = MRI->createVirtualRegister(&Hexagon::IntRegsRegClass); MachineBasicBlock &MBB = *DefL.Block; MachineBasicBlock::iterator At = DefL.At; DebugLoc dl = DefL.Block->findDebugLoc(DefL.At); diff --git a/lib/Target/Hexagon/HexagonConstPropagation.cpp b/lib/Target/Hexagon/HexagonConstPropagation.cpp index d1fde5da5fe8..a82501cabb9b 100644 --- a/lib/Target/Hexagon/HexagonConstPropagation.cpp +++ b/lib/Target/Hexagon/HexagonConstPropagation.cpp @@ -208,14 +208,14 @@ namespace { bool has(unsigned R) const { // All non-virtual registers are considered "bottom". - if (!TargetRegisterInfo::isVirtualRegister(R)) + if (!Register::isVirtualRegister(R)) return true; MapType::const_iterator F = Map.find(R); return F != Map.end(); } const LatticeCell &get(unsigned R) const { - if (!TargetRegisterInfo::isVirtualRegister(R)) + if (!Register::isVirtualRegister(R)) return Bottom; MapType::const_iterator F = Map.find(R); if (F != Map.end()) @@ -623,7 +623,7 @@ void MachineConstPropagator::visitPHI(const MachineInstr &PN) { const MachineOperand &MD = PN.getOperand(0); RegisterSubReg DefR(MD); - assert(TargetRegisterInfo::isVirtualRegister(DefR.Reg)); + assert(Register::isVirtualRegister(DefR.Reg)); bool Changed = false; @@ -652,7 +652,7 @@ Bottomize: RegisterSubReg UseR(SO); // If the input is not a virtual register, we don't really know what // value it holds. 
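The HexagonBranchRelaxation hunk above replaces the old log2-based mask arithmetic with alignTo() on the block's Align. A standalone check, assuming a power-of-two alignment and with LogAlign standing in for the old log2-encoded value, that the two forms pad to the same offsets:

    #include <cassert>
    #include <cstdint>

    // Same rounding alignTo() performs for a power-of-two alignment.
    uint64_t alignToPow2(uint64_t Offset, uint64_t Align) {
      return (Offset + Align - 1) / Align * Align;
    }

    int main() {
      for (unsigned LogAlign = 1; LogAlign <= 4; ++LogAlign) {
        uint64_t ByteAlign = (1u << LogAlign) - 1;   // old mask form
        for (uint64_t Off = 0; Off < 64; ++Off)
          assert(((Off + ByteAlign) & ~ByteAlign) ==
                 alignToPow2(Off, uint64_t(1) << LogAlign));
      }
      return 0;
    }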
- if (!TargetRegisterInfo::isVirtualRegister(UseR.Reg)) + if (!Register::isVirtualRegister(UseR.Reg)) goto Bottomize; // If there is no cell for an input register, it means top. if (!Cells.has(UseR.Reg)) @@ -694,7 +694,7 @@ void MachineConstPropagator::visitNonBranch(const MachineInstr &MI) { continue; RegisterSubReg DefR(MO); // Only track virtual registers. - if (!TargetRegisterInfo::isVirtualRegister(DefR.Reg)) + if (!Register::isVirtualRegister(DefR.Reg)) continue; bool Changed = false; // If the evaluation failed, set cells for all output registers to bottom. @@ -1070,7 +1070,7 @@ bool MachineConstPropagator::run(MachineFunction &MF) { bool MachineConstEvaluator::getCell(const RegisterSubReg &R, const CellMap &Inputs, LatticeCell &RC) { - if (!TargetRegisterInfo::isVirtualRegister(R.Reg)) + if (!Register::isVirtualRegister(R.Reg)) return false; const LatticeCell &L = Inputs.get(R.Reg); if (!R.SubReg) { @@ -1926,7 +1926,7 @@ bool HexagonConstEvaluator::evaluate(const MachineInstr &MI, unsigned Opc = MI.getOpcode(); RegisterSubReg DefR(MD); assert(!DefR.SubReg); - if (!TargetRegisterInfo::isVirtualRegister(DefR.Reg)) + if (!Register::isVirtualRegister(DefR.Reg)) return false; if (MI.isCopy()) { @@ -2793,7 +2793,7 @@ bool HexagonConstEvaluator::rewriteHexConstDefs(MachineInstr &MI, if (!MO.isReg() || !MO.isUse() || MO.isImplicit()) continue; RegisterSubReg R(MO); - if (!TargetRegisterInfo::isVirtualRegister(R.Reg)) + if (!Register::isVirtualRegister(R.Reg)) continue; HasUse = true; // PHIs can legitimately have "top" cells after propagation. @@ -2813,7 +2813,7 @@ bool HexagonConstEvaluator::rewriteHexConstDefs(MachineInstr &MI, for (const MachineOperand &MO : MI.operands()) { if (!MO.isReg() || !MO.isUse() || MO.isImplicit()) continue; - unsigned R = MO.getReg(); + Register R = MO.getReg(); dbgs() << printReg(R, &TRI) << ": " << Inputs.get(R) << "\n"; } } @@ -2831,8 +2831,8 @@ bool HexagonConstEvaluator::rewriteHexConstDefs(MachineInstr &MI, for (const MachineOperand &MO : MI.operands()) { if (!MO.isReg() || !MO.isDef()) continue; - unsigned R = MO.getReg(); - if (!TargetRegisterInfo::isVirtualRegister(R)) + Register R = MO.getReg(); + if (!Register::isVirtualRegister(R)) continue; assert(!MO.getSubReg()); assert(Inputs.has(R)); @@ -2871,7 +2871,7 @@ bool HexagonConstEvaluator::rewriteHexConstDefs(MachineInstr &MI, const MCInstrDesc *NewD = (Ps & P::Zero) ? 
&HII.get(Hexagon::PS_false) : &HII.get(Hexagon::PS_true); - unsigned NewR = MRI->createVirtualRegister(PredRC); + Register NewR = MRI->createVirtualRegister(PredRC); const MachineInstrBuilder &MIB = BuildMI(B, At, DL, *NewD, NewR); (void)MIB; #ifndef NDEBUG @@ -2893,7 +2893,7 @@ bool HexagonConstEvaluator::rewriteHexConstDefs(MachineInstr &MI, NewRC = &Hexagon::IntRegsRegClass; else NewRC = &Hexagon::DoubleRegsRegClass; - unsigned NewR = MRI->createVirtualRegister(NewRC); + Register NewR = MRI->createVirtualRegister(NewRC); const MachineInstr *NewMI; if (W == 32) { @@ -3009,7 +3009,7 @@ bool HexagonConstEvaluator::rewriteHexConstUses(MachineInstr &MI, if (V < 0) V = -V; const TargetRegisterClass *RC = MRI->getRegClass(DefR.Reg); - unsigned NewR = MRI->createVirtualRegister(RC); + Register NewR = MRI->createVirtualRegister(RC); const MachineOperand &Src1 = MI.getOperand(1); NewMI = BuildMI(B, At, DL, D, NewR) .addReg(Src1.getReg(), getRegState(Src1), Src1.getSubReg()) @@ -3111,8 +3111,8 @@ bool HexagonConstEvaluator::rewriteHexConstUses(MachineInstr &MI, void HexagonConstEvaluator::replaceAllRegUsesWith(unsigned FromReg, unsigned ToReg) { - assert(TargetRegisterInfo::isVirtualRegister(FromReg)); - assert(TargetRegisterInfo::isVirtualRegister(ToReg)); + assert(Register::isVirtualRegister(FromReg)); + assert(Register::isVirtualRegister(ToReg)); for (auto I = MRI->use_begin(FromReg), E = MRI->use_end(); I != E;) { MachineOperand &O = *I; ++I; diff --git a/lib/Target/Hexagon/HexagonCopyToCombine.cpp b/lib/Target/Hexagon/HexagonCopyToCombine.cpp index a09ccab483cf..394a329ac447 100644 --- a/lib/Target/Hexagon/HexagonCopyToCombine.cpp +++ b/lib/Target/Hexagon/HexagonCopyToCombine.cpp @@ -133,8 +133,8 @@ static bool isCombinableInstType(MachineInstr &MI, const HexagonInstrInfo *TII, const MachineOperand &Op1 = MI.getOperand(1); assert(Op0.isReg() && Op1.isReg()); - unsigned DestReg = Op0.getReg(); - unsigned SrcReg = Op1.getReg(); + Register DestReg = Op0.getReg(); + Register SrcReg = Op1.getReg(); return Hexagon::IntRegsRegClass.contains(DestReg) && Hexagon::IntRegsRegClass.contains(SrcReg); } @@ -146,7 +146,7 @@ static bool isCombinableInstType(MachineInstr &MI, const HexagonInstrInfo *TII, const MachineOperand &Op1 = MI.getOperand(1); assert(Op0.isReg()); - unsigned DestReg = Op0.getReg(); + Register DestReg = Op0.getReg(); // Ensure that TargetFlags are MO_NO_FLAG for a global. This is a // workaround for an ABI bug that prevents GOT relocations on combine // instructions @@ -226,7 +226,7 @@ static bool areCombinableOperations(const TargetRegisterInfo *TRI, } static bool isEvenReg(unsigned Reg) { - assert(TargetRegisterInfo::isPhysicalRegister(Reg)); + assert(Register::isPhysicalRegister(Reg)); if (Hexagon::IntRegsRegClass.contains(Reg)) return (Reg - Hexagon::R0) % 2 == 0; if (Hexagon::HvxVRRegClass.contains(Reg)) @@ -265,7 +265,7 @@ bool HexagonCopyToCombine::isSafeToMoveTogether(MachineInstr &I1, unsigned I1DestReg, unsigned I2DestReg, bool &DoInsertAtI1) { - unsigned I2UseReg = UseReg(I2.getOperand(1)); + Register I2UseReg = UseReg(I2.getOperand(1)); // It is not safe to move I1 and I2 into one combine if I2 has a true // dependence on I1. @@ -332,7 +332,7 @@ bool HexagonCopyToCombine::isSafeToMoveTogether(MachineInstr &I1, // At O3 we got better results (dhrystone) by being more conservative here. 
if (!ShouldCombineAggressively) End = std::next(MachineBasicBlock::iterator(I2)); - unsigned I1UseReg = UseReg(I1.getOperand(1)); + Register I1UseReg = UseReg(I1.getOperand(1)); // Track killed operands. If we move across an instruction that kills our // operand, we need to update the kill information on the moved I1. It kills // the operand now. @@ -410,7 +410,7 @@ HexagonCopyToCombine::findPotentialNewifiableTFRs(MachineBasicBlock &BB) { continue; // Look for the defining instruction. - unsigned Reg = Op.getReg(); + Register Reg = Op.getReg(); MachineInstr *DefInst = LastDef[Reg]; if (!DefInst) continue; @@ -442,7 +442,7 @@ HexagonCopyToCombine::findPotentialNewifiableTFRs(MachineBasicBlock &BB) { if (Op.isReg()) { if (!Op.isDef() || !Op.getReg()) continue; - unsigned Reg = Op.getReg(); + Register Reg = Op.getReg(); if (Hexagon::DoubleRegsRegClass.contains(Reg)) { for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs) LastDef[*SubRegs] = &MI; @@ -528,7 +528,7 @@ MachineInstr *HexagonCopyToCombine::findPairable(MachineInstr &I1, while (I2 != I1.getParent()->end() && I2->isDebugInstr()) ++I2; - unsigned I1DestReg = I1.getOperand(0).getReg(); + Register I1DestReg = I1.getOperand(0).getReg(); for (MachineBasicBlock::iterator End = I1.getParent()->end(); I2 != End; ++I2) { @@ -544,7 +544,7 @@ MachineInstr *HexagonCopyToCombine::findPairable(MachineInstr &I1, if (ShouldCombineAggressively && PotentiallyNewifiableTFR.count(&*I2)) continue; - unsigned I2DestReg = I2->getOperand(0).getReg(); + Register I2DestReg = I2->getOperand(0).getReg(); // Check that registers are adjacent and that the first destination register // is even. @@ -579,8 +579,8 @@ void HexagonCopyToCombine::combine(MachineInstr &I1, MachineInstr &I2, ++MI; // Figure out whether I1 or I2 goes into the lowreg part. - unsigned I1DestReg = I1.getOperand(0).getReg(); - unsigned I2DestReg = I2.getOperand(0).getReg(); + Register I1DestReg = I1.getOperand(0).getReg(); + Register I2DestReg = I2.getOperand(0).getReg(); bool IsI1Loreg = (I2DestReg - I1DestReg) == 1; unsigned LoRegDef = IsI1Loreg ? 
I1DestReg : I2DestReg; unsigned SubLo; @@ -758,7 +758,7 @@ void HexagonCopyToCombine::emitCombineIR(MachineBasicBlock::iterator &InsertPt, unsigned DoubleDestReg, MachineOperand &HiOperand, MachineOperand &LoOperand) { - unsigned LoReg = LoOperand.getReg(); + Register LoReg = LoOperand.getReg(); unsigned LoRegKillFlag = getKillRegState(LoOperand.isKill()); DebugLoc DL = InsertPt->getDebugLoc(); @@ -807,7 +807,7 @@ void HexagonCopyToCombine::emitCombineRI(MachineBasicBlock::iterator &InsertPt, MachineOperand &HiOperand, MachineOperand &LoOperand) { unsigned HiRegKillFlag = getKillRegState(HiOperand.isKill()); - unsigned HiReg = HiOperand.getReg(); + Register HiReg = HiOperand.getReg(); DebugLoc DL = InsertPt->getDebugLoc(); MachineBasicBlock *BB = InsertPt->getParent(); @@ -857,8 +857,8 @@ void HexagonCopyToCombine::emitCombineRR(MachineBasicBlock::iterator &InsertPt, MachineOperand &LoOperand) { unsigned LoRegKillFlag = getKillRegState(LoOperand.isKill()); unsigned HiRegKillFlag = getKillRegState(HiOperand.isKill()); - unsigned LoReg = LoOperand.getReg(); - unsigned HiReg = HiOperand.getReg(); + Register LoReg = LoOperand.getReg(); + Register HiReg = HiOperand.getReg(); DebugLoc DL = InsertPt->getDebugLoc(); MachineBasicBlock *BB = InsertPt->getParent(); diff --git a/lib/Target/Hexagon/HexagonDepMapAsm2Intrin.td b/lib/Target/Hexagon/HexagonDepMapAsm2Intrin.td index 2ce1419e4790..e4a2ba0ec29c 100644 --- a/lib/Target/Hexagon/HexagonDepMapAsm2Intrin.td +++ b/lib/Target/Hexagon/HexagonDepMapAsm2Intrin.td @@ -37,12 +37,12 @@ def: Pat<(int_hexagon_F2_sfmax IntRegs:$src1, IntRegs:$src2), (F2_sfmax IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; def: Pat<(int_hexagon_A2_vabswsat DoubleRegs:$src1), (A2_vabswsat DoubleRegs:$src1)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_S2_asr_i_r IntRegs:$src1, u5_0ImmPred:$src2), - (S2_asr_i_r IntRegs:$src1, u5_0ImmPred:$src2)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_S2_asr_i_p DoubleRegs:$src1, u6_0ImmPred:$src2), - (S2_asr_i_p DoubleRegs:$src1, u6_0ImmPred:$src2)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_A4_combineri IntRegs:$src1, s32_0ImmPred:$src2), - (A4_combineri IntRegs:$src1, s32_0ImmPred:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_asr_i_r IntRegs:$src1, u5_0ImmPred_timm:$src2), + (S2_asr_i_r IntRegs:$src1, u5_0ImmPred_timm:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_asr_i_p DoubleRegs:$src1, u6_0ImmPred_timm:$src2), + (S2_asr_i_p DoubleRegs:$src1, u6_0ImmPred_timm:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A4_combineri IntRegs:$src1, s32_0ImmPred_timm:$src2), + (A4_combineri IntRegs:$src1, s32_0ImmPred_timm:$src2)>, Requires<[HasV5]>; def: Pat<(int_hexagon_M2_mpy_nac_sat_hl_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3), (M2_mpy_nac_sat_hl_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; def: Pat<(int_hexagon_M4_vpmpyh_acc DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3), @@ -75,8 +75,8 @@ def: Pat<(int_hexagon_A2_vaddws DoubleRegs:$src1, DoubleRegs:$src2), (A2_vaddws DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; def: Pat<(int_hexagon_A2_maxup DoubleRegs:$src1, DoubleRegs:$src2), (A2_maxup DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_A4_vcmphgti DoubleRegs:$src1, s8_0ImmPred:$src2), - (A4_vcmphgti DoubleRegs:$src1, s8_0ImmPred:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A4_vcmphgti DoubleRegs:$src1, s8_0ImmPred_timm:$src2), + (A4_vcmphgti DoubleRegs:$src1, s8_0ImmPred_timm:$src2)>, Requires<[HasV5]>; def: Pat<(int_hexagon_S2_interleave 
DoubleRegs:$src1), (S2_interleave DoubleRegs:$src1)>, Requires<[HasV5]>; def: Pat<(int_hexagon_M2_vrcmpyi_s0 DoubleRegs:$src1, DoubleRegs:$src2), @@ -89,10 +89,10 @@ def: Pat<(int_hexagon_C2_cmpgtu IntRegs:$src1, IntRegs:$src2), (C2_cmpgtu IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; def: Pat<(int_hexagon_C2_cmpgtp DoubleRegs:$src1, DoubleRegs:$src2), (C2_cmpgtp DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_A4_cmphgtui IntRegs:$src1, u32_0ImmPred:$src2), - (A4_cmphgtui IntRegs:$src1, u32_0ImmPred:$src2)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_C2_cmpgti IntRegs:$src1, s32_0ImmPred:$src2), - (C2_cmpgti IntRegs:$src1, s32_0ImmPred:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A4_cmphgtui IntRegs:$src1, u32_0ImmPred_timm:$src2), + (A4_cmphgtui IntRegs:$src1, u32_0ImmPred_timm:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_C2_cmpgti IntRegs:$src1, s32_0ImmPred_timm:$src2), + (C2_cmpgti IntRegs:$src1, s32_0ImmPred_timm:$src2)>, Requires<[HasV5]>; def: Pat<(int_hexagon_M2_mpyi IntRegs:$src1, IntRegs:$src2), (M2_mpyi IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; def: Pat<(int_hexagon_F2_conv_df2uw_chop DoubleRegs:$src1), @@ -103,12 +103,12 @@ def: Pat<(int_hexagon_M2_mpy_lh_s1 IntRegs:$src1, IntRegs:$src2), (M2_mpy_lh_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; def: Pat<(int_hexagon_M2_mpy_lh_s0 IntRegs:$src1, IntRegs:$src2), (M2_mpy_lh_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_S2_lsr_i_r_xacc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3), - (S2_lsr_i_r_xacc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_lsr_i_r_xacc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3), + (S2_lsr_i_r_xacc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV5]>; def: Pat<(int_hexagon_S2_vrcnegh DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3), (S2_vrcnegh DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_S2_extractup DoubleRegs:$src1, u6_0ImmPred:$src2, u6_0ImmPred:$src3), - (S2_extractup DoubleRegs:$src1, u6_0ImmPred:$src2, u6_0ImmPred:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_extractup DoubleRegs:$src1, u6_0ImmPred_timm:$src2, u6_0ImmPred_timm:$src3), + (S2_extractup DoubleRegs:$src1, u6_0ImmPred_timm:$src2, u6_0ImmPred_timm:$src3)>, Requires<[HasV5]>; def: Pat<(int_hexagon_S4_ntstbit_r IntRegs:$src1, IntRegs:$src2), (S4_ntstbit_r IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; def: Pat<(int_hexagon_F2_conv_w2sf IntRegs:$src1), @@ -125,10 +125,10 @@ def: Pat<(int_hexagon_A4_cmpbgt IntRegs:$src1, IntRegs:$src2), (A4_cmpbgt IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; def: Pat<(int_hexagon_S2_asr_r_r_and IntRegs:$src1, IntRegs:$src2, IntRegs:$src3), (S2_asr_r_r_and IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_A4_rcmpneqi IntRegs:$src1, s32_0ImmPred:$src2), - (A4_rcmpneqi IntRegs:$src1, s32_0ImmPred:$src2)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_S2_asl_i_r_nac IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3), - (S2_asl_i_r_nac IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A4_rcmpneqi IntRegs:$src1, s32_0ImmPred_timm:$src2), + (A4_rcmpneqi IntRegs:$src1, s32_0ImmPred_timm:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_asl_i_r_nac IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3), + (S2_asl_i_r_nac IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV5]>; def: 
Pat<(int_hexagon_M2_subacc IntRegs:$src1, IntRegs:$src2, IntRegs:$src3), (M2_subacc IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; def: Pat<(int_hexagon_A2_orp DoubleRegs:$src1, DoubleRegs:$src2), @@ -137,28 +137,28 @@ def: Pat<(int_hexagon_M2_mpyu_up IntRegs:$src1, IntRegs:$src2), (M2_mpyu_up IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; def: Pat<(int_hexagon_M2_mpy_acc_sat_lh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3), (M2_mpy_acc_sat_lh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_S2_asr_i_vh DoubleRegs:$src1, u4_0ImmPred:$src2), - (S2_asr_i_vh DoubleRegs:$src1, u4_0ImmPred:$src2)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_S2_asr_i_vw DoubleRegs:$src1, u5_0ImmPred:$src2), - (S2_asr_i_vw DoubleRegs:$src1, u5_0ImmPred:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_asr_i_vh DoubleRegs:$src1, u4_0ImmPred_timm:$src2), + (S2_asr_i_vh DoubleRegs:$src1, u4_0ImmPred_timm:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_asr_i_vw DoubleRegs:$src1, u5_0ImmPred_timm:$src2), + (S2_asr_i_vw DoubleRegs:$src1, u5_0ImmPred_timm:$src2)>, Requires<[HasV5]>; def: Pat<(int_hexagon_A4_cmpbgtu IntRegs:$src1, IntRegs:$src2), (A4_cmpbgtu IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; def: Pat<(int_hexagon_A4_vcmpbeq_any DoubleRegs:$src1, DoubleRegs:$src2), (A4_vcmpbeq_any DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_A4_cmpbgti IntRegs:$src1, s8_0ImmPred:$src2), - (A4_cmpbgti IntRegs:$src1, s8_0ImmPred:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A4_cmpbgti IntRegs:$src1, s8_0ImmPred_timm:$src2), + (A4_cmpbgti IntRegs:$src1, s8_0ImmPred_timm:$src2)>, Requires<[HasV5]>; def: Pat<(int_hexagon_M2_mpyd_lh_s1 IntRegs:$src1, IntRegs:$src2), (M2_mpyd_lh_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; def: Pat<(int_hexagon_S2_asl_r_p_nac DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3), (S2_asl_r_p_nac DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_S2_lsr_i_r_nac IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3), - (S2_lsr_i_r_nac IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_lsr_i_r_nac IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3), + (S2_lsr_i_r_nac IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV5]>; def: Pat<(int_hexagon_A2_addsp IntRegs:$src1, DoubleRegs:$src2), (A2_addsp IntRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; def: Pat<(int_hexagon_S4_vxsubaddw DoubleRegs:$src1, DoubleRegs:$src2), (S4_vxsubaddw DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_A4_vcmpheqi DoubleRegs:$src1, s8_0ImmPred:$src2), - (A4_vcmpheqi DoubleRegs:$src1, s8_0ImmPred:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A4_vcmpheqi DoubleRegs:$src1, s8_0ImmPred_timm:$src2), + (A4_vcmpheqi DoubleRegs:$src1, s8_0ImmPred_timm:$src2)>, Requires<[HasV5]>; def: Pat<(int_hexagon_S4_vxsubaddh DoubleRegs:$src1, DoubleRegs:$src2), (S4_vxsubaddh DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; def: Pat<(int_hexagon_M4_pmpyw IntRegs:$src1, IntRegs:$src2), @@ -177,10 +177,10 @@ def: Pat<(int_hexagon_A2_pxorf PredRegs:$src1, IntRegs:$src2, IntRegs:$src3), (A2_pxorf PredRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; def: Pat<(int_hexagon_A2_vsubub DoubleRegs:$src1, DoubleRegs:$src2), (A2_vsubub DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_S2_asl_i_p DoubleRegs:$src1, u6_0ImmPred:$src2), - 
(S2_asl_i_p DoubleRegs:$src1, u6_0ImmPred:$src2)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_S2_asl_i_r IntRegs:$src1, u5_0ImmPred:$src2), - (S2_asl_i_r IntRegs:$src1, u5_0ImmPred:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_asl_i_p DoubleRegs:$src1, u6_0ImmPred_timm:$src2), + (S2_asl_i_p DoubleRegs:$src1, u6_0ImmPred_timm:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_asl_i_r IntRegs:$src1, u5_0ImmPred_timm:$src2), + (S2_asl_i_r IntRegs:$src1, u5_0ImmPred_timm:$src2)>, Requires<[HasV5]>; def: Pat<(int_hexagon_A4_vrminuw DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3), (A4_vrminuw DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; def: Pat<(int_hexagon_F2_sffma IntRegs:$src1, IntRegs:$src2, IntRegs:$src3), @@ -199,10 +199,10 @@ def: Pat<(int_hexagon_M4_vrmpyoh_s1 DoubleRegs:$src1, DoubleRegs:$src2), (M4_vrmpyoh_s1 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; def: Pat<(int_hexagon_C2_bitsset IntRegs:$src1, IntRegs:$src2), (C2_bitsset IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_M2_mpysip IntRegs:$src1, u32_0ImmPred:$src2), - (M2_mpysip IntRegs:$src1, u32_0ImmPred:$src2)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_M2_mpysin IntRegs:$src1, u8_0ImmPred:$src2), - (M2_mpysin IntRegs:$src1, u8_0ImmPred:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mpysip IntRegs:$src1, u32_0ImmPred_timm:$src2), + (M2_mpysip IntRegs:$src1, u32_0ImmPred_timm:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mpysin IntRegs:$src1, u8_0ImmPred_timm:$src2), + (M2_mpysin IntRegs:$src1, u8_0ImmPred_timm:$src2)>, Requires<[HasV5]>; def: Pat<(int_hexagon_A4_boundscheck IntRegs:$src1, DoubleRegs:$src2), (A4_boundscheck IntRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; def: Pat<(int_hexagon_M5_vrmpybuu DoubleRegs:$src1, DoubleRegs:$src2), @@ -225,10 +225,10 @@ def: Pat<(int_hexagon_F2_conv_ud2df DoubleRegs:$src1), (F2_conv_ud2df DoubleRegs:$src1)>, Requires<[HasV5]>; def: Pat<(int_hexagon_A2_vnavgw DoubleRegs:$src1, DoubleRegs:$src2), (A2_vnavgw DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_S2_asl_i_r_acc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3), - (S2_asl_i_r_acc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_S4_subi_lsr_ri u32_0ImmPred:$src1, IntRegs:$src2, u5_0ImmPred:$src3), - (S4_subi_lsr_ri u32_0ImmPred:$src1, IntRegs:$src2, u5_0ImmPred:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_asl_i_r_acc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3), + (S2_asl_i_r_acc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S4_subi_lsr_ri u32_0ImmPred_timm:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3), + (S4_subi_lsr_ri u32_0ImmPred_timm:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV5]>; def: Pat<(int_hexagon_S2_vzxthw IntRegs:$src1), (S2_vzxthw IntRegs:$src1)>, Requires<[HasV5]>; def: Pat<(int_hexagon_F2_sfadd IntRegs:$src1, IntRegs:$src2), @@ -241,12 +241,12 @@ def: Pat<(int_hexagon_M2_vmac2su_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$sr (M2_vmac2su_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; def: Pat<(int_hexagon_M2_dpmpyss_s0 IntRegs:$src1, IntRegs:$src2), (M2_dpmpyss_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_S2_insert IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3, u5_0ImmPred:$src4), - (S2_insert IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3, u5_0ImmPred:$src4)>, Requires<[HasV5]>; +def: 
Pat<(int_hexagon_S2_insert IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3, u5_0ImmPred_timm:$src4), + (S2_insert IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3, u5_0ImmPred_timm:$src4)>, Requires<[HasV5]>; def: Pat<(int_hexagon_S2_packhl IntRegs:$src1, IntRegs:$src2), (S2_packhl IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_A4_vcmpwgti DoubleRegs:$src1, s8_0ImmPred:$src2), - (A4_vcmpwgti DoubleRegs:$src1, s8_0ImmPred:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A4_vcmpwgti DoubleRegs:$src1, s8_0ImmPred_timm:$src2), + (A4_vcmpwgti DoubleRegs:$src1, s8_0ImmPred_timm:$src2)>, Requires<[HasV5]>; def: Pat<(int_hexagon_A2_vavguwr DoubleRegs:$src1, DoubleRegs:$src2), (A2_vavguwr DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; def: Pat<(int_hexagon_S2_asl_r_r_and IntRegs:$src1, IntRegs:$src2, IntRegs:$src3), @@ -259,8 +259,8 @@ def: Pat<(int_hexagon_M4_and_and IntRegs:$src1, IntRegs:$src2, IntRegs:$src3), (M4_and_and IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; def: Pat<(int_hexagon_F2_conv_d2df DoubleRegs:$src1), (F2_conv_d2df DoubleRegs:$src1)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_C2_cmpgtui IntRegs:$src1, u32_0ImmPred:$src2), - (C2_cmpgtui IntRegs:$src1, u32_0ImmPred:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_C2_cmpgtui IntRegs:$src1, u32_0ImmPred_timm:$src2), + (C2_cmpgtui IntRegs:$src1, u32_0ImmPred_timm:$src2)>, Requires<[HasV5]>; def: Pat<(int_hexagon_A2_vconj DoubleRegs:$src1), (A2_vconj DoubleRegs:$src1)>, Requires<[HasV5]>; def: Pat<(int_hexagon_S2_lsr_r_vw DoubleRegs:$src1, IntRegs:$src2), @@ -279,8 +279,8 @@ def: Pat<(int_hexagon_C2_any8 PredRegs:$src1), (C2_any8 PredRegs:$src1)>, Requires<[HasV5]>; def: Pat<(int_hexagon_S2_togglebit_r IntRegs:$src1, IntRegs:$src2), (S2_togglebit_r IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_S2_togglebit_i IntRegs:$src1, u5_0ImmPred:$src2), - (S2_togglebit_i IntRegs:$src1, u5_0ImmPred:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_togglebit_i IntRegs:$src1, u5_0ImmPred_timm:$src2), + (S2_togglebit_i IntRegs:$src1, u5_0ImmPred_timm:$src2)>, Requires<[HasV5]>; def: Pat<(int_hexagon_F2_conv_uw2sf IntRegs:$src1), (F2_conv_uw2sf IntRegs:$src1)>, Requires<[HasV5]>; def: Pat<(int_hexagon_S2_vsathb_nopack DoubleRegs:$src1), @@ -303,10 +303,10 @@ def: Pat<(int_hexagon_C4_or_andn PredRegs:$src1, PredRegs:$src2, PredRegs:$src3) (C4_or_andn PredRegs:$src1, PredRegs:$src2, PredRegs:$src3)>, Requires<[HasV5]>; def: Pat<(int_hexagon_S2_asl_r_r_nac IntRegs:$src1, IntRegs:$src2, IntRegs:$src3), (S2_asl_r_r_nac IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_S2_asl_i_p_acc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3), - (S2_asl_i_p_acc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_A4_vcmpwgtui DoubleRegs:$src1, u7_0ImmPred:$src2), - (A4_vcmpwgtui DoubleRegs:$src1, u7_0ImmPred:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_asl_i_p_acc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3), + (S2_asl_i_p_acc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A4_vcmpwgtui DoubleRegs:$src1, u7_0ImmPred_timm:$src2), + (A4_vcmpwgtui DoubleRegs:$src1, u7_0ImmPred_timm:$src2)>, Requires<[HasV5]>; def: Pat<(int_hexagon_M4_vrmpyoh_acc_s0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3), (M4_vrmpyoh_acc_s0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>; def: 
Pat<(int_hexagon_M4_vrmpyoh_acc_s1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3), @@ -323,34 +323,34 @@ def: Pat<(int_hexagon_M2_vrcmacr_s0c DoubleRegs:$src1, DoubleRegs:$src2, DoubleR (M2_vrcmacr_s0c DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>; def: Pat<(int_hexagon_A2_vavgwcr DoubleRegs:$src1, DoubleRegs:$src2), (A2_vavgwcr DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_S2_asl_i_p_xacc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3), - (S2_asl_i_p_xacc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_asl_i_p_xacc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3), + (S2_asl_i_p_xacc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3)>, Requires<[HasV5]>; def: Pat<(int_hexagon_A4_vrmaxw DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3), (A4_vrmaxw DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; def: Pat<(int_hexagon_A2_vnavghr DoubleRegs:$src1, DoubleRegs:$src2), (A2_vnavghr DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; def: Pat<(int_hexagon_M4_cmpyi_wh DoubleRegs:$src1, IntRegs:$src2), (M4_cmpyi_wh DoubleRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_A2_tfrsi s32_0ImmPred:$src1), - (A2_tfrsi s32_0ImmPred:$src1)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_S2_asr_i_r_acc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3), - (S2_asr_i_r_acc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_tfrsi s32_0ImmPred_timm:$src1), + (A2_tfrsi s32_0ImmPred_timm:$src1)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_asr_i_r_acc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3), + (S2_asr_i_r_acc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV5]>; def: Pat<(int_hexagon_A2_svnavgh IntRegs:$src1, IntRegs:$src2), (A2_svnavgh IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_S2_lsr_i_r IntRegs:$src1, u5_0ImmPred:$src2), - (S2_lsr_i_r IntRegs:$src1, u5_0ImmPred:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_lsr_i_r IntRegs:$src1, u5_0ImmPred_timm:$src2), + (S2_lsr_i_r IntRegs:$src1, u5_0ImmPred_timm:$src2)>, Requires<[HasV5]>; def: Pat<(int_hexagon_M2_vmac2 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3), (M2_vmac2 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_A4_vcmphgtui DoubleRegs:$src1, u7_0ImmPred:$src2), - (A4_vcmphgtui DoubleRegs:$src1, u7_0ImmPred:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A4_vcmphgtui DoubleRegs:$src1, u7_0ImmPred_timm:$src2), + (A4_vcmphgtui DoubleRegs:$src1, u7_0ImmPred_timm:$src2)>, Requires<[HasV5]>; def: Pat<(int_hexagon_A2_svavgh IntRegs:$src1, IntRegs:$src2), (A2_svavgh IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; def: Pat<(int_hexagon_M4_vrmpyeh_acc_s0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3), (M4_vrmpyeh_acc_s0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>; def: Pat<(int_hexagon_M4_vrmpyeh_acc_s1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3), (M4_vrmpyeh_acc_s1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_S2_lsr_i_p DoubleRegs:$src1, u6_0ImmPred:$src2), - (S2_lsr_i_p DoubleRegs:$src1, u6_0ImmPred:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_lsr_i_p DoubleRegs:$src1, u6_0ImmPred_timm:$src2), + (S2_lsr_i_p DoubleRegs:$src1, u6_0ImmPred_timm:$src2)>, Requires<[HasV5]>; def: Pat<(int_hexagon_A2_combine_hl 
IntRegs:$src1, IntRegs:$src2), (A2_combine_hl IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; def: Pat<(int_hexagon_M2_mpy_up IntRegs:$src1, IntRegs:$src2), @@ -381,10 +381,10 @@ def: Pat<(int_hexagon_M2_cmacr_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3 (M2_cmacr_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; def: Pat<(int_hexagon_M4_or_and IntRegs:$src1, IntRegs:$src2, IntRegs:$src3), (M4_or_and IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_M4_mpyrr_addi u32_0ImmPred:$src1, IntRegs:$src2, IntRegs:$src3), - (M4_mpyrr_addi u32_0ImmPred:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_S4_or_andi IntRegs:$src1, IntRegs:$src2, s32_0ImmPred:$src3), - (S4_or_andi IntRegs:$src1, IntRegs:$src2, s32_0ImmPred:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M4_mpyrr_addi u32_0ImmPred_timm:$src1, IntRegs:$src2, IntRegs:$src3), + (M4_mpyrr_addi u32_0ImmPred_timm:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S4_or_andi IntRegs:$src1, IntRegs:$src2, s32_0ImmPred_timm:$src3), + (S4_or_andi IntRegs:$src1, IntRegs:$src2, s32_0ImmPred_timm:$src3)>, Requires<[HasV5]>; def: Pat<(int_hexagon_M2_mpy_sat_hl_s0 IntRegs:$src1, IntRegs:$src2), (M2_mpy_sat_hl_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; def: Pat<(int_hexagon_M2_mpy_sat_hl_s1 IntRegs:$src1, IntRegs:$src2), @@ -453,8 +453,8 @@ def: Pat<(int_hexagon_M2_mpy_rnd_hl_s1 IntRegs:$src1, IntRegs:$src2), (M2_mpy_rnd_hl_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; def: Pat<(int_hexagon_F2_sffms_lib IntRegs:$src1, IntRegs:$src2, IntRegs:$src3), (F2_sffms_lib IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_C4_cmpneqi IntRegs:$src1, s32_0ImmPred:$src2), - (C4_cmpneqi IntRegs:$src1, s32_0ImmPred:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_C4_cmpneqi IntRegs:$src1, s32_0ImmPred_timm:$src2), + (C4_cmpneqi IntRegs:$src1, s32_0ImmPred_timm:$src2)>, Requires<[HasV5]>; def: Pat<(int_hexagon_M4_and_xor IntRegs:$src1, IntRegs:$src2, IntRegs:$src3), (M4_and_xor IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; def: Pat<(int_hexagon_A2_sat DoubleRegs:$src1), @@ -469,8 +469,8 @@ def: Pat<(int_hexagon_A2_svavghs IntRegs:$src1, IntRegs:$src2), (A2_svavghs IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; def: Pat<(int_hexagon_A2_vrsadub_acc DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3), (A2_vrsadub_acc DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_C2_bitsclri IntRegs:$src1, u6_0ImmPred:$src2), - (C2_bitsclri IntRegs:$src1, u6_0ImmPred:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_C2_bitsclri IntRegs:$src1, u6_0ImmPred_timm:$src2), + (C2_bitsclri IntRegs:$src1, u6_0ImmPred_timm:$src2)>, Requires<[HasV5]>; def: Pat<(int_hexagon_A2_subh_h16_sat_hh IntRegs:$src1, IntRegs:$src2), (A2_subh_h16_sat_hh IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; def: Pat<(int_hexagon_A2_subh_h16_sat_hl IntRegs:$src1, IntRegs:$src2), @@ -535,10 +535,10 @@ def: Pat<(int_hexagon_C2_vmux PredRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3 (C2_vmux PredRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>; def: Pat<(int_hexagon_S2_parityp DoubleRegs:$src1, DoubleRegs:$src2), (S2_parityp DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_S2_lsr_i_p_and DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3), - (S2_lsr_i_p_and DoubleRegs:$src1, DoubleRegs:$src2, 
u6_0ImmPred:$src3)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_S2_asr_i_r_or IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3), - (S2_asr_i_r_or IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_lsr_i_p_and DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3), + (S2_lsr_i_p_and DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_asr_i_r_or IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3), + (S2_asr_i_r_or IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV5]>; def: Pat<(int_hexagon_M2_mpyu_nac_ll_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3), (M2_mpyu_nac_ll_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; def: Pat<(int_hexagon_M2_mpyu_nac_ll_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3), @@ -557,30 +557,30 @@ def: Pat<(int_hexagon_M2_cnacsc_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src (M2_cnacsc_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; def: Pat<(int_hexagon_M2_cnacsc_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3), (M2_cnacsc_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_S4_subaddi IntRegs:$src1, s32_0ImmPred:$src2, IntRegs:$src3), - (S4_subaddi IntRegs:$src1, s32_0ImmPred:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S4_subaddi IntRegs:$src1, s32_0ImmPred_timm:$src2, IntRegs:$src3), + (S4_subaddi IntRegs:$src1, s32_0ImmPred_timm:$src2, IntRegs:$src3)>, Requires<[HasV5]>; def: Pat<(int_hexagon_M2_mpyud_nac_hl_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3), (M2_mpyud_nac_hl_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; def: Pat<(int_hexagon_M2_mpyud_nac_hl_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3), (M2_mpyud_nac_hl_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; def: Pat<(int_hexagon_S2_tstbit_r IntRegs:$src1, IntRegs:$src2), (S2_tstbit_r IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_S4_vrcrotate DoubleRegs:$src1, IntRegs:$src2, u2_0ImmPred:$src3), - (S4_vrcrotate DoubleRegs:$src1, IntRegs:$src2, u2_0ImmPred:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S4_vrcrotate DoubleRegs:$src1, IntRegs:$src2, u2_0ImmPred_timm:$src3), + (S4_vrcrotate DoubleRegs:$src1, IntRegs:$src2, u2_0ImmPred_timm:$src3)>, Requires<[HasV5]>; def: Pat<(int_hexagon_M2_mmachs_s1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3), (M2_mmachs_s1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>; def: Pat<(int_hexagon_M2_mmachs_s0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3), (M2_mmachs_s0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_S2_tstbit_i IntRegs:$src1, u5_0ImmPred:$src2), - (S2_tstbit_i IntRegs:$src1, u5_0ImmPred:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_tstbit_i IntRegs:$src1, u5_0ImmPred_timm:$src2), + (S2_tstbit_i IntRegs:$src1, u5_0ImmPred_timm:$src2)>, Requires<[HasV5]>; def: Pat<(int_hexagon_M2_mpy_up_s1 IntRegs:$src1, IntRegs:$src2), (M2_mpy_up_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; def: Pat<(int_hexagon_S2_extractu_rp IntRegs:$src1, DoubleRegs:$src2), (S2_extractu_rp IntRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; def: Pat<(int_hexagon_M2_mmpyuh_rs0 DoubleRegs:$src1, DoubleRegs:$src2), (M2_mmpyuh_rs0 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_S2_lsr_i_vw DoubleRegs:$src1, u5_0ImmPred:$src2), - 
(S2_lsr_i_vw DoubleRegs:$src1, u5_0ImmPred:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_lsr_i_vw DoubleRegs:$src1, u5_0ImmPred_timm:$src2), + (S2_lsr_i_vw DoubleRegs:$src1, u5_0ImmPred_timm:$src2)>, Requires<[HasV5]>; def: Pat<(int_hexagon_M2_mpy_rnd_ll_s0 IntRegs:$src1, IntRegs:$src2), (M2_mpy_rnd_ll_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; def: Pat<(int_hexagon_M2_mpy_rnd_ll_s1 IntRegs:$src1, IntRegs:$src2), @@ -605,14 +605,14 @@ def: Pat<(int_hexagon_F2_conv_w2df IntRegs:$src1), (F2_conv_w2df IntRegs:$src1)>, Requires<[HasV5]>; def: Pat<(int_hexagon_A2_subh_l16_sat_hl IntRegs:$src1, IntRegs:$src2), (A2_subh_l16_sat_hl IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_C2_cmpeqi IntRegs:$src1, s32_0ImmPred:$src2), - (C2_cmpeqi IntRegs:$src1, s32_0ImmPred:$src2)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_S2_asl_i_r_and IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3), - (S2_asl_i_r_and IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_C2_cmpeqi IntRegs:$src1, s32_0ImmPred_timm:$src2), + (C2_cmpeqi IntRegs:$src1, s32_0ImmPred_timm:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_asl_i_r_and IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3), + (S2_asl_i_r_and IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV5]>; def: Pat<(int_hexagon_S2_vcnegh DoubleRegs:$src1, IntRegs:$src2), (S2_vcnegh DoubleRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_A4_vcmpweqi DoubleRegs:$src1, s8_0ImmPred:$src2), - (A4_vcmpweqi DoubleRegs:$src1, s8_0ImmPred:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A4_vcmpweqi DoubleRegs:$src1, s8_0ImmPred_timm:$src2), + (A4_vcmpweqi DoubleRegs:$src1, s8_0ImmPred_timm:$src2)>, Requires<[HasV5]>; def: Pat<(int_hexagon_M2_vdmpyrs_s0 DoubleRegs:$src1, DoubleRegs:$src2), (M2_vdmpyrs_s0 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; def: Pat<(int_hexagon_M2_vdmpyrs_s1 DoubleRegs:$src1, DoubleRegs:$src2), @@ -633,8 +633,8 @@ def: Pat<(int_hexagon_S2_asl_r_r_acc IntRegs:$src1, IntRegs:$src2, IntRegs:$src3 (S2_asl_r_r_acc IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; def: Pat<(int_hexagon_S2_cl0p DoubleRegs:$src1), (S2_cl0p DoubleRegs:$src1)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_S2_valignib DoubleRegs:$src1, DoubleRegs:$src2, u3_0ImmPred:$src3), - (S2_valignib DoubleRegs:$src1, DoubleRegs:$src2, u3_0ImmPred:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_valignib DoubleRegs:$src1, DoubleRegs:$src2, u3_0ImmPred_timm:$src3), + (S2_valignib DoubleRegs:$src1, DoubleRegs:$src2, u3_0ImmPred_timm:$src3)>, Requires<[HasV5]>; def: Pat<(int_hexagon_F2_sffixupd IntRegs:$src1, IntRegs:$src2), (F2_sffixupd IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; def: Pat<(int_hexagon_M2_mpy_sat_rnd_hl_s1 IntRegs:$src1, IntRegs:$src2), @@ -653,8 +653,8 @@ def: Pat<(int_hexagon_M2_dpmpyuu_nac_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs (M2_dpmpyuu_nac_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; def: Pat<(int_hexagon_M2_mmpyul_rs1 DoubleRegs:$src1, DoubleRegs:$src2), (M2_mmpyul_rs1 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_S4_ntstbit_i IntRegs:$src1, u5_0ImmPred:$src2), - (S4_ntstbit_i IntRegs:$src1, u5_0ImmPred:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S4_ntstbit_i IntRegs:$src1, u5_0ImmPred_timm:$src2), + (S4_ntstbit_i IntRegs:$src1, u5_0ImmPred_timm:$src2)>, Requires<[HasV5]>; def: Pat<(int_hexagon_F2_sffixupr IntRegs:$src1), (F2_sffixupr 
IntRegs:$src1)>, Requires<[HasV5]>; def: Pat<(int_hexagon_S2_asr_r_p_xor DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3), @@ -669,32 +669,32 @@ def: Pat<(int_hexagon_C2_andn PredRegs:$src1, PredRegs:$src2), (C2_andn PredRegs:$src1, PredRegs:$src2)>, Requires<[HasV5]>; def: Pat<(int_hexagon_M2_vmpy2s_s0pack IntRegs:$src1, IntRegs:$src2), (M2_vmpy2s_s0pack IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_S4_addaddi IntRegs:$src1, IntRegs:$src2, s32_0ImmPred:$src3), - (S4_addaddi IntRegs:$src1, IntRegs:$src2, s32_0ImmPred:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S4_addaddi IntRegs:$src1, IntRegs:$src2, s32_0ImmPred_timm:$src3), + (S4_addaddi IntRegs:$src1, IntRegs:$src2, s32_0ImmPred_timm:$src3)>, Requires<[HasV5]>; def: Pat<(int_hexagon_M2_mpyd_acc_ll_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3), (M2_mpyd_acc_ll_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; def: Pat<(int_hexagon_M2_mpy_acc_sat_hl_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3), (M2_mpy_acc_sat_hl_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_A4_rcmpeqi IntRegs:$src1, s32_0ImmPred:$src2), - (A4_rcmpeqi IntRegs:$src1, s32_0ImmPred:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A4_rcmpeqi IntRegs:$src1, s32_0ImmPred_timm:$src2), + (A4_rcmpeqi IntRegs:$src1, s32_0ImmPred_timm:$src2)>, Requires<[HasV5]>; def: Pat<(int_hexagon_M4_xor_and IntRegs:$src1, IntRegs:$src2, IntRegs:$src3), (M4_xor_and IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_S2_asl_i_p_and DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3), - (S2_asl_i_p_and DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_asl_i_p_and DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3), + (S2_asl_i_p_and DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3)>, Requires<[HasV5]>; def: Pat<(int_hexagon_M2_mmpyuh_rs1 DoubleRegs:$src1, DoubleRegs:$src2), (M2_mmpyuh_rs1 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; def: Pat<(int_hexagon_S2_asr_r_r_or IntRegs:$src1, IntRegs:$src2, IntRegs:$src3), (S2_asr_r_r_or IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_A4_round_ri IntRegs:$src1, u5_0ImmPred:$src2), - (A4_round_ri IntRegs:$src1, u5_0ImmPred:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A4_round_ri IntRegs:$src1, u5_0ImmPred_timm:$src2), + (A4_round_ri IntRegs:$src1, u5_0ImmPred_timm:$src2)>, Requires<[HasV5]>; def: Pat<(int_hexagon_A2_max IntRegs:$src1, IntRegs:$src2), (A2_max IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; def: Pat<(int_hexagon_A4_round_rr IntRegs:$src1, IntRegs:$src2), (A4_round_rr IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_A4_combineii s8_0ImmPred:$src1, u32_0ImmPred:$src2), - (A4_combineii s8_0ImmPred:$src1, u32_0ImmPred:$src2)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_A4_combineir s32_0ImmPred:$src1, IntRegs:$src2), - (A4_combineir s32_0ImmPred:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A4_combineii s8_0ImmPred_timm:$src1, u32_0ImmPred_timm:$src2), + (A4_combineii s8_0ImmPred_timm:$src1, u32_0ImmPred_timm:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A4_combineir s32_0ImmPred_timm:$src1, IntRegs:$src2), + (A4_combineir s32_0ImmPred_timm:$src1, IntRegs:$src2)>, Requires<[HasV5]>; def: Pat<(int_hexagon_C4_and_orn PredRegs:$src1, PredRegs:$src2, PredRegs:$src3), (C4_and_orn PredRegs:$src1, PredRegs:$src2, 
PredRegs:$src3)>, Requires<[HasV5]>; def: Pat<(int_hexagon_M5_vmacbuu DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3), @@ -703,8 +703,8 @@ def: Pat<(int_hexagon_A4_rcmpeq IntRegs:$src1, IntRegs:$src2), (A4_rcmpeq IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; def: Pat<(int_hexagon_M4_cmpyr_whc DoubleRegs:$src1, IntRegs:$src2), (M4_cmpyr_whc DoubleRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_S2_lsr_i_r_acc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3), - (S2_lsr_i_r_acc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_lsr_i_r_acc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3), + (S2_lsr_i_r_acc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV5]>; def: Pat<(int_hexagon_S2_vzxtbh IntRegs:$src1), (S2_vzxtbh IntRegs:$src1)>, Requires<[HasV5]>; def: Pat<(int_hexagon_M2_mmacuhs_rs1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3), @@ -721,8 +721,8 @@ def: Pat<(int_hexagon_M2_cmpyi_s0 IntRegs:$src1, IntRegs:$src2), (M2_cmpyi_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; def: Pat<(int_hexagon_S2_asl_r_p_or DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3), (S2_asl_r_p_or DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_S4_ori_asl_ri u32_0ImmPred:$src1, IntRegs:$src2, u5_0ImmPred:$src3), - (S4_ori_asl_ri u32_0ImmPred:$src1, IntRegs:$src2, u5_0ImmPred:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S4_ori_asl_ri u32_0ImmPred_timm:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3), + (S4_ori_asl_ri u32_0ImmPred_timm:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV5]>; def: Pat<(int_hexagon_C4_nbitsset IntRegs:$src1, IntRegs:$src2), (C4_nbitsset IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; def: Pat<(int_hexagon_M2_mpyu_acc_hh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3), @@ -745,10 +745,10 @@ def: Pat<(int_hexagon_M2_mpyd_acc_hh_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs (M2_mpyd_acc_hh_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; def: Pat<(int_hexagon_M2_mpyd_acc_hh_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3), (M2_mpyd_acc_hh_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_F2_sfimm_p u10_0ImmPred:$src1), - (F2_sfimm_p u10_0ImmPred:$src1)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_F2_sfimm_n u10_0ImmPred:$src1), - (F2_sfimm_n u10_0ImmPred:$src1)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_F2_sfimm_p u10_0ImmPred_timm:$src1), + (F2_sfimm_p u10_0ImmPred_timm:$src1)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_F2_sfimm_n u10_0ImmPred_timm:$src1), + (F2_sfimm_n u10_0ImmPred_timm:$src1)>, Requires<[HasV5]>; def: Pat<(int_hexagon_M4_cmpyr_wh DoubleRegs:$src1, IntRegs:$src2), (M4_cmpyr_wh DoubleRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; def: Pat<(int_hexagon_S2_lsl_r_p_and DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3), @@ -759,14 +759,14 @@ def: Pat<(int_hexagon_F2_conv_d2sf DoubleRegs:$src1), (F2_conv_d2sf DoubleRegs:$src1)>, Requires<[HasV5]>; def: Pat<(int_hexagon_A2_vavguh DoubleRegs:$src1, DoubleRegs:$src2), (A2_vavguh DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_A4_cmpbeqi IntRegs:$src1, u8_0ImmPred:$src2), - (A4_cmpbeqi IntRegs:$src1, u8_0ImmPred:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A4_cmpbeqi IntRegs:$src1, u8_0ImmPred_timm:$src2), + (A4_cmpbeqi IntRegs:$src1, u8_0ImmPred_timm:$src2)>, Requires<[HasV5]>; def: Pat<(int_hexagon_F2_sfcmpuo IntRegs:$src1, IntRegs:$src2), 
(F2_sfcmpuo IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; def: Pat<(int_hexagon_A2_vavguw DoubleRegs:$src1, DoubleRegs:$src2), (A2_vavguw DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_S2_asr_i_p_nac DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3), - (S2_asr_i_p_nac DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_asr_i_p_nac DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3), + (S2_asr_i_p_nac DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3)>, Requires<[HasV5]>; def: Pat<(int_hexagon_S2_vsatwh_nopack DoubleRegs:$src1), (S2_vsatwh_nopack DoubleRegs:$src1)>, Requires<[HasV5]>; def: Pat<(int_hexagon_M2_mpyd_hh_s0 IntRegs:$src1, IntRegs:$src2), @@ -783,8 +783,8 @@ def: Pat<(int_hexagon_M4_or_andn IntRegs:$src1, IntRegs:$src2, IntRegs:$src3), (M4_or_andn IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; def: Pat<(int_hexagon_A2_minp DoubleRegs:$src1, DoubleRegs:$src2), (A2_minp DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_S4_or_andix IntRegs:$src1, IntRegs:$src2, s32_0ImmPred:$src3), - (S4_or_andix IntRegs:$src1, IntRegs:$src2, s32_0ImmPred:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S4_or_andix IntRegs:$src1, IntRegs:$src2, s32_0ImmPred_timm:$src3), + (S4_or_andix IntRegs:$src1, IntRegs:$src2, s32_0ImmPred_timm:$src3)>, Requires<[HasV5]>; def: Pat<(int_hexagon_M2_mpy_rnd_lh_s0 IntRegs:$src1, IntRegs:$src2), (M2_mpy_rnd_lh_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; def: Pat<(int_hexagon_M2_mpy_rnd_lh_s1 IntRegs:$src1, IntRegs:$src2), @@ -817,16 +817,16 @@ def: Pat<(int_hexagon_S4_extract_rp IntRegs:$src1, DoubleRegs:$src2), (S4_extract_rp IntRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; def: Pat<(int_hexagon_S2_lsl_r_r_or IntRegs:$src1, IntRegs:$src2, IntRegs:$src3), (S2_lsl_r_r_or IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_C4_cmplteui IntRegs:$src1, u32_0ImmPred:$src2), - (C4_cmplteui IntRegs:$src1, u32_0ImmPred:$src2)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_S4_addi_lsr_ri u32_0ImmPred:$src1, IntRegs:$src2, u5_0ImmPred:$src3), - (S4_addi_lsr_ri u32_0ImmPred:$src1, IntRegs:$src2, u5_0ImmPred:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_C4_cmplteui IntRegs:$src1, u32_0ImmPred_timm:$src2), + (C4_cmplteui IntRegs:$src1, u32_0ImmPred_timm:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S4_addi_lsr_ri u32_0ImmPred_timm:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3), + (S4_addi_lsr_ri u32_0ImmPred_timm:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV5]>; def: Pat<(int_hexagon_A4_tfrcpp CtrRegs64:$src1), (A4_tfrcpp CtrRegs64:$src1)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_S2_asr_i_svw_trun DoubleRegs:$src1, u5_0ImmPred:$src2), - (S2_asr_i_svw_trun DoubleRegs:$src1, u5_0ImmPred:$src2)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_A4_cmphgti IntRegs:$src1, s32_0ImmPred:$src2), - (A4_cmphgti IntRegs:$src1, s32_0ImmPred:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_asr_i_svw_trun DoubleRegs:$src1, u5_0ImmPred_timm:$src2), + (S2_asr_i_svw_trun DoubleRegs:$src1, u5_0ImmPred_timm:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A4_cmphgti IntRegs:$src1, s32_0ImmPred_timm:$src2), + (A4_cmphgti IntRegs:$src1, s32_0ImmPred_timm:$src2)>, Requires<[HasV5]>; def: Pat<(int_hexagon_A4_vrminh DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3), (A4_vrminh DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; def: 
Pat<(int_hexagon_A4_vrminw DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3), @@ -837,8 +837,8 @@ def: Pat<(int_hexagon_S2_insertp_rp DoubleRegs:$src1, DoubleRegs:$src2, DoubleRe (S2_insertp_rp DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>; def: Pat<(int_hexagon_A2_vnavghcr DoubleRegs:$src1, DoubleRegs:$src2), (A2_vnavghcr DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_S4_subi_asl_ri u32_0ImmPred:$src1, IntRegs:$src2, u5_0ImmPred:$src3), - (S4_subi_asl_ri u32_0ImmPred:$src1, IntRegs:$src2, u5_0ImmPred:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S4_subi_asl_ri u32_0ImmPred_timm:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3), + (S4_subi_asl_ri u32_0ImmPred_timm:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV5]>; def: Pat<(int_hexagon_S2_lsl_r_vh DoubleRegs:$src1, IntRegs:$src2), (S2_lsl_r_vh DoubleRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; def: Pat<(int_hexagon_M2_mpy_hh_s0 IntRegs:$src1, IntRegs:$src2), @@ -851,14 +851,14 @@ def: Pat<(int_hexagon_S2_asl_r_p_xor DoubleRegs:$src1, DoubleRegs:$src2, IntRegs (S2_asl_r_p_xor DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; def: Pat<(int_hexagon_A2_satb IntRegs:$src1), (A2_satb IntRegs:$src1)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_S2_insertp DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3, u6_0ImmPred:$src4), - (S2_insertp DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3, u6_0ImmPred:$src4)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_insertp DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3, u6_0ImmPred_timm:$src4), + (S2_insertp DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3, u6_0ImmPred_timm:$src4)>, Requires<[HasV5]>; def: Pat<(int_hexagon_M2_mpyd_rnd_ll_s1 IntRegs:$src1, IntRegs:$src2), (M2_mpyd_rnd_ll_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; def: Pat<(int_hexagon_M2_mpyd_rnd_ll_s0 IntRegs:$src1, IntRegs:$src2), (M2_mpyd_rnd_ll_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_S2_lsr_i_p_nac DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3), - (S2_lsr_i_p_nac DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_lsr_i_p_nac DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3), + (S2_lsr_i_p_nac DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3)>, Requires<[HasV5]>; def: Pat<(int_hexagon_S2_extractup_rp DoubleRegs:$src1, DoubleRegs:$src2), (S2_extractup_rp DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; def: Pat<(int_hexagon_S4_vxaddsubw DoubleRegs:$src1, DoubleRegs:$src2), @@ -925,8 +925,8 @@ def: Pat<(int_hexagon_M2_cmpyr_s0 IntRegs:$src1, IntRegs:$src2), (M2_cmpyr_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; def: Pat<(int_hexagon_M2_dpmpyss_rnd_s0 IntRegs:$src1, IntRegs:$src2), (M2_dpmpyss_rnd_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_C2_muxri PredRegs:$src1, s32_0ImmPred:$src2, IntRegs:$src3), - (C2_muxri PredRegs:$src1, s32_0ImmPred:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_C2_muxri PredRegs:$src1, s32_0ImmPred_timm:$src2, IntRegs:$src3), + (C2_muxri PredRegs:$src1, s32_0ImmPred_timm:$src2, IntRegs:$src3)>, Requires<[HasV5]>; def: Pat<(int_hexagon_M2_vmac2es_s0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3), (M2_vmac2es_s0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>; def: Pat<(int_hexagon_M2_vmac2es_s1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3), @@ -937,8 
+937,8 @@ def: Pat<(int_hexagon_M2_mpyu_lh_s1 IntRegs:$src1, IntRegs:$src2), (M2_mpyu_lh_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; def: Pat<(int_hexagon_M2_mpyu_lh_s0 IntRegs:$src1, IntRegs:$src2), (M2_mpyu_lh_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_S2_asl_i_r_or IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3), - (S2_asl_i_r_or IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_asl_i_r_or IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3), + (S2_asl_i_r_or IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV5]>; def: Pat<(int_hexagon_M2_mpyd_acc_hl_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3), (M2_mpyd_acc_hl_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; def: Pat<(int_hexagon_M2_mpyd_acc_hl_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3), @@ -947,8 +947,8 @@ def: Pat<(int_hexagon_S2_asr_r_p_nac DoubleRegs:$src1, DoubleRegs:$src2, IntRegs (S2_asr_r_p_nac DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; def: Pat<(int_hexagon_A2_vaddw DoubleRegs:$src1, DoubleRegs:$src2), (A2_vaddw DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_S2_asr_i_r_and IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3), - (S2_asr_i_r_and IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_asr_i_r_and IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3), + (S2_asr_i_r_and IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV5]>; def: Pat<(int_hexagon_A2_vaddh DoubleRegs:$src1, DoubleRegs:$src2), (A2_vaddh DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; def: Pat<(int_hexagon_M2_mpy_nac_sat_lh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3), @@ -957,16 +957,16 @@ def: Pat<(int_hexagon_M2_mpy_nac_sat_lh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs (M2_mpy_nac_sat_lh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; def: Pat<(int_hexagon_C2_cmpeqp DoubleRegs:$src1, DoubleRegs:$src2), (C2_cmpeqp DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_M4_mpyri_addi u32_0ImmPred:$src1, IntRegs:$src2, u6_0ImmPred:$src3), - (M4_mpyri_addi u32_0ImmPred:$src1, IntRegs:$src2, u6_0ImmPred:$src3)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_S4_andi_lsr_ri u32_0ImmPred:$src1, IntRegs:$src2, u5_0ImmPred:$src3), - (S4_andi_lsr_ri u32_0ImmPred:$src1, IntRegs:$src2, u5_0ImmPred:$src3)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_M2_macsip IntRegs:$src1, IntRegs:$src2, u32_0ImmPred:$src3), - (M2_macsip IntRegs:$src1, IntRegs:$src2, u32_0ImmPred:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M4_mpyri_addi u32_0ImmPred_timm:$src1, IntRegs:$src2, u6_0ImmPred_timm:$src3), + (M4_mpyri_addi u32_0ImmPred_timm:$src1, IntRegs:$src2, u6_0ImmPred_timm:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S4_andi_lsr_ri u32_0ImmPred_timm:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3), + (S4_andi_lsr_ri u32_0ImmPred_timm:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_macsip IntRegs:$src1, IntRegs:$src2, u32_0ImmPred_timm:$src3), + (M2_macsip IntRegs:$src1, IntRegs:$src2, u32_0ImmPred_timm:$src3)>, Requires<[HasV5]>; def: Pat<(int_hexagon_A2_tfrcrr CtrRegs:$src1), (A2_tfrcrr CtrRegs:$src1)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_M2_macsin IntRegs:$src1, IntRegs:$src2, u32_0ImmPred:$src3), - (M2_macsin IntRegs:$src1, IntRegs:$src2, u32_0ImmPred:$src3)>, Requires<[HasV5]>; +def: 
Pat<(int_hexagon_M2_macsin IntRegs:$src1, IntRegs:$src2, u32_0ImmPred_timm:$src3), + (M2_macsin IntRegs:$src1, IntRegs:$src2, u32_0ImmPred_timm:$src3)>, Requires<[HasV5]>; def: Pat<(int_hexagon_C2_orn PredRegs:$src1, PredRegs:$src2), (C2_orn PredRegs:$src1, PredRegs:$src2)>, Requires<[HasV5]>; def: Pat<(int_hexagon_M4_and_andn IntRegs:$src1, IntRegs:$src2, IntRegs:$src3), @@ -1005,8 +1005,8 @@ def: Pat<(int_hexagon_M2_vrcmpys_acc_s1 DoubleRegs:$src1, DoubleRegs:$src2, IntR (M2_vrcmpys_acc_s1 DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; def: Pat<(int_hexagon_F2_dfcmpge DoubleRegs:$src1, DoubleRegs:$src2), (F2_dfcmpge DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_M2_accii IntRegs:$src1, IntRegs:$src2, s32_0ImmPred:$src3), - (M2_accii IntRegs:$src1, IntRegs:$src2, s32_0ImmPred:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_accii IntRegs:$src1, IntRegs:$src2, s32_0ImmPred_timm:$src3), + (M2_accii IntRegs:$src1, IntRegs:$src2, s32_0ImmPred_timm:$src3)>, Requires<[HasV5]>; def: Pat<(int_hexagon_A5_vaddhubs DoubleRegs:$src1, DoubleRegs:$src2), (A5_vaddhubs DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; def: Pat<(int_hexagon_A2_vmaxw DoubleRegs:$src1, DoubleRegs:$src2), @@ -1017,10 +1017,10 @@ def: Pat<(int_hexagon_A2_vmaxh DoubleRegs:$src1, DoubleRegs:$src2), (A2_vmaxh DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; def: Pat<(int_hexagon_S2_vsxthw IntRegs:$src1), (S2_vsxthw IntRegs:$src1)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_S4_andi_asl_ri u32_0ImmPred:$src1, IntRegs:$src2, u5_0ImmPred:$src3), - (S4_andi_asl_ri u32_0ImmPred:$src1, IntRegs:$src2, u5_0ImmPred:$src3)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_S2_asl_i_p_nac DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3), - (S2_asl_i_p_nac DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S4_andi_asl_ri u32_0ImmPred_timm:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3), + (S4_andi_asl_ri u32_0ImmPred_timm:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_asl_i_p_nac DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3), + (S2_asl_i_p_nac DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3)>, Requires<[HasV5]>; def: Pat<(int_hexagon_S2_lsl_r_p_xor DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3), (S2_lsl_r_p_xor DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; def: Pat<(int_hexagon_C2_cmpgt IntRegs:$src1, IntRegs:$src2), @@ -1035,22 +1035,22 @@ def: Pat<(int_hexagon_F2_conv_sf2w IntRegs:$src1), (F2_conv_sf2w IntRegs:$src1)>, Requires<[HasV5]>; def: Pat<(int_hexagon_S2_lsr_r_p_or DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3), (S2_lsr_r_p_or DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_F2_sfclass IntRegs:$src1, u5_0ImmPred:$src2), - (F2_sfclass IntRegs:$src1, u5_0ImmPred:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_F2_sfclass IntRegs:$src1, u5_0ImmPred_timm:$src2), + (F2_sfclass IntRegs:$src1, u5_0ImmPred_timm:$src2)>, Requires<[HasV5]>; def: Pat<(int_hexagon_M2_mpyud_acc_lh_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3), (M2_mpyud_acc_lh_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; def: Pat<(int_hexagon_M4_xor_andn IntRegs:$src1, IntRegs:$src2, IntRegs:$src3), (M4_xor_andn IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_S2_addasl_rrri IntRegs:$src1, IntRegs:$src2, u3_0ImmPred:$src3), - 
(S2_addasl_rrri IntRegs:$src1, IntRegs:$src2, u3_0ImmPred:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_addasl_rrri IntRegs:$src1, IntRegs:$src2, u3_0ImmPred_timm:$src3), + (S2_addasl_rrri IntRegs:$src1, IntRegs:$src2, u3_0ImmPred_timm:$src3)>, Requires<[HasV5]>; def: Pat<(int_hexagon_M5_vdmpybsu DoubleRegs:$src1, DoubleRegs:$src2), (M5_vdmpybsu DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; def: Pat<(int_hexagon_M2_mpyu_nac_hh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3), (M2_mpyu_nac_hh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; def: Pat<(int_hexagon_M2_mpyu_nac_hh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3), (M2_mpyu_nac_hh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_A2_addi IntRegs:$src1, s32_0ImmPred:$src2), - (A2_addi IntRegs:$src1, s32_0ImmPred:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_addi IntRegs:$src1, s32_0ImmPred_timm:$src2), + (A2_addi IntRegs:$src1, s32_0ImmPred_timm:$src2)>, Requires<[HasV5]>; def: Pat<(int_hexagon_A2_addp DoubleRegs:$src1, DoubleRegs:$src2), (A2_addp DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; def: Pat<(int_hexagon_M2_vmpy2s_s1pack IntRegs:$src1, IntRegs:$src2), @@ -1063,8 +1063,8 @@ def: Pat<(int_hexagon_M2_nacci IntRegs:$src1, IntRegs:$src2, IntRegs:$src3), (M2_nacci IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; def: Pat<(int_hexagon_S2_shuffeh DoubleRegs:$src1, DoubleRegs:$src2), (S2_shuffeh DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_S2_lsr_i_r_and IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3), - (S2_lsr_i_r_and IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_lsr_i_r_and IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3), + (S2_lsr_i_r_and IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV5]>; def: Pat<(int_hexagon_M2_mpy_sat_rnd_hh_s1 IntRegs:$src1, IntRegs:$src2), (M2_mpy_sat_rnd_hh_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; def: Pat<(int_hexagon_M2_mpy_sat_rnd_hh_s0 IntRegs:$src1, IntRegs:$src2), @@ -1131,12 +1131,12 @@ def: Pat<(int_hexagon_C2_and PredRegs:$src1, PredRegs:$src2), (C2_and PredRegs:$src1, PredRegs:$src2)>, Requires<[HasV5]>; def: Pat<(int_hexagon_S5_popcountp DoubleRegs:$src1), (S5_popcountp DoubleRegs:$src1)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_S4_extractp DoubleRegs:$src1, u6_0ImmPred:$src2, u6_0ImmPred:$src3), - (S4_extractp DoubleRegs:$src1, u6_0ImmPred:$src2, u6_0ImmPred:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S4_extractp DoubleRegs:$src1, u6_0ImmPred_timm:$src2, u6_0ImmPred_timm:$src3), + (S4_extractp DoubleRegs:$src1, u6_0ImmPred_timm:$src2, u6_0ImmPred_timm:$src3)>, Requires<[HasV5]>; def: Pat<(int_hexagon_S2_cl0 IntRegs:$src1), (S2_cl0 IntRegs:$src1)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_A4_vcmpbgti DoubleRegs:$src1, s8_0ImmPred:$src2), - (A4_vcmpbgti DoubleRegs:$src1, s8_0ImmPred:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A4_vcmpbgti DoubleRegs:$src1, s8_0ImmPred_timm:$src2), + (A4_vcmpbgti DoubleRegs:$src1, s8_0ImmPred_timm:$src2)>, Requires<[HasV5]>; def: Pat<(int_hexagon_M2_mmacls_s1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3), (M2_mmacls_s1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>; def: Pat<(int_hexagon_M2_mmacls_s0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3), @@ -1167,8 +1167,8 @@ def: Pat<(int_hexagon_M2_maci IntRegs:$src1, IntRegs:$src2, IntRegs:$src3), (M2_maci 
IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; def: Pat<(int_hexagon_A2_vmaxuh DoubleRegs:$src1, DoubleRegs:$src2), (A2_vmaxuh DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_A4_bitspliti IntRegs:$src1, u5_0ImmPred:$src2), - (A4_bitspliti IntRegs:$src1, u5_0ImmPred:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A4_bitspliti IntRegs:$src1, u5_0ImmPred_timm:$src2), + (A4_bitspliti IntRegs:$src1, u5_0ImmPred_timm:$src2)>, Requires<[HasV5]>; def: Pat<(int_hexagon_A2_vmaxub DoubleRegs:$src1, DoubleRegs:$src2), (A2_vmaxub DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; def: Pat<(int_hexagon_M2_mpyud_hh_s0 IntRegs:$src1, IntRegs:$src2), @@ -1185,26 +1185,26 @@ def: Pat<(int_hexagon_F2_conv_sf2d IntRegs:$src1), (F2_conv_sf2d IntRegs:$src1)>, Requires<[HasV5]>; def: Pat<(int_hexagon_S2_asr_r_r_nac IntRegs:$src1, IntRegs:$src2, IntRegs:$src3), (S2_asr_r_r_nac IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_F2_dfimm_n u10_0ImmPred:$src1), - (F2_dfimm_n u10_0ImmPred:$src1)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_F2_dfimm_n u10_0ImmPred_timm:$src1), + (F2_dfimm_n u10_0ImmPred_timm:$src1)>, Requires<[HasV5]>; def: Pat<(int_hexagon_A4_cmphgt IntRegs:$src1, IntRegs:$src2), (A4_cmphgt IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_F2_dfimm_p u10_0ImmPred:$src1), - (F2_dfimm_p u10_0ImmPred:$src1)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_F2_dfimm_p u10_0ImmPred_timm:$src1), + (F2_dfimm_p u10_0ImmPred_timm:$src1)>, Requires<[HasV5]>; def: Pat<(int_hexagon_M2_mpyud_acc_lh_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3), (M2_mpyud_acc_lh_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; def: Pat<(int_hexagon_M2_vcmpy_s1_sat_r DoubleRegs:$src1, DoubleRegs:$src2), (M2_vcmpy_s1_sat_r DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_M4_mpyri_addr_u2 IntRegs:$src1, u6_2ImmPred:$src2, IntRegs:$src3), - (M4_mpyri_addr_u2 IntRegs:$src1, u6_2ImmPred:$src2, IntRegs:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M4_mpyri_addr_u2 IntRegs:$src1, u6_2ImmPred_timm:$src2, IntRegs:$src3), + (M4_mpyri_addr_u2 IntRegs:$src1, u6_2ImmPred_timm:$src2, IntRegs:$src3)>, Requires<[HasV5]>; def: Pat<(int_hexagon_M2_vcmpy_s1_sat_i DoubleRegs:$src1, DoubleRegs:$src2), (M2_vcmpy_s1_sat_i DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; def: Pat<(int_hexagon_S2_lsl_r_p_nac DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3), (S2_lsl_r_p_nac DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; def: Pat<(int_hexagon_M5_vrmacbuu DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3), (M5_vrmacbuu DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_S2_vspliceib DoubleRegs:$src1, DoubleRegs:$src2, u3_0ImmPred:$src3), - (S2_vspliceib DoubleRegs:$src1, DoubleRegs:$src2, u3_0ImmPred:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_vspliceib DoubleRegs:$src1, DoubleRegs:$src2, u3_0ImmPred_timm:$src3), + (S2_vspliceib DoubleRegs:$src1, DoubleRegs:$src2, u3_0ImmPred_timm:$src3)>, Requires<[HasV5]>; def: Pat<(int_hexagon_M2_dpmpyss_acc_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3), (M2_dpmpyss_acc_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; def: Pat<(int_hexagon_M2_cnacs_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3), @@ -1215,20 +1215,20 @@ def: Pat<(int_hexagon_A2_maxu IntRegs:$src1, IntRegs:$src2), (A2_maxu IntRegs:$src1, IntRegs:$src2)>, 
Requires<[HasV5]>; def: Pat<(int_hexagon_A2_maxp DoubleRegs:$src1, DoubleRegs:$src2), (A2_maxp DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_A2_andir IntRegs:$src1, s32_0ImmPred:$src2), - (A2_andir IntRegs:$src1, s32_0ImmPred:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_andir IntRegs:$src1, s32_0ImmPred_timm:$src2), + (A2_andir IntRegs:$src1, s32_0ImmPred_timm:$src2)>, Requires<[HasV5]>; def: Pat<(int_hexagon_F2_sfrecipa IntRegs:$src1, IntRegs:$src2), (F2_sfrecipa IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_A2_combineii s32_0ImmPred:$src1, s8_0ImmPred:$src2), - (A2_combineii s32_0ImmPred:$src1, s8_0ImmPred:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_combineii s32_0ImmPred_timm:$src1, s8_0ImmPred_timm:$src2), + (A2_combineii s32_0ImmPred_timm:$src1, s8_0ImmPred_timm:$src2)>, Requires<[HasV5]>; def: Pat<(int_hexagon_A4_orn IntRegs:$src1, IntRegs:$src2), (A4_orn IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_A4_cmpbgtui IntRegs:$src1, u32_0ImmPred:$src2), - (A4_cmpbgtui IntRegs:$src1, u32_0ImmPred:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A4_cmpbgtui IntRegs:$src1, u32_0ImmPred_timm:$src2), + (A4_cmpbgtui IntRegs:$src1, u32_0ImmPred_timm:$src2)>, Requires<[HasV5]>; def: Pat<(int_hexagon_S2_lsr_r_r_or IntRegs:$src1, IntRegs:$src2, IntRegs:$src3), (S2_lsr_r_r_or IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_A4_vcmpbeqi DoubleRegs:$src1, u8_0ImmPred:$src2), - (A4_vcmpbeqi DoubleRegs:$src1, u8_0ImmPred:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A4_vcmpbeqi DoubleRegs:$src1, u8_0ImmPred_timm:$src2), + (A4_vcmpbeqi DoubleRegs:$src1, u8_0ImmPred_timm:$src2)>, Requires<[HasV5]>; def: Pat<(int_hexagon_S2_lsl_r_r IntRegs:$src1, IntRegs:$src2), (S2_lsl_r_r IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; def: Pat<(int_hexagon_S2_lsl_r_p DoubleRegs:$src1, IntRegs:$src2), @@ -1251,16 +1251,16 @@ def: Pat<(int_hexagon_A2_satub IntRegs:$src1), (A2_satub IntRegs:$src1)>, Requires<[HasV5]>; def: Pat<(int_hexagon_M2_vrcmpys_s1 DoubleRegs:$src1, IntRegs:$src2), (M2_vrcmpys_s1 DoubleRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_S4_or_ori IntRegs:$src1, IntRegs:$src2, s32_0ImmPred:$src3), - (S4_or_ori IntRegs:$src1, IntRegs:$src2, s32_0ImmPred:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S4_or_ori IntRegs:$src1, IntRegs:$src2, s32_0ImmPred_timm:$src3), + (S4_or_ori IntRegs:$src1, IntRegs:$src2, s32_0ImmPred_timm:$src3)>, Requires<[HasV5]>; def: Pat<(int_hexagon_C4_fastcorner9_not PredRegs:$src1, PredRegs:$src2), (C4_fastcorner9_not PredRegs:$src1, PredRegs:$src2)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_A2_tfrih IntRegs:$src1, u16_0ImmPred:$src2), - (A2_tfrih IntRegs:$src1, u16_0ImmPred:$src2)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_A2_tfril IntRegs:$src1, u16_0ImmPred:$src2), - (A2_tfril IntRegs:$src1, u16_0ImmPred:$src2)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_M4_mpyri_addr IntRegs:$src1, IntRegs:$src2, u32_0ImmPred:$src3), - (M4_mpyri_addr IntRegs:$src1, IntRegs:$src2, u32_0ImmPred:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_tfrih IntRegs:$src1, u16_0ImmPred_timm:$src2), + (A2_tfrih IntRegs:$src1, u16_0ImmPred_timm:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_tfril IntRegs:$src1, u16_0ImmPred_timm:$src2), + (A2_tfril IntRegs:$src1, u16_0ImmPred_timm:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M4_mpyri_addr IntRegs:$src1, IntRegs:$src2, u32_0ImmPred_timm:$src3), + 
(M4_mpyri_addr IntRegs:$src1, IntRegs:$src2, u32_0ImmPred_timm:$src3)>, Requires<[HasV5]>; def: Pat<(int_hexagon_S2_vtrunehb DoubleRegs:$src1), (S2_vtrunehb DoubleRegs:$src1)>, Requires<[HasV5]>; def: Pat<(int_hexagon_A2_vabsw DoubleRegs:$src1), @@ -1269,14 +1269,14 @@ def: Pat<(int_hexagon_A2_vabsh DoubleRegs:$src1), (A2_vabsh DoubleRegs:$src1)>, Requires<[HasV5]>; def: Pat<(int_hexagon_F2_sfsub IntRegs:$src1, IntRegs:$src2), (F2_sfsub IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_C2_muxii PredRegs:$src1, s32_0ImmPred:$src2, s8_0ImmPred:$src3), - (C2_muxii PredRegs:$src1, s32_0ImmPred:$src2, s8_0ImmPred:$src3)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_C2_muxir PredRegs:$src1, IntRegs:$src2, s32_0ImmPred:$src3), - (C2_muxir PredRegs:$src1, IntRegs:$src2, s32_0ImmPred:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_C2_muxii PredRegs:$src1, s32_0ImmPred_timm:$src2, s8_0ImmPred_timm:$src3), + (C2_muxii PredRegs:$src1, s32_0ImmPred_timm:$src2, s8_0ImmPred_timm:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_C2_muxir PredRegs:$src1, IntRegs:$src2, s32_0ImmPred_timm:$src3), + (C2_muxir PredRegs:$src1, IntRegs:$src2, s32_0ImmPred_timm:$src3)>, Requires<[HasV5]>; def: Pat<(int_hexagon_A2_swiz IntRegs:$src1), (A2_swiz IntRegs:$src1)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_S2_asr_i_p_and DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3), - (S2_asr_i_p_and DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_asr_i_p_and DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3), + (S2_asr_i_p_and DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3)>, Requires<[HasV5]>; def: Pat<(int_hexagon_M2_cmpyrsc_s0 IntRegs:$src1, IntRegs:$src2), (M2_cmpyrsc_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; def: Pat<(int_hexagon_M2_cmpyrsc_s1 IntRegs:$src1, IntRegs:$src2), @@ -1295,44 +1295,44 @@ def: Pat<(int_hexagon_M2_mpy_nac_sat_ll_s1 IntRegs:$src1, IntRegs:$src2, IntRegs (M2_mpy_nac_sat_ll_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; def: Pat<(int_hexagon_M2_mpy_nac_sat_ll_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3), (M2_mpy_nac_sat_ll_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_S4_extract IntRegs:$src1, u5_0ImmPred:$src2, u5_0ImmPred:$src3), - (S4_extract IntRegs:$src1, u5_0ImmPred:$src2, u5_0ImmPred:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S4_extract IntRegs:$src1, u5_0ImmPred_timm:$src2, u5_0ImmPred_timm:$src3), + (S4_extract IntRegs:$src1, u5_0ImmPred_timm:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV5]>; def: Pat<(int_hexagon_A2_vcmpweq DoubleRegs:$src1, DoubleRegs:$src2), (A2_vcmpweq DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; def: Pat<(int_hexagon_M2_acci IntRegs:$src1, IntRegs:$src2, IntRegs:$src3), (M2_acci IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_S2_lsr_i_p_acc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3), - (S2_lsr_i_p_acc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_S2_lsr_i_p_or DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3), - (S2_lsr_i_p_or DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_lsr_i_p_acc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3), + (S2_lsr_i_p_acc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_lsr_i_p_or DoubleRegs:$src1, 
DoubleRegs:$src2, u6_0ImmPred_timm:$src3), + (S2_lsr_i_p_or DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3)>, Requires<[HasV5]>; def: Pat<(int_hexagon_F2_conv_ud2sf DoubleRegs:$src1), (F2_conv_ud2sf DoubleRegs:$src1)>, Requires<[HasV5]>; def: Pat<(int_hexagon_A2_tfr IntRegs:$src1), (A2_tfr IntRegs:$src1)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_S2_asr_i_p_or DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3), - (S2_asr_i_p_or DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_A2_subri s32_0ImmPred:$src1, IntRegs:$src2), - (A2_subri s32_0ImmPred:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_asr_i_p_or DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3), + (S2_asr_i_p_or DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_subri s32_0ImmPred_timm:$src1, IntRegs:$src2), + (A2_subri s32_0ImmPred_timm:$src1, IntRegs:$src2)>, Requires<[HasV5]>; def: Pat<(int_hexagon_A4_vrmaxuw DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3), (A4_vrmaxuw DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; def: Pat<(int_hexagon_M5_vmpybuu IntRegs:$src1, IntRegs:$src2), (M5_vmpybuu IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; def: Pat<(int_hexagon_A4_vrmaxuh DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3), (A4_vrmaxuh DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_S2_asl_i_vw DoubleRegs:$src1, u5_0ImmPred:$src2), - (S2_asl_i_vw DoubleRegs:$src1, u5_0ImmPred:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_asl_i_vw DoubleRegs:$src1, u5_0ImmPred_timm:$src2), + (S2_asl_i_vw DoubleRegs:$src1, u5_0ImmPred_timm:$src2)>, Requires<[HasV5]>; def: Pat<(int_hexagon_A2_vavgw DoubleRegs:$src1, DoubleRegs:$src2), (A2_vavgw DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; def: Pat<(int_hexagon_S2_brev IntRegs:$src1), (S2_brev IntRegs:$src1)>, Requires<[HasV5]>; def: Pat<(int_hexagon_A2_vavgh DoubleRegs:$src1, DoubleRegs:$src2), (A2_vavgh DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_S2_clrbit_i IntRegs:$src1, u5_0ImmPred:$src2), - (S2_clrbit_i IntRegs:$src1, u5_0ImmPred:$src2)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_S2_asl_i_vh DoubleRegs:$src1, u4_0ImmPred:$src2), - (S2_asl_i_vh DoubleRegs:$src1, u4_0ImmPred:$src2)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_S2_lsr_i_r_or IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3), - (S2_lsr_i_r_or IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_clrbit_i IntRegs:$src1, u5_0ImmPred_timm:$src2), + (S2_clrbit_i IntRegs:$src1, u5_0ImmPred_timm:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_asl_i_vh DoubleRegs:$src1, u4_0ImmPred_timm:$src2), + (S2_asl_i_vh DoubleRegs:$src1, u4_0ImmPred_timm:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_lsr_i_r_or IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3), + (S2_lsr_i_r_or IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV5]>; def: Pat<(int_hexagon_S2_lsl_r_r_nac IntRegs:$src1, IntRegs:$src2, IntRegs:$src3), (S2_lsl_r_r_nac IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; def: Pat<(int_hexagon_M2_mmpyl_rs1 DoubleRegs:$src1, DoubleRegs:$src2), @@ -1343,8 +1343,8 @@ def: Pat<(int_hexagon_M2_mmpyl_s0 DoubleRegs:$src1, DoubleRegs:$src2), (M2_mmpyl_s0 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; def: Pat<(int_hexagon_M2_mmpyl_s1 DoubleRegs:$src1, DoubleRegs:$src2), 
(M2_mmpyl_s1 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_M2_naccii IntRegs:$src1, IntRegs:$src2, s32_0ImmPred:$src3), - (M2_naccii IntRegs:$src1, IntRegs:$src2, s32_0ImmPred:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_naccii IntRegs:$src1, IntRegs:$src2, s32_0ImmPred_timm:$src3), + (M2_naccii IntRegs:$src1, IntRegs:$src2, s32_0ImmPred_timm:$src3)>, Requires<[HasV5]>; def: Pat<(int_hexagon_S2_vrndpackwhs DoubleRegs:$src1), (S2_vrndpackwhs DoubleRegs:$src1)>, Requires<[HasV5]>; def: Pat<(int_hexagon_S2_vtrunewh DoubleRegs:$src1, DoubleRegs:$src2), @@ -1357,24 +1357,24 @@ def: Pat<(int_hexagon_M2_mpyd_ll_s1 IntRegs:$src1, IntRegs:$src2), (M2_mpyd_ll_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; def: Pat<(int_hexagon_M4_mac_up_s1_sat IntRegs:$src1, IntRegs:$src2, IntRegs:$src3), (M4_mac_up_s1_sat IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_S4_vrcrotate_acc DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3, u2_0ImmPred:$src4), - (S4_vrcrotate_acc DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3, u2_0ImmPred:$src4)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S4_vrcrotate_acc DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3, u2_0ImmPred_timm:$src4), + (S4_vrcrotate_acc DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3, u2_0ImmPred_timm:$src4)>, Requires<[HasV5]>; def: Pat<(int_hexagon_F2_conv_uw2df IntRegs:$src1), (F2_conv_uw2df IntRegs:$src1)>, Requires<[HasV5]>; def: Pat<(int_hexagon_A2_vaddubs DoubleRegs:$src1, DoubleRegs:$src2), (A2_vaddubs DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; def: Pat<(int_hexagon_S2_asr_r_r_acc IntRegs:$src1, IntRegs:$src2, IntRegs:$src3), (S2_asr_r_r_acc IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_A2_orir IntRegs:$src1, s32_0ImmPred:$src2), - (A2_orir IntRegs:$src1, s32_0ImmPred:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A2_orir IntRegs:$src1, s32_0ImmPred_timm:$src2), + (A2_orir IntRegs:$src1, s32_0ImmPred_timm:$src2)>, Requires<[HasV5]>; def: Pat<(int_hexagon_A2_andp DoubleRegs:$src1, DoubleRegs:$src2), (A2_andp DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; def: Pat<(int_hexagon_S2_lfsp DoubleRegs:$src1, DoubleRegs:$src2), (S2_lfsp DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; def: Pat<(int_hexagon_A2_min IntRegs:$src1, IntRegs:$src2), (A2_min IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_M2_mpysmi IntRegs:$src1, m32_0ImmPred:$src2), - (M2_mpysmi IntRegs:$src1, m32_0ImmPred:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_M2_mpysmi IntRegs:$src1, m32_0ImmPred_timm:$src2), + (M2_mpysmi IntRegs:$src1, m32_0ImmPred_timm:$src2)>, Requires<[HasV5]>; def: Pat<(int_hexagon_M2_vcmpy_s0_sat_r DoubleRegs:$src1, DoubleRegs:$src2), (M2_vcmpy_s0_sat_r DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; def: Pat<(int_hexagon_M2_mpyu_acc_ll_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3), @@ -1397,10 +1397,10 @@ def: Pat<(int_hexagon_M2_mpyd_lh_s0 IntRegs:$src1, IntRegs:$src2), (M2_mpyd_lh_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; def: Pat<(int_hexagon_F2_conv_df2w DoubleRegs:$src1), (F2_conv_df2w DoubleRegs:$src1)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_S5_asrhub_sat DoubleRegs:$src1, u4_0ImmPred:$src2), - (S5_asrhub_sat DoubleRegs:$src1, u4_0ImmPred:$src2)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_S2_asl_i_r_xacc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3), - (S2_asl_i_r_xacc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3)>, 
Requires<[HasV5]>; +def: Pat<(int_hexagon_S5_asrhub_sat DoubleRegs:$src1, u4_0ImmPred_timm:$src2), + (S5_asrhub_sat DoubleRegs:$src1, u4_0ImmPred_timm:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_asl_i_r_xacc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3), + (S2_asl_i_r_xacc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV5]>; def: Pat<(int_hexagon_F2_conv_df2d DoubleRegs:$src1), (F2_conv_df2d DoubleRegs:$src1)>, Requires<[HasV5]>; def: Pat<(int_hexagon_M2_mmaculs_s1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3), @@ -1423,8 +1423,8 @@ def: Pat<(int_hexagon_A2_vavghr DoubleRegs:$src1, DoubleRegs:$src2), (A2_vavghr DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; def: Pat<(int_hexagon_F2_sffma_sc IntRegs:$src1, IntRegs:$src2, IntRegs:$src3, PredRegs:$src4), (F2_sffma_sc IntRegs:$src1, IntRegs:$src2, IntRegs:$src3, PredRegs:$src4)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_F2_dfclass DoubleRegs:$src1, u5_0ImmPred:$src2), - (F2_dfclass DoubleRegs:$src1, u5_0ImmPred:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_F2_dfclass DoubleRegs:$src1, u5_0ImmPred_timm:$src2), + (F2_dfclass DoubleRegs:$src1, u5_0ImmPred_timm:$src2)>, Requires<[HasV5]>; def: Pat<(int_hexagon_F2_conv_df2ud DoubleRegs:$src1), (F2_conv_df2ud DoubleRegs:$src1)>, Requires<[HasV5]>; def: Pat<(int_hexagon_F2_conv_df2uw DoubleRegs:$src1), @@ -1433,16 +1433,16 @@ def: Pat<(int_hexagon_M2_cmpyrs_s0 IntRegs:$src1, IntRegs:$src2), (M2_cmpyrs_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; def: Pat<(int_hexagon_M2_cmpyrs_s1 IntRegs:$src1, IntRegs:$src2), (M2_cmpyrs_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_C4_cmpltei IntRegs:$src1, s32_0ImmPred:$src2), - (C4_cmpltei IntRegs:$src1, s32_0ImmPred:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_C4_cmpltei IntRegs:$src1, s32_0ImmPred_timm:$src2), + (C4_cmpltei IntRegs:$src1, s32_0ImmPred_timm:$src2)>, Requires<[HasV5]>; def: Pat<(int_hexagon_C4_cmplteu IntRegs:$src1, IntRegs:$src2), (C4_cmplteu IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; def: Pat<(int_hexagon_A2_vsubb_map DoubleRegs:$src1, DoubleRegs:$src2), (A2_vsubub DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; def: Pat<(int_hexagon_A2_subh_l16_ll IntRegs:$src1, IntRegs:$src2), (A2_subh_l16_ll IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_S2_asr_i_r_rnd IntRegs:$src1, u5_0ImmPred:$src2), - (S2_asr_i_r_rnd IntRegs:$src1, u5_0ImmPred:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_asr_i_r_rnd IntRegs:$src1, u5_0ImmPred_timm:$src2), + (S2_asr_i_r_rnd IntRegs:$src1, u5_0ImmPred_timm:$src2)>, Requires<[HasV5]>; def: Pat<(int_hexagon_M2_vrmpy_s0 DoubleRegs:$src1, DoubleRegs:$src2), (M2_vrmpy_s0 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; def: Pat<(int_hexagon_M2_mpyd_rnd_hh_s1 IntRegs:$src1, IntRegs:$src2), @@ -1471,14 +1471,14 @@ def: Pat<(int_hexagon_M2_mpyud_hl_s0 IntRegs:$src1, IntRegs:$src2), (M2_mpyud_hl_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; def: Pat<(int_hexagon_M2_vrcmpyi_s0c DoubleRegs:$src1, DoubleRegs:$src2), (M2_vrcmpyi_s0c DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_S2_asr_i_p_rnd DoubleRegs:$src1, u6_0ImmPred:$src2), - (S2_asr_i_p_rnd DoubleRegs:$src1, u6_0ImmPred:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_asr_i_p_rnd DoubleRegs:$src1, u6_0ImmPred_timm:$src2), + (S2_asr_i_p_rnd DoubleRegs:$src1, u6_0ImmPred_timm:$src2)>, Requires<[HasV5]>; def: Pat<(int_hexagon_A2_addpsat DoubleRegs:$src1, 
DoubleRegs:$src2), (A2_addpsat DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; def: Pat<(int_hexagon_A2_svaddhs IntRegs:$src1, IntRegs:$src2), (A2_svaddhs IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_S4_ori_lsr_ri u32_0ImmPred:$src1, IntRegs:$src2, u5_0ImmPred:$src3), - (S4_ori_lsr_ri u32_0ImmPred:$src1, IntRegs:$src2, u5_0ImmPred:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S4_ori_lsr_ri u32_0ImmPred_timm:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3), + (S4_ori_lsr_ri u32_0ImmPred_timm:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV5]>; def: Pat<(int_hexagon_M2_mpy_sat_rnd_ll_s1 IntRegs:$src1, IntRegs:$src2), (M2_mpy_sat_rnd_ll_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; def: Pat<(int_hexagon_M2_mpy_sat_rnd_ll_s0 IntRegs:$src1, IntRegs:$src2), @@ -1499,8 +1499,8 @@ def: Pat<(int_hexagon_M2_mpyud_lh_s1 IntRegs:$src1, IntRegs:$src2), (M2_mpyud_lh_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; def: Pat<(int_hexagon_S2_asl_r_r_or IntRegs:$src1, IntRegs:$src2, IntRegs:$src3), (S2_asl_r_r_or IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_S4_lsli s6_0ImmPred:$src1, IntRegs:$src2), - (S4_lsli s6_0ImmPred:$src1, IntRegs:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S4_lsli s6_0ImmPred_timm:$src1, IntRegs:$src2), + (S4_lsli s6_0ImmPred_timm:$src1, IntRegs:$src2)>, Requires<[HasV5]>; def: Pat<(int_hexagon_S2_lsl_r_vw DoubleRegs:$src1, IntRegs:$src2), (S2_lsl_r_vw DoubleRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; def: Pat<(int_hexagon_M2_mpy_hh_s1 IntRegs:$src1, IntRegs:$src2), @@ -1529,8 +1529,8 @@ def: Pat<(int_hexagon_A4_cmpbeq IntRegs:$src1, IntRegs:$src2), (A4_cmpbeq IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; def: Pat<(int_hexagon_A2_negp DoubleRegs:$src1), (A2_negp DoubleRegs:$src1)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_S2_asl_i_r_sat IntRegs:$src1, u5_0ImmPred:$src2), - (S2_asl_i_r_sat IntRegs:$src1, u5_0ImmPred:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_asl_i_r_sat IntRegs:$src1, u5_0ImmPred_timm:$src2), + (S2_asl_i_r_sat IntRegs:$src1, u5_0ImmPred_timm:$src2)>, Requires<[HasV5]>; def: Pat<(int_hexagon_A2_addh_l16_sat_hl IntRegs:$src1, IntRegs:$src2), (A2_addh_l16_sat_hl IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; def: Pat<(int_hexagon_S2_vsatwuh DoubleRegs:$src1), @@ -1541,10 +1541,10 @@ def: Pat<(int_hexagon_S2_svsathb IntRegs:$src1), (S2_svsathb IntRegs:$src1)>, Requires<[HasV5]>; def: Pat<(int_hexagon_C2_cmpgtup DoubleRegs:$src1, DoubleRegs:$src2), (C2_cmpgtup DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_A4_cround_ri IntRegs:$src1, u5_0ImmPred:$src2), - (A4_cround_ri IntRegs:$src1, u5_0ImmPred:$src2)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_S4_clbpaddi DoubleRegs:$src1, s6_0ImmPred:$src2), - (S4_clbpaddi DoubleRegs:$src1, s6_0ImmPred:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A4_cround_ri IntRegs:$src1, u5_0ImmPred_timm:$src2), + (A4_cround_ri IntRegs:$src1, u5_0ImmPred_timm:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S4_clbpaddi DoubleRegs:$src1, s6_0ImmPred_timm:$src2), + (S4_clbpaddi DoubleRegs:$src1, s6_0ImmPred_timm:$src2)>, Requires<[HasV5]>; def: Pat<(int_hexagon_A4_cround_rr IntRegs:$src1, IntRegs:$src2), (A4_cround_rr IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; def: Pat<(int_hexagon_C2_mux PredRegs:$src1, IntRegs:$src2, IntRegs:$src3), @@ -1563,12 +1563,12 @@ def: Pat<(int_hexagon_A2_vminuh DoubleRegs:$src1, DoubleRegs:$src2), (A2_vminuh DoubleRegs:$src1, 
DoubleRegs:$src2)>, Requires<[HasV5]>; def: Pat<(int_hexagon_A2_vminub DoubleRegs:$src1, DoubleRegs:$src2), (A2_vminub DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_S2_extractu IntRegs:$src1, u5_0ImmPred:$src2, u5_0ImmPred:$src3), - (S2_extractu IntRegs:$src1, u5_0ImmPred:$src2, u5_0ImmPred:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_extractu IntRegs:$src1, u5_0ImmPred_timm:$src2, u5_0ImmPred_timm:$src3), + (S2_extractu IntRegs:$src1, u5_0ImmPred_timm:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV5]>; def: Pat<(int_hexagon_A2_svsubh IntRegs:$src1, IntRegs:$src2), (A2_svsubh IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_S4_clbaddi IntRegs:$src1, s6_0ImmPred:$src2), - (S4_clbaddi IntRegs:$src1, s6_0ImmPred:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S4_clbaddi IntRegs:$src1, s6_0ImmPred_timm:$src2), + (S4_clbaddi IntRegs:$src1, s6_0ImmPred_timm:$src2)>, Requires<[HasV5]>; def: Pat<(int_hexagon_F2_sffms IntRegs:$src1, IntRegs:$src2, IntRegs:$src3), (F2_sffms IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; def: Pat<(int_hexagon_S2_vsxtbh IntRegs:$src1), @@ -1589,16 +1589,16 @@ def: Pat<(int_hexagon_M2_mpy_acc_hh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$sr (M2_mpy_acc_hh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; def: Pat<(int_hexagon_M2_mpy_acc_hh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3), (M2_mpy_acc_hh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_S4_addi_asl_ri u32_0ImmPred:$src1, IntRegs:$src2, u5_0ImmPred:$src3), - (S4_addi_asl_ri u32_0ImmPred:$src1, IntRegs:$src2, u5_0ImmPred:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S4_addi_asl_ri u32_0ImmPred_timm:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3), + (S4_addi_asl_ri u32_0ImmPred_timm:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV5]>; def: Pat<(int_hexagon_M2_mpyd_nac_hh_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3), (M2_mpyd_nac_hh_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; def: Pat<(int_hexagon_M2_mpyd_nac_hh_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3), (M2_mpyd_nac_hh_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_S2_asr_i_r_nac IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3), - (S2_asr_i_r_nac IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_A4_cmpheqi IntRegs:$src1, s32_0ImmPred:$src2), - (A4_cmpheqi IntRegs:$src1, s32_0ImmPred:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_asr_i_r_nac IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3), + (S2_asr_i_r_nac IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A4_cmpheqi IntRegs:$src1, s32_0ImmPred_timm:$src2), + (A4_cmpheqi IntRegs:$src1, s32_0ImmPred_timm:$src2)>, Requires<[HasV5]>; def: Pat<(int_hexagon_S2_lsr_r_p_xor DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3), (S2_lsr_r_p_xor DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; def: Pat<(int_hexagon_M2_mpy_acc_hl_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3), @@ -1623,8 +1623,8 @@ def: Pat<(int_hexagon_M2_mpyud_nac_lh_s1 DoubleRegs:$src1, IntRegs:$src2, IntReg (M2_mpyud_nac_lh_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; def: Pat<(int_hexagon_M2_mpyud_nac_lh_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3), (M2_mpyud_nac_lh_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; 
-def: Pat<(int_hexagon_A4_round_ri_sat IntRegs:$src1, u5_0ImmPred:$src2), - (A4_round_ri_sat IntRegs:$src1, u5_0ImmPred:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A4_round_ri_sat IntRegs:$src1, u5_0ImmPred_timm:$src2), + (A4_round_ri_sat IntRegs:$src1, u5_0ImmPred_timm:$src2)>, Requires<[HasV5]>; def: Pat<(int_hexagon_M2_mpy_nac_hl_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3), (M2_mpy_nac_hl_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; def: Pat<(int_hexagon_M2_mpy_nac_hl_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3), @@ -1637,10 +1637,10 @@ def: Pat<(int_hexagon_M2_mmacls_rs1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRe (M2_mmacls_rs1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>; def: Pat<(int_hexagon_M2_cmaci_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3), (M2_cmaci_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_S2_setbit_i IntRegs:$src1, u5_0ImmPred:$src2), - (S2_setbit_i IntRegs:$src1, u5_0ImmPred:$src2)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_S2_asl_i_p_or DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3), - (S2_asl_i_p_or DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_setbit_i IntRegs:$src1, u5_0ImmPred_timm:$src2), + (S2_setbit_i IntRegs:$src1, u5_0ImmPred_timm:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_asl_i_p_or DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3), + (S2_asl_i_p_or DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3)>, Requires<[HasV5]>; def: Pat<(int_hexagon_A4_andn IntRegs:$src1, IntRegs:$src2), (A4_andn IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; def: Pat<(int_hexagon_M5_vrmpybsu DoubleRegs:$src1, DoubleRegs:$src2), @@ -1655,8 +1655,8 @@ def: Pat<(int_hexagon_C2_bitsclr IntRegs:$src1, IntRegs:$src2), (C2_bitsclr IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; def: Pat<(int_hexagon_M2_xor_xacc IntRegs:$src1, IntRegs:$src2, IntRegs:$src3), (M2_xor_xacc IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_A4_vcmpbgtui DoubleRegs:$src1, u7_0ImmPred:$src2), - (A4_vcmpbgtui DoubleRegs:$src1, u7_0ImmPred:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_A4_vcmpbgtui DoubleRegs:$src1, u7_0ImmPred_timm:$src2), + (A4_vcmpbgtui DoubleRegs:$src1, u7_0ImmPred_timm:$src2)>, Requires<[HasV5]>; def: Pat<(int_hexagon_A4_ornp DoubleRegs:$src1, DoubleRegs:$src2), (A4_ornp DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>; def: Pat<(int_hexagon_C4_and_or PredRegs:$src1, PredRegs:$src2, PredRegs:$src3), @@ -1673,14 +1673,14 @@ def: Pat<(int_hexagon_M2_vmpy2su_s1 IntRegs:$src1, IntRegs:$src2), (M2_vmpy2su_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; def: Pat<(int_hexagon_M2_vmpy2su_s0 IntRegs:$src1, IntRegs:$src2), (M2_vmpy2su_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_S2_asr_i_p_acc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3), - (S2_asr_i_p_acc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_C4_nbitsclri IntRegs:$src1, u6_0ImmPred:$src2), - (C4_nbitsclri IntRegs:$src1, u6_0ImmPred:$src2)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_S2_lsr_i_vh DoubleRegs:$src1, u4_0ImmPred:$src2), - (S2_lsr_i_vh DoubleRegs:$src1, u4_0ImmPred:$src2)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_S2_lsr_i_p_xacc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3), - (S2_lsr_i_p_xacc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3)>, 
Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_asr_i_p_acc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3), + (S2_asr_i_p_acc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_C4_nbitsclri IntRegs:$src1, u6_0ImmPred_timm:$src2), + (C4_nbitsclri IntRegs:$src1, u6_0ImmPred_timm:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_lsr_i_vh DoubleRegs:$src1, u4_0ImmPred_timm:$src2), + (S2_lsr_i_vh DoubleRegs:$src1, u4_0ImmPred_timm:$src2)>, Requires<[HasV5]>; +def: Pat<(int_hexagon_S2_lsr_i_p_xacc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3), + (S2_lsr_i_p_xacc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3)>, Requires<[HasV5]>; // V55 Scalar Instructions. @@ -1689,30 +1689,30 @@ def: Pat<(int_hexagon_A5_ACS DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src // V60 Scalar Instructions. -def: Pat<(int_hexagon_S6_rol_i_p_and DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3), - (S6_rol_i_p_and DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3)>, Requires<[HasV60]>; -def: Pat<(int_hexagon_S6_rol_i_r_xacc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3), - (S6_rol_i_r_xacc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3)>, Requires<[HasV60]>; -def: Pat<(int_hexagon_S6_rol_i_r_and IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3), - (S6_rol_i_r_and IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3)>, Requires<[HasV60]>; -def: Pat<(int_hexagon_S6_rol_i_r_acc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3), - (S6_rol_i_r_acc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3)>, Requires<[HasV60]>; -def: Pat<(int_hexagon_S6_rol_i_p_xacc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3), - (S6_rol_i_p_xacc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3)>, Requires<[HasV60]>; -def: Pat<(int_hexagon_S6_rol_i_p DoubleRegs:$src1, u6_0ImmPred:$src2), - (S6_rol_i_p DoubleRegs:$src1, u6_0ImmPred:$src2)>, Requires<[HasV60]>; -def: Pat<(int_hexagon_S6_rol_i_p_nac DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3), - (S6_rol_i_p_nac DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3)>, Requires<[HasV60]>; -def: Pat<(int_hexagon_S6_rol_i_p_acc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3), - (S6_rol_i_p_acc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3)>, Requires<[HasV60]>; -def: Pat<(int_hexagon_S6_rol_i_r_or IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3), - (S6_rol_i_r_or IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3)>, Requires<[HasV60]>; -def: Pat<(int_hexagon_S6_rol_i_r IntRegs:$src1, u5_0ImmPred:$src2), - (S6_rol_i_r IntRegs:$src1, u5_0ImmPred:$src2)>, Requires<[HasV60]>; -def: Pat<(int_hexagon_S6_rol_i_r_nac IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3), - (S6_rol_i_r_nac IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3)>, Requires<[HasV60]>; -def: Pat<(int_hexagon_S6_rol_i_p_or DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3), - (S6_rol_i_p_or DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3)>, Requires<[HasV60]>; +def: Pat<(int_hexagon_S6_rol_i_p_and DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3), + (S6_rol_i_p_and DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3)>, Requires<[HasV60]>; +def: Pat<(int_hexagon_S6_rol_i_r_xacc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3), + (S6_rol_i_r_xacc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV60]>; +def: Pat<(int_hexagon_S6_rol_i_r_and IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3), + (S6_rol_i_r_and IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, 
Requires<[HasV60]>; +def: Pat<(int_hexagon_S6_rol_i_r_acc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3), + (S6_rol_i_r_acc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV60]>; +def: Pat<(int_hexagon_S6_rol_i_p_xacc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3), + (S6_rol_i_p_xacc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3)>, Requires<[HasV60]>; +def: Pat<(int_hexagon_S6_rol_i_p DoubleRegs:$src1, u6_0ImmPred_timm:$src2), + (S6_rol_i_p DoubleRegs:$src1, u6_0ImmPred_timm:$src2)>, Requires<[HasV60]>; +def: Pat<(int_hexagon_S6_rol_i_p_nac DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3), + (S6_rol_i_p_nac DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3)>, Requires<[HasV60]>; +def: Pat<(int_hexagon_S6_rol_i_p_acc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3), + (S6_rol_i_p_acc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3)>, Requires<[HasV60]>; +def: Pat<(int_hexagon_S6_rol_i_r_or IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3), + (S6_rol_i_r_or IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV60]>; +def: Pat<(int_hexagon_S6_rol_i_r IntRegs:$src1, u5_0ImmPred_timm:$src2), + (S6_rol_i_r IntRegs:$src1, u5_0ImmPred_timm:$src2)>, Requires<[HasV60]>; +def: Pat<(int_hexagon_S6_rol_i_r_nac IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3), + (S6_rol_i_r_nac IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV60]>; +def: Pat<(int_hexagon_S6_rol_i_p_or DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3), + (S6_rol_i_p_or DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3)>, Requires<[HasV60]>; // V62 Scalar Instructions. @@ -1744,8 +1744,8 @@ def: Pat<(int_hexagon_F2_dfadd DoubleRegs:$src1, DoubleRegs:$src2), (F2_dfadd DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV66]>; def: Pat<(int_hexagon_M2_mnaci IntRegs:$src1, IntRegs:$src2, IntRegs:$src3), (M2_mnaci IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV66]>; -def: Pat<(int_hexagon_S2_mask u5_0ImmPred:$src1, u5_0ImmPred:$src2), - (S2_mask u5_0ImmPred:$src1, u5_0ImmPred:$src2)>, Requires<[HasV66]>; +def: Pat<(int_hexagon_S2_mask u5_0ImmPred_timm:$src1, u5_0ImmPred_timm:$src2), + (S2_mask u5_0ImmPred_timm:$src1, u5_0ImmPred_timm:$src2)>, Requires<[HasV66]>; // V60 HVX Instructions. 
@@ -1773,10 +1773,10 @@ def: Pat<(int_hexagon_V6_vaddh_dv HvxWR:$src1, HvxWR:$src2), (V6_vaddh_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX64B]>; def: Pat<(int_hexagon_V6_vaddh_dv_128B HvxWR:$src1, HvxWR:$src2), (V6_vaddh_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX128B]>; -def: Pat<(int_hexagon_V6_vrmpybusi HvxWR:$src1, IntRegs:$src2, u1_0ImmPred:$src3), - (V6_vrmpybusi HvxWR:$src1, IntRegs:$src2, u1_0ImmPred:$src3)>, Requires<[HasV60, UseHVX64B]>; -def: Pat<(int_hexagon_V6_vrmpybusi_128B HvxWR:$src1, IntRegs:$src2, u1_0ImmPred:$src3), - (V6_vrmpybusi HvxWR:$src1, IntRegs:$src2, u1_0ImmPred:$src3)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vrmpybusi HvxWR:$src1, IntRegs:$src2, u1_0ImmPred_timm:$src3), + (V6_vrmpybusi HvxWR:$src1, IntRegs:$src2, u1_0ImmPred_timm:$src3)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vrmpybusi_128B HvxWR:$src1, IntRegs:$src2, u1_0ImmPred_timm:$src3), + (V6_vrmpybusi HvxWR:$src1, IntRegs:$src2, u1_0ImmPred_timm:$src3)>, Requires<[HasV60, UseHVX128B]>; def: Pat<(int_hexagon_V6_vshufoh HvxVR:$src1, HvxVR:$src2), (V6_vshufoh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>; def: Pat<(int_hexagon_V6_vshufoh_128B HvxVR:$src1, HvxVR:$src2), @@ -1789,10 +1789,10 @@ def: Pat<(int_hexagon_V6_vdmpyhsuisat HvxWR:$src1, IntRegs:$src2), (V6_vdmpyhsuisat HvxWR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>; def: Pat<(int_hexagon_V6_vdmpyhsuisat_128B HvxWR:$src1, IntRegs:$src2), (V6_vdmpyhsuisat HvxWR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>; -def: Pat<(int_hexagon_V6_vrsadubi_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3, u1_0ImmPred:$src4), - (V6_vrsadubi_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3, u1_0ImmPred:$src4)>, Requires<[HasV60, UseHVX64B]>; -def: Pat<(int_hexagon_V6_vrsadubi_acc_128B HvxWR:$src1, HvxWR:$src2, IntRegs:$src3, u1_0ImmPred:$src4), - (V6_vrsadubi_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3, u1_0ImmPred:$src4)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vrsadubi_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3, u1_0ImmPred_timm:$src4), + (V6_vrsadubi_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3, u1_0ImmPred_timm:$src4)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vrsadubi_acc_128B HvxWR:$src1, HvxWR:$src2, IntRegs:$src3, u1_0ImmPred_timm:$src4), + (V6_vrsadubi_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3, u1_0ImmPred_timm:$src4)>, Requires<[HasV60, UseHVX128B]>; def: Pat<(int_hexagon_V6_vnavgw HvxVR:$src1, HvxVR:$src2), (V6_vnavgw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>; def: Pat<(int_hexagon_V6_vnavgw_128B HvxVR:$src1, HvxVR:$src2), @@ -2369,10 +2369,10 @@ def: Pat<(int_hexagon_V6_vsubhsat HvxVR:$src1, HvxVR:$src2), (V6_vsubhsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>; def: Pat<(int_hexagon_V6_vsubhsat_128B HvxVR:$src1, HvxVR:$src2), (V6_vsubhsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>; -def: Pat<(int_hexagon_V6_vrmpyubi_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3, u1_0ImmPred:$src4), - (V6_vrmpyubi_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3, u1_0ImmPred:$src4)>, Requires<[HasV60, UseHVX64B]>; -def: Pat<(int_hexagon_V6_vrmpyubi_acc_128B HvxWR:$src1, HvxWR:$src2, IntRegs:$src3, u1_0ImmPred:$src4), - (V6_vrmpyubi_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3, u1_0ImmPred:$src4)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vrmpyubi_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3, u1_0ImmPred_timm:$src4), + (V6_vrmpyubi_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3, u1_0ImmPred_timm:$src4)>, 
Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vrmpyubi_acc_128B HvxWR:$src1, HvxWR:$src2, IntRegs:$src3, u1_0ImmPred_timm:$src4), + (V6_vrmpyubi_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3, u1_0ImmPred_timm:$src4)>, Requires<[HasV60, UseHVX128B]>; def: Pat<(int_hexagon_V6_vabsw HvxVR:$src1), (V6_vabsw HvxVR:$src1)>, Requires<[HasV60, UseHVX64B]>; def: Pat<(int_hexagon_V6_vabsw_128B HvxVR:$src1), @@ -2489,10 +2489,10 @@ def: Pat<(int_hexagon_V6_vmpybv_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3), (V6_vmpybv_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>; def: Pat<(int_hexagon_V6_vmpybv_acc_128B HvxWR:$src1, HvxVR:$src2, HvxVR:$src3), (V6_vmpybv_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>; -def: Pat<(int_hexagon_V6_vrsadubi HvxWR:$src1, IntRegs:$src2, u1_0ImmPred:$src3), - (V6_vrsadubi HvxWR:$src1, IntRegs:$src2, u1_0ImmPred:$src3)>, Requires<[HasV60, UseHVX64B]>; -def: Pat<(int_hexagon_V6_vrsadubi_128B HvxWR:$src1, IntRegs:$src2, u1_0ImmPred:$src3), - (V6_vrsadubi HvxWR:$src1, IntRegs:$src2, u1_0ImmPred:$src3)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vrsadubi HvxWR:$src1, IntRegs:$src2, u1_0ImmPred_timm:$src3), + (V6_vrsadubi HvxWR:$src1, IntRegs:$src2, u1_0ImmPred_timm:$src3)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vrsadubi_128B HvxWR:$src1, IntRegs:$src2, u1_0ImmPred_timm:$src3), + (V6_vrsadubi HvxWR:$src1, IntRegs:$src2, u1_0ImmPred_timm:$src3)>, Requires<[HasV60, UseHVX128B]>; def: Pat<(int_hexagon_V6_vdmpyhb_dv_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3), (V6_vdmpyhb_dv_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX64B]>; def: Pat<(int_hexagon_V6_vdmpyhb_dv_acc_128B HvxWR:$src1, HvxWR:$src2, IntRegs:$src3), @@ -2677,10 +2677,10 @@ def: Pat<(int_hexagon_V6_vaddbnq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3), (V6_vaddbnq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>; def: Pat<(int_hexagon_V6_vaddbnq_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3), (V6_vaddbnq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>; -def: Pat<(int_hexagon_V6_vlalignbi HvxVR:$src1, HvxVR:$src2, u3_0ImmPred:$src3), - (V6_vlalignbi HvxVR:$src1, HvxVR:$src2, u3_0ImmPred:$src3)>, Requires<[HasV60, UseHVX64B]>; -def: Pat<(int_hexagon_V6_vlalignbi_128B HvxVR:$src1, HvxVR:$src2, u3_0ImmPred:$src3), - (V6_vlalignbi HvxVR:$src1, HvxVR:$src2, u3_0ImmPred:$src3)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vlalignbi HvxVR:$src1, HvxVR:$src2, u3_0ImmPred_timm:$src3), + (V6_vlalignbi HvxVR:$src1, HvxVR:$src2, u3_0ImmPred_timm:$src3)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vlalignbi_128B HvxVR:$src1, HvxVR:$src2, u3_0ImmPred_timm:$src3), + (V6_vlalignbi HvxVR:$src1, HvxVR:$src2, u3_0ImmPred_timm:$src3)>, Requires<[HasV60, UseHVX128B]>; def: Pat<(int_hexagon_V6_vsatwh HvxVR:$src1, HvxVR:$src2), (V6_vsatwh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>; def: Pat<(int_hexagon_V6_vsatwh_128B HvxVR:$src1, HvxVR:$src2), @@ -2721,10 +2721,10 @@ def: Pat<(int_hexagon_V6_veqh_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3), (V6_veqh_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>; def: Pat<(int_hexagon_V6_veqh_and_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3), (V6_veqh_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>; -def: Pat<(int_hexagon_V6_valignbi HvxVR:$src1, HvxVR:$src2, u3_0ImmPred:$src3), - (V6_valignbi HvxVR:$src1, HvxVR:$src2, u3_0ImmPred:$src3)>, Requires<[HasV60, 
UseHVX64B]>; -def: Pat<(int_hexagon_V6_valignbi_128B HvxVR:$src1, HvxVR:$src2, u3_0ImmPred:$src3), - (V6_valignbi HvxVR:$src1, HvxVR:$src2, u3_0ImmPred:$src3)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_valignbi HvxVR:$src1, HvxVR:$src2, u3_0ImmPred_timm:$src3), + (V6_valignbi HvxVR:$src1, HvxVR:$src2, u3_0ImmPred_timm:$src3)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_valignbi_128B HvxVR:$src1, HvxVR:$src2, u3_0ImmPred_timm:$src3), + (V6_valignbi HvxVR:$src1, HvxVR:$src2, u3_0ImmPred_timm:$src3)>, Requires<[HasV60, UseHVX128B]>; def: Pat<(int_hexagon_V6_vaddwsat HvxVR:$src1, HvxVR:$src2), (V6_vaddwsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>; def: Pat<(int_hexagon_V6_vaddwsat_128B HvxVR:$src1, HvxVR:$src2), @@ -2885,10 +2885,10 @@ def: Pat<(int_hexagon_V6_vsubh HvxVR:$src1, HvxVR:$src2), (V6_vsubh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>; def: Pat<(int_hexagon_V6_vsubh_128B HvxVR:$src1, HvxVR:$src2), (V6_vsubh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>; -def: Pat<(int_hexagon_V6_vrmpyubi HvxWR:$src1, IntRegs:$src2, u1_0ImmPred:$src3), - (V6_vrmpyubi HvxWR:$src1, IntRegs:$src2, u1_0ImmPred:$src3)>, Requires<[HasV60, UseHVX64B]>; -def: Pat<(int_hexagon_V6_vrmpyubi_128B HvxWR:$src1, IntRegs:$src2, u1_0ImmPred:$src3), - (V6_vrmpyubi HvxWR:$src1, IntRegs:$src2, u1_0ImmPred:$src3)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vrmpyubi HvxWR:$src1, IntRegs:$src2, u1_0ImmPred_timm:$src3), + (V6_vrmpyubi HvxWR:$src1, IntRegs:$src2, u1_0ImmPred_timm:$src3)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vrmpyubi_128B HvxWR:$src1, IntRegs:$src2, u1_0ImmPred_timm:$src3), + (V6_vrmpyubi HvxWR:$src1, IntRegs:$src2, u1_0ImmPred_timm:$src3)>, Requires<[HasV60, UseHVX128B]>; def: Pat<(int_hexagon_V6_vminw HvxVR:$src1, HvxVR:$src2), (V6_vminw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>; def: Pat<(int_hexagon_V6_vminw_128B HvxVR:$src1, HvxVR:$src2), @@ -2929,10 +2929,10 @@ def: Pat<(int_hexagon_V6_vsubuhw HvxVR:$src1, HvxVR:$src2), (V6_vsubuhw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>; def: Pat<(int_hexagon_V6_vsubuhw_128B HvxVR:$src1, HvxVR:$src2), (V6_vsubuhw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>; -def: Pat<(int_hexagon_V6_vrmpybusi_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3, u1_0ImmPred:$src4), - (V6_vrmpybusi_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3, u1_0ImmPred:$src4)>, Requires<[HasV60, UseHVX64B]>; -def: Pat<(int_hexagon_V6_vrmpybusi_acc_128B HvxWR:$src1, HvxWR:$src2, IntRegs:$src3, u1_0ImmPred:$src4), - (V6_vrmpybusi_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3, u1_0ImmPred:$src4)>, Requires<[HasV60, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vrmpybusi_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3, u1_0ImmPred_timm:$src4), + (V6_vrmpybusi_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3, u1_0ImmPred_timm:$src4)>, Requires<[HasV60, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vrmpybusi_acc_128B HvxWR:$src1, HvxWR:$src2, IntRegs:$src3, u1_0ImmPred_timm:$src4), + (V6_vrmpybusi_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3, u1_0ImmPred_timm:$src4)>, Requires<[HasV60, UseHVX128B]>; def: Pat<(int_hexagon_V6_vasrw HvxVR:$src1, IntRegs:$src2), (V6_vasrw HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>; def: Pat<(int_hexagon_V6_vasrw_128B HvxVR:$src1, IntRegs:$src2), @@ -3016,10 +3016,10 @@ def: Pat<(int_hexagon_V6_vlsrb HvxVR:$src1, IntRegs:$src2), (V6_vlsrb HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV62, UseHVX64B]>; def: Pat<(int_hexagon_V6_vlsrb_128B 
HvxVR:$src1, IntRegs:$src2), (V6_vlsrb HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV62, UseHVX128B]>; -def: Pat<(int_hexagon_V6_vlutvwhi HvxVR:$src1, HvxVR:$src2, u3_0ImmPred:$src3), - (V6_vlutvwhi HvxVR:$src1, HvxVR:$src2, u3_0ImmPred:$src3)>, Requires<[HasV62, UseHVX64B]>; -def: Pat<(int_hexagon_V6_vlutvwhi_128B HvxVR:$src1, HvxVR:$src2, u3_0ImmPred:$src3), - (V6_vlutvwhi HvxVR:$src1, HvxVR:$src2, u3_0ImmPred:$src3)>, Requires<[HasV62, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vlutvwhi HvxVR:$src1, HvxVR:$src2, u3_0ImmPred_timm:$src3), + (V6_vlutvwhi HvxVR:$src1, HvxVR:$src2, u3_0ImmPred_timm:$src3)>, Requires<[HasV62, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vlutvwhi_128B HvxVR:$src1, HvxVR:$src2, u3_0ImmPred_timm:$src3), + (V6_vlutvwhi HvxVR:$src1, HvxVR:$src2, u3_0ImmPred_timm:$src3)>, Requires<[HasV62, UseHVX128B]>; def: Pat<(int_hexagon_V6_vaddububb_sat HvxVR:$src1, HvxVR:$src2), (V6_vaddububb_sat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX64B]>; def: Pat<(int_hexagon_V6_vaddububb_sat_128B HvxVR:$src1, HvxVR:$src2), @@ -3032,10 +3032,10 @@ def: Pat<(int_hexagon_V6_ldtp0 PredRegs:$src1, IntRegs:$src2), (V6_ldtp0 PredRegs:$src1, IntRegs:$src2)>, Requires<[HasV62, UseHVX64B]>; def: Pat<(int_hexagon_V6_ldtp0_128B PredRegs:$src1, IntRegs:$src2), (V6_ldtp0 PredRegs:$src1, IntRegs:$src2)>, Requires<[HasV62, UseHVX128B]>; -def: Pat<(int_hexagon_V6_vlutvvb_oracci HvxVR:$src1, HvxVR:$src2, HvxVR:$src3, u3_0ImmPred:$src4), - (V6_vlutvvb_oracci HvxVR:$src1, HvxVR:$src2, HvxVR:$src3, u3_0ImmPred:$src4)>, Requires<[HasV62, UseHVX64B]>; -def: Pat<(int_hexagon_V6_vlutvvb_oracci_128B HvxVR:$src1, HvxVR:$src2, HvxVR:$src3, u3_0ImmPred:$src4), - (V6_vlutvvb_oracci HvxVR:$src1, HvxVR:$src2, HvxVR:$src3, u3_0ImmPred:$src4)>, Requires<[HasV62, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vlutvvb_oracci HvxVR:$src1, HvxVR:$src2, HvxVR:$src3, u3_0ImmPred_timm:$src4), + (V6_vlutvvb_oracci HvxVR:$src1, HvxVR:$src2, HvxVR:$src3, u3_0ImmPred_timm:$src4)>, Requires<[HasV62, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vlutvvb_oracci_128B HvxVR:$src1, HvxVR:$src2, HvxVR:$src3, u3_0ImmPred_timm:$src4), + (V6_vlutvvb_oracci HvxVR:$src1, HvxVR:$src2, HvxVR:$src3, u3_0ImmPred_timm:$src4)>, Requires<[HasV62, UseHVX128B]>; def: Pat<(int_hexagon_V6_vsubuwsat_dv HvxWR:$src1, HvxWR:$src2), (V6_vsubuwsat_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV62, UseHVX64B]>; def: Pat<(int_hexagon_V6_vsubuwsat_dv_128B HvxWR:$src1, HvxWR:$src2), @@ -3124,10 +3124,10 @@ def: Pat<(int_hexagon_V6_vasrwuhrndsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$sr (V6_vasrwuhrndsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV62, UseHVX64B]>; def: Pat<(int_hexagon_V6_vasrwuhrndsat_128B HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3), (V6_vasrwuhrndsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV62, UseHVX128B]>; -def: Pat<(int_hexagon_V6_vlutvvbi HvxVR:$src1, HvxVR:$src2, u3_0ImmPred:$src3), - (V6_vlutvvbi HvxVR:$src1, HvxVR:$src2, u3_0ImmPred:$src3)>, Requires<[HasV62, UseHVX64B]>; -def: Pat<(int_hexagon_V6_vlutvvbi_128B HvxVR:$src1, HvxVR:$src2, u3_0ImmPred:$src3), - (V6_vlutvvbi HvxVR:$src1, HvxVR:$src2, u3_0ImmPred:$src3)>, Requires<[HasV62, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vlutvvbi HvxVR:$src1, HvxVR:$src2, u3_0ImmPred_timm:$src3), + (V6_vlutvvbi HvxVR:$src1, HvxVR:$src2, u3_0ImmPred_timm:$src3)>, Requires<[HasV62, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vlutvvbi_128B HvxVR:$src1, HvxVR:$src2, u3_0ImmPred_timm:$src3), + (V6_vlutvvbi HvxVR:$src1, HvxVR:$src2, u3_0ImmPred_timm:$src3)>, Requires<[HasV62, UseHVX128B]>; 
def: Pat<(int_hexagon_V6_vsubuwsat HvxVR:$src1, HvxVR:$src2), (V6_vsubuwsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX64B]>; def: Pat<(int_hexagon_V6_vsubuwsat_128B HvxVR:$src1, HvxVR:$src2), @@ -3188,10 +3188,10 @@ def: Pat<(int_hexagon_V6_ldcnp0 PredRegs:$src1, IntRegs:$src2), (V6_ldcnp0 PredRegs:$src1, IntRegs:$src2)>, Requires<[HasV62, UseHVX64B]>; def: Pat<(int_hexagon_V6_ldcnp0_128B PredRegs:$src1, IntRegs:$src2), (V6_ldcnp0 PredRegs:$src1, IntRegs:$src2)>, Requires<[HasV62, UseHVX128B]>; -def: Pat<(int_hexagon_V6_vlutvwh_oracci HvxWR:$src1, HvxVR:$src2, HvxVR:$src3, u3_0ImmPred:$src4), - (V6_vlutvwh_oracci HvxWR:$src1, HvxVR:$src2, HvxVR:$src3, u3_0ImmPred:$src4)>, Requires<[HasV62, UseHVX64B]>; -def: Pat<(int_hexagon_V6_vlutvwh_oracci_128B HvxWR:$src1, HvxVR:$src2, HvxVR:$src3, u3_0ImmPred:$src4), - (V6_vlutvwh_oracci HvxWR:$src1, HvxVR:$src2, HvxVR:$src3, u3_0ImmPred:$src4)>, Requires<[HasV62, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vlutvwh_oracci HvxWR:$src1, HvxVR:$src2, HvxVR:$src3, u3_0ImmPred_timm:$src4), + (V6_vlutvwh_oracci HvxWR:$src1, HvxVR:$src2, HvxVR:$src3, u3_0ImmPred_timm:$src4)>, Requires<[HasV62, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vlutvwh_oracci_128B HvxWR:$src1, HvxVR:$src2, HvxVR:$src3, u3_0ImmPred_timm:$src4), + (V6_vlutvwh_oracci HvxWR:$src1, HvxVR:$src2, HvxVR:$src3, u3_0ImmPred_timm:$src4)>, Requires<[HasV62, UseHVX128B]>; def: Pat<(int_hexagon_V6_vsubbsat HvxVR:$src1, HvxVR:$src2), (V6_vsubbsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX64B]>; def: Pat<(int_hexagon_V6_vsubbsat_128B HvxVR:$src1, HvxVR:$src2), diff --git a/lib/Target/Hexagon/HexagonDepOperands.td b/lib/Target/Hexagon/HexagonDepOperands.td index fdba7b971258..8a94d96522cc 100644 --- a/lib/Target/Hexagon/HexagonDepOperands.td +++ b/lib/Target/Hexagon/HexagonDepOperands.td @@ -8,120 +8,125 @@ // Automatically generated file, please consult code owner before editing. 
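// The ImmOpPred multiclass introduced below expands each immediate predicate
// into two PatLeaf variants: the unsuffixed form matches ordinary immediates
// (imm), while the _timm form matches target-constant (timm) nodes, which is
// what the intrinsic selection patterns above now use for their immediate
// operands.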
 //===----------------------------------------------------------------------===//
+multiclass ImmOpPred<code pred, ValueType vt = i32> {
+  def "" : PatLeaf<(vt imm), pred>;
+  def _timm : PatLeaf<(vt timm), pred>;
+}
+
 def s4_0ImmOperand : AsmOperandClass { let Name = "s4_0Imm"; let RenderMethod = "addSignedImmOperands"; }
 def s4_0Imm : Operand<i32> { let ParserMatchClass = s4_0ImmOperand; let DecoderMethod = "s4_0ImmDecoder"; }
-def s4_0ImmPred : PatLeaf<(i32 imm), [{ return isShiftedInt<4, 0>(N->getSExtValue());}]>;
+defm s4_0ImmPred : ImmOpPred<[{ return isShiftedInt<4, 0>(N->getSExtValue());}]>;
 def s29_3ImmOperand : AsmOperandClass { let Name = "s29_3Imm"; let RenderMethod = "addSignedImmOperands"; }
 def s29_3Imm : Operand<i32> { let ParserMatchClass = s29_3ImmOperand; let DecoderMethod = "s29_3ImmDecoder"; }
-def s29_3ImmPred : PatLeaf<(i32 imm), [{ return isShiftedInt<32, 3>(N->getSExtValue());}]>;
+defm s29_3ImmPred : ImmOpPred<[{ return isShiftedInt<32, 3>(N->getSExtValue());}]>;
 def u6_0ImmOperand : AsmOperandClass { let Name = "u6_0Imm"; let RenderMethod = "addImmOperands"; }
 def u6_0Imm : Operand<i32> { let ParserMatchClass = u6_0ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
-def u6_0ImmPred : PatLeaf<(i32 imm), [{ return isShiftedUInt<6, 0>(N->getSExtValue());}]>;
+defm u6_0ImmPred : ImmOpPred<[{ return isShiftedUInt<6, 0>(N->getSExtValue());}]>;
 def a30_2ImmOperand : AsmOperandClass { let Name = "a30_2Imm"; let RenderMethod = "addSignedImmOperands"; }
 def a30_2Imm : Operand<i32> { let ParserMatchClass = a30_2ImmOperand; let DecoderMethod = "brtargetDecoder"; let PrintMethod = "printBrtarget"; }
-def a30_2ImmPred : PatLeaf<(i32 imm), [{ return isShiftedInt<32, 2>(N->getSExtValue());}]>;
+defm a30_2ImmPred : ImmOpPred<[{ return isShiftedInt<32, 2>(N->getSExtValue());}]>;
 def u29_3ImmOperand : AsmOperandClass { let Name = "u29_3Imm"; let RenderMethod = "addImmOperands"; }
 def u29_3Imm : Operand<i32> { let ParserMatchClass = u29_3ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
-def u29_3ImmPred : PatLeaf<(i32 imm), [{ return isShiftedUInt<32, 3>(N->getSExtValue());}]>;
+defm u29_3ImmPred : ImmOpPred<[{ return isShiftedUInt<32, 3>(N->getSExtValue());}]>;
 def s8_0ImmOperand : AsmOperandClass { let Name = "s8_0Imm"; let RenderMethod = "addSignedImmOperands"; }
 def s8_0Imm : Operand<i32> { let ParserMatchClass = s8_0ImmOperand; let DecoderMethod = "s8_0ImmDecoder"; }
-def s8_0ImmPred : PatLeaf<(i32 imm), [{ return isShiftedInt<8, 0>(N->getSExtValue());}]>;
+defm s8_0ImmPred : ImmOpPred<[{ return isShiftedInt<8, 0>(N->getSExtValue());}]>;
 def u32_0ImmOperand : AsmOperandClass { let Name = "u32_0Imm"; let RenderMethod = "addImmOperands"; }
 def u32_0Imm : Operand<i32> { let ParserMatchClass = u32_0ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
-def u32_0ImmPred : PatLeaf<(i32 imm), [{ return isShiftedUInt<32, 0>(N->getSExtValue());}]>;
+defm u32_0ImmPred : ImmOpPred<[{ return isShiftedUInt<32, 0>(N->getSExtValue());}]>;
 def u4_2ImmOperand : AsmOperandClass { let Name = "u4_2Imm"; let RenderMethod = "addImmOperands"; }
 def u4_2Imm : Operand<i32> { let ParserMatchClass = u4_2ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
-def u4_2ImmPred : PatLeaf<(i32 imm), [{ return isShiftedUInt<4, 2>(N->getSExtValue());}]>;
+defm u4_2ImmPred : ImmOpPred<[{ return isShiftedUInt<4, 2>(N->getSExtValue());}]>;
 def u3_0ImmOperand : AsmOperandClass { let Name = "u3_0Imm"; let RenderMethod = "addImmOperands"; }
 def u3_0Imm : Operand<i32> { let ParserMatchClass = u3_0ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
-def u3_0ImmPred :
PatLeaf<(i32 imm), [{ return isShiftedUInt<3, 0>(N->getSExtValue());}]>; +defm u3_0ImmPred : ImmOpPred<[{ return isShiftedUInt<3, 0>(N->getSExtValue());}]>; def b15_2ImmOperand : AsmOperandClass { let Name = "b15_2Imm"; let RenderMethod = "addSignedImmOperands"; } def b15_2Imm : Operand { let ParserMatchClass = b15_2ImmOperand; let DecoderMethod = "brtargetDecoder"; let PrintMethod = "printBrtarget"; } -def b15_2ImmPred : PatLeaf<(i32 imm), [{ return isShiftedInt<15, 2>(N->getSExtValue());}]>; +defm b15_2ImmPred : ImmOpPred<[{ return isShiftedInt<15, 2>(N->getSExtValue());}]>; def u11_3ImmOperand : AsmOperandClass { let Name = "u11_3Imm"; let RenderMethod = "addImmOperands"; } def u11_3Imm : Operand { let ParserMatchClass = u11_3ImmOperand; let DecoderMethod = "unsignedImmDecoder"; } -def u11_3ImmPred : PatLeaf<(i32 imm), [{ return isShiftedUInt<11, 3>(N->getSExtValue());}]>; +defm u11_3ImmPred : ImmOpPred<[{ return isShiftedUInt<11, 3>(N->getSExtValue());}]>; def s4_3ImmOperand : AsmOperandClass { let Name = "s4_3Imm"; let RenderMethod = "addSignedImmOperands"; } def s4_3Imm : Operand { let ParserMatchClass = s4_3ImmOperand; let DecoderMethod = "s4_3ImmDecoder"; } -def s4_3ImmPred : PatLeaf<(i32 imm), [{ return isShiftedInt<4, 3>(N->getSExtValue());}]>; +defm s4_3ImmPred : ImmOpPred<[{ return isShiftedInt<4, 3>(N->getSExtValue());}]>; def m32_0ImmOperand : AsmOperandClass { let Name = "m32_0Imm"; let RenderMethod = "addImmOperands"; } def m32_0Imm : Operand { let ParserMatchClass = m32_0ImmOperand; let DecoderMethod = "unsignedImmDecoder"; } -def m32_0ImmPred : PatLeaf<(i32 imm), [{ return isShiftedInt<32, 0>(N->getSExtValue());}]>; +defm m32_0ImmPred : ImmOpPred<[{ return isShiftedInt<32, 0>(N->getSExtValue());}]>; def u3_1ImmOperand : AsmOperandClass { let Name = "u3_1Imm"; let RenderMethod = "addImmOperands"; } def u3_1Imm : Operand { let ParserMatchClass = u3_1ImmOperand; let DecoderMethod = "unsignedImmDecoder"; } -def u3_1ImmPred : PatLeaf<(i32 imm), [{ return isShiftedUInt<3, 1>(N->getSExtValue());}]>; +defm u3_1ImmPred : ImmOpPred<[{ return isShiftedUInt<3, 1>(N->getSExtValue());}]>; def u1_0ImmOperand : AsmOperandClass { let Name = "u1_0Imm"; let RenderMethod = "addImmOperands"; } def u1_0Imm : Operand { let ParserMatchClass = u1_0ImmOperand; let DecoderMethod = "unsignedImmDecoder"; } -def u1_0ImmPred : PatLeaf<(i32 imm), [{ return isShiftedUInt<1, 0>(N->getSExtValue());}]>; +defm u1_0ImmPred : ImmOpPred<[{ return isShiftedUInt<1, 0>(N->getSExtValue());}]>; def s31_1ImmOperand : AsmOperandClass { let Name = "s31_1Imm"; let RenderMethod = "addSignedImmOperands"; } def s31_1Imm : Operand { let ParserMatchClass = s31_1ImmOperand; let DecoderMethod = "s31_1ImmDecoder"; } -def s31_1ImmPred : PatLeaf<(i32 imm), [{ return isShiftedInt<32, 1>(N->getSExtValue());}]>; +defm s31_1ImmPred : ImmOpPred<[{ return isShiftedInt<32, 1>(N->getSExtValue());}]>; def s3_0ImmOperand : AsmOperandClass { let Name = "s3_0Imm"; let RenderMethod = "addSignedImmOperands"; } def s3_0Imm : Operand { let ParserMatchClass = s3_0ImmOperand; let DecoderMethod = "s3_0ImmDecoder"; } -def s3_0ImmPred : PatLeaf<(i32 imm), [{ return isShiftedInt<3, 0>(N->getSExtValue());}]>; +defm s3_0ImmPred : ImmOpPred<[{ return isShiftedInt<3, 0>(N->getSExtValue());}]>; def s30_2ImmOperand : AsmOperandClass { let Name = "s30_2Imm"; let RenderMethod = "addSignedImmOperands"; } def s30_2Imm : Operand { let ParserMatchClass = s30_2ImmOperand; let DecoderMethod = "s30_2ImmDecoder"; } -def s30_2ImmPred : PatLeaf<(i32 imm), [{ return 
isShiftedInt<32, 2>(N->getSExtValue());}]>; +defm s30_2ImmPred : ImmOpPred<[{ return isShiftedInt<32, 2>(N->getSExtValue());}]>; def u4_0ImmOperand : AsmOperandClass { let Name = "u4_0Imm"; let RenderMethod = "addImmOperands"; } def u4_0Imm : Operand { let ParserMatchClass = u4_0ImmOperand; let DecoderMethod = "unsignedImmDecoder"; } -def u4_0ImmPred : PatLeaf<(i32 imm), [{ return isShiftedUInt<4, 0>(N->getSExtValue());}]>; +defm u4_0ImmPred : ImmOpPred<[{ return isShiftedUInt<4, 0>(N->getSExtValue());}]>; def s6_0ImmOperand : AsmOperandClass { let Name = "s6_0Imm"; let RenderMethod = "addSignedImmOperands"; } def s6_0Imm : Operand { let ParserMatchClass = s6_0ImmOperand; let DecoderMethod = "s6_0ImmDecoder"; } -def s6_0ImmPred : PatLeaf<(i32 imm), [{ return isShiftedInt<6, 0>(N->getSExtValue());}]>; +defm s6_0ImmPred : ImmOpPred<[{ return isShiftedInt<6, 0>(N->getSExtValue());}]>; def u5_3ImmOperand : AsmOperandClass { let Name = "u5_3Imm"; let RenderMethod = "addImmOperands"; } def u5_3Imm : Operand { let ParserMatchClass = u5_3ImmOperand; let DecoderMethod = "unsignedImmDecoder"; } -def u5_3ImmPred : PatLeaf<(i32 imm), [{ return isShiftedUInt<5, 3>(N->getSExtValue());}]>; +defm u5_3ImmPred : ImmOpPred<[{ return isShiftedUInt<5, 3>(N->getSExtValue());}]>; def s32_0ImmOperand : AsmOperandClass { let Name = "s32_0Imm"; let RenderMethod = "addSignedImmOperands"; } def s32_0Imm : Operand { let ParserMatchClass = s32_0ImmOperand; let DecoderMethod = "s32_0ImmDecoder"; } -def s32_0ImmPred : PatLeaf<(i32 imm), [{ return isShiftedInt<32, 0>(N->getSExtValue());}]>; +defm s32_0ImmPred : ImmOpPred<[{ return isShiftedInt<32, 0>(N->getSExtValue());}]>; def s6_3ImmOperand : AsmOperandClass { let Name = "s6_3Imm"; let RenderMethod = "addSignedImmOperands"; } def s6_3Imm : Operand { let ParserMatchClass = s6_3ImmOperand; let DecoderMethod = "s6_3ImmDecoder"; } -def s6_3ImmPred : PatLeaf<(i32 imm), [{ return isShiftedInt<6, 3>(N->getSExtValue());}]>; +defm s6_3ImmPred : ImmOpPred<[{ return isShiftedInt<6, 3>(N->getSExtValue());}]>; def u10_0ImmOperand : AsmOperandClass { let Name = "u10_0Imm"; let RenderMethod = "addImmOperands"; } def u10_0Imm : Operand { let ParserMatchClass = u10_0ImmOperand; let DecoderMethod = "unsignedImmDecoder"; } -def u10_0ImmPred : PatLeaf<(i32 imm), [{ return isShiftedUInt<10, 0>(N->getSExtValue());}]>; +defm u10_0ImmPred : ImmOpPred<[{ return isShiftedUInt<10, 0>(N->getSExtValue());}]>; def u31_1ImmOperand : AsmOperandClass { let Name = "u31_1Imm"; let RenderMethod = "addImmOperands"; } def u31_1Imm : Operand { let ParserMatchClass = u31_1ImmOperand; let DecoderMethod = "unsignedImmDecoder"; } -def u31_1ImmPred : PatLeaf<(i32 imm), [{ return isShiftedUInt<32, 1>(N->getSExtValue());}]>; +defm u31_1ImmPred : ImmOpPred<[{ return isShiftedUInt<32, 1>(N->getSExtValue());}]>; def s4_1ImmOperand : AsmOperandClass { let Name = "s4_1Imm"; let RenderMethod = "addSignedImmOperands"; } def s4_1Imm : Operand { let ParserMatchClass = s4_1ImmOperand; let DecoderMethod = "s4_1ImmDecoder"; } -def s4_1ImmPred : PatLeaf<(i32 imm), [{ return isShiftedInt<4, 1>(N->getSExtValue());}]>; +defm s4_1ImmPred : ImmOpPred<[{ return isShiftedInt<4, 1>(N->getSExtValue());}]>; def u16_0ImmOperand : AsmOperandClass { let Name = "u16_0Imm"; let RenderMethod = "addImmOperands"; } def u16_0Imm : Operand { let ParserMatchClass = u16_0ImmOperand; let DecoderMethod = "unsignedImmDecoder"; } -def u16_0ImmPred : PatLeaf<(i32 imm), [{ return isShiftedUInt<16, 0>(N->getSExtValue());}]>; +defm u16_0ImmPred : 
ImmOpPred<[{ return isShiftedUInt<16, 0>(N->getSExtValue());}]>; def u6_1ImmOperand : AsmOperandClass { let Name = "u6_1Imm"; let RenderMethod = "addImmOperands"; } def u6_1Imm : Operand { let ParserMatchClass = u6_1ImmOperand; let DecoderMethod = "unsignedImmDecoder"; } -def u6_1ImmPred : PatLeaf<(i32 imm), [{ return isShiftedUInt<6, 1>(N->getSExtValue());}]>; +defm u6_1ImmPred : ImmOpPred<[{ return isShiftedUInt<6, 1>(N->getSExtValue());}]>; def u5_2ImmOperand : AsmOperandClass { let Name = "u5_2Imm"; let RenderMethod = "addImmOperands"; } def u5_2Imm : Operand { let ParserMatchClass = u5_2ImmOperand; let DecoderMethod = "unsignedImmDecoder"; } -def u5_2ImmPred : PatLeaf<(i32 imm), [{ return isShiftedUInt<5, 2>(N->getSExtValue());}]>; +defm u5_2ImmPred : ImmOpPred<[{ return isShiftedUInt<5, 2>(N->getSExtValue());}]>; def u26_6ImmOperand : AsmOperandClass { let Name = "u26_6Imm"; let RenderMethod = "addImmOperands"; } def u26_6Imm : Operand { let ParserMatchClass = u26_6ImmOperand; let DecoderMethod = "unsignedImmDecoder"; } -def u26_6ImmPred : PatLeaf<(i32 imm), [{ return isShiftedUInt<26, 6>(N->getSExtValue());}]>; +defm u26_6ImmPred : ImmOpPred<[{ return isShiftedUInt<26, 6>(N->getSExtValue());}]>; def u6_2ImmOperand : AsmOperandClass { let Name = "u6_2Imm"; let RenderMethod = "addImmOperands"; } def u6_2Imm : Operand { let ParserMatchClass = u6_2ImmOperand; let DecoderMethod = "unsignedImmDecoder"; } -def u6_2ImmPred : PatLeaf<(i32 imm), [{ return isShiftedUInt<6, 2>(N->getSExtValue());}]>; +defm u6_2ImmPred : ImmOpPred<[{ return isShiftedUInt<6, 2>(N->getSExtValue());}]>; def u7_0ImmOperand : AsmOperandClass { let Name = "u7_0Imm"; let RenderMethod = "addImmOperands"; } def u7_0Imm : Operand { let ParserMatchClass = u7_0ImmOperand; let DecoderMethod = "unsignedImmDecoder"; } -def u7_0ImmPred : PatLeaf<(i32 imm), [{ return isShiftedUInt<7, 0>(N->getSExtValue());}]>; +defm u7_0ImmPred : ImmOpPred<[{ return isShiftedUInt<7, 0>(N->getSExtValue());}]>; def b13_2ImmOperand : AsmOperandClass { let Name = "b13_2Imm"; let RenderMethod = "addSignedImmOperands"; } def b13_2Imm : Operand { let ParserMatchClass = b13_2ImmOperand; let DecoderMethod = "brtargetDecoder"; let PrintMethod = "printBrtarget"; } -def b13_2ImmPred : PatLeaf<(i32 imm), [{ return isShiftedInt<13, 2>(N->getSExtValue());}]>; +defm b13_2ImmPred : ImmOpPred<[{ return isShiftedInt<13, 2>(N->getSExtValue());}]>; def u5_0ImmOperand : AsmOperandClass { let Name = "u5_0Imm"; let RenderMethod = "addImmOperands"; } def u5_0Imm : Operand { let ParserMatchClass = u5_0ImmOperand; let DecoderMethod = "unsignedImmDecoder"; } -def u5_0ImmPred : PatLeaf<(i32 imm), [{ return isShiftedUInt<5, 0>(N->getSExtValue());}]>; +defm u5_0ImmPred : ImmOpPred<[{ return isShiftedUInt<5, 0>(N->getSExtValue());}]>; def u2_0ImmOperand : AsmOperandClass { let Name = "u2_0Imm"; let RenderMethod = "addImmOperands"; } def u2_0Imm : Operand { let ParserMatchClass = u2_0ImmOperand; let DecoderMethod = "unsignedImmDecoder"; } -def u2_0ImmPred : PatLeaf<(i32 imm), [{ return isShiftedUInt<2, 0>(N->getSExtValue());}]>; +defm u2_0ImmPred : ImmOpPred<[{ return isShiftedUInt<2, 0>(N->getSExtValue());}]>; def s4_2ImmOperand : AsmOperandClass { let Name = "s4_2Imm"; let RenderMethod = "addSignedImmOperands"; } def s4_2Imm : Operand { let ParserMatchClass = s4_2ImmOperand; let DecoderMethod = "s4_2ImmDecoder"; } -def s4_2ImmPred : PatLeaf<(i32 imm), [{ return isShiftedInt<4, 2>(N->getSExtValue());}]>; +defm s4_2ImmPred : ImmOpPred<[{ return isShiftedInt<4, 
2>(N->getSExtValue());}]>; def b30_2ImmOperand : AsmOperandClass { let Name = "b30_2Imm"; let RenderMethod = "addSignedImmOperands"; } def b30_2Imm : Operand { let ParserMatchClass = b30_2ImmOperand; let DecoderMethod = "brtargetDecoder"; let PrintMethod = "printBrtarget"; } -def b30_2ImmPred : PatLeaf<(i32 imm), [{ return isShiftedInt<32, 2>(N->getSExtValue());}]>; +defm b30_2ImmPred : ImmOpPred<[{ return isShiftedInt<32, 2>(N->getSExtValue());}]>; def u8_0ImmOperand : AsmOperandClass { let Name = "u8_0Imm"; let RenderMethod = "addImmOperands"; } def u8_0Imm : Operand { let ParserMatchClass = u8_0ImmOperand; let DecoderMethod = "unsignedImmDecoder"; } -def u8_0ImmPred : PatLeaf<(i32 imm), [{ return isShiftedUInt<8, 0>(N->getSExtValue());}]>; +defm u8_0ImmPred : ImmOpPred<[{ return isShiftedUInt<8, 0>(N->getSExtValue());}]>; def u30_2ImmOperand : AsmOperandClass { let Name = "u30_2Imm"; let RenderMethod = "addImmOperands"; } def u30_2Imm : Operand { let ParserMatchClass = u30_2ImmOperand; let DecoderMethod = "unsignedImmDecoder"; } -def u30_2ImmPred : PatLeaf<(i32 imm), [{ return isShiftedUInt<32, 2>(N->getSExtValue());}]>; +defm u30_2ImmPred : ImmOpPred<[{ return isShiftedUInt<32, 2>(N->getSExtValue());}]>; diff --git a/lib/Target/Hexagon/HexagonEarlyIfConv.cpp b/lib/Target/Hexagon/HexagonEarlyIfConv.cpp index c1f32e54e98d..0844fb8a8629 100644 --- a/lib/Target/Hexagon/HexagonEarlyIfConv.cpp +++ b/lib/Target/Hexagon/HexagonEarlyIfConv.cpp @@ -250,7 +250,7 @@ bool HexagonEarlyIfConversion::matchFlowPattern(MachineBasicBlock *B, unsigned Opc = T1I->getOpcode(); if (Opc != Hexagon::J2_jumpt && Opc != Hexagon::J2_jumpf) return false; - unsigned PredR = T1I->getOperand(0).getReg(); + Register PredR = T1I->getOperand(0).getReg(); // Get the layout successor, or 0 if B does not have one. MachineFunction::iterator NextBI = std::next(MachineFunction::iterator(B)); @@ -384,8 +384,8 @@ bool HexagonEarlyIfConversion::isValidCandidate(const MachineBasicBlock *B) for (const MachineOperand &MO : MI.operands()) { if (!MO.isReg() || !MO.isDef()) continue; - unsigned R = MO.getReg(); - if (!TargetRegisterInfo::isVirtualRegister(R)) + Register R = MO.getReg(); + if (!Register::isVirtualRegister(R)) continue; if (!isPredicate(R)) continue; @@ -401,8 +401,8 @@ bool HexagonEarlyIfConversion::usesUndefVReg(const MachineInstr *MI) const { for (const MachineOperand &MO : MI->operands()) { if (!MO.isReg() || !MO.isUse()) continue; - unsigned R = MO.getReg(); - if (!TargetRegisterInfo::isVirtualRegister(R)) + Register R = MO.getReg(); + if (!Register::isVirtualRegister(R)) continue; const MachineInstr *DefI = MRI->getVRegDef(R); // "Undefined" virtual registers are actually defined via IMPLICIT_DEF. 
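The s*_*/u*_* immediate predicates rewritten to ImmOpPred above all bottom out in llvm::isShiftedInt / llvm::isShiftedUInt checks. A standalone restatement of what those templates test, written from their documented behavior rather than copied from MathExtras.h: the low S bits must be zero, and the value shifted right by S must fit in N signed (or unsigned) bits.

#include <cstdint>

// isShiftedIntSketch<4, 2>(x), for example, accepts exactly the multiples of 4
// in [-32, 28] -- the range the s4_2ImmPred predicate above describes.
template <unsigned N, unsigned S> bool isShiftedIntSketch(int64_t x) {
  if (x & ((int64_t(1) << S) - 1))           // low S bits must be clear
    return false;
  int64_t v = x >> S;                        // arithmetic shift keeps the sign
  return v >= -(int64_t(1) << (N - 1)) && v < (int64_t(1) << (N - 1));
}

template <unsigned N, unsigned S> bool isShiftedUIntSketch(uint64_t x) {
  if (x & ((uint64_t(1) << S) - 1))          // low S bits must be clear
    return false;
  return (x >> S) < (uint64_t(1) << N);      // remaining value fits in N bits
}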
@@ -437,7 +437,7 @@ bool HexagonEarlyIfConversion::isValid(const FlowPattern &FP) const { break; if (usesUndefVReg(&MI)) return false; - unsigned DefR = MI.getOperand(0).getReg(); + Register DefR = MI.getOperand(0).getReg(); if (isPredicate(DefR)) return false; } @@ -491,8 +491,8 @@ unsigned HexagonEarlyIfConversion::countPredicateDefs( for (const MachineOperand &MO : MI.operands()) { if (!MO.isReg() || !MO.isDef()) continue; - unsigned R = MO.getReg(); - if (!TargetRegisterInfo::isVirtualRegister(R)) + Register R = MO.getReg(); + if (!Register::isVirtualRegister(R)) continue; if (isPredicate(R)) PredDefs++; @@ -798,7 +798,7 @@ unsigned HexagonEarlyIfConversion::buildMux(MachineBasicBlock *B, const MCInstrDesc &D = HII->get(Opc); DebugLoc DL = B->findBranchDebugLoc(); - unsigned MuxR = MRI->createVirtualRegister(DRC); + Register MuxR = MRI->createVirtualRegister(DRC); BuildMI(*B, At, DL, D, MuxR) .addReg(PredR) .addReg(TR, 0, TSR) @@ -837,7 +837,7 @@ void HexagonEarlyIfConversion::updatePhiNodes(MachineBasicBlock *WhereB, unsigned MuxR = 0, MuxSR = 0; if (TR && FR) { - unsigned DR = PN->getOperand(0).getReg(); + Register DR = PN->getOperand(0).getReg(); const TargetRegisterClass *RC = MRI->getRegClass(DR); MuxR = buildMux(FP.SplitB, FP.SplitB->getFirstTerminator(), RC, FP.PredR, TR, TSR, FR, FSR); @@ -988,8 +988,8 @@ void HexagonEarlyIfConversion::eliminatePhis(MachineBasicBlock *B) { MachineInstr *PN = &*I; assert(PN->getNumOperands() == 3 && "Invalid phi node"); MachineOperand &UO = PN->getOperand(1); - unsigned UseR = UO.getReg(), UseSR = UO.getSubReg(); - unsigned DefR = PN->getOperand(0).getReg(); + Register UseR = UO.getReg(), UseSR = UO.getSubReg(); + Register DefR = PN->getOperand(0).getReg(); unsigned NewR = UseR; if (UseSR) { // MRI.replaceVregUsesWith does not allow to update the subregister, diff --git a/lib/Target/Hexagon/HexagonExpandCondsets.cpp b/lib/Target/Hexagon/HexagonExpandCondsets.cpp index c343e426ac7d..8984ee82960d 100644 --- a/lib/Target/Hexagon/HexagonExpandCondsets.cpp +++ b/lib/Target/Hexagon/HexagonExpandCondsets.cpp @@ -285,7 +285,7 @@ bool HexagonExpandCondsets::isCondset(const MachineInstr &MI) { } LaneBitmask HexagonExpandCondsets::getLaneMask(unsigned Reg, unsigned Sub) { - assert(TargetRegisterInfo::isVirtualRegister(Reg)); + assert(Register::isVirtualRegister(Reg)); return Sub != 0 ? 
TRI->getSubRegIndexLaneMask(Sub) : MRI->getMaxLaneMaskForVReg(Reg); } @@ -364,7 +364,7 @@ void HexagonExpandCondsets::updateKillFlags(unsigned Reg) { void HexagonExpandCondsets::updateDeadsInRange(unsigned Reg, LaneBitmask LM, LiveRange &Range) { - assert(TargetRegisterInfo::isVirtualRegister(Reg)); + assert(Register::isVirtualRegister(Reg)); if (Range.empty()) return; @@ -372,8 +372,8 @@ void HexagonExpandCondsets::updateDeadsInRange(unsigned Reg, LaneBitmask LM, auto IsRegDef = [this,Reg,LM] (MachineOperand &Op) -> std::pair { if (!Op.isReg() || !Op.isDef()) return { false, false }; - unsigned DR = Op.getReg(), DSR = Op.getSubReg(); - if (!TargetRegisterInfo::isVirtualRegister(DR) || DR != Reg) + Register DR = Op.getReg(), DSR = Op.getSubReg(); + if (!Register::isVirtualRegister(DR) || DR != Reg) return { false, false }; LaneBitmask SLM = getLaneMask(DR, DSR); LaneBitmask A = SLM & LM; @@ -551,8 +551,8 @@ void HexagonExpandCondsets::updateLiveness(std::set &RegSet, bool Recalc, bool UpdateKills, bool UpdateDeads) { UpdateKills |= UpdateDeads; for (unsigned R : RegSet) { - if (!TargetRegisterInfo::isVirtualRegister(R)) { - assert(TargetRegisterInfo::isPhysicalRegister(R)); + if (!Register::isVirtualRegister(R)) { + assert(Register::isPhysicalRegister(R)); // There shouldn't be any physical registers as operands, except // possibly reserved registers. assert(MRI->isReserved(R)); @@ -579,17 +579,17 @@ unsigned HexagonExpandCondsets::getCondTfrOpcode(const MachineOperand &SO, using namespace Hexagon; if (SO.isReg()) { - unsigned PhysR; + Register PhysR; RegisterRef RS = SO; - if (TargetRegisterInfo::isVirtualRegister(RS.Reg)) { + if (Register::isVirtualRegister(RS.Reg)) { const TargetRegisterClass *VC = MRI->getRegClass(RS.Reg); assert(VC->begin() != VC->end() && "Empty register class"); PhysR = *VC->begin(); } else { - assert(TargetRegisterInfo::isPhysicalRegister(RS.Reg)); + assert(Register::isPhysicalRegister(RS.Reg)); PhysR = RS.Reg; } - unsigned PhysS = (RS.Sub == 0) ? PhysR : TRI->getSubReg(PhysR, RS.Sub); + Register PhysS = (RS.Sub == 0) ? PhysR : TRI->getSubReg(PhysR, RS.Sub); const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(PhysS); switch (TRI->getRegSizeInBits(*RC)) { case 32: @@ -671,7 +671,7 @@ bool HexagonExpandCondsets::split(MachineInstr &MI, MachineOperand &MD = MI.getOperand(0); // Definition MachineOperand &MP = MI.getOperand(1); // Predicate register assert(MD.isDef()); - unsigned DR = MD.getReg(), DSR = MD.getSubReg(); + Register DR = MD.getReg(), DSR = MD.getSubReg(); bool ReadUndef = MD.isUndef(); MachineBasicBlock::iterator At = MI; @@ -802,7 +802,7 @@ bool HexagonExpandCondsets::canMoveOver(MachineInstr &MI, ReferenceMap &Defs, // For physical register we would need to check register aliases, etc. // and we don't want to bother with that. It would be of little value // before the actual register rewriting (from virtual to physical). - if (!TargetRegisterInfo::isVirtualRegister(RR.Reg)) + if (!Register::isVirtualRegister(RR.Reg)) return false; // No redefs for any operand. 
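Most of the mechanical churn in this and the following files is the unsigned-to-Register migration: TargetRegisterInfo::isVirtualRegister and friends become static members of Register, and values produced by getReg() or createVirtualRegister are held as Register. A small model of the virtual/physical split these helpers rely on, assuming the usual encoding in which virtual register numbers occupy the top half of the 32-bit range; this is an illustration, not the Register class itself.

#include <cassert>

namespace sketch {
struct Register {
  // Assumed encoding: bit 31 marks a virtual register, 0 means "no register".
  static constexpr unsigned VirtualFlag = 1u << 31;

  static bool isVirtualRegister(unsigned R) { return R & VirtualFlag; }
  static bool isPhysicalRegister(unsigned R) {
    return R != 0 && !(R & VirtualFlag);
  }
  static unsigned virtReg2Index(unsigned R) {
    assert(isVirtualRegister(R) && "not a virtual register");
    return R & ~VirtualFlag;
  }
  static unsigned index2VirtReg(unsigned Index) { return Index | VirtualFlag; }
};
} // namespace sketch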
if (isRefInMap(RR, Defs, Exec_Then)) @@ -954,7 +954,7 @@ bool HexagonExpandCondsets::predicate(MachineInstr &TfrI, bool Cond, return false; RegisterRef RT(MS); - unsigned PredR = MP.getReg(); + Register PredR = MP.getReg(); MachineInstr *DefI = getReachingDefForPred(RT, TfrI, PredR, Cond); if (!DefI || !isPredicable(DefI)) return false; @@ -999,7 +999,7 @@ bool HexagonExpandCondsets::predicate(MachineInstr &TfrI, bool Cond, // subregisters are other physical registers, and we are not checking // that. RegisterRef RR = Op; - if (!TargetRegisterInfo::isVirtualRegister(RR.Reg)) + if (!Register::isVirtualRegister(RR.Reg)) return false; ReferenceMap &Map = Op.isDef() ? Defs : Uses; @@ -1091,7 +1091,7 @@ bool HexagonExpandCondsets::predicateInBlock(MachineBasicBlock &B, } bool HexagonExpandCondsets::isIntReg(RegisterRef RR, unsigned &BW) { - if (!TargetRegisterInfo::isVirtualRegister(RR.Reg)) + if (!Register::isVirtualRegister(RR.Reg)) return false; const TargetRegisterClass *RC = MRI->getRegClass(RR.Reg); if (RC == &Hexagon::IntRegsRegClass) { diff --git a/lib/Target/Hexagon/HexagonFixupHwLoops.cpp b/lib/Target/Hexagon/HexagonFixupHwLoops.cpp index f7edc168de4a..d21de8ccb5ab 100644 --- a/lib/Target/Hexagon/HexagonFixupHwLoops.cpp +++ b/lib/Target/Hexagon/HexagonFixupHwLoops.cpp @@ -114,12 +114,11 @@ bool HexagonFixupHwLoops::fixupLoopInstrs(MachineFunction &MF) { // First pass - compute the offset of each basic block. for (const MachineBasicBlock &MBB : MF) { - if (MBB.getAlignment()) { + if (MBB.getAlignment() != Align::None()) { // Although we don't know the exact layout of the final code, we need // to account for alignment padding somehow. This heuristic pads each // aligned basic block according to the alignment value. - int ByteAlign = (1u << MBB.getAlignment()) - 1; - InstOffset = (InstOffset + ByteAlign) & ~(ByteAlign); + InstOffset = alignTo(InstOffset, MBB.getAlignment()); } BlockToInstOffset[&MBB] = InstOffset; diff --git a/lib/Target/Hexagon/HexagonFrameLowering.cpp b/lib/Target/Hexagon/HexagonFrameLowering.cpp index 3368ee4fb3b9..bfa3372d7faf 100644 --- a/lib/Target/Hexagon/HexagonFrameLowering.cpp +++ b/lib/Target/Hexagon/HexagonFrameLowering.cpp @@ -303,10 +303,10 @@ static bool needsStackFrame(const MachineBasicBlock &MBB, const BitVector &CSR, if (MO.isFI()) return true; if (MO.isReg()) { - unsigned R = MO.getReg(); + Register R = MO.getReg(); // Virtual registers will need scavenging, which then may require // a stack slot. - if (TargetRegisterInfo::isVirtualRegister(R)) + if (Register::isVirtualRegister(R)) return true; for (MCSubRegIterator S(R, &HRI, true); S.isValid(); ++S) if (CSR[*S]) @@ -973,8 +973,8 @@ void HexagonFrameLowering::insertCFIInstructionsAt(MachineBasicBlock &MBB, // understand paired registers for cfi_offset. 
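The HexagonFixupHwLoops hunk above drops the hand-rolled mask arithmetic on a log2 alignment in favour of MBB.getAlignment() returning an llvm::Align and a call to alignTo. Both forms round InstOffset up to the same boundary; a self-contained sketch of the rounding, assuming a power-of-two alignment value as Align guarantees:

#include <cassert>
#include <cstdint>

static uint64_t alignToSketch(uint64_t Offset, uint64_t Alignment) {
  assert(Alignment && (Alignment & (Alignment - 1)) == 0 &&
         "Align values are always non-zero powers of two");
  return (Offset + Alignment - 1) & ~(Alignment - 1);
}

// Old form, for comparison, where the block stored log2 of its alignment:
//   unsigned ByteAlign = (1u << LogAlign) - 1;
//   InstOffset = (InstOffset + ByteAlign) & ~ByteAlign;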
// Eg .cfi_offset r1:0, -64 - unsigned HiReg = HRI.getSubReg(Reg, Hexagon::isub_hi); - unsigned LoReg = HRI.getSubReg(Reg, Hexagon::isub_lo); + Register HiReg = HRI.getSubReg(Reg, Hexagon::isub_hi); + Register LoReg = HRI.getSubReg(Reg, Hexagon::isub_lo); unsigned HiDwarfReg = HRI.getDwarfRegNum(HiReg, true); unsigned LoDwarfReg = HRI.getDwarfRegNum(LoReg, true); auto OffHi = MCCFIInstruction::createOffset(FrameLabel, HiDwarfReg, @@ -1377,10 +1377,10 @@ void HexagonFrameLowering::processFunctionBeforeFrameFinalized( } MFI.setLocalFrameSize(LFS); - unsigned A = MFI.getLocalFrameMaxAlign(); + Align A = MFI.getLocalFrameMaxAlign(); assert(A <= 8 && "Unexpected local frame alignment"); - if (A == 0) - MFI.setLocalFrameMaxAlign(8); + if (A == 1) + MFI.setLocalFrameMaxAlign(Align(8)); MFI.setUseLocalStackAllocationBlock(true); // Set the physical aligned-stack base address register. @@ -1570,13 +1570,13 @@ bool HexagonFrameLowering::expandCopy(MachineBasicBlock &B, const HexagonInstrInfo &HII, SmallVectorImpl &NewRegs) const { MachineInstr *MI = &*It; DebugLoc DL = MI->getDebugLoc(); - unsigned DstR = MI->getOperand(0).getReg(); - unsigned SrcR = MI->getOperand(1).getReg(); + Register DstR = MI->getOperand(0).getReg(); + Register SrcR = MI->getOperand(1).getReg(); if (!Hexagon::ModRegsRegClass.contains(DstR) || !Hexagon::ModRegsRegClass.contains(SrcR)) return false; - unsigned TmpR = MRI.createVirtualRegister(&Hexagon::IntRegsRegClass); + Register TmpR = MRI.createVirtualRegister(&Hexagon::IntRegsRegClass); BuildMI(B, It, DL, HII.get(TargetOpcode::COPY), TmpR).add(MI->getOperand(1)); BuildMI(B, It, DL, HII.get(TargetOpcode::COPY), DstR) .addReg(TmpR, RegState::Kill); @@ -1595,13 +1595,13 @@ bool HexagonFrameLowering::expandStoreInt(MachineBasicBlock &B, DebugLoc DL = MI->getDebugLoc(); unsigned Opc = MI->getOpcode(); - unsigned SrcR = MI->getOperand(2).getReg(); + Register SrcR = MI->getOperand(2).getReg(); bool IsKill = MI->getOperand(2).isKill(); int FI = MI->getOperand(0).getIndex(); // TmpR = C2_tfrpr SrcR if SrcR is a predicate register // TmpR = A2_tfrcrr SrcR if SrcR is a modifier register - unsigned TmpR = MRI.createVirtualRegister(&Hexagon::IntRegsRegClass); + Register TmpR = MRI.createVirtualRegister(&Hexagon::IntRegsRegClass); unsigned TfrOpc = (Opc == Hexagon::STriw_pred) ? 
Hexagon::C2_tfrpr : Hexagon::A2_tfrcrr; BuildMI(B, It, DL, HII.get(TfrOpc), TmpR) @@ -1628,11 +1628,11 @@ bool HexagonFrameLowering::expandLoadInt(MachineBasicBlock &B, DebugLoc DL = MI->getDebugLoc(); unsigned Opc = MI->getOpcode(); - unsigned DstR = MI->getOperand(0).getReg(); + Register DstR = MI->getOperand(0).getReg(); int FI = MI->getOperand(1).getIndex(); // TmpR = L2_loadri_io FI, 0 - unsigned TmpR = MRI.createVirtualRegister(&Hexagon::IntRegsRegClass); + Register TmpR = MRI.createVirtualRegister(&Hexagon::IntRegsRegClass); BuildMI(B, It, DL, HII.get(Hexagon::L2_loadri_io), TmpR) .addFrameIndex(FI) .addImm(0) @@ -1658,7 +1658,7 @@ bool HexagonFrameLowering::expandStoreVecPred(MachineBasicBlock &B, return false; DebugLoc DL = MI->getDebugLoc(); - unsigned SrcR = MI->getOperand(2).getReg(); + Register SrcR = MI->getOperand(2).getReg(); bool IsKill = MI->getOperand(2).isKill(); int FI = MI->getOperand(0).getIndex(); auto *RC = &Hexagon::HvxVRRegClass; @@ -1667,8 +1667,8 @@ bool HexagonFrameLowering::expandStoreVecPred(MachineBasicBlock &B, // TmpR0 = A2_tfrsi 0x01010101 // TmpR1 = V6_vandqrt Qx, TmpR0 // store FI, 0, TmpR1 - unsigned TmpR0 = MRI.createVirtualRegister(&Hexagon::IntRegsRegClass); - unsigned TmpR1 = MRI.createVirtualRegister(RC); + Register TmpR0 = MRI.createVirtualRegister(&Hexagon::IntRegsRegClass); + Register TmpR1 = MRI.createVirtualRegister(RC); BuildMI(B, It, DL, HII.get(Hexagon::A2_tfrsi), TmpR0) .addImm(0x01010101); @@ -1695,15 +1695,15 @@ bool HexagonFrameLowering::expandLoadVecPred(MachineBasicBlock &B, return false; DebugLoc DL = MI->getDebugLoc(); - unsigned DstR = MI->getOperand(0).getReg(); + Register DstR = MI->getOperand(0).getReg(); int FI = MI->getOperand(1).getIndex(); auto *RC = &Hexagon::HvxVRRegClass; // TmpR0 = A2_tfrsi 0x01010101 // TmpR1 = load FI, 0 // DstR = V6_vandvrt TmpR1, TmpR0 - unsigned TmpR0 = MRI.createVirtualRegister(&Hexagon::IntRegsRegClass); - unsigned TmpR1 = MRI.createVirtualRegister(RC); + Register TmpR0 = MRI.createVirtualRegister(&Hexagon::IntRegsRegClass); + Register TmpR1 = MRI.createVirtualRegister(RC); BuildMI(B, It, DL, HII.get(Hexagon::A2_tfrsi), TmpR0) .addImm(0x01010101); @@ -1745,9 +1745,9 @@ bool HexagonFrameLowering::expandStoreVec2(MachineBasicBlock &B, } DebugLoc DL = MI->getDebugLoc(); - unsigned SrcR = MI->getOperand(2).getReg(); - unsigned SrcLo = HRI.getSubReg(SrcR, Hexagon::vsub_lo); - unsigned SrcHi = HRI.getSubReg(SrcR, Hexagon::vsub_hi); + Register SrcR = MI->getOperand(2).getReg(); + Register SrcLo = HRI.getSubReg(SrcR, Hexagon::vsub_lo); + Register SrcHi = HRI.getSubReg(SrcR, Hexagon::vsub_hi); bool IsKill = MI->getOperand(2).isKill(); int FI = MI->getOperand(0).getIndex(); @@ -1793,9 +1793,9 @@ bool HexagonFrameLowering::expandLoadVec2(MachineBasicBlock &B, return false; DebugLoc DL = MI->getDebugLoc(); - unsigned DstR = MI->getOperand(0).getReg(); - unsigned DstHi = HRI.getSubReg(DstR, Hexagon::vsub_hi); - unsigned DstLo = HRI.getSubReg(DstR, Hexagon::vsub_lo); + Register DstR = MI->getOperand(0).getReg(); + Register DstHi = HRI.getSubReg(DstR, Hexagon::vsub_hi); + Register DstLo = HRI.getSubReg(DstR, Hexagon::vsub_lo); int FI = MI->getOperand(1).getIndex(); unsigned Size = HRI.getSpillSize(Hexagon::HvxVRRegClass); @@ -1834,7 +1834,7 @@ bool HexagonFrameLowering::expandStoreVec(MachineBasicBlock &B, auto &HRI = *MF.getSubtarget().getRegisterInfo(); DebugLoc DL = MI->getDebugLoc(); - unsigned SrcR = MI->getOperand(2).getReg(); + Register SrcR = MI->getOperand(2).getReg(); bool IsKill = 
MI->getOperand(2).isKill(); int FI = MI->getOperand(0).getIndex(); @@ -1863,7 +1863,7 @@ bool HexagonFrameLowering::expandLoadVec(MachineBasicBlock &B, auto &HRI = *MF.getSubtarget().getRegisterInfo(); DebugLoc DL = MI->getDebugLoc(); - unsigned DstR = MI->getOperand(0).getReg(); + Register DstR = MI->getOperand(0).getReg(); int FI = MI->getOperand(1).getIndex(); unsigned NeedAlign = HRI.getSpillAlignment(Hexagon::HvxVRRegClass); @@ -2299,7 +2299,7 @@ void HexagonFrameLowering::optimizeSpillSlots(MachineFunction &MF, int TFI; if (!HII.isLoadFromStackSlot(MI, TFI) || TFI != FI) continue; - unsigned DstR = MI.getOperand(0).getReg(); + Register DstR = MI.getOperand(0).getReg(); assert(MI.getOperand(0).getSubReg() == 0); MachineInstr *CopyOut = nullptr; if (DstR != FoundR) { diff --git a/lib/Target/Hexagon/HexagonFrameLowering.h b/lib/Target/Hexagon/HexagonFrameLowering.h index 65e8c7686640..27265dd53794 100644 --- a/lib/Target/Hexagon/HexagonFrameLowering.h +++ b/lib/Target/Hexagon/HexagonFrameLowering.h @@ -30,7 +30,7 @@ class TargetRegisterClass; class HexagonFrameLowering : public TargetFrameLowering { public: explicit HexagonFrameLowering() - : TargetFrameLowering(StackGrowsDown, 8, 0, 1, true) {} + : TargetFrameLowering(StackGrowsDown, Align(8), 0, Align::None(), true) {} // All of the prolog/epilog functionality, including saving and restoring // callee-saved registers is handled in emitPrologue. This is to have the diff --git a/lib/Target/Hexagon/HexagonGenExtract.cpp b/lib/Target/Hexagon/HexagonGenExtract.cpp index 3417c74e359b..caa0e4d80397 100644 --- a/lib/Target/Hexagon/HexagonGenExtract.cpp +++ b/lib/Target/Hexagon/HexagonGenExtract.cpp @@ -184,7 +184,7 @@ bool HexagonGenExtract::convert(Instruction *In) { // The width of the extracted field is the minimum of the original bits // that remain after the shifts and the number of contiguous 1s in the mask. uint32_t W = std::min(U, T); - if (W == 0) + if (W == 0 || W == 1) return false; // Check if the extracted bits are contained within the mask that it is diff --git a/lib/Target/Hexagon/HexagonGenInsert.cpp b/lib/Target/Hexagon/HexagonGenInsert.cpp index 81025c1c5325..48881e02f4d3 100644 --- a/lib/Target/Hexagon/HexagonGenInsert.cpp +++ b/lib/Target/Hexagon/HexagonGenInsert.cpp @@ -163,11 +163,11 @@ namespace { } static inline unsigned v2x(unsigned v) { - return TargetRegisterInfo::virtReg2Index(v); + return Register::virtReg2Index(v); } static inline unsigned x2v(unsigned x) { - return TargetRegisterInfo::index2VirtReg(x); + return Register::index2VirtReg(x); } }; @@ -267,7 +267,7 @@ namespace { CellMapShadow(const BitTracker &T) : BT(T) {} const BitTracker::RegisterCell &lookup(unsigned VR) { - unsigned RInd = TargetRegisterInfo::virtReg2Index(VR); + unsigned RInd = Register::virtReg2Index(VR); // Grow the vector to at least 32 elements. 
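Two of the HexagonFrameLowering hunks above (getLocalFrameMaxAlign and the TargetFrameLowering constructor) switch raw unsigned alignments over to llvm::Align, where "no extra alignment" is Align(1) / Align::None() rather than 0; that is why the old A == 0 test becomes A == 1 while still meaning "only raise the default to 8". A toy model of just that convention, not the real class:

#include <cassert>
#include <cstdint>

struct AlignSketch {
  uint64_t Value;                                    // a power of two, never 0
  explicit AlignSketch(uint64_t V = 1) : Value(V) {
    assert(V && (V & (V - 1)) == 0 && "alignment must be a power of two");
  }
  static AlignSketch None() { return AlignSketch(1); } // the "unaligned" state
  friend bool operator==(AlignSketch A, AlignSketch B) {
    return A.Value == B.Value;
  }
};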
if (RInd >= CVect.size()) CVect.resize(std::max(RInd+16, 32U), nullptr); @@ -606,9 +606,9 @@ void HexagonGenInsert::buildOrderingMF(RegisterOrdering &RO) const { for (unsigned i = 0, n = MI->getNumOperands(); i < n; ++i) { const MachineOperand &MO = MI->getOperand(i); if (MO.isReg() && MO.isDef()) { - unsigned R = MO.getReg(); + Register R = MO.getReg(); assert(MO.getSubReg() == 0 && "Unexpected subregister in definition"); - if (TargetRegisterInfo::isVirtualRegister(R)) + if (Register::isVirtualRegister(R)) RO.insert(std::make_pair(R, Index++)); } } @@ -724,8 +724,8 @@ void HexagonGenInsert::getInstrDefs(const MachineInstr *MI, const MachineOperand &MO = MI->getOperand(i); if (!MO.isReg() || !MO.isDef()) continue; - unsigned R = MO.getReg(); - if (!TargetRegisterInfo::isVirtualRegister(R)) + Register R = MO.getReg(); + if (!Register::isVirtualRegister(R)) continue; Defs.insert(R); } @@ -737,8 +737,8 @@ void HexagonGenInsert::getInstrUses(const MachineInstr *MI, const MachineOperand &MO = MI->getOperand(i); if (!MO.isReg() || !MO.isUse()) continue; - unsigned R = MO.getReg(); - if (!TargetRegisterInfo::isVirtualRegister(R)) + Register R = MO.getReg(); + if (!Register::isVirtualRegister(R)) continue; Uses.insert(R); } @@ -1399,7 +1399,7 @@ bool HexagonGenInsert::generateInserts() { for (IFMapType::iterator I = IFMap.begin(), E = IFMap.end(); I != E; ++I) { unsigned VR = I->first; const TargetRegisterClass *RC = MRI->getRegClass(VR); - unsigned NewVR = MRI->createVirtualRegister(RC); + Register NewVR = MRI->createVirtualRegister(RC); RegMap[VR] = NewVR; } @@ -1477,9 +1477,8 @@ bool HexagonGenInsert::removeDeadCode(MachineDomTreeNode *N) { for (const MachineOperand &MO : MI->operands()) { if (!MO.isReg() || !MO.isDef()) continue; - unsigned R = MO.getReg(); - if (!TargetRegisterInfo::isVirtualRegister(R) || - !MRI->use_nodbg_empty(R)) { + Register R = MO.getReg(); + if (!Register::isVirtualRegister(R) || !MRI->use_nodbg_empty(R)) { AllDead = false; break; } @@ -1598,7 +1597,7 @@ bool HexagonGenInsert::runOnMachineFunction(MachineFunction &MF) { IterListType Out; for (IFMapType::iterator I = IFMap.begin(), E = IFMap.end(); I != E; ++I) { - unsigned Idx = TargetRegisterInfo::virtReg2Index(I->first); + unsigned Idx = Register::virtReg2Index(I->first); if (Idx >= Cutoff) Out.push_back(I); } diff --git a/lib/Target/Hexagon/HexagonGenMux.cpp b/lib/Target/Hexagon/HexagonGenMux.cpp index cdafbc20ab86..b559e7bbbb60 100644 --- a/lib/Target/Hexagon/HexagonGenMux.cpp +++ b/lib/Target/Hexagon/HexagonGenMux.cpp @@ -171,7 +171,7 @@ void HexagonGenMux::getDefsUses(const MachineInstr *MI, BitVector &Defs, for (const MachineOperand &MO : MI->operands()) { if (!MO.isReg() || MO.isImplicit()) continue; - unsigned R = MO.getReg(); + Register R = MO.getReg(); BitVector &Set = MO.isDef() ? 
Defs : Uses; expandReg(R, Set); } @@ -239,14 +239,14 @@ bool HexagonGenMux::genMuxInBlock(MachineBasicBlock &B) { unsigned Opc = MI->getOpcode(); if (!isCondTransfer(Opc)) continue; - unsigned DR = MI->getOperand(0).getReg(); + Register DR = MI->getOperand(0).getReg(); if (isRegPair(DR)) continue; MachineOperand &PredOp = MI->getOperand(1); if (PredOp.isUndef()) continue; - unsigned PR = PredOp.getReg(); + Register PR = PredOp.getReg(); unsigned Idx = I2X.lookup(MI); CondsetMap::iterator F = CM.find(DR); bool IfTrue = HII->isPredicatedTrue(Opc); diff --git a/lib/Target/Hexagon/HexagonGenPredicate.cpp b/lib/Target/Hexagon/HexagonGenPredicate.cpp index e991fa8b61c8..24d33c91a29b 100644 --- a/lib/Target/Hexagon/HexagonGenPredicate.cpp +++ b/lib/Target/Hexagon/HexagonGenPredicate.cpp @@ -133,7 +133,7 @@ INITIALIZE_PASS_END(HexagonGenPredicate, "hexagon-gen-pred", "Hexagon generate predicate operations", false, false) bool HexagonGenPredicate::isPredReg(unsigned R) { - if (!TargetRegisterInfo::isVirtualRegister(R)) + if (!Register::isVirtualRegister(R)) return false; const TargetRegisterClass *RC = MRI->getRegClass(R); return RC == &Hexagon::PredRegsRegClass; @@ -213,7 +213,7 @@ void HexagonGenPredicate::collectPredicateGPR(MachineFunction &MF) { case TargetOpcode::COPY: if (isPredReg(MI->getOperand(1).getReg())) { RegisterSubReg RD = MI->getOperand(0); - if (TargetRegisterInfo::isVirtualRegister(RD.R)) + if (Register::isVirtualRegister(RD.R)) PredGPRs.insert(RD); } break; @@ -245,7 +245,7 @@ RegisterSubReg HexagonGenPredicate::getPredRegFor(const RegisterSubReg &Reg) { // Create a predicate register for a given Reg. The newly created register // will have its value copied from Reg, so that it can be later used as // an operand in other instructions. - assert(TargetRegisterInfo::isVirtualRegister(Reg.R)); + assert(Register::isVirtualRegister(Reg.R)); RegToRegMap::iterator F = G2P.find(Reg); if (F != G2P.end()) return F->second; @@ -265,7 +265,7 @@ RegisterSubReg HexagonGenPredicate::getPredRegFor(const RegisterSubReg &Reg) { MachineBasicBlock &B = *DefI->getParent(); DebugLoc DL = DefI->getDebugLoc(); const TargetRegisterClass *PredRC = &Hexagon::PredRegsRegClass; - unsigned NewPR = MRI->createVirtualRegister(PredRC); + Register NewPR = MRI->createVirtualRegister(PredRC); // For convertible instructions, do not modify them, so that they can // be converted later. Generate a copy from Reg to NewPR. @@ -432,7 +432,7 @@ bool HexagonGenPredicate::convertToPredForm(MachineInstr *MI) { // Generate a copy-out: NewGPR = NewPR, and replace all uses of OutR // with NewGPR. 
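The HexagonGenMux hunks a little above operate on pairs of predicated transfers; the rewrite the pass performs is easiest to see on scalars. A minimal model, illustration only: two conditional transfers into the same register under complementary senses of one predicate collapse into a single C2_mux.

#include <cassert>
#include <cstdint>

static int32_t predicatedPairSketch(bool P, int32_t A, int32_t B) {
  int32_t R = 0;
  if (P)  R = A;                     // A2_tfrt: transfer if predicate is true
  if (!P) R = B;                     // A2_tfrf: transfer if predicate is false
  return R;
}

static int32_t muxSketch(bool P, int32_t A, int32_t B) {
  return P ? A : B;                  // C2_mux
}

int main() {
  for (bool P : {false, true})
    assert(predicatedPairSketch(P, 7, -3) == muxSketch(P, 7, -3));
  return 0;
}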
const TargetRegisterClass *RC = MRI->getRegClass(OutR.R); - unsigned NewOutR = MRI->createVirtualRegister(RC); + Register NewOutR = MRI->createVirtualRegister(RC); BuildMI(B, MI, DL, TII->get(TargetOpcode::COPY), NewOutR) .addReg(NewPR.R, 0, NewPR.S); MRI->replaceRegWith(OutR.R, NewOutR); @@ -471,9 +471,9 @@ bool HexagonGenPredicate::eliminatePredCopies(MachineFunction &MF) { continue; RegisterSubReg DR = MI.getOperand(0); RegisterSubReg SR = MI.getOperand(1); - if (!TargetRegisterInfo::isVirtualRegister(DR.R)) + if (!Register::isVirtualRegister(DR.R)) continue; - if (!TargetRegisterInfo::isVirtualRegister(SR.R)) + if (!Register::isVirtualRegister(SR.R)) continue; if (MRI->getRegClass(DR.R) != PredRC) continue; diff --git a/lib/Target/Hexagon/HexagonHardwareLoops.cpp b/lib/Target/Hexagon/HexagonHardwareLoops.cpp index cecbaedb6d70..62291790f0fe 100644 --- a/lib/Target/Hexagon/HexagonHardwareLoops.cpp +++ b/lib/Target/Hexagon/HexagonHardwareLoops.cpp @@ -435,17 +435,17 @@ bool HexagonHardwareLoops::findInductionRegister(MachineLoop *L, if (Phi->getOperand(i+1).getMBB() != Latch) continue; - unsigned PhiOpReg = Phi->getOperand(i).getReg(); + Register PhiOpReg = Phi->getOperand(i).getReg(); MachineInstr *DI = MRI->getVRegDef(PhiOpReg); if (DI->getDesc().isAdd()) { // If the register operand to the add is the PHI we're looking at, this // meets the induction pattern. - unsigned IndReg = DI->getOperand(1).getReg(); + Register IndReg = DI->getOperand(1).getReg(); MachineOperand &Opnd2 = DI->getOperand(2); int64_t V; if (MRI->getVRegDef(IndReg) == Phi && checkForImmediate(Opnd2, V)) { - unsigned UpdReg = DI->getOperand(0).getReg(); + Register UpdReg = DI->getOperand(0).getReg(); IndMap.insert(std::make_pair(UpdReg, std::make_pair(IndReg, V))); } } @@ -694,7 +694,7 @@ CountValue *HexagonHardwareLoops::getLoopTripCount(MachineLoop *L, Cmp = Comparison::getSwappedComparison(Cmp); if (InitialValue->isReg()) { - unsigned R = InitialValue->getReg(); + Register R = InitialValue->getReg(); MachineBasicBlock *DefBB = MRI->getVRegDef(R)->getParent(); if (!MDT->properlyDominates(DefBB, Header)) { int64_t V; @@ -704,7 +704,7 @@ CountValue *HexagonHardwareLoops::getLoopTripCount(MachineLoop *L, OldInsts.push_back(MRI->getVRegDef(R)); } if (EndValue->isReg()) { - unsigned R = EndValue->getReg(); + Register R = EndValue->getReg(); MachineBasicBlock *DefBB = MRI->getVRegDef(R)->getParent(); if (!MDT->properlyDominates(DefBB, Header)) { int64_t V; @@ -910,7 +910,7 @@ CountValue *HexagonHardwareLoops::computeCount(MachineLoop *Loop, (RegToImm ? 
TII->get(Hexagon::A2_subri) : TII->get(Hexagon::A2_addi)); if (RegToReg || RegToImm) { - unsigned SubR = MRI->createVirtualRegister(IntRC); + Register SubR = MRI->createVirtualRegister(IntRC); MachineInstrBuilder SubIB = BuildMI(*PH, InsertPos, DL, SubD, SubR); @@ -931,7 +931,7 @@ CountValue *HexagonHardwareLoops::computeCount(MachineLoop *Loop, EndValInstr->getOperand(2).getImm() == StartV) { DistR = EndValInstr->getOperand(1).getReg(); } else { - unsigned SubR = MRI->createVirtualRegister(IntRC); + Register SubR = MRI->createVirtualRegister(IntRC); MachineInstrBuilder SubIB = BuildMI(*PH, InsertPos, DL, SubD, SubR); SubIB.addReg(End->getReg(), 0, End->getSubReg()) @@ -950,7 +950,7 @@ CountValue *HexagonHardwareLoops::computeCount(MachineLoop *Loop, AdjSR = DistSR; } else { // Generate CountR = ADD DistR, AdjVal - unsigned AddR = MRI->createVirtualRegister(IntRC); + Register AddR = MRI->createVirtualRegister(IntRC); MCInstrDesc const &AddD = TII->get(Hexagon::A2_addi); BuildMI(*PH, InsertPos, DL, AddD, AddR) .addReg(DistR, 0, DistSR) @@ -971,7 +971,7 @@ CountValue *HexagonHardwareLoops::computeCount(MachineLoop *Loop, unsigned Shift = Log2_32(IVBump); // Generate NormR = LSR DistR, Shift. - unsigned LsrR = MRI->createVirtualRegister(IntRC); + Register LsrR = MRI->createVirtualRegister(IntRC); const MCInstrDesc &LsrD = TII->get(Hexagon::S2_lsr_i_r); BuildMI(*PH, InsertPos, DL, LsrD, LsrR) .addReg(AdjR, 0, AdjSR) @@ -1038,7 +1038,7 @@ bool HexagonHardwareLoops::isDead(const MachineInstr *MI, if (!MO.isReg() || !MO.isDef()) continue; - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); if (MRI->use_nodbg_empty(Reg)) continue; @@ -1058,7 +1058,7 @@ bool HexagonHardwareLoops::isDead(const MachineInstr *MI, if (!OPO.isReg() || !OPO.isDef()) continue; - unsigned OPReg = OPO.getReg(); + Register OPReg = OPO.getReg(); use_nodbg_iterator nextJ; for (use_nodbg_iterator J = MRI->use_nodbg_begin(OPReg); J != End; J = nextJ) { @@ -1092,7 +1092,7 @@ void HexagonHardwareLoops::removeIfDead(MachineInstr *MI) { const MachineOperand &MO = MI->getOperand(i); if (!MO.isReg() || !MO.isDef()) continue; - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); MachineRegisterInfo::use_iterator nextI; for (MachineRegisterInfo::use_iterator I = MRI->use_begin(Reg), E = MRI->use_end(); I != E; I = nextI) { @@ -1244,7 +1244,7 @@ bool HexagonHardwareLoops::convertToHardwareLoop(MachineLoop *L, if (TripCount->isReg()) { // Create a copy of the loop count register. - unsigned CountReg = MRI->createVirtualRegister(&Hexagon::IntRegsRegClass); + Register CountReg = MRI->createVirtualRegister(&Hexagon::IntRegsRegClass); BuildMI(*Preheader, InsertPos, DL, TII->get(TargetOpcode::COPY), CountReg) .addReg(TripCount->getReg(), 0, TripCount->getSubReg()); // Add the Loop instruction to the beginning of the loop. @@ -1257,7 +1257,7 @@ bool HexagonHardwareLoops::convertToHardwareLoop(MachineLoop *L, // create a new virtual register. int64_t CountImm = TripCount->getImm(); if (!TII->isValidOffset(LOOP_i, CountImm, TRI)) { - unsigned CountReg = MRI->createVirtualRegister(&Hexagon::IntRegsRegClass); + Register CountReg = MRI->createVirtualRegister(&Hexagon::IntRegsRegClass); BuildMI(*Preheader, InsertPos, DL, TII->get(Hexagon::A2_tfrsi), CountReg) .addImm(CountImm); BuildMI(*Preheader, InsertPos, DL, TII->get(LOOP_r)) @@ -1333,7 +1333,7 @@ bool HexagonHardwareLoops::orderBumpCompare(MachineInstr *BumpI, return true; // Out of order. 
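computeCount in the HexagonHardwareLoops hunks above materializes the trip count as ((End - Start) + Adj) >> log2(IVBump), using A2_subri or a register subtract, then A2_addi, then S2_lsr_i_r. A plain-integer restatement of that arithmetic; the adjustment constant depends on the exit comparison, so it is taken as a parameter here, and the names are illustrative:

#include <cassert>
#include <cstdint>

static uint32_t tripCountSketch(int64_t Start, int64_t End, int64_t Adj,
                                uint32_t IVBump) {
  assert(IVBump && (IVBump & (IVBump - 1)) == 0 && "bump assumed power of two");
  unsigned Shift = 0;                      // Log2_32(IVBump)
  while ((uint32_t(1) << Shift) < IVBump)
    ++Shift;
  int64_t Dist = End - Start;              // SubR = End - Start
  int64_t AdjD = Dist + Adj;               // AddR = DistR + AdjVal
  return uint32_t(AdjD) >> Shift;          // LsrR = S2_lsr_i_r(AddR, Shift)
}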
- unsigned PredR = CmpI->getOperand(0).getReg(); + Register PredR = CmpI->getOperand(0).getReg(); bool FoundBump = false; instr_iterator CmpIt = CmpI->getIterator(), NextIt = std::next(CmpIt); for (instr_iterator I = NextIt, E = BB->instr_end(); I != E; ++I) { @@ -1428,10 +1428,10 @@ bool HexagonHardwareLoops::loopCountMayWrapOrUnderFlow( if (checkForImmediate(*InitVal, Imm)) return (EndVal->getImm() == Imm); - unsigned Reg = InitVal->getReg(); + Register Reg = InitVal->getReg(); // We don't know the value of a physical register. - if (!TargetRegisterInfo::isVirtualRegister(Reg)) + if (!Register::isVirtualRegister(Reg)) return true; MachineInstr *Def = MRI->getVRegDef(Reg); @@ -1508,8 +1508,8 @@ bool HexagonHardwareLoops::checkForImmediate(const MachineOperand &MO, // processed to handle potential subregisters in MO. int64_t TV; - unsigned R = MO.getReg(); - if (!TargetRegisterInfo::isVirtualRegister(R)) + Register R = MO.getReg(); + if (!Register::isVirtualRegister(R)) return false; MachineInstr *DI = MRI->getVRegDef(R); unsigned DOpc = DI->getOpcode(); @@ -1582,11 +1582,11 @@ void HexagonHardwareLoops::setImmediate(MachineOperand &MO, int64_t Val) { } assert(MO.isReg()); - unsigned R = MO.getReg(); + Register R = MO.getReg(); MachineInstr *DI = MRI->getVRegDef(R); const TargetRegisterClass *RC = MRI->getRegClass(R); - unsigned NewR = MRI->createVirtualRegister(RC); + Register NewR = MRI->createVirtualRegister(RC); MachineBasicBlock &B = *DI->getParent(); DebugLoc DL = DI->getDebugLoc(); BuildMI(B, DI, DL, TII->get(DI->getOpcode()), NewR).addImm(Val); @@ -1634,17 +1634,17 @@ bool HexagonHardwareLoops::fixupInductionVariable(MachineLoop *L) { if (Phi->getOperand(i+1).getMBB() != Latch) continue; - unsigned PhiReg = Phi->getOperand(i).getReg(); + Register PhiReg = Phi->getOperand(i).getReg(); MachineInstr *DI = MRI->getVRegDef(PhiReg); if (DI->getDesc().isAdd()) { // If the register operand to the add/sub is the PHI we are looking // at, this meets the induction pattern. - unsigned IndReg = DI->getOperand(1).getReg(); + Register IndReg = DI->getOperand(1).getReg(); MachineOperand &Opnd2 = DI->getOperand(2); int64_t V; if (MRI->getVRegDef(IndReg) == Phi && checkForImmediate(Opnd2, V)) { - unsigned UpdReg = DI->getOperand(0).getReg(); + Register UpdReg = DI->getOperand(0).getReg(); IndRegs.insert(std::make_pair(UpdReg, std::make_pair(IndReg, V))); } } @@ -1702,7 +1702,7 @@ bool HexagonHardwareLoops::fixupInductionVariable(MachineLoop *L) { if (!Cond[CSz-1].isReg()) return false; - unsigned P = Cond[CSz-1].getReg(); + Register P = Cond[CSz - 1].getReg(); MachineInstr *PredDef = MRI->getVRegDef(P); if (!PredDef->isCompare()) @@ -1903,15 +1903,15 @@ MachineBasicBlock *HexagonHardwareLoops::createPreheaderForLoop( MachineInstr *NewPN = MF->CreateMachineInstr(PD, DL); NewPH->insert(NewPH->end(), NewPN); - unsigned PR = PN->getOperand(0).getReg(); + Register PR = PN->getOperand(0).getReg(); const TargetRegisterClass *RC = MRI->getRegClass(PR); - unsigned NewPR = MRI->createVirtualRegister(RC); + Register NewPR = MRI->createVirtualRegister(RC); NewPN->addOperand(MachineOperand::CreateReg(NewPR, true)); // Copy all non-latch operands of a header's PHI node to the newly // created PHI node in the preheader. 
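findInductionRegister and fixupInductionVariable above both match the same shape: a header PHI whose latch incoming value is produced by an add of the PHI itself and a compile-time bump. A data-only sketch of that structural check, with invented field names and no LLVM types:

#include <optional>
#include <utility>

struct AddSketch { unsigned Dst, Src; long long Bump; }; // Dst = add Src, Bump
struct PhiSketch { unsigned Def, LatchValue; };          // Def = phi(.., LatchValue)

// Returns the update register ("UpdReg") and the bump value when the pattern
// matches: the add consumes the PHI and feeds the PHI's latch operand.
static std::optional<std::pair<unsigned, long long>>
matchInductionSketch(const PhiSketch &Phi, const AddSketch &Add) {
  if (Add.Src == Phi.Def && Add.Dst == Phi.LatchValue)
    return std::make_pair(Add.Dst, Add.Bump);
  return std::nullopt;
}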
for (unsigned i = 1, n = PN->getNumOperands(); i < n; i += 2) { - unsigned PredR = PN->getOperand(i).getReg(); + Register PredR = PN->getOperand(i).getReg(); unsigned PredRSub = PN->getOperand(i).getSubReg(); MachineBasicBlock *PredB = PN->getOperand(i+1).getMBB(); if (PredB == Latch) diff --git a/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp b/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp index 605fcfc25559..4684d8e4781a 100644 --- a/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp +++ b/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp @@ -697,7 +697,7 @@ void HexagonDAGToDAGISel::SelectIntrinsicWOChain(SDNode *N) { // void HexagonDAGToDAGISel::SelectConstantFP(SDNode *N) { SDLoc dl(N); - ConstantFPSDNode *CN = dyn_cast(N); + auto *CN = cast(N); APInt A = CN->getValueAPF().bitcastToAPInt(); if (N->getValueType(0) == MVT::f32) { SDValue V = CurDAG->getTargetConstant(A.getZExtValue(), dl, MVT::i32); diff --git a/lib/Target/Hexagon/HexagonISelLowering.cpp b/lib/Target/Hexagon/HexagonISelLowering.cpp index fef5a98cdb00..8a8986e232a0 100644 --- a/lib/Target/Hexagon/HexagonISelLowering.cpp +++ b/lib/Target/Hexagon/HexagonISelLowering.cpp @@ -240,12 +240,12 @@ bool HexagonTargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const { return true; } -unsigned HexagonTargetLowering::getRegisterByName(const char* RegName, EVT VT, - SelectionDAG &DAG) const { +Register HexagonTargetLowering::getRegisterByName(const char* RegName, EVT VT, + const MachineFunction &) const { // Just support r19, the linux kernel uses it. - unsigned Reg = StringSwitch(RegName) + Register Reg = StringSwitch(RegName) .Case("r19", Hexagon::R19) - .Default(0); + .Default(Register()); if (Reg) return Reg; @@ -286,7 +286,7 @@ SDValue HexagonTargetLowering::LowerCallResult( SDValue FR0 = DAG.getCopyFromReg(Chain, dl, RVLocs[i].getLocReg(), MVT::i32, Glue); // FR0 = (Value, Chain, Glue) - unsigned PredR = MRI.createVirtualRegister(&Hexagon::PredRegsRegClass); + Register PredR = MRI.createVirtualRegister(&Hexagon::PredRegsRegClass); SDValue TPR = DAG.getCopyToReg(FR0.getValue(1), dl, PredR, FR0.getValue(0), FR0.getValue(2)); // TPR = (Chain, Glue) @@ -736,7 +736,7 @@ SDValue HexagonTargetLowering::LowerFormalArguments( RegVT = VA.getValVT(); const TargetRegisterClass *RC = getRegClassFor(RegVT); - unsigned VReg = MRI.createVirtualRegister(RC); + Register VReg = MRI.createVirtualRegister(RC); SDValue Copy = DAG.getCopyFromReg(Chain, dl, VReg, RegVT); // Treat values of type MVT::i1 specially: they are passed in @@ -870,15 +870,20 @@ SDValue HexagonTargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const { SDValue PredOp = Op.getOperand(0); SDValue Op1 = Op.getOperand(1), Op2 = Op.getOperand(2); - EVT OpVT = Op1.getValueType(); - SDLoc DL(Op); + MVT OpTy = ty(Op1); + const SDLoc &dl(Op); - if (OpVT == MVT::v2i16) { - SDValue X1 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v2i32, Op1); - SDValue X2 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v2i32, Op2); - SDValue SL = DAG.getNode(ISD::VSELECT, DL, MVT::v2i32, PredOp, X1, X2); - SDValue TR = DAG.getNode(ISD::TRUNCATE, DL, MVT::v2i16, SL); - return TR; + if (OpTy == MVT::v2i16 || OpTy == MVT::v4i8) { + MVT ElemTy = OpTy.getVectorElementType(); + assert(ElemTy.isScalarInteger()); + MVT WideTy = MVT::getVectorVT(MVT::getIntegerVT(2*ElemTy.getSizeInBits()), + OpTy.getVectorNumElements()); + // Generate (trunc (select (_, sext, sext))). 
+ return DAG.getSExtOrTrunc( + DAG.getSelect(dl, WideTy, PredOp, + DAG.getSExtOrTrunc(Op1, dl, WideTy), + DAG.getSExtOrTrunc(Op2, dl, WideTy)), + dl, OpTy); } return SDValue(); @@ -1230,9 +1235,9 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM, Subtarget(ST) { auto &HRI = *Subtarget.getRegisterInfo(); - setPrefLoopAlignment(4); - setPrefFunctionAlignment(4); - setMinFunctionAlignment(2); + setPrefLoopAlignment(Align(16)); + setMinFunctionAlignment(Align(4)); + setPrefFunctionAlignment(Align(16)); setStackPointerRegisterToSaveRestore(HRI.getStackRegister()); setBooleanContents(TargetLoweringBase::UndefinedBooleanContent); setBooleanVectorContents(TargetLoweringBase::UndefinedBooleanContent); @@ -1434,12 +1439,12 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM, ISD::CONCAT_VECTORS, ISD::VECTOR_SHUFFLE }; - for (MVT VT : MVT::vector_valuetypes()) { + for (MVT VT : MVT::fixedlen_vector_valuetypes()) { for (unsigned VectExpOp : VectExpOps) setOperationAction(VectExpOp, VT, Expand); // Expand all extending loads and truncating stores: - for (MVT TargetVT : MVT::vector_valuetypes()) { + for (MVT TargetVT : MVT::fixedlen_vector_valuetypes()) { if (TargetVT == VT) continue; setLoadExtAction(ISD::EXTLOAD, TargetVT, VT, Expand); @@ -1496,16 +1501,21 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM, setOperationAction(ISD::STORE, VT, Custom); } - for (MVT VT : {MVT::v2i16, MVT::v4i8, MVT::v2i32, MVT::v4i16, MVT::v2i32}) { - setCondCodeAction(ISD::SETLT, VT, Expand); + for (MVT VT : {MVT::v2i16, MVT::v4i8, MVT::v8i8, MVT::v2i32, MVT::v4i16, + MVT::v2i32}) { + setCondCodeAction(ISD::SETNE, VT, Expand); setCondCodeAction(ISD::SETLE, VT, Expand); - setCondCodeAction(ISD::SETULT, VT, Expand); + setCondCodeAction(ISD::SETGE, VT, Expand); + setCondCodeAction(ISD::SETLT, VT, Expand); setCondCodeAction(ISD::SETULE, VT, Expand); + setCondCodeAction(ISD::SETUGE, VT, Expand); + setCondCodeAction(ISD::SETULT, VT, Expand); } // Custom-lower bitcasts from i8 to v8i1. setOperationAction(ISD::BITCAST, MVT::i8, Custom); setOperationAction(ISD::SETCC, MVT::v2i16, Custom); + setOperationAction(ISD::VSELECT, MVT::v4i8, Custom); setOperationAction(ISD::VSELECT, MVT::v2i16, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i8, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i16, Custom); @@ -1554,6 +1564,8 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM, setOperationAction(ISD::FSUB, MVT::f64, Legal); } + setTargetDAGCombine(ISD::VSELECT); + if (Subtarget.useHVXOps()) initializeHVXLowering(); @@ -1643,6 +1655,8 @@ const char* HexagonTargetLowering::getTargetNodeName(unsigned Opcode) const { case HexagonISD::VINSERTW0: return "HexagonISD::VINSERTW0"; case HexagonISD::VROR: return "HexagonISD::VROR"; case HexagonISD::READCYCLE: return "HexagonISD::READCYCLE"; + case HexagonISD::PTRUE: return "HexagonISD::PTRUE"; + case HexagonISD::PFALSE: return "HexagonISD::PFALSE"; case HexagonISD::VZERO: return "HexagonISD::VZERO"; case HexagonISD::VSPLATW: return "HexagonISD::VSPLATW"; case HexagonISD::D2P: return "HexagonISD::D2P"; @@ -1783,7 +1797,8 @@ bool HexagonTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, // The offset value comes through Modifier register. For now, assume the // offset is 0. 
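The LowerVSELECT hunk above generalizes the old v2i16-only path: both operands are sign-extended to a type with double-width elements, the select is done there, and the result is truncated back. A scalar model of why that round trip is sound for any lane value; this is an illustration of the transformation, not DAG code.

#include <cassert>
#include <cstdint>

static int16_t selectNarrow(bool P, int16_t A, int16_t B) { return P ? A : B; }

static int16_t selectViaWiden(bool P, int16_t A, int16_t B) {
  int32_t WA = A, WB = B;              // sext each lane to the wider type
  int32_t W = P ? WA : WB;             // vselect on the widened type
  return static_cast<int16_t>(W);      // trunc back to the original lane type
}

int main() {
  for (int A = -32768; A <= 32767; A += 257)
    for (int B = -32768; B <= 32767; B += 263)
      for (bool P : {false, true})
        assert(selectNarrow(P, int16_t(A), int16_t(B)) ==
               selectViaWiden(P, int16_t(A), int16_t(B)));
  return 0;
}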
Info.offset = 0; - Info.align = DL.getABITypeAlignment(Info.memVT.getTypeForEVT(Cont)); + Info.align = + MaybeAlign(DL.getABITypeAlignment(Info.memVT.getTypeForEVT(Cont))); Info.flags = MachineMemOperand::MOLoad; return true; } @@ -1805,7 +1820,8 @@ bool HexagonTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.memVT = MVT::getVT(VecTy); Info.ptrVal = I.getArgOperand(0); Info.offset = 0; - Info.align = M.getDataLayout().getTypeAllocSizeInBits(VecTy) / 8; + Info.align = + MaybeAlign(M.getDataLayout().getTypeAllocSizeInBits(VecTy) / 8); Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore | MachineMemOperand::MOVolatile; @@ -1817,6 +1833,10 @@ bool HexagonTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, return false; } +bool HexagonTargetLowering::hasBitTest(SDValue X, SDValue Y) const { + return X.getValueType().isScalarInteger(); // 'tstbit' +} + bool HexagonTargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const { return isTruncateFree(EVT::getEVT(Ty1), EVT::getEVT(Ty2)); } @@ -1844,26 +1864,33 @@ bool HexagonTargetLowering::isShuffleMaskLegal(ArrayRef Mask, TargetLoweringBase::LegalizeTypeAction HexagonTargetLowering::getPreferredVectorAction(MVT VT) const { - if (VT.getVectorNumElements() == 1) - return TargetLoweringBase::TypeScalarizeVector; - - // Always widen vectors of i1. + unsigned VecLen = VT.getVectorNumElements(); MVT ElemTy = VT.getVectorElementType(); - if (ElemTy == MVT::i1) - return TargetLoweringBase::TypeWidenVector; + + if (VecLen == 1 || VT.isScalableVector()) + return TargetLoweringBase::TypeScalarizeVector; if (Subtarget.useHVXOps()) { + unsigned HwLen = Subtarget.getVectorLength(); // If the size of VT is at least half of the vector length, // widen the vector. Note: the threshold was not selected in // any scientific way. ArrayRef Tys = Subtarget.getHVXElementTypes(); if (llvm::find(Tys, ElemTy) != Tys.end()) { - unsigned HwWidth = 8*Subtarget.getVectorLength(); + unsigned HwWidth = 8*HwLen; unsigned VecWidth = VT.getSizeInBits(); if (VecWidth >= HwWidth/2 && VecWidth < HwWidth) return TargetLoweringBase::TypeWidenVector; } + // Split vectors of i1 that correspond to (byte) vector pairs. + if (ElemTy == MVT::i1 && VecLen == 2*HwLen) + return TargetLoweringBase::TypeSplitVector; } + + // Always widen (remaining) vectors of i1. + if (ElemTy == MVT::i1) + return TargetLoweringBase::TypeWidenVector; + return TargetLoweringBase::TypeSplitVector; } @@ -2452,6 +2479,23 @@ HexagonTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { return buildVector64(Ops, dl, VecTy, DAG); if (VecTy == MVT::v8i1 || VecTy == MVT::v4i1 || VecTy == MVT::v2i1) { + // Check if this is a special case or all-0 or all-1. + bool All0 = true, All1 = true; + for (SDValue P : Ops) { + auto *CN = dyn_cast(P.getNode()); + if (CN == nullptr) { + All0 = All1 = false; + break; + } + uint32_t C = CN->getZExtValue(); + All0 &= (C == 0); + All1 &= (C == 1); + } + if (All0) + return DAG.getNode(HexagonISD::PFALSE, dl, VecTy); + if (All1) + return DAG.getNode(HexagonISD::PTRUE, dl, VecTy); + // For each i1 element in the resulting predicate register, put 1 // shifted by the index of the element into a general-purpose register, // then or them together and transfer it back into a predicate register. 
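The LowerBUILD_VECTOR hunk above short-circuits constant predicate vectors: if every element is the constant 0 the node becomes PFALSE, if every element is the constant 1 it becomes PTRUE, and any non-constant element falls through to the general shift-and-or path. The same scan on plain integers, assumed equivalent and written without SDNode types:

#include <cstdint>
#include <optional>
#include <vector>

enum class PredSplat { PFalse, PTrue };

// Each element is either a known constant or std::nullopt for "not a constant".
static std::optional<PredSplat>
classifyPredicateBuildVector(const std::vector<std::optional<uint32_t>> &Ops) {
  bool All0 = true, All1 = true;
  for (const auto &Op : Ops) {
    if (!Op) {                        // non-constant element: give up
      All0 = All1 = false;
      break;
    }
    All0 &= (*Op == 0);
    All1 &= (*Op == 1);
  }
  if (All0) return PredSplat::PFalse;
  if (All1) return PredSplat::PTrue;
  return std::nullopt;                // general lowering path
}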
@@ -2629,7 +2673,8 @@ HexagonTargetLowering::LowerUnalignedLoad(SDValue Op, SelectionDAG &DAG) DoDefault = true; if (!AlignLoads) { - if (allowsMemoryAccess(Ctx, DL, LN->getMemoryVT(), *LN->getMemOperand())) + if (allowsMemoryAccessForAlignment(Ctx, DL, LN->getMemoryVT(), + *LN->getMemOperand())) return Op; DoDefault = true; } @@ -2637,7 +2682,8 @@ HexagonTargetLowering::LowerUnalignedLoad(SDValue Op, SelectionDAG &DAG) // The PartTy is the equivalent of "getLoadableTypeOfSize(HaveAlign)". MVT PartTy = HaveAlign <= 8 ? MVT::getIntegerVT(8 * HaveAlign) : MVT::getVectorVT(MVT::i8, HaveAlign); - DoDefault = allowsMemoryAccess(Ctx, DL, PartTy, *LN->getMemOperand()); + DoDefault = + allowsMemoryAccessForAlignment(Ctx, DL, PartTy, *LN->getMemOperand()); } if (DoDefault) { std::pair P = expandUnalignedLoad(LN, DAG); @@ -2865,12 +2911,54 @@ HexagonTargetLowering::ReplaceNodeResults(SDNode *N, if (N->getValueType(0) == MVT::i8) { SDValue P = getInstr(Hexagon::C2_tfrpr, dl, MVT::i32, N->getOperand(0), DAG); - Results.push_back(P); + SDValue T = DAG.getAnyExtOrTrunc(P, dl, MVT::i8); + Results.push_back(T); } break; } } +SDValue +HexagonTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) + const { + SDValue Op(N, 0); + if (isHvxOperation(Op)) { + if (SDValue V = PerformHvxDAGCombine(N, DCI)) + return V; + return SDValue(); + } + + const SDLoc &dl(Op); + unsigned Opc = Op.getOpcode(); + + if (Opc == HexagonISD::P2D) { + SDValue P = Op.getOperand(0); + switch (P.getOpcode()) { + case HexagonISD::PTRUE: + return DCI.DAG.getConstant(-1, dl, ty(Op)); + case HexagonISD::PFALSE: + return getZero(dl, ty(Op), DCI.DAG); + default: + break; + } + } else if (Opc == ISD::VSELECT) { + // This is pretty much duplicated in HexagonISelLoweringHVX... + // + // (vselect (xor x, ptrue), v0, v1) -> (vselect x, v1, v0) + SDValue Cond = Op.getOperand(0); + if (Cond->getOpcode() == ISD::XOR) { + SDValue C0 = Cond.getOperand(0), C1 = Cond.getOperand(1); + if (C1->getOpcode() == HexagonISD::PTRUE) { + SDValue VSel = DCI.DAG.getNode(ISD::VSELECT, dl, ty(Op), C0, + Op.getOperand(2), Op.getOperand(1)); + return VSel; + } + } + } + + return SDValue(); +} + /// Returns relocation base for the given PIC jumptable. SDValue HexagonTargetLowering::getPICJumpTableRelocBase(SDValue Table, diff --git a/lib/Target/Hexagon/HexagonISelLowering.h b/lib/Target/Hexagon/HexagonISelLowering.h index 4e467cb22727..75f553bfec7f 100644 --- a/lib/Target/Hexagon/HexagonISelLowering.h +++ b/lib/Target/Hexagon/HexagonISelLowering.h @@ -68,6 +68,8 @@ namespace HexagonISD { EH_RETURN, DCFETCH, READCYCLE, + PTRUE, + PFALSE, D2P, // Convert 8-byte value to 8-bit predicate register. [*] P2D, // Convert 8-bit predicate register to 8-byte value. [*] V2Q, // Convert HVX vector to a vector predicate reg. 
[*] @@ -127,6 +129,8 @@ namespace HexagonISD { bool isCheapToSpeculateCtlz() const override { return true; } bool isCtlzFast() const override { return true; } + bool hasBitTest(SDValue X, SDValue Y) const override; + bool allowTruncateForTailCall(Type *Ty1, Type *Ty2) const override; /// Return true if an FMA operation is faster than a pair of mul and add @@ -221,10 +225,12 @@ namespace HexagonISD { const SmallVectorImpl &OutVals, const SDLoc &dl, SelectionDAG &DAG) const override; + SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override; + bool mayBeEmittedAsTailCall(const CallInst *CI) const override; - unsigned getRegisterByName(const char* RegName, EVT VT, - SelectionDAG &DAG) const override; + Register getRegisterByName(const char* RegName, EVT VT, + const MachineFunction &MF) const override; /// If a physical register, this returns the register that receives the /// exception address on entry to an EH pad. @@ -299,7 +305,8 @@ namespace HexagonISD { const AttributeList &FuncAttributes) const override; bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AddrSpace, - unsigned Align, MachineMemOperand::Flags Flags, bool *Fast) const override; + unsigned Align, MachineMemOperand::Flags Flags, bool *Fast) + const override; /// Returns relocation base for the given PIC jumptable. SDValue getPICJumpTableRelocBase(SDValue Table, SelectionDAG &DAG) @@ -456,6 +463,8 @@ namespace HexagonISD { bool isHvxOperation(SDValue Op) const; SDValue LowerHvxOperation(SDValue Op, SelectionDAG &DAG) const; + + SDValue PerformHvxDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const; }; } // end namespace llvm diff --git a/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp b/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp index 345c657787a0..bc8a9959c917 100644 --- a/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp +++ b/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp @@ -193,6 +193,8 @@ HexagonTargetLowering::initializeHVXLowering() { setOperationAction(ISD::OR, BoolV, Legal); setOperationAction(ISD::XOR, BoolV, Legal); } + + setTargetDAGCombine(ISD::VSELECT); } SDValue @@ -1580,6 +1582,28 @@ HexagonTargetLowering::LowerHvxOperation(SDValue Op, SelectionDAG &DAG) const { llvm_unreachable("Unhandled HVX operation"); } +SDValue +HexagonTargetLowering::PerformHvxDAGCombine(SDNode *N, DAGCombinerInfo &DCI) + const { + const SDLoc &dl(N); + SDValue Op(N, 0); + + unsigned Opc = Op.getOpcode(); + if (Opc == ISD::VSELECT) { + // (vselect (xor x, qtrue), v0, v1) -> (vselect x, v1, v0) + SDValue Cond = Op.getOperand(0); + if (Cond->getOpcode() == ISD::XOR) { + SDValue C0 = Cond.getOperand(0), C1 = Cond.getOperand(1); + if (C1->getOpcode() == HexagonISD::QTRUE) { + SDValue VSel = DCI.DAG.getNode(ISD::VSELECT, dl, ty(Op), C0, + Op.getOperand(2), Op.getOperand(1)); + return VSel; + } + } + } + return SDValue(); +} + bool HexagonTargetLowering::isHvxOperation(SDValue Op) const { // If the type of the result, or any operand type are HVX vector types, diff --git a/lib/Target/Hexagon/HexagonInstrInfo.cpp b/lib/Target/Hexagon/HexagonInstrInfo.cpp index a156de5ba128..767538f92ed6 100644 --- a/lib/Target/Hexagon/HexagonInstrInfo.cpp +++ b/lib/Target/Hexagon/HexagonInstrInfo.cpp @@ -193,7 +193,7 @@ static inline void parseOperands(const MachineInstr &MI, if (!MO.isReg()) continue; - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); if (!Reg) continue; @@ -674,86 +674,96 @@ unsigned HexagonInstrInfo::insertBranch(MachineBasicBlock &MBB, return 2; } -/// Analyze the loop code to find the loop induction variable and 
compare used -/// to compute the number of iterations. Currently, we analyze loop that are -/// controlled using hardware loops. In this case, the induction variable -/// instruction is null. For all other cases, this function returns true, which -/// means we're unable to analyze it. -bool HexagonInstrInfo::analyzeLoop(MachineLoop &L, - MachineInstr *&IndVarInst, - MachineInstr *&CmpInst) const { +namespace { +class HexagonPipelinerLoopInfo : public TargetInstrInfo::PipelinerLoopInfo { + MachineInstr *Loop, *EndLoop; + MachineFunction *MF; + const HexagonInstrInfo *TII; + int64_t TripCount; + Register LoopCount; + DebugLoc DL; - MachineBasicBlock *LoopEnd = L.getBottomBlock(); - MachineBasicBlock::iterator I = LoopEnd->getFirstTerminator(); - // We really "analyze" only hardware loops right now. - if (I != LoopEnd->end() && isEndLoopN(I->getOpcode())) { - IndVarInst = nullptr; - CmpInst = &*I; - return false; +public: + HexagonPipelinerLoopInfo(MachineInstr *Loop, MachineInstr *EndLoop) + : Loop(Loop), EndLoop(EndLoop), MF(Loop->getParent()->getParent()), + TII(MF->getSubtarget().getInstrInfo()), + DL(Loop->getDebugLoc()) { + // Inspect the Loop instruction up-front, as it may be deleted when we call + // createTripCountGreaterCondition. + TripCount = Loop->getOpcode() == Hexagon::J2_loop0r + ? -1 + : Loop->getOperand(1).getImm(); + if (TripCount == -1) + LoopCount = Loop->getOperand(1).getReg(); } - return true; -} -/// Generate code to reduce the loop iteration by one and check if the loop is -/// finished. Return the value/register of the new loop count. this function -/// assumes the nth iteration is peeled first. -unsigned HexagonInstrInfo::reduceLoopCount( - MachineBasicBlock &MBB, MachineBasicBlock &PreHeader, MachineInstr *IndVar, - MachineInstr &Cmp, SmallVectorImpl &Cond, - SmallVectorImpl &PrevInsts, unsigned Iter, - unsigned MaxIter) const { - // We expect a hardware loop currently. This means that IndVar is set - // to null, and the compare is the ENDLOOP instruction. - assert((!IndVar) && isEndLoopN(Cmp.getOpcode()) - && "Expecting a hardware loop"); - MachineFunction *MF = MBB.getParent(); - DebugLoc DL = Cmp.getDebugLoc(); - SmallPtrSet VisitedBBs; - MachineInstr *Loop = findLoopInstr(&MBB, Cmp.getOpcode(), - Cmp.getOperand(0).getMBB(), VisitedBBs); - if (!Loop) - return 0; - // If the loop trip count is a compile-time value, then just change the - // value. - if (Loop->getOpcode() == Hexagon::J2_loop0i || - Loop->getOpcode() == Hexagon::J2_loop1i) { - int64_t Offset = Loop->getOperand(1).getImm(); - if (Offset <= 1) - Loop->eraseFromParent(); - else - Loop->getOperand(1).setImm(Offset - 1); - return Offset - 1; + bool shouldIgnoreForPipelining(const MachineInstr *MI) const override { + // Only ignore the terminator. + return MI == EndLoop; } - // The loop trip count is a run-time value. We generate code to subtract - // one from the trip count, and update the loop instruction. - assert(Loop->getOpcode() == Hexagon::J2_loop0r && "Unexpected instruction"); - unsigned LoopCount = Loop->getOperand(1).getReg(); - // Check if we're done with the loop. - unsigned LoopEnd = createVR(MF, MVT::i1); - MachineInstr *NewCmp = BuildMI(&MBB, DL, get(Hexagon::C2_cmpgtui), LoopEnd). - addReg(LoopCount).addImm(1); - unsigned NewLoopCount = createVR(MF, MVT::i32); - MachineInstr *NewAdd = BuildMI(&MBB, DL, get(Hexagon::A2_addi), NewLoopCount). 
- addReg(LoopCount).addImm(-1); - const HexagonRegisterInfo &HRI = *Subtarget.getRegisterInfo(); - // Update the previously generated instructions with the new loop counter. - for (SmallVectorImpl::iterator I = PrevInsts.begin(), - E = PrevInsts.end(); I != E; ++I) - (*I)->substituteRegister(LoopCount, NewLoopCount, 0, HRI); - PrevInsts.clear(); - PrevInsts.push_back(NewCmp); - PrevInsts.push_back(NewAdd); - // Insert the new loop instruction if this is the last time the loop is - // decremented. - if (Iter == MaxIter) - BuildMI(&MBB, DL, get(Hexagon::J2_loop0r)). - addMBB(Loop->getOperand(0).getMBB()).addReg(NewLoopCount); - // Delete the old loop instruction. - if (Iter == 0) - Loop->eraseFromParent(); - Cond.push_back(MachineOperand::CreateImm(Hexagon::J2_jumpf)); - Cond.push_back(NewCmp->getOperand(0)); - return NewLoopCount; + + Optional + createTripCountGreaterCondition(int TC, MachineBasicBlock &MBB, + SmallVectorImpl &Cond) override { + if (TripCount == -1) { + // Check if we're done with the loop. + unsigned Done = TII->createVR(MF, MVT::i1); + MachineInstr *NewCmp = BuildMI(&MBB, DL, + TII->get(Hexagon::C2_cmpgtui), Done) + .addReg(LoopCount) + .addImm(TC); + Cond.push_back(MachineOperand::CreateImm(Hexagon::J2_jumpf)); + Cond.push_back(NewCmp->getOperand(0)); + return {}; + } + + return TripCount > TC; + } + + void setPreheader(MachineBasicBlock *NewPreheader) override { + NewPreheader->splice(NewPreheader->getFirstTerminator(), Loop->getParent(), + Loop); + } + + void adjustTripCount(int TripCountAdjust) override { + // If the loop trip count is a compile-time value, then just change the + // value. + if (Loop->getOpcode() == Hexagon::J2_loop0i || + Loop->getOpcode() == Hexagon::J2_loop1i) { + int64_t TripCount = Loop->getOperand(1).getImm() + TripCountAdjust; + assert(TripCount > 0 && "Can't create an empty or negative loop!"); + Loop->getOperand(1).setImm(TripCount); + return; + } + + // The loop trip count is a run-time value. We generate code to subtract + // one from the trip count, and update the loop instruction. + Register LoopCount = Loop->getOperand(1).getReg(); + Register NewLoopCount = TII->createVR(MF, MVT::i32); + BuildMI(*Loop->getParent(), Loop, Loop->getDebugLoc(), + TII->get(Hexagon::A2_addi), NewLoopCount) + .addReg(LoopCount) + .addImm(TripCountAdjust); + Loop->getOperand(1).setReg(NewLoopCount); + } + + void disposed() override { Loop->eraseFromParent(); } +}; +} // namespace + +std::unique_ptr +HexagonInstrInfo::analyzeLoopForPipelining(MachineBasicBlock *LoopBB) const { + // We really "analyze" only hardware loops right now. 
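createTripCountGreaterCondition above answers "will the loop run more than TC further times?" directly when the trip count is a compile-time immediate, and otherwise emits a C2_cmpgtui on the run-time count register and reports "unknown" by returning an empty Optional, so the caller branches on that compare instead. A minimal model of that contract, illustration only:

#include <optional>

// TripCount == -1 stands for "run-time value held in a register", mirroring
// how the pipeliner loop info above caches it.
static std::optional<bool> tripCountGreaterSketch(long long TripCount, int TC) {
  if (TripCount == -1)
    return std::nullopt;  // a compare was emitted; the caller must branch on it
  return TripCount > TC;
}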
+ MachineBasicBlock::iterator I = LoopBB->getFirstTerminator(); + + if (I != LoopBB->end() && isEndLoopN(I->getOpcode())) { + SmallPtrSet VisitedBBs; + MachineInstr *LoopInst = findLoopInstr( + LoopBB, I->getOpcode(), I->getOperand(0).getMBB(), VisitedBBs); + if (LoopInst) + return std::make_unique(LoopInst, &*I); + } + return nullptr; } bool HexagonInstrInfo::isProfitableToIfCvt(MachineBasicBlock &MBB, @@ -839,8 +849,8 @@ void HexagonInstrInfo::copyPhysReg(MachineBasicBlock &MBB, return; } if (Hexagon::HvxWRRegClass.contains(SrcReg, DestReg)) { - unsigned LoSrc = HRI.getSubReg(SrcReg, Hexagon::vsub_lo); - unsigned HiSrc = HRI.getSubReg(SrcReg, Hexagon::vsub_hi); + Register LoSrc = HRI.getSubReg(SrcReg, Hexagon::vsub_lo); + Register HiSrc = HRI.getSubReg(SrcReg, Hexagon::vsub_hi); BuildMI(MBB, I, DL, get(Hexagon::V6_vcombine), DestReg) .addReg(HiSrc, KillFlag) .addReg(LoSrc, KillFlag); @@ -1017,7 +1027,7 @@ bool HexagonInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { unsigned Opc = MI.getOpcode(); auto RealCirc = [&](unsigned Opc, bool HasImm, unsigned MxOp) { - unsigned Mx = MI.getOperand(MxOp).getReg(); + Register Mx = MI.getOperand(MxOp).getReg(); unsigned CSx = (Mx == Hexagon::M0 ? Hexagon::CS0 : Hexagon::CS1); BuildMI(MBB, MI, DL, get(Hexagon::A2_tfrrcr), CSx) .add(MI.getOperand((HasImm ? 5 : 4))); @@ -1049,8 +1059,8 @@ bool HexagonInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { MBB.erase(MI); return true; case Hexagon::V6_vassignp: { - unsigned SrcReg = MI.getOperand(1).getReg(); - unsigned DstReg = MI.getOperand(0).getReg(); + Register SrcReg = MI.getOperand(1).getReg(); + Register DstReg = MI.getOperand(0).getReg(); unsigned Kill = getKillRegState(MI.getOperand(1).isKill()); BuildMI(MBB, MI, DL, get(Hexagon::V6_vcombine), DstReg) .addReg(HRI.getSubReg(SrcReg, Hexagon::vsub_hi), Kill) @@ -1059,18 +1069,18 @@ bool HexagonInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { return true; } case Hexagon::V6_lo: { - unsigned SrcReg = MI.getOperand(1).getReg(); - unsigned DstReg = MI.getOperand(0).getReg(); - unsigned SrcSubLo = HRI.getSubReg(SrcReg, Hexagon::vsub_lo); + Register SrcReg = MI.getOperand(1).getReg(); + Register DstReg = MI.getOperand(0).getReg(); + Register SrcSubLo = HRI.getSubReg(SrcReg, Hexagon::vsub_lo); copyPhysReg(MBB, MI, DL, DstReg, SrcSubLo, MI.getOperand(1).isKill()); MBB.erase(MI); MRI.clearKillFlags(SrcSubLo); return true; } case Hexagon::V6_hi: { - unsigned SrcReg = MI.getOperand(1).getReg(); - unsigned DstReg = MI.getOperand(0).getReg(); - unsigned SrcSubHi = HRI.getSubReg(SrcReg, Hexagon::vsub_hi); + Register SrcReg = MI.getOperand(1).getReg(); + Register DstReg = MI.getOperand(0).getReg(); + Register SrcSubHi = HRI.getSubReg(SrcReg, Hexagon::vsub_hi); copyPhysReg(MBB, MI, DL, DstReg, SrcSubHi, MI.getOperand(1).isKill()); MBB.erase(MI); MRI.clearKillFlags(SrcSubHi); @@ -1079,9 +1089,9 @@ bool HexagonInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { case Hexagon::PS_vstorerw_ai: case Hexagon::PS_vstorerwu_ai: { bool Aligned = Opc == Hexagon::PS_vstorerw_ai; - unsigned SrcReg = MI.getOperand(2).getReg(); - unsigned SrcSubHi = HRI.getSubReg(SrcReg, Hexagon::vsub_hi); - unsigned SrcSubLo = HRI.getSubReg(SrcReg, Hexagon::vsub_lo); + Register SrcReg = MI.getOperand(2).getReg(); + Register SrcSubHi = HRI.getSubReg(SrcReg, Hexagon::vsub_hi); + Register SrcSubLo = HRI.getSubReg(SrcReg, Hexagon::vsub_lo); unsigned NewOpc = Aligned ? 
Hexagon::V6_vS32b_ai : Hexagon::V6_vS32Ub_ai; unsigned Offset = HRI.getSpillSize(Hexagon::HvxVRRegClass); @@ -1103,7 +1113,7 @@ bool HexagonInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { case Hexagon::PS_vloadrw_ai: case Hexagon::PS_vloadrwu_ai: { bool Aligned = Opc == Hexagon::PS_vloadrw_ai; - unsigned DstReg = MI.getOperand(0).getReg(); + Register DstReg = MI.getOperand(0).getReg(); unsigned NewOpc = Aligned ? Hexagon::V6_vL32b_ai : Hexagon::V6_vL32Ub_ai; unsigned Offset = HRI.getSpillSize(Hexagon::HvxVRRegClass); @@ -1122,7 +1132,7 @@ bool HexagonInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { return true; } case Hexagon::PS_true: { - unsigned Reg = MI.getOperand(0).getReg(); + Register Reg = MI.getOperand(0).getReg(); BuildMI(MBB, MI, DL, get(Hexagon::C2_orn), Reg) .addReg(Reg, RegState::Undef) .addReg(Reg, RegState::Undef); @@ -1130,7 +1140,7 @@ bool HexagonInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { return true; } case Hexagon::PS_false: { - unsigned Reg = MI.getOperand(0).getReg(); + Register Reg = MI.getOperand(0).getReg(); BuildMI(MBB, MI, DL, get(Hexagon::C2_andn), Reg) .addReg(Reg, RegState::Undef) .addReg(Reg, RegState::Undef); @@ -1152,7 +1162,7 @@ bool HexagonInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { return true; } case Hexagon::PS_vdd0: { - unsigned Vd = MI.getOperand(0).getReg(); + Register Vd = MI.getOperand(0).getReg(); BuildMI(MBB, MI, DL, get(Hexagon::V6_vsubw_dv), Vd) .addReg(Vd, RegState::Undef) .addReg(Vd, RegState::Undef); @@ -1161,13 +1171,13 @@ bool HexagonInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { } case Hexagon::PS_vmulw: { // Expand a 64-bit vector multiply into 2 32-bit scalar multiplies. - unsigned DstReg = MI.getOperand(0).getReg(); - unsigned Src1Reg = MI.getOperand(1).getReg(); - unsigned Src2Reg = MI.getOperand(2).getReg(); - unsigned Src1SubHi = HRI.getSubReg(Src1Reg, Hexagon::isub_hi); - unsigned Src1SubLo = HRI.getSubReg(Src1Reg, Hexagon::isub_lo); - unsigned Src2SubHi = HRI.getSubReg(Src2Reg, Hexagon::isub_hi); - unsigned Src2SubLo = HRI.getSubReg(Src2Reg, Hexagon::isub_lo); + Register DstReg = MI.getOperand(0).getReg(); + Register Src1Reg = MI.getOperand(1).getReg(); + Register Src2Reg = MI.getOperand(2).getReg(); + Register Src1SubHi = HRI.getSubReg(Src1Reg, Hexagon::isub_hi); + Register Src1SubLo = HRI.getSubReg(Src1Reg, Hexagon::isub_lo); + Register Src2SubHi = HRI.getSubReg(Src2Reg, Hexagon::isub_hi); + Register Src2SubLo = HRI.getSubReg(Src2Reg, Hexagon::isub_lo); BuildMI(MBB, MI, MI.getDebugLoc(), get(Hexagon::M2_mpyi), HRI.getSubReg(DstReg, Hexagon::isub_hi)) .addReg(Src1SubHi) @@ -1185,16 +1195,16 @@ bool HexagonInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { } case Hexagon::PS_vmulw_acc: { // Expand 64-bit vector multiply with addition into 2 scalar multiplies. 
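
The PS_vmulw expansion just above implements a two-lane 32-bit vector multiply with two independent scalar multiplies on the low and high subregisters. A standalone sketch of the arithmetic; lane values and function names are illustrative, not taken from the patch.

#include <cstdint>
#include <cstdio>

// Multiply two "v2i32" values packed into 64-bit registers, lane by lane,
// the way the expansion splits the work across isub_lo and isub_hi.
uint64_t mulV2I32(uint64_t Rss, uint64_t Rtt) {
  uint32_t LoProd = uint32_t(Rss) * uint32_t(Rtt);              // low lane
  uint32_t HiProd = uint32_t(Rss >> 32) * uint32_t(Rtt >> 32);  // high lane
  return (uint64_t(HiProd) << 32) | LoProd;
}

int main() {
  // Lanes (3, 5) * (7, 11) -> (21, 55).
  uint64_t R = mulV2I32((5ULL << 32) | 3, (11ULL << 32) | 7);
  std::printf("lo=%u hi=%u\n", uint32_t(R), uint32_t(R >> 32));
}
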
- unsigned DstReg = MI.getOperand(0).getReg(); - unsigned Src1Reg = MI.getOperand(1).getReg(); - unsigned Src2Reg = MI.getOperand(2).getReg(); - unsigned Src3Reg = MI.getOperand(3).getReg(); - unsigned Src1SubHi = HRI.getSubReg(Src1Reg, Hexagon::isub_hi); - unsigned Src1SubLo = HRI.getSubReg(Src1Reg, Hexagon::isub_lo); - unsigned Src2SubHi = HRI.getSubReg(Src2Reg, Hexagon::isub_hi); - unsigned Src2SubLo = HRI.getSubReg(Src2Reg, Hexagon::isub_lo); - unsigned Src3SubHi = HRI.getSubReg(Src3Reg, Hexagon::isub_hi); - unsigned Src3SubLo = HRI.getSubReg(Src3Reg, Hexagon::isub_lo); + Register DstReg = MI.getOperand(0).getReg(); + Register Src1Reg = MI.getOperand(1).getReg(); + Register Src2Reg = MI.getOperand(2).getReg(); + Register Src3Reg = MI.getOperand(3).getReg(); + Register Src1SubHi = HRI.getSubReg(Src1Reg, Hexagon::isub_hi); + Register Src1SubLo = HRI.getSubReg(Src1Reg, Hexagon::isub_lo); + Register Src2SubHi = HRI.getSubReg(Src2Reg, Hexagon::isub_hi); + Register Src2SubLo = HRI.getSubReg(Src2Reg, Hexagon::isub_lo); + Register Src3SubHi = HRI.getSubReg(Src3Reg, Hexagon::isub_hi); + Register Src3SubLo = HRI.getSubReg(Src3Reg, Hexagon::isub_lo); BuildMI(MBB, MI, MI.getDebugLoc(), get(Hexagon::M2_maci), HRI.getSubReg(DstReg, Hexagon::isub_hi)) .addReg(Src1SubHi) @@ -1219,10 +1229,10 @@ bool HexagonInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { const MachineOperand &Op1 = MI.getOperand(1); const MachineOperand &Op2 = MI.getOperand(2); const MachineOperand &Op3 = MI.getOperand(3); - unsigned Rd = Op0.getReg(); - unsigned Pu = Op1.getReg(); - unsigned Rs = Op2.getReg(); - unsigned Rt = Op3.getReg(); + Register Rd = Op0.getReg(); + Register Pu = Op1.getReg(); + Register Rs = Op2.getReg(); + Register Rt = Op3.getReg(); DebugLoc DL = MI.getDebugLoc(); unsigned K1 = getKillRegState(Op1.isKill()); unsigned K2 = getKillRegState(Op2.isKill()); @@ -1246,7 +1256,7 @@ bool HexagonInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { LivePhysRegs LiveAtMI(HRI); getLiveRegsAt(LiveAtMI, MI); bool IsDestLive = !LiveAtMI.available(MRI, Op0.getReg()); - unsigned PReg = Op1.getReg(); + Register PReg = Op1.getReg(); assert(Op1.getSubReg() == 0); unsigned PState = getRegState(Op1); @@ -1280,15 +1290,15 @@ bool HexagonInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { LivePhysRegs LiveAtMI(HRI); getLiveRegsAt(LiveAtMI, MI); bool IsDestLive = !LiveAtMI.available(MRI, Op0.getReg()); - unsigned PReg = Op1.getReg(); + Register PReg = Op1.getReg(); assert(Op1.getSubReg() == 0); unsigned PState = getRegState(Op1); if (Op0.getReg() != Op2.getReg()) { unsigned S = Op0.getReg() != Op3.getReg() ? 
PState & ~RegState::Kill : PState; - unsigned SrcLo = HRI.getSubReg(Op2.getReg(), Hexagon::vsub_lo); - unsigned SrcHi = HRI.getSubReg(Op2.getReg(), Hexagon::vsub_hi); + Register SrcLo = HRI.getSubReg(Op2.getReg(), Hexagon::vsub_lo); + Register SrcHi = HRI.getSubReg(Op2.getReg(), Hexagon::vsub_hi); auto T = BuildMI(MBB, MI, DL, get(Hexagon::V6_vccombine)) .add(Op0) .addReg(PReg, S) @@ -1299,8 +1309,8 @@ bool HexagonInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { IsDestLive = true; } if (Op0.getReg() != Op3.getReg()) { - unsigned SrcLo = HRI.getSubReg(Op3.getReg(), Hexagon::vsub_lo); - unsigned SrcHi = HRI.getSubReg(Op3.getReg(), Hexagon::vsub_hi); + Register SrcLo = HRI.getSubReg(Op3.getReg(), Hexagon::vsub_lo); + Register SrcHi = HRI.getSubReg(Op3.getReg(), Hexagon::vsub_hi); auto T = BuildMI(MBB, MI, DL, get(Hexagon::V6_vnccombine)) .add(Op0) .addReg(PReg, PState) @@ -1856,8 +1866,7 @@ DFAPacketizer *HexagonInstrInfo::CreateTargetScheduleState( // S2_storeri_io %r29, 132, killed %r1; flags: mem:ST4[FixedStack1] // Currently AA considers the addresses in these instructions to be aliasing. bool HexagonInstrInfo::areMemAccessesTriviallyDisjoint( - const MachineInstr &MIa, const MachineInstr &MIb, - AliasAnalysis *AA) const { + const MachineInstr &MIa, const MachineInstr &MIb) const { if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects() || MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef()) return false; @@ -1872,7 +1881,7 @@ bool HexagonInstrInfo::areMemAccessesTriviallyDisjoint( if (!getBaseAndOffsetPosition(MIa, BasePosA, OffsetPosA)) return false; const MachineOperand &BaseA = MIa.getOperand(BasePosA); - unsigned BaseRegA = BaseA.getReg(); + Register BaseRegA = BaseA.getReg(); unsigned BaseSubA = BaseA.getSubReg(); // Get the base register in MIb. 
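
areMemAccessesTriviallyDisjoint, whose hunks continue below, only proves anything when both accesses use the same base register and subregister; the rest of the function (mostly unchanged and therefore not visible in this diff) goes on to compare offsets and access sizes. A standalone sketch of that idea, with the struct and field names being illustrative assumptions.

#include <cassert>
#include <cstdint>

struct MemAccess {
  unsigned BaseReg;  // base register (subregister folded in for brevity)
  int64_t Offset;    // byte offset from the base
  int64_t Width;     // access size in bytes
};

// Two accesses off the same base cannot alias if their
// [Offset, Offset + Width) byte ranges do not overlap.
bool triviallyDisjoint(const MemAccess &A, const MemAccess &B) {
  if (A.BaseReg != B.BaseReg)
    return false;  // different bases: nothing can be proven, assume may-alias
  const MemAccess &Lo = A.Offset <= B.Offset ? A : B;
  const MemAccess &Hi = A.Offset <= B.Offset ? B : A;
  return Lo.Offset + Lo.Width <= Hi.Offset;
}

int main() {
  // Matches the stack stores in the comment above: r29+132 and r29+136.
  assert(triviallyDisjoint({29, 136, 4}, {29, 132, 4}));
  assert(!triviallyDisjoint({29, 132, 8}, {29, 136, 4}));
}
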
@@ -1880,7 +1889,7 @@ bool HexagonInstrInfo::areMemAccessesTriviallyDisjoint( if (!getBaseAndOffsetPosition(MIb, BasePosB, OffsetPosB)) return false; const MachineOperand &BaseB = MIb.getOperand(BasePosB); - unsigned BaseRegB = BaseB.getReg(); + Register BaseRegB = BaseB.getReg(); unsigned BaseSubB = BaseB.getSubReg(); if (BaseRegA != BaseRegB || BaseSubA != BaseSubB) @@ -1984,7 +1993,7 @@ unsigned HexagonInstrInfo::createVR(MachineFunction *MF, MVT VT) const { llvm_unreachable("Cannot handle this register class"); } - unsigned NewReg = MRI.createVirtualRegister(TRC); + Register NewReg = MRI.createVirtualRegister(TRC); return NewReg; } @@ -2094,12 +2103,12 @@ bool HexagonInstrInfo::isDependent(const MachineInstr &ProdMI, if (RegA == RegB) return true; - if (TargetRegisterInfo::isPhysicalRegister(RegA)) + if (Register::isPhysicalRegister(RegA)) for (MCSubRegIterator SubRegs(RegA, &HRI); SubRegs.isValid(); ++SubRegs) if (RegB == *SubRegs) return true; - if (TargetRegisterInfo::isPhysicalRegister(RegB)) + if (Register::isPhysicalRegister(RegB)) for (MCSubRegIterator SubRegs(RegB, &HRI); SubRegs.isValid(); ++SubRegs) if (RegA == *SubRegs) return true; @@ -2605,7 +2614,7 @@ bool HexagonInstrInfo::isToBeScheduledASAP(const MachineInstr &MI1, const MachineInstr &MI2) const { if (mayBeCurLoad(MI1)) { // if (result of SU is used in Next) return true; - unsigned DstReg = MI1.getOperand(0).getReg(); + Register DstReg = MI1.getOperand(0).getReg(); int N = MI2.getNumOperands(); for (int I = 0; I < N; I++) if (MI2.getOperand(I).isReg() && DstReg == MI2.getOperand(I).getReg()) @@ -3374,7 +3383,7 @@ unsigned HexagonInstrInfo::getCompoundOpcode(const MachineInstr &GA, if ((GA.getOpcode() != Hexagon::C2_cmpeqi) || (GB.getOpcode() != Hexagon::J2_jumptnew)) return -1u; - unsigned DestReg = GA.getOperand(0).getReg(); + Register DestReg = GA.getOperand(0).getReg(); if (!GB.readsRegister(DestReg)) return -1u; if (DestReg != Hexagon::P0 && DestReg != Hexagon::P1) @@ -4091,7 +4100,7 @@ int HexagonInstrInfo::getOperandLatency(const InstrItineraryData *ItinData, // Get DefIdx and UseIdx for super registers. const MachineOperand &DefMO = DefMI.getOperand(DefIdx); - if (DefMO.isReg() && HRI.isPhysicalRegister(DefMO.getReg())) { + if (DefMO.isReg() && Register::isPhysicalRegister(DefMO.getReg())) { if (DefMO.isImplicit()) { for (MCSuperRegIterator SR(DefMO.getReg(), &HRI); SR.isValid(); ++SR) { int Idx = DefMI.findRegisterDefOperandIdx(*SR, false, false, &HRI); diff --git a/lib/Target/Hexagon/HexagonInstrInfo.h b/lib/Target/Hexagon/HexagonInstrInfo.h index e0a999d0f4c4..60298cd666bb 100644 --- a/lib/Target/Hexagon/HexagonInstrInfo.h +++ b/lib/Target/Hexagon/HexagonInstrInfo.h @@ -129,21 +129,10 @@ public: const DebugLoc &DL, int *BytesAdded = nullptr) const override; - /// Analyze the loop code, return true if it cannot be understood. Upon - /// success, this function returns false and returns information about the - /// induction variable and compare instruction used at the end. - bool analyzeLoop(MachineLoop &L, MachineInstr *&IndVarInst, - MachineInstr *&CmpInst) const override; - - /// Generate code to reduce the loop iteration by one and check if the loop - /// is finished. Return the value/register of the new loop count. We need - /// this function when peeling off one or more iterations of a loop. This - /// function assumes the nth iteration is peeled first. 
- unsigned reduceLoopCount(MachineBasicBlock &MBB, MachineBasicBlock &PreHeader, - MachineInstr *IndVar, MachineInstr &Cmp, - SmallVectorImpl &Cond, - SmallVectorImpl &PrevInsts, - unsigned Iter, unsigned MaxIter) const override; + /// Analyze loop L, which must be a single-basic-block loop, and if the + /// conditions can be understood enough produce a PipelinerLoopInfo object. + std::unique_ptr + analyzeLoopForPipelining(MachineBasicBlock *LoopBB) const override; /// Return true if it's profitable to predicate /// instructions with accumulated instruction latency of "NumCycles" @@ -299,8 +288,7 @@ public: // memory addresses and false otherwise. bool areMemAccessesTriviallyDisjoint(const MachineInstr &MIa, - const MachineInstr &MIb, - AliasAnalysis *AA = nullptr) const override; + const MachineInstr &MIb) const override; /// For instructions with a base and offset, return the position of the /// base register and offset operands. diff --git a/lib/Target/Hexagon/HexagonIntrinsics.td b/lib/Target/Hexagon/HexagonIntrinsics.td index cabfd783effa..c5e3cfd080d6 100644 --- a/lib/Target/Hexagon/HexagonIntrinsics.td +++ b/lib/Target/Hexagon/HexagonIntrinsics.td @@ -22,14 +22,14 @@ class T_RP_pat def: Pat<(int_hexagon_A2_add IntRegs:$Rs, IntRegs:$Rt), (A2_add IntRegs:$Rs, IntRegs:$Rt)>; -def: Pat<(int_hexagon_A2_addi IntRegs:$Rs, imm:$s16), +def: Pat<(int_hexagon_A2_addi IntRegs:$Rs, timm:$s16), (A2_addi IntRegs:$Rs, imm:$s16)>; def: Pat<(int_hexagon_A2_addp DoubleRegs:$Rs, DoubleRegs:$Rt), (A2_addp DoubleRegs:$Rs, DoubleRegs:$Rt)>; def: Pat<(int_hexagon_A2_sub IntRegs:$Rs, IntRegs:$Rt), (A2_sub IntRegs:$Rs, IntRegs:$Rt)>; -def: Pat<(int_hexagon_A2_subri imm:$s10, IntRegs:$Rs), +def: Pat<(int_hexagon_A2_subri timm:$s10, IntRegs:$Rs), (A2_subri imm:$s10, IntRegs:$Rs)>; def: Pat<(int_hexagon_A2_subp DoubleRegs:$Rs, DoubleRegs:$Rt), (A2_subp DoubleRegs:$Rs, DoubleRegs:$Rt)>; @@ -45,26 +45,26 @@ def: Pat<(int_hexagon_M2_dpmpyss_s0 IntRegs:$Rs, IntRegs:$Rt), def: Pat<(int_hexagon_M2_dpmpyuu_s0 IntRegs:$Rs, IntRegs:$Rt), (M2_dpmpyuu_s0 IntRegs:$Rs, IntRegs:$Rt)>; -def: Pat<(int_hexagon_S2_asl_i_r IntRegs:$Rs, imm:$u5), +def: Pat<(int_hexagon_S2_asl_i_r IntRegs:$Rs, timm:$u5), (S2_asl_i_r IntRegs:$Rs, imm:$u5)>; -def: Pat<(int_hexagon_S2_lsr_i_r IntRegs:$Rs, imm:$u5), +def: Pat<(int_hexagon_S2_lsr_i_r IntRegs:$Rs, timm:$u5), (S2_lsr_i_r IntRegs:$Rs, imm:$u5)>; -def: Pat<(int_hexagon_S2_asr_i_r IntRegs:$Rs, imm:$u5), +def: Pat<(int_hexagon_S2_asr_i_r IntRegs:$Rs, timm:$u5), (S2_asr_i_r IntRegs:$Rs, imm:$u5)>; -def: Pat<(int_hexagon_S2_asl_i_p DoubleRegs:$Rs, imm:$u6), +def: Pat<(int_hexagon_S2_asl_i_p DoubleRegs:$Rs, timm:$u6), (S2_asl_i_p DoubleRegs:$Rs, imm:$u6)>; -def: Pat<(int_hexagon_S2_lsr_i_p DoubleRegs:$Rs, imm:$u6), +def: Pat<(int_hexagon_S2_lsr_i_p DoubleRegs:$Rs, timm:$u6), (S2_lsr_i_p DoubleRegs:$Rs, imm:$u6)>; -def: Pat<(int_hexagon_S2_asr_i_p DoubleRegs:$Rs, imm:$u6), +def: Pat<(int_hexagon_S2_asr_i_p DoubleRegs:$Rs, timm:$u6), (S2_asr_i_p DoubleRegs:$Rs, imm:$u6)>; def: Pat<(int_hexagon_A2_and IntRegs:$Rs, IntRegs:$Rt), (A2_and IntRegs:$Rs, IntRegs:$Rt)>; -def: Pat<(int_hexagon_A2_andir IntRegs:$Rs, imm:$s10), +def: Pat<(int_hexagon_A2_andir IntRegs:$Rs, timm:$s10), (A2_andir IntRegs:$Rs, imm:$s10)>; def: Pat<(int_hexagon_A2_or IntRegs:$Rs, IntRegs:$Rt), (A2_or IntRegs:$Rs, IntRegs:$Rt)>; -def: Pat<(int_hexagon_A2_orir IntRegs:$Rs, imm:$s10), +def: Pat<(int_hexagon_A2_orir IntRegs:$Rs, timm:$s10), (A2_orir IntRegs:$Rs, imm:$s10)>; def: Pat<(int_hexagon_A2_xor IntRegs:$Rs, IntRegs:$Rt), 
(A2_xor IntRegs:$Rs, IntRegs:$Rt)>; @@ -99,13 +99,13 @@ def : Pat <(int_hexagon_S5_asrhub_rnd_sat_goodsyntax I64:$Rs, (i32 0)), (S2_vsathub I64:$Rs)>; } -def : Pat <(int_hexagon_S2_asr_i_r_rnd_goodsyntax I32:$Rs, u5_0ImmPred:$imm), +def : Pat <(int_hexagon_S2_asr_i_r_rnd_goodsyntax I32:$Rs, u5_0ImmPred_timm:$imm), (S2_asr_i_r_rnd I32:$Rs, (UDEC1 u5_0ImmPred:$imm))>; -def : Pat <(int_hexagon_S2_asr_i_p_rnd_goodsyntax I64:$Rs, u6_0ImmPred:$imm), +def : Pat <(int_hexagon_S2_asr_i_p_rnd_goodsyntax I64:$Rs, u6_0ImmPred_timm:$imm), (S2_asr_i_p_rnd I64:$Rs, (UDEC1 u6_0ImmPred:$imm))>; -def : Pat <(int_hexagon_S5_vasrhrnd_goodsyntax I64:$Rs, u4_0ImmPred:$imm), +def : Pat <(int_hexagon_S5_vasrhrnd_goodsyntax I64:$Rs, u4_0ImmPred_timm:$imm), (S5_vasrhrnd I64:$Rs, (UDEC1 u4_0ImmPred:$imm))>; -def : Pat <(int_hexagon_S5_asrhub_rnd_sat_goodsyntax I64:$Rs, u4_0ImmPred:$imm), +def : Pat <(int_hexagon_S5_asrhub_rnd_sat_goodsyntax I64:$Rs, u4_0ImmPred_timm:$imm), (S5_asrhub_rnd_sat I64:$Rs, (UDEC1 u4_0ImmPred:$imm))>; def ImmExt64: SDNodeXForm; -def : Pat <(int_hexagon_C2_cmpgei I32:$src1, s32_0ImmPred:$src2), +def : Pat <(int_hexagon_C2_cmpgei I32:$src1, s32_0ImmPred_timm:$src2), (C2_tfrpr (C2_cmpgti I32:$src1, (SDEC1 s32_0ImmPred:$src2)))>; -def : Pat <(int_hexagon_C2_cmpgeui I32:$src1, u32_0ImmPred:$src2), +def : Pat <(int_hexagon_C2_cmpgeui I32:$src1, u32_0ImmPred_timm:$src2), (C2_tfrpr (C2_cmpgtui I32:$src1, (UDEC1 u32_0ImmPred:$src2)))>; def : Pat <(int_hexagon_C2_cmpgeui I32:$src, 0), @@ -142,7 +142,7 @@ def : Pat <(int_hexagon_C2_cmpltu I32:$src1, I32:$src2), //===----------------------------------------------------------------------===// class S2op_tableidx_pat - : Pat <(IntID I32:$src1, I32:$src2, u4_0ImmPred:$src3, u5_0ImmPred:$src4), + : Pat <(IntID I32:$src1, I32:$src2, u4_0ImmPred_timm:$src3, u5_0ImmPred_timm:$src4), (OutputInst I32:$src1, I32:$src2, u4_0ImmPred:$src3, (XformImm u5_0ImmPred:$src4))>; @@ -197,11 +197,11 @@ class T_stc_pat : Pat<(IntID I32:$Rs, Val:$Rt, I32:$Ru, Imm:$s), (MI I32:$Rs, Imm:$s, I32:$Ru, Val:$Rt)>; -def: T_stc_pat; -def: T_stc_pat; -def: T_stc_pat; -def: T_stc_pat; -def: T_stc_pat; +def: T_stc_pat; +def: T_stc_pat; +def: T_stc_pat; +def: T_stc_pat; +def: T_stc_pat; multiclass MaskedStore { def : Pat<(IntID HvxQR:$src1, IntRegs:$src2, HvxVR:$src3), diff --git a/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp b/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp index ac48e1dc30b0..bda3eccac0cd 100644 --- a/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp +++ b/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp @@ -93,9 +93,9 @@ static cl::opt OnlyNonNestedMemmove("only-nonnested-memmove-idiom", cl::Hidden, cl::init(true), cl::desc("Only enable generating memmove in non-nested loops")); -cl::opt HexagonVolatileMemcpy("disable-hexagon-volatile-memcpy", - cl::Hidden, cl::init(false), - cl::desc("Enable Hexagon-specific memcpy for volatile destination.")); +static cl::opt HexagonVolatileMemcpy( + "disable-hexagon-volatile-memcpy", cl::Hidden, cl::init(false), + cl::desc("Enable Hexagon-specific memcpy for volatile destination.")); static cl::opt SimplifyLimit("hlir-simplify-limit", cl::init(10000), cl::Hidden, cl::desc("Maximum number of simplification steps in HLIR")); @@ -632,9 +632,9 @@ Value *PolynomialMultiplyRecognize::getCountIV(BasicBlock *BB) { if (!isa(InitV) || !cast(InitV)->isZero()) continue; Value *IterV = PN->getIncomingValueForBlock(BB); - if (!isa(IterV)) - continue; auto *BO = dyn_cast(IterV); + if (!BO) + continue; if (BO->getOpcode() != 
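
The C2_cmpgei/C2_cmpgeui patterns above lower "greater or equal to an immediate" as "greater than the immediate minus one" (the SDEC1/UDEC1 transforms), with the unsigned compare against 0 handled by its own always-true pattern. A small standalone check of that identity; the helper names are made up for illustration, and the signed variant assumes the immediate minus one does not wrap.

#include <cassert>
#include <cstdint>

bool geViaGtSigned(int32_t X, int32_t S) { return X > S - 1; }      // valid when S-1 does not wrap
bool geViaGtUnsigned(uint32_t X, uint32_t S) { return X > S - 1; }  // valid only for S != 0

int main() {
  for (int32_t S : {-7, 0, 1, 42})
    for (int32_t X : {-8, -7, -1, 0, 41, 42, 43})
      assert(geViaGtSigned(X, S) == (X >= S));
  for (uint32_t S : {1u, 2u, 100u})
    for (uint32_t X : {0u, 1u, 99u, 100u, 101u})
      assert(geViaGtUnsigned(X, S) == (X >= S));
}
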
Instruction::Add) continue; Value *IncV = nullptr; @@ -2020,7 +2020,7 @@ bool HexagonLoopIdiomRecognize::processCopyingStore(Loop *CurLoop, // See if the pointer expression is an AddRec like {base,+,1} on the current // loop, which indicates a strided load. If we have something else, it's a // random load we can't handle. - LoadInst *LI = dyn_cast(SI->getValueOperand()); + auto *LI = cast(SI->getValueOperand()); auto *LoadEv = cast(SE->getSCEV(LI->getPointerOperand())); // The trip count of the loop and the base pointer of the addrec SCEV is @@ -2426,7 +2426,8 @@ bool HexagonLoopIdiomRecognize::runOnLoop(Loop *L, LPPassManager &LPM) { DL = &L->getHeader()->getModule()->getDataLayout(); DT = &getAnalysis().getDomTree(); LF = &getAnalysis().getLoopInfo(); - TLI = &getAnalysis().getTLI(); + TLI = &getAnalysis().getTLI( + *L->getHeader()->getParent()); SE = &getAnalysis().getSE(); HasMemcpy = TLI->has(LibFunc_memcpy); diff --git a/lib/Target/Hexagon/HexagonNewValueJump.cpp b/lib/Target/Hexagon/HexagonNewValueJump.cpp index db44901ca706..680d01e12af0 100644 --- a/lib/Target/Hexagon/HexagonNewValueJump.cpp +++ b/lib/Target/Hexagon/HexagonNewValueJump.cpp @@ -177,7 +177,7 @@ static bool canBeFeederToNewValueJump(const HexagonInstrInfo *QII, (II->getOperand(i).isUse() || II->getOperand(i).isDef())) { MachineBasicBlock::iterator localII = II; ++localII; - unsigned Reg = II->getOperand(i).getReg(); + Register Reg = II->getOperand(i).getReg(); for (MachineBasicBlock::iterator localBegin = localII; localBegin != end; ++localBegin) { if (localBegin == skip) @@ -290,7 +290,7 @@ static bool canCompareBeNewValueJump(const HexagonInstrInfo *QII, // at machine code level, we don't need this, but if we decide // to move new value jump prior to RA, we would be needing this. MachineRegisterInfo &MRI = MF.getRegInfo(); - if (secondReg && !TargetRegisterInfo::isPhysicalRegister(cmpOp2)) { + if (secondReg && !Register::isPhysicalRegister(cmpOp2)) { MachineInstr *def = MRI.getVRegDef(cmpOp2); if (def->getOpcode() == TargetOpcode::COPY) return false; @@ -516,7 +516,7 @@ bool HexagonNewValueJump::runOnMachineFunction(MachineFunction &MF) { jmpPos = MII; jmpInstr = &MI; predReg = MI.getOperand(0).getReg(); - afterRA = TargetRegisterInfo::isPhysicalRegister(predReg); + afterRA = Register::isPhysicalRegister(predReg); // If ifconverter had not messed up with the kill flags of the // operands, the following check on the kill flag would suffice. @@ -603,7 +603,7 @@ bool HexagonNewValueJump::runOnMachineFunction(MachineFunction &MF) { (isSecondOpReg && MI.getOperand(0).getReg() == (unsigned)cmpOp2))) { - unsigned feederReg = MI.getOperand(0).getReg(); + Register feederReg = MI.getOperand(0).getReg(); // First try to see if we can get the feeder from the first operand // of the compare. 
If we can not, and if secondOpReg is true @@ -651,7 +651,7 @@ bool HexagonNewValueJump::runOnMachineFunction(MachineFunction &MF) { for (MachineOperand &MO : MI.operands()) { if (!MO.isReg() || !MO.isUse()) continue; - unsigned UseR = MO.getReg(); + Register UseR = MO.getReg(); for (auto I = std::next(MI.getIterator()); I != jmpPos; ++I) { if (I == cmpPos) continue; diff --git a/lib/Target/Hexagon/HexagonOptAddrMode.cpp b/lib/Target/Hexagon/HexagonOptAddrMode.cpp index 547da9fd598f..9121115020a2 100644 --- a/lib/Target/Hexagon/HexagonOptAddrMode.cpp +++ b/lib/Target/Hexagon/HexagonOptAddrMode.cpp @@ -162,7 +162,7 @@ bool HexagonOptAddrMode::canRemoveAddasl(NodeAddr AddAslSN, if (!OffsetOp.isImm() || OffsetOp.getImm() > 3) return false; - unsigned OffsetReg = MI.getOperand(2).getReg(); + Register OffsetReg = MI.getOperand(2).getReg(); RegisterRef OffsetRR; NodeId OffsetRegRD = 0; for (NodeAddr UA : AddAslSN.Addr->members_if(DFG->IsUse, *DFG)) { @@ -348,7 +348,7 @@ bool HexagonOptAddrMode::processAddUses(NodeAddr AddSN, MachineInstr *AddMI, const NodeList &UNodeList) { - unsigned AddDefR = AddMI->getOperand(0).getReg(); + Register AddDefR = AddMI->getOperand(0).getReg(); for (auto I = UNodeList.rbegin(), E = UNodeList.rend(); I != E; ++I) { NodeAddr UN = *I; NodeAddr SN = UN.Addr->getOwner(*DFG); @@ -381,7 +381,7 @@ bool HexagonOptAddrMode::processAddUses(NodeAddr AddSN, // Ex: Rx= add(Rt,#10) // memw(Rx+#0) = Rs // will be replaced with => memw(Rt+#10) = Rs - unsigned BaseReg = AddMI->getOperand(1).getReg(); + Register BaseReg = AddMI->getOperand(1).getReg(); if (!isSafeToExtLR(AddSN, AddMI, BaseReg, UNodeList)) return false; } @@ -411,7 +411,7 @@ bool HexagonOptAddrMode::updateAddUses(MachineInstr *AddMI, MachineInstr *UseMI) { const MachineOperand ImmOp = AddMI->getOperand(2); const MachineOperand AddRegOp = AddMI->getOperand(1); - unsigned newReg = AddRegOp.getReg(); + Register newReg = AddRegOp.getReg(); const MCInstrDesc &MID = UseMI->getDesc(); MachineOperand &BaseOp = MID.mayLoad() ? UseMI->getOperand(1) @@ -543,7 +543,7 @@ bool HexagonOptAddrMode::changeLoad(MachineInstr *OldMI, MachineOperand ImmOp, bool HexagonOptAddrMode::changeStore(MachineInstr *OldMI, MachineOperand ImmOp, unsigned ImmOpNum) { bool Changed = false; - unsigned OpStart; + unsigned OpStart = 0; unsigned OpEnd = OldMI->getNumOperands(); MachineBasicBlock *BB = OldMI->getParent(); auto UsePos = MachineBasicBlock::iterator(OldMI); @@ -724,7 +724,7 @@ bool HexagonOptAddrMode::processBlock(NodeAddr BA) { } short SizeInc = 0; - unsigned DefR = MI->getOperand(0).getReg(); + Register DefR = MI->getOperand(0).getReg(); InstrEvalMap InstrEvalResult; // Analyze all uses and calculate increase in size. 
Perform the optimization diff --git a/lib/Target/Hexagon/HexagonPatterns.td b/lib/Target/Hexagon/HexagonPatterns.td index fb731f56bfbf..485e658e1c84 100644 --- a/lib/Target/Hexagon/HexagonPatterns.td +++ b/lib/Target/Hexagon/HexagonPatterns.td @@ -99,13 +99,21 @@ def HWI8: PatLeaf<(VecPI8 HvxWR:$R)>; def HWI16: PatLeaf<(VecPI16 HvxWR:$R)>; def HWI32: PatLeaf<(VecPI32 HvxWR:$R)>; +def SDTVecLeaf: + SDTypeProfile<1, 0, [SDTCisVec<0>]>; def SDTVecVecIntOp: SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisVec<1>, SDTCisSameAs<1,2>, SDTCisVT<3,i32>]>; +def HexagonPTRUE: SDNode<"HexagonISD::PTRUE", SDTVecLeaf>; +def HexagonPFALSE: SDNode<"HexagonISD::PFALSE", SDTVecLeaf>; def HexagonVALIGN: SDNode<"HexagonISD::VALIGN", SDTVecVecIntOp>; def HexagonVALIGNADDR: SDNode<"HexagonISD::VALIGNADDR", SDTIntUnaryOp>; +def ptrue: PatFrag<(ops), (HexagonPTRUE)>; +def pfalse: PatFrag<(ops), (HexagonPFALSE)>; +def pnot: PatFrag<(ops node:$Pu), (xor node:$Pu, ptrue)>; + def valign: PatFrag<(ops node:$Vt, node:$Vs, node:$Ru), (HexagonVALIGN node:$Vt, node:$Vs, node:$Ru)>; def valignaddr: PatFrag<(ops node:$Addr), (HexagonVALIGNADDR node:$Addr)>; @@ -154,6 +162,11 @@ def IsNPow2_64H: PatLeaf<(i64 imm), [{ return isPowerOf2_64(NV) && Log2_64(NV) >= 32; }]>; +class IsULE: PatLeaf<(i32 imm), + "uint64_t V = N->getZExtValue();" # + "return isUInt<" # Width # ">(V) && V <= " # Arg # ";" +>; + class IsUGT: PatLeaf<(i32 imm), "uint64_t V = N->getZExtValue();" # "return isUInt<" # Width # ">(V) && V > " # Arg # ";" @@ -320,6 +333,24 @@ multiclass SelMinMax_pats; } +multiclass MinMax_pats { + def: Pat<(Sel (CmpType (CmpOp CmpPred:$Vs, CmpPred:$Vt)), + CmpPred:$Vt, CmpPred:$Vs), + (PickT CmpPred:$Vs, CmpPred:$Vt)>; + def: Pat<(Sel (CmpType (CmpOp CmpPred:$Vs, CmpPred:$Vt)), + CmpPred:$Vs, CmpPred:$Vt), + (PickS CmpPred:$Vs, CmpPred:$Vt)>; +} + +// Bitcasts between same-size vector types are no-ops, except for the +// actual type change. +multiclass NopCast_pat { + def: Pat<(Ty1 (bitconvert (Ty2 RC:$Val))), (Ty1 RC:$Val)>; + def: Pat<(Ty2 (bitconvert (Ty1 RC:$Val))), (Ty2 RC:$Val)>; +} + // Frags for commonly used SDNodes. def Add: pf2; def And: pf2; def Sra: pf2; @@ -403,17 +434,18 @@ def: Pat<(f32 (bitconvert I32:$v)), (F32:$v)>; def: Pat<(i64 (bitconvert F64:$v)), (I64:$v)>; def: Pat<(f64 (bitconvert I64:$v)), (F64:$v)>; -multiclass Cast_pat { - def: Pat<(Tb (bitconvert (Ta RC:$Rs))), (Tb RC:$Rs)>; - def: Pat<(Ta (bitconvert (Tb RC:$Rs))), (Ta RC:$Rs)>; -} - -// Bit convert vector types to integers. -defm: Cast_pat; -defm: Cast_pat; -defm: Cast_pat; -defm: Cast_pat; -defm: Cast_pat; +// Bit convert 32- and 64-bit types. +// All of these are bitcastable to one another: i32, v2i16, v4i8. +defm: NopCast_pat; +defm: NopCast_pat; +defm: NopCast_pat; +// All of these are bitcastable to one another: i64, v2i32, v4i16, v8i8. 
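
The NopCast_pat multiclass introduced above encodes that a bitcast between same-size vector types leaves the register contents alone and only changes the static type. A standalone sketch of the same idea in C++, reinterpreting one 32-bit value as four 8-bit lanes and back; the function names are illustrative.

#include <array>
#include <cassert>
#include <cstdint>
#include <cstring>

// "Bitcast" i32 -> v4i8: same bits, new type. std::memcpy is the portable
// way to express a value-preserving reinterpretation in C++.
std::array<uint8_t, 4> asV4I8(uint32_t R) {
  std::array<uint8_t, 4> Lanes{};
  std::memcpy(Lanes.data(), &R, sizeof(R));
  return Lanes;
}

// "Bitcast" v4i8 -> i32: the round trip is the identity.
uint32_t asI32(const std::array<uint8_t, 4> &Lanes) {
  uint32_t R;
  std::memcpy(&R, Lanes.data(), sizeof(R));
  return R;
}

int main() {
  uint32_t R = 0x11223344;
  assert(asI32(asV4I8(R)) == R);
}
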
+defm: NopCast_pat; +defm: NopCast_pat; +defm: NopCast_pat; +defm: NopCast_pat; +defm: NopCast_pat; +defm: NopCast_pat; // --(3) Extend/truncate ------------------------------------------------- @@ -497,7 +529,9 @@ def: Pat<(v2i16 (trunc V2I32:$Rs)), // def: Pat<(not I1:$Ps), (C2_not I1:$Ps)>; -def: Pat<(not V8I1:$Ps), (C2_not V8I1:$Ps)>; +def: Pat<(pnot V2I1:$Ps), (C2_not V2I1:$Ps)>; +def: Pat<(pnot V4I1:$Ps), (C2_not V4I1:$Ps)>; +def: Pat<(pnot V8I1:$Ps), (C2_not V8I1:$Ps)>; def: Pat<(add I1:$Ps, -1), (C2_not I1:$Ps)>; multiclass BoolOpR_RR_pat { @@ -816,14 +850,6 @@ def: Pat<(select (not I1:$Pu), f32ImmPred:$I, F32:$Rs), def: Pat<(select (not I1:$Pu), F32:$Rt, f32ImmPred:$I), (C2_muxri I1:$Pu, (ftoi $I), F32:$Rt)>; -def: Pat<(select I1:$Pu, V4I8:$Rs, V4I8:$Rt), - (LoReg (C2_vmux I1:$Pu, (ToAext64 $Rs), (ToAext64 $Rt)))>; -def: Pat<(select I1:$Pu, V2I16:$Rs, V2I16:$Rt), - (LoReg (C2_vmux I1:$Pu, (ToAext64 $Rs), (ToAext64 $Rt)))>; -def: Pat<(select I1:$Pu, V2I32:$Rs, V2I32:$Rt), - (Combinew (C2_mux I1:$Pu, (HiReg $Rs), (HiReg $Rt)), - (C2_mux I1:$Pu, (LoReg $Rs), (LoReg $Rt)))>; - def: Pat<(vselect V8I1:$Pu, V8I8:$Rs, V8I8:$Rt), (C2_vmux V8I1:$Pu, V8I8:$Rs, V8I8:$Rt)>; def: Pat<(vselect V4I1:$Pu, V4I16:$Rs, V4I16:$Rt), @@ -831,6 +857,14 @@ def: Pat<(vselect V4I1:$Pu, V4I16:$Rs, V4I16:$Rt), def: Pat<(vselect V2I1:$Pu, V2I32:$Rs, V2I32:$Rt), (C2_vmux V2I1:$Pu, V2I32:$Rs, V2I32:$Rt)>; +def: Pat<(vselect (pnot V8I1:$Pu), V8I8:$Rs, V8I8:$Rt), + (C2_vmux V8I1:$Pu, V8I8:$Rt, V8I8:$Rs)>; +def: Pat<(vselect (pnot V4I1:$Pu), V4I16:$Rs, V4I16:$Rt), + (C2_vmux V4I1:$Pu, V4I16:$Rt, V4I16:$Rs)>; +def: Pat<(vselect (pnot V2I1:$Pu), V2I32:$Rs, V2I32:$Rt), + (C2_vmux V2I1:$Pu, V2I32:$Rt, V2I32:$Rs)>; + + // From LegalizeDAG.cpp: (Pu ? Pv : Pw) <=> (Pu & Pv) | (!Pu & Pw). def: Pat<(select I1:$Pu, I1:$Pv, I1:$Pw), (C2_or (C2_and I1:$Pu, I1:$Pv), @@ -863,32 +897,44 @@ let AddedComplexity = 200 in { } let AddedComplexity = 200 in { - defm: SelMinMax_pats; - defm: SelMinMax_pats; - defm: SelMinMax_pats; - defm: SelMinMax_pats; - defm: SelMinMax_pats; - defm: SelMinMax_pats; - defm: SelMinMax_pats; - defm: SelMinMax_pats; - - defm: SelMinMax_pats; - defm: SelMinMax_pats; - defm: SelMinMax_pats; - defm: SelMinMax_pats; - defm: SelMinMax_pats; - defm: SelMinMax_pats; - defm: SelMinMax_pats; - defm: SelMinMax_pats; + defm: MinMax_pats; + defm: MinMax_pats; + defm: MinMax_pats; + defm: MinMax_pats; + defm: MinMax_pats; + defm: MinMax_pats; + defm: MinMax_pats; + defm: MinMax_pats; + + defm: MinMax_pats; + defm: MinMax_pats; + defm: MinMax_pats; + defm: MinMax_pats; + defm: MinMax_pats; + defm: MinMax_pats; + defm: MinMax_pats; + defm: MinMax_pats; } let AddedComplexity = 100 in { - defm: SelMinMax_pats; - defm: SelMinMax_pats; - defm: SelMinMax_pats; - defm: SelMinMax_pats; -} - + defm: MinMax_pats; + defm: MinMax_pats; + defm: MinMax_pats; + defm: MinMax_pats; +} + +defm: MinMax_pats; +defm: MinMax_pats; +defm: MinMax_pats; +defm: MinMax_pats; +defm: MinMax_pats; +defm: MinMax_pats; +defm: MinMax_pats; +defm: MinMax_pats; +defm: MinMax_pats; +defm: MinMax_pats; +defm: MinMax_pats; +defm: MinMax_pats; // --(7) Insert/extract -------------------------------------------------- // @@ -1639,19 +1685,19 @@ def: Pat<(v8i8 (mul V8I8:$Rs, V8I8:$Rt)), // // Count leading zeros. -def: Pat<(ctlz I32:$Rs), (S2_cl0 I32:$Rs)>; +def: Pat<(i32 (ctlz I32:$Rs)), (S2_cl0 I32:$Rs)>; def: Pat<(i32 (trunc (ctlz I64:$Rss))), (S2_cl0p I64:$Rss)>; // Count trailing zeros. 
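
The MinMax_pats instantiations above (their template arguments are not visible in this stripped diff) all follow one shape: a compare feeding a select is really a min or a max, and which one depends on whether the select returns the compared operands in the same order as the compare or swapped. A standalone sketch of the two forms, assuming a greater-than compare for concreteness.

#include <cassert>

int selGtSame(int Vs, int Vt) { return Vs > Vt ? Vs : Vt; }    // picks the larger value: max
int selGtSwapped(int Vs, int Vt) { return Vs > Vt ? Vt : Vs; } // picks the smaller value: min

int main() {
  assert(selGtSame(3, 9) == 9 && selGtSame(9, 3) == 9);
  assert(selGtSwapped(3, 9) == 3 && selGtSwapped(9, 3) == 3);
}
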
-def: Pat<(cttz I32:$Rs), (S2_ct0 I32:$Rs)>; +def: Pat<(i32 (cttz I32:$Rs)), (S2_ct0 I32:$Rs)>; def: Pat<(i32 (trunc (cttz I64:$Rss))), (S2_ct0p I64:$Rss)>; // Count leading ones. -def: Pat<(ctlz (not I32:$Rs)), (S2_cl1 I32:$Rs)>; +def: Pat<(i32 (ctlz (not I32:$Rs))), (S2_cl1 I32:$Rs)>; def: Pat<(i32 (trunc (ctlz (not I64:$Rss)))), (S2_cl1p I64:$Rss)>; // Count trailing ones. -def: Pat<(cttz (not I32:$Rs)), (S2_ct1 I32:$Rs)>; +def: Pat<(i32 (cttz (not I32:$Rs))), (S2_ct1 I32:$Rs)>; def: Pat<(i32 (trunc (cttz (not I64:$Rss)))), (S2_ct1p I64:$Rss)>; // Define leading/trailing patterns that require zero-extensions to 64 bits. @@ -1706,6 +1752,7 @@ let AddedComplexity = 20 in { // Complexity greater than and/or/xor (i32 (LoReg $Rss)))>; } + let AddedComplexity = 20 in { // Complexity greater than cmp reg-imm. def: Pat<(i1 (setne (and (shl 1, u5_0ImmPred:$u5), I32:$Rs), 0)), (S2_tstbit_i IntRegs:$Rs, imm:$u5)>; @@ -1717,6 +1764,20 @@ let AddedComplexity = 20 in { // Complexity greater than cmp reg-imm. (S2_tstbit_i (LoReg DoubleRegs:$Rs), 0)>; } +def: Pat<(and (srl I32:$Rs, u5_0ImmPred:$u5), 1), + (I1toI32 (S2_tstbit_i I32:$Rs, imm:$u5))>; +def: Pat<(and (srl I64:$Rss, IsULE<32,31>:$u6), 1), + (ToZext64 (I1toI32 (S2_tstbit_i (LoReg $Rss), imm:$u6)))>; +def: Pat<(and (srl I64:$Rss, IsUGT<32,31>:$u6), 1), + (ToZext64 (I1toI32 (S2_tstbit_i (HiReg $Rss), (UDEC32 $u6))))>; + +def: Pat<(and (not (srl I32:$Rs, u5_0ImmPred:$u5)), 1), + (I1toI32 (S4_ntstbit_i I32:$Rs, imm:$u5))>; +def: Pat<(and (not (srl I64:$Rss, IsULE<32,31>:$u6)), 1), + (ToZext64 (I1toI32 (S4_ntstbit_i (LoReg $Rss), imm:$u6)))>; +def: Pat<(and (not (srl I64:$Rss, IsUGT<32,31>:$u6)), 1), + (ToZext64 (I1toI32 (S4_ntstbit_i (HiReg $Rss), (UDEC32 $u6))))>; + let AddedComplexity = 20 in { // Complexity greater than compare reg-imm. def: Pat<(i1 (seteq (and I32:$Rs, u6_0ImmPred:$u6), 0)), (C2_bitsclri IntRegs:$Rs, imm:$u6)>; @@ -1737,23 +1798,28 @@ def: Pat<(HexagonTSTBIT I32:$Rs, u5_0ImmPred:$u5), def: Pat<(HexagonTSTBIT I32:$Rs, I32:$Rt), (S2_tstbit_r I32:$Rs, I32:$Rt)>; +// Add extra complexity to prefer these instructions over bitsset/bitsclr. +// The reason is that tstbit/ntstbit can be folded into a compound instruction: +// if ([!]tstbit(...)) jump ... let AddedComplexity = 20 in { // Complexity greater than cmp reg-imm. - def: Pat<(i1 (seteq (and (shl 1, u5_0ImmPred:$u5), I32:$Rs), 0)), - (S4_ntstbit_i I32:$Rs, imm:$u5)>; + def: Pat<(i1 (seteq (and I32:$Rs, IsPow2_32:$u5), 0)), + (S4_ntstbit_i I32:$Rs, (Log2_32 imm:$u5))>; + def: Pat<(i1 (setne (and I32:$Rs, IsPow2_32:$u5), 0)), + (S2_tstbit_i I32:$Rs, (Log2_32 imm:$u5))>; def: Pat<(i1 (seteq (and (shl 1, I32:$Rt), I32:$Rs), 0)), (S4_ntstbit_r I32:$Rs, I32:$Rt)>; + def: Pat<(i1 (setne (and (shl 1, I32:$Rt), I32:$Rs), 0)), + (S2_tstbit_r I32:$Rs, I32:$Rt)>; } -// Add extra complexity to prefer these instructions over bitsset/bitsclr. -// The reason is that tstbit/ntstbit can be folded into a compound instruction: -// if ([!]tstbit(...)) jump ... 
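
The new patterns above map single-bit probes onto tstbit/ntstbit: "(x >> n) & 1" and "x & (1 << n)" test the same bit, and a power-of-two mask identifies that bit by its log2 (the Log2_32/Log2_64 transforms). A standalone check of those equivalences.

#include <cassert>
#include <cstdint>

// tstbit(x, n): is bit n of x set?
bool tstbit(uint32_t X, unsigned N) { return (X >> N) & 1u; }

int main() {
  uint32_t X = 0xA4;  // bits 2, 5 and 7 set
  for (unsigned N = 0; N < 32; ++N)
    assert(tstbit(X, N) == ((X & (1u << N)) != 0));
  // A power-of-two mask M tests bit log2(M): 0x20 is 1 << 5.
  assert(tstbit(X, 5) && (X & 0x20u) != 0);
}
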
-let AddedComplexity = 100 in -def: Pat<(i1 (setne (and I32:$Rs, (i32 IsPow2_32:$u5)), (i32 0))), - (S2_tstbit_i I32:$Rs, (Log2_32 imm:$u5))>; - -let AddedComplexity = 100 in -def: Pat<(i1 (seteq (and I32:$Rs, (i32 IsPow2_32:$u5)), (i32 0))), - (S4_ntstbit_i I32:$Rs, (Log2_32 imm:$u5))>; +def: Pat<(i1 (seteq (and I64:$Rs, IsPow2_64L:$u6), 0)), + (S4_ntstbit_i (LoReg $Rs), (Log2_64 $u6))>; +def: Pat<(i1 (seteq (and I64:$Rs, IsPow2_64H:$u6), 0)), + (S4_ntstbit_i (HiReg $Rs), (UDEC32 (i32 (Log2_64 $u6))))>; +def: Pat<(i1 (setne (and I64:$Rs, IsPow2_64L:$u6), 0)), + (S2_tstbit_i (LoReg $Rs), (Log2_32 imm:$u6))>; +def: Pat<(i1 (setne (and I64:$Rs, IsPow2_64H:$u6), 0)), + (S2_tstbit_i (HiReg $Rs), (UDEC32 (i32 (Log2_32 imm:$u6))))>; // Do not increase complexity of these patterns. In the DAG, "cmp i8" may be // represented as a compare against "value & 0xFF", which is an exact match @@ -1773,10 +1839,18 @@ def: Pat<(i1 (setne (and I32:$Rs, I32:$Rt), I32:$Rt)), let AddedComplexity = 100 in { // Avoid A4_rcmp[n]eqi in these cases: + def: Pat<(i32 (zext (i1 (seteq (and (shl 1, I32:$Rt), I32:$Rs), 0)))), + (I1toI32 (S4_ntstbit_r IntRegs:$Rs, IntRegs:$Rt))>; def: Pat<(i32 (zext (i1 (setne (and (shl 1, I32:$Rt), I32:$Rs), 0)))), (I1toI32 (S2_tstbit_r IntRegs:$Rs, IntRegs:$Rt))>; + def: Pat<(i32 (zext (i1 (seteq (and I32:$Rs, IsPow2_32:$u5), 0)))), + (I1toI32 (S4_ntstbit_i I32:$Rs, (Log2_32 imm:$u5)))>; + def: Pat<(i32 (zext (i1 (setne (and I32:$Rs, IsPow2_32:$u5), 0)))), + (I1toI32 (S2_tstbit_i I32:$Rs, (Log2_32 imm:$u5)))>; def: Pat<(i32 (zext (i1 (seteq (and (shl 1, I32:$Rt), I32:$Rs), 0)))), - (I1toI32 (S4_ntstbit_r IntRegs:$Rs, IntRegs:$Rt))>; + (I1toI32 (S4_ntstbit_r I32:$Rs, I32:$Rt))>; + def: Pat<(i32 (zext (i1 (setne (and (shl 1, I32:$Rt), I32:$Rs), 0)))), + (I1toI32 (S2_tstbit_r I32:$Rs, I32:$Rt))>; } // --(11) PIC ------------------------------------------------------------ diff --git a/lib/Target/Hexagon/HexagonPatternsHVX.td b/lib/Target/Hexagon/HexagonPatternsHVX.td index a4cfca9ac7d7..078a7135c55b 100644 --- a/lib/Target/Hexagon/HexagonPatternsHVX.td +++ b/lib/Target/Hexagon/HexagonPatternsHVX.td @@ -1,5 +1,3 @@ -def SDTVecLeaf: - SDTypeProfile<1, 0, [SDTCisVec<0>]>; def SDTVecBinOp: SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>, SDTCisSameAs<1,2>]>; @@ -162,23 +160,14 @@ let Predicates = [UseHVX] in { // Bitcasts between same-size vector types are no-ops, except for the // actual type change. 
-class Bitcast - : Pat<(ResTy (bitconvert (InpTy RC:$Val))), (ResTy RC:$Val)>; - let Predicates = [UseHVX] in { - def: Bitcast; - def: Bitcast; - def: Bitcast; - def: Bitcast; - def: Bitcast; - def: Bitcast; - - def: Bitcast; - def: Bitcast; - def: Bitcast; - def: Bitcast; - def: Bitcast; - def: Bitcast; + defm: NopCast_pat; + defm: NopCast_pat; + defm: NopCast_pat; + + defm: NopCast_pat; + defm: NopCast_pat; + defm: NopCast_pat; } let Predicates = [UseHVX] in { @@ -259,6 +248,21 @@ class Vneg1 class Vnot : PatFrag<(ops node:$Vs), (xor $Vs, Vneg1)>; +let Predicates = [UseHVX] in { + let AddedComplexity = 220 in { + defm: MinMax_pats; + defm: MinMax_pats; + defm: MinMax_pats; + defm: MinMax_pats; + defm: MinMax_pats; + defm: MinMax_pats; + defm: MinMax_pats; + defm: MinMax_pats; + defm: MinMax_pats; + defm: MinMax_pats; + } +} + let Predicates = [UseHVX] in { let AddedComplexity = 200 in { def: Pat<(Vnot HVI8:$Vs), (V6_vnot HvxVR:$Vs)>; diff --git a/lib/Target/Hexagon/HexagonPeephole.cpp b/lib/Target/Hexagon/HexagonPeephole.cpp index 8f761d2d4805..0ccfe64ad1e5 100644 --- a/lib/Target/Hexagon/HexagonPeephole.cpp +++ b/lib/Target/Hexagon/HexagonPeephole.cpp @@ -136,11 +136,11 @@ bool HexagonPeephole::runOnMachineFunction(MachineFunction &MF) { assert(MI.getNumOperands() == 2); MachineOperand &Dst = MI.getOperand(0); MachineOperand &Src = MI.getOperand(1); - unsigned DstReg = Dst.getReg(); - unsigned SrcReg = Src.getReg(); + Register DstReg = Dst.getReg(); + Register SrcReg = Src.getReg(); // Just handle virtual registers. - if (TargetRegisterInfo::isVirtualRegister(DstReg) && - TargetRegisterInfo::isVirtualRegister(SrcReg)) { + if (Register::isVirtualRegister(DstReg) && + Register::isVirtualRegister(SrcReg)) { // Map the following: // %170 = SXTW %166 // PeepholeMap[170] = %166 @@ -157,8 +157,8 @@ bool HexagonPeephole::runOnMachineFunction(MachineFunction &MF) { MachineOperand &Src2 = MI.getOperand(2); if (Src1.getImm() != 0) continue; - unsigned DstReg = Dst.getReg(); - unsigned SrcReg = Src2.getReg(); + Register DstReg = Dst.getReg(); + Register SrcReg = Src2.getReg(); PeepholeMap[DstReg] = SrcReg; } @@ -174,8 +174,8 @@ bool HexagonPeephole::runOnMachineFunction(MachineFunction &MF) { MachineOperand &Src2 = MI.getOperand(2); if (Src2.getImm() != 32) continue; - unsigned DstReg = Dst.getReg(); - unsigned SrcReg = Src1.getReg(); + Register DstReg = Dst.getReg(); + Register SrcReg = Src1.getReg(); PeepholeDoubleRegsMap[DstReg] = std::make_pair(*&SrcReg, Hexagon::isub_hi); } @@ -185,11 +185,11 @@ bool HexagonPeephole::runOnMachineFunction(MachineFunction &MF) { assert(MI.getNumOperands() == 2); MachineOperand &Dst = MI.getOperand(0); MachineOperand &Src = MI.getOperand(1); - unsigned DstReg = Dst.getReg(); - unsigned SrcReg = Src.getReg(); + Register DstReg = Dst.getReg(); + Register SrcReg = Src.getReg(); // Just handle virtual registers. 
- if (TargetRegisterInfo::isVirtualRegister(DstReg) && - TargetRegisterInfo::isVirtualRegister(SrcReg)) { + if (Register::isVirtualRegister(DstReg) && + Register::isVirtualRegister(SrcReg)) { // Map the following: // %170 = NOT_xx %166 // PeepholeMap[170] = %166 @@ -208,10 +208,10 @@ bool HexagonPeephole::runOnMachineFunction(MachineFunction &MF) { if (Src.getSubReg() != Hexagon::isub_lo) continue; - unsigned DstReg = Dst.getReg(); - unsigned SrcReg = Src.getReg(); - if (TargetRegisterInfo::isVirtualRegister(DstReg) && - TargetRegisterInfo::isVirtualRegister(SrcReg)) { + Register DstReg = Dst.getReg(); + Register SrcReg = Src.getReg(); + if (Register::isVirtualRegister(DstReg) && + Register::isVirtualRegister(SrcReg)) { // Try to find in the map. if (unsigned PeepholeSrc = PeepholeMap.lookup(SrcReg)) { // Change the 1st operand. @@ -237,12 +237,12 @@ bool HexagonPeephole::runOnMachineFunction(MachineFunction &MF) { bool Done = false; if (QII->isPredicated(MI)) { MachineOperand &Op0 = MI.getOperand(0); - unsigned Reg0 = Op0.getReg(); + Register Reg0 = Op0.getReg(); const TargetRegisterClass *RC0 = MRI->getRegClass(Reg0); if (RC0->getID() == Hexagon::PredRegsRegClassID) { // Handle instructions that have a prediate register in op0 // (most cases of predicable instructions). - if (TargetRegisterInfo::isVirtualRegister(Reg0)) { + if (Register::isVirtualRegister(Reg0)) { // Try to find in the map. if (unsigned PeepholeSrc = PeepholeMap.lookup(Reg0)) { // Change the 1st operand and, flip the opcode. @@ -275,7 +275,7 @@ bool HexagonPeephole::runOnMachineFunction(MachineFunction &MF) { break; } if (NewOp) { - unsigned PSrc = MI.getOperand(PR).getReg(); + Register PSrc = MI.getOperand(PR).getReg(); if (unsigned POrig = PeepholeMap.lookup(PSrc)) { BuildMI(*MBB, MI.getIterator(), MI.getDebugLoc(), QII->get(NewOp), MI.getOperand(0).getReg()) diff --git a/lib/Target/Hexagon/HexagonRegisterInfo.cpp b/lib/Target/Hexagon/HexagonRegisterInfo.cpp index 4f5f750e5842..b7171fb14272 100644 --- a/lib/Target/Hexagon/HexagonRegisterInfo.cpp +++ b/lib/Target/Hexagon/HexagonRegisterInfo.cpp @@ -217,7 +217,7 @@ void HexagonRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, // If the offset is not valid, calculate the address in a temporary // register and use it with offset 0. 
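
Most of the mechanical changes in this patch replace "unsigned" register variables and the static TargetRegisterInfo::is{Virtual,Physical}Register helpers with the Register class. The distinction it wraps is that virtual register numbers live in a range separate from physical ones, so the two can be told apart from the value alone. A simplified, hedged sketch of such a scheme; the bit layout shown is an assumption for illustration, not a guaranteed ABI.

#include <cassert>
#include <cstdint>

struct Reg {
  uint32_t Id;
  static const uint32_t VirtualBit = 1u << 31;  // assumed marker for virtual regs
  bool isVirtual() const { return (Id & VirtualBit) != 0; }
  bool isPhysical() const { return Id != 0 && !isVirtual(); }  // 0 means "no register"
  static Reg makeVirtual(uint32_t Index) { return {Index | VirtualBit}; }
};

int main() {
  Reg R1{1};                      // a physical register id
  Reg V = Reg::makeVirtual(170);  // e.g. %170 in MIR dumps
  assert(R1.isPhysical() && !R1.isVirtual());
  assert(V.isVirtual() && !V.isPhysical());
}
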
auto &MRI = MF.getRegInfo(); - unsigned TmpR = MRI.createVirtualRegister(&Hexagon::IntRegsRegClass); + Register TmpR = MRI.createVirtualRegister(&Hexagon::IntRegsRegClass); const DebugLoc &DL = MI.getDebugLoc(); BuildMI(MB, II, DL, HII.get(Hexagon::A2_addi), TmpR) .addReg(BP) @@ -249,8 +249,8 @@ bool HexagonRegisterInfo::shouldCoalesce(MachineInstr *MI, if (!SmallSrc && !SmallDst) return true; - unsigned DstReg = MI->getOperand(0).getReg(); - unsigned SrcReg = MI->getOperand(1).getReg(); + Register DstReg = MI->getOperand(0).getReg(); + Register SrcReg = MI->getOperand(1).getReg(); const SlotIndexes &Indexes = *LIS.getSlotIndexes(); auto HasCall = [&Indexes] (const LiveInterval::Segment &S) { for (SlotIndex I = S.start.getBaseIndex(), E = S.end.getBaseIndex(); diff --git a/lib/Target/Hexagon/HexagonSplitConst32AndConst64.cpp b/lib/Target/Hexagon/HexagonSplitConst32AndConst64.cpp index bd4254aea276..f9fb14c190ff 100644 --- a/lib/Target/Hexagon/HexagonSplitConst32AndConst64.cpp +++ b/lib/Target/Hexagon/HexagonSplitConst32AndConst64.cpp @@ -76,18 +76,18 @@ bool HexagonSplitConst32AndConst64::runOnMachineFunction(MachineFunction &Fn) { unsigned Opc = MI.getOpcode(); if (Opc == Hexagon::CONST32) { - unsigned DestReg = MI.getOperand(0).getReg(); + Register DestReg = MI.getOperand(0).getReg(); uint64_t ImmValue = MI.getOperand(1).getImm(); const DebugLoc &DL = MI.getDebugLoc(); BuildMI(B, MI, DL, TII->get(Hexagon::A2_tfrsi), DestReg) .addImm(ImmValue); B.erase(&MI); } else if (Opc == Hexagon::CONST64) { - unsigned DestReg = MI.getOperand(0).getReg(); + Register DestReg = MI.getOperand(0).getReg(); int64_t ImmValue = MI.getOperand(1).getImm(); const DebugLoc &DL = MI.getDebugLoc(); - unsigned DestLo = TRI->getSubReg(DestReg, Hexagon::isub_lo); - unsigned DestHi = TRI->getSubReg(DestReg, Hexagon::isub_hi); + Register DestLo = TRI->getSubReg(DestReg, Hexagon::isub_lo); + Register DestHi = TRI->getSubReg(DestReg, Hexagon::isub_hi); int32_t LowWord = (ImmValue & 0xFFFFFFFF); int32_t HighWord = (ImmValue >> 32) & 0xFFFFFFFF; diff --git a/lib/Target/Hexagon/HexagonSplitDouble.cpp b/lib/Target/Hexagon/HexagonSplitDouble.cpp index 013eede2d414..55f31c628854 100644 --- a/lib/Target/Hexagon/HexagonSplitDouble.cpp +++ b/lib/Target/Hexagon/HexagonSplitDouble.cpp @@ -210,8 +210,8 @@ bool HexagonSplitDoubleRegs::isFixedInstr(const MachineInstr *MI) const { for (auto &Op : MI->operands()) { if (!Op.isReg()) continue; - unsigned R = Op.getReg(); - if (!TargetRegisterInfo::isVirtualRegister(R)) + Register R = Op.getReg(); + if (!Register::isVirtualRegister(R)) return true; } return false; @@ -224,14 +224,14 @@ void HexagonSplitDoubleRegs::partitionRegisters(UUSetMap &P2Rs) { unsigned NumRegs = MRI->getNumVirtRegs(); BitVector DoubleRegs(NumRegs); for (unsigned i = 0; i < NumRegs; ++i) { - unsigned R = TargetRegisterInfo::index2VirtReg(i); + unsigned R = Register::index2VirtReg(i); if (MRI->getRegClass(R) == DoubleRC) DoubleRegs.set(i); } BitVector FixedRegs(NumRegs); for (int x = DoubleRegs.find_first(); x >= 0; x = DoubleRegs.find_next(x)) { - unsigned R = TargetRegisterInfo::index2VirtReg(x); + unsigned R = Register::index2VirtReg(x); MachineInstr *DefI = MRI->getVRegDef(R); // In some cases a register may exist, but never be defined or used. 
// It should never appear anywhere, but mark it as "fixed", just to be @@ -244,7 +244,7 @@ void HexagonSplitDoubleRegs::partitionRegisters(UUSetMap &P2Rs) { for (int x = DoubleRegs.find_first(); x >= 0; x = DoubleRegs.find_next(x)) { if (FixedRegs[x]) continue; - unsigned R = TargetRegisterInfo::index2VirtReg(x); + unsigned R = Register::index2VirtReg(x); LLVM_DEBUG(dbgs() << printReg(R, TRI) << " ~~"); USet &Asc = AssocMap[R]; for (auto U = MRI->use_nodbg_begin(R), Z = MRI->use_nodbg_end(); @@ -258,14 +258,14 @@ void HexagonSplitDoubleRegs::partitionRegisters(UUSetMap &P2Rs) { // Skip non-registers or registers with subregisters. if (&MO == &Op || !MO.isReg() || MO.getSubReg()) continue; - unsigned T = MO.getReg(); - if (!TargetRegisterInfo::isVirtualRegister(T)) { + Register T = MO.getReg(); + if (!Register::isVirtualRegister(T)) { FixedRegs.set(x); continue; } if (MRI->getRegClass(T) != DoubleRC) continue; - unsigned u = TargetRegisterInfo::virtReg2Index(T); + unsigned u = Register::virtReg2Index(T); if (FixedRegs[u]) continue; LLVM_DEBUG(dbgs() << ' ' << printReg(T, TRI)); @@ -281,7 +281,7 @@ void HexagonSplitDoubleRegs::partitionRegisters(UUSetMap &P2Rs) { unsigned NextP = 1; USet Visited; for (int x = DoubleRegs.find_first(); x >= 0; x = DoubleRegs.find_next(x)) { - unsigned R = TargetRegisterInfo::index2VirtReg(x); + unsigned R = Register::index2VirtReg(x); if (Visited.count(R)) continue; // Create a new partition for R. @@ -372,8 +372,8 @@ int32_t HexagonSplitDoubleRegs::profit(const MachineInstr *MI) const { case Hexagon::A2_andp: case Hexagon::A2_orp: case Hexagon::A2_xorp: { - unsigned Rs = MI->getOperand(1).getReg(); - unsigned Rt = MI->getOperand(2).getReg(); + Register Rs = MI->getOperand(1).getReg(); + Register Rt = MI->getOperand(2).getReg(); return profit(Rs) + profit(Rt); } @@ -400,7 +400,7 @@ int32_t HexagonSplitDoubleRegs::profit(const MachineInstr *MI) const { } int32_t HexagonSplitDoubleRegs::profit(unsigned Reg) const { - assert(TargetRegisterInfo::isVirtualRegister(Reg)); + assert(Register::isVirtualRegister(Reg)); const MachineInstr *DefI = MRI->getVRegDef(Reg); switch (DefI->getOpcode()) { @@ -499,7 +499,7 @@ void HexagonSplitDoubleRegs::collectIndRegsForLoop(const MachineLoop *L, return; assert(Cond[1].isReg() && "Unexpected Cond vector from analyzeBranch"); // Expect a predicate register. - unsigned PR = Cond[1].getReg(); + Register PR = Cond[1].getReg(); assert(MRI->getRegClass(PR) == &Hexagon::PredRegsRegClass); // Get the registers on which the loop controlling compare instruction @@ -535,7 +535,7 @@ void HexagonSplitDoubleRegs::collectIndRegsForLoop(const MachineLoop *L, if (!MI.isPHI()) break; const MachineOperand &MD = MI.getOperand(0); - unsigned R = MD.getReg(); + Register R = MD.getReg(); if (MRI->getRegClass(R) == DoubleRC) DP.push_back(R); } @@ -551,7 +551,7 @@ void HexagonSplitDoubleRegs::collectIndRegsForLoop(const MachineLoop *L, // Get the output from the add. If it is one of the inputs to the // loop-controlling compare instruction, then R is likely an induc- // tion register. - unsigned T = UseI->getOperand(0).getReg(); + Register T = UseI->getOperand(0).getReg(); if (T == CmpR1 || T == CmpR2) return false; } @@ -603,9 +603,9 @@ void HexagonSplitDoubleRegs::createHalfInstr(unsigned Opc, MachineInstr *MI, continue; } // For register operands, set the subregister. 
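
The HexagonSplitDoubleRegs changes continue below into splitShift, which rewrites a 64-bit shift on a register pair as operations on the two 32-bit halves. For a left shift by 0 < s < 32 that follows the usual identity: the low half is shifted directly, and the high half combines its own shifted bits with the bits that spill over from the low half. A standalone check of that identity; function and variable names are illustrative.

#include <cassert>
#include <cstdint>

// Left-shift a 64-bit value represented as (Hi:Lo) 32-bit halves, 0 < S < 32.
uint64_t shlSplit(uint32_t Lo, uint32_t Hi, unsigned S) {
  uint32_t NewLo = Lo << S;                       // lo half: plain shift
  uint32_t NewHi = (Hi << S) | (Lo >> (32 - S));  // hi half: shift plus spill-over bits
  return (uint64_t(NewHi) << 32) | NewLo;
}

int main() {
  uint64_t R = 0x0123456789abcdefULL;
  for (unsigned S = 1; S < 32; ++S)
    assert(shlSplit(uint32_t(R), uint32_t(R >> 32), S) == (R << S));
}
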
- unsigned R = Op.getReg(); + Register R = Op.getReg(); unsigned SR = Op.getSubReg(); - bool isVirtReg = TargetRegisterInfo::isVirtualRegister(R); + bool isVirtReg = Register::isVirtualRegister(R); bool isKill = Op.isKill(); if (isVirtReg && MRI->getRegClass(R) == DoubleRC) { isKill = false; @@ -674,7 +674,7 @@ void HexagonSplitDoubleRegs::splitMemRef(MachineInstr *MI, : MI->getOperand(2).getImm(); MachineOperand &UpdOp = Load ? MI->getOperand(1) : MI->getOperand(0); const TargetRegisterClass *RC = MRI->getRegClass(UpdOp.getReg()); - unsigned NewR = MRI->createVirtualRegister(RC); + Register NewR = MRI->createVirtualRegister(RC); assert(!UpdOp.getSubReg() && "Def operand with subreg"); BuildMI(B, MI, DL, TII->get(Hexagon::A2_addi), NewR) .addReg(AdrOp.getReg(), RSA) @@ -789,8 +789,8 @@ void HexagonSplitDoubleRegs::splitShift(MachineInstr *MI, UUPairMap::const_iterator F = PairMap.find(Op0.getReg()); assert(F != PairMap.end()); const UUPair &P = F->second; - unsigned LoR = P.first; - unsigned HiR = P.second; + Register LoR = P.first; + Register HiR = P.second; unsigned Opc = MI->getOpcode(); bool Right = (Opc == S2_lsr_i_p || Opc == S2_asr_i_p); @@ -813,7 +813,7 @@ void HexagonSplitDoubleRegs::splitShift(MachineInstr *MI, .addReg(Op1.getReg(), RS, HiSR); } else if (S < 32) { const TargetRegisterClass *IntRC = &IntRegsRegClass; - unsigned TmpR = MRI->createVirtualRegister(IntRC); + Register TmpR = MRI->createVirtualRegister(IntRC); // Expansion: // Shift left: DR = shl R, #s // LoR = shl R.lo, #s @@ -953,12 +953,12 @@ void HexagonSplitDoubleRegs::splitAslOr(MachineInstr *MI, .addReg(Op1.getReg(), RS1 & ~RegState::Kill, LoSR) .addReg(Op2.getReg(), RS2 & ~RegState::Kill, LoSR) .addImm(S); - unsigned TmpR1 = MRI->createVirtualRegister(IntRC); + Register TmpR1 = MRI->createVirtualRegister(IntRC); BuildMI(B, MI, DL, TII->get(S2_extractu), TmpR1) .addReg(Op2.getReg(), RS2 & ~RegState::Kill, LoSR) .addImm(S) .addImm(32-S); - unsigned TmpR2 = MRI->createVirtualRegister(IntRC); + Register TmpR2 = MRI->createVirtualRegister(IntRC); BuildMI(B, MI, DL, TII->get(A2_or), TmpR2) .addReg(Op1.getReg(), RS1, HiSR) .addReg(TmpR1); @@ -1002,7 +1002,7 @@ bool HexagonSplitDoubleRegs::splitInstr(MachineInstr *MI, switch (Opc) { case TargetOpcode::PHI: case TargetOpcode::COPY: { - unsigned DstR = MI->getOperand(0).getReg(); + Register DstR = MI->getOperand(0).getReg(); if (MRI->getRegClass(DstR) == DoubleRC) { createHalfInstr(Opc, MI, PairMap, isub_lo); createHalfInstr(Opc, MI, PairMap, isub_hi); @@ -1079,7 +1079,7 @@ void HexagonSplitDoubleRegs::replaceSubregUses(MachineInstr *MI, for (auto &Op : MI->operands()) { if (!Op.isReg() || !Op.isUse() || !Op.getSubReg()) continue; - unsigned R = Op.getReg(); + Register R = Op.getReg(); UUPairMap::const_iterator F = PairMap.find(R); if (F == PairMap.end()) continue; @@ -1104,8 +1104,8 @@ void HexagonSplitDoubleRegs::collapseRegPairs(MachineInstr *MI, for (auto &Op : MI->operands()) { if (!Op.isReg() || !Op.isUse()) continue; - unsigned R = Op.getReg(); - if (!TargetRegisterInfo::isVirtualRegister(R)) + Register R = Op.getReg(); + if (!Register::isVirtualRegister(R)) continue; if (MRI->getRegClass(R) != DoubleRC || Op.getSubReg()) continue; @@ -1113,7 +1113,7 @@ void HexagonSplitDoubleRegs::collapseRegPairs(MachineInstr *MI, if (F == PairMap.end()) continue; const UUPair &Pr = F->second; - unsigned NewDR = MRI->createVirtualRegister(DoubleRC); + Register NewDR = MRI->createVirtualRegister(DoubleRC); BuildMI(B, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), NewDR) 
.addReg(Pr.first) .addImm(Hexagon::isub_lo) @@ -1145,8 +1145,8 @@ bool HexagonSplitDoubleRegs::splitPartition(const USet &Part) { U != W; ++U) SplitIns.insert(U->getParent()); - unsigned LoR = MRI->createVirtualRegister(IntRC); - unsigned HiR = MRI->createVirtualRegister(IntRC); + Register LoR = MRI->createVirtualRegister(IntRC); + Register HiR = MRI->createVirtualRegister(IntRC); LLVM_DEBUG(dbgs() << "Created mapping: " << printReg(DR, TRI) << " -> " << printReg(HiR, TRI) << ':' << printReg(LoR, TRI) << '\n'); diff --git a/lib/Target/Hexagon/HexagonStoreWidening.cpp b/lib/Target/Hexagon/HexagonStoreWidening.cpp index b8b61517ff95..27fefa5f5e2b 100644 --- a/lib/Target/Hexagon/HexagonStoreWidening.cpp +++ b/lib/Target/Hexagon/HexagonStoreWidening.cpp @@ -441,7 +441,7 @@ bool HexagonStoreWidening::createWideStores(InstrGroup &OG, InstrGroup &NG, // Create vreg = A2_tfrsi #Acc; mem[hw] = vreg const MCInstrDesc &TfrD = TII->get(Hexagon::A2_tfrsi); const TargetRegisterClass *RC = TII->getRegClass(TfrD, 0, TRI, *MF); - unsigned VReg = MF->getRegInfo().createVirtualRegister(RC); + Register VReg = MF->getRegInfo().createVirtualRegister(RC); MachineInstr *TfrI = BuildMI(*MF, DL, TfrD, VReg) .addImm(int(Acc)); NG.push_back(TfrI); diff --git a/lib/Target/Hexagon/HexagonSubtarget.cpp b/lib/Target/Hexagon/HexagonSubtarget.cpp index 7ec63a642b0c..6c706fea096b 100644 --- a/lib/Target/Hexagon/HexagonSubtarget.cpp +++ b/lib/Target/Hexagon/HexagonSubtarget.cpp @@ -119,7 +119,7 @@ HexagonSubtarget::initializeSubtargetDependencies(StringRef CPU, StringRef FS) { FeatureBitset Features = getFeatureBits(); if (HexagonDisableDuplex) - setFeatureBits(Features.set(Hexagon::FeatureDuplex, false)); + setFeatureBits(Features.reset(Hexagon::FeatureDuplex)); setFeatureBits(Hexagon_MC::completeHVXFeatures(Features)); return *this; @@ -230,7 +230,7 @@ void HexagonSubtarget::CallMutation::apply(ScheduleDAGInstrs *DAGInstrs) { else if (SchedRetvalOptimization) { const MachineInstr *MI = DAG->SUnits[su].getInstr(); if (MI->isCopy() && - TargetRegisterInfo::isPhysicalRegister(MI->getOperand(1).getReg())) { + Register::isPhysicalRegister(MI->getOperand(1).getReg())) { // %vregX = COPY %r0 VRegHoldingReg[MI->getOperand(0).getReg()] = MI->getOperand(1).getReg(); LastVRegUse.erase(MI->getOperand(1).getReg()); @@ -243,8 +243,7 @@ void HexagonSubtarget::CallMutation::apply(ScheduleDAGInstrs *DAGInstrs) { VRegHoldingReg.count(MO.getReg())) { // LastVRegUse[VRegHoldingReg[MO.getReg()]] = &DAG->SUnits[su]; - } else if (MO.isDef() && - TargetRegisterInfo::isPhysicalRegister(MO.getReg())) { + } else if (MO.isDef() && Register::isPhysicalRegister(MO.getReg())) { for (MCRegAliasIterator AI(MO.getReg(), &TRI, true); AI.isValid(); ++AI) { if (LastVRegUse.count(*AI) && @@ -345,7 +344,7 @@ void HexagonSubtarget::adjustSchedDependency(SUnit *Src, SUnit *Dst, // If it's a REG_SEQUENCE/COPY, use its destination instruction to determine // the correct latency. 
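
The HexagonSubtarget change above swaps Features.set(Hexagon::FeatureDuplex, false) for Features.reset(Hexagon::FeatureDuplex); both clear the same bit, the new spelling just states the intent directly. A tiny standalone illustration with std::bitset standing in for FeatureBitset; the index value is made up.

#include <bitset>
#include <cassert>
#include <cstddef>

int main() {
  const std::size_t FeatureDuplex = 3;  // stand-in index for the real feature bit
  std::bitset<8> A(0xFF), B(0xFF);
  A.set(FeatureDuplex, false);          // old spelling: set the bit to false
  B.reset(FeatureDuplex);               // new spelling: clear the bit
  assert(A == B && !A.test(FeatureDuplex));
}
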
if ((DstInst->isRegSequence() || DstInst->isCopy()) && Dst->NumSuccs == 1) { - unsigned DReg = DstInst->getOperand(0).getReg(); + Register DReg = DstInst->getOperand(0).getReg(); MachineInstr *DDst = Dst->Succs[0].getSUnit()->getInstr(); unsigned UseIdx = -1; for (unsigned OpNum = 0; OpNum < DDst->getNumOperands(); OpNum++) { @@ -375,15 +374,15 @@ void HexagonSubtarget::adjustSchedDependency(SUnit *Src, SUnit *Dst, void HexagonSubtarget::getPostRAMutations( std::vector> &Mutations) const { - Mutations.push_back(llvm::make_unique()); - Mutations.push_back(llvm::make_unique()); - Mutations.push_back(llvm::make_unique()); + Mutations.push_back(std::make_unique()); + Mutations.push_back(std::make_unique()); + Mutations.push_back(std::make_unique()); } void HexagonSubtarget::getSMSMutations( std::vector> &Mutations) const { - Mutations.push_back(llvm::make_unique()); - Mutations.push_back(llvm::make_unique()); + Mutations.push_back(std::make_unique()); + Mutations.push_back(std::make_unique()); } // Pin the vtable to this file. diff --git a/lib/Target/Hexagon/HexagonSubtarget.h b/lib/Target/Hexagon/HexagonSubtarget.h index 007423ef1902..31157a0065d9 100644 --- a/lib/Target/Hexagon/HexagonSubtarget.h +++ b/lib/Target/Hexagon/HexagonSubtarget.h @@ -228,7 +228,7 @@ public: } bool isHVXVectorType(MVT VecTy, bool IncludeBool = false) const { - if (!VecTy.isVector() || !useHVXOps()) + if (!VecTy.isVector() || !useHVXOps() || VecTy.isScalableVector()) return false; MVT ElemTy = VecTy.getVectorElementType(); if (!IncludeBool && ElemTy == MVT::i1) diff --git a/lib/Target/Hexagon/HexagonTargetMachine.cpp b/lib/Target/Hexagon/HexagonTargetMachine.cpp index 80b8480448fe..d709a82be660 100644 --- a/lib/Target/Hexagon/HexagonTargetMachine.cpp +++ b/lib/Target/Hexagon/HexagonTargetMachine.cpp @@ -111,10 +111,10 @@ int HexagonTargetMachineModule = 0; static ScheduleDAGInstrs *createVLIWMachineSched(MachineSchedContext *C) { ScheduleDAGMILive *DAG = - new VLIWMachineScheduler(C, make_unique()); - DAG->addMutation(make_unique()); - DAG->addMutation(make_unique()); - DAG->addMutation(make_unique()); + new VLIWMachineScheduler(C, std::make_unique()); + DAG->addMutation(std::make_unique()); + DAG->addMutation(std::make_unique()); + DAG->addMutation(std::make_unique()); DAG->addMutation(createCopyConstrainDAGMutation(DAG->TII, DAG->TRI)); return DAG; } @@ -218,7 +218,7 @@ HexagonTargetMachine::HexagonTargetMachine(const Target &T, const Triple &TT, TT, CPU, FS, Options, getEffectiveRelocModel(RM), getEffectiveCodeModel(CM, CodeModel::Small), (HexagonNoOpt ? CodeGenOpt::None : OL)), - TLOF(make_unique()) { + TLOF(std::make_unique()) { initializeHexagonExpandCondsetsPass(*PassRegistry::getPassRegistry()); initAsmInfo(); } @@ -244,7 +244,7 @@ HexagonTargetMachine::getSubtargetImpl(const Function &F) const { // creation will depend on the TM and the code generation flags on the // function that reside in TargetOptions. 
resetTargetOptions(F); - I = llvm::make_unique(TargetTriple, CPU, FS, *this); + I = std::make_unique(TargetTriple, CPU, FS, *this); } return I.get(); } diff --git a/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp b/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp index 38062e8e922c..ddbc5543348d 100644 --- a/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp +++ b/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp @@ -45,6 +45,8 @@ bool HexagonTTIImpl::useHVX() const { bool HexagonTTIImpl::isTypeForHVX(Type *VecTy) const { assert(VecTy->isVectorTy()); + if (cast(VecTy)->isScalable()) + return false; // Avoid types like <2 x i32*>. if (!cast(VecTy)->getElementType()->isIntegerTy()) return false; diff --git a/lib/Target/Hexagon/HexagonTargetTransformInfo.h b/lib/Target/Hexagon/HexagonTargetTransformInfo.h index 27e8fc019007..12ede503af83 100644 --- a/lib/Target/Hexagon/HexagonTargetTransformInfo.h +++ b/lib/Target/Hexagon/HexagonTargetTransformInfo.h @@ -68,8 +68,8 @@ public: bool shouldFavorPostInc() const; // L1 cache prefetch. - unsigned getPrefetchDistance() const; - unsigned getCacheLineSize() const; + unsigned getPrefetchDistance() const override; + unsigned getCacheLineSize() const override; /// @} diff --git a/lib/Target/Hexagon/HexagonVExtract.cpp b/lib/Target/Hexagon/HexagonVExtract.cpp index a9692f42e468..0c0266a6839a 100644 --- a/lib/Target/Hexagon/HexagonVExtract.cpp +++ b/lib/Target/Hexagon/HexagonVExtract.cpp @@ -67,9 +67,9 @@ unsigned HexagonVExtract::genElemLoad(MachineInstr *ExtI, unsigned BaseR, MachineRegisterInfo &MRI) { MachineBasicBlock &ExtB = *ExtI->getParent(); DebugLoc DL = ExtI->getDebugLoc(); - unsigned ElemR = MRI.createVirtualRegister(&Hexagon::IntRegsRegClass); + Register ElemR = MRI.createVirtualRegister(&Hexagon::IntRegsRegClass); - unsigned ExtIdxR = ExtI->getOperand(2).getReg(); + Register ExtIdxR = ExtI->getOperand(2).getReg(); unsigned ExtIdxS = ExtI->getOperand(2).getSubReg(); // Simplified check for a compile-time constant value of ExtIdxR. @@ -86,7 +86,7 @@ unsigned HexagonVExtract::genElemLoad(MachineInstr *ExtI, unsigned BaseR, } } - unsigned IdxR = MRI.createVirtualRegister(&Hexagon::IntRegsRegClass); + Register IdxR = MRI.createVirtualRegister(&Hexagon::IntRegsRegClass); BuildMI(ExtB, ExtI, DL, HII->get(Hexagon::A2_andir), IdxR) .add(ExtI->getOperand(2)) .addImm(-4); @@ -111,7 +111,7 @@ bool HexagonVExtract::runOnMachineFunction(MachineFunction &MF) { unsigned Opc = MI.getOpcode(); if (Opc != Hexagon::V6_extractw) continue; - unsigned VecR = MI.getOperand(1).getReg(); + Register VecR = MI.getOperand(1).getReg(); VExtractMap[VecR].push_back(&MI); } } @@ -144,13 +144,13 @@ bool HexagonVExtract::runOnMachineFunction(MachineFunction &MF) { MachineBasicBlock &ExtB = *ExtI->getParent(); DebugLoc DL = ExtI->getDebugLoc(); - unsigned BaseR = MRI.createVirtualRegister(&Hexagon::IntRegsRegClass); + Register BaseR = MRI.createVirtualRegister(&Hexagon::IntRegsRegClass); BuildMI(ExtB, ExtI, DL, HII->get(Hexagon::PS_fi), BaseR) .addFrameIndex(FI) .addImm(SR == 0 ? 
0 : VecSize/2); unsigned ElemR = genElemLoad(ExtI, BaseR, MRI); - unsigned ExtR = ExtI->getOperand(0).getReg(); + Register ExtR = ExtI->getOperand(0).getReg(); MRI.replaceRegWith(ExtR, ElemR); ExtB.erase(ExtI); Changed = true; diff --git a/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp b/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp index 3619e4c239d7..fab5edefb553 100644 --- a/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp +++ b/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp @@ -24,6 +24,7 @@ #include "llvm/ADT/BitVector.h" #include "llvm/ADT/DenseSet.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/StringExtras.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineBranchProbabilityInfo.h" @@ -57,9 +58,9 @@ static cl::opt DisablePacketizer("disable-packetizer", cl::Hidden, cl::ZeroOrMore, cl::init(false), cl::desc("Disable Hexagon packetizer pass")); -cl::opt Slot1Store("slot1-store-slot0-load", cl::Hidden, - cl::ZeroOrMore, cl::init(true), - cl::desc("Allow slot1 store and slot0 load")); +static cl::opt Slot1Store("slot1-store-slot0-load", cl::Hidden, + cl::ZeroOrMore, cl::init(true), + cl::desc("Allow slot1 store and slot0 load")); static cl::opt PacketizeVolatiles("hexagon-packetize-volatiles", cl::ZeroOrMore, cl::Hidden, cl::init(true), @@ -129,16 +130,16 @@ INITIALIZE_PASS_END(HexagonPacketizer, "hexagon-packetizer", "Hexagon Packetizer", false, false) HexagonPacketizerList::HexagonPacketizerList(MachineFunction &MF, - MachineLoopInfo &MLI, AliasAnalysis *AA, + MachineLoopInfo &MLI, AAResults *AA, const MachineBranchProbabilityInfo *MBPI, bool Minimal) : VLIWPacketizerList(MF, MLI, AA), MBPI(MBPI), MLI(&MLI), Minimal(Minimal) { HII = MF.getSubtarget().getInstrInfo(); HRI = MF.getSubtarget().getRegisterInfo(); - addMutation(llvm::make_unique()); - addMutation(llvm::make_unique()); - addMutation(llvm::make_unique()); + addMutation(std::make_unique()); + addMutation(std::make_unique()); + addMutation(std::make_unique()); } // Check if FirstI modifies a register that SecondI reads. @@ -148,7 +149,7 @@ static bool hasWriteToReadDep(const MachineInstr &FirstI, for (auto &MO : FirstI.operands()) { if (!MO.isReg() || !MO.isDef()) continue; - unsigned R = MO.getReg(); + Register R = MO.getReg(); if (SecondI.readsRegister(R, TRI)) return true; } @@ -422,7 +423,7 @@ bool HexagonPacketizerList::canPromoteToDotCur(const MachineInstr &MI, dbgs() << "Checking CUR against "; MJ.dump(); }); - unsigned DestReg = MI.getOperand(0).getReg(); + Register DestReg = MI.getOperand(0).getReg(); bool FoundMatch = false; for (auto &MO : MJ.operands()) if (MO.isReg() && MO.getReg() == DestReg) @@ -515,7 +516,7 @@ bool HexagonPacketizerList::updateOffset(SUnit *SUI, SUnit *SUJ) { unsigned BPJ, OPJ; if (!HII->getBaseAndOffsetPosition(MJ, BPJ, OPJ)) return false; - unsigned Reg = MI.getOperand(BPI).getReg(); + Register Reg = MI.getOperand(BPI).getReg(); if (Reg != MJ.getOperand(BPJ).getReg()) return false; // Make sure that the dependences do not restrict adding MI to the packet. 
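Throughout these hunks, plain unsigned register variables become llvm::Register, a thin wrapper around unsigned that keeps virtual registers, physical registers and stack-slot indices distinguishable. A sketch of the idiom, assuming LLVM's CodeGen headers:

  #include "llvm/CodeGen/Register.h"
  using llvm::Register;

  // MachineOperand::getReg() now hands back a Register; the static predicates
  // that used to live on TargetRegisterInfo are available on Register itself.
  void classify(Register R) {
    if (Register::isVirtualRegister(R)) {
      // SSA virtual register created by MachineRegisterInfo
    } else if (Register::isPhysicalRegister(R)) {
      // concrete target register
    }
  }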
@@ -788,7 +789,7 @@ bool HexagonPacketizerList::canPromoteToNewValueStore(const MachineInstr &MI, return false; if (!MO.isReg() || !MO.isDef() || !MO.isImplicit()) continue; - unsigned R = MO.getReg(); + Register R = MO.getReg(); if (R == DepReg || HRI->isSuperRegister(DepReg, R)) return false; } @@ -1208,7 +1209,7 @@ bool HexagonPacketizerList::hasDeadDependence(const MachineInstr &I, for (auto &MO : J.operands()) { if (!MO.isReg() || !MO.isDef() || !MO.isDead()) continue; - unsigned R = MO.getReg(); + Register R = MO.getReg(); if (R != Hexagon::USR_OVF && DeadDefs[R]) return true; } @@ -1585,7 +1586,7 @@ bool HexagonPacketizerList::isLegalToPacketizeTogether(SUnit *SUI, SUnit *SUJ) { // subset of the volatile register set. for (const MachineOperand &Op : I.operands()) { if (Op.isReg() && Op.isDef()) { - unsigned R = Op.getReg(); + Register R = Op.getReg(); if (!J.readsRegister(R, HRI) && !J.modifiesRegister(R, HRI)) continue; } else if (!Op.isRegMask()) { @@ -1763,6 +1764,16 @@ HexagonPacketizerList::addToPacket(MachineInstr &MI) { void HexagonPacketizerList::endPacket(MachineBasicBlock *MBB, MachineBasicBlock::iterator EndMI) { // Replace VLIWPacketizerList::endPacket(MBB, EndMI). + LLVM_DEBUG({ + if (!CurrentPacketMIs.empty()) { + dbgs() << "Finalizing packet:\n"; + unsigned Idx = 0; + for (MachineInstr *MI : CurrentPacketMIs) { + unsigned R = ResourceTracker->getUsedResources(Idx++); + dbgs() << " * [res:0x" << utohexstr(R) << "] " << *MI; + } + } + }); bool memShufDisabled = getmemShufDisabled(); if (memShufDisabled && !foundLSInPacket()) { diff --git a/lib/Target/Hexagon/HexagonVLIWPacketizer.h b/lib/Target/Hexagon/HexagonVLIWPacketizer.h index daa86b6f5393..943b9ac7ecc4 100644 --- a/lib/Target/Hexagon/HexagonVLIWPacketizer.h +++ b/lib/Target/Hexagon/HexagonVLIWPacketizer.h @@ -69,8 +69,7 @@ private: public: HexagonPacketizerList(MachineFunction &MF, MachineLoopInfo &MLI, - AliasAnalysis *AA, - const MachineBranchProbabilityInfo *MBPI, + AAResults *AA, const MachineBranchProbabilityInfo *MBPI, bool Minimal); // initPacketizerState - initialize some internal flags. diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp index 7c0770926abe..75cb398d4097 100644 --- a/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp +++ b/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp @@ -201,9 +201,7 @@ public: bool shouldForceRelocation(const MCAssembler &Asm, const MCFixup &Fixup, const MCValue &Target) override { - MCFixupKind Kind = Fixup.getKind(); - - switch((unsigned)Kind) { + switch(Fixup.getTargetKind()) { default: llvm_unreachable("Unknown Fixup Kind!"); @@ -583,7 +581,7 @@ public: return false; // If we cannot resolve the fixup value, it requires relaxation. 
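The asm-backend hunks below (and the ELF object-writer ones after them) replace switch ((unsigned)Fixup.getKind()) with switch (Fixup.getTargetKind()). A small sketch of the idiom, assuming the MC headers; the function is illustrative:

  #include "llvm/MC/MCFixup.h"

  unsigned sizeFor(const llvm::MCFixup &Fixup) {
    // getTargetKind() returns the fixup kind as a plain unsigned, so
    // target-specific kinds can be switched over without an explicit cast.
    switch (Fixup.getTargetKind()) {
    case llvm::FK_Data_4:   // generic kinds still participate as before
      return 4;
    default:
      return 0;
    }
  }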
if (!Resolved) { - switch ((unsigned)Fixup.getKind()) { + switch (Fixup.getTargetKind()) { case fixup_Hexagon_B22_PCREL: // GetFixupCount assumes B22 won't relax LLVM_FALLTHROUGH; diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonELFObjectWriter.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonELFObjectWriter.cpp index f678bf49322e..cdbeae38b3a1 100644 --- a/lib/Target/Hexagon/MCTargetDesc/HexagonELFObjectWriter.cpp +++ b/lib/Target/Hexagon/MCTargetDesc/HexagonELFObjectWriter.cpp @@ -44,7 +44,7 @@ unsigned HexagonELFObjectWriter::getRelocType(MCContext &Ctx, MCFixup const &Fixup, bool IsPCRel) const { MCSymbolRefExpr::VariantKind Variant = Target.getAccessVariant(); - switch ((unsigned)Fixup.getKind()) { + switch (Fixup.getTargetKind()) { default: report_fatal_error("Unrecognized relocation type"); break; @@ -299,5 +299,5 @@ unsigned HexagonELFObjectWriter::getRelocType(MCContext &Ctx, std::unique_ptr llvm::createHexagonELFObjectWriter(uint8_t OSABI, StringRef CPU) { - return llvm::make_unique(OSABI, CPU); + return std::make_unique(OSABI, CPU); } diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.cpp index fcd3758600c1..8b262bd0248e 100644 --- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.cpp +++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.cpp @@ -726,9 +726,6 @@ void HexagonMCChecker::reportNote(SMLoc Loc, llvm::Twine const &Msg) { } void HexagonMCChecker::reportWarning(Twine const &Msg) { - if (ReportErrors) { - auto SM = Context.getSourceManager(); - if (SM) - SM->PrintMessage(MCB.getLoc(), SourceMgr::DK_Warning, Msg); - } + if (ReportErrors) + Context.reportWarning(MCB.getLoc(), Msg); } diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp index f2432883af6f..a799f7f7c0b9 100644 --- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp +++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp @@ -116,8 +116,8 @@ void HexagonMCELFStreamer::HexagonMCEmitCommonSymbol(MCSymbol *Symbol, } // Update the maximum alignment of the section if necessary. - if (ByteAlignment > Section.getAlignment()) - Section.setAlignment(ByteAlignment); + if (Align(ByteAlignment) > Section.getAlignment()) + Section.setAlignment(Align(ByteAlignment)); SwitchSection(P.first, P.second); } else { diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp index 9c50b25156c3..870ab9e94a63 100644 --- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp +++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp @@ -72,7 +72,6 @@ cl::opt MV65("mv65", cl::Hidden, cl::desc("Build for Hexagon V65"), cl::init(false)); cl::opt MV66("mv66", cl::Hidden, cl::desc("Build for Hexagon V66"), cl::init(false)); -} // namespace cl::opt EnableHVX("mhvx", @@ -86,6 +85,7 @@ cl::opt clEnumValN(Hexagon::ArchEnum::Generic, "", "")), // Sentinel for flag not present. 
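The HexagonMCChecker hunk above routes warnings through MCContext::reportWarning instead of fetching the SourceMgr by hand, keeping the diagnostic plumbing in one place. A sketch of the call, assuming the MC headers; the free function is illustrative:

  #include "llvm/ADT/Twine.h"
  #include "llvm/MC/MCContext.h"
  #include "llvm/Support/SMLoc.h"

  void warnAt(llvm::MCContext &Ctx, llvm::SMLoc Loc, const llvm::Twine &Msg) {
    // MCContext owns the decision of how (and whether) to render the warning,
    // so callers no longer poke at the SourceMgr themselves.
    Ctx.reportWarning(Loc, Msg);
  }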
cl::init(Hexagon::ArchEnum::NoArch), cl::ValueOptional); +} // namespace static cl::opt DisableHVX("mno-hvx", cl::Hidden, @@ -264,14 +264,12 @@ createHexagonObjectTargetStreamer(MCStreamer &S, const MCSubtargetInfo &STI) { } static void LLVM_ATTRIBUTE_UNUSED clearFeature(MCSubtargetInfo* STI, uint64_t F) { - uint64_t FB = STI->getFeatureBits().to_ullong(); - if (FB & (1ULL << F)) + if (STI->getFeatureBits()[F]) STI->ToggleFeature(F); } static bool LLVM_ATTRIBUTE_UNUSED checkFeature(MCSubtargetInfo* STI, uint64_t F) { - uint64_t FB = STI->getFeatureBits().to_ullong(); - return (FB & (1ULL << F)) != 0; + return STI->getFeatureBits()[F]; } namespace { @@ -398,7 +396,7 @@ MCSubtargetInfo *Hexagon_MC::createHexagonMCSubtargetInfo(const Triple &TT, MCSubtargetInfo *X = createHexagonMCSubtargetInfoImpl(TT, CPUName, ArchFS); if (HexagonDisableDuplex) { llvm::FeatureBitset Features = X->getFeatureBits(); - X->setFeatureBits(Features.set(Hexagon::FeatureDuplex, false)); + X->setFeatureBits(Features.reset(Hexagon::FeatureDuplex)); } X->setFeatureBits(completeHVXFeatures(X->getFeatureBits())); diff --git a/lib/Target/Hexagon/RDFCopy.cpp b/lib/Target/Hexagon/RDFCopy.cpp index 7702024f87bd..a9d39fd4b2dc 100644 --- a/lib/Target/Hexagon/RDFCopy.cpp +++ b/lib/Target/Hexagon/RDFCopy.cpp @@ -45,8 +45,8 @@ bool CopyPropagation::interpretAsCopy(const MachineInstr *MI, EqualityMap &EM) { const MachineOperand &Src = MI->getOperand(1); RegisterRef DstR = DFG.makeRegRef(Dst.getReg(), Dst.getSubReg()); RegisterRef SrcR = DFG.makeRegRef(Src.getReg(), Src.getSubReg()); - assert(TargetRegisterInfo::isPhysicalRegister(DstR.Reg)); - assert(TargetRegisterInfo::isPhysicalRegister(SrcR.Reg)); + assert(Register::isPhysicalRegister(DstR.Reg)); + assert(Register::isPhysicalRegister(SrcR.Reg)); const TargetRegisterInfo &TRI = DFG.getTRI(); if (TRI.getMinimalPhysRegClass(DstR.Reg) != TRI.getMinimalPhysRegClass(SrcR.Reg)) diff --git a/lib/Target/Hexagon/RDFDeadCode.cpp b/lib/Target/Hexagon/RDFDeadCode.cpp index 52178931aa6d..af86c7b1956b 100644 --- a/lib/Target/Hexagon/RDFDeadCode.cpp +++ b/lib/Target/Hexagon/RDFDeadCode.cpp @@ -16,6 +16,7 @@ #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/Support/Debug.h" #include diff --git a/lib/Target/Hexagon/RDFGraph.cpp b/lib/Target/Hexagon/RDFGraph.cpp index 9d8f706b8a0f..0cb35dc98819 100644 --- a/lib/Target/Hexagon/RDFGraph.cpp +++ b/lib/Target/Hexagon/RDFGraph.cpp @@ -633,7 +633,7 @@ bool TargetOperandInfo::isFixedReg(const MachineInstr &In, unsigned OpNum) // uses or defs, and those lists do not allow sub-registers. if (Op.getSubReg() != 0) return false; - RegisterId Reg = Op.getReg(); + Register Reg = Op.getReg(); const MCPhysReg *ImpR = Op.isDef() ? 
D.getImplicitDefs() : D.getImplicitUses(); if (!ImpR) @@ -963,7 +963,7 @@ void DataFlowGraph::build(unsigned Options) { RegisterRef DataFlowGraph::makeRegRef(unsigned Reg, unsigned Sub) const { assert(PhysicalRegisterInfo::isRegMaskId(Reg) || - TargetRegisterInfo::isPhysicalRegister(Reg)); + Register::isPhysicalRegister(Reg)); assert(Reg != 0); if (Sub != 0) Reg = TRI.getSubReg(Reg, Sub); @@ -1291,8 +1291,8 @@ void DataFlowGraph::buildStmt(NodeAddr BA, MachineInstr &In) { MachineOperand &Op = In.getOperand(OpN); if (!Op.isReg() || !Op.isDef() || Op.isImplicit()) continue; - unsigned R = Op.getReg(); - if (!R || !TargetRegisterInfo::isPhysicalRegister(R)) + Register R = Op.getReg(); + if (!R || !Register::isPhysicalRegister(R)) continue; uint16_t Flags = NodeAttrs::None; if (TOI.isPreserving(In, OpN)) { @@ -1336,8 +1336,8 @@ void DataFlowGraph::buildStmt(NodeAddr BA, MachineInstr &In) { MachineOperand &Op = In.getOperand(OpN); if (!Op.isReg() || !Op.isDef() || !Op.isImplicit()) continue; - unsigned R = Op.getReg(); - if (!R || !TargetRegisterInfo::isPhysicalRegister(R) || DoneDefs.test(R)) + Register R = Op.getReg(); + if (!R || !Register::isPhysicalRegister(R) || DoneDefs.test(R)) continue; RegisterRef RR = makeRegRef(Op); uint16_t Flags = NodeAttrs::None; @@ -1365,8 +1365,8 @@ void DataFlowGraph::buildStmt(NodeAddr BA, MachineInstr &In) { MachineOperand &Op = In.getOperand(OpN); if (!Op.isReg() || !Op.isUse()) continue; - unsigned R = Op.getReg(); - if (!R || !TargetRegisterInfo::isPhysicalRegister(R)) + Register R = Op.getReg(); + if (!R || !Register::isPhysicalRegister(R)) continue; uint16_t Flags = NodeAttrs::None; if (Op.isUndef()) diff --git a/lib/Target/Hexagon/RDFLiveness.cpp b/lib/Target/Hexagon/RDFLiveness.cpp index 9cd304aa10bc..7d7b89462ff9 100644 --- a/lib/Target/Hexagon/RDFLiveness.cpp +++ b/lib/Target/Hexagon/RDFLiveness.cpp @@ -889,8 +889,8 @@ void Liveness::resetKills(MachineBasicBlock *B) { // implicit defs. if (!Op.isReg() || !Op.isDef() || Op.isImplicit()) continue; - unsigned R = Op.getReg(); - if (!TargetRegisterInfo::isPhysicalRegister(R)) + Register R = Op.getReg(); + if (!Register::isPhysicalRegister(R)) continue; for (MCSubRegIterator SR(R, &TRI, true); SR.isValid(); ++SR) Live.reset(*SR); @@ -898,8 +898,8 @@ void Liveness::resetKills(MachineBasicBlock *B) { for (auto &Op : MI->operands()) { if (!Op.isReg() || !Op.isUse() || Op.isUndef()) continue; - unsigned R = Op.getReg(); - if (!TargetRegisterInfo::isPhysicalRegister(R)) + Register R = Op.getReg(); + if (!Register::isPhysicalRegister(R)) continue; bool IsLive = false; for (MCRegAliasIterator AR(R, &TRI, true); AR.isValid(); ++AR) { diff --git a/lib/Target/Hexagon/RDFRegisters.cpp b/lib/Target/Hexagon/RDFRegisters.cpp index 6e0f33695f0e..b5675784e34b 100644 --- a/lib/Target/Hexagon/RDFRegisters.cpp +++ b/lib/Target/Hexagon/RDFRegisters.cpp @@ -101,7 +101,7 @@ RegisterRef PhysicalRegisterInfo::normalize(RegisterRef RR) const { std::set PhysicalRegisterInfo::getAliasSet(RegisterId Reg) const { // Do not include RR in the alias set. 
std::set AS; - assert(isRegMaskId(Reg) || TargetRegisterInfo::isPhysicalRegister(Reg)); + assert(isRegMaskId(Reg) || Register::isPhysicalRegister(Reg)); if (isRegMaskId(Reg)) { // XXX SLOW const uint32_t *MB = getRegMaskBits(Reg); @@ -129,8 +129,8 @@ std::set PhysicalRegisterInfo::getAliasSet(RegisterId Reg) const { } bool PhysicalRegisterInfo::aliasRR(RegisterRef RA, RegisterRef RB) const { - assert(TargetRegisterInfo::isPhysicalRegister(RA.Reg)); - assert(TargetRegisterInfo::isPhysicalRegister(RB.Reg)); + assert(Register::isPhysicalRegister(RA.Reg)); + assert(Register::isPhysicalRegister(RB.Reg)); MCRegUnitMaskIterator UMA(RA.Reg, &TRI); MCRegUnitMaskIterator UMB(RB.Reg, &TRI); @@ -160,7 +160,7 @@ bool PhysicalRegisterInfo::aliasRR(RegisterRef RA, RegisterRef RB) const { } bool PhysicalRegisterInfo::aliasRM(RegisterRef RR, RegisterRef RM) const { - assert(TargetRegisterInfo::isPhysicalRegister(RR.Reg) && isRegMaskId(RM.Reg)); + assert(Register::isPhysicalRegister(RR.Reg) && isRegMaskId(RM.Reg)); const uint32_t *MB = getRegMaskBits(RM.Reg); bool Preserved = MB[RR.Reg/32] & (1u << (RR.Reg%32)); // If the lane mask information is "full", e.g. when the given lane mask diff --git a/lib/Target/Hexagon/RDFRegisters.h b/lib/Target/Hexagon/RDFRegisters.h index 646233bacda5..4afaf80e4659 100644 --- a/lib/Target/Hexagon/RDFRegisters.h +++ b/lib/Target/Hexagon/RDFRegisters.h @@ -99,15 +99,15 @@ namespace rdf { const MachineFunction &mf); static bool isRegMaskId(RegisterId R) { - return TargetRegisterInfo::isStackSlot(R); + return Register::isStackSlot(R); } RegisterId getRegMaskId(const uint32_t *RM) const { - return TargetRegisterInfo::index2StackSlot(RegMasks.find(RM)); + return Register::index2StackSlot(RegMasks.find(RM)); } const uint32_t *getRegMaskBits(RegisterId R) const { - return RegMasks.get(TargetRegisterInfo::stackSlot2Index(R)); + return RegMasks.get(Register::stackSlot2Index(R)); } RegisterRef normalize(RegisterRef RR) const; @@ -125,7 +125,7 @@ namespace rdf { } const BitVector &getMaskUnits(RegisterId MaskId) const { - return MaskInfos[TargetRegisterInfo::stackSlot2Index(MaskId)].Units; + return MaskInfos[Register::stackSlot2Index(MaskId)].Units; } RegisterRef mapTo(RegisterRef RR, unsigned R) const; diff --git a/lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp b/lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp index 9af8a0b35b2f..ec82e3a41f2a 100644 --- a/lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp +++ b/lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp @@ -469,13 +469,14 @@ public: else if (isa(getImm())) { #ifndef NDEBUG const LanaiMCExpr *SymbolRefExpr = dyn_cast(getImm()); - assert(SymbolRefExpr->getKind() == LanaiMCExpr::VK_Lanai_ABS_LO); + assert(SymbolRefExpr && + SymbolRefExpr->getKind() == LanaiMCExpr::VK_Lanai_ABS_LO); #endif Inst.addOperand(MCOperand::createExpr(getImm())); } else if (isa(getImm())) { #ifndef NDEBUG const MCBinaryExpr *BinaryExpr = dyn_cast(getImm()); - assert(isa(BinaryExpr->getLHS()) && + assert(BinaryExpr && isa(BinaryExpr->getLHS()) && cast(BinaryExpr->getLHS())->getKind() == LanaiMCExpr::VK_Lanai_ABS_LO); #endif @@ -499,13 +500,14 @@ public: else if (isa(getImm())) { #ifndef NDEBUG const LanaiMCExpr *SymbolRefExpr = dyn_cast(getImm()); - assert(SymbolRefExpr->getKind() == LanaiMCExpr::VK_Lanai_ABS_HI); + assert(SymbolRefExpr && + SymbolRefExpr->getKind() == LanaiMCExpr::VK_Lanai_ABS_HI); #endif Inst.addOperand(MCOperand::createExpr(getImm())); } else if (isa(getImm())) { #ifndef NDEBUG const MCBinaryExpr *BinaryExpr = dyn_cast(getImm()); - 
assert(isa(BinaryExpr->getLHS()) && + assert(BinaryExpr && isa(BinaryExpr->getLHS()) && cast(BinaryExpr->getLHS())->getKind() == LanaiMCExpr::VK_Lanai_ABS_HI); #endif @@ -544,10 +546,9 @@ public: } else if (isa(getImm())) { #ifndef NDEBUG const MCBinaryExpr *BinaryExpr = dyn_cast(getImm()); - const LanaiMCExpr *SymbolRefExpr = - dyn_cast(BinaryExpr->getLHS()); - assert(SymbolRefExpr && - SymbolRefExpr->getKind() == LanaiMCExpr::VK_Lanai_None); + assert(BinaryExpr && isa(BinaryExpr->getLHS()) && + cast(BinaryExpr->getLHS())->getKind() == + LanaiMCExpr::VK_Lanai_None); #endif Inst.addOperand(MCOperand::createExpr(getImm())); } else @@ -580,7 +581,7 @@ public: } static std::unique_ptr CreateToken(StringRef Str, SMLoc Start) { - auto Op = make_unique(TOKEN); + auto Op = std::make_unique(TOKEN); Op->Tok.Data = Str.data(); Op->Tok.Length = Str.size(); Op->StartLoc = Start; @@ -590,7 +591,7 @@ public: static std::unique_ptr createReg(unsigned RegNum, SMLoc Start, SMLoc End) { - auto Op = make_unique(REGISTER); + auto Op = std::make_unique(REGISTER); Op->Reg.RegNum = RegNum; Op->StartLoc = Start; Op->EndLoc = End; @@ -599,7 +600,7 @@ public: static std::unique_ptr createImm(const MCExpr *Value, SMLoc Start, SMLoc End) { - auto Op = make_unique(IMMEDIATE); + auto Op = std::make_unique(IMMEDIATE); Op->Imm.Value = Value; Op->StartLoc = Start; Op->EndLoc = End; diff --git a/lib/Target/Lanai/LanaiAsmPrinter.cpp b/lib/Target/Lanai/LanaiAsmPrinter.cpp index 64d963475e1a..12a3202446a8 100644 --- a/lib/Target/Lanai/LanaiAsmPrinter.cpp +++ b/lib/Target/Lanai/LanaiAsmPrinter.cpp @@ -133,7 +133,7 @@ bool LanaiAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, const MachineOperand &MO = MI->getOperand(RegOp); if (!MO.isReg()) return true; - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); O << LanaiInstPrinter::getRegisterName(Reg); return false; } diff --git a/lib/Target/Lanai/LanaiDelaySlotFiller.cpp b/lib/Target/Lanai/LanaiDelaySlotFiller.cpp index 09c63dca23e2..b9e577d201f9 100644 --- a/lib/Target/Lanai/LanaiDelaySlotFiller.cpp +++ b/lib/Target/Lanai/LanaiDelaySlotFiller.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// // -// Simple pass to fills delay slots with useful instructions. +// Simple pass to fill delay slots with useful instructions. 
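The LanaiAsmParser hunks above tighten the NDEBUG-only asserts so the result of dyn_cast is tested before it is dereferenced, since dyn_cast yields null on a type mismatch. A self-contained sketch of that pattern using an illustrative two-class hierarchy with LLVM-style RTTI:

  #include "llvm/Support/Casting.h"
  #include <cassert>

  struct Expr {
    enum Kind { K_Sym, K_Bin } TheKind;
    explicit Expr(Kind K) : TheKind(K) {}
  };
  struct SymExpr : Expr {
    SymExpr() : Expr(K_Sym) {}
    static bool classof(const Expr *E) { return E->TheKind == K_Sym; }
  };

  void check(const Expr *E) {
  #ifndef NDEBUG
    // Test the pointer before using it inside the assert, exactly as the
    // hunks above now do for the LanaiMCExpr / MCBinaryExpr operands.
    const auto *S = llvm::dyn_cast<SymExpr>(E);
    assert(S && "expected a SymExpr here");
  #endif
    (void)E;
  }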
// //===----------------------------------------------------------------------===// diff --git a/lib/Target/Lanai/LanaiFrameLowering.cpp b/lib/Target/Lanai/LanaiFrameLowering.cpp index 142c09c504cc..eddc2b8e61f7 100644 --- a/lib/Target/Lanai/LanaiFrameLowering.cpp +++ b/lib/Target/Lanai/LanaiFrameLowering.cpp @@ -72,8 +72,8 @@ void LanaiFrameLowering::replaceAdjDynAllocPseudo(MachineFunction &MF) const { MachineInstr &MI = *MBBI++; if (MI.getOpcode() == Lanai::ADJDYNALLOC) { DebugLoc DL = MI.getDebugLoc(); - unsigned Dst = MI.getOperand(0).getReg(); - unsigned Src = MI.getOperand(1).getReg(); + Register Dst = MI.getOperand(0).getReg(); + Register Src = MI.getOperand(1).getReg(); BuildMI(*MBB, MI, DL, LII.get(Lanai::ADD_I_LO), Dst) .addReg(Src) diff --git a/lib/Target/Lanai/LanaiFrameLowering.h b/lib/Target/Lanai/LanaiFrameLowering.h index 5fe4535543ec..380d63df7301 100644 --- a/lib/Target/Lanai/LanaiFrameLowering.h +++ b/lib/Target/Lanai/LanaiFrameLowering.h @@ -31,7 +31,7 @@ protected: public: explicit LanaiFrameLowering(const LanaiSubtarget &Subtarget) : TargetFrameLowering(StackGrowsDown, - /*StackAlignment=*/8, + /*StackAlignment=*/Align(8), /*LocalAreaOffset=*/0), STI(Subtarget) {} diff --git a/lib/Target/Lanai/LanaiISelLowering.cpp b/lib/Target/Lanai/LanaiISelLowering.cpp index 1ed078bb433f..43933d062a7e 100644 --- a/lib/Target/Lanai/LanaiISelLowering.cpp +++ b/lib/Target/Lanai/LanaiISelLowering.cpp @@ -144,9 +144,9 @@ LanaiTargetLowering::LanaiTargetLowering(const TargetMachine &TM, setTargetDAGCombine(ISD::OR); setTargetDAGCombine(ISD::XOR); - // Function alignments (log2) - setMinFunctionAlignment(2); - setPrefFunctionAlignment(2); + // Function alignments + setMinFunctionAlignment(Align(4)); + setPrefFunctionAlignment(Align(4)); setJumpIsExpensive(true); @@ -212,10 +212,11 @@ SDValue LanaiTargetLowering::LowerOperation(SDValue Op, // Lanai Inline Assembly Support //===----------------------------------------------------------------------===// -unsigned LanaiTargetLowering::getRegisterByName(const char *RegName, EVT /*VT*/, - SelectionDAG & /*DAG*/) const { +Register LanaiTargetLowering::getRegisterByName( + const char *RegName, EVT /*VT*/, + const MachineFunction & /*MF*/) const { // Only unallocatable registers should be matched here. 
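In the LanaiISelLowering hunk below, getRegisterByName now returns Register and takes the MachineFunction rather than the SelectionDAG; the lookup itself still goes through llvm::StringSwitch. A sketch of that lookup idiom, assuming the ADT headers; the register values are illustrative:

  #include "llvm/ADT/StringRef.h"
  #include "llvm/ADT/StringSwitch.h"

  enum IllustrativeReg : unsigned { NoReg = 0, SP = 1, FP = 2 };

  unsigned lookupReg(llvm::StringRef Name) {
    // StringSwitch chains Case() matchers and falls back to Default().
    return llvm::StringSwitch<unsigned>(Name)
        .Case("sp", SP)
        .Case("fp", FP)
        .Default(NoReg);
  }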
- unsigned Reg = StringSwitch(RegName) + Register Reg = StringSwitch(RegName) .Case("pc", Lanai::PC) .Case("sp", Lanai::SP) .Case("fp", Lanai::FP) @@ -459,7 +460,7 @@ SDValue LanaiTargetLowering::LowerCCCArguments( EVT RegVT = VA.getLocVT(); switch (RegVT.getSimpleVT().SimpleTy) { case MVT::i32: { - unsigned VReg = RegInfo.createVirtualRegister(&Lanai::GPRRegClass); + Register VReg = RegInfo.createVirtualRegister(&Lanai::GPRRegClass); RegInfo.addLiveIn(VA.getLocReg(), VReg); SDValue ArgValue = DAG.getCopyFromReg(Chain, DL, VReg, RegVT); diff --git a/lib/Target/Lanai/LanaiISelLowering.h b/lib/Target/Lanai/LanaiISelLowering.h index e7b5755e9041..4c35a2c6fb8e 100644 --- a/lib/Target/Lanai/LanaiISelLowering.h +++ b/lib/Target/Lanai/LanaiISelLowering.h @@ -90,8 +90,8 @@ public: SDValue LowerSRL_PARTS(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const; - unsigned getRegisterByName(const char *RegName, EVT VT, - SelectionDAG &DAG) const override; + Register getRegisterByName(const char *RegName, EVT VT, + const MachineFunction &MF) const override; std::pair getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override; diff --git a/lib/Target/Lanai/LanaiInstrInfo.cpp b/lib/Target/Lanai/LanaiInstrInfo.cpp index 700a86069102..b950fd0424ef 100644 --- a/lib/Target/Lanai/LanaiInstrInfo.cpp +++ b/lib/Target/Lanai/LanaiInstrInfo.cpp @@ -86,8 +86,7 @@ void LanaiInstrInfo::loadRegFromStackSlot( } bool LanaiInstrInfo::areMemAccessesTriviallyDisjoint( - const MachineInstr &MIa, const MachineInstr &MIb, - AliasAnalysis * /*AA*/) const { + const MachineInstr &MIa, const MachineInstr &MIb) const { assert(MIa.mayLoadOrStore() && "MIa must be a load or store."); assert(MIb.mayLoadOrStore() && "MIb must be a load or store."); @@ -457,7 +456,7 @@ bool LanaiInstrInfo::analyzeSelect(const MachineInstr &MI, // return the defining instruction. static MachineInstr *canFoldIntoSelect(unsigned Reg, const MachineRegisterInfo &MRI) { - if (!TargetRegisterInfo::isVirtualRegister(Reg)) + if (!Register::isVirtualRegister(Reg)) return nullptr; if (!MRI.hasOneNonDBGUse(Reg)) return nullptr; @@ -479,7 +478,7 @@ static MachineInstr *canFoldIntoSelect(unsigned Reg, // MI can't have any tied operands, that would conflict with predication. if (MO.isTied()) return nullptr; - if (TargetRegisterInfo::isPhysicalRegister(MO.getReg())) + if (Register::isPhysicalRegister(MO.getReg())) return nullptr; if (MO.isDef() && !MO.isDead()) return nullptr; @@ -505,7 +504,7 @@ LanaiInstrInfo::optimizeSelect(MachineInstr &MI, // Find new register class to use. MachineOperand FalseReg = MI.getOperand(Invert ? 
1 : 2); - unsigned DestReg = MI.getOperand(0).getReg(); + Register DestReg = MI.getOperand(0).getReg(); const TargetRegisterClass *PreviousClass = MRI.getRegClass(FalseReg.getReg()); if (!MRI.constrainRegClass(DestReg, PreviousClass)) return nullptr; diff --git a/lib/Target/Lanai/LanaiInstrInfo.h b/lib/Target/Lanai/LanaiInstrInfo.h index d71424aeb0b1..59a04d2cc388 100644 --- a/lib/Target/Lanai/LanaiInstrInfo.h +++ b/lib/Target/Lanai/LanaiInstrInfo.h @@ -36,8 +36,7 @@ public: } bool areMemAccessesTriviallyDisjoint(const MachineInstr &MIa, - const MachineInstr &MIb, - AliasAnalysis *AA) const override; + const MachineInstr &MIb) const override; unsigned isLoadFromStackSlot(const MachineInstr &MI, int &FrameIndex) const override; diff --git a/lib/Target/Lanai/LanaiRegisterInfo.cpp b/lib/Target/Lanai/LanaiRegisterInfo.cpp index d3056a1eba8e..7c28debb94dd 100644 --- a/lib/Target/Lanai/LanaiRegisterInfo.cpp +++ b/lib/Target/Lanai/LanaiRegisterInfo.cpp @@ -155,7 +155,7 @@ void LanaiRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, if (!HasFP || (needsStackRealignment(MF) && FrameIndex >= 0)) Offset += MF.getFrameInfo().getStackSize(); - unsigned FrameReg = getFrameRegister(MF); + Register FrameReg = getFrameRegister(MF); if (FrameIndex >= 0) { if (hasBasePointer(MF)) FrameReg = getBaseRegister(); diff --git a/lib/Target/Lanai/MCTargetDesc/LanaiELFObjectWriter.cpp b/lib/Target/Lanai/MCTargetDesc/LanaiELFObjectWriter.cpp index 4313fa5a82b5..919d43ad9b9b 100644 --- a/lib/Target/Lanai/MCTargetDesc/LanaiELFObjectWriter.cpp +++ b/lib/Target/Lanai/MCTargetDesc/LanaiELFObjectWriter.cpp @@ -88,5 +88,5 @@ bool LanaiELFObjectWriter::needsRelocateWithSymbol(const MCSymbol & /*SD*/, std::unique_ptr llvm::createLanaiELFObjectWriter(uint8_t OSABI) { - return llvm::make_unique(OSABI); + return std::make_unique(OSABI); } diff --git a/lib/Target/MSP430/AsmParser/MSP430AsmParser.cpp b/lib/Target/MSP430/AsmParser/MSP430AsmParser.cpp index a0ec14ae2381..85dcc0f152f9 100644 --- a/lib/Target/MSP430/AsmParser/MSP430AsmParser.cpp +++ b/lib/Target/MSP430/AsmParser/MSP430AsmParser.cpp @@ -191,33 +191,33 @@ public: } static std::unique_ptr CreateToken(StringRef Str, SMLoc S) { - return make_unique(Str, S); + return std::make_unique(Str, S); } static std::unique_ptr CreateReg(unsigned RegNum, SMLoc S, SMLoc E) { - return make_unique(k_Reg, RegNum, S, E); + return std::make_unique(k_Reg, RegNum, S, E); } static std::unique_ptr CreateImm(const MCExpr *Val, SMLoc S, SMLoc E) { - return make_unique(Val, S, E); + return std::make_unique(Val, S, E); } static std::unique_ptr CreateMem(unsigned RegNum, const MCExpr *Val, SMLoc S, SMLoc E) { - return make_unique(RegNum, Val, S, E); + return std::make_unique(RegNum, Val, S, E); } static std::unique_ptr CreateIndReg(unsigned RegNum, SMLoc S, SMLoc E) { - return make_unique(k_IndReg, RegNum, S, E); + return std::make_unique(k_IndReg, RegNum, S, E); } static std::unique_ptr CreatePostIndReg(unsigned RegNum, SMLoc S, SMLoc E) { - return make_unique(k_PostIndReg, RegNum, S, E); + return std::make_unique(k_PostIndReg, RegNum, S, E); } SMLoc getStartLoc() const { return Start; } diff --git a/lib/Target/MSP430/MCTargetDesc/MSP430ELFObjectWriter.cpp b/lib/Target/MSP430/MCTargetDesc/MSP430ELFObjectWriter.cpp index 38b7da32c246..0cdd1f4f701f 100644 --- a/lib/Target/MSP430/MCTargetDesc/MSP430ELFObjectWriter.cpp +++ b/lib/Target/MSP430/MCTargetDesc/MSP430ELFObjectWriter.cpp @@ -31,7 +31,7 @@ protected: unsigned getRelocType(MCContext &Ctx, const MCValue &Target, const MCFixup 
&Fixup, bool IsPCRel) const override { // Translate fixup kind to ELF relocation type. - switch ((unsigned)Fixup.getKind()) { + switch (Fixup.getTargetKind()) { case FK_Data_1: return ELF::R_MSP430_8; case FK_Data_2: return ELF::R_MSP430_16_BYTE; case FK_Data_4: return ELF::R_MSP430_32; @@ -54,5 +54,5 @@ protected: std::unique_ptr llvm::createMSP430ELFObjectWriter(uint8_t OSABI) { - return llvm::make_unique(OSABI); + return std::make_unique(OSABI); } diff --git a/lib/Target/MSP430/MSP430AsmPrinter.cpp b/lib/Target/MSP430/MSP430AsmPrinter.cpp index 3a71a084d1af..a3b91acdc6d0 100644 --- a/lib/Target/MSP430/MSP430AsmPrinter.cpp +++ b/lib/Target/MSP430/MSP430AsmPrinter.cpp @@ -159,8 +159,9 @@ void MSP430AsmPrinter::EmitInstruction(const MachineInstr *MI) { void MSP430AsmPrinter::EmitInterruptVectorSection(MachineFunction &ISR) { MCSection *Cur = OutStreamer->getCurrentSectionOnly(); const auto *F = &ISR.getFunction(); - assert(F->hasFnAttribute("interrupt") && - "Functions with MSP430_INTR CC should have 'interrupt' attribute"); + if (F->getCallingConv() != CallingConv::MSP430_INTR) { + report_fatal_error("Functions with 'interrupt' attribute must have msp430_intrcc CC"); + } StringRef IVIdx = F->getFnAttribute("interrupt").getValueAsString(); MCSection *IV = OutStreamer->getContext().getELFSection( "__interrupt_vector_" + IVIdx, @@ -174,8 +175,9 @@ void MSP430AsmPrinter::EmitInterruptVectorSection(MachineFunction &ISR) { bool MSP430AsmPrinter::runOnMachineFunction(MachineFunction &MF) { // Emit separate section for an interrupt vector if ISR - if (MF.getFunction().getCallingConv() == CallingConv::MSP430_INTR) + if (MF.getFunction().hasFnAttribute("interrupt")) { EmitInterruptVectorSection(MF); + } SetupMachineFunction(MF); EmitFunctionBody(); diff --git a/lib/Target/MSP430/MSP430BranchSelector.cpp b/lib/Target/MSP430/MSP430BranchSelector.cpp index 45e7c26e4d30..ce5affdc25b0 100644 --- a/lib/Target/MSP430/MSP430BranchSelector.cpp +++ b/lib/Target/MSP430/MSP430BranchSelector.cpp @@ -20,6 +20,7 @@ #include "llvm/ADT/Statistic.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/Support/Debug.h" #include "llvm/Support/MathExtras.h" #include "llvm/Target/TargetMachine.h" using namespace llvm; diff --git a/lib/Target/MSP430/MSP430FrameLowering.h b/lib/Target/MSP430/MSP430FrameLowering.h index 33ce3c70a2a3..70e284053021 100644 --- a/lib/Target/MSP430/MSP430FrameLowering.h +++ b/lib/Target/MSP430/MSP430FrameLowering.h @@ -22,7 +22,8 @@ protected: public: explicit MSP430FrameLowering() - : TargetFrameLowering(TargetFrameLowering::StackGrowsDown, 2, -2, 2) {} + : TargetFrameLowering(TargetFrameLowering::StackGrowsDown, Align(2), -2, + Align(2)) {} /// emitProlog/emitEpilog - These methods insert prolog and epilog code into /// the function. 
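The frame-lowering constructors here and the setMinFunctionAlignment / setPrefFunctionAlignment calls in the neighbouring ISelLowering hunks move from raw integers (sometimes byte counts, sometimes log2 values) to llvm::Align, which always carries a power-of-two byte count; that is why Lanai's old log2 value 2 becomes Align(4). A small sketch, assuming the Support header:

  #include "llvm/Support/Alignment.h"
  #include <cassert>

  int main() {
    llvm::Align A(4);            // four-byte alignment; must be a power of two
    assert(A.value() == 4);      // the byte count itself
    assert(llvm::Log2(A) == 2);  // the old log2 encoding, when still needed
  }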
diff --git a/lib/Target/MSP430/MSP430ISelLowering.cpp b/lib/Target/MSP430/MSP430ISelLowering.cpp index fedfb857bd0f..64169d1f5eb1 100644 --- a/lib/Target/MSP430/MSP430ISelLowering.cpp +++ b/lib/Target/MSP430/MSP430ISelLowering.cpp @@ -327,8 +327,8 @@ MSP430TargetLowering::MSP430TargetLowering(const TargetMachine &TM, setLibcallCallingConv(RTLIB::OGT_F64, CallingConv::MSP430_BUILTIN); // TODO: __mspabi_srall, __mspabi_srlll, __mspabi_sllll - setMinFunctionAlignment(1); - setPrefFunctionAlignment(1); + setMinFunctionAlignment(Align(2)); + setPrefFunctionAlignment(Align(2)); } SDValue MSP430TargetLowering::LowerOperation(SDValue Op, @@ -353,6 +353,9 @@ SDValue MSP430TargetLowering::LowerOperation(SDValue Op, } } +unsigned MSP430TargetLowering::getShiftAmountThreshold(EVT VT) const { + return 2; +} //===----------------------------------------------------------------------===// // MSP430 Inline Assembly Support //===----------------------------------------------------------------------===// @@ -632,7 +635,7 @@ SDValue MSP430TargetLowering::LowerCCCArguments( llvm_unreachable(nullptr); } case MVT::i16: - unsigned VReg = RegInfo.createVirtualRegister(&MSP430::GR16RegClass); + Register VReg = RegInfo.createVirtualRegister(&MSP430::GR16RegClass); RegInfo.addLiveIn(VA.getLocReg(), VReg); SDValue ArgValue = DAG.getCopyFromReg(Chain, dl, VReg, RegVT); @@ -1446,8 +1449,8 @@ MSP430TargetLowering::EmitShiftInstr(MachineInstr &MI, case MSP430::Rrcl16: { BuildMI(*BB, MI, dl, TII.get(MSP430::BIC16rc), MSP430::SR) .addReg(MSP430::SR).addImm(1); - unsigned SrcReg = MI.getOperand(1).getReg(); - unsigned DstReg = MI.getOperand(0).getReg(); + Register SrcReg = MI.getOperand(1).getReg(); + Register DstReg = MI.getOperand(0).getReg(); unsigned RrcOpc = MI.getOpcode() == MSP430::Rrcl16 ? 
MSP430::RRC16r : MSP430::RRC8r; BuildMI(*BB, MI, dl, TII.get(RrcOpc), DstReg) @@ -1479,13 +1482,13 @@ MSP430TargetLowering::EmitShiftInstr(MachineInstr &MI, LoopBB->addSuccessor(RemBB); LoopBB->addSuccessor(LoopBB); - unsigned ShiftAmtReg = RI.createVirtualRegister(&MSP430::GR8RegClass); - unsigned ShiftAmtReg2 = RI.createVirtualRegister(&MSP430::GR8RegClass); - unsigned ShiftReg = RI.createVirtualRegister(RC); - unsigned ShiftReg2 = RI.createVirtualRegister(RC); - unsigned ShiftAmtSrcReg = MI.getOperand(2).getReg(); - unsigned SrcReg = MI.getOperand(1).getReg(); - unsigned DstReg = MI.getOperand(0).getReg(); + Register ShiftAmtReg = RI.createVirtualRegister(&MSP430::GR8RegClass); + Register ShiftAmtReg2 = RI.createVirtualRegister(&MSP430::GR8RegClass); + Register ShiftReg = RI.createVirtualRegister(RC); + Register ShiftReg2 = RI.createVirtualRegister(RC); + Register ShiftAmtSrcReg = MI.getOperand(2).getReg(); + Register SrcReg = MI.getOperand(1).getReg(); + Register DstReg = MI.getOperand(0).getReg(); // BB: // cmp 0, N diff --git a/lib/Target/MSP430/MSP430ISelLowering.h b/lib/Target/MSP430/MSP430ISelLowering.h index ee6b6316d7a9..9224e5e3d005 100644 --- a/lib/Target/MSP430/MSP430ISelLowering.h +++ b/lib/Target/MSP430/MSP430ISelLowering.h @@ -124,6 +124,8 @@ namespace llvm { bool isZExtFree(EVT VT1, EVT VT2) const override; bool isZExtFree(SDValue Val, EVT VT2) const override; + unsigned getShiftAmountThreshold(EVT VT) const override; + MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) const override; diff --git a/lib/Target/MSP430/MSP430RegisterInfo.cpp b/lib/Target/MSP430/MSP430RegisterInfo.cpp index afbb2f213b45..bec357a1548d 100644 --- a/lib/Target/MSP430/MSP430RegisterInfo.cpp +++ b/lib/Target/MSP430/MSP430RegisterInfo.cpp @@ -139,7 +139,7 @@ MSP430RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, return; // We need to materialize the offset via add instruction. 
- unsigned DstReg = MI.getOperand(0).getReg(); + Register DstReg = MI.getOperand(0).getReg(); if (Offset < 0) BuildMI(MBB, std::next(II), dl, TII.get(MSP430::SUB16ri), DstReg) .addReg(DstReg).addImm(-Offset); diff --git a/lib/Target/MSP430/MSP430TargetMachine.cpp b/lib/Target/MSP430/MSP430TargetMachine.cpp index 8c4ca982c966..e9aeba76de85 100644 --- a/lib/Target/MSP430/MSP430TargetMachine.cpp +++ b/lib/Target/MSP430/MSP430TargetMachine.cpp @@ -46,7 +46,7 @@ MSP430TargetMachine::MSP430TargetMachine(const Target &T, const Triple &TT, : LLVMTargetMachine(T, computeDataLayout(TT, CPU, Options), TT, CPU, FS, Options, getEffectiveRelocModel(RM), getEffectiveCodeModel(CM, CodeModel::Small), OL), - TLOF(make_unique()), + TLOF(std::make_unique()), Subtarget(TT, CPU, FS, *this) { initAsmInfo(); } diff --git a/lib/Target/Mips/AsmParser/MipsAsmParser.cpp b/lib/Target/Mips/AsmParser/MipsAsmParser.cpp index 1f7d095bf49b..21d0df74d458 100644 --- a/lib/Target/Mips/AsmParser/MipsAsmParser.cpp +++ b/lib/Target/Mips/AsmParser/MipsAsmParser.cpp @@ -39,6 +39,7 @@ #include "llvm/MC/MCSymbolELF.h" #include "llvm/MC/MCValue.h" #include "llvm/MC/SubtargetFeature.h" +#include "llvm/Support/Alignment.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Compiler.h" @@ -233,9 +234,14 @@ class MipsAsmParser : public MCTargetAsmParser { bool expandLoadImm(MCInst &Inst, bool Is32BitImm, SMLoc IDLoc, MCStreamer &Out, const MCSubtargetInfo *STI); - bool expandLoadImmReal(MCInst &Inst, bool IsSingle, bool IsGPR, bool Is64FPU, - SMLoc IDLoc, MCStreamer &Out, - const MCSubtargetInfo *STI); + bool expandLoadSingleImmToGPR(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, + const MCSubtargetInfo *STI); + bool expandLoadSingleImmToFPR(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, + const MCSubtargetInfo *STI); + bool expandLoadDoubleImmToGPR(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, + const MCSubtargetInfo *STI); + bool expandLoadDoubleImmToFPR(MCInst &Inst, bool Is64FPU, SMLoc IDLoc, + MCStreamer &Out, const MCSubtargetInfo *STI); bool expandLoadAddress(unsigned DstReg, unsigned BaseReg, const MCOperand &Offset, bool Is32BitAddress, @@ -512,11 +518,11 @@ public: // Remember the initial assembler options. The user can not modify these. AssemblerOptions.push_back( - llvm::make_unique(getSTI().getFeatureBits())); + std::make_unique(getSTI().getFeatureBits())); // Create an assembler options environment for the user to modify. 
AssemblerOptions.push_back( - llvm::make_unique(getSTI().getFeatureBits())); + std::make_unique(getSTI().getFeatureBits())); getTargetStreamer().updateABIInfo(*this); @@ -844,7 +850,7 @@ private: const MCRegisterInfo *RegInfo, SMLoc S, SMLoc E, MipsAsmParser &Parser) { - auto Op = llvm::make_unique(k_RegisterIndex, Parser); + auto Op = std::make_unique(k_RegisterIndex, Parser); Op->RegIdx.Index = Index; Op->RegIdx.RegInfo = RegInfo; Op->RegIdx.Kind = RegKind; @@ -1446,7 +1452,7 @@ public: static std::unique_ptr CreateToken(StringRef Str, SMLoc S, MipsAsmParser &Parser) { - auto Op = llvm::make_unique(k_Token, Parser); + auto Op = std::make_unique(k_Token, Parser); Op->Tok.Data = Str.data(); Op->Tok.Length = Str.size(); Op->StartLoc = S; @@ -1521,7 +1527,7 @@ public: static std::unique_ptr CreateImm(const MCExpr *Val, SMLoc S, SMLoc E, MipsAsmParser &Parser) { - auto Op = llvm::make_unique(k_Immediate, Parser); + auto Op = std::make_unique(k_Immediate, Parser); Op->Imm.Val = Val; Op->StartLoc = S; Op->EndLoc = E; @@ -1531,7 +1537,7 @@ public: static std::unique_ptr CreateMem(std::unique_ptr Base, const MCExpr *Off, SMLoc S, SMLoc E, MipsAsmParser &Parser) { - auto Op = llvm::make_unique(k_Memory, Parser); + auto Op = std::make_unique(k_Memory, Parser); Op->Mem.Base = Base.release(); Op->Mem.Off = Off; Op->StartLoc = S; @@ -1544,7 +1550,7 @@ public: MipsAsmParser &Parser) { assert(Regs.size() > 0 && "Empty list not allowed"); - auto Op = llvm::make_unique(k_RegList, Parser); + auto Op = std::make_unique(k_RegList, Parser); Op->RegList.List = new SmallVector(Regs.begin(), Regs.end()); Op->StartLoc = StartLoc; Op->EndLoc = EndLoc; @@ -1804,8 +1810,8 @@ bool MipsAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc, break; // We'll deal with this situation later on when applying fixups. if (!isIntN(inMicroMipsMode() ? 17 : 18, Offset.getImm())) return Error(IDLoc, "branch target out of range"); - if (OffsetToAlignment(Offset.getImm(), - 1LL << (inMicroMipsMode() ? 1 : 2))) + if (offsetToAlignment(Offset.getImm(), + (inMicroMipsMode() ? Align(2) : Align(4)))) return Error(IDLoc, "branch to misaligned address"); break; case Mips::BGEZ: @@ -1834,8 +1840,8 @@ bool MipsAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc, break; // We'll deal with this situation later on when applying fixups. if (!isIntN(inMicroMipsMode() ? 17 : 18, Offset.getImm())) return Error(IDLoc, "branch target out of range"); - if (OffsetToAlignment(Offset.getImm(), - 1LL << (inMicroMipsMode() ? 1 : 2))) + if (offsetToAlignment(Offset.getImm(), + (inMicroMipsMode() ? Align(2) : Align(4)))) return Error(IDLoc, "branch to misaligned address"); break; case Mips::BGEC: case Mips::BGEC_MMR6: @@ -1850,7 +1856,7 @@ bool MipsAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc, break; // We'll deal with this situation later on when applying fixups. if (!isIntN(18, Offset.getImm())) return Error(IDLoc, "branch target out of range"); - if (OffsetToAlignment(Offset.getImm(), 1LL << 2)) + if (offsetToAlignment(Offset.getImm(), Align(4))) return Error(IDLoc, "branch to misaligned address"); break; case Mips::BLEZC: case Mips::BLEZC_MMR6: @@ -1863,7 +1869,7 @@ bool MipsAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc, break; // We'll deal with this situation later on when applying fixups. 
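The branch checks in these MipsAsmParser hunks replace OffsetToAlignment(Imm, 1LL << N) with offsetToAlignment(Imm, Align(2)) or Align(4), keeping the semantics: the result is zero exactly when the target offset is already a multiple of the alignment. A sketch, assuming the Support header:

  #include "llvm/Support/Alignment.h"
  #include <cassert>

  int main() {
    // offsetToAlignment returns how many bytes must be added to reach the
    // next aligned value, so a non-zero result flags a misaligned branch target.
    assert(llvm::offsetToAlignment(8, llvm::Align(4)) == 0);
    assert(llvm::offsetToAlignment(6, llvm::Align(4)) == 2);
  }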
if (!isIntN(18, Offset.getImm())) return Error(IDLoc, "branch target out of range"); - if (OffsetToAlignment(Offset.getImm(), 1LL << 2)) + if (offsetToAlignment(Offset.getImm(), Align(4))) return Error(IDLoc, "branch to misaligned address"); break; case Mips::BEQZC: case Mips::BEQZC_MMR6: @@ -1874,7 +1880,7 @@ bool MipsAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc, break; // We'll deal with this situation later on when applying fixups. if (!isIntN(23, Offset.getImm())) return Error(IDLoc, "branch target out of range"); - if (OffsetToAlignment(Offset.getImm(), 1LL << 2)) + if (offsetToAlignment(Offset.getImm(), Align(4))) return Error(IDLoc, "branch to misaligned address"); break; case Mips::BEQZ16_MM: @@ -1887,7 +1893,7 @@ bool MipsAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc, break; // We'll deal with this situation later on when applying fixups. if (!isInt<8>(Offset.getImm())) return Error(IDLoc, "branch target out of range"); - if (OffsetToAlignment(Offset.getImm(), 2LL)) + if (offsetToAlignment(Offset.getImm(), Align(2))) return Error(IDLoc, "branch to misaligned address"); break; } @@ -2454,25 +2460,21 @@ MipsAsmParser::tryExpandInstruction(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, : MER_Success; case Mips::LoadImmSingleGPR: - return expandLoadImmReal(Inst, true, true, false, IDLoc, Out, STI) - ? MER_Fail - : MER_Success; + return expandLoadSingleImmToGPR(Inst, IDLoc, Out, STI) ? MER_Fail + : MER_Success; case Mips::LoadImmSingleFGR: - return expandLoadImmReal(Inst, true, false, false, IDLoc, Out, STI) - ? MER_Fail - : MER_Success; + return expandLoadSingleImmToFPR(Inst, IDLoc, Out, STI) ? MER_Fail + : MER_Success; case Mips::LoadImmDoubleGPR: - return expandLoadImmReal(Inst, false, true, false, IDLoc, Out, STI) - ? MER_Fail - : MER_Success; + return expandLoadDoubleImmToGPR(Inst, IDLoc, Out, STI) ? MER_Fail + : MER_Success; case Mips::LoadImmDoubleFGR: - return expandLoadImmReal(Inst, false, false, true, IDLoc, Out, STI) - ? MER_Fail - : MER_Success; + return expandLoadDoubleImmToFPR(Inst, true, IDLoc, Out, STI) ? MER_Fail + : MER_Success; case Mips::LoadImmDoubleFGR_32: - return expandLoadImmReal(Inst, false, false, false, IDLoc, Out, STI) - ? MER_Fail - : MER_Success; + return expandLoadDoubleImmToFPR(Inst, false, IDLoc, Out, STI) ? MER_Fail + : MER_Success; + case Mips::Ulh: return expandUlh(Inst, true, IDLoc, Out, STI) ? MER_Fail : MER_Success; case Mips::Ulhu: @@ -2868,12 +2870,12 @@ bool MipsAsmParser::loadAndAddSymbolAddress(const MCExpr *SymExpr, bool Is32BitSym, SMLoc IDLoc, MCStreamer &Out, const MCSubtargetInfo *STI) { - // FIXME: These expansions do not respect -mxgot. 
MipsTargetStreamer &TOut = getTargetStreamer(); - bool UseSrcReg = SrcReg != Mips::NoRegister; + bool UseSrcReg = SrcReg != Mips::NoRegister && SrcReg != Mips::ZERO && + SrcReg != Mips::ZERO_64; warnIfNoMacro(IDLoc); - if (inPicMode() && ABI.IsO32()) { + if (inPicMode()) { MCValue Res; if (!SymExpr->evaluateAsRelocatable(Res, nullptr, nullptr)) { Error(IDLoc, "expected relocatable expression"); @@ -2884,46 +2886,41 @@ bool MipsAsmParser::loadAndAddSymbolAddress(const MCExpr *SymExpr, return true; } + bool IsPtr64 = ABI.ArePtrs64bit(); + bool IsLocalSym = + Res.getSymA()->getSymbol().isInSection() || + Res.getSymA()->getSymbol().isTemporary() || + (Res.getSymA()->getSymbol().isELF() && + cast(Res.getSymA()->getSymbol()).getBinding() == + ELF::STB_LOCAL); + bool UseXGOT = STI->getFeatureBits()[Mips::FeatureXGOT] && !IsLocalSym; + // The case where the result register is $25 is somewhat special. If the // symbol in the final relocation is external and not modified with a - // constant then we must use R_MIPS_CALL16 instead of R_MIPS_GOT16. + // constant then we must use R_MIPS_CALL16 instead of R_MIPS_GOT16 + // or R_MIPS_CALL16 instead of R_MIPS_GOT_DISP in 64-bit case. if ((DstReg == Mips::T9 || DstReg == Mips::T9_64) && !UseSrcReg && - Res.getConstant() == 0 && - !(Res.getSymA()->getSymbol().isInSection() || - Res.getSymA()->getSymbol().isTemporary() || - (Res.getSymA()->getSymbol().isELF() && - cast(Res.getSymA()->getSymbol()).getBinding() == - ELF::STB_LOCAL))) { - const MCExpr *CallExpr = - MipsMCExpr::create(MipsMCExpr::MEK_GOT_CALL, SymExpr, getContext()); - TOut.emitRRX(Mips::LW, DstReg, GPReg, MCOperand::createExpr(CallExpr), - IDLoc, STI); + Res.getConstant() == 0 && !IsLocalSym) { + if (UseXGOT) { + const MCExpr *CallHiExpr = MipsMCExpr::create(MipsMCExpr::MEK_CALL_HI16, + SymExpr, getContext()); + const MCExpr *CallLoExpr = MipsMCExpr::create(MipsMCExpr::MEK_CALL_LO16, + SymExpr, getContext()); + TOut.emitRX(Mips::LUi, DstReg, MCOperand::createExpr(CallHiExpr), IDLoc, + STI); + TOut.emitRRR(IsPtr64 ? Mips::DADDu : Mips::ADDu, DstReg, DstReg, GPReg, + IDLoc, STI); + TOut.emitRRX(IsPtr64 ? Mips::LD : Mips::LW, DstReg, DstReg, + MCOperand::createExpr(CallLoExpr), IDLoc, STI); + } else { + const MCExpr *CallExpr = + MipsMCExpr::create(MipsMCExpr::MEK_GOT_CALL, SymExpr, getContext()); + TOut.emitRRX(IsPtr64 ? Mips::LD : Mips::LW, DstReg, GPReg, + MCOperand::createExpr(CallExpr), IDLoc, STI); + } return false; } - // The remaining cases are: - // External GOT: lw $tmp, %got(symbol+offset)($gp) - // >addiu $tmp, $tmp, %lo(offset) - // >addiu $rd, $tmp, $rs - // Local GOT: lw $tmp, %got(symbol+offset)($gp) - // addiu $tmp, $tmp, %lo(symbol+offset)($gp) - // >addiu $rd, $tmp, $rs - // The addiu's marked with a '>' may be omitted if they are redundant. If - // this happens then the last instruction must use $rd as the result - // register. - const MipsMCExpr *GotExpr = - MipsMCExpr::create(MipsMCExpr::MEK_GOT, SymExpr, getContext()); - const MCExpr *LoExpr = nullptr; - if (Res.getSymA()->getSymbol().isInSection() || - Res.getSymA()->getSymbol().isTemporary()) - LoExpr = MipsMCExpr::create(MipsMCExpr::MEK_LO, SymExpr, getContext()); - else if (Res.getConstant() != 0) { - // External symbols fully resolve the symbol with just the %got(symbol) - // but we must still account for any offset to the symbol for expressions - // like symbol+8. 
- LoExpr = MCConstantExpr::create(Res.getConstant(), getContext()); - } - unsigned TmpReg = DstReg; if (UseSrcReg && getContext().getRegisterInfo()->isSuperOrSubRegisterEq(DstReg, @@ -2936,94 +2933,102 @@ bool MipsAsmParser::loadAndAddSymbolAddress(const MCExpr *SymExpr, TmpReg = ATReg; } - TOut.emitRRX(Mips::LW, TmpReg, GPReg, MCOperand::createExpr(GotExpr), IDLoc, - STI); - - if (LoExpr) - TOut.emitRRX(Mips::ADDiu, TmpReg, TmpReg, MCOperand::createExpr(LoExpr), + if (UseXGOT) { + // Loading address from XGOT + // External GOT: lui $tmp, %got_hi(symbol)($gp) + // addu $tmp, $tmp, $gp + // lw $tmp, %got_lo(symbol)($tmp) + // >addiu $tmp, $tmp, offset + // >addiu $rd, $tmp, $rs + // The addiu's marked with a '>' may be omitted if they are redundant. If + // this happens then the last instruction must use $rd as the result + // register. + const MCExpr *CallHiExpr = + MipsMCExpr::create(MipsMCExpr::MEK_GOT_HI16, SymExpr, getContext()); + const MCExpr *CallLoExpr = MipsMCExpr::create( + MipsMCExpr::MEK_GOT_LO16, Res.getSymA(), getContext()); + + TOut.emitRX(Mips::LUi, TmpReg, MCOperand::createExpr(CallHiExpr), IDLoc, + STI); + TOut.emitRRR(IsPtr64 ? Mips::DADDu : Mips::ADDu, TmpReg, TmpReg, GPReg, IDLoc, STI); + TOut.emitRRX(IsPtr64 ? Mips::LD : Mips::LW, TmpReg, TmpReg, + MCOperand::createExpr(CallLoExpr), IDLoc, STI); - if (UseSrcReg) - TOut.emitRRR(Mips::ADDu, DstReg, TmpReg, SrcReg, IDLoc, STI); - - return false; - } - - if (inPicMode() && ABI.ArePtrs64bit()) { - MCValue Res; - if (!SymExpr->evaluateAsRelocatable(Res, nullptr, nullptr)) { - Error(IDLoc, "expected relocatable expression"); - return true; - } - if (Res.getSymB() != nullptr) { - Error(IDLoc, "expected relocatable expression with only one symbol"); - return true; - } + if (Res.getConstant() != 0) + TOut.emitRRX(IsPtr64 ? Mips::DADDiu : Mips::ADDiu, TmpReg, TmpReg, + MCOperand::createExpr(MCConstantExpr::create( + Res.getConstant(), getContext())), + IDLoc, STI); - // The case where the result register is $25 is somewhat special. If the - // symbol in the final relocation is external and not modified with a - // constant then we must use R_MIPS_CALL16 instead of R_MIPS_GOT_DISP. - if ((DstReg == Mips::T9 || DstReg == Mips::T9_64) && !UseSrcReg && - Res.getConstant() == 0 && - !(Res.getSymA()->getSymbol().isInSection() || - Res.getSymA()->getSymbol().isTemporary() || - (Res.getSymA()->getSymbol().isELF() && - cast(Res.getSymA()->getSymbol()).getBinding() == - ELF::STB_LOCAL))) { - const MCExpr *CallExpr = - MipsMCExpr::create(MipsMCExpr::MEK_GOT_CALL, SymExpr, getContext()); - TOut.emitRRX(Mips::LD, DstReg, GPReg, MCOperand::createExpr(CallExpr), - IDLoc, STI); + if (UseSrcReg) + TOut.emitRRR(IsPtr64 ? Mips::DADDu : Mips::ADDu, DstReg, TmpReg, SrcReg, + IDLoc, STI); return false; } - // The remaining cases are: - // Small offset: ld $tmp, %got_disp(symbol)($gp) - // >daddiu $tmp, $tmp, offset - // >daddu $rd, $tmp, $rs - // The daddiu's marked with a '>' may be omitted if they are redundant. If - // this happens then the last instruction must use $rd as the result - // register. - const MipsMCExpr *GotExpr = MipsMCExpr::create(MipsMCExpr::MEK_GOT_DISP, - Res.getSymA(), - getContext()); + const MipsMCExpr *GotExpr = nullptr; const MCExpr *LoExpr = nullptr; - if (Res.getConstant() != 0) { - // Symbols fully resolve with just the %got_disp(symbol) but we - // must still account for any offset to the symbol for - // expressions like symbol+8. 
- LoExpr = MCConstantExpr::create(Res.getConstant(), getContext()); - - // FIXME: Offsets greater than 16 bits are not yet implemented. - // FIXME: The correct range is a 32-bit sign-extended number. - if (Res.getConstant() < -0x8000 || Res.getConstant() > 0x7fff) { - Error(IDLoc, "macro instruction uses large offset, which is not " - "currently supported"); - return true; + if (IsPtr64) { + // The remaining cases are: + // Small offset: ld $tmp, %got_disp(symbol)($gp) + // >daddiu $tmp, $tmp, offset + // >daddu $rd, $tmp, $rs + // The daddiu's marked with a '>' may be omitted if they are redundant. If + // this happens then the last instruction must use $rd as the result + // register. + GotExpr = MipsMCExpr::create(MipsMCExpr::MEK_GOT_DISP, Res.getSymA(), + getContext()); + if (Res.getConstant() != 0) { + // Symbols fully resolve with just the %got_disp(symbol) but we + // must still account for any offset to the symbol for + // expressions like symbol+8. + LoExpr = MCConstantExpr::create(Res.getConstant(), getContext()); + + // FIXME: Offsets greater than 16 bits are not yet implemented. + // FIXME: The correct range is a 32-bit sign-extended number. + if (Res.getConstant() < -0x8000 || Res.getConstant() > 0x7fff) { + Error(IDLoc, "macro instruction uses large offset, which is not " + "currently supported"); + return true; + } + } + } else { + // The remaining cases are: + // External GOT: lw $tmp, %got(symbol)($gp) + // >addiu $tmp, $tmp, offset + // >addiu $rd, $tmp, $rs + // Local GOT: lw $tmp, %got(symbol+offset)($gp) + // addiu $tmp, $tmp, %lo(symbol+offset)($gp) + // >addiu $rd, $tmp, $rs + // The addiu's marked with a '>' may be omitted if they are redundant. If + // this happens then the last instruction must use $rd as the result + // register. + if (IsLocalSym) { + GotExpr = + MipsMCExpr::create(MipsMCExpr::MEK_GOT, SymExpr, getContext()); + LoExpr = MipsMCExpr::create(MipsMCExpr::MEK_LO, SymExpr, getContext()); + } else { + // External symbols fully resolve the symbol with just the %got(symbol) + // but we must still account for any offset to the symbol for + // expressions like symbol+8. + GotExpr = MipsMCExpr::create(MipsMCExpr::MEK_GOT, Res.getSymA(), + getContext()); + if (Res.getConstant() != 0) + LoExpr = MCConstantExpr::create(Res.getConstant(), getContext()); } } - unsigned TmpReg = DstReg; - if (UseSrcReg && - getContext().getRegisterInfo()->isSuperOrSubRegisterEq(DstReg, - SrcReg)) { - // If $rs is the same as $rd, we need to use AT. - // If it is not available we exit. - unsigned ATReg = getATReg(IDLoc); - if (!ATReg) - return true; - TmpReg = ATReg; - } - - TOut.emitRRX(Mips::LD, TmpReg, GPReg, MCOperand::createExpr(GotExpr), IDLoc, - STI); + TOut.emitRRX(IsPtr64 ? Mips::LD : Mips::LW, TmpReg, GPReg, + MCOperand::createExpr(GotExpr), IDLoc, STI); if (LoExpr) - TOut.emitRRX(Mips::DADDiu, TmpReg, TmpReg, MCOperand::createExpr(LoExpr), - IDLoc, STI); + TOut.emitRRX(IsPtr64 ? Mips::DADDiu : Mips::ADDiu, TmpReg, TmpReg, + MCOperand::createExpr(LoExpr), IDLoc, STI); if (UseSrcReg) - TOut.emitRRR(Mips::DADDu, DstReg, TmpReg, SrcReg, IDLoc, STI); + TOut.emitRRR(IsPtr64 ? 
Mips::DADDu : Mips::ADDu, DstReg, TmpReg, SrcReg, + IDLoc, STI); return false; } @@ -3289,10 +3294,43 @@ bool MipsAsmParser::emitPartialAddress(MipsTargetStreamer &TOut, SMLoc IDLoc, return false; } -bool MipsAsmParser::expandLoadImmReal(MCInst &Inst, bool IsSingle, bool IsGPR, - bool Is64FPU, SMLoc IDLoc, - MCStreamer &Out, - const MCSubtargetInfo *STI) { +static uint64_t convertIntToDoubleImm(uint64_t ImmOp64) { + // If ImmOp64 is AsmToken::Integer type (all bits set to zero in the + // exponent field), convert it to double (e.g. 1 to 1.0) + if ((Hi_32(ImmOp64) & 0x7ff00000) == 0) { + APFloat RealVal(APFloat::IEEEdouble(), ImmOp64); + ImmOp64 = RealVal.bitcastToAPInt().getZExtValue(); + } + return ImmOp64; +} + +static uint32_t covertDoubleImmToSingleImm(uint64_t ImmOp64) { + // Conversion of a double in an uint64_t to a float in a uint32_t, + // retaining the bit pattern of a float. + double DoubleImm = BitsToDouble(ImmOp64); + float TmpFloat = static_cast(DoubleImm); + return FloatToBits(TmpFloat); +} + +bool MipsAsmParser::expandLoadSingleImmToGPR(MCInst &Inst, SMLoc IDLoc, + MCStreamer &Out, + const MCSubtargetInfo *STI) { + assert(Inst.getNumOperands() == 2 && "Invalid operand count"); + assert(Inst.getOperand(0).isReg() && Inst.getOperand(1).isImm() && + "Invalid instruction operand."); + + unsigned FirstReg = Inst.getOperand(0).getReg(); + uint64_t ImmOp64 = Inst.getOperand(1).getImm(); + + uint32_t ImmOp32 = covertDoubleImmToSingleImm(convertIntToDoubleImm(ImmOp64)); + + return loadImmediate(ImmOp32, FirstReg, Mips::NoRegister, true, false, IDLoc, + Out, STI); +} + +bool MipsAsmParser::expandLoadSingleImmToFPR(MCInst &Inst, SMLoc IDLoc, + MCStreamer &Out, + const MCSubtargetInfo *STI) { MipsTargetStreamer &TOut = getTargetStreamer(); assert(Inst.getNumOperands() == 2 && "Invalid operand count"); assert(Inst.getOperand(0).isReg() && Inst.getOperand(1).isImm() && @@ -3301,166 +3339,184 @@ bool MipsAsmParser::expandLoadImmReal(MCInst &Inst, bool IsSingle, bool IsGPR, unsigned FirstReg = Inst.getOperand(0).getReg(); uint64_t ImmOp64 = Inst.getOperand(1).getImm(); - uint32_t HiImmOp64 = (ImmOp64 & 0xffffffff00000000) >> 32; - // If ImmOp64 is AsmToken::Integer type (all bits set to zero in the - // exponent field), convert it to double (e.g. 1 to 1.0) - if ((HiImmOp64 & 0x7ff00000) == 0) { - APFloat RealVal(APFloat::IEEEdouble(), ImmOp64); - ImmOp64 = RealVal.bitcastToAPInt().getZExtValue(); + ImmOp64 = convertIntToDoubleImm(ImmOp64); + + uint32_t ImmOp32 = covertDoubleImmToSingleImm(ImmOp64); + + unsigned TmpReg = Mips::ZERO; + if (ImmOp32 != 0) { + TmpReg = getATReg(IDLoc); + if (!TmpReg) + return true; } - uint32_t LoImmOp64 = ImmOp64 & 0xffffffff; - HiImmOp64 = (ImmOp64 & 0xffffffff00000000) >> 32; + if (Lo_32(ImmOp64) == 0) { + if (TmpReg != Mips::ZERO && loadImmediate(ImmOp32, TmpReg, Mips::NoRegister, + true, false, IDLoc, Out, STI)) + return true; + TOut.emitRR(Mips::MTC1, FirstReg, TmpReg, IDLoc, STI); + return false; + } - if (IsSingle) { - // Conversion of a double in an uint64_t to a float in a uint32_t, - // retaining the bit pattern of a float. - uint32_t ImmOp32; - double doubleImm = BitsToDouble(ImmOp64); - float tmp_float = static_cast(doubleImm); - ImmOp32 = FloatToBits(tmp_float); + MCSection *CS = getStreamer().getCurrentSectionOnly(); + // FIXME: Enhance this expansion to use the .lit4 & .lit8 sections + // where appropriate. 
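The helpers factored out above work purely on bit patterns: an integer-typed literal is reinterpreted as a double, and a double immediate destined for a single-precision load is narrowed by value and then reinterpreted as 32 bits (LLVM's BitsToDouble/FloatToBits wrap the same reinterpretation). A standalone sketch of that narrowing step:

  #include <cassert>
  #include <cstdint>
  #include <cstring>

  uint32_t doubleBitsToFloatBits(uint64_t Bits64) {
    double D;
    std::memcpy(&D, &Bits64, sizeof(D));  // reinterpret the i64 as a double
    float F = static_cast<float>(D);      // narrow by value; may round
    uint32_t Bits32;
    std::memcpy(&Bits32, &F, sizeof(F));  // reinterpret the float's bits
    return Bits32;
  }

  int main() {
    // 1.0 as IEEE-754: 0x3FF0000000000000 (double) -> 0x3F800000 (float)
    assert(doubleBitsToFloatBits(0x3FF0000000000000ULL) == 0x3F800000u);
  }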
+ MCSection *ReadOnlySection = + getContext().getELFSection(".rodata", ELF::SHT_PROGBITS, ELF::SHF_ALLOC); - if (IsGPR) { - if (loadImmediate(ImmOp32, FirstReg, Mips::NoRegister, true, true, IDLoc, - Out, STI)) - return true; - return false; - } else { - unsigned ATReg = getATReg(IDLoc); - if (!ATReg) - return true; - if (LoImmOp64 == 0) { - if (loadImmediate(ImmOp32, ATReg, Mips::NoRegister, true, true, IDLoc, - Out, STI)) - return true; - TOut.emitRR(Mips::MTC1, FirstReg, ATReg, IDLoc, STI); - return false; - } + MCSymbol *Sym = getContext().createTempSymbol(); + const MCExpr *LoSym = + MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_None, getContext()); + const MipsMCExpr *LoExpr = + MipsMCExpr::create(MipsMCExpr::MEK_LO, LoSym, getContext()); - MCSection *CS = getStreamer().getCurrentSectionOnly(); - // FIXME: Enhance this expansion to use the .lit4 & .lit8 sections - // where appropriate. - MCSection *ReadOnlySection = getContext().getELFSection( - ".rodata", ELF::SHT_PROGBITS, ELF::SHF_ALLOC); + getStreamer().SwitchSection(ReadOnlySection); + getStreamer().EmitLabel(Sym, IDLoc); + getStreamer().EmitIntValue(ImmOp32, 4); + getStreamer().SwitchSection(CS); - MCSymbol *Sym = getContext().createTempSymbol(); - const MCExpr *LoSym = - MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_None, getContext()); - const MipsMCExpr *LoExpr = - MipsMCExpr::create(MipsMCExpr::MEK_LO, LoSym, getContext()); + if (emitPartialAddress(TOut, IDLoc, Sym)) + return true; + TOut.emitRRX(Mips::LWC1, FirstReg, TmpReg, MCOperand::createExpr(LoExpr), + IDLoc, STI); + return false; +} + +bool MipsAsmParser::expandLoadDoubleImmToGPR(MCInst &Inst, SMLoc IDLoc, + MCStreamer &Out, + const MCSubtargetInfo *STI) { + MipsTargetStreamer &TOut = getTargetStreamer(); + assert(Inst.getNumOperands() == 2 && "Invalid operand count"); + assert(Inst.getOperand(0).isReg() && Inst.getOperand(1).isImm() && + "Invalid instruction operand."); + + unsigned FirstReg = Inst.getOperand(0).getReg(); + uint64_t ImmOp64 = Inst.getOperand(1).getImm(); + + ImmOp64 = convertIntToDoubleImm(ImmOp64); - getStreamer().SwitchSection(ReadOnlySection); - getStreamer().EmitLabel(Sym, IDLoc); - getStreamer().EmitIntValue(ImmOp32, 4); - getStreamer().SwitchSection(CS); + if (Lo_32(ImmOp64) == 0) { + if (isGP64bit()) { + if (loadImmediate(ImmOp64, FirstReg, Mips::NoRegister, false, false, + IDLoc, Out, STI)) + return true; + } else { + if (loadImmediate(Hi_32(ImmOp64), FirstReg, Mips::NoRegister, true, false, + IDLoc, Out, STI)) + return true; - if(emitPartialAddress(TOut, IDLoc, Sym)) + if (loadImmediate(0, nextReg(FirstReg), Mips::NoRegister, true, false, + IDLoc, Out, STI)) return true; - TOut.emitRRX(Mips::LWC1, FirstReg, ATReg, - MCOperand::createExpr(LoExpr), IDLoc, STI); } return false; } - // if(!IsSingle) - unsigned ATReg = getATReg(IDLoc); - if (!ATReg) + MCSection *CS = getStreamer().getCurrentSectionOnly(); + MCSection *ReadOnlySection = + getContext().getELFSection(".rodata", ELF::SHT_PROGBITS, ELF::SHF_ALLOC); + + MCSymbol *Sym = getContext().createTempSymbol(); + const MCExpr *LoSym = + MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_None, getContext()); + const MipsMCExpr *LoExpr = + MipsMCExpr::create(MipsMCExpr::MEK_LO, LoSym, getContext()); + + getStreamer().SwitchSection(ReadOnlySection); + getStreamer().EmitLabel(Sym, IDLoc); + getStreamer().EmitValueToAlignment(8); + getStreamer().EmitIntValue(ImmOp64, 8); + getStreamer().SwitchSection(CS); + + unsigned TmpReg = getATReg(IDLoc); + if (!TmpReg) return true; - if (IsGPR) { - if 
(LoImmOp64 == 0) { - if(isABI_N32() || isABI_N64()) { - if (loadImmediate(HiImmOp64, FirstReg, Mips::NoRegister, false, true, - IDLoc, Out, STI)) - return true; - return false; - } else { - if (loadImmediate(HiImmOp64, FirstReg, Mips::NoRegister, true, true, - IDLoc, Out, STI)) - return true; + if (emitPartialAddress(TOut, IDLoc, Sym)) + return true; - if (loadImmediate(0, nextReg(FirstReg), Mips::NoRegister, true, true, - IDLoc, Out, STI)) - return true; - return false; - } - } + TOut.emitRRX(isABI_N64() ? Mips::DADDiu : Mips::ADDiu, TmpReg, TmpReg, + MCOperand::createExpr(LoExpr), IDLoc, STI); - MCSection *CS = getStreamer().getCurrentSectionOnly(); - MCSection *ReadOnlySection = getContext().getELFSection( - ".rodata", ELF::SHT_PROGBITS, ELF::SHF_ALLOC); + if (isGP64bit()) + TOut.emitRRI(Mips::LD, FirstReg, TmpReg, 0, IDLoc, STI); + else { + TOut.emitRRI(Mips::LW, FirstReg, TmpReg, 0, IDLoc, STI); + TOut.emitRRI(Mips::LW, nextReg(FirstReg), TmpReg, 4, IDLoc, STI); + } + return false; +} - MCSymbol *Sym = getContext().createTempSymbol(); - const MCExpr *LoSym = - MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_None, getContext()); - const MipsMCExpr *LoExpr = - MipsMCExpr::create(MipsMCExpr::MEK_LO, LoSym, getContext()); +bool MipsAsmParser::expandLoadDoubleImmToFPR(MCInst &Inst, bool Is64FPU, + SMLoc IDLoc, MCStreamer &Out, + const MCSubtargetInfo *STI) { + MipsTargetStreamer &TOut = getTargetStreamer(); + assert(Inst.getNumOperands() == 2 && "Invalid operand count"); + assert(Inst.getOperand(0).isReg() && Inst.getOperand(1).isImm() && + "Invalid instruction operand."); - getStreamer().SwitchSection(ReadOnlySection); - getStreamer().EmitLabel(Sym, IDLoc); - getStreamer().EmitIntValue(HiImmOp64, 4); - getStreamer().EmitIntValue(LoImmOp64, 4); - getStreamer().SwitchSection(CS); + unsigned FirstReg = Inst.getOperand(0).getReg(); + uint64_t ImmOp64 = Inst.getOperand(1).getImm(); + + ImmOp64 = convertIntToDoubleImm(ImmOp64); - if(emitPartialAddress(TOut, IDLoc, Sym)) + unsigned TmpReg = Mips::ZERO; + if (ImmOp64 != 0) { + TmpReg = getATReg(IDLoc); + if (!TmpReg) return true; - if(isABI_N64()) - TOut.emitRRX(Mips::DADDiu, ATReg, ATReg, - MCOperand::createExpr(LoExpr), IDLoc, STI); - else - TOut.emitRRX(Mips::ADDiu, ATReg, ATReg, - MCOperand::createExpr(LoExpr), IDLoc, STI); + } - if(isABI_N32() || isABI_N64()) - TOut.emitRRI(Mips::LD, FirstReg, ATReg, 0, IDLoc, STI); - else { - TOut.emitRRI(Mips::LW, FirstReg, ATReg, 0, IDLoc, STI); - TOut.emitRRI(Mips::LW, nextReg(FirstReg), ATReg, 4, IDLoc, STI); - } - return false; - } else { // if(!IsGPR && !IsSingle) - if ((LoImmOp64 == 0) && - !((HiImmOp64 & 0xffff0000) && (HiImmOp64 & 0x0000ffff))) { - // FIXME: In the case where the constant is zero, we can load the - // register directly from the zero register. 
- if (loadImmediate(HiImmOp64, ATReg, Mips::NoRegister, true, true, IDLoc, + if ((Lo_32(ImmOp64) == 0) && + !((Hi_32(ImmOp64) & 0xffff0000) && (Hi_32(ImmOp64) & 0x0000ffff))) { + if (isGP64bit()) { + if (TmpReg != Mips::ZERO && + loadImmediate(ImmOp64, TmpReg, Mips::NoRegister, false, false, IDLoc, Out, STI)) return true; - if (isABI_N32() || isABI_N64()) - TOut.emitRR(Mips::DMTC1, FirstReg, ATReg, IDLoc, STI); - else if (hasMips32r2()) { - TOut.emitRR(Mips::MTC1, FirstReg, Mips::ZERO, IDLoc, STI); - TOut.emitRRR(Mips::MTHC1_D32, FirstReg, FirstReg, ATReg, IDLoc, STI); - } else { - TOut.emitRR(Mips::MTC1, nextReg(FirstReg), ATReg, IDLoc, STI); - TOut.emitRR(Mips::MTC1, FirstReg, Mips::ZERO, IDLoc, STI); - } + TOut.emitRR(Mips::DMTC1, FirstReg, TmpReg, IDLoc, STI); return false; } - MCSection *CS = getStreamer().getCurrentSectionOnly(); - // FIXME: Enhance this expansion to use the .lit4 & .lit8 sections - // where appropriate. - MCSection *ReadOnlySection = getContext().getELFSection( - ".rodata", ELF::SHT_PROGBITS, ELF::SHF_ALLOC); + if (TmpReg != Mips::ZERO && + loadImmediate(Hi_32(ImmOp64), TmpReg, Mips::NoRegister, true, false, + IDLoc, Out, STI)) + return true; - MCSymbol *Sym = getContext().createTempSymbol(); - const MCExpr *LoSym = - MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_None, getContext()); - const MipsMCExpr *LoExpr = - MipsMCExpr::create(MipsMCExpr::MEK_LO, LoSym, getContext()); + if (hasMips32r2()) { + TOut.emitRR(Mips::MTC1, FirstReg, Mips::ZERO, IDLoc, STI); + TOut.emitRRR(Mips::MTHC1_D32, FirstReg, FirstReg, TmpReg, IDLoc, STI); + } else { + TOut.emitRR(Mips::MTC1, nextReg(FirstReg), TmpReg, IDLoc, STI); + TOut.emitRR(Mips::MTC1, FirstReg, Mips::ZERO, IDLoc, STI); + } + return false; + } - getStreamer().SwitchSection(ReadOnlySection); - getStreamer().EmitLabel(Sym, IDLoc); - getStreamer().EmitIntValue(HiImmOp64, 4); - getStreamer().EmitIntValue(LoImmOp64, 4); - getStreamer().SwitchSection(CS); + MCSection *CS = getStreamer().getCurrentSectionOnly(); + // FIXME: Enhance this expansion to use the .lit4 & .lit8 sections + // where appropriate. + MCSection *ReadOnlySection = + getContext().getELFSection(".rodata", ELF::SHT_PROGBITS, ELF::SHF_ALLOC); + + MCSymbol *Sym = getContext().createTempSymbol(); + const MCExpr *LoSym = + MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_None, getContext()); + const MipsMCExpr *LoExpr = + MipsMCExpr::create(MipsMCExpr::MEK_LO, LoSym, getContext()); + + getStreamer().SwitchSection(ReadOnlySection); + getStreamer().EmitLabel(Sym, IDLoc); + getStreamer().EmitValueToAlignment(8); + getStreamer().EmitIntValue(ImmOp64, 8); + getStreamer().SwitchSection(CS); + + if (emitPartialAddress(TOut, IDLoc, Sym)) + return true; + + TOut.emitRRX(Is64FPU ? Mips::LDC164 : Mips::LDC1, FirstReg, TmpReg, + MCOperand::createExpr(LoExpr), IDLoc, STI); - if(emitPartialAddress(TOut, IDLoc, Sym)) - return true; - TOut.emitRRX(Is64FPU ? 
Mips::LDC164 : Mips::LDC1, FirstReg, ATReg, - MCOperand::createExpr(LoExpr), IDLoc, STI); - } return false; } @@ -3489,7 +3545,7 @@ bool MipsAsmParser::expandUncondBranchMMPseudo(MCInst &Inst, SMLoc IDLoc, } else { if (!isInt<17>(Offset.getImm())) return Error(IDLoc, "branch target out of range"); - if (OffsetToAlignment(Offset.getImm(), 1LL << 1)) + if (offsetToAlignment(Offset.getImm(), Align(2))) return Error(IDLoc, "branch to misaligned address"); Inst.clear(); Inst.setOpcode(Mips::BEQ_MM); @@ -3581,7 +3637,6 @@ void MipsAsmParser::expandMemInst(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, assert(DstRegOp.isReg() && "expected register operand kind"); const MCOperand &BaseRegOp = Inst.getOperand(1); assert(BaseRegOp.isReg() && "expected register operand kind"); - const MCOperand &OffsetOp = Inst.getOperand(2); MipsTargetStreamer &TOut = getTargetStreamer(); unsigned DstReg = DstRegOp.getReg(); @@ -3603,6 +3658,26 @@ void MipsAsmParser::expandMemInst(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, return; } + if (Inst.getNumOperands() > 3) { + const MCOperand &BaseRegOp = Inst.getOperand(2); + assert(BaseRegOp.isReg() && "expected register operand kind"); + const MCOperand &ExprOp = Inst.getOperand(3); + assert(ExprOp.isExpr() && "expected expression oprand kind"); + + unsigned BaseReg = BaseRegOp.getReg(); + const MCExpr *ExprOffset = ExprOp.getExpr(); + + MCOperand LoOperand = MCOperand::createExpr( + MipsMCExpr::create(MipsMCExpr::MEK_LO, ExprOffset, getContext())); + MCOperand HiOperand = MCOperand::createExpr( + MipsMCExpr::create(MipsMCExpr::MEK_HI, ExprOffset, getContext())); + TOut.emitSCWithSymOffset(Inst.getOpcode(), DstReg, BaseReg, HiOperand, + LoOperand, TmpReg, IDLoc, STI); + return; + } + + const MCOperand &OffsetOp = Inst.getOperand(2); + if (OffsetOp.isImm()) { int64_t LoOffset = OffsetOp.getImm() & 0xffff; int64_t HiOffset = OffsetOp.getImm() & ~0xffff; @@ -3625,21 +3700,54 @@ void MipsAsmParser::expandMemInst(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, TOut.emitRRR(isGP64bit() ? Mips::DADDu : Mips::ADDu, TmpReg, TmpReg, BaseReg, IDLoc, STI); TOut.emitRRI(Inst.getOpcode(), DstReg, TmpReg, LoOffset, IDLoc, STI); - } else { - assert(OffsetOp.isExpr() && "expected expression operand kind"); - const MCExpr *ExprOffset = OffsetOp.getExpr(); - MCOperand LoOperand = MCOperand::createExpr( - MipsMCExpr::create(MipsMCExpr::MEK_LO, ExprOffset, getContext())); - MCOperand HiOperand = MCOperand::createExpr( - MipsMCExpr::create(MipsMCExpr::MEK_HI, ExprOffset, getContext())); + return; + } + + if (OffsetOp.isExpr()) { + if (inPicMode()) { + // FIXME: + // c) Check that immediates of R_MIPS_GOT16/R_MIPS_LO16 relocations + // do not exceed 16-bit. + // d) Use R_MIPS_GOT_PAGE/R_MIPS_GOT_OFST relocations instead + // of R_MIPS_GOT_DISP in appropriate cases to reduce number + // of GOT entries. 
+ MCValue Res; + if (!OffsetOp.getExpr()->evaluateAsRelocatable(Res, nullptr, nullptr)) { + Error(IDLoc, "expected relocatable expression"); + return; + } + if (Res.getSymB() != nullptr) { + Error(IDLoc, "expected relocatable expression with only one symbol"); + return; + } - if (IsLoad) - TOut.emitLoadWithSymOffset(Inst.getOpcode(), DstReg, BaseReg, HiOperand, - LoOperand, TmpReg, IDLoc, STI); - else - TOut.emitStoreWithSymOffset(Inst.getOpcode(), DstReg, BaseReg, HiOperand, - LoOperand, TmpReg, IDLoc, STI); + loadAndAddSymbolAddress(Res.getSymA(), TmpReg, BaseReg, + !ABI.ArePtrs64bit(), IDLoc, Out, STI); + TOut.emitRRI(Inst.getOpcode(), DstReg, TmpReg, Res.getConstant(), IDLoc, + STI); + } else { + // FIXME: Implement 64-bit case. + // 1) lw $8, sym => lui $8, %hi(sym) + // lw $8, %lo(sym)($8) + // 2) sw $8, sym => lui $at, %hi(sym) + // sw $8, %lo(sym)($at) + const MCExpr *ExprOffset = OffsetOp.getExpr(); + MCOperand LoOperand = MCOperand::createExpr( + MipsMCExpr::create(MipsMCExpr::MEK_LO, ExprOffset, getContext())); + MCOperand HiOperand = MCOperand::createExpr( + MipsMCExpr::create(MipsMCExpr::MEK_HI, ExprOffset, getContext())); + + // Generate the base address in TmpReg. + TOut.emitRX(Mips::LUi, TmpReg, HiOperand, IDLoc, STI); + if (BaseReg != Mips::ZERO) + TOut.emitRRR(Mips::ADDu, TmpReg, TmpReg, BaseReg, IDLoc, STI); + // Emit the load or store with the adjusted base and offset. + TOut.emitRRX(Inst.getOpcode(), DstReg, TmpReg, LoOperand, IDLoc, STI); + } + return; } + + llvm_unreachable("unexpected operand type"); } bool MipsAsmParser::expandLoadStoreMultiple(MCInst &Inst, SMLoc IDLoc, @@ -6976,7 +7084,7 @@ bool MipsAsmParser::parseSetPushDirective() { // Create a copy of the current assembler options environment and push it. AssemblerOptions.push_back( - llvm::make_unique(AssemblerOptions.back().get())); + std::make_unique(AssemblerOptions.back().get())); getTargetStreamer().emitDirectiveSetPush(); return false; diff --git a/lib/Target/Mips/Disassembler/MipsDisassembler.cpp b/lib/Target/Mips/Disassembler/MipsDisassembler.cpp index ef13507fe63a..c3e98fe410c1 100644 --- a/lib/Target/Mips/Disassembler/MipsDisassembler.cpp +++ b/lib/Target/Mips/Disassembler/MipsDisassembler.cpp @@ -267,6 +267,13 @@ static DecodeStatus DecodeJumpTargetMM(MCInst &Inst, uint64_t Address, const void *Decoder); +// DecodeJumpTargetXMM - Decode microMIPS jump and link exchange target, +// which is shifted left by 2 bit. 
+static DecodeStatus DecodeJumpTargetXMM(MCInst &Inst, + unsigned Insn, + uint64_t Address, + const void *Decoder); + static DecodeStatus DecodeMem(MCInst &Inst, unsigned Insn, uint64_t Address, @@ -2291,6 +2298,15 @@ static DecodeStatus DecodeJumpTargetMM(MCInst &Inst, return MCDisassembler::Success; } +static DecodeStatus DecodeJumpTargetXMM(MCInst &Inst, + unsigned Insn, + uint64_t Address, + const void *Decoder) { + unsigned JumpOffset = fieldFromInstruction(Insn, 0, 26) << 2; + Inst.addOperand(MCOperand::createImm(JumpOffset)); + return MCDisassembler::Success; +} + static DecodeStatus DecodeAddiur2Simm7(MCInst &Inst, unsigned Value, uint64_t Address, diff --git a/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp b/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp index 859f9cbbca07..70f2a7bdf10f 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp +++ b/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp @@ -304,7 +304,6 @@ Optional MipsAsmBackend::getFixupKind(StringRef Name) const { return StringSwitch>(Name) .Case("R_MIPS_NONE", FK_NONE) .Case("R_MIPS_32", FK_Data_4) - .Case("R_MIPS_GOT_PAGE", (MCFixupKind)Mips::fixup_Mips_GOT_PAGE) .Case("R_MIPS_CALL_HI16", (MCFixupKind)Mips::fixup_Mips_CALL_HI16) .Case("R_MIPS_CALL_LO16", (MCFixupKind)Mips::fixup_Mips_CALL_LO16) .Case("R_MIPS_CALL16", (MCFixupKind)Mips::fixup_Mips_CALL16) diff --git a/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h b/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h index 4d7e36995ae4..cca75dfc45c2 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h +++ b/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h @@ -66,9 +66,9 @@ public: /// fixupNeedsRelaxation - Target specific predicate for whether a given /// fixup requires the associated instruction to be relaxed. - bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value, - const MCRelaxableFragment *DF, - const MCAsmLayout &Layout) const override { + bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value, + const MCRelaxableFragment *DF, + const MCAsmLayout &Layout) const override { // FIXME. llvm_unreachable("RelaxInstruction() unimplemented"); return false; diff --git a/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp b/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp index cf7bae98a27f..cc3168790b98 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp +++ b/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp @@ -219,7 +219,7 @@ unsigned MipsELFObjectWriter::getRelocType(MCContext &Ctx, const MCFixup &Fixup, bool IsPCRel) const { // Determine the type of the relocation. - unsigned Kind = (unsigned)Fixup.getKind(); + unsigned Kind = Fixup.getTargetKind(); switch (Kind) { case FK_NONE: @@ -690,6 +690,6 @@ llvm::createMipsELFObjectWriter(const Triple &TT, bool IsN32) { uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(TT.getOS()); bool IsN64 = TT.isArch64Bit() && !IsN32; bool HasRelocationAddend = TT.isArch64Bit(); - return llvm::make_unique(OSABI, HasRelocationAddend, + return std::make_unique(OSABI, HasRelocationAddend, IsN64); } diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp b/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp index 759a7fdb32b8..142e9cebb79e 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp +++ b/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp @@ -485,8 +485,11 @@ getJumpOffset16OpValue(const MCInst &MI, unsigned OpNo, assert(MO.isExpr() && "getJumpOffset16OpValue expects only expressions or an immediate"); - // TODO: Push fixup. 
- return 0; + const MCExpr *Expr = MO.getExpr(); + Mips::Fixups FixupKind = + isMicroMips(STI) ? Mips::fixup_MICROMIPS_LO16 : Mips::fixup_Mips_LO16; + Fixups.push_back(MCFixup::create(0, Expr, MCFixupKind(FixupKind))); + return 0; } /// getJumpTargetOpValue - Return binary encoding of the jump diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCNaCl.h b/lib/Target/Mips/MCTargetDesc/MipsMCNaCl.h index ad5aff6552f6..a84ca8ccfb2d 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsMCNaCl.h +++ b/lib/Target/Mips/MCTargetDesc/MipsMCNaCl.h @@ -10,11 +10,12 @@ #define LLVM_LIB_TARGET_MIPS_MCTARGETDESC_MIPSMCNACL_H #include "llvm/MC/MCELFStreamer.h" +#include "llvm/Support/Alignment.h" namespace llvm { -// Log2 of the NaCl MIPS sandbox's instruction bundle size. -static const unsigned MIPS_NACL_BUNDLE_ALIGN = 4u; +// NaCl MIPS sandbox's instruction bundle size. +static const Align MIPS_NACL_BUNDLE_ALIGN = Align(16); bool isBasePlusOffsetMemoryAccess(unsigned Opcode, unsigned *AddrIdx, bool *IsStore = nullptr); diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp b/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp index ddeec03ba784..79c47d1b6508 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp +++ b/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp @@ -143,12 +143,15 @@ public: return false; switch (Info->get(Inst.getOpcode()).OpInfo[NumOps - 1].OperandType) { case MCOI::OPERAND_UNKNOWN: - case MCOI::OPERAND_IMMEDIATE: - // jal, bal ... - Target = Inst.getOperand(NumOps - 1).getImm(); + case MCOI::OPERAND_IMMEDIATE: { + // j, jal, jalx, jals + // Absolute branch within the current 256 MB-aligned region + uint64_t Region = Addr & ~uint64_t(0xfffffff); + Target = Region + Inst.getOperand(NumOps - 1).getImm(); return true; + } case MCOI::OPERAND_PCREL: - // b, j, beq ... + // b, beq ... Target = Addr + Inst.getOperand(NumOps - 1).getImm(); return true; default: diff --git a/lib/Target/Mips/MCTargetDesc/MipsNaClELFStreamer.cpp b/lib/Target/Mips/MCTargetDesc/MipsNaClELFStreamer.cpp index c050db8a17fd..2d53750ad0ee 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsNaClELFStreamer.cpp +++ b/lib/Target/Mips/MCTargetDesc/MipsNaClELFStreamer.cpp @@ -270,7 +270,7 @@ MCELFStreamer *createMipsNaClELFStreamer(MCContext &Context, S->getAssembler().setRelaxAll(true); // Set bundle-alignment as required by the NaCl ABI for the target. - S->EmitBundleAlignMode(MIPS_NACL_BUNDLE_ALIGN); + S->EmitBundleAlignMode(Log2(MIPS_NACL_BUNDLE_ALIGN)); return S; } diff --git a/lib/Target/Mips/MCTargetDesc/MipsOptionRecord.cpp b/lib/Target/Mips/MCTargetDesc/MipsOptionRecord.cpp index b4ebb9d18b72..3ff9c722484b 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsOptionRecord.cpp +++ b/lib/Target/Mips/MCTargetDesc/MipsOptionRecord.cpp @@ -37,7 +37,7 @@ void MipsRegInfoRecord::EmitMipsOptionRecord() { Context.getELFSection(".MIPS.options", ELF::SHT_MIPS_OPTIONS, ELF::SHF_ALLOC | ELF::SHF_MIPS_NOSTRIP, 1, ""); MCA.registerSection(*Sec); - Sec->setAlignment(8); + Sec->setAlignment(Align(8)); Streamer->SwitchSection(Sec); Streamer->EmitIntValue(ELF::ODK_REGINFO, 1); // kind @@ -55,7 +55,7 @@ void MipsRegInfoRecord::EmitMipsOptionRecord() { MCSectionELF *Sec = Context.getELFSection(".reginfo", ELF::SHT_MIPS_REGINFO, ELF::SHF_ALLOC, 24, ""); MCA.registerSection(*Sec); - Sec->setAlignment(MTS->getABI().IsN32() ? 8 : 4); + Sec->setAlignment(MTS->getABI().IsN32() ? 
Align(8) : Align(4)); Streamer->SwitchSection(Sec); Streamer->EmitIntValue(ri_gprmask, 4); diff --git a/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp b/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp index e3bdb3b140a8..b6dae9f6dea8 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp +++ b/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp @@ -34,6 +34,15 @@ static cl::opt RoundSectionSizes( cl::desc("Round section sizes up to the section alignment"), cl::Hidden); } // end anonymous namespace +static bool isMipsR6(const MCSubtargetInfo *STI) { + return STI->getFeatureBits()[Mips::FeatureMips32r6] || + STI->getFeatureBits()[Mips::FeatureMips64r6]; +} + +static bool isMicroMips(const MCSubtargetInfo *STI) { + return STI->getFeatureBits()[Mips::FeatureMicroMips]; +} + MipsTargetStreamer::MipsTargetStreamer(MCStreamer &S) : MCTargetStreamer(S), GPReg(Mips::GP), ModuleDirectiveAllowed(true) { GPRInfoSet = FPRInfoSet = FrameInfoSet = false; @@ -216,6 +225,19 @@ void MipsTargetStreamer::emitRRR(unsigned Opcode, unsigned Reg0, unsigned Reg1, emitRRX(Opcode, Reg0, Reg1, MCOperand::createReg(Reg2), IDLoc, STI); } +void MipsTargetStreamer::emitRRRX(unsigned Opcode, unsigned Reg0, unsigned Reg1, + unsigned Reg2, MCOperand Op3, SMLoc IDLoc, + const MCSubtargetInfo *STI) { + MCInst TmpInst; + TmpInst.setOpcode(Opcode); + TmpInst.addOperand(MCOperand::createReg(Reg0)); + TmpInst.addOperand(MCOperand::createReg(Reg1)); + TmpInst.addOperand(MCOperand::createReg(Reg2)); + TmpInst.addOperand(Op3); + TmpInst.setLoc(IDLoc); + getStreamer().EmitInstruction(TmpInst, *STI); +} + void MipsTargetStreamer::emitRRI(unsigned Opcode, unsigned Reg0, unsigned Reg1, int16_t Imm, SMLoc IDLoc, const MCSubtargetInfo *STI) { @@ -264,8 +286,7 @@ void MipsTargetStreamer::emitEmptyDelaySlot(bool hasShortDelaySlot, SMLoc IDLoc, } void MipsTargetStreamer::emitNop(SMLoc IDLoc, const MCSubtargetInfo *STI) { - const FeatureBitset &Features = STI->getFeatureBits(); - if (Features[Mips::FeatureMicroMips]) + if (isMicroMips(STI)) emitRR(Mips::MOVE16_MM, Mips::ZERO, Mips::ZERO, IDLoc, STI); else emitRRI(Mips::SLL, Mips::ZERO, Mips::ZERO, 0, IDLoc, STI); @@ -311,21 +332,34 @@ void MipsTargetStreamer::emitStoreWithImmOffset( emitRRI(Opcode, SrcReg, ATReg, LoOffset, IDLoc, STI); } -/// Emit a store instruction with an symbol offset. Symbols are assumed to be -/// out of range for a simm16 will be expanded to appropriate instructions. -void MipsTargetStreamer::emitStoreWithSymOffset( - unsigned Opcode, unsigned SrcReg, unsigned BaseReg, MCOperand &HiOperand, - MCOperand &LoOperand, unsigned ATReg, SMLoc IDLoc, - const MCSubtargetInfo *STI) { - // sw $8, sym => lui $at, %hi(sym) - // sw $8, %lo(sym)($at) +/// Emit a store instruction with an symbol offset. +void MipsTargetStreamer::emitSCWithSymOffset(unsigned Opcode, unsigned SrcReg, + unsigned BaseReg, + MCOperand &HiOperand, + MCOperand &LoOperand, + unsigned ATReg, SMLoc IDLoc, + const MCSubtargetInfo *STI) { + // sc $8, sym => lui $at, %hi(sym) + // sc $8, %lo(sym)($at) // Generate the base address in ATReg. emitRX(Mips::LUi, ATReg, HiOperand, IDLoc, STI); - if (BaseReg != Mips::ZERO) - emitRRR(Mips::ADDu, ATReg, ATReg, BaseReg, IDLoc, STI); - // Emit the store with the adjusted base and offset. - emitRRX(Opcode, SrcReg, ATReg, LoOperand, IDLoc, STI); + if (!isMicroMips(STI) && isMipsR6(STI)) { + // For non-micromips r6 offset for 'sc' is not in the lower 16 bits so we + // put it in 'at'. 
+ // sc $8, sym => lui $at, %hi(sym) + // addiu $at, $at, %lo(sym) + // sc $8, 0($at) + emitRRX(Mips::ADDiu, ATReg, ATReg, LoOperand, IDLoc, STI); + MCOperand Offset = MCOperand::createImm(0); + // Emit the store with the adjusted base and offset. + emitRRRX(Opcode, SrcReg, SrcReg, ATReg, Offset, IDLoc, STI); + } else { + if (BaseReg != Mips::ZERO) + emitRRR(Mips::ADDu, ATReg, ATReg, BaseReg, IDLoc, STI); + // Emit the store with the adjusted base and offset. + emitRRRX(Opcode, SrcReg, SrcReg, ATReg, LoOperand, IDLoc, STI); + } } /// Emit a load instruction with an immediate offset. DstReg and TmpReg are @@ -364,30 +398,6 @@ void MipsTargetStreamer::emitLoadWithImmOffset(unsigned Opcode, unsigned DstReg, emitRRI(Opcode, DstReg, TmpReg, LoOffset, IDLoc, STI); } -/// Emit a load instruction with an symbol offset. Symbols are assumed to be -/// out of range for a simm16 will be expanded to appropriate instructions. -/// DstReg and TmpReg are permitted to be the same register iff DstReg is a -/// GPR. It is the callers responsibility to identify such cases and pass the -/// appropriate register in TmpReg. -void MipsTargetStreamer::emitLoadWithSymOffset(unsigned Opcode, unsigned DstReg, - unsigned BaseReg, - MCOperand &HiOperand, - MCOperand &LoOperand, - unsigned TmpReg, SMLoc IDLoc, - const MCSubtargetInfo *STI) { - // 1) lw $8, sym => lui $8, %hi(sym) - // lw $8, %lo(sym)($8) - // 2) ldc1 $f0, sym => lui $at, %hi(sym) - // ldc1 $f0, %lo(sym)($at) - - // Generate the base address in TmpReg. - emitRX(Mips::LUi, TmpReg, HiOperand, IDLoc, STI); - if (BaseReg != Mips::ZERO) - emitRRR(Mips::ADDu, TmpReg, TmpReg, BaseReg, IDLoc, STI); - // Emit the load with the adjusted base and offset. - emitRRX(Opcode, DstReg, TmpReg, LoOperand, IDLoc, STI); -} - MipsTargetAsmStreamer::MipsTargetAsmStreamer(MCStreamer &S, formatted_raw_ostream &OS) : MipsTargetStreamer(S), OS(OS) {} @@ -891,9 +901,9 @@ void MipsTargetELFStreamer::finish() { MCSection &BSSSection = *OFI.getBSSSection(); MCA.registerSection(BSSSection); - TextSection.setAlignment(std::max(16u, TextSection.getAlignment())); - DataSection.setAlignment(std::max(16u, DataSection.getAlignment())); - BSSSection.setAlignment(std::max(16u, BSSSection.getAlignment())); + TextSection.setAlignment(Align(std::max(16u, TextSection.getAlignment()))); + DataSection.setAlignment(Align(std::max(16u, DataSection.getAlignment()))); + BSSSection.setAlignment(Align(std::max(16u, BSSSection.getAlignment()))); if (RoundSectionSizes) { // Make sections sizes a multiple of the alignment. 
This is useful for @@ -1016,7 +1026,7 @@ void MipsTargetELFStreamer::emitDirectiveEnd(StringRef Name) { MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_None, Context); MCA.registerSection(*Sec); - Sec->setAlignment(4); + Sec->setAlignment(Align(4)); OS.PushSection(); @@ -1306,7 +1316,7 @@ void MipsTargetELFStreamer::emitMipsAbiFlags() { MCSectionELF *Sec = Context.getELFSection( ".MIPS.abiflags", ELF::SHT_MIPS_ABIFLAGS, ELF::SHF_ALLOC, 24, ""); MCA.registerSection(*Sec); - Sec->setAlignment(8); + Sec->setAlignment(Align(8)); OS.SwitchSection(Sec); OS << ABIFlagsSection; diff --git a/lib/Target/Mips/MicroMipsDSPInstrInfo.td b/lib/Target/Mips/MicroMipsDSPInstrInfo.td index 5a12568893af..9a1e47e5ecca 100644 --- a/lib/Target/Mips/MicroMipsDSPInstrInfo.td +++ b/lib/Target/Mips/MicroMipsDSPInstrInfo.td @@ -360,7 +360,7 @@ class RDDSP_MM_DESC { dag OutOperandList = (outs GPR32Opnd:$rt); dag InOperandList = (ins uimm7:$mask); string AsmString = !strconcat("rddsp", "\t$rt, $mask"); - list Pattern = [(set GPR32Opnd:$rt, (int_mips_rddsp immZExt7:$mask))]; + list Pattern = [(set GPR32Opnd:$rt, (int_mips_rddsp timmZExt7:$mask))]; InstrItinClass Itinerary = NoItinerary; } @@ -383,7 +383,7 @@ class WRDSP_MM_DESC { dag OutOperandList = (outs); dag InOperandList = (ins GPR32Opnd:$rt, uimm7:$mask); string AsmString = !strconcat("wrdsp", "\t$rt, $mask"); - list Pattern = [(int_mips_wrdsp GPR32Opnd:$rt, immZExt7:$mask)]; + list Pattern = [(int_mips_wrdsp GPR32Opnd:$rt, timmZExt7:$mask)]; InstrItinClass Itinerary = NoItinerary; bit isMoveReg = 1; } diff --git a/lib/Target/Mips/MicroMipsInstrInfo.td b/lib/Target/Mips/MicroMipsInstrInfo.td index 9b7f7b25fa94..8cc0029fc896 100644 --- a/lib/Target/Mips/MicroMipsInstrInfo.td +++ b/lib/Target/Mips/MicroMipsInstrInfo.td @@ -955,17 +955,18 @@ let DecoderNamespace = "MicroMips" in { EXT_FM_MM<0x0c>, ISA_MICROMIPS32_NOT_MIPS32R6; /// Jump Instructions - let DecoderMethod = "DecodeJumpTargetMM" in + let DecoderMethod = "DecodeJumpTargetMM" in { def J_MM : MMRel, JumpFJ, J_FM_MM<0x35>, AdditionalRequires<[RelocNotPIC]>, IsBranch, ISA_MICROMIPS32_NOT_MIPS32R6; - - let DecoderMethod = "DecodeJumpTargetMM" in { def JAL_MM : MMRel, JumpLink<"jal", calltarget_mm>, J_FM_MM<0x3d>, ISA_MICROMIPS32_NOT_MIPS32R6; + } + + let DecoderMethod = "DecodeJumpTargetXMM" in def JALX_MM : MMRel, JumpLink<"jalx", calltarget>, J_FM_MM<0x3c>, ISA_MICROMIPS32_NOT_MIPS32R6; - } + def JR_MM : MMRel, IndirectBranch<"jr", GPR32Opnd>, JR_FM_MM<0x3c>, ISA_MICROMIPS32_NOT_MIPS32R6; def JALR_MM : JumpLinkReg<"jalr", GPR32Opnd>, JALR_FM_MM<0x03c>, diff --git a/lib/Target/Mips/MicroMipsSizeReduction.cpp b/lib/Target/Mips/MicroMipsSizeReduction.cpp index 70af95592aa5..db93b3d80ede 100644 --- a/lib/Target/Mips/MicroMipsSizeReduction.cpp +++ b/lib/Target/Mips/MicroMipsSizeReduction.cpp @@ -361,7 +361,7 @@ static bool CheckXWPInstr(MachineInstr *MI, bool ReduceToLwp, MI->getOpcode() == Mips::SW16_MM)) return false; - unsigned reg = MI->getOperand(0).getReg(); + Register reg = MI->getOperand(0).getReg(); if (reg == Mips::RA) return false; @@ -403,8 +403,8 @@ static bool ConsecutiveInstr(MachineInstr *MI1, MachineInstr *MI2) { if (!GetImm(MI2, 2, Offset2)) return false; - unsigned Reg1 = MI1->getOperand(0).getReg(); - unsigned Reg2 = MI2->getOperand(0).getReg(); + Register Reg1 = MI1->getOperand(0).getReg(); + Register Reg2 = MI2->getOperand(0).getReg(); return ((Offset1 == (Offset2 - 4)) && (ConsecutiveRegisters(Reg1, Reg2))); } @@ -475,8 +475,8 @@ bool MicroMipsSizeReduce::ReduceXWtoXWP(ReduceEntryFunArgs 
*Arguments) { if (!CheckXWPInstr(MI2, ReduceToLwp, Entry)) return false; - unsigned Reg1 = MI1->getOperand(1).getReg(); - unsigned Reg2 = MI2->getOperand(1).getReg(); + Register Reg1 = MI1->getOperand(1).getReg(); + Register Reg2 = MI2->getOperand(1).getReg(); if (Reg1 != Reg2) return false; @@ -621,8 +621,8 @@ bool MicroMipsSizeReduce::ReduceMoveToMovep(ReduceEntryFunArgs *Arguments) { MachineInstr *MI1 = Arguments->MI; MachineInstr *MI2 = &*NextMII; - unsigned RegDstMI1 = MI1->getOperand(0).getReg(); - unsigned RegSrcMI1 = MI1->getOperand(1).getReg(); + Register RegDstMI1 = MI1->getOperand(0).getReg(); + Register RegSrcMI1 = MI1->getOperand(1).getReg(); if (!IsMovepSrcRegister(RegSrcMI1)) return false; @@ -633,8 +633,8 @@ bool MicroMipsSizeReduce::ReduceMoveToMovep(ReduceEntryFunArgs *Arguments) { if (MI2->getOpcode() != Entry.WideOpc()) return false; - unsigned RegDstMI2 = MI2->getOperand(0).getReg(); - unsigned RegSrcMI2 = MI2->getOperand(1).getReg(); + Register RegDstMI2 = MI2->getOperand(0).getReg(); + Register RegSrcMI2 = MI2->getOperand(1).getReg(); if (!IsMovepSrcRegister(RegSrcMI2)) return false; diff --git a/lib/Target/Mips/Mips.td b/lib/Target/Mips/Mips.td index 7b83ea8535ae..a5908362e81f 100644 --- a/lib/Target/Mips/Mips.td +++ b/lib/Target/Mips/Mips.td @@ -25,6 +25,8 @@ class PredicateControl { list GPRPredicates = []; // Predicates for the PTR size such as IsPTR64bit list PTRPredicates = []; + // Predicates for a symbol's size such as hasSym32. + list SYMPredicates = []; // Predicates for the FGR size and layout such as IsFP64bit list FGRPredicates = []; // Predicates for the instruction group membership such as ISA's. @@ -38,6 +40,7 @@ class PredicateControl { list Predicates = !listconcat(EncodingPredicates, GPRPredicates, PTRPredicates, + SYMPredicates, FGRPredicates, InsnPredicates, HardFloatPredicate, @@ -206,6 +209,9 @@ def FeatureMT : SubtargetFeature<"mt", "HasMT", "true", "Mips MT ASE">; def FeatureLongCalls : SubtargetFeature<"long-calls", "UseLongCalls", "true", "Disable use of the jal instruction">; +def FeatureXGOT + : SubtargetFeature<"xgot", "UseXGOT", "true", "Assume 32-bit GOT">; + def FeatureUseIndirectJumpsHazard : SubtargetFeature<"use-indirect-jump-hazard", "UseIndirectJumpsHazard", "true", "Use indirect jump" @@ -257,3 +263,9 @@ def Mips : Target { let AssemblyParserVariants = [MipsAsmParserVariant]; let AllowRegisterRenaming = 1; } + +//===----------------------------------------------------------------------===// +// Pfm Counters +//===----------------------------------------------------------------------===// + +include "MipsPfmCounters.td" diff --git a/lib/Target/Mips/Mips16ISelDAGToDAG.cpp b/lib/Target/Mips/Mips16ISelDAGToDAG.cpp index 3ab4f1e064da..768d54fc9c24 100644 --- a/lib/Target/Mips/Mips16ISelDAGToDAG.cpp +++ b/lib/Target/Mips/Mips16ISelDAGToDAG.cpp @@ -72,7 +72,7 @@ void Mips16DAGToDAGISel::initGlobalBaseReg(MachineFunction &MF) { MachineRegisterInfo &RegInfo = MF.getRegInfo(); const TargetInstrInfo &TII = *Subtarget->getInstrInfo(); DebugLoc DL; - unsigned V0, V1, V2, GlobalBaseReg = MipsFI->getGlobalBaseReg(); + Register V0, V1, V2, GlobalBaseReg = MipsFI->getGlobalBaseReg(); const TargetRegisterClass *RC = &Mips::CPU16RegsRegClass; V0 = RegInfo.createVirtualRegister(RC); diff --git a/lib/Target/Mips/Mips16ISelLowering.cpp b/lib/Target/Mips/Mips16ISelLowering.cpp index 6d8e5aef2a3f..5a5b78c9d5f9 100644 --- a/lib/Target/Mips/Mips16ISelLowering.cpp +++ b/lib/Target/Mips/Mips16ISelLowering.cpp @@ -708,8 +708,8 @@ 
Mips16TargetLowering::emitFEXT_T8I816_ins(unsigned BtOpc, unsigned CmpOpc, if (DontExpandCondPseudos16) return BB; const TargetInstrInfo *TII = Subtarget.getInstrInfo(); - unsigned regX = MI.getOperand(0).getReg(); - unsigned regY = MI.getOperand(1).getReg(); + Register regX = MI.getOperand(0).getReg(); + Register regY = MI.getOperand(1).getReg(); MachineBasicBlock *target = MI.getOperand(2).getMBB(); BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(CmpOpc)) .addReg(regX) @@ -725,7 +725,7 @@ MachineBasicBlock *Mips16TargetLowering::emitFEXT_T8I8I16_ins( if (DontExpandCondPseudos16) return BB; const TargetInstrInfo *TII = Subtarget.getInstrInfo(); - unsigned regX = MI.getOperand(0).getReg(); + Register regX = MI.getOperand(0).getReg(); int64_t imm = MI.getOperand(1).getImm(); MachineBasicBlock *target = MI.getOperand(2).getMBB(); unsigned CmpOpc; @@ -758,9 +758,9 @@ Mips16TargetLowering::emitFEXT_CCRX16_ins(unsigned SltOpc, MachineInstr &MI, if (DontExpandCondPseudos16) return BB; const TargetInstrInfo *TII = Subtarget.getInstrInfo(); - unsigned CC = MI.getOperand(0).getReg(); - unsigned regX = MI.getOperand(1).getReg(); - unsigned regY = MI.getOperand(2).getReg(); + Register CC = MI.getOperand(0).getReg(); + Register regX = MI.getOperand(1).getReg(); + Register regY = MI.getOperand(2).getReg(); BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SltOpc)) .addReg(regX) .addReg(regY); @@ -777,8 +777,8 @@ Mips16TargetLowering::emitFEXT_CCRXI16_ins(unsigned SltiOpc, unsigned SltiXOpc, if (DontExpandCondPseudos16) return BB; const TargetInstrInfo *TII = Subtarget.getInstrInfo(); - unsigned CC = MI.getOperand(0).getReg(); - unsigned regX = MI.getOperand(1).getReg(); + Register CC = MI.getOperand(0).getReg(); + Register regX = MI.getOperand(1).getReg(); int64_t Imm = MI.getOperand(2).getImm(); unsigned SltOpc = Mips16WhichOp8uOr16simm(SltiOpc, SltiXOpc, Imm); BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SltOpc)).addReg(regX).addImm(Imm); diff --git a/lib/Target/Mips/Mips16InstrInfo.cpp b/lib/Target/Mips/Mips16InstrInfo.cpp index c234c309d760..0d735c20ec2f 100644 --- a/lib/Target/Mips/Mips16InstrInfo.cpp +++ b/lib/Target/Mips/Mips16InstrInfo.cpp @@ -358,7 +358,7 @@ unsigned Mips16InstrInfo::loadImmediate(unsigned FrameReg, int64_t Imm, for (unsigned i = 0, e = II->getNumOperands(); i != e; ++i) { MachineOperand &MO = II->getOperand(i); if (MO.isReg() && MO.getReg() != 0 && !MO.isDef() && - !TargetRegisterInfo::isVirtualRegister(MO.getReg())) + !Register::isVirtualRegister(MO.getReg())) Candidates.reset(MO.getReg()); } diff --git a/lib/Target/Mips/Mips64InstrInfo.td b/lib/Target/Mips/Mips64InstrInfo.td index 7f35280f7936..cc15949b0d57 100644 --- a/lib/Target/Mips/Mips64InstrInfo.td +++ b/lib/Target/Mips/Mips64InstrInfo.td @@ -16,6 +16,7 @@ // shamt must fit in 6 bits. def immZExt6 : ImmLeaf; +def timmZExt6 : TImmLeaf; // Node immediate fits as 10-bit sign extended on target immediate. // e.g. 
seqi, snei @@ -651,6 +652,7 @@ def : MipsPat<(MipsTlsHi tglobaltlsaddr:$in), (LUi64 tglobaltlsaddr:$in)>, let AdditionalPredicates = [NotInMicroMips] in { def : MipsPat<(MipsJmpLink (i64 texternalsym:$dst)), (JAL texternalsym:$dst)>, ISA_MIPS3, GPR_64, SYM_64; + def : MipsPat<(MipsHighest (i64 tglobaladdr:$in)), (LUi64 tglobaladdr:$in)>, ISA_MIPS3, GPR_64, SYM_64; def : MipsPat<(MipsHighest (i64 tblockaddress:$in)), @@ -682,6 +684,20 @@ let AdditionalPredicates = [NotInMicroMips] in { (DADDiu GPR64:$hi, tjumptable:$lo)>, ISA_MIPS3, GPR_64, SYM_64; def : MipsPat<(add GPR64:$hi, (MipsHigher (i64 tconstpool:$lo))), (DADDiu GPR64:$hi, tconstpool:$lo)>, ISA_MIPS3, GPR_64, SYM_64; + def : MipsPat<(add GPR64:$hi, (MipsHigher (i64 texternalsym:$lo))), + (DADDiu GPR64:$hi, texternalsym:$lo)>, + ISA_MIPS3, GPR_64, SYM_64; + + def : MipsPat<(MipsHi (i64 tglobaladdr:$in)), + (DADDiu ZERO_64, tglobaladdr:$in)>, ISA_MIPS3, GPR_64, SYM_64; + def : MipsPat<(MipsHi (i64 tblockaddress:$in)), + (DADDiu ZERO_64, tblockaddress:$in)>, ISA_MIPS3, GPR_64, SYM_64; + def : MipsPat<(MipsHi (i64 tjumptable:$in)), + (DADDiu ZERO_64, tjumptable:$in)>, ISA_MIPS3, GPR_64, SYM_64; + def : MipsPat<(MipsHi (i64 tconstpool:$in)), + (DADDiu ZERO_64, tconstpool:$in)>, ISA_MIPS3, GPR_64, SYM_64; + def : MipsPat<(MipsHi (i64 texternalsym:$in)), + (DADDiu ZERO_64, texternalsym:$in)>, ISA_MIPS3, GPR_64, SYM_64; def : MipsPat<(add GPR64:$hi, (MipsHi (i64 tglobaladdr:$lo))), (DADDiu GPR64:$hi, tglobaladdr:$lo)>, ISA_MIPS3, GPR_64, SYM_64; @@ -692,6 +708,23 @@ let AdditionalPredicates = [NotInMicroMips] in { (DADDiu GPR64:$hi, tjumptable:$lo)>, ISA_MIPS3, GPR_64, SYM_64; def : MipsPat<(add GPR64:$hi, (MipsHi (i64 tconstpool:$lo))), (DADDiu GPR64:$hi, tconstpool:$lo)>, ISA_MIPS3, GPR_64, SYM_64; + def : MipsPat<(add GPR64:$hi, (MipsHi (i64 texternalsym:$lo))), + (DADDiu GPR64:$hi, texternalsym:$lo)>, + ISA_MIPS3, GPR_64, SYM_64; + + def : MipsPat<(MipsLo (i64 tglobaladdr:$in)), + (DADDiu ZERO_64, tglobaladdr:$in)>, ISA_MIPS3, GPR_64, SYM_64; + def : MipsPat<(MipsLo (i64 tblockaddress:$in)), + (DADDiu ZERO_64, tblockaddress:$in)>, ISA_MIPS3, GPR_64, SYM_64; + def : MipsPat<(MipsLo (i64 tjumptable:$in)), + (DADDiu ZERO_64, tjumptable:$in)>, ISA_MIPS3, GPR_64, SYM_64; + def : MipsPat<(MipsLo (i64 tconstpool:$in)), + (DADDiu ZERO_64, tconstpool:$in)>, ISA_MIPS3, GPR_64, SYM_64; + def : MipsPat<(MipsLo (i64 tglobaltlsaddr:$in)), + (DADDiu ZERO_64, tglobaltlsaddr:$in)>, + ISA_MIPS3, GPR_64, SYM_64; + def : MipsPat<(MipsLo (i64 texternalsym:$in)), + (DADDiu ZERO_64, texternalsym:$in)>, ISA_MIPS3, GPR_64, SYM_64; def : MipsPat<(add GPR64:$hi, (MipsLo (i64 tglobaladdr:$lo))), (DADDiu GPR64:$hi, tglobaladdr:$lo)>, ISA_MIPS3, GPR_64, SYM_64; @@ -705,6 +738,9 @@ let AdditionalPredicates = [NotInMicroMips] in { def : MipsPat<(add GPR64:$hi, (MipsLo (i64 tglobaltlsaddr:$lo))), (DADDiu GPR64:$hi, tglobaltlsaddr:$lo)>, ISA_MIPS3, GPR_64, SYM_64; + def : MipsPat<(add GPR64:$hi, (MipsLo (i64 texternalsym:$lo))), + (DADDiu GPR64:$hi, texternalsym:$lo)>, + ISA_MIPS3, GPR_64, SYM_64; } // gp_rel relocs diff --git a/lib/Target/Mips/MipsAsmPrinter.cpp b/lib/Target/Mips/MipsAsmPrinter.cpp index db83fe49cec0..2201545adc94 100644 --- a/lib/Target/Mips/MipsAsmPrinter.cpp +++ b/lib/Target/Mips/MipsAsmPrinter.cpp @@ -56,6 +56,7 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/TargetRegistry.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetLoweringObjectFile.h" #include "llvm/Target/TargetMachine.h" #include #include @@ -376,7 
+377,7 @@ void MipsAsmPrinter::printSavedRegsBitmask() { void MipsAsmPrinter::emitFrameDirective() { const TargetRegisterInfo &RI = *MF->getSubtarget().getRegisterInfo(); - unsigned stackReg = RI.getFrameRegister(*MF); + Register stackReg = RI.getFrameRegister(*MF); unsigned returnReg = RI.getRARegister(); unsigned stackSize = MF->getFrameInfo().getStackSize(); @@ -571,7 +572,7 @@ bool MipsAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum, // for 2 for 32 bit mode and 1 for 64 bit mode. if (NumVals != 2) { if (Subtarget->isGP64bit() && NumVals == 1 && MO.isReg()) { - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); O << '$' << MipsInstPrinter::getRegisterName(Reg); return false; } @@ -597,7 +598,7 @@ bool MipsAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum, const MachineOperand &MO = MI->getOperand(RegOp); if (!MO.isReg()) return true; - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); O << '$' << MipsInstPrinter::getRegisterName(Reg); return false; } @@ -780,7 +781,7 @@ void MipsAsmPrinter::EmitStartOfAsmFile(Module &M) { StringRef CPU = MIPS_MC::selectMipsCPU(TT, TM.getTargetCPU()); StringRef FS = TM.getTargetFeatureString(); const MipsTargetMachine &MTM = static_cast(TM); - const MipsSubtarget STI(TT, CPU, FS, MTM.isLittleEndian(), MTM, 0); + const MipsSubtarget STI(TT, CPU, FS, MTM.isLittleEndian(), MTM, None); bool IsABICalls = STI.isABICalls(); const MipsABIInfo &ABI = MTM.getABI(); @@ -821,6 +822,9 @@ void MipsAsmPrinter::EmitStartOfAsmFile(Module &M) { // option has changed the default (i.e. FPXX) and omit it otherwise. if (ABI.IsO32() && (!STI.useOddSPReg() || STI.isABI_FPXX())) TS.emitDirectiveModuleOddSPReg(); + + // Switch to the .text section. + OutStreamer->SwitchSection(getObjFileLowering().getTextSection()); } void MipsAsmPrinter::emitInlineAsmStart() const { diff --git a/lib/Target/Mips/MipsCallLowering.cpp b/lib/Target/Mips/MipsCallLowering.cpp index da65689ecff5..cad82953af50 100644 --- a/lib/Target/Mips/MipsCallLowering.cpp +++ b/lib/Target/Mips/MipsCallLowering.cpp @@ -106,6 +106,7 @@ private: Register ArgsReg, const EVT &VT) override; virtual void markPhysRegUsed(unsigned PhysReg) { + MIRBuilder.getMRI()->addLiveIn(PhysReg); MIRBuilder.getMBB().addLiveIn(PhysReg); } @@ -357,7 +358,7 @@ bool OutgoingValueHandler::handleSplit(SmallVectorImpl &VRegs, return true; } -static bool isSupportedType(Type *T) { +static bool isSupportedArgumentType(Type *T) { if (T->isIntegerTy()) return true; if (T->isPointerTy()) @@ -367,6 +368,18 @@ static bool isSupportedType(Type *T) { return false; } +static bool isSupportedReturnType(Type *T) { + if (T->isIntegerTy()) + return true; + if (T->isPointerTy()) + return true; + if (T->isFloatingPointTy()) + return true; + if (T->isAggregateType()) + return true; + return false; +} + static CCValAssign::LocInfo determineLocInfo(const MVT RegisterVT, const EVT VT, const ISD::ArgFlagsTy &Flags) { // > does not mean loss of information as type RegisterVT can't hold type VT, @@ -403,7 +416,7 @@ bool MipsCallLowering::lowerReturn(MachineIRBuilder &MIRBuilder, MachineInstrBuilder Ret = MIRBuilder.buildInstrNoInsert(Mips::RetRA); - if (Val != nullptr && !isSupportedType(Val->getType())) + if (Val != nullptr && !isSupportedReturnType(Val->getType())) return false; if (!VRegs.empty()) { @@ -411,21 +424,13 @@ bool MipsCallLowering::lowerReturn(MachineIRBuilder &MIRBuilder, const Function &F = MF.getFunction(); const DataLayout &DL = MF.getDataLayout(); const MipsTargetLowering &TLI = *getTLI(); - 
LLVMContext &Ctx = Val->getType()->getContext(); - - SmallVector SplitEVTs; - ComputeValueVTs(TLI, DL, Val->getType(), SplitEVTs); - assert(VRegs.size() == SplitEVTs.size() && - "For each split Type there should be exactly one VReg."); SmallVector RetInfos; SmallVector OrigArgIndices; - for (unsigned i = 0; i < SplitEVTs.size(); ++i) { - ArgInfo CurArgInfo = ArgInfo{VRegs[i], SplitEVTs[i].getTypeForEVT(Ctx)}; - setArgFlags(CurArgInfo, AttributeList::ReturnIndex, DL, F); - splitToValueTypes(CurArgInfo, 0, RetInfos, OrigArgIndices); - } + ArgInfo ArgRetInfo(VRegs, Val->getType()); + setArgFlags(ArgRetInfo, AttributeList::ReturnIndex, DL, F); + splitToValueTypes(DL, ArgRetInfo, 0, RetInfos, OrigArgIndices); SmallVector Outs; subTargetRegTypeForCallingConv(F, RetInfos, OrigArgIndices, Outs); @@ -453,12 +458,8 @@ bool MipsCallLowering::lowerFormalArguments( if (F.arg_empty()) return true; - if (F.isVarArg()) { - return false; - } - for (auto &Arg : F.args()) { - if (!isSupportedType(Arg.getType())) + if (!isSupportedArgumentType(Arg.getType())) return false; } @@ -472,7 +473,8 @@ bool MipsCallLowering::lowerFormalArguments( for (auto &Arg : F.args()) { ArgInfo AInfo(VRegs[i], Arg.getType()); setArgFlags(AInfo, i + AttributeList::FirstArgIndex, DL, F); - splitToValueTypes(AInfo, i, ArgInfos, OrigArgIndices); + ArgInfos.push_back(AInfo); + OrigArgIndices.push_back(i); ++i; } @@ -495,30 +497,64 @@ bool MipsCallLowering::lowerFormalArguments( if (!Handler.handle(ArgLocs, ArgInfos)) return false; + if (F.isVarArg()) { + ArrayRef ArgRegs = ABI.GetVarArgRegs(); + unsigned Idx = CCInfo.getFirstUnallocated(ArgRegs); + + int VaArgOffset; + unsigned RegSize = 4; + if (ArgRegs.size() == Idx) + VaArgOffset = alignTo(CCInfo.getNextStackOffset(), RegSize); + else { + VaArgOffset = + (int)ABI.GetCalleeAllocdArgSizeInBytes(CCInfo.getCallingConv()) - + (int)(RegSize * (ArgRegs.size() - Idx)); + } + + MachineFrameInfo &MFI = MF.getFrameInfo(); + int FI = MFI.CreateFixedObject(RegSize, VaArgOffset, true); + MF.getInfo()->setVarArgsFrameIndex(FI); + + for (unsigned I = Idx; I < ArgRegs.size(); ++I, VaArgOffset += RegSize) { + MIRBuilder.getMBB().addLiveIn(ArgRegs[I]); + + MachineInstrBuilder Copy = + MIRBuilder.buildCopy(LLT::scalar(RegSize * 8), Register(ArgRegs[I])); + FI = MFI.CreateFixedObject(RegSize, VaArgOffset, true); + MachinePointerInfo MPO = MachinePointerInfo::getFixedStack(MF, FI); + MachineInstrBuilder FrameIndex = + MIRBuilder.buildFrameIndex(LLT::pointer(MPO.getAddrSpace(), 32), FI); + MachineMemOperand *MMO = + MF.getMachineMemOperand(MPO, MachineMemOperand::MOStore, RegSize, + /* Alignment */ RegSize); + MIRBuilder.buildStore(Copy, FrameIndex, *MMO); + } + } + return true; } bool MipsCallLowering::lowerCall(MachineIRBuilder &MIRBuilder, - CallingConv::ID CallConv, - const MachineOperand &Callee, - const ArgInfo &OrigRet, - ArrayRef OrigArgs) const { + CallLoweringInfo &Info) const { - if (CallConv != CallingConv::C) + if (Info.CallConv != CallingConv::C) return false; - for (auto &Arg : OrigArgs) { - if (!isSupportedType(Arg.Ty)) + for (auto &Arg : Info.OrigArgs) { + if (!isSupportedArgumentType(Arg.Ty)) + return false; + if (Arg.Flags[0].isByVal()) return false; - if (Arg.Flags.isByVal() || Arg.Flags.isSRet()) + if (Arg.Flags[0].isSRet() && !Arg.Ty->isPointerTy()) return false; } - if (OrigRet.Regs[0] && !isSupportedType(OrigRet.Ty)) + if (!Info.OrigRet.Ty->isVoidTy() && !isSupportedReturnType(Info.OrigRet.Ty)) return false; MachineFunction &MF = MIRBuilder.getMF(); const Function &F = 
MF.getFunction(); + const DataLayout &DL = MF.getDataLayout(); const MipsTargetLowering &TLI = *getTLI(); const MipsTargetMachine &TM = static_cast(MF.getTarget()); @@ -528,37 +564,38 @@ bool MipsCallLowering::lowerCall(MachineIRBuilder &MIRBuilder, MIRBuilder.buildInstr(Mips::ADJCALLSTACKDOWN); const bool IsCalleeGlobalPIC = - Callee.isGlobal() && TM.isPositionIndependent(); + Info.Callee.isGlobal() && TM.isPositionIndependent(); MachineInstrBuilder MIB = MIRBuilder.buildInstrNoInsert( - Callee.isReg() || IsCalleeGlobalPIC ? Mips::JALRPseudo : Mips::JAL); + Info.Callee.isReg() || IsCalleeGlobalPIC ? Mips::JALRPseudo : Mips::JAL); MIB.addDef(Mips::SP, RegState::Implicit); if (IsCalleeGlobalPIC) { Register CalleeReg = MF.getRegInfo().createGenericVirtualRegister(LLT::pointer(0, 32)); MachineInstr *CalleeGlobalValue = - MIRBuilder.buildGlobalValue(CalleeReg, Callee.getGlobal()); - if (!Callee.getGlobal()->hasLocalLinkage()) + MIRBuilder.buildGlobalValue(CalleeReg, Info.Callee.getGlobal()); + if (!Info.Callee.getGlobal()->hasLocalLinkage()) CalleeGlobalValue->getOperand(1).setTargetFlags(MipsII::MO_GOT_CALL); MIB.addUse(CalleeReg); } else - MIB.add(Callee); + MIB.add(Info.Callee); const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); MIB.addRegMask(TRI->getCallPreservedMask(MF, F.getCallingConv())); TargetLowering::ArgListTy FuncOrigArgs; - FuncOrigArgs.reserve(OrigArgs.size()); + FuncOrigArgs.reserve(Info.OrigArgs.size()); SmallVector ArgInfos; SmallVector OrigArgIndices; unsigned i = 0; - for (auto &Arg : OrigArgs) { + for (auto &Arg : Info.OrigArgs) { TargetLowering::ArgListEntry Entry; Entry.Ty = Arg.Ty; FuncOrigArgs.push_back(Entry); - splitToValueTypes(Arg, i, ArgInfos, OrigArgIndices); + ArgInfos.push_back(Arg); + OrigArgIndices.push_back(i); ++i; } @@ -566,11 +603,17 @@ bool MipsCallLowering::lowerCall(MachineIRBuilder &MIRBuilder, subTargetRegTypeForCallingConv(F, ArgInfos, OrigArgIndices, Outs); SmallVector ArgLocs; - MipsCCState CCInfo(F.getCallingConv(), F.isVarArg(), MF, ArgLocs, + bool IsCalleeVarArg = false; + if (Info.Callee.isGlobal()) { + const Function *CF = static_cast(Info.Callee.getGlobal()); + IsCalleeVarArg = CF->isVarArg(); + } + MipsCCState CCInfo(F.getCallingConv(), IsCalleeVarArg, MF, ArgLocs, F.getContext()); - CCInfo.AllocateStack(ABI.GetCalleeAllocdArgSizeInBytes(CallConv), 1); - const char *Call = Callee.isSymbol() ? Callee.getSymbolName() : nullptr; + CCInfo.AllocateStack(ABI.GetCalleeAllocdArgSizeInBytes(Info.CallConv), 1); + const char *Call = + Info.Callee.isSymbol() ? 
Info.Callee.getSymbolName() : nullptr; CCInfo.AnalyzeCallOperands(Outs, TLI.CCAssignFnForCall(), FuncOrigArgs, Call); setLocInfo(ArgLocs, Outs); @@ -599,11 +642,11 @@ bool MipsCallLowering::lowerCall(MachineIRBuilder &MIRBuilder, *STI.getRegBankInfo()); } - if (OrigRet.Regs[0]) { + if (!Info.OrigRet.Ty->isVoidTy()) { ArgInfos.clear(); SmallVector OrigRetIndices; - splitToValueTypes(OrigRet, 0, ArgInfos, OrigRetIndices); + splitToValueTypes(DL, Info.OrigRet, 0, ArgInfos, OrigRetIndices); SmallVector Ins; subTargetRegTypeForCallingConv(F, ArgInfos, OrigRetIndices, Ins); @@ -612,7 +655,7 @@ bool MipsCallLowering::lowerCall(MachineIRBuilder &MIRBuilder, MipsCCState CCInfo(F.getCallingConv(), F.isVarArg(), MF, ArgLocs, F.getContext()); - CCInfo.AnalyzeCallResult(Ins, TLI.CCAssignFnForReturn(), OrigRet.Ty, Call); + CCInfo.AnalyzeCallResult(Ins, TLI.CCAssignFnForReturn(), Info.OrigRet.Ty, Call); setLocInfo(ArgLocs, Ins); CallReturnHandler Handler(MIRBuilder, MF.getRegInfo(), MIB); @@ -642,12 +685,12 @@ void MipsCallLowering::subTargetRegTypeForCallingConv( F.getContext(), F.getCallingConv(), VT); for (unsigned i = 0; i < NumRegs; ++i) { - ISD::ArgFlagsTy Flags = Arg.Flags; + ISD::ArgFlagsTy Flags = Arg.Flags[0]; if (i == 0) Flags.setOrigAlign(TLI.getABIAlignmentForCallingConv(Arg.Ty, DL)); else - Flags.setOrigAlign(1); + Flags.setOrigAlign(Align::None()); ISDArgs.emplace_back(Flags, RegisterVT, VT, true, OrigArgIndices[ArgNo], 0); @@ -657,12 +700,21 @@ void MipsCallLowering::subTargetRegTypeForCallingConv( } void MipsCallLowering::splitToValueTypes( - const ArgInfo &OrigArg, unsigned OriginalIndex, + const DataLayout &DL, const ArgInfo &OrigArg, unsigned OriginalIndex, SmallVectorImpl &SplitArgs, SmallVectorImpl &SplitArgsOrigIndices) const { - // TODO : perform structure and array split. For now we only deal with - // types that pass isSupportedType check. - SplitArgs.push_back(OrigArg); - SplitArgsOrigIndices.push_back(OriginalIndex); + SmallVector SplitEVTs; + SmallVector SplitVRegs; + const MipsTargetLowering &TLI = *getTLI(); + LLVMContext &Ctx = OrigArg.Ty->getContext(); + + ComputeValueVTs(TLI, DL, OrigArg.Ty, SplitEVTs); + + for (unsigned i = 0; i < SplitEVTs.size(); ++i) { + ArgInfo Info = ArgInfo{OrigArg.Regs[i], SplitEVTs[i].getTypeForEVT(Ctx)}; + Info.Flags = OrigArg.Flags; + SplitArgs.push_back(Info); + SplitArgsOrigIndices.push_back(OriginalIndex); + } } diff --git a/lib/Target/Mips/MipsCallLowering.h b/lib/Target/Mips/MipsCallLowering.h index 11c2d53ad35d..a284cf5e26cf 100644 --- a/lib/Target/Mips/MipsCallLowering.h +++ b/lib/Target/Mips/MipsCallLowering.h @@ -68,9 +68,8 @@ public: bool lowerFormalArguments(MachineIRBuilder &MIRBuilder, const Function &F, ArrayRef> VRegs) const override; - bool lowerCall(MachineIRBuilder &MIRBuilder, CallingConv::ID CallConv, - const MachineOperand &Callee, const ArgInfo &OrigRet, - ArrayRef OrigArgs) const override; + bool lowerCall(MachineIRBuilder &MIRBuilder, + CallLoweringInfo &Info) const override; private: /// Based on registers available on target machine split or extend @@ -83,7 +82,8 @@ private: /// Split structures and arrays, save original argument indices since /// Mips calling convention needs info about original argument type. 
- void splitToValueTypes(const ArgInfo &OrigArg, unsigned OriginalIndex, + void splitToValueTypes(const DataLayout &DL, const ArgInfo &OrigArg, + unsigned OriginalIndex, SmallVectorImpl &SplitArgs, SmallVectorImpl &SplitArgsOrigIndices) const; }; diff --git a/lib/Target/Mips/MipsConstantIslandPass.cpp b/lib/Target/Mips/MipsConstantIslandPass.cpp index eea28df7eda1..f50640521738 100644 --- a/lib/Target/Mips/MipsConstantIslandPass.cpp +++ b/lib/Target/Mips/MipsConstantIslandPass.cpp @@ -222,12 +222,7 @@ namespace { BasicBlockInfo() = default; - // FIXME: ignore LogAlign for this patch - // - unsigned postOffset(unsigned LogAlign = 0) const { - unsigned PO = Offset + Size; - return PO; - } + unsigned postOffset() const { return Offset + Size; } }; std::vector BBInfo; @@ -376,7 +371,7 @@ namespace { void doInitialPlacement(std::vector &CPEMIs); CPEntry *findConstPoolEntry(unsigned CPI, const MachineInstr *CPEMI); - unsigned getCPELogAlign(const MachineInstr &CPEMI); + Align getCPEAlign(const MachineInstr &CPEMI); void initializeFunctionInfo(const std::vector &CPEMIs); unsigned getOffsetOf(MachineInstr *MI) const; unsigned getUserOffset(CPUser&) const; @@ -534,11 +529,11 @@ MipsConstantIslands::doInitialPlacement(std::vector &CPEMIs) { MF->push_back(BB); // MachineConstantPool measures alignment in bytes. We measure in log2(bytes). - unsigned MaxAlign = Log2_32(MCP->getConstantPoolAlignment()); + const Align MaxAlign(MCP->getConstantPoolAlignment()); // Mark the basic block as required by the const-pool. // If AlignConstantIslands isn't set, use 4-byte alignment for everything. - BB->setAlignment(AlignConstantIslands ? MaxAlign : 2); + BB->setAlignment(AlignConstantIslands ? MaxAlign : Align(4)); // The function needs to be as aligned as the basic blocks. The linker may // move functions around based on their alignment. @@ -548,7 +543,8 @@ MipsConstantIslands::doInitialPlacement(std::vector &CPEMIs) { // alignment of all entries as long as BB is sufficiently aligned. Keep // track of the insertion point for each alignment. We are going to bucket // sort the entries as they are created. - SmallVector InsPoint(MaxAlign + 1, BB->end()); + SmallVector InsPoint(Log2(MaxAlign) + 1, + BB->end()); // Add all of the constants from the constant pool to the end block, use an // identity mapping of CPI's to CPE's. @@ -576,7 +572,7 @@ MipsConstantIslands::doInitialPlacement(std::vector &CPEMIs) { // Ensure that future entries with higher alignment get inserted before // CPEMI. This is bucket sort with iterators. - for (unsigned a = LogAlign + 1; a <= MaxAlign; ++a) + for (unsigned a = LogAlign + 1; a <= Log2(MaxAlign); ++a) if (InsPoint[a] == InsAt) InsPoint[a] = CPEMI; // Add a new CPEntry, but no corresponding CPUser yet. @@ -621,20 +617,18 @@ MipsConstantIslands::CPEntry return nullptr; } -/// getCPELogAlign - Returns the required alignment of the constant pool entry +/// getCPEAlign - Returns the required alignment of the constant pool entry /// represented by CPEMI. Alignment is measured in log2(bytes) units. -unsigned MipsConstantIslands::getCPELogAlign(const MachineInstr &CPEMI) { +Align MipsConstantIslands::getCPEAlign(const MachineInstr &CPEMI) { assert(CPEMI.getOpcode() == Mips::CONSTPOOL_ENTRY); // Everything is 4-byte aligned unless AlignConstantIslands is set. 
if (!AlignConstantIslands) - return 2; + return Align(4); unsigned CPI = CPEMI.getOperand(1).getIndex(); assert(CPI < MCP->getConstants().size() && "Invalid constant pool index."); - unsigned Align = MCP->getConstants()[CPI].getAlignment(); - assert(isPowerOf2_32(Align) && "Invalid CPE alignment"); - return Log2_32(Align); + return Align(MCP->getConstants()[CPI].getAlignment()); } /// initializeFunctionInfo - Do the initial scan of the function, building up @@ -940,13 +934,13 @@ bool MipsConstantIslands::isOffsetInRange(unsigned UserOffset, bool MipsConstantIslands::isWaterInRange(unsigned UserOffset, MachineBasicBlock* Water, CPUser &U, unsigned &Growth) { - unsigned CPELogAlign = getCPELogAlign(*U.CPEMI); - unsigned CPEOffset = BBInfo[Water->getNumber()].postOffset(CPELogAlign); - unsigned NextBlockOffset, NextBlockAlignment; + unsigned CPEOffset = BBInfo[Water->getNumber()].postOffset(); + unsigned NextBlockOffset; + Align NextBlockAlignment; MachineFunction::const_iterator NextBlock = ++Water->getIterator(); if (NextBlock == MF->end()) { NextBlockOffset = BBInfo[Water->getNumber()].postOffset(); - NextBlockAlignment = 0; + NextBlockAlignment = Align::None(); } else { NextBlockOffset = BBInfo[NextBlock->getNumber()].Offset; NextBlockAlignment = NextBlock->getAlignment(); @@ -961,7 +955,7 @@ bool MipsConstantIslands::isWaterInRange(unsigned UserOffset, Growth = CPEEnd - NextBlockOffset; // Compute the padding that would go at the end of the CPE to align the next // block. - Growth += OffsetToAlignment(CPEEnd, 1ULL << NextBlockAlignment); + Growth += offsetToAlignment(CPEEnd, NextBlockAlignment); // If the CPE is to be inserted before the instruction, that will raise // the offset of the instruction. Also account for unknown alignment padding @@ -1221,7 +1215,6 @@ void MipsConstantIslands::createNewWater(unsigned CPUserIndex, CPUser &U = CPUsers[CPUserIndex]; MachineInstr *UserMI = U.MI; MachineInstr *CPEMI = U.CPEMI; - unsigned CPELogAlign = getCPELogAlign(*CPEMI); MachineBasicBlock *UserMBB = UserMI->getParent(); const BasicBlockInfo &UserBBI = BBInfo[UserMBB->getNumber()]; @@ -1231,7 +1224,7 @@ void MipsConstantIslands::createNewWater(unsigned CPUserIndex, // Size of branch to insert. unsigned Delta = 2; // Compute the offset where the CPE will begin. - unsigned CPEOffset = UserBBI.postOffset(CPELogAlign) + Delta; + unsigned CPEOffset = UserBBI.postOffset() + Delta; if (isOffsetInRange(UserOffset, CPEOffset, U)) { LLVM_DEBUG(dbgs() << "Split at end of " << printMBBReference(*UserMBB) @@ -1257,9 +1250,8 @@ void MipsConstantIslands::createNewWater(unsigned CPUserIndex, // Try to split the block so it's fully aligned. Compute the latest split // point where we can add a 4-byte branch instruction, and then align to - // LogAlign which is the largest possible alignment in the function. - unsigned LogAlign = MF->getAlignment(); - assert(LogAlign >= CPELogAlign && "Over-aligned constant pool entry"); + // Align which is the largest possible alignment in the function. 
+ const Align Align = MF->getAlignment(); unsigned BaseInsertOffset = UserOffset + U.getMaxDisp(); LLVM_DEBUG(dbgs() << format("Split in middle of big block before %#x", BaseInsertOffset)); @@ -1270,7 +1262,7 @@ void MipsConstantIslands::createNewWater(unsigned CPUserIndex, BaseInsertOffset -= 4; LLVM_DEBUG(dbgs() << format(", adjusted to %#x", BaseInsertOffset) - << " la=" << LogAlign << '\n'); + << " la=" << Log2(Align) << '\n'); // This could point off the end of the block if we've already got constant // pool entries following this block; only the last one is in the water list. @@ -1295,8 +1287,8 @@ void MipsConstantIslands::createNewWater(unsigned CPUserIndex, CPUser &U = CPUsers[CPUIndex]; if (!isOffsetInRange(Offset, EndInsertOffset, U)) { // Shift intertion point by one unit of alignment so it is within reach. - BaseInsertOffset -= 1u << LogAlign; - EndInsertOffset -= 1u << LogAlign; + BaseInsertOffset -= Align.value(); + EndInsertOffset -= Align.value(); } // This is overly conservative, as we don't account for CPEMIs being // reused within the block, but it doesn't matter much. Also assume CPEs @@ -1399,7 +1391,7 @@ bool MipsConstantIslands::handleConstantPoolUser(unsigned CPUserIndex) { ++NumCPEs; // Mark the basic block as aligned as required by the const-pool entry. - NewIsland->setAlignment(getCPELogAlign(*U.CPEMI)); + NewIsland->setAlignment(getCPEAlign(*U.CPEMI)); // Increase the size of the island block to account for the new entry. BBInfo[NewIsland->getNumber()].Size += Size; @@ -1431,10 +1423,11 @@ void MipsConstantIslands::removeDeadCPEMI(MachineInstr *CPEMI) { BBInfo[CPEBB->getNumber()].Size = 0; // This block no longer needs to be aligned. - CPEBB->setAlignment(0); - } else + CPEBB->setAlignment(Align(1)); + } else { // Entries are sorted by descending alignment, so realign from the front. - CPEBB->setAlignment(getCPELogAlign(*CPEBB->begin())); + CPEBB->setAlignment(getCPEAlign(*CPEBB->begin())); + } adjustBBOffsetsAfter(CPEBB); // An island has only one predecessor BB and one successor BB. Check if @@ -1529,7 +1522,7 @@ MipsConstantIslands::fixupUnconditionalBr(ImmBranch &Br) { // We should have a way to back out this alignment restriction if we "can" later. // but it is not harmful. 
// - DestBB->setAlignment(2); + DestBB->setAlignment(Align(4)); Br.MaxDisp = ((1<<24)-1) * 2; MI->setDesc(TII->get(Mips::JalB16)); } diff --git a/lib/Target/Mips/MipsDSPInstrInfo.td b/lib/Target/Mips/MipsDSPInstrInfo.td index daca8b907081..d3e68c014fb7 100644 --- a/lib/Target/Mips/MipsDSPInstrInfo.td +++ b/lib/Target/Mips/MipsDSPInstrInfo.td @@ -12,12 +12,19 @@ // ImmLeaf def immZExt1 : ImmLeaf(Imm);}]>; +def timmZExt1 : ImmLeaf(Imm);}], NOOP_SDNodeXForm, timm>; def immZExt2 : ImmLeaf(Imm);}]>; +def timmZExt2 : ImmLeaf(Imm);}], NOOP_SDNodeXForm, timm>; def immZExt3 : ImmLeaf(Imm);}]>; +def timmZExt3 : ImmLeaf(Imm);}], NOOP_SDNodeXForm, timm>; def immZExt4 : ImmLeaf(Imm);}]>; +def timmZExt4 : ImmLeaf(Imm);}], NOOP_SDNodeXForm, timm>; def immZExt8 : ImmLeaf(Imm);}]>; +def timmZExt8 : ImmLeaf(Imm);}], NOOP_SDNodeXForm, timm>; def immZExt10 : ImmLeaf(Imm);}]>; +def timmZExt10 : ImmLeaf(Imm);}], NOOP_SDNodeXForm, timm>; def immSExt6 : ImmLeaf(Imm);}]>; +def timmSExt6 : ImmLeaf(Imm);}], NOOP_SDNodeXForm, timm>; def immSExt10 : ImmLeaf(Imm);}]>; // Mips-specific dsp nodes @@ -306,7 +313,7 @@ class PRECR_SRA_PH_W_DESC_BASE Pattern = [(set ROT:$rt, (OpNode ROS:$src, ROS:$rs, immZExt5:$sa))]; + list Pattern = [(set ROT:$rt, (OpNode ROS:$src, ROS:$rs, timmZExt5:$sa))]; InstrItinClass Itinerary = itin; string Constraints = "$src = $rt"; string BaseOpcode = instr_asm; @@ -443,7 +450,7 @@ class RDDSP_DESC_BASE Pattern = [(set GPR32Opnd:$rd, (OpNode immZExt10:$mask))]; + list Pattern = [(set GPR32Opnd:$rd, (OpNode timmZExt10:$mask))]; InstrItinClass Itinerary = itin; string BaseOpcode = instr_asm; bit isMoveReg = 1; @@ -454,7 +461,7 @@ class WRDSP_DESC_BASE Pattern = [(OpNode GPR32Opnd:$rs, immZExt10:$mask)]; + list Pattern = [(OpNode GPR32Opnd:$rs, timmZExt10:$mask)]; InstrItinClass Itinerary = itin; string BaseOpcode = instr_asm; bit isMoveReg = 1; @@ -1096,14 +1103,14 @@ class SHRLV_PH_DESC : SHLL_QB_R3_DESC_BASE<"shrlv.ph", int_mips_shrl_ph, NoItinerary, DSPROpnd>; // Misc -class APPEND_DESC : APPEND_DESC_BASE<"append", int_mips_append, uimm5, immZExt5, +class APPEND_DESC : APPEND_DESC_BASE<"append", int_mips_append, uimm5, timmZExt5, NoItinerary>; -class BALIGN_DESC : APPEND_DESC_BASE<"balign", int_mips_balign, uimm2, immZExt2, +class BALIGN_DESC : APPEND_DESC_BASE<"balign", int_mips_balign, uimm2, timmZExt2, NoItinerary>; class PREPEND_DESC : APPEND_DESC_BASE<"prepend", int_mips_prepend, uimm5, - immZExt5, NoItinerary>; + timmZExt5, NoItinerary>; // Pseudos. 
def BPOSGE32_PSEUDO : BPOSGE32_PSEUDO_DESC_BASEgetOperand(0).getReg(); - unsigned Ptr = I->getOperand(1).getReg(); - unsigned Mask = I->getOperand(2).getReg(); - unsigned ShiftCmpVal = I->getOperand(3).getReg(); - unsigned Mask2 = I->getOperand(4).getReg(); - unsigned ShiftNewVal = I->getOperand(5).getReg(); - unsigned ShiftAmnt = I->getOperand(6).getReg(); - unsigned Scratch = I->getOperand(7).getReg(); - unsigned Scratch2 = I->getOperand(8).getReg(); + Register Dest = I->getOperand(0).getReg(); + Register Ptr = I->getOperand(1).getReg(); + Register Mask = I->getOperand(2).getReg(); + Register ShiftCmpVal = I->getOperand(3).getReg(); + Register Mask2 = I->getOperand(4).getReg(); + Register ShiftNewVal = I->getOperand(5).getReg(); + Register ShiftAmnt = I->getOperand(6).getReg(); + Register Scratch = I->getOperand(7).getReg(); + Register Scratch2 = I->getOperand(8).getReg(); // insert new blocks after the current block const BasicBlock *LLVM_BB = BB.getBasicBlock(); @@ -240,11 +240,11 @@ bool MipsExpandPseudo::expandAtomicCmpSwap(MachineBasicBlock &BB, MOVE = Mips::OR64; } - unsigned Dest = I->getOperand(0).getReg(); - unsigned Ptr = I->getOperand(1).getReg(); - unsigned OldVal = I->getOperand(2).getReg(); - unsigned NewVal = I->getOperand(3).getReg(); - unsigned Scratch = I->getOperand(4).getReg(); + Register Dest = I->getOperand(0).getReg(); + Register Ptr = I->getOperand(1).getReg(); + Register OldVal = I->getOperand(2).getReg(); + Register NewVal = I->getOperand(3).getReg(); + Register Scratch = I->getOperand(4).getReg(); // insert new blocks after the current block const BasicBlock *LLVM_BB = BB.getBasicBlock(); @@ -374,15 +374,15 @@ bool MipsExpandPseudo::expandAtomicBinOpSubword( llvm_unreachable("Unknown subword atomic pseudo for expansion!"); } - unsigned Dest = I->getOperand(0).getReg(); - unsigned Ptr = I->getOperand(1).getReg(); - unsigned Incr = I->getOperand(2).getReg(); - unsigned Mask = I->getOperand(3).getReg(); - unsigned Mask2 = I->getOperand(4).getReg(); - unsigned ShiftAmnt = I->getOperand(5).getReg(); - unsigned OldVal = I->getOperand(6).getReg(); - unsigned BinOpRes = I->getOperand(7).getReg(); - unsigned StoreVal = I->getOperand(8).getReg(); + Register Dest = I->getOperand(0).getReg(); + Register Ptr = I->getOperand(1).getReg(); + Register Incr = I->getOperand(2).getReg(); + Register Mask = I->getOperand(3).getReg(); + Register Mask2 = I->getOperand(4).getReg(); + Register ShiftAmnt = I->getOperand(5).getReg(); + Register OldVal = I->getOperand(6).getReg(); + Register BinOpRes = I->getOperand(7).getReg(); + Register StoreVal = I->getOperand(8).getReg(); const BasicBlock *LLVM_BB = BB.getBasicBlock(); MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB); @@ -513,10 +513,10 @@ bool MipsExpandPseudo::expandAtomicBinOp(MachineBasicBlock &BB, BEQ = Mips::BEQ64; } - unsigned OldVal = I->getOperand(0).getReg(); - unsigned Ptr = I->getOperand(1).getReg(); - unsigned Incr = I->getOperand(2).getReg(); - unsigned Scratch = I->getOperand(3).getReg(); + Register OldVal = I->getOperand(0).getReg(); + Register Ptr = I->getOperand(1).getReg(); + Register Incr = I->getOperand(2).getReg(); + Register Scratch = I->getOperand(3).getReg(); unsigned Opcode = 0; unsigned OR = 0; diff --git a/lib/Target/Mips/MipsFastISel.cpp b/lib/Target/Mips/MipsFastISel.cpp index 123d3cc242f0..80f288ac500c 100644 --- a/lib/Target/Mips/MipsFastISel.cpp +++ b/lib/Target/Mips/MipsFastISel.cpp @@ -1162,14 +1162,20 @@ bool MipsFastISel::processCallArgs(CallLoweringInfo &CLI, if (ArgVT == 
MVT::f32) { VA.convertToReg(Mips::F12); } else if (ArgVT == MVT::f64) { - VA.convertToReg(Mips::D6); + if (Subtarget->isFP64bit()) + VA.convertToReg(Mips::D6_64); + else + VA.convertToReg(Mips::D6); } } else if (i == 1) { if ((firstMVT == MVT::f32) || (firstMVT == MVT::f64)) { if (ArgVT == MVT::f32) { VA.convertToReg(Mips::F14); } else if (ArgVT == MVT::f64) { - VA.convertToReg(Mips::D7); + if (Subtarget->isFP64bit()) + VA.convertToReg(Mips::D7_64); + else + VA.convertToReg(Mips::D7); } } } @@ -1722,7 +1728,7 @@ bool MipsFastISel::selectRet(const Instruction *I) { return false; unsigned SrcReg = Reg + VA.getValNo(); - unsigned DestReg = VA.getLocReg(); + Register DestReg = VA.getLocReg(); // Avoid a cross-class copy. This is very unlikely. if (!MRI.getRegClass(SrcReg)->contains(DestReg)) return false; diff --git a/lib/Target/Mips/MipsFrameLowering.h b/lib/Target/Mips/MipsFrameLowering.h index 0537cfd1cb30..612b2b712fa8 100644 --- a/lib/Target/Mips/MipsFrameLowering.h +++ b/lib/Target/Mips/MipsFrameLowering.h @@ -24,8 +24,9 @@ protected: const MipsSubtarget &STI; public: - explicit MipsFrameLowering(const MipsSubtarget &sti, unsigned Alignment) - : TargetFrameLowering(StackGrowsDown, Alignment, 0, Alignment), STI(sti) {} + explicit MipsFrameLowering(const MipsSubtarget &sti, Align Alignment) + : TargetFrameLowering(StackGrowsDown, Alignment, 0, Alignment), STI(sti) { + } static const MipsFrameLowering *create(const MipsSubtarget &ST); diff --git a/lib/Target/Mips/MipsISelDAGToDAG.cpp b/lib/Target/Mips/MipsISelDAGToDAG.cpp index 9ba54d6bb73c..e5997af3bcc5 100644 --- a/lib/Target/Mips/MipsISelDAGToDAG.cpp +++ b/lib/Target/Mips/MipsISelDAGToDAG.cpp @@ -65,7 +65,7 @@ bool MipsDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) { /// getGlobalBaseReg - Output the instructions required to put the /// GOT address into a register. SDNode *MipsDAGToDAGISel::getGlobalBaseReg() { - unsigned GlobalBaseReg = MF->getInfo()->getGlobalBaseReg(); + Register GlobalBaseReg = MF->getInfo()->getGlobalBaseReg(); return CurDAG->getRegister(GlobalBaseReg, getTargetLowering()->getPointerTy( CurDAG->getDataLayout())) .getNode(); @@ -217,6 +217,51 @@ bool MipsDAGToDAGISel::selectVSplatMaskR(SDValue N, SDValue &Imm) const { return false; } +/// Convert vector addition with vector subtraction if that allows to encode +/// constant as an immediate and thus avoid extra 'ldi' instruction. +/// add X, <-1, -1...> --> sub X, <1, 1...> +bool MipsDAGToDAGISel::selectVecAddAsVecSubIfProfitable(SDNode *Node) { + assert(Node->getOpcode() == ISD::ADD && "Should only get 'add' here."); + + EVT VT = Node->getValueType(0); + assert(VT.isVector() && "Should only be called for vectors."); + + SDValue X = Node->getOperand(0); + SDValue C = Node->getOperand(1); + + auto *BVN = dyn_cast(C); + if (!BVN) + return false; + + APInt SplatValue, SplatUndef; + unsigned SplatBitSize; + bool HasAnyUndefs; + + if (!BVN->isConstantSplat(SplatValue, SplatUndef, SplatBitSize, HasAnyUndefs, + 8, !Subtarget->isLittle())) + return false; + + auto IsInlineConstant = [](const APInt &Imm) { return Imm.isIntN(5); }; + + if (IsInlineConstant(SplatValue)) + return false; // Can already be encoded as an immediate. + + APInt NegSplatValue = 0 - SplatValue; + if (!IsInlineConstant(NegSplatValue)) + return false; // Even if we negate it it won't help. 
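To see what the pair of IsInlineConstant checks just above buys: for the <-1, -1, ...> splat from the comment, the splat value itself does not fit the 5-bit unsigned immediate field of MSA's addvi.df, but its negation does, so an ldi + addv.df sequence can be folded into a single subvi.df. A standalone sketch of that test using APInt (the u5 width comes from the immediate-form MSA instructions; the rest is illustrative):

#include "llvm/ADT/APInt.h"
#include <cstdio>
using llvm::APInt;

// Same check the patch uses for "can this be an inline immediate?".
static bool fitsU5(const APInt &Imm) { return Imm.isIntN(5); }

int main() {
  APInt Splat(32, -1, /*isSigned=*/true); // element of the <-1, -1, ...> splat
  APInt Neg = APInt(32, 0) - Splat;       // 0 - SplatValue, as in the patch
  std::printf("splat fits u5: %d, negated splat fits u5: %d\n",
              fitsU5(Splat), fitsU5(Neg)); // prints 0 and 1
}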
+ + SDLoc DL(Node); + + SDValue NegC = CurDAG->FoldConstantArithmetic( + ISD::SUB, DL, VT, CurDAG->getConstant(0, DL, VT).getNode(), C.getNode()); + assert(NegC && "Constant-folding failed!"); + SDValue NewNode = CurDAG->getNode(ISD::SUB, DL, VT, X, NegC); + + ReplaceNode(Node, NewNode.getNode()); + SelectCode(NewNode.getNode()); + return true; +} + /// Select instructions not customized! Used for /// expanded, promoted and normal instructions void MipsDAGToDAGISel::Select(SDNode *Node) { @@ -236,6 +281,12 @@ void MipsDAGToDAGISel::Select(SDNode *Node) { switch(Opcode) { default: break; + case ISD::ADD: + if (Node->getSimpleValueType(0).isVector() && + selectVecAddAsVecSubIfProfitable(Node)) + return; + break; + // Get target GOT address. case ISD::GLOBAL_OFFSET_TABLE: ReplaceNode(Node, getGlobalBaseReg()); diff --git a/lib/Target/Mips/MipsISelDAGToDAG.h b/lib/Target/Mips/MipsISelDAGToDAG.h index bae3bbf71f3b..a768589b374b 100644 --- a/lib/Target/Mips/MipsISelDAGToDAG.h +++ b/lib/Target/Mips/MipsISelDAGToDAG.h @@ -125,6 +125,11 @@ private: /// starting at bit zero. virtual bool selectVSplatMaskR(SDValue N, SDValue &Imm) const; + /// Convert vector addition with vector subtraction if that allows to encode + /// constant as an immediate and thus avoid extra 'ldi' instruction. + /// add X, <-1, -1...> --> sub X, <1, 1...> + bool selectVecAddAsVecSubIfProfitable(SDNode *Node); + void Select(SDNode *N) override; virtual bool trySelect(SDNode *Node) = 0; diff --git a/lib/Target/Mips/MipsISelLowering.cpp b/lib/Target/Mips/MipsISelLowering.cpp index 0ff09007da4b..bf1b4756b24f 100644 --- a/lib/Target/Mips/MipsISelLowering.cpp +++ b/lib/Target/Mips/MipsISelLowering.cpp @@ -82,10 +82,6 @@ using namespace llvm; STATISTIC(NumTailCalls, "Number of tail calls"); -static cl::opt -LargeGOT("mxgot", cl::Hidden, - cl::desc("MIPS: Enable GOT larger than 64k."), cl::init(false)); - static cl::opt NoZeroDivCheck("mno-check-zero-division", cl::Hidden, cl::desc("MIPS: Don't trap on integer division by zero."), @@ -330,7 +326,7 @@ MipsTargetLowering::MipsTargetLowering(const MipsTargetMachine &TM, } // Set LoadExtAction for f16 vectors to Expand - for (MVT VT : MVT::fp_vector_valuetypes()) { + for (MVT VT : MVT::fp_fixedlen_vector_valuetypes()) { MVT F16VT = MVT::getVectorVT(MVT::f16, VT.getVectorNumElements()); if (F16VT.isValid()) setLoadExtAction(ISD::EXTLOAD, VT, F16VT, Expand); @@ -518,11 +514,12 @@ MipsTargetLowering::MipsTargetLowering(const MipsTargetMachine &TM, setLibcallName(RTLIB::SRA_I128, nullptr); } - setMinFunctionAlignment(Subtarget.isGP64bit() ? 3 : 2); + setMinFunctionAlignment(Subtarget.isGP64bit() ? Align(8) : Align(4)); // The arguments on the stack are defined in terms of 4-byte slots on O32 // and 8-byte slots on N32/N64. - setMinStackArgumentAlignment((ABI.IsN32() || ABI.IsN64()) ? 8 : 4); + setMinStackArgumentAlignment((ABI.IsN32() || ABI.IsN64()) ? Align(8) + : Align(4)); setStackPointerRegisterToSaveRestore(ABI.IsN64() ? Mips::SP_64 : Mips::SP); @@ -552,8 +549,9 @@ MipsTargetLowering::createFastISel(FunctionLoweringInfo &funcInfo, !Subtarget.inMicroMipsMode(); // Disable if either of the following is true: - // We do not generate PIC, the ABI is not O32, LargeGOT is being used. - if (!TM.isPositionIndependent() || !TM.getABI().IsO32() || LargeGOT) + // We do not generate PIC, the ABI is not O32, XGOT is being used. + if (!TM.isPositionIndependent() || !TM.getABI().IsO32() || + Subtarget.useXGOT()) UseFastISel = false; return UseFastISel ? 
Mips::createFastISel(funcInfo, libInfo) : nullptr; @@ -1257,7 +1255,7 @@ LowerOperation(SDValue Op, SelectionDAG &DAG) const static unsigned addLiveIn(MachineFunction &MF, unsigned PReg, const TargetRegisterClass *RC) { - unsigned VReg = MF.getRegInfo().createVirtualRegister(RC); + Register VReg = MF.getRegInfo().createVirtualRegister(RC); MF.getRegInfo().addLiveIn(PReg, VReg); return VReg; } @@ -1477,10 +1475,10 @@ MipsTargetLowering::emitAtomicBinary(MachineInstr &MI, llvm_unreachable("Unknown pseudo atomic for replacement!"); } - unsigned OldVal = MI.getOperand(0).getReg(); - unsigned Ptr = MI.getOperand(1).getReg(); - unsigned Incr = MI.getOperand(2).getReg(); - unsigned Scratch = RegInfo.createVirtualRegister(RegInfo.getRegClass(OldVal)); + Register OldVal = MI.getOperand(0).getReg(); + Register Ptr = MI.getOperand(1).getReg(); + Register Incr = MI.getOperand(2).getReg(); + Register Scratch = RegInfo.createVirtualRegister(RegInfo.getRegClass(OldVal)); MachineBasicBlock::iterator II(MI); @@ -1519,8 +1517,8 @@ MipsTargetLowering::emitAtomicBinary(MachineInstr &MI, // containing the word. // - unsigned PtrCopy = RegInfo.createVirtualRegister(RegInfo.getRegClass(Ptr)); - unsigned IncrCopy = RegInfo.createVirtualRegister(RegInfo.getRegClass(Incr)); + Register PtrCopy = RegInfo.createVirtualRegister(RegInfo.getRegClass(Ptr)); + Register IncrCopy = RegInfo.createVirtualRegister(RegInfo.getRegClass(Incr)); BuildMI(*BB, II, DL, TII->get(Mips::COPY), IncrCopy).addReg(Incr); BuildMI(*BB, II, DL, TII->get(Mips::COPY), PtrCopy).addReg(Ptr); @@ -1556,7 +1554,7 @@ MachineBasicBlock *MipsTargetLowering::emitSignExtendToI32InReg( MachineFunction *MF = BB->getParent(); MachineRegisterInfo &RegInfo = MF->getRegInfo(); const TargetRegisterClass *RC = getRegClassFor(MVT::i32); - unsigned ScrReg = RegInfo.createVirtualRegister(RC); + Register ScrReg = RegInfo.createVirtualRegister(RC); assert(Size < 32); int64_t ShiftImm = 32 - (Size * 8); @@ -1581,21 +1579,21 @@ MachineBasicBlock *MipsTargetLowering::emitAtomicBinaryPartword( const TargetInstrInfo *TII = Subtarget.getInstrInfo(); DebugLoc DL = MI.getDebugLoc(); - unsigned Dest = MI.getOperand(0).getReg(); - unsigned Ptr = MI.getOperand(1).getReg(); - unsigned Incr = MI.getOperand(2).getReg(); - - unsigned AlignedAddr = RegInfo.createVirtualRegister(RCp); - unsigned ShiftAmt = RegInfo.createVirtualRegister(RC); - unsigned Mask = RegInfo.createVirtualRegister(RC); - unsigned Mask2 = RegInfo.createVirtualRegister(RC); - unsigned Incr2 = RegInfo.createVirtualRegister(RC); - unsigned MaskLSB2 = RegInfo.createVirtualRegister(RCp); - unsigned PtrLSB2 = RegInfo.createVirtualRegister(RC); - unsigned MaskUpper = RegInfo.createVirtualRegister(RC); - unsigned Scratch = RegInfo.createVirtualRegister(RC); - unsigned Scratch2 = RegInfo.createVirtualRegister(RC); - unsigned Scratch3 = RegInfo.createVirtualRegister(RC); + Register Dest = MI.getOperand(0).getReg(); + Register Ptr = MI.getOperand(1).getReg(); + Register Incr = MI.getOperand(2).getReg(); + + Register AlignedAddr = RegInfo.createVirtualRegister(RCp); + Register ShiftAmt = RegInfo.createVirtualRegister(RC); + Register Mask = RegInfo.createVirtualRegister(RC); + Register Mask2 = RegInfo.createVirtualRegister(RC); + Register Incr2 = RegInfo.createVirtualRegister(RC); + Register MaskLSB2 = RegInfo.createVirtualRegister(RCp); + Register PtrLSB2 = RegInfo.createVirtualRegister(RC); + Register MaskUpper = RegInfo.createVirtualRegister(RC); + Register Scratch = RegInfo.createVirtualRegister(RC); + Register Scratch2 = 
RegInfo.createVirtualRegister(RC); + Register Scratch3 = RegInfo.createVirtualRegister(RC); unsigned AtomicOp = 0; switch (MI.getOpcode()) { @@ -1678,7 +1676,7 @@ MachineBasicBlock *MipsTargetLowering::emitAtomicBinaryPartword( if (Subtarget.isLittle()) { BuildMI(BB, DL, TII->get(Mips::SLL), ShiftAmt).addReg(PtrLSB2).addImm(3); } else { - unsigned Off = RegInfo.createVirtualRegister(RC); + Register Off = RegInfo.createVirtualRegister(RC); BuildMI(BB, DL, TII->get(Mips::XORi), Off) .addReg(PtrLSB2).addImm((Size == 1) ? 3 : 2); BuildMI(BB, DL, TII->get(Mips::SLL), ShiftAmt).addReg(Off).addImm(3); @@ -1738,12 +1736,12 @@ MipsTargetLowering::emitAtomicCmpSwap(MachineInstr &MI, unsigned AtomicOp = MI.getOpcode() == Mips::ATOMIC_CMP_SWAP_I32 ? Mips::ATOMIC_CMP_SWAP_I32_POSTRA : Mips::ATOMIC_CMP_SWAP_I64_POSTRA; - unsigned Dest = MI.getOperand(0).getReg(); - unsigned Ptr = MI.getOperand(1).getReg(); - unsigned OldVal = MI.getOperand(2).getReg(); - unsigned NewVal = MI.getOperand(3).getReg(); + Register Dest = MI.getOperand(0).getReg(); + Register Ptr = MI.getOperand(1).getReg(); + Register OldVal = MI.getOperand(2).getReg(); + Register NewVal = MI.getOperand(3).getReg(); - unsigned Scratch = MRI.createVirtualRegister(RC); + Register Scratch = MRI.createVirtualRegister(RC); MachineBasicBlock::iterator II(MI); // We need to create copies of the various registers and kill them at the @@ -1751,9 +1749,9 @@ MipsTargetLowering::emitAtomicCmpSwap(MachineInstr &MI, // after fast register allocation, the spills will end up outside of the // blocks that their values are defined in, causing livein errors. - unsigned PtrCopy = MRI.createVirtualRegister(MRI.getRegClass(Ptr)); - unsigned OldValCopy = MRI.createVirtualRegister(MRI.getRegClass(OldVal)); - unsigned NewValCopy = MRI.createVirtualRegister(MRI.getRegClass(NewVal)); + Register PtrCopy = MRI.createVirtualRegister(MRI.getRegClass(Ptr)); + Register OldValCopy = MRI.createVirtualRegister(MRI.getRegClass(OldVal)); + Register NewValCopy = MRI.createVirtualRegister(MRI.getRegClass(NewVal)); BuildMI(*BB, II, DL, TII->get(Mips::COPY), PtrCopy).addReg(Ptr); BuildMI(*BB, II, DL, TII->get(Mips::COPY), OldValCopy).addReg(OldVal); @@ -1790,22 +1788,22 @@ MachineBasicBlock *MipsTargetLowering::emitAtomicCmpSwapPartword( const TargetInstrInfo *TII = Subtarget.getInstrInfo(); DebugLoc DL = MI.getDebugLoc(); - unsigned Dest = MI.getOperand(0).getReg(); - unsigned Ptr = MI.getOperand(1).getReg(); - unsigned CmpVal = MI.getOperand(2).getReg(); - unsigned NewVal = MI.getOperand(3).getReg(); - - unsigned AlignedAddr = RegInfo.createVirtualRegister(RCp); - unsigned ShiftAmt = RegInfo.createVirtualRegister(RC); - unsigned Mask = RegInfo.createVirtualRegister(RC); - unsigned Mask2 = RegInfo.createVirtualRegister(RC); - unsigned ShiftedCmpVal = RegInfo.createVirtualRegister(RC); - unsigned ShiftedNewVal = RegInfo.createVirtualRegister(RC); - unsigned MaskLSB2 = RegInfo.createVirtualRegister(RCp); - unsigned PtrLSB2 = RegInfo.createVirtualRegister(RC); - unsigned MaskUpper = RegInfo.createVirtualRegister(RC); - unsigned MaskedCmpVal = RegInfo.createVirtualRegister(RC); - unsigned MaskedNewVal = RegInfo.createVirtualRegister(RC); + Register Dest = MI.getOperand(0).getReg(); + Register Ptr = MI.getOperand(1).getReg(); + Register CmpVal = MI.getOperand(2).getReg(); + Register NewVal = MI.getOperand(3).getReg(); + + Register AlignedAddr = RegInfo.createVirtualRegister(RCp); + Register ShiftAmt = RegInfo.createVirtualRegister(RC); + Register Mask = 
RegInfo.createVirtualRegister(RC); + Register Mask2 = RegInfo.createVirtualRegister(RC); + Register ShiftedCmpVal = RegInfo.createVirtualRegister(RC); + Register ShiftedNewVal = RegInfo.createVirtualRegister(RC); + Register MaskLSB2 = RegInfo.createVirtualRegister(RCp); + Register PtrLSB2 = RegInfo.createVirtualRegister(RC); + Register MaskUpper = RegInfo.createVirtualRegister(RC); + Register MaskedCmpVal = RegInfo.createVirtualRegister(RC); + Register MaskedNewVal = RegInfo.createVirtualRegister(RC); unsigned AtomicOp = MI.getOpcode() == Mips::ATOMIC_CMP_SWAP_I8 ? Mips::ATOMIC_CMP_SWAP_I8_POSTRA : Mips::ATOMIC_CMP_SWAP_I16_POSTRA; @@ -1820,8 +1818,8 @@ MachineBasicBlock *MipsTargetLowering::emitAtomicCmpSwapPartword( // value isn't a problem. // The Dead flag is needed as the value in scratch isn't used by any other // instruction. Kill isn't used as Dead is more precise. - unsigned Scratch = RegInfo.createVirtualRegister(RC); - unsigned Scratch2 = RegInfo.createVirtualRegister(RC); + Register Scratch = RegInfo.createVirtualRegister(RC); + Register Scratch2 = RegInfo.createVirtualRegister(RC); // insert new blocks after the current block const BasicBlock *LLVM_BB = BB->getBasicBlock(); @@ -1859,7 +1857,7 @@ MachineBasicBlock *MipsTargetLowering::emitAtomicCmpSwapPartword( if (Subtarget.isLittle()) { BuildMI(BB, DL, TII->get(Mips::SLL), ShiftAmt).addReg(PtrLSB2).addImm(3); } else { - unsigned Off = RegInfo.createVirtualRegister(RC); + Register Off = RegInfo.createVirtualRegister(RC); BuildMI(BB, DL, TII->get(Mips::XORi), Off) .addReg(PtrLSB2).addImm((Size == 1) ? 3 : 2); BuildMI(BB, DL, TII->get(Mips::SLL), ShiftAmt).addReg(Off).addImm(3); @@ -1967,10 +1965,10 @@ SDValue MipsTargetLowering::lowerGlobalAddress(SDValue Op, // %gp_rel relocation return getAddrGPRel(N, SDLoc(N), Ty, DAG, ABI.IsN64()); - // %hi/%lo relocation + // %hi/%lo relocation return Subtarget.hasSym32() ? getAddrNonPIC(N, SDLoc(N), Ty, DAG) - // %highest/%higher/%hi/%lo relocation - : getAddrNonPICSym64(N, SDLoc(N), Ty, DAG); + // %highest/%higher/%hi/%lo relocation + : getAddrNonPICSym64(N, SDLoc(N), Ty, DAG); } // Every other architecture would use shouldAssumeDSOLocal in here, but @@ -1987,7 +1985,7 @@ SDValue MipsTargetLowering::lowerGlobalAddress(SDValue Op, if (GV->hasLocalLinkage()) return getAddrLocal(N, SDLoc(N), Ty, DAG, ABI.IsN32() || ABI.IsN64()); - if (LargeGOT) + if (Subtarget.useXGOT()) return getAddrGlobalLargeGOT( N, SDLoc(N), Ty, DAG, MipsII::MO_GOT_HI16, MipsII::MO_GOT_LO16, DAG.getEntryNode(), @@ -2149,7 +2147,8 @@ SDValue MipsTargetLowering::lowerVAARG(SDValue Op, SelectionDAG &DAG) const { EVT VT = Node->getValueType(0); SDValue Chain = Node->getOperand(0); SDValue VAListPtr = Node->getOperand(1); - unsigned Align = Node->getConstantOperandVal(3); + const Align Align = + llvm::MaybeAlign(Node->getConstantOperandVal(3)).valueOrOne(); const Value *SV = cast(Node->getOperand(2))->getValue(); SDLoc DL(Node); unsigned ArgSlotSizeInBytes = (ABI.IsN32() || ABI.IsN64()) ? 8 : 4; @@ -2166,14 +2165,13 @@ SDValue MipsTargetLowering::lowerVAARG(SDValue Op, SelectionDAG &DAG) const { // when the pointer is still aligned from the last va_arg (or pair of // va_args for the i64 on O32 case). 
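The hunk that follows performs that adjustment with an ISD::ADD of Align - 1 followed by an ISD::AND with -Align, i.e. the usual power-of-two round-up of the va_list pointer. The same arithmetic on plain integers (illustrative, not the SelectionDAG code):

#include <cstdint>
#include <cstdio>

// Round p up to a multiple of a; a must be a power of two.
static uint64_t alignUp(uint64_t p, uint64_t a) {
  return (p + a - 1) & ~(a - 1); // ~(a - 1) == -a when a is a power of two
}

int main() {
  // e.g. an i64 needing 8-byte alignment, reached from offset 12, starts at 16
  std::printf("%llu\n", (unsigned long long)alignUp(12, 8));
}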
if (Align > getMinStackArgumentAlignment()) { - assert(((Align & (Align-1)) == 0) && "Expected Align to be a power of 2"); + VAList = DAG.getNode( + ISD::ADD, DL, VAList.getValueType(), VAList, + DAG.getConstant(Align.value() - 1, DL, VAList.getValueType())); - VAList = DAG.getNode(ISD::ADD, DL, VAList.getValueType(), VAList, - DAG.getConstant(Align - 1, DL, VAList.getValueType())); - - VAList = DAG.getNode(ISD::AND, DL, VAList.getValueType(), VAList, - DAG.getConstant(-(int64_t)Align, DL, - VAList.getValueType())); + VAList = DAG.getNode( + ISD::AND, DL, VAList.getValueType(), VAList, + DAG.getConstant(-(int64_t)Align.value(), DL, VAList.getValueType())); } // Increment the pointer, VAList, to the next vaarg. @@ -2870,7 +2868,7 @@ static bool CC_MipsO32(unsigned ValNo, MVT ValVT, MVT LocVT, #include "MipsGenCallingConv.inc" CCAssignFn *MipsTargetLowering::CCAssignFnForCall() const{ - return CC_Mips; + return CC_Mips_FixedArg; } CCAssignFn *MipsTargetLowering::CCAssignFnForReturn() const{ @@ -3167,7 +3165,7 @@ MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, Arg, DAG.getConstant(1, DL, MVT::i32)); if (!Subtarget.isLittle()) std::swap(Lo, Hi); - unsigned LocRegLo = VA.getLocReg(); + Register LocRegLo = VA.getLocReg(); unsigned LocRegHigh = getNextIntArgReg(LocRegLo); RegsToPass.push_back(std::make_pair(LocRegLo, Lo)); RegsToPass.push_back(std::make_pair(LocRegHigh, Hi)); @@ -3270,7 +3268,7 @@ MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, if (InternalLinkage) Callee = getAddrLocal(G, DL, Ty, DAG, ABI.IsN32() || ABI.IsN64()); - else if (LargeGOT) { + else if (Subtarget.useXGOT()) { Callee = getAddrGlobalLargeGOT(G, DL, Ty, DAG, MipsII::MO_CALL_HI16, MipsII::MO_CALL_LO16, Chain, FuncInfo->callPtrInfo(Val)); @@ -3292,7 +3290,7 @@ MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, if (!IsPIC) // static Callee = DAG.getTargetExternalSymbol( Sym, getPointerTy(DAG.getDataLayout()), MipsII::MO_NO_FLAG); - else if (LargeGOT) { + else if (Subtarget.useXGOT()) { Callee = getAddrGlobalLargeGOT(S, DL, Ty, DAG, MipsII::MO_CALL_HI16, MipsII::MO_CALL_LO16, Chain, FuncInfo->callPtrInfo(Sym)); @@ -3523,7 +3521,7 @@ SDValue MipsTargetLowering::LowerFormalArguments( // Arguments stored on registers if (IsRegLoc) { MVT RegVT = VA.getLocVT(); - unsigned ArgReg = VA.getLocReg(); + Register ArgReg = VA.getLocReg(); const TargetRegisterClass *RC = getRegClassFor(RegVT); // Transform the arguments stored on @@ -4568,20 +4566,20 @@ MachineBasicBlock *MipsTargetLowering::emitPseudoD_SELECT(MachineInstr &MI, // FIXME? Maybe this could be a TableGen attribute on some registers and // this table could be generated automatically from RegInfo. -unsigned MipsTargetLowering::getRegisterByName(const char* RegName, EVT VT, - SelectionDAG &DAG) const { +Register MipsTargetLowering::getRegisterByName(const char* RegName, EVT VT, + const MachineFunction &MF) const { // Named registers is expected to be fairly rare. For now, just support $28 // since the linux kernel uses it. 
if (Subtarget.isGP64bit()) { - unsigned Reg = StringSwitch(RegName) + Register Reg = StringSwitch(RegName) .Case("$28", Mips::GP_64) - .Default(0); + .Default(Register()); if (Reg) return Reg; } else { - unsigned Reg = StringSwitch(RegName) + Register Reg = StringSwitch(RegName) .Case("$28", Mips::GP) - .Default(0); + .Default(Register()); if (Reg) return Reg; } diff --git a/lib/Target/Mips/MipsISelLowering.h b/lib/Target/Mips/MipsISelLowering.h index 2db60e9801f1..0a5cddd45afb 100644 --- a/lib/Target/Mips/MipsISelLowering.h +++ b/lib/Target/Mips/MipsISelLowering.h @@ -304,11 +304,12 @@ class TargetRegisterClass; unsigned &NumIntermediates, MVT &RegisterVT) const override; /// Return the correct alignment for the current calling convention. - unsigned getABIAlignmentForCallingConv(Type *ArgTy, - DataLayout DL) const override { + Align getABIAlignmentForCallingConv(Type *ArgTy, + DataLayout DL) const override { + const Align ABIAlign(DL.getABITypeAlignment(ArgTy)); if (ArgTy->isVectorTy()) - return std::min(DL.getABITypeAlignment(ArgTy), 8U); - return DL.getABITypeAlignment(ArgTy); + return std::min(ABIAlign, Align(8)); + return ABIAlign; } ISD::NodeType getExtendForAtomicOps() const override { @@ -347,8 +348,8 @@ class TargetRegisterClass; void HandleByVal(CCState *, unsigned &, unsigned) const override; - unsigned getRegisterByName(const char* RegName, EVT VT, - SelectionDAG &DAG) const override; + Register getRegisterByName(const char* RegName, EVT VT, + const MachineFunction &MF) const override; /// If a physical register, this returns the register that receives the /// exception address on entry to an EH pad. diff --git a/lib/Target/Mips/MipsInstrInfo.cpp b/lib/Target/Mips/MipsInstrInfo.cpp index fbd56206b249..6bb25ee5754d 100644 --- a/lib/Target/Mips/MipsInstrInfo.cpp +++ b/lib/Target/Mips/MipsInstrInfo.cpp @@ -677,7 +677,8 @@ MipsInstrInfo::genInstrWithNewOpc(unsigned NewOpc, return MIB; } -bool MipsInstrInfo::findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx1, +bool MipsInstrInfo::findCommutedOpIndices(const MachineInstr &MI, + unsigned &SrcOpIdx1, unsigned &SrcOpIdx2) const { assert(!MI.isBundle() && "TargetInstrInfo::findCommutedOpIndices() can't handle bundles"); diff --git a/lib/Target/Mips/MipsInstrInfo.h b/lib/Target/Mips/MipsInstrInfo.h index a626c0c3fdb8..092a960b4ba7 100644 --- a/lib/Target/Mips/MipsInstrInfo.h +++ b/lib/Target/Mips/MipsInstrInfo.h @@ -148,7 +148,7 @@ public: MachineInstrBuilder genInstrWithNewOpc(unsigned NewOpc, MachineBasicBlock::iterator I) const; - bool findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx1, + bool findCommutedOpIndices(const MachineInstr &MI, unsigned &SrcOpIdx1, unsigned &SrcOpIdx2) const override; /// Perform target specific instruction verification. 
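The getABIAlignmentForCallingConv() change above keeps the existing policy: vector arguments are capped at 8 bytes for argument passing while other types keep their ABI alignment; it now just expresses that with Align. Reduced to the Align arithmetic involved (a sketch, assuming llvm/Support/Alignment.h):

#include "llvm/Support/Alignment.h"
#include <algorithm>
#include <cassert>
using namespace llvm;

int main() {
  // A 16-byte-aligned vector argument (e.g. an MSA v4i32) is capped at 8
  // for the calling convention; scalars such as i32 are left untouched.
  Align VecABIAlign(16);
  assert(std::min(VecABIAlign, Align(8)) == Align(8));
  Align I32ABIAlign(4);
  assert(I32ABIAlign == Align(4)); // no cap applied to non-vector types
}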
diff --git a/lib/Target/Mips/MipsInstrInfo.td b/lib/Target/Mips/MipsInstrInfo.td index a4e85a38ab28..58167e0f344d 100644 --- a/lib/Target/Mips/MipsInstrInfo.td +++ b/lib/Target/Mips/MipsInstrInfo.td @@ -211,9 +211,9 @@ def HasCnMips : Predicate<"Subtarget->hasCnMips()">, AssemblerPredicate<"FeatureCnMips">; def NotCnMips : Predicate<"!Subtarget->hasCnMips()">, AssemblerPredicate<"!FeatureCnMips">; -def IsSym32 : Predicate<"Subtarget->HasSym32()">, +def IsSym32 : Predicate<"Subtarget->hasSym32()">, AssemblerPredicate<"FeatureSym32">; -def IsSym64 : Predicate<"!Subtarget->HasSym32()">, +def IsSym64 : Predicate<"!Subtarget->hasSym32()">, AssemblerPredicate<"!FeatureSym32">; def IsN64 : Predicate<"Subtarget->isABI_N64()">; def IsNotN64 : Predicate<"!Subtarget->isABI_N64()">; @@ -1263,6 +1263,7 @@ def immSExt16 : PatLeaf<(imm), [{ return isInt<16>(N->getSExtValue()); }]>; // Node immediate fits as 7-bit zero extended on target immediate. def immZExt7 : PatLeaf<(imm), [{ return isUInt<7>(N->getZExtValue()); }]>; +def timmZExt7 : PatLeaf<(timm), [{ return isUInt<7>(N->getZExtValue()); }]>; // Node immediate fits as 16-bit zero extended on target immediate. // The LO16 param means that only the lower 16 bits of the node @@ -1295,6 +1296,7 @@ def immZExt32 : PatLeaf<(imm), [{ return isUInt<32>(N->getZExtValue()); }]>; // shamt field must fit in 5 bits. def immZExt5 : ImmLeaf; +def timmZExt5 : TImmLeaf; def immZExt5Plus1 : PatLeaf<(imm), [{ return isUInt<5>(N->getZExtValue() - 1); @@ -3142,25 +3144,31 @@ multiclass MipsHiLoRelocs; def : MipsPat<(MipsHi texternalsym:$in), (Lui texternalsym:$in)>; - def : MipsPat<(MipsLo tglobaladdr:$in), (Addiu ZeroReg, tglobaladdr:$in)>; + def : MipsPat<(MipsLo tglobaladdr:$in), + (Addiu ZeroReg, tglobaladdr:$in)>; def : MipsPat<(MipsLo tblockaddress:$in), (Addiu ZeroReg, tblockaddress:$in)>; - def : MipsPat<(MipsLo tjumptable:$in), (Addiu ZeroReg, tjumptable:$in)>; - def : MipsPat<(MipsLo tconstpool:$in), (Addiu ZeroReg, tconstpool:$in)>; + def : MipsPat<(MipsLo tjumptable:$in), + (Addiu ZeroReg, tjumptable:$in)>; + def : MipsPat<(MipsLo tconstpool:$in), + (Addiu ZeroReg, tconstpool:$in)>; def : MipsPat<(MipsLo tglobaltlsaddr:$in), (Addiu ZeroReg, tglobaltlsaddr:$in)>; - def : MipsPat<(MipsLo texternalsym:$in), (Addiu ZeroReg, texternalsym:$in)>; + def : MipsPat<(MipsLo texternalsym:$in), + (Addiu ZeroReg, texternalsym:$in)>; def : MipsPat<(add GPROpnd:$hi, (MipsLo tglobaladdr:$lo)), - (Addiu GPROpnd:$hi, tglobaladdr:$lo)>; + (Addiu GPROpnd:$hi, tglobaladdr:$lo)>; def : MipsPat<(add GPROpnd:$hi, (MipsLo tblockaddress:$lo)), - (Addiu GPROpnd:$hi, tblockaddress:$lo)>; + (Addiu GPROpnd:$hi, tblockaddress:$lo)>; def : MipsPat<(add GPROpnd:$hi, (MipsLo tjumptable:$lo)), - (Addiu GPROpnd:$hi, tjumptable:$lo)>; + (Addiu GPROpnd:$hi, tjumptable:$lo)>; def : MipsPat<(add GPROpnd:$hi, (MipsLo tconstpool:$lo)), - (Addiu GPROpnd:$hi, tconstpool:$lo)>; + (Addiu GPROpnd:$hi, tconstpool:$lo)>; def : MipsPat<(add GPROpnd:$hi, (MipsLo tglobaltlsaddr:$lo)), - (Addiu GPROpnd:$hi, tglobaltlsaddr:$lo)>; + (Addiu GPROpnd:$hi, tglobaltlsaddr:$lo)>; + def : MipsPat<(add GPROpnd:$hi, (MipsLo texternalsym:$lo)), + (Addiu GPROpnd:$hi, texternalsym:$lo)>; } // wrapper_pic diff --git a/lib/Target/Mips/MipsInstructionSelector.cpp b/lib/Target/Mips/MipsInstructionSelector.cpp index 45a47ad3c087..f8fc7cb0898b 100644 --- a/lib/Target/Mips/MipsInstructionSelector.cpp +++ b/lib/Target/Mips/MipsInstructionSelector.cpp @@ -17,6 +17,7 @@ #include "MipsTargetMachine.h" #include 
"llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h" #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" +#include "llvm/CodeGen/MachineJumpTableInfo.h" #define DEBUG_TYPE "mips-isel" @@ -33,7 +34,7 @@ public: MipsInstructionSelector(const MipsTargetMachine &TM, const MipsSubtarget &STI, const MipsRegisterBankInfo &RBI); - bool select(MachineInstr &I, CodeGenCoverage &CoverageInfo) const override; + bool select(MachineInstr &I) override; static const char *getName() { return DEBUG_TYPE; } private: @@ -44,6 +45,8 @@ private: const TargetRegisterClass * getRegClassForTypeOnBank(unsigned OpSize, const RegisterBank &RB, const RegisterBankInfo &RBI) const; + unsigned selectLoadStoreOpCode(MachineInstr &I, + MachineRegisterInfo &MRI) const; const MipsTargetMachine &TM; const MipsSubtarget &STI; @@ -84,7 +87,7 @@ MipsInstructionSelector::MipsInstructionSelector( bool MipsInstructionSelector::selectCopy(MachineInstr &I, MachineRegisterInfo &MRI) const { Register DstReg = I.getOperand(0).getReg(); - if (TargetRegisterInfo::isPhysicalRegister(DstReg)) + if (Register::isPhysicalRegister(DstReg)) return true; const RegisterBank *RegBank = RBI.getRegBank(DstReg, MRI, TRI); @@ -158,9 +161,15 @@ bool MipsInstructionSelector::materialize32BitImm(Register DestReg, APInt Imm, } /// Returning Opc indicates that we failed to select MIPS instruction opcode. -static unsigned selectLoadStoreOpCode(unsigned Opc, unsigned MemSizeInBytes, - unsigned RegBank, bool isFP64) { - bool isStore = Opc == TargetOpcode::G_STORE; +unsigned +MipsInstructionSelector::selectLoadStoreOpCode(MachineInstr &I, + MachineRegisterInfo &MRI) const { + STI.getRegisterInfo(); + const Register DestReg = I.getOperand(0).getReg(); + const unsigned RegBank = RBI.getRegBank(DestReg, MRI, TRI)->getID(); + const unsigned MemSizeInBytes = (*I.memoperands_begin())->getSize(); + unsigned Opc = I.getOpcode(); + const bool isStore = Opc == TargetOpcode::G_STORE; if (RegBank == Mips::GPRBRegBankID) { if (isStore) switch (MemSizeInBytes) { @@ -192,10 +201,24 @@ static unsigned selectLoadStoreOpCode(unsigned Opc, unsigned MemSizeInBytes, case 4: return isStore ? Mips::SWC1 : Mips::LWC1; case 8: - if (isFP64) + if (STI.isFP64bit()) return isStore ? Mips::SDC164 : Mips::LDC164; else return isStore ? Mips::SDC1 : Mips::LDC1; + case 16: { + assert(STI.hasMSA() && "Vector instructions require target with MSA."); + const unsigned VectorElementSizeInBytes = + MRI.getType(DestReg).getElementType().getSizeInBytes(); + if (VectorElementSizeInBytes == 1) + return isStore ? Mips::ST_B : Mips::LD_B; + if (VectorElementSizeInBytes == 2) + return isStore ? Mips::ST_H : Mips::LD_H; + if (VectorElementSizeInBytes == 4) + return isStore ? Mips::ST_W : Mips::LD_W; + if (VectorElementSizeInBytes == 8) + return isStore ? 
Mips::ST_D : Mips::LD_D; + return Opc; + } default: return Opc; } @@ -203,8 +226,7 @@ static unsigned selectLoadStoreOpCode(unsigned Opc, unsigned MemSizeInBytes, return Opc; } -bool MipsInstructionSelector::select(MachineInstr &I, - CodeGenCoverage &CoverageInfo) const { +bool MipsInstructionSelector::select(MachineInstr &I) { MachineBasicBlock &MBB = *I.getParent(); MachineFunction &MF = *MBB.getParent(); @@ -231,7 +253,7 @@ bool MipsInstructionSelector::select(MachineInstr &I, return true; } - if (selectImpl(I, CoverageInfo)) + if (selectImpl(I, *CoverageInfo)) return true; MachineInstr *MI = nullptr; @@ -265,6 +287,11 @@ bool MipsInstructionSelector::select(MachineInstr &I, .add(I.getOperand(2)); break; } + case G_INTTOPTR: + case G_PTRTOINT: { + I.setDesc(TII.get(COPY)); + return selectCopy(I, MRI); + } case G_FRAME_INDEX: { MI = BuildMI(MBB, I, I.getDebugLoc(), TII.get(Mips::ADDiu)) .add(I.getOperand(0)) @@ -279,12 +306,71 @@ bool MipsInstructionSelector::select(MachineInstr &I, .add(I.getOperand(1)); break; } + case G_BRJT: { + unsigned EntrySize = + MF.getJumpTableInfo()->getEntrySize(MF.getDataLayout()); + assert(isPowerOf2_32(EntrySize) && + "Non-power-of-two jump-table entry size not supported."); + + Register JTIndex = MRI.createVirtualRegister(&Mips::GPR32RegClass); + MachineInstr *SLL = BuildMI(MBB, I, I.getDebugLoc(), TII.get(Mips::SLL)) + .addDef(JTIndex) + .addUse(I.getOperand(2).getReg()) + .addImm(Log2_32(EntrySize)); + if (!constrainSelectedInstRegOperands(*SLL, TII, TRI, RBI)) + return false; + + Register DestAddress = MRI.createVirtualRegister(&Mips::GPR32RegClass); + MachineInstr *ADDu = BuildMI(MBB, I, I.getDebugLoc(), TII.get(Mips::ADDu)) + .addDef(DestAddress) + .addUse(I.getOperand(0).getReg()) + .addUse(JTIndex); + if (!constrainSelectedInstRegOperands(*ADDu, TII, TRI, RBI)) + return false; + + Register Dest = MRI.createVirtualRegister(&Mips::GPR32RegClass); + MachineInstr *LW = + BuildMI(MBB, I, I.getDebugLoc(), TII.get(Mips::LW)) + .addDef(Dest) + .addUse(DestAddress) + .addJumpTableIndex(I.getOperand(1).getIndex(), MipsII::MO_ABS_LO) + .addMemOperand(MF.getMachineMemOperand( + MachinePointerInfo(), MachineMemOperand::MOLoad, 4, 4)); + if (!constrainSelectedInstRegOperands(*LW, TII, TRI, RBI)) + return false; + + if (MF.getTarget().isPositionIndependent()) { + Register DestTmp = MRI.createVirtualRegister(&Mips::GPR32RegClass); + LW->getOperand(0).setReg(DestTmp); + MachineInstr *ADDu = BuildMI(MBB, I, I.getDebugLoc(), TII.get(Mips::ADDu)) + .addDef(Dest) + .addUse(DestTmp) + .addUse(MF.getInfo() + ->getGlobalBaseRegForGlobalISel()); + if (!constrainSelectedInstRegOperands(*ADDu, TII, TRI, RBI)) + return false; + } + + MachineInstr *Branch = + BuildMI(MBB, I, I.getDebugLoc(), TII.get(Mips::PseudoIndirectBranch)) + .addUse(Dest); + if (!constrainSelectedInstRegOperands(*Branch, TII, TRI, RBI)) + return false; + + I.eraseFromParent(); + return true; + } + case G_BRINDIRECT: { + MI = BuildMI(MBB, I, I.getDebugLoc(), TII.get(Mips::PseudoIndirectBranch)) + .add(I.getOperand(0)); + break; + } case G_PHI: { const Register DestReg = I.getOperand(0).getReg(); const unsigned OpSize = MRI.getType(DestReg).getSizeInBits(); const TargetRegisterClass *DefRC = nullptr; - if (TargetRegisterInfo::isPhysicalRegister(DestReg)) + if (Register::isPhysicalRegister(DestReg)) DefRC = TRI.getRegClass(DestReg); else DefRC = getRegClassForTypeOnBank(OpSize, @@ -297,26 +383,35 @@ bool MipsInstructionSelector::select(MachineInstr &I, case G_LOAD: case G_ZEXTLOAD: case G_SEXTLOAD: { - const 
Register DestReg = I.getOperand(0).getReg(); - const unsigned DestRegBank = RBI.getRegBank(DestReg, MRI, TRI)->getID(); - const unsigned OpSize = MRI.getType(DestReg).getSizeInBits(); - const unsigned OpMemSizeInBytes = (*I.memoperands_begin())->getSize(); - - if (DestRegBank == Mips::GPRBRegBankID && OpSize != 32) - return false; - - if (DestRegBank == Mips::FPRBRegBankID && OpSize != 32 && OpSize != 64) - return false; - - const unsigned NewOpc = selectLoadStoreOpCode( - I.getOpcode(), OpMemSizeInBytes, DestRegBank, STI.isFP64bit()); + const unsigned NewOpc = selectLoadStoreOpCode(I, MRI); if (NewOpc == I.getOpcode()) return false; + MachineOperand BaseAddr = I.getOperand(1); + int64_t SignedOffset = 0; + // Try to fold load/store + G_GEP + G_CONSTANT + // %SignedOffset:(s32) = G_CONSTANT i32 16_bit_signed_immediate + // %Addr:(p0) = G_GEP %BaseAddr, %SignedOffset + // %LoadResult/%StoreSrc = load/store %Addr(p0) + // into: + // %LoadResult/%StoreSrc = NewOpc %BaseAddr(p0), 16_bit_signed_immediate + + MachineInstr *Addr = MRI.getVRegDef(I.getOperand(1).getReg()); + if (Addr->getOpcode() == G_GEP) { + MachineInstr *Offset = MRI.getVRegDef(Addr->getOperand(2).getReg()); + if (Offset->getOpcode() == G_CONSTANT) { + APInt OffsetValue = Offset->getOperand(1).getCImm()->getValue(); + if (OffsetValue.isSignedIntN(16)) { + BaseAddr = Addr->getOperand(1); + SignedOffset = OffsetValue.getSExtValue(); + } + } + } + MI = BuildMI(MBB, I, I.getDebugLoc(), TII.get(NewOpc)) .add(I.getOperand(0)) - .add(I.getOperand(1)) - .addImm(0) + .add(BaseAddr) + .addImm(SignedOffset) .addMemOperand(*I.memoperands_begin()); break; } @@ -356,6 +451,18 @@ bool MipsInstructionSelector::select(MachineInstr &I, .add(I.getOperand(3)); break; } + case G_IMPLICIT_DEF: { + MI = BuildMI(MBB, I, I.getDebugLoc(), TII.get(Mips::IMPLICIT_DEF)) + .add(I.getOperand(0)); + + // Set class based on register bank, there can be fpr and gpr implicit def. + MRI.setRegClass(MI->getOperand(0).getReg(), + getRegClassForTypeOnBank( + MRI.getType(I.getOperand(0).getReg()).getSizeInBits(), + *RBI.getRegBank(I.getOperand(0).getReg(), MRI, TRI), + RBI)); + break; + } case G_CONSTANT: { MachineIRBuilder B(I); if (!materialize32BitImm(I.getOperand(0).getReg(), @@ -423,7 +530,7 @@ bool MipsInstructionSelector::select(MachineInstr &I, Opcode = Mips::TRUNC_W_S; else Opcode = STI.isFP64bit() ? 
Mips::TRUNC_W_D64 : Mips::TRUNC_W_D32; - unsigned ResultInFPR = MRI.createVirtualRegister(&Mips::FGR32RegClass); + Register ResultInFPR = MRI.createVirtualRegister(&Mips::FGR32RegClass); MachineInstr *Trunc = BuildMI(MBB, I, I.getDebugLoc(), TII.get(Opcode)) .addDef(ResultInFPR) .addUse(I.getOperand(1).getReg()); @@ -496,6 +603,24 @@ bool MipsInstructionSelector::select(MachineInstr &I, I.eraseFromParent(); return true; } + case G_JUMP_TABLE: { + if (MF.getTarget().isPositionIndependent()) { + MI = BuildMI(MBB, I, I.getDebugLoc(), TII.get(Mips::LW)) + .addDef(I.getOperand(0).getReg()) + .addReg(MF.getInfo() + ->getGlobalBaseRegForGlobalISel()) + .addJumpTableIndex(I.getOperand(1).getIndex(), MipsII::MO_GOT) + .addMemOperand( + MF.getMachineMemOperand(MachinePointerInfo::getGOT(MF), + MachineMemOperand::MOLoad, 4, 4)); + } else { + MI = + BuildMI(MBB, I, I.getDebugLoc(), TII.get(Mips::LUi)) + .addDef(I.getOperand(0).getReg()) + .addJumpTableIndex(I.getOperand(1).getIndex(), MipsII::MO_ABS_HI); + } + break; + } case G_ICMP: { struct Instr { unsigned Opcode; @@ -626,7 +751,7 @@ bool MipsInstructionSelector::select(MachineInstr &I, // MipsFCMPCondCode, result is inverted i.e. MOVT_I is used. unsigned MoveOpcode = isLogicallyNegated ? Mips::MOVT_I : Mips::MOVF_I; - unsigned TrueInReg = MRI.createVirtualRegister(&Mips::GPR32RegClass); + Register TrueInReg = MRI.createVirtualRegister(&Mips::GPR32RegClass); BuildMI(MBB, I, I.getDebugLoc(), TII.get(Mips::ADDiu)) .addDef(TrueInReg) .addUse(Mips::ZERO) @@ -654,6 +779,33 @@ bool MipsInstructionSelector::select(MachineInstr &I, I.eraseFromParent(); return true; } + case G_FENCE: { + MI = BuildMI(MBB, I, I.getDebugLoc(), TII.get(Mips::SYNC)).addImm(0); + break; + } + case G_VASTART: { + MipsFunctionInfo *FuncInfo = MF.getInfo(); + int FI = FuncInfo->getVarArgsFrameIndex(); + + Register LeaReg = MRI.createVirtualRegister(&Mips::GPR32RegClass); + MachineInstr *LEA_ADDiu = + BuildMI(MBB, I, I.getDebugLoc(), TII.get(Mips::LEA_ADDiu)) + .addDef(LeaReg) + .addFrameIndex(FI) + .addImm(0); + if (!constrainSelectedInstRegOperands(*LEA_ADDiu, TII, TRI, RBI)) + return false; + + MachineInstr *Store = BuildMI(MBB, I, I.getDebugLoc(), TII.get(Mips::SW)) + .addUse(LeaReg) + .addUse(I.getOperand(0).getReg()) + .addImm(0); + if (!constrainSelectedInstRegOperands(*Store, TII, TRI, RBI)) + return false; + + I.eraseFromParent(); + return true; + } default: return false; } diff --git a/lib/Target/Mips/MipsLegalizerInfo.cpp b/lib/Target/Mips/MipsLegalizerInfo.cpp index e442a81837ed..bb4a1d902d75 100644 --- a/lib/Target/Mips/MipsLegalizerInfo.cpp +++ b/lib/Target/Mips/MipsLegalizerInfo.cpp @@ -16,18 +16,65 @@ using namespace llvm; +struct TypesAndMemOps { + LLT ValTy; + LLT PtrTy; + unsigned MemSize; + bool MustBeNaturallyAligned; +}; + +static bool +CheckTy0Ty1MemSizeAlign(const LegalityQuery &Query, + std::initializer_list SupportedValues) { + for (auto &Val : SupportedValues) { + if (Val.ValTy != Query.Types[0]) + continue; + if (Val.PtrTy != Query.Types[1]) + continue; + if (Val.MemSize != Query.MMODescrs[0].SizeInBits) + continue; + if (Val.MustBeNaturallyAligned && + Query.MMODescrs[0].SizeInBits % Query.MMODescrs[0].AlignInBits != 0) + continue; + return true; + } + return false; +} + +static bool CheckTyN(unsigned N, const LegalityQuery &Query, + std::initializer_list SupportedValues) { + for (auto &Val : SupportedValues) + if (Val == Query.Types[N]) + return true; + return false; +} + MipsLegalizerInfo::MipsLegalizerInfo(const MipsSubtarget &ST) { using namespace 
TargetOpcode; const LLT s1 = LLT::scalar(1); const LLT s32 = LLT::scalar(32); const LLT s64 = LLT::scalar(64); + const LLT v16s8 = LLT::vector(16, 8); + const LLT v8s16 = LLT::vector(8, 16); + const LLT v4s32 = LLT::vector(4, 32); + const LLT v2s64 = LLT::vector(2, 64); const LLT p0 = LLT::pointer(0, 32); - getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) + getActionDefinitionsBuilder({G_SUB, G_MUL}) .legalFor({s32}) .clampScalar(0, s32, s32); + getActionDefinitionsBuilder(G_ADD) + .legalIf([=, &ST](const LegalityQuery &Query) { + if (CheckTyN(0, Query, {s32})) + return true; + if (ST.hasMSA() && CheckTyN(0, Query, {v16s8, v8s16, v4s32, v2s64})) + return true; + return false; + }) + .clampScalar(0, s32, s32); + getActionDefinitionsBuilder({G_UADDO, G_UADDE, G_USUBO, G_USUBE, G_UMULO}) .lowerFor({{s32, s1}}); @@ -36,13 +83,26 @@ MipsLegalizerInfo::MipsLegalizerInfo(const MipsSubtarget &ST) { .maxScalar(0, s32); getActionDefinitionsBuilder({G_LOAD, G_STORE}) - .legalForTypesWithMemDesc({{s32, p0, 8, 8}, - {s32, p0, 16, 8}, - {s32, p0, 32, 8}, - {s64, p0, 64, 8}, - {p0, p0, 32, 8}}) + .legalIf([=, &ST](const LegalityQuery &Query) { + if (CheckTy0Ty1MemSizeAlign(Query, {{s32, p0, 8, ST.hasMips32r6()}, + {s32, p0, 16, ST.hasMips32r6()}, + {s32, p0, 32, ST.hasMips32r6()}, + {p0, p0, 32, ST.hasMips32r6()}, + {s64, p0, 64, ST.hasMips32r6()}})) + return true; + if (ST.hasMSA() && + CheckTy0Ty1MemSizeAlign(Query, {{v16s8, p0, 128, false}, + {v8s16, p0, 128, false}, + {v4s32, p0, 128, false}, + {v2s64, p0, 128, false}})) + return true; + return false; + }) .minScalar(0, s32); + getActionDefinitionsBuilder(G_IMPLICIT_DEF) + .legalFor({s32, s64}); + getActionDefinitionsBuilder(G_UNMERGE_VALUES) .legalFor({{s32, s64}}); @@ -50,9 +110,17 @@ MipsLegalizerInfo::MipsLegalizerInfo(const MipsSubtarget &ST) { .legalFor({{s64, s32}}); getActionDefinitionsBuilder({G_ZEXTLOAD, G_SEXTLOAD}) - .legalForTypesWithMemDesc({{s32, p0, 8, 8}, - {s32, p0, 16, 8}}) - .minScalar(0, s32); + .legalForTypesWithMemDesc({{s32, p0, 8, 8}, + {s32, p0, 16, 8}}) + .clampScalar(0, s32, s32); + + getActionDefinitionsBuilder({G_ZEXT, G_SEXT}) + .legalIf([](const LegalityQuery &Query) { return false; }) + .maxScalar(0, s32); + + getActionDefinitionsBuilder(G_TRUNC) + .legalIf([](const LegalityQuery &Query) { return false; }) + .maxScalar(1, s32); getActionDefinitionsBuilder(G_SELECT) .legalForCartesianProduct({p0, s32, s64}, {s32}) @@ -63,6 +131,12 @@ MipsLegalizerInfo::MipsLegalizerInfo(const MipsSubtarget &ST) { .legalFor({s32}) .minScalar(0, s32); + getActionDefinitionsBuilder(G_BRJT) + .legalFor({{p0, s32}}); + + getActionDefinitionsBuilder(G_BRINDIRECT) + .legalFor({p0}); + getActionDefinitionsBuilder(G_PHI) .legalFor({p0, s32, s64}) .minScalar(0, s32); @@ -77,8 +151,9 @@ MipsLegalizerInfo::MipsLegalizerInfo(const MipsSubtarget &ST) { .libcallFor({s64}); getActionDefinitionsBuilder({G_SHL, G_ASHR, G_LSHR}) - .legalFor({s32, s32}) - .minScalar(1, s32); + .legalFor({{s32, s32}}) + .clampScalar(1, s32, s32) + .clampScalar(0, s32, s32); getActionDefinitionsBuilder(G_ICMP) .legalForCartesianProduct({s32}, {s32, p0}) @@ -89,15 +164,24 @@ MipsLegalizerInfo::MipsLegalizerInfo(const MipsSubtarget &ST) { .legalFor({s32}) .clampScalar(0, s32, s32); - getActionDefinitionsBuilder(G_GEP) + getActionDefinitionsBuilder({G_GEP, G_INTTOPTR}) .legalFor({{p0, s32}}); + getActionDefinitionsBuilder(G_PTRTOINT) + .legalFor({{s32, p0}}); + getActionDefinitionsBuilder(G_FRAME_INDEX) .legalFor({p0}); - getActionDefinitionsBuilder(G_GLOBAL_VALUE) + 
getActionDefinitionsBuilder({G_GLOBAL_VALUE, G_JUMP_TABLE}) .legalFor({p0}); + getActionDefinitionsBuilder(G_DYN_STACKALLOC) + .lowerFor({{p0, s32}}); + + getActionDefinitionsBuilder(G_VASTART) + .legalFor({p0}); + // FP instructions getActionDefinitionsBuilder(G_FCONSTANT) .legalFor({s32, s64}); @@ -126,6 +210,7 @@ MipsLegalizerInfo::MipsLegalizerInfo(const MipsSubtarget &ST) { getActionDefinitionsBuilder(G_FPTOUI) .libcallForCartesianProduct({s64}, {s64, s32}) + .lowerForCartesianProduct({s32}, {s64, s32}) .minScalar(0, s32); // Int to FP conversion instructions @@ -136,8 +221,11 @@ MipsLegalizerInfo::MipsLegalizerInfo(const MipsSubtarget &ST) { getActionDefinitionsBuilder(G_UITOFP) .libcallForCartesianProduct({s64, s32}, {s64}) + .customForCartesianProduct({s64, s32}, {s32}) .minScalar(1, s32); + getActionDefinitionsBuilder(G_SEXT_INREG).lower(); + computeTables(); verify(*ST.getInstrInfo()); } @@ -150,6 +238,134 @@ bool MipsLegalizerInfo::legalizeCustom(MachineInstr &MI, using namespace TargetOpcode; MIRBuilder.setInstr(MI); + const MipsSubtarget &STI = + static_cast(MIRBuilder.getMF().getSubtarget()); + const LLT s32 = LLT::scalar(32); + const LLT s64 = LLT::scalar(64); - return false; + switch (MI.getOpcode()) { + case G_UITOFP: { + Register Dst = MI.getOperand(0).getReg(); + Register Src = MI.getOperand(1).getReg(); + LLT DstTy = MRI.getType(Dst); + LLT SrcTy = MRI.getType(Src); + + if (SrcTy != s32) + return false; + if (DstTy != s32 && DstTy != s64) + return false; + + // Let 0xABCDEFGH be given unsigned in MI.getOperand(1). First let's convert + // unsigned to double. Mantissa has 52 bits so we use following trick: + // First make floating point bit mask 0x43300000ABCDEFGH. + // Mask represents 2^52 * 0x1.00000ABCDEFGH i.e. 0x100000ABCDEFGH.0 . + // Next, subtract 2^52 * 0x1.0000000000000 i.e. 0x10000000000000.0 from it. + // Done. Trunc double to float if needed. + + MachineInstrBuilder Bitcast = MIRBuilder.buildInstr( + STI.isFP64bit() ? 
Mips::BuildPairF64_64 : Mips::BuildPairF64, {s64}, + {Src, MIRBuilder.buildConstant(s32, UINT32_C(0x43300000))}); + Bitcast.constrainAllUses(MIRBuilder.getTII(), *STI.getRegisterInfo(), + *STI.getRegBankInfo()); + + MachineInstrBuilder TwoP52FP = MIRBuilder.buildFConstant( + s64, BitsToDouble(UINT64_C(0x4330000000000000))); + + if (DstTy == s64) + MIRBuilder.buildFSub(Dst, Bitcast, TwoP52FP); + else { + MachineInstrBuilder ResF64 = MIRBuilder.buildFSub(s64, Bitcast, TwoP52FP); + MIRBuilder.buildFPTrunc(Dst, ResF64); + } + + MI.eraseFromParent(); + break; + } + default: + return false; + } + + return true; +} + +static bool SelectMSA3OpIntrinsic(MachineInstr &MI, unsigned Opcode, + MachineIRBuilder &MIRBuilder, + const MipsSubtarget &ST) { + assert(ST.hasMSA() && "MSA intrinsic not supported on target without MSA."); + if (!MIRBuilder.buildInstr(Opcode) + .add(MI.getOperand(0)) + .add(MI.getOperand(2)) + .add(MI.getOperand(3)) + .constrainAllUses(MIRBuilder.getTII(), *ST.getRegisterInfo(), + *ST.getRegBankInfo())) + return false; + MI.eraseFromParent(); + return true; +} + +static bool MSA3OpIntrinsicToGeneric(MachineInstr &MI, unsigned Opcode, + MachineIRBuilder &MIRBuilder, + const MipsSubtarget &ST) { + assert(ST.hasMSA() && "MSA intrinsic not supported on target without MSA."); + MIRBuilder.buildInstr(Opcode) + .add(MI.getOperand(0)) + .add(MI.getOperand(2)) + .add(MI.getOperand(3)); + MI.eraseFromParent(); + return true; +} + +bool MipsLegalizerInfo::legalizeIntrinsic(MachineInstr &MI, + MachineRegisterInfo &MRI, + MachineIRBuilder &MIRBuilder) const { + const MipsSubtarget &ST = + static_cast(MI.getMF()->getSubtarget()); + const MipsInstrInfo &TII = *ST.getInstrInfo(); + const MipsRegisterInfo &TRI = *ST.getRegisterInfo(); + const RegisterBankInfo &RBI = *ST.getRegBankInfo(); + MIRBuilder.setInstr(MI); + + switch (MI.getIntrinsicID()) { + case Intrinsic::memcpy: + case Intrinsic::memset: + case Intrinsic::memmove: + if (createMemLibcall(MIRBuilder, MRI, MI) == + LegalizerHelper::UnableToLegalize) + return false; + MI.eraseFromParent(); + return true; + case Intrinsic::trap: { + MachineInstr *Trap = MIRBuilder.buildInstr(Mips::TRAP); + MI.eraseFromParent(); + return constrainSelectedInstRegOperands(*Trap, TII, TRI, RBI); + } + case Intrinsic::vacopy: { + Register Tmp = MRI.createGenericVirtualRegister(LLT::pointer(0, 32)); + MachinePointerInfo MPO; + MIRBuilder.buildLoad(Tmp, MI.getOperand(2), + *MI.getMF()->getMachineMemOperand( + MPO, MachineMemOperand::MOLoad, 4, 4)); + MIRBuilder.buildStore(Tmp, MI.getOperand(1), + *MI.getMF()->getMachineMemOperand( + MPO, MachineMemOperand::MOStore, 4, 4)); + MI.eraseFromParent(); + return true; + } + case Intrinsic::mips_addv_b: + case Intrinsic::mips_addv_h: + case Intrinsic::mips_addv_w: + case Intrinsic::mips_addv_d: + return MSA3OpIntrinsicToGeneric(MI, TargetOpcode::G_ADD, MIRBuilder, ST); + case Intrinsic::mips_addvi_b: + return SelectMSA3OpIntrinsic(MI, Mips::ADDVI_B, MIRBuilder, ST); + case Intrinsic::mips_addvi_h: + return SelectMSA3OpIntrinsic(MI, Mips::ADDVI_H, MIRBuilder, ST); + case Intrinsic::mips_addvi_w: + return SelectMSA3OpIntrinsic(MI, Mips::ADDVI_W, MIRBuilder, ST); + case Intrinsic::mips_addvi_d: + return SelectMSA3OpIntrinsic(MI, Mips::ADDVI_D, MIRBuilder, ST); + default: + break; + } + return true; } diff --git a/lib/Target/Mips/MipsLegalizerInfo.h b/lib/Target/Mips/MipsLegalizerInfo.h index e5021e081890..9696c262b2db 100644 --- a/lib/Target/Mips/MipsLegalizerInfo.h +++ b/lib/Target/Mips/MipsLegalizerInfo.h @@ -28,6 +28,9 
@@ public: bool legalizeCustom(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &MIRBuilder, GISelChangeObserver &Observer) const override; + + bool legalizeIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &MIRBuilder) const override; }; } // end namespace llvm #endif diff --git a/lib/Target/Mips/MipsMSAInstrInfo.td b/lib/Target/Mips/MipsMSAInstrInfo.td index 907ed9ef746f..f585d9c1a148 100644 --- a/lib/Target/Mips/MipsMSAInstrInfo.td +++ b/lib/Target/Mips/MipsMSAInstrInfo.td @@ -60,6 +60,11 @@ def immZExt2Ptr : ImmLeaf(Imm);}]>; def immZExt3Ptr : ImmLeaf(Imm);}]>; def immZExt4Ptr : ImmLeaf(Imm);}]>; +def timmZExt1Ptr : TImmLeaf(Imm);}]>; +def timmZExt2Ptr : TImmLeaf(Imm);}]>; +def timmZExt3Ptr : TImmLeaf(Imm);}]>; +def timmZExt4Ptr : TImmLeaf(Imm);}]>; + // Operands def immZExt2Lsa : ImmLeaf(Imm - 1);}]>; @@ -1270,7 +1275,7 @@ class MSA_I8_SHF_DESC_BASE Pattern = [(set ROWD:$wd, (MipsSHF immZExt8:$u8, ROWS:$ws))]; + list Pattern = [(set ROWD:$wd, (MipsSHF timmZExt8:$u8, ROWS:$ws))]; InstrItinClass Itinerary = itin; } @@ -2299,13 +2304,13 @@ class INSERT_FW_VIDX64_PSEUDO_DESC : class INSERT_FD_VIDX64_PSEUDO_DESC : MSA_INSERT_VIDX_PSEUDO_BASE; -class INSVE_B_DESC : MSA_INSVE_DESC_BASE<"insve.b", insve_v16i8, uimm4, immZExt4, +class INSVE_B_DESC : MSA_INSVE_DESC_BASE<"insve.b", insve_v16i8, uimm4, timmZExt4, MSA128BOpnd>; -class INSVE_H_DESC : MSA_INSVE_DESC_BASE<"insve.h", insve_v8i16, uimm3, immZExt3, +class INSVE_H_DESC : MSA_INSVE_DESC_BASE<"insve.h", insve_v8i16, uimm3, timmZExt3, MSA128HOpnd>; -class INSVE_W_DESC : MSA_INSVE_DESC_BASE<"insve.w", insve_v4i32, uimm2, immZExt2, +class INSVE_W_DESC : MSA_INSVE_DESC_BASE<"insve.w", insve_v4i32, uimm2, timmZExt2, MSA128WOpnd>; -class INSVE_D_DESC : MSA_INSVE_DESC_BASE<"insve.d", insve_v2i64, uimm1, immZExt1, +class INSVE_D_DESC : MSA_INSVE_DESC_BASE<"insve.d", insve_v2i64, uimm1, timmZExt1, MSA128DOpnd>; class LD_DESC_BASE; class PCNT_D_DESC : MSA_2R_DESC_BASE<"pcnt.d", ctpop, MSA128DOpnd>; class SAT_S_B_DESC : MSA_BIT_X_DESC_BASE<"sat_s.b", int_mips_sat_s_b, uimm3, - immZExt3, MSA128BOpnd>; + timmZExt3, MSA128BOpnd>; class SAT_S_H_DESC : MSA_BIT_X_DESC_BASE<"sat_s.h", int_mips_sat_s_h, uimm4, - immZExt4, MSA128HOpnd>; + timmZExt4, MSA128HOpnd>; class SAT_S_W_DESC : MSA_BIT_X_DESC_BASE<"sat_s.w", int_mips_sat_s_w, uimm5, - immZExt5, MSA128WOpnd>; + timmZExt5, MSA128WOpnd>; class SAT_S_D_DESC : MSA_BIT_X_DESC_BASE<"sat_s.d", int_mips_sat_s_d, uimm6, - immZExt6, MSA128DOpnd>; + timmZExt6, MSA128DOpnd>; class SAT_U_B_DESC : MSA_BIT_X_DESC_BASE<"sat_u.b", int_mips_sat_u_b, uimm3, - immZExt3, MSA128BOpnd>; + timmZExt3, MSA128BOpnd>; class SAT_U_H_DESC : MSA_BIT_X_DESC_BASE<"sat_u.h", int_mips_sat_u_h, uimm4, - immZExt4, MSA128HOpnd>; + timmZExt4, MSA128HOpnd>; class SAT_U_W_DESC : MSA_BIT_X_DESC_BASE<"sat_u.w", int_mips_sat_u_w, uimm5, - immZExt5, MSA128WOpnd>; + timmZExt5, MSA128WOpnd>; class SAT_U_D_DESC : MSA_BIT_X_DESC_BASE<"sat_u.d", int_mips_sat_u_d, uimm6, - immZExt6, MSA128DOpnd>; + timmZExt6, MSA128DOpnd>; class SHF_B_DESC : MSA_I8_SHF_DESC_BASE<"shf.b", MSA128BOpnd>; class SHF_H_DESC : MSA_I8_SHF_DESC_BASE<"shf.h", MSA128HOpnd>; @@ -2546,16 +2551,16 @@ class SLD_D_DESC : MSA_3R_SLD_DESC_BASE<"sld.d", int_mips_sld_d, MSA128DOpnd>; class SLDI_B_DESC : MSA_ELM_SLD_DESC_BASE<"sldi.b", int_mips_sldi_b, MSA128BOpnd, MSA128BOpnd, uimm4, - immZExt4>; + timmZExt4>; class SLDI_H_DESC : MSA_ELM_SLD_DESC_BASE<"sldi.h", int_mips_sldi_h, MSA128HOpnd, MSA128HOpnd, uimm3, - immZExt3>; + timmZExt3>; class SLDI_W_DESC : 
MSA_ELM_SLD_DESC_BASE<"sldi.w", int_mips_sldi_w, MSA128WOpnd, MSA128WOpnd, uimm2, - immZExt2>; + timmZExt2>; class SLDI_D_DESC : MSA_ELM_SLD_DESC_BASE<"sldi.d", int_mips_sldi_d, MSA128DOpnd, MSA128DOpnd, uimm1, - immZExt1>; + timmZExt1>; class SLL_B_DESC : MSA_3R_DESC_BASE<"sll.b", shl, MSA128BOpnd>; class SLL_H_DESC : MSA_3R_DESC_BASE<"sll.h", shl, MSA128HOpnd>; @@ -2609,13 +2614,13 @@ class SRAR_W_DESC : MSA_3R_DESC_BASE<"srar.w", int_mips_srar_w, MSA128WOpnd>; class SRAR_D_DESC : MSA_3R_DESC_BASE<"srar.d", int_mips_srar_d, MSA128DOpnd>; class SRARI_B_DESC : MSA_BIT_X_DESC_BASE<"srari.b", int_mips_srari_b, uimm3, - immZExt3, MSA128BOpnd>; + timmZExt3, MSA128BOpnd>; class SRARI_H_DESC : MSA_BIT_X_DESC_BASE<"srari.h", int_mips_srari_h, uimm4, - immZExt4, MSA128HOpnd>; + timmZExt4, MSA128HOpnd>; class SRARI_W_DESC : MSA_BIT_X_DESC_BASE<"srari.w", int_mips_srari_w, uimm5, - immZExt5, MSA128WOpnd>; + timmZExt5, MSA128WOpnd>; class SRARI_D_DESC : MSA_BIT_X_DESC_BASE<"srari.d", int_mips_srari_d, uimm6, - immZExt6, MSA128DOpnd>; + timmZExt6, MSA128DOpnd>; class SRL_B_DESC : MSA_3R_DESC_BASE<"srl.b", srl, MSA128BOpnd>; class SRL_H_DESC : MSA_3R_DESC_BASE<"srl.h", srl, MSA128HOpnd>; @@ -2637,13 +2642,13 @@ class SRLR_W_DESC : MSA_3R_DESC_BASE<"srlr.w", int_mips_srlr_w, MSA128WOpnd>; class SRLR_D_DESC : MSA_3R_DESC_BASE<"srlr.d", int_mips_srlr_d, MSA128DOpnd>; class SRLRI_B_DESC : MSA_BIT_X_DESC_BASE<"srlri.b", int_mips_srlri_b, uimm3, - immZExt3, MSA128BOpnd>; + timmZExt3, MSA128BOpnd>; class SRLRI_H_DESC : MSA_BIT_X_DESC_BASE<"srlri.h", int_mips_srlri_h, uimm4, - immZExt4, MSA128HOpnd>; + timmZExt4, MSA128HOpnd>; class SRLRI_W_DESC : MSA_BIT_X_DESC_BASE<"srlri.w", int_mips_srlri_w, uimm5, - immZExt5, MSA128WOpnd>; + timmZExt5, MSA128WOpnd>; class SRLRI_D_DESC : MSA_BIT_X_DESC_BASE<"srlri.d", int_mips_srlri_d, uimm6, - immZExt6, MSA128DOpnd>; + timmZExt6, MSA128DOpnd>; class ST_DESC_BASEgetParent(); const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); - unsigned SrcReg = I->getOperand(0).getReg(); + Register SrcReg = I->getOperand(0).getReg(); unsigned DstReg = getRegTy(SrcReg, MF) == MVT::i32 ? Mips::T9 : Mips::T9_64; BuildMI(*MBB, I, I->getDebugLoc(), TII.get(TargetOpcode::COPY), DstReg) .addReg(SrcReg); diff --git a/lib/Target/Mips/MipsPfmCounters.td b/lib/Target/Mips/MipsPfmCounters.td new file mode 100644 index 000000000000..c7779b474b91 --- /dev/null +++ b/lib/Target/Mips/MipsPfmCounters.td @@ -0,0 +1,18 @@ +//===-- MipsPfmCounters.td - Mips Hardware Counters --------*- tablegen -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This describes the available hardware counters for Mips. 
+// +//===----------------------------------------------------------------------===// + +def CpuCyclesPfmCounter : PfmCounter<"CYCLES">; + +def DefaultPfmCounters : ProcPfmCounters { + let CycleCounter = CpuCyclesPfmCounter; +} +def : PfmCountersDefaultBinding; diff --git a/lib/Target/Mips/MipsPreLegalizerCombiner.cpp b/lib/Target/Mips/MipsPreLegalizerCombiner.cpp index 85076590d407..ace0735652bd 100644 --- a/lib/Target/Mips/MipsPreLegalizerCombiner.cpp +++ b/lib/Target/Mips/MipsPreLegalizerCombiner.cpp @@ -27,7 +27,8 @@ class MipsPreLegalizerCombinerInfo : public CombinerInfo { public: MipsPreLegalizerCombinerInfo() : CombinerInfo(/*AllowIllegalOps*/ true, /*ShouldLegalizeIllegal*/ false, - /*LegalizerInfo*/ nullptr) {} + /*LegalizerInfo*/ nullptr, /*EnableOpt*/ false, + /*EnableOptSize*/ false, /*EnableMinSize*/ false) {} virtual bool combine(GISelChangeObserver &Observer, MachineInstr &MI, MachineIRBuilder &B) const override; }; diff --git a/lib/Target/Mips/MipsRegisterBankInfo.cpp b/lib/Target/Mips/MipsRegisterBankInfo.cpp index d8bcf16afd50..d334366e727c 100644 --- a/lib/Target/Mips/MipsRegisterBankInfo.cpp +++ b/lib/Target/Mips/MipsRegisterBankInfo.cpp @@ -12,6 +12,7 @@ #include "MipsRegisterBankInfo.h" #include "MipsInstrInfo.h" +#include "MipsTargetMachine.h" #include "llvm/CodeGen/GlobalISel/GISelChangeObserver.h" #include "llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h" #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h" @@ -27,20 +28,23 @@ enum PartialMappingIdx { PMI_GPR, PMI_SPR, PMI_DPR, + PMI_MSA, PMI_Min = PMI_GPR, }; RegisterBankInfo::PartialMapping PartMappings[]{ {0, 32, GPRBRegBank}, {0, 32, FPRBRegBank}, - {0, 64, FPRBRegBank} + {0, 64, FPRBRegBank}, + {0, 128, FPRBRegBank} }; enum ValueMappingIdx { InvalidIdx = 0, GPRIdx = 1, SPRIdx = 4, - DPRIdx = 7 + DPRIdx = 7, + MSAIdx = 10 }; RegisterBankInfo::ValueMapping ValueMappings[] = { @@ -50,14 +54,18 @@ RegisterBankInfo::ValueMapping ValueMappings[] = { {&PartMappings[PMI_GPR - PMI_Min], 1}, {&PartMappings[PMI_GPR - PMI_Min], 1}, {&PartMappings[PMI_GPR - PMI_Min], 1}, - // up to 3 ops operands FPRs - single precission + // up to 3 operands in FPRs - single precission {&PartMappings[PMI_SPR - PMI_Min], 1}, {&PartMappings[PMI_SPR - PMI_Min], 1}, {&PartMappings[PMI_SPR - PMI_Min], 1}, - // up to 3 ops operands FPRs - double precission + // up to 3 operands in FPRs - double precission {&PartMappings[PMI_DPR - PMI_Min], 1}, {&PartMappings[PMI_DPR - PMI_Min], 1}, - {&PartMappings[PMI_DPR - PMI_Min], 1} + {&PartMappings[PMI_DPR - PMI_Min], 1}, + // up to 3 operands in FPRs - MSA + {&PartMappings[PMI_MSA - PMI_Min], 1}, + {&PartMappings[PMI_MSA - PMI_Min], 1}, + {&PartMappings[PMI_MSA - PMI_Min], 1} }; } // end namespace Mips @@ -86,6 +94,10 @@ const RegisterBank &MipsRegisterBankInfo::getRegBankFromRegClass( case Mips::FGR32RegClassID: case Mips::FGR64RegClassID: case Mips::AFGR64RegClassID: + case Mips::MSA128BRegClassID: + case Mips::MSA128HRegClassID: + case Mips::MSA128WRegClassID: + case Mips::MSA128DRegClassID: return getRegBank(Mips::FPRBRegBankID); default: llvm_unreachable("Register class not supported"); @@ -149,6 +161,7 @@ static bool isAmbiguous(unsigned Opc) { case TargetOpcode::G_STORE: case TargetOpcode::G_PHI: case TargetOpcode::G_SELECT: + case TargetOpcode::G_IMPLICIT_DEF: return true; default: return false; @@ -163,8 +176,7 @@ void MipsRegisterBankInfo::AmbiguousRegDefUseContainer::addDefUses( MachineInstr *NonCopyInstr = skipCopiesOutgoing(&UseMI); // Copy with many uses. 
if (NonCopyInstr->getOpcode() == TargetOpcode::COPY && - !TargetRegisterInfo::isPhysicalRegister( - NonCopyInstr->getOperand(0).getReg())) + !Register::isPhysicalRegister(NonCopyInstr->getOperand(0).getReg())) addDefUses(NonCopyInstr->getOperand(0).getReg(), MRI); else DefUses.push_back(skipCopiesOutgoing(&UseMI)); @@ -186,7 +198,7 @@ MipsRegisterBankInfo::AmbiguousRegDefUseContainer::skipCopiesOutgoing( const MachineRegisterInfo &MRI = MF.getRegInfo(); MachineInstr *Ret = MI; while (Ret->getOpcode() == TargetOpcode::COPY && - !TargetRegisterInfo::isPhysicalRegister(Ret->getOperand(0).getReg()) && + !Register::isPhysicalRegister(Ret->getOperand(0).getReg()) && MRI.hasOneUse(Ret->getOperand(0).getReg())) { Ret = &(*MRI.use_instr_begin(Ret->getOperand(0).getReg())); } @@ -200,7 +212,7 @@ MipsRegisterBankInfo::AmbiguousRegDefUseContainer::skipCopiesIncoming( const MachineRegisterInfo &MRI = MF.getRegInfo(); MachineInstr *Ret = MI; while (Ret->getOpcode() == TargetOpcode::COPY && - !TargetRegisterInfo::isPhysicalRegister(Ret->getOperand(1).getReg())) + !Register::isPhysicalRegister(Ret->getOperand(1).getReg())) Ret = MRI.getVRegDef(Ret->getOperand(1).getReg()); return Ret; } @@ -231,6 +243,9 @@ MipsRegisterBankInfo::AmbiguousRegDefUseContainer::AmbiguousRegDefUseContainer( addUseDef(MI->getOperand(2).getReg(), MRI); addUseDef(MI->getOperand(3).getReg(), MRI); } + + if (MI->getOpcode() == TargetOpcode::G_IMPLICIT_DEF) + addDefUses(MI->getOperand(0).getReg(), MRI); } bool MipsRegisterBankInfo::TypeInfoForMF::visit( @@ -318,8 +333,7 @@ void MipsRegisterBankInfo::TypeInfoForMF::setTypes(const MachineInstr *MI, void MipsRegisterBankInfo::TypeInfoForMF::setTypesAccordingToPhysicalRegister( const MachineInstr *MI, const MachineInstr *CopyInst, unsigned Op) { - assert((TargetRegisterInfo::isPhysicalRegister( - CopyInst->getOperand(Op).getReg())) && + assert((Register::isPhysicalRegister(CopyInst->getOperand(Op).getReg())) && "Copies of non physical registers should not be considered here.\n"); const MachineFunction &MF = *CopyInst->getMF(); @@ -353,6 +367,31 @@ void MipsRegisterBankInfo::TypeInfoForMF::cleanupIfNewFunction( } } +static const MipsRegisterBankInfo::ValueMapping * +getMSAMapping(const MachineFunction &MF) { + assert(static_cast(MF.getSubtarget()).hasMSA() && + "MSA mapping not available on target without MSA."); + return &Mips::ValueMappings[Mips::MSAIdx]; +} + +static const MipsRegisterBankInfo::ValueMapping *getFprbMapping(unsigned Size) { + return Size == 32 ? &Mips::ValueMappings[Mips::SPRIdx] + : &Mips::ValueMappings[Mips::DPRIdx]; +} + +static const unsigned CustomMappingID = 1; + +// Only 64 bit mapping is available in fprb and will be marked as custom, i.e. +// will be split into two 32 bit registers in gprb. +static const MipsRegisterBankInfo::ValueMapping * +getGprbOrCustomMapping(unsigned Size, unsigned &MappingID) { + if (Size == 32) + return &Mips::ValueMappings[Mips::GPRIdx]; + + MappingID = CustomMappingID; + return &Mips::ValueMappings[Mips::DPRIdx]; +} + const RegisterBankInfo::InstructionMapping & MipsRegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { @@ -377,17 +416,35 @@ MipsRegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { unsigned NumOperands = MI.getNumOperands(); const ValueMapping *OperandsMapping = &Mips::ValueMappings[Mips::GPRIdx]; unsigned MappingID = DefaultMappingID; - const unsigned CustomMappingID = 1; + + // Check if LLT sizes match sizes of available register banks. 
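The new MSAIdx entry continues the existing layout of Mips::ValueMappings: slot 0 is invalid and each partial mapping owns three consecutive per-operand slots, which is where the constants 1, 4, 7 and 10 come from. A throwaway arithmetic sketch of that layout (plain C++, not an LLVM API):

#include <cassert>

// Mirror of the index constants used in MipsRegisterBankInfo.cpp.
enum ValueMappingIdx { InvalidIdx = 0, GPRIdx = 1, SPRIdx = 4, DPRIdx = 7, MSAIdx = 10 };

// Slot of the first of the three per-operand entries owned by partial mapping
// number PM (0 = gprb 32-bit, 1 = fprb single, 2 = fprb double, 3 = fprb MSA).
constexpr int firstSlot(int PM) { return 1 + 3 * PM; }

int main() {
  static_assert(firstSlot(0) == GPRIdx, "gprb starts at slot 1");
  static_assert(firstSlot(1) == SPRIdx, "single-precision fprb starts at slot 4");
  static_assert(firstSlot(2) == DPRIdx, "double-precision fprb starts at slot 7");
  static_assert(firstSlot(3) == MSAIdx, "128-bit MSA mapping starts at slot 10");
  return 0;
}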
+ for (const MachineOperand &Op : MI.operands()) { + if (Op.isReg()) { + LLT RegTy = MRI.getType(Op.getReg()); + + if (RegTy.isScalar() && + (RegTy.getSizeInBits() != 32 && RegTy.getSizeInBits() != 64)) + return getInvalidInstructionMapping(); + + if (RegTy.isVector() && RegTy.getSizeInBits() != 128) + return getInvalidInstructionMapping(); + } + } + + const LLT Op0Ty = MRI.getType(MI.getOperand(0).getReg()); + unsigned Op0Size = Op0Ty.getSizeInBits(); + InstType InstTy = InstType::Integer; switch (Opc) { case G_TRUNC: - case G_ADD: case G_SUB: case G_MUL: case G_UMULH: case G_ZEXTLOAD: case G_SEXTLOAD: case G_GEP: + case G_INTTOPTR: + case G_PTRTOINT: case G_AND: case G_OR: case G_XOR: @@ -398,66 +455,42 @@ MipsRegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case G_UDIV: case G_SREM: case G_UREM: + case G_BRINDIRECT: + case G_VASTART: OperandsMapping = &Mips::ValueMappings[Mips::GPRIdx]; break; - case G_LOAD: { - unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); - InstType InstTy = InstType::Integer; - if (!MRI.getType(MI.getOperand(0).getReg()).isPointer()) { - InstTy = TI.determineInstType(&MI); - } - - if (InstTy == InstType::FloatingPoint || - (Size == 64 && InstTy == InstType::Ambiguous)) { // fprb - OperandsMapping = - getOperandsMapping({Size == 32 ? &Mips::ValueMappings[Mips::SPRIdx] - : &Mips::ValueMappings[Mips::DPRIdx], - &Mips::ValueMappings[Mips::GPRIdx]}); + case G_ADD: + OperandsMapping = &Mips::ValueMappings[Mips::GPRIdx]; + if (Op0Size == 128) + OperandsMapping = getMSAMapping(MF); + break; + case G_STORE: + case G_LOAD: + if (Op0Size == 128) { + OperandsMapping = getOperandsMapping( + {getMSAMapping(MF), &Mips::ValueMappings[Mips::GPRIdx]}); break; - } else { // gprb - OperandsMapping = - getOperandsMapping({Size <= 32 ? &Mips::ValueMappings[Mips::GPRIdx] - : &Mips::ValueMappings[Mips::DPRIdx], - &Mips::ValueMappings[Mips::GPRIdx]}); - if (Size == 64) - MappingID = CustomMappingID; } - break; - } - case G_STORE: { - unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); - InstType InstTy = InstType::Integer; - if (!MRI.getType(MI.getOperand(0).getReg()).isPointer()) { + if (!Op0Ty.isPointer()) InstTy = TI.determineInstType(&MI); - } if (InstTy == InstType::FloatingPoint || - (Size == 64 && InstTy == InstType::Ambiguous)) { // fprb - OperandsMapping = - getOperandsMapping({Size == 32 ? &Mips::ValueMappings[Mips::SPRIdx] - : &Mips::ValueMappings[Mips::DPRIdx], - &Mips::ValueMappings[Mips::GPRIdx]}); - break; - } else { // gprb + (Op0Size == 64 && InstTy == InstType::Ambiguous)) + OperandsMapping = getOperandsMapping( + {getFprbMapping(Op0Size), &Mips::ValueMappings[Mips::GPRIdx]}); + else OperandsMapping = - getOperandsMapping({Size <= 32 ? &Mips::ValueMappings[Mips::GPRIdx] - : &Mips::ValueMappings[Mips::DPRIdx], + getOperandsMapping({getGprbOrCustomMapping(Op0Size, MappingID), &Mips::ValueMappings[Mips::GPRIdx]}); - if (Size == 64) - MappingID = CustomMappingID; - } + break; - } - case G_PHI: { - unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); - InstType InstTy = InstType::Integer; - if (!MRI.getType(MI.getOperand(0).getReg()).isPointer()) { + case G_PHI: + if (!Op0Ty.isPointer()) InstTy = TI.determineInstType(&MI); - } // PHI is copylike and should have one regbank in mapping for def register. 
- if (InstTy == InstType::Integer && Size == 64) { // fprb + if (InstTy == InstType::Integer && Op0Size == 64) { OperandsMapping = getOperandsMapping({&Mips::ValueMappings[Mips::DPRIdx]}); return getInstructionMapping(CustomMappingID, /*Cost=*/1, OperandsMapping, @@ -465,80 +498,63 @@ MipsRegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { } // Use default handling for PHI, i.e. set reg bank of def operand to match // register banks of use operands. - const RegisterBankInfo::InstructionMapping &Mapping = - getInstrMappingImpl(MI); - return Mapping; - } + return getInstrMappingImpl(MI); case G_SELECT: { - unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); - InstType InstTy = InstType::Integer; - if (!MRI.getType(MI.getOperand(0).getReg()).isPointer()) { + if (!Op0Ty.isPointer()) InstTy = TI.determineInstType(&MI); - } if (InstTy == InstType::FloatingPoint || - (Size == 64 && InstTy == InstType::Ambiguous)) { // fprb - const RegisterBankInfo::ValueMapping *Bank = - Size == 32 ? &Mips::ValueMappings[Mips::SPRIdx] - : &Mips::ValueMappings[Mips::DPRIdx]; + (Op0Size == 64 && InstTy == InstType::Ambiguous)) { + const RegisterBankInfo::ValueMapping *Bank = getFprbMapping(Op0Size); OperandsMapping = getOperandsMapping( {Bank, &Mips::ValueMappings[Mips::GPRIdx], Bank, Bank}); break; - } else { // gprb + } else { const RegisterBankInfo::ValueMapping *Bank = - Size <= 32 ? &Mips::ValueMappings[Mips::GPRIdx] - : &Mips::ValueMappings[Mips::DPRIdx]; + getGprbOrCustomMapping(Op0Size, MappingID); OperandsMapping = getOperandsMapping( {Bank, &Mips::ValueMappings[Mips::GPRIdx], Bank, Bank}); - if (Size == 64) - MappingID = CustomMappingID; } break; } - case G_UNMERGE_VALUES: { + case G_IMPLICIT_DEF: + if (!Op0Ty.isPointer()) + InstTy = TI.determineInstType(&MI); + + if (InstTy == InstType::FloatingPoint) + OperandsMapping = getFprbMapping(Op0Size); + else + OperandsMapping = getGprbOrCustomMapping(Op0Size, MappingID); + + break; + case G_UNMERGE_VALUES: OperandsMapping = getOperandsMapping({&Mips::ValueMappings[Mips::GPRIdx], &Mips::ValueMappings[Mips::GPRIdx], &Mips::ValueMappings[Mips::DPRIdx]}); MappingID = CustomMappingID; break; - } - case G_MERGE_VALUES: { + case G_MERGE_VALUES: OperandsMapping = getOperandsMapping({&Mips::ValueMappings[Mips::DPRIdx], &Mips::ValueMappings[Mips::GPRIdx], &Mips::ValueMappings[Mips::GPRIdx]}); MappingID = CustomMappingID; break; - } case G_FADD: case G_FSUB: case G_FMUL: case G_FDIV: case G_FABS: - case G_FSQRT:{ - unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); - assert((Size == 32 || Size == 64) && "Unsupported floating point size"); - OperandsMapping = Size == 32 ? &Mips::ValueMappings[Mips::SPRIdx] - : &Mips::ValueMappings[Mips::DPRIdx]; + case G_FSQRT: + OperandsMapping = getFprbMapping(Op0Size); break; - } - case G_FCONSTANT: { - unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); - assert((Size == 32 || Size == 64) && "Unsupported floating point size"); - const RegisterBankInfo::ValueMapping *FPRValueMapping = - Size == 32 ? 
&Mips::ValueMappings[Mips::SPRIdx] - : &Mips::ValueMappings[Mips::DPRIdx]; - OperandsMapping = getOperandsMapping({FPRValueMapping, nullptr}); + case G_FCONSTANT: + OperandsMapping = getOperandsMapping({getFprbMapping(Op0Size), nullptr}); break; - } case G_FCMP: { - unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits(); - assert((Size == 32 || Size == 64) && "Unsupported floating point size"); - const RegisterBankInfo::ValueMapping *FPRValueMapping = - Size == 32 ? &Mips::ValueMappings[Mips::SPRIdx] - : &Mips::ValueMappings[Mips::DPRIdx]; + unsigned Op2Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits(); OperandsMapping = getOperandsMapping({&Mips::ValueMappings[Mips::GPRIdx], nullptr, - FPRValueMapping, FPRValueMapping}); + getFprbMapping(Op2Size), getFprbMapping(Op2Size)}); break; } case G_FPEXT: @@ -550,36 +566,31 @@ MipsRegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { &Mips::ValueMappings[Mips::DPRIdx]}); break; case G_FPTOSI: { + assert((Op0Size == 32) && "Unsupported integer size"); unsigned SizeFP = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits(); - assert((MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 32) && - "Unsupported integer size"); - assert((SizeFP == 32 || SizeFP == 64) && "Unsupported floating point size"); - OperandsMapping = getOperandsMapping({ - &Mips::ValueMappings[Mips::GPRIdx], - SizeFP == 32 ? &Mips::ValueMappings[Mips::SPRIdx] - : &Mips::ValueMappings[Mips::DPRIdx], - }); + OperandsMapping = getOperandsMapping( + {&Mips::ValueMappings[Mips::GPRIdx], getFprbMapping(SizeFP)}); break; } - case G_SITOFP: { - unsigned SizeInt = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits(); - unsigned SizeFP = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); - (void)SizeInt; - assert((SizeInt == 32) && "Unsupported integer size"); - assert((SizeFP == 32 || SizeFP == 64) && "Unsupported floating point size"); - OperandsMapping = - getOperandsMapping({SizeFP == 32 ? &Mips::ValueMappings[Mips::SPRIdx] - : &Mips::ValueMappings[Mips::DPRIdx], - &Mips::ValueMappings[Mips::GPRIdx]}); + case G_SITOFP: + assert((MRI.getType(MI.getOperand(1).getReg()).getSizeInBits() == 32) && + "Unsupported integer size"); + OperandsMapping = getOperandsMapping( + {getFprbMapping(Op0Size), &Mips::ValueMappings[Mips::GPRIdx]}); break; - } case G_CONSTANT: case G_FRAME_INDEX: case G_GLOBAL_VALUE: + case G_JUMP_TABLE: case G_BRCOND: OperandsMapping = getOperandsMapping({&Mips::ValueMappings[Mips::GPRIdx], nullptr}); break; + case G_BRJT: + OperandsMapping = + getOperandsMapping({&Mips::ValueMappings[Mips::GPRIdx], nullptr, + &Mips::ValueMappings[Mips::GPRIdx]}); + break; case G_ICMP: OperandsMapping = getOperandsMapping({&Mips::ValueMappings[Mips::GPRIdx], nullptr, @@ -609,11 +620,41 @@ public: }; } // end anonymous namespace -/// Here we have to narrowScalar s64 operands to s32, combine away -/// G_MERGE/G_UNMERGE and erase instructions that became dead in the process. -/// We manually assign 32 bit gprb to register operands of all new instructions -/// that got created in the process since they will not end up in RegBankSelect -/// loop. Careful not to delete instruction after MI i.e. MI.getIterator()++. +void MipsRegisterBankInfo::setRegBank(MachineInstr &MI, + MachineRegisterInfo &MRI) const { + Register Dest = MI.getOperand(0).getReg(); + switch (MI.getOpcode()) { + case TargetOpcode::G_STORE: + // No def operands, skip this instruction. 
+ break; + case TargetOpcode::G_CONSTANT: + case TargetOpcode::G_LOAD: + case TargetOpcode::G_SELECT: + case TargetOpcode::G_PHI: + case TargetOpcode::G_IMPLICIT_DEF: { + assert(MRI.getType(Dest) == LLT::scalar(32) && "Unexpected operand type."); + MRI.setRegBank(Dest, getRegBank(Mips::GPRBRegBankID)); + break; + } + case TargetOpcode::G_GEP: { + assert(MRI.getType(Dest).isPointer() && "Unexpected operand type."); + MRI.setRegBank(Dest, getRegBank(Mips::GPRBRegBankID)); + break; + } + default: + llvm_unreachable("Unexpected opcode."); + } +} + +static void +combineAwayG_UNMERGE_VALUES(LegalizationArtifactCombiner &ArtCombiner, + MachineInstr &MI) { + SmallVector DeadInstrs; + ArtCombiner.tryCombineMerges(MI, DeadInstrs); + for (MachineInstr *DeadMI : DeadInstrs) + DeadMI->eraseFromParent(); +} + void MipsRegisterBankInfo::applyMappingImpl( const OperandsMapper &OpdMapper) const { MachineInstr &MI = OpdMapper.getMI(); @@ -621,18 +662,19 @@ void MipsRegisterBankInfo::applyMappingImpl( MachineIRBuilder B(MI); MachineFunction *MF = MI.getMF(); MachineRegisterInfo &MRI = OpdMapper.getMRI(); + const LegalizerInfo &LegInfo = *MF->getSubtarget().getLegalizerInfo(); InstManager NewInstrObserver(NewInstrs); GISelObserverWrapper WrapperObserver(&NewInstrObserver); LegalizerHelper Helper(*MF, WrapperObserver, B); - LegalizationArtifactCombiner ArtCombiner( - B, MF->getRegInfo(), *MF->getSubtarget().getLegalizerInfo()); + LegalizationArtifactCombiner ArtCombiner(B, MF->getRegInfo(), LegInfo); switch (MI.getOpcode()) { case TargetOpcode::G_LOAD: case TargetOpcode::G_STORE: case TargetOpcode::G_PHI: - case TargetOpcode::G_SELECT: { + case TargetOpcode::G_SELECT: + case TargetOpcode::G_IMPLICIT_DEF: { Helper.narrowScalar(MI, 0, LLT::scalar(32)); // Handle new instructions. while (!NewInstrs.empty()) { @@ -640,35 +682,21 @@ void MipsRegisterBankInfo::applyMappingImpl( // This is new G_UNMERGE that was created during narrowScalar and will // not be considered for regbank selection. RegBankSelect for mips // visits/makes corresponding G_MERGE first. Combine them here. - if (NewMI->getOpcode() == TargetOpcode::G_UNMERGE_VALUES) { - SmallVector DeadInstrs; - ArtCombiner.tryCombineMerges(*NewMI, DeadInstrs); - for (MachineInstr *DeadMI : DeadInstrs) - DeadMI->eraseFromParent(); - } + if (NewMI->getOpcode() == TargetOpcode::G_UNMERGE_VALUES) + combineAwayG_UNMERGE_VALUES(ArtCombiner, *NewMI); // This G_MERGE will be combined away when its corresponding G_UNMERGE // gets regBankSelected. else if (NewMI->getOpcode() == TargetOpcode::G_MERGE_VALUES) continue; else - // Manually set register banks for all register operands to 32 bit gprb. - for (auto Op : NewMI->operands()) { - if (Op.isReg()) { - assert(MRI.getType(Op.getReg()).getSizeInBits() == 32 && - "Only 32 bit gprb is handled here.\n"); - MRI.setRegBank(Op.getReg(), getRegBank(Mips::GPRBRegBankID)); - } - } + // Manually set register banks for def operands to 32 bit gprb. 
+ setRegBank(*NewMI, MRI); } return; } - case TargetOpcode::G_UNMERGE_VALUES: { - SmallVector DeadInstrs; - ArtCombiner.tryCombineMerges(MI, DeadInstrs); - for (MachineInstr *DeadMI : DeadInstrs) - DeadMI->eraseFromParent(); + case TargetOpcode::G_UNMERGE_VALUES: + combineAwayG_UNMERGE_VALUES(ArtCombiner, MI); return; - } default: break; } diff --git a/lib/Target/Mips/MipsRegisterBankInfo.h b/lib/Target/Mips/MipsRegisterBankInfo.h index 176813c031ed..fa0f1c7bc941 100644 --- a/lib/Target/Mips/MipsRegisterBankInfo.h +++ b/lib/Target/Mips/MipsRegisterBankInfo.h @@ -38,8 +38,17 @@ public: const InstructionMapping & getInstrMapping(const MachineInstr &MI) const override; + /// Here we have to narrowScalar s64 operands to s32, combine away G_MERGE or + /// G_UNMERGE and erase instructions that became dead in the process. We + /// manually assign bank to def operand of all new instructions that were + /// created in the process since they will not end up in RegBankSelect loop. void applyMappingImpl(const OperandsMapper &OpdMapper) const override; + /// RegBankSelect determined that s64 operand is better to be split into two + /// s32 operands in gprb. Here we manually set register banks of def operands + /// of newly created instructions since they will not get regbankselected. + void setRegBank(MachineInstr &MI, MachineRegisterInfo &MRI) const; + private: /// Some instructions are used with both floating point and integer operands. /// We assign InstType to such instructions as it helps us to avoid cross bank diff --git a/lib/Target/Mips/MipsRegisterBanks.td b/lib/Target/Mips/MipsRegisterBanks.td index 14a0181f8f11..7d11475884ce 100644 --- a/lib/Target/Mips/MipsRegisterBanks.td +++ b/lib/Target/Mips/MipsRegisterBanks.td @@ -11,4 +11,4 @@ def GPRBRegBank : RegisterBank<"GPRB", [GPR32]>; -def FPRBRegBank : RegisterBank<"FPRB", [FGR64, AFGR64]>; +def FPRBRegBank : RegisterBank<"FPRB", [FGR64, AFGR64, MSA128D]>; diff --git a/lib/Target/Mips/MipsSEFrameLowering.cpp b/lib/Target/Mips/MipsSEFrameLowering.cpp index 4c6cc1ef771c..166ddea0431f 100644 --- a/lib/Target/Mips/MipsSEFrameLowering.cpp +++ b/lib/Target/Mips/MipsSEFrameLowering.cpp @@ -171,8 +171,8 @@ void ExpandPseudo::expandLoadCCond(MachineBasicBlock &MBB, Iter I) { assert(I->getOperand(0).isReg() && I->getOperand(1).isFI()); const TargetRegisterClass *RC = RegInfo.intRegClass(4); - unsigned VR = MRI.createVirtualRegister(RC); - unsigned Dst = I->getOperand(0).getReg(), FI = I->getOperand(1).getIndex(); + Register VR = MRI.createVirtualRegister(RC); + Register Dst = I->getOperand(0).getReg(), FI = I->getOperand(1).getIndex(); TII.loadRegFromStack(MBB, I, VR, FI, RC, &RegInfo, 0); BuildMI(MBB, I, I->getDebugLoc(), TII.get(TargetOpcode::COPY), Dst) @@ -186,8 +186,8 @@ void ExpandPseudo::expandStoreCCond(MachineBasicBlock &MBB, Iter I) { assert(I->getOperand(0).isReg() && I->getOperand(1).isFI()); const TargetRegisterClass *RC = RegInfo.intRegClass(4); - unsigned VR = MRI.createVirtualRegister(RC); - unsigned Src = I->getOperand(0).getReg(), FI = I->getOperand(1).getIndex(); + Register VR = MRI.createVirtualRegister(RC); + Register Src = I->getOperand(0).getReg(), FI = I->getOperand(1).getIndex(); BuildMI(MBB, I, I->getDebugLoc(), TII.get(TargetOpcode::COPY), VR) .addReg(Src, getKillRegState(I->getOperand(0).isKill())); @@ -204,11 +204,11 @@ void ExpandPseudo::expandLoadACC(MachineBasicBlock &MBB, Iter I, assert(I->getOperand(0).isReg() && I->getOperand(1).isFI()); const TargetRegisterClass *RC = RegInfo.intRegClass(RegSize); - unsigned VR0 = 
MRI.createVirtualRegister(RC); - unsigned VR1 = MRI.createVirtualRegister(RC); - unsigned Dst = I->getOperand(0).getReg(), FI = I->getOperand(1).getIndex(); - unsigned Lo = RegInfo.getSubReg(Dst, Mips::sub_lo); - unsigned Hi = RegInfo.getSubReg(Dst, Mips::sub_hi); + Register VR0 = MRI.createVirtualRegister(RC); + Register VR1 = MRI.createVirtualRegister(RC); + Register Dst = I->getOperand(0).getReg(), FI = I->getOperand(1).getIndex(); + Register Lo = RegInfo.getSubReg(Dst, Mips::sub_lo); + Register Hi = RegInfo.getSubReg(Dst, Mips::sub_hi); DebugLoc DL = I->getDebugLoc(); const MCInstrDesc &Desc = TII.get(TargetOpcode::COPY); @@ -229,9 +229,9 @@ void ExpandPseudo::expandStoreACC(MachineBasicBlock &MBB, Iter I, assert(I->getOperand(0).isReg() && I->getOperand(1).isFI()); const TargetRegisterClass *RC = RegInfo.intRegClass(RegSize); - unsigned VR0 = MRI.createVirtualRegister(RC); - unsigned VR1 = MRI.createVirtualRegister(RC); - unsigned Src = I->getOperand(0).getReg(), FI = I->getOperand(1).getIndex(); + Register VR0 = MRI.createVirtualRegister(RC); + Register VR1 = MRI.createVirtualRegister(RC); + Register Src = I->getOperand(0).getReg(), FI = I->getOperand(1).getIndex(); unsigned SrcKill = getKillRegState(I->getOperand(0).isKill()); DebugLoc DL = I->getDebugLoc(); @@ -242,7 +242,7 @@ void ExpandPseudo::expandStoreACC(MachineBasicBlock &MBB, Iter I, } bool ExpandPseudo::expandCopy(MachineBasicBlock &MBB, Iter I) { - unsigned Src = I->getOperand(1).getReg(); + Register Src = I->getOperand(1).getReg(); std::pair Opcodes = getMFHiLoOpc(Src); if (!Opcodes.first) @@ -262,11 +262,11 @@ bool ExpandPseudo::expandCopyACC(MachineBasicBlock &MBB, Iter I, const TargetRegisterClass *DstRC = RegInfo.getMinimalPhysRegClass(Dst); unsigned VRegSize = RegInfo.getRegSizeInBits(*DstRC) / 16; const TargetRegisterClass *RC = RegInfo.intRegClass(VRegSize); - unsigned VR0 = MRI.createVirtualRegister(RC); - unsigned VR1 = MRI.createVirtualRegister(RC); + Register VR0 = MRI.createVirtualRegister(RC); + Register VR1 = MRI.createVirtualRegister(RC); unsigned SrcKill = getKillRegState(I->getOperand(1).isKill()); - unsigned DstLo = RegInfo.getSubReg(Dst, Mips::sub_lo); - unsigned DstHi = RegInfo.getSubReg(Dst, Mips::sub_hi); + Register DstLo = RegInfo.getSubReg(Dst, Mips::sub_lo); + Register DstHi = RegInfo.getSubReg(Dst, Mips::sub_hi); DebugLoc DL = I->getDebugLoc(); BuildMI(MBB, I, DL, TII.get(MFLoOpc), VR0).addReg(Src); @@ -304,9 +304,9 @@ bool ExpandPseudo::expandBuildPairF64(MachineBasicBlock &MBB, // stack is used. if (I->getNumOperands() == 4 && I->getOperand(3).isReg() && I->getOperand(3).getReg() == Mips::SP) { - unsigned DstReg = I->getOperand(0).getReg(); - unsigned LoReg = I->getOperand(1).getReg(); - unsigned HiReg = I->getOperand(2).getReg(); + Register DstReg = I->getOperand(0).getReg(); + Register LoReg = I->getOperand(1).getReg(); + Register HiReg = I->getOperand(2).getReg(); // It should be impossible to have FGR64 on MIPS-II or MIPS32r1 (which are // the cases where mthc1 is not available). 64-bit architectures and @@ -346,7 +346,7 @@ bool ExpandPseudo::expandExtractElementF64(MachineBasicBlock &MBB, const MachineOperand &Op2 = I->getOperand(2); if ((Op1.isReg() && Op1.isUndef()) || (Op2.isReg() && Op2.isUndef())) { - unsigned DstReg = I->getOperand(0).getReg(); + Register DstReg = I->getOperand(0).getReg(); BuildMI(MBB, I, I->getDebugLoc(), TII.get(Mips::IMPLICIT_DEF), DstReg); return true; } @@ -369,8 +369,8 @@ bool ExpandPseudo::expandExtractElementF64(MachineBasicBlock &MBB, // stack is used. 
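Much of the mechanical churn in these hunks is the unsigned-to-Register migration. The stand-in below (not llvm::Register itself) sketches why the swap is source compatible: the wrapper converts implicitly in both directions, so legacy APIs taking unsigned keep working while call sites gain the physical/virtual queries. The bit-encoding detail in the comment is only assumed here for illustration.

#include <cassert>

// Minimal stand-in for the idea behind llvm::Register (not the real class).
class RegisterLike {
  unsigned Reg;
public:
  RegisterLike(unsigned R = 0) : Reg(R) {}   // implicit from unsigned
  operator unsigned() const { return Reg; }  // implicit back to unsigned
  // In-tree, virtual register numbers carry the top bit while physical ones
  // are small positive values; assumed here purely for illustration.
  bool isVirtual() const { return (Reg & 0x80000000u) != 0; }
  bool isPhysical() const { return Reg != 0 && !isVirtual(); }
};

static unsigned legacyApiTakingUnsigned(unsigned R) { return R; }

int main() {
  RegisterLike Phys(42);               // e.g. a physical register number
  RegisterLike Virt(0x80000000u | 7);  // a virtual register number
  assert(Phys.isPhysical() && !Phys.isVirtual());
  assert(Virt.isVirtual() && !Virt.isPhysical());
  // Old call sites that still take 'unsigned' keep compiling unchanged.
  assert(legacyApiTakingUnsigned(Phys) == 42u);
  return 0;
}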
if (I->getNumOperands() == 4 && I->getOperand(3).isReg() && I->getOperand(3).getReg() == Mips::SP) { - unsigned DstReg = I->getOperand(0).getReg(); - unsigned SrcReg = Op1.getReg(); + Register DstReg = I->getOperand(0).getReg(); + Register SrcReg = Op1.getReg(); unsigned N = Op2.getImm(); int64_t Offset = 4 * (Subtarget.isLittle() ? N : (1 - N)); @@ -538,7 +538,7 @@ void MipsSEFrameLowering::emitPrologue(MachineFunction &MF, if (RegInfo.needsStackRealignment(MF)) { // addiu $Reg, $zero, -MaxAlignment // andi $sp, $sp, $Reg - unsigned VR = MF.getRegInfo().createVirtualRegister(RC); + Register VR = MF.getRegInfo().createVirtualRegister(RC); assert(isInt<16>(MFI.getMaxAlignment()) && "Function's alignment size requirement is not supported."); int MaxAlign = -(int)MFI.getMaxAlignment(); @@ -865,12 +865,15 @@ void MipsSEFrameLowering::determineCalleeSaves(MachineFunction &MF, const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); MipsFunctionInfo *MipsFI = MF.getInfo(); MipsABIInfo ABI = STI.getABI(); + unsigned RA = ABI.IsN64() ? Mips::RA_64 : Mips::RA; unsigned FP = ABI.GetFramePtr(); unsigned BP = ABI.IsN64() ? Mips::S7_64 : Mips::S7; - // Mark $fp as used if function has dedicated frame pointer. - if (hasFP(MF)) + // Mark $ra and $fp as used if function has dedicated frame pointer. + if (hasFP(MF)) { + setAliasRegs(MF, SavedRegs, RA); setAliasRegs(MF, SavedRegs, FP); + } // Mark $s7 as used if function has dedicated base pointer. if (hasBP(MF)) setAliasRegs(MF, SavedRegs, BP); diff --git a/lib/Target/Mips/MipsSEISelDAGToDAG.cpp b/lib/Target/Mips/MipsSEISelDAGToDAG.cpp index 703f99f37dd1..c8313240a678 100644 --- a/lib/Target/Mips/MipsSEISelDAGToDAG.cpp +++ b/lib/Target/Mips/MipsSEISelDAGToDAG.cpp @@ -124,6 +124,33 @@ bool MipsSEDAGToDAGISel::replaceUsesWithZeroReg(MachineRegisterInfo *MRI, return true; } +void MipsSEDAGToDAGISel::emitMCountABI(MachineInstr &MI, MachineBasicBlock &MBB, + MachineFunction &MF) { + MachineInstrBuilder MIB(MF, &MI); + if (!Subtarget->isABI_O32()) { // N32, N64 + // Save current return address. + BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(Mips::OR64)) + .addDef(Mips::AT_64) + .addUse(Mips::RA_64, RegState::Undef) + .addUse(Mips::ZERO_64); + // Stops instruction above from being removed later on. + MIB.addUse(Mips::AT_64, RegState::Implicit); + } else { // O32 + // Save current return address. + BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(Mips::OR)) + .addDef(Mips::AT) + .addUse(Mips::RA, RegState::Undef) + .addUse(Mips::ZERO); + // _mcount pops 2 words from stack. + BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(Mips::ADDiu)) + .addDef(Mips::SP) + .addUse(Mips::SP) + .addImm(-8); + // Stops first instruction above from being removed later on. 
+ MIB.addUse(Mips::AT, RegState::Implicit); + } +} + void MipsSEDAGToDAGISel::processFunctionAfterISel(MachineFunction &MF) { MF.getInfo()->initGlobalBaseReg(); @@ -150,6 +177,24 @@ void MipsSEDAGToDAGISel::processFunctionAfterISel(MachineFunction &MF) { if (Subtarget->isABI_FPXX() && !Subtarget->hasMTHC1()) MI.addOperand(MachineOperand::CreateReg(Mips::SP, false, true)); break; + case Mips::JAL: + case Mips::JAL_MM: + if (MI.getOperand(0).isGlobal() && + MI.getOperand(0).getGlobal()->getGlobalIdentifier() == "_mcount") + emitMCountABI(MI, MBB, MF); + break; + case Mips::JALRPseudo: + case Mips::JALR64Pseudo: + case Mips::JALR16_MM: + if (MI.getOperand(2).isMCSymbol() && + MI.getOperand(2).getMCSymbol()->getName() == "_mcount") + emitMCountABI(MI, MBB, MF); + break; + case Mips::JALR: + if (MI.getOperand(3).isMCSymbol() && + MI.getOperand(3).getMCSymbol()->getName() == "_mcount") + emitMCountABI(MI, MBB, MF); + break; default: replaceUsesWithZeroReg(MRI, MI); } @@ -247,7 +292,8 @@ bool MipsSEDAGToDAGISel::selectAddrFrameIndexOffset( Base = Addr.getOperand(0); // If base is a FI, additional offset calculation is done in // eliminateFrameIndex, otherwise we need to check the alignment - if (OffsetToAlignment(CN->getZExtValue(), 1ull << ShiftAmount) != 0) + const Align Alignment(1ULL << ShiftAmount); + if (!isAligned(Alignment, CN->getZExtValue())) return false; } @@ -719,7 +765,7 @@ bool MipsSEDAGToDAGISel::trySelect(SDNode *Node) { } case ISD::ConstantFP: { - ConstantFPSDNode *CN = dyn_cast(Node); + auto *CN = cast(Node); if (Node->getValueType(0) == MVT::f64 && CN->isExactlyValue(+0.0)) { if (Subtarget->isGP64bit()) { SDValue Zero = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), DL, @@ -743,7 +789,7 @@ bool MipsSEDAGToDAGISel::trySelect(SDNode *Node) { } case ISD::Constant: { - const ConstantSDNode *CN = dyn_cast(Node); + auto *CN = cast(Node); int64_t Imm = CN->getSExtValue(); unsigned Size = CN->getValueSizeInBits(0); @@ -969,7 +1015,7 @@ bool MipsSEDAGToDAGISel::trySelect(SDNode *Node) { break; } - SDNode *Res; + SDNode *Res = nullptr; // If we have a signed 10 bit integer, we can splat it directly. // diff --git a/lib/Target/Mips/MipsSEISelDAGToDAG.h b/lib/Target/Mips/MipsSEISelDAGToDAG.h index ce594e1fb4fa..39f665be571e 100644 --- a/lib/Target/Mips/MipsSEISelDAGToDAG.h +++ b/lib/Target/Mips/MipsSEISelDAGToDAG.h @@ -120,7 +120,7 @@ private: /// power of 2. bool selectVSplatUimmInvPow2(SDValue N, SDValue &Imm) const override; /// Select constant vector splats whose value is a run of set bits - /// ending at the most significant bit + /// ending at the most significant bit. bool selectVSplatMaskL(SDValue N, SDValue &Imm) const override; /// Select constant vector splats whose value is a run of set bits /// starting at bit zero. @@ -128,6 +128,10 @@ private: bool trySelect(SDNode *Node) override; + // Emits proper ABI for _mcount profiling calls. + void emitMCountABI(MachineInstr &MI, MachineBasicBlock &MBB, + MachineFunction &MF); + void processFunctionAfterISel(MachineFunction &MF) override; bool SelectInlineAsmMemoryOperand(const SDValue &Op, diff --git a/lib/Target/Mips/MipsSEISelLowering.cpp b/lib/Target/Mips/MipsSEISelLowering.cpp index edf57a3840d1..5bd234f955ba 100644 --- a/lib/Target/Mips/MipsSEISelLowering.cpp +++ b/lib/Target/Mips/MipsSEISelLowering.cpp @@ -71,8 +71,8 @@ MipsSETargetLowering::MipsSETargetLowering(const MipsTargetMachine &TM, if (Subtarget.hasDSP() || Subtarget.hasMSA()) { // Expand all truncating stores and extending loads. 
- for (MVT VT0 : MVT::vector_valuetypes()) { - for (MVT VT1 : MVT::vector_valuetypes()) { + for (MVT VT0 : MVT::fixedlen_vector_valuetypes()) { + for (MVT VT1 : MVT::fixedlen_vector_valuetypes()) { setTruncStoreAction(VT0, VT1, Expand); setLoadExtAction(ISD::SEXTLOAD, VT0, VT1, Expand); setLoadExtAction(ISD::ZEXTLOAD, VT0, VT1, Expand); @@ -327,6 +327,7 @@ addMSAIntType(MVT::SimpleValueType Ty, const TargetRegisterClass *RC) { setOperationAction(ISD::EXTRACT_VECTOR_ELT, Ty, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, Ty, Legal); setOperationAction(ISD::BUILD_VECTOR, Ty, Custom); + setOperationAction(ISD::UNDEF, Ty, Legal); setOperationAction(ISD::ADD, Ty, Legal); setOperationAction(ISD::AND, Ty, Legal); @@ -2595,7 +2596,8 @@ static SDValue lowerVECTOR_SHUFFLE_SHF(SDValue Op, EVT ResTy, SDLoc DL(Op); return DAG.getNode(MipsISD::SHF, DL, ResTy, - DAG.getConstant(Imm, DL, MVT::i32), Op->getOperand(0)); + DAG.getTargetConstant(Imm, DL, MVT::i32), + Op->getOperand(0)); } /// Determine whether a range fits a regular pattern of values. @@ -3062,13 +3064,13 @@ MipsSETargetLowering::emitBPOSGE32(MachineInstr &MI, BuildMI(BB, DL, TII->get(Mips::BPOSGE32C_MMR3)).addMBB(TBB); // Fill $FBB. - unsigned VR2 = RegInfo.createVirtualRegister(RC); + Register VR2 = RegInfo.createVirtualRegister(RC); BuildMI(*FBB, FBB->end(), DL, TII->get(Mips::ADDiu), VR2) .addReg(Mips::ZERO).addImm(0); BuildMI(*FBB, FBB->end(), DL, TII->get(Mips::B)).addMBB(Sink); // Fill $TBB. - unsigned VR1 = RegInfo.createVirtualRegister(RC); + Register VR1 = RegInfo.createVirtualRegister(RC); BuildMI(*TBB, TBB->end(), DL, TII->get(Mips::ADDiu), VR1) .addReg(Mips::ZERO).addImm(1); @@ -3131,13 +3133,13 @@ MachineBasicBlock *MipsSETargetLowering::emitMSACBranchPseudo( .addMBB(TBB); // Fill $FBB. - unsigned RD1 = RegInfo.createVirtualRegister(RC); + Register RD1 = RegInfo.createVirtualRegister(RC); BuildMI(*FBB, FBB->end(), DL, TII->get(Mips::ADDiu), RD1) .addReg(Mips::ZERO).addImm(0); BuildMI(*FBB, FBB->end(), DL, TII->get(Mips::B)).addMBB(Sink); // Fill $TBB. - unsigned RD2 = RegInfo.createVirtualRegister(RC); + Register RD2 = RegInfo.createVirtualRegister(RC); BuildMI(*TBB, TBB->end(), DL, TII->get(Mips::ADDiu), RD2) .addReg(Mips::ZERO).addImm(1); @@ -3169,8 +3171,8 @@ MipsSETargetLowering::emitCOPY_FW(MachineInstr &MI, const TargetInstrInfo *TII = Subtarget.getInstrInfo(); MachineRegisterInfo &RegInfo = BB->getParent()->getRegInfo(); DebugLoc DL = MI.getDebugLoc(); - unsigned Fd = MI.getOperand(0).getReg(); - unsigned Ws = MI.getOperand(1).getReg(); + Register Fd = MI.getOperand(0).getReg(); + Register Ws = MI.getOperand(1).getReg(); unsigned Lane = MI.getOperand(2).getImm(); if (Lane == 0) { @@ -3185,9 +3187,9 @@ MipsSETargetLowering::emitCOPY_FW(MachineInstr &MI, BuildMI(*BB, MI, DL, TII->get(Mips::COPY), Fd).addReg(Wt, 0, Mips::sub_lo); } else { - unsigned Wt = RegInfo.createVirtualRegister( - Subtarget.useOddSPReg() ? &Mips::MSA128WRegClass : - &Mips::MSA128WEvensRegClass); + Register Wt = RegInfo.createVirtualRegister( + Subtarget.useOddSPReg() ? 
&Mips::MSA128WRegClass + : &Mips::MSA128WEvensRegClass); BuildMI(*BB, MI, DL, TII->get(Mips::SPLATI_W), Wt).addReg(Ws).addImm(Lane); BuildMI(*BB, MI, DL, TII->get(Mips::COPY), Fd).addReg(Wt, 0, Mips::sub_lo); @@ -3214,15 +3216,15 @@ MipsSETargetLowering::emitCOPY_FD(MachineInstr &MI, const TargetInstrInfo *TII = Subtarget.getInstrInfo(); MachineRegisterInfo &RegInfo = BB->getParent()->getRegInfo(); - unsigned Fd = MI.getOperand(0).getReg(); - unsigned Ws = MI.getOperand(1).getReg(); + Register Fd = MI.getOperand(0).getReg(); + Register Ws = MI.getOperand(1).getReg(); unsigned Lane = MI.getOperand(2).getImm() * 2; DebugLoc DL = MI.getDebugLoc(); if (Lane == 0) BuildMI(*BB, MI, DL, TII->get(Mips::COPY), Fd).addReg(Ws, 0, Mips::sub_64); else { - unsigned Wt = RegInfo.createVirtualRegister(&Mips::MSA128DRegClass); + Register Wt = RegInfo.createVirtualRegister(&Mips::MSA128DRegClass); BuildMI(*BB, MI, DL, TII->get(Mips::SPLATI_D), Wt).addReg(Ws).addImm(1); BuildMI(*BB, MI, DL, TII->get(Mips::COPY), Fd).addReg(Wt, 0, Mips::sub_64); @@ -3244,13 +3246,13 @@ MipsSETargetLowering::emitINSERT_FW(MachineInstr &MI, const TargetInstrInfo *TII = Subtarget.getInstrInfo(); MachineRegisterInfo &RegInfo = BB->getParent()->getRegInfo(); DebugLoc DL = MI.getDebugLoc(); - unsigned Wd = MI.getOperand(0).getReg(); - unsigned Wd_in = MI.getOperand(1).getReg(); + Register Wd = MI.getOperand(0).getReg(); + Register Wd_in = MI.getOperand(1).getReg(); unsigned Lane = MI.getOperand(2).getImm(); - unsigned Fs = MI.getOperand(3).getReg(); - unsigned Wt = RegInfo.createVirtualRegister( - Subtarget.useOddSPReg() ? &Mips::MSA128WRegClass : - &Mips::MSA128WEvensRegClass); + Register Fs = MI.getOperand(3).getReg(); + Register Wt = RegInfo.createVirtualRegister( + Subtarget.useOddSPReg() ? &Mips::MSA128WRegClass + : &Mips::MSA128WEvensRegClass); BuildMI(*BB, MI, DL, TII->get(Mips::SUBREG_TO_REG), Wt) .addImm(0) @@ -3280,11 +3282,11 @@ MipsSETargetLowering::emitINSERT_FD(MachineInstr &MI, const TargetInstrInfo *TII = Subtarget.getInstrInfo(); MachineRegisterInfo &RegInfo = BB->getParent()->getRegInfo(); DebugLoc DL = MI.getDebugLoc(); - unsigned Wd = MI.getOperand(0).getReg(); - unsigned Wd_in = MI.getOperand(1).getReg(); + Register Wd = MI.getOperand(0).getReg(); + Register Wd_in = MI.getOperand(1).getReg(); unsigned Lane = MI.getOperand(2).getImm(); - unsigned Fs = MI.getOperand(3).getReg(); - unsigned Wt = RegInfo.createVirtualRegister(&Mips::MSA128DRegClass); + Register Fs = MI.getOperand(3).getReg(); + Register Wt = RegInfo.createVirtualRegister(&Mips::MSA128DRegClass); BuildMI(*BB, MI, DL, TII->get(Mips::SUBREG_TO_REG), Wt) .addImm(0) @@ -3326,10 +3328,10 @@ MachineBasicBlock *MipsSETargetLowering::emitINSERT_DF_VIDX( const TargetInstrInfo *TII = Subtarget.getInstrInfo(); MachineRegisterInfo &RegInfo = BB->getParent()->getRegInfo(); DebugLoc DL = MI.getDebugLoc(); - unsigned Wd = MI.getOperand(0).getReg(); - unsigned SrcVecReg = MI.getOperand(1).getReg(); - unsigned LaneReg = MI.getOperand(2).getReg(); - unsigned SrcValReg = MI.getOperand(3).getReg(); + Register Wd = MI.getOperand(0).getReg(); + Register SrcVecReg = MI.getOperand(1).getReg(); + Register LaneReg = MI.getOperand(2).getReg(); + Register SrcValReg = MI.getOperand(3).getReg(); const TargetRegisterClass *VecRC = nullptr; // FIXME: This should be true for N32 too. 
@@ -3370,7 +3372,7 @@ MachineBasicBlock *MipsSETargetLowering::emitINSERT_DF_VIDX( } if (IsFP) { - unsigned Wt = RegInfo.createVirtualRegister(VecRC); + Register Wt = RegInfo.createVirtualRegister(VecRC); BuildMI(*BB, MI, DL, TII->get(Mips::SUBREG_TO_REG), Wt) .addImm(0) .addReg(SrcValReg) @@ -3380,7 +3382,7 @@ MachineBasicBlock *MipsSETargetLowering::emitINSERT_DF_VIDX( // Convert the lane index into a byte index if (EltSizeInBytes != 1) { - unsigned LaneTmp1 = RegInfo.createVirtualRegister(GPRRC); + Register LaneTmp1 = RegInfo.createVirtualRegister(GPRRC); BuildMI(*BB, MI, DL, TII->get(ShiftOp), LaneTmp1) .addReg(LaneReg) .addImm(EltLog2Size); @@ -3388,13 +3390,13 @@ MachineBasicBlock *MipsSETargetLowering::emitINSERT_DF_VIDX( } // Rotate bytes around so that the desired lane is element zero - unsigned WdTmp1 = RegInfo.createVirtualRegister(VecRC); + Register WdTmp1 = RegInfo.createVirtualRegister(VecRC); BuildMI(*BB, MI, DL, TII->get(Mips::SLD_B), WdTmp1) .addReg(SrcVecReg) .addReg(SrcVecReg) .addReg(LaneReg, 0, SubRegIdx); - unsigned WdTmp2 = RegInfo.createVirtualRegister(VecRC); + Register WdTmp2 = RegInfo.createVirtualRegister(VecRC); if (IsFP) { // Use insve.df to insert to element zero BuildMI(*BB, MI, DL, TII->get(InsveOp), WdTmp2) @@ -3413,7 +3415,7 @@ MachineBasicBlock *MipsSETargetLowering::emitINSERT_DF_VIDX( // Rotate elements the rest of the way for a full rotation. // sld.df inteprets $rt modulo the number of columns so we only need to negate // the lane index to do this. - unsigned LaneTmp2 = RegInfo.createVirtualRegister(GPRRC); + Register LaneTmp2 = RegInfo.createVirtualRegister(GPRRC); BuildMI(*BB, MI, DL, TII->get(Subtarget.isABI_N64() ? Mips::DSUB : Mips::SUB), LaneTmp2) .addReg(Subtarget.isABI_N64() ? Mips::ZERO_64 : Mips::ZERO) @@ -3440,12 +3442,12 @@ MipsSETargetLowering::emitFILL_FW(MachineInstr &MI, const TargetInstrInfo *TII = Subtarget.getInstrInfo(); MachineRegisterInfo &RegInfo = BB->getParent()->getRegInfo(); DebugLoc DL = MI.getDebugLoc(); - unsigned Wd = MI.getOperand(0).getReg(); - unsigned Fs = MI.getOperand(1).getReg(); - unsigned Wt1 = RegInfo.createVirtualRegister( + Register Wd = MI.getOperand(0).getReg(); + Register Fs = MI.getOperand(1).getReg(); + Register Wt1 = RegInfo.createVirtualRegister( Subtarget.useOddSPReg() ? &Mips::MSA128WRegClass : &Mips::MSA128WEvensRegClass); - unsigned Wt2 = RegInfo.createVirtualRegister( + Register Wt2 = RegInfo.createVirtualRegister( Subtarget.useOddSPReg() ? 
&Mips::MSA128WRegClass : &Mips::MSA128WEvensRegClass); @@ -3475,10 +3477,10 @@ MipsSETargetLowering::emitFILL_FD(MachineInstr &MI, const TargetInstrInfo *TII = Subtarget.getInstrInfo(); MachineRegisterInfo &RegInfo = BB->getParent()->getRegInfo(); DebugLoc DL = MI.getDebugLoc(); - unsigned Wd = MI.getOperand(0).getReg(); - unsigned Fs = MI.getOperand(1).getReg(); - unsigned Wt1 = RegInfo.createVirtualRegister(&Mips::MSA128DRegClass); - unsigned Wt2 = RegInfo.createVirtualRegister(&Mips::MSA128DRegClass); + Register Wd = MI.getOperand(0).getReg(); + Register Fs = MI.getOperand(1).getReg(); + Register Wt1 = RegInfo.createVirtualRegister(&Mips::MSA128DRegClass); + Register Wt2 = RegInfo.createVirtualRegister(&Mips::MSA128DRegClass); BuildMI(*BB, MI, DL, TII->get(Mips::IMPLICIT_DEF), Wt1); BuildMI(*BB, MI, DL, TII->get(Mips::INSERT_SUBREG), Wt2) @@ -3509,8 +3511,8 @@ MipsSETargetLowering::emitST_F16_PSEUDO(MachineInstr &MI, const TargetInstrInfo *TII = Subtarget.getInstrInfo(); MachineRegisterInfo &RegInfo = BB->getParent()->getRegInfo(); DebugLoc DL = MI.getDebugLoc(); - unsigned Ws = MI.getOperand(0).getReg(); - unsigned Rt = MI.getOperand(1).getReg(); + Register Ws = MI.getOperand(0).getReg(); + Register Rt = MI.getOperand(1).getReg(); const MachineMemOperand &MMO = **MI.memoperands_begin(); unsigned Imm = MMO.getOffset(); @@ -3522,11 +3524,11 @@ MipsSETargetLowering::emitST_F16_PSEUDO(MachineInstr &MI, : (Subtarget.isABI_O32() ? &Mips::GPR32RegClass : &Mips::GPR64RegClass); const bool UsingMips32 = RC == &Mips::GPR32RegClass; - unsigned Rs = RegInfo.createVirtualRegister(&Mips::GPR32RegClass); + Register Rs = RegInfo.createVirtualRegister(&Mips::GPR32RegClass); BuildMI(*BB, MI, DL, TII->get(Mips::COPY_U_H), Rs).addReg(Ws).addImm(0); if(!UsingMips32) { - unsigned Tmp = RegInfo.createVirtualRegister(&Mips::GPR64RegClass); + Register Tmp = RegInfo.createVirtualRegister(&Mips::GPR64RegClass); BuildMI(*BB, MI, DL, TII->get(Mips::SUBREG_TO_REG), Tmp) .addImm(0) .addReg(Rs) @@ -3564,7 +3566,7 @@ MipsSETargetLowering::emitLD_F16_PSEUDO(MachineInstr &MI, const TargetInstrInfo *TII = Subtarget.getInstrInfo(); MachineRegisterInfo &RegInfo = BB->getParent()->getRegInfo(); DebugLoc DL = MI.getDebugLoc(); - unsigned Wd = MI.getOperand(0).getReg(); + Register Wd = MI.getOperand(0).getReg(); // Caution: A load via the GOT can expand to a GPR32 operand, a load via // spill and reload can expand as a GPR64 operand. Examine the @@ -3575,7 +3577,7 @@ MipsSETargetLowering::emitLD_F16_PSEUDO(MachineInstr &MI, : &Mips::GPR64RegClass); const bool UsingMips32 = RC == &Mips::GPR32RegClass; - unsigned Rt = RegInfo.createVirtualRegister(RC); + Register Rt = RegInfo.createVirtualRegister(RC); MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(UsingMips32 ? 
Mips::LH : Mips::LH64), Rt); @@ -3583,7 +3585,7 @@ MipsSETargetLowering::emitLD_F16_PSEUDO(MachineInstr &MI, MIB.add(MI.getOperand(i)); if(!UsingMips32) { - unsigned Tmp = RegInfo.createVirtualRegister(&Mips::GPR32RegClass); + Register Tmp = RegInfo.createVirtualRegister(&Mips::GPR32RegClass); BuildMI(*BB, MI, DL, TII->get(Mips::COPY), Tmp).addReg(Rt, 0, Mips::sub_32); Rt = Tmp; } @@ -3658,11 +3660,11 @@ MipsSETargetLowering::emitFPROUND_PSEUDO(MachineInstr &MI, const TargetInstrInfo *TII = Subtarget.getInstrInfo(); DebugLoc DL = MI.getDebugLoc(); - unsigned Wd = MI.getOperand(0).getReg(); - unsigned Fs = MI.getOperand(1).getReg(); + Register Wd = MI.getOperand(0).getReg(); + Register Fs = MI.getOperand(1).getReg(); MachineRegisterInfo &RegInfo = BB->getParent()->getRegInfo(); - unsigned Wtemp = RegInfo.createVirtualRegister(&Mips::MSA128WRegClass); + Register Wtemp = RegInfo.createVirtualRegister(&Mips::MSA128WRegClass); const TargetRegisterClass *GPRRC = IsFGR64onMips64 ? &Mips::GPR64RegClass : &Mips::GPR32RegClass; unsigned MFC1Opc = IsFGR64onMips64 @@ -3671,16 +3673,16 @@ MipsSETargetLowering::emitFPROUND_PSEUDO(MachineInstr &MI, unsigned FILLOpc = IsFGR64onMips64 ? Mips::FILL_D : Mips::FILL_W; // Perform the register class copy as mentioned above. - unsigned Rtemp = RegInfo.createVirtualRegister(GPRRC); + Register Rtemp = RegInfo.createVirtualRegister(GPRRC); BuildMI(*BB, MI, DL, TII->get(MFC1Opc), Rtemp).addReg(Fs); BuildMI(*BB, MI, DL, TII->get(FILLOpc), Wtemp).addReg(Rtemp); unsigned WPHI = Wtemp; if (IsFGR64onMips32) { - unsigned Rtemp2 = RegInfo.createVirtualRegister(GPRRC); + Register Rtemp2 = RegInfo.createVirtualRegister(GPRRC); BuildMI(*BB, MI, DL, TII->get(Mips::MFHC1_D64), Rtemp2).addReg(Fs); - unsigned Wtemp2 = RegInfo.createVirtualRegister(&Mips::MSA128WRegClass); - unsigned Wtemp3 = RegInfo.createVirtualRegister(&Mips::MSA128WRegClass); + Register Wtemp2 = RegInfo.createVirtualRegister(&Mips::MSA128WRegClass); + Register Wtemp3 = RegInfo.createVirtualRegister(&Mips::MSA128WRegClass); BuildMI(*BB, MI, DL, TII->get(Mips::INSERT_W), Wtemp2) .addReg(Wtemp) .addReg(Rtemp2) @@ -3693,7 +3695,7 @@ MipsSETargetLowering::emitFPROUND_PSEUDO(MachineInstr &MI, } if (IsFGR64) { - unsigned Wtemp2 = RegInfo.createVirtualRegister(&Mips::MSA128WRegClass); + Register Wtemp2 = RegInfo.createVirtualRegister(&Mips::MSA128WRegClass); BuildMI(*BB, MI, DL, TII->get(Mips::FEXDO_W), Wtemp2) .addReg(WPHI) .addReg(WPHI); @@ -3817,8 +3819,8 @@ MipsSETargetLowering::emitFEXP2_W_1(MachineInstr &MI, const TargetInstrInfo *TII = Subtarget.getInstrInfo(); MachineRegisterInfo &RegInfo = BB->getParent()->getRegInfo(); const TargetRegisterClass *RC = &Mips::MSA128WRegClass; - unsigned Ws1 = RegInfo.createVirtualRegister(RC); - unsigned Ws2 = RegInfo.createVirtualRegister(RC); + Register Ws1 = RegInfo.createVirtualRegister(RC); + Register Ws2 = RegInfo.createVirtualRegister(RC); DebugLoc DL = MI.getDebugLoc(); // Splat 1.0 into a vector @@ -3846,8 +3848,8 @@ MipsSETargetLowering::emitFEXP2_D_1(MachineInstr &MI, const TargetInstrInfo *TII = Subtarget.getInstrInfo(); MachineRegisterInfo &RegInfo = BB->getParent()->getRegInfo(); const TargetRegisterClass *RC = &Mips::MSA128DRegClass; - unsigned Ws1 = RegInfo.createVirtualRegister(RC); - unsigned Ws2 = RegInfo.createVirtualRegister(RC); + Register Ws1 = RegInfo.createVirtualRegister(RC); + Register Ws2 = RegInfo.createVirtualRegister(RC); DebugLoc DL = MI.getDebugLoc(); // Splat 1.0 into a vector diff --git a/lib/Target/Mips/MipsSEInstrInfo.cpp 
b/lib/Target/Mips/MipsSEInstrInfo.cpp index 4e49f5e7d9d1..2126a1bda493 100644 --- a/lib/Target/Mips/MipsSEInstrInfo.cpp +++ b/lib/Target/Mips/MipsSEInstrInfo.cpp @@ -628,7 +628,7 @@ unsigned MipsSEInstrInfo::loadImmediate(int64_t Imm, MachineBasicBlock &MBB, // The first instruction can be a LUi, which is different from other // instructions (ADDiu, ORI and SLL) in that it does not have a register // operand. - unsigned Reg = RegInfo.createVirtualRegister(RC); + Register Reg = RegInfo.createVirtualRegister(RC); if (Inst->Opc == LUi) BuildMI(MBB, II, DL, get(LUi), Reg).addImm(SignExtend64<16>(Inst->ImmOpnd)); @@ -734,9 +734,9 @@ void MipsSEInstrInfo::expandPseudoMTLoHi(MachineBasicBlock &MBB, // Add lo/hi registers if the mtlo/hi instructions created have explicit // def registers. if (HasExplicitDef) { - unsigned DstReg = I->getOperand(0).getReg(); - unsigned DstLo = getRegisterInfo().getSubReg(DstReg, Mips::sub_lo); - unsigned DstHi = getRegisterInfo().getSubReg(DstReg, Mips::sub_hi); + Register DstReg = I->getOperand(0).getReg(); + Register DstLo = getRegisterInfo().getSubReg(DstReg, Mips::sub_lo); + Register DstHi = getRegisterInfo().getSubReg(DstReg, Mips::sub_hi); LoInst.addReg(DstLo, RegState::Define); HiInst.addReg(DstHi, RegState::Define); } @@ -773,14 +773,14 @@ void MipsSEInstrInfo::expandExtractElementF64(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, bool isMicroMips, bool FP64) const { - unsigned DstReg = I->getOperand(0).getReg(); - unsigned SrcReg = I->getOperand(1).getReg(); + Register DstReg = I->getOperand(0).getReg(); + Register SrcReg = I->getOperand(1).getReg(); unsigned N = I->getOperand(2).getImm(); DebugLoc dl = I->getDebugLoc(); assert(N < 2 && "Invalid immediate"); unsigned SubIdx = N ? Mips::sub_hi : Mips::sub_lo; - unsigned SubReg = getRegisterInfo().getSubReg(SrcReg, SubIdx); + Register SubReg = getRegisterInfo().getSubReg(SrcReg, SubIdx); // FPXX on MIPS-II or MIPS32r1 should have been handled with a spill/reload // in MipsSEFrameLowering.cpp. @@ -815,7 +815,7 @@ void MipsSEInstrInfo::expandExtractElementF64(MachineBasicBlock &MBB, void MipsSEInstrInfo::expandBuildPairF64(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, bool isMicroMips, bool FP64) const { - unsigned DstReg = I->getOperand(0).getReg(); + Register DstReg = I->getOperand(0).getReg(); unsigned LoReg = I->getOperand(1).getReg(), HiReg = I->getOperand(2).getReg(); const MCInstrDesc& Mtc1Tdd = get(Mips::MTC1); DebugLoc dl = I->getDebugLoc(); @@ -883,8 +883,8 @@ void MipsSEInstrInfo::expandEhReturn(MachineBasicBlock &MBB, unsigned RA = Subtarget.isGP64bit() ? Mips::RA_64 : Mips::RA; unsigned T9 = Subtarget.isGP64bit() ? Mips::T9_64 : Mips::T9; unsigned ZERO = Subtarget.isGP64bit() ? Mips::ZERO_64 : Mips::ZERO; - unsigned OffsetReg = I->getOperand(0).getReg(); - unsigned TargetReg = I->getOperand(1).getReg(); + Register OffsetReg = I->getOperand(0).getReg(); + Register TargetReg = I->getOperand(1).getReg(); // addu $ra, $v0, $zero // addu $sp, $sp, $v1 diff --git a/lib/Target/Mips/MipsSERegisterInfo.cpp b/lib/Target/Mips/MipsSERegisterInfo.cpp index f4b164d5c0ab..a48088c28919 100644 --- a/lib/Target/Mips/MipsSERegisterInfo.cpp +++ b/lib/Target/Mips/MipsSERegisterInfo.cpp @@ -212,11 +212,9 @@ void MipsSERegisterInfo::eliminateFI(MachineBasicBlock::iterator II, // element size), otherwise it is a 16-bit signed immediate. 
     unsigned OffsetBitSize =
         getLoadStoreOffsetSizeInBits(MI.getOpcode(), MI.getOperand(OpNo - 1));
-    unsigned OffsetAlign = getLoadStoreOffsetAlign(MI.getOpcode());
-
+    const Align OffsetAlign(getLoadStoreOffsetAlign(MI.getOpcode()));
     if (OffsetBitSize < 16 && isInt<16>(Offset) &&
-        (!isIntN(OffsetBitSize, Offset) ||
-         OffsetToAlignment(Offset, OffsetAlign) != 0)) {
+        (!isIntN(OffsetBitSize, Offset) || !isAligned(OffsetAlign, Offset))) {
       // If we have an offset that needs to fit into a signed n-bit immediate
       // (where n < 16) and doesn't, but does fit into 16-bits then use an ADDiu
       MachineBasicBlock &MBB = *MI.getParent();
@@ -224,7 +222,7 @@ void MipsSERegisterInfo::eliminateFI(MachineBasicBlock::iterator II,
       const TargetRegisterClass *PtrRC =
           ABI.ArePtrs64bit() ? &Mips::GPR64RegClass : &Mips::GPR32RegClass;
       MachineRegisterInfo &RegInfo = MBB.getParent()->getRegInfo();
-      unsigned Reg = RegInfo.createVirtualRegister(PtrRC);
+      Register Reg = RegInfo.createVirtualRegister(PtrRC);
       const MipsSEInstrInfo &TII =
           *static_cast<const MipsSEInstrInfo *>(
               MBB.getParent()->getSubtarget().getInstrInfo());
diff --git a/lib/Target/Mips/MipsSubtarget.cpp b/lib/Target/Mips/MipsSubtarget.cpp
index d021b3d021b1..b9245c9fc0eb 100644
--- a/lib/Target/Mips/MipsSubtarget.cpp
+++ b/lib/Target/Mips/MipsSubtarget.cpp
@@ -69,7 +69,7 @@ void MipsSubtarget::anchor() {}
 
 MipsSubtarget::MipsSubtarget(const Triple &TT, StringRef CPU, StringRef FS,
                              bool little, const MipsTargetMachine &TM,
-                             unsigned StackAlignOverride)
+                             MaybeAlign StackAlignOverride)
     : MipsGenSubtargetInfo(TT, CPU, FS), MipsArchVersion(MipsDefault),
       IsLittle(little), IsSoftFloat(false), IsSingleFloat(false), IsFPXX(false),
       NoABICalls(false), Abs2008(false), IsFP64bit(false), UseOddSPReg(true),
@@ -81,10 +81,9 @@ MipsSubtarget::MipsSubtarget(const Triple &TT, StringRef CPU, StringRef FS,
       Os16(Mips_Os16), HasMSA(false), UseTCCInDIV(false), HasSym32(false),
       HasEVA(false), DisableMadd4(false), HasMT(false), HasCRC(false),
       HasVirt(false), HasGINV(false), UseIndirectJumpsHazard(false),
-      StackAlignOverride(StackAlignOverride),
-      TM(TM), TargetTriple(TT), TSInfo(),
-      InstrInfo(
-          MipsInstrInfo::create(initializeSubtargetDependencies(CPU, FS, TM))),
+      StackAlignOverride(StackAlignOverride), TM(TM), TargetTriple(TT),
+      TSInfo(), InstrInfo(MipsInstrInfo::create(
+                    initializeSubtargetDependencies(CPU, FS, TM))),
       FrameLowering(MipsFrameLowering::create(*this)),
       TLInfo(MipsTargetLowering::create(TM, *this)) {
 
@@ -248,12 +247,12 @@ MipsSubtarget::initializeSubtargetDependencies(StringRef CPU, StringRef FS,
     InMips16HardFloat = true;
 
   if (StackAlignOverride)
-    stackAlignment = StackAlignOverride;
+    stackAlignment = *StackAlignOverride;
   else if (isABI_N32() || isABI_N64())
-    stackAlignment = 16;
+    stackAlignment = Align(16);
   else {
     assert(isABI_O32() && "Unknown ABI for stack alignment!");
-    stackAlignment = 8;
+    stackAlignment = Align(8);
   }
 
   return *this;
@@ -286,6 +285,6 @@ const RegisterBankInfo *MipsSubtarget::getRegBankInfo() const {
   return RegBankInfo.get();
 }
 
-const InstructionSelector *MipsSubtarget::getInstructionSelector() const {
+InstructionSelector *MipsSubtarget::getInstructionSelector() const {
   return InstSelector.get();
 }
diff --git a/lib/Target/Mips/MipsSubtarget.h b/lib/Target/Mips/MipsSubtarget.h
index aa1200579fc8..0a8c2ef8ae5c 100644
--- a/lib/Target/Mips/MipsSubtarget.h
+++ b/lib/Target/Mips/MipsSubtarget.h
@@ -189,12 +189,15 @@ class MipsSubtarget : public MipsGenSubtargetInfo {
   // Disable use of the `jal` instruction.
   bool UseLongCalls = false;
 
+  // Assume 32-bit GOT.
+ bool UseXGOT = false; + /// The minimum alignment known to hold of the stack frame on /// entry to the function and which must be maintained by every function. - unsigned stackAlignment; + Align stackAlignment; /// The overridden stack alignment. - unsigned StackAlignOverride; + MaybeAlign StackAlignOverride; InstrItineraryData InstrItins; @@ -227,7 +230,7 @@ public: /// This constructor initializes the data members to match that /// of the specified triple. MipsSubtarget(const Triple &TT, StringRef CPU, StringRef FS, bool little, - const MipsTargetMachine &TM, unsigned StackAlignOverride); + const MipsTargetMachine &TM, MaybeAlign StackAlignOverride); /// ParseSubtargetFeatures - Parses features string setting specified /// subtarget options. Definition of function is auto generated by tblgen. @@ -323,6 +326,8 @@ public: bool useLongCalls() const { return UseLongCalls; } + bool useXGOT() const { return UseXGOT; } + bool enableLongBranchPass() const { return hasStandardEncoding() || inMicroMipsMode() || allowMixed16_32(); } @@ -344,7 +349,7 @@ public: // really use them if in addition we are in mips16 mode static bool useConstantIslands(); - unsigned getStackAlignment() const { return stackAlignment; } + Align getStackAlignment() const { return stackAlignment; } // Grab relocation model Reloc::Model getRelocationModel() const; @@ -391,7 +396,7 @@ public: const CallLowering *getCallLowering() const override; const LegalizerInfo *getLegalizerInfo() const override; const RegisterBankInfo *getRegBankInfo() const override; - const InstructionSelector *getInstructionSelector() const override; + InstructionSelector *getInstructionSelector() const override; }; } // End llvm namespace diff --git a/lib/Target/Mips/MipsTargetMachine.cpp b/lib/Target/Mips/MipsTargetMachine.cpp index c878abb042e4..e58f316791ba 100644 --- a/lib/Target/Mips/MipsTargetMachine.cpp +++ b/lib/Target/Mips/MipsTargetMachine.cpp @@ -117,14 +117,17 @@ MipsTargetMachine::MipsTargetMachine(const Target &T, const Triple &TT, : LLVMTargetMachine(T, computeDataLayout(TT, CPU, Options, isLittle), TT, CPU, FS, Options, getEffectiveRelocModel(JIT, RM), getEffectiveCodeModel(CM, CodeModel::Small), OL), - isLittle(isLittle), TLOF(llvm::make_unique()), + isLittle(isLittle), TLOF(std::make_unique()), ABI(MipsABIInfo::computeTargetABI(TT, CPU, Options.MCOptions)), - Subtarget(nullptr), DefaultSubtarget(TT, CPU, FS, isLittle, *this, - Options.StackAlignmentOverride), + Subtarget(nullptr), + DefaultSubtarget(TT, CPU, FS, isLittle, *this, + MaybeAlign(Options.StackAlignmentOverride)), NoMips16Subtarget(TT, CPU, FS.empty() ? "-mips16" : FS.str() + ",-mips16", - isLittle, *this, Options.StackAlignmentOverride), + isLittle, *this, + MaybeAlign(Options.StackAlignmentOverride)), Mips16Subtarget(TT, CPU, FS.empty() ? "+mips16" : FS.str() + ",+mips16", - isLittle, *this, Options.StackAlignmentOverride) { + isLittle, *this, + MaybeAlign(Options.StackAlignmentOverride)) { Subtarget = &DefaultSubtarget; initAsmInfo(); } @@ -196,8 +199,9 @@ MipsTargetMachine::getSubtargetImpl(const Function &F) const { // creation will depend on the TM and the code generation flags on the // function that reside in TargetOptions. 
resetTargetOptions(F); - I = llvm::make_unique(TargetTriple, CPU, FS, isLittle, *this, - Options.StackAlignmentOverride); + I = std::make_unique( + TargetTriple, CPU, FS, isLittle, *this, + MaybeAlign(Options.StackAlignmentOverride)); } return I.get(); } diff --git a/lib/Target/Mips/MipsTargetStreamer.h b/lib/Target/Mips/MipsTargetStreamer.h index 1fa8ebadd643..298d056ce2c3 100644 --- a/lib/Target/Mips/MipsTargetStreamer.h +++ b/lib/Target/Mips/MipsTargetStreamer.h @@ -130,6 +130,8 @@ public: SMLoc IDLoc, const MCSubtargetInfo *STI); void emitRRR(unsigned Opcode, unsigned Reg0, unsigned Reg1, unsigned Reg2, SMLoc IDLoc, const MCSubtargetInfo *STI); + void emitRRRX(unsigned Opcode, unsigned Reg0, unsigned Reg1, unsigned Reg2, + MCOperand Op3, SMLoc IDLoc, const MCSubtargetInfo *STI); void emitRRI(unsigned Opcode, unsigned Reg0, unsigned Reg1, int16_t Imm, SMLoc IDLoc, const MCSubtargetInfo *STI); void emitRRIII(unsigned Opcode, unsigned Reg0, unsigned Reg1, int16_t Imm0, @@ -154,17 +156,13 @@ public: unsigned BaseReg, int64_t Offset, function_ref GetATReg, SMLoc IDLoc, const MCSubtargetInfo *STI); - void emitStoreWithSymOffset(unsigned Opcode, unsigned SrcReg, - unsigned BaseReg, MCOperand &HiOperand, - MCOperand &LoOperand, unsigned ATReg, SMLoc IDLoc, - const MCSubtargetInfo *STI); + void emitSCWithSymOffset(unsigned Opcode, unsigned SrcReg, unsigned BaseReg, + MCOperand &HiOperand, MCOperand &LoOperand, + unsigned ATReg, SMLoc IDLoc, + const MCSubtargetInfo *STI); void emitLoadWithImmOffset(unsigned Opcode, unsigned DstReg, unsigned BaseReg, int64_t Offset, unsigned TmpReg, SMLoc IDLoc, const MCSubtargetInfo *STI); - void emitLoadWithSymOffset(unsigned Opcode, unsigned DstReg, unsigned BaseReg, - MCOperand &HiOperand, MCOperand &LoOperand, - unsigned ATReg, SMLoc IDLoc, - const MCSubtargetInfo *STI); void emitGPRestore(int Offset, SMLoc IDLoc, const MCSubtargetInfo *STI); void forbidModuleDirective() { ModuleDirectiveAllowed = false; } diff --git a/lib/Target/NVPTX/NVPTX.h b/lib/Target/NVPTX/NVPTX.h index 6530c40ea100..0acbace5f848 100644 --- a/lib/Target/NVPTX/NVPTX.h +++ b/lib/Target/NVPTX/NVPTX.h @@ -44,7 +44,7 @@ MachineFunctionPass *createNVPTXPrologEpilogPass(); MachineFunctionPass *createNVPTXReplaceImageHandlesPass(); FunctionPass *createNVPTXImageOptimizerPass(); FunctionPass *createNVPTXLowerArgsPass(const NVPTXTargetMachine *TM); -BasicBlockPass *createNVPTXLowerAllocaPass(); +FunctionPass *createNVPTXLowerAllocaPass(); MachineFunctionPass *createNVPTXPeephole(); MachineFunctionPass *createNVPTXProxyRegErasurePass(); diff --git a/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/lib/Target/NVPTX/NVPTXAsmPrinter.cpp index 5f38b4a3c4c5..307f4d58c3ab 100644 --- a/lib/Target/NVPTX/NVPTXAsmPrinter.cpp +++ b/lib/Target/NVPTX/NVPTXAsmPrinter.cpp @@ -282,7 +282,7 @@ bool NVPTXAsmPrinter::lowerOperand(const MachineOperand &MO, } unsigned NVPTXAsmPrinter::encodeVirtualRegister(unsigned Reg) { - if (TargetRegisterInfo::isVirtualRegister(Reg)) { + if (Register::isVirtualRegister(Reg)) { const TargetRegisterClass *RC = MRI->getRegClass(Reg); DenseMap &RegMap = VRegMapping[RC]; @@ -434,7 +434,7 @@ bool NVPTXAsmPrinter::isLoopHeaderOfNoUnroll( return false; } -void NVPTXAsmPrinter::EmitBasicBlockStart(const MachineBasicBlock &MBB) const { +void NVPTXAsmPrinter::EmitBasicBlockStart(const MachineBasicBlock &MBB) { AsmPrinter::EmitBasicBlockStart(MBB); if (isLoopHeaderOfNoUnroll(MBB)) OutStreamer->EmitRawText(StringRef("\t.pragma \"nounroll\";\n")); @@ -507,8 +507,8 @@ const MCSymbol 
*NVPTXAsmPrinter::getFunctionFrameSymbol() const { } void NVPTXAsmPrinter::emitImplicitDef(const MachineInstr *MI) const { - unsigned RegNo = MI->getOperand(0).getReg(); - if (TargetRegisterInfo::isVirtualRegister(RegNo)) { + Register RegNo = MI->getOperand(0).getReg(); + if (Register::isVirtualRegister(RegNo)) { OutStreamer->AddComment(Twine("implicit-def: ") + getVirtualRegisterName(RegNo)); } else { @@ -1397,7 +1397,7 @@ static unsigned int getOpenCLAlignment(const DataLayout &DL, Type *Ty) { auto *FTy = dyn_cast(Ty); if (FTy) - return DL.getPointerPrefAlignment(); + return DL.getPointerPrefAlignment().value(); return DL.getPrefTypeAlignment(Ty); } @@ -1473,12 +1473,11 @@ void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) { // Just print .param .align .b8 .param[size]; // = PAL.getparamalignment // size = typeallocsize of element type - unsigned align = PAL.getParamAlignment(paramIndex); - if (align == 0) - align = DL.getABITypeAlignment(Ty); + const Align align = DL.getValueOrABITypeAlignment( + PAL.getParamAlignment(paramIndex), Ty); unsigned sz = DL.getTypeAllocSize(Ty); - O << "\t.param .align " << align << " .b8 "; + O << "\t.param .align " << align.value() << " .b8 "; printParamName(I, paramIndex, O); O << "[" << sz << "]"; @@ -1559,9 +1558,8 @@ void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) { // Just print .param .align .b8 .param[size]; // = PAL.getparamalignment // size = typeallocsize of element type - unsigned align = PAL.getParamAlignment(paramIndex); - if (align == 0) - align = DL.getABITypeAlignment(ETy); + Align align = + DL.getValueOrABITypeAlignment(PAL.getParamAlignment(paramIndex), ETy); // Work around a bug in ptxas. When PTX code takes address of // byval parameter with alignment < 4, ptxas generates code to // spill argument into memory. Alas on sm_50+ ptxas generates @@ -1573,10 +1571,10 @@ void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) { // TODO: this will need to be undone when we get to support multi-TU // device-side compilation as it breaks ABI compatibility with nvcc. // Hopefully ptxas bug is fixed by then. - if (!isKernelFunc && align < 4) - align = 4; + if (!isKernelFunc && align < Align(4)) + align = Align(4); unsigned sz = DL.getTypeAllocSize(ETy); - O << "\t.param .align " << align << " .b8 "; + O << "\t.param .align " << align.value() << " .b8 "; printParamName(I, paramIndex, O); O << "[" << sz << "]"; continue; @@ -1653,7 +1651,7 @@ void NVPTXAsmPrinter::setAndEmitFunctionVirtualRegisters( // We use the per class virtual register number in the ptx output. 
unsigned int numVRs = MRI->getNumVirtRegs(); for (unsigned i = 0; i < numVRs; i++) { - unsigned int vr = TRI->index2VirtReg(i); + unsigned int vr = Register::index2VirtReg(i); const TargetRegisterClass *RC = MRI->getRegClass(vr); DenseMap ®map = VRegMapping[RC]; int n = regmap.size(); @@ -1861,7 +1859,7 @@ void NVPTXAsmPrinter::bufferLEByte(const Constant *CPV, int Bytes, case Type::HalfTyID: case Type::FloatTyID: case Type::DoubleTyID: { - const ConstantFP *CFP = dyn_cast(CPV); + const auto *CFP = cast(CPV); Type *Ty = CFP->getType(); if (Ty == Type::getHalfTy(CPV->getContext())) { APInt API = CFP->getValueAPF().bitcastToAPInt(); @@ -2212,7 +2210,7 @@ void NVPTXAsmPrinter::printOperand(const MachineInstr *MI, int opNum, const MachineOperand &MO = MI->getOperand(opNum); switch (MO.getType()) { case MachineOperand::MO_Register: - if (TargetRegisterInfo::isPhysicalRegister(MO.getReg())) { + if (Register::isPhysicalRegister(MO.getReg())) { if (MO.getReg() == NVPTX::VRDepot) O << DEPOTNAME << getFunctionNumber(); else diff --git a/lib/Target/NVPTX/NVPTXAsmPrinter.h b/lib/Target/NVPTX/NVPTXAsmPrinter.h index 43ae57ac1262..7a66854d32f4 100644 --- a/lib/Target/NVPTX/NVPTXAsmPrinter.h +++ b/lib/Target/NVPTX/NVPTXAsmPrinter.h @@ -200,7 +200,7 @@ private: const Function *F; std::string CurrentFnName; - void EmitBasicBlockStart(const MachineBasicBlock &MBB) const override; + void EmitBasicBlockStart(const MachineBasicBlock &MBB) override; void EmitFunctionEntryLabel() override; void EmitFunctionBodyStart() override; void EmitFunctionBodyEnd() override; diff --git a/lib/Target/NVPTX/NVPTXFrameLowering.cpp b/lib/Target/NVPTX/NVPTXFrameLowering.cpp index 46f08b23d31a..d26912f47e50 100644 --- a/lib/Target/NVPTX/NVPTXFrameLowering.cpp +++ b/lib/Target/NVPTX/NVPTXFrameLowering.cpp @@ -25,7 +25,7 @@ using namespace llvm; NVPTXFrameLowering::NVPTXFrameLowering() - : TargetFrameLowering(TargetFrameLowering::StackGrowsUp, 8, 0) {} + : TargetFrameLowering(TargetFrameLowering::StackGrowsUp, Align(8), 0) {} bool NVPTXFrameLowering::hasFP(const MachineFunction &MF) const { return true; } diff --git a/lib/Target/NVPTX/NVPTXISelLowering.cpp b/lib/Target/NVPTX/NVPTXISelLowering.cpp index ae1aa98da0e8..9acd0bea66fd 100644 --- a/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -480,7 +480,7 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, setOperationAction(ISD::TRAP, MVT::Other, Legal); // Register custom handling for vector loads/stores - for (MVT VT : MVT::vector_valuetypes()) { + for (MVT VT : MVT::fixedlen_vector_valuetypes()) { if (IsPTXVectorType(VT)) { setOperationAction(ISD::LOAD, VT, Custom); setOperationAction(ISD::STORE, VT, Custom); @@ -1291,8 +1291,8 @@ std::string NVPTXTargetLowering::getPrototype( O << ".param .b" << size << " _"; } else if (isa(retTy)) { O << ".param .b" << PtrVT.getSizeInBits() << " _"; - } else if (retTy->isAggregateType() || retTy->isVectorTy() || retTy->isIntegerTy(128)) { - auto &DL = CS.getCalledFunction()->getParent()->getDataLayout(); + } else if (retTy->isAggregateType() || retTy->isVectorTy() || + retTy->isIntegerTy(128)) { O << ".param .align " << retAlignment << " .b8 _[" << DL.getTypeAllocSize(retTy) << "]"; } else { @@ -2230,8 +2230,8 @@ SDValue NVPTXTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { if (Op.getValueType() == MVT::v2f16) { LoadSDNode *Load = cast(Op); EVT MemVT = Load->getMemoryVT(); - if (!allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), MemVT, - 
*Load->getMemOperand())) { + if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(), + MemVT, *Load->getMemOperand())) { SDValue Ops[2]; std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG); return DAG.getMergeValues(Ops, SDLoc(Op)); @@ -2273,8 +2273,8 @@ SDValue NVPTXTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { // v2f16 is legal, so we can't rely on legalizer to handle unaligned // stores and have to handle it here. if (VT == MVT::v2f16 && - !allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT, - *Store->getMemOperand())) + !allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(), + VT, *Store->getMemOperand())) return expandUnalignedStore(Store, DAG); if (VT.isVector()) @@ -3497,7 +3497,7 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic( Info.ptrVal = I.getArgOperand(0); Info.offset = 0; Info.flags = MachineMemOperand::MOLoad; - Info.align = 16; + Info.align = Align(16); return true; } case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_col: @@ -3521,7 +3521,7 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic( Info.ptrVal = I.getArgOperand(0); Info.offset = 0; Info.flags = MachineMemOperand::MOLoad; - Info.align = 8; + Info.align = Align(8); return true; } @@ -3547,7 +3547,7 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic( Info.ptrVal = I.getArgOperand(0); Info.offset = 0; Info.flags = MachineMemOperand::MOLoad; - Info.align = 16; + Info.align = Align(16); return true; } @@ -3585,7 +3585,7 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic( Info.ptrVal = I.getArgOperand(0); Info.offset = 0; Info.flags = MachineMemOperand::MOLoad; - Info.align = 4; + Info.align = Align(4); return true; } @@ -3606,7 +3606,7 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic( Info.ptrVal = I.getArgOperand(0); Info.offset = 0; Info.flags = MachineMemOperand::MOLoad; - Info.align = 16; + Info.align = Align(16); return true; } @@ -3627,7 +3627,7 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic( Info.ptrVal = I.getArgOperand(0); Info.offset = 0; Info.flags = MachineMemOperand::MOLoad; - Info.align = 16; + Info.align = Align(16); return true; } @@ -3648,7 +3648,7 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic( Info.ptrVal = I.getArgOperand(0); Info.offset = 0; Info.flags = MachineMemOperand::MOLoad; - Info.align = 16; + Info.align = Align(16); return true; } @@ -3665,7 +3665,7 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic( Info.ptrVal = I.getArgOperand(0); Info.offset = 0; Info.flags = MachineMemOperand::MOLoad; - Info.align = 8; + Info.align = Align(8); return true; } @@ -3686,7 +3686,7 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic( Info.ptrVal = I.getArgOperand(0); Info.offset = 0; Info.flags = MachineMemOperand::MOStore; - Info.align = 16; + Info.align = Align(16); return true; } @@ -3707,7 +3707,7 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic( Info.ptrVal = I.getArgOperand(0); Info.offset = 0; Info.flags = MachineMemOperand::MOStore; - Info.align = 16; + Info.align = Align(16); return true; } @@ -3728,7 +3728,7 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic( Info.ptrVal = I.getArgOperand(0); Info.offset = 0; Info.flags = MachineMemOperand::MOStore; - Info.align = 16; + Info.align = Align(16); return true; } @@ -3745,7 +3745,7 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic( Info.ptrVal = I.getArgOperand(0); Info.offset = 0; Info.flags = MachineMemOperand::MOStore; - Info.align = 8; + Info.align = Align(8); return true; } @@ -3780,7 +3780,7 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic( Info.ptrVal = I.getArgOperand(0); Info.offset = 0; 
Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore; - Info.align = 0; + Info.align.reset(); return true; } @@ -3798,7 +3798,8 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic( Info.ptrVal = I.getArgOperand(0); Info.offset = 0; Info.flags = MachineMemOperand::MOLoad; - Info.align = cast(I.getArgOperand(1))->getZExtValue(); + Info.align = + MaybeAlign(cast(I.getArgOperand(1))->getZExtValue()); return true; } @@ -3817,7 +3818,8 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic( Info.ptrVal = I.getArgOperand(0); Info.offset = 0; Info.flags = MachineMemOperand::MOLoad; - Info.align = cast(I.getArgOperand(1))->getZExtValue(); + Info.align = + MaybeAlign(cast(I.getArgOperand(1))->getZExtValue()); return true; } @@ -3883,7 +3885,7 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic( Info.ptrVal = nullptr; Info.offset = 0; Info.flags = MachineMemOperand::MOLoad; - Info.align = 16; + Info.align = Align(16); return true; case Intrinsic::nvvm_tex_1d_v4s32_s32: @@ -4003,7 +4005,7 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic( Info.ptrVal = nullptr; Info.offset = 0; Info.flags = MachineMemOperand::MOLoad; - Info.align = 16; + Info.align = Align(16); return true; case Intrinsic::nvvm_suld_1d_i8_clamp: @@ -4056,7 +4058,7 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic( Info.ptrVal = nullptr; Info.offset = 0; Info.flags = MachineMemOperand::MOLoad; - Info.align = 16; + Info.align = Align(16); return true; case Intrinsic::nvvm_suld_1d_i16_clamp: @@ -4109,7 +4111,7 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic( Info.ptrVal = nullptr; Info.offset = 0; Info.flags = MachineMemOperand::MOLoad; - Info.align = 16; + Info.align = Align(16); return true; case Intrinsic::nvvm_suld_1d_i32_clamp: @@ -4162,7 +4164,7 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic( Info.ptrVal = nullptr; Info.offset = 0; Info.flags = MachineMemOperand::MOLoad; - Info.align = 16; + Info.align = Align(16); return true; case Intrinsic::nvvm_suld_1d_i64_clamp: @@ -4200,7 +4202,7 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic( Info.ptrVal = nullptr; Info.offset = 0; Info.flags = MachineMemOperand::MOLoad; - Info.align = 16; + Info.align = Align(16); return true; } return false; diff --git a/lib/Target/NVPTX/NVPTXInstrInfo.td b/lib/Target/NVPTX/NVPTXInstrInfo.td index 62da3c79f465..fe7a84f9a361 100644 --- a/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -143,12 +143,17 @@ def hasPTX31 : Predicate<"Subtarget->getPTXVersion() >= 31">; def hasPTX60 : Predicate<"Subtarget->getPTXVersion() >= 60">; def hasPTX61 : Predicate<"Subtarget->getPTXVersion() >= 61">; def hasPTX63 : Predicate<"Subtarget->getPTXVersion() >= 63">; +def hasPTX64 : Predicate<"Subtarget->getPTXVersion() >= 64">; def hasSM30 : Predicate<"Subtarget->getSmVersion() >= 30">; def hasSM70 : Predicate<"Subtarget->getSmVersion() >= 70">; def hasSM72 : Predicate<"Subtarget->getSmVersion() >= 72">; def hasSM75 : Predicate<"Subtarget->getSmVersion() >= 75">; +// non-sync shfl instructions are not available on sm_70+ in PTX6.4+ +def hasSHFL : Predicate<"!(Subtarget->getSmVersion() >= 70" + "&& Subtarget->getPTXVersion() >= 64)">; + def useShortPtr : Predicate<"useShortPointers()">; def useFP16Math: Predicate<"Subtarget->allowFP16Math()">; @@ -2908,7 +2913,7 @@ def : Pat<(ctlz Int32Regs:$a), (CLZr32 Int32Regs:$a)>; // ctz instruction always returns a 32-bit value. For ctlz.i64, convert the // ptx value to 64 bits to match the ISD node's semantics, unless we know we're // truncating back down to 32 bits. 
-def : Pat<(ctlz Int64Regs:$a), (CVT_u64_u32 (CLZr64 Int64Regs:$a), CvtNONE)>; +def : Pat<(i64 (ctlz Int64Regs:$a)), (CVT_u64_u32 (CLZr64 Int64Regs:$a), CvtNONE)>; def : Pat<(i32 (trunc (ctlz Int64Regs:$a))), (CLZr64 Int64Regs:$a)>; // For 16-bit ctlz, we zero-extend to 32-bit, perform the count, then trunc the @@ -2925,10 +2930,10 @@ def : Pat<(i32 (trunc (ctlz Int64Regs:$a))), (CLZr64 Int64Regs:$a)>; // and then ctlz that value. This way we don't have to subtract 16 from the // result. Unfortunately today we don't have a way to generate // "mov b32reg, {b16imm, b16reg}", so we don't do this optimization. -def : Pat<(ctlz Int16Regs:$a), +def : Pat<(i16 (ctlz Int16Regs:$a)), (SUBi16ri (CVT_u16_u32 (CLZr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE)), CvtNONE), 16)>; -def : Pat<(i32 (zext (ctlz Int16Regs:$a))), +def : Pat<(i32 (zext (i16 (ctlz Int16Regs:$a)))), (SUBi32ri (CLZr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE)), 16)>; // Population count @@ -2953,7 +2958,7 @@ def : Pat<(i32 (trunc (ctpop Int64Regs:$a))), (POPCr64 Int64Regs:$a)>; // If we know that we're storing into an i32, we can avoid the final trunc. def : Pat<(ctpop Int16Regs:$a), (CVT_u16_u32 (POPCr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE)), CvtNONE)>; -def : Pat<(i32 (zext (ctpop Int16Regs:$a))), +def : Pat<(i32 (zext (i16 (ctpop Int16Regs:$a)))), (POPCr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE))>; // fpround f32 -> f16 diff --git a/lib/Target/NVPTX/NVPTXIntrinsics.td b/lib/Target/NVPTX/NVPTXIntrinsics.td index 1752d3e0575e..c52195fb0449 100644 --- a/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -56,6 +56,10 @@ class RegSeq { []); } +class THREADMASK_INFO { + list ret = !if(sync, [0,1], [0]); +} + //----------------------------------- // Synchronization and shuffle functions //----------------------------------- @@ -129,121 +133,64 @@ def INT_BARRIER_SYNC_CNT_II : NVPTXInst<(outs), (ins i32imm:$id, i32imm:$cnt), [(int_nvvm_barrier_sync_cnt imm:$id, imm:$cnt)]>, Requires<[hasPTX60, hasSM30]>; - -// shfl.{up,down,bfly,idx}.b32 -multiclass SHFL { - // The last two parameters to shfl can be regs or imms. ptxas is smart - // enough to inline constant registers, so strictly speaking we don't need to - // handle immediates here. But it's easy enough, and it makes our ptx more - // readable. 
- def reg : NVPTXInst< - (outs regclass:$dst), - (ins regclass:$src, Int32Regs:$offset, Int32Regs:$mask), - !strconcat("shfl.", mode, ".b32 $dst, $src, $offset, $mask;"), - [(set regclass:$dst, (IntOp regclass:$src, Int32Regs:$offset, Int32Regs:$mask))]>; - - def imm1 : NVPTXInst< - (outs regclass:$dst), - (ins regclass:$src, i32imm:$offset, Int32Regs:$mask), - !strconcat("shfl.", mode, ".b32 $dst, $src, $offset, $mask;"), - [(set regclass:$dst, (IntOp regclass:$src, imm:$offset, Int32Regs:$mask))]>; - - def imm2 : NVPTXInst< - (outs regclass:$dst), - (ins regclass:$src, Int32Regs:$offset, i32imm:$mask), - !strconcat("shfl.", mode, ".b32 $dst, $src, $offset, $mask;"), - [(set regclass:$dst, (IntOp regclass:$src, Int32Regs:$offset, imm:$mask))]>; - - def imm3 : NVPTXInst< - (outs regclass:$dst), - (ins regclass:$src, i32imm:$offset, i32imm:$mask), - !strconcat("shfl.", mode, ".b32 $dst, $src, $offset, $mask;"), - [(set regclass:$dst, (IntOp regclass:$src, imm:$offset, imm:$mask))]>; +class SHFL_INSTR + : NVPTXInst<(outs), (ins), "?", []> { + NVPTXRegClass rc = !cond( + !eq(reg, "i32"): Int32Regs, + !eq(reg, "f32"): Float32Regs); + string IntrName = "int_nvvm_shfl_" + # !if(sync, "sync_", "") + # mode + # "_" # reg + # !if(return_pred, "p", ""); + Intrinsic Intr = !cast(IntrName); + let InOperandList = !con( + !if(sync, + !dag(ins, !if(threadmask_imm, [i32imm], [Int32Regs]), ["threadmask"]), + (ins)), + (ins rc:$src), + !dag(ins, !if(offset_imm, [i32imm], [Int32Regs]), ["offset"]), + !dag(ins, !if(mask_imm, [i32imm], [Int32Regs]), ["mask"]) + ); + let OutOperandList = !if(return_pred, (outs rc:$dst, Int1Regs:$pred), (outs rc:$dst)); + let AsmString = "shfl." + # !if(sync, "sync.", "") + # mode # ".b32\t" + # "$dst" + # !if(return_pred, "|$pred", "") # ", " + # "$src, $offset, $mask" + # !if(sync, ", $threadmask", "") + # ";" + ; + let Pattern = [!con( + !foreach(tmp, OutOperandList, + !subst(outs, set, + !subst(i32imm, imm, tmp))), + (set !foreach(tmp, InOperandList, + !subst(ins, Intr, + !subst(i32imm, imm, tmp)))) + )]; } -defm INT_SHFL_DOWN_I32 : SHFL; -defm INT_SHFL_DOWN_F32 : SHFL; -defm INT_SHFL_UP_I32 : SHFL; -defm INT_SHFL_UP_F32 : SHFL; -defm INT_SHFL_BFLY_I32 : SHFL; -defm INT_SHFL_BFLY_F32 : SHFL; -defm INT_SHFL_IDX_I32 : SHFL; -defm INT_SHFL_IDX_F32 : SHFL; - -multiclass SHFL_SYNC { - // Threadmask and the last two parameters to shfl.sync can be regs or imms. - // ptxas is smart enough to inline constant registers, so strictly speaking we - // don't need to handle immediates here. But it's easy enough, and it makes - // our ptx more readable. 
- def rrr : NVPTXInst< - (outs regclass:$dst), - (ins Int32Regs:$threadmask, regclass:$src, Int32Regs:$offset, Int32Regs:$mask), - !strconcat("shfl.sync.", mode, ".b32 $dst, $src, $offset, $mask, $threadmask;"), - [(set regclass:$dst, (IntOp Int32Regs:$threadmask, regclass:$src, - Int32Regs:$offset, Int32Regs:$mask))]>; - - def rri : NVPTXInst< - (outs regclass:$dst), - (ins Int32Regs:$threadmask, regclass:$src, Int32Regs:$offset, i32imm:$mask), - !strconcat("shfl.sync.", mode, ".b32 $dst, $src, $offset, $mask, $threadmask;"), - [(set regclass:$dst, (IntOp Int32Regs:$threadmask, regclass:$src, - Int32Regs:$offset, imm:$mask))]>; - - def rir : NVPTXInst< - (outs regclass:$dst), - (ins Int32Regs:$threadmask, regclass:$src, i32imm:$offset, Int32Regs:$mask), - !strconcat("shfl.sync.", mode, ".b32 $dst, $src, $offset, $mask, $threadmask;"), - [(set regclass:$dst, (IntOp Int32Regs:$threadmask, regclass:$src, - imm:$offset, Int32Regs:$mask))]>; - - def rii : NVPTXInst< - (outs regclass:$dst), - (ins Int32Regs:$threadmask, regclass:$src, i32imm:$offset, i32imm:$mask), - !strconcat("shfl.sync.", mode, ".b32 $dst, $src, $offset, $mask, $threadmask;"), - [(set regclass:$dst, (IntOp Int32Regs:$threadmask, regclass:$src, - imm:$offset, imm:$mask))]>; - - def irr : NVPTXInst< - (outs regclass:$dst), - (ins i32imm:$threadmask, regclass:$src, Int32Regs:$offset, Int32Regs:$mask), - !strconcat("shfl.sync.", mode, ".b32 $dst, $src, $offset, $mask, $threadmask;"), - [(set regclass:$dst, (IntOp imm:$threadmask, regclass:$src, - Int32Regs:$offset, Int32Regs:$mask))]>; - - def iri : NVPTXInst< - (outs regclass:$dst), - (ins i32imm:$threadmask, regclass:$src, Int32Regs:$offset, i32imm:$mask), - !strconcat("shfl.sync.", mode, ".b32 $dst, $src, $offset, $mask, $threadmask;"), - [(set regclass:$dst, (IntOp imm:$threadmask, regclass:$src, - Int32Regs:$offset, imm:$mask))]>; - - def iir : NVPTXInst< - (outs regclass:$dst), - (ins i32imm:$threadmask, regclass:$src, i32imm:$offset, Int32Regs:$mask), - !strconcat("shfl.sync.", mode, ".b32 $dst, $src, $offset, $mask, $threadmask;"), - [(set regclass:$dst, (IntOp imm:$threadmask, regclass:$src, - imm:$offset, Int32Regs:$mask))]>; - - def iii : NVPTXInst< - (outs regclass:$dst), - (ins i32imm:$threadmask, regclass:$src, i32imm:$offset, i32imm:$mask), - !strconcat("shfl.sync.", mode, ".b32 $dst, $src, $offset, $mask, $threadmask;"), - [(set regclass:$dst, (IntOp imm:$threadmask, regclass:$src, - imm:$offset, imm:$mask))]>; +foreach sync = [0, 1] in { + foreach mode = ["up", "down", "bfly", "idx"] in { + foreach regclass = ["i32", "f32"] in { + foreach return_pred = [0, 1] in { + foreach offset_imm = [0, 1] in { + foreach mask_imm = [0, 1] in { + foreach threadmask_imm = THREADMASK_INFO.ret in { + def : SHFL_INSTR, + Requires; + } + } + } + } + } + } } -// On sm_70 these don't have to be convergent, so we may eventually want to -// implement non-convergent variant of this intrinsic. 
-defm INT_SHFL_SYNC_DOWN_I32 : SHFL_SYNC; -defm INT_SHFL_SYNC_DOWN_F32 : SHFL_SYNC; -defm INT_SHFL_SYNC_UP_I32 : SHFL_SYNC; -defm INT_SHFL_SYNC_UP_F32 : SHFL_SYNC; -defm INT_SHFL_SYNC_BFLY_I32 : SHFL_SYNC; -defm INT_SHFL_SYNC_BFLY_F32 : SHFL_SYNC; -defm INT_SHFL_SYNC_IDX_I32 : SHFL_SYNC; -defm INT_SHFL_SYNC_IDX_F32 : SHFL_SYNC; - - // vote.{all,any,uni,ballot} multiclass VOTE { def : NVPTXInst<(outs regclass:$dest), (ins Int1Regs:$pred), diff --git a/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp b/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp index 0743a2986718..83039241a7c7 100644 --- a/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp +++ b/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp @@ -103,7 +103,7 @@ bool NVPTXLowerAggrCopies::runOnFunction(Function &F) { // Do the transformation of an aggr load/copy/set to a loop // for (LoadInst *LI : AggrLoads) { - StoreInst *SI = dyn_cast(*LI->user_begin()); + auto *SI = cast(*LI->user_begin()); Value *SrcAddr = LI->getOperand(0); Value *DstAddr = SI->getOperand(1); unsigned NumLoads = DL.getTypeStoreSize(LI->getType()); diff --git a/lib/Target/NVPTX/NVPTXLowerAlloca.cpp b/lib/Target/NVPTX/NVPTXLowerAlloca.cpp index 76fb9f3fa692..945b7286b03c 100644 --- a/lib/Target/NVPTX/NVPTXLowerAlloca.cpp +++ b/lib/Target/NVPTX/NVPTXLowerAlloca.cpp @@ -41,12 +41,12 @@ void initializeNVPTXLowerAllocaPass(PassRegistry &); } namespace { -class NVPTXLowerAlloca : public BasicBlockPass { - bool runOnBasicBlock(BasicBlock &BB) override; +class NVPTXLowerAlloca : public FunctionPass { + bool runOnFunction(Function &F) override; public: static char ID; // Pass identification, replacement for typeid - NVPTXLowerAlloca() : BasicBlockPass(ID) {} + NVPTXLowerAlloca() : FunctionPass(ID) {} StringRef getPassName() const override { return "convert address space of alloca'ed memory to local"; } @@ -61,58 +61,61 @@ INITIALIZE_PASS(NVPTXLowerAlloca, "nvptx-lower-alloca", // ============================================================================= // Main function for this pass. // ============================================================================= -bool NVPTXLowerAlloca::runOnBasicBlock(BasicBlock &BB) { - if (skipBasicBlock(BB)) +bool NVPTXLowerAlloca::runOnFunction(Function &F) { + if (skipFunction(F)) return false; bool Changed = false; - for (auto &I : BB) { - if (auto allocaInst = dyn_cast(&I)) { - Changed = true; - auto PTy = dyn_cast(allocaInst->getType()); - auto ETy = PTy->getElementType(); - auto LocalAddrTy = PointerType::get(ETy, ADDRESS_SPACE_LOCAL); - auto NewASCToLocal = new AddrSpaceCastInst(allocaInst, LocalAddrTy, ""); - auto GenericAddrTy = PointerType::get(ETy, ADDRESS_SPACE_GENERIC); - auto NewASCToGeneric = new AddrSpaceCastInst(NewASCToLocal, - GenericAddrTy, ""); - NewASCToLocal->insertAfter(allocaInst); - NewASCToGeneric->insertAfter(NewASCToLocal); - for (Value::use_iterator UI = allocaInst->use_begin(), - UE = allocaInst->use_end(); - UI != UE; ) { - // Check Load, Store, GEP, and BitCast Uses on alloca and make them - // use the converted generic address, in order to expose non-generic - // addrspacecast to NVPTXInferAddressSpaces. For other types - // of instructions this is unnecessary and may introduce redundant - // address cast. 
- const auto &AllocaUse = *UI++; - auto LI = dyn_cast(AllocaUse.getUser()); - if (LI && LI->getPointerOperand() == allocaInst && !LI->isVolatile()) { - LI->setOperand(LI->getPointerOperandIndex(), NewASCToGeneric); - continue; - } - auto SI = dyn_cast(AllocaUse.getUser()); - if (SI && SI->getPointerOperand() == allocaInst && !SI->isVolatile()) { - SI->setOperand(SI->getPointerOperandIndex(), NewASCToGeneric); - continue; - } - auto GI = dyn_cast(AllocaUse.getUser()); - if (GI && GI->getPointerOperand() == allocaInst) { - GI->setOperand(GI->getPointerOperandIndex(), NewASCToGeneric); - continue; - } - auto BI = dyn_cast(AllocaUse.getUser()); - if (BI && BI->getOperand(0) == allocaInst) { - BI->setOperand(0, NewASCToGeneric); - continue; + for (auto &BB : F) + for (auto &I : BB) { + if (auto allocaInst = dyn_cast(&I)) { + Changed = true; + auto PTy = dyn_cast(allocaInst->getType()); + auto ETy = PTy->getElementType(); + auto LocalAddrTy = PointerType::get(ETy, ADDRESS_SPACE_LOCAL); + auto NewASCToLocal = new AddrSpaceCastInst(allocaInst, LocalAddrTy, ""); + auto GenericAddrTy = PointerType::get(ETy, ADDRESS_SPACE_GENERIC); + auto NewASCToGeneric = + new AddrSpaceCastInst(NewASCToLocal, GenericAddrTy, ""); + NewASCToLocal->insertAfter(allocaInst); + NewASCToGeneric->insertAfter(NewASCToLocal); + for (Value::use_iterator UI = allocaInst->use_begin(), + UE = allocaInst->use_end(); + UI != UE;) { + // Check Load, Store, GEP, and BitCast Uses on alloca and make them + // use the converted generic address, in order to expose non-generic + // addrspacecast to NVPTXInferAddressSpaces. For other types + // of instructions this is unnecessary and may introduce redundant + // address cast. + const auto &AllocaUse = *UI++; + auto LI = dyn_cast(AllocaUse.getUser()); + if (LI && LI->getPointerOperand() == allocaInst && + !LI->isVolatile()) { + LI->setOperand(LI->getPointerOperandIndex(), NewASCToGeneric); + continue; + } + auto SI = dyn_cast(AllocaUse.getUser()); + if (SI && SI->getPointerOperand() == allocaInst && + !SI->isVolatile()) { + SI->setOperand(SI->getPointerOperandIndex(), NewASCToGeneric); + continue; + } + auto GI = dyn_cast(AllocaUse.getUser()); + if (GI && GI->getPointerOperand() == allocaInst) { + GI->setOperand(GI->getPointerOperandIndex(), NewASCToGeneric); + continue; + } + auto BI = dyn_cast(AllocaUse.getUser()); + if (BI && BI->getOperand(0) == allocaInst) { + BI->setOperand(0, NewASCToGeneric); + continue; + } } } } - } return Changed; } -BasicBlockPass *llvm::createNVPTXLowerAllocaPass() { +FunctionPass *llvm::createNVPTXLowerAllocaPass() { return new NVPTXLowerAlloca(); } diff --git a/lib/Target/NVPTX/NVPTXLowerArgs.cpp b/lib/Target/NVPTX/NVPTXLowerArgs.cpp index c5e02e34e25e..c3c5f6fbcba7 100644 --- a/lib/Target/NVPTX/NVPTXLowerArgs.cpp +++ b/lib/Target/NVPTX/NVPTXLowerArgs.cpp @@ -164,7 +164,7 @@ void NVPTXLowerArgs::handleByValParam(Argument *Arg) { // Set the alignment to alignment of the byval parameter. This is because, // later load/stores assume that alignment, and we are going to replace // the use of the byval parameter with this alloca instruction. 
-  AllocA->setAlignment(Func->getParamAlignment(Arg->getArgNo()));
+  AllocA->setAlignment(MaybeAlign(Func->getParamAlignment(Arg->getArgNo())));
   Arg->replaceAllUsesWith(AllocA);
 
   Value *ArgInParam = new AddrSpaceCastInst(
diff --git a/lib/Target/NVPTX/NVPTXPeephole.cpp b/lib/Target/NVPTX/NVPTXPeephole.cpp
index 629757db8707..5e6411c61eab 100644
--- a/lib/Target/NVPTX/NVPTXPeephole.cpp
+++ b/lib/Target/NVPTX/NVPTXPeephole.cpp
@@ -81,7 +81,7 @@ static bool isCVTAToLocalCombinationCandidate(MachineInstr &Root) {
   auto &Op = Root.getOperand(1);
   const auto &MRI = MF.getRegInfo();
   MachineInstr *GenericAddrDef = nullptr;
-  if (Op.isReg() && TargetRegisterInfo::isVirtualRegister(Op.getReg())) {
+  if (Op.isReg() && Register::isVirtualRegister(Op.getReg())) {
     GenericAddrDef = MRI.getUniqueVRegDef(Op.getReg());
   }
 
diff --git a/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp b/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp
index 4c5a9adf1f65..a7127b0e9a99 100644
--- a/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp
+++ b/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp
@@ -178,7 +178,7 @@ NVPTXPrologEpilogPass::calculateFrameObjectOffsets(MachineFunction &Fn) {
   // frame index registers. Functions which don't want/need this optimization
   // will continue to use the existing code path.
   if (MFI.getUseLocalStackAllocationBlock()) {
-    unsigned Align = MFI.getLocalFrameMaxAlign();
+    unsigned Align = MFI.getLocalFrameMaxAlign().value();
 
     // Adjust to alignment boundary.
     Offset = (Offset + Align - 1) / Align * Align;
diff --git a/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/lib/Target/NVPTX/NVPTXTargetMachine.cpp
index 11b3fe2fa3d3..f58fb5717773 100644
--- a/lib/Target/NVPTX/NVPTXTargetMachine.cpp
+++ b/lib/Target/NVPTX/NVPTXTargetMachine.cpp
@@ -116,7 +116,7 @@ NVPTXTargetMachine::NVPTXTargetMachine(const Target &T, const Triple &TT,
                         CPU, FS, Options, Reloc::PIC_,
                         getEffectiveCodeModel(CM, CodeModel::Small), OL),
       is64bit(is64bit), UseShortPointers(UseShortPointersOpt),
-      TLOF(llvm::make_unique<NVPTXTargetObjectFile>()),
+      TLOF(std::make_unique<NVPTXTargetObjectFile>()),
       Subtarget(TT, CPU, FS, *this) {
   if (TT.getOS() == Triple::NVCL)
     drvInterface = NVPTX::NVCL;
diff --git a/lib/Target/NVPTX/NVPTXUtilities.cpp b/lib/Target/NVPTX/NVPTXUtilities.cpp
index 665eb1383253..43c2e9920403 100644
--- a/lib/Target/NVPTX/NVPTXUtilities.cpp
+++ b/lib/Target/NVPTX/NVPTXUtilities.cpp
@@ -19,10 +19,11 @@
 #include "llvm/IR/Module.h"
 #include "llvm/IR/Operator.h"
 #include "llvm/Support/ManagedStatic.h"
-#include "llvm/Support/MutexGuard.h"
+#include "llvm/Support/Mutex.h"
 #include <algorithm>
 #include <cstring>
 #include <map>
+#include <mutex>
 #include <string>
 #include <vector>
 
@@ -38,12 +39,12 @@ static ManagedStatic<per_module_annot_t> annotationCache;
 static sys::Mutex Lock;
 
 void clearAnnotationCache(const Module *Mod) {
-  MutexGuard Guard(Lock);
+  std::lock_guard<sys::Mutex> Guard(Lock);
   annotationCache->erase(Mod);
 }
 
 static void cacheAnnotationFromMD(const MDNode *md, key_val_pair_t &retval) {
-  MutexGuard Guard(Lock);
+  std::lock_guard<sys::Mutex> Guard(Lock);
   assert(md && "Invalid mdnode for annotation");
   assert((md->getNumOperands() % 2) == 1 && "Invalid number of operands");
   // start index = 1, to skip the global variable key
@@ -69,7 +70,7 @@ static void cacheAnnotationFromMD(const MDNode *md, key_val_pair_t &retval) {
 }
 
 static void cacheAnnotationFromMD(const Module *m, const GlobalValue *gv) {
-  MutexGuard Guard(Lock);
+  std::lock_guard<sys::Mutex> Guard(Lock);
   NamedMDNode *NMD = m->getNamedMetadata("nvvm.annotations");
   if (!NMD)
     return;
@@ -103,7 +104,7 @@ static void cacheAnnotationFromMD(const Module *m, const GlobalValue *gv) {
 
 bool findOneNVVMAnnotation(const GlobalValue *gv, const std::string &prop,
                            unsigned &retval) {
-  MutexGuard Guard(Lock);
+  std::lock_guard<sys::Mutex> Guard(Lock);
   const Module *m = gv->getParent();
   if ((*annotationCache).find(m) == (*annotationCache).end())
     cacheAnnotationFromMD(m, gv);
@@ -117,7 +118,7 @@ bool findOneNVVMAnnotation(const GlobalValue *gv, const std::string &prop,
 
 bool findAllNVVMAnnotation(const GlobalValue *gv, const std::string &prop,
                            std::vector<unsigned> &retval) {
-  MutexGuard Guard(Lock);
+  std::lock_guard<sys::Mutex> Guard(Lock);
   const Module *m = gv->getParent();
   if ((*annotationCache).find(m) == (*annotationCache).end())
     cacheAnnotationFromMD(m, gv);
diff --git a/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp b/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp
index c9524da93acd..aedf5b713c3f 100644
--- a/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp
+++ b/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp
@@ -579,7 +579,7 @@ public:
 
   static std::unique_ptr<PPCOperand> CreateToken(StringRef Str, SMLoc S,
                                                  bool IsPPC64) {
-    auto Op = make_unique<PPCOperand>(Token);
+    auto Op = std::make_unique<PPCOperand>(Token);
     Op->Tok.Data = Str.data();
     Op->Tok.Length = Str.size();
     Op->StartLoc = S;
@@ -608,7 +608,7 @@ public:
 
   static std::unique_ptr<PPCOperand> CreateImm(int64_t Val, SMLoc S, SMLoc E,
                                                bool IsPPC64) {
-    auto Op = make_unique<PPCOperand>(Immediate);
+    auto Op = std::make_unique<PPCOperand>(Immediate);
     Op->Imm.Val = Val;
     Op->StartLoc = S;
     Op->EndLoc = E;
@@ -618,7 +618,7 @@ public:
 
   static std::unique_ptr<PPCOperand> CreateExpr(const MCExpr *Val, SMLoc S,
                                                 SMLoc E, bool IsPPC64) {
-    auto Op = make_unique<PPCOperand>(Expression);
+    auto Op = std::make_unique<PPCOperand>(Expression);
     Op->Expr.Val = Val;
     Op->Expr.CRVal = EvaluateCRExpr(Val);
     Op->StartLoc = S;
@@ -629,7 +629,7 @@ public:
 
   static std::unique_ptr<PPCOperand> CreateTLSReg(const MCSymbolRefExpr *Sym,
                                                   SMLoc S, SMLoc E, bool IsPPC64) {
-    auto Op = make_unique<PPCOperand>(TLSRegister);
+    auto Op = std::make_unique<PPCOperand>(TLSRegister);
     Op->TLSReg.Sym = Sym;
     Op->StartLoc = S;
     Op->EndLoc = E;
@@ -639,7 +639,7 @@ public:
 
   static std::unique_ptr<PPCOperand> CreateContextImm(int64_t Val, SMLoc S,
                                                       SMLoc E, bool IsPPC64) {
-    auto Op = make_unique<PPCOperand>(ContextImmediate);
+    auto Op = std::make_unique<PPCOperand>(ContextImmediate);
     Op->Imm.Val = Val;
     Op->StartLoc = S;
     Op->EndLoc = E;
diff --git a/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp b/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp
index 7a8af57961cb..3597fd15eeb1 100644
--- a/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp
+++ b/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp
@@ -167,12 +167,6 @@ static DecodeStatus DecodeQFRCRegisterClass(MCInst &Inst, uint64_t RegNo,
   return decodeRegisterClass(Inst, RegNo, QFRegs);
 }
 
-static DecodeStatus DecodeSPE4RCRegisterClass(MCInst &Inst, uint64_t RegNo,
-                                              uint64_t Address,
-                                              const void *Decoder) {
-  return decodeRegisterClass(Inst, RegNo, RRegs);
-}
-
 static DecodeStatus DecodeSPERCRegisterClass(MCInst &Inst, uint64_t RegNo,
                                              uint64_t Address,
                                              const void *Decoder) {
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp
index 042ddf48d5df..20f752c3041a 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp
@@ -78,7 +78,7 @@ unsigned PPCELFObjectWriter::getRelocType(MCContext &Ctx, const MCValue &Target,
   // determine the type of the relocation
   unsigned Type;
   if (IsPCRel) {
-    switch ((unsigned)Fixup.getKind()) {
+    switch (Fixup.getTargetKind()) {
     default:
      llvm_unreachable("Unimplemented");
    case PPC::fixup_ppc_br24:
@@ -131,7 +131,7 @@ unsigned PPCELFObjectWriter::getRelocType(MCContext &Ctx, const MCValue &Target,
      break;
    }
  } else
{ - switch ((unsigned)Fixup.getKind()) { + switch (Fixup.getTargetKind()) { default: llvm_unreachable("invalid fixup kind!"); case FK_NONE: Type = ELF::R_PPC_NONE; @@ -443,5 +443,5 @@ bool PPCELFObjectWriter::needsRelocateWithSymbol(const MCSymbol &Sym, std::unique_ptr llvm::createPPCELFObjectWriter(bool Is64Bit, uint8_t OSABI) { - return llvm::make_unique(Is64Bit, OSABI); + return std::make_unique(Is64Bit, OSABI); } diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCInstPrinter.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCInstPrinter.cpp index 0e64ae55ab1c..7fc231618fa9 100644 --- a/lib/Target/PowerPC/MCTargetDesc/PPCInstPrinter.cpp +++ b/lib/Target/PowerPC/MCTargetDesc/PPCInstPrinter.cpp @@ -66,6 +66,31 @@ void PPCInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const { void PPCInstPrinter::printInst(const MCInst *MI, raw_ostream &O, StringRef Annot, const MCSubtargetInfo &STI) { + // Customize printing of the addis instruction on AIX. When an operand is a + // symbol reference, the instruction syntax is changed to look like a load + // operation, i.e: + // Transform: addis $rD, $rA, $src --> addis $rD, $src($rA). + if (TT.isOSAIX() && + (MI->getOpcode() == PPC::ADDIS8 || MI->getOpcode() == PPC::ADDIS) && + MI->getOperand(2).isExpr()) { + assert((MI->getOperand(0).isReg() && MI->getOperand(1).isReg()) && + "The first and the second operand of an addis instruction" + " should be registers."); + + assert(isa(MI->getOperand(2).getExpr()) && + "The third operand of an addis instruction should be a symbol " + "reference expression if it is an expression at all."); + + O << "\taddis "; + printOperand(MI, 0, O); + O << ", "; + printOperand(MI, 2, O); + O << "("; + printOperand(MI, 1, O); + O << ")"; + return; + } + // Check for slwi/srwi mnemonics. if (MI->getOpcode() == PPC::RLWINM) { unsigned char SH = MI->getOperand(2).getImm(); diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp index 5f0005ea1d7b..1216cd727289 100644 --- a/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp +++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp @@ -86,4 +86,5 @@ void PPCXCOFFMCAsmInfo::anchor() {} PPCXCOFFMCAsmInfo::PPCXCOFFMCAsmInfo(bool Is64Bit, const Triple &T) { assert(!IsLittleEndian && "Little-endian XCOFF not supported."); CodePointerSize = CalleeSaveStackSlotSize = Is64Bit ? 
8 : 4; + ZeroDirective = "\t.space\t"; } diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.cpp index d467f5c4a439..fb9dd5d7aa75 100644 --- a/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.cpp +++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.cpp @@ -19,8 +19,8 @@ using namespace llvm; const PPCMCExpr* PPCMCExpr::create(VariantKind Kind, const MCExpr *Expr, - bool isDarwin, MCContext &Ctx) { - return new (Ctx) PPCMCExpr(Kind, Expr, isDarwin); + bool IsDarwin, MCContext &Ctx) { + return new (Ctx) PPCMCExpr(Kind, Expr, IsDarwin); } void PPCMCExpr::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const { diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.h b/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.h index 449e2c34f74d..ad1454566162 100644 --- a/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.h +++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.h @@ -45,21 +45,21 @@ public: /// @{ static const PPCMCExpr *create(VariantKind Kind, const MCExpr *Expr, - bool isDarwin, MCContext &Ctx); + bool IsDarwin, MCContext &Ctx); static const PPCMCExpr *createLo(const MCExpr *Expr, - bool isDarwin, MCContext &Ctx) { - return create(VK_PPC_LO, Expr, isDarwin, Ctx); + bool IsDarwin, MCContext &Ctx) { + return create(VK_PPC_LO, Expr, IsDarwin, Ctx); } static const PPCMCExpr *createHi(const MCExpr *Expr, - bool isDarwin, MCContext &Ctx) { - return create(VK_PPC_HI, Expr, isDarwin, Ctx); + bool IsDarwin, MCContext &Ctx) { + return create(VK_PPC_HI, Expr, IsDarwin, Ctx); } static const PPCMCExpr *createHa(const MCExpr *Expr, - bool isDarwin, MCContext &Ctx) { - return create(VK_PPC_HA, Expr, isDarwin, Ctx); + bool IsDarwin, MCContext &Ctx) { + return create(VK_PPC_HA, Expr, IsDarwin, Ctx); } /// @} diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMachObjectWriter.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCMachObjectWriter.cpp index 4cf7fd15fa75..672f910ab086 100644 --- a/lib/Target/PowerPC/MCTargetDesc/PPCMachObjectWriter.cpp +++ b/lib/Target/PowerPC/MCTargetDesc/PPCMachObjectWriter.cpp @@ -178,7 +178,7 @@ static uint32_t getFixupOffset(const MCAsmLayout &Layout, uint32_t FixupOffset = Layout.getFragmentOffset(Fragment) + Fixup.getOffset(); // On Mach-O, ppc_fixup_half16 relocations must refer to the // start of the instruction, not the second halfword, as ELF does - if (unsigned(Fixup.getKind()) == PPC::fixup_ppc_half16) + if (Fixup.getTargetKind() == PPC::fixup_ppc_half16) FixupOffset &= ~uint32_t(3); return FixupOffset; } @@ -376,5 +376,5 @@ void PPCMachObjectWriter::RecordPPCRelocation( std::unique_ptr llvm::createPPCMachObjectWriter(bool Is64Bit, uint32_t CPUType, uint32_t CPUSubtype) { - return llvm::make_unique(Is64Bit, CPUType, CPUSubtype); + return std::make_unique(Is64Bit, CPUType, CPUSubtype); } diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCXCOFFObjectWriter.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCXCOFFObjectWriter.cpp index 9c661286d455..7fdbb8990b55 100644 --- a/lib/Target/PowerPC/MCTargetDesc/PPCXCOFFObjectWriter.cpp +++ b/lib/Target/PowerPC/MCTargetDesc/PPCXCOFFObjectWriter.cpp @@ -25,5 +25,5 @@ PPCXCOFFObjectWriter::PPCXCOFFObjectWriter(bool Is64Bit) std::unique_ptr llvm::createPPCXCOFFObjectWriter(bool Is64Bit) { - return llvm::make_unique(Is64Bit); + return std::make_unique(Is64Bit); } diff --git a/lib/Target/PowerPC/P9InstrResources.td b/lib/Target/PowerPC/P9InstrResources.td index 2a10322d3f49..f6cd8ed00c82 100644 --- a/lib/Target/PowerPC/P9InstrResources.td +++ b/lib/Target/PowerPC/P9InstrResources.td @@ -64,6 +64,7 @@ def : InstRW<[P9_ALUE_2C, 
P9_ALUO_2C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C], XXLAND, XXLANDC, XXLEQV, + XXLEQVOnes, XXLNAND, XXLNOR, XXLOR, @@ -124,8 +125,8 @@ def : InstRW<[P9_ALU_2C, IP_EXEC_1C, DISP_1C], (instregex "SRAD(I)?$"), (instregex "EXTSWSLI_32_64$"), (instregex "MFV(S)?RD$"), - (instregex "MTVSRD$"), - (instregex "MTVSRW(A|Z)$"), + (instregex "MTV(S)?RD$"), + (instregex "MTV(S)?RW(A|Z)$"), (instregex "CMP(WI|LWI|W|LW)(8)?$"), (instregex "CMP(L)?D(I)?$"), (instregex "SUBF(I)?C(8)?$"), @@ -148,7 +149,7 @@ def : InstRW<[P9_ALU_2C, IP_EXEC_1C, DISP_1C], (instregex "EXTS(B|H|W)(8)?(_32)?(_64)?(o)?$"), (instregex "ADD(4|8)(TLS)?(_)?$"), (instregex "NEG(8)?$"), - (instregex "ADDI(S)?toc(HA|L)$"), + (instregex "ADDI(S)?toc(HA|L)(8)?$"), COPY, MCRF, MCRXRX, @@ -158,6 +159,7 @@ def : InstRW<[P9_ALU_2C, IP_EXEC_1C, DISP_1C], XSNEGDP, XSCPSGNDP, MFVSRWZ, + MFVRWZ, EXTSWSLI, SRADI_32, RLDIC, diff --git a/lib/Target/PowerPC/PPC.h b/lib/Target/PowerPC/PPC.h index c6951ab67b08..0534773c4c9e 100644 --- a/lib/Target/PowerPC/PPC.h +++ b/lib/Target/PowerPC/PPC.h @@ -50,10 +50,10 @@ namespace llvm { FunctionPass *createPPCExpandISELPass(); FunctionPass *createPPCPreEmitPeepholePass(); void LowerPPCMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI, - AsmPrinter &AP, bool isDarwin); + AsmPrinter &AP, bool IsDarwin); bool LowerPPCMachineOperandToMCOperand(const MachineOperand &MO, MCOperand &OutMO, AsmPrinter &AP, - bool isDarwin); + bool IsDarwin); void initializePPCCTRLoopsPass(PassRegistry&); #ifndef NDEBUG @@ -86,8 +86,8 @@ namespace llvm { MO_NO_FLAG, /// On a symbol operand "FOO", this indicates that the reference is actually - /// to "FOO@plt". This is used for calls and jumps to external functions on - /// for PIC calls on Linux and ELF systems. + /// to "FOO@plt". This is used for calls and jumps to external functions + /// and for PIC calls on 32-bit ELF systems. MO_PLT = 1, /// MO_PIC_FLAG - If this bit is set, the symbol reference is relative to diff --git a/lib/Target/PowerPC/PPCAsmPrinter.cpp b/lib/Target/PowerPC/PPCAsmPrinter.cpp index bd87ce06b4fb..66236b72a1a3 100644 --- a/lib/Target/PowerPC/PPCAsmPrinter.cpp +++ b/lib/Target/PowerPC/PPCAsmPrinter.cpp @@ -51,9 +51,11 @@ #include "llvm/MC/MCInstBuilder.h" #include "llvm/MC/MCSectionELF.h" #include "llvm/MC/MCSectionMachO.h" +#include "llvm/MC/MCSectionXCOFF.h" #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSymbol.h" #include "llvm/MC/MCSymbolELF.h" +#include "llvm/MC/MCSymbolXCOFF.h" #include "llvm/MC/SectionKind.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CodeGen.h" @@ -76,7 +78,7 @@ namespace { class PPCAsmPrinter : public AsmPrinter { protected: - MapVector TOC; + MapVector TOC; const PPCSubtarget *Subtarget; StackMaps SM; @@ -87,7 +89,7 @@ public: StringRef getPassName() const override { return "PowerPC Assembly Printer"; } - MCSymbol *lookUpOrCreateTOCEntry(MCSymbol *Sym); + MCSymbol *lookUpOrCreateTOCEntry(const MCSymbol *Sym); bool doInitialization(Module &M) override { if (!TOC.empty()) @@ -164,6 +166,14 @@ public: : PPCAsmPrinter(TM, std::move(Streamer)) {} StringRef getPassName() const override { return "AIX PPC Assembly Printer"; } + + void SetupMachineFunction(MachineFunction &MF) override; + + void EmitGlobalVariable(const GlobalVariable *GV) override; + + void EmitFunctionDescriptor() override; + + void EmitEndOfAsmFile(Module &) override; }; } // end anonymous namespace @@ -265,7 +275,7 @@ bool PPCAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, return true; // This operand uses VSX numbering. 
// If the operand is a VMX register, convert it to a VSX register. - unsigned Reg = MI->getOperand(OpNo).getReg(); + Register Reg = MI->getOperand(OpNo).getReg(); if (PPCInstrInfo::isVRRegister(Reg)) Reg = PPC::VSX32 + (Reg - PPC::V0); else if (PPCInstrInfo::isVFRegister(Reg)) @@ -328,7 +338,7 @@ bool PPCAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo, /// lookUpOrCreateTOCEntry -- Given a symbol, look up whether a TOC entry /// exists for it. If not, create one. Then return a symbol that references /// the TOC entry. -MCSymbol *PPCAsmPrinter::lookUpOrCreateTOCEntry(MCSymbol *Sym) { +MCSymbol *PPCAsmPrinter::lookUpOrCreateTOCEntry(const MCSymbol *Sym) { MCSymbol *&TOCEntry = TOC[Sym]; if (!TOCEntry) TOCEntry = createTempSymbol("C"); @@ -378,7 +388,7 @@ void PPCAsmPrinter::LowerPATCHPOINT(StackMaps &SM, const MachineInstr &MI) { if (CallTarget) { assert((CallTarget & 0xFFFFFFFFFFFF) == CallTarget && "High 16 bits of call target should be zero."); - unsigned ScratchReg = MI.getOperand(Opers.getNextScratchIdx()).getReg(); + Register ScratchReg = MI.getOperand(Opers.getNextScratchIdx()).getReg(); EncodedBytes = 0; // Materialize the jump address: EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::LI8) @@ -502,13 +512,32 @@ void PPCAsmPrinter::EmitTlsCall(const MachineInstr *MI, .addExpr(SymVar)); } +/// Map a machine operand for a TOC pseudo-machine instruction to its +/// corresponding MCSymbol. +static MCSymbol *getMCSymbolForTOCPseudoMO(const MachineOperand &MO, + AsmPrinter &AP) { + switch (MO.getType()) { + case MachineOperand::MO_GlobalAddress: + return AP.getSymbol(MO.getGlobal()); + case MachineOperand::MO_ConstantPoolIndex: + return AP.GetCPISymbol(MO.getIndex()); + case MachineOperand::MO_JumpTableIndex: + return AP.GetJTISymbol(MO.getIndex()); + case MachineOperand::MO_BlockAddress: + return AP.GetBlockAddressSymbol(MO.getBlockAddress()); + default: + llvm_unreachable("Unexpected operand type to get symbol."); + } +} + /// EmitInstruction -- Print out a single PowerPC MI in Darwin syntax to /// the current output stream. 
/// void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) { MCInst TmpInst; - bool isPPC64 = Subtarget->isPPC64(); - bool isDarwin = TM.getTargetTriple().isOSDarwin(); + const bool IsDarwin = TM.getTargetTriple().isOSDarwin(); + const bool IsPPC64 = Subtarget->isPPC64(); + const bool IsAIX = Subtarget->isAIXABI(); const Module *M = MF->getFunction().getParent(); PICLevel::Level PL = M->getPICLevel(); @@ -517,7 +546,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) { if (!MI->isInlineAsm()) { for (const MachineOperand &MO: MI->operands()) { if (MO.isReg()) { - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); if (Subtarget->hasSPE()) { if (PPC::F4RCRegClass.contains(Reg) || PPC::F8RCRegClass.contains(Reg) || @@ -595,7 +624,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) { // addis r30, r30, {.LTOC,_GLOBAL_OFFSET_TABLE} - .L0$pb@ha // addi r30, r30, {.LTOC,_GLOBAL_OFFSET_TABLE} - .L0$pb@l // Get the offset from the GOT Base Register to the GOT - LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, isDarwin); + LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, IsDarwin); if (Subtarget->isSecurePlt() && isPositionIndependent() ) { unsigned PICR = TmpInst.getOperand(0).getReg(); MCSymbol *BaseSymbol = OutContext.getOrCreateSymbol( @@ -646,43 +675,57 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) { } } case PPC::LWZtoc: { - // Transform %r3 = LWZtoc @min1, %r2 - LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, isDarwin); + assert(!IsDarwin && "TOC is an ELF/XCOFF construct."); + + // Transform %rN = LWZtoc @op1, %r2 + LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, IsDarwin); - // Change the opcode to LWZ, and the global address operand to be a - // reference to the GOT entry we will synthesize later. + // Change the opcode to LWZ. TmpInst.setOpcode(PPC::LWZ); + const MachineOperand &MO = MI->getOperand(1); + assert((MO.isGlobal() || MO.isCPI() || MO.isJTI() || MO.isBlockAddress()) && + "Invalid operand for LWZtoc."); - // Map symbol -> label of TOC entry - assert(MO.isGlobal() || MO.isCPI() || MO.isJTI() || MO.isBlockAddress()); - MCSymbol *MOSymbol = nullptr; - if (MO.isGlobal()) - MOSymbol = getSymbol(MO.getGlobal()); - else if (MO.isCPI()) - MOSymbol = GetCPISymbol(MO.getIndex()); - else if (MO.isJTI()) - MOSymbol = GetJTISymbol(MO.getIndex()); - else if (MO.isBlockAddress()) - MOSymbol = GetBlockAddressSymbol(MO.getBlockAddress()); - - if (PL == PICLevel::SmallPIC) { + // Map the operand to its corresponding MCSymbol. + const MCSymbol *const MOSymbol = getMCSymbolForTOCPseudoMO(MO, *this); + + // Create a reference to the GOT entry for the symbol. The GOT entry will be + // synthesized later. + if (PL == PICLevel::SmallPIC && !IsAIX) { const MCExpr *Exp = MCSymbolRefExpr::create(MOSymbol, MCSymbolRefExpr::VK_GOT, OutContext); TmpInst.getOperand(1) = MCOperand::createExpr(Exp); - } else { - MCSymbol *TOCEntry = lookUpOrCreateTOCEntry(MOSymbol); + EmitToStreamer(*OutStreamer, TmpInst); + return; + } - const MCExpr *Exp = - MCSymbolRefExpr::create(TOCEntry, MCSymbolRefExpr::VK_None, - OutContext); - const MCExpr *PB = - MCSymbolRefExpr::create(OutContext.getOrCreateSymbol(Twine(".LTOC")), - OutContext); - Exp = MCBinaryExpr::createSub(Exp, PB, OutContext); + // Otherwise, use the TOC. 'TOCEntry' is a label used to reference the + // storage allocated in the TOC which contains the address of + // 'MOSymbol'. Said TOC entry will be synthesized later. 
+ MCSymbol *TOCEntry = lookUpOrCreateTOCEntry(MOSymbol); + const MCExpr *Exp = + MCSymbolRefExpr::create(TOCEntry, MCSymbolRefExpr::VK_None, OutContext); + + // AIX uses the label directly as the lwz displacement operand for + // references into the toc section. The displacement value will be generated + // relative to the toc-base. + if (IsAIX) { + assert( + TM.getCodeModel() == CodeModel::Small && + "This pseudo should only be selected for 32-bit small code model."); TmpInst.getOperand(1) = MCOperand::createExpr(Exp); + EmitToStreamer(*OutStreamer, TmpInst); + return; } + + // Create an explicit subtract expression between the local symbol and + // '.LTOC' to manifest the toc-relative offset. + const MCExpr *PB = MCSymbolRefExpr::create( + OutContext.getOrCreateSymbol(Twine(".LTOC")), OutContext); + Exp = MCBinaryExpr::createSub(Exp, PB, OutContext); + TmpInst.getOperand(1) = MCOperand::createExpr(Exp); EmitToStreamer(*OutStreamer, TmpInst); return; } @@ -690,72 +733,121 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) { case PPC::LDtocCPT: case PPC::LDtocBA: case PPC::LDtoc: { + assert(!IsDarwin && "TOC is an ELF/XCOFF construct"); + // Transform %x3 = LDtoc @min1, %x2 - LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, isDarwin); + LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, IsDarwin); - // Change the opcode to LD, and the global address operand to be a - // reference to the TOC entry we will synthesize later. + // Change the opcode to LD. TmpInst.setOpcode(PPC::LD); + const MachineOperand &MO = MI->getOperand(1); + assert((MO.isGlobal() || MO.isCPI() || MO.isJTI() || MO.isBlockAddress()) && + "Invalid operand!"); + + // Map the machine operand to its corresponding MCSymbol, then map the + // global address operand to be a reference to the TOC entry we will + // synthesize later. + MCSymbol *TOCEntry = + lookUpOrCreateTOCEntry(getMCSymbolForTOCPseudoMO(MO, *this)); + + const MCSymbolRefExpr::VariantKind VK = + IsAIX ? MCSymbolRefExpr::VK_None : MCSymbolRefExpr::VK_PPC_TOC; + const MCExpr *Exp = + MCSymbolRefExpr::create(TOCEntry, VK, OutContext); + TmpInst.getOperand(1) = MCOperand::createExpr(Exp); + EmitToStreamer(*OutStreamer, TmpInst); + return; + } + case PPC::ADDIStocHA: { + assert((IsAIX && !IsPPC64 && TM.getCodeModel() == CodeModel::Large) && + "This pseudo should only be selected for 32-bit large code model on" + " AIX."); + + // Transform %rd = ADDIStocHA %rA, @sym(%r2) + LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, IsDarwin); - // Map symbol -> label of TOC entry - assert(MO.isGlobal() || MO.isCPI() || MO.isJTI() || MO.isBlockAddress()); - MCSymbol *MOSymbol = nullptr; - if (MO.isGlobal()) - MOSymbol = getSymbol(MO.getGlobal()); - else if (MO.isCPI()) - MOSymbol = GetCPISymbol(MO.getIndex()); - else if (MO.isJTI()) - MOSymbol = GetJTISymbol(MO.getIndex()); - else if (MO.isBlockAddress()) - MOSymbol = GetBlockAddressSymbol(MO.getBlockAddress()); + // Change the opcode to ADDIS. + TmpInst.setOpcode(PPC::ADDIS); + const MachineOperand &MO = MI->getOperand(2); + assert((MO.isGlobal() || MO.isCPI() || MO.isJTI() || MO.isBlockAddress()) && + "Invalid operand for ADDIStocHA."); + + // Map the machine operand to its corresponding MCSymbol. + MCSymbol *MOSymbol = getMCSymbolForTOCPseudoMO(MO, *this); + + // Always use TOC on AIX. Map the global address operand to be a reference + // to the TOC entry we will synthesize later. 'TOCEntry' is a label used to + // reference the storage allocated in the TOC which contains the address of + // 'MOSymbol'. 
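// A minimal standalone sketch of the three displacement forms LWZtoc can end
// up with after this hunk, written as plain strings instead of MCExpr nodes
// (the ".LC_for_" label is a placeholder, not the real TOC-entry label).
#include <string>

std::string lwzTocDisplacement(bool SmallPIC, bool IsAIX, const std::string &Sym) {
  // ELF small PIC: reference the symbol's GOT entry directly.
  if (SmallPIC && !IsAIX)
    return Sym + "@got";
  // Otherwise a TOC entry (storage that will hold &Sym) is referenced.
  std::string TOCEntryLabel = ".LC_for_" + Sym;   // synthesized later
  // AIX, 32-bit small code model: the label itself is the displacement and is
  // resolved relative to the TOC base.
  if (IsAIX)
    return TOCEntryLabel;
  // Remaining 32-bit ELF case: subtract .LTOC explicitly to manifest the
  // toc-relative offset.
  return TOCEntryLabel + " - .LTOC";
}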
MCSymbol *TOCEntry = lookUpOrCreateTOCEntry(MOSymbol); + const MCExpr *Exp = MCSymbolRefExpr::create(TOCEntry, + MCSymbolRefExpr::VK_PPC_U, + OutContext); + TmpInst.getOperand(2) = MCOperand::createExpr(Exp); + EmitToStreamer(*OutStreamer, TmpInst); + return; + } + case PPC::LWZtocL: { + assert(IsAIX && !IsPPC64 && TM.getCodeModel() == CodeModel::Large && + "This pseudo should only be selected for 32-bit large code model on" + " AIX."); - const MCExpr *Exp = - MCSymbolRefExpr::create(TOCEntry, MCSymbolRefExpr::VK_PPC_TOC, - OutContext); + // Transform %rd = LWZtocL @sym, %rs. + LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, IsDarwin); + + // Change the opcode to lwz. + TmpInst.setOpcode(PPC::LWZ); + + const MachineOperand &MO = MI->getOperand(1); + assert((MO.isGlobal() || MO.isCPI() || MO.isJTI() || MO.isBlockAddress()) && + "Invalid operand for LWZtocL."); + + // Map the machine operand to its corresponding MCSymbol. + MCSymbol *MOSymbol = getMCSymbolForTOCPseudoMO(MO, *this); + + // Always use TOC on AIX. Map the global address operand to be a reference + // to the TOC entry we will synthesize later. 'TOCEntry' is a label used to + // reference the storage allocated in the TOC which contains the address of + // 'MOSymbol'. + MCSymbol *TOCEntry = lookUpOrCreateTOCEntry(MOSymbol); + const MCExpr *Exp = MCSymbolRefExpr::create(TOCEntry, + MCSymbolRefExpr::VK_PPC_L, + OutContext); TmpInst.getOperand(1) = MCOperand::createExpr(Exp); EmitToStreamer(*OutStreamer, TmpInst); return; } + case PPC::ADDIStocHA8: { + assert(!IsDarwin && "TOC is an ELF/XCOFF construct"); - case PPC::ADDIStocHA: { - // Transform %xd = ADDIStocHA %x2, @sym - LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, isDarwin); + // Transform %xd = ADDIStocHA8 %x2, @sym + LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, IsDarwin); - // Change the opcode to ADDIS8. If the global address is external, has - // common linkage, is a non-local function address, or is a jump table - // address, then generate a TOC entry and reference that. Otherwise - // reference the symbol directly. + // Change the opcode to ADDIS8. If the global address is the address of + // an external symbol, is a jump table address, is a block address, or is a + // constant pool index with large code model enabled, then generate a TOC + // entry and reference that. Otherwise, reference the symbol directly. 
TmpInst.setOpcode(PPC::ADDIS8); + const MachineOperand &MO = MI->getOperand(2); - assert((MO.isGlobal() || MO.isCPI() || MO.isJTI() || - MO.isBlockAddress()) && - "Invalid operand for ADDIStocHA!"); - MCSymbol *MOSymbol = nullptr; - bool GlobalToc = false; - - if (MO.isGlobal()) { - const GlobalValue *GV = MO.getGlobal(); - MOSymbol = getSymbol(GV); - unsigned char GVFlags = Subtarget->classifyGlobalReference(GV); - GlobalToc = (GVFlags & PPCII::MO_NLP_FLAG); - } else if (MO.isCPI()) { - MOSymbol = GetCPISymbol(MO.getIndex()); - } else if (MO.isJTI()) { - MOSymbol = GetJTISymbol(MO.getIndex()); - } else if (MO.isBlockAddress()) { - MOSymbol = GetBlockAddressSymbol(MO.getBlockAddress()); - } + assert((MO.isGlobal() || MO.isCPI() || MO.isJTI() || MO.isBlockAddress()) && + "Invalid operand for ADDIStocHA8!"); + + const MCSymbol *MOSymbol = getMCSymbolForTOCPseudoMO(MO, *this); + const bool GlobalToc = + MO.isGlobal() && Subtarget->isGVIndirectSymbol(MO.getGlobal()); if (GlobalToc || MO.isJTI() || MO.isBlockAddress() || - TM.getCodeModel() == CodeModel::Large) + (MO.isCPI() && TM.getCodeModel() == CodeModel::Large)) MOSymbol = lookUpOrCreateTOCEntry(MOSymbol); + const MCSymbolRefExpr::VariantKind VK = + IsAIX ? MCSymbolRefExpr::VK_PPC_U : MCSymbolRefExpr::VK_PPC_TOC_HA; + const MCExpr *Exp = - MCSymbolRefExpr::create(MOSymbol, MCSymbolRefExpr::VK_PPC_TOC_HA, - OutContext); + MCSymbolRefExpr::create(MOSymbol, VK, OutContext); if (!MO.isJTI() && MO.getOffset()) Exp = MCBinaryExpr::createAdd(Exp, @@ -768,73 +860,59 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) { return; } case PPC::LDtocL: { + assert(!IsDarwin && "TOC is an ELF/XCOFF construct"); + // Transform %xd = LDtocL @sym, %xs - LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, isDarwin); + LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, IsDarwin); - // Change the opcode to LD. If the global address is external, has - // common linkage, or is a jump table address, then reference the - // associated TOC entry. Otherwise reference the symbol directly. + // Change the opcode to LD. If the global address is the address of + // an external symbol, is a jump table address, is a block address, or is + // a constant pool index with large code model enabled, then generate a + // TOC entry and reference that. Otherwise, reference the symbol directly. TmpInst.setOpcode(PPC::LD); + const MachineOperand &MO = MI->getOperand(1); assert((MO.isGlobal() || MO.isCPI() || MO.isJTI() || MO.isBlockAddress()) && "Invalid operand for LDtocL!"); - MCSymbol *MOSymbol = nullptr; - if (MO.isJTI()) - MOSymbol = lookUpOrCreateTOCEntry(GetJTISymbol(MO.getIndex())); - else if (MO.isBlockAddress()) { - MOSymbol = GetBlockAddressSymbol(MO.getBlockAddress()); - MOSymbol = lookUpOrCreateTOCEntry(MOSymbol); - } - else if (MO.isCPI()) { - MOSymbol = GetCPISymbol(MO.getIndex()); - if (TM.getCodeModel() == CodeModel::Large) - MOSymbol = lookUpOrCreateTOCEntry(MOSymbol); - } - else if (MO.isGlobal()) { - const GlobalValue *GV = MO.getGlobal(); - MOSymbol = getSymbol(GV); - LLVM_DEBUG( - unsigned char GVFlags = Subtarget->classifyGlobalReference(GV); - assert((GVFlags & PPCII::MO_NLP_FLAG) && - "LDtocL used on symbol that could be accessed directly is " - "invalid. Must match ADDIStocHA.")); + LLVM_DEBUG(assert( + (!MO.isGlobal() || Subtarget->isGVIndirectSymbol(MO.getGlobal())) && + "LDtocL used on symbol that could be accessed directly is " + "invalid. 
Must match ADDIStocHA8.")); + + const MCSymbol *MOSymbol = getMCSymbolForTOCPseudoMO(MO, *this); + + if (!MO.isCPI() || TM.getCodeModel() == CodeModel::Large) MOSymbol = lookUpOrCreateTOCEntry(MOSymbol); - } + const MCSymbolRefExpr::VariantKind VK = + IsAIX ? MCSymbolRefExpr::VK_PPC_L : MCSymbolRefExpr::VK_PPC_TOC_LO; const MCExpr *Exp = - MCSymbolRefExpr::create(MOSymbol, MCSymbolRefExpr::VK_PPC_TOC_LO, - OutContext); + MCSymbolRefExpr::create(MOSymbol, VK, OutContext); TmpInst.getOperand(1) = MCOperand::createExpr(Exp); EmitToStreamer(*OutStreamer, TmpInst); return; } case PPC::ADDItocL: { // Transform %xd = ADDItocL %xs, @sym - LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, isDarwin); + LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, IsDarwin); - // Change the opcode to ADDI8. If the global address is external, then - // generate a TOC entry and reference that. Otherwise reference the + // Change the opcode to ADDI8. If the global address is external, then + // generate a TOC entry and reference that. Otherwise, reference the // symbol directly. TmpInst.setOpcode(PPC::ADDI8); + const MachineOperand &MO = MI->getOperand(2); - assert((MO.isGlobal() || MO.isCPI()) && "Invalid operand for ADDItocL"); - MCSymbol *MOSymbol = nullptr; - - if (MO.isGlobal()) { - const GlobalValue *GV = MO.getGlobal(); - LLVM_DEBUG(unsigned char GVFlags = Subtarget->classifyGlobalReference(GV); - assert(!(GVFlags & PPCII::MO_NLP_FLAG) && - "Interposable definitions must use indirect access.")); - MOSymbol = getSymbol(GV); - } else if (MO.isCPI()) { - MOSymbol = GetCPISymbol(MO.getIndex()); - } + assert((MO.isGlobal() || MO.isCPI()) && "Invalid operand for ADDItocL."); + + LLVM_DEBUG(assert( + !(MO.isGlobal() && Subtarget->isGVIndirectSymbol(MO.getGlobal())) && + "Interposable definitions must use indirect access.")); const MCExpr *Exp = - MCSymbolRefExpr::create(MOSymbol, MCSymbolRefExpr::VK_PPC_TOC_LO, - OutContext); + MCSymbolRefExpr::create(getMCSymbolForTOCPseudoMO(MO, *this), + MCSymbolRefExpr::VK_PPC_TOC_LO, OutContext); TmpInst.getOperand(2) = MCOperand::createExpr(Exp); EmitToStreamer(*OutStreamer, TmpInst); return; @@ -842,13 +920,13 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) { case PPC::ADDISgotTprelHA: { // Transform: %xd = ADDISgotTprelHA %x2, @sym // Into: %xd = ADDIS8 %x2, sym@got@tlsgd@ha - assert(Subtarget->isPPC64() && "Not supported for 32-bit PowerPC"); + assert(IsPPC64 && "Not supported for 32-bit PowerPC"); const MachineOperand &MO = MI->getOperand(2); const GlobalValue *GValue = MO.getGlobal(); MCSymbol *MOSymbol = getSymbol(GValue); const MCExpr *SymGotTprel = - MCSymbolRefExpr::create(MOSymbol, MCSymbolRefExpr::VK_PPC_GOT_TPREL_HA, - OutContext); + MCSymbolRefExpr::create(MOSymbol, MCSymbolRefExpr::VK_PPC_GOT_TPREL_HA, + OutContext); EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::ADDIS8) .addReg(MI->getOperand(0).getReg()) .addReg(MI->getOperand(1).getReg()) @@ -858,16 +936,17 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) { case PPC::LDgotTprelL: case PPC::LDgotTprelL32: { // Transform %xd = LDgotTprelL @sym, %xs - LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, isDarwin); + LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, IsDarwin); // Change the opcode to LD. - TmpInst.setOpcode(isPPC64 ? PPC::LD : PPC::LWZ); + TmpInst.setOpcode(IsPPC64 ? 
PPC::LD : PPC::LWZ); const MachineOperand &MO = MI->getOperand(1); const GlobalValue *GValue = MO.getGlobal(); MCSymbol *MOSymbol = getSymbol(GValue); - const MCExpr *Exp = - MCSymbolRefExpr::create(MOSymbol, MCSymbolRefExpr::VK_PPC_GOT_TPREL_LO, - OutContext); + const MCExpr *Exp = MCSymbolRefExpr::create( + MOSymbol, IsPPC64 ? MCSymbolRefExpr::VK_PPC_GOT_TPREL_LO + : MCSymbolRefExpr::VK_PPC_GOT_TPREL, + OutContext); TmpInst.getOperand(1) = MCOperand::createExpr(Exp); EmitToStreamer(*OutStreamer, TmpInst); return; @@ -920,7 +999,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) { case PPC::ADDIStlsgdHA: { // Transform: %xd = ADDIStlsgdHA %x2, @sym // Into: %xd = ADDIS8 %x2, sym@got@tlsgd@ha - assert(Subtarget->isPPC64() && "Not supported for 32-bit PowerPC"); + assert(IsPPC64 && "Not supported for 32-bit PowerPC"); const MachineOperand &MO = MI->getOperand(2); const GlobalValue *GValue = MO.getGlobal(); MCSymbol *MOSymbol = getSymbol(GValue); @@ -943,11 +1022,11 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) { const GlobalValue *GValue = MO.getGlobal(); MCSymbol *MOSymbol = getSymbol(GValue); const MCExpr *SymGotTlsGD = MCSymbolRefExpr::create( - MOSymbol, Subtarget->isPPC64() ? MCSymbolRefExpr::VK_PPC_GOT_TLSGD_LO - : MCSymbolRefExpr::VK_PPC_GOT_TLSGD, + MOSymbol, IsPPC64 ? MCSymbolRefExpr::VK_PPC_GOT_TLSGD_LO + : MCSymbolRefExpr::VK_PPC_GOT_TLSGD, OutContext); EmitToStreamer(*OutStreamer, - MCInstBuilder(Subtarget->isPPC64() ? PPC::ADDI8 : PPC::ADDI) + MCInstBuilder(IsPPC64 ? PPC::ADDI8 : PPC::ADDI) .addReg(MI->getOperand(0).getReg()) .addReg(MI->getOperand(1).getReg()) .addExpr(SymGotTlsGD)); @@ -965,7 +1044,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) { case PPC::ADDIStlsldHA: { // Transform: %xd = ADDIStlsldHA %x2, @sym // Into: %xd = ADDIS8 %x2, sym@got@tlsld@ha - assert(Subtarget->isPPC64() && "Not supported for 32-bit PowerPC"); + assert(IsPPC64 && "Not supported for 32-bit PowerPC"); const MachineOperand &MO = MI->getOperand(2); const GlobalValue *GValue = MO.getGlobal(); MCSymbol *MOSymbol = getSymbol(GValue); @@ -988,11 +1067,11 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) { const GlobalValue *GValue = MO.getGlobal(); MCSymbol *MOSymbol = getSymbol(GValue); const MCExpr *SymGotTlsLD = MCSymbolRefExpr::create( - MOSymbol, Subtarget->isPPC64() ? MCSymbolRefExpr::VK_PPC_GOT_TLSLD_LO - : MCSymbolRefExpr::VK_PPC_GOT_TLSLD, + MOSymbol, IsPPC64 ? MCSymbolRefExpr::VK_PPC_GOT_TLSLD_LO + : MCSymbolRefExpr::VK_PPC_GOT_TLSLD, OutContext); EmitToStreamer(*OutStreamer, - MCInstBuilder(Subtarget->isPPC64() ? PPC::ADDI8 : PPC::ADDI) + MCInstBuilder(IsPPC64 ? PPC::ADDI8 : PPC::ADDI) .addReg(MI->getOperand(0).getReg()) .addReg(MI->getOperand(1).getReg()) .addExpr(SymGotTlsLD)); @@ -1021,7 +1100,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) { OutContext); EmitToStreamer( *OutStreamer, - MCInstBuilder(Subtarget->isPPC64() ? PPC::ADDIS8 : PPC::ADDIS) + MCInstBuilder(IsPPC64 ? PPC::ADDIS8 : PPC::ADDIS) .addReg(MI->getOperand(0).getReg()) .addReg(MI->getOperand(1).getReg()) .addExpr(SymDtprel)); @@ -1040,7 +1119,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) { MCSymbolRefExpr::create(MOSymbol, MCSymbolRefExpr::VK_PPC_DTPREL_LO, OutContext); EmitToStreamer(*OutStreamer, - MCInstBuilder(Subtarget->isPPC64() ? PPC::ADDI8 : PPC::ADDI) + MCInstBuilder(IsPPC64 ? 
PPC::ADDI8 : PPC::ADDI) .addReg(MI->getOperand(0).getReg()) .addReg(MI->getOperand(1).getReg()) .addExpr(SymDtprel)); @@ -1087,7 +1166,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) { // suite shows a handful of test cases that fail this check for // Darwin. Those need to be investigated before this sanity test // can be enabled for those subtargets. - if (!Subtarget->isDarwin()) { + if (!IsDarwin) { unsigned OpNum = (MI->getOpcode() == PPC::STD) ? 2 : 1; const MachineOperand &MO = MI->getOperand(OpNum); if (MO.isGlobal() && MO.getGlobal()->getAlignment() < 4) @@ -1098,7 +1177,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) { } } - LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, isDarwin); + LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, IsDarwin); EmitToStreamer(*OutStreamer, TmpInst); } @@ -1368,15 +1447,16 @@ bool PPCLinuxAsmPrinter::doFinalization(Module &M) { ".got2", ELF::SHT_PROGBITS, ELF::SHF_WRITE | ELF::SHF_ALLOC); OutStreamer->SwitchSection(Section); - for (MapVector::iterator I = TOC.begin(), - E = TOC.end(); I != E; ++I) { - OutStreamer->EmitLabel(I->second); - MCSymbol *S = I->first; + for (const auto &TOCMapPair : TOC) { + const MCSymbol *const TOCEntryTarget = TOCMapPair.first; + MCSymbol *const TOCEntryLabel = TOCMapPair.second; + + OutStreamer->EmitLabel(TOCEntryLabel); if (isPPC64) { - TS.emitTCEntry(*S); + TS.emitTCEntry(*TOCEntryTarget); } else { OutStreamer->EmitValueToAlignment(4); - OutStreamer->EmitSymbolValue(S, 4); + OutStreamer->EmitSymbolValue(TOCEntryTarget, 4); } } } @@ -1602,7 +1682,7 @@ bool PPCDarwinAsmPrinter::doFinalization(Module &M) { if (!Stubs.empty()) { // Switch with ".non_lazy_symbol_pointer" directive. OutStreamer->SwitchSection(TLOFMacho.getNonLazySymbolPointerSection()); - EmitAlignment(isPPC64 ? 3 : 2); + EmitAlignment(isPPC64 ? Align(8) : Align(4)); for (unsigned i = 0, e = Stubs.size(); i != e; ++i) { // L_foo$stub: @@ -1643,6 +1723,106 @@ bool PPCDarwinAsmPrinter::doFinalization(Module &M) { return AsmPrinter::doFinalization(M); } +void PPCAIXAsmPrinter::SetupMachineFunction(MachineFunction &MF) { + // Get the function descriptor symbol. + CurrentFnDescSym = getSymbol(&MF.getFunction()); + // Set the containing csect. + MCSectionXCOFF *FnDescSec = OutStreamer->getContext().getXCOFFSection( + CurrentFnDescSym->getName(), XCOFF::XMC_DS, XCOFF::XTY_SD, + XCOFF::C_HIDEXT, SectionKind::getData()); + cast(CurrentFnDescSym)->setContainingCsect(FnDescSec); + + return AsmPrinter::SetupMachineFunction(MF); +} + +void PPCAIXAsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) { + // Early error checking limiting what is supported. + if (GV->isThreadLocal()) + report_fatal_error("Thread local not yet supported on AIX."); + + if (GV->hasSection()) + report_fatal_error("Custom section for Data not yet supported."); + + if (GV->hasComdat()) + report_fatal_error("COMDAT not yet supported by AIX."); + + SectionKind GVKind = getObjFileLowering().getKindForGlobal(GV, TM); + if (!GVKind.isCommon() && !GVKind.isBSSLocal() && !GVKind.isData()) + report_fatal_error("Encountered a global variable kind that is " + "not supported yet."); + + // Create the containing csect and switch to it. + MCSectionXCOFF *CSect = cast( + getObjFileLowering().SectionForGlobal(GV, GVKind, TM)); + OutStreamer->SwitchSection(CSect); + + // Create the symbol, set its storage class, and emit it. 
+ MCSymbolXCOFF *GVSym = cast(getSymbol(GV)); + GVSym->setStorageClass( + TargetLoweringObjectFileXCOFF::getStorageClassForGlobal(GV)); + GVSym->setContainingCsect(CSect); + + const DataLayout &DL = GV->getParent()->getDataLayout(); + + // Handle common symbols. + if (GVKind.isCommon() || GVKind.isBSSLocal()) { + unsigned Align = + GV->getAlignment() ? GV->getAlignment() : DL.getPreferredAlignment(GV); + uint64_t Size = DL.getTypeAllocSize(GV->getType()->getElementType()); + + if (GVKind.isBSSLocal()) + OutStreamer->EmitXCOFFLocalCommonSymbol(GVSym, Size, Align); + else + OutStreamer->EmitCommonSymbol(GVSym, Size, Align); + return; + } + + MCSymbol *EmittedInitSym = GVSym; + EmitLinkage(GV, EmittedInitSym); + EmitAlignment(getGVAlignment(GV, DL), GV); + OutStreamer->EmitLabel(EmittedInitSym); + EmitGlobalConstant(GV->getParent()->getDataLayout(), GV->getInitializer()); +} + +void PPCAIXAsmPrinter::EmitFunctionDescriptor() { + const DataLayout &DL = getDataLayout(); + const unsigned PointerSize = DL.getPointerSizeInBits() == 64 ? 8 : 4; + + MCSectionSubPair Current = OutStreamer->getCurrentSection(); + // Emit function descriptor. + OutStreamer->SwitchSection( + cast(CurrentFnDescSym)->getContainingCsect()); + OutStreamer->EmitLabel(CurrentFnDescSym); + // Emit function entry point address. + OutStreamer->EmitValue(MCSymbolRefExpr::create(CurrentFnSym, OutContext), + PointerSize); + // Emit TOC base address. + MCSymbol *TOCBaseSym = OutContext.getOrCreateSymbol(StringRef("TOC[TC0]")); + OutStreamer->EmitValue(MCSymbolRefExpr::create(TOCBaseSym, OutContext), + PointerSize); + // Emit a null environment pointer. + OutStreamer->EmitIntValue(0, PointerSize); + + OutStreamer->SwitchSection(Current.first, Current.second); +} + +void PPCAIXAsmPrinter::EmitEndOfAsmFile(Module &M) { + // If there are no functions in this module, we will never need to reference + // the TOC base. + if (M.empty()) + return; + + // Emit TOC base. + MCSymbol *TOCBaseSym = OutContext.getOrCreateSymbol(StringRef("TOC[TC0]")); + MCSectionXCOFF *TOCBaseSection = OutStreamer->getContext().getXCOFFSection( + StringRef("TOC"), XCOFF::XMC_TC0, XCOFF::XTY_SD, XCOFF::C_HIDEXT, + SectionKind::getData()); + cast(TOCBaseSym)->setContainingCsect(TOCBaseSection); + // Switch to section to emit TOC base. + OutStreamer->SwitchSection(TOCBaseSection); +} + + /// createPPCAsmPrinterPass - Returns a pass that prints the PPC assembly code /// for a MachineFunction to the given output stream, in a format that the /// Darwin assembler can deal with. diff --git a/lib/Target/PowerPC/PPCBranchCoalescing.cpp b/lib/Target/PowerPC/PPCBranchCoalescing.cpp index 5e9a661f8f0b..d325b078979f 100644 --- a/lib/Target/PowerPC/PPCBranchCoalescing.cpp +++ b/lib/Target/PowerPC/PPCBranchCoalescing.cpp @@ -340,9 +340,10 @@ bool PPCBranchCoalescing::identicalOperands( if (Op1.isIdenticalTo(Op2)) { // filter out instructions with physical-register uses - if (Op1.isReg() && TargetRegisterInfo::isPhysicalRegister(Op1.getReg()) - // If the physical register is constant then we can assume the value - // has not changed between uses. + if (Op1.isReg() && + Register::isPhysicalRegister(Op1.getReg()) + // If the physical register is constant then we can assume the value + // has not changed between uses. && !(Op1.isUse() && MRI->isConstantPhysReg(Op1.getReg()))) { LLVM_DEBUG(dbgs() << "The operands are not provably identical.\n"); return false; @@ -355,8 +356,8 @@ bool PPCBranchCoalescing::identicalOperands( // definition of the register produces the same value. 
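// A minimal standalone sketch of the three pointer-sized words that
// EmitFunctionDescriptor lays down for each AIX function; the struct name is
// a stand-in and only models the 64-bit case.
#include <cstdint>

struct AIXFunctionDescriptor64 {
  uint64_t EntryPoint;  // address of the function body (CurrentFnSym)
  uint64_t TOCBase;     // address of the module's TOC base, TOC[TC0]
  uint64_t Environment; // emitted as 0 above
};
static_assert(sizeof(AIXFunctionDescriptor64) == 24,
              "three pointer-sized words on a 64-bit target");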
If they produce the // same value, consider them to be identical. if (Op1.isReg() && Op2.isReg() && - TargetRegisterInfo::isVirtualRegister(Op1.getReg()) && - TargetRegisterInfo::isVirtualRegister(Op2.getReg())) { + Register::isVirtualRegister(Op1.getReg()) && + Register::isVirtualRegister(Op2.getReg())) { MachineInstr *Op1Def = MRI->getVRegDef(Op1.getReg()); MachineInstr *Op2Def = MRI->getVRegDef(Op2.getReg()); if (TII->produceSameValue(*Op1Def, *Op2Def, MRI)) { @@ -456,7 +457,7 @@ bool PPCBranchCoalescing::canMoveToEnd(const MachineInstr &MI, << TargetMBB.getNumber() << "\n"); for (auto &Use : MI.uses()) { - if (Use.isReg() && TargetRegisterInfo::isVirtualRegister(Use.getReg())) { + if (Use.isReg() && Register::isVirtualRegister(Use.getReg())) { MachineInstr *DefInst = MRI->getVRegDef(Use.getReg()); if (DefInst->isPHI() && DefInst->getParent() == MI.getParent()) { LLVM_DEBUG(dbgs() << " *** Cannot move this instruction ***\n"); diff --git a/lib/Target/PowerPC/PPCBranchSelector.cpp b/lib/Target/PowerPC/PPCBranchSelector.cpp index 793d690baec3..cdff4d383d23 100644 --- a/lib/Target/PowerPC/PPCBranchSelector.cpp +++ b/lib/Target/PowerPC/PPCBranchSelector.cpp @@ -81,21 +81,20 @@ FunctionPass *llvm::createPPCBranchSelectionPass() { /// original Offset. unsigned PPCBSel::GetAlignmentAdjustment(MachineBasicBlock &MBB, unsigned Offset) { - unsigned Align = MBB.getAlignment(); - if (!Align) + const Align Alignment = MBB.getAlignment(); + if (Alignment == Align::None()) return 0; - unsigned AlignAmt = 1 << Align; - unsigned ParentAlign = MBB.getParent()->getAlignment(); + const Align ParentAlign = MBB.getParent()->getAlignment(); - if (Align <= ParentAlign) - return OffsetToAlignment(Offset, AlignAmt); + if (Alignment <= ParentAlign) + return offsetToAlignment(Offset, Alignment); // The alignment of this MBB is larger than the function's alignment, so we // can't tell whether or not it will insert nops. Assume that it will. 
if (FirstImpreciseBlock < 0) FirstImpreciseBlock = MBB.getNumber(); - return AlignAmt + OffsetToAlignment(Offset, AlignAmt); + return Alignment.value() + offsetToAlignment(Offset, Alignment); } /// We need to be careful about the offset of the first block in the function @@ -179,7 +178,7 @@ int PPCBSel::computeBranchSize(MachineFunction &Fn, const MachineBasicBlock *Dest, unsigned BrOffset) { int BranchSize; - unsigned MaxAlign = 2; + Align MaxAlign = Align(4); bool NeedExtraAdjustment = false; if (Dest->getNumber() <= Src->getNumber()) { // If this is a backwards branch, the delta is the offset from the @@ -192,8 +191,7 @@ int PPCBSel::computeBranchSize(MachineFunction &Fn, BranchSize += BlockSizes[DestBlock].first; for (unsigned i = DestBlock+1, e = Src->getNumber(); i < e; ++i) { BranchSize += BlockSizes[i].first; - MaxAlign = std::max(MaxAlign, - Fn.getBlockNumbered(i)->getAlignment()); + MaxAlign = std::max(MaxAlign, Fn.getBlockNumbered(i)->getAlignment()); } NeedExtraAdjustment = (FirstImpreciseBlock >= 0) && @@ -207,8 +205,7 @@ int PPCBSel::computeBranchSize(MachineFunction &Fn, MaxAlign = std::max(MaxAlign, Dest->getAlignment()); for (unsigned i = StartBlock+1, e = Dest->getNumber(); i != e; ++i) { BranchSize += BlockSizes[i].first; - MaxAlign = std::max(MaxAlign, - Fn.getBlockNumbered(i)->getAlignment()); + MaxAlign = std::max(MaxAlign, Fn.getBlockNumbered(i)->getAlignment()); } NeedExtraAdjustment = (FirstImpreciseBlock >= 0) && @@ -258,7 +255,7 @@ int PPCBSel::computeBranchSize(MachineFunction &Fn, // The computed offset is at most ((1 << alignment) - 4) bytes smaller // than actual offset. So we add this number to the offset for safety. if (NeedExtraAdjustment) - BranchSize += (1 << MaxAlign) - 4; + BranchSize += MaxAlign.value() - 4; return BranchSize; } @@ -339,16 +336,16 @@ bool PPCBSel::runOnMachineFunction(MachineFunction &Fn) { // 1. CR register // 2. Target MBB PPC::Predicate Pred = (PPC::Predicate)I->getOperand(0).getImm(); - unsigned CRReg = I->getOperand(1).getReg(); + Register CRReg = I->getOperand(1).getReg(); // Jump over the uncond branch inst (i.e. $PC+8) on opposite condition. 
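// A minimal standalone sketch of the padding estimate GetAlignmentAdjustment
// now computes with the Align type, using plain power-of-two byte counts
// (helper names are illustrative, not the LLVM offsetToAlignment API).
#include <cassert>
#include <cstdint>

uint64_t bytesToBoundary(uint64_t Offset, uint64_t Alignment) {
  assert(Alignment != 0 && (Alignment & (Alignment - 1)) == 0 && "power of two");
  return (-Offset) & (Alignment - 1); // 0 if Offset is already aligned
}

uint64_t alignmentAdjustment(uint64_t Offset, uint64_t BlockAlign,
                             uint64_t FuncAlign) {
  if (BlockAlign <= 1)
    return 0;                                  // Align::None(): no padding
  if (BlockAlign <= FuncAlign)
    return bytesToBoundary(Offset, BlockAlign);
  // Block aligned more strictly than its function: we cannot tell whether the
  // assembler will insert nops, so assume the worst case.
  return BlockAlign + bytesToBoundary(Offset, BlockAlign);
}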
BuildMI(MBB, I, dl, TII->get(PPC::BCC)) .addImm(PPC::InvertPredicate(Pred)).addReg(CRReg).addImm(2); } else if (I->getOpcode() == PPC::BC) { - unsigned CRBit = I->getOperand(0).getReg(); + Register CRBit = I->getOperand(0).getReg(); BuildMI(MBB, I, dl, TII->get(PPC::BCn)).addReg(CRBit).addImm(2); } else if (I->getOpcode() == PPC::BCn) { - unsigned CRBit = I->getOperand(0).getReg(); + Register CRBit = I->getOperand(0).getReg(); BuildMI(MBB, I, dl, TII->get(PPC::BC)).addReg(CRBit).addImm(2); } else if (I->getOpcode() == PPC::BDNZ) { BuildMI(MBB, I, dl, TII->get(PPC::BDZ)).addImm(2); diff --git a/lib/Target/PowerPC/PPCFastISel.cpp b/lib/Target/PowerPC/PPCFastISel.cpp index 264d6b590f95..d8425d89da92 100644 --- a/lib/Target/PowerPC/PPCFastISel.cpp +++ b/lib/Target/PowerPC/PPCFastISel.cpp @@ -162,7 +162,7 @@ class PPCFastISel final : public FastISel { bool PPCEmitCmp(const Value *Src1Value, const Value *Src2Value, bool isZExt, unsigned DestReg, const PPC::Predicate Pred); - bool PPCEmitLoad(MVT VT, unsigned &ResultReg, Address &Addr, + bool PPCEmitLoad(MVT VT, Register &ResultReg, Address &Addr, const TargetRegisterClass *RC, bool IsZExt = true, unsigned FP64LoadOpc = PPC::LFD); bool PPCEmitStore(MVT VT, unsigned SrcReg, Address &Addr); @@ -451,7 +451,7 @@ void PPCFastISel::PPCSimplifyAddress(Address &Addr, bool &UseOffset, // Emit a load instruction if possible, returning true if we succeeded, // otherwise false. See commentary below for how the register class of // the load is determined. -bool PPCFastISel::PPCEmitLoad(MVT VT, unsigned &ResultReg, Address &Addr, +bool PPCFastISel::PPCEmitLoad(MVT VT, Register &ResultReg, Address &Addr, const TargetRegisterClass *RC, bool IsZExt, unsigned FP64LoadOpc) { unsigned Opc; @@ -469,7 +469,7 @@ bool PPCFastISel::PPCEmitLoad(MVT VT, unsigned &ResultReg, Address &Addr, (ResultReg ? MRI.getRegClass(ResultReg) : (RC ? RC : (VT == MVT::f64 ? (HasSPE ? &PPC::SPERCRegClass : &PPC::F8RCRegClass) : - (VT == MVT::f32 ? (HasSPE ? &PPC::SPE4RCRegClass : &PPC::F4RCRegClass) : + (VT == MVT::f32 ? (HasSPE ? &PPC::GPRCRegClass : &PPC::F4RCRegClass) : (VT == MVT::i64 ? &PPC::G8RC_and_G8RC_NOX0RegClass : &PPC::GPRC_and_GPRC_NOR0RegClass))))); @@ -612,7 +612,7 @@ bool PPCFastISel::SelectLoad(const Instruction *I) { const TargetRegisterClass *RC = AssignedReg ? MRI.getRegClass(AssignedReg) : nullptr; - unsigned ResultReg = 0; + Register ResultReg = 0; if (!PPCEmitLoad(VT, ResultReg, Addr, RC, true, PPCSubTarget->hasSPE() ? PPC::EVLDD : PPC::LFD)) return false; @@ -989,7 +989,7 @@ bool PPCFastISel::SelectFPTrunc(const Instruction *I) { unsigned DestReg; auto RC = MRI.getRegClass(SrcReg); if (PPCSubTarget->hasSPE()) { - DestReg = createResultReg(&PPC::SPE4RCRegClass); + DestReg = createResultReg(&PPC::GPRCRegClass); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::EFSCFD), DestReg) .addReg(SrcReg); @@ -1051,7 +1051,7 @@ unsigned PPCFastISel::PPCMoveToFPReg(MVT SrcVT, unsigned SrcReg, } const TargetRegisterClass *RC = &PPC::F8RCRegClass; - unsigned ResultReg = 0; + Register ResultReg = 0; if (!PPCEmitLoad(MVT::f64, ResultReg, Addr, RC, !IsSigned, LoadOpc)) return 0; @@ -1176,7 +1176,7 @@ unsigned PPCFastISel::PPCMoveToIntReg(const Instruction *I, MVT VT, const TargetRegisterClass *RC = AssignedReg ? 
MRI.getRegClass(AssignedReg) : nullptr; - unsigned ResultReg = 0; + Register ResultReg = 0; if (!PPCEmitLoad(VT, ResultReg, Addr, RC, !IsSigned)) return 0; @@ -1229,9 +1229,9 @@ bool PPCFastISel::SelectFPToI(const Instruction *I, bool IsSigned) { if (PPCSubTarget->hasSPE()) { DestReg = createResultReg(&PPC::GPRCRegClass); if (IsSigned) - Opc = InRC == &PPC::SPE4RCRegClass ? PPC::EFSCTSIZ : PPC::EFDCTSIZ; + Opc = InRC == &PPC::GPRCRegClass ? PPC::EFSCTSIZ : PPC::EFDCTSIZ; else - Opc = InRC == &PPC::SPE4RCRegClass ? PPC::EFSCTUIZ : PPC::EFDCTUIZ; + Opc = InRC == &PPC::GPRCRegClass ? PPC::EFSCTUIZ : PPC::EFDCTUIZ; } else if (isVSFRCRegClass(RC)) { DestReg = createResultReg(&PPC::VSFRCRegClass); if (DstVT == MVT::i32) @@ -1717,7 +1717,7 @@ bool PPCFastISel::SelectRet(const Instruction *I) { if (const ConstantInt *CI = dyn_cast(RV)) { CCValAssign &VA = ValLocs[0]; - unsigned RetReg = VA.getLocReg(); + Register RetReg = VA.getLocReg(); // We still need to worry about properly extending the sign. For example, // we could have only a single bit or a constant that needs zero // extension rather than sign extension. Make sure we pass the return @@ -2002,7 +2002,7 @@ unsigned PPCFastISel::PPCMaterializeFP(const ConstantFP *CFP, MVT VT) { const bool HasSPE = PPCSubTarget->hasSPE(); const TargetRegisterClass *RC; if (HasSPE) - RC = ((VT == MVT::f32) ? &PPC::SPE4RCRegClass : &PPC::SPERCRegClass); + RC = ((VT == MVT::f32) ? &PPC::GPRCRegClass : &PPC::SPERCRegClass); else RC = ((VT == MVT::f32) ? &PPC::F4RCRegClass : &PPC::F8RCRegClass); @@ -2031,8 +2031,8 @@ unsigned PPCFastISel::PPCMaterializeFP(const ConstantFP *CFP, MVT VT) { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), DestReg) .addImm(0).addReg(TmpReg).addMemOperand(MMO); } else { - // Otherwise we generate LF[SD](Idx[lo], ADDIStocHA(X2, Idx)). - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::ADDIStocHA), + // Otherwise we generate LF[SD](Idx[lo], ADDIStocHA8(X2, Idx)). + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::ADDIStocHA8), TmpReg).addReg(PPC::X2).addConstantPoolIndex(Idx); // But for large code model, we must generate a LDtocL followed // by the LF[SD]. 
@@ -2085,16 +2085,15 @@ unsigned PPCFastISel::PPCMaterializeGV(const GlobalValue *GV, MVT VT) { // or externally available linkage, a non-local function address, or a // jump table address (not yet needed), or if we are generating code // for large code model, we generate: - // LDtocL(GV, ADDIStocHA(%x2, GV)) + // LDtocL(GV, ADDIStocHA8(%x2, GV)) // Otherwise we generate: - // ADDItocL(ADDIStocHA(%x2, GV), GV) - // Either way, start with the ADDIStocHA: + // ADDItocL(ADDIStocHA8(%x2, GV), GV) + // Either way, start with the ADDIStocHA8: unsigned HighPartReg = createResultReg(RC); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::ADDIStocHA), + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::ADDIStocHA8), HighPartReg).addReg(PPC::X2).addGlobalAddress(GV); - unsigned char GVFlags = PPCSubTarget->classifyGlobalReference(GV); - if (GVFlags & PPCII::MO_NLP_FLAG) { + if (PPCSubTarget->isGVIndirectSymbol(GV)) { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::LDtocL), DestReg).addGlobalAddress(GV).addReg(HighPartReg); } else { @@ -2353,7 +2352,7 @@ bool PPCFastISel::tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo, if (!PPCComputeAddress(LI->getOperand(0), Addr)) return false; - unsigned ResultReg = MI->getOperand(0).getReg(); + Register ResultReg = MI->getOperand(0).getReg(); if (!PPCEmitLoad(VT, ResultReg, Addr, nullptr, IsZExt, PPCSubTarget->hasSPE() ? PPC::EVLDD : PPC::LFD)) @@ -2464,7 +2463,7 @@ namespace llvm { const TargetLibraryInfo *LibInfo) { // Only available on 64-bit ELF for now. const PPCSubtarget &Subtarget = FuncInfo.MF->getSubtarget(); - if (Subtarget.isPPC64() && Subtarget.isSVR4ABI()) + if (Subtarget.is64BitELFABI()) return new PPCFastISel(FuncInfo, LibInfo); return nullptr; } diff --git a/lib/Target/PowerPC/PPCFrameLowering.cpp b/lib/Target/PowerPC/PPCFrameLowering.cpp index ebfb1ef7f49b..06a4d183e781 100644 --- a/lib/Target/PowerPC/PPCFrameLowering.cpp +++ b/lib/Target/PowerPC/PPCFrameLowering.cpp @@ -47,13 +47,15 @@ static const MCPhysReg VRRegNo[] = { }; static unsigned computeReturnSaveOffset(const PPCSubtarget &STI) { - if (STI.isDarwinABI()) + if (STI.isDarwinABI() || STI.isAIXABI()) return STI.isPPC64() ? 16 : 8; // SVR4 ABI: return STI.isPPC64() ? 16 : 4; } static unsigned computeTOCSaveOffset(const PPCSubtarget &STI) { + if (STI.isAIXABI()) + return STI.isPPC64() ? 40 : 20; return STI.isELFv2ABI() ? 24 : 40; } @@ -88,6 +90,11 @@ static unsigned computeBasePointerSaveOffset(const PPCSubtarget &STI) { : STI.getTargetMachine().isPositionIndependent() ? -12U : -8U; } +static unsigned computeCRSaveOffset() { + // The condition register save offset needs to be updated for AIX PPC32. + return 8; +} + PPCFrameLowering::PPCFrameLowering(const PPCSubtarget &STI) : TargetFrameLowering(TargetFrameLowering::StackGrowsDown, STI.getPlatformStackAlignment(), 0), @@ -95,7 +102,8 @@ PPCFrameLowering::PPCFrameLowering(const PPCSubtarget &STI) TOCSaveOffset(computeTOCSaveOffset(Subtarget)), FramePointerSaveOffset(computeFramePointerSaveOffset(Subtarget)), LinkageSize(computeLinkageSize(Subtarget)), - BasePointerSaveOffset(computeBasePointerSaveOffset(STI)) {} + BasePointerSaveOffset(computeBasePointerSaveOffset(Subtarget)), + CRSaveOffset(computeCRSaveOffset()) {} // With the SVR4 ABI, callee-saved registers have fixed offsets on the stack. 
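// A minimal standalone sketch collapsing the ABI-dependent link-area offsets
// from the compute* helpers above into one table (the enum is a stand-in for
// the real subtarget queries; the CR save offset stays the fixed 8 noted
// above).
enum class PPCABIKind { SVR4_32, ELFv1_64, ELFv2_64, AIX32, AIX64, Darwin32, Darwin64 };

unsigned returnSaveOffset(PPCABIKind ABI) {
  switch (ABI) {
  case PPCABIKind::SVR4_32:                           return 4;
  case PPCABIKind::AIX32: case PPCABIKind::Darwin32:  return 8;
  default:                                            return 16; // all 64-bit ABIs
  }
}

unsigned tocSaveOffset(PPCABIKind ABI) {
  switch (ABI) {
  case PPCABIKind::AIX32:    return 20;
  case PPCABIKind::AIX64:    return 40;
  case PPCABIKind::ELFv2_64: return 24;
  default:                   return 40; // ELFv1 and the remaining cases
  }
}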
const PPCFrameLowering::SpillSlot *PPCFrameLowering::getCalleeSavedSpillSlots( @@ -370,8 +378,8 @@ static void HandleVRSaveUpdate(MachineInstr &MI, const TargetInstrInfo &TII) { return; } - unsigned SrcReg = MI.getOperand(1).getReg(); - unsigned DstReg = MI.getOperand(0).getReg(); + Register SrcReg = MI.getOperand(1).getReg(); + Register DstReg = MI.getOperand(0).getReg(); if ((UsedRegMask & 0xFFFF) == UsedRegMask) { if (DstReg != SrcReg) @@ -781,15 +789,18 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF, bool isPPC64 = Subtarget.isPPC64(); // Get the ABI. bool isSVR4ABI = Subtarget.isSVR4ABI(); + bool isAIXABI = Subtarget.isAIXABI(); bool isELFv2ABI = Subtarget.isELFv2ABI(); - assert((Subtarget.isDarwinABI() || isSVR4ABI) && - "Currently only Darwin and SVR4 ABIs are supported for PowerPC."); + assert((Subtarget.isDarwinABI() || isSVR4ABI || isAIXABI) && + "Unsupported PPC ABI."); // Scan the prolog, looking for an UPDATE_VRSAVE instruction. If we find it, // process it. if (!isSVR4ABI) for (unsigned i = 0; MBBI != MBB.end(); ++i, ++MBBI) { if (MBBI->getOpcode() == PPC::UPDATE_VRSAVE) { + if (isAIXABI) + report_fatal_error("UPDATE_VRSAVE is unexpected on AIX."); HandleVRSaveUpdate(*MBBI, TII); break; } @@ -819,7 +830,7 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF, bool HasRedZone = isPPC64 || !isSVR4ABI; unsigned SPReg = isPPC64 ? PPC::X1 : PPC::R1; - unsigned BPReg = RegInfo->getBaseRegister(MF); + Register BPReg = RegInfo->getBaseRegister(MF); unsigned FPReg = isPPC64 ? PPC::X31 : PPC::R31; unsigned LRReg = isPPC64 ? PPC::LR8 : PPC::LR; unsigned TOCReg = isPPC64 ? PPC::X2 : PPC::R2; @@ -908,6 +919,9 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF, assert((isPPC64 || !MustSaveCR) && "Prologue CR saving supported only in 64-bit mode"); + if (MustSaveCR && isAIXABI) + report_fatal_error("Prologue CR saving is unimplemented on AIX."); + // Check if we can move the stack update instruction (stdu) down the prologue // past the callee saves. Hopefully this will avoid the situation where the // saves are waiting for the update on the store with update to complete. @@ -966,7 +980,7 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF, MIB.addReg(MustSaveCRs[i], CrState); BuildMI(MBB, MBBI, dl, TII.get(PPC::STW8)) .addReg(TempReg, getKillRegState(true)) - .addImm(8) + .addImm(getCRSaveOffset()) .addReg(SPReg); } @@ -1020,7 +1034,7 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF, assert(HasRedZone && "A red zone is always available on PPC64"); BuildMI(MBB, MBBI, dl, TII.get(PPC::STW8)) .addReg(TempReg, getKillRegState(true)) - .addImm(8) + .addImm(getCRSaveOffset()) .addReg(SPReg); } @@ -1324,7 +1338,7 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF, // actually saved gets its own CFI record. unsigned CRReg = isELFv2ABI? Reg : (unsigned) PPC::CR2; unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createOffset( - nullptr, MRI->getDwarfRegNum(CRReg, true), 8)); + nullptr, MRI->getDwarfRegNum(CRReg, true), getCRSaveOffset())); BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION)) .addCFIIndex(CFIIndex); continue; @@ -1387,7 +1401,7 @@ void PPCFrameLowering::emitEpilogue(MachineFunction &MF, bool HasRedZone = Subtarget.isPPC64() || !Subtarget.isSVR4ABI(); unsigned SPReg = isPPC64 ? PPC::X1 : PPC::R1; - unsigned BPReg = RegInfo->getBaseRegister(MF); + Register BPReg = RegInfo->getBaseRegister(MF); unsigned FPReg = isPPC64 ? PPC::X31 : PPC::R31; unsigned ScratchReg = 0; unsigned TempReg = isPPC64 ? 
PPC::X12 : PPC::R12; // another scratch reg @@ -1590,7 +1604,7 @@ void PPCFrameLowering::emitEpilogue(MachineFunction &MF, // is live here. assert(HasRedZone && "Expecting red zone"); BuildMI(MBB, MBBI, dl, TII.get(PPC::LWZ8), TempReg) - .addImm(8) + .addImm(getCRSaveOffset()) .addReg(SPReg); for (unsigned i = 0, e = MustSaveCRs.size(); i != e; ++i) BuildMI(MBB, MBBI, dl, TII.get(PPC::MTOCRF8), MustSaveCRs[i]) @@ -1614,7 +1628,7 @@ void PPCFrameLowering::emitEpilogue(MachineFunction &MF, assert(isPPC64 && "Expecting 64-bit mode"); assert(RBReg == SPReg && "Should be using SP as a base register"); BuildMI(MBB, MBBI, dl, TII.get(PPC::LWZ8), TempReg) - .addImm(8) + .addImm(getCRSaveOffset()) .addReg(RBReg); } @@ -1762,8 +1776,8 @@ void PPCFrameLowering::determineCalleeSaves(MachineFunction &MF, // Save R31 if necessary int FPSI = FI->getFramePointerSaveIndex(); - bool isPPC64 = Subtarget.isPPC64(); - bool isDarwinABI = Subtarget.isDarwinABI(); + const bool isPPC64 = Subtarget.isPPC64(); + const bool IsDarwinABI = Subtarget.isDarwinABI(); MachineFrameInfo &MFI = MF.getFrameInfo(); // If the frame pointer save index hasn't been defined yet. @@ -1812,7 +1826,7 @@ void PPCFrameLowering::determineCalleeSaves(MachineFunction &MF, // For 32-bit SVR4, allocate the nonvolatile CR spill slot iff the // function uses CR 2, 3, or 4. - if (!isPPC64 && !isDarwinABI && + if (!isPPC64 && !IsDarwinABI && (SavedRegs.test(PPC::CR2) || SavedRegs.test(PPC::CR3) || SavedRegs.test(PPC::CR4))) { @@ -1872,8 +1886,7 @@ void PPCFrameLowering::processFunctionBeforeFrameFinalized(MachineFunction &MF, assert((!MF.getInfo()->mustSaveTOC() || (Reg != PPC::X2 && Reg != PPC::R2)) && "Not expecting to try to spill R2 in a function that must save TOC"); - if (PPC::GPRCRegClass.contains(Reg) || - PPC::SPE4RCRegClass.contains(Reg)) { + if (PPC::GPRCRegClass.contains(Reg)) { HasGPSaveArea = true; GPRegs.push_back(CSI[i]); @@ -1967,7 +1980,7 @@ void PPCFrameLowering::processFunctionBeforeFrameFinalized(MachineFunction &MF, assert(FI && "No Base Pointer Save Slot!"); MFI.setObjectOffset(FI, LowerBound + MFI.getObjectOffset(FI)); - unsigned BP = RegInfo->getBaseRegister(MF); + Register BP = RegInfo->getBaseRegister(MF); if (PPC::G8RCRegClass.contains(BP)) { MinG8R = std::min(MinG8R, BP); HasG8SaveArea = true; @@ -2428,6 +2441,26 @@ PPCFrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB, return true; } +unsigned PPCFrameLowering::getTOCSaveOffset() const { + if (Subtarget.isAIXABI()) + // TOC save/restore is normally handled by the linker. + // Indirect calls should hit this limitation. 
+ report_fatal_error("TOC save is not implemented on AIX yet."); + return TOCSaveOffset; +} + +unsigned PPCFrameLowering::getFramePointerSaveOffset() const { + if (Subtarget.isAIXABI()) + report_fatal_error("FramePointer is not implemented on AIX yet."); + return FramePointerSaveOffset; +} + +unsigned PPCFrameLowering::getBasePointerSaveOffset() const { + if (Subtarget.isAIXABI()) + report_fatal_error("BasePointer is not implemented on AIX yet."); + return BasePointerSaveOffset; +} + bool PPCFrameLowering::enableShrinkWrapping(const MachineFunction &MF) const { if (MF.getInfo()->shrinkWrapDisabled()) return false; diff --git a/lib/Target/PowerPC/PPCFrameLowering.h b/lib/Target/PowerPC/PPCFrameLowering.h index d116e9fd22e1..a5fbc9acbb28 100644 --- a/lib/Target/PowerPC/PPCFrameLowering.h +++ b/lib/Target/PowerPC/PPCFrameLowering.h @@ -26,6 +26,7 @@ class PPCFrameLowering: public TargetFrameLowering { const unsigned FramePointerSaveOffset; const unsigned LinkageSize; const unsigned BasePointerSaveOffset; + const unsigned CRSaveOffset; /** * Find register[s] that can be used in function prologue and epilogue @@ -142,15 +143,19 @@ public: /// getTOCSaveOffset - Return the previous frame offset to save the /// TOC register -- 64-bit SVR4 ABI only. - unsigned getTOCSaveOffset() const { return TOCSaveOffset; } + unsigned getTOCSaveOffset() const; /// getFramePointerSaveOffset - Return the previous frame offset to save the /// frame pointer. - unsigned getFramePointerSaveOffset() const { return FramePointerSaveOffset; } + unsigned getFramePointerSaveOffset() const; /// getBasePointerSaveOffset - Return the previous frame offset to save the /// base pointer. - unsigned getBasePointerSaveOffset() const { return BasePointerSaveOffset; } + unsigned getBasePointerSaveOffset() const; + + /// getCRSaveOffset - Return the previous frame offset to save the + /// CR register. + unsigned getCRSaveOffset() const { return CRSaveOffset; } /// getLinkageSize - Return the size of the PowerPC ABI linkage area. /// diff --git a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp index 543cac075f55..4ad6c88233fe 100644 --- a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp +++ b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp @@ -371,7 +371,7 @@ void PPCDAGToDAGISel::InsertVRSaveCode(MachineFunction &Fn) { // by the scheduler. Detect them now. bool HasVectorVReg = false; for (unsigned i = 0, e = RegInfo->getNumVirtRegs(); i != e; ++i) { - unsigned Reg = TargetRegisterInfo::index2VirtReg(i); + unsigned Reg = Register::index2VirtReg(i); if (RegInfo->getRegClass(Reg) == &PPC::VRRCRegClass) { HasVectorVReg = true; break; @@ -391,8 +391,8 @@ void PPCDAGToDAGISel::InsertVRSaveCode(MachineFunction &Fn) { // Create two vregs - one to hold the VRSAVE register that is live-in to the // function and one for the value after having bits or'd into it. 
- unsigned InVRSAVE = RegInfo->createVirtualRegister(&PPC::GPRCRegClass); - unsigned UpdatedVRSAVE = RegInfo->createVirtualRegister(&PPC::GPRCRegClass); + Register InVRSAVE = RegInfo->createVirtualRegister(&PPC::GPRCRegClass); + Register UpdatedVRSAVE = RegInfo->createVirtualRegister(&PPC::GPRCRegClass); const TargetInstrInfo &TII = *PPCSubTarget->getInstrInfo(); MachineBasicBlock &EntryBB = *Fn.begin(); @@ -447,7 +447,7 @@ SDNode *PPCDAGToDAGISel::getGlobalBaseReg() { } else { BuildMI(FirstMBB, MBBI, dl, TII.get(PPC::MovePCtoLR)); BuildMI(FirstMBB, MBBI, dl, TII.get(PPC::MFLR), GlobalBaseReg); - unsigned TempReg = RegInfo->createVirtualRegister(&PPC::GPRCRegClass); + Register TempReg = RegInfo->createVirtualRegister(&PPC::GPRCRegClass); BuildMI(FirstMBB, MBBI, dl, TII.get(PPC::UpdateGBR), GlobalBaseReg) .addReg(TempReg, RegState::Define).addReg(GlobalBaseReg); @@ -5065,52 +5065,95 @@ void PPCDAGToDAGISel::Select(SDNode *N) { return; } case PPCISD::TOC_ENTRY: { - assert ((PPCSubTarget->isPPC64() || PPCSubTarget->isSVR4ABI()) && - "Only supported for 64-bit ABI and 32-bit SVR4"); - if (PPCSubTarget->isSVR4ABI() && !PPCSubTarget->isPPC64()) { - SDValue GA = N->getOperand(0); - SDNode *MN = CurDAG->getMachineNode(PPC::LWZtoc, dl, MVT::i32, GA, - N->getOperand(1)); - transferMemOperands(N, MN); - ReplaceNode(N, MN); - return; - } + const bool isPPC64 = PPCSubTarget->isPPC64(); + const bool isELFABI = PPCSubTarget->isSVR4ABI(); + const bool isAIXABI = PPCSubTarget->isAIXABI(); + + assert(!PPCSubTarget->isDarwin() && "TOC is an ELF/XCOFF construct"); + + // PowerPC only support small, medium and large code model. + const CodeModel::Model CModel = TM.getCodeModel(); + assert(!(CModel == CodeModel::Tiny || CModel == CodeModel::Kernel) && + "PowerPC doesn't support tiny or kernel code models."); - // For medium and large code model, we generate two instructions as - // described below. Otherwise we allow SelectCodeCommon to handle this, + if (isAIXABI && CModel == CodeModel::Medium) + report_fatal_error("Medium code model is not supported on AIX."); + + // For 64-bit small code model, we allow SelectCodeCommon to handle this, // selecting one of LDtoc, LDtocJTI, LDtocCPT, and LDtocBA. - CodeModel::Model CModel = TM.getCodeModel(); - if (CModel != CodeModel::Medium && CModel != CodeModel::Large) + if (isPPC64 && CModel == CodeModel::Small) break; - // The first source operand is a TargetGlobalAddress or a TargetJumpTable. - // If it must be toc-referenced according to PPCSubTarget, we generate: - // LDtocL(@sym, ADDIStocHA(%x2, @sym)) + // Handle 32-bit small code model. + if (!isPPC64) { + // Transforms the ISD::TOC_ENTRY node to a PPCISD::LWZtoc. + auto replaceWithLWZtoc = [this, &dl](SDNode *TocEntry) { + SDValue GA = TocEntry->getOperand(0); + SDValue TocBase = TocEntry->getOperand(1); + SDNode *MN = CurDAG->getMachineNode(PPC::LWZtoc, dl, MVT::i32, GA, + TocBase); + transferMemOperands(TocEntry, MN); + ReplaceNode(TocEntry, MN); + }; + + if (isELFABI) { + assert(TM.isPositionIndependent() && + "32-bit ELF can only have TOC entries in position independent" + " code."); + // 32-bit ELF always uses a small code model toc access. 
+ replaceWithLWZtoc(N); + return; + } + + if (isAIXABI && CModel == CodeModel::Small) { + replaceWithLWZtoc(N); + return; + } + } + + assert(CModel != CodeModel::Small && "All small code models handled."); + + assert((isPPC64 || (isAIXABI && !isPPC64)) && "We are dealing with 64-bit" + " ELF/AIX or 32-bit AIX in the following."); + + // Transforms the ISD::TOC_ENTRY node for 32-bit AIX large code model mode + // or 64-bit medium (ELF-only) or large (ELF and AIX) code model code. We + // generate two instructions as described below. The first source operand + // is a symbol reference. If it must be toc-referenced according to + // PPCSubTarget, we generate: + // [32-bit AIX] + // LWZtocL(@sym, ADDIStocHA(%r2, @sym)) + // [64-bit ELF/AIX] + // LDtocL(@sym, ADDIStocHA8(%x2, @sym)) // Otherwise we generate: - // ADDItocL(ADDIStocHA(%x2, @sym), @sym) + // ADDItocL(ADDIStocHA8(%x2, @sym), @sym) SDValue GA = N->getOperand(0); SDValue TOCbase = N->getOperand(1); - SDNode *Tmp = CurDAG->getMachineNode(PPC::ADDIStocHA, dl, MVT::i64, - TOCbase, GA); + + EVT VT = isPPC64 ? MVT::i64 : MVT::i32; + SDNode *Tmp = CurDAG->getMachineNode( + isPPC64 ? PPC::ADDIStocHA8 : PPC::ADDIStocHA, dl, VT, TOCbase, GA); + if (PPCLowering->isAccessedAsGotIndirect(GA)) { - // If it is access as got-indirect, we need an extra LD to load + // If it is accessed as got-indirect, we need an extra LWZ/LD to load // the address. - SDNode *MN = CurDAG->getMachineNode(PPC::LDtocL, dl, MVT::i64, GA, - SDValue(Tmp, 0)); + SDNode *MN = CurDAG->getMachineNode( + isPPC64 ? PPC::LDtocL : PPC::LWZtocL, dl, VT, GA, SDValue(Tmp, 0)); + transferMemOperands(N, MN); ReplaceNode(N, MN); return; } - // Build the address relative to the TOC-pointer.. + // Build the address relative to the TOC-pointer. ReplaceNode(N, CurDAG->getMachineNode(PPC::ADDItocL, dl, MVT::i64, SDValue(Tmp, 0), GA)); return; } case PPCISD::PPC32_PICGOT: // Generate a PIC-safe GOT reference. - assert(!PPCSubTarget->isPPC64() && PPCSubTarget->isSVR4ABI() && - "PPCISD::PPC32_PICGOT is only supported for 32-bit SVR4"); + assert(PPCSubTarget->is32BitELFABI() && + "PPCISD::PPC32_PICGOT is only supported for 32-bit SVR4"); CurDAG->SelectNodeTo(N, PPC::PPC32PICGOT, PPCLowering->getPointerTy(CurDAG->getDataLayout()), MVT::i32); @@ -6456,7 +6499,7 @@ void PPCDAGToDAGISel::PeepholePPC64() { continue; if (!HBase.isMachineOpcode() || - HBase.getMachineOpcode() != PPC::ADDIStocHA) + HBase.getMachineOpcode() != PPC::ADDIStocHA8) continue; if (!Base.hasOneUse() || !HBase.hasOneUse()) diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp index 24d50074860d..8cf6a660b08b 100644 --- a/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/lib/Target/PowerPC/PPCISelLowering.cpp @@ -139,13 +139,13 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, // On PPC32/64, arguments smaller than 4/8 bytes are extended, so all // arguments are at least 4/8 bytes aligned. bool isPPC64 = Subtarget.isPPC64(); - setMinStackArgumentAlignment(isPPC64 ? 8:4); + setMinStackArgumentAlignment(isPPC64 ? Align(8) : Align(4)); // Set up the register classes. 
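// A minimal standalone sketch of the TOC_ENTRY selection decision after this
// hunk, returning the emitted sequence as text for illustration (the real
// code builds machine nodes; 32-bit medium/large here only arises for AIX).
#include <string>

std::string selectTOCEntry(bool IsPPC64, bool MediumOrLargeCM, bool GotIndirect) {
  if (!MediumOrLargeCM)
    // 64-bit small falls through to LDtoc/LDtocJTI/LDtocCPT/LDtocBA matching;
    // 32-bit small (PIC ELF, or AIX) becomes a single LWZtoc.
    return IsPPC64 ? "LDtoc-family @sym(%x2)" : "LWZtoc @sym(%r2)";
  // Medium/large: high-part ADDIS, then either a TOC load for got-indirect
  // symbols or a low-part ADDI.
  std::string Hi = IsPPC64 ? "ADDIStocHA8 %x2, @sym" : "ADDIStocHA %r2, @sym";
  if (GotIndirect)
    return std::string(IsPPC64 ? "LDtocL" : "LWZtocL") + " @sym, (" + Hi + ")";
  return "ADDItocL (" + Hi + "), @sym";
}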
addRegisterClass(MVT::i32, &PPC::GPRCRegClass); if (!useSoftFloat()) { if (hasSPE()) { - addRegisterClass(MVT::f32, &PPC::SPE4RCRegClass); + addRegisterClass(MVT::f32, &PPC::GPRCRegClass); addRegisterClass(MVT::f64, &PPC::SPERCRegClass); } else { addRegisterClass(MVT::f32, &PPC::F4RCRegClass); @@ -431,28 +431,26 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, // VASTART needs to be custom lowered to use the VarArgsFrameIndex setOperationAction(ISD::VASTART , MVT::Other, Custom); - if (Subtarget.isSVR4ABI()) { - if (isPPC64) { - // VAARG always uses double-word chunks, so promote anything smaller. - setOperationAction(ISD::VAARG, MVT::i1, Promote); - AddPromotedToType (ISD::VAARG, MVT::i1, MVT::i64); - setOperationAction(ISD::VAARG, MVT::i8, Promote); - AddPromotedToType (ISD::VAARG, MVT::i8, MVT::i64); - setOperationAction(ISD::VAARG, MVT::i16, Promote); - AddPromotedToType (ISD::VAARG, MVT::i16, MVT::i64); - setOperationAction(ISD::VAARG, MVT::i32, Promote); - AddPromotedToType (ISD::VAARG, MVT::i32, MVT::i64); - setOperationAction(ISD::VAARG, MVT::Other, Expand); - } else { - // VAARG is custom lowered with the 32-bit SVR4 ABI. - setOperationAction(ISD::VAARG, MVT::Other, Custom); - setOperationAction(ISD::VAARG, MVT::i64, Custom); - } + if (Subtarget.is64BitELFABI()) { + // VAARG always uses double-word chunks, so promote anything smaller. + setOperationAction(ISD::VAARG, MVT::i1, Promote); + AddPromotedToType(ISD::VAARG, MVT::i1, MVT::i64); + setOperationAction(ISD::VAARG, MVT::i8, Promote); + AddPromotedToType(ISD::VAARG, MVT::i8, MVT::i64); + setOperationAction(ISD::VAARG, MVT::i16, Promote); + AddPromotedToType(ISD::VAARG, MVT::i16, MVT::i64); + setOperationAction(ISD::VAARG, MVT::i32, Promote); + AddPromotedToType(ISD::VAARG, MVT::i32, MVT::i64); + setOperationAction(ISD::VAARG, MVT::Other, Expand); + } else if (Subtarget.is32BitELFABI()) { + // VAARG is custom lowered with the 32-bit SVR4 ABI. + setOperationAction(ISD::VAARG, MVT::Other, Custom); + setOperationAction(ISD::VAARG, MVT::i64, Custom); } else setOperationAction(ISD::VAARG, MVT::Other, Expand); - if (Subtarget.isSVR4ABI() && !isPPC64) - // VACOPY is custom lowered with the 32-bit SVR4 ABI. + // VACOPY is custom lowered with the 32-bit SVR4 ABI. + if (Subtarget.is32BitELFABI()) setOperationAction(ISD::VACOPY , MVT::Other, Custom); else setOperationAction(ISD::VACOPY , MVT::Other, Expand); @@ -553,17 +551,25 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, if (Subtarget.hasAltivec()) { // First set operation action for all vector types to expand. Then we // will selectively turn on ones that can be effectively codegen'd. - for (MVT VT : MVT::vector_valuetypes()) { + for (MVT VT : MVT::fixedlen_vector_valuetypes()) { // add/sub are legal for all supported vector VT's. setOperationAction(ISD::ADD, VT, Legal); setOperationAction(ISD::SUB, VT, Legal); // For v2i64, these are only valid with P8Vector. This is corrected after // the loop. 
- setOperationAction(ISD::SMAX, VT, Legal); - setOperationAction(ISD::SMIN, VT, Legal); - setOperationAction(ISD::UMAX, VT, Legal); - setOperationAction(ISD::UMIN, VT, Legal); + if (VT.getSizeInBits() <= 128 && VT.getScalarSizeInBits() <= 64) { + setOperationAction(ISD::SMAX, VT, Legal); + setOperationAction(ISD::SMIN, VT, Legal); + setOperationAction(ISD::UMAX, VT, Legal); + setOperationAction(ISD::UMIN, VT, Legal); + } + else { + setOperationAction(ISD::SMAX, VT, Expand); + setOperationAction(ISD::SMIN, VT, Expand); + setOperationAction(ISD::UMAX, VT, Expand); + setOperationAction(ISD::UMIN, VT, Expand); + } if (Subtarget.hasVSX()) { setOperationAction(ISD::FMAXNUM, VT, Legal); @@ -646,7 +652,7 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, setOperationAction(ISD::ROTL, VT, Expand); setOperationAction(ISD::ROTR, VT, Expand); - for (MVT InnerVT : MVT::vector_valuetypes()) { + for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) { setTruncStoreAction(VT, InnerVT, Expand); setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand); setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand); @@ -944,7 +950,6 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, setOperationAction(ISD::FP_TO_UINT , MVT::v4f64, Expand); setOperationAction(ISD::FP_ROUND , MVT::v4f32, Legal); - setOperationAction(ISD::FP_ROUND_INREG , MVT::v4f32, Expand); setOperationAction(ISD::FP_EXTEND, MVT::v4f64, Legal); setOperationAction(ISD::FNEG , MVT::v4f64, Legal); @@ -1118,6 +1123,8 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, setTargetDAGCombine(ISD::ANY_EXTEND); setTargetDAGCombine(ISD::TRUNCATE); + setTargetDAGCombine(ISD::VECTOR_SHUFFLE); + if (Subtarget.useCRBits()) { setTargetDAGCombine(ISD::TRUNCATE); @@ -1172,9 +1179,9 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, setJumpIsExpensive(); } - setMinFunctionAlignment(2); + setMinFunctionAlignment(Align(4)); if (Subtarget.isDarwin()) - setPrefFunctionAlignment(4); + setPrefFunctionAlignment(Align(16)); switch (Subtarget.getDarwinDirective()) { default: break; @@ -1191,8 +1198,8 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, case PPC::DIR_PWR7: case PPC::DIR_PWR8: case PPC::DIR_PWR9: - setPrefFunctionAlignment(4); - setPrefLoopAlignment(4); + setPrefLoopAlignment(Align(16)); + setPrefFunctionAlignment(Align(16)); break; } @@ -1352,6 +1359,8 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const { case PPCISD::SExtVElems: return "PPCISD::SExtVElems"; case PPCISD::LXVD2X: return "PPCISD::LXVD2X"; case PPCISD::STXVD2X: return "PPCISD::STXVD2X"; + case PPCISD::LOAD_VEC_BE: return "PPCISD::LOAD_VEC_BE"; + case PPCISD::STORE_VEC_BE: return "PPCISD::STORE_VEC_BE"; case PPCISD::ST_VSR_SCAL_INT: return "PPCISD::ST_VSR_SCAL_INT"; case PPCISD::COND_BRANCH: return "PPCISD::COND_BRANCH"; @@ -1396,7 +1405,8 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const { case PPCISD::EXTRACT_SPE: return "PPCISD::EXTRACT_SPE"; case PPCISD::EXTSWSLI: return "PPCISD::EXTSWSLI"; case PPCISD::LD_VSX_LH: return "PPCISD::LD_VSX_LH"; - case PPCISD::FP_EXTEND_LH: return "PPCISD::FP_EXTEND_LH"; + case PPCISD::FP_EXTEND_HALF: return "PPCISD::FP_EXTEND_HALF"; + case PPCISD::LD_SPLAT: return "PPCISD::LD_SPLAT"; } return nullptr; } @@ -1517,7 +1527,7 @@ bool PPC::isVPKUWUMShuffleMask(ShuffleVectorSDNode *N, unsigned ShuffleKind, bool PPC::isVPKUDUMShuffleMask(ShuffleVectorSDNode *N, unsigned ShuffleKind, SelectionDAG &DAG) { const PPCSubtarget& Subtarget = - 
static_cast(DAG.getSubtarget()); + static_cast(DAG.getSubtarget()); if (!Subtarget.hasP8Vector()) return false; @@ -1769,10 +1779,10 @@ int PPC::isVSLDOIShuffleMask(SDNode *N, unsigned ShuffleKind, /// isSplatShuffleMask - Return true if the specified VECTOR_SHUFFLE operand /// specifies a splat of a single element that is suitable for input to -/// VSPLTB/VSPLTH/VSPLTW. +/// one of the splat operations (VSPLTB/VSPLTH/VSPLTW/XXSPLTW/LXVDSX/etc.). bool PPC::isSplatShuffleMask(ShuffleVectorSDNode *N, unsigned EltSize) { - assert(N->getValueType(0) == MVT::v16i8 && - (EltSize == 1 || EltSize == 2 || EltSize == 4)); + assert(N->getValueType(0) == MVT::v16i8 && isPowerOf2_32(EltSize) && + EltSize <= 8 && "Can only handle 1,2,4,8 byte element sizes"); // The consecutive indices need to specify an element, not part of two // different elements. So abandon ship early if this isn't the case. @@ -2065,10 +2075,11 @@ bool PPC::isXXPERMDIShuffleMask(ShuffleVectorSDNode *N, unsigned &DM, } -/// getVSPLTImmediate - Return the appropriate VSPLT* immediate to splat the -/// specified isSplatShuffleMask VECTOR_SHUFFLE mask. -unsigned PPC::getVSPLTImmediate(SDNode *N, unsigned EltSize, - SelectionDAG &DAG) { +/// getSplatIdxForPPCMnemonics - Return the splat index as a value that is +/// appropriate for PPC mnemonics (which have a big endian bias - namely +/// elements are counted from the left of the vector register). +unsigned PPC::getSplatIdxForPPCMnemonics(SDNode *N, unsigned EltSize, + SelectionDAG &DAG) { ShuffleVectorSDNode *SVOp = cast(N); assert(isSplatShuffleMask(SVOp, EltSize)); if (DAG.getDataLayout().isLittleEndian()) @@ -2667,12 +2678,14 @@ static void setUsesTOCBasePtr(SelectionDAG &DAG) { setUsesTOCBasePtr(DAG.getMachineFunction()); } -static SDValue getTOCEntry(SelectionDAG &DAG, const SDLoc &dl, bool Is64Bit, - SDValue GA) { +SDValue PPCTargetLowering::getTOCEntry(SelectionDAG &DAG, const SDLoc &dl, + SDValue GA) const { + const bool Is64Bit = Subtarget.isPPC64(); EVT VT = Is64Bit ? MVT::i64 : MVT::i32; - SDValue Reg = Is64Bit ? DAG.getRegister(PPC::X2, VT) : - DAG.getNode(PPCISD::GlobalBaseReg, dl, VT); - + SDValue Reg = Is64Bit ? DAG.getRegister(PPC::X2, VT) + : Subtarget.isAIXABI() + ? DAG.getRegister(PPC::R2, VT) + : DAG.getNode(PPCISD::GlobalBaseReg, dl, VT); SDValue Ops[] = { GA, Reg }; return DAG.getMemIntrinsicNode( PPCISD::TOC_ENTRY, dl, DAG.getVTList(VT, MVT::Other), Ops, VT, @@ -2688,10 +2701,10 @@ SDValue PPCTargetLowering::LowerConstantPool(SDValue Op, // 64-bit SVR4 ABI code is always position-independent. // The actual address of the GlobalValue is stored in the TOC. - if (Subtarget.isSVR4ABI() && Subtarget.isPPC64()) { + if (Subtarget.is64BitELFABI()) { setUsesTOCBasePtr(DAG); SDValue GA = DAG.getTargetConstantPool(C, PtrVT, CP->getAlignment(), 0); - return getTOCEntry(DAG, SDLoc(CP), true, GA); + return getTOCEntry(DAG, SDLoc(CP), GA); } unsigned MOHiFlag, MOLoFlag; @@ -2701,7 +2714,7 @@ SDValue PPCTargetLowering::LowerConstantPool(SDValue Op, if (IsPIC && Subtarget.isSVR4ABI()) { SDValue GA = DAG.getTargetConstantPool(C, PtrVT, CP->getAlignment(), PPCII::MO_PIC_FLAG); - return getTOCEntry(DAG, SDLoc(CP), false, GA); + return getTOCEntry(DAG, SDLoc(CP), GA); } SDValue CPIHi = @@ -2764,10 +2777,10 @@ SDValue PPCTargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const { // 64-bit SVR4 ABI code is always position-independent. // The actual address of the GlobalValue is stored in the TOC. 
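A hedged sketch of the arithmetic behind the splat-index helper renamed above (getSplatIdxForPPCMnemonics): PPC mnemonics count elements from the left of the vector register, so on little-endian targets the index taken from the shuffle mask is mirrored within the 16-byte register. FirstMaskElt stands for the first v16i8 mask element; the function name is illustrative, not from the patch.

// Sketch only: endian adjustment for a splat index over a 128-bit register.
static unsigned splatIdxForMnemonic(unsigned FirstMaskElt, unsigned EltSize,
                                    bool IsLittleEndian) {
  unsigned NumElts = 16 / EltSize;        // elements per 16-byte register
  unsigned Idx = FirstMaskElt / EltSize;  // element selected by the splat
  return IsLittleEndian ? NumElts - 1 - Idx : Idx;
}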
- if (Subtarget.isSVR4ABI() && Subtarget.isPPC64()) { + if (Subtarget.is64BitELFABI()) { setUsesTOCBasePtr(DAG); SDValue GA = DAG.getTargetJumpTable(JT->getIndex(), PtrVT); - return getTOCEntry(DAG, SDLoc(JT), true, GA); + return getTOCEntry(DAG, SDLoc(JT), GA); } unsigned MOHiFlag, MOLoFlag; @@ -2777,7 +2790,7 @@ SDValue PPCTargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const { if (IsPIC && Subtarget.isSVR4ABI()) { SDValue GA = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, PPCII::MO_PIC_FLAG); - return getTOCEntry(DAG, SDLoc(GA), false, GA); + return getTOCEntry(DAG, SDLoc(GA), GA); } SDValue JTIHi = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, MOHiFlag); @@ -2793,14 +2806,18 @@ SDValue PPCTargetLowering::LowerBlockAddress(SDValue Op, // 64-bit SVR4 ABI code is always position-independent. // The actual BlockAddress is stored in the TOC. - if (Subtarget.isSVR4ABI() && - (Subtarget.isPPC64() || isPositionIndependent())) { - if (Subtarget.isPPC64()) - setUsesTOCBasePtr(DAG); + if (Subtarget.is64BitELFABI()) { + setUsesTOCBasePtr(DAG); SDValue GA = DAG.getTargetBlockAddress(BA, PtrVT, BASDN->getOffset()); - return getTOCEntry(DAG, SDLoc(BASDN), Subtarget.isPPC64(), GA); + return getTOCEntry(DAG, SDLoc(BASDN), GA); } + // 32-bit position-independent ELF stores the BlockAddress in the .got. + if (Subtarget.is32BitELFABI() && isPositionIndependent()) + return getTOCEntry( + DAG, SDLoc(BASDN), + DAG.getTargetBlockAddress(BA, PtrVT, BASDN->getOffset())); + unsigned MOHiFlag, MOLoFlag; bool IsPIC = isPositionIndependent(); getLabelAccessInfo(IsPIC, Subtarget, MOHiFlag, MOLoFlag); @@ -2913,12 +2930,12 @@ SDValue PPCTargetLowering::LowerGlobalAddress(SDValue Op, SDLoc DL(GSDN); const GlobalValue *GV = GSDN->getGlobal(); - // 64-bit SVR4 ABI code is always position-independent. + // 64-bit SVR4 ABI & AIX ABI code is always position-independent. // The actual address of the GlobalValue is stored in the TOC. - if (Subtarget.isSVR4ABI() && Subtarget.isPPC64()) { + if (Subtarget.is64BitELFABI() || Subtarget.isAIXABI()) { setUsesTOCBasePtr(DAG); SDValue GA = DAG.getTargetGlobalAddress(GV, DL, PtrVT, GSDN->getOffset()); - return getTOCEntry(DAG, DL, true, GA); + return getTOCEntry(DAG, DL, GA); } unsigned MOHiFlag, MOLoFlag; @@ -2929,7 +2946,7 @@ SDValue PPCTargetLowering::LowerGlobalAddress(SDValue Op, SDValue GA = DAG.getTargetGlobalAddress(GV, DL, PtrVT, GSDN->getOffset(), PPCII::MO_PIC_FLAG); - return getTOCEntry(DAG, DL, false, GA); + return getTOCEntry(DAG, DL, GA); } SDValue GAHi = @@ -3235,8 +3252,8 @@ SDValue PPCTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { MachinePointerInfo(SV, nextOffset)); } -/// FPR - The set of FP registers that should be allocated for arguments, -/// on Darwin. +/// FPR - The set of FP registers that should be allocated for arguments +/// on Darwin and AIX. 
static const MCPhysReg FPR[] = {PPC::F1, PPC::F2, PPC::F3, PPC::F4, PPC::F5, PPC::F6, PPC::F7, PPC::F8, PPC::F9, PPC::F10, PPC::F11, PPC::F12, PPC::F13}; @@ -3377,17 +3394,17 @@ SDValue PPCTargetLowering::LowerFormalArguments( SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl &Ins, const SDLoc &dl, SelectionDAG &DAG, SmallVectorImpl &InVals) const { - if (Subtarget.isSVR4ABI()) { - if (Subtarget.isPPC64()) - return LowerFormalArguments_64SVR4(Chain, CallConv, isVarArg, Ins, - dl, DAG, InVals); - else - return LowerFormalArguments_32SVR4(Chain, CallConv, isVarArg, Ins, - dl, DAG, InVals); - } else { - return LowerFormalArguments_Darwin(Chain, CallConv, isVarArg, Ins, - dl, DAG, InVals); - } + if (Subtarget.is64BitELFABI()) + return LowerFormalArguments_64SVR4(Chain, CallConv, isVarArg, Ins, dl, DAG, + InVals); + else if (Subtarget.is32BitELFABI()) + return LowerFormalArguments_32SVR4(Chain, CallConv, isVarArg, Ins, dl, DAG, + InVals); + + // FIXME: We are using this for both AIX and Darwin. We should add appropriate + // AIX testing, and rename it appropriately. + return LowerFormalArguments_Darwin(Chain, CallConv, isVarArg, Ins, dl, DAG, + InVals); } SDValue PPCTargetLowering::LowerFormalArguments_32SVR4( @@ -3467,7 +3484,7 @@ SDValue PPCTargetLowering::LowerFormalArguments_32SVR4( if (Subtarget.hasP8Vector()) RC = &PPC::VSSRCRegClass; else if (Subtarget.hasSPE()) - RC = &PPC::SPE4RCRegClass; + RC = &PPC::GPRCRegClass; else RC = &PPC::F4RCRegClass; break; @@ -4516,7 +4533,7 @@ callsShareTOCBase(const Function *Caller, SDValue Callee, static bool needStackSlotPassParameters(const PPCSubtarget &Subtarget, const SmallVectorImpl &Outs) { - assert(Subtarget.isSVR4ABI() && Subtarget.isPPC64()); + assert(Subtarget.is64BitELFABI()); const unsigned PtrByteSize = 8; const unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize(); @@ -4926,7 +4943,7 @@ PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag, SDValue &Chain, ImmutableCallSite CS, const PPCSubtarget &Subtarget) { bool isPPC64 = Subtarget.isPPC64(); bool isSVR4ABI = Subtarget.isSVR4ABI(); - bool isELFv2ABI = Subtarget.isELFv2ABI(); + bool is64BitELFv1ABI = isPPC64 && isSVR4ABI && !Subtarget.isELFv2ABI(); bool isAIXABI = Subtarget.isAIXABI(); EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout()); @@ -4997,7 +5014,7 @@ PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag, SDValue &Chain, // to do the call, we can't use PPCISD::CALL. SDValue MTCTROps[] = {Chain, Callee, InFlag}; - if (isSVR4ABI && isPPC64 && !isELFv2ABI) { + if (is64BitELFv1ABI) { // Function pointers in the 64-bit SVR4 ABI do not point to the function // entry point, but to the function descriptor (the function entry point // address is part of the function descriptor though). @@ -5085,7 +5102,7 @@ PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag, SDValue &Chain, CallOpc = PPCISD::BCTRL; Callee.setNode(nullptr); // Add use of X11 (holding environment pointer) - if (isSVR4ABI && isPPC64 && !isELFv2ABI && !hasNest) + if (is64BitELFv1ABI && !hasNest) Ops.push_back(DAG.getRegister(PPC::X11, PtrVT)); // Add CTR register as callee so a bctr can be emitted later. if (isTailCall) @@ -6730,8 +6747,12 @@ SDValue PPCTargetLowering::LowerCall_AIX( const unsigned NumGPRs = isPPC64 ? 
array_lengthof(GPR_64) : array_lengthof(GPR_32); + const unsigned NumFPRs = array_lengthof(FPR); + assert(NumFPRs == 13 && "Only FPR 1-13 could be used for parameter passing " + "on AIX"); + const MCPhysReg *GPR = isPPC64 ? GPR_64 : GPR_32; - unsigned GPR_idx = 0; + unsigned GPR_idx = 0, FPR_idx = 0; SmallVector, 8> RegsToPass; @@ -6768,6 +6789,20 @@ SDValue PPCTargetLowering::LowerCall_AIX( break; case MVT::f32: case MVT::f64: + if (FPR_idx != NumFPRs) { + RegsToPass.push_back(std::make_pair(FPR[FPR_idx++], Arg)); + + // If we have any FPRs remaining, we may also have GPRs remaining. + // Args passed in FPRs consume 1 or 2 (f64 in 32 bit mode) available + // GPRs. + if (GPR_idx != NumGPRs) + ++GPR_idx; + if (GPR_idx != NumGPRs && Arg.getValueType() == MVT::f64 && !isPPC64) + ++GPR_idx; + } else + report_fatal_error("Handling of placing parameters on the stack is " + "unimplemented!"); + break; case MVT::v4f32: case MVT::v4i32: case MVT::v8i16: @@ -8152,6 +8187,18 @@ SDValue PPCTargetLowering::LowerBITCAST(SDValue Op, SelectionDAG &DAG) const { Op0.getOperand(1)); } +static const SDValue *getNormalLoadInput(const SDValue &Op) { + const SDValue *InputLoad = &Op; + if (InputLoad->getOpcode() == ISD::BITCAST) + InputLoad = &InputLoad->getOperand(0); + if (InputLoad->getOpcode() == ISD::SCALAR_TO_VECTOR) + InputLoad = &InputLoad->getOperand(0); + if (InputLoad->getOpcode() != ISD::LOAD) + return nullptr; + LoadSDNode *LD = cast(*InputLoad); + return ISD::isNormalLoad(LD) ? InputLoad : nullptr; +} + // If this is a case we can't handle, return null and let the default // expansion code take care of it. If we CAN select this case, and if it // selects to a single instruction, return Op. Otherwise, if we can codegen @@ -8274,6 +8321,34 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op, if (! BVN->isConstantSplat(APSplatBits, APSplatUndef, SplatBitSize, HasAnyUndefs, 0, !Subtarget.isLittleEndian()) || SplatBitSize > 32) { + + const SDValue *InputLoad = getNormalLoadInput(Op.getOperand(0)); + // Handle load-and-splat patterns as we have instructions that will do this + // in one go. + if (InputLoad && DAG.isSplatValue(Op, true)) { + LoadSDNode *LD = cast(*InputLoad); + + // We have handling for 4 and 8 byte elements. + unsigned ElementSize = LD->getMemoryVT().getScalarSizeInBits(); + + // Checking for a single use of this load, we have to check for vector + // width (128 bits) / ElementSize uses (since each operand of the + // BUILD_VECTOR is a separate use of the value. + if (InputLoad->getNode()->hasNUsesOfValue(128 / ElementSize, 0) && + ((Subtarget.hasVSX() && ElementSize == 64) || + (Subtarget.hasP9Vector() && ElementSize == 32))) { + SDValue Ops[] = { + LD->getChain(), // Chain + LD->getBasePtr(), // Ptr + DAG.getValueType(Op.getValueType()) // VT + }; + return + DAG.getMemIntrinsicNode(PPCISD::LD_SPLAT, dl, + DAG.getVTList(Op.getValueType(), MVT::Other), + Ops, LD->getMemoryVT(), LD->getMemOperand()); + } + } + // BUILD_VECTOR nodes that are not constant splats of up to 32-bits can be // lowered to VSX instructions under certain conditions. // Without VSX, there is no pattern more efficient than expanding the node. @@ -8759,6 +8834,45 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, unsigned ShiftElts, InsertAtByte; bool Swap = false; + + // If this is a load-and-splat, we can do that with a single instruction + // in some cases. However if the load has multiple uses, we don't want to + // combine it because that will just produce multiple loads. 
+ const SDValue *InputLoad = getNormalLoadInput(V1); + if (InputLoad && Subtarget.hasVSX() && V2.isUndef() && + (PPC::isSplatShuffleMask(SVOp, 4) || PPC::isSplatShuffleMask(SVOp, 8)) && + InputLoad->hasOneUse()) { + bool IsFourByte = PPC::isSplatShuffleMask(SVOp, 4); + int SplatIdx = + PPC::getSplatIdxForPPCMnemonics(SVOp, IsFourByte ? 4 : 8, DAG); + + LoadSDNode *LD = cast(*InputLoad); + // For 4-byte load-and-splat, we need Power9. + if ((IsFourByte && Subtarget.hasP9Vector()) || !IsFourByte) { + uint64_t Offset = 0; + if (IsFourByte) + Offset = isLittleEndian ? (3 - SplatIdx) * 4 : SplatIdx * 4; + else + Offset = isLittleEndian ? (1 - SplatIdx) * 8 : SplatIdx * 8; + SDValue BasePtr = LD->getBasePtr(); + if (Offset != 0) + BasePtr = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()), + BasePtr, DAG.getIntPtrConstant(Offset, dl)); + SDValue Ops[] = { + LD->getChain(), // Chain + BasePtr, // BasePtr + DAG.getValueType(Op.getValueType()) // VT + }; + SDVTList VTL = + DAG.getVTList(IsFourByte ? MVT::v4i32 : MVT::v2i64, MVT::Other); + SDValue LdSplt = + DAG.getMemIntrinsicNode(PPCISD::LD_SPLAT, dl, VTL, + Ops, LD->getMemoryVT(), LD->getMemOperand()); + if (LdSplt.getValueType() != SVOp->getValueType(0)) + LdSplt = DAG.getBitcast(SVOp->getValueType(0), LdSplt); + return LdSplt; + } + } if (Subtarget.hasP9Vector() && PPC::isXXINSERTWMask(SVOp, ShiftElts, InsertAtByte, Swap, isLittleEndian)) { @@ -8835,7 +8949,7 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, if (Subtarget.hasVSX()) { if (V2.isUndef() && PPC::isSplatShuffleMask(SVOp, 4)) { - int SplatIdx = PPC::getVSPLTImmediate(SVOp, 4, DAG); + int SplatIdx = PPC::getSplatIdxForPPCMnemonics(SVOp, 4, DAG); SDValue Conv = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, V1); SDValue Splat = DAG.getNode(PPCISD::XXSPLT, dl, MVT::v4i32, Conv, @@ -9880,6 +9994,30 @@ SDValue PPCTargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const { switch (Op0.getOpcode()) { default: return SDValue(); + case ISD::EXTRACT_SUBVECTOR: { + assert(Op0.getNumOperands() == 2 && + isa(Op0->getOperand(1)) && + "Node should have 2 operands with second one being a constant!"); + + if (Op0.getOperand(0).getValueType() != MVT::v4f32) + return SDValue(); + + // Custom lower is only done for high or low doubleword. + int Idx = cast(Op0.getOperand(1))->getZExtValue(); + if (Idx % 2 != 0) + return SDValue(); + + // Since input is v4f32, at this point Idx is either 0 or 2. + // Shift to get the doubleword position we want. + int DWord = Idx >> 1; + + // High and low word positions are different on little endian. + if (Subtarget.isLittleEndian()) + DWord ^= 0x1; + + return DAG.getNode(PPCISD::FP_EXTEND_HALF, dl, MVT::v2f64, + Op0.getOperand(0), DAG.getConstant(DWord, dl, MVT::i32)); + } case ISD::FADD: case ISD::FMUL: case ISD::FSUB: { @@ -9891,26 +10029,25 @@ SDValue PPCTargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const { return SDValue(); // Generate new load node. 
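A small sketch of the index selection used in the new EXTRACT_SUBVECTOR case of LowerFP_EXTEND: only even indices (the high or low v2f32 half of a v4f32) are custom lowered, and the chosen doubleword is flipped on little-endian targets. The helper name and the -1 sentinel are illustrative only.

// Sketch only: which doubleword an even extract-subvector index refers to.
static int dwordForExtractIndex(int Idx, bool IsLittleEndian) {
  if (Idx % 2 != 0)
    return -1;            // odd indices fall back to default expansion
  int DWord = Idx >> 1;   // index 0 or 2 becomes doubleword 0 or 1
  if (IsLittleEndian)
    DWord ^= 0x1;         // high and low halves swap places on little-endian
  return DWord;
}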
LoadSDNode *LD = cast(LdOp); - SDValue LoadOps[] = { LD->getChain(), LD->getBasePtr() }; - NewLoad[i] = - DAG.getMemIntrinsicNode(PPCISD::LD_VSX_LH, dl, - DAG.getVTList(MVT::v4f32, MVT::Other), - LoadOps, LD->getMemoryVT(), - LD->getMemOperand()); - } - SDValue NewOp = DAG.getNode(Op0.getOpcode(), SDLoc(Op0), MVT::v4f32, - NewLoad[0], NewLoad[1], - Op0.getNode()->getFlags()); - return DAG.getNode(PPCISD::FP_EXTEND_LH, dl, MVT::v2f64, NewOp); + SDValue LoadOps[] = {LD->getChain(), LD->getBasePtr()}; + NewLoad[i] = DAG.getMemIntrinsicNode( + PPCISD::LD_VSX_LH, dl, DAG.getVTList(MVT::v4f32, MVT::Other), LoadOps, + LD->getMemoryVT(), LD->getMemOperand()); + } + SDValue NewOp = + DAG.getNode(Op0.getOpcode(), SDLoc(Op0), MVT::v4f32, NewLoad[0], + NewLoad[1], Op0.getNode()->getFlags()); + return DAG.getNode(PPCISD::FP_EXTEND_HALF, dl, MVT::v2f64, NewOp, + DAG.getConstant(0, dl, MVT::i32)); } case ISD::LOAD: { LoadSDNode *LD = cast(Op0); - SDValue LoadOps[] = { LD->getChain(), LD->getBasePtr() }; - SDValue NewLd = - DAG.getMemIntrinsicNode(PPCISD::LD_VSX_LH, dl, - DAG.getVTList(MVT::v4f32, MVT::Other), - LoadOps, LD->getMemoryVT(), LD->getMemOperand()); - return DAG.getNode(PPCISD::FP_EXTEND_LH, dl, MVT::v2f64, NewLd); + SDValue LoadOps[] = {LD->getChain(), LD->getBasePtr()}; + SDValue NewLd = DAG.getMemIntrinsicNode( + PPCISD::LD_VSX_LH, dl, DAG.getVTList(MVT::v4f32, MVT::Other), LoadOps, + LD->getMemoryVT(), LD->getMemOperand()); + return DAG.getNode(PPCISD::FP_EXTEND_HALF, dl, MVT::v2f64, NewLd, + DAG.getConstant(0, dl, MVT::i32)); } } llvm_unreachable("ERROR:Should return for all cases within swtich."); @@ -10048,9 +10185,11 @@ void PPCTargetLowering::ReplaceNodeResults(SDNode *N, return; case ISD::TRUNCATE: { EVT TrgVT = N->getValueType(0); + EVT OpVT = N->getOperand(0).getValueType(); if (TrgVT.isVector() && isOperationCustom(N->getOpcode(), TrgVT) && - N->getOperand(0).getValueType().getSizeInBits() <= 128) + OpVT.getSizeInBits() <= 128 && + isPowerOf2_32(OpVT.getVectorElementType().getSizeInBits())) Results.push_back(LowerTRUNCATEVector(SDValue(N, 0), DAG)); return; } @@ -10192,7 +10331,7 @@ PPCTargetLowering::EmitAtomicBinary(MachineInstr &MI, MachineBasicBlock *BB, if (CmpOpcode) { // Signed comparisons of byte or halfword values must be sign-extended. if (CmpOpcode == PPC::CMPW && AtomicSize < 4) { - unsigned ExtReg = RegInfo.createVirtualRegister(&PPC::GPRCRegClass); + Register ExtReg = RegInfo.createVirtualRegister(&PPC::GPRCRegClass); BuildMI(BB, dl, TII->get(AtomicSize == 1 ? PPC::EXTSB : PPC::EXTSH), ExtReg).addReg(dest); BuildMI(BB, dl, TII->get(CmpOpcode), PPC::CR0) @@ -10243,10 +10382,10 @@ MachineBasicBlock *PPCTargetLowering::EmitPartwordAtomicBinary( MachineFunction *F = BB->getParent(); MachineFunction::iterator It = ++BB->getIterator(); - unsigned dest = MI.getOperand(0).getReg(); - unsigned ptrA = MI.getOperand(1).getReg(); - unsigned ptrB = MI.getOperand(2).getReg(); - unsigned incr = MI.getOperand(3).getReg(); + Register dest = MI.getOperand(0).getReg(); + Register ptrA = MI.getOperand(1).getReg(); + Register ptrB = MI.getOperand(2).getReg(); + Register incr = MI.getOperand(3).getReg(); DebugLoc dl = MI.getDebugLoc(); MachineBasicBlock *loopMBB = F->CreateMachineBasicBlock(LLVM_BB); @@ -10364,7 +10503,7 @@ MachineBasicBlock *PPCTargetLowering::EmitPartwordAtomicBinary( if (CmpOpcode) { // For unsigned comparisons, we can directly compare the shifted values. // For signed comparisons we shift and sign extend. 
- unsigned SReg = RegInfo.createVirtualRegister(GPRC); + Register SReg = RegInfo.createVirtualRegister(GPRC); BuildMI(BB, dl, TII->get(PPC::AND), SReg) .addReg(TmpDestReg) .addReg(MaskReg); @@ -10375,7 +10514,7 @@ MachineBasicBlock *PPCTargetLowering::EmitPartwordAtomicBinary( BuildMI(BB, dl, TII->get(PPC::SRW), ValueReg) .addReg(SReg) .addReg(ShiftReg); - unsigned ValueSReg = RegInfo.createVirtualRegister(GPRC); + Register ValueSReg = RegInfo.createVirtualRegister(GPRC); BuildMI(BB, dl, TII->get(is8bit ? PPC::EXTSB : PPC::EXTSH), ValueSReg) .addReg(ValueReg); ValueReg = ValueSReg; @@ -10426,11 +10565,11 @@ PPCTargetLowering::emitEHSjLjSetJmp(MachineInstr &MI, const BasicBlock *BB = MBB->getBasicBlock(); MachineFunction::iterator I = ++MBB->getIterator(); - unsigned DstReg = MI.getOperand(0).getReg(); + Register DstReg = MI.getOperand(0).getReg(); const TargetRegisterClass *RC = MRI.getRegClass(DstReg); assert(TRI->isTypeLegalForClass(*RC, MVT::i32) && "Invalid destination!"); - unsigned mainDstReg = MRI.createVirtualRegister(RC); - unsigned restoreDstReg = MRI.createVirtualRegister(RC); + Register mainDstReg = MRI.createVirtualRegister(RC); + Register restoreDstReg = MRI.createVirtualRegister(RC); MVT PVT = getPointerTy(MF->getDataLayout()); assert((PVT == MVT::i64 || PVT == MVT::i32) && @@ -10482,10 +10621,10 @@ PPCTargetLowering::emitEHSjLjSetJmp(MachineInstr &MI, // Prepare IP either in reg. const TargetRegisterClass *PtrRC = getRegClassFor(PVT); - unsigned LabelReg = MRI.createVirtualRegister(PtrRC); - unsigned BufReg = MI.getOperand(1).getReg(); + Register LabelReg = MRI.createVirtualRegister(PtrRC); + Register BufReg = MI.getOperand(1).getReg(); - if (Subtarget.isPPC64() && Subtarget.isSVR4ABI()) { + if (Subtarget.is64BitELFABI()) { setUsesTOCBasePtr(*MBB->getParent()); MIB = BuildMI(*thisMBB, MI, DL, TII->get(PPC::STD)) .addReg(PPC::X2) @@ -10570,7 +10709,7 @@ PPCTargetLowering::emitEHSjLjLongJmp(MachineInstr &MI, const TargetRegisterClass *RC = (PVT == MVT::i64) ? &PPC::G8RCRegClass : &PPC::GPRCRegClass; - unsigned Tmp = MRI.createVirtualRegister(RC); + Register Tmp = MRI.createVirtualRegister(RC); // Since FP is only updated here but NOT referenced, it's treated as GPR. unsigned FP = (PVT == MVT::i64) ? PPC::X31 : PPC::R31; unsigned SP = (PVT == MVT::i64) ? PPC::X1 : PPC::R1; @@ -10587,7 +10726,7 @@ PPCTargetLowering::emitEHSjLjLongJmp(MachineInstr &MI, const int64_t TOCOffset = 3 * PVT.getStoreSize(); const int64_t BPOffset = 4 * PVT.getStoreSize(); - unsigned BufReg = MI.getOperand(0).getReg(); + Register BufReg = MI.getOperand(0).getReg(); // Reload FP (the jumped-to function may not have had a // frame pointer, and if so, then its r31 will be restored @@ -10662,7 +10801,7 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) const { if (MI.getOpcode() == TargetOpcode::STACKMAP || MI.getOpcode() == TargetOpcode::PATCHPOINT) { - if (Subtarget.isPPC64() && Subtarget.isSVR4ABI() && + if (Subtarget.is64BitELFABI() && MI.getOpcode() == TargetOpcode::PATCHPOINT) { // Call lowering should have added an r2 operand to indicate a dependence // on the TOC base pointer value. 
It can't however, because there is no @@ -10828,15 +10967,15 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, BB = readMBB; MachineRegisterInfo &RegInfo = F->getRegInfo(); - unsigned ReadAgainReg = RegInfo.createVirtualRegister(&PPC::GPRCRegClass); - unsigned LoReg = MI.getOperand(0).getReg(); - unsigned HiReg = MI.getOperand(1).getReg(); + Register ReadAgainReg = RegInfo.createVirtualRegister(&PPC::GPRCRegClass); + Register LoReg = MI.getOperand(0).getReg(); + Register HiReg = MI.getOperand(1).getReg(); BuildMI(BB, dl, TII->get(PPC::MFSPR), HiReg).addImm(269); BuildMI(BB, dl, TII->get(PPC::MFSPR), LoReg).addImm(268); BuildMI(BB, dl, TII->get(PPC::MFSPR), ReadAgainReg).addImm(269); - unsigned CmpReg = RegInfo.createVirtualRegister(&PPC::CRRCRegClass); + Register CmpReg = RegInfo.createVirtualRegister(&PPC::CRRCRegClass); BuildMI(BB, dl, TII->get(PPC::CMPW), CmpReg) .addReg(HiReg) @@ -10978,11 +11117,11 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, StoreMnemonic = PPC::STDCX; break; } - unsigned dest = MI.getOperand(0).getReg(); - unsigned ptrA = MI.getOperand(1).getReg(); - unsigned ptrB = MI.getOperand(2).getReg(); - unsigned oldval = MI.getOperand(3).getReg(); - unsigned newval = MI.getOperand(4).getReg(); + Register dest = MI.getOperand(0).getReg(); + Register ptrA = MI.getOperand(1).getReg(); + Register ptrB = MI.getOperand(2).getReg(); + Register oldval = MI.getOperand(3).getReg(); + Register newval = MI.getOperand(4).getReg(); DebugLoc dl = MI.getDebugLoc(); MachineBasicBlock *loop1MBB = F->CreateMachineBasicBlock(LLVM_BB); @@ -11057,11 +11196,11 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, bool isLittleEndian = Subtarget.isLittleEndian(); bool is8bit = MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I8; - unsigned dest = MI.getOperand(0).getReg(); - unsigned ptrA = MI.getOperand(1).getReg(); - unsigned ptrB = MI.getOperand(2).getReg(); - unsigned oldval = MI.getOperand(3).getReg(); - unsigned newval = MI.getOperand(4).getReg(); + Register dest = MI.getOperand(0).getReg(); + Register ptrA = MI.getOperand(1).getReg(); + Register ptrB = MI.getOperand(2).getReg(); + Register oldval = MI.getOperand(3).getReg(); + Register newval = MI.getOperand(4).getReg(); DebugLoc dl = MI.getDebugLoc(); MachineBasicBlock *loop1MBB = F->CreateMachineBasicBlock(LLVM_BB); @@ -11238,13 +11377,13 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, // This pseudo performs an FADD with rounding mode temporarily forced // to round-to-zero. We emit this via custom inserter since the FPSCR // is not modeled at the SelectionDAG level. - unsigned Dest = MI.getOperand(0).getReg(); - unsigned Src1 = MI.getOperand(1).getReg(); - unsigned Src2 = MI.getOperand(2).getReg(); + Register Dest = MI.getOperand(0).getReg(); + Register Src1 = MI.getOperand(1).getReg(); + Register Src2 = MI.getOperand(2).getReg(); DebugLoc dl = MI.getDebugLoc(); MachineRegisterInfo &RegInfo = F->getRegInfo(); - unsigned MFFSReg = RegInfo.createVirtualRegister(&PPC::F8RCRegClass); + Register MFFSReg = RegInfo.createVirtualRegister(&PPC::F8RCRegClass); // Save FPSCR value. BuildMI(*BB, MI, dl, TII->get(PPC::MFFS), MFFSReg); @@ -11270,7 +11409,7 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, MI.getOpcode() == PPC::ANDIo_1_EQ_BIT8); MachineRegisterInfo &RegInfo = F->getRegInfo(); - unsigned Dest = RegInfo.createVirtualRegister( + Register Dest = RegInfo.createVirtualRegister( Opcode == PPC::ANDIo ? 
&PPC::GPRCRegClass : &PPC::G8RCRegClass); DebugLoc dl = MI.getDebugLoc(); @@ -11283,7 +11422,7 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, } else if (MI.getOpcode() == PPC::TCHECK_RET) { DebugLoc Dl = MI.getDebugLoc(); MachineRegisterInfo &RegInfo = F->getRegInfo(); - unsigned CRReg = RegInfo.createVirtualRegister(&PPC::CRRCRegClass); + Register CRReg = RegInfo.createVirtualRegister(&PPC::CRRCRegClass); BuildMI(*BB, MI, Dl, TII->get(PPC::TCHECK), CRReg); BuildMI(*BB, MI, Dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg()) @@ -11297,7 +11436,7 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, .addReg(PPC::CR0EQ); } else if (MI.getOpcode() == PPC::SETRNDi) { DebugLoc dl = MI.getDebugLoc(); - unsigned OldFPSCRReg = MI.getOperand(0).getReg(); + Register OldFPSCRReg = MI.getOperand(0).getReg(); // Save FPSCR value. BuildMI(*BB, MI, dl, TII->get(PPC::MFFS), OldFPSCRReg); @@ -11378,7 +11517,7 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, } }; - unsigned OldFPSCRReg = MI.getOperand(0).getReg(); + Register OldFPSCRReg = MI.getOperand(0).getReg(); // Save FPSCR value. BuildMI(*BB, MI, dl, TII->get(PPC::MFFS), OldFPSCRReg); @@ -11393,12 +11532,12 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, // mtfsf 255, NewFPSCRReg MachineOperand SrcOp = MI.getOperand(1); MachineRegisterInfo &RegInfo = F->getRegInfo(); - unsigned OldFPSCRTmpReg = RegInfo.createVirtualRegister(&PPC::G8RCRegClass); + Register OldFPSCRTmpReg = RegInfo.createVirtualRegister(&PPC::G8RCRegClass); copyRegFromG8RCOrF8RC(OldFPSCRTmpReg, OldFPSCRReg); - unsigned ImDefReg = RegInfo.createVirtualRegister(&PPC::G8RCRegClass); - unsigned ExtSrcReg = RegInfo.createVirtualRegister(&PPC::G8RCRegClass); + Register ImDefReg = RegInfo.createVirtualRegister(&PPC::G8RCRegClass); + Register ExtSrcReg = RegInfo.createVirtualRegister(&PPC::G8RCRegClass); // The first operand of INSERT_SUBREG should be a register which has // subregisters, we only care about its RegClass, so we should use an @@ -11409,14 +11548,14 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, .add(SrcOp) .addImm(1); - unsigned NewFPSCRTmpReg = RegInfo.createVirtualRegister(&PPC::G8RCRegClass); + Register NewFPSCRTmpReg = RegInfo.createVirtualRegister(&PPC::G8RCRegClass); BuildMI(*BB, MI, dl, TII->get(PPC::RLDIMI), NewFPSCRTmpReg) .addReg(OldFPSCRTmpReg) .addReg(ExtSrcReg) .addImm(0) .addImm(62); - unsigned NewFPSCRReg = RegInfo.createVirtualRegister(&PPC::F8RCRegClass); + Register NewFPSCRReg = RegInfo.createVirtualRegister(&PPC::F8RCRegClass); copyRegFromG8RCOrF8RC(NewFPSCRReg, NewFPSCRTmpReg); // The mask 255 means that put the 32:63 bits of NewFPSCRReg to the 32:63 @@ -13113,6 +13252,61 @@ SDValue PPCTargetLowering::combineStoreFPToInt(SDNode *N, return Val; } +SDValue PPCTargetLowering::combineVReverseMemOP(ShuffleVectorSDNode *SVN, + LSBaseSDNode *LSBase, + DAGCombinerInfo &DCI) const { + assert((ISD::isNormalLoad(LSBase) || ISD::isNormalStore(LSBase)) && + "Not a reverse memop pattern!"); + + auto IsElementReverse = [](const ShuffleVectorSDNode *SVN) -> bool { + auto Mask = SVN->getMask(); + int i = 0; + auto I = Mask.rbegin(); + auto E = Mask.rend(); + + for (; I != E; ++I) { + if (*I != i) + return false; + i++; + } + return true; + }; + + SelectionDAG &DAG = DCI.DAG; + EVT VT = SVN->getValueType(0); + + if (!isTypeLegal(VT) || !Subtarget.isLittleEndian() || !Subtarget.hasVSX()) + return SDValue(); + + // Before P9, we have PPCVSXSwapRemoval pass to hack the element 
order. + // See comment in PPCVSXSwapRemoval.cpp. + // It is conflict with PPCVSXSwapRemoval opt. So we don't do it. + if (!Subtarget.hasP9Vector()) + return SDValue(); + + if(!IsElementReverse(SVN)) + return SDValue(); + + if (LSBase->getOpcode() == ISD::LOAD) { + SDLoc dl(SVN); + SDValue LoadOps[] = {LSBase->getChain(), LSBase->getBasePtr()}; + return DAG.getMemIntrinsicNode( + PPCISD::LOAD_VEC_BE, dl, DAG.getVTList(VT, MVT::Other), LoadOps, + LSBase->getMemoryVT(), LSBase->getMemOperand()); + } + + if (LSBase->getOpcode() == ISD::STORE) { + SDLoc dl(LSBase); + SDValue StoreOps[] = {LSBase->getChain(), SVN->getOperand(0), + LSBase->getBasePtr()}; + return DAG.getMemIntrinsicNode( + PPCISD::STORE_VEC_BE, dl, DAG.getVTList(MVT::Other), StoreOps, + LSBase->getMemoryVT(), LSBase->getMemOperand()); + } + + llvm_unreachable("Expected a load or store node here"); +} + SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; @@ -13159,6 +13353,12 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, case ISD::SINT_TO_FP: case ISD::UINT_TO_FP: return combineFPToIntToFP(N, DCI); + case ISD::VECTOR_SHUFFLE: + if (ISD::isNormalLoad(N->getOperand(0).getNode())) { + LSBaseSDNode* LSBase = cast(N->getOperand(0)); + return combineVReverseMemOP(cast(N), LSBase, DCI); + } + break; case ISD::STORE: { EVT Op1VT = N->getOperand(1).getValueType(); @@ -13170,6 +13370,13 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, return Val; } + if (Opcode == ISD::VECTOR_SHUFFLE && ISD::isNormalStore(N)) { + ShuffleVectorSDNode *SVN = cast(N->getOperand(1)); + SDValue Val= combineVReverseMemOP(SVN, cast(N), DCI); + if (Val) + return Val; + } + // Turn STORE (BSWAP) -> sthbrx/stwbrx. if (cast(N)->isUnindexed() && Opcode == ISD::BSWAP && N->getOperand(1).getNode()->hasOneUse() && @@ -13903,7 +14110,7 @@ void PPCTargetLowering::computeKnownBitsForTargetNode(const SDValue Op, } } -unsigned PPCTargetLowering::getPrefLoopAlignment(MachineLoop *ML) const { +Align PPCTargetLowering::getPrefLoopAlignment(MachineLoop *ML) const { switch (Subtarget.getDarwinDirective()) { default: break; case PPC::DIR_970: @@ -13924,7 +14131,7 @@ unsigned PPCTargetLowering::getPrefLoopAlignment(MachineLoop *ML) const { // Actual alignment of the loop will depend on the hotness check and other // logic in alignBlocks. if (ML->getLoopDepth() > 1 && ML->getSubLoops().empty()) - return 5; + return Align(32); } const PPCInstrInfo *TII = Subtarget.getInstrInfo(); @@ -13940,7 +14147,7 @@ unsigned PPCTargetLowering::getPrefLoopAlignment(MachineLoop *ML) const { } if (LoopSize > 16 && LoopSize <= 32) - return 5; + return Align(32); break; } @@ -14063,7 +14270,7 @@ PPCTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, case 'f': if (Subtarget.hasSPE()) { if (VT == MVT::f32 || VT == MVT::i32) - return std::make_pair(0U, &PPC::SPE4RCRegClass); + return std::make_pair(0U, &PPC::GPRCRegClass); if (VT == MVT::f64 || VT == MVT::i64) return std::make_pair(0U, &PPC::SPERCRegClass); } else { @@ -14306,22 +14513,22 @@ SDValue PPCTargetLowering::LowerFRAMEADDR(SDValue Op, // FIXME? Maybe this could be a TableGen attribute on some registers and // this table could be generated automatically from RegInfo. 
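A standalone sketch of the mask test performed by the IsElementReverse lambda in combineVReverseMemOP above: the combine only fires when the shuffle mask is a full element reversal, i.e. [N-1, N-2, ..., 1, 0]. The free-standing helper below is illustrative and not part of the patch.

// Sketch only: true iff Mask reverses all elements of the vector.
#include "llvm/ADT/ArrayRef.h"
static bool isElementReverseMask(llvm::ArrayRef<int> Mask) {
  int N = static_cast<int>(Mask.size());
  for (int i = 0; i < N; ++i)
    if (Mask[i] != N - 1 - i)
      return false;
  return true;
}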
-unsigned PPCTargetLowering::getRegisterByName(const char* RegName, EVT VT, - SelectionDAG &DAG) const { +Register PPCTargetLowering::getRegisterByName(const char* RegName, EVT VT, + const MachineFunction &MF) const { bool isPPC64 = Subtarget.isPPC64(); - bool isDarwinABI = Subtarget.isDarwinABI(); + bool IsDarwinABI = Subtarget.isDarwinABI(); if ((isPPC64 && VT != MVT::i64 && VT != MVT::i32) || (!isPPC64 && VT != MVT::i32)) report_fatal_error("Invalid register global variable type"); bool is64Bit = isPPC64 && VT == MVT::i64; - unsigned Reg = StringSwitch(RegName) + Register Reg = StringSwitch(RegName) .Case("r1", is64Bit ? PPC::X1 : PPC::R1) - .Case("r2", (isDarwinABI || isPPC64) ? 0 : PPC::R2) - .Case("r13", (!isPPC64 && isDarwinABI) ? 0 : + .Case("r2", (IsDarwinABI || isPPC64) ? Register() : PPC::R2) + .Case("r13", (!isPPC64 && IsDarwinABI) ? Register() : (is64Bit ? PPC::X13 : PPC::R13)) - .Default(0); + .Default(Register()); if (Reg) return Reg; @@ -14330,14 +14537,17 @@ unsigned PPCTargetLowering::getRegisterByName(const char* RegName, EVT VT, bool PPCTargetLowering::isAccessedAsGotIndirect(SDValue GA) const { // 32-bit SVR4 ABI access everything as got-indirect. - if (Subtarget.isSVR4ABI() && !Subtarget.isPPC64()) + if (Subtarget.is32BitELFABI()) + return true; + + // AIX accesses everything indirectly through the TOC, which is similar to + // the GOT. + if (Subtarget.isAIXABI()) return true; CodeModel::Model CModel = getTargetMachine().getCodeModel(); // If it is small or large code model, module locals are accessed - // indirectly by loading their address from .toc/.got. The difference - // is that for large code model we have ADDISTocHa + LDtocL and for - // small code model we simply have LDtoc. + // indirectly by loading their address from .toc/.got. if (CModel == CodeModel::Small || CModel == CodeModel::Large) return true; @@ -14345,14 +14555,8 @@ bool PPCTargetLowering::isAccessedAsGotIndirect(SDValue GA) const { if (isa(GA) || isa(GA)) return true; - if (GlobalAddressSDNode *G = dyn_cast(GA)) { - const GlobalValue *GV = G->getGlobal(); - unsigned char GVFlags = Subtarget.classifyGlobalReference(GV); - // The NLP flag indicates that a global access has to use an - // extra indirection. 
- if (GVFlags & PPCII::MO_NLP_FLAG) - return true; - } + if (GlobalAddressSDNode *G = dyn_cast(GA)) + return Subtarget.isGVIndirectSymbol(G->getGlobal()); return false; } @@ -14417,7 +14621,7 @@ bool PPCTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.ptrVal = I.getArgOperand(0); Info.offset = -VT.getStoreSize()+1; Info.size = 2*VT.getStoreSize()-1; - Info.align = 1; + Info.align = Align::None(); Info.flags = MachineMemOperand::MOLoad; return true; } @@ -14451,7 +14655,7 @@ bool PPCTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.ptrVal = I.getArgOperand(0); Info.offset = 0; Info.size = VT.getStoreSize(); - Info.align = 1; + Info.align = Align::None(); Info.flags = MachineMemOperand::MOLoad; return true; } @@ -14503,7 +14707,7 @@ bool PPCTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.ptrVal = I.getArgOperand(1); Info.offset = -VT.getStoreSize()+1; Info.size = 2*VT.getStoreSize()-1; - Info.align = 1; + Info.align = Align::None(); Info.flags = MachineMemOperand::MOStore; return true; } @@ -14536,7 +14740,7 @@ bool PPCTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.ptrVal = I.getArgOperand(1); Info.offset = 0; Info.size = VT.getStoreSize(); - Info.align = 1; + Info.align = Align::None(); Info.flags = MachineMemOperand::MOStore; return true; } @@ -14786,7 +14990,7 @@ void PPCTargetLowering::insertCopiesSplitCSR( else llvm_unreachable("Unexpected register class in CSRsViaCopy!"); - unsigned NewVR = MRI->createVirtualRegister(RC); + Register NewVR = MRI->createVirtualRegister(RC); // Create copy from CSR to a virtual register. // FIXME: this currently does not emit CFI pseudo-instructions, it works // fine for CXX_FAST_TLS since the C++-style TLS access functions should be @@ -15146,7 +15350,7 @@ SDValue PPCTargetLowering::combineMUL(SDNode *N, DAGCombinerInfo &DCI) const { bool PPCTargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const { // Only duplicate to increase tail-calls for the 64bit SysV ABIs. - if (!Subtarget.isSVR4ABI() || !Subtarget.isPPC64()) + if (!Subtarget.is64BitELFABI()) return false; // If not a tail call then no need to proceed. diff --git a/lib/Target/PowerPC/PPCISelLowering.h b/lib/Target/PowerPC/PPCISelLowering.h index 97422c6eda36..62922ea2d4c4 100644 --- a/lib/Target/PowerPC/PPCISelLowering.h +++ b/lib/Target/PowerPC/PPCISelLowering.h @@ -412,8 +412,9 @@ namespace llvm { /// representation. QBFLT, - /// Custom extend v4f32 to v2f64. - FP_EXTEND_LH, + /// FP_EXTEND_HALF(VECTOR, IDX) - Custom extend upper (IDX=0) half or + /// lower (IDX=1) half of v4f32 to v2f64. + FP_EXTEND_HALF, /// CHAIN = STBRX CHAIN, GPRC, Ptr, Type - This is a /// byte-swapping store instruction. It byte-swaps the low "Type" bits of @@ -456,15 +457,29 @@ namespace llvm { /// an xxswapd. LXVD2X, + /// VSRC, CHAIN = LOAD_VEC_BE CHAIN, Ptr - Occurs only for little endian. + /// Maps directly to one of lxvd2x/lxvw4x/lxvh8x/lxvb16x depending on + /// the vector type to load vector in big-endian element order. + LOAD_VEC_BE, + /// VSRC, CHAIN = LD_VSX_LH CHAIN, Ptr - This is a floating-point load of a /// v2f32 value into the lower half of a VSR register. LD_VSX_LH, + /// VSRC, CHAIN = LD_SPLAT, CHAIN, Ptr - a splatting load memory + /// instructions such as LXVDSX, LXVWSX. + LD_SPLAT, + /// CHAIN = STXVD2X CHAIN, VSRC, Ptr - Occurs only for little endian. /// Maps directly to an stxvd2x instruction that will be preceded by /// an xxswapd. STXVD2X, + /// CHAIN = STORE_VEC_BE CHAIN, VSRC, Ptr - Occurs only for little endian. 
+ /// Maps directly to one of stxvd2x/stxvw4x/stxvh8x/stxvb16x depending on + /// the vector type to store vector in big-endian element order. + STORE_VEC_BE, + /// Store scalar integers from VSR. ST_VSR_SCAL_INT, @@ -563,9 +578,11 @@ namespace llvm { bool isXXINSERTWMask(ShuffleVectorSDNode *N, unsigned &ShiftElts, unsigned &InsertAtByte, bool &Swap, bool IsLE); - /// getVSPLTImmediate - Return the appropriate VSPLT* immediate to splat the - /// specified isSplatShuffleMask VECTOR_SHUFFLE mask. - unsigned getVSPLTImmediate(SDNode *N, unsigned EltSize, SelectionDAG &DAG); + /// getSplatIdxForPPCMnemonics - Return the splat index as a value that is + /// appropriate for PPC mnemonics (which have a big endian bias - namely + /// elements are counted from the left of the vector register). + unsigned getSplatIdxForPPCMnemonics(SDNode *N, unsigned EltSize, + SelectionDAG &DAG); /// get_VSPLTI_elt - If this is a build_vector of constants which can be /// formed by using a vspltis[bhw] instruction of the specified element @@ -716,8 +733,8 @@ namespace llvm { SDValue BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG, SmallVectorImpl &Created) const override; - unsigned getRegisterByName(const char* RegName, EVT VT, - SelectionDAG &DAG) const override; + Register getRegisterByName(const char* RegName, EVT VT, + const MachineFunction &MF) const override; void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, @@ -725,7 +742,7 @@ namespace llvm { const SelectionDAG &DAG, unsigned Depth = 0) const override; - unsigned getPrefLoopAlignment(MachineLoop *ML) const override; + Align getPrefLoopAlignment(MachineLoop *ML) const override; bool shouldInsertFencesForAtomic(const Instruction *I) const override { return true; @@ -834,6 +851,18 @@ namespace llvm { return true; } + bool isDesirableToTransformToIntegerOp(unsigned Opc, + EVT VT) const override { + // Only handle float load/store pair because float(fpr) load/store + // instruction has more cycles than integer(gpr) load/store in PPC. + if (Opc != ISD::LOAD && Opc != ISD::STORE) + return false; + if (VT != MVT::f32 && VT != MVT::f64) + return false; + + return true; + } + // Returns true if the address of the global is stored in TOC entry. bool isAccessedAsGotIndirect(SDValue N) const; @@ -998,6 +1027,8 @@ namespace llvm { SDValue &FPOpOut, const SDLoc &dl) const; + SDValue getTOCEntry(SelectionDAG &DAG, const SDLoc &dl, SDValue GA) const; + SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const; @@ -1155,6 +1186,8 @@ namespace llvm { SDValue combineSetCC(SDNode *N, DAGCombinerInfo &DCI) const; SDValue combineABS(SDNode *N, DAGCombinerInfo &DCI) const; SDValue combineVSelect(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue combineVReverseMemOP(ShuffleVectorSDNode *SVN, LSBaseSDNode *LSBase, + DAGCombinerInfo &DCI) const; /// ConvertSETCCToSubtract - looks at SETCC that compares ints. It replaces /// SETCC with integer subtraction when (1) there is a legal way of doing it diff --git a/lib/Target/PowerPC/PPCInstr64Bit.td b/lib/Target/PowerPC/PPCInstr64Bit.td index d598567f8e4e..f16187149d36 100644 --- a/lib/Target/PowerPC/PPCInstr64Bit.td +++ b/lib/Target/PowerPC/PPCInstr64Bit.td @@ -1099,8 +1099,8 @@ def LDMX : XForm_1<31, 309, (outs g8rc:$rD), (ins memrr:$src), // Support for medium and large code model. 
let hasSideEffects = 0 in { let isReMaterializable = 1 in { -def ADDIStocHA: PPCEmitTimePseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, tocentry:$disp), - "#ADDIStocHA", []>, isPPC64; +def ADDIStocHA8: PPCEmitTimePseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, tocentry:$disp), + "#ADDIStocHA8", []>, isPPC64; def ADDItocL: PPCEmitTimePseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, tocentry:$disp), "#ADDItocL", []>, isPPC64; } diff --git a/lib/Target/PowerPC/PPCInstrAltivec.td b/lib/Target/PowerPC/PPCInstrAltivec.td index 8176c5120a83..fd3fc2af2327 100644 --- a/lib/Target/PowerPC/PPCInstrAltivec.td +++ b/lib/Target/PowerPC/PPCInstrAltivec.td @@ -215,21 +215,21 @@ def vsldoi_swapped_shuffle : PatFrag<(ops node:$lhs, node:$rhs), // VSPLT*_get_imm xform function: convert vector_shuffle mask to VSPLT* imm. def VSPLTB_get_imm : SDNodeXForm; def vspltb_shuffle : PatFrag<(ops node:$lhs, node:$rhs), (vector_shuffle node:$lhs, node:$rhs), [{ return PPC::isSplatShuffleMask(cast(N), 1); }], VSPLTB_get_imm>; def VSPLTH_get_imm : SDNodeXForm; def vsplth_shuffle : PatFrag<(ops node:$lhs, node:$rhs), (vector_shuffle node:$lhs, node:$rhs), [{ return PPC::isSplatShuffleMask(cast(N), 2); }], VSPLTH_get_imm>; def VSPLTW_get_imm : SDNodeXForm; def vspltw_shuffle : PatFrag<(ops node:$lhs, node:$rhs), (vector_shuffle node:$lhs, node:$rhs), [{ @@ -331,7 +331,7 @@ class VXBX_Int_Ty xo, string opc, Intrinsic IntID, ValueType Ty> class VXCR_Int_Ty xo, string opc, Intrinsic IntID, ValueType Ty> : VXForm_CR; + [(set Ty:$vD, (IntID Ty:$vA, timm:$ST, timm:$SIX))]>; //===----------------------------------------------------------------------===// // Instruction Definitions. @@ -401,10 +401,10 @@ let isCodeGenOnly = 1 in { def MFVSCR : VXForm_4<1540, (outs vrrc:$vD), (ins), "mfvscr $vD", IIC_LdStStore, - [(set v8i16:$vD, (int_ppc_altivec_mfvscr))]>; + [(set v8i16:$vD, (int_ppc_altivec_mfvscr))]>; def MTVSCR : VXForm_5<1604, (outs), (ins vrrc:$vB), "mtvscr $vB", IIC_LdStLoad, - [(int_ppc_altivec_mtvscr v4i32:$vB)]>; + [(int_ppc_altivec_mtvscr v4i32:$vB)]>; let PPC970_Unit = 2, mayLoad = 1, mayStore = 0 in { // Loads. def LVEBX: XForm_1_memOp<31, 7, (outs vrrc:$vD), (ins memrr:$src), diff --git a/lib/Target/PowerPC/PPCInstrFormats.td b/lib/Target/PowerPC/PPCInstrFormats.td index a48eb1690695..96b9c9a119c0 100644 --- a/lib/Target/PowerPC/PPCInstrFormats.td +++ b/lib/Target/PowerPC/PPCInstrFormats.td @@ -1209,20 +1209,13 @@ class XX3Form opcode, bits<8> xo, dag OOL, dag IOL, string asmstr, let Inst{31} = XT{5}; } -class XX3Form_Zero opcode, bits<8> xo, dag OOL, dag IOL, string asmstr, +class XX3Form_SameOp opcode, bits<8> xo, dag OOL, dag IOL, string asmstr, InstrItinClass itin, list pattern> : XX3Form { let XA = XT; let XB = XT; } -class XX3Form_SetZero opcode, bits<8> xo, dag OOL, dag IOL, string asmstr, - InstrItinClass itin, list pattern> - : XX3Form { - let XB = XT; - let XA = XT; -} - class XX3Form_1 opcode, bits<8> xo, dag OOL, dag IOL, string asmstr, InstrItinClass itin, list pattern> : I { diff --git a/lib/Target/PowerPC/PPCInstrInfo.cpp b/lib/Target/PowerPC/PPCInstrInfo.cpp index a787bdd56b9d..6b10672965c9 100644 --- a/lib/Target/PowerPC/PPCInstrInfo.cpp +++ b/lib/Target/PowerPC/PPCInstrInfo.cpp @@ -90,7 +90,6 @@ enum SpillOpcodeKey { SOK_QuadBitSpill, SOK_SpillToVSR, SOK_SPESpill, - SOK_SPE4Spill, SOK_LastOpcodeSpill // This must be last on the enum. 
}; @@ -184,10 +183,10 @@ int PPCInstrInfo::getOperandLatency(const InstrItineraryData *ItinData, return Latency; const MachineOperand &DefMO = DefMI.getOperand(DefIdx); - unsigned Reg = DefMO.getReg(); + Register Reg = DefMO.getReg(); bool IsRegCR; - if (TargetRegisterInfo::isVirtualRegister(Reg)) { + if (Register::isVirtualRegister(Reg)) { const MachineRegisterInfo *MRI = &DefMI.getParent()->getParent()->getRegInfo(); IsRegCR = MRI->getRegClass(Reg)->hasSuperClassEq(&PPC::CRRCRegClass) || @@ -330,11 +329,13 @@ bool PPCInstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI, case PPC::LIS8: case PPC::QVGPCI: case PPC::ADDIStocHA: + case PPC::ADDIStocHA8: case PPC::ADDItocL: case PPC::LOAD_STACK_GUARD: case PPC::XXLXORz: case PPC::XXLXORspz: case PPC::XXLXORdpz: + case PPC::XXLEQVOnes: case PPC::V_SET0B: case PPC::V_SET0H: case PPC::V_SET0: @@ -448,7 +449,8 @@ MachineInstr *PPCInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI, return &MI; } -bool PPCInstrInfo::findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx1, +bool PPCInstrInfo::findCommutedOpIndices(const MachineInstr &MI, + unsigned &SrcOpIdx1, unsigned &SrcOpIdx2) const { // For VSX A-Type FMA instructions, it is the first two operands that can be // commuted, however, because the non-encoded tied input operand is listed @@ -966,11 +968,11 @@ void PPCInstrInfo::copyPhysReg(MachineBasicBlock &MBB, getKillRegState(KillSrc); return; } else if (PPC::SPERCRegClass.contains(SrcReg) && - PPC::SPE4RCRegClass.contains(DestReg)) { + PPC::GPRCRegClass.contains(DestReg)) { BuildMI(MBB, I, DL, get(PPC::EFSCFD), DestReg).addReg(SrcReg); getKillRegState(KillSrc); return; - } else if (PPC::SPE4RCRegClass.contains(SrcReg) && + } else if (PPC::GPRCRegClass.contains(SrcReg) && PPC::SPERCRegClass.contains(DestReg)) { BuildMI(MBB, I, DL, get(PPC::EFDCFS), DestReg).addReg(SrcReg); getKillRegState(KillSrc); @@ -1009,8 +1011,6 @@ void PPCInstrInfo::copyPhysReg(MachineBasicBlock &MBB, Opc = PPC::QVFMRb; else if (PPC::CRBITRCRegClass.contains(DestReg, SrcReg)) Opc = PPC::CROR; - else if (PPC::SPE4RCRegClass.contains(DestReg, SrcReg)) - Opc = PPC::OR; else if (PPC::SPERCRegClass.contains(DestReg, SrcReg)) Opc = PPC::EVOR; else @@ -1043,8 +1043,6 @@ unsigned PPCInstrInfo::getStoreOpcodeForSpill(unsigned Reg, OpcodeIndex = SOK_Float4Spill; } else if (PPC::SPERCRegClass.hasSubClassEq(RC)) { OpcodeIndex = SOK_SPESpill; - } else if (PPC::SPE4RCRegClass.hasSubClassEq(RC)) { - OpcodeIndex = SOK_SPE4Spill; } else if (PPC::CRRCRegClass.hasSubClassEq(RC)) { OpcodeIndex = SOK_CRSpill; } else if (PPC::CRBITRCRegClass.hasSubClassEq(RC)) { @@ -1083,8 +1081,6 @@ unsigned PPCInstrInfo::getStoreOpcodeForSpill(unsigned Reg, OpcodeIndex = SOK_Float4Spill; } else if (PPC::SPERCRegClass.contains(Reg)) { OpcodeIndex = SOK_SPESpill; - } else if (PPC::SPE4RCRegClass.contains(Reg)) { - OpcodeIndex = SOK_SPE4Spill; } else if (PPC::CRRCRegClass.contains(Reg)) { OpcodeIndex = SOK_CRSpill; } else if (PPC::CRBITRCRegClass.contains(Reg)) { @@ -1133,8 +1129,6 @@ PPCInstrInfo::getLoadOpcodeForSpill(unsigned Reg, OpcodeIndex = SOK_Float4Spill; } else if (PPC::SPERCRegClass.hasSubClassEq(RC)) { OpcodeIndex = SOK_SPESpill; - } else if (PPC::SPE4RCRegClass.hasSubClassEq(RC)) { - OpcodeIndex = SOK_SPE4Spill; } else if (PPC::CRRCRegClass.hasSubClassEq(RC)) { OpcodeIndex = SOK_CRSpill; } else if (PPC::CRBITRCRegClass.hasSubClassEq(RC)) { @@ -1173,8 +1167,6 @@ PPCInstrInfo::getLoadOpcodeForSpill(unsigned Reg, OpcodeIndex = SOK_Float4Spill; } else if 
(PPC::SPERCRegClass.contains(Reg)) { OpcodeIndex = SOK_SPESpill; - } else if (PPC::SPE4RCRegClass.contains(Reg)) { - OpcodeIndex = SOK_SPE4Spill; } else if (PPC::CRRCRegClass.contains(Reg)) { OpcodeIndex = SOK_CRSpill; } else if (PPC::CRBITRCRegClass.contains(Reg)) { @@ -1648,7 +1640,7 @@ bool PPCInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg, return false; int OpC = CmpInstr.getOpcode(); - unsigned CRReg = CmpInstr.getOperand(0).getReg(); + Register CRReg = CmpInstr.getOperand(0).getReg(); // FP record forms set CR1 based on the exception status bits, not a // comparison with zero. @@ -1671,7 +1663,7 @@ bool PPCInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg, // Look through copies unless that gets us to a physical register. unsigned ActualSrc = TRI->lookThruCopyLike(SrcReg, MRI); - if (TargetRegisterInfo::isVirtualRegister(ActualSrc)) + if (Register::isVirtualRegister(ActualSrc)) SrcReg = ActualSrc; // Get the unique definition of SrcReg. @@ -1937,7 +1929,7 @@ bool PPCInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg, // Rotates are expensive instructions. If we're emitting a record-form // rotate that can just be an andi/andis, we should just emit that. if (MIOpC == PPC::RLWINM || MIOpC == PPC::RLWINM8) { - unsigned GPRRes = MI->getOperand(0).getReg(); + Register GPRRes = MI->getOperand(0).getReg(); int64_t SH = MI->getOperand(2).getImm(); int64_t MB = MI->getOperand(3).getImm(); int64_t ME = MI->getOperand(4).getImm(); @@ -2122,7 +2114,7 @@ bool PPCInstrInfo::expandVSXMemPseudo(MachineInstr &MI) const { llvm_unreachable("Unknown Operation!"); } - unsigned TargetReg = MI.getOperand(0).getReg(); + Register TargetReg = MI.getOperand(0).getReg(); unsigned Opcode; if ((TargetReg >= PPC::F0 && TargetReg <= PPC::F31) || (TargetReg >= PPC::VSL0 && TargetReg <= PPC::VSL31)) @@ -2184,7 +2176,7 @@ bool PPCInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { return expandVSXMemPseudo(MI); } case PPC::SPILLTOVSR_LD: { - unsigned TargetReg = MI.getOperand(0).getReg(); + Register TargetReg = MI.getOperand(0).getReg(); if (PPC::VSFRCRegClass.contains(TargetReg)) { MI.setDesc(get(PPC::DFLOADf64)); return expandPostRAPseudo(MI); @@ -2194,7 +2186,7 @@ bool PPCInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { return true; } case PPC::SPILLTOVSR_ST: { - unsigned SrcReg = MI.getOperand(0).getReg(); + Register SrcReg = MI.getOperand(0).getReg(); if (PPC::VSFRCRegClass.contains(SrcReg)) { NumStoreSPILLVSRRCAsVec++; MI.setDesc(get(PPC::DFSTOREf64)); @@ -2206,7 +2198,7 @@ bool PPCInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { return true; } case PPC::SPILLTOVSR_LDX: { - unsigned TargetReg = MI.getOperand(0).getReg(); + Register TargetReg = MI.getOperand(0).getReg(); if (PPC::VSFRCRegClass.contains(TargetReg)) MI.setDesc(get(PPC::LXSDX)); else @@ -2214,7 +2206,7 @@ bool PPCInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { return true; } case PPC::SPILLTOVSR_STX: { - unsigned SrcReg = MI.getOperand(0).getReg(); + Register SrcReg = MI.getOperand(0).getReg(); if (PPC::VSFRCRegClass.contains(SrcReg)) { NumStoreSPILLVSRRCAsVec++; MI.setDesc(get(PPC::STXSDX)); @@ -2279,10 +2271,10 @@ void PPCInstrInfo::replaceInstrOperandWithImm(MachineInstr &MI, int64_t Imm) const { assert(MI.getOperand(OpNo).isReg() && "Operand must be a REG"); // Replace the REG with the Immediate. 
- unsigned InUseReg = MI.getOperand(OpNo).getReg(); + Register InUseReg = MI.getOperand(OpNo).getReg(); MI.getOperand(OpNo).ChangeToImmediate(Imm); - if (empty(MI.implicit_operands())) + if (MI.implicit_operands().empty()) return; // We need to make sure that the MI didn't have any implicit use @@ -2328,6 +2320,23 @@ void PPCInstrInfo::replaceInstrWithLI(MachineInstr &MI, .addImm(LII.Imm); } +MachineInstr *PPCInstrInfo::getDefMIPostRA(unsigned Reg, MachineInstr &MI, + bool &SeenIntermediateUse) const { + assert(!MI.getParent()->getParent()->getRegInfo().isSSA() && + "Should be called after register allocation."); + const TargetRegisterInfo *TRI = &getRegisterInfo(); + MachineBasicBlock::reverse_iterator E = MI.getParent()->rend(), It = MI; + It++; + SeenIntermediateUse = false; + for (; It != E; ++It) { + if (It->modifiesRegister(Reg, TRI)) + return &*It; + if (It->readsRegister(Reg, TRI)) + SeenIntermediateUse = true; + } + return nullptr; +} + MachineInstr *PPCInstrInfo::getForwardingDefMI( MachineInstr &MI, unsigned &OpNoForForwarding, @@ -2342,11 +2351,11 @@ MachineInstr *PPCInstrInfo::getForwardingDefMI( for (int i = 1, e = MI.getNumOperands(); i < e; i++) { if (!MI.getOperand(i).isReg()) continue; - unsigned Reg = MI.getOperand(i).getReg(); - if (!TargetRegisterInfo::isVirtualRegister(Reg)) + Register Reg = MI.getOperand(i).getReg(); + if (!Register::isVirtualRegister(Reg)) continue; unsigned TrueReg = TRI->lookThruCopyLike(Reg, MRI); - if (TargetRegisterInfo::isVirtualRegister(TrueReg)) { + if (Register::isVirtualRegister(TrueReg)) { DefMI = MRI->getVRegDef(TrueReg); if (DefMI->getOpcode() == PPC::LI || DefMI->getOpcode() == PPC::LI8) { OpNoForForwarding = i; @@ -2370,7 +2379,10 @@ MachineInstr *PPCInstrInfo::getForwardingDefMI( Opc == PPC::RLDICL_32 || Opc == PPC::RLDICL_32_64 || Opc == PPC::RLWINM || Opc == PPC::RLWINMo || Opc == PPC::RLWINM8 || Opc == PPC::RLWINM8o; - if (!instrHasImmForm(MI, III, true) && !ConvertibleImmForm) + bool IsVFReg = (MI.getNumOperands() && MI.getOperand(0).isReg()) + ? isVFRegister(MI.getOperand(0).getReg()) + : false; + if (!ConvertibleImmForm && !instrHasImmForm(Opc, IsVFReg, III, true)) return nullptr; // Don't convert or %X, %Y, %Y since that's just a register move. @@ -2381,29 +2393,24 @@ MachineInstr *PPCInstrInfo::getForwardingDefMI( MachineOperand &MO = MI.getOperand(i); SeenIntermediateUse = false; if (MO.isReg() && MO.isUse() && !MO.isImplicit()) { - MachineBasicBlock::reverse_iterator E = MI.getParent()->rend(), It = MI; - It++; - unsigned Reg = MI.getOperand(i).getReg(); - - // Is this register defined by some form of add-immediate (including - // load-immediate) within this basic block? - for ( ; It != E; ++It) { - if (It->modifiesRegister(Reg, &getRegisterInfo())) { - switch (It->getOpcode()) { - default: break; - case PPC::LI: - case PPC::LI8: - case PPC::ADDItocL: - case PPC::ADDI: - case PPC::ADDI8: - OpNoForForwarding = i; - return &*It; - } + Register Reg = MI.getOperand(i).getReg(); + // If we see another use of this reg between the def and the MI, + // we want to flat it so the def isn't deleted. + MachineInstr *DefMI = getDefMIPostRA(Reg, MI, SeenIntermediateUse); + if (DefMI) { + // Is this register defined by some form of add-immediate (including + // load-immediate) within this basic block? + switch (DefMI->getOpcode()) { + default: break; - } else if (It->readsRegister(Reg, &getRegisterInfo())) - // If we see another use of this reg between the def and the MI, - // we want to flat it so the def isn't deleted. 
- SeenIntermediateUse = true; + case PPC::LI: + case PPC::LI8: + case PPC::ADDItocL: + case PPC::ADDI: + case PPC::ADDI8: + OpNoForForwarding = i; + return DefMI; + } } } } @@ -2417,7 +2424,7 @@ const unsigned *PPCInstrInfo::getStoreOpcodesForSpillArray() const { {PPC::STW, PPC::STD, PPC::STFD, PPC::STFS, PPC::SPILL_CR, PPC::SPILL_CRBIT, PPC::STVX, PPC::STXVD2X, PPC::STXSDX, PPC::STXSSPX, PPC::SPILL_VRSAVE, PPC::QVSTFDX, PPC::QVSTFSXs, PPC::QVSTFDXb, - PPC::SPILLTOVSR_ST, PPC::EVSTDD, PPC::SPESTW}, + PPC::SPILLTOVSR_ST, PPC::EVSTDD}, // Power 9 {PPC::STW, PPC::STD, PPC::STFD, PPC::STFS, PPC::SPILL_CR, PPC::SPILL_CRBIT, PPC::STVX, PPC::STXV, PPC::DFSTOREf64, PPC::DFSTOREf32, @@ -2433,7 +2440,7 @@ const unsigned *PPCInstrInfo::getLoadOpcodesForSpillArray() const { {PPC::LWZ, PPC::LD, PPC::LFD, PPC::LFS, PPC::RESTORE_CR, PPC::RESTORE_CRBIT, PPC::LVX, PPC::LXVD2X, PPC::LXSDX, PPC::LXSSPX, PPC::RESTORE_VRSAVE, PPC::QVLFDX, PPC::QVLFSXs, PPC::QVLFDXb, - PPC::SPILLTOVSR_LD, PPC::EVLDD, PPC::SPELWZ}, + PPC::SPILLTOVSR_LD, PPC::EVLDD}, // Power 9 {PPC::LWZ, PPC::LD, PPC::LFD, PPC::LFS, PPC::RESTORE_CR, PPC::RESTORE_CRBIT, PPC::LVX, PPC::LXV, PPC::DFLOADf64, PPC::DFLOADf32, @@ -2538,12 +2545,15 @@ bool PPCInstrInfo::convertToImmediateForm(MachineInstr &MI, "The forwarding operand needs to be valid at this point"); bool IsForwardingOperandKilled = MI.getOperand(ForwardingOperand).isKill(); bool KillFwdDefMI = !SeenIntermediateUse && IsForwardingOperandKilled; - unsigned ForwardingOperandReg = MI.getOperand(ForwardingOperand).getReg(); + Register ForwardingOperandReg = MI.getOperand(ForwardingOperand).getReg(); if (KilledDef && KillFwdDefMI) *KilledDef = DefMI; ImmInstrInfo III; - bool HasImmForm = instrHasImmForm(MI, III, PostRA); + bool IsVFReg = MI.getOperand(0).isReg() + ? isVFRegister(MI.getOperand(0).getReg()) + : false; + bool HasImmForm = instrHasImmForm(MI.getOpcode(), IsVFReg, III, PostRA); // If this is a reg+reg instruction that has a reg+imm form, // and one of the operands is produced by an add-immediate, // try to convert it. @@ -2591,7 +2601,7 @@ bool PPCInstrInfo::convertToImmediateForm(MachineInstr &MI, // If a compare-immediate is fed by an immediate and is itself an input of // an ISEL (the most common case) into a COPY of the correct register. bool Changed = false; - unsigned DefReg = MI.getOperand(0).getReg(); + Register DefReg = MI.getOperand(0).getReg(); int64_t Comparand = MI.getOperand(2).getImm(); int64_t SExtComparand = ((uint64_t)Comparand & ~0x7FFFuLL) != 0 ? 
(Comparand | 0xFFFFFFFFFFFF0000) : Comparand; @@ -2601,8 +2611,8 @@ bool PPCInstrInfo::convertToImmediateForm(MachineInstr &MI, if (UseOpc != PPC::ISEL && UseOpc != PPC::ISEL8) continue; unsigned CRSubReg = CompareUseMI.getOperand(3).getSubReg(); - unsigned TrueReg = CompareUseMI.getOperand(1).getReg(); - unsigned FalseReg = CompareUseMI.getOperand(2).getReg(); + Register TrueReg = CompareUseMI.getOperand(1).getReg(); + Register FalseReg = CompareUseMI.getOperand(2).getReg(); unsigned RegToCopy = selectReg(SExtImm, SExtComparand, Opc, TrueReg, FalseReg, CRSubReg); if (RegToCopy == PPC::NoRegister) @@ -2777,9 +2787,8 @@ bool PPCInstrInfo::convertToImmediateForm(MachineInstr &MI, return false; } -bool PPCInstrInfo::instrHasImmForm(const MachineInstr &MI, +bool PPCInstrInfo::instrHasImmForm(unsigned Opc, bool IsVFReg, ImmInstrInfo &III, bool PostRA) const { - unsigned Opc = MI.getOpcode(); // The vast majority of the instructions would need their operand 2 replaced // with an immediate when switching to the reg+imm form. A marked exception // are the update form loads/stores for which a constant operand 2 would need @@ -3111,7 +3120,7 @@ bool PPCInstrInfo::instrHasImmForm(const MachineInstr &MI, break; case PPC::LXSSPX: if (PostRA) { - if (isVFRegister(MI.getOperand(0).getReg())) + if (IsVFReg) III.ImmOpcode = PPC::LXSSP; else { III.ImmOpcode = PPC::LFS; @@ -3125,7 +3134,7 @@ bool PPCInstrInfo::instrHasImmForm(const MachineInstr &MI, break; case PPC::LXSDX: if (PostRA) { - if (isVFRegister(MI.getOperand(0).getReg())) + if (IsVFReg) III.ImmOpcode = PPC::LXSD; else { III.ImmOpcode = PPC::LFD; @@ -3143,7 +3152,7 @@ bool PPCInstrInfo::instrHasImmForm(const MachineInstr &MI, break; case PPC::STXSSPX: if (PostRA) { - if (isVFRegister(MI.getOperand(0).getReg())) + if (IsVFReg) III.ImmOpcode = PPC::STXSSP; else { III.ImmOpcode = PPC::STFS; @@ -3157,7 +3166,7 @@ bool PPCInstrInfo::instrHasImmForm(const MachineInstr &MI, break; case PPC::STXSDX: if (PostRA) { - if (isVFRegister(MI.getOperand(0).getReg())) + if (IsVFReg) III.ImmOpcode = PPC::STXSD; else { III.ImmOpcode = PPC::STFD; @@ -3287,7 +3296,7 @@ bool PPCInstrInfo::isRegElgibleForForwarding( if (MRI.isSSA()) return false; - unsigned Reg = RegMO.getReg(); + Register Reg = RegMO.getReg(); // Walking the inst in reverse(MI-->DefMI) to get the last DEF of the Reg. MachineBasicBlock::const_reverse_iterator It = MI; @@ -3511,8 +3520,8 @@ bool PPCInstrInfo::transformToImmFormFedByLI(MachineInstr &MI, if (PostRA && III.ZeroIsSpecialOrig != III.ZeroIsSpecialNew) { unsigned PosForOrigZero = III.ZeroIsSpecialOrig ? III.ZeroIsSpecialOrig : III.ZeroIsSpecialNew + 1; - unsigned OrigZeroReg = MI.getOperand(PosForOrigZero).getReg(); - unsigned NewZeroReg = MI.getOperand(III.ZeroIsSpecialNew).getReg(); + Register OrigZeroReg = MI.getOperand(PosForOrigZero).getReg(); + Register NewZeroReg = MI.getOperand(III.ZeroIsSpecialNew).getReg(); // If R0 is in the operand where zero is special for the new instruction, // it is unsafe to transform if the constant operand isn't that operand. if ((NewZeroReg == PPC::R0 || NewZeroReg == PPC::X0) && @@ -3563,16 +3572,20 @@ bool PPCInstrInfo::transformToImmFormFedByLI(MachineInstr &MI, } else { // The 32 bit and 64 bit instructions are quite different. if (SpecialShift32) { - // Left shifts use (N, 0, 31-N), right shifts use (32-N, N, 31). - uint64_t SH = RightShift ? 32 - ShAmt : ShAmt; + // Left shifts use (N, 0, 31-N). + // Right shifts use (32-N, N, 31) if 0 < N < 32. + // use (0, 0, 31) if N == 0. 
+ uint64_t SH = ShAmt == 0 ? 0 : RightShift ? 32 - ShAmt : ShAmt; uint64_t MB = RightShift ? ShAmt : 0; uint64_t ME = RightShift ? 31 : 31 - ShAmt; replaceInstrOperandWithImm(MI, III.OpNoForForwarding, SH); MachineInstrBuilder(*MI.getParent()->getParent(), MI).addImm(MB) .addImm(ME); } else { - // Left shifts use (N, 63-N), right shifts use (64-N, N). - uint64_t SH = RightShift ? 64 - ShAmt : ShAmt; + // Left shifts use (N, 63-N). + // Right shifts use (64-N, N) if 0 < N < 64. + // use (0, 0) if N == 0. + uint64_t SH = ShAmt == 0 ? 0 : RightShift ? 64 - ShAmt : ShAmt; uint64_t ME = RightShift ? ShAmt : 63 - ShAmt; replaceInstrOperandWithImm(MI, III.OpNoForForwarding, SH); MachineInstrBuilder(*MI.getParent()->getParent(), MI).addImm(ME); @@ -3601,8 +3614,8 @@ bool PPCInstrInfo::transformToImmFormFedByLI(MachineInstr &MI, if (III.ZeroIsSpecialNew) { // If operand at III.ZeroIsSpecialNew is physical reg(eg: ZERO/ZERO8), no // need to fix up register class. - unsigned RegToModify = MI.getOperand(III.ZeroIsSpecialNew).getReg(); - if (TargetRegisterInfo::isVirtualRegister(RegToModify)) { + Register RegToModify = MI.getOperand(III.ZeroIsSpecialNew).getReg(); + if (Register::isVirtualRegister(RegToModify)) { const TargetRegisterClass *NewRC = MRI.getRegClass(RegToModify)->hasSuperClassEq(&PPC::GPRCRegClass) ? &PPC::GPRC_and_GPRC_NOR0RegClass : &PPC::G8RC_and_G8RC_NOX0RegClass; @@ -3747,7 +3760,7 @@ bool PPCInstrInfo::isTOCSaveMI(const MachineInstr &MI) const { return false; unsigned TOCSaveOffset = Subtarget.getFrameLowering()->getTOCSaveOffset(); unsigned StackOffset = MI.getOperand(1).getImm(); - unsigned StackReg = MI.getOperand(2).getReg(); + Register StackReg = MI.getOperand(2).getReg(); if (StackReg == PPC::X1 && StackOffset == TOCSaveOffset) return true; @@ -3772,7 +3785,7 @@ PPCInstrInfo::isSignOrZeroExtended(const MachineInstr &MI, bool SignExt, switch (MI.getOpcode()) { case PPC::COPY: { - unsigned SrcReg = MI.getOperand(1).getReg(); + Register SrcReg = MI.getOperand(1).getReg(); // In both ELFv1 and v2 ABI, method parameters and the return value // are sign- or zero-extended. @@ -3781,7 +3794,7 @@ PPCInstrInfo::isSignOrZeroExtended(const MachineInstr &MI, bool SignExt, // We check the ZExt/SExt flags for a method parameter. if (MI.getParent()->getBasicBlock() == &MF->getFunction().getEntryBlock()) { - unsigned VReg = MI.getOperand(0).getReg(); + Register VReg = MI.getOperand(0).getReg(); if (MF->getRegInfo().isLiveIn(VReg)) return SignExt ? FuncInfo->isLiveInSExt(VReg) : FuncInfo->isLiveInZExt(VReg); @@ -3818,7 +3831,7 @@ PPCInstrInfo::isSignOrZeroExtended(const MachineInstr &MI, bool SignExt, } // If this is a copy from another register, we recursively check source. - if (!TargetRegisterInfo::isVirtualRegister(SrcReg)) + if (!Register::isVirtualRegister(SrcReg)) return false; const MachineInstr *SrcMI = MRI->getVRegDef(SrcReg); if (SrcMI != NULL) @@ -3841,8 +3854,8 @@ PPCInstrInfo::isSignOrZeroExtended(const MachineInstr &MI, bool SignExt, case PPC::XORIS8: { // logical operation with 16-bit immediate does not change the upper bits. // So, we track the operand register as we do for register copy. 
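// A minimal standalone sketch, assuming a shift amount already validated to be
// in [0, 31]; the struct and function names are illustrative only, not LLVM
// API. It restates the 32-bit field selection used above: left shifts use
// (N, 0, 31-N); right shifts use (32-N, N, 31) for 0 < N < 32 and (0, 0, 31)
// when N == 0.
#include <cassert>
#include <cstdint>

struct RotateFields { uint64_t SH, MB, ME; };

static RotateFields shiftToRLWINM(uint64_t ShAmt, bool RightShift) {
  assert(ShAmt < 32 && "shift amount must be in [0, 31]");
  uint64_t SH = ShAmt == 0 ? 0 : RightShift ? 32 - ShAmt : ShAmt;
  uint64_t MB = RightShift ? ShAmt : 0;
  uint64_t ME = RightShift ? 31 : 31 - ShAmt;
  return {SH, MB, ME};
}

// For example, srwi rD, rS, 5 becomes rlwinm rD, rS, 27, 5, 31 and
// slwi rD, rS, 5 becomes rlwinm rD, rS, 5, 0, 26; the 64-bit rldicl/rldicr
// case is analogous, with a single mask field and 63 in place of 31.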
- unsigned SrcReg = MI.getOperand(1).getReg(); - if (!TargetRegisterInfo::isVirtualRegister(SrcReg)) + Register SrcReg = MI.getOperand(1).getReg(); + if (!Register::isVirtualRegister(SrcReg)) return false; const MachineInstr *SrcMI = MRI->getVRegDef(SrcReg); if (SrcMI != NULL) @@ -3870,8 +3883,8 @@ PPCInstrInfo::isSignOrZeroExtended(const MachineInstr &MI, bool SignExt, for (unsigned I = 1; I != E; I += D) { if (MI.getOperand(I).isReg()) { - unsigned SrcReg = MI.getOperand(I).getReg(); - if (!TargetRegisterInfo::isVirtualRegister(SrcReg)) + Register SrcReg = MI.getOperand(I).getReg(); + if (!Register::isVirtualRegister(SrcReg)) return false; const MachineInstr *SrcMI = MRI->getVRegDef(SrcReg); if (SrcMI == NULL || !isSignOrZeroExtended(*SrcMI, SignExt, Depth+1)) @@ -3893,12 +3906,12 @@ PPCInstrInfo::isSignOrZeroExtended(const MachineInstr &MI, bool SignExt, assert(MI.getOperand(1).isReg() && MI.getOperand(2).isReg()); - unsigned SrcReg1 = MI.getOperand(1).getReg(); - unsigned SrcReg2 = MI.getOperand(2).getReg(); + Register SrcReg1 = MI.getOperand(1).getReg(); + Register SrcReg2 = MI.getOperand(2).getReg(); - if (!TargetRegisterInfo::isVirtualRegister(SrcReg1) || - !TargetRegisterInfo::isVirtualRegister(SrcReg2)) - return false; + if (!Register::isVirtualRegister(SrcReg1) || + !Register::isVirtualRegister(SrcReg2)) + return false; const MachineInstr *MISrc1 = MRI->getVRegDef(SrcReg1); const MachineInstr *MISrc2 = MRI->getVRegDef(SrcReg2); @@ -3923,21 +3936,99 @@ bool PPCInstrInfo::isBDNZ(unsigned Opcode) const { return (Opcode == (Subtarget.isPPC64() ? PPC::BDNZ8 : PPC::BDNZ)); } -bool PPCInstrInfo::analyzeLoop(MachineLoop &L, MachineInstr *&IndVarInst, - MachineInstr *&CmpInst) const { - MachineBasicBlock *LoopEnd = L.getBottomBlock(); - MachineBasicBlock::iterator I = LoopEnd->getFirstTerminator(); - // We really "analyze" only CTR loops right now. - if (I != LoopEnd->end() && isBDNZ(I->getOpcode())) { - IndVarInst = nullptr; - CmpInst = &*I; - return false; +namespace { +class PPCPipelinerLoopInfo : public TargetInstrInfo::PipelinerLoopInfo { + MachineInstr *Loop, *EndLoop, *LoopCount; + MachineFunction *MF; + const TargetInstrInfo *TII; + int64_t TripCount; + +public: + PPCPipelinerLoopInfo(MachineInstr *Loop, MachineInstr *EndLoop, + MachineInstr *LoopCount) + : Loop(Loop), EndLoop(EndLoop), LoopCount(LoopCount), + MF(Loop->getParent()->getParent()), + TII(MF->getSubtarget().getInstrInfo()) { + // Inspect the Loop instruction up-front, as it may be deleted when we call + // createTripCountGreaterCondition. + if (LoopCount->getOpcode() == PPC::LI8 || LoopCount->getOpcode() == PPC::LI) + TripCount = LoopCount->getOperand(1).getImm(); + else + TripCount = -1; } - return true; + + bool shouldIgnoreForPipelining(const MachineInstr *MI) const override { + // Only ignore the terminator. + return MI == EndLoop; + } + + Optional + createTripCountGreaterCondition(int TC, MachineBasicBlock &MBB, + SmallVectorImpl &Cond) override { + if (TripCount == -1) { + // Since BDZ/BDZ8 that we will insert will also decrease the ctr by 1, + // so we don't need to generate any thing here. + Cond.push_back(MachineOperand::CreateImm(0)); + Cond.push_back(MachineOperand::CreateReg( + MF->getSubtarget().isPPC64() ? PPC::CTR8 : PPC::CTR, + true)); + return {}; + } + + return TripCount > TC; + } + + void setPreheader(MachineBasicBlock *NewPreheader) override { + // Do nothing. We want the LOOP setup instruction to stay in the *old* + // preheader, so we can use BDZ in the prologs to adapt the loop trip count. 
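// A rough sketch of the createTripCountGreaterCondition contract shown in the
// pipeliner hunk above, assuming std::optional in place of llvm::Optional and
// an int64_t in place of the cached LI/LI8 immediate; -1 models the run-time
// trip-count case. Names here are illustrative only.
#include <cstdint>
#include <optional>

std::optional<bool> tripCountGreaterThan(int64_t TripCount, int TC) {
  if (TripCount == -1)
    return std::nullopt; // unknown statically: a CTR-based BDZ/BDZ8 check is used
  return TripCount > TC; // known statically: the pipeliner can fold the guard
}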
+ } + + void adjustTripCount(int TripCountAdjust) override { + // If the loop trip count is a compile-time value, then just change the + // value. + if (LoopCount->getOpcode() == PPC::LI8 || + LoopCount->getOpcode() == PPC::LI) { + int64_t TripCount = LoopCount->getOperand(1).getImm() + TripCountAdjust; + LoopCount->getOperand(1).setImm(TripCount); + return; + } + + // Since BDZ/BDZ8 that we will insert will also decrease the ctr by 1, + // so we don't need to generate any thing here. + } + + void disposed() override { + Loop->eraseFromParent(); + // Ensure the loop setup instruction is deleted too. + LoopCount->eraseFromParent(); + } +}; +} // namespace + +std::unique_ptr +PPCInstrInfo::analyzeLoopForPipelining(MachineBasicBlock *LoopBB) const { + // We really "analyze" only hardware loops right now. + MachineBasicBlock::iterator I = LoopBB->getFirstTerminator(); + MachineBasicBlock *Preheader = *LoopBB->pred_begin(); + if (Preheader == LoopBB) + Preheader = *std::next(LoopBB->pred_begin()); + MachineFunction *MF = Preheader->getParent(); + + if (I != LoopBB->end() && isBDNZ(I->getOpcode())) { + SmallPtrSet Visited; + if (MachineInstr *LoopInst = findLoopInstr(*Preheader, Visited)) { + Register LoopCountReg = LoopInst->getOperand(0).getReg(); + MachineRegisterInfo &MRI = MF->getRegInfo(); + MachineInstr *LoopCount = MRI.getUniqueVRegDef(LoopCountReg); + return std::make_unique(LoopInst, &*I, LoopCount); + } + } + return nullptr; } -MachineInstr * -PPCInstrInfo::findLoopInstr(MachineBasicBlock &PreHeader) const { +MachineInstr *PPCInstrInfo::findLoopInstr( + MachineBasicBlock &PreHeader, + SmallPtrSet &Visited) const { unsigned LOOPi = (Subtarget.isPPC64() ? PPC::MTCTR8loop : PPC::MTCTRloop); @@ -3948,50 +4039,6 @@ PPCInstrInfo::findLoopInstr(MachineBasicBlock &PreHeader) const { return nullptr; } -unsigned PPCInstrInfo::reduceLoopCount( - MachineBasicBlock &MBB, MachineBasicBlock &PreHeader, MachineInstr *IndVar, - MachineInstr &Cmp, SmallVectorImpl &Cond, - SmallVectorImpl &PrevInsts, unsigned Iter, - unsigned MaxIter) const { - // We expect a hardware loop currently. This means that IndVar is set - // to null, and the compare is the ENDLOOP instruction. - assert((!IndVar) && isBDNZ(Cmp.getOpcode()) && "Expecting a CTR loop"); - MachineFunction *MF = MBB.getParent(); - DebugLoc DL = Cmp.getDebugLoc(); - MachineInstr *Loop = findLoopInstr(PreHeader); - if (!Loop) - return 0; - unsigned LoopCountReg = Loop->getOperand(0).getReg(); - MachineRegisterInfo &MRI = MF->getRegInfo(); - MachineInstr *LoopCount = MRI.getUniqueVRegDef(LoopCountReg); - - if (!LoopCount) - return 0; - // If the loop trip count is a compile-time value, then just change the - // value. - if (LoopCount->getOpcode() == PPC::LI8 || LoopCount->getOpcode() == PPC::LI) { - int64_t Offset = LoopCount->getOperand(1).getImm(); - if (Offset <= 1) { - LoopCount->eraseFromParent(); - Loop->eraseFromParent(); - return 0; - } - LoopCount->getOperand(1).setImm(Offset - 1); - return Offset - 1; - } - - // The loop trip count is a run-time value. - // We need to subtract one from the trip count, - // and insert branch later to check if we're done with the loop. - - // Since BDZ/BDZ8 that we will insert will also decrease the ctr by 1, - // so we don't need to generate any thing here. - Cond.push_back(MachineOperand::CreateImm(0)); - Cond.push_back(MachineOperand::CreateReg( - Subtarget.isPPC64() ? 
PPC::CTR8 : PPC::CTR, true)); - return LoopCountReg; -} - // Return true if get the base operand, byte offset of an instruction and the // memory width. Width is the size of memory that is being loaded/stored. bool PPCInstrInfo::getMemOperandWithOffsetWidth( @@ -4018,8 +4065,7 @@ bool PPCInstrInfo::getMemOperandWithOffsetWidth( } bool PPCInstrInfo::areMemAccessesTriviallyDisjoint( - const MachineInstr &MIa, const MachineInstr &MIb, - AliasAnalysis * /*AA*/) const { + const MachineInstr &MIa, const MachineInstr &MIb) const { assert(MIa.mayLoadOrStore() && "MIa must be a load or store."); assert(MIb.mayLoadOrStore() && "MIb must be a load or store."); diff --git a/lib/Target/PowerPC/PPCInstrInfo.h b/lib/Target/PowerPC/PPCInstrInfo.h index 70fb757e8f1e..19ab30cb0908 100644 --- a/lib/Target/PowerPC/PPCInstrInfo.h +++ b/lib/Target/PowerPC/PPCInstrInfo.h @@ -248,11 +248,11 @@ public: unsigned isLoadFromStackSlot(const MachineInstr &MI, int &FrameIndex) const override; bool isReallyTriviallyReMaterializable(const MachineInstr &MI, - AliasAnalysis *AA) const override; + AAResults *AA) const override; unsigned isStoreToStackSlot(const MachineInstr &MI, int &FrameIndex) const override; - bool findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx1, + bool findCommutedOpIndices(const MachineInstr &MI, unsigned &SrcOpIdx1, unsigned &SrcOpIdx2) const override; void insertNoop(MachineBasicBlock &MBB, @@ -370,8 +370,7 @@ public: /// otherwise bool areMemAccessesTriviallyDisjoint(const MachineInstr &MIa, - const MachineInstr &MIb, - AliasAnalysis *AA = nullptr) const override; + const MachineInstr &MIb) const override; /// GetInstSize - Return the number of bytes of code the specified /// instruction may be. This returns the maximum number of bytes. @@ -439,9 +438,14 @@ public: void replaceInstrOperandWithImm(MachineInstr &MI, unsigned OpNo, int64_t Imm) const; - bool instrHasImmForm(const MachineInstr &MI, ImmInstrInfo &III, + bool instrHasImmForm(unsigned Opc, bool IsVFReg, ImmInstrInfo &III, bool PostRA) const; + // In PostRA phase, try to find instruction defines \p Reg before \p MI. + // \p SeenIntermediate is set to true if uses between DefMI and \p MI exist. + MachineInstr *getDefMIPostRA(unsigned Reg, MachineInstr &MI, + bool &SeenIntermediateUse) const; + /// getRegNumForOperand - some operands use different numbering schemes /// for the same registers. For example, a VSX instruction may have any of /// vs0-vs63 allocated whereas an Altivec instruction could only have @@ -481,26 +485,14 @@ public: /// On PPC, we have two instructions used to set-up the hardware loop /// (MTCTRloop, MTCTR8loop) with corresponding endloop (BDNZ, BDNZ8) /// instructions to indicate the end of a loop. - MachineInstr *findLoopInstr(MachineBasicBlock &PreHeader) const; - - /// Analyze the loop code to find the loop induction variable and compare used - /// to compute the number of iterations. Currently, we analyze loop that are - /// controlled using hardware loops. In this case, the induction variable - /// instruction is null. For all other cases, this function returns true, - /// which means we're unable to analyze it. \p IndVarInst and \p CmpInst will - /// return new values when we can analyze the readonly loop \p L, otherwise, - /// nothing got changed - bool analyzeLoop(MachineLoop &L, MachineInstr *&IndVarInst, - MachineInstr *&CmpInst) const override; - /// Generate code to reduce the loop iteration by one and check if the loop - /// is finished. Return the value/register of the new loop count. 
We need - /// this function when peeling off one or more iterations of a loop. This - /// function assumes the last iteration is peeled first. - unsigned reduceLoopCount(MachineBasicBlock &MBB, MachineBasicBlock &PreHeader, - MachineInstr *IndVar, MachineInstr &Cmp, - SmallVectorImpl &Cond, - SmallVectorImpl &PrevInsts, - unsigned Iter, unsigned MaxIter) const override; + MachineInstr * + findLoopInstr(MachineBasicBlock &PreHeader, + SmallPtrSet &Visited) const; + + /// Analyze loop L, which must be a single-basic-block loop, and if the + /// conditions can be understood enough produce a PipelinerLoopInfo object. + std::unique_ptr + analyzeLoopForPipelining(MachineBasicBlock *LoopBB) const override; }; } diff --git a/lib/Target/PowerPC/PPCInstrInfo.td b/lib/Target/PowerPC/PPCInstrInfo.td index c313337047f0..24183277519b 100644 --- a/lib/Target/PowerPC/PPCInstrInfo.td +++ b/lib/Target/PowerPC/PPCInstrInfo.td @@ -386,7 +386,9 @@ def immZExt16 : PatLeaf<(imm), [{ // field. Used by instructions like 'ori'. return (uint64_t)N->getZExtValue() == (unsigned short)N->getZExtValue(); }], LO16>; -def immAnyExt8 : ImmLeaf(Imm) || isUInt<8>(Imm); }]>; +def immNonAllOneAnyExt8 : ImmLeaf(Imm) && (Imm != -1)) || (isUInt<8>(Imm) && (Imm != 0xFF)); +}]>; def immSExt5NonZero : ImmLeaf(Imm); }]>; // imm16Shifted* - These match immediates where the low 16-bits are zero. There @@ -577,7 +579,7 @@ def sperc : RegisterOperand { def PPCRegSPE4RCAsmOperand : AsmOperandClass { let Name = "RegSPE4RC"; let PredicateMethod = "isRegNumber"; } -def spe4rc : RegisterOperand { +def spe4rc : RegisterOperand { let ParserMatchClass = PPCRegSPE4RCAsmOperand; } @@ -3161,7 +3163,16 @@ def ADDISdtprelHA32 : PPCEmitTimePseudo<(outs gprc:$rD), (ins gprc_nor0:$reg, s1 def LWZtoc : PPCEmitTimePseudo<(outs gprc:$rD), (ins tocentry32:$disp, gprc:$reg), "#LWZtoc", [(set i32:$rD, + (PPCtoc_entry tglobaladdr:$disp, i32:$reg))]>; +def LWZtocL : PPCEmitTimePseudo<(outs gprc:$rD), (ins tocentry32:$disp, gprc_nor0:$reg), + "#LWZtocL", + [(set i32:$rD, (PPCtoc_entry tglobaladdr:$disp, i32:$reg))]>; +def ADDIStocHA : PPCEmitTimePseudo<(outs gprc:$rD), (ins gprc_nor0:$reg, tocentry32:$disp), + "#ADDIStocHA", + [(set i32:$rD, + (PPCtoc_entry i32:$reg, tglobaladdr:$disp))]>; + // Get Global (GOT) Base Register offset, from the word immediately preceding // the function label. 
def UpdateGBR : PPCEmitTimePseudo<(outs gprc:$rD, gprc:$rT), (ins gprc:$rI), "#UpdateGBR", []>; @@ -3177,21 +3188,21 @@ def : Pat<(srl i32:$rS, i32:$rB), def : Pat<(shl i32:$rS, i32:$rB), (SLW $rS, $rB)>; -def : Pat<(zextloadi1 iaddr:$src), +def : Pat<(i32 (zextloadi1 iaddr:$src)), (LBZ iaddr:$src)>; -def : Pat<(zextloadi1 xaddr:$src), +def : Pat<(i32 (zextloadi1 xaddr:$src)), (LBZX xaddr:$src)>; -def : Pat<(extloadi1 iaddr:$src), +def : Pat<(i32 (extloadi1 iaddr:$src)), (LBZ iaddr:$src)>; -def : Pat<(extloadi1 xaddr:$src), +def : Pat<(i32 (extloadi1 xaddr:$src)), (LBZX xaddr:$src)>; -def : Pat<(extloadi8 iaddr:$src), +def : Pat<(i32 (extloadi8 iaddr:$src)), (LBZ iaddr:$src)>; -def : Pat<(extloadi8 xaddr:$src), +def : Pat<(i32 (extloadi8 xaddr:$src)), (LBZX xaddr:$src)>; -def : Pat<(extloadi16 iaddr:$src), +def : Pat<(i32 (extloadi16 iaddr:$src)), (LHZ iaddr:$src)>; -def : Pat<(extloadi16 xaddr:$src), +def : Pat<(i32 (extloadi16 xaddr:$src)), (LHZX xaddr:$src)>; let Predicates = [HasFPU] in { def : Pat<(f64 (extloadf32 iaddr:$src)), @@ -3564,23 +3575,6 @@ def : Pat<(i1 (setcc i32:$s1, imm:$imm, SETEQ)), (EXTRACT_SUBREG (CMPLWI (XORIS $s1, (HI16 imm:$imm)), (LO16 imm:$imm)), sub_eq)>; -defm : CRNotPat<(i1 (setcc i32:$s1, immZExt16:$imm, SETUGE)), - (EXTRACT_SUBREG (CMPLWI $s1, imm:$imm), sub_lt)>; -defm : CRNotPat<(i1 (setcc i32:$s1, imm32SExt16:$imm, SETGE)), - (EXTRACT_SUBREG (CMPWI $s1, imm:$imm), sub_lt)>; -defm : CRNotPat<(i1 (setcc i32:$s1, immZExt16:$imm, SETULE)), - (EXTRACT_SUBREG (CMPLWI $s1, imm:$imm), sub_gt)>; -defm : CRNotPat<(i1 (setcc i32:$s1, imm32SExt16:$imm, SETLE)), - (EXTRACT_SUBREG (CMPWI $s1, imm:$imm), sub_gt)>; -defm : CRNotPat<(i1 (setcc i32:$s1, imm32SExt16:$imm, SETNE)), - (EXTRACT_SUBREG (CMPWI $s1, imm:$imm), sub_eq)>; -defm : CRNotPat<(i1 (setcc i32:$s1, immZExt16:$imm, SETNE)), - (EXTRACT_SUBREG (CMPLWI $s1, imm:$imm), sub_eq)>; - -defm : CRNotPat<(i1 (setcc i32:$s1, imm:$imm, SETNE)), - (EXTRACT_SUBREG (CMPLWI (XORIS $s1, (HI16 imm:$imm)), - (LO16 imm:$imm)), sub_eq)>; - def : Pat<(i1 (setcc i32:$s1, i32:$s2, SETULT)), (EXTRACT_SUBREG (CMPLW $s1, $s2), sub_lt)>; def : Pat<(i1 (setcc i32:$s1, i32:$s2, SETLT)), @@ -3592,17 +3586,6 @@ def : Pat<(i1 (setcc i32:$s1, i32:$s2, SETGT)), def : Pat<(i1 (setcc i32:$s1, i32:$s2, SETEQ)), (EXTRACT_SUBREG (CMPW $s1, $s2), sub_eq)>; -defm : CRNotPat<(i1 (setcc i32:$s1, i32:$s2, SETUGE)), - (EXTRACT_SUBREG (CMPLW $s1, $s2), sub_lt)>; -defm : CRNotPat<(i1 (setcc i32:$s1, i32:$s2, SETGE)), - (EXTRACT_SUBREG (CMPW $s1, $s2), sub_lt)>; -defm : CRNotPat<(i1 (setcc i32:$s1, i32:$s2, SETULE)), - (EXTRACT_SUBREG (CMPLW $s1, $s2), sub_gt)>; -defm : CRNotPat<(i1 (setcc i32:$s1, i32:$s2, SETLE)), - (EXTRACT_SUBREG (CMPW $s1, $s2), sub_gt)>; -defm : CRNotPat<(i1 (setcc i32:$s1, i32:$s2, SETNE)), - (EXTRACT_SUBREG (CMPW $s1, $s2), sub_eq)>; - // SETCC for i64. 
def : Pat<(i1 (setcc i64:$s1, immZExt16:$imm, SETULT)), (EXTRACT_SUBREG (CMPLDI $s1, imm:$imm), sub_lt)>; @@ -3632,6 +3615,47 @@ def : Pat<(i1 (setcc i64:$s1, imm64ZExt32:$imm, SETEQ)), (EXTRACT_SUBREG (CMPLDI (XORIS8 $s1, (HI16 imm:$imm)), (LO16 imm:$imm)), sub_eq)>; +def : Pat<(i1 (setcc i64:$s1, i64:$s2, SETULT)), + (EXTRACT_SUBREG (CMPLD $s1, $s2), sub_lt)>; +def : Pat<(i1 (setcc i64:$s1, i64:$s2, SETLT)), + (EXTRACT_SUBREG (CMPD $s1, $s2), sub_lt)>; +def : Pat<(i1 (setcc i64:$s1, i64:$s2, SETUGT)), + (EXTRACT_SUBREG (CMPLD $s1, $s2), sub_gt)>; +def : Pat<(i1 (setcc i64:$s1, i64:$s2, SETGT)), + (EXTRACT_SUBREG (CMPD $s1, $s2), sub_gt)>; +def : Pat<(i1 (setcc i64:$s1, i64:$s2, SETEQ)), + (EXTRACT_SUBREG (CMPD $s1, $s2), sub_eq)>; + +// Instantiations of CRNotPat for i32. +defm : CRNotPat<(i1 (setcc i32:$s1, immZExt16:$imm, SETUGE)), + (EXTRACT_SUBREG (CMPLWI $s1, imm:$imm), sub_lt)>; +defm : CRNotPat<(i1 (setcc i32:$s1, imm32SExt16:$imm, SETGE)), + (EXTRACT_SUBREG (CMPWI $s1, imm:$imm), sub_lt)>; +defm : CRNotPat<(i1 (setcc i32:$s1, immZExt16:$imm, SETULE)), + (EXTRACT_SUBREG (CMPLWI $s1, imm:$imm), sub_gt)>; +defm : CRNotPat<(i1 (setcc i32:$s1, imm32SExt16:$imm, SETLE)), + (EXTRACT_SUBREG (CMPWI $s1, imm:$imm), sub_gt)>; +defm : CRNotPat<(i1 (setcc i32:$s1, imm32SExt16:$imm, SETNE)), + (EXTRACT_SUBREG (CMPWI $s1, imm:$imm), sub_eq)>; +defm : CRNotPat<(i1 (setcc i32:$s1, immZExt16:$imm, SETNE)), + (EXTRACT_SUBREG (CMPLWI $s1, imm:$imm), sub_eq)>; + +defm : CRNotPat<(i1 (setcc i32:$s1, imm:$imm, SETNE)), + (EXTRACT_SUBREG (CMPLWI (XORIS $s1, (HI16 imm:$imm)), + (LO16 imm:$imm)), sub_eq)>; + +defm : CRNotPat<(i1 (setcc i32:$s1, i32:$s2, SETUGE)), + (EXTRACT_SUBREG (CMPLW $s1, $s2), sub_lt)>; +defm : CRNotPat<(i1 (setcc i32:$s1, i32:$s2, SETGE)), + (EXTRACT_SUBREG (CMPW $s1, $s2), sub_lt)>; +defm : CRNotPat<(i1 (setcc i32:$s1, i32:$s2, SETULE)), + (EXTRACT_SUBREG (CMPLW $s1, $s2), sub_gt)>; +defm : CRNotPat<(i1 (setcc i32:$s1, i32:$s2, SETLE)), + (EXTRACT_SUBREG (CMPW $s1, $s2), sub_gt)>; +defm : CRNotPat<(i1 (setcc i32:$s1, i32:$s2, SETNE)), + (EXTRACT_SUBREG (CMPW $s1, $s2), sub_eq)>; + +// Instantiations of CRNotPat for i64. defm : CRNotPat<(i1 (setcc i64:$s1, immZExt16:$imm, SETUGE)), (EXTRACT_SUBREG (CMPLDI $s1, imm:$imm), sub_lt)>; defm : CRNotPat<(i1 (setcc i64:$s1, imm64SExt16:$imm, SETGE)), @@ -3649,17 +3673,6 @@ defm : CRNotPat<(i1 (setcc i64:$s1, imm64ZExt32:$imm, SETNE)), (EXTRACT_SUBREG (CMPLDI (XORIS8 $s1, (HI16 imm:$imm)), (LO16 imm:$imm)), sub_eq)>; -def : Pat<(i1 (setcc i64:$s1, i64:$s2, SETULT)), - (EXTRACT_SUBREG (CMPLD $s1, $s2), sub_lt)>; -def : Pat<(i1 (setcc i64:$s1, i64:$s2, SETLT)), - (EXTRACT_SUBREG (CMPD $s1, $s2), sub_lt)>; -def : Pat<(i1 (setcc i64:$s1, i64:$s2, SETUGT)), - (EXTRACT_SUBREG (CMPLD $s1, $s2), sub_gt)>; -def : Pat<(i1 (setcc i64:$s1, i64:$s2, SETGT)), - (EXTRACT_SUBREG (CMPD $s1, $s2), sub_gt)>; -def : Pat<(i1 (setcc i64:$s1, i64:$s2, SETEQ)), - (EXTRACT_SUBREG (CMPD $s1, $s2), sub_eq)>; - defm : CRNotPat<(i1 (setcc i64:$s1, i64:$s2, SETUGE)), (EXTRACT_SUBREG (CMPLD $s1, $s2), sub_lt)>; defm : CRNotPat<(i1 (setcc i64:$s1, i64:$s2, SETGE)), @@ -3671,6 +3684,56 @@ defm : CRNotPat<(i1 (setcc i64:$s1, i64:$s2, SETLE)), defm : CRNotPat<(i1 (setcc i64:$s1, i64:$s2, SETNE)), (EXTRACT_SUBREG (CMPD $s1, $s2), sub_eq)>; +let Predicates = [HasFPU] in { +// Instantiations of CRNotPat for f32. 
+defm : CRNotPat<(i1 (setcc f32:$s1, f32:$s2, SETUGE)), + (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_lt)>; +defm : CRNotPat<(i1 (setcc f32:$s1, f32:$s2, SETGE)), + (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_lt)>; +defm : CRNotPat<(i1 (setcc f32:$s1, f32:$s2, SETULE)), + (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_gt)>; +defm : CRNotPat<(i1 (setcc f32:$s1, f32:$s2, SETLE)), + (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_gt)>; +defm : CRNotPat<(i1 (setcc f32:$s1, f32:$s2, SETUNE)), + (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_eq)>; +defm : CRNotPat<(i1 (setcc f32:$s1, f32:$s2, SETNE)), + (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_eq)>; +defm : CRNotPat<(i1 (setcc f32:$s1, f32:$s2, SETO)), + (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_un)>; + +// Instantiations of CRNotPat for f64. +defm : CRNotPat<(i1 (setcc f64:$s1, f64:$s2, SETUGE)), + (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_lt)>; +defm : CRNotPat<(i1 (setcc f64:$s1, f64:$s2, SETGE)), + (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_lt)>; +defm : CRNotPat<(i1 (setcc f64:$s1, f64:$s2, SETULE)), + (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_gt)>; +defm : CRNotPat<(i1 (setcc f64:$s1, f64:$s2, SETLE)), + (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_gt)>; +defm : CRNotPat<(i1 (setcc f64:$s1, f64:$s2, SETUNE)), + (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_eq)>; +defm : CRNotPat<(i1 (setcc f64:$s1, f64:$s2, SETNE)), + (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_eq)>; +defm : CRNotPat<(i1 (setcc f64:$s1, f64:$s2, SETO)), + (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_un)>; + +// Instantiations of CRNotPat for f128. +defm : CRNotPat<(i1 (setcc f128:$s1, f128:$s2, SETUGE)), + (EXTRACT_SUBREG (XSCMPUQP $s1, $s2), sub_lt)>; +defm : CRNotPat<(i1 (setcc f128:$s1, f128:$s2, SETGE)), + (EXTRACT_SUBREG (XSCMPUQP $s1, $s2), sub_lt)>; +defm : CRNotPat<(i1 (setcc f128:$s1, f128:$s2, SETULE)), + (EXTRACT_SUBREG (XSCMPUQP $s1, $s2), sub_gt)>; +defm : CRNotPat<(i1 (setcc f128:$s1, f128:$s2, SETLE)), + (EXTRACT_SUBREG (XSCMPUQP $s1, $s2), sub_gt)>; +defm : CRNotPat<(i1 (setcc f128:$s1, f128:$s2, SETUNE)), + (EXTRACT_SUBREG (XSCMPUQP $s1, $s2), sub_eq)>; +defm : CRNotPat<(i1 (setcc f128:$s1, f128:$s2, SETNE)), + (EXTRACT_SUBREG (XSCMPUQP $s1, $s2), sub_eq)>; +defm : CRNotPat<(i1 (setcc f128:$s1, f128:$s2, SETO)), + (EXTRACT_SUBREG (XSCMPUQP $s1, $s2), sub_un)>; +} + // SETCC for f32. let Predicates = [HasFPU] in { def : Pat<(i1 (setcc f32:$s1, f32:$s2, SETOLT)), @@ -3688,21 +3751,6 @@ def : Pat<(i1 (setcc f32:$s1, f32:$s2, SETEQ)), def : Pat<(i1 (setcc f32:$s1, f32:$s2, SETUO)), (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_un)>; -defm : CRNotPat<(i1 (setcc f32:$s1, f32:$s2, SETUGE)), - (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_lt)>; -defm : CRNotPat<(i1 (setcc f32:$s1, f32:$s2, SETGE)), - (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_lt)>; -defm : CRNotPat<(i1 (setcc f32:$s1, f32:$s2, SETULE)), - (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_gt)>; -defm : CRNotPat<(i1 (setcc f32:$s1, f32:$s2, SETLE)), - (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_gt)>; -defm : CRNotPat<(i1 (setcc f32:$s1, f32:$s2, SETUNE)), - (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_eq)>; -defm : CRNotPat<(i1 (setcc f32:$s1, f32:$s2, SETNE)), - (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_eq)>; -defm : CRNotPat<(i1 (setcc f32:$s1, f32:$s2, SETO)), - (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_un)>; - // SETCC for f64. 
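// A small illustrative model, not LLVM code, of why the CRNotPat
// instantiations above exist: predicates such as uge/ge/ule/le/(u)ne/o have no
// dedicated CR bit after a compare, but each is the complement of a bit that
// is set, so the pattern inverts that bit (roughly, a crnot of the extracted
// sub-register) instead of issuing another compare.
struct CRBits { bool lt, gt, eq, un; }; // one CR field after cmpw/cmpd/fcmpu

static bool setUGE(const CRBits &CR) { return !CR.lt; } // SETUGE == !LT
static bool setULE(const CRBits &CR) { return !CR.gt; } // SETULE == !GT
static bool setNE (const CRBits &CR) { return !CR.eq; } // SETNE  == !EQ
static bool setO  (const CRBits &CR) { return !CR.un; } // SETO   == !UN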
def : Pat<(i1 (setcc f64:$s1, f64:$s2, SETOLT)), (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_lt)>; @@ -3719,21 +3767,6 @@ def : Pat<(i1 (setcc f64:$s1, f64:$s2, SETEQ)), def : Pat<(i1 (setcc f64:$s1, f64:$s2, SETUO)), (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_un)>; -defm : CRNotPat<(i1 (setcc f64:$s1, f64:$s2, SETUGE)), - (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_lt)>; -defm : CRNotPat<(i1 (setcc f64:$s1, f64:$s2, SETGE)), - (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_lt)>; -defm : CRNotPat<(i1 (setcc f64:$s1, f64:$s2, SETULE)), - (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_gt)>; -defm : CRNotPat<(i1 (setcc f64:$s1, f64:$s2, SETLE)), - (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_gt)>; -defm : CRNotPat<(i1 (setcc f64:$s1, f64:$s2, SETUNE)), - (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_eq)>; -defm : CRNotPat<(i1 (setcc f64:$s1, f64:$s2, SETNE)), - (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_eq)>; -defm : CRNotPat<(i1 (setcc f64:$s1, f64:$s2, SETO)), - (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_un)>; - // SETCC for f128. def : Pat<(i1 (setcc f128:$s1, f128:$s2, SETOLT)), (EXTRACT_SUBREG (XSCMPUQP $s1, $s2), sub_lt)>; @@ -3750,21 +3783,6 @@ def : Pat<(i1 (setcc f128:$s1, f128:$s2, SETEQ)), def : Pat<(i1 (setcc f128:$s1, f128:$s2, SETUO)), (EXTRACT_SUBREG (XSCMPUQP $s1, $s2), sub_un)>; -defm : CRNotPat<(i1 (setcc f128:$s1, f128:$s2, SETUGE)), - (EXTRACT_SUBREG (XSCMPUQP $s1, $s2), sub_lt)>; -defm : CRNotPat<(i1 (setcc f128:$s1, f128:$s2, SETGE)), - (EXTRACT_SUBREG (XSCMPUQP $s1, $s2), sub_lt)>; -defm : CRNotPat<(i1 (setcc f128:$s1, f128:$s2, SETULE)), - (EXTRACT_SUBREG (XSCMPUQP $s1, $s2), sub_gt)>; -defm : CRNotPat<(i1 (setcc f128:$s1, f128:$s2, SETLE)), - (EXTRACT_SUBREG (XSCMPUQP $s1, $s2), sub_gt)>; -defm : CRNotPat<(i1 (setcc f128:$s1, f128:$s2, SETUNE)), - (EXTRACT_SUBREG (XSCMPUQP $s1, $s2), sub_eq)>; -defm : CRNotPat<(i1 (setcc f128:$s1, f128:$s2, SETNE)), - (EXTRACT_SUBREG (XSCMPUQP $s1, $s2), sub_eq)>; -defm : CRNotPat<(i1 (setcc f128:$s1, f128:$s2, SETO)), - (EXTRACT_SUBREG (XSCMPUQP $s1, $s2), sub_un)>; - } // This must be in this file because it relies on patterns defined in this file diff --git a/lib/Target/PowerPC/PPCInstrVSX.td b/lib/Target/PowerPC/PPCInstrVSX.td index 07f38a61d098..2aad5860d87f 100644 --- a/lib/Target/PowerPC/PPCInstrVSX.td +++ b/lib/Target/PowerPC/PPCInstrVSX.td @@ -58,8 +58,12 @@ def SDT_PPCldvsxlh : SDTypeProfile<1, 1, [ SDTCisVT<0, v4f32>, SDTCisPtrTy<1> ]>; -def SDT_PPCfpextlh : SDTypeProfile<1, 1, [ - SDTCisVT<0, v2f64>, SDTCisVT<1, v4f32> +def SDT_PPCfpexth : SDTypeProfile<1, 2, [ + SDTCisVT<0, v2f64>, SDTCisVT<1, v4f32>, SDTCisPtrTy<2> +]>; + +def SDT_PPCldsplat : SDTypeProfile<1, 1, [ + SDTCisVec<0>, SDTCisPtrTy<1> ]>; // Little-endian-specific nodes. 
@@ -78,12 +82,21 @@ def SDTVecConv : SDTypeProfile<1, 2, [ def SDTVabsd : SDTypeProfile<1, 3, [ SDTCisVec<0>, SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisVT<3, i32> ]>; - +def SDT_PPCld_vec_be : SDTypeProfile<1, 1, [ + SDTCisVec<0>, SDTCisPtrTy<1> +]>; +def SDT_PPCst_vec_be : SDTypeProfile<0, 2, [ + SDTCisVec<0>, SDTCisPtrTy<1> +]>; def PPClxvd2x : SDNode<"PPCISD::LXVD2X", SDT_PPClxvd2x, [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; def PPCstxvd2x : SDNode<"PPCISD::STXVD2X", SDT_PPCstxvd2x, [SDNPHasChain, SDNPMayStore]>; +def PPCld_vec_be : SDNode<"PPCISD::LOAD_VEC_BE", SDT_PPCld_vec_be, + [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; +def PPCst_vec_be : SDNode<"PPCISD::STORE_VEC_BE", SDT_PPCst_vec_be, + [SDNPHasChain, SDNPMayStore]>; def PPCxxswapd : SDNode<"PPCISD::XXSWAPD", SDT_PPCxxswapd, [SDNPHasChain]>; def PPCmfvsr : SDNode<"PPCISD::MFVSR", SDTUnaryOp, []>; def PPCmtvsra : SDNode<"PPCISD::MTVSRA", SDTUnaryOp, []>; @@ -93,9 +106,11 @@ def PPCuvec2fp: SDNode<"PPCISD::UINT_VEC_TO_FP", SDTVecConv, []>; def PPCswapNoChain : SDNode<"PPCISD::SWAP_NO_CHAIN", SDT_PPCxxswapd>; def PPCvabsd : SDNode<"PPCISD::VABSD", SDTVabsd, []>; -def PPCfpextlh : SDNode<"PPCISD::FP_EXTEND_LH", SDT_PPCfpextlh, []>; +def PPCfpexth : SDNode<"PPCISD::FP_EXTEND_HALF", SDT_PPCfpexth, []>; def PPCldvsxlh : SDNode<"PPCISD::LD_VSX_LH", SDT_PPCldvsxlh, [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; +def PPCldsplat : SDNode<"PPCISD::LD_SPLAT", SDT_PPCldsplat, + [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; multiclass XX3Form_Rcr opcode, bits<7> xo, string asmbase, string asmstr, InstrItinClass itin, Intrinsic Int, @@ -855,14 +870,14 @@ let Uses = [RM] in { let isCodeGenOnly = 1, isMoveImm = 1, isAsCheapAsAMove = 1, isReMaterializable = 1 in { - def XXLXORz : XX3Form_Zero<60, 154, (outs vsrc:$XT), (ins), + def XXLXORz : XX3Form_SameOp<60, 154, (outs vsrc:$XT), (ins), "xxlxor $XT, $XT, $XT", IIC_VecGeneral, [(set v4i32:$XT, (v4i32 immAllZerosV))]>; - def XXLXORdpz : XX3Form_SetZero<60, 154, + def XXLXORdpz : XX3Form_SameOp<60, 154, (outs vsfrc:$XT), (ins), "xxlxor $XT, $XT, $XT", IIC_VecGeneral, [(set f64:$XT, (fpimm0))]>; - def XXLXORspz : XX3Form_SetZero<60, 154, + def XXLXORspz : XX3Form_SameOp<60, 154, (outs vssrc:$XT), (ins), "xxlxor $XT, $XT, $XT", IIC_VecGeneral, [(set f32:$XT, (fpimm0))]>; @@ -996,21 +1011,21 @@ def : Pat<(f64 (extractelt v2f64:$S, 1)), (f64 (EXTRACT_SUBREG $S, sub_64))>; } -// Additional fnmsub patterns: -a*c + b == -(a*c - b) -def : Pat<(fma (fneg f64:$A), f64:$C, f64:$B), - (XSNMSUBADP $B, $C, $A)>; -def : Pat<(fma f64:$A, (fneg f64:$C), f64:$B), - (XSNMSUBADP $B, $C, $A)>; +// Additional fnmsub patterns: -a*b + c == -(a*b - c) +def : Pat<(fma (fneg f64:$A), f64:$B, f64:$C), + (XSNMSUBADP $C, $A, $B)>; +def : Pat<(fma f64:$A, (fneg f64:$B), f64:$C), + (XSNMSUBADP $C, $A, $B)>; -def : Pat<(fma (fneg v2f64:$A), v2f64:$C, v2f64:$B), - (XVNMSUBADP $B, $C, $A)>; -def : Pat<(fma v2f64:$A, (fneg v2f64:$C), v2f64:$B), - (XVNMSUBADP $B, $C, $A)>; +def : Pat<(fma (fneg v2f64:$A), v2f64:$B, v2f64:$C), + (XVNMSUBADP $C, $A, $B)>; +def : Pat<(fma v2f64:$A, (fneg v2f64:$B), v2f64:$C), + (XVNMSUBADP $C, $A, $B)>; -def : Pat<(fma (fneg v4f32:$A), v4f32:$C, v4f32:$B), - (XVNMSUBASP $B, $C, $A)>; -def : Pat<(fma v4f32:$A, (fneg v4f32:$C), v4f32:$B), - (XVNMSUBASP $B, $C, $A)>; +def : Pat<(fma (fneg v4f32:$A), v4f32:$B, v4f32:$C), + (XVNMSUBASP $C, $A, $B)>; +def : Pat<(fma v4f32:$A, (fneg v4f32:$B), v4f32:$C), + (XVNMSUBASP $C, $A, $B)>; def : Pat<(v2f64 (bitconvert v4f32:$A)), (COPY_TO_REGCLASS $A, VSRC)>; @@ 
-1077,7 +1092,8 @@ def : Pat<(v2f64 (PPCuvec2fp v4i32:$C, 0)), def : Pat<(v2f64 (PPCuvec2fp v4i32:$C, 1)), (v2f64 (XVCVUXWDP (v2i64 (XXMRGLW $C, $C))))>; -def : Pat<(v2f64 (PPCfpextlh v4f32:$C)), (XVCVSPDP (XXMRGHW $C, $C))>; +def : Pat<(v2f64 (PPCfpexth v4f32:$C, 0)), (XVCVSPDP (XXMRGHW $C, $C))>; +def : Pat<(v2f64 (PPCfpexth v4f32:$C, 1)), (XVCVSPDP (XXMRGLW $C, $C))>; // Loads. let Predicates = [HasVSX, HasOnlySwappingMemOps] in { @@ -1088,6 +1104,19 @@ let Predicates = [HasVSX, HasOnlySwappingMemOps] in { (STXVD2X $rS, xoaddr:$dst)>; def : Pat<(PPCstxvd2x v2f64:$rS, xoaddr:$dst), (STXVD2X $rS, xoaddr:$dst)>; } + +// Load vector big endian order +let Predicates = [IsLittleEndian, HasVSX] in { + def : Pat<(v2f64 (PPCld_vec_be xoaddr:$src)), (LXVD2X xoaddr:$src)>; + def : Pat<(PPCst_vec_be v2f64:$rS, xoaddr:$dst), (STXVD2X $rS, xoaddr:$dst)>; + def : Pat<(v4f32 (PPCld_vec_be xoaddr:$src)), (LXVW4X xoaddr:$src)>; + def : Pat<(PPCst_vec_be v4f32:$rS, xoaddr:$dst), (STXVW4X $rS, xoaddr:$dst)>; + def : Pat<(v2i64 (PPCld_vec_be xoaddr:$src)), (LXVD2X xoaddr:$src)>; + def : Pat<(PPCst_vec_be v2i64:$rS, xoaddr:$dst), (STXVD2X $rS, xoaddr:$dst)>; + def : Pat<(v4i32 (PPCld_vec_be xoaddr:$src)), (LXVW4X xoaddr:$src)>; + def : Pat<(PPCst_vec_be v4i32:$rS, xoaddr:$dst), (STXVW4X $rS, xoaddr:$dst)>; +} + let Predicates = [IsBigEndian, HasVSX, HasOnlySwappingMemOps] in { def : Pat<(v2f64 (load xoaddr:$src)), (LXVD2X xoaddr:$src)>; def : Pat<(v2i64 (load xoaddr:$src)), (LXVD2X xoaddr:$src)>; @@ -1288,6 +1317,13 @@ let AddedComplexity = 400 in { // Prefer VSX patterns over non-VSX patterns. def : Pat<(int_ppc_vsx_xxleqv v4i32:$A, v4i32:$B), (XXLEQV $A, $B)>; + let isCodeGenOnly = 1, isMoveImm = 1, isAsCheapAsAMove = 1, + isReMaterializable = 1 in { + def XXLEQVOnes : XX3Form_SameOp<60, 186, (outs vsrc:$XT), (ins), + "xxleqv $XT, $XT, $XT", IIC_VecGeneral, + [(set v4i32:$XT, (bitconvert (v16i8 immAllOnesV)))]>; + } + def XXLORC : XX3Form<60, 170, (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), "xxlorc $XT, $XA, $XB", IIC_VecGeneral, @@ -1476,6 +1512,12 @@ let AddedComplexity = 400 in { // Prefer VSX patterns over non-VSX patterns. 
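// A standalone check, with plain doubles, of the identity behind the corrected
// fnmsub/xsnmsubasp patterns in the surrounding hunks: -a*b + c == -(a*b - c),
// which the negative multiply-subtract A-forms compute when fed
// (addend, multiplicand, multiplicand) as in (XSNMSUBADP $C, $A, $B).
// Sketch only; values chosen so both sides are exact.
#include <cassert>
#include <cmath>

int main() {
  double a = 3.0, b = 4.0, c = 5.0;
  double negMulAdd = std::fma(-a, b, c); // -a*b + c, the DAG-level form
  double fnmsub = -(a * b - c);          // the negative multiply-subtract form
  assert(negMulAdd == fnmsub && fnmsub == -7.0);
  return 0;
}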
AltVSXFMARel; } + // Additional xsnmsubasp patterns: -a*b + c == -(a*b - c) + def : Pat<(fma (fneg f32:$A), f32:$B, f32:$C), + (XSNMSUBASP $C, $A, $B)>; + def : Pat<(fma f32:$A, (fneg f32:$B), f32:$C), + (XSNMSUBASP $C, $A, $B)>; + // Single Precision Conversions (FP <-> INT) def XSCVSXDSP : XX2Form<60, 312, (outs vssrc:$XT), (ins vsfrc:$XB), @@ -1564,16 +1606,33 @@ let Predicates = [HasDirectMove] in { def MFVSRWZ : XX1_RS6_RD5_XO<31, 115, (outs gprc:$rA), (ins vsfrc:$XT), "mfvsrwz $rA, $XT", IIC_VecGeneral, [(set i32:$rA, (PPCmfvsr f64:$XT))]>; + let isCodeGenOnly = 1 in + def MFVRWZ : XX1_RS6_RD5_XO<31, 115, (outs gprc:$rA), (ins vsrc:$XT), + "mfvsrwz $rA, $XT", IIC_VecGeneral, + []>; def MTVSRD : XX1_RS6_RD5_XO<31, 179, (outs vsfrc:$XT), (ins g8rc:$rA), "mtvsrd $XT, $rA", IIC_VecGeneral, [(set f64:$XT, (PPCmtvsra i64:$rA))]>, Requires<[In64BitMode]>; + let isCodeGenOnly = 1 in + def MTVRD : XX1_RS6_RD5_XO<31, 179, (outs vsrc:$XT), (ins g8rc:$rA), + "mtvsrd $XT, $rA", IIC_VecGeneral, + []>, + Requires<[In64BitMode]>; def MTVSRWA : XX1_RS6_RD5_XO<31, 211, (outs vsfrc:$XT), (ins gprc:$rA), "mtvsrwa $XT, $rA", IIC_VecGeneral, [(set f64:$XT, (PPCmtvsra i32:$rA))]>; + let isCodeGenOnly = 1 in + def MTVRWA : XX1_RS6_RD5_XO<31, 211, (outs vsrc:$XT), (ins gprc:$rA), + "mtvsrwa $XT, $rA", IIC_VecGeneral, + []>; def MTVSRWZ : XX1_RS6_RD5_XO<31, 243, (outs vsfrc:$XT), (ins gprc:$rA), "mtvsrwz $XT, $rA", IIC_VecGeneral, [(set f64:$XT, (PPCmtvsrz i32:$rA))]>; + let isCodeGenOnly = 1 in + def MTVRWZ : XX1_RS6_RD5_XO<31, 243, (outs vsrc:$XT), (ins gprc:$rA), + "mtvsrwz $XT, $rA", IIC_VecGeneral, + []>; } // HasDirectMove let Predicates = [IsISA3_0, HasDirectMove] in { @@ -1597,6 +1656,22 @@ def : InstAlias<"mfvrd $rA, $XT", (MFVRD g8rc:$rA, vrrc:$XT), 0>; def : InstAlias<"mffprd $rA, $src", (MFVSRD g8rc:$rA, f8rc:$src)>; +def : InstAlias<"mtvrd $XT, $rA", + (MTVRD vrrc:$XT, g8rc:$rA), 0>; +def : InstAlias<"mtfprd $dst, $rA", + (MTVSRD f8rc:$dst, g8rc:$rA)>; +def : InstAlias<"mfvrwz $rA, $XT", + (MFVRWZ gprc:$rA, vrrc:$XT), 0>; +def : InstAlias<"mffprwz $rA, $src", + (MFVSRWZ gprc:$rA, f8rc:$src)>; +def : InstAlias<"mtvrwa $XT, $rA", + (MTVRWA vrrc:$XT, gprc:$rA), 0>; +def : InstAlias<"mtfprwa $dst, $rA", + (MTVSRWA f8rc:$dst, gprc:$rA)>; +def : InstAlias<"mtvrwz $XT, $rA", + (MTVRWZ vrrc:$XT, gprc:$rA), 0>; +def : InstAlias<"mtfprwz $dst, $rA", + (MTVSRWZ f8rc:$dst, gprc:$rA)>; /* Direct moves of various widths from GPR's into VSR's. Each move lines the value up into element 0 (both BE and LE). 
Namely, entities smaller than @@ -2581,9 +2656,9 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in { (fneg (int_ppc_fmaf128_round_to_odd f128:$vA, f128:$vB, (fneg f128:$vTi))))]>; - // Additional fnmsub patterns: -a*c + b == -(a*c - b) - def : Pat<(fma (fneg f128:$A), f128:$C, f128:$B), (XSNMSUBQP $B, $C, $A)>; - def : Pat<(fma f128:$A, (fneg f128:$C), f128:$B), (XSNMSUBQP $B, $C, $A)>; + // Additional fnmsub patterns: -a*b + c == -(a*b - c) + def : Pat<(fma (fneg f128:$A), f128:$B, f128:$C), (XSNMSUBQP $C, $A, $B)>; + def : Pat<(fma f128:$A, (fneg f128:$B), f128:$C), (XSNMSUBQP $C, $A, $B)>; //===--------------------------------------------------------------------===// // Quad/Double-Precision Compare Instructions: @@ -2799,12 +2874,12 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in { (outs vsrc:$XT), (ins u7imm:$DCMX, vsrc:$XB), "xvtstdcsp $XT, $XB, $DCMX", IIC_VecFP, [(set v4i32: $XT, - (int_ppc_vsx_xvtstdcsp v4f32:$XB, imm:$DCMX))]>; + (int_ppc_vsx_xvtstdcsp v4f32:$XB, timm:$DCMX))]>; def XVTSTDCDP : XX2_RD6_DCMX7_RS6<60, 15, 5, (outs vsrc:$XT), (ins u7imm:$DCMX, vsrc:$XB), "xvtstdcdp $XT, $XB, $DCMX", IIC_VecFP, [(set v2i64: $XT, - (int_ppc_vsx_xvtstdcdp v2f64:$XB, imm:$DCMX))]>; + (int_ppc_vsx_xvtstdcdp v2f64:$XB, timm:$DCMX))]>; //===--------------------------------------------------------------------===// @@ -3024,6 +3099,16 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in { (v4f32 (XXINSERTW v4f32:$A, AlignValues.F32_TO_BE_WORD1, 4))>; def : Pat<(v4f32 (insertelt v4f32:$A, f32:$B, 3)), (v4f32 (XXINSERTW v4f32:$A, AlignValues.F32_TO_BE_WORD1, 0))>; + + def : Pat<(v8i16 (PPCld_vec_be xoaddr:$src)), + (COPY_TO_REGCLASS (LXVH8X xoaddr:$src), VRRC)>; + def : Pat<(PPCst_vec_be v8i16:$rS, xoaddr:$dst), + (STXVH8X (COPY_TO_REGCLASS $rS, VSRC), xoaddr:$dst)>; + + def : Pat<(v16i8 (PPCld_vec_be xoaddr:$src)), + (COPY_TO_REGCLASS (LXVB16X xoaddr:$src), VRRC)>; + def : Pat<(PPCst_vec_be v16i8:$rS, xoaddr:$dst), + (STXVB16X (COPY_TO_REGCLASS $rS, VSRC), xoaddr:$dst)>; } // IsLittleEndian, HasP9Vector let Predicates = [IsBigEndian, HasP9Vector] in { @@ -3059,7 +3144,7 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in { (v4f32 (XXINSERTW v4f32:$A, AlignValues.F32_TO_BE_WORD1, 8))>; def : Pat<(v4f32 (insertelt v4f32:$A, f32:$B, 3)), (v4f32 (XXINSERTW v4f32:$A, AlignValues.F32_TO_BE_WORD1, 12))>; - } // IsLittleEndian, HasP9Vector + } // IsBigEndian, HasP9Vector // D-Form Load/Store def : Pat<(v4i32 (quadwOffsetLoad iaddrX16:$src)), (LXV memrix16:$src)>; @@ -3858,6 +3943,10 @@ let AddedComplexity = 400 in { (XSCVDPUXWSs (XFLOADf32 xoaddr:$A)), VSRC), 1))>; def : Pat<(v4f32 (build_vector f32:$A, f32:$A, f32:$A, f32:$A)), (v4f32 (XXSPLTW (v4f32 (XSCVDPSPN $A)), 0))>; + def : Pat<(v2f64 (PPCldsplat xoaddr:$A)), + (v2f64 (LXVDSX xoaddr:$A))>; + def : Pat<(v2i64 (PPCldsplat xoaddr:$A)), + (v2i64 (LXVDSX xoaddr:$A))>; // Build vectors of floating point converted to i64. 
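// An illustrative note, not LLVM code: XXLEQVOnes (defined in an earlier hunk
// of this file) relies on eqv being bitwise XNOR, so x XNOR x is all ones
// regardless of x; the destination register doubles as both sources, and the
// bitconvert patterns added nearby reuse that all-ones v4i32 for the other
// vector types.
#include <cstdint>

static uint32_t eqv(uint32_t A, uint32_t B) { return ~(A ^ B); }
// eqv(X, X) == 0xFFFFFFFFu for every X, so the prior register value is irrelevant.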
def : Pat<(v2i64 (build_vector FltToLong.A, FltToLong.A)), @@ -4063,27 +4152,32 @@ let AddedComplexity = 400 in { (XXSPLTW (COPY_TO_REGCLASS (MTVSRWZ $A), VSRC), 1)>; } + let Predicates = [HasP8Vector] in { + def : Pat<(v1i128 (bitconvert (v16i8 immAllOnesV))), + (v1i128 (COPY_TO_REGCLASS(XXLEQVOnes), VSRC))>; + def : Pat<(v2i64 (bitconvert (v16i8 immAllOnesV))), + (v2i64 (COPY_TO_REGCLASS(XXLEQVOnes), VSRC))>; + def : Pat<(v8i16 (bitconvert (v16i8 immAllOnesV))), + (v8i16 (COPY_TO_REGCLASS(XXLEQVOnes), VSRC))>; + def : Pat<(v16i8 (bitconvert (v16i8 immAllOnesV))), + (v16i8 (COPY_TO_REGCLASS(XXLEQVOnes), VSRC))>; + } + let Predicates = [HasP9Vector] in { // Endianness-neutral patterns for const splats with ISA 3.0 instructions. def : Pat<(v4i32 (scalar_to_vector i32:$A)), (v4i32 (MTVSRWS $A))>; def : Pat<(v4i32 (build_vector i32:$A, i32:$A, i32:$A, i32:$A)), (v4i32 (MTVSRWS $A))>; - def : Pat<(v16i8 (build_vector immAnyExt8:$A, immAnyExt8:$A, immAnyExt8:$A, - immAnyExt8:$A, immAnyExt8:$A, immAnyExt8:$A, - immAnyExt8:$A, immAnyExt8:$A, immAnyExt8:$A, - immAnyExt8:$A, immAnyExt8:$A, immAnyExt8:$A, - immAnyExt8:$A, immAnyExt8:$A, immAnyExt8:$A, - immAnyExt8:$A)), + def : Pat<(v16i8 (build_vector immNonAllOneAnyExt8:$A, immNonAllOneAnyExt8:$A, + immNonAllOneAnyExt8:$A, immNonAllOneAnyExt8:$A, + immNonAllOneAnyExt8:$A, immNonAllOneAnyExt8:$A, + immNonAllOneAnyExt8:$A, immNonAllOneAnyExt8:$A, + immNonAllOneAnyExt8:$A, immNonAllOneAnyExt8:$A, + immNonAllOneAnyExt8:$A, immNonAllOneAnyExt8:$A, + immNonAllOneAnyExt8:$A, immNonAllOneAnyExt8:$A, + immNonAllOneAnyExt8:$A, immNonAllOneAnyExt8:$A)), (v16i8 (COPY_TO_REGCLASS (XXSPLTIB imm:$A), VSRC))>; - def : Pat<(v16i8 immAllOnesV), - (v16i8 (COPY_TO_REGCLASS (XXSPLTIB 255), VSRC))>; - def : Pat<(v8i16 immAllOnesV), - (v8i16 (COPY_TO_REGCLASS (XXSPLTIB 255), VSRC))>; - def : Pat<(v4i32 immAllOnesV), - (v4i32 (XXSPLTIB 255))>; - def : Pat<(v2i64 immAllOnesV), - (v2i64 (XXSPLTIB 255))>; def : Pat<(v4i32 (scalar_to_vector FltToIntLoad.A)), (v4i32 (XVCVSPSXWS (LXVWSX xoaddr:$A)))>; def : Pat<(v4i32 (scalar_to_vector FltToUIntLoad.A)), @@ -4102,6 +4196,10 @@ let AddedComplexity = 400 in { (v2i64 (XXPERMDIs (XSCVDPUXDS (COPY_TO_REGCLASS (DFLOADf32 iaddrX4:$A), VSFRC)), 0))>; + def : Pat<(v4f32 (PPCldsplat xoaddr:$A)), + (v4f32 (LXVWSX xoaddr:$A))>; + def : Pat<(v4i32 (PPCldsplat xoaddr:$A)), + (v4i32 (LXVWSX xoaddr:$A))>; } let Predicates = [IsISA3_0, HasDirectMove, IsBigEndian] in { diff --git a/lib/Target/PowerPC/PPCLoopPreIncPrep.cpp b/lib/Target/PowerPC/PPCLoopPreIncPrep.cpp index 4d45d96d4479..d252cfbd26b1 100644 --- a/lib/Target/PowerPC/PPCLoopPreIncPrep.cpp +++ b/lib/Target/PowerPC/PPCLoopPreIncPrep.cpp @@ -63,8 +63,24 @@ static cl::opt MaxVars("ppc-preinc-prep-max-vars", cl::desc("Potential PHI threshold for PPC preinc loop prep")); STATISTIC(PHINodeAlreadyExists, "PHI node already in pre-increment form"); +STATISTIC(UpdFormChainRewritten, "Num of update form chain rewritten"); namespace { + struct BucketElement { + BucketElement(const SCEVConstant *O, Instruction *I) : Offset(O), Instr(I) {} + BucketElement(Instruction *I) : Offset(nullptr), Instr(I) {} + + const SCEVConstant *Offset; + Instruction *Instr; + }; + + struct Bucket { + Bucket(const SCEV *B, Instruction *I) : BaseSCEV(B), + Elements(1, BucketElement(I)) {} + + const SCEV *BaseSCEV; + SmallVector Elements; + }; class PPCLoopPreIncPrep : public FunctionPass { public: @@ -85,21 +101,47 @@ namespace { AU.addRequired(); } - bool alreadyPrepared(Loop *L, Instruction* MemI, - const SCEV 
*BasePtrStartSCEV, - const SCEVConstant *BasePtrIncSCEV); bool runOnFunction(Function &F) override; - bool runOnLoop(Loop *L); - void simplifyLoopLatch(Loop *L); - bool rotateLoop(Loop *L); - private: PPCTargetMachine *TM = nullptr; + const PPCSubtarget *ST; DominatorTree *DT; LoopInfo *LI; ScalarEvolution *SE; bool PreserveLCSSA; + + bool runOnLoop(Loop *L); + + /// Check if required PHI node is already exist in Loop \p L. + bool alreadyPrepared(Loop *L, Instruction* MemI, + const SCEV *BasePtrStartSCEV, + const SCEVConstant *BasePtrIncSCEV); + + /// Collect condition matched(\p isValidCandidate() returns true) + /// candidates in Loop \p L. + SmallVector + collectCandidates(Loop *L, + std::function + isValidCandidate, + unsigned MaxCandidateNum); + + /// Add a candidate to candidates \p Buckets. + void addOneCandidate(Instruction *MemI, const SCEV *LSCEV, + SmallVector &Buckets, + unsigned MaxCandidateNum); + + /// Prepare all candidates in \p Buckets for update form. + bool updateFormPrep(Loop *L, SmallVector &Buckets); + + /// Prepare for one chain \p BucketChain, find the best base element and + /// update all other elements in \p BucketChain accordingly. + bool prepareBaseForUpdateFormChain(Bucket &BucketChain); + + /// Rewrite load/store instructions in \p BucketChain according to + /// preparation. + bool rewriteLoadStores(Loop *L, Bucket &BucketChain, + SmallSet &BBChanged); }; } // end anonymous namespace @@ -111,30 +153,15 @@ INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) INITIALIZE_PASS_END(PPCLoopPreIncPrep, DEBUG_TYPE, name, false, false) +static const std::string PHINodeNameSuffix = ".phi"; +static const std::string CastNodeNameSuffix = ".cast"; +static const std::string GEPNodeIncNameSuffix = ".inc"; +static const std::string GEPNodeOffNameSuffix = ".off"; + FunctionPass *llvm::createPPCLoopPreIncPrepPass(PPCTargetMachine &TM) { return new PPCLoopPreIncPrep(TM); } -namespace { - - struct BucketElement { - BucketElement(const SCEVConstant *O, Instruction *I) : Offset(O), Instr(I) {} - BucketElement(Instruction *I) : Offset(nullptr), Instr(I) {} - - const SCEVConstant *Offset; - Instruction *Instr; - }; - - struct Bucket { - Bucket(const SCEV *B, Instruction *I) : BaseSCEV(B), - Elements(1, BucketElement(I)) {} - - const SCEV *BaseSCEV; - SmallVector Elements; - }; - -} // end anonymous namespace - static bool IsPtrInBounds(Value *BasePtr) { Value *StrippedBasePtr = BasePtr; while (BitCastInst *BC = dyn_cast(StrippedBasePtr)) @@ -145,6 +172,14 @@ static bool IsPtrInBounds(Value *BasePtr) { return false; } +static std::string getInstrName(const Value *I, const std::string Suffix) { + assert(I && "Invalid paramater!"); + if (I->hasName()) + return (I->getName() + Suffix).str(); + else + return ""; +} + static Value *GetPointerOperand(Value *MemI) { if (LoadInst *LMemI = dyn_cast(MemI)) { return LMemI->getPointerOperand(); @@ -167,6 +202,7 @@ bool PPCLoopPreIncPrep::runOnFunction(Function &F) { auto *DTWP = getAnalysisIfAvailable(); DT = DTWP ? &DTWP->getDomTree() : nullptr; PreserveLCSSA = mustPreserveAnalysisID(LCSSAID); + ST = TM ? 
TM->getSubtargetImpl(F) : nullptr; bool MadeChange = false; @@ -177,10 +213,280 @@ bool PPCLoopPreIncPrep::runOnFunction(Function &F) { return MadeChange; } +void PPCLoopPreIncPrep::addOneCandidate(Instruction *MemI, const SCEV *LSCEV, + SmallVector &Buckets, + unsigned MaxCandidateNum) { + assert((MemI && GetPointerOperand(MemI)) && + "Candidate should be a memory instruction."); + assert(LSCEV && "Invalid SCEV for Ptr value."); + bool FoundBucket = false; + for (auto &B : Buckets) { + const SCEV *Diff = SE->getMinusSCEV(LSCEV, B.BaseSCEV); + if (const auto *CDiff = dyn_cast(Diff)) { + B.Elements.push_back(BucketElement(CDiff, MemI)); + FoundBucket = true; + break; + } + } + + if (!FoundBucket) { + if (Buckets.size() == MaxCandidateNum) + return; + Buckets.push_back(Bucket(LSCEV, MemI)); + } +} + +SmallVector PPCLoopPreIncPrep::collectCandidates( + Loop *L, + std::function isValidCandidate, + unsigned MaxCandidateNum) { + SmallVector Buckets; + for (const auto &BB : L->blocks()) + for (auto &J : *BB) { + Value *PtrValue; + Instruction *MemI; + + if (LoadInst *LMemI = dyn_cast(&J)) { + MemI = LMemI; + PtrValue = LMemI->getPointerOperand(); + } else if (StoreInst *SMemI = dyn_cast(&J)) { + MemI = SMemI; + PtrValue = SMemI->getPointerOperand(); + } else if (IntrinsicInst *IMemI = dyn_cast(&J)) { + if (IMemI->getIntrinsicID() == Intrinsic::prefetch) { + MemI = IMemI; + PtrValue = IMemI->getArgOperand(0); + } else continue; + } else continue; + + unsigned PtrAddrSpace = PtrValue->getType()->getPointerAddressSpace(); + if (PtrAddrSpace) + continue; + + if (L->isLoopInvariant(PtrValue)) + continue; + + const SCEV *LSCEV = SE->getSCEVAtScope(PtrValue, L); + const SCEVAddRecExpr *LARSCEV = dyn_cast(LSCEV); + if (!LARSCEV || LARSCEV->getLoop() != L) + continue; + + if (isValidCandidate(&J, PtrValue)) + addOneCandidate(MemI, LSCEV, Buckets, MaxCandidateNum); + } + return Buckets; +} + +// TODO: implement a more clever base choosing policy. +// Currently we always choose an exist load/store offset. This maybe lead to +// suboptimal code sequences. For example, for one DS chain with offsets +// {-32769, 2003, 2007, 2011}, we choose -32769 as base offset, and left disp +// for load/stores are {0, 34772, 34776, 34780}. Though each offset now is a +// multipler of 4, it cannot be represented by sint16. +bool PPCLoopPreIncPrep::prepareBaseForUpdateFormChain(Bucket &BucketChain) { + // We have a choice now of which instruction's memory operand we use as the + // base for the generated PHI. Always picking the first instruction in each + // bucket does not work well, specifically because that instruction might + // be a prefetch (and there are no pre-increment dcbt variants). Otherwise, + // the choice is somewhat arbitrary, because the backend will happily + // generate direct offsets from both the pre-incremented and + // post-incremented pointer values. Thus, we'll pick the first non-prefetch + // instruction in each bucket, and adjust the recurrence and other offsets + // accordingly. + for (int j = 0, je = BucketChain.Elements.size(); j != je; ++j) { + if (auto *II = dyn_cast(BucketChain.Elements[j].Instr)) + if (II->getIntrinsicID() == Intrinsic::prefetch) + continue; + + // If we'd otherwise pick the first element anyway, there's nothing to do. + if (j == 0) + break; + + // If our chosen element has no offset from the base pointer, there's + // nothing to do. 
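// The arithmetic behind the TODO above, as a standalone sketch (illustrative
// names, no LLVM API): rebasing the example chain {-32769, 2003, 2007, 2011}
// on the existing offset -32769 gives displacements {0, 34772, 34776, 34780};
// each is a multiple of 4, yet all but the base overflow the signed 16-bit
// displacement field.
#include <cstdint>
#include <cstdio>

static bool fitsSInt16(int64_t V) { return V >= -32768 && V <= 32767; }

int main() {
  const int64_t Offsets[] = {-32769, 2003, 2007, 2011};
  const int64_t Base = Offsets[0]; // current policy: reuse an existing offset
  for (int64_t O : Offsets) {
    int64_t Disp = O - Base;
    std::printf("disp %lld: multiple of 4: %d, fits sint16: %d\n",
                (long long)Disp, int(Disp % 4 == 0), int(fitsSInt16(Disp)));
  }
  return 0;
}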
+ if (!BucketChain.Elements[j].Offset || + BucketChain.Elements[j].Offset->isZero()) + break; + + const SCEV *Offset = BucketChain.Elements[j].Offset; + BucketChain.BaseSCEV = SE->getAddExpr(BucketChain.BaseSCEV, Offset); + for (auto &E : BucketChain.Elements) { + if (E.Offset) + E.Offset = cast(SE->getMinusSCEV(E.Offset, Offset)); + else + E.Offset = cast(SE->getNegativeSCEV(Offset)); + } + + std::swap(BucketChain.Elements[j], BucketChain.Elements[0]); + break; + } + return true; +} + +bool PPCLoopPreIncPrep::rewriteLoadStores( + Loop *L, Bucket &BucketChain, SmallSet &BBChanged) { + bool MadeChange = false; + const SCEVAddRecExpr *BasePtrSCEV = + cast(BucketChain.BaseSCEV); + if (!BasePtrSCEV->isAffine()) + return MadeChange; + + LLVM_DEBUG(dbgs() << "PIP: Transforming: " << *BasePtrSCEV << "\n"); + + assert(BasePtrSCEV->getLoop() == L && "AddRec for the wrong loop?"); + + // The instruction corresponding to the Bucket's BaseSCEV must be the first + // in the vector of elements. + Instruction *MemI = BucketChain.Elements.begin()->Instr; + Value *BasePtr = GetPointerOperand(MemI); + assert(BasePtr && "No pointer operand"); + + Type *I8Ty = Type::getInt8Ty(MemI->getParent()->getContext()); + Type *I8PtrTy = Type::getInt8PtrTy(MemI->getParent()->getContext(), + BasePtr->getType()->getPointerAddressSpace()); + + const SCEV *BasePtrStartSCEV = BasePtrSCEV->getStart(); + if (!SE->isLoopInvariant(BasePtrStartSCEV, L)) + return MadeChange; + + const SCEVConstant *BasePtrIncSCEV = + dyn_cast(BasePtrSCEV->getStepRecurrence(*SE)); + if (!BasePtrIncSCEV) + return MadeChange; + BasePtrStartSCEV = SE->getMinusSCEV(BasePtrStartSCEV, BasePtrIncSCEV); + if (!isSafeToExpand(BasePtrStartSCEV, *SE)) + return MadeChange; + + if (alreadyPrepared(L, MemI, BasePtrStartSCEV, BasePtrIncSCEV)) + return MadeChange; + + LLVM_DEBUG(dbgs() << "PIP: New start is: " << *BasePtrStartSCEV << "\n"); + + BasicBlock *Header = L->getHeader(); + unsigned HeaderLoopPredCount = pred_size(Header); + BasicBlock *LoopPredecessor = L->getLoopPredecessor(); + + PHINode *NewPHI = + PHINode::Create(I8PtrTy, HeaderLoopPredCount, + getInstrName(MemI, PHINodeNameSuffix), + Header->getFirstNonPHI()); + + SCEVExpander SCEVE(*SE, Header->getModule()->getDataLayout(), "pistart"); + Value *BasePtrStart = SCEVE.expandCodeFor(BasePtrStartSCEV, I8PtrTy, + LoopPredecessor->getTerminator()); + + // Note that LoopPredecessor might occur in the predecessor list multiple + // times, and we need to add it the right number of times. 
+ for (const auto &PI : predecessors(Header)) { + if (PI != LoopPredecessor) + continue; + + NewPHI->addIncoming(BasePtrStart, LoopPredecessor); + } + + Instruction *InsPoint = &*Header->getFirstInsertionPt(); + GetElementPtrInst *PtrInc = GetElementPtrInst::Create( + I8Ty, NewPHI, BasePtrIncSCEV->getValue(), + getInstrName(MemI, GEPNodeIncNameSuffix), InsPoint); + PtrInc->setIsInBounds(IsPtrInBounds(BasePtr)); + for (const auto &PI : predecessors(Header)) { + if (PI == LoopPredecessor) + continue; + + NewPHI->addIncoming(PtrInc, PI); + } + + Instruction *NewBasePtr; + if (PtrInc->getType() != BasePtr->getType()) + NewBasePtr = new BitCastInst(PtrInc, BasePtr->getType(), + getInstrName(PtrInc, CastNodeNameSuffix), InsPoint); + else + NewBasePtr = PtrInc; + + if (Instruction *IDel = dyn_cast(BasePtr)) + BBChanged.insert(IDel->getParent()); + BasePtr->replaceAllUsesWith(NewBasePtr); + RecursivelyDeleteTriviallyDeadInstructions(BasePtr); + + // Keep track of the replacement pointer values we've inserted so that we + // don't generate more pointer values than necessary. + SmallPtrSet NewPtrs; + NewPtrs.insert(NewBasePtr); + + for (auto I = std::next(BucketChain.Elements.begin()), + IE = BucketChain.Elements.end(); I != IE; ++I) { + Value *Ptr = GetPointerOperand(I->Instr); + assert(Ptr && "No pointer operand"); + if (NewPtrs.count(Ptr)) + continue; + + Instruction *RealNewPtr; + if (!I->Offset || I->Offset->getValue()->isZero()) { + RealNewPtr = NewBasePtr; + } else { + Instruction *PtrIP = dyn_cast(Ptr); + if (PtrIP && isa(NewBasePtr) && + cast(NewBasePtr)->getParent() == PtrIP->getParent()) + PtrIP = nullptr; + else if (PtrIP && isa(PtrIP)) + PtrIP = &*PtrIP->getParent()->getFirstInsertionPt(); + else if (!PtrIP) + PtrIP = I->Instr; + + GetElementPtrInst *NewPtr = GetElementPtrInst::Create( + I8Ty, PtrInc, I->Offset->getValue(), + getInstrName(I->Instr, GEPNodeOffNameSuffix), PtrIP); + if (!PtrIP) + NewPtr->insertAfter(cast(PtrInc)); + NewPtr->setIsInBounds(IsPtrInBounds(Ptr)); + RealNewPtr = NewPtr; + } + + if (Instruction *IDel = dyn_cast(Ptr)) + BBChanged.insert(IDel->getParent()); + + Instruction *ReplNewPtr; + if (Ptr->getType() != RealNewPtr->getType()) { + ReplNewPtr = new BitCastInst(RealNewPtr, Ptr->getType(), + getInstrName(Ptr, CastNodeNameSuffix)); + ReplNewPtr->insertAfter(RealNewPtr); + } else + ReplNewPtr = RealNewPtr; + + Ptr->replaceAllUsesWith(ReplNewPtr); + RecursivelyDeleteTriviallyDeadInstructions(Ptr); + + NewPtrs.insert(RealNewPtr); + } + + MadeChange = true; + UpdFormChainRewritten++; + + return MadeChange; +} + +bool PPCLoopPreIncPrep::updateFormPrep(Loop *L, + SmallVector &Buckets) { + bool MadeChange = false; + if (Buckets.empty()) + return MadeChange; + SmallSet BBChanged; + for (auto &Bucket : Buckets) + // The base address of each bucket is transformed into a phi and the others + // are rewritten based on new base. + if (prepareBaseForUpdateFormChain(Bucket)) + MadeChange |= rewriteLoadStores(L, Bucket, BBChanged); + if (MadeChange) + for (auto &BB : L->blocks()) + if (BBChanged.count(BB)) + DeleteDeadPHIs(BB); + return MadeChange; +} + // In order to prepare for the pre-increment a PHI is added. // This function will check to see if that PHI already exists and will return -// true if it found an existing PHI with the same start and increment as the -// one we wanted to create. +// true if it found an existing PHI with the same start and increment as the +// one we wanted to create. 
bool PPCLoopPreIncPrep::alreadyPrepared(Loop *L, Instruction* MemI, const SCEV *BasePtrStartSCEV, const SCEVConstant *BasePtrIncSCEV) { @@ -216,10 +522,10 @@ bool PPCLoopPreIncPrep::alreadyPrepared(Loop *L, Instruction* MemI, continue; if (CurrentPHINode->getNumIncomingValues() == 2) { - if ( (CurrentPHINode->getIncomingBlock(0) == LatchBB && - CurrentPHINode->getIncomingBlock(1) == PredBB) || - (CurrentPHINode->getIncomingBlock(1) == LatchBB && - CurrentPHINode->getIncomingBlock(0) == PredBB) ) { + if ((CurrentPHINode->getIncomingBlock(0) == LatchBB && + CurrentPHINode->getIncomingBlock(1) == PredBB) || + (CurrentPHINode->getIncomingBlock(1) == LatchBB && + CurrentPHINode->getIncomingBlock(0) == PredBB)) { if (PHIBasePtrSCEV->getStart() == BasePtrStartSCEV && PHIBasePtrIncSCEV == BasePtrIncSCEV) { // The existing PHI (CurrentPHINode) has the same start and increment @@ -242,89 +548,6 @@ bool PPCLoopPreIncPrep::runOnLoop(Loop *L) { LLVM_DEBUG(dbgs() << "PIP: Examining: " << *L << "\n"); - BasicBlock *Header = L->getHeader(); - - const PPCSubtarget *ST = - TM ? TM->getSubtargetImpl(*Header->getParent()) : nullptr; - - unsigned HeaderLoopPredCount = pred_size(Header); - - // Collect buckets of comparable addresses used by loads and stores. - SmallVector Buckets; - for (Loop::block_iterator I = L->block_begin(), IE = L->block_end(); - I != IE; ++I) { - for (BasicBlock::iterator J = (*I)->begin(), JE = (*I)->end(); - J != JE; ++J) { - Value *PtrValue; - Instruction *MemI; - - if (LoadInst *LMemI = dyn_cast(J)) { - MemI = LMemI; - PtrValue = LMemI->getPointerOperand(); - } else if (StoreInst *SMemI = dyn_cast(J)) { - MemI = SMemI; - PtrValue = SMemI->getPointerOperand(); - } else if (IntrinsicInst *IMemI = dyn_cast(J)) { - if (IMemI->getIntrinsicID() == Intrinsic::prefetch) { - MemI = IMemI; - PtrValue = IMemI->getArgOperand(0); - } else continue; - } else continue; - - unsigned PtrAddrSpace = PtrValue->getType()->getPointerAddressSpace(); - if (PtrAddrSpace) - continue; - - // There are no update forms for Altivec vector load/stores. - if (ST && ST->hasAltivec() && - PtrValue->getType()->getPointerElementType()->isVectorTy()) - continue; - - if (L->isLoopInvariant(PtrValue)) - continue; - - const SCEV *LSCEV = SE->getSCEVAtScope(PtrValue, L); - if (const SCEVAddRecExpr *LARSCEV = dyn_cast(LSCEV)) { - if (LARSCEV->getLoop() != L) - continue; - // See getPreIndexedAddressParts, the displacement for LDU/STDU has to - // be 4's multiple (DS-form). For i64 loads/stores when the displacement - // fits in a 16-bit signed field but isn't a multiple of 4, it will be - // useless and possible to break some original well-form addressing mode - // to make this pre-inc prep for it. 
- if (PtrValue->getType()->getPointerElementType()->isIntegerTy(64)) { - if (const SCEVConstant *StepConst = - dyn_cast(LARSCEV->getStepRecurrence(*SE))) { - const APInt &ConstInt = StepConst->getValue()->getValue(); - if (ConstInt.isSignedIntN(16) && ConstInt.srem(4) != 0) - continue; - } - } - } else { - continue; - } - - bool FoundBucket = false; - for (auto &B : Buckets) { - const SCEV *Diff = SE->getMinusSCEV(LSCEV, B.BaseSCEV); - if (const auto *CDiff = dyn_cast(Diff)) { - B.Elements.push_back(BucketElement(CDiff, MemI)); - FoundBucket = true; - break; - } - } - - if (!FoundBucket) { - if (Buckets.size() == MaxVars) - return MadeChange; - Buckets.push_back(Bucket(LSCEV, MemI)); - } - } - } - - if (Buckets.empty()) - return MadeChange; - BasicBlock *LoopPredecessor = L->getLoopPredecessor(); // If there is no loop predecessor, or the loop predecessor's terminator // returns a value (which might contribute to determining the loop's @@ -335,191 +558,48 @@ bool PPCLoopPreIncPrep::runOnLoop(Loop *L) { if (LoopPredecessor) MadeChange = true; } - if (!LoopPredecessor) + if (!LoopPredecessor) { + LLVM_DEBUG(dbgs() << "PIP fails since no predecessor for current loop.\n"); return MadeChange; + } - LLVM_DEBUG(dbgs() << "PIP: Found " << Buckets.size() << " buckets\n"); - - SmallSet BBChanged; - for (unsigned i = 0, e = Buckets.size(); i != e; ++i) { - // The base address of each bucket is transformed into a phi and the others - // are rewritten as offsets of that variable. - - // We have a choice now of which instruction's memory operand we use as the - // base for the generated PHI. Always picking the first instruction in each - // bucket does not work well, specifically because that instruction might - // be a prefetch (and there are no pre-increment dcbt variants). Otherwise, - // the choice is somewhat arbitrary, because the backend will happily - // generate direct offsets from both the pre-incremented and - // post-incremented pointer values. Thus, we'll pick the first non-prefetch - // instruction in each bucket, and adjust the recurrence and other offsets - // accordingly. - for (int j = 0, je = Buckets[i].Elements.size(); j != je; ++j) { - if (auto *II = dyn_cast(Buckets[i].Elements[j].Instr)) - if (II->getIntrinsicID() == Intrinsic::prefetch) - continue; - - // If we'd otherwise pick the first element anyway, there's nothing to do. - if (j == 0) - break; - - // If our chosen element has no offset from the base pointer, there's - // nothing to do. - if (!Buckets[i].Elements[j].Offset || - Buckets[i].Elements[j].Offset->isZero()) - break; - - const SCEV *Offset = Buckets[i].Elements[j].Offset; - Buckets[i].BaseSCEV = SE->getAddExpr(Buckets[i].BaseSCEV, Offset); - for (auto &E : Buckets[i].Elements) { - if (E.Offset) - E.Offset = cast(SE->getMinusSCEV(E.Offset, Offset)); - else - E.Offset = cast(SE->getNegativeSCEV(Offset)); - } - - std::swap(Buckets[i].Elements[j], Buckets[i].Elements[0]); - break; - } - - const SCEVAddRecExpr *BasePtrSCEV = - cast(Buckets[i].BaseSCEV); - if (!BasePtrSCEV->isAffine()) - continue; - - LLVM_DEBUG(dbgs() << "PIP: Transforming: " << *BasePtrSCEV << "\n"); - assert(BasePtrSCEV->getLoop() == L && - "AddRec for the wrong loop?"); - - // The instruction corresponding to the Bucket's BaseSCEV must be the first - // in the vector of elements. 
- Instruction *MemI = Buckets[i].Elements.begin()->Instr; - Value *BasePtr = GetPointerOperand(MemI); - assert(BasePtr && "No pointer operand"); - - Type *I8Ty = Type::getInt8Ty(MemI->getParent()->getContext()); - Type *I8PtrTy = Type::getInt8PtrTy(MemI->getParent()->getContext(), - BasePtr->getType()->getPointerAddressSpace()); - - const SCEV *BasePtrStartSCEV = BasePtrSCEV->getStart(); - if (!SE->isLoopInvariant(BasePtrStartSCEV, L)) - continue; - - const SCEVConstant *BasePtrIncSCEV = - dyn_cast(BasePtrSCEV->getStepRecurrence(*SE)); - if (!BasePtrIncSCEV) - continue; - BasePtrStartSCEV = SE->getMinusSCEV(BasePtrStartSCEV, BasePtrIncSCEV); - if (!isSafeToExpand(BasePtrStartSCEV, *SE)) - continue; - - LLVM_DEBUG(dbgs() << "PIP: New start is: " << *BasePtrStartSCEV << "\n"); - - if (alreadyPrepared(L, MemI, BasePtrStartSCEV, BasePtrIncSCEV)) - continue; - - PHINode *NewPHI = PHINode::Create(I8PtrTy, HeaderLoopPredCount, - MemI->hasName() ? MemI->getName() + ".phi" : "", - Header->getFirstNonPHI()); - - SCEVExpander SCEVE(*SE, Header->getModule()->getDataLayout(), "pistart"); - Value *BasePtrStart = SCEVE.expandCodeFor(BasePtrStartSCEV, I8PtrTy, - LoopPredecessor->getTerminator()); - - // Note that LoopPredecessor might occur in the predecessor list multiple - // times, and we need to add it the right number of times. - for (pred_iterator PI = pred_begin(Header), PE = pred_end(Header); - PI != PE; ++PI) { - if (*PI != LoopPredecessor) - continue; - - NewPHI->addIncoming(BasePtrStart, LoopPredecessor); - } - - Instruction *InsPoint = &*Header->getFirstInsertionPt(); - GetElementPtrInst *PtrInc = GetElementPtrInst::Create( - I8Ty, NewPHI, BasePtrIncSCEV->getValue(), - MemI->hasName() ? MemI->getName() + ".inc" : "", InsPoint); - PtrInc->setIsInBounds(IsPtrInBounds(BasePtr)); - for (pred_iterator PI = pred_begin(Header), PE = pred_end(Header); - PI != PE; ++PI) { - if (*PI == LoopPredecessor) - continue; - - NewPHI->addIncoming(PtrInc, *PI); - } - - Instruction *NewBasePtr; - if (PtrInc->getType() != BasePtr->getType()) - NewBasePtr = new BitCastInst(PtrInc, BasePtr->getType(), - PtrInc->hasName() ? PtrInc->getName() + ".cast" : "", InsPoint); - else - NewBasePtr = PtrInc; - - if (Instruction *IDel = dyn_cast(BasePtr)) - BBChanged.insert(IDel->getParent()); - BasePtr->replaceAllUsesWith(NewBasePtr); - RecursivelyDeleteTriviallyDeadInstructions(BasePtr); - - // Keep track of the replacement pointer values we've inserted so that we - // don't generate more pointer values than necessary. - SmallPtrSet NewPtrs; - NewPtrs.insert( NewBasePtr); - - for (auto I = std::next(Buckets[i].Elements.begin()), - IE = Buckets[i].Elements.end(); I != IE; ++I) { - Value *Ptr = GetPointerOperand(I->Instr); - assert(Ptr && "No pointer operand"); - if (NewPtrs.count(Ptr)) - continue; - - Instruction *RealNewPtr; - if (!I->Offset || I->Offset->getValue()->isZero()) { - RealNewPtr = NewBasePtr; - } else { - Instruction *PtrIP = dyn_cast(Ptr); - if (PtrIP && isa(NewBasePtr) && - cast(NewBasePtr)->getParent() == PtrIP->getParent()) - PtrIP = nullptr; - else if (isa(PtrIP)) - PtrIP = &*PtrIP->getParent()->getFirstInsertionPt(); - else if (!PtrIP) - PtrIP = I->Instr; - - GetElementPtrInst *NewPtr = GetElementPtrInst::Create( - I8Ty, PtrInc, I->Offset->getValue(), - I->Instr->hasName() ? I->Instr->getName() + ".off" : "", PtrIP); - if (!PtrIP) - NewPtr->insertAfter(cast(PtrInc)); - NewPtr->setIsInBounds(IsPtrInBounds(Ptr)); - RealNewPtr = NewPtr; + // Check if a load/store has update form. 
This lambda is used by function + // collectCandidates which can collect candidates for types defined by lambda. + auto isUpdateFormCandidate = [&] (const Instruction *I, + const Value *PtrValue) { + assert((PtrValue && I) && "Invalid parameter!"); + // There are no update forms for Altivec vector load/stores. + if (ST && ST->hasAltivec() && + PtrValue->getType()->getPointerElementType()->isVectorTy()) + return false; + // See getPreIndexedAddressParts, the displacement for LDU/STDU has to + // be 4's multiple (DS-form). For i64 loads/stores when the displacement + // fits in a 16-bit signed field but isn't a multiple of 4, it will be + // useless and possible to break some original well-form addressing mode + // to make this pre-inc prep for it. + if (PtrValue->getType()->getPointerElementType()->isIntegerTy(64)) { + const SCEV *LSCEV = SE->getSCEVAtScope(const_cast(PtrValue), L); + const SCEVAddRecExpr *LARSCEV = dyn_cast(LSCEV); + if (!LARSCEV || LARSCEV->getLoop() != L) + return false; + if (const SCEVConstant *StepConst = + dyn_cast(LARSCEV->getStepRecurrence(*SE))) { + const APInt &ConstInt = StepConst->getValue()->getValue(); + if (ConstInt.isSignedIntN(16) && ConstInt.srem(4) != 0) + return false; } - - if (Instruction *IDel = dyn_cast(Ptr)) - BBChanged.insert(IDel->getParent()); - - Instruction *ReplNewPtr; - if (Ptr->getType() != RealNewPtr->getType()) { - ReplNewPtr = new BitCastInst(RealNewPtr, Ptr->getType(), - Ptr->hasName() ? Ptr->getName() + ".cast" : ""); - ReplNewPtr->insertAfter(RealNewPtr); - } else - ReplNewPtr = RealNewPtr; - - Ptr->replaceAllUsesWith(ReplNewPtr); - RecursivelyDeleteTriviallyDeadInstructions(Ptr); - - NewPtrs.insert(RealNewPtr); } + return true; + }; - MadeChange = true; - } + // Collect buckets of comparable addresses used by loads, stores and prefetch + // intrinsic for update form. + SmallVector UpdateFormBuckets = + collectCandidates(L, isUpdateFormCandidate, MaxVars); - for (Loop::block_iterator I = L->block_begin(), IE = L->block_end(); - I != IE; ++I) { - if (BBChanged.count(*I)) - DeleteDeadPHIs(*I); - } + // Prepare for update form. + if (!UpdateFormBuckets.empty()) + MadeChange |= updateFormPrep(L, UpdateFormBuckets); return MadeChange; } diff --git a/lib/Target/PowerPC/PPCMCInstLower.cpp b/lib/Target/PowerPC/PPCMCInstLower.cpp index 027e6bd1ba06..b6496f189a3a 100644 --- a/lib/Target/PowerPC/PPCMCInstLower.cpp +++ b/lib/Target/PowerPC/PPCMCInstLower.cpp @@ -79,7 +79,7 @@ static MCSymbol *GetSymbolFromOperand(const MachineOperand &MO, } static MCOperand GetSymbolRef(const MachineOperand &MO, const MCSymbol *Symbol, - AsmPrinter &Printer, bool isDarwin) { + AsmPrinter &Printer, bool IsDarwin) { MCContext &Ctx = Printer.OutContext; MCSymbolRefExpr::VariantKind RefKind = MCSymbolRefExpr::VK_None; @@ -137,10 +137,10 @@ static MCOperand GetSymbolRef(const MachineOperand &MO, const MCSymbol *Symbol, // Add ha16() / lo16() markers if required. 
switch (access) { case PPCII::MO_LO: - Expr = PPCMCExpr::createLo(Expr, isDarwin, Ctx); + Expr = PPCMCExpr::createLo(Expr, IsDarwin, Ctx); break; case PPCII::MO_HA: - Expr = PPCMCExpr::createHa(Expr, isDarwin, Ctx); + Expr = PPCMCExpr::createHa(Expr, IsDarwin, Ctx); break; } @@ -148,20 +148,20 @@ static MCOperand GetSymbolRef(const MachineOperand &MO, const MCSymbol *Symbol, } void llvm::LowerPPCMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI, - AsmPrinter &AP, bool isDarwin) { + AsmPrinter &AP, bool IsDarwin) { OutMI.setOpcode(MI->getOpcode()); for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { MCOperand MCOp; if (LowerPPCMachineOperandToMCOperand(MI->getOperand(i), MCOp, AP, - isDarwin)) + IsDarwin)) OutMI.addOperand(MCOp); } } bool llvm::LowerPPCMachineOperandToMCOperand(const MachineOperand &MO, MCOperand &OutMO, AsmPrinter &AP, - bool isDarwin) { + bool IsDarwin) { switch (MO.getType()) { default: llvm_unreachable("unknown operand type"); @@ -181,17 +181,20 @@ bool llvm::LowerPPCMachineOperandToMCOperand(const MachineOperand &MO, return true; case MachineOperand::MO_GlobalAddress: case MachineOperand::MO_ExternalSymbol: - OutMO = GetSymbolRef(MO, GetSymbolFromOperand(MO, AP), AP, isDarwin); + OutMO = GetSymbolRef(MO, GetSymbolFromOperand(MO, AP), AP, IsDarwin); return true; case MachineOperand::MO_JumpTableIndex: - OutMO = GetSymbolRef(MO, AP.GetJTISymbol(MO.getIndex()), AP, isDarwin); + OutMO = GetSymbolRef(MO, AP.GetJTISymbol(MO.getIndex()), AP, IsDarwin); return true; case MachineOperand::MO_ConstantPoolIndex: - OutMO = GetSymbolRef(MO, AP.GetCPISymbol(MO.getIndex()), AP, isDarwin); + OutMO = GetSymbolRef(MO, AP.GetCPISymbol(MO.getIndex()), AP, IsDarwin); return true; case MachineOperand::MO_BlockAddress: OutMO = GetSymbolRef(MO, AP.GetBlockAddressSymbol(MO.getBlockAddress()), AP, - isDarwin); + IsDarwin); + return true; + case MachineOperand::MO_MCSymbol: + OutMO = GetSymbolRef(MO, MO.getMCSymbol(), AP, IsDarwin); return true; case MachineOperand::MO_RegisterMask: return false; diff --git a/lib/Target/PowerPC/PPCMIPeephole.cpp b/lib/Target/PowerPC/PPCMIPeephole.cpp index 446246358e96..ac8ac060f460 100644 --- a/lib/Target/PowerPC/PPCMIPeephole.cpp +++ b/lib/Target/PowerPC/PPCMIPeephole.cpp @@ -148,8 +148,8 @@ static MachineInstr *getVRegDefOrNull(MachineOperand *Op, if (!Op->isReg()) return nullptr; - unsigned Reg = Op->getReg(); - if (!TargetRegisterInfo::isVirtualRegister(Reg)) + Register Reg = Op->getReg(); + if (!Register::isVirtualRegister(Reg)) return nullptr; return MRI->getVRegDef(Reg); @@ -344,8 +344,7 @@ bool PPCMIPeephole::simplifyCode(void) { unsigned TrueReg2 = TRI->lookThruCopyLike(MI.getOperand(2).getReg(), MRI); - if (TrueReg1 == TrueReg2 - && TargetRegisterInfo::isVirtualRegister(TrueReg1)) { + if (TrueReg1 == TrueReg2 && Register::isVirtualRegister(TrueReg1)) { MachineInstr *DefMI = MRI->getVRegDef(TrueReg1); unsigned DefOpc = DefMI ? DefMI->getOpcode() : 0; @@ -358,7 +357,7 @@ bool PPCMIPeephole::simplifyCode(void) { return false; unsigned DefReg = TRI->lookThruCopyLike(DefMI->getOperand(1).getReg(), MRI); - if (TargetRegisterInfo::isVirtualRegister(DefReg)) { + if (Register::isVirtualRegister(DefReg)) { MachineInstr *LoadMI = MRI->getVRegDef(DefReg); if (LoadMI && LoadMI->getOpcode() == PPC::LXVDSX) return true; @@ -444,7 +443,7 @@ bool PPCMIPeephole::simplifyCode(void) { unsigned OpNo = MyOpcode == PPC::XXSPLTW ? 
1 : 2; unsigned TrueReg = TRI->lookThruCopyLike(MI.getOperand(OpNo).getReg(), MRI); - if (!TargetRegisterInfo::isVirtualRegister(TrueReg)) + if (!Register::isVirtualRegister(TrueReg)) break; MachineInstr *DefMI = MRI->getVRegDef(TrueReg); if (!DefMI) @@ -453,8 +452,8 @@ bool PPCMIPeephole::simplifyCode(void) { auto isConvertOfSplat = [=]() -> bool { if (DefOpcode != PPC::XVCVSPSXWS && DefOpcode != PPC::XVCVSPUXWS) return false; - unsigned ConvReg = DefMI->getOperand(1).getReg(); - if (!TargetRegisterInfo::isVirtualRegister(ConvReg)) + Register ConvReg = DefMI->getOperand(1).getReg(); + if (!Register::isVirtualRegister(ConvReg)) return false; MachineInstr *Splt = MRI->getVRegDef(ConvReg); return Splt && (Splt->getOpcode() == PPC::LXVWSX || @@ -481,9 +480,9 @@ bool PPCMIPeephole::simplifyCode(void) { // Splat fed by a shift. Usually when we align value to splat into // vector element zero. if (DefOpcode == PPC::XXSLDWI) { - unsigned ShiftRes = DefMI->getOperand(0).getReg(); - unsigned ShiftOp1 = DefMI->getOperand(1).getReg(); - unsigned ShiftOp2 = DefMI->getOperand(2).getReg(); + Register ShiftRes = DefMI->getOperand(0).getReg(); + Register ShiftOp1 = DefMI->getOperand(1).getReg(); + Register ShiftOp2 = DefMI->getOperand(2).getReg(); unsigned ShiftImm = DefMI->getOperand(3).getImm(); unsigned SplatImm = MI.getOperand(2).getImm(); if (ShiftOp1 == ShiftOp2) { @@ -507,7 +506,7 @@ bool PPCMIPeephole::simplifyCode(void) { // If this is a DP->SP conversion fed by an FRSP, the FRSP is redundant. unsigned TrueReg = TRI->lookThruCopyLike(MI.getOperand(1).getReg(), MRI); - if (!TargetRegisterInfo::isVirtualRegister(TrueReg)) + if (!Register::isVirtualRegister(TrueReg)) break; MachineInstr *DefMI = MRI->getVRegDef(TrueReg); @@ -518,8 +517,8 @@ bool PPCMIPeephole::simplifyCode(void) { TRI->lookThruCopyLike(DefMI->getOperand(1).getReg(), MRI); unsigned DefsReg2 = TRI->lookThruCopyLike(DefMI->getOperand(2).getReg(), MRI); - if (!TargetRegisterInfo::isVirtualRegister(DefsReg1) || - !TargetRegisterInfo::isVirtualRegister(DefsReg2)) + if (!Register::isVirtualRegister(DefsReg1) || + !Register::isVirtualRegister(DefsReg2)) break; MachineInstr *P1 = MRI->getVRegDef(DefsReg1); MachineInstr *P2 = MRI->getVRegDef(DefsReg2); @@ -533,8 +532,8 @@ bool PPCMIPeephole::simplifyCode(void) { if (RoundInstr->getOpcode() == PPC::FRSP && MRI->hasOneNonDBGUse(RoundInstr->getOperand(0).getReg())) { Simplified = true; - unsigned ConvReg1 = RoundInstr->getOperand(1).getReg(); - unsigned FRSPDefines = RoundInstr->getOperand(0).getReg(); + Register ConvReg1 = RoundInstr->getOperand(1).getReg(); + Register FRSPDefines = RoundInstr->getOperand(0).getReg(); MachineInstr &Use = *(MRI->use_instr_begin(FRSPDefines)); for (int i = 0, e = Use.getNumOperands(); i < e; ++i) if (Use.getOperand(i).isReg() && @@ -566,8 +565,8 @@ bool PPCMIPeephole::simplifyCode(void) { case PPC::EXTSH8: case PPC::EXTSH8_32_64: { if (!EnableSExtElimination) break; - unsigned NarrowReg = MI.getOperand(1).getReg(); - if (!TargetRegisterInfo::isVirtualRegister(NarrowReg)) + Register NarrowReg = MI.getOperand(1).getReg(); + if (!Register::isVirtualRegister(NarrowReg)) break; MachineInstr *SrcMI = MRI->getVRegDef(NarrowReg); @@ -610,8 +609,8 @@ bool PPCMIPeephole::simplifyCode(void) { case PPC::EXTSW_32: case PPC::EXTSW_32_64: { if (!EnableSExtElimination) break; - unsigned NarrowReg = MI.getOperand(1).getReg(); - if (!TargetRegisterInfo::isVirtualRegister(NarrowReg)) + Register NarrowReg = MI.getOperand(1).getReg(); + if (!Register::isVirtualRegister(NarrowReg)) 
break; MachineInstr *SrcMI = MRI->getVRegDef(NarrowReg); @@ -652,8 +651,8 @@ bool PPCMIPeephole::simplifyCode(void) { // We can eliminate EXTSW if the input is known to be already // sign-extended. LLVM_DEBUG(dbgs() << "Removing redundant sign-extension\n"); - unsigned TmpReg = - MF->getRegInfo().createVirtualRegister(&PPC::G8RCRegClass); + Register TmpReg = + MF->getRegInfo().createVirtualRegister(&PPC::G8RCRegClass); BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(PPC::IMPLICIT_DEF), TmpReg); BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(PPC::INSERT_SUBREG), @@ -679,8 +678,8 @@ bool PPCMIPeephole::simplifyCode(void) { if (MI.getOperand(2).getImm() != 0) break; - unsigned SrcReg = MI.getOperand(1).getReg(); - if (!TargetRegisterInfo::isVirtualRegister(SrcReg)) + Register SrcReg = MI.getOperand(1).getReg(); + if (!Register::isVirtualRegister(SrcReg)) break; MachineInstr *SrcMI = MRI->getVRegDef(SrcReg); @@ -695,8 +694,8 @@ bool PPCMIPeephole::simplifyCode(void) { SrcMI = SubRegMI; if (SubRegMI->getOpcode() == PPC::COPY) { - unsigned CopyReg = SubRegMI->getOperand(1).getReg(); - if (TargetRegisterInfo::isVirtualRegister(CopyReg)) + Register CopyReg = SubRegMI->getOperand(1).getReg(); + if (Register::isVirtualRegister(CopyReg)) SrcMI = MRI->getVRegDef(CopyReg); } @@ -757,7 +756,7 @@ bool PPCMIPeephole::simplifyCode(void) { break; // We don't have an ADD fed by LI's that can be transformed // Now we know that Op1 is the PHI node and Op2 is the dominator - unsigned DominatorReg = Op2.getReg(); + Register DominatorReg = Op2.getReg(); const TargetRegisterClass *TRC = MI.getOpcode() == PPC::ADD8 ? &PPC::G8RC_and_G8RC_NOX0RegClass @@ -927,7 +926,7 @@ static unsigned getSrcVReg(unsigned Reg, MachineBasicBlock *BB1, } else if (Inst->isFullCopy()) NextReg = Inst->getOperand(1).getReg(); - if (NextReg == SrcReg || !TargetRegisterInfo::isVirtualRegister(NextReg)) + if (NextReg == SrcReg || !Register::isVirtualRegister(NextReg)) break; SrcReg = NextReg; } @@ -949,9 +948,8 @@ static bool eligibleForCompareElimination(MachineBasicBlock &MBB, (*BII).getOpcode() == PPC::BCC && (*BII).getOperand(1).isReg()) { // We optimize only if the condition code is used only by one BCC. - unsigned CndReg = (*BII).getOperand(1).getReg(); - if (!TargetRegisterInfo::isVirtualRegister(CndReg) || - !MRI->hasOneNonDBGUse(CndReg)) + Register CndReg = (*BII).getOperand(1).getReg(); + if (!Register::isVirtualRegister(CndReg) || !MRI->hasOneNonDBGUse(CndReg)) return false; MachineInstr *CMPI = MRI->getVRegDef(CndReg); @@ -961,7 +959,7 @@ static bool eligibleForCompareElimination(MachineBasicBlock &MBB, // We skip this BB if a physical register is used in comparison. for (MachineOperand &MO : CMPI->operands()) - if (MO.isReg() && !TargetRegisterInfo::isVirtualRegister(MO.getReg())) + if (MO.isReg() && !Register::isVirtualRegister(MO.getReg())) return false; return true; @@ -1271,8 +1269,8 @@ bool PPCMIPeephole::eliminateRedundantCompare(void) { // We touch up the compare instruction in MBB2 and move it to // a previous BB to handle partially redundant case. 
if (SwapOperands) { - unsigned Op1 = CMPI2->getOperand(1).getReg(); - unsigned Op2 = CMPI2->getOperand(2).getReg(); + Register Op1 = CMPI2->getOperand(1).getReg(); + Register Op2 = CMPI2->getOperand(2).getReg(); CMPI2->getOperand(1).setReg(Op2); CMPI2->getOperand(2).setReg(Op1); } @@ -1295,7 +1293,7 @@ bool PPCMIPeephole::eliminateRedundantCompare(void) { MBBtoMoveCmp->splice(I, &MBB2, MachineBasicBlock::iterator(CMPI2)); DebugLoc DL = CMPI2->getDebugLoc(); - unsigned NewVReg = MRI->createVirtualRegister(&PPC::CRRCRegClass); + Register NewVReg = MRI->createVirtualRegister(&PPC::CRRCRegClass); BuildMI(MBB2, MBB2.begin(), DL, TII->get(PPC::PHI), NewVReg) .addReg(BI1->getOperand(1).getReg()).addMBB(MBB1) @@ -1334,8 +1332,8 @@ bool PPCMIPeephole::emitRLDICWhenLoweringJumpTables(MachineInstr &MI) { if (MI.getOpcode() != PPC::RLDICR) return false; - unsigned SrcReg = MI.getOperand(1).getReg(); - if (!TargetRegisterInfo::isVirtualRegister(SrcReg)) + Register SrcReg = MI.getOperand(1).getReg(); + if (!Register::isVirtualRegister(SrcReg)) return false; MachineInstr *SrcMI = MRI->getVRegDef(SrcReg); @@ -1414,8 +1412,8 @@ bool PPCMIPeephole::combineSEXTAndSHL(MachineInstr &MI, if (SHMI + MEMI != 63) return false; - unsigned SrcReg = MI.getOperand(1).getReg(); - if (!TargetRegisterInfo::isVirtualRegister(SrcReg)) + Register SrcReg = MI.getOperand(1).getReg(); + if (!Register::isVirtualRegister(SrcReg)) return false; MachineInstr *SrcMI = MRI->getVRegDef(SrcReg); @@ -1428,6 +1426,12 @@ bool PPCMIPeephole::combineSEXTAndSHL(MachineInstr &MI, if (!MRI->hasOneNonDBGUse(SrcReg)) return false; + assert(SrcMI->getNumOperands() == 2 && "EXTSW should have 2 operands"); + assert(SrcMI->getOperand(1).isReg() && + "EXTSW's second operand should be a register"); + if (!Register::isVirtualRegister(SrcMI->getOperand(1).getReg())) + return false; + LLVM_DEBUG(dbgs() << "Combining pair: "); LLVM_DEBUG(SrcMI->dump()); LLVM_DEBUG(MI.dump()); diff --git a/lib/Target/PowerPC/PPCPreEmitPeephole.cpp b/lib/Target/PowerPC/PPCPreEmitPeephole.cpp index d83c92276800..b1c0433641dd 100644 --- a/lib/Target/PowerPC/PPCPreEmitPeephole.cpp +++ b/lib/Target/PowerPC/PPCPreEmitPeephole.cpp @@ -57,6 +57,109 @@ namespace { MachineFunctionProperties::Property::NoVRegs); } + // This function removes any redundant load immediates. It has two level + // loops - The outer loop finds the load immediates BBI that could be used + // to replace following redundancy. The inner loop scans instructions that + // after BBI to find redundancy and update kill/dead flags accordingly. If + // AfterBBI is the same as BBI, it is redundant, otherwise any instructions + // that modify the def register of BBI would break the scanning. + // DeadOrKillToUnset is a pointer to the previous operand that had the + // kill/dead flag set. It keeps track of the def register of BBI, the use + // registers of AfterBBIs and the def registers of AfterBBIs. + bool removeRedundantLIs(MachineBasicBlock &MBB, + const TargetRegisterInfo *TRI) { + LLVM_DEBUG(dbgs() << "Remove redundant load immediates from MBB:\n"; + MBB.dump(); dbgs() << "\n"); + + DenseSet InstrsToErase; + for (auto BBI = MBB.instr_begin(); BBI != MBB.instr_end(); ++BBI) { + // Skip load immediate that is marked to be erased later because it + // cannot be used to replace any other instructions. + if (InstrsToErase.find(&*BBI) != InstrsToErase.end()) + continue; + // Skip non-load immediate. 
+ unsigned Opc = BBI->getOpcode(); + if (Opc != PPC::LI && Opc != PPC::LI8 && Opc != PPC::LIS && + Opc != PPC::LIS8) + continue; + // Skip load immediate, where the operand is a relocation (e.g., $r3 = + // LI target-flags(ppc-lo) %const.0). + if (!BBI->getOperand(1).isImm()) + continue; + assert(BBI->getOperand(0).isReg() && + "Expected a register for the first operand"); + + LLVM_DEBUG(dbgs() << "Scanning after load immediate: "; BBI->dump();); + + Register Reg = BBI->getOperand(0).getReg(); + int64_t Imm = BBI->getOperand(1).getImm(); + MachineOperand *DeadOrKillToUnset = nullptr; + if (BBI->getOperand(0).isDead()) { + DeadOrKillToUnset = &BBI->getOperand(0); + LLVM_DEBUG(dbgs() << " Kill flag of " << *DeadOrKillToUnset + << " from load immediate " << *BBI + << " is a unsetting candidate\n"); + } + // This loop scans instructions after BBI to see if there is any + // redundant load immediate. + for (auto AfterBBI = std::next(BBI); AfterBBI != MBB.instr_end(); + ++AfterBBI) { + // Track the operand that kill Reg. We would unset the kill flag of + // the operand if there is a following redundant load immediate. + int KillIdx = AfterBBI->findRegisterUseOperandIdx(Reg, true, TRI); + if (KillIdx != -1) { + assert(!DeadOrKillToUnset && "Shouldn't kill same register twice"); + DeadOrKillToUnset = &AfterBBI->getOperand(KillIdx); + LLVM_DEBUG(dbgs() + << " Kill flag of " << *DeadOrKillToUnset << " from " + << *AfterBBI << " is a unsetting candidate\n"); + } + + if (!AfterBBI->modifiesRegister(Reg, TRI)) + continue; + // Finish scanning because Reg is overwritten by a non-load + // instruction. + if (AfterBBI->getOpcode() != Opc) + break; + assert(AfterBBI->getOperand(0).isReg() && + "Expected a register for the first operand"); + // Finish scanning because Reg is overwritten by a relocation or a + // different value. + if (!AfterBBI->getOperand(1).isImm() || + AfterBBI->getOperand(1).getImm() != Imm) + break; + + // It loads same immediate value to the same Reg, which is redundant. + // We would unset kill flag in previous Reg usage to extend live range + // of Reg first, then remove the redundancy. + if (DeadOrKillToUnset) { + LLVM_DEBUG(dbgs() + << " Unset dead/kill flag of " << *DeadOrKillToUnset + << " from " << *DeadOrKillToUnset->getParent()); + if (DeadOrKillToUnset->isDef()) + DeadOrKillToUnset->setIsDead(false); + else + DeadOrKillToUnset->setIsKill(false); + } + DeadOrKillToUnset = + AfterBBI->findRegisterDefOperand(Reg, true, true, TRI); + if (DeadOrKillToUnset) + LLVM_DEBUG(dbgs() + << " Dead flag of " << *DeadOrKillToUnset << " from " + << *AfterBBI << " is a unsetting candidate\n"); + InstrsToErase.insert(&*AfterBBI); + LLVM_DEBUG(dbgs() << " Remove redundant load immediate: "; + AfterBBI->dump()); + } + } + + for (MachineInstr *MI : InstrsToErase) { + MI->eraseFromParent(); + } + NumRemovedInPreEmit += InstrsToErase.size(); + return !InstrsToErase.empty(); + } + bool runOnMachineFunction(MachineFunction &MF) override { if (skipFunction(MF.getFunction()) || !RunPreEmitPeephole) return false; @@ -65,6 +168,7 @@ namespace { const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); SmallVector InstrsToErase; for (MachineBasicBlock &MBB : MF) { + Changed |= removeRedundantLIs(MBB, TRI); for (MachineInstr &MI : MBB) { unsigned Opc = MI.getOpcode(); // Detect self copies - these can result from running AADB. 
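The removeRedundantLIs helper added above walks each block and, for every load-immediate (LI/LI8/LIS/LIS8), scans the instructions that follow it, deleting any later instruction that reloads the same immediate into the same register before that register is redefined, while clearing kill/dead flags so the surviving value's live range can be extended. A rough standalone sketch of that scan, using a made-up SimpleInst struct rather than the real MachineInstr/MachineBasicBlock API and leaving out the flag bookkeeping, could look like this:

#include <cstdint>
#include <string>
#include <vector>

struct SimpleInst {
  std::string Opc;  // e.g. "LI", "LI8", "ADD", ...
  unsigned DefReg;  // register defined by this instruction
  int64_t Imm;      // immediate, meaningful only for load-immediates
  bool modifies(unsigned Reg) const { return DefReg == Reg; }
};

// Erase any later "LI Reg, Imm" that reloads the same immediate into the same
// register before Reg is overwritten by anything else.
static bool removeRedundantLIs(std::vector<SimpleInst> &Block) {
  bool Changed = false;
  for (size_t I = 0; I < Block.size(); ++I) {
    if (Block[I].Opc != "LI" && Block[I].Opc != "LI8")
      continue;
    unsigned Reg = Block[I].DefReg;
    int64_t Imm = Block[I].Imm;
    for (size_t J = I + 1; J < Block.size();) {
      if (!Block[J].modifies(Reg)) {
        ++J;                       // Reg untouched, keep scanning.
        continue;
      }
      if (Block[J].Opc == Block[I].Opc && Block[J].Imm == Imm) {
        Block.erase(Block.begin() + J);  // same reg, same imm: redundant.
        Changed = true;
        continue;
      }
      break;                       // Reg overwritten with something else.
    }
  }
  return Changed;
}

int main() {
  std::vector<SimpleInst> BB = {
      {"LI", 3, 42}, {"ADD", 5, 0}, {"LI", 3, 42}, {"LI", 3, 7}};
  removeRedundantLIs(BB);          // erases only the second "LI r3, 42"
  return BB.size() == 3 ? 0 : 1;
}

The real pass instead records candidates in InstrsToErase and deletes them after the scan, since erasing while iterating a MachineBasicBlock's instruction list would invalidate the iterators; the index-based sketch sidesteps that detail.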
@@ -111,7 +215,7 @@ namespace { if (Br->getOpcode() != PPC::BC && Br->getOpcode() != PPC::BCn) continue; MachineInstr *CRSetMI = nullptr; - unsigned CRBit = Br->getOperand(0).getReg(); + Register CRBit = Br->getOperand(0).getReg(); unsigned CRReg = getCRFromCRBit(CRBit); bool SeenUse = false; MachineBasicBlock::reverse_iterator It = Br, Er = MBB.rend(); diff --git a/lib/Target/PowerPC/PPCQPXLoadSplat.cpp b/lib/Target/PowerPC/PPCQPXLoadSplat.cpp index 3a83cc27439c..6e9042643820 100644 --- a/lib/Target/PowerPC/PPCQPXLoadSplat.cpp +++ b/lib/Target/PowerPC/PPCQPXLoadSplat.cpp @@ -79,8 +79,8 @@ bool PPCQPXLoadSplat::runOnMachineFunction(MachineFunction &MF) { for (auto SI = Splats.begin(); SI != Splats.end();) { MachineInstr *SMI = *SI; - unsigned SplatReg = SMI->getOperand(0).getReg(); - unsigned SrcReg = SMI->getOperand(1).getReg(); + Register SplatReg = SMI->getOperand(0).getReg(); + Register SrcReg = SMI->getOperand(1).getReg(); if (MI->modifiesRegister(SrcReg, TRI)) { switch (MI->getOpcode()) { @@ -102,7 +102,7 @@ bool PPCQPXLoadSplat::runOnMachineFunction(MachineFunction &MF) { // the QPX splat source register. unsigned SubRegIndex = TRI->getSubRegIndex(SrcReg, MI->getOperand(0).getReg()); - unsigned SplatSubReg = TRI->getSubReg(SplatReg, SubRegIndex); + Register SplatSubReg = TRI->getSubReg(SplatReg, SubRegIndex); // Substitute both the explicit defined register, and also the // implicit def of the containing QPX register. diff --git a/lib/Target/PowerPC/PPCReduceCRLogicals.cpp b/lib/Target/PowerPC/PPCReduceCRLogicals.cpp index 8eaa6dfe2bf7..3b71ed219c17 100644 --- a/lib/Target/PowerPC/PPCReduceCRLogicals.cpp +++ b/lib/Target/PowerPC/PPCReduceCRLogicals.cpp @@ -381,10 +381,10 @@ private: const MachineBranchProbabilityInfo *MBPI; // A vector to contain all the CR logical operations - std::vector AllCRLogicalOps; + SmallVector AllCRLogicalOps; void initialize(MachineFunction &MFParm); void collectCRLogicals(); - bool handleCROp(CRLogicalOpInfo &CRI); + bool handleCROp(unsigned Idx); bool splitBlockOnBinaryCROp(CRLogicalOpInfo &CRI); static bool isCRLogical(MachineInstr &MI) { unsigned Opc = MI.getOpcode(); @@ -398,7 +398,7 @@ private: // Not using a range-based for loop here as the vector may grow while being // operated on. for (unsigned i = 0; i < AllCRLogicalOps.size(); i++) - Changed |= handleCROp(AllCRLogicalOps[i]); + Changed |= handleCROp(i); return Changed; } @@ -535,15 +535,15 @@ MachineInstr *PPCReduceCRLogicals::lookThroughCRCopy(unsigned Reg, unsigned &Subreg, MachineInstr *&CpDef) { Subreg = -1; - if (!TargetRegisterInfo::isVirtualRegister(Reg)) + if (!Register::isVirtualRegister(Reg)) return nullptr; MachineInstr *Copy = MRI->getVRegDef(Reg); CpDef = Copy; if (!Copy->isCopy()) return Copy; - unsigned CopySrc = Copy->getOperand(1).getReg(); + Register CopySrc = Copy->getOperand(1).getReg(); Subreg = Copy->getOperand(1).getSubReg(); - if (!TargetRegisterInfo::isVirtualRegister(CopySrc)) { + if (!Register::isVirtualRegister(CopySrc)) { const TargetRegisterInfo *TRI = &TII->getRegisterInfo(); // Set the Subreg if (CopySrc == PPC::CR0EQ || CopySrc == PPC::CR6EQ) @@ -578,10 +578,11 @@ void PPCReduceCRLogicals::initialize(MachineFunction &MFParam) { /// a unary CR logical might be used to change the condition code on a /// comparison feeding it. A nullary CR logical might simply be removable /// if the user of the bit it [un]sets can be transformed. 
-bool PPCReduceCRLogicals::handleCROp(CRLogicalOpInfo &CRI) { +bool PPCReduceCRLogicals::handleCROp(unsigned Idx) { // We can definitely split a block on the inputs to a binary CR operation // whose defs and (single) use are within the same block. bool Changed = false; + CRLogicalOpInfo CRI = AllCRLogicalOps[Idx]; if (CRI.IsBinary && CRI.ContainedInBlock && CRI.SingleUse && CRI.FeedsBR && CRI.DefsSingleUse) { Changed = splitBlockOnBinaryCROp(CRI); diff --git a/lib/Target/PowerPC/PPCRegisterInfo.cpp b/lib/Target/PowerPC/PPCRegisterInfo.cpp index 12554ea8d079..9ec26a19bdaa 100644 --- a/lib/Target/PowerPC/PPCRegisterInfo.cpp +++ b/lib/Target/PowerPC/PPCRegisterInfo.cpp @@ -325,13 +325,13 @@ BitVector PPCRegisterInfo::getReservedRegs(const MachineFunction &MF) const { bool IsPositionIndependent = TM.isPositionIndependent(); if (hasBasePointer(MF)) { - if (Subtarget.isSVR4ABI() && !TM.isPPC64() && IsPositionIndependent) + if (Subtarget.is32BitELFABI() && IsPositionIndependent) markSuperRegs(Reserved, PPC::R29); else markSuperRegs(Reserved, PPC::R30); } - if (Subtarget.isSVR4ABI() && !TM.isPPC64() && IsPositionIndependent) + if (Subtarget.is32BitELFABI() && IsPositionIndependent) markSuperRegs(Reserved, PPC::R30); // Reserve Altivec registers when Altivec is unavailable. @@ -391,7 +391,7 @@ bool PPCRegisterInfo::requiresFrameIndexScavenging(const MachineFunction &MF) co bool PPCRegisterInfo::isCallerPreservedPhysReg(unsigned PhysReg, const MachineFunction &MF) const { - assert(TargetRegisterInfo::isPhysicalRegister(PhysReg)); + assert(Register::isPhysicalRegister(PhysReg)); const PPCSubtarget &Subtarget = MF.getSubtarget(); const MachineFrameInfo &MFI = MF.getFrameInfo(); if (!TM.isPPC64()) @@ -425,7 +425,6 @@ unsigned PPCRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC, case PPC::G8RC_NOX0RegClassID: case PPC::GPRC_NOR0RegClassID: case PPC::SPERCRegClassID: - case PPC::SPE4RCRegClassID: case PPC::G8RCRegClassID: case PPC::GPRCRegClassID: { unsigned FP = TFI->hasFP(MF) ? 1 : 0; @@ -527,7 +526,7 @@ void PPCRegisterInfo::lowerDynamicAlloc(MachineBasicBlock::iterator II) const { // Fortunately, a frame greater than 32K is rare. const TargetRegisterClass *G8RC = &PPC::G8RCRegClass; const TargetRegisterClass *GPRC = &PPC::GPRCRegClass; - unsigned Reg = MF.getRegInfo().createVirtualRegister(LP64 ? G8RC : GPRC); + Register Reg = MF.getRegInfo().createVirtualRegister(LP64 ? G8RC : GPRC); if (MaxAlign < TargetAlign && isInt<16>(FrameSize)) { if (LP64) @@ -549,7 +548,7 @@ void PPCRegisterInfo::lowerDynamicAlloc(MachineBasicBlock::iterator II) const { } bool KillNegSizeReg = MI.getOperand(1).isKill(); - unsigned NegSizeReg = MI.getOperand(1).getReg(); + Register NegSizeReg = MI.getOperand(1).getReg(); // Grow the stack and update the stack pointer link, then determine the // address of new allocated space. @@ -655,8 +654,8 @@ void PPCRegisterInfo::lowerCRSpilling(MachineBasicBlock::iterator II, const TargetRegisterClass *G8RC = &PPC::G8RCRegClass; const TargetRegisterClass *GPRC = &PPC::GPRCRegClass; - unsigned Reg = MF.getRegInfo().createVirtualRegister(LP64 ? G8RC : GPRC); - unsigned SrcReg = MI.getOperand(0).getReg(); + Register Reg = MF.getRegInfo().createVirtualRegister(LP64 ? G8RC : GPRC); + Register SrcReg = MI.getOperand(0).getReg(); // We need to store the CR in the low 4-bits of the saved value. First, issue // an MFOCRF to save all of the CRBits and, if needed, kill the SrcReg. 
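Earlier in this diff, PPCReduceCRLogicals::handleCROp changes from taking a CRLogicalOpInfo reference to taking an index into AllCRLogicalOps and copying the entry, because, as the updated comment notes, the vector may grow while being operated on, and a push_back that reallocates the storage would leave a reference into the old buffer dangling. A minimal, generic illustration of that hazard and the indexed fix (plain std::vector, not the pass's actual types) might be:

#include <cstddef>
#include <vector>

struct OpInfo {
  int Kind = 0;
};

// Unsafe shape: "void handle(OpInfo &Item)" called as handle(Ops[I]) may leave
// Item dangling if the callee appends to Ops and the vector reallocates.
//
// Safe shape used by the patch: take the index, copy the element up front,
// and only then mutate the container.
static void handleByIndex(std::vector<OpInfo> &Ops, size_t Idx) {
  OpInfo Item = Ops[Idx];      // copy; immune to later reallocation
  if (Item.Kind == 0)
    Ops.push_back(OpInfo{1});  // growing the vector is now harmless
}

int main() {
  std::vector<OpInfo> Ops(1);
  // Mirror the driver loop in the pass: indexed, because Ops may grow.
  for (size_t I = 0; I < Ops.size(); ++I)
    handleByIndex(Ops, I);
  return Ops.size() == 2 ? 0 : 1;
}

Copying the element up front mirrors the "CRLogicalOpInfo CRI = AllCRLogicalOps[Idx];" line in the patch: later appends can no longer invalidate what the handler is reading.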
@@ -700,8 +699,8 @@ void PPCRegisterInfo::lowerCRRestore(MachineBasicBlock::iterator II, const TargetRegisterClass *G8RC = &PPC::G8RCRegClass; const TargetRegisterClass *GPRC = &PPC::GPRCRegClass; - unsigned Reg = MF.getRegInfo().createVirtualRegister(LP64 ? G8RC : GPRC); - unsigned DestReg = MI.getOperand(0).getReg(); + Register Reg = MF.getRegInfo().createVirtualRegister(LP64 ? G8RC : GPRC); + Register DestReg = MI.getOperand(0).getReg(); assert(MI.definesRegister(DestReg) && "RESTORE_CR does not define its destination"); @@ -744,8 +743,8 @@ void PPCRegisterInfo::lowerCRBitSpilling(MachineBasicBlock::iterator II, const TargetRegisterClass *G8RC = &PPC::G8RCRegClass; const TargetRegisterClass *GPRC = &PPC::GPRCRegClass; - unsigned Reg = MF.getRegInfo().createVirtualRegister(LP64 ? G8RC : GPRC); - unsigned SrcReg = MI.getOperand(0).getReg(); + Register Reg = MF.getRegInfo().createVirtualRegister(LP64 ? G8RC : GPRC); + Register SrcReg = MI.getOperand(0).getReg(); // Search up the BB to find the definition of the CR bit. MachineBasicBlock::reverse_iterator Ins; @@ -823,8 +822,8 @@ void PPCRegisterInfo::lowerCRBitRestore(MachineBasicBlock::iterator II, const TargetRegisterClass *G8RC = &PPC::G8RCRegClass; const TargetRegisterClass *GPRC = &PPC::GPRCRegClass; - unsigned Reg = MF.getRegInfo().createVirtualRegister(LP64 ? G8RC : GPRC); - unsigned DestReg = MI.getOperand(0).getReg(); + Register Reg = MF.getRegInfo().createVirtualRegister(LP64 ? G8RC : GPRC); + Register DestReg = MI.getOperand(0).getReg(); assert(MI.definesRegister(DestReg) && "RESTORE_CRBIT does not define its destination"); @@ -833,7 +832,7 @@ void PPCRegisterInfo::lowerCRBitRestore(MachineBasicBlock::iterator II, BuildMI(MBB, II, dl, TII.get(TargetOpcode::IMPLICIT_DEF), DestReg); - unsigned RegO = MF.getRegInfo().createVirtualRegister(LP64 ? G8RC : GPRC); + Register RegO = MF.getRegInfo().createVirtualRegister(LP64 ? G8RC : GPRC); BuildMI(MBB, II, dl, TII.get(LP64 ? 
PPC::MFOCRF8 : PPC::MFOCRF), RegO) .addReg(getCRFromCRBit(DestReg)); @@ -870,8 +869,8 @@ void PPCRegisterInfo::lowerVRSAVESpilling(MachineBasicBlock::iterator II, DebugLoc dl = MI.getDebugLoc(); const TargetRegisterClass *GPRC = &PPC::GPRCRegClass; - unsigned Reg = MF.getRegInfo().createVirtualRegister(GPRC); - unsigned SrcReg = MI.getOperand(0).getReg(); + Register Reg = MF.getRegInfo().createVirtualRegister(GPRC); + Register SrcReg = MI.getOperand(0).getReg(); BuildMI(MBB, II, dl, TII.get(PPC::MFVRSAVEv), Reg) .addReg(SrcReg, getKillRegState(MI.getOperand(0).isKill())); @@ -896,8 +895,8 @@ void PPCRegisterInfo::lowerVRSAVERestore(MachineBasicBlock::iterator II, DebugLoc dl = MI.getDebugLoc(); const TargetRegisterClass *GPRC = &PPC::GPRCRegClass; - unsigned Reg = MF.getRegInfo().createVirtualRegister(GPRC); - unsigned DestReg = MI.getOperand(0).getReg(); + Register Reg = MF.getRegInfo().createVirtualRegister(GPRC); + Register DestReg = MI.getOperand(0).getReg(); assert(MI.definesRegister(DestReg) && "RESTORE_VRSAVE does not define its destination"); @@ -1128,7 +1127,7 @@ PPCRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, OperandBase = OffsetOperandNo; } - unsigned StackReg = MI.getOperand(FIOperandNum).getReg(); + Register StackReg = MI.getOperand(FIOperandNum).getReg(); MI.getOperand(OperandBase).ChangeToRegister(StackReg, false); MI.getOperand(OperandBase + 1).ChangeToRegister(SReg, false, false, true); } diff --git a/lib/Target/PowerPC/PPCRegisterInfo.td b/lib/Target/PowerPC/PPCRegisterInfo.td index af0dff6347a6..4719e947b172 100644 --- a/lib/Target/PowerPC/PPCRegisterInfo.td +++ b/lib/Target/PowerPC/PPCRegisterInfo.td @@ -253,15 +253,14 @@ def RM: PPCReg<"**ROUNDING MODE**">; /// Register classes // Allocate volatiles first // then nonvolatiles in reverse order since stmw/lmw save from rN to r31 -def GPRC : RegisterClass<"PPC", [i32], 32, (add (sequence "R%u", 2, 12), - (sequence "R%u", 30, 13), - R31, R0, R1, FP, BP)> { +def GPRC : RegisterClass<"PPC", [i32,f32], 32, (add (sequence "R%u", 2, 12), + (sequence "R%u", 30, 13), + R31, R0, R1, FP, BP)> { // On non-Darwin PPC64 systems, R2 can be allocated, but must be restored, so // put it at the end of the list. let AltOrders = [(add (sub GPRC, R2), R2)]; let AltOrderSelect = [{ - const PPCSubtarget &S = MF.getSubtarget(); - return S.isPPC64() && S.isSVR4ABI(); + return MF.getSubtarget().is64BitELFABI(); }]; } @@ -272,21 +271,19 @@ def G8RC : RegisterClass<"PPC", [i64], 64, (add (sequence "X%u", 2, 12), // put it at the end of the list. let AltOrders = [(add (sub G8RC, X2), X2)]; let AltOrderSelect = [{ - const PPCSubtarget &S = MF.getSubtarget(); - return S.isPPC64() && S.isSVR4ABI(); + return MF.getSubtarget().is64BitELFABI(); }]; } // For some instructions r0 is special (representing the value 0 instead of // the value in the r0 register), and we use these register subclasses to // prevent r0 from being allocated for use by those instructions. -def GPRC_NOR0 : RegisterClass<"PPC", [i32], 32, (add (sub GPRC, R0), ZERO)> { +def GPRC_NOR0 : RegisterClass<"PPC", [i32,f32], 32, (add (sub GPRC, R0), ZERO)> { // On non-Darwin PPC64 systems, R2 can be allocated, but must be restored, so // put it at the end of the list. 
let AltOrders = [(add (sub GPRC_NOR0, R2), R2)]; let AltOrderSelect = [{ - const PPCSubtarget &S = MF.getSubtarget(); - return S.isPPC64() && S.isSVR4ABI(); + return MF.getSubtarget().is64BitELFABI(); }]; } @@ -295,8 +292,7 @@ def G8RC_NOX0 : RegisterClass<"PPC", [i64], 64, (add (sub G8RC, X0), ZERO8)> { // put it at the end of the list. let AltOrders = [(add (sub G8RC_NOX0, X2), X2)]; let AltOrderSelect = [{ - const PPCSubtarget &S = MF.getSubtarget(); - return S.isPPC64() && S.isSVR4ABI(); + return MF.getSubtarget().is64BitELFABI(); }]; } @@ -304,8 +300,6 @@ def SPERC : RegisterClass<"PPC", [f64], 64, (add (sequence "S%u", 2, 12), (sequence "S%u", 30, 13), S31, S0, S1)>; -def SPE4RC : RegisterClass<"PPC", [f32], 32, (add GPRC)>; - // Allocate volatiles first, then non-volatiles in reverse order. With the SVR4 // ABI the size of the Floating-point register save area is determined by the // allocated non-volatile register with the lowest register number, as FP diff --git a/lib/Target/PowerPC/PPCSubtarget.cpp b/lib/Target/PowerPC/PPCSubtarget.cpp index 6aa7528634d3..10568ed4b655 100644 --- a/lib/Target/PowerPC/PPCSubtarget.cpp +++ b/lib/Target/PowerPC/PPCSubtarget.cpp @@ -60,7 +60,7 @@ PPCSubtarget::PPCSubtarget(const Triple &TT, const std::string &CPU, InstrInfo(*this), TLInfo(TM, *this) {} void PPCSubtarget::initializeEnvironment() { - StackAlignment = 16; + StackAlignment = Align(16); DarwinDirective = PPC::DIR_NONE; HasMFOCRF = false; Has64BitSupport = false; @@ -145,7 +145,8 @@ void PPCSubtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) { if (isDarwin()) HasLazyResolverStubs = true; - if (TargetTriple.isOSNetBSD() || TargetTriple.isOSOpenBSD() || + if ((TargetTriple.isOSFreeBSD() && TargetTriple.getOSMajorVersion() >= 13) || + TargetTriple.isOSNetBSD() || TargetTriple.isOSOpenBSD() || TargetTriple.isMusl()) SecurePlt = true; @@ -228,18 +229,13 @@ bool PPCSubtarget::enableSubRegLiveness() const { return UseSubRegLiveness; } -unsigned char -PPCSubtarget::classifyGlobalReference(const GlobalValue *GV) const { - // Note that currently we don't generate non-pic references. - // If a caller wants that, this will have to be updated. - +bool PPCSubtarget::isGVIndirectSymbol(const GlobalValue *GV) const { // Large code model always uses the TOC even for local symbols. if (TM.getCodeModel() == CodeModel::Large) - return PPCII::MO_PIC_FLAG | PPCII::MO_NLP_FLAG; - + return true; if (TM.shouldAssumeDSOLocal(*GV->getParent(), GV)) - return PPCII::MO_PIC_FLAG; - return PPCII::MO_PIC_FLAG | PPCII::MO_NLP_FLAG; + return false; + return true; } bool PPCSubtarget::isELFv2ABI() const { return TM.isELFv2ABI(); } diff --git a/lib/Target/PowerPC/PPCSubtarget.h b/lib/Target/PowerPC/PPCSubtarget.h index 55fec1cb6d99..d96c2893aee9 100644 --- a/lib/Target/PowerPC/PPCSubtarget.h +++ b/lib/Target/PowerPC/PPCSubtarget.h @@ -78,7 +78,7 @@ protected: /// stackAlignment - The minimum alignment known to hold of the stack frame on /// entry to the function and which must be maintained by every function. - unsigned StackAlignment; + Align StackAlignment; /// Selected instruction itineraries (one entry per itinerary class.) InstrItineraryData InstrItins; @@ -166,7 +166,7 @@ public: /// getStackAlignment - Returns the minimum alignment known to hold of the /// stack frame on entry to the function and which must be maintained by every /// function for this subtarget. 
- unsigned getStackAlignment() const { return StackAlignment; } + Align getStackAlignment() const { return StackAlignment; } /// getDarwinDirective - Returns the -m directive specified for the cpu. /// @@ -210,7 +210,11 @@ public: /// instructions, regardless of whether we are in 32-bit or 64-bit mode. bool has64BitSupport() const { return Has64BitSupport; } // useSoftFloat - Return true if soft-float option is turned on. - bool useSoftFloat() const { return !HasHardFloat; } + bool useSoftFloat() const { + if (isAIXABI() && !HasHardFloat) + report_fatal_error("soft-float is not yet supported on AIX."); + return !HasHardFloat; + } /// use64BitRegs - Return true if in 64-bit mode or if we should use 64-bit /// registers in 32-bit mode when possible. This can only true if @@ -277,11 +281,11 @@ public: bool hasDirectMove() const { return HasDirectMove; } bool isQPXStackUnaligned() const { return IsQPXStackUnaligned; } - unsigned getPlatformStackAlignment() const { + Align getPlatformStackAlignment() const { if ((hasQPX() || isBGQ()) && !isQPXStackUnaligned()) - return 32; + return Align(32); - return 16; + return Align(16); } // DarwinABI has a 224-byte red zone. PPC32 SVR4ABI(Non-DarwinABI) has no @@ -316,6 +320,9 @@ public: bool isSVR4ABI() const { return !isDarwinABI() && !isAIXABI(); } bool isELFv2ABI() const; + bool is64BitELFABI() const { return isSVR4ABI() && isPPC64(); } + bool is32BitELFABI() const { return isSVR4ABI() && !isPPC64(); } + /// Originally, this function return hasISEL(). Now we always enable it, /// but may expand the ISEL instruction later. bool enableEarlyIfConversion() const override { return true; } @@ -337,9 +344,8 @@ public: bool enableSubRegLiveness() const override; - /// classifyGlobalReference - Classify a global variable reference for the - /// current subtarget accourding to how we should reference it. - unsigned char classifyGlobalReference(const GlobalValue *GV) const; + /// True if the GV will be accessed via an indirect symbol. + bool isGVIndirectSymbol(const GlobalValue *GV) const; bool isXRaySupported() const override { return IsPPC64 && IsLittleEndian; } }; diff --git a/lib/Target/PowerPC/PPCTLSDynamicCall.cpp b/lib/Target/PowerPC/PPCTLSDynamicCall.cpp index fb826c4a32f1..8f313d9d01c4 100644 --- a/lib/Target/PowerPC/PPCTLSDynamicCall.cpp +++ b/lib/Target/PowerPC/PPCTLSDynamicCall.cpp @@ -74,8 +74,8 @@ protected: LLVM_DEBUG(dbgs() << "TLS Dynamic Call Fixup:\n " << MI); - unsigned OutReg = MI.getOperand(0).getReg(); - unsigned InReg = MI.getOperand(1).getReg(); + Register OutReg = MI.getOperand(0).getReg(); + Register InReg = MI.getOperand(1).getReg(); DebugLoc DL = MI.getDebugLoc(); unsigned GPR3 = Is64Bit ? PPC::X3 : PPC::R3; unsigned Opc1, Opc2; diff --git a/lib/Target/PowerPC/PPCTOCRegDeps.cpp b/lib/Target/PowerPC/PPCTOCRegDeps.cpp index 3eb0569fb955..895ae6744421 100644 --- a/lib/Target/PowerPC/PPCTOCRegDeps.cpp +++ b/lib/Target/PowerPC/PPCTOCRegDeps.cpp @@ -95,7 +95,8 @@ namespace { protected: bool hasTOCLoReloc(const MachineInstr &MI) { if (MI.getOpcode() == PPC::LDtocL || - MI.getOpcode() == PPC::ADDItocL) + MI.getOpcode() == PPC::ADDItocL || + MI.getOpcode() == PPC::LWZtocL) return true; for (const MachineOperand &MO : MI.operands()) { @@ -109,11 +110,15 @@ protected: bool processBlock(MachineBasicBlock &MBB) { bool Changed = false; + const bool isPPC64 = + MBB.getParent()->getSubtarget().isPPC64(); + const unsigned TOCReg = isPPC64 ? 
PPC::X2 : PPC::R2; + for (auto &MI : MBB) { if (!hasTOCLoReloc(MI)) continue; - MI.addOperand(MachineOperand::CreateReg(PPC::X2, + MI.addOperand(MachineOperand::CreateReg(TOCReg, false /*IsDef*/, true /*IsImp*/)); Changed = true; diff --git a/lib/Target/PowerPC/PPCTargetMachine.cpp b/lib/Target/PowerPC/PPCTargetMachine.cpp index ce00f848dd72..abefee8b339d 100644 --- a/lib/Target/PowerPC/PPCTargetMachine.cpp +++ b/lib/Target/PowerPC/PPCTargetMachine.cpp @@ -93,7 +93,7 @@ EnableMachineCombinerPass("ppc-machine-combiner", static cl::opt ReduceCRLogical("ppc-reduce-cr-logicals", cl::desc("Expand eligible cr-logical binary ops to branches"), - cl::init(false), cl::Hidden); + cl::init(true), cl::Hidden); extern "C" void LLVMInitializePowerPCTarget() { // Register the targets RegisterTargetMachine A(getThePPC32Target()); @@ -185,12 +185,13 @@ static std::string computeFSAdditions(StringRef FS, CodeGenOpt::Level OL, } static std::unique_ptr createTLOF(const Triple &TT) { - // If it isn't a Mach-O file then it's going to be a linux ELF - // object file. if (TT.isOSDarwin()) - return llvm::make_unique(); + return std::make_unique(); + + if (TT.isOSAIX()) + return std::make_unique(); - return llvm::make_unique(); + return std::make_unique(); } static PPCTargetMachine::PPCABI computeTargetABI(const Triple &TT, @@ -248,10 +249,19 @@ static CodeModel::Model getEffectivePPCCodeModel(const Triple &TT, report_fatal_error("Target does not support the kernel CodeModel", false); return *CM; } - if (!TT.isOSDarwin() && !JIT && - (TT.getArch() == Triple::ppc64 || TT.getArch() == Triple::ppc64le)) - return CodeModel::Medium; - return CodeModel::Small; + + if (JIT) + return CodeModel::Small; + if (TT.isOSAIX()) + return CodeModel::Small; + + assert(TT.isOSBinFormatELF() && "All remaining PPC OSes are ELF based."); + + if (TT.isArch32Bit()) + return CodeModel::Small; + + assert(TT.isArch64Bit() && "Unsupported PPC architecture."); + return CodeModel::Medium; } @@ -259,8 +269,8 @@ static ScheduleDAGInstrs *createPPCMachineScheduler(MachineSchedContext *C) { const PPCSubtarget &ST = C->MF->getSubtarget(); ScheduleDAGMILive *DAG = new ScheduleDAGMILive(C, ST.usePPCPreRASchedStrategy() ? - llvm::make_unique(C) : - llvm::make_unique(C)); + std::make_unique(C) : + std::make_unique(C)); // add DAG Mutations here. DAG->addMutation(createCopyConstrainDAGMutation(DAG->TII, DAG->TRI)); return DAG; @@ -271,8 +281,8 @@ static ScheduleDAGInstrs *createPPCPostMachineScheduler( const PPCSubtarget &ST = C->MF->getSubtarget(); ScheduleDAGMI *DAG = new ScheduleDAGMI(C, ST.usePPCPostRASchedStrategy() ? - llvm::make_unique(C) : - llvm::make_unique(C), true); + std::make_unique(C) : + std::make_unique(C), true); // add DAG Mutations here. return DAG; } @@ -328,7 +338,7 @@ PPCTargetMachine::getSubtargetImpl(const Function &F) const { // creation will depend on the TM and the code generation flags on the // function that reside in TargetOptions. resetTargetOptions(F); - I = llvm::make_unique( + I = std::make_unique( TargetTriple, CPU, // FIXME: It would be good to have the subtarget additions here // not necessary. 
Anything that turns them on/off (overrides) ends diff --git a/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/lib/Target/PowerPC/PPCTargetTransformInfo.cpp index ff3dfbfaca05..f51300c656aa 100644 --- a/lib/Target/PowerPC/PPCTargetTransformInfo.cpp +++ b/lib/Target/PowerPC/PPCTargetTransformInfo.cpp @@ -594,10 +594,37 @@ bool PPCTTIImpl::enableInterleavedAccessVectorization() { return true; } -unsigned PPCTTIImpl::getNumberOfRegisters(bool Vector) { - if (Vector && !ST->hasAltivec() && !ST->hasQPX()) - return 0; - return ST->hasVSX() ? 64 : 32; +unsigned PPCTTIImpl::getNumberOfRegisters(unsigned ClassID) const { + assert(ClassID == GPRRC || ClassID == FPRRC || + ClassID == VRRC || ClassID == VSXRC); + if (ST->hasVSX()) { + assert(ClassID == GPRRC || ClassID == VSXRC); + return ClassID == GPRRC ? 32 : 64; + } + assert(ClassID == GPRRC || ClassID == FPRRC || ClassID == VRRC); + return 32; +} + +unsigned PPCTTIImpl::getRegisterClassForType(bool Vector, Type *Ty) const { + if (Vector) + return ST->hasVSX() ? VSXRC : VRRC; + else if (Ty && Ty->getScalarType()->isFloatTy()) + return ST->hasVSX() ? VSXRC : FPRRC; + else + return GPRRC; +} + +const char* PPCTTIImpl::getRegisterClassName(unsigned ClassID) const { + + switch (ClassID) { + default: + llvm_unreachable("unknown register class"); + return "PPC::unknown register class"; + case GPRRC: return "PPC::GPRRC"; + case FPRRC: return "PPC::FPRRC"; + case VRRC: return "PPC::VRRC"; + case VSXRC: return "PPC::VSXRC"; + } } unsigned PPCTTIImpl::getRegisterBitWidth(bool Vector) const { @@ -613,7 +640,7 @@ unsigned PPCTTIImpl::getRegisterBitWidth(bool Vector) const { } -unsigned PPCTTIImpl::getCacheLineSize() { +unsigned PPCTTIImpl::getCacheLineSize() const { // Check first if the user specified a custom line size. if (CacheLineSize.getNumOccurrences() > 0) return CacheLineSize; @@ -628,7 +655,7 @@ unsigned PPCTTIImpl::getCacheLineSize() { return 64; } -unsigned PPCTTIImpl::getPrefetchDistance() { +unsigned PPCTTIImpl::getPrefetchDistance() const { // This seems like a reasonable default for the BG/Q (this pass is enabled, by // default, only on the BG/Q). return 300; @@ -752,6 +779,35 @@ int PPCTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) { return 0; return Cost; + + } else if (Val->getScalarType()->isIntegerTy() && Index != -1U) { + if (ST->hasP9Altivec()) { + if (ISD == ISD::INSERT_VECTOR_ELT) + // A move-to VSR and a permute/insert. Assume vector operation cost + // for both (cost will be 2x on P9). + return vectorCostAdjustment(2, Opcode, Val, nullptr); + + // It's an extract. Maybe we can do a cheap move-from VSR. + unsigned EltSize = Val->getScalarSizeInBits(); + if (EltSize == 64) { + unsigned MfvsrdIndex = ST->isLittleEndian() ? 1 : 0; + if (Index == MfvsrdIndex) + return 1; + } else if (EltSize == 32) { + unsigned MfvsrwzIndex = ST->isLittleEndian() ? 2 : 1; + if (Index == MfvsrwzIndex) + return 1; + } + + // We need a vector extract (or mfvsrld). Assume vector operation cost. + // The cost of the load constant for a vector extract is disregarded + // (invariant, easily schedulable). + return vectorCostAdjustment(1, Opcode, Val, nullptr); + + } else if (ST->hasDirectMove()) + // Assume permute has standard cost. + // Assume move-to/move-from VSR have 2x standard cost. + return 3; } // Estimated cost of a load-hit-store delay. 
This was obtained diff --git a/lib/Target/PowerPC/PPCTargetTransformInfo.h b/lib/Target/PowerPC/PPCTargetTransformInfo.h index 5d76ee418b69..83a70364bf68 100644 --- a/lib/Target/PowerPC/PPCTargetTransformInfo.h +++ b/lib/Target/PowerPC/PPCTargetTransformInfo.h @@ -72,10 +72,16 @@ public: TTI::MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const; bool enableInterleavedAccessVectorization(); - unsigned getNumberOfRegisters(bool Vector); + + enum PPCRegisterClass { + GPRRC, FPRRC, VRRC, VSXRC + }; + unsigned getNumberOfRegisters(unsigned ClassID) const; + unsigned getRegisterClassForType(bool Vector, Type *Ty = nullptr) const; + const char* getRegisterClassName(unsigned ClassID) const; unsigned getRegisterBitWidth(bool Vector) const; - unsigned getCacheLineSize(); - unsigned getPrefetchDistance(); + unsigned getCacheLineSize() const override; + unsigned getPrefetchDistance() const override; unsigned getMaxInterleaveFactor(unsigned VF); int vectorCostAdjustment(int Cost, unsigned Opcode, Type *Ty1, Type *Ty2); int getArithmeticInstrCost( diff --git a/lib/Target/PowerPC/PPCVSXCopy.cpp b/lib/Target/PowerPC/PPCVSXCopy.cpp index 719ed7b63878..3463bbbdc5f0 100644 --- a/lib/Target/PowerPC/PPCVSXCopy.cpp +++ b/lib/Target/PowerPC/PPCVSXCopy.cpp @@ -50,7 +50,7 @@ namespace { bool IsRegInClass(unsigned Reg, const TargetRegisterClass *RC, MachineRegisterInfo &MRI) { - if (TargetRegisterInfo::isVirtualRegister(Reg)) { + if (Register::isVirtualRegister(Reg)) { return RC->hasSubClassEq(MRI.getRegClass(Reg)); } else if (RC->contains(Reg)) { return true; @@ -102,7 +102,7 @@ protected: IsVSFReg(SrcMO.getReg(), MRI)) && "Unknown source for a VSX copy"); - unsigned NewVReg = MRI.createVirtualRegister(SrcRC); + Register NewVReg = MRI.createVirtualRegister(SrcRC); BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::SUBREG_TO_REG), NewVReg) .addImm(1) // add 1, not 0, because there is no implicit clearing @@ -124,7 +124,7 @@ protected: "Unknown destination for a VSX copy"); // Copy the VSX value into a new VSX register of the correct subclass. - unsigned NewVReg = MRI.createVirtualRegister(DstRC); + Register NewVReg = MRI.createVirtualRegister(DstRC); BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY), NewVReg) .add(SrcMO); diff --git a/lib/Target/PowerPC/PPCVSXFMAMutate.cpp b/lib/Target/PowerPC/PPCVSXFMAMutate.cpp index ce78239df0a8..5e150be544ed 100644 --- a/lib/Target/PowerPC/PPCVSXFMAMutate.cpp +++ b/lib/Target/PowerPC/PPCVSXFMAMutate.cpp @@ -126,8 +126,8 @@ protected: if (!AddendMI->isFullCopy()) continue; - unsigned AddendSrcReg = AddendMI->getOperand(1).getReg(); - if (TargetRegisterInfo::isVirtualRegister(AddendSrcReg)) { + Register AddendSrcReg = AddendMI->getOperand(1).getReg(); + if (Register::isVirtualRegister(AddendSrcReg)) { if (MRI.getRegClass(AddendMI->getOperand(0).getReg()) != MRI.getRegClass(AddendSrcReg)) continue; @@ -182,12 +182,12 @@ protected: // %5 = A-form-op %5, %5, %11; // where %5 and %11 are both kills. This case would be skipped // otherwise. - unsigned OldFMAReg = MI.getOperand(0).getReg(); + Register OldFMAReg = MI.getOperand(0).getReg(); // Find one of the product operands that is killed by this instruction. 
unsigned KilledProdOp = 0, OtherProdOp = 0; - unsigned Reg2 = MI.getOperand(2).getReg(); - unsigned Reg3 = MI.getOperand(3).getReg(); + Register Reg2 = MI.getOperand(2).getReg(); + Register Reg3 = MI.getOperand(3).getReg(); if (LIS->getInterval(Reg2).Query(FMAIdx).isKill() && Reg2 != OldFMAReg) { KilledProdOp = 2; @@ -208,14 +208,14 @@ protected: // legality checks above, the live range for the addend source register // could be extended), but it seems likely that such a trivial copy can // be coalesced away later, and thus is not worth the effort. - if (TargetRegisterInfo::isVirtualRegister(AddendSrcReg) && + if (Register::isVirtualRegister(AddendSrcReg) && !LIS->getInterval(AddendSrcReg).liveAt(FMAIdx)) continue; // Transform: (O2 * O3) + O1 -> (O2 * O1) + O3. - unsigned KilledProdReg = MI.getOperand(KilledProdOp).getReg(); - unsigned OtherProdReg = MI.getOperand(OtherProdOp).getReg(); + Register KilledProdReg = MI.getOperand(KilledProdOp).getReg(); + Register OtherProdReg = MI.getOperand(OtherProdOp).getReg(); unsigned AddSubReg = AddendMI->getOperand(1).getSubReg(); unsigned KilledProdSubReg = MI.getOperand(KilledProdOp).getSubReg(); @@ -314,7 +314,7 @@ protected: // Extend the live interval of the addend source (it might end at the // copy to be removed, or somewhere in between there and here). This // is necessary only if it is a physical register. - if (!TargetRegisterInfo::isVirtualRegister(AddendSrcReg)) + if (!Register::isVirtualRegister(AddendSrcReg)) for (MCRegUnitIterator Units(AddendSrcReg, TRI); Units.isValid(); ++Units) { unsigned Unit = *Units; diff --git a/lib/Target/PowerPC/PPCVSXSwapRemoval.cpp b/lib/Target/PowerPC/PPCVSXSwapRemoval.cpp index 44175af7f9b6..c3729da0b07b 100644 --- a/lib/Target/PowerPC/PPCVSXSwapRemoval.cpp +++ b/lib/Target/PowerPC/PPCVSXSwapRemoval.cpp @@ -158,7 +158,7 @@ private: // Return true iff the given register is in the given class. bool isRegInClass(unsigned Reg, const TargetRegisterClass *RC) { - if (TargetRegisterInfo::isVirtualRegister(Reg)) + if (Register::isVirtualRegister(Reg)) return RC->hasSubClassEq(MRI->getRegClass(Reg)); return RC->contains(Reg); } @@ -253,7 +253,7 @@ bool PPCVSXSwapRemoval::gatherVectorInstructions() { for (const MachineOperand &MO : MI.operands()) { if (!MO.isReg()) continue; - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); if (isAnyVecReg(Reg, Partial)) { RelevantInstr = true; break; @@ -566,7 +566,7 @@ unsigned PPCVSXSwapRemoval::lookThruCopyLike(unsigned SrcReg, CopySrcReg = MI->getOperand(2).getReg(); } - if (!TargetRegisterInfo::isVirtualRegister(CopySrcReg)) { + if (!Register::isVirtualRegister(CopySrcReg)) { if (!isScalarVecReg(CopySrcReg)) SwapVector[VecIdx].MentionsPhysVR = 1; return CopySrcReg; @@ -601,11 +601,11 @@ void PPCVSXSwapRemoval::formWebs() { if (!MO.isReg()) continue; - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); if (!isVecReg(Reg) && !isScalarVecReg(Reg)) continue; - if (!TargetRegisterInfo::isVirtualRegister(Reg)) { + if (!Register::isVirtualRegister(Reg)) { if (!(MI->isCopy() && isScalarVecReg(Reg))) SwapVector[EntryIdx].MentionsPhysVR = 1; continue; @@ -667,7 +667,7 @@ void PPCVSXSwapRemoval::recordUnoptimizableWebs() { // than a swap instruction. else if (SwapVector[EntryIdx].IsLoad && SwapVector[EntryIdx].IsSwap) { MachineInstr *MI = SwapVector[EntryIdx].VSEMI; - unsigned DefReg = MI->getOperand(0).getReg(); + Register DefReg = MI->getOperand(0).getReg(); // We skip debug instructions in the analysis. 
(Note that debug // location information is still maintained by this optimization @@ -695,9 +695,9 @@ void PPCVSXSwapRemoval::recordUnoptimizableWebs() { // other than a swap instruction. } else if (SwapVector[EntryIdx].IsStore && SwapVector[EntryIdx].IsSwap) { MachineInstr *MI = SwapVector[EntryIdx].VSEMI; - unsigned UseReg = MI->getOperand(0).getReg(); + Register UseReg = MI->getOperand(0).getReg(); MachineInstr *DefMI = MRI->getVRegDef(UseReg); - unsigned DefReg = DefMI->getOperand(0).getReg(); + Register DefReg = DefMI->getOperand(0).getReg(); int DefIdx = SwapMap[DefMI]; if (!SwapVector[DefIdx].IsSwap || SwapVector[DefIdx].IsLoad || @@ -756,7 +756,7 @@ void PPCVSXSwapRemoval::markSwapsForRemoval() { if (!SwapVector[Repr].WebRejected) { MachineInstr *MI = SwapVector[EntryIdx].VSEMI; - unsigned DefReg = MI->getOperand(0).getReg(); + Register DefReg = MI->getOperand(0).getReg(); for (MachineInstr &UseMI : MRI->use_nodbg_instructions(DefReg)) { int UseIdx = SwapMap[&UseMI]; @@ -772,7 +772,7 @@ void PPCVSXSwapRemoval::markSwapsForRemoval() { if (!SwapVector[Repr].WebRejected) { MachineInstr *MI = SwapVector[EntryIdx].VSEMI; - unsigned UseReg = MI->getOperand(0).getReg(); + Register UseReg = MI->getOperand(0).getReg(); MachineInstr *DefMI = MRI->getVRegDef(UseReg); int DefIdx = SwapMap[DefMI]; SwapVector[DefIdx].WillRemove = 1; @@ -869,8 +869,8 @@ void PPCVSXSwapRemoval::handleSpecialSwappables(int EntryIdx) { Selector = 3 - Selector; MI->getOperand(3).setImm(Selector); - unsigned Reg1 = MI->getOperand(1).getReg(); - unsigned Reg2 = MI->getOperand(2).getReg(); + Register Reg1 = MI->getOperand(1).getReg(); + Register Reg2 = MI->getOperand(2).getReg(); MI->getOperand(1).setReg(Reg2); MI->getOperand(2).setReg(Reg1); @@ -894,9 +894,9 @@ void PPCVSXSwapRemoval::handleSpecialSwappables(int EntryIdx) { LLVM_DEBUG(dbgs() << "Changing SUBREG_TO_REG: "); LLVM_DEBUG(MI->dump()); - unsigned DstReg = MI->getOperand(0).getReg(); + Register DstReg = MI->getOperand(0).getReg(); const TargetRegisterClass *DstRC = MRI->getRegClass(DstReg); - unsigned NewVReg = MRI->createVirtualRegister(DstRC); + Register NewVReg = MRI->createVirtualRegister(DstRC); MI->getOperand(0).setReg(NewVReg); LLVM_DEBUG(dbgs() << " Into: "); @@ -910,8 +910,8 @@ void PPCVSXSwapRemoval::handleSpecialSwappables(int EntryIdx) { // prior to the swap, and from VSRC to VRRC following the swap. // Coalescing will usually remove all this mess. if (DstRC == &PPC::VRRCRegClass) { - unsigned VSRCTmp1 = MRI->createVirtualRegister(&PPC::VSRCRegClass); - unsigned VSRCTmp2 = MRI->createVirtualRegister(&PPC::VSRCRegClass); + Register VSRCTmp1 = MRI->createVirtualRegister(&PPC::VSRCRegClass); + Register VSRCTmp2 = MRI->createVirtualRegister(&PPC::VSRCRegClass); BuildMI(*MI->getParent(), InsertPoint, MI->getDebugLoc(), TII->get(PPC::COPY), VSRCTmp1) diff --git a/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp b/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp index 0172c6298772..300ba8dc675c 100644 --- a/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp +++ b/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp @@ -16,6 +16,7 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringSwitch.h" +#include "llvm/CodeGen/Register.h" #include "llvm/MC/MCAssembler.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCExpr.h" @@ -79,7 +80,7 @@ class RISCVAsmParser : public MCTargetAsmParser { // Helper to emit a combination of LUI, ADDI(W), and SLLI instructions that // synthesize the desired immedate value into the destination register. 
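The PowerPC hunks above, and most of the RISC-V hunks that follow, are a mechanical migration from `unsigned` register variables and the static `TargetRegisterInfo::isVirtualRegister` to the `llvm::Register` wrapper and its helpers. A minimal sketch of the resulting idiom, assuming a MachineInstr and its MachineRegisterInfo:

#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/Register.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"

// Sketch: Register converts implicitly to and from the raw unsigned
// encoding, so call sites change type but not behaviour.
static bool defIsVirtualAndInClass(const llvm::MachineInstr &MI,
                                   const llvm::MachineRegisterInfo &MRI,
                                   const llvm::TargetRegisterClass *RC) {
  llvm::Register DefReg = MI.getOperand(0).getReg();
  if (!llvm::Register::isVirtualRegister(DefReg))
    return RC->contains(DefReg); // physical registers keep the RC query
  return RC->hasSubClassEq(MRI.getRegClass(DefReg));
}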
- void emitLoadImm(unsigned DestReg, int64_t Value, MCStreamer &Out); + void emitLoadImm(Register DestReg, int64_t Value, MCStreamer &Out); // Helper to emit a combination of AUIPC and SecondOpcode. Used to implement // helpers such as emitLoadLocalAddress and emitLoadAddress. @@ -127,6 +128,7 @@ class RISCVAsmParser : public MCTargetAsmParser { OperandMatchResultTy parseRegister(OperandVector &Operands, bool AllowParens = false); OperandMatchResultTy parseMemOpBaseReg(OperandVector &Operands); + OperandMatchResultTy parseAtomicMemOp(OperandVector &Operands); OperandMatchResultTy parseOperandWithModifier(OperandVector &Operands); OperandMatchResultTy parseBareSymbol(OperandVector &Operands); OperandMatchResultTy parseCallSymbol(OperandVector &Operands); @@ -193,7 +195,7 @@ public: /// instruction struct RISCVOperand : public MCParsedAsmOperand { - enum KindTy { + enum class KindTy { Token, Register, Immediate, @@ -203,7 +205,7 @@ struct RISCVOperand : public MCParsedAsmOperand { bool IsRV64; struct RegOp { - unsigned RegNum; + Register RegNum; }; struct ImmOp { @@ -235,26 +237,26 @@ public: StartLoc = o.StartLoc; EndLoc = o.EndLoc; switch (Kind) { - case Register: + case KindTy::Register: Reg = o.Reg; break; - case Immediate: + case KindTy::Immediate: Imm = o.Imm; break; - case Token: + case KindTy::Token: Tok = o.Tok; break; - case SystemRegister: + case KindTy::SystemRegister: SysReg = o.SysReg; break; } } - bool isToken() const override { return Kind == Token; } - bool isReg() const override { return Kind == Register; } - bool isImm() const override { return Kind == Immediate; } + bool isToken() const override { return Kind == KindTy::Token; } + bool isReg() const override { return Kind == KindTy::Register; } + bool isImm() const override { return Kind == KindTy::Immediate; } bool isMem() const override { return false; } - bool isSystemRegister() const { return Kind == SystemRegister; } + bool isSystemRegister() const { return Kind == KindTy::SystemRegister; } static bool evaluateConstantImm(const MCExpr *Expr, int64_t &Imm, RISCVMCExpr::VariantKind &VK) { @@ -276,7 +278,7 @@ public: // modifiers and isShiftedInt(Op). template bool isBareSimmNLsb0() const { int64_t Imm; - RISCVMCExpr::VariantKind VK; + RISCVMCExpr::VariantKind VK = RISCVMCExpr::VK_RISCV_None; if (!isImm()) return false; bool IsConstantImm = evaluateConstantImm(getImm(), Imm, VK); @@ -292,7 +294,7 @@ public: bool isBareSymbol() const { int64_t Imm; - RISCVMCExpr::VariantKind VK; + RISCVMCExpr::VariantKind VK = RISCVMCExpr::VK_RISCV_None; // Must be of 'immediate' type but not a constant. if (!isImm() || evaluateConstantImm(getImm(), Imm, VK)) return false; @@ -302,7 +304,7 @@ public: bool isCallSymbol() const { int64_t Imm; - RISCVMCExpr::VariantKind VK; + RISCVMCExpr::VariantKind VK = RISCVMCExpr::VK_RISCV_None; // Must be of 'immediate' type but not a constant. if (!isImm() || evaluateConstantImm(getImm(), Imm, VK)) return false; @@ -313,7 +315,7 @@ public: bool isTPRelAddSymbol() const { int64_t Imm; - RISCVMCExpr::VariantKind VK; + RISCVMCExpr::VariantKind VK = RISCVMCExpr::VK_RISCV_None; // Must be of 'immediate' type but not a constant. 
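KindTy becomes a scoped enum presumably because one of its enumerators is named `Register`, which would otherwise shadow the `llvm::Register` type now used for `RegOp::RegNum` in the same class. A reduced sketch of the collision, using hypothetical stand-in types rather than the real parser classes:

namespace sketch {
class Register { // stand-in for llvm::Register
public:
  Register(unsigned R = 0) : Id(R) {}
  unsigned Id;
};

struct UnscopedOperand {
  enum Kind { Token, Register, Immediate } K; // injects `Register` as a value
  // Register RegNum;   // would not compile: `Register` now names the
                        // enumerator, not the class above
};

struct ScopedOperand {
  enum class Kind { Token, Register, Immediate } K;
  Register RegNum;      // OK: scoped enumerators do not leak into this scope
};
} // namespace sketch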
if (!isImm() || evaluateConstantImm(getImm(), Imm, VK)) return false; @@ -364,7 +366,7 @@ public: bool isImmXLenLI() const { int64_t Imm; - RISCVMCExpr::VariantKind VK; + RISCVMCExpr::VariantKind VK = RISCVMCExpr::VK_RISCV_None; if (!isImm()) return false; bool IsConstantImm = evaluateConstantImm(getImm(), Imm, VK); @@ -372,13 +374,13 @@ public: return true; // Given only Imm, ensuring that the actually specified constant is either // a signed or unsigned 64-bit number is unfortunately impossible. - bool IsInRange = isRV64() ? true : isInt<32>(Imm) || isUInt<32>(Imm); - return IsConstantImm && IsInRange && VK == RISCVMCExpr::VK_RISCV_None; + return IsConstantImm && VK == RISCVMCExpr::VK_RISCV_None && + (isRV64() || (isInt<32>(Imm) || isUInt<32>(Imm))); } bool isUImmLog2XLen() const { int64_t Imm; - RISCVMCExpr::VariantKind VK; + RISCVMCExpr::VariantKind VK = RISCVMCExpr::VK_RISCV_None; if (!isImm()) return false; if (!evaluateConstantImm(getImm(), Imm, VK) || @@ -389,7 +391,7 @@ public: bool isUImmLog2XLenNonZero() const { int64_t Imm; - RISCVMCExpr::VariantKind VK; + RISCVMCExpr::VariantKind VK = RISCVMCExpr::VK_RISCV_None; if (!isImm()) return false; if (!evaluateConstantImm(getImm(), Imm, VK) || @@ -402,7 +404,7 @@ public: bool isUImm5() const { int64_t Imm; - RISCVMCExpr::VariantKind VK; + RISCVMCExpr::VariantKind VK = RISCVMCExpr::VK_RISCV_None; if (!isImm()) return false; bool IsConstantImm = evaluateConstantImm(getImm(), Imm, VK); @@ -411,7 +413,7 @@ public: bool isUImm5NonZero() const { int64_t Imm; - RISCVMCExpr::VariantKind VK; + RISCVMCExpr::VariantKind VK = RISCVMCExpr::VK_RISCV_None; if (!isImm()) return false; bool IsConstantImm = evaluateConstantImm(getImm(), Imm, VK); @@ -422,7 +424,7 @@ public: bool isSImm6() const { if (!isImm()) return false; - RISCVMCExpr::VariantKind VK; + RISCVMCExpr::VariantKind VK = RISCVMCExpr::VK_RISCV_None; int64_t Imm; bool IsConstantImm = evaluateConstantImm(getImm(), Imm, VK); return IsConstantImm && isInt<6>(Imm) && @@ -432,7 +434,7 @@ public: bool isSImm6NonZero() const { if (!isImm()) return false; - RISCVMCExpr::VariantKind VK; + RISCVMCExpr::VariantKind VK = RISCVMCExpr::VK_RISCV_None; int64_t Imm; bool IsConstantImm = evaluateConstantImm(getImm(), Imm, VK); return IsConstantImm && isInt<6>(Imm) && (Imm != 0) && @@ -443,7 +445,7 @@ public: if (!isImm()) return false; int64_t Imm; - RISCVMCExpr::VariantKind VK; + RISCVMCExpr::VariantKind VK = RISCVMCExpr::VK_RISCV_None; bool IsConstantImm = evaluateConstantImm(getImm(), Imm, VK); return IsConstantImm && (Imm != 0) && (isUInt<5>(Imm) || (Imm >= 0xfffe0 && Imm <= 0xfffff)) && @@ -454,7 +456,7 @@ public: if (!isImm()) return false; int64_t Imm; - RISCVMCExpr::VariantKind VK; + RISCVMCExpr::VariantKind VK = RISCVMCExpr::VK_RISCV_None; bool IsConstantImm = evaluateConstantImm(getImm(), Imm, VK); return IsConstantImm && isShiftedUInt<5, 2>(Imm) && VK == RISCVMCExpr::VK_RISCV_None; @@ -464,7 +466,7 @@ public: if (!isImm()) return false; int64_t Imm; - RISCVMCExpr::VariantKind VK; + RISCVMCExpr::VariantKind VK = RISCVMCExpr::VK_RISCV_None; bool IsConstantImm = evaluateConstantImm(getImm(), Imm, VK); return IsConstantImm && isShiftedUInt<6, 2>(Imm) && VK == RISCVMCExpr::VK_RISCV_None; @@ -474,7 +476,7 @@ public: if (!isImm()) return false; int64_t Imm; - RISCVMCExpr::VariantKind VK; + RISCVMCExpr::VariantKind VK = RISCVMCExpr::VK_RISCV_None; bool IsConstantImm = evaluateConstantImm(getImm(), Imm, VK); return IsConstantImm && isShiftedUInt<5, 3>(Imm) && VK == RISCVMCExpr::VK_RISCV_None; @@ -486,7 
+488,7 @@ public: if (!isImm()) return false; int64_t Imm; - RISCVMCExpr::VariantKind VK; + RISCVMCExpr::VariantKind VK = RISCVMCExpr::VK_RISCV_None; bool IsConstantImm = evaluateConstantImm(getImm(), Imm, VK); return IsConstantImm && isShiftedUInt<6, 3>(Imm) && VK == RISCVMCExpr::VK_RISCV_None; @@ -496,14 +498,14 @@ public: if (!isImm()) return false; int64_t Imm; - RISCVMCExpr::VariantKind VK; + RISCVMCExpr::VariantKind VK = RISCVMCExpr::VK_RISCV_None; bool IsConstantImm = evaluateConstantImm(getImm(), Imm, VK); return IsConstantImm && isShiftedUInt<8, 2>(Imm) && (Imm != 0) && VK == RISCVMCExpr::VK_RISCV_None; } bool isSImm12() const { - RISCVMCExpr::VariantKind VK; + RISCVMCExpr::VariantKind VK = RISCVMCExpr::VK_RISCV_None; int64_t Imm; bool IsValid; if (!isImm()) @@ -527,14 +529,14 @@ public: if (!isImm()) return false; int64_t Imm; - RISCVMCExpr::VariantKind VK; + RISCVMCExpr::VariantKind VK = RISCVMCExpr::VK_RISCV_None; bool IsConstantImm = evaluateConstantImm(getImm(), Imm, VK); return IsConstantImm && (Imm != 0) && isShiftedInt<6, 4>(Imm) && VK == RISCVMCExpr::VK_RISCV_None; } bool isUImm20LUI() const { - RISCVMCExpr::VariantKind VK; + RISCVMCExpr::VariantKind VK = RISCVMCExpr::VK_RISCV_None; int64_t Imm; bool IsValid; if (!isImm()) @@ -552,7 +554,7 @@ public: } bool isUImm20AUIPC() const { - RISCVMCExpr::VariantKind VK; + RISCVMCExpr::VariantKind VK = RISCVMCExpr::VK_RISCV_None; int64_t Imm; bool IsValid; if (!isImm()) @@ -575,6 +577,15 @@ public: bool isSImm21Lsb0JAL() const { return isBareSimmNLsb0<21>(); } + bool isImmZero() const { + if (!isImm()) + return false; + int64_t Imm; + RISCVMCExpr::VariantKind VK = RISCVMCExpr::VK_RISCV_None; + bool IsConstantImm = evaluateConstantImm(getImm(), Imm, VK); + return IsConstantImm && (Imm == 0) && VK == RISCVMCExpr::VK_RISCV_None; + } + /// getStartLoc - Gets location of the first token of this operand SMLoc getStartLoc() const override { return StartLoc; } /// getEndLoc - Gets location of the last token of this operand @@ -583,38 +594,38 @@ public: bool isRV64() const { return IsRV64; } unsigned getReg() const override { - assert(Kind == Register && "Invalid type access!"); - return Reg.RegNum; + assert(Kind == KindTy::Register && "Invalid type access!"); + return Reg.RegNum.id(); } StringRef getSysReg() const { - assert(Kind == SystemRegister && "Invalid access!"); + assert(Kind == KindTy::SystemRegister && "Invalid access!"); return StringRef(SysReg.Data, SysReg.Length); } const MCExpr *getImm() const { - assert(Kind == Immediate && "Invalid type access!"); + assert(Kind == KindTy::Immediate && "Invalid type access!"); return Imm.Val; } StringRef getToken() const { - assert(Kind == Token && "Invalid type access!"); + assert(Kind == KindTy::Token && "Invalid type access!"); return Tok; } void print(raw_ostream &OS) const override { switch (Kind) { - case Immediate: + case KindTy::Immediate: OS << *getImm(); break; - case Register: + case KindTy::Register: OS << ""; break; - case Token: + case KindTy::Token: OS << "'" << getToken() << "'"; break; - case SystemRegister: + case KindTy::SystemRegister: OS << "'; break; } @@ -622,7 +633,7 @@ public: static std::unique_ptr createToken(StringRef Str, SMLoc S, bool IsRV64) { - auto Op = make_unique(Token); + auto Op = std::make_unique(KindTy::Token); Op->Tok = Str; Op->StartLoc = S; Op->EndLoc = S; @@ -632,7 +643,7 @@ public: static std::unique_ptr createReg(unsigned RegNo, SMLoc S, SMLoc E, bool IsRV64) { - auto Op = make_unique(Register); + auto Op = std::make_unique(KindTy::Register); 
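Each operand predicate now initializes its VariantKind out-parameter to VK_RISCV_None before calling evaluateConstantImm, presumably to guard the paths on which the helper returns without writing to it. A reduced sketch of the pattern, with hypothetical names rather than the real parser API:

#include <cstdint>

// Sketch only: `evalImm` mimics a helper that may return false without
// touching its out-parameters.
enum HypotheticalVariantKind { VK_None, VK_Lo, VK_Hi };

static bool evalImm(const int64_t *MaybeConst, int64_t &Imm,
                    HypotheticalVariantKind &VK) {
  if (!MaybeConst)
    return false;          // Imm and VK deliberately left untouched
  Imm = *MaybeConst;
  VK = VK_None;
  return true;
}

static bool isSmallPlainImm(const int64_t *MaybeConst) {
  int64_t Imm;
  HypotheticalVariantKind VK = VK_None; // initialized: no indeterminate read
  bool IsConstantImm = evalImm(MaybeConst, Imm, VK);
  return IsConstantImm && Imm >= 0 && Imm < 32 && VK == VK_None;
}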
Op->Reg.RegNum = RegNo; Op->StartLoc = S; Op->EndLoc = E; @@ -642,7 +653,7 @@ public: static std::unique_ptr createImm(const MCExpr *Val, SMLoc S, SMLoc E, bool IsRV64) { - auto Op = make_unique(Immediate); + auto Op = std::make_unique(KindTy::Immediate); Op->Imm.Val = Val; Op->StartLoc = S; Op->EndLoc = E; @@ -652,7 +663,7 @@ public: static std::unique_ptr createSysReg(StringRef Str, SMLoc S, unsigned Encoding, bool IsRV64) { - auto Op = make_unique(SystemRegister); + auto Op = std::make_unique(KindTy::SystemRegister); Op->SysReg.Data = Str.data(); Op->SysReg.Length = Str.size(); Op->SysReg.Encoding = Encoding; @@ -664,7 +675,7 @@ public: void addExpr(MCInst &Inst, const MCExpr *Expr) const { assert(Expr && "Expr shouldn't be null!"); int64_t Imm = 0; - RISCVMCExpr::VariantKind VK; + RISCVMCExpr::VariantKind VK = RISCVMCExpr::VK_RISCV_None; bool IsConstant = evaluateConstantImm(Expr, Imm, VK); if (IsConstant) @@ -730,46 +741,9 @@ public: #define GET_MATCHER_IMPLEMENTATION #include "RISCVGenAsmMatcher.inc" -// Return the matching FPR64 register for the given FPR32. -// FIXME: Ideally this function could be removed in favour of using -// information from TableGen. -unsigned convertFPR32ToFPR64(unsigned Reg) { - switch (Reg) { - default: - llvm_unreachable("Not a recognised FPR32 register"); - case RISCV::F0_32: return RISCV::F0_64; - case RISCV::F1_32: return RISCV::F1_64; - case RISCV::F2_32: return RISCV::F2_64; - case RISCV::F3_32: return RISCV::F3_64; - case RISCV::F4_32: return RISCV::F4_64; - case RISCV::F5_32: return RISCV::F5_64; - case RISCV::F6_32: return RISCV::F6_64; - case RISCV::F7_32: return RISCV::F7_64; - case RISCV::F8_32: return RISCV::F8_64; - case RISCV::F9_32: return RISCV::F9_64; - case RISCV::F10_32: return RISCV::F10_64; - case RISCV::F11_32: return RISCV::F11_64; - case RISCV::F12_32: return RISCV::F12_64; - case RISCV::F13_32: return RISCV::F13_64; - case RISCV::F14_32: return RISCV::F14_64; - case RISCV::F15_32: return RISCV::F15_64; - case RISCV::F16_32: return RISCV::F16_64; - case RISCV::F17_32: return RISCV::F17_64; - case RISCV::F18_32: return RISCV::F18_64; - case RISCV::F19_32: return RISCV::F19_64; - case RISCV::F20_32: return RISCV::F20_64; - case RISCV::F21_32: return RISCV::F21_64; - case RISCV::F22_32: return RISCV::F22_64; - case RISCV::F23_32: return RISCV::F23_64; - case RISCV::F24_32: return RISCV::F24_64; - case RISCV::F25_32: return RISCV::F25_64; - case RISCV::F26_32: return RISCV::F26_64; - case RISCV::F27_32: return RISCV::F27_64; - case RISCV::F28_32: return RISCV::F28_64; - case RISCV::F29_32: return RISCV::F29_64; - case RISCV::F30_32: return RISCV::F30_64; - case RISCV::F31_32: return RISCV::F31_64; - } +static Register convertFPR64ToFPR32(Register Reg) { + assert(Reg >= RISCV::F0_D && Reg <= RISCV::F31_D && "Invalid register"); + return Reg - RISCV::F0_D + RISCV::F0_F; } unsigned RISCVAsmParser::validateTargetOperandClass(MCParsedAsmOperand &AsmOp, @@ -778,17 +752,17 @@ unsigned RISCVAsmParser::validateTargetOperandClass(MCParsedAsmOperand &AsmOp, if (!Op.isReg()) return Match_InvalidOperand; - unsigned Reg = Op.getReg(); - bool IsRegFPR32 = - RISCVMCRegisterClasses[RISCV::FPR32RegClassID].contains(Reg); - bool IsRegFPR32C = - RISCVMCRegisterClasses[RISCV::FPR32CRegClassID].contains(Reg); + Register Reg = Op.getReg(); + bool IsRegFPR64 = + RISCVMCRegisterClasses[RISCV::FPR64RegClassID].contains(Reg); + bool IsRegFPR64C = + RISCVMCRegisterClasses[RISCV::FPR64CRegClassID].contains(Reg); // As the parser couldn't differentiate an FPR32 from 
an FPR64, coerce the - // register from FPR32 to FPR64 or FPR32C to FPR64C if necessary. - if ((IsRegFPR32 && Kind == MCK_FPR64) || - (IsRegFPR32C && Kind == MCK_FPR64C)) { - Op.Reg.RegNum = convertFPR32ToFPR64(Reg); + // register from FPR64 to FPR32 or FPR64C to FPR32C if necessary. + if ((IsRegFPR64 && Kind == MCK_FPR32) || + (IsRegFPR64C && Kind == MCK_FPR32C)) { + Op.Reg.RegNum = convertFPR64ToFPR32(Reg); return Match_Success; } return Match_InvalidOperand; @@ -853,6 +827,10 @@ bool RISCVAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, return generateImmOutOfRangeError(Operands, ErrorInfo, std::numeric_limits::min(), std::numeric_limits::max()); + case Match_InvalidImmZero: { + SMLoc ErrorLoc = ((RISCVOperand &)*Operands[ErrorInfo]).getStartLoc(); + return Error(ErrorLoc, "immediate must be zero"); + } case Match_InvalidUImmLog2XLen: if (isRV64()) return generateImmOutOfRangeError(Operands, ErrorInfo, 0, (1 << 6) - 1); @@ -968,14 +946,19 @@ bool RISCVAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, // alternative ABI names), setting RegNo to the matching register. Upon // failure, returns true and sets RegNo to 0. If IsRV32E then registers // x16-x31 will be rejected. -static bool matchRegisterNameHelper(bool IsRV32E, unsigned &RegNo, +static bool matchRegisterNameHelper(bool IsRV32E, Register &RegNo, StringRef Name) { RegNo = MatchRegisterName(Name); - if (RegNo == 0) + // The 32- and 64-bit FPRs have the same asm name. Check that the initial + // match always matches the 64-bit variant, and not the 32-bit one. + assert(!(RegNo >= RISCV::F0_F && RegNo <= RISCV::F31_F)); + // The default FPR register class is based on the tablegen enum ordering. + static_assert(RISCV::F0_D < RISCV::F0_F, "FPR matching must be updated"); + if (RegNo == RISCV::NoRegister) RegNo = MatchRegisterAltName(Name); if (IsRV32E && RegNo >= RISCV::X16 && RegNo <= RISCV::X31) - RegNo = 0; - return RegNo == 0; + RegNo = RISCV::NoRegister; + return RegNo == RISCV::NoRegister; } bool RISCVAsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc, @@ -986,7 +969,7 @@ bool RISCVAsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc, RegNo = 0; StringRef Name = getLexer().getTok().getIdentifier(); - if (matchRegisterNameHelper(isRV32E(), RegNo, Name)) + if (matchRegisterNameHelper(isRV32E(), (Register&)RegNo, Name)) return Error(StartLoc, "invalid register name"); getParser().Lex(); // Eat identifier token. 
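The convertFPR64ToFPR32 helper above (and the decoder rewrite further down) replaces per-register switch tables with plain enum arithmetic. This only works because TableGen emits each FPR bank as a contiguous, identically ordered run, which is what the accompanying assert and static_assert check. A reduced sketch with hypothetical enum values:

#include <cassert>

// Hypothetical stand-ins for the generated register enum; the real values
// come from RISCVGenRegisterInfo.inc and are only relied on to be contiguous
// and identically ordered within each bank.
enum HypotheticalReg : unsigned {
  F0_D = 100, F31_D = 131,   // 64-bit FPR bank
  F0_F = 132, F31_F = 163    // 32-bit FPR bank
};

static unsigned convertDToF(unsigned Reg) {
  assert(Reg >= F0_D && Reg <= F31_D && "expected a 64-bit FPR");
  return Reg - F0_D + F0_F;  // same architectural index, other bank
}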
@@ -1018,10 +1001,10 @@ OperandMatchResultTy RISCVAsmParser::parseRegister(OperandVector &Operands, return MatchOperand_NoMatch; case AsmToken::Identifier: StringRef Name = getLexer().getTok().getIdentifier(); - unsigned RegNo; + Register RegNo; matchRegisterNameHelper(isRV32E(), RegNo, Name); - if (RegNo == 0) { + if (RegNo == RISCV::NoRegister) { if (HadParens) getLexer().UnLex(LParen); return MatchOperand_NoMatch; @@ -1208,6 +1191,24 @@ OperandMatchResultTy RISCVAsmParser::parseBareSymbol(OperandVector &Operands) { Res = V; } else Res = MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_None, getContext()); + + MCBinaryExpr::Opcode Opcode; + switch (getLexer().getKind()) { + default: + Operands.push_back(RISCVOperand::createImm(Res, S, E, isRV64())); + return MatchOperand_Success; + case AsmToken::Plus: + Opcode = MCBinaryExpr::Add; + break; + case AsmToken::Minus: + Opcode = MCBinaryExpr::Sub; + break; + } + + const MCExpr *Expr; + if (getParser().parseExpression(Expr)) + return MatchOperand_ParseFail; + Res = MCBinaryExpr::create(Opcode, Res, Expr, getContext()); Operands.push_back(RISCVOperand::createImm(Res, S, E, isRV64())); return MatchOperand_Success; } @@ -1282,6 +1283,73 @@ RISCVAsmParser::parseMemOpBaseReg(OperandVector &Operands) { return MatchOperand_Success; } +OperandMatchResultTy RISCVAsmParser::parseAtomicMemOp(OperandVector &Operands) { + // Atomic operations such as lr.w, sc.w, and amo*.w accept a "memory operand" + // as one of their register operands, such as `(a0)`. This just denotes that + // the register (in this case `a0`) contains a memory address. + // + // Normally, we would be able to parse these by putting the parens into the + // instruction string. However, GNU as also accepts a zero-offset memory + // operand (such as `0(a0)`), and ignores the 0. Normally this would be parsed + // with parseImmediate followed by parseMemOpBaseReg, but these instructions + // do not accept an immediate operand, and we do not want to add a "dummy" + // operand that is silently dropped. + // + // Instead, we use this custom parser. This will: allow (and discard) an + // offset if it is zero; require (and discard) parentheses; and add only the + // parsed register operand to `Operands`. + // + // These operands are printed with RISCVInstPrinter::printAtomicMemOp, which + // will only print the register surrounded by parentheses (which GNU as also + // uses as its canonical representation for these operands). + std::unique_ptr OptionalImmOp; + + if (getLexer().isNot(AsmToken::LParen)) { + // Parse an Integer token. We do not accept arbritrary constant expressions + // in the offset field (because they may include parens, which complicates + // parsing a lot). + int64_t ImmVal; + SMLoc ImmStart = getLoc(); + if (getParser().parseIntToken(ImmVal, + "expected '(' or optional integer offset")) + return MatchOperand_ParseFail; + + // Create a RISCVOperand for checking later (so the error messages are + // nicer), but we don't add it to Operands. + SMLoc ImmEnd = getLoc(); + OptionalImmOp = + RISCVOperand::createImm(MCConstantExpr::create(ImmVal, getContext()), + ImmStart, ImmEnd, isRV64()); + } + + if (getLexer().isNot(AsmToken::LParen)) { + Error(getLoc(), OptionalImmOp ? 
"expected '(' after optional integer offset" + : "expected '(' or optional integer offset"); + return MatchOperand_ParseFail; + } + getParser().Lex(); // Eat '(' + + if (parseRegister(Operands) != MatchOperand_Success) { + Error(getLoc(), "expected register"); + return MatchOperand_ParseFail; + } + + if (getLexer().isNot(AsmToken::RParen)) { + Error(getLoc(), "expected ')'"); + return MatchOperand_ParseFail; + } + getParser().Lex(); // Eat ')' + + // Deferred Handling of non-zero offsets. This makes the error messages nicer. + if (OptionalImmOp && !OptionalImmOp->isImmZero()) { + Error(OptionalImmOp->getStartLoc(), "optional integer offset must be 0", + SMRange(OptionalImmOp->getStartLoc(), OptionalImmOp->getEndLoc())); + return MatchOperand_ParseFail; + } + + return MatchOperand_Success; +} + /// Looks at a token type and creates the relevant operand from this /// information, adding to Operands. If operand was parsed, returns false, else /// true. @@ -1523,12 +1591,12 @@ void RISCVAsmParser::emitToStreamer(MCStreamer &S, const MCInst &Inst) { S.EmitInstruction((Res ? CInst : Inst), getSTI()); } -void RISCVAsmParser::emitLoadImm(unsigned DestReg, int64_t Value, +void RISCVAsmParser::emitLoadImm(Register DestReg, int64_t Value, MCStreamer &Out) { RISCVMatInt::InstSeq Seq; RISCVMatInt::generateInstSeq(Value, isRV64(), Seq); - unsigned SrcReg = RISCV::X0; + Register SrcReg = RISCV::X0; for (RISCVMatInt::Inst &Inst : Seq) { if (Inst.Opc == RISCV::LUI) { emitToStreamer( @@ -1682,7 +1750,7 @@ bool RISCVAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc, default: break; case RISCV::PseudoLI: { - unsigned Reg = Inst.getOperand(0).getReg(); + Register Reg = Inst.getOperand(0).getReg(); const MCOperand &Op1 = Inst.getOperand(1); if (Op1.isExpr()) { // We must have li reg, %lo(sym) or li reg, %pcrel_lo(sym) or similar. diff --git a/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp b/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp index 36200c03f703..15943ba42156 100644 --- a/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp +++ b/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp @@ -13,6 +13,7 @@ #include "MCTargetDesc/RISCVMCTargetDesc.h" #include "TargetInfo/RISCVTargetInfo.h" #include "Utils/RISCVBaseInfo.h" +#include "llvm/CodeGen/Register.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCDisassembler/MCDisassembler.h" #include "llvm/MC/MCFixedLenDisassembler.h" @@ -56,17 +57,6 @@ extern "C" void LLVMInitializeRISCVDisassembler() { createRISCVDisassembler); } -static const unsigned GPRDecoderTable[] = { - RISCV::X0, RISCV::X1, RISCV::X2, RISCV::X3, - RISCV::X4, RISCV::X5, RISCV::X6, RISCV::X7, - RISCV::X8, RISCV::X9, RISCV::X10, RISCV::X11, - RISCV::X12, RISCV::X13, RISCV::X14, RISCV::X15, - RISCV::X16, RISCV::X17, RISCV::X18, RISCV::X19, - RISCV::X20, RISCV::X21, RISCV::X22, RISCV::X23, - RISCV::X24, RISCV::X25, RISCV::X26, RISCV::X27, - RISCV::X28, RISCV::X29, RISCV::X30, RISCV::X31 -}; - static DecodeStatus DecodeGPRRegisterClass(MCInst &Inst, uint64_t RegNo, uint64_t Address, const void *Decoder) { @@ -76,38 +66,21 @@ static DecodeStatus DecodeGPRRegisterClass(MCInst &Inst, uint64_t RegNo, .getFeatureBits(); bool IsRV32E = FeatureBits[RISCV::FeatureRV32E]; - if (RegNo > array_lengthof(GPRDecoderTable) || (IsRV32E && RegNo > 15)) + if (RegNo >= 32 || (IsRV32E && RegNo >= 16)) return MCDisassembler::Fail; - // We must define our own mapping from RegNo to register identifier. 
- // Accessing index RegNo in the register class will work in the case that - // registers were added in ascending order, but not in general. - unsigned Reg = GPRDecoderTable[RegNo]; + Register Reg = RISCV::X0 + RegNo; Inst.addOperand(MCOperand::createReg(Reg)); return MCDisassembler::Success; } -static const unsigned FPR32DecoderTable[] = { - RISCV::F0_32, RISCV::F1_32, RISCV::F2_32, RISCV::F3_32, - RISCV::F4_32, RISCV::F5_32, RISCV::F6_32, RISCV::F7_32, - RISCV::F8_32, RISCV::F9_32, RISCV::F10_32, RISCV::F11_32, - RISCV::F12_32, RISCV::F13_32, RISCV::F14_32, RISCV::F15_32, - RISCV::F16_32, RISCV::F17_32, RISCV::F18_32, RISCV::F19_32, - RISCV::F20_32, RISCV::F21_32, RISCV::F22_32, RISCV::F23_32, - RISCV::F24_32, RISCV::F25_32, RISCV::F26_32, RISCV::F27_32, - RISCV::F28_32, RISCV::F29_32, RISCV::F30_32, RISCV::F31_32 -}; - static DecodeStatus DecodeFPR32RegisterClass(MCInst &Inst, uint64_t RegNo, uint64_t Address, const void *Decoder) { - if (RegNo > array_lengthof(FPR32DecoderTable)) + if (RegNo >= 32) return MCDisassembler::Fail; - // We must define our own mapping from RegNo to register identifier. - // Accessing index RegNo in the register class will work in the case that - // registers were added in ascending order, but not in general. - unsigned Reg = FPR32DecoderTable[RegNo]; + Register Reg = RISCV::F0_F + RegNo; Inst.addOperand(MCOperand::createReg(Reg)); return MCDisassembler::Success; } @@ -115,35 +88,21 @@ static DecodeStatus DecodeFPR32RegisterClass(MCInst &Inst, uint64_t RegNo, static DecodeStatus DecodeFPR32CRegisterClass(MCInst &Inst, uint64_t RegNo, uint64_t Address, const void *Decoder) { - if (RegNo > 8) { + if (RegNo >= 8) { return MCDisassembler::Fail; } - unsigned Reg = FPR32DecoderTable[RegNo + 8]; + Register Reg = RISCV::F8_F + RegNo; Inst.addOperand(MCOperand::createReg(Reg)); return MCDisassembler::Success; } -static const unsigned FPR64DecoderTable[] = { - RISCV::F0_64, RISCV::F1_64, RISCV::F2_64, RISCV::F3_64, - RISCV::F4_64, RISCV::F5_64, RISCV::F6_64, RISCV::F7_64, - RISCV::F8_64, RISCV::F9_64, RISCV::F10_64, RISCV::F11_64, - RISCV::F12_64, RISCV::F13_64, RISCV::F14_64, RISCV::F15_64, - RISCV::F16_64, RISCV::F17_64, RISCV::F18_64, RISCV::F19_64, - RISCV::F20_64, RISCV::F21_64, RISCV::F22_64, RISCV::F23_64, - RISCV::F24_64, RISCV::F25_64, RISCV::F26_64, RISCV::F27_64, - RISCV::F28_64, RISCV::F29_64, RISCV::F30_64, RISCV::F31_64 -}; - static DecodeStatus DecodeFPR64RegisterClass(MCInst &Inst, uint64_t RegNo, uint64_t Address, const void *Decoder) { - if (RegNo > array_lengthof(FPR64DecoderTable)) + if (RegNo >= 32) return MCDisassembler::Fail; - // We must define our own mapping from RegNo to register identifier. - // Accessing index RegNo in the register class will work in the case that - // registers were added in ascending order, but not in general. 
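The decodeRVCInstr* helpers added just below reassemble the compressed-instruction immediates by hand before handing them to the generated immediate decoders. A standalone sketch of the simm6 case, with extractField standing in for the fieldFromInstruction helper emitted into RISCVGenDisassemblerTables.inc:

#include <cstdint>

// Sketch: the RVC simm6 field is split across the 16-bit encoding, bit 12
// carries imm[5] and bits 6..2 carry imm[4:0].
static uint64_t extractField(uint32_t Insn, unsigned Start, unsigned Width) {
  return (Insn >> Start) & ((1u << Width) - 1);
}

static uint64_t rvcSImm6Field(uint32_t Insn) {
  return extractField(Insn, 12, 1) << 5 | extractField(Insn, 2, 5);
}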
- unsigned Reg = FPR64DecoderTable[RegNo]; + Register Reg = RISCV::F0_D + RegNo; Inst.addOperand(MCOperand::createReg(Reg)); return MCDisassembler::Success; } @@ -151,10 +110,10 @@ static DecodeStatus DecodeFPR64RegisterClass(MCInst &Inst, uint64_t RegNo, static DecodeStatus DecodeFPR64CRegisterClass(MCInst &Inst, uint64_t RegNo, uint64_t Address, const void *Decoder) { - if (RegNo > 8) { + if (RegNo >= 8) { return MCDisassembler::Fail; } - unsigned Reg = FPR64DecoderTable[RegNo + 8]; + Register Reg = RISCV::F8_D + RegNo; Inst.addOperand(MCOperand::createReg(Reg)); return MCDisassembler::Success; } @@ -182,10 +141,10 @@ static DecodeStatus DecodeGPRNoX0X2RegisterClass(MCInst &Inst, uint64_t RegNo, static DecodeStatus DecodeGPRCRegisterClass(MCInst &Inst, uint64_t RegNo, uint64_t Address, const void *Decoder) { - if (RegNo > 8) + if (RegNo >= 8) return MCDisassembler::Fail; - unsigned Reg = GPRDecoderTable[RegNo + 8]; + Register Reg = RISCV::X8 + RegNo; Inst.addOperand(MCOperand::createReg(Reg)); return MCDisassembler::Success; } @@ -279,8 +238,80 @@ static DecodeStatus decodeFRMArg(MCInst &Inst, uint64_t Imm, return MCDisassembler::Success; } +static DecodeStatus decodeRVCInstrSImm(MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder); + +static DecodeStatus decodeRVCInstrRdSImm(MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder); + +static DecodeStatus decodeRVCInstrRdRs1UImm(MCInst &Inst, unsigned Insn, + uint64_t Address, + const void *Decoder); + +static DecodeStatus decodeRVCInstrRdRs2(MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder); + +static DecodeStatus decodeRVCInstrRdRs1Rs2(MCInst &Inst, unsigned Insn, + uint64_t Address, + const void *Decoder); + #include "RISCVGenDisassemblerTables.inc" +static DecodeStatus decodeRVCInstrSImm(MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder) { + uint64_t SImm6 = + fieldFromInstruction(Insn, 12, 1) << 5 | fieldFromInstruction(Insn, 2, 5); + DecodeStatus Result = decodeSImmOperand<6>(Inst, SImm6, Address, Decoder); + (void)Result; + assert(Result == MCDisassembler::Success && "Invalid immediate"); + return MCDisassembler::Success; +} + +static DecodeStatus decodeRVCInstrRdSImm(MCInst &Inst, unsigned Insn, + uint64_t Address, + const void *Decoder) { + DecodeGPRRegisterClass(Inst, 0, Address, Decoder); + uint64_t SImm6 = + fieldFromInstruction(Insn, 12, 1) << 5 | fieldFromInstruction(Insn, 2, 5); + DecodeStatus Result = decodeSImmOperand<6>(Inst, SImm6, Address, Decoder); + (void)Result; + assert(Result == MCDisassembler::Success && "Invalid immediate"); + return MCDisassembler::Success; +} + +static DecodeStatus decodeRVCInstrRdRs1UImm(MCInst &Inst, unsigned Insn, + uint64_t Address, + const void *Decoder) { + DecodeGPRRegisterClass(Inst, 0, Address, Decoder); + Inst.addOperand(Inst.getOperand(0)); + uint64_t UImm6 = + fieldFromInstruction(Insn, 12, 1) << 5 | fieldFromInstruction(Insn, 2, 5); + DecodeStatus Result = decodeUImmOperand<6>(Inst, UImm6, Address, Decoder); + (void)Result; + assert(Result == MCDisassembler::Success && "Invalid immediate"); + return MCDisassembler::Success; +} + +static DecodeStatus decodeRVCInstrRdRs2(MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder) { + unsigned Rd = fieldFromInstruction(Insn, 7, 5); + unsigned Rs2 = fieldFromInstruction(Insn, 2, 5); + DecodeGPRRegisterClass(Inst, Rd, Address, Decoder); + DecodeGPRRegisterClass(Inst, Rs2, Address, Decoder); + return MCDisassembler::Success; +} + +static DecodeStatus 
decodeRVCInstrRdRs1Rs2(MCInst &Inst, unsigned Insn, + uint64_t Address, + const void *Decoder) { + unsigned Rd = fieldFromInstruction(Insn, 7, 5); + unsigned Rs2 = fieldFromInstruction(Insn, 2, 5); + DecodeGPRRegisterClass(Inst, Rd, Address, Decoder); + Inst.addOperand(Inst.getOperand(0)); + DecodeGPRRegisterClass(Inst, Rs2, Address, Decoder); + return MCDisassembler::Success; +} + DecodeStatus RISCVDisassembler::getInstruction(MCInst &MI, uint64_t &Size, ArrayRef Bytes, uint64_t Address, diff --git a/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp b/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp index ee5f760ebcb0..f6b727ae37c7 100644 --- a/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp +++ b/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp @@ -30,9 +30,16 @@ bool RISCVAsmBackend::shouldForceRelocation(const MCAssembler &Asm, const MCValue &Target) { bool ShouldForce = false; - switch ((unsigned)Fixup.getKind()) { + switch (Fixup.getTargetKind()) { default: break; + case FK_Data_1: + case FK_Data_2: + case FK_Data_4: + case FK_Data_8: + if (Target.isAbsolute()) + return false; + break; case RISCV::fixup_riscv_got_hi20: case RISCV::fixup_riscv_tls_got_hi20: case RISCV::fixup_riscv_tls_gd_hi20: @@ -48,7 +55,7 @@ bool RISCVAsmBackend::shouldForceRelocation(const MCAssembler &Asm, return false; } - switch ((unsigned)T->getKind()) { + switch (T->getTargetKind()) { default: llvm_unreachable("Unexpected fixup kind for pcrel_lo12"); break; @@ -83,7 +90,7 @@ bool RISCVAsmBackend::fixupNeedsRelaxationAdvanced(const MCFixup &Fixup, return true; int64_t Offset = int64_t(Value); - switch ((unsigned)Fixup.getKind()) { + switch (Fixup.getTargetKind()) { default: return false; case RISCV::fixup_riscv_rvc_branch: @@ -174,8 +181,7 @@ bool RISCVAsmBackend::writeNopData(raw_ostream &OS, uint64_t Count) const { static uint64_t adjustFixupValue(const MCFixup &Fixup, uint64_t Value, MCContext &Ctx) { - unsigned Kind = Fixup.getKind(); - switch (Kind) { + switch (Fixup.getTargetKind()) { default: llvm_unreachable("Unknown fixup kind!"); case RISCV::fixup_riscv_got_hi20: @@ -186,6 +192,7 @@ static uint64_t adjustFixupValue(const MCFixup &Fixup, uint64_t Value, case FK_Data_2: case FK_Data_4: case FK_Data_8: + case FK_Data_6b: return Value; case RISCV::fixup_riscv_lo12_i: case RISCV::fixup_riscv_pcrel_lo12_i: diff --git a/lib/Target/RISCV/MCTargetDesc/RISCVELFObjectWriter.cpp b/lib/Target/RISCV/MCTargetDesc/RISCVELFObjectWriter.cpp index 3ccbc86d2619..cab2bbcb81bc 100644 --- a/lib/Target/RISCV/MCTargetDesc/RISCVELFObjectWriter.cpp +++ b/lib/Target/RISCV/MCTargetDesc/RISCVELFObjectWriter.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "MCTargetDesc/RISCVFixupKinds.h" +#include "MCTargetDesc/RISCVMCExpr.h" #include "MCTargetDesc/RISCVMCTargetDesc.h" #include "llvm/MC/MCELFObjectWriter.h" #include "llvm/MC/MCFixup.h" @@ -47,8 +48,9 @@ unsigned RISCVELFObjectWriter::getRelocType(MCContext &Ctx, const MCValue &Target, const MCFixup &Fixup, bool IsPCRel) const { + const MCExpr *Expr = Fixup.getValue(); // Determine the type of the relocation - unsigned Kind = Fixup.getKind(); + unsigned Kind = Fixup.getTargetKind(); if (IsPCRel) { switch (Kind) { default: @@ -87,6 +89,9 @@ unsigned RISCVELFObjectWriter::getRelocType(MCContext &Ctx, default: llvm_unreachable("invalid fixup kind!"); case FK_Data_4: + if (Expr->getKind() == MCExpr::Target && + cast(Expr)->getKind() == RISCVMCExpr::VK_RISCV_32_PCREL) + return ELF::R_RISCV_32_PCREL; return 
ELF::R_RISCV_32; case FK_Data_8: return ELF::R_RISCV_64; @@ -98,6 +103,8 @@ unsigned RISCVELFObjectWriter::getRelocType(MCContext &Ctx, return ELF::R_RISCV_ADD32; case FK_Data_Add_8: return ELF::R_RISCV_ADD64; + case FK_Data_Add_6b: + return ELF::R_RISCV_SET6; case FK_Data_Sub_1: return ELF::R_RISCV_SUB8; case FK_Data_Sub_2: @@ -106,6 +113,8 @@ unsigned RISCVELFObjectWriter::getRelocType(MCContext &Ctx, return ELF::R_RISCV_SUB32; case FK_Data_Sub_8: return ELF::R_RISCV_SUB64; + case FK_Data_Sub_6b: + return ELF::R_RISCV_SUB6; case RISCV::fixup_riscv_hi20: return ELF::R_RISCV_HI20; case RISCV::fixup_riscv_lo12_i: @@ -129,5 +138,5 @@ unsigned RISCVELFObjectWriter::getRelocType(MCContext &Ctx, std::unique_ptr llvm::createRISCVELFObjectWriter(uint8_t OSABI, bool Is64Bit) { - return llvm::make_unique(OSABI, Is64Bit); + return std::make_unique(OSABI, Is64Bit); } diff --git a/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp b/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp index fe37b70811d8..8b5fe6dd8252 100644 --- a/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp +++ b/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp @@ -39,6 +39,30 @@ static cl::opt cl::desc("Disable the emission of assembler pseudo instructions"), cl::init(false), cl::Hidden); +static cl::opt + ArchRegNames("riscv-arch-reg-names", + cl::desc("Print architectural register names rather than the " + "ABI names (such as x2 instead of sp)"), + cl::init(false), cl::Hidden); + +// The command-line flags above are used by llvm-mc and llc. They can be used by +// `llvm-objdump`, but we override their values here to handle options passed to +// `llvm-objdump` with `-M` (which matches GNU objdump). There did not seem to +// be an easier way to allow these options in all these tools, without doing it +// this way. +bool RISCVInstPrinter::applyTargetSpecificCLOption(StringRef Opt) { + if (Opt == "no-aliases") { + NoAliases = true; + return true; + } + if (Opt == "numeric") { + ArchRegNames = true; + return true; + } + + return false; +} + void RISCVInstPrinter::printInst(const MCInst *MI, raw_ostream &O, StringRef Annot, const MCSubtargetInfo &STI) { bool Res = false; @@ -112,3 +136,20 @@ void RISCVInstPrinter::printFRMArg(const MCInst *MI, unsigned OpNo, static_cast(MI->getOperand(OpNo).getImm()); O << RISCVFPRndMode::roundingModeToString(FRMArg); } + +void RISCVInstPrinter::printAtomicMemOp(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, + raw_ostream &O) { + const MCOperand &MO = MI->getOperand(OpNo); + + assert(MO.isReg() && "printAtomicMemOp can only print register operands"); + O << "("; + printRegName(O, MO.getReg()); + O << ")"; + return; +} + +const char *RISCVInstPrinter::getRegisterName(unsigned RegNo) { + return getRegisterName(RegNo, ArchRegNames ? 
RISCV::NoRegAltName + : RISCV::ABIRegAltName); +} diff --git a/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.h b/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.h index 5ca1d3fa20fe..189d72626f3e 100644 --- a/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.h +++ b/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.h @@ -25,6 +25,8 @@ public: const MCRegisterInfo &MRI) : MCInstPrinter(MAI, MII, MRI) {} + bool applyTargetSpecificCLOption(StringRef Opt) override; + void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot, const MCSubtargetInfo &STI) override; void printRegName(raw_ostream &O, unsigned RegNo) const override; @@ -37,6 +39,8 @@ public: const MCSubtargetInfo &STI, raw_ostream &O); void printFRMArg(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); + void printAtomicMemOp(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O); // Autogenerated by tblgen. void printInstruction(const MCInst *MI, const MCSubtargetInfo &STI, @@ -46,8 +50,8 @@ public: void printCustomAliasOperand(const MCInst *MI, unsigned OpIdx, unsigned PrintMethodIdx, const MCSubtargetInfo &STI, raw_ostream &O); - static const char *getRegisterName(unsigned RegNo, - unsigned AltIdx = RISCV::ABIRegAltName); + static const char *getRegisterName(unsigned RegNo); + static const char *getRegisterName(unsigned RegNo, unsigned AltIdx); }; } // namespace llvm diff --git a/lib/Target/RISCV/MCTargetDesc/RISCVMCAsmInfo.cpp b/lib/Target/RISCV/MCTargetDesc/RISCVMCAsmInfo.cpp index 983629692883..089a2def4c21 100644 --- a/lib/Target/RISCV/MCTargetDesc/RISCVMCAsmInfo.cpp +++ b/lib/Target/RISCV/MCTargetDesc/RISCVMCAsmInfo.cpp @@ -11,7 +11,10 @@ //===----------------------------------------------------------------------===// #include "RISCVMCAsmInfo.h" +#include "MCTargetDesc/RISCVMCExpr.h" #include "llvm/ADT/Triple.h" +#include "llvm/BinaryFormat/Dwarf.h" +#include "llvm/MC/MCStreamer.h" using namespace llvm; void RISCVMCAsmInfo::anchor() {} @@ -25,3 +28,20 @@ RISCVMCAsmInfo::RISCVMCAsmInfo(const Triple &TT) { Data16bitsDirective = "\t.half\t"; Data32bitsDirective = "\t.word\t"; } + +const MCExpr *RISCVMCAsmInfo::getExprForFDESymbol(const MCSymbol *Sym, + unsigned Encoding, + MCStreamer &Streamer) const { + if (!(Encoding & dwarf::DW_EH_PE_pcrel)) + return MCAsmInfo::getExprForFDESymbol(Sym, Encoding, Streamer); + + // The default symbol subtraction results in an ADD/SUB relocation pair. + // Processing this relocation pair is problematic when linker relaxation is + // enabled, so we follow binutils in using the R_RISCV_32_PCREL relocation + // for the FDE initial location. 
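getExprForFDESymbol only intervenes for pc-relative FDE pointer encodings, and the assert in the hunk additionally expects the sdata4 form that the 32_PCREL relocation can represent. A reduced sketch of that gating, using the DW_EH_PE_* constants from llvm/BinaryFormat/Dwarf.h:

#include "llvm/BinaryFormat/Dwarf.h"

// Sketch: mirror the checks above, pcrel is required and the pointer is
// expected to be encoded as a signed 4-byte datum.
static bool wantsRiscv32PcrelFDESymbol(unsigned Encoding) {
  if (!(Encoding & llvm::dwarf::DW_EH_PE_pcrel))
    return false; // generic MCAsmInfo handling applies
  return (Encoding & llvm::dwarf::DW_EH_PE_sdata4) != 0;
}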
+ MCContext &Ctx = Streamer.getContext(); + const MCExpr *ME = + MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_None, Ctx); + assert(Encoding & dwarf::DW_EH_PE_sdata4 && "Unexpected encoding"); + return RISCVMCExpr::create(ME, RISCVMCExpr::VK_RISCV_32_PCREL, Ctx); +} diff --git a/lib/Target/RISCV/MCTargetDesc/RISCVMCAsmInfo.h b/lib/Target/RISCV/MCTargetDesc/RISCVMCAsmInfo.h index 043fdb7c08c0..6824baf699aa 100644 --- a/lib/Target/RISCV/MCTargetDesc/RISCVMCAsmInfo.h +++ b/lib/Target/RISCV/MCTargetDesc/RISCVMCAsmInfo.h @@ -23,6 +23,9 @@ class RISCVMCAsmInfo : public MCAsmInfoELF { public: explicit RISCVMCAsmInfo(const Triple &TargetTriple); + + const MCExpr *getExprForFDESymbol(const MCSymbol *Sym, unsigned Encoding, + MCStreamer &Streamer) const override; }; } // namespace llvm diff --git a/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp b/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp index 0fc775f63ed4..de99960848a5 100644 --- a/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp +++ b/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp @@ -15,6 +15,7 @@ #include "MCTargetDesc/RISCVMCTargetDesc.h" #include "Utils/RISCVBaseInfo.h" #include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/Register.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCCodeEmitter.h" #include "llvm/MC/MCContext.h" @@ -100,7 +101,7 @@ void RISCVMCCodeEmitter::expandFunctionCall(const MCInst &MI, raw_ostream &OS, const MCSubtargetInfo &STI) const { MCInst TmpInst; MCOperand Func; - unsigned Ra; + Register Ra; if (MI.getOpcode() == RISCV::PseudoTAIL) { Func = MI.getOperand(0); Ra = RISCV::X6; @@ -266,6 +267,7 @@ unsigned RISCVMCCodeEmitter::getImmOpValue(const MCInst &MI, unsigned OpNo, switch (RVExpr->getKind()) { case RISCVMCExpr::VK_RISCV_None: case RISCVMCExpr::VK_RISCV_Invalid: + case RISCVMCExpr::VK_RISCV_32_PCREL: llvm_unreachable("Unhandled fixup kind!"); case RISCVMCExpr::VK_RISCV_TPREL_ADD: // tprel_add is only used to indicate that a relocation should be emitted diff --git a/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.h b/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.h index b5a292dc1b1a..921df376f3df 100644 --- a/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.h +++ b/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.h @@ -36,6 +36,7 @@ public: VK_RISCV_TLS_GD_HI, VK_RISCV_CALL, VK_RISCV_CALL_PLT, + VK_RISCV_32_PCREL, VK_RISCV_Invalid }; diff --git a/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.cpp b/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.cpp index bc45262ab2de..5a4c86e48f1e 100644 --- a/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.cpp +++ b/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.cpp @@ -16,7 +16,9 @@ #include "RISCVMCAsmInfo.h" #include "RISCVTargetStreamer.h" #include "TargetInfo/RISCVTargetInfo.h" +#include "Utils/RISCVBaseInfo.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/CodeGen/Register.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCRegisterInfo.h" @@ -52,7 +54,7 @@ static MCAsmInfo *createRISCVMCAsmInfo(const MCRegisterInfo &MRI, const Triple &TT) { MCAsmInfo *MAI = new RISCVMCAsmInfo(TT); - unsigned SP = MRI.getDwarfRegNum(RISCV::X2, true); + Register SP = MRI.getDwarfRegNum(RISCV::X2, true); MCCFIInstruction Inst = MCCFIInstruction::createDefCfa(nullptr, SP, 0); MAI->addInitialFrameState(Inst); diff --git a/lib/Target/RISCV/RISCV.h b/lib/Target/RISCV/RISCV.h index 834a1d171143..f23f742a4782 100644 --- a/lib/Target/RISCV/RISCV.h +++ b/lib/Target/RISCV/RISCV.h @@ -18,9 +18,12 @@ #include "llvm/Target/TargetMachine.h" namespace llvm 
{ +class RISCVRegisterBankInfo; +class RISCVSubtarget; class RISCVTargetMachine; class AsmPrinter; class FunctionPass; +class InstructionSelector; class MCInst; class MCOperand; class MachineInstr; @@ -39,6 +42,10 @@ void initializeRISCVMergeBaseOffsetOptPass(PassRegistry &); FunctionPass *createRISCVExpandPseudoPass(); void initializeRISCVExpandPseudoPass(PassRegistry &); + +InstructionSelector *createRISCVInstructionSelector(const RISCVTargetMachine &, + RISCVSubtarget &, + RISCVRegisterBankInfo &); } #endif diff --git a/lib/Target/RISCV/RISCV.td b/lib/Target/RISCV/RISCV.td index e19b70b8e709..46530a8f74a8 100644 --- a/lib/Target/RISCV/RISCV.td +++ b/lib/Target/RISCV/RISCV.td @@ -43,6 +43,11 @@ def FeatureStdExtC def HasStdExtC : Predicate<"Subtarget->hasStdExtC()">, AssemblerPredicate<"FeatureStdExtC">; +def FeatureRVCHints + : SubtargetFeature<"rvc-hints", "EnableRVCHintInstrs", "true", + "Enable RVC Hint Instructions.">; +def HasRVCHints : Predicate<"Subtarget->enableRVCHintInstrs()">, + AssemblerPredicate<"FeatureRVCHints">; def Feature64Bit : SubtargetFeature<"64bit", "HasRV64", "true", "Implements RV64">; @@ -77,14 +82,16 @@ include "RISCVSystemOperands.td" include "RISCVRegisterInfo.td" include "RISCVCallingConv.td" include "RISCVInstrInfo.td" +include "RISCVRegisterBanks.td" //===----------------------------------------------------------------------===// // RISC-V processors supported. //===----------------------------------------------------------------------===// -def : ProcessorModel<"generic-rv32", NoSchedModel, []>; +def : ProcessorModel<"generic-rv32", NoSchedModel, [FeatureRVCHints]>; -def : ProcessorModel<"generic-rv64", NoSchedModel, [Feature64Bit]>; +def : ProcessorModel<"generic-rv64", NoSchedModel, [Feature64Bit, + FeatureRVCHints]>; //===----------------------------------------------------------------------===// // Define the RISC-V target. diff --git a/lib/Target/RISCV/RISCVCallLowering.cpp b/lib/Target/RISCV/RISCVCallLowering.cpp new file mode 100644 index 000000000000..c63a84739c4a --- /dev/null +++ b/lib/Target/RISCV/RISCVCallLowering.cpp @@ -0,0 +1,50 @@ +//===-- RISCVCallLowering.cpp - Call lowering -------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// \file +/// This file implements the lowering of LLVM calls to machine code calls for +/// GlobalISel. 
+// +//===----------------------------------------------------------------------===// + +#include "RISCVCallLowering.h" +#include "RISCVISelLowering.h" +#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" + +using namespace llvm; + +RISCVCallLowering::RISCVCallLowering(const RISCVTargetLowering &TLI) + : CallLowering(&TLI) {} + +bool RISCVCallLowering::lowerReturn(MachineIRBuilder &MIRBuilder, + const Value *Val, + ArrayRef VRegs) const { + + MachineInstrBuilder Ret = MIRBuilder.buildInstrNoInsert(RISCV::PseudoRET); + + if (Val != nullptr) { + return false; + } + MIRBuilder.insertInstr(Ret); + return true; +} + +bool RISCVCallLowering::lowerFormalArguments( + MachineIRBuilder &MIRBuilder, const Function &F, + ArrayRef> VRegs) const { + + if (F.arg_empty()) + return true; + + return false; +} + +bool RISCVCallLowering::lowerCall(MachineIRBuilder &MIRBuilder, + CallLoweringInfo &Info) const { + return false; +} diff --git a/lib/Target/RISCV/RISCVCallLowering.h b/lib/Target/RISCV/RISCVCallLowering.h new file mode 100644 index 000000000000..7ce074a61f0a --- /dev/null +++ b/lib/Target/RISCV/RISCVCallLowering.h @@ -0,0 +1,42 @@ +//===-- RISCVCallLowering.h - Call lowering ---------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// \file +/// This file describes how to lower LLVM calls to machine code calls. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_RISCV_RISCVCALLLOWERING_H +#define LLVM_LIB_TARGET_RISCV_RISCVCALLLOWERING_H + +#include "llvm/CodeGen/CallingConvLower.h" +#include "llvm/CodeGen/GlobalISel/CallLowering.h" +#include "llvm/CodeGen/ValueTypes.h" + +namespace llvm { + +class RISCVTargetLowering; + +class RISCVCallLowering : public CallLowering { + +public: + RISCVCallLowering(const RISCVTargetLowering &TLI); + + bool lowerReturn(MachineIRBuilder &MIRBuiler, const Value *Val, + ArrayRef VRegs) const override; + + bool lowerFormalArguments(MachineIRBuilder &MIRBuilder, const Function &F, + ArrayRef> VRegs) const override; + + bool lowerCall(MachineIRBuilder &MIRBuilder, + CallLoweringInfo &Info) const override; +}; + +} // end namespace llvm + +#endif // LLVM_LIB_TARGET_RISCV_RISCVCALLLOWERING_H diff --git a/lib/Target/RISCV/RISCVCallingConv.td b/lib/Target/RISCV/RISCVCallingConv.td index db13e6e8beca..025454f8fcca 100644 --- a/lib/Target/RISCV/RISCVCallingConv.td +++ b/lib/Target/RISCV/RISCVCallingConv.td @@ -18,11 +18,11 @@ def CSR_ILP32_LP64 def CSR_ILP32F_LP64F : CalleeSavedRegs<(add CSR_ILP32_LP64, - F8_32, F9_32, (sequence "F%u_32", 18, 27))>; + F8_F, F9_F, (sequence "F%u_F", 18, 27))>; def CSR_ILP32D_LP64D : CalleeSavedRegs<(add CSR_ILP32_LP64, - F8_64, F9_64, (sequence "F%u_64", 18, 27))>; + F8_D, F9_D, (sequence "F%u_D", 18, 27))>; // Needed for implementation of RISCVRegisterInfo::getNoPreservedMask() def CSR_NoRegs : CalleeSavedRegs<(add)>; @@ -43,12 +43,12 @@ def CSR_XLEN_F32_Interrupt: CalleeSavedRegs<(add X1, (sequence "X%u", 12, 17), (sequence "X%u", 18, 27), (sequence "X%u", 28, 31), - (sequence "F%u_32", 0, 7), - (sequence "F%u_32", 10, 11), - (sequence "F%u_32", 12, 17), - (sequence "F%u_32", 28, 31), - (sequence "F%u_32", 8, 9), - (sequence "F%u_32", 18, 27))>; + (sequence "F%u_F", 0, 7), + (sequence "F%u_F", 10, 11), + 
(sequence "F%u_F", 12, 17), + (sequence "F%u_F", 28, 31), + (sequence "F%u_F", 8, 9), + (sequence "F%u_F", 18, 27))>; // Same as CSR_Interrupt, but including all 64-bit FP registers. def CSR_XLEN_F64_Interrupt: CalleeSavedRegs<(add X1, @@ -57,9 +57,9 @@ def CSR_XLEN_F64_Interrupt: CalleeSavedRegs<(add X1, (sequence "X%u", 12, 17), (sequence "X%u", 18, 27), (sequence "X%u", 28, 31), - (sequence "F%u_64", 0, 7), - (sequence "F%u_64", 10, 11), - (sequence "F%u_64", 12, 17), - (sequence "F%u_64", 28, 31), - (sequence "F%u_64", 8, 9), - (sequence "F%u_64", 18, 27))>; + (sequence "F%u_D", 0, 7), + (sequence "F%u_D", 10, 11), + (sequence "F%u_D", 12, 17), + (sequence "F%u_D", 28, 31), + (sequence "F%u_D", 8, 9), + (sequence "F%u_D", 18, 27))>; diff --git a/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp b/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp index 1c5171a7b7a4..da5cd16e750c 100644 --- a/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp +++ b/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp @@ -235,10 +235,10 @@ static void doAtomicBinOpExpansion(const RISCVInstrInfo *TII, MachineInstr &MI, MachineBasicBlock *LoopMBB, MachineBasicBlock *DoneMBB, AtomicRMWInst::BinOp BinOp, int Width) { - unsigned DestReg = MI.getOperand(0).getReg(); - unsigned ScratchReg = MI.getOperand(1).getReg(); - unsigned AddrReg = MI.getOperand(2).getReg(); - unsigned IncrReg = MI.getOperand(3).getReg(); + Register DestReg = MI.getOperand(0).getReg(); + Register ScratchReg = MI.getOperand(1).getReg(); + Register AddrReg = MI.getOperand(2).getReg(); + Register IncrReg = MI.getOperand(3).getReg(); AtomicOrdering Ordering = static_cast(MI.getOperand(4).getImm()); @@ -271,9 +271,9 @@ static void doAtomicBinOpExpansion(const RISCVInstrInfo *TII, MachineInstr &MI, } static void insertMaskedMerge(const RISCVInstrInfo *TII, DebugLoc DL, - MachineBasicBlock *MBB, unsigned DestReg, - unsigned OldValReg, unsigned NewValReg, - unsigned MaskReg, unsigned ScratchReg) { + MachineBasicBlock *MBB, Register DestReg, + Register OldValReg, Register NewValReg, + Register MaskReg, Register ScratchReg) { assert(OldValReg != ScratchReg && "OldValReg and ScratchReg must be unique"); assert(OldValReg != MaskReg && "OldValReg and MaskReg must be unique"); assert(ScratchReg != MaskReg && "ScratchReg and MaskReg must be unique"); @@ -297,11 +297,11 @@ static void doMaskedAtomicBinOpExpansion( MachineBasicBlock *ThisMBB, MachineBasicBlock *LoopMBB, MachineBasicBlock *DoneMBB, AtomicRMWInst::BinOp BinOp, int Width) { assert(Width == 32 && "Should never need to expand masked 64-bit operations"); - unsigned DestReg = MI.getOperand(0).getReg(); - unsigned ScratchReg = MI.getOperand(1).getReg(); - unsigned AddrReg = MI.getOperand(2).getReg(); - unsigned IncrReg = MI.getOperand(3).getReg(); - unsigned MaskReg = MI.getOperand(4).getReg(); + Register DestReg = MI.getOperand(0).getReg(); + Register ScratchReg = MI.getOperand(1).getReg(); + Register AddrReg = MI.getOperand(2).getReg(); + Register IncrReg = MI.getOperand(3).getReg(); + Register MaskReg = MI.getOperand(4).getReg(); AtomicOrdering Ordering = static_cast(MI.getOperand(5).getImm()); @@ -394,8 +394,8 @@ bool RISCVExpandPseudo::expandAtomicBinOp( } static void insertSext(const RISCVInstrInfo *TII, DebugLoc DL, - MachineBasicBlock *MBB, unsigned ValReg, - unsigned ShamtReg) { + MachineBasicBlock *MBB, Register ValReg, + Register ShamtReg) { BuildMI(MBB, DL, TII->get(RISCV::SLL), ValReg) .addReg(ValReg) .addReg(ShamtReg); @@ -436,12 +436,12 @@ bool RISCVExpandPseudo::expandAtomicMinMaxOp( 
DoneMBB->transferSuccessors(&MBB); MBB.addSuccessor(LoopHeadMBB); - unsigned DestReg = MI.getOperand(0).getReg(); - unsigned Scratch1Reg = MI.getOperand(1).getReg(); - unsigned Scratch2Reg = MI.getOperand(2).getReg(); - unsigned AddrReg = MI.getOperand(3).getReg(); - unsigned IncrReg = MI.getOperand(4).getReg(); - unsigned MaskReg = MI.getOperand(5).getReg(); + Register DestReg = MI.getOperand(0).getReg(); + Register Scratch1Reg = MI.getOperand(1).getReg(); + Register Scratch2Reg = MI.getOperand(2).getReg(); + Register AddrReg = MI.getOperand(3).getReg(); + Register IncrReg = MI.getOperand(4).getReg(); + Register MaskReg = MI.getOperand(5).getReg(); bool IsSigned = BinOp == AtomicRMWInst::Min || BinOp == AtomicRMWInst::Max; AtomicOrdering Ordering = static_cast(MI.getOperand(IsSigned ? 7 : 6).getImm()); @@ -549,11 +549,11 @@ bool RISCVExpandPseudo::expandAtomicCmpXchg( DoneMBB->transferSuccessors(&MBB); MBB.addSuccessor(LoopHeadMBB); - unsigned DestReg = MI.getOperand(0).getReg(); - unsigned ScratchReg = MI.getOperand(1).getReg(); - unsigned AddrReg = MI.getOperand(2).getReg(); - unsigned CmpValReg = MI.getOperand(3).getReg(); - unsigned NewValReg = MI.getOperand(4).getReg(); + Register DestReg = MI.getOperand(0).getReg(); + Register ScratchReg = MI.getOperand(1).getReg(); + Register AddrReg = MI.getOperand(2).getReg(); + Register CmpValReg = MI.getOperand(3).getReg(); + Register NewValReg = MI.getOperand(4).getReg(); AtomicOrdering Ordering = static_cast(MI.getOperand(IsMasked ? 6 : 5).getImm()); @@ -582,7 +582,7 @@ bool RISCVExpandPseudo::expandAtomicCmpXchg( // lr.w dest, (addr) // and scratch, dest, mask // bne scratch, cmpval, done - unsigned MaskReg = MI.getOperand(5).getReg(); + Register MaskReg = MI.getOperand(5).getReg(); BuildMI(LoopHeadMBB, DL, TII->get(getLRForRMW(Ordering, Width)), DestReg) .addReg(AddrReg); BuildMI(LoopHeadMBB, DL, TII->get(RISCV::AND), ScratchReg) @@ -629,7 +629,7 @@ bool RISCVExpandPseudo::expandAuipcInstPair( MachineInstr &MI = *MBBI; DebugLoc DL = MI.getDebugLoc(); - unsigned DestReg = MI.getOperand(0).getReg(); + Register DestReg = MI.getOperand(0).getReg(); const MachineOperand &Symbol = MI.getOperand(1); MachineBasicBlock *NewMBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock()); diff --git a/lib/Target/RISCV/RISCVFrameLowering.cpp b/lib/Target/RISCV/RISCVFrameLowering.cpp index 32c3b9684d2c..6b6f62e18ce9 100644 --- a/lib/Target/RISCV/RISCVFrameLowering.cpp +++ b/lib/Target/RISCV/RISCVFrameLowering.cpp @@ -40,8 +40,16 @@ void RISCVFrameLowering::determineFrameLayout(MachineFunction &MF) const { uint64_t FrameSize = MFI.getStackSize(); // Get the alignment. - uint64_t StackAlign = RI->needsStackRealignment(MF) ? MFI.getMaxAlignment() - : getStackAlignment(); + unsigned StackAlign = getStackAlignment(); + if (RI->needsStackRealignment(MF)) { + unsigned MaxStackAlign = std::max(StackAlign, MFI.getMaxAlignment()); + FrameSize += (MaxStackAlign - StackAlign); + StackAlign = MaxStackAlign; + } + + // Set Max Call Frame Size + uint64_t MaxCallSize = alignTo(MFI.getMaxCallFrameSize(), StackAlign); + MFI.setMaxCallFrameSize(MaxCallSize); // Make sure the frame is aligned. 
FrameSize = alignTo(FrameSize, StackAlign); @@ -52,8 +60,8 @@ void RISCVFrameLowering::determineFrameLayout(MachineFunction &MF) const { void RISCVFrameLowering::adjustReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, - const DebugLoc &DL, unsigned DestReg, - unsigned SrcReg, int64_t Val, + const DebugLoc &DL, Register DestReg, + Register SrcReg, int64_t Val, MachineInstr::MIFlag Flag) const { MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); const RISCVInstrInfo *TII = STI.getInstrInfo(); @@ -66,7 +74,7 @@ void RISCVFrameLowering::adjustReg(MachineBasicBlock &MBB, .addReg(SrcReg) .addImm(Val) .setMIFlag(Flag); - } else if (isInt<32>(Val)) { + } else { unsigned Opc = RISCV::ADD; bool isSub = Val < 0; if (isSub) { @@ -74,22 +82,20 @@ void RISCVFrameLowering::adjustReg(MachineBasicBlock &MBB, Opc = RISCV::SUB; } - unsigned ScratchReg = MRI.createVirtualRegister(&RISCV::GPRRegClass); - TII->movImm32(MBB, MBBI, DL, ScratchReg, Val, Flag); + Register ScratchReg = MRI.createVirtualRegister(&RISCV::GPRRegClass); + TII->movImm(MBB, MBBI, DL, ScratchReg, Val, Flag); BuildMI(MBB, MBBI, DL, TII->get(Opc), DestReg) .addReg(SrcReg) .addReg(ScratchReg, RegState::Kill) .setMIFlag(Flag); - } else { - report_fatal_error("adjustReg cannot yet handle adjustments >32 bits"); } } // Returns the register used to hold the frame pointer. -static unsigned getFPReg(const RISCVSubtarget &STI) { return RISCV::X8; } +static Register getFPReg(const RISCVSubtarget &STI) { return RISCV::X8; } // Returns the register used to hold the stack pointer. -static unsigned getSPReg(const RISCVSubtarget &STI) { return RISCV::X2; } +static Register getSPReg(const RISCVSubtarget &STI) { return RISCV::X2; } void RISCVFrameLowering::emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const { @@ -101,8 +107,14 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF, const RISCVInstrInfo *TII = STI.getInstrInfo(); MachineBasicBlock::iterator MBBI = MBB.begin(); - unsigned FPReg = getFPReg(STI); - unsigned SPReg = getSPReg(STI); + if (RI->needsStackRealignment(MF) && MFI.hasVarSizedObjects()) { + report_fatal_error( + "RISC-V backend can't currently handle functions that need stack " + "realignment and have variable sized objects"); + } + + Register FPReg = getFPReg(STI); + Register SPReg = getSPReg(STI); // Debug location must be unknown since the first debug location is used // to determine the end of the prologue. @@ -119,6 +131,11 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF, if (StackSize == 0 && !MFI.adjustsStack()) return; + uint64_t FirstSPAdjustAmount = getFirstSPAdjustAmount(MF); + // Split the SP adjustment to reduce the offsets of callee saved spill. + if (FirstSPAdjustAmount) + StackSize = FirstSPAdjustAmount; + // Allocate space on the stack if necessary. adjustReg(MBB, MBBI, DL, SPReg, SPReg, -StackSize, MachineInstr::FrameSetup); @@ -141,7 +158,7 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF, // directives. 
for (const auto &Entry : CSI) { int64_t Offset = MFI.getObjectOffset(Entry.getFrameIdx()); - unsigned Reg = Entry.getReg(); + Register Reg = Entry.getReg(); unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createOffset( nullptr, RI->getDwarfRegNum(Reg, true), Offset)); BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) @@ -159,6 +176,45 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF, BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) .addCFIIndex(CFIIndex); } + + // Emit the second SP adjustment after saving callee saved registers. + if (FirstSPAdjustAmount) { + uint64_t SecondSPAdjustAmount = MFI.getStackSize() - FirstSPAdjustAmount; + assert(SecondSPAdjustAmount > 0 && + "SecondSPAdjustAmount should be greater than zero"); + adjustReg(MBB, MBBI, DL, SPReg, SPReg, -SecondSPAdjustAmount, + MachineInstr::FrameSetup); + // Emit ".cfi_def_cfa_offset StackSize" + unsigned CFIIndex = MF.addFrameInst( + MCCFIInstruction::createDefCfaOffset(nullptr, -MFI.getStackSize())); + BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex); + } + + if (hasFP(MF)) { + // Realign Stack + const RISCVRegisterInfo *RI = STI.getRegisterInfo(); + if (RI->needsStackRealignment(MF)) { + unsigned MaxAlignment = MFI.getMaxAlignment(); + + const RISCVInstrInfo *TII = STI.getInstrInfo(); + if (isInt<12>(-(int)MaxAlignment)) { + BuildMI(MBB, MBBI, DL, TII->get(RISCV::ANDI), SPReg) + .addReg(SPReg) + .addImm(-(int)MaxAlignment); + } else { + unsigned ShiftAmount = countTrailingZeros(MaxAlignment); + Register VR = + MF.getRegInfo().createVirtualRegister(&RISCV::GPRRegClass); + BuildMI(MBB, MBBI, DL, TII->get(RISCV::SRLI), VR) + .addReg(SPReg) + .addImm(ShiftAmount); + BuildMI(MBB, MBBI, DL, TII->get(RISCV::SLLI), SPReg) + .addReg(VR) + .addImm(ShiftAmount); + } + } + } } void RISCVFrameLowering::emitEpilogue(MachineFunction &MF, @@ -169,8 +225,8 @@ void RISCVFrameLowering::emitEpilogue(MachineFunction &MF, auto *RVFI = MF.getInfo(); DebugLoc DL = MBBI->getDebugLoc(); const RISCVInstrInfo *TII = STI.getInstrInfo(); - unsigned FPReg = getFPReg(STI); - unsigned SPReg = getSPReg(STI); + Register FPReg = getFPReg(STI); + Register SPReg = getSPReg(STI); // Skip to before the restores of callee-saved registers // FIXME: assumes exactly one instruction is used to restore each @@ -189,11 +245,29 @@ void RISCVFrameLowering::emitEpilogue(MachineFunction &MF, MachineInstr::FrameDestroy); } + uint64_t FirstSPAdjustAmount = getFirstSPAdjustAmount(MF); + if (FirstSPAdjustAmount) { + uint64_t SecondSPAdjustAmount = MFI.getStackSize() - FirstSPAdjustAmount; + assert(SecondSPAdjustAmount > 0 && + "SecondSPAdjustAmount should be greater than zero"); + + adjustReg(MBB, LastFrameDestroy, DL, SPReg, SPReg, SecondSPAdjustAmount, + MachineInstr::FrameDestroy); + + // Emit ".cfi_def_cfa_offset FirstSPAdjustAmount" + unsigned CFIIndex = + MF.addFrameInst( + MCCFIInstruction::createDefCfaOffset(nullptr, + -FirstSPAdjustAmount)); + BuildMI(MBB, LastFrameDestroy, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex); + } + if (hasFP(MF)) { // To find the instruction restoring FP from stack. for (auto &I = LastFrameDestroy; I != MBBI; ++I) { if (I->mayLoad() && I->getOperand(0).isReg()) { - unsigned DestReg = I->getOperand(0).getReg(); + Register DestReg = I->getOperand(0).getReg(); if (DestReg == FPReg) { // If there is frame pointer, after restoring $fp registers, we // need adjust CFA to ($sp - FPOffset). 
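The prologue/epilogue changes above split one large SP adjustment into two steps so that every callee-save store keeps a 12-bit offset, matching the add/sw listing given later for getFirstSPAdjustAmount. A minimal standalone sketch of that arithmetic, assuming a 2096-byte frame and the 16-byte RISC-V stack alignment; the names are illustrative, not LLVM's:

// Plain C++ model of the two-step SP adjustment; not LLVM code.
#include <cassert>
#include <cstdint>
#include <cstdio>

// Mirrors llvm::isInt<12>: fits in the signed 12-bit immediate of ADDI/SW/SD.
static bool fitsSImm12(int64_t V) { return V >= -2048 && V <= 2047; }

int main() {
  const int64_t StackAlign = 16;   // RV32/RV64 ABI stack alignment
  const int64_t StackSize  = 2096; // example frame too large for one ADDI

  // First step: 2048 - StackAlign = 2032, so the first ADDI encodes directly
  // and every callee-save slot offset below it still fits in 12 bits.
  int64_t First  = fitsSImm12(StackSize) ? StackSize : 2048 - StackAlign;
  int64_t Second = StackSize - First;

  assert(fitsSImm12(-First) && "first adjustment must encode as one ADDI");
  printf("addi sp, sp, -%lld\n", (long long)First);       // prologue step 1
  printf("sd   ra, %lld(sp)\n", (long long)(First - 8));  // offset < 2048
  if (Second)
    printf("addi sp, sp, -%lld\n", (long long)Second);    // after CSR saves
  return 0;
}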
@@ -214,13 +288,16 @@ void RISCVFrameLowering::emitEpilogue(MachineFunction &MF, // Iterate over list of callee-saved registers and emit .cfi_restore // directives. for (const auto &Entry : CSI) { - unsigned Reg = Entry.getReg(); + Register Reg = Entry.getReg(); unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createRestore( nullptr, RI->getDwarfRegNum(Reg, true))); BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) .addCFIIndex(CFIIndex); } + if (FirstSPAdjustAmount) + StackSize = FirstSPAdjustAmount; + // Deallocate stack adjustReg(MBB, MBBI, DL, SPReg, SPReg, StackSize, MachineInstr::FrameDestroy); @@ -249,6 +326,8 @@ int RISCVFrameLowering::getFrameIndexReference(const MachineFunction &MF, int Offset = MFI.getObjectOffset(FI) - getOffsetOfLocalArea() + MFI.getOffsetAdjustment(); + uint64_t FirstSPAdjustAmount = getFirstSPAdjustAmount(MF); + if (CSI.size()) { MinCSFI = CSI[0].getFrameIdx(); MaxCSFI = CSI[CSI.size() - 1].getFrameIdx(); @@ -256,6 +335,17 @@ int RISCVFrameLowering::getFrameIndexReference(const MachineFunction &MF, if (FI >= MinCSFI && FI <= MaxCSFI) { FrameReg = RISCV::X2; + + if (FirstSPAdjustAmount) + Offset += FirstSPAdjustAmount; + else + Offset += MF.getFrameInfo().getStackSize(); + } else if (RI->needsStackRealignment(MF)) { + assert(!MFI.hasVarSizedObjects() && + "Unexpected combination of stack realignment and varsized objects"); + // If the stack was realigned, the frame pointer is set in order to allow + // SP to be restored, but we still access stack objects using SP. + FrameReg = RISCV::X2; Offset += MF.getFrameInfo().getStackSize(); } else { FrameReg = RI->getFrameRegister(MF); @@ -338,7 +428,7 @@ bool RISCVFrameLowering::hasReservedCallFrame(const MachineFunction &MF) const { MachineBasicBlock::iterator RISCVFrameLowering::eliminateCallFramePseudoInstr( MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI) const { - unsigned SPReg = RISCV::X2; + Register SPReg = RISCV::X2; DebugLoc DL = MI->getDebugLoc(); if (!hasReservedCallFrame(MF)) { @@ -362,3 +452,39 @@ MachineBasicBlock::iterator RISCVFrameLowering::eliminateCallFramePseudoInstr( return MBB.erase(MI); } + +// We would like to split the SP adjustment to reduce prologue/epilogue +// as following instructions. In this way, the offset of the callee saved +// register could fit in a single store. +// add sp,sp,-2032 +// sw ra,2028(sp) +// sw s0,2024(sp) +// sw s1,2020(sp) +// sw s3,2012(sp) +// sw s4,2008(sp) +// add sp,sp,-64 +uint64_t +RISCVFrameLowering::getFirstSPAdjustAmount(const MachineFunction &MF) const { + const MachineFrameInfo &MFI = MF.getFrameInfo(); + const std::vector &CSI = MFI.getCalleeSavedInfo(); + uint64_t StackSize = MFI.getStackSize(); + uint64_t StackAlign = getStackAlignment(); + + // FIXME: Disable SplitSPAdjust if save-restore libcall enabled when the patch + // landing. The callee saved registers will be pushed by the + // save-restore libcalls, so we don't have to split the SP adjustment + // in this case. + // + // Return the FirstSPAdjustAmount if the StackSize can not fit in signed + // 12-bit and there exists a callee saved register need to be pushed. + if (!isInt<12>(StackSize) && (CSI.size() > 0)) { + // FirstSPAdjustAmount is choosed as (2048 - StackAlign) + // because 2048 will cause sp = sp + 2048 in epilogue split into + // multi-instructions. The offset smaller than 2048 can fit in signle + // load/store instruction and we have to stick with the stack alignment. + // 2048 is 16-byte alignment. 
The stack alignment for RV32 and RV64 is 16, + // for RV32E is 4. So (2048 - StackAlign) will satisfy the stack alignment. + return 2048 - StackAlign; + } + return 0; +} diff --git a/lib/Target/RISCV/RISCVFrameLowering.h b/lib/Target/RISCV/RISCVFrameLowering.h index 0e045c3ff853..f4a5949773d9 100644 --- a/lib/Target/RISCV/RISCVFrameLowering.h +++ b/lib/Target/RISCV/RISCVFrameLowering.h @@ -22,7 +22,7 @@ class RISCVFrameLowering : public TargetFrameLowering { public: explicit RISCVFrameLowering(const RISCVSubtarget &STI) : TargetFrameLowering(StackGrowsDown, - /*StackAlignment=*/16, + /*StackAlignment=*/Align(16), /*LocalAreaOffset=*/0), STI(STI) {} @@ -45,13 +45,18 @@ public: eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI) const override; + // Get the first stack adjustment amount for SplitSPAdjust. + // Return 0 if we don't want to to split the SP adjustment in prologue and + // epilogue. + uint64_t getFirstSPAdjustAmount(const MachineFunction &MF) const; + protected: const RISCVSubtarget &STI; private: void determineFrameLayout(MachineFunction &MF) const; void adjustReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, - const DebugLoc &DL, unsigned DestReg, unsigned SrcReg, + const DebugLoc &DL, Register DestReg, Register SrcReg, int64_t Val, MachineInstr::MIFlag Flag) const; }; } diff --git a/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/lib/Target/RISCV/RISCVISelDAGToDAG.cpp index d0a3af375a6d..1a12d9177d2a 100644 --- a/lib/Target/RISCV/RISCVISelDAGToDAG.cpp +++ b/lib/Target/RISCV/RISCVISelDAGToDAG.cpp @@ -68,7 +68,7 @@ static SDNode *selectImm(SelectionDAG *CurDAG, const SDLoc &DL, int64_t Imm, RISCVMatInt::InstSeq Seq; RISCVMatInt::generateInstSeq(Imm, XLenVT == MVT::i64, Seq); - SDNode *Result; + SDNode *Result = nullptr; SDValue SrcReg = CurDAG->getRegister(RISCV::X0, XLenVT); for (RISCVMatInt::Inst &Inst : Seq) { SDValue SDImm = CurDAG->getTargetConstant(Inst.Imm, DL, XLenVT); @@ -179,6 +179,9 @@ bool RISCVDAGToDAGISel::SelectInlineAsmMemoryOperand( // operand and need no special handling. OutOps.push_back(Op); return false; + case InlineAsm::Constraint_A: + OutOps.push_back(Op); + return false; default: break; } diff --git a/lib/Target/RISCV/RISCVISelLowering.cpp b/lib/Target/RISCV/RISCVISelLowering.cpp index ce7b85911ab6..dc829fce9013 100644 --- a/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/lib/Target/RISCV/RISCVISelLowering.cpp @@ -100,6 +100,8 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand); if (Subtarget.is64Bit()) { + setOperationAction(ISD::ADD, MVT::i32, Custom); + setOperationAction(ISD::SUB, MVT::i32, Custom); setOperationAction(ISD::SHL, MVT::i32, Custom); setOperationAction(ISD::SRA, MVT::i32, Custom); setOperationAction(ISD::SRL, MVT::i32, Custom); @@ -116,6 +118,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, } if (Subtarget.is64Bit() && Subtarget.hasStdExtM()) { + setOperationAction(ISD::MUL, MVT::i32, Custom); setOperationAction(ISD::SDIV, MVT::i32, Custom); setOperationAction(ISD::UDIV, MVT::i32, Custom); setOperationAction(ISD::UREM, MVT::i32, Custom); @@ -194,8 +197,8 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, setBooleanContents(ZeroOrOneBooleanContent); - // Function alignments (log2). - unsigned FunctionAlignment = Subtarget.hasStdExtC() ? 1 : 2; + // Function alignments. + const Align FunctionAlignment(Subtarget.hasStdExtC() ? 
2 : 4); setMinFunctionAlignment(FunctionAlignment); setPrefFunctionAlignment(FunctionAlignment); @@ -231,7 +234,7 @@ bool RISCVTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.memVT = MVT::getVT(PtrTy->getElementType()); Info.ptrVal = I.getArgOperand(0); Info.offset = 0; - Info.align = 4; + Info.align = Align(4); Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore | MachineMemOperand::MOVolatile; return true; @@ -660,7 +663,7 @@ SDValue RISCVTargetLowering::lowerFRAMEADDR(SDValue Op, MachineFunction &MF = DAG.getMachineFunction(); MachineFrameInfo &MFI = MF.getFrameInfo(); MFI.setFrameAddressIsTaken(true); - unsigned FrameReg = RI.getFrameRegister(MF); + Register FrameReg = RI.getFrameRegister(MF); int XLenInBytes = Subtarget.getXLen() / 8; EVT VT = Op.getValueType(); @@ -703,7 +706,7 @@ SDValue RISCVTargetLowering::lowerRETURNADDR(SDValue Op, // Return the value of the return address register, marking it an implicit // live-in. - unsigned Reg = MF.addLiveIn(RI.getRARegister(), getRegClassFor(XLenVT)); + Register Reg = MF.addLiveIn(RI.getRARegister(), getRegClassFor(XLenVT)); return DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, XLenVT); } @@ -834,6 +837,18 @@ static SDValue customLegalizeToWOp(SDNode *N, SelectionDAG &DAG) { return DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, NewRes); } +// Converts the given 32-bit operation to a i64 operation with signed extension +// semantic to reduce the signed extension instructions. +static SDValue customLegalizeToWOpWithSExt(SDNode *N, SelectionDAG &DAG) { + SDLoc DL(N); + SDValue NewOp0 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, N->getOperand(0)); + SDValue NewOp1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, N->getOperand(1)); + SDValue NewWOp = DAG.getNode(N->getOpcode(), DL, MVT::i64, NewOp0, NewOp1); + SDValue NewRes = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i64, NewWOp, + DAG.getValueType(MVT::i32)); + return DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, NewRes); +} + void RISCVTargetLowering::ReplaceNodeResults(SDNode *N, SmallVectorImpl &Results, SelectionDAG &DAG) const { @@ -854,6 +869,15 @@ void RISCVTargetLowering::ReplaceNodeResults(SDNode *N, Results.push_back(RCW.getValue(2)); break; } + case ISD::ADD: + case ISD::SUB: + case ISD::MUL: + assert(N->getValueType(0) == MVT::i32 && Subtarget.is64Bit() && + "Unexpected custom legalisation"); + if (N->getOperand(1).getOpcode() == ISD::Constant) + return; + Results.push_back(customLegalizeToWOpWithSExt(N, DAG)); + break; case ISD::SHL: case ISD::SRA: case ISD::SRL: @@ -1007,12 +1031,14 @@ bool RISCVTargetLowering::isDesirableToCommuteWithShift( // We can materialise `c1 << c2` into an add immediate, so it's "free", // and the combine should happen, to potentially allow further combines // later. - if (isLegalAddImmediate(ShiftedC1Int.getSExtValue())) + if (ShiftedC1Int.getMinSignedBits() <= 64 && + isLegalAddImmediate(ShiftedC1Int.getSExtValue())) return true; // We can materialise `c1` in an add immediate, so it's "free", and the // combine should be prevented. 
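Stepping back to the RV64 custom legalisation added above (ISD::ADD/SUB/MUL on i32 via customLegalizeToWOpWithSExt): the key fact is that the *W instructions compute a 32-bit result and sign-extend it into the 64-bit register, so the inputs can be any-extended for free. A plain C++ sketch of that invariant, with no LLVM types involved:

// ADDW-style semantics: only the low 32 bits of the inputs matter, and the
// 32-bit result is sign-extended to 64 bits.
#include <cassert>
#include <cstdint>
#include <cstring>

static int64_t sextFromWord(uint64_t X) {   // SIGN_EXTEND_INREG from i32
  uint32_t Lo = (uint32_t)X;
  int32_t W;
  std::memcpy(&W, &Lo, sizeof(W));
  return W;
}

static int64_t addw(uint64_t A, uint64_t B) { return sextFromWord(A + B); }

int main() {
  uint64_t A1 = 0x000000007fffffffULL;  // same low word...
  uint64_t A2 = 0xdeadbeef7fffffffULL;  // ...garbage high word
  assert(addw(A1, 1) == addw(A2, 1));   // high bits never leak into the result
  assert(addw(A1, 1) == -2147483648LL); // 0x80000000 sign-extended to i64
  return 0;
}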
- if (isLegalAddImmediate(C1Int.getSExtValue())) + if (C1Int.getMinSignedBits() <= 64 && + isLegalAddImmediate(C1Int.getSExtValue())) return false; // Neither constant will fit into an immediate, so find materialisation @@ -1052,8 +1078,8 @@ unsigned RISCVTargetLowering::ComputeNumSignBitsForTargetNode( return 1; } -MachineBasicBlock *emitReadCycleWidePseudo(MachineInstr &MI, - MachineBasicBlock *BB) { +static MachineBasicBlock *emitReadCycleWidePseudo(MachineInstr &MI, + MachineBasicBlock *BB) { assert(MI.getOpcode() == RISCV::ReadCycleWide && "Unexpected instruction"); // To read the 64-bit cycle CSR on a 32-bit target, we read the two halves. @@ -1085,9 +1111,9 @@ MachineBasicBlock *emitReadCycleWidePseudo(MachineInstr &MI, BB->addSuccessor(LoopMBB); MachineRegisterInfo &RegInfo = MF.getRegInfo(); - unsigned ReadAgainReg = RegInfo.createVirtualRegister(&RISCV::GPRRegClass); - unsigned LoReg = MI.getOperand(0).getReg(); - unsigned HiReg = MI.getOperand(1).getReg(); + Register ReadAgainReg = RegInfo.createVirtualRegister(&RISCV::GPRRegClass); + Register LoReg = MI.getOperand(0).getReg(); + Register HiReg = MI.getOperand(1).getReg(); DebugLoc DL = MI.getDebugLoc(); const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo(); @@ -1122,9 +1148,9 @@ static MachineBasicBlock *emitSplitF64Pseudo(MachineInstr &MI, DebugLoc DL = MI.getDebugLoc(); const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); const TargetRegisterInfo *RI = MF.getSubtarget().getRegisterInfo(); - unsigned LoReg = MI.getOperand(0).getReg(); - unsigned HiReg = MI.getOperand(1).getReg(); - unsigned SrcReg = MI.getOperand(2).getReg(); + Register LoReg = MI.getOperand(0).getReg(); + Register HiReg = MI.getOperand(1).getReg(); + Register SrcReg = MI.getOperand(2).getReg(); const TargetRegisterClass *SrcRC = &RISCV::FPR64RegClass; int FI = MF.getInfo()->getMoveF64FrameIndex(); @@ -1154,9 +1180,9 @@ static MachineBasicBlock *emitBuildPairF64Pseudo(MachineInstr &MI, DebugLoc DL = MI.getDebugLoc(); const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); const TargetRegisterInfo *RI = MF.getSubtarget().getRegisterInfo(); - unsigned DstReg = MI.getOperand(0).getReg(); - unsigned LoReg = MI.getOperand(1).getReg(); - unsigned HiReg = MI.getOperand(2).getReg(); + Register DstReg = MI.getOperand(0).getReg(); + Register LoReg = MI.getOperand(1).getReg(); + Register HiReg = MI.getOperand(2).getReg(); const TargetRegisterClass *DstRC = &RISCV::FPR64RegClass; int FI = MF.getInfo()->getMoveF64FrameIndex(); @@ -1215,12 +1241,12 @@ static MachineBasicBlock *emitSelectPseudo(MachineInstr &MI, // previous selects in the sequence. // These conditions could be further relaxed. See the X86 target for a // related approach and more information. 
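The ReadCycleWide expansion above uses the classic way to read a 64-bit counter from two 32-bit CSRs on RV32: read cycleh, read cycle, re-read cycleh, and branch back if the high half changed in between. A self-contained C++ model of that loop; the accessors stand in for the csrr reads and are not real intrinsics, and the seeded counter value is chosen to force one retry:

#include <cstdint>
#include <cstdio>

// Simulated 64-bit cycle counter; incremented on each low-half read so the
// retry path is exercised.
static uint64_t FakeCycles = 0x00000001ffffffffULL;
static uint32_t readCycleHi() { return (uint32_t)(FakeCycles >> 32); }
static uint32_t readCycleLo() { ++FakeCycles; return (uint32_t)FakeCycles; }

static uint64_t readCycle64() {
  uint32_t Hi, Lo, HiAgain;
  do {
    Hi = readCycleHi();       // first read of the upper half
    Lo = readCycleLo();       // lower half (a carry may happen here)
    HiAgain = readCycleHi();  // second read of the upper half
  } while (Hi != HiAgain);    // branch back (BNE) if the high half changed
  return ((uint64_t)Hi << 32) | Lo;
}

int main() {
  printf("cycle = 0x%016llx\n", (unsigned long long)readCycle64());
  return 0;
}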
- unsigned LHS = MI.getOperand(1).getReg(); - unsigned RHS = MI.getOperand(2).getReg(); + Register LHS = MI.getOperand(1).getReg(); + Register RHS = MI.getOperand(2).getReg(); auto CC = static_cast(MI.getOperand(3).getImm()); SmallVector SelectDebugValues; - SmallSet SelectDests; + SmallSet SelectDests; SelectDests.insert(MI.getOperand(0).getReg()); MachineInstr *LastSelectPseudo = &MI; @@ -1363,12 +1389,12 @@ static const MCPhysReg ArgGPRs[] = { RISCV::X14, RISCV::X15, RISCV::X16, RISCV::X17 }; static const MCPhysReg ArgFPR32s[] = { - RISCV::F10_32, RISCV::F11_32, RISCV::F12_32, RISCV::F13_32, - RISCV::F14_32, RISCV::F15_32, RISCV::F16_32, RISCV::F17_32 + RISCV::F10_F, RISCV::F11_F, RISCV::F12_F, RISCV::F13_F, + RISCV::F14_F, RISCV::F15_F, RISCV::F16_F, RISCV::F17_F }; static const MCPhysReg ArgFPR64s[] = { - RISCV::F10_64, RISCV::F11_64, RISCV::F12_64, RISCV::F13_64, - RISCV::F14_64, RISCV::F15_64, RISCV::F16_64, RISCV::F17_64 + RISCV::F10_D, RISCV::F11_D, RISCV::F12_D, RISCV::F13_D, + RISCV::F14_D, RISCV::F15_D, RISCV::F16_D, RISCV::F17_D }; // Pass a 2*XLEN argument that has been split into two XLEN values through @@ -1378,7 +1404,7 @@ static bool CC_RISCVAssign2XLen(unsigned XLen, CCState &State, CCValAssign VA1, MVT ValVT2, MVT LocVT2, ISD::ArgFlagsTy ArgFlags2) { unsigned XLenInBytes = XLen / 8; - if (unsigned Reg = State.AllocateReg(ArgGPRs)) { + if (Register Reg = State.AllocateReg(ArgGPRs)) { // At least one half can be passed via register. State.addLoc(CCValAssign::getReg(VA1.getValNo(), VA1.getValVT(), Reg, VA1.getLocVT(), CCValAssign::Full)); @@ -1395,7 +1421,7 @@ static bool CC_RISCVAssign2XLen(unsigned XLen, CCState &State, CCValAssign VA1, return false; } - if (unsigned Reg = State.AllocateReg(ArgGPRs)) { + if (Register Reg = State.AllocateReg(ArgGPRs)) { // The second half can also be passed via register. State.addLoc( CCValAssign::getReg(ValNo2, ValVT2, Reg, LocVT2, CCValAssign::Full)); @@ -1495,7 +1521,7 @@ static bool CC_RISCV(const DataLayout &DL, RISCVABI::ABI ABI, unsigned ValNo, // GPRs, split between a GPR and the stack, or passed completely on the // stack. LowerCall/LowerFormalArguments/LowerReturn must recognise these // cases. - unsigned Reg = State.AllocateReg(ArgGPRs); + Register Reg = State.AllocateReg(ArgGPRs); LocVT = MVT::i32; if (!Reg) { unsigned StackOffset = State.AllocateStack(8, 8); @@ -1537,7 +1563,7 @@ static bool CC_RISCV(const DataLayout &DL, RISCVABI::ABI ABI, unsigned ValNo, } // Allocate to a register if possible, or else a stack slot. 
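CC_RISCVAssign2XLen above handles a 2*XLEN value (an f64 or i64 on RV32) that has already been split into two XLEN halves: the first half goes in a GPR, the second in the next GPR or on the stack. A plain C++ sketch of the split and reassembly itself, which is what SplitF64/BuildPairF64 model at the DAG level; the function names here are descriptive, not LLVM's:

#include <cassert>
#include <cstdint>
#include <cstring>

static void splitF64(double D, uint32_t &Lo, uint32_t &Hi) {
  uint64_t Bits;
  std::memcpy(&Bits, &D, sizeof(Bits));
  Lo = (uint32_t)Bits;          // first half: goes in a GPR
  Hi = (uint32_t)(Bits >> 32);  // second half: next GPR, or the stack when the
                                // first half landed in a7 (the X17 case
                                // handled later in LowerCall)
}

static double buildPairF64(uint32_t Lo, uint32_t Hi) {
  uint64_t Bits = ((uint64_t)Hi << 32) | Lo;
  double D;
  std::memcpy(&D, &Bits, sizeof(D));
  return D;
}

int main() {
  uint32_t Lo, Hi;
  splitF64(3.141592653589793, Lo, Hi);
  assert(buildPairF64(Lo, Hi) == 3.141592653589793);
  return 0;
}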
- unsigned Reg; + Register Reg; if (ValVT == MVT::f32 && !UseGPRForF32) Reg = State.AllocateReg(ArgFPR32s, ArgFPR64s); else if (ValVT == MVT::f64 && !UseGPRForF64) @@ -1673,7 +1699,7 @@ static SDValue unpackFromRegLoc(SelectionDAG &DAG, SDValue Chain, break; } - unsigned VReg = RegInfo.createVirtualRegister(RC); + Register VReg = RegInfo.createVirtualRegister(RC); RegInfo.addLiveIn(VA.getLocReg(), VReg); Val = DAG.getCopyFromReg(Chain, DL, VReg, LocVT); @@ -1751,7 +1777,7 @@ static SDValue unpackF64OnRV32DSoftABI(SelectionDAG &DAG, SDValue Chain, assert(VA.isRegLoc() && "Expected register VA assignment"); - unsigned LoVReg = RegInfo.createVirtualRegister(&RISCV::GPRRegClass); + Register LoVReg = RegInfo.createVirtualRegister(&RISCV::GPRRegClass); RegInfo.addLiveIn(VA.getLocReg(), LoVReg); SDValue Lo = DAG.getCopyFromReg(Chain, DL, LoVReg, MVT::i32); SDValue Hi; @@ -1763,13 +1789,70 @@ static SDValue unpackF64OnRV32DSoftABI(SelectionDAG &DAG, SDValue Chain, MachinePointerInfo::getFixedStack(MF, FI)); } else { // Second half of f64 is passed in another GPR. - unsigned HiVReg = RegInfo.createVirtualRegister(&RISCV::GPRRegClass); + Register HiVReg = RegInfo.createVirtualRegister(&RISCV::GPRRegClass); RegInfo.addLiveIn(VA.getLocReg() + 1, HiVReg); Hi = DAG.getCopyFromReg(Chain, DL, HiVReg, MVT::i32); } return DAG.getNode(RISCVISD::BuildPairF64, DL, MVT::f64, Lo, Hi); } +// FastCC has less than 1% performance improvement for some particular +// benchmark. But theoretically, it may has benenfit for some cases. +static bool CC_RISCV_FastCC(unsigned ValNo, MVT ValVT, MVT LocVT, + CCValAssign::LocInfo LocInfo, + ISD::ArgFlagsTy ArgFlags, CCState &State) { + + if (LocVT == MVT::i32 || LocVT == MVT::i64) { + // X5 and X6 might be used for save-restore libcall. + static const MCPhysReg GPRList[] = { + RISCV::X10, RISCV::X11, RISCV::X12, RISCV::X13, RISCV::X14, + RISCV::X15, RISCV::X16, RISCV::X17, RISCV::X7, RISCV::X28, + RISCV::X29, RISCV::X30, RISCV::X31}; + if (unsigned Reg = State.AllocateReg(GPRList)) { + State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo)); + return false; + } + } + + if (LocVT == MVT::f32) { + static const MCPhysReg FPR32List[] = { + RISCV::F10_F, RISCV::F11_F, RISCV::F12_F, RISCV::F13_F, RISCV::F14_F, + RISCV::F15_F, RISCV::F16_F, RISCV::F17_F, RISCV::F0_F, RISCV::F1_F, + RISCV::F2_F, RISCV::F3_F, RISCV::F4_F, RISCV::F5_F, RISCV::F6_F, + RISCV::F7_F, RISCV::F28_F, RISCV::F29_F, RISCV::F30_F, RISCV::F31_F}; + if (unsigned Reg = State.AllocateReg(FPR32List)) { + State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo)); + return false; + } + } + + if (LocVT == MVT::f64) { + static const MCPhysReg FPR64List[] = { + RISCV::F10_D, RISCV::F11_D, RISCV::F12_D, RISCV::F13_D, RISCV::F14_D, + RISCV::F15_D, RISCV::F16_D, RISCV::F17_D, RISCV::F0_D, RISCV::F1_D, + RISCV::F2_D, RISCV::F3_D, RISCV::F4_D, RISCV::F5_D, RISCV::F6_D, + RISCV::F7_D, RISCV::F28_D, RISCV::F29_D, RISCV::F30_D, RISCV::F31_D}; + if (unsigned Reg = State.AllocateReg(FPR64List)) { + State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo)); + return false; + } + } + + if (LocVT == MVT::i32 || LocVT == MVT::f32) { + unsigned Offset4 = State.AllocateStack(4, 4); + State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset4, LocVT, LocInfo)); + return false; + } + + if (LocVT == MVT::i64 || LocVT == MVT::f64) { + unsigned Offset5 = State.AllocateStack(8, 8); + State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset5, LocVT, LocInfo)); + return false; + } + + return true; // CC didn't match. 
+} + // Transform physical registers into virtual registers. SDValue RISCVTargetLowering::LowerFormalArguments( SDValue Chain, CallingConv::ID CallConv, bool IsVarArg, @@ -1809,7 +1892,11 @@ SDValue RISCVTargetLowering::LowerFormalArguments( // Assign locations to all of the incoming arguments. SmallVector ArgLocs; CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext()); - analyzeInputArgs(MF, CCInfo, Ins, /*IsRet=*/false); + + if (CallConv == CallingConv::Fast) + CCInfo.AnalyzeFormalArguments(Ins, CC_RISCV_FastCC); + else + analyzeInputArgs(MF, CCInfo, Ins, /*IsRet=*/false); for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { CCValAssign &VA = ArgLocs[i]; @@ -1877,8 +1964,7 @@ SDValue RISCVTargetLowering::LowerFormalArguments( // ensure that the frame pointer is 2*XLEN-aligned, which in turn ensures // offsets to even-numbered registered remain 2*XLEN-aligned. if (Idx % 2) { - FI = MFI.CreateFixedObject(XLenInBytes, VaArgOffset - (int)XLenInBytes, - true); + MFI.CreateFixedObject(XLenInBytes, VaArgOffset - (int)XLenInBytes, true); VarArgsSaveSize += XLenInBytes; } @@ -1886,7 +1972,7 @@ SDValue RISCVTargetLowering::LowerFormalArguments( // to the vararg save area. for (unsigned I = Idx; I < ArgRegs.size(); ++I, VaArgOffset += XLenInBytes) { - const unsigned Reg = RegInfo.createVirtualRegister(RC); + const Register Reg = RegInfo.createVirtualRegister(RC); RegInfo.addLiveIn(ArgRegs[I], Reg); SDValue ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, XLenVT); FI = MFI.CreateFixedObject(XLenInBytes, VaArgOffset, true); @@ -1920,7 +2006,6 @@ bool RISCVTargetLowering::isEligibleForTailCallOptimization( auto &Callee = CLI.Callee; auto CalleeCC = CLI.CallConv; - auto IsVarArg = CLI.IsVarArg; auto &Outs = CLI.Outs; auto &Caller = MF.getFunction(); auto CallerCC = Caller.getCallingConv(); @@ -1937,10 +2022,6 @@ bool RISCVTargetLowering::isEligibleForTailCallOptimization( if (Caller.hasFnAttribute("interrupt")) return false; - // Do not tail call opt functions with varargs. - if (IsVarArg) - return false; - // Do not tail call opt if the stack is used to pass parameters. if (CCInfo.getNextStackOffset() != 0) return false; @@ -2015,7 +2096,11 @@ SDValue RISCVTargetLowering::LowerCall(CallLoweringInfo &CLI, // Analyze the operands of the call, assigning locations to each operand. SmallVector ArgLocs; CCState ArgCCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext()); - analyzeOutputArgs(MF, ArgCCInfo, Outs, /*IsRet=*/false, &CLI); + + if (CallConv == CallingConv::Fast) + ArgCCInfo.AnalyzeCallOperands(Outs, CC_RISCV_FastCC); + else + analyzeOutputArgs(MF, ArgCCInfo, Outs, /*IsRet=*/false, &CLI); // Check if it's really possible to do a tail call. if (IsTailCall) @@ -2057,7 +2142,7 @@ SDValue RISCVTargetLowering::LowerCall(CallLoweringInfo &CLI, Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, CLI.DL); // Copy argument values to their designated locations. 
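As an aside on the fast calling convention added above: CC_RISCV_FastCC, selected when CallConv == CallingConv::Fast, simply walks a fixed register list per value type and falls back to aligned stack slots, with X5/X6 left out because they may be needed by the save-restore libcalls. A standalone sketch of that allocation loop for integer arguments; the register names are the ABI aliases of X10-X17, X7, X28-X31, and the argument count is made up:

#include <cstdio>

int main() {
  // Same GPR order as the fastcc list above: a0-a7, then t2, then t3-t6.
  const char *GPRList[] = {"a0", "a1", "a2", "a3", "a4", "a5", "a6",
                           "a7", "t2", "t3", "t4", "t5", "t6"};
  const unsigned NumGPRs = sizeof(GPRList) / sizeof(GPRList[0]);
  const unsigned NumArgs = 16;  // more arguments than registers
  const unsigned SlotSize = 8;  // XLEN / 8 on RV64

  unsigned NextReg = 0, StackOffset = 0;
  for (unsigned I = 0; I < NumArgs; ++I) {
    if (NextReg < NumGPRs) {
      printf("arg%-2u -> %s\n", I, GPRList[NextReg++]);
    } else {
      printf("arg%-2u -> %u(sp)\n", I, StackOffset); // AllocateStack(8, 8)
      StackOffset += SlotSize;
    }
  }
  return 0;
}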
- SmallVector, 8> RegsToPass; + SmallVector, 8> RegsToPass; SmallVector MemOpChains; SDValue StackPtr; for (unsigned i = 0, j = 0, e = ArgLocs.size(); i != e; ++i) { @@ -2074,7 +2159,7 @@ SDValue RISCVTargetLowering::LowerCall(CallLoweringInfo &CLI, SDValue Lo = SplitF64.getValue(0); SDValue Hi = SplitF64.getValue(1); - unsigned RegLo = VA.getLocReg(); + Register RegLo = VA.getLocReg(); RegsToPass.push_back(std::make_pair(RegLo, Lo)); if (RegLo == RISCV::X17) { @@ -2087,7 +2172,8 @@ SDValue RISCVTargetLowering::LowerCall(CallLoweringInfo &CLI, DAG.getStore(Chain, DL, Hi, StackPtr, MachinePointerInfo())); } else { // Second half of f64 is passed in another GPR. - unsigned RegHigh = RegLo + 1; + assert(RegLo < RISCV::X31 && "Invalid register pair"); + Register RegHigh = RegLo + 1; RegsToPass.push_back(std::make_pair(RegHigh, Hi)); } continue; @@ -2302,8 +2388,9 @@ RISCVTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, DAG.getVTList(MVT::i32, MVT::i32), Val); SDValue Lo = SplitF64.getValue(0); SDValue Hi = SplitF64.getValue(1); - unsigned RegLo = VA.getLocReg(); - unsigned RegHi = RegLo + 1; + Register RegLo = VA.getLocReg(); + assert(RegLo < RISCV::X31 && "Invalid register pair"); + Register RegHi = RegLo + 1; Chain = DAG.getCopyToReg(Chain, DL, RegLo, Lo, Glue); Glue = Chain.getValue(1); RetOps.push_back(DAG.getRegister(RegLo, MVT::i32)); @@ -2397,6 +2484,27 @@ const char *RISCVTargetLowering::getTargetNodeName(unsigned Opcode) const { return nullptr; } +/// getConstraintType - Given a constraint letter, return the type of +/// constraint it is for this target. +RISCVTargetLowering::ConstraintType +RISCVTargetLowering::getConstraintType(StringRef Constraint) const { + if (Constraint.size() == 1) { + switch (Constraint[0]) { + default: + break; + case 'f': + return C_RegisterClass; + case 'I': + case 'J': + case 'K': + return C_Immediate; + case 'A': + return C_Memory; + } + } + return TargetLowering::getConstraintType(Constraint); +} + std::pair RISCVTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, @@ -2407,14 +2515,125 @@ RISCVTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, switch (Constraint[0]) { case 'r': return std::make_pair(0U, &RISCV::GPRRegClass); + case 'f': + if (Subtarget.hasStdExtF() && VT == MVT::f32) + return std::make_pair(0U, &RISCV::FPR32RegClass); + if (Subtarget.hasStdExtD() && VT == MVT::f64) + return std::make_pair(0U, &RISCV::FPR64RegClass); + break; default: break; } } + // Clang will correctly decode the usage of register name aliases into their + // official names. However, other frontends like `rustc` do not. This allows + // users of these frontends to use the ABI names for registers in LLVM-style + // register constraints. 
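The constraint handling added above ('A' mapped to C_Memory and InlineAsm::Constraint_A, together with the ABI-name register table that follows) is what lets user inline assembly address the AMO/LR/SC forms, which take a bare register address with no offset. Roughly, source like the following exercises the 'A' path; this only compiles for a RISC-V target with the A extension, and the wrapper function name is made up:

#include <cstdint>

// Illustrative only: swap *Ptr with NewVal using the 'A' memory constraint
// ("an address held in a general-purpose register").
int32_t atomicSwap(int32_t *Ptr, int32_t NewVal) {
  int32_t Old;
  __asm__ __volatile__("amoswap.w %0, %2, %1"
                       : "=r"(Old), "+A"(*Ptr)   // %1: address in a register
                       : "r"(NewVal)
                       : "memory");
  return Old;
}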
+ Register XRegFromAlias = StringSwitch(Constraint.lower()) + .Case("{zero}", RISCV::X0) + .Case("{ra}", RISCV::X1) + .Case("{sp}", RISCV::X2) + .Case("{gp}", RISCV::X3) + .Case("{tp}", RISCV::X4) + .Case("{t0}", RISCV::X5) + .Case("{t1}", RISCV::X6) + .Case("{t2}", RISCV::X7) + .Cases("{s0}", "{fp}", RISCV::X8) + .Case("{s1}", RISCV::X9) + .Case("{a0}", RISCV::X10) + .Case("{a1}", RISCV::X11) + .Case("{a2}", RISCV::X12) + .Case("{a3}", RISCV::X13) + .Case("{a4}", RISCV::X14) + .Case("{a5}", RISCV::X15) + .Case("{a6}", RISCV::X16) + .Case("{a7}", RISCV::X17) + .Case("{s2}", RISCV::X18) + .Case("{s3}", RISCV::X19) + .Case("{s4}", RISCV::X20) + .Case("{s5}", RISCV::X21) + .Case("{s6}", RISCV::X22) + .Case("{s7}", RISCV::X23) + .Case("{s8}", RISCV::X24) + .Case("{s9}", RISCV::X25) + .Case("{s10}", RISCV::X26) + .Case("{s11}", RISCV::X27) + .Case("{t3}", RISCV::X28) + .Case("{t4}", RISCV::X29) + .Case("{t5}", RISCV::X30) + .Case("{t6}", RISCV::X31) + .Default(RISCV::NoRegister); + if (XRegFromAlias != RISCV::NoRegister) + return std::make_pair(XRegFromAlias, &RISCV::GPRRegClass); + + // Since TargetLowering::getRegForInlineAsmConstraint uses the name of the + // TableGen record rather than the AsmName to choose registers for InlineAsm + // constraints, plus we want to match those names to the widest floating point + // register type available, manually select floating point registers here. + // + // The second case is the ABI name of the register, so that frontends can also + // use the ABI names in register constraint lists. + if (Subtarget.hasStdExtF() || Subtarget.hasStdExtD()) { + std::pair FReg = + StringSwitch>(Constraint.lower()) + .Cases("{f0}", "{ft0}", {RISCV::F0_F, RISCV::F0_D}) + .Cases("{f1}", "{ft1}", {RISCV::F1_F, RISCV::F1_D}) + .Cases("{f2}", "{ft2}", {RISCV::F2_F, RISCV::F2_D}) + .Cases("{f3}", "{ft3}", {RISCV::F3_F, RISCV::F3_D}) + .Cases("{f4}", "{ft4}", {RISCV::F4_F, RISCV::F4_D}) + .Cases("{f5}", "{ft5}", {RISCV::F5_F, RISCV::F5_D}) + .Cases("{f6}", "{ft6}", {RISCV::F6_F, RISCV::F6_D}) + .Cases("{f7}", "{ft7}", {RISCV::F7_F, RISCV::F7_D}) + .Cases("{f8}", "{fs0}", {RISCV::F8_F, RISCV::F8_D}) + .Cases("{f9}", "{fs1}", {RISCV::F9_F, RISCV::F9_D}) + .Cases("{f10}", "{fa0}", {RISCV::F10_F, RISCV::F10_D}) + .Cases("{f11}", "{fa1}", {RISCV::F11_F, RISCV::F11_D}) + .Cases("{f12}", "{fa2}", {RISCV::F12_F, RISCV::F12_D}) + .Cases("{f13}", "{fa3}", {RISCV::F13_F, RISCV::F13_D}) + .Cases("{f14}", "{fa4}", {RISCV::F14_F, RISCV::F14_D}) + .Cases("{f15}", "{fa5}", {RISCV::F15_F, RISCV::F15_D}) + .Cases("{f16}", "{fa6}", {RISCV::F16_F, RISCV::F16_D}) + .Cases("{f17}", "{fa7}", {RISCV::F17_F, RISCV::F17_D}) + .Cases("{f18}", "{fs2}", {RISCV::F18_F, RISCV::F18_D}) + .Cases("{f19}", "{fs3}", {RISCV::F19_F, RISCV::F19_D}) + .Cases("{f20}", "{fs4}", {RISCV::F20_F, RISCV::F20_D}) + .Cases("{f21}", "{fs5}", {RISCV::F21_F, RISCV::F21_D}) + .Cases("{f22}", "{fs6}", {RISCV::F22_F, RISCV::F22_D}) + .Cases("{f23}", "{fs7}", {RISCV::F23_F, RISCV::F23_D}) + .Cases("{f24}", "{fs8}", {RISCV::F24_F, RISCV::F24_D}) + .Cases("{f25}", "{fs9}", {RISCV::F25_F, RISCV::F25_D}) + .Cases("{f26}", "{fs10}", {RISCV::F26_F, RISCV::F26_D}) + .Cases("{f27}", "{fs11}", {RISCV::F27_F, RISCV::F27_D}) + .Cases("{f28}", "{ft8}", {RISCV::F28_F, RISCV::F28_D}) + .Cases("{f29}", "{ft9}", {RISCV::F29_F, RISCV::F29_D}) + .Cases("{f30}", "{ft10}", {RISCV::F30_F, RISCV::F30_D}) + .Cases("{f31}", "{ft11}", {RISCV::F31_F, RISCV::F31_D}) + .Default({RISCV::NoRegister, RISCV::NoRegister}); + if (FReg.first != RISCV::NoRegister) + 
return Subtarget.hasStdExtD() + ? std::make_pair(FReg.second, &RISCV::FPR64RegClass) + : std::make_pair(FReg.first, &RISCV::FPR32RegClass); + } + return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); } +unsigned +RISCVTargetLowering::getInlineAsmMemConstraint(StringRef ConstraintCode) const { + // Currently only support length 1 constraints. + if (ConstraintCode.size() == 1) { + switch (ConstraintCode[0]) { + case 'A': + return InlineAsm::Constraint_A; + default: + break; + } + } + + return TargetLowering::getInlineAsmMemConstraint(ConstraintCode); +} + void RISCVTargetLowering::LowerAsmOperandForConstraint( SDValue Op, std::string &Constraint, std::vector &Ops, SelectionDAG &DAG) const { @@ -2619,3 +2838,13 @@ unsigned RISCVTargetLowering::getExceptionSelectorRegister( const Constant *PersonalityFn) const { return RISCV::X11; } + +bool RISCVTargetLowering::shouldExtendTypeInLibCall(EVT Type) const { + // Return false to suppress the unnecessary extensions if the LibCall + // arguments or return value is f32 type for LP64 ABI. + RISCVABI::ABI ABI = Subtarget.getTargetABI(); + if (ABI == RISCVABI::ABI_LP64 && (Type == MVT::f32)) + return false; + + return true; +} diff --git a/lib/Target/RISCV/RISCVISelLowering.h b/lib/Target/RISCV/RISCVISelLowering.h index 17db03bbb69e..18fc7350bbbf 100644 --- a/lib/Target/RISCV/RISCVISelLowering.h +++ b/lib/Target/RISCV/RISCVISelLowering.h @@ -92,6 +92,10 @@ public: // This method returns the name of a target specific DAG node. const char *getTargetNodeName(unsigned Opcode) const override; + ConstraintType getConstraintType(StringRef Constraint) const override; + + unsigned getInlineAsmMemConstraint(StringRef ConstraintCode) const override; + std::pair getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override; @@ -141,6 +145,8 @@ public: unsigned getExceptionSelectorRegister(const Constant *PersonalityFn) const override; + bool shouldExtendTypeInLibCall(EVT Type) const override; + private: void analyzeInputArgs(MachineFunction &MF, CCState &CCInfo, const SmallVectorImpl &Ins, diff --git a/lib/Target/RISCV/RISCVInstrInfo.cpp b/lib/Target/RISCV/RISCVInstrInfo.cpp index 99c8d2ef73de..084839299530 100644 --- a/lib/Target/RISCV/RISCVInstrInfo.cpp +++ b/lib/Target/RISCV/RISCVInstrInfo.cpp @@ -14,6 +14,7 @@ #include "RISCV.h" #include "RISCVSubtarget.h" #include "RISCVTargetMachine.h" +#include "Utils/RISCVMatInt.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" #include "llvm/CodeGen/MachineFunctionPass.h" @@ -28,8 +29,9 @@ using namespace llvm; -RISCVInstrInfo::RISCVInstrInfo() - : RISCVGenInstrInfo(RISCV::ADJCALLSTACKDOWN, RISCV::ADJCALLSTACKUP) {} +RISCVInstrInfo::RISCVInstrInfo(RISCVSubtarget &STI) + : RISCVGenInstrInfo(RISCV::ADJCALLSTACKDOWN, RISCV::ADJCALLSTACKUP), + STI(STI) {} unsigned RISCVInstrInfo::isLoadFromStackSlot(const MachineInstr &MI, int &FrameIndex) const { @@ -156,24 +158,43 @@ void RISCVInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, BuildMI(MBB, I, DL, get(Opcode), DstReg).addFrameIndex(FI).addImm(0); } -void RISCVInstrInfo::movImm32(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, - const DebugLoc &DL, unsigned DstReg, uint64_t Val, - MachineInstr::MIFlag Flag) const { - assert(isInt<32>(Val) && "Can only materialize 32-bit constants"); - - // TODO: If the value can be materialized using only one instruction, only - // insert a single instruction. 
- - uint64_t Hi20 = ((Val + 0x800) >> 12) & 0xfffff; - uint64_t Lo12 = SignExtend64<12>(Val); - BuildMI(MBB, MBBI, DL, get(RISCV::LUI), DstReg) - .addImm(Hi20) - .setMIFlag(Flag); - BuildMI(MBB, MBBI, DL, get(RISCV::ADDI), DstReg) - .addReg(DstReg, RegState::Kill) - .addImm(Lo12) - .setMIFlag(Flag); +void RISCVInstrInfo::movImm(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + const DebugLoc &DL, Register DstReg, uint64_t Val, + MachineInstr::MIFlag Flag) const { + MachineFunction *MF = MBB.getParent(); + MachineRegisterInfo &MRI = MF->getRegInfo(); + bool IsRV64 = MF->getSubtarget().is64Bit(); + Register SrcReg = RISCV::X0; + Register Result = MRI.createVirtualRegister(&RISCV::GPRRegClass); + unsigned Num = 0; + + if (!IsRV64 && !isInt<32>(Val)) + report_fatal_error("Should only materialize 32-bit constants for RV32"); + + RISCVMatInt::InstSeq Seq; + RISCVMatInt::generateInstSeq(Val, IsRV64, Seq); + assert(Seq.size() > 0); + + for (RISCVMatInt::Inst &Inst : Seq) { + // Write the final result to DstReg if it's the last instruction in the Seq. + // Otherwise, write the result to the temp register. + if (++Num == Seq.size()) + Result = DstReg; + + if (Inst.Opc == RISCV::LUI) { + BuildMI(MBB, MBBI, DL, get(RISCV::LUI), Result) + .addImm(Inst.Imm) + .setMIFlag(Flag); + } else { + BuildMI(MBB, MBBI, DL, get(Inst.Opc), Result) + .addReg(SrcReg, RegState::Kill) + .addImm(Inst.Imm) + .setMIFlag(Flag); + } + // Only the first instruction has X0 as its source. + SrcReg = Result; + } } // The contents of values added to Cond are not examined outside of @@ -372,7 +393,7 @@ unsigned RISCVInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB, // FIXME: A virtual register must be used initially, as the register // scavenger won't work with empty blocks (SIInstrInfo::insertIndirectBranch // uses the same workaround). 
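The rewritten movImm above delegates to RISCVMatInt::generateInstSeq, which for 32-bit values still boils down to the LUI+ADDI split the deleted code computed by hand: round to the nearest 4 KiB page with +0x800, then add back the signed low 12 bits. A standalone check of that identity in plain C++, with no LLVM headers:

#include <cassert>
#include <cstdint>

// Materialize a 32-bit constant the LUI+ADDI way; unsigned wraparound on the
// uint32_t models the 32-bit register.
static uint32_t materializeLuiAddi(int64_t Val) {
  uint32_t Hi20 = (uint32_t)(((uint64_t)Val + 0x800) >> 12) & 0xfffff;
  int64_t Lo12 = (int64_t)(((uint64_t)Val & 0xfff) ^ 0x800) - 0x800; // sext 12
  uint32_t Reg = Hi20 << 12;             // lui  reg, Hi20
  Reg += (uint32_t)(int32_t)Lo12;        // addi reg, reg, Lo12
  return Reg;
}

int main() {
  const int64_t Cases[] = {0,    1,     -1,    2047,      2048,
                           -2048, -2049, INT32_MAX, INT32_MIN, 0x12345678};
  for (int64_t V : Cases)
    assert(materializeLuiAddi(V) == (uint32_t)V);
  return 0;
}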
- unsigned ScratchReg = MRI.createVirtualRegister(&RISCV::GPRRegClass); + Register ScratchReg = MRI.createVirtualRegister(&RISCV::GPRRegClass); auto II = MBB.end(); MachineInstr &LuiMI = *BuildMI(MBB, II, DL, get(RISCV::LUI), ScratchReg) @@ -466,3 +487,58 @@ bool RISCVInstrInfo::isAsCheapAsAMove(const MachineInstr &MI) const { } return MI.isAsCheapAsAMove(); } + +bool RISCVInstrInfo::verifyInstruction(const MachineInstr &MI, + StringRef &ErrInfo) const { + const MCInstrInfo *MCII = STI.getInstrInfo(); + MCInstrDesc const &Desc = MCII->get(MI.getOpcode()); + + for (auto &OI : enumerate(Desc.operands())) { + unsigned OpType = OI.value().OperandType; + if (OpType >= RISCVOp::OPERAND_FIRST_RISCV_IMM && + OpType <= RISCVOp::OPERAND_LAST_RISCV_IMM) { + const MachineOperand &MO = MI.getOperand(OI.index()); + if (MO.isImm()) { + int64_t Imm = MO.getImm(); + bool Ok; + switch (OpType) { + default: + llvm_unreachable("Unexpected operand type"); + case RISCVOp::OPERAND_UIMM4: + Ok = isUInt<4>(Imm); + break; + case RISCVOp::OPERAND_UIMM5: + Ok = isUInt<5>(Imm); + break; + case RISCVOp::OPERAND_UIMM12: + Ok = isUInt<12>(Imm); + break; + case RISCVOp::OPERAND_SIMM12: + Ok = isInt<12>(Imm); + break; + case RISCVOp::OPERAND_SIMM13_LSB0: + Ok = isShiftedInt<12, 1>(Imm); + break; + case RISCVOp::OPERAND_UIMM20: + Ok = isUInt<20>(Imm); + break; + case RISCVOp::OPERAND_SIMM21_LSB0: + Ok = isShiftedInt<20, 1>(Imm); + break; + case RISCVOp::OPERAND_UIMMLOG2XLEN: + if (STI.getTargetTriple().isArch64Bit()) + Ok = isUInt<6>(Imm); + else + Ok = isUInt<5>(Imm); + break; + } + if (!Ok) { + ErrInfo = "Invalid immediate"; + return false; + } + } + } + } + + return true; +} diff --git a/lib/Target/RISCV/RISCVInstrInfo.h b/lib/Target/RISCV/RISCVInstrInfo.h index ff098e660d19..d3ae04aefe04 100644 --- a/lib/Target/RISCV/RISCVInstrInfo.h +++ b/lib/Target/RISCV/RISCVInstrInfo.h @@ -21,10 +21,12 @@ namespace llvm { +class RISCVSubtarget; + class RISCVInstrInfo : public RISCVGenInstrInfo { public: - RISCVInstrInfo(); + explicit RISCVInstrInfo(RISCVSubtarget &STI); unsigned isLoadFromStackSlot(const MachineInstr &MI, int &FrameIndex) const override; @@ -46,10 +48,10 @@ public: int FrameIndex, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const override; - // Materializes the given int32 Val into DstReg. - void movImm32(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, - const DebugLoc &DL, unsigned DstReg, uint64_t Val, - MachineInstr::MIFlag Flag = MachineInstr::NoFlags) const; + // Materializes the given integer Val into DstReg. 
+ void movImm(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, + const DebugLoc &DL, Register DstReg, uint64_t Val, + MachineInstr::MIFlag Flag = MachineInstr::NoFlags) const; unsigned getInstSizeInBytes(const MachineInstr &MI) const override; @@ -80,6 +82,12 @@ public: int64_t BrOffset) const override; bool isAsCheapAsAMove(const MachineInstr &MI) const override; + + bool verifyInstruction(const MachineInstr &MI, + StringRef &ErrInfo) const override; + +protected: + const RISCVSubtarget &STI; }; } #endif diff --git a/lib/Target/RISCV/RISCVInstrInfo.td b/lib/Target/RISCV/RISCVInstrInfo.td index 69bde15f1218..db2ecc49d14e 100644 --- a/lib/Target/RISCV/RISCVInstrInfo.td +++ b/lib/Target/RISCV/RISCVInstrInfo.td @@ -69,6 +69,12 @@ class ImmAsmOperand : AsmOperandClass { let DiagnosticType = !strconcat("Invalid", Name); } +def ImmZeroAsmOperand : AsmOperandClass { + let Name = "ImmZero"; + let RenderMethod = "addImmOperands"; + let DiagnosticType = !strconcat("Invalid", Name); +} + class SImmAsmOperand : ImmAsmOperand<"S", width, suffix> { } @@ -87,6 +93,8 @@ def fencearg : Operand { let ParserMatchClass = FenceArg; let PrintMethod = "printFenceArg"; let DecoderMethod = "decodeUImmOperand<4>"; + let OperandType = "OPERAND_UIMM4"; + let OperandNamespace = "RISCVOp"; } def UImmLog2XLenAsmOperand : AsmOperandClass { @@ -111,11 +119,15 @@ def uimmlog2xlen : Operand, ImmLeaf(Imm); return isUInt<5>(Imm); }]; + let OperandType = "OPERAND_UIMMLOG2XLEN"; + let OperandNamespace = "RISCVOp"; } def uimm5 : Operand, ImmLeaf(Imm);}]> { let ParserMatchClass = UImmAsmOperand<5>; let DecoderMethod = "decodeUImmOperand<5>"; + let OperandType = "OPERAND_UIMM5"; + let OperandNamespace = "RISCVOp"; } def simm12 : Operand, ImmLeaf(Imm);}]> { @@ -128,6 +140,8 @@ def simm12 : Operand, ImmLeaf(Imm);}]> { return isInt<12>(Imm); return MCOp.isBareSymbolRef(); }]; + let OperandType = "OPERAND_SIMM12"; + let OperandNamespace = "RISCVOp"; } // A 13-bit signed immediate where the least significant bit is zero. @@ -141,6 +155,8 @@ def simm13_lsb0 : Operand { return isShiftedInt<12, 1>(Imm); return MCOp.isBareSymbolRef(); }]; + let OperandType = "OPERAND_SIMM13_LSB0"; + let OperandNamespace = "RISCVOp"; } class UImm20Operand : Operand { @@ -152,6 +168,8 @@ class UImm20Operand : Operand { return isUInt<20>(Imm); return MCOp.isBareSymbolRef(); }]; + let OperandType = "OPERAND_UIMM20"; + let OperandNamespace = "RISCVOp"; } def uimm20_lui : UImm20Operand { @@ -176,6 +194,8 @@ def simm21_lsb0_jal : Operand { return isShiftedInt<20, 1>(Imm); return MCOp.isBareSymbolRef(); }]; + let OperandType = "OPERAND_SIMM21_LSB0"; + let OperandNamespace = "RISCVOp"; } def BareSymbol : AsmOperandClass { @@ -224,6 +244,8 @@ def csr_sysreg : Operand { let ParserMatchClass = CSRSystemRegister; let PrintMethod = "printCSRSystemRegister"; let DecoderMethod = "decodeUImmOperand<12>"; + let OperandType = "OPERAND_UIMM12"; + let OperandNamespace = "RISCVOp"; } // A parameterized register class alternative to i32imm/i64imm from Target.td. diff --git a/lib/Target/RISCV/RISCVInstrInfoA.td b/lib/Target/RISCV/RISCVInstrInfoA.td index b768c9347b38..38ba3f9fb24e 100644 --- a/lib/Target/RISCV/RISCVInstrInfoA.td +++ b/lib/Target/RISCV/RISCVInstrInfoA.td @@ -11,6 +11,24 @@ // //===----------------------------------------------------------------------===// +//===----------------------------------------------------------------------===// +// Operand and SDNode transformation definitions. 
+//===----------------------------------------------------------------------===// + +// A parse method for (${gpr}) or 0(${gpr}), where the 0 is be silently ignored. +// Used for GNU as Compatibility. +def AtomicMemOpOperand : AsmOperandClass { + let Name = "AtomicMemOpOperand"; + let RenderMethod = "addRegOperands"; + let PredicateMethod = "isReg"; + let ParserMethod = "parseAtomicMemOp"; +} + +def GPRMemAtomic : RegisterOperand { + let ParserMatchClass = AtomicMemOpOperand; + let PrintMethod = "printAtomicMemOp"; +} + //===----------------------------------------------------------------------===// // Instruction class templates //===----------------------------------------------------------------------===// @@ -18,8 +36,8 @@ let hasSideEffects = 0, mayLoad = 1, mayStore = 0 in class LR_r funct3, string opcodestr> : RVInstRAtomic<0b00010, aq, rl, funct3, OPC_AMO, - (outs GPR:$rd), (ins GPR:$rs1), - opcodestr, "$rd, (${rs1})"> { + (outs GPR:$rd), (ins GPRMemAtomic:$rs1), + opcodestr, "$rd, $rs1"> { let rs2 = 0; } @@ -33,8 +51,8 @@ multiclass LR_r_aq_rl funct3, string opcodestr> { let hasSideEffects = 0, mayLoad = 1, mayStore = 1 in class AMO_rr funct5, bit aq, bit rl, bits<3> funct3, string opcodestr> : RVInstRAtomic; + (outs GPR:$rd), (ins GPRMemAtomic:$rs1, GPR:$rs2), + opcodestr, "$rd, $rs2, $rs1">; multiclass AMO_rr_aq_rl funct5, bits<3> funct3, string opcodestr> { def "" : AMO_rr; @@ -196,12 +214,12 @@ class PseudoMaskedAMOUMinUMax } class PseudoMaskedAMOPat - : Pat<(intrin GPR:$addr, GPR:$incr, GPR:$mask, imm:$ordering), + : Pat<(intrin GPR:$addr, GPR:$incr, GPR:$mask, timm:$ordering), (AMOInst GPR:$addr, GPR:$incr, GPR:$mask, imm:$ordering)>; class PseudoMaskedAMOMinMaxPat : Pat<(intrin GPR:$addr, GPR:$incr, GPR:$mask, GPR:$shiftamt, - imm:$ordering), + timm:$ordering), (AMOInst GPR:$addr, GPR:$incr, GPR:$mask, GPR:$shiftamt, imm:$ordering)>; @@ -270,7 +288,7 @@ def PseudoMaskedCmpXchg32 } def : Pat<(int_riscv_masked_cmpxchg_i32 - GPR:$addr, GPR:$cmpval, GPR:$newval, GPR:$mask, imm:$ordering), + GPR:$addr, GPR:$cmpval, GPR:$newval, GPR:$mask, timm:$ordering), (PseudoMaskedCmpXchg32 GPR:$addr, GPR:$cmpval, GPR:$newval, GPR:$mask, imm:$ordering)>; @@ -347,7 +365,7 @@ def PseudoCmpXchg64 : PseudoCmpXchg; defm : PseudoCmpXchgPat<"atomic_cmp_swap_64", PseudoCmpXchg64>; def : Pat<(int_riscv_masked_cmpxchg_i64 - GPR:$addr, GPR:$cmpval, GPR:$newval, GPR:$mask, imm:$ordering), + GPR:$addr, GPR:$cmpval, GPR:$newval, GPR:$mask, timm:$ordering), (PseudoMaskedCmpXchg32 GPR:$addr, GPR:$cmpval, GPR:$newval, GPR:$mask, imm:$ordering)>; } // Predicates = [HasStdExtA, IsRV64] diff --git a/lib/Target/RISCV/RISCVInstrInfoC.td b/lib/Target/RISCV/RISCVInstrInfoC.td index 94477341eea7..fa0050f107b2 100644 --- a/lib/Target/RISCV/RISCVInstrInfoC.td +++ b/lib/Target/RISCV/RISCVInstrInfoC.td @@ -61,6 +61,11 @@ def simm6nonzero : Operand, }]; } +def immzero : Operand, + ImmLeaf { + let ParserMatchClass = ImmZeroAsmOperand; +} + def CLUIImmAsmOperand : AsmOperandClass { let Name = "CLUIImm"; let RenderMethod = "addImmOperands"; @@ -132,7 +137,8 @@ def uimm8_lsb000 : Operand, } // A 9-bit signed immediate where the least significant bit is zero. -def simm9_lsb0 : Operand { +def simm9_lsb0 : Operand, + ImmLeaf(Imm);}]> { let ParserMatchClass = SImmAsmOperand<9, "Lsb0">; let EncoderMethod = "getImmOpValueAsr1"; let DecoderMethod = "decodeSImmOperandAndLsl1<9>"; @@ -191,7 +197,8 @@ def simm10_lsb0000nonzero : Operand, } // A 12-bit signed immediate where the least significant bit is zero. 
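The compressed branch and jump operands above and below (simm9_lsb0, simm12_lsb0, and simm13_lsb0 earlier) share one idea: the offset's least significant bit is always zero, so the predicate is an isShiftedInt-with-shift-1 style check and the encoder (getImmOpValueAsr1) stores the offset shifted right by one. A standalone sketch of the range check; the widths follow from the operand names (9- and 12-bit signed, LSB zero) rather than being copied from LLVM:

#include <cassert>
#include <cstdint>

// Mirrors an isShiftedInt<N, 1>-style check: V is even and fits in N+1
// signed bits, i.e. V is in [-2^N, 2^N - 2].
static bool isEvenSignedImm(unsigned N, int64_t V) {
  return (V & 1) == 0 && V >= -(INT64_C(1) << N) && V <= (INT64_C(1) << N) - 2;
}

int main() {
  // simm9_lsb0 (c.beqz/c.bnez): 9 bits with LSB zero, i.e. [-256, 254].
  assert(isEvenSignedImm(8, -256) && isEvenSignedImm(8, 254));
  assert(!isEvenSignedImm(8, 255) && !isEvenSignedImm(8, 256));

  // simm12_lsb0 (c.j/c.jal): 12 bits with LSB zero, i.e. [-2048, 2046].
  assert(isEvenSignedImm(11, -2048) && isEvenSignedImm(11, 2046));
  assert(!isEvenSignedImm(11, 2047) && !isEvenSignedImm(11, 2048));

  // The instruction encoding then stores the valid offset shifted right by 1.
  return 0;
}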
-def simm12_lsb0 : Operand { +def simm12_lsb0 : Operand, + ImmLeaf(Imm);}]> { let ParserMatchClass = SImmAsmOperand<12, "Lsb0">; let EncoderMethod = "getImmOpValueAsr1"; let DecoderMethod = "decodeSImmOperandAndLsl1<12>"; @@ -344,7 +351,10 @@ def C_SD : CStore_rri<0b111, "c.sd", GPRC, uimm8_lsb000> { } let rd = 0, imm = 0, hasSideEffects = 0, mayLoad = 0, mayStore = 0 in -def C_NOP : RVInst16CI<0b000, 0b01, (outs), (ins), "c.nop", "">; +def C_NOP : RVInst16CI<0b000, 0b01, (outs), (ins), "c.nop", ""> +{ + let Inst{6-2} = 0; +} let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in def C_ADDI : RVInst16CI<0b000, 0b01, (outs GPRNoX0:$rd_wb), @@ -354,6 +364,15 @@ def C_ADDI : RVInst16CI<0b000, 0b01, (outs GPRNoX0:$rd_wb), let Inst{6-2} = imm{4-0}; } +let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in +def C_ADDI_NOP : RVInst16CI<0b000, 0b01, (outs GPRX0:$rd_wb), + (ins GPRX0:$rd, immzero:$imm), + "c.addi", "$rd, $imm"> { + let Constraints = "$rd = $rd_wb"; + let Inst{6-2} = 0; + let isAsmParserOnly = 1; +} + let hasSideEffects = 0, mayLoad = 0, mayStore = 0, isCall = 1, DecoderNamespace = "RISCV32Only_", Defs = [X1], Predicates = [HasStdExtC, IsRV32] in @@ -522,6 +541,105 @@ def C_UNIMP : RVInst16<(outs), (ins), "c.unimp", "", [], InstFormatOther> { } // Predicates = [HasStdExtC] +//===----------------------------------------------------------------------===// +// HINT Instructions +//===----------------------------------------------------------------------===// + +let Predicates = [HasStdExtC, HasRVCHints], hasSideEffects = 0, mayLoad = 0, + mayStore = 0 in +{ + +let rd = 0 in +def C_NOP_HINT : RVInst16CI<0b000, 0b01, (outs), (ins simm6nonzero:$imm), + "c.nop", "$imm"> { + let Inst{6-2} = imm{4-0}; + let DecoderMethod = "decodeRVCInstrSImm"; +} + +// Just a different syntax for the c.nop hint: c.addi x0, simm6 vs c.nop simm6. 
+def C_ADDI_HINT_X0 : RVInst16CI<0b000, 0b01, (outs GPRX0:$rd_wb), + (ins GPRX0:$rd, simm6nonzero:$imm), + "c.addi", "$rd, $imm"> { + let Constraints = "$rd = $rd_wb"; + let Inst{6-2} = imm{4-0}; + let isAsmParserOnly = 1; +} + +def C_ADDI_HINT_IMM_ZERO : RVInst16CI<0b000, 0b01, (outs GPRNoX0:$rd_wb), + (ins GPRNoX0:$rd, immzero:$imm), + "c.addi", "$rd, $imm"> { + let Constraints = "$rd = $rd_wb"; + let Inst{6-2} = 0; + let isAsmParserOnly = 1; +} + +def C_LI_HINT : RVInst16CI<0b010, 0b01, (outs GPRX0:$rd), (ins simm6:$imm), + "c.li", "$rd, $imm"> { + let Inst{6-2} = imm{4-0}; + let Inst{11-7} = 0; + let DecoderMethod = "decodeRVCInstrRdSImm"; +} + +def C_LUI_HINT : RVInst16CI<0b011, 0b01, (outs GPRX0:$rd), + (ins c_lui_imm:$imm), + "c.lui", "$rd, $imm"> { + let Inst{6-2} = imm{4-0}; + let Inst{11-7} = 0; + let DecoderMethod = "decodeRVCInstrRdSImm"; +} + +def C_MV_HINT : RVInst16CR<0b1000, 0b10, (outs GPRX0:$rs1), (ins GPRNoX0:$rs2), + "c.mv", "$rs1, $rs2"> +{ + let Inst{11-7} = 0; + let DecoderMethod = "decodeRVCInstrRdRs2"; +} + +def C_ADD_HINT : RVInst16CR<0b1001, 0b10, (outs GPRX0:$rs1_wb), + (ins GPRX0:$rs1, GPRNoX0:$rs2), + "c.add", "$rs1, $rs2"> { + let Constraints = "$rs1 = $rs1_wb"; + let Inst{11-7} = 0; + let DecoderMethod = "decodeRVCInstrRdRs1Rs2"; +} + +def C_SLLI_HINT : RVInst16CI<0b000, 0b10, (outs GPRX0:$rd_wb), + (ins GPRX0:$rd, uimmlog2xlennonzero:$imm), + "c.slli" ,"$rd, $imm"> { + let Constraints = "$rd = $rd_wb"; + let Inst{6-2} = imm{4-0}; + let Inst{11-7} = 0; + let DecoderMethod = "decodeRVCInstrRdRs1UImm"; +} + +def C_SLLI64_HINT : RVInst16CI<0b000, 0b10, (outs GPR:$rd_wb), (ins GPR:$rd), + "c.slli64" ,"$rd"> { + let Constraints = "$rd = $rd_wb"; + let Inst{6-2} = 0; + let Inst{12} = 0; +} + +def C_SRLI64_HINT : RVInst16CI<0b100, 0b01, (outs GPRC:$rd_wb), + (ins GPRC:$rd), + "c.srli64", "$rd"> { + let Constraints = "$rd = $rd_wb"; + let Inst{6-2} = 0; + let Inst{11-10} = 0; + let Inst{12} = 0; +} + +def C_SRAI64_HINT : RVInst16CI<0b100, 0b01, (outs GPRC:$rd_wb), + (ins GPRC:$rd), + "c.srai64", "$rd"> { + let Constraints = "$rd = $rd_wb"; + let Inst{6-2} = 0; + let Inst{11-10} = 1; + let Inst{12} = 0; +} + +} // Predicates = [HasStdExtC, HasRVCHints], hasSideEffects = 0, mayLoad = 0, + // mayStore = 0 + //===----------------------------------------------------------------------===// // Assembler Pseudo Instructions //===----------------------------------------------------------------------===// diff --git a/lib/Target/RISCV/RISCVInstrInfoF.td b/lib/Target/RISCV/RISCVInstrInfoF.td index 032642942f2b..3b73c865ea17 100644 --- a/lib/Target/RISCV/RISCVInstrInfoF.td +++ b/lib/Target/RISCV/RISCVInstrInfoF.td @@ -227,6 +227,12 @@ def : InstAlias<"frcsr $rd", (CSRRS GPR:$rd, FCSR.Encoding, X0), 2>; def : InstAlias<"fscsr $rd, $rs", (CSRRW GPR:$rd, FCSR.Encoding, GPR:$rs)>; def : InstAlias<"fscsr $rs", (CSRRW X0, FCSR.Encoding, GPR:$rs), 2>; +// frsr, fssr are obsolete aliases replaced by frcsr, fscsr, so give them +// zero weight. 
+def : InstAlias<"frsr $rd", (CSRRS GPR:$rd, FCSR.Encoding, X0), 0>; +def : InstAlias<"fssr $rd, $rs", (CSRRW GPR:$rd, FCSR.Encoding, GPR:$rs), 0>; +def : InstAlias<"fssr $rs", (CSRRW X0, FCSR.Encoding, GPR:$rs), 0>; + def : InstAlias<"frrm $rd", (CSRRS GPR:$rd, FRM.Encoding, X0), 2>; def : InstAlias<"fsrm $rd, $rs", (CSRRW GPR:$rd, FRM.Encoding, GPR:$rs)>; def : InstAlias<"fsrm $rs", (CSRRW X0, FRM.Encoding, GPR:$rs), 2>; diff --git a/lib/Target/RISCV/RISCVInstructionSelector.cpp b/lib/Target/RISCV/RISCVInstructionSelector.cpp new file mode 100644 index 000000000000..5bd09a546114 --- /dev/null +++ b/lib/Target/RISCV/RISCVInstructionSelector.cpp @@ -0,0 +1,103 @@ +//===-- RISCVInstructionSelector.cpp -----------------------------*- C++ -*-==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// \file +/// This file implements the targeting of the InstructionSelector class for +/// RISCV. +/// \todo This should be generated by TableGen. +//===----------------------------------------------------------------------===// + +#include "RISCVRegisterBankInfo.h" +#include "RISCVSubtarget.h" +#include "RISCVTargetMachine.h" +#include "llvm/CodeGen/GlobalISel/InstructionSelector.h" +#include "llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h" +#include "llvm/Support/Debug.h" + +#define DEBUG_TYPE "riscv-isel" + +using namespace llvm; + +#define GET_GLOBALISEL_PREDICATE_BITSET +#include "RISCVGenGlobalISel.inc" +#undef GET_GLOBALISEL_PREDICATE_BITSET + +namespace { + +class RISCVInstructionSelector : public InstructionSelector { +public: + RISCVInstructionSelector(const RISCVTargetMachine &TM, + const RISCVSubtarget &STI, + const RISCVRegisterBankInfo &RBI); + + bool select(MachineInstr &I) override; + static const char *getName() { return DEBUG_TYPE; } + +private: + bool selectImpl(MachineInstr &I, CodeGenCoverage &CoverageInfo) const; + + const RISCVSubtarget &STI; + const RISCVInstrInfo &TII; + const RISCVRegisterInfo &TRI; + const RISCVRegisterBankInfo &RBI; + + // FIXME: This is necessary because DAGISel uses "Subtarget->" and GlobalISel + // uses "STI." in the code generated by TableGen. We need to unify the name of + // Subtarget variable. + const RISCVSubtarget *Subtarget = &STI; + +#define GET_GLOBALISEL_PREDICATES_DECL +#include "RISCVGenGlobalISel.inc" +#undef GET_GLOBALISEL_PREDICATES_DECL + +#define GET_GLOBALISEL_TEMPORARIES_DECL +#include "RISCVGenGlobalISel.inc" +#undef GET_GLOBALISEL_TEMPORARIES_DECL +}; + +} // end anonymous namespace + +#define GET_GLOBALISEL_IMPL +#include "RISCVGenGlobalISel.inc" +#undef GET_GLOBALISEL_IMPL + +RISCVInstructionSelector::RISCVInstructionSelector( + const RISCVTargetMachine &TM, const RISCVSubtarget &STI, + const RISCVRegisterBankInfo &RBI) + : InstructionSelector(), STI(STI), TII(*STI.getInstrInfo()), + TRI(*STI.getRegisterInfo()), RBI(RBI), + +#define GET_GLOBALISEL_PREDICATES_INIT +#include "RISCVGenGlobalISel.inc" +#undef GET_GLOBALISEL_PREDICATES_INIT +#define GET_GLOBALISEL_TEMPORARIES_INIT +#include "RISCVGenGlobalISel.inc" +#undef GET_GLOBALISEL_TEMPORARIES_INIT +{ +} + +bool RISCVInstructionSelector::select(MachineInstr &I) { + + if (!isPreISelGenericOpcode(I.getOpcode())) { + // Certain non-generic instructions also need some special handling. 
+ return true; + } + + if (selectImpl(I, *CoverageInfo)) + return true; + + return false; +} + +namespace llvm { +InstructionSelector * +createRISCVInstructionSelector(const RISCVTargetMachine &TM, + RISCVSubtarget &Subtarget, + RISCVRegisterBankInfo &RBI) { + return new RISCVInstructionSelector(TM, Subtarget, RBI); +} +} // end namespace llvm diff --git a/lib/Target/RISCV/RISCVLegalizerInfo.cpp b/lib/Target/RISCV/RISCVLegalizerInfo.cpp new file mode 100644 index 000000000000..c92f4a3ee17b --- /dev/null +++ b/lib/Target/RISCV/RISCVLegalizerInfo.cpp @@ -0,0 +1,23 @@ +//===-- RISCVLegalizerInfo.cpp ----------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// \file +/// This file implements the targeting of the Machinelegalizer class for RISCV. +/// \todo This should be generated by TableGen. +//===----------------------------------------------------------------------===// + +#include "RISCVLegalizerInfo.h" +#include "llvm/CodeGen/TargetOpcodes.h" +#include "llvm/CodeGen/ValueTypes.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Type.h" + +using namespace llvm; + +RISCVLegalizerInfo::RISCVLegalizerInfo(const RISCVSubtarget &ST) { + computeTables(); +} diff --git a/lib/Target/RISCV/RISCVLegalizerInfo.h b/lib/Target/RISCV/RISCVLegalizerInfo.h new file mode 100644 index 000000000000..f2c2b9a3fd46 --- /dev/null +++ b/lib/Target/RISCV/RISCVLegalizerInfo.h @@ -0,0 +1,28 @@ +//===-- RISCVLegalizerInfo.h ------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// \file +/// This file declares the targeting of the Machinelegalizer class for RISCV. +/// \todo This should be generated by TableGen. +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_RISCV_RISCVMACHINELEGALIZER_H +#define LLVM_LIB_TARGET_RISCV_RISCVMACHINELEGALIZER_H + +#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h" + +namespace llvm { + +class RISCVSubtarget; + +/// This class provides the information for the target register banks. 
+class RISCVLegalizerInfo : public LegalizerInfo { +public: + RISCVLegalizerInfo(const RISCVSubtarget &ST); +}; +} // end namespace llvm +#endif diff --git a/lib/Target/RISCV/RISCVMergeBaseOffset.cpp b/lib/Target/RISCV/RISCVMergeBaseOffset.cpp index 82b1209cb8e7..4c9013aa1e23 100644 --- a/lib/Target/RISCV/RISCVMergeBaseOffset.cpp +++ b/lib/Target/RISCV/RISCVMergeBaseOffset.cpp @@ -45,7 +45,7 @@ struct RISCVMergeBaseOffsetOpt : public MachineFunctionPass { bool detectAndFoldOffset(MachineInstr &HiLUI, MachineInstr &LoADDI); void foldOffset(MachineInstr &HiLUI, MachineInstr &LoADDI, MachineInstr &Tail, int64_t Offset); - bool matchLargeOffset(MachineInstr &TailAdd, unsigned GSReg, int64_t &Offset); + bool matchLargeOffset(MachineInstr &TailAdd, Register GSReg, int64_t &Offset); RISCVMergeBaseOffsetOpt() : MachineFunctionPass(ID) {} MachineFunctionProperties getRequiredProperties() const override { @@ -85,7 +85,7 @@ bool RISCVMergeBaseOffsetOpt::detectLuiAddiGlobal(MachineInstr &HiLUI, HiLUI.getOperand(1).getOffset() != 0 || !MRI->hasOneUse(HiLUI.getOperand(0).getReg())) return false; - unsigned HiLuiDestReg = HiLUI.getOperand(0).getReg(); + Register HiLuiDestReg = HiLUI.getOperand(0).getReg(); LoADDI = MRI->use_begin(HiLuiDestReg)->getParent(); if (LoADDI->getOpcode() != RISCV::ADDI || LoADDI->getOperand(2).getTargetFlags() != RISCVII::MO_LO || @@ -132,12 +132,12 @@ void RISCVMergeBaseOffsetOpt::foldOffset(MachineInstr &HiLUI, // \ / // TailAdd: add vreg4, vreg2, voff bool RISCVMergeBaseOffsetOpt::matchLargeOffset(MachineInstr &TailAdd, - unsigned GAReg, + Register GAReg, int64_t &Offset) { assert((TailAdd.getOpcode() == RISCV::ADD) && "Expected ADD instruction!"); - unsigned Rs = TailAdd.getOperand(1).getReg(); - unsigned Rt = TailAdd.getOperand(2).getReg(); - unsigned Reg = Rs == GAReg ? Rt : Rs; + Register Rs = TailAdd.getOperand(1).getReg(); + Register Rt = TailAdd.getOperand(2).getReg(); + Register Reg = Rs == GAReg ? Rt : Rs; // Can't fold if the register has more than one use. if (!MRI->hasOneUse(Reg)) @@ -178,7 +178,7 @@ bool RISCVMergeBaseOffsetOpt::matchLargeOffset(MachineInstr &TailAdd, bool RISCVMergeBaseOffsetOpt::detectAndFoldOffset(MachineInstr &HiLUI, MachineInstr &LoADDI) { - unsigned DestReg = LoADDI.getOperand(0).getReg(); + Register DestReg = LoADDI.getOperand(0).getReg(); assert(MRI->hasOneUse(DestReg) && "expected one use for LoADDI"); // LoADDI has only one use. MachineInstr &Tail = *MRI->use_begin(DestReg)->getParent(); @@ -232,7 +232,7 @@ bool RISCVMergeBaseOffsetOpt::detectAndFoldOffset(MachineInstr &HiLUI, return false; // Register defined by LoADDI should be used in the base part of the // load\store instruction. Otherwise, no folding possible. - unsigned BaseAddrReg = Tail.getOperand(1).getReg(); + Register BaseAddrReg = Tail.getOperand(1).getReg(); if (DestReg != BaseAddrReg) return false; MachineOperand &TailImmOp = Tail.getOperand(2); diff --git a/lib/Target/RISCV/RISCVRegisterBankInfo.cpp b/lib/Target/RISCV/RISCVRegisterBankInfo.cpp new file mode 100644 index 000000000000..bd3b95a98b9f --- /dev/null +++ b/lib/Target/RISCV/RISCVRegisterBankInfo.cpp @@ -0,0 +1,26 @@ +//===-- RISCVRegisterBankInfo.cpp -------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// \file +/// This file implements the targeting of the RegisterBankInfo class for RISCV. +/// \todo This should be generated by TableGen. +//===----------------------------------------------------------------------===// + +#include "RISCVRegisterBankInfo.h" +#include "MCTargetDesc/RISCVMCTargetDesc.h" +#include "llvm/CodeGen/GlobalISel/RegisterBank.h" +#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/TargetRegisterInfo.h" + +#define GET_TARGET_REGBANK_IMPL +#include "RISCVGenRegisterBank.inc" + +using namespace llvm; + +RISCVRegisterBankInfo::RISCVRegisterBankInfo(const TargetRegisterInfo &TRI) + : RISCVGenRegisterBankInfo() {} diff --git a/lib/Target/RISCV/RISCVRegisterBankInfo.h b/lib/Target/RISCV/RISCVRegisterBankInfo.h new file mode 100644 index 000000000000..05fac992734d --- /dev/null +++ b/lib/Target/RISCV/RISCVRegisterBankInfo.h @@ -0,0 +1,37 @@ +//===-- RISCVRegisterBankInfo.h ---------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// \file +/// This file declares the targeting of the RegisterBankInfo class for RISCV. +/// \todo This should be generated by TableGen. +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_RISCV_RISCVREGISTERBANKINFO_H +#define LLVM_LIB_TARGET_RISCV_RISCVREGISTERBANKINFO_H + +#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h" + +#define GET_REGBANK_DECLARATIONS +#include "RISCVGenRegisterBank.inc" + +namespace llvm { + +class TargetRegisterInfo; + +class RISCVGenRegisterBankInfo : public RegisterBankInfo { +protected: +#define GET_TARGET_REGBANK_CLASS +#include "RISCVGenRegisterBank.inc" +}; + +/// This class provides the information for the target register banks. +class RISCVRegisterBankInfo final : public RISCVGenRegisterBankInfo { +public: + RISCVRegisterBankInfo(const TargetRegisterInfo &TRI); +}; +} // end namespace llvm +#endif diff --git a/lib/Target/RISCV/RISCVRegisterBanks.td b/lib/Target/RISCV/RISCVRegisterBanks.td new file mode 100644 index 000000000000..400b65a1bf9a --- /dev/null +++ b/lib/Target/RISCV/RISCVRegisterBanks.td @@ -0,0 +1,13 @@ +//=-- RISCVRegisterBank.td - Describe the RISCV Banks --------*- tablegen -*-=// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// +//===----------------------------------------------------------------------===// + +/// General Purpose Registers: X. 
+def GPRRegBank : RegisterBank<"GPRB", [GPR]>; diff --git a/lib/Target/RISCV/RISCVRegisterInfo.cpp b/lib/Target/RISCV/RISCVRegisterInfo.cpp index e6a126e3e513..66557687c0b6 100644 --- a/lib/Target/RISCV/RISCVRegisterInfo.cpp +++ b/lib/Target/RISCV/RISCVRegisterInfo.cpp @@ -26,6 +26,15 @@ using namespace llvm; +static_assert(RISCV::X1 == RISCV::X0 + 1, "Register list not consecutive"); +static_assert(RISCV::X31 == RISCV::X0 + 31, "Register list not consecutive"); +static_assert(RISCV::F1_F == RISCV::F0_F + 1, "Register list not consecutive"); +static_assert(RISCV::F31_F == RISCV::F0_F + 31, + "Register list not consecutive"); +static_assert(RISCV::F1_D == RISCV::F0_D + 1, "Register list not consecutive"); +static_assert(RISCV::F31_D == RISCV::F0_D + 31, + "Register list not consecutive"); + RISCVRegisterInfo::RISCVRegisterInfo(unsigned HwMode) : RISCVGenRegisterInfo(RISCV::X1, /*DwarfFlavour*/0, /*EHFlavor*/0, /*PC*/0, HwMode) {} @@ -109,8 +118,8 @@ void RISCVRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, assert(isInt<32>(Offset) && "Int32 expected"); // The offset won't fit in an immediate, so use a scratch register instead // Modify Offset and FrameReg appropriately - unsigned ScratchReg = MRI.createVirtualRegister(&RISCV::GPRRegClass); - TII->movImm32(MBB, II, DL, ScratchReg, Offset); + Register ScratchReg = MRI.createVirtualRegister(&RISCV::GPRRegClass); + TII->movImm(MBB, II, DL, ScratchReg, Offset); BuildMI(MBB, II, DL, TII->get(RISCV::ADD), ScratchReg) .addReg(FrameReg) .addReg(ScratchReg, RegState::Kill); diff --git a/lib/Target/RISCV/RISCVRegisterInfo.h b/lib/Target/RISCV/RISCVRegisterInfo.h index 4f339475508f..56a50fe6ddc0 100644 --- a/lib/Target/RISCV/RISCVRegisterInfo.h +++ b/lib/Target/RISCV/RISCVRegisterInfo.h @@ -52,6 +52,12 @@ struct RISCVRegisterInfo : public RISCVGenRegisterInfo { bool trackLivenessAfterRegAlloc(const MachineFunction &) const override { return true; } + + const TargetRegisterClass * + getPointerRegClass(const MachineFunction &MF, + unsigned Kind = 0) const override { + return &RISCV::GPRRegClass; + } }; } diff --git a/lib/Target/RISCV/RISCVRegisterInfo.td b/lib/Target/RISCV/RISCVRegisterInfo.td index 79f8ab12f6c0..82b37afd0805 100644 --- a/lib/Target/RISCV/RISCVRegisterInfo.td +++ b/lib/Target/RISCV/RISCVRegisterInfo.td @@ -101,6 +101,12 @@ def GPR : RegisterClass<"RISCV", [XLenVT], 32, (add [RegInfo<32,32,32>, RegInfo<64,64,64>, RegInfo<32,32,32>]>; } +def GPRX0 : RegisterClass<"RISCV", [XLenVT], 32, (add X0)> { + let RegInfos = RegInfoByHwMode< + [RV32, RV64, DefaultMode], + [RegInfo<32,32,32>, RegInfo<64,64,64>, RegInfo<32,32,32>]>; +} + // The order of registers represents the preferred allocation sequence. // Registers are listed in the order caller-save, callee-save, specials. 
def GPRNoX0 : RegisterClass<"RISCV", [XLenVT], 32, (add @@ -159,41 +165,41 @@ def SP : RegisterClass<"RISCV", [XLenVT], 32, (add X2)> { // Floating point registers let RegAltNameIndices = [ABIRegAltName] in { - def F0_32 : RISCVReg32<0, "f0", ["ft0"]>, DwarfRegNum<[32]>; - def F1_32 : RISCVReg32<1, "f1", ["ft1"]>, DwarfRegNum<[33]>; - def F2_32 : RISCVReg32<2, "f2", ["ft2"]>, DwarfRegNum<[34]>; - def F3_32 : RISCVReg32<3, "f3", ["ft3"]>, DwarfRegNum<[35]>; - def F4_32 : RISCVReg32<4, "f4", ["ft4"]>, DwarfRegNum<[36]>; - def F5_32 : RISCVReg32<5, "f5", ["ft5"]>, DwarfRegNum<[37]>; - def F6_32 : RISCVReg32<6, "f6", ["ft6"]>, DwarfRegNum<[38]>; - def F7_32 : RISCVReg32<7, "f7", ["ft7"]>, DwarfRegNum<[39]>; - def F8_32 : RISCVReg32<8, "f8", ["fs0"]>, DwarfRegNum<[40]>; - def F9_32 : RISCVReg32<9, "f9", ["fs1"]>, DwarfRegNum<[41]>; - def F10_32 : RISCVReg32<10,"f10", ["fa0"]>, DwarfRegNum<[42]>; - def F11_32 : RISCVReg32<11,"f11", ["fa1"]>, DwarfRegNum<[43]>; - def F12_32 : RISCVReg32<12,"f12", ["fa2"]>, DwarfRegNum<[44]>; - def F13_32 : RISCVReg32<13,"f13", ["fa3"]>, DwarfRegNum<[45]>; - def F14_32 : RISCVReg32<14,"f14", ["fa4"]>, DwarfRegNum<[46]>; - def F15_32 : RISCVReg32<15,"f15", ["fa5"]>, DwarfRegNum<[47]>; - def F16_32 : RISCVReg32<16,"f16", ["fa6"]>, DwarfRegNum<[48]>; - def F17_32 : RISCVReg32<17,"f17", ["fa7"]>, DwarfRegNum<[49]>; - def F18_32 : RISCVReg32<18,"f18", ["fs2"]>, DwarfRegNum<[50]>; - def F19_32 : RISCVReg32<19,"f19", ["fs3"]>, DwarfRegNum<[51]>; - def F20_32 : RISCVReg32<20,"f20", ["fs4"]>, DwarfRegNum<[52]>; - def F21_32 : RISCVReg32<21,"f21", ["fs5"]>, DwarfRegNum<[53]>; - def F22_32 : RISCVReg32<22,"f22", ["fs6"]>, DwarfRegNum<[54]>; - def F23_32 : RISCVReg32<23,"f23", ["fs7"]>, DwarfRegNum<[55]>; - def F24_32 : RISCVReg32<24,"f24", ["fs8"]>, DwarfRegNum<[56]>; - def F25_32 : RISCVReg32<25,"f25", ["fs9"]>, DwarfRegNum<[57]>; - def F26_32 : RISCVReg32<26,"f26", ["fs10"]>, DwarfRegNum<[58]>; - def F27_32 : RISCVReg32<27,"f27", ["fs11"]>, DwarfRegNum<[59]>; - def F28_32 : RISCVReg32<28,"f28", ["ft8"]>, DwarfRegNum<[60]>; - def F29_32 : RISCVReg32<29,"f29", ["ft9"]>, DwarfRegNum<[61]>; - def F30_32 : RISCVReg32<30,"f30", ["ft10"]>, DwarfRegNum<[62]>; - def F31_32 : RISCVReg32<31,"f31", ["ft11"]>, DwarfRegNum<[63]>; + def F0_F : RISCVReg32<0, "f0", ["ft0"]>, DwarfRegNum<[32]>; + def F1_F : RISCVReg32<1, "f1", ["ft1"]>, DwarfRegNum<[33]>; + def F2_F : RISCVReg32<2, "f2", ["ft2"]>, DwarfRegNum<[34]>; + def F3_F : RISCVReg32<3, "f3", ["ft3"]>, DwarfRegNum<[35]>; + def F4_F : RISCVReg32<4, "f4", ["ft4"]>, DwarfRegNum<[36]>; + def F5_F : RISCVReg32<5, "f5", ["ft5"]>, DwarfRegNum<[37]>; + def F6_F : RISCVReg32<6, "f6", ["ft6"]>, DwarfRegNum<[38]>; + def F7_F : RISCVReg32<7, "f7", ["ft7"]>, DwarfRegNum<[39]>; + def F8_F : RISCVReg32<8, "f8", ["fs0"]>, DwarfRegNum<[40]>; + def F9_F : RISCVReg32<9, "f9", ["fs1"]>, DwarfRegNum<[41]>; + def F10_F : RISCVReg32<10,"f10", ["fa0"]>, DwarfRegNum<[42]>; + def F11_F : RISCVReg32<11,"f11", ["fa1"]>, DwarfRegNum<[43]>; + def F12_F : RISCVReg32<12,"f12", ["fa2"]>, DwarfRegNum<[44]>; + def F13_F : RISCVReg32<13,"f13", ["fa3"]>, DwarfRegNum<[45]>; + def F14_F : RISCVReg32<14,"f14", ["fa4"]>, DwarfRegNum<[46]>; + def F15_F : RISCVReg32<15,"f15", ["fa5"]>, DwarfRegNum<[47]>; + def F16_F : RISCVReg32<16,"f16", ["fa6"]>, DwarfRegNum<[48]>; + def F17_F : RISCVReg32<17,"f17", ["fa7"]>, DwarfRegNum<[49]>; + def F18_F : RISCVReg32<18,"f18", ["fs2"]>, DwarfRegNum<[50]>; + def F19_F : RISCVReg32<19,"f19", ["fs3"]>, DwarfRegNum<[51]>; + def F20_F : 
RISCVReg32<20,"f20", ["fs4"]>, DwarfRegNum<[52]>; + def F21_F : RISCVReg32<21,"f21", ["fs5"]>, DwarfRegNum<[53]>; + def F22_F : RISCVReg32<22,"f22", ["fs6"]>, DwarfRegNum<[54]>; + def F23_F : RISCVReg32<23,"f23", ["fs7"]>, DwarfRegNum<[55]>; + def F24_F : RISCVReg32<24,"f24", ["fs8"]>, DwarfRegNum<[56]>; + def F25_F : RISCVReg32<25,"f25", ["fs9"]>, DwarfRegNum<[57]>; + def F26_F : RISCVReg32<26,"f26", ["fs10"]>, DwarfRegNum<[58]>; + def F27_F : RISCVReg32<27,"f27", ["fs11"]>, DwarfRegNum<[59]>; + def F28_F : RISCVReg32<28,"f28", ["ft8"]>, DwarfRegNum<[60]>; + def F29_F : RISCVReg32<29,"f29", ["ft9"]>, DwarfRegNum<[61]>; + def F30_F : RISCVReg32<30,"f30", ["ft10"]>, DwarfRegNum<[62]>; + def F31_F : RISCVReg32<31,"f31", ["ft11"]>, DwarfRegNum<[63]>; foreach Index = 0-31 in { - def F#Index#_64 : RISCVReg64("F"#Index#"_32")>, + def F#Index#_D : RISCVReg64("F"#Index#"_F")>, DwarfRegNum<[!add(Index, 32)]>; } } @@ -201,29 +207,29 @@ let RegAltNameIndices = [ABIRegAltName] in { // The order of registers represents the preferred allocation sequence, // meaning caller-save regs are listed before callee-save. def FPR32 : RegisterClass<"RISCV", [f32], 32, (add - (sequence "F%u_32", 0, 7), - (sequence "F%u_32", 10, 17), - (sequence "F%u_32", 28, 31), - (sequence "F%u_32", 8, 9), - (sequence "F%u_32", 18, 27) + (sequence "F%u_F", 0, 7), + (sequence "F%u_F", 10, 17), + (sequence "F%u_F", 28, 31), + (sequence "F%u_F", 8, 9), + (sequence "F%u_F", 18, 27) )>; def FPR32C : RegisterClass<"RISCV", [f32], 32, (add - (sequence "F%u_32", 10, 15), - (sequence "F%u_32", 8, 9) + (sequence "F%u_F", 10, 15), + (sequence "F%u_F", 8, 9) )>; // The order of registers represents the preferred allocation sequence, // meaning caller-save regs are listed before callee-save. def FPR64 : RegisterClass<"RISCV", [f64], 64, (add - (sequence "F%u_64", 0, 7), - (sequence "F%u_64", 10, 17), - (sequence "F%u_64", 28, 31), - (sequence "F%u_64", 8, 9), - (sequence "F%u_64", 18, 27) + (sequence "F%u_D", 0, 7), + (sequence "F%u_D", 10, 17), + (sequence "F%u_D", 28, 31), + (sequence "F%u_D", 8, 9), + (sequence "F%u_D", 18, 27) )>; def FPR64C : RegisterClass<"RISCV", [f64], 64, (add - (sequence "F%u_64", 10, 15), - (sequence "F%u_64", 8, 9) + (sequence "F%u_D", 10, 15), + (sequence "F%u_D", 8, 9) )>; diff --git a/lib/Target/RISCV/RISCVSubtarget.cpp b/lib/Target/RISCV/RISCVSubtarget.cpp index 6902ed75d852..f114c6ac1925 100644 --- a/lib/Target/RISCV/RISCVSubtarget.cpp +++ b/lib/Target/RISCV/RISCVSubtarget.cpp @@ -12,7 +12,11 @@ #include "RISCVSubtarget.h" #include "RISCV.h" +#include "RISCVCallLowering.h" #include "RISCVFrameLowering.h" +#include "RISCVLegalizerInfo.h" +#include "RISCVRegisterBankInfo.h" +#include "RISCVTargetMachine.h" #include "llvm/Support/TargetRegistry.h" using namespace llvm; @@ -47,4 +51,28 @@ RISCVSubtarget::RISCVSubtarget(const Triple &TT, StringRef CPU, StringRef FS, StringRef ABIName, const TargetMachine &TM) : RISCVGenSubtargetInfo(TT, CPU, FS), FrameLowering(initializeSubtargetDependencies(TT, CPU, FS, ABIName)), - InstrInfo(), RegInfo(getHwMode()), TLInfo(TM, *this) {} + InstrInfo(*this), RegInfo(getHwMode()), TLInfo(TM, *this) { + CallLoweringInfo.reset(new RISCVCallLowering(*getTargetLowering())); + Legalizer.reset(new RISCVLegalizerInfo(*this)); + + auto *RBI = new RISCVRegisterBankInfo(*getRegisterInfo()); + RegBankInfo.reset(RBI); + InstSelector.reset(createRISCVInstructionSelector( + *static_cast(&TM), *this, *RBI)); +} + +const CallLowering *RISCVSubtarget::getCallLowering() const { + return 
CallLoweringInfo.get(); +} + +InstructionSelector *RISCVSubtarget::getInstructionSelector() const { + return InstSelector.get(); +} + +const LegalizerInfo *RISCVSubtarget::getLegalizerInfo() const { + return Legalizer.get(); +} + +const RegisterBankInfo *RISCVSubtarget::getRegBankInfo() const { + return RegBankInfo.get(); +} diff --git a/lib/Target/RISCV/RISCVSubtarget.h b/lib/Target/RISCV/RISCVSubtarget.h index 106ff49f021a..7d0373a5253a 100644 --- a/lib/Target/RISCV/RISCVSubtarget.h +++ b/lib/Target/RISCV/RISCVSubtarget.h @@ -17,6 +17,10 @@ #include "RISCVISelLowering.h" #include "RISCVInstrInfo.h" #include "Utils/RISCVBaseInfo.h" +#include "llvm/CodeGen/GlobalISel/CallLowering.h" +#include "llvm/CodeGen/GlobalISel/InstructionSelector.h" +#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h" +#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h" #include "llvm/CodeGen/SelectionDAGTargetInfo.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/IR/DataLayout.h" @@ -38,6 +42,7 @@ class RISCVSubtarget : public RISCVGenSubtargetInfo { bool HasRV64 = false; bool IsRV32E = false; bool EnableLinkerRelax = false; + bool EnableRVCHintInstrs = false; unsigned XLen = 32; MVT XLenVT = MVT::i32; RISCVABI::ABI TargetABI = RISCVABI::ABI_Unknown; @@ -75,6 +80,7 @@ public: const SelectionDAGTargetInfo *getSelectionDAGInfo() const override { return &TSInfo; } + bool enableMachineScheduler() const override { return true; } bool hasStdExtM() const { return HasStdExtM; } bool hasStdExtA() const { return HasStdExtA; } bool hasStdExtF() const { return HasStdExtF; } @@ -83,9 +89,23 @@ public: bool is64Bit() const { return HasRV64; } bool isRV32E() const { return IsRV32E; } bool enableLinkerRelax() const { return EnableLinkerRelax; } + bool enableRVCHintInstrs() const { return EnableRVCHintInstrs; } MVT getXLenVT() const { return XLenVT; } unsigned getXLen() const { return XLen; } RISCVABI::ABI getTargetABI() const { return TargetABI; } + +protected: + // GlobalISel related APIs. 
+ std::unique_ptr CallLoweringInfo; + std::unique_ptr InstSelector; + std::unique_ptr Legalizer; + std::unique_ptr RegBankInfo; + +public: + const CallLowering *getCallLowering() const override; + InstructionSelector *getInstructionSelector() const override; + const LegalizerInfo *getLegalizerInfo() const override; + const RegisterBankInfo *getRegBankInfo() const override; }; } // End llvm namespace diff --git a/lib/Target/RISCV/RISCVTargetMachine.cpp b/lib/Target/RISCV/RISCVTargetMachine.cpp index f4e6ed9f6284..5ffc6eda6bd7 100644 --- a/lib/Target/RISCV/RISCVTargetMachine.cpp +++ b/lib/Target/RISCV/RISCVTargetMachine.cpp @@ -17,6 +17,10 @@ #include "TargetInfo/RISCVTargetInfo.h" #include "llvm/ADT/STLExtras.h" #include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/CodeGen/GlobalISel/IRTranslator.h" +#include "llvm/CodeGen/GlobalISel/InstructionSelect.h" +#include "llvm/CodeGen/GlobalISel/Legalizer.h" +#include "llvm/CodeGen/GlobalISel/RegBankSelect.h" #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" #include "llvm/CodeGen/TargetPassConfig.h" @@ -30,6 +34,7 @@ extern "C" void LLVMInitializeRISCVTarget() { RegisterTargetMachine X(getTheRISCV32Target()); RegisterTargetMachine Y(getTheRISCV64Target()); auto PR = PassRegistry::getPassRegistry(); + initializeGlobalISel(*PR); initializeRISCVExpandPseudoPass(*PR); } @@ -58,7 +63,7 @@ RISCVTargetMachine::RISCVTargetMachine(const Target &T, const Triple &TT, : LLVMTargetMachine(T, computeDataLayout(TT), TT, CPU, FS, Options, getEffectiveRelocModel(TT, RM), getEffectiveCodeModel(CM, CodeModel::Small), OL), - TLOF(make_unique()), + TLOF(std::make_unique()), Subtarget(TT, CPU, FS, Options.MCOptions.getABIName(), *this) { initAsmInfo(); } @@ -80,6 +85,10 @@ public: void addIRPasses() override; bool addInstSelector() override; + bool addIRTranslator() override; + bool addLegalizeMachineIR() override; + bool addRegBankSelect() override; + bool addGlobalInstructionSelect() override; void addPreEmitPass() override; void addPreEmitPass2() override; void addPreRegAlloc() override; @@ -101,6 +110,26 @@ bool RISCVPassConfig::addInstSelector() { return false; } +bool RISCVPassConfig::addIRTranslator() { + addPass(new IRTranslator()); + return false; +} + +bool RISCVPassConfig::addLegalizeMachineIR() { + addPass(new Legalizer()); + return false; +} + +bool RISCVPassConfig::addRegBankSelect() { + addPass(new RegBankSelect()); + return false; +} + +bool RISCVPassConfig::addGlobalInstructionSelect() { + addPass(new InstructionSelect()); + return false; +} + void RISCVPassConfig::addPreEmitPass() { addPass(&BranchRelaxationPassID); } void RISCVPassConfig::addPreEmitPass2() { diff --git a/lib/Target/RISCV/Utils/RISCVBaseInfo.h b/lib/Target/RISCV/Utils/RISCVBaseInfo.h index c33c72f24319..30e475e80a01 100644 --- a/lib/Target/RISCV/Utils/RISCVBaseInfo.h +++ b/lib/Target/RISCV/Utils/RISCVBaseInfo.h @@ -16,6 +16,7 @@ #include "MCTargetDesc/RISCVMCTargetDesc.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/StringSwitch.h" +#include "llvm/MC/MCInstrDesc.h" #include "llvm/MC/SubtargetFeature.h" namespace llvm { @@ -63,6 +64,21 @@ enum { }; } // namespace RISCVII +namespace RISCVOp { +enum OperandType : unsigned { + OPERAND_FIRST_RISCV_IMM = MCOI::OPERAND_FIRST_TARGET, + OPERAND_UIMM4 = OPERAND_FIRST_RISCV_IMM, + OPERAND_UIMM5, + OPERAND_UIMM12, + OPERAND_SIMM12, + OPERAND_SIMM13_LSB0, + OPERAND_UIMM20, + OPERAND_SIMM21_LSB0, + OPERAND_UIMMLOG2XLEN, + OPERAND_LAST_RISCV_IMM = OPERAND_UIMMLOG2XLEN +}; +} // namespace RISCVOp 
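The RISCVOp::OperandType enum added just above tags each RISCV immediate operand class with a target-specific value starting at MCOI::OPERAND_FIRST_TARGET. As a rough illustration of how such tags are usually consumed (this checker is not part of the patch; it only assumes the RISCVOp enum above and the helpers in llvm/Support/MathExtras.h), a verifier-style hook can switch on the recorded operand type and range-check the immediate:

    // Illustrative sketch only; assumes the RISCVOp namespace from the hunk
    // above and the integer-width helpers from llvm/Support/MathExtras.h.
    #include "llvm/Support/MathExtras.h"

    static bool isImmInRange(unsigned OperandType, int64_t Imm) {
      switch (OperandType) {
      case RISCVOp::OPERAND_UIMM4:
        return llvm::isUInt<4>(Imm);           // unsigned, 4 bits
      case RISCVOp::OPERAND_UIMM5:
        return llvm::isUInt<5>(Imm);           // unsigned, 5 bits
      case RISCVOp::OPERAND_SIMM12:
        return llvm::isInt<12>(Imm);           // signed, 12 bits
      case RISCVOp::OPERAND_SIMM13_LSB0:
        return llvm::isShiftedInt<12, 1>(Imm); // 13-bit signed, bit 0 clear
      default:
        return true;                           // not an immediate we check here
      }
    }
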
+ // Describes the predecessor/successor bits used in the FENCE instruction. namespace RISCVFenceField { enum FenceField { diff --git a/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp b/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp index 15453ae59a4f..f6be9dd01249 100644 --- a/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp +++ b/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp @@ -376,7 +376,7 @@ public: } static std::unique_ptr CreateToken(StringRef Str, SMLoc S) { - auto Op = make_unique(k_Token); + auto Op = std::make_unique(k_Token); Op->Tok.Data = Str.data(); Op->Tok.Length = Str.size(); Op->StartLoc = S; @@ -386,7 +386,7 @@ public: static std::unique_ptr CreateReg(unsigned RegNum, unsigned Kind, SMLoc S, SMLoc E) { - auto Op = make_unique(k_Register); + auto Op = std::make_unique(k_Register); Op->Reg.RegNum = RegNum; Op->Reg.Kind = (SparcOperand::RegisterKind)Kind; Op->StartLoc = S; @@ -396,7 +396,7 @@ public: static std::unique_ptr CreateImm(const MCExpr *Val, SMLoc S, SMLoc E) { - auto Op = make_unique(k_Immediate); + auto Op = std::make_unique(k_Immediate); Op->Imm.Val = Val; Op->StartLoc = S; Op->EndLoc = E; @@ -481,7 +481,7 @@ public: static std::unique_ptr CreateMEMr(unsigned Base, SMLoc S, SMLoc E) { - auto Op = make_unique(k_MemoryReg); + auto Op = std::make_unique(k_MemoryReg); Op->Mem.Base = Base; Op->Mem.OffsetReg = Sparc::G0; // always 0 Op->Mem.Off = nullptr; diff --git a/lib/Target/Sparc/DelaySlotFiller.cpp b/lib/Target/Sparc/DelaySlotFiller.cpp index f1ca8e18c228..db8e7850300f 100644 --- a/lib/Target/Sparc/DelaySlotFiller.cpp +++ b/lib/Target/Sparc/DelaySlotFiller.cpp @@ -253,7 +253,7 @@ bool Filler::delayHasHazard(MachineBasicBlock::iterator candidate, if (!MO.isReg()) continue; // skip - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); if (MO.isDef()) { // check whether Reg is defined or used before delay slot. 
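The unsigned-to-Register renames running through this import (the Sparc delay-slot filler above, the RISCV and SystemZ files elsewhere) are mechanical because llvm::Register is a thin wrapper around the old unsigned register id. A minimal sketch, assuming only llvm/CodeGen/Register.h, of why comparisons such as `if (Reg == 0)` or `reg < SP::I0` in these hunks keep compiling unchanged:

    // Sketch only: Register converts implicitly to and from the old unsigned
    // ids, so pre-existing comparisons and range checks still work.
    #include "llvm/CodeGen/Register.h"
    using llvm::Register;

    static const char *classifyReg(Register Reg) {
      if (Reg == 0)
        return "none";       // MachineOperand::getReg() may legitimately be 0
      if (Register::isVirtualRegister(Reg))
        return "virtual";    // pre-regalloc virtual register
      unsigned PhysId = Reg; // implicit conversion back to a raw id
      (void)PhysId;
      return "physical";
    }
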
@@ -324,7 +324,7 @@ void Filler::insertDefsUses(MachineBasicBlock::iterator MI, if (!MO.isReg()) continue; - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); if (Reg == 0) continue; if (MO.isDef()) @@ -380,7 +380,7 @@ static bool combineRestoreADD(MachineBasicBlock::iterator RestoreMI, // // After : restore , , %o[0-7] - unsigned reg = AddMI->getOperand(0).getReg(); + Register reg = AddMI->getOperand(0).getReg(); if (reg < SP::I0 || reg > SP::I7) return false; @@ -408,7 +408,7 @@ static bool combineRestoreOR(MachineBasicBlock::iterator RestoreMI, // // After : restore , , %o[0-7] - unsigned reg = OrMI->getOperand(0).getReg(); + Register reg = OrMI->getOperand(0).getReg(); if (reg < SP::I0 || reg > SP::I7) return false; @@ -446,7 +446,7 @@ static bool combineRestoreSETHIi(MachineBasicBlock::iterator RestoreMI, // // After : restore %g0, (imm3<<10), %o[0-7] - unsigned reg = SetHiMI->getOperand(0).getReg(); + Register reg = SetHiMI->getOperand(0).getReg(); if (reg < SP::I0 || reg > SP::I7) return false; diff --git a/lib/Target/Sparc/MCTargetDesc/SparcELFObjectWriter.cpp b/lib/Target/Sparc/MCTargetDesc/SparcELFObjectWriter.cpp index 88547075c5ae..c97a30e634cc 100644 --- a/lib/Target/Sparc/MCTargetDesc/SparcELFObjectWriter.cpp +++ b/lib/Target/Sparc/MCTargetDesc/SparcELFObjectWriter.cpp @@ -49,7 +49,7 @@ unsigned SparcELFObjectWriter::getRelocType(MCContext &Ctx, } if (IsPCRel) { - switch((unsigned)Fixup.getKind()) { + switch(Fixup.getTargetKind()) { default: llvm_unreachable("Unimplemented fixup -> relocation"); case FK_Data_1: return ELF::R_SPARC_DISP8; @@ -65,7 +65,7 @@ unsigned SparcELFObjectWriter::getRelocType(MCContext &Ctx, } } - switch((unsigned)Fixup.getKind()) { + switch(Fixup.getTargetKind()) { default: llvm_unreachable("Unimplemented fixup -> relocation"); case FK_Data_1: return ELF::R_SPARC_8; @@ -135,5 +135,5 @@ bool SparcELFObjectWriter::needsRelocateWithSymbol(const MCSymbol &Sym, std::unique_ptr llvm::createSparcELFObjectWriter(bool Is64Bit, uint8_t OSABI) { - return llvm::make_unique(Is64Bit, OSABI); + return std::make_unique(Is64Bit, OSABI); } diff --git a/lib/Target/Sparc/SparcFrameLowering.cpp b/lib/Target/Sparc/SparcFrameLowering.cpp index 1834a6fd861d..0f74f2bb344c 100644 --- a/lib/Target/Sparc/SparcFrameLowering.cpp +++ b/lib/Target/Sparc/SparcFrameLowering.cpp @@ -34,7 +34,8 @@ DisableLeafProc("disable-sparc-leaf-proc", SparcFrameLowering::SparcFrameLowering(const SparcSubtarget &ST) : TargetFrameLowering(TargetFrameLowering::StackGrowsDown, - ST.is64Bit() ? 16 : 8, 0, ST.is64Bit() ? 16 : 8) {} + ST.is64Bit() ? Align(16) : Align(8), 0, + ST.is64Bit() ? Align(16) : Align(8)) {} void SparcFrameLowering::emitSPAdjustment(MachineFunction &MF, MachineBasicBlock &MBB, diff --git a/lib/Target/Sparc/SparcISelDAGToDAG.cpp b/lib/Target/Sparc/SparcISelDAGToDAG.cpp index 8cff50d19ed4..4e61c341b703 100644 --- a/lib/Target/Sparc/SparcISelDAGToDAG.cpp +++ b/lib/Target/Sparc/SparcISelDAGToDAG.cpp @@ -231,7 +231,7 @@ bool SparcDAGToDAGISel::tryInlineAsm(SDNode *N){ // Replace the two GPRs with 1 GPRPair and copy values from GPRPair to // the original GPRs. - unsigned GPVR = MRI.createVirtualRegister(&SP::IntPairRegClass); + Register GPVR = MRI.createVirtualRegister(&SP::IntPairRegClass); PairedReg = CurDAG->getRegister(GPVR, MVT::v2i32); SDValue Chain = SDValue(N,0); @@ -278,7 +278,7 @@ bool SparcDAGToDAGISel::tryInlineAsm(SDNode *N){ // Copy REG_SEQ into a GPRPair-typed VR and replace the original two // i32 VRs of inline asm with it. 
- unsigned GPVR = MRI.createVirtualRegister(&SP::IntPairRegClass); + Register GPVR = MRI.createVirtualRegister(&SP::IntPairRegClass); PairedReg = CurDAG->getRegister(GPVR, MVT::v2i32); Chain = CurDAG->getCopyToReg(T1, dl, GPVR, Pair, T1.getValue(1)); diff --git a/lib/Target/Sparc/SparcISelLowering.cpp b/lib/Target/Sparc/SparcISelLowering.cpp index a6d440fa8aa2..4a2ba00ac6c2 100644 --- a/lib/Target/Sparc/SparcISelLowering.cpp +++ b/lib/Target/Sparc/SparcISelLowering.cpp @@ -417,7 +417,7 @@ SDValue SparcTargetLowering::LowerFormalArguments_32( if (VA.needsCustom()) { assert(VA.getLocVT() == MVT::f64 || VA.getLocVT() == MVT::v2i32); - unsigned VRegHi = RegInfo.createVirtualRegister(&SP::IntRegsRegClass); + Register VRegHi = RegInfo.createVirtualRegister(&SP::IntRegsRegClass); MF.getRegInfo().addLiveIn(VA.getLocReg(), VRegHi); SDValue HiVal = DAG.getCopyFromReg(Chain, dl, VRegHi, MVT::i32); @@ -445,7 +445,7 @@ SDValue SparcTargetLowering::LowerFormalArguments_32( InVals.push_back(WholeValue); continue; } - unsigned VReg = RegInfo.createVirtualRegister(&SP::IntRegsRegClass); + Register VReg = RegInfo.createVirtualRegister(&SP::IntRegsRegClass); MF.getRegInfo().addLiveIn(VA.getLocReg(), VReg); SDValue Arg = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32); if (VA.getLocVT() == MVT::f32) @@ -552,7 +552,7 @@ SDValue SparcTargetLowering::LowerFormalArguments_32( std::vector OutChains; for (; CurArgReg != ArgRegEnd; ++CurArgReg) { - unsigned VReg = RegInfo.createVirtualRegister(&SP::IntRegsRegClass); + Register VReg = RegInfo.createVirtualRegister(&SP::IntRegsRegClass); MF.getRegInfo().addLiveIn(*CurArgReg, VReg); SDValue Arg = DAG.getCopyFromReg(DAG.getRoot(), dl, VReg, MVT::i32); @@ -1016,9 +1016,9 @@ SparcTargetLowering::LowerCall_32(TargetLowering::CallLoweringInfo &CLI, // FIXME? Maybe this could be a TableGen attribute on some registers and // this table could be generated automatically from RegInfo. -unsigned SparcTargetLowering::getRegisterByName(const char* RegName, EVT VT, - SelectionDAG &DAG) const { - unsigned Reg = StringSwitch(RegName) +Register SparcTargetLowering::getRegisterByName(const char* RegName, EVT VT, + const MachineFunction &MF) const { + Register Reg = StringSwitch(RegName) .Case("i0", SP::I0).Case("i1", SP::I1).Case("i2", SP::I2).Case("i3", SP::I3) .Case("i4", SP::I4).Case("i5", SP::I5).Case("i6", SP::I6).Case("i7", SP::I7) .Case("o0", SP::O0).Case("o1", SP::O1).Case("o2", SP::O2).Case("o3", SP::O3) @@ -1438,7 +1438,7 @@ SparcTargetLowering::SparcTargetLowering(const TargetMachine &TM, setOperationAction(Op, MVT::v2i32, Expand); } // Truncating/extending stores/loads are also not supported. 
- for (MVT VT : MVT::integer_vector_valuetypes()) { + for (MVT VT : MVT::integer_fixedlen_vector_valuetypes()) { setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i32, Expand); setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v2i32, Expand); setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i32, Expand); @@ -1805,7 +1805,7 @@ SparcTargetLowering::SparcTargetLowering(const TargetMachine &TM, setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); - setMinFunctionAlignment(2); + setMinFunctionAlignment(Align(4)); computeRegisterProperties(Subtarget->getRegisterInfo()); } @@ -2244,7 +2244,7 @@ SDValue SparcTargetLowering::LowerF128Compare(SDValue LHS, SDValue RHS, return DAG.getNode(SPISD::CMPICC, DL, MVT::Glue, Result, RHS); } case SPCC::FCC_UL : { - SDValue Mask = DAG.getTargetConstant(1, DL, Result.getValueType()); + SDValue Mask = DAG.getConstant(1, DL, Result.getValueType()); Result = DAG.getNode(ISD::AND, DL, Result.getValueType(), Result, Mask); SDValue RHS = DAG.getTargetConstant(0, DL, Result.getValueType()); SPCC = SPCC::ICC_NE; @@ -2277,14 +2277,14 @@ SDValue SparcTargetLowering::LowerF128Compare(SDValue LHS, SDValue RHS, return DAG.getNode(SPISD::CMPICC, DL, MVT::Glue, Result, RHS); } case SPCC::FCC_LG : { - SDValue Mask = DAG.getTargetConstant(3, DL, Result.getValueType()); + SDValue Mask = DAG.getConstant(3, DL, Result.getValueType()); Result = DAG.getNode(ISD::AND, DL, Result.getValueType(), Result, Mask); SDValue RHS = DAG.getTargetConstant(0, DL, Result.getValueType()); SPCC = SPCC::ICC_NE; return DAG.getNode(SPISD::CMPICC, DL, MVT::Glue, Result, RHS); } case SPCC::FCC_UE : { - SDValue Mask = DAG.getTargetConstant(3, DL, Result.getValueType()); + SDValue Mask = DAG.getConstant(3, DL, Result.getValueType()); Result = DAG.getNode(ISD::AND, DL, Result.getValueType(), Result, Mask); SDValue RHS = DAG.getTargetConstant(0, DL, Result.getValueType()); SPCC = SPCC::ICC_E; @@ -2951,9 +2951,11 @@ static SDValue LowerUMULO_SMULO(SDValue Op, SelectionDAG &DAG, SDValue HiRHS = DAG.getNode(ISD::SRA, dl, MVT::i64, RHS, ShiftAmt); SDValue Args[] = { HiLHS, LHS, HiRHS, RHS }; + TargetLowering::MakeLibCallOptions CallOptions; + CallOptions.setSExt(isSigned); SDValue MulResult = TLI.makeLibCall(DAG, RTLIB::MUL_I128, WideVT, - Args, isSigned, dl).first; + Args, CallOptions, dl).first; SDValue BottomHalf = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, VT, MulResult, DAG.getIntPtrConstant(0, dl)); SDValue TopHalf = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, VT, @@ -3183,7 +3185,7 @@ SparcTargetLowering::getConstraintType(StringRef Constraint) const { case 'e': return C_RegisterClass; case 'I': // SIMM13 - return C_Other; + return C_Immediate; } } diff --git a/lib/Target/Sparc/SparcISelLowering.h b/lib/Target/Sparc/SparcISelLowering.h index 8d557a4225e5..3d798cec0c16 100644 --- a/lib/Target/Sparc/SparcISelLowering.h +++ b/lib/Target/Sparc/SparcISelLowering.h @@ -98,8 +98,8 @@ namespace llvm { return MVT::i32; } - unsigned getRegisterByName(const char* RegName, EVT VT, - SelectionDAG &DAG) const override; + Register getRegisterByName(const char* RegName, EVT VT, + const MachineFunction &MF) const override; /// If a physical register, this returns the register that receives the /// exception address on entry to an EH pad. 
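The SparcISelLowering hunks just above follow an LLVM-wide interface change: TargetLowering::getRegisterByName now returns a Register and receives the MachineFunction rather than a SelectionDAG. A hedged sketch of an override after the change, with MyTarget and MyTargetLowering invented purely for illustration (only the signature and the StringSwitch pattern mirror the Sparc code):

    // Hypothetical target shown for shape only.
    #include "llvm/ADT/StringSwitch.h"
    #include "llvm/Support/ErrorHandling.h"

    Register MyTargetLowering::getRegisterByName(const char *RegName, EVT VT,
                                                 const MachineFunction &MF) const {
      Register Reg = llvm::StringSwitch<unsigned>(RegName)
                         .Case("sp", MyTarget::SP)  // stack pointer
                         .Case("fp", MyTarget::FP)  // frame pointer
                         .Default(0);
      if (Reg == 0)
        report_fatal_error("Invalid register name global variable");
      return Reg;
    }
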
diff --git a/lib/Target/Sparc/SparcInstr64Bit.td b/lib/Target/Sparc/SparcInstr64Bit.td index 2d4f687f72d2..d18ab3b1370b 100644 --- a/lib/Target/Sparc/SparcInstr64Bit.td +++ b/lib/Target/Sparc/SparcInstr64Bit.td @@ -177,7 +177,7 @@ def LEAX_ADDri : F3_2<2, 0b000000, def : Pat<(SPcmpicc i64:$a, i64:$b), (CMPrr $a, $b)>; def : Pat<(SPcmpicc i64:$a, (i64 simm13:$b)), (CMPri $a, (as_i32imm $b))>; -def : Pat<(ctpop i64:$src), (POPCrr $src)>; +def : Pat<(i64 (ctpop i64:$src)), (POPCrr $src)>; } // Predicates = [Is64Bit] diff --git a/lib/Target/Sparc/SparcInstrInfo.cpp b/lib/Target/Sparc/SparcInstrInfo.cpp index ad343fe6f80a..3d3d314a26bb 100644 --- a/lib/Target/Sparc/SparcInstrInfo.cpp +++ b/lib/Target/Sparc/SparcInstrInfo.cpp @@ -375,8 +375,8 @@ void SparcInstrInfo::copyPhysReg(MachineBasicBlock &MBB, MachineInstr *MovMI = nullptr; for (unsigned i = 0; i != numSubRegs; ++i) { - unsigned Dst = TRI->getSubReg(DestReg, subRegIdx[i]); - unsigned Src = TRI->getSubReg(SrcReg, subRegIdx[i]); + Register Dst = TRI->getSubReg(DestReg, subRegIdx[i]); + Register Src = TRI->getSubReg(SrcReg, subRegIdx[i]); assert(Dst && Src && "Bad sub-register"); MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(movOpc), Dst); diff --git a/lib/Target/Sparc/SparcInstrInfo.td b/lib/Target/Sparc/SparcInstrInfo.td index 8474c7abffb3..73dbdc4f443e 100644 --- a/lib/Target/Sparc/SparcInstrInfo.td +++ b/lib/Target/Sparc/SparcInstrInfo.td @@ -516,9 +516,9 @@ let DecoderMethod = "DecodeLoadQFP" in defm LDQF : LoadA<"ldq", 0b100010, 0b110010, load, QFPRegs, f128>, Requires<[HasV9, HasHardQuad]>; -let DecoderMethod = "DecodeLoadCP" in - defm LDC : Load<"ld", 0b110000, load, CoprocRegs, i32>; -let DecoderMethod = "DecodeLoadCPPair" in +let DecoderMethod = "DecodeLoadCP" in + defm LDC : Load<"ld", 0b110000, load, CoprocRegs, i32>; +let DecoderMethod = "DecodeLoadCPPair" in defm LDDC : Load<"ldd", 0b110011, load, CoprocPair, v2i32, IIC_ldd>; let DecoderMethod = "DecodeLoadCP", Defs = [CPSR] in { @@ -1508,7 +1508,7 @@ let rs1 = 0 in def POPCrr : F3_1<2, 0b101110, (outs IntRegs:$rd), (ins IntRegs:$rs2), "popc $rs2, $rd", []>, Requires<[HasV9]>; -def : Pat<(ctpop i32:$src), +def : Pat<(i32 (ctpop i32:$src)), (POPCrr (SRLri $src, 0))>; let Predicates = [HasV9], hasSideEffects = 1, rd = 0, rs1 = 0b01111 in diff --git a/lib/Target/Sparc/SparcRegisterInfo.cpp b/lib/Target/Sparc/SparcRegisterInfo.cpp index ce11a423d10e..19a90e98db7e 100644 --- a/lib/Target/Sparc/SparcRegisterInfo.cpp +++ b/lib/Target/Sparc/SparcRegisterInfo.cpp @@ -182,9 +182,9 @@ SparcRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, if (!Subtarget.isV9() || !Subtarget.hasHardQuad()) { if (MI.getOpcode() == SP::STQFri) { const TargetInstrInfo &TII = *Subtarget.getInstrInfo(); - unsigned SrcReg = MI.getOperand(2).getReg(); - unsigned SrcEvenReg = getSubReg(SrcReg, SP::sub_even64); - unsigned SrcOddReg = getSubReg(SrcReg, SP::sub_odd64); + Register SrcReg = MI.getOperand(2).getReg(); + Register SrcEvenReg = getSubReg(SrcReg, SP::sub_even64); + Register SrcOddReg = getSubReg(SrcReg, SP::sub_odd64); MachineInstr *StMI = BuildMI(*MI.getParent(), II, dl, TII.get(SP::STDFri)) .addReg(FrameReg).addImm(0).addReg(SrcEvenReg); @@ -194,9 +194,9 @@ SparcRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, Offset += 8; } else if (MI.getOpcode() == SP::LDQFri) { const TargetInstrInfo &TII = *Subtarget.getInstrInfo(); - unsigned DestReg = MI.getOperand(0).getReg(); - unsigned DestEvenReg = getSubReg(DestReg, SP::sub_even64); - unsigned DestOddReg = 
getSubReg(DestReg, SP::sub_odd64); + Register DestReg = MI.getOperand(0).getReg(); + Register DestEvenReg = getSubReg(DestReg, SP::sub_even64); + Register DestOddReg = getSubReg(DestReg, SP::sub_odd64); MachineInstr *LdMI = BuildMI(*MI.getParent(), II, dl, TII.get(SP::LDDFri), DestEvenReg) .addReg(FrameReg).addImm(0); diff --git a/lib/Target/Sparc/SparcTargetMachine.cpp b/lib/Target/Sparc/SparcTargetMachine.cpp index 195cff79de03..c1e3f8c36982 100644 --- a/lib/Target/Sparc/SparcTargetMachine.cpp +++ b/lib/Target/Sparc/SparcTargetMachine.cpp @@ -98,7 +98,7 @@ SparcTargetMachine::SparcTargetMachine( getEffectiveSparcCodeModel( CM, getEffectiveRelocModel(RM), is64bit, JIT), OL), - TLOF(make_unique()), + TLOF(std::make_unique()), Subtarget(TT, CPU, FS, *this, is64bit), is64Bit(is64bit) { initAsmInfo(); } @@ -133,7 +133,7 @@ SparcTargetMachine::getSubtargetImpl(const Function &F) const { // creation will depend on the TM and the code generation flags on the // function that reside in TargetOptions. resetTargetOptions(F); - I = llvm::make_unique(TargetTriple, CPU, FS, *this, + I = std::make_unique(TargetTriple, CPU, FS, *this, this->is64Bit); } return I.get(); diff --git a/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp b/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp index a259ba3433d6..93c4ce4b5ccc 100644 --- a/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp +++ b/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp @@ -155,11 +155,11 @@ public: // Create particular kinds of operand. static std::unique_ptr createInvalid(SMLoc StartLoc, SMLoc EndLoc) { - return make_unique(KindInvalid, StartLoc, EndLoc); + return std::make_unique(KindInvalid, StartLoc, EndLoc); } static std::unique_ptr createToken(StringRef Str, SMLoc Loc) { - auto Op = make_unique(KindToken, Loc, Loc); + auto Op = std::make_unique(KindToken, Loc, Loc); Op->Token.Data = Str.data(); Op->Token.Length = Str.size(); return Op; @@ -167,7 +167,7 @@ public: static std::unique_ptr createReg(RegisterKind Kind, unsigned Num, SMLoc StartLoc, SMLoc EndLoc) { - auto Op = make_unique(KindReg, StartLoc, EndLoc); + auto Op = std::make_unique(KindReg, StartLoc, EndLoc); Op->Reg.Kind = Kind; Op->Reg.Num = Num; return Op; @@ -175,7 +175,7 @@ public: static std::unique_ptr createImm(const MCExpr *Expr, SMLoc StartLoc, SMLoc EndLoc) { - auto Op = make_unique(KindImm, StartLoc, EndLoc); + auto Op = std::make_unique(KindImm, StartLoc, EndLoc); Op->Imm = Expr; return Op; } @@ -184,7 +184,7 @@ public: createMem(MemoryKind MemKind, RegisterKind RegKind, unsigned Base, const MCExpr *Disp, unsigned Index, const MCExpr *LengthImm, unsigned LengthReg, SMLoc StartLoc, SMLoc EndLoc) { - auto Op = make_unique(KindMem, StartLoc, EndLoc); + auto Op = std::make_unique(KindMem, StartLoc, EndLoc); Op->Mem.MemKind = MemKind; Op->Mem.RegKind = RegKind; Op->Mem.Base = Base; @@ -200,7 +200,7 @@ public: static std::unique_ptr createImmTLS(const MCExpr *Imm, const MCExpr *Sym, SMLoc StartLoc, SMLoc EndLoc) { - auto Op = make_unique(KindImmTLS, StartLoc, EndLoc); + auto Op = std::make_unique(KindImmTLS, StartLoc, EndLoc); Op->ImmTLS.Imm = Imm; Op->ImmTLS.Sym = Sym; return Op; diff --git a/lib/Target/SystemZ/MCTargetDesc/SystemZMCObjectWriter.cpp b/lib/Target/SystemZ/MCTargetDesc/SystemZMCObjectWriter.cpp index 8d8ba5644e10..49b6fc490336 100644 --- a/lib/Target/SystemZ/MCTargetDesc/SystemZMCObjectWriter.cpp +++ b/lib/Target/SystemZ/MCTargetDesc/SystemZMCObjectWriter.cpp @@ -162,5 +162,5 @@ unsigned SystemZObjectWriter::getRelocType(MCContext &Ctx, std::unique_ptr 
llvm::createSystemZObjectWriter(uint8_t OSABI) { - return llvm::make_unique(OSABI); + return std::make_unique(OSABI); } diff --git a/lib/Target/SystemZ/SystemZ.h b/lib/Target/SystemZ/SystemZ.h index 2b0f90182d7f..88cf589a3f10 100644 --- a/lib/Target/SystemZ/SystemZ.h +++ b/lib/Target/SystemZ/SystemZ.h @@ -190,7 +190,6 @@ static inline bool isImmHF(uint64_t Val) { FunctionPass *createSystemZISelDag(SystemZTargetMachine &TM, CodeGenOpt::Level OptLevel); FunctionPass *createSystemZElimComparePass(SystemZTargetMachine &TM); -FunctionPass *createSystemZExpandPseudoPass(SystemZTargetMachine &TM); FunctionPass *createSystemZShortenInstPass(SystemZTargetMachine &TM); FunctionPass *createSystemZLongBranchPass(SystemZTargetMachine &TM); FunctionPass *createSystemZLDCleanupPass(SystemZTargetMachine &TM); diff --git a/lib/Target/SystemZ/SystemZAsmPrinter.cpp b/lib/Target/SystemZ/SystemZAsmPrinter.cpp index ef378e4ade7a..10023e9e169c 100644 --- a/lib/Target/SystemZ/SystemZAsmPrinter.cpp +++ b/lib/Target/SystemZ/SystemZAsmPrinter.cpp @@ -501,6 +501,10 @@ void SystemZAsmPrinter::EmitInstruction(const MachineInstr *MI) { } break; + case TargetOpcode::FENTRY_CALL: + LowerFENTRY_CALL(*MI, Lower); + return; + case TargetOpcode::STACKMAP: LowerSTACKMAP(*MI); return; @@ -546,6 +550,22 @@ static unsigned EmitNop(MCContext &OutContext, MCStreamer &OutStreamer, } } +void SystemZAsmPrinter::LowerFENTRY_CALL(const MachineInstr &MI, + SystemZMCInstLower &Lower) { + MCContext &Ctx = MF->getContext(); + if (MF->getFunction().getFnAttribute("mnop-mcount") + .getValueAsString() == "true") { + EmitNop(Ctx, *OutStreamer, 6, getSubtargetInfo()); + return; + } + + MCSymbol *fentry = Ctx.getOrCreateSymbol("__fentry__"); + const MCSymbolRefExpr *Op = + MCSymbolRefExpr::create(fentry, MCSymbolRefExpr::VK_PLT, Ctx); + OutStreamer->EmitInstruction(MCInstBuilder(SystemZ::BRASL) + .addReg(SystemZ::R0D).addExpr(Op), getSubtargetInfo()); +} + void SystemZAsmPrinter::LowerSTACKMAP(const MachineInstr &MI) { const SystemZInstrInfo *TII = static_cast(MF->getSubtarget().getInstrInfo()); diff --git a/lib/Target/SystemZ/SystemZAsmPrinter.h b/lib/Target/SystemZ/SystemZAsmPrinter.h index aa5d3ca78e61..d01a17c2ebe2 100644 --- a/lib/Target/SystemZ/SystemZAsmPrinter.h +++ b/lib/Target/SystemZ/SystemZAsmPrinter.h @@ -46,6 +46,7 @@ public: } private: + void LowerFENTRY_CALL(const MachineInstr &MI, SystemZMCInstLower &MCIL); void LowerSTACKMAP(const MachineInstr &MI); void LowerPATCHPOINT(const MachineInstr &MI, SystemZMCInstLower &Lower); }; diff --git a/lib/Target/SystemZ/SystemZElimCompare.cpp b/lib/Target/SystemZ/SystemZElimCompare.cpp index 9cbf6b320504..946eb2ba7c79 100644 --- a/lib/Target/SystemZ/SystemZElimCompare.cpp +++ b/lib/Target/SystemZ/SystemZElimCompare.cpp @@ -152,7 +152,7 @@ Reference SystemZElimCompare::getRegReferences(MachineInstr &MI, unsigned Reg) { for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) { const MachineOperand &MO = MI.getOperand(I); if (MO.isReg()) { - if (unsigned MOReg = MO.getReg()) { + if (Register MOReg = MO.getReg()) { if (TRI->regsOverlap(MOReg, Reg)) { if (MO.isUse()) Ref.Use = true; @@ -378,11 +378,8 @@ bool SystemZElimCompare::adjustCCMasksForInstr( } // CC is now live after MI. - if (!ConvOpc) { - int CCDef = MI.findRegisterDefOperandIdx(SystemZ::CC, false, true, TRI); - assert(CCDef >= 0 && "Couldn't find CC set"); - MI.getOperand(CCDef).setIsDead(false); - } + if (!ConvOpc) + MI.clearRegisterDeads(SystemZ::CC); // Check if MI lies before Compare. 
bool BeforeCmp = false; diff --git a/lib/Target/SystemZ/SystemZExpandPseudo.cpp b/lib/Target/SystemZ/SystemZExpandPseudo.cpp deleted file mode 100644 index 09708fb4241c..000000000000 --- a/lib/Target/SystemZ/SystemZExpandPseudo.cpp +++ /dev/null @@ -1,152 +0,0 @@ -//==-- SystemZExpandPseudo.cpp - Expand pseudo instructions -------*- C++ -*-=// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file contains a pass that expands pseudo instructions into target -// instructions to allow proper scheduling and other late optimizations. This -// pass should be run after register allocation but before the post-regalloc -// scheduling pass. -// -//===----------------------------------------------------------------------===// - -#include "SystemZ.h" -#include "SystemZInstrInfo.h" -#include "SystemZSubtarget.h" -#include "llvm/CodeGen/LivePhysRegs.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -using namespace llvm; - -#define SYSTEMZ_EXPAND_PSEUDO_NAME "SystemZ pseudo instruction expansion pass" - -namespace llvm { - void initializeSystemZExpandPseudoPass(PassRegistry&); -} - -namespace { -class SystemZExpandPseudo : public MachineFunctionPass { -public: - static char ID; - SystemZExpandPseudo() : MachineFunctionPass(ID) { - initializeSystemZExpandPseudoPass(*PassRegistry::getPassRegistry()); - } - - const SystemZInstrInfo *TII; - - bool runOnMachineFunction(MachineFunction &Fn) override; - - StringRef getPassName() const override { return SYSTEMZ_EXPAND_PSEUDO_NAME; } - -private: - bool expandMBB(MachineBasicBlock &MBB); - bool expandMI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, - MachineBasicBlock::iterator &NextMBBI); - bool expandLOCRMux(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, - MachineBasicBlock::iterator &NextMBBI); -}; -char SystemZExpandPseudo::ID = 0; -} - -INITIALIZE_PASS(SystemZExpandPseudo, "systemz-expand-pseudo", - SYSTEMZ_EXPAND_PSEUDO_NAME, false, false) - -/// Returns an instance of the pseudo instruction expansion pass. -FunctionPass *llvm::createSystemZExpandPseudoPass(SystemZTargetMachine &TM) { - return new SystemZExpandPseudo(); -} - -// MI is a load-register-on-condition pseudo instruction that could not be -// handled as a single hardware instruction. Replace it by a branch sequence. -bool SystemZExpandPseudo::expandLOCRMux(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, - MachineBasicBlock::iterator &NextMBBI) { - MachineFunction &MF = *MBB.getParent(); - const BasicBlock *BB = MBB.getBasicBlock(); - MachineInstr &MI = *MBBI; - DebugLoc DL = MI.getDebugLoc(); - unsigned DestReg = MI.getOperand(0).getReg(); - unsigned SrcReg = MI.getOperand(2).getReg(); - unsigned CCValid = MI.getOperand(3).getImm(); - unsigned CCMask = MI.getOperand(4).getImm(); - - LivePhysRegs LiveRegs(TII->getRegisterInfo()); - LiveRegs.addLiveOuts(MBB); - for (auto I = std::prev(MBB.end()); I != MBBI; --I) - LiveRegs.stepBackward(*I); - - // Splice MBB at MI, moving the rest of the block into RestMBB. 
-  MachineBasicBlock *RestMBB = MF.CreateMachineBasicBlock(BB);
-  MF.insert(std::next(MachineFunction::iterator(MBB)), RestMBB);
-  RestMBB->splice(RestMBB->begin(), &MBB, MI, MBB.end());
-  RestMBB->transferSuccessors(&MBB);
-  for (auto I = LiveRegs.begin(); I != LiveRegs.end(); ++I)
-    RestMBB->addLiveIn(*I);
-
-  // Create a new block MoveMBB to hold the move instruction.
-  MachineBasicBlock *MoveMBB = MF.CreateMachineBasicBlock(BB);
-  MF.insert(std::next(MachineFunction::iterator(MBB)), MoveMBB);
-  MoveMBB->addLiveIn(SrcReg);
-  for (auto I = LiveRegs.begin(); I != LiveRegs.end(); ++I)
-    MoveMBB->addLiveIn(*I);
-
-  // At the end of MBB, create a conditional branch to RestMBB if the
-  // condition is false, otherwise fall through to MoveMBB.
-  BuildMI(&MBB, DL, TII->get(SystemZ::BRC))
-    .addImm(CCValid).addImm(CCMask ^ CCValid).addMBB(RestMBB);
-  MBB.addSuccessor(RestMBB);
-  MBB.addSuccessor(MoveMBB);
-
-  // In MoveMBB, emit an instruction to move SrcReg into DestReg,
-  // then fall through to RestMBB.
-  TII->copyPhysReg(*MoveMBB, MoveMBB->end(), DL, DestReg, SrcReg,
-                   MI.getOperand(2).isKill());
-  MoveMBB->addSuccessor(RestMBB);
-
-  NextMBBI = MBB.end();
-  MI.eraseFromParent();
-  return true;
-}
-
-/// If MBBI references a pseudo instruction that should be expanded here,
-/// do the expansion and return true. Otherwise return false.
-bool SystemZExpandPseudo::expandMI(MachineBasicBlock &MBB,
-                                   MachineBasicBlock::iterator MBBI,
-                                   MachineBasicBlock::iterator &NextMBBI) {
-  MachineInstr &MI = *MBBI;
-  switch (MI.getOpcode()) {
-  case SystemZ::LOCRMux:
-    return expandLOCRMux(MBB, MBBI, NextMBBI);
-  default:
-    break;
-  }
-  return false;
-}
-
-/// Iterate over the instructions in basic block MBB and expand any
-/// pseudo instructions. Return true if anything was modified.
-bool SystemZExpandPseudo::expandMBB(MachineBasicBlock &MBB) {
-  bool Modified = false;
-
-  MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
-  while (MBBI != E) {
-    MachineBasicBlock::iterator NMBBI = std::next(MBBI);
-    Modified |= expandMI(MBB, MBBI, NMBBI);
-    MBBI = NMBBI;
-  }
-
-  return Modified;
-}
-
-bool SystemZExpandPseudo::runOnMachineFunction(MachineFunction &MF) {
-  TII = static_cast(MF.getSubtarget().getInstrInfo());
-
-  bool Modified = false;
-  for (auto &MBB : MF)
-    Modified |= expandMBB(MBB);
-  return Modified;
-}
-
diff --git a/lib/Target/SystemZ/SystemZFrameLowering.cpp b/lib/Target/SystemZ/SystemZFrameLowering.cpp
index da28faebb326..0b8b6880accc 100644
--- a/lib/Target/SystemZ/SystemZFrameLowering.cpp
+++ b/lib/Target/SystemZ/SystemZFrameLowering.cpp
@@ -46,8 +46,8 @@ static const TargetFrameLowering::SpillSlot SpillOffsetTable[] = {
 } // end anonymous namespace
 
 SystemZFrameLowering::SystemZFrameLowering()
-    : TargetFrameLowering(TargetFrameLowering::StackGrowsDown, 8,
-                          -SystemZMC::CallFrameSize, 8,
+    : TargetFrameLowering(TargetFrameLowering::StackGrowsDown, Align(8),
+                          -SystemZMC::CallFrameSize, Align(8),
                           false /* StackRealignable */) {
   // Create a mapping from register number to save slot offset.
   RegSpillOffsets.grow(SystemZ::NUM_TARGET_REGS);
@@ -118,7 +118,7 @@ static void addSavedGPR(MachineBasicBlock &MBB, MachineInstrBuilder &MIB,
                         unsigned GPR64, bool IsImplicit) {
   const TargetRegisterInfo *RI =
       MBB.getParent()->getSubtarget().getRegisterInfo();
-  unsigned GPR32 = RI->getSubReg(GPR64, SystemZ::subreg_l32);
+  Register GPR32 = RI->getSubReg(GPR64, SystemZ::subreg_l32);
   bool IsLive = MBB.isLiveIn(GPR64) || MBB.isLiveIn(GPR32);
   if (!IsLive || !IsImplicit) {
     MIB.addReg(GPR64, getImplRegState(IsImplicit) | getKillRegState(!IsLive));
diff --git a/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp b/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp
index 9dc4512255cc..751034c2d41a 100644
--- a/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp
+++ b/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp
@@ -346,6 +346,11 @@ public:
       : SelectionDAGISel(TM, OptLevel) {}
 
   bool runOnMachineFunction(MachineFunction &MF) override {
+    const Function &F = MF.getFunction();
+    if (F.getFnAttribute("mnop-mcount").getValueAsString() == "true" &&
+        F.getFnAttribute("fentry-call").getValueAsString() != "true")
+      report_fatal_error("mnop-mcount only supported with fentry-call");
+
     Subtarget = &MF.getSubtarget();
     return SelectionDAGISel::runOnMachineFunction(MF);
   }
@@ -1146,7 +1151,7 @@ void SystemZDAGToDAGISel::loadVectorConstant(
   SDLoc DL(Node);
   SmallVector Ops;
   for (unsigned OpVal : VCI.OpVals)
-    Ops.push_back(CurDAG->getConstant(OpVal, DL, MVT::i32));
+    Ops.push_back(CurDAG->getTargetConstant(OpVal, DL, MVT::i32));
   SDValue Op = CurDAG->getNode(VCI.Opcode, DL, VCI.VecVT, Ops);
 
   if (VCI.VecVT == VT.getSimpleVT())
@@ -1550,8 +1555,8 @@ void SystemZDAGToDAGISel::Select(SDNode *Node) {
       uint64_t ConstCCMask =
         cast(CCMask.getNode())->getZExtValue();
       // Invert the condition.
-      CCMask = CurDAG->getConstant(ConstCCValid ^ ConstCCMask, SDLoc(Node),
-                                   CCMask.getValueType());
+      CCMask = CurDAG->getTargetConstant(ConstCCValid ^ ConstCCMask,
+                                         SDLoc(Node), CCMask.getValueType());
       SDValue Op4 = Node->getOperand(4);
       SDNode *UpdatedNode =
         CurDAG->UpdateNodeOperands(Node, Op1, Op0, CCValid, CCMask, Op4);
diff --git a/lib/Target/SystemZ/SystemZISelLowering.cpp b/lib/Target/SystemZ/SystemZISelLowering.cpp
index 78820f511ab4..e0ca9da93561 100644
--- a/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ b/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -120,9 +120,9 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM,
   setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
 
   // Instructions are strings of 2-byte aligned 2-byte values.
-  setMinFunctionAlignment(2);
+  setMinFunctionAlignment(Align(2));
   // For performance reasons we prefer 16-byte alignment.
-  setPrefFunctionAlignment(4);
+  setPrefFunctionAlignment(Align(16));
 
   // Handle operations that are handled in a similar way for all types.
   for (unsigned I = MVT::FIRST_INTEGER_VALUETYPE;
@@ -206,6 +206,12 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM,
       // the default expansion.
       if (!Subtarget.hasFPExtension())
         setOperationAction(ISD::FP_TO_UINT, VT, Expand);
+
+      // Mirror those settings for STRICT_FP_TO_[SU]INT. Note that these all
+      // default to Expand, so need to be modified to Legal where appropriate.
+      setOperationAction(ISD::STRICT_FP_TO_SINT, VT, Legal);
+      if (Subtarget.hasFPExtension())
+        setOperationAction(ISD::STRICT_FP_TO_UINT, VT, Legal);
     }
   }
 
@@ -252,7 +258,7 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM,
   setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Promote);
   setOperationAction(ISD::CTLZ, MVT::i64, Legal);
 
-  // On arch13 we have native support for a 64-bit CTPOP.
+  // On z15 we have native support for a 64-bit CTPOP.
   if (Subtarget.hasMiscellaneousExtensions3()) {
     setOperationAction(ISD::CTPOP, MVT::i32, Promote);
     setOperationAction(ISD::CTPOP, MVT::i64, Legal);
@@ -294,14 +300,14 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM,
   // Handle prefetches with PFD or PFDRL.
   setOperationAction(ISD::PREFETCH, MVT::Other, Custom);
 
-  for (MVT VT : MVT::vector_valuetypes()) {
+  for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
     // Assume by default that all vector operations need to be expanded.
     for (unsigned Opcode = 0; Opcode < ISD::BUILTIN_OP_END; ++Opcode)
       if (getOperationAction(Opcode, VT) == Legal)
        setOperationAction(Opcode, VT, Expand);
 
     // Likewise all truncating stores and extending loads.
-    for (MVT InnerVT : MVT::vector_valuetypes()) {
+    for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
       setTruncStoreAction(VT, InnerVT, Expand);
       setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
       setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
@@ -327,7 +333,7 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM,
   }
 
   // Handle integer vector types.
-  for (MVT VT : MVT::integer_vector_valuetypes()) {
+  for (MVT VT : MVT::integer_fixedlen_vector_valuetypes()) {
     if (isTypeLegal(VT)) {
       // These operations have direct equivalents.
       setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Legal);
@@ -381,6 +387,11 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM,
     setOperationAction(ISD::SINT_TO_FP, MVT::v2f64, Legal);
     setOperationAction(ISD::UINT_TO_FP, MVT::v2i64, Legal);
     setOperationAction(ISD::UINT_TO_FP, MVT::v2f64, Legal);
+
+    setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2i64, Legal);
+    setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2f64, Legal);
+    setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2i64, Legal);
+    setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2f64, Legal);
   }
 
   if (Subtarget.hasVectorEnhancements2()) {
@@ -392,6 +403,11 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM,
     setOperationAction(ISD::SINT_TO_FP, MVT::v4f32, Legal);
     setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Legal);
     setOperationAction(ISD::UINT_TO_FP, MVT::v4f32, Legal);
+
+    setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v4i32, Legal);
+    setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v4f32, Legal);
+    setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v4i32, Legal);
+    setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v4f32, Legal);
   }
 
   // Handle floating-point types.
@@ -831,7 +847,7 @@ supportedAddressingMode(Instruction *I, bool HasVector) {
   }
 
   if (isa(I) && I->hasOneUse()) {
-    auto *SingleUser = dyn_cast(*I->user_begin());
+    auto *SingleUser = cast(*I->user_begin());
     if (SingleUser->getParent() == I->getParent()) {
       if (isa(SingleUser)) {
         if (auto *C = dyn_cast(SingleUser->getOperand(1)))
@@ -956,7 +972,7 @@ SystemZTargetLowering::getConstraintType(StringRef Constraint) const {
     case 'K': // Signed 16-bit constant
    case 'L': // Signed 20-bit displacement (on all targets we support)
    case 'M': // 0x7fffffff
-      return C_Other;
+      return C_Immediate;
 
    default:
      break;
@@ -1335,7 +1351,7 @@ SDValue SystemZTargetLowering::LowerFormalArguments(
         break;
       }
 
-      unsigned VReg = MRI.createVirtualRegister(RC);
+      Register VReg = MRI.createVirtualRegister(RC);
       MRI.addLiveIn(VA.getLocReg(), VReg);
       ArgValue = DAG.getCopyFromReg(Chain, DL, VReg, LocVT);
     } else {
@@ -1430,7 +1446,7 @@ static bool canUseSiblingCall(const CCState &ArgCCInfo,
       return false;
    if (!VA.isRegLoc())
      return false;
-    unsigned Reg = VA.getLocReg();
+    Register Reg = VA.getLocReg();
    if (Reg == SystemZ::R6H || Reg == SystemZ::R6L || Reg == SystemZ::R6D)
      return false;
    if (Outs[I].Flags.isSwiftSelf() || Outs[I].Flags.isSwiftError())
@@ -1674,7 +1690,7 @@ SystemZTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
     RetValue = convertValVTToLocVT(DAG, DL, VA, RetValue);
 
     // Chain and glue the copies together.
-    unsigned Reg = VA.getLocReg();
+    Register Reg = VA.getLocReg();
     Chain = DAG.getCopyToReg(Chain, DL, Reg, RetValue, Glue);
     Glue = Chain.getValue(1);
     RetOps.push_back(DAG.getRegister(Reg, VA.getLocVT()));
@@ -2533,12 +2549,12 @@ static SDValue emitCmp(SelectionDAG &DAG, const SDLoc &DL, Comparison &C) {
   }
   if (C.Opcode == SystemZISD::ICMP)
     return DAG.getNode(SystemZISD::ICMP, DL, MVT::i32, C.Op0, C.Op1,
-                       DAG.getConstant(C.ICmpType, DL, MVT::i32));
+                       DAG.getTargetConstant(C.ICmpType, DL, MVT::i32));
   if (C.Opcode == SystemZISD::TM) {
     bool RegisterOnly = (bool(C.CCMask & SystemZ::CCMASK_TM_MIXED_MSB_0) !=
                          bool(C.CCMask & SystemZ::CCMASK_TM_MIXED_MSB_1));
     return DAG.getNode(SystemZISD::TM, DL, MVT::i32, C.Op0, C.Op1,
-                       DAG.getConstant(RegisterOnly, DL, MVT::i32));
+                       DAG.getTargetConstant(RegisterOnly, DL, MVT::i32));
   }
   return DAG.getNode(C.Opcode, DL, MVT::i32, C.Op0, C.Op1);
 }
@@ -2576,10 +2592,10 @@ static void lowerGR128Binary(SelectionDAG &DAG, const SDLoc &DL, EVT VT,
 // in CCValid, so other values can be ignored.
static SDValue emitSETCC(SelectionDAG &DAG, const SDLoc &DL, SDValue CCReg, unsigned CCValid, unsigned CCMask) { - SDValue Ops[] = { DAG.getConstant(1, DL, MVT::i32), - DAG.getConstant(0, DL, MVT::i32), - DAG.getConstant(CCValid, DL, MVT::i32), - DAG.getConstant(CCMask, DL, MVT::i32), CCReg }; + SDValue Ops[] = {DAG.getConstant(1, DL, MVT::i32), + DAG.getConstant(0, DL, MVT::i32), + DAG.getTargetConstant(CCValid, DL, MVT::i32), + DAG.getTargetConstant(CCMask, DL, MVT::i32), CCReg}; return DAG.getNode(SystemZISD::SELECT_CCMASK, DL, MVT::i32, Ops); } @@ -2741,9 +2757,10 @@ SDValue SystemZTargetLowering::lowerBR_CC(SDValue Op, SelectionDAG &DAG) const { Comparison C(getCmp(DAG, CmpOp0, CmpOp1, CC, DL)); SDValue CCReg = emitCmp(DAG, DL, C); - return DAG.getNode(SystemZISD::BR_CCMASK, DL, Op.getValueType(), - Op.getOperand(0), DAG.getConstant(C.CCValid, DL, MVT::i32), - DAG.getConstant(C.CCMask, DL, MVT::i32), Dest, CCReg); + return DAG.getNode( + SystemZISD::BR_CCMASK, DL, Op.getValueType(), Op.getOperand(0), + DAG.getTargetConstant(C.CCValid, DL, MVT::i32), + DAG.getTargetConstant(C.CCMask, DL, MVT::i32), Dest, CCReg); } // Return true if Pos is CmpOp and Neg is the negative of CmpOp, @@ -2794,8 +2811,9 @@ SDValue SystemZTargetLowering::lowerSELECT_CC(SDValue Op, } SDValue CCReg = emitCmp(DAG, DL, C); - SDValue Ops[] = {TrueOp, FalseOp, DAG.getConstant(C.CCValid, DL, MVT::i32), - DAG.getConstant(C.CCMask, DL, MVT::i32), CCReg}; + SDValue Ops[] = {TrueOp, FalseOp, + DAG.getTargetConstant(C.CCValid, DL, MVT::i32), + DAG.getTargetConstant(C.CCMask, DL, MVT::i32), CCReg}; return DAG.getNode(SystemZISD::SELECT_CCMASK, DL, Op.getValueType(), Ops); } @@ -3882,11 +3900,8 @@ SDValue SystemZTargetLowering::lowerPREFETCH(SDValue Op, bool IsWrite = cast(Op.getOperand(2))->getZExtValue(); unsigned Code = IsWrite ? SystemZ::PFD_WRITE : SystemZ::PFD_READ; auto *Node = cast(Op.getNode()); - SDValue Ops[] = { - Op.getOperand(0), - DAG.getConstant(Code, DL, MVT::i32), - Op.getOperand(1) - }; + SDValue Ops[] = {Op.getOperand(0), DAG.getTargetConstant(Code, DL, MVT::i32), + Op.getOperand(1)}; return DAG.getMemIntrinsicNode(SystemZISD::PREFETCH, DL, Node->getVTList(), Ops, Node->getMemoryVT(), Node->getMemOperand()); @@ -4228,7 +4243,7 @@ static SDValue getPermuteNode(SelectionDAG &DAG, const SDLoc &DL, Op1 = DAG.getNode(ISD::BITCAST, DL, InVT, Op1); SDValue Op; if (P.Opcode == SystemZISD::PERMUTE_DWORDS) { - SDValue Op2 = DAG.getConstant(P.Operand, DL, MVT::i32); + SDValue Op2 = DAG.getTargetConstant(P.Operand, DL, MVT::i32); Op = DAG.getNode(SystemZISD::PERMUTE_DWORDS, DL, InVT, Op0, Op1, Op2); } else if (P.Opcode == SystemZISD::PACK) { MVT OutVT = MVT::getVectorVT(MVT::getIntegerVT(P.Operand * 8), @@ -4253,7 +4268,8 @@ static SDValue getGeneralPermuteNode(SelectionDAG &DAG, const SDLoc &DL, unsigned StartIndex, OpNo0, OpNo1; if (isShlDoublePermute(Bytes, StartIndex, OpNo0, OpNo1)) return DAG.getNode(SystemZISD::SHL_DOUBLE, DL, MVT::v16i8, Ops[OpNo0], - Ops[OpNo1], DAG.getConstant(StartIndex, DL, MVT::i32)); + Ops[OpNo1], + DAG.getTargetConstant(StartIndex, DL, MVT::i32)); // Fall back on VPERM. Construct an SDNode for the permute vector. SDValue IndexNodes[SystemZ::VectorBytes]; @@ -4751,7 +4767,7 @@ SDValue SystemZTargetLowering::lowerVECTOR_SHUFFLE(SDValue Op, return DAG.getNode(SystemZISD::REPLICATE, DL, VT, Op0.getOperand(Index)); // Otherwise keep it as a vector-to-vector operation. 
return DAG.getNode(SystemZISD::SPLAT, DL, VT, Op.getOperand(0), - DAG.getConstant(Index, DL, MVT::i32)); + DAG.getTargetConstant(Index, DL, MVT::i32)); } GeneralShuffle GS(VT); @@ -6041,8 +6057,8 @@ SDValue SystemZTargetLowering::combineBR_CCMASK( if (combineCCMask(CCReg, CCValidVal, CCMaskVal)) return DAG.getNode(SystemZISD::BR_CCMASK, SDLoc(N), N->getValueType(0), Chain, - DAG.getConstant(CCValidVal, SDLoc(N), MVT::i32), - DAG.getConstant(CCMaskVal, SDLoc(N), MVT::i32), + DAG.getTargetConstant(CCValidVal, SDLoc(N), MVT::i32), + DAG.getTargetConstant(CCMaskVal, SDLoc(N), MVT::i32), N->getOperand(3), CCReg); return SDValue(); } @@ -6063,10 +6079,9 @@ SDValue SystemZTargetLowering::combineSELECT_CCMASK( if (combineCCMask(CCReg, CCValidVal, CCMaskVal)) return DAG.getNode(SystemZISD::SELECT_CCMASK, SDLoc(N), N->getValueType(0), - N->getOperand(0), - N->getOperand(1), - DAG.getConstant(CCValidVal, SDLoc(N), MVT::i32), - DAG.getConstant(CCMaskVal, SDLoc(N), MVT::i32), + N->getOperand(0), N->getOperand(1), + DAG.getTargetConstant(CCValidVal, SDLoc(N), MVT::i32), + DAG.getTargetConstant(CCMaskVal, SDLoc(N), MVT::i32), CCReg); return SDValue(); } @@ -6548,19 +6563,17 @@ static bool isSelectPseudo(MachineInstr &MI) { // Helper function, which inserts PHI functions into SinkMBB: // %Result(i) = phi [ %FalseValue(i), FalseMBB ], [ %TrueValue(i), TrueMBB ], -// where %FalseValue(i) and %TrueValue(i) are taken from the consequent Selects -// in [MIItBegin, MIItEnd) range. -static void createPHIsForSelects(MachineBasicBlock::iterator MIItBegin, - MachineBasicBlock::iterator MIItEnd, +// where %FalseValue(i) and %TrueValue(i) are taken from Selects. +static void createPHIsForSelects(SmallVector &Selects, MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB, MachineBasicBlock *SinkMBB) { MachineFunction *MF = TrueMBB->getParent(); const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo(); - unsigned CCValid = MIItBegin->getOperand(3).getImm(); - unsigned CCMask = MIItBegin->getOperand(4).getImm(); - DebugLoc DL = MIItBegin->getDebugLoc(); + MachineInstr *FirstMI = Selects.front(); + unsigned CCValid = FirstMI->getOperand(3).getImm(); + unsigned CCMask = FirstMI->getOperand(4).getImm(); MachineBasicBlock::iterator SinkInsertionPoint = SinkMBB->begin(); @@ -6572,16 +6585,15 @@ static void createPHIsForSelects(MachineBasicBlock::iterator MIItBegin, // destination registers, and the registers that went into the PHI. DenseMap> RegRewriteTable; - for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; - MIIt = skipDebugInstructionsForward(++MIIt, MIItEnd)) { - unsigned DestReg = MIIt->getOperand(0).getReg(); - unsigned TrueReg = MIIt->getOperand(1).getReg(); - unsigned FalseReg = MIIt->getOperand(2).getReg(); + for (auto MI : Selects) { + Register DestReg = MI->getOperand(0).getReg(); + Register TrueReg = MI->getOperand(1).getReg(); + Register FalseReg = MI->getOperand(2).getReg(); // If this Select we are generating is the opposite condition from // the jump we generated, then we have to swap the operands for the // PHI that is going to be generated. 
- if (MIIt->getOperand(4).getImm() == (CCValid ^ CCMask)) + if (MI->getOperand(4).getImm() == (CCValid ^ CCMask)) std::swap(TrueReg, FalseReg); if (RegRewriteTable.find(TrueReg) != RegRewriteTable.end()) @@ -6590,6 +6602,7 @@ static void createPHIsForSelects(MachineBasicBlock::iterator MIItBegin, if (RegRewriteTable.find(FalseReg) != RegRewriteTable.end()) FalseReg = RegRewriteTable[FalseReg].second; + DebugLoc DL = MI->getDebugLoc(); BuildMI(*SinkMBB, SinkInsertionPoint, DL, TII->get(SystemZ::PHI), DestReg) .addReg(TrueReg).addMBB(TrueMBB) .addReg(FalseReg).addMBB(FalseMBB); @@ -6605,36 +6618,61 @@ static void createPHIsForSelects(MachineBasicBlock::iterator MIItBegin, MachineBasicBlock * SystemZTargetLowering::emitSelect(MachineInstr &MI, MachineBasicBlock *MBB) const { + assert(isSelectPseudo(MI) && "Bad call to emitSelect()"); const SystemZInstrInfo *TII = static_cast(Subtarget.getInstrInfo()); unsigned CCValid = MI.getOperand(3).getImm(); unsigned CCMask = MI.getOperand(4).getImm(); - DebugLoc DL = MI.getDebugLoc(); // If we have a sequence of Select* pseudo instructions using the // same condition code value, we want to expand all of them into // a single pair of basic blocks using the same condition. - MachineInstr *LastMI = &MI; - MachineBasicBlock::iterator NextMIIt = skipDebugInstructionsForward( - std::next(MachineBasicBlock::iterator(MI)), MBB->end()); - - if (isSelectPseudo(MI)) - while (NextMIIt != MBB->end() && isSelectPseudo(*NextMIIt) && - NextMIIt->getOperand(3).getImm() == CCValid && - (NextMIIt->getOperand(4).getImm() == CCMask || - NextMIIt->getOperand(4).getImm() == (CCValid ^ CCMask))) { - LastMI = &*NextMIIt; - NextMIIt = skipDebugInstructionsForward(++NextMIIt, MBB->end()); + SmallVector Selects; + SmallVector DbgValues; + Selects.push_back(&MI); + unsigned Count = 0; + for (MachineBasicBlock::iterator NextMIIt = + std::next(MachineBasicBlock::iterator(MI)); + NextMIIt != MBB->end(); ++NextMIIt) { + if (NextMIIt->definesRegister(SystemZ::CC)) + break; + if (isSelectPseudo(*NextMIIt)) { + assert(NextMIIt->getOperand(3).getImm() == CCValid && + "Bad CCValid operands since CC was not redefined."); + if (NextMIIt->getOperand(4).getImm() == CCMask || + NextMIIt->getOperand(4).getImm() == (CCValid ^ CCMask)) { + Selects.push_back(&*NextMIIt); + continue; + } + break; } + bool User = false; + for (auto SelMI : Selects) + if (NextMIIt->readsVirtualRegister(SelMI->getOperand(0).getReg())) { + User = true; + break; + } + if (NextMIIt->isDebugInstr()) { + if (User) { + assert(NextMIIt->isDebugValue() && "Unhandled debug opcode."); + DbgValues.push_back(&*NextMIIt); + } + } + else if (User || ++Count > 20) + break; + } + MachineInstr *LastMI = Selects.back(); + bool CCKilled = + (LastMI->killsRegister(SystemZ::CC) || checkCCKill(*LastMI, MBB)); MachineBasicBlock *StartMBB = MBB; - MachineBasicBlock *JoinMBB = splitBlockBefore(MI, MBB); + MachineBasicBlock *JoinMBB = splitBlockAfter(LastMI, MBB); MachineBasicBlock *FalseMBB = emitBlockAfter(StartMBB); // Unless CC was killed in the last Select instruction, mark it as // live-in to both FalseMBB and JoinMBB. 
- if (!LastMI->killsRegister(SystemZ::CC) && !checkCCKill(*LastMI, JoinMBB)) { + if (!CCKilled) { FalseMBB->addLiveIn(SystemZ::CC); JoinMBB->addLiveIn(SystemZ::CC); } @@ -6643,7 +6681,7 @@ SystemZTargetLowering::emitSelect(MachineInstr &MI, // BRC CCMask, JoinMBB // # fallthrough to FalseMBB MBB = StartMBB; - BuildMI(MBB, DL, TII->get(SystemZ::BRC)) + BuildMI(MBB, MI.getDebugLoc(), TII->get(SystemZ::BRC)) .addImm(CCValid).addImm(CCMask).addMBB(JoinMBB); MBB->addSuccessor(JoinMBB); MBB->addSuccessor(FalseMBB); @@ -6657,12 +6695,14 @@ SystemZTargetLowering::emitSelect(MachineInstr &MI, // %Result = phi [ %FalseReg, FalseMBB ], [ %TrueReg, StartMBB ] // ... MBB = JoinMBB; - MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI); - MachineBasicBlock::iterator MIItEnd = skipDebugInstructionsForward( - std::next(MachineBasicBlock::iterator(LastMI)), MBB->end()); - createPHIsForSelects(MIItBegin, MIItEnd, StartMBB, FalseMBB, MBB); + createPHIsForSelects(Selects, StartMBB, FalseMBB, MBB); + for (auto SelMI : Selects) + SelMI->eraseFromParent(); + + MachineBasicBlock::iterator InsertPos = MBB->getFirstNonPHI(); + for (auto DbgMI : DbgValues) + MBB->splice(InsertPos, StartMBB, DbgMI); - StartMBB->erase(MIItBegin, MIItEnd); return JoinMBB; } @@ -6678,10 +6718,10 @@ MachineBasicBlock *SystemZTargetLowering::emitCondStore(MachineInstr &MI, const SystemZInstrInfo *TII = static_cast(Subtarget.getInstrInfo()); - unsigned SrcReg = MI.getOperand(0).getReg(); + Register SrcReg = MI.getOperand(0).getReg(); MachineOperand Base = MI.getOperand(1); int64_t Disp = MI.getOperand(2).getImm(); - unsigned IndexReg = MI.getOperand(3).getReg(); + Register IndexReg = MI.getOperand(3).getReg(); unsigned CCValid = MI.getOperand(4).getImm(); unsigned CCMask = MI.getOperand(5).getImm(); DebugLoc DL = MI.getDebugLoc(); @@ -6773,7 +6813,7 @@ MachineBasicBlock *SystemZTargetLowering::emitAtomicLoadBinary( // Extract the operands. Base can be a register or a frame index. // Src2 can be a register or immediate. - unsigned Dest = MI.getOperand(0).getReg(); + Register Dest = MI.getOperand(0).getReg(); MachineOperand Base = earlyUseOperand(MI.getOperand(1)); int64_t Disp = MI.getOperand(2).getImm(); MachineOperand Src2 = earlyUseOperand(MI.getOperand(3)); @@ -6833,7 +6873,7 @@ MachineBasicBlock *SystemZTargetLowering::emitAtomicLoadBinary( .addReg(OldVal).addReg(BitShift).addImm(0); if (Invert) { // Perform the operation normally and then invert every bit of the field. - unsigned Tmp = MRI.createVirtualRegister(RC); + Register Tmp = MRI.createVirtualRegister(RC); BuildMI(MBB, DL, TII->get(BinOpcode), Tmp).addReg(RotatedOldVal).add(Src2); if (BitSize <= 32) // XILF with the upper BitSize bits set. @@ -6842,7 +6882,7 @@ MachineBasicBlock *SystemZTargetLowering::emitAtomicLoadBinary( else { // Use LCGR and add -1 to the result, which is more compact than // an XILF, XILH pair. - unsigned Tmp2 = MRI.createVirtualRegister(RC); + Register Tmp2 = MRI.createVirtualRegister(RC); BuildMI(MBB, DL, TII->get(SystemZ::LCGR), Tmp2).addReg(Tmp); BuildMI(MBB, DL, TII->get(SystemZ::AGHI), RotatedNewVal) .addReg(Tmp2).addImm(-1); @@ -6891,7 +6931,7 @@ MachineBasicBlock *SystemZTargetLowering::emitAtomicLoadMinMax( bool IsSubWord = (BitSize < 32); // Extract the operands. Base can be a register or a frame index. 
- unsigned Dest = MI.getOperand(0).getReg(); + Register Dest = MI.getOperand(0).getReg(); MachineOperand Base = earlyUseOperand(MI.getOperand(1)); int64_t Disp = MI.getOperand(2).getImm(); Register Src2 = MI.getOperand(3).getReg(); @@ -7005,13 +7045,13 @@ SystemZTargetLowering::emitAtomicCmpSwapW(MachineInstr &MI, MachineRegisterInfo &MRI = MF.getRegInfo(); // Extract the operands. Base can be a register or a frame index. - unsigned Dest = MI.getOperand(0).getReg(); + Register Dest = MI.getOperand(0).getReg(); MachineOperand Base = earlyUseOperand(MI.getOperand(1)); int64_t Disp = MI.getOperand(2).getImm(); - unsigned OrigCmpVal = MI.getOperand(3).getReg(); - unsigned OrigSwapVal = MI.getOperand(4).getReg(); - unsigned BitShift = MI.getOperand(5).getReg(); - unsigned NegBitShift = MI.getOperand(6).getReg(); + Register OrigCmpVal = MI.getOperand(3).getReg(); + Register OrigSwapVal = MI.getOperand(4).getReg(); + Register BitShift = MI.getOperand(5).getReg(); + Register NegBitShift = MI.getOperand(6).getReg(); int64_t BitSize = MI.getOperand(7).getImm(); DebugLoc DL = MI.getDebugLoc(); @@ -7023,14 +7063,14 @@ SystemZTargetLowering::emitAtomicCmpSwapW(MachineInstr &MI, assert(LOpcode && CSOpcode && "Displacement out of range"); // Create virtual registers for temporary results. - unsigned OrigOldVal = MRI.createVirtualRegister(RC); - unsigned OldVal = MRI.createVirtualRegister(RC); - unsigned CmpVal = MRI.createVirtualRegister(RC); - unsigned SwapVal = MRI.createVirtualRegister(RC); - unsigned StoreVal = MRI.createVirtualRegister(RC); - unsigned RetryOldVal = MRI.createVirtualRegister(RC); - unsigned RetryCmpVal = MRI.createVirtualRegister(RC); - unsigned RetrySwapVal = MRI.createVirtualRegister(RC); + Register OrigOldVal = MRI.createVirtualRegister(RC); + Register OldVal = MRI.createVirtualRegister(RC); + Register CmpVal = MRI.createVirtualRegister(RC); + Register SwapVal = MRI.createVirtualRegister(RC); + Register StoreVal = MRI.createVirtualRegister(RC); + Register RetryOldVal = MRI.createVirtualRegister(RC); + Register RetryCmpVal = MRI.createVirtualRegister(RC); + Register RetrySwapVal = MRI.createVirtualRegister(RC); // Insert 2 basic blocks for the loop. 
MachineBasicBlock *StartMBB = MBB; @@ -7129,11 +7169,11 @@ SystemZTargetLowering::emitPair128(MachineInstr &MI, MachineRegisterInfo &MRI = MF.getRegInfo(); DebugLoc DL = MI.getDebugLoc(); - unsigned Dest = MI.getOperand(0).getReg(); - unsigned Hi = MI.getOperand(1).getReg(); - unsigned Lo = MI.getOperand(2).getReg(); - unsigned Tmp1 = MRI.createVirtualRegister(&SystemZ::GR128BitRegClass); - unsigned Tmp2 = MRI.createVirtualRegister(&SystemZ::GR128BitRegClass); + Register Dest = MI.getOperand(0).getReg(); + Register Hi = MI.getOperand(1).getReg(); + Register Lo = MI.getOperand(2).getReg(); + Register Tmp1 = MRI.createVirtualRegister(&SystemZ::GR128BitRegClass); + Register Tmp2 = MRI.createVirtualRegister(&SystemZ::GR128BitRegClass); BuildMI(*MBB, MI, DL, TII->get(TargetOpcode::IMPLICIT_DEF), Tmp1); BuildMI(*MBB, MI, DL, TII->get(TargetOpcode::INSERT_SUBREG), Tmp2) @@ -7157,14 +7197,14 @@ MachineBasicBlock *SystemZTargetLowering::emitExt128(MachineInstr &MI, MachineRegisterInfo &MRI = MF.getRegInfo(); DebugLoc DL = MI.getDebugLoc(); - unsigned Dest = MI.getOperand(0).getReg(); - unsigned Src = MI.getOperand(1).getReg(); - unsigned In128 = MRI.createVirtualRegister(&SystemZ::GR128BitRegClass); + Register Dest = MI.getOperand(0).getReg(); + Register Src = MI.getOperand(1).getReg(); + Register In128 = MRI.createVirtualRegister(&SystemZ::GR128BitRegClass); BuildMI(*MBB, MI, DL, TII->get(TargetOpcode::IMPLICIT_DEF), In128); if (ClearEven) { - unsigned NewIn128 = MRI.createVirtualRegister(&SystemZ::GR128BitRegClass); - unsigned Zero64 = MRI.createVirtualRegister(&SystemZ::GR64BitRegClass); + Register NewIn128 = MRI.createVirtualRegister(&SystemZ::GR128BitRegClass); + Register Zero64 = MRI.createVirtualRegister(&SystemZ::GR64BitRegClass); BuildMI(*MBB, MI, DL, TII->get(SystemZ::LLILL), Zero64) .addImm(0); @@ -7308,7 +7348,7 @@ MachineBasicBlock *SystemZTargetLowering::emitMemMemWrapper( // The previous iteration might have created out-of-range displacements. // Apply them using LAY if so. if (!isUInt<12>(DestDisp)) { - unsigned Reg = MRI.createVirtualRegister(&SystemZ::ADDR64BitRegClass); + Register Reg = MRI.createVirtualRegister(&SystemZ::ADDR64BitRegClass); BuildMI(*MBB, MI, MI.getDebugLoc(), TII->get(SystemZ::LAY), Reg) .add(DestBase) .addImm(DestDisp) @@ -7317,7 +7357,7 @@ MachineBasicBlock *SystemZTargetLowering::emitMemMemWrapper( DestDisp = 0; } if (!isUInt<12>(SrcDisp)) { - unsigned Reg = MRI.createVirtualRegister(&SystemZ::ADDR64BitRegClass); + Register Reg = MRI.createVirtualRegister(&SystemZ::ADDR64BitRegClass); BuildMI(*MBB, MI, MI.getDebugLoc(), TII->get(SystemZ::LAY), Reg) .add(SrcBase) .addImm(SrcDisp) @@ -7474,11 +7514,11 @@ MachineBasicBlock *SystemZTargetLowering::emitLoadAndTestCmp0( static_cast(Subtarget.getInstrInfo()); DebugLoc DL = MI.getDebugLoc(); - unsigned SrcReg = MI.getOperand(0).getReg(); + Register SrcReg = MI.getOperand(0).getReg(); // Create new virtual register of the same class as source. const TargetRegisterClass *RC = MRI->getRegClass(SrcReg); - unsigned DstReg = MRI->createVirtualRegister(RC); + Register DstReg = MRI->createVirtualRegister(RC); // Replace pseudo with a normal load-and-test that models the def as // well. 
diff --git a/lib/Target/SystemZ/SystemZInstrFP.td b/lib/Target/SystemZ/SystemZInstrFP.td index 19c7ec58ed3d..9c95e8aec940 100644 --- a/lib/Target/SystemZ/SystemZInstrFP.td +++ b/lib/Target/SystemZ/SystemZInstrFP.td @@ -25,10 +25,10 @@ let Predicates = [FeatureNoVectorEnhancements1] in let Predicates = [FeatureVectorEnhancements1] in def SelectVR128 : SelectWrapper; -defm CondStoreF32 : CondStores; -defm CondStoreF64 : CondStores; +defm CondStoreF32 : CondStores; +defm CondStoreF64 : CondStores; //===----------------------------------------------------------------------===// // Move instructions @@ -276,13 +276,13 @@ let Uses = [FPC], mayRaiseFPException = 1, Defs = [CC] in { } // fp_to_sint always rounds towards zero, which is modifier value 5. -def : Pat<(i32 (fp_to_sint FP32:$src)), (CFEBR 5, FP32:$src)>; -def : Pat<(i32 (fp_to_sint FP64:$src)), (CFDBR 5, FP64:$src)>; -def : Pat<(i32 (fp_to_sint FP128:$src)), (CFXBR 5, FP128:$src)>; +def : Pat<(i32 (any_fp_to_sint FP32:$src)), (CFEBR 5, FP32:$src)>; +def : Pat<(i32 (any_fp_to_sint FP64:$src)), (CFDBR 5, FP64:$src)>; +def : Pat<(i32 (any_fp_to_sint FP128:$src)), (CFXBR 5, FP128:$src)>; -def : Pat<(i64 (fp_to_sint FP32:$src)), (CGEBR 5, FP32:$src)>; -def : Pat<(i64 (fp_to_sint FP64:$src)), (CGDBR 5, FP64:$src)>; -def : Pat<(i64 (fp_to_sint FP128:$src)), (CGXBR 5, FP128:$src)>; +def : Pat<(i64 (any_fp_to_sint FP32:$src)), (CGEBR 5, FP32:$src)>; +def : Pat<(i64 (any_fp_to_sint FP64:$src)), (CGDBR 5, FP64:$src)>; +def : Pat<(i64 (any_fp_to_sint FP128:$src)), (CGXBR 5, FP128:$src)>; // The FP extension feature provides versions of the above that allow // also specifying the inexact-exception suppression flag. @@ -309,13 +309,13 @@ let Predicates = [FeatureFPExtension] in { def CLGXBR : TernaryRRFe<"clgxbr", 0xB3AE, GR64, FP128>; } - def : Pat<(i32 (fp_to_uint FP32:$src)), (CLFEBR 5, FP32:$src, 0)>; - def : Pat<(i32 (fp_to_uint FP64:$src)), (CLFDBR 5, FP64:$src, 0)>; - def : Pat<(i32 (fp_to_uint FP128:$src)), (CLFXBR 5, FP128:$src, 0)>; + def : Pat<(i32 (any_fp_to_uint FP32:$src)), (CLFEBR 5, FP32:$src, 0)>; + def : Pat<(i32 (any_fp_to_uint FP64:$src)), (CLFDBR 5, FP64:$src, 0)>; + def : Pat<(i32 (any_fp_to_uint FP128:$src)), (CLFXBR 5, FP128:$src, 0)>; - def : Pat<(i64 (fp_to_uint FP32:$src)), (CLGEBR 5, FP32:$src, 0)>; - def : Pat<(i64 (fp_to_uint FP64:$src)), (CLGDBR 5, FP64:$src, 0)>; - def : Pat<(i64 (fp_to_uint FP128:$src)), (CLGXBR 5, FP128:$src, 0)>; + def : Pat<(i64 (any_fp_to_uint FP32:$src)), (CLGEBR 5, FP32:$src, 0)>; + def : Pat<(i64 (any_fp_to_uint FP64:$src)), (CLGDBR 5, FP64:$src, 0)>; + def : Pat<(i64 (any_fp_to_uint FP128:$src)), (CLGXBR 5, FP128:$src, 0)>; } diff --git a/lib/Target/SystemZ/SystemZInstrFormats.td b/lib/Target/SystemZ/SystemZInstrFormats.td index 2a1d14de3ddf..c9dbe3da686d 100644 --- a/lib/Target/SystemZ/SystemZInstrFormats.td +++ b/lib/Target/SystemZ/SystemZInstrFormats.td @@ -2141,17 +2141,17 @@ class FixedCondBranchRXY opcode, } class CmpBranchRIEa opcode, - RegisterOperand cls, Immediate imm> + RegisterOperand cls, ImmOpWithPattern imm> : InstRIEa; class AsmCmpBranchRIEa opcode, - RegisterOperand cls, Immediate imm> + RegisterOperand cls, ImmOpWithPattern imm> : InstRIEa; class FixedCmpBranchRIEa opcode, - RegisterOperand cls, Immediate imm> + RegisterOperand cls, ImmOpWithPattern imm> : InstRIEa { let isAsmParserOnly = V.alternate; @@ -2159,7 +2159,7 @@ class FixedCmpBranchRIEa opcode, } multiclass CmpBranchRIEaPair opcode, - RegisterOperand cls, Immediate imm> { + RegisterOperand cls, ImmOpWithPattern 
imm> { let isCodeGenOnly = 1 in def "" : CmpBranchRIEa; def Asm : AsmCmpBranchRIEa; @@ -2193,19 +2193,19 @@ multiclass CmpBranchRIEbPair opcode, } class CmpBranchRIEc opcode, - RegisterOperand cls, Immediate imm> + RegisterOperand cls, ImmOpWithPattern imm> : InstRIEc; class AsmCmpBranchRIEc opcode, - RegisterOperand cls, Immediate imm> + RegisterOperand cls, ImmOpWithPattern imm> : InstRIEc; class FixedCmpBranchRIEc opcode, - RegisterOperand cls, Immediate imm> + RegisterOperand cls, ImmOpWithPattern imm> : InstRIEc { let isAsmParserOnly = V.alternate; @@ -2213,7 +2213,7 @@ class FixedCmpBranchRIEc opcode, } multiclass CmpBranchRIEcPair opcode, - RegisterOperand cls, Immediate imm> { + RegisterOperand cls, ImmOpWithPattern imm> { let isCodeGenOnly = 1 in def "" : CmpBranchRIEc; def Asm : AsmCmpBranchRIEc; @@ -2272,19 +2272,19 @@ multiclass CmpBranchRRSPair opcode, } class CmpBranchRIS opcode, - RegisterOperand cls, Immediate imm> + RegisterOperand cls, ImmOpWithPattern imm> : InstRIS; class AsmCmpBranchRIS opcode, - RegisterOperand cls, Immediate imm> + RegisterOperand cls, ImmOpWithPattern imm> : InstRIS; class FixedCmpBranchRIS opcode, - RegisterOperand cls, Immediate imm> + RegisterOperand cls, ImmOpWithPattern imm> : InstRIS { let isAsmParserOnly = V.alternate; @@ -2292,7 +2292,7 @@ class FixedCmpBranchRIS opcode, } multiclass CmpBranchRISPair opcode, - RegisterOperand cls, Immediate imm> { + RegisterOperand cls, ImmOpWithPattern imm> { let isCodeGenOnly = 1 in def "" : CmpBranchRIS; def Asm : AsmCmpBranchRIS; @@ -2585,7 +2585,7 @@ multiclass StoreMultipleVRSaAlign opcode> { // We therefore match the address in the same way as a normal store and // only use the StoreSI* instruction if the matched address is suitable. class StoreSI opcode, SDPatternOperator operator, - Immediate imm> + ImmOpWithPattern imm> : InstSI { @@ -2593,7 +2593,7 @@ class StoreSI opcode, SDPatternOperator operator, } class StoreSIY opcode, SDPatternOperator operator, - Immediate imm> + ImmOpWithPattern imm> : InstSIY { @@ -2601,7 +2601,7 @@ class StoreSIY opcode, SDPatternOperator operator, } class StoreSIL opcode, SDPatternOperator operator, - Immediate imm> + ImmOpWithPattern imm> : InstSIL { @@ -2609,7 +2609,7 @@ class StoreSIL opcode, SDPatternOperator operator, } multiclass StoreSIPair siOpcode, bits<16> siyOpcode, - SDPatternOperator operator, Immediate imm> { + SDPatternOperator operator, ImmOpWithPattern imm> { let DispKey = mnemonic in { let DispSize = "12" in def "" : StoreSI; @@ -2665,7 +2665,7 @@ multiclass CondStoreRSYPair opcode, def Asm : AsmCondStoreRSY; } -class SideEffectUnaryI opcode, Immediate imm> +class SideEffectUnaryI opcode, ImmOpWithPattern imm> : InstI; @@ -2761,13 +2761,13 @@ class UnaryMemRRFc opcode, } class UnaryRI opcode, SDPatternOperator operator, - RegisterOperand cls, Immediate imm> + RegisterOperand cls, ImmOpWithPattern imm> : InstRIa; class UnaryRIL opcode, SDPatternOperator operator, - RegisterOperand cls, Immediate imm> + RegisterOperand cls, ImmOpWithPattern imm> : InstRILa; @@ -2885,14 +2885,14 @@ multiclass UnaryRXPair rxOpcode, bits<16> rxyOpcode, } class UnaryVRIa opcode, SDPatternOperator operator, - TypedReg tr, Immediate imm, bits<4> type = 0> + TypedReg tr, ImmOpWithPattern imm, bits<4> type = 0> : InstVRIa { + [(set (tr.vt tr.op:$V1), (operator (i32 timm:$I2)))]> { let M3 = type; } -class UnaryVRIaGeneric opcode, Immediate imm> +class UnaryVRIaGeneric opcode, ImmOpWithPattern imm> : InstVRIa; @@ -3021,7 +3021,7 @@ class SideEffectBinaryRRFc opcode, } class 
SideEffectBinaryIE opcode, - Immediate imm1, Immediate imm2> + ImmOpWithPattern imm1, ImmOpWithPattern imm2> : InstIE; @@ -3030,7 +3030,7 @@ class SideEffectBinarySI opcode, Operand imm> mnemonic#"\t$BD1, $I2", []>; class SideEffectBinarySIL opcode, - SDPatternOperator operator, Immediate imm> + SDPatternOperator operator, ImmOpWithPattern imm> : InstSIL; @@ -3165,7 +3165,7 @@ class BinaryRRFc opcode, mnemonic#"\t$R1, $R2, $M3", []>; class BinaryMemRRFc opcode, - RegisterOperand cls1, RegisterOperand cls2, Immediate imm> + RegisterOperand cls1, RegisterOperand cls2, ImmOpWithPattern imm> : InstRRFc { let Constraints = "$R1 = $R1src"; @@ -3267,7 +3267,7 @@ multiclass CondBinaryRRFaPair opcode, } class BinaryRI opcode, SDPatternOperator operator, - RegisterOperand cls, Immediate imm> + RegisterOperand cls, ImmOpWithPattern imm> : InstRIa { @@ -3276,14 +3276,14 @@ class BinaryRI opcode, SDPatternOperator operator, } class BinaryRIE opcode, SDPatternOperator operator, - RegisterOperand cls, Immediate imm> + RegisterOperand cls, ImmOpWithPattern imm> : InstRIEd; multiclass BinaryRIAndK opcode1, bits<16> opcode2, SDPatternOperator operator, RegisterOperand cls, - Immediate imm> { + ImmOpWithPattern imm> { let NumOpsKey = mnemonic in { let NumOpsValue = "3" in def K : BinaryRIE, @@ -3294,7 +3294,7 @@ multiclass BinaryRIAndK opcode1, bits<16> opcode2, } class CondBinaryRIE opcode, RegisterOperand cls, - Immediate imm> + ImmOpWithPattern imm> : InstRIEg opcode, RegisterOperand cls, // Like CondBinaryRIE, but used for the raw assembly form. The condition-code // mask is the third operand rather than being part of the mnemonic. class AsmCondBinaryRIE opcode, RegisterOperand cls, - Immediate imm> + ImmOpWithPattern imm> : InstRIEg { @@ -3318,7 +3318,7 @@ class AsmCondBinaryRIE opcode, RegisterOperand cls, // Like CondBinaryRIE, but with a fixed CC mask. 
class FixedCondBinaryRIE opcode, - RegisterOperand cls, Immediate imm> + RegisterOperand cls, ImmOpWithPattern imm> : InstRIEg { let Constraints = "$R1 = $R1src"; @@ -3328,14 +3328,14 @@ class FixedCondBinaryRIE opcode, } multiclass CondBinaryRIEPair opcode, - RegisterOperand cls, Immediate imm> { + RegisterOperand cls, ImmOpWithPattern imm> { let isCodeGenOnly = 1 in def "" : CondBinaryRIE; def Asm : AsmCondBinaryRIE; } class BinaryRIL opcode, SDPatternOperator operator, - RegisterOperand cls, Immediate imm> + RegisterOperand cls, ImmOpWithPattern imm> : InstRILa { @@ -3484,7 +3484,7 @@ class BinaryVRIb opcode, SDPatternOperator operator, TypedReg tr, bits<4> type> : InstVRIb { + [(set (tr.vt tr.op:$V1), (operator imm32zx8_timm:$I2, imm32zx8_timm:$I3))]> { let M4 = type; } @@ -3498,7 +3498,7 @@ class BinaryVRIc opcode, SDPatternOperator operator, : InstVRIc { + imm32zx16_timm:$I2))]> { let M4 = type; } @@ -3512,7 +3512,7 @@ class BinaryVRIe opcode, SDPatternOperator operator, : InstVRIe { + imm32zx12_timm:$I3))]> { let M4 = type; let M5 = m5; } @@ -3715,7 +3715,7 @@ class BinaryVRX opcode, SDPatternOperator operator, : InstVRX { + imm32zx4_timm:$M3))]> { let mayLoad = 1; let AccessBytes = bytes; } @@ -3765,7 +3765,7 @@ class BinaryVSI opcode, SDPatternOperator operator, } class StoreBinaryVRV opcode, bits<5> bytes, - Immediate index> + ImmOpWithPattern index> : InstVRV { let mayStore = 1; @@ -3774,7 +3774,7 @@ class StoreBinaryVRV opcode, bits<5> bytes, class StoreBinaryVRX opcode, SDPatternOperator operator, TypedReg tr, bits<5> bytes, - Immediate index> + ImmOpWithPattern index> : InstVRX { @@ -3809,7 +3809,7 @@ class CompareRRE opcode, SDPatternOperator operator, } class CompareRI opcode, SDPatternOperator operator, - RegisterOperand cls, Immediate imm> + RegisterOperand cls, ImmOpWithPattern imm> : InstRIa { @@ -3817,7 +3817,7 @@ class CompareRI opcode, SDPatternOperator operator, } class CompareRIL opcode, SDPatternOperator operator, - RegisterOperand cls, Immediate imm> + RegisterOperand cls, ImmOpWithPattern imm> : InstRILa { @@ -3924,7 +3924,7 @@ class CompareSSb opcode> } class CompareSI opcode, SDPatternOperator operator, - SDPatternOperator load, Immediate imm, + SDPatternOperator load, ImmOpWithPattern imm, AddressingMode mode = bdaddr12only> : InstSI opcode, SDPatternOperator operator, } class CompareSIL opcode, SDPatternOperator operator, - SDPatternOperator load, Immediate imm> + SDPatternOperator load, ImmOpWithPattern imm> : InstSIL { @@ -3943,7 +3943,7 @@ class CompareSIL opcode, SDPatternOperator operator, } class CompareSIY opcode, SDPatternOperator operator, - SDPatternOperator load, Immediate imm, + SDPatternOperator load, ImmOpWithPattern imm, AddressingMode mode = bdaddr20only> : InstSIY opcode, SDPatternOperator operator, multiclass CompareSIPair siOpcode, bits<16> siyOpcode, SDPatternOperator operator, SDPatternOperator load, - Immediate imm> { + ImmOpWithPattern imm> { let DispKey = mnemonic in { let DispSize = "12" in def "" : CompareSI; @@ -4012,7 +4012,7 @@ class TestRXE opcode, SDPatternOperator operator, } class TestBinarySIL opcode, - SDPatternOperator operator, Immediate imm> + SDPatternOperator operator, ImmOpWithPattern imm> : InstSIL; @@ -4073,7 +4073,7 @@ class SideEffectTernaryMemMemMemRRFb opcode, class SideEffectTernaryRRFc opcode, RegisterOperand cls1, RegisterOperand cls2, - Immediate imm> + ImmOpWithPattern imm> : InstRRFc; @@ -4086,7 +4086,7 @@ multiclass SideEffectTernaryRRFcOpt opcode, class SideEffectTernaryMemMemRRFc opcode, RegisterOperand 
cls1, RegisterOperand cls2, - Immediate imm> + ImmOpWithPattern imm> : InstRRFc { @@ -4221,7 +4221,7 @@ class TernaryRXF opcode, SDPatternOperator operator, } class TernaryVRIa opcode, SDPatternOperator operator, - TypedReg tr1, TypedReg tr2, Immediate imm, Immediate index> + TypedReg tr1, TypedReg tr2, ImmOpWithPattern imm, ImmOpWithPattern index> : InstVRIa opcode, SDPatternOperator operator, mnemonic#"\t$V1, $V2, $V3, $I4", [(set (tr1.vt tr1.op:$V1), (operator (tr2.vt tr2.op:$V2), (tr2.vt tr2.op:$V3), - imm32zx8:$I4))]> { + imm32zx8_timm:$I4))]> { let M5 = type; } @@ -4252,8 +4252,8 @@ class TernaryVRRa opcode, SDPatternOperator operator, (ins tr2.op:$V2, imm32zx4:$M4, imm32zx4:$M5), mnemonic#"\t$V1, $V2, $M4, $M5", [(set (tr1.vt tr1.op:$V1), (operator (tr2.vt tr2.op:$V2), - imm32zx4:$M4, - imm32zx4:$M5))], + imm32zx4_timm:$M4, + imm32zx4_timm:$M5))], m4or> { let M3 = type; } @@ -4285,13 +4285,13 @@ multiclass TernaryOptVRRbSPair opcode, TypedReg tr1, TypedReg tr2, bits<4> type, bits<4> modifier = 0> { def "" : TernaryVRRb; + imm32zx4even_timm, !and (modifier, 14)>; def : InstAlias(NAME) tr1.op:$V1, tr2.op:$V2, tr2.op:$V3, 0)>; let Defs = [CC] in def S : TernaryVRRb; + imm32zx4even_timm, !add(!and (modifier, 14), 1)>; def : InstAlias(NAME#"S") tr1.op:$V1, tr2.op:$V2, tr2.op:$V3, 0)>; @@ -4314,7 +4314,7 @@ class TernaryVRRc opcode, SDPatternOperator operator, mnemonic#"\t$V1, $V2, $V3, $M4", [(set (tr1.vt tr1.op:$V1), (operator (tr2.vt tr2.op:$V2), (tr2.vt tr2.op:$V3), - imm32zx4:$M4))]> { + imm32zx4_timm:$M4))]> { let M5 = 0; let M6 = 0; } @@ -4327,7 +4327,7 @@ class TernaryVRRcFloat opcode, mnemonic#"\t$V1, $V2, $V3, $M6", [(set (tr1.vt tr1.op:$V1), (operator (tr2.vt tr2.op:$V2), (tr2.vt tr2.op:$V3), - imm32zx4:$M6))]> { + imm32zx4_timm:$M6))]> { let M4 = type; let M5 = m5; } @@ -4429,7 +4429,7 @@ class TernaryVRSbGeneric opcode> } class TernaryVRV opcode, bits<5> bytes, - Immediate index> + ImmOpWithPattern index> : InstVRV { @@ -4440,7 +4440,7 @@ class TernaryVRV opcode, bits<5> bytes, } class TernaryVRX opcode, SDPatternOperator operator, - TypedReg tr1, TypedReg tr2, bits<5> bytes, Immediate index> + TypedReg tr1, TypedReg tr2, bits<5> bytes, ImmOpWithPattern index> : InstVRX opcode, SDPatternOperator operato [(set (tr1.vt tr1.op:$V1), (operator (tr2.vt tr2.op:$V1src), (tr2.vt tr2.op:$V2), (tr2.vt tr2.op:$V3), - imm32zx8:$I4))]> { + imm32zx8_timm:$I4))]> { let Constraints = "$V1 = $V1src"; let DisableEncoding = "$V1src"; let M5 = type; @@ -4480,7 +4480,7 @@ class QuaternaryVRIf opcode> : InstVRIf; + mnemonic#"\t$V1, $V2, $V3, $I4, $M5", []>; class QuaternaryVRIg opcode> : InstVRIg opcode> class QuaternaryVRRd opcode, SDPatternOperator operator, TypedReg tr1, TypedReg tr2, TypedReg tr3, TypedReg tr4, bits<4> type, - SDPatternOperator m6mask = imm32zx4, bits<4> m6or = 0> + SDPatternOperator m6mask = imm32zx4_timm, bits<4> m6or = 0> : InstVRRd opcode, bits<4> modifier = 0> { def "" : QuaternaryVRRd; + imm32zx4even_timm, !and (modifier, 14)>; def : InstAlias(NAME) tr1.op:$V1, tr2.op:$V2, tr2.op:$V3, tr2.op:$V4, 0)>; let Defs = [CC] in def S : QuaternaryVRRd; + imm32zx4even_timm, !add (!and (modifier, 14), 1)>; def : InstAlias(NAME#"S") tr1.op:$V1, tr2.op:$V2, tr2.op:$V3, tr2.op:$V4, 0)>; @@ -4536,7 +4536,7 @@ multiclass QuaternaryOptVRRdSPairGeneric opcode> { def "" : QuaternaryVRRdGeneric; def : InstAlias(NAME) VR128:$V1, VR128:$V2, VR128:$V3, - VR128:$V4, imm32zx4:$M5, 0)>; + VR128:$V4, imm32zx4_timm:$M5, 0)>; } class SideEffectQuaternaryRRFa opcode, @@ -4638,13 +4638,13 @@ class 
RotateSelectRIEf opcode, RegisterOperand cls1, class PrefetchRXY opcode, SDPatternOperator operator> : InstRXYb; + [(operator imm32zx4_timm:$M1, bdxaddr20only:$XBD2)]>; class PrefetchRILPC opcode, SDPatternOperator operator> - : InstRILc { + [(operator imm32zx4_timm:$M1, pcrel32:$RI2)]> { // We want PC-relative addresses to be tried ahead of BD and BDX addresses. // However, BDXs have two extra operands and are therefore 6 units more // complex. @@ -4691,7 +4691,7 @@ class Pseudo pattern> // Like UnaryRI, but expanded after RA depending on the choice of register. class UnaryRIPseudo + ImmOpWithPattern imm> : Pseudo<(outs cls:$R1), (ins imm:$I2), [(set cls:$R1, (operator imm:$I2))]>; @@ -4720,7 +4720,7 @@ class UnaryRRPseudo + ImmOpWithPattern imm> : Pseudo<(outs cls:$R1), (ins cls:$R1src, imm:$I2), [(set cls:$R1, (operator cls:$R1src, imm:$I2))]> { let Constraints = "$R1 = $R1src"; @@ -4728,13 +4728,13 @@ class BinaryRIPseudo + ImmOpWithPattern imm> : Pseudo<(outs cls:$R1), (ins cls:$R3, imm:$I2), [(set cls:$R1, (operator cls:$R3, imm:$I2))]>; // Like BinaryRIAndK, but expanded after RA depending on the choice of register. multiclass BinaryRIAndKPseudo { + RegisterOperand cls, ImmOpWithPattern imm> { let NumOpsKey = key in { let NumOpsValue = "3" in def K : BinaryRIEPseudo, @@ -4764,7 +4764,7 @@ class MemFoldPseudo bytes, // Like CompareRI, but expanded after RA depending on the choice of register. class CompareRIPseudo + ImmOpWithPattern imm> : Pseudo<(outs), (ins cls:$R1, imm:$I2), [(set CC, (operator cls:$R1, imm:$I2))]> { let isCompare = 1; @@ -4783,7 +4783,7 @@ class CompareRXYPseudo +class TestBinarySILPseudo : Pseudo<(outs), (ins bdaddr12only:$BD1, imm:$I2), [(set CC, (operator bdaddr12only:$BD1, imm:$I2))]>; @@ -4812,7 +4812,7 @@ class CondBinaryRRFaPseudo +class CondBinaryRIEPseudo : Pseudo<(outs cls:$R1), (ins cls:$R1src, imm:$I2, cond4:$valid, cond4:$M3), [(set cls:$R1, (z_select_ccmask imm:$I2, cls:$R1src, @@ -4876,7 +4876,7 @@ class SelectWrapper : Pseudo<(outs cls:$dst), (ins cls:$src1, cls:$src2, imm32zx4:$valid, imm32zx4:$cc), [(set (vt cls:$dst), (z_select_ccmask cls:$src1, cls:$src2, - imm32zx4:$valid, imm32zx4:$cc))]> { + imm32zx4_timm:$valid, imm32zx4_timm:$cc))]> { let usesCustomInserter = 1; let hasNoSchedulingInfo = 1; let Uses = [CC]; @@ -4890,12 +4890,12 @@ multiclass CondStores; def Inv : Pseudo<(outs), (ins cls:$new, mode:$addr, imm32zx4:$valid, imm32zx4:$cc), [(store (z_select_ccmask (load mode:$addr), cls:$new, - imm32zx4:$valid, imm32zx4:$cc), + imm32zx4_timm:$valid, imm32zx4_timm:$cc), mode:$addr)]>; } } @@ -4917,11 +4917,11 @@ class AtomicLoadBinary : AtomicLoadBinary; -class AtomicLoadBinaryImm32 +class AtomicLoadBinaryImm32 : AtomicLoadBinary; class AtomicLoadBinaryReg64 : AtomicLoadBinary; -class AtomicLoadBinaryImm64 +class AtomicLoadBinaryImm64 : AtomicLoadBinary; // OPERATOR is ATOMIC_SWAPW or an ATOMIC_LOADW_* operation. PAT and OPERAND @@ -4944,7 +4944,7 @@ class AtomicLoadWBinary : AtomicLoadWBinary; -class AtomicLoadWBinaryImm +class AtomicLoadWBinaryImm : AtomicLoadWBinary; // A pseudo instruction that is a direct alias of a real instruction. 
@@ -4979,7 +4979,7 @@ class StoreAliasVRX + ImmOpWithPattern imm> : Alias<4, (outs cls:$R1), (ins cls:$R1src, imm:$I2), [(set cls:$R1, (operator cls:$R1src, imm:$I2))]> { let Constraints = "$R1 = $R1src"; @@ -4987,7 +4987,7 @@ class BinaryAliasRI + ImmOpWithPattern imm> : Alias<6, (outs cls:$R1), (ins cls:$R1src, imm:$I2), [(set cls:$R1, (operator cls:$R1src, imm:$I2))]> { let Constraints = "$R1 = $R1src"; @@ -4999,7 +4999,7 @@ class BinaryAliasVRRf // An alias of a CompareRI, but with different register sizes. class CompareAliasRI + ImmOpWithPattern imm> : Alias<4, (outs), (ins cls:$R1, imm:$I2), [(set CC, (operator cls:$R1, imm:$I2))]> { let isCompare = 1; diff --git a/lib/Target/SystemZ/SystemZInstrInfo.cpp b/lib/Target/SystemZ/SystemZInstrInfo.cpp index 57c1cf4ec70a..bc783608d45b 100644 --- a/lib/Target/SystemZ/SystemZInstrInfo.cpp +++ b/lib/Target/SystemZ/SystemZInstrInfo.cpp @@ -46,22 +46,12 @@ using namespace llvm; #include "SystemZGenInstrInfo.inc" #define DEBUG_TYPE "systemz-II" -STATISTIC(LOCRMuxJumps, "Number of LOCRMux jump-sequences (lower is better)"); // Return a mask with Count low bits set. static uint64_t allOnes(unsigned int Count) { return Count == 0 ? 0 : (uint64_t(1) << (Count - 1) << 1) - 1; } -// Reg should be a 32-bit GPR. Return true if it is a high register rather -// than a low register. -static bool isHighReg(unsigned int Reg) { - if (SystemZ::GRH32BitRegClass.contains(Reg)) - return true; - assert(SystemZ::GR32BitRegClass.contains(Reg) && "Invalid GRX32"); - return false; -} - // Pin the vtable to this file. void SystemZInstrInfo::anchor() {} @@ -85,7 +75,7 @@ void SystemZInstrInfo::splitMove(MachineBasicBlock::iterator MI, // Set up the two 64-bit registers and remember super reg and its flags. MachineOperand &HighRegOp = EarlierMI->getOperand(0); MachineOperand &LowRegOp = MI->getOperand(0); - unsigned Reg128 = LowRegOp.getReg(); + Register Reg128 = LowRegOp.getReg(); unsigned Reg128Killed = getKillRegState(LowRegOp.isKill()); unsigned Reg128Undef = getUndefRegState(LowRegOp.isUndef()); HighRegOp.setReg(RI.getSubReg(HighRegOp.getReg(), SystemZ::subreg_h64)); @@ -147,8 +137,8 @@ void SystemZInstrInfo::splitAdjDynAlloc(MachineBasicBlock::iterator MI) const { void SystemZInstrInfo::expandRIPseudo(MachineInstr &MI, unsigned LowOpcode, unsigned HighOpcode, bool ConvertHigh) const { - unsigned Reg = MI.getOperand(0).getReg(); - bool IsHigh = isHighReg(Reg); + Register Reg = MI.getOperand(0).getReg(); + bool IsHigh = SystemZ::isHighReg(Reg); MI.setDesc(get(IsHigh ? HighOpcode : LowOpcode)); if (IsHigh && ConvertHigh) MI.getOperand(1).setImm(uint32_t(MI.getOperand(1).getImm())); @@ -161,10 +151,10 @@ void SystemZInstrInfo::expandRIPseudo(MachineInstr &MI, unsigned LowOpcode, void SystemZInstrInfo::expandRIEPseudo(MachineInstr &MI, unsigned LowOpcode, unsigned LowOpcodeK, unsigned HighOpcode) const { - unsigned DestReg = MI.getOperand(0).getReg(); - unsigned SrcReg = MI.getOperand(1).getReg(); - bool DestIsHigh = isHighReg(DestReg); - bool SrcIsHigh = isHighReg(SrcReg); + Register DestReg = MI.getOperand(0).getReg(); + Register SrcReg = MI.getOperand(1).getReg(); + bool DestIsHigh = SystemZ::isHighReg(DestReg); + bool SrcIsHigh = SystemZ::isHighReg(SrcReg); if (!DestIsHigh && !SrcIsHigh) MI.setDesc(get(LowOpcodeK)); else { @@ -184,9 +174,10 @@ void SystemZInstrInfo::expandRIEPseudo(MachineInstr &MI, unsigned LowOpcode, // is a high GR32. 
void SystemZInstrInfo::expandRXYPseudo(MachineInstr &MI, unsigned LowOpcode, unsigned HighOpcode) const { - unsigned Reg = MI.getOperand(0).getReg(); - unsigned Opcode = getOpcodeForOffset(isHighReg(Reg) ? HighOpcode : LowOpcode, - MI.getOperand(2).getImm()); + Register Reg = MI.getOperand(0).getReg(); + unsigned Opcode = getOpcodeForOffset( + SystemZ::isHighReg(Reg) ? HighOpcode : LowOpcode, + MI.getOperand(2).getImm()); MI.setDesc(get(Opcode)); } @@ -195,93 +186,11 @@ void SystemZInstrInfo::expandRXYPseudo(MachineInstr &MI, unsigned LowOpcode, // register is a low GR32 and HighOpcode if the register is a high GR32. void SystemZInstrInfo::expandLOCPseudo(MachineInstr &MI, unsigned LowOpcode, unsigned HighOpcode) const { - unsigned Reg = MI.getOperand(0).getReg(); - unsigned Opcode = isHighReg(Reg) ? HighOpcode : LowOpcode; + Register Reg = MI.getOperand(0).getReg(); + unsigned Opcode = SystemZ::isHighReg(Reg) ? HighOpcode : LowOpcode; MI.setDesc(get(Opcode)); } -// MI is a load-register-on-condition pseudo instruction. Replace it with -// LowOpcode if source and destination are both low GR32s and HighOpcode if -// source and destination are both high GR32s. -void SystemZInstrInfo::expandLOCRPseudo(MachineInstr &MI, unsigned LowOpcode, - unsigned HighOpcode) const { - unsigned DestReg = MI.getOperand(0).getReg(); - unsigned SrcReg = MI.getOperand(2).getReg(); - bool DestIsHigh = isHighReg(DestReg); - bool SrcIsHigh = isHighReg(SrcReg); - - if (!DestIsHigh && !SrcIsHigh) - MI.setDesc(get(LowOpcode)); - else if (DestIsHigh && SrcIsHigh) - MI.setDesc(get(HighOpcode)); - else - LOCRMuxJumps++; - - // If we were unable to implement the pseudo with a single instruction, we - // need to convert it back into a branch sequence. This cannot be done here - // since the caller of expandPostRAPseudo does not handle changes to the CFG - // correctly. This change is defered to the SystemZExpandPseudo pass. -} - -// MI is a select pseudo instruction. Replace it with LowOpcode if source -// and destination are all low GR32s and HighOpcode if source and destination -// are all high GR32s. Otherwise, use the two-operand MixedOpcode. -void SystemZInstrInfo::expandSELRPseudo(MachineInstr &MI, unsigned LowOpcode, - unsigned HighOpcode, - unsigned MixedOpcode) const { - unsigned DestReg = MI.getOperand(0).getReg(); - unsigned Src1Reg = MI.getOperand(1).getReg(); - unsigned Src2Reg = MI.getOperand(2).getReg(); - bool DestIsHigh = isHighReg(DestReg); - bool Src1IsHigh = isHighReg(Src1Reg); - bool Src2IsHigh = isHighReg(Src2Reg); - - // If sources and destination aren't all high or all low, we may be able to - // simplify the operation by moving one of the sources to the destination - // first. But only if this doesn't clobber the other source. - if (DestReg != Src1Reg && DestReg != Src2Reg) { - if (DestIsHigh != Src1IsHigh) { - emitGRX32Move(*MI.getParent(), MI, MI.getDebugLoc(), DestReg, Src1Reg, - SystemZ::LR, 32, MI.getOperand(1).isKill(), - MI.getOperand(1).isUndef()); - MI.getOperand(1).setReg(DestReg); - Src1Reg = DestReg; - Src1IsHigh = DestIsHigh; - } else if (DestIsHigh != Src2IsHigh) { - emitGRX32Move(*MI.getParent(), MI, MI.getDebugLoc(), DestReg, Src2Reg, - SystemZ::LR, 32, MI.getOperand(2).isKill(), - MI.getOperand(2).isUndef()); - MI.getOperand(2).setReg(DestReg); - Src2Reg = DestReg; - Src2IsHigh = DestIsHigh; - } - } - - // If the destination (now) matches one source, prefer this to be first. 
- if (DestReg != Src1Reg && DestReg == Src2Reg) { - commuteInstruction(MI, false, 1, 2); - std::swap(Src1Reg, Src2Reg); - std::swap(Src1IsHigh, Src2IsHigh); - } - - if (!DestIsHigh && !Src1IsHigh && !Src2IsHigh) - MI.setDesc(get(LowOpcode)); - else if (DestIsHigh && Src1IsHigh && Src2IsHigh) - MI.setDesc(get(HighOpcode)); - else { - // Given the simplifcation above, we must already have a two-operand case. - assert (DestReg == Src1Reg); - MI.setDesc(get(MixedOpcode)); - MI.tieOperands(0, 1); - LOCRMuxJumps++; - } - - // If we were unable to implement the pseudo with a single instruction, we - // need to convert it back into a branch sequence. This cannot be done here - // since the caller of expandPostRAPseudo does not handle changes to the CFG - // correctly. This change is defered to the SystemZExpandPseudo pass. -} - // MI is an RR-style pseudo instruction that zero-extends the low Size bits // of one GRX32 into another. Replace it with LowOpcode if both operands // are low registers, otherwise use RISB[LH]G. @@ -302,8 +211,8 @@ void SystemZInstrInfo::expandZExtPseudo(MachineInstr &MI, unsigned LowOpcode, void SystemZInstrInfo::expandLoadStackGuard(MachineInstr *MI) const { MachineBasicBlock *MBB = MI->getParent(); MachineFunction &MF = *MBB->getParent(); - const unsigned Reg64 = MI->getOperand(0).getReg(); - const unsigned Reg32 = RI.getSubReg(Reg64, SystemZ::subreg_l32); + const Register Reg64 = MI->getOperand(0).getReg(); + const Register Reg32 = RI.getSubReg(Reg64, SystemZ::subreg_l32); // EAR can only load the low subregister so us a shift for %a0 to produce // the GR containing %a0 and %a1. @@ -341,8 +250,8 @@ SystemZInstrInfo::emitGRX32Move(MachineBasicBlock &MBB, unsigned Size, bool KillSrc, bool UndefSrc) const { unsigned Opcode; - bool DestIsHigh = isHighReg(DestReg); - bool SrcIsHigh = isHighReg(SrcReg); + bool DestIsHigh = SystemZ::isHighReg(DestReg); + bool SrcIsHigh = SystemZ::isHighReg(SrcReg); if (DestIsHigh && SrcIsHigh) Opcode = SystemZ::RISBHH; else if (DestIsHigh && !SrcIsHigh) @@ -468,7 +377,7 @@ bool SystemZInstrInfo::analyzeBranch(MachineBasicBlock &MBB, // Can't handle indirect branches. SystemZII::Branch Branch(getBranchInfo(*I)); - if (!Branch.Target->isMBB()) + if (!Branch.hasMBBTarget()) return true; // Punt on compound branches. @@ -478,7 +387,7 @@ bool SystemZInstrInfo::analyzeBranch(MachineBasicBlock &MBB, if (Branch.CCMask == SystemZ::CCMASK_ANY) { // Handle unconditional branches. if (!AllowModify) { - TBB = Branch.Target->getMBB(); + TBB = Branch.getMBBTarget(); continue; } @@ -490,7 +399,7 @@ bool SystemZInstrInfo::analyzeBranch(MachineBasicBlock &MBB, FBB = nullptr; // Delete the JMP if it's equivalent to a fall-through. - if (MBB.isLayoutSuccessor(Branch.Target->getMBB())) { + if (MBB.isLayoutSuccessor(Branch.getMBBTarget())) { TBB = nullptr; I->eraseFromParent(); I = MBB.end(); @@ -498,7 +407,7 @@ bool SystemZInstrInfo::analyzeBranch(MachineBasicBlock &MBB, } // TBB is used to indicate the unconditinal destination. 
- TBB = Branch.Target->getMBB(); + TBB = Branch.getMBBTarget(); continue; } @@ -506,7 +415,7 @@ bool SystemZInstrInfo::analyzeBranch(MachineBasicBlock &MBB, if (Cond.empty()) { // FIXME: add X86-style branch swap FBB = TBB; - TBB = Branch.Target->getMBB(); + TBB = Branch.getMBBTarget(); Cond.push_back(MachineOperand::CreateImm(Branch.CCValid)); Cond.push_back(MachineOperand::CreateImm(Branch.CCMask)); continue; @@ -517,7 +426,7 @@ bool SystemZInstrInfo::analyzeBranch(MachineBasicBlock &MBB, // Only handle the case where all conditional branches branch to the same // destination. - if (TBB != Branch.Target->getMBB()) + if (TBB != Branch.getMBBTarget()) return true; // If the conditions are the same, we can leave them alone. @@ -547,7 +456,7 @@ unsigned SystemZInstrInfo::removeBranch(MachineBasicBlock &MBB, continue; if (!I->isBranch()) break; - if (!getBranchInfo(*I).Target->isMBB()) + if (!getBranchInfo(*I).hasMBBTarget()) break; // Remove the branch. I->eraseFromParent(); @@ -676,8 +585,8 @@ void SystemZInstrInfo::insertSelect(MachineBasicBlock &MBB, else { Opc = SystemZ::LOCR; MRI.constrainRegClass(DstReg, &SystemZ::GR32BitRegClass); - unsigned TReg = MRI.createVirtualRegister(&SystemZ::GR32BitRegClass); - unsigned FReg = MRI.createVirtualRegister(&SystemZ::GR32BitRegClass); + Register TReg = MRI.createVirtualRegister(&SystemZ::GR32BitRegClass); + Register FReg = MRI.createVirtualRegister(&SystemZ::GR32BitRegClass); BuildMI(MBB, I, DL, get(TargetOpcode::COPY), TReg).addReg(TrueReg); BuildMI(MBB, I, DL, get(TargetOpcode::COPY), FReg).addReg(FalseReg); TrueReg = TReg; @@ -1258,13 +1167,14 @@ MachineInstr *SystemZInstrInfo::foldMemoryOperandImpl( assert(NumOps == 3 && "Expected two source registers."); Register DstReg = MI.getOperand(0).getReg(); Register DstPhys = - (TRI->isVirtualRegister(DstReg) ? VRM->getPhys(DstReg) : DstReg); + (Register::isVirtualRegister(DstReg) ? VRM->getPhys(DstReg) : DstReg); Register SrcReg = (OpNum == 2 ? MI.getOperand(1).getReg() : ((OpNum == 1 && MI.isCommutable()) ? MI.getOperand(2).getReg() : Register())); if (DstPhys && !SystemZ::GRH32BitRegClass.contains(DstPhys) && SrcReg && - TRI->isVirtualRegister(SrcReg) && DstPhys == VRM->getPhys(SrcReg)) + Register::isVirtualRegister(SrcReg) && + DstPhys == VRM->getPhys(SrcReg)) NeedsCommute = (OpNum == 1); else MemOpcode = -1; @@ -1358,15 +1268,6 @@ bool SystemZInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { expandLOCPseudo(MI, SystemZ::LOCHI, SystemZ::LOCHHI); return true; - case SystemZ::LOCRMux: - expandLOCRPseudo(MI, SystemZ::LOCR, SystemZ::LOCFHR); - return true; - - case SystemZ::SELRMux: - expandSELRPseudo(MI, SystemZ::SELR, SystemZ::SELFHR, - SystemZ::LOCRMux); - return true; - case SystemZ::STCMux: expandRXYPseudo(MI, SystemZ::STC, SystemZ::STCH); return true; @@ -1468,8 +1369,8 @@ bool SystemZInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { return true; case SystemZ::RISBMux: { - bool DestIsHigh = isHighReg(MI.getOperand(0).getReg()); - bool SrcIsHigh = isHighReg(MI.getOperand(2).getReg()); + bool DestIsHigh = SystemZ::isHighReg(MI.getOperand(0).getReg()); + bool SrcIsHigh = SystemZ::isHighReg(MI.getOperand(2).getReg()); if (SrcIsHigh == DestIsHigh) MI.setDesc(get(DestIsHigh ? 
SystemZ::RISBHH : SystemZ::RISBLL)); else { @@ -1545,6 +1446,10 @@ SystemZInstrInfo::getBranchInfo(const MachineInstr &MI) const { return SystemZII::Branch(SystemZII::BranchCLG, SystemZ::CCMASK_ICMP, MI.getOperand(2).getImm(), &MI.getOperand(3)); + case SystemZ::INLINEASM_BR: + // Don't try to analyze asm goto, so pass nullptr as branch target argument. + return SystemZII::Branch(SystemZII::AsmGoto, 0, 0, nullptr); + default: llvm_unreachable("Unrecognized branch opcode"); } @@ -1845,8 +1750,7 @@ void SystemZInstrInfo::loadImmediate(MachineBasicBlock &MBB, bool SystemZInstrInfo:: areMemAccessesTriviallyDisjoint(const MachineInstr &MIa, - const MachineInstr &MIb, - AliasAnalysis *AA) const { + const MachineInstr &MIb) const { if (!MIa.hasOneMemOperand() || !MIb.hasOneMemOperand()) return false; diff --git a/lib/Target/SystemZ/SystemZInstrInfo.h b/lib/Target/SystemZ/SystemZInstrInfo.h index 2edde175542e..6dc6e72aa52a 100644 --- a/lib/Target/SystemZ/SystemZInstrInfo.h +++ b/lib/Target/SystemZ/SystemZInstrInfo.h @@ -100,11 +100,18 @@ enum BranchType { // An instruction that decrements a 64-bit register and branches if // the result is nonzero. - BranchCTG + BranchCTG, + + // An instruction representing an asm goto statement. + AsmGoto }; // Information about a branch instruction. -struct Branch { +class Branch { + // The target of the branch. In case of INLINEASM_BR, this is nullptr. + const MachineOperand *Target; + +public: // The type of the branch. BranchType Type; @@ -114,12 +121,15 @@ struct Branch { // CCMASK_ is set if the branch should be taken when CC == N. unsigned CCMask; - // The target of the branch. - const MachineOperand *Target; - Branch(BranchType type, unsigned ccValid, unsigned ccMask, const MachineOperand *target) - : Type(type), CCValid(ccValid), CCMask(ccMask), Target(target) {} + : Target(target), Type(type), CCValid(ccValid), CCMask(ccMask) {} + + bool isIndirect() { return Target != nullptr && Target->isReg(); } + bool hasMBBTarget() { return Target != nullptr && Target->isMBB(); } + MachineBasicBlock *getMBBTarget() { + return hasMBBTarget() ? Target->getMBB() : nullptr; + } }; // Kinds of fused compares in compare-and-* instructions. Together with type @@ -160,10 +170,6 @@ class SystemZInstrInfo : public SystemZGenInstrInfo { unsigned HighOpcode) const; void expandLOCPseudo(MachineInstr &MI, unsigned LowOpcode, unsigned HighOpcode) const; - void expandLOCRPseudo(MachineInstr &MI, unsigned LowOpcode, - unsigned HighOpcode) const; - void expandSELRPseudo(MachineInstr &MI, unsigned LowOpcode, - unsigned HighOpcode, unsigned MixedOpcode) const; void expandZExtPseudo(MachineInstr &MI, unsigned LowOpcode, unsigned Size) const; void expandLoadStackGuard(MachineInstr *MI) const; @@ -322,8 +328,7 @@ public: // memory addresses and false otherwise. 
bool areMemAccessesTriviallyDisjoint(const MachineInstr &MIa, - const MachineInstr &MIb, - AliasAnalysis *AA = nullptr) const override; + const MachineInstr &MIb) const override; }; } // end namespace llvm diff --git a/lib/Target/SystemZ/SystemZInstrInfo.td b/lib/Target/SystemZ/SystemZInstrInfo.td index 91856893e3bd..8b334756611a 100644 --- a/lib/Target/SystemZ/SystemZInstrInfo.td +++ b/lib/Target/SystemZ/SystemZInstrInfo.td @@ -337,15 +337,15 @@ defm CondStore8Mux : CondStores, Requires<[FeatureHighWord]>; -defm CondStore32Mux : CondStores, +defm CondStore32Mux : CondStores, Requires<[FeatureLoadStoreOnCond2]>; defm CondStore8 : CondStores; defm CondStore16 : CondStores; -defm CondStore32 : CondStores; +defm CondStore32 : CondStores; defm : CondStores64; @@ -353,8 +353,8 @@ defm : CondStores64; defm : CondStores64; -defm CondStore64 : CondStores; +defm CondStore64 : CondStores; //===----------------------------------------------------------------------===// // Move instructions @@ -531,8 +531,8 @@ let Predicates = [FeatureLoadStoreOnCond2], Uses = [CC] in { // Load on condition. Matched via DAG pattern. // Expands to LOC or LOCFH, depending on the choice of register. - def LOCMux : CondUnaryRSYPseudo; - defm LOCFH : CondUnaryRSYPair<"locfh", 0xEBE0, nonvolatile_load, GRH32, 4>; + def LOCMux : CondUnaryRSYPseudo; + defm LOCFH : CondUnaryRSYPair<"locfh", 0xEBE0, simple_load, GRH32, 4>; // Store on condition. Expanded from CondStore* pseudos. // Expands to STOC or STOCFH, depending on the choice of register. @@ -563,8 +563,8 @@ let Predicates = [FeatureLoadStoreOnCond], Uses = [CC] in { } // Load on condition. Matched via DAG pattern. - defm LOC : CondUnaryRSYPair<"loc", 0xEBF2, nonvolatile_load, GR32, 4>; - defm LOCG : CondUnaryRSYPair<"locg", 0xEBE2, nonvolatile_load, GR64, 8>; + defm LOC : CondUnaryRSYPair<"loc", 0xEBF2, simple_load, GR32, 4>; + defm LOCG : CondUnaryRSYPair<"locg", 0xEBE2, simple_load, GR64, 8>; // Store on condition. Expanded from CondStore* pseudos. defm STOC : CondStoreRSYPair<"stoc", 0xEBF3, GR32, 4>; @@ -2082,7 +2082,7 @@ let Predicates = [FeatureProcessorAssist] in { // cleared. We only use the first result here. let Defs = [CC] in def FLOGR : UnaryRRE<"flogr", 0xB983, null_frag, GR128, GR64>; -def : Pat<(ctlz GR64:$src), +def : Pat<(i64 (ctlz GR64:$src)), (EXTRACT_SUBREG (FLOGR GR64:$src), subreg_h64)>; // Population count. Counts bits set per byte or doubleword. diff --git a/lib/Target/SystemZ/SystemZInstrVector.td b/lib/Target/SystemZ/SystemZInstrVector.td index 261727f89058..02364bbda5c1 100644 --- a/lib/Target/SystemZ/SystemZInstrVector.td +++ b/lib/Target/SystemZ/SystemZInstrVector.td @@ -60,7 +60,7 @@ let Predicates = [FeatureVector] in { // Generate byte mask. def VZERO : InherentVRIa<"vzero", 0xE744, 0>; def VONE : InherentVRIa<"vone", 0xE744, 0xffff>; - def VGBM : UnaryVRIa<"vgbm", 0xE744, z_byte_mask, v128b, imm32zx16>; + def VGBM : UnaryVRIa<"vgbm", 0xE744, z_byte_mask, v128b, imm32zx16_timm>; // Generate mask. def VGM : BinaryVRIbGeneric<"vgm", 0xE746>; @@ -71,10 +71,10 @@ let Predicates = [FeatureVector] in { // Replicate immediate. 
def VREPI : UnaryVRIaGeneric<"vrepi", 0xE745, imm32sx16>; - def VREPIB : UnaryVRIa<"vrepib", 0xE745, z_replicate, v128b, imm32sx16, 0>; - def VREPIH : UnaryVRIa<"vrepih", 0xE745, z_replicate, v128h, imm32sx16, 1>; - def VREPIF : UnaryVRIa<"vrepif", 0xE745, z_replicate, v128f, imm32sx16, 2>; - def VREPIG : UnaryVRIa<"vrepig", 0xE745, z_replicate, v128g, imm32sx16, 3>; + def VREPIB : UnaryVRIa<"vrepib", 0xE745, z_replicate, v128b, imm32sx16_timm, 0>; + def VREPIH : UnaryVRIa<"vrepih", 0xE745, z_replicate, v128h, imm32sx16_timm, 1>; + def VREPIF : UnaryVRIa<"vrepif", 0xE745, z_replicate, v128f, imm32sx16_timm, 2>; + def VREPIG : UnaryVRIa<"vrepig", 0xE745, z_replicate, v128g, imm32sx16_timm, 3>; } // Load element immediate. @@ -116,7 +116,7 @@ let Predicates = [FeatureVector] in { (ins bdxaddr12only:$XBD2, imm32zx4:$M3), "lcbb\t$R1, $XBD2, $M3", [(set GR32:$R1, (int_s390_lcbb bdxaddr12only:$XBD2, - imm32zx4:$M3))]>; + imm32zx4_timm:$M3))]>; // Load with length. The number of loaded bytes is only known at run time. def VLL : BinaryVRSb<"vll", 0xE737, int_s390_vll, 0>; @@ -362,9 +362,9 @@ let Predicates = [FeatureVector] in { def VREPH : BinaryVRIc<"vreph", 0xE74D, z_splat, v128h, v128h, 1>; def VREPF : BinaryVRIc<"vrepf", 0xE74D, z_splat, v128f, v128f, 2>; def VREPG : BinaryVRIc<"vrepg", 0xE74D, z_splat, v128g, v128g, 3>; - def : Pat<(v4f32 (z_splat VR128:$vec, imm32zx16:$index)), + def : Pat<(v4f32 (z_splat VR128:$vec, imm32zx16_timm:$index)), (VREPF VR128:$vec, imm32zx16:$index)>; - def : Pat<(v2f64 (z_splat VR128:$vec, imm32zx16:$index)), + def : Pat<(v2f64 (z_splat VR128:$vec, imm32zx16_timm:$index)), (VREPG VR128:$vec, imm32zx16:$index)>; // Select. @@ -778,7 +778,7 @@ let Predicates = [FeatureVector] in { // Shift left double by byte. def VSLDB : TernaryVRId<"vsldb", 0xE777, z_shl_double, v128b, v128b, 0>; - def : Pat<(int_s390_vsldb VR128:$x, VR128:$y, imm32zx8:$z), + def : Pat<(int_s390_vsldb VR128:$x, VR128:$y, imm32zx8_timm:$z), (VSLDB VR128:$x, VR128:$y, imm32zx8:$z)>; // Shift left double by bit. @@ -1069,7 +1069,7 @@ let Predicates = [FeatureVector] in { def WCGDB : TernaryVRRa<"wcgdb", 0xE7C2, null_frag, v64g, v64db, 3, 8>; } // Rounding mode should agree with SystemZInstrFP.td. - def : FPConversion; + def : FPConversion; let Predicates = [FeatureVectorEnhancements2] in { let Uses = [FPC], mayRaiseFPException = 1 in { let isAsmParserOnly = 1 in @@ -1078,7 +1078,7 @@ let Predicates = [FeatureVector] in { def WCFEB : TernaryVRRa<"wcfeb", 0xE7C2, null_frag, v32sb, v32f, 2, 8>; } // Rounding mode should agree with SystemZInstrFP.td. - def : FPConversion; + def : FPConversion; } // Convert to logical. @@ -1088,7 +1088,7 @@ let Predicates = [FeatureVector] in { def WCLGDB : TernaryVRRa<"wclgdb", 0xE7C0, null_frag, v64g, v64db, 3, 8>; } // Rounding mode should agree with SystemZInstrFP.td. - def : FPConversion; + def : FPConversion; let Predicates = [FeatureVectorEnhancements2] in { let Uses = [FPC], mayRaiseFPException = 1 in { let isAsmParserOnly = 1 in @@ -1097,7 +1097,7 @@ let Predicates = [FeatureVector] in { def WCLFEB : TernaryVRRa<"wclfeb", 0xE7C0, null_frag, v32sb, v32f, 2, 8>; } // Rounding mode should agree with SystemZInstrFP.td. - def : FPConversion; + def : FPConversion; } // Divide. diff --git a/lib/Target/SystemZ/SystemZLongBranch.cpp b/lib/Target/SystemZ/SystemZLongBranch.cpp index 95d7e22dec32..724111229569 100644 --- a/lib/Target/SystemZ/SystemZLongBranch.cpp +++ b/lib/Target/SystemZ/SystemZLongBranch.cpp @@ -85,9 +85,9 @@ struct MBBInfo { // This value never changes. 
uint64_t Size = 0; - // The minimum alignment of the block, as a log2 value. + // The minimum alignment of the block. // This value never changes. - unsigned Alignment = 0; + Align Alignment; // The number of terminators in this block. This value never changes. unsigned NumTerminators = 0; @@ -127,7 +127,8 @@ struct BlockPosition { // as the runtime address. unsigned KnownBits; - BlockPosition(unsigned InitialAlignment) : KnownBits(InitialAlignment) {} + BlockPosition(unsigned InitialLogAlignment) + : KnownBits(InitialLogAlignment) {} }; class SystemZLongBranch : public MachineFunctionPass { @@ -178,17 +179,16 @@ const uint64_t MaxForwardRange = 0xfffe; // instructions. void SystemZLongBranch::skipNonTerminators(BlockPosition &Position, MBBInfo &Block) { - if (Block.Alignment > Position.KnownBits) { + if (Log2(Block.Alignment) > Position.KnownBits) { // When calculating the address of Block, we need to conservatively // assume that Block had the worst possible misalignment. - Position.Address += ((uint64_t(1) << Block.Alignment) - - (uint64_t(1) << Position.KnownBits)); - Position.KnownBits = Block.Alignment; + Position.Address += + (Block.Alignment.value() - (uint64_t(1) << Position.KnownBits)); + Position.KnownBits = Log2(Block.Alignment); } // Align the addresses. - uint64_t AlignMask = (uint64_t(1) << Block.Alignment) - 1; - Position.Address = (Position.Address + AlignMask) & ~AlignMask; + Position.Address = alignTo(Position.Address, Block.Alignment); // Record the block's position. Block.Address = Position.Address; @@ -257,7 +257,7 @@ TerminatorInfo SystemZLongBranch::describeTerminator(MachineInstr &MI) { } Terminator.Branch = &MI; Terminator.TargetBlock = - TII->getBranchInfo(MI).Target->getMBB()->getNumber(); + TII->getBranchInfo(MI).getMBBTarget()->getNumber(); } return Terminator; } @@ -275,7 +275,7 @@ uint64_t SystemZLongBranch::initMBBInfo() { Terminators.clear(); Terminators.reserve(NumBlocks); - BlockPosition Position(MF->getAlignment()); + BlockPosition Position(Log2(MF->getAlignment())); for (unsigned I = 0; I < NumBlocks; ++I) { MachineBasicBlock *MBB = MF->getBlockNumbered(I); MBBInfo &Block = MBBs[I]; @@ -339,7 +339,7 @@ bool SystemZLongBranch::mustRelaxABranch() { // must be long. void SystemZLongBranch::setWorstCaseAddresses() { SmallVector::iterator TI = Terminators.begin(); - BlockPosition Position(MF->getAlignment()); + BlockPosition Position(Log2(MF->getAlignment())); for (auto &Block : MBBs) { skipNonTerminators(Position, Block); for (unsigned BTI = 0, BTE = Block.NumTerminators; BTI != BTE; ++BTI) { @@ -440,7 +440,7 @@ void SystemZLongBranch::relaxBranch(TerminatorInfo &Terminator) { // Run a shortening pass and relax any branches that need to be relaxed. 
void SystemZLongBranch::relaxBranches() { SmallVector::iterator TI = Terminators.begin(); - BlockPosition Position(MF->getAlignment()); + BlockPosition Position(Log2(MF->getAlignment())); for (auto &Block : MBBs) { skipNonTerminators(Position, Block); for (unsigned BTI = 0, BTE = Block.NumTerminators; BTI != BTE; ++BTI) { diff --git a/lib/Target/SystemZ/SystemZMachineScheduler.cpp b/lib/Target/SystemZ/SystemZMachineScheduler.cpp index 0becfaa1d49c..3fc25034dded 100644 --- a/lib/Target/SystemZ/SystemZMachineScheduler.cpp +++ b/lib/Target/SystemZ/SystemZMachineScheduler.cpp @@ -15,6 +15,7 @@ //===----------------------------------------------------------------------===// #include "SystemZMachineScheduler.h" +#include "llvm/CodeGen/MachineLoopInfo.h" using namespace llvm; @@ -108,8 +109,8 @@ void SystemZPostRASchedStrategy::enterMBB(MachineBasicBlock *NextMBB) { I != SinglePredMBB->end(); I++) { LLVM_DEBUG(dbgs() << "** Emitting incoming branch: "; I->dump();); bool TakenBranch = (I->isBranch() && - (TII->getBranchInfo(*I).Target->isReg() || // Relative branch - TII->getBranchInfo(*I).Target->getMBB() == MBB)); + (TII->getBranchInfo(*I).isIndirect() || + TII->getBranchInfo(*I).getMBBTarget() == MBB)); HazardRec->emitInstruction(&*I, TakenBranch); if (TakenBranch) break; diff --git a/lib/Target/SystemZ/SystemZOperands.td b/lib/Target/SystemZ/SystemZOperands.td index 56632e1529a2..b2bab68a6274 100644 --- a/lib/Target/SystemZ/SystemZOperands.td +++ b/lib/Target/SystemZ/SystemZOperands.td @@ -21,15 +21,32 @@ class ImmediateTLSAsmOperand let RenderMethod = "addImmTLSOperands"; } +class ImmediateOp : Operand { + let PrintMethod = "print"##asmop##"Operand"; + let DecoderMethod = "decode"##asmop##"Operand"; + let ParserMatchClass = !cast(asmop); +} + +class ImmOpWithPattern : + ImmediateOp, PatLeaf<(vt ImmNode), pred, xform>; + +// class ImmediatePatLeaf +// : PatLeaf<(vt ImmNode), pred, xform>; + + // Constructs both a DAG pattern and instruction operand for an immediate // of type VT. PRED returns true if a node is acceptable and XFORM returns // the operand value associated with the node. ASMOP is the name of the // associated asm operand, and also forms the basis of the asm print method. -class Immediate - : PatLeaf<(vt imm), pred, xform>, Operand { - let PrintMethod = "print"##asmop##"Operand"; - let DecoderMethod = "decode"##asmop##"Operand"; - let ParserMatchClass = !cast(asmop); +multiclass Immediate { + // def "" : ImmediateOp, + // PatLeaf<(vt imm), pred, xform>; + def "" : ImmOpWithPattern; + +// def _timm : PatLeaf<(vt timm), pred, xform>; + def _timm : ImmOpWithPattern; } // Constructs an asm operand for a PC-relative address. SIZE says how @@ -295,87 +312,87 @@ def U48Imm : ImmediateAsmOperand<"U48Imm">; // Immediates for the lower and upper 16 bits of an i32, with the other // bits of the i32 being zero. -def imm32ll16 : ImmediategetZExtValue()); }], LL16, "U16Imm">; -def imm32lh16 : ImmediategetZExtValue()); }], LH16, "U16Imm">; // Immediates for the lower and upper 16 bits of an i32, with the other // bits of the i32 being one. 
-def imm32ll16c : ImmediategetZExtValue())); }], LL16, "U16Imm">; -def imm32lh16c : ImmediategetZExtValue())); }], LH16, "U16Imm">; // Short immediates -def imm32zx1 : Immediate(N->getZExtValue()); }], NOOP_SDNodeXForm, "U1Imm">; -def imm32zx2 : Immediate(N->getZExtValue()); }], NOOP_SDNodeXForm, "U2Imm">; -def imm32zx3 : Immediate(N->getZExtValue()); }], NOOP_SDNodeXForm, "U3Imm">; -def imm32zx4 : Immediate(N->getZExtValue()); }], NOOP_SDNodeXForm, "U4Imm">; // Note: this enforces an even value during code generation only. // When used from the assembler, any 4-bit value is allowed. -def imm32zx4even : Immediate(N->getZExtValue()); }], UIMM8EVEN, "U4Imm">; -def imm32zx6 : Immediate(N->getZExtValue()); }], NOOP_SDNodeXForm, "U6Imm">; -def imm32sx8 : Immediate(N->getSExtValue()); }], SIMM8, "S8Imm">; -def imm32zx8 : Immediate(N->getZExtValue()); }], UIMM8, "U8Imm">; -def imm32zx8trunc : Immediate; +defm imm32zx8trunc : Immediate; -def imm32zx12 : Immediate(N->getZExtValue()); }], UIMM12, "U12Imm">; -def imm32sx16 : Immediate(N->getSExtValue()); }], SIMM16, "S16Imm">; -def imm32sx16n : Immediate(-N->getSExtValue()); }], NEGSIMM16, "S16Imm">; -def imm32zx16 : Immediate(N->getZExtValue()); }], UIMM16, "U16Imm">; -def imm32sx16trunc : Immediate; -def imm32zx16trunc : Immediate; +defm imm32sx16trunc : Immediate; +defm imm32zx16trunc : Immediate; // Full 32-bit immediates. we need both signed and unsigned versions // because the assembler is picky. E.g. AFI requires signed operands // while NILF requires unsigned ones. -def simm32 : Immediate; -def uimm32 : Immediate; +defm simm32 : Immediate; +defm uimm32 : Immediate; -def simm32n : Immediate(-N->getSExtValue()); }], NEGSIMM32, "S32Imm">; @@ -387,107 +404,107 @@ def imm32 : ImmLeaf; // Immediates for 16-bit chunks of an i64, with the other bits of the // i32 being zero. -def imm64ll16 : ImmediategetZExtValue()); }], LL16, "U16Imm">; -def imm64lh16 : ImmediategetZExtValue()); }], LH16, "U16Imm">; -def imm64hl16 : ImmediategetZExtValue()); }], HL16, "U16Imm">; -def imm64hh16 : ImmediategetZExtValue()); }], HH16, "U16Imm">; // Immediates for 16-bit chunks of an i64, with the other bits of the // i32 being one. -def imm64ll16c : ImmediategetZExtValue())); }], LL16, "U16Imm">; -def imm64lh16c : ImmediategetZExtValue())); }], LH16, "U16Imm">; -def imm64hl16c : ImmediategetZExtValue())); }], HL16, "U16Imm">; -def imm64hh16c : ImmediategetZExtValue())); }], HH16, "U16Imm">; // Immediates for the lower and upper 32 bits of an i64, with the other // bits of the i32 being zero. -def imm64lf32 : ImmediategetZExtValue()); }], LF32, "U32Imm">; -def imm64hf32 : ImmediategetZExtValue()); }], HF32, "U32Imm">; // Immediates for the lower and upper 32 bits of an i64, with the other // bits of the i32 being one. -def imm64lf32c : ImmediategetZExtValue())); }], LF32, "U32Imm">; -def imm64hf32c : ImmediategetZExtValue())); }], HF32, "U32Imm">; // Negated immediates that fit LF32 or LH16. -def imm64lh16n : ImmediategetZExtValue())); }], NEGLH16, "U16Imm">; -def imm64lf32n : ImmediategetZExtValue())); }], NEGLF32, "U32Imm">; // Short immediates. 
-def imm64sx8 : Immediate(N->getSExtValue()); }], SIMM8, "S8Imm">; -def imm64zx8 : Immediate(N->getSExtValue()); }], UIMM8, "U8Imm">; -def imm64sx16 : Immediate(N->getSExtValue()); }], SIMM16, "S16Imm">; -def imm64sx16n : Immediate(-N->getSExtValue()); }], NEGSIMM16, "S16Imm">; -def imm64zx16 : Immediate(N->getZExtValue()); }], UIMM16, "U16Imm">; -def imm64sx32 : Immediate(N->getSExtValue()); }], SIMM32, "S32Imm">; -def imm64sx32n : Immediate(-N->getSExtValue()); }], NEGSIMM32, "S32Imm">; -def imm64zx32 : Immediate(N->getZExtValue()); }], UIMM32, "U32Imm">; -def imm64zx32n : Immediate(-N->getSExtValue()); }], NEGUIMM32, "U32Imm">; -def imm64zx48 : Immediate(N->getZExtValue()); }], UIMM48, "U48Imm">; @@ -637,7 +654,7 @@ def bdvaddr12only : BDVMode< "64", "12">; //===----------------------------------------------------------------------===// // A 4-bit condition-code mask. -def cond4 : PatLeaf<(i32 imm), [{ return (N->getZExtValue() < 16); }]>, +def cond4 : PatLeaf<(i32 timm), [{ return (N->getZExtValue() < 16); }]>, Operand { let PrintMethod = "printCond4Operand"; } diff --git a/lib/Target/SystemZ/SystemZOperators.td b/lib/Target/SystemZ/SystemZOperators.td index 15bd12bc98a4..6fe383e64b74 100644 --- a/lib/Target/SystemZ/SystemZOperators.td +++ b/lib/Target/SystemZ/SystemZOperators.td @@ -472,17 +472,17 @@ def z_subcarry : PatFrag<(ops node:$lhs, node:$rhs), (z_subcarry_1 node:$lhs, node:$rhs, CC)>; // Signed and unsigned comparisons. -def z_scmp : PatFrag<(ops node:$a, node:$b), (z_icmp node:$a, node:$b, imm), [{ +def z_scmp : PatFrag<(ops node:$a, node:$b), (z_icmp node:$a, node:$b, timm), [{ unsigned Type = cast(N->getOperand(2))->getZExtValue(); return Type != SystemZICMP::UnsignedOnly; }]>; -def z_ucmp : PatFrag<(ops node:$a, node:$b), (z_icmp node:$a, node:$b, imm), [{ +def z_ucmp : PatFrag<(ops node:$a, node:$b), (z_icmp node:$a, node:$b, timm), [{ unsigned Type = cast(N->getOperand(2))->getZExtValue(); return Type != SystemZICMP::SignedOnly; }]>; // Register- and memory-based TEST UNDER MASK. -def z_tm_reg : PatFrag<(ops node:$a, node:$b), (z_tm node:$a, node:$b, imm)>; +def z_tm_reg : PatFrag<(ops node:$a, node:$b), (z_tm node:$a, node:$b, timm)>; def z_tm_mem : PatFrag<(ops node:$a, node:$b), (z_tm node:$a, node:$b, 0)>; // Register sign-extend operations. Sub-32-bit values are represented as i32s. 
diff --git a/lib/Target/SystemZ/SystemZPatterns.td b/lib/Target/SystemZ/SystemZPatterns.td index beaf4de285a3..65300fb47627 100644 --- a/lib/Target/SystemZ/SystemZPatterns.td +++ b/lib/Target/SystemZ/SystemZPatterns.td @@ -100,12 +100,12 @@ multiclass CondStores64 { def : Pat<(store (z_select_ccmask GR64:$new, (load mode:$addr), - imm32zx4:$valid, imm32zx4:$cc), + imm32zx4_timm:$valid, imm32zx4_timm:$cc), mode:$addr), (insn (EXTRACT_SUBREG GR64:$new, subreg_l32), mode:$addr, imm32zx4:$valid, imm32zx4:$cc)>; def : Pat<(store (z_select_ccmask (load mode:$addr), GR64:$new, - imm32zx4:$valid, imm32zx4:$cc), + imm32zx4_timm:$valid, imm32zx4_timm:$cc), mode:$addr), (insninv (EXTRACT_SUBREG GR64:$new, subreg_l32), mode:$addr, imm32zx4:$valid, imm32zx4:$cc)>; diff --git a/lib/Target/SystemZ/SystemZPostRewrite.cpp b/lib/Target/SystemZ/SystemZPostRewrite.cpp index 8e4060eac74c..aaa7f8fc88f5 100644 --- a/lib/Target/SystemZ/SystemZPostRewrite.cpp +++ b/lib/Target/SystemZ/SystemZPostRewrite.cpp @@ -25,6 +25,7 @@ using namespace llvm; #define DEBUG_TYPE "systemz-postrewrite" STATISTIC(MemFoldCopies, "Number of copies inserted before folded mem ops."); +STATISTIC(LOCRMuxJumps, "Number of LOCRMux jump-sequences (lower is better)"); namespace llvm { void initializeSystemZPostRewritePass(PassRegistry&); @@ -45,12 +46,20 @@ public: StringRef getPassName() const override { return SYSTEMZ_POSTREWRITE_NAME; } - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.setPreservesAll(); - MachineFunctionPass::getAnalysisUsage(AU); - } - private: + void selectLOCRMux(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + MachineBasicBlock::iterator &NextMBBI, + unsigned LowOpcode, + unsigned HighOpcode); + void selectSELRMux(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + MachineBasicBlock::iterator &NextMBBI, + unsigned LowOpcode, + unsigned HighOpcode); + bool expandCondMove(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + MachineBasicBlock::iterator &NextMBBI); bool selectMI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, MachineBasicBlock::iterator &NextMBBI); bool selectMBB(MachineBasicBlock &MBB); @@ -68,11 +77,141 @@ FunctionPass *llvm::createSystemZPostRewritePass(SystemZTargetMachine &TM) { return new SystemZPostRewrite(); } +// MI is a load-register-on-condition pseudo instruction. Replace it with +// LowOpcode if source and destination are both low GR32s and HighOpcode if +// source and destination are both high GR32s. Otherwise, a branch sequence +// is created. +void SystemZPostRewrite::selectLOCRMux(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + MachineBasicBlock::iterator &NextMBBI, + unsigned LowOpcode, + unsigned HighOpcode) { + Register DestReg = MBBI->getOperand(0).getReg(); + Register SrcReg = MBBI->getOperand(2).getReg(); + bool DestIsHigh = SystemZ::isHighReg(DestReg); + bool SrcIsHigh = SystemZ::isHighReg(SrcReg); + + if (!DestIsHigh && !SrcIsHigh) + MBBI->setDesc(TII->get(LowOpcode)); + else if (DestIsHigh && SrcIsHigh) + MBBI->setDesc(TII->get(HighOpcode)); + else + expandCondMove(MBB, MBBI, NextMBBI); +} + +// MI is a select pseudo instruction. Replace it with LowOpcode if source +// and destination are all low GR32s and HighOpcode if source and destination +// are all high GR32s. Otherwise, a branch sequence is created. 
+void SystemZPostRewrite::selectSELRMux(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + MachineBasicBlock::iterator &NextMBBI, + unsigned LowOpcode, + unsigned HighOpcode) { + Register DestReg = MBBI->getOperand(0).getReg(); + Register Src1Reg = MBBI->getOperand(1).getReg(); + Register Src2Reg = MBBI->getOperand(2).getReg(); + bool DestIsHigh = SystemZ::isHighReg(DestReg); + bool Src1IsHigh = SystemZ::isHighReg(Src1Reg); + bool Src2IsHigh = SystemZ::isHighReg(Src2Reg); + + // If sources and destination aren't all high or all low, we may be able to + // simplify the operation by moving one of the sources to the destination + // first. But only if this doesn't clobber the other source. + if (DestReg != Src1Reg && DestReg != Src2Reg) { + if (DestIsHigh != Src1IsHigh) { + BuildMI(*MBBI->getParent(), MBBI, MBBI->getDebugLoc(), + TII->get(SystemZ::COPY), DestReg) + .addReg(MBBI->getOperand(1).getReg(), getRegState(MBBI->getOperand(1))); + MBBI->getOperand(1).setReg(DestReg); + Src1Reg = DestReg; + Src1IsHigh = DestIsHigh; + } else if (DestIsHigh != Src2IsHigh) { + BuildMI(*MBBI->getParent(), MBBI, MBBI->getDebugLoc(), + TII->get(SystemZ::COPY), DestReg) + .addReg(MBBI->getOperand(2).getReg(), getRegState(MBBI->getOperand(2))); + MBBI->getOperand(2).setReg(DestReg); + Src2Reg = DestReg; + Src2IsHigh = DestIsHigh; + } + } + + // If the destination (now) matches one source, prefer this to be first. + if (DestReg != Src1Reg && DestReg == Src2Reg) { + TII->commuteInstruction(*MBBI, false, 1, 2); + std::swap(Src1Reg, Src2Reg); + std::swap(Src1IsHigh, Src2IsHigh); + } + + if (!DestIsHigh && !Src1IsHigh && !Src2IsHigh) + MBBI->setDesc(TII->get(LowOpcode)); + else if (DestIsHigh && Src1IsHigh && Src2IsHigh) + MBBI->setDesc(TII->get(HighOpcode)); + else + // Given the simplification above, we must already have a two-operand case. + expandCondMove(MBB, MBBI, NextMBBI); +} + +// Replace MBBI by a branch sequence that performs a conditional move of +// operand 2 to the destination register. Operand 1 is expected to be the +// same register as the destination. +bool SystemZPostRewrite::expandCondMove(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + MachineBasicBlock::iterator &NextMBBI) { + MachineFunction &MF = *MBB.getParent(); + const BasicBlock *BB = MBB.getBasicBlock(); + MachineInstr &MI = *MBBI; + DebugLoc DL = MI.getDebugLoc(); + Register DestReg = MI.getOperand(0).getReg(); + Register SrcReg = MI.getOperand(2).getReg(); + unsigned CCValid = MI.getOperand(3).getImm(); + unsigned CCMask = MI.getOperand(4).getImm(); + assert(DestReg == MI.getOperand(1).getReg() && + "Expected destination and first source operand to be the same."); + + LivePhysRegs LiveRegs(TII->getRegisterInfo()); + LiveRegs.addLiveOuts(MBB); + for (auto I = std::prev(MBB.end()); I != MBBI; --I) + LiveRegs.stepBackward(*I); + + // Splice MBB at MI, moving the rest of the block into RestMBB. + MachineBasicBlock *RestMBB = MF.CreateMachineBasicBlock(BB); + MF.insert(std::next(MachineFunction::iterator(MBB)), RestMBB); + RestMBB->splice(RestMBB->begin(), &MBB, MI, MBB.end()); + RestMBB->transferSuccessors(&MBB); + for (auto I = LiveRegs.begin(); I != LiveRegs.end(); ++I) + RestMBB->addLiveIn(*I); + + // Create a new block MoveMBB to hold the move instruction. 
+ MachineBasicBlock *MoveMBB = MF.CreateMachineBasicBlock(BB); + MF.insert(std::next(MachineFunction::iterator(MBB)), MoveMBB); + MoveMBB->addLiveIn(SrcReg); + for (auto I = LiveRegs.begin(); I != LiveRegs.end(); ++I) + MoveMBB->addLiveIn(*I); + + // At the end of MBB, create a conditional branch to RestMBB if the + // condition is false, otherwise fall through to MoveMBB. + BuildMI(&MBB, DL, TII->get(SystemZ::BRC)) + .addImm(CCValid).addImm(CCMask ^ CCValid).addMBB(RestMBB); + MBB.addSuccessor(RestMBB); + MBB.addSuccessor(MoveMBB); + + // In MoveMBB, emit an instruction to move SrcReg into DestReg, + // then fall through to RestMBB. + BuildMI(*MoveMBB, MoveMBB->end(), DL, TII->get(SystemZ::COPY), DestReg) + .addReg(MI.getOperand(2).getReg(), getRegState(MI.getOperand(2))); + MoveMBB->addSuccessor(RestMBB); + + NextMBBI = MBB.end(); + MI.eraseFromParent(); + LOCRMuxJumps++; + return true; +} + /// If MBBI references a pseudo instruction that should be selected here, /// do it and return true. Otherwise return false. bool SystemZPostRewrite::selectMI(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, - MachineBasicBlock::iterator &NextMBBI) { + MachineBasicBlock::iterator MBBI, + MachineBasicBlock::iterator &NextMBBI) { MachineInstr &MI = *MBBI; unsigned Opcode = MI.getOpcode(); @@ -83,7 +222,7 @@ bool SystemZPostRewrite::selectMI(MachineBasicBlock &MBB, if (TargetMemOpcode != -1) { MI.setDesc(TII->get(TargetMemOpcode)); MI.tieOperands(0, 1); - unsigned DstReg = MI.getOperand(0).getReg(); + Register DstReg = MI.getOperand(0).getReg(); MachineOperand &SrcMO = MI.getOperand(1); if (DstReg != SrcMO.getReg()) { BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(SystemZ::COPY), DstReg) @@ -94,6 +233,15 @@ bool SystemZPostRewrite::selectMI(MachineBasicBlock &MBB, return true; } + switch (Opcode) { + case SystemZ::LOCRMux: + selectLOCRMux(MBB, MBBI, NextMBBI, SystemZ::LOCR, SystemZ::LOCFHR); + return true; + case SystemZ::SELRMux: + selectSELRMux(MBB, MBBI, NextMBBI, SystemZ::SELR, SystemZ::SELFHR); + return true; + } + return false; } diff --git a/lib/Target/SystemZ/SystemZProcessors.td b/lib/Target/SystemZ/SystemZProcessors.td index b27c25beb58c..af33a0300552 100644 --- a/lib/Target/SystemZ/SystemZProcessors.td +++ b/lib/Target/SystemZ/SystemZProcessors.td @@ -35,5 +35,6 @@ def : ProcessorModel<"z13", Z13Model, Arch11SupportedFeatures.List>; def : ProcessorModel<"arch12", Z14Model, Arch12SupportedFeatures.List>; def : ProcessorModel<"z14", Z14Model, Arch12SupportedFeatures.List>; -def : ProcessorModel<"arch13", Arch13Model, Arch13SupportedFeatures.List>; +def : ProcessorModel<"arch13", Z15Model, Arch13SupportedFeatures.List>; +def : ProcessorModel<"z15", Z15Model, Arch13SupportedFeatures.List>; diff --git a/lib/Target/SystemZ/SystemZRegisterInfo.cpp b/lib/Target/SystemZ/SystemZRegisterInfo.cpp index e7cd6871dbb4..39ace5594b7f 100644 --- a/lib/Target/SystemZ/SystemZRegisterInfo.cpp +++ b/lib/Target/SystemZ/SystemZRegisterInfo.cpp @@ -41,7 +41,7 @@ static const TargetRegisterClass *getRC32(MachineOperand &MO, return &SystemZ::GRH32BitRegClass; if (VRM && VRM->hasPhys(MO.getReg())) { - unsigned PhysReg = VRM->getPhys(MO.getReg()); + Register PhysReg = VRM->getPhys(MO.getReg()); if (SystemZ::GR32BitRegClass.contains(PhysReg)) return &SystemZ::GR32BitRegClass; assert (SystemZ::GRH32BitRegClass.contains(PhysReg) && @@ -120,8 +120,8 @@ SystemZRegisterInfo::getRegAllocationHints(unsigned VirtReg, } // Add the other operand of the LOCRMux to the worklist. 
- unsigned OtherReg = - (TrueMO.getReg() == Reg ? FalseMO.getReg() : TrueMO.getReg()); + Register OtherReg = + (TrueMO.getReg() == Reg ? FalseMO.getReg() : TrueMO.getReg()); if (MRI->getRegClass(OtherReg) == &SystemZ::GRX32BitRegClass) Worklist.push_back(OtherReg); } // end LOCRMux @@ -169,7 +169,8 @@ SystemZRegisterInfo::getRegAllocationHints(unsigned VirtReg, auto tryAddHint = [&](const MachineOperand *MO) -> void { Register Reg = MO->getReg(); - Register PhysReg = isPhysicalRegister(Reg) ? Reg : VRM->getPhys(Reg); + Register PhysReg = + Register::isPhysicalRegister(Reg) ? Reg : VRM->getPhys(Reg); if (PhysReg) { if (MO->getSubReg()) PhysReg = getSubReg(PhysReg, MO->getSubReg()); @@ -297,8 +298,8 @@ SystemZRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, assert(Mask && "One offset must be OK"); } while (!OpcodeForOffset); - unsigned ScratchReg = - MF.getRegInfo().createVirtualRegister(&SystemZ::ADDR64BitRegClass); + Register ScratchReg = + MF.getRegInfo().createVirtualRegister(&SystemZ::ADDR64BitRegClass); int64_t HighOffset = OldOffset - Offset; if (MI->getDesc().TSFlags & SystemZII::HasIndex @@ -351,8 +352,8 @@ bool SystemZRegisterInfo::shouldCoalesce(MachineInstr *MI, // regalloc may run out of registers. unsigned WideOpNo = (getRegSizeInBits(*SrcRC) == 128 ? 1 : 0); - unsigned GR128Reg = MI->getOperand(WideOpNo).getReg(); - unsigned GRNarReg = MI->getOperand((WideOpNo == 1) ? 0 : 1).getReg(); + Register GR128Reg = MI->getOperand(WideOpNo).getReg(); + Register GRNarReg = MI->getOperand((WideOpNo == 1) ? 0 : 1).getReg(); LiveInterval &IntGR128 = LIS.getInterval(GR128Reg); LiveInterval &IntGRNar = LIS.getInterval(GRNarReg); @@ -385,7 +386,7 @@ bool SystemZRegisterInfo::shouldCoalesce(MachineInstr *MI, MEE++; for (; MII != MEE; ++MII) { for (const MachineOperand &MO : MII->operands()) - if (MO.isReg() && isPhysicalRegister(MO.getReg())) { + if (MO.isReg() && Register::isPhysicalRegister(MO.getReg())) { for (MCSuperRegIterator SI(MO.getReg(), this, true/*IncludeSelf*/); SI.isValid(); ++SI) if (NewRC->contains(*SI)) { diff --git a/lib/Target/SystemZ/SystemZRegisterInfo.h b/lib/Target/SystemZ/SystemZRegisterInfo.h index 4f721ec23e53..7044efef1ac6 100644 --- a/lib/Target/SystemZ/SystemZRegisterInfo.h +++ b/lib/Target/SystemZ/SystemZRegisterInfo.h @@ -28,6 +28,15 @@ inline unsigned even128(bool Is32bit) { inline unsigned odd128(bool Is32bit) { return Is32bit ? subreg_l32 : subreg_l64; } + +// Reg should be a 32-bit GPR. Return true if it is a high register rather +// than a low register. 
+inline bool isHighReg(unsigned int Reg) { + if (SystemZ::GRH32BitRegClass.contains(Reg)) + return true; + assert(SystemZ::GR32BitRegClass.contains(Reg) && "Invalid GRX32"); + return false; +} } // end namespace SystemZ struct SystemZRegisterInfo : public SystemZGenRegisterInfo { diff --git a/lib/Target/SystemZ/SystemZSchedule.td b/lib/Target/SystemZ/SystemZSchedule.td index 98eca2802242..119e3ee7c22c 100644 --- a/lib/Target/SystemZ/SystemZSchedule.td +++ b/lib/Target/SystemZ/SystemZSchedule.td @@ -59,7 +59,7 @@ def VBU : SchedWrite; // Virtual branching unit def MCD : SchedWrite; // Millicode -include "SystemZScheduleArch13.td" +include "SystemZScheduleZ15.td" include "SystemZScheduleZ14.td" include "SystemZScheduleZ13.td" include "SystemZScheduleZEC12.td" diff --git a/lib/Target/SystemZ/SystemZScheduleArch13.td b/lib/Target/SystemZ/SystemZScheduleArch13.td deleted file mode 100644 index 9f82f24d0e8f..000000000000 --- a/lib/Target/SystemZ/SystemZScheduleArch13.td +++ /dev/null @@ -1,1695 +0,0 @@ -//-- SystemZScheduleArch13.td - SystemZ Scheduling Definitions ----*- tblgen -*-=// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file defines the machine model for Arch13 to support instruction -// scheduling and other instruction cost heuristics. -// -// Pseudos expanded right after isel do not need to be modelled here. -// -//===----------------------------------------------------------------------===// - -def Arch13Model : SchedMachineModel { - - let UnsupportedFeatures = Arch13UnsupportedFeatures.List; - - let IssueWidth = 6; // Number of instructions decoded per cycle. - let MicroOpBufferSize = 60; // Issue queues - let LoadLatency = 1; // Optimistic load latency. - - let PostRAScheduler = 1; - - // Extra cycles for a mispredicted branch. - let MispredictPenalty = 20; -} - -let SchedModel = Arch13Model in { -// These definitions need the SchedModel value. They could be put in a -// subtarget common include file, but it seems the include system in Tablegen -// currently (2016) rejects multiple includes of same file. - -// Decoder grouping rules -let NumMicroOps = 1 in { - def : WriteRes; - def : WriteRes { let BeginGroup = 1; } - def : WriteRes { let EndGroup = 1; } -} -def : WriteRes { - let NumMicroOps = 2; - let BeginGroup = 1; -} -def : WriteRes { - let NumMicroOps = 3; - let BeginGroup = 1; - let EndGroup = 1; -} -def : WriteRes { - let NumMicroOps = 6; - let BeginGroup = 1; - let EndGroup = 1; -} -def : WriteRes { - let NumMicroOps = 9; - let BeginGroup = 1; - let EndGroup = 1; -} - -// Incoming latency removed from the register operand which is used together -// with a memory operand by the instruction. -def : ReadAdvance; - -// LoadLatency (above) is not used for instructions in this file. This is -// instead the role of LSULatency, which is the latency value added to the -// result of loads and instructions with folded memory operands. -def : WriteRes { let Latency = 4; let NumMicroOps = 0; } - -let NumMicroOps = 0 in { - foreach L = 1-30 in - def : WriteRes("WLat"#L), []> { let Latency = L; } -} - -// Execution units. 
-def Arch13_FXaUnit : ProcResource<2>; -def Arch13_FXbUnit : ProcResource<2>; -def Arch13_LSUnit : ProcResource<2>; -def Arch13_VecUnit : ProcResource<2>; -def Arch13_VecFPdUnit : ProcResource<2> { let BufferSize = 1; /* blocking */ } -def Arch13_VBUnit : ProcResource<2>; -def Arch13_MCD : ProcResource<1>; - -// Subtarget specific definitions of scheduling resources. -let NumMicroOps = 0 in { - def : WriteRes; - def : WriteRes; - def : WriteRes; - def : WriteRes; - def : WriteRes; - def : WriteRes; - def : WriteRes; - def : WriteRes; - def : WriteRes; - foreach Num = 2-5 in { let ResourceCycles = [Num] in { - def : WriteRes("FXa"#Num), [Arch13_FXaUnit]>; - def : WriteRes("FXb"#Num), [Arch13_FXbUnit]>; - def : WriteRes("LSU"#Num), [Arch13_LSUnit]>; - def : WriteRes("VecBF"#Num), [Arch13_VecUnit]>; - def : WriteRes("VecDF"#Num), [Arch13_VecUnit]>; - def : WriteRes("VecDFX"#Num), [Arch13_VecUnit]>; - def : WriteRes("VecMul"#Num), [Arch13_VecUnit]>; - def : WriteRes("VecStr"#Num), [Arch13_VecUnit]>; - def : WriteRes("VecXsPm"#Num), [Arch13_VecUnit]>; - }} - - def : WriteRes { let ResourceCycles = [30]; } - - def : WriteRes; // Virtual Branching Unit -} - -def : WriteRes { let NumMicroOps = 3; - let BeginGroup = 1; - let EndGroup = 1; } - -// -------------------------- INSTRUCTIONS ---------------------------------- // - -// InstRW constructs have been used in order to preserve the -// readability of the InstrInfo files. - -// For each instruction, as matched by a regexp, provide a list of -// resources that it needs. These will be combined into a SchedClass. - -//===----------------------------------------------------------------------===// -// Stack allocation -//===----------------------------------------------------------------------===// - -// Pseudo -> LA / LAY -def : InstRW<[WLat1, FXa, NormalGr], (instregex "ADJDYNALLOC$")>; - -//===----------------------------------------------------------------------===// -// Branch instructions -//===----------------------------------------------------------------------===// - -// Branch -def : InstRW<[WLat1, VBU, NormalGr], (instregex "(Call)?BRC(L)?(Asm.*)?$")>; -def : InstRW<[WLat1, VBU, NormalGr], (instregex "(Call)?J(G)?(Asm.*)?$")>; -def : InstRW<[WLat1, FXb, NormalGr], (instregex "(Call)?BC(R)?(Asm.*)?$")>; -def : InstRW<[WLat1, FXb, NormalGr], (instregex "(Call)?B(R)?(Asm.*)?$")>; -def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "BI(C)?(Asm.*)?$")>; -def : InstRW<[WLat1, FXa, EndGroup], (instregex "BRCT(G)?$")>; -def : InstRW<[WLat1, FXa, FXb, GroupAlone], (instregex "BRCTH$")>; -def : InstRW<[WLat1, FXa, FXb, GroupAlone], (instregex "BCT(G)?(R)?$")>; -def : InstRW<[WLat1, FXa2, FXb2, GroupAlone2], - (instregex "B(R)?X(H|L).*$")>; - -// Compare and branch -def : InstRW<[WLat1, FXb, NormalGr], (instregex "C(L)?(G)?(I|R)J(Asm.*)?$")>; -def : InstRW<[WLat1, FXb2, GroupAlone], - (instregex "C(L)?(G)?(I|R)B(Call|Return|Asm.*)?$")>; - -//===----------------------------------------------------------------------===// -// Trap instructions -//===----------------------------------------------------------------------===// - -// Trap -def : InstRW<[WLat1, VBU, NormalGr], (instregex "(Cond)?Trap$")>; - -// Compare and trap -def : InstRW<[WLat1, FXb, NormalGr], (instregex "C(G)?(I|R)T(Asm.*)?$")>; -def : InstRW<[WLat1, FXb, NormalGr], (instregex "CL(G)?RT(Asm.*)?$")>; -def : InstRW<[WLat1, FXb, NormalGr], (instregex "CL(F|G)IT(Asm.*)?$")>; -def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "CL(G)?T(Asm.*)?$")>; - 
-//===----------------------------------------------------------------------===// -// Call and return instructions -//===----------------------------------------------------------------------===// - -// Call -def : InstRW<[WLat1, VBU, FXa2, GroupAlone], (instregex "(Call)?BRAS$")>; -def : InstRW<[WLat1, FXa2, FXb, GroupAlone], (instregex "(Call)?BRASL$")>; -def : InstRW<[WLat1, FXa2, FXb, GroupAlone], (instregex "(Call)?BAS(R)?$")>; -def : InstRW<[WLat1, FXa2, FXb, GroupAlone], (instregex "TLS_(G|L)DCALL$")>; - -// Return -def : InstRW<[WLat1, FXb, EndGroup], (instregex "Return$")>; -def : InstRW<[WLat1, FXb, NormalGr], (instregex "CondReturn$")>; - -//===----------------------------------------------------------------------===// -// Move instructions -//===----------------------------------------------------------------------===// - -// Moves -def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "MV(G|H)?HI$")>; -def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "MVI(Y)?$")>; - -// Move character -def : InstRW<[WLat1, FXb, LSU3, GroupAlone], (instregex "MVC$")>; -def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "MVCL(E|U)?$")>; -def : InstRW<[WLat1, LSU2, GroupAlone], (instregex "MVCRL$")>; - -// Pseudo -> reg move -def : InstRW<[WLat1, FXa, NormalGr], (instregex "COPY(_TO_REGCLASS)?$")>; -def : InstRW<[WLat1, FXa, NormalGr], (instregex "EXTRACT_SUBREG$")>; -def : InstRW<[WLat1, FXa, NormalGr], (instregex "INSERT_SUBREG$")>; -def : InstRW<[WLat1, FXa, NormalGr], (instregex "REG_SEQUENCE$")>; - -// Loads -def : InstRW<[LSULatency, LSU, NormalGr], (instregex "L(Y|FH|RL|Mux)?$")>; -def : InstRW<[LSULatency, LSULatency, LSU, NormalGr], (instregex "LCBB$")>; -def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LG(RL)?$")>; -def : InstRW<[LSULatency, LSU, NormalGr], (instregex "L128$")>; - -def : InstRW<[WLat1, FXa, NormalGr], (instregex "LLIH(F|H|L)$")>; -def : InstRW<[WLat1, FXa, NormalGr], (instregex "LLIL(F|H|L)$")>; - -def : InstRW<[WLat1, FXa, NormalGr], (instregex "LG(F|H)I$")>; -def : InstRW<[WLat1, FXa, NormalGr], (instregex "LHI(Mux)?$")>; -def : InstRW<[WLat1, FXa, NormalGr], (instregex "LR(Mux)?$")>; - -// Load and zero rightmost byte -def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LZR(F|G)$")>; - -// Load and trap -def : InstRW<[WLat1LSU, FXb, LSU, NormalGr], (instregex "L(FH|G)?AT$")>; - -// Load and test -def : InstRW<[WLat1LSU, WLat1LSU, LSU, FXa, NormalGr], (instregex "LT(G)?$")>; -def : InstRW<[WLat1, FXa, NormalGr], (instregex "LT(G)?R$")>; - -// Stores -def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "STG(RL)?$")>; -def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "ST128$")>; -def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "ST(Y|FH|RL|Mux)?$")>; - -// String moves. 
-def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "MVST$")>; - -//===----------------------------------------------------------------------===// -// Conditional move instructions -//===----------------------------------------------------------------------===// - -def : InstRW<[WLat2, FXa, NormalGr], (instregex "LOCRMux$")>; -def : InstRW<[WLat2, FXa, NormalGr], (instregex "LOC(G|FH)?R(Asm.*)?$")>; -def : InstRW<[WLat2, FXa, NormalGr], (instregex "LOC(G|H)?HI(Mux|(Asm.*))?$")>; -def : InstRW<[WLat2LSU, RegReadAdv, FXa, LSU, NormalGr], - (instregex "LOC(G|FH|Mux)?(Asm.*)?$")>; -def : InstRW<[WLat1, FXb, LSU, NormalGr], - (instregex "STOC(G|FH|Mux)?(Asm.*)?$")>; - -def : InstRW<[WLat2, FXa, NormalGr], (instregex "SELRMux$")>; -def : InstRW<[WLat2, FXa, NormalGr], (instregex "SEL(G|FH)?R(Asm.*)?$")>; - -//===----------------------------------------------------------------------===// -// Sign extensions -//===----------------------------------------------------------------------===// - -def : InstRW<[WLat1, FXa, NormalGr], (instregex "L(B|H|G)R$")>; -def : InstRW<[WLat1, FXa, NormalGr], (instregex "LG(B|H|F)R$")>; - -def : InstRW<[WLat1LSU, WLat1LSU, FXa, LSU, NormalGr], (instregex "LTGF$")>; -def : InstRW<[WLat1, FXa, NormalGr], (instregex "LTGFR$")>; - -def : InstRW<[WLat1LSU, FXa, LSU, NormalGr], (instregex "LB(H|Mux)?$")>; -def : InstRW<[WLat1LSU, FXa, LSU, NormalGr], (instregex "LH(Y)?$")>; -def : InstRW<[WLat1LSU, FXa, LSU, NormalGr], (instregex "LH(H|Mux|RL)$")>; -def : InstRW<[WLat1LSU, FXa, LSU, NormalGr], (instregex "LG(B|H|F)$")>; -def : InstRW<[WLat1LSU, FXa, LSU, NormalGr], (instregex "LG(H|F)RL$")>; - -//===----------------------------------------------------------------------===// -// Zero extensions -//===----------------------------------------------------------------------===// - -def : InstRW<[WLat1, FXa, NormalGr], (instregex "LLCR(Mux)?$")>; -def : InstRW<[WLat1, FXa, NormalGr], (instregex "LLHR(Mux)?$")>; -def : InstRW<[WLat1, FXa, NormalGr], (instregex "LLG(C|H|F|T)R$")>; -def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LLC(Mux)?$")>; -def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LLH(Mux)?$")>; -def : InstRW<[WLat1LSU, FXa, LSU, NormalGr], (instregex "LL(C|H)H$")>; -def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LLHRL$")>; -def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LLG(C|H|F|T|HRL|FRL)$")>; - -// Load and zero rightmost byte -def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LLZRGF$")>; - -// Load and trap -def : InstRW<[WLat1LSU, FXb, LSU, NormalGr], (instregex "LLG(F|T)?AT$")>; - -//===----------------------------------------------------------------------===// -// Truncations -//===----------------------------------------------------------------------===// - -def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "STC(H|Y|Mux)?$")>; -def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "STH(H|Y|RL|Mux)?$")>; -def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "STCM(H|Y)?$")>; - -//===----------------------------------------------------------------------===// -// Multi-register moves -//===----------------------------------------------------------------------===// - -// Load multiple (estimated average of 5 ops) -def : InstRW<[WLat10, WLat10, LSU5, GroupAlone], (instregex "LM(H|Y|G)?$")>; - -// Load multiple disjoint -def : InstRW<[WLat30, WLat30, MCD], (instregex "LMD$")>; - -// Store multiple -def : InstRW<[WLat1, LSU2, FXb3, GroupAlone], (instregex "STM(G|H|Y)?$")>; - 
-//===----------------------------------------------------------------------===// -// Byte swaps -//===----------------------------------------------------------------------===// - -def : InstRW<[WLat1, FXa, NormalGr], (instregex "LRV(G)?R$")>; -def : InstRW<[WLat1LSU, FXa, LSU, NormalGr], (instregex "LRV(G|H)?$")>; -def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "STRV(G|H)?$")>; -def : InstRW<[WLat30, MCD], (instregex "MVCIN$")>; - -//===----------------------------------------------------------------------===// -// Load address instructions -//===----------------------------------------------------------------------===// - -def : InstRW<[WLat1, FXa, NormalGr], (instregex "LA(Y|RL)?$")>; - -// Load the Global Offset Table address ( -> larl ) -def : InstRW<[WLat1, FXa, NormalGr], (instregex "GOT$")>; - -//===----------------------------------------------------------------------===// -// Absolute and Negation -//===----------------------------------------------------------------------===// - -def : InstRW<[WLat1, WLat1, FXa, NormalGr], (instregex "LP(G)?R$")>; -def : InstRW<[WLat2, WLat2, FXa2, Cracked], (instregex "L(N|P)GFR$")>; -def : InstRW<[WLat1, WLat1, FXa, NormalGr], (instregex "LN(R|GR)$")>; -def : InstRW<[WLat1, FXa, NormalGr], (instregex "LC(R|GR)$")>; -def : InstRW<[WLat2, WLat2, FXa2, Cracked], (instregex "LCGFR$")>; - -//===----------------------------------------------------------------------===// -// Insertion -//===----------------------------------------------------------------------===// - -def : InstRW<[WLat1LSU, RegReadAdv, FXa, LSU, NormalGr], (instregex "IC(Y)?$")>; -def : InstRW<[WLat1LSU, RegReadAdv, FXa, LSU, NormalGr], - (instregex "IC32(Y)?$")>; -def : InstRW<[WLat1LSU, RegReadAdv, WLat1LSU, FXa, LSU, NormalGr], - (instregex "ICM(H|Y)?$")>; -def : InstRW<[WLat1, FXa, NormalGr], (instregex "II(F|H|L)Mux$")>; -def : InstRW<[WLat1, FXa, NormalGr], (instregex "IIHF(64)?$")>; -def : InstRW<[WLat1, FXa, NormalGr], (instregex "IIHH(64)?$")>; -def : InstRW<[WLat1, FXa, NormalGr], (instregex "IIHL(64)?$")>; -def : InstRW<[WLat1, FXa, NormalGr], (instregex "IILF(64)?$")>; -def : InstRW<[WLat1, FXa, NormalGr], (instregex "IILH(64)?$")>; -def : InstRW<[WLat1, FXa, NormalGr], (instregex "IILL(64)?$")>; - -//===----------------------------------------------------------------------===// -// Addition -//===----------------------------------------------------------------------===// - -def : InstRW<[WLat1LSU, WLat1LSU, RegReadAdv, FXa, LSU, NormalGr], - (instregex "A(Y)?$")>; -def : InstRW<[WLat2LSU, WLat2LSU, RegReadAdv, FXa, LSU, NormalGr], - (instregex "AH(Y)?$")>; -def : InstRW<[WLat1, FXa, NormalGr], (instregex "AIH$")>; -def : InstRW<[WLat1, FXa, NormalGr], (instregex "AFI(Mux)?$")>; -def : InstRW<[WLat1LSU, WLat1LSU, RegReadAdv, FXa, LSU, NormalGr], - (instregex "AG$")>; -def : InstRW<[WLat1, FXa, NormalGr], (instregex "AGFI$")>; -def : InstRW<[WLat1, FXa, NormalGr], (instregex "AGHI(K)?$")>; -def : InstRW<[WLat1, FXa, NormalGr], (instregex "AGR(K)?$")>; -def : InstRW<[WLat1, FXa, NormalGr], (instregex "AHI(K)?$")>; -def : InstRW<[WLat1, FXa, NormalGr], (instregex "AHIMux(K)?$")>; -def : InstRW<[WLat1LSU, WLat1LSU, RegReadAdv, FXa, LSU, NormalGr], - (instregex "AL(Y)?$")>; -def : InstRW<[WLat1, FXa, NormalGr], (instregex "AL(FI|HSIK)$")>; -def : InstRW<[WLat1LSU, WLat1LSU, RegReadAdv, FXa, LSU, NormalGr], - (instregex "ALG(F)?$")>; -def : InstRW<[WLat1, FXa, NormalGr], (instregex "ALGHSIK$")>; -def : InstRW<[WLat1, FXa, NormalGr], (instregex "ALGF(I|R)$")>; -def : 
InstRW<[WLat1, FXa, NormalGr], (instregex "ALGR(K)?$")>; -def : InstRW<[WLat1, FXa, NormalGr], (instregex "ALR(K)?$")>; -def : InstRW<[WLat1, FXa, NormalGr], (instregex "AR(K)?$")>; -def : InstRW<[WLat1, FXa, NormalGr], (instregex "A(L)?HHHR$")>; -def : InstRW<[WLat2, WLat2, FXa, NormalGr], (instregex "A(L)?HHLR$")>; -def : InstRW<[WLat1, FXa, NormalGr], (instregex "ALSIH(N)?$")>; -def : InstRW<[WLat2LSU, FXb, LSU, NormalGr], (instregex "A(L)?(G)?SI$")>; - -// Logical addition with carry -def : InstRW<[WLat2LSU, WLat2LSU, RegReadAdv, FXa, LSU, GroupAlone], - (instregex "ALC(G)?$")>; -def : InstRW<[WLat2, WLat2, FXa, GroupAlone], (instregex "ALC(G)?R$")>; - -// Add with sign extension (16/32 -> 64) -def : InstRW<[WLat2LSU, WLat2LSU, RegReadAdv, FXa, LSU, NormalGr], - (instregex "AG(F|H)$")>; -def : InstRW<[WLat2, WLat2, FXa, NormalGr], (instregex "AGFR$")>; - -//===----------------------------------------------------------------------===// -// Subtraction -//===----------------------------------------------------------------------===// - -def : InstRW<[WLat1LSU, WLat1LSU, RegReadAdv, FXa, LSU, NormalGr], - (instregex "S(G|Y)?$")>; -def : InstRW<[WLat2LSU, WLat2LSU, RegReadAdv, FXa, LSU, NormalGr], - (instregex "SH(Y)?$")>; -def : InstRW<[WLat1, FXa, NormalGr], (instregex "SGR(K)?$")>; -def : InstRW<[WLat1, FXa, NormalGr], (instregex "SLFI$")>; -def : InstRW<[WLat1LSU, WLat1LSU, RegReadAdv, FXa, LSU, NormalGr], - (instregex "SL(G|GF|Y)?$")>; -def : InstRW<[WLat1, FXa, NormalGr], (instregex "SLGF(I|R)$")>; -def : InstRW<[WLat1, FXa, NormalGr], (instregex "SLGR(K)?$")>; -def : InstRW<[WLat1, FXa, NormalGr], (instregex "SLR(K)?$")>; -def : InstRW<[WLat1, FXa, NormalGr], (instregex "SR(K)?$")>; -def : InstRW<[WLat1, FXa, NormalGr], (instregex "S(L)?HHHR$")>; -def : InstRW<[WLat2, WLat2, FXa, NormalGr], (instregex "S(L)?HHLR$")>; - -// Subtraction with borrow -def : InstRW<[WLat2LSU, WLat2LSU, RegReadAdv, FXa, LSU, GroupAlone], - (instregex "SLB(G)?$")>; -def : InstRW<[WLat2, WLat2, FXa, GroupAlone], (instregex "SLB(G)?R$")>; - -// Subtraction with sign extension (16/32 -> 64) -def : InstRW<[WLat2LSU, WLat2LSU, RegReadAdv, FXa, LSU, NormalGr], - (instregex "SG(F|H)$")>; -def : InstRW<[WLat2, WLat2, FXa, NormalGr], (instregex "SGFR$")>; - -//===----------------------------------------------------------------------===// -// AND -//===----------------------------------------------------------------------===// - -def : InstRW<[WLat1LSU, WLat1LSU, RegReadAdv, FXa, LSU, NormalGr], - (instregex "N(G|Y)?$")>; -def : InstRW<[WLat1, FXa, NormalGr], (instregex "NGR(K)?$")>; -def : InstRW<[WLat1, FXa, NormalGr], (instregex "NI(FMux|HMux|LMux)$")>; -def : InstRW<[WLat2LSU, FXb, LSU, NormalGr], (instregex "NI(Y)?$")>; -def : InstRW<[WLat1, FXa, NormalGr], (instregex "NIHF(64)?$")>; -def : InstRW<[WLat1, FXa, NormalGr], (instregex "NIHH(64)?$")>; -def : InstRW<[WLat1, FXa, NormalGr], (instregex "NIHL(64)?$")>; -def : InstRW<[WLat1, FXa, NormalGr], (instregex "NILF(64)?$")>; -def : InstRW<[WLat1, FXa, NormalGr], (instregex "NILH(64)?$")>; -def : InstRW<[WLat1, FXa, NormalGr], (instregex "NILL(64)?$")>; -def : InstRW<[WLat1, FXa, NormalGr], (instregex "NR(K)?$")>; -def : InstRW<[WLat3LSU, LSU2, FXb, Cracked], (instregex "NC$")>; - -//===----------------------------------------------------------------------===// -// OR -//===----------------------------------------------------------------------===// - -def : InstRW<[WLat1LSU, WLat1LSU, RegReadAdv, FXa, LSU, NormalGr], - (instregex "O(G|Y)?$")>; -def : 
InstRW<[WLat1, FXa, NormalGr], (instregex "OGR(K)?$")>; -def : InstRW<[WLat2LSU, FXb, LSU, NormalGr], (instregex "OI(Y)?$")>; -def : InstRW<[WLat1, FXa, NormalGr], (instregex "OI(FMux|HMux|LMux)$")>; -def : InstRW<[WLat1, FXa, NormalGr], (instregex "OIHF(64)?$")>; -def : InstRW<[WLat1, FXa, NormalGr], (instregex "OIHH(64)?$")>; -def : InstRW<[WLat1, FXa, NormalGr], (instregex "OIHL(64)?$")>; -def : InstRW<[WLat1, FXa, NormalGr], (instregex "OILF(64)?$")>; -def : InstRW<[WLat1, FXa, NormalGr], (instregex "OILH(64)?$")>; -def : InstRW<[WLat1, FXa, NormalGr], (instregex "OILL(64)?$")>; -def : InstRW<[WLat1, FXa, NormalGr], (instregex "OR(K)?$")>; -def : InstRW<[WLat3LSU, LSU2, FXb, Cracked], (instregex "OC$")>; - -//===----------------------------------------------------------------------===// -// XOR -//===----------------------------------------------------------------------===// - -def : InstRW<[WLat1LSU, WLat1LSU, RegReadAdv, FXa, LSU, NormalGr], - (instregex "X(G|Y)?$")>; -def : InstRW<[WLat2LSU, FXb, LSU, NormalGr], (instregex "XI(Y)?$")>; -def : InstRW<[WLat1, FXa, NormalGr], (instregex "XIFMux$")>; -def : InstRW<[WLat1, FXa, NormalGr], (instregex "XGR(K)?$")>; -def : InstRW<[WLat1, FXa, NormalGr], (instregex "XIHF(64)?$")>; -def : InstRW<[WLat1, FXa, NormalGr], (instregex "XILF(64)?$")>; -def : InstRW<[WLat1, FXa, NormalGr], (instregex "XR(K)?$")>; -def : InstRW<[WLat3LSU, LSU2, FXb, Cracked], (instregex "XC$")>; - -//===----------------------------------------------------------------------===// -// Combined logical operations -//===----------------------------------------------------------------------===// - -def : InstRW<[WLat1, FXa, NormalGr], (instregex "NC(G)?RK$")>; -def : InstRW<[WLat1, FXa, NormalGr], (instregex "OC(G)?RK$")>; -def : InstRW<[WLat1, FXa, NormalGr], (instregex "NN(G)?RK$")>; -def : InstRW<[WLat1, FXa, NormalGr], (instregex "NO(G)?RK$")>; -def : InstRW<[WLat1, FXa, NormalGr], (instregex "NX(G)?RK$")>; - -//===----------------------------------------------------------------------===// -// Multiplication -//===----------------------------------------------------------------------===// - -def : InstRW<[WLat5LSU, RegReadAdv, FXa, LSU, NormalGr], - (instregex "MS(GF|Y)?$")>; -def : InstRW<[WLat5, FXa, NormalGr], (instregex "MS(R|FI)$")>; -def : InstRW<[WLat7LSU, RegReadAdv, FXa, LSU, NormalGr], (instregex "MSG$")>; -def : InstRW<[WLat7, FXa, NormalGr], (instregex "MSGR$")>; -def : InstRW<[WLat5, FXa, NormalGr], (instregex "MSGF(I|R)$")>; -def : InstRW<[WLat8LSU, RegReadAdv, FXa2, LSU, GroupAlone], (instregex "MLG$")>; -def : InstRW<[WLat8, FXa2, GroupAlone], (instregex "MLGR$")>; -def : InstRW<[WLat4, FXa, NormalGr], (instregex "MGHI$")>; -def : InstRW<[WLat4, FXa, NormalGr], (instregex "MHI$")>; -def : InstRW<[WLat4LSU, RegReadAdv, FXa, LSU, NormalGr], (instregex "MH(Y)?$")>; -def : InstRW<[WLat6, FXa2, GroupAlone], (instregex "M(L)?R$")>; -def : InstRW<[WLat6LSU, RegReadAdv, FXa2, LSU, GroupAlone], - (instregex "M(FY|L)?$")>; -def : InstRW<[WLat8, RegReadAdv, FXa, LSU, NormalGr], (instregex "MGH$")>; -def : InstRW<[WLat12, RegReadAdv, FXa2, LSU, GroupAlone], (instregex "MG$")>; -def : InstRW<[WLat8, FXa2, GroupAlone], (instregex "MGRK$")>; -def : InstRW<[WLat6LSU, WLat6LSU, RegReadAdv, FXa, LSU, NormalGr], - (instregex "MSC$")>; -def : InstRW<[WLat8LSU, WLat8LSU, RegReadAdv, FXa, LSU, NormalGr], - (instregex "MSGC$")>; -def : InstRW<[WLat6, WLat6, FXa, NormalGr], (instregex "MSRKC$")>; -def : InstRW<[WLat8, WLat8, FXa, NormalGr], (instregex "MSGRKC$")>; - 
-//===----------------------------------------------------------------------===// -// Division and remainder -//===----------------------------------------------------------------------===// - -def : InstRW<[WLat20, FXa4, GroupAlone], (instregex "DR$")>; -def : InstRW<[WLat30, RegReadAdv, FXa4, LSU, GroupAlone2], (instregex "D$")>; -def : InstRW<[WLat30, FXa2, GroupAlone], (instregex "DSG(F)?R$")>; -def : InstRW<[WLat30, RegReadAdv, FXa2, LSU, GroupAlone2], - (instregex "DSG(F)?$")>; -def : InstRW<[WLat20, FXa4, GroupAlone], (instregex "DLR$")>; -def : InstRW<[WLat30, FXa4, GroupAlone], (instregex "DLGR$")>; -def : InstRW<[WLat30, RegReadAdv, FXa4, LSU, GroupAlone2], - (instregex "DL(G)?$")>; - -//===----------------------------------------------------------------------===// -// Shifts -//===----------------------------------------------------------------------===// - -def : InstRW<[WLat1, FXa, NormalGr], (instregex "SLL(G|K)?$")>; -def : InstRW<[WLat1, FXa, NormalGr], (instregex "SRL(G|K)?$")>; -def : InstRW<[WLat1, FXa, NormalGr], (instregex "SRA(G|K)?$")>; -def : InstRW<[WLat1, FXa, NormalGr], (instregex "SLA(G|K)?$")>; -def : InstRW<[WLat5LSU, WLat5LSU, FXa4, LSU, GroupAlone2], - (instregex "S(L|R)D(A|L)$")>; - -// Rotate -def : InstRW<[WLat2LSU, FXa, LSU, NormalGr], (instregex "RLL(G)?$")>; - -// Rotate and insert -def : InstRW<[WLat1, FXa, NormalGr], (instregex "RISBG(N|32)?$")>; -def : InstRW<[WLat1, FXa, NormalGr], (instregex "RISBH(G|H|L)$")>; -def : InstRW<[WLat1, FXa, NormalGr], (instregex "RISBL(G|H|L)$")>; -def : InstRW<[WLat1, FXa, NormalGr], (instregex "RISBMux$")>; - -// Rotate and Select -def : InstRW<[WLat2, WLat2, FXa2, Cracked], (instregex "R(N|O|X)SBG$")>; - -//===----------------------------------------------------------------------===// -// Comparison -//===----------------------------------------------------------------------===// - -def : InstRW<[WLat1LSU, RegReadAdv, FXb, LSU, NormalGr], - (instregex "C(G|Y|Mux)?$")>; -def : InstRW<[WLat1LSU, FXb, LSU, NormalGr], (instregex "CRL$")>; -def : InstRW<[WLat1, FXb, NormalGr], (instregex "C(F|H)I(Mux)?$")>; -def : InstRW<[WLat1, FXb, NormalGr], (instregex "CG(F|H)I$")>; -def : InstRW<[WLat1LSU, FXb, LSU, NormalGr], (instregex "CG(HSI|RL)$")>; -def : InstRW<[WLat1, FXb, NormalGr], (instregex "C(G)?R$")>; -def : InstRW<[WLat1, FXb, NormalGr], (instregex "CIH$")>; -def : InstRW<[WLat1LSU, RegReadAdv, FXb, LSU, NormalGr], (instregex "CHF$")>; -def : InstRW<[WLat1LSU, FXb, LSU, NormalGr], (instregex "CHSI$")>; -def : InstRW<[WLat1LSU, RegReadAdv, FXb, LSU, NormalGr], - (instregex "CL(Y|Mux)?$")>; -def : InstRW<[WLat1LSU, FXb, LSU, NormalGr], (instregex "CLFHSI$")>; -def : InstRW<[WLat1, FXb, NormalGr], (instregex "CLFI(Mux)?$")>; -def : InstRW<[WLat1LSU, RegReadAdv, FXb, LSU, NormalGr], (instregex "CLG$")>; -def : InstRW<[WLat1LSU, FXb, LSU, NormalGr], (instregex "CLG(HRL|HSI)$")>; -def : InstRW<[WLat1LSU, RegReadAdv, FXb, LSU, NormalGr], (instregex "CLGF$")>; -def : InstRW<[WLat1LSU, FXb, LSU, NormalGr], (instregex "CLGFRL$")>; -def : InstRW<[WLat1, FXb, NormalGr], (instregex "CLGF(I|R)$")>; -def : InstRW<[WLat1, FXb, NormalGr], (instregex "CLGR$")>; -def : InstRW<[WLat1LSU, FXb, LSU, NormalGr], (instregex "CLGRL$")>; -def : InstRW<[WLat1LSU, RegReadAdv, FXb, LSU, NormalGr], (instregex "CLHF$")>; -def : InstRW<[WLat1LSU, FXb, LSU, NormalGr], (instregex "CLH(RL|HSI)$")>; -def : InstRW<[WLat1, FXb, NormalGr], (instregex "CLIH$")>; -def : InstRW<[WLat1LSU, FXb, LSU, NormalGr], (instregex "CLI(Y)?$")>; -def : InstRW<[WLat1, 
FXb, NormalGr], (instregex "CLR$")>; -def : InstRW<[WLat1LSU, FXb, LSU, NormalGr], (instregex "CLRL$")>; -def : InstRW<[WLat1, FXb, NormalGr], (instregex "C(L)?HHR$")>; -def : InstRW<[WLat2, FXb, NormalGr], (instregex "C(L)?HLR$")>; - -// Compare halfword -def : InstRW<[WLat2LSU, RegReadAdv, FXb, LSU, NormalGr], (instregex "CH(Y)?$")>; -def : InstRW<[WLat2LSU, FXb, LSU, NormalGr], (instregex "CHRL$")>; -def : InstRW<[WLat2LSU, RegReadAdv, FXb, LSU, NormalGr], (instregex "CGH$")>; -def : InstRW<[WLat2LSU, FXb, LSU, NormalGr], (instregex "CGHRL$")>; -def : InstRW<[WLat2LSU, FXa, FXb, LSU, Cracked], (instregex "CHHSI$")>; - -// Compare with sign extension (32 -> 64) -def : InstRW<[WLat2LSU, RegReadAdv, FXb, LSU, NormalGr], (instregex "CGF$")>; -def : InstRW<[WLat2LSU, FXb, LSU, NormalGr], (instregex "CGFRL$")>; -def : InstRW<[WLat2, FXb, NormalGr], (instregex "CGFR$")>; - -// Compare logical character -def : InstRW<[WLat6, FXb, LSU2, Cracked], (instregex "CLC$")>; -def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "CLCL(E|U)?$")>; -def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "CLST$")>; - -// Test under mask -def : InstRW<[WLat1LSU, FXb, LSU, NormalGr], (instregex "TM(Y)?$")>; -def : InstRW<[WLat1, FXb, NormalGr], (instregex "TM(H|L)Mux$")>; -def : InstRW<[WLat1, FXb, NormalGr], (instregex "TMHH(64)?$")>; -def : InstRW<[WLat1, FXb, NormalGr], (instregex "TMHL(64)?$")>; -def : InstRW<[WLat1, FXb, NormalGr], (instregex "TMLH(64)?$")>; -def : InstRW<[WLat1, FXb, NormalGr], (instregex "TMLL(64)?$")>; - -// Compare logical characters under mask -def : InstRW<[WLat2LSU, RegReadAdv, FXb, LSU, NormalGr], - (instregex "CLM(H|Y)?$")>; - -//===----------------------------------------------------------------------===// -// Prefetch and execution hint -//===----------------------------------------------------------------------===// - -def : InstRW<[WLat1, LSU, NormalGr], (instregex "PFD(RL)?$")>; -def : InstRW<[WLat1, FXb, NormalGr], (instregex "BPP$")>; -def : InstRW<[FXb, EndGroup], (instregex "BPRP$")>; -def : InstRW<[WLat1, FXb, NormalGr], (instregex "NIAI$")>; - -//===----------------------------------------------------------------------===// -// Atomic operations -//===----------------------------------------------------------------------===// - -def : InstRW<[WLat1, FXb, EndGroup], (instregex "Serialize$")>; - -def : InstRW<[WLat2LSU, WLat2LSU, FXb, LSU, NormalGr], (instregex "LAA(G)?$")>; -def : InstRW<[WLat2LSU, WLat2LSU, FXb, LSU, NormalGr], (instregex "LAAL(G)?$")>; -def : InstRW<[WLat2LSU, WLat2LSU, FXb, LSU, NormalGr], (instregex "LAN(G)?$")>; -def : InstRW<[WLat2LSU, WLat2LSU, FXb, LSU, NormalGr], (instregex "LAO(G)?$")>; -def : InstRW<[WLat2LSU, WLat2LSU, FXb, LSU, NormalGr], (instregex "LAX(G)?$")>; - -// Test and set -def : InstRW<[WLat2LSU, FXb, LSU, EndGroup], (instregex "TS$")>; - -// Compare and swap -def : InstRW<[WLat3LSU, WLat3LSU, FXa, FXb, LSU, GroupAlone], - (instregex "CS(G|Y)?$")>; - -// Compare double and swap -def : InstRW<[WLat6LSU, WLat6LSU, FXa3, FXb2, LSU, GroupAlone2], - (instregex "CDS(Y)?$")>; -def : InstRW<[WLat15, WLat15, FXa2, FXb4, LSU3, - GroupAlone3], (instregex "CDSG$")>; - -// Compare and swap and store -def : InstRW<[WLat30, MCD], (instregex "CSST$")>; - -// Perform locked operation -def : InstRW<[WLat30, MCD], (instregex "PLO$")>; - -// Load/store pair from/to quadword -def : InstRW<[WLat4LSU, LSU2, GroupAlone], (instregex "LPQ$")>; -def : InstRW<[WLat1, FXb2, LSU, GroupAlone], (instregex "STPQ$")>; - -// Load pair disjoint -def : 
InstRW<[WLat1LSU, WLat1LSU, LSU2, GroupAlone], (instregex "LPD(G)?$")>; - -//===----------------------------------------------------------------------===// -// Translate and convert -//===----------------------------------------------------------------------===// - -def : InstRW<[WLat1, LSU5, GroupAlone], (instregex "TR$")>; -def : InstRW<[WLat30, WLat30, WLat30, FXa3, LSU2, GroupAlone2], - (instregex "TRT$")>; -def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "TRTR$")>; -def : InstRW<[WLat30, WLat30, MCD], (instregex "TRE$")>; -def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "TRT(R)?E(Opt)?$")>; -def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "TR(T|O)(T|O)(Opt)?$")>; -def : InstRW<[WLat30, WLat30, WLat30, MCD], - (instregex "CU(12|14|21|24|41|42)(Opt)?$")>; -def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "(CUUTF|CUTFU)(Opt)?$")>; - -//===----------------------------------------------------------------------===// -// Message-security assist -//===----------------------------------------------------------------------===// - -def : InstRW<[WLat30, WLat30, WLat30, WLat30, MCD], - (instregex "KM(C|F|O|CTR|A)?$")>; -def : InstRW<[WLat30, WLat30, WLat30, MCD], - (instregex "(KIMD|KLMD|KMAC|KDSA)$")>; -def : InstRW<[WLat30, WLat30, WLat30, MCD], - (instregex "(PCC|PPNO|PRNO)$")>; - -//===----------------------------------------------------------------------===// -// Guarded storage -//===----------------------------------------------------------------------===// - -def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LGG$")>; -def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LLGFSG$")>; -def : InstRW<[WLat30, MCD], (instregex "(L|ST)GSC$")>; - -//===----------------------------------------------------------------------===// -// Decimal arithmetic -//===----------------------------------------------------------------------===// - -def : InstRW<[WLat20, RegReadAdv, FXb, VecDF2, LSU2, GroupAlone2], - (instregex "CVBG$")>; -def : InstRW<[WLat20, RegReadAdv, FXb, VecDF, LSU, GroupAlone2], - (instregex "CVB(Y)?$")>; -def : InstRW<[WLat1, FXb3, VecDF4, LSU, GroupAlone3], (instregex "CVDG$")>; -def : InstRW<[WLat1, FXb2, VecDF, LSU, GroupAlone2], (instregex "CVD(Y)?$")>; -def : InstRW<[WLat1, LSU5, GroupAlone], (instregex "MV(N|O|Z)$")>; -def : InstRW<[WLat1, LSU5, GroupAlone], (instregex "(PACK|PKA|PKU)$")>; -def : InstRW<[WLat12, LSU5, GroupAlone], (instregex "UNPK(A|U)$")>; -def : InstRW<[WLat1, FXb, LSU2, Cracked], (instregex "UNPK$")>; - -def : InstRW<[WLat5LSU, FXb, VecDFX, LSU3, GroupAlone2], - (instregex "(A|S|ZA)P$")>; -def : InstRW<[WLat1, FXb, VecDFX2, LSU3, GroupAlone2], (instregex "MP$")>; -def : InstRW<[WLat1, FXb, VecDFX4, LSU3, GroupAlone2], (instregex "DP$")>; -def : InstRW<[WLat15, FXb, VecDFX2, LSU2, GroupAlone3], (instregex "SRP$")>; -def : InstRW<[WLat8, VecDFX, LSU, LSU, GroupAlone], (instregex "CP$")>; -def : InstRW<[WLat3LSU, VecDFX, LSU, Cracked], (instregex "TP$")>; -def : InstRW<[WLat30, MCD], (instregex "ED(MK)?$")>; - -//===----------------------------------------------------------------------===// -// Access registers -//===----------------------------------------------------------------------===// - -// Extract/set/copy access register -def : InstRW<[WLat3, LSU, NormalGr], (instregex "(EAR|SAR|CPYA)$")>; - -// Load address extended -def : InstRW<[WLat5, LSU, FXa, Cracked], (instregex "LAE(Y)?$")>; - -// Load/store access multiple (not modeled precisely) -def : InstRW<[WLat20, WLat20, LSU5, GroupAlone], (instregex "LAM(Y)?$")>; -def : 
InstRW<[WLat1, LSU5, FXb, GroupAlone2], (instregex "STAM(Y)?$")>; - -//===----------------------------------------------------------------------===// -// Program mask and addressing mode -//===----------------------------------------------------------------------===// - -// Insert Program Mask -def : InstRW<[WLat3, FXa, EndGroup], (instregex "IPM$")>; - -// Set Program Mask -def : InstRW<[WLat3, LSU, EndGroup], (instregex "SPM$")>; - -// Branch and link -def : InstRW<[WLat1, FXa2, FXb, GroupAlone], (instregex "BAL(R)?$")>; - -// Test addressing mode -def : InstRW<[WLat1, FXb, NormalGr], (instregex "TAM$")>; - -// Set addressing mode -def : InstRW<[WLat1, FXb, EndGroup], (instregex "SAM(24|31|64)$")>; - -// Branch (and save) and set mode. -def : InstRW<[WLat1, FXa, FXb, GroupAlone], (instregex "BSM$")>; -def : InstRW<[WLat1, FXa2, FXb, GroupAlone], (instregex "BASSM$")>; - -//===----------------------------------------------------------------------===// -// Transactional execution -//===----------------------------------------------------------------------===// - -// Transaction begin -def : InstRW<[WLat9, LSU2, FXb5, GroupAlone2], (instregex "TBEGIN(C)?$")>; - -// Transaction end -def : InstRW<[WLat1, FXb, GroupAlone], (instregex "TEND$")>; - -// Transaction abort -def : InstRW<[WLat30, MCD], (instregex "TABORT$")>; - -// Extract Transaction Nesting Depth -def : InstRW<[WLat1, FXa, NormalGr], (instregex "ETND$")>; - -// Nontransactional store -def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "NTSTG$")>; - -//===----------------------------------------------------------------------===// -// Processor assist -//===----------------------------------------------------------------------===// - -def : InstRW<[WLat1, FXb, GroupAlone], (instregex "PPA$")>; - -//===----------------------------------------------------------------------===// -// Miscellaneous Instructions. -//===----------------------------------------------------------------------===// - -// Find leftmost one -def : InstRW<[WLat5, WLat5, FXa2, GroupAlone], (instregex "FLOGR$")>; - -// Population count -def : InstRW<[WLat3, WLat3, FXa, NormalGr], (instregex "POPCNT(Opt)?$")>; - -// String instructions -def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "SRST(U)?$")>; -def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "CUSE$")>; - -// Various complex instructions -def : InstRW<[WLat30, WLat30, WLat30, WLat30, MCD], (instregex "CFC$")>; -def : InstRW<[WLat30, WLat30, WLat30, WLat30, WLat30, WLat30, MCD], - (instregex "UPT$")>; -def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "CKSM$")>; -def : InstRW<[WLat30, WLat30, WLat30, WLat30, MCD], (instregex "CMPSC$")>; -def : InstRW<[WLat30, WLat30, WLat30, WLat30, MCD], (instregex "SORTL$")>; -def : InstRW<[WLat30, WLat30, WLat30, WLat30, MCD], (instregex "DFLTCC$")>; - -// Execute -def : InstRW<[WLat1, FXb, GroupAlone], (instregex "EX(RL)?$")>; - -//===----------------------------------------------------------------------===// -// .insn directive instructions -//===----------------------------------------------------------------------===// - -// An "empty" sched-class will be assigned instead of the "invalid sched-class". -// getNumDecoderSlots() will then return 1 instead of 0. 
-def : InstRW<[], (instregex "Insn.*")>; - - -// ----------------------------- Floating point ----------------------------- // - -//===----------------------------------------------------------------------===// -// FP: Move instructions -//===----------------------------------------------------------------------===// - -// Load zero -def : InstRW<[WLat1, FXb, NormalGr], (instregex "LZ(DR|ER)$")>; -def : InstRW<[WLat2, FXb2, Cracked], (instregex "LZXR$")>; - -// Load -def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "LER$")>; -def : InstRW<[WLat1, FXb, NormalGr], (instregex "LD(R|R32|GR)$")>; -def : InstRW<[WLat3, FXb, NormalGr], (instregex "LGDR$")>; -def : InstRW<[WLat2, FXb2, GroupAlone], (instregex "LXR$")>; - -// Load and Test -def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "LT(E|D)BR$")>; -def : InstRW<[WLat3, VecXsPm, NormalGr], (instregex "LT(E|D)BRCompare$")>; -def : InstRW<[WLat10, WLat10, VecDF4, GroupAlone], - (instregex "LTXBR(Compare)?$")>; - -// Copy sign -def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "CPSDR(d|s)(d|s)$")>; - -//===----------------------------------------------------------------------===// -// FP: Load instructions -//===----------------------------------------------------------------------===// - -def : InstRW<[WLat2LSU, VecXsPm, LSU, NormalGr], (instregex "LE(Y)?$")>; -def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LD(Y|E32)?$")>; -def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LX$")>; - -//===----------------------------------------------------------------------===// -// FP: Store instructions -//===----------------------------------------------------------------------===// - -def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "ST(E|D)(Y)?$")>; -def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "STX$")>; - -//===----------------------------------------------------------------------===// -// FP: Conversion instructions -//===----------------------------------------------------------------------===// - -// Load rounded -def : InstRW<[WLat6, VecBF, NormalGr], (instregex "LEDBR(A)?$")>; -def : InstRW<[WLat9, VecDF2, NormalGr], (instregex "L(E|D)XBR(A)?$")>; - -// Load lengthened -def : InstRW<[WLat6LSU, VecBF, LSU, NormalGr], (instregex "LDEB$")>; -def : InstRW<[WLat6, VecBF, NormalGr], (instregex "LDEBR$")>; -def : InstRW<[WLat7LSU, VecBF4, LSU, GroupAlone], (instregex "LX(E|D)B$")>; -def : InstRW<[WLat7, VecBF4, GroupAlone], (instregex "LX(E|D)BR$")>; - -// Convert from fixed / logical -def : InstRW<[WLat7, FXb, VecBF, Cracked], (instregex "C(E|D)(F|G)BR(A)?$")>; -def : InstRW<[WLat11, FXb, VecDF4, GroupAlone2], (instregex "CX(F|G)BR(A)?$")>; -def : InstRW<[WLat7, FXb, VecBF, Cracked], (instregex "C(E|D)L(F|G)BR$")>; -def : InstRW<[WLat11, FXb, VecDF4, GroupAlone2], (instregex "CXL(F|G)BR$")>; - -// Convert to fixed / logical -def : InstRW<[WLat9, WLat9, FXb, VecBF, Cracked], - (instregex "C(F|G)(E|D)BR(A)?$")>; -def : InstRW<[WLat12, WLat12, FXb, VecDF2, Cracked], - (instregex "C(F|G)XBR(A)?$")>; -def : InstRW<[WLat9, WLat9, FXb, VecBF, GroupAlone], (instregex "CLFEBR$")>; -def : InstRW<[WLat9, WLat9, FXb, VecBF, Cracked], (instregex "CLFDBR$")>; -def : InstRW<[WLat9, WLat9, FXb, VecBF, Cracked], (instregex "CLG(E|D)BR$")>; -def : InstRW<[WLat12, WLat12, FXb, VecDF2, Cracked], (instregex "CL(F|G)XBR$")>; - -//===----------------------------------------------------------------------===// -// FP: Unary arithmetic -//===----------------------------------------------------------------------===// - -// Load Complement / 
Negative / Positive -def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "L(C|N|P)(E|D)BR$")>; -def : InstRW<[WLat1, FXb, NormalGr], (instregex "L(C|N|P)DFR(_32)?$")>; -def : InstRW<[WLat10, WLat10, VecDF4, GroupAlone], (instregex "L(C|N|P)XBR$")>; - -// Square root -def : InstRW<[WLat30, VecFPd, LSU, NormalGr], (instregex "SQ(E|D)B$")>; -def : InstRW<[WLat30, VecFPd, NormalGr], (instregex "SQ(E|D)BR$")>; -def : InstRW<[WLat30, VecFPd, GroupAlone], (instregex "SQXBR$")>; - -// Load FP integer -def : InstRW<[WLat6, VecBF, NormalGr], (instregex "FI(E|D)BR(A)?$")>; -def : InstRW<[WLat10, VecDF4, GroupAlone], (instregex "FIXBR(A)?$")>; - -//===----------------------------------------------------------------------===// -// FP: Binary arithmetic -//===----------------------------------------------------------------------===// - -// Addition -def : InstRW<[WLat6LSU, WLat6LSU, RegReadAdv, VecBF, LSU, NormalGr], - (instregex "A(E|D)B$")>; -def : InstRW<[WLat6, WLat6, VecBF, NormalGr], (instregex "A(E|D)BR$")>; -def : InstRW<[WLat10, WLat10, VecDF4, GroupAlone], (instregex "AXBR$")>; - -// Subtraction -def : InstRW<[WLat6LSU, WLat6LSU, RegReadAdv, VecBF, LSU, NormalGr], - (instregex "S(E|D)B$")>; -def : InstRW<[WLat6, WLat6, VecBF, NormalGr], (instregex "S(E|D)BR$")>; -def : InstRW<[WLat10, WLat10, VecDF4, GroupAlone], (instregex "SXBR$")>; - -// Multiply -def : InstRW<[WLat6LSU, RegReadAdv, VecBF, LSU, NormalGr], - (instregex "M(D|DE|EE)B$")>; -def : InstRW<[WLat6, VecBF, NormalGr], (instregex "M(D|DE|EE)BR$")>; -def : InstRW<[WLat7LSU, RegReadAdv, VecBF4, LSU, GroupAlone], - (instregex "MXDB$")>; -def : InstRW<[WLat7, VecBF4, GroupAlone], (instregex "MXDBR$")>; -def : InstRW<[WLat15, VecDF4, GroupAlone], (instregex "MXBR$")>; - -// Multiply and add / subtract -def : InstRW<[WLat6LSU, RegReadAdv, RegReadAdv, VecBF2, LSU, GroupAlone], - (instregex "M(A|S)EB$")>; -def : InstRW<[WLat6, VecBF, GroupAlone], (instregex "M(A|S)EBR$")>; -def : InstRW<[WLat6LSU, RegReadAdv, RegReadAdv, VecBF2, LSU, GroupAlone], - (instregex "M(A|S)DB$")>; -def : InstRW<[WLat6, VecBF, NormalGr], (instregex "M(A|S)DBR$")>; - -// Division -def : InstRW<[WLat30, RegReadAdv, VecFPd, LSU, NormalGr], - (instregex "D(E|D)B$")>; -def : InstRW<[WLat30, VecFPd, NormalGr], (instregex "D(E|D)BR$")>; -def : InstRW<[WLat30, VecFPd, GroupAlone], (instregex "DXBR$")>; - -// Divide to integer -def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "DI(E|D)BR$")>; - -//===----------------------------------------------------------------------===// -// FP: Comparisons -//===----------------------------------------------------------------------===// - -// Compare -def : InstRW<[WLat3LSU, RegReadAdv, VecXsPm, LSU, NormalGr], - (instregex "(K|C)(E|D)B$")>; -def : InstRW<[WLat3, VecXsPm, NormalGr], (instregex "(K|C)(E|D)BR$")>; -def : InstRW<[WLat9, VecDF2, GroupAlone], (instregex "(K|C)XBR$")>; - -// Test Data Class -def : InstRW<[WLat5, LSU, VecXsPm, NormalGr], (instregex "TC(E|D)B$")>; -def : InstRW<[WLat10, LSU, VecDF4, GroupAlone], (instregex "TCXB$")>; - -//===----------------------------------------------------------------------===// -// FP: Floating-point control register instructions -//===----------------------------------------------------------------------===// - -def : InstRW<[WLat4, FXa, LSU, GroupAlone], (instregex "EFPC$")>; -def : InstRW<[WLat1, FXb, LSU, GroupAlone], (instregex "STFPC$")>; -def : InstRW<[WLat3, LSU, GroupAlone], (instregex "SFPC$")>; -def : InstRW<[WLat3LSU, LSU2, GroupAlone], (instregex "LFPC$")>; -def : 
InstRW<[WLat30, MCD], (instregex "SFASR$")>; -def : InstRW<[WLat30, MCD], (instregex "LFAS$")>; -def : InstRW<[WLat3, FXb, GroupAlone], (instregex "SRNM(B|T)?$")>; - - -// --------------------- Hexadecimal floating point ------------------------- // - -//===----------------------------------------------------------------------===// -// HFP: Move instructions -//===----------------------------------------------------------------------===// - -// Load and Test -def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "LT(E|D)R$")>; -def : InstRW<[WLat10, WLat10, VecDF4, GroupAlone], (instregex "LTXR$")>; - -//===----------------------------------------------------------------------===// -// HFP: Conversion instructions -//===----------------------------------------------------------------------===// - -// Load rounded -def : InstRW<[WLat6, VecBF, NormalGr], (instregex "(LEDR|LRER)$")>; -def : InstRW<[WLat6, VecBF, NormalGr], (instregex "LEXR$")>; -def : InstRW<[WLat9, VecDF2, NormalGr], (instregex "(LDXR|LRDR)$")>; - -// Load lengthened -def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LDE$")>; -def : InstRW<[WLat1, FXb, NormalGr], (instregex "LDER$")>; -def : InstRW<[WLat7LSU, VecBF4, LSU, GroupAlone], (instregex "LX(E|D)$")>; -def : InstRW<[WLat7, VecBF4, GroupAlone], (instregex "LX(E|D)R$")>; - -// Convert from fixed -def : InstRW<[WLat7, FXb, VecBF, Cracked], (instregex "C(E|D)(F|G)R$")>; -def : InstRW<[WLat11, FXb, VecDF4, GroupAlone2], (instregex "CX(F|G)R$")>; - -// Convert to fixed -def : InstRW<[WLat9, WLat9, FXb, VecBF, Cracked], (instregex "C(F|G)(E|D)R$")>; -def : InstRW<[WLat12, WLat12, FXb, VecDF2, Cracked], (instregex "C(F|G)XR$")>; - -// Convert BFP to HFP / HFP to BFP. -def : InstRW<[WLat6, WLat6, VecBF, NormalGr], (instregex "THD(E)?R$")>; -def : InstRW<[WLat6, WLat6, VecBF, NormalGr], (instregex "TB(E)?DR$")>; - -//===----------------------------------------------------------------------===// -// HFP: Unary arithmetic -//===----------------------------------------------------------------------===// - -// Load Complement / Negative / Positive -def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "L(C|N|P)(E|D)R$")>; -def : InstRW<[WLat10, WLat10, VecDF4, GroupAlone], (instregex "L(C|N|P)XR$")>; - -// Halve -def : InstRW<[WLat6, VecBF, NormalGr], (instregex "H(E|D)R$")>; - -// Square root -def : InstRW<[WLat30, VecFPd, LSU, NormalGr], (instregex "SQ(E|D)$")>; -def : InstRW<[WLat30, VecFPd, NormalGr], (instregex "SQ(E|D)R$")>; -def : InstRW<[WLat30, VecFPd, GroupAlone], (instregex "SQXR$")>; - -// Load FP integer -def : InstRW<[WLat6, VecBF, NormalGr], (instregex "FI(E|D)R$")>; -def : InstRW<[WLat10, VecDF4, GroupAlone], (instregex "FIXR$")>; - -//===----------------------------------------------------------------------===// -// HFP: Binary arithmetic -//===----------------------------------------------------------------------===// - -// Addition -def : InstRW<[WLat6LSU, WLat6LSU, RegReadAdv, VecBF, LSU, NormalGr], - (instregex "A(E|D|U|W)$")>; -def : InstRW<[WLat6, WLat6, VecBF, NormalGr], (instregex "A(E|D|U|W)R$")>; -def : InstRW<[WLat10, WLat10, VecDF4, GroupAlone], (instregex "AXR$")>; - -// Subtraction -def : InstRW<[WLat6LSU, WLat6LSU, RegReadAdv, VecBF, LSU, NormalGr], - (instregex "S(E|D|U|W)$")>; -def : InstRW<[WLat6, WLat6, VecBF, NormalGr], (instregex "S(E|D|U|W)R$")>; -def : InstRW<[WLat10, WLat10, VecDF4, GroupAlone], (instregex "SXR$")>; - -// Multiply -def : InstRW<[WLat6LSU, RegReadAdv, VecBF, LSU, NormalGr], - (instregex "M(D|DE|E|EE)$")>; -def 
: InstRW<[WLat6, VecBF, NormalGr], (instregex "M(D|DE|E|EE)R$")>; -def : InstRW<[WLat7LSU, RegReadAdv, VecBF4, LSU, GroupAlone], - (instregex "MXD$")>; -def : InstRW<[WLat7, VecBF4, GroupAlone], (instregex "MXDR$")>; -def : InstRW<[WLat30, VecDF4, GroupAlone], (instregex "MXR$")>; -def : InstRW<[WLat7LSU, RegReadAdv, VecBF4, LSU, GroupAlone], (instregex "MY$")>; -def : InstRW<[WLat6LSU, RegReadAdv, VecBF2, LSU, GroupAlone], - (instregex "MY(H|L)$")>; -def : InstRW<[WLat7, VecBF4, GroupAlone], (instregex "MYR$")>; -def : InstRW<[WLat6, VecBF, GroupAlone], (instregex "MY(H|L)R$")>; - -// Multiply and add / subtract -def : InstRW<[WLat6LSU, RegReadAdv, RegReadAdv, VecBF2, LSU, GroupAlone], - (instregex "M(A|S)(E|D)$")>; -def : InstRW<[WLat6, VecBF, GroupAlone], (instregex "M(A|S)(E|D)R$")>; -def : InstRW<[WLat7LSU, RegReadAdv, RegReadAdv, VecBF4, LSU, GroupAlone], - (instregex "MAY$")>; -def : InstRW<[WLat6LSU, RegReadAdv, RegReadAdv, VecBF2, LSU, GroupAlone], - (instregex "MAY(H|L)$")>; -def : InstRW<[WLat7, VecBF4, GroupAlone], (instregex "MAYR$")>; -def : InstRW<[WLat6, VecBF, GroupAlone], (instregex "MAY(H|L)R$")>; - -// Division -def : InstRW<[WLat30, RegReadAdv, VecFPd, LSU, NormalGr], (instregex "D(E|D)$")>; -def : InstRW<[WLat30, VecFPd, NormalGr], (instregex "D(E|D)R$")>; -def : InstRW<[WLat30, VecFPd, GroupAlone], (instregex "DXR$")>; - -//===----------------------------------------------------------------------===// -// HFP: Comparisons -//===----------------------------------------------------------------------===// - -// Compare -def : InstRW<[WLat6LSU, RegReadAdv, VecBF, LSU, NormalGr], - (instregex "C(E|D)$")>; -def : InstRW<[WLat6, VecBF, NormalGr], (instregex "C(E|D)R$")>; -def : InstRW<[WLat10, VecDF2, GroupAlone], (instregex "CXR$")>; - - -// ------------------------ Decimal floating point -------------------------- // - -//===----------------------------------------------------------------------===// -// DFP: Move instructions -//===----------------------------------------------------------------------===// - -// Load and Test -def : InstRW<[WLat8, WLat8, VecDF, NormalGr], (instregex "LTDTR$")>; -def : InstRW<[WLat10, WLat10, VecDF4, GroupAlone], (instregex "LTXTR$")>; - -//===----------------------------------------------------------------------===// -// DFP: Conversion instructions -//===----------------------------------------------------------------------===// - -// Load rounded -def : InstRW<[WLat15, VecDF, NormalGr], (instregex "LEDTR$")>; -def : InstRW<[WLat15, VecDF2, NormalGr], (instregex "LDXTR$")>; - -// Load lengthened -def : InstRW<[WLat8, VecDF, NormalGr], (instregex "LDETR$")>; -def : InstRW<[WLat10, VecDF4, GroupAlone], (instregex "LXDTR$")>; - -// Convert from fixed / logical -def : InstRW<[WLat15, FXb, VecDF, Cracked], (instregex "CDFTR(A)?$")>; -def : InstRW<[WLat30, FXb, VecDF, Cracked], (instregex "CDGTR(A)?$")>; -def : InstRW<[WLat15, FXb, VecDF4, GroupAlone2], (instregex "CXFTR(A)?$")>; -def : InstRW<[WLat30, FXb, VecDF4, GroupAlone2], (instregex "CXGTR(A)?$")>; -def : InstRW<[WLat15, FXb, VecDF, Cracked], (instregex "CDLFTR$")>; -def : InstRW<[WLat30, FXb, VecDF, Cracked], (instregex "CDLGTR$")>; -def : InstRW<[WLat15, FXb, VecDF4, GroupAlone2], (instregex "CXLFTR$")>; -def : InstRW<[WLat30, FXb, VecDF4, GroupAlone2], (instregex "CXLGTR$")>; - -// Convert to fixed / logical -def : InstRW<[WLat30, WLat30, FXb, VecDF, Cracked], - (instregex "C(F|G)DTR(A)?$")>; -def : InstRW<[WLat30, WLat30, FXb, VecDF2, Cracked], - (instregex "C(F|G)XTR(A)?$")>; -def 
: InstRW<[WLat30, WLat30, FXb, VecDF, Cracked], (instregex "CL(F|G)DTR$")>; -def : InstRW<[WLat30, WLat30, FXb, VecDF2, Cracked], (instregex "CL(F|G)XTR$")>; - -// Convert from / to signed / unsigned packed -def : InstRW<[WLat9, FXb, VecDF, Cracked], (instregex "CD(S|U)TR$")>; -def : InstRW<[WLat12, FXb2, VecDF4, GroupAlone2], (instregex "CX(S|U)TR$")>; -def : InstRW<[WLat11, FXb, VecDF, Cracked], (instregex "C(S|U)DTR$")>; -def : InstRW<[WLat15, FXb2, VecDF4, GroupAlone2], (instregex "C(S|U)XTR$")>; - -// Convert from / to zoned -def : InstRW<[WLat8LSU, LSU, VecDF, Cracked], (instregex "CDZT$")>; -def : InstRW<[WLat16LSU, LSU2, VecDF4, GroupAlone3], (instregex "CXZT$")>; -def : InstRW<[WLat1, FXb, LSU, VecDF, Cracked], (instregex "CZDT$")>; -def : InstRW<[WLat1, FXb, LSU, VecDF2, GroupAlone], (instregex "CZXT$")>; - -// Convert from / to packed -def : InstRW<[WLat8LSU, LSU, VecDF, Cracked], (instregex "CDPT$")>; -def : InstRW<[WLat16LSU, LSU2, VecDF4, GroupAlone3], (instregex "CXPT$")>; -def : InstRW<[WLat1, FXb, LSU, VecDF, Cracked], (instregex "CPDT$")>; -def : InstRW<[WLat1, FXb, LSU, VecDF2, GroupAlone], (instregex "CPXT$")>; - -// Perform floating-point operation -def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "PFPO$")>; - -//===----------------------------------------------------------------------===// -// DFP: Unary arithmetic -//===----------------------------------------------------------------------===// - -// Load FP integer -def : InstRW<[WLat8, VecDF, NormalGr], (instregex "FIDTR$")>; -def : InstRW<[WLat10, VecDF4, GroupAlone], (instregex "FIXTR$")>; - -// Extract biased exponent -def : InstRW<[WLat11, FXb, VecDF, Cracked], (instregex "EEDTR$")>; -def : InstRW<[WLat11, FXb, VecDF, Cracked], (instregex "EEXTR$")>; - -// Extract significance -def : InstRW<[WLat11, FXb, VecDF, Cracked], (instregex "ESDTR$")>; -def : InstRW<[WLat12, FXb, VecDF2, Cracked], (instregex "ESXTR$")>; - -//===----------------------------------------------------------------------===// -// DFP: Binary arithmetic -//===----------------------------------------------------------------------===// - -// Addition -def : InstRW<[WLat8, WLat8, VecDF, NormalGr], (instregex "ADTR(A)?$")>; -def : InstRW<[WLat10, WLat10, VecDF4, GroupAlone], (instregex "AXTR(A)?$")>; - -// Subtraction -def : InstRW<[WLat8, WLat8, VecDF, NormalGr], (instregex "SDTR(A)?$")>; -def : InstRW<[WLat10, WLat10, VecDF4, GroupAlone], (instregex "SXTR(A)?$")>; - -// Multiply -def : InstRW<[WLat30, VecDF, NormalGr], (instregex "MDTR(A)?$")>; -def : InstRW<[WLat30, VecDF4, GroupAlone], (instregex "MXTR(A)?$")>; - -// Division -def : InstRW<[WLat30, VecDF, NormalGr], (instregex "DDTR(A)?$")>; -def : InstRW<[WLat30, VecDF4, GroupAlone], (instregex "DXTR(A)?$")>; - -// Quantize -def : InstRW<[WLat8, WLat8, VecDF, NormalGr], (instregex "QADTR$")>; -def : InstRW<[WLat10, WLat10, VecDF4, GroupAlone], (instregex "QAXTR$")>; - -// Reround -def : InstRW<[WLat9, WLat9, FXb, VecDF, Cracked], (instregex "RRDTR$")>; -def : InstRW<[WLat11, WLat11, FXb, VecDF4, GroupAlone2], (instregex "RRXTR$")>; - -// Shift significand left/right -def : InstRW<[WLat11LSU, LSU, VecDF, GroupAlone], (instregex "S(L|R)DT$")>; -def : InstRW<[WLat11LSU, LSU, VecDF4, GroupAlone], (instregex "S(L|R)XT$")>; - -// Insert biased exponent -def : InstRW<[WLat9, FXb, VecDF, Cracked], (instregex "IEDTR$")>; -def : InstRW<[WLat11, FXb, VecDF4, GroupAlone2], (instregex "IEXTR$")>; - -//===----------------------------------------------------------------------===// -// DFP: Comparisons 
-//===----------------------------------------------------------------------===// - -// Compare -def : InstRW<[WLat8, VecDF, NormalGr], (instregex "(K|C)DTR$")>; -def : InstRW<[WLat9, VecDF2, GroupAlone], (instregex "(K|C)XTR$")>; - -// Compare biased exponent -def : InstRW<[WLat8, VecDF, NormalGr], (instregex "CEDTR$")>; -def : InstRW<[WLat8, VecDF, NormalGr], (instregex "CEXTR$")>; - -// Test Data Class/Group -def : InstRW<[WLat15, LSU, VecDF, NormalGr], (instregex "TD(C|G)(E|D)T$")>; -def : InstRW<[WLat15, LSU, VecDF2, GroupAlone], (instregex "TD(C|G)XT$")>; - - -// --------------------------------- Vector --------------------------------- // - -//===----------------------------------------------------------------------===// -// Vector: Move instructions -//===----------------------------------------------------------------------===// - -def : InstRW<[WLat1, FXb, NormalGr], (instregex "VLR(32|64)?$")>; -def : InstRW<[WLat3, FXb, NormalGr], (instregex "VLGV(B|F|G|H)?$")>; -def : InstRW<[WLat1, FXb, NormalGr], (instregex "VLVG(B|F|G|H)?$")>; -def : InstRW<[WLat3, FXb, NormalGr], (instregex "VLVGP(32)?$")>; - -//===----------------------------------------------------------------------===// -// Vector: Immediate instructions -//===----------------------------------------------------------------------===// - -def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VZERO$")>; -def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VONE$")>; -def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VGBM$")>; -def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VGM(B|F|G|H)?$")>; -def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VREPI(B|F|G|H)?$")>; -def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VLEI(B|F|G|H)$")>; - -//===----------------------------------------------------------------------===// -// Vector: Loads -//===----------------------------------------------------------------------===// - -def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VL(Align)?$")>; -def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VL(L|BB)$")>; -def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VL(32|64)$")>; -def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VLLEZ(B|F|G|H|LF)?$")>; -def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VLREP(B|F|G|H)?$")>; -def : InstRW<[WLat2LSU, RegReadAdv, VecXsPm, LSU, NormalGr], - (instregex "VLE(B|F|G|H)$")>; -def : InstRW<[WLat5LSU, RegReadAdv, FXb, LSU, VecXsPm, Cracked], - (instregex "VGE(F|G)$")>; -def : InstRW<[WLat4LSU, WLat4LSU, LSU5, GroupAlone], - (instregex "VLM(Align)?$")>; -def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VLRL(R)?$")>; - -//===----------------------------------------------------------------------===// -// Vector: Stores -//===----------------------------------------------------------------------===// - -def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "VST(Align|L|32|64)?$")>; -def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "VSTE(F|G)$")>; -def : InstRW<[WLat1, FXb, LSU, VecXsPm, Cracked], (instregex "VSTE(B|H)$")>; -def : InstRW<[WLat1, LSU2, FXb3, GroupAlone2], (instregex "VSTM(Align)?$")>; -def : InstRW<[WLat1, FXb2, LSU, Cracked], (instregex "VSCE(F|G)$")>; -def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "VSTRL(R)?$")>; - -//===----------------------------------------------------------------------===// -// Vector: Byte swaps -//===----------------------------------------------------------------------===// - -def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VLBR(H|F|G|Q)?$")>; -def : 
InstRW<[LSULatency, LSU, NormalGr], (instregex "VLER(H|F|G)?$")>; -def : InstRW<[WLat2LSU, RegReadAdv, VecXsPm, LSU, NormalGr], - (instregex "VLEBR(H|F|G)$")>; -def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VLLEBRZ(H|F|G|E)?$")>; -def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VLBRREP(H|F|G)?$")>; -def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "VSTBR(H|F|G|Q)?$")>; -def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "VSTER(H|F|G)?$")>; -def : InstRW<[WLat1, FXb, LSU, VecXsPm, Cracked], (instregex "VSTEBRH$")>; -def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "VSTEBR(F|G)$")>; - -//===----------------------------------------------------------------------===// -// Vector: Selects and permutes -//===----------------------------------------------------------------------===// - -def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VMRH(B|F|G|H)?$")>; -def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VMRL(B|F|G|H)?$")>; -def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VPERM$")>; -def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VPDI$")>; -def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VBPERM$")>; -def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VREP(B|F|G|H)?$")>; -def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VSEL$")>; - -//===----------------------------------------------------------------------===// -// Vector: Widening and narrowing -//===----------------------------------------------------------------------===// - -def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VPK(F|G|H)?$")>; -def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VPKS(F|G|H)?$")>; -def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "VPKS(F|G|H)S$")>; -def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VPKLS(F|G|H)?$")>; -def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "VPKLS(F|G|H)S$")>; -def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VSEG(B|F|H)?$")>; -def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VUPH(B|F|H)?$")>; -def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VUPL(B|F)?$")>; -def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VUPLH(B|F|H|W)?$")>; -def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VUPLL(B|F|H)?$")>; - -//===----------------------------------------------------------------------===// -// Vector: Integer arithmetic -//===----------------------------------------------------------------------===// - -def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VA(B|F|G|H|Q|C|CQ)?$")>; -def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VACC(B|F|G|H|Q|C|CQ)?$")>; -def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VAVG(B|F|G|H)?$")>; -def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VAVGL(B|F|G|H)?$")>; -def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VN(C|O|N|X)?$")>; -def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VO(C)?$")>; -def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VCKSM$")>; -def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VCLZ(B|F|G|H)?$")>; -def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VCTZ(B|F|G|H)?$")>; -def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VX$")>; -def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VGFM?$")>; -def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VGFMA(B|F|G|H)?$")>; -def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VGFM(B|F|G|H)$")>; -def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VLC(B|F|G|H)?$")>; -def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VLP(B|F|G|H)?$")>; -def : InstRW<[WLat2, VecXsPm, NormalGr], 
(instregex "VMX(B|F|G|H)?$")>; -def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VMXL(B|F|G|H)?$")>; -def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VMN(B|F|G|H)?$")>; -def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VMNL(B|F|G|H)?$")>; -def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VMAL(B|F)?$")>; -def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VMALE(B|F|H)?$")>; -def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VMALH(B|F|H|W)?$")>; -def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VMALO(B|F|H)?$")>; -def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VMAO(B|F|H)?$")>; -def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VMAE(B|F|H)?$")>; -def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VMAH(B|F|H)?$")>; -def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VME(B|F|H)?$")>; -def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VMH(B|F|H)?$")>; -def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VML(B|F)?$")>; -def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VMLE(B|F|H)?$")>; -def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VMLH(B|F|H|W)?$")>; -def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VMLO(B|F|H)?$")>; -def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VMO(B|F|H)?$")>; -def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VMSL(G)?$")>; - -def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VPOPCT(B|F|G|H)?$")>; - -def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VERLL(B|F|G|H)?$")>; -def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VERLLV(B|F|G|H)?$")>; -def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VERIM(B|F|G|H)?$")>; -def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VESL(B|F|G|H)?$")>; -def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VESLV(B|F|G|H)?$")>; -def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VESRA(B|F|G|H)?$")>; -def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VESRAV(B|F|G|H)?$")>; -def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VESRL(B|F|G|H)?$")>; -def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VESRLV(B|F|G|H)?$")>; - -def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VSL(DB)?$")>; -def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VSLB$")>; -def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VSR(A|L)$")>; -def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VSR(A|L)B$")>; -def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VSLD$")>; -def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VSRD$")>; - -def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VSB(I|IQ|CBI|CBIQ)?$")>; -def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VSCBI(B|F|G|H|Q)?$")>; -def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VS(F|G|H|Q)?$")>; - -def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VSUM(B|H)?$")>; -def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VSUMG(F|H)?$")>; -def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VSUMQ(F|G)?$")>; - -//===----------------------------------------------------------------------===// -// Vector: Integer comparison -//===----------------------------------------------------------------------===// - -def : InstRW<[WLat3, VecXsPm, NormalGr], (instregex "VEC(B|F|G|H)?$")>; -def : InstRW<[WLat3, VecXsPm, NormalGr], (instregex "VECL(B|F|G|H)?$")>; -def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VCEQ(B|F|G|H)?$")>; -def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "VCEQ(B|F|G|H)S$")>; -def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VCH(B|F|G|H)?$")>; -def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex 
"VCH(B|F|G|H)S$")>; -def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VCHL(B|F|G|H)?$")>; -def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "VCHL(B|F|G|H)S$")>; -def : InstRW<[WLat4, VecStr, NormalGr], (instregex "VTM$")>; - -//===----------------------------------------------------------------------===// -// Vector: Floating-point arithmetic -//===----------------------------------------------------------------------===// - -// Conversion and rounding -def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VCFP(S|L)$")>; -def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VCD(L)?G$")>; -def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VCD(L)?GB$")>; -def : InstRW<[WLat6, VecBF, NormalGr], (instregex "WCD(L)?GB$")>; -def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VCE(L)?FB$")>; -def : InstRW<[WLat6, VecBF, NormalGr], (instregex "WCE(L)?FB$")>; -def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VC(S|L)FP$")>; -def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VC(L)?GD$")>; -def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VC(L)?GDB$")>; -def : InstRW<[WLat6, VecBF, NormalGr], (instregex "WC(L)?GDB$")>; -def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VC(L)?FEB$")>; -def : InstRW<[WLat6, VecBF, NormalGr], (instregex "WC(L)?FEB$")>; -def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VL(DE|ED)$")>; -def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VL(DE|ED)B$")>; -def : InstRW<[WLat6, VecBF, NormalGr], (instregex "WL(DE|ED)B$")>; -def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VFL(L|R)$")>; -def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VFL(LS|RD)$")>; -def : InstRW<[WLat6, VecBF, NormalGr], (instregex "WFL(LS|RD)$")>; -def : InstRW<[WLat6, VecBF, NormalGr], (instregex "WFLLD$")>; -def : InstRW<[WLat10, VecDF2, NormalGr], (instregex "WFLRX$")>; -def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VFI(DB)?$")>; -def : InstRW<[WLat6, VecBF, NormalGr], (instregex "WFIDB$")>; -def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VFISB$")>; -def : InstRW<[WLat6, VecBF, NormalGr], (instregex "WFISB$")>; -def : InstRW<[WLat10, VecDF2, NormalGr], (instregex "WFIXB$")>; - -// Sign operations -def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VFPSO$")>; -def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "(V|W)FPSODB$")>; -def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "(V|W)FPSOSB$")>; -def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "WFPSOXB$")>; -def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "(V|W)FL(C|N|P)DB$")>; -def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "(V|W)FL(C|N|P)SB$")>; -def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "WFL(C|N|P)XB$")>; - -// Minimum / maximum -def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VF(MAX|MIN)$")>; -def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VF(MAX|MIN)DB$")>; -def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "WF(MAX|MIN)DB$")>; -def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VF(MAX|MIN)SB$")>; -def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "WF(MAX|MIN)SB$")>; -def : InstRW<[WLat2, VecDFX, NormalGr], (instregex "WF(MAX|MIN)XB$")>; - -// Test data class -def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "VFTCI$")>; -def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "(V|W)FTCIDB$")>; -def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "(V|W)FTCISB$")>; -def : InstRW<[WLat3, WLat3, VecDFX, NormalGr], (instregex "WFTCIXB$")>; - -// Add / subtract -def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VF(A|S)$")>; -def : InstRW<[WLat6, VecBF, NormalGr], 
(instregex "VF(A|S)DB$")>; -def : InstRW<[WLat6, VecBF, NormalGr], (instregex "WF(A|S)DB$")>; -def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VF(A|S)SB$")>; -def : InstRW<[WLat6, VecBF, NormalGr], (instregex "WF(A|S)SB$")>; -def : InstRW<[WLat10, VecDF2, NormalGr], (instregex "WF(A|S)XB$")>; - -// Multiply / multiply-and-add/subtract -def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VFM(DB)?$")>; -def : InstRW<[WLat6, VecBF, NormalGr], (instregex "WFM(D|S)B$")>; -def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VFMSB$")>; -def : InstRW<[WLat20, VecDF2, NormalGr], (instregex "WFMXB$")>; -def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VF(N)?M(A|S)$")>; -def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VF(N)?M(A|S)DB$")>; -def : InstRW<[WLat6, VecBF, NormalGr], (instregex "WF(N)?M(A|S)DB$")>; -def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VF(N)?M(A|S)SB$")>; -def : InstRW<[WLat6, VecBF, NormalGr], (instregex "WF(N)?M(A|S)SB$")>; -def : InstRW<[WLat30, VecDF2, NormalGr], (instregex "WF(N)?M(A|S)XB$")>; - -// Divide / square root -def : InstRW<[WLat30, VecFPd, NormalGr], (instregex "VFD$")>; -def : InstRW<[WLat30, VecFPd, NormalGr], (instregex "(V|W)FDDB$")>; -def : InstRW<[WLat30, VecFPd, NormalGr], (instregex "(V|W)FDSB$")>; -def : InstRW<[WLat30, VecFPd, NormalGr], (instregex "WFDXB$")>; -def : InstRW<[WLat30, VecFPd, NormalGr], (instregex "VFSQ$")>; -def : InstRW<[WLat30, VecFPd, NormalGr], (instregex "(V|W)FSQDB$")>; -def : InstRW<[WLat30, VecFPd, NormalGr], (instregex "(V|W)FSQSB$")>; -def : InstRW<[WLat30, VecFPd, NormalGr], (instregex "WFSQXB$")>; - -//===----------------------------------------------------------------------===// -// Vector: Floating-point comparison -//===----------------------------------------------------------------------===// - -def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VF(C|K)(E|H|HE)$")>; -def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VF(C|K)(E|H|HE)DB$")>; -def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "WFC(E|H|HE)DB$")>; -def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "WFK(E|H|HE)DB$")>; -def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VF(C|K)(E|H|HE)SB$")>; -def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "WFC(E|H|HE)SB$")>; -def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "WFK(E|H|HE)SB$")>; -def : InstRW<[WLat2, VecDFX, NormalGr], (instregex "WFC(E|H|HE)XB$")>; -def : InstRW<[WLat2, VecDFX, NormalGr], (instregex "WFK(E|H|HE)XB$")>; -def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "VFC(E|H|HE)DBS$")>; -def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "VFK(E|H|HE)DBS$")>; -def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], - (instregex "WF(C|K)(E|H|HE)DBS$")>; -def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], - (instregex "VF(C|K)(E|H|HE)SBS$")>; -def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "WFC(E|H|HE)SBS$")>; -def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "WFK(E|H|HE)SBS$")>; -def : InstRW<[WLat3, WLat3, VecDFX, NormalGr], (instregex "WFC(E|H|HE)XBS$")>; -def : InstRW<[WLat3, WLat3, VecDFX, NormalGr], (instregex "WFK(E|H|HE)XBS$")>; -def : InstRW<[WLat3, VecXsPm, NormalGr], (instregex "WF(C|K)$")>; -def : InstRW<[WLat3, VecXsPm, NormalGr], (instregex "WF(C|K)DB$")>; -def : InstRW<[WLat3, VecXsPm, NormalGr], (instregex "WF(C|K)SB$")>; -def : InstRW<[WLat3, VecDFX, NormalGr], (instregex "WF(C|K)XB$")>; - -//===----------------------------------------------------------------------===// -// Vector: Floating-point insertion and extraction 
-//===----------------------------------------------------------------------===// - -def : InstRW<[WLat1, FXb, NormalGr], (instregex "LEFR$")>; -def : InstRW<[WLat3, FXb, NormalGr], (instregex "LFER$")>; - -//===----------------------------------------------------------------------===// -// Vector: String instructions -//===----------------------------------------------------------------------===// - -def : InstRW<[WLat3, VecStr, NormalGr], (instregex "VFAE(B)?$")>; -def : InstRW<[WLat3, VecStr, NormalGr], (instregex "VFAE(F|H)$")>; -def : InstRW<[WLat4, WLat4, VecStr, NormalGr], (instregex "VFAE(B|F|H)S$")>; -def : InstRW<[WLat3, VecStr, NormalGr], (instregex "VFAEZ(B|F|H)$")>; -def : InstRW<[WLat4, WLat4, VecStr, NormalGr], (instregex "VFAEZ(B|F|H)S$")>; -def : InstRW<[WLat3, VecStr, NormalGr], (instregex "VFEE(B|F|H|ZB|ZF|ZH)?$")>; -def : InstRW<[WLat4, WLat4, VecStr, NormalGr], - (instregex "VFEE(B|F|H|ZB|ZF|ZH)S$")>; -def : InstRW<[WLat3, VecStr, NormalGr], (instregex "VFENE(B|F|H|ZB|ZF|ZH)?$")>; -def : InstRW<[WLat4, WLat4, VecStr, NormalGr], - (instregex "VFENE(B|F|H|ZB|ZF|ZH)S$")>; -def : InstRW<[WLat3, VecStr, NormalGr], (instregex "VISTR(B|F|H)?$")>; -def : InstRW<[WLat4, WLat4, VecStr, NormalGr], (instregex "VISTR(B|F|H)S$")>; -def : InstRW<[WLat3, VecStr, NormalGr], (instregex "VSTRC(B|F|H)?$")>; -def : InstRW<[WLat4, WLat4, VecStr, NormalGr], (instregex "VSTRC(B|F|H)S$")>; -def : InstRW<[WLat3, VecStr, NormalGr], (instregex "VSTRCZ(B|F|H)$")>; -def : InstRW<[WLat4, WLat4, VecStr, NormalGr], (instregex "VSTRCZ(B|F|H)S$")>; -def : InstRW<[WLat4, WLat4, VecStr, NormalGr], (instregex "VSTRS(B|F|H)?$")>; -def : InstRW<[WLat4, WLat4, VecStr, NormalGr], (instregex "VSTRSZ(B|F|H)$")>; - -//===----------------------------------------------------------------------===// -// Vector: Packed-decimal instructions -//===----------------------------------------------------------------------===// - -def : InstRW<[WLat2, VecDFX, NormalGr], (instregex "VLIP$")>; -def : InstRW<[WLat6, VecDFX, LSU, GroupAlone2], (instregex "VPKZ$")>; -def : InstRW<[WLat1, VecDFX, FXb, LSU2, GroupAlone2], (instregex "VUPKZ$")>; -def : InstRW<[WLat20, WLat20, VecDF2, FXb, GroupAlone], - (instregex "VCVB(G)?(Opt)?$")>; -def : InstRW<[WLat15, WLat15, VecDF2, FXb, GroupAlone], - (instregex "VCVD(G)?$")>; -def : InstRW<[WLat4, WLat4, VecDFX, NormalGr], (instregex "V(A|S)P$")>; -def : InstRW<[WLat30, WLat30, VecDF2, GroupAlone], (instregex "VM(S)?P$")>; -def : InstRW<[WLat30, WLat30, VecDF2, GroupAlone], (instregex "V(D|R)P$")>; -def : InstRW<[WLat30, WLat30, VecDF2, GroupAlone], (instregex "VSDP$")>; -def : InstRW<[WLat10, WLat10, VecDF2, NormalGr], (instregex "VSRP$")>; -def : InstRW<[WLat4, WLat4, VecDFX, NormalGr], (instregex "VPSOP$")>; -def : InstRW<[WLat2, VecDFX, NormalGr], (instregex "V(T|C)P$")>; - - -// -------------------------------- System ---------------------------------- // - -//===----------------------------------------------------------------------===// -// System: Program-Status Word Instructions -//===----------------------------------------------------------------------===// - -def : InstRW<[WLat30, WLat30, MCD], (instregex "EPSW$")>; -def : InstRW<[WLat20, GroupAlone3], (instregex "LPSW(E)?$")>; -def : InstRW<[WLat3, FXa, GroupAlone], (instregex "IPK$")>; -def : InstRW<[WLat1, LSU, EndGroup], (instregex "SPKA$")>; -def : InstRW<[WLat1, LSU, EndGroup], (instregex "SSM$")>; -def : InstRW<[WLat1, FXb, LSU, GroupAlone], (instregex "ST(N|O)SM$")>; -def : InstRW<[WLat3, FXa, NormalGr], (instregex 
"IAC$")>; -def : InstRW<[WLat1, LSU, EndGroup], (instregex "SAC(F)?$")>; - -//===----------------------------------------------------------------------===// -// System: Control Register Instructions -//===----------------------------------------------------------------------===// - -def : InstRW<[WLat4LSU, WLat4LSU, LSU2, GroupAlone], (instregex "LCTL(G)?$")>; -def : InstRW<[WLat1, LSU5, FXb, GroupAlone2], (instregex "STCT(L|G)$")>; -def : InstRW<[LSULatency, LSU, NormalGr], (instregex "E(P|S)A(I)?R$")>; -def : InstRW<[WLat30, MCD], (instregex "SSA(I)?R$")>; -def : InstRW<[WLat30, MCD], (instregex "ESEA$")>; - -//===----------------------------------------------------------------------===// -// System: Prefix-Register Instructions -//===----------------------------------------------------------------------===// - -def : InstRW<[WLat30, MCD], (instregex "S(T)?PX$")>; - -//===----------------------------------------------------------------------===// -// System: Storage-Key and Real Memory Instructions -//===----------------------------------------------------------------------===// - -def : InstRW<[WLat30, MCD], (instregex "ISKE$")>; -def : InstRW<[WLat30, MCD], (instregex "IVSK$")>; -def : InstRW<[WLat30, MCD], (instregex "SSKE(Opt)?$")>; -def : InstRW<[WLat30, MCD], (instregex "RRB(E|M)$")>; -def : InstRW<[WLat30, MCD], (instregex "IRBM$")>; -def : InstRW<[WLat30, MCD], (instregex "PFMF$")>; -def : InstRW<[WLat30, WLat30, MCD], (instregex "TB$")>; -def : InstRW<[WLat30, MCD], (instregex "PGIN$")>; -def : InstRW<[WLat30, MCD], (instregex "PGOUT$")>; - -//===----------------------------------------------------------------------===// -// System: Dynamic-Address-Translation Instructions -//===----------------------------------------------------------------------===// - -def : InstRW<[WLat30, MCD], (instregex "IPTE(Opt)?(Opt)?$")>; -def : InstRW<[WLat30, MCD], (instregex "IDTE(Opt)?$")>; -def : InstRW<[WLat30, MCD], (instregex "CRDTE(Opt)?$")>; -def : InstRW<[WLat30, MCD], (instregex "PTLB$")>; -def : InstRW<[WLat30, WLat30, MCD], (instregex "CSP(G)?$")>; -def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "LPTEA$")>; -def : InstRW<[WLat30, WLat30, MCD], (instregex "LRA(Y|G)?$")>; -def : InstRW<[WLat30, MCD], (instregex "STRAG$")>; -def : InstRW<[WLat30, MCD], (instregex "LURA(G)?$")>; -def : InstRW<[WLat30, MCD], (instregex "STUR(A|G)$")>; -def : InstRW<[WLat30, MCD], (instregex "TPROT$")>; - -//===----------------------------------------------------------------------===// -// System: Memory-move Instructions -//===----------------------------------------------------------------------===// - -def : InstRW<[WLat4LSU, FXa2, FXb, LSU5, GroupAlone2], (instregex "MVC(K|P|S)$")>; -def : InstRW<[WLat1, FXa, LSU5, GroupAlone2], (instregex "MVC(S|D)K$")>; -def : InstRW<[WLat30, MCD], (instregex "MVCOS$")>; -def : InstRW<[WLat30, MCD], (instregex "MVPG$")>; - -//===----------------------------------------------------------------------===// -// System: Address-Space Instructions -//===----------------------------------------------------------------------===// - -def : InstRW<[WLat30, MCD], (instregex "LASP$")>; -def : InstRW<[WLat1, LSU, GroupAlone], (instregex "PALB$")>; -def : InstRW<[WLat30, MCD], (instregex "PC$")>; -def : InstRW<[WLat30, MCD], (instregex "PR$")>; -def : InstRW<[WLat30, MCD], (instregex "PT(I)?$")>; -def : InstRW<[WLat30, MCD], (instregex "RP$")>; -def : InstRW<[WLat30, MCD], (instregex "BS(G|A)$")>; -def : InstRW<[WLat30, MCD], (instregex "TAR$")>; - 
-//===----------------------------------------------------------------------===// -// System: Linkage-Stack Instructions -//===----------------------------------------------------------------------===// - -def : InstRW<[WLat30, MCD], (instregex "BAKR$")>; -def : InstRW<[WLat30, MCD], (instregex "EREG(G)?$")>; -def : InstRW<[WLat30, WLat30, MCD], (instregex "(E|M)STA$")>; - -//===----------------------------------------------------------------------===// -// System: Time-Related Instructions -//===----------------------------------------------------------------------===// - -def : InstRW<[WLat30, MCD], (instregex "PTFF$")>; -def : InstRW<[WLat30, MCD], (instregex "SCK(PF|C)?$")>; -def : InstRW<[WLat1, LSU2, GroupAlone], (instregex "SPT$")>; -def : InstRW<[WLat15, LSU3, FXa2, FXb, GroupAlone2], (instregex "STCK(F)?$")>; -def : InstRW<[WLat20, LSU4, FXa2, FXb2, GroupAlone3], (instregex "STCKE$")>; -def : InstRW<[WLat30, MCD], (instregex "STCKC$")>; -def : InstRW<[WLat1, LSU2, FXb, Cracked], (instregex "STPT$")>; - -//===----------------------------------------------------------------------===// -// System: CPU-Related Instructions -//===----------------------------------------------------------------------===// - -def : InstRW<[WLat30, MCD], (instregex "STAP$")>; -def : InstRW<[WLat30, MCD], (instregex "STIDP$")>; -def : InstRW<[WLat30, WLat30, MCD], (instregex "STSI$")>; -def : InstRW<[WLat30, WLat30, MCD], (instregex "STFL(E)?$")>; -def : InstRW<[WLat30, MCD], (instregex "ECAG$")>; -def : InstRW<[WLat30, WLat30, MCD], (instregex "ECTG$")>; -def : InstRW<[WLat30, MCD], (instregex "PTF$")>; -def : InstRW<[WLat30, MCD], (instregex "PCKMO$")>; - -//===----------------------------------------------------------------------===// -// System: Miscellaneous Instructions -//===----------------------------------------------------------------------===// - -def : InstRW<[WLat30, MCD], (instregex "SVC$")>; -def : InstRW<[WLat1, FXb, GroupAlone], (instregex "MC$")>; -def : InstRW<[WLat30, MCD], (instregex "DIAG$")>; -def : InstRW<[WLat1, FXb, NormalGr], (instregex "TRAC(E|G)$")>; -def : InstRW<[WLat30, MCD], (instregex "TRAP(2|4)$")>; -def : InstRW<[WLat30, MCD], (instregex "SIG(P|A)$")>; -def : InstRW<[WLat30, MCD], (instregex "SIE$")>; - -//===----------------------------------------------------------------------===// -// System: CPU-Measurement Facility Instructions -//===----------------------------------------------------------------------===// - -def : InstRW<[WLat1, FXb, NormalGr], (instregex "LPP$")>; -def : InstRW<[WLat30, WLat30, MCD], (instregex "ECPGA$")>; -def : InstRW<[WLat30, WLat30, MCD], (instregex "E(C|P)CTR$")>; -def : InstRW<[WLat30, MCD], (instregex "LCCTL$")>; -def : InstRW<[WLat30, MCD], (instregex "L(P|S)CTL$")>; -def : InstRW<[WLat30, MCD], (instregex "Q(S|CTR)I$")>; -def : InstRW<[WLat30, MCD], (instregex "S(C|P)CTR$")>; - -//===----------------------------------------------------------------------===// -// System: I/O Instructions -//===----------------------------------------------------------------------===// - -def : InstRW<[WLat30, MCD], (instregex "(C|H|R|X)SCH$")>; -def : InstRW<[WLat30, MCD], (instregex "(M|S|ST|T)SCH$")>; -def : InstRW<[WLat30, MCD], (instregex "RCHP$")>; -def : InstRW<[WLat30, MCD], (instregex "SCHM$")>; -def : InstRW<[WLat30, MCD], (instregex "STC(PS|RW)$")>; -def : InstRW<[WLat30, MCD], (instregex "TPI$")>; -def : InstRW<[WLat30, MCD], (instregex "SAL$")>; - -} - diff --git a/lib/Target/SystemZ/SystemZScheduleZ15.td 
b/lib/Target/SystemZ/SystemZScheduleZ15.td
new file mode 100644
index 000000000000..56ceb88f35d4
--- /dev/null
+++ b/lib/Target/SystemZ/SystemZScheduleZ15.td
@@ -0,0 +1,1695 @@
+//-- SystemZScheduleZ15.td - SystemZ Scheduling Definitions ----*- tblgen -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the machine model for Z15 to support instruction
+// scheduling and other instruction cost heuristics.
+//
+// Pseudos expanded right after isel do not need to be modelled here.
+//
+//===----------------------------------------------------------------------===//
+
+def Z15Model : SchedMachineModel {
+
+  let UnsupportedFeatures = Arch13UnsupportedFeatures.List;
+
+  let IssueWidth = 6; // Number of instructions decoded per cycle.
+  let MicroOpBufferSize = 60; // Issue queues
+  let LoadLatency = 1; // Optimistic load latency.
+
+  let PostRAScheduler = 1;
+
+  // Extra cycles for a mispredicted branch.
+  let MispredictPenalty = 20;
+}
+
+let SchedModel = Z15Model in {
+// These definitions need the SchedModel value. They could be put in a
+// subtarget common include file, but it seems the include system in Tablegen
+// currently (2016) rejects multiple includes of same file.
+
+// Decoder grouping rules
+let NumMicroOps = 1 in {
+  def : WriteRes;
+  def : WriteRes { let BeginGroup = 1; }
+  def : WriteRes { let EndGroup = 1; }
+}
+def : WriteRes {
+  let NumMicroOps = 2;
+  let BeginGroup = 1;
+}
+def : WriteRes {
+  let NumMicroOps = 3;
+  let BeginGroup = 1;
+  let EndGroup = 1;
+}
+def : WriteRes {
+  let NumMicroOps = 6;
+  let BeginGroup = 1;
+  let EndGroup = 1;
+}
+def : WriteRes {
+  let NumMicroOps = 9;
+  let BeginGroup = 1;
+  let EndGroup = 1;
+}
+
+// Incoming latency removed from the register operand which is used together
+// with a memory operand by the instruction.
+def : ReadAdvance;
+
+// LoadLatency (above) is not used for instructions in this file. This is
+// instead the role of LSULatency, which is the latency value added to the
+// result of loads and instructions with folded memory operands.
+def : WriteRes { let Latency = 4; let NumMicroOps = 0; }
+
+let NumMicroOps = 0 in {
+  foreach L = 1-30 in
+    def : WriteRes("WLat"#L), []> { let Latency = L; }
+}
+
+// Execution units.
+def Z15_FXaUnit : ProcResource<2>;
+def Z15_FXbUnit : ProcResource<2>;
+def Z15_LSUnit : ProcResource<2>;
+def Z15_VecUnit : ProcResource<2>;
+def Z15_VecFPdUnit : ProcResource<2> { let BufferSize = 1; /* blocking */ }
+def Z15_VBUnit : ProcResource<2>;
+def Z15_MCD : ProcResource<1>;
+
+// Subtarget specific definitions of scheduling resources.
+let NumMicroOps = 0 in { + def : WriteRes<FXa, [Z15_FXaUnit]>; + def : WriteRes<FXb, [Z15_FXbUnit]>; + def : WriteRes<LSU, [Z15_LSUnit]>; + def : WriteRes<VecBF, [Z15_VecUnit]>; + def : WriteRes<VecDF, [Z15_VecUnit]>; + def : WriteRes<VecDFX, [Z15_VecUnit]>; + def : WriteRes<VecMul, [Z15_VecUnit]>; + def : WriteRes<VecStr, [Z15_VecUnit]>; + def : WriteRes<VecXsPm, [Z15_VecUnit]>; + foreach Num = 2-5 in { let ResourceCycles = [Num] in { + def : WriteRes<!cast<SchedWrite>("FXa"#Num), [Z15_FXaUnit]>; + def : WriteRes<!cast<SchedWrite>("FXb"#Num), [Z15_FXbUnit]>; + def : WriteRes<!cast<SchedWrite>("LSU"#Num), [Z15_LSUnit]>; + def : WriteRes<!cast<SchedWrite>("VecBF"#Num), [Z15_VecUnit]>; + def : WriteRes<!cast<SchedWrite>("VecDF"#Num), [Z15_VecUnit]>; + def : WriteRes<!cast<SchedWrite>("VecDFX"#Num), [Z15_VecUnit]>; + def : WriteRes<!cast<SchedWrite>("VecMul"#Num), [Z15_VecUnit]>; + def : WriteRes<!cast<SchedWrite>("VecStr"#Num), [Z15_VecUnit]>; + def : WriteRes<!cast<SchedWrite>("VecXsPm"#Num), [Z15_VecUnit]>; + }} + + def : WriteRes<VecFPd, [Z15_VecFPdUnit]> { let ResourceCycles = [30]; } + + def : WriteRes<VBU, [Z15_VBUnit]>; // Virtual Branching Unit +} + +def : WriteRes<MCD, [Z15_MCD]> { let NumMicroOps = 3; + let BeginGroup = 1; + let EndGroup = 1; } + +// -------------------------- INSTRUCTIONS ---------------------------------- // + +// InstRW constructs have been used in order to preserve the +// readability of the InstrInfo files. + +// For each instruction, as matched by a regexp, provide a list of +// resources that it needs. These will be combined into a SchedClass. + +//===----------------------------------------------------------------------===// +// Stack allocation +//===----------------------------------------------------------------------===// + +// Pseudo -> LA / LAY +def : InstRW<[WLat1, FXa, NormalGr], (instregex "ADJDYNALLOC$")>; + +//===----------------------------------------------------------------------===// +// Branch instructions +//===----------------------------------------------------------------------===// + +// Branch +def : InstRW<[WLat1, VBU, NormalGr], (instregex "(Call)?BRC(L)?(Asm.*)?$")>; +def : InstRW<[WLat1, VBU, NormalGr], (instregex "(Call)?J(G)?(Asm.*)?$")>; +def : InstRW<[WLat1, FXb, NormalGr], (instregex "(Call)?BC(R)?(Asm.*)?$")>; +def : InstRW<[WLat1, FXb, NormalGr], (instregex "(Call)?B(R)?(Asm.*)?$")>; +def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "BI(C)?(Asm.*)?$")>; +def : InstRW<[WLat1, FXa, EndGroup], (instregex "BRCT(G)?$")>; +def : InstRW<[WLat1, FXa, FXb, GroupAlone], (instregex "BRCTH$")>; +def : InstRW<[WLat1, FXa, FXb, GroupAlone], (instregex "BCT(G)?(R)?$")>; +def : InstRW<[WLat1, FXa2, FXb2, GroupAlone2], + (instregex "B(R)?X(H|L).*$")>; + +// Compare and branch +def : InstRW<[WLat1, FXb, NormalGr], (instregex "C(L)?(G)?(I|R)J(Asm.*)?$")>; +def : InstRW<[WLat1, FXb2, GroupAlone], + (instregex "C(L)?(G)?(I|R)B(Call|Return|Asm.*)?$")>; + +//===----------------------------------------------------------------------===// +// Trap instructions +//===----------------------------------------------------------------------===// + +// Trap +def : InstRW<[WLat1, VBU, NormalGr], (instregex "(Cond)?Trap$")>; + +// Compare and trap +def : InstRW<[WLat1, FXb, NormalGr], (instregex "C(G)?(I|R)T(Asm.*)?$")>; +def : InstRW<[WLat1, FXb, NormalGr], (instregex "CL(G)?RT(Asm.*)?$")>; +def : InstRW<[WLat1, FXb, NormalGr], (instregex "CL(F|G)IT(Asm.*)?$")>; +def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "CL(G)?T(Asm.*)?$")>; + +//===----------------------------------------------------------------------===// +// Call and return instructions +//===----------------------------------------------------------------------===// + +// Call +def : InstRW<[WLat1, VBU, FXa2, GroupAlone], (instregex "(Call)?BRAS$")>; +def : InstRW<[WLat1, FXa2, FXb, GroupAlone], (instregex "(Call)?BRASL$")>; +def : InstRW<[WLat1, FXa2, FXb, GroupAlone], (instregex
"(Call)?BAS(R)?$")>; +def : InstRW<[WLat1, FXa2, FXb, GroupAlone], (instregex "TLS_(G|L)DCALL$")>; + +// Return +def : InstRW<[WLat1, FXb, EndGroup], (instregex "Return$")>; +def : InstRW<[WLat1, FXb, NormalGr], (instregex "CondReturn$")>; + +//===----------------------------------------------------------------------===// +// Move instructions +//===----------------------------------------------------------------------===// + +// Moves +def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "MV(G|H)?HI$")>; +def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "MVI(Y)?$")>; + +// Move character +def : InstRW<[WLat1, FXb, LSU3, GroupAlone], (instregex "MVC$")>; +def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "MVCL(E|U)?$")>; +def : InstRW<[WLat1, LSU2, GroupAlone], (instregex "MVCRL$")>; + +// Pseudo -> reg move +def : InstRW<[WLat1, FXa, NormalGr], (instregex "COPY(_TO_REGCLASS)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "EXTRACT_SUBREG$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "INSERT_SUBREG$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "REG_SEQUENCE$")>; + +// Loads +def : InstRW<[LSULatency, LSU, NormalGr], (instregex "L(Y|FH|RL|Mux)?$")>; +def : InstRW<[LSULatency, LSULatency, LSU, NormalGr], (instregex "LCBB$")>; +def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LG(RL)?$")>; +def : InstRW<[LSULatency, LSU, NormalGr], (instregex "L128$")>; + +def : InstRW<[WLat1, FXa, NormalGr], (instregex "LLIH(F|H|L)$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "LLIL(F|H|L)$")>; + +def : InstRW<[WLat1, FXa, NormalGr], (instregex "LG(F|H)I$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "LHI(Mux)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "LR(Mux)?$")>; + +// Load and zero rightmost byte +def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LZR(F|G)$")>; + +// Load and trap +def : InstRW<[WLat1LSU, FXb, LSU, NormalGr], (instregex "L(FH|G)?AT$")>; + +// Load and test +def : InstRW<[WLat1LSU, WLat1LSU, LSU, FXa, NormalGr], (instregex "LT(G)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "LT(G)?R$")>; + +// Stores +def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "STG(RL)?$")>; +def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "ST128$")>; +def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "ST(Y|FH|RL|Mux)?$")>; + +// String moves. 
+def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "MVST$")>; + +//===----------------------------------------------------------------------===// +// Conditional move instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat2, FXa, NormalGr], (instregex "LOCRMux$")>; +def : InstRW<[WLat2, FXa, NormalGr], (instregex "LOC(G|FH)?R(Asm.*)?$")>; +def : InstRW<[WLat2, FXa, NormalGr], (instregex "LOC(G|H)?HI(Mux|(Asm.*))?$")>; +def : InstRW<[WLat2LSU, RegReadAdv, FXa, LSU, NormalGr], + (instregex "LOC(G|FH|Mux)?(Asm.*)?$")>; +def : InstRW<[WLat1, FXb, LSU, NormalGr], + (instregex "STOC(G|FH|Mux)?(Asm.*)?$")>; + +def : InstRW<[WLat2, FXa, NormalGr], (instregex "SELRMux$")>; +def : InstRW<[WLat2, FXa, NormalGr], (instregex "SEL(G|FH)?R(Asm.*)?$")>; + +//===----------------------------------------------------------------------===// +// Sign extensions +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat1, FXa, NormalGr], (instregex "L(B|H|G)R$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "LG(B|H|F)R$")>; + +def : InstRW<[WLat1LSU, WLat1LSU, FXa, LSU, NormalGr], (instregex "LTGF$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "LTGFR$")>; + +def : InstRW<[WLat1LSU, FXa, LSU, NormalGr], (instregex "LB(H|Mux)?$")>; +def : InstRW<[WLat1LSU, FXa, LSU, NormalGr], (instregex "LH(Y)?$")>; +def : InstRW<[WLat1LSU, FXa, LSU, NormalGr], (instregex "LH(H|Mux|RL)$")>; +def : InstRW<[WLat1LSU, FXa, LSU, NormalGr], (instregex "LG(B|H|F)$")>; +def : InstRW<[WLat1LSU, FXa, LSU, NormalGr], (instregex "LG(H|F)RL$")>; + +//===----------------------------------------------------------------------===// +// Zero extensions +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat1, FXa, NormalGr], (instregex "LLCR(Mux)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "LLHR(Mux)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "LLG(C|H|F|T)R$")>; +def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LLC(Mux)?$")>; +def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LLH(Mux)?$")>; +def : InstRW<[WLat1LSU, FXa, LSU, NormalGr], (instregex "LL(C|H)H$")>; +def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LLHRL$")>; +def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LLG(C|H|F|T|HRL|FRL)$")>; + +// Load and zero rightmost byte +def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LLZRGF$")>; + +// Load and trap +def : InstRW<[WLat1LSU, FXb, LSU, NormalGr], (instregex "LLG(F|T)?AT$")>; + +//===----------------------------------------------------------------------===// +// Truncations +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "STC(H|Y|Mux)?$")>; +def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "STH(H|Y|RL|Mux)?$")>; +def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "STCM(H|Y)?$")>; + +//===----------------------------------------------------------------------===// +// Multi-register moves +//===----------------------------------------------------------------------===// + +// Load multiple (estimated average of 5 ops) +def : InstRW<[WLat10, WLat10, LSU5, GroupAlone], (instregex "LM(H|Y|G)?$")>; + +// Load multiple disjoint +def : InstRW<[WLat30, WLat30, MCD], (instregex "LMD$")>; + +// Store multiple +def : InstRW<[WLat1, LSU2, FXb3, GroupAlone], (instregex "STM(G|H|Y)?$")>; + 
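[Editorial note, not part of the upstream patch] The InstRW entries in this file all follow one convention that is easy to lose in the flattened diff: the leading WLat*/LSULatency writes give the latency of each register result defined by the matched instructions, RegReadAdv (where present) marks a register operand that is needed later than the memory-address operands, the FXa/FXb/LSU/Vec* entries name the execution resources consumed, and the trailing NormalGr/Cracked/GroupAlone*/EndGroup write carries the decoder-grouping rule defined near the top of the file. As a minimal sketch, a hypothetical single-result instruction MYOP with a folded memory operand (MYOP is illustrative only and is not an instruction modelled in this file) would be entered as:

  def : InstRW<[WLat1LSU, RegReadAdv, FXa, LSU, NormalGr],
               (instregex "MYOP$")>;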
+//===----------------------------------------------------------------------===// +// Byte swaps +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat1, FXa, NormalGr], (instregex "LRV(G)?R$")>; +def : InstRW<[WLat1LSU, FXa, LSU, NormalGr], (instregex "LRV(G|H)?$")>; +def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "STRV(G|H)?$")>; +def : InstRW<[WLat30, MCD], (instregex "MVCIN$")>; + +//===----------------------------------------------------------------------===// +// Load address instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat1, FXa, NormalGr], (instregex "LA(Y|RL)?$")>; + +// Load the Global Offset Table address ( -> larl ) +def : InstRW<[WLat1, FXa, NormalGr], (instregex "GOT$")>; + +//===----------------------------------------------------------------------===// +// Absolute and Negation +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat1, WLat1, FXa, NormalGr], (instregex "LP(G)?R$")>; +def : InstRW<[WLat2, WLat2, FXa2, Cracked], (instregex "L(N|P)GFR$")>; +def : InstRW<[WLat1, WLat1, FXa, NormalGr], (instregex "LN(R|GR)$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "LC(R|GR)$")>; +def : InstRW<[WLat2, WLat2, FXa2, Cracked], (instregex "LCGFR$")>; + +//===----------------------------------------------------------------------===// +// Insertion +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat1LSU, RegReadAdv, FXa, LSU, NormalGr], (instregex "IC(Y)?$")>; +def : InstRW<[WLat1LSU, RegReadAdv, FXa, LSU, NormalGr], + (instregex "IC32(Y)?$")>; +def : InstRW<[WLat1LSU, RegReadAdv, WLat1LSU, FXa, LSU, NormalGr], + (instregex "ICM(H|Y)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "II(F|H|L)Mux$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "IIHF(64)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "IIHH(64)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "IIHL(64)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "IILF(64)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "IILH(64)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "IILL(64)?$")>; + +//===----------------------------------------------------------------------===// +// Addition +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat1LSU, WLat1LSU, RegReadAdv, FXa, LSU, NormalGr], + (instregex "A(Y)?$")>; +def : InstRW<[WLat2LSU, WLat2LSU, RegReadAdv, FXa, LSU, NormalGr], + (instregex "AH(Y)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "AIH$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "AFI(Mux)?$")>; +def : InstRW<[WLat1LSU, WLat1LSU, RegReadAdv, FXa, LSU, NormalGr], + (instregex "AG$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "AGFI$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "AGHI(K)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "AGR(K)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "AHI(K)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "AHIMux(K)?$")>; +def : InstRW<[WLat1LSU, WLat1LSU, RegReadAdv, FXa, LSU, NormalGr], + (instregex "AL(Y)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "AL(FI|HSIK)$")>; +def : InstRW<[WLat1LSU, WLat1LSU, RegReadAdv, FXa, LSU, NormalGr], + (instregex "ALG(F)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "ALGHSIK$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "ALGF(I|R)$")>; +def : 
InstRW<[WLat1, FXa, NormalGr], (instregex "ALGR(K)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "ALR(K)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "AR(K)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "A(L)?HHHR$")>; +def : InstRW<[WLat2, WLat2, FXa, NormalGr], (instregex "A(L)?HHLR$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "ALSIH(N)?$")>; +def : InstRW<[WLat2LSU, FXb, LSU, NormalGr], (instregex "A(L)?(G)?SI$")>; + +// Logical addition with carry +def : InstRW<[WLat2LSU, WLat2LSU, RegReadAdv, FXa, LSU, GroupAlone], + (instregex "ALC(G)?$")>; +def : InstRW<[WLat2, WLat2, FXa, GroupAlone], (instregex "ALC(G)?R$")>; + +// Add with sign extension (16/32 -> 64) +def : InstRW<[WLat2LSU, WLat2LSU, RegReadAdv, FXa, LSU, NormalGr], + (instregex "AG(F|H)$")>; +def : InstRW<[WLat2, WLat2, FXa, NormalGr], (instregex "AGFR$")>; + +//===----------------------------------------------------------------------===// +// Subtraction +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat1LSU, WLat1LSU, RegReadAdv, FXa, LSU, NormalGr], + (instregex "S(G|Y)?$")>; +def : InstRW<[WLat2LSU, WLat2LSU, RegReadAdv, FXa, LSU, NormalGr], + (instregex "SH(Y)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "SGR(K)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "SLFI$")>; +def : InstRW<[WLat1LSU, WLat1LSU, RegReadAdv, FXa, LSU, NormalGr], + (instregex "SL(G|GF|Y)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "SLGF(I|R)$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "SLGR(K)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "SLR(K)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "SR(K)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "S(L)?HHHR$")>; +def : InstRW<[WLat2, WLat2, FXa, NormalGr], (instregex "S(L)?HHLR$")>; + +// Subtraction with borrow +def : InstRW<[WLat2LSU, WLat2LSU, RegReadAdv, FXa, LSU, GroupAlone], + (instregex "SLB(G)?$")>; +def : InstRW<[WLat2, WLat2, FXa, GroupAlone], (instregex "SLB(G)?R$")>; + +// Subtraction with sign extension (16/32 -> 64) +def : InstRW<[WLat2LSU, WLat2LSU, RegReadAdv, FXa, LSU, NormalGr], + (instregex "SG(F|H)$")>; +def : InstRW<[WLat2, WLat2, FXa, NormalGr], (instregex "SGFR$")>; + +//===----------------------------------------------------------------------===// +// AND +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat1LSU, WLat1LSU, RegReadAdv, FXa, LSU, NormalGr], + (instregex "N(G|Y)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "NGR(K)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "NI(FMux|HMux|LMux)$")>; +def : InstRW<[WLat2LSU, FXb, LSU, NormalGr], (instregex "NI(Y)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "NIHF(64)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "NIHH(64)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "NIHL(64)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "NILF(64)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "NILH(64)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "NILL(64)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "NR(K)?$")>; +def : InstRW<[WLat3LSU, LSU2, FXb, Cracked], (instregex "NC$")>; + +//===----------------------------------------------------------------------===// +// OR +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat1LSU, WLat1LSU, RegReadAdv, FXa, LSU, NormalGr], + (instregex "O(G|Y)?$")>; +def : 
InstRW<[WLat1, FXa, NormalGr], (instregex "OGR(K)?$")>; +def : InstRW<[WLat2LSU, FXb, LSU, NormalGr], (instregex "OI(Y)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "OI(FMux|HMux|LMux)$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "OIHF(64)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "OIHH(64)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "OIHL(64)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "OILF(64)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "OILH(64)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "OILL(64)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "OR(K)?$")>; +def : InstRW<[WLat3LSU, LSU2, FXb, Cracked], (instregex "OC$")>; + +//===----------------------------------------------------------------------===// +// XOR +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat1LSU, WLat1LSU, RegReadAdv, FXa, LSU, NormalGr], + (instregex "X(G|Y)?$")>; +def : InstRW<[WLat2LSU, FXb, LSU, NormalGr], (instregex "XI(Y)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "XIFMux$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "XGR(K)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "XIHF(64)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "XILF(64)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "XR(K)?$")>; +def : InstRW<[WLat3LSU, LSU2, FXb, Cracked], (instregex "XC$")>; + +//===----------------------------------------------------------------------===// +// Combined logical operations +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat1, FXa, NormalGr], (instregex "NC(G)?RK$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "OC(G)?RK$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "NN(G)?RK$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "NO(G)?RK$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "NX(G)?RK$")>; + +//===----------------------------------------------------------------------===// +// Multiplication +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat5LSU, RegReadAdv, FXa, LSU, NormalGr], + (instregex "MS(GF|Y)?$")>; +def : InstRW<[WLat5, FXa, NormalGr], (instregex "MS(R|FI)$")>; +def : InstRW<[WLat7LSU, RegReadAdv, FXa, LSU, NormalGr], (instregex "MSG$")>; +def : InstRW<[WLat7, FXa, NormalGr], (instregex "MSGR$")>; +def : InstRW<[WLat5, FXa, NormalGr], (instregex "MSGF(I|R)$")>; +def : InstRW<[WLat8LSU, RegReadAdv, FXa2, LSU, GroupAlone], (instregex "MLG$")>; +def : InstRW<[WLat8, FXa2, GroupAlone], (instregex "MLGR$")>; +def : InstRW<[WLat4, FXa, NormalGr], (instregex "MGHI$")>; +def : InstRW<[WLat4, FXa, NormalGr], (instregex "MHI$")>; +def : InstRW<[WLat4LSU, RegReadAdv, FXa, LSU, NormalGr], (instregex "MH(Y)?$")>; +def : InstRW<[WLat6, FXa2, GroupAlone], (instregex "M(L)?R$")>; +def : InstRW<[WLat6LSU, RegReadAdv, FXa2, LSU, GroupAlone], + (instregex "M(FY|L)?$")>; +def : InstRW<[WLat8, RegReadAdv, FXa, LSU, NormalGr], (instregex "MGH$")>; +def : InstRW<[WLat12, RegReadAdv, FXa2, LSU, GroupAlone], (instregex "MG$")>; +def : InstRW<[WLat8, FXa2, GroupAlone], (instregex "MGRK$")>; +def : InstRW<[WLat6LSU, WLat6LSU, RegReadAdv, FXa, LSU, NormalGr], + (instregex "MSC$")>; +def : InstRW<[WLat8LSU, WLat8LSU, RegReadAdv, FXa, LSU, NormalGr], + (instregex "MSGC$")>; +def : InstRW<[WLat6, WLat6, FXa, NormalGr], (instregex "MSRKC$")>; +def : InstRW<[WLat8, WLat8, FXa, NormalGr], (instregex "MSGRKC$")>; + 
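[Editorial note, not part of the upstream patch] The multiply entries above also show how the two latency families are meant to combine: the register form MSGR is tagged WLat7, while the memory form MSG is tagged WLat7LSU. Given the comment near the top of this file that LSULatency (4 cycles on this model) is the value "added to the result of loads and instructions with folded memory operands", WLat7LSU should amount to roughly 7 + 4 = 11 cycles to the 64-bit product. The WLat*LSU writes themselves are declared in the shared SystemZSchedule.td rather than in this patch; presumably they simply chain the corresponding WLat* write with LSULatency, along the lines of:

  def WLat7LSU : WriteSequence<[WLat7, LSULatency]>;

(the exact spelling of that definition is an assumption, not taken from this patch).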
+//===----------------------------------------------------------------------===// +// Division and remainder +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat20, FXa4, GroupAlone], (instregex "DR$")>; +def : InstRW<[WLat30, RegReadAdv, FXa4, LSU, GroupAlone2], (instregex "D$")>; +def : InstRW<[WLat30, FXa2, GroupAlone], (instregex "DSG(F)?R$")>; +def : InstRW<[WLat30, RegReadAdv, FXa2, LSU, GroupAlone2], + (instregex "DSG(F)?$")>; +def : InstRW<[WLat20, FXa4, GroupAlone], (instregex "DLR$")>; +def : InstRW<[WLat30, FXa4, GroupAlone], (instregex "DLGR$")>; +def : InstRW<[WLat30, RegReadAdv, FXa4, LSU, GroupAlone2], + (instregex "DL(G)?$")>; + +//===----------------------------------------------------------------------===// +// Shifts +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat1, FXa, NormalGr], (instregex "SLL(G|K)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "SRL(G|K)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "SRA(G|K)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "SLA(G|K)?$")>; +def : InstRW<[WLat5LSU, WLat5LSU, FXa4, LSU, GroupAlone2], + (instregex "S(L|R)D(A|L)$")>; + +// Rotate +def : InstRW<[WLat2LSU, FXa, LSU, NormalGr], (instregex "RLL(G)?$")>; + +// Rotate and insert +def : InstRW<[WLat1, FXa, NormalGr], (instregex "RISBG(N|32)?$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "RISBH(G|H|L)$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "RISBL(G|H|L)$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "RISBMux$")>; + +// Rotate and Select +def : InstRW<[WLat2, WLat2, FXa2, Cracked], (instregex "R(N|O|X)SBG$")>; + +//===----------------------------------------------------------------------===// +// Comparison +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat1LSU, RegReadAdv, FXb, LSU, NormalGr], + (instregex "C(G|Y|Mux)?$")>; +def : InstRW<[WLat1LSU, FXb, LSU, NormalGr], (instregex "CRL$")>; +def : InstRW<[WLat1, FXb, NormalGr], (instregex "C(F|H)I(Mux)?$")>; +def : InstRW<[WLat1, FXb, NormalGr], (instregex "CG(F|H)I$")>; +def : InstRW<[WLat1LSU, FXb, LSU, NormalGr], (instregex "CG(HSI|RL)$")>; +def : InstRW<[WLat1, FXb, NormalGr], (instregex "C(G)?R$")>; +def : InstRW<[WLat1, FXb, NormalGr], (instregex "CIH$")>; +def : InstRW<[WLat1LSU, RegReadAdv, FXb, LSU, NormalGr], (instregex "CHF$")>; +def : InstRW<[WLat1LSU, FXb, LSU, NormalGr], (instregex "CHSI$")>; +def : InstRW<[WLat1LSU, RegReadAdv, FXb, LSU, NormalGr], + (instregex "CL(Y|Mux)?$")>; +def : InstRW<[WLat1LSU, FXb, LSU, NormalGr], (instregex "CLFHSI$")>; +def : InstRW<[WLat1, FXb, NormalGr], (instregex "CLFI(Mux)?$")>; +def : InstRW<[WLat1LSU, RegReadAdv, FXb, LSU, NormalGr], (instregex "CLG$")>; +def : InstRW<[WLat1LSU, FXb, LSU, NormalGr], (instregex "CLG(HRL|HSI)$")>; +def : InstRW<[WLat1LSU, RegReadAdv, FXb, LSU, NormalGr], (instregex "CLGF$")>; +def : InstRW<[WLat1LSU, FXb, LSU, NormalGr], (instregex "CLGFRL$")>; +def : InstRW<[WLat1, FXb, NormalGr], (instregex "CLGF(I|R)$")>; +def : InstRW<[WLat1, FXb, NormalGr], (instregex "CLGR$")>; +def : InstRW<[WLat1LSU, FXb, LSU, NormalGr], (instregex "CLGRL$")>; +def : InstRW<[WLat1LSU, RegReadAdv, FXb, LSU, NormalGr], (instregex "CLHF$")>; +def : InstRW<[WLat1LSU, FXb, LSU, NormalGr], (instregex "CLH(RL|HSI)$")>; +def : InstRW<[WLat1, FXb, NormalGr], (instregex "CLIH$")>; +def : InstRW<[WLat1LSU, FXb, LSU, NormalGr], (instregex "CLI(Y)?$")>; +def : InstRW<[WLat1, 
FXb, NormalGr], (instregex "CLR$")>; +def : InstRW<[WLat1LSU, FXb, LSU, NormalGr], (instregex "CLRL$")>; +def : InstRW<[WLat1, FXb, NormalGr], (instregex "C(L)?HHR$")>; +def : InstRW<[WLat2, FXb, NormalGr], (instregex "C(L)?HLR$")>; + +// Compare halfword +def : InstRW<[WLat2LSU, RegReadAdv, FXb, LSU, NormalGr], (instregex "CH(Y)?$")>; +def : InstRW<[WLat2LSU, FXb, LSU, NormalGr], (instregex "CHRL$")>; +def : InstRW<[WLat2LSU, RegReadAdv, FXb, LSU, NormalGr], (instregex "CGH$")>; +def : InstRW<[WLat2LSU, FXb, LSU, NormalGr], (instregex "CGHRL$")>; +def : InstRW<[WLat2LSU, FXa, FXb, LSU, Cracked], (instregex "CHHSI$")>; + +// Compare with sign extension (32 -> 64) +def : InstRW<[WLat2LSU, RegReadAdv, FXb, LSU, NormalGr], (instregex "CGF$")>; +def : InstRW<[WLat2LSU, FXb, LSU, NormalGr], (instregex "CGFRL$")>; +def : InstRW<[WLat2, FXb, NormalGr], (instregex "CGFR$")>; + +// Compare logical character +def : InstRW<[WLat6, FXb, LSU2, Cracked], (instregex "CLC$")>; +def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "CLCL(E|U)?$")>; +def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "CLST$")>; + +// Test under mask +def : InstRW<[WLat1LSU, FXb, LSU, NormalGr], (instregex "TM(Y)?$")>; +def : InstRW<[WLat1, FXb, NormalGr], (instregex "TM(H|L)Mux$")>; +def : InstRW<[WLat1, FXb, NormalGr], (instregex "TMHH(64)?$")>; +def : InstRW<[WLat1, FXb, NormalGr], (instregex "TMHL(64)?$")>; +def : InstRW<[WLat1, FXb, NormalGr], (instregex "TMLH(64)?$")>; +def : InstRW<[WLat1, FXb, NormalGr], (instregex "TMLL(64)?$")>; + +// Compare logical characters under mask +def : InstRW<[WLat2LSU, RegReadAdv, FXb, LSU, NormalGr], + (instregex "CLM(H|Y)?$")>; + +//===----------------------------------------------------------------------===// +// Prefetch and execution hint +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat1, LSU, NormalGr], (instregex "PFD(RL)?$")>; +def : InstRW<[WLat1, FXb, NormalGr], (instregex "BPP$")>; +def : InstRW<[FXb, EndGroup], (instregex "BPRP$")>; +def : InstRW<[WLat1, FXb, NormalGr], (instregex "NIAI$")>; + +//===----------------------------------------------------------------------===// +// Atomic operations +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat1, FXb, EndGroup], (instregex "Serialize$")>; + +def : InstRW<[WLat2LSU, WLat2LSU, FXb, LSU, NormalGr], (instregex "LAA(G)?$")>; +def : InstRW<[WLat2LSU, WLat2LSU, FXb, LSU, NormalGr], (instregex "LAAL(G)?$")>; +def : InstRW<[WLat2LSU, WLat2LSU, FXb, LSU, NormalGr], (instregex "LAN(G)?$")>; +def : InstRW<[WLat2LSU, WLat2LSU, FXb, LSU, NormalGr], (instregex "LAO(G)?$")>; +def : InstRW<[WLat2LSU, WLat2LSU, FXb, LSU, NormalGr], (instregex "LAX(G)?$")>; + +// Test and set +def : InstRW<[WLat2LSU, FXb, LSU, EndGroup], (instregex "TS$")>; + +// Compare and swap +def : InstRW<[WLat3LSU, WLat3LSU, FXa, FXb, LSU, GroupAlone], + (instregex "CS(G|Y)?$")>; + +// Compare double and swap +def : InstRW<[WLat6LSU, WLat6LSU, FXa3, FXb2, LSU, GroupAlone2], + (instregex "CDS(Y)?$")>; +def : InstRW<[WLat15, WLat15, FXa2, FXb4, LSU3, + GroupAlone3], (instregex "CDSG$")>; + +// Compare and swap and store +def : InstRW<[WLat30, MCD], (instregex "CSST$")>; + +// Perform locked operation +def : InstRW<[WLat30, MCD], (instregex "PLO$")>; + +// Load/store pair from/to quadword +def : InstRW<[WLat4LSU, LSU2, GroupAlone], (instregex "LPQ$")>; +def : InstRW<[WLat1, FXb2, LSU, GroupAlone], (instregex "STPQ$")>; + +// Load pair disjoint +def : 
InstRW<[WLat1LSU, WLat1LSU, LSU2, GroupAlone], (instregex "LPD(G)?$")>; + +//===----------------------------------------------------------------------===// +// Translate and convert +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat1, LSU5, GroupAlone], (instregex "TR$")>; +def : InstRW<[WLat30, WLat30, WLat30, FXa3, LSU2, GroupAlone2], + (instregex "TRT$")>; +def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "TRTR$")>; +def : InstRW<[WLat30, WLat30, MCD], (instregex "TRE$")>; +def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "TRT(R)?E(Opt)?$")>; +def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "TR(T|O)(T|O)(Opt)?$")>; +def : InstRW<[WLat30, WLat30, WLat30, MCD], + (instregex "CU(12|14|21|24|41|42)(Opt)?$")>; +def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "(CUUTF|CUTFU)(Opt)?$")>; + +//===----------------------------------------------------------------------===// +// Message-security assist +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat30, WLat30, WLat30, WLat30, MCD], + (instregex "KM(C|F|O|CTR|A)?$")>; +def : InstRW<[WLat30, WLat30, WLat30, MCD], + (instregex "(KIMD|KLMD|KMAC|KDSA)$")>; +def : InstRW<[WLat30, WLat30, WLat30, MCD], + (instregex "(PCC|PPNO|PRNO)$")>; + +//===----------------------------------------------------------------------===// +// Guarded storage +//===----------------------------------------------------------------------===// + +def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LGG$")>; +def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LLGFSG$")>; +def : InstRW<[WLat30, MCD], (instregex "(L|ST)GSC$")>; + +//===----------------------------------------------------------------------===// +// Decimal arithmetic +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat20, RegReadAdv, FXb, VecDF2, LSU2, GroupAlone2], + (instregex "CVBG$")>; +def : InstRW<[WLat20, RegReadAdv, FXb, VecDF, LSU, GroupAlone2], + (instregex "CVB(Y)?$")>; +def : InstRW<[WLat1, FXb3, VecDF4, LSU, GroupAlone3], (instregex "CVDG$")>; +def : InstRW<[WLat1, FXb2, VecDF, LSU, GroupAlone2], (instregex "CVD(Y)?$")>; +def : InstRW<[WLat1, LSU5, GroupAlone], (instregex "MV(N|O|Z)$")>; +def : InstRW<[WLat1, LSU5, GroupAlone], (instregex "(PACK|PKA|PKU)$")>; +def : InstRW<[WLat12, LSU5, GroupAlone], (instregex "UNPK(A|U)$")>; +def : InstRW<[WLat1, FXb, LSU2, Cracked], (instregex "UNPK$")>; + +def : InstRW<[WLat5LSU, FXb, VecDFX, LSU3, GroupAlone2], + (instregex "(A|S|ZA)P$")>; +def : InstRW<[WLat1, FXb, VecDFX2, LSU3, GroupAlone2], (instregex "MP$")>; +def : InstRW<[WLat1, FXb, VecDFX4, LSU3, GroupAlone2], (instregex "DP$")>; +def : InstRW<[WLat15, FXb, VecDFX2, LSU2, GroupAlone3], (instregex "SRP$")>; +def : InstRW<[WLat8, VecDFX, LSU, LSU, GroupAlone], (instregex "CP$")>; +def : InstRW<[WLat3LSU, VecDFX, LSU, Cracked], (instregex "TP$")>; +def : InstRW<[WLat30, MCD], (instregex "ED(MK)?$")>; + +//===----------------------------------------------------------------------===// +// Access registers +//===----------------------------------------------------------------------===// + +// Extract/set/copy access register +def : InstRW<[WLat3, LSU, NormalGr], (instregex "(EAR|SAR|CPYA)$")>; + +// Load address extended +def : InstRW<[WLat5, LSU, FXa, Cracked], (instregex "LAE(Y)?$")>; + +// Load/store access multiple (not modeled precisely) +def : InstRW<[WLat20, WLat20, LSU5, GroupAlone], (instregex "LAM(Y)?$")>; +def : 
InstRW<[WLat1, LSU5, FXb, GroupAlone2], (instregex "STAM(Y)?$")>; + +//===----------------------------------------------------------------------===// +// Program mask and addressing mode +//===----------------------------------------------------------------------===// + +// Insert Program Mask +def : InstRW<[WLat3, FXa, EndGroup], (instregex "IPM$")>; + +// Set Program Mask +def : InstRW<[WLat3, LSU, EndGroup], (instregex "SPM$")>; + +// Branch and link +def : InstRW<[WLat1, FXa2, FXb, GroupAlone], (instregex "BAL(R)?$")>; + +// Test addressing mode +def : InstRW<[WLat1, FXb, NormalGr], (instregex "TAM$")>; + +// Set addressing mode +def : InstRW<[WLat1, FXb, EndGroup], (instregex "SAM(24|31|64)$")>; + +// Branch (and save) and set mode. +def : InstRW<[WLat1, FXa, FXb, GroupAlone], (instregex "BSM$")>; +def : InstRW<[WLat1, FXa2, FXb, GroupAlone], (instregex "BASSM$")>; + +//===----------------------------------------------------------------------===// +// Transactional execution +//===----------------------------------------------------------------------===// + +// Transaction begin +def : InstRW<[WLat9, LSU2, FXb5, GroupAlone2], (instregex "TBEGIN(C)?$")>; + +// Transaction end +def : InstRW<[WLat1, FXb, GroupAlone], (instregex "TEND$")>; + +// Transaction abort +def : InstRW<[WLat30, MCD], (instregex "TABORT$")>; + +// Extract Transaction Nesting Depth +def : InstRW<[WLat1, FXa, NormalGr], (instregex "ETND$")>; + +// Nontransactional store +def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "NTSTG$")>; + +//===----------------------------------------------------------------------===// +// Processor assist +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat1, FXb, GroupAlone], (instregex "PPA$")>; + +//===----------------------------------------------------------------------===// +// Miscellaneous Instructions. +//===----------------------------------------------------------------------===// + +// Find leftmost one +def : InstRW<[WLat5, WLat5, FXa2, GroupAlone], (instregex "FLOGR$")>; + +// Population count +def : InstRW<[WLat3, WLat3, FXa, NormalGr], (instregex "POPCNT(Opt)?$")>; + +// String instructions +def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "SRST(U)?$")>; +def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "CUSE$")>; + +// Various complex instructions +def : InstRW<[WLat30, WLat30, WLat30, WLat30, MCD], (instregex "CFC$")>; +def : InstRW<[WLat30, WLat30, WLat30, WLat30, WLat30, WLat30, MCD], + (instregex "UPT$")>; +def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "CKSM$")>; +def : InstRW<[WLat30, WLat30, WLat30, WLat30, MCD], (instregex "CMPSC$")>; +def : InstRW<[WLat30, WLat30, WLat30, WLat30, MCD], (instregex "SORTL$")>; +def : InstRW<[WLat30, WLat30, WLat30, WLat30, MCD], (instregex "DFLTCC$")>; + +// Execute +def : InstRW<[WLat1, FXb, GroupAlone], (instregex "EX(RL)?$")>; + +//===----------------------------------------------------------------------===// +// .insn directive instructions +//===----------------------------------------------------------------------===// + +// An "empty" sched-class will be assigned instead of the "invalid sched-class". +// getNumDecoderSlots() will then return 1 instead of 0. 
+def : InstRW<[], (instregex "Insn.*")>; + + +// ----------------------------- Floating point ----------------------------- // + +//===----------------------------------------------------------------------===// +// FP: Move instructions +//===----------------------------------------------------------------------===// + +// Load zero +def : InstRW<[WLat1, FXb, NormalGr], (instregex "LZ(DR|ER)$")>; +def : InstRW<[WLat2, FXb2, Cracked], (instregex "LZXR$")>; + +// Load +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "LER$")>; +def : InstRW<[WLat1, FXb, NormalGr], (instregex "LD(R|R32|GR)$")>; +def : InstRW<[WLat3, FXb, NormalGr], (instregex "LGDR$")>; +def : InstRW<[WLat2, FXb2, GroupAlone], (instregex "LXR$")>; + +// Load and Test +def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "LT(E|D)BR$")>; +def : InstRW<[WLat3, VecXsPm, NormalGr], (instregex "LT(E|D)BRCompare$")>; +def : InstRW<[WLat10, WLat10, VecDF4, GroupAlone], + (instregex "LTXBR(Compare)?$")>; + +// Copy sign +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "CPSDR(d|s)(d|s)$")>; + +//===----------------------------------------------------------------------===// +// FP: Load instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat2LSU, VecXsPm, LSU, NormalGr], (instregex "LE(Y)?$")>; +def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LD(Y|E32)?$")>; +def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LX$")>; + +//===----------------------------------------------------------------------===// +// FP: Store instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "ST(E|D)(Y)?$")>; +def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "STX$")>; + +//===----------------------------------------------------------------------===// +// FP: Conversion instructions +//===----------------------------------------------------------------------===// + +// Load rounded +def : InstRW<[WLat6, VecBF, NormalGr], (instregex "LEDBR(A)?$")>; +def : InstRW<[WLat9, VecDF2, NormalGr], (instregex "L(E|D)XBR(A)?$")>; + +// Load lengthened +def : InstRW<[WLat6LSU, VecBF, LSU, NormalGr], (instregex "LDEB$")>; +def : InstRW<[WLat6, VecBF, NormalGr], (instregex "LDEBR$")>; +def : InstRW<[WLat7LSU, VecBF4, LSU, GroupAlone], (instregex "LX(E|D)B$")>; +def : InstRW<[WLat7, VecBF4, GroupAlone], (instregex "LX(E|D)BR$")>; + +// Convert from fixed / logical +def : InstRW<[WLat7, FXb, VecBF, Cracked], (instregex "C(E|D)(F|G)BR(A)?$")>; +def : InstRW<[WLat11, FXb, VecDF4, GroupAlone2], (instregex "CX(F|G)BR(A)?$")>; +def : InstRW<[WLat7, FXb, VecBF, Cracked], (instregex "C(E|D)L(F|G)BR$")>; +def : InstRW<[WLat11, FXb, VecDF4, GroupAlone2], (instregex "CXL(F|G)BR$")>; + +// Convert to fixed / logical +def : InstRW<[WLat9, WLat9, FXb, VecBF, Cracked], + (instregex "C(F|G)(E|D)BR(A)?$")>; +def : InstRW<[WLat12, WLat12, FXb, VecDF2, Cracked], + (instregex "C(F|G)XBR(A)?$")>; +def : InstRW<[WLat9, WLat9, FXb, VecBF, GroupAlone], (instregex "CLFEBR$")>; +def : InstRW<[WLat9, WLat9, FXb, VecBF, Cracked], (instregex "CLFDBR$")>; +def : InstRW<[WLat9, WLat9, FXb, VecBF, Cracked], (instregex "CLG(E|D)BR$")>; +def : InstRW<[WLat12, WLat12, FXb, VecDF2, Cracked], (instregex "CL(F|G)XBR$")>; + +//===----------------------------------------------------------------------===// +// FP: Unary arithmetic +//===----------------------------------------------------------------------===// + +// Load Complement / 
Negative / Positive +def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "L(C|N|P)(E|D)BR$")>; +def : InstRW<[WLat1, FXb, NormalGr], (instregex "L(C|N|P)DFR(_32)?$")>; +def : InstRW<[WLat10, WLat10, VecDF4, GroupAlone], (instregex "L(C|N|P)XBR$")>; + +// Square root +def : InstRW<[WLat30, VecFPd, LSU, NormalGr], (instregex "SQ(E|D)B$")>; +def : InstRW<[WLat30, VecFPd, NormalGr], (instregex "SQ(E|D)BR$")>; +def : InstRW<[WLat30, VecFPd, GroupAlone], (instregex "SQXBR$")>; + +// Load FP integer +def : InstRW<[WLat6, VecBF, NormalGr], (instregex "FI(E|D)BR(A)?$")>; +def : InstRW<[WLat10, VecDF4, GroupAlone], (instregex "FIXBR(A)?$")>; + +//===----------------------------------------------------------------------===// +// FP: Binary arithmetic +//===----------------------------------------------------------------------===// + +// Addition +def : InstRW<[WLat6LSU, WLat6LSU, RegReadAdv, VecBF, LSU, NormalGr], + (instregex "A(E|D)B$")>; +def : InstRW<[WLat6, WLat6, VecBF, NormalGr], (instregex "A(E|D)BR$")>; +def : InstRW<[WLat10, WLat10, VecDF4, GroupAlone], (instregex "AXBR$")>; + +// Subtraction +def : InstRW<[WLat6LSU, WLat6LSU, RegReadAdv, VecBF, LSU, NormalGr], + (instregex "S(E|D)B$")>; +def : InstRW<[WLat6, WLat6, VecBF, NormalGr], (instregex "S(E|D)BR$")>; +def : InstRW<[WLat10, WLat10, VecDF4, GroupAlone], (instregex "SXBR$")>; + +// Multiply +def : InstRW<[WLat6LSU, RegReadAdv, VecBF, LSU, NormalGr], + (instregex "M(D|DE|EE)B$")>; +def : InstRW<[WLat6, VecBF, NormalGr], (instregex "M(D|DE|EE)BR$")>; +def : InstRW<[WLat7LSU, RegReadAdv, VecBF4, LSU, GroupAlone], + (instregex "MXDB$")>; +def : InstRW<[WLat7, VecBF4, GroupAlone], (instregex "MXDBR$")>; +def : InstRW<[WLat15, VecDF4, GroupAlone], (instregex "MXBR$")>; + +// Multiply and add / subtract +def : InstRW<[WLat6LSU, RegReadAdv, RegReadAdv, VecBF2, LSU, GroupAlone], + (instregex "M(A|S)EB$")>; +def : InstRW<[WLat6, VecBF, GroupAlone], (instregex "M(A|S)EBR$")>; +def : InstRW<[WLat6LSU, RegReadAdv, RegReadAdv, VecBF2, LSU, GroupAlone], + (instregex "M(A|S)DB$")>; +def : InstRW<[WLat6, VecBF, NormalGr], (instregex "M(A|S)DBR$")>; + +// Division +def : InstRW<[WLat30, RegReadAdv, VecFPd, LSU, NormalGr], + (instregex "D(E|D)B$")>; +def : InstRW<[WLat30, VecFPd, NormalGr], (instregex "D(E|D)BR$")>; +def : InstRW<[WLat30, VecFPd, GroupAlone], (instregex "DXBR$")>; + +// Divide to integer +def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "DI(E|D)BR$")>; + +//===----------------------------------------------------------------------===// +// FP: Comparisons +//===----------------------------------------------------------------------===// + +// Compare +def : InstRW<[WLat3LSU, RegReadAdv, VecXsPm, LSU, NormalGr], + (instregex "(K|C)(E|D)B$")>; +def : InstRW<[WLat3, VecXsPm, NormalGr], (instregex "(K|C)(E|D)BR$")>; +def : InstRW<[WLat9, VecDF2, GroupAlone], (instregex "(K|C)XBR$")>; + +// Test Data Class +def : InstRW<[WLat5, LSU, VecXsPm, NormalGr], (instregex "TC(E|D)B$")>; +def : InstRW<[WLat10, LSU, VecDF4, GroupAlone], (instregex "TCXB$")>; + +//===----------------------------------------------------------------------===// +// FP: Floating-point control register instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat4, FXa, LSU, GroupAlone], (instregex "EFPC$")>; +def : InstRW<[WLat1, FXb, LSU, GroupAlone], (instregex "STFPC$")>; +def : InstRW<[WLat3, LSU, GroupAlone], (instregex "SFPC$")>; +def : InstRW<[WLat3LSU, LSU2, GroupAlone], (instregex "LFPC$")>; +def : 
InstRW<[WLat30, MCD], (instregex "SFASR$")>; +def : InstRW<[WLat30, MCD], (instregex "LFAS$")>; +def : InstRW<[WLat3, FXb, GroupAlone], (instregex "SRNM(B|T)?$")>; + + +// --------------------- Hexadecimal floating point ------------------------- // + +//===----------------------------------------------------------------------===// +// HFP: Move instructions +//===----------------------------------------------------------------------===// + +// Load and Test +def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "LT(E|D)R$")>; +def : InstRW<[WLat10, WLat10, VecDF4, GroupAlone], (instregex "LTXR$")>; + +//===----------------------------------------------------------------------===// +// HFP: Conversion instructions +//===----------------------------------------------------------------------===// + +// Load rounded +def : InstRW<[WLat6, VecBF, NormalGr], (instregex "(LEDR|LRER)$")>; +def : InstRW<[WLat6, VecBF, NormalGr], (instregex "LEXR$")>; +def : InstRW<[WLat9, VecDF2, NormalGr], (instregex "(LDXR|LRDR)$")>; + +// Load lengthened +def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LDE$")>; +def : InstRW<[WLat1, FXb, NormalGr], (instregex "LDER$")>; +def : InstRW<[WLat7LSU, VecBF4, LSU, GroupAlone], (instregex "LX(E|D)$")>; +def : InstRW<[WLat7, VecBF4, GroupAlone], (instregex "LX(E|D)R$")>; + +// Convert from fixed +def : InstRW<[WLat7, FXb, VecBF, Cracked], (instregex "C(E|D)(F|G)R$")>; +def : InstRW<[WLat11, FXb, VecDF4, GroupAlone2], (instregex "CX(F|G)R$")>; + +// Convert to fixed +def : InstRW<[WLat9, WLat9, FXb, VecBF, Cracked], (instregex "C(F|G)(E|D)R$")>; +def : InstRW<[WLat12, WLat12, FXb, VecDF2, Cracked], (instregex "C(F|G)XR$")>; + +// Convert BFP to HFP / HFP to BFP. +def : InstRW<[WLat6, WLat6, VecBF, NormalGr], (instregex "THD(E)?R$")>; +def : InstRW<[WLat6, WLat6, VecBF, NormalGr], (instregex "TB(E)?DR$")>; + +//===----------------------------------------------------------------------===// +// HFP: Unary arithmetic +//===----------------------------------------------------------------------===// + +// Load Complement / Negative / Positive +def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "L(C|N|P)(E|D)R$")>; +def : InstRW<[WLat10, WLat10, VecDF4, GroupAlone], (instregex "L(C|N|P)XR$")>; + +// Halve +def : InstRW<[WLat6, VecBF, NormalGr], (instregex "H(E|D)R$")>; + +// Square root +def : InstRW<[WLat30, VecFPd, LSU, NormalGr], (instregex "SQ(E|D)$")>; +def : InstRW<[WLat30, VecFPd, NormalGr], (instregex "SQ(E|D)R$")>; +def : InstRW<[WLat30, VecFPd, GroupAlone], (instregex "SQXR$")>; + +// Load FP integer +def : InstRW<[WLat6, VecBF, NormalGr], (instregex "FI(E|D)R$")>; +def : InstRW<[WLat10, VecDF4, GroupAlone], (instregex "FIXR$")>; + +//===----------------------------------------------------------------------===// +// HFP: Binary arithmetic +//===----------------------------------------------------------------------===// + +// Addition +def : InstRW<[WLat6LSU, WLat6LSU, RegReadAdv, VecBF, LSU, NormalGr], + (instregex "A(E|D|U|W)$")>; +def : InstRW<[WLat6, WLat6, VecBF, NormalGr], (instregex "A(E|D|U|W)R$")>; +def : InstRW<[WLat10, WLat10, VecDF4, GroupAlone], (instregex "AXR$")>; + +// Subtraction +def : InstRW<[WLat6LSU, WLat6LSU, RegReadAdv, VecBF, LSU, NormalGr], + (instregex "S(E|D|U|W)$")>; +def : InstRW<[WLat6, WLat6, VecBF, NormalGr], (instregex "S(E|D|U|W)R$")>; +def : InstRW<[WLat10, WLat10, VecDF4, GroupAlone], (instregex "SXR$")>; + +// Multiply +def : InstRW<[WLat6LSU, RegReadAdv, VecBF, LSU, NormalGr], + (instregex "M(D|DE|E|EE)$")>; +def 
: InstRW<[WLat6, VecBF, NormalGr], (instregex "M(D|DE|E|EE)R$")>; +def : InstRW<[WLat7LSU, RegReadAdv, VecBF4, LSU, GroupAlone], + (instregex "MXD$")>; +def : InstRW<[WLat7, VecBF4, GroupAlone], (instregex "MXDR$")>; +def : InstRW<[WLat30, VecDF4, GroupAlone], (instregex "MXR$")>; +def : InstRW<[WLat7LSU, RegReadAdv, VecBF4, LSU, GroupAlone], (instregex "MY$")>; +def : InstRW<[WLat6LSU, RegReadAdv, VecBF2, LSU, GroupAlone], + (instregex "MY(H|L)$")>; +def : InstRW<[WLat7, VecBF4, GroupAlone], (instregex "MYR$")>; +def : InstRW<[WLat6, VecBF, GroupAlone], (instregex "MY(H|L)R$")>; + +// Multiply and add / subtract +def : InstRW<[WLat6LSU, RegReadAdv, RegReadAdv, VecBF2, LSU, GroupAlone], + (instregex "M(A|S)(E|D)$")>; +def : InstRW<[WLat6, VecBF, GroupAlone], (instregex "M(A|S)(E|D)R$")>; +def : InstRW<[WLat7LSU, RegReadAdv, RegReadAdv, VecBF4, LSU, GroupAlone], + (instregex "MAY$")>; +def : InstRW<[WLat6LSU, RegReadAdv, RegReadAdv, VecBF2, LSU, GroupAlone], + (instregex "MAY(H|L)$")>; +def : InstRW<[WLat7, VecBF4, GroupAlone], (instregex "MAYR$")>; +def : InstRW<[WLat6, VecBF, GroupAlone], (instregex "MAY(H|L)R$")>; + +// Division +def : InstRW<[WLat30, RegReadAdv, VecFPd, LSU, NormalGr], (instregex "D(E|D)$")>; +def : InstRW<[WLat30, VecFPd, NormalGr], (instregex "D(E|D)R$")>; +def : InstRW<[WLat30, VecFPd, GroupAlone], (instregex "DXR$")>; + +//===----------------------------------------------------------------------===// +// HFP: Comparisons +//===----------------------------------------------------------------------===// + +// Compare +def : InstRW<[WLat6LSU, RegReadAdv, VecBF, LSU, NormalGr], + (instregex "C(E|D)$")>; +def : InstRW<[WLat6, VecBF, NormalGr], (instregex "C(E|D)R$")>; +def : InstRW<[WLat10, VecDF2, GroupAlone], (instregex "CXR$")>; + + +// ------------------------ Decimal floating point -------------------------- // + +//===----------------------------------------------------------------------===// +// DFP: Move instructions +//===----------------------------------------------------------------------===// + +// Load and Test +def : InstRW<[WLat8, WLat8, VecDF, NormalGr], (instregex "LTDTR$")>; +def : InstRW<[WLat10, WLat10, VecDF4, GroupAlone], (instregex "LTXTR$")>; + +//===----------------------------------------------------------------------===// +// DFP: Conversion instructions +//===----------------------------------------------------------------------===// + +// Load rounded +def : InstRW<[WLat15, VecDF, NormalGr], (instregex "LEDTR$")>; +def : InstRW<[WLat15, VecDF2, NormalGr], (instregex "LDXTR$")>; + +// Load lengthened +def : InstRW<[WLat8, VecDF, NormalGr], (instregex "LDETR$")>; +def : InstRW<[WLat10, VecDF4, GroupAlone], (instregex "LXDTR$")>; + +// Convert from fixed / logical +def : InstRW<[WLat15, FXb, VecDF, Cracked], (instregex "CDFTR(A)?$")>; +def : InstRW<[WLat30, FXb, VecDF, Cracked], (instregex "CDGTR(A)?$")>; +def : InstRW<[WLat15, FXb, VecDF4, GroupAlone2], (instregex "CXFTR(A)?$")>; +def : InstRW<[WLat30, FXb, VecDF4, GroupAlone2], (instregex "CXGTR(A)?$")>; +def : InstRW<[WLat15, FXb, VecDF, Cracked], (instregex "CDLFTR$")>; +def : InstRW<[WLat30, FXb, VecDF, Cracked], (instregex "CDLGTR$")>; +def : InstRW<[WLat15, FXb, VecDF4, GroupAlone2], (instregex "CXLFTR$")>; +def : InstRW<[WLat30, FXb, VecDF4, GroupAlone2], (instregex "CXLGTR$")>; + +// Convert to fixed / logical +def : InstRW<[WLat30, WLat30, FXb, VecDF, Cracked], + (instregex "C(F|G)DTR(A)?$")>; +def : InstRW<[WLat30, WLat30, FXb, VecDF2, Cracked], + (instregex "C(F|G)XTR(A)?$")>; +def 
: InstRW<[WLat30, WLat30, FXb, VecDF, Cracked], (instregex "CL(F|G)DTR$")>; +def : InstRW<[WLat30, WLat30, FXb, VecDF2, Cracked], (instregex "CL(F|G)XTR$")>; + +// Convert from / to signed / unsigned packed +def : InstRW<[WLat9, FXb, VecDF, Cracked], (instregex "CD(S|U)TR$")>; +def : InstRW<[WLat12, FXb2, VecDF4, GroupAlone2], (instregex "CX(S|U)TR$")>; +def : InstRW<[WLat11, FXb, VecDF, Cracked], (instregex "C(S|U)DTR$")>; +def : InstRW<[WLat15, FXb2, VecDF4, GroupAlone2], (instregex "C(S|U)XTR$")>; + +// Convert from / to zoned +def : InstRW<[WLat8LSU, LSU, VecDF, Cracked], (instregex "CDZT$")>; +def : InstRW<[WLat16LSU, LSU2, VecDF4, GroupAlone3], (instregex "CXZT$")>; +def : InstRW<[WLat1, FXb, LSU, VecDF, Cracked], (instregex "CZDT$")>; +def : InstRW<[WLat1, FXb, LSU, VecDF2, GroupAlone], (instregex "CZXT$")>; + +// Convert from / to packed +def : InstRW<[WLat8LSU, LSU, VecDF, Cracked], (instregex "CDPT$")>; +def : InstRW<[WLat16LSU, LSU2, VecDF4, GroupAlone3], (instregex "CXPT$")>; +def : InstRW<[WLat1, FXb, LSU, VecDF, Cracked], (instregex "CPDT$")>; +def : InstRW<[WLat1, FXb, LSU, VecDF2, GroupAlone], (instregex "CPXT$")>; + +// Perform floating-point operation +def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "PFPO$")>; + +//===----------------------------------------------------------------------===// +// DFP: Unary arithmetic +//===----------------------------------------------------------------------===// + +// Load FP integer +def : InstRW<[WLat8, VecDF, NormalGr], (instregex "FIDTR$")>; +def : InstRW<[WLat10, VecDF4, GroupAlone], (instregex "FIXTR$")>; + +// Extract biased exponent +def : InstRW<[WLat11, FXb, VecDF, Cracked], (instregex "EEDTR$")>; +def : InstRW<[WLat11, FXb, VecDF, Cracked], (instregex "EEXTR$")>; + +// Extract significance +def : InstRW<[WLat11, FXb, VecDF, Cracked], (instregex "ESDTR$")>; +def : InstRW<[WLat12, FXb, VecDF2, Cracked], (instregex "ESXTR$")>; + +//===----------------------------------------------------------------------===// +// DFP: Binary arithmetic +//===----------------------------------------------------------------------===// + +// Addition +def : InstRW<[WLat8, WLat8, VecDF, NormalGr], (instregex "ADTR(A)?$")>; +def : InstRW<[WLat10, WLat10, VecDF4, GroupAlone], (instregex "AXTR(A)?$")>; + +// Subtraction +def : InstRW<[WLat8, WLat8, VecDF, NormalGr], (instregex "SDTR(A)?$")>; +def : InstRW<[WLat10, WLat10, VecDF4, GroupAlone], (instregex "SXTR(A)?$")>; + +// Multiply +def : InstRW<[WLat30, VecDF, NormalGr], (instregex "MDTR(A)?$")>; +def : InstRW<[WLat30, VecDF4, GroupAlone], (instregex "MXTR(A)?$")>; + +// Division +def : InstRW<[WLat30, VecDF, NormalGr], (instregex "DDTR(A)?$")>; +def : InstRW<[WLat30, VecDF4, GroupAlone], (instregex "DXTR(A)?$")>; + +// Quantize +def : InstRW<[WLat8, WLat8, VecDF, NormalGr], (instregex "QADTR$")>; +def : InstRW<[WLat10, WLat10, VecDF4, GroupAlone], (instregex "QAXTR$")>; + +// Reround +def : InstRW<[WLat9, WLat9, FXb, VecDF, Cracked], (instregex "RRDTR$")>; +def : InstRW<[WLat11, WLat11, FXb, VecDF4, GroupAlone2], (instregex "RRXTR$")>; + +// Shift significand left/right +def : InstRW<[WLat11LSU, LSU, VecDF, GroupAlone], (instregex "S(L|R)DT$")>; +def : InstRW<[WLat11LSU, LSU, VecDF4, GroupAlone], (instregex "S(L|R)XT$")>; + +// Insert biased exponent +def : InstRW<[WLat9, FXb, VecDF, Cracked], (instregex "IEDTR$")>; +def : InstRW<[WLat11, FXb, VecDF4, GroupAlone2], (instregex "IEXTR$")>; + +//===----------------------------------------------------------------------===// +// DFP: Comparisons 
+//===----------------------------------------------------------------------===// + +// Compare +def : InstRW<[WLat8, VecDF, NormalGr], (instregex "(K|C)DTR$")>; +def : InstRW<[WLat9, VecDF2, GroupAlone], (instregex "(K|C)XTR$")>; + +// Compare biased exponent +def : InstRW<[WLat8, VecDF, NormalGr], (instregex "CEDTR$")>; +def : InstRW<[WLat8, VecDF, NormalGr], (instregex "CEXTR$")>; + +// Test Data Class/Group +def : InstRW<[WLat15, LSU, VecDF, NormalGr], (instregex "TD(C|G)(E|D)T$")>; +def : InstRW<[WLat15, LSU, VecDF2, GroupAlone], (instregex "TD(C|G)XT$")>; + + +// --------------------------------- Vector --------------------------------- // + +//===----------------------------------------------------------------------===// +// Vector: Move instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat1, FXb, NormalGr], (instregex "VLR(32|64)?$")>; +def : InstRW<[WLat3, FXb, NormalGr], (instregex "VLGV(B|F|G|H)?$")>; +def : InstRW<[WLat1, FXb, NormalGr], (instregex "VLVG(B|F|G|H)?$")>; +def : InstRW<[WLat3, FXb, NormalGr], (instregex "VLVGP(32)?$")>; + +//===----------------------------------------------------------------------===// +// Vector: Immediate instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VZERO$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VONE$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VGBM$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VGM(B|F|G|H)?$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VREPI(B|F|G|H)?$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VLEI(B|F|G|H)$")>; + +//===----------------------------------------------------------------------===// +// Vector: Loads +//===----------------------------------------------------------------------===// + +def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VL(Align)?$")>; +def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VL(L|BB)$")>; +def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VL(32|64)$")>; +def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VLLEZ(B|F|G|H|LF)?$")>; +def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VLREP(B|F|G|H)?$")>; +def : InstRW<[WLat2LSU, RegReadAdv, VecXsPm, LSU, NormalGr], + (instregex "VLE(B|F|G|H)$")>; +def : InstRW<[WLat5LSU, RegReadAdv, FXb, LSU, VecXsPm, Cracked], + (instregex "VGE(F|G)$")>; +def : InstRW<[WLat4LSU, WLat4LSU, LSU5, GroupAlone], + (instregex "VLM(Align)?$")>; +def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VLRL(R)?$")>; + +//===----------------------------------------------------------------------===// +// Vector: Stores +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "VST(Align|L|32|64)?$")>; +def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "VSTE(F|G)$")>; +def : InstRW<[WLat1, FXb, LSU, VecXsPm, Cracked], (instregex "VSTE(B|H)$")>; +def : InstRW<[WLat1, LSU2, FXb3, GroupAlone2], (instregex "VSTM(Align)?$")>; +def : InstRW<[WLat1, FXb2, LSU, Cracked], (instregex "VSCE(F|G)$")>; +def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "VSTRL(R)?$")>; + +//===----------------------------------------------------------------------===// +// Vector: Byte swaps +//===----------------------------------------------------------------------===// + +def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VLBR(H|F|G|Q)?$")>; +def : 
InstRW<[LSULatency, LSU, NormalGr], (instregex "VLER(H|F|G)?$")>; +def : InstRW<[WLat2LSU, RegReadAdv, VecXsPm, LSU, NormalGr], + (instregex "VLEBR(H|F|G)$")>; +def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VLLEBRZ(H|F|G|E)?$")>; +def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VLBRREP(H|F|G)?$")>; +def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "VSTBR(H|F|G|Q)?$")>; +def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "VSTER(H|F|G)?$")>; +def : InstRW<[WLat1, FXb, LSU, VecXsPm, Cracked], (instregex "VSTEBRH$")>; +def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "VSTEBR(F|G)$")>; + +//===----------------------------------------------------------------------===// +// Vector: Selects and permutes +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VMRH(B|F|G|H)?$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VMRL(B|F|G|H)?$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VPERM$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VPDI$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VBPERM$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VREP(B|F|G|H)?$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VSEL$")>; + +//===----------------------------------------------------------------------===// +// Vector: Widening and narrowing +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VPK(F|G|H)?$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VPKS(F|G|H)?$")>; +def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "VPKS(F|G|H)S$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VPKLS(F|G|H)?$")>; +def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "VPKLS(F|G|H)S$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VSEG(B|F|H)?$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VUPH(B|F|H)?$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VUPL(B|F)?$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VUPLH(B|F|H|W)?$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VUPLL(B|F|H)?$")>; + +//===----------------------------------------------------------------------===// +// Vector: Integer arithmetic +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VA(B|F|G|H|Q|C|CQ)?$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VACC(B|F|G|H|Q|C|CQ)?$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VAVG(B|F|G|H)?$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VAVGL(B|F|G|H)?$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VN(C|O|N|X)?$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VO(C)?$")>; +def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VCKSM$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VCLZ(B|F|G|H)?$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VCTZ(B|F|G|H)?$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VX$")>; +def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VGFM?$")>; +def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VGFMA(B|F|G|H)?$")>; +def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VGFM(B|F|G|H)$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VLC(B|F|G|H)?$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VLP(B|F|G|H)?$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], 
(instregex "VMX(B|F|G|H)?$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VMXL(B|F|G|H)?$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VMN(B|F|G|H)?$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VMNL(B|F|G|H)?$")>; +def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VMAL(B|F)?$")>; +def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VMALE(B|F|H)?$")>; +def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VMALH(B|F|H|W)?$")>; +def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VMALO(B|F|H)?$")>; +def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VMAO(B|F|H)?$")>; +def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VMAE(B|F|H)?$")>; +def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VMAH(B|F|H)?$")>; +def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VME(B|F|H)?$")>; +def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VMH(B|F|H)?$")>; +def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VML(B|F)?$")>; +def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VMLE(B|F|H)?$")>; +def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VMLH(B|F|H|W)?$")>; +def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VMLO(B|F|H)?$")>; +def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VMO(B|F|H)?$")>; +def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VMSL(G)?$")>; + +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VPOPCT(B|F|G|H)?$")>; + +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VERLL(B|F|G|H)?$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VERLLV(B|F|G|H)?$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VERIM(B|F|G|H)?$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VESL(B|F|G|H)?$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VESLV(B|F|G|H)?$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VESRA(B|F|G|H)?$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VESRAV(B|F|G|H)?$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VESRL(B|F|G|H)?$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VESRLV(B|F|G|H)?$")>; + +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VSL(DB)?$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VSLB$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VSR(A|L)$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VSR(A|L)B$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VSLD$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VSRD$")>; + +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VSB(I|IQ|CBI|CBIQ)?$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VSCBI(B|F|G|H|Q)?$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VS(F|G|H|Q)?$")>; + +def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VSUM(B|H)?$")>; +def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VSUMG(F|H)?$")>; +def : InstRW<[WLat4, VecMul, NormalGr], (instregex "VSUMQ(F|G)?$")>; + +//===----------------------------------------------------------------------===// +// Vector: Integer comparison +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat3, VecXsPm, NormalGr], (instregex "VEC(B|F|G|H)?$")>; +def : InstRW<[WLat3, VecXsPm, NormalGr], (instregex "VECL(B|F|G|H)?$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VCEQ(B|F|G|H)?$")>; +def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "VCEQ(B|F|G|H)S$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VCH(B|F|G|H)?$")>; +def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex 
"VCH(B|F|G|H)S$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VCHL(B|F|G|H)?$")>; +def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "VCHL(B|F|G|H)S$")>; +def : InstRW<[WLat4, VecStr, NormalGr], (instregex "VTM$")>; + +//===----------------------------------------------------------------------===// +// Vector: Floating-point arithmetic +//===----------------------------------------------------------------------===// + +// Conversion and rounding +def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VCFP(S|L)$")>; +def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VCD(L)?G$")>; +def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VCD(L)?GB$")>; +def : InstRW<[WLat6, VecBF, NormalGr], (instregex "WCD(L)?GB$")>; +def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VCE(L)?FB$")>; +def : InstRW<[WLat6, VecBF, NormalGr], (instregex "WCE(L)?FB$")>; +def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VC(S|L)FP$")>; +def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VC(L)?GD$")>; +def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VC(L)?GDB$")>; +def : InstRW<[WLat6, VecBF, NormalGr], (instregex "WC(L)?GDB$")>; +def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VC(L)?FEB$")>; +def : InstRW<[WLat6, VecBF, NormalGr], (instregex "WC(L)?FEB$")>; +def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VL(DE|ED)$")>; +def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VL(DE|ED)B$")>; +def : InstRW<[WLat6, VecBF, NormalGr], (instregex "WL(DE|ED)B$")>; +def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VFL(L|R)$")>; +def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VFL(LS|RD)$")>; +def : InstRW<[WLat6, VecBF, NormalGr], (instregex "WFL(LS|RD)$")>; +def : InstRW<[WLat6, VecBF, NormalGr], (instregex "WFLLD$")>; +def : InstRW<[WLat10, VecDF2, NormalGr], (instregex "WFLRX$")>; +def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VFI(DB)?$")>; +def : InstRW<[WLat6, VecBF, NormalGr], (instregex "WFIDB$")>; +def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VFISB$")>; +def : InstRW<[WLat6, VecBF, NormalGr], (instregex "WFISB$")>; +def : InstRW<[WLat10, VecDF2, NormalGr], (instregex "WFIXB$")>; + +// Sign operations +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VFPSO$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "(V|W)FPSODB$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "(V|W)FPSOSB$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "WFPSOXB$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "(V|W)FL(C|N|P)DB$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "(V|W)FL(C|N|P)SB$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "WFL(C|N|P)XB$")>; + +// Minimum / maximum +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VF(MAX|MIN)$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VF(MAX|MIN)DB$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "WF(MAX|MIN)DB$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VF(MAX|MIN)SB$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "WF(MAX|MIN)SB$")>; +def : InstRW<[WLat2, VecDFX, NormalGr], (instregex "WF(MAX|MIN)XB$")>; + +// Test data class +def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "VFTCI$")>; +def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "(V|W)FTCIDB$")>; +def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "(V|W)FTCISB$")>; +def : InstRW<[WLat3, WLat3, VecDFX, NormalGr], (instregex "WFTCIXB$")>; + +// Add / subtract +def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VF(A|S)$")>; +def : InstRW<[WLat6, VecBF, NormalGr], 
(instregex "VF(A|S)DB$")>; +def : InstRW<[WLat6, VecBF, NormalGr], (instregex "WF(A|S)DB$")>; +def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VF(A|S)SB$")>; +def : InstRW<[WLat6, VecBF, NormalGr], (instregex "WF(A|S)SB$")>; +def : InstRW<[WLat10, VecDF2, NormalGr], (instregex "WF(A|S)XB$")>; + +// Multiply / multiply-and-add/subtract +def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VFM(DB)?$")>; +def : InstRW<[WLat6, VecBF, NormalGr], (instregex "WFM(D|S)B$")>; +def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VFMSB$")>; +def : InstRW<[WLat20, VecDF2, NormalGr], (instregex "WFMXB$")>; +def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VF(N)?M(A|S)$")>; +def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VF(N)?M(A|S)DB$")>; +def : InstRW<[WLat6, VecBF, NormalGr], (instregex "WF(N)?M(A|S)DB$")>; +def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VF(N)?M(A|S)SB$")>; +def : InstRW<[WLat6, VecBF, NormalGr], (instregex "WF(N)?M(A|S)SB$")>; +def : InstRW<[WLat30, VecDF2, NormalGr], (instregex "WF(N)?M(A|S)XB$")>; + +// Divide / square root +def : InstRW<[WLat30, VecFPd, NormalGr], (instregex "VFD$")>; +def : InstRW<[WLat30, VecFPd, NormalGr], (instregex "(V|W)FDDB$")>; +def : InstRW<[WLat30, VecFPd, NormalGr], (instregex "(V|W)FDSB$")>; +def : InstRW<[WLat30, VecFPd, NormalGr], (instregex "WFDXB$")>; +def : InstRW<[WLat30, VecFPd, NormalGr], (instregex "VFSQ$")>; +def : InstRW<[WLat30, VecFPd, NormalGr], (instregex "(V|W)FSQDB$")>; +def : InstRW<[WLat30, VecFPd, NormalGr], (instregex "(V|W)FSQSB$")>; +def : InstRW<[WLat30, VecFPd, NormalGr], (instregex "WFSQXB$")>; + +//===----------------------------------------------------------------------===// +// Vector: Floating-point comparison +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VF(C|K)(E|H|HE)$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VF(C|K)(E|H|HE)DB$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "WFC(E|H|HE)DB$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "WFK(E|H|HE)DB$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VF(C|K)(E|H|HE)SB$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "WFC(E|H|HE)SB$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "WFK(E|H|HE)SB$")>; +def : InstRW<[WLat2, VecDFX, NormalGr], (instregex "WFC(E|H|HE)XB$")>; +def : InstRW<[WLat2, VecDFX, NormalGr], (instregex "WFK(E|H|HE)XB$")>; +def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "VFC(E|H|HE)DBS$")>; +def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "VFK(E|H|HE)DBS$")>; +def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], + (instregex "WF(C|K)(E|H|HE)DBS$")>; +def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], + (instregex "VF(C|K)(E|H|HE)SBS$")>; +def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "WFC(E|H|HE)SBS$")>; +def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "WFK(E|H|HE)SBS$")>; +def : InstRW<[WLat3, WLat3, VecDFX, NormalGr], (instregex "WFC(E|H|HE)XBS$")>; +def : InstRW<[WLat3, WLat3, VecDFX, NormalGr], (instregex "WFK(E|H|HE)XBS$")>; +def : InstRW<[WLat3, VecXsPm, NormalGr], (instregex "WF(C|K)$")>; +def : InstRW<[WLat3, VecXsPm, NormalGr], (instregex "WF(C|K)DB$")>; +def : InstRW<[WLat3, VecXsPm, NormalGr], (instregex "WF(C|K)SB$")>; +def : InstRW<[WLat3, VecDFX, NormalGr], (instregex "WF(C|K)XB$")>; + +//===----------------------------------------------------------------------===// +// Vector: Floating-point insertion and extraction 
+//===----------------------------------------------------------------------===// + +def : InstRW<[WLat1, FXb, NormalGr], (instregex "LEFR$")>; +def : InstRW<[WLat3, FXb, NormalGr], (instregex "LFER$")>; + +//===----------------------------------------------------------------------===// +// Vector: String instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat3, VecStr, NormalGr], (instregex "VFAE(B)?$")>; +def : InstRW<[WLat3, VecStr, NormalGr], (instregex "VFAE(F|H)$")>; +def : InstRW<[WLat4, WLat4, VecStr, NormalGr], (instregex "VFAE(B|F|H)S$")>; +def : InstRW<[WLat3, VecStr, NormalGr], (instregex "VFAEZ(B|F|H)$")>; +def : InstRW<[WLat4, WLat4, VecStr, NormalGr], (instregex "VFAEZ(B|F|H)S$")>; +def : InstRW<[WLat3, VecStr, NormalGr], (instregex "VFEE(B|F|H|ZB|ZF|ZH)?$")>; +def : InstRW<[WLat4, WLat4, VecStr, NormalGr], + (instregex "VFEE(B|F|H|ZB|ZF|ZH)S$")>; +def : InstRW<[WLat3, VecStr, NormalGr], (instregex "VFENE(B|F|H|ZB|ZF|ZH)?$")>; +def : InstRW<[WLat4, WLat4, VecStr, NormalGr], + (instregex "VFENE(B|F|H|ZB|ZF|ZH)S$")>; +def : InstRW<[WLat3, VecStr, NormalGr], (instregex "VISTR(B|F|H)?$")>; +def : InstRW<[WLat4, WLat4, VecStr, NormalGr], (instregex "VISTR(B|F|H)S$")>; +def : InstRW<[WLat3, VecStr, NormalGr], (instregex "VSTRC(B|F|H)?$")>; +def : InstRW<[WLat4, WLat4, VecStr, NormalGr], (instregex "VSTRC(B|F|H)S$")>; +def : InstRW<[WLat3, VecStr, NormalGr], (instregex "VSTRCZ(B|F|H)$")>; +def : InstRW<[WLat4, WLat4, VecStr, NormalGr], (instregex "VSTRCZ(B|F|H)S$")>; +def : InstRW<[WLat4, WLat4, VecStr, NormalGr], (instregex "VSTRS(B|F|H)?$")>; +def : InstRW<[WLat4, WLat4, VecStr, NormalGr], (instregex "VSTRSZ(B|F|H)$")>; + +//===----------------------------------------------------------------------===// +// Vector: Packed-decimal instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat2, VecDFX, NormalGr], (instregex "VLIP$")>; +def : InstRW<[WLat6, VecDFX, LSU, GroupAlone2], (instregex "VPKZ$")>; +def : InstRW<[WLat1, VecDFX, FXb, LSU2, GroupAlone2], (instregex "VUPKZ$")>; +def : InstRW<[WLat20, WLat20, VecDF2, FXb, GroupAlone], + (instregex "VCVB(G)?(Opt)?$")>; +def : InstRW<[WLat15, WLat15, VecDF2, FXb, GroupAlone], + (instregex "VCVD(G)?$")>; +def : InstRW<[WLat4, WLat4, VecDFX, NormalGr], (instregex "V(A|S)P$")>; +def : InstRW<[WLat30, WLat30, VecDF2, GroupAlone], (instregex "VM(S)?P$")>; +def : InstRW<[WLat30, WLat30, VecDF2, GroupAlone], (instregex "V(D|R)P$")>; +def : InstRW<[WLat30, WLat30, VecDF2, GroupAlone], (instregex "VSDP$")>; +def : InstRW<[WLat10, WLat10, VecDF2, NormalGr], (instregex "VSRP$")>; +def : InstRW<[WLat4, WLat4, VecDFX, NormalGr], (instregex "VPSOP$")>; +def : InstRW<[WLat2, VecDFX, NormalGr], (instregex "V(T|C)P$")>; + + +// -------------------------------- System ---------------------------------- // + +//===----------------------------------------------------------------------===// +// System: Program-Status Word Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat30, WLat30, MCD], (instregex "EPSW$")>; +def : InstRW<[WLat20, GroupAlone3], (instregex "LPSW(E)?$")>; +def : InstRW<[WLat3, FXa, GroupAlone], (instregex "IPK$")>; +def : InstRW<[WLat1, LSU, EndGroup], (instregex "SPKA$")>; +def : InstRW<[WLat1, LSU, EndGroup], (instregex "SSM$")>; +def : InstRW<[WLat1, FXb, LSU, GroupAlone], (instregex "ST(N|O)SM$")>; +def : InstRW<[WLat3, FXa, NormalGr], (instregex 
"IAC$")>; +def : InstRW<[WLat1, LSU, EndGroup], (instregex "SAC(F)?$")>; + +//===----------------------------------------------------------------------===// +// System: Control Register Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat4LSU, WLat4LSU, LSU2, GroupAlone], (instregex "LCTL(G)?$")>; +def : InstRW<[WLat1, LSU5, FXb, GroupAlone2], (instregex "STCT(L|G)$")>; +def : InstRW<[LSULatency, LSU, NormalGr], (instregex "E(P|S)A(I)?R$")>; +def : InstRW<[WLat30, MCD], (instregex "SSA(I)?R$")>; +def : InstRW<[WLat30, MCD], (instregex "ESEA$")>; + +//===----------------------------------------------------------------------===// +// System: Prefix-Register Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat30, MCD], (instregex "S(T)?PX$")>; + +//===----------------------------------------------------------------------===// +// System: Storage-Key and Real Memory Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat30, MCD], (instregex "ISKE$")>; +def : InstRW<[WLat30, MCD], (instregex "IVSK$")>; +def : InstRW<[WLat30, MCD], (instregex "SSKE(Opt)?$")>; +def : InstRW<[WLat30, MCD], (instregex "RRB(E|M)$")>; +def : InstRW<[WLat30, MCD], (instregex "IRBM$")>; +def : InstRW<[WLat30, MCD], (instregex "PFMF$")>; +def : InstRW<[WLat30, WLat30, MCD], (instregex "TB$")>; +def : InstRW<[WLat30, MCD], (instregex "PGIN$")>; +def : InstRW<[WLat30, MCD], (instregex "PGOUT$")>; + +//===----------------------------------------------------------------------===// +// System: Dynamic-Address-Translation Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat30, MCD], (instregex "IPTE(Opt)?(Opt)?$")>; +def : InstRW<[WLat30, MCD], (instregex "IDTE(Opt)?$")>; +def : InstRW<[WLat30, MCD], (instregex "CRDTE(Opt)?$")>; +def : InstRW<[WLat30, MCD], (instregex "PTLB$")>; +def : InstRW<[WLat30, WLat30, MCD], (instregex "CSP(G)?$")>; +def : InstRW<[WLat30, WLat30, WLat30, MCD], (instregex "LPTEA$")>; +def : InstRW<[WLat30, WLat30, MCD], (instregex "LRA(Y|G)?$")>; +def : InstRW<[WLat30, MCD], (instregex "STRAG$")>; +def : InstRW<[WLat30, MCD], (instregex "LURA(G)?$")>; +def : InstRW<[WLat30, MCD], (instregex "STUR(A|G)$")>; +def : InstRW<[WLat30, MCD], (instregex "TPROT$")>; + +//===----------------------------------------------------------------------===// +// System: Memory-move Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat4LSU, FXa2, FXb, LSU5, GroupAlone2], (instregex "MVC(K|P|S)$")>; +def : InstRW<[WLat1, FXa, LSU5, GroupAlone2], (instregex "MVC(S|D)K$")>; +def : InstRW<[WLat30, MCD], (instregex "MVCOS$")>; +def : InstRW<[WLat30, MCD], (instregex "MVPG$")>; + +//===----------------------------------------------------------------------===// +// System: Address-Space Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat30, MCD], (instregex "LASP$")>; +def : InstRW<[WLat1, LSU, GroupAlone], (instregex "PALB$")>; +def : InstRW<[WLat30, MCD], (instregex "PC$")>; +def : InstRW<[WLat30, MCD], (instregex "PR$")>; +def : InstRW<[WLat30, MCD], (instregex "PT(I)?$")>; +def : InstRW<[WLat30, MCD], (instregex "RP$")>; +def : InstRW<[WLat30, MCD], (instregex "BS(G|A)$")>; +def : InstRW<[WLat30, MCD], (instregex "TAR$")>; + 
+//===----------------------------------------------------------------------===// +// System: Linkage-Stack Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat30, MCD], (instregex "BAKR$")>; +def : InstRW<[WLat30, MCD], (instregex "EREG(G)?$")>; +def : InstRW<[WLat30, WLat30, MCD], (instregex "(E|M)STA$")>; + +//===----------------------------------------------------------------------===// +// System: Time-Related Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat30, MCD], (instregex "PTFF$")>; +def : InstRW<[WLat30, MCD], (instregex "SCK(PF|C)?$")>; +def : InstRW<[WLat1, LSU2, GroupAlone], (instregex "SPT$")>; +def : InstRW<[WLat15, LSU3, FXa2, FXb, GroupAlone2], (instregex "STCK(F)?$")>; +def : InstRW<[WLat20, LSU4, FXa2, FXb2, GroupAlone3], (instregex "STCKE$")>; +def : InstRW<[WLat30, MCD], (instregex "STCKC$")>; +def : InstRW<[WLat1, LSU2, FXb, Cracked], (instregex "STPT$")>; + +//===----------------------------------------------------------------------===// +// System: CPU-Related Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat30, MCD], (instregex "STAP$")>; +def : InstRW<[WLat30, MCD], (instregex "STIDP$")>; +def : InstRW<[WLat30, WLat30, MCD], (instregex "STSI$")>; +def : InstRW<[WLat30, WLat30, MCD], (instregex "STFL(E)?$")>; +def : InstRW<[WLat30, MCD], (instregex "ECAG$")>; +def : InstRW<[WLat30, WLat30, MCD], (instregex "ECTG$")>; +def : InstRW<[WLat30, MCD], (instregex "PTF$")>; +def : InstRW<[WLat30, MCD], (instregex "PCKMO$")>; + +//===----------------------------------------------------------------------===// +// System: Miscellaneous Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat30, MCD], (instregex "SVC$")>; +def : InstRW<[WLat1, FXb, GroupAlone], (instregex "MC$")>; +def : InstRW<[WLat30, MCD], (instregex "DIAG$")>; +def : InstRW<[WLat1, FXb, NormalGr], (instregex "TRAC(E|G)$")>; +def : InstRW<[WLat30, MCD], (instregex "TRAP(2|4)$")>; +def : InstRW<[WLat30, MCD], (instregex "SIG(P|A)$")>; +def : InstRW<[WLat30, MCD], (instregex "SIE$")>; + +//===----------------------------------------------------------------------===// +// System: CPU-Measurement Facility Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat1, FXb, NormalGr], (instregex "LPP$")>; +def : InstRW<[WLat30, WLat30, MCD], (instregex "ECPGA$")>; +def : InstRW<[WLat30, WLat30, MCD], (instregex "E(C|P)CTR$")>; +def : InstRW<[WLat30, MCD], (instregex "LCCTL$")>; +def : InstRW<[WLat30, MCD], (instregex "L(P|S)CTL$")>; +def : InstRW<[WLat30, MCD], (instregex "Q(S|CTR)I$")>; +def : InstRW<[WLat30, MCD], (instregex "S(C|P)CTR$")>; + +//===----------------------------------------------------------------------===// +// System: I/O Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat30, MCD], (instregex "(C|H|R|X)SCH$")>; +def : InstRW<[WLat30, MCD], (instregex "(M|S|ST|T)SCH$")>; +def : InstRW<[WLat30, MCD], (instregex "RCHP$")>; +def : InstRW<[WLat30, MCD], (instregex "SCHM$")>; +def : InstRW<[WLat30, MCD], (instregex "STC(PS|RW)$")>; +def : InstRW<[WLat30, MCD], (instregex "TPI$")>; +def : InstRW<[WLat30, MCD], (instregex "SAL$")>; + +} + diff --git a/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp 
b/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp index a50e6aa59711..47c925dcf730 100644 --- a/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp +++ b/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp @@ -209,10 +209,10 @@ std::pair SystemZSelectionDAGInfo::EmitTargetCodeForMemchr( // Now select between End and null, depending on whether the character // was found. - SDValue Ops[] = {End, DAG.getConstant(0, DL, PtrVT), - DAG.getConstant(SystemZ::CCMASK_SRST, DL, MVT::i32), - DAG.getConstant(SystemZ::CCMASK_SRST_FOUND, DL, MVT::i32), - CCReg}; + SDValue Ops[] = { + End, DAG.getConstant(0, DL, PtrVT), + DAG.getTargetConstant(SystemZ::CCMASK_SRST, DL, MVT::i32), + DAG.getTargetConstant(SystemZ::CCMASK_SRST_FOUND, DL, MVT::i32), CCReg}; End = DAG.getNode(SystemZISD::SELECT_CCMASK, DL, PtrVT, Ops); return std::make_pair(End, Chain); } diff --git a/lib/Target/SystemZ/SystemZShortenInst.cpp b/lib/Target/SystemZ/SystemZShortenInst.cpp index e79dfc5b4b9e..2aca22c9082a 100644 --- a/lib/Target/SystemZ/SystemZShortenInst.cpp +++ b/lib/Target/SystemZ/SystemZShortenInst.cpp @@ -75,7 +75,7 @@ static void tieOpsIfNeeded(MachineInstr &MI) { // instead of IIxF. bool SystemZShortenInst::shortenIIF(MachineInstr &MI, unsigned LLIxL, unsigned LLIxH) { - unsigned Reg = MI.getOperand(0).getReg(); + Register Reg = MI.getOperand(0).getReg(); // The new opcode will clear the other half of the GR64 reg, so // cancel if that is live. unsigned thisSubRegIdx = @@ -86,7 +86,7 @@ bool SystemZShortenInst::shortenIIF(MachineInstr &MI, unsigned LLIxL, : SystemZ::subreg_l32); unsigned GR64BitReg = TRI->getMatchingSuperReg(Reg, thisSubRegIdx, &SystemZ::GR64BitRegClass); - unsigned OtherReg = TRI->getSubReg(GR64BitReg, otherSubRegIdx); + Register OtherReg = TRI->getSubReg(GR64BitReg, otherSubRegIdx); if (LiveRegs.contains(OtherReg)) return false; diff --git a/lib/Target/SystemZ/SystemZTargetMachine.cpp b/lib/Target/SystemZ/SystemZTargetMachine.cpp index 5c49e6eff0bf..20865037fe38 100644 --- a/lib/Target/SystemZ/SystemZTargetMachine.cpp +++ b/lib/Target/SystemZ/SystemZTargetMachine.cpp @@ -154,7 +154,7 @@ SystemZTargetMachine::SystemZTargetMachine(const Target &T, const Triple &TT, getEffectiveRelocModel(RM), getEffectiveSystemZCodeModel(CM, getEffectiveRelocModel(RM), JIT), OL), - TLOF(llvm::make_unique()), + TLOF(std::make_unique()), Subtarget(TT, CPU, FS, *this) { initAsmInfo(); } @@ -176,7 +176,7 @@ public: ScheduleDAGInstrs * createPostMachineScheduler(MachineSchedContext *C) const override { return new ScheduleDAGMI(C, - llvm::make_unique(C), + std::make_unique(C), /*RemoveKillFlags=*/true); } @@ -184,6 +184,7 @@ public: bool addInstSelector() override; bool addILPOpts() override; void addPostRewrite() override; + void addPostRegAlloc() override; void addPreSched2() override; void addPreEmitPass() override; }; @@ -217,14 +218,14 @@ void SystemZPassConfig::addPostRewrite() { addPass(createSystemZPostRewritePass(getSystemZTargetMachine())); } -void SystemZPassConfig::addPreSched2() { +void SystemZPassConfig::addPostRegAlloc() { // PostRewrite needs to be run at -O0 also (in which case addPostRewrite() // is not called). 
if (getOptLevel() == CodeGenOpt::None) addPass(createSystemZPostRewritePass(getSystemZTargetMachine())); +} - addPass(createSystemZExpandPseudoPass(getSystemZTargetMachine())); - +void SystemZPassConfig::addPreSched2() { if (getOptLevel() != CodeGenOpt::None) addPass(&IfConverterID); } diff --git a/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp b/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp index 145cf87ef9f5..11c99aa11174 100644 --- a/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp +++ b/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp @@ -304,7 +304,8 @@ bool SystemZTTIImpl::isLSRCostLess(TargetTransformInfo::LSRCost &C1, C2.ScaleCost, C2.SetupCost); } -unsigned SystemZTTIImpl::getNumberOfRegisters(bool Vector) { +unsigned SystemZTTIImpl::getNumberOfRegisters(unsigned ClassID) const { + bool Vector = (ClassID == 1); if (!Vector) // Discount the stack pointer. Also leave out %r0, since it can't // be used in an address. @@ -707,7 +708,7 @@ int SystemZTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, // TODO: Fix base implementation which could simplify things a bit here // (seems to miss on differentiating on scalar/vector types). - // Only 64 bit vector conversions are natively supported before arch13. + // Only 64 bit vector conversions are natively supported before z15. if (DstScalarBits == 64 || ST->hasVectorEnhancements2()) { if (SrcScalarBits == DstScalarBits) return NumDstVectors; diff --git a/lib/Target/SystemZ/SystemZTargetTransformInfo.h b/lib/Target/SystemZ/SystemZTargetTransformInfo.h index 16ce2ef1d7a0..3ba80b31439f 100644 --- a/lib/Target/SystemZ/SystemZTargetTransformInfo.h +++ b/lib/Target/SystemZ/SystemZTargetTransformInfo.h @@ -56,12 +56,12 @@ public: /// \name Vector TTI Implementations /// @{ - unsigned getNumberOfRegisters(bool Vector); + unsigned getNumberOfRegisters(unsigned ClassID) const; unsigned getRegisterBitWidth(bool Vector) const; - unsigned getCacheLineSize() { return 256; } - unsigned getPrefetchDistance() { return 2000; } - unsigned getMinPrefetchStride() { return 2048; } + unsigned getCacheLineSize() const override { return 256; } + unsigned getPrefetchDistance() const override { return 2000; } + unsigned getMinPrefetchStride() const override { return 2048; } bool hasDivRemOp(Type *DataType, bool IsSigned); bool prefersVectorizedAddressing() { return false; } diff --git a/lib/Target/TargetLoweringObjectFile.cpp b/lib/Target/TargetLoweringObjectFile.cpp index 17274e1c2c6e..dcd3934de0fa 100644 --- a/lib/Target/TargetLoweringObjectFile.cpp +++ b/lib/Target/TargetLoweringObjectFile.cpp @@ -253,6 +253,7 @@ MCSection *TargetLoweringObjectFile::SectionForGlobal( auto Attrs = GVar->getAttributes(); if ((Attrs.hasAttribute("bss-section") && Kind.isBSS()) || (Attrs.hasAttribute("data-section") && Kind.isData()) || + (Attrs.hasAttribute("relro-section") && Kind.isReadOnlyWithRel()) || (Attrs.hasAttribute("rodata-section") && Kind.isReadOnly())) { return getExplicitSectionGlobal(GO, Kind, TM); } diff --git a/lib/Target/TargetMachine.cpp b/lib/Target/TargetMachine.cpp index 634866d93570..4c98e140f446 100644 --- a/lib/Target/TargetMachine.cpp +++ b/lib/Target/TargetMachine.cpp @@ -63,18 +63,6 @@ void TargetMachine::resetTargetOptions(const Function &F) const { RESET_OPTION(NoInfsFPMath, "no-infs-fp-math"); RESET_OPTION(NoNaNsFPMath, "no-nans-fp-math"); RESET_OPTION(NoSignedZerosFPMath, "no-signed-zeros-fp-math"); - RESET_OPTION(NoTrappingFPMath, "no-trapping-math"); - - StringRef Denormal = - 
F.getFnAttribute("denormal-fp-math").getValueAsString(); - if (Denormal == "ieee") - Options.FPDenormalMode = FPDenormal::IEEE; - else if (Denormal == "preserve-sign") - Options.FPDenormalMode = FPDenormal::PreserveSign; - else if (Denormal == "positive-zero") - Options.FPDenormalMode = FPDenormal::PositiveZero; - else - Options.FPDenormalMode = DefaultOptions.FPDenormalMode; } /// Returns the code generation relocation model. The choices are static, PIC, @@ -140,8 +128,8 @@ bool TargetMachine::shouldAssumeDSOLocal(const Module &M, // don't assume the variables to be DSO local unless we actually know // that for sure. This only has to be done for variables; for functions // the linker can insert thunks for calling functions from another DLL. - if (TT.isWindowsGNUEnvironment() && GV && GV->isDeclarationForLinker() && - isa(GV)) + if (TT.isWindowsGNUEnvironment() && TT.isOSBinFormatCOFF() && GV && + GV->isDeclarationForLinker() && isa(GV)) return false; // On COFF, don't mark 'extern_weak' symbols as DSO local. If these symbols @@ -154,7 +142,9 @@ bool TargetMachine::shouldAssumeDSOLocal(const Module &M, // Make an exception for windows OS in the triple: Some firmware builds use // *-win32-macho triples. This (accidentally?) produced windows relocations // without GOT tables in older clang versions; Keep this behaviour. - if (TT.isOSBinFormatCOFF() || (TT.isOSWindows() && TT.isOSBinFormatMachO())) + // Some JIT users use *-win32-elf triples; these shouldn't use GOT tables + // either. + if (TT.isOSBinFormatCOFF() || TT.isOSWindows()) return true; // Most PIC code sequences that assume that a symbol is local cannot diff --git a/lib/Target/TargetMachineC.cpp b/lib/Target/TargetMachineC.cpp index 5d9029682fdd..3ac9c38dfc0b 100644 --- a/lib/Target/TargetMachineC.cpp +++ b/lib/Target/TargetMachineC.cpp @@ -219,7 +219,7 @@ static LLVMBool LLVMTargetMachineEmit(LLVMTargetMachineRef T, LLVMModuleRef M, LLVMBool LLVMTargetMachineEmitToFile(LLVMTargetMachineRef T, LLVMModuleRef M, char* Filename, LLVMCodeGenFileType codegen, char** ErrorMessage) { std::error_code EC; - raw_fd_ostream dest(Filename, EC, sys::fs::F_None); + raw_fd_ostream dest(Filename, EC, sys::fs::OF_None); if (EC) { *ErrorMessage = strdup(EC.message().c_str()); return true; diff --git a/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp b/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp index 09628e872dd5..53a96fd6a97d 100644 --- a/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp +++ b/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp @@ -313,16 +313,17 @@ public: return Optional(); } - WebAssembly::ExprType parseBlockType(StringRef ID) { - return StringSwitch(ID) - .Case("i32", WebAssembly::ExprType::I32) - .Case("i64", WebAssembly::ExprType::I64) - .Case("f32", WebAssembly::ExprType::F32) - .Case("f64", WebAssembly::ExprType::F64) - .Case("v128", WebAssembly::ExprType::V128) - .Case("exnref", WebAssembly::ExprType::Exnref) - .Case("void", WebAssembly::ExprType::Void) - .Default(WebAssembly::ExprType::Invalid); + WebAssembly::BlockType parseBlockType(StringRef ID) { + // Multivalue block types are handled separately in parseSignature + return StringSwitch(ID) + .Case("i32", WebAssembly::BlockType::I32) + .Case("i64", WebAssembly::BlockType::I64) + .Case("f32", WebAssembly::BlockType::F32) + .Case("f64", WebAssembly::BlockType::F64) + .Case("v128", WebAssembly::BlockType::V128) + .Case("exnref", WebAssembly::BlockType::Exnref) + .Case("void", WebAssembly::BlockType::Void) + 
.Default(WebAssembly::BlockType::Invalid); } bool parseRegTypeList(SmallVectorImpl &Types) { @@ -343,7 +344,7 @@ public: int64_t Val = Int.getIntVal(); if (IsNegative) Val = -Val; - Operands.push_back(make_unique( + Operands.push_back(std::make_unique( WebAssemblyOperand::Integer, Int.getLoc(), Int.getEndLoc(), WebAssemblyOperand::IntOp{Val})); Parser.Lex(); @@ -356,7 +357,7 @@ public: return error("Cannot parse real: ", Flt); if (IsNegative) Val = -Val; - Operands.push_back(make_unique( + Operands.push_back(std::make_unique( WebAssemblyOperand::Float, Flt.getLoc(), Flt.getEndLoc(), WebAssemblyOperand::FltOp{Val})); Parser.Lex(); @@ -378,7 +379,7 @@ public: } if (IsNegative) Val = -Val; - Operands.push_back(make_unique( + Operands.push_back(std::make_unique( WebAssemblyOperand::Float, Flt.getLoc(), Flt.getEndLoc(), WebAssemblyOperand::FltOp{Val})); Parser.Lex(); @@ -407,7 +408,7 @@ public: // an opcode until after the assembly matcher, so set a default to fix // up later. auto Tok = Lexer.getTok(); - Operands.push_back(make_unique( + Operands.push_back(std::make_unique( WebAssemblyOperand::Integer, Tok.getLoc(), Tok.getEndLoc(), WebAssemblyOperand::IntOp{-1})); } @@ -416,8 +417,8 @@ public: } void addBlockTypeOperand(OperandVector &Operands, SMLoc NameLoc, - WebAssembly::ExprType BT) { - Operands.push_back(make_unique( + WebAssembly::BlockType BT) { + Operands.push_back(std::make_unique( WebAssemblyOperand::Integer, NameLoc, NameLoc, WebAssemblyOperand::IntOp{static_cast(BT)})); } @@ -449,13 +450,14 @@ public: } // Now construct the name as first operand. - Operands.push_back(make_unique( + Operands.push_back(std::make_unique( WebAssemblyOperand::Token, NameLoc, SMLoc::getFromPointer(Name.end()), WebAssemblyOperand::TokOp{Name})); // If this instruction is part of a control flow structure, ensure // proper nesting. bool ExpectBlockType = false; + bool ExpectFuncType = false; if (Name == "block") { push(Block); ExpectBlockType = true; @@ -489,9 +491,37 @@ public: if (pop(Name, Block)) return true; } else if (Name == "end_function") { + ensureLocals(getStreamer()); CurrentState = EndFunction; if (pop(Name, Function) || ensureEmptyNestingStack()) return true; + } else if (Name == "call_indirect" || Name == "return_call_indirect") { + ExpectFuncType = true; + } + + if (ExpectFuncType || (ExpectBlockType && Lexer.is(AsmToken::LParen))) { + // This has a special TYPEINDEX operand which in text we + // represent as a signature, such that we can re-build this signature, + // attach it to an anonymous symbol, which is what WasmObjectWriter + // expects to be able to recreate the actual unique-ified type indices. + auto Loc = Parser.getTok(); + auto Signature = std::make_unique(); + if (parseSignature(Signature.get())) + return true; + // Got signature as block type, don't need more + ExpectBlockType = false; + auto &Ctx = getStreamer().getContext(); + // The "true" here will cause this to be a nameless symbol. 
+ MCSymbol *Sym = Ctx.createTempSymbol("typeindex", true); + auto *WasmSym = cast(Sym); + WasmSym->setSignature(Signature.get()); + addSignature(std::move(Signature)); + WasmSym->setType(wasm::WASM_SYMBOL_TYPE_FUNCTION); + const MCExpr *Expr = MCSymbolRefExpr::create( + WasmSym, MCSymbolRefExpr::VK_WASM_TYPEINDEX, Ctx); + Operands.push_back(std::make_unique( + WebAssemblyOperand::Symbol, Loc.getLoc(), Loc.getEndLoc(), + WebAssemblyOperand::SymOp{Expr})); } while (Lexer.isNot(AsmToken::EndOfStatement)) { @@ -504,7 +534,7 @@ public: if (ExpectBlockType) { // Assume this identifier is a block_type. auto BT = parseBlockType(Id.getString()); - if (BT == WebAssembly::ExprType::Invalid) + if (BT == WebAssembly::BlockType::Invalid) return error("Unknown block type: ", Id); addBlockTypeOperand(Operands, NameLoc, BT); Parser.Lex(); @@ -514,7 +544,7 @@ public: SMLoc End; if (Parser.parseExpression(Val, End)) return error("Cannot parse symbol: ", Lexer.getTok()); - Operands.push_back(make_unique( + Operands.push_back(std::make_unique( WebAssemblyOperand::Symbol, Id.getLoc(), Id.getEndLoc(), WebAssemblyOperand::SymOp{Val})); if (checkForP2AlignIfLoadStore(Operands, Name)) @@ -549,7 +579,7 @@ public: } case AsmToken::LCurly: { Parser.Lex(); - auto Op = make_unique( + auto Op = std::make_unique( WebAssemblyOperand::BrList, Tok.getLoc(), Tok.getEndLoc()); if (!Lexer.is(AsmToken::RCurly)) for (;;) { @@ -572,7 +602,7 @@ public: } if (ExpectBlockType && Operands.size() == 1) { // Support blocks with no operands as default to void. - addBlockTypeOperand(Operands, NameLoc, WebAssembly::ExprType::Void); + addBlockTypeOperand(Operands, NameLoc, WebAssembly::BlockType::Void); } Parser.Lex(); return false; @@ -671,7 +701,7 @@ public: LastFunctionLabel = LastLabel; push(Function); } - auto Signature = make_unique(); + auto Signature = std::make_unique(); if (parseSignature(Signature.get())) return true; WasmSym->setSignature(Signature.get()); @@ -687,7 +717,7 @@ public: if (SymName.empty()) return true; auto WasmSym = cast(Ctx.getOrCreateSymbol(SymName)); - auto Signature = make_unique(); + auto Signature = std::make_unique(); if (parseRegTypeList(Signature->Params)) return true; WasmSym->setSignature(Signature.get()); @@ -737,24 +767,30 @@ public: return true; // We didn't process this directive. } + // Called either when the first instruction is parsed of the function ends. + void ensureLocals(MCStreamer &Out) { + if (CurrentState == FunctionStart) { + // We haven't seen a .local directive yet. The streamer requires locals to + // be encoded as a prelude to the instructions, so emit an empty list of + // locals here. + auto &TOut = reinterpret_cast( + *Out.getTargetStreamer()); + TOut.emitLocal(SmallVector()); + CurrentState = FunctionLocals; + } + } + bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned & /*Opcode*/, OperandVector &Operands, MCStreamer &Out, uint64_t &ErrorInfo, bool MatchingInlineAsm) override { MCInst Inst; + Inst.setLoc(IDLoc); unsigned MatchResult = MatchInstructionImpl(Operands, Inst, ErrorInfo, MatchingInlineAsm); switch (MatchResult) { case Match_Success: { - if (CurrentState == FunctionStart) { - // This is the first instruction in a function, but we haven't seen - // a .local directive yet. The streamer requires locals to be encoded - // as a prelude to the instructions, so emit an empty list of locals - // here. - auto &TOut = reinterpret_cast( - *Out.getTargetStreamer()); - TOut.emitLocal(SmallVector()); - } + ensureLocals(Out); // Fix unknown p2align operands. 
auto Align = WebAssembly::GetDefaultP2AlignAny(Inst.getOpcode()); if (Align != -1U) { diff --git a/lib/Target/WebAssembly/Disassembler/WebAssemblyDisassembler.cpp b/lib/Target/WebAssembly/Disassembler/WebAssemblyDisassembler.cpp index f9bf3f85d30f..9a9c31cff2d5 100644 --- a/lib/Target/WebAssembly/Disassembler/WebAssemblyDisassembler.cpp +++ b/lib/Target/WebAssembly/Disassembler/WebAssemblyDisassembler.cpp @@ -24,6 +24,7 @@ #include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/MC/MCSymbol.h" +#include "llvm/MC/MCSymbolWasm.h" #include "llvm/Support/Endian.h" #include "llvm/Support/LEB128.h" #include "llvm/Support/TargetRegistry.h" @@ -213,10 +214,29 @@ MCDisassembler::DecodeStatus WebAssemblyDisassembler::getInstruction( return MCDisassembler::Fail; break; } - // block_type operands (uint8_t). + // block_type operands: case WebAssembly::OPERAND_SIGNATURE: { - if (!parseImmediate(MI, Size, Bytes)) + int64_t Val; + uint64_t PrevSize = Size; + if (!nextLEB(Val, Bytes, Size, true)) return MCDisassembler::Fail; + if (Val < 0) { + // Negative values are single septet value types or empty types + if (Size != PrevSize + 1) { + MI.addOperand( + MCOperand::createImm(int64_t(WebAssembly::BlockType::Invalid))); + } else { + MI.addOperand(MCOperand::createImm(Val & 0x7f)); + } + } else { + // We don't have access to the signature, so create a symbol without one + MCSymbol *Sym = getContext().createTempSymbol("typeindex", true); + auto *WasmSym = cast(Sym); + WasmSym->setType(wasm::WASM_SYMBOL_TYPE_FUNCTION); + const MCExpr *Expr = MCSymbolRefExpr::create( + WasmSym, MCSymbolRefExpr::VK_WASM_TYPEINDEX, getContext()); + MI.addOperand(MCOperand::createExpr(Expr)); + } break; } // FP operands. diff --git a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyAsmBackend.cpp b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyAsmBackend.cpp index 70b409cf4a90..8314de41021f 100644 --- a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyAsmBackend.cpp +++ b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyAsmBackend.cpp @@ -31,10 +31,12 @@ namespace { class WebAssemblyAsmBackend final : public MCAsmBackend { bool Is64Bit; + bool IsEmscripten; public: - explicit WebAssemblyAsmBackend(bool Is64Bit) - : MCAsmBackend(support::little), Is64Bit(Is64Bit) {} + explicit WebAssemblyAsmBackend(bool Is64Bit, bool IsEmscripten) + : MCAsmBackend(support::little), Is64Bit(Is64Bit), + IsEmscripten(IsEmscripten) {} unsigned getNumFixupKinds() const override { return WebAssembly::NumTargetFixupKinds; @@ -123,11 +125,11 @@ void WebAssemblyAsmBackend::applyFixup(const MCAssembler &Asm, std::unique_ptr WebAssemblyAsmBackend::createObjectTargetWriter() const { - return createWebAssemblyWasmObjectWriter(Is64Bit); + return createWebAssemblyWasmObjectWriter(Is64Bit, IsEmscripten); } } // end anonymous namespace MCAsmBackend *llvm::createWebAssemblyAsmBackend(const Triple &TT) { - return new WebAssemblyAsmBackend(TT.isArch64Bit()); + return new WebAssemblyAsmBackend(TT.isArch64Bit(), TT.isOSEmscripten()); } diff --git a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.cpp b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.cpp index b5d4d369b726..221ac17b8336 100644 --- a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.cpp +++ b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.cpp @@ -15,6 +15,7 @@ #include "MCTargetDesc/WebAssemblyMCTargetDesc.h" #include "WebAssembly.h" #include "WebAssemblyMachineFunctionInfo.h" +#include "WebAssemblyUtilities.h" #include "llvm/ADT/SmallSet.h" 
#include "llvm/ADT/StringExtras.h" #include "llvm/CodeGen/TargetRegisterInfo.h" @@ -51,7 +52,9 @@ void WebAssemblyInstPrinter::printInst(const MCInst *MI, raw_ostream &OS, // Print any additional variadic operands. const MCInstrDesc &Desc = MII.get(MI->getOpcode()); - if (Desc.isVariadic()) + if (Desc.isVariadic()) { + if (Desc.getNumOperands() == 0 && MI->getNumOperands() > 0) + OS << "\t"; for (auto I = Desc.getNumOperands(), E = MI->getNumOperands(); I < E; ++I) { // FIXME: For CALL_INDIRECT_VOID, don't print a leading comma, because // we have an extra flags operand which is not currently printed, for @@ -62,6 +65,7 @@ void WebAssemblyInstPrinter::printInst(const MCInst *MI, raw_ostream &OS, OS << ", "; printOperand(MI, I, OS); } + } // Print any added annotation. printAnnotation(OS, Annot); @@ -232,7 +236,16 @@ void WebAssemblyInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, } } else { assert(Op.isExpr() && "unknown operand kind in printOperand"); - Op.getExpr()->print(O, &MAI); + // call_indirect instructions have a TYPEINDEX operand that we print + // as a signature here, such that the assembler can recover this + // information. + auto SRE = static_cast(Op.getExpr()); + if (SRE->getKind() == MCSymbolRefExpr::VK_WASM_TYPEINDEX) { + auto &Sym = static_cast(SRE->getSymbol()); + O << WebAssembly::signatureToString(Sym.getSignature()); + } else { + Op.getExpr()->print(O, &MAI); + } } } @@ -259,14 +272,26 @@ void WebAssemblyInstPrinter::printWebAssemblyP2AlignOperand(const MCInst *MI, void WebAssemblyInstPrinter::printWebAssemblySignatureOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O) { - auto Imm = static_cast(MI->getOperand(OpNo).getImm()); - if (Imm != wasm::WASM_TYPE_NORESULT) - O << WebAssembly::anyTypeToString(Imm); + const MCOperand &Op = MI->getOperand(OpNo); + if (Op.isImm()) { + auto Imm = static_cast(Op.getImm()); + if (Imm != wasm::WASM_TYPE_NORESULT) + O << WebAssembly::anyTypeToString(Imm); + } else { + auto Expr = cast(Op.getExpr()); + auto *Sym = cast(&Expr->getSymbol()); + if (Sym->getSignature()) { + O << WebAssembly::signatureToString(Sym->getSignature()); + } else { + // Disassembler does not currently produce a signature + O << "unknown_type"; + } + } } // We have various enums representing a subset of these types, use this // function to convert any of them to text. 
-const char *llvm::WebAssembly::anyTypeToString(unsigned Ty) { +const char *WebAssembly::anyTypeToString(unsigned Ty) { switch (Ty) { case wasm::WASM_TYPE_I32: return "i32"; @@ -291,6 +316,24 @@ const char *llvm::WebAssembly::anyTypeToString(unsigned Ty) { } } -const char *llvm::WebAssembly::typeToString(wasm::ValType Ty) { +const char *WebAssembly::typeToString(wasm::ValType Ty) { return anyTypeToString(static_cast(Ty)); } + +std::string WebAssembly::typeListToString(ArrayRef List) { + std::string S; + for (auto &Ty : List) { + if (&Ty != &List[0]) S += ", "; + S += WebAssembly::typeToString(Ty); + } + return S; +} + +std::string WebAssembly::signatureToString(const wasm::WasmSignature *Sig) { + std::string S("("); + S += typeListToString(Sig->Params); + S += ") -> ("; + S += typeListToString(Sig->Returns); + S += ")"; + return S; +} diff --git a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.h b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.h index b979de5028bf..cf37778099a0 100644 --- a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.h +++ b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.h @@ -58,6 +58,9 @@ namespace WebAssembly { const char *typeToString(wasm::ValType Ty); const char *anyTypeToString(unsigned Ty); +std::string typeListToString(ArrayRef List); +std::string signatureToString(const wasm::WasmSignature *Sig); + } // end namespace WebAssembly } // end namespace llvm diff --git a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCCodeEmitter.cpp b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCCodeEmitter.cpp index 44b6d6a968a9..1a4c57e66d2f 100644 --- a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCCodeEmitter.cpp +++ b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCCodeEmitter.cpp @@ -152,6 +152,7 @@ void WebAssemblyMCCodeEmitter::encodeInstruction( break; case WebAssembly::OPERAND_FUNCTION32: case WebAssembly::OPERAND_OFFSET32: + case WebAssembly::OPERAND_SIGNATURE: case WebAssembly::OPERAND_TYPEINDEX: case WebAssembly::OPERAND_GLOBAL: case WebAssembly::OPERAND_EVENT: diff --git a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h index 7a9f59b1a4f2..b339860a381d 100644 --- a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h +++ b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h @@ -38,7 +38,7 @@ MCCodeEmitter *createWebAssemblyMCCodeEmitter(const MCInstrInfo &MCII); MCAsmBackend *createWebAssemblyAsmBackend(const Triple &TT); std::unique_ptr -createWebAssemblyWasmObjectWriter(bool Is64Bit); +createWebAssemblyWasmObjectWriter(bool Is64Bit, bool IsEmscripten); namespace WebAssembly { enum OperandType { @@ -122,16 +122,22 @@ enum TOF { namespace llvm { namespace WebAssembly { -/// This is used to indicate block signatures. -enum class ExprType : unsigned { +/// Used as immediate MachineOperands for block signatures +enum class BlockType : unsigned { + Invalid = 0x00, Void = 0x40, - I32 = 0x7F, - I64 = 0x7E, - F32 = 0x7D, - F64 = 0x7C, - V128 = 0x7B, - Exnref = 0x68, - Invalid = 0x00 + I32 = unsigned(wasm::ValType::I32), + I64 = unsigned(wasm::ValType::I64), + F32 = unsigned(wasm::ValType::F32), + F64 = unsigned(wasm::ValType::F64), + V128 = unsigned(wasm::ValType::V128), + Exnref = unsigned(wasm::ValType::EXNREF), + // Multivalue blocks (and other non-void blocks) are only emitted when the + // blocks will never be exited and are at the ends of functions (see + // WebAssemblyCFGStackify::fixEndsAtEndOfFunction). 
They also are never made + // to pop values off the stack, so the exact multivalue signature can always + // be inferred from the return type of the parent function in MCInstLower. + Multivalue = 0xffff, }; /// Instruction opcodes emitted via means other than CodeGen. @@ -191,6 +197,8 @@ inline unsigned GetDefaultP2AlignAny(unsigned Opc) { case WebAssembly::ATOMIC_RMW8_U_CMPXCHG_I32_S: case WebAssembly::ATOMIC_RMW8_U_CMPXCHG_I64: case WebAssembly::ATOMIC_RMW8_U_CMPXCHG_I64_S: + case WebAssembly::LOAD_SPLAT_v8x16: + case WebAssembly::LOAD_SPLAT_v8x16_S: return 0; case WebAssembly::LOAD16_S_I32: case WebAssembly::LOAD16_S_I32_S: @@ -240,6 +248,8 @@ inline unsigned GetDefaultP2AlignAny(unsigned Opc) { case WebAssembly::ATOMIC_RMW16_U_CMPXCHG_I32_S: case WebAssembly::ATOMIC_RMW16_U_CMPXCHG_I64: case WebAssembly::ATOMIC_RMW16_U_CMPXCHG_I64_S: + case WebAssembly::LOAD_SPLAT_v16x8: + case WebAssembly::LOAD_SPLAT_v16x8_S: return 1; case WebAssembly::LOAD_I32: case WebAssembly::LOAD_I32_S: @@ -295,6 +305,8 @@ inline unsigned GetDefaultP2AlignAny(unsigned Opc) { case WebAssembly::ATOMIC_NOTIFY_S: case WebAssembly::ATOMIC_WAIT_I32: case WebAssembly::ATOMIC_WAIT_I32_S: + case WebAssembly::LOAD_SPLAT_v32x4: + case WebAssembly::LOAD_SPLAT_v32x4_S: return 2; case WebAssembly::LOAD_I64: case WebAssembly::LOAD_I64_S: @@ -324,31 +336,25 @@ inline unsigned GetDefaultP2AlignAny(unsigned Opc) { case WebAssembly::ATOMIC_RMW_CMPXCHG_I64_S: case WebAssembly::ATOMIC_WAIT_I64: case WebAssembly::ATOMIC_WAIT_I64_S: + case WebAssembly::LOAD_SPLAT_v64x2: + case WebAssembly::LOAD_SPLAT_v64x2_S: + case WebAssembly::LOAD_EXTEND_S_v8i16: + case WebAssembly::LOAD_EXTEND_S_v8i16_S: + case WebAssembly::LOAD_EXTEND_U_v8i16: + case WebAssembly::LOAD_EXTEND_U_v8i16_S: + case WebAssembly::LOAD_EXTEND_S_v4i32: + case WebAssembly::LOAD_EXTEND_S_v4i32_S: + case WebAssembly::LOAD_EXTEND_U_v4i32: + case WebAssembly::LOAD_EXTEND_U_v4i32_S: + case WebAssembly::LOAD_EXTEND_S_v2i64: + case WebAssembly::LOAD_EXTEND_S_v2i64_S: + case WebAssembly::LOAD_EXTEND_U_v2i64: + case WebAssembly::LOAD_EXTEND_U_v2i64_S: return 3; - case WebAssembly::LOAD_v16i8: - case WebAssembly::LOAD_v16i8_S: - case WebAssembly::LOAD_v8i16: - case WebAssembly::LOAD_v8i16_S: - case WebAssembly::LOAD_v4i32: - case WebAssembly::LOAD_v4i32_S: - case WebAssembly::LOAD_v2i64: - case WebAssembly::LOAD_v2i64_S: - case WebAssembly::LOAD_v4f32: - case WebAssembly::LOAD_v4f32_S: - case WebAssembly::LOAD_v2f64: - case WebAssembly::LOAD_v2f64_S: - case WebAssembly::STORE_v16i8: - case WebAssembly::STORE_v16i8_S: - case WebAssembly::STORE_v8i16: - case WebAssembly::STORE_v8i16_S: - case WebAssembly::STORE_v4i32: - case WebAssembly::STORE_v4i32_S: - case WebAssembly::STORE_v2i64: - case WebAssembly::STORE_v2i64_S: - case WebAssembly::STORE_v4f32: - case WebAssembly::STORE_v4f32_S: - case WebAssembly::STORE_v2f64: - case WebAssembly::STORE_v2f64_S: + case WebAssembly::LOAD_V128: + case WebAssembly::LOAD_V128_S: + case WebAssembly::STORE_V128: + case WebAssembly::STORE_V128_S: return 4; default: return -1; diff --git a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.cpp b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.cpp index e05efef7201b..40926201931a 100644 --- a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.cpp +++ b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.cpp @@ -60,39 +60,10 @@ void WebAssemblyTargetAsmStreamer::emitLocal(ArrayRef Types) { void WebAssemblyTargetAsmStreamer::emitEndFunc() { OS << 
"\t.endfunc\n"; } -void WebAssemblyTargetAsmStreamer::emitSignature( - const wasm::WasmSignature *Sig) { - OS << "("; - emitParamList(Sig); - OS << ") -> ("; - emitReturnList(Sig); - OS << ")"; -} - -void WebAssemblyTargetAsmStreamer::emitParamList( - const wasm::WasmSignature *Sig) { - auto &Params = Sig->Params; - for (auto &Ty : Params) { - if (&Ty != &Params[0]) - OS << ", "; - OS << WebAssembly::typeToString(Ty); - } -} - -void WebAssemblyTargetAsmStreamer::emitReturnList( - const wasm::WasmSignature *Sig) { - auto &Returns = Sig->Returns; - for (auto &Ty : Returns) { - if (&Ty != &Returns[0]) - OS << ", "; - OS << WebAssembly::typeToString(Ty); - } -} - void WebAssemblyTargetAsmStreamer::emitFunctionType(const MCSymbolWasm *Sym) { assert(Sym->isFunction()); OS << "\t.functype\t" << Sym->getName() << " "; - emitSignature(Sym->getSignature()); + OS << WebAssembly::signatureToString(Sym->getSignature()); OS << "\n"; } @@ -107,7 +78,7 @@ void WebAssemblyTargetAsmStreamer::emitGlobalType(const MCSymbolWasm *Sym) { void WebAssemblyTargetAsmStreamer::emitEventType(const MCSymbolWasm *Sym) { assert(Sym->isEvent()); OS << "\t.eventtype\t" << Sym->getName() << " "; - emitParamList(Sym->getSignature()); + OS << WebAssembly::typeListToString(Sym->getSignature()->Params); OS << "\n"; } diff --git a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.h b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.h index 5ea62b179d22..0164f8e572ef 100644 --- a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.h +++ b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.h @@ -56,9 +56,6 @@ protected: /// This part is for ascii assembly output class WebAssemblyTargetAsmStreamer final : public WebAssemblyTargetStreamer { formatted_raw_ostream &OS; - void emitSignature(const wasm::WasmSignature *Sig); - void emitParamList(const wasm::WasmSignature *Sig); - void emitReturnList(const wasm::WasmSignature *Sig); public: WebAssemblyTargetAsmStreamer(MCStreamer &S, formatted_raw_ostream &OS); diff --git a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyWasmObjectWriter.cpp b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyWasmObjectWriter.cpp index a1cc3e268e8f..e7a599e3e175 100644 --- a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyWasmObjectWriter.cpp +++ b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyWasmObjectWriter.cpp @@ -31,7 +31,7 @@ using namespace llvm; namespace { class WebAssemblyWasmObjectWriter final : public MCWasmObjectTargetWriter { public: - explicit WebAssemblyWasmObjectWriter(bool Is64Bit); + explicit WebAssemblyWasmObjectWriter(bool Is64Bit, bool IsEmscripten); private: unsigned getRelocType(const MCValue &Target, @@ -39,8 +39,9 @@ private: }; } // end anonymous namespace -WebAssemblyWasmObjectWriter::WebAssemblyWasmObjectWriter(bool Is64Bit) - : MCWasmObjectTargetWriter(Is64Bit) {} +WebAssemblyWasmObjectWriter::WebAssemblyWasmObjectWriter(bool Is64Bit, + bool IsEmscripten) + : MCWasmObjectTargetWriter(Is64Bit, IsEmscripten) {} static const MCSection *getFixupSection(const MCExpr *Expr) { if (auto SyExp = dyn_cast(Expr)) { @@ -116,6 +117,6 @@ unsigned WebAssemblyWasmObjectWriter::getRelocType(const MCValue &Target, } std::unique_ptr -llvm::createWebAssemblyWasmObjectWriter(bool Is64Bit) { - return llvm::make_unique(Is64Bit); +llvm::createWebAssemblyWasmObjectWriter(bool Is64Bit, bool IsEmscripten) { + return std::make_unique(Is64Bit, IsEmscripten); } diff --git a/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp 
b/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp index 7f9d41da3978..5d8b873ce23b 100644 --- a/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp +++ b/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp @@ -67,8 +67,8 @@ MVT WebAssemblyAsmPrinter::getRegType(unsigned RegNo) const { } std::string WebAssemblyAsmPrinter::regToString(const MachineOperand &MO) { - unsigned RegNo = MO.getReg(); - assert(TargetRegisterInfo::isVirtualRegister(RegNo) && + Register RegNo = MO.getReg(); + assert(Register::isVirtualRegister(RegNo) && "Unlowered physical register encountered during assembly printing"); assert(!MFI->isVRegStackified(RegNo)); unsigned WAReg = MFI->getWAReg(RegNo); @@ -332,43 +332,15 @@ void WebAssemblyAsmPrinter::EmitInstruction(const MachineInstr *MI) { // These represent values which are live into the function entry, so there's // no instruction to emit. break; - case WebAssembly::FALLTHROUGH_RETURN_I32: - case WebAssembly::FALLTHROUGH_RETURN_I32_S: - case WebAssembly::FALLTHROUGH_RETURN_I64: - case WebAssembly::FALLTHROUGH_RETURN_I64_S: - case WebAssembly::FALLTHROUGH_RETURN_F32: - case WebAssembly::FALLTHROUGH_RETURN_F32_S: - case WebAssembly::FALLTHROUGH_RETURN_F64: - case WebAssembly::FALLTHROUGH_RETURN_F64_S: - case WebAssembly::FALLTHROUGH_RETURN_v16i8: - case WebAssembly::FALLTHROUGH_RETURN_v16i8_S: - case WebAssembly::FALLTHROUGH_RETURN_v8i16: - case WebAssembly::FALLTHROUGH_RETURN_v8i16_S: - case WebAssembly::FALLTHROUGH_RETURN_v4i32: - case WebAssembly::FALLTHROUGH_RETURN_v4i32_S: - case WebAssembly::FALLTHROUGH_RETURN_v2i64: - case WebAssembly::FALLTHROUGH_RETURN_v2i64_S: - case WebAssembly::FALLTHROUGH_RETURN_v4f32: - case WebAssembly::FALLTHROUGH_RETURN_v4f32_S: - case WebAssembly::FALLTHROUGH_RETURN_v2f64: - case WebAssembly::FALLTHROUGH_RETURN_v2f64_S: { + case WebAssembly::FALLTHROUGH_RETURN: { // These instructions represent the implicit return at the end of a - // function body. Always pops one value off the stack. + // function body. if (isVerbose()) { - OutStreamer->AddComment("fallthrough-return-value"); + OutStreamer->AddComment("fallthrough-return"); OutStreamer->AddBlankLine(); } break; } - case WebAssembly::FALLTHROUGH_RETURN_VOID: - case WebAssembly::FALLTHROUGH_RETURN_VOID_S: - // This instruction represents the implicit return at the end of a - // function body with no return value. - if (isVerbose()) { - OutStreamer->AddComment("fallthrough-return-void"); - OutStreamer->AddBlankLine(); - } - break; case WebAssembly::COMPILER_FENCE: // This is a compiler barrier that prevents instruction reordering during // backend compilation, and should not be emitted. 
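The WebAssemblyInstPrinter hunk above drops the streamer-private emitSignature/emitParamList/emitReturnList helpers in favour of the shared WebAssembly::signatureToString and typeListToString utilities, so .functype directives, .eventtype parameter lists, and the printed call_indirect type-index operand all render a function type the same way. A minimal usage sketch follows, assuming it is compiled inside lib/Target/WebAssembly; the printExampleSignature driver is hypothetical and not part of the diff.

// Sketch: the "(params) -> (returns)" text produced by the new
// WebAssembly::signatureToString helper.
#include "MCTargetDesc/WebAssemblyInstPrinter.h"
#include "llvm/BinaryFormat/Wasm.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

void printExampleSignature() {
  wasm::WasmSignature Sig;
  Sig.Params.push_back(wasm::ValType::I32);
  Sig.Params.push_back(wasm::ValType::I64);
  Sig.Returns.push_back(wasm::ValType::F32);
  // Prints "(i32, i64) -> (f32)", the same form the asm parser's
  // parseSignature accepts back for call_indirect's TYPEINDEX operand.
  outs() << WebAssembly::signatureToString(&Sig) << "\n";
}

Sharing one formatter is what lets the assembler round-trip the printed signature into an anonymous type-index symbol, as the parser changes earlier in this patch describe.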
diff --git a/lib/Target/WebAssembly/WebAssemblyCFGSort.cpp b/lib/Target/WebAssembly/WebAssemblyCFGSort.cpp index 4c5d0192fc28..c069af9eed62 100644 --- a/lib/Target/WebAssembly/WebAssemblyCFGSort.cpp +++ b/lib/Target/WebAssembly/WebAssemblyCFGSort.cpp @@ -97,14 +97,14 @@ public: // If the smallest region containing MBB is a loop if (LoopMap.count(ML)) return LoopMap[ML].get(); - LoopMap[ML] = llvm::make_unique>(ML); + LoopMap[ML] = std::make_unique>(ML); return LoopMap[ML].get(); } else { // If the smallest region containing MBB is an exception if (ExceptionMap.count(WE)) return ExceptionMap[WE].get(); ExceptionMap[WE] = - llvm::make_unique>(WE); + std::make_unique>(WE); return ExceptionMap[WE].get(); } } @@ -317,6 +317,7 @@ static void sortBlocks(MachineFunction &MF, const MachineLoopInfo &MLI, // If Next was originally ordered before MBB, and it isn't because it was // loop-rotated above the header, it's not preferred. if (Next->getNumber() < MBB->getNumber() && + (WasmDisableEHPadSort || !Next->isEHPad()) && (!R || !R->contains(Next) || R->getHeader()->getNumber() < Next->getNumber())) { Ready.push(Next); diff --git a/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp b/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp index e6bfc5226e2e..7e867edaaa27 100644 --- a/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp +++ b/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp @@ -29,6 +29,7 @@ #include "llvm/ADT/Statistic.h" #include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/MC/MCAsmInfo.h" using namespace llvm; @@ -315,12 +316,12 @@ void WebAssemblyCFGStackify::placeBlockMarker(MachineBasicBlock &MBB) { // br_on_exn 0, $__cpp_exception // rethrow // end_block - WebAssembly::ExprType ReturnType = WebAssembly::ExprType::Void; + WebAssembly::BlockType ReturnType = WebAssembly::BlockType::Void; if (IsBrOnExn) { const char *TagName = BrOnExn->getOperand(1).getSymbolName(); if (std::strcmp(TagName, "__cpp_exception") != 0) llvm_unreachable("Only C++ exception is supported"); - ReturnType = WebAssembly::ExprType::I32; + ReturnType = WebAssembly::BlockType::I32; } auto InsertPos = getLatestInsertPos(Header, BeforeSet, AfterSet); @@ -406,7 +407,7 @@ void WebAssemblyCFGStackify::placeLoopMarker(MachineBasicBlock &MBB) { auto InsertPos = getEarliestInsertPos(&MBB, BeforeSet, AfterSet); MachineInstr *Begin = BuildMI(MBB, InsertPos, MBB.findDebugLoc(InsertPos), TII.get(WebAssembly::LOOP)) - .addImm(int64_t(WebAssembly::ExprType::Void)); + .addImm(int64_t(WebAssembly::BlockType::Void)); // Decide where in Header to put the END_LOOP. BeforeSet.clear(); @@ -526,46 +527,56 @@ void WebAssemblyCFGStackify::placeTryMarker(MachineBasicBlock &MBB) { AfterSet.insert(&MI); } - // Local expression tree should go after the TRY. - for (auto I = Header->getFirstTerminator(), E = Header->begin(); I != E; - --I) { - if (std::prev(I)->isDebugInstr() || std::prev(I)->isPosition()) - continue; - if (WebAssembly::isChild(*std::prev(I), MFI)) - AfterSet.insert(&*std::prev(I)); - else - break; - } - // If Header unwinds to MBB (= Header contains 'invoke'), the try block should // contain the call within it. So the call should go after the TRY. The // exception is when the header's terminator is a rethrow instruction, in // which case that instruction, not a call instruction before it, is gonna // throw. 
+ MachineInstr *ThrowingCall = nullptr; if (MBB.isPredecessor(Header)) { auto TermPos = Header->getFirstTerminator(); if (TermPos == Header->end() || TermPos->getOpcode() != WebAssembly::RETHROW) { - for (const auto &MI : reverse(*Header)) { + for (auto &MI : reverse(*Header)) { if (MI.isCall()) { AfterSet.insert(&MI); + ThrowingCall = &MI; // Possibly throwing calls are usually wrapped by EH_LABEL // instructions. We don't want to split them and the call. if (MI.getIterator() != Header->begin() && - std::prev(MI.getIterator())->isEHLabel()) + std::prev(MI.getIterator())->isEHLabel()) { AfterSet.insert(&*std::prev(MI.getIterator())); + ThrowingCall = &*std::prev(MI.getIterator()); + } break; } } } } + // Local expression tree should go after the TRY. + // For BLOCK placement, we start the search from the previous instruction of a + // BB's terminator, but in TRY's case, we should start from the previous + // instruction of a call that can throw, or a EH_LABEL that precedes the call, + // because the return values of the call's previous instructions can be + // stackified and consumed by the throwing call. + auto SearchStartPt = ThrowingCall ? MachineBasicBlock::iterator(ThrowingCall) + : Header->getFirstTerminator(); + for (auto I = SearchStartPt, E = Header->begin(); I != E; --I) { + if (std::prev(I)->isDebugInstr() || std::prev(I)->isPosition()) + continue; + if (WebAssembly::isChild(*std::prev(I), MFI)) + AfterSet.insert(&*std::prev(I)); + else + break; + } + // Add the TRY. auto InsertPos = getLatestInsertPos(Header, BeforeSet, AfterSet); MachineInstr *Begin = BuildMI(*Header, InsertPos, Header->findDebugLoc(InsertPos), TII.get(WebAssembly::TRY)) - .addImm(int64_t(WebAssembly::ExprType::Void)); + .addImm(int64_t(WebAssembly::BlockType::Void)); // Decide where in Header to put the END_TRY. BeforeSet.clear(); @@ -694,8 +705,26 @@ void WebAssemblyCFGStackify::removeUnnecessaryInstrs(MachineFunction &MF) { } } +// When MBB is split into MBB and Split, we should unstackify defs in MBB that +// have their uses in Split. +static void unstackifyVRegsUsedInSplitBB(MachineBasicBlock &MBB, + MachineBasicBlock &Split, + WebAssemblyFunctionInfo &MFI, + MachineRegisterInfo &MRI) { + for (auto &MI : Split) { + for (auto &MO : MI.explicit_uses()) { + if (!MO.isReg() || Register::isPhysicalRegister(MO.getReg())) + continue; + if (MachineInstr *Def = MRI.getUniqueVRegDef(MO.getReg())) + if (Def->getParent() == &MBB) + MFI.unstackifyVReg(MO.getReg()); + } + } +} + bool WebAssemblyCFGStackify::fixUnwindMismatches(MachineFunction &MF) { const auto &TII = *MF.getSubtarget().getInstrInfo(); + auto &MFI = *MF.getInfo(); MachineRegisterInfo &MRI = MF.getRegInfo(); // Linearizing the control flow by placing TRY / END_TRY markers can create @@ -830,7 +859,7 @@ bool WebAssemblyCFGStackify::fixUnwindMismatches(MachineFunction &MF) { SmallVector EHPadStack; // Range of intructions to be wrapped in a new nested try/catch using TryRange = std::pair; - // In original CFG, + // In original CFG, DenseMap> UnwindDestToTryRanges; // In new CFG, DenseMap> BrDestToTryRanges; @@ -936,7 +965,7 @@ bool WebAssemblyCFGStackify::fixUnwindMismatches(MachineFunction &MF) { // of the function with a local.get and a rethrow instruction. 
if (NeedAppendixBlock) { auto *AppendixBB = getAppendixBlock(MF); - unsigned ExnReg = MRI.createVirtualRegister(&WebAssembly::EXNREFRegClass); + Register ExnReg = MRI.createVirtualRegister(&WebAssembly::EXNREFRegClass); BuildMI(AppendixBB, DebugLoc(), TII.get(WebAssembly::RETHROW)) .addReg(ExnReg); // These instruction ranges should branch to this appendix BB. @@ -967,7 +996,7 @@ bool WebAssemblyCFGStackify::fixUnwindMismatches(MachineFunction &MF) { // ... // cont: for (auto &P : UnwindDestToTryRanges) { - NumUnwindMismatches++; + NumUnwindMismatches += P.second.size(); // This means the destination is the appendix BB, which was separately // handled above. @@ -1007,6 +1036,7 @@ bool WebAssemblyCFGStackify::fixUnwindMismatches(MachineFunction &MF) { BrDest->insert(BrDest->end(), EndTry->removeFromParent()); // Take out the handler body from EH pad to the new branch destination BB. BrDest->splice(BrDest->end(), EHPad, SplitPos, EHPad->end()); + unstackifyVRegsUsedInSplitBB(*EHPad, *BrDest, MFI, MRI); // Fix predecessor-successor relationship. BrDest->transferSuccessors(EHPad); EHPad->addSuccessor(BrDest); @@ -1100,7 +1130,7 @@ bool WebAssemblyCFGStackify::fixUnwindMismatches(MachineFunction &MF) { MachineInstr *NestedTry = BuildMI(*MBB, *RangeBegin, RangeBegin->getDebugLoc(), TII.get(WebAssembly::TRY)) - .addImm(int64_t(WebAssembly::ExprType::Void)); + .addImm(int64_t(WebAssembly::BlockType::Void)); // Create the nested EH pad and fill instructions in. MachineBasicBlock *NestedEHPad = MF.CreateMachineBasicBlock(); @@ -1122,6 +1152,7 @@ bool WebAssemblyCFGStackify::fixUnwindMismatches(MachineFunction &MF) { // new nested continuation BB. NestedCont->splice(NestedCont->end(), MBB, std::next(RangeEnd->getIterator()), MBB->end()); + unstackifyVRegsUsedInSplitBB(*MBB, *NestedCont, MFI, MRI); registerTryScope(NestedTry, NestedEndTry, NestedEHPad); // Fix predecessor-successor relationship. @@ -1197,54 +1228,32 @@ getDepth(const SmallVectorImpl &Stack, /// checks for such cases and fixes up the signatures. void WebAssemblyCFGStackify::fixEndsAtEndOfFunction(MachineFunction &MF) { const auto &MFI = *MF.getInfo(); - assert(MFI.getResults().size() <= 1); if (MFI.getResults().empty()) return; - WebAssembly::ExprType RetType; - switch (MFI.getResults().front().SimpleTy) { - case MVT::i32: - RetType = WebAssembly::ExprType::I32; - break; - case MVT::i64: - RetType = WebAssembly::ExprType::I64; - break; - case MVT::f32: - RetType = WebAssembly::ExprType::F32; - break; - case MVT::f64: - RetType = WebAssembly::ExprType::F64; - break; - case MVT::v16i8: - case MVT::v8i16: - case MVT::v4i32: - case MVT::v2i64: - case MVT::v4f32: - case MVT::v2f64: - RetType = WebAssembly::ExprType::V128; - break; - case MVT::exnref: - RetType = WebAssembly::ExprType::Exnref; - break; - default: - llvm_unreachable("unexpected return type"); - } + // MCInstLower will add the proper types to multivalue signatures based on the + // function return type + WebAssembly::BlockType RetType = + MFI.getResults().size() > 1 + ? 
WebAssembly::BlockType::Multivalue + : WebAssembly::BlockType( + WebAssembly::toValType(MFI.getResults().front())); for (MachineBasicBlock &MBB : reverse(MF)) { for (MachineInstr &MI : reverse(MBB)) { if (MI.isPosition() || MI.isDebugInstr()) continue; - if (MI.getOpcode() == WebAssembly::END_BLOCK) { - EndToBegin[&MI]->getOperand(0).setImm(int32_t(RetType)); - continue; - } - if (MI.getOpcode() == WebAssembly::END_LOOP) { + switch (MI.getOpcode()) { + case WebAssembly::END_BLOCK: + case WebAssembly::END_LOOP: + case WebAssembly::END_TRY: EndToBegin[&MI]->getOperand(0).setImm(int32_t(RetType)); continue; + default: + // Something other than an `end`. We're done. + return; } - // Something other than an `end`. We're done. - return; } } } @@ -1280,7 +1289,9 @@ void WebAssemblyCFGStackify::placeMarkers(MachineFunction &MF) { } } // Fix mismatches in unwind destinations induced by linearizing the code. - fixUnwindMismatches(MF); + if (MCAI->getExceptionHandlingType() == ExceptionHandling::Wasm && + MF.getFunction().hasPersonalityFn()) + fixUnwindMismatches(MF); } void WebAssemblyCFGStackify::rewriteDepthImmediates(MachineFunction &MF) { diff --git a/lib/Target/WebAssembly/WebAssemblyExplicitLocals.cpp b/lib/Target/WebAssembly/WebAssemblyExplicitLocals.cpp index dbd62179f055..ef75bb215317 100644 --- a/lib/Target/WebAssembly/WebAssemblyExplicitLocals.cpp +++ b/lib/Target/WebAssembly/WebAssemblyExplicitLocals.cpp @@ -168,7 +168,7 @@ static MVT typeForRegClass(const TargetRegisterClass *RC) { static MachineInstr *findStartOfTree(MachineOperand &MO, MachineRegisterInfo &MRI, WebAssemblyFunctionInfo &MFI) { - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); assert(MFI.isVRegStackified(Reg)); MachineInstr *Def = MRI.getVRegDef(Reg); @@ -207,7 +207,7 @@ bool WebAssemblyExplicitLocals::runOnMachineFunction(MachineFunction &MF) { MachineInstr &MI = *I++; if (!WebAssembly::isArgument(MI.getOpcode())) break; - unsigned Reg = MI.getOperand(0).getReg(); + Register Reg = MI.getOperand(0).getReg(); assert(!MFI.isVRegStackified(Reg)); Reg2Local[Reg] = static_cast(MI.getOperand(1).getImm()); MI.eraseFromParent(); @@ -221,7 +221,7 @@ bool WebAssemblyExplicitLocals::runOnMachineFunction(MachineFunction &MF) { // drops to their defs. BitVector UseEmpty(MRI.getNumVirtRegs()); for (unsigned I = 0, E = MRI.getNumVirtRegs(); I < E; ++I) - UseEmpty[I] = MRI.use_empty(TargetRegisterInfo::index2VirtReg(I)); + UseEmpty[I] = MRI.use_empty(Register::index2VirtReg(I)); // Visit each instruction in the function. for (MachineBasicBlock &MBB : MF) { @@ -238,13 +238,13 @@ bool WebAssemblyExplicitLocals::runOnMachineFunction(MachineFunction &MF) { if (WebAssembly::isTee(MI.getOpcode())) { assert(MFI.isVRegStackified(MI.getOperand(0).getReg())); assert(!MFI.isVRegStackified(MI.getOperand(1).getReg())); - unsigned OldReg = MI.getOperand(2).getReg(); + Register OldReg = MI.getOperand(2).getReg(); const TargetRegisterClass *RC = MRI.getRegClass(OldReg); // Stackify the input if it isn't stackified yet. if (!MFI.isVRegStackified(OldReg)) { unsigned LocalId = getLocalId(Reg2Local, CurLocal, OldReg); - unsigned NewReg = MRI.createVirtualRegister(RC); + Register NewReg = MRI.createVirtualRegister(RC); unsigned Opc = getLocalGetOpcode(RC); BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(Opc), NewReg) .addImm(LocalId); @@ -270,17 +270,17 @@ bool WebAssemblyExplicitLocals::runOnMachineFunction(MachineFunction &MF) { // we handle at most one def. 
assert(MI.getDesc().getNumDefs() <= 1); if (MI.getDesc().getNumDefs() == 1) { - unsigned OldReg = MI.getOperand(0).getReg(); + Register OldReg = MI.getOperand(0).getReg(); if (!MFI.isVRegStackified(OldReg)) { const TargetRegisterClass *RC = MRI.getRegClass(OldReg); - unsigned NewReg = MRI.createVirtualRegister(RC); + Register NewReg = MRI.createVirtualRegister(RC); auto InsertPt = std::next(MI.getIterator()); if (MI.getOpcode() == WebAssembly::IMPLICIT_DEF) { MI.eraseFromParent(); Changed = true; continue; } - if (UseEmpty[TargetRegisterInfo::virtReg2Index(OldReg)]) { + if (UseEmpty[Register::virtReg2Index(OldReg)]) { unsigned Opc = getDropOpcode(RC); MachineInstr *Drop = BuildMI(MBB, InsertPt, MI.getDebugLoc(), TII->get(Opc)) @@ -310,7 +310,7 @@ bool WebAssemblyExplicitLocals::runOnMachineFunction(MachineFunction &MF) { if (!MO.isReg()) continue; - unsigned OldReg = MO.getReg(); + Register OldReg = MO.getReg(); // Inline asm may have a def in the middle of the operands. Our contract // with inline asm register operands is to provide local indices as @@ -345,7 +345,7 @@ bool WebAssemblyExplicitLocals::runOnMachineFunction(MachineFunction &MF) { // Insert a local.get. unsigned LocalId = getLocalId(Reg2Local, CurLocal, OldReg); const TargetRegisterClass *RC = MRI.getRegClass(OldReg); - unsigned NewReg = MRI.createVirtualRegister(RC); + Register NewReg = MRI.createVirtualRegister(RC); unsigned Opc = getLocalGetOpcode(RC); InsertPt = BuildMI(MBB, InsertPt, MI.getDebugLoc(), TII->get(Opc), NewReg) @@ -369,7 +369,7 @@ bool WebAssemblyExplicitLocals::runOnMachineFunction(MachineFunction &MF) { // TODO: Sort the locals for better compression. MFI.setNumLocals(CurLocal - MFI.getParams().size()); for (unsigned I = 0, E = MRI.getNumVirtRegs(); I < E; ++I) { - unsigned Reg = TargetRegisterInfo::index2VirtReg(I); + unsigned Reg = Register::index2VirtReg(I); auto RL = Reg2Local.find(Reg); if (RL == Reg2Local.end() || RL->second < MFI.getParams().size()) continue; diff --git a/lib/Target/WebAssembly/WebAssemblyFastISel.cpp b/lib/Target/WebAssembly/WebAssemblyFastISel.cpp index 2552e9150833..c932f985489a 100644 --- a/lib/Target/WebAssembly/WebAssemblyFastISel.cpp +++ b/lib/Target/WebAssembly/WebAssemblyFastISel.cpp @@ -1141,14 +1141,14 @@ bool WebAssemblyFastISel::selectBitCast(const Instruction *I) { return true; } - unsigned Reg = fastEmit_ISD_BITCAST_r(VT.getSimpleVT(), RetVT.getSimpleVT(), + Register Reg = fastEmit_ISD_BITCAST_r(VT.getSimpleVT(), RetVT.getSimpleVT(), In, I->getOperand(0)->hasOneUse()); if (!Reg) return false; MachineBasicBlock::iterator Iter = FuncInfo.InsertPt; --Iter; assert(Iter->isBitcast()); - Iter->setPhysRegsDeadExcept(ArrayRef(), TRI); + Iter->setPhysRegsDeadExcept(ArrayRef(), TRI); updateValueMap(I, Reg); return true; } @@ -1302,51 +1302,33 @@ bool WebAssemblyFastISel::selectRet(const Instruction *I) { if (Ret->getNumOperands() == 0) { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, - TII.get(WebAssembly::RETURN_VOID)); + TII.get(WebAssembly::RETURN)); return true; } + // TODO: support multiple return in FastISel + if (Ret->getNumOperands() > 1) + return false; + Value *RV = Ret->getOperand(0); if (!Subtarget->hasSIMD128() && RV->getType()->isVectorTy()) return false; - unsigned Opc; switch (getSimpleType(RV->getType())) { case MVT::i1: case MVT::i8: case MVT::i16: case MVT::i32: - Opc = WebAssembly::RETURN_I32; - break; case MVT::i64: - Opc = WebAssembly::RETURN_I64; - break; case MVT::f32: - Opc = WebAssembly::RETURN_F32; - break; case MVT::f64: - Opc = 
WebAssembly::RETURN_F64; - break; case MVT::v16i8: - Opc = WebAssembly::RETURN_v16i8; - break; case MVT::v8i16: - Opc = WebAssembly::RETURN_v8i16; - break; case MVT::v4i32: - Opc = WebAssembly::RETURN_v4i32; - break; case MVT::v2i64: - Opc = WebAssembly::RETURN_v2i64; - break; case MVT::v4f32: - Opc = WebAssembly::RETURN_v4f32; - break; case MVT::v2f64: - Opc = WebAssembly::RETURN_v2f64; - break; case MVT::exnref: - Opc = WebAssembly::RETURN_EXNREF; break; default: return false; @@ -1363,7 +1345,9 @@ bool WebAssemblyFastISel::selectRet(const Instruction *I) { if (Reg == 0) return false; - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc)).addReg(Reg); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(WebAssembly::RETURN)) + .addReg(Reg); return true; } diff --git a/lib/Target/WebAssembly/WebAssemblyFixFunctionBitcasts.cpp b/lib/Target/WebAssembly/WebAssemblyFixFunctionBitcasts.cpp index b7fc65401fc4..6b1bbd7a2b07 100644 --- a/lib/Target/WebAssembly/WebAssemblyFixFunctionBitcasts.cpp +++ b/lib/Target/WebAssembly/WebAssemblyFixFunctionBitcasts.cpp @@ -70,6 +70,8 @@ static void findUses(Value *V, Function &F, for (Use &U : V->uses()) { if (auto *BC = dyn_cast(U.getUser())) findUses(BC, F, Uses, ConstantBCs); + else if (auto *A = dyn_cast(U.getUser())) + findUses(A, F, Uses, ConstantBCs); else if (U.get()->getType() != F.getType()) { CallSite CS(U.getUser()); if (!CS) diff --git a/lib/Target/WebAssembly/WebAssemblyFixIrreducibleControlFlow.cpp b/lib/Target/WebAssembly/WebAssemblyFixIrreducibleControlFlow.cpp index 7d8e86d9b2c0..157ea9d525c9 100644 --- a/lib/Target/WebAssembly/WebAssemblyFixIrreducibleControlFlow.cpp +++ b/lib/Target/WebAssembly/WebAssemblyFixIrreducibleControlFlow.cpp @@ -56,6 +56,7 @@ #include "WebAssembly.h" #include "WebAssemblySubtarget.h" #include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/Support/Debug.h" using namespace llvm; #define DEBUG_TYPE "wasm-fix-irreducible-control-flow" @@ -358,7 +359,7 @@ void WebAssemblyFixIrreducibleControlFlow::makeSingleEntryLoop( // Add the register which will be used to tell the jump table which block to // jump to. 
MachineRegisterInfo &MRI = MF.getRegInfo(); - unsigned Reg = MRI.createVirtualRegister(&WebAssembly::I32RegClass); + Register Reg = MRI.createVirtualRegister(&WebAssembly::I32RegClass); MIB.addReg(Reg); // Compute the indices in the superheader, one for each bad block, and diff --git a/lib/Target/WebAssembly/WebAssemblyFrameLowering.cpp b/lib/Target/WebAssembly/WebAssemblyFrameLowering.cpp index 5299068efdd4..71eeebfada4b 100644 --- a/lib/Target/WebAssembly/WebAssemblyFrameLowering.cpp +++ b/lib/Target/WebAssembly/WebAssemblyFrameLowering.cpp @@ -183,14 +183,14 @@ void WebAssemblyFrameLowering::emitPrologue(MachineFunction &MF, bool HasBP = hasBP(MF); if (HasBP) { auto FI = MF.getInfo(); - unsigned BasePtr = MRI.createVirtualRegister(PtrRC); + Register BasePtr = MRI.createVirtualRegister(PtrRC); FI->setBasePointerVreg(BasePtr); BuildMI(MBB, InsertPt, DL, TII->get(WebAssembly::COPY), BasePtr) .addReg(SPReg); } if (StackSize) { // Subtract the frame size - unsigned OffsetReg = MRI.createVirtualRegister(PtrRC); + Register OffsetReg = MRI.createVirtualRegister(PtrRC); BuildMI(MBB, InsertPt, DL, TII->get(WebAssembly::CONST_I32), OffsetReg) .addImm(StackSize); BuildMI(MBB, InsertPt, DL, TII->get(WebAssembly::SUB_I32), @@ -199,7 +199,7 @@ void WebAssemblyFrameLowering::emitPrologue(MachineFunction &MF, .addReg(OffsetReg); } if (HasBP) { - unsigned BitmaskReg = MRI.createVirtualRegister(PtrRC); + Register BitmaskReg = MRI.createVirtualRegister(PtrRC); unsigned Alignment = MFI.getMaxAlignment(); assert((1u << countTrailingZeros(Alignment)) == Alignment && "Alignment must be a power of 2"); @@ -244,7 +244,7 @@ void WebAssemblyFrameLowering::emitEpilogue(MachineFunction &MF, } else if (StackSize) { const TargetRegisterClass *PtrRC = MRI.getTargetRegisterInfo()->getPointerRegClass(MF); - unsigned OffsetReg = MRI.createVirtualRegister(PtrRC); + Register OffsetReg = MRI.createVirtualRegister(PtrRC); BuildMI(MBB, InsertPt, DL, TII->get(WebAssembly::CONST_I32), OffsetReg) .addImm(StackSize); // In the epilog we don't need to write the result back to the SP32 physreg diff --git a/lib/Target/WebAssembly/WebAssemblyFrameLowering.h b/lib/Target/WebAssembly/WebAssemblyFrameLowering.h index daddd4ca16ff..fdc0f561dcd9 100644 --- a/lib/Target/WebAssembly/WebAssemblyFrameLowering.h +++ b/lib/Target/WebAssembly/WebAssemblyFrameLowering.h @@ -29,9 +29,9 @@ public: static const size_t RedZoneSize = 128; WebAssemblyFrameLowering() - : TargetFrameLowering(StackGrowsDown, /*StackAlignment=*/16, + : TargetFrameLowering(StackGrowsDown, /*StackAlignment=*/Align(16), /*LocalAreaOffset=*/0, - /*TransientStackAlignment=*/16, + /*TransientStackAlignment=*/Align(16), /*StackRealignable=*/true) {} MachineBasicBlock::iterator diff --git a/lib/Target/WebAssembly/WebAssemblyISD.def b/lib/Target/WebAssembly/WebAssemblyISD.def index 77217f16a727..13f0476eb4a5 100644 --- a/lib/Target/WebAssembly/WebAssemblyISD.def +++ b/lib/Target/WebAssembly/WebAssemblyISD.def @@ -26,9 +26,11 @@ HANDLE_NODETYPE(WrapperPIC) HANDLE_NODETYPE(BR_IF) HANDLE_NODETYPE(BR_TABLE) HANDLE_NODETYPE(SHUFFLE) +HANDLE_NODETYPE(SWIZZLE) HANDLE_NODETYPE(VEC_SHL) HANDLE_NODETYPE(VEC_SHR_S) HANDLE_NODETYPE(VEC_SHR_U) +HANDLE_NODETYPE(LOAD_SPLAT) HANDLE_NODETYPE(THROW) HANDLE_NODETYPE(MEMORY_COPY) HANDLE_NODETYPE(MEMORY_FILL) diff --git a/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp b/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp index 26339eaef37d..f83a8a984ae0 100644 --- a/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp +++ 
b/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp @@ -54,6 +54,12 @@ public: ForCodeSize = MF.getFunction().hasOptSize(); Subtarget = &MF.getSubtarget(); + + // Wasm64 is not fully supported right now (and is not specified) + if (Subtarget->hasAddr64()) + report_fatal_error( + "64-bit WebAssembly (wasm64) is not currently supported"); + return SelectionDAGISel::runOnMachineFunction(MF); } @@ -88,88 +94,36 @@ void WebAssemblyDAGToDAGISel::Select(SDNode *Node) { uint64_t SyncScopeID = cast(Node->getOperand(2).getNode())->getZExtValue(); + MachineSDNode *Fence = nullptr; switch (SyncScopeID) { - case SyncScope::SingleThread: { + case SyncScope::SingleThread: // We lower a single-thread fence to a pseudo compiler barrier instruction // preventing instruction reordering. This will not be emitted in final // binary. - MachineSDNode *Fence = - CurDAG->getMachineNode(WebAssembly::COMPILER_FENCE, - DL, // debug loc - MVT::Other, // outchain type - Node->getOperand(0) // inchain - ); - ReplaceNode(Node, Fence); - CurDAG->RemoveDeadNode(Node); - return; - } - - case SyncScope::System: { - // For non-emscripten systems, we have not decided on what we should - // traslate fences to yet. - if (!Subtarget->getTargetTriple().isOSEmscripten()) - report_fatal_error( - "ATOMIC_FENCE is not yet supported in non-emscripten OSes"); - - // Wasm does not have a fence instruction, but because all atomic - // instructions in wasm are sequentially consistent, we translate a - // fence to an idempotent atomic RMW instruction to a linear memory - // address. All atomic instructions in wasm are sequentially consistent, - // but this is to ensure a fence also prevents reordering of non-atomic - // instructions in the VM. Even though LLVM IR's fence instruction does - // not say anything about its relationship with non-atomic instructions, - // we think this is more user-friendly. - // - // While any address can work, here we use a value stored in - // __stack_pointer wasm global because there's high chance that area is - // in cache. - // - // So the selected instructions will be in the form of: - // %addr = get_global $__stack_pointer - // %0 = i32.const 0 - // i32.atomic.rmw.or %addr, %0 - SDValue StackPtrSym = CurDAG->getTargetExternalSymbol( - "__stack_pointer", TLI->getPointerTy(CurDAG->getDataLayout())); - MachineSDNode *GetGlobal = - CurDAG->getMachineNode(WebAssembly::GLOBAL_GET_I32, // opcode - DL, // debug loc - MVT::i32, // result type - StackPtrSym // __stack_pointer symbol - ); - - SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32); - auto *MMO = MF.getMachineMemOperand( - MachinePointerInfo::getUnknownStack(MF), - // FIXME Volatile isn't really correct, but currently all LLVM - // atomic instructions are treated as volatiles in the backend, so - // we should be consistent. 
- MachineMemOperand::MOVolatile | MachineMemOperand::MOLoad | - MachineMemOperand::MOStore, - 4, 4, AAMDNodes(), nullptr, SyncScope::System, - AtomicOrdering::SequentiallyConsistent); - MachineSDNode *Const0 = - CurDAG->getMachineNode(WebAssembly::CONST_I32, DL, MVT::i32, Zero); - MachineSDNode *AtomicRMW = CurDAG->getMachineNode( - WebAssembly::ATOMIC_RMW_OR_I32, // opcode - DL, // debug loc - MVT::i32, // result type - MVT::Other, // outchain type - { - Zero, // alignment - Zero, // offset - SDValue(GetGlobal, 0), // __stack_pointer - SDValue(Const0, 0), // OR with 0 to make it idempotent - Node->getOperand(0) // inchain - }); - - CurDAG->setNodeMemRefs(AtomicRMW, {MMO}); - ReplaceUses(SDValue(Node, 0), SDValue(AtomicRMW, 1)); - CurDAG->RemoveDeadNode(Node); - return; - } + Fence = CurDAG->getMachineNode(WebAssembly::COMPILER_FENCE, + DL, // debug loc + MVT::Other, // outchain type + Node->getOperand(0) // inchain + ); + break; + case SyncScope::System: + // Currently wasm only supports sequentially consistent atomics, so we + // always set the order to 0 (sequentially consistent). + Fence = CurDAG->getMachineNode( + WebAssembly::ATOMIC_FENCE, + DL, // debug loc + MVT::Other, // outchain type + CurDAG->getTargetConstant(0, DL, MVT::i32), // order + Node->getOperand(0) // inchain + ); + break; default: llvm_unreachable("Unknown scope!"); } + + ReplaceNode(Node, Fence); + CurDAG->RemoveDeadNode(Node); + return; } case ISD::GlobalTLSAddress: { @@ -224,6 +178,33 @@ void WebAssemblyDAGToDAGISel::Select(SDNode *Node) { ReplaceNode(Node, TLSSize); return; } + case Intrinsic::wasm_tls_align: { + MVT PtrVT = TLI->getPointerTy(CurDAG->getDataLayout()); + assert(PtrVT == MVT::i32 && "only wasm32 is supported for now"); + + MachineSDNode *TLSAlign = CurDAG->getMachineNode( + WebAssembly::GLOBAL_GET_I32, DL, PtrVT, + CurDAG->getTargetExternalSymbol("__tls_align", MVT::i32)); + ReplaceNode(Node, TLSAlign); + return; + } + } + break; + } + case ISD::INTRINSIC_W_CHAIN: { + unsigned IntNo = cast(Node->getOperand(1))->getZExtValue(); + switch (IntNo) { + case Intrinsic::wasm_tls_base: { + MVT PtrVT = TLI->getPointerTy(CurDAG->getDataLayout()); + assert(PtrVT == MVT::i32 && "only wasm32 is supported for now"); + + MachineSDNode *TLSBase = CurDAG->getMachineNode( + WebAssembly::GLOBAL_GET_I32, DL, MVT::i32, MVT::Other, + CurDAG->getTargetExternalSymbol("__tls_base", PtrVT), + Node->getOperand(0)); + ReplaceNode(Node, TLSBase); + return; + } } break; } diff --git a/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp index 4064a983099c..f06afdbcea9e 100644 --- a/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp +++ b/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp @@ -205,7 +205,7 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering( for (auto T : {MVT::i8, MVT::i16, MVT::i32}) setOperationAction(ISD::SIGN_EXTEND_INREG, T, Action); } - for (auto T : MVT::integer_vector_valuetypes()) + for (auto T : MVT::integer_fixedlen_vector_valuetypes()) setOperationAction(ISD::SIGN_EXTEND_INREG, T, Expand); // Dynamic stack allocation: use the default expansion. @@ -228,7 +228,7 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering( // - Floating-point extending loads. // - Floating-point truncating stores. // - i1 extending loads. 
- // - extending/truncating SIMD loads/stores + // - truncating SIMD stores and most extending loads setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand); setTruncStoreAction(MVT::f64, MVT::f32, Expand); for (auto T : MVT::integer_valuetypes()) @@ -237,7 +237,7 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering( if (Subtarget->hasSIMD128()) { for (auto T : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64, MVT::v4f32, MVT::v2f64}) { - for (auto MemT : MVT::vector_valuetypes()) { + for (auto MemT : MVT::fixedlen_vector_valuetypes()) { if (MVT(T) != MemT) { setTruncStoreAction(T, MemT, Expand); for (auto Ext : {ISD::EXTLOAD, ISD::ZEXTLOAD, ISD::SEXTLOAD}) @@ -245,6 +245,14 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering( } } } + // But some vector extending loads are legal + if (Subtarget->hasUnimplementedSIMD128()) { + for (auto Ext : {ISD::EXTLOAD, ISD::SEXTLOAD, ISD::ZEXTLOAD}) { + setLoadExtAction(Ext, MVT::v8i16, MVT::v8i8, Legal); + setLoadExtAction(Ext, MVT::v4i32, MVT::v4i16, Legal); + setLoadExtAction(Ext, MVT::v2i64, MVT::v2i32, Legal); + } + } } // Don't do anything clever with build_pairs @@ -259,16 +267,6 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering( setMaxAtomicSizeInBitsSupported(64); - if (Subtarget->hasBulkMemory()) { - // Use memory.copy and friends over multiple loads and stores - MaxStoresPerMemcpy = 1; - MaxStoresPerMemcpyOptSize = 1; - MaxStoresPerMemmove = 1; - MaxStoresPerMemmoveOptSize = 1; - MaxStoresPerMemset = 1; - MaxStoresPerMemsetOptSize = 1; - } - // Override the __gnu_f2h_ieee/__gnu_h2f_ieee names so that the f32 name is // consistent with the f64 and f128 names. setLibcallName(RTLIB::FPEXT_F16_F32, "__extendhfsf2"); @@ -337,8 +335,8 @@ static MachineBasicBlock *LowerFPToInt(MachineInstr &MI, DebugLoc DL, bool Float64, unsigned LoweredOpcode) { MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); - unsigned OutReg = MI.getOperand(0).getReg(); - unsigned InReg = MI.getOperand(1).getReg(); + Register OutReg = MI.getOperand(0).getReg(); + Register InReg = MI.getOperand(1).getReg(); unsigned Abs = Float64 ? WebAssembly::ABS_F64 : WebAssembly::ABS_F32; unsigned FConst = Float64 ? WebAssembly::CONST_F64 : WebAssembly::CONST_F32; @@ -396,9 +394,9 @@ static MachineBasicBlock *LowerFPToInt(MachineInstr &MI, DebugLoc DL, // For unsigned numbers, we have to do a separate comparison with zero. 
if (IsUnsigned) { Tmp1 = MRI.createVirtualRegister(MRI.getRegClass(InReg)); - unsigned SecondCmpReg = + Register SecondCmpReg = MRI.createVirtualRegister(&WebAssembly::I32RegClass); - unsigned AndReg = MRI.createVirtualRegister(&WebAssembly::I32RegClass); + Register AndReg = MRI.createVirtualRegister(&WebAssembly::I32RegClass); BuildMI(BB, DL, TII.get(FConst), Tmp1) .addFPImm(cast(ConstantFP::get(Ty, 0.0))); BuildMI(BB, DL, TII.get(GE), SecondCmpReg).addReg(Tmp0).addReg(Tmp1); @@ -550,6 +548,16 @@ bool WebAssemblyTargetLowering::isIntDivCheap(EVT VT, return true; } +bool WebAssemblyTargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const { + if (!Subtarget->hasUnimplementedSIMD128()) + return false; + MVT ExtT = ExtVal.getSimpleValueType(); + MVT MemT = cast(ExtVal->getOperand(0))->getSimpleValueType(0); + return (ExtT == MVT::v8i16 && MemT == MVT::v8i8) || + (ExtT == MVT::v4i32 && MemT == MVT::v4i16) || + (ExtT == MVT::v2i64 && MemT == MVT::v2i32); +} + EVT WebAssemblyTargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &C, EVT VT) const { @@ -569,7 +577,7 @@ bool WebAssemblyTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.memVT = MVT::i32; Info.ptrVal = I.getArgOperand(0); Info.offset = 0; - Info.align = 4; + Info.align = Align(4); // atomic.notify instruction does not really load the memory specified with // this argument, but MachineMemOperand should either be load or store, so // we set this to a load. @@ -583,7 +591,7 @@ bool WebAssemblyTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.memVT = MVT::i32; Info.ptrVal = I.getArgOperand(0); Info.offset = 0; - Info.align = 4; + Info.align = Align(4); Info.flags = MachineMemOperand::MOVolatile | MachineMemOperand::MOLoad; return true; case Intrinsic::wasm_atomic_wait_i64: @@ -591,7 +599,7 @@ bool WebAssemblyTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.memVT = MVT::i64; Info.ptrVal = I.getArgOperand(0); Info.offset = 0; - Info.align = 8; + Info.align = Align(8); Info.flags = MachineMemOperand::MOVolatile | MachineMemOperand::MOLoad; return true; default: @@ -623,7 +631,8 @@ static bool callingConvSupported(CallingConv::ID CallConv) { CallConv == CallingConv::Cold || CallConv == CallingConv::PreserveMost || CallConv == CallingConv::PreserveAll || - CallConv == CallingConv::CXX_FAST_TLS; + CallConv == CallingConv::CXX_FAST_TLS || + CallConv == CallingConv::WASM_EmscriptenInvoke; } SDValue @@ -644,13 +653,36 @@ WebAssemblyTargetLowering::LowerCall(CallLoweringInfo &CLI, if (CLI.IsPatchPoint) fail(DL, DAG, "WebAssembly doesn't support patch point yet"); - // Fail if tail calls are required but not enabled - if (!Subtarget->hasTailCall()) { - if ((CallConv == CallingConv::Fast && CLI.IsTailCall && - MF.getTarget().Options.GuaranteedTailCallOpt) || - (CLI.CS && CLI.CS.isMustTailCall())) - fail(DL, DAG, "WebAssembly 'tail-call' feature not enabled"); - CLI.IsTailCall = false; + if (CLI.IsTailCall) { + bool MustTail = CLI.CS && CLI.CS.isMustTailCall(); + if (Subtarget->hasTailCall() && !CLI.IsVarArg) { + // Do not tail call unless caller and callee return types match + const Function &F = MF.getFunction(); + const TargetMachine &TM = getTargetMachine(); + Type *RetTy = F.getReturnType(); + SmallVector CallerRetTys; + SmallVector CalleeRetTys; + computeLegalValueVTs(F, TM, RetTy, CallerRetTys); + computeLegalValueVTs(F, TM, CLI.RetTy, CalleeRetTys); + bool TypesMatch = CallerRetTys.size() == CalleeRetTys.size() && + std::equal(CallerRetTys.begin(), CallerRetTys.end(), + 
CalleeRetTys.begin()); + if (!TypesMatch) { + // musttail in this case would be an LLVM IR validation failure + assert(!MustTail); + CLI.IsTailCall = false; + } + } else { + CLI.IsTailCall = false; + if (MustTail) { + if (CLI.IsVarArg) { + // The return would pop the argument buffer + fail(DL, DAG, "WebAssembly does not support varargs tail calls"); + } else { + fail(DL, DAG, "WebAssembly 'tail-call' feature not enabled"); + } + } + } } SmallVectorImpl &Ins = CLI.Ins; @@ -659,6 +691,16 @@ WebAssemblyTargetLowering::LowerCall(CallLoweringInfo &CLI, SmallVectorImpl &Outs = CLI.Outs; SmallVectorImpl &OutVals = CLI.OutVals; + + // The generic code may have added an sret argument. If we're lowering an + // invoke function, the ABI requires that the function pointer be the first + // argument, so we may have to swap the arguments. + if (CallConv == CallingConv::WASM_EmscriptenInvoke && Outs.size() >= 2 && + Outs[0].Flags.isSRet()) { + std::swap(Outs[0], Outs[1]); + std::swap(OutVals[0], OutVals[1]); + } + unsigned NumFixedArgs = 0; for (unsigned I = 0; I < Outs.size(); ++I) { const ISD::OutputArg &Out = Outs[I]; @@ -810,8 +852,8 @@ bool WebAssemblyTargetLowering::CanLowerReturn( CallingConv::ID /*CallConv*/, MachineFunction & /*MF*/, bool /*IsVarArg*/, const SmallVectorImpl &Outs, LLVMContext & /*Context*/) const { - // WebAssembly can't currently handle returning tuples. - return Outs.size() <= 1; + // WebAssembly can only handle returning tuples with multivalue enabled + return Subtarget->hasMultivalue() || Outs.size() <= 1; } SDValue WebAssemblyTargetLowering::LowerReturn( @@ -819,7 +861,8 @@ SDValue WebAssemblyTargetLowering::LowerReturn( const SmallVectorImpl &Outs, const SmallVectorImpl &OutVals, const SDLoc &DL, SelectionDAG &DAG) const { - assert(Outs.size() <= 1 && "WebAssembly can only return up to one value"); + assert((Subtarget->hasMultivalue() || Outs.size() <= 1) && + "MVP WebAssembly can only return up to one value"); if (!callingConvSupported(CallConv)) fail(DL, DAG, "WebAssembly doesn't support non-C calling conventions"); @@ -881,7 +924,7 @@ SDValue WebAssemblyTargetLowering::LowerFormalArguments( // the buffer is passed as an argument. 
if (IsVarArg) { MVT PtrVT = getPointerTy(MF.getDataLayout()); - unsigned VarargVreg = + Register VarargVreg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrVT)); MFI->setVarargBufferVreg(VarargVreg); Chain = DAG.getCopyToReg( @@ -1022,8 +1065,9 @@ SDValue WebAssemblyTargetLowering::LowerRETURNADDR(SDValue Op, return SDValue(); unsigned Depth = cast(Op.getOperand(0))->getZExtValue(); + MakeLibCallOptions CallOptions; return makeLibCall(DAG, RTLIB::RETURN_ADDRESS, Op.getValueType(), - {DAG.getConstant(Depth, DL, MVT::i32)}, false, DL) + {DAG.getConstant(Depth, DL, MVT::i32)}, CallOptions, DL) .first; } @@ -1037,7 +1081,7 @@ SDValue WebAssemblyTargetLowering::LowerFRAMEADDR(SDValue Op, DAG.getMachineFunction().getFrameInfo().setFrameAddressIsTaken(true); EVT VT = Op.getValueType(); - unsigned FP = + Register FP = Subtarget->getRegisterInfo()->getFrameRegister(DAG.getMachineFunction()); return DAG.getCopyFromReg(DAG.getEntryNode(), SDLoc(Op), FP, VT); } @@ -1249,68 +1293,116 @@ SDValue WebAssemblyTargetLowering::LowerBUILD_VECTOR(SDValue Op, const EVT VecT = Op.getValueType(); const EVT LaneT = Op.getOperand(0).getValueType(); const size_t Lanes = Op.getNumOperands(); + bool CanSwizzle = Subtarget->hasUnimplementedSIMD128() && VecT == MVT::v16i8; + + // BUILD_VECTORs are lowered to the instruction that initializes the highest + // possible number of lanes at once followed by a sequence of replace_lane + // instructions to individually initialize any remaining lanes. + + // TODO: Tune this. For example, lanewise swizzling is very expensive, so + // swizzled lanes should be given greater weight. + + // TODO: Investigate building vectors by shuffling together vectors built by + // separately specialized means. + auto IsConstant = [](const SDValue &V) { return V.getOpcode() == ISD::Constant || V.getOpcode() == ISD::ConstantFP; }; - // Find the most common operand, which is approximately the best to splat - using Entry = std::pair; - SmallVector ValueCounts; - size_t NumConst = 0, NumDynamic = 0; - for (const SDValue &Lane : Op->op_values()) { - if (Lane.isUndef()) { - continue; - } else if (IsConstant(Lane)) { - NumConst++; - } else { - NumDynamic++; - } - auto CountIt = std::find_if(ValueCounts.begin(), ValueCounts.end(), - [&Lane](Entry A) { return A.first == Lane; }); - if (CountIt == ValueCounts.end()) { - ValueCounts.emplace_back(Lane, 1); + // Returns the source vector and index vector pair if they exist. 
Checks for: + // (extract_vector_elt + // $src, + // (sign_extend_inreg (extract_vector_elt $indices, $i)) + // ) + auto GetSwizzleSrcs = [](size_t I, const SDValue &Lane) { + auto Bail = std::make_pair(SDValue(), SDValue()); + if (Lane->getOpcode() != ISD::EXTRACT_VECTOR_ELT) + return Bail; + const SDValue &SwizzleSrc = Lane->getOperand(0); + const SDValue &IndexExt = Lane->getOperand(1); + if (IndexExt->getOpcode() != ISD::SIGN_EXTEND_INREG) + return Bail; + const SDValue &Index = IndexExt->getOperand(0); + if (Index->getOpcode() != ISD::EXTRACT_VECTOR_ELT) + return Bail; + const SDValue &SwizzleIndices = Index->getOperand(0); + if (SwizzleSrc.getValueType() != MVT::v16i8 || + SwizzleIndices.getValueType() != MVT::v16i8 || + Index->getOperand(1)->getOpcode() != ISD::Constant || + Index->getConstantOperandVal(1) != I) + return Bail; + return std::make_pair(SwizzleSrc, SwizzleIndices); + }; + + using ValueEntry = std::pair; + SmallVector SplatValueCounts; + + using SwizzleEntry = std::pair, size_t>; + SmallVector SwizzleCounts; + + auto AddCount = [](auto &Counts, const auto &Val) { + auto CountIt = std::find_if(Counts.begin(), Counts.end(), + [&Val](auto E) { return E.first == Val; }); + if (CountIt == Counts.end()) { + Counts.emplace_back(Val, 1); } else { CountIt->second++; } + }; + + auto GetMostCommon = [](auto &Counts) { + auto CommonIt = + std::max_element(Counts.begin(), Counts.end(), + [](auto A, auto B) { return A.second < B.second; }); + assert(CommonIt != Counts.end() && "Unexpected all-undef build_vector"); + return *CommonIt; + }; + + size_t NumConstantLanes = 0; + + // Count eligible lanes for each type of vector creation op + for (size_t I = 0; I < Lanes; ++I) { + const SDValue &Lane = Op->getOperand(I); + if (Lane.isUndef()) + continue; + + AddCount(SplatValueCounts, Lane); + + if (IsConstant(Lane)) { + NumConstantLanes++; + } else if (CanSwizzle) { + auto SwizzleSrcs = GetSwizzleSrcs(I, Lane); + if (SwizzleSrcs.first) + AddCount(SwizzleCounts, SwizzleSrcs); + } } - auto CommonIt = - std::max_element(ValueCounts.begin(), ValueCounts.end(), - [](Entry A, Entry B) { return A.second < B.second; }); - assert(CommonIt != ValueCounts.end() && "Unexpected all-undef build_vector"); - SDValue SplatValue = CommonIt->first; - size_t NumCommon = CommonIt->second; - - // If v128.const is available, consider using it instead of a splat + + SDValue SplatValue; + size_t NumSplatLanes; + std::tie(SplatValue, NumSplatLanes) = GetMostCommon(SplatValueCounts); + + SDValue SwizzleSrc; + SDValue SwizzleIndices; + size_t NumSwizzleLanes = 0; + if (SwizzleCounts.size()) + std::forward_as_tuple(std::tie(SwizzleSrc, SwizzleIndices), + NumSwizzleLanes) = GetMostCommon(SwizzleCounts); + + // Predicate returning true if the lane is properly initialized by the + // original instruction + std::function IsLaneConstructed; + SDValue Result; if (Subtarget->hasUnimplementedSIMD128()) { - // {i32,i64,f32,f64}.const opcode, and value - const size_t ConstBytes = 1 + std::max(size_t(4), 16 / Lanes); - // SIMD prefix and opcode - const size_t SplatBytes = 2; - const size_t SplatConstBytes = SplatBytes + ConstBytes; - // SIMD prefix, opcode, and lane index - const size_t ReplaceBytes = 3; - const size_t ReplaceConstBytes = ReplaceBytes + ConstBytes; - // SIMD prefix, v128.const opcode, and 128-bit value - const size_t VecConstBytes = 18; - // Initial v128.const and a replace_lane for each non-const operand - const size_t ConstInitBytes = VecConstBytes + NumDynamic * ReplaceBytes; - // Initial splat and all 
necessary replace_lanes - const size_t SplatInitBytes = - IsConstant(SplatValue) - // Initial constant splat - ? (SplatConstBytes + - // Constant replace_lanes - (NumConst - NumCommon) * ReplaceConstBytes + - // Dynamic replace_lanes - (NumDynamic * ReplaceBytes)) - // Initial dynamic splat - : (SplatBytes + - // Constant replace_lanes - (NumConst * ReplaceConstBytes) + - // Dynamic replace_lanes - (NumDynamic - NumCommon) * ReplaceBytes); - if (ConstInitBytes < SplatInitBytes) { - // Create build_vector that will lower to initial v128.const + // Prefer swizzles over vector consts over splats + if (NumSwizzleLanes >= NumSplatLanes && + NumSwizzleLanes >= NumConstantLanes) { + Result = DAG.getNode(WebAssemblyISD::SWIZZLE, DL, VecT, SwizzleSrc, + SwizzleIndices); + auto Swizzled = std::make_pair(SwizzleSrc, SwizzleIndices); + IsLaneConstructed = [&, Swizzled](size_t I, const SDValue &Lane) { + return Swizzled == GetSwizzleSrcs(I, Lane); + }; + } else if (NumConstantLanes >= NumSplatLanes) { SmallVector ConstLanes; for (const SDValue &Lane : Op->op_values()) { if (IsConstant(Lane)) { @@ -1321,26 +1413,35 @@ SDValue WebAssemblyTargetLowering::LowerBUILD_VECTOR(SDValue Op, ConstLanes.push_back(DAG.getConstant(0, DL, LaneT)); } } - SDValue Result = DAG.getBuildVector(VecT, DL, ConstLanes); - // Add replace_lane instructions for non-const lanes - for (size_t I = 0; I < Lanes; ++I) { - const SDValue &Lane = Op->getOperand(I); - if (!Lane.isUndef() && !IsConstant(Lane)) - Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VecT, Result, Lane, - DAG.getConstant(I, DL, MVT::i32)); - } - return Result; + Result = DAG.getBuildVector(VecT, DL, ConstLanes); + IsLaneConstructed = [&](size_t _, const SDValue &Lane) { + return IsConstant(Lane); + }; + } + } + if (!Result) { + // Use a splat, but possibly a load_splat + LoadSDNode *SplattedLoad; + if (Subtarget->hasUnimplementedSIMD128() && + (SplattedLoad = dyn_cast(SplatValue)) && + SplattedLoad->getMemoryVT() == VecT.getVectorElementType()) { + Result = DAG.getNode(WebAssemblyISD::LOAD_SPLAT, DL, VecT, SplatValue); + } else { + Result = DAG.getSplatBuildVector(VecT, DL, SplatValue); } + IsLaneConstructed = [&](size_t _, const SDValue &Lane) { + return Lane == SplatValue; + }; } - // Use a splat for the initial vector - SDValue Result = DAG.getSplatBuildVector(VecT, DL, SplatValue); - // Add replace_lane instructions for other values + + // Add replace_lane instructions for any unhandled values for (size_t I = 0; I < Lanes; ++I) { const SDValue &Lane = Op->getOperand(I); - if (Lane != SplatValue) + if (!Lane.isUndef() && !IsLaneConstructed(I, Lane)) Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VecT, Result, Lane, DAG.getConstant(I, DL, MVT::i32)); } + return Result; } @@ -1415,11 +1516,6 @@ SDValue WebAssemblyTargetLowering::LowerShift(SDValue Op, // Only manually lower vector shifts assert(Op.getSimpleValueType().isVector()); - // Expand all vector shifts until V8 fixes its implementation - // TODO: remove this once V8 is fixed - if (!Subtarget->hasUnimplementedSIMD128()) - return unrollVectorShift(Op, DAG); - // Unroll non-splat vector shifts BuildVectorSDNode *ShiftVec; SDValue SplatVal; diff --git a/lib/Target/WebAssembly/WebAssemblyISelLowering.h b/lib/Target/WebAssembly/WebAssemblyISelLowering.h index b3c7f3defd5f..a53e24a05542 100644 --- a/lib/Target/WebAssembly/WebAssemblyISelLowering.h +++ b/lib/Target/WebAssembly/WebAssemblyISelLowering.h @@ -63,7 +63,7 @@ private: MachineMemOperand::Flags Flags, bool *Fast) const override; bool 
isIntDivCheap(EVT VT, AttributeList Attr) const override; - + bool isVectorLoadExtDesirable(SDValue ExtVal) const override; EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override; bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I, diff --git a/lib/Target/WebAssembly/WebAssemblyInstrAtomics.td b/lib/Target/WebAssembly/WebAssemblyInstrAtomics.td index e85aa57efc42..a9a99d38f9f1 100644 --- a/lib/Target/WebAssembly/WebAssemblyInstrAtomics.td +++ b/lib/Target/WebAssembly/WebAssemblyInstrAtomics.td @@ -71,12 +71,6 @@ class NotifyPatImmOff : def : NotifyPatImmOff; def : NotifyPatImmOff; -def NotifyPatGlobalAddr : - Pat<(i32 (int_wasm_atomic_notify (regPlusGA I32:$addr, - (WebAssemblywrapper tglobaladdr:$off)), - I32:$count)), - (ATOMIC_NOTIFY 0, tglobaladdr:$off, I32:$addr, I32:$count)>; - // Select notifys with just a constant offset. def NotifyPatOffsetOnly : Pat<(i32 (int_wasm_atomic_notify imm:$off, I32:$count)), @@ -105,13 +99,6 @@ def : WaitPatImmOff; def : WaitPatImmOff; def : WaitPatImmOff; -class WaitPatGlobalAddr : - Pat<(i32 (kind (regPlusGA I32:$addr, (WebAssemblywrapper tglobaladdr:$off)), - ty:$exp, I64:$timeout)), - (inst 0, tglobaladdr:$off, I32:$addr, ty:$exp, I64:$timeout)>; -def : WaitPatGlobalAddr; -def : WaitPatGlobalAddr; - // Select wait_i32, ATOMIC_WAIT_I32s with just a constant offset. class WaitPatOffsetOnly : Pat<(i32 (kind imm:$off, ty:$exp, I64:$timeout)), @@ -126,6 +113,19 @@ def : WaitPatGlobalAddrOffOnly; def : WaitPatGlobalAddrOffOnly; } // Predicates = [HasAtomics] +//===----------------------------------------------------------------------===// +// Atomic fences +//===----------------------------------------------------------------------===// + +// A compiler fence instruction that prevents reordering of instructions. +let Defs = [ARGUMENTS] in { +let isPseudo = 1, hasSideEffects = 1 in +defm COMPILER_FENCE : ATOMIC_NRI<(outs), (ins), [], "compiler_fence">; +let hasSideEffects = 1 in +defm ATOMIC_FENCE : ATOMIC_NRI<(outs), (ins i8imm:$flags), [], "atomic.fence", + 0x03>; +} // Defs = [ARGUMENTS] + //===----------------------------------------------------------------------===// // Atomic loads //===----------------------------------------------------------------------===// @@ -151,9 +151,6 @@ def : LoadPatImmOff; def : LoadPatImmOff; def : LoadPatImmOff; -def : LoadPatGlobalAddr; -def : LoadPatGlobalAddr; - // Select loads with just a constant offset. def : LoadPatOffsetOnly; def : LoadPatOffsetOnly; @@ -244,16 +241,6 @@ def : LoadPatImmOff; def : LoadPatImmOff; // No 32->64 patterns, just use i32.atomic.load and i64.extend_s/i64 -def : LoadPatGlobalAddr; -def : LoadPatGlobalAddr; -def : LoadPatGlobalAddr; -def : LoadPatGlobalAddr; -def : LoadPatGlobalAddr; -def : LoadPatGlobalAddr; -def : LoadPatGlobalAddr; -def : LoadPatGlobalAddr; -def : LoadPatGlobalAddr; - // Extending loads with just a constant offset def : LoadPatOffsetOnly; def : LoadPatOffsetOnly; @@ -313,13 +300,6 @@ def : AStorePatImmOff; def : AStorePatImmOff; def : AStorePatImmOff; -class AStorePatGlobalAddr : - Pat<(kind (regPlusGA I32:$addr, (WebAssemblywrapper tglobaladdr:$off)), - ty:$val), - (inst 0, tglobaladdr:$off, I32:$addr, ty:$val)>; -def : AStorePatGlobalAddr; -def : AStorePatGlobalAddr; - // Select stores with just a constant offset. 
class AStorePatOffsetOnly : Pat<(kind imm:$off, ty:$val), (inst 0, imm:$off, (CONST_I32 0), ty:$val)>; @@ -374,12 +354,6 @@ def : AStorePatImmOff; def : AStorePatImmOff; def : AStorePatImmOff; -def : AStorePatGlobalAddr; -def : AStorePatGlobalAddr; -def : AStorePatGlobalAddr; -def : AStorePatGlobalAddr; -def : AStorePatGlobalAddr; - // Truncating stores with just a constant offset def : AStorePatOffsetOnly; def : AStorePatOffsetOnly; @@ -500,11 +474,6 @@ class BinRMWPatImmOff : Pat<(ty (kind (operand I32:$addr, imm:$off), ty:$val)), (inst 0, imm:$off, I32:$addr, ty:$val)>; -class BinRMWPatGlobalAddr : - Pat<(ty (kind (regPlusGA I32:$addr, (WebAssemblywrapper tglobaladdr:$off)), - ty:$val)), - (inst 0, tglobaladdr:$off, I32:$addr, ty:$val)>; - // Select binary RMWs with just a constant offset. class BinRMWPatOffsetOnly : Pat<(ty (kind imm:$off, ty:$val)), @@ -525,9 +494,6 @@ multiclass BinRMWPattern; def : BinRMWPatImmOff; - def : BinRMWPatGlobalAddr; - def : BinRMWPatGlobalAddr; - def : BinRMWPatOffsetOnly; def : BinRMWPatOffsetOnly; @@ -622,17 +588,6 @@ multiclass BinRMWTruncExtPattern< def : BinRMWPatImmOff, or_is_add, inst8_64>; def : BinRMWPatImmOff, or_is_add, inst16_64>; - def : BinRMWPatGlobalAddr, inst8_32>; - def : BinRMWPatGlobalAddr, inst16_32>; - def : BinRMWPatGlobalAddr, inst8_64>; - def : BinRMWPatGlobalAddr, inst16_64>; - def : BinRMWPatGlobalAddr, inst32_64>; - - def : BinRMWPatGlobalAddr, inst8_32>; - def : BinRMWPatGlobalAddr, inst16_32>; - def : BinRMWPatGlobalAddr, inst8_64>; - def : BinRMWPatGlobalAddr, inst16_64>; - // Truncating-extending binary RMWs with just a constant offset def : BinRMWPatOffsetOnly, inst8_32>; def : BinRMWPatOffsetOnly, inst16_32>; @@ -732,11 +687,6 @@ class TerRMWPatImmOff : Pat<(ty (kind (operand I32:$addr, imm:$off), ty:$exp, ty:$new)), (inst 0, imm:$off, I32:$addr, ty:$exp, ty:$new)>; -class TerRMWPatGlobalAddr : - Pat<(ty (kind (regPlusGA I32:$addr, (WebAssemblywrapper tglobaladdr:$off)), - ty:$exp, ty:$new)), - (inst 0, tglobaladdr:$off, I32:$addr, ty:$exp, ty:$new)>; - // Select ternary RMWs with just a constant offset. class TerRMWPatOffsetOnly : Pat<(ty (kind imm:$off, ty:$exp, ty:$new)), @@ -757,9 +707,6 @@ multiclass TerRMWPattern; def : TerRMWPatImmOff; - def : TerRMWPatGlobalAddr; - def : TerRMWPatGlobalAddr; - def : TerRMWPatOffsetOnly; def : TerRMWPatOffsetOnly; @@ -846,17 +793,6 @@ multiclass TerRMWTruncExtPattern< def : TerRMWPatImmOff, or_is_add, inst8_64>; def : TerRMWPatImmOff, or_is_add, inst16_64>; - def : TerRMWPatGlobalAddr, inst8_32>; - def : TerRMWPatGlobalAddr, inst16_32>; - def : TerRMWPatGlobalAddr, inst8_64>; - def : TerRMWPatGlobalAddr, inst16_64>; - def : TerRMWPatGlobalAddr, inst32_64>; - - def : TerRMWPatGlobalAddr, inst8_32>; - def : TerRMWPatGlobalAddr, inst16_32>; - def : TerRMWPatGlobalAddr, inst8_64>; - def : TerRMWPatGlobalAddr, inst16_64>; - // Truncating-extending ternary RMWs with just a constant offset def : TerRMWPatOffsetOnly, inst8_32>; def : TerRMWPatOffsetOnly, inst16_32>; @@ -887,13 +823,3 @@ defm : TerRMWTruncExtPattern< ATOMIC_RMW8_U_CMPXCHG_I32, ATOMIC_RMW16_U_CMPXCHG_I32, ATOMIC_RMW8_U_CMPXCHG_I64, ATOMIC_RMW16_U_CMPXCHG_I64, ATOMIC_RMW32_U_CMPXCHG_I64>; - -//===----------------------------------------------------------------------===// -// Atomic fences -//===----------------------------------------------------------------------===// - -// A compiler fence instruction that prevents reordering of instructions. 
-let Defs = [ARGUMENTS] in { -let isPseudo = 1, hasSideEffects = 1 in -defm COMPILER_FENCE : ATOMIC_NRI<(outs), (ins), [], "compiler_fence">; -} // Defs = [ARGUMENTS] diff --git a/lib/Target/WebAssembly/WebAssemblyInstrBulkMemory.td b/lib/Target/WebAssembly/WebAssemblyInstrBulkMemory.td index f4352e3d12ec..05735cf6d31f 100644 --- a/lib/Target/WebAssembly/WebAssemblyInstrBulkMemory.td +++ b/lib/Target/WebAssembly/WebAssemblyInstrBulkMemory.td @@ -39,7 +39,7 @@ defm MEMORY_INIT : (ins i32imm_op:$seg, i32imm_op:$idx, I32:$dest, I32:$offset, I32:$size), (outs), (ins i32imm_op:$seg, i32imm_op:$idx), - [(int_wasm_memory_init (i32 imm:$seg), (i32 imm:$idx), I32:$dest, + [(int_wasm_memory_init (i32 timm:$seg), (i32 timm:$idx), I32:$dest, I32:$offset, I32:$size )], "memory.init\t$seg, $idx, $dest, $offset, $size", @@ -48,7 +48,7 @@ defm MEMORY_INIT : let hasSideEffects = 1 in defm DATA_DROP : BULK_I<(outs), (ins i32imm_op:$seg), (outs), (ins i32imm_op:$seg), - [(int_wasm_data_drop (i32 imm:$seg))], + [(int_wasm_data_drop (i32 timm:$seg))], "data.drop\t$seg", "data.drop\t$seg", 0x09>; let mayLoad = 1, mayStore = 1 in diff --git a/lib/Target/WebAssembly/WebAssemblyInstrControl.td b/lib/Target/WebAssembly/WebAssemblyInstrControl.td index 1870c5bc34b0..1afc9a8790dc 100644 --- a/lib/Target/WebAssembly/WebAssemblyInstrControl.td +++ b/lib/Target/WebAssembly/WebAssemblyInstrControl.td @@ -84,49 +84,19 @@ let isTerminator = 1, isBarrier = 1 in defm END_FUNCTION : NRI<(outs), (ins), [], "end_function", 0x0b>; } // Uses = [VALUE_STACK], Defs = [VALUE_STACK] -multiclass RETURN { - defm RETURN_#vt : I<(outs), (ins vt:$val), (outs), (ins), - [(WebAssemblyreturn vt:$val)], - "return \t$val", "return", 0x0f>; - // Equivalent to RETURN_#vt, for use at the end of a function when wasm - // semantics return by falling off the end of the block. - let isCodeGenOnly = 1 in - defm FALLTHROUGH_RETURN_#vt : I<(outs), (ins vt:$val), (outs), (ins), []>; -} - -multiclass SIMD_RETURN { - defm RETURN_#vt : I<(outs), (ins V128:$val), (outs), (ins), - [(WebAssemblyreturn (vt V128:$val))], - "return \t$val", "return", 0x0f>, - Requires<[HasSIMD128]>; - // Equivalent to RETURN_#vt, for use at the end of a function when wasm - // semantics return by falling off the end of the block. - let isCodeGenOnly = 1 in - defm FALLTHROUGH_RETURN_#vt : I<(outs), (ins V128:$val), (outs), (ins), - []>, - Requires<[HasSIMD128]>; -} let isTerminator = 1, hasCtrlDep = 1, isBarrier = 1 in { let isReturn = 1 in { - defm "": RETURN; - defm "": RETURN; - defm "": RETURN; - defm "": RETURN; - defm "": RETURN; - defm "": SIMD_RETURN; - defm "": SIMD_RETURN; - defm "": SIMD_RETURN; - defm "": SIMD_RETURN; - defm "": SIMD_RETURN; - defm "": SIMD_RETURN; - - defm RETURN_VOID : NRI<(outs), (ins), [(WebAssemblyreturn)], "return", 0x0f>; - - // This is to RETURN_VOID what FALLTHROUGH_RETURN_#vt is to RETURN_#vt. - let isCodeGenOnly = 1 in - defm FALLTHROUGH_RETURN_VOID : NRI<(outs), (ins), []>; + +defm RETURN : I<(outs), (ins variable_ops), (outs), (ins), + [(WebAssemblyreturn)], + "return", "return", 0x0f>; +// Equivalent to RETURN, for use at the end of a function when wasm +// semantics return by falling off the end of the block. 
+let isCodeGenOnly = 1 in +defm FALLTHROUGH_RETURN : I<(outs), (ins variable_ops), (outs), (ins), []>; + } // isReturn = 1 defm UNREACHABLE : NRI<(outs), (ins), [(trap)], "unreachable", 0x00>; diff --git a/lib/Target/WebAssembly/WebAssemblyInstrConv.td b/lib/Target/WebAssembly/WebAssemblyInstrConv.td index 661fee2715ba..f3d9c5d5032c 100644 --- a/lib/Target/WebAssembly/WebAssemblyInstrConv.td +++ b/lib/Target/WebAssembly/WebAssemblyInstrConv.td @@ -171,6 +171,23 @@ defm I64_TRUNC_U_F64 : I<(outs I64:$dst), (ins F64:$src), (outs), (ins), 0xb1>; } // hasSideEffects = 1 +def : Pat<(int_wasm_trunc_signed F32:$src), + (I32_TRUNC_S_F32 F32:$src)>; +def : Pat<(int_wasm_trunc_unsigned F32:$src), + (I32_TRUNC_U_F32 F32:$src)>; +def : Pat<(int_wasm_trunc_signed F64:$src), + (I32_TRUNC_S_F64 F64:$src)>; +def : Pat<(int_wasm_trunc_unsigned F64:$src), + (I32_TRUNC_U_F64 F64:$src)>; +def : Pat<(int_wasm_trunc_signed F32:$src), + (I64_TRUNC_S_F32 F32:$src)>; +def : Pat<(int_wasm_trunc_unsigned F32:$src), + (I64_TRUNC_U_F32 F32:$src)>; +def : Pat<(int_wasm_trunc_signed F64:$src), + (I64_TRUNC_S_F64 F64:$src)>; +def : Pat<(int_wasm_trunc_unsigned F64:$src), + (I64_TRUNC_U_F64 F64:$src)>; + defm F32_CONVERT_S_I32 : I<(outs F32:$dst), (ins I32:$src), (outs), (ins), [(set F32:$dst, (sint_to_fp I32:$src))], "f32.convert_i32_s\t$dst, $src", "f32.convert_i32_s", diff --git a/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp b/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp index a86c9af28f0d..8e8126c90e72 100644 --- a/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp +++ b/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp @@ -38,7 +38,7 @@ WebAssemblyInstrInfo::WebAssemblyInstrInfo(const WebAssemblySubtarget &STI) RI(STI.getTargetTriple()) {} bool WebAssemblyInstrInfo::isReallyTriviallyReMaterializable( - const MachineInstr &MI, AliasAnalysis *AA) const { + const MachineInstr &MI, AAResults *AA) const { switch (MI.getOpcode()) { case WebAssembly::CONST_I32: case WebAssembly::CONST_I64: @@ -60,7 +60,7 @@ void WebAssemblyInstrInfo::copyPhysReg(MachineBasicBlock &MBB, // exist. However we need to handle both here. auto &MRI = MBB.getParent()->getRegInfo(); const TargetRegisterClass *RC = - TargetRegisterInfo::isVirtualRegister(DestReg) + Register::isVirtualRegister(DestReg) ? 
MRI.getRegClass(DestReg) : MRI.getTargetRegisterInfo()->getMinimalPhysRegClass(DestReg); diff --git a/lib/Target/WebAssembly/WebAssemblyInstrInfo.h b/lib/Target/WebAssembly/WebAssemblyInstrInfo.h index df1051b4f42c..fe6211663c31 100644 --- a/lib/Target/WebAssembly/WebAssemblyInstrInfo.h +++ b/lib/Target/WebAssembly/WebAssemblyInstrInfo.h @@ -43,7 +43,7 @@ public: const WebAssemblyRegisterInfo &getRegisterInfo() const { return RI; } bool isReallyTriviallyReMaterializable(const MachineInstr &MI, - AliasAnalysis *AA) const override; + AAResults *AA) const override; void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, unsigned DestReg, unsigned SrcReg, diff --git a/lib/Target/WebAssembly/WebAssemblyInstrInfo.td b/lib/Target/WebAssembly/WebAssemblyInstrInfo.td index 73ddbe85d551..044901481381 100644 --- a/lib/Target/WebAssembly/WebAssemblyInstrInfo.td +++ b/lib/Target/WebAssembly/WebAssemblyInstrInfo.td @@ -106,7 +106,8 @@ def WebAssemblybr_table : SDNode<"WebAssemblyISD::BR_TABLE", def WebAssemblyargument : SDNode<"WebAssemblyISD::ARGUMENT", SDT_WebAssemblyArgument>; def WebAssemblyreturn : SDNode<"WebAssemblyISD::RETURN", - SDT_WebAssemblyReturn, [SDNPHasChain]>; + SDT_WebAssemblyReturn, + [SDNPHasChain, SDNPVariadic]>; def WebAssemblywrapper : SDNode<"WebAssemblyISD::Wrapper", SDT_WebAssemblyWrapper>; def WebAssemblywrapperPIC : SDNode<"WebAssemblyISD::WrapperPIC", diff --git a/lib/Target/WebAssembly/WebAssemblyInstrMemory.td b/lib/Target/WebAssembly/WebAssemblyInstrMemory.td index 6916b165f970..eba9b80d3286 100644 --- a/lib/Target/WebAssembly/WebAssemblyInstrMemory.td +++ b/lib/Target/WebAssembly/WebAssemblyInstrMemory.td @@ -37,16 +37,6 @@ def or_is_add : PatFrag<(ops node:$lhs, node:$rhs), (or node:$lhs, node:$rhs),[{ return (~Known0.Zero & ~Known1.Zero) == 0; }]>; -// GlobalAddresses are conceptually unsigned values, so we can also fold them -// into immediate values as long as the add is 'nuw'. -// TODO: We'd like to also match GA offsets but there are cases where the -// register can have a negative value. Find out what more we can do. -def regPlusGA : PatFrag<(ops node:$addr, node:$off), - (add node:$addr, node:$off), - [{ - return N->getFlags().hasNoUnsignedWrap(); -}]>; - // We don't need a regPlusES because external symbols never have constant // offsets folded into them, so we can just use add. @@ -93,15 +83,6 @@ def : LoadPatImmOff; def : LoadPatImmOff; def : LoadPatImmOff; -class LoadPatGlobalAddr : - Pat<(ty (kind (regPlusGA I32:$addr, (WebAssemblywrapper tglobaladdr:$off)))), - (inst 0, tglobaladdr:$off, I32:$addr)>, Requires<[IsNotPIC]>; - -def : LoadPatGlobalAddr; -def : LoadPatGlobalAddr; -def : LoadPatGlobalAddr; -def : LoadPatGlobalAddr; - // Select loads with just a constant offset. class LoadPatOffsetOnly : Pat<(ty (kind imm:$off)), (inst 0, imm:$off, (CONST_I32 0))>; @@ -167,18 +148,6 @@ def : LoadPatImmOff; def : LoadPatImmOff; def : LoadPatImmOff; -def : LoadPatGlobalAddr; -def : LoadPatGlobalAddr; -def : LoadPatGlobalAddr; -def : LoadPatGlobalAddr; - -def : LoadPatGlobalAddr; -def : LoadPatGlobalAddr; -def : LoadPatGlobalAddr; -def : LoadPatGlobalAddr; -def : LoadPatGlobalAddr; -def : LoadPatGlobalAddr; - // Select extending loads with just a constant offset. 
def : LoadPatOffsetOnly; def : LoadPatOffsetOnly; @@ -224,11 +193,6 @@ def : LoadPatImmOff; def : LoadPatImmOff; def : LoadPatImmOff; def : LoadPatImmOff; -def : LoadPatGlobalAddr; -def : LoadPatGlobalAddr; -def : LoadPatGlobalAddr; -def : LoadPatGlobalAddr; -def : LoadPatGlobalAddr; // Select "don't care" extending loads with just a constant offset. def : LoadPatOffsetOnly; @@ -282,15 +246,6 @@ def : StorePatImmOff; def : StorePatImmOff; def : StorePatImmOff; -class StorePatGlobalAddr : - Pat<(kind ty:$val, - (regPlusGA I32:$addr, (WebAssemblywrapper tglobaladdr:$off))), - (inst 0, tglobaladdr:$off, I32:$addr, ty:$val)>, Requires<[IsNotPIC]>; -def : StorePatGlobalAddr; -def : StorePatGlobalAddr; -def : StorePatGlobalAddr; -def : StorePatGlobalAddr; - // Select stores with just a constant offset. class StorePatOffsetOnly : Pat<(kind ty:$val, imm:$off), (inst 0, imm:$off, (CONST_I32 0), ty:$val)>; @@ -333,12 +288,6 @@ def : StorePatImmOff; def : StorePatImmOff; def : StorePatImmOff; -def : StorePatGlobalAddr; -def : StorePatGlobalAddr; -def : StorePatGlobalAddr; -def : StorePatGlobalAddr; -def : StorePatGlobalAddr; - // Select truncating stores with just a constant offset. def : StorePatOffsetOnly; def : StorePatOffsetOnly; diff --git a/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td b/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td index dd8930f079b0..fc5d73dac52e 100644 --- a/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td +++ b/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td @@ -40,47 +40,124 @@ def LaneIdx#SIZE : ImmLeaf; //===----------------------------------------------------------------------===// // Load: v128.load -multiclass SIMDLoad { - let mayLoad = 1, UseNamedOperandTable = 1 in - defm LOAD_#vec_t : +let mayLoad = 1, UseNamedOperandTable = 1 in +defm LOAD_V128 : + SIMD_I<(outs V128:$dst), (ins P2Align:$p2align, offset32_op:$off, I32:$addr), + (outs), (ins P2Align:$p2align, offset32_op:$off), [], + "v128.load\t$dst, ${off}(${addr})$p2align", + "v128.load\t$off$p2align", 0>; + +// Def load and store patterns from WebAssemblyInstrMemory.td for vector types +foreach vec_t = [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64] in { +def : LoadPatNoOffset; +def : LoadPatImmOff; +def : LoadPatImmOff; +def : LoadPatOffsetOnly; +def : LoadPatGlobalAddrOffOnly; +} + +// vNxM.load_splat +multiclass SIMDLoadSplat simdop> { + let mayLoad = 1, UseNamedOperandTable = 1, + Predicates = [HasUnimplementedSIMD128] in + defm LOAD_SPLAT_#vec : SIMD_I<(outs V128:$dst), (ins P2Align:$p2align, offset32_op:$off, I32:$addr), (outs), (ins P2Align:$p2align, offset32_op:$off), [], - "v128.load\t$dst, ${off}(${addr})$p2align", - "v128.load\t$off$p2align", 0>; + vec#".load_splat\t$dst, ${off}(${addr})$p2align", + vec#".load_splat\t$off$p2align", simdop>; } -foreach vec_t = [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64] in { -defm "" : SIMDLoad; - -// Def load and store patterns from WebAssemblyInstrMemory.td for vector types -def : LoadPatNoOffset("LOAD_"#vec_t)>; -def : LoadPatImmOff("LOAD_"#vec_t)>; -def : LoadPatImmOff("LOAD_"#vec_t)>; -def : LoadPatGlobalAddr("LOAD_"#vec_t)>; -def : LoadPatOffsetOnly("LOAD_"#vec_t)>; -def : LoadPatGlobalAddrOffOnly("LOAD_"#vec_t)>; +defm "" : SIMDLoadSplat<"v8x16", 194>; +defm "" : SIMDLoadSplat<"v16x8", 195>; +defm "" : SIMDLoadSplat<"v32x4", 196>; +defm "" : SIMDLoadSplat<"v64x2", 197>; + +def wasm_load_splat_t : SDTypeProfile<1, 1, []>; +def wasm_load_splat : SDNode<"WebAssemblyISD::LOAD_SPLAT", wasm_load_splat_t>; + +foreach args = [["v16i8", "i32", "extloadi8"], ["v8i16", "i32", 
"extloadi16"], + ["v4i32", "i32", "load"], ["v2i64", "i64", "load"], + ["v4f32", "f32", "load"], ["v2f64", "f64", "load"]] in +def load_splat_#args[0] : + PatFrag<(ops node:$addr), (wasm_load_splat + (!cast(args[1]) (!cast(args[2]) node:$addr)))>; + +let Predicates = [HasUnimplementedSIMD128] in +foreach args = [["v16i8", "v8x16"], ["v8i16", "v16x8"], ["v4i32", "v32x4"], + ["v2i64", "v64x2"], ["v4f32", "v32x4"], ["v2f64", "v64x2"]] in { +def : LoadPatNoOffset(args[0]), + !cast("load_splat_"#args[0]), + !cast("LOAD_SPLAT_"#args[1])>; +def : LoadPatImmOff(args[0]), + !cast("load_splat_"#args[0]), + regPlusImm, + !cast("LOAD_SPLAT_"#args[1])>; +def : LoadPatImmOff(args[0]), + !cast("load_splat_"#args[0]), + or_is_add, + !cast("LOAD_SPLAT_"#args[1])>; +def : LoadPatOffsetOnly(args[0]), + !cast("load_splat_"#args[0]), + !cast("LOAD_SPLAT_"#args[1])>; +def : LoadPatGlobalAddrOffOnly(args[0]), + !cast("load_splat_"#args[0]), + !cast("LOAD_SPLAT_"#args[1])>; } -// Store: v128.store -multiclass SIMDStore { - let mayStore = 1, UseNamedOperandTable = 1 in - defm STORE_#vec_t : - SIMD_I<(outs), (ins P2Align:$p2align, offset32_op:$off, I32:$addr, V128:$vec), +// Load and extend +multiclass SIMDLoadExtend simdop> { + let mayLoad = 1, UseNamedOperandTable = 1, + Predicates = [HasUnimplementedSIMD128] in { + defm LOAD_EXTEND_S_#vec_t : + SIMD_I<(outs V128:$dst), (ins P2Align:$p2align, offset32_op:$off, I32:$addr), + (outs), (ins P2Align:$p2align, offset32_op:$off), [], + name#"_s\t$dst, ${off}(${addr})$p2align", + name#"_s\t$off$p2align", simdop>; + defm LOAD_EXTEND_U_#vec_t : + SIMD_I<(outs V128:$dst), (ins P2Align:$p2align, offset32_op:$off, I32:$addr), (outs), (ins P2Align:$p2align, offset32_op:$off), [], - "v128.store\t${off}(${addr})$p2align, $vec", - "v128.store\t$off$p2align", 1>; + name#"_u\t$dst, ${off}(${addr})$p2align", + name#"_u\t$off$p2align", !add(simdop, 1)>; + } } -foreach vec_t = [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64] in { -defm "" : SIMDStore; +defm "" : SIMDLoadExtend; +defm "" : SIMDLoadExtend; +defm "" : SIMDLoadExtend; + +let Predicates = [HasUnimplementedSIMD128] in +foreach types = [[v8i16, i8], [v4i32, i16], [v2i64, i32]] in +foreach exts = [["sextloadv", "_S"], + ["zextloadv", "_U"], + ["extloadv", "_U"]] in { +def : LoadPatNoOffset(exts[0]#types[1]), + !cast("LOAD_EXTEND"#exts[1]#"_"#types[0])>; +def : LoadPatImmOff(exts[0]#types[1]), regPlusImm, + !cast("LOAD_EXTEND"#exts[1]#"_"#types[0])>; +def : LoadPatImmOff(exts[0]#types[1]), or_is_add, + !cast("LOAD_EXTEND"#exts[1]#"_"#types[0])>; +def : LoadPatOffsetOnly(exts[0]#types[1]), + !cast("LOAD_EXTEND"#exts[1]#"_"#types[0])>; +def : LoadPatGlobalAddrOffOnly(exts[0]#types[1]), + !cast("LOAD_EXTEND"#exts[1]#"_"#types[0])>; +} + + +// Store: v128.store +let mayStore = 1, UseNamedOperandTable = 1 in +defm STORE_V128 : + SIMD_I<(outs), (ins P2Align:$p2align, offset32_op:$off, I32:$addr, V128:$vec), + (outs), (ins P2Align:$p2align, offset32_op:$off), [], + "v128.store\t${off}(${addr})$p2align, $vec", + "v128.store\t$off$p2align", 1>; +foreach vec_t = [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64] in { // Def load and store patterns from WebAssemblyInstrMemory.td for vector types -def : StorePatNoOffset("STORE_"#vec_t)>; -def : StorePatImmOff("STORE_"#vec_t)>; -def : StorePatImmOff("STORE_"#vec_t)>; -def : StorePatGlobalAddr("STORE_"#vec_t)>; -def : StorePatOffsetOnly("STORE_"#vec_t)>; -def : StorePatGlobalAddrOffOnly("STORE_"#vec_t)>; +def : StorePatNoOffset; +def : StorePatImmOff; +def : StorePatImmOff; +def : StorePatOffsetOnly; 
+def : StorePatGlobalAddrOffOnly; } //===----------------------------------------------------------------------===// @@ -90,7 +167,7 @@ def : StorePatGlobalAddrOffOnly("STORE_"#vec_t)>; // Constant: v128.const multiclass ConstVec { let isMoveImm = 1, isReMaterializable = 1, - Predicates = [HasSIMD128, HasUnimplementedSIMD128] in + Predicates = [HasUnimplementedSIMD128] in defm CONST_V128_#vec_t : SIMD_I<(outs V128:$dst), ops, (outs), ops, [(set V128:$dst, (vec_t pat))], "v128.const\t$dst, "#args, @@ -198,6 +275,19 @@ def : Pat<(vec_t (wasm_shuffle (vec_t V128:$x), (vec_t V128:$y), (i32 LaneIdx32:$mE), (i32 LaneIdx32:$mF)))>; } +// Swizzle lanes: v8x16.swizzle +def wasm_swizzle_t : SDTypeProfile<1, 2, []>; +def wasm_swizzle : SDNode<"WebAssemblyISD::SWIZZLE", wasm_swizzle_t>; +let Predicates = [HasUnimplementedSIMD128] in +defm SWIZZLE : + SIMD_I<(outs V128:$dst), (ins V128:$src, V128:$mask), (outs), (ins), + [(set (v16i8 V128:$dst), + (wasm_swizzle (v16i8 V128:$src), (v16i8 V128:$mask)))], + "v8x16.swizzle\t$dst, $src, $mask", "v8x16.swizzle", 192>; + +def : Pat<(int_wasm_swizzle (v16i8 V128:$src), (v16i8 V128:$mask)), + (SWIZZLE V128:$src, V128:$mask)>; + // Create vector with identical lanes: splat def splat2 : PatFrag<(ops node:$x), (build_vector node:$x, node:$x)>; def splat4 : PatFrag<(ops node:$x), (build_vector @@ -286,7 +376,7 @@ multiclass ExtractLaneExtended baseInst> { } defm "" : ExtractLaneExtended<"_s", 5>; -let Predicates = [HasSIMD128, HasUnimplementedSIMD128] in +let Predicates = [HasUnimplementedSIMD128] in defm "" : ExtractLaneExtended<"_u", 6>; defm "" : ExtractLane; defm "" : ExtractLane; @@ -472,6 +562,11 @@ defm OR : SIMDBitwise; defm XOR : SIMDBitwise; } // isCommutable = 1 +// Bitwise logic: v128.andnot +def andnot : PatFrag<(ops node:$left, node:$right), (and $left, (vnot $right))>; +let Predicates = [HasUnimplementedSIMD128] in +defm ANDNOT : SIMDBitwise; + // Bitwise select: v128.bitselect foreach vec_t = [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64] in defm BITSELECT_#vec_t : @@ -655,7 +750,7 @@ defm ABS : SIMDUnaryFP; defm NEG : SIMDUnaryFP; // Square root: sqrt -let Predicates = [HasSIMD128, HasUnimplementedSIMD128] in +let Predicates = [HasUnimplementedSIMD128] in defm SQRT : SIMDUnaryFP; //===----------------------------------------------------------------------===// @@ -679,7 +774,7 @@ let isCommutable = 1 in defm MUL : SIMDBinaryFP; // Division: div -let Predicates = [HasSIMD128, HasUnimplementedSIMD128] in +let Predicates = [HasUnimplementedSIMD128] in defm DIV : SIMDBinaryFP; // NaN-propagating minimum: min @@ -712,6 +807,42 @@ defm "" : SIMDConvert; defm "" : SIMDConvert; defm "" : SIMDConvert; +// Widening operations +multiclass SIMDWiden baseInst> { + defm "" : SIMDConvert; + defm "" : SIMDConvert; + defm "" : SIMDConvert; + defm "" : SIMDConvert; +} + +defm "" : SIMDWiden; +defm "" : SIMDWiden; + +// Narrowing operations +multiclass SIMDNarrow baseInst> { + defm NARROW_S_#vec_t : + SIMD_I<(outs V128:$dst), (ins V128:$low, V128:$high), (outs), (ins), + [(set (vec_t V128:$dst), (vec_t (int_wasm_narrow_signed + (arg_t V128:$low), (arg_t V128:$high))))], + vec#".narrow_"#arg#"_s\t$dst, $low, $high", vec#".narrow_"#arg#"_s", + baseInst>; + defm NARROW_U_#vec_t : + SIMD_I<(outs V128:$dst), (ins V128:$low, V128:$high), (outs), (ins), + [(set (vec_t V128:$dst), (vec_t (int_wasm_narrow_unsigned + (arg_t V128:$low), (arg_t V128:$high))))], + vec#".narrow_"#arg#"_u\t$dst, $low, $high", vec#".narrow_"#arg#"_u", + !add(baseInst, 1)>; +} + +defm "" : SIMDNarrow; 
+defm "" : SIMDNarrow; + // Lower llvm.wasm.trunc.saturate.* to saturating instructions def : Pat<(v4i32 (int_wasm_trunc_saturate_signed (v4f32 V128:$src))), (fp_to_sint_v4i32_v4f32 (v4f32 V128:$src))>; @@ -732,3 +863,25 @@ foreach t2 = !foldl( ) ) in def : Pat<(t1 (bitconvert (t2 V128:$v))), (t1 V128:$v)>; + +//===----------------------------------------------------------------------===// +// Quasi-Fused Multiply- Add and Subtract (QFMA/QFMS) +//===----------------------------------------------------------------------===// + +multiclass SIMDQFM baseInst> { + defm QFMA_#vec_t : + SIMD_I<(outs V128:$dst), (ins V128:$a, V128:$b, V128:$c), + (outs), (ins), + [(set (vec_t V128:$dst), + (int_wasm_qfma (vec_t V128:$a), (vec_t V128:$b), (vec_t V128:$c)))], + vec#".qfma\t$dst, $a, $b, $c", vec#".qfma", baseInst>; + defm QFMS_#vec_t : + SIMD_I<(outs V128:$dst), (ins V128:$a, V128:$b, V128:$c), + (outs), (ins), + [(set (vec_t V128:$dst), + (int_wasm_qfms (vec_t V128:$a), (vec_t V128:$b), (vec_t V128:$c)))], + vec#".qfms\t$dst, $a, $b, $c", vec#".qfms", !add(baseInst, 1)>; +} + +defm "" : SIMDQFM; +defm "" : SIMDQFM; diff --git a/lib/Target/WebAssembly/WebAssemblyLateEHPrepare.cpp b/lib/Target/WebAssembly/WebAssemblyLateEHPrepare.cpp index e92b34430272..75d04252cbe9 100644 --- a/lib/Target/WebAssembly/WebAssemblyLateEHPrepare.cpp +++ b/lib/Target/WebAssembly/WebAssemblyLateEHPrepare.cpp @@ -19,6 +19,7 @@ #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/WasmEHFuncInfo.h" #include "llvm/MC/MCAsmInfo.h" +#include "llvm/Support/Debug.h" using namespace llvm; #define DEBUG_TYPE "wasm-late-eh-prepare" @@ -131,7 +132,7 @@ bool WebAssemblyLateEHPrepare::addCatches(MachineFunction &MF) { auto InsertPos = MBB.begin(); if (InsertPos->isEHLabel()) // EH pad starts with an EH label ++InsertPos; - unsigned DstReg = MRI.createVirtualRegister(&WebAssembly::EXNREFRegClass); + Register DstReg = MRI.createVirtualRegister(&WebAssembly::EXNREFRegClass); BuildMI(MBB, InsertPos, MBB.begin()->getDebugLoc(), TII.get(WebAssembly::CATCH), DstReg); } @@ -168,7 +169,7 @@ bool WebAssemblyLateEHPrepare::replaceFuncletReturns(MachineFunction &MF) { if (CatchPos->isEHLabel()) // EH pad starts with an EH label ++CatchPos; MachineInstr *Catch = &*CatchPos; - unsigned ExnReg = Catch->getOperand(0).getReg(); + Register ExnReg = Catch->getOperand(0).getReg(); BuildMI(MBB, TI, TI->getDebugLoc(), TII.get(WebAssembly::RETHROW)) .addReg(ExnReg); TI->eraseFromParent(); @@ -233,6 +234,7 @@ bool WebAssemblyLateEHPrepare::removeUnnecessaryUnreachables( // it. The pseudo instruction will be deleted later. bool WebAssemblyLateEHPrepare::addExceptionExtraction(MachineFunction &MF) { const auto &TII = *MF.getSubtarget().getInstrInfo(); + MachineRegisterInfo &MRI = MF.getRegInfo(); auto *EHInfo = MF.getWasmEHFuncInfo(); SmallVector ExtractInstrs; SmallVector ToDelete; @@ -292,7 +294,7 @@ bool WebAssemblyLateEHPrepare::addExceptionExtraction(MachineFunction &MF) { // thenbb: // %exn:i32 = extract_exception // ... use exn ... 
- unsigned ExnReg = Catch->getOperand(0).getReg(); + Register ExnReg = Catch->getOperand(0).getReg(); auto *ThenMBB = MF.CreateMachineBasicBlock(); auto *ElseMBB = MF.CreateMachineBasicBlock(); MF.insert(std::next(MachineFunction::iterator(EHPad)), ElseMBB); @@ -339,9 +341,11 @@ bool WebAssemblyLateEHPrepare::addExceptionExtraction(MachineFunction &MF) { WebAssembly::ClangCallTerminateFn); assert(ClangCallTerminateFn && "There is no __clang_call_terminate() function"); + Register Reg = MRI.createVirtualRegister(&WebAssembly::I32RegClass); + BuildMI(ElseMBB, DL, TII.get(WebAssembly::CONST_I32), Reg).addImm(0); BuildMI(ElseMBB, DL, TII.get(WebAssembly::CALL_VOID)) .addGlobalAddress(ClangCallTerminateFn) - .addImm(0); + .addReg(Reg); BuildMI(ElseMBB, DL, TII.get(WebAssembly::UNREACHABLE)); } else { diff --git a/lib/Target/WebAssembly/WebAssemblyLowerBrUnless.cpp b/lib/Target/WebAssembly/WebAssemblyLowerBrUnless.cpp index 34a8195ac4b4..4314aa611549 100644 --- a/lib/Target/WebAssembly/WebAssemblyLowerBrUnless.cpp +++ b/lib/Target/WebAssembly/WebAssemblyLowerBrUnless.cpp @@ -68,7 +68,7 @@ bool WebAssemblyLowerBrUnless::runOnMachineFunction(MachineFunction &MF) { if (MI->getOpcode() != WebAssembly::BR_UNLESS) continue; - unsigned Cond = MI->getOperand(1).getReg(); + Register Cond = MI->getOperand(1).getReg(); bool Inverted = false; // Attempt to invert the condition in place. @@ -188,7 +188,7 @@ bool WebAssemblyLowerBrUnless::runOnMachineFunction(MachineFunction &MF) { // If we weren't able to invert the condition in place. Insert an // instruction to invert it. if (!Inverted) { - unsigned Tmp = MRI.createVirtualRegister(&WebAssembly::I32RegClass); + Register Tmp = MRI.createVirtualRegister(&WebAssembly::I32RegClass); BuildMI(MBB, MI, MI->getDebugLoc(), TII.get(WebAssembly::EQZ_I32), Tmp) .addReg(Cond); MFI.stackifyVReg(Tmp); diff --git a/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp b/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp index 960d5134f6e9..1cf397dd060b 100644 --- a/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp +++ b/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp @@ -227,15 +227,6 @@ static cl::list namespace { class WebAssemblyLowerEmscriptenEHSjLj final : public ModulePass { - static const char *ResumeFName; - static const char *EHTypeIDFName; - static const char *EmLongjmpFName; - static const char *EmLongjmpJmpbufFName; - static const char *SaveSetjmpFName; - static const char *TestSetjmpFName; - static const char *FindMatchingCatchPrefix; - static const char *InvokePrefix; - bool EnableEH; // Enable exception handling bool EnableSjLj; // Enable setjmp/longjmp handling @@ -274,6 +265,7 @@ class WebAssemblyLowerEmscriptenEHSjLj final : public ModulePass { bool areAllExceptionsAllowed() const { return EHWhitelistSet.empty(); } bool canLongjmp(Module &M, const Value *Callee) const; + bool isEmAsmCall(Module &M, const Value *Callee) const; void rebuildSSA(Function &F); @@ -292,19 +284,6 @@ public: }; } // End anonymous namespace -const char *WebAssemblyLowerEmscriptenEHSjLj::ResumeFName = "__resumeException"; -const char *WebAssemblyLowerEmscriptenEHSjLj::EHTypeIDFName = - "llvm_eh_typeid_for"; -const char *WebAssemblyLowerEmscriptenEHSjLj::EmLongjmpFName = - "emscripten_longjmp"; -const char *WebAssemblyLowerEmscriptenEHSjLj::EmLongjmpJmpbufFName = - "emscripten_longjmp_jmpbuf"; -const char *WebAssemblyLowerEmscriptenEHSjLj::SaveSetjmpFName = "saveSetjmp"; -const char *WebAssemblyLowerEmscriptenEHSjLj::TestSetjmpFName = 
"testSetjmp"; -const char *WebAssemblyLowerEmscriptenEHSjLj::FindMatchingCatchPrefix = - "__cxa_find_matching_catch_"; -const char *WebAssemblyLowerEmscriptenEHSjLj::InvokePrefix = "__invoke_"; - char WebAssemblyLowerEmscriptenEHSjLj::ID = 0; INITIALIZE_PASS(WebAssemblyLowerEmscriptenEHSjLj, DEBUG_TYPE, "WebAssembly Lower Emscripten Exceptions / Setjmp / Longjmp", @@ -335,7 +314,8 @@ static bool canThrow(const Value *V) { static GlobalVariable *getGlobalVariableI32(Module &M, IRBuilder<> &IRB, const char *Name) { - auto* GV = dyn_cast(M.getOrInsertGlobal(Name, IRB.getInt32Ty())); + auto *GV = + dyn_cast(M.getOrInsertGlobal(Name, IRB.getInt32Ty())); if (!GV) report_fatal_error(Twine("unable to create global: ") + Name); @@ -376,9 +356,9 @@ WebAssemblyLowerEmscriptenEHSjLj::getFindMatchingCatch(Module &M, PointerType *Int8PtrTy = Type::getInt8PtrTy(M.getContext()); SmallVector Args(NumClauses, Int8PtrTy); FunctionType *FTy = FunctionType::get(Int8PtrTy, Args, false); - Function *F = - Function::Create(FTy, GlobalValue::ExternalLinkage, - FindMatchingCatchPrefix + Twine(NumClauses + 2), &M); + Function *F = Function::Create( + FTy, GlobalValue::ExternalLinkage, + "__cxa_find_matching_catch_" + Twine(NumClauses + 2), &M); FindMatchingCatches[NumClauses] = F; return F; } @@ -418,7 +398,7 @@ Value *WebAssemblyLowerEmscriptenEHSjLj::wrapInvoke(CallOrInvoke *CI) { Args.append(CI->arg_begin(), CI->arg_end()); CallInst *NewCall = IRB.CreateCall(getInvokeWrapper(CI), Args); NewCall->takeName(CI); - NewCall->setCallingConv(CI->getCallingConv()); + NewCall->setCallingConv(CallingConv::WASM_EmscriptenInvoke); NewCall->setDebugLoc(CI->getDebugLoc()); // Because we added the pointer to the callee as first argument, all @@ -432,9 +412,22 @@ Value *WebAssemblyLowerEmscriptenEHSjLj::wrapInvoke(CallOrInvoke *CI) { for (unsigned I = 0, E = CI->getNumArgOperands(); I < E; ++I) ArgAttributes.push_back(InvokeAL.getParamAttributes(I)); + AttrBuilder FnAttrs(InvokeAL.getFnAttributes()); + if (FnAttrs.contains(Attribute::AllocSize)) { + // The allocsize attribute (if any) referes to parameters by index and needs + // to be adjusted. + unsigned SizeArg; + Optional NEltArg; + std::tie(SizeArg, NEltArg) = FnAttrs.getAllocSizeArgs(); + SizeArg += 1; + if (NEltArg.hasValue()) + NEltArg = NEltArg.getValue() + 1; + FnAttrs.addAllocSizeAttr(SizeArg, NEltArg); + } + // Reconstruct the AttributesList based on the vector we constructed. AttributeList NewCallAL = - AttributeList::get(C, InvokeAL.getFnAttributes(), + AttributeList::get(C, AttributeSet::get(C, FnAttrs), InvokeAL.getRetAttributes(), ArgAttributes); NewCall->setAttributes(NewCallAL); @@ -473,8 +466,8 @@ Function *WebAssemblyLowerEmscriptenEHSjLj::getInvokeWrapper(CallOrInvoke *CI) { FunctionType *FTy = FunctionType::get(CalleeFTy->getReturnType(), ArgTys, CalleeFTy->isVarArg()); - Function *F = Function::Create(FTy, GlobalValue::ExternalLinkage, - InvokePrefix + Sig, M); + Function *F = + Function::Create(FTy, GlobalValue::ExternalLinkage, "__invoke_" + Sig, M); InvokeWrappers[Sig] = F; return F; } @@ -491,39 +484,44 @@ bool WebAssemblyLowerEmscriptenEHSjLj::canLongjmp(Module &M, // and can't be passed by pointer. The result is a crash with illegal IR. if (isa(Callee)) return false; + StringRef CalleeName = Callee->getName(); // The reason we include malloc/free here is to exclude the malloc/free // calls generated in setjmp prep / cleanup routines. 
-  Function *SetjmpF = M.getFunction("setjmp");
-  Function *MallocF = M.getFunction("malloc");
-  Function *FreeF = M.getFunction("free");
-  if (Callee == SetjmpF || Callee == MallocF || Callee == FreeF)
+  if (CalleeName == "setjmp" || CalleeName == "malloc" || CalleeName == "free")
     return false;
   // There are functions in JS glue code
-  if (Callee == ResumeF || Callee == EHTypeIDF || Callee == SaveSetjmpF ||
-      Callee == TestSetjmpF)
+  if (CalleeName == "__resumeException" || CalleeName == "llvm_eh_typeid_for" ||
+      CalleeName == "saveSetjmp" || CalleeName == "testSetjmp" ||
+      CalleeName == "getTempRet0" || CalleeName == "setTempRet0")
     return false;
   // __cxa_find_matching_catch_N functions cannot longjmp
-  if (Callee->getName().startswith(FindMatchingCatchPrefix))
+  if (Callee->getName().startswith("__cxa_find_matching_catch_"))
     return false;
   // Exception-catching related functions
-  Function *BeginCatchF = M.getFunction("__cxa_begin_catch");
-  Function *EndCatchF = M.getFunction("__cxa_end_catch");
-  Function *AllocExceptionF = M.getFunction("__cxa_allocate_exception");
-  Function *ThrowF = M.getFunction("__cxa_throw");
-  Function *TerminateF = M.getFunction("__clang_call_terminate");
-  if (Callee == BeginCatchF || Callee == EndCatchF ||
-      Callee == AllocExceptionF || Callee == ThrowF || Callee == TerminateF ||
-      Callee == GetTempRet0Func || Callee == SetTempRet0Func)
+  if (CalleeName == "__cxa_begin_catch" || CalleeName == "__cxa_end_catch" ||
+      CalleeName == "__cxa_allocate_exception" || CalleeName == "__cxa_throw" ||
+      CalleeName == "__clang_call_terminate")
     return false;
   // Otherwise we don't know
   return true;
 }
+bool WebAssemblyLowerEmscriptenEHSjLj::isEmAsmCall(Module &M,
+                                                   const Value *Callee) const {
+  StringRef CalleeName = Callee->getName();
+  // This is an exhaustive list from Emscripten's <emscripten/em_asm.h>.
+  return CalleeName == "emscripten_asm_const_int" ||
+         CalleeName == "emscripten_asm_const_double" ||
+         CalleeName == "emscripten_asm_const_int_sync_on_main_thread" ||
+         CalleeName == "emscripten_asm_const_double_sync_on_main_thread" ||
+         CalleeName == "emscripten_asm_const_async_on_main_thread";
+}
+
 // Generate testSetjmp function call sequence with preamble and postamble.
// The code this generates is equivalent to the following JavaScript code: // if (%__THREW__.val != 0 & threwValue != 0) { @@ -605,15 +603,12 @@ void WebAssemblyLowerEmscriptenEHSjLj::rebuildSSA(Function &F) { SSAUpdater SSA; for (BasicBlock &BB : F) { for (Instruction &I : BB) { + SSA.Initialize(I.getType(), I.getName()); + SSA.AddAvailableValue(&BB, &I); for (auto UI = I.use_begin(), UE = I.use_end(); UI != UE;) { Use &U = *UI; ++UI; - SSA.Initialize(I.getType(), I.getName()); - SSA.AddAvailableValue(&BB, &I); auto *User = cast(U.getUser()); - if (User->getParent() == &BB) - continue; - if (auto *UserPN = dyn_cast(User)) if (UserPN->getIncomingBlock(U) == &BB) continue; @@ -660,13 +655,13 @@ bool WebAssemblyLowerEmscriptenEHSjLj::runOnModule(Module &M) { FunctionType *ResumeFTy = FunctionType::get(IRB.getVoidTy(), IRB.getInt8PtrTy(), false); ResumeF = Function::Create(ResumeFTy, GlobalValue::ExternalLinkage, - ResumeFName, &M); + "__resumeException", &M); // Register llvm_eh_typeid_for function FunctionType *EHTypeIDTy = FunctionType::get(IRB.getInt32Ty(), IRB.getInt8PtrTy(), false); EHTypeIDF = Function::Create(EHTypeIDTy, GlobalValue::ExternalLinkage, - EHTypeIDFName, &M); + "llvm_eh_typeid_for", &M); for (Function &F : M) { if (F.isDeclaration()) @@ -684,7 +679,7 @@ bool WebAssemblyLowerEmscriptenEHSjLj::runOnModule(Module &M) { // defined in JS code EmLongjmpJmpbufF = Function::Create(LongjmpF->getFunctionType(), GlobalValue::ExternalLinkage, - EmLongjmpJmpbufFName, &M); + "emscripten_longjmp_jmpbuf", &M); LongjmpF->replaceAllUsesWith(EmLongjmpJmpbufF); } @@ -697,19 +692,19 @@ bool WebAssemblyLowerEmscriptenEHSjLj::runOnModule(Module &M) { IRB.getInt32Ty()}; FunctionType *FTy = FunctionType::get(Type::getInt32PtrTy(C), Params, false); - SaveSetjmpF = Function::Create(FTy, GlobalValue::ExternalLinkage, - SaveSetjmpFName, &M); + SaveSetjmpF = + Function::Create(FTy, GlobalValue::ExternalLinkage, "saveSetjmp", &M); // Register testSetjmp function Params = {IRB.getInt32Ty(), Type::getInt32PtrTy(C), IRB.getInt32Ty()}; FTy = FunctionType::get(IRB.getInt32Ty(), Params, false); - TestSetjmpF = Function::Create(FTy, GlobalValue::ExternalLinkage, - TestSetjmpFName, &M); + TestSetjmpF = + Function::Create(FTy, GlobalValue::ExternalLinkage, "testSetjmp", &M); FTy = FunctionType::get(IRB.getVoidTy(), {IRB.getInt32Ty(), IRB.getInt32Ty()}, false); EmLongjmpF = Function::Create(FTy, GlobalValue::ExternalLinkage, - EmLongjmpFName, &M); + "emscripten_longjmp", &M); // Only traverse functions that uses setjmp in order not to insert // unnecessary prep / cleanup code in every function @@ -970,10 +965,16 @@ bool WebAssemblyLowerEmscriptenEHSjLj::runSjLjOnFunction(Function &F) { const Value *Callee = CI->getCalledValue(); if (!canLongjmp(M, Callee)) continue; + if (isEmAsmCall(M, Callee)) + report_fatal_error("Cannot use EM_ASM* alongside setjmp/longjmp in " + + F.getName() + + ". 
Please consider using EM_JS, or move the " + "EM_ASM into another function.", + false); Value *Threw = nullptr; BasicBlock *Tail; - if (Callee->getName().startswith(InvokePrefix)) { + if (Callee->getName().startswith("__invoke_")) { // If invoke wrapper has already been generated for this call in // previous EH phase, search for the load instruction // %__THREW__.val = __THREW__; diff --git a/lib/Target/WebAssembly/WebAssemblyLowerGlobalDtors.cpp b/lib/Target/WebAssembly/WebAssemblyLowerGlobalDtors.cpp index 494d3fadbc8c..750b2233e67a 100644 --- a/lib/Target/WebAssembly/WebAssemblyLowerGlobalDtors.cpp +++ b/lib/Target/WebAssembly/WebAssemblyLowerGlobalDtors.cpp @@ -94,7 +94,7 @@ bool LowerGlobalDtors::runOnModule(Module &M) { break; // Found a null terminator, skip the rest. Constant *Associated = CS->getOperand(2); - Associated = cast(Associated->stripPointerCastsNoFollowAliases()); + Associated = cast(Associated->stripPointerCasts()); DtorFuncs[PriorityValue][Associated].push_back(DtorFunc); } diff --git a/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp b/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp index 288b991ae2c5..59c10243c545 100644 --- a/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp +++ b/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp @@ -79,7 +79,7 @@ MCSymbol *WebAssemblyMCInstLower::GetExternalSymbolSymbol( // Clang-provided symbols. if (strcmp(Name, "__stack_pointer") == 0 || strcmp(Name, "__tls_base") == 0 || strcmp(Name, "__memory_base") == 0 || strcmp(Name, "__table_base") == 0 || - strcmp(Name, "__tls_size") == 0) { + strcmp(Name, "__tls_size") == 0 || strcmp(Name, "__tls_align") == 0) { bool Mutable = strcmp(Name, "__stack_pointer") == 0 || strcmp(Name, "__tls_base") == 0; WasmSym->setType(wasm::WASM_SYMBOL_TYPE_GLOBAL); @@ -115,7 +115,7 @@ MCSymbol *WebAssemblyMCInstLower::GetExternalSymbolSymbol( getLibcallSignature(Subtarget, Name, Returns, Params); } auto Signature = - make_unique(std::move(Returns), std::move(Params)); + std::make_unique(std::move(Returns), std::move(Params)); WasmSym->setSignature(Signature.get()); Printer.addSignature(std::move(Signature)); @@ -163,6 +163,21 @@ MCOperand WebAssemblyMCInstLower::lowerSymbolOperand(const MachineOperand &MO, return MCOperand::createExpr(Expr); } +MCOperand WebAssemblyMCInstLower::lowerTypeIndexOperand( + SmallVector &&Returns, + SmallVector &&Params) const { + auto Signature = std::make_unique(std::move(Returns), + std::move(Params)); + MCSymbol *Sym = Printer.createTempSymbol("typeindex"); + auto *WasmSym = cast(Sym); + WasmSym->setSignature(Signature.get()); + Printer.addSignature(std::move(Signature)); + WasmSym->setType(wasm::WASM_SYMBOL_TYPE_FUNCTION); + const MCExpr *Expr = + MCSymbolRefExpr::create(WasmSym, MCSymbolRefExpr::VK_WASM_TYPEINDEX, Ctx); + return MCOperand::createExpr(Expr); +} + // Return the WebAssembly type associated with the given register class. 
static wasm::ValType getType(const TargetRegisterClass *RC) { if (RC == &WebAssembly::I32RegClass) @@ -178,6 +193,16 @@ static wasm::ValType getType(const TargetRegisterClass *RC) { llvm_unreachable("Unexpected register class"); } +static void getFunctionReturns(const MachineInstr *MI, + SmallVectorImpl &Returns) { + const Function &F = MI->getMF()->getFunction(); + const TargetMachine &TM = MI->getMF()->getTarget(); + Type *RetTy = F.getReturnType(); + SmallVector CallerRetTys; + computeLegalValueVTs(F, TM, RetTy, CallerRetTys); + valTypesFromMVTs(CallerRetTys, Returns); +} + void WebAssemblyMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const { OutMI.setOpcode(MI->getOpcode()); @@ -208,8 +233,6 @@ void WebAssemblyMCInstLower::lower(const MachineInstr *MI, if (I < Desc.NumOperands) { const MCOperandInfo &Info = Desc.OpInfo[I]; if (Info.OperandType == WebAssembly::OPERAND_TYPEINDEX) { - MCSymbol *Sym = Printer.createTempSymbol("typeindex"); - SmallVector Returns; SmallVector Params; @@ -226,17 +249,23 @@ void WebAssemblyMCInstLower::lower(const MachineInstr *MI, if (WebAssembly::isCallIndirect(MI->getOpcode())) Params.pop_back(); - auto *WasmSym = cast(Sym); - auto Signature = make_unique(std::move(Returns), - std::move(Params)); - WasmSym->setSignature(Signature.get()); - Printer.addSignature(std::move(Signature)); - WasmSym->setType(wasm::WASM_SYMBOL_TYPE_FUNCTION); + // return_call_indirect instructions have the return type of the + // caller + if (MI->getOpcode() == WebAssembly::RET_CALL_INDIRECT) + getFunctionReturns(MI, Returns); - const MCExpr *Expr = MCSymbolRefExpr::create( - WasmSym, MCSymbolRefExpr::VK_WASM_TYPEINDEX, Ctx); - MCOp = MCOperand::createExpr(Expr); + MCOp = lowerTypeIndexOperand(std::move(Returns), std::move(Params)); break; + } else if (Info.OperandType == WebAssembly::OPERAND_SIGNATURE) { + auto BT = static_cast(MO.getImm()); + assert(BT != WebAssembly::BlockType::Invalid); + if (BT == WebAssembly::BlockType::Multivalue) { + SmallVector Returns; + getFunctionReturns(MI, Returns); + MCOp = lowerTypeIndexOperand(std::move(Returns), + SmallVector()); + break; + } } } MCOp = MCOperand::createImm(MO.getImm()); diff --git a/lib/Target/WebAssembly/WebAssemblyMCInstLower.h b/lib/Target/WebAssembly/WebAssemblyMCInstLower.h index 2c375a01a7f5..d79c54097eb7 100644 --- a/lib/Target/WebAssembly/WebAssemblyMCInstLower.h +++ b/lib/Target/WebAssembly/WebAssemblyMCInstLower.h @@ -15,6 +15,7 @@ #ifndef LLVM_LIB_TARGET_WEBASSEMBLY_WEBASSEMBLYMCINSTLOWER_H #define LLVM_LIB_TARGET_WEBASSEMBLY_WEBASSEMBLYMCINSTLOWER_H +#include "llvm/BinaryFormat/Wasm.h" #include "llvm/MC/MCInst.h" #include "llvm/Support/Compiler.h" @@ -33,6 +34,8 @@ class LLVM_LIBRARY_VISIBILITY WebAssemblyMCInstLower { MCSymbol *GetGlobalAddressSymbol(const MachineOperand &MO) const; MCSymbol *GetExternalSymbolSymbol(const MachineOperand &MO) const; MCOperand lowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym) const; + MCOperand lowerTypeIndexOperand(SmallVector &&, + SmallVector &&) const; public: WebAssemblyMCInstLower(MCContext &ctx, WebAssemblyAsmPrinter &printer) diff --git a/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.cpp b/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.cpp index d31c1226bfdb..e4cc2389147b 100644 --- a/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.cpp +++ b/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.cpp @@ -49,10 +49,12 @@ void llvm::computeSignatureVTs(const FunctionType *Ty, const Function &F, computeLegalValueVTs(F, TM, 
                               Ty->getReturnType(), Results);
   MVT PtrVT = MVT::getIntegerVT(TM.createDataLayout().getPointerSizeInBits());
-  if (Results.size() > 1) {
-    // WebAssembly currently can't lower returns of multiple values without
-    // demoting to sret (see WebAssemblyTargetLowering::CanLowerReturn). So
-    // replace multiple return values with a pointer parameter.
+  if (Results.size() > 1 &&
+      !TM.getSubtarget<WebAssemblySubtarget>(F).hasMultivalue()) {
+    // WebAssembly can't lower returns of multiple values without demoting to
+    // sret unless multivalue is enabled (see
+    // WebAssemblyTargetLowering::CanLowerReturn). So replace multiple return
+    // values with a pointer parameter.
     Results.clear();
     Params.push_back(PtrVT);
   }
@@ -72,7 +74,7 @@ void llvm::valTypesFromMVTs(const ArrayRef<MVT> &In,
 std::unique_ptr<wasm::WasmSignature>
 llvm::signatureFromMVTs(const SmallVectorImpl<MVT> &Results,
                         const SmallVectorImpl<MVT> &Params) {
-  auto Sig = make_unique<wasm::WasmSignature>();
+  auto Sig = std::make_unique<wasm::WasmSignature>();
   valTypesFromMVTs(Results, Sig->Returns);
   valTypesFromMVTs(Params, Sig->Params);
   return Sig;
diff --git a/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h b/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h
index 4b9ba491dee6..16e2f4392984 100644
--- a/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h
+++ b/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h
@@ -96,13 +96,18 @@ public:
   void stackifyVReg(unsigned VReg) {
     assert(MF.getRegInfo().getUniqueVRegDef(VReg));
-    auto I = TargetRegisterInfo::virtReg2Index(VReg);
+    auto I = Register::virtReg2Index(VReg);
     if (I >= VRegStackified.size())
       VRegStackified.resize(I + 1);
     VRegStackified.set(I);
   }
+  void unstackifyVReg(unsigned VReg) {
+    auto I = Register::virtReg2Index(VReg);
+    if (I < VRegStackified.size())
+      VRegStackified.reset(I);
+  }
   bool isVRegStackified(unsigned VReg) const {
-    auto I = TargetRegisterInfo::virtReg2Index(VReg);
+    auto I = Register::virtReg2Index(VReg);
     if (I >= VRegStackified.size())
       return false;
     return VRegStackified.test(I);
@@ -111,12 +116,12 @@ public:
   void initWARegs();
   void setWAReg(unsigned VReg, unsigned WAReg) {
     assert(WAReg != UnusedReg);
-    auto I = TargetRegisterInfo::virtReg2Index(VReg);
+    auto I = Register::virtReg2Index(VReg);
     assert(I < WARegs.size());
     WARegs[I] = WAReg;
   }
   unsigned getWAReg(unsigned VReg) const {
-    auto I = TargetRegisterInfo::virtReg2Index(VReg);
+    auto I = Register::virtReg2Index(VReg);
     assert(I < WARegs.size());
     return WARegs[I];
   }
diff --git a/lib/Target/WebAssembly/WebAssemblyMemIntrinsicResults.cpp b/lib/Target/WebAssembly/WebAssemblyMemIntrinsicResults.cpp
index 7ac0511c28b0..ac428fcc826a 100644
--- a/lib/Target/WebAssembly/WebAssemblyMemIntrinsicResults.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyMemIntrinsicResults.cpp
@@ -166,8 +166,8 @@ static bool optimizeCall(MachineBasicBlock &MBB, MachineInstr &MI,
   if (!LibInfo.getLibFunc(Name, Func))
     return false;
-  unsigned FromReg = MI.getOperand(2).getReg();
-  unsigned ToReg = MI.getOperand(0).getReg();
+  Register FromReg = MI.getOperand(2).getReg();
+  Register ToReg = MI.getOperand(0).getReg();
   if (MRI.getRegClass(FromReg) != MRI.getRegClass(ToReg))
     report_fatal_error("Memory Intrinsic results: call to builtin function "
                        "with wrong signature, from/to mismatch");
@@ -184,7 +184,8 @@ bool WebAssemblyMemIntrinsicResults::runOnMachineFunction(MachineFunction &MF) {
   auto &MDT = getAnalysis<MachineDominatorTree>();
   const WebAssemblyTargetLowering &TLI =
       *MF.getSubtarget<WebAssemblySubtarget>().getTargetLowering();
-  const auto &LibInfo = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+  const auto &LibInfo =
+      getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(MF.getFunction());
   auto &LIS = getAnalysis<LiveIntervals>();
bool Changed = false; diff --git a/lib/Target/WebAssembly/WebAssemblyOptimizeLiveIntervals.cpp b/lib/Target/WebAssembly/WebAssemblyOptimizeLiveIntervals.cpp index 8c7c3305c201..0bd30791e57c 100644 --- a/lib/Target/WebAssembly/WebAssemblyOptimizeLiveIntervals.cpp +++ b/lib/Target/WebAssembly/WebAssemblyOptimizeLiveIntervals.cpp @@ -81,7 +81,7 @@ bool WebAssemblyOptimizeLiveIntervals::runOnMachineFunction( // Split multiple-VN LiveIntervals into multiple LiveIntervals. SmallVector SplitLIs; for (unsigned I = 0, E = MRI.getNumVirtRegs(); I < E; ++I) { - unsigned Reg = TargetRegisterInfo::index2VirtReg(I); + unsigned Reg = Register::index2VirtReg(I); if (MRI.reg_nodbg_empty(Reg)) continue; diff --git a/lib/Target/WebAssembly/WebAssemblyOptimizeReturned.cpp b/lib/Target/WebAssembly/WebAssemblyOptimizeReturned.cpp index d20352259e07..9b60596e42b4 100644 --- a/lib/Target/WebAssembly/WebAssemblyOptimizeReturned.cpp +++ b/lib/Target/WebAssembly/WebAssemblyOptimizeReturned.cpp @@ -64,11 +64,8 @@ void OptimizeReturned::visitCallSite(CallSite CS) { if (isa(Arg)) continue; // Like replaceDominatedUsesWith but using Instruction/Use dominance. - for (auto UI = Arg->use_begin(), UE = Arg->use_end(); UI != UE;) { - Use &U = *UI++; - if (DT->dominates(Inst, U)) - U.set(Inst); - } + Arg->replaceUsesWithIf(Inst, + [&](Use &U) { return DT->dominates(Inst, U); }); } } diff --git a/lib/Target/WebAssembly/WebAssemblyPeephole.cpp b/lib/Target/WebAssembly/WebAssemblyPeephole.cpp index e11cdeaa0e79..ea6cd09a604c 100644 --- a/lib/Target/WebAssembly/WebAssemblyPeephole.cpp +++ b/lib/Target/WebAssembly/WebAssemblyPeephole.cpp @@ -63,7 +63,7 @@ static bool maybeRewriteToDrop(unsigned OldReg, unsigned NewReg, bool Changed = false; if (OldReg == NewReg) { Changed = true; - unsigned NewReg = MRI.createVirtualRegister(MRI.getRegClass(OldReg)); + Register NewReg = MRI.createVirtualRegister(MRI.getRegClass(OldReg)); MO.setReg(NewReg); MO.setIsDead(); MFI.stackifyVReg(NewReg); @@ -75,9 +75,7 @@ static bool maybeRewriteToFallthrough(MachineInstr &MI, MachineBasicBlock &MBB, const MachineFunction &MF, WebAssemblyFunctionInfo &MFI, MachineRegisterInfo &MRI, - const WebAssemblyInstrInfo &TII, - unsigned FallthroughOpc, - unsigned CopyLocalOpc) { + const WebAssemblyInstrInfo &TII) { if (DisableWebAssemblyFallthroughReturnOpt) return false; if (&MBB != &MF.back()) @@ -90,13 +88,36 @@ static bool maybeRewriteToFallthrough(MachineInstr &MI, MachineBasicBlock &MBB, if (&MI != &*End) return false; - if (FallthroughOpc != WebAssembly::FALLTHROUGH_RETURN_VOID) { - // If the operand isn't stackified, insert a COPY to read the operand and - // stackify it. - MachineOperand &MO = MI.getOperand(0); - unsigned Reg = MO.getReg(); + for (auto &MO : MI.explicit_operands()) { + // If the operand isn't stackified, insert a COPY to read the operands and + // stackify them. 
+ Register Reg = MO.getReg(); if (!MFI.isVRegStackified(Reg)) { - unsigned NewReg = MRI.createVirtualRegister(MRI.getRegClass(Reg)); + unsigned CopyLocalOpc; + const TargetRegisterClass *RegClass = MRI.getRegClass(Reg); + switch (RegClass->getID()) { + case WebAssembly::I32RegClassID: + CopyLocalOpc = WebAssembly::COPY_I32; + break; + case WebAssembly::I64RegClassID: + CopyLocalOpc = WebAssembly::COPY_I64; + break; + case WebAssembly::F32RegClassID: + CopyLocalOpc = WebAssembly::COPY_F32; + break; + case WebAssembly::F64RegClassID: + CopyLocalOpc = WebAssembly::COPY_F64; + break; + case WebAssembly::V128RegClassID: + CopyLocalOpc = WebAssembly::COPY_V128; + break; + case WebAssembly::EXNREFRegClassID: + CopyLocalOpc = WebAssembly::COPY_EXNREF; + break; + default: + llvm_unreachable("Unexpected register class for return operand"); + } + Register NewReg = MRI.createVirtualRegister(RegClass); BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(CopyLocalOpc), NewReg) .addReg(Reg); MO.setReg(NewReg); @@ -104,8 +125,7 @@ static bool maybeRewriteToFallthrough(MachineInstr &MI, MachineBasicBlock &MBB, } } - // Rewrite the return. - MI.setDesc(TII.get(FallthroughOpc)); + MI.setDesc(TII.get(WebAssembly::FALLTHROUGH_RETURN)); return true; } @@ -120,7 +140,8 @@ bool WebAssemblyPeephole::runOnMachineFunction(MachineFunction &MF) { const auto &TII = *MF.getSubtarget().getInstrInfo(); const WebAssemblyTargetLowering &TLI = *MF.getSubtarget().getTargetLowering(); - auto &LibInfo = getAnalysis().getTLI(); + auto &LibInfo = + getAnalysis().getTLI(MF.getFunction()); bool Changed = false; for (auto &MBB : MF) @@ -143,8 +164,8 @@ bool WebAssemblyPeephole::runOnMachineFunction(MachineFunction &MF) { report_fatal_error("Peephole: call to builtin function with " "wrong signature, not consuming reg"); MachineOperand &MO = MI.getOperand(0); - unsigned OldReg = MO.getReg(); - unsigned NewReg = Op2.getReg(); + Register OldReg = MO.getReg(); + Register NewReg = Op2.getReg(); if (MRI.getRegClass(NewReg) != MRI.getRegClass(OldReg)) report_fatal_error("Peephole: call to builtin function with " @@ -156,60 +177,8 @@ bool WebAssemblyPeephole::runOnMachineFunction(MachineFunction &MF) { break; } // Optimize away an explicit void return at the end of the function. 
- case WebAssembly::RETURN_I32: - Changed |= maybeRewriteToFallthrough( - MI, MBB, MF, MFI, MRI, TII, WebAssembly::FALLTHROUGH_RETURN_I32, - WebAssembly::COPY_I32); - break; - case WebAssembly::RETURN_I64: - Changed |= maybeRewriteToFallthrough( - MI, MBB, MF, MFI, MRI, TII, WebAssembly::FALLTHROUGH_RETURN_I64, - WebAssembly::COPY_I64); - break; - case WebAssembly::RETURN_F32: - Changed |= maybeRewriteToFallthrough( - MI, MBB, MF, MFI, MRI, TII, WebAssembly::FALLTHROUGH_RETURN_F32, - WebAssembly::COPY_F32); - break; - case WebAssembly::RETURN_F64: - Changed |= maybeRewriteToFallthrough( - MI, MBB, MF, MFI, MRI, TII, WebAssembly::FALLTHROUGH_RETURN_F64, - WebAssembly::COPY_F64); - break; - case WebAssembly::RETURN_v16i8: - Changed |= maybeRewriteToFallthrough( - MI, MBB, MF, MFI, MRI, TII, WebAssembly::FALLTHROUGH_RETURN_v16i8, - WebAssembly::COPY_V128); - break; - case WebAssembly::RETURN_v8i16: - Changed |= maybeRewriteToFallthrough( - MI, MBB, MF, MFI, MRI, TII, WebAssembly::FALLTHROUGH_RETURN_v8i16, - WebAssembly::COPY_V128); - break; - case WebAssembly::RETURN_v4i32: - Changed |= maybeRewriteToFallthrough( - MI, MBB, MF, MFI, MRI, TII, WebAssembly::FALLTHROUGH_RETURN_v4i32, - WebAssembly::COPY_V128); - break; - case WebAssembly::RETURN_v2i64: - Changed |= maybeRewriteToFallthrough( - MI, MBB, MF, MFI, MRI, TII, WebAssembly::FALLTHROUGH_RETURN_v2i64, - WebAssembly::COPY_V128); - break; - case WebAssembly::RETURN_v4f32: - Changed |= maybeRewriteToFallthrough( - MI, MBB, MF, MFI, MRI, TII, WebAssembly::FALLTHROUGH_RETURN_v4f32, - WebAssembly::COPY_V128); - break; - case WebAssembly::RETURN_v2f64: - Changed |= maybeRewriteToFallthrough( - MI, MBB, MF, MFI, MRI, TII, WebAssembly::FALLTHROUGH_RETURN_v2f64, - WebAssembly::COPY_V128); - break; - case WebAssembly::RETURN_VOID: - Changed |= maybeRewriteToFallthrough( - MI, MBB, MF, MFI, MRI, TII, WebAssembly::FALLTHROUGH_RETURN_VOID, - WebAssembly::INSTRUCTION_LIST_END); + case WebAssembly::RETURN: + Changed |= maybeRewriteToFallthrough(MI, MBB, MF, MFI, MRI, TII); break; } diff --git a/lib/Target/WebAssembly/WebAssemblyPrepareForLiveIntervals.cpp b/lib/Target/WebAssembly/WebAssemblyPrepareForLiveIntervals.cpp index 3bfbf607344d..799b9388097c 100644 --- a/lib/Target/WebAssembly/WebAssemblyPrepareForLiveIntervals.cpp +++ b/lib/Target/WebAssembly/WebAssemblyPrepareForLiveIntervals.cpp @@ -95,7 +95,7 @@ bool WebAssemblyPrepareForLiveIntervals::runOnMachineFunction( // TODO: This is fairly heavy-handed; find a better approach. // for (unsigned I = 0, E = MRI.getNumVirtRegs(); I < E; ++I) { - unsigned Reg = TargetRegisterInfo::index2VirtReg(I); + unsigned Reg = Register::index2VirtReg(I); // Skip unused registers. if (MRI.use_nodbg_empty(Reg)) diff --git a/lib/Target/WebAssembly/WebAssemblyRegColoring.cpp b/lib/Target/WebAssembly/WebAssemblyRegColoring.cpp index 6f09c45b6642..043b6f1b7d18 100644 --- a/lib/Target/WebAssembly/WebAssemblyRegColoring.cpp +++ b/lib/Target/WebAssembly/WebAssemblyRegColoring.cpp @@ -98,7 +98,7 @@ bool WebAssemblyRegColoring::runOnMachineFunction(MachineFunction &MF) { LLVM_DEBUG(dbgs() << "Interesting register intervals:\n"); for (unsigned I = 0; I < NumVRegs; ++I) { - unsigned VReg = TargetRegisterInfo::index2VirtReg(I); + unsigned VReg = Register::index2VirtReg(I); if (MFI.isVRegStackified(VReg)) continue; // Skip unused registers, which can use $drop. 
@@ -157,9 +157,8 @@ bool WebAssemblyRegColoring::runOnMachineFunction(MachineFunction &MF) { Changed |= Old != New; UsedColors.set(Color); Assignments[Color].push_back(LI); - LLVM_DEBUG( - dbgs() << "Assigning vreg" << TargetRegisterInfo::virtReg2Index(LI->reg) - << " to vreg" << TargetRegisterInfo::virtReg2Index(New) << "\n"); + LLVM_DEBUG(dbgs() << "Assigning vreg" << Register::virtReg2Index(LI->reg) + << " to vreg" << Register::virtReg2Index(New) << "\n"); } if (!Changed) return false; diff --git a/lib/Target/WebAssembly/WebAssemblyRegNumbering.cpp b/lib/Target/WebAssembly/WebAssemblyRegNumbering.cpp index cdca23f55b29..72e7a7cf5042 100644 --- a/lib/Target/WebAssembly/WebAssemblyRegNumbering.cpp +++ b/lib/Target/WebAssembly/WebAssemblyRegNumbering.cpp @@ -89,7 +89,7 @@ bool WebAssemblyRegNumbering::runOnMachineFunction(MachineFunction &MF) { // Start the numbering for locals after the arg regs unsigned CurReg = MFI.getParams().size(); for (unsigned VRegIdx = 0; VRegIdx < NumVRegs; ++VRegIdx) { - unsigned VReg = TargetRegisterInfo::index2VirtReg(VRegIdx); + unsigned VReg = Register::index2VirtReg(VRegIdx); // Skip unused registers. if (MRI.use_empty(VReg)) continue; diff --git a/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp b/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp index a120a6471014..421d353a89e8 100644 --- a/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp +++ b/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp @@ -120,7 +120,7 @@ static void convertImplicitDefToConstZero(MachineInstr *MI, Type::getDoubleTy(MF.getFunction().getContext()))); MI->addOperand(MachineOperand::CreateFPImm(Val)); } else if (RegClass == &WebAssembly::V128RegClass) { - unsigned TempReg = MRI.createVirtualRegister(&WebAssembly::I32RegClass); + Register TempReg = MRI.createVirtualRegister(&WebAssembly::I32RegClass); MI->setDesc(TII->get(WebAssembly::SPLAT_v4i32)); MI->addOperand(MachineOperand::CreateReg(TempReg, false)); MachineInstr *Const = BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), @@ -334,14 +334,14 @@ static bool isSafeToMove(const MachineInstr *Def, const MachineInstr *Insert, for (const MachineOperand &MO : Def->operands()) { if (!MO.isReg() || MO.isUndef()) continue; - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); // If the register is dead here and at Insert, ignore it. if (MO.isDead() && Insert->definesRegister(Reg) && !Insert->readsRegister(Reg)) continue; - if (TargetRegisterInfo::isPhysicalRegister(Reg)) { + if (Register::isPhysicalRegister(Reg)) { // Ignore ARGUMENTS; it's just used to keep the ARGUMENT_* instructions // from moving down, and we've already checked for that. if (Reg == WebAssembly::ARGUMENTS) @@ -436,8 +436,8 @@ static bool oneUseDominatesOtherUses(unsigned Reg, const MachineOperand &OneUse, const MachineOperand &MO = UseInst->getOperand(0); if (!MO.isReg()) return false; - unsigned DefReg = MO.getReg(); - if (!TargetRegisterInfo::isVirtualRegister(DefReg) || + Register DefReg = MO.getReg(); + if (!Register::isVirtualRegister(DefReg) || !MFI.isVRegStackified(DefReg)) return false; assert(MRI.hasOneNonDBGUse(DefReg)); @@ -499,7 +499,7 @@ static MachineInstr *moveForSingleUse(unsigned Reg, MachineOperand &Op, } else { // The register may have unrelated uses or defs; create a new register for // just our one def and use so that we can stackify it. 
- unsigned NewReg = MRI.createVirtualRegister(MRI.getRegClass(Reg)); + Register NewReg = MRI.createVirtualRegister(MRI.getRegClass(Reg)); Def->getOperand(0).setReg(NewReg); Op.setReg(NewReg); @@ -535,7 +535,7 @@ static MachineInstr *rematerializeCheapDef( WebAssemblyDebugValueManager DefDIs(&Def); - unsigned NewReg = MRI.createVirtualRegister(MRI.getRegClass(Reg)); + Register NewReg = MRI.createVirtualRegister(MRI.getRegClass(Reg)); TII->reMaterialize(MBB, Insert, NewReg, 0, Def, *TRI); Op.setReg(NewReg); MachineInstr *Clone = &*std::prev(Insert); @@ -607,8 +607,8 @@ static MachineInstr *moveAndTeeForMultiUse( // Create the Tee and attach the registers. const auto *RegClass = MRI.getRegClass(Reg); - unsigned TeeReg = MRI.createVirtualRegister(RegClass); - unsigned DefReg = MRI.createVirtualRegister(RegClass); + Register TeeReg = MRI.createVirtualRegister(RegClass); + Register DefReg = MRI.createVirtualRegister(RegClass); MachineOperand &DefMO = Def->getOperand(0); MachineInstr *Tee = BuildMI(MBB, Insert, Insert->getDebugLoc(), TII->get(getTeeOpcode(RegClass)), TeeReg) @@ -807,11 +807,11 @@ bool WebAssemblyRegStackify::runOnMachineFunction(MachineFunction &MF) { if (!Op.isReg()) continue; - unsigned Reg = Op.getReg(); + Register Reg = Op.getReg(); assert(Op.isUse() && "explicit_uses() should only iterate over uses"); assert(!Op.isImplicit() && "explicit_uses() should only iterate over explicit operands"); - if (TargetRegisterInfo::isPhysicalRegister(Reg)) + if (Register::isPhysicalRegister(Reg)) continue; // Identify the definition for this register at this point. @@ -915,7 +915,7 @@ bool WebAssemblyRegStackify::runOnMachineFunction(MachineFunction &MF) { for (MachineOperand &MO : reverse(MI.explicit_operands())) { if (!MO.isReg()) continue; - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); if (MFI.isVRegStackified(Reg)) { if (MO.isDef()) diff --git a/lib/Target/WebAssembly/WebAssemblyRegisterInfo.cpp b/lib/Target/WebAssembly/WebAssemblyRegisterInfo.cpp index ea9cfc00adfd..789a025794ea 100644 --- a/lib/Target/WebAssembly/WebAssemblyRegisterInfo.cpp +++ b/lib/Target/WebAssembly/WebAssemblyRegisterInfo.cpp @@ -91,8 +91,8 @@ void WebAssemblyRegisterInfo::eliminateFrameIndex( if (MI.getOpcode() == WebAssembly::ADD_I32) { MachineOperand &OtherMO = MI.getOperand(3 - FIOperandNum); if (OtherMO.isReg()) { - unsigned OtherMOReg = OtherMO.getReg(); - if (TargetRegisterInfo::isVirtualRegister(OtherMOReg)) { + Register OtherMOReg = OtherMO.getReg(); + if (Register::isVirtualRegister(OtherMOReg)) { MachineInstr *Def = MF.getRegInfo().getUniqueVRegDef(OtherMOReg); // TODO: For now we just opportunistically do this in the case where // the CONST_I32 happens to have exactly one def and one use. We @@ -117,7 +117,7 @@ void WebAssemblyRegisterInfo::eliminateFrameIndex( // Create i32.add SP, offset and make it the operand. 
const TargetRegisterClass *PtrRC = MRI.getTargetRegisterInfo()->getPointerRegClass(MF); - unsigned OffsetOp = MRI.createVirtualRegister(PtrRC); + Register OffsetOp = MRI.createVirtualRegister(PtrRC); BuildMI(MBB, *II, II->getDebugLoc(), TII->get(WebAssembly::CONST_I32), OffsetOp) .addImm(FrameOffset); diff --git a/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp b/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp index 7e65368e671a..bdf5fe2620a4 100644 --- a/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp +++ b/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp @@ -140,7 +140,7 @@ WebAssemblyTargetMachine::getSubtargetImpl(std::string CPU, std::string FS) const { auto &I = SubtargetMap[CPU + FS]; if (!I) { - I = llvm::make_unique(TargetTriple, CPU, FS, *this); + I = std::make_unique(TargetTriple, CPU, FS, *this); } return I.get(); } diff --git a/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp b/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp index 46ef765ce0f4..1c53e90daea7 100644 --- a/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp +++ b/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp @@ -25,10 +25,11 @@ WebAssemblyTTIImpl::getPopcntSupport(unsigned TyWidth) const { return TargetTransformInfo::PSK_FastHardware; } -unsigned WebAssemblyTTIImpl::getNumberOfRegisters(bool Vector) { - unsigned Result = BaseT::getNumberOfRegisters(Vector); +unsigned WebAssemblyTTIImpl::getNumberOfRegisters(unsigned ClassID) const { + unsigned Result = BaseT::getNumberOfRegisters(ClassID); // For SIMD, use at least 16 registers, as a rough guess. + bool Vector = (ClassID == 1); if (Vector) Result = std::max(Result, 16u); diff --git a/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h b/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h index 1b11b4b631eb..f0ecc73e91de 100644 --- a/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h +++ b/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h @@ -53,7 +53,7 @@ public: /// \name Vector TTI Implementations /// @{ - unsigned getNumberOfRegisters(bool Vector); + unsigned getNumberOfRegisters(unsigned ClassID) const; unsigned getRegisterBitWidth(bool Vector) const; unsigned getArithmeticInstrCost( unsigned Opcode, Type *Ty, diff --git a/lib/Target/WebAssembly/WebAssemblyUtilities.cpp b/lib/Target/WebAssembly/WebAssemblyUtilities.cpp index e9d88d4818a5..a237da8154ab 100644 --- a/lib/Target/WebAssembly/WebAssemblyUtilities.cpp +++ b/lib/Target/WebAssembly/WebAssemblyUtilities.cpp @@ -32,9 +32,8 @@ bool WebAssembly::isChild(const MachineInstr &MI, const MachineOperand &MO = MI.getOperand(0); if (!MO.isReg() || MO.isImplicit() || !MO.isDef()) return false; - unsigned Reg = MO.getReg(); - return TargetRegisterInfo::isVirtualRegister(Reg) && - MFI.isVRegStackified(Reg); + Register Reg = MO.getReg(); + return Register::isVirtualRegister(Reg) && MFI.isVRegStackified(Reg); } bool WebAssembly::mayThrow(const MachineInstr &MI) { @@ -51,7 +50,21 @@ bool WebAssembly::mayThrow(const MachineInstr &MI) { return false; const MachineOperand &MO = MI.getOperand(getCalleeOpNo(MI.getOpcode())); - assert(MO.isGlobal()); + assert(MO.isGlobal() || MO.isSymbol()); + + if (MO.isSymbol()) { + // Some intrinsics are lowered to calls to external symbols, which are then + // lowered to calls to library functions. Most of libcalls don't throw, but + // we only list some of them here now. + // TODO Consider adding 'nounwind' info in TargetLowering::CallLoweringInfo + // instead for more accurate info. 
+ const char *Name = MO.getSymbolName(); + if (strcmp(Name, "memcpy") == 0 || strcmp(Name, "memmove") == 0 || + strcmp(Name, "memset") == 0) + return false; + return true; + } + const auto *F = dyn_cast(MO.getGlobal()); if (!F) return true; diff --git a/lib/Target/X86/AsmParser/X86AsmParser.cpp b/lib/Target/X86/AsmParser/X86AsmParser.cpp index 95cbf46d37ed..25be79ec2b1e 100644 --- a/lib/Target/X86/AsmParser/X86AsmParser.cpp +++ b/lib/Target/X86/AsmParser/X86AsmParser.cpp @@ -870,6 +870,14 @@ private: bool parseDirectiveFPOEndProc(SMLoc L); bool parseDirectiveFPOData(SMLoc L); + /// SEH directives. + bool parseSEHRegisterNumber(unsigned RegClassID, unsigned &RegNo); + bool parseDirectiveSEHPushReg(SMLoc); + bool parseDirectiveSEHSetFrame(SMLoc); + bool parseDirectiveSEHSaveReg(SMLoc); + bool parseDirectiveSEHSaveXMM(SMLoc); + bool parseDirectiveSEHPushFrame(SMLoc); + unsigned checkTargetMatchPredicate(MCInst &Inst) override; bool validateInstruction(MCInst &Inst, const OperandVector &Ops); @@ -955,6 +963,8 @@ private: public: enum X86MatchResultTy { Match_Unsupported = FIRST_TARGET_MATCH_RESULT_TY, +#define GET_OPERAND_DIAGNOSTIC_TYPES +#include "X86GenAsmMatcher.inc" }; X86AsmParser(const MCSubtargetInfo &sti, MCAsmParser &Parser, @@ -3173,6 +3183,13 @@ bool X86AsmParser::MatchAndEmitATTInstruction(SMLoc IDLoc, unsigned &Opcode, EmitInstruction(Inst, Operands, Out); Opcode = Inst.getOpcode(); return false; + case Match_InvalidImmUnsignedi4: { + SMLoc ErrorLoc = ((X86Operand &)*Operands[ErrorInfo]).getStartLoc(); + if (ErrorLoc == SMLoc()) + ErrorLoc = IDLoc; + return Error(ErrorLoc, "immediate must be an integer in range [0, 15]", + EmptyRange, MatchingInlineAsm); + } case Match_MissingFeature: return ErrorMissingFeature(IDLoc, MissingFeatures, MatchingInlineAsm); case Match_InvalidOperand: @@ -3520,6 +3537,15 @@ bool X86AsmParser::MatchAndEmitIntelInstruction(SMLoc IDLoc, unsigned &Opcode, MatchingInlineAsm); } + if (std::count(std::begin(Match), std::end(Match), + Match_InvalidImmUnsignedi4) == 1) { + SMLoc ErrorLoc = ((X86Operand &)*Operands[ErrorInfo]).getStartLoc(); + if (ErrorLoc == SMLoc()) + ErrorLoc = IDLoc; + return Error(ErrorLoc, "immediate must be an integer in range [0, 15]", + EmptyRange, MatchingInlineAsm); + } + // If all of these were an outright failure, report it in a useless way. return Error(IDLoc, "unknown instruction mnemonic", EmptyRange, MatchingInlineAsm); @@ -3572,6 +3598,16 @@ bool X86AsmParser::ParseDirective(AsmToken DirectiveID) { return parseDirectiveFPOEndPrologue(DirectiveID.getLoc()); else if (IDVal == ".cv_fpo_endproc") return parseDirectiveFPOEndProc(DirectiveID.getLoc()); + else if (IDVal == ".seh_pushreg") + return parseDirectiveSEHPushReg(DirectiveID.getLoc()); + else if (IDVal == ".seh_setframe") + return parseDirectiveSEHSetFrame(DirectiveID.getLoc()); + else if (IDVal == ".seh_savereg") + return parseDirectiveSEHSaveReg(DirectiveID.getLoc()); + else if (IDVal == ".seh_savexmm") + return parseDirectiveSEHSaveXMM(DirectiveID.getLoc()); + else if (IDVal == ".seh_pushframe") + return parseDirectiveSEHPushFrame(DirectiveID.getLoc()); return true; } @@ -3708,6 +3744,140 @@ bool X86AsmParser::parseDirectiveFPOEndProc(SMLoc L) { return getTargetStreamer().emitFPOEndProc(L); } +bool X86AsmParser::parseSEHRegisterNumber(unsigned RegClassID, + unsigned &RegNo) { + SMLoc startLoc = getLexer().getLoc(); + const MCRegisterInfo *MRI = getContext().getRegisterInfo(); + + // Try parsing the argument as a register first. 
+ if (getLexer().getTok().isNot(AsmToken::Integer)) { + SMLoc endLoc; + if (ParseRegister(RegNo, startLoc, endLoc)) + return true; + + if (!X86MCRegisterClasses[RegClassID].contains(RegNo)) { + return Error(startLoc, + "register is not supported for use with this directive"); + } + } else { + // Otherwise, an integer number matching the encoding of the desired + // register may appear. + int64_t EncodedReg; + if (getParser().parseAbsoluteExpression(EncodedReg)) + return true; + + // The SEH register number is the same as the encoding register number. Map + // from the encoding back to the LLVM register number. + RegNo = 0; + for (MCPhysReg Reg : X86MCRegisterClasses[RegClassID]) { + if (MRI->getEncodingValue(Reg) == EncodedReg) { + RegNo = Reg; + break; + } + } + if (RegNo == 0) { + return Error(startLoc, + "incorrect register number for use with this directive"); + } + } + + return false; +} + +bool X86AsmParser::parseDirectiveSEHPushReg(SMLoc Loc) { + unsigned Reg = 0; + if (parseSEHRegisterNumber(X86::GR64RegClassID, Reg)) + return true; + + if (getLexer().isNot(AsmToken::EndOfStatement)) + return TokError("unexpected token in directive"); + + getParser().Lex(); + getStreamer().EmitWinCFIPushReg(Reg, Loc); + return false; +} + +bool X86AsmParser::parseDirectiveSEHSetFrame(SMLoc Loc) { + unsigned Reg = 0; + int64_t Off; + if (parseSEHRegisterNumber(X86::GR64RegClassID, Reg)) + return true; + if (getLexer().isNot(AsmToken::Comma)) + return TokError("you must specify a stack pointer offset"); + + getParser().Lex(); + if (getParser().parseAbsoluteExpression(Off)) + return true; + + if (getLexer().isNot(AsmToken::EndOfStatement)) + return TokError("unexpected token in directive"); + + getParser().Lex(); + getStreamer().EmitWinCFISetFrame(Reg, Off, Loc); + return false; +} + +bool X86AsmParser::parseDirectiveSEHSaveReg(SMLoc Loc) { + unsigned Reg = 0; + int64_t Off; + if (parseSEHRegisterNumber(X86::GR64RegClassID, Reg)) + return true; + if (getLexer().isNot(AsmToken::Comma)) + return TokError("you must specify an offset on the stack"); + + getParser().Lex(); + if (getParser().parseAbsoluteExpression(Off)) + return true; + + if (getLexer().isNot(AsmToken::EndOfStatement)) + return TokError("unexpected token in directive"); + + getParser().Lex(); + getStreamer().EmitWinCFISaveReg(Reg, Off, Loc); + return false; +} + +bool X86AsmParser::parseDirectiveSEHSaveXMM(SMLoc Loc) { + unsigned Reg = 0; + int64_t Off; + if (parseSEHRegisterNumber(X86::VR128XRegClassID, Reg)) + return true; + if (getLexer().isNot(AsmToken::Comma)) + return TokError("you must specify an offset on the stack"); + + getParser().Lex(); + if (getParser().parseAbsoluteExpression(Off)) + return true; + + if (getLexer().isNot(AsmToken::EndOfStatement)) + return TokError("unexpected token in directive"); + + getParser().Lex(); + getStreamer().EmitWinCFISaveXMM(Reg, Off, Loc); + return false; +} + +bool X86AsmParser::parseDirectiveSEHPushFrame(SMLoc Loc) { + bool Code = false; + StringRef CodeID; + if (getLexer().is(AsmToken::At)) { + SMLoc startLoc = getLexer().getLoc(); + getParser().Lex(); + if (!getParser().parseIdentifier(CodeID)) { + if (CodeID != "code") + return Error(startLoc, "expected @code"); + Code = true; + } + } + + if (getLexer().isNot(AsmToken::EndOfStatement)) + return TokError("unexpected token in directive"); + + getParser().Lex(); + getStreamer().EmitWinCFIPushFrame(Code, Loc); + return false; +} + // Force static initialization. 
extern "C" void LLVMInitializeX86AsmParser() { RegisterMCAsmParser X(getTheX86_32Target()); diff --git a/lib/Target/X86/AsmParser/X86AsmParserCommon.h b/lib/Target/X86/AsmParser/X86AsmParserCommon.h index 5bc979d1f18c..e9be28ca77b0 100644 --- a/lib/Target/X86/AsmParser/X86AsmParserCommon.h +++ b/lib/Target/X86/AsmParser/X86AsmParserCommon.h @@ -35,6 +35,10 @@ inline bool isImmUnsignedi8Value(uint64_t Value) { return isUInt<8>(Value) || isInt<8>(Value); } +inline bool isImmUnsignedi4Value(uint64_t Value) { + return isUInt<4>(Value); +} + } // End of namespace llvm #endif diff --git a/lib/Target/X86/AsmParser/X86Operand.h b/lib/Target/X86/AsmParser/X86Operand.h index a771ba366318..3a76d023e640 100644 --- a/lib/Target/X86/AsmParser/X86Operand.h +++ b/lib/Target/X86/AsmParser/X86Operand.h @@ -260,6 +260,15 @@ struct X86Operand final : public MCParsedAsmOperand { return isImmSExti64i32Value(CE->getValue()); } + bool isImmUnsignedi4() const { + if (!isImm()) return false; + // If this isn't a constant expr, reject it. The immediate byte is shared + // with a register encoding. We can't have it affected by a relocation. + const MCConstantExpr *CE = dyn_cast(getImm()); + if (!CE) return false; + return isImmUnsignedi4Value(CE->getValue()); + } + bool isImmUnsignedi8() const { if (!isImm()) return false; // If this isn't a constant expr, just assume it fits and let relaxation @@ -491,7 +500,7 @@ struct X86Operand final : public MCParsedAsmOperand { void addGR32orGR64Operands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); - unsigned RegNo = getReg(); + MCRegister RegNo = getReg(); if (X86MCRegisterClasses[X86::GR64RegClassID].contains(RegNo)) RegNo = getX86SubSuperRegister(RegNo, 32); Inst.addOperand(MCOperand::createReg(RegNo)); @@ -572,7 +581,7 @@ struct X86Operand final : public MCParsedAsmOperand { static std::unique_ptr CreateToken(StringRef Str, SMLoc Loc) { SMLoc EndLoc = SMLoc::getFromPointer(Loc.getPointer() + Str.size()); - auto Res = llvm::make_unique(Token, Loc, EndLoc); + auto Res = std::make_unique(Token, Loc, EndLoc); Res->Tok.Data = Str.data(); Res->Tok.Length = Str.size(); return Res; @@ -582,7 +591,7 @@ struct X86Operand final : public MCParsedAsmOperand { CreateReg(unsigned RegNo, SMLoc StartLoc, SMLoc EndLoc, bool AddressOf = false, SMLoc OffsetOfLoc = SMLoc(), StringRef SymName = StringRef(), void *OpDecl = nullptr) { - auto Res = llvm::make_unique(Register, StartLoc, EndLoc); + auto Res = std::make_unique(Register, StartLoc, EndLoc); Res->Reg.RegNo = RegNo; Res->AddressOf = AddressOf; Res->OffsetOfLoc = OffsetOfLoc; @@ -593,19 +602,19 @@ struct X86Operand final : public MCParsedAsmOperand { static std::unique_ptr CreateDXReg(SMLoc StartLoc, SMLoc EndLoc) { - return llvm::make_unique(DXRegister, StartLoc, EndLoc); + return std::make_unique(DXRegister, StartLoc, EndLoc); } static std::unique_ptr CreatePrefix(unsigned Prefixes, SMLoc StartLoc, SMLoc EndLoc) { - auto Res = llvm::make_unique(Prefix, StartLoc, EndLoc); + auto Res = std::make_unique(Prefix, StartLoc, EndLoc); Res->Pref.Prefixes = Prefixes; return Res; } static std::unique_ptr CreateImm(const MCExpr *Val, SMLoc StartLoc, SMLoc EndLoc) { - auto Res = llvm::make_unique(Immediate, StartLoc, EndLoc); + auto Res = std::make_unique(Immediate, StartLoc, EndLoc); Res->Imm.Val = Val; return Res; } @@ -615,7 +624,7 @@ struct X86Operand final : public MCParsedAsmOperand { CreateMem(unsigned ModeSize, const MCExpr *Disp, SMLoc StartLoc, SMLoc EndLoc, unsigned Size = 0, StringRef SymName = 
StringRef(), void *OpDecl = nullptr, unsigned FrontendSize = 0) { - auto Res = llvm::make_unique<X86Operand>(Memory, StartLoc, EndLoc); + auto Res = std::make_unique<X86Operand>(Memory, StartLoc, EndLoc); Res->Mem.SegReg = 0; Res->Mem.Disp = Disp; Res->Mem.BaseReg = 0; @@ -643,7 +652,7 @@ struct X86Operand final : public MCParsedAsmOperand { // The scale should always be one of {1,2,4,8}. assert(((Scale == 1 || Scale == 2 || Scale == 4 || Scale == 8)) && "Invalid scale!"); - auto Res = llvm::make_unique<X86Operand>(Memory, StartLoc, EndLoc); + auto Res = std::make_unique<X86Operand>(Memory, StartLoc, EndLoc); Res->Mem.SegReg = SegReg; Res->Mem.Disp = Disp; Res->Mem.BaseReg = BaseReg; diff --git a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp index a241362a271d..e287f6625115 100644 --- a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp +++ b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp @@ -12,13 +12,14 @@ // //===----------------------------------------------------------------------===// +#include "X86DisassemblerDecoder.h" +#include "llvm/ADT/StringRef.h" + #include <cstdarg> /* for va_*() */ #include <cstdio> /* for vsnprintf() */ #include <cstdlib> /* for exit() */ #include <cstring> /* for memset() */ -#include "X86DisassemblerDecoder.h" - using namespace llvm::X86Disassembler; /// Specifies whether a ModR/M byte is needed and (if so) which diff --git a/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp index 54413fa1a02f..f08fcb575bf0 100644 --- a/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp +++ b/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp @@ -287,7 +287,7 @@ bool X86AsmBackend::fixupNeedsRelaxation(const MCFixup &Fixup, const MCRelaxableFragment *DF, const MCAsmLayout &Layout) const { // Relax if the value is too big for a (signed) i8. - return int64_t(Value) != int64_t(int8_t(Value)); + return !isInt<8>(Value); } // FIXME: Can tblgen help at all here to verify there aren't other instructions @@ -557,7 +557,7 @@ protected: // If the frame pointer is other than esp/rsp, we do not have a way to // generate a compact unwinding representation, so bail out. - if (MRI.getLLVMRegNum(Inst.getRegister(), true) != + if (*MRI.getLLVMRegNum(Inst.getRegister(), true) != (Is64Bit ? X86::RBP : X86::EBP)) return 0; @@ -605,7 +605,7 @@ protected: // unwind encoding.
return CU::UNWIND_MODE_DWARF; - unsigned Reg = MRI.getLLVMRegNum(Inst.getRegister(), true); + unsigned Reg = *MRI.getLLVMRegNum(Inst.getRegister(), true); SavedRegs[SavedRegIdx++] = Reg; StackAdjust += OffsetSize; InstrOffset += PushInstrSize(Reg); diff --git a/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp b/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp index 232a06593238..bd009da60851 100644 --- a/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp +++ b/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp @@ -46,10 +46,10 @@ X86ELFObjectWriter::X86ELFObjectWriter(bool IsELF64, uint8_t OSABI, enum X86_64RelType { RT64_NONE, RT64_64, RT64_32, RT64_32S, RT64_16, RT64_8 }; -static X86_64RelType getType64(unsigned Kind, +static X86_64RelType getType64(MCFixupKind Kind, MCSymbolRefExpr::VariantKind &Modifier, bool &IsPCRel) { - switch (Kind) { + switch (unsigned(Kind)) { default: llvm_unreachable("Unimplemented"); case FK_NONE: @@ -97,7 +97,7 @@ static void checkIs32(MCContext &Ctx, SMLoc Loc, X86_64RelType Type) { static unsigned getRelocType64(MCContext &Ctx, SMLoc Loc, MCSymbolRefExpr::VariantKind Modifier, X86_64RelType Type, bool IsPCRel, - unsigned Kind) { + MCFixupKind Kind) { switch (Modifier) { default: llvm_unreachable("Unimplemented"); @@ -202,7 +202,7 @@ static unsigned getRelocType64(MCContext &Ctx, SMLoc Loc, // and we want to keep back-compatibility. if (!Ctx.getAsmInfo()->canRelaxRelocations()) return ELF::R_X86_64_GOTPCREL; - switch (Kind) { + switch (unsigned(Kind)) { default: return ELF::R_X86_64_GOTPCREL; case X86::reloc_riprel_4byte_relax: @@ -237,7 +237,7 @@ static X86_32RelType getType32(X86_64RelType T) { static unsigned getRelocType32(MCContext &Ctx, MCSymbolRefExpr::VariantKind Modifier, X86_32RelType Type, bool IsPCRel, - unsigned Kind) { + MCFixupKind Kind) { switch (Modifier) { default: llvm_unreachable("Unimplemented"); @@ -265,8 +265,9 @@ static unsigned getRelocType32(MCContext &Ctx, if (!Ctx.getAsmInfo()->canRelaxRelocations()) return ELF::R_386_GOT32; - return Kind == X86::reloc_signed_4byte_relax ? ELF::R_386_GOT32X - : ELF::R_386_GOT32; + return Kind == MCFixupKind(X86::reloc_signed_4byte_relax) + ? 
ELF::R_386_GOT32X + : ELF::R_386_GOT32; case MCSymbolRefExpr::VK_GOTOFF: assert(Type == RT32_32); assert(!IsPCRel); return ELF::R_386_GOTOFF; @@ -317,7 +318,7 @@ unsigned X86ELFObjectWriter::getRelocType(MCContext &Ctx, const MCValue &Target, const MCFixup &Fixup, bool IsPCRel) const { MCSymbolRefExpr::VariantKind Modifier = Target.getAccessVariant(); - unsigned Kind = Fixup.getKind(); + MCFixupKind Kind = Fixup.getKind(); X86_64RelType Type = getType64(Kind, Modifier, IsPCRel); if (getEMachine() == ELF::EM_X86_64) return getRelocType64(Ctx, Fixup.getLoc(), Modifier, Type, IsPCRel, Kind); @@ -329,5 +330,5 @@ unsigned X86ELFObjectWriter::getRelocType(MCContext &Ctx, const MCValue &Target, std::unique_ptr<MCObjectTargetWriter> llvm::createX86ELFObjectWriter(bool IsELF64, uint8_t OSABI, uint16_t EMachine) { - return llvm::make_unique<X86ELFObjectWriter>(IsELF64, OSABI, EMachine); + return std::make_unique<X86ELFObjectWriter>(IsELF64, OSABI, EMachine); } diff --git a/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp b/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp index e1125c176b25..d986c829d98e 100644 --- a/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp +++ b/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp @@ -163,5 +163,7 @@ X86MCAsmInfoGNUCOFF::X86MCAsmInfoGNUCOFF(const Triple &Triple) { TextAlignFillValue = 0x90; + AllowAtInName = true; + UseIntegratedAssembler = true; } diff --git a/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp b/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp index 31d26d08a63f..ac36bf3a12fa 100644 --- a/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp +++ b/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp @@ -862,6 +862,9 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, VEX_B = ~(BaseRegEnc >> 3) & 1; unsigned IndexRegEnc = getX86RegEncoding(MI, MemOperand+X86::AddrIndexReg); VEX_X = ~(IndexRegEnc >> 3) & 1; + if (!HasVEX_4V) // Only needed with VSIB which don't use VVVV. + EVEX_V2 = ~(IndexRegEnc >> 4) & 1; + break; } case X86II::MRMSrcReg: { diff --git a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp index ce05ad974507..ced9eacc8b97 100644 --- a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp +++ b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp @@ -70,6 +70,10 @@ unsigned X86_MC::getDwarfRegFlavour(const Triple &TT, bool isEH) { return DWARFFlavour::X86_32_Generic; } +bool X86_MC::hasLockPrefix(const MCInst &MI) { + return MI.getFlags() & X86::IP_HAS_LOCK; +} + void X86_MC::initLLVMToSEHAndCVRegMapping(MCRegisterInfo *MRI) { // FIXME: TableGen these.
for (unsigned Reg = X86::NoRegister + 1; Reg < X86::NUM_TARGET_REGS; ++Reg) { @@ -399,6 +403,9 @@ public: findPltEntries(uint64_t PltSectionVA, ArrayRef PltContents, uint64_t GotSectionVA, const Triple &TargetTriple) const override; + Optional evaluateMemoryOperandAddress(const MCInst &Inst, + uint64_t Addr, + uint64_t Size) const override; }; #define GET_STIPREDICATE_DEFS_FOR_MC_ANALYSIS @@ -511,7 +518,31 @@ std::vector> X86MCInstrAnalysis::findPltEntries( return findX86_64PltEntries(PltSectionVA, PltContents); default: return {}; - } + } +} + +Optional X86MCInstrAnalysis::evaluateMemoryOperandAddress( + const MCInst &Inst, uint64_t Addr, uint64_t Size) const { + const MCInstrDesc &MCID = Info->get(Inst.getOpcode()); + int MemOpStart = X86II::getMemoryOperandNo(MCID.TSFlags); + if (MemOpStart == -1) + return None; + MemOpStart += X86II::getOperandBias(MCID); + + const MCOperand &SegReg = Inst.getOperand(MemOpStart + X86::AddrSegmentReg); + const MCOperand &BaseReg = Inst.getOperand(MemOpStart + X86::AddrBaseReg); + const MCOperand &IndexReg = Inst.getOperand(MemOpStart + X86::AddrIndexReg); + const MCOperand &ScaleAmt = Inst.getOperand(MemOpStart + X86::AddrScaleAmt); + const MCOperand &Disp = Inst.getOperand(MemOpStart + X86::AddrDisp); + if (SegReg.getReg() != 0 || IndexReg.getReg() != 0 || ScaleAmt.getImm() != 1 || + !Disp.isImm()) + return None; + + // RIP-relative addressing. + if (BaseReg.getReg() == X86::RIP) + return Addr + Size + Disp.getImm(); + + return None; } } // end of namespace X86_MC @@ -567,13 +598,13 @@ extern "C" void LLVMInitializeX86TargetMC() { createX86_64AsmBackend); } -unsigned llvm::getX86SubSuperRegisterOrZero(unsigned Reg, unsigned Size, - bool High) { +MCRegister llvm::getX86SubSuperRegisterOrZero(MCRegister Reg, unsigned Size, + bool High) { switch (Size) { - default: return 0; + default: return X86::NoRegister; case 8: if (High) { - switch (Reg) { + switch (Reg.id()) { default: return getX86SubSuperRegisterOrZero(Reg, 64); case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI: return X86::SI; @@ -593,8 +624,8 @@ unsigned llvm::getX86SubSuperRegisterOrZero(unsigned Reg, unsigned Size, return X86::BH; } } else { - switch (Reg) { - default: return 0; + switch (Reg.id()) { + default: return X86::NoRegister; case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX: return X86::AL; case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX: @@ -630,8 +661,8 @@ unsigned llvm::getX86SubSuperRegisterOrZero(unsigned Reg, unsigned Size, } } case 16: - switch (Reg) { - default: return 0; + switch (Reg.id()) { + default: return X86::NoRegister; case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX: return X86::AX; case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX: @@ -666,8 +697,8 @@ unsigned llvm::getX86SubSuperRegisterOrZero(unsigned Reg, unsigned Size, return X86::R15W; } case 32: - switch (Reg) { - default: return 0; + switch (Reg.id()) { + default: return X86::NoRegister; case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX: return X86::EAX; case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX: @@ -702,7 +733,7 @@ unsigned llvm::getX86SubSuperRegisterOrZero(unsigned Reg, unsigned Size, return X86::R15D; } case 64: - switch (Reg) { + switch (Reg.id()) { default: return 0; case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX: return X86::RAX; @@ -740,9 +771,9 @@ unsigned llvm::getX86SubSuperRegisterOrZero(unsigned Reg, unsigned Size, } } -unsigned 
llvm::getX86SubSuperRegister(unsigned Reg, unsigned Size, bool High) { - unsigned Res = getX86SubSuperRegisterOrZero(Reg, Size, High); - assert(Res != 0 && "Unexpected register or VT"); +MCRegister llvm::getX86SubSuperRegister(MCRegister Reg, unsigned Size, bool High) { + MCRegister Res = getX86SubSuperRegisterOrZero(Reg, Size, High); + assert(Res != X86::NoRegister && "Unexpected register or VT"); return Res; } diff --git a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h index 00dd5908cbf5..0c789061f0e1 100644 --- a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h +++ b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h @@ -13,6 +13,7 @@ #ifndef LLVM_LIB_TARGET_X86_MCTARGETDESC_X86MCTARGETDESC_H #define LLVM_LIB_TARGET_X86_MCTARGETDESC_X86MCTARGETDESC_H +#include "llvm/MC/MCRegister.h" #include "llvm/MC/MCStreamer.h" #include "llvm/Support/DataTypes.h" #include @@ -57,6 +58,10 @@ unsigned getDwarfRegFlavour(const Triple &TT, bool isEH); void initLLVMToSEHAndCVRegMapping(MCRegisterInfo *MRI); + +/// Returns true if this instruction has a LOCK prefix. +bool hasLockPrefix(const MCInst &MI); + /// Create a X86 MCSubtargetInfo instance. This is exposed so Asm parser, etc. /// do not need to go through TargetRegistry. MCSubtargetInfo *createX86MCSubtargetInfo(const Triple &TT, StringRef CPU, @@ -111,12 +116,12 @@ createX86WinCOFFObjectWriter(bool Is64Bit); /// Returns the sub or super register of a specific X86 register. /// e.g. getX86SubSuperRegister(X86::EAX, 16) returns X86::AX. /// Aborts on error. -unsigned getX86SubSuperRegister(unsigned, unsigned, bool High=false); +MCRegister getX86SubSuperRegister(MCRegister, unsigned, bool High=false); /// Returns the sub or super register of a specific X86 register. /// Like getX86SubSuperRegister() but returns 0 on error. -unsigned getX86SubSuperRegisterOrZero(unsigned, unsigned, - bool High = false); +MCRegister getX86SubSuperRegisterOrZero(MCRegister, unsigned, + bool High = false); } // End llvm namespace diff --git a/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp b/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp index fc7e99f61e5e..b67a7508fe72 100644 --- a/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp +++ b/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp @@ -276,7 +276,7 @@ void X86MachObjectWriter::RecordX86_64Relocation( // x86_64 distinguishes movq foo@GOTPCREL so that the linker can // rewrite the movq to an leaq at link time if the symbol ends up in // the same linkage unit. 
- if (unsigned(Fixup.getKind()) == X86::reloc_riprel_4byte_movq_load) + if (Fixup.getTargetKind() == X86::reloc_riprel_4byte_movq_load) Type = MachO::X86_64_RELOC_GOT_LOAD; else Type = MachO::X86_64_RELOC_GOT; @@ -339,8 +339,7 @@ void X86MachObjectWriter::RecordX86_64Relocation( return; } else { Type = MachO::X86_64_RELOC_UNSIGNED; - unsigned Kind = Fixup.getKind(); - if (Kind == X86::reloc_signed_4byte) { + if (Fixup.getTargetKind() == X86::reloc_signed_4byte) { Asm.getContext().reportError( Fixup.getLoc(), "32-bit absolute addressing is not supported in 64-bit mode"); @@ -600,5 +599,5 @@ void X86MachObjectWriter::RecordX86Relocation(MachObjectWriter *Writer, std::unique_ptr llvm::createX86MachObjectWriter(bool Is64Bit, uint32_t CPUType, uint32_t CPUSubtype) { - return llvm::make_unique(Is64Bit, CPUType, CPUSubtype); + return std::make_unique(Is64Bit, CPUType, CPUSubtype); } diff --git a/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp b/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp index 3baab9da1c41..760239f76505 100644 --- a/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp +++ b/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp @@ -109,5 +109,5 @@ unsigned X86WinCOFFObjectWriter::getRelocType(MCContext &Ctx, std::unique_ptr llvm::createX86WinCOFFObjectWriter(bool Is64Bit) { - return llvm::make_unique(Is64Bit); + return std::make_unique(Is64Bit); } diff --git a/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp b/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp index 796a27a17255..db624378d517 100644 --- a/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp +++ b/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp @@ -35,8 +35,9 @@ void X86WinCOFFStreamer::EmitWinEHHandlerData(SMLoc Loc) { MCStreamer::EmitWinEHHandlerData(Loc); // We have to emit the unwind info now, because this directive - // actually switches to the .xdata section! - EHStreamer.EmitUnwindInfo(*this, getCurrentWinFrameInfo()); + // actually switches to the .xdata section. + if (WinEH::FrameInfo *CurFrame = getCurrentWinFrameInfo()) + EHStreamer.EmitUnwindInfo(*this, CurFrame); } void X86WinCOFFStreamer::EmitWindowsUnwindTables() { diff --git a/lib/Target/X86/MCTargetDesc/X86WinCOFFTargetStreamer.cpp b/lib/Target/X86/MCTargetDesc/X86WinCOFFTargetStreamer.cpp index e9987d1f62bd..d5494ef12370 100644 --- a/lib/Target/X86/MCTargetDesc/X86WinCOFFTargetStreamer.cpp +++ b/lib/Target/X86/MCTargetDesc/X86WinCOFFTargetStreamer.cpp @@ -170,7 +170,7 @@ bool X86WinCOFFTargetStreamer::emitFPOProc(const MCSymbol *ProcSym, L, "opening new .cv_fpo_proc before closing previous frame"); return true; } - CurFPOData = llvm::make_unique(); + CurFPOData = std::make_unique(); CurFPOData->Function = ProcSym; CurFPOData->Begin = emitFPOLabel(); CurFPOData->ParamsSize = ParamsSize; diff --git a/lib/Target/X86/X86.h b/lib/Target/X86/X86.h index a95f68434d12..6840fc12751d 100644 --- a/lib/Target/X86/X86.h +++ b/lib/Target/X86/X86.h @@ -81,6 +81,12 @@ FunctionPass *createX86FlagsCopyLoweringPass(); /// Return a pass that expands WinAlloca pseudo-instructions. FunctionPass *createX86WinAllocaExpander(); +/// Return a pass that inserts int3 at the end of the function if it ends with a +/// CALL instruction. The pass does the same for each funclet as well. This +/// ensures that the open interval of function start and end PCs contains all +/// return addresses for the benefit of the Windows x64 unwinder. +FunctionPass *createX86AvoidTrailingCallPass(); + /// Return a pass that optimizes the code-size of x86 call sequences. 
This is /// done by replacing esp-relative movs with pushes. FunctionPass *createX86CallFrameOptimization(); @@ -137,13 +143,13 @@ void initializeWinEHStatePassPass(PassRegistry &); void initializeX86AvoidSFBPassPass(PassRegistry &); void initializeX86CallFrameOptimizationPass(PassRegistry &); void initializeX86CmovConverterPassPass(PassRegistry &); -void initializeX86ExpandPseudoPass(PassRegistry&); void initializeX86CondBrFoldingPassPass(PassRegistry &); void initializeX86DomainReassignmentPass(PassRegistry &); void initializeX86ExecutionDomainFixPass(PassRegistry &); +void initializeX86ExpandPseudoPass(PassRegistry &); void initializeX86FlagsCopyLoweringPassPass(PassRegistry &); +void initializeX86OptimizeLEAPassPass(PassRegistry &); void initializeX86SpeculativeLoadHardeningPassPass(PassRegistry &); - } // End llvm namespace #endif diff --git a/lib/Target/X86/X86.td b/lib/Target/X86/X86.td index 3112f00c91f2..d8631aca2734 100644 --- a/lib/Target/X86/X86.td +++ b/lib/Target/X86/X86.td @@ -95,7 +95,8 @@ def Feature3DNowA : SubtargetFeature<"3dnowa", "X863DNowLevel", "ThreeDNowA", def Feature64Bit : SubtargetFeature<"64bit", "HasX86_64", "true", "Support 64-bit instructions">; def FeatureCMPXCHG16B : SubtargetFeature<"cx16", "HasCmpxchg16b", "true", - "64-bit with cmpxchg16b">; + "64-bit with cmpxchg16b", + [FeatureCMPXCHG8B]>; def FeatureSlowSHLD : SubtargetFeature<"slow-shld", "IsSHLDSlow", "true", "SHLD instruction is slow">; def FeatureSlowPMULLD : SubtargetFeature<"slow-pmulld", "IsPMULLDSlow", "true", @@ -240,8 +241,11 @@ def FeatureCLDEMOTE : SubtargetFeature<"cldemote", "HasCLDEMOTE", "true", "Enable Cache Demote">; def FeaturePTWRITE : SubtargetFeature<"ptwrite", "HasPTWRITE", "true", "Support ptwrite instruction">; -def FeatureMPX : SubtargetFeature<"mpx", "HasMPX", "true", - "Support MPX instructions">; +// FIXME: This feature is deprecated in 10.0 and should not be used for +// anything, but removing it would break IR files that may contain it in a +// target-feature attribute. +def FeatureDeprecatedMPX : SubtargetFeature<"mpx", "DeprecatedHasMPX", "false", + "Deprecated. Support MPX instructions">; def FeatureLEAForSP : SubtargetFeature<"lea-sp", "UseLeaForSP", "true", "Use LEA for adjusting the stack pointer">; def FeatureSlowDivide32 : SubtargetFeature<"idivl-to-divb", @@ -374,6 +378,10 @@ def FeatureHasFastGather : SubtargetFeature<"fast-gather", "HasFastGather", "true", "Indicates if gather is reasonably fast">; +def FeaturePrefer128Bit + : SubtargetFeature<"prefer-128-bit", "Prefer128Bit", "true", + "Prefer 128-bit AVX instructions">; + def FeaturePrefer256Bit : SubtargetFeature<"prefer-256-bit", "Prefer256Bit", "true", "Prefer 256-bit AVX instructions">; @@ -449,6 +457,10 @@ def FeatureMergeToThreeWayBranch : SubtargetFeature<"merge-to-threeway-branch", "Merge branches to a three-way " "conditional branch">; +// Enable use of alias analysis during code generation. 
+def FeatureUseAA : SubtargetFeature<"use-aa", "UseAA", "true", + "Use alias analysis during codegen">; + // Bonnell def ProcIntelAtom : SubtargetFeature<"", "X86ProcFamily", "IntelAtom", "">; // Silvermont @@ -579,7 +591,6 @@ def ProcessorFeatures { // Skylake list SKLAdditionalFeatures = [FeatureAES, - FeatureMPX, FeatureXSAVEC, FeatureXSAVES, FeatureCLFLUSHOPT, @@ -594,6 +605,7 @@ def ProcessorFeatures { // Skylake-AVX512 list SKXAdditionalFeatures = [FeatureAVX512, + FeaturePrefer256Bit, FeatureCDI, FeatureDQI, FeatureBWI, @@ -627,6 +639,7 @@ def ProcessorFeatures { // Cannonlake list CNLAdditionalFeatures = [FeatureAVX512, + FeaturePrefer256Bit, FeatureCDI, FeatureDQI, FeatureBWI, @@ -665,6 +678,17 @@ def ProcessorFeatures { list ICXFeatures = !listconcat(ICLInheritableFeatures, ICXSpecificFeatures); + //Tigerlake + list TGLAdditionalFeatures = [FeatureVP2INTERSECT, + FeatureMOVDIRI, + FeatureMOVDIR64B, + FeatureSHSTK]; + list TGLSpecificFeatures = [FeatureHasFastGather]; + list TGLInheritableFeatures = + !listconcat(TGLAdditionalFeatures ,TGLSpecificFeatures); + list TGLFeatures = + !listconcat(ICLFeatures, TGLInheritableFeatures ); + // Atom list AtomInheritableFeatures = [FeatureX87, FeatureCMPXCHG8B, @@ -707,7 +731,6 @@ def ProcessorFeatures { // Goldmont list GLMAdditionalFeatures = [FeatureAES, - FeatureMPX, FeatureSHA, FeatureRDSEED, FeatureXSAVE, @@ -786,6 +809,22 @@ def ProcessorFeatures { list KNMFeatures = !listconcat(KNLFeatures, [FeatureVPOPCNTDQ]); + // Barcelona + list BarcelonaInheritableFeatures = [FeatureX87, + FeatureCMPXCHG8B, + FeatureSSE4A, + Feature3DNowA, + FeatureFXSR, + FeatureNOPL, + FeatureCMPXCHG16B, + FeatureLZCNT, + FeaturePOPCNT, + FeatureSlowSHLD, + FeatureLAHFSAHF, + FeatureCMOV, + Feature64Bit, + FeatureFastScalarShiftMasks]; + list BarcelonaFeatures = BarcelonaInheritableFeatures; // Bobcat list BtVer1InheritableFeatures = [FeatureX87, @@ -1093,6 +1132,8 @@ def : ProcessorModel<"icelake-client", SkylakeServerModel, ProcessorFeatures.ICLFeatures>; def : ProcessorModel<"icelake-server", SkylakeServerModel, ProcessorFeatures.ICXFeatures>; +def : ProcessorModel<"tigerlake", SkylakeServerModel, + ProcessorFeatures.TGLFeatures>; // AMD CPUs. @@ -1129,10 +1170,7 @@ foreach P = ["k8-sse3", "opteron-sse3", "athlon64-sse3"] in { } foreach P = ["amdfam10", "barcelona"] in { - def : Proc; + def : Proc; } // Bobcat diff --git a/lib/Target/X86/X86AsmPrinter.cpp b/lib/Target/X86/X86AsmPrinter.cpp index 80120722e0e6..8d27be30a277 100644 --- a/lib/Target/X86/X86AsmPrinter.cpp +++ b/lib/Target/X86/X86AsmPrinter.cpp @@ -242,7 +242,7 @@ void X86AsmPrinter::PrintModifiedOperand(const MachineInstr *MI, unsigned OpNo, return PrintOperand(MI, OpNo, O); if (MI->getInlineAsmDialect() == InlineAsm::AD_ATT) O << '%'; - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); if (strncmp(Modifier, "subreg", strlen("subreg")) == 0) { unsigned Size = (strcmp(Modifier+6,"64") == 0) ? 64 : (strcmp(Modifier+6,"32") == 0) ? 32 : @@ -388,7 +388,7 @@ void X86AsmPrinter::PrintIntelMemReference(const MachineInstr *MI, static bool printAsmMRegister(X86AsmPrinter &P, const MachineOperand &MO, char Mode, raw_ostream &O) { - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); bool EmitPercent = true; if (!X86::GR8RegClass.contains(Reg) && @@ -575,7 +575,7 @@ void X86AsmPrinter::EmitStartOfAsmFile(Module &M) { // Emitting note header. int WordSize = TT.isArch64Bit() ? 8 : 4; - EmitAlignment(WordSize == 4 ? 2 : 3); + EmitAlignment(WordSize == 4 ? 
Align(4) : Align(8)); OutStreamer->EmitIntValue(4, 4 /*size*/); // data size for "GNU\0" OutStreamer->EmitIntValue(8 + WordSize, 4 /*size*/); // Elf_Prop size OutStreamer->EmitIntValue(ELF::NT_GNU_PROPERTY_TYPE_0, 4 /*size*/); @@ -585,7 +585,7 @@ void X86AsmPrinter::EmitStartOfAsmFile(Module &M) { OutStreamer->EmitIntValue(ELF::GNU_PROPERTY_X86_FEATURE_1_AND, 4); OutStreamer->EmitIntValue(4, 4); // data size OutStreamer->EmitIntValue(FeatureFlagsAnd, 4); // data - EmitAlignment(WordSize == 4 ? 2 : 3); // padding + EmitAlignment(WordSize == 4 ? Align(4) : Align(8)); // padding OutStreamer->endSection(Nt); OutStreamer->SwitchSection(Cur); diff --git a/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp b/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp index 3dcc1015dc7c..69c6b3356cbb 100644 --- a/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp +++ b/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp @@ -35,6 +35,7 @@ #include "X86InstrInfo.h" #include "X86Subtarget.h" +#include "llvm/Analysis/AliasAnalysis.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" @@ -390,7 +391,7 @@ void X86AvoidSFBPass::buildCopy(MachineInstr *LoadInst, unsigned NLoadOpcode, MachineMemOperand *LMMO = *LoadInst->memoperands_begin(); MachineMemOperand *SMMO = *StoreInst->memoperands_begin(); - unsigned Reg1 = MRI->createVirtualRegister( + Register Reg1 = MRI->createVirtualRegister( TII->getRegClass(TII->get(NLoadOpcode), 0, TRI, *(MBB->getParent()))); MachineInstr *NewLoad = BuildMI(*MBB, LoadInst, LoadInst->getDebugLoc(), TII->get(NLoadOpcode), diff --git a/lib/Target/X86/X86AvoidTrailingCall.cpp b/lib/Target/X86/X86AvoidTrailingCall.cpp new file mode 100644 index 000000000000..fb4f9e2901dc --- /dev/null +++ b/lib/Target/X86/X86AvoidTrailingCall.cpp @@ -0,0 +1,108 @@ +//===----- X86AvoidTrailingCall.cpp - Insert int3 after trailing calls ----===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// The Windows x64 unwinder has trouble unwinding the stack when a return +// address points to the end of the function. This pass maintains the invariant +// that every return address is inside the bounds of its parent function or +// funclet by inserting int3 if the last instruction would otherwise be a call. +// +//===----------------------------------------------------------------------===// + +#include "X86.h" +#include "X86InstrInfo.h" +#include "X86Subtarget.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" + +#define DEBUG_TYPE "x86-avoid-trailing-call" + +using namespace llvm; + +namespace { + +class X86AvoidTrailingCallPass : public MachineFunctionPass { +public: + X86AvoidTrailingCallPass() : MachineFunctionPass(ID) {} + + bool runOnMachineFunction(MachineFunction &MF) override; + +private: + StringRef getPassName() const override { + return "X86 avoid trailing call pass"; + } + static char ID; +}; + +char X86AvoidTrailingCallPass::ID = 0; + +} // end anonymous namespace + +FunctionPass *llvm::createX86AvoidTrailingCallPass() { + return new X86AvoidTrailingCallPass(); +} + +// A real instruction is a non-meta, non-pseudo instruction. Some pseudos +// expand to nothing, and some expand to code. This logic conservatively assumes +// they might expand to nothing. 
+static bool isRealInstruction(MachineInstr &MI) { + return !MI.isPseudo() && !MI.isMetaInstruction(); +} + +// Return true if this is a call instruction, but not a tail call. +static bool isCallInstruction(const MachineInstr &MI) { + return MI.isCall() && !MI.isReturn(); +} + +bool X86AvoidTrailingCallPass::runOnMachineFunction(MachineFunction &MF) { + const X86Subtarget &STI = MF.getSubtarget(); + const X86InstrInfo &TII = *STI.getInstrInfo(); + assert(STI.isTargetWin64() && "pass only runs on Win64"); + + // FIXME: Perhaps this pass should also replace SEH_Epilogue by inserting nops + // before epilogues. + + bool Changed = false; + for (MachineBasicBlock &MBB : MF) { + // Look for basic blocks that precede funclet entries or are at the end of + // the function. + MachineBasicBlock *NextMBB = MBB.getNextNode(); + if (NextMBB && !NextMBB->isEHFuncletEntry()) + continue; + + // Find the last real instruction in this block, or previous blocks if this + // block is empty. + MachineBasicBlock::reverse_iterator LastRealInstr; + for (MachineBasicBlock &RMBB : + make_range(MBB.getReverseIterator(), MF.rend())) { + LastRealInstr = llvm::find_if(reverse(RMBB), isRealInstruction); + if (LastRealInstr != RMBB.rend()) + break; + } + + // Do nothing if this function or funclet has no instructions. + if (LastRealInstr == MF.begin()->rend()) + continue; + + // If this is a call instruction, insert int3 right after it with the same + // DebugLoc. Convert back to a forward iterator and advance the insertion + // position once. + if (isCallInstruction(*LastRealInstr)) { + LLVM_DEBUG({ + dbgs() << "inserting int3 after trailing call instruction:\n"; + LastRealInstr->dump(); + dbgs() << '\n'; + }); + + MachineBasicBlock::iterator MBBI = std::next(LastRealInstr.getReverse()); + BuildMI(*LastRealInstr->getParent(), MBBI, LastRealInstr->getDebugLoc(), + TII.get(X86::INT3)); + Changed = true; + } + } + + return Changed; +} diff --git a/lib/Target/X86/X86CallFrameOptimization.cpp b/lib/Target/X86/X86CallFrameOptimization.cpp index 4df849a2e14c..ad7e32b4efc8 100644 --- a/lib/Target/X86/X86CallFrameOptimization.cpp +++ b/lib/Target/X86/X86CallFrameOptimization.cpp @@ -155,12 +155,22 @@ bool X86CallFrameOptimization::isLegal(MachineFunction &MF) { // This is bad, and breaks SP adjustment. // So, check that all of the frames in the function are closed inside // the same block, and, for good measure, that there are no nested frames. + // + // If any call allocates more argument stack memory than the stack + // probe size, don't do this optimization. Otherwise, this pass + // would need to synthesize additional stack probe calls to allocate + // memory for arguments. 
unsigned FrameSetupOpcode = TII->getCallFrameSetupOpcode(); unsigned FrameDestroyOpcode = TII->getCallFrameDestroyOpcode(); + bool UseStackProbe = + !STI->getTargetLowering()->getStackProbeSymbolName(MF).empty(); + unsigned StackProbeSize = STI->getTargetLowering()->getStackProbeSize(MF); for (MachineBasicBlock &BB : MF) { bool InsideFrameSequence = false; for (MachineInstr &MI : BB) { if (MI.getOpcode() == FrameSetupOpcode) { + if (TII->getFrameSize(MI) >= StackProbeSize && UseStackProbe) + return false; if (InsideFrameSequence) return false; InsideFrameSequence = true; @@ -325,8 +335,8 @@ X86CallFrameOptimization::classifyInstruction( for (const MachineOperand &MO : MI->operands()) { if (!MO.isReg()) continue; - unsigned int Reg = MO.getReg(); - if (!RegInfo.isPhysicalRegister(Reg)) + Register Reg = MO.getReg(); + if (!Register::isPhysicalRegister(Reg)) continue; if (RegInfo.regsOverlap(Reg, RegInfo.getStackRegister())) return Exit; @@ -370,7 +380,7 @@ void X86CallFrameOptimization::collectCallInfo(MachineFunction &MF, while (I->getOpcode() == X86::LEA32r || I->isDebugInstr()) ++I; - unsigned StackPtr = RegInfo.getStackRegister(); + Register StackPtr = RegInfo.getStackRegister(); auto StackPtrCopyInst = MBB.end(); // SelectionDAG (but not FastISel) inserts a copy of ESP into a virtual // register. If it's there, use that virtual register as stack pointer @@ -443,8 +453,8 @@ void X86CallFrameOptimization::collectCallInfo(MachineFunction &MF, for (const MachineOperand &MO : I->uses()) { if (!MO.isReg()) continue; - unsigned int Reg = MO.getReg(); - if (RegInfo.isPhysicalRegister(Reg)) + Register Reg = MO.getReg(); + if (Register::isPhysicalRegister(Reg)) UsedRegs.insert(Reg); } } @@ -524,12 +534,12 @@ void X86CallFrameOptimization::adjustCallSequence(MachineFunction &MF, break; case X86::MOV32mr: case X86::MOV64mr: { - unsigned int Reg = PushOp.getReg(); + Register Reg = PushOp.getReg(); // If storing a 32-bit vreg on 64-bit targets, extend to a 64-bit vreg // in preparation for the PUSH64. The upper 32 bits can be undef. if (Is64Bit && Store->getOpcode() == X86::MOV32mr) { - unsigned UndefReg = MRI->createVirtualRegister(&X86::GR64RegClass); + Register UndefReg = MRI->createVirtualRegister(&X86::GR64RegClass); Reg = MRI->createVirtualRegister(&X86::GR64RegClass); BuildMI(MBB, Context.Call, DL, TII->get(X86::IMPLICIT_DEF), UndefReg); BuildMI(MBB, Context.Call, DL, TII->get(X86::INSERT_SUBREG), Reg) @@ -598,7 +608,7 @@ MachineInstr *X86CallFrameOptimization::canFoldIntoRegPush( // movl %eax, (%esp) // call // Get rid of those with prejudice. - if (!TargetRegisterInfo::isVirtualRegister(Reg)) + if (!Register::isVirtualRegister(Reg)) return nullptr; // Make sure this is the only use of Reg. 
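The X86CallFrameOptimization hunks above, like most of this patch, mechanically replace raw unsigned register numbers with the Register wrapper and move the virtual/physical checks from TargetRegisterInfo statics to Register::isVirtualRegister and Register::isPhysicalRegister. A minimal self-contained sketch of the idea follows; the names Register, isVirtualRegister, isPhysicalRegister, id() and index2VirtReg() appear in the hunks themselves, but the high-bit tag chosen here to separate virtual from physical numbers is an illustrative assumption, not code taken from LLVM.

#include <cassert>

// Illustrative stand-in for the Register wrapper used throughout these hunks.
// Physical registers keep small integer numbers; virtual registers are tagged
// in the most significant bit (the exact tag bit is an assumption made for
// this sketch).
class Register {
  unsigned Reg = 0;

public:
  static constexpr unsigned VirtualRegFlag = 1u << 31;

  constexpr Register(unsigned R = 0) : Reg(R) {}

  static bool isVirtualRegister(unsigned R) { return (R & VirtualRegFlag) != 0; }
  static bool isPhysicalRegister(unsigned R) {
    return R != 0 && !isVirtualRegister(R);
  }
  static Register index2VirtReg(unsigned Index) {
    return Register(Index | VirtualRegFlag);
  }

  bool isVirtual() const { return isVirtualRegister(Reg); }
  bool isPhysical() const { return isPhysicalRegister(Reg); }
  unsigned id() const { return Reg; }

  // Implicit conversion back to unsigned keeps existing interfaces compiling,
  // so the edits in this patch can stay purely mechanical.
  operator unsigned() const { return Reg; }
};

int main() {
  Register Phys(42);                          // hypothetical physical register number
  Register Virt = Register::index2VirtReg(0); // first virtual register

  assert(Phys.isPhysical() && !Phys.isVirtual());
  assert(Virt.isVirtual() && Register::isVirtualRegister(Virt));
  assert(Register().id() == 0);               // default-constructed: "no register"
  return 0;
}

Because the wrapper converts implicitly back to unsigned, calls such as MFI.isVRegStackified(Reg) and MRI.getRegClass(Reg) in the surrounding hunks keep compiling unchanged, which is why these replacements can be applied one operand at a time.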
diff --git a/lib/Target/X86/X86CallLowering.cpp b/lib/Target/X86/X86CallLowering.cpp index b16b3839c85a..7ee637cfd523 100644 --- a/lib/Target/X86/X86CallLowering.cpp +++ b/lib/Target/X86/X86CallLowering.cpp @@ -102,6 +102,8 @@ struct OutgoingValueHandler : public CallLowering::ValueHandler { DL(MIRBuilder.getMF().getDataLayout()), STI(MIRBuilder.getMF().getSubtarget()) {} + bool isIncomingArgumentHandler() const override { return false; } + Register getStackAddress(uint64_t Size, int64_t Offset, MachinePointerInfo &MPO) override { LLT p0 = LLT::pointer(0, DL.getPointerSizeInBits(0)); @@ -155,8 +157,9 @@ struct OutgoingValueHandler : public CallLowering::ValueHandler { bool assignArg(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, - const CallLowering::ArgInfo &Info, CCState &State) override { - bool Res = AssignFn(ValNo, ValVT, LocVT, LocInfo, Info.Flags, State); + const CallLowering::ArgInfo &Info, ISD::ArgFlagsTy Flags, + CCState &State) override { + bool Res = AssignFn(ValNo, ValVT, LocVT, LocInfo, Flags, State); StackSize = State.getNextStackOffset(); static const MCPhysReg XMMArgRegs[] = {X86::XMM0, X86::XMM1, X86::XMM2, @@ -229,7 +232,7 @@ struct IncomingValueHandler : public CallLowering::ValueHandler { : ValueHandler(MIRBuilder, MRI, AssignFn), DL(MIRBuilder.getMF().getDataLayout()) {} - bool isArgumentHandler() const override { return true; } + bool isIncomingArgumentHandler() const override { return true; } Register getStackAddress(uint64_t Size, int64_t Offset, MachinePointerInfo &MPO) override { @@ -237,7 +240,7 @@ struct IncomingValueHandler : public CallLowering::ValueHandler { int FI = MFI.CreateFixedObject(Size, Offset, true); MPO = MachinePointerInfo::getFixedStack(MIRBuilder.getMF(), FI); - unsigned AddrReg = MRI.createGenericVirtualRegister( + Register AddrReg = MRI.createGenericVirtualRegister( LLT::pointer(0, DL.getPointerSizeInBits(0))); MIRBuilder.buildFrameIndex(AddrReg, FI); return AddrReg; @@ -301,6 +304,7 @@ struct FormalArgHandler : public IncomingValueHandler { : IncomingValueHandler(MIRBuilder, MRI, AssignFn) {} void markPhysRegUsed(unsigned PhysReg) override { + MIRBuilder.getMRI()->addLiveIn(PhysReg); MIRBuilder.getMBB().addLiveIn(PhysReg); } }; @@ -372,10 +376,7 @@ bool X86CallLowering::lowerFormalArguments( } bool X86CallLowering::lowerCall(MachineIRBuilder &MIRBuilder, - CallingConv::ID CallConv, - const MachineOperand &Callee, - const ArgInfo &OrigRet, - ArrayRef OrigArgs) const { + CallLoweringInfo &Info) const { MachineFunction &MF = MIRBuilder.getMF(); const Function &F = MF.getFunction(); MachineRegisterInfo &MRI = MF.getRegInfo(); @@ -385,8 +386,8 @@ bool X86CallLowering::lowerCall(MachineIRBuilder &MIRBuilder, auto TRI = STI.getRegisterInfo(); // Handle only Linux C, X86_64_SysV calling conventions for now. - if (!STI.isTargetLinux() || - !(CallConv == CallingConv::C || CallConv == CallingConv::X86_64_SysV)) + if (!STI.isTargetLinux() || !(Info.CallConv == CallingConv::C || + Info.CallConv == CallingConv::X86_64_SysV)) return false; unsigned AdjStackDown = TII.getCallFrameSetupOpcode(); @@ -395,18 +396,19 @@ bool X86CallLowering::lowerCall(MachineIRBuilder &MIRBuilder, // Create a temporarily-floating call instruction so we can add the implicit // uses of arg registers. bool Is64Bit = STI.is64Bit(); - unsigned CallOpc = Callee.isReg() + unsigned CallOpc = Info.Callee.isReg() ? (Is64Bit ? X86::CALL64r : X86::CALL32r) : (Is64Bit ? 
X86::CALL64pcrel32 : X86::CALLpcrel32); - auto MIB = MIRBuilder.buildInstrNoInsert(CallOpc).add(Callee).addRegMask( - TRI->getCallPreservedMask(MF, CallConv)); + auto MIB = MIRBuilder.buildInstrNoInsert(CallOpc) + .add(Info.Callee) + .addRegMask(TRI->getCallPreservedMask(MF, Info.CallConv)); SmallVector SplitArgs; - for (const auto &OrigArg : OrigArgs) { + for (const auto &OrigArg : Info.OrigArgs) { // TODO: handle not simple cases. - if (OrigArg.Flags.isByVal()) + if (OrigArg.Flags[0].isByVal()) return false; if (OrigArg.Regs.size() > 1) @@ -423,8 +425,8 @@ bool X86CallLowering::lowerCall(MachineIRBuilder &MIRBuilder, if (!handleAssignments(MIRBuilder, SplitArgs, Handler)) return false; - bool IsFixed = OrigArgs.empty() ? true : OrigArgs.back().IsFixed; - if (STI.is64Bit() && !IsFixed && !STI.isCallingConvWin64(CallConv)) { + bool IsFixed = Info.OrigArgs.empty() ? true : Info.OrigArgs.back().IsFixed; + if (STI.is64Bit() && !IsFixed && !STI.isCallingConvWin64(Info.CallConv)) { // From AMD64 ABI document: // For calls that may call functions that use varargs or stdargs // (prototype-less calls or calls to functions containing ellipsis (...) in @@ -445,23 +447,24 @@ bool X86CallLowering::lowerCall(MachineIRBuilder &MIRBuilder, // If Callee is a reg, since it is used by a target specific // instruction, it must have a register class matching the // constraint of that instruction. - if (Callee.isReg()) + if (Info.Callee.isReg()) MIB->getOperand(0).setReg(constrainOperandRegClass( MF, *TRI, MRI, *MF.getSubtarget().getInstrInfo(), - *MF.getSubtarget().getRegBankInfo(), *MIB, MIB->getDesc(), Callee, 0)); + *MF.getSubtarget().getRegBankInfo(), *MIB, MIB->getDesc(), Info.Callee, + 0)); // Finally we can copy the returned value back into its virtual-register. In // symmetry with the arguments, the physical register must be an // implicit-define of the call instruction. - if (!OrigRet.Ty->isVoidTy()) { - if (OrigRet.Regs.size() > 1) + if (!Info.OrigRet.Ty->isVoidTy()) { + if (Info.OrigRet.Regs.size() > 1) return false; SplitArgs.clear(); SmallVector NewRegs; - if (!splitToValueTypes(OrigRet, SplitArgs, DL, MRI, + if (!splitToValueTypes(Info.OrigRet, SplitArgs, DL, MRI, [&](ArrayRef Regs) { NewRegs.assign(Regs.begin(), Regs.end()); })) @@ -472,7 +475,7 @@ bool X86CallLowering::lowerCall(MachineIRBuilder &MIRBuilder, return false; if (!NewRegs.empty()) - MIRBuilder.buildMerge(OrigRet.Regs[0], NewRegs); + MIRBuilder.buildMerge(Info.OrigRet.Regs[0], NewRegs); } CallSeqStart.addImm(Handler.getStackSize()) diff --git a/lib/Target/X86/X86CallLowering.h b/lib/Target/X86/X86CallLowering.h index 0445331bc3ff..444a0c7d0122 100644 --- a/lib/Target/X86/X86CallLowering.h +++ b/lib/Target/X86/X86CallLowering.h @@ -34,9 +34,8 @@ public: bool lowerFormalArguments(MachineIRBuilder &MIRBuilder, const Function &F, ArrayRef> VRegs) const override; - bool lowerCall(MachineIRBuilder &MIRBuilder, CallingConv::ID CallConv, - const MachineOperand &Callee, const ArgInfo &OrigRet, - ArrayRef OrigArgs) const override; + bool lowerCall(MachineIRBuilder &MIRBuilder, + CallLoweringInfo &Info) const override; private: /// A function of this type is used to perform value split action. diff --git a/lib/Target/X86/X86CallingConv.td b/lib/Target/X86/X86CallingConv.td index 1c3034a5116a..4c49d68bec99 100644 --- a/lib/Target/X86/X86CallingConv.td +++ b/lib/Target/X86/X86CallingConv.td @@ -433,6 +433,7 @@ defm X86_SysV64_RegCall : def RetCC_X86_32 : CallingConv<[ // If FastCC, use RetCC_X86_32_Fast. 
CCIfCC<"CallingConv::Fast", CCDelegateTo>, + CCIfCC<"CallingConv::Tail", CCDelegateTo>, // If HiPE, use RetCC_X86_32_HiPE. CCIfCC<"CallingConv::HiPE", CCDelegateTo>, CCIfCC<"CallingConv::X86_VectorCall", CCDelegateTo>, @@ -1000,6 +1001,7 @@ def CC_X86_32 : CallingConv<[ CCIfCC<"CallingConv::X86_VectorCall", CCDelegateTo>, CCIfCC<"CallingConv::X86_ThisCall", CCDelegateTo>, CCIfCC<"CallingConv::Fast", CCDelegateTo>, + CCIfCC<"CallingConv::Tail", CCDelegateTo>, CCIfCC<"CallingConv::GHC", CCDelegateTo>, CCIfCC<"CallingConv::HiPE", CCDelegateTo>, CCIfCC<"CallingConv::X86_RegCall", CCDelegateTo>, diff --git a/lib/Target/X86/X86CmovConversion.cpp b/lib/Target/X86/X86CmovConversion.cpp index a61fa3246f09..5123853f5455 100644 --- a/lib/Target/X86/X86CmovConversion.cpp +++ b/lib/Target/X86/X86CmovConversion.cpp @@ -436,8 +436,8 @@ bool X86CmovConverterPass::checkForProfitableCmovCandidates( // Checks for "isUse()" as "uses()" returns also implicit definitions. if (!MO.isReg() || !MO.isUse()) continue; - unsigned Reg = MO.getReg(); - auto &RDM = RegDefMaps[TargetRegisterInfo::isVirtualRegister(Reg)]; + Register Reg = MO.getReg(); + auto &RDM = RegDefMaps[Register::isVirtualRegister(Reg)]; if (MachineInstr *DefMI = RDM.lookup(Reg)) { OperandToDefMap[&MO] = DefMI; DepthInfo Info = DepthMap.lookup(DefMI); @@ -456,8 +456,8 @@ bool X86CmovConverterPass::checkForProfitableCmovCandidates( for (auto &MO : MI.operands()) { if (!MO.isReg() || !MO.isDef()) continue; - unsigned Reg = MO.getReg(); - RegDefMaps[TargetRegisterInfo::isVirtualRegister(Reg)][Reg] = &MI; + Register Reg = MO.getReg(); + RegDefMaps[Register::isVirtualRegister(Reg)][Reg] = &MI; } unsigned Latency = TSchedModel.computeInstrLatency(&MI); @@ -710,7 +710,7 @@ void X86CmovConverterPass::convertCmovInstsToBranches( // Skip any CMOVs in this group which don't load from memory. if (!MI.mayLoad()) { // Remember the false-side register input. - unsigned FalseReg = + Register FalseReg = MI.getOperand(X86::getCondFromCMov(MI) == CC ? 1 : 2).getReg(); // Walk back through any intermediate cmovs referenced. while (true) { @@ -753,7 +753,7 @@ void X86CmovConverterPass::convertCmovInstsToBranches( // Get a fresh register to use as the destination of the MOV. 
const TargetRegisterClass *RC = MRI->getRegClass(MI.getOperand(0).getReg()); - unsigned TmpReg = MRI->createVirtualRegister(RC); + Register TmpReg = MRI->createVirtualRegister(RC); SmallVector NewMIs; bool Unfolded = TII->unfoldMemoryOperand(*MBB->getParent(), MI, TmpReg, @@ -810,9 +810,9 @@ void X86CmovConverterPass::convertCmovInstsToBranches( DenseMap> RegRewriteTable; for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) { - unsigned DestReg = MIIt->getOperand(0).getReg(); - unsigned Op1Reg = MIIt->getOperand(1).getReg(); - unsigned Op2Reg = MIIt->getOperand(2).getReg(); + Register DestReg = MIIt->getOperand(0).getReg(); + Register Op1Reg = MIIt->getOperand(1).getReg(); + Register Op2Reg = MIIt->getOperand(2).getReg(); // If this CMOV we are processing is the opposite condition from the jump we // generated, then we have to swap the operands for the PHI that is going to diff --git a/lib/Target/X86/X86CondBrFolding.cpp b/lib/Target/X86/X86CondBrFolding.cpp index 9dea94f1368d..1bf2d5ba7b8f 100644 --- a/lib/Target/X86/X86CondBrFolding.cpp +++ b/lib/Target/X86/X86CondBrFolding.cpp @@ -564,7 +564,7 @@ X86CondBrFolding::analyzeMBB(MachineBasicBlock &MBB) { Modified = false; break; } - return llvm::make_unique(TargetMBBInfo{ + return std::make_unique(TargetMBBInfo{ TBB, FBB, BrInstr, CmpInstr, CC, SrcReg, CmpValue, Modified, CmpBrOnly}); } diff --git a/lib/Target/X86/X86DomainReassignment.cpp b/lib/Target/X86/X86DomainReassignment.cpp index 18bbfa32e11b..b4cf5cafbc6e 100644 --- a/lib/Target/X86/X86DomainReassignment.cpp +++ b/lib/Target/X86/X86DomainReassignment.cpp @@ -182,7 +182,7 @@ public: MachineBasicBlock *MBB = MI->getParent(); auto &DL = MI->getDebugLoc(); - unsigned Reg = MRI->createVirtualRegister( + Register Reg = MRI->createVirtualRegister( TII->getRegClass(TII->get(DstOpcode), 0, MRI->getTargetRegisterInfo(), *MBB->getParent())); MachineInstrBuilder Bld = BuildMI(*MBB, MI, DL, TII->get(DstOpcode), Reg); @@ -219,13 +219,13 @@ public: // Don't allow copies to/flow GR8/GR16 physical registers. // FIXME: Is there some better way to support this? - unsigned DstReg = MI->getOperand(0).getReg(); - if (TargetRegisterInfo::isPhysicalRegister(DstReg) && + Register DstReg = MI->getOperand(0).getReg(); + if (Register::isPhysicalRegister(DstReg) && (X86::GR8RegClass.contains(DstReg) || X86::GR16RegClass.contains(DstReg))) return false; - unsigned SrcReg = MI->getOperand(1).getReg(); - if (TargetRegisterInfo::isPhysicalRegister(SrcReg) && + Register SrcReg = MI->getOperand(1).getReg(); + if (Register::isPhysicalRegister(SrcReg) && (X86::GR8RegClass.contains(SrcReg) || X86::GR16RegClass.contains(SrcReg))) return false; @@ -241,7 +241,7 @@ public: // Physical registers will not be converted. Assume that converting the // COPY to the destination domain will eventually result in a actual // instruction. 
- if (TargetRegisterInfo::isPhysicalRegister(MO.getReg())) + if (Register::isPhysicalRegister(MO.getReg())) return 1; RegDomain OpDomain = getDomain(MRI->getRegClass(MO.getReg()), @@ -436,7 +436,7 @@ void X86DomainReassignment::visitRegister(Closure &C, unsigned Reg, if (EnclosedEdges.count(Reg)) return; - if (!TargetRegisterInfo::isVirtualRegister(Reg)) + if (!Register::isVirtualRegister(Reg)) return; if (!MRI->hasOneDef(Reg)) @@ -593,8 +593,8 @@ void X86DomainReassignment::buildClosure(Closure &C, unsigned Reg) { if (!DefOp.isReg()) continue; - unsigned DefReg = DefOp.getReg(); - if (!TargetRegisterInfo::isVirtualRegister(DefReg)) { + Register DefReg = DefOp.getReg(); + if (!Register::isVirtualRegister(DefReg)) { C.setAllIllegal(); continue; } @@ -751,7 +751,7 @@ bool X86DomainReassignment::runOnMachineFunction(MachineFunction &MF) { // Go over all virtual registers and calculate a closure. unsigned ClosureID = 0; for (unsigned Idx = 0; Idx < MRI->getNumVirtRegs(); ++Idx) { - unsigned Reg = TargetRegisterInfo::index2VirtReg(Idx); + unsigned Reg = Register::index2VirtReg(Idx); // GPR only current source domain supported. if (!isGPR(MRI->getRegClass(Reg))) diff --git a/lib/Target/X86/X86EvexToVex.cpp b/lib/Target/X86/X86EvexToVex.cpp index 58680f1815bb..24c8e6d6f6eb 100755 --- a/lib/Target/X86/X86EvexToVex.cpp +++ b/lib/Target/X86/X86EvexToVex.cpp @@ -131,7 +131,7 @@ static bool usesExtendedRegister(const MachineInstr &MI) { if (!MO.isReg()) continue; - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); assert(!(Reg >= X86::ZMM0 && Reg <= X86::ZMM31) && "ZMM instructions should not be in the EVEX->VEX tables"); diff --git a/lib/Target/X86/X86ExpandPseudo.cpp b/lib/Target/X86/X86ExpandPseudo.cpp index b8624b40f2f7..9126a1fbea52 100644 --- a/lib/Target/X86/X86ExpandPseudo.cpp +++ b/lib/Target/X86/X86ExpandPseudo.cpp @@ -194,7 +194,8 @@ bool X86ExpandPseudo::ExpandMI(MachineBasicBlock &MBB, case X86::TCRETURNmi64: { bool isMem = Opcode == X86::TCRETURNmi || Opcode == X86::TCRETURNmi64; MachineOperand &JumpTarget = MBBI->getOperand(0); - MachineOperand &StackAdjust = MBBI->getOperand(isMem ? 5 : 1); + MachineOperand &StackAdjust = MBBI->getOperand(isMem ? X86::AddrNumOperands + : 1); assert(StackAdjust.isImm() && "Expecting immediate value."); // Adjust stack pointer. @@ -259,7 +260,7 @@ bool X86ExpandPseudo::ExpandMI(MachineBasicBlock &MBB, ? X86::TAILJMPm : (IsWin64 ? X86::TAILJMPm64_REX : X86::TAILJMPm64); MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII->get(Op)); - for (unsigned i = 0; i != 5; ++i) + for (unsigned i = 0; i != X86::AddrNumOperands; ++i) MIB.add(MBBI->getOperand(i)); } else if (Opcode == X86::TCRETURNri64) { JumpTarget.setIsKill(); @@ -274,7 +275,7 @@ bool X86ExpandPseudo::ExpandMI(MachineBasicBlock &MBB, MachineInstr &NewMI = *std::prev(MBBI); NewMI.copyImplicitOps(*MBBI->getParent()->getParent(), *MBBI); - MBB.getParent()->updateCallSiteInfo(&*MBBI, &NewMI); + MBB.getParent()->moveCallSiteInfo(&*MBBI, &NewMI); // Delete the pseudo instruction TCRETURN. MBB.erase(MBBI); @@ -287,7 +288,7 @@ bool X86ExpandPseudo::ExpandMI(MachineBasicBlock &MBB, assert(DestAddr.isReg() && "Offset should be in register!"); const bool Uses64BitFramePtr = STI->isTarget64BitLP64() || STI->isTargetNaCl64(); - unsigned StackPtr = TRI->getStackRegister(); + Register StackPtr = TRI->getStackRegister(); BuildMI(MBB, MBBI, DL, TII->get(Uses64BitFramePtr ? 
X86::MOV64rr : X86::MOV32rr), StackPtr) .addReg(DestAddr.getReg()); @@ -347,7 +348,7 @@ bool X86ExpandPseudo::ExpandMI(MachineBasicBlock &MBB, // actualcmpxchg Addr // [E|R]BX = SaveRbx const MachineOperand &InArg = MBBI->getOperand(6); - unsigned SaveRbx = MBBI->getOperand(7).getReg(); + Register SaveRbx = MBBI->getOperand(7).getReg(); unsigned ActualInArg = Opcode == X86::LCMPXCHG8B_SAVE_EBX ? X86::EBX : X86::RBX; diff --git a/lib/Target/X86/X86FastISel.cpp b/lib/Target/X86/X86FastISel.cpp index 7b9ce0271205..e5e089d07d55 100644 --- a/lib/Target/X86/X86FastISel.cpp +++ b/lib/Target/X86/X86FastISel.cpp @@ -1160,6 +1160,7 @@ bool X86FastISel::X86SelectRet(const Instruction *I) { CallingConv::ID CC = F.getCallingConv(); if (CC != CallingConv::C && CC != CallingConv::Fast && + CC != CallingConv::Tail && CC != CallingConv::X86_FastCall && CC != CallingConv::X86_StdCall && CC != CallingConv::X86_ThisCall && @@ -1173,7 +1174,8 @@ bool X86FastISel::X86SelectRet(const Instruction *I) { // fastcc with -tailcallopt is intended to provide a guaranteed // tail call optimization. Fastisel doesn't know how to do that. - if (CC == CallingConv::Fast && TM.Options.GuaranteedTailCallOpt) + if ((CC == CallingConv::Fast && TM.Options.GuaranteedTailCallOpt) || + CC == CallingConv::Tail) return false; // Let SDISel handle vararg functions. @@ -1241,7 +1243,7 @@ bool X86FastISel::X86SelectRet(const Instruction *I) { } // Make the copy. - unsigned DstReg = VA.getLocReg(); + Register DstReg = VA.getLocReg(); const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg); // Avoid a cross-class copy. This is very unlikely. if (!SrcRC->contains(DstReg)) @@ -3157,7 +3159,7 @@ static unsigned computeBytesPoppedByCalleeForSRet(const X86Subtarget *Subtarget, if (Subtarget->getTargetTriple().isOSMSVCRT()) return 0; if (CC == CallingConv::Fast || CC == CallingConv::GHC || - CC == CallingConv::HiPE) + CC == CallingConv::HiPE || CC == CallingConv::Tail) return 0; if (CS) @@ -3208,6 +3210,7 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) { default: return false; case CallingConv::C: case CallingConv::Fast: + case CallingConv::Tail: case CallingConv::WebKit_JS: case CallingConv::Swift: case CallingConv::X86_FastCall: @@ -3224,7 +3227,8 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) { // fastcc with -tailcallopt is intended to provide a guaranteed // tail call optimization. Fastisel doesn't know how to do that. - if (CC == CallingConv::Fast && TM.Options.GuaranteedTailCallOpt) + if ((CC == CallingConv::Fast && TM.Options.GuaranteedTailCallOpt) || + CC == CallingConv::Tail) return false; // Don't know how to handle Win64 varargs yet. 
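The TCRETURN hunk above replaces the magic operand count 5 with X86::AddrNumOperands, and later LEA hunks index operands as 1 + X86::AddrBaseReg and so on. The sketch below spells out the assumed five-operand layout of an x86 memory reference; the enumerator values are a model of that convention, not the target's actual header.

// x86_addr_operands_sketch.cpp -- illustrative model of the five-operand
// x86 memory reference (base, scale, index, displacement, segment).
#include <cassert>

namespace sketch {
enum {
  AddrBaseReg = 0,    // register holding the base
  AddrScaleAmt = 1,   // immediate scale: 1, 2, 4 or 8
  AddrIndexReg = 2,   // register holding the index
  AddrDisp = 3,       // immediate displacement
  AddrSegmentReg = 4, // segment register (usually none)
  AddrNumOperands = 5 // total operands a memory reference occupies
};
} // namespace sketch

int main() {
  // A memory-form tail call carries its address first, so the stack-adjust
  // immediate sits right after the five address operands.
  assert(sketch::AddrNumOperands == 5);
  // For an LEA the destination register is operand 0, so its address parts
  // start at 1 + AddrBaseReg, 1 + AddrScaleAmt, ... as in later hunks.
  int LeaBaseIdx = 1 + sketch::AddrBaseReg;
  assert(LeaBaseIdx == 1);
  return 0;
}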
Nothing special needed for @@ -3387,6 +3391,7 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) { case CCValAssign::SExtUpper: case CCValAssign::ZExtUpper: case CCValAssign::FPExt: + case CCValAssign::Trunc: llvm_unreachable("Unexpected loc info!"); case CCValAssign::Indirect: // FIXME: Indirect doesn't need extending, but fast-isel doesn't fully @@ -3547,7 +3552,7 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) { CCValAssign &VA = RVLocs[i]; EVT CopyVT = VA.getValVT(); unsigned CopyReg = ResultReg + i; - unsigned SrcReg = VA.getLocReg(); + Register SrcReg = VA.getLocReg(); // If this is x86-64, and we disabled SSE, we can't return FP values if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) && diff --git a/lib/Target/X86/X86FixupBWInsts.cpp b/lib/Target/X86/X86FixupBWInsts.cpp index bf541d933790..9f7c4afde760 100644 --- a/lib/Target/X86/X86FixupBWInsts.cpp +++ b/lib/Target/X86/X86FixupBWInsts.cpp @@ -80,7 +80,7 @@ class FixupBWInstPass : public MachineFunctionPass { /// destination register of the MachineInstr passed in. It returns true if /// that super register is dead just prior to \p OrigMI, and false if not. bool getSuperRegDestIfDead(MachineInstr *OrigMI, - unsigned &SuperDestReg) const; + Register &SuperDestReg) const; /// Change the MachineInstr \p MI into the equivalent extending load to 32 bit /// register if it is safe to do so. Return the replacement instruction if @@ -92,6 +92,12 @@ class FixupBWInstPass : public MachineFunctionPass { /// nullptr. MachineInstr *tryReplaceCopy(MachineInstr *MI) const; + /// Change the MachineInstr \p MI into the equivalent extend to 32 bit + /// register if it is safe to do so. Return the replacement instruction if + /// OK, otherwise return nullptr. + MachineInstr *tryReplaceExtend(unsigned New32BitOpcode, + MachineInstr *MI) const; + // Change the MachineInstr \p MI into an eqivalent 32 bit instruction if // possible. Return the replacement instruction if OK, return nullptr // otherwise. @@ -169,10 +175,10 @@ bool FixupBWInstPass::runOnMachineFunction(MachineFunction &MF) { /// /// If so, return that super register in \p SuperDestReg. bool FixupBWInstPass::getSuperRegDestIfDead(MachineInstr *OrigMI, - unsigned &SuperDestReg) const { + Register &SuperDestReg) const { auto *TRI = &TII->getRegisterInfo(); - unsigned OrigDestReg = OrigMI->getOperand(0).getReg(); + Register OrigDestReg = OrigMI->getOperand(0).getReg(); SuperDestReg = getX86SubSuperRegister(OrigDestReg, 32); const auto SubRegIdx = TRI->getSubRegIndex(SuperDestReg, OrigDestReg); @@ -232,12 +238,12 @@ bool FixupBWInstPass::getSuperRegDestIfDead(MachineInstr *OrigMI, // %ax = KILL %ax, implicit killed %eax // RET 0, %ax unsigned Opc = OrigMI->getOpcode(); (void)Opc; - // These are the opcodes currently handled by the pass, if something - // else will be added we need to ensure that new opcode has the same - // properties. - assert((Opc == X86::MOV8rm || Opc == X86::MOV16rm || Opc == X86::MOV8rr || - Opc == X86::MOV16rr) && - "Unexpected opcode."); + // These are the opcodes currently known to work with the code below, if + // something // else will be added we need to ensure that new opcode has the + // same properties. 
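getSuperRegDestIfDead asks whether the 32-bit super register is dead so the pass may rewrite an 8/16-bit MOV (and, further down, MOVSX/MOVZX) into its 32-bit form. A small standalone check of the safety argument: when only the low bits are read afterwards, the widened zero-extension yields the same value in those bits, while on x86-64 the 32-bit form also clears the upper half and avoids a partial-register merge.

// widen_zext_sketch.cpp -- widening a zero-extension is value-preserving for
// the bits that remain live.
#include <cassert>
#include <cstdint>

int main() {
  for (unsigned v = 0; v <= 0xFF; ++v) {
    uint8_t Src = static_cast<uint8_t>(v);

    uint16_t Narrow = static_cast<uint16_t>(Src); // MOVZX16rr8-style result
    uint32_t Wide = static_cast<uint32_t>(Src);   // MOVZX32rr8-style result

    // If no later instruction reads bits 16..31 of the destination (the
    // "super register is dead" check), the two forms are indistinguishable.
    assert(static_cast<uint16_t>(Wide) == Narrow);
  }
  return 0;
}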
+ if (Opc != X86::MOV8rm && Opc != X86::MOV16rm && Opc != X86::MOV8rr && + Opc != X86::MOV16rr) + return false; bool IsDefined = false; for (auto &MO: OrigMI->implicit_operands()) { @@ -247,7 +253,7 @@ bool FixupBWInstPass::getSuperRegDestIfDead(MachineInstr *OrigMI, assert((MO.isDef() || MO.isUse()) && "Expected Def or Use only!"); if (MO.isDef() && TRI->isSuperRegisterEq(OrigDestReg, MO.getReg())) - IsDefined = true; + IsDefined = true; // If MO is a use of any part of the destination register but is not equal // to OrigDestReg or one of its subregisters, we cannot use SuperDestReg. @@ -268,7 +274,7 @@ bool FixupBWInstPass::getSuperRegDestIfDead(MachineInstr *OrigMI, MachineInstr *FixupBWInstPass::tryReplaceLoad(unsigned New32BitOpcode, MachineInstr *MI) const { - unsigned NewDestReg; + Register NewDestReg; // We are going to try to rewrite this load to a larger zero-extending // load. This is safe if all portions of the 32 bit super-register @@ -295,11 +301,11 @@ MachineInstr *FixupBWInstPass::tryReplaceCopy(MachineInstr *MI) const { auto &OldDest = MI->getOperand(0); auto &OldSrc = MI->getOperand(1); - unsigned NewDestReg; + Register NewDestReg; if (!getSuperRegDestIfDead(MI, NewDestReg)) return nullptr; - unsigned NewSrcReg = getX86SubSuperRegister(OldSrc.getReg(), 32); + Register NewSrcReg = getX86SubSuperRegister(OldSrc.getReg(), 32); // This is only correct if we access the same subregister index: otherwise, // we could try to replace "movb %ah, %al" with "movl %eax, %eax". @@ -326,6 +332,33 @@ MachineInstr *FixupBWInstPass::tryReplaceCopy(MachineInstr *MI) const { return MIB; } +MachineInstr *FixupBWInstPass::tryReplaceExtend(unsigned New32BitOpcode, + MachineInstr *MI) const { + Register NewDestReg; + if (!getSuperRegDestIfDead(MI, NewDestReg)) + return nullptr; + + // Don't interfere with formation of CBW instructions which should be a + // shorter encoding than even the MOVSX32rr8. It's also immunte to partial + // merge issues on Intel CPUs. + if (MI->getOpcode() == X86::MOVSX16rr8 && + MI->getOperand(0).getReg() == X86::AX && + MI->getOperand(1).getReg() == X86::AL) + return nullptr; + + // Safe to change the instruction. + MachineInstrBuilder MIB = + BuildMI(*MF, MI->getDebugLoc(), TII->get(New32BitOpcode), NewDestReg); + + unsigned NumArgs = MI->getNumOperands(); + for (unsigned i = 1; i < NumArgs; ++i) + MIB.add(MI->getOperand(i)); + + MIB.setMemRefs(MI->memoperands()); + + return MIB; +} + MachineInstr *FixupBWInstPass::tryReplaceInstr(MachineInstr *MI, MachineBasicBlock &MBB) const { // See if this is an instruction of the type we are currently looking for. @@ -355,6 +388,15 @@ MachineInstr *FixupBWInstPass::tryReplaceInstr(MachineInstr *MI, // of the register. return tryReplaceCopy(MI); + case X86::MOVSX16rr8: + return tryReplaceExtend(X86::MOVSX32rr8, MI); + case X86::MOVSX16rm8: + return tryReplaceExtend(X86::MOVSX32rm8, MI); + case X86::MOVZX16rr8: + return tryReplaceExtend(X86::MOVZX32rr8, MI); + case X86::MOVZX16rm8: + return tryReplaceExtend(X86::MOVZX32rm8, MI); + default: // nothing to do here. break; diff --git a/lib/Target/X86/X86FixupLEAs.cpp b/lib/Target/X86/X86FixupLEAs.cpp index 041529a0be68..543dc8b00fa0 100644 --- a/lib/Target/X86/X86FixupLEAs.cpp +++ b/lib/Target/X86/X86FixupLEAs.cpp @@ -67,8 +67,8 @@ class FixupLEAPass : public MachineFunctionPass { /// - LEA that uses RIP relative addressing mode /// - LEA that uses 16-bit addressing mode " /// This function currently handles the first 2 cases only. 
- MachineInstr *processInstrForSlow3OpLEA(MachineInstr &MI, - MachineBasicBlock &MBB); + void processInstrForSlow3OpLEA(MachineBasicBlock::iterator &I, + MachineBasicBlock &MBB, bool OptIncDec); /// Look for LEAs that are really two address LEAs that we might be able to /// turn into regular ADD instructions. @@ -216,14 +216,10 @@ bool FixupLEAPass::runOnMachineFunction(MachineFunction &MF) { if (optTwoAddrLEA(I, MBB, OptIncDec, UseLEAForSP)) continue; - if (IsSlowLEA) { + if (IsSlowLEA) processInstructionForSlowLEA(I, MBB); - } else if (IsSlow3OpsLEA) { - if (auto *NewMI = processInstrForSlow3OpLEA(*I, MBB)) { - MBB.erase(I); - I = NewMI; - } - } + else if (IsSlow3OpsLEA) + processInstrForSlow3OpLEA(I, MBB, OptIncDec); } // Second pass for creating LEAs. This may reverse some of the @@ -301,18 +297,14 @@ static inline bool isInefficientLEAReg(unsigned Reg) { Reg == X86::R13D || Reg == X86::R13; } -static inline bool isRegOperand(const MachineOperand &Op) { - return Op.isReg() && Op.getReg() != X86::NoRegister; -} - /// Returns true if this LEA uses base an index registers, and the base register /// is known to be inefficient for the subtarget. // TODO: use a variant scheduling class to model the latency profile // of LEA instructions, and implement this logic as a scheduling predicate. static inline bool hasInefficientLEABaseReg(const MachineOperand &Base, const MachineOperand &Index) { - return Base.isReg() && isInefficientLEAReg(Base.getReg()) && - isRegOperand(Index); + return Base.isReg() && isInefficientLEAReg(Base.getReg()) && Index.isReg() && + Index.getReg() != X86::NoRegister; } static inline bool hasLEAOffset(const MachineOperand &Offset) { @@ -372,9 +364,9 @@ bool FixupLEAPass::optTwoAddrLEA(MachineBasicBlock::iterator &I, !TII->isSafeToClobberEFLAGS(MBB, I)) return false; - unsigned DestReg = MI.getOperand(0).getReg(); - unsigned BaseReg = Base.getReg(); - unsigned IndexReg = Index.getReg(); + Register DestReg = MI.getOperand(0).getReg(); + Register BaseReg = Base.getReg(); + Register IndexReg = Index.getReg(); // Don't change stack adjustment LEAs. 
if (UseLEAForSP && (DestReg == X86::ESP || DestReg == X86::RSP)) @@ -500,9 +492,9 @@ void FixupLEAPass::processInstructionForSlowLEA(MachineBasicBlock::iterator &I, if (Segment.getReg() != 0 || !Offset.isImm() || !TII->isSafeToClobberEFLAGS(MBB, I)) return; - const unsigned DstR = Dst.getReg(); - const unsigned SrcR1 = Base.getReg(); - const unsigned SrcR2 = Index.getReg(); + const Register DstR = Dst.getReg(); + const Register SrcR1 = Base.getReg(); + const Register SrcR2 = Index.getReg(); if ((SrcR1 == 0 || SrcR1 != DstR) && (SrcR2 == 0 || SrcR2 != DstR)) return; if (Scale.getImm() > 1) @@ -534,111 +526,150 @@ void FixupLEAPass::processInstructionForSlowLEA(MachineBasicBlock::iterator &I, } } -MachineInstr * -FixupLEAPass::processInstrForSlow3OpLEA(MachineInstr &MI, - MachineBasicBlock &MBB) { +void FixupLEAPass::processInstrForSlow3OpLEA(MachineBasicBlock::iterator &I, + MachineBasicBlock &MBB, + bool OptIncDec) { + MachineInstr &MI = *I; const unsigned LEAOpcode = MI.getOpcode(); - const MachineOperand &Dst = MI.getOperand(0); + const MachineOperand &Dest = MI.getOperand(0); const MachineOperand &Base = MI.getOperand(1 + X86::AddrBaseReg); const MachineOperand &Scale = MI.getOperand(1 + X86::AddrScaleAmt); const MachineOperand &Index = MI.getOperand(1 + X86::AddrIndexReg); const MachineOperand &Offset = MI.getOperand(1 + X86::AddrDisp); const MachineOperand &Segment = MI.getOperand(1 + X86::AddrSegmentReg); - if (!(TII->isThreeOperandsLEA(MI) || - hasInefficientLEABaseReg(Base, Index)) || + if (!(TII->isThreeOperandsLEA(MI) || hasInefficientLEABaseReg(Base, Index)) || !TII->isSafeToClobberEFLAGS(MBB, MI) || Segment.getReg() != X86::NoRegister) - return nullptr; + return; + + Register DestReg = Dest.getReg(); + Register BaseReg = Base.getReg(); + Register IndexReg = Index.getReg(); + + if (MI.getOpcode() == X86::LEA64_32r) { + if (BaseReg != 0) + BaseReg = TRI->getSubReg(BaseReg, X86::sub_32bit); + if (IndexReg != 0) + IndexReg = TRI->getSubReg(IndexReg, X86::sub_32bit); + } - unsigned DstR = Dst.getReg(); - unsigned BaseR = Base.getReg(); - unsigned IndexR = Index.getReg(); - unsigned SSDstR = - (LEAOpcode == X86::LEA64_32r) ? getX86SubSuperRegister(DstR, 64) : DstR; bool IsScale1 = Scale.getImm() == 1; - bool IsInefficientBase = isInefficientLEAReg(BaseR); - bool IsInefficientIndex = isInefficientLEAReg(IndexR); + bool IsInefficientBase = isInefficientLEAReg(BaseReg); + bool IsInefficientIndex = isInefficientLEAReg(IndexReg); // Skip these cases since it takes more than 2 instructions // to replace the LEA instruction. - if (IsInefficientBase && SSDstR == BaseR && !IsScale1) - return nullptr; - if (LEAOpcode == X86::LEA64_32r && IsInefficientBase && - (IsInefficientIndex || !IsScale1)) - return nullptr; - - const DebugLoc DL = MI.getDebugLoc(); - const MCInstrDesc &ADDrr = TII->get(getADDrrFromLEA(LEAOpcode)); - const MCInstrDesc &ADDri = TII->get(getADDriFromLEA(LEAOpcode, Offset)); + if (IsInefficientBase && DestReg == BaseReg && !IsScale1) + return; LLVM_DEBUG(dbgs() << "FixLEA: Candidate to replace:"; MI.dump();); LLVM_DEBUG(dbgs() << "FixLEA: Replaced by: ";); + MachineInstr *NewMI = nullptr; + // First try to replace LEA with one or two (for the 3-op LEA case) // add instructions: // 1.lea (%base,%index,1), %base => add %index,%base // 2.lea (%base,%index,1), %index => add %base,%index - if (IsScale1 && (DstR == BaseR || DstR == IndexR)) { - const MachineOperand &Src = DstR == BaseR ? 
Index : Base; - MachineInstr *NewMI = - BuildMI(MBB, MI, DL, ADDrr, DstR).addReg(DstR).add(Src); - LLVM_DEBUG(NewMI->dump();); - // Create ADD instruction for the Offset in case of 3-Ops LEA. - if (hasLEAOffset(Offset)) { - NewMI = BuildMI(MBB, MI, DL, ADDri, DstR).addReg(DstR).add(Offset); - LLVM_DEBUG(NewMI->dump();); + if (IsScale1 && (DestReg == BaseReg || DestReg == IndexReg)) { + unsigned NewOpc = getADDrrFromLEA(MI.getOpcode()); + if (DestReg != BaseReg) + std::swap(BaseReg, IndexReg); + + if (MI.getOpcode() == X86::LEA64_32r) { + // TODO: Do we need the super register implicit use? + NewMI = BuildMI(MBB, I, MI.getDebugLoc(), TII->get(NewOpc), DestReg) + .addReg(BaseReg) + .addReg(IndexReg) + .addReg(Base.getReg(), RegState::Implicit) + .addReg(Index.getReg(), RegState::Implicit); + } else { + NewMI = BuildMI(MBB, I, MI.getDebugLoc(), TII->get(NewOpc), DestReg) + .addReg(BaseReg) + .addReg(IndexReg); } - return NewMI; - } - // If the base is inefficient try switching the index and base operands, - // otherwise just break the 3-Ops LEA inst into 2-Ops LEA + ADD instruction: - // lea offset(%base,%index,scale),%dst => - // lea (%base,%index,scale); add offset,%dst - if (!IsInefficientBase || (!IsInefficientIndex && IsScale1)) { - MachineInstr *NewMI = BuildMI(MBB, MI, DL, TII->get(LEAOpcode)) - .add(Dst) - .add(IsInefficientBase ? Index : Base) - .add(Scale) - .add(IsInefficientBase ? Base : Index) - .addImm(0) - .add(Segment); + } else if (!IsInefficientBase || (!IsInefficientIndex && IsScale1)) { + // If the base is inefficient try switching the index and base operands, + // otherwise just break the 3-Ops LEA inst into 2-Ops LEA + ADD instruction: + // lea offset(%base,%index,scale),%dst => + // lea (%base,%index,scale); add offset,%dst + NewMI = BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(LEAOpcode)) + .add(Dest) + .add(IsInefficientBase ? Index : Base) + .add(Scale) + .add(IsInefficientBase ? Base : Index) + .addImm(0) + .add(Segment); LLVM_DEBUG(NewMI->dump();); + } + + // If either replacement succeeded above, add the offset if needed, then + // replace the instruction. + if (NewMI) { // Create ADD instruction for the Offset in case of 3-Ops LEA. if (hasLEAOffset(Offset)) { - NewMI = BuildMI(MBB, MI, DL, ADDri, DstR).addReg(DstR).add(Offset); - LLVM_DEBUG(NewMI->dump();); + if (OptIncDec && Offset.isImm() && + (Offset.getImm() == 1 || Offset.getImm() == -1)) { + unsigned NewOpc = + getINCDECFromLEA(MI.getOpcode(), Offset.getImm() == 1); + NewMI = BuildMI(MBB, I, MI.getDebugLoc(), TII->get(NewOpc), DestReg) + .addReg(DestReg); + LLVM_DEBUG(NewMI->dump();); + } else { + unsigned NewOpc = getADDriFromLEA(MI.getOpcode(), Offset); + NewMI = BuildMI(MBB, I, MI.getDebugLoc(), TII->get(NewOpc), DestReg) + .addReg(DestReg) + .add(Offset); + LLVM_DEBUG(NewMI->dump();); + } } - return NewMI; + + MBB.erase(I); + I = NewMI; + return; } + // Handle the rest of the cases with inefficient base register: - assert(SSDstR != BaseR && "SSDstR == BaseR should be handled already!"); + assert(DestReg != BaseReg && "DestReg == BaseReg should be handled already!"); assert(IsInefficientBase && "efficient base should be handled already!"); + // FIXME: Handle LEA64_32r. 
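processInstrForSlow3OpLEA splits a three-operand LEA into cheaper pieces: one or two ADDs (or an INC/DEC for a +/-1 displacement), or a two-operand LEA followed by an ADD of the displacement. A plain-integer sketch of the arithmetic being preserved; the variable names mirror the base/index/scale/disp fields and are not LLVM API.

// slow_lea_split_sketch.cpp -- the rewrites used for "slow LEA" targets
// compute the same address as the original 3-operand LEA.
#include <cassert>
#include <cstdint>

static uint64_t lea(uint64_t Base, uint64_t Index, uint64_t Scale, int64_t Disp) {
  return Base + Index * Scale + Disp;
}

int main() {
  uint64_t Base = 0x1000, Index = 0x20;
  int64_t Disp = 8;

  // Case 1: scale == 1 and the destination equals the base register:
  //   lea disp(%base,%index,1), %base  =>  add %index,%base ; add $disp,%base
  uint64_t Dst = Base;
  Dst += Index;                               // ADDrr
  Dst += Disp;                                // ADDri (or INC/DEC when disp is +/-1)
  assert(Dst == lea(Base, Index, 1, Disp));

  // Case 2: keep a 2-operand LEA and peel the displacement off:
  //   lea disp(%base,%index,scale), %dst  =>  lea (%base,%index,scale), %dst ; add $disp,%dst
  uint64_t Scale = 4;
  uint64_t Dst2 = lea(Base, Index, Scale, 0); // LEA without displacement
  Dst2 += Disp;                               // ADDri
  assert(Dst2 == lea(Base, Index, Scale, Disp));
  return 0;
}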
+ if (LEAOpcode == X86::LEA64_32r) + return; + // lea (%base,%index,1), %dst => mov %base,%dst; add %index,%dst if (IsScale1 && !hasLEAOffset(Offset)) { - bool BIK = Base.isKill() && BaseR != IndexR; - TII->copyPhysReg(MBB, MI, DL, DstR, BaseR, BIK); + bool BIK = Base.isKill() && BaseReg != IndexReg; + TII->copyPhysReg(MBB, MI, MI.getDebugLoc(), DestReg, BaseReg, BIK); LLVM_DEBUG(MI.getPrevNode()->dump();); - MachineInstr *NewMI = - BuildMI(MBB, MI, DL, ADDrr, DstR).addReg(DstR).add(Index); + unsigned NewOpc = getADDrrFromLEA(MI.getOpcode()); + NewMI = BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(NewOpc), DestReg) + .addReg(DestReg) + .add(Index); LLVM_DEBUG(NewMI->dump();); - return NewMI; + return; } + // lea offset(%base,%index,scale), %dst => // lea offset( ,%index,scale), %dst; add %base,%dst - MachineInstr *NewMI = BuildMI(MBB, MI, DL, TII->get(LEAOpcode)) - .add(Dst) - .addReg(0) - .add(Scale) - .add(Index) - .add(Offset) - .add(Segment); + NewMI = BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(LEAOpcode)) + .add(Dest) + .addReg(0) + .add(Scale) + .add(Index) + .add(Offset) + .add(Segment); LLVM_DEBUG(NewMI->dump();); - NewMI = BuildMI(MBB, MI, DL, ADDrr, DstR).addReg(DstR).add(Base); + unsigned NewOpc = getADDrrFromLEA(MI.getOpcode()); + NewMI = BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(NewOpc), DestReg) + .addReg(DestReg) + .add(Base); LLVM_DEBUG(NewMI->dump();); - return NewMI; + + MBB.erase(I); + I = NewMI; } diff --git a/lib/Target/X86/X86FixupSetCC.cpp b/lib/Target/X86/X86FixupSetCC.cpp index e2d4d1ede6f3..cbde280aa280 100644 --- a/lib/Target/X86/X86FixupSetCC.cpp +++ b/lib/Target/X86/X86FixupSetCC.cpp @@ -136,8 +136,8 @@ bool X86FixupSetCCPass::runOnMachineFunction(MachineFunction &MF) { const TargetRegisterClass *RC = MF.getSubtarget().is64Bit() ? &X86::GR32RegClass : &X86::GR32_ABCDRegClass; - unsigned ZeroReg = MRI->createVirtualRegister(RC); - unsigned InsertReg = MRI->createVirtualRegister(RC); + Register ZeroReg = MRI->createVirtualRegister(RC); + Register InsertReg = MRI->createVirtualRegister(RC); // Initialize a register with 0. 
This must go before the eflags def BuildMI(MBB, FlagsDefMI, MI.getDebugLoc(), TII->get(X86::MOV32r0), diff --git a/lib/Target/X86/X86FlagsCopyLowering.cpp b/lib/Target/X86/X86FlagsCopyLowering.cpp index 5ce3255ea96a..cfba06fb6533 100644 --- a/lib/Target/X86/X86FlagsCopyLowering.cpp +++ b/lib/Target/X86/X86FlagsCopyLowering.cpp @@ -721,8 +721,9 @@ CondRegArray X86FlagsCopyLoweringPass::collectCondsInRegs( for (MachineInstr &MI : llvm::reverse(llvm::make_range(MBB.begin(), TestPos))) { X86::CondCode Cond = X86::getCondFromSETCC(MI); - if (Cond != X86::COND_INVALID && !MI.mayStore() && MI.getOperand(0).isReg() && - TRI->isVirtualRegister(MI.getOperand(0).getReg())) { + if (Cond != X86::COND_INVALID && !MI.mayStore() && + MI.getOperand(0).isReg() && + Register::isVirtualRegister(MI.getOperand(0).getReg())) { assert(MI.getOperand(0).isDef() && "A non-storing SETcc should always define a register!"); CondRegs[Cond] = MI.getOperand(0).getReg(); @@ -739,7 +740,7 @@ CondRegArray X86FlagsCopyLoweringPass::collectCondsInRegs( unsigned X86FlagsCopyLoweringPass::promoteCondToReg( MachineBasicBlock &TestMBB, MachineBasicBlock::iterator TestPos, DebugLoc TestLoc, X86::CondCode Cond) { - unsigned Reg = MRI->createVirtualRegister(PromoteRC); + Register Reg = MRI->createVirtualRegister(PromoteRC); auto SetI = BuildMI(TestMBB, TestPos, TestLoc, TII->get(X86::SETCCr), Reg).addImm(Cond); (void)SetI; @@ -813,7 +814,7 @@ void X86FlagsCopyLoweringPass::rewriteArithmetic( MachineBasicBlock &MBB = *MI.getParent(); // Insert an instruction that will set the flag back to the desired value. - unsigned TmpReg = MRI->createVirtualRegister(PromoteRC); + Register TmpReg = MRI->createVirtualRegister(PromoteRC); auto AddI = BuildMI(MBB, MI.getIterator(), MI.getDebugLoc(), TII->get(X86::ADD8ri)) .addDef(TmpReg, RegState::Dead) @@ -974,7 +975,7 @@ void X86FlagsCopyLoweringPass::rewriteSetCarryExtended( // Now we need to turn this into a bitmask. We do this by subtracting it from // zero. - unsigned ZeroReg = MRI->createVirtualRegister(&X86::GR32RegClass); + Register ZeroReg = MRI->createVirtualRegister(&X86::GR32RegClass); BuildMI(MBB, SetPos, SetLoc, TII->get(X86::MOV32r0), ZeroReg); ZeroReg = AdjustReg(ZeroReg); @@ -999,7 +1000,7 @@ void X86FlagsCopyLoweringPass::rewriteSetCarryExtended( default: llvm_unreachable("Invalid SETB_C* opcode!"); } - unsigned ResultReg = MRI->createVirtualRegister(&SetBRC); + Register ResultReg = MRI->createVirtualRegister(&SetBRC); BuildMI(MBB, SetPos, SetLoc, TII->get(Sub), ResultReg) .addReg(ZeroReg) .addReg(ExtCondReg); diff --git a/lib/Target/X86/X86FloatingPoint.cpp b/lib/Target/X86/X86FloatingPoint.cpp index 074cf21d03f5..fcfb5bc91314 100644 --- a/lib/Target/X86/X86FloatingPoint.cpp +++ b/lib/Target/X86/X86FloatingPoint.cpp @@ -288,8 +288,8 @@ namespace { // Check if a COPY instruction is using FP registers. static bool isFPCopy(MachineInstr &MI) { - unsigned DstReg = MI.getOperand(0).getReg(); - unsigned SrcReg = MI.getOperand(1).getReg(); + Register DstReg = MI.getOperand(0).getReg(); + Register SrcReg = MI.getOperand(1).getReg(); return X86::RFP80RegClass.contains(DstReg) || X86::RFP80RegClass.contains(SrcReg); @@ -313,7 +313,7 @@ FunctionPass *llvm::createX86FloatingPointStackifierPass() { return new FPS(); } /// For example, this returns 3 for X86::FP3. 
static unsigned getFPReg(const MachineOperand &MO) { assert(MO.isReg() && "Expected an FP register!"); - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); assert(Reg >= X86::FP0 && Reg <= X86::FP6 && "Expected FP register!"); return Reg - X86::FP0; } diff --git a/lib/Target/X86/X86FrameLowering.cpp b/lib/Target/X86/X86FrameLowering.cpp index e310fe069117..1b469a814adc 100644 --- a/lib/Target/X86/X86FrameLowering.cpp +++ b/lib/Target/X86/X86FrameLowering.cpp @@ -35,8 +35,8 @@ using namespace llvm; X86FrameLowering::X86FrameLowering(const X86Subtarget &STI, - unsigned StackAlignOverride) - : TargetFrameLowering(StackGrowsDown, StackAlignOverride, + MaybeAlign StackAlignOverride) + : TargetFrameLowering(StackGrowsDown, StackAlignOverride.valueOrOne(), STI.is64Bit() ? -8 : -4), STI(STI), TII(*STI.getInstrInfo()), TRI(STI.getRegisterInfo()) { // Cache a bunch of frame-related predicates for this subtarget. @@ -176,7 +176,7 @@ static unsigned findDeadCallerSavedReg(MachineBasicBlock &MBB, MachineOperand &MO = MBBI->getOperand(i); if (!MO.isReg() || MO.isDef()) continue; - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); if (!Reg) continue; for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) @@ -216,7 +216,7 @@ flagsNeedToBePreservedBeforeTheTerminators(const MachineBasicBlock &MBB) { for (const MachineOperand &MO : MI.operands()) { if (!MO.isReg()) continue; - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); if (Reg != X86::EFLAGS) continue; @@ -995,11 +995,11 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, bool NeedsWinCFI = NeedsWin64CFI || NeedsWinFPO; bool NeedsDwarfCFI = !IsWin64Prologue && (MMI.hasDebugInfo() || Fn.needsUnwindTableEntry()); - unsigned FramePtr = TRI->getFrameRegister(MF); - const unsigned MachineFramePtr = + Register FramePtr = TRI->getFrameRegister(MF); + const Register MachineFramePtr = STI.isTarget64BitILP32() - ? getX86SubSuperRegister(FramePtr, 64) : FramePtr; - unsigned BasePtr = TRI->getBaseRegister(); + ? Register(getX86SubSuperRegister(FramePtr, 64)) : FramePtr; + Register BasePtr = TRI->getBaseRegister(); bool HasWinCFI = false; // Debug location must be unknown since the first debug location is used @@ -1016,14 +1016,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, X86FI->getCalleeSavedFrameSize() - TailCallReturnAddrDelta); bool UseStackProbe = !STI.getTargetLowering()->getStackProbeSymbolName(MF).empty(); - - // The default stack probe size is 4096 if the function has no stackprobesize - // attribute. - unsigned StackProbeSize = 4096; - if (Fn.hasFnAttribute("stack-probe-size")) - Fn.getFnAttribute("stack-probe-size") - .getValueAsString() - .getAsInteger(0, StackProbeSize); + unsigned StackProbeSize = STI.getTargetLowering()->getStackProbeSize(MF); // Re-align the stack on 64-bit if the x86-interrupt calling convention is // used and an error code was pushed, since the x86-64 ABI requires a 16-byte @@ -1081,7 +1074,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, int stackGrowth = -SlotSize; // Find the funclet establisher parameter - unsigned Establisher = X86::NoRegister; + Register Establisher = X86::NoRegister; if (IsClrFunclet) Establisher = Uses64BitFramePtr ? 
X86::RCX : X86::ECX; else if (IsFunclet) @@ -1192,7 +1185,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, (MBBI->getOpcode() == X86::PUSH32r || MBBI->getOpcode() == X86::PUSH64r)) { PushedRegs = true; - unsigned Reg = MBBI->getOperand(0).getReg(); + Register Reg = MBBI->getOperand(0).getReg(); ++MBBI; if (!HasFP && NeedsDwarfCFI) { @@ -1396,9 +1389,13 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, int FI; if (unsigned Reg = TII.isStoreToStackSlot(FrameInstr, FI)) { if (X86::FR64RegClass.contains(Reg)) { + int Offset; unsigned IgnoredFrameReg; - int Offset = getFrameIndexReference(MF, FI, IgnoredFrameReg); - Offset += SEHFrameOffset; + if (IsWin64Prologue && IsFunclet) + Offset = getWin64EHFrameIndexRef(MF, FI, IgnoredFrameReg); + else + Offset = getFrameIndexReference(MF, FI, IgnoredFrameReg) + + SEHFrameOffset; HasWinCFI = true; assert(!NeedsWinFPO && "SEH_SaveXMM incompatible with FPO data"); @@ -1554,9 +1551,13 @@ X86FrameLowering::getPSPSlotOffsetFromSP(const MachineFunction &MF) const { unsigned X86FrameLowering::getWinEHFuncletFrameSize(const MachineFunction &MF) const { + const X86MachineFunctionInfo *X86FI = MF.getInfo(); // This is the size of the pushed CSRs. - unsigned CSSize = - MF.getInfo()->getCalleeSavedFrameSize(); + unsigned CSSize = X86FI->getCalleeSavedFrameSize(); + // This is the size of callee saved XMMs. + const auto& WinEHXMMSlotInfo = X86FI->getWinEHXMMSlotInfo(); + unsigned XMMSize = WinEHXMMSlotInfo.size() * + TRI->getSpillSize(X86::VR128RegClass); // This is the amount of stack a funclet needs to allocate. unsigned UsedSize; EHPersonality Personality = @@ -1576,7 +1577,7 @@ X86FrameLowering::getWinEHFuncletFrameSize(const MachineFunction &MF) const { unsigned FrameSizeMinusRBP = alignTo(CSSize + UsedSize, getStackAlignment()); // Subtract out the size of the callee saved registers. This is how much stack // each funclet will allocate. - return FrameSizeMinusRBP - CSSize; + return FrameSizeMinusRBP + XMMSize - CSSize; } static bool isTailCallOpcode(unsigned Opc) { @@ -1597,9 +1598,9 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF, DL = MBBI->getDebugLoc(); // standard x86_64 and NaCl use 64-bit frame/stack pointers, x32 - 32-bit. const bool Is64BitILP32 = STI.isTarget64BitILP32(); - unsigned FramePtr = TRI->getFrameRegister(MF); + Register FramePtr = TRI->getFrameRegister(MF); unsigned MachineFramePtr = - Is64BitILP32 ? getX86SubSuperRegister(FramePtr, 64) : FramePtr; + Is64BitILP32 ? 
Register(getX86SubSuperRegister(FramePtr, 64)) : FramePtr; bool IsWin64Prologue = MF.getTarget().getMCAsmInfo()->usesWindowsCFI(); bool NeedsWin64CFI = @@ -1850,6 +1851,20 @@ int X86FrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI, return Offset + FPDelta; } +int X86FrameLowering::getWin64EHFrameIndexRef(const MachineFunction &MF, + int FI, unsigned &FrameReg) const { + const MachineFrameInfo &MFI = MF.getFrameInfo(); + const X86MachineFunctionInfo *X86FI = MF.getInfo(); + const auto& WinEHXMMSlotInfo = X86FI->getWinEHXMMSlotInfo(); + const auto it = WinEHXMMSlotInfo.find(FI); + + if (it == WinEHXMMSlotInfo.end()) + return getFrameIndexReference(MF, FI, FrameReg); + + FrameReg = TRI->getStackRegister(); + return alignTo(MFI.getMaxCallFrameSize(), getStackAlignment()) + it->second; +} + int X86FrameLowering::getFrameIndexReferenceSP(const MachineFunction &MF, int FI, unsigned &FrameReg, int Adjustment) const { @@ -1948,6 +1963,8 @@ bool X86FrameLowering::assignCalleeSavedSpillSlots( X86MachineFunctionInfo *X86FI = MF.getInfo(); unsigned CalleeSavedFrameSize = 0; + unsigned XMMCalleeSavedFrameSize = 0; + auto &WinEHXMMSlotInfo = X86FI->getWinEHXMMSlotInfo(); int SpillSlotOffset = getOffsetOfLocalArea() + X86FI->getTCReturnAddrDelta(); int64_t TailCallReturnAddrDelta = X86FI->getTCReturnAddrDelta(); @@ -1984,7 +2001,7 @@ bool X86FrameLowering::assignCalleeSavedSpillSlots( // Since emitPrologue and emitEpilogue will handle spilling and restoring of // the frame register, we can delete it from CSI list and not have to worry // about avoiding it later. - unsigned FPReg = TRI->getFrameRegister(MF); + Register FPReg = TRI->getFrameRegister(MF); for (unsigned i = 0; i < CSI.size(); ++i) { if (TRI->regsOverlap(CSI[i].getReg(),FPReg)) { CSI.erase(CSI.begin() + i); @@ -2025,12 +2042,20 @@ bool X86FrameLowering::assignCalleeSavedSpillSlots( unsigned Size = TRI->getSpillSize(*RC); unsigned Align = TRI->getSpillAlignment(*RC); // ensure alignment - SpillSlotOffset -= std::abs(SpillSlotOffset) % Align; + assert(SpillSlotOffset < 0 && "SpillSlotOffset should always < 0 on X86"); + SpillSlotOffset = -alignTo(-SpillSlotOffset, Align); + // spill into slot SpillSlotOffset -= Size; int SlotIndex = MFI.CreateFixedSpillStackObject(Size, SpillSlotOffset); CSI[i - 1].setFrameIdx(SlotIndex); MFI.ensureMaxAlignment(Align); + + // Save the start offset and size of XMM in stack frame for funclets. + if (X86::VR128RegClass.contains(Reg)) { + WinEHXMMSlotInfo[SlotIndex] = XMMCalleeSavedFrameSize; + XMMCalleeSavedFrameSize += Size; + } } return true; @@ -2200,7 +2225,7 @@ void X86FrameLowering::determineCalleeSaves(MachineFunction &MF, // Spill the BasePtr if it's used. 
if (TRI->hasBasePointer(MF)){ - unsigned BasePtr = TRI->getBaseRegister(); + Register BasePtr = TRI->getBaseRegister(); if (STI.isTarget64BitILP32()) BasePtr = getX86SubSuperRegister(BasePtr, 64); SavedRegs.set(BasePtr); @@ -2212,7 +2237,7 @@ HasNestArgument(const MachineFunction *MF) { const Function &F = MF->getFunction(); for (Function::const_arg_iterator I = F.arg_begin(), E = F.arg_end(); I != E; I++) { - if (I->hasNestAttr()) + if (I->hasNestAttr() && !I->use_empty()) return true; } return false; @@ -2244,7 +2269,8 @@ GetScratchRegister(bool Is64Bit, bool IsLP64, const MachineFunction &MF, bool Pr bool IsNested = HasNestArgument(&MF); if (CallingConvention == CallingConv::X86_FastCall || - CallingConvention == CallingConv::Fast) { + CallingConvention == CallingConv::Fast || + CallingConvention == CallingConv::Tail) { if (IsNested) report_fatal_error("Segmented stacks does not support fastcall with " "nested function."); @@ -2525,6 +2551,18 @@ static unsigned getHiPELiteral( + " required but not provided"); } +// Return true if there are no non-ehpad successors to MBB and there are no +// non-meta instructions between MBBI and MBB.end(). +static bool blockEndIsUnreachable(const MachineBasicBlock &MBB, + MachineBasicBlock::const_iterator MBBI) { + return std::all_of( + MBB.succ_begin(), MBB.succ_end(), + [](const MachineBasicBlock *Succ) { return Succ->isEHPad(); }) && + std::all_of(MBBI, MBB.end(), [](const MachineInstr &MI) { + return MI.isMetaInstruction(); + }); +} + /// Erlang programs may need a special prologue to handle the stack size they /// might need at runtime. That is because Erlang/OTP does not implement a C /// stack but uses a custom implementation of hybrid stack/heap architecture. @@ -2758,7 +2796,7 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, unsigned Opcode = I->getOpcode(); bool isDestroy = Opcode == TII.getCallFrameDestroyOpcode(); DebugLoc DL = I->getDebugLoc(); - uint64_t Amount = !reserveCallFrame ? TII.getFrameSize(*I) : 0; + uint64_t Amount = TII.getFrameSize(*I); uint64_t InternalAmt = (isDestroy || Amount) ? TII.getFrameAdjustment(*I) : 0; I = MBB.erase(I); auto InsertPos = skipDebugInstructionsForward(I, MBB.end()); @@ -2847,7 +2885,7 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, return I; } - if (isDestroy && InternalAmt) { + if (isDestroy && InternalAmt && !blockEndIsUnreachable(MBB, I)) { // If we are performing frame pointer elimination and if the callee pops // something off the stack pointer, add it back. We do this until we have // more advanced stack pointer tracking ability. 
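Both the new spill-slot rounding (SpillSlotOffset = -alignTo(-SpillSlotOffset, Align)) and getWin64EHFrameIndexRef (alignTo(MaxCallFrameSize, StackAlignment) + slot offset) reduce to rounding a magnitude up to an alignment boundary. A minimal alignTo written for illustration, assuming power-of-two alignments, rather than LLVM's own helper:

// align_to_sketch.cpp -- round a value up to a multiple of Align and apply it
// the way the frame-lowering hunks above do.
#include <cassert>
#include <cstdint>

static uint64_t alignToSketch(uint64_t Value, uint64_t Align) {
  assert(Align != 0 && (Align & (Align - 1)) == 0 && "power-of-two alignment");
  return (Value + Align - 1) & ~(Align - 1);
}

int main() {
  // Negative, downward-growing spill offsets: align the magnitude, keep the sign.
  int64_t SpillSlotOffset = -20;
  SpillSlotOffset = -static_cast<int64_t>(alignToSketch(-SpillSlotOffset, 16));
  assert(SpillSlotOffset == -32);

  // Win64 EH XMM slots: measured from the stack pointer past the (aligned)
  // outgoing-call area, plus the per-slot offset recorded at spill time.
  uint64_t MaxCallFrameSize = 40, StackAlign = 16, SlotOffset = 16;
  uint64_t FrameOffset = alignToSketch(MaxCallFrameSize, StackAlign) + SlotOffset;
  assert(FrameOffset == 64);
  return 0;
}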
@@ -2912,8 +2950,8 @@ MachineBasicBlock::iterator X86FrameLowering::restoreWin32EHStackPointers( "restoring EBP/ESI on non-32-bit target"); MachineFunction &MF = *MBB.getParent(); - unsigned FramePtr = TRI->getFrameRegister(MF); - unsigned BasePtr = TRI->getBaseRegister(); + Register FramePtr = TRI->getFrameRegister(MF); + Register BasePtr = TRI->getBaseRegister(); WinEHFuncInfo &FuncInfo = *MF.getWinEHFuncInfo(); X86MachineFunctionInfo *X86FI = MF.getInfo(); MachineFrameInfo &MFI = MF.getFrameInfo(); diff --git a/lib/Target/X86/X86FrameLowering.h b/lib/Target/X86/X86FrameLowering.h index d32746e3a36e..2103d6471ead 100644 --- a/lib/Target/X86/X86FrameLowering.h +++ b/lib/Target/X86/X86FrameLowering.h @@ -25,7 +25,7 @@ class X86RegisterInfo; class X86FrameLowering : public TargetFrameLowering { public: - X86FrameLowering(const X86Subtarget &STI, unsigned StackAlignOverride); + X86FrameLowering(const X86Subtarget &STI, MaybeAlign StackAlignOverride); // Cached subtarget predicates. @@ -99,6 +99,8 @@ public: int getFrameIndexReference(const MachineFunction &MF, int FI, unsigned &FrameReg) const override; + int getWin64EHFrameIndexRef(const MachineFunction &MF, + int FI, unsigned &SPReg) const; int getFrameIndexReferenceSP(const MachineFunction &MF, int FI, unsigned &SPReg, int Adjustment) const; int getFrameIndexReferencePreferSP(const MachineFunction &MF, int FI, diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp index 95d31e62cafc..5b546d42d98a 100644 --- a/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -253,6 +253,11 @@ namespace { return tryFoldLoad(P, P, N, Base, Scale, Index, Disp, Segment); } + bool tryFoldBroadcast(SDNode *Root, SDNode *P, SDValue N, + SDValue &Base, SDValue &Scale, + SDValue &Index, SDValue &Disp, + SDValue &Segment); + /// Implement addressing mode selection for inline asm expressions. bool SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintID, @@ -362,6 +367,11 @@ namespace { if (User->getNumOperands() != 2) continue; + // If this can match to INC/DEC, don't count it as a use. + if (User->getOpcode() == ISD::ADD && + (isOneConstant(SDValue(N, 0)) || isAllOnesConstant(SDValue(N, 0)))) + continue; + // Immediates that are used for offsets as part of stack // manipulation should be left alone. 
These are typically // used to indicate SP offsets for argument passing and @@ -502,8 +512,10 @@ namespace { bool shrinkAndImmediate(SDNode *N); bool isMaskZeroExtended(SDNode *N) const; bool tryShiftAmountMod(SDNode *N); + bool combineIncDecVector(SDNode *Node); bool tryShrinkShlLogicImm(SDNode *N); bool tryVPTESTM(SDNode *Root, SDValue Setcc, SDValue Mask); + bool tryMatchBitSelect(SDNode *N); MachineSDNode *emitPCMPISTR(unsigned ROpc, unsigned MOpc, bool MayFoldLoad, const SDLoc &dl, MVT VT, SDNode *Node); @@ -746,7 +758,7 @@ static bool isCalleeLoad(SDValue Callee, SDValue &Chain, bool HasCallSeq) { return false; LoadSDNode *LD = dyn_cast(Callee.getNode()); if (!LD || - LD->isVolatile() || + !LD->isSimple() || LD->getAddressingMode() != ISD::UNINDEXED || LD->getExtensionType() != ISD::NON_EXTLOAD) return false; @@ -873,10 +885,9 @@ void X86DAGToDAGISel::PreprocessISelDAG() { case ISD::FRINT: Imm = 0x4; break; } SDLoc dl(N); - SDValue Res = CurDAG->getNode(X86ISD::VRNDSCALE, dl, - N->getValueType(0), - N->getOperand(0), - CurDAG->getConstant(Imm, dl, MVT::i8)); + SDValue Res = CurDAG->getNode( + X86ISD::VRNDSCALE, dl, N->getValueType(0), N->getOperand(0), + CurDAG->getTargetConstant(Imm, dl, MVT::i8)); --I; CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res); ++I; @@ -2305,10 +2316,10 @@ bool X86DAGToDAGISel::selectScalarSSELoad(SDNode *Root, SDNode *Parent, return false; // We can allow a full vector load here since narrowing a load is ok unless - // it's volatile. + // it's volatile or atomic. if (ISD::isNON_EXTLoad(N.getNode())) { LoadSDNode *LD = cast(N); - if (!LD->isVolatile() && + if (LD->isSimple() && IsProfitableToFold(N, LD, Root) && IsLegalToFold(N, Parent, Root, OptLevel)) { PatternNodeWithChain = N; @@ -2464,6 +2475,37 @@ bool X86DAGToDAGISel::selectLEAAddr(SDValue N, Complexity += 2; } + // Heuristic: try harder to form an LEA from ADD if the operands set flags. + // Unlike ADD, LEA does not affect flags, so we will be less likely to require + // duplicating flag-producing instructions later in the pipeline. + if (N.getOpcode() == ISD::ADD) { + auto isMathWithFlags = [](SDValue V) { + switch (V.getOpcode()) { + case X86ISD::ADD: + case X86ISD::SUB: + case X86ISD::ADC: + case X86ISD::SBB: + /* TODO: These opcodes can be added safely, but we may want to justify + their inclusion for different reasons (better for reg-alloc). + case X86ISD::SMUL: + case X86ISD::UMUL: + case X86ISD::OR: + case X86ISD::XOR: + case X86ISD::AND: + */ + // Value 1 is the flag output of the node - verify it's not dead. + return !SDValue(V.getNode(), 1).use_empty(); + default: + return false; + } + }; + // TODO: This could be an 'or' rather than 'and' to make the transform more + // likely to happen. We might want to factor in whether there's a + // load folding opportunity for the math op that disappears with LEA. 
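The complexity bump above prefers forming an LEA over an ADD when either addend already produces live EFLAGS, because an ADD would overwrite those flags while LEA leaves them alone. A toy model of that reasoning, with an explicit flag slot standing in for EFLAGS and no LLVM types involved:

// lea_flags_sketch.cpp -- toy machine state showing why an LEA keeps earlier
// flags usable while an ADD would clobber them.
#include <cassert>
#include <cstdint>

struct ToyState {
  bool ZeroFlag = false; // stands in for EFLAGS.ZF
};

// ADD-style op: computes a sum and overwrites the flags.
static uint64_t addWithFlags(ToyState &S, uint64_t A, uint64_t B) {
  uint64_t R = A + B;
  S.ZeroFlag = (R == 0);
  return R;
}

// LEA-style op: computes the same sum but leaves the flags untouched.
static uint64_t leaNoFlags(const ToyState &, uint64_t A, uint64_t B) {
  return A + B;
}

int main() {
  ToyState S;
  uint64_t X = addWithFlags(S, 5, static_cast<uint64_t>(-5)); // sets ZF; flags are live
  bool FlagsBefore = S.ZeroFlag;

  uint64_t Sum = leaNoFlags(S, X, 7);     // LEA: ZF from the earlier ADD survives
  assert(S.ZeroFlag == FlagsBefore);      // a later JE could still use it

  uint64_t Sum2 = addWithFlags(S, X, 7);  // ADD: ZF now reflects this result instead
  assert(Sum == Sum2 && S.ZeroFlag == false);
  return 0;
}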
+ if (isMathWithFlags(N.getOperand(0)) && isMathWithFlags(N.getOperand(1))) + Complexity++; + } + if (AM.Disp) Complexity++; @@ -2544,6 +2586,7 @@ bool X86DAGToDAGISel::tryFoldLoad(SDNode *Root, SDNode *P, SDValue N, SDValue &Base, SDValue &Scale, SDValue &Index, SDValue &Disp, SDValue &Segment) { + assert(Root && P && "Unknown root/parent nodes"); if (!ISD::isNON_EXTLoad(N.getNode()) || !IsProfitableToFold(N, P, Root) || !IsLegalToFold(N, P, Root, OptLevel)) @@ -2553,6 +2596,20 @@ bool X86DAGToDAGISel::tryFoldLoad(SDNode *Root, SDNode *P, SDValue N, N.getOperand(1), Base, Scale, Index, Disp, Segment); } +bool X86DAGToDAGISel::tryFoldBroadcast(SDNode *Root, SDNode *P, SDValue N, + SDValue &Base, SDValue &Scale, + SDValue &Index, SDValue &Disp, + SDValue &Segment) { + assert(Root && P && "Unknown root/parent nodes"); + if (N->getOpcode() != X86ISD::VBROADCAST_LOAD || + !IsProfitableToFold(N, P, Root) || + !IsLegalToFold(N, P, Root, OptLevel)) + return false; + + return selectAddr(N.getNode(), + N.getOperand(1), Base, Scale, Index, Disp, Segment); +} + /// Return an SDNode that returns the value of the global base register. /// Output instructions required to initialize the global base register, /// if necessary. @@ -3302,8 +3359,12 @@ bool X86DAGToDAGISel::matchBitExtract(SDNode *Node) { SDValue ImplDef = SDValue( CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, MVT::i32), 0); insertDAGNode(*CurDAG, SDValue(Node, 0), ImplDef); - NBits = CurDAG->getTargetInsertSubreg(X86::sub_8bit, DL, MVT::i32, ImplDef, - NBits); + + SDValue SRIdxVal = CurDAG->getTargetConstant(X86::sub_8bit, DL, MVT::i32); + insertDAGNode(*CurDAG, SDValue(Node, 0), SRIdxVal); + NBits = SDValue( + CurDAG->getMachineNode(TargetOpcode::INSERT_SUBREG, DL, MVT::i32, ImplDef, + NBits, SRIdxVal), 0); insertDAGNode(*CurDAG, SDValue(Node, 0), NBits); if (Subtarget->hasBMI2()) { @@ -3400,8 +3461,9 @@ MachineSDNode *X86DAGToDAGISel::matchBEXTRFromAndImm(SDNode *Node) { // TODO: Maybe load folding, greater than 32-bit masks, or a guarantee of LICM // hoisting the move immediate would make it worthwhile with a less optimal // BEXTR? - if (!Subtarget->hasTBM() && - !(Subtarget->hasBMI() && Subtarget->hasFastBEXTR())) + bool PreferBEXTR = + Subtarget->hasTBM() || (Subtarget->hasBMI() && Subtarget->hasFastBEXTR()); + if (!PreferBEXTR && !Subtarget->hasBMI2()) return nullptr; // Must have a shift right. @@ -3440,23 +3502,50 @@ MachineSDNode *X86DAGToDAGISel::matchBEXTRFromAndImm(SDNode *Node) { if (Shift + MaskSize > NVT.getSizeInBits()) return nullptr; - SDValue New = CurDAG->getTargetConstant(Shift | (MaskSize << 8), dl, NVT); - unsigned ROpc = NVT == MVT::i64 ? X86::BEXTRI64ri : X86::BEXTRI32ri; - unsigned MOpc = NVT == MVT::i64 ? X86::BEXTRI64mi : X86::BEXTRI32mi; + // BZHI, if available, is always fast, unlike BEXTR. But even if we decide + // that we can't use BEXTR, it is only worthwhile using BZHI if the mask + // does not fit into 32 bits. Load folding is not a sufficient reason. + if (!PreferBEXTR && MaskSize <= 32) + return nullptr; - // BMI requires the immediate to placed in a register. - if (!Subtarget->hasTBM()) { - ROpc = NVT == MVT::i64 ? X86::BEXTR64rr : X86::BEXTR32rr; - MOpc = NVT == MVT::i64 ? X86::BEXTR64rm : X86::BEXTR32rm; + SDValue Control; + unsigned ROpc, MOpc; + + if (!PreferBEXTR) { + assert(Subtarget->hasBMI2() && "We must have BMI2's BZHI then."); + // If we can't make use of BEXTR then we can't fuse shift+mask stages. + // Let's perform the mask first, and apply shift later. 
Note that we need to + // widen the mask to account for the fact that we'll apply shift afterwards! + Control = CurDAG->getTargetConstant(Shift + MaskSize, dl, NVT); + ROpc = NVT == MVT::i64 ? X86::BZHI64rr : X86::BZHI32rr; + MOpc = NVT == MVT::i64 ? X86::BZHI64rm : X86::BZHI32rm; unsigned NewOpc = NVT == MVT::i64 ? X86::MOV32ri64 : X86::MOV32ri; - New = SDValue(CurDAG->getMachineNode(NewOpc, dl, NVT, New), 0); + Control = SDValue(CurDAG->getMachineNode(NewOpc, dl, NVT, Control), 0); + } else { + // The 'control' of BEXTR has the pattern of: + // [15...8 bit][ 7...0 bit] location + // [ bit count][ shift] name + // I.e. 0b000000011'00000001 means (x >> 0b1) & 0b11 + Control = CurDAG->getTargetConstant(Shift | (MaskSize << 8), dl, NVT); + if (Subtarget->hasTBM()) { + ROpc = NVT == MVT::i64 ? X86::BEXTRI64ri : X86::BEXTRI32ri; + MOpc = NVT == MVT::i64 ? X86::BEXTRI64mi : X86::BEXTRI32mi; + } else { + assert(Subtarget->hasBMI() && "We must have BMI1's BEXTR then."); + // BMI requires the immediate to placed in a register. + ROpc = NVT == MVT::i64 ? X86::BEXTR64rr : X86::BEXTR32rr; + MOpc = NVT == MVT::i64 ? X86::BEXTR64rm : X86::BEXTR32rm; + unsigned NewOpc = NVT == MVT::i64 ? X86::MOV32ri64 : X86::MOV32ri; + Control = SDValue(CurDAG->getMachineNode(NewOpc, dl, NVT, Control), 0); + } } MachineSDNode *NewNode; SDValue Input = N0->getOperand(0); SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; if (tryFoldLoad(Node, N0.getNode(), Input, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) { - SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, New, Input.getOperand(0) }; + SDValue Ops[] = { + Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Control, Input.getOperand(0)}; SDVTList VTs = CurDAG->getVTList(NVT, MVT::i32, MVT::Other); NewNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops); // Update the chain. @@ -3464,7 +3553,15 @@ MachineSDNode *X86DAGToDAGISel::matchBEXTRFromAndImm(SDNode *Node) { // Record the mem-refs CurDAG->setNodeMemRefs(NewNode, {cast(Input)->getMemOperand()}); } else { - NewNode = CurDAG->getMachineNode(ROpc, dl, NVT, MVT::i32, Input, New); + NewNode = CurDAG->getMachineNode(ROpc, dl, NVT, MVT::i32, Input, Control); + } + + if (!PreferBEXTR) { + // We still need to apply the shift. + SDValue ShAmt = CurDAG->getTargetConstant(Shift, dl, NVT); + unsigned NewOpc = NVT == MVT::i64 ? X86::SHR64ri : X86::SHR32ri; + NewNode = + CurDAG->getMachineNode(NewOpc, dl, NVT, SDValue(NewNode, 0), ShAmt); } return NewNode; @@ -3735,6 +3832,52 @@ bool X86DAGToDAGISel::tryShrinkShlLogicImm(SDNode *N) { return true; } +/// Convert vector increment or decrement to sub/add with an all-ones constant: +/// add X, <1, 1...> --> sub X, <-1, -1...> +/// sub X, <1, 1...> --> add X, <-1, -1...> +/// The all-ones vector constant can be materialized using a pcmpeq instruction +/// that is commonly recognized as an idiom (has no register dependency), so +/// that's better/smaller than loading a splat 1 constant. 
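The rewritten matchBEXTRFromAndImm chooses between one BEXTR (control byte: shift in bits 7:0, field length in bits 15:8) and, when BEXTR is not worthwhile, a BZHI with a widened index followed by a shift. A standalone software model of both sequences, asserting they extract the same field; the helper names are invented for the sketch.

// bextr_vs_bzhi_sketch.cpp -- software models of the two sequences chosen
// above; both must yield (x >> Shift) with the low MaskSize bits kept.
#include <cassert>
#include <cstdint>

// BEXTR model: control[7:0] = start bit, control[15:8] = length.
static uint64_t bextrModel(uint64_t X, uint16_t Control) {
  unsigned Start = Control & 0xFF;
  unsigned Len = (Control >> 8) & 0xFF;
  if (Len >= 64)
    return X >> Start;
  return (X >> Start) & ((1ULL << Len) - 1);
}

// BZHI model: zero all bits at positions >= Index.
static uint64_t bzhiModel(uint64_t X, unsigned Index) {
  if (Index >= 64)
    return X;
  return X & ((1ULL << Index) - 1);
}

int main() {
  uint64_t X = 0x123456789abcdef0ULL;
  unsigned Shift = 12, MaskSize = 20;

  // Sequence 1 (PreferBEXTR): one BEXTR with the packed control word.
  uint16_t Control = static_cast<uint16_t>(Shift | (MaskSize << 8));
  uint64_t ViaBextr = bextrModel(X, Control);

  // Sequence 2 (BZHI path): mask first with a widened index, shift afterwards.
  uint64_t ViaBzhi = bzhiModel(X, Shift + MaskSize) >> Shift;

  assert(ViaBextr == ((X >> Shift) & ((1ULL << MaskSize) - 1)));
  assert(ViaBextr == ViaBzhi);
  return 0;
}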
+bool X86DAGToDAGISel::combineIncDecVector(SDNode *Node) { + assert((Node->getOpcode() == ISD::ADD || Node->getOpcode() == ISD::SUB) && + "Unexpected opcode for increment/decrement transform"); + + EVT VT = Node->getValueType(0); + assert(VT.isVector() && "Should only be called for vectors."); + + SDValue X = Node->getOperand(0); + SDValue OneVec = Node->getOperand(1); + + APInt SplatVal; + if (!X86::isConstantSplat(OneVec, SplatVal) || !SplatVal.isOneValue()) + return false; + + SDLoc DL(Node); + SDValue OneConstant, AllOnesVec; + + APInt Ones = APInt::getAllOnesValue(32); + assert(VT.getSizeInBits() % 32 == 0 && + "Expected bit count to be a multiple of 32"); + OneConstant = CurDAG->getConstant(Ones, DL, MVT::i32); + insertDAGNode(*CurDAG, X, OneConstant); + + unsigned NumElts = VT.getSizeInBits() / 32; + assert(NumElts > 0 && "Expected to get non-empty vector."); + AllOnesVec = CurDAG->getSplatBuildVector(MVT::getVectorVT(MVT::i32, NumElts), + DL, OneConstant); + insertDAGNode(*CurDAG, X, AllOnesVec); + + AllOnesVec = CurDAG->getBitcast(VT, AllOnesVec); + insertDAGNode(*CurDAG, X, AllOnesVec); + + unsigned NewOpcode = Node->getOpcode() == ISD::ADD ? ISD::SUB : ISD::ADD; + SDValue NewNode = CurDAG->getNode(NewOpcode, DL, VT, X, AllOnesVec); + + ReplaceNode(Node, NewNode.getNode()); + SelectCode(NewNode.getNode()); + return true; +} + /// If the high bits of an 'and' operand are known zero, try setting the /// high bits of an 'and' constant operand to produce a smaller encoding by /// creating a small, sign-extended negative immediate rather than a large @@ -3975,12 +4118,18 @@ bool X86DAGToDAGISel::tryVPTESTM(SDNode *Root, SDValue Setcc, if (CC != ISD::SETEQ && CC != ISD::SETNE) return false; - // See if we're comparing against zero. This should have been canonicalized - // to RHS during lowering. - if (!ISD::isBuildVectorAllZeros(Setcc.getOperand(1).getNode())) + SDValue SetccOp0 = Setcc.getOperand(0); + SDValue SetccOp1 = Setcc.getOperand(1); + + // Canonicalize the all zero vector to the RHS. + if (ISD::isBuildVectorAllZeros(SetccOp0.getNode())) + std::swap(SetccOp0, SetccOp1); + + // See if we're comparing against zero. + if (!ISD::isBuildVectorAllZeros(SetccOp1.getNode())) return false; - SDValue N0 = Setcc.getOperand(0); + SDValue N0 = SetccOp0; MVT CmpVT = N0.getSimpleValueType(); MVT CmpSVT = CmpVT.getVectorElementType(); @@ -4027,13 +4176,14 @@ bool X86DAGToDAGISel::tryVPTESTM(SDNode *Root, SDValue Setcc, auto findBroadcastedOp = [](SDValue Src, MVT CmpSVT, SDNode *&Parent) { // Look through single use bitcasts. 
- if (Src.getOpcode() == ISD::BITCAST && Src.hasOneUse()) - Src = Src.getOperand(0); - - if (Src.getOpcode() == X86ISD::VBROADCAST && Src.hasOneUse()) { + if (Src.getOpcode() == ISD::BITCAST && Src.hasOneUse()) { Parent = Src.getNode(); Src = Src.getOperand(0); - if (Src.getSimpleValueType() == CmpSVT) + } + + if (Src.getOpcode() == X86ISD::VBROADCAST_LOAD && Src.hasOneUse()) { + auto *MemIntr = cast(Src); + if (MemIntr->getMemoryVT().getSizeInBits() == CmpSVT.getSizeInBits()) return Src; } @@ -4045,17 +4195,18 @@ bool X86DAGToDAGISel::tryVPTESTM(SDNode *Root, SDValue Setcc, bool FoldedBCast = false; if (!FoldedLoad && CanFoldLoads && (CmpSVT == MVT::i32 || CmpSVT == MVT::i64)) { - SDNode *ParentNode = nullptr; + SDNode *ParentNode = N0.getNode(); if ((Load = findBroadcastedOp(Src1, CmpSVT, ParentNode))) { - FoldedBCast = tryFoldLoad(Root, ParentNode, Load, Tmp0, - Tmp1, Tmp2, Tmp3, Tmp4); + FoldedBCast = tryFoldBroadcast(Root, ParentNode, Load, Tmp0, + Tmp1, Tmp2, Tmp3, Tmp4); } // Try the other operand. if (!FoldedBCast) { + SDNode *ParentNode = N0.getNode(); if ((Load = findBroadcastedOp(Src0, CmpSVT, ParentNode))) { - FoldedBCast = tryFoldLoad(Root, ParentNode, Load, Tmp0, - Tmp1, Tmp2, Tmp3, Tmp4); + FoldedBCast = tryFoldBroadcast(Root, ParentNode, Load, Tmp0, + Tmp1, Tmp2, Tmp3, Tmp4); if (FoldedBCast) std::swap(Src0, Src1); } @@ -4125,7 +4276,7 @@ bool X86DAGToDAGISel::tryVPTESTM(SDNode *Root, SDValue Setcc, // Update the chain. ReplaceUses(Load.getValue(1), SDValue(CNode, 1)); // Record the mem-refs - CurDAG->setNodeMemRefs(CNode, {cast(Load)->getMemOperand()}); + CurDAG->setNodeMemRefs(CNode, {cast(Load)->getMemOperand()}); } else { if (IsMasked) CNode = CurDAG->getMachineNode(Opc, dl, MaskVT, InMask, Src0, Src1); @@ -4146,6 +4297,55 @@ bool X86DAGToDAGISel::tryVPTESTM(SDNode *Root, SDValue Setcc, return true; } +// Try to match the bitselect pattern (or (and A, B), (andn A, C)). Turn it +// into vpternlog. +bool X86DAGToDAGISel::tryMatchBitSelect(SDNode *N) { + assert(N->getOpcode() == ISD::OR && "Unexpected opcode!"); + + MVT NVT = N->getSimpleValueType(0); + + // Make sure we support VPTERNLOG. + if (!NVT.isVector() || !Subtarget->hasAVX512()) + return false; + + // We need VLX for 128/256-bit. + if (!(Subtarget->hasVLX() || NVT.is512BitVector())) + return false; + + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + + // Canonicalize AND to LHS. + if (N1.getOpcode() == ISD::AND) + std::swap(N0, N1); + + if (N0.getOpcode() != ISD::AND || + N1.getOpcode() != X86ISD::ANDNP || + !N0.hasOneUse() || !N1.hasOneUse()) + return false; + + // ANDN is not commutable, use it to pick down A and C. + SDValue A = N1.getOperand(0); + SDValue C = N1.getOperand(1); + + // AND is commutable, if one operand matches A, the other operand is B. + // Otherwise this isn't a match. 
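tryMatchBitSelect turns (A & B) | (~A & C) into a VPTERNLOG whose immediate, used just below, is 0xCA: the truth table of the bitwise select "A ? B : C", with A, B, C read as bits 2, 1, 0 of the table index (the assumed operand-to-index convention). The sketch derives the constant instead of taking it on faith.

// vpternlog_imm_sketch.cpp -- derive the ternary-logic immediate for the
// bit-select pattern (A & B) | (~A & C) and check it is 0xCA.
#include <cassert>
#include <cstdint>

int main() {
  uint8_t Imm = 0;
  for (unsigned Idx = 0; Idx < 8; ++Idx) {
    bool A = (Idx >> 2) & 1; // first source selects bit 2 of the table index
    bool B = (Idx >> 1) & 1; // second source selects bit 1
    bool C = (Idx >> 0) & 1; // third source selects bit 0
    bool Result = (A && B) || (!A && C); // per-bit select: A ? B : C
    if (Result)
      Imm |= (1u << Idx);
  }
  assert(Imm == 0xCA);

  // The same table applied bitwise reproduces the original expression.
  uint32_t VA = 0xF0F0F0F0, VB = 0x12345678, VC = 0x9abcdef0;
  uint32_t Expected = (VA & VB) | (~VA & VC);
  uint32_t FromTable = 0;
  for (unsigned Bit = 0; Bit < 32; ++Bit) {
    unsigned Idx = (((VA >> Bit) & 1) << 2) | (((VB >> Bit) & 1) << 1) | ((VC >> Bit) & 1);
    FromTable |= ((Imm >> Idx) & 1u) << Bit;
  }
  assert(FromTable == Expected);
  return 0;
}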
+ SDValue B; + if (N0.getOperand(0) == A) + B = N0.getOperand(1); + else if (N0.getOperand(1) == A) + B = N0.getOperand(0); + else + return false; + + SDLoc dl(N); + SDValue Imm = CurDAG->getTargetConstant(0xCA, dl, MVT::i8); + SDValue Ternlog = CurDAG->getNode(X86ISD::VPTERNLOG, dl, NVT, A, B, C, Imm); + ReplaceNode(N, Ternlog.getNode()); + SelectCode(Ternlog.getNode()); + return true; +} + void X86DAGToDAGISel::Select(SDNode *Node) { MVT NVT = Node->getSimpleValueType(0); unsigned Opcode = Node->getOpcode(); @@ -4170,6 +4370,7 @@ void X86DAGToDAGISel::Select(SDNode *Node) { unsigned Opc = 0; switch (IntNo) { + default: llvm_unreachable("Unexpected intrinsic!"); case Intrinsic::x86_sse3_monitor: if (!Subtarget->hasSSE3()) break; @@ -4303,9 +4504,16 @@ void X86DAGToDAGISel::Select(SDNode *Node) { if (tryShrinkShlLogicImm(Node)) return; + if (Opcode == ISD::OR && tryMatchBitSelect(Node)) + return; + LLVM_FALLTHROUGH; case ISD::ADD: case ISD::SUB: { + if ((Opcode == ISD::ADD || Opcode == ISD::SUB) && NVT.isVector() && + combineIncDecVector(Node)) + return; + // Try to avoid folding immediates with multiple uses for optsize. // This code tries to select to register form directly to avoid going // through the isel table which might fold the immediate. We can't change @@ -4333,6 +4541,10 @@ void X86DAGToDAGISel::Select(SDNode *Node) { if (!isInt<8>(Val) && !isInt<32>(Val)) break; + // If this can match to INC/DEC, let it go. + if (Opcode == ISD::ADD && (Val == 1 || Val == -1)) + break; + // Check if we should avoid folding this immediate. if (!shouldAvoidImmediateInstFormsForSize(N1.getNode())) break; @@ -4610,7 +4822,7 @@ void X86DAGToDAGISel::Select(SDNode *Node) { default: llvm_unreachable("Unsupported VT!"); case MVT::i8: LoReg = X86::AL; ClrReg = HiReg = X86::AH; - SExtOpcode = X86::CBW; + SExtOpcode = 0; // Not used. break; case MVT::i16: LoReg = X86::AX; HiReg = X86::DX; @@ -4632,24 +4844,27 @@ void X86DAGToDAGISel::Select(SDNode *Node) { bool signBitIsZero = CurDAG->SignBitIsZero(N0); SDValue InFlag; - if (NVT == MVT::i8 && (!isSigned || signBitIsZero)) { + if (NVT == MVT::i8) { // Special case for div8, just use a move with zero extension to AX to // clear the upper 8 bits (AH). SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Chain; MachineSDNode *Move; if (tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) { SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N0.getOperand(0) }; - Move = CurDAG->getMachineNode(X86::MOVZX32rm8, dl, MVT::i32, - MVT::Other, Ops); + unsigned Opc = (isSigned && !signBitIsZero) ? X86::MOVSX16rm8 + : X86::MOVZX16rm8; + Move = CurDAG->getMachineNode(Opc, dl, MVT::i16, MVT::Other, Ops); Chain = SDValue(Move, 1); ReplaceUses(N0.getValue(1), Chain); // Record the mem-refs CurDAG->setNodeMemRefs(Move, {cast(N0)->getMemOperand()}); } else { - Move = CurDAG->getMachineNode(X86::MOVZX32rr8, dl, MVT::i32, N0); + unsigned Opc = (isSigned && !signBitIsZero) ? 
X86::MOVSX16rr8 + : X86::MOVZX16rr8; + Move = CurDAG->getMachineNode(Opc, dl, MVT::i16, N0); Chain = CurDAG->getEntryNode(); } - Chain = CurDAG->getCopyToReg(Chain, dl, X86::EAX, SDValue(Move, 0), + Chain = CurDAG->getCopyToReg(Chain, dl, X86::AX, SDValue(Move, 0), SDValue()); InFlag = Chain.getValue(1); } else { @@ -4996,10 +5211,9 @@ void X86DAGToDAGISel::Select(SDNode *Node) { case ISD::FRINT: Imm = 0x4; break; } SDLoc dl(Node); - SDValue Res = CurDAG->getNode(X86ISD::VRNDSCALE, dl, - Node->getValueType(0), + SDValue Res = CurDAG->getNode(X86ISD::VRNDSCALE, dl, Node->getValueType(0), Node->getOperand(0), - CurDAG->getConstant(Imm, dl, MVT::i8)); + CurDAG->getTargetConstant(Imm, dl, MVT::i8)); ReplaceNode(Node, Res.getNode()); SelectCode(Res.getNode()); return; diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 0b4bf687e6cf..ed975e9248a8 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -65,17 +65,19 @@ using namespace llvm; STATISTIC(NumTailCalls, "Number of tail calls"); -static cl::opt ExperimentalVectorWideningLegalization( - "x86-experimental-vector-widening-legalization", cl::init(false), - cl::desc("Enable an experimental vector type legalization through widening " - "rather than promotion."), - cl::Hidden); - static cl::opt ExperimentalPrefLoopAlignment( "x86-experimental-pref-loop-alignment", cl::init(4), - cl::desc("Sets the preferable loop alignment for experiments " - "(the last x86-experimental-pref-loop-alignment bits" - " of the loop header PC will be 0)."), + cl::desc( + "Sets the preferable loop alignment for experiments (as log2 bytes)" + "(the last x86-experimental-pref-loop-alignment bits" + " of the loop header PC will be 0)."), + cl::Hidden); + +// Added in 10.0. +static cl::opt EnableOldKNLABI( + "x86-enable-old-knl-abi", cl::init(false), + cl::desc("Enables passing v32i16 and v64i8 in 2 YMM registers instead of " + "one ZMM register on AVX512F, but not AVX512BW targets."), cl::Hidden); static cl::opt MulConstantOptimization( @@ -84,6 +86,13 @@ static cl::opt MulConstantOptimization( "SHIFT, LEA, etc."), cl::Hidden); +static cl::opt ExperimentalUnorderedISEL( + "x86-experimental-unordered-atomic-isel", cl::init(false), + cl::desc("Use LoadSDNode and StoreSDNode instead of " + "AtomicSDNode for unordered atomic loads and " + "stores respectively."), + cl::Hidden); + /// Call this when the user attempts to do something unsupported, like /// returning a double without SSE2 enabled on x86_64. This is not fatal, unlike /// report_fatal_error, so calling code should attempt to recover without @@ -196,7 +205,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, // Integer absolute. if (Subtarget.hasCMov()) { setOperationAction(ISD::ABS , MVT::i16 , Custom); - setOperationAction(ISD::ABS , MVT::i32 , Custom); + setOperationAction(ISD::ABS , MVT::i32 , Custom); } setOperationAction(ISD::ABS , MVT::i64 , Custom); @@ -214,14 +223,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::UINT_TO_FP , MVT::i8 , Promote); setOperationAction(ISD::UINT_TO_FP , MVT::i16 , Promote); - if (Subtarget.is64Bit()) { - if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) - // f32/f64 are legal, f80 is custom. 
- setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Custom); - else - setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Promote); - setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom); - } else if (!Subtarget.useSoftFloat()) { + if (!Subtarget.useSoftFloat()) { // We have an algorithm for SSE2->double, and we turn this into a // 64-bit FILD followed by conditional FADD for other targets. setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom); @@ -277,29 +279,9 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::FP_TO_UINT , MVT::i8 , Promote); setOperationAction(ISD::FP_TO_UINT , MVT::i16 , Promote); - if (Subtarget.is64Bit()) { - if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) { - // FP_TO_UINT-i32/i64 is legal for f32/f64, but custom for f80. - setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Custom); - setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Custom); - } else { - setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Promote); - setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Expand); - } - } else if (!Subtarget.useSoftFloat()) { - // Since AVX is a superset of SSE3, only check for SSE here. - if (Subtarget.hasSSE1() && !Subtarget.hasSSE3()) - // Expand FP_TO_UINT into a select. - // FIXME: We would like to use a Custom expander here eventually to do - // the optimal thing for SSE vs. the default expansion in the legalizer. - setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Expand); - else - // With AVX512 we can use vcvts[ds]2usi for f32/f64->i32, f80 is custom. - // With SSE3 we can use fisttpll to convert to a signed i64; without - // SSE, we're stuck with a fistpll. - setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Custom); - - setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Custom); + if (!Subtarget.useSoftFloat()) { + setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom); + setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom); } // TODO: when we have SSE, these could be more efficient, by using movd/movq. @@ -345,11 +327,11 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16 , Legal); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1 , Expand); - setOperationAction(ISD::FP_ROUND_INREG , MVT::f32 , Expand); setOperationAction(ISD::FREM , MVT::f32 , Expand); setOperationAction(ISD::FREM , MVT::f64 , Expand); setOperationAction(ISD::FREM , MVT::f80 , Expand); + setOperationAction(ISD::FREM , MVT::f128 , Expand); setOperationAction(ISD::FLT_ROUNDS_ , MVT::i32 , Custom); // Promote the i8 variants and force them on up to i32 which has a shorter @@ -396,15 +378,19 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, // There's never any support for operations beyond MVT::f32. 
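// (The f128 entries added in this hunk simply mirror the existing f64/f80
//  ones: with FP16_TO_FP/FP_TO_FP16 marked Expand for f128, a half <-> fp128
//  conversion is presumably legalized through f32 first rather than selected
//  directly, matching the comment above.)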
setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand); setOperationAction(ISD::FP16_TO_FP, MVT::f80, Expand); + setOperationAction(ISD::FP16_TO_FP, MVT::f128, Expand); setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand); setOperationAction(ISD::FP_TO_FP16, MVT::f80, Expand); + setOperationAction(ISD::FP_TO_FP16, MVT::f128, Expand); setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand); setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand); setLoadExtAction(ISD::EXTLOAD, MVT::f80, MVT::f16, Expand); + setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f16, Expand); setTruncStoreAction(MVT::f32, MVT::f16, Expand); setTruncStoreAction(MVT::f64, MVT::f16, Expand); setTruncStoreAction(MVT::f80, MVT::f16, Expand); + setTruncStoreAction(MVT::f128, MVT::f16, Expand); if (Subtarget.hasPOPCNT()) { setOperationPromotedToType(ISD::CTPOP, MVT::i8, MVT::i32); @@ -638,17 +624,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::FMA, MVT::f64, Expand); setOperationAction(ISD::FMA, MVT::f32, Expand); - // Long double always uses X87, except f128 in MMX. + // f80 always uses X87. if (UseX87) { - if (Subtarget.is64Bit() && Subtarget.hasMMX()) { - addRegisterClass(MVT::f128, Subtarget.hasVLX() ? &X86::VR128XRegClass - : &X86::VR128RegClass); - ValueTypeActions.setTypeAction(MVT::f128, TypeSoftenFloat); - setOperationAction(ISD::FABS , MVT::f128, Custom); - setOperationAction(ISD::FNEG , MVT::f128, Custom); - setOperationAction(ISD::FCOPYSIGN, MVT::f128, Custom); - } - addRegisterClass(MVT::f80, &X86::RFP80RegClass); setOperationAction(ISD::UNDEF, MVT::f80, Expand); setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand); @@ -684,10 +661,60 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::LLRINT, MVT::f80, Expand); } + // f128 uses xmm registers, but most operations require libcalls. + if (!Subtarget.useSoftFloat() && Subtarget.is64Bit() && Subtarget.hasSSE1()) { + addRegisterClass(MVT::f128, Subtarget.hasVLX() ? &X86::VR128XRegClass + : &X86::VR128RegClass); + + addLegalFPImmediate(APFloat::getZero(APFloat::IEEEquad())); // xorps + + setOperationAction(ISD::FADD, MVT::f128, Custom); + setOperationAction(ISD::FSUB, MVT::f128, Custom); + setOperationAction(ISD::FDIV, MVT::f128, Custom); + setOperationAction(ISD::FMUL, MVT::f128, Custom); + setOperationAction(ISD::FMA, MVT::f128, Expand); + + setOperationAction(ISD::FABS, MVT::f128, Custom); + setOperationAction(ISD::FNEG, MVT::f128, Custom); + setOperationAction(ISD::FCOPYSIGN, MVT::f128, Custom); + + setOperationAction(ISD::FSIN, MVT::f128, Expand); + setOperationAction(ISD::FCOS, MVT::f128, Expand); + setOperationAction(ISD::FSINCOS, MVT::f128, Expand); + setOperationAction(ISD::FSQRT, MVT::f128, Expand); + + setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom); + // We need to custom handle any FP_ROUND with an f128 input, but + // LegalizeDAG uses the result type to know when to run a custom handler. + // So we have to list all legal floating point result types here. 
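// (A sketch of what the result-type keying means in practice: an
//  (fp_round f128 -> f64) node only reaches the custom hook because of the
//  MVT::f64 entry below, and the f128 operand is presumably lowered to the
//  usual __trunctfdf2-style libcall, since no x86 instruction narrows fp128.)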
+ if (isTypeLegal(MVT::f32)) { + setOperationAction(ISD::FP_ROUND, MVT::f32, Custom); + setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Custom); + } + if (isTypeLegal(MVT::f64)) { + setOperationAction(ISD::FP_ROUND, MVT::f64, Custom); + setOperationAction(ISD::STRICT_FP_ROUND, MVT::f64, Custom); + } + if (isTypeLegal(MVT::f80)) { + setOperationAction(ISD::FP_ROUND, MVT::f80, Custom); + setOperationAction(ISD::STRICT_FP_ROUND, MVT::f80, Custom); + } + + setOperationAction(ISD::SETCC, MVT::f128, Custom); + + setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f32, Expand); + setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f64, Expand); + setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f80, Expand); + setTruncStoreAction(MVT::f128, MVT::f32, Expand); + setTruncStoreAction(MVT::f128, MVT::f64, Expand); + setTruncStoreAction(MVT::f128, MVT::f80, Expand); + } + // Always use a library call for pow. setOperationAction(ISD::FPOW , MVT::f32 , Expand); setOperationAction(ISD::FPOW , MVT::f64 , Expand); setOperationAction(ISD::FPOW , MVT::f80 , Expand); + setOperationAction(ISD::FPOW , MVT::f128 , Expand); setOperationAction(ISD::FLOG, MVT::f80, Expand); setOperationAction(ISD::FLOG2, MVT::f80, Expand); @@ -716,7 +743,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, // First set operation action for all vector types to either promote // (for widening) or expand (for scalarization). Then we will selectively // turn on ones that can be effectively codegen'd. - for (MVT VT : MVT::vector_valuetypes()) { + for (MVT VT : MVT::fixedlen_vector_valuetypes()) { setOperationAction(ISD::SDIV, VT, Expand); setOperationAction(ISD::UDIV, VT, Expand); setOperationAction(ISD::SREM, VT, Expand); @@ -754,7 +781,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::ZERO_EXTEND, VT, Expand); setOperationAction(ISD::ANY_EXTEND, VT, Expand); setOperationAction(ISD::SELECT_CC, VT, Expand); - for (MVT InnerVT : MVT::vector_valuetypes()) { + for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) { setTruncStoreAction(InnerVT, VT, Expand); setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand); @@ -797,6 +824,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::LOAD, MVT::v2f32, Custom); setOperationAction(ISD::STORE, MVT::v2f32, Custom); + + setOperationAction(ISD::STRICT_FP_ROUND, MVT::v4f32, Custom); } if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) { @@ -823,10 +852,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, } setOperationAction(ISD::MUL, MVT::v2i8, Custom); - setOperationAction(ISD::MUL, MVT::v2i16, Custom); - setOperationAction(ISD::MUL, MVT::v2i32, Custom); setOperationAction(ISD::MUL, MVT::v4i8, Custom); - setOperationAction(ISD::MUL, MVT::v4i16, Custom); setOperationAction(ISD::MUL, MVT::v8i8, Custom); setOperationAction(ISD::MUL, MVT::v16i8, Custom); @@ -863,28 +889,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::UADDSAT, MVT::v2i64, Custom); setOperationAction(ISD::USUBSAT, MVT::v2i64, Custom); - if (!ExperimentalVectorWideningLegalization) { - // Use widening instead of promotion. 
- for (auto VT : { MVT::v8i8, MVT::v4i8, MVT::v2i8, - MVT::v4i16, MVT::v2i16 }) { - setOperationAction(ISD::UADDSAT, VT, Custom); - setOperationAction(ISD::SADDSAT, VT, Custom); - setOperationAction(ISD::USUBSAT, VT, Custom); - setOperationAction(ISD::SSUBSAT, VT, Custom); - } - } - setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom); - // Provide custom widening for v2f32 setcc. This is really for VLX when - // setcc result type returns v2i1/v4i1 vector for v2f32/v4f32 leading to - // type legalization changing the result type to v4i1 during widening. - // It works fine for SSE2 and is probably faster so no need to qualify with - // VLX support. - setOperationAction(ISD::SETCC, MVT::v2i32, Custom); - for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) { setOperationAction(ISD::SETCC, VT, Custom); setOperationAction(ISD::CTPOP, VT, Custom); @@ -904,19 +912,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); } - // We support custom legalizing of sext and anyext loads for specific - // memory vector types which we can load as a scalar (or sequence of - // scalars) and extend in-register to a legal 128-bit vector type. For sext - // loads these must work with a single scalar load. - for (MVT VT : MVT::integer_vector_valuetypes()) { - setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Custom); - setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Custom); - setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i32, Custom); - setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i8, Custom); - setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Custom); - setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8i8, Custom); - } - for (auto VT : { MVT::v2f64, MVT::v2i64 }) { setOperationAction(ISD::BUILD_VECTOR, VT, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); @@ -938,7 +933,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal); setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Custom); - setOperationAction(ISD::FP_TO_SINT, MVT::v2i16, Custom); // Custom legalize these to avoid over promotion or custom promotion. 
setOperationAction(ISD::FP_TO_SINT, MVT::v2i8, Custom); @@ -991,18 +985,14 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i32, Custom); setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i16, Custom); - if (ExperimentalVectorWideningLegalization) { - setOperationAction(ISD::SIGN_EXTEND, MVT::v4i64, Custom); + setOperationAction(ISD::SIGN_EXTEND, MVT::v4i64, Custom); - setOperationAction(ISD::TRUNCATE, MVT::v2i8, Custom); - setOperationAction(ISD::TRUNCATE, MVT::v2i16, Custom); - setOperationAction(ISD::TRUNCATE, MVT::v2i32, Custom); - setOperationAction(ISD::TRUNCATE, MVT::v4i8, Custom); - setOperationAction(ISD::TRUNCATE, MVT::v4i16, Custom); - setOperationAction(ISD::TRUNCATE, MVT::v8i8, Custom); - } else { - setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i64, Custom); - } + setOperationAction(ISD::TRUNCATE, MVT::v2i8, Custom); + setOperationAction(ISD::TRUNCATE, MVT::v2i16, Custom); + setOperationAction(ISD::TRUNCATE, MVT::v2i32, Custom); + setOperationAction(ISD::TRUNCATE, MVT::v4i8, Custom); + setOperationAction(ISD::TRUNCATE, MVT::v4i16, Custom); + setOperationAction(ISD::TRUNCATE, MVT::v8i8, Custom); // In the customized shift lowering, the legal v4i32/v2i64 cases // in AVX2 will be recognized. @@ -1069,22 +1059,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Legal); } - if (!ExperimentalVectorWideningLegalization) { - // Avoid narrow result types when widening. The legal types are listed - // in the next loop. - for (MVT VT : MVT::integer_vector_valuetypes()) { - setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Custom); - setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Custom); - setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i32, Custom); - } - } - // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) { setLoadExtAction(LoadExtOp, MVT::v8i16, MVT::v8i8, Legal); setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i8, Legal); - if (!ExperimentalVectorWideningLegalization) - setLoadExtAction(LoadExtOp, MVT::v2i32, MVT::v2i8, Legal); setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i8, Legal); setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i16, Legal); setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i16, Legal); @@ -1145,6 +1123,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal); + setOperationAction(ISD::STRICT_FP_ROUND, MVT::v8f32, Custom); + if (!Subtarget.hasAVX512()) setOperationAction(ISD::BITCAST, MVT::v32i1, Custom); @@ -1292,10 +1272,9 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::STORE, VT, Custom); } - if (HasInt256) - setOperationAction(ISD::VSELECT, MVT::v32i8, Legal); - if (HasInt256) { + setOperationAction(ISD::VSELECT, MVT::v32i8, Legal); + // Custom legalize 2x32 to get a little better code. 
setOperationAction(ISD::MGATHER, MVT::v2f32, Custom); setOperationAction(ISD::MGATHER, MVT::v2i32, Custom); @@ -1407,6 +1386,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::SINT_TO_FP, MVT::v16i32, Legal); setOperationAction(ISD::UINT_TO_FP, MVT::v16i32, Legal); + setOperationAction(ISD::STRICT_FP_ROUND, MVT::v16f32, Custom); + setTruncStoreAction(MVT::v8i64, MVT::v8i8, Legal); setTruncStoreAction(MVT::v8i64, MVT::v8i16, Legal); setTruncStoreAction(MVT::v8i64, MVT::v8i32, Legal); @@ -1433,12 +1414,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom); setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom); - if (ExperimentalVectorWideningLegalization) { - // Need to custom widen this if we don't have AVX512BW. - setOperationAction(ISD::ANY_EXTEND, MVT::v8i8, Custom); - setOperationAction(ISD::ZERO_EXTEND, MVT::v8i8, Custom); - setOperationAction(ISD::SIGN_EXTEND, MVT::v8i8, Custom); - } + // Need to custom widen this if we don't have AVX512BW. + setOperationAction(ISD::ANY_EXTEND, MVT::v8i8, Custom); + setOperationAction(ISD::ZERO_EXTEND, MVT::v8i8, Custom); + setOperationAction(ISD::SIGN_EXTEND, MVT::v8i8, Custom); for (auto VT : { MVT::v16f32, MVT::v8f64 }) { setOperationAction(ISD::FFLOOR, VT, Legal); @@ -1529,10 +1508,14 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::MGATHER, VT, Custom); setOperationAction(ISD::MSCATTER, VT, Custom); } - // Need to custom split v32i16/v64i8 bitcasts. if (!Subtarget.hasBWI()) { + // Need to custom split v32i16/v64i8 bitcasts. setOperationAction(ISD::BITCAST, MVT::v32i16, Custom); setOperationAction(ISD::BITCAST, MVT::v64i8, Custom); + + // Better to split these into two 256-bit ops. + setOperationAction(ISD::BITREVERSE, MVT::v8i64, Custom); + setOperationAction(ISD::BITREVERSE, MVT::v16i32, Custom); } if (Subtarget.hasVBMI2()) { @@ -1777,6 +1760,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::FSHR, VT, Custom); } } + + setOperationAction(ISD::TRUNCATE, MVT::v16i32, Custom); + setOperationAction(ISD::TRUNCATE, MVT::v8i64, Custom); + setOperationAction(ISD::TRUNCATE, MVT::v16i64, Custom); } // We want to custom lower some of our intrinsics. @@ -1905,13 +1892,13 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, MaxLoadsPerMemcmpOptSize = 2; // Set loop alignment to 2^ExperimentalPrefLoopAlignment bytes (default: 2^4). - setPrefLoopAlignment(ExperimentalPrefLoopAlignment); + setPrefLoopAlignment(Align(1ULL << ExperimentalPrefLoopAlignment)); // An out-of-order CPU can speculatively execute past a predictable branch, // but a conditional move could be stalled by an expensive earlier operation. PredictableSelectIsExpensive = Subtarget.getSchedModel().isOutOfOrder(); EnableExtLdPromotion = true; - setPrefFunctionAlignment(4); // 2^4 bytes. 
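// (Both alignment hooks now take an llvm::Align in bytes rather than a raw
//  log2 exponent: the loop alignment above becomes Align(1ULL << N), and the
//  old function alignment of 4 -- i.e. 2^4 bytes -- becomes Align(16) below.)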
+ setPrefFunctionAlignment(Align(16)); verifyIntrinsicTables(); } @@ -1939,8 +1926,7 @@ X86TargetLowering::getPreferredVectorAction(MVT VT) const { if (VT == MVT::v32i1 && Subtarget.hasAVX512() && !Subtarget.hasBWI()) return TypeSplitVector; - if (ExperimentalVectorWideningLegalization && - VT.getVectorNumElements() != 1 && + if (VT.getVectorNumElements() != 1 && VT.getVectorElementType() != MVT::i1) return TypeWidenVector; @@ -1950,19 +1936,62 @@ X86TargetLowering::getPreferredVectorAction(MVT VT) const { MVT X86TargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const { + // v32i1 vectors should be promoted to v32i8 to match avx2. if (VT == MVT::v32i1 && Subtarget.hasAVX512() && !Subtarget.hasBWI()) return MVT::v32i8; + // Break wide or odd vXi1 vectors into scalars to match avx2 behavior. + if (VT.isVector() && VT.getVectorElementType() == MVT::i1 && + Subtarget.hasAVX512() && + (!isPowerOf2_32(VT.getVectorNumElements()) || + (VT.getVectorNumElements() > 16 && !Subtarget.hasBWI()) || + (VT.getVectorNumElements() > 64 && Subtarget.hasBWI()))) + return MVT::i8; + // FIXME: Should we just make these types legal and custom split operations? + if ((VT == MVT::v32i16 || VT == MVT::v64i8) && + Subtarget.hasAVX512() && !Subtarget.hasBWI() && !EnableOldKNLABI) + return MVT::v16i32; return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT); } unsigned X86TargetLowering::getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const { + // v32i1 vectors should be promoted to v32i8 to match avx2. if (VT == MVT::v32i1 && Subtarget.hasAVX512() && !Subtarget.hasBWI()) return 1; + // Break wide or odd vXi1 vectors into scalars to match avx2 behavior. + if (VT.isVector() && VT.getVectorElementType() == MVT::i1 && + Subtarget.hasAVX512() && + (!isPowerOf2_32(VT.getVectorNumElements()) || + (VT.getVectorNumElements() > 16 && !Subtarget.hasBWI()) || + (VT.getVectorNumElements() > 64 && Subtarget.hasBWI()))) + return VT.getVectorNumElements(); + // FIXME: Should we just make these types legal and custom split operations? + if ((VT == MVT::v32i16 || VT == MVT::v64i8) && + Subtarget.hasAVX512() && !Subtarget.hasBWI() && !EnableOldKNLABI) + return 1; return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT); } +unsigned X86TargetLowering::getVectorTypeBreakdownForCallingConv( + LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, + unsigned &NumIntermediates, MVT &RegisterVT) const { + // Break wide or odd vXi1 vectors into scalars to match avx2 behavior. + if (VT.isVector() && VT.getVectorElementType() == MVT::i1 && + Subtarget.hasAVX512() && + (!isPowerOf2_32(VT.getVectorNumElements()) || + (VT.getVectorNumElements() > 16 && !Subtarget.hasBWI()) || + (VT.getVectorNumElements() > 64 && Subtarget.hasBWI()))) { + RegisterVT = MVT::i8; + IntermediateVT = MVT::i1; + NumIntermediates = VT.getVectorNumElements(); + return NumIntermediates; + } + + return TargetLowering::getVectorTypeBreakdownForCallingConv(Context, CC, VT, IntermediateVT, + NumIntermediates, RegisterVT); +} + EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext& Context, EVT VT) const { @@ -2060,6 +2089,11 @@ EVT X86TargetLowering::getOptimalMemOpType( if (Size >= 16 && (!Subtarget.isUnalignedMem16Slow() || ((DstAlign == 0 || DstAlign >= 16) && (SrcAlign == 0 || SrcAlign >= 16)))) { + // FIXME: Check if unaligned 64-byte accesses are slow. 
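    // (v64i8/v32i16 are only legal with AVX512BW, so AVX512F-only targets
    //  fall back to v16i32, the widest legal 512-bit integer type; either
    //  way a 64-byte memcpy/memset can then become a single ZMM load/store
    //  when 512-bit vectors are preferred.)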
+ if (Size >= 64 && Subtarget.hasAVX512() && + (Subtarget.getPreferVectorWidth() >= 512)) { + return Subtarget.hasBWI() ? MVT::v64i8 : MVT::v16i32; + } // FIXME: Check if unaligned 32-byte accesses are slow. if (Size >= 32 && Subtarget.hasAVX() && (Subtarget.getPreferVectorWidth() >= 256)) { @@ -2403,8 +2437,8 @@ static SDValue lowerMasksToReg(const SDValue &ValArg, const EVT &ValLoc, /// Breaks v64i1 value into two registers and adds the new node to the DAG static void Passv64i1ArgInRegs( - const SDLoc &Dl, SelectionDAG &DAG, SDValue Chain, SDValue &Arg, - SmallVector, 8> &RegsToPass, CCValAssign &VA, + const SDLoc &Dl, SelectionDAG &DAG, SDValue &Arg, + SmallVectorImpl> &RegsToPass, CCValAssign &VA, CCValAssign &NextVA, const X86Subtarget &Subtarget) { assert(Subtarget.hasBWI() && "Expected AVX512BW target!"); assert(Subtarget.is32Bit() && "Expecting 32 bit target"); @@ -2537,7 +2571,7 @@ X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, assert(VA.getValVT() == MVT::v64i1 && "Currently the only custom case is when we split v64i1 to 2 regs"); - Passv64i1ArgInRegs(dl, DAG, Chain, ValToCopy, RegsToPass, VA, RVLocs[++I], + Passv64i1ArgInRegs(dl, DAG, ValToCopy, RegsToPass, VA, RVLocs[++I], Subtarget); assert(2 == RegsToPass.size() && @@ -2816,6 +2850,10 @@ SDValue X86TargetLowering::LowerCallResult( ((Is64Bit || Ins[InsIndex].Flags.isInReg()) && !Subtarget.hasSSE1())) { errorUnsupported(DAG, dl, "SSE register return with SSE disabled"); VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts. + } else if (CopyVT == MVT::f64 && + (Is64Bit && !Subtarget.hasSSE2())) { + errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled"); + VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts. } // If we prefer to use the value in xmm registers, copy it out as f80 and @@ -2925,7 +2963,7 @@ static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst, static bool canGuaranteeTCO(CallingConv::ID CC) { return (CC == CallingConv::Fast || CC == CallingConv::GHC || CC == CallingConv::X86_RegCall || CC == CallingConv::HiPE || - CC == CallingConv::HHVM); + CC == CallingConv::HHVM || CC == CallingConv::Tail); } /// Return true if we might ever do TCO for calls with this calling convention. @@ -2951,7 +2989,7 @@ static bool mayTailCallThisCC(CallingConv::ID CC) { /// Return true if the function is being made into a tailcall target by /// changing its ABI. static bool shouldGuaranteeTCO(CallingConv::ID CC, bool GuaranteedTailCallOpt) { - return GuaranteedTailCallOpt && canGuaranteeTCO(CC); + return (GuaranteedTailCallOpt && canGuaranteeTCO(CC)) || CC == CallingConv::Tail; } bool X86TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const { @@ -3405,7 +3443,7 @@ SDValue X86TargetLowering::LowerFormalArguments( // Find the largest legal vector type. MVT VecVT = MVT::Other; // FIXME: Only some x86_32 calling conventions support AVX512. 
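  // (useAVX512Regs() is presumably stricter than hasAVX512(): it also honors
  //  the required/preferred vector width, so v16f32 is not chosen here when
  //  512-bit registers are disabled, e.g. under prefer-vector-width=256.)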
- if (Subtarget.hasAVX512() && + if (Subtarget.useAVX512Regs() && (Is64Bit || (CallConv == CallingConv::X86_VectorCall || CallConv == CallingConv::Intel_OCL_BI))) VecVT = MVT::v16f32; @@ -3577,6 +3615,8 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, bool IsWin64 = Subtarget.isCallingConvWin64(CallConv); StructReturnType SR = callIsStructReturn(Outs, Subtarget.isTargetMCU()); bool IsSibcall = false; + bool IsGuaranteeTCO = MF.getTarget().Options.GuaranteedTailCallOpt || + CallConv == CallingConv::Tail; X86MachineFunctionInfo *X86Info = MF.getInfo(); auto Attr = MF.getFunction().getFnAttribute("disable-tail-calls"); const auto *CI = dyn_cast_or_null(CLI.CS.getInstruction()); @@ -3597,8 +3637,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, if (Attr.getValueAsString() == "true") isTailCall = false; - if (Subtarget.isPICStyleGOT() && - !MF.getTarget().Options.GuaranteedTailCallOpt) { + if (Subtarget.isPICStyleGOT() && !IsGuaranteeTCO) { // If we are using a GOT, disable tail calls to external symbols with // default visibility. Tail calling such a symbol requires using a GOT // relocation, which forces early binding of the symbol. This breaks code @@ -3625,7 +3664,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // Sibcalls are automatically detected tailcalls which do not require // ABI changes. - if (!MF.getTarget().Options.GuaranteedTailCallOpt && isTailCall) + if (!IsGuaranteeTCO && isTailCall) IsSibcall = true; if (isTailCall) @@ -3657,8 +3696,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // This is a sibcall. The memory operands are available in caller's // own caller's stack. NumBytes = 0; - else if (MF.getTarget().Options.GuaranteedTailCallOpt && - canGuaranteeTCO(CallConv)) + else if (IsGuaranteeTCO && canGuaranteeTCO(CallConv)) NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG); int FPDiff = 0; @@ -3782,8 +3820,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, assert(VA.getValVT() == MVT::v64i1 && "Currently the only custom case is when we split v64i1 to 2 regs"); // Split v64i1 value into two registers - Passv64i1ArgInRegs(dl, DAG, Chain, Arg, RegsToPass, VA, ArgLocs[++I], - Subtarget); + Passv64i1ArgInRegs(dl, DAG, Arg, RegsToPass, VA, ArgLocs[++I], Subtarget); } else if (VA.isRegLoc()) { RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); const TargetOptions &Options = DAG.getTarget().Options; @@ -4069,6 +4106,11 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, InFlag = Chain.getValue(1); DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo)); + // Save heapallocsite metadata. + if (CLI.CS) + if (MDNode *HeapAlloc = CLI.CS->getMetadata("heapallocsite")) + DAG.addHeapAllocSite(Chain.getNode(), HeapAlloc); + // Create the CALLSEQ_END node. 
unsigned NumBytesForCalleeToPop; if (X86::isCalleePop(CallConv, Is64Bit, isVarArg, @@ -4190,7 +4232,7 @@ bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags, int FI = INT_MAX; if (Arg.getOpcode() == ISD::CopyFromReg) { unsigned VR = cast(Arg.getOperand(1))->getReg(); - if (!TargetRegisterInfo::isVirtualRegister(VR)) + if (!Register::isVirtualRegister(VR)) return false; MachineInstr *Def = MRI->getVRegDef(VR); if (!Def) @@ -4279,6 +4321,8 @@ bool X86TargetLowering::IsEligibleForTailCallOptimization( bool CCMatch = CallerCC == CalleeCC; bool IsCalleeWin64 = Subtarget.isCallingConvWin64(CalleeCC); bool IsCallerWin64 = Subtarget.isCallingConvWin64(CallerCC); + bool IsGuaranteeTCO = DAG.getTarget().Options.GuaranteedTailCallOpt || + CalleeCC == CallingConv::Tail; // Win64 functions have extra shadow space for argument homing. Don't do the // sibcall if the caller and callee have mismatched expectations for this @@ -4286,7 +4330,7 @@ bool X86TargetLowering::IsEligibleForTailCallOptimization( if (IsCalleeWin64 != IsCallerWin64) return false; - if (DAG.getTarget().Options.GuaranteedTailCallOpt) { + if (IsGuaranteeTCO) { if (canGuaranteeTCO(CalleeCC) && CCMatch) return true; return false; @@ -4413,7 +4457,7 @@ bool X86TargetLowering::IsEligibleForTailCallOptimization( CCValAssign &VA = ArgLocs[i]; if (!VA.isRegLoc()) continue; - unsigned Reg = VA.getLocReg(); + Register Reg = VA.getLocReg(); switch (Reg) { default: break; case X86::EAX: case X86::EDX: case X86::ECX: @@ -4652,7 +4696,11 @@ static X86::CondCode TranslateX86CC(ISD::CondCode SetCCOpcode, const SDLoc &DL, // X < 0 -> X == 0, jump on sign. return X86::COND_S; } - if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) { + if (SetCCOpcode == ISD::SETGE && RHSC->isNullValue()) { + // X >= 0 -> X == 0, jump on !sign. + return X86::COND_NS; + } + if (SetCCOpcode == ISD::SETLT && RHSC->getAPIntValue() == 1) { // X < 1 -> X <= 0 RHS = DAG.getConstant(0, DL, RHS.getValueType()); return X86::COND_LE; @@ -4760,7 +4808,7 @@ bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, ScalarVT = MVT::i32; Info.memVT = MVT::getVectorVT(ScalarVT, VT.getVectorNumElements()); - Info.align = 1; + Info.align = Align::None(); Info.flags |= MachineMemOperand::MOStore; break; } @@ -4773,7 +4821,7 @@ bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, unsigned NumElts = std::min(DataVT.getVectorNumElements(), IndexVT.getVectorNumElements()); Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts); - Info.align = 1; + Info.align = Align::None(); Info.flags |= MachineMemOperand::MOLoad; break; } @@ -4785,7 +4833,7 @@ bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, unsigned NumElts = std::min(DataVT.getVectorNumElements(), IndexVT.getVectorNumElements()); Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts); - Info.align = 1; + Info.align = Align::None(); Info.flags |= MachineMemOperand::MOStore; break; } @@ -4811,6 +4859,8 @@ bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT, bool X86TargetLowering::shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT) const { + assert(cast(Load)->isSimple() && "illegal to narrow"); + // "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF // relocation target a movq or addq instruction: don't let the load shrink. 
SDValue BasePtr = cast(Load)->getBasePtr(); @@ -4852,11 +4902,12 @@ bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm, return true; } -bool X86TargetLowering::reduceSelectOfFPConstantLoads(bool IsFPSetCC) const { +bool X86TargetLowering::reduceSelectOfFPConstantLoads(EVT CmpOpVT) const { // If we are using XMM registers in the ABI and the condition of the select is // a floating-point compare and we have blendv or conditional move, then it is // cheaper to select instead of doing a cross-register move and creating a // load that depends on the compare result. + bool IsFPSetCC = CmpOpVT.isFloatingPoint() && CmpOpVT != MVT::f128; return !IsFPSetCC || !Subtarget.isTarget64BitLP64() || !Subtarget.hasAVX(); } @@ -4869,15 +4920,25 @@ bool X86TargetLowering::convertSelectOfConstantsToMath(EVT VT) const { return true; } -bool X86TargetLowering::decomposeMulByConstant(EVT VT, SDValue C) const { +bool X86TargetLowering::decomposeMulByConstant(LLVMContext &Context, EVT VT, + SDValue C) const { // TODO: We handle scalars using custom code, but generic combining could make // that unnecessary. APInt MulC; if (!ISD::isConstantSplatVector(C.getNode(), MulC)) return false; + // Find the type this will be legalized too. Otherwise we might prematurely + // convert this to shl+add/sub and then still have to type legalize those ops. + // Another choice would be to defer the decision for illegal types until + // after type legalization. But constant splat vectors of i64 can't make it + // through type legalization on 32-bit targets so we would need to special + // case vXi64. + while (getTypeAction(Context, VT) != TypeLegal) + VT = getTypeToTransformTo(Context, VT); + // If vector multiply is legal, assume that's faster than shl + add/sub. - // TODO: Multiply is a complex op with higher latency and lower througput in + // TODO: Multiply is a complex op with higher latency and lower throughput in // most implementations, so this check could be loosened based on type // and/or a CPU attribute. if (isOperationLegal(ISD::MUL, VT)) @@ -5022,6 +5083,33 @@ bool X86TargetLowering::hasAndNot(SDValue Y) const { return Subtarget.hasSSE2(); } +bool X86TargetLowering::hasBitTest(SDValue X, SDValue Y) const { + return X.getValueType().isScalarInteger(); // 'bt' +} + +bool X86TargetLowering:: + shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd( + SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y, + unsigned OldShiftOpcode, unsigned NewShiftOpcode, + SelectionDAG &DAG) const { + // Does baseline recommend not to perform the fold by default? + if (!TargetLowering::shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd( + X, XC, CC, Y, OldShiftOpcode, NewShiftOpcode, DAG)) + return false; + // For scalars this transform is always beneficial. + if (X.getValueType().isScalarInteger()) + return true; + // If all the shift amounts are identical, then transform is beneficial even + // with rudimentary SSE2 shifts. + if (DAG.isSplatValue(Y, /*AllowUndefs=*/true)) + return true; + // If we have AVX2 with it's powerful shift operations, then it's also good. + if (Subtarget.hasAVX2()) + return true; + // Pre-AVX2 vector codegen for this pattern is best for variant with 'shl'. 
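  // (Roughly, the generic fold gated here rewrites
  //    (X & (C << Y)) ==/!= 0   into   ((X l>> Y) & C) ==/!= 0
  //  and the mirrored l>>/<< form; NewShiftOpcode is the shift that ends up
  //  on X, so returning true only for ISD::SHL keeps just the variant whose
  //  pre-AVX2 vector lowering needs a left shift of X.)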
+ return NewShiftOpcode == ISD::SHL; +} + bool X86TargetLowering::shouldFoldConstantShiftPairToMask( const SDNode *N, CombineLevel Level) const { assert(((N->getOpcode() == ISD::SHL && @@ -5054,6 +5142,14 @@ bool X86TargetLowering::shouldFoldMaskToVariableShiftPair(SDValue Y) const { return true; } +bool X86TargetLowering::shouldExpandShift(SelectionDAG &DAG, + SDNode *N) const { + if (DAG.getMachineFunction().getFunction().hasMinSize() && + !Subtarget.isOSWindows()) + return false; + return true; +} + bool X86TargetLowering::shouldSplatInsEltVarIndex(EVT VT) const { // Any legal vector type can be splatted more efficiently than // loading/spilling from memory. @@ -5093,10 +5189,8 @@ static bool isUndefOrZero(int Val) { /// Return true if every element in Mask, beginning from position Pos and ending /// in Pos+Size is the undef sentinel value. static bool isUndefInRange(ArrayRef Mask, unsigned Pos, unsigned Size) { - for (unsigned i = Pos, e = Pos + Size; i != e; ++i) - if (Mask[i] != SM_SentinelUndef) - return false; - return true; + return llvm::all_of(Mask.slice(Pos, Size), + [](int M) { return M == SM_SentinelUndef; }); } /// Return true if the mask creates a vector whose lower half is undefined. @@ -5119,10 +5213,7 @@ static bool isInRange(int Val, int Low, int Hi) { /// Return true if the value of any element in Mask falls within the specified /// range (L, H]. static bool isAnyInRange(ArrayRef Mask, int Low, int Hi) { - for (int M : Mask) - if (isInRange(M, Low, Hi)) - return true; - return false; + return llvm::any_of(Mask, [Low, Hi](int M) { return isInRange(M, Low, Hi); }); } /// Return true if Val is undef or if its value falls within the @@ -5133,12 +5224,9 @@ static bool isUndefOrInRange(int Val, int Low, int Hi) { /// Return true if every element in Mask is undef or if its value /// falls within the specified range (L, H]. -static bool isUndefOrInRange(ArrayRef Mask, - int Low, int Hi) { - for (int M : Mask) - if (!isUndefOrInRange(M, Low, Hi)) - return false; - return true; +static bool isUndefOrInRange(ArrayRef Mask, int Low, int Hi) { + return llvm::all_of( + Mask, [Low, Hi](int M) { return isUndefOrInRange(M, Low, Hi); }); } /// Return true if Val is undef, zero or if its value falls within the @@ -5150,10 +5238,8 @@ static bool isUndefOrZeroOrInRange(int Val, int Low, int Hi) { /// Return true if every element in Mask is undef, zero or if its value /// falls within the specified range (L, H]. static bool isUndefOrZeroOrInRange(ArrayRef Mask, int Low, int Hi) { - for (int M : Mask) - if (!isUndefOrZeroOrInRange(M, Low, Hi)) - return false; - return true; + return llvm::all_of( + Mask, [Low, Hi](int M) { return isUndefOrZeroOrInRange(M, Low, Hi); }); } /// Return true if every element in Mask, beginning @@ -5171,8 +5257,9 @@ static bool isSequentialOrUndefInRange(ArrayRef Mask, unsigned Pos, /// from position Pos and ending in Pos+Size, falls within the specified /// sequential range (Low, Low+Size], or is undef or is zero. static bool isSequentialOrUndefOrZeroInRange(ArrayRef Mask, unsigned Pos, - unsigned Size, int Low) { - for (unsigned i = Pos, e = Pos + Size; i != e; ++i, ++Low) + unsigned Size, int Low, + int Step = 1) { + for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step) if (!isUndefOrZero(Mask[i]) && Mask[i] != Low) return false; return true; @@ -5182,10 +5269,8 @@ static bool isSequentialOrUndefOrZeroInRange(ArrayRef Mask, unsigned Pos, /// from position Pos and ending in Pos+Size is undef or is zero. 
static bool isUndefOrZeroInRange(ArrayRef Mask, unsigned Pos, unsigned Size) { - for (unsigned i = Pos, e = Pos + Size; i != e; ++i) - if (!isUndefOrZero(Mask[i])) - return false; - return true; + return llvm::all_of(Mask.slice(Pos, Size), + [](int M) { return isUndefOrZero(M); }); } /// Helper function to test whether a shuffle mask could be @@ -5357,6 +5442,8 @@ static SDValue getZeroVector(MVT VT, const X86Subtarget &Subtarget, SDValue Vec; if (!Subtarget.hasSSE2() && VT.is128BitVector()) { Vec = DAG.getConstantFP(+0.0, dl, MVT::v4f32); + } else if (VT.isFloatingPoint()) { + Vec = DAG.getConstantFP(+0.0, dl, VT); } else if (VT.getVectorElementType() == MVT::i1) { assert((Subtarget.hasBWI() || VT.getVectorNumElements() <= 16) && "Unexpected vector type"); @@ -5500,6 +5587,7 @@ static bool collectConcatOps(SDNode *N, SmallVectorImpl &Ops) { if (VT.getSizeInBits() == (SubVT.getSizeInBits() * 2) && Idx == (VT.getVectorNumElements() / 2) && Src.getOpcode() == ISD::INSERT_SUBVECTOR && + Src.getOperand(1).getValueType() == SubVT && isNullConstant(Src.getOperand(2))) { Ops.push_back(Src.getOperand(1)); Ops.push_back(Sub); @@ -5593,7 +5681,7 @@ static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG, if (IdxVal == 0 && ISD::isBuildVectorAllZeros(Vec.getNode())) { // May need to promote to a legal type. Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, - getZeroVector(WideOpVT, Subtarget, DAG, dl), + DAG.getConstant(0, dl, WideOpVT), SubVec, Idx); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx); } @@ -5609,14 +5697,14 @@ static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG, if (IdxVal == 0) { // Zero lower bits of the Vec - SDValue ShiftBits = DAG.getConstant(SubVecNumElems, dl, MVT::i8); + SDValue ShiftBits = DAG.getTargetConstant(SubVecNumElems, dl, MVT::i8); Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx); Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits); Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits); // Merge them together, SubVec should be zero extended. 
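      // (On these vXi1 mask types DAG.getConstant(0, dl, WideOpVT) is an
      //  all-zeros splat, so the change below is presumably just a simpler
      //  spelling of getZeroVector that isel still recognizes as a
      //  zero-extending insert_subvector.)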
SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, - getZeroVector(WideOpVT, Subtarget, DAG, dl), + DAG.getConstant(0, dl, WideOpVT), SubVec, ZeroIdx); Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx); @@ -5628,7 +5716,7 @@ static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG, if (Vec.isUndef()) { assert(IdxVal != 0 && "Unexpected index"); SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec, - DAG.getConstant(IdxVal, dl, MVT::i8)); + DAG.getTargetConstant(IdxVal, dl, MVT::i8)); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx); } @@ -5638,30 +5726,30 @@ static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG, unsigned ShiftLeft = NumElems - SubVecNumElems; unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal; SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec, - DAG.getConstant(ShiftLeft, dl, MVT::i8)); + DAG.getTargetConstant(ShiftLeft, dl, MVT::i8)); if (ShiftRight != 0) SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec, - DAG.getConstant(ShiftRight, dl, MVT::i8)); + DAG.getTargetConstant(ShiftRight, dl, MVT::i8)); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx); } // Simple case when we put subvector in the upper part if (IdxVal + SubVecNumElems == NumElems) { SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec, - DAG.getConstant(IdxVal, dl, MVT::i8)); + DAG.getTargetConstant(IdxVal, dl, MVT::i8)); if (SubVecNumElems * 2 == NumElems) { // Special case, use legal zero extending insert_subvector. This allows // isel to opimitize when bits are known zero. Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVecVT, Vec, ZeroIdx); Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, - getZeroVector(WideOpVT, Subtarget, DAG, dl), + DAG.getConstant(0, dl, WideOpVT), Vec, ZeroIdx); } else { // Otherwise use explicit shifts to zero the bits. Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx); NumElems = WideOpVT.getVectorNumElements(); - SDValue ShiftBits = DAG.getConstant(NumElems - IdxVal, dl, MVT::i8); + SDValue ShiftBits = DAG.getTargetConstant(NumElems - IdxVal, dl, MVT::i8); Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits); Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits); } @@ -5675,30 +5763,47 @@ static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG, // Widen the vector if needed. Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx); - // Move the current value of the bit to be replace to the lsbs. - Op = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, - DAG.getConstant(IdxVal, dl, MVT::i8)); - // Xor with the new bit. - Op = DAG.getNode(ISD::XOR, dl, WideOpVT, Op, SubVec); - // Shift to MSB, filling bottom bits with 0. + + // Clear the upper bits of the subvector and move it to its insert position. unsigned ShiftLeft = NumElems - SubVecNumElems; - Op = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Op, - DAG.getConstant(ShiftLeft, dl, MVT::i8)); - // Shift to the final position, filling upper bits with 0. + SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec, + DAG.getTargetConstant(ShiftLeft, dl, MVT::i8)); unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal; - Op = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Op, - DAG.getConstant(ShiftRight, dl, MVT::i8)); - // Xor with original vector leaving the new value. 
- Op = DAG.getNode(ISD::XOR, dl, WideOpVT, Vec, Op); + SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec, + DAG.getTargetConstant(ShiftRight, dl, MVT::i8)); + + // Isolate the bits below the insertion point. + unsigned LowShift = NumElems - IdxVal; + SDValue Low = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, + DAG.getTargetConstant(LowShift, dl, MVT::i8)); + Low = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Low, + DAG.getTargetConstant(LowShift, dl, MVT::i8)); + + // Isolate the bits after the last inserted bit. + unsigned HighShift = IdxVal + SubVecNumElems; + SDValue High = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, + DAG.getTargetConstant(HighShift, dl, MVT::i8)); + High = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, High, + DAG.getTargetConstant(HighShift, dl, MVT::i8)); + + // Now OR all 3 pieces together. + Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Low, High); + SubVec = DAG.getNode(ISD::OR, dl, WideOpVT, SubVec, Vec); + // Reduce to original width if needed. - return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx); + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx); } -static SDValue concatSubVectors(SDValue V1, SDValue V2, EVT VT, - unsigned NumElems, SelectionDAG &DAG, - const SDLoc &dl, unsigned VectorWidth) { - SDValue V = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, dl, VectorWidth); - return insertSubVector(V, V2, NumElems / 2, DAG, dl, VectorWidth); +static SDValue concatSubVectors(SDValue V1, SDValue V2, SelectionDAG &DAG, + const SDLoc &dl) { + assert(V1.getValueType() == V2.getValueType() && "subvector type mismatch"); + EVT SubVT = V1.getValueType(); + EVT SubSVT = SubVT.getScalarType(); + unsigned SubNumElts = SubVT.getVectorNumElements(); + unsigned SubVectorWidth = SubVT.getSizeInBits(); + EVT VT = EVT::getVectorVT(*DAG.getContext(), SubSVT, 2 * SubNumElts); + SDValue V = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, dl, SubVectorWidth); + return insertSubVector(V, V2, SubNumElts, DAG, dl, SubVectorWidth); } /// Returns a vector of specified type with all bits set. @@ -5755,6 +5860,34 @@ static SDValue getExtendInVec(unsigned Opcode, const SDLoc &DL, EVT VT, return DAG.getNode(Opcode, DL, VT, In); } +// Match (xor X, -1) -> X. +// Match extract_subvector(xor X, -1) -> extract_subvector(X). +// Match concat_vectors(xor X, -1, xor Y, -1) -> concat_vectors(X, Y). +static SDValue IsNOT(SDValue V, SelectionDAG &DAG) { + V = peekThroughBitcasts(V); + if (V.getOpcode() == ISD::XOR && + ISD::isBuildVectorAllOnes(V.getOperand(1).getNode())) + return V.getOperand(0); + if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR && + (isNullConstant(V.getOperand(1)) || V.getOperand(0).hasOneUse())) { + if (SDValue Not = IsNOT(V.getOperand(0), DAG)) { + Not = DAG.getBitcast(V.getOperand(0).getValueType(), Not); + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(Not), V.getValueType(), + Not, V.getOperand(1)); + } + } + SmallVector CatOps; + if (collectConcatOps(V.getNode(), CatOps)) { + for (SDValue &CatOp : CatOps) { + SDValue NotCat = IsNOT(CatOp, DAG); + if (!NotCat) return SDValue(); + CatOp = DAG.getBitcast(CatOp.getValueType(), NotCat); + } + return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(V), V.getValueType(), CatOps); + } + return SDValue(); +} + /// Returns a vector_shuffle node for an unpackl operation. 
static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, MVT VT, SDValue V1, SDValue V2) { @@ -6003,6 +6136,37 @@ static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits, } } + if (Op.getOpcode() == X86ISD::VBROADCAST_LOAD && + EltSizeInBits <= VT.getScalarSizeInBits()) { + auto *MemIntr = cast(Op); + if (MemIntr->getMemoryVT().getScalarSizeInBits() != VT.getScalarSizeInBits()) + return false; + + SDValue Ptr = MemIntr->getBasePtr(); + if (Ptr->getOpcode() == X86ISD::Wrapper || + Ptr->getOpcode() == X86ISD::WrapperRIP) + Ptr = Ptr->getOperand(0); + + auto *CNode = dyn_cast(Ptr); + if (!CNode || CNode->isMachineConstantPoolEntry() || + CNode->getOffset() != 0) + return false; + + if (const Constant *C = CNode->getConstVal()) { + unsigned SrcEltSizeInBits = C->getType()->getScalarSizeInBits(); + unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits; + + APInt UndefSrcElts(NumSrcElts, 0); + SmallVector SrcEltBits(1, APInt(SrcEltSizeInBits, 0)); + if (CollectConstantBits(C, SrcEltBits[0], UndefSrcElts, 0)) { + if (UndefSrcElts[0]) + UndefSrcElts.setBits(0, NumSrcElts); + SrcEltBits.append(NumSrcElts - 1, SrcEltBits[0]); + return CastBitData(UndefSrcElts, SrcEltBits); + } + } + } + // Extract constant bits from a subvector broadcast. if (Op.getOpcode() == X86ISD::SUBV_BROADCAST) { SmallVector SubEltBits; @@ -6123,7 +6287,9 @@ static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits, return false; } -static bool isConstantSplat(SDValue Op, APInt &SplatVal) { +namespace llvm { +namespace X86 { +bool isConstantSplat(SDValue Op, APInt &SplatVal) { APInt UndefElts; SmallVector EltBits; if (getTargetConstantBitsFromNode(Op, Op.getScalarValueSizeInBits(), @@ -6146,6 +6312,8 @@ static bool isConstantSplat(SDValue Op, APInt &SplatVal) { return false; } +} // namespace X86 +} // namespace llvm static bool getTargetShuffleMaskIndices(SDValue MaskNode, unsigned MaskEltSizeInBits, @@ -6551,13 +6719,12 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero, return true; } -/// Check a target shuffle mask's inputs to see if we can set any values to -/// SM_SentinelZero - this is for elements that are known to be zero -/// (not just zeroable) from their inputs. +/// Decode a target shuffle mask and inputs and see if any values are +/// known to be undef or zero from their inputs. /// Returns true if the target shuffle mask was decoded. -static bool setTargetShuffleZeroElements(SDValue N, - SmallVectorImpl &Mask, - SmallVectorImpl &Ops) { +static bool getTargetShuffleAndZeroables(SDValue N, SmallVectorImpl &Mask, + SmallVectorImpl &Ops, + APInt &KnownUndef, APInt &KnownZero) { bool IsUnary; if (!isTargetShuffle(N.getOpcode())) return false; @@ -6566,15 +6733,17 @@ static bool setTargetShuffleZeroElements(SDValue N, if (!getTargetShuffleMask(N.getNode(), VT, true, Ops, Mask, IsUnary)) return false; + int Size = Mask.size(); SDValue V1 = Ops[0]; SDValue V2 = IsUnary ? V1 : Ops[1]; + KnownUndef = KnownZero = APInt::getNullValue(Size); V1 = peekThroughBitcasts(V1); V2 = peekThroughBitcasts(V2); assert((VT.getSizeInBits() % Mask.size()) == 0 && "Illegal split of shuffle value type"); - unsigned EltSizeInBits = VT.getSizeInBits() / Mask.size(); + unsigned EltSizeInBits = VT.getSizeInBits() / Size; // Extract known constant input data. 
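  // (Per shuffle input this records which lanes are undef and, for constant
  //  operands, the literal bits of each lane; the loop below now folds that
  //  information into the KnownUndef/KnownZero bit masks instead of
  //  rewriting Mask[] in place as setTargetShuffleZeroElements used to.)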
APInt UndefSrcElts[2]; @@ -6585,12 +6754,18 @@ static bool setTargetShuffleZeroElements(SDValue N, getTargetConstantBitsFromNode(V2, EltSizeInBits, UndefSrcElts[1], SrcEltBits[1], true, false)}; - for (int i = 0, Size = Mask.size(); i < Size; ++i) { + for (int i = 0; i < Size; ++i) { int M = Mask[i]; // Already decoded as SM_SentinelZero / SM_SentinelUndef. - if (M < 0) + if (M < 0) { + assert(isUndefOrZero(M) && "Unknown shuffle sentinel value!"); + if (SM_SentinelUndef == M) + KnownUndef.setBit(i); + if (SM_SentinelZero == M) + KnownZero.setBit(i); continue; + } // Determine shuffle input and normalize the mask. unsigned SrcIdx = M / Size; @@ -6599,7 +6774,7 @@ static bool setTargetShuffleZeroElements(SDValue N, // We are referencing an UNDEF input. if (V.isUndef()) { - Mask[i] = SM_SentinelUndef; + KnownUndef.setBit(i); continue; } @@ -6612,31 +6787,64 @@ static bool setTargetShuffleZeroElements(SDValue N, int Scale = Size / V.getValueType().getVectorNumElements(); int Idx = M / Scale; if (Idx != 0 && !VT.isFloatingPoint()) - Mask[i] = SM_SentinelUndef; + KnownUndef.setBit(i); else if (Idx == 0 && X86::isZeroNode(V.getOperand(0))) - Mask[i] = SM_SentinelZero; + KnownZero.setBit(i); continue; } // Attempt to extract from the source's constant bits. if (IsSrcConstant[SrcIdx]) { if (UndefSrcElts[SrcIdx][M]) - Mask[i] = SM_SentinelUndef; + KnownUndef.setBit(i); else if (SrcEltBits[SrcIdx][M] == 0) - Mask[i] = SM_SentinelZero; + KnownZero.setBit(i); } } - assert(VT.getVectorNumElements() == Mask.size() && + assert(VT.getVectorNumElements() == (unsigned)Size && "Different mask size from vector size!"); return true; } +// Replace target shuffle mask elements with known undef/zero sentinels. +static void resolveTargetShuffleFromZeroables(SmallVectorImpl &Mask, + const APInt &KnownUndef, + const APInt &KnownZero) { + unsigned NumElts = Mask.size(); + assert(KnownUndef.getBitWidth() == NumElts && + KnownZero.getBitWidth() == NumElts && "Shuffle mask size mismatch"); + + for (unsigned i = 0; i != NumElts; ++i) { + if (KnownUndef[i]) + Mask[i] = SM_SentinelUndef; + else if (KnownZero[i]) + Mask[i] = SM_SentinelZero; + } +} + +// Extract target shuffle mask sentinel elements to known undef/zero bitmasks. +static void resolveZeroablesFromTargetShuffle(const SmallVectorImpl &Mask, + APInt &KnownUndef, + APInt &KnownZero) { + unsigned NumElts = Mask.size(); + KnownUndef = KnownZero = APInt::getNullValue(NumElts); + + for (unsigned i = 0; i != NumElts; ++i) { + int M = Mask[i]; + if (SM_SentinelUndef == M) + KnownUndef.setBit(i); + if (SM_SentinelZero == M) + KnownZero.setBit(i); + } +} + // Forward declaration (for getFauxShuffleMask recursive check). -static bool resolveTargetShuffleInputs(SDValue Op, - SmallVectorImpl &Inputs, - SmallVectorImpl &Mask, - SelectionDAG &DAG); +// TODO: Use DemandedElts variant. +static bool getTargetShuffleInputs(SDValue Op, SmallVectorImpl &Inputs, + SmallVectorImpl &Mask, + SelectionDAG &DAG, unsigned Depth, + bool ResolveKnownElts); // Attempt to decode ops that could be represented as a shuffle mask. 
// The decoded shuffle mask may contain a different number of elements to the @@ -6644,7 +6852,8 @@ static bool resolveTargetShuffleInputs(SDValue Op, static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts, SmallVectorImpl &Mask, SmallVectorImpl &Ops, - SelectionDAG &DAG) { + SelectionDAG &DAG, unsigned Depth, + bool ResolveKnownElts) { Mask.clear(); Ops.clear(); @@ -6685,7 +6894,7 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts, Mask.push_back(SM_SentinelUndef); continue; } - uint64_t ByteBits = EltBits[i].getZExtValue(); + const APInt &ByteBits = EltBits[i]; if (ByteBits != 0 && ByteBits != 255) return false; Mask.push_back(ByteBits == ZeroMask ? SM_SentinelZero : i); @@ -6696,8 +6905,10 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts, case ISD::OR: { // Inspect each operand at the byte level. We can merge these into a // blend shuffle mask if for each byte at least one is masked out (zero). - KnownBits Known0 = DAG.computeKnownBits(N.getOperand(0), DemandedElts); - KnownBits Known1 = DAG.computeKnownBits(N.getOperand(1), DemandedElts); + KnownBits Known0 = + DAG.computeKnownBits(N.getOperand(0), DemandedElts, Depth + 1); + KnownBits Known1 = + DAG.computeKnownBits(N.getOperand(1), DemandedElts, Depth + 1); if (Known0.One.isNullValue() && Known1.One.isNullValue()) { bool IsByteMask = true; unsigned NumSizeInBytes = NumSizeInBits / 8; @@ -6736,14 +6947,16 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts, return false; SmallVector SrcMask0, SrcMask1; SmallVector SrcInputs0, SrcInputs1; - if (!resolveTargetShuffleInputs(N0, SrcInputs0, SrcMask0, DAG) || - !resolveTargetShuffleInputs(N1, SrcInputs1, SrcMask1, DAG)) + if (!getTargetShuffleInputs(N0, SrcInputs0, SrcMask0, DAG, Depth + 1, + true) || + !getTargetShuffleInputs(N1, SrcInputs1, SrcMask1, DAG, Depth + 1, + true)) return false; - int MaskSize = std::max(SrcMask0.size(), SrcMask1.size()); + size_t MaskSize = std::max(SrcMask0.size(), SrcMask1.size()); SmallVector Mask0, Mask1; scaleShuffleMask(MaskSize / SrcMask0.size(), SrcMask0, Mask0); scaleShuffleMask(MaskSize / SrcMask1.size(), SrcMask1, Mask1); - for (int i = 0; i != MaskSize; ++i) { + for (size_t i = 0; i != MaskSize; ++i) { if (Mask0[i] == SM_SentinelUndef && Mask1[i] == SM_SentinelUndef) Mask.push_back(SM_SentinelUndef); else if (Mask0[i] == SM_SentinelZero && Mask1[i] == SM_SentinelZero) @@ -6751,14 +6964,12 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts, else if (Mask1[i] == SM_SentinelZero) Mask.push_back(Mask0[i]); else if (Mask0[i] == SM_SentinelZero) - Mask.push_back(Mask1[i] + (MaskSize * SrcInputs0.size())); + Mask.push_back(Mask1[i] + (int)(MaskSize * SrcInputs0.size())); else return false; } - for (SDValue &Op : SrcInputs0) - Ops.push_back(Op); - for (SDValue &Op : SrcInputs1) - Ops.push_back(Op); + Ops.append(SrcInputs0.begin(), SrcInputs0.end()); + Ops.append(SrcInputs1.begin(), SrcInputs1.end()); return true; } case ISD::INSERT_SUBVECTOR: { @@ -6786,8 +6997,8 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts, // Handle INSERT_SUBVECTOR(SRC0, SHUFFLE(SRC1)). 
SmallVector SubMask; SmallVector SubInputs; - if (!resolveTargetShuffleInputs(peekThroughOneUseBitcasts(Sub), SubInputs, - SubMask, DAG)) + if (!getTargetShuffleInputs(peekThroughOneUseBitcasts(Sub), SubInputs, + SubMask, DAG, Depth + 1, ResolveKnownElts)) return false; if (SubMask.size() != NumSubElts) { assert(((SubMask.size() % NumSubElts) == 0 || @@ -6911,14 +7122,16 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts, // as a truncation shuffle. if (Opcode == X86ISD::PACKSS) { if ((!N0.isUndef() && - DAG.ComputeNumSignBits(N0, EltsLHS) <= NumBitsPerElt) || + DAG.ComputeNumSignBits(N0, EltsLHS, Depth + 1) <= NumBitsPerElt) || (!N1.isUndef() && - DAG.ComputeNumSignBits(N1, EltsRHS) <= NumBitsPerElt)) + DAG.ComputeNumSignBits(N1, EltsRHS, Depth + 1) <= NumBitsPerElt)) return false; } else { APInt ZeroMask = APInt::getHighBitsSet(2 * NumBitsPerElt, NumBitsPerElt); - if ((!N0.isUndef() && !DAG.MaskedValueIsZero(N0, ZeroMask, EltsLHS)) || - (!N1.isUndef() && !DAG.MaskedValueIsZero(N1, ZeroMask, EltsRHS))) + if ((!N0.isUndef() && + !DAG.MaskedValueIsZero(N0, ZeroMask, EltsLHS, Depth + 1)) || + (!N1.isUndef() && + !DAG.MaskedValueIsZero(N1, ZeroMask, EltsRHS, Depth + 1))) return false; } @@ -7061,23 +7274,45 @@ static void resolveTargetShuffleInputsAndMask(SmallVectorImpl &Inputs, Inputs = UsedInputs; } -/// Calls setTargetShuffleZeroElements to resolve a target shuffle mask's inputs -/// and set the SM_SentinelUndef and SM_SentinelZero values. Then check the -/// remaining input indices in case we now have a unary shuffle and adjust the -/// inputs accordingly. +/// Calls getTargetShuffleAndZeroables to resolve a target shuffle mask's inputs +/// and then sets the SM_SentinelUndef and SM_SentinelZero values. /// Returns true if the target shuffle mask was decoded. 
-static bool resolveTargetShuffleInputs(SDValue Op, - SmallVectorImpl &Inputs, - SmallVectorImpl &Mask, - SelectionDAG &DAG) { +static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts, + SmallVectorImpl &Inputs, + SmallVectorImpl &Mask, + APInt &KnownUndef, APInt &KnownZero, + SelectionDAG &DAG, unsigned Depth, + bool ResolveKnownElts) { + EVT VT = Op.getValueType(); + if (!VT.isSimple() || !VT.isVector()) + return false; + + if (getTargetShuffleAndZeroables(Op, Mask, Inputs, KnownUndef, KnownZero)) { + if (ResolveKnownElts) + resolveTargetShuffleFromZeroables(Mask, KnownUndef, KnownZero); + return true; + } + if (getFauxShuffleMask(Op, DemandedElts, Mask, Inputs, DAG, Depth, + ResolveKnownElts)) { + resolveZeroablesFromTargetShuffle(Mask, KnownUndef, KnownZero); + return true; + } + return false; +} + +static bool getTargetShuffleInputs(SDValue Op, SmallVectorImpl &Inputs, + SmallVectorImpl &Mask, + SelectionDAG &DAG, unsigned Depth = 0, + bool ResolveKnownElts = true) { + EVT VT = Op.getValueType(); + if (!VT.isSimple() || !VT.isVector()) + return false; + + APInt KnownUndef, KnownZero; unsigned NumElts = Op.getValueType().getVectorNumElements(); APInt DemandedElts = APInt::getAllOnesValue(NumElts); - if (!setTargetShuffleZeroElements(Op, Mask, Inputs)) - if (!getFauxShuffleMask(Op, DemandedElts, Mask, Inputs, DAG)) - return false; - - resolveTargetShuffleInputsAndMask(Inputs, Mask); - return true; + return getTargetShuffleInputs(Op, DemandedElts, Inputs, Mask, KnownUndef, + KnownZero, DAG, Depth, ResolveKnownElts); } /// Returns the scalar element that will make up the ith @@ -7414,7 +7649,7 @@ static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG, assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!"); SDLoc DL(Op); SDValue Result = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2, - DAG.getIntPtrConstant(InsertPSMask, DL)); + DAG.getIntPtrConstant(InsertPSMask, DL, true)); return DAG.getBitcast(VT, Result); } @@ -7427,7 +7662,7 @@ static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits, unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ; SrcOp = DAG.getBitcast(ShVT, SrcOp); assert(NumBits % 8 == 0 && "Only support byte sized shifts"); - SDValue ShiftVal = DAG.getConstant(NumBits/8, dl, MVT::i8); + SDValue ShiftVal = DAG.getTargetConstant(NumBits / 8, dl, MVT::i8); return DAG.getBitcast(VT, DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal)); } @@ -7439,7 +7674,7 @@ static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl, // the shuffle mask. if (LoadSDNode *LD = dyn_cast(SrcOp)) { SDValue Ptr = LD->getBasePtr(); - if (!ISD::isNormalLoad(LD) || LD->isVolatile()) + if (!ISD::isNormalLoad(LD) || !LD->isSimple()) return SDValue(); EVT PVT = LD->getValueType(0); if (PVT != MVT::i32 && PVT != MVT::f32) @@ -7504,6 +7739,49 @@ static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl, return SDValue(); } +// Recurse to find a LoadSDNode source and the accumulated ByteOffest. 
+static bool findEltLoadSrc(SDValue Elt, LoadSDNode *&Ld, int64_t &ByteOffset) {
+  if (ISD::isNON_EXTLoad(Elt.getNode())) {
+    auto *BaseLd = cast<LoadSDNode>(Elt);
+    if (!BaseLd->isSimple())
+      return false;
+    Ld = BaseLd;
+    ByteOffset = 0;
+    return true;
+  }
+
+  switch (Elt.getOpcode()) {
+  case ISD::BITCAST:
+  case ISD::TRUNCATE:
+  case ISD::SCALAR_TO_VECTOR:
+    return findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset);
+  case ISD::SRL:
+    if (isa<ConstantSDNode>(Elt.getOperand(1))) {
+      uint64_t Idx = Elt.getConstantOperandVal(1);
+      if ((Idx % 8) == 0 && findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset)) {
+        ByteOffset += Idx / 8;
+        return true;
+      }
+    }
+    break;
+  case ISD::EXTRACT_VECTOR_ELT:
+    if (isa<ConstantSDNode>(Elt.getOperand(1))) {
+      SDValue Src = Elt.getOperand(0);
+      unsigned SrcSizeInBits = Src.getScalarValueSizeInBits();
+      unsigned DstSizeInBits = Elt.getScalarValueSizeInBits();
+      if (DstSizeInBits == SrcSizeInBits && (SrcSizeInBits % 8) == 0 &&
+          findEltLoadSrc(Src, Ld, ByteOffset)) {
+        uint64_t Idx = Elt.getConstantOperandVal(1);
+        ByteOffset += Idx * (SrcSizeInBits / 8);
+        return true;
+      }
+    }
+    break;
+  }
+
+  return false;
+}
+
 /// Given the initializing elements 'Elts' of a vector of type 'VT', see if the
 /// elements can be replaced by a single large load which has the same value as
 /// a build_vector or insert_subvector whose loaded operands are 'Elts'.
@@ -7513,6 +7791,9 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
                                         const SDLoc &DL, SelectionDAG &DAG,
                                         const X86Subtarget &Subtarget,
                                         bool isAfterLegalize) {
+  if ((VT.getScalarSizeInBits() % 8) != 0)
+    return SDValue();
+
   unsigned NumElems = Elts.size();
 
   int LastLoadedElt = -1;
@@ -7521,6 +7802,7 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
   APInt UndefMask = APInt::getNullValue(NumElems);
 
   SmallVector<LoadSDNode*, 8> Loads(NumElems, nullptr);
+  SmallVector<int64_t, 8> ByteOffsets(NumElems, 0);
 
   // For each element in the initializer, see if we've found a load, zero or an
   // undef.
@@ -7539,13 +7821,16 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
 
     // Each loaded element must be the correct fractional portion of the
     // requested vector load.
-    if ((NumElems * Elt.getValueSizeInBits()) != VT.getSizeInBits())
+    unsigned EltSizeInBits = Elt.getValueSizeInBits();
+    if ((NumElems * EltSizeInBits) != VT.getSizeInBits())
       return SDValue();
 
-    if (!ISD::isNON_EXTLoad(Elt.getNode()))
+    if (!findEltLoadSrc(Elt, Loads[i], ByteOffsets[i]) || ByteOffsets[i] < 0)
+      return SDValue();
+    unsigned LoadSizeInBits = Loads[i]->getValueSizeInBits(0);
+    if (((ByteOffsets[i] * 8) + EltSizeInBits) > LoadSizeInBits)
       return SDValue();
 
-    Loads[i] = cast<LoadSDNode>(Elt);
     LoadMask.setBit(i);
     LastLoadedElt = i;
   }
@@ -7575,6 +7860,24 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
   int LoadSizeInBits = (1 + LastLoadedElt - FirstLoadedElt) * BaseSizeInBits;
   assert((BaseSizeInBits % 8) == 0 && "Sub-byte element loads detected");
 
+  // TODO: Support offsetting the base load.
+  if (ByteOffsets[FirstLoadedElt] != 0)
+    return SDValue();
+
+  // Check to see if the element's load is consecutive to the base load
+  // or offset from a previous (already checked) load.
+ auto CheckConsecutiveLoad = [&](LoadSDNode *Base, int EltIdx) { + LoadSDNode *Ld = Loads[EltIdx]; + int64_t ByteOffset = ByteOffsets[EltIdx]; + if (ByteOffset && (ByteOffset % BaseSizeInBytes) == 0) { + int64_t BaseIdx = EltIdx - (ByteOffset / BaseSizeInBytes); + return (0 <= BaseIdx && BaseIdx < (int)NumElems && LoadMask[BaseIdx] && + Loads[BaseIdx] == Ld && ByteOffsets[BaseIdx] == 0); + } + return DAG.areNonVolatileConsecutiveLoads(Ld, Base, BaseSizeInBytes, + EltIdx - FirstLoadedElt); + }; + // Consecutive loads can contain UNDEFS but not ZERO elements. // Consecutive loads with UNDEFs and ZEROs elements require a // an additional shuffle stage to clear the ZERO elements. @@ -7582,8 +7885,7 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef Elts, bool IsConsecutiveLoadWithZeros = true; for (int i = FirstLoadedElt + 1; i <= LastLoadedElt; ++i) { if (LoadMask[i]) { - if (!DAG.areNonVolatileConsecutiveLoads(Loads[i], LDBase, BaseSizeInBytes, - i - FirstLoadedElt)) { + if (!CheckConsecutiveLoad(LDBase, i)) { IsConsecutiveLoad = false; IsConsecutiveLoadWithZeros = false; break; @@ -7595,8 +7897,8 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef Elts, auto CreateLoad = [&DAG, &DL, &Loads](EVT VT, LoadSDNode *LDBase) { auto MMOFlags = LDBase->getMemOperand()->getFlags(); - assert(!(MMOFlags & MachineMemOperand::MOVolatile) && - "Cannot merge volatile loads."); + assert(LDBase->isSimple() && + "Cannot merge volatile or atomic loads."); SDValue NewLd = DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(), LDBase->getPointerInfo(), LDBase->getAlignment(), MMOFlags); @@ -7636,17 +7938,22 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef Elts, // IsConsecutiveLoadWithZeros - we need to create a shuffle of the loaded // vector and a zero vector to clear out the zero elements. if (!isAfterLegalize && VT.isVector()) { - SmallVector ClearMask(NumElems, -1); - for (unsigned i = 0; i < NumElems; ++i) { - if (ZeroMask[i]) - ClearMask[i] = i + NumElems; - else if (LoadMask[i]) - ClearMask[i] = i; + unsigned NumMaskElts = VT.getVectorNumElements(); + if ((NumMaskElts % NumElems) == 0) { + unsigned Scale = NumMaskElts / NumElems; + SmallVector ClearMask(NumMaskElts, -1); + for (unsigned i = 0; i < NumElems; ++i) { + if (UndefMask[i]) + continue; + int Offset = ZeroMask[i] ? NumMaskElts : 0; + for (unsigned j = 0; j != Scale; ++j) + ClearMask[(i * Scale) + j] = (i * Scale) + j + Offset; + } + SDValue V = CreateLoad(VT, LDBase); + SDValue Z = VT.isInteger() ? DAG.getConstant(0, DL, VT) + : DAG.getConstantFP(0.0, DL, VT); + return DAG.getVectorShuffle(VT, DL, V, Z, ClearMask); } - SDValue V = CreateLoad(VT, LDBase); - SDValue Z = VT.isInteger() ? DAG.getConstant(0, DL, VT) - : DAG.getConstantFP(0.0, DL, VT); - return DAG.getVectorShuffle(VT, DL, V, Z, ClearMask); } } @@ -8194,34 +8501,10 @@ static SDValue LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG, "Unexpected type in LowerBUILD_VECTORvXi1!"); SDLoc dl(Op); - if (ISD::isBuildVectorAllZeros(Op.getNode())) - return Op; - - if (ISD::isBuildVectorAllOnes(Op.getNode())) + if (ISD::isBuildVectorAllZeros(Op.getNode()) || + ISD::isBuildVectorAllOnes(Op.getNode())) return Op; - if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) { - if (VT == MVT::v64i1 && !Subtarget.is64Bit()) { - // Split the pieces. 
- SDValue Lower = - DAG.getBuildVector(MVT::v32i1, dl, Op.getNode()->ops().slice(0, 32)); - SDValue Upper = - DAG.getBuildVector(MVT::v32i1, dl, Op.getNode()->ops().slice(32, 32)); - // We have to manually lower both halves so getNode doesn't try to - // reassemble the build_vector. - Lower = LowerBUILD_VECTORvXi1(Lower, DAG, Subtarget); - Upper = LowerBUILD_VECTORvXi1(Upper, DAG, Subtarget); - return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lower, Upper); - } - SDValue Imm = ConvertI1VectorToInteger(Op, DAG); - if (Imm.getValueSizeInBits() == VT.getSizeInBits()) - return DAG.getBitcast(VT, Imm); - SDValue ExtVec = DAG.getBitcast(MVT::v8i1, Imm); - return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec, - DAG.getIntPtrConstant(0, dl)); - } - - // Vector has one or more non-const elements uint64_t Immediate = 0; SmallVector NonConstIdx; bool IsSplat = true; @@ -8244,29 +8527,40 @@ static SDValue LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG, } // for splat use " (select i1 splat_elt, all-ones, all-zeroes)" - if (IsSplat) - return DAG.getSelect(dl, VT, Op.getOperand(SplatIdx), + if (IsSplat) { + // The build_vector allows the scalar element to be larger than the vector + // element type. We need to mask it to use as a condition unless we know + // the upper bits are zero. + // FIXME: Use computeKnownBits instead of checking specific opcode? + SDValue Cond = Op.getOperand(SplatIdx); + assert(Cond.getValueType() == MVT::i8 && "Unexpected VT!"); + if (Cond.getOpcode() != ISD::SETCC) + Cond = DAG.getNode(ISD::AND, dl, MVT::i8, Cond, + DAG.getConstant(1, dl, MVT::i8)); + return DAG.getSelect(dl, VT, Cond, DAG.getConstant(1, dl, VT), DAG.getConstant(0, dl, VT)); + } // insert elements one by one SDValue DstVec; - SDValue Imm; - if (Immediate) { - MVT ImmVT = MVT::getIntegerVT(std::max((int)VT.getSizeInBits(), 8)); - Imm = DAG.getConstant(Immediate, dl, ImmVT); - } - else if (HasConstElts) - Imm = DAG.getConstant(0, dl, VT); - else - Imm = DAG.getUNDEF(VT); - if (Imm.getValueSizeInBits() == VT.getSizeInBits()) - DstVec = DAG.getBitcast(VT, Imm); - else { - SDValue ExtVec = DAG.getBitcast(MVT::v8i1, Imm); - DstVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec, - DAG.getIntPtrConstant(0, dl)); - } + if (HasConstElts) { + if (VT == MVT::v64i1 && !Subtarget.is64Bit()) { + SDValue ImmL = DAG.getConstant(Lo_32(Immediate), dl, MVT::i32); + SDValue ImmH = DAG.getConstant(Hi_32(Immediate), dl, MVT::i32); + ImmL = DAG.getBitcast(MVT::v32i1, ImmL); + ImmH = DAG.getBitcast(MVT::v32i1, ImmH); + DstVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, ImmL, ImmH); + } else { + MVT ImmVT = MVT::getIntegerVT(std::max(VT.getSizeInBits(), 8U)); + SDValue Imm = DAG.getConstant(Immediate, dl, ImmVT); + MVT VecVT = VT.getSizeInBits() >= 8 ? VT : MVT::v8i1; + DstVec = DAG.getBitcast(VecVT, Imm); + DstVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, DstVec, + DAG.getIntPtrConstant(0, dl)); + } + } else + DstVec = DAG.getUNDEF(VT); for (unsigned i = 0, e = NonConstIdx.size(); i != e; ++i) { unsigned InsertIdx = NonConstIdx[i]; @@ -8757,7 +9051,7 @@ static SDValue getHopForBuildVector(const BuildVectorSDNode *BV, // If we don't need the upper xmm, then perform as a xmm hop. 
unsigned HalfNumElts = NumElts / 2; if (VT.is256BitVector() && DemandedElts.lshr(HalfNumElts) == 0) { - MVT HalfVT = MVT::getVectorVT(VT.getScalarType(), HalfNumElts); + MVT HalfVT = VT.getHalfNumVectorElementsVT(); V0 = extractSubVector(V0, 0, DAG, SDLoc(BV), 128); V1 = extractSubVector(V1, 0, DAG, SDLoc(BV), 128); SDValue Half = DAG.getNode(HOpcode, SDLoc(BV), HalfVT, V0, V1); @@ -8965,21 +9259,14 @@ static SDValue materializeVectorConstant(SDValue Op, SelectionDAG &DAG, MVT VT = Op.getSimpleValueType(); // Vectors containing all zeros can be matched by pxor and xorps. - if (ISD::isBuildVectorAllZeros(Op.getNode())) { - // Canonicalize this to <4 x i32> to 1) ensure the zero vectors are CSE'd - // and 2) ensure that i64 scalars are eliminated on x86-32 hosts. - if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) - return Op; - - return getZeroVector(VT, Subtarget, DAG, DL); - } + if (ISD::isBuildVectorAllZeros(Op.getNode())) + return Op; // Vectors containing all ones can be matched by pcmpeqd on 128-bit width // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use // vpcmpeqd on 256-bit vectors. if (Subtarget.hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) { - if (VT == MVT::v4i32 || VT == MVT::v16i32 || - (VT == MVT::v8i32 && Subtarget.hasInt256())) + if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) return Op; return getOnesVector(VT, DAG, DL); @@ -9150,9 +9437,9 @@ static SDValue createVariablePermute(MVT VT, SDValue SrcVec, SDValue IndicesVec, SDValue HiHi = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec, {4, 5, 6, 7, 4, 5, 6, 7}); if (Subtarget.hasXOP()) - return DAG.getBitcast(VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v8f32, - LoLo, HiHi, IndicesVec, - DAG.getConstant(0, DL, MVT::i8))); + return DAG.getBitcast( + VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v8f32, LoLo, HiHi, + IndicesVec, DAG.getTargetConstant(0, DL, MVT::i8))); // Permute Lo and Hi and then select based on index range. // This works as VPERMILPS only uses index bits[0:1] to permute elements. SDValue Res = DAG.getSelectCC( @@ -9186,9 +9473,9 @@ static SDValue createVariablePermute(MVT VT, SDValue SrcVec, SDValue IndicesVec, // VPERMIL2PD selects with bit#1 of the index vector, so scale IndicesVec. IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec); if (Subtarget.hasXOP()) - return DAG.getBitcast(VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v4f64, - LoLo, HiHi, IndicesVec, - DAG.getConstant(0, DL, MVT::i8))); + return DAG.getBitcast( + VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v4f64, LoLo, HiHi, + IndicesVec, DAG.getTargetConstant(0, DL, MVT::i8))); // Permute Lo and Hi and then select based on index range. // This works as VPERMILPD only uses index bit[1] to permute elements. SDValue Res = DAG.getSelectCC( @@ -9283,7 +9570,7 @@ LowerBUILD_VECTORAsVariablePermute(SDValue V, SelectionDAG &DAG, return SDValue(); auto *PermIdx = dyn_cast(ExtractedIndex.getOperand(1)); - if (!PermIdx || PermIdx->getZExtValue() != Idx) + if (!PermIdx || PermIdx->getAPIntValue() != Idx) return SDValue(); } @@ -9434,23 +9721,9 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { // it to i32 first. 
if (EltVT == MVT::i16 || EltVT == MVT::i8) { Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item); - if (VT.getSizeInBits() >= 256) { - MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits()/32); - if (Subtarget.hasAVX()) { - Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, Item); - Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG); - } else { - // Without AVX, we need to extend to a 128-bit vector and then - // insert into the 256-bit vector. - Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item); - SDValue ZeroVec = getZeroVector(ShufVT, Subtarget, DAG, dl); - Item = insert128BitVector(ZeroVec, Item, 0, DAG, dl); - } - } else { - assert(VT.is128BitVector() && "Expected an SSE value type!"); - Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item); - Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG); - } + MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits()/32); + Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, Item); + Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG); return DAG.getBitcast(VT, Item); } } @@ -9549,8 +9822,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { HVT, dl, Op->ops().slice(NumElems / 2, NumElems /2)); // Recreate the wider vector with the lower and upper part. - return concatSubVectors(Lower, Upper, VT, NumElems, DAG, dl, - VT.getSizeInBits() / 2); + return concatSubVectors(Lower, Upper, DAG, dl); } // Let legalizer expand 2-wide build_vectors. @@ -9703,8 +9975,7 @@ static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG, // If we have more than 2 non-zeros, build each half separately. if (NumNonZero > 2) { - MVT HalfVT = MVT::getVectorVT(ResVT.getVectorElementType(), - ResVT.getVectorNumElements()/2); + MVT HalfVT = ResVT.getHalfNumVectorElementsVT(); ArrayRef Ops = Op->ops(); SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, Ops.slice(0, NumOperands/2)); @@ -9745,30 +10016,47 @@ static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op, assert(NumOperands > 1 && isPowerOf2_32(NumOperands) && "Unexpected number of operands in CONCAT_VECTORS"); - unsigned NumZero = 0; - unsigned NumNonZero = 0; + uint64_t Zeros = 0; uint64_t NonZeros = 0; for (unsigned i = 0; i != NumOperands; ++i) { SDValue SubVec = Op.getOperand(i); if (SubVec.isUndef()) continue; + assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range. if (ISD::isBuildVectorAllZeros(SubVec.getNode())) - ++NumZero; - else { - assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range. + Zeros |= (uint64_t)1 << i; + else NonZeros |= (uint64_t)1 << i; - ++NumNonZero; - } } + unsigned NumElems = ResVT.getVectorNumElements(); + + // If we are inserting non-zero vector and there are zeros in LSBs and undef + // in the MSBs we need to emit a KSHIFTL. The generic lowering to + // insert_subvector will give us two kshifts. + if (isPowerOf2_64(NonZeros) && Zeros != 0 && NonZeros > Zeros && + Log2_64(NonZeros) != NumOperands - 1) { + MVT ShiftVT = ResVT; + if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8) + ShiftVT = Subtarget.hasDQI() ? 
MVT::v8i1 : MVT::v16i1; + unsigned Idx = Log2_64(NonZeros); + SDValue SubVec = Op.getOperand(Idx); + unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements(); + SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ShiftVT, + DAG.getUNDEF(ShiftVT), SubVec, + DAG.getIntPtrConstant(0, dl)); + Op = DAG.getNode(X86ISD::KSHIFTL, dl, ShiftVT, SubVec, + DAG.getTargetConstant(Idx * SubVecNumElts, dl, MVT::i8)); + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResVT, Op, + DAG.getIntPtrConstant(0, dl)); + } // If there are zero or one non-zeros we can handle this very simply. - if (NumNonZero <= 1) { - SDValue Vec = NumZero ? getZeroVector(ResVT, Subtarget, DAG, dl) - : DAG.getUNDEF(ResVT); - if (!NumNonZero) + if (NonZeros == 0 || isPowerOf2_64(NonZeros)) { + SDValue Vec = Zeros ? DAG.getConstant(0, dl, ResVT) : DAG.getUNDEF(ResVT); + if (!NonZeros) return Vec; - unsigned Idx = countTrailingZeros(NonZeros); + unsigned Idx = Log2_64(NonZeros); SDValue SubVec = Op.getOperand(Idx); unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements(); return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, SubVec, @@ -9776,8 +10064,7 @@ static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op, } if (NumOperands > 2) { - MVT HalfVT = MVT::getVectorVT(ResVT.getVectorElementType(), - ResVT.getVectorNumElements()/2); + MVT HalfVT = ResVT.getHalfNumVectorElementsVT(); ArrayRef Ops = Op->ops(); SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, Ops.slice(0, NumOperands/2)); @@ -9786,7 +10073,7 @@ static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op, return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi); } - assert(NumNonZero == 2 && "Simple cases not handled?"); + assert(countPopulation(NonZeros) == 2 && "Simple cases not handled?"); if (ResVT.getVectorNumElements() >= 16) return Op; // The operation is legal with KUNPCK @@ -9794,7 +10081,6 @@ static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op, SDValue Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, DAG.getUNDEF(ResVT), Op.getOperand(0), DAG.getIntPtrConstant(0, dl)); - unsigned NumElems = ResVT.getVectorNumElements(); return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, Op.getOperand(1), DAG.getIntPtrConstant(NumElems/2, dl)); } @@ -9997,42 +10283,44 @@ static bool isShuffleEquivalent(SDValue V1, SDValue V2, ArrayRef Mask, /// If an element in Mask matches SM_SentinelUndef (-1) then the corresponding /// value in ExpectedMask is always accepted. Otherwise the indices must match. /// -/// SM_SentinelZero is accepted as a valid negative index but must match in both. +/// SM_SentinelZero is accepted as a valid negative index but must match in +/// both. static bool isTargetShuffleEquivalent(ArrayRef Mask, - ArrayRef ExpectedMask) { + ArrayRef ExpectedMask, + SDValue V1 = SDValue(), + SDValue V2 = SDValue()) { int Size = Mask.size(); if (Size != (int)ExpectedMask.size()) return false; assert(isUndefOrZeroOrInRange(ExpectedMask, 0, 2 * Size) && "Illegal target shuffle mask"); - for (int i = 0; i < Size; ++i) - if (Mask[i] == SM_SentinelUndef) - continue; - else if (Mask[i] < 0 && Mask[i] != SM_SentinelZero) - return false; - else if (Mask[i] != ExpectedMask[i]) - return false; - - return true; -} + // Check for out-of-range target shuffle mask indices. + if (!isUndefOrZeroOrInRange(Mask, 0, 2 * Size)) + return false; -// Merges a general DAG shuffle mask and zeroable bit mask into a target shuffle -// mask. 
-static SmallVector createTargetShuffleMask(ArrayRef Mask, - const APInt &Zeroable) { - int NumElts = Mask.size(); - assert(NumElts == (int)Zeroable.getBitWidth() && "Mismatch mask sizes"); + // If the values are build vectors, we can look through them to find + // equivalent inputs that make the shuffles equivalent. + auto *BV1 = dyn_cast_or_null(V1); + auto *BV2 = dyn_cast_or_null(V2); + BV1 = ((BV1 && Size != (int)BV1->getNumOperands()) ? nullptr : BV1); + BV2 = ((BV2 && Size != (int)BV2->getNumOperands()) ? nullptr : BV2); - SmallVector TargetMask(NumElts, SM_SentinelUndef); - for (int i = 0; i != NumElts; ++i) { - int M = Mask[i]; - if (M == SM_SentinelUndef) + for (int i = 0; i < Size; ++i) { + if (Mask[i] == SM_SentinelUndef || Mask[i] == ExpectedMask[i]) continue; - assert(0 <= M && M < (2 * NumElts) && "Out of range shuffle index"); - TargetMask[i] = (Zeroable[i] ? SM_SentinelZero : M); + if (0 <= Mask[i] && 0 <= ExpectedMask[i]) { + auto *MaskBV = Mask[i] < Size ? BV1 : BV2; + auto *ExpectedBV = ExpectedMask[i] < Size ? BV1 : BV2; + if (MaskBV && ExpectedBV && + MaskBV->getOperand(Mask[i] % Size) == + ExpectedBV->getOperand(ExpectedMask[i] % Size)) + continue; + } + // TODO - handle SM_Sentinel equivalences. + return false; } - return TargetMask; + return true; } // Attempt to create a shuffle mask from a VSELECT condition mask. @@ -10133,7 +10421,7 @@ static unsigned getV4X86ShuffleImm(ArrayRef Mask) { static SDValue getV4X86ShuffleImm8ForMask(ArrayRef Mask, const SDLoc &DL, SelectionDAG &DAG) { - return DAG.getConstant(getV4X86ShuffleImm(Mask), DL, MVT::i8); + return DAG.getTargetConstant(getV4X86ShuffleImm(Mask), DL, MVT::i8); } /// Compute whether each element of a shuffle is zeroable. @@ -10573,14 +10861,14 @@ static bool matchVectorShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1, // Try binary shuffle. SmallVector BinaryMask; createPackShuffleMask(VT, BinaryMask, false); - if (isTargetShuffleEquivalent(TargetMask, BinaryMask)) + if (isTargetShuffleEquivalent(TargetMask, BinaryMask, V1, V2)) if (MatchPACK(V1, V2)) return true; // Try unary shuffle. SmallVector UnaryMask; createPackShuffleMask(VT, UnaryMask, true); - if (isTargetShuffleEquivalent(TargetMask, UnaryMask)) + if (isTargetShuffleEquivalent(TargetMask, UnaryMask, V1)) if (MatchPACK(V1, V1)) return true; @@ -10685,9 +10973,9 @@ static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask, SelectionDAG &DAG); static bool matchVectorShuffleAsBlend(SDValue V1, SDValue V2, - MutableArrayRef TargetMask, - bool &ForceV1Zero, bool &ForceV2Zero, - uint64_t &BlendMask) { + MutableArrayRef Mask, + const APInt &Zeroable, bool &ForceV1Zero, + bool &ForceV2Zero, uint64_t &BlendMask) { bool V1IsZeroOrUndef = V1.isUndef() || ISD::isBuildVectorAllZeros(V1.getNode()); bool V2IsZeroOrUndef = @@ -10695,13 +10983,12 @@ static bool matchVectorShuffleAsBlend(SDValue V1, SDValue V2, BlendMask = 0; ForceV1Zero = false, ForceV2Zero = false; - assert(TargetMask.size() <= 64 && "Shuffle mask too big for blend mask"); + assert(Mask.size() <= 64 && "Shuffle mask too big for blend mask"); // Attempt to generate the binary blend mask. If an input is zero then // we can use any lane. - // TODO: generalize the zero matching to any scalar like isShuffleEquivalent. 
- for (int i = 0, Size = TargetMask.size(); i < Size; ++i) { - int M = TargetMask[i]; + for (int i = 0, Size = Mask.size(); i < Size; ++i) { + int M = Mask[i]; if (M == SM_SentinelUndef) continue; if (M == i) @@ -10710,16 +10997,16 @@ static bool matchVectorShuffleAsBlend(SDValue V1, SDValue V2, BlendMask |= 1ull << i; continue; } - if (M == SM_SentinelZero) { + if (Zeroable[i]) { if (V1IsZeroOrUndef) { ForceV1Zero = true; - TargetMask[i] = i; + Mask[i] = i; continue; } if (V2IsZeroOrUndef) { ForceV2Zero = true; BlendMask |= 1ull << i; - TargetMask[i] = i + Size; + Mask[i] = i + Size; continue; } } @@ -10748,11 +11035,10 @@ static SDValue lowerShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) { - SmallVector Mask = createTargetShuffleMask(Original, Zeroable); - uint64_t BlendMask = 0; bool ForceV1Zero = false, ForceV2Zero = false; - if (!matchVectorShuffleAsBlend(V1, V2, Mask, ForceV1Zero, ForceV2Zero, + SmallVector Mask(Original.begin(), Original.end()); + if (!matchVectorShuffleAsBlend(V1, V2, Mask, Zeroable, ForceV1Zero, ForceV2Zero, BlendMask)) return SDValue(); @@ -10778,7 +11064,7 @@ static SDValue lowerShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1, case MVT::v8i16: assert(Subtarget.hasSSE41() && "128-bit blends require SSE41!"); return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2, - DAG.getConstant(BlendMask, DL, MVT::i8)); + DAG.getTargetConstant(BlendMask, DL, MVT::i8)); case MVT::v16i16: { assert(Subtarget.hasAVX2() && "v16i16 blends require AVX2!"); SmallVector RepeatedMask; @@ -10790,7 +11076,7 @@ static SDValue lowerShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1, if (RepeatedMask[i] >= 8) BlendMask |= 1ull << i; return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2, - DAG.getConstant(BlendMask, DL, MVT::i8)); + DAG.getTargetConstant(BlendMask, DL, MVT::i8)); } // Use PBLENDW for lower/upper lanes and then blend lanes. 
// TODO - we should allow 2 PBLENDW here and leave shuffle combine to @@ -10799,9 +11085,9 @@ static SDValue lowerShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1, uint64_t HiMask = (BlendMask >> 8) & 0xFF; if (LoMask == 0 || LoMask == 255 || HiMask == 0 || HiMask == 255) { SDValue Lo = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2, - DAG.getConstant(LoMask, DL, MVT::i8)); + DAG.getTargetConstant(LoMask, DL, MVT::i8)); SDValue Hi = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2, - DAG.getConstant(HiMask, DL, MVT::i8)); + DAG.getTargetConstant(HiMask, DL, MVT::i8)); return DAG.getVectorShuffle( MVT::v16i16, DL, Lo, Hi, {0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31}); @@ -11061,7 +11347,7 @@ static SDValue lowerShuffleAsByteRotateAndPermute( SDValue Rotate = DAG.getBitcast( VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, DAG.getBitcast(ByteVT, Hi), DAG.getBitcast(ByteVT, Lo), - DAG.getConstant(Scale * RotAmt, DL, MVT::i8))); + DAG.getTargetConstant(Scale * RotAmt, DL, MVT::i8))); SmallVector PermMask(NumElts, SM_SentinelUndef); for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) { for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) { @@ -11268,7 +11554,7 @@ static SDValue lowerShuffleAsByteRotate(const SDLoc &DL, MVT VT, SDValue V1, "512-bit PALIGNR requires BWI instructions"); return DAG.getBitcast( VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, Lo, Hi, - DAG.getConstant(ByteRotation, DL, MVT::i8))); + DAG.getTargetConstant(ByteRotation, DL, MVT::i8))); } assert(VT.is128BitVector() && @@ -11282,10 +11568,12 @@ static SDValue lowerShuffleAsByteRotate(const SDLoc &DL, MVT VT, SDValue V1, int LoByteShift = 16 - ByteRotation; int HiByteShift = ByteRotation; - SDValue LoShift = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Lo, - DAG.getConstant(LoByteShift, DL, MVT::i8)); - SDValue HiShift = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Hi, - DAG.getConstant(HiByteShift, DL, MVT::i8)); + SDValue LoShift = + DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Lo, + DAG.getTargetConstant(LoByteShift, DL, MVT::i8)); + SDValue HiShift = + DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Hi, + DAG.getTargetConstant(HiByteShift, DL, MVT::i8)); return DAG.getBitcast(VT, DAG.getNode(ISD::OR, DL, MVT::v16i8, LoShift, HiShift)); } @@ -11317,7 +11605,7 @@ static SDValue lowerShuffleAsRotate(const SDLoc &DL, MVT VT, SDValue V1, return SDValue(); return DAG.getNode(X86ISD::VALIGN, DL, VT, Lo, Hi, - DAG.getConstant(Rotation, DL, MVT::i8)); + DAG.getTargetConstant(Rotation, DL, MVT::i8)); } /// Try to lower a vector shuffle as a byte shift sequence. 
@@ -11356,27 +11644,27 @@ static SDValue lowerVectorShuffleAsByteShiftMask( if (ZeroLo == 0) { unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts); Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res, - DAG.getConstant(Scale * Shift, DL, MVT::i8)); + DAG.getTargetConstant(Scale * Shift, DL, MVT::i8)); Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res, - DAG.getConstant(Scale * ZeroHi, DL, MVT::i8)); + DAG.getTargetConstant(Scale * ZeroHi, DL, MVT::i8)); } else if (ZeroHi == 0) { unsigned Shift = Mask[ZeroLo] % NumElts; Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res, - DAG.getConstant(Scale * Shift, DL, MVT::i8)); + DAG.getTargetConstant(Scale * Shift, DL, MVT::i8)); Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res, - DAG.getConstant(Scale * ZeroLo, DL, MVT::i8)); + DAG.getTargetConstant(Scale * ZeroLo, DL, MVT::i8)); } else if (!Subtarget.hasSSSE3()) { // If we don't have PSHUFB then its worth avoiding an AND constant mask // by performing 3 byte shifts. Shuffle combining can kick in above that. // TODO: There may be some cases where VSH{LR}DQ+PAND is still better. unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts); Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res, - DAG.getConstant(Scale * Shift, DL, MVT::i8)); + DAG.getTargetConstant(Scale * Shift, DL, MVT::i8)); Shift += Mask[ZeroLo] % NumElts; Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res, - DAG.getConstant(Scale * Shift, DL, MVT::i8)); + DAG.getTargetConstant(Scale * Shift, DL, MVT::i8)); Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res, - DAG.getConstant(Scale * ZeroLo, DL, MVT::i8)); + DAG.getTargetConstant(Scale * ZeroLo, DL, MVT::i8)); } else return SDValue(); @@ -11498,7 +11786,7 @@ static SDValue lowerShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1, "Illegal integer vector type"); V = DAG.getBitcast(ShiftVT, V); V = DAG.getNode(Opcode, DL, ShiftVT, V, - DAG.getConstant(ShiftAmt, DL, MVT::i8)); + DAG.getTargetConstant(ShiftAmt, DL, MVT::i8)); return DAG.getBitcast(VT, V); } @@ -11632,14 +11920,14 @@ static SDValue lowerShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1, uint64_t BitLen, BitIdx; if (matchShuffleAsEXTRQ(VT, V1, V2, Mask, BitLen, BitIdx, Zeroable)) return DAG.getNode(X86ISD::EXTRQI, DL, VT, V1, - DAG.getConstant(BitLen, DL, MVT::i8), - DAG.getConstant(BitIdx, DL, MVT::i8)); + DAG.getTargetConstant(BitLen, DL, MVT::i8), + DAG.getTargetConstant(BitIdx, DL, MVT::i8)); if (matchShuffleAsINSERTQ(VT, V1, V2, Mask, BitLen, BitIdx)) return DAG.getNode(X86ISD::INSERTQI, DL, VT, V1 ? V1 : DAG.getUNDEF(VT), V2 ? V2 : DAG.getUNDEF(VT), - DAG.getConstant(BitLen, DL, MVT::i8), - DAG.getConstant(BitIdx, DL, MVT::i8)); + DAG.getTargetConstant(BitLen, DL, MVT::i8), + DAG.getTargetConstant(BitIdx, DL, MVT::i8)); return SDValue(); } @@ -11686,9 +11974,8 @@ static SDValue lowerShuffleAsSpecificZeroOrAnyExtend( return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), ShMask); }; - // Found a valid zext mask! Try various lowering strategies based on the + // Found a valid a/zext mask! Try various lowering strategies based on the // input type and available ISA extensions. - // TODO: Add AnyExt support. if (Subtarget.hasSSE41()) { // Not worth offsetting 128-bit vectors if scale == 2, a pattern using // PUNPCK will catch this in a later shuffle match. 
@@ -11697,7 +11984,8 @@ static SDValue lowerShuffleAsSpecificZeroOrAnyExtend( MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale), NumElements / Scale); InputV = ShuffleOffset(InputV); - InputV = getExtendInVec(ISD::ZERO_EXTEND, DL, ExtVT, InputV, DAG); + InputV = getExtendInVec(AnyExt ? ISD::ANY_EXTEND : ISD::ZERO_EXTEND, DL, + ExtVT, InputV, DAG); return DAG.getBitcast(VT, InputV); } @@ -11736,8 +12024,8 @@ static SDValue lowerShuffleAsSpecificZeroOrAnyExtend( int LoIdx = Offset * EltBits; SDValue Lo = DAG.getBitcast( MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV, - DAG.getConstant(EltBits, DL, MVT::i8), - DAG.getConstant(LoIdx, DL, MVT::i8))); + DAG.getTargetConstant(EltBits, DL, MVT::i8), + DAG.getTargetConstant(LoIdx, DL, MVT::i8))); if (isUndefUpperHalf(Mask) || !SafeOffset(Offset + 1)) return DAG.getBitcast(VT, Lo); @@ -11745,8 +12033,8 @@ static SDValue lowerShuffleAsSpecificZeroOrAnyExtend( int HiIdx = (Offset + 1) * EltBits; SDValue Hi = DAG.getBitcast( MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV, - DAG.getConstant(EltBits, DL, MVT::i8), - DAG.getConstant(HiIdx, DL, MVT::i8))); + DAG.getTargetConstant(EltBits, DL, MVT::i8), + DAG.getTargetConstant(HiIdx, DL, MVT::i8))); return DAG.getBitcast(VT, DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, Lo, Hi)); } @@ -11759,8 +12047,12 @@ static SDValue lowerShuffleAsSpecificZeroOrAnyExtend( SDValue PSHUFBMask[16]; for (int i = 0; i < 16; ++i) { int Idx = Offset + (i / Scale); - PSHUFBMask[i] = DAG.getConstant( - (i % Scale == 0 && SafeOffset(Idx)) ? Idx : 0x80, DL, MVT::i8); + if ((i % Scale == 0 && SafeOffset(Idx))) { + PSHUFBMask[i] = DAG.getConstant(Idx, DL, MVT::i8); + continue; + } + PSHUFBMask[i] = + AnyExt ? DAG.getUNDEF(MVT::i8) : DAG.getConstant(0x80, DL, MVT::i8); } InputV = DAG.getBitcast(MVT::v16i8, InputV); return DAG.getBitcast( @@ -12052,9 +12344,9 @@ static SDValue lowerShuffleAsElementInsertion( V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle); } else { V2 = DAG.getBitcast(MVT::v16i8, V2); - V2 = DAG.getNode( - X86ISD::VSHLDQ, DL, MVT::v16i8, V2, - DAG.getConstant(V2Index * EltVT.getSizeInBits() / 8, DL, MVT::i8)); + V2 = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, V2, + DAG.getTargetConstant( + V2Index * EltVT.getSizeInBits() / 8, DL, MVT::i8)); V2 = DAG.getBitcast(VT, V2); } } @@ -12294,7 +12586,7 @@ static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1, // If we can't broadcast from a register, check that the input is a load. if (!BroadcastFromReg && !isShuffleFoldableLoad(V)) return SDValue(); - } else if (MayFoldLoad(V) && !cast(V)->isVolatile()) { + } else if (MayFoldLoad(V) && cast(V)->isSimple()) { // 32-bit targets need to load i64 as a f64 and then bitcast the result. if (!Subtarget.is64Bit() && VT.getScalarType() == MVT::i64) { BroadcastVT = MVT::getVectorVT(MVT::f64, VT.getVectorNumElements()); @@ -12486,7 +12778,7 @@ static SDValue lowerShuffleAsInsertPS(const SDLoc &DL, SDValue V1, SDValue V2, // Insert the V2 element into the desired position. return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2, - DAG.getConstant(InsertPSMask, DL, MVT::i8)); + DAG.getTargetConstant(InsertPSMask, DL, MVT::i8)); } /// Try to lower a shuffle as a permute of the inputs followed by an @@ -12635,14 +12927,14 @@ static SDValue lowerV2F64Shuffle(const SDLoc &DL, ArrayRef Mask, // If we have AVX, we can use VPERMILPS which will allow folding a load // into the shuffle. 
return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1, - DAG.getConstant(SHUFPDMask, DL, MVT::i8)); + DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8)); } return DAG.getNode( X86ISD::SHUFP, DL, MVT::v2f64, Mask[0] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1, Mask[1] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1, - DAG.getConstant(SHUFPDMask, DL, MVT::i8)); + DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8)); } assert(Mask[0] >= 0 && "No undef lanes in multi-input v2 shuffles!"); assert(Mask[1] >= 0 && "No undef lanes in multi-input v2 shuffles!"); @@ -12688,7 +12980,7 @@ static SDValue lowerV2F64Shuffle(const SDLoc &DL, ArrayRef Mask, unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1); return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V2, - DAG.getConstant(SHUFPDMask, DL, MVT::i8)); + DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8)); } /// Handle lowering of 2-lane 64-bit integer shuffles. @@ -12996,10 +13288,12 @@ static SDValue lowerV4I32Shuffle(const SDLoc &DL, ArrayRef Mask, int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; }); if (NumV2Elements == 0) { - // Check for being able to broadcast a single element. - if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i32, V1, V2, - Mask, Subtarget, DAG)) - return Broadcast; + // Try to use broadcast unless the mask only has one non-undef element. + if (count_if(Mask, [](int M) { return M >= 0 && M < 4; }) > 1) { + if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i32, V1, V2, + Mask, Subtarget, DAG)) + return Broadcast; + } // Straight shuffle of a single input vector. For everything from SSE2 // onward this has a single fast instruction with no scary immediates. @@ -13680,16 +13974,16 @@ static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef Mask, int NumV2Inputs = count_if(Mask, [](int M) { return M >= 8; }); if (NumV2Inputs == 0) { - // Check for being able to broadcast a single element. - if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i16, V1, V2, - Mask, Subtarget, DAG)) - return Broadcast; - // Try to use shift instructions. if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask, Zeroable, Subtarget, DAG)) return Shift; + // Check for being able to broadcast a single element. + if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i16, V1, V2, + Mask, Subtarget, DAG)) + return Broadcast; + // Use dedicated unpack instructions for masks that match their pattern. if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG)) return V; @@ -13984,8 +14278,16 @@ static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef Mask, DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle)); // Unpack the bytes to form the i16s that will be shuffled into place. + bool EvenInUse = false, OddInUse = false; + for (int i = 0; i < 16; i += 2) { + EvenInUse |= (Mask[i + 0] >= 0); + OddInUse |= (Mask[i + 1] >= 0); + if (EvenInUse && OddInUse) + break; + } V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL, - MVT::v16i8, V1, V1); + MVT::v16i8, EvenInUse ? V1 : DAG.getUNDEF(MVT::v16i8), + OddInUse ? V1 : DAG.getUNDEF(MVT::v16i8)); int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1}; for (int i = 0; i < 16; ++i) @@ -14100,11 +14402,10 @@ static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef Mask, // First we need to zero all the dropped bytes. assert(NumEvenDrops <= 3 && "No support for dropping even elements more than 3 times."); - // We use the mask type to pick which bytes are preserved based on how many - // elements are dropped. 
- MVT MaskVTs[] = { MVT::v8i16, MVT::v4i32, MVT::v2i64 }; - SDValue ByteClearMask = DAG.getBitcast( - MVT::v16i8, DAG.getConstant(0xFF, DL, MaskVTs[NumEvenDrops - 1])); + SmallVector ByteClearOps(16, DAG.getConstant(0, DL, MVT::i8)); + for (unsigned i = 0; i != 16; i += 1 << NumEvenDrops) + ByteClearOps[i] = DAG.getConstant(0xFF, DL, MVT::i8); + SDValue ByteClearMask = DAG.getBuildVector(MVT::v16i8, DL, ByteClearOps); V1 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V1, ByteClearMask); if (!IsSingleInput) V2 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V2, ByteClearMask); @@ -14448,16 +14749,14 @@ static SDValue lowerShuffleAsLanePermuteAndPermute( return DAG.getVectorShuffle(VT, DL, LanePermute, DAG.getUNDEF(VT), PermMask); } -/// Lower a vector shuffle crossing multiple 128-bit lanes as -/// a permutation and blend of those lanes. +/// Lower a vector shuffle crossing multiple 128-bit lanes by shuffling one +/// source with a lane permutation. /// -/// This essentially blends the out-of-lane inputs to each lane into the lane -/// from a permuted copy of the vector. This lowering strategy results in four -/// instructions in the worst case for a single-input cross lane shuffle which -/// is lower than any other fully general cross-lane shuffle strategy I'm aware -/// of. Special cases for each particular shuffle pattern should be handled -/// prior to trying this lowering. -static SDValue lowerShuffleAsLanePermuteAndBlend( +/// This lowering strategy results in four instructions in the worst case for a +/// single-input cross lane shuffle which is lower than any other fully general +/// cross-lane shuffle strategy I'm aware of. Special cases for each particular +/// shuffle pattern should be handled prior to trying this lowering. +static SDValue lowerShuffleAsLanePermuteAndShuffle( const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, SelectionDAG &DAG, const X86Subtarget &Subtarget) { // FIXME: This should probably be generalized for 512-bit vectors as well. @@ -14484,24 +14783,28 @@ static SDValue lowerShuffleAsLanePermuteAndBlend( return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG); } + // TODO - we could support shuffling V2 in the Flipped input. assert(V2.isUndef() && "This last part of this routine only works on single input shuffles"); - SmallVector FlippedBlendMask(Size); - for (int i = 0; i < Size; ++i) - FlippedBlendMask[i] = - Mask[i] < 0 ? -1 : (((Mask[i] % Size) / LaneSize == i / LaneSize) - ? Mask[i] - : Mask[i] % LaneSize + - (i / LaneSize) * LaneSize + Size); + SmallVector InLaneMask(Mask.begin(), Mask.end()); + for (int i = 0; i < Size; ++i) { + int &M = InLaneMask[i]; + if (M < 0) + continue; + if (((M % Size) / LaneSize) != (i / LaneSize)) + M = (M % LaneSize) + ((i / LaneSize) * LaneSize) + Size; + } + assert(!is128BitLaneCrossingShuffleMask(VT, InLaneMask) && + "In-lane shuffle mask expected"); - // Flip the vector, and blend the results which should now be in-lane. + // Flip the lanes, and shuffle the results which should now be in-lane. MVT PVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64; SDValue Flipped = DAG.getBitcast(PVT, V1); - Flipped = DAG.getVectorShuffle(PVT, DL, Flipped, DAG.getUNDEF(PVT), - { 2, 3, 0, 1 }); + Flipped = + DAG.getVectorShuffle(PVT, DL, Flipped, DAG.getUNDEF(PVT), {2, 3, 0, 1}); Flipped = DAG.getBitcast(VT, Flipped); - return DAG.getVectorShuffle(VT, DL, V1, Flipped, FlippedBlendMask); + return DAG.getVectorShuffle(VT, DL, V1, Flipped, InLaneMask); } /// Handle lowering 2-lane 128-bit shuffles. 
@@ -14565,8 +14868,8 @@ static SDValue lowerV2X128Shuffle(const SDLoc &DL, MVT VT, SDValue V1, if (WidenedMask[0] < 2 && WidenedMask[1] >= 2) { unsigned PermMask = ((WidenedMask[0] % 2) << 0) | ((WidenedMask[1] % 2) << 1); - return DAG.getNode(X86ISD::SHUF128, DL, VT, V1, V2, - DAG.getConstant(PermMask, DL, MVT::i8)); + return DAG.getNode(X86ISD::SHUF128, DL, VT, V1, V2, + DAG.getTargetConstant(PermMask, DL, MVT::i8)); } } } @@ -14598,7 +14901,7 @@ static SDValue lowerV2X128Shuffle(const SDLoc &DL, MVT VT, SDValue V1, V2 = DAG.getUNDEF(VT); return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2, - DAG.getConstant(PermMask, DL, MVT::i8)); + DAG.getTargetConstant(PermMask, DL, MVT::i8)); } /// Lower a vector shuffle by first fixing the 128-bit lanes and then @@ -14616,26 +14919,26 @@ static SDValue lowerShuffleAsLanePermuteAndRepeatedMask( if (is128BitLaneRepeatedShuffleMask(VT, Mask)) return SDValue(); - int Size = Mask.size(); + int NumElts = Mask.size(); int NumLanes = VT.getSizeInBits() / 128; - int LaneSize = 128 / VT.getScalarSizeInBits(); - SmallVector RepeatMask(LaneSize, -1); + int NumLaneElts = 128 / VT.getScalarSizeInBits(); + SmallVector RepeatMask(NumLaneElts, -1); SmallVector, 2> LaneSrcs(NumLanes, {{-1, -1}}); // First pass will try to fill in the RepeatMask from lanes that need two // sources. for (int Lane = 0; Lane != NumLanes; ++Lane) { - int Srcs[2] = { -1, -1 }; - SmallVector InLaneMask(LaneSize, -1); - for (int i = 0; i != LaneSize; ++i) { - int M = Mask[(Lane * LaneSize) + i]; + int Srcs[2] = {-1, -1}; + SmallVector InLaneMask(NumLaneElts, -1); + for (int i = 0; i != NumLaneElts; ++i) { + int M = Mask[(Lane * NumLaneElts) + i]; if (M < 0) continue; // Determine which of the possible input lanes (NumLanes from each source) // this element comes from. Assign that as one of the sources for this // lane. We can assign up to 2 sources for this lane. If we run out // sources we can't do anything. - int LaneSrc = M / LaneSize; + int LaneSrc = M / NumLaneElts; int Src; if (Srcs[0] < 0 || Srcs[0] == LaneSrc) Src = 0; @@ -14645,7 +14948,7 @@ static SDValue lowerShuffleAsLanePermuteAndRepeatedMask( return SDValue(); Srcs[Src] = LaneSrc; - InLaneMask[i] = (M % LaneSize) + Src * Size; + InLaneMask[i] = (M % NumLaneElts) + Src * NumElts; } // If this lane has two sources, see if it fits with the repeat mask so far. @@ -14701,23 +15004,23 @@ static SDValue lowerShuffleAsLanePermuteAndRepeatedMask( if (LaneSrcs[Lane][0] >= 0) continue; - for (int i = 0; i != LaneSize; ++i) { - int M = Mask[(Lane * LaneSize) + i]; + for (int i = 0; i != NumLaneElts; ++i) { + int M = Mask[(Lane * NumLaneElts) + i]; if (M < 0) continue; // If RepeatMask isn't defined yet we can define it ourself. 
if (RepeatMask[i] < 0) - RepeatMask[i] = M % LaneSize; + RepeatMask[i] = M % NumLaneElts; - if (RepeatMask[i] < Size) { - if (RepeatMask[i] != M % LaneSize) + if (RepeatMask[i] < NumElts) { + if (RepeatMask[i] != M % NumLaneElts) return SDValue(); - LaneSrcs[Lane][0] = M / LaneSize; + LaneSrcs[Lane][0] = M / NumLaneElts; } else { - if (RepeatMask[i] != ((M % LaneSize) + Size)) + if (RepeatMask[i] != ((M % NumLaneElts) + NumElts)) return SDValue(); - LaneSrcs[Lane][1] = M / LaneSize; + LaneSrcs[Lane][1] = M / NumLaneElts; } } @@ -14725,14 +15028,14 @@ static SDValue lowerShuffleAsLanePermuteAndRepeatedMask( return SDValue(); } - SmallVector NewMask(Size, -1); + SmallVector NewMask(NumElts, -1); for (int Lane = 0; Lane != NumLanes; ++Lane) { int Src = LaneSrcs[Lane][0]; - for (int i = 0; i != LaneSize; ++i) { + for (int i = 0; i != NumLaneElts; ++i) { int M = -1; if (Src >= 0) - M = Src * LaneSize + i; - NewMask[Lane * LaneSize + i] = M; + M = Src * NumLaneElts + i; + NewMask[Lane * NumLaneElts + i] = M; } } SDValue NewV1 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask); @@ -14745,11 +15048,11 @@ static SDValue lowerShuffleAsLanePermuteAndRepeatedMask( for (int Lane = 0; Lane != NumLanes; ++Lane) { int Src = LaneSrcs[Lane][1]; - for (int i = 0; i != LaneSize; ++i) { + for (int i = 0; i != NumLaneElts; ++i) { int M = -1; if (Src >= 0) - M = Src * LaneSize + i; - NewMask[Lane * LaneSize + i] = M; + M = Src * NumLaneElts + i; + NewMask[Lane * NumLaneElts + i] = M; } } SDValue NewV2 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask); @@ -14760,12 +15063,12 @@ static SDValue lowerShuffleAsLanePermuteAndRepeatedMask( cast(NewV2)->getMask() == Mask) return SDValue(); - for (int i = 0; i != Size; ++i) { - NewMask[i] = RepeatMask[i % LaneSize]; + for (int i = 0; i != NumElts; ++i) { + NewMask[i] = RepeatMask[i % NumLaneElts]; if (NewMask[i] < 0) continue; - NewMask[i] += (i / LaneSize) * LaneSize; + NewMask[i] += (i / NumLaneElts) * NumLaneElts; } return DAG.getVectorShuffle(VT, DL, NewV1, NewV2, NewMask); } @@ -14831,14 +15134,13 @@ getHalfShuffleMask(ArrayRef Mask, MutableArrayRef HalfMask, static SDValue getShuffleHalfVectors(const SDLoc &DL, SDValue V1, SDValue V2, ArrayRef HalfMask, int HalfIdx1, int HalfIdx2, bool UndefLower, - SelectionDAG &DAG) { + SelectionDAG &DAG, bool UseConcat = false) { assert(V1.getValueType() == V2.getValueType() && "Different sized vectors?"); assert(V1.getValueType().isSimple() && "Expecting only simple types"); MVT VT = V1.getSimpleValueType(); - unsigned NumElts = VT.getVectorNumElements(); - unsigned HalfNumElts = NumElts / 2; - MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(), HalfNumElts); + MVT HalfVT = VT.getHalfNumVectorElementsVT(); + unsigned HalfNumElts = HalfVT.getVectorNumElements(); auto getHalfVector = [&](int HalfIdx) { if (HalfIdx < 0) @@ -14853,6 +15155,14 @@ static SDValue getShuffleHalfVectors(const SDLoc &DL, SDValue V1, SDValue V2, SDValue Half1 = getHalfVector(HalfIdx1); SDValue Half2 = getHalfVector(HalfIdx2); SDValue V = DAG.getVectorShuffle(HalfVT, DL, Half1, Half2, HalfMask); + if (UseConcat) { + SDValue Op0 = V; + SDValue Op1 = DAG.getUNDEF(HalfVT); + if (UndefLower) + std::swap(Op0, Op1); + return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Op0, Op1); + } + unsigned Offset = UndefLower ? 
HalfNumElts : 0; return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V, DAG.getIntPtrConstant(Offset, DL)); @@ -14877,9 +15187,8 @@ static SDValue lowerShuffleWithUndefHalf(const SDLoc &DL, MVT VT, SDValue V1, // Upper half is undef and lower half is whole upper subvector. // e.g. vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u> - unsigned NumElts = VT.getVectorNumElements(); - unsigned HalfNumElts = NumElts / 2; - MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(), HalfNumElts); + MVT HalfVT = VT.getHalfNumVectorElementsVT(); + unsigned HalfNumElts = HalfVT.getVectorNumElements(); if (!UndefLower && isSequentialOrUndefInRange(Mask, 0, HalfNumElts, HalfNumElts)) { SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1, @@ -15155,11 +15464,19 @@ static SDValue lowerShuffleAsRepeatedMaskAndLanePermute( } static bool matchShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2, - unsigned &ShuffleImm, ArrayRef Mask) { + bool &ForceV1Zero, bool &ForceV2Zero, + unsigned &ShuffleImm, ArrayRef Mask, + const APInt &Zeroable) { int NumElts = VT.getVectorNumElements(); assert(VT.getScalarSizeInBits() == 64 && (NumElts == 2 || NumElts == 4 || NumElts == 8) && "Unexpected data type for VSHUFPD"); + assert(isUndefOrZeroOrInRange(Mask, 0, 2 * NumElts) && + "Illegal shuffle mask"); + + bool ZeroLane[2] = { true, true }; + for (int i = 0; i < NumElts; ++i) + ZeroLane[i & 1] &= Zeroable[i]; // Mask for V8F64: 0/1, 8/9, 2/3, 10/11, 4/5, .. // Mask for V4F64; 0/1, 4/5, 2/3, 6/7.. @@ -15167,7 +15484,7 @@ static bool matchShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2, bool ShufpdMask = true; bool CommutableMask = true; for (int i = 0; i < NumElts; ++i) { - if (Mask[i] == SM_SentinelUndef) + if (Mask[i] == SM_SentinelUndef || ZeroLane[i & 1]) continue; if (Mask[i] < 0) return false; @@ -15180,30 +15497,77 @@ static bool matchShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2, ShuffleImm |= (Mask[i] % 2) << i; } - if (ShufpdMask) - return true; - if (CommutableMask) { + if (!ShufpdMask && !CommutableMask) + return false; + + if (!ShufpdMask && CommutableMask) std::swap(V1, V2); - return true; - } - return false; + ForceV1Zero = ZeroLane[0]; + ForceV2Zero = ZeroLane[1]; + return true; } -static SDValue lowerShuffleWithSHUFPD(const SDLoc &DL, MVT VT, - ArrayRef Mask, SDValue V1, - SDValue V2, SelectionDAG &DAG) { - assert((VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v8f64)&& +static SDValue lowerShuffleWithSHUFPD(const SDLoc &DL, MVT VT, SDValue V1, + SDValue V2, ArrayRef Mask, + const APInt &Zeroable, + const X86Subtarget &Subtarget, + SelectionDAG &DAG) { + assert((VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v8f64) && "Unexpected data type for VSHUFPD"); unsigned Immediate = 0; - if (!matchShuffleWithSHUFPD(VT, V1, V2, Immediate, Mask)) + bool ForceV1Zero = false, ForceV2Zero = false; + if (!matchShuffleWithSHUFPD(VT, V1, V2, ForceV1Zero, ForceV2Zero, Immediate, + Mask, Zeroable)) return SDValue(); + // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs. + if (ForceV1Zero) + V1 = getZeroVector(VT, Subtarget, DAG, DL); + if (ForceV2Zero) + V2 = getZeroVector(VT, Subtarget, DAG, DL); + return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2, - DAG.getConstant(Immediate, DL, MVT::i8)); + DAG.getTargetConstant(Immediate, DL, MVT::i8)); } +// Look for {0, 8, 16, 24, 32, 40, 48, 56 } in the first 8 elements. Followed +// by zeroable elements in the remaining 24 elements. Turn this into two +// vmovqb instructions shuffled together. 
+static SDValue lowerShuffleAsVTRUNCAndUnpack(const SDLoc &DL, MVT VT, + SDValue V1, SDValue V2, + ArrayRef Mask, + const APInt &Zeroable, + SelectionDAG &DAG) { + assert(VT == MVT::v32i8 && "Unexpected type!"); + + // The first 8 indices should be every 8th element. + if (!isSequentialOrUndefInRange(Mask, 0, 8, 0, 8)) + return SDValue(); + + // Remaining elements need to be zeroable. + if (Zeroable.countLeadingOnes() < (Mask.size() - 8)) + return SDValue(); + + V1 = DAG.getBitcast(MVT::v4i64, V1); + V2 = DAG.getBitcast(MVT::v4i64, V2); + + V1 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, V1); + V2 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, V2); + + // The VTRUNCs will put 0s in the upper 12 bytes. Use them to put zeroes in + // the upper bits of the result using an unpckldq. + SDValue Unpack = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2, + { 0, 1, 2, 3, 16, 17, 18, 19, + 4, 5, 6, 7, 20, 21, 22, 23 }); + // Insert the unpckldq into a zero vector to widen to v32i8. + return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v32i8, + DAG.getConstant(0, DL, MVT::v32i8), Unpack, + DAG.getIntPtrConstant(0, DL)); +} + + /// Handle lowering of 4-lane 64-bit floating point shuffles. /// /// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2 @@ -15236,7 +15600,7 @@ static SDValue lowerV4F64Shuffle(const SDLoc &DL, ArrayRef Mask, unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) | ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3); return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1, - DAG.getConstant(VPERMILPMask, DL, MVT::i8)); + DAG.getTargetConstant(VPERMILPMask, DL, MVT::i8)); } // With AVX2 we have direct support for this permutation. @@ -15256,8 +15620,8 @@ static SDValue lowerV4F64Shuffle(const SDLoc &DL, ArrayRef Mask, return V; // Otherwise, fall back. - return lowerShuffleAsLanePermuteAndBlend(DL, MVT::v4f64, V1, V2, Mask, DAG, - Subtarget); + return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v4f64, V1, V2, Mask, + DAG, Subtarget); } // Use dedicated unpack instructions for masks that match their pattern. @@ -15269,7 +15633,8 @@ static SDValue lowerV4F64Shuffle(const SDLoc &DL, ArrayRef Mask, return Blend; // Check if the blend happens to exactly fit that of SHUFPD. - if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v4f64, Mask, V1, V2, DAG)) + if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v4f64, V1, V2, Mask, + Zeroable, Subtarget, DAG)) return Op; // If we have one input in place, then we can permute the other input and @@ -15473,8 +15838,8 @@ static SDValue lowerV8F32Shuffle(const SDLoc &DL, ArrayRef Mask, return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32, VPermMask, V1); // Otherwise, fall back. 
- return lowerShuffleAsLanePermuteAndBlend(DL, MVT::v8f32, V1, V2, Mask, - DAG, Subtarget); + return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v8f32, V1, V2, Mask, + DAG, Subtarget); } // Try to simplify this by merging 128-bit lanes to enable a lane-based @@ -15681,8 +16046,8 @@ static SDValue lowerV16I16Shuffle(const SDLoc &DL, ArrayRef Mask, DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget)) return V; - return lowerShuffleAsLanePermuteAndBlend(DL, MVT::v16i16, V1, V2, Mask, - DAG, Subtarget); + return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v16i16, V1, V2, Mask, + DAG, Subtarget); } SmallVector RepeatedMask; @@ -15780,8 +16145,8 @@ static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef Mask, DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget)) return V; - return lowerShuffleAsLanePermuteAndBlend(DL, MVT::v32i8, V1, V2, Mask, DAG, - Subtarget); + return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v32i8, V1, V2, Mask, + DAG, Subtarget); } if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i8, Mask, V1, V2, @@ -15803,6 +16168,14 @@ static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef Mask, DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget)) return V; + // Look for {0, 8, 16, 24, 32, 40, 48, 56 } in the first 8 elements. Followed + // by zeroable elements in the remaining 24 elements. Turn this into two + // vmovqb instructions shuffled together. + if (Subtarget.hasVLX()) + if (SDValue V = lowerShuffleAsVTRUNCAndUnpack(DL, MVT::v32i8, V1, V2, + Mask, Zeroable, DAG)) + return V; + // Otherwise fall back on generic lowering. return lowerShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG); @@ -15974,7 +16347,7 @@ static SDValue lowerV4X128Shuffle(const SDLoc &DL, MVT VT, ArrayRef Mask, } return DAG.getNode(X86ISD::SHUF128, DL, VT, Ops[0], Ops[1], - DAG.getConstant(PermMask, DL, MVT::i8)); + DAG.getTargetConstant(PermMask, DL, MVT::i8)); } /// Handle lowering of 8-lane 64-bit floating point shuffles. @@ -15999,7 +16372,7 @@ static SDValue lowerV8F64Shuffle(const SDLoc &DL, ArrayRef Mask, ((Mask[4] == 5) << 4) | ((Mask[5] == 5) << 5) | ((Mask[6] == 7) << 6) | ((Mask[7] == 7) << 7); return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f64, V1, - DAG.getConstant(VPERMILPMask, DL, MVT::i8)); + DAG.getTargetConstant(VPERMILPMask, DL, MVT::i8)); } SmallVector RepeatedMask; @@ -16016,7 +16389,8 @@ static SDValue lowerV8F64Shuffle(const SDLoc &DL, ArrayRef Mask, return Unpck; // Check if the blend happens to exactly fit that of SHUFPD. - if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v8f64, Mask, V1, V2, DAG)) + if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v8f64, V1, V2, Mask, + Zeroable, Subtarget, DAG)) return Op; if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8f64, Zeroable, Mask, V1, V2, @@ -16389,6 +16763,49 @@ static SDValue lower512BitShuffle(const SDLoc &DL, ArrayRef Mask, } } +static SDValue lower1BitShuffleAsKSHIFTR(const SDLoc &DL, ArrayRef Mask, + MVT VT, SDValue V1, SDValue V2, + const X86Subtarget &Subtarget, + SelectionDAG &DAG) { + // Shuffle should be unary. + if (!V2.isUndef()) + return SDValue(); + + int ShiftAmt = -1; + int NumElts = Mask.size(); + for (int i = 0; i != NumElts; ++i) { + int M = Mask[i]; + assert((M == SM_SentinelUndef || (0 <= M && M < NumElts)) && + "Unexpected mask index."); + if (M < 0) + continue; + + // The first non-undef element determines our shift amount. + if (ShiftAmt < 0) { + ShiftAmt = M - i; + // Need to be shifting right. + if (ShiftAmt <= 0) + return SDValue(); + } + // All non-undef elements must shift by the same amount. 
+ if (ShiftAmt != M - i) + return SDValue(); + } + assert(ShiftAmt >= 0 && "All undef?"); + + // Great we found a shift right. + MVT WideVT = VT; + if ((!Subtarget.hasDQI() && NumElts == 8) || NumElts < 8) + WideVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1; + SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideVT, + DAG.getUNDEF(WideVT), V1, + DAG.getIntPtrConstant(0, DL)); + Res = DAG.getNode(X86ISD::KSHIFTR, DL, WideVT, Res, + DAG.getTargetConstant(ShiftAmt, DL, MVT::i8)); + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res, + DAG.getIntPtrConstant(0, DL)); +} + // Determine if this shuffle can be implemented with a KSHIFT instruction. // Returns the shift amount if possible or -1 if not. This is a simplified // version of matchShuffleAsShift. @@ -16434,13 +16851,20 @@ static SDValue lower1BitShuffle(const SDLoc &DL, ArrayRef Mask, assert(Subtarget.hasAVX512() && "Cannot lower 512-bit vectors w/o basic ISA!"); - unsigned NumElts = Mask.size(); + int NumElts = Mask.size(); // Try to recognize shuffles that are just padding a subvector with zeros. - unsigned SubvecElts = 0; - for (int i = 0; i != (int)NumElts; ++i) { - if (Mask[i] >= 0 && Mask[i] != i) - break; + int SubvecElts = 0; + int Src = -1; + for (int i = 0; i != NumElts; ++i) { + if (Mask[i] >= 0) { + // Grab the source from the first valid mask. All subsequent elements need + // to use this same source. + if (Src < 0) + Src = Mask[i] / NumElts; + if (Src != (Mask[i] / NumElts) || (Mask[i] % NumElts) != i) + break; + } ++SubvecElts; } @@ -16451,30 +16875,54 @@ static SDValue lower1BitShuffle(const SDLoc &DL, ArrayRef Mask, // Make sure the number of zeroable bits in the top at least covers the bits // not covered by the subvector. - if (Zeroable.countLeadingOnes() >= (NumElts - SubvecElts)) { + if ((int)Zeroable.countLeadingOnes() >= (NumElts - SubvecElts)) { + assert(Src >= 0 && "Expected a source!"); MVT ExtractVT = MVT::getVectorVT(MVT::i1, SubvecElts); SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtractVT, - V1, DAG.getIntPtrConstant(0, DL)); + Src == 0 ? V1 : V2, + DAG.getIntPtrConstant(0, DL)); return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, - getZeroVector(VT, Subtarget, DAG, DL), + DAG.getConstant(0, DL, VT), Extract, DAG.getIntPtrConstant(0, DL)); } + // Try a simple shift right with undef elements. Later we'll try with zeros. + if (SDValue Shift = lower1BitShuffleAsKSHIFTR(DL, Mask, VT, V1, V2, Subtarget, + DAG)) + return Shift; + // Try to match KSHIFTs. - // TODO: Support narrower than legal shifts by widening and extracting. - if (NumElts >= 16 || (Subtarget.hasDQI() && NumElts == 8)) { - unsigned Offset = 0; - for (SDValue V : { V1, V2 }) { - unsigned Opcode; - int ShiftAmt = match1BitShuffleAsKSHIFT(Opcode, Mask, Offset, Zeroable); - if (ShiftAmt >= 0) - return DAG.getNode(Opcode, DL, VT, V, - DAG.getConstant(ShiftAmt, DL, MVT::i8)); - Offset += NumElts; // Increment for next iteration. + unsigned Offset = 0; + for (SDValue V : { V1, V2 }) { + unsigned Opcode; + int ShiftAmt = match1BitShuffleAsKSHIFT(Opcode, Mask, Offset, Zeroable); + if (ShiftAmt >= 0) { + MVT WideVT = VT; + if ((!Subtarget.hasDQI() && NumElts == 8) || NumElts < 8) + WideVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1; + SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideVT, + DAG.getUNDEF(WideVT), V, + DAG.getIntPtrConstant(0, DL)); + // Widened right shifts need two shifts to ensure we shift in zeroes. 
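For readers following the new lower1BitShuffleAsKSHIFTR helper above, here is a minimal standalone sketch of the same mask check, independent of the LLVM APIs; std::vector<int> stands in for ArrayRef<int> and -1 marks an undef lane:

  #include <cassert>
  #include <vector>

  // Return the uniform right-shift amount implied by Mask, or -1 if the mask
  // is not a right shift. An entry of -1 marks an undef lane.
  static int matchMaskAsShiftRight(const std::vector<int> &Mask) {
    int NumElts = (int)Mask.size();
    int ShiftAmt = -1;
    for (int i = 0; i != NumElts; ++i) {
      int M = Mask[i];
      if (M < 0)
        continue;                   // Undef lanes may take any value.
      if (ShiftAmt < 0) {
        ShiftAmt = M - i;           // First defined lane fixes the amount.
        if (ShiftAmt <= 0)
          return -1;                // Must move elements toward lane 0.
      } else if (ShiftAmt != M - i) {
        return -1;                  // Every defined lane must agree.
      }
    }
    return ShiftAmt;                // Stays -1 for an all-undef mask.
  }

  int main() {
    assert(matchMaskAsShiftRight({1, 2, 3, 4, 5, 6, 7, -1}) == 1);
    assert(matchMaskAsShiftRight({2, 3, -1, -1, 6, 7, -1, -1}) == 2);
    assert(matchMaskAsShiftRight({0, 1, 2, 3}) == -1);   // identity, not a shift
    assert(matchMaskAsShiftRight({1, 3, -1, -1}) == -1); // lanes disagree
    return 0;
  }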
+ if (Opcode == X86ISD::KSHIFTR && WideVT != VT) { + int WideElts = WideVT.getVectorNumElements(); + // Shift left to put the original vector in the MSBs of the new size. + Res = DAG.getNode(X86ISD::KSHIFTL, DL, WideVT, Res, + DAG.getTargetConstant(WideElts - NumElts, DL, MVT::i8)); + // Increase the shift amount to account for the left shift. + ShiftAmt += WideElts - NumElts; + } + + Res = DAG.getNode(Opcode, DL, WideVT, Res, + DAG.getTargetConstant(ShiftAmt, DL, MVT::i8)); + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res, + DAG.getIntPtrConstant(0, DL)); } + Offset += NumElts; // Increment for next iteration. } + MVT ExtVT; switch (VT.SimpleTy) { default: @@ -16594,7 +17042,7 @@ static bool canonicalizeShuffleMaskWithCommute(ArrayRef Mask) { static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { ShuffleVectorSDNode *SVOp = cast(Op); - ArrayRef Mask = SVOp->getMask(); + ArrayRef OrigMask = SVOp->getMask(); SDValue V1 = Op.getOperand(0); SDValue V2 = Op.getOperand(1); MVT VT = Op.getSimpleValueType(); @@ -16620,8 +17068,8 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget &Subtarget, // undef as well. This makes it easier to match the shuffle based solely on // the mask. if (V2IsUndef && - any_of(Mask, [NumElements](int M) { return M >= NumElements; })) { - SmallVector NewMask(Mask.begin(), Mask.end()); + any_of(OrigMask, [NumElements](int M) { return M >= NumElements; })) { + SmallVector NewMask(OrigMask.begin(), OrigMask.end()); for (int &M : NewMask) if (M >= NumElements) M = -1; @@ -16629,15 +17077,16 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget &Subtarget, } // Check for illegal shuffle mask element index values. - int MaskUpperLimit = Mask.size() * (V2IsUndef ? 1 : 2); (void)MaskUpperLimit; - assert(llvm::all_of(Mask, + int MaskUpperLimit = OrigMask.size() * (V2IsUndef ? 1 : 2); + (void)MaskUpperLimit; + assert(llvm::all_of(OrigMask, [&](int M) { return -1 <= M && M < MaskUpperLimit; }) && "Out of bounds shuffle index"); // We actually see shuffles that are entirely re-arrangements of a set of // zero inputs. This mostly happens while decomposing complex shuffles into // simple ones. Directly lower these as a buildvector of zeros. - APInt Zeroable = computeZeroableShuffleElements(Mask, V1, V2); + APInt Zeroable = computeZeroableShuffleElements(OrigMask, V1, V2); if (Zeroable.isAllOnesValue()) return getZeroVector(VT, Subtarget, DAG, DL); @@ -16645,11 +17094,11 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget &Subtarget, // Create an alternative mask with info about zeroable elements. // Here we do not set undef elements as zeroable. - SmallVector ZeroableMask(Mask.begin(), Mask.end()); + SmallVector ZeroableMask(OrigMask.begin(), OrigMask.end()); if (V2IsZero) { assert(!Zeroable.isNullValue() && "V2's non-undef elements are used?!"); for (int i = 0; i != NumElements; ++i) - if (Mask[i] != SM_SentinelUndef && Zeroable[i]) + if (OrigMask[i] != SM_SentinelUndef && Zeroable[i]) ZeroableMask[i] = SM_SentinelZero; } @@ -16664,7 +17113,7 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget &Subtarget, // by obfuscating the operands with bitcasts. // TODO: Avoid lowering directly from this top-level function: make this // a query (canLowerAsBroadcast) and defer lowering to the type-based calls. 
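The two-shift sequence above, which widens a narrow mask register and then shifts zeros in from the top when the shuffle needs zeros rather than undef lanes, can be modeled with plain integer shifts. A small sketch, treating a 16-bit integer as a stand-in for a k-register that holds a v8i1 value in its low bits with garbage above:

  #include <cassert>
  #include <cstdint>

  // Emulate "shift an 8-bit mask right by Amt, filling with zeros" using only
  // 16-bit register shifts, mirroring the KSHIFTL + widened KSHIFTR sequence.
  static uint8_t shiftMask8ViaWide16(uint16_t WideReg, unsigned Amt) {
    WideReg <<= 8;           // KSHIFTL: move the real bits to the MSBs, drop garbage.
    WideReg >>= (8 + Amt);   // KSHIFTR: original amount plus the widening offset.
    return (uint8_t)WideReg; // Extract the low v8i1 result.
  }

  int main() {
    uint16_t Reg = 0xAB00 | 0b10110101; // unknown upper bits, real 8-bit mask below
    assert(shiftMask8ViaWide16(Reg, 2) == 0b00101101);
    assert(shiftMask8ViaWide16(Reg, 7) == 0b00000001);
    return 0;
  }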
- if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, Mask, + if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, OrigMask, Subtarget, DAG)) return Broadcast; @@ -16700,8 +17149,11 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget &Subtarget, } // Commute the shuffle if it will improve canonicalization. - if (canonicalizeShuffleMaskWithCommute(Mask)) - return DAG.getCommutedVectorShuffle(*SVOp); + SmallVector Mask(OrigMask.begin(), OrigMask.end()); + if (canonicalizeShuffleMaskWithCommute(Mask)) { + ShuffleVectorSDNode::commuteMask(Mask); + std::swap(V1, V2); + } if (SDValue V = lowerShuffleWithVPMOV(DL, Mask, VT, V1, V2, DAG, Subtarget)) return V; @@ -16910,7 +17362,7 @@ static SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG, // Use kshiftr instruction to move to the lower element. Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideVecVT, Vec, - DAG.getConstant(IdxVal, dl, MVT::i8)); + DAG.getTargetConstant(IdxVal, dl, MVT::i8)); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec, DAG.getIntPtrConstant(0, dl)); @@ -17137,8 +17589,8 @@ SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, if ((Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) || (Subtarget.hasAVX2() && EltVT == MVT::i32)) { SDValue N1Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1); - N2 = DAG.getIntPtrConstant(1, dl); - return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1Vec, N2); + return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1Vec, + DAG.getTargetConstant(1, dl, MVT::i8)); } } @@ -17207,14 +17659,14 @@ SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, // But if optimizing for size and there's a load folding opportunity, // generate insertps because blendps does not have a 32-bit memory // operand form. - N2 = DAG.getIntPtrConstant(1, dl); N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1); - return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1, N2); + return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1, + DAG.getTargetConstant(1, dl, MVT::i8)); } - N2 = DAG.getIntPtrConstant(IdxVal << 4, dl); // Create this as a scalar to vector.. N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1); - return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2); + return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, + DAG.getTargetConstant(IdxVal << 4, dl, MVT::i8)); } // PINSR* works with constant index. @@ -17300,7 +17752,7 @@ static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget, // Shift to the LSB. Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideVecVT, Vec, - DAG.getConstant(IdxVal, dl, MVT::i8)); + DAG.getTargetConstant(IdxVal, dl, MVT::i8)); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, Op.getValueType(), Vec, DAG.getIntPtrConstant(0, dl)); @@ -17841,10 +18293,10 @@ static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget, std::swap(Op0, Op1); APInt APIntShiftAmt; - if (isConstantSplat(Amt, APIntShiftAmt)) { + if (X86::isConstantSplat(Amt, APIntShiftAmt)) { uint64_t ShiftAmt = APIntShiftAmt.urem(VT.getScalarSizeInBits()); - return DAG.getNode(IsFSHR ? X86ISD::VSHRD : X86ISD::VSHLD, DL, VT, - Op0, Op1, DAG.getConstant(ShiftAmt, DL, MVT::i8)); + return DAG.getNode(IsFSHR ? X86ISD::VSHRD : X86ISD::VSHLD, DL, VT, Op0, + Op1, DAG.getTargetConstant(ShiftAmt, DL, MVT::i8)); } return DAG.getNode(IsFSHR ? 
X86ISD::VSHRDV : X86ISD::VSHLDV, DL, VT, @@ -17970,6 +18422,9 @@ SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op, MVT VT = Op.getSimpleValueType(); SDLoc dl(Op); + if (VT == MVT::f128) + return LowerF128Call(Op, DAG, RTLIB::getSINTTOFP(SrcVT, VT)); + if (SDValue Extract = vectorizeExtractedCast(Op, DAG, Subtarget)) return Extract; @@ -18072,6 +18527,16 @@ SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain, return Result; } +/// Horizontal vector math instructions may be slower than normal math with +/// shuffles. Limit horizontal op codegen based on size/speed trade-offs, uarch +/// implementation, and likely shuffle complexity of the alternate sequence. +static bool shouldUseHorizontalOp(bool IsSingleSource, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { + bool IsOptimizingSize = DAG.getMachineFunction().getFunction().hasOptSize(); + bool HasFastHOps = Subtarget.hasFastHorizontalOps(); + return !IsSingleSource || IsOptimizingSize || HasFastHOps; +} + /// 64-bit unsigned integer to double expansion. static SDValue LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget) { @@ -18126,8 +18591,7 @@ static SDValue LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG, SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1); SDValue Result; - if (Subtarget.hasSSE3()) { - // FIXME: The 'haddpd' instruction may be slower than 'shuffle + addsd'. + if (Subtarget.hasSSE3() && shouldUseHorizontalOp(true, DAG, Subtarget)) { Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub); } else { SDValue Shuffle = DAG.getVectorShuffle(MVT::v2f64, dl, Sub, Sub, {1,-1}); @@ -18273,7 +18737,7 @@ static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG, // Low will be bitcasted right away, so do not bother bitcasting back to its // original type. Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast, - VecCstLowBitcast, DAG.getConstant(0xaa, DL, MVT::i32)); + VecCstLowBitcast, DAG.getTargetConstant(0xaa, DL, MVT::i8)); // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16), // (uint4) 0x53000000, 0xaa); SDValue VecCstHighBitcast = DAG.getBitcast(VecI16VT, VecCstHigh); @@ -18281,7 +18745,7 @@ static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG, // High will be bitcasted right away, so do not bother bitcasting back to // its original type. 
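The surrounding lowerUINT_TO_FP_vXi32 hunk assembles the classic magic-constant conversion: the low and high 16-bit halves of each lane are blended into floats whose exponents encode 2^23 and 2^39, and ordinary FP arithmetic then reassembles the value. A scalar sketch of that arithmetic, assuming IEEE-754 binary32 and round-to-nearest; the bias constant 0x53000080 used below encodes 2^39 + 2^23:

  #include <cassert>
  #include <cstdint>
  #include <cstring>

  static float bitsToFloat(uint32_t Bits) {
    float F;
    std::memcpy(&F, &Bits, sizeof(F)); // bit-cast, no numeric conversion
    return F;
  }

  // Build two floats whose mantissas hold the low and high halves of V, then
  // let FP arithmetic reassemble (and correctly round) the value.
  static float uint32ToFloatViaMagic(uint32_t V) {
    float Lo = bitsToFloat((V & 0xFFFFu) | 0x4B000000u); // 2^23 + low16
    float Hi = bitsToFloat((V >> 16)     | 0x53000000u); // 2^39 + high16 * 2^16
    float Bias = bitsToFloat(0x53000080u);               // 2^39 + 2^23
    return (Hi - Bias) + Lo;                             // only the last add rounds
  }

  int main() {
    for (uint32_t V : {0u, 1u, 0xFFFFu, 0x12345678u, 0x80000000u, 0xFFFFFFFFu})
      assert(uint32ToFloatViaMagic(V) == (float)V);
    return 0;
  }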
High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast, - VecCstHighBitcast, DAG.getConstant(0xaa, DL, MVT::i32)); + VecCstHighBitcast, DAG.getTargetConstant(0xaa, DL, MVT::i8)); } else { SDValue VecCstMask = DAG.getConstant(0xffff, DL, VecIntVT); // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000; @@ -18329,16 +18793,18 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, SDValue N0 = Op.getOperand(0); SDLoc dl(Op); auto PtrVT = getPointerTy(DAG.getDataLayout()); + MVT SrcVT = N0.getSimpleValueType(); + MVT DstVT = Op.getSimpleValueType(); + + if (DstVT == MVT::f128) + return LowerF128Call(Op, DAG, RTLIB::getUINTTOFP(SrcVT, DstVT)); - if (Op.getSimpleValueType().isVector()) + if (DstVT.isVector()) return lowerUINT_TO_FP_vec(Op, DAG, Subtarget); if (SDValue Extract = vectorizeExtractedCast(Op, DAG, Subtarget)) return Extract; - MVT SrcVT = N0.getSimpleValueType(); - MVT DstVT = Op.getSimpleValueType(); - if (Subtarget.hasAVX512() && isScalarFPTypeInSSEReg(DstVT) && (SrcVT == MVT::i32 || (SrcVT == MVT::i64 && Subtarget.is64Bit()))) { // Conversions from unsigned i32 to f32/f64 are legal, @@ -18346,6 +18812,12 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, return Op; } + // Promote i32 to i64 and use a signed conversion on 64-bit targets. + if (SrcVT == MVT::i32 && Subtarget.is64Bit()) { + N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, N0); + return DAG.getNode(ISD::SINT_TO_FP, dl, DstVT, N0); + } + if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, DAG, Subtarget)) return V; @@ -18579,7 +19051,7 @@ static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG, // Custom legalize v8i8->v8i64 on CPUs without avx512bw. if (InVT == MVT::v8i8) { - if (!ExperimentalVectorWideningLegalization || VT != MVT::v8i64) + if (VT != MVT::v8i64) return SDValue(); In = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), @@ -18602,10 +19074,7 @@ static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG, // Use vpunpckhdq for 4 upper elements v4i32 -> v2i64. // Concat upper and lower parts. // - - MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(), - VT.getVectorNumElements() / 2); - + MVT HalfVT = VT.getHalfNumVectorElementsVT(); SDValue OpLo = DAG.getNode(ExtendInVecOpc, dl, HalfVT, In); // Short-circuit if we can determine that each 128-bit half is the same value. @@ -18903,9 +19372,29 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const { assert(VT.getVectorNumElements() == InVT.getVectorNumElements() && "Invalid TRUNCATE operation"); - // If called by the legalizer just return. - if (!DAG.getTargetLoweringInfo().isTypeLegal(InVT)) + // If we're called by the type legalizer, handle a few cases. + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + if (!TLI.isTypeLegal(InVT)) { + if ((InVT == MVT::v8i64 || InVT == MVT::v16i32 || InVT == MVT::v16i64) && + VT.is128BitVector()) { + assert(Subtarget.hasVLX() && "Unexpected subtarget!"); + // The default behavior is to truncate one step, concatenate, and then + // truncate the remainder. We'd rather produce two 64-bit results and + // concatenate those. + SDValue Lo, Hi; + std::tie(Lo, Hi) = DAG.SplitVector(In, DL); + + EVT LoVT, HiVT; + std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT); + + Lo = DAG.getNode(ISD::TRUNCATE, DL, LoVT, Lo); + Hi = DAG.getNode(ISD::TRUNCATE, DL, HiVT, Hi); + return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi); + } + + // Otherwise let default legalization handle it. 
return SDValue(); + } if (VT.getVectorElementType() == MVT::i1) return LowerTruncateVecI1(Op, DAG, Subtarget); @@ -18940,6 +19429,9 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const { truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget)) return V; + // Handle truncation of V256 to V128 using shuffles. + assert(VT.is128BitVector() && InVT.is256BitVector() && "Unexpected types!"); + if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) { // On AVX2, v4i64 -> v4i32 becomes VPERMD. if (Subtarget.hasInt256()) { @@ -19016,22 +19508,7 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const { return DAG.getNode(X86ISD::PACKUS, DL, VT, InLo, InHi); } - // Handle truncation of V256 to V128 using shuffles. - assert(VT.is128BitVector() && InVT.is256BitVector() && "Unexpected types!"); - - assert(Subtarget.hasAVX() && "256-bit vector without AVX!"); - - unsigned NumElems = VT.getVectorNumElements(); - MVT NVT = MVT::getVectorVT(VT.getVectorElementType(), NumElems * 2); - - SmallVector MaskVec(NumElems * 2, -1); - // Prepare truncation shuffle mask - for (unsigned i = 0; i != NumElems; ++i) - MaskVec[i] = i * 2; - In = DAG.getBitcast(NVT, In); - SDValue V = DAG.getVectorShuffle(NVT, DL, In, In, MaskVec); - return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V, - DAG.getIntPtrConstant(0, DL)); + llvm_unreachable("All 256->128 cases should have been handled above!"); } SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const { @@ -19041,6 +19518,17 @@ SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const { MVT SrcVT = Src.getSimpleValueType(); SDLoc dl(Op); + if (SrcVT == MVT::f128) { + RTLIB::Libcall LC; + if (Op.getOpcode() == ISD::FP_TO_SINT) + LC = RTLIB::getFPTOSINT(SrcVT, VT); + else + LC = RTLIB::getFPTOUINT(SrcVT, VT); + + MakeLibCallOptions CallOptions; + return makeLibCall(DAG, LC, VT, Src, CallOptions, SDLoc(Op)).first; + } + if (VT.isVector()) { if (VT == MVT::v2i1 && SrcVT == MVT::v2f64) { MVT ResVT = MVT::v4i32; @@ -19075,14 +19563,27 @@ SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const { bool UseSSEReg = isScalarFPTypeInSSEReg(SrcVT); - if (!IsSigned && Subtarget.hasAVX512()) { - // Conversions from f32/f64 should be legal. - if (UseSSEReg) + if (!IsSigned && UseSSEReg) { + // Conversions from f32/f64 with AVX512 should be legal. + if (Subtarget.hasAVX512()) return Op; - // Use default expansion. + // Use default expansion for i64. if (VT == MVT::i64) return SDValue(); + + assert(VT == MVT::i32 && "Unexpected VT!"); + + // Promote i32 to i64 and use a signed operation on 64-bit targets. + if (Subtarget.is64Bit()) { + SDValue Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i64, Src); + return DAG.getNode(ISD::TRUNCATE, dl, VT, Res); + } + + // Use default expansion for SSE1/2 targets without SSE3. With SSE3 we can + // use fisttp which will be handled later. + if (!Subtarget.hasSSE3()) + return SDValue(); } // Promote i16 to i32 if we can use a SSE operation. 
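Both i32 promotions above lean on the same observation: every uint32 value fits in a signed 64-bit integer, so a 64-bit target can reuse the signed conversions. A scalar sketch of the two directions:

  #include <cassert>
  #include <cstdint>

  // FP_TO_UINT i32: use the signed 64-bit conversion, then truncate.
  static uint32_t floatToUint32ViaSigned64(float F) {
    return (uint32_t)(int64_t)F;         // cvttss2si to a 64-bit register + truncation
  }

  // UINT_TO_FP i32: zero-extend to 64 bits, then use the signed conversion.
  static double uint32ToDoubleViaSigned64(uint32_t V) {
    return (double)(int64_t)(uint64_t)V; // cvtsi2sd on the zero-extended value
  }

  int main() {
    assert(floatToUint32ViaSigned64(3000000000.0f) == 3000000000u); // > INT32_MAX
    assert(floatToUint32ViaSigned64(4294967040.0f) == 4294967040u); // largest float < 2^32
    assert(uint32ToDoubleViaSigned64(0xFFFFFFFFu) == 4294967295.0);
    assert(uint32ToDoubleViaSigned64(0x80000000u) == 2147483648.0);
    return 0;
  }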
@@ -19103,12 +19604,17 @@ SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const { llvm_unreachable("Expected FP_TO_INTHelper to handle all remaining cases."); } -static SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) { +SDValue X86TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); MVT VT = Op.getSimpleValueType(); SDValue In = Op.getOperand(0); MVT SVT = In.getSimpleValueType(); + if (VT == MVT::f128) { + RTLIB::Libcall LC = RTLIB::getFPEXT(SVT, VT); + return LowerF128Call(Op, DAG, LC); + } + assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!"); return DAG.getNode(X86ISD::VFPEXT, DL, VT, @@ -19116,14 +19622,31 @@ static SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) { In, DAG.getUNDEF(SVT))); } -/// Horizontal vector math instructions may be slower than normal math with -/// shuffles. Limit horizontal op codegen based on size/speed trade-offs, uarch -/// implementation, and likely shuffle complexity of the alternate sequence. -static bool shouldUseHorizontalOp(bool IsSingleSource, SelectionDAG &DAG, - const X86Subtarget &Subtarget) { - bool IsOptimizingSize = DAG.getMachineFunction().getFunction().hasOptSize(); - bool HasFastHOps = Subtarget.hasFastHorizontalOps(); - return !IsSingleSource || IsOptimizingSize || HasFastHOps; +SDValue X86TargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const { + MVT VT = Op.getSimpleValueType(); + SDValue In = Op.getOperand(0); + MVT SVT = In.getSimpleValueType(); + + // It's legal except when f128 is involved + if (SVT != MVT::f128) + return Op; + + RTLIB::Libcall LC = RTLIB::getFPROUND(SVT, VT); + + // FP_ROUND node has a second operand indicating whether it is known to be + // precise. That doesn't take part in the LibCall so we can't directly use + // LowerF128Call. + MakeLibCallOptions CallOptions; + return makeLibCall(DAG, LC, VT, In, CallOptions, SDLoc(Op)).first; +} + +// FIXME: This is a hack to allow FP_ROUND to be marked Custom without breaking +// the default expansion of STRICT_FP_ROUND. +static SDValue LowerSTRICT_FP_ROUND(SDValue Op, SelectionDAG &DAG) { + // FIXME: Need to form a libcall with an input chain for f128. + assert(Op.getOperand(0).getValueType() != MVT::f128 && + "Don't know how to handle f128 yet!"); + return Op; } /// Depending on uarch and/or optimizing for size, we might prefer to use a @@ -19200,8 +19723,13 @@ static SDValue lowerAddSubToHorizontalOp(SDValue Op, SelectionDAG &DAG, /// Depending on uarch and/or optimizing for size, we might prefer to use a /// vector operation in place of the typical scalar operation. -static SDValue lowerFaddFsub(SDValue Op, SelectionDAG &DAG, - const X86Subtarget &Subtarget) { +SDValue X86TargetLowering::lowerFaddFsub(SDValue Op, SelectionDAG &DAG) const { + if (Op.getValueType() == MVT::f128) { + RTLIB::Libcall LC = Op.getOpcode() == ISD::FADD ? 
RTLIB::ADD_F128 + : RTLIB::SUB_F128; + return LowerF128Call(Op, DAG, LC); + } + assert((Op.getValueType() == MVT::f32 || Op.getValueType() == MVT::f64) && "Only expecting float/double"); return lowerAddSubToHorizontalOp(Op, DAG, Subtarget); @@ -19358,13 +19886,13 @@ static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) { static SDValue getSETCC(X86::CondCode Cond, SDValue EFLAGS, const SDLoc &dl, SelectionDAG &DAG) { return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, - DAG.getConstant(Cond, dl, MVT::i8), EFLAGS); + DAG.getTargetConstant(Cond, dl, MVT::i8), EFLAGS); } /// Helper for matching OR(EXTRACTELT(X,0),OR(EXTRACTELT(X,1),...)) /// style scalarized (associative) reduction patterns. -static bool matchBitOpReduction(SDValue Op, ISD::NodeType BinOp, - SmallVectorImpl &SrcOps) { +static bool matchScalarReduction(SDValue Op, ISD::NodeType BinOp, + SmallVectorImpl &SrcOps) { SmallVector Opnds; DenseMap SrcOpMap; EVT VT = MVT::Other; @@ -19437,7 +19965,7 @@ static SDValue LowerVectorAllZeroTest(SDValue Op, ISD::CondCode CC, return SDValue(); SmallVector VecIns; - if (!matchBitOpReduction(Op, ISD::OR, VecIns)) + if (!matchScalarReduction(Op, ISD::OR, VecIns)) return SDValue(); // Quit if not 128/256-bit vector. @@ -19461,8 +19989,8 @@ static SDValue LowerVectorAllZeroTest(SDValue Op, ISD::CondCode CC, VecIns.push_back(DAG.getNode(ISD::OR, DL, TestVT, LHS, RHS)); } - X86CC = DAG.getConstant(CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE, DL, - MVT::i8); + X86CC = DAG.getTargetConstant(CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE, + DL, MVT::i8); return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, VecIns.back(), VecIns.back()); } @@ -19576,6 +20104,13 @@ static SDValue EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl, case X86ISD::XOR: case X86ISD::AND: return SDValue(Op.getNode(), 1); + case ISD::SSUBO: + case ISD::USUBO: { + // /USUBO/SSUBO will become a X86ISD::SUB and we can use its Z flag. + SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); + return DAG.getNode(X86ISD::SUB, dl, VTs, Op->getOperand(0), + Op->getOperand(1)).getValue(1); + } default: default_case: break; @@ -19766,6 +20301,63 @@ unsigned X86TargetLowering::combineRepeatedFPDivisors() const { return 2; } +SDValue +X86TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor, + SelectionDAG &DAG, + SmallVectorImpl &Created) const { + AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes(); + if (isIntDivCheap(N->getValueType(0), Attr)) + return SDValue(N,0); // Lower SDIV as SDIV + + assert((Divisor.isPowerOf2() || (-Divisor).isPowerOf2()) && + "Unexpected divisor!"); + + // Only perform this transform if CMOV is supported otherwise the select + // below will become a branch. + if (!Subtarget.hasCMov()) + return SDValue(); + + // fold (sdiv X, pow2) + EVT VT = N->getValueType(0); + // FIXME: Support i8. + if (VT != MVT::i16 && VT != MVT::i32 && + !(Subtarget.is64Bit() && VT == MVT::i64)) + return SDValue(); + + unsigned Lg2 = Divisor.countTrailingZeros(); + + // If the divisor is 2 or -2, the default expansion is better. + if (Lg2 == 1) + return SDValue(); + + SDLoc DL(N); + SDValue N0 = N->getOperand(0); + SDValue Zero = DAG.getConstant(0, DL, VT); + APInt Lg2Mask = APInt::getLowBitsSet(VT.getSizeInBits(), Lg2); + SDValue Pow2MinusOne = DAG.getConstant(Lg2Mask, DL, VT); + + // If N0 is negative, we need to add (Pow2 - 1) to it before shifting right. 
+ SDValue Cmp = DAG.getSetCC(DL, MVT::i8, N0, Zero, ISD::SETLT); + SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N0, Pow2MinusOne); + SDValue CMov = DAG.getNode(ISD::SELECT, DL, VT, Cmp, Add, N0); + + Created.push_back(Cmp.getNode()); + Created.push_back(Add.getNode()); + Created.push_back(CMov.getNode()); + + // Divide by pow2. + SDValue SRA = + DAG.getNode(ISD::SRA, DL, VT, CMov, DAG.getConstant(Lg2, DL, MVT::i64)); + + // If we're dividing by a positive value, we're done. Otherwise, we must + // negate the result. + if (Divisor.isNonNegative()) + return SRA; + + Created.push_back(SRA.getNode()); + return DAG.getNode(ISD::SUB, DL, VT, Zero, SRA); +} + /// Result of 'and' is compared against zero. Change to a BT node if possible. /// Returns the BT node and the condition code needed to use it. static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC, @@ -19842,8 +20434,8 @@ static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC, if (Src.getValueType() != BitNo.getValueType()) BitNo = DAG.getNode(ISD::ANY_EXTEND, dl, Src.getValueType(), BitNo); - X86CC = DAG.getConstant(CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B, - dl, MVT::i8); + X86CC = DAG.getTargetConstant(CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B, + dl, MVT::i8); return DAG.getNode(X86ISD::BT, dl, MVT::i32, Src, BitNo); } @@ -19935,13 +20527,6 @@ static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) { ISD::CondCode SetCCOpcode = cast(CC)->get(); - // If this is a seteq make sure any build vectors of all zeros are on the RHS. - // This helps with vptestm matching. - // TODO: Should we just canonicalize the setcc during DAG combine? - if ((SetCCOpcode == ISD::SETEQ || SetCCOpcode == ISD::SETNE) && - ISD::isBuildVectorAllZeros(Op0.getNode())) - std::swap(Op0, Op1); - // Prefer SETGT over SETLT. if (SetCCOpcode == ISD::SETLT) { SetCCOpcode = ISD::getSetCCSwappedOperands(SetCCOpcode); @@ -20007,7 +20592,7 @@ static SDValue LowerVSETCCWithSUBUS(SDValue Op0, SDValue Op1, MVT VT, // Only do this pre-AVX since vpcmp* is no longer destructive. if (Subtarget.hasAVX()) return SDValue(); - SDValue ULEOp1 = incDecVectorConstant(Op1, DAG, false); + SDValue ULEOp1 = incDecVectorConstant(Op1, DAG, /*IsInc*/false); if (!ULEOp1) return SDValue(); Op1 = ULEOp1; @@ -20018,7 +20603,7 @@ static SDValue LowerVSETCCWithSUBUS(SDValue Op0, SDValue Op1, MVT VT, // This is beneficial because materializing a constant 0 for the PCMPEQ is // probably cheaper than XOR+PCMPGT using 2 different vector constants: // cmpgt (xor X, SignMaskC) CmpC --> cmpeq (usubsat (CmpC+1), X), 0 - SDValue UGEOp1 = incDecVectorConstant(Op1, DAG, true); + SDValue UGEOp1 = incDecVectorConstant(Op1, DAG, /*IsInc*/true); if (!UGEOp1) return SDValue(); Op1 = Op0; @@ -20086,14 +20671,14 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget, } SDValue Cmp0 = DAG.getNode(Opc, dl, VT, Op0, Op1, - DAG.getConstant(CC0, dl, MVT::i8)); + DAG.getTargetConstant(CC0, dl, MVT::i8)); SDValue Cmp1 = DAG.getNode(Opc, dl, VT, Op0, Op1, - DAG.getConstant(CC1, dl, MVT::i8)); + DAG.getTargetConstant(CC1, dl, MVT::i8)); Cmp = DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1); } else { // Handle all other FP comparisons here. 
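The BuildSDIVPow2 hunk above emits SETL + CMOV + SAR to divide by plus or minus a power of two without a branch. A scalar sketch of the computation, assuming an arithmetic right shift on signed int (what the emitted SAR provides; guaranteed in C++ only from C++20 onward):

  #include <cassert>

  // Signed division by +/-2^Lg2: add (2^Lg2 - 1) when the dividend is
  // negative, shift right arithmetically, negate for a negative divisor.
  static int sdivByPow2(int N, int Lg2, bool NegativeDivisor) {
    int Bias = (1 << Lg2) - 1;
    int Adjusted = N < 0 ? N + Bias : N;  // the CMOV in the lowered sequence
    int Quotient = Adjusted >> Lg2;       // SAR; the bias makes this round toward zero
    return NegativeDivisor ? -Quotient : Quotient;
  }

  int main() {
    for (int N : {-17, -16, -1, 0, 1, 15, 16, 17}) {
      assert(sdivByPow2(N, 4, false) == N / 16);
      assert(sdivByPow2(N, 4, true) == N / -16);
    }
    return 0;
  }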
Cmp = DAG.getNode(Opc, dl, VT, Op0, Op1, - DAG.getConstant(SSECC, dl, MVT::i8)); + DAG.getTargetConstant(SSECC, dl, MVT::i8)); } // If this is SSE/AVX CMPP, bitcast the result back to integer to match the @@ -20106,16 +20691,12 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget, } MVT VTOp0 = Op0.getSimpleValueType(); + (void)VTOp0; assert(VTOp0 == Op1.getSimpleValueType() && "Expected operands with same type!"); assert(VT.getVectorNumElements() == VTOp0.getVectorNumElements() && "Invalid number of packed elements for source and destination!"); - // This is being called by type legalization because v2i32 is marked custom - // for result type legalization for v2f32. - if (VTOp0 == MVT::v2i32) - return SDValue(); - // The non-AVX512 code below works under the assumption that source and // destination types are the same. assert((Subtarget.hasAVX512() || (VT == VTOp0)) && @@ -20153,7 +20734,7 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget, ISD::isUnsignedIntSetCC(Cond) ? X86ISD::VPCOMU : X86ISD::VPCOM; return DAG.getNode(Opc, dl, VT, Op0, Op1, - DAG.getConstant(CmpMode, dl, MVT::i8)); + DAG.getTargetConstant(CmpMode, dl, MVT::i8)); } // (X & Y) != 0 --> (X & Y) == Y iff Y is power-of-2. @@ -20222,21 +20803,19 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget, TLI.isOperationLegal(ISD::UMIN, VT)) { // If we have a constant operand, increment/decrement it and change the // condition to avoid an invert. - if (Cond == ISD::SETUGT && - ISD::matchUnaryPredicate(Op1, [](ConstantSDNode *C) { - return !C->getAPIntValue().isMaxValue(); - })) { + if (Cond == ISD::SETUGT) { // X > C --> X >= (C+1) --> X == umax(X, C+1) - Op1 = DAG.getNode(ISD::ADD, dl, VT, Op1, DAG.getConstant(1, dl, VT)); - Cond = ISD::SETUGE; + if (SDValue UGTOp1 = incDecVectorConstant(Op1, DAG, /*IsInc*/true)) { + Op1 = UGTOp1; + Cond = ISD::SETUGE; + } } - if (Cond == ISD::SETULT && - ISD::matchUnaryPredicate(Op1, [](ConstantSDNode *C) { - return !C->getAPIntValue().isNullValue(); - })) { + if (Cond == ISD::SETULT) { // X < C --> X <= (C-1) --> X == umin(X, C-1) - Op1 = DAG.getNode(ISD::SUB, dl, VT, Op1, DAG.getConstant(1, dl, VT)); - Cond = ISD::SETULE; + if (SDValue ULTOp1 = incDecVectorConstant(Op1, DAG, /*IsInc*/false)) { + Op1 = ULTOp1; + Cond = ISD::SETULE; + } } bool Invert = false; unsigned Opc; @@ -20360,11 +20939,11 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget, return Result; } -// Try to select this as a KORTEST+SETCC if possible. -static SDValue EmitKORTEST(SDValue Op0, SDValue Op1, ISD::CondCode CC, - const SDLoc &dl, SelectionDAG &DAG, - const X86Subtarget &Subtarget, - SDValue &X86CC) { +// Try to select this as a KORTEST+SETCC or KTEST+SETCC if possible. +static SDValue EmitAVX512Test(SDValue Op0, SDValue Op1, ISD::CondCode CC, + const SDLoc &dl, SelectionDAG &DAG, + const X86Subtarget &Subtarget, + SDValue &X86CC) { // Only support equality comparisons. if (CC != ISD::SETEQ && CC != ISD::SETNE) return SDValue(); @@ -20389,6 +20968,21 @@ static SDValue EmitKORTEST(SDValue Op0, SDValue Op1, ISD::CondCode CC, } else return SDValue(); + // If the input is an AND, we can combine it's operands into the KTEST. 
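The LowerVSETCC rewrite above turns unsigned compares against a constant into an equality against a UMIN/UMAX result, bumping the constant by one so no inversion is needed. A scalar sketch over unsigned bytes, assuming the constant does not wrap (the case incDecVectorConstant guards against):

  #include <algorithm>
  #include <cassert>
  #include <cstdint>

  // x u>  c  <=>  x u>= c+1  <=>  max(x, c+1) == x   (requires c < 255)
  static bool ugtViaUmax(uint8_t X, uint8_t C) {
    return std::max<uint8_t>(X, (uint8_t)(C + 1)) == X;
  }

  // x u<  c  <=>  x u<= c-1  <=>  min(x, c-1) == x   (requires c > 0)
  static bool ultViaUmin(uint8_t X, uint8_t C) {
    return std::min<uint8_t>(X, (uint8_t)(C - 1)) == X;
  }

  int main() {
    for (unsigned X = 0; X < 256; ++X) {
      for (unsigned C : {1u, 7u, 128u, 254u}) {   // constants where no wrap occurs
        assert(ugtViaUmax((uint8_t)X, (uint8_t)C) == (X > C));
        assert(ultViaUmin((uint8_t)X, (uint8_t)C) == (X < C));
      }
    }
    return 0;
  }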
+ bool KTestable = false; + if (Subtarget.hasDQI() && (VT == MVT::v8i1 || VT == MVT::v16i1)) + KTestable = true; + if (Subtarget.hasBWI() && (VT == MVT::v32i1 || VT == MVT::v64i1)) + KTestable = true; + if (!isNullConstant(Op1)) + KTestable = false; + if (KTestable && Op0.getOpcode() == ISD::AND && Op0.hasOneUse()) { + SDValue LHS = Op0.getOperand(0); + SDValue RHS = Op0.getOperand(1); + X86CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8); + return DAG.getNode(X86ISD::KTEST, dl, MVT::i32, LHS, RHS); + } + // If the input is an OR, we can combine it's operands into the KORTEST. SDValue LHS = Op0; SDValue RHS = Op0; @@ -20397,7 +20991,7 @@ static SDValue EmitKORTEST(SDValue Op0, SDValue Op1, ISD::CondCode CC, RHS = Op0.getOperand(1); } - X86CC = DAG.getConstant(X86Cond, dl, MVT::i8); + X86CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8); return DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS); } @@ -20425,9 +21019,9 @@ SDValue X86TargetLowering::emitFlagsForSetcc(SDValue Op0, SDValue Op1, return PTEST; } - // Try to lower using KORTEST. - if (SDValue KORTEST = EmitKORTEST(Op0, Op1, CC, dl, DAG, Subtarget, X86CC)) - return KORTEST; + // Try to lower using KORTEST or KTEST. + if (SDValue Test = EmitAVX512Test(Op0, Op1, CC, dl, DAG, Subtarget, X86CC)) + return Test; // Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms of // these. @@ -20442,7 +21036,7 @@ SDValue X86TargetLowering::emitFlagsForSetcc(SDValue Op0, SDValue Op1, if (Invert) { X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0); CCode = X86::GetOppositeBranchCondition(CCode); - X86CC = DAG.getConstant(CCode, dl, MVT::i8); + X86CC = DAG.getTargetConstant(CCode, dl, MVT::i8); } return Op0.getOperand(1); @@ -20456,7 +21050,7 @@ SDValue X86TargetLowering::emitFlagsForSetcc(SDValue Op0, SDValue Op1, SDValue EFLAGS = EmitCmp(Op0, Op1, CondCode, dl, DAG); EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG); - X86CC = DAG.getConstant(CondCode, dl, MVT::i8); + X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8); return EFLAGS; } @@ -20472,6 +21066,19 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); ISD::CondCode CC = cast(Op.getOperand(2))->get(); + // Handle f128 first, since one possible outcome is a normal integer + // comparison which gets handled by emitFlagsForSetcc. + if (Op0.getValueType() == MVT::f128) { + softenSetCCOperands(DAG, MVT::f128, Op0, Op1, CC, dl, Op0, Op1); + + // If softenSetCCOperands returned a scalar, use it. 
+ if (!Op1.getNode()) { + assert(Op0.getValueType() == Op.getValueType() && + "Unexpected setcc expansion!"); + return Op0; + } + } + SDValue X86CC; SDValue EFLAGS = emitFlagsForSetcc(Op0, Op1, CC, dl, DAG, X86CC); if (!EFLAGS) @@ -20612,15 +21219,16 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { cast(Cond.getOperand(2))->get(), CondOp0, CondOp1); if (Subtarget.hasAVX512()) { - SDValue Cmp = DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CondOp0, - CondOp1, DAG.getConstant(SSECC, DL, MVT::i8)); + SDValue Cmp = + DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CondOp0, CondOp1, + DAG.getTargetConstant(SSECC, DL, MVT::i8)); assert(!VT.isVector() && "Not a scalar type?"); return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2); } if (SSECC < 8 || Subtarget.hasAVX()) { SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1, - DAG.getConstant(SSECC, DL, MVT::i8)); + DAG.getTargetConstant(SSECC, DL, MVT::i8)); // If we have AVX, we can use a variable vector select (VBLENDV) instead // of 3 logic instructions for size savings and potentially speed. @@ -20718,8 +21326,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { Cond.getOperand(1).getOpcode() == X86ISD::CMP && isNullConstant(Cond.getOperand(1).getOperand(1))) { SDValue Cmp = Cond.getOperand(1); - unsigned CondCode = - cast(Cond.getOperand(0))->getZExtValue(); + unsigned CondCode = Cond.getConstantOperandVal(0); if ((isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) && (CondCode == X86::COND_E || CondCode == X86::COND_NE)) { @@ -20807,8 +21414,6 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { CC = Cond.getOperand(0); SDValue Cmp = Cond.getOperand(1); - MVT VT = Op.getSimpleValueType(); - bool IllegalFPCMov = false; if (VT.isFloatingPoint() && !VT.isVector() && !isScalarFPTypeInSSEReg(VT)) // FPStack? @@ -20826,7 +21431,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { X86::CondCode X86Cond; std::tie(Value, Cond) = getX86XALUOOp(X86Cond, Cond.getValue(0), DAG); - CC = DAG.getConstant(X86Cond, DL, MVT::i8); + CC = DAG.getTargetConstant(X86Cond, DL, MVT::i8); AddTest = false; } @@ -20848,7 +21453,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { } if (AddTest) { - CC = DAG.getConstant(X86::COND_NE, DL, MVT::i8); + CC = DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8); Cond = EmitCmp(Cond, DAG.getConstant(0, DL, Cond.getValueType()), X86::COND_NE, DL, DAG); } @@ -20864,9 +21469,9 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) && (isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) && (isNullConstant(Op1) || isNullConstant(Op2))) { - SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(), - DAG.getConstant(X86::COND_B, DL, MVT::i8), - Cond); + SDValue Res = + DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(), + DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), Cond); if (isAllOnesConstant(Op1) != (CondCode == X86::COND_B)) return DAG.getNOT(DL, Res, Res.getValueType()); return Res; @@ -21037,8 +21642,8 @@ static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op, // pre-AVX2 256-bit extensions need to be split into 128-bit instructions. 
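The X86ISD::SETCC_CARRY + COND_B pattern used in the LowerSELECT hunk above models the x86 idiom "sbb r, r", which broadcasts the carry flag to every bit. A scalar sketch of why that covers select(cond, -1, 0) and, with a NOT, select(cond, 0, -1):

  #include <cassert>
  #include <cstdint>

  // "sbb r, r" computes r - r - CF, i.e. 0 or all-ones, without a branch.
  static uint32_t allOnesIfCarry(bool Carry) {
    return (uint32_t)0 - (uint32_t)Carry;   // 0x00000000 or 0xFFFFFFFF
  }

  int main() {
    assert(allOnesIfCarry(true)  == 0xFFFFFFFFu);
    assert(allOnesIfCarry(false) == 0u);
    assert(~allOnesIfCarry(true) == 0u);    // the inverted-select case handled via getNOT
    return 0;
  }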
if (Subtarget.hasAVX()) { assert(VT.is256BitVector() && "256-bit vector expected"); - int HalfNumElts = NumElts / 2; - MVT HalfVT = MVT::getVectorVT(SVT, HalfNumElts); + MVT HalfVT = VT.getHalfNumVectorElementsVT(); + int HalfNumElts = HalfVT.getVectorNumElements(); unsigned NumSrcElts = InVT.getVectorNumElements(); SmallVector HiMask(NumSrcElts, SM_SentinelUndef); @@ -21081,7 +21686,7 @@ static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op, unsigned SignExtShift = DestWidth - InSVT.getSizeInBits(); SignExt = DAG.getNode(X86ISD::VSRAI, dl, DestVT, Curr, - DAG.getConstant(SignExtShift, dl, MVT::i8)); + DAG.getTargetConstant(SignExtShift, dl, MVT::i8)); } if (VT == MVT::v2i64) { @@ -21119,7 +21724,7 @@ static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget, // Custom legalize v8i8->v8i64 on CPUs without avx512bw. if (InVT == MVT::v8i8) { - if (!ExperimentalVectorWideningLegalization || VT != MVT::v8i64) + if (VT != MVT::v8i64) return SDValue(); In = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), @@ -21138,10 +21743,7 @@ static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget, // for v4i32 the high shuffle mask will be {2, 3, -1, -1} // use vpmovsx instruction to extend v4i32 -> v2i64; v8i16 -> v4i32 // concat the vectors to original VT - - MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(), - VT.getVectorNumElements() / 2); - + MVT HalfVT = VT.getHalfNumVectorElementsVT(); SDValue OpLo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, HalfVT, In); unsigned NumElems = InVT.getVectorNumElements(); @@ -21165,7 +21767,7 @@ static SDValue splitVectorStore(StoreSDNode *Store, SelectionDAG &DAG) { // Splitting volatile memory ops is not allowed unless the operation was not // legal to begin with. We are assuming the input op is legal (this transform // is only used for targets with AVX). - if (Store->isVolatile()) + if (!Store->isSimple()) return SDValue(); MVT StoreVT = StoredVal.getSimpleValueType(); @@ -21201,7 +21803,7 @@ static SDValue scalarizeVectorStore(StoreSDNode *Store, MVT StoreVT, // Splitting volatile memory ops is not allowed unless the operation was not // legal to begin with. We are assuming the input op is legal (this transform // is only used for targets with AVX). - if (Store->isVolatile()) + if (!Store->isSimple()) return SDValue(); MVT StoreSVT = StoreVT.getScalarType(); @@ -21266,14 +21868,13 @@ static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget, return SDValue(); } + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); assert(StoreVT.isVector() && StoreVT.getSizeInBits() == 64 && "Unexpected VT"); - if (DAG.getTargetLoweringInfo().getTypeAction(*DAG.getContext(), StoreVT) != - TargetLowering::TypeWidenVector) - return SDValue(); + assert(TLI.getTypeAction(*DAG.getContext(), StoreVT) == + TargetLowering::TypeWidenVector && "Unexpected type action!"); - MVT WideVT = MVT::getVectorVT(StoreVT.getVectorElementType(), - StoreVT.getVectorNumElements() * 2); + EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), StoreVT); StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, StoredVal, DAG.getUNDEF(StoreVT)); @@ -21313,11 +21914,10 @@ static SDValue LowerLoad(SDValue Op, const X86Subtarget &Subtarget, LoadSDNode *Ld = cast(Op.getNode()); SDLoc dl(Ld); - EVT MemVT = Ld->getMemoryVT(); // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 loads. 
if (RegVT.getVectorElementType() == MVT::i1) { - assert(EVT(RegVT) == MemVT && "Expected non-extending load"); + assert(EVT(RegVT) == Ld->getMemoryVT() && "Expected non-extending load"); assert(RegVT.getVectorNumElements() <= 8 && "Unexpected VT"); assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() && "Expected AVX512F without AVX512DQI"); @@ -21336,176 +21936,7 @@ static SDValue LowerLoad(SDValue Op, const X86Subtarget &Subtarget, return DAG.getMergeValues({Val, NewLd.getValue(1)}, dl); } - // Nothing useful we can do without SSE2 shuffles. - assert(Subtarget.hasSSE2() && "We only custom lower sext loads with SSE2."); - - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - unsigned RegSz = RegVT.getSizeInBits(); - - ISD::LoadExtType Ext = Ld->getExtensionType(); - - assert((Ext == ISD::EXTLOAD || Ext == ISD::SEXTLOAD) - && "Only anyext and sext are currently implemented."); - assert(MemVT != RegVT && "Cannot extend to the same type"); - assert(MemVT.isVector() && "Must load a vector from memory"); - - unsigned NumElems = RegVT.getVectorNumElements(); - unsigned MemSz = MemVT.getSizeInBits(); - assert(RegSz > MemSz && "Register size must be greater than the mem size"); - - if (Ext == ISD::SEXTLOAD && RegSz == 256 && !Subtarget.hasInt256()) { - // The only way in which we have a legal 256-bit vector result but not the - // integer 256-bit operations needed to directly lower a sextload is if we - // have AVX1 but not AVX2. In that case, we can always emit a sextload to - // a 128-bit vector and a normal sign_extend to 256-bits that should get - // correctly legalized. We do this late to allow the canonical form of - // sextload to persist throughout the rest of the DAG combiner -- it wants - // to fold together any extensions it can, and so will fuse a sign_extend - // of an sextload into a sextload targeting a wider value. - SDValue Load; - if (MemSz == 128) { - // Just switch this to a normal load. - assert(TLI.isTypeLegal(MemVT) && "If the memory type is a 128-bit type, " - "it must be a legal 128-bit vector " - "type!"); - Load = DAG.getLoad(MemVT, dl, Ld->getChain(), Ld->getBasePtr(), - Ld->getPointerInfo(), Ld->getAlignment(), - Ld->getMemOperand()->getFlags()); - } else { - assert(MemSz < 128 && - "Can't extend a type wider than 128 bits to a 256 bit vector!"); - // Do an sext load to a 128-bit vector type. We want to use the same - // number of elements, but elements half as wide. This will end up being - // recursively lowered by this routine, but will succeed as we definitely - // have all the necessary features if we're using AVX1. - EVT HalfEltVT = - EVT::getIntegerVT(*DAG.getContext(), RegVT.getScalarSizeInBits() / 2); - EVT HalfVecVT = EVT::getVectorVT(*DAG.getContext(), HalfEltVT, NumElems); - Load = - DAG.getExtLoad(Ext, dl, HalfVecVT, Ld->getChain(), Ld->getBasePtr(), - Ld->getPointerInfo(), MemVT, Ld->getAlignment(), - Ld->getMemOperand()->getFlags()); - } - - // Replace chain users with the new chain. - assert(Load->getNumValues() == 2 && "Loads must carry a chain!"); - - // Finally, do a normal sign-extend to the desired register. - SDValue SExt = DAG.getSExtOrTrunc(Load, dl, RegVT); - return DAG.getMergeValues({SExt, Load.getValue(1)}, dl); - } - - // All sizes must be a power of two. - assert(isPowerOf2_32(RegSz * MemSz * NumElems) && - "Non-power-of-two elements are not custom lowered!"); - - // Attempt to load the original value using scalar loads. - // Find the largest scalar type that divides the total loaded size. 
- MVT SclrLoadTy = MVT::i8; - for (MVT Tp : MVT::integer_valuetypes()) { - if (TLI.isTypeLegal(Tp) && ((MemSz % Tp.getSizeInBits()) == 0)) { - SclrLoadTy = Tp; - } - } - - // On 32bit systems, we can't save 64bit integers. Try bitcasting to F64. - if (TLI.isTypeLegal(MVT::f64) && SclrLoadTy.getSizeInBits() < 64 && - (64 <= MemSz)) - SclrLoadTy = MVT::f64; - - // Calculate the number of scalar loads that we need to perform - // in order to load our vector from memory. - unsigned NumLoads = MemSz / SclrLoadTy.getSizeInBits(); - - assert((Ext != ISD::SEXTLOAD || NumLoads == 1) && - "Can only lower sext loads with a single scalar load!"); - - unsigned loadRegSize = RegSz; - if (Ext == ISD::SEXTLOAD && RegSz >= 256) - loadRegSize = 128; - - // If we don't have BWI we won't be able to create the shuffle needed for - // v8i8->v8i64. - if (Ext == ISD::EXTLOAD && !Subtarget.hasBWI() && RegVT == MVT::v8i64 && - MemVT == MVT::v8i8) - loadRegSize = 128; - - // Represent our vector as a sequence of elements which are the - // largest scalar that we can load. - EVT LoadUnitVecVT = EVT::getVectorVT( - *DAG.getContext(), SclrLoadTy, loadRegSize / SclrLoadTy.getSizeInBits()); - - // Represent the data using the same element type that is stored in - // memory. In practice, we ''widen'' MemVT. - EVT WideVecVT = - EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(), - loadRegSize / MemVT.getScalarSizeInBits()); - - assert(WideVecVT.getSizeInBits() == LoadUnitVecVT.getSizeInBits() && - "Invalid vector type"); - - // We can't shuffle using an illegal type. - assert(TLI.isTypeLegal(WideVecVT) && - "We only lower types that form legal widened vector types"); - - SmallVector Chains; - SDValue Ptr = Ld->getBasePtr(); - unsigned OffsetInc = SclrLoadTy.getSizeInBits() / 8; - SDValue Increment = DAG.getConstant(OffsetInc, dl, - TLI.getPointerTy(DAG.getDataLayout())); - SDValue Res = DAG.getUNDEF(LoadUnitVecVT); - - unsigned Offset = 0; - for (unsigned i = 0; i < NumLoads; ++i) { - unsigned NewAlign = MinAlign(Ld->getAlignment(), Offset); - - // Perform a single load. - SDValue ScalarLoad = - DAG.getLoad(SclrLoadTy, dl, Ld->getChain(), Ptr, - Ld->getPointerInfo().getWithOffset(Offset), - NewAlign, Ld->getMemOperand()->getFlags()); - Chains.push_back(ScalarLoad.getValue(1)); - // Create the first element type using SCALAR_TO_VECTOR in order to avoid - // another round of DAGCombining. - if (i == 0) - Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LoadUnitVecVT, ScalarLoad); - else - Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, LoadUnitVecVT, Res, - ScalarLoad, DAG.getIntPtrConstant(i, dl)); - - Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment); - Offset += OffsetInc; - } - - SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains); - - // Bitcast the loaded value to a vector of the original element type, in - // the size of the target vector type. - SDValue SlicedVec = DAG.getBitcast(WideVecVT, Res); - unsigned SizeRatio = RegSz / MemSz; - - if (Ext == ISD::SEXTLOAD) { - SDValue Sext = getExtendInVec(ISD::SIGN_EXTEND, dl, RegVT, SlicedVec, DAG); - return DAG.getMergeValues({Sext, TF}, dl); - } - - if (Ext == ISD::EXTLOAD && !Subtarget.hasBWI() && RegVT == MVT::v8i64 && - MemVT == MVT::v8i8) { - SDValue Sext = getExtendInVec(ISD::ZERO_EXTEND, dl, RegVT, SlicedVec, DAG); - return DAG.getMergeValues({Sext, TF}, dl); - } - - // Redistribute the loaded elements into the different locations. 
- SmallVector ShuffleVec(NumElems * SizeRatio, -1); - for (unsigned i = 0; i != NumElems; ++i) - ShuffleVec[i * SizeRatio] = i; - - SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec, - DAG.getUNDEF(WideVecVT), ShuffleVec); - - // Bitcast to the requested type. - Shuff = DAG.getBitcast(RegVT, Shuff); - return DAG.getMergeValues({Shuff, TF}, dl); + return SDValue(); } /// Return true if node is an ISD::AND or ISD::OR of two X86ISD::SETCC nodes @@ -21610,7 +22041,7 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { if (Inverted) X86Cond = X86::GetOppositeBranchCondition(X86Cond); - CC = DAG.getConstant(X86Cond, dl, MVT::i8); + CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8); addTest = false; } else { unsigned CondOpc; @@ -21638,10 +22069,10 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { if (Cmp == Cond.getOperand(1).getOperand(1) && isX86LogicalCmp(Cmp) && Op.getNode()->hasOneUse()) { - X86::CondCode CCode = - (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0); - CCode = X86::GetOppositeBranchCondition(CCode); - CC = DAG.getConstant(CCode, dl, MVT::i8); + X86::CondCode CCode0 = + (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0); + CCode0 = X86::GetOppositeBranchCondition(CCode0); + CC = DAG.getTargetConstant(CCode0, dl, MVT::i8); SDNode *User = *Op.getNode()->use_begin(); // Look for an unconditional branch following this conditional branch. // We need this because we need to reverse the successors in order @@ -21654,12 +22085,12 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { (void)NewBR; Dest = FalseBB; - Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), - Chain, Dest, CC, Cmp); - X86::CondCode CCode = - (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0); - CCode = X86::GetOppositeBranchCondition(CCode); - CC = DAG.getConstant(CCode, dl, MVT::i8); + Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), Chain, + Dest, CC, Cmp); + X86::CondCode CCode1 = + (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0); + CCode1 = X86::GetOppositeBranchCondition(CCode1); + CC = DAG.getTargetConstant(CCode1, dl, MVT::i8); Cond = Cmp; addTest = false; } @@ -21672,7 +22103,7 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { X86::CondCode CCode = (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0); CCode = X86::GetOppositeBranchCondition(CCode); - CC = DAG.getConstant(CCode, dl, MVT::i8); + CC = DAG.getTargetConstant(CCode, dl, MVT::i8); Cond = Cond.getOperand(0).getOperand(1); addTest = false; } else if (Cond.getOpcode() == ISD::SETCC && @@ -21698,10 +22129,10 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32, Cond.getOperand(0), Cond.getOperand(1)); Cmp = ConvertCmpIfNecessary(Cmp, DAG); - CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8); + CC = DAG.getTargetConstant(X86::COND_NE, dl, MVT::i8); Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), Chain, Dest, CC, Cmp); - CC = DAG.getConstant(X86::COND_P, dl, MVT::i8); + CC = DAG.getTargetConstant(X86::COND_P, dl, MVT::i8); Cond = Cmp; addTest = false; } @@ -21714,10 +22145,10 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32, Cond.getOperand(0), Cond.getOperand(1)); Cmp = ConvertCmpIfNecessary(Cmp, DAG); - CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8); + CC = DAG.getTargetConstant(X86::COND_NE, 
dl, MVT::i8); Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), Chain, Dest, CC, Cmp); - CC = DAG.getConstant(X86::COND_P, dl, MVT::i8); + CC = DAG.getTargetConstant(X86::COND_P, dl, MVT::i8); Cond = Cmp; addTest = false; } @@ -21742,7 +22173,7 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { if (addTest) { X86::CondCode X86Cond = Inverted ? X86::COND_E : X86::COND_NE; - CC = DAG.getConstant(X86Cond, dl, MVT::i8); + CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8); Cond = EmitCmp(Cond, DAG.getConstant(0, dl, Cond.getValueType()), X86Cond, dl, DAG); } @@ -21770,7 +22201,7 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SDNode *Node = Op.getNode(); SDValue Chain = Op.getOperand(0); SDValue Size = Op.getOperand(1); - unsigned Align = cast(Op.getOperand(2))->getZExtValue(); + unsigned Align = Op.getConstantOperandVal(2); EVT VT = Node->getValueType(0); // Chain the dynamic stack allocation so that it doesn't modify the stack @@ -21811,7 +22242,7 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, } const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy); - unsigned Vreg = MRI.createVirtualRegister(AddrRegClass); + Register Vreg = MRI.createVirtualRegister(AddrRegClass); Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size); Result = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain, DAG.getRegister(Vreg, SPTy)); @@ -21821,7 +22252,7 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, MF.getInfo()->setHasWinAlloca(true); const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); - unsigned SPReg = RegInfo->getStackRegister(); + Register SPReg = RegInfo->getStackRegister(); SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy); Chain = SP.getValue(1); @@ -22076,7 +22507,7 @@ static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT, } return DAG.getNode(Opc, dl, VT, SrcOp, - DAG.getConstant(ShiftAmt, dl, MVT::i8)); + DAG.getTargetConstant(ShiftAmt, dl, MVT::i8)); } /// Handle vector element shifts where the shift amount may or may not be a @@ -22121,7 +22552,7 @@ static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT, ShAmt = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(ShAmt), MVT::v2i64, ShAmt); else { - SDValue ByteShift = DAG.getConstant( + SDValue ByteShift = DAG.getTargetConstant( (128 - AmtTy.getScalarSizeInBits()) / 8, SDLoc(ShAmt), MVT::i8); ShAmt = DAG.getBitcast(MVT::v16i8, ShAmt); ShAmt = DAG.getNode(X86ISD::VSHLDQ, SDLoc(ShAmt), MVT::v16i8, ShAmt, @@ -22308,13 +22739,21 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, // Helper to detect if the operand is CUR_DIRECTION rounding mode. auto isRoundModeCurDirection = [](SDValue Rnd) { if (auto *C = dyn_cast(Rnd)) - return C->getZExtValue() == X86::STATIC_ROUNDING::CUR_DIRECTION; + return C->getAPIntValue() == X86::STATIC_ROUNDING::CUR_DIRECTION; return false; }; auto isRoundModeSAE = [](SDValue Rnd) { - if (auto *C = dyn_cast(Rnd)) - return C->getZExtValue() == X86::STATIC_ROUNDING::NO_EXC; + if (auto *C = dyn_cast(Rnd)) { + unsigned RC = C->getZExtValue(); + if (RC & X86::STATIC_ROUNDING::NO_EXC) { + // Clear the NO_EXC bit and check remaining bits. + RC ^= X86::STATIC_ROUNDING::NO_EXC; + // As a convenience we allow no other bits or explicitly + // current direction. 
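The relaxed isRoundModeSAE check above accepts an immediate whose NO_EXC bit is set as long as nothing else, or only CUR_DIRECTION, accompanies it. A standalone sketch of that bit test; the numeric values of the STATIC_ROUNDING constants are assumed here purely for illustration:

  #include <cassert>

  enum : unsigned { CUR_DIRECTION = 4, NO_EXC = 8 }; // assumed encoding

  // SAE is accepted when NO_EXC is set and the remaining bits are either
  // empty or exactly CUR_DIRECTION.
  static bool isRoundModeSAEOnly(unsigned RC) {
    if (RC & NO_EXC) {
      RC ^= NO_EXC;                       // strip the SAE bit
      return RC == 0 || RC == CUR_DIRECTION;
    }
    return false;
  }

  int main() {
    assert(isRoundModeSAEOnly(NO_EXC));
    assert(isRoundModeSAEOnly(NO_EXC | CUR_DIRECTION));
    assert(!isRoundModeSAEOnly(CUR_DIRECTION)); // SAE bit missing
    assert(!isRoundModeSAEOnly(NO_EXC | 1));    // embedded rounding, not plain SAE
    return 0;
  }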
+ return RC == 0 || RC == X86::STATIC_ROUNDING::CUR_DIRECTION; + } + } return false; }; @@ -22335,7 +22774,7 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, }; SDLoc dl(Op); - unsigned IntNo = cast(Op.getOperand(0))->getZExtValue(); + unsigned IntNo = Op.getConstantOperandVal(0); MVT VT = Op.getSimpleValueType(); const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo); if (IntrData) { @@ -22411,9 +22850,6 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SDValue Src2 = Op.getOperand(2); SDValue Src3 = Op.getOperand(3); - if (IntrData->Type == INTR_TYPE_3OP_IMM8) - Src3 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src3); - // We specify 2 possible opcodes for intrinsics with rounding modes. // First, we check if the intrinsic may have non-default rounding mode, // (IntrData->Opc1 != 0), then we check the rounding mode operand. @@ -22666,7 +23102,6 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, case CMP_MASK_CC: { MVT MaskVT = Op.getSimpleValueType(); SDValue CC = Op.getOperand(3); - CC = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, CC); // We specify 2 possible opcodes for intrinsics with rounding modes. // First, we check if the intrinsic may have non-default rounding mode, // (IntrData->Opc1 != 0), then we check the rounding mode operand. @@ -22685,7 +23120,7 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, case CMP_MASK_SCALAR_CC: { SDValue Src1 = Op.getOperand(1); SDValue Src2 = Op.getOperand(2); - SDValue CC = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op.getOperand(3)); + SDValue CC = Op.getOperand(3); SDValue Mask = Op.getOperand(4); SDValue Cmp; @@ -22750,16 +23185,16 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, case COMI_RM: { // Comparison intrinsics with Sae SDValue LHS = Op.getOperand(1); SDValue RHS = Op.getOperand(2); - unsigned CondVal = cast(Op.getOperand(3))->getZExtValue(); + unsigned CondVal = Op.getConstantOperandVal(3); SDValue Sae = Op.getOperand(4); SDValue FCmp; if (isRoundModeCurDirection(Sae)) FCmp = DAG.getNode(X86ISD::FSETCCM, dl, MVT::v1i1, LHS, RHS, - DAG.getConstant(CondVal, dl, MVT::i8)); + DAG.getTargetConstant(CondVal, dl, MVT::i8)); else if (isRoundModeSAE(Sae)) FCmp = DAG.getNode(X86ISD::FSETCCM_SAE, dl, MVT::v1i1, LHS, RHS, - DAG.getConstant(CondVal, dl, MVT::i8), Sae); + DAG.getTargetConstant(CondVal, dl, MVT::i8), Sae); else return SDValue(); // Need to fill with zeros to ensure the bitcast will produce zeroes @@ -22819,9 +23254,9 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, assert(IntrData->Opc0 == X86ISD::VRNDSCALE && "Unexpected opcode"); // Clear the upper bits of the rounding immediate so that the legacy // intrinsic can't trigger the scaling behavior of VRNDSCALE. - SDValue RoundingMode = DAG.getNode(ISD::AND, dl, MVT::i32, - Op.getOperand(2), - DAG.getConstant(0xf, dl, MVT::i32)); + auto Round = cast(Op.getOperand(2)); + SDValue RoundingMode = + DAG.getTargetConstant(Round->getZExtValue() & 0xf, dl, MVT::i32); return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1), RoundingMode); } @@ -22829,12 +23264,22 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, assert(IntrData->Opc0 == X86ISD::VRNDSCALES && "Unexpected opcode"); // Clear the upper bits of the rounding immediate so that the legacy // intrinsic can't trigger the scaling behavior of VRNDSCALE. 
- SDValue RoundingMode = DAG.getNode(ISD::AND, dl, MVT::i32, - Op.getOperand(3), - DAG.getConstant(0xf, dl, MVT::i32)); + auto Round = cast(Op.getOperand(3)); + SDValue RoundingMode = + DAG.getTargetConstant(Round->getZExtValue() & 0xf, dl, MVT::i32); return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1), Op.getOperand(2), RoundingMode); } + case BEXTRI: { + assert(IntrData->Opc0 == X86ISD::BEXTR && "Unexpected opcode"); + + // The control is a TargetConstant, but we need to convert it to a + // ConstantSDNode. + uint64_t Imm = Op.getConstantOperandVal(2); + SDValue Control = DAG.getConstant(Imm, dl, Op.getValueType()); + return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), + Op.getOperand(1), Control); + } // ADC/ADCX/SBB case ADX: { SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::i32); @@ -23165,6 +23610,61 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, MaskVT, Operation); return DAG.getMergeValues({Result0, Result1}, DL); } + case Intrinsic::x86_mmx_pslli_w: + case Intrinsic::x86_mmx_pslli_d: + case Intrinsic::x86_mmx_pslli_q: + case Intrinsic::x86_mmx_psrli_w: + case Intrinsic::x86_mmx_psrli_d: + case Intrinsic::x86_mmx_psrli_q: + case Intrinsic::x86_mmx_psrai_w: + case Intrinsic::x86_mmx_psrai_d: { + SDLoc DL(Op); + SDValue ShAmt = Op.getOperand(2); + // If the argument is a constant, convert it to a target constant. + if (auto *C = dyn_cast(ShAmt)) { + ShAmt = DAG.getTargetConstant(C->getZExtValue(), DL, MVT::i32); + return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(), + Op.getOperand(0), Op.getOperand(1), ShAmt); + } + + unsigned NewIntrinsic; + switch (IntNo) { + default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. + case Intrinsic::x86_mmx_pslli_w: + NewIntrinsic = Intrinsic::x86_mmx_psll_w; + break; + case Intrinsic::x86_mmx_pslli_d: + NewIntrinsic = Intrinsic::x86_mmx_psll_d; + break; + case Intrinsic::x86_mmx_pslli_q: + NewIntrinsic = Intrinsic::x86_mmx_psll_q; + break; + case Intrinsic::x86_mmx_psrli_w: + NewIntrinsic = Intrinsic::x86_mmx_psrl_w; + break; + case Intrinsic::x86_mmx_psrli_d: + NewIntrinsic = Intrinsic::x86_mmx_psrl_d; + break; + case Intrinsic::x86_mmx_psrli_q: + NewIntrinsic = Intrinsic::x86_mmx_psrl_q; + break; + case Intrinsic::x86_mmx_psrai_w: + NewIntrinsic = Intrinsic::x86_mmx_psra_w; + break; + case Intrinsic::x86_mmx_psrai_d: + NewIntrinsic = Intrinsic::x86_mmx_psra_d; + break; + } + + // The vector shift intrinsics with scalars uses 32b shift amounts but + // the sse2/mmx shift instructions reads 64 bits. Copy the 32 bits to an + // MMX register. + ShAmt = DAG.getNode(X86ISD::MMX_MOVW2D, DL, MVT::x86mmx, ShAmt); + return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(), + DAG.getConstant(NewIntrinsic, DL, MVT::i32), + Op.getOperand(1), ShAmt); + + } } } @@ -23177,7 +23677,9 @@ static SDValue getAVX2GatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, // Scale must be constant. if (!C) return SDValue(); - SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, + TLI.getPointerTy(DAG.getDataLayout())); EVT MaskVT = Mask.getValueType().changeVectorElementTypeToInteger(); SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other); // If source is undef or we know it won't be used, use a zero vector @@ -23204,7 +23706,9 @@ static SDValue getGatherNode(SDValue Op, SelectionDAG &DAG, // Scale must be constant. 
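For reference on the BEXTRI case above: the change only repackages the target-constant immediate as a plain constant, the control word keeps the usual BEXTR layout. A small standalone model of that semantics, written from the instruction's documented behaviour rather than from this patch (start index in bits 7:0, length in bits 15:8, out-of-range values clamp to zero or the register width):

#include <cstdint>

// Hypothetical helper modelling 64-bit BEXTR: extract Len bits of Src
// starting at bit Start, both taken from the low two bytes of Control.
uint64_t bextr64(uint64_t Src, uint64_t Control) {
  unsigned Start = Control & 0xff;
  unsigned Len = (Control >> 8) & 0xff;
  if (Start >= 64 || Len == 0)
    return 0;                                // nothing selected
  uint64_t Field = Src >> Start;
  if (Len >= 64)
    return Field;                            // length saturates at 64 bits
  return Field & ((uint64_t(1) << Len) - 1); // keep only the low Len bits
}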
if (!C) return SDValue(); - SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, + TLI.getPointerTy(DAG.getDataLayout())); unsigned MinElts = std::min(Index.getSimpleValueType().getVectorNumElements(), VT.getVectorNumElements()); MVT MaskVT = MVT::getVectorVT(MVT::i1, MinElts); @@ -23238,7 +23742,9 @@ static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, // Scale must be constant. if (!C) return SDValue(); - SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, + TLI.getPointerTy(DAG.getDataLayout())); unsigned MinElts = std::min(Index.getSimpleValueType().getVectorNumElements(), Src.getSimpleValueType().getVectorNumElements()); MVT MaskVT = MVT::getVectorVT(MVT::i1, MinElts); @@ -23266,7 +23772,9 @@ static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, // Scale must be constant. if (!C) return SDValue(); - SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, + TLI.getPointerTy(DAG.getDataLayout())); SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32); SDValue Segment = DAG.getRegister(0, MVT::i32); MVT MaskVT = @@ -23435,8 +23943,7 @@ EmitMaskedTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl, static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { - unsigned IntNo = cast(Op.getOperand(1))->getZExtValue(); - + unsigned IntNo = Op.getConstantOperandVal(1); const IntrinsicData *IntrData = getIntrinsicWithChain(IntNo); if (!IntrData) { switch (IntNo) { @@ -23538,10 +24045,10 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget, // If the value returned by RDRAND/RDSEED was valid (CF=1), return 1. // Otherwise return the value from Rand, which is always 0, casted to i32. - SDValue Ops[] = { DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)), - DAG.getConstant(1, dl, Op->getValueType(1)), - DAG.getConstant(X86::COND_B, dl, MVT::i8), - SDValue(Result.getNode(), 1) }; + SDValue Ops[] = {DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)), + DAG.getConstant(1, dl, Op->getValueType(1)), + DAG.getTargetConstant(X86::COND_B, dl, MVT::i8), + SDValue(Result.getNode(), 1)}; SDValue isValid = DAG.getNode(X86ISD::CMOV, dl, Op->getValueType(1), Ops); // Return { result, isValid, chain }. @@ -23581,8 +24088,7 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget, Scale, Chain, Subtarget); } case PREFETCH: { - SDValue Hint = Op.getOperand(6); - unsigned HintVal = cast(Hint)->getZExtValue(); + const APInt &HintVal = Op.getConstantOperandAPInt(6); assert((HintVal == 2 || HintVal == 3) && "Wrong prefetch hint in intrinsic: should be 2 or 3"); unsigned Opcode = (HintVal == 2 ? 
IntrData->Opc1 : IntrData->Opc0); @@ -23678,7 +24184,7 @@ SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op, if (verifyReturnAddressArgumentIsConstant(Op, DAG)) return SDValue(); - unsigned Depth = cast(Op.getOperand(0))->getZExtValue(); + unsigned Depth = Op.getConstantOperandVal(0); SDLoc dl(Op); EVT PtrVT = getPointerTy(DAG.getDataLayout()); @@ -23730,7 +24236,7 @@ SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { unsigned FrameReg = RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction()); SDLoc dl(Op); // FIXME probably not meaningful - unsigned Depth = cast(Op.getOperand(0))->getZExtValue(); + unsigned Depth = Op.getConstantOperandVal(0); assert(((FrameReg == X86::RBP && VT == MVT::i64) || (FrameReg == X86::EBP && VT == MVT::i32)) && "Invalid Frame Register!"); @@ -23743,12 +24249,11 @@ SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { // FIXME? Maybe this could be a TableGen attribute on some registers and // this table could be generated automatically from RegInfo. -unsigned X86TargetLowering::getRegisterByName(const char* RegName, EVT VT, - SelectionDAG &DAG) const { +Register X86TargetLowering::getRegisterByName(const char* RegName, EVT VT, + const MachineFunction &MF) const { const TargetFrameLowering &TFI = *Subtarget.getFrameLowering(); - const MachineFunction &MF = DAG.getMachineFunction(); - unsigned Reg = StringSwitch(RegName) + Register Reg = StringSwitch(RegName) .Case("esp", X86::ESP) .Case("rsp", X86::RSP) .Case("ebp", X86::EBP) @@ -23762,8 +24267,7 @@ unsigned X86TargetLowering::getRegisterByName(const char* RegName, EVT VT, #ifndef NDEBUG else { const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); - unsigned FrameReg = - RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction()); + Register FrameReg = RegInfo->getPtrSizedFrameRegister(MF); assert((FrameReg == X86::EBP || FrameReg == X86::RBP) && "Invalid Frame Register!"); } @@ -23809,7 +24313,7 @@ SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const { EVT PtrVT = getPointerTy(DAG.getDataLayout()); const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); - unsigned FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction()); + Register FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction()); assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) || (FrameReg == X86::EBP && PtrVT == MVT::i32)) && "Invalid Frame Register!"); @@ -23967,6 +24471,7 @@ SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op, case CallingConv::X86_FastCall: case CallingConv::X86_ThisCall: case CallingConv::Fast: + case CallingConv::Tail: // Pass 'nest' parameter in EAX. // Must be kept in sync with X86CallingConv.td NestReg = X86::EAX; @@ -24279,12 +24784,9 @@ static SDValue LowerCTLZ(SDValue Op, const X86Subtarget &Subtarget, if (Opc == ISD::CTLZ) { // If src is zero (i.e. bsr sets ZF), returns NumBits. - SDValue Ops[] = { - Op, - DAG.getConstant(NumBits + NumBits - 1, dl, OpVT), - DAG.getConstant(X86::COND_E, dl, MVT::i8), - Op.getValue(1) - }; + SDValue Ops[] = {Op, DAG.getConstant(NumBits + NumBits - 1, dl, OpVT), + DAG.getTargetConstant(X86::COND_E, dl, MVT::i8), + Op.getValue(1)}; Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops); } @@ -24312,12 +24814,9 @@ static SDValue LowerCTTZ(SDValue Op, const X86Subtarget &Subtarget, Op = DAG.getNode(X86ISD::BSF, dl, VTs, N0); // If src is zero (i.e. bsf sets ZF), returns NumBits. 
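The CTLZ lowering above now materialises the condition code with getTargetConstant, but the BSR+CMOV shape is unchanged. A standalone 32-bit model of that shape; the final XOR with NumBits-1 happens in surrounding code outside this hunk, so its presence is inferred from context.

#include <cstdint>

// BSR yields the index of the highest set bit and sets ZF for a zero input;
// the CMOV substitutes 2*NumBits-1 in that case so the XOR with NumBits-1
// produces NumBits, i.e. ctlz(0) == 32.
unsigned ctlz32(uint32_t X) {
  const unsigned NumBits = 32;
  unsigned BSR = 0;
  bool ZF = (X == 0);
  if (!ZF)
    for (BSR = 31; ((X >> BSR) & 1) == 0; --BSR)
      ;                                              // index of the MSB
  unsigned Sel = ZF ? (NumBits + NumBits - 1) : BSR; // the X86ISD::CMOV on COND_E
  return Sel ^ (NumBits - 1);                        // 31 - BSR, or 32 for X == 0
}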
- SDValue Ops[] = { - Op, - DAG.getConstant(NumBits, dl, VT), - DAG.getConstant(X86::COND_E, dl, MVT::i8), - Op.getValue(1) - }; + SDValue Ops[] = {Op, DAG.getConstant(NumBits, dl, VT), + DAG.getTargetConstant(X86::COND_E, dl, MVT::i8), + Op.getValue(1)}; return DAG.getNode(X86ISD::CMOV, dl, VT, Ops); } @@ -24453,7 +24952,7 @@ static SDValue LowerABS(SDValue Op, const X86Subtarget &Subtarget, SDValue N0 = Op.getOperand(0); SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32), DAG.getConstant(0, DL, VT), N0); - SDValue Ops[] = {N0, Neg, DAG.getConstant(X86::COND_GE, DL, MVT::i8), + SDValue Ops[] = {N0, Neg, DAG.getTargetConstant(X86::COND_GE, DL, MVT::i8), SDValue(Neg.getNode(), 1)}; return DAG.getNode(X86ISD::CMOV, DL, VT, Ops); } @@ -25033,7 +25532,7 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG, // Optimize shl/srl/sra with constant shift amount. APInt APIntShiftAmt; - if (!isConstantSplat(Amt, APIntShiftAmt)) + if (!X86::isConstantSplat(Amt, APIntShiftAmt)) return SDValue(); // If the shift amount is out of range, return undef. @@ -25220,7 +25719,7 @@ static SDValue convertShiftLeftToScale(SDValue Amt, const SDLoc &dl, } ConstantSDNode *ND = cast(Op); - APInt C(SVTBits, ND->getAPIntValue().getZExtValue()); + APInt C(SVTBits, ND->getZExtValue()); uint64_t ShAmt = C.getZExtValue(); if (ShAmt >= SVTBits) { Elts.push_back(DAG.getUNDEF(SVT)); @@ -25502,7 +26001,7 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget, (VT == MVT::v32i8 && Subtarget.hasInt256())) && !Subtarget.hasXOP()) { int NumElts = VT.getVectorNumElements(); - SDValue Cst8 = DAG.getConstant(8, dl, MVT::i8); + SDValue Cst8 = DAG.getTargetConstant(8, dl, MVT::i8); // Extend constant shift amount to vXi16 (it doesn't matter if the type // isn't legal). @@ -25774,7 +26273,7 @@ static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget, unsigned Op = (Opcode == ISD::ROTL ? X86ISD::VROTLI : X86ISD::VROTRI); uint64_t RotateAmt = EltBits[CstSplatIndex].urem(EltSizeInBits); return DAG.getNode(Op, DL, VT, R, - DAG.getConstant(RotateAmt, DL, MVT::i8)); + DAG.getTargetConstant(RotateAmt, DL, MVT::i8)); } // Else, fall-back on VPROLV/VPRORV. @@ -25795,7 +26294,7 @@ static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget, if (0 <= CstSplatIndex) { uint64_t RotateAmt = EltBits[CstSplatIndex].urem(EltSizeInBits); return DAG.getNode(X86ISD::VROTLI, DL, VT, R, - DAG.getConstant(RotateAmt, DL, MVT::i8)); + DAG.getTargetConstant(RotateAmt, DL, MVT::i8)); } // Use general rotate by variable (per-element). @@ -26032,7 +26531,7 @@ X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const { // If this is a canonical idempotent atomicrmw w/no uses, we have a better // lowering available in lowerAtomicArith. - // TODO: push more cases through this path. + // TODO: push more cases through this path. 
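LowerABS above keeps the SUB+CMOV idiom and only switches the condition code to a target constant. A minimal standalone model of the idiom, with the flag test written as the signed comparison it encodes:

#include <cstdint>

// Neg = 0 - X sets the flags; COND_GE on those flags is the signed test
// 0 >= X, and the CMOV picks Neg when it holds, otherwise the original value.
int32_t absViaCmov(int32_t X) {
  int32_t Neg = int32_t(0u - uint32_t(X)); // X86ISD::SUB 0, X (wrap kept defined)
  bool CondGE = (X <= 0);                  // COND_GE evaluated on the SUB's flags
  return CondGE ? Neg : X;                 // X86ISD::CMOV
}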
if (auto *C = dyn_cast(AI->getValOperand())) if (AI->getOperation() == AtomicRMWInst::Or && C->isZero() && AI->use_empty()) @@ -26087,10 +26586,22 @@ X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const { return Loaded; } +bool X86TargetLowering::lowerAtomicStoreAsStoreSDNode(const StoreInst &SI) const { + if (!SI.isUnordered()) + return false; + return ExperimentalUnorderedISEL; +} +bool X86TargetLowering::lowerAtomicLoadAsLoadSDNode(const LoadInst &LI) const { + if (!LI.isUnordered()) + return false; + return ExperimentalUnorderedISEL; +} + + /// Emit a locked operation on a stack location which does not change any /// memory location, but does involve a lock prefix. Location is chosen to be /// a) very likely accessed only by a single thread to minimize cache traffic, -/// and b) definitely dereferenceable. Returns the new Chain result. +/// and b) definitely dereferenceable. Returns the new Chain result. static SDValue emitLockedStackOp(SelectionDAG &DAG, const X86Subtarget &Subtarget, SDValue Chain, SDLoc DL) { @@ -26099,22 +26610,22 @@ static SDValue emitLockedStackOp(SelectionDAG &DAG, // operations issued by the current processor. As such, the location // referenced is not relevant for the ordering properties of the instruction. // See: Intel® 64 and IA-32 ArchitecturesSoftware Developer’s Manual, - // 8.2.3.9 Loads and Stores Are Not Reordered with Locked Instructions + // 8.2.3.9 Loads and Stores Are Not Reordered with Locked Instructions // 2) Using an immediate operand appears to be the best encoding choice // here since it doesn't require an extra register. // 3) OR appears to be very slightly faster than ADD. (Though, the difference // is small enough it might just be measurement noise.) // 4) When choosing offsets, there are several contributing factors: // a) If there's no redzone, we default to TOS. (We could allocate a cache - // line aligned stack object to improve this case.) + // line aligned stack object to improve this case.) // b) To minimize our chances of introducing a false dependence, we prefer - // to offset the stack usage from TOS slightly. + // to offset the stack usage from TOS slightly. // c) To minimize concerns about cross thread stack usage - in particular, // the idiomatic MyThreadPool.run([&StackVars]() {...}) pattern which // captures state in the TOS frame and accesses it from many threads - // we want to use an offset such that the offset is in a distinct cache // line from the TOS frame. - // + // // For a general discussion of the tradeoffs and benchmark results, see: // https://shipilev.net/blog/2014/on-the-fence-with-dependencies/ @@ -26155,10 +26666,10 @@ static SDValue emitLockedStackOp(SelectionDAG &DAG, static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { SDLoc dl(Op); - AtomicOrdering FenceOrdering = static_cast( - cast(Op.getOperand(1))->getZExtValue()); - SyncScope::ID FenceSSID = static_cast( - cast(Op.getOperand(2))->getZExtValue()); + AtomicOrdering FenceOrdering = + static_cast(Op.getConstantOperandVal(1)); + SyncScope::ID FenceSSID = + static_cast(Op.getConstantOperandVal(2)); // The only fence that needs an instruction is a sequentially-consistent // cross-thread fence. 
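The hunks above (the unordered-atomic hooks and the comment cleanup in emitLockedStackOp) sit next to the fence lowering itself. As a rough, hedged summary of the decision that code makes, assuming the usual LLVM AtomicOrdering/SyncScope semantics; the returned strings are purely illustrative:

enum class Ordering { Monotonic, Acquire, Release, AcquireRelease, SeqCst };
enum class Scope { SingleThread, System };

// Only a sequentially-consistent, cross-thread fence needs a machine
// instruction: MFENCE when available, otherwise the cheaper locked RMW on a
// stack slot produced by emitLockedStackOp. Everything else stays a
// compiler-only barrier.
const char *fenceLowering(Ordering O, Scope S, bool HasMFence) {
  if (O == Ordering::SeqCst && S == Scope::System)
    return HasMFence ? "mfence" : "lock or on a stack slot (emitLockedStackOp)";
  return "no instruction (ordering handled at the IR/DAG level)";
}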
@@ -26167,7 +26678,7 @@ static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget, if (Subtarget.hasMFence()) return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0)); - SDValue Chain = Op.getOperand(0); + SDValue Chain = Op.getOperand(0); return emitLockedStackOp(DAG, Subtarget, Chain, dl); } @@ -26218,6 +26729,17 @@ static SDValue getPMOVMSKB(const SDLoc &DL, SDValue V, SelectionDAG &DAG, const X86Subtarget &Subtarget) { MVT InVT = V.getSimpleValueType(); + if (InVT == MVT::v64i8) { + SDValue Lo, Hi; + std::tie(Lo, Hi) = DAG.SplitVector(V, DL); + Lo = getPMOVMSKB(DL, Lo, DAG, Subtarget); + Hi = getPMOVMSKB(DL, Hi, DAG, Subtarget); + Lo = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Lo); + Hi = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Hi); + Hi = DAG.getNode(ISD::SHL, DL, MVT::i64, Hi, + DAG.getConstant(32, DL, MVT::i8)); + return DAG.getNode(ISD::OR, DL, MVT::i64, Lo, Hi); + } if (InVT == MVT::v32i8 && !Subtarget.hasInt256()) { SDValue Lo, Hi; std::tie(Lo, Hi) = DAG.SplitVector(V, DL); @@ -26258,8 +26780,7 @@ static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget, SDLoc dl(Op); SDValue Lo, Hi; std::tie(Lo, Hi) = DAG.SplitVector(Op.getOperand(0), dl); - EVT CastVT = MVT::getVectorVT(DstVT.getVectorElementType(), - DstVT.getVectorNumElements() / 2); + MVT CastVT = DstVT.getHalfNumVectorElementsVT(); Lo = DAG.getBitcast(CastVT, Lo); Hi = DAG.getBitcast(CastVT, Hi); return DAG.getNode(ISD::CONCAT_VECTORS, dl, DstVT, Lo, Hi); @@ -26275,53 +26796,37 @@ static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget, return DAG.getZExtOrTrunc(V, DL, DstVT); } - if (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8 || - SrcVT == MVT::i64) { - assert(Subtarget.hasSSE2() && "Requires at least SSE2!"); - if (DstVT != MVT::f64 && DstVT != MVT::i64 && - !(DstVT == MVT::x86mmx && SrcVT.isVector())) - // This conversion needs to be expanded. - return SDValue(); + assert((SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8 || + SrcVT == MVT::i64) && "Unexpected VT!"); - SDLoc dl(Op); - if (SrcVT.isVector()) { - // Widen the vector in input in the case of MVT::v2i32. - // Example: from MVT::v2i32 to MVT::v4i32. - MVT NewVT = MVT::getVectorVT(SrcVT.getVectorElementType(), - SrcVT.getVectorNumElements() * 2); - Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewVT, Src, - DAG.getUNDEF(SrcVT)); - } else { - assert(SrcVT == MVT::i64 && !Subtarget.is64Bit() && - "Unexpected source type in LowerBITCAST"); - Src = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Src); - } + assert(Subtarget.hasSSE2() && "Requires at least SSE2!"); + if (!(DstVT == MVT::f64 && SrcVT == MVT::i64) && + !(DstVT == MVT::x86mmx && SrcVT.isVector())) + // This conversion needs to be expanded. + return SDValue(); - MVT V2X64VT = DstVT == MVT::f64 ? MVT::v2f64 : MVT::v2i64; - Src = DAG.getNode(ISD::BITCAST, dl, V2X64VT, Src); + SDLoc dl(Op); + if (SrcVT.isVector()) { + // Widen the vector in input in the case of MVT::v2i32. + // Example: from MVT::v2i32 to MVT::v4i32. + MVT NewVT = MVT::getVectorVT(SrcVT.getVectorElementType(), + SrcVT.getVectorNumElements() * 2); + Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewVT, Src, + DAG.getUNDEF(SrcVT)); + } else { + assert(SrcVT == MVT::i64 && !Subtarget.is64Bit() && + "Unexpected source type in LowerBITCAST"); + Src = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Src); + } - if (DstVT == MVT::x86mmx) - return DAG.getNode(X86ISD::MOVDQ2Q, dl, DstVT, Src); + MVT V2X64VT = DstVT == MVT::f64 ? 
MVT::v2f64 : MVT::v2i64; + Src = DAG.getNode(ISD::BITCAST, dl, V2X64VT, Src); - return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, DstVT, Src, - DAG.getIntPtrConstant(0, dl)); - } + if (DstVT == MVT::x86mmx) + return DAG.getNode(X86ISD::MOVDQ2Q, dl, DstVT, Src); - assert(Subtarget.is64Bit() && !Subtarget.hasSSE2() && - Subtarget.hasMMX() && "Unexpected custom BITCAST"); - assert((DstVT == MVT::i64 || - (DstVT.isVector() && DstVT.getSizeInBits()==64)) && - "Unexpected custom BITCAST"); - // i64 <=> MMX conversions are Legal. - if (SrcVT==MVT::i64 && DstVT.isVector()) - return Op; - if (DstVT==MVT::i64 && SrcVT.isVector()) - return Op; - // MMX <=> MMX conversions are Legal. - if (SrcVT.isVector() && DstVT.isVector()) - return Op; - // All other conversions need to be expanded. - return SDValue(); + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, DstVT, Src, + DAG.getIntPtrConstant(0, dl)); } /// Compute the horizontal sum of bytes in V for the elements of VT. @@ -26549,6 +27054,13 @@ static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget, SDValue In = Op.getOperand(0); SDLoc DL(Op); + // Split v8i64/v16i32 without BWI so that we can still use the PSHUFB + // lowering. + if (VT == MVT::v8i64 || VT == MVT::v16i32) { + assert(!Subtarget.hasBWI() && "BWI should Expand BITREVERSE"); + return Lower512IntUnary(Op, DAG); + } + unsigned NumElts = VT.getVectorNumElements(); assert(VT.getScalarType() == MVT::i8 && "Only byte vector BITREVERSE supported"); @@ -26656,12 +27168,12 @@ static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG, // seq_cst which isn't SingleThread, everything just needs to be preserved // during codegen and then dropped. Note that we expect (but don't assume), // that orderings other than seq_cst and acq_rel have been canonicalized to - // a store or load. + // a store or load. if (AN->getOrdering() == AtomicOrdering::SequentiallyConsistent && AN->getSyncScopeID() == SyncScope::System) { // Prefer a locked operation against a stack location to minimize cache // traffic. This assumes that stack locations are very likely to be - // accessed only by the owning thread. + // accessed only by the owning thread. SDValue NewChain = emitLockedStackOp(DAG, Subtarget, Chain, DL); assert(!N->hasAnyUseOfValue(0)); // NOTE: The getUNDEF is needed to give something for the unused result 0. @@ -26886,12 +27398,13 @@ static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget, SDValue Chain = N->getChain(); SDValue BasePtr = N->getBasePtr(); - if (VT == MVT::v2f32) { + if (VT == MVT::v2f32 || VT == MVT::v2i32) { assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type"); // If the index is v2i64 and we have VLX we can use xmm for data and index. 
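The new v64i8 path in getPMOVMSKB above splits the vector, takes a 32-bit PMOVMSKB mask of each half, and merges the halves with a zero-extend, a shift by 32 and an OR. A standalone scalar model of that combine:

#include <cstdint>

// Each PMOVMSKB collects the sign bit of every byte in one 32-byte half; the
// low half lands in bits 0-31 and the high half in bits 32-63, matching the
// ZERO_EXTEND / SHL 32 / OR sequence in the hunk.
uint64_t movmsk_v64i8(const uint8_t Bytes[64]) {
  uint64_t Lo = 0, Hi = 0;
  for (int I = 0; I < 32; ++I) {
    Lo |= uint64_t(Bytes[I] >> 7) << I;
    Hi |= uint64_t(Bytes[32 + I] >> 7) << I;
  }
  return Lo | (Hi << 32);
}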
if (Index.getValueType() == MVT::v2i64 && Subtarget.hasVLX()) { - Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src, - DAG.getUNDEF(MVT::v2f32)); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT); + Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, Src, DAG.getUNDEF(VT)); SDVTList VTs = DAG.getVTList(MVT::v2i1, MVT::Other); SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale}; SDValue NewScatter = DAG.getTargetMemSDNode( @@ -26901,30 +27414,6 @@ static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget, return SDValue(); } - if (VT == MVT::v2i32) { - assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type"); - Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src, - DAG.getUNDEF(MVT::v2i32)); - // If the index is v2i64 and we have VLX we can use xmm for data and index. - if (Index.getValueType() == MVT::v2i64 && Subtarget.hasVLX()) { - SDVTList VTs = DAG.getVTList(MVT::v2i1, MVT::Other); - SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale}; - SDValue NewScatter = DAG.getTargetMemSDNode( - VTs, Ops, dl, N->getMemoryVT(), N->getMemOperand()); - return SDValue(NewScatter.getNode(), 1); - } - // Custom widen all the operands to avoid promotion. - EVT NewIndexVT = EVT::getVectorVT( - *DAG.getContext(), Index.getValueType().getVectorElementType(), 4); - Index = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewIndexVT, Index, - DAG.getUNDEF(Index.getValueType())); - Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask, - DAG.getConstant(0, dl, MVT::v2i1)); - SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale}; - return DAG.getMaskedScatter(DAG.getVTList(MVT::Other), N->getMemoryVT(), dl, - Ops, N->getMemOperand()); - } - MVT IndexVT = Index.getSimpleValueType(); MVT MaskVT = Mask.getSimpleValueType(); @@ -27160,6 +27649,13 @@ SDValue X86TargetLowering::LowerGC_TRANSITION_END(SDValue Op, return NOOP; } +SDValue X86TargetLowering::LowerF128Call(SDValue Op, SelectionDAG &DAG, + RTLIB::Libcall Call) const { + SmallVector Ops(Op->op_begin(), Op->op_end()); + MakeLibCallOptions CallOptions; + return makeLibCall(DAG, Call, MVT::f128, Ops, CallOptions, SDLoc(Op)).first; +} + /// Provide custom lowering hooks for some operations. 
SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { switch (Op.getOpcode()) { @@ -27206,10 +27702,14 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::FP_TO_SINT: case ISD::FP_TO_UINT: return LowerFP_TO_INT(Op, DAG); case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG); + case ISD::FP_ROUND: return LowerFP_ROUND(Op, DAG); + case ISD::STRICT_FP_ROUND: return LowerSTRICT_FP_ROUND(Op, DAG); case ISD::LOAD: return LowerLoad(Op, Subtarget, DAG); case ISD::STORE: return LowerStore(Op, Subtarget, DAG); case ISD::FADD: - case ISD::FSUB: return lowerFaddFsub(Op, DAG, Subtarget); + case ISD::FSUB: return lowerFaddFsub(Op, DAG); + case ISD::FMUL: return LowerF128Call(Op, DAG, RTLIB::MUL_F128); + case ISD::FDIV: return LowerF128Call(Op, DAG, RTLIB::DIV_F128); case ISD::FABS: case ISD::FNEG: return LowerFABSorFNEG(Op, DAG); case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG); @@ -27347,37 +27847,22 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, } case ISD::MUL: { EVT VT = N->getValueType(0); - assert(VT.isVector() && "Unexpected VT"); - if (getTypeAction(*DAG.getContext(), VT) == TypePromoteInteger && - VT.getVectorNumElements() == 2) { - // Promote to a pattern that will be turned into PMULUDQ. - SDValue N0 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::v2i64, - N->getOperand(0)); - SDValue N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::v2i64, - N->getOperand(1)); - SDValue Mul = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, N0, N1); - Results.push_back(DAG.getNode(ISD::TRUNCATE, dl, VT, Mul)); - } else if (getTypeAction(*DAG.getContext(), VT) == TypeWidenVector && - VT.getVectorElementType() == MVT::i8) { - // Pre-promote these to vXi16 to avoid op legalization thinking all 16 - // elements are needed. - MVT MulVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements()); - SDValue Op0 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(0)); - SDValue Op1 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(1)); - SDValue Res = DAG.getNode(ISD::MUL, dl, MulVT, Op0, Op1); - Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res); - unsigned NumConcats = 16 / VT.getVectorNumElements(); - SmallVector ConcatOps(NumConcats, DAG.getUNDEF(VT)); - ConcatOps[0] = Res; - Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i8, ConcatOps); - Results.push_back(Res); - } + assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector && + VT.getVectorElementType() == MVT::i8 && "Unexpected VT!"); + // Pre-promote these to vXi16 to avoid op legalization thinking all 16 + // elements are needed. 
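LowerF128Call above funnels f128 FMUL/FDIV into runtime library calls. As a hedged illustration of what those RTLIB entries usually resolve to on x86-64 (the __multf3/__divtf3 names are the customary compiler-rt/libgcc soft-float symbols, not something this patch defines):

// Compiled for x86-64 with GCC or Clang, where __float128 is the IEEE
// binary128 type, these operators become calls such as __multf3 and __divtf3
// because the hardware has no native f128 arithmetic instructions.
__float128 mulF128(__float128 A, __float128 B) { return A * B; }
__float128 divF128(__float128 A, __float128 B) { return A / B; }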
+ MVT MulVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements()); + SDValue Op0 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(0)); + SDValue Op1 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(1)); + SDValue Res = DAG.getNode(ISD::MUL, dl, MulVT, Op0, Op1); + Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res); + unsigned NumConcats = 16 / VT.getVectorNumElements(); + SmallVector ConcatOps(NumConcats, DAG.getUNDEF(VT)); + ConcatOps[0] = Res; + Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i8, ConcatOps); + Results.push_back(Res); return; } - case ISD::UADDSAT: - case ISD::SADDSAT: - case ISD::USUBSAT: - case ISD::SSUBSAT: case X86ISD::VPMADDWD: case X86ISD::AVG: { // Legalize types for ISD::UADDSAT/SADDSAT/USUBSAT/SSUBSAT and @@ -27388,6 +27873,8 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, EVT InVT = N->getOperand(0).getValueType(); assert(VT.getSizeInBits() < 128 && 128 % VT.getSizeInBits() == 0 && "Expected a VT that divides into 128 bits."); + assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector && + "Unexpected type action!"); unsigned NumConcat = 128 / InVT.getSizeInBits(); EVT InWideVT = EVT::getVectorVT(*DAG.getContext(), @@ -27404,9 +27891,6 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, SDValue InVec1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, InWideVT, Ops); SDValue Res = DAG.getNode(N->getOpcode(), dl, WideVT, InVec0, InVec1); - if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector) - Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res, - DAG.getIntPtrConstant(0, dl)); Results.push_back(Res); return; } @@ -27435,26 +27919,6 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, Results.push_back(Hi); return; } - case ISD::SETCC: { - // Widen v2i32 (setcc v2f32). This is really needed for AVX512VL when - // setCC result type is v2i1 because type legalzation will end up with - // a v4i1 setcc plus an extend. - assert(N->getValueType(0) == MVT::v2i32 && "Unexpected type"); - if (N->getOperand(0).getValueType() != MVT::v2f32 || - getTypeAction(*DAG.getContext(), MVT::v2i32) == TypeWidenVector) - return; - SDValue UNDEF = DAG.getUNDEF(MVT::v2f32); - SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, - N->getOperand(0), UNDEF); - SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, - N->getOperand(1), UNDEF); - SDValue Res = DAG.getNode(ISD::SETCC, dl, MVT::v4i32, LHS, RHS, - N->getOperand(2)); - Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res, - DAG.getIntPtrConstant(0, dl)); - Results.push_back(Res); - return; - } // We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32. case X86ISD::FMINC: case X86ISD::FMIN: @@ -27475,7 +27939,9 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, case ISD::SREM: case ISD::UREM: { EVT VT = N->getValueType(0); - if (getTypeAction(*DAG.getContext(), VT) == TypeWidenVector) { + if (VT.isVector()) { + assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector && + "Unexpected type action!"); // If this RHS is a constant splat vector we can widen this and let // division/remainder by constant optimize it. // TODO: Can we do something for non-splat? @@ -27493,17 +27959,6 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, return; } - if (VT == MVT::v2i32) { - // Legalize v2i32 div/rem by unrolling. Otherwise we promote to the - // v2i64 and unroll later. But then we create i64 scalar ops which - // might be slow in 64-bit mode or require a libcall in 32-bit mode. 
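The consolidated ISD::MUL path above promotes the i8 elements to i16, multiplies, and truncates; that is sound because a product truncated to N bits depends only on the low N bits of each operand. A standalone element-wise model:

#include <cstdint>

// Model of the widened multiply for one v8i8 value: ANY_EXTEND to i16,
// multiply in vXi16, then TRUNCATE back to i8. The low 8 bits of the 16-bit
// product equal the wrapped i8*i8 product.
void mul_v8i8(const uint8_t A[8], const uint8_t B[8], uint8_t Out[8]) {
  for (int I = 0; I < 8; ++I) {
    uint16_t Wide = uint16_t(A[I]) * uint16_t(B[I]);
    Out[I] = uint8_t(Wide);
  }
}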
- Results.push_back(DAG.UnrollVectorOp(N)); - return; - } - - if (VT.isVector()) - return; - LLVM_FALLTHROUGH; } case ISD::SDIVREM: @@ -27561,58 +28016,40 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, return; } } - return; - } - case ISD::SIGN_EXTEND_VECTOR_INREG: { - if (ExperimentalVectorWideningLegalization) - return; - - EVT VT = N->getValueType(0); - SDValue In = N->getOperand(0); - EVT InVT = In.getValueType(); - if (!Subtarget.hasSSE41() && VT == MVT::v4i64 && - (InVT == MVT::v16i16 || InVT == MVT::v32i8)) { - // Custom split this so we can extend i8/i16->i32 invec. This is better - // since sign_extend_inreg i8/i16->i64 requires an extend to i32 using - // sra. Then extending from i32 to i64 using pcmpgt. By custom splitting - // we allow the sra from the extend to i32 to be shared by the split. - EVT ExtractVT = EVT::getVectorVT(*DAG.getContext(), - InVT.getVectorElementType(), - InVT.getVectorNumElements() / 2); - MVT ExtendVT = MVT::getVectorVT(MVT::i32, - VT.getVectorNumElements()); - In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ExtractVT, - In, DAG.getIntPtrConstant(0, dl)); - In = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, MVT::v4i32, In); - - // Fill a vector with sign bits for each element. - SDValue Zero = DAG.getConstant(0, dl, ExtendVT); - SDValue SignBits = DAG.getSetCC(dl, ExtendVT, Zero, In, ISD::SETGT); - - EVT LoVT, HiVT; - std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0)); - - // Create an unpackl and unpackh to interleave the sign bits then bitcast - // to vXi64. - SDValue Lo = getUnpackl(DAG, dl, ExtendVT, In, SignBits); - Lo = DAG.getNode(ISD::BITCAST, dl, LoVT, Lo); - SDValue Hi = getUnpackh(DAG, dl, ExtendVT, In, SignBits); - Hi = DAG.getNode(ISD::BITCAST, dl, HiVT, Hi); + if (Subtarget.hasVLX() && InVT == MVT::v8i64 && VT == MVT::v8i8 && + getTypeAction(*DAG.getContext(), InVT) == TypeSplitVector && + isTypeLegal(MVT::v4i64)) { + // Input needs to be split and output needs to widened. Let's use two + // VTRUNCs, and shuffle their results together into the wider type. + SDValue Lo, Hi; + std::tie(Lo, Hi) = DAG.SplitVector(In, dl); - SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi); + Lo = DAG.getNode(X86ISD::VTRUNC, dl, MVT::v16i8, Lo); + Hi = DAG.getNode(X86ISD::VTRUNC, dl, MVT::v16i8, Hi); + SDValue Res = DAG.getVectorShuffle(MVT::v16i8, dl, Lo, Hi, + { 0, 1, 2, 3, 16, 17, 18, 19, + -1, -1, -1, -1, -1, -1, -1, -1 }); Results.push_back(Res); return; } + return; } + case ISD::ANY_EXTEND: + // Right now, only MVT::v8i8 has Custom action for an illegal type. + // It's intended to custom handle the input type. + assert(N->getValueType(0) == MVT::v8i8 && + "Do not know how to legalize this Node"); + return; case ISD::SIGN_EXTEND: case ISD::ZERO_EXTEND: { EVT VT = N->getValueType(0); SDValue In = N->getOperand(0); EVT InVT = In.getValueType(); if (!Subtarget.hasSSE41() && VT == MVT::v4i64 && - (InVT == MVT::v4i16 || InVT == MVT::v4i8) && - getTypeAction(*DAG.getContext(), InVT) == TypeWidenVector) { + (InVT == MVT::v4i16 || InVT == MVT::v4i8)){ + assert(getTypeAction(*DAG.getContext(), InVT) == TypeWidenVector && + "Unexpected type action!"); assert(N->getOpcode() == ISD::SIGN_EXTEND && "Unexpected opcode"); // Custom split this so we can extend i8/i16->i32 invec. 
This is better // since sign_extend_inreg i8/i16->i64 requires an extend to i32 using @@ -27683,27 +28120,9 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, SDValue Src = N->getOperand(0); EVT SrcVT = Src.getValueType(); - // Promote these manually to avoid over promotion to v2i64. Type - // legalization will revisit the v2i32 operation for more cleanup. - if ((VT == MVT::v2i8 || VT == MVT::v2i16) && - getTypeAction(*DAG.getContext(), VT) == TypePromoteInteger) { - // AVX512DQ provides instructions that produce a v2i64 result. - if (Subtarget.hasDQI()) - return; - - SDValue Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v2i32, Src); - Res = DAG.getNode(N->getOpcode() == ISD::FP_TO_UINT ? ISD::AssertZext - : ISD::AssertSext, - dl, MVT::v2i32, Res, - DAG.getValueType(VT.getVectorElementType())); - Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res); - Results.push_back(Res); - return; - } - if (VT.isVector() && VT.getScalarSizeInBits() < 32) { - if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector) - return; + assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector && + "Unexpected type action!"); // Try to create a 128 bit vector, but don't exceed a 32 bit element. unsigned NewEltWidth = std::min(128 / VT.getVectorNumElements(), 32U); @@ -27738,35 +28157,18 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, assert((IsSigned || Subtarget.hasAVX512()) && "Can only handle signed conversion without AVX512"); assert(Subtarget.hasSSE2() && "Requires at least SSE2!"); - bool Widenv2i32 = - getTypeAction(*DAG.getContext(), MVT::v2i32) == TypeWidenVector; + assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector && + "Unexpected type action!"); if (Src.getValueType() == MVT::v2f64) { - unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI; if (!IsSigned && !Subtarget.hasVLX()) { - // If v2i32 is widened, we can defer to the generic legalizer. - if (Widenv2i32) - return; - // Custom widen by doubling to a legal vector with. Isel will - // further widen to v8f64. - Opc = ISD::FP_TO_UINT; - Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f64, - Src, DAG.getUNDEF(MVT::v2f64)); + // If we have VLX we can emit a target specific FP_TO_UINT node, + // otherwise we can defer to the generic legalizer which will widen + // the input as well. This will be further widened during op + // legalization to v8i32<-v8f64. + return; } + unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI; SDValue Res = DAG.getNode(Opc, dl, MVT::v4i32, Src); - if (!Widenv2i32) - Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res, - DAG.getIntPtrConstant(0, dl)); - Results.push_back(Res); - return; - } - if (SrcVT == MVT::v2f32 && - getTypeAction(*DAG.getContext(), VT) != TypeWidenVector) { - SDValue Idx = DAG.getIntPtrConstant(0, dl); - SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src, - DAG.getUNDEF(MVT::v2f32)); - Res = DAG.getNode(IsSigned ? 
ISD::FP_TO_SINT - : ISD::FP_TO_UINT, dl, MVT::v4i32, Res); - Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res, Idx); Results.push_back(Res); return; } @@ -27776,6 +28178,8 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, return; } + assert(!VT.isVector() && "Vectors should have been handled above!"); + if (Subtarget.hasDQI() && VT == MVT::i64 && (SrcVT == MVT::f32 || SrcVT == MVT::f64)) { assert(!Subtarget.is64Bit() && "i64 should be legal"); @@ -27847,7 +28251,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, return; } case ISD::INTRINSIC_W_CHAIN: { - unsigned IntNo = cast(N->getOperand(1))->getZExtValue(); + unsigned IntNo = N->getConstantOperandVal(1); switch (IntNo) { default : llvm_unreachable("Do not know how to custom type " "legalize this intrinsic operation!"); @@ -27905,7 +28309,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, const X86RegisterInfo *TRI = Subtarget.getRegisterInfo(); SDValue Result; SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); - unsigned BasePtr = TRI->getBaseRegister(); + Register BasePtr = TRI->getBaseRegister(); MachineMemOperand *MMO = cast(N)->getMemOperand(); if (TRI->hasBasePointer(DAG.getMachineFunction()) && (BasePtr == X86::RBX || BasePtr == X86::EBX)) { @@ -28060,34 +28464,33 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, return; } - if (SrcVT != MVT::f64 || - (DstVT != MVT::v2i32 && DstVT != MVT::v4i16 && DstVT != MVT::v8i8) || - getTypeAction(*DAG.getContext(), DstVT) == TypeWidenVector) + if (DstVT.isVector() && SrcVT == MVT::x86mmx) { + assert(getTypeAction(*DAG.getContext(), DstVT) == TypeWidenVector && + "Unexpected type action!"); + EVT WideVT = getTypeToTransformTo(*DAG.getContext(), DstVT); + SDValue Res = DAG.getNode(X86ISD::MOVQ2DQ, dl, WideVT, N->getOperand(0)); + Results.push_back(Res); return; + } - unsigned NumElts = DstVT.getVectorNumElements(); - EVT SVT = DstVT.getVectorElementType(); - EVT WiderVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2); - SDValue Res; - Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, N->getOperand(0)); - Res = DAG.getBitcast(WiderVT, Res); - Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DstVT, Res, - DAG.getIntPtrConstant(0, dl)); - Results.push_back(Res); return; } case ISD::MGATHER: { EVT VT = N->getValueType(0); - if (VT == MVT::v2f32 && (Subtarget.hasVLX() || !Subtarget.hasAVX512())) { + if ((VT == MVT::v2f32 || VT == MVT::v2i32) && + (Subtarget.hasVLX() || !Subtarget.hasAVX512())) { auto *Gather = cast(N); SDValue Index = Gather->getIndex(); if (Index.getValueType() != MVT::v2i64) return; + assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector && + "Unexpected type action!"); + EVT WideVT = getTypeToTransformTo(*DAG.getContext(), VT); SDValue Mask = Gather->getMask(); assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type"); - SDValue PassThru = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, + SDValue PassThru = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, Gather->getPassThru(), - DAG.getUNDEF(MVT::v2f32)); + DAG.getUNDEF(VT)); if (!Subtarget.hasVLX()) { // We need to widen the mask, but the instruction will only use 2 // of its elements. So we can use undef. 
@@ -28098,66 +28501,12 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, SDValue Ops[] = { Gather->getChain(), PassThru, Mask, Gather->getBasePtr(), Index, Gather->getScale() }; SDValue Res = DAG.getTargetMemSDNode( - DAG.getVTList(MVT::v4f32, Mask.getValueType(), MVT::Other), Ops, dl, + DAG.getVTList(WideVT, Mask.getValueType(), MVT::Other), Ops, dl, Gather->getMemoryVT(), Gather->getMemOperand()); Results.push_back(Res); Results.push_back(Res.getValue(2)); return; } - if (VT == MVT::v2i32) { - auto *Gather = cast(N); - SDValue Index = Gather->getIndex(); - SDValue Mask = Gather->getMask(); - assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type"); - SDValue PassThru = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, - Gather->getPassThru(), - DAG.getUNDEF(MVT::v2i32)); - // If the index is v2i64 we can use it directly. - if (Index.getValueType() == MVT::v2i64 && - (Subtarget.hasVLX() || !Subtarget.hasAVX512())) { - if (!Subtarget.hasVLX()) { - // We need to widen the mask, but the instruction will only use 2 - // of its elements. So we can use undef. - Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask, - DAG.getUNDEF(MVT::v2i1)); - Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Mask); - } - SDValue Ops[] = { Gather->getChain(), PassThru, Mask, - Gather->getBasePtr(), Index, Gather->getScale() }; - SDValue Res = DAG.getTargetMemSDNode( - DAG.getVTList(MVT::v4i32, Mask.getValueType(), MVT::Other), Ops, dl, - Gather->getMemoryVT(), Gather->getMemOperand()); - SDValue Chain = Res.getValue(2); - if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector) - Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res, - DAG.getIntPtrConstant(0, dl)); - Results.push_back(Res); - Results.push_back(Chain); - return; - } - if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector) { - EVT IndexVT = Index.getValueType(); - EVT NewIndexVT = EVT::getVectorVT(*DAG.getContext(), - IndexVT.getScalarType(), 4); - // Otherwise we need to custom widen everything to avoid promotion. - Index = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewIndexVT, Index, - DAG.getUNDEF(IndexVT)); - Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask, - DAG.getConstant(0, dl, MVT::v2i1)); - SDValue Ops[] = { Gather->getChain(), PassThru, Mask, - Gather->getBasePtr(), Index, Gather->getScale() }; - SDValue Res = DAG.getMaskedGather(DAG.getVTList(MVT::v4i32, MVT::Other), - Gather->getMemoryVT(), dl, Ops, - Gather->getMemOperand()); - SDValue Chain = Res.getValue(1); - if (getTypeAction(*DAG.getContext(), MVT::v2i32) != TypeWidenVector) - Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res, - DAG.getIntPtrConstant(0, dl)); - Results.push_back(Res); - Results.push_back(Chain); - return; - } - } return; } case ISD::LOAD: { @@ -28166,8 +28515,8 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, // cast since type legalization will try to use an i64 load. 
MVT VT = N->getSimpleValueType(0); assert(VT.isVector() && VT.getSizeInBits() == 64 && "Unexpected VT"); - if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector) - return; + assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector && + "Unexpected type action!"); if (!ISD::isNON_EXTLoad(N)) return; auto *Ld = cast(N); @@ -28177,11 +28526,10 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, Ld->getPointerInfo(), Ld->getAlignment(), Ld->getMemOperand()->getFlags()); SDValue Chain = Res.getValue(1); - MVT WideVT = MVT::getVectorVT(LdVT, 2); - Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, WideVT, Res); - MVT CastVT = MVT::getVectorVT(VT.getVectorElementType(), - VT.getVectorNumElements() * 2); - Res = DAG.getBitcast(CastVT, Res); + MVT VecVT = MVT::getVectorVT(LdVT, 2); + Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Res); + EVT WideVT = getTypeToTransformTo(*DAG.getContext(), VT); + Res = DAG.getBitcast(WideVT, Res); Results.push_back(Res); Results.push_back(Chain); return; @@ -28236,6 +28584,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::GlobalBaseReg: return "X86ISD::GlobalBaseReg"; case X86ISD::Wrapper: return "X86ISD::Wrapper"; case X86ISD::WrapperRIP: return "X86ISD::WrapperRIP"; + case X86ISD::MOVQ2DQ: return "X86ISD::MOVQ2DQ"; case X86ISD::MOVDQ2Q: return "X86ISD::MOVDQ2Q"; case X86ISD::MMX_MOVD2W: return "X86ISD::MMX_MOVD2W"; case X86ISD::MMX_MOVW2D: return "X86ISD::MMX_MOVW2D"; @@ -28373,6 +28722,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::UNPCKL: return "X86ISD::UNPCKL"; case X86ISD::UNPCKH: return "X86ISD::UNPCKH"; case X86ISD::VBROADCAST: return "X86ISD::VBROADCAST"; + case X86ISD::VBROADCAST_LOAD: return "X86ISD::VBROADCAST_LOAD"; case X86ISD::VBROADCASTM: return "X86ISD::VBROADCASTM"; case X86ISD::SUBV_BROADCAST: return "X86ISD::SUBV_BROADCAST"; case X86ISD::VPERMILPV: return "X86ISD::VPERMILPV"; @@ -28737,6 +29087,9 @@ bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const { } bool X86TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const { + if (isa(ExtVal.getOperand(0))) + return false; + EVT SrcVT = ExtVal.getOperand(0).getValueType(); // There is no extending load for vXi1. @@ -28856,10 +29209,10 @@ static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB, sinkMBB->transferSuccessorsAndUpdatePHIs(MBB); MachineRegisterInfo &MRI = MF->getRegInfo(); - unsigned DstReg = MI.getOperand(0).getReg(); + Register DstReg = MI.getOperand(0).getReg(); const TargetRegisterClass *RC = MRI.getRegClass(DstReg); - unsigned mainDstReg = MRI.createVirtualRegister(RC); - unsigned fallDstReg = MRI.createVirtualRegister(RC); + Register mainDstReg = MRI.createVirtualRegister(RC); + Register fallDstReg = MRI.createVirtualRegister(RC); // thisMBB: // xbegin fallMBB @@ -28913,7 +29266,7 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI, static_assert(X86::AddrNumOperands == 5, "VAARG_64 assumes 5 address operands"); - unsigned DestReg = MI.getOperand(0).getReg(); + Register DestReg = MI.getOperand(0).getReg(); MachineOperand &Base = MI.getOperand(1); MachineOperand &Scale = MI.getOperand(2); MachineOperand &Index = MI.getOperand(3); @@ -29049,7 +29402,7 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI, assert(OffsetReg != 0); // Read the reg_save_area address. 
- unsigned RegSaveReg = MRI.createVirtualRegister(AddrRegClass); + Register RegSaveReg = MRI.createVirtualRegister(AddrRegClass); BuildMI(offsetMBB, DL, TII->get(X86::MOV64rm), RegSaveReg) .add(Base) .add(Scale) @@ -29059,8 +29412,8 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI, .setMemRefs(LoadOnlyMMO); // Zero-extend the offset - unsigned OffsetReg64 = MRI.createVirtualRegister(AddrRegClass); - BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64) + Register OffsetReg64 = MRI.createVirtualRegister(AddrRegClass); + BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64) .addImm(0) .addReg(OffsetReg) .addImm(X86::sub_32bit); @@ -29071,7 +29424,7 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI, .addReg(RegSaveReg); // Compute the offset for the next argument - unsigned NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass); + Register NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass); BuildMI(offsetMBB, DL, TII->get(X86::ADD32ri), NextOffsetReg) .addReg(OffsetReg) .addImm(UseFPOffset ? 16 : 8); @@ -29096,7 +29449,7 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI, // // Load the overflow_area address into a register. - unsigned OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass); + Register OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass); BuildMI(overflowMBB, DL, TII->get(X86::MOV64rm), OverflowAddrReg) .add(Base) .add(Scale) @@ -29110,7 +29463,7 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI, if (NeedsAlign) { // Align the overflow address assert(isPowerOf2_32(Align) && "Alignment must be a power of 2"); - unsigned TmpReg = MRI.createVirtualRegister(AddrRegClass); + Register TmpReg = MRI.createVirtualRegister(AddrRegClass); // aligned_addr = (addr + (align-1)) & ~(align-1) BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), TmpReg) @@ -29127,7 +29480,7 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI, // Compute the next overflow address after this argument. 
// (the overflow address should be kept 8-byte aligned) - unsigned NextAddrReg = MRI.createVirtualRegister(AddrRegClass); + Register NextAddrReg = MRI.createVirtualRegister(AddrRegClass); BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), NextAddrReg) .addReg(OverflowDestReg) .addImm(ArgSizeA8); @@ -29191,7 +29544,7 @@ MachineBasicBlock *X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter( const TargetInstrInfo *TII = Subtarget.getInstrInfo(); DebugLoc DL = MI.getDebugLoc(); - unsigned CountReg = MI.getOperand(0).getReg(); + Register CountReg = MI.getOperand(0).getReg(); int64_t RegSaveFrameIndex = MI.getOperand(1).getImm(); int64_t VarArgsFPOffset = MI.getOperand(2).getImm(); @@ -29273,7 +29626,9 @@ static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr, static bool isCMOVPseudo(MachineInstr &MI) { switch (MI.getOpcode()) { case X86::CMOV_FR32: + case X86::CMOV_FR32X: case X86::CMOV_FR64: + case X86::CMOV_FR64X: case X86::CMOV_GR8: case X86::CMOV_GR16: case X86::CMOV_GR32: @@ -29326,9 +29681,9 @@ static MachineInstrBuilder createPHIsForCMOVsInSinkBB( MachineInstrBuilder MIB; for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) { - unsigned DestReg = MIIt->getOperand(0).getReg(); - unsigned Op1Reg = MIIt->getOperand(1).getReg(); - unsigned Op2Reg = MIIt->getOperand(2).getReg(); + Register DestReg = MIIt->getOperand(0).getReg(); + Register Op1Reg = MIIt->getOperand(1).getReg(); + Register Op2Reg = MIIt->getOperand(2).getReg(); // If this CMOV we are generating is the opposite condition from // the jump we generated, then we have to swap the operands for the @@ -29486,9 +29841,9 @@ X86TargetLowering::EmitLoweredCascadedSelect(MachineInstr &FirstCMOV, // SinkMBB: // %Result = phi [ %FalseValue, SecondInsertedMBB ], [ %TrueValue, ThisMBB ] - unsigned DestReg = FirstCMOV.getOperand(0).getReg(); - unsigned Op1Reg = FirstCMOV.getOperand(1).getReg(); - unsigned Op2Reg = FirstCMOV.getOperand(2).getReg(); + Register DestReg = FirstCMOV.getOperand(0).getReg(); + Register Op1Reg = FirstCMOV.getOperand(1).getReg(); + Register Op2Reg = FirstCMOV.getOperand(2).getReg(); MachineInstrBuilder MIB = BuildMI(*SinkMBB, SinkMBB->begin(), DL, TII->get(X86::PHI), DestReg) .addReg(Op1Reg) @@ -30006,7 +30361,7 @@ X86TargetLowering::EmitLoweredRetpoline(MachineInstr &MI, // call the retpoline thunk. DebugLoc DL = MI.getDebugLoc(); const X86InstrInfo *TII = Subtarget.getInstrInfo(); - unsigned CalleeVReg = MI.getOperand(0).getReg(); + Register CalleeVReg = MI.getOperand(0).getReg(); unsigned Opc = getOpcodeForRetpoline(MI.getOpcode()); // Find an available scratch register to hold the callee. On 64-bit, we can @@ -30079,7 +30434,7 @@ void X86TargetLowering::emitSetJmpShadowStackFix(MachineInstr &MI, // Initialize a register with zero. MVT PVT = getPointerTy(MF->getDataLayout()); const TargetRegisterClass *PtrRC = getRegClassFor(PVT); - unsigned ZReg = MRI.createVirtualRegister(PtrRC); + Register ZReg = MRI.createVirtualRegister(PtrRC); unsigned XorRROpc = (PVT == MVT::i64) ? X86::XOR64rr : X86::XOR32rr; BuildMI(*MBB, MI, DL, TII->get(XorRROpc)) .addDef(ZReg) @@ -30087,7 +30442,7 @@ void X86TargetLowering::emitSetJmpShadowStackFix(MachineInstr &MI, .addReg(ZReg, RegState::Undef); // Read the current SSP Register value to the zeroed register. - unsigned SSPCopyReg = MRI.createVirtualRegister(PtrRC); + Register SSPCopyReg = MRI.createVirtualRegister(PtrRC); unsigned RdsspOpc = (PVT == MVT::i64) ? 
X86::RDSSPQ : X86::RDSSPD; BuildMI(*MBB, MI, DL, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg); @@ -30131,8 +30486,8 @@ X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI, const TargetRegisterClass *RC = MRI.getRegClass(DstReg); assert(TRI->isTypeLegalForClass(*RC, MVT::i32) && "Invalid destination!"); (void)TRI; - unsigned mainDstReg = MRI.createVirtualRegister(RC); - unsigned restoreDstReg = MRI.createVirtualRegister(RC); + Register mainDstReg = MRI.createVirtualRegister(RC); + Register restoreDstReg = MRI.createVirtualRegister(RC); MemOpndSlot = CurOp; @@ -30246,8 +30601,8 @@ X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI, Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64(); X86MachineFunctionInfo *X86FI = MF->getInfo(); X86FI->setRestoreBasePointer(MF); - unsigned FramePtr = RegInfo->getFrameRegister(*MF); - unsigned BasePtr = RegInfo->getBaseRegister(); + Register FramePtr = RegInfo->getFrameRegister(*MF); + Register BasePtr = RegInfo->getBaseRegister(); unsigned Opm = Uses64BitFramePtr ? X86::MOV64rm : X86::MOV32rm; addRegOffset(BuildMI(restoreMBB, DL, TII->get(Opm), BasePtr), FramePtr, true, X86FI->getRestoreBasePointerOffset()) @@ -30329,7 +30684,7 @@ X86TargetLowering::emitLongJmpShadowStackFix(MachineInstr &MI, MBB->addSuccessor(checkSspMBB); // Initialize a register with zero. - unsigned ZReg = MRI.createVirtualRegister(PtrRC); + Register ZReg = MRI.createVirtualRegister(PtrRC); unsigned XorRROpc = (PVT == MVT::i64) ? X86::XOR64rr : X86::XOR32rr; BuildMI(checkSspMBB, DL, TII->get(XorRROpc)) .addDef(ZReg) @@ -30337,7 +30692,7 @@ X86TargetLowering::emitLongJmpShadowStackFix(MachineInstr &MI, .addReg(ZReg, RegState::Undef); // Read the current SSP Register value to the zeroed register. - unsigned SSPCopyReg = MRI.createVirtualRegister(PtrRC); + Register SSPCopyReg = MRI.createVirtualRegister(PtrRC); unsigned RdsspOpc = (PVT == MVT::i64) ? X86::RDSSPQ : X86::RDSSPD; BuildMI(checkSspMBB, DL, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg); @@ -30352,7 +30707,7 @@ X86TargetLowering::emitLongJmpShadowStackFix(MachineInstr &MI, checkSspMBB->addSuccessor(fallMBB); // Reload the previously saved SSP register value. - unsigned PrevSSPReg = MRI.createVirtualRegister(PtrRC); + Register PrevSSPReg = MRI.createVirtualRegister(PtrRC); unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm; const int64_t SPPOffset = 3 * PVT.getStoreSize(); MachineInstrBuilder MIB = @@ -30370,7 +30725,7 @@ X86TargetLowering::emitLongJmpShadowStackFix(MachineInstr &MI, MIB.setMemRefs(MMOs); // Subtract the current SSP from the previous SSP. - unsigned SspSubReg = MRI.createVirtualRegister(PtrRC); + Register SspSubReg = MRI.createVirtualRegister(PtrRC); unsigned SubRROpc = (PVT == MVT::i64) ? X86::SUB64rr : X86::SUB32rr; BuildMI(fallMBB, DL, TII->get(SubRROpc), SspSubReg) .addReg(PrevSSPReg) @@ -30384,7 +30739,7 @@ X86TargetLowering::emitLongJmpShadowStackFix(MachineInstr &MI, // Shift right by 2/3 for 32/64 because incssp multiplies the argument by 4/8. unsigned ShrRIOpc = (PVT == MVT::i64) ? X86::SHR64ri : X86::SHR32ri; unsigned Offset = (PVT == MVT::i64) ? 3 : 2; - unsigned SspFirstShrReg = MRI.createVirtualRegister(PtrRC); + Register SspFirstShrReg = MRI.createVirtualRegister(PtrRC); BuildMI(fixShadowMBB, DL, TII->get(ShrRIOpc), SspFirstShrReg) .addReg(SspSubReg) .addImm(Offset); @@ -30394,7 +30749,7 @@ X86TargetLowering::emitLongJmpShadowStackFix(MachineInstr &MI, BuildMI(fixShadowMBB, DL, TII->get(IncsspOpc)).addReg(SspFirstShrReg); // Reset the lower 8 bits. 
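The arithmetic in the shadow-stack fix above converts the byte difference between the saved and current SSP into an INCSSP argument. A minimal standalone model of that step, using the 4-/8-byte slot size stated in the comment:

#include <cstdint>

// PrevSSP - CurSSP is a byte count; each shadow-stack slot is 4 bytes (32-bit)
// or 8 bytes (64-bit), hence the shift by 2 or 3 before feeding INCSSP.
uint64_t shadowStackSlotsToPop(uint64_t PrevSSP, uint64_t CurSSP, bool Is64Bit) {
  uint64_t ByteDiff = PrevSSP - CurSSP;  // SspSubReg in the hunk
  return ByteDiff >> (Is64Bit ? 3 : 2);  // SspFirstShrReg in the hunk
}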
- unsigned SspSecondShrReg = MRI.createVirtualRegister(PtrRC); + Register SspSecondShrReg = MRI.createVirtualRegister(PtrRC); BuildMI(fixShadowMBB, DL, TII->get(ShrRIOpc), SspSecondShrReg) .addReg(SspFirstShrReg) .addImm(8); @@ -30406,12 +30761,12 @@ X86TargetLowering::emitLongJmpShadowStackFix(MachineInstr &MI, // Do a single shift left. unsigned ShlR1Opc = (PVT == MVT::i64) ? X86::SHL64r1 : X86::SHL32r1; - unsigned SspAfterShlReg = MRI.createVirtualRegister(PtrRC); + Register SspAfterShlReg = MRI.createVirtualRegister(PtrRC); BuildMI(fixShadowLoopPrepareMBB, DL, TII->get(ShlR1Opc), SspAfterShlReg) .addReg(SspSecondShrReg); // Save the value 128 to a register (will be used next with incssp). - unsigned Value128InReg = MRI.createVirtualRegister(PtrRC); + Register Value128InReg = MRI.createVirtualRegister(PtrRC); unsigned MovRIOpc = (PVT == MVT::i64) ? X86::MOV64ri32 : X86::MOV32ri; BuildMI(fixShadowLoopPrepareMBB, DL, TII->get(MovRIOpc), Value128InReg) .addImm(128); @@ -30419,8 +30774,8 @@ X86TargetLowering::emitLongJmpShadowStackFix(MachineInstr &MI, // Since incssp only looks at the lower 8 bits, we might need to do several // iterations of incssp until we finish fixing the shadow stack. - unsigned DecReg = MRI.createVirtualRegister(PtrRC); - unsigned CounterReg = MRI.createVirtualRegister(PtrRC); + Register DecReg = MRI.createVirtualRegister(PtrRC); + Register CounterReg = MRI.createVirtualRegister(PtrRC); BuildMI(fixShadowLoopMBB, DL, TII->get(X86::PHI), CounterReg) .addReg(SspAfterShlReg) .addMBB(fixShadowLoopPrepareMBB) @@ -30460,11 +30815,11 @@ X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI, const TargetRegisterClass *RC = (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass; - unsigned Tmp = MRI.createVirtualRegister(RC); + Register Tmp = MRI.createVirtualRegister(RC); // Since FP is only updated here but NOT referenced, it's treated as GPR. const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); unsigned FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP; - unsigned SP = RegInfo->getStackRegister(); + Register SP = RegInfo->getStackRegister(); MachineInstrBuilder MIB; @@ -30662,8 +31017,8 @@ X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI, X86MachineFunctionInfo *MFI = MF->getInfo(); MFI->setRestoreBasePointer(MF); - unsigned FP = RI.getFrameRegister(*MF); - unsigned BP = RI.getBaseRegister(); + Register FP = RI.getFrameRegister(*MF); + Register BP = RI.getBaseRegister(); unsigned Op = FPIs64Bit ? X86::MOV64rm : X86::MOV32rm; addRegOffset(BuildMI(DispatchBB, DL, TII->get(Op), BP), FP, true, MFI->getRestoreBasePointerOffset()) @@ -30674,7 +31029,7 @@ X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI, } // IReg is used as an index in a memory operand and therefore can't be SP - unsigned IReg = MRI->createVirtualRegister(&X86::GR32_NOSPRegClass); + Register IReg = MRI->createVirtualRegister(&X86::GR32_NOSPRegClass); addFrameReference(BuildMI(DispatchBB, DL, TII->get(X86::MOV32rm), IReg), FI, Subtarget.is64Bit() ? 
8 : 4); BuildMI(DispatchBB, DL, TII->get(X86::CMP32ri)) @@ -30683,8 +31038,8 @@ X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI, BuildMI(DispatchBB, DL, TII->get(X86::JCC_1)).addMBB(TrapBB).addImm(X86::COND_AE); if (Subtarget.is64Bit()) { - unsigned BReg = MRI->createVirtualRegister(&X86::GR64RegClass); - unsigned IReg64 = MRI->createVirtualRegister(&X86::GR64_NOSPRegClass); + Register BReg = MRI->createVirtualRegister(&X86::GR64RegClass); + Register IReg64 = MRI->createVirtualRegister(&X86::GR64_NOSPRegClass); // leaq .LJTI0_0(%rip), BReg BuildMI(DispContBB, DL, TII->get(X86::LEA64r), BReg) @@ -30710,9 +31065,9 @@ X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI, .addReg(0); break; case MachineJumpTableInfo::EK_LabelDifference32: { - unsigned OReg = MRI->createVirtualRegister(&X86::GR32RegClass); - unsigned OReg64 = MRI->createVirtualRegister(&X86::GR64RegClass); - unsigned TReg = MRI->createVirtualRegister(&X86::GR64RegClass); + Register OReg = MRI->createVirtualRegister(&X86::GR32RegClass); + Register OReg64 = MRI->createVirtualRegister(&X86::GR64RegClass); + Register TReg = MRI->createVirtualRegister(&X86::GR64RegClass); // movl (BReg,IReg64,4), OReg BuildMI(DispContBB, DL, TII->get(X86::MOV32rm), OReg) @@ -30783,8 +31138,8 @@ X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI, DefRegs[MOp.getReg()] = true; MachineInstrBuilder MIB(*MF, &II); - for (unsigned RI = 0; SavedRegs[RI]; ++RI) { - unsigned Reg = SavedRegs[RI]; + for (unsigned RegIdx = 0; SavedRegs[RegIdx]; ++RegIdx) { + unsigned Reg = SavedRegs[RegIdx]; if (!DefRegs[Reg]) MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead); } @@ -30906,20 +31261,18 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, TII->get(X86::FNSTCW16m)), OrigCWFrameIdx); // Load the old value of the control word... - unsigned OldCW = - MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass); + Register OldCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass); addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOVZX32rm16), OldCW), OrigCWFrameIdx); // OR 0b11 into bit 10 and 11. 0b11 is the encoding for round toward zero. - unsigned NewCW = - MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass); + Register NewCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass); BuildMI(*BB, MI, DL, TII->get(X86::OR32ri), NewCW) .addReg(OldCW, RegState::Kill).addImm(0xC00); // Extract to 16 bits. - unsigned NewCW16 = - MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass); + Register NewCW16 = + MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass); BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), NewCW16) .addReg(NewCW, RegState::Kill, X86::sub_16bit); @@ -31023,7 +31376,7 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, MachineRegisterInfo &MRI = MF->getRegInfo(); MVT SPTy = getPointerTy(MF->getDataLayout()); const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy); - unsigned computedAddrVReg = MRI.createVirtualRegister(AddrRegClass); + Register computedAddrVReg = MRI.createVirtualRegister(AddrRegClass); X86AddressMode AM = getAddressFromInstr(&MI, 0); // Regalloc does not need any help when the memory operand of CMPXCHG8B @@ -31034,10 +31387,14 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, // After X86TargetLowering::ReplaceNodeResults CMPXCHG8B is glued to its // four operand definitions that are E[ABCD] registers. We skip them and // then insert the LEA. 
- MachineBasicBlock::iterator MBBI(MI); - while (MBBI->definesRegister(X86::EAX) || MBBI->definesRegister(X86::EBX) || - MBBI->definesRegister(X86::ECX) || MBBI->definesRegister(X86::EDX)) - --MBBI; + MachineBasicBlock::reverse_iterator RMBBI(MI.getReverseIterator()); + while (RMBBI != BB->rend() && (RMBBI->definesRegister(X86::EAX) || + RMBBI->definesRegister(X86::EBX) || + RMBBI->definesRegister(X86::ECX) || + RMBBI->definesRegister(X86::EDX))) { + ++RMBBI; + } + MachineBasicBlock::iterator MBBI(RMBBI); addFullAddress( BuildMI(*BB, *MBBI, DL, TII->get(X86::LEA32r), computedAddrVReg), AM); @@ -31232,12 +31589,21 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op, Known.One |= Known2.One; break; } + case X86ISD::PSADBW: { + assert(VT.getScalarType() == MVT::i64 && + Op.getOperand(0).getValueType().getScalarType() == MVT::i8 && + "Unexpected PSADBW types"); + + // PSADBW - fills low 16 bits and zeros upper 48 bits of each i64 result. + Known.Zero.setBitsFrom(16); + break; + } case X86ISD::CMOV: { - Known = DAG.computeKnownBits(Op.getOperand(1), Depth+1); + Known = DAG.computeKnownBits(Op.getOperand(1), Depth + 1); // If we don't know any bits, early out. if (Known.isUnknown()) break; - KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(0), Depth+1); + KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(0), Depth + 1); // Only known if known in both the LHS and RHS. Known.One &= Known2.One; @@ -31650,8 +32016,8 @@ static bool matchUnaryPermuteShuffle(MVT MaskVT, ArrayRef Mask, if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits == 16) { SmallVector RepeatedMask; if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) { - ArrayRef LoMask(Mask.data() + 0, 4); - ArrayRef HiMask(Mask.data() + 4, 4); + ArrayRef LoMask(RepeatedMask.data() + 0, 4); + ArrayRef HiMask(RepeatedMask.data() + 4, 4); // PSHUFLW: permute lower 4 elements only. if (isUndefOrInRange(LoMask, 0, 4) && @@ -31789,8 +32155,8 @@ static bool matchBinaryPermuteShuffle( uint64_t BlendMask = 0; bool ForceV1Zero = false, ForceV2Zero = false; SmallVector TargetMask(Mask.begin(), Mask.end()); - if (matchVectorShuffleAsBlend(V1, V2, TargetMask, ForceV1Zero, ForceV2Zero, - BlendMask)) { + if (matchVectorShuffleAsBlend(V1, V2, TargetMask, Zeroable, ForceV1Zero, + ForceV2Zero, BlendMask)) { if (MaskVT == MVT::v16i16) { // We can only use v16i16 PBLENDW if the lanes are repeated. SmallVector RepeatedMask; @@ -31819,15 +32185,15 @@ static bool matchBinaryPermuteShuffle( } } - // Attempt to combine to INSERTPS. + // Attempt to combine to INSERTPS, but only if it has elements that need to + // be set to zero. if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() && - MaskVT.is128BitVector()) { - if (Zeroable.getBoolValue() && - matchShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) { - Shuffle = X86ISD::INSERTPS; - ShuffleVT = MVT::v4f32; - return true; - } + MaskVT.is128BitVector() && + llvm::any_of(Mask, [](int M) { return M == SM_SentinelZero; }) && + matchShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) { + Shuffle = X86ISD::INSERTPS; + ShuffleVT = MVT::v4f32; + return true; } // Attempt to combine to SHUFPD. 
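// An illustrative aside on the PSADBW known-bits case added above: each
// 64-bit lane of a PSADBW result is the sum of eight absolute byte
// differences, so it can never exceed 8 * 255 = 2040 and always fits in the
// low 16 bits, which is what Known.Zero.setBitsFrom(16) records. A minimal
// standalone sketch of that bound (plain C++, independent of LLVM's
// KnownBits machinery; the helper name is made up for illustration):
#include <cassert>
#include <cstdint>
#include <cstdlib>

static uint64_t psadbwLane(const uint8_t A[8], const uint8_t B[8]) {
  // Reference model of one 64-bit lane of (V)PSADBW: sum of absolute
  // differences of eight unsigned bytes.
  uint64_t Sum = 0;
  for (int I = 0; I < 8; ++I)
    Sum += static_cast<uint64_t>(std::abs(int(A[I]) - int(B[I])));
  // The sum is bounded by 8 * 255, so bits [16, 63] of the lane are zero.
  assert(Sum <= 8 * 255 && "PSADBW lane result exceeds 16 bits");
  return Sum;
}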
@@ -31835,7 +32201,11 @@ static bool matchBinaryPermuteShuffle( ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) || (MaskVT.is256BitVector() && Subtarget.hasAVX()) || (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) { - if (matchShuffleWithSHUFPD(MaskVT, V1, V2, PermuteImm, Mask)) { + bool ForceV1Zero = false, ForceV2Zero = false; + if (matchShuffleWithSHUFPD(MaskVT, V1, V2, ForceV1Zero, ForceV2Zero, + PermuteImm, Mask, Zeroable)) { + V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1; + V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2; Shuffle = X86ISD::SHUFP; ShuffleVT = MVT::getVectorVT(MVT::f64, MaskVT.getSizeInBits() / 64); return true; @@ -31889,6 +32259,15 @@ static bool matchBinaryPermuteShuffle( } } + // Attempt to combine to INSERTPS more generally if X86ISD::SHUFP failed. + if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() && + MaskVT.is128BitVector() && + matchShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) { + Shuffle = X86ISD::INSERTPS; + ShuffleVT = MVT::v4f32; + return true; + } + return false; } @@ -31942,7 +32321,7 @@ static SDValue combineX86ShuffleChain(ArrayRef Inputs, SDValue Root, unsigned NumRootElts = RootVT.getVectorNumElements(); unsigned BaseMaskEltSizeInBits = RootSizeInBits / NumBaseMaskElts; bool FloatDomain = VT1.isFloatingPoint() || VT2.isFloatingPoint() || - (RootVT.isFloatingPoint() && Depth >= 2) || + (RootVT.isFloatingPoint() && Depth >= 1) || (RootVT.is256BitVector() && !Subtarget.hasAVX2()); // Don't combine if we are a AVX512/EVEX target and the mask element size @@ -31981,7 +32360,7 @@ static SDValue combineX86ShuffleChain(ArrayRef Inputs, SDValue Root, if (UnaryShuffle && RootVT.is256BitVector() && NumBaseMaskElts == 2 && !(Subtarget.hasAVX2() && BaseMask[0] >= -1 && BaseMask[1] >= -1) && !isSequentialOrUndefOrZeroInRange(BaseMask, 0, 2, 0)) { - if (Depth == 1 && Root.getOpcode() == X86ISD::VPERM2X128) + if (Depth == 0 && Root.getOpcode() == X86ISD::VPERM2X128) return SDValue(); // Nothing to do! MVT ShuffleVT = (FloatDomain ? MVT::v4f64 : MVT::v4i64); unsigned PermMask = 0; @@ -31991,7 +32370,7 @@ static SDValue combineX86ShuffleChain(ArrayRef Inputs, SDValue Root, Res = DAG.getBitcast(ShuffleVT, V1); Res = DAG.getNode(X86ISD::VPERM2X128, DL, ShuffleVT, Res, DAG.getUNDEF(ShuffleVT), - DAG.getConstant(PermMask, DL, MVT::i8)); + DAG.getTargetConstant(PermMask, DL, MVT::i8)); return DAG.getBitcast(RootVT, Res); } @@ -32026,8 +32405,8 @@ static SDValue combineX86ShuffleChain(ArrayRef Inputs, SDValue Root, // Which shuffle domains are permitted? // Permit domain crossing at higher combine depths. // TODO: Should we indicate which domain is preferred if both are allowed? - bool AllowFloatDomain = FloatDomain || (Depth > 3); - bool AllowIntDomain = (!FloatDomain || (Depth > 3)) && Subtarget.hasSSE2() && + bool AllowFloatDomain = FloatDomain || (Depth >= 3); + bool AllowIntDomain = (!FloatDomain || (Depth >= 3)) && Subtarget.hasSSE2() && (!MaskVT.is256BitVector() || Subtarget.hasAVX2()); // Determine zeroable mask elements. @@ -32062,14 +32441,14 @@ static SDValue combineX86ShuffleChain(ArrayRef Inputs, SDValue Root, if (V1.getValueType() == MaskVT && V1.getOpcode() == ISD::SCALAR_TO_VECTOR && MayFoldLoad(V1.getOperand(0))) { - if (Depth == 1 && Root.getOpcode() == X86ISD::VBROADCAST) + if (Depth == 0 && Root.getOpcode() == X86ISD::VBROADCAST) return SDValue(); // Nothing to do! 
Res = V1.getOperand(0); Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res); return DAG.getBitcast(RootVT, Res); } if (Subtarget.hasAVX2()) { - if (Depth == 1 && Root.getOpcode() == X86ISD::VBROADCAST) + if (Depth == 0 && Root.getOpcode() == X86ISD::VBROADCAST) return SDValue(); // Nothing to do! Res = DAG.getBitcast(MaskVT, V1); Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res); @@ -32083,7 +32462,7 @@ static SDValue combineX86ShuffleChain(ArrayRef Inputs, SDValue Root, DL, DAG, Subtarget, Shuffle, ShuffleSrcVT, ShuffleVT) && (!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) { - if (Depth == 1 && Root.getOpcode() == Shuffle) + if (Depth == 0 && Root.getOpcode() == Shuffle) return SDValue(); // Nothing to do! Res = DAG.getBitcast(ShuffleSrcVT, NewV1); Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res); @@ -32094,11 +32473,11 @@ static SDValue combineX86ShuffleChain(ArrayRef Inputs, SDValue Root, AllowIntDomain, Subtarget, Shuffle, ShuffleVT, PermuteImm) && (!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) { - if (Depth == 1 && Root.getOpcode() == Shuffle) + if (Depth == 0 && Root.getOpcode() == Shuffle) return SDValue(); // Nothing to do! Res = DAG.getBitcast(ShuffleVT, V1); Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res, - DAG.getConstant(PermuteImm, DL, MVT::i8)); + DAG.getTargetConstant(PermuteImm, DL, MVT::i8)); return DAG.getBitcast(RootVT, Res); } } @@ -32109,7 +32488,7 @@ static SDValue combineX86ShuffleChain(ArrayRef Inputs, SDValue Root, NewV2, DL, DAG, Subtarget, Shuffle, ShuffleSrcVT, ShuffleVT, UnaryShuffle) && (!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) { - if (Depth == 1 && Root.getOpcode() == Shuffle) + if (Depth == 0 && Root.getOpcode() == Shuffle) return SDValue(); // Nothing to do! NewV1 = DAG.getBitcast(ShuffleSrcVT, NewV1); NewV2 = DAG.getBitcast(ShuffleSrcVT, NewV2); @@ -32123,12 +32502,12 @@ static SDValue combineX86ShuffleChain(ArrayRef Inputs, SDValue Root, MaskVT, Mask, Zeroable, AllowFloatDomain, AllowIntDomain, NewV1, NewV2, DL, DAG, Subtarget, Shuffle, ShuffleVT, PermuteImm) && (!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) { - if (Depth == 1 && Root.getOpcode() == Shuffle) + if (Depth == 0 && Root.getOpcode() == Shuffle) return SDValue(); // Nothing to do! NewV1 = DAG.getBitcast(ShuffleVT, NewV1); NewV2 = DAG.getBitcast(ShuffleVT, NewV2); Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2, - DAG.getConstant(PermuteImm, DL, MVT::i8)); + DAG.getTargetConstant(PermuteImm, DL, MVT::i8)); return DAG.getBitcast(RootVT, Res); } @@ -32141,34 +32520,34 @@ static SDValue combineX86ShuffleChain(ArrayRef Inputs, SDValue Root, uint64_t BitLen, BitIdx; if (matchShuffleAsEXTRQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx, Zeroable)) { - if (Depth == 1 && Root.getOpcode() == X86ISD::EXTRQI) + if (Depth == 0 && Root.getOpcode() == X86ISD::EXTRQI) return SDValue(); // Nothing to do! V1 = DAG.getBitcast(IntMaskVT, V1); Res = DAG.getNode(X86ISD::EXTRQI, DL, IntMaskVT, V1, - DAG.getConstant(BitLen, DL, MVT::i8), - DAG.getConstant(BitIdx, DL, MVT::i8)); + DAG.getTargetConstant(BitLen, DL, MVT::i8), + DAG.getTargetConstant(BitIdx, DL, MVT::i8)); return DAG.getBitcast(RootVT, Res); } if (matchShuffleAsINSERTQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx)) { - if (Depth == 1 && Root.getOpcode() == X86ISD::INSERTQI) + if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTQI) return SDValue(); // Nothing to do! 
V1 = DAG.getBitcast(IntMaskVT, V1); V2 = DAG.getBitcast(IntMaskVT, V2); Res = DAG.getNode(X86ISD::INSERTQI, DL, IntMaskVT, V1, V2, - DAG.getConstant(BitLen, DL, MVT::i8), - DAG.getConstant(BitIdx, DL, MVT::i8)); + DAG.getTargetConstant(BitLen, DL, MVT::i8), + DAG.getTargetConstant(BitIdx, DL, MVT::i8)); return DAG.getBitcast(RootVT, Res); } } // Don't try to re-form single instruction chains under any circumstances now // that we've done encoding canonicalization for them. - if (Depth < 2) + if (Depth < 1) return SDValue(); // Depth threshold above which we can efficiently use variable mask shuffles. - int VariableShuffleDepth = Subtarget.hasFastVariableShuffle() ? 2 : 3; + int VariableShuffleDepth = Subtarget.hasFastVariableShuffle() ? 1 : 2; AllowVariableMask &= (Depth >= VariableShuffleDepth) || HasVariableMask; bool MaskContainsZeros = @@ -32321,7 +32700,7 @@ static SDValue combineX86ShuffleChain(ArrayRef Inputs, SDValue Root, V2 = DAG.getBitcast(MaskVT, V2); SDValue VPerm2MaskOp = getConstVector(VPerm2Idx, IntMaskVT, DAG, DL, true); Res = DAG.getNode(X86ISD::VPERMIL2, DL, MaskVT, V1, V2, VPerm2MaskOp, - DAG.getConstant(M2ZImm, DL, MVT::i8)); + DAG.getTargetConstant(M2ZImm, DL, MVT::i8)); return DAG.getBitcast(RootVT, Res); } @@ -32650,7 +33029,7 @@ static SDValue combineX86ShufflesRecursively( // Bound the depth of our recursive combine because this is ultimately // quadratic in nature. const unsigned MaxRecursionDepth = 8; - if (Depth > MaxRecursionDepth) + if (Depth >= MaxRecursionDepth) return SDValue(); // Directly rip through bitcasts to find the underlying operand. @@ -32667,11 +33046,18 @@ static SDValue combineX86ShufflesRecursively( "Can only combine shuffles of the same vector register size."); // Extract target shuffle mask and resolve sentinels and inputs. + // TODO - determine Op's demanded elts from RootMask. SmallVector OpMask; SmallVector OpInputs; - if (!resolveTargetShuffleInputs(Op, OpInputs, OpMask, DAG)) + APInt OpUndef, OpZero; + APInt OpDemandedElts = APInt::getAllOnesValue(VT.getVectorNumElements()); + bool IsOpVariableMask = isTargetShuffleVariableMask(Op.getOpcode()); + if (!getTargetShuffleInputs(Op, OpDemandedElts, OpInputs, OpMask, OpUndef, + OpZero, DAG, Depth, false)) return SDValue(); + resolveTargetShuffleFromZeroables(OpMask, OpUndef, OpZero); + // Add the inputs to the Ops list, avoiding duplicates. SmallVector Ops(SrcOps.begin(), SrcOps.end()); @@ -32772,6 +33158,9 @@ static SDValue combineX86ShufflesRecursively( Mask[i] = OpMaskedIdx; } + // Remove unused/repeated shuffle source ops. + resolveTargetShuffleInputsAndMask(Ops, Mask); + // Handle the all undef/zero cases early. if (all_of(Mask, [](int Idx) { return Idx == SM_SentinelUndef; })) return DAG.getUNDEF(Root.getValueType()); @@ -32783,11 +33172,8 @@ static SDValue combineX86ShufflesRecursively( return getZeroVector(Root.getSimpleValueType(), Subtarget, DAG, SDLoc(Root)); - // Remove unused/repeated shuffle source ops. - resolveTargetShuffleInputsAndMask(Ops, Mask); assert(!Ops.empty() && "Shuffle with no inputs detected"); - - HasVariableMask |= isTargetShuffleVariableMask(Op.getOpcode()); + HasVariableMask |= IsOpVariableMask; // Update the list of shuffle nodes that have been combined so far. SmallVector CombinedNodes(SrcNodes.begin(), @@ -32853,7 +33239,7 @@ static SDValue combineX86ShufflesRecursively( /// Helper entry wrapper to combineX86ShufflesRecursively. 
static SDValue combineX86ShufflesRecursively(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget) { - return combineX86ShufflesRecursively({Op}, 0, Op, {0}, {}, /*Depth*/ 1, + return combineX86ShufflesRecursively({Op}, 0, Op, {0}, {}, /*Depth*/ 0, /*HasVarMask*/ false, /*AllowVarMask*/ true, DAG, Subtarget); } @@ -33088,7 +33474,7 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG, for (unsigned i = 0; i != Scale; ++i) DemandedMask[i] = i; if (SDValue Res = combineX86ShufflesRecursively( - {BC}, 0, BC, DemandedMask, {}, /*Depth*/ 1, + {BC}, 0, BC, DemandedMask, {}, /*Depth*/ 0, /*HasVarMask*/ false, /*AllowVarMask*/ true, DAG, Subtarget)) return DAG.getNode(X86ISD::VBROADCAST, DL, VT, DAG.getBitcast(SrcVT, Res)); @@ -33120,6 +33506,30 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG, VT.getSizeInBits()); } + // vbroadcast(scalarload X) -> vbroadcast_load X + // For float loads, extract other uses of the scalar from the broadcast. + if (!SrcVT.isVector() && (Src.hasOneUse() || VT.isFloatingPoint()) && + ISD::isNormalLoad(Src.getNode())) { + LoadSDNode *LN = cast(Src); + SDVTList Tys = DAG.getVTList(VT, MVT::Other); + SDValue Ops[] = { LN->getChain(), LN->getBasePtr() }; + SDValue BcastLd = + DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, + LN->getMemoryVT(), LN->getMemOperand()); + // If the load value is used only by N, replace it via CombineTo N. + bool NoReplaceExtract = Src.hasOneUse(); + DCI.CombineTo(N.getNode(), BcastLd); + if (NoReplaceExtract) { + DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1)); + DCI.recursivelyDeleteUnusedNodes(LN); + } else { + SDValue Scl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SrcVT, BcastLd, + DAG.getIntPtrConstant(0, DL)); + DCI.CombineTo(LN, Scl, BcastLd.getValue(1)); + } + return N; // Return N so it doesn't get rechecked! + } + return SDValue(); } case X86ISD::BLENDI: { @@ -33133,14 +33543,14 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG, MVT SrcVT = N0.getOperand(0).getSimpleValueType(); if ((VT.getScalarSizeInBits() % SrcVT.getScalarSizeInBits()) == 0 && SrcVT.getScalarSizeInBits() >= 32) { - unsigned Mask = N.getConstantOperandVal(2); + unsigned BlendMask = N.getConstantOperandVal(2); unsigned Size = VT.getVectorNumElements(); unsigned Scale = VT.getScalarSizeInBits() / SrcVT.getScalarSizeInBits(); - unsigned ScaleMask = scaleVectorShuffleBlendMask(Mask, Size, Scale); + BlendMask = scaleVectorShuffleBlendMask(BlendMask, Size, Scale); return DAG.getBitcast( VT, DAG.getNode(X86ISD::BLENDI, DL, SrcVT, N0.getOperand(0), N1.getOperand(0), - DAG.getConstant(ScaleMask, DL, MVT::i8))); + DAG.getTargetConstant(BlendMask, DL, MVT::i8))); } } return SDValue(); @@ -33208,76 +33618,97 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG, // If we zero out all elements from Op0 then we don't need to reference it. if (((ZeroMask | (1u << DstIdx)) == 0xF) && !Op0.isUndef()) return DAG.getNode(X86ISD::INSERTPS, DL, VT, DAG.getUNDEF(VT), Op1, - DAG.getConstant(InsertPSMask, DL, MVT::i8)); + DAG.getTargetConstant(InsertPSMask, DL, MVT::i8)); // If we zero out the element from Op1 then we don't need to reference it. if ((ZeroMask & (1u << DstIdx)) && !Op1.isUndef()) return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT), - DAG.getConstant(InsertPSMask, DL, MVT::i8)); + DAG.getTargetConstant(InsertPSMask, DL, MVT::i8)); // Attempt to merge insertps Op1 with an inner target shuffle node. 
SmallVector TargetMask1; SmallVector Ops1; - if (setTargetShuffleZeroElements(Op1, TargetMask1, Ops1)) { - int M = TargetMask1[SrcIdx]; - if (isUndefOrZero(M)) { + APInt KnownUndef1, KnownZero1; + if (getTargetShuffleAndZeroables(Op1, TargetMask1, Ops1, KnownUndef1, + KnownZero1)) { + if (KnownUndef1[SrcIdx] || KnownZero1[SrcIdx]) { // Zero/UNDEF insertion - zero out element and remove dependency. InsertPSMask |= (1u << DstIdx); return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT), - DAG.getConstant(InsertPSMask, DL, MVT::i8)); + DAG.getTargetConstant(InsertPSMask, DL, MVT::i8)); } // Update insertps mask srcidx and reference the source input directly. + int M = TargetMask1[SrcIdx]; assert(0 <= M && M < 8 && "Shuffle index out of range"); InsertPSMask = (InsertPSMask & 0x3f) | ((M & 0x3) << 6); Op1 = Ops1[M < 4 ? 0 : 1]; return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1, - DAG.getConstant(InsertPSMask, DL, MVT::i8)); + DAG.getTargetConstant(InsertPSMask, DL, MVT::i8)); } // Attempt to merge insertps Op0 with an inner target shuffle node. SmallVector TargetMask0; SmallVector Ops0; - if (!setTargetShuffleZeroElements(Op0, TargetMask0, Ops0)) - return SDValue(); + APInt KnownUndef0, KnownZero0; + if (getTargetShuffleAndZeroables(Op0, TargetMask0, Ops0, KnownUndef0, + KnownZero0)) { + bool Updated = false; + bool UseInput00 = false; + bool UseInput01 = false; + for (int i = 0; i != 4; ++i) { + if ((InsertPSMask & (1u << i)) || (i == (int)DstIdx)) { + // No change if element is already zero or the inserted element. + continue; + } else if (KnownUndef0[i] || KnownZero0[i]) { + // If the target mask is undef/zero then we must zero the element. + InsertPSMask |= (1u << i); + Updated = true; + continue; + } - bool Updated = false; - bool UseInput00 = false; - bool UseInput01 = false; - for (int i = 0; i != 4; ++i) { - int M = TargetMask0[i]; - if ((InsertPSMask & (1u << i)) || (i == (int)DstIdx)) { - // No change if element is already zero or the inserted element. - continue; - } else if (isUndefOrZero(M)) { - // If the target mask is undef/zero then we must zero the element. - InsertPSMask |= (1u << i); - Updated = true; - continue; + // The input vector element must be inline. + int M = TargetMask0[i]; + if (M != i && M != (i + 4)) + return SDValue(); + + // Determine which inputs of the target shuffle we're using. + UseInput00 |= (0 <= M && M < 4); + UseInput01 |= (4 <= M); } - // The input vector element must be inline. - if (M != i && M != (i + 4)) - return SDValue(); + // If we're not using both inputs of the target shuffle then use the + // referenced input directly. + if (UseInput00 && !UseInput01) { + Updated = true; + Op0 = Ops0[0]; + } else if (!UseInput00 && UseInput01) { + Updated = true; + Op0 = Ops0[1]; + } - // Determine which inputs of the target shuffle we're using. - UseInput00 |= (0 <= M && M < 4); - UseInput01 |= (4 <= M); + if (Updated) + return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1, + DAG.getTargetConstant(InsertPSMask, DL, MVT::i8)); } - // If we're not using both inputs of the target shuffle then use the - // referenced input directly. - if (UseInput00 && !UseInput01) { - Updated = true; - Op0 = Ops0[0]; - } else if (!UseInput00 && UseInput01) { - Updated = true; - Op0 = Ops0[1]; + // If we're inserting an element from a vbroadcast load, fold the + // load into the X86insertps instruction. We need to convert the scalar + // load to a vector and clear the source lane of the INSERTPS control. 
+ if (Op1.getOpcode() == X86ISD::VBROADCAST_LOAD && Op1.hasOneUse()) { + auto *MemIntr = cast(Op1); + if (MemIntr->getMemoryVT().getScalarSizeInBits() == 32) { + SDValue Load = DAG.getLoad(MVT::f32, DL, MemIntr->getChain(), + MemIntr->getBasePtr(), + MemIntr->getMemOperand()); + SDValue Insert = DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, + DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, + Load), + DAG.getTargetConstant(InsertPSMask & 0x3f, DL, MVT::i8)); + DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), Load.getValue(1)); + return Insert; + } } - if (Updated) - return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1, - DAG.getConstant(InsertPSMask, DL, MVT::i8)); - return SDValue(); } default: @@ -33580,7 +34011,7 @@ static SDValue combineShuffleOfConcatUndef(SDNode *N, SelectionDAG &DAG, } /// Eliminate a redundant shuffle of a horizontal math op. -static SDValue foldShuffleOfHorizOp(SDNode *N) { +static SDValue foldShuffleOfHorizOp(SDNode *N, SelectionDAG &DAG) { unsigned Opcode = N->getOpcode(); if (Opcode != X86ISD::MOVDDUP && Opcode != X86ISD::VBROADCAST) if (Opcode != ISD::VECTOR_SHUFFLE || !N->getOperand(1).isUndef()) @@ -33611,17 +34042,36 @@ static SDValue foldShuffleOfHorizOp(SDNode *N) { HOp.getOperand(0) != HOp.getOperand(1)) return SDValue(); + // The shuffle that we are eliminating may have allowed the horizontal op to + // have an undemanded (undefined) operand. Duplicate the other (defined) + // operand to ensure that the results are defined across all lanes without the + // shuffle. + auto updateHOp = [](SDValue HorizOp, SelectionDAG &DAG) { + SDValue X; + if (HorizOp.getOperand(0).isUndef()) { + assert(!HorizOp.getOperand(1).isUndef() && "Not expecting foldable h-op"); + X = HorizOp.getOperand(1); + } else if (HorizOp.getOperand(1).isUndef()) { + assert(!HorizOp.getOperand(0).isUndef() && "Not expecting foldable h-op"); + X = HorizOp.getOperand(0); + } else { + return HorizOp; + } + return DAG.getNode(HorizOp.getOpcode(), SDLoc(HorizOp), + HorizOp.getValueType(), X, X); + }; + // When the operands of a horizontal math op are identical, the low half of // the result is the same as the high half. If a target shuffle is also - // replicating low and high halves, we don't need the shuffle. + // replicating low and high halves (and without changing the type/length of + // the vector), we don't need the shuffle. 
if (Opcode == X86ISD::MOVDDUP || Opcode == X86ISD::VBROADCAST) { - if (HOp.getScalarValueSizeInBits() == 64) { + if (HOp.getScalarValueSizeInBits() == 64 && HOp.getValueType() == VT) { // movddup (hadd X, X) --> hadd X, X // broadcast (extract_vec_elt (hadd X, X), 0) --> hadd X, X assert((HOp.getValueType() == MVT::v2f64 || - HOp.getValueType() == MVT::v4f64) && HOp.getValueType() == VT && - "Unexpected type for h-op"); - return HOp; + HOp.getValueType() == MVT::v4f64) && "Unexpected type for h-op"); + return updateHOp(HOp, DAG); } return SDValue(); } @@ -33635,14 +34085,14 @@ static SDValue foldShuffleOfHorizOp(SDNode *N) { (isTargetShuffleEquivalent(Mask, {0, 0}) || isTargetShuffleEquivalent(Mask, {0, 1, 0, 1}) || isTargetShuffleEquivalent(Mask, {0, 1, 2, 3, 0, 1, 2, 3}))) - return HOp; + return updateHOp(HOp, DAG); if (HOp.getValueSizeInBits() == 256 && (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2}) || isTargetShuffleEquivalent(Mask, {0, 1, 0, 1, 4, 5, 4, 5}) || isTargetShuffleEquivalent( Mask, {0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 8, 9, 10, 11}))) - return HOp; + return updateHOp(HOp, DAG); return SDValue(); } @@ -33677,7 +34127,7 @@ static SDValue narrowShuffle(ShuffleVectorSDNode *Shuf, SelectionDAG &DAG) { // the wide shuffle that we started with. return getShuffleHalfVectors(SDLoc(Shuf), Shuf->getOperand(0), Shuf->getOperand(1), HalfMask, HalfIdx1, - HalfIdx2, false, DAG); + HalfIdx2, false, DAG, /*UseConcat*/true); } static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG, @@ -33696,70 +34146,10 @@ static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG, if (SDValue AddSub = combineShuffleToAddSubOrFMAddSub(N, Subtarget, DAG)) return AddSub; - if (SDValue HAddSub = foldShuffleOfHorizOp(N)) + if (SDValue HAddSub = foldShuffleOfHorizOp(N, DAG)) return HAddSub; } - // During Type Legalization, when promoting illegal vector types, - // the backend might introduce new shuffle dag nodes and bitcasts. - // - // This code performs the following transformation: - // fold: (shuffle (bitcast (BINOP A, B)), Undef, ) -> - // (shuffle (BINOP (bitcast A), (bitcast B)), Undef, ) - // - // We do this only if both the bitcast and the BINOP dag nodes have - // one use. Also, perform this transformation only if the new binary - // operation is legal. This is to avoid introducing dag nodes that - // potentially need to be further expanded (or custom lowered) into a - // less optimal sequence of dag nodes. - if (!DCI.isBeforeLegalize() && DCI.isBeforeLegalizeOps() && - N->getOpcode() == ISD::VECTOR_SHUFFLE && - N->getOperand(0).getOpcode() == ISD::BITCAST && - N->getOperand(1).isUndef() && N->getOperand(0).hasOneUse()) { - SDValue N0 = N->getOperand(0); - SDValue N1 = N->getOperand(1); - - SDValue BC0 = N0.getOperand(0); - EVT SVT = BC0.getValueType(); - unsigned Opcode = BC0.getOpcode(); - unsigned NumElts = VT.getVectorNumElements(); - - if (BC0.hasOneUse() && SVT.isVector() && - SVT.getVectorNumElements() * 2 == NumElts && - TLI.isOperationLegal(Opcode, VT)) { - bool CanFold = false; - switch (Opcode) { - default : break; - case ISD::ADD: - case ISD::SUB: - case ISD::MUL: - // isOperationLegal lies for integer ops on floating point types. - CanFold = VT.isInteger(); - break; - case ISD::FADD: - case ISD::FSUB: - case ISD::FMUL: - // isOperationLegal lies for floating point ops on integer types. 
- CanFold = VT.isFloatingPoint(); - break; - } - - unsigned SVTNumElts = SVT.getVectorNumElements(); - ShuffleVectorSDNode *SVOp = cast(N); - for (unsigned i = 0, e = SVTNumElts; i != e && CanFold; ++i) - CanFold = SVOp->getMaskElt(i) == (int)(i * 2); - for (unsigned i = SVTNumElts, e = NumElts; i != e && CanFold; ++i) - CanFold = SVOp->getMaskElt(i) < 0; - - if (CanFold) { - SDValue BC00 = DAG.getBitcast(VT, BC0.getOperand(0)); - SDValue BC01 = DAG.getBitcast(VT, BC0.getOperand(1)); - SDValue NewBinOp = DAG.getNode(BC0.getOpcode(), dl, VT, BC00, BC01); - return DAG.getVectorShuffle(VT, dl, NewBinOp, N1, SVOp->getMask()); - } - } - } - // Attempt to combine into a vector load/broadcast. if (SDValue LD = combineToConsecutiveLoads(VT, N, dl, DAG, Subtarget, true)) return LD; @@ -33841,7 +34231,7 @@ static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG, if (N->getOpcode() == X86ISD::VZEXT_MOVL && N->getOperand(0).hasOneUse() && ISD::isNormalLoad(N->getOperand(0).getNode())) { LoadSDNode *LN = cast(N->getOperand(0)); - if (!LN->isVolatile()) { + if (LN->isSimple()) { SDVTList Tys = DAG.getVTList(VT, MVT::Other); SDValue Ops[] = { LN->getChain(), LN->getBasePtr() }; SDValue VZLoad = @@ -33855,53 +34245,6 @@ static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG, } } - - // Look for a truncating shuffle to v2i32 of a PMULUDQ where one of the - // operands is an extend from v2i32 to v2i64. Turn it into a pmulld. - // FIXME: This can probably go away once we default to widening legalization. - if (Subtarget.hasSSE41() && VT == MVT::v4i32 && - N->getOpcode() == ISD::VECTOR_SHUFFLE && - N->getOperand(0).getOpcode() == ISD::BITCAST && - N->getOperand(0).getOperand(0).getOpcode() == X86ISD::PMULUDQ) { - SDValue BC = N->getOperand(0); - SDValue MULUDQ = BC.getOperand(0); - ShuffleVectorSDNode *SVOp = cast(N); - ArrayRef Mask = SVOp->getMask(); - if (BC.hasOneUse() && MULUDQ.hasOneUse() && - Mask[0] == 0 && Mask[1] == 2 && Mask[2] == -1 && Mask[3] == -1) { - SDValue Op0 = MULUDQ.getOperand(0); - SDValue Op1 = MULUDQ.getOperand(1); - if (Op0.getOpcode() == ISD::BITCAST && - Op0.getOperand(0).getOpcode() == ISD::VECTOR_SHUFFLE && - Op0.getOperand(0).getValueType() == MVT::v4i32) { - ShuffleVectorSDNode *SVOp0 = - cast(Op0.getOperand(0)); - ArrayRef Mask2 = SVOp0->getMask(); - if (Mask2[0] == 0 && Mask2[1] == -1 && - Mask2[2] == 1 && Mask2[3] == -1) { - Op0 = SVOp0->getOperand(0); - Op1 = DAG.getBitcast(MVT::v4i32, Op1); - Op1 = DAG.getVectorShuffle(MVT::v4i32, dl, Op1, Op1, Mask); - return DAG.getNode(ISD::MUL, dl, MVT::v4i32, Op0, Op1); - } - } - if (Op1.getOpcode() == ISD::BITCAST && - Op1.getOperand(0).getOpcode() == ISD::VECTOR_SHUFFLE && - Op1.getOperand(0).getValueType() == MVT::v4i32) { - ShuffleVectorSDNode *SVOp1 = - cast(Op1.getOperand(0)); - ArrayRef Mask2 = SVOp1->getMask(); - if (Mask2[0] == 0 && Mask2[1] == -1 && - Mask2[2] == 1 && Mask2[3] == -1) { - Op0 = DAG.getBitcast(MVT::v4i32, Op0); - Op0 = DAG.getVectorShuffle(MVT::v4i32, dl, Op0, Op0, Mask); - Op1 = SVOp1->getOperand(0); - return DAG.getNode(ISD::MUL, dl, MVT::v4i32, Op0, Op1); - } - } - } - } - return SDValue(); } @@ -33966,6 +34309,84 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode( // TODO convert SrcUndef to KnownUndef. 
break; } + case X86ISD::KSHIFTL: { + SDValue Src = Op.getOperand(0); + auto *Amt = cast(Op.getOperand(1)); + assert(Amt->getAPIntValue().ult(NumElts) && "Out of range shift amount"); + unsigned ShiftAmt = Amt->getZExtValue(); + + if (ShiftAmt == 0) + return TLO.CombineTo(Op, Src); + + // If this is ((X >>u C1) << ShAmt), see if we can simplify this into a + // single shift. We can do this if the bottom bits (which are shifted + // out) are never demanded. + if (Src.getOpcode() == X86ISD::KSHIFTR) { + if (!DemandedElts.intersects(APInt::getLowBitsSet(NumElts, ShiftAmt))) { + unsigned C1 = Src.getConstantOperandVal(1); + unsigned NewOpc = X86ISD::KSHIFTL; + int Diff = ShiftAmt - C1; + if (Diff < 0) { + Diff = -Diff; + NewOpc = X86ISD::KSHIFTR; + } + + SDLoc dl(Op); + SDValue NewSA = TLO.DAG.getTargetConstant(Diff, dl, MVT::i8); + return TLO.CombineTo( + Op, TLO.DAG.getNode(NewOpc, dl, VT, Src.getOperand(0), NewSA)); + } + } + + APInt DemandedSrc = DemandedElts.lshr(ShiftAmt); + if (SimplifyDemandedVectorElts(Src, DemandedSrc, KnownUndef, KnownZero, TLO, + Depth + 1)) + return true; + + KnownUndef <<= ShiftAmt; + KnownZero <<= ShiftAmt; + KnownZero.setLowBits(ShiftAmt); + break; + } + case X86ISD::KSHIFTR: { + SDValue Src = Op.getOperand(0); + auto *Amt = cast(Op.getOperand(1)); + assert(Amt->getAPIntValue().ult(NumElts) && "Out of range shift amount"); + unsigned ShiftAmt = Amt->getZExtValue(); + + if (ShiftAmt == 0) + return TLO.CombineTo(Op, Src); + + // If this is ((X << C1) >>u ShAmt), see if we can simplify this into a + // single shift. We can do this if the top bits (which are shifted + // out) are never demanded. + if (Src.getOpcode() == X86ISD::KSHIFTL) { + if (!DemandedElts.intersects(APInt::getHighBitsSet(NumElts, ShiftAmt))) { + unsigned C1 = Src.getConstantOperandVal(1); + unsigned NewOpc = X86ISD::KSHIFTR; + int Diff = ShiftAmt - C1; + if (Diff < 0) { + Diff = -Diff; + NewOpc = X86ISD::KSHIFTL; + } + + SDLoc dl(Op); + SDValue NewSA = TLO.DAG.getTargetConstant(Diff, dl, MVT::i8); + return TLO.CombineTo( + Op, TLO.DAG.getNode(NewOpc, dl, VT, Src.getOperand(0), NewSA)); + } + } + + APInt DemandedSrc = DemandedElts.shl(ShiftAmt); + if (SimplifyDemandedVectorElts(Src, DemandedSrc, KnownUndef, KnownZero, TLO, + Depth + 1)) + return true; + + KnownUndef.lshrInPlace(ShiftAmt); + KnownZero.lshrInPlace(ShiftAmt); + KnownZero.setHighBits(ShiftAmt); + break; + } case X86ISD::CVTSI2P: case X86ISD::CVTUI2P: { SDValue Src = Op.getOperand(0); @@ -33979,16 +34400,36 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode( } case X86ISD::PACKSS: case X86ISD::PACKUS: { + SDValue N0 = Op.getOperand(0); + SDValue N1 = Op.getOperand(1); + APInt DemandedLHS, DemandedRHS; getPackDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS); APInt SrcUndef, SrcZero; - if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedLHS, SrcUndef, - SrcZero, TLO, Depth + 1)) + if (SimplifyDemandedVectorElts(N0, DemandedLHS, SrcUndef, SrcZero, TLO, + Depth + 1)) return true; - if (SimplifyDemandedVectorElts(Op.getOperand(1), DemandedRHS, SrcUndef, - SrcZero, TLO, Depth + 1)) + if (SimplifyDemandedVectorElts(N1, DemandedRHS, SrcUndef, SrcZero, TLO, + Depth + 1)) return true; + + // Aggressively peek through ops to get at the demanded elts. + // TODO - we should do this for all target/faux shuffles ops. 
+ if (!DemandedElts.isAllOnesValue()) { + APInt DemandedSrcBits = + APInt::getAllOnesValue(N0.getScalarValueSizeInBits()); + SDValue NewN0 = SimplifyMultipleUseDemandedBits( + N0, DemandedSrcBits, DemandedLHS, TLO.DAG, Depth + 1); + SDValue NewN1 = SimplifyMultipleUseDemandedBits( + N1, DemandedSrcBits, DemandedRHS, TLO.DAG, Depth + 1); + if (NewN0 || NewN1) { + NewN0 = NewN0 ? NewN0 : N0; + NewN1 = NewN1 ? NewN1 : N1; + return TLO.CombineTo(Op, + TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewN0, NewN1)); + } + } break; } case X86ISD::HADD: @@ -34062,25 +34503,6 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode( return true; break; } - case X86ISD::SUBV_BROADCAST: { - // Reduce size of broadcast if we don't need the upper half. - unsigned HalfElts = NumElts / 2; - if (DemandedElts.extractBits(HalfElts, HalfElts).isNullValue()) { - SDValue Src = Op.getOperand(0); - MVT SrcVT = Src.getSimpleValueType(); - - SDValue Half = Src; - if (SrcVT.getVectorNumElements() != HalfElts) { - MVT HalfVT = MVT::getVectorVT(SrcVT.getScalarType(), HalfElts); - Half = TLO.DAG.getNode(X86ISD::SUBV_BROADCAST, SDLoc(Op), HalfVT, Src); - } - - return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Half, 0, - TLO.DAG, SDLoc(Op), - Half.getValueSizeInBits())); - } - break; - } case X86ISD::VPERMV: { SDValue Mask = Op.getOperand(0); APInt MaskUndef, MaskZero; @@ -34134,6 +34556,21 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode( SDValue Insert = insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits); return TLO.CombineTo(Op, Insert); + } + // Subvector broadcast. + case X86ISD::SUBV_BROADCAST: { + SDLoc DL(Op); + SDValue Src = Op.getOperand(0); + if (Src.getValueSizeInBits() > ExtSizeInBits) + Src = extractSubVector(Src, 0, TLO.DAG, DL, ExtSizeInBits); + else if (Src.getValueSizeInBits() < ExtSizeInBits) { + MVT SrcSVT = Src.getSimpleValueType().getScalarType(); + MVT SrcVT = + MVT::getVectorVT(SrcSVT, ExtSizeInBits / SrcSVT.getSizeInBits()); + Src = TLO.DAG.getNode(X86ISD::SUBV_BROADCAST, DL, SrcVT, Src); + } + return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Src, 0, + TLO.DAG, DL, ExtSizeInBits)); } // Byte shifts by immediate. case X86ISD::VSHLDQ: @@ -34201,36 +34638,30 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode( } } - // Simplify target shuffles. - if (!isTargetShuffle(Opc) || !VT.isSimple()) - return false; - - // Get target shuffle mask. - bool IsUnary; + // Get target/faux shuffle mask. + APInt OpUndef, OpZero; SmallVector OpMask; SmallVector OpInputs; - if (!getTargetShuffleMask(Op.getNode(), VT.getSimpleVT(), true, OpInputs, - OpMask, IsUnary)) + if (!getTargetShuffleInputs(Op, DemandedElts, OpInputs, OpMask, OpUndef, + OpZero, TLO.DAG, Depth, false)) return false; - // Shuffle inputs must be the same type as the result. - if (llvm::any_of(OpInputs, - [VT](SDValue V) { return VT != V.getValueType(); })) + // Shuffle inputs must be the same size as the result. + if (OpMask.size() != (unsigned)NumElts || + llvm::any_of(OpInputs, [VT](SDValue V) { + return VT.getSizeInBits() != V.getValueSizeInBits() || + !V.getValueType().isVector(); + })) return false; - // Clear known elts that might have been set above. - KnownZero.clearAllBits(); - KnownUndef.clearAllBits(); + KnownZero = OpZero; + KnownUndef = OpUndef; // Check if shuffle mask can be simplified to undef/zero/identity. 
int NumSrcs = OpInputs.size(); - for (int i = 0; i != NumElts; ++i) { - int &M = OpMask[i]; + for (int i = 0; i != NumElts; ++i) if (!DemandedElts[i]) - M = SM_SentinelUndef; - else if (0 <= M && OpInputs[M / NumElts].isUndef()) - M = SM_SentinelUndef; - } + OpMask[i] = SM_SentinelUndef; if (isUndefInRange(OpMask, 0, NumElts)) { KnownUndef.setAllBits(); @@ -34243,10 +34674,14 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode( } for (int Src = 0; Src != NumSrcs; ++Src) if (isSequentialOrUndefInRange(OpMask, 0, NumElts, Src * NumElts)) - return TLO.CombineTo(Op, OpInputs[Src]); + return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, OpInputs[Src])); // Attempt to simplify inputs. for (int Src = 0; Src != NumSrcs; ++Src) { + // TODO: Support inputs of different types. + if (OpInputs[Src].getValueType() != VT) + continue; + int Lo = Src * NumElts; APInt SrcElts = APInt::getNullValue(NumElts); for (int i = 0; i != NumElts; ++i) @@ -34256,21 +34691,13 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode( SrcElts.setBit(M); } + // TODO - Propagate input undef/zero elts. APInt SrcUndef, SrcZero; if (SimplifyDemandedVectorElts(OpInputs[Src], SrcElts, SrcUndef, SrcZero, TLO, Depth + 1)) return true; } - // Extract known zero/undef elements. - // TODO - Propagate input undef/zero elts. - for (int i = 0; i != NumElts; ++i) { - if (OpMask[i] == SM_SentinelUndef) - KnownUndef.setBit(i); - if (OpMask[i] == SM_SentinelZero) - KnownZero.setBit(i); - } - return false; } @@ -34296,6 +34723,18 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode( if (SimplifyDemandedBits(RHS, DemandedMask, OriginalDemandedElts, KnownOp, TLO, Depth + 1)) return true; + + // Aggressively peek through ops to get at the demanded low bits. + SDValue DemandedLHS = SimplifyMultipleUseDemandedBits( + LHS, DemandedMask, OriginalDemandedElts, TLO.DAG, Depth + 1); + SDValue DemandedRHS = SimplifyMultipleUseDemandedBits( + RHS, DemandedMask, OriginalDemandedElts, TLO.DAG, Depth + 1); + if (DemandedLHS || DemandedRHS) { + DemandedLHS = DemandedLHS ? DemandedLHS : LHS; + DemandedRHS = DemandedRHS ? DemandedRHS : RHS; + return TLO.CombineTo( + Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, DemandedLHS, DemandedRHS)); + } break; } case X86ISD::VSHLI: { @@ -34323,7 +34762,7 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode( unsigned NewOpc = Diff < 0 ? 
X86ISD::VSRLI : X86ISD::VSHLI; SDValue NewShift = TLO.DAG.getNode( NewOpc, SDLoc(Op), VT, Op0.getOperand(0), - TLO.DAG.getConstant(std::abs(Diff), SDLoc(Op), MVT::i8)); + TLO.DAG.getTargetConstant(std::abs(Diff), SDLoc(Op), MVT::i8)); return TLO.CombineTo(Op, NewShift); } } @@ -34441,6 +34880,11 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode( KnownVec, TLO, Depth + 1)) return true; + if (SDValue V = SimplifyMultipleUseDemandedBits( + Vec, DemandedVecBits, DemandedVecElts, TLO.DAG, Depth + 1)) + return TLO.CombineTo( + Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, V, Op.getOperand(1))); + Known = KnownVec.zext(BitWidth, true); return false; } @@ -34542,12 +34986,80 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode( Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth); } +SDValue X86TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode( + SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, + SelectionDAG &DAG, unsigned Depth) const { + int NumElts = DemandedElts.getBitWidth(); + unsigned Opc = Op.getOpcode(); + EVT VT = Op.getValueType(); + + switch (Opc) { + case X86ISD::PINSRB: + case X86ISD::PINSRW: { + // If we don't demand the inserted element, return the base vector. + SDValue Vec = Op.getOperand(0); + auto *CIdx = dyn_cast(Op.getOperand(2)); + MVT VecVT = Vec.getSimpleValueType(); + if (CIdx && CIdx->getAPIntValue().ult(VecVT.getVectorNumElements()) && + !DemandedElts[CIdx->getZExtValue()]) + return Vec; + break; + } + } + + APInt ShuffleUndef, ShuffleZero; + SmallVector ShuffleMask; + SmallVector ShuffleOps; + if (getTargetShuffleInputs(Op, DemandedElts, ShuffleOps, ShuffleMask, + ShuffleUndef, ShuffleZero, DAG, Depth, false)) { + // If all the demanded elts are from one operand and are inline, + // then we can use the operand directly. + int NumOps = ShuffleOps.size(); + if (ShuffleMask.size() == (unsigned)NumElts && + llvm::all_of(ShuffleOps, [VT](SDValue V) { + return VT.getSizeInBits() == V.getValueSizeInBits(); + })) { + + if (DemandedElts.isSubsetOf(ShuffleUndef)) + return DAG.getUNDEF(VT); + if (DemandedElts.isSubsetOf(ShuffleUndef | ShuffleZero)) + return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(Op)); + + // Bitmask that indicates which ops have only been accessed 'inline'. + APInt IdentityOp = APInt::getAllOnesValue(NumOps); + for (int i = 0; i != NumElts; ++i) { + int M = ShuffleMask[i]; + if (!DemandedElts[i] || ShuffleUndef[i]) + continue; + int Op = M / NumElts; + int Index = M % NumElts; + if (M < 0 || Index != i) { + IdentityOp.clearAllBits(); + break; + } + IdentityOp &= APInt::getOneBitSet(NumOps, Op); + if (IdentityOp == 0) + break; + } + assert((IdentityOp == 0 || IdentityOp.countPopulation() == 1) && + "Multiple identity shuffles detected"); + + if (IdentityOp != 0) + return DAG.getBitcast(VT, ShuffleOps[IdentityOp.countTrailingZeros()]); + } + } + + return TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode( + Op, DemandedBits, DemandedElts, DAG, Depth); +} + /// Check if a vector extract from a target-specific shuffle of a load can be /// folded into a single element load. /// Similar handling for VECTOR_SHUFFLE is performed by DAGCombiner, but /// shuffles have been custom lowered so we need to handle those here. 
-static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG, - TargetLowering::DAGCombinerInfo &DCI) { +static SDValue +XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI) { if (DCI.isBeforeLegalizeOps()) return SDValue(); @@ -34559,13 +35071,17 @@ static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG, return SDValue(); EVT OriginalVT = InVec.getValueType(); + unsigned NumOriginalElts = OriginalVT.getVectorNumElements(); // Peek through bitcasts, don't duplicate a load with other uses. InVec = peekThroughOneUseBitcasts(InVec); EVT CurrentVT = InVec.getValueType(); - if (!CurrentVT.isVector() || - CurrentVT.getVectorNumElements() != OriginalVT.getVectorNumElements()) + if (!CurrentVT.isVector()) + return SDValue(); + + unsigned NumCurrentElts = CurrentVT.getVectorNumElements(); + if ((NumOriginalElts % NumCurrentElts) != 0) return SDValue(); if (!isTargetShuffle(InVec.getOpcode())) @@ -34582,10 +35098,17 @@ static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG, ShuffleOps, ShuffleMask, UnaryShuffle)) return SDValue(); + unsigned Scale = NumOriginalElts / NumCurrentElts; + if (Scale > 1) { + SmallVector ScaledMask; + scaleShuffleMask(Scale, ShuffleMask, ScaledMask); + ShuffleMask = std::move(ScaledMask); + } + assert(ShuffleMask.size() == NumOriginalElts && "Shuffle mask size mismatch"); + // Select the input vector, guarding against out of range extract vector. - unsigned NumElems = CurrentVT.getVectorNumElements(); int Elt = cast(EltNo)->getZExtValue(); - int Idx = (Elt > (int)NumElems) ? SM_SentinelUndef : ShuffleMask[Elt]; + int Idx = (Elt > (int)NumOriginalElts) ? SM_SentinelUndef : ShuffleMask[Elt]; if (Idx == SM_SentinelZero) return EltVT.isInteger() ? DAG.getConstant(0, SDLoc(N), EltVT) @@ -34598,8 +35121,9 @@ static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG, if (llvm::any_of(ShuffleMask, [](int M) { return M == SM_SentinelZero; })) return SDValue(); - assert(0 <= Idx && Idx < (int)(2 * NumElems) && "Shuffle index out of range"); - SDValue LdNode = (Idx < (int)NumElems) ? ShuffleOps[0] : ShuffleOps[1]; + assert(0 <= Idx && Idx < (int)(2 * NumOriginalElts) && + "Shuffle index out of range"); + SDValue LdNode = (Idx < (int)NumOriginalElts) ? ShuffleOps[0] : ShuffleOps[1]; // If inputs to shuffle are the same for both ops, then allow 2 uses unsigned AllowedUses = @@ -34619,7 +35143,7 @@ static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG, LoadSDNode *LN0 = cast(LdNode); - if (!LN0 ||!LN0->hasNUsesOfValue(AllowedUses, 0) || LN0->isVolatile()) + if (!LN0 || !LN0->hasNUsesOfValue(AllowedUses, 0) || !LN0->isSimple()) return SDValue(); // If there's a bitcast before the shuffle, check if the load type and @@ -34637,10 +35161,11 @@ static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG, SDLoc dl(N); // Create shuffle node taking into account the case that its a unary shuffle - SDValue Shuffle = (UnaryShuffle) ? DAG.getUNDEF(CurrentVT) : ShuffleOps[1]; - Shuffle = DAG.getVectorShuffle(CurrentVT, dl, ShuffleOps[0], Shuffle, - ShuffleMask); - Shuffle = DAG.getBitcast(OriginalVT, Shuffle); + SDValue Shuffle = UnaryShuffle ? 
DAG.getUNDEF(OriginalVT) + : DAG.getBitcast(OriginalVT, ShuffleOps[1]); + Shuffle = DAG.getVectorShuffle(OriginalVT, dl, + DAG.getBitcast(OriginalVT, ShuffleOps[0]), + Shuffle, ShuffleMask); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0), Shuffle, EltNo); } @@ -34660,6 +35185,23 @@ static bool checkBitcastSrcVectorSize(SDValue Src, unsigned Size) { return false; } +// Helper to push sign extension of vXi1 SETCC result through bitops. +static SDValue signExtendBitcastSrcVector(SelectionDAG &DAG, EVT SExtVT, + SDValue Src, const SDLoc &DL) { + switch (Src.getOpcode()) { + case ISD::SETCC: + return DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src); + case ISD::AND: + case ISD::XOR: + case ISD::OR: + return DAG.getNode( + Src.getOpcode(), DL, SExtVT, + signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(0), DL), + signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(1), DL)); + } + llvm_unreachable("Unexpected node type for vXi1 sign extension"); +} + // Try to match patterns such as // (i16 bitcast (v16i1 x)) // -> @@ -34698,6 +35240,7 @@ static SDValue combineBitcastvxi1(SelectionDAG &DAG, EVT VT, SDValue Src, // For example, t0 := (v8i16 sext(v8i1 x)) needs to be shuffled as: // (v16i8 shuffle <0,2,4,6,8,10,12,14,u,u,...,u> (v16i8 bitcast t0), undef) MVT SExtVT; + bool PropagateSExt = false; switch (SrcVT.getSimpleVT().SimpleTy) { default: return SDValue(); @@ -34708,8 +35251,10 @@ static SDValue combineBitcastvxi1(SelectionDAG &DAG, EVT VT, SDValue Src, SExtVT = MVT::v4i32; // For cases such as (i4 bitcast (v4i1 setcc v4i64 v1, v2)) // sign-extend to a 256-bit operation to avoid truncation. - if (Subtarget.hasAVX() && checkBitcastSrcVectorSize(Src, 256)) + if (Subtarget.hasAVX() && checkBitcastSrcVectorSize(Src, 256)) { SExtVT = MVT::v4i64; + PropagateSExt = true; + } break; case MVT::v8i1: SExtVT = MVT::v8i16; @@ -34718,11 +35263,10 @@ static SDValue combineBitcastvxi1(SelectionDAG &DAG, EVT VT, SDValue Src, // If the setcc operand is 128-bit, prefer sign-extending to 128-bit over // 256-bit because the shuffle is cheaper than sign extending the result of // the compare. - // TODO : use checkBitcastSrcVectorSize - if (Src.getOpcode() == ISD::SETCC && Subtarget.hasAVX() && - (Src.getOperand(0).getValueType().is256BitVector() || - Src.getOperand(0).getValueType().is512BitVector())) { + if (Subtarget.hasAVX() && (checkBitcastSrcVectorSize(Src, 256) || + checkBitcastSrcVectorSize(Src, 512))) { SExtVT = MVT::v8i32; + PropagateSExt = true; } break; case MVT::v16i1: @@ -34745,19 +35289,10 @@ static SDValue combineBitcastvxi1(SelectionDAG &DAG, EVT VT, SDValue Src, return SDValue(); }; - SDValue V = DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src); + SDValue V = PropagateSExt ? 
signExtendBitcastSrcVector(DAG, SExtVT, Src, DL) + : DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src); - if (SExtVT == MVT::v64i8) { - SDValue Lo, Hi; - std::tie(Lo, Hi) = DAG.SplitVector(V, DL); - Lo = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Lo); - Lo = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Lo); - Hi = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Hi); - Hi = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Hi); - Hi = DAG.getNode(ISD::SHL, DL, MVT::i64, Hi, - DAG.getConstant(32, DL, MVT::i8)); - V = DAG.getNode(ISD::OR, DL, MVT::i64, Lo, Hi); - } else if (SExtVT == MVT::v16i8 || SExtVT == MVT::v32i8) { + if (SExtVT == MVT::v16i8 || SExtVT == MVT::v32i8 || SExtVT == MVT::v64i8) { V = getPMOVMSKB(DL, V, DAG, Subtarget); } else { if (SExtVT == MVT::v8i16) @@ -34891,8 +35426,8 @@ static SDValue createMMXBuildVector(BuildVectorSDNode *BV, SelectionDAG &DAG, unsigned ShufMask = (NumElts > 2 ? 0 : 0x44); return DAG.getNode( ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx, - DAG.getConstant(Intrinsic::x86_sse_pshuf_w, DL, MVT::i32), Splat, - DAG.getConstant(ShufMask, DL, MVT::i8)); + DAG.getTargetConstant(Intrinsic::x86_sse_pshuf_w, DL, MVT::i32), + Splat, DAG.getTargetConstant(ShufMask, DL, MVT::i8)); } Ops.append(NumElts, Splat); } else { @@ -34935,6 +35470,24 @@ static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG, if (SDValue V = combineBitcastvxi1(DAG, VT, N0, dl, Subtarget)) return V; + // Recognize the IR pattern for the movmsk intrinsic under SSE1 befoer type + // legalization destroys the v4i32 type. + if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && SrcVT == MVT::v4i1 && + VT.isScalarInteger() && N0.getOpcode() == ISD::SETCC && + N0.getOperand(0).getValueType() == MVT::v4i32 && + ISD::isBuildVectorAllZeros(N0.getOperand(1).getNode()) && + cast(N0.getOperand(2))->get() == ISD::SETLT) { + SDValue N00 = N0.getOperand(0); + // Only do this if we can avoid scalarizing the input. + if (ISD::isNormalLoad(N00.getNode()) || + (N00.getOpcode() == ISD::BITCAST && + N00.getOperand(0).getValueType() == MVT::v4f32)) { + SDValue V = DAG.getNode(X86ISD::MOVMSK, dl, MVT::i32, + DAG.getBitcast(MVT::v4f32, N00)); + return DAG.getZExtOrTrunc(V, dl, VT); + } + } + // If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer // type, widen both sides to avoid a trip through memory. if ((VT == MVT::v4i1 || VT == MVT::v2i1) && SrcVT.isScalarInteger() && @@ -34949,6 +35502,26 @@ static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG, // type, widen both sides to avoid a trip through memory. if ((SrcVT == MVT::v4i1 || SrcVT == MVT::v2i1) && VT.isScalarInteger() && Subtarget.hasAVX512()) { + // Use zeros for the widening if we already have some zeroes. This can + // allow SimplifyDemandedBits to remove scalar ANDs that may be down + // stream of this. + // FIXME: It might make sense to detect a concat_vectors with a mix of + // zeroes and undef and turn it into insert_subvector for i1 vectors as + // a separate combine. What we can't do is canonicalize the operands of + // such a concat or we'll get into a loop with SimplifyDemandedBits. 
+ if (N0.getOpcode() == ISD::CONCAT_VECTORS) { + SDValue LastOp = N0.getOperand(N0.getNumOperands() - 1); + if (ISD::isBuildVectorAllZeros(LastOp.getNode())) { + SrcVT = LastOp.getValueType(); + unsigned NumConcats = 8 / SrcVT.getVectorNumElements(); + SmallVector Ops(N0->op_begin(), N0->op_end()); + Ops.resize(NumConcats, DAG.getConstant(0, dl, SrcVT)); + N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops); + N0 = DAG.getBitcast(MVT::i8, N0); + return DAG.getNode(ISD::TRUNCATE, dl, VT, N0); + } + } + unsigned NumConcats = 8 / SrcVT.getVectorNumElements(); SmallVector Ops(NumConcats, DAG.getUNDEF(SrcVT)); Ops[0] = N0; @@ -34958,6 +35531,33 @@ static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG, } } + // Look for (i8 (bitcast (v8i1 (extract_subvector (v16i1 X), 0)))) and + // replace with (i8 (trunc (i16 (bitcast (v16i1 X))))). This can occur + // due to insert_subvector legalization on KNL. By promoting the copy to i16 + // we can help with known bits propagation from the vXi1 domain to the + // scalar domain. + if (VT == MVT::i8 && SrcVT == MVT::v8i1 && Subtarget.hasAVX512() && + !Subtarget.hasDQI() && N0.getOpcode() == ISD::EXTRACT_SUBVECTOR && + N0.getOperand(0).getValueType() == MVT::v16i1 && + isNullConstant(N0.getOperand(1))) + return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, + DAG.getBitcast(MVT::i16, N0.getOperand(0))); + + // Combine (bitcast (vbroadcast_load)) -> (vbroadcast_load). The memory VT + // determines // the number of bits loaded. Remaining bits are zero. + if (N0.getOpcode() == X86ISD::VBROADCAST_LOAD && N0.hasOneUse() && + VT.getScalarSizeInBits() == SrcVT.getScalarSizeInBits()) { + auto *BCast = cast(N0); + SDVTList Tys = DAG.getVTList(VT, MVT::Other); + SDValue Ops[] = { BCast->getChain(), BCast->getBasePtr() }; + SDValue ResNode = + DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, SDLoc(N), Tys, Ops, + VT.getVectorElementType(), + BCast->getMemOperand()); + DAG.ReplaceAllUsesOfValueWith(SDValue(BCast, 1), ResNode.getValue(1)); + return ResNode; + } + // Since MMX types are special and don't usually play with other vector types, // it's better to handle them early to be sure we emit efficient code by // avoiding store-load conversions. @@ -35152,7 +35752,7 @@ static SDValue combineHorizontalMinMaxResult(SDNode *Extract, SelectionDAG &DAG, // Check for SMAX/SMIN/UMAX/UMIN horizontal reduction patterns. ISD::NodeType BinOp; SDValue Src = DAG.matchBinOpReduction( - Extract, BinOp, {ISD::SMAX, ISD::SMIN, ISD::UMAX, ISD::UMIN}); + Extract, BinOp, {ISD::SMAX, ISD::SMIN, ISD::UMAX, ISD::UMIN}, true); if (!Src) return SDValue(); @@ -35246,29 +35846,31 @@ static SDValue combineHorizontalPredicateResult(SDNode *Extract, SDLoc DL(Extract); EVT MatchVT = Match.getValueType(); unsigned NumElts = MatchVT.getVectorNumElements(); + unsigned MaxElts = Subtarget.hasInt256() ? 32 : 16; + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (ExtractVT == MVT::i1) { // Special case for (pre-legalization) vXi1 reductions. - if (NumElts > 32) + if (NumElts > 64 || !isPowerOf2_32(NumElts)) return SDValue(); - if (DAG.getTargetLoweringInfo().isTypeLegal(MatchVT)) { + if (TLI.isTypeLegal(MatchVT)) { // If this is a legal AVX512 predicate type then we can just bitcast. EVT MovmskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts); Movmsk = DAG.getBitcast(MovmskVT, Match); } else { // Use combineBitcastvxi1 to create the MOVMSK. 
- if (NumElts == 32 && !Subtarget.hasInt256()) { + while (NumElts > MaxElts) { SDValue Lo, Hi; std::tie(Lo, Hi) = DAG.SplitVector(Match, DL); Match = DAG.getNode(BinOp, DL, Lo.getValueType(), Lo, Hi); - NumElts = 16; + NumElts /= 2; } EVT MovmskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts); Movmsk = combineBitcastvxi1(DAG, MovmskVT, Match, DL, Subtarget); } if (!Movmsk) return SDValue(); - Movmsk = DAG.getZExtOrTrunc(Movmsk, DL, MVT::i32); + Movmsk = DAG.getZExtOrTrunc(Movmsk, DL, NumElts > 32 ? MVT::i64 : MVT::i32); } else { // Bail with AVX512VL (which uses predicate registers). if (Subtarget.hasVLX()) @@ -35309,13 +35911,15 @@ static SDValue combineHorizontalPredicateResult(SDNode *Extract, Movmsk = getPMOVMSKB(DL, BitcastLogicOp, DAG, Subtarget); NumElts = MaskSrcVT.getVectorNumElements(); } - assert(NumElts <= 32 && "Not expecting more than 32 elements"); + assert((NumElts <= 32 || NumElts == 64) && + "Not expecting more than 64 elements"); + MVT CmpVT = NumElts == 64 ? MVT::i64 : MVT::i32; if (BinOp == ISD::XOR) { // parity -> (AND (CTPOP(MOVMSK X)), 1) - SDValue Mask = DAG.getConstant(1, DL, MVT::i32); - SDValue Result = DAG.getNode(ISD::CTPOP, DL, MVT::i32, Movmsk); - Result = DAG.getNode(ISD::AND, DL, MVT::i32, Result, Mask); + SDValue Mask = DAG.getConstant(1, DL, CmpVT); + SDValue Result = DAG.getNode(ISD::CTPOP, DL, CmpVT, Movmsk); + Result = DAG.getNode(ISD::AND, DL, CmpVT, Result, Mask); return DAG.getZExtOrTrunc(Result, DL, ExtractVT); } @@ -35323,19 +35927,19 @@ static SDValue combineHorizontalPredicateResult(SDNode *Extract, ISD::CondCode CondCode; if (BinOp == ISD::OR) { // any_of -> MOVMSK != 0 - CmpC = DAG.getConstant(0, DL, MVT::i32); + CmpC = DAG.getConstant(0, DL, CmpVT); CondCode = ISD::CondCode::SETNE; } else { // all_of -> MOVMSK == ((1 << NumElts) - 1) - CmpC = DAG.getConstant((1ULL << NumElts) - 1, DL, MVT::i32); + CmpC = DAG.getConstant(APInt::getLowBitsSet(CmpVT.getSizeInBits(), NumElts), + DL, CmpVT); CondCode = ISD::CondCode::SETEQ; } // The setcc produces an i8 of 0/1, so extend that to the result width and // negate to get the final 0/-1 mask value. - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); EVT SetccVT = - TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i32); + TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), CmpVT); SDValue Setcc = DAG.getSetCC(DL, SetccVT, Movmsk, CmpC, CondCode); SDValue Zext = DAG.getZExtOrTrunc(Setcc, DL, ExtractVT); SDValue Zero = DAG.getConstant(0, DL, ExtractVT); @@ -35431,6 +36035,7 @@ static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG, if (DCI.isBeforeLegalizeOps()) return SDValue(); + SDLoc dl(N); SDValue Src = N->getOperand(0); SDValue Idx = N->getOperand(1); @@ -35452,10 +36057,37 @@ static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG, return DAG.getBitcast(VT, SrcOp); } + // If we're extracting a single element from a broadcast load and there are + // no other users, just create a single load. 
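The vXi1 reductions rewritten above all funnel through a MOVMSK of the compare mask. A minimal standalone sketch of the equivalent scalar logic using SSE2 intrinsics (helper names are invented for illustration; a 16 x i8 compare mask of 0x00/0xFF lanes is assumed as input):

#include <emmintrin.h>   // SSE2: _mm_movemask_epi8

// any_of  -> MOVMSK != 0
static inline bool AnyOf16(__m128i Mask)  { return _mm_movemask_epi8(Mask) != 0; }
// all_of  -> MOVMSK == (1 << NumElts) - 1
static inline bool AllOf16(__m128i Mask)  { return _mm_movemask_epi8(Mask) == 0xFFFF; }
// parity  -> CTPOP(MOVMSK) & 1
static inline bool Parity16(__m128i Mask) { return __builtin_popcount(_mm_movemask_epi8(Mask)) & 1; }

The SSE1-only path added to combineBitcast earlier plays the same trick with MOVMSKPS after reinterpreting the v4i32 data as v4f32, which is safe because only the sign bits are read.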
+ if (SrcBC.getOpcode() == X86ISD::VBROADCAST_LOAD && SrcBC.hasOneUse()) { + auto *MemIntr = cast(SrcBC); + unsigned SrcBCWidth = SrcBC.getScalarValueSizeInBits(); + if (MemIntr->getMemoryVT().getSizeInBits() == SrcBCWidth && + VT.getSizeInBits() == SrcBCWidth) { + SDValue Load = DAG.getLoad(VT, dl, MemIntr->getChain(), + MemIntr->getBasePtr(), + MemIntr->getPointerInfo(), + MemIntr->getAlignment(), + MemIntr->getMemOperand()->getFlags()); + DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), Load.getValue(1)); + return Load; + } + } + + // Handle extract(truncate(x)) for 0'th index. + // TODO: Treat this as a faux shuffle? + // TODO: When can we use this for general indices? + if (ISD::TRUNCATE == Src.getOpcode() && SrcVT.is128BitVector() && + isNullConstant(Idx)) { + Src = extract128BitVector(Src.getOperand(0), 0, DAG, dl); + Src = DAG.getBitcast(SrcVT, Src); + return DAG.getNode(N->getOpcode(), dl, VT, Src, Idx); + } + // Resolve the target shuffle inputs and mask. SmallVector Mask; SmallVector Ops; - if (!resolveTargetShuffleInputs(SrcBC, Ops, Mask, DAG)) + if (!getTargetShuffleInputs(SrcBC, Ops, Mask, DAG)) return SDValue(); // Attempt to narrow/widen the shuffle mask to the correct size. @@ -35489,7 +36121,6 @@ static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG, return SDValue(); int SrcIdx = Mask[N->getConstantOperandVal(1)]; - SDLoc dl(N); // If the shuffle source element is undef/zero then we can just accept it. if (SrcIdx == SM_SentinelUndef) @@ -35584,7 +36215,7 @@ static SDValue scalarizeExtEltFP(SDNode *ExtElt, SelectionDAG &DAG) { } // TODO: This switch could include FNEG and the x86-specific FP logic ops - // (FAND, FANDN, FOR, FXOR). But that may require enhancements to avoid + // (FAND, FANDN, FOR, FXOR). But that may require enhancements to avoid // missed load folding and fma+fneg combining. switch (Vec.getOpcode()) { case ISD::FMA: // Begin 3 operands @@ -35631,27 +36262,84 @@ static SDValue scalarizeExtEltFP(SDNode *ExtElt, SelectionDAG &DAG) { static SDValue combineReductionToHorizontal(SDNode *ExtElt, SelectionDAG &DAG, const X86Subtarget &Subtarget) { assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unexpected caller"); - bool OptForSize = DAG.getMachineFunction().getFunction().hasOptSize(); - if (!Subtarget.hasFastHorizontalOps() && !OptForSize) - return SDValue(); - SDValue Index = ExtElt->getOperand(1); - if (!isNullConstant(Index)) - return SDValue(); - // TODO: Allow FADD with reduction and/or reassociation and no-signed-zeros. ISD::NodeType Opc; - SDValue Rdx = DAG.matchBinOpReduction(ExtElt, Opc, {ISD::ADD}); + SDValue Rdx = + DAG.matchBinOpReduction(ExtElt, Opc, {ISD::ADD, ISD::FADD}, true); if (!Rdx) return SDValue(); + SDValue Index = ExtElt->getOperand(1); + assert(isNullConstant(Index) && + "Reduction doesn't end in an extract from index 0"); + EVT VT = ExtElt->getValueType(0); - EVT VecVT = ExtElt->getOperand(0).getValueType(); + EVT VecVT = Rdx.getValueType(); if (VecVT.getScalarType() != VT) return SDValue(); - unsigned HorizOpcode = Opc == ISD::ADD ? X86ISD::HADD : X86ISD::FHADD; SDLoc DL(ExtElt); + // vXi8 reduction - sub 128-bit vector. + if (VecVT == MVT::v4i8 || VecVT == MVT::v8i8) { + if (VecVT == MVT::v4i8) { + // Pad with zero. 
+ if (Subtarget.hasSSE41()) { + Rdx = DAG.getBitcast(MVT::i32, Rdx); + Rdx = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v4i32, + DAG.getConstant(0, DL, MVT::v4i32), Rdx, + DAG.getIntPtrConstant(0, DL)); + Rdx = DAG.getBitcast(MVT::v16i8, Rdx); + } else { + Rdx = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i8, Rdx, + DAG.getConstant(0, DL, VecVT)); + } + } + if (Rdx.getValueType() == MVT::v8i8) { + // Pad with undef. + Rdx = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, Rdx, + DAG.getUNDEF(MVT::v8i8)); + } + Rdx = DAG.getNode(X86ISD::PSADBW, DL, MVT::v2i64, Rdx, + DAG.getConstant(0, DL, MVT::v16i8)); + Rdx = DAG.getBitcast(MVT::v16i8, Rdx); + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index); + } + + // Must be a >=128-bit vector with pow2 elements. + if ((VecVT.getSizeInBits() % 128) != 0 || + !isPowerOf2_32(VecVT.getVectorNumElements())) + return SDValue(); + + // vXi8 reduction - sum lo/hi halves then use PSADBW. + if (VT == MVT::i8) { + while (Rdx.getValueSizeInBits() > 128) { + unsigned HalfSize = VecVT.getSizeInBits() / 2; + unsigned HalfElts = VecVT.getVectorNumElements() / 2; + SDValue Lo = extractSubVector(Rdx, 0, DAG, DL, HalfSize); + SDValue Hi = extractSubVector(Rdx, HalfElts, DAG, DL, HalfSize); + Rdx = DAG.getNode(ISD::ADD, DL, Lo.getValueType(), Lo, Hi); + VecVT = Rdx.getValueType(); + } + assert(VecVT == MVT::v16i8 && "v16i8 reduction expected"); + + SDValue Hi = DAG.getVectorShuffle( + MVT::v16i8, DL, Rdx, Rdx, + {8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1}); + Rdx = DAG.getNode(ISD::ADD, DL, MVT::v16i8, Rdx, Hi); + Rdx = DAG.getNode(X86ISD::PSADBW, DL, MVT::v2i64, Rdx, + getZeroVector(MVT::v16i8, Subtarget, DAG, DL)); + Rdx = DAG.getBitcast(MVT::v16i8, Rdx); + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index); + } + + // Only use (F)HADD opcodes if they aren't microcoded or minimizes codesize. + bool OptForSize = DAG.getMachineFunction().getFunction().hasOptSize(); + if (!Subtarget.hasFastHorizontalOps() && !OptForSize) + return SDValue(); + + unsigned HorizOpcode = Opc == ISD::ADD ? X86ISD::HADD : X86ISD::FHADD; + // 256-bit horizontal instructions operate on 128-bit chunks rather than // across the whole vector, so we need an extract + hop preliminary stage. // This is the only step where the operands of the hop are not the same value. 
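The vXi8 reduction path above ends in a PSADBW against zero. A small sketch of why that computes a byte sum (illustrative helper only, SSE2 intrinsics); the combine itself then truncates the result back to i8:

#include <emmintrin.h>   // SSE2: _mm_sad_epu8
#include <cstdint>

// psadbw with a zero vector sums each group of eight bytes into a 16-bit
// value (i16 lanes 0 and 4 of the result); adding the two partial sums gives
// the full 16-byte total, which is then truncated to i8 by the caller.
static inline uint8_t SumBytes16(__m128i V) {
  __m128i Sad = _mm_sad_epu8(V, _mm_setzero_si128());
  return (uint8_t)(_mm_extract_epi16(Sad, 0) + _mm_extract_epi16(Sad, 4));
}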
@@ -35661,15 +36349,14 @@ static SDValue combineReductionToHorizontal(SDNode *ExtElt, SelectionDAG &DAG, unsigned NumElts = VecVT.getVectorNumElements(); SDValue Hi = extract128BitVector(Rdx, NumElts / 2, DAG, DL); SDValue Lo = extract128BitVector(Rdx, 0, DAG, DL); - VecVT = EVT::getVectorVT(*DAG.getContext(), VT, NumElts / 2); - Rdx = DAG.getNode(HorizOpcode, DL, VecVT, Hi, Lo); + Rdx = DAG.getNode(HorizOpcode, DL, Lo.getValueType(), Hi, Lo); + VecVT = Rdx.getValueType(); } if (!((VecVT == MVT::v8i16 || VecVT == MVT::v4i32) && Subtarget.hasSSSE3()) && !((VecVT == MVT::v4f32 || VecVT == MVT::v2f64) && Subtarget.hasSSE3())) return SDValue(); // extract (add (shuf X), X), 0 --> extract (hadd X, X), 0 - assert(Rdx.getValueType() == VecVT && "Unexpected reduction match"); unsigned ReductionSteps = Log2_32(VecVT.getVectorNumElements()); for (unsigned i = 0; i != ReductionSteps; ++i) Rdx = DAG.getNode(HorizOpcode, DL, VecVT, Rdx, Rdx); @@ -35714,15 +36401,26 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG, } } - // TODO - Remove this once we can handle the implicit zero-extension of - // X86ISD::PEXTRW/X86ISD::PEXTRB in: - // XFormVExtractWithShuffleIntoLoad, combineHorizontalPredicateResult and - // combineBasicSADPattern. if (IsPextr) { const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (TLI.SimplifyDemandedBits( SDValue(N, 0), APInt::getAllOnesValue(VT.getSizeInBits()), DCI)) return SDValue(N, 0); + + // PEXTR*(PINSR*(v, s, c), c) -> s (with implicit zext handling). + if ((InputVector.getOpcode() == X86ISD::PINSRB || + InputVector.getOpcode() == X86ISD::PINSRW) && + InputVector.getOperand(2) == EltIdx) { + assert(SrcVT == InputVector.getOperand(0).getValueType() && + "Vector type mismatch"); + SDValue Scl = InputVector.getOperand(1); + Scl = DAG.getNode(ISD::TRUNCATE, dl, SrcVT.getScalarType(), Scl); + return DAG.getZExtOrTrunc(Scl, dl, VT); + } + + // TODO - Remove this once we can handle the implicit zero-extension of + // X86ISD::PEXTRW/X86ISD::PEXTRB in XFormVExtractWithShuffleIntoLoad, + // combineHorizontalPredicateResult and combineBasicSADPattern. return SDValue(); } @@ -35832,6 +36530,15 @@ combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG, // get simplified at node creation time)? bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode()); bool FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode()); + + // If both inputs are 0/undef, create a complete zero vector. + // FIXME: As noted above this should be handled by DAGCombiner/getNode. + if (TValIsAllZeros && FValIsAllZeros) { + if (VT.isFloatingPoint()) + return DAG.getConstantFP(0.0, DL, VT); + return DAG.getConstant(0, DL, VT); + } + if (TValIsAllZeros && !FValIsAllZeros && Subtarget.hasAVX512() && Cond.hasOneUse() && CondVT.getVectorElementType() == MVT::i1) { // Invert the cond to not(cond) : xor(op,allones)=not(op) @@ -36295,8 +37002,6 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, // Since SKX these selects have a proper lowering. 
if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && CondVT.isVector() && CondVT.getVectorElementType() == MVT::i1 && - (ExperimentalVectorWideningLegalization || - VT.getVectorNumElements() > 4) && (VT.getVectorElementType() == MVT::i8 || VT.getVectorElementType() == MVT::i16)) { Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Cond); @@ -36358,6 +37063,7 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, // subl %esi, $edi // cmovsl %eax, %edi if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC && + Cond.hasOneUse() && DAG.isEqualTo(LHS, Cond.getOperand(0)) && DAG.isEqualTo(RHS, Cond.getOperand(1))) { ISD::CondCode CC = cast(Cond.getOperand(2))->get(); @@ -36508,6 +37214,12 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, if (SDValue V = narrowVectorSelect(N, DAG, Subtarget)) return V; + // select(~Cond, X, Y) -> select(Cond, Y, X) + if (CondVT.getScalarType() != MVT::i1) + if (SDValue CondNot = IsNOT(Cond, DAG)) + return DAG.getNode(N->getOpcode(), DL, VT, + DAG.getBitcast(CondVT, CondNot), RHS, LHS); + // Custom action for SELECT MMX if (VT == MVT::x86mmx) { LHS = DAG.getBitcast(MVT::i64, LHS); @@ -36873,8 +37585,8 @@ static SDValue combineCMov(SDNode *N, SelectionDAG &DAG, // We can't always do this as FCMOV only supports a subset of X86 cond. if (SDValue Flags = combineSetCCEFLAGS(Cond, CC, DAG, Subtarget)) { if (FalseOp.getValueType() != MVT::f80 || hasFPCMov(CC)) { - SDValue Ops[] = {FalseOp, TrueOp, DAG.getConstant(CC, DL, MVT::i8), - Flags}; + SDValue Ops[] = {FalseOp, TrueOp, DAG.getTargetConstant(CC, DL, MVT::i8), + Flags}; return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops); } } @@ -36923,12 +37635,13 @@ static SDValue combineCMov(SDNode *N, SelectionDAG &DAG, // Optimize cases that will turn into an LEA instruction. This requires // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9). if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) { - uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue(); - if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff; + APInt Diff = TrueC->getAPIntValue() - FalseC->getAPIntValue(); + assert(Diff.getBitWidth() == N->getValueType(0).getSizeInBits() && + "Implicit constant truncation"); bool isFastMultiplier = false; - if (Diff < 10) { - switch ((unsigned char)Diff) { + if (Diff.ult(10)) { + switch (Diff.getZExtValue()) { default: break; case 1: // result = add base, cond case 2: // result = lea base( , cond*2) @@ -36943,7 +37656,6 @@ static SDValue combineCMov(SDNode *N, SelectionDAG &DAG, } if (isFastMultiplier) { - APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue(); Cond = getSETCC(CC, Cond, DL ,DAG); // Zero extend the condition if needed. 
Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0), @@ -36994,8 +37706,8 @@ static SDValue combineCMov(SDNode *N, SelectionDAG &DAG, if (CC == X86::COND_E && CmpAgainst == dyn_cast(TrueOp)) { - SDValue Ops[] = { FalseOp, Cond.getOperand(0), - DAG.getConstant(CC, DL, MVT::i8), Cond }; + SDValue Ops[] = {FalseOp, Cond.getOperand(0), + DAG.getTargetConstant(CC, DL, MVT::i8), Cond}; return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops); } } @@ -37029,10 +37741,11 @@ static SDValue combineCMov(SDNode *N, SelectionDAG &DAG, CC1 = X86::GetOppositeBranchCondition(CC1); } - SDValue LOps[] = {FalseOp, TrueOp, DAG.getConstant(CC0, DL, MVT::i8), - Flags}; + SDValue LOps[] = {FalseOp, TrueOp, + DAG.getTargetConstant(CC0, DL, MVT::i8), Flags}; SDValue LCMOV = DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), LOps); - SDValue Ops[] = {LCMOV, TrueOp, DAG.getConstant(CC1, DL, MVT::i8), Flags}; + SDValue Ops[] = {LCMOV, TrueOp, DAG.getTargetConstant(CC1, DL, MVT::i8), + Flags}; SDValue CMOV = DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops); return CMOV; } @@ -37064,9 +37777,9 @@ static SDValue combineCMov(SDNode *N, SelectionDAG &DAG, EVT VT = N->getValueType(0); // This should constant fold. SDValue Diff = DAG.getNode(ISD::SUB, DL, VT, Const, Add.getOperand(1)); - SDValue CMov = DAG.getNode(X86ISD::CMOV, DL, VT, Diff, Add.getOperand(0), - DAG.getConstant(X86::COND_NE, DL, MVT::i8), - Cond); + SDValue CMov = + DAG.getNode(X86ISD::CMOV, DL, VT, Diff, Add.getOperand(0), + DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8), Cond); return DAG.getNode(ISD::ADD, DL, VT, CMov, Add.getOperand(1)); } } @@ -37166,98 +37879,45 @@ static SDValue reduceVMULWidth(SDNode *N, SelectionDAG &DAG, if ((NumElts % 2) != 0) return SDValue(); - unsigned RegSize = 128; - MVT OpsVT = MVT::getVectorVT(MVT::i16, RegSize / 16); EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts); // Shrink the operands of mul. SDValue NewN0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N0); SDValue NewN1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N1); - if (ExperimentalVectorWideningLegalization || - NumElts >= OpsVT.getVectorNumElements()) { - // Generate the lower part of mul: pmullw. For MULU8/MULS8, only the - // lower part is needed. - SDValue MulLo = DAG.getNode(ISD::MUL, DL, ReducedVT, NewN0, NewN1); - if (Mode == MULU8 || Mode == MULS8) - return DAG.getNode((Mode == MULU8) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND, - DL, VT, MulLo); - - MVT ResVT = MVT::getVectorVT(MVT::i32, NumElts / 2); - // Generate the higher part of mul: pmulhw/pmulhuw. For MULU16/MULS16, - // the higher part is also needed. - SDValue MulHi = DAG.getNode(Mode == MULS16 ? ISD::MULHS : ISD::MULHU, DL, - ReducedVT, NewN0, NewN1); - - // Repack the lower part and higher part result of mul into a wider - // result. - // Generate shuffle functioning as punpcklwd. - SmallVector ShuffleMask(NumElts); - for (unsigned i = 0, e = NumElts / 2; i < e; i++) { - ShuffleMask[2 * i] = i; - ShuffleMask[2 * i + 1] = i + NumElts; - } - SDValue ResLo = - DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask); - ResLo = DAG.getBitcast(ResVT, ResLo); - // Generate shuffle functioning as punpckhwd. 
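For the select-of-constants path in combineCMov above, when the difference between the two constants is an LEA-friendly multiplier the cmov is replaced by a setcc plus an add/lea. A scalar model of that transform (illustrative only, helper name invented):

#include <cstdint>

// cond ? TrueC : FalseC  ==>  FalseC + zext(cond) * (TrueC - FalseC),
// a single ADD or LEA when the difference is 1, 2, 3, 4, 5, 8 or 9.
static inline int64_t SelectConst(bool Cond, int64_t TrueC, int64_t FalseC) {
  int64_t Diff = TrueC - FalseC;           // assumed to be a "fast multiplier"
  return FalseC + (int64_t)Cond * Diff;    // matches the switch cases above
}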
- for (unsigned i = 0, e = NumElts / 2; i < e; i++) { - ShuffleMask[2 * i] = i + NumElts / 2; - ShuffleMask[2 * i + 1] = i + NumElts * 3 / 2; - } - SDValue ResHi = - DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask); - ResHi = DAG.getBitcast(ResVT, ResHi); - return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ResLo, ResHi); - } - - // When VT.getVectorNumElements() < OpsVT.getVectorNumElements(), we want - // to legalize the mul explicitly because implicit legalization for type - // <4 x i16> to <4 x i32> sometimes involves unnecessary unpack - // instructions which will not exist when we explicitly legalize it by - // extending <4 x i16> to <8 x i16> (concatenating the <4 x i16> val with - // <4 x i16> undef). - // - // Legalize the operands of mul. - // FIXME: We may be able to handle non-concatenated vectors by insertion. - unsigned ReducedSizeInBits = ReducedVT.getSizeInBits(); - if ((RegSize % ReducedSizeInBits) != 0) - return SDValue(); - - SmallVector Ops(RegSize / ReducedSizeInBits, - DAG.getUNDEF(ReducedVT)); - Ops[0] = NewN0; - NewN0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, OpsVT, Ops); - Ops[0] = NewN1; - NewN1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, OpsVT, Ops); - - if (Mode == MULU8 || Mode == MULS8) { - // Generate lower part of mul: pmullw. For MULU8/MULS8, only the lower - // part is needed. - SDValue Mul = DAG.getNode(ISD::MUL, DL, OpsVT, NewN0, NewN1); - - // convert the type of mul result to VT. - MVT ResVT = MVT::getVectorVT(MVT::i32, RegSize / 32); - SDValue Res = DAG.getNode(Mode == MULU8 ? ISD::ZERO_EXTEND_VECTOR_INREG - : ISD::SIGN_EXTEND_VECTOR_INREG, - DL, ResVT, Mul); - return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res, - DAG.getIntPtrConstant(0, DL)); - } + // Generate the lower part of mul: pmullw. For MULU8/MULS8, only the + // lower part is needed. + SDValue MulLo = DAG.getNode(ISD::MUL, DL, ReducedVT, NewN0, NewN1); + if (Mode == MULU8 || Mode == MULS8) + return DAG.getNode((Mode == MULU8) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND, + DL, VT, MulLo); - // Generate the lower and higher part of mul: pmulhw/pmulhuw. For - // MULU16/MULS16, both parts are needed. - SDValue MulLo = DAG.getNode(ISD::MUL, DL, OpsVT, NewN0, NewN1); + MVT ResVT = MVT::getVectorVT(MVT::i32, NumElts / 2); + // Generate the higher part of mul: pmulhw/pmulhuw. For MULU16/MULS16, + // the higher part is also needed. SDValue MulHi = DAG.getNode(Mode == MULS16 ? ISD::MULHS : ISD::MULHU, DL, - OpsVT, NewN0, NewN1); + ReducedVT, NewN0, NewN1); // Repack the lower part and higher part result of mul into a wider - // result. Make sure the type of mul result is VT. - MVT ResVT = MVT::getVectorVT(MVT::i32, RegSize / 32); - SDValue Res = getUnpackl(DAG, DL, OpsVT, MulLo, MulHi); - Res = DAG.getBitcast(ResVT, Res); - return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res, - DAG.getIntPtrConstant(0, DL)); + // result. + // Generate shuffle functioning as punpcklwd. + SmallVector ShuffleMask(NumElts); + for (unsigned i = 0, e = NumElts / 2; i < e; i++) { + ShuffleMask[2 * i] = i; + ShuffleMask[2 * i + 1] = i + NumElts; + } + SDValue ResLo = + DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask); + ResLo = DAG.getBitcast(ResVT, ResLo); + // Generate shuffle functioning as punpckhwd. 
+ for (unsigned i = 0, e = NumElts / 2; i < e; i++) { + ShuffleMask[2 * i] = i + NumElts / 2; + ShuffleMask[2 * i + 1] = i + NumElts * 3 / 2; + } + SDValue ResHi = + DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask); + ResHi = DAG.getBitcast(ResVT, ResHi); + return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ResLo, ResHi); } static SDValue combineMulSpecial(uint64_t MulAmt, SDNode *N, SelectionDAG &DAG, @@ -37365,8 +38025,7 @@ static SDValue combineMulToPMADDWD(SDNode *N, SelectionDAG &DAG, // Make sure the vXi16 type is legal. This covers the AVX512 without BWI case. // Also allow v2i32 if it will be widened. MVT WVT = MVT::getVectorVT(MVT::i16, 2 * VT.getVectorNumElements()); - if (!((ExperimentalVectorWideningLegalization && VT == MVT::v2i32) || - DAG.getTargetLoweringInfo().isTypeLegal(WVT))) + if (VT != MVT::v2i32 && !DAG.getTargetLoweringInfo().isTypeLegal(WVT)) return SDValue(); SDValue N0 = N->getOperand(0); @@ -37919,7 +38578,7 @@ static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG, if (NewShiftVal >= NumBitsPerElt) NewShiftVal = NumBitsPerElt - 1; return DAG.getNode(X86ISD::VSRAI, SDLoc(N), VT, N0.getOperand(0), - DAG.getConstant(NewShiftVal, SDLoc(N), MVT::i8)); + DAG.getTargetConstant(NewShiftVal, SDLoc(N), MVT::i8)); } // We can decode 'whole byte' logical bit shifts as shuffles. @@ -38039,7 +38698,7 @@ static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG, if (Subtarget.hasAVX512()) { SDValue FSetCC = DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CMP00, CMP01, - DAG.getConstant(x86cc, DL, MVT::i8)); + DAG.getTargetConstant(x86cc, DL, MVT::i8)); // Need to fill with zeros to ensure the bitcast will produce zeroes // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that. SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v16i1, @@ -38048,10 +38707,9 @@ static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG, return DAG.getZExtOrTrunc(DAG.getBitcast(MVT::i16, Ins), DL, N->getSimpleValueType(0)); } - SDValue OnesOrZeroesF = DAG.getNode(X86ISD::FSETCC, DL, - CMP00.getValueType(), CMP00, CMP01, - DAG.getConstant(x86cc, DL, - MVT::i8)); + SDValue OnesOrZeroesF = + DAG.getNode(X86ISD::FSETCC, DL, CMP00.getValueType(), CMP00, + CMP01, DAG.getTargetConstant(x86cc, DL, MVT::i8)); bool is64BitFP = (CMP00.getValueType() == MVT::f64); MVT IntVT = is64BitFP ? MVT::i64 : MVT::i32; @@ -38083,34 +38741,6 @@ static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG, return SDValue(); } -// Match (xor X, -1) -> X. -// Match extract_subvector(xor X, -1) -> extract_subvector(X). -// Match concat_vectors(xor X, -1, xor Y, -1) -> concat_vectors(X, Y). 
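The PMULLW/PMULHW plus unpack repacking that reduceVMULWidth now always uses (above) can be seen directly with intrinsics. A minimal sketch for the signed MULS16 case, with an invented helper name:

#include <emmintrin.h>   // SSE2

// Interleaving the low and high 16-bit halves of each product with
// punpcklwd/punpckhwd reconstructs the eight full 32-bit products.
static inline void Mul16To32(__m128i A, __m128i B, __m128i &Lo, __m128i &Hi) {
  __m128i MulLo = _mm_mullo_epi16(A, B);   // low 16 bits of each product
  __m128i MulHi = _mm_mulhi_epi16(A, B);   // high 16 bits (signed)
  Lo = _mm_unpacklo_epi16(MulLo, MulHi);   // 32-bit products of lanes 0..3
  Hi = _mm_unpackhi_epi16(MulLo, MulHi);   // 32-bit products of lanes 4..7
}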
-static SDValue IsNOT(SDValue V, SelectionDAG &DAG) { - V = peekThroughBitcasts(V); - if (V.getOpcode() == ISD::XOR && - ISD::isBuildVectorAllOnes(V.getOperand(1).getNode())) - return V.getOperand(0); - if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR && - (isNullConstant(V.getOperand(1)) || V.getOperand(0).hasOneUse())) { - if (SDValue Not = IsNOT(V.getOperand(0), DAG)) { - Not = DAG.getBitcast(V.getOperand(0).getValueType(), Not); - return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(Not), V.getValueType(), - Not, V.getOperand(1)); - } - } - SmallVector CatOps; - if (collectConcatOps(V.getNode(), CatOps)) { - for (SDValue &CatOp : CatOps) { - SDValue NotCat = IsNOT(CatOp, DAG); - if (!NotCat) return SDValue(); - CatOp = DAG.getBitcast(CatOp.getValueType(), NotCat); - } - return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(V), V.getValueType(), CatOps); - } - return SDValue(); -} - /// Try to fold: (and (xor X, -1), Y) -> (andnp X, Y). static SDValue combineANDXORWithAllOnesIntoANDNP(SDNode *N, SelectionDAG &DAG) { assert(N->getOpcode() == ISD::AND); @@ -38273,7 +38903,7 @@ static SDValue combineAndMaskToShift(SDNode *N, SelectionDAG &DAG, SDLoc DL(N); unsigned ShiftVal = SplatVal.countTrailingOnes(); - SDValue ShAmt = DAG.getConstant(EltBitWidth - ShiftVal, DL, MVT::i8); + SDValue ShAmt = DAG.getTargetConstant(EltBitWidth - ShiftVal, DL, MVT::i8); SDValue Shift = DAG.getNode(X86ISD::VSRLI, DL, VT0, Op0, ShAmt); return DAG.getBitcast(N->getValueType(0), Shift); } @@ -38499,7 +39129,7 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG, // TODO: Support multiple SrcOps. if (VT == MVT::i1) { SmallVector SrcOps; - if (matchBitOpReduction(SDValue(N, 0), ISD::AND, SrcOps) && + if (matchScalarReduction(SDValue(N, 0), ISD::AND, SrcOps) && SrcOps.size() == 1) { SDLoc dl(N); unsigned NumElts = SrcOps[0].getValueType().getVectorNumElements(); @@ -38570,7 +39200,7 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG, } if (SDValue Shuffle = combineX86ShufflesRecursively( - {SrcVec}, 0, SrcVec, ShuffleMask, {}, /*Depth*/ 2, + {SrcVec}, 0, SrcVec, ShuffleMask, {}, /*Depth*/ 1, /*HasVarMask*/ false, /*AllowVarMask*/ true, DAG, Subtarget)) return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N), VT, Shuffle, N->getOperand(0).getOperand(1)); @@ -38585,7 +39215,7 @@ static SDValue canonicalizeBitSelect(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { assert(N->getOpcode() == ISD::OR && "Unexpected Opcode"); - EVT VT = N->getValueType(0); + MVT VT = N->getSimpleValueType(0); if (!VT.isVector() || (VT.getScalarSizeInBits() % 8) != 0) return SDValue(); @@ -38594,10 +39224,12 @@ static SDValue canonicalizeBitSelect(SDNode *N, SelectionDAG &DAG, if (N0.getOpcode() != ISD::AND || N1.getOpcode() != ISD::AND) return SDValue(); - // On XOP we'll lower to PCMOV so accept one use, otherwise only - // do this if either mask has multiple uses already. - if (!(Subtarget.hasXOP() || !N0.getOperand(1).hasOneUse() || - !N1.getOperand(1).hasOneUse())) + // On XOP we'll lower to PCMOV so accept one use. With AVX512, we can use + // VPTERNLOG. Otherwise only do this if either mask has multiple uses already. + bool UseVPTERNLOG = (Subtarget.hasAVX512() && VT.is512BitVector()) || + Subtarget.hasVLX(); + if (!(Subtarget.hasXOP() || UseVPTERNLOG || + !N0.getOperand(1).hasOneUse() || !N1.getOperand(1).hasOneUse())) return SDValue(); // Attempt to extract constant byte masks. 
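canonicalizeBitSelect above now also accepts single-use masks whenever XOP or AVX512 can lower the whole pattern in one instruction (VPCMOV / VPTERNLOG). The pattern itself is the classic bit select, sketched here with plain SSE2 intrinsics and an invented helper name:

#include <emmintrin.h>   // SSE2

// (A & M) | (B & ~M): bits of A where the mask is set, bits of B elsewhere.
static inline __m128i BitSelect(__m128i M, __m128i A, __m128i B) {
  return _mm_or_si128(_mm_and_si128(M, A), _mm_andnot_si128(M, B));
}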
@@ -38895,6 +39527,24 @@ static SDValue combineOr(SDNode *N, SelectionDAG &DAG, DAG.getBitcast(MVT::v4f32, N1))); } + // Match any-of bool scalar reductions into a bitcast/movmsk + cmp. + // TODO: Support multiple SrcOps. + if (VT == MVT::i1) { + SmallVector SrcOps; + if (matchScalarReduction(SDValue(N, 0), ISD::OR, SrcOps) && + SrcOps.size() == 1) { + SDLoc dl(N); + unsigned NumElts = SrcOps[0].getValueType().getVectorNumElements(); + EVT MaskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts); + SDValue Mask = combineBitcastvxi1(DAG, MaskVT, SrcOps[0], dl, Subtarget); + if (Mask) { + APInt AllBits = APInt::getNullValue(NumElts); + return DAG.getSetCC(dl, MVT::i1, Mask, + DAG.getConstant(AllBits, dl, MaskVT), ISD::SETNE); + } + } + } + if (DCI.isBeforeLegalizeOps()) return SDValue(); @@ -39136,26 +39786,6 @@ static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG, return DAG.getNode(X86ISD::PCMPGT, SDLoc(N), VT, Shift.getOperand(0), Ones); } -/// Check if truncation with saturation form type \p SrcVT to \p DstVT -/// is valid for the given \p Subtarget. -static bool isSATValidOnAVX512Subtarget(EVT SrcVT, EVT DstVT, - const X86Subtarget &Subtarget) { - if (!Subtarget.hasAVX512()) - return false; - - // FIXME: Scalar type may be supported if we move it to vector register. - if (!SrcVT.isVector()) - return false; - - EVT SrcElVT = SrcVT.getScalarType(); - EVT DstElVT = DstVT.getScalarType(); - if (DstElVT != MVT::i8 && DstElVT != MVT::i16 && DstElVT != MVT::i32) - return false; - if (SrcVT.is512BitVector() || Subtarget.hasVLX()) - return SrcElVT.getSizeInBits() >= 32 || Subtarget.hasBWI(); - return false; -} - /// Detect patterns of truncation with unsigned saturation: /// /// 1. (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type). @@ -39253,64 +39883,61 @@ static SDValue detectSSatPattern(SDValue In, EVT VT, bool MatchPackUS = false) { return SDValue(); } -/// Detect a pattern of truncation with signed saturation. -/// The types should allow to use VPMOVSS* instruction on AVX512. -/// Return the source value to be truncated or SDValue() if the pattern was not -/// matched. -static SDValue detectAVX512SSatPattern(SDValue In, EVT VT, - const X86Subtarget &Subtarget, - const TargetLowering &TLI) { - if (!TLI.isTypeLegal(In.getValueType())) - return SDValue(); - if (!isSATValidOnAVX512Subtarget(In.getValueType(), VT, Subtarget)) - return SDValue(); - return detectSSatPattern(In, VT); -} - -/// Detect a pattern of truncation with saturation: -/// (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type). -/// The types should allow to use VPMOVUS* instruction on AVX512. -/// Return the source value to be truncated or SDValue() if the pattern was not -/// matched. 
-static SDValue detectAVX512USatPattern(SDValue In, EVT VT, SelectionDAG &DAG, - const SDLoc &DL, - const X86Subtarget &Subtarget, - const TargetLowering &TLI) { - if (!TLI.isTypeLegal(In.getValueType())) - return SDValue(); - if (!isSATValidOnAVX512Subtarget(In.getValueType(), VT, Subtarget)) - return SDValue(); - return detectUSatPattern(In, VT, DAG, DL); -} - static SDValue combineTruncateWithSat(SDValue In, EVT VT, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget) { - EVT SVT = VT.getScalarType(); + if (!Subtarget.hasSSE2() || !VT.isVector()) + return SDValue(); + + EVT SVT = VT.getVectorElementType(); EVT InVT = In.getValueType(); - EVT InSVT = InVT.getScalarType(); - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - if (TLI.isTypeLegal(InVT) && TLI.isTypeLegal(VT) && - isSATValidOnAVX512Subtarget(InVT, VT, Subtarget)) { - if (auto SSatVal = detectSSatPattern(In, VT)) - return DAG.getNode(X86ISD::VTRUNCS, DL, VT, SSatVal); - if (auto USatVal = detectUSatPattern(In, VT, DAG, DL)) - return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, USatVal); - } - if (VT.isVector() && isPowerOf2_32(VT.getVectorNumElements()) && - !Subtarget.hasAVX512() && + EVT InSVT = InVT.getVectorElementType(); + + // If we're clamping a signed 32-bit vector to 0-255 and the 32-bit vector is + // split across two registers. We can use a packusdw+perm to clamp to 0-65535 + // and concatenate at the same time. Then we can use a final vpmovuswb to + // clip to 0-255. + if (Subtarget.hasBWI() && !Subtarget.useAVX512Regs() && + InVT == MVT::v16i32 && VT == MVT::v16i8) { + if (auto USatVal = detectSSatPattern(In, VT, true)) { + // Emit a VPACKUSDW+VPERMQ followed by a VPMOVUSWB. + SDValue Mid = truncateVectorWithPACK(X86ISD::PACKUS, MVT::v16i16, USatVal, + DL, DAG, Subtarget); + assert(Mid && "Failed to pack!"); + return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, Mid); + } + } + + // vXi32 truncate instructions are available with AVX512F. + // vXi16 truncate instructions are only available with AVX512BW. + // For 256-bit or smaller vectors, we require VLX. + // FIXME: We could widen truncates to 512 to remove the VLX restriction. + // If the result type is 256-bits or larger and we have disable 512-bit + // registers, we should go ahead and use the pack instructions if possible. + bool PreferAVX512 = ((Subtarget.hasAVX512() && InSVT == MVT::i32) || + (Subtarget.hasBWI() && InSVT == MVT::i16)) && + (InVT.getSizeInBits() > 128) && + (Subtarget.hasVLX() || InVT.getSizeInBits() > 256) && + !(!Subtarget.useAVX512Regs() && VT.getSizeInBits() >= 256); + + if (isPowerOf2_32(VT.getVectorNumElements()) && !PreferAVX512 && + VT.getSizeInBits() >= 64 && (SVT == MVT::i8 || SVT == MVT::i16) && (InSVT == MVT::i16 || InSVT == MVT::i32)) { if (auto USatVal = detectSSatPattern(In, VT, true)) { // vXi32 -> vXi8 must be performed as PACKUSWB(PACKSSDW,PACKSSDW). + // Only do this when the result is at least 64 bits or we'll leaving + // dangling PACKSSDW nodes. 
if (SVT == MVT::i8 && InSVT == MVT::i32) { EVT MidVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, VT.getVectorNumElements()); SDValue Mid = truncateVectorWithPACK(X86ISD::PACKSS, MidVT, USatVal, DL, DAG, Subtarget); - if (Mid) - return truncateVectorWithPACK(X86ISD::PACKUS, VT, Mid, DL, DAG, - Subtarget); + assert(Mid && "Failed to pack!"); + SDValue V = truncateVectorWithPACK(X86ISD::PACKUS, VT, Mid, DL, DAG, + Subtarget); + assert(V && "Failed to pack!"); + return V; } else if (SVT == MVT::i8 || Subtarget.hasSSE41()) return truncateVectorWithPACK(X86ISD::PACKUS, VT, USatVal, DL, DAG, Subtarget); @@ -39319,6 +39946,42 @@ static SDValue combineTruncateWithSat(SDValue In, EVT VT, const SDLoc &DL, return truncateVectorWithPACK(X86ISD::PACKSS, VT, SSatVal, DL, DAG, Subtarget); } + + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + if (TLI.isTypeLegal(InVT) && InVT.isVector() && SVT != MVT::i1 && + Subtarget.hasAVX512() && (InSVT != MVT::i16 || Subtarget.hasBWI())) { + unsigned TruncOpc; + SDValue SatVal; + if (auto SSatVal = detectSSatPattern(In, VT)) { + SatVal = SSatVal; + TruncOpc = X86ISD::VTRUNCS; + } else if (auto USatVal = detectUSatPattern(In, VT, DAG, DL)) { + SatVal = USatVal; + TruncOpc = X86ISD::VTRUNCUS; + } + if (SatVal) { + unsigned ResElts = VT.getVectorNumElements(); + // If the input type is less than 512 bits and we don't have VLX, we need + // to widen to 512 bits. + if (!Subtarget.hasVLX() && !InVT.is512BitVector()) { + unsigned NumConcats = 512 / InVT.getSizeInBits(); + ResElts *= NumConcats; + SmallVector ConcatOps(NumConcats, DAG.getUNDEF(InVT)); + ConcatOps[0] = SatVal; + InVT = EVT::getVectorVT(*DAG.getContext(), InSVT, + NumConcats * InVT.getVectorNumElements()); + SatVal = DAG.getNode(ISD::CONCAT_VECTORS, DL, InVT, ConcatOps); + } + // Widen the result if its narrower than 128 bits. + if (ResElts * SVT.getSizeInBits() < 128) + ResElts = 128 / SVT.getSizeInBits(); + EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), SVT, ResElts); + SDValue Res = DAG.getNode(TruncOpc, DL, TruncVT, SatVal); + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res, + DAG.getIntPtrConstant(0, DL)); + } + } + return SDValue(); } @@ -39377,7 +40040,7 @@ static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG, return true; }; - // Check if each element of the vector is left-shifted by one. + // Check if each element of the vector is right-shifted by one. auto LHS = In.getOperand(0); auto RHS = In.getOperand(1); if (!IsConstVectorInRange(RHS, 1, 1)) @@ -39679,90 +40342,7 @@ static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG, return Blend; } - if (Mld->getExtensionType() != ISD::EXTLOAD) - return SDValue(); - - // Resolve extending loads. - EVT VT = Mld->getValueType(0); - unsigned NumElems = VT.getVectorNumElements(); - EVT LdVT = Mld->getMemoryVT(); - SDLoc dl(Mld); - - assert(LdVT != VT && "Cannot extend to the same type"); - unsigned ToSz = VT.getScalarSizeInBits(); - unsigned FromSz = LdVT.getScalarSizeInBits(); - // From/To sizes and ElemCount must be pow of two. - assert (isPowerOf2_32(NumElems * FromSz * ToSz) && - "Unexpected size for extending masked load"); - - unsigned SizeRatio = ToSz / FromSz; - assert(SizeRatio * NumElems * FromSz == VT.getSizeInBits()); - - // Create a type on which we perform the shuffle. - EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), - LdVT.getScalarType(), NumElems*SizeRatio); - assert(WideVecVT.getSizeInBits() == VT.getSizeInBits()); - - // Convert PassThru value. 
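The saturation detection used by combineTruncateWithSat above matches min/max clamps feeding a truncate. The scalar shape of the two patterns, which PACKSS/PACKUS (and the AVX512 saturating VPMOV* truncates) implement per lane, is roughly the following sketch (helper names invented):

#include <algorithm>
#include <cstdint>

// Signed saturation: clamp to [-128, 127], then truncate (packsswb behaviour).
static inline int8_t TruncSSat(int16_t X) {
  return (int8_t)std::min<int16_t>(std::max<int16_t>(X, -128), 127);
}
// Unsigned saturation of a signed input: clamp to [0, 255] (packuswb behaviour).
static inline uint8_t TruncUSat(int16_t X) {
  return (uint8_t)std::min<int16_t>(std::max<int16_t>(X, 0), 255);
}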
- SDValue WidePassThru = DAG.getBitcast(WideVecVT, Mld->getPassThru()); - if (!Mld->getPassThru().isUndef()) { - SmallVector ShuffleVec(NumElems * SizeRatio, -1); - for (unsigned i = 0; i != NumElems; ++i) - ShuffleVec[i] = i * SizeRatio; - - // Can't shuffle using an illegal type. - assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) && - "WideVecVT should be legal"); - WidePassThru = DAG.getVectorShuffle(WideVecVT, dl, WidePassThru, - DAG.getUNDEF(WideVecVT), ShuffleVec); - } - - // Prepare the new mask. - SDValue NewMask; - SDValue Mask = Mld->getMask(); - if (Mask.getValueType() == VT) { - // Mask and original value have the same type. - NewMask = DAG.getBitcast(WideVecVT, Mask); - SmallVector ShuffleVec(NumElems * SizeRatio, -1); - for (unsigned i = 0; i != NumElems; ++i) - ShuffleVec[i] = i * SizeRatio; - for (unsigned i = NumElems; i != NumElems * SizeRatio; ++i) - ShuffleVec[i] = NumElems * SizeRatio; - NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask, - DAG.getConstant(0, dl, WideVecVT), - ShuffleVec); - } else { - assert(Mask.getValueType().getVectorElementType() == MVT::i1); - unsigned WidenNumElts = NumElems*SizeRatio; - unsigned MaskNumElts = VT.getVectorNumElements(); - EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, - WidenNumElts); - - unsigned NumConcat = WidenNumElts / MaskNumElts; - SDValue ZeroVal = DAG.getConstant(0, dl, Mask.getValueType()); - SmallVector Ops(NumConcat, ZeroVal); - Ops[0] = Mask; - NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops); - } - - SDValue WideLd = DAG.getMaskedLoad(WideVecVT, dl, Mld->getChain(), - Mld->getBasePtr(), NewMask, WidePassThru, - Mld->getMemoryVT(), Mld->getMemOperand(), - ISD::NON_EXTLOAD); - - SDValue SlicedVec = DAG.getBitcast(WideVecVT, WideLd); - SmallVector ShuffleVec(NumElems * SizeRatio, -1); - for (unsigned i = 0; i != NumElems; ++i) - ShuffleVec[i * SizeRatio] = i; - - // Can't shuffle using an illegal type. - assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) && - "WideVecVT should be legal"); - SlicedVec = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec, - DAG.getUNDEF(WideVecVT), ShuffleVec); - SlicedVec = DAG.getBitcast(VT, SlicedVec); - - return DCI.CombineTo(N, SlicedVec, WideLd.getValue(1), true); + return SDValue(); } /// If exactly one element of the mask is set for a non-truncating masked store, @@ -39800,123 +40380,45 @@ static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG, return SDValue(); EVT VT = Mst->getValue().getValueType(); - EVT StVT = Mst->getMemoryVT(); SDLoc dl(Mst); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - if (!Mst->isTruncatingStore()) { - if (SDValue ScalarStore = reduceMaskedStoreToScalarStore(Mst, DAG)) - return ScalarStore; - - // If the mask value has been legalized to a non-boolean vector, try to - // simplify ops leading up to it. We only demand the MSB of each lane. - SDValue Mask = Mst->getMask(); - if (Mask.getScalarValueSizeInBits() != 1) { - APInt DemandedMask(APInt::getSignMask(VT.getScalarSizeInBits())); - if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI)) - return SDValue(N, 0); - } - - // TODO: AVX512 targets should also be able to simplify something like the - // pattern above, but that pattern will be different. It will either need to - // match setcc more generally or match PCMPGTM later (in tablegen?). 
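The SimplifyDemandedBits call on the mask (above, and again in the rewritten combineMaskedStore below) demands only the sign bit of each lane because the legacy masked-move and variable-blend instructions consult nothing else. A small, hedged analogue using BLENDVB (SSE4.1, helper name invented):

#include <smmintrin.h>   // SSE4.1: _mm_blendv_epi8

// Per-byte select driven purely by the top bit of each mask byte; any mask
// value with the same sign bits (e.g. 0x80 vs 0xFF) selects identically.
static inline __m128i SelectBySignBit(__m128i IfClear, __m128i IfSet, __m128i Mask) {
  return _mm_blendv_epi8(IfClear, IfSet, Mask);
}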
- - SDValue Value = Mst->getValue(); - if (Value.getOpcode() == ISD::TRUNCATE && Value.getNode()->hasOneUse() && - TLI.isTruncStoreLegal(Value.getOperand(0).getValueType(), - Mst->getMemoryVT())) { - return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Value.getOperand(0), - Mst->getBasePtr(), Mask, - Mst->getMemoryVT(), Mst->getMemOperand(), true); - } - - return SDValue(); - } - - // Resolve truncating stores. - unsigned NumElems = VT.getVectorNumElements(); - - assert(StVT != VT && "Cannot truncate to the same type"); - unsigned FromSz = VT.getScalarSizeInBits(); - unsigned ToSz = StVT.getScalarSizeInBits(); - - // The truncating store is legal in some cases. For example - // vpmovqb, vpmovqw, vpmovqd, vpmovdb, vpmovdw - // are designated for truncate store. - // In this case we don't need any further transformations. - if (TLI.isTruncStoreLegal(VT, StVT)) + if (Mst->isTruncatingStore()) return SDValue(); - // From/To sizes and ElemCount must be pow of two. - assert (isPowerOf2_32(NumElems * FromSz * ToSz) && - "Unexpected size for truncating masked store"); - // We are going to use the original vector elt for storing. - // Accumulated smaller vector elements must be a multiple of the store size. - assert (((NumElems * FromSz) % ToSz) == 0 && - "Unexpected ratio for truncating masked store"); - - unsigned SizeRatio = FromSz / ToSz; - assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits()); - - // Create a type on which we perform the shuffle. - EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), - StVT.getScalarType(), NumElems*SizeRatio); - - assert(WideVecVT.getSizeInBits() == VT.getSizeInBits()); - - SDValue WideVec = DAG.getBitcast(WideVecVT, Mst->getValue()); - SmallVector ShuffleVec(NumElems * SizeRatio, -1); - for (unsigned i = 0; i != NumElems; ++i) - ShuffleVec[i] = i * SizeRatio; + if (SDValue ScalarStore = reduceMaskedStoreToScalarStore(Mst, DAG)) + return ScalarStore; - // Can't shuffle using an illegal type. - assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) && - "WideVecVT should be legal"); - - SDValue TruncatedVal = DAG.getVectorShuffle(WideVecVT, dl, WideVec, - DAG.getUNDEF(WideVecVT), - ShuffleVec); - - SDValue NewMask; + // If the mask value has been legalized to a non-boolean vector, try to + // simplify ops leading up to it. We only demand the MSB of each lane. SDValue Mask = Mst->getMask(); - if (Mask.getValueType() == VT) { - // Mask and original value have the same type. 
- NewMask = DAG.getBitcast(WideVecVT, Mask); - for (unsigned i = 0; i != NumElems; ++i) - ShuffleVec[i] = i * SizeRatio; - for (unsigned i = NumElems; i != NumElems*SizeRatio; ++i) - ShuffleVec[i] = NumElems*SizeRatio; - NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask, - DAG.getConstant(0, dl, WideVecVT), - ShuffleVec); - } else { - assert(Mask.getValueType().getVectorElementType() == MVT::i1); - unsigned WidenNumElts = NumElems*SizeRatio; - unsigned MaskNumElts = VT.getVectorNumElements(); - EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, - WidenNumElts); + if (Mask.getScalarValueSizeInBits() != 1) { + APInt DemandedMask(APInt::getSignMask(VT.getScalarSizeInBits())); + if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI)) + return SDValue(N, 0); + } - unsigned NumConcat = WidenNumElts / MaskNumElts; - SDValue ZeroVal = DAG.getConstant(0, dl, Mask.getValueType()); - SmallVector Ops(NumConcat, ZeroVal); - Ops[0] = Mask; - NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops); + SDValue Value = Mst->getValue(); + if (Value.getOpcode() == ISD::TRUNCATE && Value.getNode()->hasOneUse() && + TLI.isTruncStoreLegal(Value.getOperand(0).getValueType(), + Mst->getMemoryVT())) { + return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Value.getOperand(0), + Mst->getBasePtr(), Mask, + Mst->getMemoryVT(), Mst->getMemOperand(), true); } - return DAG.getMaskedStore(Mst->getChain(), dl, TruncatedVal, - Mst->getBasePtr(), NewMask, StVT, - Mst->getMemOperand(), false); + return SDValue(); } static SDValue combineStore(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { StoreSDNode *St = cast(N); - EVT VT = St->getValue().getValueType(); EVT StVT = St->getMemoryVT(); SDLoc dl(St); unsigned Alignment = St->getAlignment(); - SDValue StoredVal = St->getOperand(1); + SDValue StoredVal = St->getValue(); + EVT VT = StoredVal.getValueType(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); // Convert a store of vXi1 into a store of iX and a bitcast. @@ -39986,8 +40488,8 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG, St->getMemOperand()->getFlags()); } - // If we are saving a concatenation of two XMM registers and 32-byte stores - // are slow, such as on Sandy Bridge, perform two 16-byte stores. + // If we are saving a 32-byte vector and 32-byte stores are slow, such as on + // Sandy Bridge, perform two 16-byte stores. bool Fast; if (VT.is256BitVector() && StVT == VT && TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT, @@ -40026,13 +40528,24 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG, if (!St->isTruncatingStore() && VT == MVT::v16i8 && !Subtarget.hasBWI() && St->getValue().getOpcode() == ISD::TRUNCATE && St->getValue().getOperand(0).getValueType() == MVT::v16i16 && - TLI.isTruncStoreLegalOrCustom(MVT::v16i32, MVT::v16i8) && - !DCI.isBeforeLegalizeOps()) { + TLI.isTruncStoreLegal(MVT::v16i32, MVT::v16i8) && + St->getValue().hasOneUse() && !DCI.isBeforeLegalizeOps()) { SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::v16i32, St->getValue()); return DAG.getTruncStore(St->getChain(), dl, Ext, St->getBasePtr(), MVT::v16i8, St->getMemOperand()); } + // Try to fold a VTRUNCUS or VTRUNCS into a truncating store. 
+ if (!St->isTruncatingStore() && StoredVal.hasOneUse() && + (StoredVal.getOpcode() == X86ISD::VTRUNCUS || + StoredVal.getOpcode() == X86ISD::VTRUNCS) && + TLI.isTruncStoreLegal(StoredVal.getOperand(0).getValueType(), VT)) { + bool IsSigned = StoredVal.getOpcode() == X86ISD::VTRUNCS; + return EmitTruncSStore(IsSigned, St->getChain(), + dl, StoredVal.getOperand(0), St->getBasePtr(), + VT, St->getMemOperand(), DAG); + } + // Optimize trunc store (of multiple scalars) to shuffle and store. // First, pack all of the elements in one place. Next, store to memory // in fewer chunks. @@ -40040,100 +40553,26 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG, // Check if we can detect an AVG pattern from the truncation. If yes, // replace the trunc store by a normal store with the result of X86ISD::AVG // instruction. - if (SDValue Avg = detectAVGPattern(St->getValue(), St->getMemoryVT(), DAG, - Subtarget, dl)) - return DAG.getStore(St->getChain(), dl, Avg, St->getBasePtr(), - St->getPointerInfo(), St->getAlignment(), - St->getMemOperand()->getFlags()); - - if (SDValue Val = - detectAVX512SSatPattern(St->getValue(), St->getMemoryVT(), Subtarget, - TLI)) - return EmitTruncSStore(true /* Signed saturation */, St->getChain(), - dl, Val, St->getBasePtr(), - St->getMemoryVT(), St->getMemOperand(), DAG); - if (SDValue Val = detectAVX512USatPattern(St->getValue(), St->getMemoryVT(), - DAG, dl, Subtarget, TLI)) - return EmitTruncSStore(false /* Unsigned saturation */, St->getChain(), - dl, Val, St->getBasePtr(), - St->getMemoryVT(), St->getMemOperand(), DAG); - - unsigned NumElems = VT.getVectorNumElements(); - assert(StVT != VT && "Cannot truncate to the same type"); - unsigned FromSz = VT.getScalarSizeInBits(); - unsigned ToSz = StVT.getScalarSizeInBits(); - - // The truncating store is legal in some cases. For example - // vpmovqb, vpmovqw, vpmovqd, vpmovdb, vpmovdw - // are designated for truncate store. - // In this case we don't need any further transformations. - if (TLI.isTruncStoreLegalOrCustom(VT, StVT)) - return SDValue(); + if (DCI.isBeforeLegalize() || TLI.isTypeLegal(St->getMemoryVT())) + if (SDValue Avg = detectAVGPattern(St->getValue(), St->getMemoryVT(), DAG, + Subtarget, dl)) + return DAG.getStore(St->getChain(), dl, Avg, St->getBasePtr(), + St->getPointerInfo(), St->getAlignment(), + St->getMemOperand()->getFlags()); - // From, To sizes and ElemCount must be pow of two - if (!isPowerOf2_32(NumElems * FromSz * ToSz)) return SDValue(); - // We are going to use the original vector elt for storing. - // Accumulated smaller vector elements must be a multiple of the store size. - if (0 != (NumElems * FromSz) % ToSz) return SDValue(); - - unsigned SizeRatio = FromSz / ToSz; - - assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits()); - - // Create a type on which we perform the shuffle - EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), - StVT.getScalarType(), NumElems*SizeRatio); - - assert(WideVecVT.getSizeInBits() == VT.getSizeInBits()); - - SDValue WideVec = DAG.getBitcast(WideVecVT, St->getValue()); - SmallVector ShuffleVec(NumElems * SizeRatio, -1); - for (unsigned i = 0; i != NumElems; ++i) - ShuffleVec[i] = i * SizeRatio; - - // Can't shuffle using an illegal type. - if (!TLI.isTypeLegal(WideVecVT)) - return SDValue(); - - SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, WideVec, - DAG.getUNDEF(WideVecVT), - ShuffleVec); - // At this point all of the data is stored at the bottom of the - // register. We now need to save it to mem. 
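detectAVGPattern, used just above for truncating stores, matches a rounding average computed in a wider type. The scalar form it replaces with X86ISD::AVG (PAVGB) is simply the following sketch:

#include <cstdint>

// trunc((zext(a) + zext(b) + 1) >> 1)  ->  pavgb a, b
static inline uint8_t RoundingAvg(uint8_t A, uint8_t B) {
  return (uint8_t)(((unsigned)A + (unsigned)B + 1) >> 1);
}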
- - // Find the largest store unit - MVT StoreType = MVT::i8; - for (MVT Tp : MVT::integer_valuetypes()) { - if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToSz) - StoreType = Tp; + if (TLI.isTruncStoreLegal(VT, StVT)) { + if (SDValue Val = detectSSatPattern(St->getValue(), St->getMemoryVT())) + return EmitTruncSStore(true /* Signed saturation */, St->getChain(), + dl, Val, St->getBasePtr(), + St->getMemoryVT(), St->getMemOperand(), DAG); + if (SDValue Val = detectUSatPattern(St->getValue(), St->getMemoryVT(), + DAG, dl)) + return EmitTruncSStore(false /* Unsigned saturation */, St->getChain(), + dl, Val, St->getBasePtr(), + St->getMemoryVT(), St->getMemOperand(), DAG); } - // On 32bit systems, we can't save 64bit integers. Try bitcasting to F64. - if (TLI.isTypeLegal(MVT::f64) && StoreType.getSizeInBits() < 64 && - (64 <= NumElems * ToSz)) - StoreType = MVT::f64; - - // Bitcast the original vector into a vector of store-size units - EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(), - StoreType, VT.getSizeInBits()/StoreType.getSizeInBits()); - assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits()); - SDValue ShuffWide = DAG.getBitcast(StoreVecVT, Shuff); - SmallVector Chains; - SDValue Ptr = St->getBasePtr(); - - // Perform one or more big stores into memory. - for (unsigned i=0, e=(ToSz*NumElems)/StoreType.getSizeInBits(); i!=e; ++i) { - SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, - StoreType, ShuffWide, - DAG.getIntPtrConstant(i, dl)); - SDValue Ch = - DAG.getStore(St->getChain(), dl, SubVec, Ptr, St->getPointerInfo(), - St->getAlignment(), St->getMemOperand()->getFlags()); - Ptr = DAG.getMemBasePlusOffset(Ptr, StoreType.getStoreSize(), dl); - Chains.push_back(Ch); - } - - return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains); + return SDValue(); } // Turn load->store of MMX types into GPR load/stores. This avoids clobbering @@ -40149,11 +40588,10 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG, bool NoImplicitFloatOps = F.hasFnAttribute(Attribute::NoImplicitFloat); bool F64IsLegal = !Subtarget.useSoftFloat() && !NoImplicitFloatOps && Subtarget.hasSSE2(); - if (((VT.isVector() && !VT.isFloatingPoint()) || - (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit())) && + if ((VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit()) && isa(St->getValue()) && - !cast(St->getValue())->isVolatile() && - St->getChain().hasOneUse() && !St->isVolatile()) { + cast(St->getValue())->isSimple() && + St->getChain().hasOneUse() && St->isSimple()) { LoadSDNode *Ld = cast(St->getValue().getNode()); SmallVector Ops; @@ -40595,8 +41033,8 @@ static SDValue combineVectorTruncation(SDNode *N, SelectionDAG &DAG, static SDValue combineVectorSignBitsTruncation(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget) { - // Requires SSE2 but AVX512 has fast truncate. - if (!Subtarget.hasSSE2() || Subtarget.hasAVX512()) + // Requires SSE2. + if (!Subtarget.hasSSE2()) return SDValue(); if (!N->getValueType(0).isVector() || !N->getValueType(0).isSimple()) @@ -40620,6 +41058,13 @@ static SDValue combineVectorSignBitsTruncation(SDNode *N, const SDLoc &DL, if (InSVT != MVT::i16 && InSVT != MVT::i32 && InSVT != MVT::i64) return SDValue(); + // AVX512 has fast truncate, but if the input is already going to be split, + // there's no harm in trying pack. 
+ if (Subtarget.hasAVX512() && + !(!Subtarget.useAVX512Regs() && VT.is256BitVector() && + InVT.is512BitVector())) + return SDValue(); + unsigned NumPackedSignBits = std::min(SVT.getSizeInBits(), 16); unsigned NumPackedZeroBits = Subtarget.hasSSE41() ? NumPackedSignBits : 8; @@ -40658,9 +41103,7 @@ static SDValue combinePMULH(SDValue Src, EVT VT, const SDLoc &DL, // Only handle vXi16 types that are at least 128-bits unless they will be // widened. - if (!VT.isVector() || VT.getVectorElementType() != MVT::i16 || - (!ExperimentalVectorWideningLegalization && - VT.getVectorNumElements() < 8)) + if (!VT.isVector() || VT.getVectorElementType() != MVT::i16) return SDValue(); // Input type should be vXi32. @@ -40874,6 +41317,19 @@ static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG, return combineVectorTruncation(N, DAG, Subtarget); } +static SDValue combineVTRUNC(SDNode *N, SelectionDAG &DAG) { + EVT VT = N->getValueType(0); + SDValue In = N->getOperand(0); + SDLoc DL(N); + + if (auto SSatVal = detectSSatPattern(In, VT)) + return DAG.getNode(X86ISD::VTRUNCS, DL, VT, SSatVal); + if (auto USatVal = detectUSatPattern(In, VT, DAG, DL)) + return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, USatVal); + + return SDValue(); +} + /// Returns the negated value if the node \p N flips sign of FP value. /// /// FP-negation node may have different forms: FNEG(x), FXOR (x, 0x80000000) @@ -40883,10 +41339,14 @@ static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG, /// In this case we go though all bitcasts. /// This also recognizes splat of a negated value and returns the splat of that /// value. -static SDValue isFNEG(SelectionDAG &DAG, SDNode *N) { +static SDValue isFNEG(SelectionDAG &DAG, SDNode *N, unsigned Depth = 0) { if (N->getOpcode() == ISD::FNEG) return N->getOperand(0); + // Don't recurse exponentially. + if (Depth > SelectionDAG::MaxRecursionDepth) + return SDValue(); + unsigned ScalarSize = N->getValueType(0).getScalarSizeInBits(); SDValue Op = peekThroughBitcasts(SDValue(N, 0)); @@ -40900,7 +41360,7 @@ static SDValue isFNEG(SelectionDAG &DAG, SDNode *N) { // of this is VECTOR_SHUFFLE(-VEC1, UNDEF). The mask can be anything here. if (!SVOp->getOperand(1).isUndef()) return SDValue(); - if (SDValue NegOp0 = isFNEG(DAG, SVOp->getOperand(0).getNode())) + if (SDValue NegOp0 = isFNEG(DAG, SVOp->getOperand(0).getNode(), Depth + 1)) if (NegOp0.getValueType() == VT) // FIXME: Can we do better? 
return DAG.getVectorShuffle(VT, SDLoc(SVOp), NegOp0, DAG.getUNDEF(VT), SVOp->getMask()); @@ -40914,7 +41374,7 @@ static SDValue isFNEG(SelectionDAG &DAG, SDNode *N) { SDValue InsVal = Op.getOperand(1); if (!InsVector.isUndef()) return SDValue(); - if (SDValue NegInsVal = isFNEG(DAG, InsVal.getNode())) + if (SDValue NegInsVal = isFNEG(DAG, InsVal.getNode(), Depth + 1)) if (NegInsVal.getValueType() == VT.getVectorElementType()) // FIXME return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(Op), VT, InsVector, NegInsVal, Op.getOperand(2)); @@ -40951,6 +41411,57 @@ static SDValue isFNEG(SelectionDAG &DAG, SDNode *N) { return SDValue(); } +static unsigned negateFMAOpcode(unsigned Opcode, bool NegMul, bool NegAcc, + bool NegRes) { + if (NegMul) { + switch (Opcode) { + default: llvm_unreachable("Unexpected opcode"); + case ISD::FMA: Opcode = X86ISD::FNMADD; break; + case X86ISD::FMADD_RND: Opcode = X86ISD::FNMADD_RND; break; + case X86ISD::FMSUB: Opcode = X86ISD::FNMSUB; break; + case X86ISD::FMSUB_RND: Opcode = X86ISD::FNMSUB_RND; break; + case X86ISD::FNMADD: Opcode = ISD::FMA; break; + case X86ISD::FNMADD_RND: Opcode = X86ISD::FMADD_RND; break; + case X86ISD::FNMSUB: Opcode = X86ISD::FMSUB; break; + case X86ISD::FNMSUB_RND: Opcode = X86ISD::FMSUB_RND; break; + } + } + + if (NegAcc) { + switch (Opcode) { + default: llvm_unreachable("Unexpected opcode"); + case ISD::FMA: Opcode = X86ISD::FMSUB; break; + case X86ISD::FMADD_RND: Opcode = X86ISD::FMSUB_RND; break; + case X86ISD::FMSUB: Opcode = ISD::FMA; break; + case X86ISD::FMSUB_RND: Opcode = X86ISD::FMADD_RND; break; + case X86ISD::FNMADD: Opcode = X86ISD::FNMSUB; break; + case X86ISD::FNMADD_RND: Opcode = X86ISD::FNMSUB_RND; break; + case X86ISD::FNMSUB: Opcode = X86ISD::FNMADD; break; + case X86ISD::FNMSUB_RND: Opcode = X86ISD::FNMADD_RND; break; + case X86ISD::FMADDSUB: Opcode = X86ISD::FMSUBADD; break; + case X86ISD::FMADDSUB_RND: Opcode = X86ISD::FMSUBADD_RND; break; + case X86ISD::FMSUBADD: Opcode = X86ISD::FMADDSUB; break; + case X86ISD::FMSUBADD_RND: Opcode = X86ISD::FMADDSUB_RND; break; + } + } + + if (NegRes) { + switch (Opcode) { + default: llvm_unreachable("Unexpected opcode"); + case ISD::FMA: Opcode = X86ISD::FNMSUB; break; + case X86ISD::FMADD_RND: Opcode = X86ISD::FNMSUB_RND; break; + case X86ISD::FMSUB: Opcode = X86ISD::FNMADD; break; + case X86ISD::FMSUB_RND: Opcode = X86ISD::FNMADD_RND; break; + case X86ISD::FNMADD: Opcode = X86ISD::FMSUB; break; + case X86ISD::FNMADD_RND: Opcode = X86ISD::FMSUB_RND; break; + case X86ISD::FNMSUB: Opcode = ISD::FMA; break; + case X86ISD::FNMSUB_RND: Opcode = X86ISD::FMADD_RND; break; + } + } + + return Opcode; +} + /// Do target-specific dag combines on floating point negations. static SDValue combineFneg(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { @@ -40980,29 +41491,123 @@ static SDValue combineFneg(SDNode *N, SelectionDAG &DAG, // If we're negating an FMA node, then we can adjust the // instruction to include the extra negation. 
- unsigned NewOpcode = 0; if (Arg.hasOneUse() && Subtarget.hasAnyFMA()) { switch (Arg.getOpcode()) { - case ISD::FMA: NewOpcode = X86ISD::FNMSUB; break; - case X86ISD::FMSUB: NewOpcode = X86ISD::FNMADD; break; - case X86ISD::FNMADD: NewOpcode = X86ISD::FMSUB; break; - case X86ISD::FNMSUB: NewOpcode = ISD::FMA; break; - case X86ISD::FMADD_RND: NewOpcode = X86ISD::FNMSUB_RND; break; - case X86ISD::FMSUB_RND: NewOpcode = X86ISD::FNMADD_RND; break; - case X86ISD::FNMADD_RND: NewOpcode = X86ISD::FMSUB_RND; break; - case X86ISD::FNMSUB_RND: NewOpcode = X86ISD::FMADD_RND; break; - // We can't handle scalar intrinsic node here because it would only - // invert one element and not the whole vector. But we could try to handle - // a negation of the lower element only. - } - } - if (NewOpcode) - return DAG.getBitcast(OrigVT, DAG.getNode(NewOpcode, DL, VT, - Arg.getNode()->ops())); + case ISD::FMA: + case X86ISD::FMSUB: + case X86ISD::FNMADD: + case X86ISD::FNMSUB: + case X86ISD::FMADD_RND: + case X86ISD::FMSUB_RND: + case X86ISD::FNMADD_RND: + case X86ISD::FNMSUB_RND: { + // We can't handle scalar intrinsic node here because it would only + // invert one element and not the whole vector. But we could try to handle + // a negation of the lower element only. + unsigned NewOpcode = negateFMAOpcode(Arg.getOpcode(), false, false, true); + return DAG.getBitcast(OrigVT, DAG.getNode(NewOpcode, DL, VT, Arg->ops())); + } + } + } return SDValue(); } +char X86TargetLowering::isNegatibleForFree(SDValue Op, SelectionDAG &DAG, + bool LegalOperations, + bool ForCodeSize, + unsigned Depth) const { + // fneg patterns are removable even if they have multiple uses. + if (isFNEG(DAG, Op.getNode(), Depth)) + return 2; + + // Don't recurse exponentially. + if (Depth > SelectionDAG::MaxRecursionDepth) + return 0; + + EVT VT = Op.getValueType(); + EVT SVT = VT.getScalarType(); + switch (Op.getOpcode()) { + case ISD::FMA: + case X86ISD::FMSUB: + case X86ISD::FNMADD: + case X86ISD::FNMSUB: + case X86ISD::FMADD_RND: + case X86ISD::FMSUB_RND: + case X86ISD::FNMADD_RND: + case X86ISD::FNMSUB_RND: { + if (!Op.hasOneUse() || !Subtarget.hasAnyFMA() || !isTypeLegal(VT) || + !(SVT == MVT::f32 || SVT == MVT::f64) || !LegalOperations) + break; + + // This is always negatible for free but we might be able to remove some + // extra operand negations as well. + for (int i = 0; i != 3; ++i) { + char V = isNegatibleForFree(Op.getOperand(i), DAG, LegalOperations, + ForCodeSize, Depth + 1); + if (V == 2) + return V; + } + return 1; + } + } + + return TargetLowering::isNegatibleForFree(Op, DAG, LegalOperations, + ForCodeSize, Depth); +} + +SDValue X86TargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG, + bool LegalOperations, + bool ForCodeSize, + unsigned Depth) const { + // fneg patterns are removable even if they have multiple uses. + if (SDValue Arg = isFNEG(DAG, Op.getNode(), Depth)) + return DAG.getBitcast(Op.getValueType(), Arg); + + EVT VT = Op.getValueType(); + EVT SVT = VT.getScalarType(); + unsigned Opc = Op.getOpcode(); + switch (Opc) { + case ISD::FMA: + case X86ISD::FMSUB: + case X86ISD::FNMADD: + case X86ISD::FNMSUB: + case X86ISD::FMADD_RND: + case X86ISD::FMSUB_RND: + case X86ISD::FNMADD_RND: + case X86ISD::FNMSUB_RND: { + if (!Op.hasOneUse() || !Subtarget.hasAnyFMA() || !isTypeLegal(VT) || + !(SVT == MVT::f32 || SVT == MVT::f64) || !LegalOperations) + break; + + // This is always negatible for free but we might be able to remove some + // extra operand negations as well. 
+ SmallVector NewOps(Op.getNumOperands(), SDValue()); + for (int i = 0; i != 3; ++i) { + char V = isNegatibleForFree(Op.getOperand(i), DAG, LegalOperations, + ForCodeSize, Depth + 1); + if (V == 2) + NewOps[i] = getNegatedExpression(Op.getOperand(i), DAG, LegalOperations, + ForCodeSize, Depth + 1); + } + + bool NegA = !!NewOps[0]; + bool NegB = !!NewOps[1]; + bool NegC = !!NewOps[2]; + unsigned NewOpc = negateFMAOpcode(Opc, NegA != NegB, NegC, true); + + // Fill in the non-negated ops with the original values. + for (int i = 0, e = Op.getNumOperands(); i != e; ++i) + if (!NewOps[i]) + NewOps[i] = Op.getOperand(i); + return DAG.getNode(NewOpc, SDLoc(Op), VT, NewOps); + } + } + + return TargetLowering::getNegatedExpression(Op, DAG, LegalOperations, + ForCodeSize, Depth); +} + static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { MVT VT = N->getSimpleValueType(0); @@ -41312,8 +41917,8 @@ static SDValue combineX86INT_TO_FP(SDNode *N, SelectionDAG &DAG, ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) { assert(InVT.is128BitVector() && "Expected 128-bit input vector"); LoadSDNode *LN = cast(N->getOperand(0)); - // Unless the load is volatile. - if (!LN->isVolatile()) { + // Unless the load is volatile or atomic. + if (LN->isSimple()) { SDLoc dl(N); unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements(); MVT MemVT = MVT::getIntegerVT(NumBits); @@ -41347,8 +41952,8 @@ static SDValue combineCVTP2I_CVTTP2I(SDNode *N, SelectionDAG &DAG, ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) { assert(InVT.is128BitVector() && "Expected 128-bit input vector"); LoadSDNode *LN = cast(N->getOperand(0)); - // Unless the load is volatile. - if (!LN->isVolatile()) { + // Unless the load is volatile or atomic. + if (LN->isSimple()) { SDLoc dl(N); unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements(); MVT MemVT = MVT::getFloatingPointVT(NumBits); @@ -41724,127 +42329,6 @@ combineToExtendBoolVectorInReg(SDNode *N, SelectionDAG &DAG, DAG.getConstant(EltSizeInBits - 1, DL, VT)); } -/// Convert a SEXT or ZEXT of a vector to a SIGN_EXTEND_VECTOR_INREG or -/// ZERO_EXTEND_VECTOR_INREG, this requires the splitting (or concatenating -/// with UNDEFs) of the input to vectors of the same size as the target type -/// which then extends the lowest elements. -static SDValue combineToExtendVectorInReg(SDNode *N, SelectionDAG &DAG, - TargetLowering::DAGCombinerInfo &DCI, - const X86Subtarget &Subtarget) { - if (ExperimentalVectorWideningLegalization) - return SDValue(); - - unsigned Opcode = N->getOpcode(); - // TODO - add ANY_EXTEND support. - if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND) - return SDValue(); - if (!DCI.isBeforeLegalizeOps()) - return SDValue(); - if (!Subtarget.hasSSE2()) - return SDValue(); - - SDValue N0 = N->getOperand(0); - EVT VT = N->getValueType(0); - EVT SVT = VT.getScalarType(); - EVT InVT = N0.getValueType(); - EVT InSVT = InVT.getScalarType(); - - // FIXME: Generic DAGCombiner previously had a bug that would cause a - // sign_extend of setcc to sometimes return the original node and tricked it - // into thinking CombineTo was used which prevented the target combines from - // running. 
- // Earlying out here to avoid regressions like this - // (v4i32 (sext (v4i1 (setcc (v4i16))))) - // Becomes - // (v4i32 (sext_invec (v8i16 (concat (v4i16 (setcc (v4i16))), undef)))) - // Type legalized to - // (v4i32 (sext_invec (v8i16 (trunc_invec (v4i32 (setcc (v4i32))))))) - // Leading to a packssdw+pmovsxwd - // We could write a DAG combine to fix this, but really we shouldn't be - // creating sext_invec that's forcing v8i16 into the DAG. - if (N0.getOpcode() == ISD::SETCC) - return SDValue(); - - // Input type must be a vector and we must be extending legal integer types. - if (!VT.isVector() || VT.getVectorNumElements() < 2) - return SDValue(); - if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16) - return SDValue(); - if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8) - return SDValue(); - - // If the input/output types are both legal then we have at least AVX1 and - // we will be able to use SIGN_EXTEND/ZERO_EXTEND directly. - if (DAG.getTargetLoweringInfo().isTypeLegal(VT) && - DAG.getTargetLoweringInfo().isTypeLegal(InVT)) - return SDValue(); - - SDLoc DL(N); - - auto ExtendVecSize = [&DAG](const SDLoc &DL, SDValue N, unsigned Size) { - EVT SrcVT = N.getValueType(); - EVT DstVT = EVT::getVectorVT(*DAG.getContext(), SrcVT.getScalarType(), - Size / SrcVT.getScalarSizeInBits()); - SmallVector Opnds(Size / SrcVT.getSizeInBits(), - DAG.getUNDEF(SrcVT)); - Opnds[0] = N; - return DAG.getNode(ISD::CONCAT_VECTORS, DL, DstVT, Opnds); - }; - - // If target-size is less than 128-bits, extend to a type that would extend - // to 128 bits, extend that and extract the original target vector. - if (VT.getSizeInBits() < 128 && !(128 % VT.getSizeInBits())) { - unsigned Scale = 128 / VT.getSizeInBits(); - EVT ExVT = - EVT::getVectorVT(*DAG.getContext(), SVT, 128 / SVT.getSizeInBits()); - SDValue Ex = ExtendVecSize(DL, N0, Scale * InVT.getSizeInBits()); - SDValue SExt = DAG.getNode(Opcode, DL, ExVT, Ex); - return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SExt, - DAG.getIntPtrConstant(0, DL)); - } - - // If target-size is 128-bits (or 256-bits on AVX target), then convert to - // ISD::*_EXTEND_VECTOR_INREG which ensures lowering to X86ISD::V*EXT. - // Also use this if we don't have SSE41 to allow the legalizer do its job. - if (!Subtarget.hasSSE41() || VT.is128BitVector() || - (VT.is256BitVector() && Subtarget.hasAVX()) || - (VT.is512BitVector() && Subtarget.useAVX512Regs())) { - SDValue ExOp = ExtendVecSize(DL, N0, VT.getSizeInBits()); - Opcode = getOpcode_EXTEND_VECTOR_INREG(Opcode); - return DAG.getNode(Opcode, DL, VT, ExOp); - } - - auto SplitAndExtendInReg = [&](unsigned SplitSize) { - unsigned NumVecs = VT.getSizeInBits() / SplitSize; - unsigned NumSubElts = SplitSize / SVT.getSizeInBits(); - EVT SubVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumSubElts); - EVT InSubVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumSubElts); - - unsigned IROpc = getOpcode_EXTEND_VECTOR_INREG(Opcode); - SmallVector Opnds; - for (unsigned i = 0, Offset = 0; i != NumVecs; ++i, Offset += NumSubElts) { - SDValue SrcVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InSubVT, N0, - DAG.getIntPtrConstant(Offset, DL)); - SrcVec = ExtendVecSize(DL, SrcVec, SplitSize); - SrcVec = DAG.getNode(IROpc, DL, SubVT, SrcVec); - Opnds.push_back(SrcVec); - } - return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Opnds); - }; - - // On pre-AVX targets, split into 128-bit nodes of - // ISD::*_EXTEND_VECTOR_INREG. 
- if (!Subtarget.hasAVX() && !(VT.getSizeInBits() % 128)) - return SplitAndExtendInReg(128); - - // On pre-AVX512 targets, split into 256-bit nodes of - // ISD::*_EXTEND_VECTOR_INREG. - if (!Subtarget.useAVX512Regs() && !(VT.getSizeInBits() % 256)) - return SplitAndExtendInReg(256); - - return SDValue(); -} - // Attempt to combine a (sext/zext (setcc)) to a setcc with a xmm/ymm/zmm // result type. static SDValue combineExtSetcc(SDNode *N, SelectionDAG &DAG, @@ -41915,9 +42399,6 @@ static SDValue combineSext(SDNode *N, SelectionDAG &DAG, return DAG.getNode(ISD::SUB, DL, VT, Zext, DAG.getConstant(1, DL, VT)); } - if (SDValue V = combineToExtendVectorInReg(N, DAG, DCI, Subtarget)) - return V; - if (SDValue V = combineToExtendBoolVectorInReg(N, DAG, DCI, Subtarget)) return V; @@ -41931,45 +42412,15 @@ static SDValue combineSext(SDNode *N, SelectionDAG &DAG, return SDValue(); } -static unsigned negateFMAOpcode(unsigned Opcode, bool NegMul, bool NegAcc) { - if (NegMul) { - switch (Opcode) { - default: llvm_unreachable("Unexpected opcode"); - case ISD::FMA: Opcode = X86ISD::FNMADD; break; - case X86ISD::FMADD_RND: Opcode = X86ISD::FNMADD_RND; break; - case X86ISD::FMSUB: Opcode = X86ISD::FNMSUB; break; - case X86ISD::FMSUB_RND: Opcode = X86ISD::FNMSUB_RND; break; - case X86ISD::FNMADD: Opcode = ISD::FMA; break; - case X86ISD::FNMADD_RND: Opcode = X86ISD::FMADD_RND; break; - case X86ISD::FNMSUB: Opcode = X86ISD::FMSUB; break; - case X86ISD::FNMSUB_RND: Opcode = X86ISD::FMSUB_RND; break; - } - } - - if (NegAcc) { - switch (Opcode) { - default: llvm_unreachable("Unexpected opcode"); - case ISD::FMA: Opcode = X86ISD::FMSUB; break; - case X86ISD::FMADD_RND: Opcode = X86ISD::FMSUB_RND; break; - case X86ISD::FMSUB: Opcode = ISD::FMA; break; - case X86ISD::FMSUB_RND: Opcode = X86ISD::FMADD_RND; break; - case X86ISD::FNMADD: Opcode = X86ISD::FNMSUB; break; - case X86ISD::FNMADD_RND: Opcode = X86ISD::FNMSUB_RND; break; - case X86ISD::FNMSUB: Opcode = X86ISD::FNMADD; break; - case X86ISD::FNMSUB_RND: Opcode = X86ISD::FNMADD_RND; break; - } - } - - return Opcode; -} - static SDValue combineFMA(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { SDLoc dl(N); EVT VT = N->getValueType(0); // Let legalize expand this if it isn't a legal type yet. - if (!DAG.getTargetLoweringInfo().isTypeLegal(VT)) + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + if (!TLI.isTypeLegal(VT)) return SDValue(); EVT ScalarVT = VT.getScalarType(); @@ -41980,17 +42431,21 @@ static SDValue combineFMA(SDNode *N, SelectionDAG &DAG, SDValue B = N->getOperand(1); SDValue C = N->getOperand(2); - auto invertIfNegative = [&DAG](SDValue &V) { - if (SDValue NegVal = isFNEG(DAG, V.getNode())) { - V = DAG.getBitcast(V.getValueType(), NegVal); + auto invertIfNegative = [&DAG, &TLI, &DCI](SDValue &V) { + bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize(); + bool LegalOperations = !DCI.isBeforeLegalizeOps(); + if (TLI.isNegatibleForFree(V, DAG, LegalOperations, CodeSize) == 2) { + V = TLI.getNegatedExpression(V, DAG, LegalOperations, CodeSize); return true; } // Look through extract_vector_elts. If it comes from an FNEG, create a // new extract from the FNEG input. 
if (V.getOpcode() == ISD::EXTRACT_VECTOR_ELT && isNullConstant(V.getOperand(1))) { - if (SDValue NegVal = isFNEG(DAG, V.getOperand(0).getNode())) { - NegVal = DAG.getBitcast(V.getOperand(0).getValueType(), NegVal); + SDValue Vec = V.getOperand(0); + if (TLI.isNegatibleForFree(Vec, DAG, LegalOperations, CodeSize) == 2) { + SDValue NegVal = + TLI.getNegatedExpression(Vec, DAG, LegalOperations, CodeSize); V = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(V), V.getValueType(), NegVal, V.getOperand(1)); return true; @@ -42009,7 +42464,8 @@ static SDValue combineFMA(SDNode *N, SelectionDAG &DAG, if (!NegA && !NegB && !NegC) return SDValue(); - unsigned NewOpcode = negateFMAOpcode(N->getOpcode(), NegA != NegB, NegC); + unsigned NewOpcode = + negateFMAOpcode(N->getOpcode(), NegA != NegB, NegC, false); if (N->getNumOperands() == 4) return DAG.getNode(NewOpcode, dl, VT, A, B, C, N->getOperand(3)); @@ -42017,33 +42473,27 @@ static SDValue combineFMA(SDNode *N, SelectionDAG &DAG, } // Combine FMADDSUB(A, B, FNEG(C)) -> FMSUBADD(A, B, C) +// Combine FMSUBADD(A, B, FNEG(C)) -> FMADDSUB(A, B, C) static SDValue combineFMADDSUB(SDNode *N, SelectionDAG &DAG, - const X86Subtarget &Subtarget) { + TargetLowering::DAGCombinerInfo &DCI) { SDLoc dl(N); EVT VT = N->getValueType(0); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize(); + bool LegalOperations = !DCI.isBeforeLegalizeOps(); - SDValue NegVal = isFNEG(DAG, N->getOperand(2).getNode()); - if (!NegVal) + SDValue N2 = N->getOperand(2); + if (TLI.isNegatibleForFree(N2, DAG, LegalOperations, CodeSize) != 2) return SDValue(); - // FIXME: Should we bitcast instead? - if (NegVal.getValueType() != VT) - return SDValue(); - - unsigned NewOpcode; - switch (N->getOpcode()) { - default: llvm_unreachable("Unexpected opcode!"); - case X86ISD::FMADDSUB: NewOpcode = X86ISD::FMSUBADD; break; - case X86ISD::FMADDSUB_RND: NewOpcode = X86ISD::FMSUBADD_RND; break; - case X86ISD::FMSUBADD: NewOpcode = X86ISD::FMADDSUB; break; - case X86ISD::FMSUBADD_RND: NewOpcode = X86ISD::FMADDSUB_RND; break; - } + SDValue NegN2 = TLI.getNegatedExpression(N2, DAG, LegalOperations, CodeSize); + unsigned NewOpcode = negateFMAOpcode(N->getOpcode(), false, true, false); if (N->getNumOperands() == 4) return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1), - NegVal, N->getOperand(3)); + NegN2, N->getOperand(3)); return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1), - NegVal); + NegN2); } static SDValue combineZext(SDNode *N, SelectionDAG &DAG, @@ -42090,9 +42540,6 @@ static SDValue combineZext(SDNode *N, SelectionDAG &DAG, if (SDValue V = combineExtSetcc(N, DAG, Subtarget)) return V; - if (SDValue V = combineToExtendVectorInReg(N, DAG, DCI, Subtarget)) - return V; - if (SDValue V = combineToExtendBoolVectorInReg(N, DAG, DCI, Subtarget)) return V; @@ -42111,12 +42558,11 @@ static SDValue combineZext(SDNode *N, SelectionDAG &DAG, VT.getScalarSizeInBits() == N0.getOperand(0).getScalarValueSizeInBits()) { SDValue N00 = N0.getOperand(0); SDValue N01 = N0.getOperand(1); - unsigned NumSrcElts = N00.getValueType().getVectorNumElements(); unsigned NumSrcEltBits = N00.getScalarValueSizeInBits(); APInt ZeroMask = APInt::getHighBitsSet(NumSrcEltBits, NumSrcEltBits / 2); if ((N00.isUndef() || DAG.MaskedValueIsZero(N00, ZeroMask)) && (N01.isUndef() || DAG.MaskedValueIsZero(N01, ZeroMask))) { - return concatSubVectors(N00, N01, VT, NumSrcElts * 2, DAG, dl, 128); + return concatSubVectors(N00, N01, DAG, 
dl); } } @@ -42159,16 +42605,30 @@ static SDValue combineVectorSizedSetCCEquality(SDNode *SetCC, SelectionDAG &DAG, !IsOrXorXorCCZero) return SDValue(); - // TODO: Use PXOR + PTEST for SSE4.1 or later? EVT VT = SetCC->getValueType(0); SDLoc DL(SetCC); + bool HasAVX = Subtarget.hasAVX(); + + // Use XOR (plus OR) and PTEST after SSE4.1 and before AVX512. + // Otherwise use PCMPEQ (plus AND) and mask testing. if ((OpSize == 128 && Subtarget.hasSSE2()) || - (OpSize == 256 && Subtarget.hasAVX2()) || + (OpSize == 256 && HasAVX) || (OpSize == 512 && Subtarget.useAVX512Regs())) { - EVT VecVT = OpSize == 512 ? MVT::v16i32 : - OpSize == 256 ? MVT::v32i8 : - MVT::v16i8; - EVT CmpVT = OpSize == 512 ? MVT::v16i1 : VecVT; + bool HasPT = Subtarget.hasSSE41(); + EVT VecVT = MVT::v16i8; + EVT CmpVT = MVT::v16i8; + if (OpSize == 256) + VecVT = CmpVT = MVT::v32i8; + if (OpSize == 512) { + if (Subtarget.hasBWI()) { + VecVT = MVT::v64i8; + CmpVT = MVT::v64i1; + } else { + VecVT = MVT::v16i32; + CmpVT = MVT::v16i1; + } + } + SDValue Cmp; if (IsOrXorXorCCZero) { // This is a bitwise-combined equality comparison of 2 pairs of vectors: @@ -42179,18 +42639,38 @@ static SDValue combineVectorSizedSetCCEquality(SDNode *SetCC, SelectionDAG &DAG, SDValue B = DAG.getBitcast(VecVT, X.getOperand(0).getOperand(1)); SDValue C = DAG.getBitcast(VecVT, X.getOperand(1).getOperand(0)); SDValue D = DAG.getBitcast(VecVT, X.getOperand(1).getOperand(1)); - SDValue Cmp1 = DAG.getSetCC(DL, CmpVT, A, B, ISD::SETEQ); - SDValue Cmp2 = DAG.getSetCC(DL, CmpVT, C, D, ISD::SETEQ); - Cmp = DAG.getNode(ISD::AND, DL, CmpVT, Cmp1, Cmp2); + if (VecVT == CmpVT && HasPT) { + SDValue Cmp1 = DAG.getNode(ISD::XOR, DL, VecVT, A, B); + SDValue Cmp2 = DAG.getNode(ISD::XOR, DL, VecVT, C, D); + Cmp = DAG.getNode(ISD::OR, DL, VecVT, Cmp1, Cmp2); + } else { + SDValue Cmp1 = DAG.getSetCC(DL, CmpVT, A, B, ISD::SETEQ); + SDValue Cmp2 = DAG.getSetCC(DL, CmpVT, C, D, ISD::SETEQ); + Cmp = DAG.getNode(ISD::AND, DL, CmpVT, Cmp1, Cmp2); + } } else { SDValue VecX = DAG.getBitcast(VecVT, X); SDValue VecY = DAG.getBitcast(VecVT, Y); - Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, ISD::SETEQ); + if (VecVT == CmpVT && HasPT) { + Cmp = DAG.getNode(ISD::XOR, DL, VecVT, VecX, VecY); + } else { + Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, ISD::SETEQ); + } } // For 512-bits we want to emit a setcc that will lower to kortest. - if (OpSize == 512) - return DAG.getSetCC(DL, VT, DAG.getBitcast(MVT::i16, Cmp), - DAG.getConstant(0xFFFF, DL, MVT::i16), CC); + if (VecVT != CmpVT) { + EVT KRegVT = CmpVT == MVT::v64i1 ? MVT::i64 : MVT::i16; + SDValue Mask = DAG.getAllOnesConstant(DL, KRegVT); + return DAG.getSetCC(DL, VT, DAG.getBitcast(KRegVT, Cmp), Mask, CC); + } + if (HasPT) { + SDValue BCCmp = DAG.getBitcast(OpSize == 256 ? MVT::v4i64 : MVT::v2i64, + Cmp); + SDValue PT = DAG.getNode(X86ISD::PTEST, DL, MVT::i32, BCCmp, BCCmp); + X86::CondCode X86CC = CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE; + SDValue SetCC = getSETCC(X86CC, PT, DL, DAG); + return DAG.getNode(ISD::TRUNCATE, DL, VT, SetCC.getValue(0)); + } // If all bytes match (bitmask is 0x(FFFF)FFFF), that's equality. // setcc i128 X, Y, eq --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, eq // setcc i128 X, Y, ne --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, ne @@ -42270,8 +42750,6 @@ static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG, // go through type promotion to a 128-bit vector. 
if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && VT.isVector() && VT.getVectorElementType() == MVT::i1 && - (ExperimentalVectorWideningLegalization || - VT.getVectorNumElements() > 4) && (OpVT.getVectorElementType() == MVT::i8 || OpVT.getVectorElementType() == MVT::i16)) { SDValue Setcc = DAG.getNode(ISD::SETCC, DL, OpVT, LHS, RHS, @@ -42289,7 +42767,8 @@ static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG, } static SDValue combineMOVMSK(SDNode *N, SelectionDAG &DAG, - TargetLowering::DAGCombinerInfo &DCI) { + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget &Subtarget) { SDValue Src = N->getOperand(0); MVT SrcVT = Src.getSimpleValueType(); MVT VT = N->getSimpleValueType(0); @@ -42310,7 +42789,7 @@ static SDValue combineMOVMSK(SDNode *N, SelectionDAG &DAG, // Look through int->fp bitcasts that don't change the element width. unsigned EltWidth = SrcVT.getScalarSizeInBits(); - if (Src.getOpcode() == ISD::BITCAST && + if (Subtarget.hasSSE2() && Src.getOpcode() == ISD::BITCAST && Src.getOperand(0).getScalarValueSizeInBits() == EltWidth) return DAG.getNode(X86ISD::MOVMSK, SDLoc(N), VT, Src.getOperand(0)); @@ -42334,71 +42813,123 @@ static SDValue combineMOVMSK(SDNode *N, SelectionDAG &DAG, return SDValue(); } +static SDValue combineX86GatherScatter(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI) { + // With vector masks we only demand the upper bit of the mask. + SDValue Mask = cast<X86MaskedGatherScatterSDNode>(N)->getMask(); + if (Mask.getScalarValueSizeInBits() != 1) { + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits())); + if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI)) + return SDValue(N, 0); + } + + return SDValue(); + } + static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG, - TargetLowering::DAGCombinerInfo &DCI, - const X86Subtarget &Subtarget) { + TargetLowering::DAGCombinerInfo &DCI) { SDLoc DL(N); + auto *GorS = cast<MaskedGatherScatterSDNode>(N); + SDValue Chain = GorS->getChain(); + SDValue Index = GorS->getIndex(); + SDValue Mask = GorS->getMask(); + SDValue Base = GorS->getBasePtr(); + SDValue Scale = GorS->getScale(); - if (DCI.isBeforeLegalizeOps()) { - SDValue Index = N->getOperand(4); - // Remove any sign extends from 32 or smaller to larger than 32. - // Only do this before LegalizeOps in case we need the sign extend for - // legalization. - if (Index.getOpcode() == ISD::SIGN_EXTEND) { - if (Index.getScalarValueSizeInBits() > 32 && - Index.getOperand(0).getScalarValueSizeInBits() <= 32) { - SmallVector NewOps(N->op_begin(), N->op_end()); - NewOps[4] = Index.getOperand(0); - SDNode *Res = DAG.UpdateNodeOperands(N, NewOps); - if (Res == N) { - // The original sign extend has less users, add back to worklist in - // case it needs to be removed - DCI.AddToWorklist(Index.getNode()); - DCI.AddToWorklist(N); + if (DCI.isBeforeLegalize()) { + unsigned IndexWidth = Index.getScalarValueSizeInBits(); + + // Shrink constant indices if they are larger than 32-bits. + // Only do this before legalize types since v2i64 could become v2i32. + // FIXME: We could check that the type is legal if we're after legalize + // types, but then we would need to construct test cases where that happens. + // FIXME: We could support more than just constant vectors, but we need to + // careful with costing. A truncate that can be optimized out would be fine. + // Otherwise we might only want to create a truncate if it avoids a split.
+ if (auto *BV = dyn_cast<BuildVectorSDNode>(Index)) { + if (BV->isConstant() && IndexWidth > 32 && + DAG.ComputeNumSignBits(Index) > (IndexWidth - 32)) { + unsigned NumElts = Index.getValueType().getVectorNumElements(); + EVT NewVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts); + Index = DAG.getNode(ISD::TRUNCATE, DL, NewVT, Index); + if (auto *Gather = dyn_cast<MaskedGatherSDNode>(GorS)) { + SDValue Ops[] = { Chain, Gather->getPassThru(), + Mask, Base, Index, Scale } ; + return DAG.getMaskedGather(Gather->getVTList(), + Gather->getMemoryVT(), DL, Ops, + Gather->getMemOperand(), + Gather->getIndexType()); } - return SDValue(Res, 0); - } + auto *Scatter = cast<MaskedScatterSDNode>(GorS); + SDValue Ops[] = { Chain, Scatter->getValue(), + Mask, Base, Index, Scale }; + return DAG.getMaskedScatter(Scatter->getVTList(), + Scatter->getMemoryVT(), DL, + Ops, Scatter->getMemOperand(), + Scatter->getIndexType()); + } + } + + // Shrink any sign/zero extends from 32 or smaller to larger than 32 if + // there are sufficient sign bits. Only do this before legalize types to + // avoid creating illegal types in truncate. + if ((Index.getOpcode() == ISD::SIGN_EXTEND || + Index.getOpcode() == ISD::ZERO_EXTEND) && + IndexWidth > 32 && + Index.getOperand(0).getScalarValueSizeInBits() <= 32 && + DAG.ComputeNumSignBits(Index) > (IndexWidth - 32)) { + unsigned NumElts = Index.getValueType().getVectorNumElements(); + EVT NewVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts); + Index = DAG.getNode(ISD::TRUNCATE, DL, NewVT, Index); + if (auto *Gather = dyn_cast<MaskedGatherSDNode>(GorS)) { + SDValue Ops[] = { Chain, Gather->getPassThru(), + Mask, Base, Index, Scale } ; + return DAG.getMaskedGather(Gather->getVTList(), + Gather->getMemoryVT(), DL, Ops, + Gather->getMemOperand(), + Gather->getIndexType()); + } + auto *Scatter = cast<MaskedScatterSDNode>(GorS); + SDValue Ops[] = { Chain, Scatter->getValue(), + Mask, Base, Index, Scale }; + return DAG.getMaskedScatter(Scatter->getVTList(), + Scatter->getMemoryVT(), DL, + Ops, Scatter->getMemOperand(), + Scatter->getIndexType()); } + } + + if (DCI.isBeforeLegalizeOps()) { + unsigned IndexWidth = Index.getScalarValueSizeInBits(); // Make sure the index is either i32 or i64 - unsigned ScalarSize = Index.getScalarValueSizeInBits(); - if (ScalarSize != 32 && ScalarSize != 64) { - MVT EltVT = ScalarSize > 32 ? MVT::i64 : MVT::i32; + if (IndexWidth != 32 && IndexWidth != 64) { + MVT EltVT = IndexWidth > 32 ? MVT::i64 : MVT::i32; EVT IndexVT = EVT::getVectorVT(*DAG.getContext(), EltVT, Index.getValueType().getVectorNumElements()); Index = DAG.getSExtOrTrunc(Index, DL, IndexVT); - SmallVector NewOps(N->op_begin(), N->op_end()); - NewOps[4] = Index; - SDNode *Res = DAG.UpdateNodeOperands(N, NewOps); - if (Res == N) - DCI.AddToWorklist(N); - return SDValue(Res, 0); - } - - // Try to remove zero extends from 32->64 if we know the sign bit of - // the input is zero. - if (Index.getOpcode() == ISD::ZERO_EXTEND && - Index.getScalarValueSizeInBits() == 64 && - Index.getOperand(0).getScalarValueSizeInBits() == 32) { - if (DAG.SignBitIsZero(Index.getOperand(0))) { - SmallVector NewOps(N->op_begin(), N->op_end()); - NewOps[4] = Index.getOperand(0); - SDNode *Res = DAG.UpdateNodeOperands(N, NewOps); - if (Res == N) { - // The original sign extend has less users, add back to worklist in - // case it needs to be removed - DCI.AddToWorklist(Index.getNode()); - DCI.AddToWorklist(N); - } - return SDValue(Res, 0); - } - } - } - - // With AVX2 we only demand the upper bit of the mask.
- if (!Subtarget.hasAVX512()) { + if (auto *Gather = dyn_cast(GorS)) { + SDValue Ops[] = { Chain, Gather->getPassThru(), + Mask, Base, Index, Scale } ; + return DAG.getMaskedGather(Gather->getVTList(), + Gather->getMemoryVT(), DL, Ops, + Gather->getMemOperand(), + Gather->getIndexType()); + } + auto *Scatter = cast(GorS); + SDValue Ops[] = { Chain, Scatter->getValue(), + Mask, Base, Index, Scale }; + return DAG.getMaskedScatter(Scatter->getVTList(), + Scatter->getMemoryVT(), DL, + Ops, Scatter->getMemOperand(), + Scatter->getIndexType()); + } + } + + // With vector masks we only demand the upper bit of the mask. + if (Mask.getScalarValueSizeInBits() != 1) { const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - SDValue Mask = N->getOperand(2); APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits())); if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI)) return SDValue(N, 0); @@ -42432,7 +42963,7 @@ static SDValue combineBrCond(SDNode *N, SelectionDAG &DAG, // Make sure to not keep references to operands, as combineSetCCEFLAGS can // RAUW them under us. if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget)) { - SDValue Cond = DAG.getConstant(CC, DL, MVT::i8); + SDValue Cond = DAG.getTargetConstant(CC, DL, MVT::i8); return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), N->getOperand(0), N->getOperand(1), Cond, Flags); } @@ -42549,6 +43080,7 @@ static SDValue combineUIntToFP(SDNode *N, SelectionDAG &DAG, } static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { // First try to optimize away the conversion entirely when it's // conditionally from a constant. Vectors only. @@ -42578,13 +43110,22 @@ static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG, unsigned BitWidth = InVT.getScalarSizeInBits(); unsigned NumSignBits = DAG.ComputeNumSignBits(Op0); if (NumSignBits >= (BitWidth - 31)) { - EVT TruncVT = EVT::getIntegerVT(*DAG.getContext(), 32); + EVT TruncVT = MVT::i32; if (InVT.isVector()) TruncVT = EVT::getVectorVT(*DAG.getContext(), TruncVT, InVT.getVectorNumElements()); SDLoc dl(N); - SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Op0); - return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Trunc); + if (DCI.isBeforeLegalize() || TruncVT != MVT::v2i32) { + SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Op0); + return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Trunc); + } + // If we're after legalize and the type is v2i32 we need to shuffle and + // use CVTSI2P. 
+ assert(InVT == MVT::v2i64 && "Unexpected VT!"); + SDValue Cast = DAG.getBitcast(MVT::v4i32, Op0); + SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Cast, Cast, + { 0, 2, -1, -1 }); + return DAG.getNode(X86ISD::CVTSI2P, dl, VT, Shuf); } } @@ -42604,7 +43145,7 @@ static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG, if (Subtarget.hasDQI() && VT != MVT::f80) return SDValue(); - if (!Ld->isVolatile() && !VT.isVector() && + if (Ld->isSimple() && !VT.isVector() && ISD::isNON_EXTLoad(Op0.getNode()) && Op0.hasOneUse() && !Subtarget.is64Bit() && LdVT == MVT::i64) { SDValue FILDChain = Subtarget.getTargetLowering()->BuildFILD( @@ -42841,12 +43382,12 @@ static SDValue combineADC(SDNode *N, SelectionDAG &DAG, SDLoc DL(N); EVT VT = N->getValueType(0); SDValue CarryOut = DAG.getConstant(0, DL, N->getValueType(1)); - SDValue Res1 = DAG.getNode(ISD::AND, DL, VT, - DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, - DAG.getConstant(X86::COND_B, DL, - MVT::i8), - N->getOperand(2)), - DAG.getConstant(1, DL, VT)); + SDValue Res1 = + DAG.getNode(ISD::AND, DL, VT, + DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, + DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), + N->getOperand(2)), + DAG.getConstant(1, DL, VT)); return DCI.CombineTo(N, Res1, CarryOut); } @@ -42906,7 +43447,7 @@ static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) { // -1 + SETAE --> -1 + (!CF) --> CF ? -1 : 0 --> SBB %eax, %eax // 0 - SETB --> 0 - (CF) --> CF ? -1 : 0 --> SBB %eax, %eax return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, - DAG.getConstant(X86::COND_B, DL, MVT::i8), + DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), Y.getOperand(1)); } @@ -42924,7 +43465,7 @@ static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) { EFLAGS.getOperand(1), EFLAGS.getOperand(0)); SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo()); return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, - DAG.getConstant(X86::COND_B, DL, MVT::i8), + DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), NewEFLAGS); } } @@ -42984,7 +43525,7 @@ static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) { SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32); SDValue Neg = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Zero, Z); return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, - DAG.getConstant(X86::COND_B, DL, MVT::i8), + DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), SDValue(Neg.getNode(), 1)); } @@ -42997,7 +43538,7 @@ static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) { SDValue One = DAG.getConstant(1, DL, ZVT); SDValue Cmp1 = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Z, One); return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, - DAG.getConstant(X86::COND_B, DL, MVT::i8), Cmp1); + DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), Cmp1); } } @@ -43025,9 +43566,6 @@ static SDValue combineLoopMAddPattern(SDNode *N, SelectionDAG &DAG, if (!Subtarget.hasSSE2()) return SDValue(); - SDValue Op0 = N->getOperand(0); - SDValue Op1 = N->getOperand(1); - EVT VT = N->getValueType(0); // If the vector size is less than 128, or greater than the supported RegSize, @@ -43035,14 +43573,27 @@ static SDValue combineLoopMAddPattern(SDNode *N, SelectionDAG &DAG, if (!VT.isVector() || VT.getVectorNumElements() < 8) return SDValue(); - if (Op0.getOpcode() != ISD::MUL) - std::swap(Op0, Op1); - if (Op0.getOpcode() != ISD::MUL) - return SDValue(); + SDValue Op0 = N->getOperand(0); + SDValue Op1 = N->getOperand(1); - ShrinkMode Mode; - if (!canReduceVMulWidth(Op0.getNode(), DAG, Mode) || Mode == MULU16) - return SDValue(); + auto UsePMADDWD = 
[&](SDValue Op) { + ShrinkMode Mode; + return Op.getOpcode() == ISD::MUL && + canReduceVMulWidth(Op.getNode(), DAG, Mode) && Mode != MULU16 && + (!Subtarget.hasSSE41() || + (Op->isOnlyUserOf(Op.getOperand(0).getNode()) && + Op->isOnlyUserOf(Op.getOperand(1).getNode()))); + }; + + SDValue MulOp, OtherOp; + if (UsePMADDWD(Op0)) { + MulOp = Op0; + OtherOp = Op1; + } else if (UsePMADDWD(Op1)) { + MulOp = Op1; + OtherOp = Op0; + } else + return SDValue(); SDLoc DL(N); EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, @@ -43050,34 +43601,27 @@ static SDValue combineLoopMAddPattern(SDNode *N, SelectionDAG &DAG, EVT MAddVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, VT.getVectorNumElements() / 2); + // Shrink the operands of mul. + SDValue N0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, MulOp->getOperand(0)); + SDValue N1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, MulOp->getOperand(1)); + // Madd vector size is half of the original vector size auto PMADDWDBuilder = [](SelectionDAG &DAG, const SDLoc &DL, ArrayRef Ops) { MVT OpVT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32); return DAG.getNode(X86ISD::VPMADDWD, DL, OpVT, Ops); }; - - auto BuildPMADDWD = [&](SDValue Mul) { - // Shrink the operands of mul. - SDValue N0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, Mul.getOperand(0)); - SDValue N1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, Mul.getOperand(1)); - - SDValue Madd = SplitOpsAndApply(DAG, Subtarget, DL, MAddVT, { N0, N1 }, - PMADDWDBuilder); - // Fill the rest of the output with 0 - return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Madd, - DAG.getConstant(0, DL, MAddVT)); - }; - - Op0 = BuildPMADDWD(Op0); - - // It's possible that Op1 is also a mul we can reduce. - if (Op1.getOpcode() == ISD::MUL && - canReduceVMulWidth(Op1.getNode(), DAG, Mode) && Mode != MULU16) { - Op1 = BuildPMADDWD(Op1); - } - - return DAG.getNode(ISD::ADD, DL, VT, Op0, Op1); + SDValue Madd = SplitOpsAndApply(DAG, Subtarget, DL, MAddVT, { N0, N1 }, + PMADDWDBuilder); + // Fill the rest of the output with 0 + SDValue Zero = DAG.getConstant(0, DL, Madd.getSimpleValueType()); + SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Madd, Zero); + + // Preserve the reduction flag on the ADD. We may need to revisit for the + // other operand. + SDNodeFlags Flags; + Flags.setVectorReduction(true); + return DAG.getNode(ISD::ADD, DL, VT, Concat, OtherOp, Flags); } static SDValue combineLoopSADPattern(SDNode *N, SelectionDAG &DAG, @@ -43087,8 +43631,6 @@ static SDValue combineLoopSADPattern(SDNode *N, SelectionDAG &DAG, SDLoc DL(N); EVT VT = N->getValueType(0); - SDValue Op0 = N->getOperand(0); - SDValue Op1 = N->getOperand(1); // TODO: There's nothing special about i32, any integer type above i16 should // work just as well. @@ -43108,80 +43650,53 @@ static SDValue combineLoopSADPattern(SDNode *N, SelectionDAG &DAG, if (VT.getSizeInBits() / 4 > RegSize) return SDValue(); - // We know N is a reduction add, which means one of its operands is a phi. - // To match SAD, we need the other operand to be a ABS. - if (Op0.getOpcode() != ISD::ABS) - std::swap(Op0, Op1); - if (Op0.getOpcode() != ISD::ABS) - return SDValue(); - - auto BuildPSADBW = [&](SDValue Op0, SDValue Op1) { - // SAD pattern detected. Now build a SAD instruction and an addition for - // reduction. Note that the number of elements of the result of SAD is less - // than the number of elements of its input. Therefore, we could only update - // part of elements in the reduction vector. 
- SDValue Sad = createPSADBW(DAG, Op0, Op1, DL, Subtarget); - - // The output of PSADBW is a vector of i64. - // We need to turn the vector of i64 into a vector of i32. - // If the reduction vector is at least as wide as the psadbw result, just - // bitcast. If it's narrower, truncate - the high i32 of each i64 is zero - // anyway. - MVT ResVT = MVT::getVectorVT(MVT::i32, Sad.getValueSizeInBits() / 32); - if (VT.getSizeInBits() >= ResVT.getSizeInBits()) - Sad = DAG.getNode(ISD::BITCAST, DL, ResVT, Sad); - else - Sad = DAG.getNode(ISD::TRUNCATE, DL, VT, Sad); - - if (VT.getSizeInBits() > ResVT.getSizeInBits()) { - // Fill the upper elements with zero to match the add width. - SDValue Zero = DAG.getConstant(0, DL, VT); - Sad = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, Zero, Sad, - DAG.getIntPtrConstant(0, DL)); - } - - return Sad; - }; + // We know N is a reduction add. To match SAD, we need one of the operands to + // be an ABS. + SDValue AbsOp = N->getOperand(0); + SDValue OtherOp = N->getOperand(1); + if (AbsOp.getOpcode() != ISD::ABS) + std::swap(AbsOp, OtherOp); + if (AbsOp.getOpcode() != ISD::ABS) + return SDValue(); // Check whether we have an abs-diff pattern feeding into the select. SDValue SadOp0, SadOp1; - if (!detectZextAbsDiff(Op0, SadOp0, SadOp1)) - return SDValue(); - - Op0 = BuildPSADBW(SadOp0, SadOp1); - - // It's possible we have a sad on the other side too. - if (Op1.getOpcode() == ISD::ABS && - detectZextAbsDiff(Op1, SadOp0, SadOp1)) { - Op1 = BuildPSADBW(SadOp0, SadOp1); - } - - return DAG.getNode(ISD::ADD, DL, VT, Op0, Op1); -} - -/// Convert vector increment or decrement to sub/add with an all-ones constant: -/// add X, <1, 1...> --> sub X, <-1, -1...> -/// sub X, <1, 1...> --> add X, <-1, -1...> -/// The all-ones vector constant can be materialized using a pcmpeq instruction -/// that is commonly recognized as an idiom (has no register dependency), so -/// that's better/smaller than loading a splat 1 constant. -static SDValue combineIncDecVector(SDNode *N, SelectionDAG &DAG) { - assert((N->getOpcode() == ISD::ADD || N->getOpcode() == ISD::SUB) && - "Unexpected opcode for increment/decrement transform"); - - // Pseudo-legality check: getOnesVector() expects one of these types, so bail - // out and wait for legalization if we have an unsupported vector length. - EVT VT = N->getValueType(0); - if (!VT.is128BitVector() && !VT.is256BitVector() && !VT.is512BitVector()) - return SDValue(); - - APInt SplatVal; - if (!isConstantSplat(N->getOperand(1), SplatVal) || !SplatVal.isOneValue()) - return SDValue(); - - SDValue AllOnesVec = getOnesVector(VT, DAG, SDLoc(N)); - unsigned NewOpcode = N->getOpcode() == ISD::ADD ? ISD::SUB : ISD::ADD; - return DAG.getNode(NewOpcode, SDLoc(N), VT, N->getOperand(0), AllOnesVec); + if(!detectZextAbsDiff(AbsOp, SadOp0, SadOp1)) + return SDValue(); + + // SAD pattern detected. Now build a SAD instruction and an addition for + // reduction. Note that the number of elements of the result of SAD is less + // than the number of elements of its input. Therefore, we could only update + // part of elements in the reduction vector. + SDValue Sad = createPSADBW(DAG, SadOp0, SadOp1, DL, Subtarget); + + // The output of PSADBW is a vector of i64. + // We need to turn the vector of i64 into a vector of i32. + // If the reduction vector is at least as wide as the psadbw result, just + // bitcast. If it's narrower which can only occur for v2i32, bits 127:16 of + // the PSADBW will be zero. 
If we promote/narrow vectors, truncate the v2i64 + result to v2i32 which will be removed by type legalization. If we widen + narrow vectors then we bitcast to v4i32 and extract v2i32. + MVT ResVT = MVT::getVectorVT(MVT::i32, Sad.getValueSizeInBits() / 32); + Sad = DAG.getNode(ISD::BITCAST, DL, ResVT, Sad); + + if (VT.getSizeInBits() > ResVT.getSizeInBits()) { + // Fill the upper elements with zero to match the add width. + assert(VT.getSizeInBits() % ResVT.getSizeInBits() == 0 && "Unexpected VTs"); + unsigned NumConcats = VT.getSizeInBits() / ResVT.getSizeInBits(); + SmallVector Ops(NumConcats, DAG.getConstant(0, DL, ResVT)); + Ops[0] = Sad; + Sad = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Ops); + } else if (VT.getSizeInBits() < ResVT.getSizeInBits()) { + Sad = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Sad, + DAG.getIntPtrConstant(0, DL)); + } + + // Preserve the reduction flag on the ADD. We may need to revisit for the + // other operand. + SDNodeFlags Flags; + Flags.setVectorReduction(true); + return DAG.getNode(ISD::ADD, DL, VT, Sad, OtherOp, Flags); } static SDValue matchPMADDWD(SelectionDAG &DAG, SDValue Op0, SDValue Op1, @@ -43294,8 +43809,8 @@ static SDValue matchPMADDWD(SelectionDAG &DAG, SDValue Op0, SDValue Op1, } // Attempt to turn this pattern into PMADDWD. -// (mul (add (zext (build_vector)), (zext (build_vector))), -// (add (zext (build_vector)), (zext (build_vector))) +// (mul (add (sext (build_vector)), (sext (build_vector))), +// (add (sext (build_vector)), (sext (build_vector))) static SDValue matchPMADDWD_2(SelectionDAG &DAG, SDValue N0, SDValue N1, const SDLoc &DL, EVT VT, const X86Subtarget &Subtarget) { @@ -43415,6 +43930,7 @@ static SDValue matchPMADDWD_2(SelectionDAG &DAG, SDValue N0, SDValue N1, } static SDValue combineAdd(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { const SDNodeFlags Flags = N->getFlags(); if (Flags.hasVectorReduction()) { @@ -43445,8 +43961,29 @@ static SDValue combineAdd(SDNode *N, SelectionDAG &DAG, HADDBuilder); } - if (SDValue V = combineIncDecVector(N, DAG)) - return V; + // If vectors of i1 are legal, turn (add (zext (vXi1 X)), Y) into + // (sub Y, (sext (vXi1 X))). + // FIXME: We have the (sub Y, (zext (vXi1 X))) -> (add (sext (vXi1 X)), Y) in + // generic DAG combine without a legal type check, but adding this there + // caused regressions. + if (VT.isVector()) { + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + if (Op0.getOpcode() == ISD::ZERO_EXTEND && + Op0.getOperand(0).getValueType().getVectorElementType() == MVT::i1 && + TLI.isTypeLegal(Op0.getOperand(0).getValueType())) { + SDLoc DL(N); + SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Op0.getOperand(0)); + return DAG.getNode(ISD::SUB, DL, VT, Op1, SExt); + } + + if (Op1.getOpcode() == ISD::ZERO_EXTEND && + Op1.getOperand(0).getValueType().getVectorElementType() == MVT::i1 && + TLI.isTypeLegal(Op1.getOperand(0).getValueType())) { + SDLoc DL(N); + SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Op1.getOperand(0)); + return DAG.getNode(ISD::SUB, DL, VT, Op0, SExt); + } + } return combineAddOrSubToADCOrSBB(N, DAG); } @@ -43457,13 +43994,15 @@ static SDValue combineSubToSubus(SDNode *N, SelectionDAG &DAG, SDValue Op1 = N->getOperand(1); EVT VT = N->getValueType(0); + if (!VT.isVector()) + return SDValue(); + // PSUBUS is supported, starting from SSE2, but truncation for v8i32 // is only worth it with SSSE3 (PSHUFB).
- if (!(Subtarget.hasSSE2() && (VT == MVT::v16i8 || VT == MVT::v8i16)) && + EVT EltVT = VT.getVectorElementType(); + if (!(Subtarget.hasSSE2() && (EltVT == MVT::i8 || EltVT == MVT::i16)) && !(Subtarget.hasSSSE3() && (VT == MVT::v8i32 || VT == MVT::v8i64)) && - !(Subtarget.hasAVX() && (VT == MVT::v32i8 || VT == MVT::v16i16)) && - !(Subtarget.useBWIRegs() && (VT == MVT::v64i8 || VT == MVT::v32i16 || - VT == MVT::v16i32 || VT == MVT::v8i64))) + !(Subtarget.useBWIRegs() && (VT == MVT::v16i32))) return SDValue(); SDValue SubusLHS, SubusRHS; @@ -43493,16 +44032,13 @@ static SDValue combineSubToSubus(SDNode *N, SelectionDAG &DAG, } else return SDValue(); - auto USUBSATBuilder = [](SelectionDAG &DAG, const SDLoc &DL, - ArrayRef Ops) { - return DAG.getNode(ISD::USUBSAT, DL, Ops[0].getValueType(), Ops); - }; - // PSUBUS doesn't support v8i32/v8i64/v16i32, but it can be enabled with // special preprocessing in some cases. - if (VT != MVT::v8i32 && VT != MVT::v16i32 && VT != MVT::v8i64) - return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, - { SubusLHS, SubusRHS }, USUBSATBuilder); + if (EltVT == MVT::i8 || EltVT == MVT::i16) + return DAG.getNode(ISD::USUBSAT, SDLoc(N), VT, SubusLHS, SubusRHS); + + assert((VT == MVT::v8i32 || VT == MVT::v16i32 || VT == MVT::v8i64) && + "Unexpected VT!"); // Special preprocessing case can be only applied // if the value was zero extended from 16 bit, @@ -43531,15 +44067,16 @@ static SDValue combineSubToSubus(SDNode *N, SelectionDAG &DAG, SDValue NewSubusLHS = DAG.getZExtOrTrunc(SubusLHS, SDLoc(SubusLHS), ShrinkedType); SDValue NewSubusRHS = DAG.getZExtOrTrunc(UMin, SDLoc(SubusRHS), ShrinkedType); - SDValue Psubus = - SplitOpsAndApply(DAG, Subtarget, SDLoc(N), ShrinkedType, - { NewSubusLHS, NewSubusRHS }, USUBSATBuilder); + SDValue Psubus = DAG.getNode(ISD::USUBSAT, SDLoc(N), ShrinkedType, + NewSubusLHS, NewSubusRHS); + // Zero extend the result, it may be used somewhere as 32 bit, // if not zext and following trunc will shrink. return DAG.getZExtOrTrunc(Psubus, SDLoc(N), ExtType); } static SDValue combineSub(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { SDValue Op0 = N->getOperand(0); SDValue Op1 = N->getOperand(1); @@ -43576,9 +44113,6 @@ static SDValue combineSub(SDNode *N, SelectionDAG &DAG, HSUBBuilder); } - if (SDValue V = combineIncDecVector(N, DAG)) - return V; - // Try to create PSUBUS if SUB's argument is max/min if (SDValue V = combineSubToSubus(N, DAG, Subtarget)) return V; @@ -43712,14 +44246,6 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT, } } - // If we're inserting all zeros into the upper half, change this to - // an insert into an all zeros vector. We will match this to a move - // with implicit upper bit zeroing during isel. - if (Ops.size() == 2 && ISD::isBuildVectorAllZeros(Ops[1].getNode())) - return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, - getZeroVector(VT, Subtarget, DAG, DL), Ops[0], - DAG.getIntPtrConstant(0, DL)); - return SDValue(); } @@ -43786,10 +44312,10 @@ static SDValue combineInsertSubvector(SDNode *N, SelectionDAG &DAG, // least as large as the original insertion. Just insert the original // subvector into a zero vector. 
if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR && IdxVal == 0 && - SubVec.getConstantOperandAPInt(1) == 0 && + isNullConstant(SubVec.getOperand(1)) && SubVec.getOperand(0).getOpcode() == ISD::INSERT_SUBVECTOR) { SDValue Ins = SubVec.getOperand(0); - if (Ins.getConstantOperandAPInt(2) == 0 && + if (isNullConstant(Ins.getOperand(2)) && ISD::isBuildVectorAllZeros(Ins.getOperand(0).getNode()) && Ins.getOperand(1).getValueSizeInBits() <= SubVecVT.getSizeInBits()) return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, @@ -43825,31 +44351,42 @@ static SDValue combineInsertSubvector(SDNode *N, SelectionDAG &DAG, // Match concat_vector style patterns. SmallVector SubVectorOps; - if (collectConcatOps(N, SubVectorOps)) + if (collectConcatOps(N, SubVectorOps)) { if (SDValue Fold = combineConcatVectorOps(dl, OpVT, SubVectorOps, DAG, DCI, Subtarget)) return Fold; - // If we are inserting into both halves of the vector, the starting vector - // should be undef. If it isn't, make it so. Only do this if the early insert - // has no other uses. - // TODO: Should this be a generic DAG combine? - // TODO: Why doesn't SimplifyDemandedVectorElts catch this? - if ((IdxVal == OpVT.getVectorNumElements() / 2) && - Vec.getOpcode() == ISD::INSERT_SUBVECTOR && - OpVT.getSizeInBits() == SubVecVT.getSizeInBits() * 2 && - isNullConstant(Vec.getOperand(2)) && !Vec.getOperand(0).isUndef() && - Vec.hasOneUse()) { - Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, DAG.getUNDEF(OpVT), - Vec.getOperand(1), Vec.getOperand(2)); - return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Vec, SubVec, - N->getOperand(2)); + // If we're inserting all zeros into the upper half, change this to + // a concat with zero. We will match this to a move + // with implicit upper bit zeroing during isel. + // We do this here because we don't want combineConcatVectorOps to + // create INSERT_SUBVECTOR from CONCAT_VECTORS. + if (SubVectorOps.size() == 2 && + ISD::isBuildVectorAllZeros(SubVectorOps[1].getNode())) + return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, + getZeroVector(OpVT, Subtarget, DAG, dl), + SubVectorOps[0], DAG.getIntPtrConstant(0, dl)); } // If this is a broadcast insert into an upper undef, use a larger broadcast. if (Vec.isUndef() && IdxVal != 0 && SubVec.getOpcode() == X86ISD::VBROADCAST) return DAG.getNode(X86ISD::VBROADCAST, dl, OpVT, SubVec.getOperand(0)); + // If this is a broadcast load inserted into an upper undef, use a larger + // broadcast load. 
+ if (Vec.isUndef() && IdxVal != 0 && SubVec.hasOneUse() && + SubVec.getOpcode() == X86ISD::VBROADCAST_LOAD) { + auto *MemIntr = cast(SubVec); + SDVTList Tys = DAG.getVTList(OpVT, MVT::Other); + SDValue Ops[] = { MemIntr->getChain(), MemIntr->getBasePtr() }; + SDValue BcastLd = + DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops, + MemIntr->getMemoryVT(), + MemIntr->getMemOperand()); + DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), BcastLd.getValue(1)); + return BcastLd; + } + return SDValue(); } @@ -43928,12 +44465,15 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG, return SDValue(); MVT VT = N->getSimpleValueType(0); - EVT WideVecVT = N->getOperand(0).getValueType(); - SDValue WideVec = peekThroughBitcasts(N->getOperand(0)); + SDValue InVec = N->getOperand(0); + SDValue InVecBC = peekThroughBitcasts(InVec); + EVT InVecVT = InVec.getValueType(); + EVT InVecBCVT = InVecBC.getValueType(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + if (Subtarget.hasAVX() && !Subtarget.hasAVX2() && - TLI.isTypeLegal(WideVecVT) && - WideVecVT.getSizeInBits() == 256 && WideVec.getOpcode() == ISD::AND) { + TLI.isTypeLegal(InVecVT) && + InVecVT.getSizeInBits() == 256 && InVecBC.getOpcode() == ISD::AND) { auto isConcatenatedNot = [] (SDValue V) { V = peekThroughBitcasts(V); if (!isBitwiseNot(V)) @@ -43941,12 +44481,12 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG, SDValue NotOp = V->getOperand(0); return peekThroughBitcasts(NotOp).getOpcode() == ISD::CONCAT_VECTORS; }; - if (isConcatenatedNot(WideVec.getOperand(0)) || - isConcatenatedNot(WideVec.getOperand(1))) { + if (isConcatenatedNot(InVecBC.getOperand(0)) || + isConcatenatedNot(InVecBC.getOperand(1))) { // extract (and v4i64 X, (not (concat Y1, Y2))), n -> andnp v2i64 X(n), Y1 - SDValue Concat = split256IntArith(WideVec, DAG); + SDValue Concat = split256IntArith(InVecBC, DAG); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), VT, - DAG.getBitcast(WideVecVT, Concat), N->getOperand(1)); + DAG.getBitcast(InVecVT, Concat), N->getOperand(1)); } } @@ -43956,7 +44496,6 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG, if (SDValue V = narrowExtractedVectorSelect(N, DAG)) return V; - SDValue InVec = N->getOperand(0); unsigned IdxVal = cast(N->getOperand(1))->getZExtValue(); if (ISD::isBuildVectorAllZeros(InVec.getNode())) @@ -43976,31 +44515,42 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG, // Try to move vector bitcast after extract_subv by scaling extraction index: // extract_subv (bitcast X), Index --> bitcast (extract_subv X, Index') // TODO: Move this to DAGCombiner::visitEXTRACT_SUBVECTOR - if (InVec.getOpcode() == ISD::BITCAST && - InVec.getOperand(0).getValueType().isVector()) { - SDValue SrcOp = InVec.getOperand(0); - EVT SrcVT = SrcOp.getValueType(); - unsigned SrcNumElts = SrcVT.getVectorNumElements(); - unsigned DestNumElts = InVec.getValueType().getVectorNumElements(); + if (InVec != InVecBC && InVecBCVT.isVector()) { + unsigned SrcNumElts = InVecBCVT.getVectorNumElements(); + unsigned DestNumElts = InVecVT.getVectorNumElements(); if ((DestNumElts % SrcNumElts) == 0) { unsigned DestSrcRatio = DestNumElts / SrcNumElts; if ((VT.getVectorNumElements() % DestSrcRatio) == 0) { unsigned NewExtNumElts = VT.getVectorNumElements() / DestSrcRatio; EVT NewExtVT = EVT::getVectorVT(*DAG.getContext(), - SrcVT.getScalarType(), NewExtNumElts); + InVecBCVT.getScalarType(), NewExtNumElts); if ((N->getConstantOperandVal(1) % DestSrcRatio) == 0 && 
TLI.isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, NewExtVT)) { unsigned IndexValScaled = N->getConstantOperandVal(1) / DestSrcRatio; SDLoc DL(N); SDValue NewIndex = DAG.getIntPtrConstant(IndexValScaled, DL); SDValue NewExtract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NewExtVT, - SrcOp, NewIndex); + InVecBC, NewIndex); return DAG.getBitcast(VT, NewExtract); } } } } + // If we are extracting from an insert into a zero vector, replace with a + // smaller insert into zero if we don't access less than the original + // subvector. Don't do this for i1 vectors. + if (VT.getVectorElementType() != MVT::i1 && + InVec.getOpcode() == ISD::INSERT_SUBVECTOR && IdxVal == 0 && + InVec.hasOneUse() && isNullConstant(InVec.getOperand(2)) && + ISD::isBuildVectorAllZeros(InVec.getOperand(0).getNode()) && + InVec.getOperand(1).getValueSizeInBits() <= VT.getSizeInBits()) { + SDLoc DL(N); + return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, + getZeroVector(VT, Subtarget, DAG, DL), + InVec.getOperand(1), InVec.getOperand(2)); + } + // If we're extracting from a broadcast then we're better off just // broadcasting to the smaller type directly, assuming this is the only use. // As its a broadcast we don't care about the extraction index. @@ -44008,11 +44558,25 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG, InVec.getOperand(0).getValueSizeInBits() <= VT.getSizeInBits()) return DAG.getNode(X86ISD::VBROADCAST, SDLoc(N), VT, InVec.getOperand(0)); + if (InVec.getOpcode() == X86ISD::VBROADCAST_LOAD && InVec.hasOneUse()) { + auto *MemIntr = cast(InVec); + if (MemIntr->getMemoryVT().getSizeInBits() <= VT.getSizeInBits()) { + SDVTList Tys = DAG.getVTList(VT, MVT::Other); + SDValue Ops[] = { MemIntr->getChain(), MemIntr->getBasePtr() }; + SDValue BcastLd = + DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, SDLoc(N), Tys, Ops, + MemIntr->getMemoryVT(), + MemIntr->getMemOperand()); + DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), BcastLd.getValue(1)); + return BcastLd; + } + } + // If we're extracting the lowest subvector and we're the only user, // we may be able to perform this with a smaller vector width. if (IdxVal == 0 && InVec.hasOneUse()) { unsigned InOpcode = InVec.getOpcode(); - if (VT == MVT::v2f64 && InVec.getValueType() == MVT::v4f64) { + if (VT == MVT::v2f64 && InVecVT == MVT::v4f64) { // v2f64 CVTDQ2PD(v4i32). if (InOpcode == ISD::SINT_TO_FP && InVec.getOperand(0).getValueType() == MVT::v4i32) { @@ -44093,7 +44657,8 @@ static SDValue combineScalarToVector(SDNode *N, SelectionDAG &DAG) { // Simplify PMULDQ and PMULUDQ operations. static SDValue combinePMULDQ(SDNode *N, SelectionDAG &DAG, - TargetLowering::DAGCombinerInfo &DCI) { + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget &Subtarget) { SDValue LHS = N->getOperand(0); SDValue RHS = N->getOperand(1); @@ -44103,23 +44668,43 @@ static SDValue combinePMULDQ(SDNode *N, SelectionDAG &DAG, return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), RHS, LHS); // Multiply by zero. + // Don't return RHS as it may contain UNDEFs. if (ISD::isBuildVectorAllZeros(RHS.getNode())) - return RHS; - - // Aggressively peek through ops to get at the demanded low bits. - APInt DemandedMask = APInt::getLowBitsSet(64, 32); - SDValue DemandedLHS = DAG.GetDemandedBits(LHS, DemandedMask); - SDValue DemandedRHS = DAG.GetDemandedBits(RHS, DemandedMask); - if (DemandedLHS || DemandedRHS) - return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), - DemandedLHS ? DemandedLHS : LHS, - DemandedRHS ? 
DemandedRHS : RHS); + return DAG.getConstant(0, SDLoc(N), N->getValueType(0)); // PMULDQ/PMULUDQ only uses lower 32 bits from each vector element. const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnesValue(64), DCI)) return SDValue(N, 0); + // If the input is an extend_invec and the SimplifyDemandedBits call didn't + // convert it to any_extend_invec, due to the LegalOperations check, do the + // conversion directly to a vector shuffle manually. This exposes combine + // opportunities missed by combineExtInVec not calling + // combineX86ShufflesRecursively on SSE4.1 targets. + // FIXME: This is basically a hack around several other issues related to + // ANY_EXTEND_VECTOR_INREG. + if (N->getValueType(0) == MVT::v2i64 && LHS.hasOneUse() && + (LHS.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG || + LHS.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG) && + LHS.getOperand(0).getValueType() == MVT::v4i32) { + SDLoc dl(N); + LHS = DAG.getVectorShuffle(MVT::v4i32, dl, LHS.getOperand(0), + LHS.getOperand(0), { 0, -1, 1, -1 }); + LHS = DAG.getBitcast(MVT::v2i64, LHS); + return DAG.getNode(N->getOpcode(), dl, MVT::v2i64, LHS, RHS); + } + if (N->getValueType(0) == MVT::v2i64 && RHS.hasOneUse() && + (RHS.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG || + RHS.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG) && + RHS.getOperand(0).getValueType() == MVT::v4i32) { + SDLoc dl(N); + RHS = DAG.getVectorShuffle(MVT::v4i32, dl, RHS.getOperand(0), + RHS.getOperand(0), { 0, -1, 1, -1 }); + RHS = DAG.getBitcast(MVT::v2i64, RHS); + return DAG.getNode(N->getOpcode(), dl, MVT::v2i64, LHS, RHS); + } + return SDValue(); } @@ -44134,7 +44719,7 @@ static SDValue combineExtInVec(SDNode *N, SelectionDAG &DAG, if (!DCI.isBeforeLegalizeOps() && ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) { auto *Ld = cast(In); - if (!Ld->isVolatile()) { + if (Ld->isSimple()) { MVT SVT = In.getSimpleValueType().getVectorElementType(); ISD::LoadExtType Ext = N->getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG ? ISD::SEXTLOAD : ISD::ZEXTLOAD; EVT MemVT = EVT::getVectorVT(*DAG.getContext(), SVT, @@ -44150,17 +44735,6 @@ static SDValue combineExtInVec(SDNode *N, SelectionDAG &DAG, } } - // Disabling for widening legalization for now. We can enable if we find a - // case that needs it. Otherwise it can be deleted when we switch to - // widening legalization. - if (ExperimentalVectorWideningLegalization) - return SDValue(); - - // Combine (ext_invec (ext_invec X)) -> (ext_invec X) - if (In.getOpcode() == N->getOpcode() && - TLI.isTypeLegal(VT) && TLI.isTypeLegal(In.getOperand(0).getValueType())) - return DAG.getNode(N->getOpcode(), SDLoc(N), VT, In.getOperand(0)); - // Attempt to combine as a shuffle. 
// TODO: SSE41 support if (Subtarget.hasAVX() && N->getOpcode() != ISD::SIGN_EXTEND_VECTOR_INREG) { @@ -44173,6 +44747,20 @@ static SDValue combineExtInVec(SDNode *N, SelectionDAG &DAG, return SDValue(); } +static SDValue combineKSHIFT(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI) { + EVT VT = N->getValueType(0); + + APInt KnownUndef, KnownZero; + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + APInt DemandedElts = APInt::getAllOnesValue(VT.getVectorNumElements()); + if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, KnownUndef, + KnownZero, DCI)) + return SDValue(N, 0); + + return SDValue(); +} + SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; @@ -44196,8 +44784,8 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case ISD::BITCAST: return combineBitcast(N, DAG, DCI, Subtarget); case X86ISD::CMOV: return combineCMov(N, DAG, DCI, Subtarget); case X86ISD::CMP: return combineCMP(N, DAG); - case ISD::ADD: return combineAdd(N, DAG, Subtarget); - case ISD::SUB: return combineSub(N, DAG, Subtarget); + case ISD::ADD: return combineAdd(N, DAG, DCI, Subtarget); + case ISD::SUB: return combineSub(N, DAG, DCI, Subtarget); case X86ISD::ADD: case X86ISD::SUB: return combineX86AddSub(N, DAG, DCI); case X86ISD::SBB: return combineSBB(N, DAG); @@ -44214,12 +44802,13 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case ISD::MLOAD: return combineMaskedLoad(N, DAG, DCI, Subtarget); case ISD::STORE: return combineStore(N, DAG, DCI, Subtarget); case ISD::MSTORE: return combineMaskedStore(N, DAG, DCI, Subtarget); - case ISD::SINT_TO_FP: return combineSIntToFP(N, DAG, Subtarget); + case ISD::SINT_TO_FP: return combineSIntToFP(N, DAG, DCI, Subtarget); case ISD::UINT_TO_FP: return combineUIntToFP(N, DAG, Subtarget); case ISD::FADD: case ISD::FSUB: return combineFaddFsub(N, DAG, Subtarget); case ISD::FNEG: return combineFneg(N, DAG, Subtarget); case ISD::TRUNCATE: return combineTruncate(N, DAG, Subtarget); + case X86ISD::VTRUNC: return combineVTRUNC(N, DAG); case X86ISD::ANDNP: return combineAndnp(N, DAG, DCI, Subtarget); case X86ISD::FAND: return combineFAnd(N, DAG, Subtarget); case X86ISD::FANDN: return combineFAndn(N, DAG, Subtarget); @@ -44299,20 +44888,22 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case X86ISD::FNMADD_RND: case X86ISD::FNMSUB: case X86ISD::FNMSUB_RND: - case ISD::FMA: return combineFMA(N, DAG, Subtarget); + case ISD::FMA: return combineFMA(N, DAG, DCI, Subtarget); case X86ISD::FMADDSUB_RND: case X86ISD::FMSUBADD_RND: case X86ISD::FMADDSUB: - case X86ISD::FMSUBADD: return combineFMADDSUB(N, DAG, Subtarget); - case X86ISD::MOVMSK: return combineMOVMSK(N, DAG, DCI); + case X86ISD::FMSUBADD: return combineFMADDSUB(N, DAG, DCI); + case X86ISD::MOVMSK: return combineMOVMSK(N, DAG, DCI, Subtarget); case X86ISD::MGATHER: - case X86ISD::MSCATTER: + case X86ISD::MSCATTER: return combineX86GatherScatter(N, DAG, DCI); case ISD::MGATHER: - case ISD::MSCATTER: return combineGatherScatter(N, DAG, DCI, Subtarget); + case ISD::MSCATTER: return combineGatherScatter(N, DAG, DCI); case X86ISD::PCMPEQ: case X86ISD::PCMPGT: return combineVectorCompare(N, DAG, Subtarget); case X86ISD::PMULDQ: - case X86ISD::PMULUDQ: return combinePMULDQ(N, DAG, DCI); + case X86ISD::PMULUDQ: return combinePMULDQ(N, DAG, DCI, Subtarget); + case X86ISD::KSHIFTL: + case X86ISD::KSHIFTR: return combineKSHIFT(N, DAG, DCI); } return SDValue(); @@ -44660,10 +45251,11 @@ 
X86TargetLowering::getConstraintType(StringRef Constraint) const { case 'I': case 'J': case 'K': - case 'L': - case 'M': case 'N': case 'G': + case 'L': + case 'M': + return C_Immediate; case 'C': case 'e': case 'Z': @@ -45175,8 +45767,9 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, if (VConstraint && Subtarget.hasVLX()) return std::make_pair(0U, &X86::FR64XRegClass); return std::make_pair(0U, &X86::FR64RegClass); - // TODO: Handle f128 and i128 in FR128RegClass after it is tested well. - // Vector types. + // TODO: Handle i128 in FR128RegClass after it is tested well. + // Vector types and fp128. + case MVT::f128: case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: @@ -45469,7 +46062,7 @@ void X86TargetLowering::insertCopiesSplitCSR( else llvm_unreachable("Unexpected register class in CSRsViaCopy!"); - unsigned NewVR = MRI->createVirtualRegister(RC); + Register NewVR = MRI->createVirtualRegister(RC); // Create copy from CSR to a virtual register. // FIXME: this currently does not emit CFI pseudo-instructions, it works // fine for CXX_FAST_TLS since the C++-style TLS access functions should be @@ -45514,3 +46107,16 @@ X86TargetLowering::getStackProbeSymbolName(MachineFunction &MF) const { return Subtarget.isTargetCygMing() ? "___chkstk_ms" : "__chkstk"; return Subtarget.isTargetCygMing() ? "_alloca" : "_chkstk"; } + +unsigned +X86TargetLowering::getStackProbeSize(MachineFunction &MF) const { + // The default stack probe size is 4096 if the function has no stackprobesize + // attribute. + unsigned StackProbeSize = 4096; + const Function &Fn = MF.getFunction(); + if (Fn.hasFnAttribute("stack-probe-size")) + Fn.getFnAttribute("stack-probe-size") + .getValueAsString() + .getAsInteger(0, StackProbeSize); + return StackProbeSize; +} diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h index e0be03bc3f9d..6f7e90008de4 100644 --- a/lib/Target/X86/X86ISelLowering.h +++ b/lib/Target/X86/X86ISelLowering.h @@ -17,7 +17,6 @@ #include "llvm/CodeGen/CallingConvLower.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/CodeGen/TargetLowering.h" -#include "llvm/Target/TargetOptions.h" namespace llvm { class X86Subtarget; @@ -144,6 +143,10 @@ namespace llvm { /// relative displacements. WrapperRIP, + /// Copies a 64-bit value from an MMX vector to the low word + /// of an XMM vector, with the high word zero filled. + MOVQ2DQ, + /// Copies a 64-bit value from the low word of an XMM vector /// to an MMX vector. MOVDQ2Q, @@ -422,7 +425,8 @@ namespace llvm { // Tests Types Of a FP Values for scalar types. VFPCLASSS, - // Broadcast scalar to vector. + // Broadcast (splat) scalar or element 0 of a vector. If the operand is + // a vector, this node may change the vector length as part of the splat. VBROADCAST, // Broadcast mask to vector. VBROADCASTM, @@ -611,6 +615,9 @@ namespace llvm { // extract_vector_elt, store. VEXTRACT_STORE, + // scalar broadcast from memory + VBROADCAST_LOAD, + // Store FP control world into i16 memory. FNSTCW16m, @@ -680,6 +687,9 @@ namespace llvm { bool isCalleePop(CallingConv::ID CallingConv, bool is64Bit, bool IsVarArg, bool GuaranteeTCO); + /// If Op is a constant whose elements are all the same constant or + /// undefined, return true and return the constant value in \p SplatVal. + bool isConstantSplat(SDValue Op, APInt &SplatVal); } // end namespace X86 //===--------------------------------------------------------------------===// @@ -792,6 +802,17 @@ namespace llvm { /// and some i16 instructions are slow. 
bool IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const override; + /// Return 1 if we can compute the negated form of the specified expression + /// for the same cost as the expression itself, or 2 if we can compute the + /// negated form more cheaply than the expression itself. Else return 0. + char isNegatibleForFree(SDValue Op, SelectionDAG &DAG, bool LegalOperations, + bool ForCodeSize, unsigned Depth) const override; + + /// If isNegatibleForFree returns true, return the newly negated expression. + SDValue getNegatedExpression(SDValue Op, SelectionDAG &DAG, + bool LegalOperations, bool ForCodeSize, + unsigned Depth) const override; + MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const override; @@ -840,6 +861,13 @@ namespace llvm { bool hasAndNot(SDValue Y) const override; + bool hasBitTest(SDValue X, SDValue Y) const override; + + bool shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd( + SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y, + unsigned OldShiftOpcode, unsigned NewShiftOpcode, + SelectionDAG &DAG) const override; + bool shouldFoldConstantShiftPairToMask(const SDNode *N, CombineLevel Level) const override; @@ -863,11 +891,7 @@ namespace llvm { return VTIsOk(XVT) && VTIsOk(KeptBitsVT); } - bool shouldExpandShift(SelectionDAG &DAG, SDNode *N) const override { - if (DAG.getMachineFunction().getFunction().hasMinSize()) - return false; - return true; - } + bool shouldExpandShift(SelectionDAG &DAG, SDNode *N) const override; bool shouldSplatInsEltVarIndex(EVT VT) const override; @@ -913,6 +937,10 @@ namespace llvm { TargetLoweringOpt &TLO, unsigned Depth) const override; + SDValue SimplifyMultipleUseDemandedBitsForTargetNode( + SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, + SelectionDAG &DAG, unsigned Depth) const override; + const Constant *getTargetConstantFromLoad(LoadSDNode *LD) const override; SDValue unwrapAddress(SDValue N) const override; @@ -1090,11 +1118,12 @@ namespace llvm { bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override; - bool reduceSelectOfFPConstantLoads(bool IsFPSetCC) const override; + bool reduceSelectOfFPConstantLoads(EVT CmpOpVT) const override; bool convertSelectOfConstantsToMath(EVT VT) const override; - bool decomposeMulByConstant(EVT VT, SDValue C) const override; + bool decomposeMulByConstant(LLVMContext &Context, EVT VT, + SDValue C) const override; bool shouldUseStrictFP_TO_INT(EVT FpVT, EVT IntVT, bool IsSigned) const override; @@ -1136,8 +1165,8 @@ namespace llvm { return nullptr; // nothing to do, move along. } - unsigned getRegisterByName(const char* RegName, EVT VT, - SelectionDAG &DAG) const override; + Register getRegisterByName(const char* RegName, EVT VT, + const MachineFunction &MF) const override; /// If a physical register, this returns the register that receives the /// exception address on entry to an EH pad. 
@@ -1189,12 +1218,18 @@ namespace llvm { CallingConv::ID CC, EVT VT) const override; + unsigned getVectorTypeBreakdownForCallingConv( + LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, + unsigned &NumIntermediates, MVT &RegisterVT) const override; + bool isIntDivCheap(EVT VT, AttributeList Attr) const override; bool supportSwiftError() const override; StringRef getStackProbeSymbolName(MachineFunction &MF) const override; + unsigned getStackProbeSize(MachineFunction &MF) const; + bool hasVectorBlend() const override { return true; } unsigned getMaxSupportedInterleaveFactor() const override { return 4; } @@ -1326,6 +1361,12 @@ namespace llvm { SDValue LowerGC_TRANSITION_START(SDValue Op, SelectionDAG &DAG) const; SDValue LowerGC_TRANSITION_END(SDValue Op, SelectionDAG &DAG) const; SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerFaddFsub(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const; + + SDValue LowerF128Call(SDValue Op, SelectionDAG &DAG, + RTLIB::Libcall Call) const; SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, @@ -1372,6 +1413,9 @@ namespace llvm { LoadInst * lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override; + bool lowerAtomicStoreAsStoreSDNode(const StoreInst &SI) const override; + bool lowerAtomicLoadAsLoadSDNode(const LoadInst &LI) const override; + bool needsCmpXchgNb(Type *MemType) const; void SetupEntryBlockForSjLj(MachineInstr &MI, MachineBasicBlock *MBB, @@ -1462,6 +1506,9 @@ namespace llvm { /// Reassociate floating point divisions into multiply by reciprocal. unsigned combineRepeatedFPDivisors() const override; + + SDValue BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG, + SmallVectorImpl &Created) const override; }; namespace X86 { @@ -1625,24 +1672,24 @@ namespace llvm { /// mask. This is the reverse process to canWidenShuffleElements, but can /// always succeed. template - void scaleShuffleMask(int Scale, ArrayRef Mask, + void scaleShuffleMask(size_t Scale, ArrayRef Mask, SmallVectorImpl &ScaledMask) { assert(0 < Scale && "Unexpected scaling factor"); size_t NumElts = Mask.size(); ScaledMask.assign(NumElts * Scale, -1); - for (int i = 0; i != (int)NumElts; ++i) { + for (size_t i = 0; i != NumElts; ++i) { int M = Mask[i]; // Repeat sentinel values in every mask element. if (M < 0) { - for (int s = 0; s != Scale; ++s) + for (size_t s = 0; s != Scale; ++s) ScaledMask[(Scale * i) + s] = M; continue; } // Scale mask element and increment across each mask element. 
- for (int s = 0; s != Scale; ++s) + for (size_t s = 0; s != Scale; ++s) ScaledMask[(Scale * i) + s] = (Scale * M) + s; } } diff --git a/lib/Target/X86/X86IndirectBranchTracking.cpp b/lib/Target/X86/X86IndirectBranchTracking.cpp index 04e8b2231fec..cc0f59ab329d 100644 --- a/lib/Target/X86/X86IndirectBranchTracking.cpp +++ b/lib/Target/X86/X86IndirectBranchTracking.cpp @@ -84,7 +84,7 @@ bool X86IndirectBranchTrackingPass::addENDBR( return false; } -bool IsCallReturnTwice(llvm::MachineOperand &MOp) { +static bool IsCallReturnTwice(llvm::MachineOperand &MOp) { if (!MOp.isGlobal()) return false; auto *CalleeFn = dyn_cast(MOp.getGlobal()); diff --git a/lib/Target/X86/X86InsertPrefetch.cpp b/lib/Target/X86/X86InsertPrefetch.cpp index 02ae73706a34..2b1e3f23efd7 100644 --- a/lib/Target/X86/X86InsertPrefetch.cpp +++ b/lib/Target/X86/X86InsertPrefetch.cpp @@ -79,8 +79,8 @@ ErrorOr getPrefetchHints(const FunctionSamples *TopSamples, // The prefetch instruction can't take memory operands involving vector // registers. bool IsMemOpCompatibleWithPrefetch(const MachineInstr &MI, int Op) { - unsigned BaseReg = MI.getOperand(Op + X86::AddrBaseReg).getReg(); - unsigned IndexReg = MI.getOperand(Op + X86::AddrIndexReg).getReg(); + Register BaseReg = MI.getOperand(Op + X86::AddrBaseReg).getReg(); + Register IndexReg = MI.getOperand(Op + X86::AddrIndexReg).getReg(); return (BaseReg == 0 || X86MCRegisterClasses[X86::GR64RegClassID].contains(BaseReg) || X86MCRegisterClasses[X86::GR32RegClassID].contains(BaseReg)) && @@ -108,7 +108,7 @@ bool X86InsertPrefetch::findPrefetchInfo(const FunctionSamples *TopSamples, Prefetches &Prefetches) const { assert(Prefetches.empty() && "Expected caller passed empty PrefetchInfo vector."); - static const std::pair HintTypes[] = { + static constexpr std::pair HintTypes[] = { {"_nta_", X86::PREFETCHNTA}, {"_t0_", X86::PREFETCHT0}, {"_t1_", X86::PREFETCHT1}, @@ -173,7 +173,7 @@ bool X86InsertPrefetch::doInitialization(Module &M) { void X86InsertPrefetch::getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesAll(); - AU.addRequired(); + AU.addRequired(); } bool X86InsertPrefetch::runOnMachineFunction(MachineFunction &MF) { diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td index 54eddeacaa17..9b5de59430a5 100644 --- a/lib/Target/X86/X86InstrAVX512.td +++ b/lib/Target/X86/X86InstrAVX512.td @@ -74,6 +74,7 @@ class X86VectorVTInfo("alignedload" # VTName); PatFrag ScalarLdFrag = !cast("load" # EltVT); + PatFrag BroadcastLdFrag = !cast("X86VBroadcastld" # EltSizeName); ComplexPattern ScalarIntMemCPat = !if (!eq (EltTypeName, "f32"), !cast("sse_load_f32"), @@ -412,6 +413,14 @@ def AVX512_512_SETALLONES : I<0, Pseudo, (outs VR512:$dst), (ins), "", [(set VR512:$dst, (v16i32 immAllOnesV))]>; } +let Predicates = [HasAVX512] in { +def : Pat<(v64i8 immAllZerosV), (AVX512_512_SET0)>; +def : Pat<(v32i16 immAllZerosV), (AVX512_512_SET0)>; +def : Pat<(v8i64 immAllZerosV), (AVX512_512_SET0)>; +def : Pat<(v16f32 immAllZerosV), (AVX512_512_SET0)>; +def : Pat<(v8f64 immAllZerosV), (AVX512_512_SET0)>; +} + // Alias instructions that allow VPTERNLOG to be used with a mask to create // a mix of all ones and all zeros elements. This is done this way to force // the same register to be used as input for all three sources. 
@@ -436,6 +445,19 @@ def AVX512_256_SET0 : I<0, Pseudo, (outs VR256X:$dst), (ins), "", [(set VR256X:$dst, (v8i32 immAllZerosV))]>; } +let Predicates = [HasAVX512] in { +def : Pat<(v8i16 immAllZerosV), (AVX512_128_SET0)>; +def : Pat<(v16i8 immAllZerosV), (AVX512_128_SET0)>; +def : Pat<(v2i64 immAllZerosV), (AVX512_128_SET0)>; +def : Pat<(v4f32 immAllZerosV), (AVX512_128_SET0)>; +def : Pat<(v2f64 immAllZerosV), (AVX512_128_SET0)>; +def : Pat<(v32i8 immAllZerosV), (AVX512_256_SET0)>; +def : Pat<(v16i16 immAllZerosV), (AVX512_256_SET0)>; +def : Pat<(v4i64 immAllZerosV), (AVX512_256_SET0)>; +def : Pat<(v8f32 immAllZerosV), (AVX512_256_SET0)>; +def : Pat<(v4f64 immAllZerosV), (AVX512_256_SET0)>; +} + // Alias instructions that map fld0 to xorps for sse or vxorps for avx. // This is expanded by ExpandPostRAPseudos. let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1, @@ -443,7 +465,9 @@ let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1, def AVX512_FsFLD0SS : I<0, Pseudo, (outs FR32X:$dst), (ins), "", [(set FR32X:$dst, fp32imm0)]>; def AVX512_FsFLD0SD : I<0, Pseudo, (outs FR64X:$dst), (ins), "", - [(set FR64X:$dst, fpimm0)]>; + [(set FR64X:$dst, fp64imm0)]>; + def AVX512_FsFLD0F128 : I<0, Pseudo, (outs VR128X:$dst), (ins), "", + [(set VR128X:$dst, fp128imm0)]>; } //===----------------------------------------------------------------------===// @@ -730,14 +754,14 @@ let isCommutable = 1 in def VINSERTPSZrr : AVX512AIi8<0x21, MRMSrcReg, (outs VR128X:$dst), (ins VR128X:$src1, VR128X:$src2, u8imm:$src3), "vinsertps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", - [(set VR128X:$dst, (X86insertps VR128X:$src1, VR128X:$src2, imm:$src3))]>, + [(set VR128X:$dst, (X86insertps VR128X:$src1, VR128X:$src2, timm:$src3))]>, EVEX_4V, Sched<[SchedWriteFShuffle.XMM]>; def VINSERTPSZrm: AVX512AIi8<0x21, MRMSrcMem, (outs VR128X:$dst), (ins VR128X:$src1, f32mem:$src2, u8imm:$src3), "vinsertps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", [(set VR128X:$dst, (X86insertps VR128X:$src1, (v4f32 (scalar_to_vector (loadf32 addr:$src2))), - imm:$src3))]>, + timm:$src3))]>, EVEX_4V, EVEX_CD8<32, CD8VT1>, Sched<[SchedWriteFShuffle.XMM.Folded, SchedWriteFShuffle.XMM.ReadAfterFold]>; } @@ -1100,75 +1124,104 @@ multiclass avx512_broadcast_rm_split opc, string OpcodeStr, X86VectorVTInfo MaskInfo, X86VectorVTInfo DestInfo, X86VectorVTInfo SrcInfo, - SDPatternOperator UnmaskedOp = X86VBroadcast> { - let ExeDomain = DestInfo.ExeDomain, hasSideEffects = 0 in { - defm r : AVX512_maskable_split, - T8PD, EVEX, Sched<[SchedRR]>; - let mayLoad = 1 in - defm m : AVX512_maskable_split, - T8PD, EVEX, EVEX_CD8, - Sched<[SchedRM]>; - } - - def : Pat<(MaskInfo.VT - (bitconvert - (DestInfo.VT (UnmaskedOp - (SrcInfo.VT (scalar_to_vector - (SrcInfo.ScalarLdFrag addr:$src))))))), - (!cast(Name#MaskInfo.ZSuffix#m) addr:$src)>; - def : Pat<(MaskInfo.VT (vselect MaskInfo.KRCWM:$mask, + bit IsConvertibleToThreeAddress, + SDPatternOperator UnmaskedOp = X86VBroadcast, + SDPatternOperator UnmaskedBcastOp = SrcInfo.BroadcastLdFrag> { + let hasSideEffects = 0 in + def r : AVX512PI, T8PD, EVEX, Sched<[SchedRR]>; + def rkz : AVX512PI(Name#DestInfo.ZSuffix#mk) - MaskInfo.RC:$src0, MaskInfo.KRCWM:$mask, addr:$src)>; - def : Pat<(MaskInfo.VT (vselect MaskInfo.KRCWM:$mask, + (X86VBroadcast (SrcInfo.VT SrcInfo.RC:$src))))), + MaskInfo.ImmAllZerosV))], + DestInfo.ExeDomain>, T8PD, EVEX, EVEX_KZ, Sched<[SchedRR]>; + let Constraints = "$src0 = $dst" in + def rk : AVX512PI, T8PD, EVEX, EVEX_K, Sched<[SchedRR]>; + + 
let hasSideEffects = 0, mayLoad = 1 in + def m : AVX512PI, T8PD, EVEX, + EVEX_CD8, Sched<[SchedRM]>; + + def mkz : AVX512PI(Name#MaskInfo.ZSuffix#mkz) - MaskInfo.KRCWM:$mask, addr:$src)>; + (SrcInfo.BroadcastLdFrag addr:$src)))), + MaskInfo.ImmAllZerosV))], + DestInfo.ExeDomain>, T8PD, EVEX, EVEX_KZ, + EVEX_CD8, Sched<[SchedRM]>; + + let Constraints = "$src0 = $dst", + isConvertibleToThreeAddress = IsConvertibleToThreeAddress in + def mk : AVX512PI, T8PD, EVEX, EVEX_K, + EVEX_CD8, Sched<[SchedRM]>; } // Helper class to force mask and broadcast result to same type. multiclass avx512_broadcast_rm opc, string OpcodeStr, string Name, SchedWrite SchedRR, SchedWrite SchedRM, X86VectorVTInfo DestInfo, - X86VectorVTInfo SrcInfo> : + X86VectorVTInfo SrcInfo, + bit IsConvertibleToThreeAddress> : avx512_broadcast_rm_split; + DestInfo, DestInfo, SrcInfo, + IsConvertibleToThreeAddress>; multiclass avx512_fp_broadcast_sd opc, string OpcodeStr, AVX512VLVectorVTInfo _> { let Predicates = [HasAVX512] in { defm Z : avx512_broadcast_rm, + WriteFShuffle256Ld, _.info512, _.info128, 1>, avx512_broadcast_scalar, EVEX_V512; @@ -1176,7 +1229,7 @@ multiclass avx512_fp_broadcast_sd opc, string OpcodeStr, let Predicates = [HasVLX] in { defm Z256 : avx512_broadcast_rm, + WriteFShuffle256Ld, _.info256, _.info128, 1>, avx512_broadcast_scalar, EVEX_V256; @@ -1187,7 +1240,7 @@ multiclass avx512_fp_broadcast_ss opc, string OpcodeStr, AVX512VLVectorVTInfo _> { let Predicates = [HasAVX512] in { defm Z : avx512_broadcast_rm, + WriteFShuffle256Ld, _.info512, _.info128, 1>, avx512_broadcast_scalar, EVEX_V512; @@ -1195,12 +1248,12 @@ multiclass avx512_fp_broadcast_ss opc, string OpcodeStr, let Predicates = [HasVLX] in { defm Z256 : avx512_broadcast_rm, + WriteFShuffle256Ld, _.info256, _.info128, 1>, avx512_broadcast_scalar, EVEX_V256; defm Z128 : avx512_broadcast_rm, + WriteFShuffle256Ld, _.info128, _.info128, 1>, avx512_broadcast_scalar, EVEX_V128; @@ -1284,46 +1337,35 @@ defm VPBROADCASTDr : avx512_int_broadcast_reg_vl<0x7C, avx512vl_i32_info, defm VPBROADCASTQr : avx512_int_broadcast_reg_vl<0x7C, avx512vl_i64_info, X86VBroadcast, GR64, HasAVX512>, VEX_W; -// Provide aliases for broadcast from the same register class that -// automatically does the extract. -multiclass avx512_int_broadcast_rm_lowering { - def : Pat<(DestInfo.VT (X86VBroadcast (SrcInfo.VT SrcInfo.RC:$src))), - (!cast(Name#DestInfo.ZSuffix#"r") - (ExtInfo.VT (EXTRACT_SUBREG (SrcInfo.VT SrcInfo.RC:$src), sub_xmm)))>; -} - multiclass avx512_int_broadcast_rm_vl opc, string OpcodeStr, - AVX512VLVectorVTInfo _, Predicate prd> { + AVX512VLVectorVTInfo _, Predicate prd, + bit IsConvertibleToThreeAddress> { let Predicates = [prd] in { defm Z : avx512_broadcast_rm, - avx512_int_broadcast_rm_lowering, + WriteShuffle256Ld, _.info512, _.info128, + IsConvertibleToThreeAddress>, EVEX_V512; - // Defined separately to avoid redefinition. 
- defm Z_Alt : avx512_int_broadcast_rm_lowering; } let Predicates = [prd, HasVLX] in { defm Z256 : avx512_broadcast_rm, - avx512_int_broadcast_rm_lowering, + WriteShuffle256Ld, _.info256, _.info128, + IsConvertibleToThreeAddress>, EVEX_V256; defm Z128 : avx512_broadcast_rm, + WriteShuffleXLd, _.info128, _.info128, + IsConvertibleToThreeAddress>, EVEX_V128; } } defm VPBROADCASTB : avx512_int_broadcast_rm_vl<0x78, "vpbroadcastb", - avx512vl_i8_info, HasBWI>; + avx512vl_i8_info, HasBWI, 0>; defm VPBROADCASTW : avx512_int_broadcast_rm_vl<0x79, "vpbroadcastw", - avx512vl_i16_info, HasBWI>; + avx512vl_i16_info, HasBWI, 0>; defm VPBROADCASTD : avx512_int_broadcast_rm_vl<0x58, "vpbroadcastd", - avx512vl_i32_info, HasAVX512>; + avx512vl_i32_info, HasAVX512, 1>; defm VPBROADCASTQ : avx512_int_broadcast_rm_vl<0x59, "vpbroadcastq", - avx512vl_i64_info, HasAVX512>, VEX_W1X; + avx512vl_i64_info, HasAVX512, 1>, VEX_W1X; multiclass avx512_subvec_broadcast_rm opc, string OpcodeStr, X86VectorVTInfo _Dst, X86VectorVTInfo _Src> { @@ -1354,6 +1396,10 @@ let Predicates = [HasAVX512] in { // 32-bit targets will fail to load a i64 directly but can use ZEXT_LOAD. def : Pat<(v8i64 (X86VBroadcast (v2i64 (X86vzload64 addr:$src)))), (VPBROADCASTQZm addr:$src)>; + + // FIXME this is to handle aligned extloads from i8. + def : Pat<(v16i32 (X86VBroadcast (loadi32 addr:$src))), + (VPBROADCASTDZm addr:$src)>; } let Predicates = [HasVLX] in { @@ -1362,6 +1408,12 @@ let Predicates = [HasVLX] in { (VPBROADCASTQZ128m addr:$src)>; def : Pat<(v4i64 (X86VBroadcast (v2i64 (X86vzload64 addr:$src)))), (VPBROADCASTQZ256m addr:$src)>; + + // FIXME this is to handle aligned extloads from i8. + def : Pat<(v4i32 (X86VBroadcast (loadi32 addr:$src))), + (VPBROADCASTDZ128m addr:$src)>; + def : Pat<(v8i32 (X86VBroadcast (loadi32 addr:$src))), + (VPBROADCASTDZ256m addr:$src)>; } let Predicates = [HasVLX, HasBWI] in { // loadi16 is tricky to fold, because !isTypeDesirableForOp, justifiably. @@ -1382,6 +1434,12 @@ let Predicates = [HasVLX, HasBWI] in { def : Pat<(v16i16 (X86VBroadcast (i16 (trunc (i32 (zextloadi16 addr:$src)))))), (VPBROADCASTWZ256m addr:$src)>; + + // FIXME this is to handle aligned extloads from i8. + def : Pat<(v8i16 (X86VBroadcast (loadi16 addr:$src))), + (VPBROADCASTWZ128m addr:$src)>; + def : Pat<(v16i16 (X86VBroadcast (loadi16 addr:$src))), + (VPBROADCASTWZ256m addr:$src)>; } let Predicates = [HasBWI] in { // loadi16 is tricky to fold, because !isTypeDesirableForOp, justifiably. @@ -1394,6 +1452,10 @@ let Predicates = [HasBWI] in { def : Pat<(v32i16 (X86VBroadcast (i16 (trunc (i32 (zextloadi16 addr:$src)))))), (VPBROADCASTWZm addr:$src)>; + + // FIXME this is to handle aligned extloads from i8. 
+ def : Pat<(v32i16 (X86VBroadcast (loadi16 addr:$src))), + (VPBROADCASTWZm addr:$src)>; } //===----------------------------------------------------------------------===// @@ -1629,12 +1691,12 @@ multiclass avx512_common_broadcast_32x2 opc, string OpcodeStr, let Predicates = [HasDQI] in defm Z : avx512_broadcast_rm_split, + _Src.info512, _Src.info128, 0, null_frag, null_frag>, EVEX_V512; let Predicates = [HasDQI, HasVLX] in defm Z256 : avx512_broadcast_rm_split, + _Src.info256, _Src.info128, 0, null_frag, null_frag>, EVEX_V256; } @@ -1645,7 +1707,7 @@ multiclass avx512_common_broadcast_i32x2 opc, string OpcodeStr, let Predicates = [HasDQI, HasVLX] in defm Z128 : avx512_broadcast_rm_split, + _Src.info128, _Src.info128, 0, null_frag, null_frag>, EVEX_V128; } @@ -1654,23 +1716,6 @@ defm VBROADCASTI32X2 : avx512_common_broadcast_i32x2<0x59, "vbroadcasti32x2", defm VBROADCASTF32X2 : avx512_common_broadcast_32x2<0x19, "vbroadcastf32x2", avx512vl_f32_info, avx512vl_f64_info>; -let Predicates = [HasVLX] in { -def : Pat<(v8f32 (X86VBroadcast (v8f32 VR256X:$src))), - (VBROADCASTSSZ256r (v4f32 (EXTRACT_SUBREG (v8f32 VR256X:$src), sub_xmm)))>; -def : Pat<(v4f64 (X86VBroadcast (v4f64 VR256X:$src))), - (VBROADCASTSDZ256r (v2f64 (EXTRACT_SUBREG (v4f64 VR256X:$src), sub_xmm)))>; -} - -def : Pat<(v16f32 (X86VBroadcast (v16f32 VR512:$src))), - (VBROADCASTSSZr (v4f32 (EXTRACT_SUBREG (v16f32 VR512:$src), sub_xmm)))>; -def : Pat<(v16f32 (X86VBroadcast (v8f32 VR256X:$src))), - (VBROADCASTSSZr (v4f32 (EXTRACT_SUBREG (v8f32 VR256X:$src), sub_xmm)))>; - -def : Pat<(v8f64 (X86VBroadcast (v8f64 VR512:$src))), - (VBROADCASTSDZr (v2f64 (EXTRACT_SUBREG (v8f64 VR512:$src), sub_xmm)))>; -def : Pat<(v8f64 (X86VBroadcast (v4f64 VR256X:$src))), - (VBROADCASTSDZr (v2f64 (EXTRACT_SUBREG (v4f64 VR256X:$src), sub_xmm)))>; - //===----------------------------------------------------------------------===// // AVX-512 BROADCAST MASK TO VECTOR REGISTER //--- @@ -1730,7 +1775,7 @@ multiclass avx512_perm_i_mb opc, string OpcodeStr, OpcodeStr, !strconcat("${src3}", _.BroadcastStr,", $src2"), !strconcat("$src2, ${src3}", _.BroadcastStr ), (_.VT (X86VPermt2 _.RC:$src2, - IdxVT.RC:$src1,(_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src3))))), 1>, + IdxVT.RC:$src1,(_.VT (_.BroadcastLdFrag addr:$src3)))), 1>, AVX5128IBase, EVEX_4V, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; } @@ -1807,7 +1852,7 @@ multiclass avx512_perm_i_lowering(InstrStr#"rmbk") _.RC:$src1, _.KRCWM:$mask, _.RC:$src2, addr:$src3)>; @@ -1846,7 +1891,7 @@ multiclass avx512_perm_t_mb opc, string OpcodeStr, OpcodeStr, !strconcat("${src3}", _.BroadcastStr,", $src2"), !strconcat("$src2, ${src3}", _.BroadcastStr ), (_.VT (X86VPermt2 _.RC:$src1, - IdxVT.RC:$src2,(_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src3))))), 1>, + IdxVT.RC:$src2,(_.VT (_.BroadcastLdFrag addr:$src3)))), 1>, AVX5128IBase, EVEX_4V, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; } @@ -1947,7 +1992,7 @@ multiclass WriteFVarBlendask opc, string OpcodeStr, } multiclass WriteFVarBlendask_rmb opc, string OpcodeStr, X86FoldableSchedWrite sched, X86VectorVTInfo _> { - let mayLoad = 1, hasSideEffects = 0 in { + let ExeDomain = _.ExeDomain, mayLoad = 1, hasSideEffects = 0 in { def rmbk : AVX5128I, EVEX_4V, VEX_LIG, Sched<[sched]>; + timm:$cc)>, EVEX_4V, VEX_LIG, Sched<[sched]>; let mayLoad = 1 in defm rm_Int : AVX512_maskable_cmp<0xC2, MRMSrcMem, _, (outs _.KRC:$dst), @@ -2041,9 +2086,9 @@ multiclass avx512_cmp_scalar, EVEX_4V, VEX_LIG, EVEX_CD8<_.EltSize, CD8VT1>, + timm:$cc)>, EVEX_4V, VEX_LIG, 
EVEX_CD8<_.EltSize, CD8VT1>, Sched<[sched.Folded, sched.ReadAfterFold]>; defm rrb_Int : AVX512_maskable_cmp<0xC2, MRMSrcReg, _, @@ -2052,9 +2097,9 @@ multiclass avx512_cmp_scalar, + timm:$cc)>, EVEX_4V, VEX_LIG, EVEX_B, Sched<[sched]>; let isCodeGenOnly = 1 in { @@ -2065,7 +2110,7 @@ multiclass avx512_cmp_scalar, + timm:$cc))]>, EVEX_4V, VEX_LIG, Sched<[sched]>; def rm : AVX512Ii8<0xC2, MRMSrcMem, (outs _.KRC:$dst), @@ -2074,7 +2119,7 @@ multiclass avx512_cmp_scalar, + timm:$cc))]>, EVEX_4V, VEX_LIG, EVEX_CD8<_.EltSize, CD8VT1>, Sched<[sched.Folded, sched.ReadAfterFold]>; } @@ -2100,94 +2145,82 @@ let Predicates = [HasAVX512] in { SchedWriteFCmp.Scl>, AVX512XDIi8Base, VEX_W; } -multiclass avx512_icmp_packed opc, string OpcodeStr, PatFrag OpNode, - PatFrag OpNode_su, X86FoldableSchedWrite sched, +multiclass avx512_icmp_packed opc, string OpcodeStr, + X86FoldableSchedWrite sched, X86VectorVTInfo _, bit IsCommutable> { - let isCommutable = IsCommutable in + let isCommutable = IsCommutable, hasSideEffects = 0 in def rr : AVX512BI, - EVEX_4V, Sched<[sched]>; + []>, EVEX_4V, Sched<[sched]>; + let mayLoad = 1, hasSideEffects = 0 in def rm : AVX512BI, - EVEX_4V, Sched<[sched.Folded, sched.ReadAfterFold]>; - let isCommutable = IsCommutable in + []>, EVEX_4V, Sched<[sched.Folded, sched.ReadAfterFold]>; + let isCommutable = IsCommutable, hasSideEffects = 0 in def rrk : AVX512BI, - EVEX_4V, EVEX_K, Sched<[sched]>; + []>, EVEX_4V, EVEX_K, Sched<[sched]>; + let mayLoad = 1, hasSideEffects = 0 in def rmk : AVX512BI, - EVEX_4V, EVEX_K, Sched<[sched.Folded, sched.ReadAfterFold]>; + []>, EVEX_4V, EVEX_K, Sched<[sched.Folded, sched.ReadAfterFold]>; } -multiclass avx512_icmp_packed_rmb opc, string OpcodeStr, PatFrag OpNode, - PatFrag OpNode_su, +multiclass avx512_icmp_packed_rmb opc, string OpcodeStr, X86FoldableSchedWrite sched, X86VectorVTInfo _, bit IsCommutable> : - avx512_icmp_packed { + avx512_icmp_packed { + let mayLoad = 1, hasSideEffects = 0 in { def rmb : AVX512BI, - EVEX_4V, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; + []>, EVEX_4V, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; def rmbk : AVX512BI, - EVEX_4V, EVEX_K, EVEX_B, + []>, EVEX_4V, EVEX_K, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; + } } -multiclass avx512_icmp_packed_vl opc, string OpcodeStr, PatFrag OpNode, - PatFrag OpNode_su, X86SchedWriteWidths sched, +multiclass avx512_icmp_packed_vl opc, string OpcodeStr, + X86SchedWriteWidths sched, AVX512VLVectorVTInfo VTInfo, Predicate prd, bit IsCommutable = 0> { let Predicates = [prd] in - defm Z : avx512_icmp_packed, EVEX_V512; let Predicates = [prd, HasVLX] in { - defm Z256 : avx512_icmp_packed, EVEX_V256; - defm Z128 : avx512_icmp_packed, EVEX_V128; } } multiclass avx512_icmp_packed_rmb_vl opc, string OpcodeStr, - PatFrag OpNode, PatFrag OpNode_su, X86SchedWriteWidths sched, AVX512VLVectorVTInfo VTInfo, Predicate prd, bit IsCommutable = 0> { let Predicates = [prd] in - defm Z : avx512_icmp_packed_rmb, EVEX_V512; let Predicates = [prd, HasVLX] in { - defm Z256 : avx512_icmp_packed_rmb, EVEX_V256; - defm Z128 : avx512_icmp_packed_rmb, EVEX_V128; } } @@ -2195,53 +2228,42 @@ multiclass avx512_icmp_packed_rmb_vl opc, string OpcodeStr, // This fragment treats X86cmpm as commutable to help match loads in both // operands for PCMPEQ. 
def X86setcc_commute : SDNode<"ISD::SETCC", SDTSetCC, [SDNPCommutative]>; -def X86pcmpeqm_c : PatFrag<(ops node:$src1, node:$src2), - (X86setcc_commute node:$src1, node:$src2, SETEQ)>; def X86pcmpgtm : PatFrag<(ops node:$src1, node:$src2), (setcc node:$src1, node:$src2, SETGT)>; -def X86pcmpeqm_c_su : PatFrag<(ops node:$src1, node:$src2), - (X86pcmpeqm_c node:$src1, node:$src2), [{ - return N->hasOneUse(); -}]>; -def X86pcmpgtm_su : PatFrag<(ops node:$src1, node:$src2), - (X86pcmpgtm node:$src1, node:$src2), [{ - return N->hasOneUse(); -}]>; - // AddedComplexity is needed because the explicit SETEQ/SETGT CondCode doesn't // increase the pattern complexity the way an immediate would. let AddedComplexity = 2 in { // FIXME: Is there a better scheduler class for VPCMP? -defm VPCMPEQB : avx512_icmp_packed_vl<0x74, "vpcmpeqb", X86pcmpeqm_c, X86pcmpeqm_c_su, +defm VPCMPEQB : avx512_icmp_packed_vl<0x74, "vpcmpeqb", SchedWriteVecALU, avx512vl_i8_info, HasBWI, 1>, EVEX_CD8<8, CD8VF>, VEX_WIG; -defm VPCMPEQW : avx512_icmp_packed_vl<0x75, "vpcmpeqw", X86pcmpeqm_c, X86pcmpeqm_c_su, +defm VPCMPEQW : avx512_icmp_packed_vl<0x75, "vpcmpeqw", SchedWriteVecALU, avx512vl_i16_info, HasBWI, 1>, EVEX_CD8<16, CD8VF>, VEX_WIG; -defm VPCMPEQD : avx512_icmp_packed_rmb_vl<0x76, "vpcmpeqd", X86pcmpeqm_c, X86pcmpeqm_c_su, +defm VPCMPEQD : avx512_icmp_packed_rmb_vl<0x76, "vpcmpeqd", SchedWriteVecALU, avx512vl_i32_info, HasAVX512, 1>, EVEX_CD8<32, CD8VF>; -defm VPCMPEQQ : avx512_icmp_packed_rmb_vl<0x29, "vpcmpeqq", X86pcmpeqm_c, X86pcmpeqm_c_su, +defm VPCMPEQQ : avx512_icmp_packed_rmb_vl<0x29, "vpcmpeqq", SchedWriteVecALU, avx512vl_i64_info, HasAVX512, 1>, T8PD, VEX_W, EVEX_CD8<64, CD8VF>; -defm VPCMPGTB : avx512_icmp_packed_vl<0x64, "vpcmpgtb", X86pcmpgtm, X86pcmpgtm_su, +defm VPCMPGTB : avx512_icmp_packed_vl<0x64, "vpcmpgtb", SchedWriteVecALU, avx512vl_i8_info, HasBWI>, EVEX_CD8<8, CD8VF>, VEX_WIG; -defm VPCMPGTW : avx512_icmp_packed_vl<0x65, "vpcmpgtw", X86pcmpgtm, X86pcmpgtm_su, +defm VPCMPGTW : avx512_icmp_packed_vl<0x65, "vpcmpgtw", SchedWriteVecALU, avx512vl_i16_info, HasBWI>, EVEX_CD8<16, CD8VF>, VEX_WIG; -defm VPCMPGTD : avx512_icmp_packed_rmb_vl<0x66, "vpcmpgtd", X86pcmpgtm, X86pcmpgtm_su, +defm VPCMPGTD : avx512_icmp_packed_rmb_vl<0x66, "vpcmpgtd", SchedWriteVecALU, avx512vl_i32_info, HasAVX512>, EVEX_CD8<32, CD8VF>; -defm VPCMPGTQ : avx512_icmp_packed_rmb_vl<0x37, "vpcmpgtq", X86pcmpgtm, X86pcmpgtm_su, +defm VPCMPGTQ : avx512_icmp_packed_rmb_vl<0x37, "vpcmpgtq", SchedWriteVecALU, avx512vl_i64_info, HasAVX512>, T8PD, VEX_W, EVEX_CD8<64, CD8VF>; } @@ -2322,8 +2344,7 @@ multiclass avx512_icmp_cc_rmb opc, string Suffix, PatFrag Frag, "$dst, $src1, ${src2}", _.BroadcastStr, ", $cc}"), [(set _.KRC:$dst, (_.KVT (Frag:$cc (_.VT _.RC:$src1), - (X86VBroadcast - (_.ScalarLdFrag addr:$src2)), + (_.BroadcastLdFrag addr:$src2), cond)))]>, EVEX_4V, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; def rmibk : AVX512AIi8 opc, string Suffix, PatFrag Frag, [(set _.KRC:$dst, (and _.KRCWM:$mask, (_.KVT (Frag_su:$cc (_.VT _.RC:$src1), - (X86VBroadcast - (_.ScalarLdFrag addr:$src2)), + (_.BroadcastLdFrag addr:$src2), cond))))]>, EVEX_4V, EVEX_K, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; - def : Pat<(_.KVT (CommFrag:$cc (X86VBroadcast (_.ScalarLdFrag addr:$src2)), + def : Pat<(_.KVT (CommFrag:$cc (_.BroadcastLdFrag addr:$src2), (_.VT _.RC:$src1), cond)), (!cast(Name#_.ZSuffix#"rmib") _.RC:$src1, addr:$src2, (CommFrag.OperandTransform $cc))>; def : Pat<(and _.KRCWM:$mask, - (_.KVT (CommFrag_su:$cc (X86VBroadcast - 
(_.ScalarLdFrag addr:$src2)), + (_.KVT (CommFrag_su:$cc (_.BroadcastLdFrag addr:$src2), (_.VT _.RC:$src1), cond))), (!cast(Name#_.ZSuffix#"rmibk") _.KRCWM:$mask, _.RC:$src1, addr:$src2, - (CommFrag.OperandTransform $cc))>; + (CommFrag_su.OperandTransform $cc))>; } multiclass avx512_icmp_cc_vl opc, string Suffix, PatFrag Frag, @@ -2496,14 +2515,19 @@ def X86cmpmSAE_su : PatFrag<(ops node:$src1, node:$src2, node:$cc), return N->hasOneUse(); }]>; +def X86cmpm_imm_commute : SDNodeXFormgetZExtValue() & 0x1f); + return getI8Imm(Imm, SDLoc(N)); +}]>; + multiclass avx512_vcmp_common { defm rri : AVX512_maskable_cmp<0xC2, MRMSrcReg, _, (outs _.KRC:$dst), (ins _.RC:$src1, _.RC:$src2,u8imm:$cc), "vcmp"#_.Suffix, "$cc, $src2, $src1", "$src1, $src2, $cc", - (X86cmpm (_.VT _.RC:$src1), (_.VT _.RC:$src2), imm:$cc), - (X86cmpm_su (_.VT _.RC:$src1), (_.VT _.RC:$src2), imm:$cc), + (X86cmpm (_.VT _.RC:$src1), (_.VT _.RC:$src2), timm:$cc), + (X86cmpm_su (_.VT _.RC:$src1), (_.VT _.RC:$src2), timm:$cc), 1>, Sched<[sched]>; defm rmi : AVX512_maskable_cmp<0xC2, MRMSrcMem, _, @@ -2511,9 +2535,9 @@ multiclass avx512_vcmp_common, + timm:$cc)>, Sched<[sched.Folded, sched.ReadAfterFold]>; defm rmbi : AVX512_maskable_cmp<0xC2, MRMSrcMem, _, @@ -2523,38 +2547,37 @@ multiclass avx512_vcmp_common, + (_.VT (_.BroadcastLdFrag addr:$src2)), + timm:$cc)>, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; // Patterns for selecting with loads in other operand. def : Pat<(X86cmpm (_.LdFrag addr:$src2), (_.VT _.RC:$src1), - CommutableCMPCC:$cc), + timm:$cc), (!cast(Name#_.ZSuffix#"rmi") _.RC:$src1, addr:$src2, - imm:$cc)>; + (X86cmpm_imm_commute timm:$cc))>; def : Pat<(and _.KRCWM:$mask, (X86cmpm_su (_.LdFrag addr:$src2), (_.VT _.RC:$src1), - CommutableCMPCC:$cc)), + timm:$cc)), (!cast(Name#_.ZSuffix#"rmik") _.KRCWM:$mask, _.RC:$src1, addr:$src2, - imm:$cc)>; + (X86cmpm_imm_commute timm:$cc))>; - def : Pat<(X86cmpm (X86VBroadcast (_.ScalarLdFrag addr:$src2)), - (_.VT _.RC:$src1), CommutableCMPCC:$cc), + def : Pat<(X86cmpm (_.BroadcastLdFrag addr:$src2), + (_.VT _.RC:$src1), timm:$cc), (!cast(Name#_.ZSuffix#"rmbi") _.RC:$src1, addr:$src2, - imm:$cc)>; + (X86cmpm_imm_commute timm:$cc))>; - def : Pat<(and _.KRCWM:$mask, (X86cmpm_su (X86VBroadcast - (_.ScalarLdFrag addr:$src2)), + def : Pat<(and _.KRCWM:$mask, (X86cmpm_su (_.BroadcastLdFrag addr:$src2), (_.VT _.RC:$src1), - CommutableCMPCC:$cc)), + timm:$cc)), (!cast(Name#_.ZSuffix#"rmbik") _.KRCWM:$mask, _.RC:$src1, addr:$src2, - imm:$cc)>; + (X86cmpm_imm_commute timm:$cc))>; } multiclass avx512_vcmp_sae { @@ -2564,9 +2587,9 @@ multiclass avx512_vcmp_sae { "vcmp"#_.Suffix, "$cc, {sae}, $src2, $src1", "$src1, $src2, {sae}, $cc", - (X86cmpmSAE (_.VT _.RC:$src1), (_.VT _.RC:$src2), imm:$cc), + (X86cmpmSAE (_.VT _.RC:$src1), (_.VT _.RC:$src2), timm:$cc), (X86cmpmSAE_su (_.VT _.RC:$src1), (_.VT _.RC:$src2), - imm:$cc)>, + timm:$cc)>, EVEX_B, Sched<[sched]>; } @@ -2590,12 +2613,12 @@ defm VCMPPS : avx512_vcmp, // Patterns to select fp compares with load as first operand. 
let Predicates = [HasAVX512] in { def : Pat<(v1i1 (X86cmpms (loadf64 addr:$src2), FR64X:$src1, - CommutableCMPCC:$cc)), - (VCMPSDZrm FR64X:$src1, addr:$src2, imm:$cc)>; + timm:$cc)), + (VCMPSDZrm FR64X:$src1, addr:$src2, (X86cmpm_imm_commute timm:$cc))>; def : Pat<(v1i1 (X86cmpms (loadf32 addr:$src2), FR32X:$src1, - CommutableCMPCC:$cc)), - (VCMPSSZrm FR32X:$src1, addr:$src2, imm:$cc)>; + timm:$cc)), + (VCMPSSZrm FR32X:$src1, addr:$src2, (X86cmpm_imm_commute timm:$cc))>; } // ---------------------------------------------------------------- @@ -2621,7 +2644,7 @@ multiclass avx512_scalar_fpclass opc, string OpcodeStr, (ins _.RC:$src1, i32u8imm:$src2), OpcodeStr##_.Suffix#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set _.KRC:$dst,(X86Vfpclasss (_.VT _.RC:$src1), - (i32 imm:$src2)))]>, + (i32 timm:$src2)))]>, Sched<[sched]>; def rrk : AVX512 opc, string OpcodeStr, "\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}", [(set _.KRC:$dst,(and _.KRCWM:$mask, (X86Vfpclasss_su (_.VT _.RC:$src1), - (i32 imm:$src2))))]>, + (i32 timm:$src2))))]>, EVEX_K, Sched<[sched]>; def rm : AVX512 opc, string OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set _.KRC:$dst, (X86Vfpclasss _.ScalarIntMemCPat:$src1, - (i32 imm:$src2)))]>, + (i32 timm:$src2)))]>, Sched<[sched.Folded, sched.ReadAfterFold]>; def rmk : AVX512 opc, string OpcodeStr, "\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}", [(set _.KRC:$dst,(and _.KRCWM:$mask, (X86Vfpclasss_su _.ScalarIntMemCPat:$src1, - (i32 imm:$src2))))]>, + (i32 timm:$src2))))]>, EVEX_K, Sched<[sched.Folded, sched.ReadAfterFold]>; } } @@ -2661,7 +2684,7 @@ multiclass avx512_vector_fpclass opc, string OpcodeStr, (ins _.RC:$src1, i32u8imm:$src2), OpcodeStr##_.Suffix#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set _.KRC:$dst,(X86Vfpclass (_.VT _.RC:$src1), - (i32 imm:$src2)))]>, + (i32 timm:$src2)))]>, Sched<[sched]>; def rrk : AVX512 opc, string OpcodeStr, "\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}", [(set _.KRC:$dst,(and _.KRCWM:$mask, (X86Vfpclass_su (_.VT _.RC:$src1), - (i32 imm:$src2))))]>, + (i32 timm:$src2))))]>, EVEX_K, Sched<[sched]>; def rm : AVX512 opc, string OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set _.KRC:$dst,(X86Vfpclass (_.VT (_.LdFrag addr:$src1)), - (i32 imm:$src2)))]>, + (i32 timm:$src2)))]>, Sched<[sched.Folded, sched.ReadAfterFold]>; def rmk : AVX512 opc, string OpcodeStr, "\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}", [(set _.KRC:$dst, (and _.KRCWM:$mask, (X86Vfpclass_su (_.VT (_.LdFrag addr:$src1)), - (i32 imm:$src2))))]>, + (i32 timm:$src2))))]>, EVEX_K, Sched<[sched.Folded, sched.ReadAfterFold]>; def rmb : AVX512 opc, string OpcodeStr, _.BroadcastStr##", $dst|$dst, ${src1}" ##_.BroadcastStr##", $src2}", [(set _.KRC:$dst,(X86Vfpclass - (_.VT (X86VBroadcast - (_.ScalarLdFrag addr:$src1))), - (i32 imm:$src2)))]>, + (_.VT (_.BroadcastLdFrag addr:$src1)), + (i32 timm:$src2)))]>, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; def rmbk : AVX512 opc, string OpcodeStr, _.BroadcastStr##", $dst {${mask}}|$dst {${mask}}, ${src1}"## _.BroadcastStr##", $src2}", [(set _.KRC:$dst,(and _.KRCWM:$mask, (X86Vfpclass_su - (_.VT (X86VBroadcast - (_.ScalarLdFrag addr:$src1))), - (i32 imm:$src2))))]>, + (_.VT (_.BroadcastLdFrag addr:$src1)), + (i32 timm:$src2))))]>, EVEX_B, EVEX_K, Sched<[sched.Folded, sched.ReadAfterFold]>; } @@ -2836,13 +2857,21 @@ def : Pat<(i8 (bitconvert (v8i1 VK8:$src))), def : Pat<(i32 (zext (i16 (bitconvert (v16i1 VK16:$src))))), (KMOVWrk VK16:$src)>; 
+def : Pat<(i64 (zext (i16 (bitconvert (v16i1 VK16:$src))))), + (SUBREG_TO_REG (i64 0), (KMOVWrk VK16:$src), sub_32bit)>; def : Pat<(i32 (anyext (i16 (bitconvert (v16i1 VK16:$src))))), (COPY_TO_REGCLASS VK16:$src, GR32)>; +def : Pat<(i64 (anyext (i16 (bitconvert (v16i1 VK16:$src))))), + (INSERT_SUBREG (IMPLICIT_DEF), (COPY_TO_REGCLASS VK16:$src, GR32), sub_32bit)>; def : Pat<(i32 (zext (i8 (bitconvert (v8i1 VK8:$src))))), (KMOVBrk VK8:$src)>, Requires<[HasDQI]>; +def : Pat<(i64 (zext (i8 (bitconvert (v8i1 VK8:$src))))), + (SUBREG_TO_REG (i64 0), (KMOVBrk VK8:$src), sub_32bit)>, Requires<[HasDQI]>; def : Pat<(i32 (anyext (i8 (bitconvert (v8i1 VK8:$src))))), (COPY_TO_REGCLASS VK8:$src, GR32)>; +def : Pat<(i64 (anyext (i8 (bitconvert (v8i1 VK8:$src))))), + (INSERT_SUBREG (IMPLICIT_DEF), (COPY_TO_REGCLASS VK8:$src, GR32), sub_32bit)>; def : Pat<(v32i1 (bitconvert (i32 GR32:$src))), (COPY_TO_REGCLASS GR32:$src, VK32)>; @@ -3075,7 +3104,7 @@ multiclass avx512_mask_shiftop opc, string OpcodeStr, RegisterClass KRC, def ri : Ii8, + [(set KRC:$dst, (OpNode KRC:$src, (i8 timm:$imm)))]>, Sched<[sched]>; } @@ -3097,30 +3126,6 @@ multiclass avx512_mask_shiftop_w opc1, bits<8> opc2, string OpcodeStr, defm KSHIFTL : avx512_mask_shiftop_w<0x32, 0x33, "kshiftl", X86kshiftl, WriteShuffle>; defm KSHIFTR : avx512_mask_shiftop_w<0x30, 0x31, "kshiftr", X86kshiftr, WriteShuffle>; -// Patterns for comparing 128/256-bit integer vectors using 512-bit instruction. -multiclass axv512_icmp_packed_no_vlx_lowering { - def : Pat<(Narrow.KVT (Frag (Narrow.VT Narrow.RC:$src1), - (Narrow.VT Narrow.RC:$src2))), - (COPY_TO_REGCLASS - (!cast(InstStr#"Zrr") - (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src1, Narrow.SubRegIdx)), - (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src2, Narrow.SubRegIdx))), - Narrow.KRC)>; - - def : Pat<(Narrow.KVT (and Narrow.KRC:$mask, - (Frag_su (Narrow.VT Narrow.RC:$src1), - (Narrow.VT Narrow.RC:$src2)))), - (COPY_TO_REGCLASS - (!cast(InstStr#"Zrrk") - (COPY_TO_REGCLASS Narrow.KRC:$mask, Wide.KRC), - (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src1, Narrow.SubRegIdx)), - (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src2, Narrow.SubRegIdx))), - Narrow.KRC)>; -} - // Patterns for comparing 128/256-bit integer vectors using 512-bit instruction. multiclass axv512_icmp_packed_cc_no_vlx_lowering(InstStr##Zrri) + (!cast(InstStr#"Zrri") (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src1, Narrow.SubRegIdx)), (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src2, Narrow.SubRegIdx)), (Frag.OperandTransform $cc)), Narrow.KRC)>; @@ -3138,53 +3143,111 @@ def : Pat<(Narrow.KVT (and Narrow.KRC:$mask, (Narrow.KVT (Frag_su:$cc (Narrow.VT Narrow.RC:$src1), (Narrow.VT Narrow.RC:$src2), cond)))), - (COPY_TO_REGCLASS (!cast(InstStr##Zrrik) + (COPY_TO_REGCLASS (!cast(InstStr#"Zrrik") (COPY_TO_REGCLASS Narrow.KRC:$mask, Wide.KRC), (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src1, Narrow.SubRegIdx)), (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src2, Narrow.SubRegIdx)), - (Frag.OperandTransform $cc)), Narrow.KRC)>; + (Frag_su.OperandTransform $cc)), Narrow.KRC)>; +} + +multiclass axv512_icmp_packed_cc_rmb_no_vlx_lowering { +// Broadcast load. 
+def : Pat<(Narrow.KVT (Frag:$cc (Narrow.VT Narrow.RC:$src1), + (Narrow.BroadcastLdFrag addr:$src2), cond)), + (COPY_TO_REGCLASS + (!cast(InstStr#"Zrmib") + (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src1, Narrow.SubRegIdx)), + addr:$src2, (Frag.OperandTransform $cc)), Narrow.KRC)>; + +def : Pat<(Narrow.KVT (and Narrow.KRC:$mask, + (Narrow.KVT + (Frag_su:$cc (Narrow.VT Narrow.RC:$src1), + (Narrow.BroadcastLdFrag addr:$src2), + cond)))), + (COPY_TO_REGCLASS (!cast(InstStr#"Zrmibk") + (COPY_TO_REGCLASS Narrow.KRC:$mask, Wide.KRC), + (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src1, Narrow.SubRegIdx)), + addr:$src2, (Frag_su.OperandTransform $cc)), Narrow.KRC)>; + +// Commuted with broadcast load. +def : Pat<(Narrow.KVT (CommFrag:$cc (Narrow.BroadcastLdFrag addr:$src2), + (Narrow.VT Narrow.RC:$src1), + cond)), + (COPY_TO_REGCLASS + (!cast(InstStr#"Zrmib") + (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src1, Narrow.SubRegIdx)), + addr:$src2, (CommFrag.OperandTransform $cc)), Narrow.KRC)>; + +def : Pat<(Narrow.KVT (and Narrow.KRC:$mask, + (Narrow.KVT + (CommFrag_su:$cc (Narrow.BroadcastLdFrag addr:$src2), + (Narrow.VT Narrow.RC:$src1), + cond)))), + (COPY_TO_REGCLASS (!cast(InstStr#"Zrmibk") + (COPY_TO_REGCLASS Narrow.KRC:$mask, Wide.KRC), + (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src1, Narrow.SubRegIdx)), + addr:$src2, (CommFrag_su.OperandTransform $cc)), Narrow.KRC)>; } // Same as above, but for fp types which don't use PatFrags. -multiclass axv512_cmp_packed_cc_no_vlx_lowering { -def : Pat<(Narrow.KVT (OpNode (Narrow.VT Narrow.RC:$src1), - (Narrow.VT Narrow.RC:$src2), imm:$cc)), +def : Pat<(Narrow.KVT (X86cmpm (Narrow.VT Narrow.RC:$src1), + (Narrow.VT Narrow.RC:$src2), timm:$cc)), (COPY_TO_REGCLASS - (!cast(InstStr##Zrri) + (!cast(InstStr#"Zrri") (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src1, Narrow.SubRegIdx)), (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src2, Narrow.SubRegIdx)), - imm:$cc), Narrow.KRC)>; + timm:$cc), Narrow.KRC)>; def : Pat<(Narrow.KVT (and Narrow.KRC:$mask, - (OpNode_su (Narrow.VT Narrow.RC:$src1), - (Narrow.VT Narrow.RC:$src2), imm:$cc))), - (COPY_TO_REGCLASS (!cast(InstStr##Zrrik) + (X86cmpm_su (Narrow.VT Narrow.RC:$src1), + (Narrow.VT Narrow.RC:$src2), timm:$cc))), + (COPY_TO_REGCLASS (!cast(InstStr#"Zrrik") (COPY_TO_REGCLASS Narrow.KRC:$mask, Wide.KRC), (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src1, Narrow.SubRegIdx)), (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src2, Narrow.SubRegIdx)), - imm:$cc), Narrow.KRC)>; -} + timm:$cc), Narrow.KRC)>; -let Predicates = [HasAVX512, NoVLX] in { - // AddedComplexity is needed because the explicit SETEQ/SETGT CondCode doesn't - // increase the pattern complexity the way an immediate would. - let AddedComplexity = 2 in { - defm : axv512_icmp_packed_no_vlx_lowering; - defm : axv512_icmp_packed_no_vlx_lowering; +// Broadcast load. 
+def : Pat<(Narrow.KVT (X86cmpm (Narrow.VT Narrow.RC:$src1), + (Narrow.VT (Narrow.BroadcastLdFrag addr:$src2)), timm:$cc)), + (COPY_TO_REGCLASS + (!cast(InstStr#"Zrmbi") + (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src1, Narrow.SubRegIdx)), + addr:$src2, timm:$cc), Narrow.KRC)>; - defm : axv512_icmp_packed_no_vlx_lowering; - defm : axv512_icmp_packed_no_vlx_lowering; +def : Pat<(Narrow.KVT (and Narrow.KRC:$mask, + (X86cmpm_su (Narrow.VT Narrow.RC:$src1), + (Narrow.VT (Narrow.BroadcastLdFrag addr:$src2)), timm:$cc))), + (COPY_TO_REGCLASS (!cast(InstStr#"Zrmbik") + (COPY_TO_REGCLASS Narrow.KRC:$mask, Wide.KRC), + (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src1, Narrow.SubRegIdx)), + addr:$src2, timm:$cc), Narrow.KRC)>; - defm : axv512_icmp_packed_no_vlx_lowering; - defm : axv512_icmp_packed_no_vlx_lowering; +// Commuted with broadcast load. +def : Pat<(Narrow.KVT (X86cmpm (Narrow.VT (Narrow.BroadcastLdFrag addr:$src2)), + (Narrow.VT Narrow.RC:$src1), timm:$cc)), + (COPY_TO_REGCLASS + (!cast(InstStr#"Zrmbi") + (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src1, Narrow.SubRegIdx)), + addr:$src2, (X86cmpm_imm_commute timm:$cc)), Narrow.KRC)>; - defm : axv512_icmp_packed_no_vlx_lowering; - defm : axv512_icmp_packed_no_vlx_lowering; - } +def : Pat<(Narrow.KVT (and Narrow.KRC:$mask, + (X86cmpm_su (Narrow.VT (Narrow.BroadcastLdFrag addr:$src2)), + (Narrow.VT Narrow.RC:$src1), timm:$cc))), + (COPY_TO_REGCLASS (!cast(InstStr#"Zrmbik") + (COPY_TO_REGCLASS Narrow.KRC:$mask, Wide.KRC), + (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src1, Narrow.SubRegIdx)), + addr:$src2, (X86cmpm_imm_commute timm:$cc)), Narrow.KRC)>; +} +let Predicates = [HasAVX512, NoVLX] in { defm : axv512_icmp_packed_cc_no_vlx_lowering; defm : axv512_icmp_packed_cc_no_vlx_lowering; @@ -3197,29 +3260,25 @@ let Predicates = [HasAVX512, NoVLX] in { defm : axv512_icmp_packed_cc_no_vlx_lowering; defm : axv512_icmp_packed_cc_no_vlx_lowering; - defm : axv512_cmp_packed_cc_no_vlx_lowering; - defm : axv512_cmp_packed_cc_no_vlx_lowering; - defm : axv512_cmp_packed_cc_no_vlx_lowering; - defm : axv512_cmp_packed_cc_no_vlx_lowering; -} + defm : axv512_icmp_packed_cc_rmb_no_vlx_lowering; + defm : axv512_icmp_packed_cc_rmb_no_vlx_lowering; -let Predicates = [HasBWI, NoVLX] in { - // AddedComplexity is needed because the explicit SETEQ/SETGT CondCode doesn't - // increase the pattern complexity the way an immediate would. 
- let AddedComplexity = 2 in { - defm : axv512_icmp_packed_no_vlx_lowering; - defm : axv512_icmp_packed_no_vlx_lowering; + defm : axv512_icmp_packed_cc_rmb_no_vlx_lowering; + defm : axv512_icmp_packed_cc_rmb_no_vlx_lowering; - defm : axv512_icmp_packed_no_vlx_lowering; - defm : axv512_icmp_packed_no_vlx_lowering; + defm : axv512_icmp_packed_cc_rmb_no_vlx_lowering; + defm : axv512_icmp_packed_cc_rmb_no_vlx_lowering; - defm : axv512_icmp_packed_no_vlx_lowering; - defm : axv512_icmp_packed_no_vlx_lowering; + defm : axv512_icmp_packed_cc_rmb_no_vlx_lowering; + defm : axv512_icmp_packed_cc_rmb_no_vlx_lowering; - defm : axv512_icmp_packed_no_vlx_lowering; - defm : axv512_icmp_packed_no_vlx_lowering; - } + defm : axv512_cmp_packed_cc_no_vlx_lowering<"VCMPPS", v8f32x_info, v16f32_info>; + defm : axv512_cmp_packed_cc_no_vlx_lowering<"VCMPPS", v4f32x_info, v16f32_info>; + defm : axv512_cmp_packed_cc_no_vlx_lowering<"VCMPPD", v4f64x_info, v8f64_info>; + defm : axv512_cmp_packed_cc_no_vlx_lowering<"VCMPPD", v2f64x_info, v8f64_info>; +} +let Predicates = [HasBWI, NoVLX] in { defm : axv512_icmp_packed_cc_no_vlx_lowering; defm : axv512_icmp_packed_cc_no_vlx_lowering; @@ -4186,16 +4245,32 @@ def : Pat<(f32 (X86selects VK1WM:$mask, (f32 FR32X:$src1), fp32imm0)), (COPY_TO_REGCLASS (v4f32 (VMOVSSZrrkz VK1WM:$mask, (v4f32 (IMPLICIT_DEF)), (v4f32 (COPY_TO_REGCLASS FR32X:$src1, VR128X)))), FR32X)>; +def : Pat<(f32 (X86selects VK1WM:$mask, (loadf32 addr:$src), (f32 FR32X:$src0))), + (COPY_TO_REGCLASS + (v4f32 (VMOVSSZrmk (v4f32 (COPY_TO_REGCLASS FR32X:$src0, VR128X)), + VK1WM:$mask, addr:$src)), + FR32X)>; +def : Pat<(f32 (X86selects VK1WM:$mask, (loadf32 addr:$src), fp32imm0)), + (COPY_TO_REGCLASS (v4f32 (VMOVSSZrmkz VK1WM:$mask, addr:$src)), FR32X)>; + def : Pat<(f64 (X86selects VK1WM:$mask, (f64 FR64X:$src1), (f64 FR64X:$src2))), (COPY_TO_REGCLASS (v2f64 (VMOVSDZrrk (v2f64 (COPY_TO_REGCLASS FR64X:$src2, VR128X)), VK1WM:$mask, (v2f64 (IMPLICIT_DEF)), (v2f64 (COPY_TO_REGCLASS FR64X:$src1, VR128X)))), FR64X)>; -def : Pat<(f64 (X86selects VK1WM:$mask, (f64 FR64X:$src1), fpimm0)), +def : Pat<(f64 (X86selects VK1WM:$mask, (f64 FR64X:$src1), fp64imm0)), (COPY_TO_REGCLASS (v2f64 (VMOVSDZrrkz VK1WM:$mask, (v2f64 (IMPLICIT_DEF)), (v2f64 (COPY_TO_REGCLASS FR64X:$src1, VR128X)))), FR64X)>; +def : Pat<(f64 (X86selects VK1WM:$mask, (loadf64 addr:$src), (f64 FR64X:$src0))), + (COPY_TO_REGCLASS + (v2f64 (VMOVSDZrmk (v2f64 (COPY_TO_REGCLASS FR64X:$src0, VR128X)), + VK1WM:$mask, addr:$src)), + FR64X)>; +def : Pat<(f64 (X86selects VK1WM:$mask, (loadf64 addr:$src), fp64imm0)), + (COPY_TO_REGCLASS (v2f64 (VMOVSDZrmkz VK1WM:$mask, addr:$src)), FR64X)>; + let hasSideEffects = 0, isCodeGenOnly = 1, ForceDisassemble = 1 in { def VMOVSSZrr_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst), (ins VR128X:$src1, VR128X:$src2), @@ -4537,8 +4612,7 @@ multiclass avx512_binop_rmb opc, string OpcodeStr, SDNode OpNode, "${src2}"##_.BroadcastStr##", $src1", "$src1, ${src2}"##_.BroadcastStr, (_.VT (OpNode _.RC:$src1, - (X86VBroadcast - (_.ScalarLdFrag addr:$src2))))>, + (_.BroadcastLdFrag addr:$src2)))>, AVX512BIBase, EVEX_4V, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; } @@ -4664,8 +4738,7 @@ multiclass avx512_binop_rm2 opc, string OpcodeStr, "${src2}"##_Brdct.BroadcastStr##", $src1", "$src1, ${src2}"##_Brdct.BroadcastStr, (_Dst.VT (OpNode (_Src.VT _Src.RC:$src1), (bitconvert - (_Brdct.VT (X86VBroadcast - (_Brdct.ScalarLdFrag addr:$src2))))))>, + (_Brdct.VT (_Brdct.BroadcastLdFrag addr:$src2)))))>, AVX512BIBase, EVEX_4V, EVEX_B, 
Sched<[sched.Folded, sched.ReadAfterFold]>; } @@ -4737,8 +4810,7 @@ multiclass avx512_packs_rmb opc, string OpcodeStr, SDNode OpNode, "${src2}"##_Src.BroadcastStr##", $src1", "$src1, ${src2}"##_Src.BroadcastStr, (_Dst.VT (OpNode (_Src.VT _Src.RC:$src1), (bitconvert - (_Src.VT (X86VBroadcast - (_Src.ScalarLdFrag addr:$src2))))))>, + (_Src.VT (_Src.BroadcastLdFrag addr:$src2)))))>, EVEX_4V, EVEX_B, EVEX_CD8<_Src.EltSize, CD8VF>, Sched<[sched.Folded, sched.ReadAfterFold]>; } @@ -4874,22 +4946,11 @@ let Predicates = [HasDQI, NoVLX] in { (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src1, sub_ymm), (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src2, sub_ymm)), sub_ymm)>; - - def : Pat<(v2i64 (mul (v2i64 VR128X:$src1), (v2i64 VR128X:$src2))), - (EXTRACT_SUBREG - (VPMULLQZrr - (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src1, sub_xmm), - (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src2, sub_xmm)), - sub_xmm)>; -} - -// PMULLQ: Use 512bit version to implement 128/256 bit in case NoVLX. -let Predicates = [HasDQI, NoVLX] in { - def : Pat<(v4i64 (mul (v4i64 VR256X:$src1), (v4i64 VR256X:$src2))), + def : Pat<(v4i64 (mul (v4i64 VR256X:$src1), (v4i64 (X86VBroadcastld64 addr:$src2)))), (EXTRACT_SUBREG - (VPMULLQZrr + (VPMULLQZrmb (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src1, sub_ymm), - (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src2, sub_ymm)), + addr:$src2), sub_ymm)>; def : Pat<(v2i64 (mul (v2i64 VR128X:$src1), (v2i64 VR128X:$src2))), @@ -4898,29 +4959,47 @@ let Predicates = [HasDQI, NoVLX] in { (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src1, sub_xmm), (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src2, sub_xmm)), sub_xmm)>; + def : Pat<(v2i64 (mul (v2i64 VR128X:$src1), (v2i64 (X86VBroadcastld64 addr:$src2)))), + (EXTRACT_SUBREG + (VPMULLQZrmb + (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src1, sub_xmm), + addr:$src2), + sub_xmm)>; } -multiclass avx512_min_max_lowering { +multiclass avx512_min_max_lowering { def : Pat<(v4i64 (OpNode VR256X:$src1, VR256X:$src2)), (EXTRACT_SUBREG - (Instr + (!cast(Instr#"rr") (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src1, sub_ymm), (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src2, sub_ymm)), sub_ymm)>; + def : Pat<(v4i64 (OpNode (v4i64 VR256X:$src1), (v4i64 (X86VBroadcastld64 addr:$src2)))), + (EXTRACT_SUBREG + (!cast(Instr#"rmb") + (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src1, sub_ymm), + addr:$src2), + sub_ymm)>; def : Pat<(v2i64 (OpNode VR128X:$src1, VR128X:$src2)), (EXTRACT_SUBREG - (Instr + (!cast(Instr#"rr") (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src1, sub_xmm), (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src2, sub_xmm)), sub_xmm)>; + def : Pat<(v2i64 (OpNode (v2i64 VR128X:$src1), (v2i64 (X86VBroadcastld64 addr:$src2)))), + (EXTRACT_SUBREG + (!cast(Instr#"rmb") + (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src1, sub_xmm), + addr:$src2), + sub_xmm)>; } let Predicates = [HasAVX512, NoVLX] in { - defm : avx512_min_max_lowering; - defm : avx512_min_max_lowering; - defm : avx512_min_max_lowering; - defm : avx512_min_max_lowering; + defm : avx512_min_max_lowering<"VPMAXUQZ", umax>; + defm : avx512_min_max_lowering<"VPMINUQZ", umin>; + defm : avx512_min_max_lowering<"VPMAXSQZ", smax>; + defm : avx512_min_max_lowering<"VPMINSQZ", smin>; } //===----------------------------------------------------------------------===// @@ -4977,32 +5056,6 @@ let Predicates = [HasVLX] in { def : Pat<(X86andnp VR128X:$src1, (loadv8i16 addr:$src2)), (VPANDNQZ128rm VR128X:$src1, addr:$src2)>; - def : Pat<(and VR128X:$src1, - (bc_v4i32 
(v4f32 (X86VBroadcast (loadf32 addr:$src2))))), - (VPANDDZ128rmb VR128X:$src1, addr:$src2)>; - def : Pat<(or VR128X:$src1, - (bc_v4i32 (v4f32 (X86VBroadcast (loadf32 addr:$src2))))), - (VPORDZ128rmb VR128X:$src1, addr:$src2)>; - def : Pat<(xor VR128X:$src1, - (bc_v4i32 (v4f32 (X86VBroadcast (loadf32 addr:$src2))))), - (VPXORDZ128rmb VR128X:$src1, addr:$src2)>; - def : Pat<(X86andnp VR128X:$src1, - (bc_v4i32 (v4f32 (X86VBroadcast (loadf32 addr:$src2))))), - (VPANDNDZ128rmb VR128X:$src1, addr:$src2)>; - - def : Pat<(and VR128X:$src1, - (bc_v2i64 (v2f64 (X86VBroadcast (loadf64 addr:$src2))))), - (VPANDQZ128rmb VR128X:$src1, addr:$src2)>; - def : Pat<(or VR128X:$src1, - (bc_v2i64 (v2f64 (X86VBroadcast (loadf64 addr:$src2))))), - (VPORQZ128rmb VR128X:$src1, addr:$src2)>; - def : Pat<(xor VR128X:$src1, - (bc_v2i64 (v2f64 (X86VBroadcast (loadf64 addr:$src2))))), - (VPXORQZ128rmb VR128X:$src1, addr:$src2)>; - def : Pat<(X86andnp VR128X:$src1, - (bc_v2i64 (v2f64 (X86VBroadcast (loadf64 addr:$src2))))), - (VPANDNQZ128rmb VR128X:$src1, addr:$src2)>; - def : Pat<(v32i8 (and VR256X:$src1, VR256X:$src2)), (VPANDQZ256rr VR256X:$src1, VR256X:$src2)>; def : Pat<(v16i16 (and VR256X:$src1, VR256X:$src2)), @@ -5042,32 +5095,6 @@ let Predicates = [HasVLX] in { (VPANDNQZ256rm VR256X:$src1, addr:$src2)>; def : Pat<(X86andnp VR256X:$src1, (loadv16i16 addr:$src2)), (VPANDNQZ256rm VR256X:$src1, addr:$src2)>; - - def : Pat<(and VR256X:$src1, - (bc_v8i32 (v8f32 (X86VBroadcast (loadf32 addr:$src2))))), - (VPANDDZ256rmb VR256X:$src1, addr:$src2)>; - def : Pat<(or VR256X:$src1, - (bc_v8i32 (v8f32 (X86VBroadcast (loadf32 addr:$src2))))), - (VPORDZ256rmb VR256X:$src1, addr:$src2)>; - def : Pat<(xor VR256X:$src1, - (bc_v8i32 (v8f32 (X86VBroadcast (loadf32 addr:$src2))))), - (VPXORDZ256rmb VR256X:$src1, addr:$src2)>; - def : Pat<(X86andnp VR256X:$src1, - (bc_v8i32 (v8f32 (X86VBroadcast (loadf32 addr:$src2))))), - (VPANDNDZ256rmb VR256X:$src1, addr:$src2)>; - - def : Pat<(and VR256X:$src1, - (bc_v4i64 (v4f64 (X86VBroadcast (loadf64 addr:$src2))))), - (VPANDQZ256rmb VR256X:$src1, addr:$src2)>; - def : Pat<(or VR256X:$src1, - (bc_v4i64 (v4f64 (X86VBroadcast (loadf64 addr:$src2))))), - (VPORQZ256rmb VR256X:$src1, addr:$src2)>; - def : Pat<(xor VR256X:$src1, - (bc_v4i64 (v4f64 (X86VBroadcast (loadf64 addr:$src2))))), - (VPXORQZ256rmb VR256X:$src1, addr:$src2)>; - def : Pat<(X86andnp VR256X:$src1, - (bc_v4i64 (v4f64 (X86VBroadcast (loadf64 addr:$src2))))), - (VPANDNQZ256rmb VR256X:$src1, addr:$src2)>; } let Predicates = [HasAVX512] in { @@ -5110,32 +5137,6 @@ let Predicates = [HasAVX512] in { (VPANDNQZrm VR512:$src1, addr:$src2)>; def : Pat<(X86andnp VR512:$src1, (loadv32i16 addr:$src2)), (VPANDNQZrm VR512:$src1, addr:$src2)>; - - def : Pat<(and VR512:$src1, - (bc_v16i32 (v16f32 (X86VBroadcast (loadf32 addr:$src2))))), - (VPANDDZrmb VR512:$src1, addr:$src2)>; - def : Pat<(or VR512:$src1, - (bc_v16i32 (v16f32 (X86VBroadcast (loadf32 addr:$src2))))), - (VPORDZrmb VR512:$src1, addr:$src2)>; - def : Pat<(xor VR512:$src1, - (bc_v16i32 (v16f32 (X86VBroadcast (loadf32 addr:$src2))))), - (VPXORDZrmb VR512:$src1, addr:$src2)>; - def : Pat<(X86andnp VR512:$src1, - (bc_v16i32 (v16f32 (X86VBroadcast (loadf32 addr:$src2))))), - (VPANDNDZrmb VR512:$src1, addr:$src2)>; - - def : Pat<(and VR512:$src1, - (bc_v8i64 (v8f64 (X86VBroadcast (loadf64 addr:$src2))))), - (VPANDQZrmb VR512:$src1, addr:$src2)>; - def : Pat<(or VR512:$src1, - (bc_v8i64 (v8f64 (X86VBroadcast (loadf64 addr:$src2))))), - (VPORQZrmb VR512:$src1, addr:$src2)>; - def : Pat<(xor 
VR512:$src1, - (bc_v8i64 (v8f64 (X86VBroadcast (loadf64 addr:$src2))))), - (VPXORQZrmb VR512:$src1, addr:$src2)>; - def : Pat<(X86andnp VR512:$src1, - (bc_v8i64 (v8f64 (X86VBroadcast (loadf64 addr:$src2))))), - (VPANDNQZrmb VR512:$src1, addr:$src2)>; } // Patterns to catch vselect with different type than logic op. @@ -5174,25 +5175,17 @@ multiclass avx512_logical_lowering_bcast { // Register-broadcast logical operations. - def : Pat<(IntInfo.VT (OpNode _.RC:$src1, - (bitconvert (_.VT (X86VBroadcast - (_.ScalarLdFrag addr:$src2)))))), - (!cast(InstrStr#rmb) _.RC:$src1, addr:$src2)>; def : Pat<(_.VT (vselect _.KRCWM:$mask, (bitconvert (IntInfo.VT (OpNode _.RC:$src1, - (bitconvert (_.VT - (X86VBroadcast - (_.ScalarLdFrag addr:$src2))))))), + (IntInfo.VT (IntInfo.BroadcastLdFrag addr:$src2))))), _.RC:$src0)), (!cast(InstrStr#rmbk) _.RC:$src0, _.KRCWM:$mask, _.RC:$src1, addr:$src2)>; def : Pat<(_.VT (vselect _.KRCWM:$mask, (bitconvert (IntInfo.VT (OpNode _.RC:$src1, - (bitconvert (_.VT - (X86VBroadcast - (_.ScalarLdFrag addr:$src2))))))), + (IntInfo.VT (IntInfo.BroadcastLdFrag addr:$src2))))), _.ImmAllZerosV)), (!cast(InstrStr#rmbkz) _.KRCWM:$mask, _.RC:$src1, addr:$src2)>; @@ -5329,7 +5322,8 @@ multiclass avx512_fp_scalar_round opc, string OpcodeStr,X86VectorVTInfo } multiclass avx512_fp_scalar_sae opc, string OpcodeStr,X86VectorVTInfo _, SDNode OpNode, SDNode VecNode, SDNode SaeNode, - X86FoldableSchedWrite sched, bit IsCommutable> { + X86FoldableSchedWrite sched, bit IsCommutable, + string EVEX2VexOvrd> { let ExeDomain = _.ExeDomain in { defm rr_Int : AVX512_maskable_scalar opc, string OpcodeStr,X86VectorVTInfo _, (ins _.FRC:$src1, _.FRC:$src2), OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set _.FRC:$dst, (OpNode _.FRC:$src1, _.FRC:$src2))]>, - Sched<[sched]> { + Sched<[sched]>, + EVEX2VEXOverride { let isCommutable = IsCommutable; } def rm : I< opc, MRMSrcMem, (outs _.FRC:$dst), @@ -5357,7 +5352,8 @@ multiclass avx512_fp_scalar_sae opc, string OpcodeStr,X86VectorVTInfo _, OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set _.FRC:$dst, (OpNode _.FRC:$src1, (_.ScalarLdFrag addr:$src2)))]>, - Sched<[sched.Folded, sched.ReadAfterFold]>; + Sched<[sched.Folded, sched.ReadAfterFold]>, + EVEX2VEXOverride; } defm rrb_Int : AVX512_maskable_scalar opc, string OpcodeStr, SDNode OpNode, SDNode VecNode, SDNode SaeNode, X86SchedWriteSizes sched, bit IsCommutable> { defm SSZ : avx512_fp_scalar_sae, + VecNode, SaeNode, sched.PS.Scl, IsCommutable, + NAME#"SS">, XS, EVEX_4V, VEX_LIG, EVEX_CD8<32, CD8VT1>; defm SDZ : avx512_fp_scalar_sae, + VecNode, SaeNode, sched.PD.Scl, IsCommutable, + NAME#"SD">, XD, VEX_W, EVEX_4V, VEX_LIG, EVEX_CD8<64, CD8VT1>; } defm VADD : avx512_binop_s_round<0x58, "vadd", fadd, X86fadds, X86faddRnds, @@ -5410,13 +5408,14 @@ defm VMAX : avx512_binop_s_sae<0x5F, "vmax", X86fmax, X86fmaxs, X86fmaxSAEs, // X86fminc and X86fmaxc instead of X86fmin and X86fmax multiclass avx512_comutable_binop_s opc, string OpcodeStr, X86VectorVTInfo _, SDNode OpNode, - X86FoldableSchedWrite sched> { + X86FoldableSchedWrite sched, + string EVEX2VEXOvrd> { let isCodeGenOnly = 1, Predicates = [HasAVX512], ExeDomain = _.ExeDomain in { def rr : I< opc, MRMSrcReg, (outs _.FRC:$dst), (ins _.FRC:$src1, _.FRC:$src2), OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set _.FRC:$dst, (OpNode _.FRC:$src1, _.FRC:$src2))]>, - Sched<[sched]> { + Sched<[sched]>, EVEX2VEXOverride { let isCommutable = 1; } def rm : I< opc, MRMSrcMem, (outs _.FRC:$dst), @@ -5424,24 +5423,27 @@ multiclass 
avx512_comutable_binop_s opc, string OpcodeStr, OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set _.FRC:$dst, (OpNode _.FRC:$src1, (_.ScalarLdFrag addr:$src2)))]>, - Sched<[sched.Folded, sched.ReadAfterFold]>; + Sched<[sched.Folded, sched.ReadAfterFold]>, + EVEX2VEXOverride; } } defm VMINCSSZ : avx512_comutable_binop_s<0x5D, "vminss", f32x_info, X86fminc, - SchedWriteFCmp.Scl>, XS, EVEX_4V, - VEX_LIG, EVEX_CD8<32, CD8VT1>; + SchedWriteFCmp.Scl, "VMINCSS">, XS, + EVEX_4V, VEX_LIG, EVEX_CD8<32, CD8VT1>; defm VMINCSDZ : avx512_comutable_binop_s<0x5D, "vminsd", f64x_info, X86fminc, - SchedWriteFCmp.Scl>, XD, VEX_W, EVEX_4V, - VEX_LIG, EVEX_CD8<64, CD8VT1>; + SchedWriteFCmp.Scl, "VMINCSD">, XD, + VEX_W, EVEX_4V, VEX_LIG, + EVEX_CD8<64, CD8VT1>; defm VMAXCSSZ : avx512_comutable_binop_s<0x5F, "vmaxss", f32x_info, X86fmaxc, - SchedWriteFCmp.Scl>, XS, EVEX_4V, - VEX_LIG, EVEX_CD8<32, CD8VT1>; + SchedWriteFCmp.Scl, "VMAXCSS">, XS, + EVEX_4V, VEX_LIG, EVEX_CD8<32, CD8VT1>; defm VMAXCSDZ : avx512_comutable_binop_s<0x5F, "vmaxsd", f64x_info, X86fmaxc, - SchedWriteFCmp.Scl>, XD, VEX_W, EVEX_4V, - VEX_LIG, EVEX_CD8<64, CD8VT1>; + SchedWriteFCmp.Scl, "VMAXCSD">, XD, + VEX_W, EVEX_4V, VEX_LIG, + EVEX_CD8<64, CD8VT1>; multiclass avx512_fp_packed opc, string OpcodeStr, SDPatternOperator OpNode, X86VectorVTInfo _, X86FoldableSchedWrite sched, @@ -5464,8 +5466,7 @@ multiclass avx512_fp_packed opc, string OpcodeStr, SDPatternOperator OpN (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr##_.Suffix, "${src2}"##_.BroadcastStr##", $src1", "$src1, ${src2}"##_.BroadcastStr, - (OpNode _.RC:$src1, (_.VT (X86VBroadcast - (_.ScalarLdFrag addr:$src2))))>, + (OpNode _.RC:$src1, (_.VT (_.BroadcastLdFrag addr:$src2)))>, EVEX_4V, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; } @@ -5595,8 +5596,7 @@ multiclass avx512_fp_scalef_p opc, string OpcodeStr, SDNode OpNode, (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr##_.Suffix, "${src2}"##_.BroadcastStr##", $src1", "$src1, ${src2}"##_.BroadcastStr, - (OpNode _.RC:$src1, (_.VT (X86VBroadcast - (_.ScalarLdFrag addr:$src2))))>, + (OpNode _.RC:$src1, (_.VT (_.BroadcastLdFrag addr:$src2)))>, EVEX_4V, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; } } @@ -5751,13 +5751,13 @@ multiclass avx512_shift_rmi opc, Format ImmFormR, Format ImmFormM, defm ri : AVX512_maskable, + (_.VT (OpNode _.RC:$src1, (i8 timm:$src2)))>, Sched<[sched]>; defm mi : AVX512_maskable, + (i8 timm:$src2)))>, Sched<[sched.Folded]>; } } @@ -5769,7 +5769,7 @@ multiclass avx512_shift_rmbi opc, Format ImmFormM, defm mbi : AVX512_maskable, + (_.VT (OpNode (_.BroadcastLdFrag addr:$src1), (i8 timm:$src2)))>, EVEX_B, Sched<[sched.Folded]>; } @@ -5911,17 +5911,17 @@ let Predicates = [HasAVX512, NoVLX] in { (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)), VR128X:$src2)), sub_xmm)>; - def : Pat<(v4i64 (X86vsrai (v4i64 VR256X:$src1), (i8 imm:$src2))), + def : Pat<(v4i64 (X86vsrai (v4i64 VR256X:$src1), (i8 timm:$src2))), (EXTRACT_SUBREG (v8i64 (VPSRAQZri (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)), - imm:$src2)), sub_ymm)>; + timm:$src2)), sub_ymm)>; - def : Pat<(v2i64 (X86vsrai (v2i64 VR128X:$src1), (i8 imm:$src2))), + def : Pat<(v2i64 (X86vsrai (v2i64 VR128X:$src1), (i8 timm:$src2))), (EXTRACT_SUBREG (v8i64 (VPSRAQZri (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)), - imm:$src2)), sub_xmm)>; + timm:$src2)), sub_xmm)>; } //===-------------------------------------------------------------------===// @@ -5953,8 +5953,7 @@ multiclass avx512_var_shift_mb opc, string 
OpcodeStr, SDNode OpNode, (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr, "${src2}"##_.BroadcastStr##", $src1", "$src1, ${src2}"##_.BroadcastStr, - (_.VT (OpNode _.RC:$src1, (_.VT (X86VBroadcast - (_.ScalarLdFrag addr:$src2)))))>, + (_.VT (OpNode _.RC:$src1, (_.VT (_.BroadcastLdFrag addr:$src2))))>, AVX5128IBase, EVEX_B, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>, Sched<[sched.Folded, sched.ReadAfterFold]>; } @@ -6062,27 +6061,27 @@ let Predicates = [HasAVX512, NoVLX] in { (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm)))), sub_ymm)>; - def : Pat<(v2i64 (X86vrotli (v2i64 VR128X:$src1), (i8 imm:$src2))), + def : Pat<(v2i64 (X86vrotli (v2i64 VR128X:$src1), (i8 timm:$src2))), (EXTRACT_SUBREG (v8i64 (VPROLQZri (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)), - imm:$src2)), sub_xmm)>; - def : Pat<(v4i64 (X86vrotli (v4i64 VR256X:$src1), (i8 imm:$src2))), + timm:$src2)), sub_xmm)>; + def : Pat<(v4i64 (X86vrotli (v4i64 VR256X:$src1), (i8 timm:$src2))), (EXTRACT_SUBREG (v8i64 (VPROLQZri (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)), - imm:$src2)), sub_ymm)>; + timm:$src2)), sub_ymm)>; - def : Pat<(v4i32 (X86vrotli (v4i32 VR128X:$src1), (i8 imm:$src2))), + def : Pat<(v4i32 (X86vrotli (v4i32 VR128X:$src1), (i8 timm:$src2))), (EXTRACT_SUBREG (v16i32 (VPROLDZri (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)), - imm:$src2)), sub_xmm)>; - def : Pat<(v8i32 (X86vrotli (v8i32 VR256X:$src1), (i8 imm:$src2))), + timm:$src2)), sub_xmm)>; + def : Pat<(v8i32 (X86vrotli (v8i32 VR256X:$src1), (i8 timm:$src2))), (EXTRACT_SUBREG (v16i32 (VPROLDZri (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)), - imm:$src2)), sub_ymm)>; + timm:$src2)), sub_ymm)>; } // Use 512bit VPROR/VPRORI version to implement v2i64/v4i64 + v4i32/v8i32 in case NoVLX. 
@@ -6113,27 +6112,27 @@ let Predicates = [HasAVX512, NoVLX] in { (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm)))), sub_ymm)>; - def : Pat<(v2i64 (X86vrotri (v2i64 VR128X:$src1), (i8 imm:$src2))), + def : Pat<(v2i64 (X86vrotri (v2i64 VR128X:$src1), (i8 timm:$src2))), (EXTRACT_SUBREG (v8i64 (VPRORQZri (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)), - imm:$src2)), sub_xmm)>; - def : Pat<(v4i64 (X86vrotri (v4i64 VR256X:$src1), (i8 imm:$src2))), + timm:$src2)), sub_xmm)>; + def : Pat<(v4i64 (X86vrotri (v4i64 VR256X:$src1), (i8 timm:$src2))), (EXTRACT_SUBREG (v8i64 (VPRORQZri (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)), - imm:$src2)), sub_ymm)>; + timm:$src2)), sub_ymm)>; - def : Pat<(v4i32 (X86vrotri (v4i32 VR128X:$src1), (i8 imm:$src2))), + def : Pat<(v4i32 (X86vrotri (v4i32 VR128X:$src1), (i8 timm:$src2))), (EXTRACT_SUBREG (v16i32 (VPRORDZri (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)), - imm:$src2)), sub_xmm)>; - def : Pat<(v8i32 (X86vrotri (v8i32 VR256X:$src1), (i8 imm:$src2))), + timm:$src2)), sub_xmm)>; + def : Pat<(v8i32 (X86vrotri (v8i32 VR256X:$src1), (i8 timm:$src2))), (EXTRACT_SUBREG (v16i32 (VPRORDZri (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)), - imm:$src2)), sub_ymm)>; + timm:$src2)), sub_ymm)>; } //===-------------------------------------------------------------------===// @@ -6228,8 +6227,7 @@ multiclass avx512_permil_vec OpcVar, string OpcodeStr, SDNode OpNode, "$src1, ${src2}"##_.BroadcastStr, (_.VT (OpNode _.RC:$src1, - (Ctrl.VT (X86VBroadcast - (Ctrl.ScalarLdFrag addr:$src2)))))>, + (Ctrl.VT (Ctrl.BroadcastLdFrag addr:$src2))))>, T8PD, EVEX_4V, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>, Sched<[sched.Folded, sched.ReadAfterFold]>; } @@ -6419,7 +6417,7 @@ multiclass avx512_fma3p_213_rm opc, string OpcodeStr, SDNode OpNode, OpcodeStr, !strconcat("${src3}", _.BroadcastStr,", $src2"), !strconcat("$src2, ${src3}", _.BroadcastStr ), (OpNode _.RC:$src2, - _.RC:$src1,(_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src3)))), 1, 0>, + _.RC:$src1,(_.VT (_.BroadcastLdFrag addr:$src3))), 1, 0>, AVX512FMA3Base, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; } } @@ -6493,7 +6491,7 @@ multiclass avx512_fma3p_231_rm opc, string OpcodeStr, SDNode OpNode, OpcodeStr, "${src3}"##_.BroadcastStr##", $src2", "$src2, ${src3}"##_.BroadcastStr, (_.VT (OpNode _.RC:$src2, - (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src3))), + (_.VT (_.BroadcastLdFrag addr:$src3)), _.RC:$src1)), 1, 0>, AVX512FMA3Base, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; } @@ -6571,7 +6569,7 @@ multiclass avx512_fma3p_132_rm opc, string OpcodeStr, SDNode OpNode, (ins _.RC:$src2, _.ScalarMemOp:$src3), OpcodeStr, "${src3}"##_.BroadcastStr##", $src2", "$src2, ${src3}"##_.BroadcastStr, - (_.VT (OpNode (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src3))), + (_.VT (OpNode (_.VT (_.BroadcastLdFrag addr:$src3)), _.RC:$src1, _.RC:$src2)), 1, 0>, AVX512FMA3Base, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; } @@ -6964,7 +6962,7 @@ multiclass avx512_pmadd52_rm opc, string OpcodeStr, SDNode OpNode, OpcodeStr, !strconcat("${src3}", _.BroadcastStr,", $src2"), !strconcat("$src2, ${src3}", _.BroadcastStr ), (OpNode _.RC:$src2, - (_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src3))), + (_.VT (_.BroadcastLdFrag addr:$src3)), _.RC:$src1)>, AVX512FMA3Base, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; } @@ -7504,14 +7502,13 @@ multiclass avx512_vcvt_fp opc, string OpcodeStr, X86VectorVTInfo _, OpcodeStr, "${src}"##Broadcast, "${src}"##Broadcast, (_.VT (OpNode 
(_Src.VT - (X86VBroadcast (_Src.ScalarLdFrag addr:$src))) + (_Src.BroadcastLdFrag addr:$src)) )), (vselect MaskRC:$mask, (_.VT (OpNode (_Src.VT - (X86VBroadcast - (_Src.ScalarLdFrag addr:$src))))), + (_Src.BroadcastLdFrag addr:$src)))), _.RC:$src0), vselect, "$src0 = $dst">, EVEX, EVEX_B, Sched<[sched.Folded]>; @@ -7646,14 +7643,14 @@ let Predicates = [HasAVX512] in { v8f32x_info.ImmAllZerosV), (VCVTPD2PSZrmkz VK8WM:$mask, addr:$src)>; - def : Pat<(v8f32 (fpround (v8f64 (X86VBroadcast (loadf64 addr:$src))))), + def : Pat<(v8f32 (fpround (v8f64 (X86VBroadcastld64 addr:$src)))), (VCVTPD2PSZrmb addr:$src)>; def : Pat<(vselect VK8WM:$mask, - (fpround (v8f64 (X86VBroadcast (loadf64 addr:$src)))), + (fpround (v8f64 (X86VBroadcastld64 addr:$src))), (v8f32 VR256X:$src0)), (VCVTPD2PSZrmbk VR256X:$src0, VK8WM:$mask, addr:$src)>; def : Pat<(vselect VK8WM:$mask, - (fpround (v8f64 (X86VBroadcast (loadf64 addr:$src)))), + (fpround (v8f64 (X86VBroadcastld64 addr:$src))), v8f32x_info.ImmAllZerosV), (VCVTPD2PSZrmbkz VK8WM:$mask, addr:$src)>; } @@ -7677,14 +7674,14 @@ let Predicates = [HasVLX] in { v4f32x_info.ImmAllZerosV), (VCVTPD2PSZ256rmkz VK4WM:$mask, addr:$src)>; - def : Pat<(v4f32 (fpround (v4f64 (X86VBroadcast (loadf64 addr:$src))))), + def : Pat<(v4f32 (fpround (v4f64 (X86VBroadcastld64 addr:$src)))), (VCVTPD2PSZ256rmb addr:$src)>; def : Pat<(vselect VK4WM:$mask, - (v4f32 (fpround (v4f64 (X86VBroadcast (loadf64 addr:$src))))), + (v4f32 (fpround (v4f64 (X86VBroadcastld64 addr:$src)))), VR128X:$src0), (VCVTPD2PSZ256rmbk VR128X:$src0, VK4WM:$mask, addr:$src)>; def : Pat<(vselect VK4WM:$mask, - (v4f32 (fpround (v4f64 (X86VBroadcast (loadf64 addr:$src))))), + (v4f32 (fpround (v4f64 (X86VBroadcastld64 addr:$src)))), v4f32x_info.ImmAllZerosV), (VCVTPD2PSZ256rmbkz VK4WM:$mask, addr:$src)>; @@ -7708,12 +7705,12 @@ let Predicates = [HasVLX] in { VK2WM:$mask), (VCVTPD2PSZ128rmkz VK2WM:$mask, addr:$src)>; - def : Pat<(X86vfpround (v2f64 (X86VBroadcast (loadf64 addr:$src)))), + def : Pat<(X86vfpround (v2f64 (X86VBroadcastld64 addr:$src))), (VCVTPD2PSZ128rmb addr:$src)>; - def : Pat<(X86vmfpround (v2f64 (X86VBroadcast (loadf64 addr:$src))), + def : Pat<(X86vmfpround (v2f64 (X86VBroadcastld64 addr:$src)), (v4f32 VR128X:$src0), VK2WM:$mask), (VCVTPD2PSZ128rmbk VR128X:$src0, VK2WM:$mask, addr:$src)>; - def : Pat<(X86vmfpround (v2f64 (X86VBroadcast (loadf64 addr:$src))), + def : Pat<(X86vmfpround (v2f64 (X86VBroadcastld64 addr:$src)), v4f32x_info.ImmAllZerosV, VK2WM:$mask), (VCVTPD2PSZ128rmbkz VK2WM:$mask, addr:$src)>; } @@ -8194,12 +8191,12 @@ let Predicates = [HasVLX] in { VK2WM:$mask), (VCVTPD2DQZ128rmkz VK2WM:$mask, addr:$src)>; - def : Pat<(v4i32 (X86cvtp2Int (v2f64 (X86VBroadcast (loadf64 addr:$src))))), + def : Pat<(v4i32 (X86cvtp2Int (v2f64 (X86VBroadcastld64 addr:$src)))), (VCVTPD2DQZ128rmb addr:$src)>; - def : Pat<(X86mcvtp2Int (v2f64 (X86VBroadcast (loadf64 addr:$src))), + def : Pat<(X86mcvtp2Int (v2f64 (X86VBroadcastld64 addr:$src)), (v4i32 VR128X:$src0), VK2WM:$mask), (VCVTPD2DQZ128rmbk VR128X:$src0, VK2WM:$mask, addr:$src)>; - def : Pat<(X86mcvtp2Int (v2f64 (X86VBroadcast (loadf64 addr:$src))), + def : Pat<(X86mcvtp2Int (v2f64 (X86VBroadcastld64 addr:$src)), v4i32x_info.ImmAllZerosV, VK2WM:$mask), (VCVTPD2DQZ128rmbkz VK2WM:$mask, addr:$src)>; @@ -8223,12 +8220,12 @@ let Predicates = [HasVLX] in { VK2WM:$mask), (VCVTTPD2DQZ128rmkz VK2WM:$mask, addr:$src)>; - def : Pat<(v4i32 (X86cvttp2si (v2f64 (X86VBroadcast (loadf64 addr:$src))))), + def : Pat<(v4i32 (X86cvttp2si (v2f64 (X86VBroadcastld64 
addr:$src)))), (VCVTTPD2DQZ128rmb addr:$src)>; - def : Pat<(X86mcvttp2si (v2f64 (X86VBroadcast (loadf64 addr:$src))), + def : Pat<(X86mcvttp2si (v2f64 (X86VBroadcastld64 addr:$src)), (v4i32 VR128X:$src0), VK2WM:$mask), (VCVTTPD2DQZ128rmbk VR128X:$src0, VK2WM:$mask, addr:$src)>; - def : Pat<(X86mcvttp2si (v2f64 (X86VBroadcast (loadf64 addr:$src))), + def : Pat<(X86mcvttp2si (v2f64 (X86VBroadcastld64 addr:$src)), v4i32x_info.ImmAllZerosV, VK2WM:$mask), (VCVTTPD2DQZ128rmbkz VK2WM:$mask, addr:$src)>; @@ -8252,12 +8249,12 @@ let Predicates = [HasVLX] in { VK2WM:$mask), (VCVTPD2UDQZ128rmkz VK2WM:$mask, addr:$src)>; - def : Pat<(v4i32 (X86cvtp2UInt (v2f64 (X86VBroadcast (loadf64 addr:$src))))), + def : Pat<(v4i32 (X86cvtp2UInt (v2f64 (X86VBroadcastld64 addr:$src)))), (VCVTPD2UDQZ128rmb addr:$src)>; - def : Pat<(X86mcvtp2UInt (v2f64 (X86VBroadcast (loadf64 addr:$src))), + def : Pat<(X86mcvtp2UInt (v2f64 (X86VBroadcastld64 addr:$src)), (v4i32 VR128X:$src0), VK2WM:$mask), (VCVTPD2UDQZ128rmbk VR128X:$src0, VK2WM:$mask, addr:$src)>; - def : Pat<(X86mcvtp2UInt (v2f64 (X86VBroadcast (loadf64 addr:$src))), + def : Pat<(X86mcvtp2UInt (v2f64 (X86VBroadcastld64 addr:$src)), v4i32x_info.ImmAllZerosV, VK2WM:$mask), (VCVTPD2UDQZ128rmbkz VK2WM:$mask, addr:$src)>; @@ -8281,12 +8278,12 @@ let Predicates = [HasVLX] in { VK2WM:$mask), (VCVTTPD2UDQZ128rmkz VK2WM:$mask, addr:$src)>; - def : Pat<(v4i32 (X86cvttp2ui (v2f64 (X86VBroadcast (loadf64 addr:$src))))), + def : Pat<(v4i32 (X86cvttp2ui (v2f64 (X86VBroadcastld64 addr:$src)))), (VCVTTPD2UDQZ128rmb addr:$src)>; - def : Pat<(X86mcvttp2ui (v2f64 (X86VBroadcast (loadf64 addr:$src))), + def : Pat<(X86mcvttp2ui (v2f64 (X86VBroadcastld64 addr:$src)), (v4i32 VR128X:$src0), VK2WM:$mask), (VCVTTPD2UDQZ128rmbk VR128X:$src0, VK2WM:$mask, addr:$src)>; - def : Pat<(X86mcvttp2ui (v2f64 (X86VBroadcast (loadf64 addr:$src))), + def : Pat<(X86mcvttp2ui (v2f64 (X86VBroadcastld64 addr:$src)), v4i32x_info.ImmAllZerosV, VK2WM:$mask), (VCVTTPD2UDQZ128rmbkz VK2WM:$mask, addr:$src)>; } @@ -8419,12 +8416,12 @@ let Predicates = [HasDQI, HasVLX] in { VK2WM:$mask), (VCVTQQ2PSZ128rmkz VK2WM:$mask, addr:$src)>; - def : Pat<(v4f32 (X86VSintToFP (v2i64 (X86VBroadcast (loadi64 addr:$src))))), + def : Pat<(v4f32 (X86VSintToFP (v2i64 (X86VBroadcastld64 addr:$src)))), (VCVTQQ2PSZ128rmb addr:$src)>; - def : Pat<(X86VMSintToFP (v2i64 (X86VBroadcast (loadi64 addr:$src))), + def : Pat<(X86VMSintToFP (v2i64 (X86VBroadcastld64 addr:$src)), (v4f32 VR128X:$src0), VK2WM:$mask), (VCVTQQ2PSZ128rmbk VR128X:$src0, VK2WM:$mask, addr:$src)>; - def : Pat<(X86VMSintToFP (v2i64 (X86VBroadcast (loadi64 addr:$src))), + def : Pat<(X86VMSintToFP (v2i64 (X86VBroadcastld64 addr:$src)), v4f32x_info.ImmAllZerosV, VK2WM:$mask), (VCVTQQ2PSZ128rmbkz VK2WM:$mask, addr:$src)>; @@ -8448,12 +8445,12 @@ let Predicates = [HasDQI, HasVLX] in { VK2WM:$mask), (VCVTUQQ2PSZ128rmkz VK2WM:$mask, addr:$src)>; - def : Pat<(v4f32 (X86VUintToFP (v2i64 (X86VBroadcast (loadi64 addr:$src))))), + def : Pat<(v4f32 (X86VUintToFP (v2i64 (X86VBroadcastld64 addr:$src)))), (VCVTUQQ2PSZ128rmb addr:$src)>; - def : Pat<(X86VMUintToFP (v2i64 (X86VBroadcast (loadi64 addr:$src))), + def : Pat<(X86VMUintToFP (v2i64 (X86VBroadcastld64 addr:$src)), (v4f32 VR128X:$src0), VK2WM:$mask), (VCVTUQQ2PSZ128rmbk VR128X:$src0, VK2WM:$mask, addr:$src)>; - def : Pat<(X86VMUintToFP (v2i64 (X86VBroadcast (loadi64 addr:$src))), + def : Pat<(X86VMUintToFP (v2i64 (X86VBroadcastld64 addr:$src)), v4f32x_info.ImmAllZerosV, VK2WM:$mask), (VCVTUQQ2PSZ128rmbkz VK2WM:$mask, addr:$src)>; 
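Note on the conversion patterns above (VCVTPD2PS, VCVT(T)PD2(U)DQ, VCVT(U)QQ2PS): they now key on the dedicated X86VBroadcastld64 node rather than a vector broadcast wrapped around a separate scalar load, so a converted broadcast source selects the {1toN} rmb memory form directly. A small C++ sketch using AVX-512F intrinsics; the exact assembly in the comment is an assumption about how the fold typically comes out:

    #include <immintrin.h>
    // Sketch: converting a broadcast double can fold into the rmb form,
    // roughly "vcvtpd2ps (%rdi){1to8}, %ymm0".
    __m256 cvt_broadcast(const double *p) {
      return _mm512_cvtpd_ps(_mm512_set1_pd(*p));
    }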
} @@ -8576,21 +8573,21 @@ let ExeDomain = GenericDomain in { (ins _src.RC:$src1, i32u8imm:$src2), "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set _dest.RC:$dst, - (X86cvtps2ph (_src.VT _src.RC:$src1), (i32 imm:$src2)))]>, + (X86cvtps2ph (_src.VT _src.RC:$src1), (i32 timm:$src2)))]>, Sched<[RR]>; let Constraints = "$src0 = $dst" in def rrk : AVX512AIi8<0x1D, MRMDestReg, (outs _dest.RC:$dst), (ins _dest.RC:$src0, _src.KRCWM:$mask, _src.RC:$src1, i32u8imm:$src2), "vcvtps2ph\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}", [(set _dest.RC:$dst, - (X86mcvtps2ph (_src.VT _src.RC:$src1), (i32 imm:$src2), + (X86mcvtps2ph (_src.VT _src.RC:$src1), (i32 timm:$src2), _dest.RC:$src0, _src.KRCWM:$mask))]>, Sched<[RR]>, EVEX_K; def rrkz : AVX512AIi8<0x1D, MRMDestReg, (outs _dest.RC:$dst), (ins _src.KRCWM:$mask, _src.RC:$src1, i32u8imm:$src2), "vcvtps2ph\t{$src2, $src1, $dst {${mask}} {z}|$dst {${mask}} {z}, $src1, $src2}", [(set _dest.RC:$dst, - (X86mcvtps2ph (_src.VT _src.RC:$src1), (i32 imm:$src2), + (X86mcvtps2ph (_src.VT _src.RC:$src1), (i32 timm:$src2), _dest.ImmAllZerosV, _src.KRCWM:$mask))]>, Sched<[RR]>, EVEX_KZ; let hasSideEffects = 0, mayStore = 1 in { @@ -8631,17 +8628,17 @@ let Predicates = [HasAVX512] in { } def : Pat<(store (f64 (extractelt - (bc_v2f64 (v8i16 (X86cvtps2ph VR128X:$src1, i32:$src2))), + (bc_v2f64 (v8i16 (X86cvtps2ph VR128X:$src1, timm:$src2))), (iPTR 0))), addr:$dst), - (VCVTPS2PHZ128mr addr:$dst, VR128X:$src1, imm:$src2)>; + (VCVTPS2PHZ128mr addr:$dst, VR128X:$src1, timm:$src2)>; def : Pat<(store (i64 (extractelt - (bc_v2i64 (v8i16 (X86cvtps2ph VR128X:$src1, i32:$src2))), + (bc_v2i64 (v8i16 (X86cvtps2ph VR128X:$src1, timm:$src2))), (iPTR 0))), addr:$dst), - (VCVTPS2PHZ128mr addr:$dst, VR128X:$src1, imm:$src2)>; - def : Pat<(store (v8i16 (X86cvtps2ph VR256X:$src1, i32:$src2)), addr:$dst), - (VCVTPS2PHZ256mr addr:$dst, VR256X:$src1, imm:$src2)>; - def : Pat<(store (v16i16 (X86cvtps2ph VR512:$src1, i32:$src2)), addr:$dst), - (VCVTPS2PHZmr addr:$dst, VR512:$src1, imm:$src2)>; + (VCVTPS2PHZ128mr addr:$dst, VR128X:$src1, timm:$src2)>; + def : Pat<(store (v8i16 (X86cvtps2ph VR256X:$src1, timm:$src2)), addr:$dst), + (VCVTPS2PHZ256mr addr:$dst, VR256X:$src1, timm:$src2)>; + def : Pat<(store (v16i16 (X86cvtps2ph VR512:$src1, timm:$src2)), addr:$dst), + (VCVTPS2PHZmr addr:$dst, VR512:$src1, timm:$src2)>; } // Patterns for matching conversions from float to half-float and vice versa. 
@@ -8765,7 +8762,7 @@ multiclass avx512_fp14_p opc, string OpcodeStr, SDNode OpNode, (ins _.ScalarMemOp:$src), OpcodeStr, "${src}"##_.BroadcastStr, "${src}"##_.BroadcastStr, (OpNode (_.VT - (X86VBroadcast (_.ScalarLdFrag addr:$src))))>, + (_.BroadcastLdFrag addr:$src)))>, EVEX, T8PD, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; } } @@ -8859,7 +8856,7 @@ multiclass avx512_fp28_p opc, string OpcodeStr, X86VectorVTInfo _, (ins _.ScalarMemOp:$src), OpcodeStr, "${src}"##_.BroadcastStr, "${src}"##_.BroadcastStr, (OpNode (_.VT - (X86VBroadcast (_.ScalarLdFrag addr:$src))))>, + (_.BroadcastLdFrag addr:$src)))>, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; } } @@ -8940,7 +8937,7 @@ multiclass avx512_sqrt_packed opc, string OpcodeStr, (ins _.ScalarMemOp:$src), OpcodeStr, "${src}"##_.BroadcastStr, "${src}"##_.BroadcastStr, (fsqrt (_.VT - (X86VBroadcast (_.ScalarLdFrag addr:$src))))>, + (_.BroadcastLdFrag addr:$src)))>, EVEX, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; } } @@ -9049,14 +9046,14 @@ multiclass avx512_rndscale_scalar opc, string OpcodeStr, (ins _.RC:$src1, _.RC:$src2, i32u8imm:$src3), OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3", (_.VT (X86RndScales (_.VT _.RC:$src1), (_.VT _.RC:$src2), - (i32 imm:$src3)))>, + (i32 timm:$src3)))>, Sched<[sched]>; defm rb_Int : AVX512_maskable_scalar, EVEX_B, + (i32 timm:$src3)))>, EVEX_B, Sched<[sched]>; defm m_Int : AVX512_maskable_scalar opc, string OpcodeStr, OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3", (_.VT (X86RndScales _.RC:$src1, - _.ScalarIntMemCPat:$src2, (i32 imm:$src3)))>, + _.ScalarIntMemCPat:$src2, (i32 timm:$src3)))>, Sched<[sched.Folded, sched.ReadAfterFold]>; let isCodeGenOnly = 1, hasSideEffects = 0, Predicates = [HasAVX512] in { @@ -9082,15 +9079,15 @@ multiclass avx512_rndscale_scalar opc, string OpcodeStr, } let Predicates = [HasAVX512] in { - def : Pat<(X86VRndScale _.FRC:$src1, imm:$src2), + def : Pat<(X86VRndScale _.FRC:$src1, timm:$src2), (_.EltVT (!cast(NAME##r) (_.EltVT (IMPLICIT_DEF)), - _.FRC:$src1, imm:$src2))>; + _.FRC:$src1, timm:$src2))>; } let Predicates = [HasAVX512, OptForSize] in { - def : Pat<(X86VRndScale (_.ScalarLdFrag addr:$src1), imm:$src2), + def : Pat<(X86VRndScale (_.ScalarLdFrag addr:$src1), timm:$src2), (_.EltVT (!cast(NAME##m) (_.EltVT (IMPLICIT_DEF)), - addr:$src1, imm:$src2))>; + addr:$src1, timm:$src2))>; } } @@ -10109,19 +10106,19 @@ multiclass avx512_unary_fp_packed_imm opc, string OpcodeStr, SDNode OpNo (ins _.RC:$src1, i32u8imm:$src2), OpcodeStr##_.Suffix, "$src2, $src1", "$src1, $src2", (OpNode (_.VT _.RC:$src1), - (i32 imm:$src2))>, Sched<[sched]>; + (i32 timm:$src2))>, Sched<[sched]>; defm rmi : AVX512_maskable, + (i32 timm:$src2))>, Sched<[sched.Folded, sched.ReadAfterFold]>; defm rmbi : AVX512_maskable, EVEX_B, + (OpNode (_.VT (_.BroadcastLdFrag addr:$src1)), + (i32 timm:$src2))>, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; } } @@ -10136,7 +10133,7 @@ multiclass avx512_unary_fp_sae_packed_imm opc, string OpcodeStr, OpcodeStr##_.Suffix, "$src2, {sae}, $src1", "$src1, {sae}, $src2", (OpNode (_.VT _.RC:$src1), - (i32 imm:$src2))>, + (i32 timm:$src2))>, EVEX_B, Sched<[sched]>; } @@ -10169,22 +10166,22 @@ multiclass avx512_fp_packed_imm opc, string OpcodeStr, SDNode OpNode, OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3", (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2), - (i32 imm:$src3))>, + (i32 timm:$src3))>, Sched<[sched]>; defm rmi : AVX512_maskable, + (i32 timm:$src3))>, Sched<[sched.Folded, sched.ReadAfterFold]>; defm rmbi : 
AVX512_maskable, EVEX_B, + (_.VT (_.BroadcastLdFrag addr:$src2)), + (i32 timm:$src3))>, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; } } @@ -10200,7 +10197,7 @@ multiclass avx512_3Op_rm_imm8 opc, string OpcodeStr, SDNode OpNode, OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3", (DestInfo.VT (OpNode (SrcInfo.VT SrcInfo.RC:$src1), (SrcInfo.VT SrcInfo.RC:$src2), - (i8 imm:$src3)))>, + (i8 timm:$src3)))>, Sched<[sched]>; defm rmi : AVX512_maskable opc, string OpcodeStr, SDNode OpNode, (DestInfo.VT (OpNode (SrcInfo.VT SrcInfo.RC:$src1), (SrcInfo.VT (bitconvert (SrcInfo.LdFrag addr:$src2))), - (i8 imm:$src3)))>, + (i8 timm:$src3)))>, Sched<[sched.Folded, sched.ReadAfterFold]>; } } @@ -10226,8 +10223,8 @@ multiclass avx512_3Op_imm8 opc, string OpcodeStr, SDNode OpNode, OpcodeStr, "$src3, ${src2}"##_.BroadcastStr##", $src1", "$src1, ${src2}"##_.BroadcastStr##", $src3", (OpNode (_.VT _.RC:$src1), - (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src2))), - (i8 imm:$src3))>, EVEX_B, + (_.VT (_.BroadcastLdFrag addr:$src2)), + (i8 timm:$src3))>, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; } @@ -10241,15 +10238,14 @@ multiclass avx512_fp_scalar_imm opc, string OpcodeStr, SDNode OpNode, OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3", (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2), - (i32 imm:$src3))>, + (i32 timm:$src3))>, Sched<[sched]>; defm rmi : AVX512_maskable_scalar, + (_.VT _.ScalarIntMemCPat:$src2), + (i32 timm:$src3))>, Sched<[sched.Folded, sched.ReadAfterFold]>; } } @@ -10265,7 +10261,7 @@ multiclass avx512_fp_sae_packed_imm opc, string OpcodeStr, "$src1, $src2, {sae}, $src3", (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2), - (i32 imm:$src3))>, + (i32 timm:$src3))>, EVEX_B, Sched<[sched]>; } @@ -10279,7 +10275,7 @@ multiclass avx512_fp_sae_scalar_imm opc, string OpcodeStr, SDNode OpNode "$src1, $src2, {sae}, $src3", (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2), - (i32 imm:$src3))>, + (i32 timm:$src3))>, EVEX_B, Sched<[sched]>; } @@ -10401,7 +10397,7 @@ multiclass avx512_shuff_packed_128_common opc, string OpcodeStr, OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3", (_.VT (bitconvert (CastInfo.VT (X86Shuf128 _.RC:$src1, _.RC:$src2, - (i8 imm:$src3)))))>, + (i8 timm:$src3)))))>, Sched<[sched]>, EVEX2VEXOverride; defm rmi : AVX512_maskable opc, string OpcodeStr, (bitconvert (CastInfo.VT (X86Shuf128 _.RC:$src1, (CastInfo.LdFrag addr:$src2), - (i8 imm:$src3)))))>, + (i8 timm:$src3)))))>, Sched<[sched.Folded, sched.ReadAfterFold]>, EVEX2VEXOverride; defm rmbi : AVX512_maskable opc, string OpcodeStr, (bitconvert (CastInfo.VT (X86Shuf128 _.RC:$src1, - (X86VBroadcast (_.ScalarLdFrag addr:$src2)), - (i8 imm:$src3)))))>, EVEX_B, + (_.BroadcastLdFrag addr:$src2), + (i8 timm:$src3)))))>, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; } } @@ -10491,14 +10487,14 @@ multiclass avx512_valign opc, string OpcodeStr, defm rri : AVX512_maskable, + (_.VT (X86VAlign _.RC:$src1, _.RC:$src2, (i8 timm:$src3)))>, Sched<[sched]>, EVEX2VEXOverride<"VPALIGNRrri">; defm rmi : AVX512_maskable, + (i8 timm:$src3)))>, Sched<[sched.Folded, sched.ReadAfterFold]>, EVEX2VEXOverride<"VPALIGNRrmi">; @@ -10507,8 +10503,8 @@ multiclass avx512_valign opc, string OpcodeStr, OpcodeStr, "$src3, ${src2}"##_.BroadcastStr##", $src1", "$src1, ${src2}"##_.BroadcastStr##", $src3", (X86VAlign _.RC:$src1, - (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src2))), - (i8 imm:$src3))>, EVEX_B, + (_.VT (_.BroadcastLdFrag addr:$src2)), + (i8 timm:$src3))>, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; } } @@ -10541,13 
+10537,13 @@ defm VPALIGNR: avx512_common_3Op_rm_imm8<0x0F, X86PAlignr, "vpalignr", // Fragments to help convert valignq into masked valignd. Or valignq/valignd // into vpalignr. -def ValignqImm32XForm : SDNodeXFormgetZExtValue() * 2, SDLoc(N)); }]>; -def ValignqImm8XForm : SDNodeXFormgetZExtValue() * 8, SDLoc(N)); }]>; -def ValigndImm8XForm : SDNodeXFormgetZExtValue() * 4, SDLoc(N)); }]>; @@ -10557,40 +10553,40 @@ multiclass avx512_vpalign_mask_lowering(OpcodeStr#"rrik") To.RC:$src0, To.KRCWM:$mask, To.RC:$src1, To.RC:$src2, - (ImmXForm imm:$src3))>; + (ImmXForm timm:$src3))>; def : Pat<(To.VT (vselect To.KRCWM:$mask, (bitconvert (From.VT (OpNode From.RC:$src1, From.RC:$src2, - imm:$src3))), + timm:$src3))), To.ImmAllZerosV)), (!cast(OpcodeStr#"rrikz") To.KRCWM:$mask, To.RC:$src1, To.RC:$src2, - (ImmXForm imm:$src3))>; + (ImmXForm timm:$src3))>; def : Pat<(To.VT (vselect To.KRCWM:$mask, (bitconvert (From.VT (OpNode From.RC:$src1, (From.LdFrag addr:$src2), - imm:$src3))), + timm:$src3))), To.RC:$src0)), (!cast(OpcodeStr#"rmik") To.RC:$src0, To.KRCWM:$mask, To.RC:$src1, addr:$src2, - (ImmXForm imm:$src3))>; + (ImmXForm timm:$src3))>; def : Pat<(To.VT (vselect To.KRCWM:$mask, (bitconvert (From.VT (OpNode From.RC:$src1, (From.LdFrag addr:$src2), - imm:$src3))), + timm:$src3))), To.ImmAllZerosV)), (!cast(OpcodeStr#"rmikz") To.KRCWM:$mask, To.RC:$src1, addr:$src2, - (ImmXForm imm:$src3))>; + (ImmXForm timm:$src3))>; } multiclass avx512_vpalign_mask_lowering_mb : avx512_vpalign_mask_lowering { def : Pat<(From.VT (OpNode From.RC:$src1, - (bitconvert (To.VT (X86VBroadcast - (To.ScalarLdFrag addr:$src2)))), - imm:$src3)), + (bitconvert (To.VT (To.BroadcastLdFrag addr:$src2))), + timm:$src3)), (!cast(OpcodeStr#"rmbi") To.RC:$src1, addr:$src2, - (ImmXForm imm:$src3))>; + (ImmXForm timm:$src3))>; def : Pat<(To.VT (vselect To.KRCWM:$mask, (bitconvert (From.VT (OpNode From.RC:$src1, (bitconvert - (To.VT (X86VBroadcast - (To.ScalarLdFrag addr:$src2)))), - imm:$src3))), + (To.VT (To.BroadcastLdFrag addr:$src2))), + timm:$src3))), To.RC:$src0)), (!cast(OpcodeStr#"rmbik") To.RC:$src0, To.KRCWM:$mask, To.RC:$src1, addr:$src2, - (ImmXForm imm:$src3))>; + (ImmXForm timm:$src3))>; def : Pat<(To.VT (vselect To.KRCWM:$mask, (bitconvert (From.VT (OpNode From.RC:$src1, (bitconvert - (To.VT (X86VBroadcast - (To.ScalarLdFrag addr:$src2)))), - imm:$src3))), + (To.VT (To.BroadcastLdFrag addr:$src2))), + timm:$src3))), To.ImmAllZerosV)), (!cast(OpcodeStr#"rmbikz") To.KRCWM:$mask, To.RC:$src1, addr:$src2, - (ImmXForm imm:$src3))>; + (ImmXForm timm:$src3))>; } let Predicates = [HasAVX512] in { @@ -10666,13 +10659,13 @@ multiclass avx512_unary_rm opc, string OpcodeStr, SDNode OpNode, defm rr : AVX512_maskable, EVEX, AVX5128IBase, + (_.VT (OpNode (_.VT _.RC:$src1)))>, EVEX, AVX5128IBase, Sched<[sched]>; defm rm : AVX512_maskable, + (_.VT (OpNode (_.VT (bitconvert (_.LdFrag addr:$src1)))))>, EVEX, AVX5128IBase, EVEX_CD8<_.EltSize, CD8VF>, Sched<[sched.Folded]>; } @@ -10685,8 +10678,7 @@ multiclass avx512_unary_rmb opc, string OpcodeStr, SDNode OpNode, (ins _.ScalarMemOp:$src1), OpcodeStr, "${src1}"##_.BroadcastStr, "${src1}"##_.BroadcastStr, - (_.VT (OpNode (X86VBroadcast - (_.ScalarLdFrag addr:$src1))))>, + (_.VT (OpNode (_.VT (_.BroadcastLdFrag addr:$src1))))>, EVEX, AVX5128IBase, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>, Sched<[sched.Folded]>; } @@ -10770,7 +10762,7 @@ let Predicates = [HasAVX512, NoVLX] in { multiclass avx512_unary_lowering { let Predicates = [prd, NoVLX] in { - def : Pat<(_.info256.VT(OpNode 
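Note on the Valignq/Valignd immediate fragments above: when a masked valignq is re-expressed as valignd, or either of them as vpalignr, the shift count has to be rescaled to the new element size, which is exactly the *2, *8 and *4 factors in the fragments. A minimal C++ sketch of that rescaling:

    #include <stdint.h>
    // Sketch of the immediate rewrites used by the valign mask-lowering patterns.
    static uint8_t ValignqToValigndImm(uint8_t Imm) { return Imm * 2; } // qwords -> dwords
    static uint8_t ValignqToPalignrImm(uint8_t Imm) { return Imm * 8; } // qwords -> bytes
    static uint8_t ValigndToPalignrImm(uint8_t Imm) { return Imm * 4; } // dwords -> bytes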
_.info256.RC:$src1)), + def : Pat<(_.info256.VT (OpNode (_.info256.VT _.info256.RC:$src1))), (EXTRACT_SUBREG (!cast(InstrStr # "Zrr") (INSERT_SUBREG(_.info512.VT(IMPLICIT_DEF)), @@ -10778,7 +10770,7 @@ multiclass avx512_unary_lowering; - def : Pat<(_.info128.VT(OpNode _.info128.RC:$src1)), + def : Pat<(_.info128.VT (OpNode (_.info128.VT _.info128.RC:$src1))), (EXTRACT_SUBREG (!cast(InstrStr # "Zrr") (INSERT_SUBREG(_.info512.VT(IMPLICIT_DEF)), @@ -10829,17 +10821,16 @@ defm VMOVSLDUP : avx512_replicate<0x12, "vmovsldup", X86Movsldup, // AVX-512 - MOVDDUP //===----------------------------------------------------------------------===// -multiclass avx512_movddup_128 opc, string OpcodeStr, SDNode OpNode, +multiclass avx512_movddup_128 opc, string OpcodeStr, X86FoldableSchedWrite sched, X86VectorVTInfo _> { let ExeDomain = _.ExeDomain in { defm rr : AVX512_maskable, EVEX, + (_.VT (X86VBroadcast (_.VT _.RC:$src)))>, EVEX, Sched<[sched]>; defm rm : AVX512_maskable, + (_.VT (_.BroadcastLdFrag addr:$src))>, EVEX, EVEX_CD8<_.EltSize, CD8VH>, Sched<[sched.Folded]>; } @@ -10853,7 +10844,7 @@ multiclass avx512_movddup_common opc, string OpcodeStr, SDNode OpNode, let Predicates = [HasAVX512, HasVLX] in { defm Z256 : avx512_unary_rm, EVEX_V256; - defm Z128 : avx512_movddup_128, EVEX_V128; } } @@ -10867,11 +10858,9 @@ multiclass avx512_movddup opc, string OpcodeStr, SDNode OpNode, defm VMOVDDUP : avx512_movddup<0x12, "vmovddup", X86Movddup, SchedWriteFShuffle>; let Predicates = [HasVLX] in { -def : Pat<(v2f64 (X86VBroadcast (loadf64 addr:$src))), - (VMOVDDUPZ128rm addr:$src)>; def : Pat<(v2f64 (X86VBroadcast f64:$src)), (VMOVDDUPZ128rr (v2f64 (COPY_TO_REGCLASS FR64X:$src, VR128X)))>; -def : Pat<(v2f64 (X86VBroadcast (v2f64 (nonvolatile_load addr:$src)))), +def : Pat<(v2f64 (X86VBroadcast (v2f64 (simple_load addr:$src)))), (VMOVDDUPZ128rm addr:$src)>; def : Pat<(v2f64 (X86VBroadcast (v2f64 (X86vzload64 addr:$src)))), (VMOVDDUPZ128rm addr:$src)>; @@ -10884,17 +10873,17 @@ def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast f64:$src)), immAllZerosV), (VMOVDDUPZ128rrkz VK2WM:$mask, (v2f64 (COPY_TO_REGCLASS FR64X:$src, VR128X)))>; -def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast (loadf64 addr:$src))), +def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcastld64 addr:$src)), (v2f64 VR128X:$src0)), (VMOVDDUPZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>; -def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast (loadf64 addr:$src))), +def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcastld64 addr:$src)), immAllZerosV), (VMOVDDUPZ128rmkz VK2WM:$mask, addr:$src)>; -def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast (v2f64 (nonvolatile_load addr:$src)))), +def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast (v2f64 (simple_load addr:$src)))), (v2f64 VR128X:$src0)), (VMOVDDUPZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>; -def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast (v2f64 (nonvolatile_load addr:$src)))), +def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast (v2f64 (simple_load addr:$src)))), immAllZerosV), (VMOVDDUPZ128rmkz VK2WM:$mask, addr:$src)>; } @@ -11070,14 +11059,14 @@ multiclass avx512_shift_packed opc, SDNode OpNode, Format MRMr, def rr : AVX512, + [(set _.RC:$dst,(_.VT (OpNode _.RC:$src1, (i8 timm:$src2))))]>, Sched<[sched]>; def rm : AVX512, + (i8 timm:$src2))))]>, Sched<[sched.Folded, sched.ReadAfterFold]>; } @@ -11104,6 +11093,7 @@ defm VPSRLDQ : avx512_shift_packed_all<0x73, X86vshrdq, MRM3r, MRM3m, "vpsrldq", multiclass 
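Note on the VMOVDDUP changes above: the 128-bit movddup patterns now go through X86VBroadcast/X86VBroadcastld64 and the simple_load predicate, so both a register splat of an f64 and a broadcast from memory (masked or not) select movddup. A tiny C++ sketch, assuming AVX-512VL codegen; the intrinsic is just one convenient way to produce the splat:

    #include <immintrin.h>
    // Sketch: duplicating one double across a 128-bit vector, which the
    // patterns above lower to vmovddup (%rdi), %xmm0 (optionally masked).
    __m128d dup_from_memory(const double *p) {
      return _mm_loaddup_pd(p);
    }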
avx512_psadbw_packed opc, SDNode OpNode, string OpcodeStr, X86FoldableSchedWrite sched, X86VectorVTInfo _dst, X86VectorVTInfo _src> { + let isCommutable = 1 in def rr : AVX512BIgetZExtValue(); // Swap bits 1/4 and 3/6. @@ -11151,7 +11141,7 @@ def VPTERNLOG321_imm8 : SDNodeXForm; -def VPTERNLOG213_imm8 : SDNodeXFormgetZExtValue(); // Swap bits 2/4 and 3/5. @@ -11162,7 +11152,7 @@ def VPTERNLOG213_imm8 : SDNodeXForm; -def VPTERNLOG132_imm8 : SDNodeXFormgetZExtValue(); // Swap bits 1/2 and 5/6. @@ -11173,7 +11163,7 @@ def VPTERNLOG132_imm8 : SDNodeXForm; -def VPTERNLOG231_imm8 : SDNodeXFormgetZExtValue(); // Move bits 1->2, 2->4, 3->6, 4->1, 5->3, 6->5 @@ -11186,7 +11176,7 @@ def VPTERNLOG231_imm8 : SDNodeXForm; -def VPTERNLOG312_imm8 : SDNodeXFormgetZExtValue(); // Move bits 1->4, 2->1, 3->5, 4->2, 5->6, 6->3 @@ -11210,7 +11200,7 @@ multiclass avx512_ternlog opc, string OpcodeStr, SDNode OpNode, (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2), (_.VT _.RC:$src3), - (i8 imm:$src4)), 1, 1>, + (i8 timm:$src4)), 1, 1>, AVX512AIi8Base, EVEX_4V, Sched<[sched]>; defm rmi : AVX512_maskable_3src opc, string OpcodeStr, SDNode OpNode, (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2), (_.VT (bitconvert (_.LdFrag addr:$src3))), - (i8 imm:$src4)), 1, 0>, + (i8 timm:$src4)), 1, 0>, AVX512AIi8Base, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>, Sched<[sched.Folded, sched.ReadAfterFold]>; defm rmbi : AVX512_maskable_3src opc, string OpcodeStr, SDNode OpNode, "$src2, ${src3}"##_.BroadcastStr##", $src4", (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2), - (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src3))), - (i8 imm:$src4)), 1, 0>, EVEX_B, + (_.VT (_.BroadcastLdFrag addr:$src3)), + (i8 timm:$src4)), 1, 0>, EVEX_B, AVX512AIi8Base, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>, Sched<[sched.Folded, sched.ReadAfterFold]>; }// Constraints = "$src1 = $dst" // Additional patterns for matching passthru operand in other positions. def : Pat<(_.VT (vselect _.KRCWM:$mask, - (OpNode _.RC:$src3, _.RC:$src2, _.RC:$src1, (i8 imm:$src4)), + (OpNode _.RC:$src3, _.RC:$src2, _.RC:$src1, (i8 timm:$src4)), _.RC:$src1)), (!cast(Name#_.ZSuffix#rrik) _.RC:$src1, _.KRCWM:$mask, - _.RC:$src2, _.RC:$src3, (VPTERNLOG321_imm8 imm:$src4))>; + _.RC:$src2, _.RC:$src3, (VPTERNLOG321_imm8 timm:$src4))>; def : Pat<(_.VT (vselect _.KRCWM:$mask, - (OpNode _.RC:$src2, _.RC:$src1, _.RC:$src3, (i8 imm:$src4)), + (OpNode _.RC:$src2, _.RC:$src1, _.RC:$src3, (i8 timm:$src4)), _.RC:$src1)), (!cast(Name#_.ZSuffix#rrik) _.RC:$src1, _.KRCWM:$mask, - _.RC:$src2, _.RC:$src3, (VPTERNLOG213_imm8 imm:$src4))>; + _.RC:$src2, _.RC:$src3, (VPTERNLOG213_imm8 timm:$src4))>; // Additional patterns for matching loads in other positions. def : Pat<(_.VT (OpNode (bitconvert (_.LdFrag addr:$src3)), - _.RC:$src2, _.RC:$src1, (i8 imm:$src4))), + _.RC:$src2, _.RC:$src1, (i8 timm:$src4))), (!cast(Name#_.ZSuffix#rmi) _.RC:$src1, _.RC:$src2, - addr:$src3, (VPTERNLOG321_imm8 imm:$src4))>; + addr:$src3, (VPTERNLOG321_imm8 timm:$src4))>; def : Pat<(_.VT (OpNode _.RC:$src1, (bitconvert (_.LdFrag addr:$src3)), - _.RC:$src2, (i8 imm:$src4))), + _.RC:$src2, (i8 timm:$src4))), (!cast(Name#_.ZSuffix#rmi) _.RC:$src1, _.RC:$src2, - addr:$src3, (VPTERNLOG132_imm8 imm:$src4))>; + addr:$src3, (VPTERNLOG132_imm8 timm:$src4))>; // Additional patterns for matching zero masking with loads in other // positions. 
def : Pat<(_.VT (vselect _.KRCWM:$mask, (OpNode (bitconvert (_.LdFrag addr:$src3)), - _.RC:$src2, _.RC:$src1, (i8 imm:$src4)), + _.RC:$src2, _.RC:$src1, (i8 timm:$src4)), _.ImmAllZerosV)), (!cast(Name#_.ZSuffix#rmikz) _.RC:$src1, _.KRCWM:$mask, - _.RC:$src2, addr:$src3, (VPTERNLOG321_imm8 imm:$src4))>; + _.RC:$src2, addr:$src3, (VPTERNLOG321_imm8 timm:$src4))>; def : Pat<(_.VT (vselect _.KRCWM:$mask, (OpNode _.RC:$src1, (bitconvert (_.LdFrag addr:$src3)), - _.RC:$src2, (i8 imm:$src4)), + _.RC:$src2, (i8 timm:$src4)), _.ImmAllZerosV)), (!cast(Name#_.ZSuffix#rmikz) _.RC:$src1, _.KRCWM:$mask, - _.RC:$src2, addr:$src3, (VPTERNLOG132_imm8 imm:$src4))>; + _.RC:$src2, addr:$src3, (VPTERNLOG132_imm8 timm:$src4))>; // Additional patterns for matching masked loads with different // operand orders. def : Pat<(_.VT (vselect _.KRCWM:$mask, (OpNode _.RC:$src1, (bitconvert (_.LdFrag addr:$src3)), - _.RC:$src2, (i8 imm:$src4)), + _.RC:$src2, (i8 timm:$src4)), _.RC:$src1)), (!cast(Name#_.ZSuffix#rmik) _.RC:$src1, _.KRCWM:$mask, - _.RC:$src2, addr:$src3, (VPTERNLOG132_imm8 imm:$src4))>; + _.RC:$src2, addr:$src3, (VPTERNLOG132_imm8 timm:$src4))>; def : Pat<(_.VT (vselect _.KRCWM:$mask, (OpNode (bitconvert (_.LdFrag addr:$src3)), - _.RC:$src2, _.RC:$src1, (i8 imm:$src4)), + _.RC:$src2, _.RC:$src1, (i8 timm:$src4)), _.RC:$src1)), (!cast(Name#_.ZSuffix#rmik) _.RC:$src1, _.KRCWM:$mask, - _.RC:$src2, addr:$src3, (VPTERNLOG321_imm8 imm:$src4))>; + _.RC:$src2, addr:$src3, (VPTERNLOG321_imm8 timm:$src4))>; def : Pat<(_.VT (vselect _.KRCWM:$mask, (OpNode _.RC:$src2, _.RC:$src1, - (bitconvert (_.LdFrag addr:$src3)), (i8 imm:$src4)), + (bitconvert (_.LdFrag addr:$src3)), (i8 timm:$src4)), _.RC:$src1)), (!cast(Name#_.ZSuffix#rmik) _.RC:$src1, _.KRCWM:$mask, - _.RC:$src2, addr:$src3, (VPTERNLOG213_imm8 imm:$src4))>; + _.RC:$src2, addr:$src3, (VPTERNLOG213_imm8 timm:$src4))>; def : Pat<(_.VT (vselect _.KRCWM:$mask, (OpNode _.RC:$src2, (bitconvert (_.LdFrag addr:$src3)), - _.RC:$src1, (i8 imm:$src4)), + _.RC:$src1, (i8 timm:$src4)), _.RC:$src1)), (!cast(Name#_.ZSuffix#rmik) _.RC:$src1, _.KRCWM:$mask, - _.RC:$src2, addr:$src3, (VPTERNLOG231_imm8 imm:$src4))>; + _.RC:$src2, addr:$src3, (VPTERNLOG231_imm8 timm:$src4))>; def : Pat<(_.VT (vselect _.KRCWM:$mask, (OpNode (bitconvert (_.LdFrag addr:$src3)), - _.RC:$src1, _.RC:$src2, (i8 imm:$src4)), + _.RC:$src1, _.RC:$src2, (i8 timm:$src4)), _.RC:$src1)), (!cast(Name#_.ZSuffix#rmik) _.RC:$src1, _.KRCWM:$mask, - _.RC:$src2, addr:$src3, (VPTERNLOG312_imm8 imm:$src4))>; + _.RC:$src2, addr:$src3, (VPTERNLOG312_imm8 timm:$src4))>; // Additional patterns for matching broadcasts in other positions. - def : Pat<(_.VT (OpNode (X86VBroadcast (_.ScalarLdFrag addr:$src3)), - _.RC:$src2, _.RC:$src1, (i8 imm:$src4))), + def : Pat<(_.VT (OpNode (_.BroadcastLdFrag addr:$src3), + _.RC:$src2, _.RC:$src1, (i8 timm:$src4))), (!cast(Name#_.ZSuffix#rmbi) _.RC:$src1, _.RC:$src2, - addr:$src3, (VPTERNLOG321_imm8 imm:$src4))>; + addr:$src3, (VPTERNLOG321_imm8 timm:$src4))>; def : Pat<(_.VT (OpNode _.RC:$src1, - (X86VBroadcast (_.ScalarLdFrag addr:$src3)), - _.RC:$src2, (i8 imm:$src4))), + (_.BroadcastLdFrag addr:$src3), + _.RC:$src2, (i8 timm:$src4))), (!cast(Name#_.ZSuffix#rmbi) _.RC:$src1, _.RC:$src2, - addr:$src3, (VPTERNLOG132_imm8 imm:$src4))>; + addr:$src3, (VPTERNLOG132_imm8 timm:$src4))>; // Additional patterns for matching zero masking with broadcasts in other // positions. 
def : Pat<(_.VT (vselect _.KRCWM:$mask, - (OpNode (X86VBroadcast (_.ScalarLdFrag addr:$src3)), - _.RC:$src2, _.RC:$src1, (i8 imm:$src4)), + (OpNode (_.BroadcastLdFrag addr:$src3), + _.RC:$src2, _.RC:$src1, (i8 timm:$src4)), _.ImmAllZerosV)), (!cast(Name#_.ZSuffix#rmbikz) _.RC:$src1, _.KRCWM:$mask, _.RC:$src2, addr:$src3, - (VPTERNLOG321_imm8 imm:$src4))>; + (VPTERNLOG321_imm8 timm:$src4))>; def : Pat<(_.VT (vselect _.KRCWM:$mask, (OpNode _.RC:$src1, - (X86VBroadcast (_.ScalarLdFrag addr:$src3)), - _.RC:$src2, (i8 imm:$src4)), + (_.BroadcastLdFrag addr:$src3), + _.RC:$src2, (i8 timm:$src4)), _.ImmAllZerosV)), (!cast(Name#_.ZSuffix#rmbikz) _.RC:$src1, _.KRCWM:$mask, _.RC:$src2, addr:$src3, - (VPTERNLOG132_imm8 imm:$src4))>; + (VPTERNLOG132_imm8 timm:$src4))>; // Additional patterns for matching masked broadcasts with different // operand orders. def : Pat<(_.VT (vselect _.KRCWM:$mask, - (OpNode _.RC:$src1, - (X86VBroadcast (_.ScalarLdFrag addr:$src3)), - _.RC:$src2, (i8 imm:$src4)), + (OpNode _.RC:$src1, (_.BroadcastLdFrag addr:$src3), + _.RC:$src2, (i8 timm:$src4)), _.RC:$src1)), (!cast(Name#_.ZSuffix#rmbik) _.RC:$src1, _.KRCWM:$mask, - _.RC:$src2, addr:$src3, (VPTERNLOG132_imm8 imm:$src4))>; + _.RC:$src2, addr:$src3, (VPTERNLOG132_imm8 timm:$src4))>; def : Pat<(_.VT (vselect _.KRCWM:$mask, - (OpNode (X86VBroadcast (_.ScalarLdFrag addr:$src3)), - _.RC:$src2, _.RC:$src1, (i8 imm:$src4)), + (OpNode (_.BroadcastLdFrag addr:$src3), + _.RC:$src2, _.RC:$src1, (i8 timm:$src4)), _.RC:$src1)), (!cast(Name#_.ZSuffix#rmbik) _.RC:$src1, _.KRCWM:$mask, - _.RC:$src2, addr:$src3, (VPTERNLOG321_imm8 imm:$src4))>; + _.RC:$src2, addr:$src3, (VPTERNLOG321_imm8 timm:$src4))>; def : Pat<(_.VT (vselect _.KRCWM:$mask, (OpNode _.RC:$src2, _.RC:$src1, - (X86VBroadcast (_.ScalarLdFrag addr:$src3)), - (i8 imm:$src4)), _.RC:$src1)), + (_.BroadcastLdFrag addr:$src3), + (i8 timm:$src4)), _.RC:$src1)), (!cast(Name#_.ZSuffix#rmbik) _.RC:$src1, _.KRCWM:$mask, - _.RC:$src2, addr:$src3, (VPTERNLOG213_imm8 imm:$src4))>; + _.RC:$src2, addr:$src3, (VPTERNLOG213_imm8 timm:$src4))>; def : Pat<(_.VT (vselect _.KRCWM:$mask, (OpNode _.RC:$src2, - (X86VBroadcast (_.ScalarLdFrag addr:$src3)), - _.RC:$src1, (i8 imm:$src4)), + (_.BroadcastLdFrag addr:$src3), + _.RC:$src1, (i8 timm:$src4)), _.RC:$src1)), (!cast(Name#_.ZSuffix#rmbik) _.RC:$src1, _.KRCWM:$mask, - _.RC:$src2, addr:$src3, (VPTERNLOG231_imm8 imm:$src4))>; + _.RC:$src2, addr:$src3, (VPTERNLOG231_imm8 timm:$src4))>; def : Pat<(_.VT (vselect _.KRCWM:$mask, - (OpNode (X86VBroadcast (_.ScalarLdFrag addr:$src3)), - _.RC:$src1, _.RC:$src2, (i8 imm:$src4)), + (OpNode (_.BroadcastLdFrag addr:$src3), + _.RC:$src1, _.RC:$src2, (i8 timm:$src4)), _.RC:$src1)), (!cast(Name#_.ZSuffix#rmbik) _.RC:$src1, _.KRCWM:$mask, - _.RC:$src2, addr:$src3, (VPTERNLOG312_imm8 imm:$src4))>; + _.RC:$src2, addr:$src3, (VPTERNLOG312_imm8 timm:$src4))>; } multiclass avx512_common_ternlog, VEX_W; +// Patterns to use VPTERNLOG for vXi16/vXi8 vectors. 
+let Predicates = [HasVLX] in { + def : Pat<(v16i8 (X86vpternlog VR128X:$src1, VR128X:$src2, VR128X:$src3, + (i8 timm:$src4))), + (VPTERNLOGQZ128rri VR128X:$src1, VR128X:$src2, VR128X:$src3, + timm:$src4)>; + def : Pat<(v16i8 (X86vpternlog VR128X:$src1, VR128X:$src2, + (loadv16i8 addr:$src3), (i8 timm:$src4))), + (VPTERNLOGQZ128rmi VR128X:$src1, VR128X:$src2, addr:$src3, + timm:$src4)>; + def : Pat<(v16i8 (X86vpternlog (loadv16i8 addr:$src3), VR128X:$src2, + VR128X:$src1, (i8 timm:$src4))), + (VPTERNLOGQZ128rmi VR128X:$src1, VR128X:$src2, addr:$src3, + (VPTERNLOG321_imm8 timm:$src4))>; + def : Pat<(v16i8 (X86vpternlog VR128X:$src1, (loadv16i8 addr:$src3), + VR128X:$src2, (i8 timm:$src4))), + (VPTERNLOGQZ128rmi VR128X:$src1, VR128X:$src2, addr:$src3, + (VPTERNLOG132_imm8 timm:$src4))>; + + def : Pat<(v8i16 (X86vpternlog VR128X:$src1, VR128X:$src2, VR128X:$src3, + (i8 timm:$src4))), + (VPTERNLOGQZ128rri VR128X:$src1, VR128X:$src2, VR128X:$src3, + timm:$src4)>; + def : Pat<(v8i16 (X86vpternlog VR128X:$src1, VR128X:$src2, + (loadv8i16 addr:$src3), (i8 timm:$src4))), + (VPTERNLOGQZ128rmi VR128X:$src1, VR128X:$src2, addr:$src3, + timm:$src4)>; + def : Pat<(v8i16 (X86vpternlog (loadv8i16 addr:$src3), VR128X:$src2, + VR128X:$src1, (i8 timm:$src4))), + (VPTERNLOGQZ128rmi VR128X:$src1, VR128X:$src2, addr:$src3, + (VPTERNLOG321_imm8 timm:$src4))>; + def : Pat<(v8i16 (X86vpternlog VR128X:$src1, (loadv8i16 addr:$src3), + VR128X:$src2, (i8 timm:$src4))), + (VPTERNLOGQZ128rmi VR128X:$src1, VR128X:$src2, addr:$src3, + (VPTERNLOG132_imm8 timm:$src4))>; + + def : Pat<(v32i8 (X86vpternlog VR256X:$src1, VR256X:$src2, VR256X:$src3, + (i8 timm:$src4))), + (VPTERNLOGQZ256rri VR256X:$src1, VR256X:$src2, VR256X:$src3, + timm:$src4)>; + def : Pat<(v32i8 (X86vpternlog VR256X:$src1, VR256X:$src2, + (loadv32i8 addr:$src3), (i8 timm:$src4))), + (VPTERNLOGQZ256rmi VR256X:$src1, VR256X:$src2, addr:$src3, + timm:$src4)>; + def : Pat<(v32i8 (X86vpternlog (loadv32i8 addr:$src3), VR256X:$src2, + VR256X:$src1, (i8 timm:$src4))), + (VPTERNLOGQZ256rmi VR256X:$src1, VR256X:$src2, addr:$src3, + (VPTERNLOG321_imm8 timm:$src4))>; + def : Pat<(v32i8 (X86vpternlog VR256X:$src1, (loadv32i8 addr:$src3), + VR256X:$src2, (i8 timm:$src4))), + (VPTERNLOGQZ256rmi VR256X:$src1, VR256X:$src2, addr:$src3, + (VPTERNLOG132_imm8 timm:$src4))>; + + def : Pat<(v16i16 (X86vpternlog VR256X:$src1, VR256X:$src2, VR256X:$src3, + (i8 timm:$src4))), + (VPTERNLOGQZ256rri VR256X:$src1, VR256X:$src2, VR256X:$src3, + timm:$src4)>; + def : Pat<(v16i16 (X86vpternlog VR256X:$src1, VR256X:$src2, + (loadv16i16 addr:$src3), (i8 timm:$src4))), + (VPTERNLOGQZ256rmi VR256X:$src1, VR256X:$src2, addr:$src3, + timm:$src4)>; + def : Pat<(v16i16 (X86vpternlog (loadv16i16 addr:$src3), VR256X:$src2, + VR256X:$src1, (i8 timm:$src4))), + (VPTERNLOGQZ256rmi VR256X:$src1, VR256X:$src2, addr:$src3, + (VPTERNLOG321_imm8 timm:$src4))>; + def : Pat<(v16i16 (X86vpternlog VR256X:$src1, (loadv16i16 addr:$src3), + VR256X:$src2, (i8 timm:$src4))), + (VPTERNLOGQZ256rmi VR256X:$src1, VR256X:$src2, addr:$src3, + (VPTERNLOG132_imm8 timm:$src4))>; +} + +let Predicates = [HasAVX512] in { + def : Pat<(v64i8 (X86vpternlog VR512:$src1, VR512:$src2, VR512:$src3, + (i8 timm:$src4))), + (VPTERNLOGQZrri VR512:$src1, VR512:$src2, VR512:$src3, + timm:$src4)>; + def : Pat<(v64i8 (X86vpternlog VR512:$src1, VR512:$src2, + (loadv64i8 addr:$src3), (i8 timm:$src4))), + (VPTERNLOGQZrmi VR512:$src1, VR512:$src2, addr:$src3, + timm:$src4)>; + def : Pat<(v64i8 (X86vpternlog (loadv64i8 addr:$src3), VR512:$src2, + 
VR512:$src1, (i8 timm:$src4))), + (VPTERNLOGQZrmi VR512:$src1, VR512:$src2, addr:$src3, + (VPTERNLOG321_imm8 timm:$src4))>; + def : Pat<(v64i8 (X86vpternlog VR512:$src1, (loadv64i8 addr:$src3), + VR512:$src2, (i8 timm:$src4))), + (VPTERNLOGQZrmi VR512:$src1, VR512:$src2, addr:$src3, + (VPTERNLOG132_imm8 timm:$src4))>; + + def : Pat<(v32i16 (X86vpternlog VR512:$src1, VR512:$src2, VR512:$src3, + (i8 timm:$src4))), + (VPTERNLOGQZrri VR512:$src1, VR512:$src2, VR512:$src3, + timm:$src4)>; + def : Pat<(v32i16 (X86vpternlog VR512:$src1, VR512:$src2, + (loadv32i16 addr:$src3), (i8 timm:$src4))), + (VPTERNLOGQZrmi VR512:$src1, VR512:$src2, addr:$src3, + timm:$src4)>; + def : Pat<(v32i16 (X86vpternlog (loadv32i16 addr:$src3), VR512:$src2, + VR512:$src1, (i8 timm:$src4))), + (VPTERNLOGQZrmi VR512:$src1, VR512:$src2, addr:$src3, + (VPTERNLOG321_imm8 timm:$src4))>; + def : Pat<(v32i16 (X86vpternlog VR512:$src1, (loadv32i16 addr:$src3), + VR512:$src2, (i8 timm:$src4))), + (VPTERNLOGQZrmi VR512:$src1, VR512:$src2, addr:$src3, + (VPTERNLOG132_imm8 timm:$src4))>; +} + // Patterns to implement vnot using vpternlog instead of creating all ones // using pcmpeq or vpternlog and then xoring with that. The value 15 is chosen // so that the result is only dependent on src0. But we use the same source @@ -11498,14 +11594,14 @@ multiclass avx512_fixupimm_packed opc, string OpcodeStr, (X86VFixupimm (_.VT _.RC:$src1), (_.VT _.RC:$src2), (TblVT.VT _.RC:$src3), - (i32 imm:$src4))>, Sched<[sched]>; + (i32 timm:$src4))>, Sched<[sched]>; defm rmi : AVX512_maskable_3src, + (i32 timm:$src4))>, Sched<[sched.Folded, sched.ReadAfterFold]>; defm rmbi : AVX512_maskable_3src opc, string OpcodeStr, "$src2, ${src3}"##_.BroadcastStr##", $src4", (X86VFixupimm (_.VT _.RC:$src1), (_.VT _.RC:$src2), - (TblVT.VT (X86VBroadcast(TblVT.ScalarLdFrag addr:$src3))), - (i32 imm:$src4))>, + (TblVT.VT (TblVT.BroadcastLdFrag addr:$src3)), + (i32 timm:$src4))>, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; } // Constraints = "$src1 = $dst" } @@ -11531,7 +11627,7 @@ let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in { (X86VFixupimmSAE (_.VT _.RC:$src1), (_.VT _.RC:$src2), (TblVT.VT _.RC:$src3), - (i32 imm:$src4))>, + (i32 timm:$src4))>, EVEX_B, Sched<[sched]>; } } @@ -11547,7 +11643,7 @@ multiclass avx512_fixupimm_scalar opc, string OpcodeStr, (X86VFixupimms (_.VT _.RC:$src1), (_.VT _.RC:$src2), (_src3VT.VT _src3VT.RC:$src3), - (i32 imm:$src4))>, Sched<[sched]>; + (i32 timm:$src4))>, Sched<[sched]>; defm rrib : AVX512_maskable_3src_scalar opc, string OpcodeStr, (X86VFixupimmSAEs (_.VT _.RC:$src1), (_.VT _.RC:$src2), (_src3VT.VT _src3VT.RC:$src3), - (i32 imm:$src4))>, + (i32 timm:$src4))>, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; defm rmi : AVX512_maskable_3src_scalar opc, string OpcodeStr, (_.VT _.RC:$src2), (_src3VT.VT (scalar_to_vector (_src3VT.ScalarLdFrag addr:$src3))), - (i32 imm:$src4))>, + (i32 timm:$src4))>, Sched<[sched.Folded, sched.ReadAfterFold]>; } } multiclass avx512_fixupimm_packed_all { let Predicates = [HasAVX512] in defm Z : avx512_fixupimm_packed_sae<0x54, "vfixupimm", sched.ZMM, @@ -11804,7 +11900,7 @@ multiclass VBMI2_shift_var_rmb Op, string OpStr, SDNode OpNode, "${src3}"##VTI.BroadcastStr##", $src2", "$src2, ${src3}"##VTI.BroadcastStr, (OpNode VTI.RC:$src1, VTI.RC:$src2, - (VTI.VT (X86VBroadcast (VTI.ScalarLdFrag addr:$src3))))>, + (VTI.VT (VTI.BroadcastLdFrag addr:$src3)))>, AVX512FMA3Base, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; } @@ -11880,12 +11976,14 @@ defm VPEXPANDW : 
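Note on the VPTERNLOG*_imm8 transforms used throughout the ternlog patterns above: they permute the 8-bit truth-table immediate whenever the instruction operands are reordered. For the "321" order (operands 0 and 2 swapped) the table entries at indices 1/4 and 3/6 are exchanged, as the in-tree comment says. A minimal C++ sketch of that remapping, assuming the usual mapping of operand 0 to the high index bit:

    #include <stdint.h>
    // Sketch: rewrite a ternary-logic immediate after swapping operands 0 and 2.
    // Index bits 2 and 0 swap, so table entries 1<->4 and 3<->6 are exchanged.
    static uint8_t SwapTernlogOps0And2(uint8_t Imm) {
      uint8_t New = Imm & 0xA5;            // entries 0, 2, 5, 7 are unaffected
      if (Imm & (1 << 1)) New |= 1 << 4;
      if (Imm & (1 << 4)) New |= 1 << 1;
      if (Imm & (1 << 3)) New |= 1 << 6;
      if (Imm & (1 << 6)) New |= 1 << 3;
      return New;
    }

Under the same encoding, immediate 15 (0x0F) reads as "NOT of operand 0", which is why the vnot-via-vpternlog patterns mentioned above can use a value that depends only on src0.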
expand_by_elt_width <0x62, "vpexpandw", WriteVarShuffle256, let Constraints = "$src1 = $dst" in multiclass VNNI_rmb Op, string OpStr, SDNode OpNode, - X86FoldableSchedWrite sched, X86VectorVTInfo VTI> { + X86FoldableSchedWrite sched, X86VectorVTInfo VTI, + bit IsCommutable> { defm r : AVX512_maskable_3src, + VTI.RC:$src2, VTI.RC:$src3)), + IsCommutable, IsCommutable>, EVEX_4V, T8PD, Sched<[sched]>; defm m : AVX512_maskable_3src Op, string OpStr, SDNode OpNode, OpStr, "${src3}"##VTI.BroadcastStr##", $src2", "$src2, ${src3}"##VTI.BroadcastStr, (OpNode VTI.RC:$src1, VTI.RC:$src2, - (VTI.VT (X86VBroadcast - (VTI.ScalarLdFrag addr:$src3))))>, + (VTI.VT (VTI.BroadcastLdFrag addr:$src3)))>, EVEX_4V, EVEX_CD8<32, CD8VF>, EVEX_B, T8PD, Sched<[sched.Folded, sched.ReadAfterFold]>; } multiclass VNNI_common Op, string OpStr, SDNode OpNode, - X86SchedWriteWidths sched> { + X86SchedWriteWidths sched, bit IsCommutable> { let Predicates = [HasVNNI] in - defm Z : VNNI_rmb, EVEX_V512; + defm Z : VNNI_rmb, EVEX_V512; let Predicates = [HasVNNI, HasVLX] in { - defm Z256 : VNNI_rmb, EVEX_V256; - defm Z128 : VNNI_rmb, EVEX_V128; + defm Z256 : VNNI_rmb, EVEX_V256; + defm Z128 : VNNI_rmb, EVEX_V128; } } // FIXME: Is there a better scheduler class for VPDP? -defm VPDPBUSD : VNNI_common<0x50, "vpdpbusd", X86Vpdpbusd, SchedWriteVecIMul>; -defm VPDPBUSDS : VNNI_common<0x51, "vpdpbusds", X86Vpdpbusds, SchedWriteVecIMul>; -defm VPDPWSSD : VNNI_common<0x52, "vpdpwssd", X86Vpdpwssd, SchedWriteVecIMul>; -defm VPDPWSSDS : VNNI_common<0x53, "vpdpwssds", X86Vpdpwssds, SchedWriteVecIMul>; +defm VPDPBUSD : VNNI_common<0x50, "vpdpbusd", X86Vpdpbusd, SchedWriteVecIMul, 0>; +defm VPDPBUSDS : VNNI_common<0x51, "vpdpbusds", X86Vpdpbusds, SchedWriteVecIMul, 0>; +defm VPDPWSSD : VNNI_common<0x52, "vpdpwssd", X86Vpdpwssd, SchedWriteVecIMul, 1>; +defm VPDPWSSDS : VNNI_common<0x53, "vpdpwssds", X86Vpdpwssds, SchedWriteVecIMul, 1>; + +def X86vpmaddwd_su : PatFrag<(ops node:$lhs, node:$rhs), + (X86vpmaddwd node:$lhs, node:$rhs), [{ + return N->hasOneUse(); +}]>; + +// Patterns to match VPDPWSSD from existing instructions/intrinsics. 
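Note on the VPDPWSSD patterns introduced by the comment above (the patterns themselves follow below): with AVX512_VNNI, an integer accumulate of a vpmaddwd result is matched back into a single vpdpwssd, but only when the vpmaddwd has no other users, which is what the hasOneUse check in X86vpmaddwd_su enforces. A C++ sketch using AVX-512BW intrinsics; whether the fusion fires for this exact source is an assumption:

    #include <immintrin.h>
    // Sketch: vpmaddwd followed by vpaddd on the same (single-use) product can
    // be combined into one vpdpwssd by the new patterns.
    __m512i dot_accumulate(__m512i acc, __m512i a, __m512i b) {
      __m512i prod = _mm512_madd_epi16(a, b);  // vpmaddwd
      return _mm512_add_epi32(acc, prod);      // vpaddd -> vpdpwssd
    }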
+let Predicates = [HasVNNI] in { + def : Pat<(v16i32 (add VR512:$src1, + (X86vpmaddwd_su VR512:$src2, VR512:$src3))), + (VPDPWSSDZr VR512:$src1, VR512:$src2, VR512:$src3)>; + def : Pat<(v16i32 (add VR512:$src1, + (X86vpmaddwd_su VR512:$src2, (load addr:$src3)))), + (VPDPWSSDZm VR512:$src1, VR512:$src2, addr:$src3)>; +} +let Predicates = [HasVNNI,HasVLX] in { + def : Pat<(v8i32 (add VR256X:$src1, + (X86vpmaddwd_su VR256X:$src2, VR256X:$src3))), + (VPDPWSSDZ256r VR256X:$src1, VR256X:$src2, VR256X:$src3)>; + def : Pat<(v8i32 (add VR256X:$src1, + (X86vpmaddwd_su VR256X:$src2, (load addr:$src3)))), + (VPDPWSSDZ256m VR256X:$src1, VR256X:$src2, addr:$src3)>; + def : Pat<(v4i32 (add VR128X:$src1, + (X86vpmaddwd_su VR128X:$src2, VR128X:$src3))), + (VPDPWSSDZ128r VR128X:$src1, VR128X:$src2, VR128X:$src3)>; + def : Pat<(v4i32 (add VR128X:$src1, + (X86vpmaddwd_su VR128X:$src2, (load addr:$src3)))), + (VPDPWSSDZ128m VR128X:$src1, VR128X:$src2, addr:$src3)>; +} //===----------------------------------------------------------------------===// // Bit Algorithms @@ -12004,8 +12133,8 @@ multiclass GF2P8AFFINE_avx512_rmb_imm Op, string OpStr, SDNode OpNode, OpStr, "$src3, ${src2}"##BcstVTI.BroadcastStr##", $src1", "$src1, ${src2}"##BcstVTI.BroadcastStr##", $src3", (OpNode (VTI.VT VTI.RC:$src1), - (bitconvert (BcstVTI.VT (X86VBroadcast (loadi64 addr:$src2)))), - (i8 imm:$src3))>, EVEX_B, + (bitconvert (BcstVTI.VT (X86VBroadcastld64 addr:$src2))), + (i8 timm:$src3))>, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; } @@ -12116,7 +12245,7 @@ multiclass avx512_vp2intersect_modes { !strconcat("vp2intersect", _.Suffix, "\t{${src2}", _.BroadcastStr, ", $src1, $dst|$dst, $src1, ${src2}", _.BroadcastStr ,"}"), [(set _.KRPC:$dst, (X86vp2intersect - _.RC:$src1, (_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src2)))))]>, + _.RC:$src1, (_.VT (_.BroadcastLdFrag addr:$src2))))]>, EVEX_4V, T8XD, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>; } @@ -12217,12 +12346,12 @@ let Predicates = [HasBF16, HasVLX] in { (VCVTNEPS2BF16Z128rmkz VK4WM:$mask, addr:$src)>; def : Pat<(v8i16 (X86cvtneps2bf16 (v4f32 - (X86VBroadcast (loadf32 addr:$src))))), + (X86VBroadcastld32 addr:$src)))), (VCVTNEPS2BF16Z128rmb addr:$src)>; - def : Pat<(X86mcvtneps2bf16 (v4f32 (X86VBroadcast (loadf32 addr:$src))), + def : Pat<(X86mcvtneps2bf16 (v4f32 (X86VBroadcastld32 addr:$src)), (v8i16 VR128X:$src0), VK4WM:$mask), (VCVTNEPS2BF16Z128rmbk VR128X:$src0, VK4WM:$mask, addr:$src)>; - def : Pat<(X86mcvtneps2bf16 (v4f32 (X86VBroadcast (loadf32 addr:$src))), + def : Pat<(X86mcvtneps2bf16 (v4f32 (X86VBroadcastld32 addr:$src)), v8i16x_info.ImmAllZerosV, VK4WM:$mask), (VCVTNEPS2BF16Z128rmbkz VK4WM:$mask, addr:$src)>; } @@ -12249,7 +12378,7 @@ multiclass avx512_dpbf16ps_rm opc, string OpcodeStr, SDNode OpNode, !strconcat("${src3}", _.BroadcastStr,", $src2"), !strconcat("$src2, ${src3}", _.BroadcastStr), (_.VT (OpNode _.RC:$src1, _.RC:$src2, - (src_v.VT (X86VBroadcast(src_v.ScalarLdFrag addr:$src3)))))>, + (src_v.VT (src_v.BroadcastLdFrag addr:$src3))))>, EVEX_B, EVEX_4V; } diff --git a/lib/Target/X86/X86InstrArithmetic.td b/lib/Target/X86/X86InstrArithmetic.td index e52635f8d48b..1e399a894490 100644 --- a/lib/Target/X86/X86InstrArithmetic.td +++ b/lib/Target/X86/X86InstrArithmetic.td @@ -1271,22 +1271,22 @@ let isCompare = 1 in { // ANDN Instruction // multiclass bmi_andn { + PatFrag ld_frag, X86FoldableSchedWrite sched> { def rr : I<0xF2, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set RC:$dst, 
EFLAGS, (X86and_flag (not RC:$src1), RC:$src2))]>, - Sched<[WriteALU]>; + Sched<[sched]>; def rm : I<0xF2, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2), !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set RC:$dst, EFLAGS, (X86and_flag (not RC:$src1), (ld_frag addr:$src2)))]>, - Sched<[WriteALU.Folded, WriteALU.ReadAfterFold]>; + Sched<[sched.Folded, sched.ReadAfterFold]>; } // Complexity is reduced to give and with immediate a chance to match first. let Predicates = [HasBMI], Defs = [EFLAGS], AddedComplexity = -6 in { - defm ANDN32 : bmi_andn<"andn{l}", GR32, i32mem, loadi32>, T8PS, VEX_4V; - defm ANDN64 : bmi_andn<"andn{q}", GR64, i64mem, loadi64>, T8PS, VEX_4V, VEX_W; + defm ANDN32 : bmi_andn<"andn{l}", GR32, i32mem, loadi32, WriteALU>, T8PS, VEX_4V; + defm ANDN64 : bmi_andn<"andn{q}", GR64, i64mem, loadi64, WriteALU>, T8PS, VEX_4V, VEX_W; } let Predicates = [HasBMI], AddedComplexity = -6 in { diff --git a/lib/Target/X86/X86InstrBuilder.h b/lib/Target/X86/X86InstrBuilder.h index 50aed98112c3..aa45e9b191c1 100644 --- a/lib/Target/X86/X86InstrBuilder.h +++ b/lib/Target/X86/X86InstrBuilder.h @@ -131,11 +131,11 @@ addDirectMem(const MachineInstrBuilder &MIB, unsigned Reg) { /// reference. static inline void setDirectAddressInInstr(MachineInstr *MI, unsigned Operand, unsigned Reg) { - // Direct memory address is in a form of: Reg, 1 (Scale), NoReg, 0, NoReg. - MI->getOperand(Operand).setReg(Reg); + // Direct memory address is in a form of: Reg/FI, 1 (Scale), NoReg, 0, NoReg. + MI->getOperand(Operand).ChangeToRegister(Reg, /*isDef=*/false); MI->getOperand(Operand + 1).setImm(1); MI->getOperand(Operand + 2).setReg(0); - MI->getOperand(Operand + 3).setImm(0); + MI->getOperand(Operand + 3).ChangeToImmediate(0); MI->getOperand(Operand + 4).setReg(0); } diff --git a/lib/Target/X86/X86InstrCMovSetCC.td b/lib/Target/X86/X86InstrCMovSetCC.td index 099f6aa8d8bb..330b8c7a8a43 100644 --- a/lib/Target/X86/X86InstrCMovSetCC.td +++ b/lib/Target/X86/X86InstrCMovSetCC.td @@ -20,19 +20,19 @@ let Uses = [EFLAGS], Predicates = [HasCMov], Constraints = "$src1 = $dst", : I<0x40, MRMSrcRegCC, (outs GR16:$dst), (ins GR16:$src1, GR16:$src2, ccode:$cond), "cmov${cond}{w}\t{$src2, $dst|$dst, $src2}", [(set GR16:$dst, - (X86cmov GR16:$src1, GR16:$src2, imm:$cond, EFLAGS))]>, + (X86cmov GR16:$src1, GR16:$src2, timm:$cond, EFLAGS))]>, TB, OpSize16; def CMOV32rr : I<0x40, MRMSrcRegCC, (outs GR32:$dst), (ins GR32:$src1, GR32:$src2, ccode:$cond), "cmov${cond}{l}\t{$src2, $dst|$dst, $src2}", [(set GR32:$dst, - (X86cmov GR32:$src1, GR32:$src2, imm:$cond, EFLAGS))]>, + (X86cmov GR32:$src1, GR32:$src2, timm:$cond, EFLAGS))]>, TB, OpSize32; def CMOV64rr :RI<0x40, MRMSrcRegCC, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2, ccode:$cond), "cmov${cond}{q}\t{$src2, $dst|$dst, $src2}", [(set GR64:$dst, - (X86cmov GR64:$src1, GR64:$src2, imm:$cond, EFLAGS))]>, TB; + (X86cmov GR64:$src1, GR64:$src2, timm:$cond, EFLAGS))]>, TB; } let Uses = [EFLAGS], Predicates = [HasCMov], Constraints = "$src1 = $dst", @@ -41,29 +41,46 @@ let Uses = [EFLAGS], Predicates = [HasCMov], Constraints = "$src1 = $dst", : I<0x40, MRMSrcMemCC, (outs GR16:$dst), (ins GR16:$src1, i16mem:$src2, ccode:$cond), "cmov${cond}{w}\t{$src2, $dst|$dst, $src2}", [(set GR16:$dst, (X86cmov GR16:$src1, (loadi16 addr:$src2), - imm:$cond, EFLAGS))]>, TB, OpSize16; + timm:$cond, EFLAGS))]>, TB, OpSize16; def CMOV32rm : I<0x40, MRMSrcMemCC, (outs GR32:$dst), (ins GR32:$src1, i32mem:$src2, ccode:$cond), "cmov${cond}{l}\t{$src2, $dst|$dst, $src2}", 
[(set GR32:$dst, (X86cmov GR32:$src1, (loadi32 addr:$src2), - imm:$cond, EFLAGS))]>, TB, OpSize32; + timm:$cond, EFLAGS))]>, TB, OpSize32; def CMOV64rm :RI<0x40, MRMSrcMemCC, (outs GR64:$dst), (ins GR64:$src1, i64mem:$src2, ccode:$cond), "cmov${cond}{q}\t{$src2, $dst|$dst, $src2}", [(set GR64:$dst, (X86cmov GR64:$src1, (loadi64 addr:$src2), - imm:$cond, EFLAGS))]>, TB; + timm:$cond, EFLAGS))]>, TB; } // Uses = [EFLAGS], Predicates = [HasCMov], Constraints = "$src1 = $dst" } // isCodeGenOnly = 1, ForceDisassemble = 1 +def inv_cond_XFORM : SDNodeXForm(N->getZExtValue()); + return CurDAG->getTargetConstant(X86::GetOppositeBranchCondition(CC), + SDLoc(N), MVT::i8); +}]>; + +// Conditional moves with folded loads with operands swapped and conditions +// inverted. +let Predicates = [HasCMov] in { + def : Pat<(X86cmov (loadi16 addr:$src1), GR16:$src2, timm:$cond, EFLAGS), + (CMOV16rm GR16:$src2, addr:$src1, (inv_cond_XFORM timm:$cond))>; + def : Pat<(X86cmov (loadi32 addr:$src1), GR32:$src2, timm:$cond, EFLAGS), + (CMOV32rm GR32:$src2, addr:$src1, (inv_cond_XFORM timm:$cond))>; + def : Pat<(X86cmov (loadi64 addr:$src1), GR64:$src2, timm:$cond, EFLAGS), + (CMOV64rm GR64:$src2, addr:$src1, (inv_cond_XFORM timm:$cond))>; +} + // SetCC instructions. let Uses = [EFLAGS], isCodeGenOnly = 1, ForceDisassemble = 1 in { def SETCCr : I<0x90, MRMXrCC, (outs GR8:$dst), (ins ccode:$cond), "set${cond}\t$dst", - [(set GR8:$dst, (X86setcc imm:$cond, EFLAGS))]>, + [(set GR8:$dst, (X86setcc timm:$cond, EFLAGS))]>, TB, Sched<[WriteSETCC]>; def SETCCm : I<0x90, MRMXmCC, (outs), (ins i8mem:$dst, ccode:$cond), "set${cond}\t$dst", - [(store (X86setcc imm:$cond, EFLAGS), addr:$dst)]>, + [(store (X86setcc timm:$cond, EFLAGS), addr:$dst)]>, TB, Sched<[WriteSETCCStore]>; } // Uses = [EFLAGS] diff --git a/lib/Target/X86/X86InstrCompiler.td b/lib/Target/X86/X86InstrCompiler.td index efaccdc9ee96..78d8dd3c0d03 100644 --- a/lib/Target/X86/X86InstrCompiler.td +++ b/lib/Target/X86/X86InstrCompiler.td @@ -542,7 +542,7 @@ multiclass CMOVrr_PSEUDO { def CMOV#NAME : I<0, Pseudo, (outs RC:$dst), (ins RC:$t, RC:$f, i8imm:$cond), "#CMOV_"#NAME#" PSEUDO!", - [(set RC:$dst, (VT (X86cmov RC:$t, RC:$f, imm:$cond, + [(set RC:$dst, (VT (X86cmov RC:$t, RC:$f, timm:$cond, EFLAGS)))]>; } @@ -593,66 +593,66 @@ let usesCustomInserter = 1, hasNoSchedulingInfo = 1, Uses = [EFLAGS] in { defm _VK64 : CMOVrr_PSEUDO; } // usesCustomInserter = 1, hasNoSchedulingInfo = 1, Uses = [EFLAGS] -def : Pat<(f128 (X86cmov VR128:$t, VR128:$f, imm:$cond, EFLAGS)), - (CMOV_VR128 VR128:$t, VR128:$f, imm:$cond)>; +def : Pat<(f128 (X86cmov VR128:$t, VR128:$f, timm:$cond, EFLAGS)), + (CMOV_VR128 VR128:$t, VR128:$f, timm:$cond)>; let Predicates = [NoVLX] in { - def : Pat<(v16i8 (X86cmov VR128:$t, VR128:$f, imm:$cond, EFLAGS)), - (CMOV_VR128 VR128:$t, VR128:$f, imm:$cond)>; - def : Pat<(v8i16 (X86cmov VR128:$t, VR128:$f, imm:$cond, EFLAGS)), - (CMOV_VR128 VR128:$t, VR128:$f, imm:$cond)>; - def : Pat<(v4i32 (X86cmov VR128:$t, VR128:$f, imm:$cond, EFLAGS)), - (CMOV_VR128 VR128:$t, VR128:$f, imm:$cond)>; - def : Pat<(v4f32 (X86cmov VR128:$t, VR128:$f, imm:$cond, EFLAGS)), - (CMOV_VR128 VR128:$t, VR128:$f, imm:$cond)>; - def : Pat<(v2f64 (X86cmov VR128:$t, VR128:$f, imm:$cond, EFLAGS)), - (CMOV_VR128 VR128:$t, VR128:$f, imm:$cond)>; - - def : Pat<(v32i8 (X86cmov VR256:$t, VR256:$f, imm:$cond, EFLAGS)), - (CMOV_VR256 VR256:$t, VR256:$f, imm:$cond)>; - def : Pat<(v16i16 (X86cmov VR256:$t, VR256:$f, imm:$cond, EFLAGS)), - (CMOV_VR256 VR256:$t, VR256:$f, imm:$cond)>; - def : 
Pat<(v8i32 (X86cmov VR256:$t, VR256:$f, imm:$cond, EFLAGS)), - (CMOV_VR256 VR256:$t, VR256:$f, imm:$cond)>; - def : Pat<(v8f32 (X86cmov VR256:$t, VR256:$f, imm:$cond, EFLAGS)), - (CMOV_VR256 VR256:$t, VR256:$f, imm:$cond)>; - def : Pat<(v4f64 (X86cmov VR256:$t, VR256:$f, imm:$cond, EFLAGS)), - (CMOV_VR256 VR256:$t, VR256:$f, imm:$cond)>; + def : Pat<(v16i8 (X86cmov VR128:$t, VR128:$f, timm:$cond, EFLAGS)), + (CMOV_VR128 VR128:$t, VR128:$f, timm:$cond)>; + def : Pat<(v8i16 (X86cmov VR128:$t, VR128:$f, timm:$cond, EFLAGS)), + (CMOV_VR128 VR128:$t, VR128:$f, timm:$cond)>; + def : Pat<(v4i32 (X86cmov VR128:$t, VR128:$f, timm:$cond, EFLAGS)), + (CMOV_VR128 VR128:$t, VR128:$f, timm:$cond)>; + def : Pat<(v4f32 (X86cmov VR128:$t, VR128:$f, timm:$cond, EFLAGS)), + (CMOV_VR128 VR128:$t, VR128:$f, timm:$cond)>; + def : Pat<(v2f64 (X86cmov VR128:$t, VR128:$f, timm:$cond, EFLAGS)), + (CMOV_VR128 VR128:$t, VR128:$f, timm:$cond)>; + + def : Pat<(v32i8 (X86cmov VR256:$t, VR256:$f, timm:$cond, EFLAGS)), + (CMOV_VR256 VR256:$t, VR256:$f, timm:$cond)>; + def : Pat<(v16i16 (X86cmov VR256:$t, VR256:$f, timm:$cond, EFLAGS)), + (CMOV_VR256 VR256:$t, VR256:$f, timm:$cond)>; + def : Pat<(v8i32 (X86cmov VR256:$t, VR256:$f, timm:$cond, EFLAGS)), + (CMOV_VR256 VR256:$t, VR256:$f, timm:$cond)>; + def : Pat<(v8f32 (X86cmov VR256:$t, VR256:$f, timm:$cond, EFLAGS)), + (CMOV_VR256 VR256:$t, VR256:$f, timm:$cond)>; + def : Pat<(v4f64 (X86cmov VR256:$t, VR256:$f, timm:$cond, EFLAGS)), + (CMOV_VR256 VR256:$t, VR256:$f, timm:$cond)>; } let Predicates = [HasVLX] in { - def : Pat<(v16i8 (X86cmov VR128X:$t, VR128X:$f, imm:$cond, EFLAGS)), - (CMOV_VR128X VR128X:$t, VR128X:$f, imm:$cond)>; - def : Pat<(v8i16 (X86cmov VR128X:$t, VR128X:$f, imm:$cond, EFLAGS)), - (CMOV_VR128X VR128X:$t, VR128X:$f, imm:$cond)>; - def : Pat<(v4i32 (X86cmov VR128X:$t, VR128X:$f, imm:$cond, EFLAGS)), - (CMOV_VR128X VR128X:$t, VR128X:$f, imm:$cond)>; - def : Pat<(v4f32 (X86cmov VR128X:$t, VR128X:$f, imm:$cond, EFLAGS)), - (CMOV_VR128X VR128X:$t, VR128X:$f, imm:$cond)>; - def : Pat<(v2f64 (X86cmov VR128X:$t, VR128X:$f, imm:$cond, EFLAGS)), - (CMOV_VR128X VR128X:$t, VR128X:$f, imm:$cond)>; - - def : Pat<(v32i8 (X86cmov VR256X:$t, VR256X:$f, imm:$cond, EFLAGS)), - (CMOV_VR256X VR256X:$t, VR256X:$f, imm:$cond)>; - def : Pat<(v16i16 (X86cmov VR256X:$t, VR256X:$f, imm:$cond, EFLAGS)), - (CMOV_VR256X VR256X:$t, VR256X:$f, imm:$cond)>; - def : Pat<(v8i32 (X86cmov VR256X:$t, VR256X:$f, imm:$cond, EFLAGS)), - (CMOV_VR256X VR256X:$t, VR256X:$f, imm:$cond)>; - def : Pat<(v8f32 (X86cmov VR256X:$t, VR256X:$f, imm:$cond, EFLAGS)), - (CMOV_VR256X VR256X:$t, VR256X:$f, imm:$cond)>; - def : Pat<(v4f64 (X86cmov VR256X:$t, VR256X:$f, imm:$cond, EFLAGS)), - (CMOV_VR256X VR256X:$t, VR256X:$f, imm:$cond)>; + def : Pat<(v16i8 (X86cmov VR128X:$t, VR128X:$f, timm:$cond, EFLAGS)), + (CMOV_VR128X VR128X:$t, VR128X:$f, timm:$cond)>; + def : Pat<(v8i16 (X86cmov VR128X:$t, VR128X:$f, timm:$cond, EFLAGS)), + (CMOV_VR128X VR128X:$t, VR128X:$f, timm:$cond)>; + def : Pat<(v4i32 (X86cmov VR128X:$t, VR128X:$f, timm:$cond, EFLAGS)), + (CMOV_VR128X VR128X:$t, VR128X:$f, timm:$cond)>; + def : Pat<(v4f32 (X86cmov VR128X:$t, VR128X:$f, timm:$cond, EFLAGS)), + (CMOV_VR128X VR128X:$t, VR128X:$f, timm:$cond)>; + def : Pat<(v2f64 (X86cmov VR128X:$t, VR128X:$f, timm:$cond, EFLAGS)), + (CMOV_VR128X VR128X:$t, VR128X:$f, timm:$cond)>; + + def : Pat<(v32i8 (X86cmov VR256X:$t, VR256X:$f, timm:$cond, EFLAGS)), + (CMOV_VR256X VR256X:$t, VR256X:$f, timm:$cond)>; + def : Pat<(v16i16 (X86cmov VR256X:$t, 
VR256X:$f, timm:$cond, EFLAGS)), + (CMOV_VR256X VR256X:$t, VR256X:$f, timm:$cond)>; + def : Pat<(v8i32 (X86cmov VR256X:$t, VR256X:$f, timm:$cond, EFLAGS)), + (CMOV_VR256X VR256X:$t, VR256X:$f, timm:$cond)>; + def : Pat<(v8f32 (X86cmov VR256X:$t, VR256X:$f, timm:$cond, EFLAGS)), + (CMOV_VR256X VR256X:$t, VR256X:$f, timm:$cond)>; + def : Pat<(v4f64 (X86cmov VR256X:$t, VR256X:$f, timm:$cond, EFLAGS)), + (CMOV_VR256X VR256X:$t, VR256X:$f, timm:$cond)>; } -def : Pat<(v64i8 (X86cmov VR512:$t, VR512:$f, imm:$cond, EFLAGS)), - (CMOV_VR512 VR512:$t, VR512:$f, imm:$cond)>; -def : Pat<(v32i16 (X86cmov VR512:$t, VR512:$f, imm:$cond, EFLAGS)), - (CMOV_VR512 VR512:$t, VR512:$f, imm:$cond)>; -def : Pat<(v16i32 (X86cmov VR512:$t, VR512:$f, imm:$cond, EFLAGS)), - (CMOV_VR512 VR512:$t, VR512:$f, imm:$cond)>; -def : Pat<(v16f32 (X86cmov VR512:$t, VR512:$f, imm:$cond, EFLAGS)), - (CMOV_VR512 VR512:$t, VR512:$f, imm:$cond)>; -def : Pat<(v8f64 (X86cmov VR512:$t, VR512:$f, imm:$cond, EFLAGS)), - (CMOV_VR512 VR512:$t, VR512:$f, imm:$cond)>; +def : Pat<(v64i8 (X86cmov VR512:$t, VR512:$f, timm:$cond, EFLAGS)), + (CMOV_VR512 VR512:$t, VR512:$f, timm:$cond)>; +def : Pat<(v32i16 (X86cmov VR512:$t, VR512:$f, timm:$cond, EFLAGS)), + (CMOV_VR512 VR512:$t, VR512:$f, timm:$cond)>; +def : Pat<(v16i32 (X86cmov VR512:$t, VR512:$f, timm:$cond, EFLAGS)), + (CMOV_VR512 VR512:$t, VR512:$f, timm:$cond)>; +def : Pat<(v16f32 (X86cmov VR512:$t, VR512:$f, timm:$cond, EFLAGS)), + (CMOV_VR512 VR512:$t, VR512:$f, timm:$cond)>; +def : Pat<(v8f64 (X86cmov VR512:$t, VR512:$f, timm:$cond, EFLAGS)), + (CMOV_VR512 VR512:$t, VR512:$f, timm:$cond)>; //===----------------------------------------------------------------------===// // Normal-Instructions-With-Lock-Prefix Pseudo Instructions @@ -1126,12 +1126,12 @@ def : Pat<(f64 (bitconvert (i64 (atomic_load_64 addr:$src)))), // binary size compared to a regular MOV, but it introduces an unnecessary // load, so is not suitable for regular or optsize functions. let Predicates = [OptForMinSize] in { -def : Pat<(nonvolatile_store (i16 0), addr:$dst), (AND16mi8 addr:$dst, 0)>; -def : Pat<(nonvolatile_store (i32 0), addr:$dst), (AND32mi8 addr:$dst, 0)>; -def : Pat<(nonvolatile_store (i64 0), addr:$dst), (AND64mi8 addr:$dst, 0)>; -def : Pat<(nonvolatile_store (i16 -1), addr:$dst), (OR16mi8 addr:$dst, -1)>; -def : Pat<(nonvolatile_store (i32 -1), addr:$dst), (OR32mi8 addr:$dst, -1)>; -def : Pat<(nonvolatile_store (i64 -1), addr:$dst), (OR64mi8 addr:$dst, -1)>; +def : Pat<(simple_store (i16 0), addr:$dst), (AND16mi8 addr:$dst, 0)>; +def : Pat<(simple_store (i32 0), addr:$dst), (AND32mi8 addr:$dst, 0)>; +def : Pat<(simple_store (i64 0), addr:$dst), (AND64mi8 addr:$dst, 0)>; +def : Pat<(simple_store (i16 -1), addr:$dst), (OR16mi8 addr:$dst, -1)>; +def : Pat<(simple_store (i32 -1), addr:$dst), (OR32mi8 addr:$dst, -1)>; +def : Pat<(simple_store (i64 -1), addr:$dst), (OR64mi8 addr:$dst, -1)>; } // In kernel code model, we can get the address of a label @@ -1276,23 +1276,6 @@ def : Pat<(X86cmp GR32:$src1, 0), def : Pat<(X86cmp GR64:$src1, 0), (TEST64rr GR64:$src1, GR64:$src1)>; -def inv_cond_XFORM : SDNodeXForm(N->getZExtValue()); - return CurDAG->getTargetConstant(X86::GetOppositeBranchCondition(CC), - SDLoc(N), MVT::i8); -}]>; - -// Conditional moves with folded loads with operands swapped and conditions -// inverted. 
-let Predicates = [HasCMov] in { - def : Pat<(X86cmov (loadi16 addr:$src1), GR16:$src2, imm:$cond, EFLAGS), - (CMOV16rm GR16:$src2, addr:$src1, (inv_cond_XFORM imm:$cond))>; - def : Pat<(X86cmov (loadi32 addr:$src1), GR32:$src2, imm:$cond, EFLAGS), - (CMOV32rm GR32:$src2, addr:$src1, (inv_cond_XFORM imm:$cond))>; - def : Pat<(X86cmov (loadi64 addr:$src1), GR64:$src2, imm:$cond, EFLAGS), - (CMOV64rm GR64:$src2, addr:$src1, (inv_cond_XFORM imm:$cond))>; -} - // zextload bool -> zextload byte // i1 stored in one byte in zero-extended form. // Upper bits cleanup should be executed before Store. diff --git a/lib/Target/X86/X86InstrControl.td b/lib/Target/X86/X86InstrControl.td index f82e80965b7c..e1e6eea59884 100644 --- a/lib/Target/X86/X86InstrControl.td +++ b/lib/Target/X86/X86InstrControl.td @@ -75,7 +75,7 @@ let isBranch = 1, isTerminator = 1, Uses = [EFLAGS], SchedRW = [WriteJump], def JCC_1 : Ii8PCRel <0x70, AddCCFrm, (outs), (ins brtarget8:$dst, ccode:$cond), "j${cond}\t$dst", - [(X86brcond bb:$dst, imm:$cond, EFLAGS)]>; + [(X86brcond bb:$dst, timm:$cond, EFLAGS)]>; let hasSideEffects = 0 in { def JCC_2 : Ii16PCRel<0x80, AddCCFrm, (outs), (ins brtarget16:$dst, ccode:$cond), @@ -145,6 +145,17 @@ let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in { [(brind (loadi64 addr:$dst))]>, Requires<[In64BitMode]>, Sched<[WriteJumpLd]>; + // Win64 wants indirect jumps leaving the function to have a REX_W prefix. + // These are switched from TAILJMPr/m64_REX in MCInstLower. + let isCodeGenOnly = 1, hasREX_WPrefix = 1 in { + def JMP64r_REX : I<0xFF, MRM4r, (outs), (ins GR64:$dst), + "rex64 jmp{q}\t{*}$dst", []>, Sched<[WriteJump]>; + let mayLoad = 1 in + def JMP64m_REX : I<0xFF, MRM4m, (outs), (ins i64mem:$dst), + "rex64 jmp{q}\t{*}$dst", []>, Sched<[WriteJumpLd]>; + + } + // Non-tracking jumps for IBT, use with caution. let isCodeGenOnly = 1 in { def JMP16r_NT : I<0xFF, MRM4r, (outs), (ins GR16 : $dst), "jmp{w}\t{*}$dst", @@ -273,39 +284,35 @@ let isCall = 1 in // Tail call stuff. let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, - isCodeGenOnly = 1, SchedRW = [WriteJumpLd] in - let Uses = [ESP, SSP] in { - def TCRETURNdi : PseudoI<(outs), - (ins i32imm_pcrel:$dst, i32imm:$offset), []>, NotMemoryFoldable; - def TCRETURNri : PseudoI<(outs), - (ins ptr_rc_tailcall:$dst, i32imm:$offset), []>, NotMemoryFoldable; + isCodeGenOnly = 1, Uses = [ESP, SSP] in { + def TCRETURNdi : PseudoI<(outs), (ins i32imm_pcrel:$dst, i32imm:$offset), + []>, Sched<[WriteJump]>, NotMemoryFoldable; + def TCRETURNri : PseudoI<(outs), (ins ptr_rc_tailcall:$dst, i32imm:$offset), + []>, Sched<[WriteJump]>, NotMemoryFoldable; let mayLoad = 1 in - def TCRETURNmi : PseudoI<(outs), - (ins i32mem_TC:$dst, i32imm:$offset), []>; + def TCRETURNmi : PseudoI<(outs), (ins i32mem_TC:$dst, i32imm:$offset), + []>, Sched<[WriteJumpLd]>; - // FIXME: The should be pseudo instructions that are lowered when going to - // mcinst. - def TAILJMPd : Ii32PCRel<0xE9, RawFrm, (outs), - (ins i32imm_pcrel:$dst), "jmp\t$dst", []>; + def TAILJMPd : PseudoI<(outs), (ins i32imm_pcrel:$dst), + []>, Sched<[WriteJump]>; - def TAILJMPr : I<0xFF, MRM4r, (outs), (ins ptr_rc_tailcall:$dst), - "", []>; // FIXME: Remove encoding when JIT is dead. 
+ def TAILJMPr : PseudoI<(outs), (ins ptr_rc_tailcall:$dst), + []>, Sched<[WriteJump]>; let mayLoad = 1 in - def TAILJMPm : I<0xFF, MRM4m, (outs), (ins i32mem_TC:$dst), - "jmp{l}\t{*}$dst", []>; + def TAILJMPm : PseudoI<(outs), (ins i32mem_TC:$dst), + []>, Sched<[WriteJumpLd]>; } // Conditional tail calls are similar to the above, but they are branches // rather than barriers, and they use EFLAGS. let isCall = 1, isTerminator = 1, isReturn = 1, isBranch = 1, - isCodeGenOnly = 1, SchedRW = [WriteJumpLd] in + isCodeGenOnly = 1, SchedRW = [WriteJump] in let Uses = [ESP, EFLAGS, SSP] in { def TCRETURNdicc : PseudoI<(outs), (ins i32imm_pcrel:$dst, i32imm:$offset, i32imm:$cond), []>; // This gets substituted to a conditional jump instruction in MC lowering. - def TAILJMPd_CC : Ii32PCRel<0x80, RawFrm, (outs), - (ins i32imm_pcrel:$dst, i32imm:$cond), "", []>; + def TAILJMPd_CC : PseudoI<(outs), (ins i32imm_pcrel:$dst, i32imm:$cond), []>; } @@ -348,34 +355,36 @@ let isCall = 1, Uses = [RSP, SSP], SchedRW = [WriteJump] in { } let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, - isCodeGenOnly = 1, Uses = [RSP, SSP], SchedRW = [WriteJump] in { + isCodeGenOnly = 1, Uses = [RSP, SSP] in { def TCRETURNdi64 : PseudoI<(outs), - (ins i64i32imm_pcrel:$dst, i32imm:$offset), - []>; + (ins i64i32imm_pcrel:$dst, i32imm:$offset), + []>, Sched<[WriteJump]>; def TCRETURNri64 : PseudoI<(outs), - (ins ptr_rc_tailcall:$dst, i32imm:$offset), []>, NotMemoryFoldable; + (ins ptr_rc_tailcall:$dst, i32imm:$offset), + []>, Sched<[WriteJump]>, NotMemoryFoldable; let mayLoad = 1 in def TCRETURNmi64 : PseudoI<(outs), - (ins i64mem_TC:$dst, i32imm:$offset), []>, NotMemoryFoldable; + (ins i64mem_TC:$dst, i32imm:$offset), + []>, Sched<[WriteJumpLd]>, NotMemoryFoldable; - def TAILJMPd64 : Ii32PCRel<0xE9, RawFrm, (outs), (ins i64i32imm_pcrel:$dst), - "jmp\t$dst", []>; + def TAILJMPd64 : PseudoI<(outs), (ins i64i32imm_pcrel:$dst), + []>, Sched<[WriteJump]>; - def TAILJMPr64 : I<0xFF, MRM4r, (outs), (ins ptr_rc_tailcall:$dst), - "jmp{q}\t{*}$dst", []>; + def TAILJMPr64 : PseudoI<(outs), (ins ptr_rc_tailcall:$dst), + []>, Sched<[WriteJump]>; let mayLoad = 1 in - def TAILJMPm64 : I<0xFF, MRM4m, (outs), (ins i64mem_TC:$dst), - "jmp{q}\t{*}$dst", []>; + def TAILJMPm64 : PseudoI<(outs), (ins i64mem_TC:$dst), + []>, Sched<[WriteJumpLd]>; // Win64 wants indirect jumps leaving the function to have a REX_W prefix. let hasREX_WPrefix = 1 in { - def TAILJMPr64_REX : I<0xFF, MRM4r, (outs), (ins ptr_rc_tailcall:$dst), - "rex64 jmp{q}\t{*}$dst", []>; + def TAILJMPr64_REX : PseudoI<(outs), (ins ptr_rc_tailcall:$dst), + []>, Sched<[WriteJump]>; let mayLoad = 1 in - def TAILJMPm64_REX : I<0xFF, MRM4m, (outs), (ins i64mem_TC:$dst), - "rex64 jmp{q}\t{*}$dst", []>; + def TAILJMPm64_REX : PseudoI<(outs), (ins i64mem_TC:$dst), + []>, Sched<[WriteJumpLd]>; } } @@ -403,13 +412,13 @@ let isPseudo = 1, isCall = 1, isCodeGenOnly = 1, // Conditional tail calls are similar to the above, but they are branches // rather than barriers, and they use EFLAGS. let isCall = 1, isTerminator = 1, isReturn = 1, isBranch = 1, - isCodeGenOnly = 1, SchedRW = [WriteJumpLd] in + isCodeGenOnly = 1, SchedRW = [WriteJump] in let Uses = [RSP, EFLAGS, SSP] in { def TCRETURNdi64cc : PseudoI<(outs), (ins i64i32imm_pcrel:$dst, i32imm:$offset, i32imm:$cond), []>; // This gets substituted to a conditional jump instruction in MC lowering. 
- def TAILJMPd64_CC : Ii32PCRel<0x80, RawFrm, (outs), - (ins i64i32imm_pcrel:$dst, i32imm:$cond), "", []>; + def TAILJMPd64_CC : PseudoI<(outs), + (ins i64i32imm_pcrel:$dst, i32imm:$cond), []>; } diff --git a/lib/Target/X86/X86InstrExtension.td b/lib/Target/X86/X86InstrExtension.td index 06e605fe5db2..7a4eb138ec34 100644 --- a/lib/Target/X86/X86InstrExtension.td +++ b/lib/Target/X86/X86InstrExtension.td @@ -17,19 +17,18 @@ let hasSideEffects = 0 in { let Defs = [EAX], Uses = [AX] in // EAX = signext(AX) def CWDE : I<0x98, RawFrm, (outs), (ins), "{cwtl|cwde}", []>, OpSize32, Sched<[WriteALU]>; + let Defs = [RAX], Uses = [EAX] in // RAX = signext(EAX) + def CDQE : RI<0x98, RawFrm, (outs), (ins), + "{cltq|cdqe}", []>, Sched<[WriteALU]>, Requires<[In64BitMode]>; + // FIXME: CWD/CDQ/CQO shouldn't Def the A register, but the fast register + // allocator crashes if you remove it. let Defs = [AX,DX], Uses = [AX] in // DX:AX = signext(AX) def CWD : I<0x99, RawFrm, (outs), (ins), "{cwtd|cwd}", []>, OpSize16, Sched<[WriteALU]>; let Defs = [EAX,EDX], Uses = [EAX] in // EDX:EAX = signext(EAX) def CDQ : I<0x99, RawFrm, (outs), (ins), "{cltd|cdq}", []>, OpSize32, Sched<[WriteALU]>; - - - let Defs = [RAX], Uses = [EAX] in // RAX = signext(EAX) - def CDQE : RI<0x98, RawFrm, (outs), (ins), - "{cltq|cdqe}", []>, Sched<[WriteALU]>, Requires<[In64BitMode]>; - let Defs = [RAX,RDX], Uses = [RAX] in // RDX:RAX = signext(RAX) def CQO : RI<0x99, RawFrm, (outs), (ins), "{cqto|cqo}", []>, Sched<[WriteALU]>, Requires<[In64BitMode]>; diff --git a/lib/Target/X86/X86InstrFoldTables.cpp b/lib/Target/X86/X86InstrFoldTables.cpp index d42fec3770c7..f3b286e0375c 100644 --- a/lib/Target/X86/X86InstrFoldTables.cpp +++ b/lib/Target/X86/X86InstrFoldTables.cpp @@ -292,6 +292,8 @@ static const X86MemoryFoldTableEntry MemoryFoldTable0[] = { { X86::JMP32r_NT, X86::JMP32m_NT, TB_FOLDED_LOAD }, { X86::JMP64r, X86::JMP64m, TB_FOLDED_LOAD }, { X86::JMP64r_NT, X86::JMP64m_NT, TB_FOLDED_LOAD }, + { X86::MMX_MOVD64from64rr, X86::MMX_MOVD64from64rm, TB_FOLDED_STORE | TB_NO_REVERSE }, + { X86::MMX_MOVD64grr, X86::MMX_MOVD64mr, TB_FOLDED_STORE | TB_NO_REVERSE }, { X86::MOV16ri, X86::MOV16mi, TB_FOLDED_STORE }, { X86::MOV16rr, X86::MOV16mr, TB_FOLDED_STORE }, { X86::MOV32ri, X86::MOV32mi, TB_FOLDED_STORE }, @@ -5245,6 +5247,270 @@ static const X86MemoryFoldTableEntry MemoryFoldTable4[] = { { X86::VXORPSZrrk, X86::VXORPSZrmk, 0 }, }; +static const X86MemoryFoldTableEntry BroadcastFoldTable2[] = { + { X86::VADDPDZ128rr, X86::VADDPDZ128rmb, TB_BCAST_SD }, + { X86::VADDPDZ256rr, X86::VADDPDZ256rmb, TB_BCAST_SD }, + { X86::VADDPDZrr, X86::VADDPDZrmb, TB_BCAST_SD }, + { X86::VADDPSZ128rr, X86::VADDPSZ128rmb, TB_BCAST_SS }, + { X86::VADDPSZ256rr, X86::VADDPSZ256rmb, TB_BCAST_SS }, + { X86::VADDPSZrr, X86::VADDPSZrmb, TB_BCAST_SS }, + { X86::VCMPPDZ128rri, X86::VCMPPDZ128rmbi, TB_BCAST_SD }, + { X86::VCMPPDZ256rri, X86::VCMPPDZ256rmbi, TB_BCAST_SD }, + { X86::VCMPPDZrri, X86::VCMPPDZrmbi, TB_BCAST_SD }, + { X86::VCMPPSZ128rri, X86::VCMPPSZ128rmbi, TB_BCAST_SS }, + { X86::VCMPPSZ256rri, X86::VCMPPSZ256rmbi, TB_BCAST_SS }, + { X86::VCMPPSZrri, X86::VCMPPSZrmbi, TB_BCAST_SS }, + { X86::VDIVPDZ128rr, X86::VDIVPDZ128rmb, TB_BCAST_SD }, + { X86::VDIVPDZ256rr, X86::VDIVPDZ256rmb, TB_BCAST_SD }, + { X86::VDIVPDZrr, X86::VDIVPDZrmb, TB_BCAST_SD }, + { X86::VDIVPSZ128rr, X86::VDIVPSZ128rmb, TB_BCAST_SS }, + { X86::VDIVPSZ256rr, X86::VDIVPSZ256rmb, TB_BCAST_SS }, + { X86::VDIVPSZrr, X86::VDIVPSZrmb, TB_BCAST_SS }, + { X86::VMAXCPDZ128rr, X86::VMAXCPDZ128rmb, 
TB_BCAST_SD }, + { X86::VMAXCPDZ256rr, X86::VMAXCPDZ256rmb, TB_BCAST_SD }, + { X86::VMAXCPDZrr, X86::VMAXCPDZrmb, TB_BCAST_SD }, + { X86::VMAXCPSZ128rr, X86::VMAXCPSZ128rmb, TB_BCAST_SS }, + { X86::VMAXCPSZ256rr, X86::VMAXCPSZ256rmb, TB_BCAST_SS }, + { X86::VMAXCPSZrr, X86::VMAXCPSZrmb, TB_BCAST_SS }, + { X86::VMAXPDZ128rr, X86::VMAXPDZ128rmb, TB_BCAST_SD }, + { X86::VMAXPDZ256rr, X86::VMAXPDZ256rmb, TB_BCAST_SD }, + { X86::VMAXPDZrr, X86::VMAXPDZrmb, TB_BCAST_SD }, + { X86::VMAXPSZ128rr, X86::VMAXPSZ128rmb, TB_BCAST_SS }, + { X86::VMAXPSZ256rr, X86::VMAXPSZ256rmb, TB_BCAST_SS }, + { X86::VMAXPSZrr, X86::VMAXPSZrmb, TB_BCAST_SS }, + { X86::VMINCPDZ128rr, X86::VMINCPDZ128rmb, TB_BCAST_SD }, + { X86::VMINCPDZ256rr, X86::VMINCPDZ256rmb, TB_BCAST_SD }, + { X86::VMINCPDZrr, X86::VMINCPDZrmb, TB_BCAST_SD }, + { X86::VMINCPSZ128rr, X86::VMINCPSZ128rmb, TB_BCAST_SS }, + { X86::VMINCPSZ256rr, X86::VMINCPSZ256rmb, TB_BCAST_SS }, + { X86::VMINCPSZrr, X86::VMINCPSZrmb, TB_BCAST_SS }, + { X86::VMINPDZ128rr, X86::VMINPDZ128rmb, TB_BCAST_SD }, + { X86::VMINPDZ256rr, X86::VMINPDZ256rmb, TB_BCAST_SD }, + { X86::VMINPDZrr, X86::VMINPDZrmb, TB_BCAST_SD }, + { X86::VMINPSZ128rr, X86::VMINPSZ128rmb, TB_BCAST_SS }, + { X86::VMINPSZ256rr, X86::VMINPSZ256rmb, TB_BCAST_SS }, + { X86::VMINPSZrr, X86::VMINPSZrmb, TB_BCAST_SS }, + { X86::VMULPDZ128rr, X86::VMULPDZ128rmb, TB_BCAST_SD }, + { X86::VMULPDZ256rr, X86::VMULPDZ256rmb, TB_BCAST_SD }, + { X86::VMULPDZrr, X86::VMULPDZrmb, TB_BCAST_SD }, + { X86::VMULPSZ128rr, X86::VMULPSZ128rmb, TB_BCAST_SS }, + { X86::VMULPSZ256rr, X86::VMULPSZ256rmb, TB_BCAST_SS }, + { X86::VMULPSZrr, X86::VMULPSZrmb, TB_BCAST_SS }, + { X86::VPADDDZ128rr, X86::VPADDDZ128rmb, TB_BCAST_D }, + { X86::VPADDDZ256rr, X86::VPADDDZ256rmb, TB_BCAST_D }, + { X86::VPADDDZrr, X86::VPADDDZrmb, TB_BCAST_D }, + { X86::VPADDQZ128rr, X86::VPADDQZ128rmb, TB_BCAST_Q }, + { X86::VPADDQZ256rr, X86::VPADDQZ256rmb, TB_BCAST_Q }, + { X86::VPADDQZrr, X86::VPADDQZrmb, TB_BCAST_Q }, + { X86::VPANDDZ128rr, X86::VPANDDZ128rmb, TB_BCAST_D }, + { X86::VPANDDZ256rr, X86::VPANDDZ256rmb, TB_BCAST_D }, + { X86::VPANDDZrr, X86::VPANDDZrmb, TB_BCAST_D }, + { X86::VPANDNDZ128rr, X86::VPANDNDZ128rmb, TB_BCAST_D }, + { X86::VPANDNDZ256rr, X86::VPANDNDZ256rmb, TB_BCAST_D }, + { X86::VPANDNDZrr, X86::VPANDNDZrmb, TB_BCAST_D }, + { X86::VPANDNQZ128rr, X86::VPANDNQZ128rmb, TB_BCAST_Q }, + { X86::VPANDNQZ256rr, X86::VPANDNQZ256rmb, TB_BCAST_Q }, + { X86::VPANDNQZrr, X86::VPANDNQZrmb, TB_BCAST_Q }, + { X86::VPANDQZ128rr, X86::VPANDQZ128rmb, TB_BCAST_Q }, + { X86::VPANDQZ256rr, X86::VPANDQZ256rmb, TB_BCAST_Q }, + { X86::VPANDQZrr, X86::VPANDQZrmb, TB_BCAST_Q }, + { X86::VPCMPDZ128rri, X86::VPCMPDZ128rmib, TB_BCAST_D }, + { X86::VPCMPDZ256rri, X86::VPCMPDZ256rmib, TB_BCAST_D }, + { X86::VPCMPDZrri, X86::VPCMPDZrmib, TB_BCAST_D }, + { X86::VPCMPEQDZ128rr, X86::VPCMPEQDZ128rmb, TB_BCAST_D }, + { X86::VPCMPEQDZ256rr, X86::VPCMPEQDZ256rmb, TB_BCAST_D }, + { X86::VPCMPEQDZrr, X86::VPCMPEQDZrmb, TB_BCAST_D }, + { X86::VPCMPEQQZ128rr, X86::VPCMPEQQZ128rmb, TB_BCAST_Q }, + { X86::VPCMPEQQZ256rr, X86::VPCMPEQQZ256rmb, TB_BCAST_Q }, + { X86::VPCMPEQQZrr, X86::VPCMPEQQZrmb, TB_BCAST_Q }, + { X86::VPCMPGTDZ128rr, X86::VPCMPGTDZ128rmb, TB_BCAST_D }, + { X86::VPCMPGTDZ256rr, X86::VPCMPGTDZ256rmb, TB_BCAST_D }, + { X86::VPCMPGTDZrr, X86::VPCMPGTDZrmb, TB_BCAST_D }, + { X86::VPCMPGTQZ128rr, X86::VPCMPGTQZ128rmb, TB_BCAST_Q }, + { X86::VPCMPGTQZ256rr, X86::VPCMPGTQZ256rmb, TB_BCAST_Q }, + { X86::VPCMPGTQZrr, X86::VPCMPGTQZrmb, TB_BCAST_Q }, + { 
X86::VPCMPQZ128rri, X86::VPCMPQZ128rmib, TB_BCAST_Q }, + { X86::VPCMPQZ256rri, X86::VPCMPQZ256rmib, TB_BCAST_Q }, + { X86::VPCMPQZrri, X86::VPCMPQZrmib, TB_BCAST_Q }, + { X86::VPCMPUDZ128rri, X86::VPCMPUDZ128rmib, TB_BCAST_D }, + { X86::VPCMPUDZ256rri, X86::VPCMPUDZ256rmib, TB_BCAST_D }, + { X86::VPCMPUDZrri, X86::VPCMPUDZrmib, TB_BCAST_D }, + { X86::VPCMPUQZ128rri, X86::VPCMPUQZ128rmib, TB_BCAST_Q }, + { X86::VPCMPUQZ256rri, X86::VPCMPUQZ256rmib, TB_BCAST_Q }, + { X86::VPCMPUQZrri, X86::VPCMPUQZrmib, TB_BCAST_Q }, + { X86::VPMAXSDZ128rr, X86::VPMAXSDZ128rmb, TB_BCAST_D }, + { X86::VPMAXSDZ256rr, X86::VPMAXSDZ256rmb, TB_BCAST_D }, + { X86::VPMAXSDZrr, X86::VPMAXSDZrmb, TB_BCAST_D }, + { X86::VPMAXSQZ128rr, X86::VPMAXSQZ128rmb, TB_BCAST_Q }, + { X86::VPMAXSQZ256rr, X86::VPMAXSQZ256rmb, TB_BCAST_Q }, + { X86::VPMAXSQZrr, X86::VPMAXSQZrmb, TB_BCAST_Q }, + { X86::VPMAXUDZ128rr, X86::VPMAXUDZ128rmb, TB_BCAST_D }, + { X86::VPMAXUDZ256rr, X86::VPMAXUDZ256rmb, TB_BCAST_D }, + { X86::VPMAXUDZrr, X86::VPMAXUDZrmb, TB_BCAST_D }, + { X86::VPMAXUQZ128rr, X86::VPMAXUQZ128rmb, TB_BCAST_Q }, + { X86::VPMAXUQZ256rr, X86::VPMAXUQZ256rmb, TB_BCAST_Q }, + { X86::VPMAXUQZrr, X86::VPMAXUQZrmb, TB_BCAST_Q }, + { X86::VPMINSDZ128rr, X86::VPMINSDZ128rmb, TB_BCAST_D }, + { X86::VPMINSDZ256rr, X86::VPMINSDZ256rmb, TB_BCAST_D }, + { X86::VPMINSDZrr, X86::VPMINSDZrmb, TB_BCAST_D }, + { X86::VPMINSQZ128rr, X86::VPMINSQZ128rmb, TB_BCAST_Q }, + { X86::VPMINSQZ256rr, X86::VPMINSQZ256rmb, TB_BCAST_Q }, + { X86::VPMINSQZrr, X86::VPMINSQZrmb, TB_BCAST_Q }, + { X86::VPMINUDZ128rr, X86::VPMINUDZ128rmb, TB_BCAST_D }, + { X86::VPMINUDZ256rr, X86::VPMINUDZ256rmb, TB_BCAST_D }, + { X86::VPMINUDZrr, X86::VPMINUDZrmb, TB_BCAST_D }, + { X86::VPMINUQZ128rr, X86::VPMINUQZ128rmb, TB_BCAST_Q }, + { X86::VPMINUQZ256rr, X86::VPMINUQZ256rmb, TB_BCAST_Q }, + { X86::VPMINUQZrr, X86::VPMINUQZrmb, TB_BCAST_Q }, + { X86::VPMULLDZ128rr, X86::VPMULLDZ128rmb, TB_BCAST_D }, + { X86::VPMULLDZ256rr, X86::VPMULLDZ256rmb, TB_BCAST_D }, + { X86::VPMULLDZrr, X86::VPMULLDZrmb, TB_BCAST_D }, + { X86::VPMULLQZ128rr, X86::VPMULLQZ128rmb, TB_BCAST_Q }, + { X86::VPMULLQZ256rr, X86::VPMULLQZ256rmb, TB_BCAST_Q }, + { X86::VPMULLQZrr, X86::VPMULLQZrmb, TB_BCAST_Q }, + { X86::VPORDZ128rr, X86::VPORDZ128rmb, TB_BCAST_D }, + { X86::VPORDZ256rr, X86::VPORDZ256rmb, TB_BCAST_D }, + { X86::VPORDZrr, X86::VPORDZrmb, TB_BCAST_D }, + { X86::VPORQZ128rr, X86::VPORQZ128rmb, TB_BCAST_Q }, + { X86::VPORQZ256rr, X86::VPORQZ256rmb, TB_BCAST_Q }, + { X86::VPORQZrr, X86::VPORQZrmb, TB_BCAST_Q }, + { X86::VPTESTMDZ128rr, X86::VPTESTMDZ128rmb, TB_BCAST_D }, + { X86::VPTESTMDZ256rr, X86::VPTESTMDZ256rmb, TB_BCAST_D }, + { X86::VPTESTMDZrr, X86::VPTESTMDZrmb, TB_BCAST_D }, + { X86::VPTESTMQZ128rr, X86::VPTESTMQZ128rmb, TB_BCAST_Q }, + { X86::VPTESTMQZ256rr, X86::VPTESTMQZ256rmb, TB_BCAST_Q }, + { X86::VPTESTMQZrr, X86::VPTESTMQZrmb, TB_BCAST_Q }, + { X86::VPTESTNMDZ128rr,X86::VPTESTNMDZ128rmb,TB_BCAST_D }, + { X86::VPTESTNMDZ256rr,X86::VPTESTNMDZ256rmb,TB_BCAST_D }, + { X86::VPTESTNMDZrr, X86::VPTESTNMDZrmb, TB_BCAST_D }, + { X86::VPTESTNMQZ128rr,X86::VPTESTNMQZ128rmb,TB_BCAST_Q }, + { X86::VPTESTNMQZ256rr,X86::VPTESTNMQZ256rmb,TB_BCAST_Q }, + { X86::VPTESTNMQZrr, X86::VPTESTNMQZrmb, TB_BCAST_Q }, + { X86::VPXORDZ128rr, X86::VPXORDZ128rmb, TB_BCAST_D }, + { X86::VPXORDZ256rr, X86::VPXORDZ256rmb, TB_BCAST_D }, + { X86::VPXORDZrr, X86::VPXORDZrmb, TB_BCAST_D }, + { X86::VPXORQZ128rr, X86::VPXORQZ128rmb, TB_BCAST_Q }, + { X86::VPXORQZ256rr, X86::VPXORQZ256rmb, TB_BCAST_Q }, + { 
X86::VPXORQZrr, X86::VPXORQZrmb, TB_BCAST_Q }, + { X86::VSUBPDZ128rr, X86::VSUBPDZ128rmb, TB_BCAST_SD }, + { X86::VSUBPDZ256rr, X86::VSUBPDZ256rmb, TB_BCAST_SD }, + { X86::VSUBPDZrr, X86::VSUBPDZrmb, TB_BCAST_SD }, + { X86::VSUBPSZ128rr, X86::VSUBPSZ128rmb, TB_BCAST_SS }, + { X86::VSUBPSZ256rr, X86::VSUBPSZ256rmb, TB_BCAST_SS }, + { X86::VSUBPSZrr, X86::VSUBPSZrmb, TB_BCAST_SS }, +}; + +static const X86MemoryFoldTableEntry BroadcastFoldTable3[] = { + { X86::VFMADD132PDZ128r, X86::VFMADD132PDZ128mb, TB_BCAST_SD }, + { X86::VFMADD132PDZ256r, X86::VFMADD132PDZ256mb, TB_BCAST_SD }, + { X86::VFMADD132PDZr, X86::VFMADD132PDZmb, TB_BCAST_SD }, + { X86::VFMADD132PSZ128r, X86::VFMADD132PSZ128mb, TB_BCAST_SS }, + { X86::VFMADD132PSZ256r, X86::VFMADD132PSZ256mb, TB_BCAST_SS }, + { X86::VFMADD132PSZr, X86::VFMADD132PSZmb, TB_BCAST_SS }, + { X86::VFMADD213PDZ128r, X86::VFMADD213PDZ128mb, TB_BCAST_SD }, + { X86::VFMADD213PDZ256r, X86::VFMADD213PDZ256mb, TB_BCAST_SD }, + { X86::VFMADD213PDZr, X86::VFMADD213PDZmb, TB_BCAST_SD }, + { X86::VFMADD213PSZ128r, X86::VFMADD213PSZ128mb, TB_BCAST_SS }, + { X86::VFMADD213PSZ256r, X86::VFMADD213PSZ256mb, TB_BCAST_SS }, + { X86::VFMADD213PSZr, X86::VFMADD213PSZmb, TB_BCAST_SS }, + { X86::VFMADD231PDZ128r, X86::VFMADD231PDZ128mb, TB_BCAST_SD }, + { X86::VFMADD231PDZ256r, X86::VFMADD231PDZ256mb, TB_BCAST_SD }, + { X86::VFMADD231PDZr, X86::VFMADD231PDZmb, TB_BCAST_SD }, + { X86::VFMADD231PSZ128r, X86::VFMADD231PSZ128mb, TB_BCAST_SS }, + { X86::VFMADD231PSZ256r, X86::VFMADD231PSZ256mb, TB_BCAST_SS }, + { X86::VFMADD231PSZr, X86::VFMADD231PSZmb, TB_BCAST_SS }, + { X86::VFMADDSUB132PDZ128r, X86::VFMADDSUB132PDZ128mb, TB_BCAST_SD }, + { X86::VFMADDSUB132PDZ256r, X86::VFMADDSUB132PDZ256mb, TB_BCAST_SD }, + { X86::VFMADDSUB132PDZr, X86::VFMADDSUB132PDZmb, TB_BCAST_SD }, + { X86::VFMADDSUB132PSZ128r, X86::VFMADDSUB132PSZ128mb, TB_BCAST_SS }, + { X86::VFMADDSUB132PSZ256r, X86::VFMADDSUB132PSZ256mb, TB_BCAST_SS }, + { X86::VFMADDSUB132PSZr, X86::VFMADDSUB132PSZmb, TB_BCAST_SS }, + { X86::VFMADDSUB213PDZ128r, X86::VFMADDSUB213PDZ128mb, TB_BCAST_SD }, + { X86::VFMADDSUB213PDZ256r, X86::VFMADDSUB213PDZ256mb, TB_BCAST_SD }, + { X86::VFMADDSUB213PDZr, X86::VFMADDSUB213PDZmb, TB_BCAST_SD }, + { X86::VFMADDSUB213PSZ128r, X86::VFMADDSUB213PSZ128mb, TB_BCAST_SS }, + { X86::VFMADDSUB213PSZ256r, X86::VFMADDSUB213PSZ256mb, TB_BCAST_SS }, + { X86::VFMADDSUB213PSZr, X86::VFMADDSUB213PSZmb, TB_BCAST_SS }, + { X86::VFMADDSUB231PDZ128r, X86::VFMADDSUB231PDZ128mb, TB_BCAST_SD }, + { X86::VFMADDSUB231PDZ256r, X86::VFMADDSUB231PDZ256mb, TB_BCAST_SD }, + { X86::VFMADDSUB231PDZr, X86::VFMADDSUB231PDZmb, TB_BCAST_SD }, + { X86::VFMADDSUB231PSZ128r, X86::VFMADDSUB231PSZ128mb, TB_BCAST_SS }, + { X86::VFMADDSUB231PSZ256r, X86::VFMADDSUB231PSZ256mb, TB_BCAST_SS }, + { X86::VFMADDSUB231PSZr, X86::VFMADDSUB231PSZmb, TB_BCAST_SS }, + { X86::VFMSUB132PDZ128r, X86::VFMSUB132PDZ128mb, TB_BCAST_SD }, + { X86::VFMSUB132PDZ256r, X86::VFMSUB132PDZ256mb, TB_BCAST_SD }, + { X86::VFMSUB132PDZr, X86::VFMSUB132PDZmb, TB_BCAST_SD }, + { X86::VFMSUB132PSZ128r, X86::VFMSUB132PSZ128mb, TB_BCAST_SS }, + { X86::VFMSUB132PSZ256r, X86::VFMSUB132PSZ256mb, TB_BCAST_SS }, + { X86::VFMSUB132PSZr, X86::VFMSUB132PSZmb, TB_BCAST_SS }, + { X86::VFMSUB213PDZ128r, X86::VFMSUB213PDZ128mb, TB_BCAST_SD }, + { X86::VFMSUB213PDZ256r, X86::VFMSUB213PDZ256mb, TB_BCAST_SD }, + { X86::VFMSUB213PDZr, X86::VFMSUB213PDZmb, TB_BCAST_SD }, + { X86::VFMSUB213PSZ128r, X86::VFMSUB213PSZ128mb, TB_BCAST_SS }, + { X86::VFMSUB213PSZ256r, 
X86::VFMSUB213PSZ256mb, TB_BCAST_SS }, + { X86::VFMSUB213PSZr, X86::VFMSUB213PSZmb, TB_BCAST_SS }, + { X86::VFMSUB231PDZ128r, X86::VFMSUB231PDZ128mb, TB_BCAST_SD }, + { X86::VFMSUB231PDZ256r, X86::VFMSUB231PDZ256mb, TB_BCAST_SD }, + { X86::VFMSUB231PDZr, X86::VFMSUB231PDZmb, TB_BCAST_SD }, + { X86::VFMSUB231PSZ128r, X86::VFMSUB231PSZ128mb, TB_BCAST_SS }, + { X86::VFMSUB231PSZ256r, X86::VFMSUB231PSZ256mb, TB_BCAST_SS }, + { X86::VFMSUB231PSZr, X86::VFMSUB231PSZmb, TB_BCAST_SS }, + { X86::VFMSUBADD132PDZ128r, X86::VFMSUBADD132PDZ128mb, TB_BCAST_SD }, + { X86::VFMSUBADD132PDZ256r, X86::VFMSUBADD132PDZ256mb, TB_BCAST_SD }, + { X86::VFMSUBADD132PDZr, X86::VFMSUBADD132PDZmb, TB_BCAST_SD }, + { X86::VFMSUBADD132PSZ128r, X86::VFMSUBADD132PSZ128mb, TB_BCAST_SS }, + { X86::VFMSUBADD132PSZ256r, X86::VFMSUBADD132PSZ256mb, TB_BCAST_SS }, + { X86::VFMSUBADD132PSZr, X86::VFMSUBADD132PSZmb, TB_BCAST_SS }, + { X86::VFMSUBADD213PDZ128r, X86::VFMSUBADD213PDZ128mb, TB_BCAST_SD }, + { X86::VFMSUBADD213PDZ256r, X86::VFMSUBADD213PDZ256mb, TB_BCAST_SD }, + { X86::VFMSUBADD213PDZr, X86::VFMSUBADD213PDZmb, TB_BCAST_SD }, + { X86::VFMSUBADD213PSZ128r, X86::VFMSUBADD213PSZ128mb, TB_BCAST_SS }, + { X86::VFMSUBADD213PSZ256r, X86::VFMSUBADD213PSZ256mb, TB_BCAST_SS }, + { X86::VFMSUBADD213PSZr, X86::VFMSUBADD213PSZmb, TB_BCAST_SS }, + { X86::VFMSUBADD231PDZ128r, X86::VFMSUBADD231PDZ128mb, TB_BCAST_SD }, + { X86::VFMSUBADD231PDZ256r, X86::VFMSUBADD231PDZ256mb, TB_BCAST_SD }, + { X86::VFMSUBADD231PDZr, X86::VFMSUBADD231PDZmb, TB_BCAST_SD }, + { X86::VFMSUBADD231PSZ128r, X86::VFMSUBADD231PSZ128mb, TB_BCAST_SS }, + { X86::VFMSUBADD231PSZ256r, X86::VFMSUBADD231PSZ256mb, TB_BCAST_SS }, + { X86::VFMSUBADD231PSZr, X86::VFMSUBADD231PSZmb, TB_BCAST_SS }, + { X86::VFNMADD132PDZ128r, X86::VFNMADD132PDZ128mb, TB_BCAST_SD }, + { X86::VFNMADD132PDZ256r, X86::VFNMADD132PDZ256mb, TB_BCAST_SD }, + { X86::VFNMADD132PDZr, X86::VFNMADD132PDZmb, TB_BCAST_SD }, + { X86::VFNMADD132PSZ128r, X86::VFNMADD132PSZ128mb, TB_BCAST_SS }, + { X86::VFNMADD132PSZ256r, X86::VFNMADD132PSZ256mb, TB_BCAST_SS }, + { X86::VFNMADD132PSZr, X86::VFNMADD132PSZmb, TB_BCAST_SS }, + { X86::VFNMADD213PDZ128r, X86::VFNMADD213PDZ128mb, TB_BCAST_SD }, + { X86::VFNMADD213PDZ256r, X86::VFNMADD213PDZ256mb, TB_BCAST_SD }, + { X86::VFNMADD213PDZr, X86::VFNMADD213PDZmb, TB_BCAST_SD }, + { X86::VFNMADD213PSZ128r, X86::VFNMADD213PSZ128mb, TB_BCAST_SS }, + { X86::VFNMADD213PSZ256r, X86::VFNMADD213PSZ256mb, TB_BCAST_SS }, + { X86::VFNMADD213PSZr, X86::VFNMADD213PSZmb, TB_BCAST_SS }, + { X86::VFNMADD231PDZ128r, X86::VFNMADD231PDZ128mb, TB_BCAST_SD }, + { X86::VFNMADD231PDZ256r, X86::VFNMADD231PDZ256mb, TB_BCAST_SD }, + { X86::VFNMADD231PDZr, X86::VFNMADD231PDZmb, TB_BCAST_SD }, + { X86::VFNMADD231PSZ128r, X86::VFNMADD231PSZ128mb, TB_BCAST_SS }, + { X86::VFNMADD231PSZ256r, X86::VFNMADD231PSZ256mb, TB_BCAST_SS }, + { X86::VFNMADD231PSZr, X86::VFNMADD231PSZmb, TB_BCAST_SS }, + { X86::VFNMSUB132PDZ128r, X86::VFNMSUB132PDZ128mb, TB_BCAST_SD }, + { X86::VFNMSUB132PDZ256r, X86::VFNMSUB132PDZ256mb, TB_BCAST_SD }, + { X86::VFNMSUB132PDZr, X86::VFNMSUB132PDZmb, TB_BCAST_SD }, + { X86::VFNMSUB132PSZ128r, X86::VFNMSUB132PSZ128mb, TB_BCAST_SS }, + { X86::VFNMSUB132PSZ256r, X86::VFNMSUB132PSZ256mb, TB_BCAST_SS }, + { X86::VFNMSUB132PSZr, X86::VFNMSUB132PSZmb, TB_BCAST_SS }, + { X86::VFNMSUB213PDZ128r, X86::VFNMSUB213PDZ128mb, TB_BCAST_SD }, + { X86::VFNMSUB213PDZ256r, X86::VFNMSUB213PDZ256mb, TB_BCAST_SD }, + { X86::VFNMSUB213PDZr, X86::VFNMSUB213PDZmb, TB_BCAST_SD }, + { X86::VFNMSUB213PSZ128r, 
X86::VFNMSUB213PSZ128mb, TB_BCAST_SS }, + { X86::VFNMSUB213PSZ256r, X86::VFNMSUB213PSZ256mb, TB_BCAST_SS }, + { X86::VFNMSUB213PSZr, X86::VFNMSUB213PSZmb, TB_BCAST_SS }, + { X86::VFNMSUB231PDZ128r, X86::VFNMSUB231PDZ128mb, TB_BCAST_SD }, + { X86::VFNMSUB231PDZ256r, X86::VFNMSUB231PDZ256mb, TB_BCAST_SD }, + { X86::VFNMSUB231PDZr, X86::VFNMSUB231PDZmb, TB_BCAST_SD }, + { X86::VFNMSUB231PSZ128r, X86::VFNMSUB231PSZ128mb, TB_BCAST_SS }, + { X86::VFNMSUB231PSZ256r, X86::VFNMSUB231PSZ256mb, TB_BCAST_SS }, + { X86::VFNMSUB231PSZr, X86::VFNMSUB231PSZmb, TB_BCAST_SS }, +}; + static const X86MemoryFoldTableEntry * lookupFoldTableImpl(ArrayRef Table, unsigned RegOp) { #ifndef NDEBUG @@ -5287,6 +5553,18 @@ lookupFoldTableImpl(ArrayRef Table, unsigned RegOp) { std::end(MemoryFoldTable4)) == std::end(MemoryFoldTable4) && "MemoryFoldTable4 is not sorted and unique!"); + assert(std::is_sorted(std::begin(BroadcastFoldTable2), + std::end(BroadcastFoldTable2)) && + std::adjacent_find(std::begin(BroadcastFoldTable2), + std::end(BroadcastFoldTable2)) == + std::end(BroadcastFoldTable2) && + "BroadcastFoldTable2 is not sorted and unique!"); + assert(std::is_sorted(std::begin(BroadcastFoldTable3), + std::end(BroadcastFoldTable3)) && + std::adjacent_find(std::begin(BroadcastFoldTable3), + std::end(BroadcastFoldTable3)) == + std::end(BroadcastFoldTable3) && + "BroadcastFoldTable3 is not sorted and unique!"); FoldTablesChecked.store(true, std::memory_order_relaxed); } #endif @@ -5355,6 +5633,15 @@ struct X86MemUnfoldTable { // Index 4, folded load addTableEntry(Entry, TB_INDEX_4 | TB_FOLDED_LOAD); + // Broadcast tables. + for (const X86MemoryFoldTableEntry &Entry : BroadcastFoldTable2) + // Index 2, folded broadcast + addTableEntry(Entry, TB_INDEX_2 | TB_FOLDED_LOAD | TB_FOLDED_BCAST); + + for (const X86MemoryFoldTableEntry &Entry : BroadcastFoldTable3) + // Index 2, folded broadcast + addTableEntry(Entry, TB_INDEX_3 | TB_FOLDED_LOAD | TB_FOLDED_BCAST); + // Sort the memory->reg unfold table. array_pod_sort(Table.begin(), Table.end()); diff --git a/lib/Target/X86/X86InstrFoldTables.h b/lib/Target/X86/X86InstrFoldTables.h index 419baf98f61d..7dc236a0d7e4 100644 --- a/lib/Target/X86/X86InstrFoldTables.h +++ b/lib/Target/X86/X86InstrFoldTables.h @@ -19,35 +19,48 @@ namespace llvm { enum { // Select which memory operand is being unfolded. - // (stored in bits 0 - 3) + // (stored in bits 0 - 2) TB_INDEX_0 = 0, TB_INDEX_1 = 1, TB_INDEX_2 = 2, TB_INDEX_3 = 3, TB_INDEX_4 = 4, - TB_INDEX_MASK = 0xf, + TB_INDEX_MASK = 0x7, // Do not insert the reverse map (MemOp -> RegOp) into the table. // This may be needed because there is a many -> one mapping. - TB_NO_REVERSE = 1 << 4, + TB_NO_REVERSE = 1 << 3, // Do not insert the forward map (RegOp -> MemOp) into the table. // This is needed for Native Client, which prohibits branch // instructions from using a memory operand. - TB_NO_FORWARD = 1 << 5, + TB_NO_FORWARD = 1 << 4, - TB_FOLDED_LOAD = 1 << 6, - TB_FOLDED_STORE = 1 << 7, + TB_FOLDED_LOAD = 1 << 5, + TB_FOLDED_STORE = 1 << 6, + TB_FOLDED_BCAST = 1 << 7, // Minimum alignment required for load/store. - // Used for RegOp->MemOp conversion. - // (stored in bits 8 - 15) + // Used for RegOp->MemOp conversion. Encoded as Log2(Align) + 1 to allow 0 + // to mean align of 0. 
+ // (stored in bits 8 - 11) TB_ALIGN_SHIFT = 8, - TB_ALIGN_NONE = 0 << TB_ALIGN_SHIFT, - TB_ALIGN_16 = 16 << TB_ALIGN_SHIFT, - TB_ALIGN_32 = 32 << TB_ALIGN_SHIFT, - TB_ALIGN_64 = 64 << TB_ALIGN_SHIFT, - TB_ALIGN_MASK = 0xff << TB_ALIGN_SHIFT + TB_ALIGN_NONE = 0 << TB_ALIGN_SHIFT, + TB_ALIGN_16 = 5 << TB_ALIGN_SHIFT, + TB_ALIGN_32 = 6 << TB_ALIGN_SHIFT, + TB_ALIGN_64 = 7 << TB_ALIGN_SHIFT, + TB_ALIGN_MASK = 0xf << TB_ALIGN_SHIFT, + + // Broadcast type. + // (stored in bits 12 - 13) + TB_BCAST_TYPE_SHIFT = 12, + TB_BCAST_D = 0 << TB_BCAST_TYPE_SHIFT, + TB_BCAST_Q = 1 << TB_BCAST_TYPE_SHIFT, + TB_BCAST_SS = 2 << TB_BCAST_TYPE_SHIFT, + TB_BCAST_SD = 3 << TB_BCAST_TYPE_SHIFT, + TB_BCAST_MASK = 0x3 << TB_BCAST_TYPE_SHIFT, + + // Unused bits 14-15 }; // This struct is used for both the folding and unfold tables. They KeyOp diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td index 096cc27861ca..de6f8a81dff6 100644 --- a/lib/Target/X86/X86InstrFragmentsSIMD.td +++ b/lib/Target/X86/X86InstrFragmentsSIMD.td @@ -103,6 +103,8 @@ def X86vzld : SDNode<"X86ISD::VZEXT_LOAD", SDTLoad, [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; def X86vextractst : SDNode<"X86ISD::VEXTRACT_STORE", SDTStore, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; +def X86VBroadcastld : SDNode<"X86ISD::VBROADCAST_LOAD", SDTLoad, + [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; def SDTVtrunc : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>, SDTCisInt<0>, SDTCisInt<1>, @@ -954,6 +956,26 @@ def X86vextractstore64 : PatFrag<(ops node:$val, node:$ptr), return cast(N)->getMemoryVT().getStoreSize() == 8; }]>; +def X86VBroadcastld8 : PatFrag<(ops node:$src), + (X86VBroadcastld node:$src), [{ + return cast(N)->getMemoryVT().getStoreSize() == 1; +}]>; + +def X86VBroadcastld16 : PatFrag<(ops node:$src), + (X86VBroadcastld node:$src), [{ + return cast(N)->getMemoryVT().getStoreSize() == 2; +}]>; + +def X86VBroadcastld32 : PatFrag<(ops node:$src), + (X86VBroadcastld node:$src), [{ + return cast(N)->getMemoryVT().getStoreSize() == 4; +}]>; + +def X86VBroadcastld64 : PatFrag<(ops node:$src), + (X86VBroadcastld node:$src), [{ + return cast(N)->getMemoryVT().getStoreSize() == 8; +}]>; + def fp32imm0 : PatLeaf<(f32 fpimm), [{ return N->isExactlyValue(+0.0); @@ -963,6 +985,10 @@ def fp64imm0 : PatLeaf<(f64 fpimm), [{ return N->isExactlyValue(+0.0); }]>; +def fp128imm0 : PatLeaf<(f128 fpimm), [{ + return N->isExactlyValue(+0.0); +}]>; + // EXTRACT_get_vextract128_imm xform function: convert extract_subvector index // to VEXTRACTF128/VEXTRACTI128 imm. def EXTRACT_get_vextract128_imm : SDNodeXFormgetParent(); @@ -675,7 +712,7 @@ bool X86InstrInfo::classifyLEAReg(MachineInstr &MI, const MachineOperand &Src, RC = Opc != X86::LEA32r ? &X86::GR64_NOSPRegClass : &X86::GR32_NOSPRegClass; } - unsigned SrcReg = Src.getReg(); + Register SrcReg = Src.getReg(); // For both LEA64 and LEA32 the register already has essentially the right // type (32-bit or 64-bit) we may just need to forbid SP. @@ -684,7 +721,7 @@ bool X86InstrInfo::classifyLEAReg(MachineInstr &MI, const MachineOperand &Src, isKill = Src.isKill(); assert(!Src.isUndef() && "Undef op doesn't need optimization"); - if (TargetRegisterInfo::isVirtualRegister(NewSrc) && + if (Register::isVirtualRegister(NewSrc) && !MF.getRegInfo().constrainRegClass(NewSrc, RC)) return false; @@ -693,7 +730,7 @@ bool X86InstrInfo::classifyLEAReg(MachineInstr &MI, const MachineOperand &Src, // This is for an LEA64_32r and incoming registers are 32-bit. 
One way or // another we need to add 64-bit registers to the final MI. - if (TargetRegisterInfo::isPhysicalRegister(SrcReg)) { + if (Register::isPhysicalRegister(SrcReg)) { ImplicitOp = Src; ImplicitOp.setImplicit(); @@ -740,8 +777,8 @@ MachineInstr *X86InstrInfo::convertToThreeAddressWithLEA( return nullptr; unsigned Opcode = X86::LEA64_32r; - unsigned InRegLEA = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass); - unsigned OutRegLEA = RegInfo.createVirtualRegister(&X86::GR32RegClass); + Register InRegLEA = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass); + Register OutRegLEA = RegInfo.createVirtualRegister(&X86::GR32RegClass); // Build and insert into an implicit UNDEF value. This is OK because // we will be shifting and then extracting the lower 8/16-bits. @@ -751,8 +788,8 @@ MachineInstr *X86InstrInfo::convertToThreeAddressWithLEA( // But testing has shown this *does* help performance in 64-bit mode (at // least on modern x86 machines). MachineBasicBlock::iterator MBBI = MI.getIterator(); - unsigned Dest = MI.getOperand(0).getReg(); - unsigned Src = MI.getOperand(1).getReg(); + Register Dest = MI.getOperand(0).getReg(); + Register Src = MI.getOperand(1).getReg(); bool IsDead = MI.getOperand(0).isDead(); bool IsKill = MI.getOperand(1).isKill(); unsigned SubReg = Is8BitOp ? X86::sub_8bit : X86::sub_16bit; @@ -794,7 +831,7 @@ MachineInstr *X86InstrInfo::convertToThreeAddressWithLEA( case X86::ADD8rr_DB: case X86::ADD16rr: case X86::ADD16rr_DB: { - unsigned Src2 = MI.getOperand(2).getReg(); + Register Src2 = MI.getOperand(2).getReg(); bool IsKill2 = MI.getOperand(2).isKill(); assert(!MI.getOperand(2).isUndef() && "Undef op doesn't need optimization"); unsigned InRegLEA2 = 0; @@ -888,7 +925,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, if (!isTruncatedShiftCountForLEA(ShAmt)) return nullptr; // LEA can't handle RSP. - if (TargetRegisterInfo::isVirtualRegister(Src.getReg()) && + if (Register::isVirtualRegister(Src.getReg()) && !MF.getRegInfo().constrainRegClass(Src.getReg(), &X86::GR64_NOSPRegClass)) return nullptr; @@ -911,7 +948,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, // LEA can't handle ESP. bool isKill; - unsigned SrcReg; + Register SrcReg; MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false); if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ false, SrcReg, isKill, ImplicitOp, LV)) @@ -947,7 +984,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, unsigned Opc = MIOpc == X86::INC64r ? X86::LEA64r : (Is64Bit ? X86::LEA64_32r : X86::LEA32r); bool isKill; - unsigned SrcReg; + Register SrcReg; MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false); if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ false, SrcReg, isKill, ImplicitOp, LV)) @@ -970,7 +1007,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, : (Is64Bit ? X86::LEA64_32r : X86::LEA32r); bool isKill; - unsigned SrcReg; + Register SrcReg; MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false); if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ false, SrcReg, isKill, ImplicitOp, LV)) @@ -1005,7 +1042,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, Opc = Is64Bit ? 
X86::LEA64_32r : X86::LEA32r; bool isKill; - unsigned SrcReg; + Register SrcReg; MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false); if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ true, SrcReg, isKill, ImplicitOp, LV)) @@ -1013,7 +1050,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, const MachineOperand &Src2 = MI.getOperand(2); bool isKill2; - unsigned SrcReg2; + Register SrcReg2; MachineOperand ImplicitOp2 = MachineOperand::CreateReg(0, false); if (!classifyLEAReg(MI, Src2, Opc, /*AllowSP=*/ false, SrcReg2, isKill2, ImplicitOp2, LV)) @@ -1054,7 +1091,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, unsigned Opc = Is64Bit ? X86::LEA64_32r : X86::LEA32r; bool isKill; - unsigned SrcReg; + Register SrcReg; MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false); if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ true, SrcReg, isKill, ImplicitOp, LV)) @@ -1085,6 +1122,8 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, return nullptr; case X86::SUB32ri8: case X86::SUB32ri: { + if (!MI.getOperand(2).isImm()) + return nullptr; int64_t Imm = MI.getOperand(2).getImm(); if (!isInt<32>(-Imm)) return nullptr; @@ -1093,7 +1132,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, unsigned Opc = Is64Bit ? X86::LEA64_32r : X86::LEA32r; bool isKill; - unsigned SrcReg; + Register SrcReg; MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false); if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ true, SrcReg, isKill, ImplicitOp, LV)) @@ -1111,6 +1150,8 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, case X86::SUB64ri8: case X86::SUB64ri32: { + if (!MI.getOperand(2).isImm()) + return nullptr; int64_t Imm = MI.getOperand(2).getImm(); if (!isInt<32>(-Imm)) return nullptr; @@ -1140,40 +1181,62 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, case X86::VMOVUPDZrmk: case X86::VMOVAPDZrmk: case X86::VMOVUPSZ128rmk: case X86::VMOVAPSZ128rmk: case X86::VMOVUPSZ256rmk: case X86::VMOVAPSZ256rmk: - case X86::VMOVUPSZrmk: case X86::VMOVAPSZrmk: { + case X86::VMOVUPSZrmk: case X86::VMOVAPSZrmk: + case X86::VBROADCASTSDZ256mk: + case X86::VBROADCASTSDZmk: + case X86::VBROADCASTSSZ128mk: + case X86::VBROADCASTSSZ256mk: + case X86::VBROADCASTSSZmk: + case X86::VPBROADCASTDZ128mk: + case X86::VPBROADCASTDZ256mk: + case X86::VPBROADCASTDZmk: + case X86::VPBROADCASTQZ128mk: + case X86::VPBROADCASTQZ256mk: + case X86::VPBROADCASTQZmk: { unsigned Opc; switch (MIOpc) { default: llvm_unreachable("Unreachable!"); - case X86::VMOVDQU8Z128rmk: Opc = X86::VPBLENDMBZ128rmk; break; - case X86::VMOVDQU8Z256rmk: Opc = X86::VPBLENDMBZ256rmk; break; - case X86::VMOVDQU8Zrmk: Opc = X86::VPBLENDMBZrmk; break; - case X86::VMOVDQU16Z128rmk: Opc = X86::VPBLENDMWZ128rmk; break; - case X86::VMOVDQU16Z256rmk: Opc = X86::VPBLENDMWZ256rmk; break; - case X86::VMOVDQU16Zrmk: Opc = X86::VPBLENDMWZrmk; break; - case X86::VMOVDQU32Z128rmk: Opc = X86::VPBLENDMDZ128rmk; break; - case X86::VMOVDQU32Z256rmk: Opc = X86::VPBLENDMDZ256rmk; break; - case X86::VMOVDQU32Zrmk: Opc = X86::VPBLENDMDZrmk; break; - case X86::VMOVDQU64Z128rmk: Opc = X86::VPBLENDMQZ128rmk; break; - case X86::VMOVDQU64Z256rmk: Opc = X86::VPBLENDMQZ256rmk; break; - case X86::VMOVDQU64Zrmk: Opc = X86::VPBLENDMQZrmk; break; - case X86::VMOVUPDZ128rmk: Opc = X86::VBLENDMPDZ128rmk; break; - case X86::VMOVUPDZ256rmk: Opc = X86::VBLENDMPDZ256rmk; break; - case X86::VMOVUPDZrmk: Opc = X86::VBLENDMPDZrmk; break; - case X86::VMOVUPSZ128rmk: Opc = 
X86::VBLENDMPSZ128rmk; break; - case X86::VMOVUPSZ256rmk: Opc = X86::VBLENDMPSZ256rmk; break; - case X86::VMOVUPSZrmk: Opc = X86::VBLENDMPSZrmk; break; - case X86::VMOVDQA32Z128rmk: Opc = X86::VPBLENDMDZ128rmk; break; - case X86::VMOVDQA32Z256rmk: Opc = X86::VPBLENDMDZ256rmk; break; - case X86::VMOVDQA32Zrmk: Opc = X86::VPBLENDMDZrmk; break; - case X86::VMOVDQA64Z128rmk: Opc = X86::VPBLENDMQZ128rmk; break; - case X86::VMOVDQA64Z256rmk: Opc = X86::VPBLENDMQZ256rmk; break; - case X86::VMOVDQA64Zrmk: Opc = X86::VPBLENDMQZrmk; break; - case X86::VMOVAPDZ128rmk: Opc = X86::VBLENDMPDZ128rmk; break; - case X86::VMOVAPDZ256rmk: Opc = X86::VBLENDMPDZ256rmk; break; - case X86::VMOVAPDZrmk: Opc = X86::VBLENDMPDZrmk; break; - case X86::VMOVAPSZ128rmk: Opc = X86::VBLENDMPSZ128rmk; break; - case X86::VMOVAPSZ256rmk: Opc = X86::VBLENDMPSZ256rmk; break; - case X86::VMOVAPSZrmk: Opc = X86::VBLENDMPSZrmk; break; + case X86::VMOVDQU8Z128rmk: Opc = X86::VPBLENDMBZ128rmk; break; + case X86::VMOVDQU8Z256rmk: Opc = X86::VPBLENDMBZ256rmk; break; + case X86::VMOVDQU8Zrmk: Opc = X86::VPBLENDMBZrmk; break; + case X86::VMOVDQU16Z128rmk: Opc = X86::VPBLENDMWZ128rmk; break; + case X86::VMOVDQU16Z256rmk: Opc = X86::VPBLENDMWZ256rmk; break; + case X86::VMOVDQU16Zrmk: Opc = X86::VPBLENDMWZrmk; break; + case X86::VMOVDQU32Z128rmk: Opc = X86::VPBLENDMDZ128rmk; break; + case X86::VMOVDQU32Z256rmk: Opc = X86::VPBLENDMDZ256rmk; break; + case X86::VMOVDQU32Zrmk: Opc = X86::VPBLENDMDZrmk; break; + case X86::VMOVDQU64Z128rmk: Opc = X86::VPBLENDMQZ128rmk; break; + case X86::VMOVDQU64Z256rmk: Opc = X86::VPBLENDMQZ256rmk; break; + case X86::VMOVDQU64Zrmk: Opc = X86::VPBLENDMQZrmk; break; + case X86::VMOVUPDZ128rmk: Opc = X86::VBLENDMPDZ128rmk; break; + case X86::VMOVUPDZ256rmk: Opc = X86::VBLENDMPDZ256rmk; break; + case X86::VMOVUPDZrmk: Opc = X86::VBLENDMPDZrmk; break; + case X86::VMOVUPSZ128rmk: Opc = X86::VBLENDMPSZ128rmk; break; + case X86::VMOVUPSZ256rmk: Opc = X86::VBLENDMPSZ256rmk; break; + case X86::VMOVUPSZrmk: Opc = X86::VBLENDMPSZrmk; break; + case X86::VMOVDQA32Z128rmk: Opc = X86::VPBLENDMDZ128rmk; break; + case X86::VMOVDQA32Z256rmk: Opc = X86::VPBLENDMDZ256rmk; break; + case X86::VMOVDQA32Zrmk: Opc = X86::VPBLENDMDZrmk; break; + case X86::VMOVDQA64Z128rmk: Opc = X86::VPBLENDMQZ128rmk; break; + case X86::VMOVDQA64Z256rmk: Opc = X86::VPBLENDMQZ256rmk; break; + case X86::VMOVDQA64Zrmk: Opc = X86::VPBLENDMQZrmk; break; + case X86::VMOVAPDZ128rmk: Opc = X86::VBLENDMPDZ128rmk; break; + case X86::VMOVAPDZ256rmk: Opc = X86::VBLENDMPDZ256rmk; break; + case X86::VMOVAPDZrmk: Opc = X86::VBLENDMPDZrmk; break; + case X86::VMOVAPSZ128rmk: Opc = X86::VBLENDMPSZ128rmk; break; + case X86::VMOVAPSZ256rmk: Opc = X86::VBLENDMPSZ256rmk; break; + case X86::VMOVAPSZrmk: Opc = X86::VBLENDMPSZrmk; break; + case X86::VBROADCASTSDZ256mk: Opc = X86::VBLENDMPDZ256rmbk; break; + case X86::VBROADCASTSDZmk: Opc = X86::VBLENDMPDZrmbk; break; + case X86::VBROADCASTSSZ128mk: Opc = X86::VBLENDMPSZ128rmbk; break; + case X86::VBROADCASTSSZ256mk: Opc = X86::VBLENDMPSZ256rmbk; break; + case X86::VBROADCASTSSZmk: Opc = X86::VBLENDMPSZrmbk; break; + case X86::VPBROADCASTDZ128mk: Opc = X86::VPBLENDMDZ128rmbk; break; + case X86::VPBROADCASTDZ256mk: Opc = X86::VPBLENDMDZ256rmbk; break; + case X86::VPBROADCASTDZmk: Opc = X86::VPBLENDMDZrmbk; break; + case X86::VPBROADCASTQZ128mk: Opc = X86::VPBLENDMQZ128rmbk; break; + case X86::VPBROADCASTQZ256mk: Opc = X86::VPBLENDMQZ256rmbk; break; + case X86::VPBROADCASTQZmk: Opc = X86::VPBLENDMQZrmbk; break; } NewMI = 
BuildMI(MF, MI.getDebugLoc(), get(Opc)) @@ -1187,6 +1250,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, .add(MI.getOperand(7)); break; } + case X86::VMOVDQU8Z128rrk: case X86::VMOVDQU8Z256rrk: case X86::VMOVDQU8Zrrk: @@ -1683,6 +1747,27 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI, return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false, OpIdx1, OpIdx2); } + case X86::VCMPSDZrr: + case X86::VCMPSSZrr: + case X86::VCMPPDZrri: + case X86::VCMPPSZrri: + case X86::VCMPPDZ128rri: + case X86::VCMPPSZ128rri: + case X86::VCMPPDZ256rri: + case X86::VCMPPSZ256rri: + case X86::VCMPPDZrrik: + case X86::VCMPPSZrrik: + case X86::VCMPPDZ128rrik: + case X86::VCMPPSZ128rrik: + case X86::VCMPPDZ256rrik: + case X86::VCMPPSZ256rrik: { + unsigned Imm = MI.getOperand(MI.getNumOperands() - 1).getImm() & 0x1f; + Imm = X86::getSwappedVCMPImm(Imm); + auto &WorkingMI = cloneIfNew(MI); + WorkingMI.getOperand(MI.getNumOperands() - 1).setImm(Imm); + return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false, + OpIdx1, OpIdx2); + } case X86::VPERM2F128rr: case X86::VPERM2I128rr: { // Flip permute source immediate. @@ -1859,7 +1944,7 @@ X86InstrInfo::findThreeSrcCommutedOpIndices(const MachineInstr &MI, // CommutableOpIdx2 is well defined now. Let's choose another commutable // operand and assign its index to CommutableOpIdx1. - unsigned Op2Reg = MI.getOperand(CommutableOpIdx2).getReg(); + Register Op2Reg = MI.getOperand(CommutableOpIdx2).getReg(); unsigned CommutableOpIdx1; for (CommutableOpIdx1 = LastCommutableVecOp; @@ -1889,7 +1974,8 @@ X86InstrInfo::findThreeSrcCommutedOpIndices(const MachineInstr &MI, return true; } -bool X86InstrInfo::findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx1, +bool X86InstrInfo::findCommutedOpIndices(const MachineInstr &MI, + unsigned &SrcOpIdx1, unsigned &SrcOpIdx2) const { const MCInstrDesc &Desc = MI.getDesc(); if (!Desc.isCommutable()) @@ -1926,17 +2012,23 @@ bool X86InstrInfo::findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx1, // Ordered/Unordered/Equal/NotEqual tests unsigned Imm = MI.getOperand(3 + OpOffset).getImm() & 0x7; switch (Imm) { + default: + // EVEX versions can be commuted. + if ((Desc.TSFlags & X86II::EncodingMask) == X86II::EVEX) + break; + return false; case 0x00: // EQUAL case 0x03: // UNORDERED case 0x04: // NOT EQUAL case 0x07: // ORDERED - // The indices of the commutable operands are 1 and 2 (or 2 and 3 - // when masked). - // Assign them to the returned operand indices here. - return fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, 1 + OpOffset, - 2 + OpOffset); + break; } - return false; + + // The indices of the commutable operands are 1 and 2 (or 2 and 3 + // when masked). + // Assign them to the returned operand indices here. + return fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, 1 + OpOffset, + 2 + OpOffset); } case X86::MOVSSrr: // X86::MOVSDrr is always commutable. 
MOVSS is only commutable if we can @@ -1990,6 +2082,24 @@ bool X86InstrInfo::findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx1, case X86::VPTERNLOGQZ256rmbikz: case X86::VPTERNLOGQZrmbikz: return findThreeSrcCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2); + case X86::VPDPWSSDZ128r: + case X86::VPDPWSSDZ128rk: + case X86::VPDPWSSDZ128rkz: + case X86::VPDPWSSDZ256r: + case X86::VPDPWSSDZ256rk: + case X86::VPDPWSSDZ256rkz: + case X86::VPDPWSSDZr: + case X86::VPDPWSSDZrk: + case X86::VPDPWSSDZrkz: + case X86::VPDPWSSDSZ128r: + case X86::VPDPWSSDSZ128rk: + case X86::VPDPWSSDSZ128rkz: + case X86::VPDPWSSDSZ256r: + case X86::VPDPWSSDSZ256rk: + case X86::VPDPWSSDSZ256rkz: + case X86::VPDPWSSDSZr: + case X86::VPDPWSSDSZrk: + case X86::VPDPWSSDSZrkz: case X86::VPMADD52HUQZ128r: case X86::VPMADD52HUQZ128rk: case X86::VPMADD52HUQZ128rkz: @@ -2215,7 +2325,7 @@ unsigned X86::getVPCMPImmForCond(ISD::CondCode CC) { } } -/// Get the VPCMP immediate if the opcodes are swapped. +/// Get the VPCMP immediate if the operands are swapped. unsigned X86::getSwappedVPCMPImm(unsigned Imm) { switch (Imm) { default: llvm_unreachable("Unreachable!"); @@ -2233,7 +2343,7 @@ unsigned X86::getSwappedVPCMPImm(unsigned Imm) { return Imm; } -/// Get the VPCOM immediate if the opcodes are swapped. +/// Get the VPCOM immediate if the operands are swapped. unsigned X86::getSwappedVPCOMImm(unsigned Imm) { switch (Imm) { default: llvm_unreachable("Unreachable!"); @@ -2251,6 +2361,23 @@ unsigned X86::getSwappedVPCOMImm(unsigned Imm) { return Imm; } +/// Get the VCMP immediate if the operands are swapped. +unsigned X86::getSwappedVCMPImm(unsigned Imm) { + // Only need the lower 2 bits to distinguish. + switch (Imm & 0x3) { + default: llvm_unreachable("Unreachable!"); + case 0x00: case 0x03: + // EQ/NE/TRUE/FALSE/ORD/UNORD don't change immediate when commuted. + break; + case 0x01: case 0x02: + // Need to toggle bits 3:0. Bit 4 stays the same.
+ Imm ^= 0xf; + break; + } + + return Imm; +} + bool X86InstrInfo::isUnpredicatedTerminator(const MachineInstr &MI) const { if (!MI.isTerminator()) return false; @@ -3131,25 +3258,6 @@ void X86InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, .addReg(SrcReg, getKillRegState(isKill)); } -void X86InstrInfo::storeRegToAddr( - MachineFunction &MF, unsigned SrcReg, bool isKill, - SmallVectorImpl &Addr, const TargetRegisterClass *RC, - ArrayRef MMOs, - SmallVectorImpl &NewMIs) const { - const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); - unsigned Alignment = std::max(TRI.getSpillSize(*RC), 16); - bool isAligned = !MMOs.empty() && MMOs.front()->getAlignment() >= Alignment; - unsigned Opc = getStoreRegOpcode(SrcReg, RC, isAligned, Subtarget); - DebugLoc DL; - MachineInstrBuilder MIB = BuildMI(MF, DL, get(Opc)); - for (unsigned i = 0, e = Addr.size(); i != e; ++i) - MIB.add(Addr[i]); - MIB.addReg(SrcReg, getKillRegState(isKill)); - MIB.setMemRefs(MMOs); - NewMIs.push_back(MIB); -} - - void X86InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, unsigned DestReg, int FrameIdx, @@ -3164,23 +3272,6 @@ void X86InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc), DestReg), FrameIdx); } -void X86InstrInfo::loadRegFromAddr( - MachineFunction &MF, unsigned DestReg, - SmallVectorImpl &Addr, const TargetRegisterClass *RC, - ArrayRef MMOs, - SmallVectorImpl &NewMIs) const { - const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); - unsigned Alignment = std::max(TRI.getSpillSize(*RC), 16); - bool isAligned = !MMOs.empty() && MMOs.front()->getAlignment() >= Alignment; - unsigned Opc = getLoadRegOpcode(DestReg, RC, isAligned, Subtarget); - DebugLoc DL; - MachineInstrBuilder MIB = BuildMI(MF, DL, get(Opc), DestReg); - for (unsigned i = 0, e = Addr.size(); i != e; ++i) - MIB.add(Addr[i]); - MIB.setMemRefs(MMOs); - NewMIs.push_back(MIB); -} - bool X86InstrInfo::analyzeCompare(const MachineInstr &MI, unsigned &SrcReg, unsigned &SrcReg2, int &CmpMask, int &CmpValue) const { @@ -3599,8 +3690,9 @@ bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg, if (!IsCmpZero && !Sub) return false; - bool IsSwapped = (SrcReg2 != 0 && Sub->getOperand(1).getReg() == SrcReg2 && - Sub->getOperand(2).getReg() == SrcReg); + bool IsSwapped = + (SrcReg2 != 0 && Sub && Sub->getOperand(1).getReg() == SrcReg2 && + Sub->getOperand(2).getReg() == SrcReg); // Scan forward from the instruction after CmpInstr for uses of EFLAGS. // It is safe to remove CmpInstr if EFLAGS is redefined or killed. @@ -3755,7 +3847,7 @@ MachineInstr *X86InstrInfo::optimizeLoadInstr(MachineInstr &MI, MachineOperand &MO = MI.getOperand(i); if (!MO.isReg()) continue; - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); if (Reg != FoldAsLoadDefReg) continue; // Do not fold if we have a subreg use or a def. 
@@ -3785,7 +3877,7 @@ MachineInstr *X86InstrInfo::optimizeLoadInstr(MachineInstr &MI, static bool Expand2AddrUndef(MachineInstrBuilder &MIB, const MCInstrDesc &Desc) { assert(Desc.getNumOperands() == 3 && "Expected two-addr instruction."); - unsigned Reg = MIB->getOperand(0).getReg(); + Register Reg = MIB->getOperand(0).getReg(); MIB->setDesc(Desc); // MachineInstr::addOperand() will insert explicit operands before any @@ -3815,7 +3907,7 @@ static bool expandMOV32r1(MachineInstrBuilder &MIB, const TargetInstrInfo &TII, bool MinusOne) { MachineBasicBlock &MBB = *MIB->getParent(); DebugLoc DL = MIB->getDebugLoc(); - unsigned Reg = MIB->getOperand(0).getReg(); + Register Reg = MIB->getOperand(0).getReg(); // Insert the XOR. BuildMI(MBB, MIB.getInstr(), DL, TII.get(X86::XOR32rr), Reg) @@ -3891,7 +3983,7 @@ static void expandLoadStackGuard(MachineInstrBuilder &MIB, const TargetInstrInfo &TII) { MachineBasicBlock &MBB = *MIB->getParent(); DebugLoc DL = MIB->getDebugLoc(); - unsigned Reg = MIB->getOperand(0).getReg(); + Register Reg = MIB->getOperand(0).getReg(); const GlobalValue *GV = cast((*MIB->memoperands_begin())->getValue()); auto Flags = MachineMemOperand::MOLoad | @@ -3929,7 +4021,7 @@ static bool expandNOVLXLoad(MachineInstrBuilder &MIB, const MCInstrDesc &LoadDesc, const MCInstrDesc &BroadcastDesc, unsigned SubIdx) { - unsigned DestReg = MIB->getOperand(0).getReg(); + Register DestReg = MIB->getOperand(0).getReg(); // Check if DestReg is XMM16-31 or YMM16-31. if (TRI->getEncodingValue(DestReg) < 16) { // We can use a normal VEX encoded load. @@ -3952,7 +4044,7 @@ static bool expandNOVLXStore(MachineInstrBuilder &MIB, const MCInstrDesc &StoreDesc, const MCInstrDesc &ExtractDesc, unsigned SubIdx) { - unsigned SrcReg = MIB->getOperand(X86::AddrNumOperands).getReg(); + Register SrcReg = MIB->getOperand(X86::AddrNumOperands).getReg(); // Check if DestReg is XMM16-31 or YMM16-31. if (TRI->getEncodingValue(SrcReg) < 16) { // We can use a normal VEX encoded store. @@ -4008,12 +4100,13 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const { case X86::V_SET0: case X86::FsFLD0SS: case X86::FsFLD0SD: + case X86::FsFLD0F128: return Expand2AddrUndef(MIB, get(HasAVX ? 
X86::VXORPSrr : X86::XORPSrr)); case X86::AVX_SET0: { assert(HasAVX && "AVX not supported"); const TargetRegisterInfo *TRI = &getRegisterInfo(); - unsigned SrcReg = MIB->getOperand(0).getReg(); - unsigned XReg = TRI->getSubReg(SrcReg, X86::sub_xmm); + Register SrcReg = MIB->getOperand(0).getReg(); + Register XReg = TRI->getSubReg(SrcReg, X86::sub_xmm); MIB->getOperand(0).setReg(XReg); Expand2AddrUndef(MIB, get(X86::VXORPSrr)); MIB.addReg(SrcReg, RegState::ImplicitDefine); @@ -4021,9 +4114,10 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const { } case X86::AVX512_128_SET0: case X86::AVX512_FsFLD0SS: - case X86::AVX512_FsFLD0SD: { + case X86::AVX512_FsFLD0SD: + case X86::AVX512_FsFLD0F128: { bool HasVLX = Subtarget.hasVLX(); - unsigned SrcReg = MIB->getOperand(0).getReg(); + Register SrcReg = MIB->getOperand(0).getReg(); const TargetRegisterInfo *TRI = &getRegisterInfo(); if (HasVLX || TRI->getEncodingValue(SrcReg) < 16) return Expand2AddrUndef(MIB, @@ -4037,10 +4131,10 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const { case X86::AVX512_256_SET0: case X86::AVX512_512_SET0: { bool HasVLX = Subtarget.hasVLX(); - unsigned SrcReg = MIB->getOperand(0).getReg(); + Register SrcReg = MIB->getOperand(0).getReg(); const TargetRegisterInfo *TRI = &getRegisterInfo(); if (HasVLX || TRI->getEncodingValue(SrcReg) < 16) { - unsigned XReg = TRI->getSubReg(SrcReg, X86::sub_xmm); + Register XReg = TRI->getSubReg(SrcReg, X86::sub_xmm); MIB->getOperand(0).setReg(XReg); Expand2AddrUndef(MIB, get(HasVLX ? X86::VPXORDZ128rr : X86::VXORPSrr)); @@ -4060,14 +4154,14 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const { case X86::AVX2_SETALLONES: return Expand2AddrUndef(MIB, get(X86::VPCMPEQDYrr)); case X86::AVX1_SETALLONES: { - unsigned Reg = MIB->getOperand(0).getReg(); + Register Reg = MIB->getOperand(0).getReg(); // VCMPPSYrri with an immediate 0xf should produce VCMPTRUEPS. MIB->setDesc(get(X86::VCMPPSYrri)); MIB.addReg(Reg, RegState::Undef).addReg(Reg, RegState::Undef).addImm(0xf); return true; } case X86::AVX512_512_SETALLONES: { - unsigned Reg = MIB->getOperand(0).getReg(); + Register Reg = MIB->getOperand(0).getReg(); MIB->setDesc(get(X86::VPTERNLOGDZrri)); // VPTERNLOGD needs 3 register inputs and an immediate. // 0xff will return 1s for any input. @@ -4077,8 +4171,8 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const { } case X86::AVX512_512_SEXT_MASK_32: case X86::AVX512_512_SEXT_MASK_64: { - unsigned Reg = MIB->getOperand(0).getReg(); - unsigned MaskReg = MIB->getOperand(1).getReg(); + Register Reg = MIB->getOperand(0).getReg(); + Register MaskReg = MIB->getOperand(1).getReg(); unsigned MaskState = getRegState(MIB->getOperand(1)); unsigned Opc = (MI.getOpcode() == X86::AVX512_512_SEXT_MASK_64) ? X86::VPTERNLOGQZrrikz : X86::VPTERNLOGDZrrikz; @@ -4115,8 +4209,8 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const { return expandNOVLXStore(MIB, &getRegisterInfo(), get(X86::VMOVUPSYmr), get(X86::VEXTRACTF64x4Zmr), X86::sub_ymm); case X86::MOV32ri64: { - unsigned Reg = MIB->getOperand(0).getReg(); - unsigned Reg32 = RI.getSubReg(Reg, X86::sub_32bit); + Register Reg = MIB->getOperand(0).getReg(); + Register Reg32 = RI.getSubReg(Reg, X86::sub_32bit); MI.setDesc(get(X86::MOV32ri)); MIB->getOperand(0).setReg(Reg32); MIB.addReg(Reg, RegState::ImplicitDefine); @@ -4251,8 +4345,8 @@ unsigned X86InstrInfo::getPartialRegUpdateClearance( // If MI is marked as reading Reg, the partial register update is wanted. 
const MachineOperand &MO = MI.getOperand(0); - unsigned Reg = MO.getReg(); - if (TargetRegisterInfo::isVirtualRegister(Reg)) { + Register Reg = MO.getReg(); + if (Register::isVirtualRegister(Reg)) { if (MO.readsReg() || MI.readsVirtualRegister(Reg)) return 0; } else { @@ -4268,7 +4362,10 @@ unsigned X86InstrInfo::getPartialRegUpdateClearance( // Return true for any instruction that copies the high bits of the first source // operand into the unused high bits of the destination operand. -static bool hasUndefRegUpdate(unsigned Opcode, bool ForLoadFold = false) { +static bool hasUndefRegUpdate(unsigned Opcode, unsigned &OpNum, + bool ForLoadFold = false) { + // Set the OpNum parameter to the first source operand. + OpNum = 1; switch (Opcode) { case X86::VCVTSI2SSrr: case X86::VCVTSI2SSrm: @@ -4427,6 +4524,14 @@ static bool hasUndefRegUpdate(unsigned Opcode, bool ForLoadFold = false) { case X86::VSQRTSDZm: case X86::VSQRTSDZm_Int: return true; + case X86::VMOVSSZrrk: + case X86::VMOVSDZrrk: + OpNum = 3; + return true; + case X86::VMOVSSZrrkz: + case X86::VMOVSDZrrkz: + OpNum = 2; + return true; } return false; @@ -4449,14 +4554,11 @@ static bool hasUndefRegUpdate(unsigned Opcode, bool ForLoadFold = false) { unsigned X86InstrInfo::getUndefRegClearance(const MachineInstr &MI, unsigned &OpNum, const TargetRegisterInfo *TRI) const { - if (!hasUndefRegUpdate(MI.getOpcode())) + if (!hasUndefRegUpdate(MI.getOpcode(), OpNum)) return 0; - // Set the OpNum parameter to the first source operand. - OpNum = 1; - const MachineOperand &MO = MI.getOperand(OpNum); - if (MO.isUndef() && TargetRegisterInfo::isPhysicalRegister(MO.getReg())) { + if (MO.isUndef() && Register::isPhysicalRegister(MO.getReg())) { return UndefRegClearance; } return 0; @@ -4464,7 +4566,7 @@ X86InstrInfo::getUndefRegClearance(const MachineInstr &MI, unsigned &OpNum, void X86InstrInfo::breakPartialRegDependency( MachineInstr &MI, unsigned OpNum, const TargetRegisterInfo *TRI) const { - unsigned Reg = MI.getOperand(OpNum).getReg(); + Register Reg = MI.getOperand(OpNum).getReg(); // If MI kills this register, the false dependence is already broken. if (MI.killsRegister(Reg, TRI)) return; @@ -4480,7 +4582,7 @@ void X86InstrInfo::breakPartialRegDependency( } else if (X86::VR256RegClass.contains(Reg)) { // Use vxorps to clear the full ymm register. // It wants to read and write the xmm sub-register. - unsigned XReg = TRI->getSubReg(Reg, X86::sub_xmm); + Register XReg = TRI->getSubReg(Reg, X86::sub_xmm); BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(X86::VXORPSrr), XReg) .addReg(XReg, RegState::Undef) .addReg(XReg, RegState::Undef) @@ -4489,7 +4591,7 @@ void X86InstrInfo::breakPartialRegDependency( } else if (X86::GR64RegClass.contains(Reg)) { // Using XOR32rr because it has shorter encoding and zeros up the upper bits // as well. - unsigned XReg = TRI->getSubReg(Reg, X86::sub_32bit); + Register XReg = TRI->getSubReg(Reg, X86::sub_32bit); BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(X86::XOR32rr), XReg) .addReg(XReg, RegState::Undef) .addReg(XReg, RegState::Undef) @@ -4538,8 +4640,8 @@ static void updateOperandRegConstraints(MachineFunction &MF, // We only need to update constraints on virtual register operands.
if (!MO.isReg()) continue; - unsigned Reg = MO.getReg(); - if (!TRI.isVirtualRegister(Reg)) + Register Reg = MO.getReg(); + if (!Register::isVirtualRegister(Reg)) continue; auto *NewRC = MRI.constrainRegClass( @@ -4698,7 +4800,8 @@ MachineInstr *X86InstrInfo::foldMemoryOperandCustom( static bool shouldPreventUndefRegUpdateMemFold(MachineFunction &MF, MachineInstr &MI) { - if (!hasUndefRegUpdate(MI.getOpcode(), /*ForLoadFold*/true) || + unsigned Ignored; + if (!hasUndefRegUpdate(MI.getOpcode(), Ignored, /*ForLoadFold*/true) || !MI.getOperand(1).isReg()) return false; @@ -4788,6 +4891,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl( if (I != nullptr) { unsigned Opcode = I->DstOp; unsigned MinAlign = (I->Flags & TB_ALIGN_MASK) >> TB_ALIGN_SHIFT; + MinAlign = MinAlign ? 1 << (MinAlign - 1) : 0; if (Align < MinAlign) return nullptr; bool NarrowToMOV32rm = false; @@ -4821,8 +4925,8 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl( // If this is the special case where we use a MOV32rm to load a 32-bit // value and zero-extend the top bits. Change the destination register // to a 32-bit one. - unsigned DstReg = NewMI->getOperand(0).getReg(); - if (TargetRegisterInfo::isPhysicalRegister(DstReg)) + Register DstReg = NewMI->getOperand(0).getReg(); + if (Register::isPhysicalRegister(DstReg)) NewMI->getOperand(0).setReg(RI.getSubReg(DstReg, X86::sub_32bit)); else NewMI->getOperand(0).setSubReg(X86::sub_32bit); @@ -5133,6 +5237,8 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl( case X86::V_SET0: case X86::V_SETALLONES: case X86::AVX512_128_SET0: + case X86::FsFLD0F128: + case X86::AVX512_FsFLD0F128: Alignment = 16; break; case X86::MMX_SET0: @@ -5182,7 +5288,9 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl( case X86::FsFLD0SD: case X86::AVX512_FsFLD0SD: case X86::FsFLD0SS: - case X86::AVX512_FsFLD0SS: { + case X86::AVX512_FsFLD0SS: + case X86::FsFLD0F128: + case X86::AVX512_FsFLD0F128: { // Folding a V_SET0 or V_SETALLONES as a load, to ease register pressure. // Create a constant-pool entry and operands to load from it. 
@@ -5212,6 +5320,8 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl( Ty = Type::getFloatTy(MF.getFunction().getContext()); else if (Opc == X86::FsFLD0SD || Opc == X86::AVX512_FsFLD0SD) Ty = Type::getDoubleTy(MF.getFunction().getContext()); + else if (Opc == X86::FsFLD0F128 || Opc == X86::AVX512_FsFLD0F128) + Ty = Type::getFP128Ty(MF.getFunction().getContext()); else if (Opc == X86::AVX512_512_SET0 || Opc == X86::AVX512_512_SETALLONES) Ty = VectorType::get(Type::getInt32Ty(MF.getFunction().getContext()),16); else if (Opc == X86::AVX2_SETALLONES || Opc == X86::AVX_SET0 || @@ -5293,6 +5403,51 @@ extractStoreMMOs(ArrayRef MMOs, MachineFunction &MF) { return StoreMMOs; } +static unsigned getBroadcastOpcode(const X86MemoryFoldTableEntry *I, + const TargetRegisterClass *RC, + const X86Subtarget &STI) { + assert(STI.hasAVX512() && "Expected at least AVX512!"); + unsigned SpillSize = STI.getRegisterInfo()->getSpillSize(*RC); + assert((SpillSize == 64 || STI.hasVLX()) && + "Can't broadcast less than 64 bytes without AVX512VL!"); + + switch (I->Flags & TB_BCAST_MASK) { + default: llvm_unreachable("Unexpected broadcast type!"); + case TB_BCAST_D: + switch (SpillSize) { + default: llvm_unreachable("Unknown spill size"); + case 16: return X86::VPBROADCASTDZ128m; + case 32: return X86::VPBROADCASTDZ256m; + case 64: return X86::VPBROADCASTDZm; + } + break; + case TB_BCAST_Q: + switch (SpillSize) { + default: llvm_unreachable("Unknown spill size"); + case 16: return X86::VPBROADCASTQZ128m; + case 32: return X86::VPBROADCASTQZ256m; + case 64: return X86::VPBROADCASTQZm; + } + break; + case TB_BCAST_SS: + switch (SpillSize) { + default: llvm_unreachable("Unknown spill size"); + case 16: return X86::VBROADCASTSSZ128m; + case 32: return X86::VBROADCASTSSZ256m; + case 64: return X86::VBROADCASTSSZm; + } + break; + case TB_BCAST_SD: + switch (SpillSize) { + default: llvm_unreachable("Unknown spill size"); + case 16: return X86::VMOVDDUPZ128rm; + case 32: return X86::VBROADCASTSDZ256m; + case 64: return X86::VBROADCASTSDZm; + } + break; + } +} + bool X86InstrInfo::unfoldMemoryOperand( MachineFunction &MF, MachineInstr &MI, unsigned Reg, bool UnfoldLoad, bool UnfoldStore, SmallVectorImpl &NewMIs) const { @@ -5303,6 +5458,7 @@ bool X86InstrInfo::unfoldMemoryOperand( unsigned Index = I->Flags & TB_INDEX_MASK; bool FoldedLoad = I->Flags & TB_FOLDED_LOAD; bool FoldedStore = I->Flags & TB_FOLDED_STORE; + bool FoldedBCast = I->Flags & TB_FOLDED_BCAST; if (UnfoldLoad && !FoldedLoad) return false; UnfoldLoad &= FoldedLoad; @@ -5311,7 +5467,9 @@ bool X86InstrInfo::unfoldMemoryOperand( UnfoldStore &= FoldedStore; const MCInstrDesc &MCID = get(Opc); + const TargetRegisterClass *RC = getRegClass(MCID, Index, &RI, MF); + const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); // TODO: Check if 32-byte or greater accesses are slow too? if (!MI.hasOneMemOperand() && RC == &X86::VR128RegClass && Subtarget.isUnalignedMem16Slow()) @@ -5335,10 +5493,26 @@ bool X86InstrInfo::unfoldMemoryOperand( AfterOps.push_back(Op); } - // Emit the load instruction. + // Emit the load or broadcast instruction. 
if (UnfoldLoad) { auto MMOs = extractLoadMMOs(MI.memoperands(), MF); - loadRegFromAddr(MF, Reg, AddrOps, RC, MMOs, NewMIs); + + unsigned Opc; + if (FoldedBCast) { + Opc = getBroadcastOpcode(I, RC, Subtarget); + } else { + unsigned Alignment = std::max(TRI.getSpillSize(*RC), 16); + bool isAligned = !MMOs.empty() && MMOs.front()->getAlignment() >= Alignment; + Opc = getLoadRegOpcode(Reg, RC, isAligned, Subtarget); + } + + DebugLoc DL; + MachineInstrBuilder MIB = BuildMI(MF, DL, get(Opc), Reg); + for (unsigned i = 0, e = AddrOps.size(); i != e; ++i) + MIB.add(AddrOps[i]); + MIB.setMemRefs(MMOs); + NewMIs.push_back(MIB); + if (UnfoldStore) { // Address operands cannot be marked isKill. for (unsigned i = 1; i != 1 + X86::AddrNumOperands; ++i) { @@ -5404,7 +5578,16 @@ bool X86InstrInfo::unfoldMemoryOperand( if (UnfoldStore) { const TargetRegisterClass *DstRC = getRegClass(MCID, 0, &RI, MF); auto MMOs = extractStoreMMOs(MI.memoperands(), MF); - storeRegToAddr(MF, Reg, true, AddrOps, DstRC, MMOs, NewMIs); + unsigned Alignment = std::max(TRI.getSpillSize(*DstRC), 16); + bool isAligned = !MMOs.empty() && MMOs.front()->getAlignment() >= Alignment; + unsigned Opc = getStoreRegOpcode(Reg, DstRC, isAligned, Subtarget); + DebugLoc DL; + MachineInstrBuilder MIB = BuildMI(MF, DL, get(Opc)); + for (unsigned i = 0, e = AddrOps.size(); i != e; ++i) + MIB.add(AddrOps[i]); + MIB.addReg(Reg, RegState::Kill); + MIB.setMemRefs(MMOs); + NewMIs.push_back(MIB); } return true; @@ -5423,6 +5606,7 @@ X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N, unsigned Index = I->Flags & TB_INDEX_MASK; bool FoldedLoad = I->Flags & TB_FOLDED_LOAD; bool FoldedStore = I->Flags & TB_FOLDED_STORE; + bool FoldedBCast = I->Flags & TB_FOLDED_BCAST; const MCInstrDesc &MCID = get(Opc); MachineFunction &MF = DAG.getMachineFunction(); const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); @@ -5456,10 +5640,17 @@ X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N, return false; // FIXME: If a VR128 can have size 32, we should be checking if a 32-byte // memory access is slow above. - unsigned Alignment = std::max(TRI.getSpillSize(*RC), 16); - bool isAligned = !MMOs.empty() && MMOs.front()->getAlignment() >= Alignment; - Load = DAG.getMachineNode(getLoadRegOpcode(0, RC, isAligned, Subtarget), dl, - VT, MVT::Other, AddrOps); + + unsigned Opc; + if (FoldedBCast) { + Opc = getBroadcastOpcode(I, RC, Subtarget); + } else { + unsigned Alignment = std::max(TRI.getSpillSize(*RC), 16); + bool isAligned = !MMOs.empty() && MMOs.front()->getAlignment() >= Alignment; + Opc = getLoadRegOpcode(0, RC, isAligned, Subtarget); + } + + Load = DAG.getMachineNode(Opc, dl, VT, MVT::Other, AddrOps); NewNodes.push_back(Load); // Preserve memory reference information. @@ -7367,6 +7558,96 @@ bool X86InstrInfo::isAssociativeAndCommutative(const MachineInstr &Inst) const { } } +Optional +X86InstrInfo::describeLoadedValue(const MachineInstr &MI) const { + const MachineOperand *Op = nullptr; + DIExpression *Expr = nullptr; + + switch (MI.getOpcode()) { + case X86::LEA32r: + case X86::LEA64r: + case X86::LEA64_32r: { + // Operand 4 could be a global address. For now we do not support + // such a situation.
+ if (!MI.getOperand(4).isImm() || !MI.getOperand(2).isImm()) + return None; + + const MachineOperand &Op1 = MI.getOperand(1); + const MachineOperand &Op2 = MI.getOperand(3); + const TargetRegisterInfo *TRI = &getRegisterInfo(); + assert(Op2.isReg() && (Op2.getReg() == X86::NoRegister || + Register::isPhysicalRegister(Op2.getReg()))); + + // Omit situations like: + // %rsi = lea %rsi, 4, ... + if ((Op1.isReg() && Op1.getReg() == MI.getOperand(0).getReg()) || + Op2.getReg() == MI.getOperand(0).getReg()) + return None; + else if ((Op1.isReg() && Op1.getReg() != X86::NoRegister && + TRI->regsOverlap(Op1.getReg(), MI.getOperand(0).getReg())) || + (Op2.getReg() != X86::NoRegister && + TRI->regsOverlap(Op2.getReg(), MI.getOperand(0).getReg()))) + return None; + + int64_t Coef = MI.getOperand(2).getImm(); + int64_t Offset = MI.getOperand(4).getImm(); + SmallVector Ops; + + if ((Op1.isReg() && Op1.getReg() != X86::NoRegister)) { + Op = &Op1; + } else if (Op1.isFI()) + Op = &Op1; + + if (Op && Op->isReg() && Op->getReg() == Op2.getReg() && Coef > 0) { + Ops.push_back(dwarf::DW_OP_constu); + Ops.push_back(Coef + 1); + Ops.push_back(dwarf::DW_OP_mul); + } else { + if (Op && Op2.getReg() != X86::NoRegister) { + int dwarfReg = TRI->getDwarfRegNum(Op2.getReg(), false); + if (dwarfReg < 0) + return None; + else if (dwarfReg < 32) { + Ops.push_back(dwarf::DW_OP_breg0 + dwarfReg); + Ops.push_back(0); + } else { + Ops.push_back(dwarf::DW_OP_bregx); + Ops.push_back(dwarfReg); + Ops.push_back(0); + } + } else if (!Op) { + assert(Op2.getReg() != X86::NoRegister); + Op = &Op2; + } + + if (Coef > 1) { + assert(Op2.getReg() != X86::NoRegister); + Ops.push_back(dwarf::DW_OP_constu); + Ops.push_back(Coef); + Ops.push_back(dwarf::DW_OP_mul); + } + + if (((Op1.isReg() && Op1.getReg() != X86::NoRegister) || Op1.isFI()) && + Op2.getReg() != X86::NoRegister) { + Ops.push_back(dwarf::DW_OP_plus); + } + } + + DIExpression::appendOffset(Ops, Offset); + Expr = DIExpression::get(MI.getMF()->getFunction().getContext(), Ops); + + return ParamLoadedValue(*Op, Expr); + } + case X86::XOR32rr: { + if (MI.getOperand(1).getReg() == MI.getOperand(2).getReg()) + return ParamLoadedValue(MachineOperand::CreateImm(0), Expr); + return None; + } + default: + return TargetInstrInfo::describeLoadedValue(MI); + } +} + /// This is an architecture-specific helper function of reassociateOps. /// Set special operand attributes for new instructions after reassociation. void X86InstrInfo::setSpecialOperandAttr(MachineInstr &OldMI1, @@ -7500,9 +7781,8 @@ namespace { // movq $_GLOBAL_OFFSET_TABLE_ - .LN$pb, %rcx // addq %rcx, %rax // RAX now holds address of _GLOBAL_OFFSET_TABLE_. - unsigned PBReg = RegInfo.createVirtualRegister(&X86::GR64RegClass); - unsigned GOTReg = - RegInfo.createVirtualRegister(&X86::GR64RegClass); + Register PBReg = RegInfo.createVirtualRegister(&X86::GR64RegClass); + Register GOTReg = RegInfo.createVirtualRegister(&X86::GR64RegClass); BuildMI(FirstMBB, MBBI, DL, TII->get(X86::LEA64r), PBReg) .addReg(X86::RIP) .addImm(0) diff --git a/lib/Target/X86/X86InstrInfo.h b/lib/Target/X86/X86InstrInfo.h index 13ca17139494..22b7b1d4cb19 100644 --- a/lib/Target/X86/X86InstrInfo.h +++ b/lib/Target/X86/X86InstrInfo.h @@ -67,6 +67,9 @@ unsigned getSwappedVPCMPImm(unsigned Imm); /// Get the VPCOM immediate if the opcodes are swapped. unsigned getSwappedVPCOMImm(unsigned Imm); +/// Get the VCMP immediate if the opcodes are swapped.
+unsigned getSwappedVCMPImm(unsigned Imm); + } // namespace X86 /// isGlobalStubReference - Return true if the specified TargetFlag operand is @@ -203,7 +206,7 @@ public: int &FrameIndex) const override; bool isReallyTriviallyReMaterializable(const MachineInstr &MI, - AliasAnalysis *AA) const override; + AAResults *AA) const override; void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, unsigned DestReg, unsigned SubIdx, const MachineInstr &Orig, @@ -218,7 +221,7 @@ public: /// Reference parameters are set to indicate how caller should add this /// operand to the LEA instruction. bool classifyLEAReg(MachineInstr &MI, const MachineOperand &Src, - unsigned LEAOpcode, bool AllowSP, unsigned &NewSrc, + unsigned LEAOpcode, bool AllowSP, Register &NewSrc, bool &isKill, MachineOperand &ImplicitOp, LiveVariables *LV) const; @@ -251,7 +254,7 @@ public: /// findCommutedOpIndices(MI, Op1, Op2); /// can be interpreted as a query asking to find an operand that would be /// commutable with the operand#1. - bool findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx1, + bool findCommutedOpIndices(const MachineInstr &MI, unsigned &SrcOpIdx1, unsigned &SrcOpIdx2) const override; /// Returns an adjusted FMA opcode that must be used in FMA instruction that @@ -317,23 +320,11 @@ public: const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const override; - void storeRegToAddr(MachineFunction &MF, unsigned SrcReg, bool isKill, - SmallVectorImpl &Addr, - const TargetRegisterClass *RC, - ArrayRef MMOs, - SmallVectorImpl &NewMIs) const; - void loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, unsigned DestReg, int FrameIndex, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const override; - void loadRegFromAddr(MachineFunction &MF, unsigned DestReg, - SmallVectorImpl &Addr, - const TargetRegisterClass *RC, - ArrayRef MMOs, - SmallVectorImpl &NewMIs) const; - bool expandPostRAPseudo(MachineInstr &MI) const override; /// Check whether the target can fold a load that feeds a subreg operand @@ -527,6 +518,13 @@ public: #define GET_INSTRINFO_HELPER_DECLS #include "X86GenInstrInfo.inc" + static bool hasLockPrefix(const MachineInstr &MI) { + return MI.getDesc().TSFlags & X86II::LOCK; + } + + Optional + describeLoadedValue(const MachineInstr &MI) const override; + protected: /// Commutes the operands in the given instruction by changing the operands /// order and/or changing the instruction's opcode and/or the immediate value diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td index 8e05dd8ec5c1..e452145f3b65 100644 --- a/lib/Target/X86/X86InstrInfo.td +++ b/lib/Target/X86/X86InstrInfo.td @@ -673,6 +673,14 @@ def ImmSExti64i8AsmOperand : ImmSExtAsmOperandClass { ImmSExti64i32AsmOperand]; } +// 4-bit immediate used by some XOP instructions +// [0, 0xF] +def ImmUnsignedi4AsmOperand : AsmOperandClass { + let Name = "ImmUnsignedi4"; + let RenderMethod = "addImmOperands"; + let DiagnosticType = "InvalidImmUnsignedi4"; +} + // Unsigned immediate used by SSE/AVX instructions // [0, 0xFF] // [0xFFFFFFFFFFFFFF80, 0xFFFFFFFFFFFFFFFF] @@ -705,6 +713,13 @@ def i64i8imm : Operand { let OperandType = "OPERAND_IMMEDIATE"; } +// Unsigned 4-bit immediate used by some XOP instructions. +def u4imm : Operand { + let PrintMethod = "printU8Imm"; + let ParserMatchClass = ImmUnsignedi4AsmOperand; + let OperandType = "OPERAND_IMMEDIATE"; +} + // Unsigned 8-bit immediate used by SSE/AVX instructions. 
def u8imm : Operand { let PrintMethod = "printU8Imm"; @@ -925,7 +940,6 @@ def HasMOVDIR64B : Predicate<"Subtarget->hasMOVDIR64B()">; def HasPTWRITE : Predicate<"Subtarget->hasPTWRITE()">; def FPStackf32 : Predicate<"!Subtarget->hasSSE1()">; def FPStackf64 : Predicate<"!Subtarget->hasSSE2()">; -def HasMPX : Predicate<"Subtarget->hasMPX()">; def HasSHSTK : Predicate<"Subtarget->hasSHSTK()">; def HasCLFLUSHOPT : Predicate<"Subtarget->hasCLFLUSHOPT()">; def HasCLWB : Predicate<"Subtarget->hasCLWB()">; @@ -1103,7 +1117,7 @@ def loadi16 : PatFrag<(ops node:$ptr), (i16 (unindexedload node:$ptr)), [{ if (ExtType == ISD::NON_EXTLOAD) return true; if (ExtType == ISD::EXTLOAD) - return LD->getAlignment() >= 2 && !LD->isVolatile(); + return LD->getAlignment() >= 2 && LD->isSimple(); return false; }]>; @@ -1113,7 +1127,7 @@ def loadi32 : PatFrag<(ops node:$ptr), (i32 (unindexedload node:$ptr)), [{ if (ExtType == ISD::NON_EXTLOAD) return true; if (ExtType == ISD::EXTLOAD) - return LD->getAlignment() >= 4 && !LD->isVolatile(); + return LD->getAlignment() >= 4 && LD->isSimple(); return false; }]>; @@ -1170,7 +1184,7 @@ def extloadi64i32 : PatFrag<(ops node:$ptr), (i64 (unindexedload node:$ptr)), [ if (LD->getMemoryVT() == MVT::i32) return true; - return LD->getAlignment() >= 4 && !LD->isVolatile(); + return LD->getAlignment() >= 4 && LD->isSimple(); }]>; @@ -2404,25 +2418,26 @@ let Predicates = [HasBMI], Defs = [EFLAGS] in { } multiclass bmi_bls { + RegisterClass RC, X86MemOperand x86memop, + X86FoldableSchedWrite sched> { let hasSideEffects = 0 in { def rr : I<0xF3, RegMRM, (outs RC:$dst), (ins RC:$src), !strconcat(mnemonic, "\t{$src, $dst|$dst, $src}"), []>, - T8PS, VEX_4V, Sched<[WriteBLS]>; + T8PS, VEX_4V, Sched<[sched]>; let mayLoad = 1 in def rm : I<0xF3, MemMRM, (outs RC:$dst), (ins x86memop:$src), !strconcat(mnemonic, "\t{$src, $dst|$dst, $src}"), []>, - T8PS, VEX_4V, Sched<[WriteBLS.Folded]>; + T8PS, VEX_4V, Sched<[sched.Folded]>; } } let Predicates = [HasBMI], Defs = [EFLAGS] in { - defm BLSR32 : bmi_bls<"blsr{l}", MRM1r, MRM1m, GR32, i32mem>; - defm BLSR64 : bmi_bls<"blsr{q}", MRM1r, MRM1m, GR64, i64mem>, VEX_W; - defm BLSMSK32 : bmi_bls<"blsmsk{l}", MRM2r, MRM2m, GR32, i32mem>; - defm BLSMSK64 : bmi_bls<"blsmsk{q}", MRM2r, MRM2m, GR64, i64mem>, VEX_W; - defm BLSI32 : bmi_bls<"blsi{l}", MRM3r, MRM3m, GR32, i32mem>; - defm BLSI64 : bmi_bls<"blsi{q}", MRM3r, MRM3m, GR64, i64mem>, VEX_W; + defm BLSR32 : bmi_bls<"blsr{l}", MRM1r, MRM1m, GR32, i32mem, WriteBLS>; + defm BLSR64 : bmi_bls<"blsr{q}", MRM1r, MRM1m, GR64, i64mem, WriteBLS>, VEX_W; + defm BLSMSK32 : bmi_bls<"blsmsk{l}", MRM2r, MRM2m, GR32, i32mem, WriteBLS>; + defm BLSMSK64 : bmi_bls<"blsmsk{q}", MRM2r, MRM2m, GR64, i64mem, WriteBLS>, VEX_W; + defm BLSI32 : bmi_bls<"blsi{l}", MRM3r, MRM3m, GR32, i32mem, WriteBLS>; + defm BLSI64 : bmi_bls<"blsi{q}", MRM3r, MRM3m, GR64, i64mem, WriteBLS>, VEX_W; } //===----------------------------------------------------------------------===// @@ -2683,12 +2698,12 @@ def SLWPCB64 : I<0x12, MRM1r, (outs GR64:$dst), (ins), "slwpcb\t$dst", multiclass lwpins_intr { def rri : Ii32<0x12, MRM0r, (outs), (ins RC:$src0, GR32:$src1, i32imm:$cntl), "lwpins\t{$cntl, $src1, $src0|$src0, $src1, $cntl}", - [(set EFLAGS, (X86lwpins RC:$src0, GR32:$src1, imm:$cntl))]>, + [(set EFLAGS, (X86lwpins RC:$src0, GR32:$src1, timm:$cntl))]>, XOP_4V, XOPA; let mayLoad = 1 in def rmi : Ii32<0x12, MRM0m, (outs), (ins RC:$src0, i32mem:$src1, i32imm:$cntl), "lwpins\t{$cntl, $src1, $src0|$src0, $src1, $cntl}", - [(set EFLAGS, (X86lwpins 
RC:$src0, (loadi32 addr:$src1), imm:$cntl))]>, + [(set EFLAGS, (X86lwpins RC:$src0, (loadi32 addr:$src1), timm:$cntl))]>, XOP_4V, XOPA; } @@ -2700,11 +2715,11 @@ let Defs = [EFLAGS] in { multiclass lwpval_intr { def rri : Ii32<0x12, MRM1r, (outs), (ins RC:$src0, GR32:$src1, i32imm:$cntl), "lwpval\t{$cntl, $src1, $src0|$src0, $src1, $cntl}", - [(Int RC:$src0, GR32:$src1, imm:$cntl)]>, XOP_4V, XOPA; + [(Int RC:$src0, GR32:$src1, timm:$cntl)]>, XOP_4V, XOPA; let mayLoad = 1 in def rmi : Ii32<0x12, MRM1m, (outs), (ins RC:$src0, i32mem:$src1, i32imm:$cntl), "lwpval\t{$cntl, $src1, $src0|$src0, $src1, $cntl}", - [(Int RC:$src0, (loadi32 addr:$src1), imm:$cntl)]>, + [(Int RC:$src0, (loadi32 addr:$src1), timm:$cntl)]>, XOP_4V, XOPA; } @@ -3205,13 +3220,13 @@ def : InstAlias<"aam", (AAM8i8 10)>, Requires<[Not64BitMode]>; // Disambiguate the mem/imm form of bt-without-a-suffix as btl. // Likewise for btc/btr/bts. def : InstAlias<"bt\t{$imm, $mem|$mem, $imm}", - (BT32mi8 i32mem:$mem, i32i8imm:$imm), 0, "att">; + (BT32mi8 i32mem:$mem, i32u8imm:$imm), 0, "att">; def : InstAlias<"btc\t{$imm, $mem|$mem, $imm}", - (BTC32mi8 i32mem:$mem, i32i8imm:$imm), 0, "att">; + (BTC32mi8 i32mem:$mem, i32u8imm:$imm), 0, "att">; def : InstAlias<"btr\t{$imm, $mem|$mem, $imm}", - (BTR32mi8 i32mem:$mem, i32i8imm:$imm), 0, "att">; + (BTR32mi8 i32mem:$mem, i32u8imm:$imm), 0, "att">; def : InstAlias<"bts\t{$imm, $mem|$mem, $imm}", - (BTS32mi8 i32mem:$mem, i32i8imm:$imm), 0, "att">; + (BTS32mi8 i32mem:$mem, i32u8imm:$imm), 0, "att">; // clr aliases. def : InstAlias<"clr{b}\t$reg", (XOR8rr GR8 :$reg, GR8 :$reg), 0>; diff --git a/lib/Target/X86/X86InstrMMX.td b/lib/Target/X86/X86InstrMMX.td index 57835b1a256a..cd9a866c91cb 100644 --- a/lib/Target/X86/X86InstrMMX.td +++ b/lib/Target/X86/X86InstrMMX.td @@ -30,7 +30,6 @@ def MMX_SET0 : I<0, Pseudo, (outs VR64:$dst), (ins), "", []>; let Constraints = "$src1 = $dst" in { // MMXI_binop_rm_int - Simple MMX binary operator based on intrinsic. - // When this is cleaned up, remove the FIXME from X86RecognizableInstr.cpp. 
multiclass MMXI_binop_rm_int opc, string OpcodeStr, Intrinsic IntId, X86FoldableSchedWrite sched, bit Commutable = 0, X86MemOperand OType = i64mem> { @@ -67,7 +66,7 @@ let Constraints = "$src1 = $dst" in { def ri : MMXIi8, + [(set VR64:$dst, (IntId2 VR64:$src1, timm:$src2))]>, Sched<[schedImm]>; } } @@ -114,13 +113,13 @@ multiclass ssse3_palign_mm, + [(set VR64:$dst, (IntId VR64:$src1, VR64:$src2, (i8 timm:$src3)))]>, Sched<[sched]>; def rmi : MMXSS3AI<0x0F, MRMSrcMem, (outs VR64:$dst), (ins VR64:$src1, i64mem:$src2, u8imm:$src3), !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), [(set VR64:$dst, (IntId VR64:$src1, - (bitconvert (load_mmx addr:$src2)), (i8 imm:$src3)))]>, + (bitconvert (load_mmx addr:$src2)), (i8 timm:$src3)))]>, Sched<[sched.Folded, sched.ReadAfterFold]>; } @@ -496,14 +495,14 @@ def MMX_PSHUFWri : MMXIi8<0x70, MRMSrcReg, (outs VR64:$dst), (ins VR64:$src1, u8imm:$src2), "pshufw\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set VR64:$dst, - (int_x86_sse_pshuf_w VR64:$src1, imm:$src2))]>, + (int_x86_sse_pshuf_w VR64:$src1, timm:$src2))]>, Sched<[SchedWriteShuffle.MMX]>; def MMX_PSHUFWmi : MMXIi8<0x70, MRMSrcMem, (outs VR64:$dst), (ins i64mem:$src1, u8imm:$src2), "pshufw\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set VR64:$dst, (int_x86_sse_pshuf_w (load_mmx addr:$src1), - imm:$src2))]>, + timm:$src2))]>, Sched<[SchedWriteShuffle.MMX.Folded]>; // -- Conversion Instructions @@ -535,7 +534,7 @@ def MMX_PEXTRWrr: MMXIi8<0xC5, MRMSrcReg, (outs GR32orGR64:$dst), (ins VR64:$src1, i32u8imm:$src2), "pextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set GR32orGR64:$dst, (int_x86_mmx_pextr_w VR64:$src1, - imm:$src2))]>, + timm:$src2))]>, Sched<[WriteVecExtract]>; let Constraints = "$src1 = $dst" in { let Predicates = [HasMMX, HasSSE1] in { @@ -544,7 +543,7 @@ let Predicates = [HasMMX, HasSSE1] in { (ins VR64:$src1, GR32orGR64:$src2, i32u8imm:$src3), "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}", [(set VR64:$dst, (int_x86_mmx_pinsr_w VR64:$src1, - GR32orGR64:$src2, imm:$src3))]>, + GR32orGR64:$src2, timm:$src3))]>, Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>; def MMX_PINSRWrm : MMXIi8<0xC4, MRMSrcMem, @@ -553,7 +552,7 @@ let Predicates = [HasMMX, HasSSE1] in { "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}", [(set VR64:$dst, (int_x86_mmx_pinsr_w VR64:$src1, (i32 (anyext (loadi16 addr:$src2))), - imm:$src3))]>, + timm:$src3))]>, Sched<[WriteVecInsert.Folded, WriteVecInsert.ReadAfterFold]>; } } @@ -567,6 +566,13 @@ def MMX_PMOVMSKBrr : MMXI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst), (int_x86_mmx_pmovmskb VR64:$src))]>, Sched<[WriteMMXMOVMSK]>; +// MMX to XMM for vector types +def MMX_X86movq2dq : SDNode<"X86ISD::MOVQ2DQ", SDTypeProfile<1, 1, + [SDTCisVT<0, v2i64>, SDTCisVT<1, x86mmx>]>>; + +def : Pat<(v2i64 (MMX_X86movq2dq VR64:$src)), + (v2i64 (MMX_MOVQ2DQrr VR64:$src))>; + // Low word of XMM to MMX. def MMX_X86movdq2q : SDNode<"X86ISD::MOVDQ2Q", SDTypeProfile<1, 1, [SDTCisVT<0, x86mmx>, SDTCisVT<1, v2i64>]>>; @@ -574,9 +580,13 @@ def MMX_X86movdq2q : SDNode<"X86ISD::MOVDQ2Q", SDTypeProfile<1, 1, def : Pat<(x86mmx (MMX_X86movdq2q VR128:$src)), (x86mmx (MMX_MOVDQ2Qrr VR128:$src))>; -def : Pat<(x86mmx (MMX_X86movdq2q (loadv2i64 addr:$src))), +def : Pat<(x86mmx (MMX_X86movdq2q (v2i64 (simple_load addr:$src)))), (x86mmx (MMX_MOVQ64rm addr:$src))>; +def : Pat<(v2i64 (X86vzmovl (scalar_to_vector + (i64 (bitconvert (x86mmx VR64:$src)))))), + (MMX_MOVQ2DQrr VR64:$src)>; + // Misc. 
let SchedRW = [SchedWriteShuffle.MMX] in { let Uses = [EDI], Predicates = [HasMMX, HasSSE1,Not64BitMode] in @@ -601,9 +611,6 @@ def : Pat<(x86mmx (MMX_X86movdq2q def : Pat<(x86mmx (MMX_X86movdq2q (bc_v2i64 (v4i32 (X86cvttp2si (v4f32 VR128:$src)))))), (MMX_CVTTPS2PIirr VR128:$src)>; -def : Pat<(x86mmx (MMX_X86movdq2q - (bc_v2i64 (v4i32 (fp_to_sint (v4f32 VR128:$src)))))), - (MMX_CVTTPS2PIirr VR128:$src)>; def : Pat<(x86mmx (MMX_X86movdq2q (bc_v2i64 (v4i32 (X86cvtp2Int (v2f64 VR128:$src)))))), (MMX_CVTPD2PIirr VR128:$src)>; diff --git a/lib/Target/X86/X86InstrMPX.td b/lib/Target/X86/X86InstrMPX.td index f7d931510fe2..44ba071947c2 100644 --- a/lib/Target/X86/X86InstrMPX.td +++ b/lib/Target/X86/X86InstrMPX.td @@ -12,16 +12,16 @@ // //===----------------------------------------------------------------------===// -// FIXME: Investigate a better scheduler class once MPX is used inside LLVM. +// FIXME: Investigate a better scheduler class if MPX is ever used inside LLVM. let SchedRW = [WriteSystem] in { multiclass mpx_bound_make opc, string OpcodeStr> { def 32rm: I, - Requires<[HasMPX, Not64BitMode]>; + Requires<[Not64BitMode]>; def 64rm: I, - Requires<[HasMPX, In64BitMode]>; + Requires<[In64BitMode]>; } defm BNDMK : mpx_bound_make<0x1B, "bndmk">, XS; @@ -29,17 +29,17 @@ defm BNDMK : mpx_bound_make<0x1B, "bndmk">, XS; multiclass mpx_bound_check opc, string OpcodeStr> { def 32rm: I, - Requires<[HasMPX, Not64BitMode]>; + Requires<[Not64BitMode]>; def 64rm: I, - Requires<[HasMPX, In64BitMode]>; + Requires<[In64BitMode]>; def 32rr: I, - Requires<[HasMPX, Not64BitMode]>; + Requires<[Not64BitMode]>; def 64rr: I, - Requires<[HasMPX, In64BitMode]>; + Requires<[In64BitMode]>; } defm BNDCL : mpx_bound_check<0x1A, "bndcl">, XS, NotMemoryFoldable; defm BNDCU : mpx_bound_check<0x1A, "bndcu">, XD, NotMemoryFoldable; @@ -47,33 +47,31 @@ defm BNDCN : mpx_bound_check<0x1B, "bndcn">, XD, NotMemoryFoldable; def BNDMOVrr : I<0x1A, MRMSrcReg, (outs BNDR:$dst), (ins BNDR:$src), "bndmov\t{$src, $dst|$dst, $src}", []>, PD, - Requires<[HasMPX]>, NotMemoryFoldable; + NotMemoryFoldable; let mayLoad = 1 in { def BNDMOV32rm : I<0x1A, MRMSrcMem, (outs BNDR:$dst), (ins i64mem:$src), "bndmov\t{$src, $dst|$dst, $src}", []>, PD, - Requires<[HasMPX, Not64BitMode]>, NotMemoryFoldable; + Requires<[Not64BitMode]>, NotMemoryFoldable; def BNDMOV64rm : I<0x1A, MRMSrcMem, (outs BNDR:$dst), (ins i128mem:$src), "bndmov\t{$src, $dst|$dst, $src}", []>, PD, - Requires<[HasMPX, In64BitMode]>, NotMemoryFoldable; + Requires<[In64BitMode]>, NotMemoryFoldable; } let isCodeGenOnly = 1, ForceDisassemble = 1 in def BNDMOVrr_REV : I<0x1B, MRMDestReg, (outs BNDR:$dst), (ins BNDR:$src), "bndmov\t{$src, $dst|$dst, $src}", []>, PD, - Requires<[HasMPX]>, NotMemoryFoldable; + NotMemoryFoldable; let mayStore = 1 in { def BNDMOV32mr : I<0x1B, MRMDestMem, (outs), (ins i64mem:$dst, BNDR:$src), "bndmov\t{$src, $dst|$dst, $src}", []>, PD, - Requires<[HasMPX, Not64BitMode]>, NotMemoryFoldable; + Requires<[Not64BitMode]>, NotMemoryFoldable; def BNDMOV64mr : I<0x1B, MRMDestMem, (outs), (ins i128mem:$dst, BNDR:$src), "bndmov\t{$src, $dst|$dst, $src}", []>, PD, - Requires<[HasMPX, In64BitMode]>, NotMemoryFoldable; + Requires<[In64BitMode]>, NotMemoryFoldable; def BNDSTXmr: I<0x1B, MRMDestMem, (outs), (ins anymem:$dst, BNDR:$src), - "bndstx\t{$src, $dst|$dst, $src}", []>, PS, - Requires<[HasMPX]>; + "bndstx\t{$src, $dst|$dst, $src}", []>, PS; } let mayLoad = 1 in def BNDLDXrm: I<0x1A, MRMSrcMem, (outs BNDR:$dst), (ins anymem:$src), - "bndldx\t{$src, $dst|$dst, $src}", 
[]>, PS, - Requires<[HasMPX]>; + "bndldx\t{$src, $dst|$dst, $src}", []>, PS; } // SchedRW diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td index 7d0a5b87baf4..09a04c0338b4 100644 --- a/lib/Target/X86/X86InstrSSE.td +++ b/lib/Target/X86/X86InstrSSE.td @@ -115,7 +115,9 @@ let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1, def FsFLD0SS : I<0, Pseudo, (outs FR32:$dst), (ins), "", [(set FR32:$dst, fp32imm0)]>, Requires<[HasSSE1, NoAVX512]>; def FsFLD0SD : I<0, Pseudo, (outs FR64:$dst), (ins), "", - [(set FR64:$dst, fpimm0)]>, Requires<[HasSSE2, NoAVX512]>; + [(set FR64:$dst, fp64imm0)]>, Requires<[HasSSE2, NoAVX512]>; + def FsFLD0F128 : I<0, Pseudo, (outs VR128:$dst), (ins), "", + [(set VR128:$dst, fp128imm0)]>, Requires<[HasSSE1, NoAVX512]>; } //===----------------------------------------------------------------------===// @@ -128,13 +130,18 @@ let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1, // We set canFoldAsLoad because this can be converted to a constant-pool // load of an all-zeros value if folding it would be beneficial. let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1, - isPseudo = 1, SchedRW = [WriteZero] in { + isPseudo = 1, Predicates = [NoAVX512], SchedRW = [WriteZero] in { def V_SET0 : I<0, Pseudo, (outs VR128:$dst), (ins), "", [(set VR128:$dst, (v4f32 immAllZerosV))]>; } -let Predicates = [NoAVX512] in +let Predicates = [NoAVX512] in { +def : Pat<(v16i8 immAllZerosV), (V_SET0)>; +def : Pat<(v8i16 immAllZerosV), (V_SET0)>; def : Pat<(v4i32 immAllZerosV), (V_SET0)>; +def : Pat<(v2i64 immAllZerosV), (V_SET0)>; +def : Pat<(v2f64 immAllZerosV), (V_SET0)>; +} // The same as done above but for AVX. The 256-bit AVX1 ISA doesn't support PI, @@ -147,6 +154,14 @@ def AVX_SET0 : I<0, Pseudo, (outs VR256:$dst), (ins), "", [(set VR256:$dst, (v8i32 immAllZerosV))]>; } +let Predicates = [NoAVX512] in { +def : Pat<(v32i8 immAllZerosV), (AVX_SET0)>; +def : Pat<(v16i16 immAllZerosV), (AVX_SET0)>; +def : Pat<(v4i64 immAllZerosV), (AVX_SET0)>; +def : Pat<(v8f32 immAllZerosV), (AVX_SET0)>; +def : Pat<(v4f64 immAllZerosV), (AVX_SET0)>; +} + // We set canFoldAsLoad because this can be converted to a constant-pool // load of an all-ones value if folding it would be beneficial. let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1, @@ -355,7 +370,7 @@ defm VMOVAPDY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv4f64, "movapd defm VMOVUPSY : sse12_mov_packed<0x10, VR256, f256mem, loadv8f32, "movups", SSEPackedSingle, SchedWriteFMoveLS.YMM>, PS, VEX, VEX_L, VEX_WIG; -defm VMOVUPDY : sse12_mov_packed<0x10, VR256, f256mem, loadv4f64, "movupd", +defm VMOVUPDY : sse12_mov_packed<0x10, VR256, f256mem, loadv4f64, "movupd", SSEPackedDouble, SchedWriteFMoveLS.YMM>, PD, VEX, VEX_L, VEX_WIG; } @@ -661,7 +676,7 @@ let Predicates = [UseSSE1] in { // This pattern helps select MOVLPS on SSE1 only targets. With SSE2 we'll // end up with a movsd or blend instead of shufp. // No need for aligned load, we're only loading 64-bits. - def : Pat<(X86Shufp (v4f32 (nonvolatile_load addr:$src2)), VR128:$src1, + def : Pat<(X86Shufp (v4f32 (simple_load addr:$src2)), VR128:$src1, (i8 -28)), (MOVLPSrm VR128:$src1, addr:$src2)>; def : Pat<(X86Shufp (v4f32 (X86vzload64 addr:$src2)), VR128:$src1, (i8 -28)), @@ -727,7 +742,7 @@ let Predicates = [UseSSE1] in { // This pattern helps select MOVHPS on SSE1 only targets. With SSE2 we'll // end up with a movsd or blend instead of shufp. // No need for aligned load, we're only loading 64-bits. 
- def : Pat<(X86Movlhps VR128:$src1, (v4f32 (nonvolatile_load addr:$src2))), + def : Pat<(X86Movlhps VR128:$src1, (v4f32 (simple_load addr:$src2))), (MOVHPSrm VR128:$src1, addr:$src2)>; def : Pat<(X86Movlhps VR128:$src1, (v4f32 (X86vzload64 addr:$src2))), (MOVHPSrm VR128:$src1, addr:$src2)>; @@ -761,7 +776,7 @@ let Predicates = [UseSSE2] in { let Predicates = [UseSSE2, NoSSE41_Or_OptForSize] in { // Use MOVLPD to load into the low bits from a full vector unless we can use // BLENDPD. - def : Pat<(X86Movsd VR128:$src1, (v2f64 (nonvolatile_load addr:$src2))), + def : Pat<(X86Movsd VR128:$src1, (v2f64 (simple_load addr:$src2))), (MOVLPDrm VR128:$src1, addr:$src2)>; } @@ -1713,12 +1728,12 @@ multiclass sse12_cmp_scalar, + [(set RC:$dst, (OpNode (VT RC:$src1), RC:$src2, timm:$cc))]>, Sched<[sched]>; def rm : SIi8<0xC2, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2, u8imm:$cc), asm, [(set RC:$dst, (OpNode (VT RC:$src1), - (ld_frag addr:$src2), imm:$cc))]>, + (ld_frag addr:$src2), timm:$cc))]>, Sched<[sched.Folded, sched.ReadAfterFold]>; } @@ -1751,13 +1766,13 @@ multiclass sse12_cmp_scalar_int, + VR128:$src, timm:$cc))]>, Sched<[sched]>; let mayLoad = 1 in def rm_Int : SIi8<0xC2, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, memop:$src, u8imm:$cc), asm, [(set VR128:$dst, (Int VR128:$src1, - mem_cpat:$src, imm:$cc))]>, + mem_cpat:$src, timm:$cc))]>, Sched<[sched.Folded, sched.ReadAfterFold]>; } @@ -1876,12 +1891,12 @@ multiclass sse12_cmp_packed, + [(set RC:$dst, (VT (X86cmpp RC:$src1, RC:$src2, timm:$cc)))], d>, Sched<[sched]>; def rmi : PIi8<0xC2, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2, u8imm:$cc), asm, [(set RC:$dst, - (VT (X86cmpp RC:$src1, (ld_frag addr:$src2), imm:$cc)))], d>, + (VT (X86cmpp RC:$src1, (ld_frag addr:$src2), timm:$cc)))], d>, Sched<[sched.Folded, sched.ReadAfterFold]>; } @@ -1906,7 +1921,7 @@ let Constraints = "$src1 = $dst" in { SchedWriteFCmpSizes.PD.XMM, SSEPackedDouble, memopv2f64>, PD; } -def CommutableCMPCC : PatLeaf<(imm), [{ +def CommutableCMPCC : PatLeaf<(timm), [{ uint64_t Imm = N->getZExtValue() & 0x7; return (Imm == 0x00 || Imm == 0x03 || Imm == 0x04 || Imm == 0x07); }]>; @@ -1915,47 +1930,47 @@ def CommutableCMPCC : PatLeaf<(imm), [{ let Predicates = [HasAVX] in { def : Pat<(v4f64 (X86cmpp (loadv4f64 addr:$src2), VR256:$src1, CommutableCMPCC:$cc)), - (VCMPPDYrmi VR256:$src1, addr:$src2, imm:$cc)>; + (VCMPPDYrmi VR256:$src1, addr:$src2, timm:$cc)>; def : Pat<(v8f32 (X86cmpp (loadv8f32 addr:$src2), VR256:$src1, CommutableCMPCC:$cc)), - (VCMPPSYrmi VR256:$src1, addr:$src2, imm:$cc)>; + (VCMPPSYrmi VR256:$src1, addr:$src2, timm:$cc)>; def : Pat<(v2f64 (X86cmpp (loadv2f64 addr:$src2), VR128:$src1, CommutableCMPCC:$cc)), - (VCMPPDrmi VR128:$src1, addr:$src2, imm:$cc)>; + (VCMPPDrmi VR128:$src1, addr:$src2, timm:$cc)>; def : Pat<(v4f32 (X86cmpp (loadv4f32 addr:$src2), VR128:$src1, CommutableCMPCC:$cc)), - (VCMPPSrmi VR128:$src1, addr:$src2, imm:$cc)>; + (VCMPPSrmi VR128:$src1, addr:$src2, timm:$cc)>; def : Pat<(f64 (X86cmps (loadf64 addr:$src2), FR64:$src1, CommutableCMPCC:$cc)), - (VCMPSDrm FR64:$src1, addr:$src2, imm:$cc)>; + (VCMPSDrm FR64:$src1, addr:$src2, timm:$cc)>; def : Pat<(f32 (X86cmps (loadf32 addr:$src2), FR32:$src1, CommutableCMPCC:$cc)), - (VCMPSSrm FR32:$src1, addr:$src2, imm:$cc)>; + (VCMPSSrm FR32:$src1, addr:$src2, timm:$cc)>; } let Predicates = [UseSSE2] in { def : Pat<(v2f64 (X86cmpp (memopv2f64 addr:$src2), VR128:$src1, CommutableCMPCC:$cc)), - (CMPPDrmi VR128:$src1, addr:$src2, imm:$cc)>; + (CMPPDrmi VR128:$src1, 
addr:$src2, timm:$cc)>; def : Pat<(f64 (X86cmps (loadf64 addr:$src2), FR64:$src1, CommutableCMPCC:$cc)), - (CMPSDrm FR64:$src1, addr:$src2, imm:$cc)>; + (CMPSDrm FR64:$src1, addr:$src2, timm:$cc)>; } let Predicates = [UseSSE1] in { def : Pat<(v4f32 (X86cmpp (memopv4f32 addr:$src2), VR128:$src1, CommutableCMPCC:$cc)), - (CMPPSrmi VR128:$src1, addr:$src2, imm:$cc)>; + (CMPPSrmi VR128:$src1, addr:$src2, timm:$cc)>; def : Pat<(f32 (X86cmps (loadf32 addr:$src2), FR32:$src1, CommutableCMPCC:$cc)), - (CMPSSrm FR32:$src1, addr:$src2, imm:$cc)>; + (CMPSSrm FR32:$src1, addr:$src2, timm:$cc)>; } //===----------------------------------------------------------------------===// @@ -1970,13 +1985,13 @@ multiclass sse12_shuffle, + (i8 timm:$src3))))], d>, Sched<[sched.Folded, sched.ReadAfterFold]>; let isCommutable = IsCommutable in def rri : PIi8<0xC6, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2, u8imm:$src3), asm, [(set RC:$dst, (vt (X86Shufp RC:$src1, RC:$src2, - (i8 imm:$src3))))], d>, + (i8 timm:$src3))))], d>, Sched<[sched]>; } @@ -2097,7 +2112,7 @@ let Predicates = [HasAVX1Only] in { let Predicates = [UseSSE2] in { // Use MOVHPD if the load isn't aligned enough for UNPCKLPD. def : Pat<(v2f64 (X86Unpckl VR128:$src1, - (v2f64 (nonvolatile_load addr:$src2)))), + (v2f64 (simple_load addr:$src2)))), (MOVHPDrm VR128:$src1, addr:$src2)>; } @@ -2721,7 +2736,7 @@ defm : scalar_math_patterns; defm : scalar_math_patterns; defm : scalar_math_patterns; - + /// Unop Arithmetic /// In addition, we also have a special variant of the scalar form here to /// represent the associated intrinsic operation. This form is unlike the @@ -3482,7 +3497,7 @@ multiclass PDI_binop_rmi opc, bits<8> opc2, Format ImmForm, !if(Is2Addr, !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), - [(set RC:$dst, (DstVT (OpNode2 RC:$src1, (i8 imm:$src2))))]>, + [(set RC:$dst, (DstVT (OpNode2 RC:$src1, (i8 timm:$src2))))]>, Sched<[schedImm]>; } @@ -3514,7 +3529,7 @@ multiclass PDI_binop_ri opc, Format ImmForm, string OpcodeStr, !if(Is2Addr, !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), - [(set RC:$dst, (VT (OpNode RC:$src1, (i8 imm:$src2))))]>, + [(set RC:$dst, (VT (OpNode RC:$src1, (i8 timm:$src2))))]>, Sched<[sched]>; } @@ -3597,7 +3612,7 @@ let Predicates = [HasAVX, prd] in { !strconcat("v", OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR128:$dst, - (vt128 (OpNode VR128:$src1, (i8 imm:$src2))))]>, + (vt128 (OpNode VR128:$src1, (i8 timm:$src2))))]>, VEX, Sched<[sched.XMM]>, VEX_WIG; def V#NAME#mi : Ii8<0x70, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src1, u8imm:$src2), @@ -3605,7 +3620,7 @@ let Predicates = [HasAVX, prd] in { "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR128:$dst, (vt128 (OpNode (load addr:$src1), - (i8 imm:$src2))))]>, VEX, + (i8 timm:$src2))))]>, VEX, Sched<[sched.XMM.Folded]>, VEX_WIG; } @@ -3615,7 +3630,7 @@ let Predicates = [HasAVX2, prd] in { !strconcat("v", OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR256:$dst, - (vt256 (OpNode VR256:$src1, (i8 imm:$src2))))]>, + (vt256 (OpNode VR256:$src1, (i8 timm:$src2))))]>, VEX, VEX_L, Sched<[sched.YMM]>, VEX_WIG; def V#NAME#Ymi : Ii8<0x70, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src1, u8imm:$src2), @@ -3623,7 +3638,7 @@ let Predicates = [HasAVX2, prd] in { "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR256:$dst, (vt256 (OpNode (load addr:$src1), - (i8 imm:$src2))))]>, 
VEX, VEX_L, + (i8 timm:$src2))))]>, VEX, VEX_L, Sched<[sched.YMM.Folded]>, VEX_WIG; } @@ -3633,7 +3648,7 @@ let Predicates = [UseSSE2] in { !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR128:$dst, - (vt128 (OpNode VR128:$src1, (i8 imm:$src2))))]>, + (vt128 (OpNode VR128:$src1, (i8 timm:$src2))))]>, Sched<[sched.XMM]>; def mi : Ii8<0x70, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src1, u8imm:$src2), @@ -3641,7 +3656,7 @@ let Predicates = [UseSSE2] in { "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR128:$dst, (vt128 (OpNode (memop addr:$src1), - (i8 imm:$src2))))]>, + (i8 timm:$src2))))]>, Sched<[sched.XMM.Folded]>; } } @@ -4380,7 +4395,7 @@ defm MOVDDUP : sse3_replicate_dfp<"movddup", SchedWriteFShuffle>; let Predicates = [HasAVX, NoVLX] in { - def : Pat<(X86Movddup (v2f64 (nonvolatile_load addr:$src))), + def : Pat<(X86Movddup (v2f64 (simple_load addr:$src))), (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>; def : Pat<(X86Movddup (v2f64 (X86vzload64 addr:$src))), (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>; @@ -4388,7 +4403,7 @@ let Predicates = [HasAVX, NoVLX] in { let Predicates = [UseSSE3] in { // No need for aligned memory as this only loads 64-bits. - def : Pat<(X86Movddup (v2f64 (nonvolatile_load addr:$src))), + def : Pat<(X86Movddup (v2f64 (simple_load addr:$src))), (MOVDDUPrm addr:$src)>; def : Pat<(X86Movddup (v2f64 (X86vzload64 addr:$src))), (MOVDDUPrm addr:$src)>; @@ -4812,7 +4827,7 @@ multiclass ssse3_palignr, + [(set RC:$dst, (VT (X86PAlignr RC:$src1, RC:$src2, (i8 timm:$src3))))]>, Sched<[sched]>; let mayLoad = 1 in def rmi : SS3AI<0x0F, MRMSrcMem, (outs RC:$dst), @@ -4823,7 +4838,7 @@ multiclass ssse3_palignr, + (i8 timm:$src3))))]>, Sched<[sched.Folded, sched.ReadAfterFold]>; } } @@ -5300,7 +5315,7 @@ multiclass SS41I_insertf32 opc, string asm, bit Is2Addr = 1> { !strconcat(asm, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), [(set VR128:$dst, - (X86insertps VR128:$src1, VR128:$src2, imm:$src3))]>, + (X86insertps VR128:$src1, VR128:$src2, timm:$src3))]>, Sched<[SchedWriteFShuffle.XMM]>; def rm : SS4AIi8 opc, string asm, bit Is2Addr = 1> { [(set VR128:$dst, (X86insertps VR128:$src1, (v4f32 (scalar_to_vector (loadf32 addr:$src2))), - imm:$src3))]>, + timm:$src3))]>, Sched<[SchedWriteFShuffle.XMM.Folded, SchedWriteFShuffle.XMM.ReadAfterFold]>; } @@ -5323,17 +5338,6 @@ let ExeDomain = SSEPackedSingle in { defm INSERTPS : SS41I_insertf32<0x21, "insertps", 1>; } -let Predicates = [UseAVX] in { - // If we're inserting an element from a vbroadcast of a load, fold the - // load into the X86insertps instruction. 
- def : Pat<(v4f32 (X86insertps (v4f32 VR128:$src1), - (X86VBroadcast (loadf32 addr:$src2)), imm:$src3)), - (VINSERTPSrm VR128:$src1, addr:$src2, imm:$src3)>; - def : Pat<(v4f32 (X86insertps (v4f32 VR128:$src1), - (X86VBroadcast (loadv4f32 addr:$src2)), imm:$src3)), - (VINSERTPSrm VR128:$src1, addr:$src2, imm:$src3)>; -} - //===----------------------------------------------------------------------===// // SSE4.1 - Round Instructions //===----------------------------------------------------------------------===// @@ -5348,7 +5352,7 @@ multiclass sse41_fp_unop_p opc, string OpcodeStr, (outs RC:$dst), (ins RC:$src1, i32u8imm:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set RC:$dst, (VT (OpNode RC:$src1, imm:$src2)))]>, + [(set RC:$dst, (VT (OpNode RC:$src1, timm:$src2)))]>, Sched<[sched]>; // Vector intrinsic operation, mem @@ -5357,13 +5361,13 @@ multiclass sse41_fp_unop_p opc, string OpcodeStr, !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set RC:$dst, - (VT (OpNode (mem_frag addr:$src1),imm:$src2)))]>, + (VT (OpNode (mem_frag addr:$src1), timm:$src2)))]>, Sched<[sched.Folded]>; } multiclass avx_fp_unop_rm opcss, bits<8> opcsd, string OpcodeStr, X86FoldableSchedWrite sched> { -let ExeDomain = SSEPackedSingle, hasSideEffects = 0 in { +let ExeDomain = SSEPackedSingle, hasSideEffects = 0, isCodeGenOnly = 1 in { def SSr : SS4AIi8, Sched<[sched.Folded, sched.ReadAfterFold]>; } // ExeDomain = SSEPackedSingle, hasSideEffects = 0 -let ExeDomain = SSEPackedDouble, hasSideEffects = 0 in { +let ExeDomain = SSEPackedDouble, hasSideEffects = 0, isCodeGenOnly = 1 in { def SDr : SS4AIi8 opcss, bits<8> opcsd, string OpcodeStr, X86FoldableSchedWrite sched> { -let ExeDomain = SSEPackedSingle, hasSideEffects = 0 in { +let ExeDomain = SSEPackedSingle, hasSideEffects = 0, isCodeGenOnly = 1 in { def SSr : SS4AIi8, Sched<[sched.Folded, sched.ReadAfterFold]>; } // ExeDomain = SSEPackedSingle, hasSideEffects = 0 -let ExeDomain = SSEPackedDouble, hasSideEffects = 0 in { +let ExeDomain = SSEPackedDouble, hasSideEffects = 0, isCodeGenOnly = 1 in { def SDr : SS4AIi8 opcss, bits<8> opcsd, string OpcodeStr, X86FoldableSchedWrite sched, ValueType VT32, ValueType VT64, SDNode OpNode, bit Is2Addr = 1> { -let ExeDomain = SSEPackedSingle, isCodeGenOnly = 1 in { +let ExeDomain = SSEPackedSingle in { def SSr_Int : SS4AIi8, + [(set VR128:$dst, (VT32 (OpNode VR128:$src1, VR128:$src2, timm:$src3)))]>, Sched<[sched]>; def SSm_Int : SS4AIi8, + (OpNode VR128:$src1, sse_load_f32:$src2, timm:$src3))]>, Sched<[sched.Folded, sched.ReadAfterFold]>; } // ExeDomain = SSEPackedSingle, isCodeGenOnly = 1 -let ExeDomain = SSEPackedDouble, isCodeGenOnly = 1 in { +let ExeDomain = SSEPackedDouble in { def SDr_Int : SS4AIi8, + [(set VR128:$dst, (VT64 (OpNode VR128:$src1, VR128:$src2, timm:$src3)))]>, Sched<[sched]>; def SDm_Int : SS4AIi8, + (OpNode VR128:$src1, sse_load_f64:$src2, timm:$src3))]>, Sched<[sched.Folded, sched.ReadAfterFold]>; } // ExeDomain = SSEPackedDouble, isCodeGenOnly = 1 } @@ -5508,17 +5512,17 @@ let Predicates = [UseAVX] in { } let Predicates = [UseAVX] in { - def : Pat<(X86VRndScale FR32:$src1, imm:$src2), - (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src1, imm:$src2)>; - def : Pat<(X86VRndScale FR64:$src1, imm:$src2), - (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src1, imm:$src2)>; + def : Pat<(X86VRndScale FR32:$src1, timm:$src2), + (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src1, timm:$src2)>; + def : Pat<(X86VRndScale FR64:$src1, timm:$src2), + (VROUNDSDr (f64 (IMPLICIT_DEF)), 
FR64:$src1, timm:$src2)>; } let Predicates = [UseAVX, OptForSize] in { - def : Pat<(X86VRndScale (loadf32 addr:$src1), imm:$src2), - (VROUNDSSm (f32 (IMPLICIT_DEF)), addr:$src1, imm:$src2)>; - def : Pat<(X86VRndScale (loadf64 addr:$src1), imm:$src2), - (VROUNDSDm (f64 (IMPLICIT_DEF)), addr:$src1, imm:$src2)>; + def : Pat<(X86VRndScale (loadf32 addr:$src1), timm:$src2), + (VROUNDSSm (f32 (IMPLICIT_DEF)), addr:$src1, timm:$src2)>; + def : Pat<(X86VRndScale (loadf64 addr:$src1), timm:$src2), + (VROUNDSDm (f64 (IMPLICIT_DEF)), addr:$src1, timm:$src2)>; } let ExeDomain = SSEPackedSingle in @@ -5535,17 +5539,17 @@ defm ROUND : sse41_fp_binop_s<0x0A, 0x0B, "round", SchedWriteFRnd.Scl, v4f32, v2f64, X86RndScales>; let Predicates = [UseSSE41] in { - def : Pat<(X86VRndScale FR32:$src1, imm:$src2), - (ROUNDSSr FR32:$src1, imm:$src2)>; - def : Pat<(X86VRndScale FR64:$src1, imm:$src2), - (ROUNDSDr FR64:$src1, imm:$src2)>; + def : Pat<(X86VRndScale FR32:$src1, timm:$src2), + (ROUNDSSr FR32:$src1, timm:$src2)>; + def : Pat<(X86VRndScale FR64:$src1, timm:$src2), + (ROUNDSDr FR64:$src1, timm:$src2)>; } let Predicates = [UseSSE41, OptForSize] in { - def : Pat<(X86VRndScale (loadf32 addr:$src1), imm:$src2), - (ROUNDSSm addr:$src1, imm:$src2)>; - def : Pat<(X86VRndScale (loadf64 addr:$src1), imm:$src2), - (ROUNDSDm addr:$src1, imm:$src2)>; + def : Pat<(X86VRndScale (loadf32 addr:$src1), timm:$src2), + (ROUNDSSm addr:$src1, timm:$src2)>; + def : Pat<(X86VRndScale (loadf64 addr:$src1), timm:$src2), + (ROUNDSDm addr:$src1, timm:$src2)>; } //===----------------------------------------------------------------------===// @@ -5826,7 +5830,7 @@ multiclass SS41I_binop_rmi_int opc, string OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), - [(set RC:$dst, (IntId RC:$src1, RC:$src2, imm:$src3))]>, + [(set RC:$dst, (IntId RC:$src1, RC:$src2, timm:$src3))]>, Sched<[sched]>; def rmi : SS4AIi8 opc, string OpcodeStr, !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), [(set RC:$dst, - (IntId RC:$src1, (memop_frag addr:$src2), imm:$src3))]>, + (IntId RC:$src1, (memop_frag addr:$src2), timm:$src3))]>, Sched<[sched.Folded, sched.ReadAfterFold]>; } @@ -5853,7 +5857,7 @@ multiclass SS41I_binop_rmi opc, string OpcodeStr, SDNode OpNode, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), - [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, imm:$src3)))]>, + [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, timm:$src3)))]>, Sched<[sched]>; def rmi : SS4AIi8 opc, string OpcodeStr, SDNode OpNode, !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), [(set RC:$dst, - (OpVT (OpNode RC:$src1, (memop_frag addr:$src2), imm:$src3)))]>, + (OpVT (OpNode RC:$src1, (memop_frag addr:$src2), timm:$src3)))]>, Sched<[sched.Folded, sched.ReadAfterFold]>; } -def BlendCommuteImm2 : SDNodeXFormgetZExtValue() & 0x03; return getI8Imm(Imm ^ 0x03, SDLoc(N)); }]>; -def BlendCommuteImm4 : SDNodeXFormgetZExtValue() & 0x0f; return getI8Imm(Imm ^ 0x0f, SDLoc(N)); }]>; -def BlendCommuteImm8 : SDNodeXFormgetZExtValue() & 0xff; return getI8Imm(Imm ^ 0xff, SDLoc(N)); }]>; // Turn a 4-bit blendi immediate to 8-bit for use with pblendw. 
-def BlendScaleImm4 : SDNodeXFormgetZExtValue(); uint8_t NewImm = 0; for (unsigned i = 0; i != 4; ++i) { @@ -5894,7 +5898,7 @@ def BlendScaleImm4 : SDNodeXForm; // Turn a 2-bit blendi immediate to 8-bit for use with pblendw. -def BlendScaleImm2 : SDNodeXFormgetZExtValue(); uint8_t NewImm = 0; for (unsigned i = 0; i != 2; ++i) { @@ -5905,7 +5909,7 @@ def BlendScaleImm2 : SDNodeXForm; // Turn a 2-bit blendi immediate to 4-bit for use with pblendd. -def BlendScaleImm2to4 : SDNodeXFormgetZExtValue(); uint8_t NewImm = 0; for (unsigned i = 0; i != 2; ++i) { @@ -5916,7 +5920,7 @@ def BlendScaleImm2to4 : SDNodeXForm; // Turn a 4-bit blendi immediate to 8-bit for use with pblendw and invert it. -def BlendScaleCommuteImm4 : SDNodeXFormgetZExtValue(); uint8_t NewImm = 0; for (unsigned i = 0; i != 4; ++i) { @@ -5927,7 +5931,7 @@ def BlendScaleCommuteImm4 : SDNodeXForm; // Turn a 2-bit blendi immediate to 8-bit for use with pblendw and invert it. -def BlendScaleCommuteImm2 : SDNodeXFormgetZExtValue(); uint8_t NewImm = 0; for (unsigned i = 0; i != 2; ++i) { @@ -5938,7 +5942,7 @@ def BlendScaleCommuteImm2 : SDNodeXForm; // Turn a 2-bit blendi immediate to 4-bit for use with pblendd and invert it. -def BlendScaleCommuteImm2to4 : SDNodeXFormgetZExtValue(); uint8_t NewImm = 0; for (unsigned i = 0; i != 2; ++i) { @@ -6008,7 +6012,7 @@ let ExeDomain = d, Constraints = !if(Is2Addr, "$src1 = $dst", "") in { "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), - [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, imm:$src3)))]>, + [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, timm:$src3)))]>, Sched<[sched]>; def rmi : SS4AIi8, + (OpVT (OpNode RC:$src1, (memop_frag addr:$src2), timm:$src3)))]>, Sched<[sched.Folded, sched.ReadAfterFold]>; } // Pattern to commute if load is in first source. - def : Pat<(OpVT (OpNode (memop_frag addr:$src2), RC:$src1, imm:$src3)), + def : Pat<(OpVT (OpNode (memop_frag addr:$src2), RC:$src1, timm:$src3)), (!cast(NAME#"rmi") RC:$src1, addr:$src2, - (commuteXForm imm:$src3))>; + (commuteXForm timm:$src3))>; } let Predicates = [HasAVX] in { @@ -6061,37 +6065,37 @@ let Predicates = [HasAVX2] in { // Emulate vXi32/vXi64 blends with vXf32/vXf64 or pblendw. // ExecutionDomainFixPass will cleanup domains later on. let Predicates = [HasAVX1Only] in { -def : Pat<(X86Blendi (v4i64 VR256:$src1), (v4i64 VR256:$src2), imm:$src3), - (VBLENDPDYrri VR256:$src1, VR256:$src2, imm:$src3)>; -def : Pat<(X86Blendi VR256:$src1, (loadv4i64 addr:$src2), imm:$src3), - (VBLENDPDYrmi VR256:$src1, addr:$src2, imm:$src3)>; -def : Pat<(X86Blendi (loadv4i64 addr:$src2), VR256:$src1, imm:$src3), - (VBLENDPDYrmi VR256:$src1, addr:$src2, (BlendCommuteImm4 imm:$src3))>; +def : Pat<(X86Blendi (v4i64 VR256:$src1), (v4i64 VR256:$src2), timm:$src3), + (VBLENDPDYrri VR256:$src1, VR256:$src2, timm:$src3)>; +def : Pat<(X86Blendi VR256:$src1, (loadv4i64 addr:$src2), timm:$src3), + (VBLENDPDYrmi VR256:$src1, addr:$src2, timm:$src3)>; +def : Pat<(X86Blendi (loadv4i64 addr:$src2), VR256:$src1, timm:$src3), + (VBLENDPDYrmi VR256:$src1, addr:$src2, (BlendCommuteImm4 timm:$src3))>; // Use pblendw for 128-bit integer to keep it in the integer domain and prevent // it from becoming movsd via commuting under optsize. 
-def : Pat<(X86Blendi (v2i64 VR128:$src1), (v2i64 VR128:$src2), imm:$src3), - (VPBLENDWrri VR128:$src1, VR128:$src2, (BlendScaleImm2 imm:$src3))>; -def : Pat<(X86Blendi VR128:$src1, (loadv2i64 addr:$src2), imm:$src3), - (VPBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleImm2 imm:$src3))>; -def : Pat<(X86Blendi (loadv2i64 addr:$src2), VR128:$src1, imm:$src3), - (VPBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm2 imm:$src3))>; - -def : Pat<(X86Blendi (v8i32 VR256:$src1), (v8i32 VR256:$src2), imm:$src3), - (VBLENDPSYrri VR256:$src1, VR256:$src2, imm:$src3)>; -def : Pat<(X86Blendi VR256:$src1, (loadv8i32 addr:$src2), imm:$src3), - (VBLENDPSYrmi VR256:$src1, addr:$src2, imm:$src3)>; -def : Pat<(X86Blendi (loadv8i32 addr:$src2), VR256:$src1, imm:$src3), - (VBLENDPSYrmi VR256:$src1, addr:$src2, (BlendCommuteImm8 imm:$src3))>; +def : Pat<(X86Blendi (v2i64 VR128:$src1), (v2i64 VR128:$src2), timm:$src3), + (VPBLENDWrri VR128:$src1, VR128:$src2, (BlendScaleImm2 timm:$src3))>; +def : Pat<(X86Blendi VR128:$src1, (loadv2i64 addr:$src2), timm:$src3), + (VPBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleImm2 timm:$src3))>; +def : Pat<(X86Blendi (loadv2i64 addr:$src2), VR128:$src1, timm:$src3), + (VPBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm2 timm:$src3))>; + +def : Pat<(X86Blendi (v8i32 VR256:$src1), (v8i32 VR256:$src2), timm:$src3), + (VBLENDPSYrri VR256:$src1, VR256:$src2, timm:$src3)>; +def : Pat<(X86Blendi VR256:$src1, (loadv8i32 addr:$src2), timm:$src3), + (VBLENDPSYrmi VR256:$src1, addr:$src2, timm:$src3)>; +def : Pat<(X86Blendi (loadv8i32 addr:$src2), VR256:$src1, timm:$src3), + (VBLENDPSYrmi VR256:$src1, addr:$src2, (BlendCommuteImm8 timm:$src3))>; // Use pblendw for 128-bit integer to keep it in the integer domain and prevent // it from becoming movss via commuting under optsize. -def : Pat<(X86Blendi (v4i32 VR128:$src1), (v4i32 VR128:$src2), imm:$src3), - (VPBLENDWrri VR128:$src1, VR128:$src2, (BlendScaleImm4 imm:$src3))>; -def : Pat<(X86Blendi VR128:$src1, (loadv4i32 addr:$src2), imm:$src3), - (VPBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleImm4 imm:$src3))>; -def : Pat<(X86Blendi (loadv4i32 addr:$src2), VR128:$src1, imm:$src3), - (VPBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm4 imm:$src3))>; +def : Pat<(X86Blendi (v4i32 VR128:$src1), (v4i32 VR128:$src2), timm:$src3), + (VPBLENDWrri VR128:$src1, VR128:$src2, (BlendScaleImm4 timm:$src3))>; +def : Pat<(X86Blendi VR128:$src1, (loadv4i32 addr:$src2), timm:$src3), + (VPBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleImm4 timm:$src3))>; +def : Pat<(X86Blendi (loadv4i32 addr:$src2), VR128:$src1, timm:$src3), + (VPBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm4 timm:$src3))>; } defm BLENDPS : SS41I_blend_rmi<0x0C, "blendps", X86Blendi, v4f32, @@ -6107,19 +6111,19 @@ defm PBLENDW : SS41I_blend_rmi<0x0E, "pblendw", X86Blendi, v8i16, let Predicates = [UseSSE41] in { // Use pblendw for 128-bit integer to keep it in the integer domain and prevent // it from becoming movss via commuting under optsize. 
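The BlendScaleImm*/BlendCommuteImm* SDNodeXForms above only rewrite the 8-bit blend control at selection time: scaling widens a per-element mask to the per-word mask pblendw (or per-dword mask pblendd) expects, and commuting XORs the mask with all ones for that element count so the two sources can be swapped. A minimal standalone sketch of that arithmetic follows; the helper names and the checks in main() are illustrative assumptions, not LLVM code.

#include <cassert>
#include <cstdint>

// Illustrative helpers mirroring the arithmetic of the BlendScale*/
// BlendCommute* transforms above (names are mine, not LLVM's).

// Widen a NumElts-bit blendi mask so each selected element covers BitsPerElt
// bits of the wider control byte, e.g. a 4-bit v4i32 mask becomes the 8-bit
// per-word mask pblendw expects.
static uint8_t scaleBlendImm(uint8_t Imm, unsigned NumElts, unsigned BitsPerElt) {
  uint8_t NewImm = 0;
  for (unsigned i = 0; i != NumElts; ++i)
    if (Imm & (1u << i))
      NewImm |= ((1u << BitsPerElt) - 1) << (i * BitsPerElt);
  return NewImm;
}

// Commuting the two blend sources selects the opposite source in every lane,
// i.e. XOR with the all-ones mask for that element count.
static uint8_t commuteBlendImm(uint8_t Imm, unsigned NumElts) {
  return (Imm ^ 0xFFu) & ((1u << NumElts) - 1);
}

int main() {
  // BlendScaleImm4: 4-bit v4i32 mask 0b0101 -> 8-bit pblendw mask 0x33.
  assert(scaleBlendImm(0x5, 4, 2) == 0x33);
  // BlendScaleImm2: 2-bit v2i64 mask 0b10 -> 8-bit pblendw mask 0xF0.
  assert(scaleBlendImm(0x2, 2, 4) == 0xF0);
  // BlendScaleImm2to4: the same 2-bit mask widened to 4 bits for pblendd.
  assert(scaleBlendImm(0x2, 2, 2) == 0xC);
  // BlendCommuteImm4: swapping the sources of a v4f32 blend flips every lane.
  assert(commuteBlendImm(0x5, 4) == 0xA);
  // BlendScaleCommuteImm2: scale, then invert, for the commuted memory form.
  assert((scaleBlendImm(0x2, 2, 4) ^ 0xFF) == 0x0F);
  return 0;
}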
-def : Pat<(X86Blendi (v2i64 VR128:$src1), (v2i64 VR128:$src2), imm:$src3), - (PBLENDWrri VR128:$src1, VR128:$src2, (BlendScaleImm2 imm:$src3))>; -def : Pat<(X86Blendi VR128:$src1, (memopv2i64 addr:$src2), imm:$src3), - (PBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleImm2 imm:$src3))>; -def : Pat<(X86Blendi (memopv2i64 addr:$src2), VR128:$src1, imm:$src3), - (PBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm2 imm:$src3))>; +def : Pat<(X86Blendi (v2i64 VR128:$src1), (v2i64 VR128:$src2), timm:$src3), + (PBLENDWrri VR128:$src1, VR128:$src2, (BlendScaleImm2 timm:$src3))>; +def : Pat<(X86Blendi VR128:$src1, (memopv2i64 addr:$src2), timm:$src3), + (PBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleImm2 timm:$src3))>; +def : Pat<(X86Blendi (memopv2i64 addr:$src2), VR128:$src1, timm:$src3), + (PBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm2 timm:$src3))>; -def : Pat<(X86Blendi (v4i32 VR128:$src1), (v4i32 VR128:$src2), imm:$src3), - (PBLENDWrri VR128:$src1, VR128:$src2, (BlendScaleImm4 imm:$src3))>; -def : Pat<(X86Blendi VR128:$src1, (memopv4i32 addr:$src2), imm:$src3), - (PBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleImm4 imm:$src3))>; -def : Pat<(X86Blendi (memopv4i32 addr:$src2), VR128:$src1, imm:$src3), - (PBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm4 imm:$src3))>; +def : Pat<(X86Blendi (v4i32 VR128:$src1), (v4i32 VR128:$src2), timm:$src3), + (PBLENDWrri VR128:$src1, VR128:$src2, (BlendScaleImm4 timm:$src3))>; +def : Pat<(X86Blendi VR128:$src1, (memopv4i32 addr:$src2), timm:$src3), + (PBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleImm4 timm:$src3))>; +def : Pat<(X86Blendi (memopv4i32 addr:$src2), VR128:$src1, timm:$src3), + (PBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm4 timm:$src3))>; } // For insertion into the zero index (low half) of a 256-bit vector, it is @@ -6592,7 +6596,7 @@ let Constraints = "$src1 = $dst", Predicates = [HasSHA] in { "sha1rnds4\t{$src3, $src2, $dst|$dst, $src2, $src3}", [(set VR128:$dst, (int_x86_sha1rnds4 VR128:$src1, VR128:$src2, - (i8 imm:$src3)))]>, TA, + (i8 timm:$src3)))]>, TA, Sched<[SchedWriteVecIMul.XMM]>; def SHA1RNDS4rmi : Ii8<0xCC, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2, u8imm:$src3), @@ -6600,7 +6604,7 @@ let Constraints = "$src1 = $dst", Predicates = [HasSHA] in { [(set VR128:$dst, (int_x86_sha1rnds4 VR128:$src1, (memop addr:$src2), - (i8 imm:$src3)))]>, TA, + (i8 timm:$src3)))]>, TA, Sched<[SchedWriteVecIMul.XMM.Folded, SchedWriteVecIMul.XMM.ReadAfterFold]>; @@ -6718,26 +6722,26 @@ let Predicates = [HasAVX, HasAES] in { (ins VR128:$src1, u8imm:$src2), "vaeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set VR128:$dst, - (int_x86_aesni_aeskeygenassist VR128:$src1, imm:$src2))]>, + (int_x86_aesni_aeskeygenassist VR128:$src1, timm:$src2))]>, Sched<[WriteAESKeyGen]>, VEX, VEX_WIG; def VAESKEYGENASSIST128rm : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src1, u8imm:$src2), "vaeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set VR128:$dst, - (int_x86_aesni_aeskeygenassist (load addr:$src1), imm:$src2))]>, + (int_x86_aesni_aeskeygenassist (load addr:$src1), timm:$src2))]>, Sched<[WriteAESKeyGen.Folded]>, VEX, VEX_WIG; } def AESKEYGENASSIST128rr : AESAI<0xDF, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, u8imm:$src2), "aeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set VR128:$dst, - (int_x86_aesni_aeskeygenassist VR128:$src1, imm:$src2))]>, + (int_x86_aesni_aeskeygenassist VR128:$src1, timm:$src2))]>, Sched<[WriteAESKeyGen]>; def 
AESKEYGENASSIST128rm : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src1, u8imm:$src2), "aeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set VR128:$dst, - (int_x86_aesni_aeskeygenassist (memop addr:$src1), imm:$src2))]>, + (int_x86_aesni_aeskeygenassist (memop addr:$src1), timm:$src2))]>, Sched<[WriteAESKeyGen.Folded]>; //===----------------------------------------------------------------------===// @@ -6745,7 +6749,7 @@ def AESKEYGENASSIST128rm : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst), //===----------------------------------------------------------------------===// // Immediate transform to help with commuting. -def PCLMULCommuteImm : SDNodeXFormgetZExtValue(); return getI8Imm((uint8_t)((Imm >> 4) | (Imm << 4)), SDLoc(N)); }]>; @@ -6758,7 +6762,7 @@ let Predicates = [NoAVX, HasPCLMUL] in { (ins VR128:$src1, VR128:$src2, u8imm:$src3), "pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}", [(set VR128:$dst, - (int_x86_pclmulqdq VR128:$src1, VR128:$src2, imm:$src3))]>, + (int_x86_pclmulqdq VR128:$src1, VR128:$src2, timm:$src3))]>, Sched<[WriteCLMul]>; def PCLMULQDQrm : PCLMULIi8<0x44, MRMSrcMem, (outs VR128:$dst), @@ -6766,14 +6770,14 @@ let Predicates = [NoAVX, HasPCLMUL] in { "pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}", [(set VR128:$dst, (int_x86_pclmulqdq VR128:$src1, (memop addr:$src2), - imm:$src3))]>, + timm:$src3))]>, Sched<[WriteCLMul.Folded, WriteCLMul.ReadAfterFold]>; } // Constraints = "$src1 = $dst" def : Pat<(int_x86_pclmulqdq (memop addr:$src2), VR128:$src1, - (i8 imm:$src3)), + (i8 timm:$src3)), (PCLMULQDQrm VR128:$src1, addr:$src2, - (PCLMULCommuteImm imm:$src3))>; + (PCLMULCommuteImm timm:$src3))>; } // Predicates = [NoAVX, HasPCLMUL] // SSE aliases @@ -6795,21 +6799,21 @@ multiclass vpclmulqdq, + (IntId RC:$src1, RC:$src2, timm:$src3))]>, Sched<[WriteCLMul]>; def rm : PCLMULIi8<0x44, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, MemOp:$src2, u8imm:$src3), "vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", [(set RC:$dst, - (IntId RC:$src1, (LdFrag addr:$src2), imm:$src3))]>, + (IntId RC:$src1, (LdFrag addr:$src2), timm:$src3))]>, Sched<[WriteCLMul.Folded, WriteCLMul.ReadAfterFold]>; // We can commute a load in the first operand by swapping the sources and // rotating the immediate. 
- def : Pat<(IntId (LdFrag addr:$src2), RC:$src1, (i8 imm:$src3)), + def : Pat<(IntId (LdFrag addr:$src2), RC:$src1, (i8 timm:$src3)), (!cast(NAME#"rm") RC:$src1, addr:$src2, - (PCLMULCommuteImm imm:$src3))>; + (PCLMULCommuteImm timm:$src3))>; } let Predicates = [HasAVX, NoVLX_Or_NoVPCLMULQDQ, HasPCLMUL] in @@ -6853,8 +6857,8 @@ let Constraints = "$src = $dst" in { def EXTRQI : Ii8<0x78, MRMXr, (outs VR128:$dst), (ins VR128:$src, u8imm:$len, u8imm:$idx), "extrq\t{$idx, $len, $src|$src, $len, $idx}", - [(set VR128:$dst, (X86extrqi VR128:$src, imm:$len, - imm:$idx))]>, + [(set VR128:$dst, (X86extrqi VR128:$src, timm:$len, + timm:$idx))]>, PD, Sched<[SchedWriteVecALU.XMM]>; def EXTRQ : I<0x79, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src, VR128:$mask), @@ -6867,7 +6871,7 @@ def INSERTQI : Ii8<0x78, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src, VR128:$src2, u8imm:$len, u8imm:$idx), "insertq\t{$idx, $len, $src2, $src|$src, $src2, $len, $idx}", [(set VR128:$dst, (X86insertqi VR128:$src, VR128:$src2, - imm:$len, imm:$idx))]>, + timm:$len, timm:$idx))]>, XD, Sched<[SchedWriteVecALU.XMM]>; def INSERTQ : I<0x79, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src, VR128:$mask), @@ -6907,10 +6911,10 @@ def : Pat<(nontemporalstore FR64:$src, addr:$dst), // class avx_broadcast_rm opc, string OpcodeStr, RegisterClass RC, X86MemOperand x86memop, ValueType VT, - PatFrag ld_frag, SchedWrite Sched> : + PatFrag bcast_frag, SchedWrite Sched> : AVX8I, + [(set RC:$dst, (VT (bcast_frag addr:$src)))]>, Sched<[Sched]>, VEX; // AVX2 adds register forms @@ -6923,15 +6927,15 @@ class avx2_broadcast_rr opc, string OpcodeStr, RegisterClass RC, let ExeDomain = SSEPackedSingle, Predicates = [HasAVX, NoVLX] in { def VBROADCASTSSrm : avx_broadcast_rm<0x18, "vbroadcastss", VR128, - f32mem, v4f32, loadf32, + f32mem, v4f32, X86VBroadcastld32, SchedWriteFShuffle.XMM.Folded>; def VBROADCASTSSYrm : avx_broadcast_rm<0x18, "vbroadcastss", VR256, - f32mem, v8f32, loadf32, + f32mem, v8f32, X86VBroadcastld32, SchedWriteFShuffle.XMM.Folded>, VEX_L; } let ExeDomain = SSEPackedDouble, Predicates = [HasAVX, NoVLX] in def VBROADCASTSDYrm : avx_broadcast_rm<0x19, "vbroadcastsd", VR256, f64mem, - v4f64, loadf64, + v4f64, X86VBroadcastld64, SchedWriteFShuffle.XMM.Folded>, VEX_L; let ExeDomain = SSEPackedSingle, Predicates = [HasAVX2, NoVLX] in { @@ -6944,15 +6948,6 @@ let ExeDomain = SSEPackedDouble, Predicates = [HasAVX2, NoVLX] in def VBROADCASTSDYrr : avx2_broadcast_rr<0x19, "vbroadcastsd", VR256, v4f64, v2f64, WriteFShuffle256>, VEX_L; -let Predicates = [HasAVX, NoVLX] in { - def : Pat<(v4f32 (X86VBroadcast (v4f32 (scalar_to_vector (loadf32 addr:$src))))), - (VBROADCASTSSrm addr:$src)>; - def : Pat<(v8f32 (X86VBroadcast (v4f32 (scalar_to_vector (loadf32 addr:$src))))), - (VBROADCASTSSYrm addr:$src)>; - def : Pat<(v4f64 (X86VBroadcast (v2f64 (scalar_to_vector (loadf64 addr:$src))))), - (VBROADCASTSDYrm addr:$src)>; -} - //===----------------------------------------------------------------------===// // VBROADCAST*128 - Load from memory and broadcast 128-bit vector to both // halves of a 256-bit vector. 
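The PCLMULCommuteImm transform used above relies on the pclmulqdq immediate encoding: bit 0 picks the quadword of the first source and bit 4 the quadword of the second, so swapping the sources to fold a load from the first operand reduces to swapping the two nibbles. A minimal standalone sketch; the function name and the checks in main() are illustrative, not part of the patch.

#include <cassert>
#include <cstdint>

// Sketch of the PCLMULCommuteImm rewrite: rotating the immediate by a nibble
// exchanges bit 0 and bit 4, the only bits pclmulqdq consumes.
static uint8_t commutePclmulImm(uint8_t Imm) {
  return (uint8_t)((Imm >> 4) | (Imm << 4));
}

int main() {
  assert(commutePclmulImm(0x00) == 0x00); // low x low stays low x low
  assert(commutePclmulImm(0x01) == 0x10); // high(src1) x low(src2) swaps roles
  assert(commutePclmulImm(0x11) == 0x11); // high x high is symmetric
  return 0;
}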
@@ -7081,27 +7076,29 @@ let Predicates = [HasAVX1Only] in { // multiclass avx_movmask_rm opc_rm, bits<8> opc_mr, string OpcodeStr, Intrinsic IntLd, Intrinsic IntLd256, - Intrinsic IntSt, Intrinsic IntSt256> { + Intrinsic IntSt, Intrinsic IntSt256, + X86SchedWriteMaskMove schedX, + X86SchedWriteMaskMove schedY> { def rm : AVX8I, - VEX_4V, Sched<[WriteFMaskedLoad]>; + VEX_4V, Sched<[schedX.RM]>; def Yrm : AVX8I, - VEX_4V, VEX_L, Sched<[WriteFMaskedLoadY]>; + VEX_4V, VEX_L, Sched<[schedY.RM]>; def mr : AVX8I, - VEX_4V, Sched<[WriteFMaskedStore]>; + VEX_4V, Sched<[schedX.MR]>; def Ymr : AVX8I, - VEX_4V, VEX_L, Sched<[WriteFMaskedStoreY]>; + VEX_4V, VEX_L, Sched<[schedY.MR]>; } let ExeDomain = SSEPackedSingle in @@ -7109,13 +7106,15 @@ defm VMASKMOVPS : avx_movmask_rm<0x2C, 0x2E, "vmaskmovps", int_x86_avx_maskload_ps, int_x86_avx_maskload_ps_256, int_x86_avx_maskstore_ps, - int_x86_avx_maskstore_ps_256>; + int_x86_avx_maskstore_ps_256, + WriteFMaskMove32, WriteFMaskMove32Y>; let ExeDomain = SSEPackedDouble in defm VMASKMOVPD : avx_movmask_rm<0x2D, 0x2F, "vmaskmovpd", int_x86_avx_maskload_pd, int_x86_avx_maskload_pd_256, int_x86_avx_maskstore_pd, - int_x86_avx_maskstore_pd_256>; + int_x86_avx_maskstore_pd_256, + WriteFMaskMove64, WriteFMaskMove64Y>; //===----------------------------------------------------------------------===// // VPERMIL - Permute Single and Double Floating-Point Values @@ -7143,13 +7142,13 @@ multiclass avx_permil opc_rm, bits<8> opc_rmi, string OpcodeStr, def ri : AVXAIi8, VEX, + [(set RC:$dst, (f_vt (X86VPermilpi RC:$src1, (i8 timm:$src2))))]>, VEX, Sched<[sched]>; def mi : AVXAIi8, VEX, + (f_vt (X86VPermilpi (load addr:$src1), (i8 timm:$src2))))]>, VEX, Sched<[sched.Folded]>; }// Predicates = [HasAVX, NoVLX] } @@ -7181,38 +7180,38 @@ def VPERM2F128rr : AVXAIi8<0x06, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src1, VR256:$src2, u8imm:$src3), "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", [(set VR256:$dst, (v4f64 (X86VPerm2x128 VR256:$src1, VR256:$src2, - (i8 imm:$src3))))]>, VEX_4V, VEX_L, + (i8 timm:$src3))))]>, VEX_4V, VEX_L, Sched<[WriteFShuffle256]>; def VPERM2F128rm : AVXAIi8<0x06, MRMSrcMem, (outs VR256:$dst), (ins VR256:$src1, f256mem:$src2, u8imm:$src3), "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", [(set VR256:$dst, (X86VPerm2x128 VR256:$src1, (loadv4f64 addr:$src2), - (i8 imm:$src3)))]>, VEX_4V, VEX_L, + (i8 timm:$src3)))]>, VEX_4V, VEX_L, Sched<[WriteFShuffle256.Folded, WriteFShuffle256.ReadAfterFold]>; } // Immediate transform to help with commuting. -def Perm2XCommuteImm : SDNodeXFormgetZExtValue() ^ 0x22, SDLoc(N)); }]>; let Predicates = [HasAVX] in { // Pattern with load in other operand. def : Pat<(v4f64 (X86VPerm2x128 (loadv4f64 addr:$src2), - VR256:$src1, (i8 imm:$imm))), - (VPERM2F128rm VR256:$src1, addr:$src2, (Perm2XCommuteImm imm:$imm))>; + VR256:$src1, (i8 timm:$imm))), + (VPERM2F128rm VR256:$src1, addr:$src2, (Perm2XCommuteImm timm:$imm))>; } let Predicates = [HasAVX1Only] in { -def : Pat<(v4i64 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))), - (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>; +def : Pat<(v4i64 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 timm:$imm))), + (VPERM2F128rr VR256:$src1, VR256:$src2, timm:$imm)>; def : Pat<(v4i64 (X86VPerm2x128 VR256:$src1, - (loadv4i64 addr:$src2), (i8 imm:$imm))), - (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>; + (loadv4i64 addr:$src2), (i8 timm:$imm))), + (VPERM2F128rm VR256:$src1, addr:$src2, timm:$imm)>; // Pattern with load in other operand. 
def : Pat<(v4i64 (X86VPerm2x128 (loadv4i64 addr:$src2), - VR256:$src1, (i8 imm:$imm))), - (VPERM2F128rm VR256:$src1, addr:$src2, (Perm2XCommuteImm imm:$imm))>; + VR256:$src1, (i8 timm:$imm))), + (VPERM2F128rm VR256:$src1, addr:$src2, (Perm2XCommuteImm timm:$imm))>; } //===----------------------------------------------------------------------===// @@ -7257,7 +7256,7 @@ multiclass f16c_ps2ph, + [(set VR128:$dst, (X86cvtps2ph RC:$src1, timm:$src2))]>, TAPD, VEX, Sched<[RR]>; let hasSideEffects = 0, mayStore = 1 in def mr : Ii8<0x1D, MRMDestMem, (outs), @@ -7282,15 +7281,15 @@ let Predicates = [HasF16C, NoVLX] in { (VCVTPH2PSrm addr:$src)>; def : Pat<(store (f64 (extractelt - (bc_v2f64 (v8i16 (X86cvtps2ph VR128:$src1, i32:$src2))), + (bc_v2f64 (v8i16 (X86cvtps2ph VR128:$src1, timm:$src2))), (iPTR 0))), addr:$dst), - (VCVTPS2PHmr addr:$dst, VR128:$src1, imm:$src2)>; + (VCVTPS2PHmr addr:$dst, VR128:$src1, timm:$src2)>; def : Pat<(store (i64 (extractelt - (bc_v2i64 (v8i16 (X86cvtps2ph VR128:$src1, i32:$src2))), + (bc_v2i64 (v8i16 (X86cvtps2ph VR128:$src1, timm:$src2))), (iPTR 0))), addr:$dst), - (VCVTPS2PHmr addr:$dst, VR128:$src1, imm:$src2)>; - def : Pat<(store (v8i16 (X86cvtps2ph VR256:$src1, i32:$src2)), addr:$dst), - (VCVTPS2PHYmr addr:$dst, VR256:$src1, imm:$src2)>; + (VCVTPS2PHmr addr:$dst, VR128:$src1, timm:$src2)>; + def : Pat<(store (v8i16 (X86cvtps2ph VR256:$src1, timm:$src2)), addr:$dst), + (VCVTPS2PHYmr addr:$dst, VR256:$src1, timm:$src2)>; } // Patterns for matching conversions from float to half-float and vice versa. @@ -7327,20 +7326,20 @@ multiclass AVX2_blend_rmi opc, string OpcodeStr, SDNode OpNode, (ins RC:$src1, RC:$src2, u8imm:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), - [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, imm:$src3)))]>, + [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, timm:$src3)))]>, Sched<[sched]>, VEX_4V; def rmi : AVX2AIi8, + (OpVT (OpNode RC:$src1, (load addr:$src2), timm:$src3)))]>, Sched<[sched.Folded, sched.ReadAfterFold]>, VEX_4V; // Pattern to commute if load is in first source. 
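Perm2XCommuteImm, defined above for these vperm2f128/vperm2i128 patterns, follows the same idea: imm[1:0] selects the 128-bit half written to the low destination lane and imm[5:4] the half written to the high lane, with bit 1 and bit 5 naming the source operand, so commuting the sources is an XOR with 0x22. A standalone sketch under those (roughly stated) encoding assumptions; the name and checks are illustrative only.

#include <cassert>
#include <cstdint>

// Sketch of Perm2XCommuteImm: flip the operand-select bits of each lane
// selector when the two 256-bit sources are swapped.
static uint8_t commutePerm2x128Imm(uint8_t Imm) {
  return Imm ^ 0x22;
}

int main() {
  // 0x20 takes src1.lo for the low lane and src2.lo for the high lane;
  // with the operands swapped the same shuffle is encoded as 0x02.
  assert(commutePerm2x128Imm(0x20) == 0x02);
  // 0x31 (src1.hi, src2.hi) becomes 0x13 (src2.hi, src1.hi).
  assert(commutePerm2x128Imm(0x31) == 0x13);
  return 0;
}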
- def : Pat<(OpVT (OpNode (load addr:$src2), RC:$src1, imm:$src3)), + def : Pat<(OpVT (OpNode (load addr:$src2), RC:$src1, timm:$src3)), (!cast(NAME#"rmi") RC:$src1, addr:$src2, - (commuteXForm imm:$src3))>; + (commuteXForm timm:$src3))>; } let Predicates = [HasAVX2] in { @@ -7351,19 +7350,19 @@ defm VPBLENDDY : AVX2_blend_rmi<0x02, "vpblendd", X86Blendi, v8i32, SchedWriteBlend.YMM, VR256, i256mem, BlendCommuteImm8>, VEX_L; -def : Pat<(X86Blendi (v4i64 VR256:$src1), (v4i64 VR256:$src2), imm:$src3), - (VPBLENDDYrri VR256:$src1, VR256:$src2, (BlendScaleImm4 imm:$src3))>; -def : Pat<(X86Blendi VR256:$src1, (loadv4i64 addr:$src2), imm:$src3), - (VPBLENDDYrmi VR256:$src1, addr:$src2, (BlendScaleImm4 imm:$src3))>; -def : Pat<(X86Blendi (loadv4i64 addr:$src2), VR256:$src1, imm:$src3), - (VPBLENDDYrmi VR256:$src1, addr:$src2, (BlendScaleCommuteImm4 imm:$src3))>; +def : Pat<(X86Blendi (v4i64 VR256:$src1), (v4i64 VR256:$src2), timm:$src3), + (VPBLENDDYrri VR256:$src1, VR256:$src2, (BlendScaleImm4 timm:$src3))>; +def : Pat<(X86Blendi VR256:$src1, (loadv4i64 addr:$src2), timm:$src3), + (VPBLENDDYrmi VR256:$src1, addr:$src2, (BlendScaleImm4 timm:$src3))>; +def : Pat<(X86Blendi (loadv4i64 addr:$src2), VR256:$src1, timm:$src3), + (VPBLENDDYrmi VR256:$src1, addr:$src2, (BlendScaleCommuteImm4 timm:$src3))>; -def : Pat<(X86Blendi (v2i64 VR128:$src1), (v2i64 VR128:$src2), imm:$src3), - (VPBLENDDrri VR128:$src1, VR128:$src2, (BlendScaleImm2to4 imm:$src3))>; -def : Pat<(X86Blendi VR128:$src1, (loadv2i64 addr:$src2), imm:$src3), - (VPBLENDDrmi VR128:$src1, addr:$src2, (BlendScaleImm2to4 imm:$src3))>; -def : Pat<(X86Blendi (loadv2i64 addr:$src2), VR128:$src1, imm:$src3), - (VPBLENDDrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm2to4 imm:$src3))>; +def : Pat<(X86Blendi (v2i64 VR128:$src1), (v2i64 VR128:$src2), timm:$src3), + (VPBLENDDrri VR128:$src1, VR128:$src2, (BlendScaleImm2to4 timm:$src3))>; +def : Pat<(X86Blendi VR128:$src1, (loadv2i64 addr:$src2), timm:$src3), + (VPBLENDDrmi VR128:$src1, addr:$src2, (BlendScaleImm2to4 timm:$src3))>; +def : Pat<(X86Blendi (loadv2i64 addr:$src2), VR128:$src1, timm:$src3), + (VPBLENDDrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm2to4 timm:$src3))>; } // For insertion into the zero index (low half) of a 256-bit vector, it is @@ -7407,7 +7406,7 @@ def : Pat<(insert_subvector (loadv32i8 addr:$src2), (v16i8 VR128:$src1), (iPTR 0 // destination operand // multiclass avx2_broadcast opc, string OpcodeStr, - X86MemOperand x86memop, PatFrag ld_frag, + X86MemOperand x86memop, PatFrag bcast_frag, ValueType OpVT128, ValueType OpVT256, Predicate prd> { let Predicates = [HasAVX2, prd] in { def rr : AVX28I opc, string OpcodeStr, def rm : AVX28I, + (OpVT128 (bcast_frag addr:$src)))]>, Sched<[SchedWriteShuffle.XMM.Folded]>, VEX; def Yrr : AVX28I opc, string OpcodeStr, def Yrm : AVX28I, + (OpVT256 (bcast_frag addr:$src)))]>, Sched<[SchedWriteShuffle.XMM.Folded]>, VEX, VEX_L; // Provide aliases for broadcast from the same register class that @@ -7439,13 +7438,13 @@ multiclass avx2_broadcast opc, string OpcodeStr, } } -defm VPBROADCASTB : avx2_broadcast<0x78, "vpbroadcastb", i8mem, loadi8, +defm VPBROADCASTB : avx2_broadcast<0x78, "vpbroadcastb", i8mem, X86VBroadcastld8, v16i8, v32i8, NoVLX_Or_NoBWI>; -defm VPBROADCASTW : avx2_broadcast<0x79, "vpbroadcastw", i16mem, loadi16, +defm VPBROADCASTW : avx2_broadcast<0x79, "vpbroadcastw", i16mem, X86VBroadcastld16, v8i16, v16i16, NoVLX_Or_NoBWI>; -defm VPBROADCASTD : avx2_broadcast<0x58, "vpbroadcastd", i32mem, loadi32, +defm VPBROADCASTD : 
avx2_broadcast<0x58, "vpbroadcastd", i32mem, X86VBroadcastld32, v4i32, v8i32, NoVLX>; -defm VPBROADCASTQ : avx2_broadcast<0x59, "vpbroadcastq", i64mem, loadi64, +defm VPBROADCASTQ : avx2_broadcast<0x59, "vpbroadcastq", i64mem, X86VBroadcastld64, v2i64, v4i64, NoVLX>; let Predicates = [HasAVX2, NoVLX] in { @@ -7455,14 +7454,11 @@ let Predicates = [HasAVX2, NoVLX] in { def : Pat<(v4i64 (X86VBroadcast (v2i64 (X86vzload64 addr:$src)))), (VPBROADCASTQYrm addr:$src)>; - def : Pat<(v4i32 (X86VBroadcast (v4i32 (scalar_to_vector (loadi32 addr:$src))))), + // FIXME this is to handle aligned extloads from i8/i16. + def : Pat<(v4i32 (X86VBroadcast (loadi32 addr:$src))), (VPBROADCASTDrm addr:$src)>; - def : Pat<(v8i32 (X86VBroadcast (v4i32 (scalar_to_vector (loadi32 addr:$src))))), + def : Pat<(v8i32 (X86VBroadcast (loadi32 addr:$src))), (VPBROADCASTDYrm addr:$src)>; - def : Pat<(v2i64 (X86VBroadcast (v2i64 (scalar_to_vector (loadi64 addr:$src))))), - (VPBROADCASTQrm addr:$src)>; - def : Pat<(v4i64 (X86VBroadcast (v2i64 (scalar_to_vector (loadi64 addr:$src))))), - (VPBROADCASTQYrm addr:$src)>; } let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in { // loadi16 is tricky to fold, because !isTypeDesirableForOp, justifiably. @@ -7483,17 +7479,12 @@ let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in { def : Pat<(v16i16 (X86VBroadcast (i16 (trunc (i32 (zextloadi16 addr:$src)))))), (VPBROADCASTWYrm addr:$src)>; -} -let Predicates = [HasAVX2, NoVLX] in { - // Provide aliases for broadcast from the same register class that - // automatically does the extract. - def : Pat<(v8f32 (X86VBroadcast (v8f32 VR256:$src))), - (VBROADCASTSSYrr (v4f32 (EXTRACT_SUBREG (v8f32 VR256:$src), - sub_xmm)))>; - def : Pat<(v4f64 (X86VBroadcast (v4f64 VR256:$src))), - (VBROADCASTSDYrr (v2f64 (EXTRACT_SUBREG (v4f64 VR256:$src), - sub_xmm)))>; + // FIXME this is to handle aligned extloads from i8. 
+ def : Pat<(v8i16 (X86VBroadcast (loadi16 addr:$src))), + (VPBROADCASTWrm addr:$src)>; + def : Pat<(v16i16 (X86VBroadcast (loadi16 addr:$src))), + (VPBROADCASTWYrm addr:$src)>; } let Predicates = [HasAVX2, NoVLX] in { @@ -7509,45 +7500,41 @@ let Predicates = [HasAVX2, NoVLX] in { let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in { def : Pat<(v16i8 (X86VBroadcast GR8:$src)), - (VPBROADCASTBrr (v16i8 (COPY_TO_REGCLASS + (VPBROADCASTBrr (VMOVDI2PDIrr (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)), - GR8:$src, sub_8bit)), - VR128)))>; + GR8:$src, sub_8bit))))>; def : Pat<(v32i8 (X86VBroadcast GR8:$src)), - (VPBROADCASTBYrr (v16i8 (COPY_TO_REGCLASS + (VPBROADCASTBYrr (VMOVDI2PDIrr (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)), - GR8:$src, sub_8bit)), - VR128)))>; + GR8:$src, sub_8bit))))>; def : Pat<(v8i16 (X86VBroadcast GR16:$src)), - (VPBROADCASTWrr (v8i16 (COPY_TO_REGCLASS + (VPBROADCASTWrr (VMOVDI2PDIrr (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)), - GR16:$src, sub_16bit)), - VR128)))>; + GR16:$src, sub_16bit))))>; def : Pat<(v16i16 (X86VBroadcast GR16:$src)), - (VPBROADCASTWYrr (v8i16 (COPY_TO_REGCLASS + (VPBROADCASTWYrr (VMOVDI2PDIrr (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)), - GR16:$src, sub_16bit)), - VR128)))>; + GR16:$src, sub_16bit))))>; } let Predicates = [HasAVX2, NoVLX] in { def : Pat<(v4i32 (X86VBroadcast GR32:$src)), - (VPBROADCASTDrr (v4i32 (COPY_TO_REGCLASS GR32:$src, VR128)))>; + (VPBROADCASTDrr (VMOVDI2PDIrr GR32:$src))>; def : Pat<(v8i32 (X86VBroadcast GR32:$src)), - (VPBROADCASTDYrr (v4i32 (COPY_TO_REGCLASS GR32:$src, VR128)))>; + (VPBROADCASTDYrr (VMOVDI2PDIrr GR32:$src))>; def : Pat<(v2i64 (X86VBroadcast GR64:$src)), - (VPBROADCASTQrr (v2i64 (COPY_TO_REGCLASS GR64:$src, VR128)))>; + (VPBROADCASTQrr (VMOV64toPQIrr GR64:$src))>; def : Pat<(v4i64 (X86VBroadcast GR64:$src)), - (VPBROADCASTQYrr (v2i64 (COPY_TO_REGCLASS GR64:$src, VR128)))>; + (VPBROADCASTQYrr (VMOV64toPQIrr GR64:$src))>; } // AVX1 broadcast patterns let Predicates = [HasAVX1Only] in { -def : Pat<(v8i32 (X86VBroadcast (loadi32 addr:$src))), +def : Pat<(v8i32 (X86VBroadcastld32 addr:$src)), (VBROADCASTSSYrm addr:$src)>; -def : Pat<(v4i64 (X86VBroadcast (loadi64 addr:$src))), +def : Pat<(v4i64 (X86VBroadcastld64 addr:$src)), (VBROADCASTSDYrm addr:$src)>; -def : Pat<(v4i32 (X86VBroadcast (loadi32 addr:$src))), +def : Pat<(v4i32 (X86VBroadcastld32 addr:$src)), (VBROADCASTSSrm addr:$src)>; } @@ -7557,12 +7544,12 @@ let Predicates = [HasAVX, NoVLX] in { // 128bit broadcasts: def : Pat<(v2f64 (X86VBroadcast f64:$src)), (VMOVDDUPrr (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))>; - def : Pat<(v2f64 (X86VBroadcast (loadf64 addr:$src))), + def : Pat<(v2f64 (X86VBroadcastld64 addr:$src)), (VMOVDDUPrm addr:$src)>; def : Pat<(v2f64 (X86VBroadcast v2f64:$src)), (VMOVDDUPrr VR128:$src)>; - def : Pat<(v2f64 (X86VBroadcast (loadv2f64 addr:$src))), + def : Pat<(v2f64 (X86VBroadcast (v2f64 (simple_load addr:$src)))), (VMOVDDUPrm addr:$src)>; def : Pat<(v2f64 (X86VBroadcast (v2f64 (X86vzload64 addr:$src)))), (VMOVDDUPrm addr:$src)>; @@ -7581,19 +7568,19 @@ let Predicates = [HasAVX1Only] in { (v2f64 (VMOVDDUPrr (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))), 1)>; def : Pat<(v4i32 (X86VBroadcast GR32:$src)), - (VPSHUFDri (v4i32 (COPY_TO_REGCLASS GR32:$src, VR128)), 0)>; + (VPSHUFDri (VMOVDI2PDIrr GR32:$src), 0)>; def : Pat<(v8i32 (X86VBroadcast GR32:$src)), (VINSERTF128rr (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), - (v4i32 (VPSHUFDri (v4i32 (COPY_TO_REGCLASS GR32:$src, VR128)), 0)), sub_xmm), - (v4i32 (VPSHUFDri (v4i32 (COPY_TO_REGCLASS GR32:$src, VR128)), 
0)), 1)>; + (v4i32 (VPSHUFDri (VMOVDI2PDIrr GR32:$src), 0)), sub_xmm), + (v4i32 (VPSHUFDri (VMOVDI2PDIrr GR32:$src), 0)), 1)>; def : Pat<(v4i64 (X86VBroadcast GR64:$src)), (VINSERTF128rr (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)), - (v4i32 (VPSHUFDri (v4i32 (COPY_TO_REGCLASS GR64:$src, VR128)), 0x44)), sub_xmm), - (v4i32 (VPSHUFDri (v4i32 (COPY_TO_REGCLASS GR64:$src, VR128)), 0x44)), 1)>; + (v4i32 (VPSHUFDri (VMOV64toPQIrr GR64:$src), 0x44)), sub_xmm), + (v4i32 (VPSHUFDri (VMOV64toPQIrr GR64:$src), 0x44)), 1)>; def : Pat<(v2i64 (X86VBroadcast i64:$src)), - (VPSHUFDri (v4i32 (COPY_TO_REGCLASS GR64:$src, VR128)), 0x44)>; - def : Pat<(v2i64 (X86VBroadcast (loadi64 addr:$src))), + (VPSHUFDri (VMOV64toPQIrr GR64:$src), 0x44)>; + def : Pat<(v2i64 (X86VBroadcastld64 addr:$src)), (VMOVDDUPrm addr:$src)>; } @@ -7636,7 +7623,7 @@ multiclass avx2_perm_imm opc, string OpcodeStr, PatFrag mem_frag, !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR256:$dst, - (OpVT (X86VPermi VR256:$src1, (i8 imm:$src2))))]>, + (OpVT (X86VPermi VR256:$src1, (i8 timm:$src2))))]>, Sched<[Sched]>, VEX, VEX_L; def Ymi : AVX2AIi8 opc, string OpcodeStr, PatFrag mem_frag, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR256:$dst, (OpVT (X86VPermi (mem_frag addr:$src1), - (i8 imm:$src2))))]>, + (i8 timm:$src2))))]>, Sched<[Sched.Folded, Sched.ReadAfterFold]>, VEX, VEX_L; } } @@ -7663,19 +7650,19 @@ def VPERM2I128rr : AVX2AIi8<0x46, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src1, VR256:$src2, u8imm:$src3), "vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", [(set VR256:$dst, (v4i64 (X86VPerm2x128 VR256:$src1, VR256:$src2, - (i8 imm:$src3))))]>, Sched<[WriteShuffle256]>, + (i8 timm:$src3))))]>, Sched<[WriteShuffle256]>, VEX_4V, VEX_L; def VPERM2I128rm : AVX2AIi8<0x46, MRMSrcMem, (outs VR256:$dst), (ins VR256:$src1, f256mem:$src2, u8imm:$src3), "vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", [(set VR256:$dst, (X86VPerm2x128 VR256:$src1, (loadv4i64 addr:$src2), - (i8 imm:$src3)))]>, + (i8 timm:$src3)))]>, Sched<[WriteShuffle256.Folded, WriteShuffle256.ReadAfterFold]>, VEX_4V, VEX_L; let Predicates = [HasAVX2] in def : Pat<(v4i64 (X86VPerm2x128 (loadv4i64 addr:$src2), - VR256:$src1, (i8 imm:$imm))), - (VPERM2I128rm VR256:$src1, addr:$src2, (Perm2XCommuteImm imm:$imm))>; + VR256:$src1, (i8 timm:$imm))), + (VPERM2I128rm VR256:$src1, addr:$src2, (Perm2XCommuteImm timm:$imm))>; //===----------------------------------------------------------------------===// @@ -7760,7 +7747,7 @@ defm VPMASKMOVQ : avx2_pmovmask<"vpmaskmovq", int_x86_avx2_maskstore_q_256>, VEX_W; multiclass maskmov_lowering { + ValueType MaskVT> { // masked store def: Pat<(masked_store (VT RC:$src), addr:$ptr, (MaskVT RC:$mask)), (!cast(InstrStr#"mr") addr:$ptr, RC:$mask, RC:$src)>; @@ -7772,23 +7759,23 @@ multiclass maskmov_lowering(InstrStr#"rm") RC:$mask, addr:$ptr)>; } let Predicates = [HasAVX] in { - defm : maskmov_lowering<"VMASKMOVPS", VR128, v4f32, v4i32, "VBLENDVPS", v4i32>; - defm : maskmov_lowering<"VMASKMOVPD", VR128, v2f64, v2i64, "VBLENDVPD", v4i32>; - defm : maskmov_lowering<"VMASKMOVPSY", VR256, v8f32, v8i32, "VBLENDVPSY", v8i32>; - defm : maskmov_lowering<"VMASKMOVPDY", VR256, v4f64, v4i64, "VBLENDVPDY", v8i32>; + defm : maskmov_lowering<"VMASKMOVPS", VR128, v4f32, v4i32>; + defm : maskmov_lowering<"VMASKMOVPD", VR128, v2f64, v2i64>; + defm : maskmov_lowering<"VMASKMOVPSY", VR256, v8f32, v8i32>; + defm : maskmov_lowering<"VMASKMOVPDY", VR256, v4f64, v4i64>; } let Predicates = [HasAVX1Only] in 
{ // load/store i32/i64 not supported use ps/pd version - defm : maskmov_lowering<"VMASKMOVPSY", VR256, v8i32, v8i32, "VBLENDVPSY", v8i32>; - defm : maskmov_lowering<"VMASKMOVPDY", VR256, v4i64, v4i64, "VBLENDVPDY", v8i32>; - defm : maskmov_lowering<"VMASKMOVPS", VR128, v4i32, v4i32, "VBLENDVPS", v4i32>; - defm : maskmov_lowering<"VMASKMOVPD", VR128, v2i64, v2i64, "VBLENDVPD", v4i32>; + defm : maskmov_lowering<"VMASKMOVPSY", VR256, v8i32, v8i32>; + defm : maskmov_lowering<"VMASKMOVPDY", VR256, v4i64, v4i64>; + defm : maskmov_lowering<"VMASKMOVPS", VR128, v4i32, v4i32>; + defm : maskmov_lowering<"VMASKMOVPD", VR128, v2i64, v2i64>; } let Predicates = [HasAVX2] in { - defm : maskmov_lowering<"VPMASKMOVDY", VR256, v8i32, v8i32, "VBLENDVPSY", v8i32>; - defm : maskmov_lowering<"VPMASKMOVQY", VR256, v4i64, v4i64, "VBLENDVPDY", v8i32>; - defm : maskmov_lowering<"VPMASKMOVD", VR128, v4i32, v4i32, "VBLENDVPS", v4i32>; - defm : maskmov_lowering<"VPMASKMOVQ", VR128, v2i64, v2i64, "VBLENDVPD", v4i32>; + defm : maskmov_lowering<"VPMASKMOVDY", VR256, v8i32, v8i32>; + defm : maskmov_lowering<"VPMASKMOVQY", VR256, v4i64, v4i64>; + defm : maskmov_lowering<"VPMASKMOVD", VR128, v4i32, v4i32>; + defm : maskmov_lowering<"VPMASKMOVQ", VR128, v2i64, v2i64>; } //===----------------------------------------------------------------------===// @@ -7956,13 +7943,13 @@ multiclass GF2P8AFFINE_rmi Op, string OpStr, ValueType OpVT, OpStr##"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}") in { def rri : Ii8, Sched<[SchedWriteVecALU.XMM]>; def rmi : Ii8, + timm:$src3)))], SSEPackedInt>, Sched<[SchedWriteVecALU.XMM.Folded, SchedWriteVecALU.XMM.ReadAfterFold]>; } } diff --git a/lib/Target/X86/X86InstrSystem.td b/lib/Target/X86/X86InstrSystem.td index 7050e1917494..7f41feb6c0d9 100644 --- a/lib/Target/X86/X86InstrSystem.td +++ b/lib/Target/X86/X86InstrSystem.td @@ -43,7 +43,7 @@ def INT3 : I<0xcc, RawFrm, (outs), (ins), "int3", [(int_x86_int (i8 3))]>; let SchedRW = [WriteSystem] in { def INT : Ii8<0xcd, RawFrm, (outs), (ins u8imm:$trap), "int\t$trap", - [(int_x86_int imm:$trap)]>; + [(int_x86_int timm:$trap)]>; def SYSCALL : I<0x05, RawFrm, (outs), (ins), "syscall", []>, TB; diff --git a/lib/Target/X86/X86InstrTSX.td b/lib/Target/X86/X86InstrTSX.td index fc0da845299f..3a1212342a13 100644 --- a/lib/Target/X86/X86InstrTSX.td +++ b/lib/Target/X86/X86InstrTSX.td @@ -45,7 +45,7 @@ def XTEST : I<0x01, MRM_D6, (outs), (ins), def XABORT : Ii8<0xc6, MRM_F8, (outs), (ins i8imm:$imm), "xabort\t$imm", - [(int_x86_xabort imm:$imm)]>, Requires<[HasRTM]>; + [(int_x86_xabort timm:$imm)]>, Requires<[HasRTM]>; } // SchedRW // HLE prefixes diff --git a/lib/Target/X86/X86InstrXOP.td b/lib/Target/X86/X86InstrXOP.td index 66ca78556b82..229af366d940 100644 --- a/lib/Target/X86/X86InstrXOP.td +++ b/lib/Target/X86/X86InstrXOP.td @@ -143,13 +143,13 @@ multiclass xop3opimm opc, string OpcodeStr, SDNode OpNode, (ins VR128:$src1, u8imm:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR128:$dst, - (vt128 (OpNode (vt128 VR128:$src1), imm:$src2)))]>, + (vt128 (OpNode (vt128 VR128:$src1), timm:$src2)))]>, XOP, Sched<[sched]>; def mi : IXOPi8, + (vt128 (OpNode (vt128 (load addr:$src1)), timm:$src2)))]>, XOP, Sched<[sched.Folded, sched.ReadAfterFold]>; } @@ -251,7 +251,7 @@ multiclass xopvpcom opc, string Suffix, SDNode OpNode, ValueType vt128, "\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}"), [(set VR128:$dst, (vt128 (OpNode (vt128 VR128:$src1), (vt128 VR128:$src2), - imm:$cc)))]>, + timm:$cc)))]>, XOP_4V, 
Sched<[sched]>; def mi : IXOPi8 opc, string Suffix, SDNode OpNode, ValueType vt128, [(set VR128:$dst, (vt128 (OpNode (vt128 VR128:$src1), (vt128 (load addr:$src2)), - imm:$cc)))]>, + timm:$cc)))]>, XOP_4V, Sched<[sched.Folded, sched.ReadAfterFold]>; } def : Pat<(OpNode (load addr:$src2), - (vt128 VR128:$src1), imm:$cc), + (vt128 VR128:$src1), timm:$cc), (!cast(NAME#"mi") VR128:$src1, addr:$src2, - (CommuteVPCOMCC imm:$cc))>; + (CommuteVPCOMCC timm:$cc))>; } defm VPCOMB : xopvpcom<0xCC, "b", X86vpcom, v16i8, SchedWriteVecALU.XMM>; @@ -418,27 +418,27 @@ multiclass xop_vpermil2 Opc, string OpcodeStr, RegisterClass RC, ValueType VT, PatFrag FPLdFrag, PatFrag IntLdFrag, X86FoldableSchedWrite sched> { def rr : IXOP5, + (VT (X86vpermil2 RC:$src1, RC:$src2, RC:$src3, (i8 timm:$src4))))]>, Sched<[sched]>; def rm : IXOP5, VEX_W, + (i8 timm:$src4))))]>, VEX_W, Sched<[sched.Folded, sched.ReadAfterFold, sched.ReadAfterFold]>; def mr : IXOP5, + RC:$src3, (i8 timm:$src4))))]>, Sched<[sched.Folded, sched.ReadAfterFold, // fpmemop:$src2 ReadDefault, ReadDefault, ReadDefault, ReadDefault, ReadDefault, @@ -447,7 +447,7 @@ multiclass xop_vpermil2 Opc, string OpcodeStr, RegisterClass RC, // For disassembler let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in def rr_REV : IXOP5, VEX_W, Sched<[sched]>, FoldGenData; diff --git a/lib/Target/X86/X86InstructionSelector.cpp b/lib/Target/X86/X86InstructionSelector.cpp index 892a083f4d1a..01620b7b64c9 100644 --- a/lib/Target/X86/X86InstructionSelector.cpp +++ b/lib/Target/X86/X86InstructionSelector.cpp @@ -60,7 +60,7 @@ public: X86InstructionSelector(const X86TargetMachine &TM, const X86Subtarget &STI, const X86RegisterBankInfo &RBI); - bool select(MachineInstr &I, CodeGenCoverage &CoverageInfo) const override; + bool select(MachineInstr &I) override; static const char *getName() { return DEBUG_TYPE; } private: @@ -94,11 +94,9 @@ private: MachineFunction &MF) const; bool selectCopy(MachineInstr &I, MachineRegisterInfo &MRI) const; bool selectUnmergeValues(MachineInstr &I, MachineRegisterInfo &MRI, - MachineFunction &MF, - CodeGenCoverage &CoverageInfo) const; + MachineFunction &MF); bool selectMergeValues(MachineInstr &I, MachineRegisterInfo &MRI, - MachineFunction &MF, - CodeGenCoverage &CoverageInfo) const; + MachineFunction &MF); bool selectInsert(MachineInstr &I, MachineRegisterInfo &MRI, MachineFunction &MF) const; bool selectExtract(MachineInstr &I, MachineRegisterInfo &MRI, @@ -217,7 +215,7 @@ static unsigned getSubRegIndex(const TargetRegisterClass *RC) { } static const TargetRegisterClass *getRegClassFromGRPhysReg(unsigned Reg) { - assert(TargetRegisterInfo::isPhysicalRegister(Reg)); + assert(Register::isPhysicalRegister(Reg)); if (X86::GR64RegClass.contains(Reg)) return &X86::GR64RegClass; if (X86::GR32RegClass.contains(Reg)) @@ -233,15 +231,15 @@ static const TargetRegisterClass *getRegClassFromGRPhysReg(unsigned Reg) { // Set X86 Opcode and constrain DestReg. 
bool X86InstructionSelector::selectCopy(MachineInstr &I, MachineRegisterInfo &MRI) const { - unsigned DstReg = I.getOperand(0).getReg(); + Register DstReg = I.getOperand(0).getReg(); const unsigned DstSize = RBI.getSizeInBits(DstReg, MRI, TRI); const RegisterBank &DstRegBank = *RBI.getRegBank(DstReg, MRI, TRI); - unsigned SrcReg = I.getOperand(1).getReg(); + Register SrcReg = I.getOperand(1).getReg(); const unsigned SrcSize = RBI.getSizeInBits(SrcReg, MRI, TRI); const RegisterBank &SrcRegBank = *RBI.getRegBank(SrcReg, MRI, TRI); - if (TargetRegisterInfo::isPhysicalRegister(DstReg)) { + if (Register::isPhysicalRegister(DstReg)) { assert(I.isCopy() && "Generic operators do not allow physical registers"); if (DstSize > SrcSize && SrcRegBank.getID() == X86::GPRRegBankID && @@ -253,7 +251,7 @@ bool X86InstructionSelector::selectCopy(MachineInstr &I, if (SrcRC != DstRC) { // This case can be generated by ABI lowering, performe anyext - unsigned ExtSrc = MRI.createVirtualRegister(DstRC); + Register ExtSrc = MRI.createVirtualRegister(DstRC); BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(TargetOpcode::SUBREG_TO_REG)) .addDef(ExtSrc) @@ -268,12 +266,12 @@ bool X86InstructionSelector::selectCopy(MachineInstr &I, return true; } - assert((!TargetRegisterInfo::isPhysicalRegister(SrcReg) || I.isCopy()) && + assert((!Register::isPhysicalRegister(SrcReg) || I.isCopy()) && "No phys reg on generic operators"); assert((DstSize == SrcSize || // Copies are a mean to setup initial types, the number of // bits may not exactly match. - (TargetRegisterInfo::isPhysicalRegister(SrcReg) && + (Register::isPhysicalRegister(SrcReg) && DstSize <= RBI.getSizeInBits(SrcReg, MRI, TRI))) && "Copy with different width?!"); @@ -282,7 +280,7 @@ bool X86InstructionSelector::selectCopy(MachineInstr &I, if (SrcRegBank.getID() == X86::GPRRegBankID && DstRegBank.getID() == X86::GPRRegBankID && SrcSize > DstSize && - TargetRegisterInfo::isPhysicalRegister(SrcReg)) { + Register::isPhysicalRegister(SrcReg)) { // Change the physical register to performe truncate. 
const TargetRegisterClass *SrcRC = getRegClassFromGRPhysReg(SrcReg); @@ -308,8 +306,7 @@ bool X86InstructionSelector::selectCopy(MachineInstr &I, return true; } -bool X86InstructionSelector::select(MachineInstr &I, - CodeGenCoverage &CoverageInfo) const { +bool X86InstructionSelector::select(MachineInstr &I) { assert(I.getParent() && "Instruction should be in a basic block!"); assert(I.getParent()->getParent() && "Instruction should be in a function!"); @@ -333,7 +330,7 @@ bool X86InstructionSelector::select(MachineInstr &I, assert(I.getNumOperands() == I.getNumExplicitOperands() && "Generic instruction has unexpected implicit operands\n"); - if (selectImpl(I, CoverageInfo)) + if (selectImpl(I, *CoverageInfo)) return true; LLVM_DEBUG(dbgs() << " C++ instruction selection: "; I.print(dbgs())); @@ -370,10 +367,10 @@ bool X86InstructionSelector::select(MachineInstr &I, case TargetOpcode::G_UADDE: return selectUadde(I, MRI, MF); case TargetOpcode::G_UNMERGE_VALUES: - return selectUnmergeValues(I, MRI, MF, CoverageInfo); + return selectUnmergeValues(I, MRI, MF); case TargetOpcode::G_MERGE_VALUES: case TargetOpcode::G_CONCAT_VECTORS: - return selectMergeValues(I, MRI, MF, CoverageInfo); + return selectMergeValues(I, MRI, MF); case TargetOpcode::G_EXTRACT: return selectExtract(I, MRI, MF); case TargetOpcode::G_INSERT: @@ -512,7 +509,7 @@ bool X86InstructionSelector::selectLoadStoreOp(MachineInstr &I, assert((Opc == TargetOpcode::G_STORE || Opc == TargetOpcode::G_LOAD) && "unexpected instruction"); - const unsigned DefReg = I.getOperand(0).getReg(); + const Register DefReg = I.getOperand(0).getReg(); LLT Ty = MRI.getType(DefReg); const RegisterBank &RB = *RBI.getRegBank(DefReg, MRI, TRI); @@ -572,7 +569,7 @@ bool X86InstructionSelector::selectFrameIndexOrGep(MachineInstr &I, assert((Opc == TargetOpcode::G_FRAME_INDEX || Opc == TargetOpcode::G_GEP) && "unexpected instruction"); - const unsigned DefReg = I.getOperand(0).getReg(); + const Register DefReg = I.getOperand(0).getReg(); LLT Ty = MRI.getType(DefReg); // Use LEA to calculate frame index and GEP @@ -625,7 +622,7 @@ bool X86InstructionSelector::selectGlobalValue(MachineInstr &I, AM.Base.Reg = X86::RIP; } - const unsigned DefReg = I.getOperand(0).getReg(); + const Register DefReg = I.getOperand(0).getReg(); LLT Ty = MRI.getType(DefReg); unsigned NewOpc = getLeaOP(Ty, STI); @@ -644,7 +641,7 @@ bool X86InstructionSelector::selectConstant(MachineInstr &I, assert((I.getOpcode() == TargetOpcode::G_CONSTANT) && "unexpected instruction"); - const unsigned DefReg = I.getOperand(0).getReg(); + const Register DefReg = I.getOperand(0).getReg(); LLT Ty = MRI.getType(DefReg); if (RBI.getRegBank(DefReg, MRI, TRI)->getID() != X86::GPRRegBankID) @@ -717,8 +714,8 @@ bool X86InstructionSelector::selectTruncOrPtrToInt(MachineInstr &I, I.getOpcode() == TargetOpcode::G_PTRTOINT) && "unexpected instruction"); - const unsigned DstReg = I.getOperand(0).getReg(); - const unsigned SrcReg = I.getOperand(1).getReg(); + const Register DstReg = I.getOperand(0).getReg(); + const Register SrcReg = I.getOperand(1).getReg(); const LLT DstTy = MRI.getType(DstReg); const LLT SrcTy = MRI.getType(SrcReg); @@ -781,8 +778,8 @@ bool X86InstructionSelector::selectZext(MachineInstr &I, MachineFunction &MF) const { assert((I.getOpcode() == TargetOpcode::G_ZEXT) && "unexpected instruction"); - const unsigned DstReg = I.getOperand(0).getReg(); - const unsigned SrcReg = I.getOperand(1).getReg(); + const Register DstReg = I.getOperand(0).getReg(); + const Register SrcReg = 
I.getOperand(1).getReg(); const LLT DstTy = MRI.getType(DstReg); const LLT SrcTy = MRI.getType(SrcReg); @@ -892,8 +889,8 @@ bool X86InstructionSelector::selectAnyext(MachineInstr &I, MachineFunction &MF) const { assert((I.getOpcode() == TargetOpcode::G_ANYEXT) && "unexpected instruction"); - const unsigned DstReg = I.getOperand(0).getReg(); - const unsigned SrcReg = I.getOperand(1).getReg(); + const Register DstReg = I.getOperand(0).getReg(); + const Register SrcReg = I.getOperand(1).getReg(); const LLT DstTy = MRI.getType(DstReg); const LLT SrcTy = MRI.getType(SrcReg); @@ -952,8 +949,8 @@ bool X86InstructionSelector::selectCmp(MachineInstr &I, std::tie(CC, SwapArgs) = X86::getX86ConditionCode( (CmpInst::Predicate)I.getOperand(1).getPredicate()); - unsigned LHS = I.getOperand(2).getReg(); - unsigned RHS = I.getOperand(3).getReg(); + Register LHS = I.getOperand(2).getReg(); + Register RHS = I.getOperand(3).getReg(); if (SwapArgs) std::swap(LHS, RHS); @@ -998,8 +995,8 @@ bool X86InstructionSelector::selectFCmp(MachineInstr &I, MachineFunction &MF) const { assert((I.getOpcode() == TargetOpcode::G_FCMP) && "unexpected instruction"); - unsigned LhsReg = I.getOperand(2).getReg(); - unsigned RhsReg = I.getOperand(3).getReg(); + Register LhsReg = I.getOperand(2).getReg(); + Register RhsReg = I.getOperand(3).getReg(); CmpInst::Predicate Predicate = (CmpInst::Predicate)I.getOperand(1).getPredicate(); @@ -1033,7 +1030,7 @@ bool X86InstructionSelector::selectFCmp(MachineInstr &I, break; } - unsigned ResultReg = I.getOperand(0).getReg(); + Register ResultReg = I.getOperand(0).getReg(); RBI.constrainGenericRegister( ResultReg, *getRegClass(LLT::scalar(8), *RBI.getRegBank(ResultReg, MRI, TRI)), MRI); @@ -1043,8 +1040,8 @@ bool X86InstructionSelector::selectFCmp(MachineInstr &I, .addReg(LhsReg) .addReg(RhsReg); - unsigned FlagReg1 = MRI.createVirtualRegister(&X86::GR8RegClass); - unsigned FlagReg2 = MRI.createVirtualRegister(&X86::GR8RegClass); + Register FlagReg1 = MRI.createVirtualRegister(&X86::GR8RegClass); + Register FlagReg2 = MRI.createVirtualRegister(&X86::GR8RegClass); MachineInstr &Set1 = *BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(X86::SETCCr), FlagReg1).addImm(SETFOpc[0]); MachineInstr &Set2 = *BuildMI(*I.getParent(), I, I.getDebugLoc(), @@ -1089,11 +1086,11 @@ bool X86InstructionSelector::selectUadde(MachineInstr &I, MachineFunction &MF) const { assert((I.getOpcode() == TargetOpcode::G_UADDE) && "unexpected instruction"); - const unsigned DstReg = I.getOperand(0).getReg(); - const unsigned CarryOutReg = I.getOperand(1).getReg(); - const unsigned Op0Reg = I.getOperand(2).getReg(); - const unsigned Op1Reg = I.getOperand(3).getReg(); - unsigned CarryInReg = I.getOperand(4).getReg(); + const Register DstReg = I.getOperand(0).getReg(); + const Register CarryOutReg = I.getOperand(1).getReg(); + const Register Op0Reg = I.getOperand(2).getReg(); + const Register Op1Reg = I.getOperand(3).getReg(); + Register CarryInReg = I.getOperand(4).getReg(); const LLT DstTy = MRI.getType(DstReg); @@ -1149,8 +1146,8 @@ bool X86InstructionSelector::selectExtract(MachineInstr &I, assert((I.getOpcode() == TargetOpcode::G_EXTRACT) && "unexpected instruction"); - const unsigned DstReg = I.getOperand(0).getReg(); - const unsigned SrcReg = I.getOperand(1).getReg(); + const Register DstReg = I.getOperand(0).getReg(); + const Register SrcReg = I.getOperand(1).getReg(); int64_t Index = I.getOperand(2).getImm(); const LLT DstTy = MRI.getType(DstReg); @@ -1281,9 +1278,9 @@ bool 
X86InstructionSelector::selectInsert(MachineInstr &I, MachineFunction &MF) const { assert((I.getOpcode() == TargetOpcode::G_INSERT) && "unexpected instruction"); - const unsigned DstReg = I.getOperand(0).getReg(); - const unsigned SrcReg = I.getOperand(1).getReg(); - const unsigned InsertReg = I.getOperand(2).getReg(); + const Register DstReg = I.getOperand(0).getReg(); + const Register SrcReg = I.getOperand(1).getReg(); + const Register InsertReg = I.getOperand(2).getReg(); int64_t Index = I.getOperand(3).getImm(); const LLT DstTy = MRI.getType(DstReg); @@ -1335,14 +1332,13 @@ bool X86InstructionSelector::selectInsert(MachineInstr &I, } bool X86InstructionSelector::selectUnmergeValues( - MachineInstr &I, MachineRegisterInfo &MRI, MachineFunction &MF, - CodeGenCoverage &CoverageInfo) const { + MachineInstr &I, MachineRegisterInfo &MRI, MachineFunction &MF) { assert((I.getOpcode() == TargetOpcode::G_UNMERGE_VALUES) && "unexpected instruction"); // Split to extracts. unsigned NumDefs = I.getNumOperands() - 1; - unsigned SrcReg = I.getOperand(NumDefs).getReg(); + Register SrcReg = I.getOperand(NumDefs).getReg(); unsigned DefSize = MRI.getType(I.getOperand(0).getReg()).getSizeInBits(); for (unsigned Idx = 0; Idx < NumDefs; ++Idx) { @@ -1352,7 +1348,7 @@ bool X86InstructionSelector::selectUnmergeValues( .addReg(SrcReg) .addImm(Idx * DefSize); - if (!select(ExtrInst, CoverageInfo)) + if (!select(ExtrInst)) return false; } @@ -1361,15 +1357,14 @@ bool X86InstructionSelector::selectUnmergeValues( } bool X86InstructionSelector::selectMergeValues( - MachineInstr &I, MachineRegisterInfo &MRI, MachineFunction &MF, - CodeGenCoverage &CoverageInfo) const { + MachineInstr &I, MachineRegisterInfo &MRI, MachineFunction &MF) { assert((I.getOpcode() == TargetOpcode::G_MERGE_VALUES || I.getOpcode() == TargetOpcode::G_CONCAT_VECTORS) && "unexpected instruction"); // Split to inserts. - unsigned DstReg = I.getOperand(0).getReg(); - unsigned SrcReg0 = I.getOperand(1).getReg(); + Register DstReg = I.getOperand(0).getReg(); + Register SrcReg0 = I.getOperand(1).getReg(); const LLT DstTy = MRI.getType(DstReg); const LLT SrcTy = MRI.getType(SrcReg0); @@ -1378,13 +1373,13 @@ bool X86InstructionSelector::selectMergeValues( const RegisterBank &RegBank = *RBI.getRegBank(DstReg, MRI, TRI); // For the first src use insertSubReg. 
- unsigned DefReg = MRI.createGenericVirtualRegister(DstTy); + Register DefReg = MRI.createGenericVirtualRegister(DstTy); MRI.setRegBank(DefReg, RegBank); if (!emitInsertSubreg(DefReg, I.getOperand(1).getReg(), I, MRI, MF)) return false; for (unsigned Idx = 2; Idx < I.getNumOperands(); ++Idx) { - unsigned Tmp = MRI.createGenericVirtualRegister(DstTy); + Register Tmp = MRI.createGenericVirtualRegister(DstTy); MRI.setRegBank(Tmp, RegBank); MachineInstr &InsertInst = *BuildMI(*I.getParent(), I, I.getDebugLoc(), @@ -1395,7 +1390,7 @@ bool X86InstructionSelector::selectMergeValues( DefReg = Tmp; - if (!select(InsertInst, CoverageInfo)) + if (!select(InsertInst)) return false; } @@ -1403,7 +1398,7 @@ bool X86InstructionSelector::selectMergeValues( TII.get(TargetOpcode::COPY), DstReg) .addReg(DefReg); - if (!select(CopyInst, CoverageInfo)) + if (!select(CopyInst)) return false; I.eraseFromParent(); @@ -1415,7 +1410,7 @@ bool X86InstructionSelector::selectCondBranch(MachineInstr &I, MachineFunction &MF) const { assert((I.getOpcode() == TargetOpcode::G_BRCOND) && "unexpected instruction"); - const unsigned CondReg = I.getOperand(0).getReg(); + const Register CondReg = I.getOperand(0).getReg(); MachineBasicBlock *DestMBB = I.getOperand(1).getMBB(); MachineInstr &TestInst = @@ -1442,7 +1437,7 @@ bool X86InstructionSelector::materializeFP(MachineInstr &I, if (CM != CodeModel::Small && CM != CodeModel::Large) return false; - const unsigned DstReg = I.getOperand(0).getReg(); + const Register DstReg = I.getOperand(0).getReg(); const LLT DstTy = MRI.getType(DstReg); const RegisterBank &RegBank = *RBI.getRegBank(DstReg, MRI, TRI); unsigned Align = DstTy.getSizeInBits(); @@ -1460,7 +1455,7 @@ bool X86InstructionSelector::materializeFP(MachineInstr &I, // Under X86-64 non-small code model, GV (and friends) are 64-bits, so // they cannot be folded into immediate fields. 
- unsigned AddrReg = MRI.createVirtualRegister(&X86::GR64RegClass); + Register AddrReg = MRI.createVirtualRegister(&X86::GR64RegClass); BuildMI(*I.getParent(), I, DbgLoc, TII.get(X86::MOV64ri), AddrReg) .addConstantPoolIndex(CPI, 0, OpFlag); @@ -1503,7 +1498,7 @@ bool X86InstructionSelector::selectImplicitDefOrPHI( I.getOpcode() == TargetOpcode::G_PHI) && "unexpected instruction"); - unsigned DstReg = I.getOperand(0).getReg(); + Register DstReg = I.getOperand(0).getReg(); if (!MRI.getRegClassOrNull(DstReg)) { const LLT DstTy = MRI.getType(DstReg); @@ -1537,7 +1532,7 @@ bool X86InstructionSelector::selectShift(MachineInstr &I, I.getOpcode() == TargetOpcode::G_LSHR) && "unexpected instruction"); - unsigned DstReg = I.getOperand(0).getReg(); + Register DstReg = I.getOperand(0).getReg(); const LLT DstTy = MRI.getType(DstReg); const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI); @@ -1578,8 +1573,8 @@ bool X86InstructionSelector::selectShift(MachineInstr &I, return false; } - unsigned Op0Reg = I.getOperand(1).getReg(); - unsigned Op1Reg = I.getOperand(2).getReg(); + Register Op0Reg = I.getOperand(1).getReg(); + Register Op1Reg = I.getOperand(2).getReg(); assert(MRI.getType(Op1Reg).getSizeInBits() == 8); @@ -1606,9 +1601,9 @@ bool X86InstructionSelector::selectDivRem(MachineInstr &I, I.getOpcode() == TargetOpcode::G_UREM) && "unexpected instruction"); - const unsigned DstReg = I.getOperand(0).getReg(); - const unsigned Op1Reg = I.getOperand(1).getReg(); - const unsigned Op2Reg = I.getOperand(2).getReg(); + const Register DstReg = I.getOperand(0).getReg(); + const Register Op1Reg = I.getOperand(1).getReg(); + const Register Op2Reg = I.getOperand(2).getReg(); const LLT RegTy = MRI.getType(DstReg); assert(RegTy == MRI.getType(Op1Reg) && RegTy == MRI.getType(Op2Reg) && @@ -1732,7 +1727,7 @@ bool X86InstructionSelector::selectDivRem(MachineInstr &I, BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(OpEntry.OpSignExtend)); else { - unsigned Zero32 = MRI.createVirtualRegister(&X86::GR32RegClass); + Register Zero32 = MRI.createVirtualRegister(&X86::GR32RegClass); BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(X86::MOV32r0), Zero32); @@ -1770,8 +1765,8 @@ bool X86InstructionSelector::selectDivRem(MachineInstr &I, if ((I.getOpcode() == Instruction::SRem || I.getOpcode() == Instruction::URem) && OpEntry.DivRemResultReg == X86::AH && STI.is64Bit()) { - unsigned SourceSuperReg = MRI.createVirtualRegister(&X86::GR16RegClass); - unsigned ResultSuperReg = MRI.createVirtualRegister(&X86::GR16RegClass); + Register SourceSuperReg = MRI.createVirtualRegister(&X86::GR16RegClass); + Register ResultSuperReg = MRI.createVirtualRegister(&X86::GR16RegClass); BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(Copy), SourceSuperReg) .addReg(X86::AX); diff --git a/lib/Target/X86/X86IntrinsicsInfo.h b/lib/Target/X86/X86IntrinsicsInfo.h index 40141d894629..1d7adbaa9e99 100644 --- a/lib/Target/X86/X86IntrinsicsInfo.h +++ b/lib/Target/X86/X86IntrinsicsInfo.h @@ -23,7 +23,7 @@ enum IntrinsicType : uint16_t { GATHER, SCATTER, PREFETCH, RDSEED, RDRAND, RDPMC, RDTSC, XTEST, XGETBV, ADX, FPCLASSS, INTR_TYPE_1OP, INTR_TYPE_2OP, INTR_TYPE_3OP, INTR_TYPE_4OP, INTR_TYPE_3OP_IMM8, - CMP_MASK_CC,CMP_MASK_SCALAR_CC, VSHIFT, COMI, COMI_RM, BLENDV, + CMP_MASK_CC,CMP_MASK_SCALAR_CC, VSHIFT, COMI, COMI_RM, BLENDV, BEXTRI, CVTPD2PS_MASK, INTR_TYPE_1OP_SAE, INTR_TYPE_2OP_SAE, INTR_TYPE_1OP_MASK_SAE, INTR_TYPE_2OP_MASK_SAE, INTR_TYPE_3OP_MASK_SAE, @@ -1101,8 +1101,8 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { 
  X86_INTRINSIC_DATA(ssse3_pshuf_b_128, INTR_TYPE_2OP, X86ISD::PSHUFB, 0),
  X86_INTRINSIC_DATA(subborrow_32, ADX, X86ISD::SBB, X86ISD::SUB),
  X86_INTRINSIC_DATA(subborrow_64, ADX, X86ISD::SBB, X86ISD::SUB),
-  X86_INTRINSIC_DATA(tbm_bextri_u32, INTR_TYPE_2OP, X86ISD::BEXTR, 0),
-  X86_INTRINSIC_DATA(tbm_bextri_u64, INTR_TYPE_2OP, X86ISD::BEXTR, 0),
+  X86_INTRINSIC_DATA(tbm_bextri_u32, BEXTRI, X86ISD::BEXTR, 0),
+  X86_INTRINSIC_DATA(tbm_bextri_u64, BEXTRI, X86ISD::BEXTR, 0),
  X86_INTRINSIC_DATA(vcvtph2ps_128, INTR_TYPE_1OP, X86ISD::CVTPH2PS, 0),
  X86_INTRINSIC_DATA(vcvtph2ps_256, INTR_TYPE_1OP, X86ISD::CVTPH2PS, 0),
  X86_INTRINSIC_DATA(vcvtps2ph_128, INTR_TYPE_2OP, X86ISD::CVTPS2PH, 0),
diff --git a/lib/Target/X86/X86LegalizerInfo.cpp b/lib/Target/X86/X86LegalizerInfo.cpp
index 00fb1b573858..04121f863c89 100644
--- a/lib/Target/X86/X86LegalizerInfo.cpp
+++ b/lib/Target/X86/X86LegalizerInfo.cpp
@@ -13,6 +13,7 @@
 #include "X86LegalizerInfo.h"
 #include "X86Subtarget.h"
 #include "X86TargetMachine.h"
+#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
 #include "llvm/CodeGen/TargetOpcodes.h"
 #include "llvm/CodeGen/ValueTypes.h"
 #include "llvm/IR/DerivedTypes.h"
@@ -84,6 +85,24 @@ X86LegalizerInfo::X86LegalizerInfo(const X86Subtarget &STI,
   verify(*STI.getInstrInfo());
 }
 
+bool X86LegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
+                                         MachineRegisterInfo &MRI,
+                                         MachineIRBuilder &MIRBuilder) const {
+  switch (MI.getIntrinsicID()) {
+  case Intrinsic::memcpy:
+  case Intrinsic::memset:
+  case Intrinsic::memmove:
+    if (createMemLibcall(MIRBuilder, MRI, MI) ==
+        LegalizerHelper::UnableToLegalize)
+      return false;
+    MI.eraseFromParent();
+    return true;
+  default:
+    break;
+  }
+  return true;
+}
+
 void X86LegalizerInfo::setLegalizerInfo32bit() {
 
   const LLT p0 = LLT::pointer(0, TM.getPointerSizeInBits(0));
@@ -158,6 +177,7 @@ void X86LegalizerInfo::setLegalizerInfo32bit() {
     setAction({G_ANYEXT, Ty}, Legal);
   }
   setAction({G_ANYEXT, s128}, Legal);
+  getActionDefinitionsBuilder(G_SEXT_INREG).lower();
 
   // Comparison
   setAction({G_ICMP, s1}, Legal);
diff --git a/lib/Target/X86/X86LegalizerInfo.h b/lib/Target/X86/X86LegalizerInfo.h
index d21707b9ab9b..7a0f13fb5ae6 100644
--- a/lib/Target/X86/X86LegalizerInfo.h
+++ b/lib/Target/X86/X86LegalizerInfo.h
@@ -32,6 +32,9 @@ private:
 public:
   X86LegalizerInfo(const X86Subtarget &STI, const X86TargetMachine &TM);
 
+  bool legalizeIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI,
+                         MachineIRBuilder &MIRBuilder) const override;
+
 private:
   void setLegalizerInfo32bit();
   void setLegalizerInfo64bit();
diff --git a/lib/Target/X86/X86MCInstLower.cpp b/lib/Target/X86/X86MCInstLower.cpp
index b1fefaa84be4..78098fd6262f 100644
--- a/lib/Target/X86/X86MCInstLower.cpp
+++ b/lib/Target/X86/X86MCInstLower.cpp
@@ -427,6 +427,41 @@ X86MCInstLower::LowerMachineOperand(const MachineInstr *MI,
   }
 }
 
+// Replace TAILJMP opcodes with their equivalent opcodes that have encoding
+// information.
+static unsigned convertTailJumpOpcode(unsigned Opcode) { + switch (Opcode) { + case X86::TAILJMPr: + Opcode = X86::JMP32r; + break; + case X86::TAILJMPm: + Opcode = X86::JMP32m; + break; + case X86::TAILJMPr64: + Opcode = X86::JMP64r; + break; + case X86::TAILJMPm64: + Opcode = X86::JMP64m; + break; + case X86::TAILJMPr64_REX: + Opcode = X86::JMP64r_REX; + break; + case X86::TAILJMPm64_REX: + Opcode = X86::JMP64m_REX; + break; + case X86::TAILJMPd: + case X86::TAILJMPd64: + Opcode = X86::JMP_1; + break; + case X86::TAILJMPd_CC: + case X86::TAILJMPd64_CC: + Opcode = X86::JCC_1; + break; + } + + return Opcode; +} + void X86MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const { OutMI.setOpcode(MI->getOpcode()); @@ -500,21 +535,190 @@ void X86MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const { break; } - // TAILJMPr64, CALL64r, CALL64pcrel32 - These instructions have register - // inputs modeled as normal uses instead of implicit uses. As such, truncate - // off all but the first operand (the callee). FIXME: Change isel. - case X86::TAILJMPr64: - case X86::TAILJMPr64_REX: - case X86::CALL64r: - case X86::CALL64pcrel32: { - unsigned Opcode = OutMI.getOpcode(); - MCOperand Saved = OutMI.getOperand(0); - OutMI = MCInst(); - OutMI.setOpcode(Opcode); - OutMI.addOperand(Saved); + case X86::VPCMPBZ128rmi: case X86::VPCMPBZ128rmik: + case X86::VPCMPBZ128rri: case X86::VPCMPBZ128rrik: + case X86::VPCMPBZ256rmi: case X86::VPCMPBZ256rmik: + case X86::VPCMPBZ256rri: case X86::VPCMPBZ256rrik: + case X86::VPCMPBZrmi: case X86::VPCMPBZrmik: + case X86::VPCMPBZrri: case X86::VPCMPBZrrik: + case X86::VPCMPDZ128rmi: case X86::VPCMPDZ128rmik: + case X86::VPCMPDZ128rmib: case X86::VPCMPDZ128rmibk: + case X86::VPCMPDZ128rri: case X86::VPCMPDZ128rrik: + case X86::VPCMPDZ256rmi: case X86::VPCMPDZ256rmik: + case X86::VPCMPDZ256rmib: case X86::VPCMPDZ256rmibk: + case X86::VPCMPDZ256rri: case X86::VPCMPDZ256rrik: + case X86::VPCMPDZrmi: case X86::VPCMPDZrmik: + case X86::VPCMPDZrmib: case X86::VPCMPDZrmibk: + case X86::VPCMPDZrri: case X86::VPCMPDZrrik: + case X86::VPCMPQZ128rmi: case X86::VPCMPQZ128rmik: + case X86::VPCMPQZ128rmib: case X86::VPCMPQZ128rmibk: + case X86::VPCMPQZ128rri: case X86::VPCMPQZ128rrik: + case X86::VPCMPQZ256rmi: case X86::VPCMPQZ256rmik: + case X86::VPCMPQZ256rmib: case X86::VPCMPQZ256rmibk: + case X86::VPCMPQZ256rri: case X86::VPCMPQZ256rrik: + case X86::VPCMPQZrmi: case X86::VPCMPQZrmik: + case X86::VPCMPQZrmib: case X86::VPCMPQZrmibk: + case X86::VPCMPQZrri: case X86::VPCMPQZrrik: + case X86::VPCMPWZ128rmi: case X86::VPCMPWZ128rmik: + case X86::VPCMPWZ128rri: case X86::VPCMPWZ128rrik: + case X86::VPCMPWZ256rmi: case X86::VPCMPWZ256rmik: + case X86::VPCMPWZ256rri: case X86::VPCMPWZ256rrik: + case X86::VPCMPWZrmi: case X86::VPCMPWZrmik: + case X86::VPCMPWZrri: case X86::VPCMPWZrrik: { + // Turn immediate 0 into the VPCMPEQ instruction. 
+ if (OutMI.getOperand(OutMI.getNumOperands() - 1).getImm() == 0) { + unsigned NewOpc; + switch (OutMI.getOpcode()) { + case X86::VPCMPBZ128rmi: NewOpc = X86::VPCMPEQBZ128rm; break; + case X86::VPCMPBZ128rmik: NewOpc = X86::VPCMPEQBZ128rmk; break; + case X86::VPCMPBZ128rri: NewOpc = X86::VPCMPEQBZ128rr; break; + case X86::VPCMPBZ128rrik: NewOpc = X86::VPCMPEQBZ128rrk; break; + case X86::VPCMPBZ256rmi: NewOpc = X86::VPCMPEQBZ256rm; break; + case X86::VPCMPBZ256rmik: NewOpc = X86::VPCMPEQBZ256rmk; break; + case X86::VPCMPBZ256rri: NewOpc = X86::VPCMPEQBZ256rr; break; + case X86::VPCMPBZ256rrik: NewOpc = X86::VPCMPEQBZ256rrk; break; + case X86::VPCMPBZrmi: NewOpc = X86::VPCMPEQBZrm; break; + case X86::VPCMPBZrmik: NewOpc = X86::VPCMPEQBZrmk; break; + case X86::VPCMPBZrri: NewOpc = X86::VPCMPEQBZrr; break; + case X86::VPCMPBZrrik: NewOpc = X86::VPCMPEQBZrrk; break; + case X86::VPCMPDZ128rmi: NewOpc = X86::VPCMPEQDZ128rm; break; + case X86::VPCMPDZ128rmib: NewOpc = X86::VPCMPEQDZ128rmb; break; + case X86::VPCMPDZ128rmibk: NewOpc = X86::VPCMPEQDZ128rmbk; break; + case X86::VPCMPDZ128rmik: NewOpc = X86::VPCMPEQDZ128rmk; break; + case X86::VPCMPDZ128rri: NewOpc = X86::VPCMPEQDZ128rr; break; + case X86::VPCMPDZ128rrik: NewOpc = X86::VPCMPEQDZ128rrk; break; + case X86::VPCMPDZ256rmi: NewOpc = X86::VPCMPEQDZ256rm; break; + case X86::VPCMPDZ256rmib: NewOpc = X86::VPCMPEQDZ256rmb; break; + case X86::VPCMPDZ256rmibk: NewOpc = X86::VPCMPEQDZ256rmbk; break; + case X86::VPCMPDZ256rmik: NewOpc = X86::VPCMPEQDZ256rmk; break; + case X86::VPCMPDZ256rri: NewOpc = X86::VPCMPEQDZ256rr; break; + case X86::VPCMPDZ256rrik: NewOpc = X86::VPCMPEQDZ256rrk; break; + case X86::VPCMPDZrmi: NewOpc = X86::VPCMPEQDZrm; break; + case X86::VPCMPDZrmib: NewOpc = X86::VPCMPEQDZrmb; break; + case X86::VPCMPDZrmibk: NewOpc = X86::VPCMPEQDZrmbk; break; + case X86::VPCMPDZrmik: NewOpc = X86::VPCMPEQDZrmk; break; + case X86::VPCMPDZrri: NewOpc = X86::VPCMPEQDZrr; break; + case X86::VPCMPDZrrik: NewOpc = X86::VPCMPEQDZrrk; break; + case X86::VPCMPQZ128rmi: NewOpc = X86::VPCMPEQQZ128rm; break; + case X86::VPCMPQZ128rmib: NewOpc = X86::VPCMPEQQZ128rmb; break; + case X86::VPCMPQZ128rmibk: NewOpc = X86::VPCMPEQQZ128rmbk; break; + case X86::VPCMPQZ128rmik: NewOpc = X86::VPCMPEQQZ128rmk; break; + case X86::VPCMPQZ128rri: NewOpc = X86::VPCMPEQQZ128rr; break; + case X86::VPCMPQZ128rrik: NewOpc = X86::VPCMPEQQZ128rrk; break; + case X86::VPCMPQZ256rmi: NewOpc = X86::VPCMPEQQZ256rm; break; + case X86::VPCMPQZ256rmib: NewOpc = X86::VPCMPEQQZ256rmb; break; + case X86::VPCMPQZ256rmibk: NewOpc = X86::VPCMPEQQZ256rmbk; break; + case X86::VPCMPQZ256rmik: NewOpc = X86::VPCMPEQQZ256rmk; break; + case X86::VPCMPQZ256rri: NewOpc = X86::VPCMPEQQZ256rr; break; + case X86::VPCMPQZ256rrik: NewOpc = X86::VPCMPEQQZ256rrk; break; + case X86::VPCMPQZrmi: NewOpc = X86::VPCMPEQQZrm; break; + case X86::VPCMPQZrmib: NewOpc = X86::VPCMPEQQZrmb; break; + case X86::VPCMPQZrmibk: NewOpc = X86::VPCMPEQQZrmbk; break; + case X86::VPCMPQZrmik: NewOpc = X86::VPCMPEQQZrmk; break; + case X86::VPCMPQZrri: NewOpc = X86::VPCMPEQQZrr; break; + case X86::VPCMPQZrrik: NewOpc = X86::VPCMPEQQZrrk; break; + case X86::VPCMPWZ128rmi: NewOpc = X86::VPCMPEQWZ128rm; break; + case X86::VPCMPWZ128rmik: NewOpc = X86::VPCMPEQWZ128rmk; break; + case X86::VPCMPWZ128rri: NewOpc = X86::VPCMPEQWZ128rr; break; + case X86::VPCMPWZ128rrik: NewOpc = X86::VPCMPEQWZ128rrk; break; + case X86::VPCMPWZ256rmi: NewOpc = X86::VPCMPEQWZ256rm; break; + case X86::VPCMPWZ256rmik: NewOpc = X86::VPCMPEQWZ256rmk; 
break; + case X86::VPCMPWZ256rri: NewOpc = X86::VPCMPEQWZ256rr; break; + case X86::VPCMPWZ256rrik: NewOpc = X86::VPCMPEQWZ256rrk; break; + case X86::VPCMPWZrmi: NewOpc = X86::VPCMPEQWZrm; break; + case X86::VPCMPWZrmik: NewOpc = X86::VPCMPEQWZrmk; break; + case X86::VPCMPWZrri: NewOpc = X86::VPCMPEQWZrr; break; + case X86::VPCMPWZrrik: NewOpc = X86::VPCMPEQWZrrk; break; + } + + OutMI.setOpcode(NewOpc); + OutMI.erase(&OutMI.getOperand(OutMI.getNumOperands() - 1)); + break; + } + + // Turn immediate 6 into the VPCMPGT instruction. + if (OutMI.getOperand(OutMI.getNumOperands() - 1).getImm() == 6) { + unsigned NewOpc; + switch (OutMI.getOpcode()) { + case X86::VPCMPBZ128rmi: NewOpc = X86::VPCMPGTBZ128rm; break; + case X86::VPCMPBZ128rmik: NewOpc = X86::VPCMPGTBZ128rmk; break; + case X86::VPCMPBZ128rri: NewOpc = X86::VPCMPGTBZ128rr; break; + case X86::VPCMPBZ128rrik: NewOpc = X86::VPCMPGTBZ128rrk; break; + case X86::VPCMPBZ256rmi: NewOpc = X86::VPCMPGTBZ256rm; break; + case X86::VPCMPBZ256rmik: NewOpc = X86::VPCMPGTBZ256rmk; break; + case X86::VPCMPBZ256rri: NewOpc = X86::VPCMPGTBZ256rr; break; + case X86::VPCMPBZ256rrik: NewOpc = X86::VPCMPGTBZ256rrk; break; + case X86::VPCMPBZrmi: NewOpc = X86::VPCMPGTBZrm; break; + case X86::VPCMPBZrmik: NewOpc = X86::VPCMPGTBZrmk; break; + case X86::VPCMPBZrri: NewOpc = X86::VPCMPGTBZrr; break; + case X86::VPCMPBZrrik: NewOpc = X86::VPCMPGTBZrrk; break; + case X86::VPCMPDZ128rmi: NewOpc = X86::VPCMPGTDZ128rm; break; + case X86::VPCMPDZ128rmib: NewOpc = X86::VPCMPGTDZ128rmb; break; + case X86::VPCMPDZ128rmibk: NewOpc = X86::VPCMPGTDZ128rmbk; break; + case X86::VPCMPDZ128rmik: NewOpc = X86::VPCMPGTDZ128rmk; break; + case X86::VPCMPDZ128rri: NewOpc = X86::VPCMPGTDZ128rr; break; + case X86::VPCMPDZ128rrik: NewOpc = X86::VPCMPGTDZ128rrk; break; + case X86::VPCMPDZ256rmi: NewOpc = X86::VPCMPGTDZ256rm; break; + case X86::VPCMPDZ256rmib: NewOpc = X86::VPCMPGTDZ256rmb; break; + case X86::VPCMPDZ256rmibk: NewOpc = X86::VPCMPGTDZ256rmbk; break; + case X86::VPCMPDZ256rmik: NewOpc = X86::VPCMPGTDZ256rmk; break; + case X86::VPCMPDZ256rri: NewOpc = X86::VPCMPGTDZ256rr; break; + case X86::VPCMPDZ256rrik: NewOpc = X86::VPCMPGTDZ256rrk; break; + case X86::VPCMPDZrmi: NewOpc = X86::VPCMPGTDZrm; break; + case X86::VPCMPDZrmib: NewOpc = X86::VPCMPGTDZrmb; break; + case X86::VPCMPDZrmibk: NewOpc = X86::VPCMPGTDZrmbk; break; + case X86::VPCMPDZrmik: NewOpc = X86::VPCMPGTDZrmk; break; + case X86::VPCMPDZrri: NewOpc = X86::VPCMPGTDZrr; break; + case X86::VPCMPDZrrik: NewOpc = X86::VPCMPGTDZrrk; break; + case X86::VPCMPQZ128rmi: NewOpc = X86::VPCMPGTQZ128rm; break; + case X86::VPCMPQZ128rmib: NewOpc = X86::VPCMPGTQZ128rmb; break; + case X86::VPCMPQZ128rmibk: NewOpc = X86::VPCMPGTQZ128rmbk; break; + case X86::VPCMPQZ128rmik: NewOpc = X86::VPCMPGTQZ128rmk; break; + case X86::VPCMPQZ128rri: NewOpc = X86::VPCMPGTQZ128rr; break; + case X86::VPCMPQZ128rrik: NewOpc = X86::VPCMPGTQZ128rrk; break; + case X86::VPCMPQZ256rmi: NewOpc = X86::VPCMPGTQZ256rm; break; + case X86::VPCMPQZ256rmib: NewOpc = X86::VPCMPGTQZ256rmb; break; + case X86::VPCMPQZ256rmibk: NewOpc = X86::VPCMPGTQZ256rmbk; break; + case X86::VPCMPQZ256rmik: NewOpc = X86::VPCMPGTQZ256rmk; break; + case X86::VPCMPQZ256rri: NewOpc = X86::VPCMPGTQZ256rr; break; + case X86::VPCMPQZ256rrik: NewOpc = X86::VPCMPGTQZ256rrk; break; + case X86::VPCMPQZrmi: NewOpc = X86::VPCMPGTQZrm; break; + case X86::VPCMPQZrmib: NewOpc = X86::VPCMPGTQZrmb; break; + case X86::VPCMPQZrmibk: NewOpc = X86::VPCMPGTQZrmbk; break; + case X86::VPCMPQZrmik: NewOpc 
= X86::VPCMPGTQZrmk; break; + case X86::VPCMPQZrri: NewOpc = X86::VPCMPGTQZrr; break; + case X86::VPCMPQZrrik: NewOpc = X86::VPCMPGTQZrrk; break; + case X86::VPCMPWZ128rmi: NewOpc = X86::VPCMPGTWZ128rm; break; + case X86::VPCMPWZ128rmik: NewOpc = X86::VPCMPGTWZ128rmk; break; + case X86::VPCMPWZ128rri: NewOpc = X86::VPCMPGTWZ128rr; break; + case X86::VPCMPWZ128rrik: NewOpc = X86::VPCMPGTWZ128rrk; break; + case X86::VPCMPWZ256rmi: NewOpc = X86::VPCMPGTWZ256rm; break; + case X86::VPCMPWZ256rmik: NewOpc = X86::VPCMPGTWZ256rmk; break; + case X86::VPCMPWZ256rri: NewOpc = X86::VPCMPGTWZ256rr; break; + case X86::VPCMPWZ256rrik: NewOpc = X86::VPCMPGTWZ256rrk; break; + case X86::VPCMPWZrmi: NewOpc = X86::VPCMPGTWZrm; break; + case X86::VPCMPWZrmik: NewOpc = X86::VPCMPGTWZrmk; break; + case X86::VPCMPWZrri: NewOpc = X86::VPCMPGTWZrr; break; + case X86::VPCMPWZrrik: NewOpc = X86::VPCMPGTWZrrk; break; + } + + OutMI.setOpcode(NewOpc); + OutMI.erase(&OutMI.getOperand(OutMI.getNumOperands() - 1)); + break; + } + break; } + // CALL64r, CALL64pcrel32 - These instructions used to have + // register inputs modeled as normal uses instead of implicit uses. As such, + // they we used to truncate off all but the first operand (the callee). This + // issue seems to have been fixed at some point. This assert verifies that. + case X86::CALL64r: + case X86::CALL64pcrel32: + assert(OutMI.getNumOperands() == 1 && "Unexpected number of operands!"); + break; + case X86::EH_RETURN: case X86::EH_RETURN64: { OutMI = MCInst(); @@ -539,36 +743,30 @@ void X86MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const { break; } - // TAILJMPd, TAILJMPd64, TailJMPd_cc - Lower to the correct jump - // instruction. - { - unsigned Opcode; - case X86::TAILJMPr: - Opcode = X86::JMP32r; - goto SetTailJmpOpcode; - case X86::TAILJMPd: - case X86::TAILJMPd64: - Opcode = X86::JMP_1; - goto SetTailJmpOpcode; - - SetTailJmpOpcode: - MCOperand Saved = OutMI.getOperand(0); - OutMI = MCInst(); - OutMI.setOpcode(Opcode); - OutMI.addOperand(Saved); - break; - } + // TAILJMPd, TAILJMPd64, TailJMPd_cc - Lower to the correct jump + // instruction. 
+ case X86::TAILJMPr: + case X86::TAILJMPr64: + case X86::TAILJMPr64_REX: + case X86::TAILJMPd: + case X86::TAILJMPd64: + assert(OutMI.getNumOperands() == 1 && "Unexpected number of operands!"); + OutMI.setOpcode(convertTailJumpOpcode(OutMI.getOpcode())); + break; case X86::TAILJMPd_CC: - case X86::TAILJMPd64_CC: { - MCOperand Saved = OutMI.getOperand(0); - MCOperand Saved2 = OutMI.getOperand(1); - OutMI = MCInst(); - OutMI.setOpcode(X86::JCC_1); - OutMI.addOperand(Saved); - OutMI.addOperand(Saved2); + case X86::TAILJMPd64_CC: + assert(OutMI.getNumOperands() == 2 && "Unexpected number of operands!"); + OutMI.setOpcode(convertTailJumpOpcode(OutMI.getOpcode())); + break; + + case X86::TAILJMPm: + case X86::TAILJMPm64: + case X86::TAILJMPm64_REX: + assert(OutMI.getNumOperands() == X86::AddrNumOperands && + "Unexpected number of operands!"); + OutMI.setOpcode(convertTailJumpOpcode(OutMI.getOpcode())); break; - } case X86::DEC16r: case X86::DEC32r: @@ -958,7 +1156,7 @@ void X86AsmPrinter::LowerFAULTING_OP(const MachineInstr &FaultingMI, // FAULTING_LOAD_OP , , , // , - unsigned DefRegister = FaultingMI.getOperand(0).getReg(); + Register DefRegister = FaultingMI.getOperand(0).getReg(); FaultMaps::FaultKind FK = static_cast(FaultingMI.getOperand(1).getImm()); MCSymbol *HandlerLabel = FaultingMI.getOperand(2).getMBB()->getSymbol(); @@ -1079,7 +1277,7 @@ void X86AsmPrinter::LowerPATCHPOINT(const MachineInstr &MI, // Emit MOV to materialize the target address and the CALL to target. // This is encoded with 12-13 bytes, depending on which register is used. - unsigned ScratchReg = MI.getOperand(ScratchIdx).getReg(); + Register ScratchReg = MI.getOperand(ScratchIdx).getReg(); if (X86II::isX86_64ExtendedReg(ScratchReg)) EncodedBytes = 13; else @@ -1369,6 +1567,7 @@ void X86AsmPrinter::LowerPATCHABLE_TAIL_CALL(const MachineInstr &MI, recordSled(CurSled, MI, SledKind::TAIL_CALL); unsigned OpCode = MI.getOperand(0).getImm(); + OpCode = convertTailJumpOpcode(OpCode); MCInst TC; TC.setOpcode(OpCode); @@ -1538,8 +1737,6 @@ static void printConstant(const Constant *COp, raw_ostream &CS) { void X86AsmPrinter::EmitSEHInstruction(const MachineInstr *MI) { assert(MF->hasWinCFI() && "SEH_ instruction in function without WinCFI?"); assert(getSubtarget().isOSWindows() && "SEH_ instruction Windows only"); - const X86RegisterInfo *RI = - MF->getSubtarget().getRegisterInfo(); // Use the .cv_fpo directives if we're emitting CodeView on 32-bit x86. if (EmitFPOData) { @@ -1577,17 +1774,16 @@ void X86AsmPrinter::EmitSEHInstruction(const MachineInstr *MI) { // Otherwise, use the .seh_ directives for all other Windows platforms. 
  switch (MI->getOpcode()) {
  case X86::SEH_PushReg:
-    OutStreamer->EmitWinCFIPushReg(
-        RI->getSEHRegNum(MI->getOperand(0).getImm()));
+    OutStreamer->EmitWinCFIPushReg(MI->getOperand(0).getImm());
    break;
 
  case X86::SEH_SaveReg:
-    OutStreamer->EmitWinCFISaveReg(RI->getSEHRegNum(MI->getOperand(0).getImm()),
+    OutStreamer->EmitWinCFISaveReg(MI->getOperand(0).getImm(),
                                   MI->getOperand(1).getImm());
    break;
 
  case X86::SEH_SaveXMM:
-    OutStreamer->EmitWinCFISaveXMM(RI->getSEHRegNum(MI->getOperand(0).getImm()),
+    OutStreamer->EmitWinCFISaveXMM(MI->getOperand(0).getImm(),
                                   MI->getOperand(1).getImm());
    break;
 
@@ -1596,9 +1792,8 @@ void X86AsmPrinter::EmitSEHInstruction(const MachineInstr *MI) {
    break;
 
  case X86::SEH_SetFrame:
-    OutStreamer->EmitWinCFISetFrame(
-        RI->getSEHRegNum(MI->getOperand(0).getImm()),
-        MI->getOperand(1).getImm());
+    OutStreamer->EmitWinCFISetFrame(MI->getOperand(0).getImm(),
+                                    MI->getOperand(1).getImm());
    break;
 
  case X86::SEH_PushFrame:
@@ -1650,7 +1845,7 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
  case X86::EH_RETURN:
  case X86::EH_RETURN64: {
    // Lower these as normal, but add some comments.
-    unsigned Reg = MI->getOperand(0).getReg();
+    Register Reg = MI->getOperand(0).getReg();
    OutStreamer->AddComment(StringRef("eh_return, addr: %") +
                            X86ATTInstPrinter::getRegisterName(Reg));
    break;
@@ -1697,11 +1892,9 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
  case X86::MASKPAIR16LOAD: {
    int64_t Disp = MI->getOperand(1 + X86::AddrDisp).getImm();
    assert(Disp >= 0 && Disp <= INT32_MAX - 2 && "Unexpected displacement");
-    const X86RegisterInfo *RI =
-        MF->getSubtarget<X86Subtarget>().getRegisterInfo();
-    unsigned Reg = MI->getOperand(0).getReg();
-    unsigned Reg0 = RI->getSubReg(Reg, X86::sub_mask_0);
-    unsigned Reg1 = RI->getSubReg(Reg, X86::sub_mask_1);
+    Register Reg = MI->getOperand(0).getReg();
+    Register Reg0 = RI->getSubReg(Reg, X86::sub_mask_0);
+    Register Reg1 = RI->getSubReg(Reg, X86::sub_mask_1);
 
    // Load the first mask register
    MCInstBuilder MIB = MCInstBuilder(X86::KMOVWkm);
@@ -1730,11 +1923,9 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
  case X86::MASKPAIR16STORE: {
    int64_t Disp = MI->getOperand(X86::AddrDisp).getImm();
    assert(Disp >= 0 && Disp <= INT32_MAX - 2 && "Unexpected displacement");
-    const X86RegisterInfo *RI =
-        MF->getSubtarget<X86Subtarget>().getRegisterInfo();
-    unsigned Reg = MI->getOperand(X86::AddrNumOperands).getReg();
-    unsigned Reg0 = RI->getSubReg(Reg, X86::sub_mask_0);
-    unsigned Reg1 = RI->getSubReg(Reg, X86::sub_mask_1);
+    Register Reg = MI->getOperand(X86::AddrNumOperands).getReg();
+    Register Reg0 = RI->getSubReg(Reg, X86::sub_mask_0);
+    Register Reg1 = RI->getSubReg(Reg, X86::sub_mask_1);
 
    // Store the first mask register
    MCInstBuilder MIB = MCInstBuilder(X86::KMOVWmk);
diff --git a/lib/Target/X86/X86MachineFunctionInfo.h b/lib/Target/X86/X86MachineFunctionInfo.h
index d7e535598d81..5cb80a082b56 100644
--- a/lib/Target/X86/X86MachineFunctionInfo.h
+++ b/lib/Target/X86/X86MachineFunctionInfo.h
@@ -36,6 +36,10 @@ class X86MachineFunctionInfo : public MachineFunctionInfo {
  /// is stashed.
  signed char RestoreBasePointerOffset = 0;
 
+  /// WinEHXMMSlotInfo - Slot information of XMM registers in the stack frame
+  /// in bytes.
+  DenseMap<int, unsigned> WinEHXMMSlotInfo;
+
  /// CalleeSavedFrameSize - Size of the callee-saved register portion of the
  /// stack frame in bytes.
  unsigned CalleeSavedFrameSize = 0;
@@ -120,6 +124,10 @@ public:
  void setRestoreBasePointer(const MachineFunction *MF);
  int getRestoreBasePointerOffset() const {return RestoreBasePointerOffset; }
 
+  DenseMap<int, unsigned>& getWinEHXMMSlotInfo() { return WinEHXMMSlotInfo; }
+  const DenseMap<int, unsigned>& getWinEHXMMSlotInfo() const {
+    return WinEHXMMSlotInfo; }
+
  unsigned getCalleeSavedFrameSize() const { return CalleeSavedFrameSize; }
  void setCalleeSavedFrameSize(unsigned bytes) { CalleeSavedFrameSize = bytes; }
 
diff --git a/lib/Target/X86/X86OptimizeLEAs.cpp b/lib/Target/X86/X86OptimizeLEAs.cpp
index 7f75598b0655..1aee01563c4b 100644
--- a/lib/Target/X86/X86OptimizeLEAs.cpp
+++ b/lib/Target/X86/X86OptimizeLEAs.cpp
@@ -198,8 +198,7 @@ static inline MemOpKey getMemOpKey(const MachineInstr &MI, unsigned N) {
 static inline bool isIdenticalOp(const MachineOperand &MO1,
                                  const MachineOperand &MO2) {
   return MO1.isIdenticalTo(MO2) &&
-         (!MO1.isReg() ||
-          !TargetRegisterInfo::isPhysicalRegister(MO1.getReg()));
+         (!MO1.isReg() || !Register::isPhysicalRegister(MO1.getReg()));
 }
 
 #ifndef NDEBUG
@@ -235,9 +234,9 @@ static inline bool isLEA(const MachineInstr &MI) {
 
 namespace {
 
-class OptimizeLEAPass : public MachineFunctionPass {
+class X86OptimizeLEAPass : public MachineFunctionPass {
 public:
-  OptimizeLEAPass() : MachineFunctionPass(ID) {}
+  X86OptimizeLEAPass() : MachineFunctionPass(ID) {}
 
   StringRef getPassName() const override { return "X86 LEA Optimize"; }
 
@@ -246,6 +245,8 @@ public:
   /// been calculated by LEA. Also, remove redundant LEAs.
   bool runOnMachineFunction(MachineFunction &MF) override;
 
+  static char ID;
+
 private:
   using MemOpMap = DenseMap<MemOpKey, SmallVector<MachineInstr *, 16>>;
 
@@ -296,18 +297,18 @@ private:
   MachineRegisterInfo *MRI;
   const X86InstrInfo *TII;
   const X86RegisterInfo *TRI;
-
-  static char ID;
 };
 
 } // end anonymous namespace
 
-char OptimizeLEAPass::ID = 0;
+char X86OptimizeLEAPass::ID = 0;
 
-FunctionPass *llvm::createX86OptimizeLEAs() { return new OptimizeLEAPass(); }
+FunctionPass *llvm::createX86OptimizeLEAs() { return new X86OptimizeLEAPass(); }
+INITIALIZE_PASS(X86OptimizeLEAPass, DEBUG_TYPE, "X86 optimize LEA pass", false,
+                false)
 
-int OptimizeLEAPass::calcInstrDist(const MachineInstr &First,
-                                   const MachineInstr &Last) {
+int X86OptimizeLEAPass::calcInstrDist(const MachineInstr &First,
+                                      const MachineInstr &Last) {
   // Both instructions must be in the same basic block and they must be
   // presented in InstrPos.
   assert(Last.getParent() == First.getParent() &&
@@ -328,10 +329,9 @@ int OptimizeLEAPass::calcInstrDist(const MachineInstr &First,
 // 3) Displacement of the new memory operand should fit in 1 byte if possible.
 // 4) The LEA should be as close to MI as possible, and prior to it if
 //    possible.
-bool OptimizeLEAPass::chooseBestLEA(const SmallVectorImpl<MachineInstr *> &List,
-                                    const MachineInstr &MI,
-                                    MachineInstr *&BestLEA,
-                                    int64_t &AddrDispShift, int &Dist) {
+bool X86OptimizeLEAPass::chooseBestLEA(
+    const SmallVectorImpl<MachineInstr *> &List, const MachineInstr &MI,
+    MachineInstr *&BestLEA, int64_t &AddrDispShift, int &Dist) {
   const MachineFunction *MF = MI.getParent()->getParent();
   const MCInstrDesc &Desc = MI.getDesc();
   int MemOpNo = X86II::getMemoryOperandNo(Desc.TSFlags) +
@@ -387,9 +387,10 @@ bool OptimizeLEAPass::chooseBestLEA(const SmallVectorImpl<MachineInstr *> &List,
 // Get the difference between the addresses' displacements of the two
 // instructions \p MI1 and \p MI2. The numbers of the first memory operands are
 // passed through \p N1 and \p N2.
-int64_t OptimizeLEAPass::getAddrDispShift(const MachineInstr &MI1, unsigned N1, - const MachineInstr &MI2, - unsigned N2) const { +int64_t X86OptimizeLEAPass::getAddrDispShift(const MachineInstr &MI1, + unsigned N1, + const MachineInstr &MI2, + unsigned N2) const { const MachineOperand &Op1 = MI1.getOperand(N1 + X86::AddrDisp); const MachineOperand &Op2 = MI2.getOperand(N2 + X86::AddrDisp); @@ -411,9 +412,9 @@ int64_t OptimizeLEAPass::getAddrDispShift(const MachineInstr &MI1, unsigned N1, // 2) Def registers of LEAs belong to the same class. // 3) All uses of the Last LEA def register are replaceable, thus the // register is used only as address base. -bool OptimizeLEAPass::isReplaceable(const MachineInstr &First, - const MachineInstr &Last, - int64_t &AddrDispShift) const { +bool X86OptimizeLEAPass::isReplaceable(const MachineInstr &First, + const MachineInstr &Last, + int64_t &AddrDispShift) const { assert(isLEA(First) && isLEA(Last) && "The function works only with LEA instructions"); @@ -467,7 +468,8 @@ bool OptimizeLEAPass::isReplaceable(const MachineInstr &First, return true; } -void OptimizeLEAPass::findLEAs(const MachineBasicBlock &MBB, MemOpMap &LEAs) { +void X86OptimizeLEAPass::findLEAs(const MachineBasicBlock &MBB, + MemOpMap &LEAs) { unsigned Pos = 0; for (auto &MI : MBB) { // Assign the position number to the instruction. Note that we are going to @@ -485,7 +487,7 @@ void OptimizeLEAPass::findLEAs(const MachineBasicBlock &MBB, MemOpMap &LEAs) { // Try to find load and store instructions which recalculate addresses already // calculated by some LEA and replace their memory operands with its def // register. -bool OptimizeLEAPass::removeRedundantAddrCalc(MemOpMap &LEAs) { +bool X86OptimizeLEAPass::removeRedundantAddrCalc(MemOpMap &LEAs) { bool Changed = false; assert(!LEAs.empty()); @@ -564,9 +566,9 @@ bool OptimizeLEAPass::removeRedundantAddrCalc(MemOpMap &LEAs) { return Changed; } -MachineInstr *OptimizeLEAPass::replaceDebugValue(MachineInstr &MI, - unsigned VReg, - int64_t AddrDispShift) { +MachineInstr *X86OptimizeLEAPass::replaceDebugValue(MachineInstr &MI, + unsigned VReg, + int64_t AddrDispShift) { DIExpression *Expr = const_cast(MI.getDebugExpression()); if (AddrDispShift != 0) Expr = DIExpression::prepend(Expr, DIExpression::StackValue, AddrDispShift); @@ -583,7 +585,7 @@ MachineInstr *OptimizeLEAPass::replaceDebugValue(MachineInstr &MI, } // Try to find similar LEAs in the list and replace one with another. -bool OptimizeLEAPass::removeRedundantLEAs(MemOpMap &LEAs) { +bool X86OptimizeLEAPass::removeRedundantLEAs(MemOpMap &LEAs) { bool Changed = false; // Loop over all entries in the table. @@ -613,8 +615,8 @@ bool OptimizeLEAPass::removeRedundantLEAs(MemOpMap &LEAs) { // Loop over all uses of the Last LEA and update their operands. Note // that the correctness of this has already been checked in the // isReplaceable function. 
- unsigned FirstVReg = First.getOperand(0).getReg(); - unsigned LastVReg = Last.getOperand(0).getReg(); + Register FirstVReg = First.getOperand(0).getReg(); + Register LastVReg = Last.getOperand(0).getReg(); for (auto UI = MRI->use_begin(LastVReg), UE = MRI->use_end(); UI != UE;) { MachineOperand &MO = *UI++; @@ -670,7 +672,7 @@ bool OptimizeLEAPass::removeRedundantLEAs(MemOpMap &LEAs) { return Changed; } -bool OptimizeLEAPass::runOnMachineFunction(MachineFunction &MF) { +bool X86OptimizeLEAPass::runOnMachineFunction(MachineFunction &MF) { bool Changed = false; if (DisableX86LEAOpt || skipFunction(MF.getFunction())) diff --git a/lib/Target/X86/X86RegisterBankInfo.cpp b/lib/Target/X86/X86RegisterBankInfo.cpp index 78fede3dcde2..daddf4231897 100644 --- a/lib/Target/X86/X86RegisterBankInfo.cpp +++ b/lib/Target/X86/X86RegisterBankInfo.cpp @@ -46,7 +46,9 @@ const RegisterBank &X86RegisterBankInfo::getRegBankFromRegClass( if (X86::GR8RegClass.hasSubClassEq(&RC) || X86::GR16RegClass.hasSubClassEq(&RC) || X86::GR32RegClass.hasSubClassEq(&RC) || - X86::GR64RegClass.hasSubClassEq(&RC)) + X86::GR64RegClass.hasSubClassEq(&RC) || + X86::LOW32_ADDR_ACCESSRegClass.hasSubClassEq(&RC) || + X86::LOW32_ADDR_ACCESS_RBPRegClass.hasSubClassEq(&RC)) return getRegBank(X86::GPRRegBankID); if (X86::FR32XRegClass.hasSubClassEq(&RC) || diff --git a/lib/Target/X86/X86RegisterInfo.cpp b/lib/Target/X86/X86RegisterInfo.cpp index 2e2f1f9e438a..ff625325b4c9 100644 --- a/lib/Target/X86/X86RegisterInfo.cpp +++ b/lib/Target/X86/X86RegisterInfo.cpp @@ -544,7 +544,7 @@ BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const { "Stack realignment in presence of dynamic allocas is not supported with" "this calling convention."); - unsigned BasePtr = getX86SubSuperRegister(getBaseRegister(), 64); + Register BasePtr = getX86SubSuperRegister(getBaseRegister(), 64); for (MCSubRegIterator I(BasePtr, this, /*IncludeSelf=*/true); I.isValid(); ++I) Reserved.set(*I); @@ -677,13 +677,13 @@ static bool tryOptimizeLEAtoMOV(MachineBasicBlock::iterator II) { MI.getOperand(4).getImm() != 0 || MI.getOperand(5).getReg() != X86::NoRegister) return false; - unsigned BasePtr = MI.getOperand(1).getReg(); + Register BasePtr = MI.getOperand(1).getReg(); // In X32 mode, ensure the base-pointer is a 32-bit operand, so the LEA will // be replaced with a 32-bit operand MOV which will zero extend the upper // 32-bits of the super register. if (Opc == X86::LEA64_32r) BasePtr = getX86SubSuperRegister(BasePtr, 32); - unsigned NewDestReg = MI.getOperand(0).getReg(); + Register NewDestReg = MI.getOperand(0).getReg(); const X86InstrInfo *TII = MI.getParent()->getParent()->getSubtarget().getInstrInfo(); TII->copyPhysReg(*MI.getParent(), II, MI.getDebugLoc(), NewDestReg, BasePtr, @@ -692,12 +692,27 @@ static bool tryOptimizeLEAtoMOV(MachineBasicBlock::iterator II) { return true; } +static bool isFuncletReturnInstr(MachineInstr &MI) { + switch (MI.getOpcode()) { + case X86::CATCHRET: + case X86::CLEANUPRET: + return true; + default: + return false; + } + llvm_unreachable("impossible"); +} + void X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj, unsigned FIOperandNum, RegScavenger *RS) const { MachineInstr &MI = *II; - MachineFunction &MF = *MI.getParent()->getParent(); + MachineBasicBlock &MBB = *MI.getParent(); + MachineFunction &MF = *MBB.getParent(); + MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator(); + bool IsEHFuncletEpilogue = MBBI == MBB.end() ? 
false + : isFuncletReturnInstr(*MBBI); const X86FrameLowering *TFI = getFrameLowering(MF); int FrameIndex = MI.getOperand(FIOperandNum).getIndex(); @@ -709,6 +724,8 @@ X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, MF.getFrameInfo().isFixedObjectIndex(FrameIndex)) && "Return instruction can only reference SP relative frame objects"); FIOffset = TFI->getFrameIndexReferenceSP(MF, FrameIndex, BasePtr, 0); + } else if (TFI->Is64Bit && (MBB.isEHFuncletEntry() || IsEHFuncletEpilogue)) { + FIOffset = TFI->getWin64EHFrameIndexRef(MF, FrameIndex, BasePtr); } else { FIOffset = TFI->getFrameIndexReference(MF, FrameIndex, BasePtr); } @@ -729,7 +746,7 @@ X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, // register as source operand, semantic is the same and destination is // 32-bits. It saves one byte per lea in code since 0x67 prefix is avoided. // Don't change BasePtr since it is used later for stack adjustment. - unsigned MachineBasePtr = BasePtr; + Register MachineBasePtr = BasePtr; if (Opc == X86::LEA64_32r && X86::GR32RegClass.contains(BasePtr)) MachineBasePtr = getX86SubSuperRegister(BasePtr, 64); @@ -773,7 +790,7 @@ Register X86RegisterInfo::getFrameRegister(const MachineFunction &MF) const { unsigned X86RegisterInfo::getPtrSizedFrameRegister(const MachineFunction &MF) const { const X86Subtarget &Subtarget = MF.getSubtarget(); - unsigned FrameReg = getFrameRegister(MF); + Register FrameReg = getFrameRegister(MF); if (Subtarget.isTarget64BitILP32()) FrameReg = getX86SubSuperRegister(FrameReg, 32); return FrameReg; @@ -782,7 +799,7 @@ X86RegisterInfo::getPtrSizedFrameRegister(const MachineFunction &MF) const { unsigned X86RegisterInfo::getPtrSizedStackRegister(const MachineFunction &MF) const { const X86Subtarget &Subtarget = MF.getSubtarget(); - unsigned StackReg = getStackRegister(); + Register StackReg = getStackRegister(); if (Subtarget.isTarget64BitILP32()) StackReg = getX86SubSuperRegister(StackReg, 32); return StackReg; diff --git a/lib/Target/X86/X86RetpolineThunks.cpp b/lib/Target/X86/X86RetpolineThunks.cpp index b435b22e8ac7..f8464c7e8298 100644 --- a/lib/Target/X86/X86RetpolineThunks.cpp +++ b/lib/Target/X86/X86RetpolineThunks.cpp @@ -58,8 +58,8 @@ public: void getAnalysisUsage(AnalysisUsage &AU) const override { MachineFunctionPass::getAnalysisUsage(AU); - AU.addRequired(); - AU.addPreserved(); + AU.addRequired(); + AU.addPreserved(); } private: @@ -97,7 +97,7 @@ bool X86RetpolineThunks::runOnMachineFunction(MachineFunction &MF) { TII = STI->getInstrInfo(); Is64Bit = TM->getTargetTriple().getArch() == Triple::x86_64; - MMI = &getAnalysis(); + MMI = &getAnalysis().getMMI(); Module &M = const_cast(*MMI->getModule()); // If this function is not a thunk, check to see if we need to insert @@ -279,7 +279,7 @@ void X86RetpolineThunks::populateThunk(MachineFunction &MF, CallTarget->addLiveIn(Reg); CallTarget->setHasAddressTaken(); - CallTarget->setAlignment(4); + CallTarget->setAlignment(Align(16)); insertRegReturnAddrClobber(*CallTarget, Reg); CallTarget->back().setPreInstrSymbol(MF, TargetSym); BuildMI(CallTarget, DebugLoc(), TII->get(RetOpc)); diff --git a/lib/Target/X86/X86SchedBroadwell.td b/lib/Target/X86/X86SchedBroadwell.td index 7574e4b8f896..9b1fcaa8a13d 100755 --- a/lib/Target/X86/X86SchedBroadwell.td +++ b/lib/Target/X86/X86SchedBroadwell.td @@ -232,8 +232,12 @@ defm : X86WriteRes; defm : X86WriteRes; defm : X86WriteRes; defm : X86WriteRes; -defm : X86WriteRes; -defm : X86WriteRes; + +defm : X86WriteRes; +defm : X86WriteRes; +defm : 
X86WriteRes; +defm : X86WriteRes; + defm : X86WriteRes; defm : X86WriteRes; defm : X86WriteRes; diff --git a/lib/Target/X86/X86SchedHaswell.td b/lib/Target/X86/X86SchedHaswell.td index 284d1567c5c6..06f417501b21 100644 --- a/lib/Target/X86/X86SchedHaswell.td +++ b/lib/Target/X86/X86SchedHaswell.td @@ -231,8 +231,12 @@ defm : X86WriteRes; defm : X86WriteRes; defm : X86WriteRes; defm : X86WriteRes; -defm : X86WriteRes; -defm : X86WriteRes; + +defm : X86WriteRes; +defm : X86WriteRes; +defm : X86WriteRes; +defm : X86WriteRes; + defm : X86WriteRes; defm : X86WriteRes; defm : X86WriteRes; diff --git a/lib/Target/X86/X86SchedPredicates.td b/lib/Target/X86/X86SchedPredicates.td index 41bd776648f7..76001d382a27 100644 --- a/lib/Target/X86/X86SchedPredicates.td +++ b/lib/Target/X86/X86SchedPredicates.td @@ -84,3 +84,60 @@ def IsSETAm_Or_SETBEm : CheckAny<[ CheckImmOperand_s<5, "X86::COND_A">, CheckImmOperand_s<5, "X86::COND_BE"> ]>; + +// A predicate used to check if an instruction has a LOCK prefix. +def CheckLockPrefix : CheckFunctionPredicate< + "X86_MC::hasLockPrefix", + "X86InstrInfo::hasLockPrefix" +>; + +def IsRegRegCompareAndSwap_8 : CheckOpcode<[ CMPXCHG8rr ]>; + +def IsRegMemCompareAndSwap_8 : CheckOpcode<[ + LCMPXCHG8, CMPXCHG8rm +]>; + +def IsRegRegCompareAndSwap_16_32_64 : CheckOpcode<[ + CMPXCHG16rr, CMPXCHG32rr, CMPXCHG64rr +]>; + +def IsRegMemCompareAndSwap_16_32_64 : CheckOpcode<[ + CMPXCHG16rm, CMPXCHG32rm, CMPXCHG64rm, + LCMPXCHG16, LCMPXCHG32, LCMPXCHG64, + LCMPXCHG8B, LCMPXCHG16B +]>; + +def IsCompareAndSwap8B : CheckOpcode<[ CMPXCHG8B, LCMPXCHG8B ]>; +def IsCompareAndSwap16B : CheckOpcode<[ CMPXCHG16B, LCMPXCHG16B ]>; + +def IsRegMemCompareAndSwap : CheckOpcode< + !listconcat( + IsRegMemCompareAndSwap_8.ValidOpcodes, + IsRegMemCompareAndSwap_16_32_64.ValidOpcodes + )>; + +def IsRegRegCompareAndSwap : CheckOpcode< + !listconcat( + IsRegRegCompareAndSwap_8.ValidOpcodes, + IsRegRegCompareAndSwap_16_32_64.ValidOpcodes + )>; + +def IsAtomicCompareAndSwap_8 : CheckAll<[ + CheckLockPrefix, + IsRegMemCompareAndSwap_8 +]>; + +def IsAtomicCompareAndSwap : CheckAll<[ + CheckLockPrefix, + IsRegMemCompareAndSwap +]>; + +def IsAtomicCompareAndSwap8B : CheckAll<[ + CheckLockPrefix, + IsCompareAndSwap8B +]>; + +def IsAtomicCompareAndSwap16B : CheckAll<[ + CheckLockPrefix, + IsCompareAndSwap16B +]>; diff --git a/lib/Target/X86/X86SchedSandyBridge.td b/lib/Target/X86/X86SchedSandyBridge.td index d40bdf728a48..26d4d8fa3549 100644 --- a/lib/Target/X86/X86SchedSandyBridge.td +++ b/lib/Target/X86/X86SchedSandyBridge.td @@ -208,8 +208,12 @@ defm : X86WriteRes; defm : X86WriteRes; defm : X86WriteRes; defm : X86WriteRes; -defm : X86WriteRes; -defm : X86WriteRes; + +defm : X86WriteRes; +defm : X86WriteRes; +defm : X86WriteRes; +defm : X86WriteRes; + defm : X86WriteRes; defm : X86WriteRes; defm : X86WriteRes; diff --git a/lib/Target/X86/X86SchedSkylakeClient.td b/lib/Target/X86/X86SchedSkylakeClient.td index 8f3e4ae62d53..9a511ecc0071 100644 --- a/lib/Target/X86/X86SchedSkylakeClient.td +++ b/lib/Target/X86/X86SchedSkylakeClient.td @@ -226,8 +226,12 @@ defm : X86WriteRes; defm : X86WriteRes; defm : X86WriteRes; defm : X86WriteRes; -defm : X86WriteRes; -defm : X86WriteRes; + +defm : X86WriteRes; +defm : X86WriteRes; +defm : X86WriteRes; +defm : X86WriteRes; + defm : X86WriteRes; defm : X86WriteRes; defm : X86WriteRes; diff --git a/lib/Target/X86/X86SchedSkylakeServer.td b/lib/Target/X86/X86SchedSkylakeServer.td index 58caf1dacfcb..a8c65435ab9b 100755 --- a/lib/Target/X86/X86SchedSkylakeServer.td +++ 
b/lib/Target/X86/X86SchedSkylakeServer.td @@ -226,8 +226,12 @@ defm : X86WriteRes; defm : X86WriteRes; defm : X86WriteRes; defm : X86WriteRes; -defm : X86WriteRes; -defm : X86WriteRes; + +defm : X86WriteRes; +defm : X86WriteRes; +defm : X86WriteRes; +defm : X86WriteRes; + defm : X86WriteRes; defm : X86WriteRes; defm : X86WriteRes; diff --git a/lib/Target/X86/X86Schedule.td b/lib/Target/X86/X86Schedule.td index 55ca85ec1e3d..95f710061aeb 100644 --- a/lib/Target/X86/X86Schedule.td +++ b/lib/Target/X86/X86Schedule.td @@ -102,6 +102,12 @@ class X86SchedWriteMoveLS { + SchedWrite RM = LoadRM; + SchedWrite MR = StoreMR; +} + // Multiclass that wraps X86SchedWriteMoveLS for each vector width. class X86SchedWriteMoveLSWidths; +// Conditional SIMD Packed Loads and Stores wrappers. +def WriteFMaskMove32 + : X86SchedWriteMaskMove; +def WriteFMaskMove64 + : X86SchedWriteMaskMove; +def WriteFMaskMove32Y + : X86SchedWriteMaskMove; +def WriteFMaskMove64Y + : X86SchedWriteMaskMove; + // Vector width wrappers. def SchedWriteFAdd : X86SchedWriteWidths; diff --git a/lib/Target/X86/X86ScheduleAtom.td b/lib/Target/X86/X86ScheduleAtom.td index b0334655de7e..78acb1065ec8 100644 --- a/lib/Target/X86/X86ScheduleAtom.td +++ b/lib/Target/X86/X86ScheduleAtom.td @@ -216,8 +216,10 @@ defm : X86WriteResUnsupported; def : WriteRes; def : WriteRes; defm : X86WriteResUnsupported; -defm : X86WriteResUnsupported; -defm : X86WriteResUnsupported; +defm : X86WriteResUnsupported; +defm : X86WriteResUnsupported; +defm : X86WriteResUnsupported; +defm : X86WriteResUnsupported; def : WriteRes; def : WriteRes; diff --git a/lib/Target/X86/X86ScheduleBdVer2.td b/lib/Target/X86/X86ScheduleBdVer2.td index 8cc01c3acece..d7aea3cf4e9d 100644 --- a/lib/Target/X86/X86ScheduleBdVer2.td +++ b/lib/Target/X86/X86ScheduleBdVer2.td @@ -726,8 +726,10 @@ defm : PdWriteRes; defm : PdWriteRes; defm : PdWriteRes; -defm : PdWriteRes; -defm : PdWriteRes; +defm : PdWriteRes; +defm : PdWriteRes; +defm : PdWriteRes; +defm : PdWriteRes; defm : PdWriteRes; defm : PdWriteRes; diff --git a/lib/Target/X86/X86ScheduleBtVer2.td b/lib/Target/X86/X86ScheduleBtVer2.td index 2d26232b4132..d0421d94ee05 100644 --- a/lib/Target/X86/X86ScheduleBtVer2.td +++ b/lib/Target/X86/X86ScheduleBtVer2.td @@ -180,9 +180,11 @@ multiclass JWriteResYMMPair; +// A folded store needs a cycle on the SAGU for the store data, most RMW +// instructions don't need an extra uop. ALU RMW operations don't seem to +// benefit from STLF, and their observed latency is 6cy. That is the reason why +// this write adds two extra cycles (instead of just 1cy for the store). +defm : X86WriteRes; //////////////////////////////////////////////////////////////////////////////// // Arithmetic. 
@@ -191,22 +193,22 @@ defm : X86WriteRes; defm : JWriteResIntPair; defm : JWriteResIntPair; -defm : X86WriteRes; -defm : X86WriteRes; -defm : X86WriteRes; -defm : X86WriteRes; -defm : X86WriteRes; - -defm : JWriteResIntPair; -defm : JWriteResIntPair; -defm : JWriteResIntPair; -defm : JWriteResIntPair; -defm : JWriteResIntPair; -defm : JWriteResIntPair; -defm : JWriteResIntPair; -defm : JWriteResIntPair; -defm : JWriteResIntPair; -defm : JWriteResIntPair; +defm : X86WriteRes; +defm : X86WriteRes; +defm : X86WriteRes; +defm : X86WriteRes; +defm : X86WriteRes; + +defm : JWriteResIntPair; +defm : JWriteResIntPair; +defm : JWriteResIntPair; +defm : JWriteResIntPair; +defm : JWriteResIntPair; +defm : JWriteResIntPair; +defm : JWriteResIntPair; +defm : JWriteResIntPair; +defm : JWriteResIntPair; +defm : JWriteResIntPair; defm : X86WriteRes; defm : JWriteResIntPair; @@ -305,6 +307,192 @@ def : WriteRes; // to '1' to tell the scheduler that the nop uses an ALU slot for a cycle. def : WriteRes { let Latency = 1; } +def JWriteCMPXCHG8rr : SchedWriteRes<[JALU01]> { + let Latency = 3; + let ResourceCycles = [3]; + let NumMicroOps = 3; +} + +def JWriteLOCK_CMPXCHG8rm : SchedWriteRes<[JALU01, JLAGU, JSAGU]> { + let Latency = 16; + let ResourceCycles = [3,16,16]; + let NumMicroOps = 5; +} + +def JWriteLOCK_CMPXCHGrm : SchedWriteRes<[JALU01, JLAGU, JSAGU]> { + let Latency = 17; + let ResourceCycles = [3,17,17]; + let NumMicroOps = 6; +} + +def JWriteCMPXCHG8rm : SchedWriteRes<[JALU01, JLAGU, JSAGU]> { + let Latency = 11; + let ResourceCycles = [3,1,1]; + let NumMicroOps = 5; +} + +def JWriteCMPXCHG8B : SchedWriteRes<[JALU01, JLAGU, JSAGU]> { + let Latency = 11; + let ResourceCycles = [3,1,1]; + let NumMicroOps = 18; +} + +def JWriteCMPXCHG16B : SchedWriteRes<[JALU01, JLAGU, JSAGU]> { + let Latency = 32; + let ResourceCycles = [6,1,1]; + let NumMicroOps = 28; +} + +def JWriteLOCK_CMPXCHG8B : SchedWriteRes<[JALU01, JLAGU, JSAGU]> { + let Latency = 19; + let ResourceCycles = [3,19,19]; + let NumMicroOps = 18; +} + +def JWriteLOCK_CMPXCHG16B : SchedWriteRes<[JALU01, JLAGU, JSAGU]> { + let Latency = 38; + let ResourceCycles = [6,38,38]; + let NumMicroOps = 28; +} + +def JWriteCMPXCHGVariant : SchedWriteVariant<[ + SchedVar, [JWriteLOCK_CMPXCHG8B]>, + SchedVar, [JWriteLOCK_CMPXCHG16B]>, + SchedVar, [JWriteLOCK_CMPXCHG8rm]>, + SchedVar, [JWriteLOCK_CMPXCHGrm]>, + SchedVar, [JWriteCMPXCHG8B]>, + SchedVar, [JWriteCMPXCHG16B]>, + SchedVar, [JWriteCMPXCHG8rm]>, + SchedVar, [WriteCMPXCHGRMW]>, + SchedVar, [JWriteCMPXCHG8rr]>, + SchedVar +]>; + +// The first five reads are contributed by the memory load operand. +// We ignore those reads and set a read-advance for the other input operands +// including the implicit read of RAX. +def : InstRW<[JWriteCMPXCHGVariant, + ReadDefault, ReadDefault, ReadDefault, ReadDefault, ReadDefault, + ReadAfterLd, ReadAfterLd], (instrs LCMPXCHG8, LCMPXCHG16, + LCMPXCHG32, LCMPXCHG64, + CMPXCHG8rm, CMPXCHG16rm, + CMPXCHG32rm, CMPXCHG64rm)>; + +def : InstRW<[JWriteCMPXCHGVariant], (instrs CMPXCHG8rr, CMPXCHG16rr, + CMPXCHG32rr, CMPXCHG64rr)>; + +def : InstRW<[JWriteCMPXCHGVariant, + // Ignore reads contributed by the memory operand. + ReadDefault, ReadDefault, ReadDefault, ReadDefault, ReadDefault, + // Add a read-advance to every implicit register read. 
+ ReadAfterLd, ReadAfterLd, ReadAfterLd, ReadAfterLd], (instrs LCMPXCHG8B, LCMPXCHG16B, + CMPXCHG8B, CMPXCHG16B)>; + +def JWriteLOCK_ALURMW : SchedWriteRes<[JALU01, JLAGU, JSAGU]> { + let Latency = 19; + let ResourceCycles = [1,19,19]; + let NumMicroOps = 1; +} + +def JWriteLOCK_ALURMWVariant : SchedWriteVariant<[ + SchedVar, [JWriteLOCK_ALURMW]>, + SchedVar +]>; +def : InstRW<[JWriteLOCK_ALURMWVariant], (instrs INC8m, INC16m, INC32m, INC64m, + DEC8m, DEC16m, DEC32m, DEC64m, + NOT8m, NOT16m, NOT32m, NOT64m, + NEG8m, NEG16m, NEG32m, NEG64m)>; + +def JWriteXCHG8rr_XADDrr : SchedWriteRes<[JALU01]> { + let Latency = 2; + let ResourceCycles = [3]; + let NumMicroOps = 3; +} +def : InstRW<[JWriteXCHG8rr_XADDrr], (instrs XCHG8rr, XADD8rr, XADD16rr, + XADD32rr, XADD64rr)>; + +// This write defines the latency of the in/out register operand of a non-atomic +// XADDrm. This is the first of a pair of writes that model non-atomic +// XADDrm instructions (the second write definition is JWriteXADDrm_LdSt_Part). +// +// We need two writes because the instruction latency differs from the output +// register operand latency. In particular, the first write describes the first +// (and only) output register operand of the instruction. However, the +// instruction latency is set to the MAX of all the write latencies. That's why +// a second write is needed in this case (see example below). +// +// Example: +// XADD %ecx, (%rsp) ## Instruction latency: 11cy +// ## ECX write Latency: 3cy +// +// Register ECX becomes available in 3 cycles. That is because the value of ECX +// is exchanged with the value read from the stack pointer, and the load-to-use +// latency is assumed to be 3cy. +def JWriteXADDrm_XCHG_Part : SchedWriteRes<[JALU01]> { + let Latency = 3; // load-to-use latency + let ResourceCycles = [3]; + let NumMicroOps = 3; +} + +// This write defines the latency of the in/out register operand of an atomic +// XADDrm. This is the first of a sequence of two writes used to model atomic +// XADD instructions. The second write of the sequence is JWriteXCHGrm_LdSt_Part. +// +// +// Example: +// LOCK XADD %ecx, (%rsp) ## Instruction Latency: 16cy +// ## ECX write Latency: 11cy +// +// The value of ECX becomes available only after 11cy from the start of +// execution. This write is used to specifically set that operand latency. +def JWriteLOCK_XADDrm_XCHG_Part : SchedWriteRes<[JALU01]> { + let Latency = 11; + let ResourceCycles = [3]; + let NumMicroOps = 3; +} + +// This write defines the latency of the in/out register operand of an atomic +// XCHGrm. This write is the first of a sequence of two writes that describe +// atomic XCHG operations. We need two writes because the instruction latency +// differs from the output register write latency. We want to make sure that +// the output register operand becomes visible after 11cy. However, we want to +// set the instruction latency to 16cy. 
+def JWriteXCHGrm_XCHG_Part : SchedWriteRes<[JALU01]> { + let Latency = 11; + let ResourceCycles = [2]; + let NumMicroOps = 2; +} + +def JWriteXADDrm_LdSt_Part : SchedWriteRes<[JLAGU, JSAGU]> { + let Latency = 11; + let ResourceCycles = [1, 1]; + let NumMicroOps = 1; +} + +def JWriteXCHGrm_LdSt_Part : SchedWriteRes<[JLAGU, JSAGU]> { + let Latency = 16; + let ResourceCycles = [16, 16]; + let NumMicroOps = 1; +} + +def JWriteXADDrm_Part1 : SchedWriteVariant<[ + SchedVar, [JWriteLOCK_XADDrm_XCHG_Part]>, + SchedVar +]>; + +def JWriteXADDrm_Part2 : SchedWriteVariant<[ + SchedVar, [JWriteXCHGrm_LdSt_Part]>, + SchedVar +]>; + +def : InstRW<[JWriteXADDrm_Part1, JWriteXADDrm_Part2, ReadAfterLd], + (instrs XADD8rm, XADD16rm, XADD32rm, XADD64rm, + LXADD8, LXADD16, LXADD32, LXADD64)>; + +def : InstRW<[JWriteXCHGrm_XCHG_Part, JWriteXCHGrm_LdSt_Part, ReadAfterLd], + (instrs XCHG8rm, XCHG16rm, XCHG32rm, XCHG64rm)>; + + //////////////////////////////////////////////////////////////////////////////// // Floating point. This covers both scalar and vector operations. //////////////////////////////////////////////////////////////////////////////// @@ -313,19 +501,22 @@ defm : X86WriteRes; defm : X86WriteRes; defm : X86WriteRes; defm : X86WriteRes; -defm : X86WriteRes; -defm : X86WriteRes; +defm : X86WriteRes; +defm : X86WriteRes; defm : X86WriteRes; defm : X86WriteRes; defm : X86WriteRes; defm : X86WriteRes; -defm : X86WriteRes; +defm : X86WriteRes; defm : X86WriteRes; defm : X86WriteRes; defm : X86WriteRes; -defm : X86WriteRes; -defm : X86WriteRes; + +defm : X86WriteRes; +defm : X86WriteRes; +defm : X86WriteRes; +defm : X86WriteRes; defm : X86WriteRes; defm : X86WriteRes; @@ -466,8 +657,8 @@ defm : X86WriteResUnsupported; //////////////////////////////////////////////////////////////////////////////// defm : X86WriteRes; -defm : X86WriteRes; -defm : X86WriteRes; +defm : X86WriteRes; +defm : X86WriteRes; defm : X86WriteRes; defm : X86WriteRes; defm : X86WriteRes; @@ -475,7 +666,7 @@ defm : X86WriteRes; defm : X86WriteRes; -defm : X86WriteRes; +defm : X86WriteRes; defm : X86WriteRes; defm : X86WriteRes; defm : X86WriteRes; @@ -630,6 +821,18 @@ def JWriteJVZEROUPPER: SchedWriteRes<[]> { } def : InstRW<[JWriteJVZEROUPPER], (instrs VZEROUPPER)>; +/////////////////////////////////////////////////////////////////////////////// +// SSE2/AVX Store Selected Bytes of Double Quadword - (V)MASKMOVDQ +/////////////////////////////////////////////////////////////////////////////// + +def JWriteMASKMOVDQU: SchedWriteRes<[JFPU0, JFPA, JFPU1, JSTC, JLAGU, JSAGU, JALU01]> { + let Latency = 34; + let ResourceCycles = [1, 1, 2, 2, 2, 16, 42]; + let NumMicroOps = 63; +} +def : InstRW<[JWriteMASKMOVDQU], (instrs MASKMOVDQU, MASKMOVDQU64, + VMASKMOVDQU, VMASKMOVDQU64)>; + /////////////////////////////////////////////////////////////////////////////// // SchedWriteVariant definitions. 
/////////////////////////////////////////////////////////////////////////////// diff --git a/lib/Target/X86/X86ScheduleSLM.td b/lib/Target/X86/X86ScheduleSLM.td index 34c251a5c5bb..8e3ce721f1a1 100644 --- a/lib/Target/X86/X86ScheduleSLM.td +++ b/lib/Target/X86/X86ScheduleSLM.td @@ -186,8 +186,12 @@ def : WriteRes; def : WriteRes; def : WriteRes; def : WriteRes; -def : WriteRes; -def : WriteRes; + +def : WriteRes; +def : WriteRes; +def : WriteRes; +def : WriteRes; + def : WriteRes; def : WriteRes; def : WriteRes; diff --git a/lib/Target/X86/X86ScheduleZnver1.td b/lib/Target/X86/X86ScheduleZnver1.td index 65f6d89df610..06201f4a3a84 100644 --- a/lib/Target/X86/X86ScheduleZnver1.td +++ b/lib/Target/X86/X86ScheduleZnver1.td @@ -268,8 +268,12 @@ defm : X86WriteRes; defm : X86WriteRes; defm : X86WriteRes; defm : X86WriteRes; -defm : X86WriteRes; -defm : X86WriteRes; + +defm : X86WriteRes; +defm : X86WriteRes; +defm : X86WriteRes; +defm : X86WriteRes; + defm : X86WriteRes; defm : X86WriteRes; defm : X86WriteRes; diff --git a/lib/Target/X86/X86SelectionDAGInfo.cpp b/lib/Target/X86/X86SelectionDAGInfo.cpp index 50690953eef5..1ae8df977f83 100644 --- a/lib/Target/X86/X86SelectionDAGInfo.cpp +++ b/lib/Target/X86/X86SelectionDAGInfo.cpp @@ -36,7 +36,7 @@ bool X86SelectionDAGInfo::isBaseRegConflictPossible( const X86RegisterInfo *TRI = static_cast( DAG.getSubtarget().getRegisterInfo()); - unsigned BaseReg = TRI->getBaseRegister(); + Register BaseReg = TRI->getBaseRegister(); for (unsigned R : ClobberSet) if (BaseReg == R) return true; diff --git a/lib/Target/X86/X86SpeculativeLoadHardening.cpp b/lib/Target/X86/X86SpeculativeLoadHardening.cpp index 40f5dbe57e4b..b8980789258e 100644 --- a/lib/Target/X86/X86SpeculativeLoadHardening.cpp +++ b/lib/Target/X86/X86SpeculativeLoadHardening.cpp @@ -477,7 +477,7 @@ bool X86SpeculativeLoadHardeningPass::runOnMachineFunction( // Otherwise, just build the predicate state itself by zeroing a register // as we don't need any initial state. PS->InitialReg = MRI->createVirtualRegister(PS->RC); - unsigned PredStateSubReg = MRI->createVirtualRegister(&X86::GR32RegClass); + Register PredStateSubReg = MRI->createVirtualRegister(&X86::GR32RegClass); auto ZeroI = BuildMI(Entry, EntryInsertPt, Loc, TII->get(X86::MOV32r0), PredStateSubReg); ++NumInstsInserted; @@ -750,7 +750,7 @@ X86SpeculativeLoadHardeningPass::tracePredStateThroughCFG( int PredStateSizeInBytes = TRI->getRegSizeInBits(*PS->RC) / 8; auto CMovOp = X86::getCMovOpcode(PredStateSizeInBytes); - unsigned UpdatedStateReg = MRI->createVirtualRegister(PS->RC); + Register UpdatedStateReg = MRI->createVirtualRegister(PS->RC); // Note that we intentionally use an empty debug location so that // this picks up the preceding location. auto CMovI = BuildMI(CheckingMBB, InsertPt, DebugLoc(), @@ -907,7 +907,7 @@ void X86SpeculativeLoadHardeningPass::unfoldCallAndJumpLoads( MI.dump(); dbgs() << "\n"); report_fatal_error("Unable to unfold load!"); } - unsigned Reg = MRI->createVirtualRegister(UnfoldedRC); + Register Reg = MRI->createVirtualRegister(UnfoldedRC); SmallVector NewMIs; // If we were able to compute an unfolded reg class, any failure here // is just a programming error so just assert. @@ -1102,7 +1102,7 @@ X86SpeculativeLoadHardeningPass::tracePredStateThroughIndirectBranches( // synthetic target in the predecessor. We do this at the bottom of the // predecessor. 
auto InsertPt = Pred->getFirstTerminator(); - unsigned TargetReg = MRI->createVirtualRegister(&X86::GR64RegClass); + Register TargetReg = MRI->createVirtualRegister(&X86::GR64RegClass); if (MF.getTarget().getCodeModel() == CodeModel::Small && !Subtarget->isPositionIndependent()) { // Directly materialize it into an immediate. @@ -1153,7 +1153,7 @@ X86SpeculativeLoadHardeningPass::tracePredStateThroughIndirectBranches( LLVM_DEBUG(dbgs() << " Inserting cmp: "; CheckI->dump(); dbgs() << "\n"); } else { // Otherwise compute the address into a register first. - unsigned AddrReg = MRI->createVirtualRegister(&X86::GR64RegClass); + Register AddrReg = MRI->createVirtualRegister(&X86::GR64RegClass); auto AddrI = BuildMI(MBB, InsertPt, DebugLoc(), TII->get(X86::LEA64r), AddrReg) .addReg(/*Base*/ X86::RIP) @@ -1175,7 +1175,7 @@ X86SpeculativeLoadHardeningPass::tracePredStateThroughIndirectBranches( // Now cmov over the predicate if the comparison wasn't equal. int PredStateSizeInBytes = TRI->getRegSizeInBits(*PS->RC) / 8; auto CMovOp = X86::getCMovOpcode(PredStateSizeInBytes); - unsigned UpdatedStateReg = MRI->createVirtualRegister(PS->RC); + Register UpdatedStateReg = MRI->createVirtualRegister(PS->RC); auto CMovI = BuildMI(MBB, InsertPt, DebugLoc(), TII->get(CMovOp), UpdatedStateReg) .addReg(PS->InitialReg) @@ -1878,7 +1878,7 @@ unsigned X86SpeculativeLoadHardeningPass::saveEFLAGS( DebugLoc Loc) { // FIXME: Hard coding this to a 32-bit register class seems weird, but matches // what instruction selection does. - unsigned Reg = MRI->createVirtualRegister(&X86::GR32RegClass); + Register Reg = MRI->createVirtualRegister(&X86::GR32RegClass); // We directly copy the FLAGS register and rely on later lowering to clean // this up into the appropriate setCC instructions. BuildMI(MBB, InsertPt, Loc, TII->get(X86::COPY), Reg).addReg(X86::EFLAGS); @@ -1905,7 +1905,7 @@ void X86SpeculativeLoadHardeningPass::restoreEFLAGS( void X86SpeculativeLoadHardeningPass::mergePredStateIntoSP( MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt, DebugLoc Loc, unsigned PredStateReg) { - unsigned TmpReg = MRI->createVirtualRegister(PS->RC); + Register TmpReg = MRI->createVirtualRegister(PS->RC); // FIXME: This hard codes a shift distance based on the number of bits needed // to stay canonical on 64-bit. We should compute this somehow and support // 32-bit as part of that. @@ -1925,8 +1925,8 @@ void X86SpeculativeLoadHardeningPass::mergePredStateIntoSP( unsigned X86SpeculativeLoadHardeningPass::extractPredStateFromSP( MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt, DebugLoc Loc) { - unsigned PredStateReg = MRI->createVirtualRegister(PS->RC); - unsigned TmpReg = MRI->createVirtualRegister(PS->RC); + Register PredStateReg = MRI->createVirtualRegister(PS->RC); + Register TmpReg = MRI->createVirtualRegister(PS->RC); // We know that the stack pointer will have any preserved predicate state in // its high bit. We just want to smear this across the other bits. Turns out, @@ -2031,9 +2031,9 @@ void X86SpeculativeLoadHardeningPass::hardenLoadAddr( } for (MachineOperand *Op : HardenOpRegs) { - unsigned OpReg = Op->getReg(); + Register OpReg = Op->getReg(); auto *OpRC = MRI->getRegClass(OpReg); - unsigned TmpReg = MRI->createVirtualRegister(OpRC); + Register TmpReg = MRI->createVirtualRegister(OpRC); // If this is a vector register, we'll need somewhat custom logic to handle // hardening it. @@ -2045,7 +2045,7 @@ void X86SpeculativeLoadHardeningPass::hardenLoadAddr( // Move our state into a vector register. 
// FIXME: We could skip this at the cost of longer encodings with AVX-512 // but that doesn't seem likely worth it. - unsigned VStateReg = MRI->createVirtualRegister(&X86::VR128RegClass); + Register VStateReg = MRI->createVirtualRegister(&X86::VR128RegClass); auto MovI = BuildMI(MBB, InsertPt, Loc, TII->get(X86::VMOV64toPQIrr), VStateReg) .addReg(StateReg); @@ -2054,7 +2054,7 @@ void X86SpeculativeLoadHardeningPass::hardenLoadAddr( LLVM_DEBUG(dbgs() << " Inserting mov: "; MovI->dump(); dbgs() << "\n"); // Broadcast it across the vector register. - unsigned VBStateReg = MRI->createVirtualRegister(OpRC); + Register VBStateReg = MRI->createVirtualRegister(OpRC); auto BroadcastI = BuildMI(MBB, InsertPt, Loc, TII->get(Is128Bit ? X86::VPBROADCASTQrr : X86::VPBROADCASTQYrr), @@ -2084,7 +2084,7 @@ void X86SpeculativeLoadHardeningPass::hardenLoadAddr( assert(Subtarget->hasVLX() && "AVX512VL-specific register classes!"); // Broadcast our state into a vector register. - unsigned VStateReg = MRI->createVirtualRegister(OpRC); + Register VStateReg = MRI->createVirtualRegister(OpRC); unsigned BroadcastOp = Is128Bit ? X86::VPBROADCASTQrZ128r : Is256Bit ? X86::VPBROADCASTQrZ256r : X86::VPBROADCASTQrZr; @@ -2153,7 +2153,7 @@ MachineInstr *X86SpeculativeLoadHardeningPass::sinkPostLoadHardenedInst( // See if we can sink hardening the loaded value. auto SinkCheckToSingleUse = [&](MachineInstr &MI) -> Optional { - unsigned DefReg = MI.getOperand(0).getReg(); + Register DefReg = MI.getOperand(0).getReg(); // We need to find a single use which we can sink the check. We can // primarily do this because many uses may already end up checked on their @@ -2210,8 +2210,8 @@ MachineInstr *X86SpeculativeLoadHardeningPass::sinkPostLoadHardenedInst( // If this register isn't a virtual register we can't walk uses of sanely, // just bail. Also check that its register class is one of the ones we // can harden. - unsigned UseDefReg = UseMI.getOperand(0).getReg(); - if (!TRI->isVirtualRegister(UseDefReg) || + Register UseDefReg = UseMI.getOperand(0).getReg(); + if (!Register::isVirtualRegister(UseDefReg) || !canHardenRegister(UseDefReg)) return {}; @@ -2241,6 +2241,9 @@ bool X86SpeculativeLoadHardeningPass::canHardenRegister(unsigned Reg) { // We don't support post-load hardening of vectors. return false; + unsigned RegIdx = Log2_32(RegBytes); + assert(RegIdx < 4 && "Unsupported register size"); + // If this register class is explicitly constrained to a class that doesn't // require REX prefix, we may not be able to satisfy that constraint when // emitting the hardening instructions, so bail out here. @@ -2251,13 +2254,13 @@ bool X86SpeculativeLoadHardeningPass::canHardenRegister(unsigned Reg) { const TargetRegisterClass *NOREXRegClasses[] = { &X86::GR8_NOREXRegClass, &X86::GR16_NOREXRegClass, &X86::GR32_NOREXRegClass, &X86::GR64_NOREXRegClass}; - if (RC == NOREXRegClasses[Log2_32(RegBytes)]) + if (RC == NOREXRegClasses[RegIdx]) return false; const TargetRegisterClass *GPRRegClasses[] = { &X86::GR8RegClass, &X86::GR16RegClass, &X86::GR32RegClass, &X86::GR64RegClass}; - return RC->hasSuperClassEq(GPRRegClasses[Log2_32(RegBytes)]); + return RC->hasSuperClassEq(GPRRegClasses[RegIdx]); } /// Harden a value in a register. 
@@ -2278,7 +2281,7 @@ unsigned X86SpeculativeLoadHardeningPass::hardenValueInRegister( unsigned Reg, MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt, DebugLoc Loc) { assert(canHardenRegister(Reg) && "Cannot harden this register!"); - assert(TRI->isVirtualRegister(Reg) && "Cannot harden a physical register!"); + assert(Register::isVirtualRegister(Reg) && "Cannot harden a physical register!"); auto *RC = MRI->getRegClass(Reg); int Bytes = TRI->getRegSizeInBits(*RC) / 8; @@ -2289,7 +2292,7 @@ unsigned X86SpeculativeLoadHardeningPass::hardenValueInRegister( if (Bytes != 8) { unsigned SubRegImms[] = {X86::sub_8bit, X86::sub_16bit, X86::sub_32bit}; unsigned SubRegImm = SubRegImms[Log2_32(Bytes)]; - unsigned NarrowStateReg = MRI->createVirtualRegister(RC); + Register NarrowStateReg = MRI->createVirtualRegister(RC); BuildMI(MBB, InsertPt, Loc, TII->get(TargetOpcode::COPY), NarrowStateReg) .addReg(StateReg, 0, SubRegImm); StateReg = NarrowStateReg; @@ -2299,7 +2302,7 @@ unsigned X86SpeculativeLoadHardeningPass::hardenValueInRegister( if (isEFLAGSLive(MBB, InsertPt, *TRI)) FlagsReg = saveEFLAGS(MBB, InsertPt, Loc); - unsigned NewReg = MRI->createVirtualRegister(RC); + Register NewReg = MRI->createVirtualRegister(RC); unsigned OrOpCodes[] = {X86::OR8rr, X86::OR16rr, X86::OR32rr, X86::OR64rr}; unsigned OrOpCode = OrOpCodes[Log2_32(Bytes)]; auto OrI = BuildMI(MBB, InsertPt, Loc, TII->get(OrOpCode), NewReg) @@ -2329,13 +2332,13 @@ unsigned X86SpeculativeLoadHardeningPass::hardenPostLoad(MachineInstr &MI) { DebugLoc Loc = MI.getDebugLoc(); auto &DefOp = MI.getOperand(0); - unsigned OldDefReg = DefOp.getReg(); + Register OldDefReg = DefOp.getReg(); auto *DefRC = MRI->getRegClass(OldDefReg); // Because we want to completely replace the uses of this def'ed value with // the hardened value, create a dedicated new register that will only be used // to communicate the unhardened value to the hardening. - unsigned UnhardenedReg = MRI->createVirtualRegister(DefRC); + Register UnhardenedReg = MRI->createVirtualRegister(DefRC); DefOp.setReg(UnhardenedReg); // Now harden this register's value, getting a hardened reg that is safe to @@ -2537,7 +2540,7 @@ void X86SpeculativeLoadHardeningPass::tracePredStateThroughCall( .addReg(ExpectedRetAddrReg, RegState::Kill) .addSym(RetSymbol); } else { - unsigned ActualRetAddrReg = MRI->createVirtualRegister(AddrRC); + Register ActualRetAddrReg = MRI->createVirtualRegister(AddrRC); BuildMI(MBB, InsertPt, Loc, TII->get(X86::LEA64r), ActualRetAddrReg) .addReg(/*Base*/ X86::RIP) .addImm(/*Scale*/ 1) @@ -2554,7 +2557,7 @@ void X86SpeculativeLoadHardeningPass::tracePredStateThroughCall( int PredStateSizeInBytes = TRI->getRegSizeInBits(*PS->RC) / 8; auto CMovOp = X86::getCMovOpcode(PredStateSizeInBytes); - unsigned UpdatedStateReg = MRI->createVirtualRegister(PS->RC); + Register UpdatedStateReg = MRI->createVirtualRegister(PS->RC); auto CMovI = BuildMI(MBB, InsertPt, Loc, TII->get(CMovOp), UpdatedStateReg) .addReg(NewStateReg, RegState::Kill) .addReg(PS->PoisonReg) @@ -2611,7 +2614,7 @@ void X86SpeculativeLoadHardeningPass::hardenIndirectCallOrJumpInstr( // For all of these, the target register is the first operand of the // instruction. auto &TargetOp = MI.getOperand(0); - unsigned OldTargetReg = TargetOp.getReg(); + Register OldTargetReg = TargetOp.getReg(); // Try to lookup a hardened version of this register. 
We retain a reference // here as we want to update the map to track any newly computed hardened diff --git a/lib/Target/X86/X86Subtarget.cpp b/lib/Target/X86/X86Subtarget.cpp index d5bb56603df9..f8f78da52cc2 100644 --- a/lib/Target/X86/X86Subtarget.cpp +++ b/lib/Target/X86/X86Subtarget.cpp @@ -146,6 +146,9 @@ unsigned char X86Subtarget::classifyGlobalReference(const GlobalValue *GV, return X86II::MO_DLLIMPORT; return X86II::MO_COFFSTUB; } + // Some JIT users use *-win32-elf triples; these shouldn't use GOT tables. + if (isOSWindows()) + return X86II::MO_NO_FLAG; if (is64Bit()) { // ELF supports a large, truly PIC code model with non-PC relative GOT @@ -285,10 +288,10 @@ void X86Subtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) { // Stack alignment is 16 bytes on Darwin, Linux, kFreeBSD and Solaris (both // 32 and 64 bit) and for all 64-bit targets. if (StackAlignOverride) - stackAlignment = StackAlignOverride; + stackAlignment = *StackAlignOverride; else if (isTargetDarwin() || isTargetLinux() || isTargetSolaris() || isTargetKFreeBSD() || In64BitMode) - stackAlignment = 16; + stackAlignment = Align(16); // Some CPUs have more overhead for gather. The specified overhead is relative // to the Load operation. "2" is the number provided by Intel architects. This @@ -304,6 +307,8 @@ void X86Subtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) { // Consume the vector width attribute or apply any target specific limit. if (PreferVectorWidthOverride) PreferVectorWidth = PreferVectorWidthOverride; + else if (Prefer128Bit) + PreferVectorWidth = 128; else if (Prefer256Bit) PreferVectorWidth = 256; } @@ -316,12 +321,11 @@ X86Subtarget &X86Subtarget::initializeSubtargetDependencies(StringRef CPU, X86Subtarget::X86Subtarget(const Triple &TT, StringRef CPU, StringRef FS, const X86TargetMachine &TM, - unsigned StackAlignOverride, + MaybeAlign StackAlignOverride, unsigned PreferVectorWidthOverride, unsigned RequiredVectorWidth) - : X86GenSubtargetInfo(TT, CPU, FS), - PICStyle(PICStyles::None), TM(TM), TargetTriple(TT), - StackAlignOverride(StackAlignOverride), + : X86GenSubtargetInfo(TT, CPU, FS), PICStyle(PICStyles::None), TM(TM), + TargetTriple(TT), StackAlignOverride(StackAlignOverride), PreferVectorWidthOverride(PreferVectorWidthOverride), RequiredVectorWidth(RequiredVectorWidth), In64BitMode(TargetTriple.getArch() == Triple::x86_64), @@ -355,7 +359,7 @@ const CallLowering *X86Subtarget::getCallLowering() const { return CallLoweringInfo.get(); } -const InstructionSelector *X86Subtarget::getInstructionSelector() const { +InstructionSelector *X86Subtarget::getInstructionSelector() const { return InstSelector.get(); } diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h index 24ccc9cb7843..e8efe8f2afe5 100644 --- a/lib/Target/X86/X86Subtarget.h +++ b/lib/Target/X86/X86Subtarget.h @@ -365,8 +365,8 @@ protected: /// Processor has AVX-512 vp2intersect instructions bool HasVP2INTERSECT = false; - /// Processor supports MPX - Memory Protection Extensions - bool HasMPX = false; + /// Deprecated flag for MPX instructions. + bool DeprecatedHasMPX = false; /// Processor supports CET SHSTK - Control-Flow Enforcement Technology /// using Shadow Stack @@ -427,15 +427,21 @@ protected: /// Use software floating point for code generation. bool UseSoftFloat = false; + /// Use alias analysis during code generation. + bool UseAA = false; + /// The minimum alignment known to hold of the stack frame on /// entry to the function and which must be maintained by every function. 
- unsigned stackAlignment = 4; + Align stackAlignment = Align(4); /// Max. memset / memcpy size that is turned into rep/movs, rep/stos ops. /// // FIXME: this is a known good value for Yonah. How about others? unsigned MaxInlineSizeThreshold = 128; + /// Indicates target prefers 128 bit instructions. + bool Prefer128Bit = false; + /// Indicates target prefers 256 bit instructions. bool Prefer256Bit = false; @@ -453,7 +459,7 @@ protected: private: /// Override the stack alignment. - unsigned StackAlignOverride; + MaybeAlign StackAlignOverride; /// Preferred vector width from function attribute. unsigned PreferVectorWidthOverride; @@ -490,7 +496,7 @@ public: /// of the specified triple. /// X86Subtarget(const Triple &TT, StringRef CPU, StringRef FS, - const X86TargetMachine &TM, unsigned StackAlignOverride, + const X86TargetMachine &TM, MaybeAlign StackAlignOverride, unsigned PreferVectorWidthOverride, unsigned RequiredVectorWidth); @@ -515,7 +521,7 @@ public: /// Returns the minimum alignment known to hold of the /// stack frame on entry to the function and which must be maintained by every /// function for this subtarget. - unsigned getStackAlignment() const { return stackAlignment; } + Align getStackAlignment() const { return stackAlignment; } /// Returns the maximum memset / memcpy size /// that still makes it profitable to inline the call. @@ -527,7 +533,7 @@ public: /// Methods used by Global ISel const CallLowering *getCallLowering() const override; - const InstructionSelector *getInstructionSelector() const override; + InstructionSelector *getInstructionSelector() const override; const LegalizerInfo *getLegalizerInfo() const override; const RegisterBankInfo *getRegBankInfo() const override; @@ -684,7 +690,6 @@ public: bool hasBF16() const { return HasBF16; } bool hasVP2INTERSECT() const { return HasVP2INTERSECT; } bool hasBITALG() const { return HasBITALG; } - bool hasMPX() const { return HasMPX; } bool hasSHSTK() const { return HasSHSTK; } bool hasCLFLUSHOPT() const { return HasCLFLUSHOPT; } bool hasCLWB() const { return HasCLWB; } @@ -739,6 +744,7 @@ public: X86ProcFamily == IntelTRM; } bool useSoftFloat() const { return UseSoftFloat; } + bool useAA() const override { return UseAA; } /// Use mfence if we have SSE2 or we're on x86-64 (even if we asked for /// no-sse2). There isn't any reason to disable it if the target processor @@ -809,6 +815,7 @@ public: // On Win64, all these conventions just use the default convention. 
case CallingConv::C: case CallingConv::Fast: + case CallingConv::Tail: case CallingConv::Swift: case CallingConv::X86_FastCall: case CallingConv::X86_StdCall: diff --git a/lib/Target/X86/X86TargetMachine.cpp b/lib/Target/X86/X86TargetMachine.cpp index 0cbf13899a29..c15297134e4d 100644 --- a/lib/Target/X86/X86TargetMachine.cpp +++ b/lib/Target/X86/X86TargetMachine.cpp @@ -81,27 +81,28 @@ extern "C" void LLVMInitializeX86Target() { initializeX86SpeculativeLoadHardeningPassPass(PR); initializeX86FlagsCopyLoweringPassPass(PR); initializeX86CondBrFoldingPassPass(PR); + initializeX86OptimizeLEAPassPass(PR); } static std::unique_ptr createTLOF(const Triple &TT) { if (TT.isOSBinFormatMachO()) { if (TT.getArch() == Triple::x86_64) - return llvm::make_unique(); - return llvm::make_unique(); + return std::make_unique(); + return std::make_unique(); } if (TT.isOSFreeBSD()) - return llvm::make_unique(); + return std::make_unique(); if (TT.isOSLinux() || TT.isOSNaCl() || TT.isOSIAMCU()) - return llvm::make_unique(); + return std::make_unique(); if (TT.isOSSolaris()) - return llvm::make_unique(); + return std::make_unique(); if (TT.isOSFuchsia()) - return llvm::make_unique(); + return std::make_unique(); if (TT.isOSBinFormatELF()) - return llvm::make_unique(); + return std::make_unique(); if (TT.isOSBinFormatCOFF()) - return llvm::make_unique(); + return std::make_unique(); llvm_unreachable("unknown subtarget type"); } @@ -116,6 +117,9 @@ static std::string computeDataLayout(const Triple &TT) { !TT.isArch64Bit()) Ret += "-p:32:32"; + // Address spaces for 32 bit signed, 32 bit unsigned, and 64 bit pointers. + Ret += "-p270:32:32-p271:32:32-p272:64:64"; + // Some ABIs align 64 bit integers and doubles to 64 bits, others to 32. if (TT.isArch64Bit() || TT.isOSWindows() || TT.isOSNaCl()) Ret += "-i64:64"; @@ -218,17 +222,9 @@ X86TargetMachine::X86TargetMachine(const Target &T, const Triple &TT, getEffectiveX86CodeModel(CM, JIT, TT.getArch() == Triple::x86_64), OL), TLOF(createTLOF(getTargetTriple())) { - // Windows stack unwinder gets confused when execution flow "falls through" - // after a call to 'noreturn' function. - // To prevent that, we emit a trap for 'unreachable' IR instructions. - // (which on X86, happens to be the 'ud2' instruction) // On PS4, the "return address" of a 'noreturn' call must still be within // the calling function, and TrapUnreachable is an easy way to get that. - // The check here for 64-bit windows is a bit icky, but as we're unlikely - // to ever want to mix 32 and 64-bit windows code in a single module - // this should be fine. - if ((TT.isOSWindows() && TT.getArch() == Triple::x86_64) || TT.isPS4() || - TT.isOSBinFormatMachO()) { + if (TT.isPS4() || TT.isOSBinFormatMachO()) { this->Options.TrapUnreachable = true; this->Options.NoTrapAfterNoreturn = TT.isOSBinFormatMachO(); } @@ -311,10 +307,10 @@ X86TargetMachine::getSubtargetImpl(const Function &F) const { // creation will depend on the TM and the code generation flags on the // function that reside in TargetOptions. 
resetTargetOptions(F); - I = llvm::make_unique(TargetTriple, CPU, FS, *this, - Options.StackAlignmentOverride, - PreferVectorWidthOverride, - RequiredVectorWidth); + I = std::make_unique( + TargetTriple, CPU, FS, *this, + MaybeAlign(Options.StackAlignmentOverride), PreferVectorWidthOverride, + RequiredVectorWidth); } return I.get(); } @@ -517,12 +513,19 @@ void X86PassConfig::addPreEmitPass() { } void X86PassConfig::addPreEmitPass2() { + const Triple &TT = TM->getTargetTriple(); + const MCAsmInfo *MAI = TM->getMCAsmInfo(); + addPass(createX86RetpolineThunksPass()); + + // Insert extra int3 instructions after trailing call instructions to avoid + // issues in the unwinder. + if (TT.isOSWindows() && TT.getArch() == Triple::x86_64) + addPass(createX86AvoidTrailingCallPass()); + // Verify basic block incoming and outgoing cfa offset and register values and // correct CFA calculation rule where needed by inserting appropriate CFI // instructions. - const Triple &TT = TM->getTargetTriple(); - const MCAsmInfo *MAI = TM->getMCAsmInfo(); if (!TT.isOSDarwin() && (!TT.isOSWindows() || MAI->getExceptionHandlingType() == ExceptionHandling::DwarfCFI)) diff --git a/lib/Target/X86/X86TargetMachine.h b/lib/Target/X86/X86TargetMachine.h index b999e2e86af6..ec3db7b1e9e8 100644 --- a/lib/Target/X86/X86TargetMachine.h +++ b/lib/Target/X86/X86TargetMachine.h @@ -16,7 +16,6 @@ #include "X86Subtarget.h" #include "llvm/ADT/Optional.h" #include "llvm/ADT/StringMap.h" -#include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Support/CodeGen.h" #include "llvm/Target/TargetMachine.h" #include @@ -26,6 +25,7 @@ namespace llvm { class StringRef; class X86Subtarget; class X86RegisterBankInfo; +class TargetTransformInfo; class X86TargetMachine final : public LLVMTargetMachine { std::unique_ptr TLOF; diff --git a/lib/Target/X86/X86TargetObjectFile.cpp b/lib/Target/X86/X86TargetObjectFile.cpp index 92e0779c2e74..44185957686b 100644 --- a/lib/Target/X86/X86TargetObjectFile.cpp +++ b/lib/Target/X86/X86TargetObjectFile.cpp @@ -47,8 +47,8 @@ MCSymbol *X86_64MachoTargetObjectFile::getCFIPersonalitySymbol( } const MCExpr *X86_64MachoTargetObjectFile::getIndirectSymViaGOTPCRel( - const MCSymbol *Sym, const MCValue &MV, int64_t Offset, - MachineModuleInfo *MMI, MCStreamer &Streamer) const { + const GlobalValue *GV, const MCSymbol *Sym, const MCValue &MV, + int64_t Offset, MachineModuleInfo *MMI, MCStreamer &Streamer) const { // On Darwin/X86-64, we need to use foo@GOTPCREL+4 to access the got entry // from a data section. In case there's an additional offset, then use // foo@GOTPCREL+4+. 
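// --- Editorial sketch, not part of the imported patch -----------------------
// The X86TargetMachine.cpp hunk above appends "-p270:32:32-p271:32:32-p272:64:64"
// to computeDataLayout(), declaring address spaces for 32-bit signed, 32-bit
// unsigned, and 64-bit pointers alongside the default pointer.  For a 64-bit
// ELF target the resulting datalayout string is assumed to look roughly like
// the constant below; only the p270/p271/p272 fragment is taken from the
// patch, the remainder of the string is illustrative.
namespace {
const char *const AssumedX86_64DataLayout =
    "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128";
} // end anonymous namespace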
diff --git a/lib/Target/X86/X86TargetObjectFile.h b/lib/Target/X86/X86TargetObjectFile.h index 13d7b4ad70d6..1fd0bbf56b19 100644 --- a/lib/Target/X86/X86TargetObjectFile.h +++ b/lib/Target/X86/X86TargetObjectFile.h @@ -30,7 +30,8 @@ namespace llvm { const TargetMachine &TM, MachineModuleInfo *MMI) const override; - const MCExpr *getIndirectSymViaGOTPCRel(const MCSymbol *Sym, + const MCExpr *getIndirectSymViaGOTPCRel(const GlobalValue *GV, + const MCSymbol *Sym, const MCValue &MV, int64_t Offset, MachineModuleInfo *MMI, MCStreamer &Streamer) const override; diff --git a/lib/Target/X86/X86TargetTransformInfo.cpp b/lib/Target/X86/X86TargetTransformInfo.cpp index 3dc59aeb263e..70fd857fcf01 100644 --- a/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/lib/Target/X86/X86TargetTransformInfo.cpp @@ -116,7 +116,8 @@ llvm::Optional X86TTIImpl::getCacheAssociativity( llvm_unreachable("Unknown TargetTransformInfo::CacheLevel"); } -unsigned X86TTIImpl::getNumberOfRegisters(bool Vector) { +unsigned X86TTIImpl::getNumberOfRegisters(unsigned ClassID) const { + bool Vector = (ClassID == 1); if (Vector && !ST->hasSSE1()) return 0; @@ -887,7 +888,7 @@ int X86TTIImpl::getArithmeticInstrCost( int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp) { // 64-bit packed float vectors (v2f32) are widened to type v4f32. - // 64-bit packed integer vectors (v2i32) are promoted to type v2i64. + // 64-bit packed integer vectors (v2i32) are widened to type v4i32. std::pair LT = TLI->getTypeLegalizationCost(DL, Tp); // Treat Transpose as 2-op shuffles - there's no difference in lowering. @@ -911,6 +912,39 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, int NumSubElts = SubLT.second.getVectorNumElements(); if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0) return SubLT.first; + // Handle some cases for widening legalization. For now we only handle + // cases where the original subvector was naturally aligned and evenly + // fit in its legalized subvector type. + // FIXME: Remove some of the alignment restrictions. + // FIXME: We can use permq for 64-bit or larger extracts from 256-bit + // vectors. + int OrigSubElts = SubTp->getVectorNumElements(); + if (NumSubElts > OrigSubElts && + (Index % OrigSubElts) == 0 && (NumSubElts % OrigSubElts) == 0 && + LT.second.getVectorElementType() == + SubLT.second.getVectorElementType() && + LT.second.getVectorElementType().getSizeInBits() == + Tp->getVectorElementType()->getPrimitiveSizeInBits()) { + assert(NumElts >= NumSubElts && NumElts > OrigSubElts && + "Unexpected number of elements!"); + Type *VecTy = VectorType::get(Tp->getVectorElementType(), + LT.second.getVectorNumElements()); + Type *SubTy = VectorType::get(Tp->getVectorElementType(), + SubLT.second.getVectorNumElements()); + int ExtractIndex = alignDown((Index % NumElts), NumSubElts); + int ExtractCost = getShuffleCost(TTI::SK_ExtractSubvector, VecTy, + ExtractIndex, SubTy); + + // If the original size is 32-bits or more, we can use pshufd. Otherwise + // if we have SSSE3 we can use pshufb. 
+ if (SubTp->getPrimitiveSizeInBits() >= 32 || ST->hasSSSE3()) + return ExtractCost + 1; // pshufd or pshufb + + assert(SubTp->getPrimitiveSizeInBits() == 16 && + "Unexpected vector size"); + + return ExtractCost + 2; // worst case pshufhw + pshufd + } } } @@ -1314,8 +1348,10 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 1 }, { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 1 }, { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 1 }, - { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 1 }, + { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 1 }, + { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 1 }, { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 1 }, + { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 1 }, { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i32, 1 }, { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i32, 1 }, @@ -1354,6 +1390,8 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, 5 }, { ISD::UINT_TO_FP, MVT::f64, MVT::i64, 1 }, + { ISD::FP_TO_UINT, MVT::i64, MVT::f32, 1 }, + { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 1 }, { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f32, 1 }, { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 }, @@ -1371,14 +1409,14 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 3 }, { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 3 }, { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 3 }, - { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8, 3 }, - { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i8, 3 }, - { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3 }, - { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3 }, + { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8, 1 }, + { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i8, 1 }, + { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 1 }, + { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 1 }, { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 1 }, { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 1 }, - { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3 }, - { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 }, + { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 1 }, + { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 1 }, { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 1 }, { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 1 }, { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 1 }, @@ -1402,13 +1440,13 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 4 }, { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 7 }, { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 4 }, - { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8, 6 }, + { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8, 4 }, { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i8, 4 }, - { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 7 }, + { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 4 }, { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 4 }, { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 4 }, { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 4 }, - { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 6 }, + { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 4 }, { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 }, { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 4 }, { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 4 }, @@ -1421,7 +1459,10 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, { ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, 4 }, { ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, 4 }, { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 4 }, + { ISD::TRUNCATE, MVT::v8i8, MVT::v8i64, 11 }, + { ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, 9 }, { ISD::TRUNCATE, 
MVT::v8i32, MVT::v8i64, 9 }, + { ISD::TRUNCATE, MVT::v16i8, MVT::v16i64, 11 }, { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i1, 3 }, { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i1, 3 }, @@ -1507,6 +1548,7 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 3 }, { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 3 }, { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 6 }, + { ISD::TRUNCATE, MVT::v2i8, MVT::v2i64, 1 }, // PSHUFB { ISD::UINT_TO_FP, MVT::f64, MVT::i64, 4 }, }; @@ -1520,7 +1562,8 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, { ISD::SINT_TO_FP, MVT::v4f32, MVT::v8i16, 15 }, { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, 8*10 }, { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 5 }, - { ISD::SINT_TO_FP, MVT::v2f64, MVT::v4i32, 4*10 }, + { ISD::SINT_TO_FP, MVT::v2f64, MVT::v4i32, 2*10 }, + { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2*10 }, { ISD::SINT_TO_FP, MVT::v4f32, MVT::v2i64, 15 }, { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 2*10 }, @@ -1536,6 +1579,8 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 3 }, { ISD::UINT_TO_FP, MVT::f64, MVT::i64, 6 }, + { ISD::FP_TO_UINT, MVT::i64, MVT::f32, 4 }, + { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 4 }, { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i8, 1 }, { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i8, 6 }, @@ -1562,15 +1607,21 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 3 }, { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 5 }, + { ISD::TRUNCATE, MVT::v2i8, MVT::v2i16, 2 }, // PAND+PACKUSWB { ISD::TRUNCATE, MVT::v4i8, MVT::v4i16, 4 }, { ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 2 }, { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 3 }, + { ISD::TRUNCATE, MVT::v2i8, MVT::v2i32, 3 }, // PAND+3*PACKUSWB + { ISD::TRUNCATE, MVT::v2i16, MVT::v2i32, 1 }, { ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 3 }, { ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 3 }, { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 4 }, { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 7 }, { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 5 }, { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 10 }, + { ISD::TRUNCATE, MVT::v2i8, MVT::v2i64, 4 }, // PAND+3*PACKUSWB + { ISD::TRUNCATE, MVT::v2i16, MVT::v2i64, 2 }, // PSHUFD+PSHUFLW + { ISD::TRUNCATE, MVT::v2i32, MVT::v2i64, 1 }, // PSHUFD }; std::pair LTSrc = TLI->getTypeLegalizationCost(DL, Src); @@ -1691,6 +1742,11 @@ int X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, } } + static const CostTblEntry SLMCostTbl[] = { + // slm pcmpeq/pcmpgt throughput is 2 + { ISD::SETCC, MVT::v2i64, 2 }, + }; + static const CostTblEntry AVX512BWCostTbl[] = { { ISD::SETCC, MVT::v32i16, 1 }, { ISD::SETCC, MVT::v64i8, 1 }, @@ -1777,6 +1833,10 @@ int X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, { ISD::SELECT, MVT::v4f32, 3 }, // andps + andnps + orps }; + if (ST->isSLM()) + if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy)) + return LT.first * (ExtraCost + Entry->Cost); + if (ST->hasBWI()) if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy)) return LT.first * (ExtraCost + Entry->Cost); @@ -2043,8 +2103,26 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy, { ISD::FSQRT, MVT::f32, 28 }, // Pentium III from http://www.agner.org/ { ISD::FSQRT, MVT::v4f32, 56 }, // Pentium III from http://www.agner.org/ }; + static const CostTblEntry LZCNT64CostTbl[] = { // 64-bit targets + { ISD::CTLZ, MVT::i64, 1 }, + }; + static const 
CostTblEntry LZCNT32CostTbl[] = { // 32 or 64-bit targets + { ISD::CTLZ, MVT::i32, 1 }, + { ISD::CTLZ, MVT::i16, 1 }, + { ISD::CTLZ, MVT::i8, 1 }, + }; + static const CostTblEntry POPCNT64CostTbl[] = { // 64-bit targets + { ISD::CTPOP, MVT::i64, 1 }, + }; + static const CostTblEntry POPCNT32CostTbl[] = { // 32 or 64-bit targets + { ISD::CTPOP, MVT::i32, 1 }, + { ISD::CTPOP, MVT::i16, 1 }, + { ISD::CTPOP, MVT::i8, 1 }, + }; static const CostTblEntry X64CostTbl[] = { // 64-bit targets { ISD::BITREVERSE, MVT::i64, 14 }, + { ISD::CTLZ, MVT::i64, 4 }, // BSR+XOR or BSR+XOR+CMOV + { ISD::CTPOP, MVT::i64, 10 }, { ISD::SADDO, MVT::i64, 1 }, { ISD::UADDO, MVT::i64, 1 }, }; @@ -2052,6 +2130,12 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy, { ISD::BITREVERSE, MVT::i32, 14 }, { ISD::BITREVERSE, MVT::i16, 14 }, { ISD::BITREVERSE, MVT::i8, 11 }, + { ISD::CTLZ, MVT::i32, 4 }, // BSR+XOR or BSR+XOR+CMOV + { ISD::CTLZ, MVT::i16, 4 }, // BSR+XOR or BSR+XOR+CMOV + { ISD::CTLZ, MVT::i8, 4 }, // BSR+XOR or BSR+XOR+CMOV + { ISD::CTPOP, MVT::i32, 8 }, + { ISD::CTPOP, MVT::i16, 9 }, + { ISD::CTPOP, MVT::i8, 7 }, { ISD::SADDO, MVT::i32, 1 }, { ISD::SADDO, MVT::i16, 1 }, { ISD::SADDO, MVT::i8, 1 }, @@ -2163,6 +2247,26 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy, if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy)) return LT.first * Entry->Cost; + if (ST->hasLZCNT()) { + if (ST->is64Bit()) + if (const auto *Entry = CostTableLookup(LZCNT64CostTbl, ISD, MTy)) + return LT.first * Entry->Cost; + + if (const auto *Entry = CostTableLookup(LZCNT32CostTbl, ISD, MTy)) + return LT.first * Entry->Cost; + } + + if (ST->hasPOPCNT()) { + if (ST->is64Bit()) + if (const auto *Entry = CostTableLookup(POPCNT64CostTbl, ISD, MTy)) + return LT.first * Entry->Cost; + + if (const auto *Entry = CostTableLookup(POPCNT32CostTbl, ISD, MTy)) + return LT.first * Entry->Cost; + } + + // TODO - add BMI (TZCNT) scalar handling + if (ST->is64Bit()) if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, MTy)) return LT.first * Entry->Cost; @@ -2357,8 +2461,9 @@ int X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy, unsigned NumElem = SrcVTy->getVectorNumElements(); VectorType *MaskTy = VectorType::get(Type::getInt8Ty(SrcVTy->getContext()), NumElem); - if ((IsLoad && !isLegalMaskedLoad(SrcVTy)) || - (IsStore && !isLegalMaskedStore(SrcVTy)) || !isPowerOf2_32(NumElem)) { + if ((IsLoad && !isLegalMaskedLoad(SrcVTy, MaybeAlign(Alignment))) || + (IsStore && !isLegalMaskedStore(SrcVTy, MaybeAlign(Alignment))) || + !isPowerOf2_32(NumElem)) { // Scalarization int MaskSplitCost = getScalarizationOverhead(MaskTy, false, true); int ScalarCompareCost = getCmpSelInstrCost( @@ -2425,70 +2530,107 @@ int X86TTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE, int X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, Type *ValTy, bool IsPairwise) { - - std::pair LT = TLI->getTypeLegalizationCost(DL, ValTy); - - MVT MTy = LT.second; - - int ISD = TLI->InstructionOpcodeToISD(Opcode); - assert(ISD && "Invalid opcode"); - // We use the Intel Architecture Code Analyzer(IACA) to measure the throughput // and make it as the cost. - static const CostTblEntry SSE42CostTblPairWise[] = { + static const CostTblEntry SSE2CostTblPairWise[] = { { ISD::FADD, MVT::v2f64, 2 }, { ISD::FADD, MVT::v4f32, 4 }, { ISD::ADD, MVT::v2i64, 2 }, // The data reported by the IACA tool is "1.6". + { ISD::ADD, MVT::v2i32, 2 }, // FIXME: chosen to be less than v4i32. 
{ ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "3.5". + { ISD::ADD, MVT::v2i16, 3 }, // FIXME: chosen to be less than v4i16 + { ISD::ADD, MVT::v4i16, 4 }, // FIXME: chosen to be less than v8i16 { ISD::ADD, MVT::v8i16, 5 }, + { ISD::ADD, MVT::v2i8, 2 }, + { ISD::ADD, MVT::v4i8, 2 }, + { ISD::ADD, MVT::v8i8, 2 }, + { ISD::ADD, MVT::v16i8, 3 }, }; static const CostTblEntry AVX1CostTblPairWise[] = { - { ISD::FADD, MVT::v4f32, 4 }, { ISD::FADD, MVT::v4f64, 5 }, { ISD::FADD, MVT::v8f32, 7 }, { ISD::ADD, MVT::v2i64, 1 }, // The data reported by the IACA tool is "1.5". - { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "3.5". { ISD::ADD, MVT::v4i64, 5 }, // The data reported by the IACA tool is "4.8". - { ISD::ADD, MVT::v8i16, 5 }, { ISD::ADD, MVT::v8i32, 5 }, + { ISD::ADD, MVT::v16i16, 6 }, + { ISD::ADD, MVT::v32i8, 4 }, }; - static const CostTblEntry SSE42CostTblNoPairWise[] = { + static const CostTblEntry SSE2CostTblNoPairWise[] = { { ISD::FADD, MVT::v2f64, 2 }, { ISD::FADD, MVT::v4f32, 4 }, { ISD::ADD, MVT::v2i64, 2 }, // The data reported by the IACA tool is "1.6". + { ISD::ADD, MVT::v2i32, 2 }, // FIXME: chosen to be less than v4i32 { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "3.3". + { ISD::ADD, MVT::v2i16, 2 }, // The data reported by the IACA tool is "4.3". + { ISD::ADD, MVT::v4i16, 3 }, // The data reported by the IACA tool is "4.3". { ISD::ADD, MVT::v8i16, 4 }, // The data reported by the IACA tool is "4.3". + { ISD::ADD, MVT::v2i8, 2 }, + { ISD::ADD, MVT::v4i8, 2 }, + { ISD::ADD, MVT::v8i8, 2 }, + { ISD::ADD, MVT::v16i8, 3 }, }; static const CostTblEntry AVX1CostTblNoPairWise[] = { - { ISD::FADD, MVT::v4f32, 3 }, { ISD::FADD, MVT::v4f64, 3 }, + { ISD::FADD, MVT::v4f32, 3 }, { ISD::FADD, MVT::v8f32, 4 }, { ISD::ADD, MVT::v2i64, 1 }, // The data reported by the IACA tool is "1.5". - { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "2.8". { ISD::ADD, MVT::v4i64, 3 }, - { ISD::ADD, MVT::v8i16, 4 }, { ISD::ADD, MVT::v8i32, 5 }, + { ISD::ADD, MVT::v16i16, 5 }, + { ISD::ADD, MVT::v32i8, 4 }, }; + int ISD = TLI->InstructionOpcodeToISD(Opcode); + assert(ISD && "Invalid opcode"); + + // Before legalizing the type, give a chance to look up illegal narrow types + // in the table. + // FIXME: Is there a better way to do this? 
+ EVT VT = TLI->getValueType(DL, ValTy); + if (VT.isSimple()) { + MVT MTy = VT.getSimpleVT(); + if (IsPairwise) { + if (ST->hasAVX()) + if (const auto *Entry = CostTableLookup(AVX1CostTblPairWise, ISD, MTy)) + return Entry->Cost; + + if (ST->hasSSE2()) + if (const auto *Entry = CostTableLookup(SSE2CostTblPairWise, ISD, MTy)) + return Entry->Cost; + } else { + if (ST->hasAVX()) + if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy)) + return Entry->Cost; + + if (ST->hasSSE2()) + if (const auto *Entry = CostTableLookup(SSE2CostTblNoPairWise, ISD, MTy)) + return Entry->Cost; + } + } + + std::pair LT = TLI->getTypeLegalizationCost(DL, ValTy); + + MVT MTy = LT.second; + if (IsPairwise) { if (ST->hasAVX()) if (const auto *Entry = CostTableLookup(AVX1CostTblPairWise, ISD, MTy)) return LT.first * Entry->Cost; - if (ST->hasSSE42()) - if (const auto *Entry = CostTableLookup(SSE42CostTblPairWise, ISD, MTy)) + if (ST->hasSSE2()) + if (const auto *Entry = CostTableLookup(SSE2CostTblPairWise, ISD, MTy)) return LT.first * Entry->Cost; } else { if (ST->hasAVX()) if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy)) return LT.first * Entry->Cost; - if (ST->hasSSE42()) - if (const auto *Entry = CostTableLookup(SSE42CostTblNoPairWise, ISD, MTy)) + if (ST->hasSSE2()) + if (const auto *Entry = CostTableLookup(SSE2CostTblNoPairWise, ISD, MTy)) return LT.first * Entry->Cost; } @@ -3116,7 +3258,7 @@ bool X86TTIImpl::canMacroFuseCmp() { return ST->hasMacroFusion() || ST->hasBranchFusion(); } -bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy) { +bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy, MaybeAlign Alignment) { if (!ST->hasAVX()) return false; @@ -3139,11 +3281,11 @@ bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy) { ((IntWidth == 8 || IntWidth == 16) && ST->hasBWI()); } -bool X86TTIImpl::isLegalMaskedStore(Type *DataType) { - return isLegalMaskedLoad(DataType); +bool X86TTIImpl::isLegalMaskedStore(Type *DataType, MaybeAlign Alignment) { + return isLegalMaskedLoad(DataType, Alignment); } -bool X86TTIImpl::isLegalNTLoad(Type *DataType, unsigned Alignment) { +bool X86TTIImpl::isLegalNTLoad(Type *DataType, Align Alignment) { unsigned DataSize = DL.getTypeStoreSize(DataType); // The only supported nontemporal loads are for aligned vectors of 16 or 32 // bytes. Note that 32-byte nontemporal vector loads are supported by AVX2 @@ -3154,7 +3296,7 @@ bool X86TTIImpl::isLegalNTLoad(Type *DataType, unsigned Alignment) { return false; } -bool X86TTIImpl::isLegalNTStore(Type *DataType, unsigned Alignment) { +bool X86TTIImpl::isLegalNTStore(Type *DataType, Align Alignment) { unsigned DataSize = DL.getTypeStoreSize(DataType); // SSE4A supports nontemporal stores of float and double at arbitrary @@ -3299,9 +3441,8 @@ X86TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const { if (IsZeroCmp) { // Only enable vector loads for equality comparison. Right now the vector // version is not as fast for three way compare (see #33329). - // TODO: enable AVX512 when the DAG is ready. - // if (ST->hasAVX512()) Options.LoadSizes.push_back(64); const unsigned PreferredWidth = ST->getPreferVectorWidth(); + if (PreferredWidth >= 512 && ST->hasAVX512()) Options.LoadSizes.push_back(64); if (PreferredWidth >= 256 && ST->hasAVX2()) Options.LoadSizes.push_back(32); if (PreferredWidth >= 128 && ST->hasSSE2()) Options.LoadSizes.push_back(16); // All GPR and vector loads can be unaligned. 
SIMD compare requires integer diff --git a/lib/Target/X86/X86TargetTransformInfo.h b/lib/Target/X86/X86TargetTransformInfo.h index 25d9c33eb16d..7581257f41f8 100644 --- a/lib/Target/X86/X86TargetTransformInfo.h +++ b/lib/Target/X86/X86TargetTransformInfo.h @@ -83,6 +83,7 @@ class X86TTIImpl : public BasicTTIImplBase { X86::FeatureSlowUAMem32, // Based on whether user set the -mprefer-vector-width command line. + X86::FeaturePrefer128Bit, X86::FeaturePrefer256Bit, // CPU name enums. These just follow CPU string. @@ -115,7 +116,7 @@ public: /// \name Vector TTI Implementations /// @{ - unsigned getNumberOfRegisters(bool Vector); + unsigned getNumberOfRegisters(unsigned ClassID) const; unsigned getRegisterBitWidth(bool Vector) const; unsigned getLoadStoreVecRegBitWidth(unsigned AS) const; unsigned getMaxInterleaveFactor(unsigned VF); @@ -184,10 +185,10 @@ public: bool isLSRCostLess(TargetTransformInfo::LSRCost &C1, TargetTransformInfo::LSRCost &C2); bool canMacroFuseCmp(); - bool isLegalMaskedLoad(Type *DataType); - bool isLegalMaskedStore(Type *DataType); - bool isLegalNTLoad(Type *DataType, unsigned Alignment); - bool isLegalNTStore(Type *DataType, unsigned Alignment); + bool isLegalMaskedLoad(Type *DataType, MaybeAlign Alignment); + bool isLegalMaskedStore(Type *DataType, MaybeAlign Alignment); + bool isLegalNTLoad(Type *DataType, Align Alignment); + bool isLegalNTStore(Type *DataType, Align Alignment); bool isLegalMaskedGather(Type *DataType); bool isLegalMaskedScatter(Type *DataType); bool isLegalMaskedExpandLoad(Type *DataType); diff --git a/lib/Target/X86/X86VZeroUpper.cpp b/lib/Target/X86/X86VZeroUpper.cpp index a07d2f20acab..9280d030b5d5 100644 --- a/lib/Target/X86/X86VZeroUpper.cpp +++ b/lib/Target/X86/X86VZeroUpper.cpp @@ -292,8 +292,7 @@ bool VZeroUpperInserter::runOnMachineFunction(MachineFunction &MF) { // need to insert any VZEROUPPER instructions. This is constant-time, so it // is cheap in the common case of no ymm/zmm use. bool YmmOrZmmUsed = FnHasLiveInYmmOrZmm; - const TargetRegisterClass *RCs[2] = {&X86::VR256RegClass, &X86::VR512RegClass}; - for (auto *RC : RCs) { + for (auto *RC : {&X86::VR256RegClass, &X86::VR512_0_15RegClass}) { if (!YmmOrZmmUsed) { for (TargetRegisterClass::iterator i = RC->begin(), e = RC->end(); i != e; i++) { @@ -304,9 +303,8 @@ bool VZeroUpperInserter::runOnMachineFunction(MachineFunction &MF) { } } } - if (!YmmOrZmmUsed) { + if (!YmmOrZmmUsed) return false; - } assert(BlockStates.empty() && DirtySuccessors.empty() && "X86VZeroUpper state should be clear"); diff --git a/lib/Target/X86/X86WinAllocaExpander.cpp b/lib/Target/X86/X86WinAllocaExpander.cpp index 9e499db1d7ee..ae72c6427588 100644 --- a/lib/Target/X86/X86WinAllocaExpander.cpp +++ b/lib/Target/X86/X86WinAllocaExpander.cpp @@ -81,7 +81,7 @@ static int64_t getWinAllocaAmount(MachineInstr *MI, MachineRegisterInfo *MRI) { MI->getOpcode() == X86::WIN_ALLOCA_64); assert(MI->getOperand(0).isReg()); - unsigned AmountReg = MI->getOperand(0).getReg(); + Register AmountReg = MI->getOperand(0).getReg(); MachineInstr *Def = MRI->getUniqueVRegDef(AmountReg); if (!Def || @@ -261,7 +261,7 @@ void X86WinAllocaExpander::lower(MachineInstr* MI, Lowering L) { break; } - unsigned AmountReg = MI->getOperand(0).getReg(); + Register AmountReg = MI->getOperand(0).getReg(); MI->eraseFromParent(); // Delete the definition of AmountReg. 
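// --- Editorial sketch, not part of the imported patch -----------------------
// The X86TargetTransformInfo.h hunk above switches several TTI hooks to
// explicit alignment types and a numeric register-class id.  Assuming the
// generic TargetTransformInfo interface mirrors these signatures at this
// revision, a caller would now be written roughly as follows (headers omitted;
// TTI, VTy and AlignInBytes are illustrative parameters, not names from the
// patch):
static void ttiHookSketch(const llvm::TargetTransformInfo &TTI,
                          llvm::Type *VTy, unsigned AlignInBytes) {
  // ClassID 1 is treated as "vector" by the X86 implementation in this patch.
  unsigned ScalarRegs = TTI.getNumberOfRegisters(/*ClassID=*/0);
  unsigned VectorRegs = TTI.getNumberOfRegisters(/*ClassID=*/1);
  bool MaskedOK = TTI.isLegalMaskedLoad(VTy, llvm::MaybeAlign(AlignInBytes)) &&
                  TTI.isLegalMaskedStore(VTy, llvm::MaybeAlign(AlignInBytes));
  bool NTOK = TTI.isLegalNTStore(VTy, llvm::Align(16));
  (void)ScalarRegs; (void)VectorRegs; (void)MaskedOK; (void)NTOK;
}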
diff --git a/lib/Target/X86/X86WinEHState.cpp b/lib/Target/X86/X86WinEHState.cpp index f68d17d7256d..d65e1f3ab414 100644 --- a/lib/Target/X86/X86WinEHState.cpp +++ b/lib/Target/X86/X86WinEHState.cpp @@ -339,7 +339,10 @@ void WinEHStatePass::emitExceptionRegistrationRecord(Function *F) { if (UseStackGuard) { Value *Val = Builder.CreateLoad(Int32Ty, Cookie); Value *FrameAddr = Builder.CreateCall( - Intrinsic::getDeclaration(TheModule, Intrinsic::frameaddress), + Intrinsic::getDeclaration( + TheModule, Intrinsic::frameaddress, + Builder.getInt8PtrTy( + TheModule->getDataLayout().getAllocaAddrSpace())), Builder.getInt32(0), "frameaddr"); Value *FrameAddrI32 = Builder.CreatePtrToInt(FrameAddr, Int32Ty); FrameAddrI32 = Builder.CreateXor(FrameAddrI32, Val); diff --git a/lib/Target/XCore/XCoreAsmPrinter.cpp b/lib/Target/XCore/XCoreAsmPrinter.cpp index 9f615b9e7741..6b3dc27cb886 100644 --- a/lib/Target/XCore/XCoreAsmPrinter.cpp +++ b/lib/Target/XCore/XCoreAsmPrinter.cpp @@ -115,7 +115,7 @@ void XCoreAsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) { MCSymbol *GVSym = getSymbol(GV); const Constant *C = GV->getInitializer(); - unsigned Align = (unsigned)DL.getPreferredTypeAlignmentShift(C->getType()); + const Align Alignment(DL.getPrefTypeAlignment(C->getType())); // Mark the start of the global getTargetStreamer().emitCCTopData(GVSym->getName()); @@ -143,7 +143,7 @@ void XCoreAsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) { llvm_unreachable("Unknown linkage type!"); } - EmitAlignment(Align > 2 ? Align : 2, GV); + EmitAlignment(std::max(Alignment, Align(4)), GV); if (GV->isThreadLocal()) { report_fatal_error("TLS is not supported by this target!"); diff --git a/lib/Target/XCore/XCoreFrameLowering.cpp b/lib/Target/XCore/XCoreFrameLowering.cpp index 5066407c74aa..fd8b37e26e47 100644 --- a/lib/Target/XCore/XCoreFrameLowering.cpp +++ b/lib/Target/XCore/XCoreFrameLowering.cpp @@ -211,7 +211,7 @@ static void RestoreSpillList(MachineBasicBlock &MBB, //===----------------------------------------------------------------------===// XCoreFrameLowering::XCoreFrameLowering(const XCoreSubtarget &sti) - : TargetFrameLowering(TargetFrameLowering::StackGrowsDown, 4, 0) { + : TargetFrameLowering(TargetFrameLowering::StackGrowsDown, Align(4), 0) { // Do nothing } @@ -367,8 +367,8 @@ void XCoreFrameLowering::emitEpilogue(MachineFunction &MF, RestoreSpillList(MBB, MBBI, dl, TII, RemainingAdj, SpillList); // Return to the landing pad. - unsigned EhStackReg = MBBI->getOperand(0).getReg(); - unsigned EhHandlerReg = MBBI->getOperand(1).getReg(); + Register EhStackReg = MBBI->getOperand(0).getReg(); + Register EhHandlerReg = MBBI->getOperand(1).getReg(); BuildMI(MBB, MBBI, dl, TII.get(XCore::SETSP_1r)).addReg(EhStackReg); BuildMI(MBB, MBBI, dl, TII.get(XCore::BAU_1r)).addReg(EhHandlerReg); MBB.erase(MBBI); // Erase the previous return instruction. 
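// --- Editorial sketch, not part of the imported patch -----------------------
// The WinEHStatePass hunk above now spells out the result pointer type when
// declaring llvm.frameaddress, since the intrinsic is overloaded on that type.
// A minimal reconstruction of the same call pattern, assuming M is a Module*
// and Builder an IRBuilder<> positioned at the insertion point (headers
// omitted):
static llvm::Value *frameAddressSketch(llvm::Module *M,
                                       llvm::IRBuilder<> &Builder) {
  llvm::Function *Decl = llvm::Intrinsic::getDeclaration(
      M, llvm::Intrinsic::frameaddress,
      Builder.getInt8PtrTy(M->getDataLayout().getAllocaAddrSpace()));
  // llvm.frameaddress(0) returns the address of the current frame.
  return Builder.CreateCall(Decl, Builder.getInt32(0), "frameaddr");
}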
diff --git a/lib/Target/XCore/XCoreFrameToArgsOffsetElim.cpp b/lib/Target/XCore/XCoreFrameToArgsOffsetElim.cpp index e433d21c59b7..b5dbdea98eea 100644 --- a/lib/Target/XCore/XCoreFrameToArgsOffsetElim.cpp +++ b/lib/Target/XCore/XCoreFrameToArgsOffsetElim.cpp @@ -55,7 +55,7 @@ bool XCoreFTAOElim::runOnMachineFunction(MachineFunction &MF) { MBBI != EE; ++MBBI) { if (MBBI->getOpcode() == XCore::FRAME_TO_ARGS_OFFSET) { MachineInstr &OldInst = *MBBI; - unsigned Reg = OldInst.getOperand(0).getReg(); + Register Reg = OldInst.getOperand(0).getReg(); MBBI = TII.loadImmediate(MBB, MBBI, Reg, StackSize); OldInst.eraseFromParent(); } diff --git a/lib/Target/XCore/XCoreISelLowering.cpp b/lib/Target/XCore/XCoreISelLowering.cpp index 072278d9fc46..bf006fd673f1 100644 --- a/lib/Target/XCore/XCoreISelLowering.cpp +++ b/lib/Target/XCore/XCoreISelLowering.cpp @@ -171,8 +171,8 @@ XCoreTargetLowering::XCoreTargetLowering(const TargetMachine &TM, setTargetDAGCombine(ISD::INTRINSIC_VOID); setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN); - setMinFunctionAlignment(1); - setPrefFunctionAlignment(2); + setMinFunctionAlignment(Align(2)); + setPrefFunctionAlignment(Align(4)); } bool XCoreTargetLowering::isZExtFree(SDValue Val, EVT VT2) const { @@ -414,8 +414,8 @@ SDValue XCoreTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { "Unexpected extension type"); assert(LD->getMemoryVT() == MVT::i32 && "Unexpected load EVT"); - if (allowsMemoryAccess(Context, DAG.getDataLayout(), LD->getMemoryVT(), - *LD->getMemOperand())) + if (allowsMemoryAccessForAlignment(Context, DAG.getDataLayout(), + LD->getMemoryVT(), *LD->getMemOperand())) return SDValue(); SDValue Chain = LD->getChain(); @@ -488,8 +488,8 @@ SDValue XCoreTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { assert(!ST->isTruncatingStore() && "Unexpected store type"); assert(ST->getMemoryVT() == MVT::i32 && "Unexpected store EVT"); - if (allowsMemoryAccess(Context, DAG.getDataLayout(), ST->getMemoryVT(), - *ST->getMemOperand())) + if (allowsMemoryAccessForAlignment(Context, DAG.getDataLayout(), + ST->getMemoryVT(), *ST->getMemOperand())) return SDValue(); SDValue Chain = ST->getChain(); @@ -1309,7 +1309,7 @@ SDValue XCoreTargetLowering::LowerCCCArguments( llvm_unreachable(nullptr); } case MVT::i32: - unsigned VReg = RegInfo.createVirtualRegister(&XCore::GRRegsRegClass); + Register VReg = RegInfo.createVirtualRegister(&XCore::GRRegsRegClass); RegInfo.addLiveIn(VA.getLocReg(), VReg); ArgIn = DAG.getCopyFromReg(Chain, dl, VReg, RegVT); CFRegNode.push_back(ArgIn.getValue(ArgIn->getNumValues() - 1)); @@ -1360,7 +1360,7 @@ SDValue XCoreTargetLowering::LowerCCCArguments( offset -= StackSlotSize; SDValue FIN = DAG.getFrameIndex(FI, MVT::i32); // Move argument from phys reg -> virt reg - unsigned VReg = RegInfo.createVirtualRegister(&XCore::GRRegsRegClass); + Register VReg = RegInfo.createVirtualRegister(&XCore::GRRegsRegClass); RegInfo.addLiveIn(ArgRegs[i], VReg); SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32); CFRegNode.push_back(Val.getValue(Val->getNumValues() - 1)); @@ -1780,8 +1780,9 @@ SDValue XCoreTargetLowering::PerformDAGCombine(SDNode *N, // Replace unaligned store of unaligned load with memmove. 
StoreSDNode *ST = cast(N); if (!DCI.isBeforeLegalize() || - allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), - ST->getMemoryVT(), *ST->getMemOperand()) || + allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(), + ST->getMemoryVT(), + *ST->getMemOperand()) || ST->isVolatile() || ST->isIndexed()) { break; } diff --git a/lib/Target/XCore/XCoreRegisterInfo.cpp b/lib/Target/XCore/XCoreRegisterInfo.cpp index 3752274e2cdf..86ec7f82d4d1 100644 --- a/lib/Target/XCore/XCoreRegisterInfo.cpp +++ b/lib/Target/XCore/XCoreRegisterInfo.cpp @@ -301,7 +301,7 @@ XCoreRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, << "<--------->\n"); Offset/=4; - unsigned Reg = MI.getOperand(0).getReg(); + Register Reg = MI.getOperand(0).getReg(); assert(XCore::GRRegsRegClass.contains(Reg) && "Unexpected register operand"); if (TFI->hasFP(MF)) { diff --git a/lib/Target/XCore/XCoreTargetMachine.cpp b/lib/Target/XCore/XCoreTargetMachine.cpp index 2a8cd6b657b7..b5b7445265b7 100644 --- a/lib/Target/XCore/XCoreTargetMachine.cpp +++ b/lib/Target/XCore/XCoreTargetMachine.cpp @@ -53,7 +53,7 @@ XCoreTargetMachine::XCoreTargetMachine(const Target &T, const Triple &TT, T, "e-m:e-p:32:32-i1:8:32-i8:8:32-i16:16:32-i64:32-f64:32-a:0:32-n32", TT, CPU, FS, Options, getEffectiveRelocModel(RM), getEffectiveXCoreCodeModel(CM), OL), - TLOF(llvm::make_unique()), + TLOF(std::make_unique()), Subtarget(TT, CPU, FS, *this) { initAsmInfo(); } diff --git a/lib/Target/XCore/XCoreTargetTransformInfo.h b/lib/Target/XCore/XCoreTargetTransformInfo.h index 3fecaaa59722..58df1f290ec9 100644 --- a/lib/Target/XCore/XCoreTargetTransformInfo.h +++ b/lib/Target/XCore/XCoreTargetTransformInfo.h @@ -40,7 +40,8 @@ public: : BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl()), TLI(ST->getTargetLowering()) {} - unsigned getNumberOfRegisters(bool Vector) { + unsigned getNumberOfRegisters(unsigned ClassID) const { + bool Vector = (ClassID == 1); if (Vector) { return 0; } diff --git a/lib/TextAPI/MachO/Architecture.cpp b/lib/TextAPI/MachO/Architecture.cpp index a66a982fa153..699fb5f4587a 100644 --- a/lib/TextAPI/MachO/Architecture.cpp +++ b/lib/TextAPI/MachO/Architecture.cpp @@ -68,6 +68,10 @@ std::pair getCPUTypeFromArchitecture(Architecture Arch) { return std::make_pair(0, 0); } +Architecture mapToArchitecture(const Triple &Target) { + return getArchitectureFromName(Target.getArchName()); +} + raw_ostream &operator<<(raw_ostream &OS, Architecture Arch) { OS << getArchitectureName(Arch); return OS; diff --git a/lib/TextAPI/MachO/InterfaceFile.cpp b/lib/TextAPI/MachO/InterfaceFile.cpp index 54ba8cc31267..c40a952a6a8b 100644 --- a/lib/TextAPI/MachO/InterfaceFile.cpp +++ b/lib/TextAPI/MachO/InterfaceFile.cpp @@ -27,36 +27,65 @@ typename C::iterator addEntry(C &Container, StringRef InstallName) { return Container.emplace(I, InstallName); } + +template +typename C::iterator addEntry(C &Container, const Target &Target_) { + auto Iter = + lower_bound(Container, Target_, [](const Target &LHS, const Target &RHS) { + return LHS < RHS; + }); + if ((Iter != std::end(Container)) && !(Target_ < *Iter)) + return Iter; + + return Container.insert(Iter, Target_); +} } // end namespace detail. 
-void InterfaceFile::addAllowableClient(StringRef Name, - ArchitectureSet Architectures) { - auto Client = detail::addEntry(AllowableClients, Name); - Client->addArchitectures(Architectures); +void InterfaceFileRef::addTarget(const Target &Target) { + detail::addEntry(Targets, Target); +} + +void InterfaceFile::addAllowableClient(StringRef InstallName, + const Target &Target) { + auto Client = detail::addEntry(AllowableClients, InstallName); + Client->addTarget(Target); } void InterfaceFile::addReexportedLibrary(StringRef InstallName, - ArchitectureSet Architectures) { + const Target &Target) { auto Lib = detail::addEntry(ReexportedLibraries, InstallName); - Lib->addArchitectures(Architectures); + Lib->addTarget(Target); } -void InterfaceFile::addUUID(Architecture Arch, StringRef UUID) { - auto I = partition_point(UUIDs, - [=](const std::pair &O) { - return O.first < Arch; - }); +void InterfaceFile::addParentUmbrella(const Target &Target_, StringRef Parent) { + auto Iter = lower_bound(ParentUmbrellas, Target_, + [](const std::pair &LHS, + Target RHS) { return LHS.first < RHS; }); - if (I != UUIDs.end() && Arch == I->first) { - I->second = UUID; + if ((Iter != ParentUmbrellas.end()) && !(Target_ < Iter->first)) { + Iter->second = Parent; return; } - UUIDs.emplace(I, Arch, UUID); + ParentUmbrellas.emplace(Iter, Target_, Parent); return; } -void InterfaceFile::addUUID(Architecture Arch, uint8_t UUID[16]) { +void InterfaceFile::addUUID(const Target &Target_, StringRef UUID) { + auto Iter = lower_bound(UUIDs, Target_, + [](const std::pair &LHS, + Target RHS) { return LHS.first < RHS; }); + + if ((Iter != UUIDs.end()) && !(Target_ < Iter->first)) { + Iter->second = UUID; + return; + } + + UUIDs.emplace(Iter, Target_, UUID); + return; +} + +void InterfaceFile::addUUID(const Target &Target, uint8_t UUID[16]) { std::stringstream Stream; for (unsigned i = 0; i < 16; ++i) { if (i == 4 || i == 6 || i == 8 || i == 10) @@ -64,17 +93,30 @@ void InterfaceFile::addUUID(Architecture Arch, uint8_t UUID[16]) { Stream << std::setfill('0') << std::setw(2) << std::uppercase << std::hex << static_cast(UUID[i]); } - addUUID(Arch, Stream.str()); + addUUID(Target, Stream.str()); +} + +void InterfaceFile::addTarget(const Target &Target) { + detail::addEntry(Targets, Target); +} + +InterfaceFile::const_filtered_target_range +InterfaceFile::targets(ArchitectureSet Archs) const { + std::function fn = [Archs](const Target &Target_) { + return Archs.has(Target_.Arch); + }; + return make_filter_range(Targets, fn); } void InterfaceFile::addSymbol(SymbolKind Kind, StringRef Name, - ArchitectureSet Archs, SymbolFlags Flags) { + const TargetList &Targets, SymbolFlags Flags) { Name = copyString(Name); auto result = Symbols.try_emplace(SymbolsMapKey{Kind, Name}, nullptr); if (result.second) - result.first->second = new (Allocator) Symbol{Kind, Name, Archs, Flags}; + result.first->second = new (Allocator) Symbol{Kind, Name, Targets, Flags}; else - result.first->second->addArchitectures(Archs); + for (const auto &Target : Targets) + result.first->second->addTarget(Target); } } // end namespace MachO. diff --git a/lib/TextAPI/MachO/Platform.cpp b/lib/TextAPI/MachO/Platform.cpp new file mode 100644 index 000000000000..588ec9a4d83b --- /dev/null +++ b/lib/TextAPI/MachO/Platform.cpp @@ -0,0 +1,91 @@ +//===- llvm/TextAPI/MachO/Platform.cpp - Platform ---------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Implementations of Platform Helper functions. +// +//===----------------------------------------------------------------------===// + +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/Triple.h" +#include "llvm/TextAPI/MachO/Platform.h" + +namespace llvm { +namespace MachO { + +PlatformKind mapToPlatformKind(PlatformKind Platform, bool WantSim) { + switch (Platform) { + default: + return Platform; + case PlatformKind::iOS: + return WantSim ? PlatformKind::iOSSimulator : PlatformKind::iOS; + case PlatformKind::tvOS: + return WantSim ? PlatformKind::tvOSSimulator : PlatformKind::tvOS; + case PlatformKind::watchOS: + return WantSim ? PlatformKind::watchOSSimulator : PlatformKind::watchOS; + } + llvm_unreachable("Unknown llvm.MachO.PlatformKind enum"); +} + +PlatformKind mapToPlatformKind(const Triple &Target) { + switch (Target.getOS()) { + default: + return PlatformKind::unknown; + case Triple::MacOSX: + return PlatformKind::macOS; + case Triple::IOS: + if (Target.isSimulatorEnvironment()) + return PlatformKind::iOSSimulator; + if (Target.getEnvironment() == Triple::MacABI) + return PlatformKind::macCatalyst; + return PlatformKind::iOS; + case Triple::TvOS: + return Target.isSimulatorEnvironment() ? PlatformKind::tvOSSimulator + : PlatformKind::tvOS; + case Triple::WatchOS: + return Target.isSimulatorEnvironment() ? PlatformKind::watchOSSimulator + : PlatformKind::watchOS; + // TODO: add bridgeOS once in llvm::Triple + } + llvm_unreachable("Unknown Target Triple"); +} + +PlatformSet mapToPlatformSet(ArrayRef Targets) { + PlatformSet Result; + for (const auto &Target : Targets) + Result.insert(mapToPlatformKind(Target)); + return Result; +} + +StringRef getPlatformName(PlatformKind Platform) { + switch (Platform) { + case PlatformKind::unknown: + return "unknown"; + case PlatformKind::macOS: + return "macOS"; + case PlatformKind::iOS: + return "iOS"; + case PlatformKind::tvOS: + return "tvOS"; + case PlatformKind::watchOS: + return "watchOS"; + case PlatformKind::bridgeOS: + return "bridgeOS"; + case PlatformKind::macCatalyst: + return "macCatalyst"; + case PlatformKind::iOSSimulator: + return "iOS Simulator"; + case PlatformKind::tvOSSimulator: + return "tvOS Simulator"; + case PlatformKind::watchOSSimulator: + return "watchOS Simulator"; + } + llvm_unreachable("Unknown llvm.MachO.PlatformKind enum"); +} + +} // end namespace MachO. +} // end namespace llvm. diff --git a/lib/TextAPI/MachO/Symbol.cpp b/lib/TextAPI/MachO/Symbol.cpp index 731b264f6082..9f2d8172beed 100644 --- a/lib/TextAPI/MachO/Symbol.cpp +++ b/lib/TextAPI/MachO/Symbol.cpp @@ -45,5 +45,14 @@ LLVM_DUMP_METHOD void Symbol::dump(raw_ostream &OS) const { } #endif +Symbol::const_filtered_target_range +Symbol::targets(ArchitectureSet Architectures) const { + std::function FN = + [Architectures](const Target &Target) { + return Architectures.has(Target.Arch); + }; + return make_filter_range(Targets, FN); +} + } // end namespace MachO. } // end namespace llvm. diff --git a/lib/TextAPI/MachO/Target.cpp b/lib/TextAPI/MachO/Target.cpp new file mode 100644 index 000000000000..aee8ef421425 --- /dev/null +++ b/lib/TextAPI/MachO/Target.cpp @@ -0,0 +1,75 @@ +//===- tapi/Core/Target.cpp - Target ----------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/ADT/SmallString.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/StringSwitch.h" +#include "llvm/Support/Format.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/TextAPI/MachO/Target.h" + +namespace llvm { +namespace MachO { + +Expected Target::create(StringRef TargetValue) { + auto Result = TargetValue.split('-'); + auto ArchitectureStr = Result.first; + auto Architecture = getArchitectureFromName(ArchitectureStr); + auto PlatformStr = Result.second; + PlatformKind Platform; + Platform = StringSwitch(PlatformStr) + .Case("macos", PlatformKind::macOS) + .Case("ios", PlatformKind::iOS) + .Case("tvos", PlatformKind::tvOS) + .Case("watchos", PlatformKind::watchOS) + .Case("bridgeos", PlatformKind::bridgeOS) + .Case("maccatalyst", PlatformKind::macCatalyst) + .Case("ios-simulator", PlatformKind::iOSSimulator) + .Case("tvos-simulator", PlatformKind::tvOSSimulator) + .Case("watchos-simulator", PlatformKind::watchOSSimulator) + .Default(PlatformKind::unknown); + + if (Platform == PlatformKind::unknown) { + if (PlatformStr.startswith("<") && PlatformStr.endswith(">")) { + PlatformStr = PlatformStr.drop_front().drop_back(); + unsigned long long RawValue; + if (!PlatformStr.getAsInteger(10, RawValue)) + Platform = (PlatformKind)RawValue; + } + } + + return Target{Architecture, Platform}; +} + +Target::operator std::string() const { + return (getArchitectureName(Arch) + " (" + getPlatformName(Platform) + ")") + .str(); +} + +raw_ostream &operator<<(raw_ostream &OS, const Target &Target) { + OS << std::string(Target); + return OS; +} + +PlatformSet mapToPlatformSet(ArrayRef Targets) { + PlatformSet Result; + for (const auto &Target : Targets) + Result.insert(Target.Platform); + return Result; +} + +ArchitectureSet mapToArchitectureSet(ArrayRef Targets) { + ArchitectureSet Result; + for (const auto &Target : Targets) + Result.set(Target.Arch); + return Result; +} + +} // end namespace MachO. +} // end namespace llvm. diff --git a/lib/TextAPI/MachO/TextStub.cpp b/lib/TextAPI/MachO/TextStub.cpp index 799ebdc883ab..0584e43d5893 100644 --- a/lib/TextAPI/MachO/TextStub.cpp +++ b/lib/TextAPI/MachO/TextStub.cpp @@ -147,6 +147,58 @@ Each undefineds section is defined as following: objc-ivars: [] # Optional: List of Objective C Instance Variables weak-ref-symbols: [] # Optional: List of weak defined symbols */ + +/* + + YAML Format specification. + +--- !tapi-tbd +tbd-version: 4 # The tbd version for format +targets: [ armv7-ios, x86_64-maccatalyst ] # The list of applicable tapi supported target triples +uuids: # Optional: List of target and UUID pairs. + - target: armv7-ios + value: ... + - target: x86_64-maccatalyst + value: ... +flags: [] # Optional: +install-name: /u/l/libfoo.dylib # +current-version: 1.2.3 # Optional: defaults to 1.0 +compatibility-version: 1.0 # Optional: defaults to 1.0 +swift-abi-version: 0 # Optional: defaults to 0 +parent-umbrella: # Optional: +allowable-clients: + - targets: [ armv7-ios ] # Optional: + clients: [ clientA ] +exports: # List of export sections +... +re-exports: # List of reexport sections +... +undefineds: # List of undefineds sections +... 
+ +Each export and reexport section is defined as following: + +- targets: [ arm64-macos ] # The list of target triples associated with symbols + symbols: [ _symA ] # Optional: List of symbols + objc-classes: [] # Optional: List of Objective-C classes + objc-eh-types: [] # Optional: List of Objective-C classes + # with EH + objc-ivars: [] # Optional: List of Objective C Instance + # Variables + weak-symbols: [] # Optional: List of weak defined symbols + thread-local-symbols: [] # Optional: List of thread local symbols +- targets: [ arm64-macos, x86_64-maccatalyst ] # Optional: Targets for applicable additional symbols + symbols: [ _symB ] # Optional: List of symbols + +Each undefineds section is defined as following: +- targets: [ arm64-macos ] # The list of target triples associated with symbols + symbols: [ _symC ] # Optional: List of symbols + objc-classes: [] # Optional: List of Objective-C classes + objc-eh-types: [] # Optional: List of Objective-C classes + # with EH + objc-ivars: [] # Optional: List of Objective C Instance Variables + weak-symbols: [] # Optional: List of weak defined symbols +*/ // clang-format on using namespace llvm; @@ -175,6 +227,38 @@ struct UndefinedSection { std::vector WeakRefSymbols; }; +// Sections for direct target mapping in TBDv4 +struct SymbolSection { + TargetList Targets; + std::vector Symbols; + std::vector Classes; + std::vector ClassEHs; + std::vector Ivars; + std::vector WeakSymbols; + std::vector TlvSymbols; +}; + +struct MetadataSection { + enum Option { Clients, Libraries }; + std::vector Targets; + std::vector Values; +}; + +struct UmbrellaSection { + std::vector Targets; + std::string Umbrella; +}; + +// UUID's for TBDv4 are mapped to target not arch +struct UUIDv4 { + Target TargetID; + std::string Value; + + UUIDv4() = default; + UUIDv4(const Target &TargetID, const std::string &Value) + : TargetID(TargetID), Value(Value) {} +}; + // clang-format off enum TBDFlags : unsigned { None = 0U, @@ -189,6 +273,12 @@ enum TBDFlags : unsigned { LLVM_YAML_IS_FLOW_SEQUENCE_VECTOR(Architecture) LLVM_YAML_IS_SEQUENCE_VECTOR(ExportSection) LLVM_YAML_IS_SEQUENCE_VECTOR(UndefinedSection) +// Specific to TBDv4 +LLVM_YAML_IS_SEQUENCE_VECTOR(SymbolSection) +LLVM_YAML_IS_SEQUENCE_VECTOR(MetadataSection) +LLVM_YAML_IS_SEQUENCE_VECTOR(UmbrellaSection) +LLVM_YAML_IS_FLOW_SEQUENCE_VECTOR(Target) +LLVM_YAML_IS_SEQUENCE_VECTOR(UUIDv4) namespace llvm { namespace yaml { @@ -231,6 +321,49 @@ template <> struct MappingTraits { } }; +template <> struct MappingTraits { + static void mapping(IO &IO, SymbolSection &Section) { + IO.mapRequired("targets", Section.Targets); + IO.mapOptional("symbols", Section.Symbols); + IO.mapOptional("objc-classes", Section.Classes); + IO.mapOptional("objc-eh-types", Section.ClassEHs); + IO.mapOptional("objc-ivars", Section.Ivars); + IO.mapOptional("weak-symbols", Section.WeakSymbols); + IO.mapOptional("thread-local-symbols", Section.TlvSymbols); + } +}; + +template <> struct MappingTraits { + static void mapping(IO &IO, UmbrellaSection &Section) { + IO.mapRequired("targets", Section.Targets); + IO.mapRequired("umbrella", Section.Umbrella); + } +}; + +template <> struct MappingTraits { + static void mapping(IO &IO, UUIDv4 &UUID) { + IO.mapRequired("target", UUID.TargetID); + IO.mapRequired("value", UUID.Value); + } +}; + +template <> +struct MappingContextTraits { + static void mapping(IO &IO, MetadataSection &Section, + MetadataSection::Option &OptionKind) { + IO.mapRequired("targets", Section.Targets); + switch (OptionKind) { + case 
MetadataSection::Option::Clients: + IO.mapRequired("clients", Section.Values); + return; + case MetadataSection::Option::Libraries: + IO.mapRequired("libraries", Section.Values); + return; + } + llvm_unreachable("unexpected option for metadata"); + } +}; + template <> struct ScalarBitSetTraits { static void bitset(IO &IO, TBDFlags &Flags) { IO.bitSetCase(Flags, "flat_namespace", TBDFlags::FlatNamespace); @@ -240,13 +373,67 @@ template <> struct ScalarBitSetTraits { } }; +template <> struct ScalarTraits { + static void output(const Target &Value, void *, raw_ostream &OS) { + OS << Value.Arch << "-"; + switch (Value.Platform) { + default: + OS << "unknown"; + break; + case PlatformKind::macOS: + OS << "macos"; + break; + case PlatformKind::iOS: + OS << "ios"; + break; + case PlatformKind::tvOS: + OS << "tvos"; + break; + case PlatformKind::watchOS: + OS << "watchos"; + break; + case PlatformKind::bridgeOS: + OS << "bridgeos"; + break; + case PlatformKind::macCatalyst: + OS << "maccatalyst"; + break; + case PlatformKind::iOSSimulator: + OS << "ios-simulator"; + break; + case PlatformKind::tvOSSimulator: + OS << "tvos-simulator"; + break; + case PlatformKind::watchOSSimulator: + OS << "watchos-simulator"; + break; + } + } + + static StringRef input(StringRef Scalar, void *, Target &Value) { + auto Result = Target::create(Scalar); + if (!Result) + return toString(Result.takeError()); + + Value = *Result; + if (Value.Arch == AK_unknown) + return "unknown architecture"; + if (Value.Platform == PlatformKind::unknown) + return "unknown platform"; + + return {}; + } + + static QuotingType mustQuote(StringRef) { return QuotingType::None; } +}; + template <> struct MappingTraits { struct NormalizedTBD { explicit NormalizedTBD(IO &IO) {} NormalizedTBD(IO &IO, const InterfaceFile *&File) { Architectures = File->getArchitectures(); UUIDs = File->uuids(); - Platform = File->getPlatform(); + Platforms = File->getPlatforms(); InstallName = File->getInstallName(); CurrentVersion = PackedVersion(File->getCurrentVersion()); CompatibilityVersion = PackedVersion(File->getCompatibilityVersion()); @@ -263,7 +450,10 @@ template <> struct MappingTraits { if (File->isInstallAPI()) Flags |= TBDFlags::InstallAPI; - ParentUmbrella = File->getParentUmbrella(); + for (const auto &Iter : File->umbrellas()) { + ParentUmbrella = Iter.second; + break; + } std::set ArchSet; for (const auto &Library : File->allowableClients()) @@ -396,6 +586,29 @@ template <> struct MappingTraits { } } + // TBD v1 - TBD v3 files only support one platform and several + // architectures. It is possible to have more than one platform for TBD v3 + // files, but the architectures don't apply to all + // platforms, specifically to filter out the i386 slice from + // platform macCatalyst. 
+ TargetList synthesizeTargets(ArchitectureSet Architectures, + const PlatformSet &Platforms) { + TargetList Targets; + + for (auto Platform : Platforms) { + Platform = mapToPlatformKind(Platform, Architectures.hasX86()); + + for (const auto &&Architecture : Architectures) { + if ((Architecture == AK_i386) && + (Platform == PlatformKind::macCatalyst)) + continue; + + Targets.emplace_back(Architecture, Platform); + } + } + return Targets; + } + const InterfaceFile *denormalize(IO &IO) { auto Ctx = reinterpret_cast(IO.getContext()); assert(Ctx); @@ -403,16 +616,16 @@ template <> struct MappingTraits { auto *File = new InterfaceFile; File->setPath(Ctx->Path); File->setFileType(Ctx->FileKind); + File->addTargets(synthesizeTargets(Architectures, Platforms)); for (auto &ID : UUIDs) File->addUUID(ID.first, ID.second); - File->setPlatform(Platform); - File->setArchitectures(Architectures); File->setInstallName(InstallName); File->setCurrentVersion(CurrentVersion); File->setCompatibilityVersion(CompatibilityVersion); File->setSwiftABIVersion(SwiftABIVersion); File->setObjCConstraint(ObjCConstraint); - File->setParentUmbrella(ParentUmbrella); + for (const auto &Target : File->targets()) + File->addParentUmbrella(Target, ParentUmbrella); if (Ctx->FileKind == FileType::TBD_V1) { File->setTwoLevelNamespace(); @@ -425,76 +638,80 @@ template <> struct MappingTraits { } for (const auto &Section : Exports) { - for (const auto &Library : Section.AllowableClients) - File->addAllowableClient(Library, Section.Architectures); - for (const auto &Library : Section.ReexportedLibraries) - File->addReexportedLibrary(Library, Section.Architectures); + const auto Targets = + synthesizeTargets(Section.Architectures, Platforms); + + for (const auto &Lib : Section.AllowableClients) + for (const auto &Target : Targets) + File->addAllowableClient(Lib, Target); + + for (const auto &Lib : Section.ReexportedLibraries) + for (const auto &Target : Targets) + File->addReexportedLibrary(Lib, Target); for (const auto &Symbol : Section.Symbols) { if (Ctx->FileKind != FileType::TBD_V3 && Symbol.value.startswith("_OBJC_EHTYPE_$_")) File->addSymbol(SymbolKind::ObjectiveCClassEHType, - Symbol.value.drop_front(15), Section.Architectures); + Symbol.value.drop_front(15), Targets); else - File->addSymbol(SymbolKind::GlobalSymbol, Symbol, - Section.Architectures); + File->addSymbol(SymbolKind::GlobalSymbol, Symbol, Targets); } for (auto &Symbol : Section.Classes) { auto Name = Symbol.value; if (Ctx->FileKind != FileType::TBD_V3) Name = Name.drop_front(); - File->addSymbol(SymbolKind::ObjectiveCClass, Name, - Section.Architectures); + File->addSymbol(SymbolKind::ObjectiveCClass, Name, Targets); } for (auto &Symbol : Section.ClassEHs) - File->addSymbol(SymbolKind::ObjectiveCClassEHType, Symbol, - Section.Architectures); + File->addSymbol(SymbolKind::ObjectiveCClassEHType, Symbol, Targets); for (auto &Symbol : Section.IVars) { auto Name = Symbol.value; if (Ctx->FileKind != FileType::TBD_V3) Name = Name.drop_front(); File->addSymbol(SymbolKind::ObjectiveCInstanceVariable, Name, - Section.Architectures); + Targets); } for (auto &Symbol : Section.WeakDefSymbols) - File->addSymbol(SymbolKind::GlobalSymbol, Symbol, - Section.Architectures, SymbolFlags::WeakDefined); + File->addSymbol(SymbolKind::GlobalSymbol, Symbol, Targets, + SymbolFlags::WeakDefined); for (auto &Symbol : Section.TLVSymbols) - File->addSymbol(SymbolKind::GlobalSymbol, Symbol, - Section.Architectures, SymbolFlags::ThreadLocalValue); + File->addSymbol(SymbolKind::GlobalSymbol, 
Symbol, Targets, + SymbolFlags::ThreadLocalValue); } for (const auto &Section : Undefineds) { + const auto Targets = + synthesizeTargets(Section.Architectures, Platforms); for (auto &Symbol : Section.Symbols) { if (Ctx->FileKind != FileType::TBD_V3 && Symbol.value.startswith("_OBJC_EHTYPE_$_")) File->addSymbol(SymbolKind::ObjectiveCClassEHType, - Symbol.value.drop_front(15), Section.Architectures, + Symbol.value.drop_front(15), Targets, SymbolFlags::Undefined); else - File->addSymbol(SymbolKind::GlobalSymbol, Symbol, - Section.Architectures, SymbolFlags::Undefined); + File->addSymbol(SymbolKind::GlobalSymbol, Symbol, Targets, + SymbolFlags::Undefined); } for (auto &Symbol : Section.Classes) { auto Name = Symbol.value; if (Ctx->FileKind != FileType::TBD_V3) Name = Name.drop_front(); - File->addSymbol(SymbolKind::ObjectiveCClass, Name, - Section.Architectures, SymbolFlags::Undefined); + File->addSymbol(SymbolKind::ObjectiveCClass, Name, Targets, + SymbolFlags::Undefined); } for (auto &Symbol : Section.ClassEHs) - File->addSymbol(SymbolKind::ObjectiveCClassEHType, Symbol, - Section.Architectures, SymbolFlags::Undefined); + File->addSymbol(SymbolKind::ObjectiveCClassEHType, Symbol, Targets, + SymbolFlags::Undefined); for (auto &Symbol : Section.IVars) { auto Name = Symbol.value; if (Ctx->FileKind != FileType::TBD_V3) Name = Name.drop_front(); - File->addSymbol(SymbolKind::ObjectiveCInstanceVariable, Name, - Section.Architectures, SymbolFlags::Undefined); + File->addSymbol(SymbolKind::ObjectiveCInstanceVariable, Name, Targets, + SymbolFlags::Undefined); } for (auto &Symbol : Section.WeakRefSymbols) - File->addSymbol(SymbolKind::GlobalSymbol, Symbol, - Section.Architectures, + File->addSymbol(SymbolKind::GlobalSymbol, Symbol, Targets, SymbolFlags::Undefined | SymbolFlags::WeakReferenced); } @@ -513,7 +730,7 @@ template <> struct MappingTraits { std::vector Architectures; std::vector UUIDs; - PlatformKind Platform{PlatformKind::unknown}; + PlatformSet Platforms; StringRef InstallName; PackedVersion CurrentVersion; PackedVersion CompatibilityVersion; @@ -525,71 +742,336 @@ template <> struct MappingTraits { std::vector Undefineds; }; + static void setFileTypeForInput(TextAPIContext *Ctx, IO &IO) { + if (IO.mapTag("!tapi-tbd", false)) + Ctx->FileKind = FileType::TBD_V4; + else if (IO.mapTag("!tapi-tbd-v3", false)) + Ctx->FileKind = FileType::TBD_V3; + else if (IO.mapTag("!tapi-tbd-v2", false)) + Ctx->FileKind = FileType::TBD_V2; + else if (IO.mapTag("!tapi-tbd-v1", false) || + IO.mapTag("tag:yaml.org,2002:map", false)) + Ctx->FileKind = FileType::TBD_V1; + else { + Ctx->FileKind = FileType::Invalid; + return; + } + } + static void mapping(IO &IO, const InterfaceFile *&File) { auto *Ctx = reinterpret_cast(IO.getContext()); assert((!Ctx || !IO.outputting() || (Ctx && Ctx->FileKind != FileType::Invalid)) && "File type is not set in YAML context"); - MappingNormalization Keys(IO, File); - // prope file type when reading. if (!IO.outputting()) { - if (IO.mapTag("!tapi-tbd-v2", false)) - Ctx->FileKind = FileType::TBD_V2; - else if (IO.mapTag("!tapi-tbd-v3", false)) - Ctx->FileKind = FileType::TBD_V2; - else if (IO.mapTag("!tapi-tbd-v1", false) || - IO.mapTag("tag:yaml.org,2002:map", false)) - Ctx->FileKind = FileType::TBD_V1; - else { + setFileTypeForInput(Ctx, IO); + switch (Ctx->FileKind) { + default: + break; + case FileType::TBD_V4: + mapKeysToValuesV4(IO, File); + return; + case FileType::Invalid: IO.setError("unsupported file type"); return; } - } - - // Set file tyoe when writing. 
- if (IO.outputting()) { + } else { + // Set file type when writing. switch (Ctx->FileKind) { default: llvm_unreachable("unexpected file type"); - case FileType::TBD_V1: - // Don't write the tag into the .tbd file for TBD v1. + case FileType::TBD_V4: + mapKeysToValuesV4(IO, File); + return; + case FileType::TBD_V3: + IO.mapTag("!tapi-tbd-v3", true); break; case FileType::TBD_V2: IO.mapTag("!tapi-tbd-v2", true); break; - case FileType::TBD_V3: - IO.mapTag("!tapi-tbd-v3", true); + case FileType::TBD_V1: + // Don't write the tag into the .tbd file for TBD v1 break; } } + mapKeysToValues(Ctx->FileKind, IO, File); + } + + using SectionList = std::vector; + struct NormalizedTBD_V4 { + explicit NormalizedTBD_V4(IO &IO) {} + NormalizedTBD_V4(IO &IO, const InterfaceFile *&File) { + auto Ctx = reinterpret_cast(IO.getContext()); + assert(Ctx); + TBDVersion = Ctx->FileKind >> 1; + Targets.insert(Targets.begin(), File->targets().begin(), + File->targets().end()); + for (const auto &IT : File->uuids()) + UUIDs.emplace_back(IT.first, IT.second); + InstallName = File->getInstallName(); + CurrentVersion = File->getCurrentVersion(); + CompatibilityVersion = File->getCompatibilityVersion(); + SwiftABIVersion = File->getSwiftABIVersion(); + + Flags = TBDFlags::None; + if (!File->isApplicationExtensionSafe()) + Flags |= TBDFlags::NotApplicationExtensionSafe; + + if (!File->isTwoLevelNamespace()) + Flags |= TBDFlags::FlatNamespace; + + if (File->isInstallAPI()) + Flags |= TBDFlags::InstallAPI; + + { + std::map valueToTargetList; + for (const auto &it : File->umbrellas()) + valueToTargetList[it.second].emplace_back(it.first); + + for (const auto &it : valueToTargetList) { + UmbrellaSection CurrentSection; + CurrentSection.Targets.insert(CurrentSection.Targets.begin(), + it.second.begin(), it.second.end()); + CurrentSection.Umbrella = it.first; + ParentUmbrellas.emplace_back(std::move(CurrentSection)); + } + } + + assignTargetsToLibrary(File->allowableClients(), AllowableClients); + assignTargetsToLibrary(File->reexportedLibraries(), ReexportedLibraries); + + auto handleSymbols = + [](SectionList &CurrentSections, + InterfaceFile::const_filtered_symbol_range Symbols, + std::function Pred) { + std::set TargetSet; + std::map SymbolToTargetList; + for (const auto *Symbol : Symbols) { + if (!Pred(Symbol)) + continue; + TargetList Targets(Symbol->targets()); + SymbolToTargetList[Symbol] = Targets; + TargetSet.emplace(std::move(Targets)); + } + for (const auto &TargetIDs : TargetSet) { + SymbolSection CurrentSection; + CurrentSection.Targets.insert(CurrentSection.Targets.begin(), + TargetIDs.begin(), TargetIDs.end()); + + for (const auto &IT : SymbolToTargetList) { + if (IT.second != TargetIDs) + continue; + + const auto *Symbol = IT.first; + switch (Symbol->getKind()) { + case SymbolKind::GlobalSymbol: + if (Symbol->isWeakDefined()) + CurrentSection.WeakSymbols.emplace_back(Symbol->getName()); + else if (Symbol->isThreadLocalValue()) + CurrentSection.TlvSymbols.emplace_back(Symbol->getName()); + else + CurrentSection.Symbols.emplace_back(Symbol->getName()); + break; + case SymbolKind::ObjectiveCClass: + CurrentSection.Classes.emplace_back(Symbol->getName()); + break; + case SymbolKind::ObjectiveCClassEHType: + CurrentSection.ClassEHs.emplace_back(Symbol->getName()); + break; + case SymbolKind::ObjectiveCInstanceVariable: + CurrentSection.Ivars.emplace_back(Symbol->getName()); + break; + } + } + sort(CurrentSection.Symbols); + sort(CurrentSection.Classes); + sort(CurrentSection.ClassEHs); + sort(CurrentSection.Ivars); 
+ sort(CurrentSection.WeakSymbols); + sort(CurrentSection.TlvSymbols); + CurrentSections.emplace_back(std::move(CurrentSection)); + } + }; + + handleSymbols(Exports, File->exports(), [](const Symbol *Symbol) { + return !Symbol->isReexported(); + }); + handleSymbols(Reexports, File->exports(), [](const Symbol *Symbol) { + return Symbol->isReexported(); + }); + handleSymbols(Undefineds, File->undefineds(), + [](const Symbol *Symbol) { return true; }); + } + + const InterfaceFile *denormalize(IO &IO) { + auto Ctx = reinterpret_cast(IO.getContext()); + assert(Ctx); + + auto *File = new InterfaceFile; + File->setPath(Ctx->Path); + File->setFileType(Ctx->FileKind); + for (auto &id : UUIDs) + File->addUUID(id.TargetID, id.Value); + File->addTargets(Targets); + File->setInstallName(InstallName); + File->setCurrentVersion(CurrentVersion); + File->setCompatibilityVersion(CompatibilityVersion); + File->setSwiftABIVersion(SwiftABIVersion); + for (const auto &CurrentSection : ParentUmbrellas) + for (const auto &target : CurrentSection.Targets) + File->addParentUmbrella(target, CurrentSection.Umbrella); + File->setTwoLevelNamespace(!(Flags & TBDFlags::FlatNamespace)); + File->setApplicationExtensionSafe( + !(Flags & TBDFlags::NotApplicationExtensionSafe)); + File->setInstallAPI(Flags & TBDFlags::InstallAPI); + + for (const auto &CurrentSection : AllowableClients) { + for (const auto &lib : CurrentSection.Values) + for (const auto &Target : CurrentSection.Targets) + File->addAllowableClient(lib, Target); + } + + for (const auto &CurrentSection : ReexportedLibraries) { + for (const auto &Lib : CurrentSection.Values) + for (const auto &Target : CurrentSection.Targets) + File->addReexportedLibrary(Lib, Target); + } + + auto handleSymbols = [File](const SectionList &CurrentSections, + SymbolFlags Flag = SymbolFlags::None) { + for (const auto &CurrentSection : CurrentSections) { + for (auto &sym : CurrentSection.Symbols) + File->addSymbol(SymbolKind::GlobalSymbol, sym, + CurrentSection.Targets, Flag); + + for (auto &sym : CurrentSection.Classes) + File->addSymbol(SymbolKind::ObjectiveCClass, sym, + CurrentSection.Targets); + + for (auto &sym : CurrentSection.ClassEHs) + File->addSymbol(SymbolKind::ObjectiveCClassEHType, sym, + CurrentSection.Targets); + + for (auto &sym : CurrentSection.Ivars) + File->addSymbol(SymbolKind::ObjectiveCInstanceVariable, sym, + CurrentSection.Targets); + + for (auto &sym : CurrentSection.WeakSymbols) + File->addSymbol(SymbolKind::GlobalSymbol, sym, + CurrentSection.Targets); + for (auto &sym : CurrentSection.TlvSymbols) + File->addSymbol(SymbolKind::GlobalSymbol, sym, + CurrentSection.Targets, + SymbolFlags::ThreadLocalValue); + } + }; + + handleSymbols(Exports); + handleSymbols(Reexports, SymbolFlags::Rexported); + handleSymbols(Undefineds, SymbolFlags::Undefined); + + return File; + } + + unsigned TBDVersion; + std::vector UUIDs; + TargetList Targets; + StringRef InstallName; + PackedVersion CurrentVersion; + PackedVersion CompatibilityVersion; + SwiftVersion SwiftABIVersion{0}; + std::vector AllowableClients; + std::vector ReexportedLibraries; + TBDFlags Flags{TBDFlags::None}; + std::vector ParentUmbrellas; + SectionList Exports; + SectionList Reexports; + SectionList Undefineds; + + private: + void assignTargetsToLibrary(const std::vector &Libraries, + std::vector &Section) { + std::set targetSet; + std::map valueToTargetList; + for (const auto &library : Libraries) { + TargetList targets(library.targets()); + valueToTargetList[&library] = targets; + 
targetSet.emplace(std::move(targets)); + } + + for (const auto &targets : targetSet) { + MetadataSection CurrentSection; + CurrentSection.Targets.insert(CurrentSection.Targets.begin(), + targets.begin(), targets.end()); + + for (const auto &it : valueToTargetList) { + if (it.second != targets) + continue; + + CurrentSection.Values.emplace_back(it.first->getInstallName()); + } + llvm::sort(CurrentSection.Values); + Section.emplace_back(std::move(CurrentSection)); + } + } + }; + static void mapKeysToValues(FileType FileKind, IO &IO, + const InterfaceFile *&File) { + MappingNormalization Keys(IO, File); IO.mapRequired("archs", Keys->Architectures); - if (Ctx->FileKind != FileType::TBD_V1) + if (FileKind != FileType::TBD_V1) IO.mapOptional("uuids", Keys->UUIDs); - IO.mapRequired("platform", Keys->Platform); - if (Ctx->FileKind != FileType::TBD_V1) + IO.mapRequired("platform", Keys->Platforms); + if (FileKind != FileType::TBD_V1) IO.mapOptional("flags", Keys->Flags, TBDFlags::None); IO.mapRequired("install-name", Keys->InstallName); IO.mapOptional("current-version", Keys->CurrentVersion, PackedVersion(1, 0, 0)); IO.mapOptional("compatibility-version", Keys->CompatibilityVersion, PackedVersion(1, 0, 0)); - if (Ctx->FileKind != FileType::TBD_V3) + if (FileKind != FileType::TBD_V3) IO.mapOptional("swift-version", Keys->SwiftABIVersion, SwiftVersion(0)); else IO.mapOptional("swift-abi-version", Keys->SwiftABIVersion, SwiftVersion(0)); IO.mapOptional("objc-constraint", Keys->ObjCConstraint, - (Ctx->FileKind == FileType::TBD_V1) + (FileKind == FileType::TBD_V1) ? ObjCConstraintType::None : ObjCConstraintType::Retain_Release); - if (Ctx->FileKind != FileType::TBD_V1) + if (FileKind != FileType::TBD_V1) IO.mapOptional("parent-umbrella", Keys->ParentUmbrella, StringRef()); IO.mapOptional("exports", Keys->Exports); - if (Ctx->FileKind != FileType::TBD_V1) + if (FileKind != FileType::TBD_V1) IO.mapOptional("undefineds", Keys->Undefineds); } + + static void mapKeysToValuesV4(IO &IO, const InterfaceFile *&File) { + MappingNormalization Keys(IO, + File); + IO.mapTag("!tapi-tbd", true); + IO.mapRequired("tbd-version", Keys->TBDVersion); + IO.mapRequired("targets", Keys->Targets); + IO.mapOptional("uuids", Keys->UUIDs); + IO.mapOptional("flags", Keys->Flags, TBDFlags::None); + IO.mapRequired("install-name", Keys->InstallName); + IO.mapOptional("current-version", Keys->CurrentVersion, + PackedVersion(1, 0, 0)); + IO.mapOptional("compatibility-version", Keys->CompatibilityVersion, + PackedVersion(1, 0, 0)); + IO.mapOptional("swift-abi-version", Keys->SwiftABIVersion, SwiftVersion(0)); + IO.mapOptional("parent-umbrella", Keys->ParentUmbrellas); + auto OptionKind = MetadataSection::Option::Clients; + IO.mapOptionalWithContext("allowable-clients", Keys->AllowableClients, + OptionKind); + OptionKind = MetadataSection::Option::Libraries; + IO.mapOptionalWithContext("reexported-libraries", Keys->ReexportedLibraries, + OptionKind); + IO.mapOptional("exports", Keys->Exports); + IO.mapOptional("reexports", Keys->Reexports); + IO.mapOptional("undefineds", Keys->Undefineds); + } }; template <> @@ -623,15 +1105,17 @@ static void DiagHandler(const SMDiagnostic &Diag, void *Context) { } Expected> -TextAPIReader::get(std::unique_ptr InputBuffer) { +TextAPIReader::get(MemoryBufferRef InputBuffer) { TextAPIContext Ctx; - Ctx.Path = InputBuffer->getBufferIdentifier(); - yaml::Input YAMLIn(InputBuffer->getBuffer(), &Ctx, DiagHandler, &Ctx); + Ctx.Path = InputBuffer.getBufferIdentifier(); + yaml::Input 
YAMLIn(InputBuffer.getBuffer(), &Ctx, DiagHandler, &Ctx); // Fill vector with interface file objects created by parsing the YAML file. std::vector Files; YAMLIn >> Files; + // YAMLIn dynamically allocates for Interface file and in case of error, + // memory leak will occur unless wrapped around unique_ptr auto File = std::unique_ptr( const_cast(Files.front())); diff --git a/lib/TextAPI/MachO/TextStubCommon.cpp b/lib/TextAPI/MachO/TextStubCommon.cpp index 00382cd24573..183c5d5a93b0 100644 --- a/lib/TextAPI/MachO/TextStubCommon.cpp +++ b/lib/TextAPI/MachO/TextStubCommon.cpp @@ -41,9 +41,21 @@ void ScalarEnumerationTraits::enumeration( IO.enumCase(Constraint, "gc", ObjCConstraintType::GC); } -void ScalarTraits::output(const PlatformKind &Value, void *, - raw_ostream &OS) { - switch (Value) { +void ScalarTraits::output(const PlatformSet &Values, void *IO, + raw_ostream &OS) { + + const auto *Ctx = reinterpret_cast(IO); + assert((!Ctx || Ctx->FileKind != FileType::Invalid) && + "File type is not set in context"); + + if (Ctx && Ctx->FileKind == TBD_V3 && Values.count(PlatformKind::macOS) && + Values.count(PlatformKind::macCatalyst)) { + OS << "zippered"; + return; + } + + assert(Values.size() == 1U); + switch (*Values.begin()) { default: llvm_unreachable("unexpected platform"); break; @@ -64,21 +76,44 @@ void ScalarTraits::output(const PlatformKind &Value, void *, break; } } -StringRef ScalarTraits::input(StringRef Scalar, void *, - PlatformKind &Value) { - Value = StringSwitch(Scalar) - .Case("macosx", PlatformKind::macOS) - .Case("ios", PlatformKind::iOS) - .Case("watchos", PlatformKind::watchOS) - .Case("tvos", PlatformKind::tvOS) - .Case("bridgeos", PlatformKind::bridgeOS) - .Default(PlatformKind::unknown); - - if (Value == PlatformKind::unknown) + +StringRef ScalarTraits::input(StringRef Scalar, void *IO, + PlatformSet &Values) { + const auto *Ctx = reinterpret_cast(IO); + assert((!Ctx || Ctx->FileKind != FileType::Invalid) && + "File type is not set in context"); + + if (Scalar == "zippered") { + if (Ctx && Ctx->FileKind == FileType::TBD_V3) { + Values.insert(PlatformKind::macOS); + Values.insert(PlatformKind::macCatalyst); + return {}; + } + return "invalid platform"; + } + + auto Platform = StringSwitch(Scalar) + .Case("unknown", PlatformKind::unknown) + .Case("macosx", PlatformKind::macOS) + .Case("ios", PlatformKind::iOS) + .Case("watchos", PlatformKind::watchOS) + .Case("tvos", PlatformKind::tvOS) + .Case("bridgeos", PlatformKind::bridgeOS) + .Case("iosmac", PlatformKind::macCatalyst) + .Default(PlatformKind::unknown); + + if (Platform == PlatformKind::macCatalyst) + if (Ctx && Ctx->FileKind != FileType::TBD_V3) + return "invalid platform"; + + if (Platform == PlatformKind::unknown) return "unknown platform"; + + Values.insert(Platform); return {}; } -QuotingType ScalarTraits::mustQuote(StringRef) { + +QuotingType ScalarTraits::mustQuote(StringRef) { return QuotingType::None; } @@ -137,14 +172,25 @@ void ScalarTraits::output(const SwiftVersion &Value, void *, break; } } -StringRef ScalarTraits::input(StringRef Scalar, void *, +StringRef ScalarTraits::input(StringRef Scalar, void *IO, SwiftVersion &Value) { - Value = StringSwitch(Scalar) - .Case("1.0", 1) - .Case("1.1", 2) - .Case("2.0", 3) - .Case("3.0", 4) - .Default(0); + const auto *Ctx = reinterpret_cast(IO); + assert((!Ctx || Ctx->FileKind != FileType::Invalid) && + "File type is not set in context"); + + if (Ctx->FileKind == FileType::TBD_V4) { + if (Scalar.getAsInteger(10, Value)) + return "invalid Swift ABI version."; + 
return {}; + } else { + Value = StringSwitch(Scalar) + .Case("1.0", 1) + .Case("1.1", 2) + .Case("2.0", 3) + .Case("3.0", 4) + .Default(0); + } + if (Value != SwiftVersion(0)) return {}; @@ -166,10 +212,11 @@ StringRef ScalarTraits::input(StringRef Scalar, void *, UUID &Value) { auto UUID = Split.second.trim(); if (UUID.empty()) return "invalid uuid string pair"; - Value.first = getArchitectureFromName(Arch); Value.second = UUID; + Value.first = Target{getArchitectureFromName(Arch), PlatformKind::unknown}; return {}; } + QuotingType ScalarTraits::mustQuote(StringRef) { return QuotingType::Single; } diff --git a/lib/TextAPI/MachO/TextStubCommon.h b/lib/TextAPI/MachO/TextStubCommon.h index c4dd1075b1c8..a558cbcec9fb 100644 --- a/lib/TextAPI/MachO/TextStubCommon.h +++ b/lib/TextAPI/MachO/TextStubCommon.h @@ -21,7 +21,7 @@ #include "llvm/TextAPI/MachO/InterfaceFile.h" #include "llvm/TextAPI/MachO/PackedVersion.h" -using UUID = std::pair; +using UUID = std::pair; LLVM_YAML_STRONG_TYPEDEF(llvm::StringRef, FlowStringRef) LLVM_YAML_STRONG_TYPEDEF(uint8_t, SwiftVersion) @@ -41,9 +41,9 @@ template <> struct ScalarEnumerationTraits { static void enumeration(IO &, MachO::ObjCConstraintType &); }; -template <> struct ScalarTraits { - static void output(const MachO::PlatformKind &, void *, raw_ostream &); - static StringRef input(StringRef, void *, MachO::PlatformKind &); +template <> struct ScalarTraits { + static void output(const MachO::PlatformSet &, void *, raw_ostream &); + static StringRef input(StringRef, void *, MachO::PlatformSet &); static QuotingType mustQuote(StringRef); }; diff --git a/lib/ToolDrivers/llvm-dlltool/DlltoolDriver.cpp b/lib/ToolDrivers/llvm-dlltool/DlltoolDriver.cpp index 0b406cc531a4..19f253be7952 100644 --- a/lib/ToolDrivers/llvm-dlltool/DlltoolDriver.cpp +++ b/lib/ToolDrivers/llvm-dlltool/DlltoolDriver.cpp @@ -74,13 +74,6 @@ static MachineTypes getEmulation(StringRef S) { .Default(IMAGE_FILE_MACHINE_UNKNOWN); } -static std::string getImplibPath(StringRef Path) { - SmallString<128> Out = StringRef("lib"); - Out.append(Path); - sys::path::replace_extension(Out, ".a"); - return Out.str(); -} - int llvm::dlltoolDriverMain(llvm::ArrayRef ArgsArr) { DllOptTable Table; unsigned MissingIndex; @@ -149,13 +142,23 @@ int llvm::dlltoolDriverMain(llvm::ArrayRef ArgsArr) { Def->OutputFile = Arg->getValue(); if (Def->OutputFile.empty()) { - llvm::errs() << "no output file specified\n"; + llvm::errs() << "no DLL name specified\n"; return 1; } std::string Path = Args.getLastArgValue(OPT_l); - if (Path.empty()) - Path = getImplibPath(Def->OutputFile); + + // If ExtName is set (if the "ExtName = Name" syntax was used), overwrite + // Name with ExtName and clear ExtName. When only creating an import + // library and not linking, the internal name is irrelevant. This avoids + // cases where writeImportLibrary tries to transplant decoration from + // symbol decoration onto ExtName. 
+ for (COFFShortExport& E : Def->Exports) { + if (!E.ExtName.empty()) { + E.Name = E.ExtName; + E.ExtName.clear(); + } + } if (Machine == IMAGE_FILE_MACHINE_I386 && Args.getLastArg(OPT_k)) { for (COFFShortExport& E : Def->Exports) { @@ -174,7 +177,8 @@ int llvm::dlltoolDriverMain(llvm::ArrayRef ArgsArr) { } } - if (writeImportLibrary(Def->OutputFile, Path, Def->Exports, Machine, true)) + if (!Path.empty() && + writeImportLibrary(Def->OutputFile, Path, Def->Exports, Machine, true)) return 1; return 0; } diff --git a/lib/ToolDrivers/llvm-lib/LibDriver.cpp b/lib/ToolDrivers/llvm-lib/LibDriver.cpp index 18ab6637305e..286191abff20 100644 --- a/lib/ToolDrivers/llvm-lib/LibDriver.cpp +++ b/lib/ToolDrivers/llvm-lib/LibDriver.cpp @@ -13,6 +13,7 @@ #include "llvm/ToolDrivers/llvm-lib/LibDriver.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/StringSet.h" #include "llvm/BinaryFormat/COFF.h" #include "llvm/BinaryFormat/Magic.h" #include "llvm/Bitcode/BitcodeReader.h" @@ -141,6 +142,125 @@ static void doList(opt::InputArgList& Args) { fatalOpenError(std::move(Err), B->getBufferIdentifier()); } +static COFF::MachineTypes getCOFFFileMachine(MemoryBufferRef MB) { + std::error_code EC; + object::COFFObjectFile Obj(MB, EC); + if (EC) { + llvm::errs() << MB.getBufferIdentifier() + << ": failed to open: " << EC.message() << '\n'; + exit(1); + } + + uint16_t Machine = Obj.getMachine(); + if (Machine != COFF::IMAGE_FILE_MACHINE_I386 && + Machine != COFF::IMAGE_FILE_MACHINE_AMD64 && + Machine != COFF::IMAGE_FILE_MACHINE_ARMNT && + Machine != COFF::IMAGE_FILE_MACHINE_ARM64) { + llvm::errs() << MB.getBufferIdentifier() << ": unknown machine: " << Machine + << '\n'; + exit(1); + } + + return static_cast(Machine); +} + +static COFF::MachineTypes getBitcodeFileMachine(MemoryBufferRef MB) { + Expected TripleStr = getBitcodeTargetTriple(MB); + if (!TripleStr) { + llvm::errs() << MB.getBufferIdentifier() + << ": failed to get target triple from bitcode\n"; + exit(1); + } + + switch (Triple(*TripleStr).getArch()) { + case Triple::x86: + return COFF::IMAGE_FILE_MACHINE_I386; + case Triple::x86_64: + return COFF::IMAGE_FILE_MACHINE_AMD64; + case Triple::arm: + return COFF::IMAGE_FILE_MACHINE_ARMNT; + case Triple::aarch64: + return COFF::IMAGE_FILE_MACHINE_ARM64; + default: + llvm::errs() << MB.getBufferIdentifier() + << ": unknown arch in target triple " << *TripleStr << '\n'; + exit(1); + } +} + +static void appendFile(std::vector &Members, + COFF::MachineTypes &LibMachine, + std::string &LibMachineSource, MemoryBufferRef MB) { + file_magic Magic = identify_magic(MB.getBuffer()); + + if (Magic != file_magic::coff_object && Magic != file_magic::bitcode && + Magic != file_magic::archive && Magic != file_magic::windows_resource) { + llvm::errs() << MB.getBufferIdentifier() + << ": not a COFF object, bitcode, archive or resource file\n"; + exit(1); + } + + // If a user attempts to add an archive to another archive, llvm-lib doesn't + // handle the first archive file as a single file. Instead, it extracts all + // members from the archive and add them to the second archive. This beahvior + // is for compatibility with Microsoft's lib command. 
+ if (Magic == file_magic::archive) { + Error Err = Error::success(); + object::Archive Archive(MB, Err); + fatalOpenError(std::move(Err), MB.getBufferIdentifier()); + + for (auto &C : Archive.children(Err)) { + Expected ChildMB = C.getMemoryBufferRef(); + if (!ChildMB) { + handleAllErrors(ChildMB.takeError(), [&](const ErrorInfoBase &EIB) { + llvm::errs() << MB.getBufferIdentifier() << ": " << EIB.message() + << "\n"; + }); + exit(1); + } + + appendFile(Members, LibMachine, LibMachineSource, *ChildMB); + } + + fatalOpenError(std::move(Err), MB.getBufferIdentifier()); + return; + } + + // Check that all input files have the same machine type. + // Mixing normal objects and LTO bitcode files is fine as long as they + // have the same machine type. + // Doing this here duplicates the header parsing work that writeArchive() + // below does, but it's not a lot of work and it's a bit awkward to do + // in writeArchive() which needs to support many tools, can't assume the + // input is COFF, and doesn't have a good way to report errors. + if (Magic == file_magic::coff_object || Magic == file_magic::bitcode) { + COFF::MachineTypes FileMachine = (Magic == file_magic::coff_object) + ? getCOFFFileMachine(MB) + : getBitcodeFileMachine(MB); + + // FIXME: Once lld-link rejects multiple resource .obj files: + // Call convertResToCOFF() on .res files and add the resulting + // COFF file to the .lib output instead of adding the .res file, and remove + // this check. See PR42180. + if (FileMachine != COFF::IMAGE_FILE_MACHINE_UNKNOWN) { + if (LibMachine == COFF::IMAGE_FILE_MACHINE_UNKNOWN) { + LibMachine = FileMachine; + LibMachineSource = + (" (inferred from earlier file '" + MB.getBufferIdentifier() + "')") + .str(); + } else if (LibMachine != FileMachine) { + llvm::errs() << MB.getBufferIdentifier() << ": file machine type " + << machineToStr(FileMachine) + << " conflicts with library machine type " + << machineToStr(LibMachine) << LibMachineSource << '\n'; + exit(1); + } + } + } + + Members.emplace_back(MB); +} + int llvm::libDriverMain(ArrayRef ArgsArr) { BumpPtrAllocator Alloc; StringSaver Saver(Alloc); @@ -195,104 +315,40 @@ int llvm::libDriverMain(ArrayRef ArgsArr) { std::string(" (from '/machine:") + Arg->getValue() + "' flag)"; } - // Create a NewArchiveMember for each input file. + std::vector> MBs; + StringSet<> Seen; std::vector Members; + + // Create a NewArchiveMember for each input file. for (auto *Arg : Args.filtered(OPT_INPUT)) { + // Find a file std::string Path = findInputFile(Arg->getValue(), SearchPaths); if (Path.empty()) { llvm::errs() << Arg->getValue() << ": no such file or directory\n"; return 1; } - Expected MOrErr = - NewArchiveMember::getFile(Saver.save(Path), /*Deterministic=*/true); - if (!MOrErr) { - handleAllErrors(MOrErr.takeError(), [&](const ErrorInfoBase &EIB) { - llvm::errs() << Arg->getValue() << ": " << EIB.message() << "\n"; - }); - return 1; - } - - file_magic Magic = identify_magic(MOrErr->Buf->getBuffer()); - if (Magic != file_magic::coff_object && Magic != file_magic::bitcode && - Magic != file_magic::windows_resource) { - llvm::errs() << Arg->getValue() - << ": not a COFF object, bitcode or resource file\n"; - return 1; - } - - // Check that all input files have the same machine type. - // Mixing normal objects and LTO bitcode files is fine as long as they - // have the same machine type. 
- // Doing this here duplicates the header parsing work that writeArchive() - // below does, but it's not a lot of work and it's a bit awkward to do - // in writeArchive() which needs to support many tools, can't assume the - // input is COFF, and doesn't have a good way to report errors. - COFF::MachineTypes FileMachine = COFF::IMAGE_FILE_MACHINE_UNKNOWN; - if (Magic == file_magic::coff_object) { - std::error_code EC; - object::COFFObjectFile Obj(*MOrErr->Buf, EC); - if (EC) { - llvm::errs() << Arg->getValue() << ": failed to open: " << EC.message() - << '\n'; - return 1; - } - uint16_t Machine = Obj.getMachine(); - if (Machine != COFF::IMAGE_FILE_MACHINE_I386 && - Machine != COFF::IMAGE_FILE_MACHINE_AMD64 && - Machine != COFF::IMAGE_FILE_MACHINE_ARMNT && - Machine != COFF::IMAGE_FILE_MACHINE_ARM64) { - llvm::errs() << Arg->getValue() << ": unknown machine: " << Machine - << '\n'; - return 1; - } - FileMachine = static_cast(Machine); - } else if (Magic == file_magic::bitcode) { - Expected TripleStr = getBitcodeTargetTriple(*MOrErr->Buf); - if (!TripleStr) { - llvm::errs() << Arg->getValue() - << ": failed to get target triple from bitcode\n"; - return 1; - } - switch (Triple(*TripleStr).getArch()) { - case Triple::x86: - FileMachine = COFF::IMAGE_FILE_MACHINE_I386; - break; - case Triple::x86_64: - FileMachine = COFF::IMAGE_FILE_MACHINE_AMD64; - break; - case Triple::arm: - FileMachine = COFF::IMAGE_FILE_MACHINE_ARMNT; - break; - case Triple::aarch64: - FileMachine = COFF::IMAGE_FILE_MACHINE_ARM64; - break; - default: - llvm::errs() << Arg->getValue() << ": unknown arch in target triple " - << *TripleStr << '\n'; - return 1; - } - } - - // FIXME: Once lld-link rejects multiple resource .obj files: - // Call convertResToCOFF() on .res files and add the resulting - // COFF file to the .lib output instead of adding the .res file, and remove - // this check. See PR42180. - if (FileMachine != COFF::IMAGE_FILE_MACHINE_UNKNOWN) { - if (LibMachine == COFF::IMAGE_FILE_MACHINE_UNKNOWN) { - LibMachine = FileMachine; - LibMachineSource = std::string(" (inferred from earlier file '") + - Arg->getValue() + "')"; - } else if (LibMachine != FileMachine) { - llvm::errs() << Arg->getValue() << ": file machine type " - << machineToStr(FileMachine) - << " conflicts with library machine type " - << machineToStr(LibMachine) << LibMachineSource << '\n'; - return 1; - } - } - - Members.emplace_back(std::move(*MOrErr)); + // Input files are uniquified by pathname. If you specify the exact same + // path more than once, all but the first one are ignored. + // + // Note that there's a loophole in the rule; you can prepend `.\` or + // something like that to a path to make it look different, and they are + // handled as if they were different files. This behavior is compatible with + // Microsoft lib.exe. + if (!Seen.insert(Path).second) + continue; + + // Open a file. + ErrorOr> MOrErr = + MemoryBuffer::getFile(Path, -1, false); + fatalOpenError(errorCodeToError(MOrErr.getError()), Path); + MemoryBufferRef MBRef = (*MOrErr)->getMemBufferRef(); + + // Append a file. + appendFile(Members, LibMachine, LibMachineSource, MBRef); + + // Take the ownership of the file buffer to keep the file open. + MBs.push_back(std::move(*MOrErr)); } // Create an archive file. 
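The uniquification rule described in libDriverMain() above keys on the literal path string: Seen is a StringSet<>, so a superficially different spelling such as ".\foo.obj" is kept as a separate archive member, which is the lib.exe-compatible behavior the comment calls out. Below is a minimal, self-contained sketch of just that rule; it reuses only the StringSet insert idiom from the patch, and the sample paths and the main() wrapper are invented for illustration, not part of the LLVM change.

#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSet.h"
#include "llvm/Support/raw_ostream.h"

int main() {
  llvm::StringSet<> Seen;
  // Hypothetical inputs: the second entry repeats the first verbatim, the
  // third names the same file with a ".\" prefix.
  const char *Inputs[] = {"foo.obj", "foo.obj", ".\\foo.obj"};
  for (llvm::StringRef Path : Inputs) {
    // Same check as in libDriverMain(): skip a path already seen verbatim.
    if (!Seen.insert(Path).second) {
      llvm::outs() << "ignoring duplicate input: " << Path << "\n";
      continue;
    }
    llvm::outs() << "adding member: " << Path << "\n";
  }
  // Prints "adding member" for foo.obj and .\foo.obj, and "ignoring
  // duplicate input" only for the verbatim repeat.
  return 0;
}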
diff --git a/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp b/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp index 06222d7e7e44..a24de3ca213f 100644 --- a/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp +++ b/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp @@ -121,14 +121,13 @@ static bool foldGuardedRotateToFunnelShift(Instruction &I) { BasicBlock *GuardBB = Phi.getIncomingBlock(RotSrc == P1); BasicBlock *RotBB = Phi.getIncomingBlock(RotSrc != P1); Instruction *TermI = GuardBB->getTerminator(); - BasicBlock *TrueBB, *FalseBB; ICmpInst::Predicate Pred; - if (!match(TermI, m_Br(m_ICmp(Pred, m_Specific(RotAmt), m_ZeroInt()), TrueBB, - FalseBB))) + BasicBlock *PhiBB = Phi.getParent(); + if (!match(TermI, m_Br(m_ICmp(Pred, m_Specific(RotAmt), m_ZeroInt()), + m_SpecificBB(PhiBB), m_SpecificBB(RotBB)))) return false; - BasicBlock *PhiBB = Phi.getParent(); - if (Pred != CmpInst::ICMP_EQ || TrueBB != PhiBB || FalseBB != RotBB) + if (Pred != CmpInst::ICMP_EQ) return false; // We matched a variation of this IR pattern: @@ -251,6 +250,72 @@ static bool foldAnyOrAllBitsSet(Instruction &I) { return true; } +// Try to recognize below function as popcount intrinsic. +// This is the "best" algorithm from +// http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel +// Also used in TargetLowering::expandCTPOP(). +// +// int popcount(unsigned int i) { +// i = i - ((i >> 1) & 0x55555555); +// i = (i & 0x33333333) + ((i >> 2) & 0x33333333); +// i = ((i + (i >> 4)) & 0x0F0F0F0F); +// return (i * 0x01010101) >> 24; +// } +static bool tryToRecognizePopCount(Instruction &I) { + if (I.getOpcode() != Instruction::LShr) + return false; + + Type *Ty = I.getType(); + if (!Ty->isIntOrIntVectorTy()) + return false; + + unsigned Len = Ty->getScalarSizeInBits(); + // FIXME: fix Len == 8 and other irregular type lengths. + if (!(Len <= 128 && Len > 8 && Len % 8 == 0)) + return false; + + APInt Mask55 = APInt::getSplat(Len, APInt(8, 0x55)); + APInt Mask33 = APInt::getSplat(Len, APInt(8, 0x33)); + APInt Mask0F = APInt::getSplat(Len, APInt(8, 0x0F)); + APInt Mask01 = APInt::getSplat(Len, APInt(8, 0x01)); + APInt MaskShift = APInt(Len, Len - 8); + + Value *Op0 = I.getOperand(0); + Value *Op1 = I.getOperand(1); + Value *MulOp0; + // Matching "(i * 0x01010101...) >> 24". + if ((match(Op0, m_Mul(m_Value(MulOp0), m_SpecificInt(Mask01)))) && + match(Op1, m_SpecificInt(MaskShift))) { + Value *ShiftOp0; + // Matching "((i + (i >> 4)) & 0x0F0F0F0F...)". + if (match(MulOp0, m_And(m_c_Add(m_LShr(m_Value(ShiftOp0), m_SpecificInt(4)), + m_Deferred(ShiftOp0)), + m_SpecificInt(Mask0F)))) { + Value *AndOp0; + // Matching "(i & 0x33333333...) + ((i >> 2) & 0x33333333...)". + if (match(ShiftOp0, + m_c_Add(m_And(m_Value(AndOp0), m_SpecificInt(Mask33)), + m_And(m_LShr(m_Deferred(AndOp0), m_SpecificInt(2)), + m_SpecificInt(Mask33))))) { + Value *Root, *SubOp1; + // Matching "i - ((i >> 1) & 0x55555555...)". 
+ if (match(AndOp0, m_Sub(m_Value(Root), m_Value(SubOp1))) && + match(SubOp1, m_And(m_LShr(m_Specific(Root), m_SpecificInt(1)), + m_SpecificInt(Mask55)))) { + LLVM_DEBUG(dbgs() << "Recognized popcount intrinsic\n"); + IRBuilder<> Builder(&I); + Function *Func = Intrinsic::getDeclaration( + I.getModule(), Intrinsic::ctpop, I.getType()); + I.replaceAllUsesWith(Builder.CreateCall(Func, {Root})); + return true; + } + } + } + } + + return false; +} + /// This is the entry point for folds that could be implemented in regular /// InstCombine, but they are separated because they are not expected to /// occur frequently and/or have more than a constant-length pattern match. @@ -269,6 +334,7 @@ static bool foldUnusualPatterns(Function &F, DominatorTree &DT) { for (Instruction &I : make_range(BB.rbegin(), BB.rend())) { MadeChange |= foldAnyOrAllBitsSet(I); MadeChange |= foldGuardedRotateToFunnelShift(I); + MadeChange |= tryToRecognizePopCount(I); } } @@ -303,7 +369,7 @@ void AggressiveInstCombinerLegacyPass::getAnalysisUsage( } bool AggressiveInstCombinerLegacyPass::runOnFunction(Function &F) { - auto &TLI = getAnalysis().getTLI(); + auto &TLI = getAnalysis().getTLI(F); auto &DT = getAnalysis().getDomTree(); return runImpl(F, TLI, DT); } diff --git a/lib/Transforms/Coroutines/CoroCleanup.cpp b/lib/Transforms/Coroutines/CoroCleanup.cpp index 1fb0a114d0c7..c3e05577f044 100644 --- a/lib/Transforms/Coroutines/CoroCleanup.cpp +++ b/lib/Transforms/Coroutines/CoroCleanup.cpp @@ -73,6 +73,8 @@ bool Lowerer::lowerRemainingCoroIntrinsics(Function &F) { II->replaceAllUsesWith(ConstantInt::getTrue(Context)); break; case Intrinsic::coro_id: + case Intrinsic::coro_id_retcon: + case Intrinsic::coro_id_retcon_once: II->replaceAllUsesWith(ConstantTokenNone::get(Context)); break; case Intrinsic::coro_subfn_addr: @@ -111,8 +113,9 @@ struct CoroCleanup : FunctionPass { bool doInitialization(Module &M) override { if (coro::declaresIntrinsics(M, {"llvm.coro.alloc", "llvm.coro.begin", "llvm.coro.subfn.addr", "llvm.coro.free", - "llvm.coro.id"})) - L = llvm::make_unique(M); + "llvm.coro.id", "llvm.coro.id.retcon", + "llvm.coro.id.retcon.once"})) + L = std::make_unique(M); return false; } diff --git a/lib/Transforms/Coroutines/CoroEarly.cpp b/lib/Transforms/Coroutines/CoroEarly.cpp index 692697d6f32e..55993d33ee4e 100644 --- a/lib/Transforms/Coroutines/CoroEarly.cpp +++ b/lib/Transforms/Coroutines/CoroEarly.cpp @@ -91,13 +91,14 @@ void Lowerer::lowerCoroDone(IntrinsicInst *II) { Value *Operand = II->getArgOperand(0); // ResumeFnAddr is the first pointer sized element of the coroutine frame. 
+ static_assert(coro::Shape::SwitchFieldIndex::Resume == 0, + "resume function not at offset zero"); auto *FrameTy = Int8Ptr; PointerType *FramePtrTy = FrameTy->getPointerTo(); Builder.SetInsertPoint(II); auto *BCI = Builder.CreateBitCast(Operand, FramePtrTy); - auto *Gep = Builder.CreateConstInBoundsGEP1_32(FrameTy, BCI, 0); - auto *Load = Builder.CreateLoad(FrameTy, Gep); + auto *Load = Builder.CreateLoad(BCI); auto *Cond = Builder.CreateICmpEQ(Load, NullPtr); II->replaceAllUsesWith(Cond); @@ -189,6 +190,10 @@ bool Lowerer::lowerEarlyIntrinsics(Function &F) { } } break; + case Intrinsic::coro_id_retcon: + case Intrinsic::coro_id_retcon_once: + F.addFnAttr(CORO_PRESPLIT_ATTR, PREPARED_FOR_SPLIT); + break; case Intrinsic::coro_resume: lowerResumeOrDestroy(CS, CoroSubFnInst::ResumeIndex); break; @@ -231,11 +236,18 @@ struct CoroEarly : public FunctionPass { // This pass has work to do only if we find intrinsics we are going to lower // in the module. bool doInitialization(Module &M) override { - if (coro::declaresIntrinsics( - M, {"llvm.coro.id", "llvm.coro.destroy", "llvm.coro.done", - "llvm.coro.end", "llvm.coro.noop", "llvm.coro.free", - "llvm.coro.promise", "llvm.coro.resume", "llvm.coro.suspend"})) - L = llvm::make_unique(M); + if (coro::declaresIntrinsics(M, {"llvm.coro.id", + "llvm.coro.id.retcon", + "llvm.coro.id.retcon.once", + "llvm.coro.destroy", + "llvm.coro.done", + "llvm.coro.end", + "llvm.coro.noop", + "llvm.coro.free", + "llvm.coro.promise", + "llvm.coro.resume", + "llvm.coro.suspend"})) + L = std::make_unique(M); return false; } diff --git a/lib/Transforms/Coroutines/CoroElide.cpp b/lib/Transforms/Coroutines/CoroElide.cpp index 6707aa1c827d..aca77119023b 100644 --- a/lib/Transforms/Coroutines/CoroElide.cpp +++ b/lib/Transforms/Coroutines/CoroElide.cpp @@ -286,7 +286,7 @@ struct CoroElide : FunctionPass { bool doInitialization(Module &M) override { if (coro::declaresIntrinsics(M, {"llvm.coro.id"})) - L = llvm::make_unique(M); + L = std::make_unique(M); return false; } diff --git a/lib/Transforms/Coroutines/CoroFrame.cpp b/lib/Transforms/Coroutines/CoroFrame.cpp index 58bf22bee29b..2c42cf8a6d25 100644 --- a/lib/Transforms/Coroutines/CoroFrame.cpp +++ b/lib/Transforms/Coroutines/CoroFrame.cpp @@ -18,6 +18,7 @@ #include "CoroInternal.h" #include "llvm/ADT/BitVector.h" +#include "llvm/Analysis/PtrUseVisitor.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Config/llvm-config.h" #include "llvm/IR/CFG.h" @@ -28,6 +29,7 @@ #include "llvm/Support/MathExtras.h" #include "llvm/Support/circular_raw_ostream.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/PromoteMemToReg.h" using namespace llvm; @@ -120,6 +122,15 @@ struct SuspendCrossingInfo { return false; BasicBlock *UseBB = I->getParent(); + + // As a special case, treat uses by an llvm.coro.suspend.retcon + // as if they were uses in the suspend's single predecessor: the + // uses conceptually occur before the suspend. + if (isa(I)) { + UseBB = UseBB->getSinglePredecessor(); + assert(UseBB && "should have split coro.suspend into its own block"); + } + return hasPathCrossingSuspendPoint(DefBB, UseBB); } @@ -128,7 +139,17 @@ struct SuspendCrossingInfo { } bool isDefinitionAcrossSuspend(Instruction &I, User *U) const { - return isDefinitionAcrossSuspend(I.getParent(), U); + auto *DefBB = I.getParent(); + + // As a special case, treat values produced by an llvm.coro.suspend.* + // as if they were defined in the single successor: the uses + // conceptually occur after the suspend. 
+ if (isa(I)) { + DefBB = DefBB->getSingleSuccessor(); + assert(DefBB && "should have split coro.suspend into its own block"); + } + + return isDefinitionAcrossSuspend(DefBB, U); } }; } // end anonymous namespace @@ -183,9 +204,10 @@ SuspendCrossingInfo::SuspendCrossingInfo(Function &F, coro::Shape &Shape) B.Suspend = true; B.Kills |= B.Consumes; }; - for (CoroSuspendInst *CSI : Shape.CoroSuspends) { + for (auto *CSI : Shape.CoroSuspends) { markSuspendBlock(CSI); - markSuspendBlock(CSI->getCoroSave()); + if (auto *Save = CSI->getCoroSave()) + markSuspendBlock(Save); } // Iterate propagating consumes and kills until they stop changing. @@ -261,11 +283,13 @@ SuspendCrossingInfo::SuspendCrossingInfo(Function &F, coro::Shape &Shape) // We build up the list of spills for every case where a use is separated // from the definition by a suspend point. +static const unsigned InvalidFieldIndex = ~0U; + namespace { class Spill { Value *Def = nullptr; Instruction *User = nullptr; - unsigned FieldNo = 0; + unsigned FieldNo = InvalidFieldIndex; public: Spill(Value *Def, llvm::User *U) : Def(Def), User(cast(U)) {} @@ -280,11 +304,11 @@ public: // the definition the first time they encounter it. Consider refactoring // SpillInfo into two arrays to normalize the spill representation. unsigned fieldIndex() const { - assert(FieldNo && "Accessing unassigned field"); + assert(FieldNo != InvalidFieldIndex && "Accessing unassigned field"); return FieldNo; } void setFieldIndex(unsigned FieldNumber) { - assert(!FieldNo && "Reassigning field number"); + assert(FieldNo == InvalidFieldIndex && "Reassigning field number"); FieldNo = FieldNumber; } }; @@ -376,18 +400,30 @@ static StructType *buildFrameType(Function &F, coro::Shape &Shape, SmallString<32> Name(F.getName()); Name.append(".Frame"); StructType *FrameTy = StructType::create(C, Name); - auto *FramePtrTy = FrameTy->getPointerTo(); - auto *FnTy = FunctionType::get(Type::getVoidTy(C), FramePtrTy, - /*isVarArg=*/false); - auto *FnPtrTy = FnTy->getPointerTo(); - - // Figure out how wide should be an integer type storing the suspend index. - unsigned IndexBits = std::max(1U, Log2_64_Ceil(Shape.CoroSuspends.size())); - Type *PromiseType = Shape.PromiseAlloca - ? Shape.PromiseAlloca->getType()->getElementType() - : Type::getInt1Ty(C); - SmallVector Types{FnPtrTy, FnPtrTy, PromiseType, - Type::getIntNTy(C, IndexBits)}; + SmallVector Types; + + AllocaInst *PromiseAlloca = Shape.getPromiseAlloca(); + + if (Shape.ABI == coro::ABI::Switch) { + auto *FramePtrTy = FrameTy->getPointerTo(); + auto *FnTy = FunctionType::get(Type::getVoidTy(C), FramePtrTy, + /*IsVarArg=*/false); + auto *FnPtrTy = FnTy->getPointerTo(); + + // Figure out how wide should be an integer type storing the suspend index. + unsigned IndexBits = std::max(1U, Log2_64_Ceil(Shape.CoroSuspends.size())); + Type *PromiseType = PromiseAlloca + ? PromiseAlloca->getType()->getElementType() + : Type::getInt1Ty(C); + Type *IndexType = Type::getIntNTy(C, IndexBits); + Types.push_back(FnPtrTy); + Types.push_back(FnPtrTy); + Types.push_back(PromiseType); + Types.push_back(IndexType); + } else { + assert(PromiseAlloca == nullptr && "lowering doesn't support promises"); + } + Value *CurrentDef = nullptr; Padder.addTypes(Types); @@ -399,7 +435,7 @@ static StructType *buildFrameType(Function &F, coro::Shape &Shape, CurrentDef = S.def(); // PromiseAlloca was already added to Types array earlier. 
- if (CurrentDef == Shape.PromiseAlloca) + if (CurrentDef == PromiseAlloca) continue; uint64_t Count = 1; @@ -430,9 +466,80 @@ static StructType *buildFrameType(Function &F, coro::Shape &Shape, } FrameTy->setBody(Types); + switch (Shape.ABI) { + case coro::ABI::Switch: + break; + + // Remember whether the frame is inline in the storage. + case coro::ABI::Retcon: + case coro::ABI::RetconOnce: { + auto &Layout = F.getParent()->getDataLayout(); + auto Id = Shape.getRetconCoroId(); + Shape.RetconLowering.IsFrameInlineInStorage + = (Layout.getTypeAllocSize(FrameTy) <= Id->getStorageSize() && + Layout.getABITypeAlignment(FrameTy) <= Id->getStorageAlignment()); + break; + } + } + return FrameTy; } +// We use a pointer use visitor to discover if there are any writes into an +// alloca that dominates CoroBegin. If that is the case, insertSpills will copy +// the value from the alloca into the coroutine frame spill slot corresponding +// to that alloca. +namespace { +struct AllocaUseVisitor : PtrUseVisitor { + using Base = PtrUseVisitor; + AllocaUseVisitor(const DataLayout &DL, const DominatorTree &DT, + const CoroBeginInst &CB) + : PtrUseVisitor(DL), DT(DT), CoroBegin(CB) {} + + // We are only interested in uses that dominate coro.begin. + void visit(Instruction &I) { + if (DT.dominates(&I, &CoroBegin)) + Base::visit(I); + } + // We need to provide this overload as PtrUseVisitor uses a pointer based + // visiting function. + void visit(Instruction *I) { return visit(*I); } + + void visitLoadInst(LoadInst &) {} // Good. Nothing to do. + + // If the use is an operand, the pointer escaped and anything can write into + // that memory. If the use is the pointer, we are definitely writing into the + // alloca and therefore we need to copy. + void visitStoreInst(StoreInst &SI) { PI.setAborted(&SI); } + + // Any other instruction that is not filtered out by PtrUseVisitor, will + // result in the copy. + void visitInstruction(Instruction &I) { PI.setAborted(&I); } + +private: + const DominatorTree &DT; + const CoroBeginInst &CoroBegin; +}; +} // namespace +static bool mightWriteIntoAllocaPtr(AllocaInst &A, const DominatorTree &DT, + const CoroBeginInst &CB) { + const DataLayout &DL = A.getModule()->getDataLayout(); + AllocaUseVisitor Visitor(DL, DT, CB); + auto PtrI = Visitor.visitPtr(A); + if (PtrI.isEscaped() || PtrI.isAborted()) { + auto *PointerEscapingInstr = PtrI.getEscapingInst() + ? PtrI.getEscapingInst() + : PtrI.getAbortingInst(); + if (PointerEscapingInstr) { + LLVM_DEBUG( + dbgs() << "AllocaInst copy was triggered by instruction: " + << *PointerEscapingInstr << "\n"); + } + return true; + } + return false; +} + // We need to make room to insert a spill after initial PHIs, but before // catchswitch instruction. Placing it before violates the requirement that // catchswitch, like all other EHPads must be the first nonPHI in a block. 
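The visitor above only has to answer a yes/no question: could anything that runs before coro.begin write through, or escape, the alloca's pointer? A minimal standalone model of that decision follows; the types are invented, and simple program order stands in for the dominance check used by the real code.

#include <cstdio>
#include <vector>

enum class UseKind { Load, Store, PassedToCall };

struct AllocaUse {
  UseKind Kind;
  unsigned Position;              // program order; smaller means earlier
};

bool mightWriteBeforeCoroBegin(const std::vector<AllocaUse> &Uses,
                               unsigned CoroBeginPos) {
  for (const AllocaUse &U : Uses) {
    if (U.Position >= CoroBeginPos)
      continue;                   // only uses that precede coro.begin matter
    if (U.Kind != UseKind::Load)
      return true;                // a store, or an escape, forces the copy
  }
  return false;
}

int main() {
  // A parameter whose address was taken: "store %n, %n.addr" happens before
  // coro.begin, so the alloca's current value must be copied into its frame
  // slot once the frame pointer is established.
  std::vector<AllocaUse> Uses = {{UseKind::Store, 1}, {UseKind::Load, 7}};
  std::printf("copy needed: %d\n", mightWriteBeforeCoroBegin(Uses, 5)); // 1
}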
@@ -476,7 +583,7 @@ static Instruction *splitBeforeCatchSwitch(CatchSwitchInst *CatchSwitch) { // whatever // // -static Instruction *insertSpills(SpillInfo &Spills, coro::Shape &Shape) { +static Instruction *insertSpills(const SpillInfo &Spills, coro::Shape &Shape) { auto *CB = Shape.CoroBegin; LLVMContext &C = CB->getContext(); IRBuilder<> Builder(CB->getNextNode()); @@ -484,11 +591,14 @@ static Instruction *insertSpills(SpillInfo &Spills, coro::Shape &Shape) { PointerType *FramePtrTy = FrameTy->getPointerTo(); auto *FramePtr = cast(Builder.CreateBitCast(CB, FramePtrTy, "FramePtr")); + DominatorTree DT(*CB->getFunction()); Value *CurrentValue = nullptr; BasicBlock *CurrentBlock = nullptr; Value *CurrentReload = nullptr; - unsigned Index = 0; // Proper field number will be read from field definition. + + // Proper field number will be read from field definition. + unsigned Index = InvalidFieldIndex; // We need to keep track of any allocas that need "spilling" // since they will live in the coroutine frame now, all access to them @@ -496,9 +606,11 @@ static Instruction *insertSpills(SpillInfo &Spills, coro::Shape &Shape) { // we remember allocas and their indices to be handled once we processed // all the spills. SmallVector, 4> Allocas; - // Promise alloca (if present) has a fixed field number (Shape::PromiseField) - if (Shape.PromiseAlloca) - Allocas.emplace_back(Shape.PromiseAlloca, coro::Shape::PromiseField); + // Promise alloca (if present) has a fixed field number. + if (auto *PromiseAlloca = Shape.getPromiseAlloca()) { + assert(Shape.ABI == coro::ABI::Switch); + Allocas.emplace_back(PromiseAlloca, coro::Shape::SwitchFieldIndex::Promise); + } // Create a GEP with the given index into the coroutine frame for the original // value Orig. Appends an extra 0 index for array-allocas, preserving the @@ -526,7 +638,7 @@ static Instruction *insertSpills(SpillInfo &Spills, coro::Shape &Shape) { // Create a load instruction to reload the spilled value from the coroutine // frame. auto CreateReload = [&](Instruction *InsertBefore) { - assert(Index && "accessing unassigned field number"); + assert(Index != InvalidFieldIndex && "accessing unassigned field number"); Builder.SetInsertPoint(InsertBefore); auto *G = GetFramePointer(Index, CurrentValue); @@ -558,29 +670,45 @@ static Instruction *insertSpills(SpillInfo &Spills, coro::Shape &Shape) { // coroutine frame. Instruction *InsertPt = nullptr; - if (isa(CurrentValue)) { + if (auto Arg = dyn_cast(CurrentValue)) { // For arguments, we will place the store instruction right after // the coroutine frame pointer instruction, i.e. bitcast of // coro.begin from i8* to %f.frame*. InsertPt = FramePtr->getNextNode(); + + // If we're spilling an Argument, make sure we clear 'nocapture' + // from the coroutine function. + Arg->getParent()->removeParamAttr(Arg->getArgNo(), + Attribute::NoCapture); + } else if (auto *II = dyn_cast(CurrentValue)) { // If we are spilling the result of the invoke instruction, split the // normal edge and insert the spill in the new block. auto NewBB = SplitEdge(II->getParent(), II->getNormalDest()); InsertPt = NewBB->getTerminator(); - } else if (dyn_cast(CurrentValue)) { + } else if (isa(CurrentValue)) { // Skip the PHINodes and EH pads instructions. 
BasicBlock *DefBlock = cast(E.def())->getParent(); if (auto *CSI = dyn_cast(DefBlock->getTerminator())) InsertPt = splitBeforeCatchSwitch(CSI); else InsertPt = &*DefBlock->getFirstInsertionPt(); + } else if (auto CSI = dyn_cast(CurrentValue)) { + // Don't spill immediately after a suspend; splitting assumes + // that the suspend will be followed by a branch. + InsertPt = CSI->getParent()->getSingleSuccessor()->getFirstNonPHI(); } else { + auto *I = cast(E.def()); + assert(!I->isTerminator() && "unexpected terminator"); // For all other values, the spill is placed immediately after // the definition. - assert(!cast(E.def())->isTerminator() && - "unexpected terminator"); - InsertPt = cast(E.def())->getNextNode(); + if (DT.dominates(CB, I)) { + InsertPt = I->getNextNode(); + } else { + // Unless, it is not dominated by CoroBegin, then it will be + // inserted immediately after CoroFrame is computed. + InsertPt = FramePtr->getNextNode(); + } } Builder.SetInsertPoint(InsertPt); @@ -613,21 +741,53 @@ static Instruction *insertSpills(SpillInfo &Spills, coro::Shape &Shape) { } BasicBlock *FramePtrBB = FramePtr->getParent(); - Shape.AllocaSpillBlock = - FramePtrBB->splitBasicBlock(FramePtr->getNextNode(), "AllocaSpillBB"); - Shape.AllocaSpillBlock->splitBasicBlock(&Shape.AllocaSpillBlock->front(), - "PostSpill"); - Builder.SetInsertPoint(&Shape.AllocaSpillBlock->front()); + auto SpillBlock = + FramePtrBB->splitBasicBlock(FramePtr->getNextNode(), "AllocaSpillBB"); + SpillBlock->splitBasicBlock(&SpillBlock->front(), "PostSpill"); + Shape.AllocaSpillBlock = SpillBlock; // If we found any allocas, replace all of their remaining uses with Geps. + // Note: we cannot do it indiscriminately as some of the uses may not be + // dominated by CoroBegin. + bool MightNeedToCopy = false; + Builder.SetInsertPoint(&Shape.AllocaSpillBlock->front()); + SmallVector UsersToUpdate; for (auto &P : Allocas) { - auto *G = GetFramePointer(P.second, P.first); + AllocaInst *const A = P.first; + UsersToUpdate.clear(); + for (User *U : A->users()) { + auto *I = cast(U); + if (DT.dominates(CB, I)) + UsersToUpdate.push_back(I); + else + MightNeedToCopy = true; + } + if (!UsersToUpdate.empty()) { + auto *G = GetFramePointer(P.second, A); + G->takeName(A); + for (Instruction *I : UsersToUpdate) + I->replaceUsesOfWith(A, G); + } + } + // If we discovered such uses not dominated by CoroBegin, see if any of them + // preceed coro begin and have instructions that can modify the + // value of the alloca and therefore would require a copying the value into + // the spill slot in the coroutine frame. + if (MightNeedToCopy) { + Builder.SetInsertPoint(FramePtr->getNextNode()); + + for (auto &P : Allocas) { + AllocaInst *const A = P.first; + if (mightWriteIntoAllocaPtr(*A, DT, *CB)) { + if (A->isArrayAllocation()) + report_fatal_error( + "Coroutines cannot handle copying of array allocas yet"); - // We are not using ReplaceInstWithInst(P.first, cast(G)) here, - // as we are changing location of the instruction. - G->takeName(P.first); - P.first->replaceAllUsesWith(G); - P.first->eraseFromParent(); + auto *G = GetFramePointer(P.second, A); + auto *Value = Builder.CreateLoad(A); + Builder.CreateStore(Value, G); + } + } } return FramePtr; } @@ -829,52 +989,6 @@ static void rewriteMaterializableInstructions(IRBuilder<> &IRB, } } -// Move early uses of spilled variable after CoroBegin. 
-// For example, if a parameter had address taken, we may end up with the code -// like: -// define @f(i32 %n) { -// %n.addr = alloca i32 -// store %n, %n.addr -// ... -// call @coro.begin -// we need to move the store after coro.begin -static void moveSpillUsesAfterCoroBegin(Function &F, SpillInfo const &Spills, - CoroBeginInst *CoroBegin) { - DominatorTree DT(F); - SmallVector NeedsMoving; - - Value *CurrentValue = nullptr; - - for (auto const &E : Spills) { - if (CurrentValue == E.def()) - continue; - - CurrentValue = E.def(); - - for (User *U : CurrentValue->users()) { - Instruction *I = cast(U); - if (!DT.dominates(CoroBegin, I)) { - LLVM_DEBUG(dbgs() << "will move: " << *I << "\n"); - - // TODO: Make this more robust. Currently if we run into a situation - // where simple instruction move won't work we panic and - // report_fatal_error. - for (User *UI : I->users()) { - if (!DT.dominates(CoroBegin, cast(UI))) - report_fatal_error("cannot move instruction since its users are not" - " dominated by CoroBegin"); - } - - NeedsMoving.push_back(I); - } - } - } - - Instruction *InsertPt = CoroBegin->getNextNode(); - for (Instruction *I : NeedsMoving) - I->moveBefore(InsertPt); -} - // Splits the block at a particular instruction unless it is the first // instruction in the block with a single predecessor. static BasicBlock *splitBlockIfNotFirst(Instruction *I, const Twine &Name) { @@ -895,21 +1009,337 @@ static void splitAround(Instruction *I, const Twine &Name) { splitBlockIfNotFirst(I->getNextNode(), "After" + Name); } +static bool isSuspendBlock(BasicBlock *BB) { + return isa(BB->front()); +} + +typedef SmallPtrSet VisitedBlocksSet; + +/// Does control flow starting at the given block ever reach a suspend +/// instruction before reaching a block in VisitedOrFreeBBs? +static bool isSuspendReachableFrom(BasicBlock *From, + VisitedBlocksSet &VisitedOrFreeBBs) { + // Eagerly try to add this block to the visited set. If it's already + // there, stop recursing; this path doesn't reach a suspend before + // either looping or reaching a freeing block. + if (!VisitedOrFreeBBs.insert(From).second) + return false; + + // We assume that we'll already have split suspends into their own blocks. + if (isSuspendBlock(From)) + return true; + + // Recurse on the successors. + for (auto Succ : successors(From)) { + if (isSuspendReachableFrom(Succ, VisitedOrFreeBBs)) + return true; + } + + return false; +} + +/// Is the given alloca "local", i.e. bounded in lifetime to not cross a +/// suspend point? +static bool isLocalAlloca(CoroAllocaAllocInst *AI) { + // Seed the visited set with all the basic blocks containing a free + // so that we won't pass them up. + VisitedBlocksSet VisitedOrFreeBBs; + for (auto User : AI->users()) { + if (auto FI = dyn_cast(User)) + VisitedOrFreeBBs.insert(FI->getParent()); + } + + return !isSuspendReachableFrom(AI->getParent(), VisitedOrFreeBBs); +} + +/// After we split the coroutine, will the given basic block be along +/// an obvious exit path for the resumption function? +static bool willLeaveFunctionImmediatelyAfter(BasicBlock *BB, + unsigned depth = 3) { + // If we've bottomed out our depth count, stop searching and assume + // that the path might loop back. + if (depth == 0) return false; + + // If this is a suspend block, we're about to exit the resumption function. + if (isSuspendBlock(BB)) return true; + + // Recurse into the successors. 
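The reachability walk above is what lets a coro.alloca.alloc stay on the stack: the allocation is "local" when every path from it reaches a matching free before any suspend. A self-contained sketch of the same trick on a toy graph (invented types; block numbers stand in for BasicBlocks) shows why pre-seeding the visited set with the free blocks is enough.

#include <cstdio>
#include <set>
#include <vector>

using Graph = std::vector<std::vector<int>>;   // successor lists

bool suspendReachableFrom(int From, const Graph &Succs,
                          const std::set<int> &SuspendBlocks,
                          std::set<int> &VisitedOrFreeBlocks) {
  if (!VisitedOrFreeBlocks.insert(From).second)
    return false;                              // already visited, or a free
  if (SuspendBlocks.count(From))
    return true;
  for (int Succ : Succs[From])
    if (suspendReachableFrom(Succ, Succs, SuspendBlocks, VisitedOrFreeBlocks))
      return true;
  return false;
}

int main() {
  // Blocks: 0 alloc -> 1 -> 2 free -> 3 suspend.  Every path from the
  // allocation hits the free before any suspend, so the alloca is "local".
  Graph Succs = {{1}, {2}, {3}, {}};
  std::set<int> Suspends = {3};
  std::set<int> VisitedOrFree = {2};           // seed with the free block
  bool Local = !suspendReachableFrom(0, Succs, Suspends, VisitedOrFree);
  std::printf("local alloca: %d\n", Local);    // prints 1
}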
+  for (auto Succ : successors(BB)) {
+    if (!willLeaveFunctionImmediatelyAfter(Succ, depth - 1))
+      return false;
+  }
+
+  // If none of the successors leads back in a loop, we're on an exit/abort.
+  return true;
+}
+
+static bool localAllocaNeedsStackSave(CoroAllocaAllocInst *AI) {
+  // Look for a free that isn't sufficiently obviously followed by
+  // either a suspend or a termination, i.e. something that will leave
+  // the coro resumption frame.
+  for (auto U : AI->users()) {
+    auto FI = dyn_cast<CoroAllocaFreeInst>(U);
+    if (!FI) continue;
+
+    if (!willLeaveFunctionImmediatelyAfter(FI->getParent()))
+      return true;
+  }
+
+  // If we never found one, we don't need a stack save.
+  return false;
+}
+
+/// Turn each of the given local allocas into a normal (dynamic) alloca
+/// instruction.
+static void lowerLocalAllocas(ArrayRef<CoroAllocaAllocInst *> LocalAllocas,
+                              SmallVectorImpl<Instruction *> &DeadInsts) {
+  for (auto AI : LocalAllocas) {
+    auto M = AI->getModule();
+    IRBuilder<> Builder(AI);
+
+    // Save the stack depth.  Try to avoid doing this if the stackrestore
+    // is going to immediately precede a return or something.
+    Value *StackSave = nullptr;
+    if (localAllocaNeedsStackSave(AI))
+      StackSave = Builder.CreateCall(
+          Intrinsic::getDeclaration(M, Intrinsic::stacksave));
+
+    // Allocate memory.
+    auto Alloca = Builder.CreateAlloca(Builder.getInt8Ty(), AI->getSize());
+    Alloca->setAlignment(MaybeAlign(AI->getAlignment()));
+
+    for (auto U : AI->users()) {
+      // Replace gets with the allocation.
+      if (isa<CoroAllocaGetInst>(U)) {
+        U->replaceAllUsesWith(Alloca);
+
+      // Replace frees with stackrestores.  This is safe because
+      // alloca.alloc is required to obey a stack discipline, although we
+      // don't enforce that structurally.
+      } else {
+        auto FI = cast<CoroAllocaFreeInst>(U);
+        if (StackSave) {
+          Builder.SetInsertPoint(FI);
+          Builder.CreateCall(
+              Intrinsic::getDeclaration(M, Intrinsic::stackrestore),
+              StackSave);
+        }
+      }
+      DeadInsts.push_back(cast<Instruction>(U));
+    }
+
+    DeadInsts.push_back(AI);
+  }
+}
+
+/// Turn the given coro.alloca.alloc call into a dynamic allocation.
+/// This happens during the all-instructions iteration, so it must not
+/// delete the call.
+static Instruction *lowerNonLocalAlloca(CoroAllocaAllocInst *AI,
+                                        coro::Shape &Shape,
+                                        SmallVectorImpl<Instruction *> &DeadInsts) {
+  IRBuilder<> Builder(AI);
+  auto Alloc = Shape.emitAlloc(Builder, AI->getSize(), nullptr);
+
+  for (User *U : AI->users()) {
+    if (isa<CoroAllocaGetInst>(U)) {
+      U->replaceAllUsesWith(Alloc);
+    } else {
+      auto FI = cast<CoroAllocaFreeInst>(U);
+      Builder.SetInsertPoint(FI);
+      Shape.emitDealloc(Builder, Alloc, nullptr);
+    }
+    DeadInsts.push_back(cast<Instruction>(U));
+  }
+
+  // Push this on last so that it gets deleted after all the others.
+  DeadInsts.push_back(AI);
+
+  // Return the new allocation value so that we can check for needed spills.
+  return cast<Instruction>(Alloc);
+}
+
+/// Get the current swifterror value.
+static Value *emitGetSwiftErrorValue(IRBuilder<> &Builder, Type *ValueTy,
+                                     coro::Shape &Shape) {
+  // Make a fake function pointer as a sort of intrinsic.
+  auto FnTy = FunctionType::get(ValueTy, {}, false);
+  auto Fn = ConstantPointerNull::get(FnTy->getPointerTo());
+
+  auto Call = Builder.CreateCall(Fn, {});
+  Shape.SwiftErrorOps.push_back(Call);
+
+  return Call;
+}
+
+/// Set the given value as the current swifterror value.
+///
+/// Returns a slot that can be used as a swifterror slot.
+static Value *emitSetSwiftErrorValue(IRBuilder<> &Builder, Value *V,
+                                     coro::Shape &Shape) {
+  // Make a fake function pointer as a sort of intrinsic.
+ auto FnTy = FunctionType::get(V->getType()->getPointerTo(), + {V->getType()}, false); + auto Fn = ConstantPointerNull::get(FnTy->getPointerTo()); + + auto Call = Builder.CreateCall(Fn, { V }); + Shape.SwiftErrorOps.push_back(Call); + + return Call; +} + +/// Set the swifterror value from the given alloca before a call, +/// then put in back in the alloca afterwards. +/// +/// Returns an address that will stand in for the swifterror slot +/// until splitting. +static Value *emitSetAndGetSwiftErrorValueAround(Instruction *Call, + AllocaInst *Alloca, + coro::Shape &Shape) { + auto ValueTy = Alloca->getAllocatedType(); + IRBuilder<> Builder(Call); + + // Load the current value from the alloca and set it as the + // swifterror value. + auto ValueBeforeCall = Builder.CreateLoad(ValueTy, Alloca); + auto Addr = emitSetSwiftErrorValue(Builder, ValueBeforeCall, Shape); + + // Move to after the call. Since swifterror only has a guaranteed + // value on normal exits, we can ignore implicit and explicit unwind + // edges. + if (isa(Call)) { + Builder.SetInsertPoint(Call->getNextNode()); + } else { + auto Invoke = cast(Call); + Builder.SetInsertPoint(Invoke->getNormalDest()->getFirstNonPHIOrDbg()); + } + + // Get the current swifterror value and store it to the alloca. + auto ValueAfterCall = emitGetSwiftErrorValue(Builder, ValueTy, Shape); + Builder.CreateStore(ValueAfterCall, Alloca); + + return Addr; +} + +/// Eliminate a formerly-swifterror alloca by inserting the get/set +/// intrinsics and attempting to MemToReg the alloca away. +static void eliminateSwiftErrorAlloca(Function &F, AllocaInst *Alloca, + coro::Shape &Shape) { + for (auto UI = Alloca->use_begin(), UE = Alloca->use_end(); UI != UE; ) { + // We're likely changing the use list, so use a mutation-safe + // iteration pattern. + auto &Use = *UI; + ++UI; + + // swifterror values can only be used in very specific ways. + // We take advantage of that here. + auto User = Use.getUser(); + if (isa(User) || isa(User)) + continue; + + assert(isa(User) || isa(User)); + auto Call = cast(User); + + auto Addr = emitSetAndGetSwiftErrorValueAround(Call, Alloca, Shape); + + // Use the returned slot address as the call argument. + Use.set(Addr); + } + + // All the uses should be loads and stores now. + assert(isAllocaPromotable(Alloca)); +} + +/// "Eliminate" a swifterror argument by reducing it to the alloca case +/// and then loading and storing in the prologue and epilog. +/// +/// The argument keeps the swifterror flag. +static void eliminateSwiftErrorArgument(Function &F, Argument &Arg, + coro::Shape &Shape, + SmallVectorImpl &AllocasToPromote) { + IRBuilder<> Builder(F.getEntryBlock().getFirstNonPHIOrDbg()); + + auto ArgTy = cast(Arg.getType()); + auto ValueTy = ArgTy->getElementType(); + + // Reduce to the alloca case: + + // Create an alloca and replace all uses of the arg with it. + auto Alloca = Builder.CreateAlloca(ValueTy, ArgTy->getAddressSpace()); + Arg.replaceAllUsesWith(Alloca); + + // Set an initial value in the alloca. swifterror is always null on entry. + auto InitialValue = Constant::getNullValue(ValueTy); + Builder.CreateStore(InitialValue, Alloca); + + // Find all the suspends in the function and save and restore around them. + for (auto Suspend : Shape.CoroSuspends) { + (void) emitSetAndGetSwiftErrorValueAround(Suspend, Alloca, Shape); + } + + // Find all the coro.ends in the function and restore the error value. 
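These helpers only plant placeholder calls (recorded in Shape.SwiftErrorOps) that are later rewritten into real swifterror reads and writes. The runtime effect they stand for is the save-around-call pattern sketched below, where everything (the Error type, CurrentSwiftError, the function names) is invented for illustration; the same set/get pairing is also applied around each suspend point.

#include <cstdio>

struct Error { int Code; };

static Error *CurrentSwiftError = nullptr;       // stands in for the register

void setSwiftErrorValue(Error *E) { CurrentSwiftError = E; }
Error *getSwiftErrorValue() { return CurrentSwiftError; }

void callee() {                                  // may replace the error
  static Error Failed{42};
  CurrentSwiftError = &Failed;
}

int main() {
  Error *Slot = nullptr;                         // the swifterror alloca

  setSwiftErrorValue(Slot);                      // copy the slot in, pre-call
  callee();
  Slot = getSwiftErrorValue();                   // copy the result back out

  std::printf("error code: %d\n", Slot ? Slot->Code : 0);   // prints 42
}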
+ for (auto End : Shape.CoroEnds) { + Builder.SetInsertPoint(End); + auto FinalValue = Builder.CreateLoad(ValueTy, Alloca); + (void) emitSetSwiftErrorValue(Builder, FinalValue, Shape); + } + + // Now we can use the alloca logic. + AllocasToPromote.push_back(Alloca); + eliminateSwiftErrorAlloca(F, Alloca, Shape); +} + +/// Eliminate all problematic uses of swifterror arguments and allocas +/// from the function. We'll fix them up later when splitting the function. +static void eliminateSwiftError(Function &F, coro::Shape &Shape) { + SmallVector AllocasToPromote; + + // Look for a swifterror argument. + for (auto &Arg : F.args()) { + if (!Arg.hasSwiftErrorAttr()) continue; + + eliminateSwiftErrorArgument(F, Arg, Shape, AllocasToPromote); + break; + } + + // Look for swifterror allocas. + for (auto &Inst : F.getEntryBlock()) { + auto Alloca = dyn_cast(&Inst); + if (!Alloca || !Alloca->isSwiftError()) continue; + + // Clear the swifterror flag. + Alloca->setSwiftError(false); + + AllocasToPromote.push_back(Alloca); + eliminateSwiftErrorAlloca(F, Alloca, Shape); + } + + // If we have any allocas to promote, compute a dominator tree and + // promote them en masse. + if (!AllocasToPromote.empty()) { + DominatorTree DT(F); + PromoteMemToReg(AllocasToPromote, DT); + } +} + void coro::buildCoroutineFrame(Function &F, Shape &Shape) { // Lower coro.dbg.declare to coro.dbg.value, since we are going to rewrite // access to local variables. LowerDbgDeclare(F); - Shape.PromiseAlloca = Shape.CoroBegin->getId()->getPromise(); - if (Shape.PromiseAlloca) { - Shape.CoroBegin->getId()->clearPromise(); + eliminateSwiftError(F, Shape); + + if (Shape.ABI == coro::ABI::Switch && + Shape.SwitchLowering.PromiseAlloca) { + Shape.getSwitchCoroId()->clearPromise(); } // Make sure that all coro.save, coro.suspend and the fallthrough coro.end // intrinsics are in their own blocks to simplify the logic of building up // SuspendCrossing data. - for (CoroSuspendInst *CSI : Shape.CoroSuspends) { - splitAround(CSI->getCoroSave(), "CoroSave"); + for (auto *CSI : Shape.CoroSuspends) { + if (auto *Save = CSI->getCoroSave()) + splitAround(Save, "CoroSave"); splitAround(CSI, "CoroSuspend"); } @@ -926,6 +1356,8 @@ void coro::buildCoroutineFrame(Function &F, Shape &Shape) { IRBuilder<> Builder(F.getContext()); SpillInfo Spills; + SmallVector LocalAllocas; + SmallVector DeadInstructions; for (int Repeat = 0; Repeat < 4; ++Repeat) { // See if there are materializable instructions across suspend points. @@ -955,11 +1387,40 @@ void coro::buildCoroutineFrame(Function &F, Shape &Shape) { // of the Coroutine Frame. if (isCoroutineStructureIntrinsic(I) || &I == Shape.CoroBegin) continue; + // The Coroutine Promise always included into coroutine frame, no need to // check for suspend crossing. - if (Shape.PromiseAlloca == &I) + if (Shape.ABI == coro::ABI::Switch && + Shape.SwitchLowering.PromiseAlloca == &I) continue; + // Handle alloca.alloc specially here. + if (auto AI = dyn_cast(&I)) { + // Check whether the alloca's lifetime is bounded by suspend points. + if (isLocalAlloca(AI)) { + LocalAllocas.push_back(AI); + continue; + } + + // If not, do a quick rewrite of the alloca and then add spills of + // the rewritten value. The rewrite doesn't invalidate anything in + // Spills because the other alloca intrinsics have no other operands + // besides AI, and it doesn't invalidate the iteration because we delay + // erasing AI. 
+ auto Alloc = lowerNonLocalAlloca(AI, Shape, DeadInstructions); + + for (User *U : Alloc->users()) { + if (Checker.isDefinitionAcrossSuspend(*Alloc, U)) + Spills.emplace_back(Alloc, U); + } + continue; + } + + // Ignore alloca.get; we process this as part of coro.alloca.alloc. + if (isa(I)) { + continue; + } + for (User *U : I.users()) if (Checker.isDefinitionAcrossSuspend(I, U)) { // We cannot spill a token. @@ -970,7 +1431,10 @@ void coro::buildCoroutineFrame(Function &F, Shape &Shape) { } } LLVM_DEBUG(dump("Spills", Spills)); - moveSpillUsesAfterCoroBegin(F, Spills, Shape.CoroBegin); Shape.FrameTy = buildFrameType(F, Shape, Spills); Shape.FramePtr = insertSpills(Spills, Shape); + lowerLocalAllocas(LocalAllocas, DeadInstructions); + + for (auto I : DeadInstructions) + I->eraseFromParent(); } diff --git a/lib/Transforms/Coroutines/CoroInstr.h b/lib/Transforms/Coroutines/CoroInstr.h index 5e19d7642e38..de2d2920cb15 100644 --- a/lib/Transforms/Coroutines/CoroInstr.h +++ b/lib/Transforms/Coroutines/CoroInstr.h @@ -27,6 +27,7 @@ #include "llvm/IR/GlobalVariable.h" #include "llvm/IR/IntrinsicInst.h" +#include "llvm/Support/raw_ostream.h" namespace llvm { @@ -77,10 +78,8 @@ public: } }; -/// This represents the llvm.coro.alloc instruction. -class LLVM_LIBRARY_VISIBILITY CoroIdInst : public IntrinsicInst { - enum { AlignArg, PromiseArg, CoroutineArg, InfoArg }; - +/// This represents a common base class for llvm.coro.id instructions. +class LLVM_LIBRARY_VISIBILITY AnyCoroIdInst : public IntrinsicInst { public: CoroAllocInst *getCoroAlloc() { for (User *U : users()) @@ -97,6 +96,24 @@ public: llvm_unreachable("no coro.begin associated with coro.id"); } + // Methods to support type inquiry through isa, cast, and dyn_cast: + static bool classof(const IntrinsicInst *I) { + auto ID = I->getIntrinsicID(); + return ID == Intrinsic::coro_id || + ID == Intrinsic::coro_id_retcon || + ID == Intrinsic::coro_id_retcon_once; + } + + static bool classof(const Value *V) { + return isa(V) && classof(cast(V)); + } +}; + +/// This represents the llvm.coro.id instruction. +class LLVM_LIBRARY_VISIBILITY CoroIdInst : public AnyCoroIdInst { + enum { AlignArg, PromiseArg, CoroutineArg, InfoArg }; + +public: AllocaInst *getPromise() const { Value *Arg = getArgOperand(PromiseArg); return isa(Arg) @@ -182,6 +199,80 @@ public: } }; +/// This represents either the llvm.coro.id.retcon or +/// llvm.coro.id.retcon.once instruction. +class LLVM_LIBRARY_VISIBILITY AnyCoroIdRetconInst : public AnyCoroIdInst { + enum { SizeArg, AlignArg, StorageArg, PrototypeArg, AllocArg, DeallocArg }; + +public: + void checkWellFormed() const; + + uint64_t getStorageSize() const { + return cast(getArgOperand(SizeArg))->getZExtValue(); + } + + uint64_t getStorageAlignment() const { + return cast(getArgOperand(AlignArg))->getZExtValue(); + } + + Value *getStorage() const { + return getArgOperand(StorageArg); + } + + /// Return the prototype for the continuation function. The type, + /// attributes, and calling convention of the continuation function(s) + /// are taken from this declaration. + Function *getPrototype() const { + return cast(getArgOperand(PrototypeArg)->stripPointerCasts()); + } + + /// Return the function to use for allocating memory. + Function *getAllocFunction() const { + return cast(getArgOperand(AllocArg)->stripPointerCasts()); + } + + /// Return the function to use for deallocating memory. 
+ Function *getDeallocFunction() const { + return cast(getArgOperand(DeallocArg)->stripPointerCasts()); + } + + // Methods to support type inquiry through isa, cast, and dyn_cast: + static bool classof(const IntrinsicInst *I) { + auto ID = I->getIntrinsicID(); + return ID == Intrinsic::coro_id_retcon + || ID == Intrinsic::coro_id_retcon_once; + } + static bool classof(const Value *V) { + return isa(V) && classof(cast(V)); + } +}; + +/// This represents the llvm.coro.id.retcon instruction. +class LLVM_LIBRARY_VISIBILITY CoroIdRetconInst + : public AnyCoroIdRetconInst { +public: + // Methods to support type inquiry through isa, cast, and dyn_cast: + static bool classof(const IntrinsicInst *I) { + return I->getIntrinsicID() == Intrinsic::coro_id_retcon; + } + static bool classof(const Value *V) { + return isa(V) && classof(cast(V)); + } +}; + +/// This represents the llvm.coro.id.retcon.once instruction. +class LLVM_LIBRARY_VISIBILITY CoroIdRetconOnceInst + : public AnyCoroIdRetconInst { +public: + // Methods to support type inquiry through isa, cast, and dyn_cast: + static bool classof(const IntrinsicInst *I) { + return I->getIntrinsicID() == Intrinsic::coro_id_retcon_once; + } + static bool classof(const Value *V) { + return isa(V) && classof(cast(V)); + } +}; + /// This represents the llvm.coro.frame instruction. class LLVM_LIBRARY_VISIBILITY CoroFrameInst : public IntrinsicInst { public: @@ -215,7 +306,9 @@ class LLVM_LIBRARY_VISIBILITY CoroBeginInst : public IntrinsicInst { enum { IdArg, MemArg }; public: - CoroIdInst *getId() const { return cast(getArgOperand(IdArg)); } + AnyCoroIdInst *getId() const { + return cast(getArgOperand(IdArg)); + } Value *getMem() const { return getArgOperand(MemArg); } @@ -261,8 +354,22 @@ public: } }; +class LLVM_LIBRARY_VISIBILITY AnyCoroSuspendInst : public IntrinsicInst { +public: + CoroSaveInst *getCoroSave() const; + + // Methods to support type inquiry through isa, cast, and dyn_cast: + static bool classof(const IntrinsicInst *I) { + return I->getIntrinsicID() == Intrinsic::coro_suspend || + I->getIntrinsicID() == Intrinsic::coro_suspend_retcon; + } + static bool classof(const Value *V) { + return isa(V) && classof(cast(V)); + } +}; + /// This represents the llvm.coro.suspend instruction. -class LLVM_LIBRARY_VISIBILITY CoroSuspendInst : public IntrinsicInst { +class LLVM_LIBRARY_VISIBILITY CoroSuspendInst : public AnyCoroSuspendInst { enum { SaveArg, FinalArg }; public: @@ -273,6 +380,7 @@ public: assert(isa(Arg)); return nullptr; } + bool isFinal() const { return cast(getArgOperand(FinalArg))->isOneValue(); } @@ -286,6 +394,37 @@ public: } }; +inline CoroSaveInst *AnyCoroSuspendInst::getCoroSave() const { + if (auto Suspend = dyn_cast(this)) + return Suspend->getCoroSave(); + return nullptr; +} + +/// This represents the llvm.coro.suspend.retcon instruction. 
+class LLVM_LIBRARY_VISIBILITY CoroSuspendRetconInst : public AnyCoroSuspendInst { +public: + op_iterator value_begin() { return arg_begin(); } + const_op_iterator value_begin() const { return arg_begin(); } + + op_iterator value_end() { return arg_end(); } + const_op_iterator value_end() const { return arg_end(); } + + iterator_range value_operands() { + return make_range(value_begin(), value_end()); + } + iterator_range value_operands() const { + return make_range(value_begin(), value_end()); + } + + // Methods to support type inquiry through isa, cast, and dyn_cast: + static bool classof(const IntrinsicInst *I) { + return I->getIntrinsicID() == Intrinsic::coro_suspend_retcon; + } + static bool classof(const Value *V) { + return isa(V) && classof(cast(V)); + } +}; + /// This represents the llvm.coro.size instruction. class LLVM_LIBRARY_VISIBILITY CoroSizeInst : public IntrinsicInst { public: @@ -317,6 +456,60 @@ public: } }; +/// This represents the llvm.coro.alloca.alloc instruction. +class LLVM_LIBRARY_VISIBILITY CoroAllocaAllocInst : public IntrinsicInst { + enum { SizeArg, AlignArg }; +public: + Value *getSize() const { + return getArgOperand(SizeArg); + } + unsigned getAlignment() const { + return cast(getArgOperand(AlignArg))->getZExtValue(); + } + + // Methods to support type inquiry through isa, cast, and dyn_cast: + static bool classof(const IntrinsicInst *I) { + return I->getIntrinsicID() == Intrinsic::coro_alloca_alloc; + } + static bool classof(const Value *V) { + return isa(V) && classof(cast(V)); + } +}; + +/// This represents the llvm.coro.alloca.get instruction. +class LLVM_LIBRARY_VISIBILITY CoroAllocaGetInst : public IntrinsicInst { + enum { AllocArg }; +public: + CoroAllocaAllocInst *getAlloc() const { + return cast(getArgOperand(AllocArg)); + } + + // Methods to support type inquiry through isa, cast, and dyn_cast: + static bool classof(const IntrinsicInst *I) { + return I->getIntrinsicID() == Intrinsic::coro_alloca_get; + } + static bool classof(const Value *V) { + return isa(V) && classof(cast(V)); + } +}; + +/// This represents the llvm.coro.alloca.free instruction. +class LLVM_LIBRARY_VISIBILITY CoroAllocaFreeInst : public IntrinsicInst { + enum { AllocArg }; +public: + CoroAllocaAllocInst *getAlloc() const { + return cast(getArgOperand(AllocArg)); + } + + // Methods to support type inquiry through isa, cast, and dyn_cast: + static bool classof(const IntrinsicInst *I) { + return I->getIntrinsicID() == Intrinsic::coro_alloca_free; + } + static bool classof(const Value *V) { + return isa(V) && classof(cast(V)); + } +}; + } // End namespace llvm. #endif diff --git a/lib/Transforms/Coroutines/CoroInternal.h b/lib/Transforms/Coroutines/CoroInternal.h index 441c8a20f1f3..c151474316f9 100644 --- a/lib/Transforms/Coroutines/CoroInternal.h +++ b/lib/Transforms/Coroutines/CoroInternal.h @@ -12,6 +12,7 @@ #define LLVM_LIB_TRANSFORMS_COROUTINES_COROINTERNAL_H #include "CoroInstr.h" +#include "llvm/IR/IRBuilder.h" #include "llvm/Transforms/Coroutines.h" namespace llvm { @@ -61,37 +62,174 @@ struct LowererBase { Value *makeSubFnCall(Value *Arg, int Index, Instruction *InsertPt); }; +enum class ABI { + /// The "resume-switch" lowering, where there are separate resume and + /// destroy functions that are shared between all suspend points. The + /// coroutine frame implicitly stores the resume and destroy functions, + /// the current index, and any promise value. 
+ Switch, + + /// The "returned-continuation" lowering, where each suspend point creates a + /// single continuation function that is used for both resuming and + /// destroying. Does not support promises. + Retcon, + + /// The "unique returned-continuation" lowering, where each suspend point + /// creates a single continuation function that is used for both resuming + /// and destroying. Does not support promises. The function is known to + /// suspend at most once during its execution, and the return value of + /// the continuation is void. + RetconOnce, +}; + // Holds structural Coroutine Intrinsics for a particular function and other // values used during CoroSplit pass. struct LLVM_LIBRARY_VISIBILITY Shape { CoroBeginInst *CoroBegin; SmallVector CoroEnds; SmallVector CoroSizes; - SmallVector CoroSuspends; - - // Field Indexes for known coroutine frame fields. - enum { - ResumeField, - DestroyField, - PromiseField, - IndexField, + SmallVector CoroSuspends; + SmallVector SwiftErrorOps; + + // Field indexes for special fields in the switch lowering. + struct SwitchFieldIndex { + enum { + Resume, + Destroy, + Promise, + Index, + /// The index of the first spill field. + FirstSpill + }; }; + coro::ABI ABI; + StructType *FrameTy; Instruction *FramePtr; BasicBlock *AllocaSpillBlock; - SwitchInst *ResumeSwitch; - AllocaInst *PromiseAlloca; - bool HasFinalSuspend; + + struct SwitchLoweringStorage { + SwitchInst *ResumeSwitch; + AllocaInst *PromiseAlloca; + BasicBlock *ResumeEntryBlock; + bool HasFinalSuspend; + }; + + struct RetconLoweringStorage { + Function *ResumePrototype; + Function *Alloc; + Function *Dealloc; + BasicBlock *ReturnBlock; + bool IsFrameInlineInStorage; + }; + + union { + SwitchLoweringStorage SwitchLowering; + RetconLoweringStorage RetconLowering; + }; + + CoroIdInst *getSwitchCoroId() const { + assert(ABI == coro::ABI::Switch); + return cast(CoroBegin->getId()); + } + + AnyCoroIdRetconInst *getRetconCoroId() const { + assert(ABI == coro::ABI::Retcon || + ABI == coro::ABI::RetconOnce); + return cast(CoroBegin->getId()); + } IntegerType *getIndexType() const { + assert(ABI == coro::ABI::Switch); assert(FrameTy && "frame type not assigned"); - return cast(FrameTy->getElementType(IndexField)); + return cast(FrameTy->getElementType(SwitchFieldIndex::Index)); } ConstantInt *getIndex(uint64_t Value) const { return ConstantInt::get(getIndexType(), Value); } + PointerType *getSwitchResumePointerType() const { + assert(ABI == coro::ABI::Switch); + assert(FrameTy && "frame type not assigned"); + return cast(FrameTy->getElementType(SwitchFieldIndex::Resume)); + } + + FunctionType *getResumeFunctionType() const { + switch (ABI) { + case coro::ABI::Switch: { + auto *FnPtrTy = getSwitchResumePointerType(); + return cast(FnPtrTy->getPointerElementType()); + } + case coro::ABI::Retcon: + case coro::ABI::RetconOnce: + return RetconLowering.ResumePrototype->getFunctionType(); + } + llvm_unreachable("Unknown coro::ABI enum"); + } + + ArrayRef getRetconResultTypes() const { + assert(ABI == coro::ABI::Retcon || + ABI == coro::ABI::RetconOnce); + auto FTy = CoroBegin->getFunction()->getFunctionType(); + + // The safety of all this is checked by checkWFRetconPrototype. + if (auto STy = dyn_cast(FTy->getReturnType())) { + return STy->elements().slice(1); + } else { + return ArrayRef(); + } + } + + ArrayRef getRetconResumeTypes() const { + assert(ABI == coro::ABI::Retcon || + ABI == coro::ABI::RetconOnce); + + // The safety of all this is checked by checkWFRetconPrototype. 
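From the caller's point of view, the non-unique retcon ABI boils down to: every suspend hands back a continuation pointer plus the yielded values, and a null continuation signals completion (compare getRetconResultTypes above, which strips off the leading continuation slot). The following is an invented, self-contained imitation of that shape using ordinary function pointers; it is not code produced by the pass.

#include <cstdio>

struct Buffer { int State; };            // stands in for the opaque storage

struct Result {
  Result (*Continuation)(Buffer *);      // null means the coroutine is done
  int Value;                             // the yielded value
};

Result counterResume(Buffer *B);

Result counterStart(Buffer *B) {         // plays the role of the ramp
  B->State = 0;
  return {counterResume, B->State};
}

Result counterResume(Buffer *B) {        // plays the role of a continuation
  if (++B->State == 3)
    return {nullptr, B->State};          // final value, no continuation
  return {counterResume, B->State};
}

int main() {
  Buffer B;
  for (Result R = counterStart(&B); ; R = R.Continuation(&B)) {
    std::printf("yielded %d\n", R.Value);
    if (!R.Continuation)
      break;                             // coroutine completed
  }
}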
+ auto FTy = RetconLowering.ResumePrototype->getFunctionType(); + return FTy->params().slice(1); + } + + CallingConv::ID getResumeFunctionCC() const { + switch (ABI) { + case coro::ABI::Switch: + return CallingConv::Fast; + + case coro::ABI::Retcon: + case coro::ABI::RetconOnce: + return RetconLowering.ResumePrototype->getCallingConv(); + } + llvm_unreachable("Unknown coro::ABI enum"); + } + + unsigned getFirstSpillFieldIndex() const { + switch (ABI) { + case coro::ABI::Switch: + return SwitchFieldIndex::FirstSpill; + + case coro::ABI::Retcon: + case coro::ABI::RetconOnce: + return 0; + } + llvm_unreachable("Unknown coro::ABI enum"); + } + + AllocaInst *getPromiseAlloca() const { + if (ABI == coro::ABI::Switch) + return SwitchLowering.PromiseAlloca; + return nullptr; + } + + /// Allocate memory according to the rules of the active lowering. + /// + /// \param CG - if non-null, will be updated for the new call + Value *emitAlloc(IRBuilder<> &Builder, Value *Size, CallGraph *CG) const; + + /// Deallocate memory according to the rules of the active lowering. + /// + /// \param CG - if non-null, will be updated for the new call + void emitDealloc(IRBuilder<> &Builder, Value *Ptr, CallGraph *CG) const; + Shape() = default; explicit Shape(Function &F) { buildFrom(F); } void buildFrom(Function &F); diff --git a/lib/Transforms/Coroutines/CoroSplit.cpp b/lib/Transforms/Coroutines/CoroSplit.cpp index 5458e70ff16a..04723cbde417 100644 --- a/lib/Transforms/Coroutines/CoroSplit.cpp +++ b/lib/Transforms/Coroutines/CoroSplit.cpp @@ -55,6 +55,7 @@ #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/PrettyStackTrace.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" @@ -70,9 +71,197 @@ using namespace llvm; #define DEBUG_TYPE "coro-split" +namespace { + +/// A little helper class for building +class CoroCloner { +public: + enum class Kind { + /// The shared resume function for a switch lowering. + SwitchResume, + + /// The shared unwind function for a switch lowering. + SwitchUnwind, + + /// The shared cleanup function for a switch lowering. + SwitchCleanup, + + /// An individual continuation function. + Continuation, + }; +private: + Function &OrigF; + Function *NewF; + const Twine &Suffix; + coro::Shape &Shape; + Kind FKind; + ValueToValueMapTy VMap; + IRBuilder<> Builder; + Value *NewFramePtr = nullptr; + Value *SwiftErrorSlot = nullptr; + + /// The active suspend instruction; meaningful only for continuation ABIs. + AnyCoroSuspendInst *ActiveSuspend = nullptr; + +public: + /// Create a cloner for a switch lowering. + CoroCloner(Function &OrigF, const Twine &Suffix, coro::Shape &Shape, + Kind FKind) + : OrigF(OrigF), NewF(nullptr), Suffix(Suffix), Shape(Shape), + FKind(FKind), Builder(OrigF.getContext()) { + assert(Shape.ABI == coro::ABI::Switch); + } + + /// Create a cloner for a continuation lowering. 
+ CoroCloner(Function &OrigF, const Twine &Suffix, coro::Shape &Shape, + Function *NewF, AnyCoroSuspendInst *ActiveSuspend) + : OrigF(OrigF), NewF(NewF), Suffix(Suffix), Shape(Shape), + FKind(Kind::Continuation), Builder(OrigF.getContext()), + ActiveSuspend(ActiveSuspend) { + assert(Shape.ABI == coro::ABI::Retcon || + Shape.ABI == coro::ABI::RetconOnce); + assert(NewF && "need existing function for continuation"); + assert(ActiveSuspend && "need active suspend point for continuation"); + } + + Function *getFunction() const { + assert(NewF != nullptr && "declaration not yet set"); + return NewF; + } + + void create(); + +private: + bool isSwitchDestroyFunction() { + switch (FKind) { + case Kind::Continuation: + case Kind::SwitchResume: + return false; + case Kind::SwitchUnwind: + case Kind::SwitchCleanup: + return true; + } + llvm_unreachable("Unknown CoroCloner::Kind enum"); + } + + void createDeclaration(); + void replaceEntryBlock(); + Value *deriveNewFramePointer(); + void replaceRetconSuspendUses(); + void replaceCoroSuspends(); + void replaceCoroEnds(); + void replaceSwiftErrorOps(); + void handleFinalSuspend(); + void maybeFreeContinuationStorage(); +}; + +} // end anonymous namespace + +static void maybeFreeRetconStorage(IRBuilder<> &Builder, coro::Shape &Shape, + Value *FramePtr, CallGraph *CG) { + assert(Shape.ABI == coro::ABI::Retcon || + Shape.ABI == coro::ABI::RetconOnce); + if (Shape.RetconLowering.IsFrameInlineInStorage) + return; + + Shape.emitDealloc(Builder, FramePtr, CG); +} + +/// Replace a non-unwind call to llvm.coro.end. +static void replaceFallthroughCoroEnd(CoroEndInst *End, coro::Shape &Shape, + Value *FramePtr, bool InResume, + CallGraph *CG) { + // Start inserting right before the coro.end. + IRBuilder<> Builder(End); + + // Create the return instruction. + switch (Shape.ABI) { + // The cloned functions in switch-lowering always return void. + case coro::ABI::Switch: + // coro.end doesn't immediately end the coroutine in the main function + // in this lowering, because we need to deallocate the coroutine. + if (!InResume) + return; + Builder.CreateRetVoid(); + break; + + // In unique continuation lowering, the continuations always return void. + // But we may have implicitly allocated storage. + case coro::ABI::RetconOnce: + maybeFreeRetconStorage(Builder, Shape, FramePtr, CG); + Builder.CreateRetVoid(); + break; + + // In non-unique continuation lowering, we signal completion by returning + // a null continuation. + case coro::ABI::Retcon: { + maybeFreeRetconStorage(Builder, Shape, FramePtr, CG); + auto RetTy = Shape.getResumeFunctionType()->getReturnType(); + auto RetStructTy = dyn_cast(RetTy); + PointerType *ContinuationTy = + cast(RetStructTy ? RetStructTy->getElementType(0) : RetTy); + + Value *ReturnValue = ConstantPointerNull::get(ContinuationTy); + if (RetStructTy) { + ReturnValue = Builder.CreateInsertValue(UndefValue::get(RetStructTy), + ReturnValue, 0); + } + Builder.CreateRet(ReturnValue); + break; + } + } + + // Remove the rest of the block, by splitting it into an unreachable block. + auto *BB = End->getParent(); + BB->splitBasicBlock(End); + BB->getTerminator()->eraseFromParent(); +} + +/// Replace an unwind call to llvm.coro.end. +static void replaceUnwindCoroEnd(CoroEndInst *End, coro::Shape &Shape, + Value *FramePtr, bool InResume, CallGraph *CG){ + IRBuilder<> Builder(End); + + switch (Shape.ABI) { + // In switch-lowering, this does nothing in the main function. 
+ case coro::ABI::Switch: + if (!InResume) + return; + break; + + // In continuation-lowering, this frees the continuation storage. + case coro::ABI::Retcon: + case coro::ABI::RetconOnce: + maybeFreeRetconStorage(Builder, Shape, FramePtr, CG); + break; + } + + // If coro.end has an associated bundle, add cleanupret instruction. + if (auto Bundle = End->getOperandBundle(LLVMContext::OB_funclet)) { + auto *FromPad = cast(Bundle->Inputs[0]); + auto *CleanupRet = Builder.CreateCleanupRet(FromPad, nullptr); + End->getParent()->splitBasicBlock(End); + CleanupRet->getParent()->getTerminator()->eraseFromParent(); + } +} + +static void replaceCoroEnd(CoroEndInst *End, coro::Shape &Shape, + Value *FramePtr, bool InResume, CallGraph *CG) { + if (End->isUnwind()) + replaceUnwindCoroEnd(End, Shape, FramePtr, InResume, CG); + else + replaceFallthroughCoroEnd(End, Shape, FramePtr, InResume, CG); + + auto &Context = End->getContext(); + End->replaceAllUsesWith(InResume ? ConstantInt::getTrue(Context) + : ConstantInt::getFalse(Context)); + End->eraseFromParent(); +} + // Create an entry block for a resume function with a switch that will jump to // suspend points. -static BasicBlock *createResumeEntryBlock(Function &F, coro::Shape &Shape) { +static void createResumeEntryBlock(Function &F, coro::Shape &Shape) { + assert(Shape.ABI == coro::ABI::Switch); LLVMContext &C = F.getContext(); // resume.entry: @@ -91,15 +280,16 @@ static BasicBlock *createResumeEntryBlock(Function &F, coro::Shape &Shape) { IRBuilder<> Builder(NewEntry); auto *FramePtr = Shape.FramePtr; auto *FrameTy = Shape.FrameTy; - auto *GepIndex = Builder.CreateConstInBoundsGEP2_32( - FrameTy, FramePtr, 0, coro::Shape::IndexField, "index.addr"); + auto *GepIndex = Builder.CreateStructGEP( + FrameTy, FramePtr, coro::Shape::SwitchFieldIndex::Index, "index.addr"); auto *Index = Builder.CreateLoad(Shape.getIndexType(), GepIndex, "index"); auto *Switch = Builder.CreateSwitch(Index, UnreachBB, Shape.CoroSuspends.size()); - Shape.ResumeSwitch = Switch; + Shape.SwitchLowering.ResumeSwitch = Switch; size_t SuspendIndex = 0; - for (CoroSuspendInst *S : Shape.CoroSuspends) { + for (auto *AnyS : Shape.CoroSuspends) { + auto *S = cast(AnyS); ConstantInt *IndexVal = Shape.getIndex(SuspendIndex); // Replace CoroSave with a store to Index: @@ -109,14 +299,15 @@ static BasicBlock *createResumeEntryBlock(Function &F, coro::Shape &Shape) { Builder.SetInsertPoint(Save); if (S->isFinal()) { // Final suspend point is represented by storing zero in ResumeFnAddr. 
- auto *GepIndex = Builder.CreateConstInBoundsGEP2_32(FrameTy, FramePtr, 0, - 0, "ResumeFn.addr"); + auto *GepIndex = Builder.CreateStructGEP(FrameTy, FramePtr, + coro::Shape::SwitchFieldIndex::Resume, + "ResumeFn.addr"); auto *NullPtr = ConstantPointerNull::get(cast( cast(GepIndex->getType())->getElementType())); Builder.CreateStore(NullPtr, GepIndex); } else { - auto *GepIndex = Builder.CreateConstInBoundsGEP2_32( - FrameTy, FramePtr, 0, coro::Shape::IndexField, "index.addr"); + auto *GepIndex = Builder.CreateStructGEP( + FrameTy, FramePtr, coro::Shape::SwitchFieldIndex::Index, "index.addr"); Builder.CreateStore(IndexVal, GepIndex); } Save->replaceAllUsesWith(ConstantTokenNone::get(C)); @@ -164,48 +355,9 @@ static BasicBlock *createResumeEntryBlock(Function &F, coro::Shape &Shape) { Builder.SetInsertPoint(UnreachBB); Builder.CreateUnreachable(); - return NewEntry; + Shape.SwitchLowering.ResumeEntryBlock = NewEntry; } -// In Resumers, we replace fallthrough coro.end with ret void and delete the -// rest of the block. -static void replaceFallthroughCoroEnd(IntrinsicInst *End, - ValueToValueMapTy &VMap) { - auto *NewE = cast(VMap[End]); - ReturnInst::Create(NewE->getContext(), nullptr, NewE); - - // Remove the rest of the block, by splitting it into an unreachable block. - auto *BB = NewE->getParent(); - BB->splitBasicBlock(NewE); - BB->getTerminator()->eraseFromParent(); -} - -// In Resumers, we replace unwind coro.end with True to force the immediate -// unwind to caller. -static void replaceUnwindCoroEnds(coro::Shape &Shape, ValueToValueMapTy &VMap) { - if (Shape.CoroEnds.empty()) - return; - - LLVMContext &Context = Shape.CoroEnds.front()->getContext(); - auto *True = ConstantInt::getTrue(Context); - for (CoroEndInst *CE : Shape.CoroEnds) { - if (!CE->isUnwind()) - continue; - - auto *NewCE = cast(VMap[CE]); - - // If coro.end has an associated bundle, add cleanupret instruction. - if (auto Bundle = NewCE->getOperandBundle(LLVMContext::OB_funclet)) { - Value *FromPad = Bundle->Inputs[0]; - auto *CleanupRet = CleanupReturnInst::Create(FromPad, nullptr, NewCE); - NewCE->getParent()->splitBasicBlock(NewCE); - CleanupRet->getParent()->getTerminator()->eraseFromParent(); - } - - NewCE->replaceAllUsesWith(True); - NewCE->eraseFromParent(); - } -} // Rewrite final suspend point handling. We do not use suspend index to // represent the final suspend point. Instead we zero-out ResumeFnAddr in the @@ -216,83 +368,364 @@ static void replaceUnwindCoroEnds(coro::Shape &Shape, ValueToValueMapTy &VMap) { // In the destroy function, we add a code sequence to check if ResumeFnAddress // is Null, and if so, jump to the appropriate label to handle cleanup from the // final suspend point. 
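createResumeEntryBlock, together with the null-resume-pointer convention described above, produces dispatch code whose shape is easy to picture in plain C++. The sketch below is invented (Frame, f_resume and f_destroy are stand-ins, not the generated clones) but mirrors the switch on the saved index and the final-suspend check.

#include <cstdio>

struct Frame {
  void (*ResumeFn)(Frame *);   // nulled out once the final suspend is reached
  int Index;                   // which non-final suspend point is active
};

void f_resume(Frame *F) {      // stands in for the generated f.resume clone
  switch (F->Index) {          // the switch built by createResumeEntryBlock
  case 0: std::printf("resumed at suspend 0\n"); break;
  case 1: std::printf("resumed at suspend 1\n"); break;
  default: break;              // default target is an unreachable block
  }
}

void f_destroy(Frame *F) {     // stands in for the generated f.destroy clone
  if (F->ResumeFn == nullptr) {     // the check added for the final suspend
    std::printf("cleanup from the final suspend point\n");
    return;
  }
  // otherwise dispatch on F->Index just like f_resume, running cleanup code
}

int main() {
  Frame F{f_resume, 1};
  F.ResumeFn(&F);              // coro.resume: prints "resumed at suspend 1"
  F.ResumeFn = nullptr;        // the store performed at the final suspend
  f_destroy(&F);               // prints "cleanup from the final suspend point"
}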
-static void handleFinalSuspend(IRBuilder<> &Builder, Value *FramePtr, - coro::Shape &Shape, SwitchInst *Switch, - bool IsDestroy) { - assert(Shape.HasFinalSuspend); +void CoroCloner::handleFinalSuspend() { + assert(Shape.ABI == coro::ABI::Switch && + Shape.SwitchLowering.HasFinalSuspend); + auto *Switch = cast(VMap[Shape.SwitchLowering.ResumeSwitch]); auto FinalCaseIt = std::prev(Switch->case_end()); BasicBlock *ResumeBB = FinalCaseIt->getCaseSuccessor(); Switch->removeCase(FinalCaseIt); - if (IsDestroy) { + if (isSwitchDestroyFunction()) { BasicBlock *OldSwitchBB = Switch->getParent(); auto *NewSwitchBB = OldSwitchBB->splitBasicBlock(Switch, "Switch"); Builder.SetInsertPoint(OldSwitchBB->getTerminator()); - auto *GepIndex = Builder.CreateConstInBoundsGEP2_32(Shape.FrameTy, FramePtr, - 0, 0, "ResumeFn.addr"); - auto *Load = Builder.CreateLoad( - Shape.FrameTy->getElementType(coro::Shape::ResumeField), GepIndex); - auto *NullPtr = - ConstantPointerNull::get(cast(Load->getType())); - auto *Cond = Builder.CreateICmpEQ(Load, NullPtr); + auto *GepIndex = Builder.CreateStructGEP(Shape.FrameTy, NewFramePtr, + coro::Shape::SwitchFieldIndex::Resume, + "ResumeFn.addr"); + auto *Load = Builder.CreateLoad(Shape.getSwitchResumePointerType(), + GepIndex); + auto *Cond = Builder.CreateIsNull(Load); Builder.CreateCondBr(Cond, ResumeBB, NewSwitchBB); OldSwitchBB->getTerminator()->eraseFromParent(); } } -// Create a resume clone by cloning the body of the original function, setting -// new entry block and replacing coro.suspend an appropriate value to force -// resume or cleanup pass for every suspend point. -static Function *createClone(Function &F, Twine Suffix, coro::Shape &Shape, - BasicBlock *ResumeEntry, int8_t FnIndex) { - Module *M = F.getParent(); - auto *FrameTy = Shape.FrameTy; - auto *FnPtrTy = cast(FrameTy->getElementType(0)); - auto *FnTy = cast(FnPtrTy->getElementType()); +static Function *createCloneDeclaration(Function &OrigF, coro::Shape &Shape, + const Twine &Suffix, + Module::iterator InsertBefore) { + Module *M = OrigF.getParent(); + auto *FnTy = Shape.getResumeFunctionType(); Function *NewF = - Function::Create(FnTy, GlobalValue::LinkageTypes::ExternalLinkage, - F.getName() + Suffix, M); + Function::Create(FnTy, GlobalValue::LinkageTypes::InternalLinkage, + OrigF.getName() + Suffix); NewF->addParamAttr(0, Attribute::NonNull); NewF->addParamAttr(0, Attribute::NoAlias); - ValueToValueMapTy VMap; + M->getFunctionList().insert(InsertBefore, NewF); + + return NewF; +} + +/// Replace uses of the active llvm.coro.suspend.retcon call with the +/// arguments to the continuation function. +/// +/// This assumes that the builder has a meaningful insertion point. +void CoroCloner::replaceRetconSuspendUses() { + assert(Shape.ABI == coro::ABI::Retcon || + Shape.ABI == coro::ABI::RetconOnce); + + auto NewS = VMap[ActiveSuspend]; + if (NewS->use_empty()) return; + + // Copy out all the continuation arguments after the buffer pointer into + // an easily-indexed data structure for convenience. + SmallVector Args; + for (auto I = std::next(NewF->arg_begin()), E = NewF->arg_end(); I != E; ++I) + Args.push_back(&*I); + + // If the suspend returns a single scalar value, we can just do a simple + // replacement. + if (!isa(NewS->getType())) { + assert(Args.size() == 1); + NewS->replaceAllUsesWith(Args.front()); + return; + } + + // Try to peephole extracts of an aggregate return. 
+ for (auto UI = NewS->use_begin(), UE = NewS->use_end(); UI != UE; ) { + auto EVI = dyn_cast((UI++)->getUser()); + if (!EVI || EVI->getNumIndices() != 1) + continue; + + EVI->replaceAllUsesWith(Args[EVI->getIndices().front()]); + EVI->eraseFromParent(); + } + + // If we have no remaining uses, we're done. + if (NewS->use_empty()) return; + + // Otherwise, we need to create an aggregate. + Value *Agg = UndefValue::get(NewS->getType()); + for (size_t I = 0, E = Args.size(); I != E; ++I) + Agg = Builder.CreateInsertValue(Agg, Args[I], I); + + NewS->replaceAllUsesWith(Agg); +} + +void CoroCloner::replaceCoroSuspends() { + Value *SuspendResult; + + switch (Shape.ABI) { + // In switch lowering, replace coro.suspend with the appropriate value + // for the type of function we're extracting. + // Replacing coro.suspend with (0) will result in control flow proceeding to + // a resume label associated with a suspend point, replacing it with (1) will + // result in control flow proceeding to a cleanup label associated with this + // suspend point. + case coro::ABI::Switch: + SuspendResult = Builder.getInt8(isSwitchDestroyFunction() ? 1 : 0); + break; + + // In returned-continuation lowering, the arguments from earlier + // continuations are theoretically arbitrary, and they should have been + // spilled. + case coro::ABI::RetconOnce: + case coro::ABI::Retcon: + return; + } + + for (AnyCoroSuspendInst *CS : Shape.CoroSuspends) { + // The active suspend was handled earlier. + if (CS == ActiveSuspend) continue; + + auto *MappedCS = cast(VMap[CS]); + MappedCS->replaceAllUsesWith(SuspendResult); + MappedCS->eraseFromParent(); + } +} + +void CoroCloner::replaceCoroEnds() { + for (CoroEndInst *CE : Shape.CoroEnds) { + // We use a null call graph because there's no call graph node for + // the cloned function yet. We'll just be rebuilding that later. + auto NewCE = cast(VMap[CE]); + replaceCoroEnd(NewCE, Shape, NewFramePtr, /*in resume*/ true, nullptr); + } +} + +static void replaceSwiftErrorOps(Function &F, coro::Shape &Shape, + ValueToValueMapTy *VMap) { + Value *CachedSlot = nullptr; + auto getSwiftErrorSlot = [&](Type *ValueTy) -> Value * { + if (CachedSlot) { + assert(CachedSlot->getType()->getPointerElementType() == ValueTy && + "multiple swifterror slots in function with different types"); + return CachedSlot; + } + + // Check if the function has a swifterror argument. + for (auto &Arg : F.args()) { + if (Arg.isSwiftError()) { + CachedSlot = &Arg; + assert(Arg.getType()->getPointerElementType() == ValueTy && + "swifterror argument does not have expected type"); + return &Arg; + } + } + + // Create a swifterror alloca. + IRBuilder<> Builder(F.getEntryBlock().getFirstNonPHIOrDbg()); + auto Alloca = Builder.CreateAlloca(ValueTy); + Alloca->setSwiftError(true); + + CachedSlot = Alloca; + return Alloca; + }; + + for (CallInst *Op : Shape.SwiftErrorOps) { + auto MappedOp = VMap ? cast((*VMap)[Op]) : Op; + IRBuilder<> Builder(MappedOp); + + // If there are no arguments, this is a 'get' operation. 
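Substituting the constants 0 and 1 in replaceCoroSuspends is what turns one cloned body into distinct resume and cleanup functions: each clone sees a fixed suspend result, and later simplification folds away the other path. A tiny invented analogue, using a template parameter where the cloned IR has a constant-folded value:

#include <cstdio>

template <int SuspendResult>         // 0 in f.resume, 1 in f.destroy/cleanup
void clonedBody() {
  // The conditional that originally tested llvm.coro.suspend now tests a
  // constant, so only one branch survives in each specialization.
  if (SuspendResult == 0)
    std::printf("resume path\n");
  else
    std::printf("cleanup path\n");
}

int main() {
  clonedBody<0>();   // the resume clone
  clonedBody<1>();   // the destroy and cleanup clones
}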
+ Value *MappedResult; + if (Op->getNumArgOperands() == 0) { + auto ValueTy = Op->getType(); + auto Slot = getSwiftErrorSlot(ValueTy); + MappedResult = Builder.CreateLoad(ValueTy, Slot); + } else { + assert(Op->getNumArgOperands() == 1); + auto Value = MappedOp->getArgOperand(0); + auto ValueTy = Value->getType(); + auto Slot = getSwiftErrorSlot(ValueTy); + Builder.CreateStore(Value, Slot); + MappedResult = Slot; + } + + MappedOp->replaceAllUsesWith(MappedResult); + MappedOp->eraseFromParent(); + } + + // If we're updating the original function, we've invalidated SwiftErrorOps. + if (VMap == nullptr) { + Shape.SwiftErrorOps.clear(); + } +} + +void CoroCloner::replaceSwiftErrorOps() { + ::replaceSwiftErrorOps(*NewF, Shape, &VMap); +} + +void CoroCloner::replaceEntryBlock() { + // In the original function, the AllocaSpillBlock is a block immediately + // following the allocation of the frame object which defines GEPs for + // all the allocas that have been moved into the frame, and it ends by + // branching to the original beginning of the coroutine. Make this + // the entry block of the cloned function. + auto *Entry = cast(VMap[Shape.AllocaSpillBlock]); + Entry->setName("entry" + Suffix); + Entry->moveBefore(&NewF->getEntryBlock()); + Entry->getTerminator()->eraseFromParent(); + + // Clear all predecessors of the new entry block. There should be + // exactly one predecessor, which we created when splitting out + // AllocaSpillBlock to begin with. + assert(Entry->hasOneUse()); + auto BranchToEntry = cast(Entry->user_back()); + assert(BranchToEntry->isUnconditional()); + Builder.SetInsertPoint(BranchToEntry); + Builder.CreateUnreachable(); + BranchToEntry->eraseFromParent(); + + // TODO: move any allocas into Entry that weren't moved into the frame. + // (Currently we move all allocas into the frame.) + + // Branch from the entry to the appropriate place. + Builder.SetInsertPoint(Entry); + switch (Shape.ABI) { + case coro::ABI::Switch: { + // In switch-lowering, we built a resume-entry block in the original + // function. Make the entry block branch to this. + auto *SwitchBB = + cast(VMap[Shape.SwitchLowering.ResumeEntryBlock]); + Builder.CreateBr(SwitchBB); + break; + } + + case coro::ABI::Retcon: + case coro::ABI::RetconOnce: { + // In continuation ABIs, we want to branch to immediately after the + // active suspend point. Earlier phases will have put the suspend in its + // own basic block, so just thread our jump directly to its successor. + auto MappedCS = cast(VMap[ActiveSuspend]); + auto Branch = cast(MappedCS->getNextNode()); + assert(Branch->isUnconditional()); + Builder.CreateBr(Branch->getSuccessor(0)); + break; + } + } +} + +/// Derive the value of the new frame pointer. +Value *CoroCloner::deriveNewFramePointer() { + // Builder should be inserting to the front of the new entry block. + + switch (Shape.ABI) { + // In switch-lowering, the argument is the frame pointer. + case coro::ABI::Switch: + return &*NewF->arg_begin(); + + // In continuation-lowering, the argument is the opaque storage. + case coro::ABI::Retcon: + case coro::ABI::RetconOnce: { + Argument *NewStorage = &*NewF->arg_begin(); + auto FramePtrTy = Shape.FrameTy->getPointerTo(); + + // If the storage is inline, just bitcast to the storage to the frame type. + if (Shape.RetconLowering.IsFrameInlineInStorage) + return Builder.CreateBitCast(NewStorage, FramePtrTy); + + // Otherwise, load the real frame from the opaque storage. 
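The two cases above mirror the IsFrameInlineInStorage flag computed in buildFrameType: if the frame fits the caller-provided buffer in both size and alignment, the buffer is simply reinterpreted as the frame; otherwise the buffer holds a pointer to a separately allocated frame. Here is a standalone sketch, with an invented Frame type and with constants standing in for the size and alignment arguments of llvm.coro.id.retcon.

#include <cstddef>
#include <cstdio>
#include <new>

struct Frame { double Spill; int Index; };

constexpr std::size_t StorageSize  = 32;   // size argument of coro.id.retcon
constexpr std::size_t StorageAlign = 8;    // alignment argument

alignas(StorageAlign) unsigned char Storage[StorageSize];

Frame *frameFromStorage(void *Buffer) {
  constexpr bool InlineInStorage =
      sizeof(Frame) <= StorageSize && alignof(Frame) <= StorageAlign;
  if (InlineInStorage)
    return static_cast<Frame *>(Buffer);   // just a bitcast of the buffer
  // Out-of-line case: the buffer only holds a pointer to the real frame.
  return *static_cast<Frame **>(Buffer);
}

int main() {
  new (Storage) Frame{3.14, 1};            // inline case holds for this Frame
  std::printf("index %d\n", frameFromStorage(Storage)->Index);   // prints 1
}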
+ auto FramePtrPtr = + Builder.CreateBitCast(NewStorage, FramePtrTy->getPointerTo()); + return Builder.CreateLoad(FramePtrPtr); + } + } + llvm_unreachable("bad ABI"); +} + +/// Clone the body of the original function into a resume function of +/// some sort. +void CoroCloner::create() { + // Create the new function if we don't already have one. + if (!NewF) { + NewF = createCloneDeclaration(OrigF, Shape, Suffix, + OrigF.getParent()->end()); + } + // Replace all args with undefs. The buildCoroutineFrame algorithm already // rewritten access to the args that occurs after suspend points with loads // and stores to/from the coroutine frame. - for (Argument &A : F.args()) + for (Argument &A : OrigF.args()) VMap[&A] = UndefValue::get(A.getType()); SmallVector Returns; - CloneFunctionInto(NewF, &F, VMap, /*ModuleLevelChanges=*/true, Returns); - NewF->setLinkage(GlobalValue::LinkageTypes::InternalLinkage); + // Ignore attempts to change certain attributes of the function. + // TODO: maybe there should be a way to suppress this during cloning? + auto savedVisibility = NewF->getVisibility(); + auto savedUnnamedAddr = NewF->getUnnamedAddr(); + auto savedDLLStorageClass = NewF->getDLLStorageClass(); + + // NewF's linkage (which CloneFunctionInto does *not* change) might not + // be compatible with the visibility of OrigF (which it *does* change), + // so protect against that. + auto savedLinkage = NewF->getLinkage(); + NewF->setLinkage(llvm::GlobalValue::ExternalLinkage); + + CloneFunctionInto(NewF, &OrigF, VMap, /*ModuleLevelChanges=*/true, Returns); + + NewF->setLinkage(savedLinkage); + NewF->setVisibility(savedVisibility); + NewF->setUnnamedAddr(savedUnnamedAddr); + NewF->setDLLStorageClass(savedDLLStorageClass); + + auto &Context = NewF->getContext(); + + // Replace the attributes of the new function: + auto OrigAttrs = NewF->getAttributes(); + auto NewAttrs = AttributeList(); + + switch (Shape.ABI) { + case coro::ABI::Switch: + // Bootstrap attributes by copying function attributes from the + // original function. This should include optimization settings and so on. + NewAttrs = NewAttrs.addAttributes(Context, AttributeList::FunctionIndex, + OrigAttrs.getFnAttributes()); + break; + + case coro::ABI::Retcon: + case coro::ABI::RetconOnce: + // If we have a continuation prototype, just use its attributes, + // full-stop. + NewAttrs = Shape.RetconLowering.ResumePrototype->getAttributes(); + break; + } - // Remove old returns. - for (ReturnInst *Return : Returns) - changeToUnreachable(Return, /*UseLLVMTrap=*/false); + // Make the frame parameter nonnull and noalias. + NewAttrs = NewAttrs.addParamAttribute(Context, 0, Attribute::NonNull); + NewAttrs = NewAttrs.addParamAttribute(Context, 0, Attribute::NoAlias); + + switch (Shape.ABI) { + // In these ABIs, the cloned functions always return 'void', and the + // existing return sites are meaningless. Note that for unique + // continuations, this includes the returns associated with suspends; + // this is fine because we can't suspend twice. + case coro::ABI::Switch: + case coro::ABI::RetconOnce: + // Remove old returns. + for (ReturnInst *Return : Returns) + changeToUnreachable(Return, /*UseLLVMTrap=*/false); + break; + + // With multi-suspend continuations, we'll already have eliminated the + // original returns and inserted returns before all the suspend points, + // so we want to leave any returns in place. + case coro::ABI::Retcon: + break; + } - // Remove old return attributes. 
- NewF->removeAttributes( - AttributeList::ReturnIndex, - AttributeFuncs::typeIncompatible(NewF->getReturnType())); + NewF->setAttributes(NewAttrs); + NewF->setCallingConv(Shape.getResumeFunctionCC()); - // Make AllocaSpillBlock the new entry block. - auto *SwitchBB = cast(VMap[ResumeEntry]); - auto *Entry = cast(VMap[Shape.AllocaSpillBlock]); - Entry->moveBefore(&NewF->getEntryBlock()); - Entry->getTerminator()->eraseFromParent(); - BranchInst::Create(SwitchBB, Entry); - Entry->setName("entry" + Suffix); + // Set up the new entry block. + replaceEntryBlock(); - // Clear all predecessors of the new entry block. - auto *Switch = cast(VMap[Shape.ResumeSwitch]); - Entry->replaceAllUsesWith(Switch->getDefaultDest()); - - IRBuilder<> Builder(&NewF->getEntryBlock().front()); + Builder.SetInsertPoint(&NewF->getEntryBlock().front()); + NewFramePtr = deriveNewFramePointer(); // Remap frame pointer. - Argument *NewFramePtr = &*NewF->arg_begin(); - Value *OldFramePtr = cast(VMap[Shape.FramePtr]); + Value *OldFramePtr = VMap[Shape.FramePtr]; NewFramePtr->takeName(OldFramePtr); OldFramePtr->replaceAllUsesWith(NewFramePtr); @@ -302,50 +735,55 @@ static Function *createClone(Function &F, Twine Suffix, coro::Shape &Shape, Value *OldVFrame = cast(VMap[Shape.CoroBegin]); OldVFrame->replaceAllUsesWith(NewVFrame); - // Rewrite final suspend handling as it is not done via switch (allows to - // remove final case from the switch, since it is undefined behavior to resume - // the coroutine suspended at the final suspend point. - if (Shape.HasFinalSuspend) { - auto *Switch = cast(VMap[Shape.ResumeSwitch]); - bool IsDestroy = FnIndex != 0; - handleFinalSuspend(Builder, NewFramePtr, Shape, Switch, IsDestroy); + switch (Shape.ABI) { + case coro::ABI::Switch: + // Rewrite final suspend handling as it is not done via switch (allows to + // remove final case from the switch, since it is undefined behavior to + // resume the coroutine suspended at the final suspend point. + if (Shape.SwitchLowering.HasFinalSuspend) + handleFinalSuspend(); + break; + + case coro::ABI::Retcon: + case coro::ABI::RetconOnce: + // Replace uses of the active suspend with the corresponding + // continuation-function arguments. + assert(ActiveSuspend != nullptr && + "no active suspend when lowering a continuation-style coroutine"); + replaceRetconSuspendUses(); + break; } - // Replace coro suspend with the appropriate resume index. - // Replacing coro.suspend with (0) will result in control flow proceeding to - // a resume label associated with a suspend point, replacing it with (1) will - // result in control flow proceeding to a cleanup label associated with this - // suspend point. - auto *NewValue = Builder.getInt8(FnIndex ? 1 : 0); - for (CoroSuspendInst *CS : Shape.CoroSuspends) { - auto *MappedCS = cast(VMap[CS]); - MappedCS->replaceAllUsesWith(NewValue); - MappedCS->eraseFromParent(); - } + // Handle suspends. + replaceCoroSuspends(); + + // Handle swifterror. + replaceSwiftErrorOps(); // Remove coro.end intrinsics. - replaceFallthroughCoroEnd(Shape.CoroEnds.front(), VMap); - replaceUnwindCoroEnds(Shape, VMap); + replaceCoroEnds(); + // Eliminate coro.free from the clones, replacing it with 'null' in cleanup, // to suppress deallocation code. 
- coro::replaceCoroFree(cast(VMap[Shape.CoroBegin->getId()]), - /*Elide=*/FnIndex == 2); - - NewF->setCallingConv(CallingConv::Fast); - - return NewF; + if (Shape.ABI == coro::ABI::Switch) + coro::replaceCoroFree(cast(VMap[Shape.CoroBegin->getId()]), + /*Elide=*/ FKind == CoroCloner::Kind::SwitchCleanup); } -static void removeCoroEnds(coro::Shape &Shape) { - if (Shape.CoroEnds.empty()) - return; - - LLVMContext &Context = Shape.CoroEnds.front()->getContext(); - auto *False = ConstantInt::getFalse(Context); +// Create a resume clone by cloning the body of the original function, setting +// new entry block and replacing coro.suspend an appropriate value to force +// resume or cleanup pass for every suspend point. +static Function *createClone(Function &F, const Twine &Suffix, + coro::Shape &Shape, CoroCloner::Kind FKind) { + CoroCloner Cloner(F, Suffix, Shape, FKind); + Cloner.create(); + return Cloner.getFunction(); +} - for (CoroEndInst *CE : Shape.CoroEnds) { - CE->replaceAllUsesWith(False); - CE->eraseFromParent(); +/// Remove calls to llvm.coro.end in the original function. +static void removeCoroEnds(coro::Shape &Shape, CallGraph *CG) { + for (auto End : Shape.CoroEnds) { + replaceCoroEnd(End, Shape, Shape.FramePtr, /*in resume*/ false, CG); } } @@ -377,8 +815,12 @@ static void replaceFrameSize(coro::Shape &Shape) { // i8* bitcast([2 x void(%f.frame*)*] * @f.resumers to i8*)) // // Assumes that all the functions have the same signature. -static void setCoroInfo(Function &F, CoroBeginInst *CoroBegin, - std::initializer_list Fns) { +static void setCoroInfo(Function &F, coro::Shape &Shape, + ArrayRef Fns) { + // This only works under the switch-lowering ABI because coro elision + // only works on the switch-lowering ABI. + assert(Shape.ABI == coro::ABI::Switch); + SmallVector Args(Fns.begin(), Fns.end()); assert(!Args.empty()); Function *Part = *Fns.begin(); @@ -393,38 +835,45 @@ static void setCoroInfo(Function &F, CoroBeginInst *CoroBegin, // Update coro.begin instruction to refer to this constant. LLVMContext &C = F.getContext(); auto *BC = ConstantExpr::getPointerCast(GV, Type::getInt8PtrTy(C)); - CoroBegin->getId()->setInfo(BC); + Shape.getSwitchCoroId()->setInfo(BC); } // Store addresses of Resume/Destroy/Cleanup functions in the coroutine frame. static void updateCoroFrame(coro::Shape &Shape, Function *ResumeFn, Function *DestroyFn, Function *CleanupFn) { + assert(Shape.ABI == coro::ABI::Switch); + IRBuilder<> Builder(Shape.FramePtr->getNextNode()); - auto *ResumeAddr = Builder.CreateConstInBoundsGEP2_32( - Shape.FrameTy, Shape.FramePtr, 0, coro::Shape::ResumeField, + auto *ResumeAddr = Builder.CreateStructGEP( + Shape.FrameTy, Shape.FramePtr, coro::Shape::SwitchFieldIndex::Resume, "resume.addr"); Builder.CreateStore(ResumeFn, ResumeAddr); Value *DestroyOrCleanupFn = DestroyFn; - CoroIdInst *CoroId = Shape.CoroBegin->getId(); + CoroIdInst *CoroId = Shape.getSwitchCoroId(); if (CoroAllocInst *CA = CoroId->getCoroAlloc()) { // If there is a CoroAlloc and it returns false (meaning we elide the // allocation, use CleanupFn instead of DestroyFn). 
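    // The select emitted below looks roughly like this (names hypothetical):
    //   %fn = select i1 %coro.alloc, void (%f.frame*)* @f.destroy,
    //                                void (%f.frame*)* @f.cleanup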
DestroyOrCleanupFn = Builder.CreateSelect(CA, DestroyFn, CleanupFn); } - auto *DestroyAddr = Builder.CreateConstInBoundsGEP2_32( - Shape.FrameTy, Shape.FramePtr, 0, coro::Shape::DestroyField, + auto *DestroyAddr = Builder.CreateStructGEP( + Shape.FrameTy, Shape.FramePtr, coro::Shape::SwitchFieldIndex::Destroy, "destroy.addr"); Builder.CreateStore(DestroyOrCleanupFn, DestroyAddr); } static void postSplitCleanup(Function &F) { removeUnreachableBlocks(F); + + // For now, we do a mandatory verification step because we don't + // entirely trust this pass. Note that we don't want to add a verifier + // pass to FPM below because it will also verify all the global data. + verifyFunction(F); + legacy::FunctionPassManager FPM(F.getParent()); - FPM.add(createVerifierPass()); FPM.add(createSCCPPass()); FPM.add(createCFGSimplificationPass()); FPM.add(createEarlyCSEPass()); @@ -520,21 +969,34 @@ static void addMustTailToCoroResumes(Function &F) { // Coroutine has no suspend points. Remove heap allocation for the coroutine // frame if possible. -static void handleNoSuspendCoroutine(CoroBeginInst *CoroBegin, Type *FrameTy) { +static void handleNoSuspendCoroutine(coro::Shape &Shape) { + auto *CoroBegin = Shape.CoroBegin; auto *CoroId = CoroBegin->getId(); auto *AllocInst = CoroId->getCoroAlloc(); - coro::replaceCoroFree(CoroId, /*Elide=*/AllocInst != nullptr); - if (AllocInst) { - IRBuilder<> Builder(AllocInst); - // FIXME: Need to handle overaligned members. - auto *Frame = Builder.CreateAlloca(FrameTy); - auto *VFrame = Builder.CreateBitCast(Frame, Builder.getInt8PtrTy()); - AllocInst->replaceAllUsesWith(Builder.getFalse()); - AllocInst->eraseFromParent(); - CoroBegin->replaceAllUsesWith(VFrame); - } else { - CoroBegin->replaceAllUsesWith(CoroBegin->getMem()); + switch (Shape.ABI) { + case coro::ABI::Switch: { + auto SwitchId = cast(CoroId); + coro::replaceCoroFree(SwitchId, /*Elide=*/AllocInst != nullptr); + if (AllocInst) { + IRBuilder<> Builder(AllocInst); + // FIXME: Need to handle overaligned members. + auto *Frame = Builder.CreateAlloca(Shape.FrameTy); + auto *VFrame = Builder.CreateBitCast(Frame, Builder.getInt8PtrTy()); + AllocInst->replaceAllUsesWith(Builder.getFalse()); + AllocInst->eraseFromParent(); + CoroBegin->replaceAllUsesWith(VFrame); + } else { + CoroBegin->replaceAllUsesWith(CoroBegin->getMem()); + } + break; + } + + case coro::ABI::Retcon: + case coro::ABI::RetconOnce: + CoroBegin->replaceAllUsesWith(UndefValue::get(CoroBegin->getType())); + break; } + CoroBegin->eraseFromParent(); } @@ -670,12 +1132,16 @@ static bool simplifySuspendPoint(CoroSuspendInst *Suspend, // Remove suspend points that are simplified. static void simplifySuspendPoints(coro::Shape &Shape) { + // Currently, the only simplification we do is switch-lowering-specific. + if (Shape.ABI != coro::ABI::Switch) + return; + auto &S = Shape.CoroSuspends; size_t I = 0, N = S.size(); if (N == 0) return; while (true) { - if (simplifySuspendPoint(S[I], Shape.CoroBegin)) { + if (simplifySuspendPoint(cast(S[I]), Shape.CoroBegin)) { if (--N == I) break; std::swap(S[I], S[N]); @@ -687,142 +1153,227 @@ static void simplifySuspendPoints(coro::Shape &Shape) { S.resize(N); } -static SmallPtrSet getCoroBeginPredBlocks(CoroBeginInst *CB) { - // Collect all blocks that we need to look for instructions to relocate. 
- SmallPtrSet RelocBlocks; - SmallVector Work; - Work.push_back(CB->getParent()); +static void splitSwitchCoroutine(Function &F, coro::Shape &Shape, + SmallVectorImpl &Clones) { + assert(Shape.ABI == coro::ABI::Switch); - do { - BasicBlock *Current = Work.pop_back_val(); - for (BasicBlock *BB : predecessors(Current)) - if (RelocBlocks.count(BB) == 0) { - RelocBlocks.insert(BB); - Work.push_back(BB); - } - } while (!Work.empty()); - return RelocBlocks; -} - -static SmallPtrSet -getNotRelocatableInstructions(CoroBeginInst *CoroBegin, - SmallPtrSetImpl &RelocBlocks) { - SmallPtrSet DoNotRelocate; - // Collect all instructions that we should not relocate - SmallVector Work; - - // Start with CoroBegin and terminators of all preceding blocks. - Work.push_back(CoroBegin); - BasicBlock *CoroBeginBB = CoroBegin->getParent(); - for (BasicBlock *BB : RelocBlocks) - if (BB != CoroBeginBB) - Work.push_back(BB->getTerminator()); - - // For every instruction in the Work list, place its operands in DoNotRelocate - // set. - do { - Instruction *Current = Work.pop_back_val(); - LLVM_DEBUG(dbgs() << "CoroSplit: Will not relocate: " << *Current << "\n"); - DoNotRelocate.insert(Current); - for (Value *U : Current->operands()) { - auto *I = dyn_cast(U); - if (!I) - continue; + createResumeEntryBlock(F, Shape); + auto ResumeClone = createClone(F, ".resume", Shape, + CoroCloner::Kind::SwitchResume); + auto DestroyClone = createClone(F, ".destroy", Shape, + CoroCloner::Kind::SwitchUnwind); + auto CleanupClone = createClone(F, ".cleanup", Shape, + CoroCloner::Kind::SwitchCleanup); - if (auto *A = dyn_cast(I)) { - // Stores to alloca instructions that occur before the coroutine frame - // is allocated should not be moved; the stored values may be used by - // the coroutine frame allocator. The operands to those stores must also - // remain in place. - for (const auto &User : A->users()) - if (auto *SI = dyn_cast(User)) - if (RelocBlocks.count(SI->getParent()) != 0 && - DoNotRelocate.count(SI) == 0) { - Work.push_back(SI); - DoNotRelocate.insert(SI); - } - continue; - } + postSplitCleanup(*ResumeClone); + postSplitCleanup(*DestroyClone); + postSplitCleanup(*CleanupClone); + + addMustTailToCoroResumes(*ResumeClone); + + // Store addresses resume/destroy/cleanup functions in the coroutine frame. + updateCoroFrame(Shape, ResumeClone, DestroyClone, CleanupClone); + + assert(Clones.empty()); + Clones.push_back(ResumeClone); + Clones.push_back(DestroyClone); + Clones.push_back(CleanupClone); + + // Create a constant array referring to resume/destroy/clone functions pointed + // by the last argument of @llvm.coro.info, so that CoroElide pass can + // determined correct function to call. + setCoroInfo(F, Shape, Clones); +} - if (DoNotRelocate.count(I) == 0) { - Work.push_back(I); - DoNotRelocate.insert(I); +static void splitRetconCoroutine(Function &F, coro::Shape &Shape, + SmallVectorImpl &Clones) { + assert(Shape.ABI == coro::ABI::Retcon || + Shape.ABI == coro::ABI::RetconOnce); + assert(Clones.empty()); + + // Reset various things that the optimizer might have decided it + // "knows" about the coroutine function due to not seeing a return. + F.removeFnAttr(Attribute::NoReturn); + F.removeAttribute(AttributeList::ReturnIndex, Attribute::NoAlias); + F.removeAttribute(AttributeList::ReturnIndex, Attribute::NonNull); + + // Allocate the frame. 
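  // A sketch of the non-inline case handled below, with a hypothetical
  // allocator name:
  //   %frame = call i8* @my_alloc(i64 <frame size>)
  //   %slot  = bitcast i8* %storage to i8**
  //   store i8* %frame, i8** %slot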
+ auto *Id = cast(Shape.CoroBegin->getId()); + Value *RawFramePtr; + if (Shape.RetconLowering.IsFrameInlineInStorage) { + RawFramePtr = Id->getStorage(); + } else { + IRBuilder<> Builder(Id); + + // Determine the size of the frame. + const DataLayout &DL = F.getParent()->getDataLayout(); + auto Size = DL.getTypeAllocSize(Shape.FrameTy); + + // Allocate. We don't need to update the call graph node because we're + // going to recompute it from scratch after splitting. + RawFramePtr = Shape.emitAlloc(Builder, Builder.getInt64(Size), nullptr); + RawFramePtr = + Builder.CreateBitCast(RawFramePtr, Shape.CoroBegin->getType()); + + // Stash the allocated frame pointer in the continuation storage. + auto Dest = Builder.CreateBitCast(Id->getStorage(), + RawFramePtr->getType()->getPointerTo()); + Builder.CreateStore(RawFramePtr, Dest); + } + + // Map all uses of llvm.coro.begin to the allocated frame pointer. + { + // Make sure we don't invalidate Shape.FramePtr. + TrackingVH Handle(Shape.FramePtr); + Shape.CoroBegin->replaceAllUsesWith(RawFramePtr); + Shape.FramePtr = Handle.getValPtr(); + } + + // Create a unique return block. + BasicBlock *ReturnBB = nullptr; + SmallVector ReturnPHIs; + + // Create all the functions in order after the main function. + auto NextF = std::next(F.getIterator()); + + // Create a continuation function for each of the suspend points. + Clones.reserve(Shape.CoroSuspends.size()); + for (size_t i = 0, e = Shape.CoroSuspends.size(); i != e; ++i) { + auto Suspend = cast(Shape.CoroSuspends[i]); + + // Create the clone declaration. + auto Continuation = + createCloneDeclaration(F, Shape, ".resume." + Twine(i), NextF); + Clones.push_back(Continuation); + + // Insert a branch to the unified return block immediately before + // the suspend point. + auto SuspendBB = Suspend->getParent(); + auto NewSuspendBB = SuspendBB->splitBasicBlock(Suspend); + auto Branch = cast(SuspendBB->getTerminator()); + + // Create the unified return block. + if (!ReturnBB) { + // Place it before the first suspend. + ReturnBB = BasicBlock::Create(F.getContext(), "coro.return", &F, + NewSuspendBB); + Shape.RetconLowering.ReturnBlock = ReturnBB; + + IRBuilder<> Builder(ReturnBB); + + // Create PHIs for all the return values. + assert(ReturnPHIs.empty()); + + // First, the continuation. + ReturnPHIs.push_back(Builder.CreatePHI(Continuation->getType(), + Shape.CoroSuspends.size())); + + // Next, all the directly-yielded values. + for (auto ResultTy : Shape.getRetconResultTypes()) + ReturnPHIs.push_back(Builder.CreatePHI(ResultTy, + Shape.CoroSuspends.size())); + + // Build the return value. + auto RetTy = F.getReturnType(); + + // Cast the continuation value if necessary. + // We can't rely on the types matching up because that type would + // have to be infinite. + auto CastedContinuationTy = + (ReturnPHIs.size() == 1 ? 
RetTy : RetTy->getStructElementType(0)); + auto *CastedContinuation = + Builder.CreateBitCast(ReturnPHIs[0], CastedContinuationTy); + + Value *RetV; + if (ReturnPHIs.size() == 1) { + RetV = CastedContinuation; + } else { + RetV = UndefValue::get(RetTy); + RetV = Builder.CreateInsertValue(RetV, CastedContinuation, 0); + for (size_t I = 1, E = ReturnPHIs.size(); I != E; ++I) + RetV = Builder.CreateInsertValue(RetV, ReturnPHIs[I], I); } + + Builder.CreateRet(RetV); } - } while (!Work.empty()); - return DoNotRelocate; -} -static void relocateInstructionBefore(CoroBeginInst *CoroBegin, Function &F) { - // Analyze which non-alloca instructions are needed for allocation and - // relocate the rest to after coro.begin. We need to do it, since some of the - // targets of those instructions may be placed into coroutine frame memory - // for which becomes available after coro.begin intrinsic. + // Branch to the return block. + Branch->setSuccessor(0, ReturnBB); + ReturnPHIs[0]->addIncoming(Continuation, SuspendBB); + size_t NextPHIIndex = 1; + for (auto &VUse : Suspend->value_operands()) + ReturnPHIs[NextPHIIndex++]->addIncoming(&*VUse, SuspendBB); + assert(NextPHIIndex == ReturnPHIs.size()); + } - auto BlockSet = getCoroBeginPredBlocks(CoroBegin); - auto DoNotRelocateSet = getNotRelocatableInstructions(CoroBegin, BlockSet); + assert(Clones.size() == Shape.CoroSuspends.size()); + for (size_t i = 0, e = Shape.CoroSuspends.size(); i != e; ++i) { + auto Suspend = Shape.CoroSuspends[i]; + auto Clone = Clones[i]; - Instruction *InsertPt = CoroBegin->getNextNode(); - BasicBlock &BB = F.getEntryBlock(); // TODO: Look at other blocks as well. - for (auto B = BB.begin(), E = BB.end(); B != E;) { - Instruction &I = *B++; - if (isa(&I)) - continue; - if (&I == CoroBegin) - break; - if (DoNotRelocateSet.count(&I)) - continue; - I.moveBefore(InsertPt); + CoroCloner(F, "resume." + Twine(i), Shape, Clone, Suspend).create(); + } +} + +namespace { + class PrettyStackTraceFunction : public PrettyStackTraceEntry { + Function &F; + public: + PrettyStackTraceFunction(Function &F) : F(F) {} + void print(raw_ostream &OS) const override { + OS << "While splitting coroutine "; + F.printAsOperand(OS, /*print type*/ false, F.getParent()); + OS << "\n"; + } + }; +} + +static void splitCoroutine(Function &F, coro::Shape &Shape, + SmallVectorImpl &Clones) { + switch (Shape.ABI) { + case coro::ABI::Switch: + return splitSwitchCoroutine(F, Shape, Clones); + case coro::ABI::Retcon: + case coro::ABI::RetconOnce: + return splitRetconCoroutine(F, Shape, Clones); } + llvm_unreachable("bad ABI kind"); } static void splitCoroutine(Function &F, CallGraph &CG, CallGraphSCC &SCC) { - EliminateUnreachableBlocks(F); + PrettyStackTraceFunction prettyStackTrace(F); + + // The suspend-crossing algorithm in buildCoroutineFrame get tripped + // up by uses in unreachable blocks, so remove them as a first pass. + removeUnreachableBlocks(F); coro::Shape Shape(F); if (!Shape.CoroBegin) return; simplifySuspendPoints(Shape); - relocateInstructionBefore(Shape.CoroBegin, F); buildCoroutineFrame(F, Shape); replaceFrameSize(Shape); + SmallVector Clones; + // If there are no suspend points, no split required, just remove // the allocation and deallocation blocks, they are not needed. 
if (Shape.CoroSuspends.empty()) { - handleNoSuspendCoroutine(Shape.CoroBegin, Shape.FrameTy); - removeCoroEnds(Shape); - postSplitCleanup(F); - coro::updateCallGraph(F, {}, CG, SCC); - return; + handleNoSuspendCoroutine(Shape); + } else { + splitCoroutine(F, Shape, Clones); } - auto *ResumeEntry = createResumeEntryBlock(F, Shape); - auto ResumeClone = createClone(F, ".resume", Shape, ResumeEntry, 0); - auto DestroyClone = createClone(F, ".destroy", Shape, ResumeEntry, 1); - auto CleanupClone = createClone(F, ".cleanup", Shape, ResumeEntry, 2); - - // We no longer need coro.end in F. - removeCoroEnds(Shape); + // Replace all the swifterror operations in the original function. + // This invalidates SwiftErrorOps in the Shape. + replaceSwiftErrorOps(F, Shape, nullptr); + removeCoroEnds(Shape, &CG); postSplitCleanup(F); - postSplitCleanup(*ResumeClone); - postSplitCleanup(*DestroyClone); - postSplitCleanup(*CleanupClone); - - addMustTailToCoroResumes(*ResumeClone); - - // Store addresses resume/destroy/cleanup functions in the coroutine frame. - updateCoroFrame(Shape, ResumeClone, DestroyClone, CleanupClone); - - // Create a constant array referring to resume/destroy/clone functions pointed - // by the last argument of @llvm.coro.info, so that CoroElide pass can - // determined correct function to call. - setCoroInfo(F, Shape.CoroBegin, {ResumeClone, DestroyClone, CleanupClone}); // Update call graph and add the functions we created to the SCC. - coro::updateCallGraph(F, {ResumeClone, DestroyClone, CleanupClone}, CG, SCC); + coro::updateCallGraph(F, Clones, CG, SCC); } // When we see the coroutine the first time, we insert an indirect call to a @@ -881,6 +1432,80 @@ static void createDevirtTriggerFunc(CallGraph &CG, CallGraphSCC &SCC) { SCC.initialize(Nodes); } +/// Replace a call to llvm.coro.prepare.retcon. +static void replacePrepare(CallInst *Prepare, CallGraph &CG) { + auto CastFn = Prepare->getArgOperand(0); // as an i8* + auto Fn = CastFn->stripPointerCasts(); // as its original type + + // Find call graph nodes for the preparation. + CallGraphNode *PrepareUserNode = nullptr, *FnNode = nullptr; + if (auto ConcreteFn = dyn_cast(Fn)) { + PrepareUserNode = CG[Prepare->getFunction()]; + FnNode = CG[ConcreteFn]; + } + + // Attempt to peephole this pattern: + // %0 = bitcast [[TYPE]] @some_function to i8* + // %1 = call @llvm.coro.prepare.retcon(i8* %0) + // %2 = bitcast %1 to [[TYPE]] + // ==> + // %2 = @some_function + for (auto UI = Prepare->use_begin(), UE = Prepare->use_end(); + UI != UE; ) { + // Look for bitcasts back to the original function type. + auto *Cast = dyn_cast((UI++)->getUser()); + if (!Cast || Cast->getType() != Fn->getType()) continue; + + // Check whether the replacement will introduce new direct calls. + // If so, we'll need to update the call graph. + if (PrepareUserNode) { + for (auto &Use : Cast->uses()) { + if (auto *CB = dyn_cast(Use.getUser())) { + if (!CB->isCallee(&Use)) + continue; + PrepareUserNode->removeCallEdgeFor(*CB); + PrepareUserNode->addCalledFunction(CB, FnNode); + } + } + } + + // Replace and remove the cast. + Cast->replaceAllUsesWith(Fn); + Cast->eraseFromParent(); + } + + // Replace any remaining uses with the function as an i8*. + // This can never directly be a callee, so we don't need to update CG. + Prepare->replaceAllUsesWith(CastFn); + Prepare->eraseFromParent(); + + // Kill dead bitcasts. 
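  // For illustration, the cast that originally fed the prepare call is
  // typically dead at this point and gets erased:
  //   %0 = bitcast void (i8*)* @some_function to i8*   ; no remaining uses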
+ while (auto *Cast = dyn_cast(CastFn)) { + if (!Cast->use_empty()) break; + CastFn = Cast->getOperand(0); + Cast->eraseFromParent(); + } +} + +/// Remove calls to llvm.coro.prepare.retcon, a barrier meant to prevent +/// IPO from operating on calls to a retcon coroutine before it's been +/// split. This is only safe to do after we've split all retcon +/// coroutines in the module. We can do that this in this pass because +/// this pass does promise to split all retcon coroutines (as opposed to +/// switch coroutines, which are lowered in multiple stages). +static bool replaceAllPrepares(Function *PrepareFn, CallGraph &CG) { + bool Changed = false; + for (auto PI = PrepareFn->use_begin(), PE = PrepareFn->use_end(); + PI != PE; ) { + // Intrinsics can only be used in calls. + auto *Prepare = cast((PI++)->getUser()); + replacePrepare(Prepare, CG); + Changed = true; + } + + return Changed; +} + //===----------------------------------------------------------------------===// // Top Level Driver //===----------------------------------------------------------------------===// @@ -899,7 +1524,9 @@ struct CoroSplit : public CallGraphSCCPass { // A coroutine is identified by the presence of coro.begin intrinsic, if // we don't have any, this pass has nothing to do. bool doInitialization(CallGraph &CG) override { - Run = coro::declaresIntrinsics(CG.getModule(), {"llvm.coro.begin"}); + Run = coro::declaresIntrinsics(CG.getModule(), + {"llvm.coro.begin", + "llvm.coro.prepare.retcon"}); return CallGraphSCCPass::doInitialization(CG); } @@ -907,6 +1534,12 @@ struct CoroSplit : public CallGraphSCCPass { if (!Run) return false; + // Check for uses of llvm.coro.prepare.retcon. + auto PrepareFn = + SCC.getCallGraph().getModule().getFunction("llvm.coro.prepare.retcon"); + if (PrepareFn && PrepareFn->use_empty()) + PrepareFn = nullptr; + // Find coroutines for processing. SmallVector Coroutines; for (CallGraphNode *CGN : SCC) @@ -914,12 +1547,17 @@ struct CoroSplit : public CallGraphSCCPass { if (F->hasFnAttribute(CORO_PRESPLIT_ATTR)) Coroutines.push_back(F); - if (Coroutines.empty()) + if (Coroutines.empty() && !PrepareFn) return false; CallGraph &CG = getAnalysis().getCallGraph(); + + if (Coroutines.empty()) + return replaceAllPrepares(PrepareFn, CG); + createDevirtTriggerFunc(CG, SCC); + // Split all the coroutines. for (Function *F : Coroutines) { Attribute Attr = F->getFnAttribute(CORO_PRESPLIT_ATTR); StringRef Value = Attr.getValueAsString(); @@ -932,6 +1570,10 @@ struct CoroSplit : public CallGraphSCCPass { F->removeFnAttr(CORO_PRESPLIT_ATTR); splitCoroutine(*F, CG, SCC); } + + if (PrepareFn) + replaceAllPrepares(PrepareFn, CG); + return true; } diff --git a/lib/Transforms/Coroutines/Coroutines.cpp b/lib/Transforms/Coroutines/Coroutines.cpp index a581d1d21169..f39483b27518 100644 --- a/lib/Transforms/Coroutines/Coroutines.cpp +++ b/lib/Transforms/Coroutines/Coroutines.cpp @@ -123,12 +123,26 @@ Value *coro::LowererBase::makeSubFnCall(Value *Arg, int Index, static bool isCoroutineIntrinsicName(StringRef Name) { // NOTE: Must be sorted! 
static const char *const CoroIntrinsics[] = { - "llvm.coro.alloc", "llvm.coro.begin", "llvm.coro.destroy", - "llvm.coro.done", "llvm.coro.end", "llvm.coro.frame", - "llvm.coro.free", "llvm.coro.id", "llvm.coro.noop", - "llvm.coro.param", "llvm.coro.promise", "llvm.coro.resume", - "llvm.coro.save", "llvm.coro.size", "llvm.coro.subfn.addr", + "llvm.coro.alloc", + "llvm.coro.begin", + "llvm.coro.destroy", + "llvm.coro.done", + "llvm.coro.end", + "llvm.coro.frame", + "llvm.coro.free", + "llvm.coro.id", + "llvm.coro.id.retcon", + "llvm.coro.id.retcon.once", + "llvm.coro.noop", + "llvm.coro.param", + "llvm.coro.prepare.retcon", + "llvm.coro.promise", + "llvm.coro.resume", + "llvm.coro.save", + "llvm.coro.size", + "llvm.coro.subfn.addr", "llvm.coro.suspend", + "llvm.coro.suspend.retcon", }; return Intrinsic::lookupLLVMIntrinsicByName(CoroIntrinsics, Name) != -1; } @@ -217,9 +231,6 @@ static void clear(coro::Shape &Shape) { Shape.FrameTy = nullptr; Shape.FramePtr = nullptr; Shape.AllocaSpillBlock = nullptr; - Shape.ResumeSwitch = nullptr; - Shape.PromiseAlloca = nullptr; - Shape.HasFinalSuspend = false; } static CoroSaveInst *createCoroSave(CoroBeginInst *CoroBegin, @@ -235,6 +246,7 @@ static CoroSaveInst *createCoroSave(CoroBeginInst *CoroBegin, // Collect "interesting" coroutine intrinsics. void coro::Shape::buildFrom(Function &F) { + bool HasFinalSuspend = false; size_t FinalSuspendIndex = 0; clear(*this); SmallVector CoroFrames; @@ -257,9 +269,15 @@ void coro::Shape::buildFrom(Function &F) { if (II->use_empty()) UnusedCoroSaves.push_back(cast(II)); break; - case Intrinsic::coro_suspend: - CoroSuspends.push_back(cast(II)); - if (CoroSuspends.back()->isFinal()) { + case Intrinsic::coro_suspend_retcon: { + auto Suspend = cast(II); + CoroSuspends.push_back(Suspend); + break; + } + case Intrinsic::coro_suspend: { + auto Suspend = cast(II); + CoroSuspends.push_back(Suspend); + if (Suspend->isFinal()) { if (HasFinalSuspend) report_fatal_error( "Only one suspend point can be marked as final"); @@ -267,18 +285,23 @@ void coro::Shape::buildFrom(Function &F) { FinalSuspendIndex = CoroSuspends.size() - 1; } break; + } case Intrinsic::coro_begin: { auto CB = cast(II); - if (CB->getId()->getInfo().isPreSplit()) { - if (CoroBegin) - report_fatal_error( + + // Ignore coro id's that aren't pre-split. + auto Id = dyn_cast(CB->getId()); + if (Id && !Id->getInfo().isPreSplit()) + break; + + if (CoroBegin) + report_fatal_error( "coroutine should have exactly one defining @llvm.coro.begin"); - CB->addAttribute(AttributeList::ReturnIndex, Attribute::NonNull); - CB->addAttribute(AttributeList::ReturnIndex, Attribute::NoAlias); - CB->removeAttribute(AttributeList::FunctionIndex, - Attribute::NoDuplicate); - CoroBegin = CB; - } + CB->addAttribute(AttributeList::ReturnIndex, Attribute::NonNull); + CB->addAttribute(AttributeList::ReturnIndex, Attribute::NoAlias); + CB->removeAttribute(AttributeList::FunctionIndex, + Attribute::NoDuplicate); + CoroBegin = CB; break; } case Intrinsic::coro_end: @@ -310,7 +333,7 @@ void coro::Shape::buildFrom(Function &F) { // Replace all coro.suspend with undef and remove related coro.saves if // present. 
- for (CoroSuspendInst *CS : CoroSuspends) { + for (AnyCoroSuspendInst *CS : CoroSuspends) { CS->replaceAllUsesWith(UndefValue::get(CS->getType())); CS->eraseFromParent(); if (auto *CoroSave = CS->getCoroSave()) @@ -324,19 +347,136 @@ void coro::Shape::buildFrom(Function &F) { return; } + auto Id = CoroBegin->getId(); + switch (auto IdIntrinsic = Id->getIntrinsicID()) { + case Intrinsic::coro_id: { + auto SwitchId = cast(Id); + this->ABI = coro::ABI::Switch; + this->SwitchLowering.HasFinalSuspend = HasFinalSuspend; + this->SwitchLowering.ResumeSwitch = nullptr; + this->SwitchLowering.PromiseAlloca = SwitchId->getPromise(); + this->SwitchLowering.ResumeEntryBlock = nullptr; + + for (auto AnySuspend : CoroSuspends) { + auto Suspend = dyn_cast(AnySuspend); + if (!Suspend) { +#ifndef NDEBUG + AnySuspend->dump(); +#endif + report_fatal_error("coro.id must be paired with coro.suspend"); + } + + if (!Suspend->getCoroSave()) + createCoroSave(CoroBegin, Suspend); + } + break; + } + + case Intrinsic::coro_id_retcon: + case Intrinsic::coro_id_retcon_once: { + auto ContinuationId = cast(Id); + ContinuationId->checkWellFormed(); + this->ABI = (IdIntrinsic == Intrinsic::coro_id_retcon + ? coro::ABI::Retcon + : coro::ABI::RetconOnce); + auto Prototype = ContinuationId->getPrototype(); + this->RetconLowering.ResumePrototype = Prototype; + this->RetconLowering.Alloc = ContinuationId->getAllocFunction(); + this->RetconLowering.Dealloc = ContinuationId->getDeallocFunction(); + this->RetconLowering.ReturnBlock = nullptr; + this->RetconLowering.IsFrameInlineInStorage = false; + + // Determine the result value types, and make sure they match up with + // the values passed to the suspends. + auto ResultTys = getRetconResultTypes(); + auto ResumeTys = getRetconResumeTypes(); + + for (auto AnySuspend : CoroSuspends) { + auto Suspend = dyn_cast(AnySuspend); + if (!Suspend) { +#ifndef NDEBUG + AnySuspend->dump(); +#endif + report_fatal_error("coro.id.retcon.* must be paired with " + "coro.suspend.retcon"); + } + + // Check that the argument types of the suspend match the results. + auto SI = Suspend->value_begin(), SE = Suspend->value_end(); + auto RI = ResultTys.begin(), RE = ResultTys.end(); + for (; SI != SE && RI != RE; ++SI, ++RI) { + auto SrcTy = (*SI)->getType(); + if (SrcTy != *RI) { + // The optimizer likes to eliminate bitcasts leading into variadic + // calls, but that messes with our invariants. Re-insert the + // bitcast and ignore this type mismatch. + if (CastInst::isBitCastable(SrcTy, *RI)) { + auto BCI = new BitCastInst(*SI, *RI, "", Suspend); + SI->set(BCI); + continue; + } + +#ifndef NDEBUG + Suspend->dump(); + Prototype->getFunctionType()->dump(); +#endif + report_fatal_error("argument to coro.suspend.retcon does not " + "match corresponding prototype function result"); + } + } + if (SI != SE || RI != RE) { +#ifndef NDEBUG + Suspend->dump(); + Prototype->getFunctionType()->dump(); +#endif + report_fatal_error("wrong number of arguments to coro.suspend.retcon"); + } + + // Check that the result type of the suspend matches the resume types. 
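  // Illustrative example (prototype type hypothetical): for a prototype of
  // type {i8*, i32} (i8*, i1), the resume types are the prototype parameters
  // after the storage pointer, i.e. {i1}, so each coro.suspend.retcon must
  // produce a single i1.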
+ Type *SResultTy = Suspend->getType(); + ArrayRef SuspendResultTys; + if (SResultTy->isVoidTy()) { + // leave as empty array + } else if (auto SResultStructTy = dyn_cast(SResultTy)) { + SuspendResultTys = SResultStructTy->elements(); + } else { + // forms an ArrayRef using SResultTy, be careful + SuspendResultTys = SResultTy; + } + if (SuspendResultTys.size() != ResumeTys.size()) { +#ifndef NDEBUG + Suspend->dump(); + Prototype->getFunctionType()->dump(); +#endif + report_fatal_error("wrong number of results from coro.suspend.retcon"); + } + for (size_t I = 0, E = ResumeTys.size(); I != E; ++I) { + if (SuspendResultTys[I] != ResumeTys[I]) { +#ifndef NDEBUG + Suspend->dump(); + Prototype->getFunctionType()->dump(); +#endif + report_fatal_error("result from coro.suspend.retcon does not " + "match corresponding prototype function param"); + } + } + } + break; + } + + default: + llvm_unreachable("coro.begin is not dependent on a coro.id call"); + } + // The coro.free intrinsic is always lowered to the result of coro.begin. for (CoroFrameInst *CF : CoroFrames) { CF->replaceAllUsesWith(CoroBegin); CF->eraseFromParent(); } - // Canonicalize coro.suspend by inserting a coro.save if needed. - for (CoroSuspendInst *CS : CoroSuspends) - if (!CS->getCoroSave()) - createCoroSave(CoroBegin, CS); - // Move final suspend to be the last element in the CoroSuspends vector. - if (HasFinalSuspend && + if (ABI == coro::ABI::Switch && + SwitchLowering.HasFinalSuspend && FinalSuspendIndex != CoroSuspends.size() - 1) std::swap(CoroSuspends[FinalSuspendIndex], CoroSuspends.back()); @@ -345,6 +485,154 @@ void coro::Shape::buildFrom(Function &F) { CoroSave->eraseFromParent(); } +static void propagateCallAttrsFromCallee(CallInst *Call, Function *Callee) { + Call->setCallingConv(Callee->getCallingConv()); + // TODO: attributes? 
+} + +static void addCallToCallGraph(CallGraph *CG, CallInst *Call, Function *Callee){ + if (CG) + (*CG)[Call->getFunction()]->addCalledFunction(Call, (*CG)[Callee]); +} + +Value *coro::Shape::emitAlloc(IRBuilder<> &Builder, Value *Size, + CallGraph *CG) const { + switch (ABI) { + case coro::ABI::Switch: + llvm_unreachable("can't allocate memory in coro switch-lowering"); + + case coro::ABI::Retcon: + case coro::ABI::RetconOnce: { + auto Alloc = RetconLowering.Alloc; + Size = Builder.CreateIntCast(Size, + Alloc->getFunctionType()->getParamType(0), + /*is signed*/ false); + auto *Call = Builder.CreateCall(Alloc, Size); + propagateCallAttrsFromCallee(Call, Alloc); + addCallToCallGraph(CG, Call, Alloc); + return Call; + } + } + llvm_unreachable("Unknown coro::ABI enum"); +} + +void coro::Shape::emitDealloc(IRBuilder<> &Builder, Value *Ptr, + CallGraph *CG) const { + switch (ABI) { + case coro::ABI::Switch: + llvm_unreachable("can't allocate memory in coro switch-lowering"); + + case coro::ABI::Retcon: + case coro::ABI::RetconOnce: { + auto Dealloc = RetconLowering.Dealloc; + Ptr = Builder.CreateBitCast(Ptr, + Dealloc->getFunctionType()->getParamType(0)); + auto *Call = Builder.CreateCall(Dealloc, Ptr); + propagateCallAttrsFromCallee(Call, Dealloc); + addCallToCallGraph(CG, Call, Dealloc); + return; + } + } + llvm_unreachable("Unknown coro::ABI enum"); +} + +LLVM_ATTRIBUTE_NORETURN +static void fail(const Instruction *I, const char *Reason, Value *V) { +#ifndef NDEBUG + I->dump(); + if (V) { + errs() << " Value: "; + V->printAsOperand(llvm::errs()); + errs() << '\n'; + } +#endif + report_fatal_error(Reason); +} + +/// Check that the given value is a well-formed prototype for the +/// llvm.coro.id.retcon.* intrinsics. +static void checkWFRetconPrototype(const AnyCoroIdRetconInst *I, Value *V) { + auto F = dyn_cast(V->stripPointerCasts()); + if (!F) + fail(I, "llvm.coro.id.retcon.* prototype not a Function", V); + + auto FT = F->getFunctionType(); + + if (isa(I)) { + bool ResultOkay; + if (FT->getReturnType()->isPointerTy()) { + ResultOkay = true; + } else if (auto SRetTy = dyn_cast(FT->getReturnType())) { + ResultOkay = (!SRetTy->isOpaque() && + SRetTy->getNumElements() > 0 && + SRetTy->getElementType(0)->isPointerTy()); + } else { + ResultOkay = false; + } + if (!ResultOkay) + fail(I, "llvm.coro.id.retcon prototype must return pointer as first " + "result", F); + + if (FT->getReturnType() != + I->getFunction()->getFunctionType()->getReturnType()) + fail(I, "llvm.coro.id.retcon prototype return type must be same as" + "current function return type", F); + } else { + // No meaningful validation to do here for llvm.coro.id.unique.once. + } + + if (FT->getNumParams() == 0 || !FT->getParamType(0)->isPointerTy()) + fail(I, "llvm.coro.id.retcon.* prototype must take pointer as " + "its first parameter", F); +} + +/// Check that the given value is a well-formed allocator. +static void checkWFAlloc(const Instruction *I, Value *V) { + auto F = dyn_cast(V->stripPointerCasts()); + if (!F) + fail(I, "llvm.coro.* allocator not a Function", V); + + auto FT = F->getFunctionType(); + if (!FT->getReturnType()->isPointerTy()) + fail(I, "llvm.coro.* allocator must return a pointer", F); + + if (FT->getNumParams() != 1 || + !FT->getParamType(0)->isIntegerTy()) + fail(I, "llvm.coro.* allocator must take integer as only param", F); +} + +/// Check that the given value is a well-formed deallocator. 
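/// (For illustration, a hypothetical deallocator such as `void @my_free(i8*)`
/// passes the checks below: it returns void and takes one pointer parameter.)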
+static void checkWFDealloc(const Instruction *I, Value *V) { + auto F = dyn_cast(V->stripPointerCasts()); + if (!F) + fail(I, "llvm.coro.* deallocator not a Function", V); + + auto FT = F->getFunctionType(); + if (!FT->getReturnType()->isVoidTy()) + fail(I, "llvm.coro.* deallocator must return void", F); + + if (FT->getNumParams() != 1 || + !FT->getParamType(0)->isPointerTy()) + fail(I, "llvm.coro.* deallocator must take pointer as only param", F); +} + +static void checkConstantInt(const Instruction *I, Value *V, + const char *Reason) { + if (!isa(V)) { + fail(I, Reason, V); + } +} + +void AnyCoroIdRetconInst::checkWellFormed() const { + checkConstantInt(this, getArgOperand(SizeArg), + "size argument to coro.id.retcon.* must be constant"); + checkConstantInt(this, getArgOperand(AlignArg), + "alignment argument to coro.id.retcon.* must be constant"); + checkWFRetconPrototype(this, getArgOperand(PrototypeArg)); + checkWFAlloc(this, getArgOperand(AllocArg)); + checkWFDealloc(this, getArgOperand(DeallocArg)); +} + void LLVMAddCoroEarlyPass(LLVMPassManagerRef PM) { unwrap(PM)->add(createCoroEarlyPass()); } diff --git a/lib/Transforms/IPO/ArgumentPromotion.cpp b/lib/Transforms/IPO/ArgumentPromotion.cpp index 95a9f31cced3..dd9f74a881ee 100644 --- a/lib/Transforms/IPO/ArgumentPromotion.cpp +++ b/lib/Transforms/IPO/ArgumentPromotion.cpp @@ -304,7 +304,7 @@ doPromotion(Function *F, SmallPtrSetImpl &ArgsToPromote, // of the previous load. LoadInst *newLoad = IRB.CreateLoad(OrigLoad->getType(), V, V->getName() + ".val"); - newLoad->setAlignment(OrigLoad->getAlignment()); + newLoad->setAlignment(MaybeAlign(OrigLoad->getAlignment())); // Transfer the AA info too. AAMDNodes AAInfo; OrigLoad->getAAMetadata(AAInfo); diff --git a/lib/Transforms/IPO/Attributor.cpp b/lib/Transforms/IPO/Attributor.cpp index 2a52c6b9b4ad..95f47345d8fd 100644 --- a/lib/Transforms/IPO/Attributor.cpp +++ b/lib/Transforms/IPO/Attributor.cpp @@ -16,11 +16,15 @@ #include "llvm/Transforms/IPO/Attributor.h" #include "llvm/ADT/DepthFirstIterator.h" -#include "llvm/ADT/SetVector.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/CaptureTracking.h" +#include "llvm/Analysis/EHPersonalities.h" #include "llvm/Analysis/GlobalsModRef.h" +#include "llvm/Analysis/Loads.h" +#include "llvm/Analysis/MemoryBuiltins.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/Argument.h" #include "llvm/IR/Attributes.h" @@ -30,6 +34,9 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Local.h" + #include using namespace llvm; @@ -46,19 +53,50 @@ STATISTIC(NumAttributesValidFixpoint, "Number of abstract attributes in a valid fixpoint state"); STATISTIC(NumAttributesManifested, "Number of abstract attributes manifested in IR"); -STATISTIC(NumFnNoUnwind, "Number of functions marked nounwind"); - -STATISTIC(NumFnUniqueReturned, "Number of function with unique return"); -STATISTIC(NumFnKnownReturns, "Number of function with known return values"); -STATISTIC(NumFnArgumentReturned, - "Number of function arguments marked returned"); -STATISTIC(NumFnNoSync, "Number of functions marked nosync"); -STATISTIC(NumFnNoFree, "Number of functions marked nofree"); -STATISTIC(NumFnReturnedNonNull, - "Number of function return values marked nonnull"); -STATISTIC(NumFnArgumentNonNull, "Number of function arguments 
marked nonnull"); -STATISTIC(NumCSArgumentNonNull, "Number of call site arguments marked nonnull"); -STATISTIC(NumFnWillReturn, "Number of functions marked willreturn"); + +// Some helper macros to deal with statistics tracking. +// +// Usage: +// For simple IR attribute tracking overload trackStatistics in the abstract +// attribute and choose the right STATS_DECLTRACK_********* macro, +// e.g.,: +// void trackStatistics() const override { +// STATS_DECLTRACK_ARG_ATTR(returned) +// } +// If there is a single "increment" side one can use the macro +// STATS_DECLTRACK with a custom message. If there are multiple increment +// sides, STATS_DECL and STATS_TRACK can also be used separatly. +// +#define BUILD_STAT_MSG_IR_ATTR(TYPE, NAME) \ + ("Number of " #TYPE " marked '" #NAME "'") +#define BUILD_STAT_NAME(NAME, TYPE) NumIR##TYPE##_##NAME +#define STATS_DECL_(NAME, MSG) STATISTIC(NAME, MSG); +#define STATS_DECL(NAME, TYPE, MSG) \ + STATS_DECL_(BUILD_STAT_NAME(NAME, TYPE), MSG); +#define STATS_TRACK(NAME, TYPE) ++(BUILD_STAT_NAME(NAME, TYPE)); +#define STATS_DECLTRACK(NAME, TYPE, MSG) \ + { \ + STATS_DECL(NAME, TYPE, MSG) \ + STATS_TRACK(NAME, TYPE) \ + } +#define STATS_DECLTRACK_ARG_ATTR(NAME) \ + STATS_DECLTRACK(NAME, Arguments, BUILD_STAT_MSG_IR_ATTR(arguments, NAME)) +#define STATS_DECLTRACK_CSARG_ATTR(NAME) \ + STATS_DECLTRACK(NAME, CSArguments, \ + BUILD_STAT_MSG_IR_ATTR(call site arguments, NAME)) +#define STATS_DECLTRACK_FN_ATTR(NAME) \ + STATS_DECLTRACK(NAME, Function, BUILD_STAT_MSG_IR_ATTR(functions, NAME)) +#define STATS_DECLTRACK_CS_ATTR(NAME) \ + STATS_DECLTRACK(NAME, CS, BUILD_STAT_MSG_IR_ATTR(call site, NAME)) +#define STATS_DECLTRACK_FNRET_ATTR(NAME) \ + STATS_DECLTRACK(NAME, FunctionReturn, \ + BUILD_STAT_MSG_IR_ATTR(function returns, NAME)) +#define STATS_DECLTRACK_CSRET_ATTR(NAME) \ + STATS_DECLTRACK(NAME, CSReturn, \ + BUILD_STAT_MSG_IR_ATTR(call site returns, NAME)) +#define STATS_DECLTRACK_FLOATING_ATTR(NAME) \ + STATS_DECLTRACK(NAME, Floating, \ + ("Number of floating values known to be '" #NAME "'")) // TODO: Determine a good default value. // @@ -72,18 +110,32 @@ static cl::opt MaxFixpointIterations("attributor-max-iterations", cl::Hidden, cl::desc("Maximal number of fixpoint iterations."), cl::init(32)); +static cl::opt VerifyMaxFixpointIterations( + "attributor-max-iterations-verify", cl::Hidden, + cl::desc("Verify that max-iterations is a tight bound for a fixpoint"), + cl::init(false)); static cl::opt DisableAttributor( "attributor-disable", cl::Hidden, cl::desc("Disable the attributor inter-procedural deduction pass."), cl::init(true)); -static cl::opt VerifyAttributor( - "attributor-verify", cl::Hidden, - cl::desc("Verify the Attributor deduction and " - "manifestation of attributes -- may issue false-positive errors"), +static cl::opt ManifestInternal( + "attributor-manifest-internal", cl::Hidden, + cl::desc("Manifest Attributor internal string attributes."), cl::init(false)); +static cl::opt DepRecInterval( + "attributor-dependence-recompute-interval", cl::Hidden, + cl::desc("Number of iterations until dependences are recomputed."), + cl::init(4)); + +static cl::opt EnableHeapToStack("enable-heap-to-stack-conversion", + cl::init(true), cl::Hidden); + +static cl::opt MaxHeapToStackSize("max-heap-to-stack-size", cl::init(128), + cl::Hidden); + /// Logic operators for the change status enum class. /// ///{ @@ -95,78 +147,30 @@ ChangeStatus llvm::operator&(ChangeStatus l, ChangeStatus r) { } ///} -/// Helper to adjust the statistics. 
-static void bookkeeping(AbstractAttribute::ManifestPosition MP, - const Attribute &Attr) { - if (!AreStatisticsEnabled()) - return; - - if (!Attr.isEnumAttribute()) - return; - switch (Attr.getKindAsEnum()) { - case Attribute::NoUnwind: - NumFnNoUnwind++; - return; - case Attribute::Returned: - NumFnArgumentReturned++; - return; - case Attribute::NoSync: - NumFnNoSync++; - break; - case Attribute::NoFree: - NumFnNoFree++; - break; - case Attribute::NonNull: - switch (MP) { - case AbstractAttribute::MP_RETURNED: - NumFnReturnedNonNull++; - break; - case AbstractAttribute::MP_ARGUMENT: - NumFnArgumentNonNull++; - break; - case AbstractAttribute::MP_CALL_SITE_ARGUMENT: - NumCSArgumentNonNull++; - break; - default: - break; - } - break; - case Attribute::WillReturn: - NumFnWillReturn++; - break; - default: - return; - } -} - -template -using followValueCB_t = std::function; -template -using visitValueCB_t = std::function; - -/// Recursively visit all values that might become \p InitV at some point. This +/// Recursively visit all values that might become \p IRP at some point. This /// will be done by looking through cast instructions, selects, phis, and calls -/// with the "returned" attribute. The callback \p FollowValueCB is asked before -/// a potential origin value is looked at. If no \p FollowValueCB is passed, a -/// default one is used that will make sure we visit every value only once. Once -/// we cannot look through the value any further, the callback \p VisitValueCB -/// is invoked and passed the current value and the \p State. To limit how much -/// effort is invested, we will never visit more than \p MaxValues values. -template +/// with the "returned" attribute. Once we cannot look through the value any +/// further, the callback \p VisitValueCB is invoked and passed the current +/// value, the \p State, and a flag to indicate if we stripped anything. To +/// limit how much effort is invested, we will never visit more values than +/// specified by \p MaxValues. +template static bool genericValueTraversal( - Value *InitV, StateTy &State, visitValueCB_t &VisitValueCB, - followValueCB_t *FollowValueCB = nullptr, int MaxValues = 8) { - + Attributor &A, IRPosition IRP, const AAType &QueryingAA, StateTy &State, + const function_ref &VisitValueCB, + int MaxValues = 8) { + + const AAIsDead *LivenessAA = nullptr; + if (IRP.getAnchorScope()) + LivenessAA = &A.getAAFor( + QueryingAA, IRPosition::function(*IRP.getAnchorScope()), + /* TrackDependence */ false); + bool AnyDead = false; + + // TODO: Use Positions here to allow context sensitivity in VisitValueCB SmallPtrSet Visited; - followValueCB_t DefaultFollowValueCB = [&](Value *Val, bool &) { - return Visited.insert(Val).second; - }; - - if (!FollowValueCB) - FollowValueCB = &DefaultFollowValueCB; - SmallVector Worklist; - Worklist.push_back(InitV); + Worklist.push_back(&IRP.getAssociatedValue()); int Iteration = 0; do { @@ -174,7 +178,7 @@ static bool genericValueTraversal( // Check if we should process the current value. To prevent endless // recursion keep a record of the values we followed! - if (!(*FollowValueCB)(V, State)) + if (!Visited.insert(V).second) continue; // Make sure we limit the compile time for complex expressions. @@ -183,23 +187,23 @@ static bool genericValueTraversal( // Explicitly look through calls with a "returned" attribute if we do // not have a pointer as stripPointerCasts only works on them. 
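      // Illustrative case (names hypothetical): for a callee declared as
      //   declare i8* @wrapper(i8* returned)
      // a call %r = call i8* @wrapper(i8* %p) lets the traversal continue
      // at %p rather than stop at the call.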
+ Value *NewV = nullptr; if (V->getType()->isPointerTy()) { - V = V->stripPointerCasts(); + NewV = V->stripPointerCasts(); } else { CallSite CS(V); if (CS && CS.getCalledFunction()) { - Value *NewV = nullptr; for (Argument &Arg : CS.getCalledFunction()->args()) if (Arg.hasReturnedAttr()) { NewV = CS.getArgOperand(Arg.getArgNo()); break; } - if (NewV) { - Worklist.push_back(NewV); - continue; - } } } + if (NewV && NewV != V) { + Worklist.push_back(NewV); + continue; + } // Look through select instructions, visit both potential values. if (auto *SI = dyn_cast(V)) { @@ -208,35 +212,34 @@ static bool genericValueTraversal( continue; } - // Look through phi nodes, visit all operands. + // Look through phi nodes, visit all live operands. if (auto *PHI = dyn_cast(V)) { - Worklist.append(PHI->op_begin(), PHI->op_end()); + assert(LivenessAA && + "Expected liveness in the presence of instructions!"); + for (unsigned u = 0, e = PHI->getNumIncomingValues(); u < e; u++) { + const BasicBlock *IncomingBB = PHI->getIncomingBlock(u); + if (LivenessAA->isAssumedDead(IncomingBB->getTerminator())) { + AnyDead = true; + continue; + } + Worklist.push_back(PHI->getIncomingValue(u)); + } continue; } // Once a leaf is reached we inform the user through the callback. - VisitValueCB(V, State); + if (!VisitValueCB(*V, State, Iteration > 1)) + return false; } while (!Worklist.empty()); + // If we actually used liveness information so we have to record a dependence. + if (AnyDead) + A.recordDependence(*LivenessAA, QueryingAA); + // All values have been visited. return true; } -/// Helper to identify the correct offset into an attribute list. -static unsigned getAttrIndex(AbstractAttribute::ManifestPosition MP, - unsigned ArgNo = 0) { - switch (MP) { - case AbstractAttribute::MP_ARGUMENT: - case AbstractAttribute::MP_CALL_SITE_ARGUMENT: - return ArgNo + AttributeList::FirstArgIndex; - case AbstractAttribute::MP_FUNCTION: - return AttributeList::FunctionIndex; - case AbstractAttribute::MP_RETURNED: - return AttributeList::ReturnIndex; - } - llvm_unreachable("Unknown manifest position!"); -} - /// Return true if \p New is equal or worse than \p Old. static bool isEqualOrWorse(const Attribute &New, const Attribute &Old) { if (!Old.isIntAttribute()) @@ -247,12 +250,9 @@ static bool isEqualOrWorse(const Attribute &New, const Attribute &Old) { /// Return true if the information provided by \p Attr was added to the /// attribute list \p Attrs. This is only the case if it was not already present -/// in \p Attrs at the position describe by \p MP and \p ArgNo. +/// in \p Attrs at the position describe by \p PK and \p AttrIdx. 
static bool addIfNotExistent(LLVMContext &Ctx, const Attribute &Attr, - AttributeList &Attrs, - AbstractAttribute::ManifestPosition MP, - unsigned ArgNo = 0) { - unsigned AttrIdx = getAttrIndex(MP, ArgNo); + AttributeList &Attrs, int AttrIdx) { if (Attr.isEnumAttribute()) { Attribute::AttrKind Kind = Attr.getKindAsEnum(); @@ -270,9 +270,47 @@ static bool addIfNotExistent(LLVMContext &Ctx, const Attribute &Attr, Attrs = Attrs.addAttribute(Ctx, AttrIdx, Attr); return true; } + if (Attr.isIntAttribute()) { + Attribute::AttrKind Kind = Attr.getKindAsEnum(); + if (Attrs.hasAttribute(AttrIdx, Kind)) + if (isEqualOrWorse(Attr, Attrs.getAttribute(AttrIdx, Kind))) + return false; + Attrs = Attrs.removeAttribute(Ctx, AttrIdx, Kind); + Attrs = Attrs.addAttribute(Ctx, AttrIdx, Attr); + return true; + } llvm_unreachable("Expected enum or string attribute!"); } +static const Value *getPointerOperand(const Instruction *I) { + if (auto *LI = dyn_cast(I)) + if (!LI->isVolatile()) + return LI->getPointerOperand(); + + if (auto *SI = dyn_cast(I)) + if (!SI->isVolatile()) + return SI->getPointerOperand(); + + if (auto *CXI = dyn_cast(I)) + if (!CXI->isVolatile()) + return CXI->getPointerOperand(); + + if (auto *RMWI = dyn_cast(I)) + if (!RMWI->isVolatile()) + return RMWI->getPointerOperand(); + + return nullptr; +} +static const Value *getBasePointerOfAccessPointerOperand(const Instruction *I, + int64_t &BytesOffset, + const DataLayout &DL) { + const Value *Ptr = getPointerOperand(I); + if (!Ptr) + return nullptr; + + return GetPointerBaseWithConstantOffset(Ptr, BytesOffset, DL, + /*AllowNonInbounds*/ false); +} ChangeStatus AbstractAttribute::update(Attributor &A) { ChangeStatus HasChanged = ChangeStatus::UNCHANGED; @@ -289,143 +327,527 @@ ChangeStatus AbstractAttribute::update(Attributor &A) { return HasChanged; } -ChangeStatus AbstractAttribute::manifest(Attributor &A) { - assert(getState().isValidState() && - "Attempted to manifest an invalid state!"); - assert(getAssociatedValue() && - "Attempted to manifest an attribute without associated value!"); - - ChangeStatus HasChanged = ChangeStatus::UNCHANGED; - SmallVector DeducedAttrs; - getDeducedAttributes(DeducedAttrs); - - Function &ScopeFn = getAnchorScope(); - LLVMContext &Ctx = ScopeFn.getContext(); - ManifestPosition MP = getManifestPosition(); - - AttributeList Attrs; - SmallVector ArgNos; +ChangeStatus +IRAttributeManifest::manifestAttrs(Attributor &A, IRPosition &IRP, + const ArrayRef &DeducedAttrs) { + Function *ScopeFn = IRP.getAssociatedFunction(); + IRPosition::Kind PK = IRP.getPositionKind(); // In the following some generic code that will manifest attributes in // DeducedAttrs if they improve the current IR. Due to the different // annotation positions we use the underlying AttributeList interface. - // Note that MP_CALL_SITE_ARGUMENT can annotate multiple locations. 
- switch (MP) { - case MP_ARGUMENT: - ArgNos.push_back(cast(getAssociatedValue())->getArgNo()); - Attrs = ScopeFn.getAttributes(); + AttributeList Attrs; + switch (PK) { + case IRPosition::IRP_INVALID: + case IRPosition::IRP_FLOAT: + return ChangeStatus::UNCHANGED; + case IRPosition::IRP_ARGUMENT: + case IRPosition::IRP_FUNCTION: + case IRPosition::IRP_RETURNED: + Attrs = ScopeFn->getAttributes(); break; - case MP_FUNCTION: - case MP_RETURNED: - ArgNos.push_back(0); - Attrs = ScopeFn.getAttributes(); + case IRPosition::IRP_CALL_SITE: + case IRPosition::IRP_CALL_SITE_RETURNED: + case IRPosition::IRP_CALL_SITE_ARGUMENT: + Attrs = ImmutableCallSite(&IRP.getAnchorValue()).getAttributes(); break; - case MP_CALL_SITE_ARGUMENT: { - CallSite CS(&getAnchoredValue()); - for (unsigned u = 0, e = CS.getNumArgOperands(); u != e; u++) - if (CS.getArgOperand(u) == getAssociatedValue()) - ArgNos.push_back(u); - Attrs = CS.getAttributes(); - } } + ChangeStatus HasChanged = ChangeStatus::UNCHANGED; + LLVMContext &Ctx = IRP.getAnchorValue().getContext(); for (const Attribute &Attr : DeducedAttrs) { - for (unsigned ArgNo : ArgNos) { - if (!addIfNotExistent(Ctx, Attr, Attrs, MP, ArgNo)) - continue; + if (!addIfNotExistent(Ctx, Attr, Attrs, IRP.getAttrIdx())) + continue; - HasChanged = ChangeStatus::CHANGED; - bookkeeping(MP, Attr); - } + HasChanged = ChangeStatus::CHANGED; } if (HasChanged == ChangeStatus::UNCHANGED) return HasChanged; - switch (MP) { - case MP_ARGUMENT: - case MP_FUNCTION: - case MP_RETURNED: - ScopeFn.setAttributes(Attrs); + switch (PK) { + case IRPosition::IRP_ARGUMENT: + case IRPosition::IRP_FUNCTION: + case IRPosition::IRP_RETURNED: + ScopeFn->setAttributes(Attrs); + break; + case IRPosition::IRP_CALL_SITE: + case IRPosition::IRP_CALL_SITE_RETURNED: + case IRPosition::IRP_CALL_SITE_ARGUMENT: + CallSite(&IRP.getAnchorValue()).setAttributes(Attrs); + break; + case IRPosition::IRP_INVALID: + case IRPosition::IRP_FLOAT: break; - case MP_CALL_SITE_ARGUMENT: - CallSite(&getAnchoredValue()).setAttributes(Attrs); } return HasChanged; } -Function &AbstractAttribute::getAnchorScope() { - Value &V = getAnchoredValue(); - if (isa(V)) - return cast(V); - if (isa(V)) - return *cast(V).getParent(); - if (isa(V)) - return *cast(V).getFunction(); - llvm_unreachable("No scope for anchored value found!"); +const IRPosition IRPosition::EmptyKey(255); +const IRPosition IRPosition::TombstoneKey(256); + +SubsumingPositionIterator::SubsumingPositionIterator(const IRPosition &IRP) { + IRPositions.emplace_back(IRP); + + ImmutableCallSite ICS(&IRP.getAnchorValue()); + switch (IRP.getPositionKind()) { + case IRPosition::IRP_INVALID: + case IRPosition::IRP_FLOAT: + case IRPosition::IRP_FUNCTION: + return; + case IRPosition::IRP_ARGUMENT: + case IRPosition::IRP_RETURNED: + IRPositions.emplace_back( + IRPosition::function(*IRP.getAssociatedFunction())); + return; + case IRPosition::IRP_CALL_SITE: + assert(ICS && "Expected call site!"); + // TODO: We need to look at the operand bundles similar to the redirection + // in CallBase. + if (!ICS.hasOperandBundles()) + if (const Function *Callee = ICS.getCalledFunction()) + IRPositions.emplace_back(IRPosition::function(*Callee)); + return; + case IRPosition::IRP_CALL_SITE_RETURNED: + assert(ICS && "Expected call site!"); + // TODO: We need to look at the operand bundles similar to the redirection + // in CallBase. 
+ if (!ICS.hasOperandBundles()) { + if (const Function *Callee = ICS.getCalledFunction()) { + IRPositions.emplace_back(IRPosition::returned(*Callee)); + IRPositions.emplace_back(IRPosition::function(*Callee)); + } + } + IRPositions.emplace_back( + IRPosition::callsite_function(cast(*ICS.getInstruction()))); + return; + case IRPosition::IRP_CALL_SITE_ARGUMENT: { + int ArgNo = IRP.getArgNo(); + assert(ICS && ArgNo >= 0 && "Expected call site!"); + // TODO: We need to look at the operand bundles similar to the redirection + // in CallBase. + if (!ICS.hasOperandBundles()) { + const Function *Callee = ICS.getCalledFunction(); + if (Callee && Callee->arg_size() > unsigned(ArgNo)) + IRPositions.emplace_back(IRPosition::argument(*Callee->getArg(ArgNo))); + if (Callee) + IRPositions.emplace_back(IRPosition::function(*Callee)); + } + IRPositions.emplace_back(IRPosition::value(IRP.getAssociatedValue())); + return; + } + } +} + +bool IRPosition::hasAttr(ArrayRef AKs, + bool IgnoreSubsumingPositions) const { + for (const IRPosition &EquivIRP : SubsumingPositionIterator(*this)) { + for (Attribute::AttrKind AK : AKs) + if (EquivIRP.getAttr(AK).getKindAsEnum() == AK) + return true; + // The first position returned by the SubsumingPositionIterator is + // always the position itself. If we ignore subsuming positions we + // are done after the first iteration. + if (IgnoreSubsumingPositions) + break; + } + return false; } -const Function &AbstractAttribute::getAnchorScope() const { - return const_cast(this)->getAnchorScope(); +void IRPosition::getAttrs(ArrayRef AKs, + SmallVectorImpl &Attrs) const { + for (const IRPosition &EquivIRP : SubsumingPositionIterator(*this)) + for (Attribute::AttrKind AK : AKs) { + const Attribute &Attr = EquivIRP.getAttr(AK); + if (Attr.getKindAsEnum() == AK) + Attrs.push_back(Attr); + } } -/// -----------------------NoUnwind Function Attribute-------------------------- +void IRPosition::verify() { + switch (KindOrArgNo) { + default: + assert(KindOrArgNo >= 0 && "Expected argument or call site argument!"); + assert((isa(AnchorVal) || isa(AnchorVal)) && + "Expected call base or argument for positive attribute index!"); + if (isa(AnchorVal)) { + assert(cast(AnchorVal)->getArgNo() == unsigned(getArgNo()) && + "Argument number mismatch!"); + assert(cast(AnchorVal) == &getAssociatedValue() && + "Associated value mismatch!"); + } else { + assert(cast(*AnchorVal).arg_size() > unsigned(getArgNo()) && + "Call site argument number mismatch!"); + assert(cast(*AnchorVal).getArgOperand(getArgNo()) == + &getAssociatedValue() && + "Associated value mismatch!"); + } + break; + case IRP_INVALID: + assert(!AnchorVal && "Expected no value for an invalid position!"); + break; + case IRP_FLOAT: + assert((!isa(&getAssociatedValue()) && + !isa(&getAssociatedValue())) && + "Expected specialized kind for call base and argument values!"); + break; + case IRP_RETURNED: + assert(isa(AnchorVal) && + "Expected function for a 'returned' position!"); + assert(AnchorVal == &getAssociatedValue() && "Associated value mismatch!"); + break; + case IRP_CALL_SITE_RETURNED: + assert((isa(AnchorVal)) && + "Expected call base for 'call site returned' position!"); + assert(AnchorVal == &getAssociatedValue() && "Associated value mismatch!"); + break; + case IRP_CALL_SITE: + assert((isa(AnchorVal)) && + "Expected call base for 'call site function' position!"); + assert(AnchorVal == &getAssociatedValue() && "Associated value mismatch!"); + break; + case IRP_FUNCTION: + assert(isa(AnchorVal) && + "Expected function for a 
'function' position!"); + assert(AnchorVal == &getAssociatedValue() && "Associated value mismatch!"); + break; + } +} -struct AANoUnwindFunction : AANoUnwind, BooleanState { +namespace { +/// Helper functions to clamp a state \p S of type \p StateType with the +/// information in \p R and indicate/return if \p S did change (as-in update is +/// required to be run again). +/// +///{ +template +ChangeStatus clampStateAndIndicateChange(StateType &S, const StateType &R); + +template <> +ChangeStatus clampStateAndIndicateChange(IntegerState &S, + const IntegerState &R) { + auto Assumed = S.getAssumed(); + S ^= R; + return Assumed == S.getAssumed() ? ChangeStatus::UNCHANGED + : ChangeStatus::CHANGED; +} - AANoUnwindFunction(Function &F, InformationCache &InfoCache) - : AANoUnwind(F, InfoCache) {} +template <> +ChangeStatus clampStateAndIndicateChange(BooleanState &S, + const BooleanState &R) { + return clampStateAndIndicateChange(S, R); +} +///} - /// See AbstractAttribute::getState() - /// { - AbstractState &getState() override { return *this; } - const AbstractState &getState() const override { return *this; } - /// } +/// Clamp the information known for all returned values of a function +/// (identified by \p QueryingAA) into \p S. +template +static void clampReturnedValueStates(Attributor &A, const AAType &QueryingAA, + StateType &S) { + LLVM_DEBUG(dbgs() << "[Attributor] Clamp return value states for " + << static_cast(QueryingAA) + << " into " << S << "\n"); + + assert((QueryingAA.getIRPosition().getPositionKind() == + IRPosition::IRP_RETURNED || + QueryingAA.getIRPosition().getPositionKind() == + IRPosition::IRP_CALL_SITE_RETURNED) && + "Can only clamp returned value states for a function returned or call " + "site returned position!"); + + // Use an optional state as there might not be any return values and we want + // to join (IntegerState::operator&) the state of all there are. + Optional T; + + // Callback for each possibly returned value. + auto CheckReturnValue = [&](Value &RV) -> bool { + const IRPosition &RVPos = IRPosition::value(RV); + const AAType &AA = A.getAAFor(QueryingAA, RVPos); + LLVM_DEBUG(dbgs() << "[Attributor] RV: " << RV << " AA: " << AA.getAsStr() + << " @ " << RVPos << "\n"); + const StateType &AAS = static_cast(AA.getState()); + if (T.hasValue()) + *T &= AAS; + else + T = AAS; + LLVM_DEBUG(dbgs() << "[Attributor] AA State: " << AAS << " RV State: " << T + << "\n"); + return T->isValidState(); + }; + + if (!A.checkForAllReturnedValues(CheckReturnValue, QueryingAA)) + S.indicatePessimisticFixpoint(); + else if (T.hasValue()) + S ^= *T; +} - /// See AbstractAttribute::getManifestPosition(). - ManifestPosition getManifestPosition() const override { return MP_FUNCTION; } +/// Helper class to compose two generic deduction +template class F, template class G> +struct AAComposeTwoGenericDeduction + : public F, StateType> { + AAComposeTwoGenericDeduction(const IRPosition &IRP) + : F, StateType>(IRP) {} - const std::string getAsStr() const override { - return getAssumed() ? "nounwind" : "may-unwind"; + /// See AbstractAttribute::updateImpl(...). + ChangeStatus updateImpl(Attributor &A) override { + ChangeStatus ChangedF = F, StateType>::updateImpl(A); + ChangeStatus ChangedG = G::updateImpl(A); + return ChangedF | ChangedG; } +}; + +/// Helper class for generic deduction: return value -> returned position. 
+template +struct AAReturnedFromReturnedValues : public Base { + AAReturnedFromReturnedValues(const IRPosition &IRP) : Base(IRP) {} /// See AbstractAttribute::updateImpl(...). - ChangeStatus updateImpl(Attributor &A) override; + ChangeStatus updateImpl(Attributor &A) override { + StateType S; + clampReturnedValueStates(A, *this, S); + // TODO: If we know we visited all returned values, thus no are assumed + // dead, we can take the known information from the state T. + return clampStateAndIndicateChange(this->getState(), S); + } +}; + +/// Clamp the information known at all call sites for a given argument +/// (identified by \p QueryingAA) into \p S. +template +static void clampCallSiteArgumentStates(Attributor &A, const AAType &QueryingAA, + StateType &S) { + LLVM_DEBUG(dbgs() << "[Attributor] Clamp call site argument states for " + << static_cast(QueryingAA) + << " into " << S << "\n"); + + assert(QueryingAA.getIRPosition().getPositionKind() == + IRPosition::IRP_ARGUMENT && + "Can only clamp call site argument states for an argument position!"); + + // Use an optional state as there might not be any return values and we want + // to join (IntegerState::operator&) the state of all there are. + Optional T; + + // The argument number which is also the call site argument number. + unsigned ArgNo = QueryingAA.getIRPosition().getArgNo(); + + auto CallSiteCheck = [&](AbstractCallSite ACS) { + const IRPosition &ACSArgPos = IRPosition::callsite_argument(ACS, ArgNo); + // Check if a coresponding argument was found or if it is on not associated + // (which can happen for callback calls). + if (ACSArgPos.getPositionKind() == IRPosition::IRP_INVALID) + return false; + + const AAType &AA = A.getAAFor(QueryingAA, ACSArgPos); + LLVM_DEBUG(dbgs() << "[Attributor] ACS: " << *ACS.getInstruction() + << " AA: " << AA.getAsStr() << " @" << ACSArgPos << "\n"); + const StateType &AAS = static_cast(AA.getState()); + if (T.hasValue()) + *T &= AAS; + else + T = AAS; + LLVM_DEBUG(dbgs() << "[Attributor] AA State: " << AAS << " CSA State: " << T + << "\n"); + return T->isValidState(); + }; + + if (!A.checkForAllCallSites(CallSiteCheck, QueryingAA, true)) + S.indicatePessimisticFixpoint(); + else if (T.hasValue()) + S ^= *T; +} - /// See AANoUnwind::isAssumedNoUnwind(). - bool isAssumedNoUnwind() const override { return getAssumed(); } +/// Helper class for generic deduction: call site argument -> argument position. +template +struct AAArgumentFromCallSiteArguments : public Base { + AAArgumentFromCallSiteArguments(const IRPosition &IRP) : Base(IRP) {} - /// See AANoUnwind::isKnownNoUnwind(). - bool isKnownNoUnwind() const override { return getKnown(); } + /// See AbstractAttribute::updateImpl(...). + ChangeStatus updateImpl(Attributor &A) override { + StateType S; + clampCallSiteArgumentStates(A, *this, S); + // TODO: If we know we visited all incoming values, thus no are assumed + // dead, we can take the known information from the state T. + return clampStateAndIndicateChange(this->getState(), S); + } }; -ChangeStatus AANoUnwindFunction::updateImpl(Attributor &A) { - Function &F = getAnchorScope(); +/// Helper class for generic replication: function returned -> cs returned. +template +struct AACallSiteReturnedFromReturned : public Base { + AACallSiteReturnedFromReturned(const IRPosition &IRP) : Base(IRP) {} - // The map from instruction opcodes to those instructions in the function. 
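// ---------------------------------------------------------------------------
// clampReturnedValueStates() and clampCallSiteArgumentStates() above share one
// pattern: meet (&) the states of all related positions into an optional
// accumulator and, only if every related position could be enumerated, clamp
// the result into the querying attribute's state. The standalone sketch below
// shows that pattern for a plain boolean "assumed" state; BoolState and
// meetOverAll are illustrative names, not part of this patch.
#include <optional>
#include <vector>

struct BoolState {
  bool Assumed = true; // optimistic until proven otherwise
  void operator&=(const BoolState &R) { Assumed &= R.Assumed; }
  void indicatePessimisticFixpoint() { Assumed = false; }
};

static void meetOverAll(BoolState &S, const std::vector<BoolState> &Related,
                        bool SawAllRelatedPositions) {
  // If some related position could not be visited, the only safe answer is
  // the pessimistic fixpoint.
  if (!SawAllRelatedPositions) {
    S.indicatePessimisticFixpoint();
    return;
  }
  // Start from "no information yet" rather than from a pessimistic value so
  // that, for example, a function without return values stays optimistic.
  std::optional<BoolState> T;
  for (const BoolState &AAS : Related) {
    if (T)
      *T &= AAS;
    else
      T = AAS;
  }
  if (T)
    S &= *T; // clamp the accumulated information into the querying state
}
// ---------------------------------------------------------------------------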
- auto &OpcodeInstMap = InfoCache.getOpcodeInstMapForFunction(F); - auto Opcodes = { - (unsigned)Instruction::Invoke, (unsigned)Instruction::CallBr, - (unsigned)Instruction::Call, (unsigned)Instruction::CleanupRet, - (unsigned)Instruction::CatchSwitch, (unsigned)Instruction::Resume}; + /// See AbstractAttribute::updateImpl(...). + ChangeStatus updateImpl(Attributor &A) override { + assert(this->getIRPosition().getPositionKind() == + IRPosition::IRP_CALL_SITE_RETURNED && + "Can only wrap function returned positions for call site returned " + "positions!"); + auto &S = this->getState(); + + const Function *AssociatedFunction = + this->getIRPosition().getAssociatedFunction(); + if (!AssociatedFunction) + return S.indicatePessimisticFixpoint(); + + IRPosition FnPos = IRPosition::returned(*AssociatedFunction); + const AAType &AA = A.getAAFor(*this, FnPos); + return clampStateAndIndicateChange( + S, static_cast(AA.getState())); + } +}; - for (unsigned Opcode : Opcodes) { - for (Instruction *I : OpcodeInstMap[Opcode]) { - if (!I->mayThrow()) - continue; +/// Helper class for generic deduction using must-be-executed-context +/// Base class is required to have `followUse` method. - auto *NoUnwindAA = A.getAAFor(*this, *I); +/// bool followUse(Attributor &A, const Use *U, const Instruction *I) +/// U - Underlying use. +/// I - The user of the \p U. +/// `followUse` returns true if the value should be tracked transitively. - if (!NoUnwindAA || !NoUnwindAA->isAssumedNoUnwind()) { - indicatePessimisticFixpoint(); - return ChangeStatus::CHANGED; +template +struct AAFromMustBeExecutedContext : public Base { + AAFromMustBeExecutedContext(const IRPosition &IRP) : Base(IRP) {} + + void initialize(Attributor &A) override { + Base::initialize(A); + IRPosition &IRP = this->getIRPosition(); + Instruction *CtxI = IRP.getCtxI(); + + if (!CtxI) + return; + + for (const Use &U : IRP.getAssociatedValue().uses()) + Uses.insert(&U); + } + + /// See AbstractAttribute::updateImpl(...). + ChangeStatus updateImpl(Attributor &A) override { + auto BeforeState = this->getState(); + auto &S = this->getState(); + Instruction *CtxI = this->getIRPosition().getCtxI(); + if (!CtxI) + return ChangeStatus::UNCHANGED; + + MustBeExecutedContextExplorer &Explorer = + A.getInfoCache().getMustBeExecutedContextExplorer(); + + SetVector NextUses; + + for (const Use *U : Uses) { + if (const Instruction *UserI = dyn_cast(U->getUser())) { + auto EIt = Explorer.begin(CtxI), EEnd = Explorer.end(CtxI); + bool Found = EIt.count(UserI); + while (!Found && ++EIt != EEnd) + Found = EIt.getCurrentInst() == UserI; + if (Found && Base::followUse(A, U, UserI)) + for (const Use &Us : UserI->uses()) + NextUses.insert(&Us); } } + for (const Use *U : NextUses) + Uses.insert(U); + + return BeforeState == S ? ChangeStatus::UNCHANGED : ChangeStatus::CHANGED; } - return ChangeStatus::UNCHANGED; -} + +private: + /// Container for (transitive) uses of the associated value. + SetVector Uses; +}; + +template +using AAArgumentFromCallSiteArgumentsAndMustBeExecutedContext = + AAComposeTwoGenericDeduction; + +template +using AACallSiteReturnedFromReturnedAndMustBeExecutedContext = + AAComposeTwoGenericDeduction; + +/// -----------------------NoUnwind Function Attribute-------------------------- + +struct AANoUnwindImpl : AANoUnwind { + AANoUnwindImpl(const IRPosition &IRP) : AANoUnwind(IRP) {} + + const std::string getAsStr() const override { + return getAssumed() ? "nounwind" : "may-unwind"; + } + + /// See AbstractAttribute::updateImpl(...). 
+ ChangeStatus updateImpl(Attributor &A) override { + auto Opcodes = { + (unsigned)Instruction::Invoke, (unsigned)Instruction::CallBr, + (unsigned)Instruction::Call, (unsigned)Instruction::CleanupRet, + (unsigned)Instruction::CatchSwitch, (unsigned)Instruction::Resume}; + + auto CheckForNoUnwind = [&](Instruction &I) { + if (!I.mayThrow()) + return true; + + if (ImmutableCallSite ICS = ImmutableCallSite(&I)) { + const auto &NoUnwindAA = + A.getAAFor(*this, IRPosition::callsite_function(ICS)); + return NoUnwindAA.isAssumedNoUnwind(); + } + return false; + }; + + if (!A.checkForAllInstructions(CheckForNoUnwind, *this, Opcodes)) + return indicatePessimisticFixpoint(); + + return ChangeStatus::UNCHANGED; + } +}; + +struct AANoUnwindFunction final : public AANoUnwindImpl { + AANoUnwindFunction(const IRPosition &IRP) : AANoUnwindImpl(IRP) {} + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { STATS_DECLTRACK_FN_ATTR(nounwind) } +}; + +/// NoUnwind attribute deduction for a call sites. +struct AANoUnwindCallSite final : AANoUnwindImpl { + AANoUnwindCallSite(const IRPosition &IRP) : AANoUnwindImpl(IRP) {} + + /// See AbstractAttribute::initialize(...). + void initialize(Attributor &A) override { + AANoUnwindImpl::initialize(A); + Function *F = getAssociatedFunction(); + if (!F) + indicatePessimisticFixpoint(); + } + + /// See AbstractAttribute::updateImpl(...). + ChangeStatus updateImpl(Attributor &A) override { + // TODO: Once we have call site specific value information we can provide + // call site specific liveness information and then it makes + // sense to specialize attributes for call sites arguments instead of + // redirecting requests to the callee argument. + Function *F = getAssociatedFunction(); + const IRPosition &FnPos = IRPosition::function(*F); + auto &FnAA = A.getAAFor(*this, FnPos); + return clampStateAndIndicateChange( + getState(), + static_cast(FnAA.getState())); + } + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { STATS_DECLTRACK_CS_ATTR(nounwind); } +}; /// --------------------- Function Return Values ------------------------------- @@ -434,68 +856,48 @@ ChangeStatus AANoUnwindFunction::updateImpl(Attributor &A) { /// /// If there is a unique returned value R, the manifest method will: /// - mark R with the "returned" attribute, if R is an argument. -class AAReturnedValuesImpl final : public AAReturnedValues, AbstractState { +class AAReturnedValuesImpl : public AAReturnedValues, public AbstractState { /// Mapping of values potentially returned by the associated function to the /// return instructions that might return them. - DenseMap> ReturnedValues; + MapVector> ReturnedValues; + + /// Mapping to remember the number of returned values for a call site such + /// that we can avoid updates if nothing changed. + DenseMap NumReturnedValuesPerKnownAA; + + /// Set of unresolved calls returned by the associated function. + SmallSetVector UnresolvedCalls; /// State flags /// ///{ - bool IsFixed; - bool IsValidState; - bool HasOverdefinedReturnedCalls; + bool IsFixed = false; + bool IsValidState = true; ///} - /// Collect values that could become \p V in the set \p Values, each mapped to - /// \p ReturnInsts. 
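// ---------------------------------------------------------------------------
// Both the removed and the new initialize() above fetch a precomputed map from
// instruction opcode to instructions (getOpcodeInstMapForFunction) instead of
// rescanning the function for every attribute. The standalone sketch below
// models that cache; Instr, OpcodeInstMap and buildOpcodeInstMap are
// illustrative stand-ins, not the actual InformationCache interface.
#include <map>
#include <vector>

struct Instr {
  unsigned Opcode = 0; // stand-in for llvm::Instruction::getOpcode()
};

using OpcodeInstMap = std::map<unsigned, std::vector<const Instr *>>;

// Walk the function body once and bucket instructions by opcode; a deduction
// such as AAReturnedValues then only inspects the buckets it cares about
// (e.g. the return instructions).
static OpcodeInstMap buildOpcodeInstMap(const std::vector<Instr> &Body) {
  OpcodeInstMap Map;
  for (const Instr &I : Body)
    Map[I.Opcode].push_back(&I);
  return Map;
}
// ---------------------------------------------------------------------------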
- void collectValuesRecursively( - Attributor &A, Value *V, SmallPtrSetImpl &ReturnInsts, - DenseMap> &Values) { - - visitValueCB_t VisitValueCB = [&](Value *Val, bool &) { - assert(!isa(Val) || - &getAnchorScope() == cast(Val)->getFunction()); - Values[Val].insert(ReturnInsts.begin(), ReturnInsts.end()); - }; - - bool UnusedBool; - bool Success = genericValueTraversal(V, UnusedBool, VisitValueCB); - - // If we did abort the above traversal we haven't see all the values. - // Consequently, we cannot know if the information we would derive is - // accurate so we give up early. - if (!Success) - indicatePessimisticFixpoint(); - } - public: - /// See AbstractAttribute::AbstractAttribute(...). - AAReturnedValuesImpl(Function &F, InformationCache &InfoCache) - : AAReturnedValues(F, InfoCache) { - // We do not have an associated argument yet. - AssociatedVal = nullptr; - } + AAReturnedValuesImpl(const IRPosition &IRP) : AAReturnedValues(IRP) {} /// See AbstractAttribute::initialize(...). void initialize(Attributor &A) override { // Reset the state. - AssociatedVal = nullptr; IsFixed = false; IsValidState = true; - HasOverdefinedReturnedCalls = false; ReturnedValues.clear(); - Function &F = cast(getAnchoredValue()); + Function *F = getAssociatedFunction(); + if (!F) { + indicatePessimisticFixpoint(); + return; + } // The map from instruction opcodes to those instructions in the function. - auto &OpcodeInstMap = InfoCache.getOpcodeInstMapForFunction(F); + auto &OpcodeInstMap = A.getInfoCache().getOpcodeInstMapForFunction(*F); // Look through all arguments, if one is marked as returned we are done. - for (Argument &Arg : F.args()) { + for (Argument &Arg : F->args()) { if (Arg.hasReturnedAttr()) { - auto &ReturnInstSet = ReturnedValues[&Arg]; for (Instruction *RI : OpcodeInstMap[Instruction::Ret]) ReturnInstSet.insert(cast(RI)); @@ -505,13 +907,8 @@ public: } } - // If no argument was marked as returned we look at all return instructions - // and collect potentially returned values. - for (Instruction *RI : OpcodeInstMap[Instruction::Ret]) { - SmallPtrSet RISet({cast(RI)}); - collectValuesRecursively(A, cast(RI)->getReturnValue(), RISet, - ReturnedValues); - } + if (!F->hasExactDefinition()) + indicatePessimisticFixpoint(); } /// See AbstractAttribute::manifest(...). @@ -523,25 +920,35 @@ public: /// See AbstractAttribute::getState(...). const AbstractState &getState() const override { return *this; } - /// See AbstractAttribute::getManifestPosition(). - ManifestPosition getManifestPosition() const override { return MP_ARGUMENT; } - /// See AbstractAttribute::updateImpl(Attributor &A). ChangeStatus updateImpl(Attributor &A) override; + llvm::iterator_range returned_values() override { + return llvm::make_range(ReturnedValues.begin(), ReturnedValues.end()); + } + + llvm::iterator_range returned_values() const override { + return llvm::make_range(ReturnedValues.begin(), ReturnedValues.end()); + } + + const SmallSetVector &getUnresolvedCalls() const override { + return UnresolvedCalls; + } + /// Return the number of potential return values, -1 if unknown. - size_t getNumReturnValues() const { + size_t getNumReturnValues() const override { return isValidState() ? ReturnedValues.size() : -1; } /// Return an assumed unique return value if a single candidate is found. If /// there cannot be one, return a nullptr. If it is not clear yet, return the /// Optional::NoneType. 
- Optional getAssumedUniqueReturnValue() const; + Optional getAssumedUniqueReturnValue(Attributor &A) const; - /// See AbstractState::checkForallReturnedValues(...). - bool - checkForallReturnedValues(std::function &Pred) const override; + /// See AbstractState::checkForAllReturnedValues(...). + bool checkForAllReturnedValuesAndReturnInsts( + const function_ref &)> + &Pred) const override; /// Pretty print the attribute similar to the IR representation. const std::string getAsStr() const override; @@ -553,13 +960,15 @@ public: bool isValidState() const override { return IsValidState; } /// See AbstractState::indicateOptimisticFixpoint(...). - void indicateOptimisticFixpoint() override { + ChangeStatus indicateOptimisticFixpoint() override { IsFixed = true; - IsValidState &= true; + return ChangeStatus::UNCHANGED; } - void indicatePessimisticFixpoint() override { + + ChangeStatus indicatePessimisticFixpoint() override { IsFixed = true; IsValidState = false; + return ChangeStatus::CHANGED; } }; @@ -568,21 +977,52 @@ ChangeStatus AAReturnedValuesImpl::manifest(Attributor &A) { // Bookkeeping. assert(isValidState()); - NumFnKnownReturns++; + STATS_DECLTRACK(KnownReturnValues, FunctionReturn, + "Number of function with known return values"); // Check if we have an assumed unique return value that we could manifest. - Optional UniqueRV = getAssumedUniqueReturnValue(); + Optional UniqueRV = getAssumedUniqueReturnValue(A); if (!UniqueRV.hasValue() || !UniqueRV.getValue()) return Changed; // Bookkeeping. - NumFnUniqueReturned++; + STATS_DECLTRACK(UniqueReturnValue, FunctionReturn, + "Number of function with unique return"); + + // Callback to replace the uses of CB with the constant C. + auto ReplaceCallSiteUsersWith = [](CallBase &CB, Constant &C) { + if (CB.getNumUses() == 0 || CB.isMustTailCall()) + return ChangeStatus::UNCHANGED; + CB.replaceAllUsesWith(&C); + return ChangeStatus::CHANGED; + }; // If the assumed unique return value is an argument, annotate it. if (auto *UniqueRVArg = dyn_cast(UniqueRV.getValue())) { - AssociatedVal = UniqueRVArg; - Changed = AbstractAttribute::manifest(A) | Changed; + getIRPosition() = IRPosition::argument(*UniqueRVArg); + Changed = IRAttribute::manifest(A); + } else if (auto *RVC = dyn_cast(UniqueRV.getValue())) { + // We can replace the returned value with the unique returned constant. + Value &AnchorValue = getAnchorValue(); + if (Function *F = dyn_cast(&AnchorValue)) { + for (const Use &U : F->uses()) + if (CallBase *CB = dyn_cast(U.getUser())) + if (CB->isCallee(&U)) { + Constant *RVCCast = + ConstantExpr::getTruncOrBitCast(RVC, CB->getType()); + Changed = ReplaceCallSiteUsersWith(*CB, *RVCCast) | Changed; + } + } else { + assert(isa(AnchorValue) && + "Expcected a function or call base anchor!"); + Constant *RVCCast = + ConstantExpr::getTruncOrBitCast(RVC, AnchorValue.getType()); + Changed = ReplaceCallSiteUsersWith(cast(AnchorValue), *RVCCast); + } + if (Changed == ChangeStatus::CHANGED) + STATS_DECLTRACK(UniqueConstantReturnValue, FunctionReturn, + "Number of function returns replaced by constant return"); } return Changed; @@ -590,18 +1030,20 @@ ChangeStatus AAReturnedValuesImpl::manifest(Attributor &A) { const std::string AAReturnedValuesImpl::getAsStr() const { return (isAtFixpoint() ? "returns(#" : "may-return(#") + - (isValidState() ? std::to_string(getNumReturnValues()) : "?") + ")"; + (isValidState() ? 
std::to_string(getNumReturnValues()) : "?") + + ")[#UC: " + std::to_string(UnresolvedCalls.size()) + "]"; } -Optional AAReturnedValuesImpl::getAssumedUniqueReturnValue() const { - // If checkForallReturnedValues provides a unique value, ignoring potential +Optional +AAReturnedValuesImpl::getAssumedUniqueReturnValue(Attributor &A) const { + // If checkForAllReturnedValues provides a unique value, ignoring potential // undef values that can also be present, it is assumed to be the actual // return value and forwarded to the caller of this method. If there are // multiple, a nullptr is returned indicating there cannot be a unique // returned value. Optional UniqueRV; - std::function Pred = [&](Value &RV) -> bool { + auto Pred = [&](Value &RV) -> bool { // If we found a second returned value and neither the current nor the saved // one is an undef, there is no unique returned value. Undefs are special // since we can pretend they have any value. @@ -618,14 +1060,15 @@ Optional AAReturnedValuesImpl::getAssumedUniqueReturnValue() const { return true; }; - if (!checkForallReturnedValues(Pred)) + if (!A.checkForAllReturnedValues(Pred, *this)) UniqueRV = nullptr; return UniqueRV; } -bool AAReturnedValuesImpl::checkForallReturnedValues( - std::function &Pred) const { +bool AAReturnedValuesImpl::checkForAllReturnedValuesAndReturnInsts( + const function_ref &)> + &Pred) const { if (!isValidState()) return false; @@ -634,11 +1077,11 @@ bool AAReturnedValuesImpl::checkForallReturnedValues( for (auto &It : ReturnedValues) { Value *RV = It.first; - ImmutableCallSite ICS(RV); - if (ICS && !HasOverdefinedReturnedCalls) + CallBase *CB = dyn_cast(RV); + if (CB && !UnresolvedCalls.count(CB)) continue; - if (!Pred(*RV)) + if (!Pred(*RV, It.second)) return false; } @@ -646,125 +1089,196 @@ bool AAReturnedValuesImpl::checkForallReturnedValues( } ChangeStatus AAReturnedValuesImpl::updateImpl(Attributor &A) { + size_t NumUnresolvedCalls = UnresolvedCalls.size(); + bool Changed = false; + + // State used in the value traversals starting in returned values. + struct RVState { + // The map in which we collect return values -> return instrs. + decltype(ReturnedValues) &RetValsMap; + // The flag to indicate a change. + bool &Changed; + // The return instrs we come from. + SmallSetVector RetInsts; + }; - // Check if we know of any values returned by the associated function, - // if not, we are done. - if (getNumReturnValues() == 0) { - indicateOptimisticFixpoint(); - return ChangeStatus::UNCHANGED; - } + // Callback for a leaf value returned by the associated function. + auto VisitValueCB = [](Value &Val, RVState &RVS, bool) -> bool { + auto Size = RVS.RetValsMap[&Val].size(); + RVS.RetValsMap[&Val].insert(RVS.RetInsts.begin(), RVS.RetInsts.end()); + bool Inserted = RVS.RetValsMap[&Val].size() != Size; + RVS.Changed |= Inserted; + LLVM_DEBUG({ + if (Inserted) + dbgs() << "[AAReturnedValues] 1 Add new returned value " << Val + << " => " << RVS.RetInsts.size() << "\n"; + }); + return true; + }; - // Check if any of the returned values is a call site we can refine. - decltype(ReturnedValues) AddRVs; - bool HasCallSite = false; + // Helper method to invoke the generic value traversal. + auto VisitReturnedValue = [&](Value &RV, RVState &RVS) { + IRPosition RetValPos = IRPosition::value(RV); + return genericValueTraversal(A, RetValPos, *this, + RVS, VisitValueCB); + }; - // Look at all returned call sites. 
- for (auto &It : ReturnedValues) { - SmallPtrSet &ReturnInsts = It.second; - Value *RV = It.first; - LLVM_DEBUG(dbgs() << "[AAReturnedValues] Potentially returned value " << *RV - << "\n"); + // Callback for all "return intructions" live in the associated function. + auto CheckReturnInst = [this, &VisitReturnedValue, &Changed](Instruction &I) { + ReturnInst &Ret = cast(I); + RVState RVS({ReturnedValues, Changed, {}}); + RVS.RetInsts.insert(&Ret); + return VisitReturnedValue(*Ret.getReturnValue(), RVS); + }; - // Only call sites can change during an update, ignore the rest. - CallSite RetCS(RV); - if (!RetCS) + // Start by discovering returned values from all live returned instructions in + // the associated function. + if (!A.checkForAllInstructions(CheckReturnInst, *this, {Instruction::Ret})) + return indicatePessimisticFixpoint(); + + // Once returned values "directly" present in the code are handled we try to + // resolve returned calls. + decltype(ReturnedValues) NewRVsMap; + for (auto &It : ReturnedValues) { + LLVM_DEBUG(dbgs() << "[AAReturnedValues] Returned value: " << *It.first + << " by #" << It.second.size() << " RIs\n"); + CallBase *CB = dyn_cast(It.first); + if (!CB || UnresolvedCalls.count(CB)) continue; - // For now, any call site we see will prevent us from directly fixing the - // state. However, if the information on the callees is fixed, the call - // sites will be removed and we will fix the information for this state. - HasCallSite = true; - - // Try to find a assumed unique return value for the called function. - auto *RetCSAA = A.getAAFor(*this, *RV); - if (!RetCSAA) { - HasOverdefinedReturnedCalls = true; - LLVM_DEBUG(dbgs() << "[AAReturnedValues] Returned call site (" << *RV - << ") with " << (RetCSAA ? "invalid" : "no") - << " associated state\n"); + if (!CB->getCalledFunction()) { + LLVM_DEBUG(dbgs() << "[AAReturnedValues] Unresolved call: " << *CB + << "\n"); + UnresolvedCalls.insert(CB); continue; } - // Try to find a assumed unique return value for the called function. - Optional AssumedUniqueRV = RetCSAA->getAssumedUniqueReturnValue(); + // TODO: use the function scope once we have call site AAReturnedValues. + const auto &RetValAA = A.getAAFor( + *this, IRPosition::function(*CB->getCalledFunction())); + LLVM_DEBUG(dbgs() << "[AAReturnedValues] Found another AAReturnedValues: " + << static_cast(RetValAA) + << "\n"); - // If no assumed unique return value was found due to the lack of - // candidates, we may need to resolve more calls (through more update - // iterations) or the called function will not return. Either way, we simply - // stick with the call sites as return values. Because there were not - // multiple possibilities, we do not treat it as overdefined. - if (!AssumedUniqueRV.hasValue()) + // Skip dead ends, thus if we do not know anything about the returned + // call we mark it as unresolved and it will stay that way. + if (!RetValAA.getState().isValidState()) { + LLVM_DEBUG(dbgs() << "[AAReturnedValues] Unresolved call: " << *CB + << "\n"); + UnresolvedCalls.insert(CB); continue; + } - // If multiple, non-refinable values were found, there cannot be a unique - // return value for the called function. The returned call is overdefined! - if (!AssumedUniqueRV.getValue()) { - HasOverdefinedReturnedCalls = true; - LLVM_DEBUG(dbgs() << "[AAReturnedValues] Returned call site has multiple " - "potentially returned values\n"); + // Do not try to learn partial information. 
If the callee has unresolved + // return values we will treat the call as unresolved/opaque. + auto &RetValAAUnresolvedCalls = RetValAA.getUnresolvedCalls(); + if (!RetValAAUnresolvedCalls.empty()) { + UnresolvedCalls.insert(CB); continue; } - LLVM_DEBUG({ - bool UniqueRVIsKnown = RetCSAA->isAtFixpoint(); - dbgs() << "[AAReturnedValues] Returned call site " - << (UniqueRVIsKnown ? "known" : "assumed") - << " unique return value: " << *AssumedUniqueRV << "\n"; - }); - - // The assumed unique return value. - Value *AssumedRetVal = AssumedUniqueRV.getValue(); - - // If the assumed unique return value is an argument, lookup the matching - // call site operand and recursively collect new returned values. - // If it is not an argument, it is just put into the set of returned values - // as we would have already looked through casts, phis, and similar values. - if (Argument *AssumedRetArg = dyn_cast(AssumedRetVal)) - collectValuesRecursively(A, - RetCS.getArgOperand(AssumedRetArg->getArgNo()), - ReturnInsts, AddRVs); - else - AddRVs[AssumedRetVal].insert(ReturnInsts.begin(), ReturnInsts.end()); - } + // Now check if we can track transitively returned values. If possible, thus + // if all return value can be represented in the current scope, do so. + bool Unresolved = false; + for (auto &RetValAAIt : RetValAA.returned_values()) { + Value *RetVal = RetValAAIt.first; + if (isa(RetVal) || isa(RetVal) || + isa(RetVal)) + continue; + // Anything that did not fit in the above categories cannot be resolved, + // mark the call as unresolved. + LLVM_DEBUG(dbgs() << "[AAReturnedValues] transitively returned value " + "cannot be translated: " + << *RetVal << "\n"); + UnresolvedCalls.insert(CB); + Unresolved = true; + break; + } - // Keep track of any change to trigger updates on dependent attributes. - ChangeStatus Changed = ChangeStatus::UNCHANGED; + if (Unresolved) + continue; - for (auto &It : AddRVs) { - assert(!It.second.empty() && "Entry does not add anything."); - auto &ReturnInsts = ReturnedValues[It.first]; - for (ReturnInst *RI : It.second) - if (ReturnInsts.insert(RI).second) { - LLVM_DEBUG(dbgs() << "[AAReturnedValues] Add new returned value " - << *It.first << " => " << *RI << "\n"); - Changed = ChangeStatus::CHANGED; + // Now track transitively returned values. + unsigned &NumRetAA = NumReturnedValuesPerKnownAA[CB]; + if (NumRetAA == RetValAA.getNumReturnValues()) { + LLVM_DEBUG(dbgs() << "[AAReturnedValues] Skip call as it has not " + "changed since it was seen last\n"); + continue; + } + NumRetAA = RetValAA.getNumReturnValues(); + + for (auto &RetValAAIt : RetValAA.returned_values()) { + Value *RetVal = RetValAAIt.first; + if (Argument *Arg = dyn_cast(RetVal)) { + // Arguments are mapped to call site operands and we begin the traversal + // again. + bool Unused = false; + RVState RVS({NewRVsMap, Unused, RetValAAIt.second}); + VisitReturnedValue(*CB->getArgOperand(Arg->getArgNo()), RVS); + continue; + } else if (isa(RetVal)) { + // Call sites are resolved by the callee attribute over time, no need to + // do anything for us. + continue; + } else if (isa(RetVal)) { + // Constants are valid everywhere, we can simply take them. + NewRVsMap[RetVal].insert(It.second.begin(), It.second.end()); + continue; } + } } - // If there is no call site in the returned values we are done. 
- if (!HasCallSite) { - indicateOptimisticFixpoint(); - return ChangeStatus::CHANGED; + // To avoid modifications to the ReturnedValues map while we iterate over it + // we kept record of potential new entries in a copy map, NewRVsMap. + for (auto &It : NewRVsMap) { + assert(!It.second.empty() && "Entry does not add anything."); + auto &ReturnInsts = ReturnedValues[It.first]; + for (ReturnInst *RI : It.second) + if (ReturnInsts.insert(RI)) { + LLVM_DEBUG(dbgs() << "[AAReturnedValues] Add new returned value " + << *It.first << " => " << *RI << "\n"); + Changed = true; + } } - return Changed; + Changed |= (NumUnresolvedCalls != UnresolvedCalls.size()); + return Changed ? ChangeStatus::CHANGED : ChangeStatus::UNCHANGED; } -/// ------------------------ NoSync Function Attribute ------------------------- +struct AAReturnedValuesFunction final : public AAReturnedValuesImpl { + AAReturnedValuesFunction(const IRPosition &IRP) : AAReturnedValuesImpl(IRP) {} -struct AANoSyncFunction : AANoSync, BooleanState { + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { STATS_DECLTRACK_ARG_ATTR(returned) } +}; - AANoSyncFunction(Function &F, InformationCache &InfoCache) - : AANoSync(F, InfoCache) {} +/// Returned values information for a call sites. +struct AAReturnedValuesCallSite final : AAReturnedValuesImpl { + AAReturnedValuesCallSite(const IRPosition &IRP) : AAReturnedValuesImpl(IRP) {} - /// See AbstractAttribute::getState() - /// { - AbstractState &getState() override { return *this; } - const AbstractState &getState() const override { return *this; } - /// } + /// See AbstractAttribute::initialize(...). + void initialize(Attributor &A) override { + // TODO: Once we have call site specific value information we can provide + // call site specific liveness information and then it makes + // sense to specialize attributes for call sites instead of + // redirecting requests to the callee. + llvm_unreachable("Abstract attributes for returned values are not " + "supported for call sites yet!"); + } + + /// See AbstractAttribute::updateImpl(...). + ChangeStatus updateImpl(Attributor &A) override { + return indicatePessimisticFixpoint(); + } + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override {} +}; - /// See AbstractAttribute::getManifestPosition(). - ManifestPosition getManifestPosition() const override { return MP_FUNCTION; } +/// ------------------------ NoSync Function Attribute ------------------------- + +struct AANoSyncImpl : AANoSync { + AANoSyncImpl(const IRPosition &IRP) : AANoSync(IRP) {} const std::string getAsStr() const override { return getAssumed() ? "nosync" : "may-sync"; @@ -773,12 +1287,6 @@ struct AANoSyncFunction : AANoSync, BooleanState { /// See AbstractAttribute::updateImpl(...). ChangeStatus updateImpl(Attributor &A) override; - /// See AANoSync::isAssumedNoSync() - bool isAssumedNoSync() const override { return getAssumed(); } - - /// See AANoSync::isKnownNoSync() - bool isKnownNoSync() const override { return getKnown(); } - /// Helper function used to determine whether an instruction is non-relaxed /// atomic. 
In other words, if an atomic instruction does not have unordered /// or monotonic ordering @@ -792,7 +1300,7 @@ struct AANoSyncFunction : AANoSync, BooleanState { static bool isNoSyncIntrinsic(Instruction *I); }; -bool AANoSyncFunction::isNonRelaxedAtomic(Instruction *I) { +bool AANoSyncImpl::isNonRelaxedAtomic(Instruction *I) { if (!I->isAtomic()) return false; @@ -841,7 +1349,7 @@ bool AANoSyncFunction::isNonRelaxedAtomic(Instruction *I) { /// Checks if an intrinsic is nosync. Currently only checks mem* intrinsics. /// FIXME: We should ipmrove the handling of intrinsics. -bool AANoSyncFunction::isNoSyncIntrinsic(Instruction *I) { +bool AANoSyncImpl::isNoSyncIntrinsic(Instruction *I) { if (auto *II = dyn_cast(I)) { switch (II->getIntrinsicID()) { /// Element wise atomic memory intrinsics are can only be unordered, @@ -863,7 +1371,7 @@ bool AANoSyncFunction::isNoSyncIntrinsic(Instruction *I) { return false; } -bool AANoSyncFunction::isVolatile(Instruction *I) { +bool AANoSyncImpl::isVolatile(Instruction *I) { assert(!ImmutableCallSite(I) && !isa(I) && "Calls should not be checked here"); @@ -881,482 +1389,3074 @@ bool AANoSyncFunction::isVolatile(Instruction *I) { } } -ChangeStatus AANoSyncFunction::updateImpl(Attributor &A) { - Function &F = getAnchorScope(); +ChangeStatus AANoSyncImpl::updateImpl(Attributor &A) { - /// We are looking for volatile instructions or Non-Relaxed atomics. - /// FIXME: We should ipmrove the handling of intrinsics. - for (Instruction *I : InfoCache.getReadOrWriteInstsForFunction(F)) { - ImmutableCallSite ICS(I); - auto *NoSyncAA = A.getAAFor(*this, *I); + auto CheckRWInstForNoSync = [&](Instruction &I) { + /// We are looking for volatile instructions or Non-Relaxed atomics. + /// FIXME: We should ipmrove the handling of intrinsics. - if (isa(I) && isNoSyncIntrinsic(I)) - continue; + if (isa(&I) && isNoSyncIntrinsic(&I)) + return true; - if (ICS && (!NoSyncAA || !NoSyncAA->isAssumedNoSync()) && - !ICS.hasFnAttr(Attribute::NoSync)) { - indicatePessimisticFixpoint(); - return ChangeStatus::CHANGED; + if (ImmutableCallSite ICS = ImmutableCallSite(&I)) { + if (ICS.hasFnAttr(Attribute::NoSync)) + return true; + + const auto &NoSyncAA = + A.getAAFor(*this, IRPosition::callsite_function(ICS)); + if (NoSyncAA.isAssumedNoSync()) + return true; + return false; } - if (ICS) - continue; + if (!isVolatile(&I) && !isNonRelaxedAtomic(&I)) + return true; - if (!isVolatile(I) && !isNonRelaxedAtomic(I)) - continue; + return false; + }; - indicatePessimisticFixpoint(); - return ChangeStatus::CHANGED; - } + auto CheckForNoSync = [&](Instruction &I) { + // At this point we handled all read/write effects and they are all + // nosync, so they can be skipped. + if (I.mayReadOrWriteMemory()) + return true; - auto &OpcodeInstMap = InfoCache.getOpcodeInstMapForFunction(F); - auto Opcodes = {(unsigned)Instruction::Invoke, (unsigned)Instruction::CallBr, - (unsigned)Instruction::Call}; + // non-convergent and readnone imply nosync. + return !ImmutableCallSite(&I).isConvergent(); + }; - for (unsigned Opcode : Opcodes) { - for (Instruction *I : OpcodeInstMap[Opcode]) { - // At this point we handled all read/write effects and they are all - // nosync, so they can be skipped. 
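// ---------------------------------------------------------------------------
// As the comment on isNonRelaxedAtomic() explains, the nosync deduction treats
// an atomic access as a potential synchronization point unless its ordering is
// relaxed, i.e. unordered or monotonic. The standalone sketch below restates
// that classification; the Ordering enum and isNonRelaxedAtomicOrdering are
// simplified stand-ins for llvm::AtomicOrdering and the member function above.
enum class Ordering {
  NotAtomic,
  Unordered,
  Monotonic,
  Acquire,
  Release,
  AcquireRelease,
  SequentiallyConsistent
};

// Only orderings stronger than monotonic establish happens-before edges with
// other threads, so only they block the nosync deduction.
static bool isNonRelaxedAtomicOrdering(Ordering O) {
  switch (O) {
  case Ordering::NotAtomic:
  case Ordering::Unordered:
  case Ordering::Monotonic:
    return false;
  default:
    return true;
  }
}
// ---------------------------------------------------------------------------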
- if (I->mayReadOrWriteMemory()) - continue; + if (!A.checkForAllReadWriteInstructions(CheckRWInstForNoSync, *this) || + !A.checkForAllCallLikeInstructions(CheckForNoSync, *this)) + return indicatePessimisticFixpoint(); - ImmutableCallSite ICS(I); + return ChangeStatus::UNCHANGED; +} - // non-convergent and readnone imply nosync. - if (!ICS.isConvergent()) - continue; +struct AANoSyncFunction final : public AANoSyncImpl { + AANoSyncFunction(const IRPosition &IRP) : AANoSyncImpl(IRP) {} + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { STATS_DECLTRACK_FN_ATTR(nosync) } +}; + +/// NoSync attribute deduction for a call sites. +struct AANoSyncCallSite final : AANoSyncImpl { + AANoSyncCallSite(const IRPosition &IRP) : AANoSyncImpl(IRP) {} + /// See AbstractAttribute::initialize(...). + void initialize(Attributor &A) override { + AANoSyncImpl::initialize(A); + Function *F = getAssociatedFunction(); + if (!F) indicatePessimisticFixpoint(); - return ChangeStatus::CHANGED; - } } - return ChangeStatus::UNCHANGED; -} + /// See AbstractAttribute::updateImpl(...). + ChangeStatus updateImpl(Attributor &A) override { + // TODO: Once we have call site specific value information we can provide + // call site specific liveness information and then it makes + // sense to specialize attributes for call sites arguments instead of + // redirecting requests to the callee argument. + Function *F = getAssociatedFunction(); + const IRPosition &FnPos = IRPosition::function(*F); + auto &FnAA = A.getAAFor(*this, FnPos); + return clampStateAndIndicateChange( + getState(), static_cast(FnAA.getState())); + } + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { STATS_DECLTRACK_CS_ATTR(nosync); } +}; /// ------------------------ No-Free Attributes ---------------------------- -struct AANoFreeFunction : AbstractAttribute, BooleanState { +struct AANoFreeImpl : public AANoFree { + AANoFreeImpl(const IRPosition &IRP) : AANoFree(IRP) {} - /// See AbstractAttribute::AbstractAttribute(...). - AANoFreeFunction(Function &F, InformationCache &InfoCache) - : AbstractAttribute(F, InfoCache) {} + /// See AbstractAttribute::updateImpl(...). + ChangeStatus updateImpl(Attributor &A) override { + auto CheckForNoFree = [&](Instruction &I) { + ImmutableCallSite ICS(&I); + if (ICS.hasFnAttr(Attribute::NoFree)) + return true; - /// See AbstractAttribute::getState() - ///{ - AbstractState &getState() override { return *this; } - const AbstractState &getState() const override { return *this; } - ///} + const auto &NoFreeAA = + A.getAAFor(*this, IRPosition::callsite_function(ICS)); + return NoFreeAA.isAssumedNoFree(); + }; - /// See AbstractAttribute::getManifestPosition(). - ManifestPosition getManifestPosition() const override { return MP_FUNCTION; } + if (!A.checkForAllCallLikeInstructions(CheckForNoFree, *this)) + return indicatePessimisticFixpoint(); + return ChangeStatus::UNCHANGED; + } /// See AbstractAttribute::getAsStr(). const std::string getAsStr() const override { return getAssumed() ? "nofree" : "may-free"; } +}; - /// See AbstractAttribute::updateImpl(...). - ChangeStatus updateImpl(Attributor &A) override; +struct AANoFreeFunction final : public AANoFreeImpl { + AANoFreeFunction(const IRPosition &IRP) : AANoFreeImpl(IRP) {} - /// See AbstractAttribute::getAttrKind(). 
- Attribute::AttrKind getAttrKind() const override { return ID; } + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { STATS_DECLTRACK_FN_ATTR(nofree) } +}; - /// Return true if "nofree" is assumed. - bool isAssumedNoFree() const { return getAssumed(); } +/// NoFree attribute deduction for a call sites. +struct AANoFreeCallSite final : AANoFreeImpl { + AANoFreeCallSite(const IRPosition &IRP) : AANoFreeImpl(IRP) {} - /// Return true if "nofree" is known. - bool isKnownNoFree() const { return getKnown(); } + /// See AbstractAttribute::initialize(...). + void initialize(Attributor &A) override { + AANoFreeImpl::initialize(A); + Function *F = getAssociatedFunction(); + if (!F) + indicatePessimisticFixpoint(); + } - /// The identifier used by the Attributor for this class of attributes. - static constexpr Attribute::AttrKind ID = Attribute::NoFree; -}; + /// See AbstractAttribute::updateImpl(...). + ChangeStatus updateImpl(Attributor &A) override { + // TODO: Once we have call site specific value information we can provide + // call site specific liveness information and then it makes + // sense to specialize attributes for call sites arguments instead of + // redirecting requests to the callee argument. + Function *F = getAssociatedFunction(); + const IRPosition &FnPos = IRPosition::function(*F); + auto &FnAA = A.getAAFor(*this, FnPos); + return clampStateAndIndicateChange( + getState(), static_cast(FnAA.getState())); + } -ChangeStatus AANoFreeFunction::updateImpl(Attributor &A) { - Function &F = getAnchorScope(); + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { STATS_DECLTRACK_CS_ATTR(nofree); } +}; - // The map from instruction opcodes to those instructions in the function. - auto &OpcodeInstMap = InfoCache.getOpcodeInstMapForFunction(F); +/// ------------------------ NonNull Argument Attribute ------------------------ +static int64_t getKnownNonNullAndDerefBytesForUse( + Attributor &A, AbstractAttribute &QueryingAA, Value &AssociatedValue, + const Use *U, const Instruction *I, bool &IsNonNull, bool &TrackUse) { + TrackUse = false; + + const Value *UseV = U->get(); + if (!UseV->getType()->isPointerTy()) + return 0; + + Type *PtrTy = UseV->getType(); + const Function *F = I->getFunction(); + bool NullPointerIsDefined = + F ? 
llvm::NullPointerIsDefined(F, PtrTy->getPointerAddressSpace()) : true; + const DataLayout &DL = A.getInfoCache().getDL(); + if (ImmutableCallSite ICS = ImmutableCallSite(I)) { + if (ICS.isBundleOperand(U)) + return 0; + + if (ICS.isCallee(U)) { + IsNonNull |= !NullPointerIsDefined; + return 0; + } - for (unsigned Opcode : - {(unsigned)Instruction::Invoke, (unsigned)Instruction::CallBr, - (unsigned)Instruction::Call}) { - for (Instruction *I : OpcodeInstMap[Opcode]) { + unsigned ArgNo = ICS.getArgumentNo(U); + IRPosition IRP = IRPosition::callsite_argument(ICS, ArgNo); + auto &DerefAA = A.getAAFor(QueryingAA, IRP); + IsNonNull |= DerefAA.isKnownNonNull(); + return DerefAA.getKnownDereferenceableBytes(); + } - auto ICS = ImmutableCallSite(I); - auto *NoFreeAA = A.getAAFor(*this, *I); + int64_t Offset; + if (const Value *Base = getBasePointerOfAccessPointerOperand(I, Offset, DL)) { + if (Base == &AssociatedValue && getPointerOperand(I) == UseV) { + int64_t DerefBytes = + Offset + (int64_t)DL.getTypeStoreSize(PtrTy->getPointerElementType()); - if ((!NoFreeAA || !NoFreeAA->isAssumedNoFree()) && - !ICS.hasFnAttr(Attribute::NoFree)) { - indicatePessimisticFixpoint(); - return ChangeStatus::CHANGED; - } + IsNonNull |= !NullPointerIsDefined; + return DerefBytes; } } - return ChangeStatus::UNCHANGED; -} + if (const Value *Base = + GetPointerBaseWithConstantOffset(UseV, Offset, DL, + /*AllowNonInbounds*/ false)) { + auto &DerefAA = + A.getAAFor(QueryingAA, IRPosition::value(*Base)); + IsNonNull |= (!NullPointerIsDefined && DerefAA.isKnownNonNull()); + IsNonNull |= (!NullPointerIsDefined && (Offset != 0)); + int64_t DerefBytes = DerefAA.getKnownDereferenceableBytes(); + return std::max(int64_t(0), DerefBytes - Offset); + } -/// ------------------------ NonNull Argument Attribute ------------------------ -struct AANonNullImpl : AANonNull, BooleanState { + return 0; +} - AANonNullImpl(Value &V, InformationCache &InfoCache) - : AANonNull(V, InfoCache) {} +struct AANonNullImpl : AANonNull { + AANonNullImpl(const IRPosition &IRP) + : AANonNull(IRP), + NullIsDefined(NullPointerIsDefined( + getAnchorScope(), + getAssociatedValue().getType()->getPointerAddressSpace())) {} - AANonNullImpl(Value *AssociatedVal, Value &AnchoredValue, - InformationCache &InfoCache) - : AANonNull(AssociatedVal, AnchoredValue, InfoCache) {} + /// See AbstractAttribute::initialize(...). + void initialize(Attributor &A) override { + if (!NullIsDefined && + hasAttr({Attribute::NonNull, Attribute::Dereferenceable})) + indicateOptimisticFixpoint(); + else + AANonNull::initialize(A); + } - /// See AbstractAttribute::getState() - /// { - AbstractState &getState() override { return *this; } - const AbstractState &getState() const override { return *this; } - /// } + /// See AAFromMustBeExecutedContext + bool followUse(Attributor &A, const Use *U, const Instruction *I) { + bool IsNonNull = false; + bool TrackUse = false; + getKnownNonNullAndDerefBytesForUse(A, *this, getAssociatedValue(), U, I, + IsNonNull, TrackUse); + takeKnownMaximum(IsNonNull); + return TrackUse; + } /// See AbstractAttribute::getAsStr(). const std::string getAsStr() const override { return getAssumed() ? "nonnull" : "may-null"; } - /// See AANonNull::isAssumedNonNull(). - bool isAssumedNonNull() const override { return getAssumed(); } + /// Flag to determine if the underlying value can be null and still allow + /// valid accesses. + const bool NullIsDefined; +}; - /// See AANonNull::isKnownNonNull(). 
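// ---------------------------------------------------------------------------
// getKnownNonNullAndDerefBytesForUse() above turns a single memory access into
// facts about the accessed base pointer: if the use is the pointer operand of
// an access at a constant byte offset from the base, at least
// "offset + access size" bytes must be dereferenceable, and the base is also
// known non-null whenever null is not a usable address in that address space.
// The standalone sketch below restates that arithmetic, assuming a
// non-negative offset; AccessFacts and factsFromAccess are illustrative names,
// not part of this patch.
#include <cstdint>

struct AccessFacts {
  int64_t DerefBytes = 0; // bytes known dereferenceable from the base pointer
  bool NonNull = false;   // base pointer known to be non-null
};

static AccessFacts factsFromAccess(int64_t ByteOffset, int64_t AccessSize,
                                   bool NullPointerIsDefined) {
  AccessFacts Facts;
  // The access touches [ByteOffset, ByteOffset + AccessSize), so the base must
  // be dereferenceable at least that far.
  Facts.DerefBytes = ByteOffset + AccessSize;
  // A successful access in an address space where null is not usable also
  // proves the base pointer cannot be null.
  Facts.NonNull = !NullPointerIsDefined;
  return Facts;
}
// ---------------------------------------------------------------------------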
- bool isKnownNonNull() const override { return getKnown(); } +/// NonNull attribute for a floating value. +struct AANonNullFloating + : AAFromMustBeExecutedContext { + using Base = AAFromMustBeExecutedContext; + AANonNullFloating(const IRPosition &IRP) : Base(IRP) {} - /// Generate a predicate that checks if a given value is assumed nonnull. - /// The generated function returns true if a value satisfies any of - /// following conditions. - /// (i) A value is known nonZero(=nonnull). - /// (ii) A value is associated with AANonNull and its isAssumedNonNull() is - /// true. - std::function generatePredicate(Attributor &); -}; + /// See AbstractAttribute::initialize(...). + void initialize(Attributor &A) override { + Base::initialize(A); -std::function AANonNullImpl::generatePredicate(Attributor &A) { - // FIXME: The `AAReturnedValues` should provide the predicate with the - // `ReturnInst` vector as well such that we can use the control flow sensitive - // version of `isKnownNonZero`. This should fix `test11` in - // `test/Transforms/FunctionAttrs/nonnull.ll` + if (isAtFixpoint()) + return; - std::function Pred = [&](Value &RV) -> bool { - if (isKnownNonZero(&RV, getAnchorScope().getParent()->getDataLayout())) - return true; + const IRPosition &IRP = getIRPosition(); + const Value &V = IRP.getAssociatedValue(); + const DataLayout &DL = A.getDataLayout(); + + // TODO: This context sensitive query should be removed once we can do + // context sensitive queries in the genericValueTraversal below. + if (isKnownNonZero(&V, DL, 0, /* TODO: AC */ nullptr, IRP.getCtxI(), + /* TODO: DT */ nullptr)) + indicateOptimisticFixpoint(); + } - auto *NonNullAA = A.getAAFor(*this, RV); + /// See AbstractAttribute::updateImpl(...). + ChangeStatus updateImpl(Attributor &A) override { + ChangeStatus Change = Base::updateImpl(A); + if (isKnownNonNull()) + return Change; + + if (!NullIsDefined) { + const auto &DerefAA = A.getAAFor(*this, getIRPosition()); + if (DerefAA.getAssumedDereferenceableBytes()) + return Change; + } - ImmutableCallSite ICS(&RV); + const DataLayout &DL = A.getDataLayout(); + + auto VisitValueCB = [&](Value &V, AAAlign::StateType &T, + bool Stripped) -> bool { + const auto &AA = A.getAAFor(*this, IRPosition::value(V)); + if (!Stripped && this == &AA) { + if (!isKnownNonZero(&V, DL, 0, /* TODO: AC */ nullptr, + /* CtxI */ getCtxI(), + /* TODO: DT */ nullptr)) + T.indicatePessimisticFixpoint(); + } else { + // Use abstract attribute information. + const AANonNull::StateType &NS = + static_cast(AA.getState()); + T ^= NS; + } + return T.isValidState(); + }; - if ((!NonNullAA || !NonNullAA->isAssumedNonNull()) && - (!ICS || !ICS.hasRetAttr(Attribute::NonNull))) - return false; + StateType T; + if (!genericValueTraversal(A, getIRPosition(), *this, + T, VisitValueCB)) + return indicatePessimisticFixpoint(); - return true; - }; + return clampStateAndIndicateChange(getState(), T); + } - return Pred; -} + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { STATS_DECLTRACK_FNRET_ATTR(nonnull) } +}; /// NonNull attribute for function return value. 
-struct AANonNullReturned : AANonNullImpl { +struct AANonNullReturned final + : AAReturnedFromReturnedValues { + AANonNullReturned(const IRPosition &IRP) + : AAReturnedFromReturnedValues(IRP) {} - AANonNullReturned(Function &F, InformationCache &InfoCache) - : AANonNullImpl(F, InfoCache) {} + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { STATS_DECLTRACK_FNRET_ATTR(nonnull) } +}; - /// See AbstractAttribute::getManifestPosition(). - ManifestPosition getManifestPosition() const override { return MP_RETURNED; } +/// NonNull attribute for function argument. +struct AANonNullArgument final + : AAArgumentFromCallSiteArgumentsAndMustBeExecutedContext { + AANonNullArgument(const IRPosition &IRP) + : AAArgumentFromCallSiteArgumentsAndMustBeExecutedContext( + IRP) {} + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { STATS_DECLTRACK_ARG_ATTR(nonnull) } +}; - /// See AbstractAttriubute::initialize(...). - void initialize(Attributor &A) override { - Function &F = getAnchorScope(); +struct AANonNullCallSiteArgument final : AANonNullFloating { + AANonNullCallSiteArgument(const IRPosition &IRP) : AANonNullFloating(IRP) {} - // Already nonnull. - if (F.getAttributes().hasAttribute(AttributeList::ReturnIndex, - Attribute::NonNull)) - indicateOptimisticFixpoint(); + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { STATS_DECLTRACK_CSARG_ATTR(nonnull) } +}; + +/// NonNull attribute for a call site return position. +struct AANonNullCallSiteReturned final + : AACallSiteReturnedFromReturnedAndMustBeExecutedContext { + AANonNullCallSiteReturned(const IRPosition &IRP) + : AACallSiteReturnedFromReturnedAndMustBeExecutedContext( + IRP) {} + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { STATS_DECLTRACK_CSRET_ATTR(nonnull) } +}; + +/// ------------------------ No-Recurse Attributes ---------------------------- + +struct AANoRecurseImpl : public AANoRecurse { + AANoRecurseImpl(const IRPosition &IRP) : AANoRecurse(IRP) {} + + /// See AbstractAttribute::getAsStr() + const std::string getAsStr() const override { + return getAssumed() ? "norecurse" : "may-recurse"; + } +}; + +struct AANoRecurseFunction final : AANoRecurseImpl { + AANoRecurseFunction(const IRPosition &IRP) : AANoRecurseImpl(IRP) {} + + /// See AbstractAttribute::initialize(...). + void initialize(Attributor &A) override { + AANoRecurseImpl::initialize(A); + if (const Function *F = getAnchorScope()) + if (A.getInfoCache().getSccSize(*F) == 1) + return; + indicatePessimisticFixpoint(); } /// See AbstractAttribute::updateImpl(...). 
- ChangeStatus updateImpl(Attributor &A) override; + ChangeStatus updateImpl(Attributor &A) override { + + auto CheckForNoRecurse = [&](Instruction &I) { + ImmutableCallSite ICS(&I); + if (ICS.hasFnAttr(Attribute::NoRecurse)) + return true; + + const auto &NoRecurseAA = + A.getAAFor(*this, IRPosition::callsite_function(ICS)); + if (!NoRecurseAA.isAssumedNoRecurse()) + return false; + + // Recursion to the same function + if (ICS.getCalledFunction() == getAnchorScope()) + return false; + + return true; + }; + + if (!A.checkForAllCallLikeInstructions(CheckForNoRecurse, *this)) + return indicatePessimisticFixpoint(); + return ChangeStatus::UNCHANGED; + } + + void trackStatistics() const override { STATS_DECLTRACK_FN_ATTR(norecurse) } }; -ChangeStatus AANonNullReturned::updateImpl(Attributor &A) { - Function &F = getAnchorScope(); +/// NoRecurse attribute deduction for a call sites. +struct AANoRecurseCallSite final : AANoRecurseImpl { + AANoRecurseCallSite(const IRPosition &IRP) : AANoRecurseImpl(IRP) {} - auto *AARetVal = A.getAAFor(*this, F); - if (!AARetVal) { - indicatePessimisticFixpoint(); - return ChangeStatus::CHANGED; + /// See AbstractAttribute::initialize(...). + void initialize(Attributor &A) override { + AANoRecurseImpl::initialize(A); + Function *F = getAssociatedFunction(); + if (!F) + indicatePessimisticFixpoint(); } - std::function Pred = this->generatePredicate(A); - if (!AARetVal->checkForallReturnedValues(Pred)) { - indicatePessimisticFixpoint(); - return ChangeStatus::CHANGED; + /// See AbstractAttribute::updateImpl(...). + ChangeStatus updateImpl(Attributor &A) override { + // TODO: Once we have call site specific value information we can provide + // call site specific liveness information and then it makes + // sense to specialize attributes for call sites arguments instead of + // redirecting requests to the callee argument. + Function *F = getAssociatedFunction(); + const IRPosition &FnPos = IRPosition::function(*F); + auto &FnAA = A.getAAFor(*this, FnPos); + return clampStateAndIndicateChange( + getState(), + static_cast(FnAA.getState())); } - return ChangeStatus::UNCHANGED; + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { STATS_DECLTRACK_CS_ATTR(norecurse); } +}; + +/// ------------------------ Will-Return Attributes ---------------------------- + +// Helper function that checks whether a function has any cycle. +// TODO: Replace with more efficent code +static bool containsCycle(Function &F) { + SmallPtrSet Visited; + + // Traverse BB by dfs and check whether successor is already visited. + for (BasicBlock *BB : depth_first(&F)) { + Visited.insert(BB); + for (auto *SuccBB : successors(BB)) { + if (Visited.count(SuccBB)) + return true; + } + } + return false; } -/// NonNull attribute for function argument. -struct AANonNullArgument : AANonNullImpl { +// Helper function that checks the function have a loop which might become an +// endless loop +// FIXME: Any cycle is regarded as endless loop for now. +// We have to allow some patterns. +static bool containsPossiblyEndlessLoop(Function *F) { + return !F || !F->hasExactDefinition() || containsCycle(*F); +} + +struct AAWillReturnImpl : public AAWillReturn { + AAWillReturnImpl(const IRPosition &IRP) : AAWillReturn(IRP) {} + + /// See AbstractAttribute::initialize(...). 
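// ---------------------------------------------------------------------------
// containsCycle() above walks the CFG in depth-first order and reports a
// possible cycle as soon as any successor has already been visited. That is
// deliberately conservative: as the FIXME notes, every hit is treated as a
// potential endless loop, and the check can also fire on acyclic shapes such
// as the join block of an if/else diamond. The standalone sketch below
// reproduces the same visited-set walk on a small adjacency-list graph; Graph
// and mightContainCycle are illustrative names, not part of this patch.
#include <set>
#include <vector>

using Graph = std::vector<std::vector<int>>; // Graph[N] = successors of node N

static bool mightContainCycle(const Graph &G, int Entry) {
  std::set<int> Visited;
  std::vector<int> Worklist{Entry};
  while (!Worklist.empty()) {
    int N = Worklist.back();
    Worklist.pop_back();
    if (!Visited.insert(N).second)
      continue; // already processed this node
    for (int Succ : G[N]) {
      if (Visited.count(Succ))
        return true; // edge back into visited territory: assume a cycle
      Worklist.push_back(Succ);
    }
  }
  return false;
}
// ---------------------------------------------------------------------------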
+ void initialize(Attributor &A) override { + AAWillReturn::initialize(A); + + Function *F = getAssociatedFunction(); + if (containsPossiblyEndlessLoop(F)) + indicatePessimisticFixpoint(); + } + + /// See AbstractAttribute::updateImpl(...). + ChangeStatus updateImpl(Attributor &A) override { + auto CheckForWillReturn = [&](Instruction &I) { + IRPosition IPos = IRPosition::callsite_function(ImmutableCallSite(&I)); + const auto &WillReturnAA = A.getAAFor(*this, IPos); + if (WillReturnAA.isKnownWillReturn()) + return true; + if (!WillReturnAA.isAssumedWillReturn()) + return false; + const auto &NoRecurseAA = A.getAAFor(*this, IPos); + return NoRecurseAA.isAssumedNoRecurse(); + }; + + if (!A.checkForAllCallLikeInstructions(CheckForWillReturn, *this)) + return indicatePessimisticFixpoint(); + + return ChangeStatus::UNCHANGED; + } + + /// See AbstractAttribute::getAsStr() + const std::string getAsStr() const override { + return getAssumed() ? "willreturn" : "may-noreturn"; + } +}; + +struct AAWillReturnFunction final : AAWillReturnImpl { + AAWillReturnFunction(const IRPosition &IRP) : AAWillReturnImpl(IRP) {} - AANonNullArgument(Argument &A, InformationCache &InfoCache) - : AANonNullImpl(A, InfoCache) {} + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { STATS_DECLTRACK_FN_ATTR(willreturn) } +}; - /// See AbstractAttribute::getManifestPosition(). - ManifestPosition getManifestPosition() const override { return MP_ARGUMENT; } +/// WillReturn attribute deduction for a call sites. +struct AAWillReturnCallSite final : AAWillReturnImpl { + AAWillReturnCallSite(const IRPosition &IRP) : AAWillReturnImpl(IRP) {} - /// See AbstractAttriubute::initialize(...). + /// See AbstractAttribute::initialize(...). void initialize(Attributor &A) override { - Argument *Arg = cast(getAssociatedValue()); - if (Arg->hasNonNullAttr()) - indicateOptimisticFixpoint(); + AAWillReturnImpl::initialize(A); + Function *F = getAssociatedFunction(); + if (!F) + indicatePessimisticFixpoint(); } /// See AbstractAttribute::updateImpl(...). - ChangeStatus updateImpl(Attributor &A) override; + ChangeStatus updateImpl(Attributor &A) override { + // TODO: Once we have call site specific value information we can provide + // call site specific liveness information and then it makes + // sense to specialize attributes for call sites arguments instead of + // redirecting requests to the callee argument. + Function *F = getAssociatedFunction(); + const IRPosition &FnPos = IRPosition::function(*F); + auto &FnAA = A.getAAFor(*this, FnPos); + return clampStateAndIndicateChange( + getState(), + static_cast(FnAA.getState())); + } + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { STATS_DECLTRACK_CS_ATTR(willreturn); } }; -/// NonNull attribute for a call site argument. -struct AANonNullCallSiteArgument : AANonNullImpl { +/// ------------------------ NoAlias Argument Attribute ------------------------ + +struct AANoAliasImpl : AANoAlias { + AANoAliasImpl(const IRPosition &IRP) : AANoAlias(IRP) {} + + const std::string getAsStr() const override { + return getAssumed() ? "noalias" : "may-alias"; + } +}; - /// See AANonNullImpl::AANonNullImpl(...). - AANonNullCallSiteArgument(CallSite CS, unsigned ArgNo, - InformationCache &InfoCache) - : AANonNullImpl(CS.getArgOperand(ArgNo), *CS.getInstruction(), InfoCache), - ArgNo(ArgNo) {} +/// NoAlias attribute for a floating value. 
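AANoRecurseCallSite and AAWillReturnCallSite above both answer a call-site query the same way:
look up the state computed for the callee's function position, clamp the local state against it,
and report whether that changed anything so the fixpoint driver knows to rerun dependents. The
snippet below is a deliberately simplified model of that role; the real BooleanState/IntegerState
and clampStateAndIndicateChange differ in detail. The AANoAliasFloating struct introduced by the
comment just above follows immediately after this aside.

    // Simplified model of "clamp against another position and report change".
    enum class ChangeStatus { UNCHANGED, CHANGED };

    struct BoolState {
      bool Assumed = true; // optimistic start
      bool Known = false;
    };

    static ChangeStatus clampAgainst(BoolState &S, const BoolState &R) {
      BoolState Before = S;
      S.Assumed = S.Assumed && R.Assumed; // assumptions can only get weaker
      S.Known = S.Known || R.Known;       // facts can only get stronger
      return (S.Assumed == Before.Assumed && S.Known == Before.Known)
                 ? ChangeStatus::UNCHANGED
                 : ChangeStatus::CHANGED;
    }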
+struct AANoAliasFloating final : AANoAliasImpl { + AANoAliasFloating(const IRPosition &IRP) : AANoAliasImpl(IRP) {} /// See AbstractAttribute::initialize(...). void initialize(Attributor &A) override { - CallSite CS(&getAnchoredValue()); - if (isKnownNonZero(getAssociatedValue(), - getAnchorScope().getParent()->getDataLayout()) || - CS.paramHasAttr(ArgNo, getAttrKind())) + AANoAliasImpl::initialize(A); + Value &Val = getAssociatedValue(); + if (isa(Val)) + indicateOptimisticFixpoint(); + if (isa(Val) && + Val.getType()->getPointerAddressSpace() == 0) indicateOptimisticFixpoint(); } - /// See AbstractAttribute::updateImpl(Attributor &A). - ChangeStatus updateImpl(Attributor &A) override; + /// See AbstractAttribute::updateImpl(...). + ChangeStatus updateImpl(Attributor &A) override { + // TODO: Implement this. + return indicatePessimisticFixpoint(); + } - /// See AbstractAttribute::getManifestPosition(). - ManifestPosition getManifestPosition() const override { - return MP_CALL_SITE_ARGUMENT; - }; + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { + STATS_DECLTRACK_FLOATING_ATTR(noalias) + } +}; - // Return argument index of associated value. - int getArgNo() const { return ArgNo; } +/// NoAlias attribute for an argument. +struct AANoAliasArgument final + : AAArgumentFromCallSiteArguments { + AANoAliasArgument(const IRPosition &IRP) + : AAArgumentFromCallSiteArguments(IRP) {} -private: - unsigned ArgNo; + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { STATS_DECLTRACK_ARG_ATTR(noalias) } }; -ChangeStatus AANonNullArgument::updateImpl(Attributor &A) { - Function &F = getAnchorScope(); - Argument &Arg = cast(getAnchoredValue()); - unsigned ArgNo = Arg.getArgNo(); +struct AANoAliasCallSiteArgument final : AANoAliasImpl { + AANoAliasCallSiteArgument(const IRPosition &IRP) : AANoAliasImpl(IRP) {} + + /// See AbstractAttribute::initialize(...). + void initialize(Attributor &A) override { + // See callsite argument attribute and callee argument attribute. + ImmutableCallSite ICS(&getAnchorValue()); + if (ICS.paramHasAttr(getArgNo(), Attribute::NoAlias)) + indicateOptimisticFixpoint(); + } + + /// See AbstractAttribute::updateImpl(...). + ChangeStatus updateImpl(Attributor &A) override { + // We can deduce "noalias" if the following conditions hold. + // (i) Associated value is assumed to be noalias in the definition. + // (ii) Associated value is assumed to be no-capture in all the uses + // possibly executed before this callsite. + // (iii) There is no other pointer argument which could alias with the + // value. + + const Value &V = getAssociatedValue(); + const IRPosition IRP = IRPosition::value(V); + + // (i) Check whether noalias holds in the definition. + + auto &NoAliasAA = A.getAAFor(*this, IRP); + + if (!NoAliasAA.isAssumedNoAlias()) + return indicatePessimisticFixpoint(); + + LLVM_DEBUG(dbgs() << "[Attributor][AANoAliasCSArg] " << V + << " is assumed NoAlias in the definition\n"); + + // (ii) Check whether the value is captured in the scope using AANoCapture. + // FIXME: This is conservative though, it is better to look at CFG and + // check only uses possibly executed before this callsite. 
+ + auto &NoCaptureAA = A.getAAFor(*this, IRP); + if (!NoCaptureAA.isAssumedNoCaptureMaybeReturned()) { + LLVM_DEBUG( + dbgs() << "[Attributor][AANoAliasCSArg] " << V + << " cannot be noalias as it is potentially captured\n"); + return indicatePessimisticFixpoint(); + } + + // (iii) Check there is no other pointer argument which could alias with the + // value. + ImmutableCallSite ICS(&getAnchorValue()); + for (unsigned i = 0; i < ICS.getNumArgOperands(); i++) { + if (getArgNo() == (int)i) + continue; + const Value *ArgOp = ICS.getArgOperand(i); + if (!ArgOp->getType()->isPointerTy()) + continue; + + if (const Function *F = getAnchorScope()) { + if (AAResults *AAR = A.getInfoCache().getAAResultsForFunction(*F)) { + bool IsAliasing = AAR->isNoAlias(&getAssociatedValue(), ArgOp); + LLVM_DEBUG(dbgs() + << "[Attributor][NoAliasCSArg] Check alias between " + "callsite arguments " + << AAR->isNoAlias(&getAssociatedValue(), ArgOp) << " " + << getAssociatedValue() << " " << *ArgOp << " => " + << (IsAliasing ? "" : "no-") << "alias \n"); + + if (IsAliasing) + continue; + } + } + return indicatePessimisticFixpoint(); + } + + return ChangeStatus::UNCHANGED; + } + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { STATS_DECLTRACK_CSARG_ATTR(noalias) } +}; + +/// NoAlias attribute for function return value. +struct AANoAliasReturned final : AANoAliasImpl { + AANoAliasReturned(const IRPosition &IRP) : AANoAliasImpl(IRP) {} + + /// See AbstractAttribute::updateImpl(...). + virtual ChangeStatus updateImpl(Attributor &A) override { + + auto CheckReturnValue = [&](Value &RV) -> bool { + if (Constant *C = dyn_cast(&RV)) + if (C->isNullValue() || isa(C)) + return true; + + /// For now, we can only deduce noalias if we have call sites. + /// FIXME: add more support. + ImmutableCallSite ICS(&RV); + if (!ICS) + return false; + + const IRPosition &RVPos = IRPosition::value(RV); + const auto &NoAliasAA = A.getAAFor(*this, RVPos); + if (!NoAliasAA.isAssumedNoAlias()) + return false; + + const auto &NoCaptureAA = A.getAAFor(*this, RVPos); + return NoCaptureAA.isAssumedNoCaptureMaybeReturned(); + }; + + if (!A.checkForAllReturnedValues(CheckReturnValue, *this)) + return indicatePessimisticFixpoint(); + + return ChangeStatus::UNCHANGED; + } + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { STATS_DECLTRACK_FNRET_ATTR(noalias) } +}; + +/// NoAlias attribute deduction for a call site return value. +struct AANoAliasCallSiteReturned final : AANoAliasImpl { + AANoAliasCallSiteReturned(const IRPosition &IRP) : AANoAliasImpl(IRP) {} + + /// See AbstractAttribute::initialize(...). + void initialize(Attributor &A) override { + AANoAliasImpl::initialize(A); + Function *F = getAssociatedFunction(); + if (!F) + indicatePessimisticFixpoint(); + } + + /// See AbstractAttribute::updateImpl(...). + ChangeStatus updateImpl(Attributor &A) override { + // TODO: Once we have call site specific value information we can provide + // call site specific liveness information and then it makes + // sense to specialize attributes for call sites arguments instead of + // redirecting requests to the callee argument. 
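Step (iii) above walks every other pointer operand of the call and asks alias analysis whether it
may alias the candidate argument; a single may-alias answer is enough to give up on noalias.
Purely as an illustration, the sketch below replaces the AAResults query with byte-range overlap
on hypothetical (begin, size) pairs; this is not how the pass models arguments. The body of
AANoAliasCallSiteReturned::updateImpl resumes right below.

    #include <cstdint>
    #include <vector>

    // Stand-in for an alias query: arguments modelled as [Begin, Begin+Size)
    // byte ranges that "may alias" iff the ranges overlap.
    struct ArgRange { uint64_t Begin, Size; };

    static bool mayAlias(const ArgRange &A, const ArgRange &B) {
      return A.Begin < B.Begin + B.Size && B.Begin < A.Begin + A.Size;
    }

    static bool noOtherArgumentAliases(const std::vector<ArgRange> &Args,
                                       size_t CandidateIdx) {
      for (size_t I = 0; I < Args.size(); ++I)
        if (I != CandidateIdx && mayAlias(Args[I], Args[CandidateIdx]))
          return false; // Another argument might alias: give up.
      return true;
    }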
+ Function *F = getAssociatedFunction(); + const IRPosition &FnPos = IRPosition::returned(*F); + auto &FnAA = A.getAAFor(*this, FnPos); + return clampStateAndIndicateChange( + getState(), static_cast(FnAA.getState())); + } + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { STATS_DECLTRACK_CSRET_ATTR(noalias); } +}; + +/// -------------------AAIsDead Function Attribute----------------------- + +struct AAIsDeadImpl : public AAIsDead { + AAIsDeadImpl(const IRPosition &IRP) : AAIsDead(IRP) {} + + void initialize(Attributor &A) override { + const Function *F = getAssociatedFunction(); + if (F && !F->isDeclaration()) + exploreFromEntry(A, F); + } + + void exploreFromEntry(Attributor &A, const Function *F) { + ToBeExploredPaths.insert(&(F->getEntryBlock().front())); + + for (size_t i = 0; i < ToBeExploredPaths.size(); ++i) + if (const Instruction *NextNoReturnI = + findNextNoReturn(A, ToBeExploredPaths[i])) + NoReturnCalls.insert(NextNoReturnI); + + // Mark the block live after we looked for no-return instructions. + assumeLive(A, F->getEntryBlock()); + } + + /// Find the next assumed noreturn instruction in the block of \p I starting + /// from, thus including, \p I. + /// + /// The caller is responsible to monitor the ToBeExploredPaths set as new + /// instructions discovered in other basic block will be placed in there. + /// + /// \returns The next assumed noreturn instructions in the block of \p I + /// starting from, thus including, \p I. + const Instruction *findNextNoReturn(Attributor &A, const Instruction *I); + + /// See AbstractAttribute::getAsStr(). + const std::string getAsStr() const override { + return "Live[#BB " + std::to_string(AssumedLiveBlocks.size()) + "/" + + std::to_string(getAssociatedFunction()->size()) + "][#NRI " + + std::to_string(NoReturnCalls.size()) + "]"; + } + + /// See AbstractAttribute::manifest(...). + ChangeStatus manifest(Attributor &A) override { + assert(getState().isValidState() && + "Attempted to manifest an invalid state!"); + + ChangeStatus HasChanged = ChangeStatus::UNCHANGED; + Function &F = *getAssociatedFunction(); + + if (AssumedLiveBlocks.empty()) { + A.deleteAfterManifest(F); + return ChangeStatus::CHANGED; + } + + // Flag to determine if we can change an invoke to a call assuming the + // callee is nounwind. This is not possible if the personality of the + // function allows to catch asynchronous exceptions. + bool Invoke2CallAllowed = !mayCatchAsynchronousExceptions(F); + + for (const Instruction *NRC : NoReturnCalls) { + Instruction *I = const_cast(NRC); + BasicBlock *BB = I->getParent(); + Instruction *SplitPos = I->getNextNode(); + // TODO: mark stuff before unreachable instructions as dead. + + if (auto *II = dyn_cast(I)) { + // If we keep the invoke the split position is at the beginning of the + // normal desitination block (it invokes a noreturn function after all). + BasicBlock *NormalDestBB = II->getNormalDest(); + SplitPos = &NormalDestBB->front(); + + /// Invoke is replaced with a call and unreachable is placed after it if + /// the callee is nounwind and noreturn. Otherwise, we keep the invoke + /// and only place an unreachable in the normal successor. 
+ if (Invoke2CallAllowed) { + if (II->getCalledFunction()) { + const IRPosition &IPos = IRPosition::callsite_function(*II); + const auto &AANoUnw = A.getAAFor(*this, IPos); + if (AANoUnw.isAssumedNoUnwind()) { + LLVM_DEBUG(dbgs() + << "[AAIsDead] Replace invoke with call inst\n"); + // We do not need an invoke (II) but instead want a call followed + // by an unreachable. However, we do not remove II as other + // abstract attributes might have it cached as part of their + // results. Given that we modify the CFG anyway, we simply keep II + // around but in a new dead block. To avoid II being live through + // a different edge we have to ensure the block we place it in is + // only reached from the current block of II and then not reached + // at all when we insert the unreachable. + SplitBlockPredecessors(NormalDestBB, {BB}, ".i2c"); + CallInst *CI = createCallMatchingInvoke(II); + CI->insertBefore(II); + CI->takeName(II); + II->replaceAllUsesWith(CI); + SplitPos = CI->getNextNode(); + } + } + } + + if (SplitPos == &NormalDestBB->front()) { + // If this is an invoke of a noreturn function the edge to the normal + // destination block is dead but not necessarily the block itself. + // TODO: We need to move to an edge based system during deduction and + // also manifest. + assert(!NormalDestBB->isLandingPad() && + "Expected the normal destination not to be a landingpad!"); + if (NormalDestBB->getUniquePredecessor() == BB) { + assumeLive(A, *NormalDestBB); + } else { + BasicBlock *SplitBB = + SplitBlockPredecessors(NormalDestBB, {BB}, ".dead"); + // The split block is live even if it contains only an unreachable + // instruction at the end. + assumeLive(A, *SplitBB); + SplitPos = SplitBB->getTerminator(); + HasChanged = ChangeStatus::CHANGED; + } + } + } + + if (isa_and_nonnull(SplitPos)) + continue; + + BB = SplitPos->getParent(); + SplitBlock(BB, SplitPos); + changeToUnreachable(BB->getTerminator(), /* UseLLVMTrap */ false); + HasChanged = ChangeStatus::CHANGED; + } + + for (BasicBlock &BB : F) + if (!AssumedLiveBlocks.count(&BB)) + A.deleteAfterManifest(BB); + + return HasChanged; + } + + /// See AbstractAttribute::updateImpl(...). + ChangeStatus updateImpl(Attributor &A) override; + + /// See AAIsDead::isAssumedDead(BasicBlock *). + bool isAssumedDead(const BasicBlock *BB) const override { + assert(BB->getParent() == getAssociatedFunction() && + "BB must be in the same anchor scope function."); + + if (!getAssumed()) + return false; + return !AssumedLiveBlocks.count(BB); + } + + /// See AAIsDead::isKnownDead(BasicBlock *). + bool isKnownDead(const BasicBlock *BB) const override { + return getKnown() && isAssumedDead(BB); + } + + /// See AAIsDead::isAssumed(Instruction *I). + bool isAssumedDead(const Instruction *I) const override { + assert(I->getParent()->getParent() == getAssociatedFunction() && + "Instruction must be in the same anchor scope function."); + + if (!getAssumed()) + return false; + + // If it is not in AssumedLiveBlocks then it for sure dead. + // Otherwise, it can still be after noreturn call in a live block. + if (!AssumedLiveBlocks.count(I->getParent())) + return true; + + // If it is not after a noreturn call, than it is live. + return isAfterNoReturn(I); + } + + /// See AAIsDead::isKnownDead(Instruction *I). + bool isKnownDead(const Instruction *I) const override { + return getKnown() && isAssumedDead(I); + } + + /// Check if instruction is after noreturn call, in other words, assumed dead. 
+ bool isAfterNoReturn(const Instruction *I) const; + + /// Determine if \p F might catch asynchronous exceptions. + static bool mayCatchAsynchronousExceptions(const Function &F) { + return F.hasPersonalityFn() && !canSimplifyInvokeNoUnwind(&F); + } + + /// Assume \p BB is (partially) live now and indicate to the Attributor \p A + /// that internal function called from \p BB should now be looked at. + void assumeLive(Attributor &A, const BasicBlock &BB) { + if (!AssumedLiveBlocks.insert(&BB).second) + return; + + // We assume that all of BB is (probably) live now and if there are calls to + // internal functions we will assume that those are now live as well. This + // is a performance optimization for blocks with calls to a lot of internal + // functions. It can however cause dead functions to be treated as live. + for (const Instruction &I : BB) + if (ImmutableCallSite ICS = ImmutableCallSite(&I)) + if (const Function *F = ICS.getCalledFunction()) + if (F->hasLocalLinkage()) + A.markLiveInternalFunction(*F); + } + + /// Collection of to be explored paths. + SmallSetVector ToBeExploredPaths; + + /// Collection of all assumed live BasicBlocks. + DenseSet AssumedLiveBlocks; + + /// Collection of calls with noreturn attribute, assumed or knwon. + SmallSetVector NoReturnCalls; +}; + +struct AAIsDeadFunction final : public AAIsDeadImpl { + AAIsDeadFunction(const IRPosition &IRP) : AAIsDeadImpl(IRP) {} + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { + STATS_DECL(PartiallyDeadBlocks, Function, + "Number of basic blocks classified as partially dead"); + BUILD_STAT_NAME(PartiallyDeadBlocks, Function) += NoReturnCalls.size(); + } +}; + +bool AAIsDeadImpl::isAfterNoReturn(const Instruction *I) const { + const Instruction *PrevI = I->getPrevNode(); + while (PrevI) { + if (NoReturnCalls.count(PrevI)) + return true; + PrevI = PrevI->getPrevNode(); + } + return false; +} + +const Instruction *AAIsDeadImpl::findNextNoReturn(Attributor &A, + const Instruction *I) { + const BasicBlock *BB = I->getParent(); + const Function &F = *BB->getParent(); + + // Flag to determine if we can change an invoke to a call assuming the callee + // is nounwind. This is not possible if the personality of the function allows + // to catch asynchronous exceptions. + bool Invoke2CallAllowed = !mayCatchAsynchronousExceptions(F); + + // TODO: We should have a function that determines if an "edge" is dead. + // Edges could be from an instruction to the next or from a terminator + // to the successor. For now, we need to special case the unwind block + // of InvokeInst below. + + while (I) { + ImmutableCallSite ICS(I); + + if (ICS) { + const IRPosition &IPos = IRPosition::callsite_function(ICS); + // Regarless of the no-return property of an invoke instruction we only + // learn that the regular successor is not reachable through this + // instruction but the unwind block might still be. + if (auto *Invoke = dyn_cast(I)) { + // Use nounwind to justify the unwind block is dead as well. + const auto &AANoUnw = A.getAAFor(*this, IPos); + if (!Invoke2CallAllowed || !AANoUnw.isAssumedNoUnwind()) { + assumeLive(A, *Invoke->getUnwindDest()); + ToBeExploredPaths.insert(&Invoke->getUnwindDest()->front()); + } + } + + const auto &NoReturnAA = A.getAAFor(*this, IPos); + if (NoReturnAA.isAssumedNoReturn()) + return I; + } + + I = I->getNextNode(); + } + + // get new paths (reachable blocks). 
+ for (const BasicBlock *SuccBB : successors(BB)) { + assumeLive(A, *SuccBB); + ToBeExploredPaths.insert(&SuccBB->front()); + } + + // No noreturn instruction found. + return nullptr; +} + +ChangeStatus AAIsDeadImpl::updateImpl(Attributor &A) { + ChangeStatus Status = ChangeStatus::UNCHANGED; + + // Temporary collection to iterate over existing noreturn instructions. This + // will alow easier modification of NoReturnCalls collection + SmallVector NoReturnChanged; + + for (const Instruction *I : NoReturnCalls) + NoReturnChanged.push_back(I); + + for (const Instruction *I : NoReturnChanged) { + size_t Size = ToBeExploredPaths.size(); + + const Instruction *NextNoReturnI = findNextNoReturn(A, I); + if (NextNoReturnI != I) { + Status = ChangeStatus::CHANGED; + NoReturnCalls.remove(I); + if (NextNoReturnI) + NoReturnCalls.insert(NextNoReturnI); + } + + // Explore new paths. + while (Size != ToBeExploredPaths.size()) { + Status = ChangeStatus::CHANGED; + if (const Instruction *NextNoReturnI = + findNextNoReturn(A, ToBeExploredPaths[Size++])) + NoReturnCalls.insert(NextNoReturnI); + } + } + + LLVM_DEBUG(dbgs() << "[AAIsDead] AssumedLiveBlocks: " + << AssumedLiveBlocks.size() << " Total number of blocks: " + << getAssociatedFunction()->size() << "\n"); + + // If we know everything is live there is no need to query for liveness. + if (NoReturnCalls.empty() && + getAssociatedFunction()->size() == AssumedLiveBlocks.size()) { + // Indicating a pessimistic fixpoint will cause the state to be "invalid" + // which will cause the Attributor to not return the AAIsDead on request, + // which will prevent us from querying isAssumedDead(). + indicatePessimisticFixpoint(); + assert(!isValidState() && "Expected an invalid state!"); + Status = ChangeStatus::CHANGED; + } + + return Status; +} + +/// Liveness information for a call sites. +struct AAIsDeadCallSite final : AAIsDeadImpl { + AAIsDeadCallSite(const IRPosition &IRP) : AAIsDeadImpl(IRP) {} + + /// See AbstractAttribute::initialize(...). + void initialize(Attributor &A) override { + // TODO: Once we have call site specific value information we can provide + // call site specific liveness information and then it makes + // sense to specialize attributes for call sites instead of + // redirecting requests to the callee. + llvm_unreachable("Abstract attributes for liveness are not " + "supported for call sites yet!"); + } + + /// See AbstractAttribute::updateImpl(...). 
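AAIsDeadImpl::updateImpl above keeps re-running findNextNoReturn and exploring any newly reachable
program points, so the set of assumed-live blocks only ever grows across iterations. Below is a
standalone sketch of that core reachability idea, with integer block ids in place of BasicBlocks
and blocks assumed to end in a noreturn call acting as barriers; the names are hypothetical, not
the pass's data structures. AAIsDeadCallSite::updateImpl continues right after this aside.

    #include <set>
    #include <vector>

    // A block is assumed live if it is reachable from the entry without
    // passing beyond an assumed-noreturn call. Retracting such an assumption
    // and rerunning this can only grow the live set, so a fixpoint exists.
    static std::set<int>
    computeAssumedLive(const std::vector<std::vector<int>> &Succs, int Entry,
                       const std::set<int> &EndsInAssumedNoReturn) {
      std::set<int> Live;
      std::vector<int> Worklist{Entry};
      while (!Worklist.empty()) {
        int BB = Worklist.back();
        Worklist.pop_back();
        if (!Live.insert(BB).second)
          continue;
        if (EndsInAssumedNoReturn.count(BB))
          continue; // Successors are not reached through this block.
        for (int S : Succs[BB])
          Worklist.push_back(S);
      }
      return Live;
    }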
+ ChangeStatus updateImpl(Attributor &A) override { + return indicatePessimisticFixpoint(); + } + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override {} +}; + +/// -------------------- Dereferenceable Argument Attribute -------------------- + +template <> +ChangeStatus clampStateAndIndicateChange(DerefState &S, + const DerefState &R) { + ChangeStatus CS0 = clampStateAndIndicateChange( + S.DerefBytesState, R.DerefBytesState); + ChangeStatus CS1 = + clampStateAndIndicateChange(S.GlobalState, R.GlobalState); + return CS0 | CS1; +} + +struct AADereferenceableImpl : AADereferenceable { + AADereferenceableImpl(const IRPosition &IRP) : AADereferenceable(IRP) {} + using StateType = DerefState; + + void initialize(Attributor &A) override { + SmallVector Attrs; + getAttrs({Attribute::Dereferenceable, Attribute::DereferenceableOrNull}, + Attrs); + for (const Attribute &Attr : Attrs) + takeKnownDerefBytesMaximum(Attr.getValueAsInt()); + + NonNullAA = &A.getAAFor(*this, getIRPosition()); + + const IRPosition &IRP = this->getIRPosition(); + bool IsFnInterface = IRP.isFnInterfaceKind(); + const Function *FnScope = IRP.getAnchorScope(); + if (IsFnInterface && (!FnScope || !FnScope->hasExactDefinition())) + indicatePessimisticFixpoint(); + } + + /// See AbstractAttribute::getState() + /// { + StateType &getState() override { return *this; } + const StateType &getState() const override { return *this; } + /// } + + /// See AAFromMustBeExecutedContext + bool followUse(Attributor &A, const Use *U, const Instruction *I) { + bool IsNonNull = false; + bool TrackUse = false; + int64_t DerefBytes = getKnownNonNullAndDerefBytesForUse( + A, *this, getAssociatedValue(), U, I, IsNonNull, TrackUse); + takeKnownDerefBytesMaximum(DerefBytes); + return TrackUse; + } + + void getDeducedAttributes(LLVMContext &Ctx, + SmallVectorImpl &Attrs) const override { + // TODO: Add *_globally support + if (isAssumedNonNull()) + Attrs.emplace_back(Attribute::getWithDereferenceableBytes( + Ctx, getAssumedDereferenceableBytes())); + else + Attrs.emplace_back(Attribute::getWithDereferenceableOrNullBytes( + Ctx, getAssumedDereferenceableBytes())); + } + + /// See AbstractAttribute::getAsStr(). + const std::string getAsStr() const override { + if (!getAssumedDereferenceableBytes()) + return "unknown-dereferenceable"; + return std::string("dereferenceable") + + (isAssumedNonNull() ? "" : "_or_null") + + (isAssumedGlobal() ? "_globally" : "") + "<" + + std::to_string(getKnownDereferenceableBytes()) + "-" + + std::to_string(getAssumedDereferenceableBytes()) + ">"; + } +}; + +/// Dereferenceable attribute for a floating value. +struct AADereferenceableFloating + : AAFromMustBeExecutedContext { + using Base = + AAFromMustBeExecutedContext; + AADereferenceableFloating(const IRPosition &IRP) : Base(IRP) {} + + /// See AbstractAttribute::updateImpl(...). + ChangeStatus updateImpl(Attributor &A) override { + ChangeStatus Change = Base::updateImpl(A); + + const DataLayout &DL = A.getDataLayout(); + + auto VisitValueCB = [&](Value &V, DerefState &T, bool Stripped) -> bool { + unsigned IdxWidth = + DL.getIndexSizeInBits(V.getType()->getPointerAddressSpace()); + APInt Offset(IdxWidth, 0); + const Value *Base = + V.stripAndAccumulateInBoundsConstantOffsets(DL, Offset); + + const auto &AA = + A.getAAFor(*this, IRPosition::value(*Base)); + int64_t DerefBytes = 0; + if (!Stripped && this == &AA) { + // Use IR information if we did not strip anything. + // TODO: track globally. 
+ bool CanBeNull; + DerefBytes = Base->getPointerDereferenceableBytes(DL, CanBeNull); + T.GlobalState.indicatePessimisticFixpoint(); + } else { + const DerefState &DS = static_cast(AA.getState()); + DerefBytes = DS.DerefBytesState.getAssumed(); + T.GlobalState &= DS.GlobalState; + } + + // For now we do not try to "increase" dereferenceability due to negative + // indices as we first have to come up with code to deal with loops and + // for overflows of the dereferenceable bytes. + int64_t OffsetSExt = Offset.getSExtValue(); + if (OffsetSExt < 0) + OffsetSExt = 0; + + T.takeAssumedDerefBytesMinimum( + std::max(int64_t(0), DerefBytes - OffsetSExt)); + + if (this == &AA) { + if (!Stripped) { + // If nothing was stripped IR information is all we got. + T.takeKnownDerefBytesMaximum( + std::max(int64_t(0), DerefBytes - OffsetSExt)); + T.indicatePessimisticFixpoint(); + } else if (OffsetSExt > 0) { + // If something was stripped but there is circular reasoning we look + // for the offset. If it is positive we basically decrease the + // dereferenceable bytes in a circluar loop now, which will simply + // drive them down to the known value in a very slow way which we + // can accelerate. + T.indicatePessimisticFixpoint(); + } + } + + return T.isValidState(); + }; + + DerefState T; + if (!genericValueTraversal( + A, getIRPosition(), *this, T, VisitValueCB)) + return indicatePessimisticFixpoint(); + + return Change | clampStateAndIndicateChange(getState(), T); + } + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { + STATS_DECLTRACK_FLOATING_ATTR(dereferenceable) + } +}; + +/// Dereferenceable attribute for a return value. +struct AADereferenceableReturned final + : AAReturnedFromReturnedValues { + AADereferenceableReturned(const IRPosition &IRP) + : AAReturnedFromReturnedValues(IRP) {} + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { + STATS_DECLTRACK_FNRET_ATTR(dereferenceable) + } +}; + +/// Dereferenceable attribute for an argument +struct AADereferenceableArgument final + : AAArgumentFromCallSiteArgumentsAndMustBeExecutedContext< + AADereferenceable, AADereferenceableImpl, DerefState> { + using Base = AAArgumentFromCallSiteArgumentsAndMustBeExecutedContext< + AADereferenceable, AADereferenceableImpl, DerefState>; + AADereferenceableArgument(const IRPosition &IRP) : Base(IRP) {} + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { + STATS_DECLTRACK_ARG_ATTR(dereferenceable) + } +}; + +/// Dereferenceable attribute for a call site argument. +struct AADereferenceableCallSiteArgument final : AADereferenceableFloating { + AADereferenceableCallSiteArgument(const IRPosition &IRP) + : AADereferenceableFloating(IRP) {} + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { + STATS_DECLTRACK_CSARG_ATTR(dereferenceable) + } +}; + +/// Dereferenceable attribute deduction for a call site return value. +struct AADereferenceableCallSiteReturned final + : AACallSiteReturnedFromReturnedAndMustBeExecutedContext< + AADereferenceable, AADereferenceableImpl> { + using Base = AACallSiteReturnedFromReturnedAndMustBeExecutedContext< + AADereferenceable, AADereferenceableImpl>; + AADereferenceableCallSiteReturned(const IRPosition &IRP) : Base(IRP) {} + + /// See AbstractAttribute::initialize(...). 
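The floating dereferenceable update above strips in-bounds constant offsets and then adjusts the
byte count: with a base known to be dereferenceable for N bytes and a non-negative accumulated
offset k, the derived pointer is dereferenceable for max(0, N - k) bytes, and negative offsets are
clamped rather than credited. A tiny standalone helper spelling out that arithmetic follows; the
AADereferenceableCallSiteReturned::initialize announced above continues right after it.

    #include <algorithm>
    #include <cstdint>

    // If %base is dereferenceable(N) and %p is an inbounds gep of %base by a
    // constant k, then %p is dereferenceable(max(0, N - k)); negative k is
    // clamped to 0, mirroring the conservative treatment in the code above.
    static int64_t derefBytesThroughOffset(int64_t BaseDerefBytes, int64_t Offset) {
      if (Offset < 0)
        Offset = 0;
      return std::max<int64_t>(0, BaseDerefBytes - Offset);
    }
    // Example: derefBytesThroughOffset(16, 4) == 12; derefBytesThroughOffset(16, 32) == 0.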
+ void initialize(Attributor &A) override { + Base::initialize(A); + Function *F = getAssociatedFunction(); + if (!F) + indicatePessimisticFixpoint(); + } + + /// See AbstractAttribute::updateImpl(...). + ChangeStatus updateImpl(Attributor &A) override { + // TODO: Once we have call site specific value information we can provide + // call site specific liveness information and then it makes + // sense to specialize attributes for call sites arguments instead of + // redirecting requests to the callee argument. + + ChangeStatus Change = Base::updateImpl(A); + Function *F = getAssociatedFunction(); + const IRPosition &FnPos = IRPosition::returned(*F); + auto &FnAA = A.getAAFor(*this, FnPos); + return Change | + clampStateAndIndicateChange( + getState(), static_cast(FnAA.getState())); + } + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { + STATS_DECLTRACK_CS_ATTR(dereferenceable); + } +}; + +// ------------------------ Align Argument Attribute ------------------------ + +struct AAAlignImpl : AAAlign { + AAAlignImpl(const IRPosition &IRP) : AAAlign(IRP) {} + + // Max alignemnt value allowed in IR + static const unsigned MAX_ALIGN = 1U << 29; + + /// See AbstractAttribute::initialize(...). + void initialize(Attributor &A) override { + takeAssumedMinimum(MAX_ALIGN); + + SmallVector Attrs; + getAttrs({Attribute::Alignment}, Attrs); + for (const Attribute &Attr : Attrs) + takeKnownMaximum(Attr.getValueAsInt()); + + if (getIRPosition().isFnInterfaceKind() && + (!getAssociatedFunction() || + !getAssociatedFunction()->hasExactDefinition())) + indicatePessimisticFixpoint(); + } + + /// See AbstractAttribute::manifest(...). + ChangeStatus manifest(Attributor &A) override { + ChangeStatus Changed = ChangeStatus::UNCHANGED; + + // Check for users that allow alignment annotations. + Value &AnchorVal = getIRPosition().getAnchorValue(); + for (const Use &U : AnchorVal.uses()) { + if (auto *SI = dyn_cast(U.getUser())) { + if (SI->getPointerOperand() == &AnchorVal) + if (SI->getAlignment() < getAssumedAlign()) { + STATS_DECLTRACK(AAAlign, Store, + "Number of times alignemnt added to a store"); + SI->setAlignment(Align(getAssumedAlign())); + Changed = ChangeStatus::CHANGED; + } + } else if (auto *LI = dyn_cast(U.getUser())) { + if (LI->getPointerOperand() == &AnchorVal) + if (LI->getAlignment() < getAssumedAlign()) { + LI->setAlignment(Align(getAssumedAlign())); + STATS_DECLTRACK(AAAlign, Load, + "Number of times alignemnt added to a load"); + Changed = ChangeStatus::CHANGED; + } + } + } + + return AAAlign::manifest(A) | Changed; + } + + // TODO: Provide a helper to determine the implied ABI alignment and check in + // the existing manifest method and a new one for AAAlignImpl that value + // to avoid making the alignment explicit if it did not improve. + + /// See AbstractAttribute::getDeducedAttributes + virtual void + getDeducedAttributes(LLVMContext &Ctx, + SmallVectorImpl &Attrs) const override { + if (getAssumedAlign() > 1) + Attrs.emplace_back( + Attribute::getWithAlignment(Ctx, Align(getAssumedAlign()))); + } + + /// See AbstractAttribute::getAsStr(). + const std::string getAsStr() const override { + return getAssumedAlign() ? ("align<" + std::to_string(getKnownAlign()) + + "-" + std::to_string(getAssumedAlign()) + ">") + : "unknown-align"; + } +}; + +/// Align attribute for a floating value. +struct AAAlignFloating : AAAlignImpl { + AAAlignFloating(const IRPosition &IRP) : AAAlignImpl(IRP) {} + + /// See AbstractAttribute::updateImpl(...). 
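AAAlignImpl::initialize above seeds the state with takeAssumedMinimum(MAX_ALIGN) and raises the
known side from existing align attributes; during iteration the assumed alignment can only shrink
toward what is known. The struct below is a simplified standalone model of that two-sided state,
not the real IntegerState used by the pass. AAAlignFloating::updateImpl continues right after this
aside.

    #include <algorithm>
    #include <cstdint>

    // Simplified model: the assumed alignment starts at the optimistic maximum
    // and may only shrink (never below the known value); the known alignment
    // starts at 1 and may only grow; the state is at a fixpoint when they meet.
    struct AlignState {
      static constexpr uint64_t MaxAlign = 1ULL << 29; // IR's maximum alignment
      uint64_t Assumed = MaxAlign;
      uint64_t Known = 1;

      void takeAssumedMinimum(uint64_t V) {
        Assumed = std::max(std::min(Assumed, V), Known);
      }
      void takeKnownMaximum(uint64_t V) {
        Known = std::max(Known, V);
        Assumed = std::max(Assumed, Known);
      }
      bool isAtFixpoint() const { return Assumed == Known; }
    };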
+ ChangeStatus updateImpl(Attributor &A) override { + const DataLayout &DL = A.getDataLayout(); + + auto VisitValueCB = [&](Value &V, AAAlign::StateType &T, + bool Stripped) -> bool { + const auto &AA = A.getAAFor(*this, IRPosition::value(V)); + if (!Stripped && this == &AA) { + // Use only IR information if we did not strip anything. + const MaybeAlign PA = V.getPointerAlignment(DL); + T.takeKnownMaximum(PA ? PA->value() : 0); + T.indicatePessimisticFixpoint(); + } else { + // Use abstract attribute information. + const AAAlign::StateType &DS = + static_cast(AA.getState()); + T ^= DS; + } + return T.isValidState(); + }; + + StateType T; + if (!genericValueTraversal(A, getIRPosition(), *this, T, + VisitValueCB)) + return indicatePessimisticFixpoint(); + + // TODO: If we know we visited all incoming values, thus no are assumed + // dead, we can take the known information from the state T. + return clampStateAndIndicateChange(getState(), T); + } + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { STATS_DECLTRACK_FLOATING_ATTR(align) } +}; + +/// Align attribute for function return value. +struct AAAlignReturned final + : AAReturnedFromReturnedValues { + AAAlignReturned(const IRPosition &IRP) + : AAReturnedFromReturnedValues(IRP) {} + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { STATS_DECLTRACK_FNRET_ATTR(aligned) } +}; + +/// Align attribute for function argument. +struct AAAlignArgument final + : AAArgumentFromCallSiteArguments { + AAAlignArgument(const IRPosition &IRP) + : AAArgumentFromCallSiteArguments(IRP) {} + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { STATS_DECLTRACK_ARG_ATTR(aligned) } +}; + +struct AAAlignCallSiteArgument final : AAAlignFloating { + AAAlignCallSiteArgument(const IRPosition &IRP) : AAAlignFloating(IRP) {} + + /// See AbstractAttribute::manifest(...). + ChangeStatus manifest(Attributor &A) override { + return AAAlignImpl::manifest(A); + } + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { STATS_DECLTRACK_CSARG_ATTR(aligned) } +}; + +/// Align attribute deduction for a call site return value. +struct AAAlignCallSiteReturned final : AAAlignImpl { + AAAlignCallSiteReturned(const IRPosition &IRP) : AAAlignImpl(IRP) {} + + /// See AbstractAttribute::initialize(...). + void initialize(Attributor &A) override { + AAAlignImpl::initialize(A); + Function *F = getAssociatedFunction(); + if (!F) + indicatePessimisticFixpoint(); + } + + /// See AbstractAttribute::updateImpl(...). + ChangeStatus updateImpl(Attributor &A) override { + // TODO: Once we have call site specific value information we can provide + // call site specific liveness information and then it makes + // sense to specialize attributes for call sites arguments instead of + // redirecting requests to the callee argument. + Function *F = getAssociatedFunction(); + const IRPosition &FnPos = IRPosition::returned(*F); + auto &FnAA = A.getAAFor(*this, FnPos); + return clampStateAndIndicateChange( + getState(), static_cast(FnAA.getState())); + } + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { STATS_DECLTRACK_CS_ATTR(align); } +}; + +/// ------------------ Function No-Return Attribute ---------------------------- +struct AANoReturnImpl : public AANoReturn { + AANoReturnImpl(const IRPosition &IRP) : AANoReturn(IRP) {} + + /// See AbstractAttribute::initialize(...). 
+ void initialize(Attributor &A) override { + AANoReturn::initialize(A); + Function *F = getAssociatedFunction(); + if (!F || F->hasFnAttribute(Attribute::WillReturn)) + indicatePessimisticFixpoint(); + } + + /// See AbstractAttribute::getAsStr(). + const std::string getAsStr() const override { + return getAssumed() ? "noreturn" : "may-return"; + } + + /// See AbstractAttribute::updateImpl(Attributor &A). + virtual ChangeStatus updateImpl(Attributor &A) override { + const auto &WillReturnAA = A.getAAFor(*this, getIRPosition()); + if (WillReturnAA.isKnownWillReturn()) + return indicatePessimisticFixpoint(); + auto CheckForNoReturn = [](Instruction &) { return false; }; + if (!A.checkForAllInstructions(CheckForNoReturn, *this, + {(unsigned)Instruction::Ret})) + return indicatePessimisticFixpoint(); + return ChangeStatus::UNCHANGED; + } +}; + +struct AANoReturnFunction final : AANoReturnImpl { + AANoReturnFunction(const IRPosition &IRP) : AANoReturnImpl(IRP) {} + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { STATS_DECLTRACK_FN_ATTR(noreturn) } +}; + +/// NoReturn attribute deduction for a call sites. +struct AANoReturnCallSite final : AANoReturnImpl { + AANoReturnCallSite(const IRPosition &IRP) : AANoReturnImpl(IRP) {} + + /// See AbstractAttribute::updateImpl(...). + ChangeStatus updateImpl(Attributor &A) override { + // TODO: Once we have call site specific value information we can provide + // call site specific liveness information and then it makes + // sense to specialize attributes for call sites arguments instead of + // redirecting requests to the callee argument. + Function *F = getAssociatedFunction(); + const IRPosition &FnPos = IRPosition::function(*F); + auto &FnAA = A.getAAFor(*this, FnPos); + return clampStateAndIndicateChange( + getState(), + static_cast(FnAA.getState())); + } + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { STATS_DECLTRACK_CS_ATTR(noreturn); } +}; + +/// ----------------------- Variable Capturing --------------------------------- + +/// A class to hold the state of for no-capture attributes. +struct AANoCaptureImpl : public AANoCapture { + AANoCaptureImpl(const IRPosition &IRP) : AANoCapture(IRP) {} + + /// See AbstractAttribute::initialize(...). + void initialize(Attributor &A) override { + AANoCapture::initialize(A); + + // You cannot "capture" null in the default address space. + if (isa(getAssociatedValue()) && + getAssociatedValue().getType()->getPointerAddressSpace() == 0) { + indicateOptimisticFixpoint(); + return; + } + + const IRPosition &IRP = getIRPosition(); + const Function *F = + getArgNo() >= 0 ? IRP.getAssociatedFunction() : IRP.getAnchorScope(); + + // Check what state the associated function can actually capture. + if (F) + determineFunctionCaptureCapabilities(IRP, *F, *this); + else + indicatePessimisticFixpoint(); + } + + /// See AbstractAttribute::updateImpl(...). + ChangeStatus updateImpl(Attributor &A) override; + + /// see AbstractAttribute::isAssumedNoCaptureMaybeReturned(...). 
+ virtual void + getDeducedAttributes(LLVMContext &Ctx, + SmallVectorImpl &Attrs) const override { + if (!isAssumedNoCaptureMaybeReturned()) + return; + + if (getArgNo() >= 0) { + if (isAssumedNoCapture()) + Attrs.emplace_back(Attribute::get(Ctx, Attribute::NoCapture)); + else if (ManifestInternal) + Attrs.emplace_back(Attribute::get(Ctx, "no-capture-maybe-returned")); + } + } + + /// Set the NOT_CAPTURED_IN_MEM and NOT_CAPTURED_IN_RET bits in \p Known + /// depending on the ability of the function associated with \p IRP to capture + /// state in memory and through "returning/throwing", respectively. + static void determineFunctionCaptureCapabilities(const IRPosition &IRP, + const Function &F, + IntegerState &State) { + // TODO: Once we have memory behavior attributes we should use them here. + + // If we know we cannot communicate or write to memory, we do not care about + // ptr2int anymore. + if (F.onlyReadsMemory() && F.doesNotThrow() && + F.getReturnType()->isVoidTy()) { + State.addKnownBits(NO_CAPTURE); + return; + } + + // A function cannot capture state in memory if it only reads memory, it can + // however return/throw state and the state might be influenced by the + // pointer value, e.g., loading from a returned pointer might reveal a bit. + if (F.onlyReadsMemory()) + State.addKnownBits(NOT_CAPTURED_IN_MEM); + + // A function cannot communicate state back if it does not through + // exceptions and doesn not return values. + if (F.doesNotThrow() && F.getReturnType()->isVoidTy()) + State.addKnownBits(NOT_CAPTURED_IN_RET); + + // Check existing "returned" attributes. + int ArgNo = IRP.getArgNo(); + if (F.doesNotThrow() && ArgNo >= 0) { + for (unsigned u = 0, e = F.arg_size(); u< e; ++u) + if (F.hasParamAttribute(u, Attribute::Returned)) { + if (u == unsigned(ArgNo)) + State.removeAssumedBits(NOT_CAPTURED_IN_RET); + else if (F.onlyReadsMemory()) + State.addKnownBits(NO_CAPTURE); + else + State.addKnownBits(NOT_CAPTURED_IN_RET); + break; + } + } + } + + /// See AbstractState::getAsStr(). + const std::string getAsStr() const override { + if (isKnownNoCapture()) + return "known not-captured"; + if (isAssumedNoCapture()) + return "assumed not-captured"; + if (isKnownNoCaptureMaybeReturned()) + return "known not-captured-maybe-returned"; + if (isAssumedNoCaptureMaybeReturned()) + return "assumed not-captured-maybe-returned"; + return "assumed-captured"; + } +}; + +/// Attributor-aware capture tracker. +struct AACaptureUseTracker final : public CaptureTracker { + + /// Create a capture tracker that can lookup in-flight abstract attributes + /// through the Attributor \p A. + /// + /// If a use leads to a potential capture, \p CapturedInMemory is set and the + /// search is stopped. If a use leads to a return instruction, + /// \p CommunicatedBack is set to true and \p CapturedInMemory is not changed. + /// If a use leads to a ptr2int which may capture the value, + /// \p CapturedInInteger is set. If a use is found that is currently assumed + /// "no-capture-maybe-returned", the user is added to the \p PotentialCopies + /// set. All values in \p PotentialCopies are later tracked as well. For every + /// explored use we decrement \p RemainingUsesToExplore. Once it reaches 0, + /// the search is stopped with \p CapturedInMemory and \p CapturedInInteger + /// conservatively set to true. 
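determineFunctionCaptureCapabilities above derives, from readonly/nothrow/void-return facts alone,
which capture channels a callee could even use. The sketch below mirrors that derivation with
illustrative bit names and plain booleans; it is not the actual AANoCapture encoding. The
AACaptureUseTracker constructor that the comment above documents appears right after this aside.

    // Capture can happen through three channels: storing the pointer to
    // memory, communicating it back (return value or exception), or turning
    // it into an integer. A readonly callee cannot capture in memory, and a
    // nothrow callee returning void cannot communicate the pointer back.
    enum CaptureBits : unsigned {
      NotCapturedInMem = 1u << 0,
      NotCapturedInRet = 1u << 1,
      NotCapturedInInt = 1u << 2,
    };

    static unsigned knownCaptureBits(bool OnlyReadsMemory, bool DoesNotThrow,
                                     bool ReturnsVoid) {
      unsigned Known = 0;
      if (OnlyReadsMemory)
        Known |= NotCapturedInMem;
      if (DoesNotThrow && ReturnsVoid)
        Known |= NotCapturedInRet;
      // If the pointer can neither be stored nor communicated back, a
      // ptr2int inside the callee is unobservable as well.
      if ((Known & NotCapturedInMem) && (Known & NotCapturedInRet))
        Known |= NotCapturedInInt;
      return Known;
    }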
+ AACaptureUseTracker(Attributor &A, AANoCapture &NoCaptureAA, + const AAIsDead &IsDeadAA, IntegerState &State, + SmallVectorImpl &PotentialCopies, + unsigned &RemainingUsesToExplore) + : A(A), NoCaptureAA(NoCaptureAA), IsDeadAA(IsDeadAA), State(State), + PotentialCopies(PotentialCopies), + RemainingUsesToExplore(RemainingUsesToExplore) {} + + /// Determine if \p V maybe captured. *Also updates the state!* + bool valueMayBeCaptured(const Value *V) { + if (V->getType()->isPointerTy()) { + PointerMayBeCaptured(V, this); + } else { + State.indicatePessimisticFixpoint(); + } + return State.isAssumed(AANoCapture::NO_CAPTURE_MAYBE_RETURNED); + } + + /// See CaptureTracker::tooManyUses(). + void tooManyUses() override { + State.removeAssumedBits(AANoCapture::NO_CAPTURE); + } + + bool isDereferenceableOrNull(Value *O, const DataLayout &DL) override { + if (CaptureTracker::isDereferenceableOrNull(O, DL)) + return true; + const auto &DerefAA = + A.getAAFor(NoCaptureAA, IRPosition::value(*O)); + return DerefAA.getAssumedDereferenceableBytes(); + } + + /// See CaptureTracker::captured(...). + bool captured(const Use *U) override { + Instruction *UInst = cast(U->getUser()); + LLVM_DEBUG(dbgs() << "Check use: " << *U->get() << " in " << *UInst + << "\n"); + + // Because we may reuse the tracker multiple times we keep track of the + // number of explored uses ourselves as well. + if (RemainingUsesToExplore-- == 0) { + LLVM_DEBUG(dbgs() << " - too many uses to explore!\n"); + return isCapturedIn(/* Memory */ true, /* Integer */ true, + /* Return */ true); + } + + // Deal with ptr2int by following uses. + if (isa(UInst)) { + LLVM_DEBUG(dbgs() << " - ptr2int assume the worst!\n"); + return valueMayBeCaptured(UInst); + } + + // Explicitly catch return instructions. + if (isa(UInst)) + return isCapturedIn(/* Memory */ false, /* Integer */ false, + /* Return */ true); + + // For now we only use special logic for call sites. However, the tracker + // itself knows about a lot of other non-capturing cases already. + CallSite CS(UInst); + if (!CS || !CS.isArgOperand(U)) + return isCapturedIn(/* Memory */ true, /* Integer */ true, + /* Return */ true); + + unsigned ArgNo = CS.getArgumentNo(U); + const IRPosition &CSArgPos = IRPosition::callsite_argument(CS, ArgNo); + // If we have a abstract no-capture attribute for the argument we can use + // it to justify a non-capture attribute here. This allows recursion! + auto &ArgNoCaptureAA = A.getAAFor(NoCaptureAA, CSArgPos); + if (ArgNoCaptureAA.isAssumedNoCapture()) + return isCapturedIn(/* Memory */ false, /* Integer */ false, + /* Return */ false); + if (ArgNoCaptureAA.isAssumedNoCaptureMaybeReturned()) { + addPotentialCopy(CS); + return isCapturedIn(/* Memory */ false, /* Integer */ false, + /* Return */ false); + } + + // Lastly, we could not find a reason no-capture can be assumed so we don't. + return isCapturedIn(/* Memory */ true, /* Integer */ true, + /* Return */ true); + } + + /// Register \p CS as potential copy of the value we are checking. + void addPotentialCopy(CallSite CS) { + PotentialCopies.push_back(CS.getInstruction()); + } + + /// See CaptureTracker::shouldExplore(...). + bool shouldExplore(const Use *U) override { + // Check liveness. + return !IsDeadAA.isAssumedDead(cast(U->getUser())); + } + + /// Update the state according to \p CapturedInMem, \p CapturedInInt, and + /// \p CapturedInRet, then return the appropriate value for use in the + /// CaptureTracker::captured() interface. 
+ bool isCapturedIn(bool CapturedInMem, bool CapturedInInt, + bool CapturedInRet) { + LLVM_DEBUG(dbgs() << " - captures [Mem " << CapturedInMem << "|Int " + << CapturedInInt << "|Ret " << CapturedInRet << "]\n"); + if (CapturedInMem) + State.removeAssumedBits(AANoCapture::NOT_CAPTURED_IN_MEM); + if (CapturedInInt) + State.removeAssumedBits(AANoCapture::NOT_CAPTURED_IN_INT); + if (CapturedInRet) + State.removeAssumedBits(AANoCapture::NOT_CAPTURED_IN_RET); + return !State.isAssumed(AANoCapture::NO_CAPTURE_MAYBE_RETURNED); + } + +private: + /// The attributor providing in-flight abstract attributes. + Attributor &A; + + /// The abstract attribute currently updated. + AANoCapture &NoCaptureAA; + + /// The abstract liveness state. + const AAIsDead &IsDeadAA; + + /// The state currently updated. + IntegerState &State; + + /// Set of potential copies of the tracked value. + SmallVectorImpl &PotentialCopies; + + /// Global counter to limit the number of explored uses. + unsigned &RemainingUsesToExplore; +}; + +ChangeStatus AANoCaptureImpl::updateImpl(Attributor &A) { + const IRPosition &IRP = getIRPosition(); + const Value *V = + getArgNo() >= 0 ? IRP.getAssociatedArgument() : &IRP.getAssociatedValue(); + if (!V) + return indicatePessimisticFixpoint(); + + const Function *F = + getArgNo() >= 0 ? IRP.getAssociatedFunction() : IRP.getAnchorScope(); + assert(F && "Expected a function!"); + const IRPosition &FnPos = IRPosition::function(*F); + const auto &IsDeadAA = A.getAAFor(*this, FnPos); + + AANoCapture::StateType T; + + // Readonly means we cannot capture through memory. + const auto &FnMemAA = A.getAAFor(*this, FnPos); + if (FnMemAA.isAssumedReadOnly()) { + T.addKnownBits(NOT_CAPTURED_IN_MEM); + if (FnMemAA.isKnownReadOnly()) + addKnownBits(NOT_CAPTURED_IN_MEM); + } + + // Make sure all returned values are different than the underlying value. + // TODO: we could do this in a more sophisticated way inside + // AAReturnedValues, e.g., track all values that escape through returns + // directly somehow. + auto CheckReturnedArgs = [&](const AAReturnedValues &RVAA) { + bool SeenConstant = false; + for (auto &It : RVAA.returned_values()) { + if (isa(It.first)) { + if (SeenConstant) + return false; + SeenConstant = true; + } else if (!isa(It.first) || + It.first == getAssociatedArgument()) + return false; + } + return true; + }; + + const auto &NoUnwindAA = A.getAAFor(*this, FnPos); + if (NoUnwindAA.isAssumedNoUnwind()) { + bool IsVoidTy = F->getReturnType()->isVoidTy(); + const AAReturnedValues *RVAA = + IsVoidTy ? nullptr : &A.getAAFor(*this, FnPos); + if (IsVoidTy || CheckReturnedArgs(*RVAA)) { + T.addKnownBits(NOT_CAPTURED_IN_RET); + if (T.isKnown(NOT_CAPTURED_IN_MEM)) + return ChangeStatus::UNCHANGED; + if (NoUnwindAA.isKnownNoUnwind() && + (IsVoidTy || RVAA->getState().isAtFixpoint())) { + addKnownBits(NOT_CAPTURED_IN_RET); + if (isKnown(NOT_CAPTURED_IN_MEM)) + return indicateOptimisticFixpoint(); + } + } + } + + // Use the CaptureTracker interface and logic with the specialized tracker, + // defined in AACaptureUseTracker, that can look at in-flight abstract + // attributes and directly updates the assumed state. + SmallVector PotentialCopies; + unsigned RemainingUsesToExplore = DefaultMaxUsesToExplore; + AACaptureUseTracker Tracker(A, *this, IsDeadAA, T, PotentialCopies, + RemainingUsesToExplore); + + // Check all potential copies of the associated value until we can assume + // none will be captured or we have to assume at least one might be. 
+ unsigned Idx = 0; + PotentialCopies.push_back(V); + while (T.isAssumed(NO_CAPTURE_MAYBE_RETURNED) && Idx < PotentialCopies.size()) + Tracker.valueMayBeCaptured(PotentialCopies[Idx++]); + + AAAlign::StateType &S = getState(); + auto Assumed = S.getAssumed(); + S.intersectAssumedBits(T.getAssumed()); + return Assumed == S.getAssumed() ? ChangeStatus::UNCHANGED + : ChangeStatus::CHANGED; +} + +/// NoCapture attribute for function arguments. +struct AANoCaptureArgument final : AANoCaptureImpl { + AANoCaptureArgument(const IRPosition &IRP) : AANoCaptureImpl(IRP) {} + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { STATS_DECLTRACK_ARG_ATTR(nocapture) } +}; + +/// NoCapture attribute for call site arguments. +struct AANoCaptureCallSiteArgument final : AANoCaptureImpl { + AANoCaptureCallSiteArgument(const IRPosition &IRP) : AANoCaptureImpl(IRP) {} + + /// See AbstractAttribute::updateImpl(...). + ChangeStatus updateImpl(Attributor &A) override { + // TODO: Once we have call site specific value information we can provide + // call site specific liveness information and then it makes + // sense to specialize attributes for call sites arguments instead of + // redirecting requests to the callee argument. + Argument *Arg = getAssociatedArgument(); + if (!Arg) + return indicatePessimisticFixpoint(); + const IRPosition &ArgPos = IRPosition::argument(*Arg); + auto &ArgAA = A.getAAFor(*this, ArgPos); + return clampStateAndIndicateChange( + getState(), + static_cast(ArgAA.getState())); + } + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override{STATS_DECLTRACK_CSARG_ATTR(nocapture)}; +}; + +/// NoCapture attribute for floating values. +struct AANoCaptureFloating final : AANoCaptureImpl { + AANoCaptureFloating(const IRPosition &IRP) : AANoCaptureImpl(IRP) {} + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { + STATS_DECLTRACK_FLOATING_ATTR(nocapture) + } +}; + +/// NoCapture attribute for function return value. +struct AANoCaptureReturned final : AANoCaptureImpl { + AANoCaptureReturned(const IRPosition &IRP) : AANoCaptureImpl(IRP) { + llvm_unreachable("NoCapture is not applicable to function returns!"); + } + + /// See AbstractAttribute::initialize(...). + void initialize(Attributor &A) override { + llvm_unreachable("NoCapture is not applicable to function returns!"); + } + + /// See AbstractAttribute::updateImpl(...). + ChangeStatus updateImpl(Attributor &A) override { + llvm_unreachable("NoCapture is not applicable to function returns!"); + } + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override {} +}; + +/// NoCapture attribute deduction for a call site return value. +struct AANoCaptureCallSiteReturned final : AANoCaptureImpl { + AANoCaptureCallSiteReturned(const IRPosition &IRP) : AANoCaptureImpl(IRP) {} + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { + STATS_DECLTRACK_CSRET_ATTR(nocapture) + } +}; + +/// ------------------ Value Simplify Attribute ---------------------------- +struct AAValueSimplifyImpl : AAValueSimplify { + AAValueSimplifyImpl(const IRPosition &IRP) : AAValueSimplify(IRP) {} + + /// See AbstractAttribute::getAsStr(). + const std::string getAsStr() const override { + return getAssumed() ? (getKnown() ? 
"simplified" : "maybe-simple") + : "not-simple"; + } + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override {} + + /// See AAValueSimplify::getAssumedSimplifiedValue() + Optional getAssumedSimplifiedValue(Attributor &A) const override { + if (!getAssumed()) + return const_cast(&getAssociatedValue()); + return SimplifiedAssociatedValue; + } + void initialize(Attributor &A) override {} + + /// Helper function for querying AAValueSimplify and updating candicate. + /// \param QueryingValue Value trying to unify with SimplifiedValue + /// \param AccumulatedSimplifiedValue Current simplification result. + static bool checkAndUpdate(Attributor &A, const AbstractAttribute &QueryingAA, + Value &QueryingValue, + Optional &AccumulatedSimplifiedValue) { + // FIXME: Add a typecast support. + + auto &ValueSimpifyAA = A.getAAFor( + QueryingAA, IRPosition::value(QueryingValue)); + + Optional QueryingValueSimplified = + ValueSimpifyAA.getAssumedSimplifiedValue(A); + + if (!QueryingValueSimplified.hasValue()) + return true; + + if (!QueryingValueSimplified.getValue()) + return false; + + Value &QueryingValueSimplifiedUnwrapped = + *QueryingValueSimplified.getValue(); + + if (isa(QueryingValueSimplifiedUnwrapped)) + return true; + + if (AccumulatedSimplifiedValue.hasValue()) + return AccumulatedSimplifiedValue == QueryingValueSimplified; + + LLVM_DEBUG(dbgs() << "[Attributor][ValueSimplify] " << QueryingValue + << " is assumed to be " + << QueryingValueSimplifiedUnwrapped << "\n"); + + AccumulatedSimplifiedValue = QueryingValueSimplified; + return true; + } + + /// See AbstractAttribute::manifest(...). + ChangeStatus manifest(Attributor &A) override { + ChangeStatus Changed = ChangeStatus::UNCHANGED; + + if (!SimplifiedAssociatedValue.hasValue() || + !SimplifiedAssociatedValue.getValue()) + return Changed; + + if (auto *C = dyn_cast(SimplifiedAssociatedValue.getValue())) { + // We can replace the AssociatedValue with the constant. + Value &V = getAssociatedValue(); + if (!V.user_empty() && &V != C && V.getType() == C->getType()) { + LLVM_DEBUG(dbgs() << "[Attributor][ValueSimplify] " << V << " -> " << *C + << "\n"); + V.replaceAllUsesWith(C); + Changed = ChangeStatus::CHANGED; + } + } + + return Changed | AAValueSimplify::manifest(A); + } + +protected: + // An assumed simplified value. Initially, it is set to Optional::None, which + // means that the value is not clear under current assumption. If in the + // pessimistic state, getAssumedSimplifiedValue doesn't return this value but + // returns orignal associated value. + Optional SimplifiedAssociatedValue; +}; + +struct AAValueSimplifyArgument final : AAValueSimplifyImpl { + AAValueSimplifyArgument(const IRPosition &IRP) : AAValueSimplifyImpl(IRP) {} + + /// See AbstractAttribute::updateImpl(...). + ChangeStatus updateImpl(Attributor &A) override { + bool HasValueBefore = SimplifiedAssociatedValue.hasValue(); + + auto PredForCallSite = [&](AbstractCallSite ACS) { + // Check if we have an associated argument or not (which can happen for + // callback calls). + if (Value *ArgOp = ACS.getCallArgOperand(getArgNo())) + return checkAndUpdate(A, *this, *ArgOp, SimplifiedAssociatedValue); + return false; + }; + + if (!A.checkForAllCallSites(PredForCallSite, *this, true)) + return indicatePessimisticFixpoint(); + + // If a candicate was found in this update, return CHANGED. + return HasValueBefore == SimplifiedAssociatedValue.hasValue() + ? 
ChangeStatus::UNCHANGED + : ChangeStatus ::CHANGED; + } + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { + STATS_DECLTRACK_ARG_ATTR(value_simplify) + } +}; + +struct AAValueSimplifyReturned : AAValueSimplifyImpl { + AAValueSimplifyReturned(const IRPosition &IRP) : AAValueSimplifyImpl(IRP) {} + + /// See AbstractAttribute::updateImpl(...). + ChangeStatus updateImpl(Attributor &A) override { + bool HasValueBefore = SimplifiedAssociatedValue.hasValue(); + + auto PredForReturned = [&](Value &V) { + return checkAndUpdate(A, *this, V, SimplifiedAssociatedValue); + }; + + if (!A.checkForAllReturnedValues(PredForReturned, *this)) + return indicatePessimisticFixpoint(); + + // If a candicate was found in this update, return CHANGED. + return HasValueBefore == SimplifiedAssociatedValue.hasValue() + ? ChangeStatus::UNCHANGED + : ChangeStatus ::CHANGED; + } + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { + STATS_DECLTRACK_FNRET_ATTR(value_simplify) + } +}; + +struct AAValueSimplifyFloating : AAValueSimplifyImpl { + AAValueSimplifyFloating(const IRPosition &IRP) : AAValueSimplifyImpl(IRP) {} + + /// See AbstractAttribute::initialize(...). + void initialize(Attributor &A) override { + Value &V = getAnchorValue(); + + // TODO: add other stuffs + if (isa(V) || isa(V)) + indicatePessimisticFixpoint(); + } + + /// See AbstractAttribute::updateImpl(...). + ChangeStatus updateImpl(Attributor &A) override { + bool HasValueBefore = SimplifiedAssociatedValue.hasValue(); + + auto VisitValueCB = [&](Value &V, BooleanState, bool Stripped) -> bool { + auto &AA = A.getAAFor(*this, IRPosition::value(V)); + if (!Stripped && this == &AA) { + // TODO: Look the instruction and check recursively. + LLVM_DEBUG( + dbgs() << "[Attributor][ValueSimplify] Can't be stripped more : " + << V << "\n"); + indicatePessimisticFixpoint(); + return false; + } + return checkAndUpdate(A, *this, V, SimplifiedAssociatedValue); + }; + + if (!genericValueTraversal( + A, getIRPosition(), *this, static_cast(*this), + VisitValueCB)) + return indicatePessimisticFixpoint(); + + // If a candicate was found in this update, return CHANGED. + + return HasValueBefore == SimplifiedAssociatedValue.hasValue() + ? ChangeStatus::UNCHANGED + : ChangeStatus ::CHANGED; + } + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { + STATS_DECLTRACK_FLOATING_ATTR(value_simplify) + } +}; + +struct AAValueSimplifyFunction : AAValueSimplifyImpl { + AAValueSimplifyFunction(const IRPosition &IRP) : AAValueSimplifyImpl(IRP) {} + + /// See AbstractAttribute::initialize(...). + void initialize(Attributor &A) override { + SimplifiedAssociatedValue = &getAnchorValue(); + indicateOptimisticFixpoint(); + } + /// See AbstractAttribute::initialize(...). 
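checkAndUpdate above unifies simplification candidates coming from call sites, returned values, or
stripped values: no candidate yet is compatible with anything, seeing the same candidate again is
fine, and a second distinct candidate ends the simplification. Below is a standalone sketch of
that unification step using std::optional, with hypothetical names. AAValueSimplifyFunction's
updateImpl continues immediately after this aside.

    #include <optional>

    // Fold one more candidate into the accumulated simplified value:
    //  - no candidate yet: adopt the new one,
    //  - same candidate again: still fine,
    //  - a different candidate: a single simplified value is impossible.
    template <typename V>
    static bool unifySimplifiedValue(std::optional<V> &Accumulated, const V &New) {
      if (!Accumulated) {
        Accumulated = New;
        return true;
      }
      return *Accumulated == New;
    }
    // Usage: start with an empty std::optional and feed every incoming value
    // through unifySimplifiedValue; a false result means give up (pessimistic
    // fixpoint), otherwise the optional holds the value all sources agree on.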
+ ChangeStatus updateImpl(Attributor &A) override { + llvm_unreachable( + "AAValueSimplify(Function|CallSite)::updateImpl will not be called"); + } + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { + STATS_DECLTRACK_FN_ATTR(value_simplify) + } +}; + +struct AAValueSimplifyCallSite : AAValueSimplifyFunction { + AAValueSimplifyCallSite(const IRPosition &IRP) + : AAValueSimplifyFunction(IRP) {} + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { + STATS_DECLTRACK_CS_ATTR(value_simplify) + } +}; + +struct AAValueSimplifyCallSiteReturned : AAValueSimplifyReturned { + AAValueSimplifyCallSiteReturned(const IRPosition &IRP) + : AAValueSimplifyReturned(IRP) {} + + void trackStatistics() const override { + STATS_DECLTRACK_CSRET_ATTR(value_simplify) + } +}; +struct AAValueSimplifyCallSiteArgument : AAValueSimplifyFloating { + AAValueSimplifyCallSiteArgument(const IRPosition &IRP) + : AAValueSimplifyFloating(IRP) {} + + void trackStatistics() const override { + STATS_DECLTRACK_CSARG_ATTR(value_simplify) + } +}; + +/// ----------------------- Heap-To-Stack Conversion --------------------------- +struct AAHeapToStackImpl : public AAHeapToStack { + AAHeapToStackImpl(const IRPosition &IRP) : AAHeapToStack(IRP) {} + + const std::string getAsStr() const override { + return "[H2S] Mallocs: " + std::to_string(MallocCalls.size()); + } + + ChangeStatus manifest(Attributor &A) override { + assert(getState().isValidState() && + "Attempted to manifest an invalid state!"); + + ChangeStatus HasChanged = ChangeStatus::UNCHANGED; + Function *F = getAssociatedFunction(); + const auto *TLI = A.getInfoCache().getTargetLibraryInfoForFunction(*F); + + for (Instruction *MallocCall : MallocCalls) { + // This malloc cannot be replaced. 
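+ // (A call ends up in BadMallocCalls if it is not a malloc/calloc-like call, + // its size is not a known constant below the heap-to-stack threshold, or one + // of its uses may free the pointer or let it escape; see updateImpl below.)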
+ if (BadMallocCalls.count(MallocCall)) + continue; + + for (Instruction *FreeCall : FreesForMalloc[MallocCall]) { + LLVM_DEBUG(dbgs() << "H2S: Removing free call: " << *FreeCall << "\n"); + A.deleteAfterManifest(*FreeCall); + HasChanged = ChangeStatus::CHANGED; + } + + LLVM_DEBUG(dbgs() << "H2S: Removing malloc call: " << *MallocCall + << "\n"); + + Constant *Size; + if (isCallocLikeFn(MallocCall, TLI)) { + auto *Num = cast(MallocCall->getOperand(0)); + auto *SizeT = dyn_cast(MallocCall->getOperand(1)); + APInt TotalSize = SizeT->getValue() * Num->getValue(); + Size = + ConstantInt::get(MallocCall->getOperand(0)->getType(), TotalSize); + } else { + Size = cast(MallocCall->getOperand(0)); + } + + unsigned AS = cast(MallocCall->getType())->getAddressSpace(); + Instruction *AI = new AllocaInst(Type::getInt8Ty(F->getContext()), AS, + Size, "", MallocCall->getNextNode()); + + if (AI->getType() != MallocCall->getType()) + AI = new BitCastInst(AI, MallocCall->getType(), "malloc_bc", + AI->getNextNode()); + + MallocCall->replaceAllUsesWith(AI); + + if (auto *II = dyn_cast(MallocCall)) { + auto *NBB = II->getNormalDest(); + BranchInst::Create(NBB, MallocCall->getParent()); + A.deleteAfterManifest(*MallocCall); + } else { + A.deleteAfterManifest(*MallocCall); + } + + if (isCallocLikeFn(MallocCall, TLI)) { + auto *BI = new BitCastInst(AI, MallocCall->getType(), "calloc_bc", + AI->getNextNode()); + Value *Ops[] = { + BI, ConstantInt::get(F->getContext(), APInt(8, 0, false)), Size, + ConstantInt::get(Type::getInt1Ty(F->getContext()), false)}; + + Type *Tys[] = {BI->getType(), MallocCall->getOperand(0)->getType()}; + Module *M = F->getParent(); + Function *Fn = Intrinsic::getDeclaration(M, Intrinsic::memset, Tys); + CallInst::Create(Fn, Ops, "", BI->getNextNode()); + } + HasChanged = ChangeStatus::CHANGED; + } + + return HasChanged; + } + + /// Collection of all malloc calls in a function. + SmallSetVector MallocCalls; + + /// Collection of malloc calls that cannot be converted. + DenseSet BadMallocCalls; + + /// A map for each malloc call to the set of associated free calls. + DenseMap> FreesForMalloc; + + ChangeStatus updateImpl(Attributor &A) override; +}; + +ChangeStatus AAHeapToStackImpl::updateImpl(Attributor &A) { + const Function *F = getAssociatedFunction(); + const auto *TLI = A.getInfoCache().getTargetLibraryInfoForFunction(*F); + + auto UsesCheck = [&](Instruction &I) { + SmallPtrSet Visited; + SmallVector Worklist; + + for (Use &U : I.uses()) + Worklist.push_back(&U); + + while (!Worklist.empty()) { + const Use *U = Worklist.pop_back_val(); + if (!Visited.insert(U).second) + continue; + + auto *UserI = U->getUser(); + + if (isa(UserI)) + continue; + if (auto *SI = dyn_cast(UserI)) { + if (SI->getValueOperand() == U->get()) { + LLVM_DEBUG(dbgs() << "[H2S] escaping store to memory: " << *UserI << "\n"); + return false; + } + // A store into the malloc'ed memory is fine. + continue; + } + + // NOTE: Right now, if a function that has malloc pointer as an argument + // frees memory, we assume that the malloc pointer is freed. + + // TODO: Add nofree callsite argument attribute to indicate that pointer + // argument is not freed. + if (auto *CB = dyn_cast(UserI)) { + if (!CB->isArgOperand(U)) + continue; + + if (CB->isLifetimeStartOrEnd()) + continue; + + // Record malloc. 
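+ // A free of the malloc'ed pointer is remembered in FreesForMalloc so it + // can be erased together with the allocation when the call is rewritten to + // an alloca in manifest() above.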
+ if (isFreeCall(UserI, TLI)) { + FreesForMalloc[&I].insert( + cast(const_cast(UserI))); + continue; + } + + // If a function does not free memory we are fine + const auto &NoFreeAA = + A.getAAFor(*this, IRPosition::callsite_function(*CB)); + + unsigned ArgNo = U - CB->arg_begin(); + const auto &NoCaptureAA = A.getAAFor( + *this, IRPosition::callsite_argument(*CB, ArgNo)); + + if (!NoCaptureAA.isAssumedNoCapture() || !NoFreeAA.isAssumedNoFree()) { + LLVM_DEBUG(dbgs() << "[H2S] Bad user: " << *UserI << "\n"); + return false; + } + continue; + } + + if (isa(UserI) || isa(UserI)) { + for (Use &U : UserI->uses()) + Worklist.push_back(&U); + continue; + } + + // Unknown user. + LLVM_DEBUG(dbgs() << "[H2S] Unknown user: " << *UserI << "\n"); + return false; + } + return true; + }; + + auto MallocCallocCheck = [&](Instruction &I) { + if (BadMallocCalls.count(&I)) + return true; + + bool IsMalloc = isMallocLikeFn(&I, TLI); + bool IsCalloc = !IsMalloc && isCallocLikeFn(&I, TLI); + if (!IsMalloc && !IsCalloc) { + BadMallocCalls.insert(&I); + return true; + } + + if (IsMalloc) { + if (auto *Size = dyn_cast(I.getOperand(0))) + if (Size->getValue().sle(MaxHeapToStackSize)) + if (UsesCheck(I)) { + MallocCalls.insert(&I); + return true; + } + } else if (IsCalloc) { + bool Overflow = false; + if (auto *Num = dyn_cast(I.getOperand(0))) + if (auto *Size = dyn_cast(I.getOperand(1))) + if ((Size->getValue().umul_ov(Num->getValue(), Overflow)) + .sle(MaxHeapToStackSize)) + if (!Overflow && UsesCheck(I)) { + MallocCalls.insert(&I); + return true; + } + } + + BadMallocCalls.insert(&I); + return true; + }; + + size_t NumBadMallocs = BadMallocCalls.size(); + + A.checkForAllCallLikeInstructions(MallocCallocCheck, *this); + + if (NumBadMallocs != BadMallocCalls.size()) + return ChangeStatus::CHANGED; + + return ChangeStatus::UNCHANGED; +} + +struct AAHeapToStackFunction final : public AAHeapToStackImpl { + AAHeapToStackFunction(const IRPosition &IRP) : AAHeapToStackImpl(IRP) {} + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { + STATS_DECL(MallocCalls, Function, + "Number of MallocCalls converted to allocas"); + BUILD_STAT_NAME(MallocCalls, Function) += MallocCalls.size(); + } +}; + +/// -------------------- Memory Behavior Attributes ---------------------------- +/// Includes read-none, read-only, and write-only. +/// ---------------------------------------------------------------------------- +struct AAMemoryBehaviorImpl : public AAMemoryBehavior { + AAMemoryBehaviorImpl(const IRPosition &IRP) : AAMemoryBehavior(IRP) {} + + /// See AbstractAttribute::initialize(...). + void initialize(Attributor &A) override { + intersectAssumedBits(BEST_STATE); + getKnownStateFromValue(getIRPosition(), getState()); + IRAttribute::initialize(A); + } + + /// Return the memory behavior information encoded in the IR for \p IRP. 
+ static void getKnownStateFromValue(const IRPosition &IRP, + IntegerState &State) { + SmallVector Attrs; + IRP.getAttrs(AttrKinds, Attrs); + for (const Attribute &Attr : Attrs) { + switch (Attr.getKindAsEnum()) { + case Attribute::ReadNone: + State.addKnownBits(NO_ACCESSES); + break; + case Attribute::ReadOnly: + State.addKnownBits(NO_WRITES); + break; + case Attribute::WriteOnly: + State.addKnownBits(NO_READS); + break; + default: + llvm_unreachable("Unexpcted attribute!"); + } + } + + if (auto *I = dyn_cast(&IRP.getAnchorValue())) { + if (!I->mayReadFromMemory()) + State.addKnownBits(NO_READS); + if (!I->mayWriteToMemory()) + State.addKnownBits(NO_WRITES); + } + } + + /// See AbstractAttribute::getDeducedAttributes(...). + void getDeducedAttributes(LLVMContext &Ctx, + SmallVectorImpl &Attrs) const override { + assert(Attrs.size() == 0); + if (isAssumedReadNone()) + Attrs.push_back(Attribute::get(Ctx, Attribute::ReadNone)); + else if (isAssumedReadOnly()) + Attrs.push_back(Attribute::get(Ctx, Attribute::ReadOnly)); + else if (isAssumedWriteOnly()) + Attrs.push_back(Attribute::get(Ctx, Attribute::WriteOnly)); + assert(Attrs.size() <= 1); + } + + /// See AbstractAttribute::manifest(...). + ChangeStatus manifest(Attributor &A) override { + IRPosition &IRP = getIRPosition(); + + // Check if we would improve the existing attributes first. + SmallVector DeducedAttrs; + getDeducedAttributes(IRP.getAnchorValue().getContext(), DeducedAttrs); + if (llvm::all_of(DeducedAttrs, [&](const Attribute &Attr) { + return IRP.hasAttr(Attr.getKindAsEnum(), + /* IgnoreSubsumingPositions */ true); + })) + return ChangeStatus::UNCHANGED; + + // Clear existing attributes. + IRP.removeAttrs(AttrKinds); + + // Use the generic manifest method. + return IRAttribute::manifest(A); + } + + /// See AbstractState::getAsStr(). + const std::string getAsStr() const override { + if (isAssumedReadNone()) + return "readnone"; + if (isAssumedReadOnly()) + return "readonly"; + if (isAssumedWriteOnly()) + return "writeonly"; + return "may-read/write"; + } + + /// The set of IR attributes AAMemoryBehavior deals with. + static const Attribute::AttrKind AttrKinds[3]; +}; + +const Attribute::AttrKind AAMemoryBehaviorImpl::AttrKinds[] = { + Attribute::ReadNone, Attribute::ReadOnly, Attribute::WriteOnly}; + +/// Memory behavior attribute for a floating value. +struct AAMemoryBehaviorFloating : AAMemoryBehaviorImpl { + AAMemoryBehaviorFloating(const IRPosition &IRP) : AAMemoryBehaviorImpl(IRP) {} + + /// See AbstractAttribute::initialize(...). + void initialize(Attributor &A) override { + AAMemoryBehaviorImpl::initialize(A); + // Initialize the use vector with all direct uses of the associated value. + for (const Use &U : getAssociatedValue().uses()) + Uses.insert(&U); + } + + /// See AbstractAttribute::updateImpl(...). + ChangeStatus updateImpl(Attributor &A) override; + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { + if (isAssumedReadNone()) + STATS_DECLTRACK_FLOATING_ATTR(readnone) + else if (isAssumedReadOnly()) + STATS_DECLTRACK_FLOATING_ATTR(readonly) + else if (isAssumedWriteOnly()) + STATS_DECLTRACK_FLOATING_ATTR(writeonly) + } + +private: + /// Return true if users of \p UserI might access the underlying + /// variable/location described by \p U and should therefore be analyzed. + bool followUsersOfUseIn(Attributor &A, const Use *U, + const Instruction *UserI); + + /// Update the state according to the effect of use \p U in \p UserI. 
+ void analyzeUseIn(Attributor &A, const Use *U, const Instruction *UserI); + +protected: + /// Container for (transitive) uses of the associated argument. + SetVector Uses; +}; + +/// Memory behavior attribute for function argument. +struct AAMemoryBehaviorArgument : AAMemoryBehaviorFloating { + AAMemoryBehaviorArgument(const IRPosition &IRP) + : AAMemoryBehaviorFloating(IRP) {} + + /// See AbstractAttribute::initialize(...). + void initialize(Attributor &A) override { + AAMemoryBehaviorFloating::initialize(A); + + // We need an associated argument in a function with an exact definition + // to deduce anything. + Argument *Arg = getAssociatedArgument(); + if (!Arg || !Arg->getParent()->hasExactDefinition()) + indicatePessimisticFixpoint(); + } + + ChangeStatus manifest(Attributor &A) override { + // TODO: From readattrs.ll: "inalloca parameters are always + // considered written" + if (hasAttr({Attribute::InAlloca})) { + removeKnownBits(NO_WRITES); + removeAssumedBits(NO_WRITES); + } + return AAMemoryBehaviorFloating::manifest(A); + } + + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { + if (isAssumedReadNone()) + STATS_DECLTRACK_ARG_ATTR(readnone) + else if (isAssumedReadOnly()) + STATS_DECLTRACK_ARG_ATTR(readonly) + else if (isAssumedWriteOnly()) + STATS_DECLTRACK_ARG_ATTR(writeonly) + } +}; + +struct AAMemoryBehaviorCallSiteArgument final : AAMemoryBehaviorArgument { + AAMemoryBehaviorCallSiteArgument(const IRPosition &IRP) + : AAMemoryBehaviorArgument(IRP) {} + + /// See AbstractAttribute::updateImpl(...). + ChangeStatus updateImpl(Attributor &A) override { + // TODO: Once we have call site specific value information we can provide + // call site specific liveness information and then it makes + // sense to specialize attributes for call site arguments instead of + // redirecting requests to the callee argument. + Argument *Arg = getAssociatedArgument(); + const IRPosition &ArgPos = IRPosition::argument(*Arg); + auto &ArgAA = A.getAAFor(*this, ArgPos); + return clampStateAndIndicateChange( + getState(), + static_cast(ArgAA.getState())); + } + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { + if (isAssumedReadNone()) + STATS_DECLTRACK_CSARG_ATTR(readnone) + else if (isAssumedReadOnly()) + STATS_DECLTRACK_CSARG_ATTR(readonly) + else if (isAssumedWriteOnly()) + STATS_DECLTRACK_CSARG_ATTR(writeonly) + } +}; + +/// Memory behavior attribute for a call site return position. +struct AAMemoryBehaviorCallSiteReturned final : AAMemoryBehaviorFloating { + AAMemoryBehaviorCallSiteReturned(const IRPosition &IRP) + : AAMemoryBehaviorFloating(IRP) {} + + /// See AbstractAttribute::manifest(...). + ChangeStatus manifest(Attributor &A) override { + // We do not annotate returned values. + return ChangeStatus::UNCHANGED; + } + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override {} +}; + +/// An AA to represent the memory behavior function attributes. +struct AAMemoryBehaviorFunction final : public AAMemoryBehaviorImpl { + AAMemoryBehaviorFunction(const IRPosition &IRP) : AAMemoryBehaviorImpl(IRP) {} + + /// See AbstractAttribute::updateImpl(Attributor &A). + virtual ChangeStatus updateImpl(Attributor &A) override; + + /// See AbstractAttribute::manifest(...).
+ ChangeStatus manifest(Attributor &A) override { + Function &F = cast(getAnchorValue()); + if (isAssumedReadNone()) { + F.removeFnAttr(Attribute::ArgMemOnly); + F.removeFnAttr(Attribute::InaccessibleMemOnly); + F.removeFnAttr(Attribute::InaccessibleMemOrArgMemOnly); + } + return AAMemoryBehaviorImpl::manifest(A); + } + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { + if (isAssumedReadNone()) + STATS_DECLTRACK_FN_ATTR(readnone) + else if (isAssumedReadOnly()) + STATS_DECLTRACK_FN_ATTR(readonly) + else if (isAssumedWriteOnly()) + STATS_DECLTRACK_FN_ATTR(writeonly) + } +}; + +/// AAMemoryBehavior attribute for call sites. +struct AAMemoryBehaviorCallSite final : AAMemoryBehaviorImpl { + AAMemoryBehaviorCallSite(const IRPosition &IRP) : AAMemoryBehaviorImpl(IRP) {} + + /// See AbstractAttribute::initialize(...). + void initialize(Attributor &A) override { + AAMemoryBehaviorImpl::initialize(A); + Function *F = getAssociatedFunction(); + if (!F || !F->hasExactDefinition()) + indicatePessimisticFixpoint(); + } + + /// See AbstractAttribute::updateImpl(...). + ChangeStatus updateImpl(Attributor &A) override { + // TODO: Once we have call site specific value information we can provide + // call site specific liveness information and then it makes + // sense to specialize attributes for call sites instead of + // redirecting requests to the callee. + Function *F = getAssociatedFunction(); + const IRPosition &FnPos = IRPosition::function(*F); + auto &FnAA = A.getAAFor(*this, FnPos); + return clampStateAndIndicateChange( + getState(), static_cast(FnAA.getState())); + } + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { + if (isAssumedReadNone()) + STATS_DECLTRACK_CS_ATTR(readnone) + else if (isAssumedReadOnly()) + STATS_DECLTRACK_CS_ATTR(readonly) + else if (isAssumedWriteOnly()) + STATS_DECLTRACK_CS_ATTR(writeonly) + } +}; +} // namespace + +ChangeStatus AAMemoryBehaviorFunction::updateImpl(Attributor &A) { + + // The current assumed state used to determine a change. + auto AssumedState = getAssumed(); + + auto CheckRWInst = [&](Instruction &I) { + // If the instruction has its own memory behavior state, use it to restrict + // the local state. No further analysis is required as the other memory + // state is as optimistic as it gets. + if (ImmutableCallSite ICS = ImmutableCallSite(&I)) { + const auto &MemBehaviorAA = A.getAAFor( + *this, IRPosition::callsite_function(ICS)); + intersectAssumedBits(MemBehaviorAA.getAssumed()); + return !isAtFixpoint(); + } + + // Remove access kind modifiers if necessary. + if (I.mayReadFromMemory()) + removeAssumedBits(NO_READS); + if (I.mayWriteToMemory()) + removeAssumedBits(NO_WRITES); + return !isAtFixpoint(); + }; + + if (!A.checkForAllReadWriteInstructions(CheckRWInst, *this)) + return indicatePessimisticFixpoint(); + + return (AssumedState != getAssumed()) ? ChangeStatus::CHANGED + : ChangeStatus::UNCHANGED; +} + +ChangeStatus AAMemoryBehaviorFloating::updateImpl(Attributor &A) { + + const IRPosition &IRP = getIRPosition(); + const IRPosition &FnPos = IRPosition::function_scope(IRP); + AAMemoryBehavior::StateType &S = getState(); + + // First, check the function scope. We take the known information and we avoid + // work if the assumed information implies the current assumed information for + // this attribute.
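+ // For example, if everything assumed here (say NO_WRITES) is already assumed + // for the whole function, intersecting with the function state cannot remove + // any bits, so we can return early.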
+ const auto &FnMemAA = A.getAAFor(*this, FnPos); + S.addKnownBits(FnMemAA.getKnown()); + if ((S.getAssumed() & FnMemAA.getAssumed()) == S.getAssumed()) + return ChangeStatus::UNCHANGED; + + // Make sure the value is not captured (except through "return"); if + // it is, any information derived would be irrelevant anyway as we cannot + // check the potential aliases introduced by the capture. However, no need + // to fall back to anything less optimistic than the function state. + const auto &ArgNoCaptureAA = A.getAAFor(*this, IRP); + if (!ArgNoCaptureAA.isAssumedNoCaptureMaybeReturned()) { + S.intersectAssumedBits(FnMemAA.getAssumed()); + return ChangeStatus::CHANGED; + } + + // The current assumed state used to determine a change. + auto AssumedState = S.getAssumed(); + + // Liveness information to exclude dead users. + // TODO: Take the FnPos once we have call site specific liveness information. + const auto &LivenessAA = A.getAAFor( + *this, IRPosition::function(*IRP.getAssociatedFunction())); + + // Visit and expand uses until all are analyzed or a fixpoint is reached. + for (unsigned i = 0; i < Uses.size() && !isAtFixpoint(); i++) { + const Use *U = Uses[i]; + Instruction *UserI = cast(U->getUser()); + LLVM_DEBUG(dbgs() << "[AAMemoryBehavior] Use: " << **U << " in " << *UserI + << " [Dead: " << (LivenessAA.isAssumedDead(UserI)) + << "]\n"); + if (LivenessAA.isAssumedDead(UserI)) + continue; + + // Check if the users of UserI should also be visited. + if (followUsersOfUseIn(A, U, UserI)) + for (const Use &UserIUse : UserI->uses()) + Uses.insert(&UserIUse); + + // If UserI might touch memory we analyze the use in detail. + if (UserI->mayReadOrWriteMemory()) + analyzeUseIn(A, U, UserI); + } + + return (AssumedState != getAssumed()) ? ChangeStatus::CHANGED + : ChangeStatus::UNCHANGED; +} + +bool AAMemoryBehaviorFloating::followUsersOfUseIn(Attributor &A, const Use *U, + const Instruction *UserI) { + // The loaded value is unrelated to the pointer argument; no need to + // follow the users of the load. + if (isa(UserI)) + return false; + + // By default we follow all uses assuming UserI might leak information on U; + // we have special handling for call site operands though. + ImmutableCallSite ICS(UserI); + if (!ICS || !ICS.isArgOperand(U)) + return true; + + // If the use is a call argument known not to be captured, the users of + // the call do not need to be visited because they have to be unrelated to + // the input. Note that this check is not trivial even though we disallow + // general capturing of the underlying argument. The reason is that the + // call might pass the argument "through return", which we allow and for which we + // need to check call users. + unsigned ArgNo = ICS.getArgumentNo(U); + const auto &ArgNoCaptureAA = + A.getAAFor(*this, IRPosition::callsite_argument(ICS, ArgNo)); + return !ArgNoCaptureAA.isAssumedNoCapture(); +} + +void AAMemoryBehaviorFloating::analyzeUseIn(Attributor &A, const Use *U, + const Instruction *UserI) { + assert(UserI->mayReadOrWriteMemory()); + + switch (UserI->getOpcode()) { + default: + // TODO: Handle all atomics and other side-effect operations we know of. + break; + case Instruction::Load: + // Loads cause the NO_READS property to disappear. + removeAssumedBits(NO_READS); + return; + + case Instruction::Store: + // Stores cause the NO_WRITES property to disappear if the use is the + // pointer operand. Note that we do assume that capturing was taken care of + // somewhere else.
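+ // E.g. a use as the pointer operand of "store i32 0, i32* %p" clears + // NO_WRITES for %p, while a use as the stored value does not change the + // bits here.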
+ if (cast(UserI)->getPointerOperand() == U->get()) + removeAssumedBits(NO_WRITES); + return; + + case Instruction::Call: + case Instruction::CallBr: + case Instruction::Invoke: { + // For call sites we look at the argument memory behavior attribute (this + // could be recursive!) in order to restrict our own state. + ImmutableCallSite ICS(UserI); + + // Give up on operand bundles. + if (ICS.isBundleOperand(U)) { + indicatePessimisticFixpoint(); + return; + } + + // Calling a function does read the function pointer, maybe write it if the + // function is self-modifying. + if (ICS.isCallee(U)) { + removeAssumedBits(NO_READS); + break; + } - // Callback function - std::function CallSiteCheck = [&](CallSite CS) { - assert(CS && "Sanity check: Call site was not initialized properly!"); + // Adjust the possible access behavior based on the information on the + // argument. + unsigned ArgNo = ICS.getArgumentNo(U); + const IRPosition &ArgPos = IRPosition::callsite_argument(ICS, ArgNo); + const auto &MemBehaviorAA = A.getAAFor(*this, ArgPos); + // "assumed" has at most the same bits as the MemBehaviorAA assumed + // and at least "known". + intersectAssumedBits(MemBehaviorAA.getAssumed()); + return; + } + }; - auto *NonNullAA = A.getAAFor(*this, *CS.getInstruction(), ArgNo); + // Generally, look at the "may-properties" and adjust the assumed state if we + // did not trigger special handling before. + if (UserI->mayReadFromMemory()) + removeAssumedBits(NO_READS); + if (UserI->mayWriteToMemory()) + removeAssumedBits(NO_WRITES); +} - // Check that NonNullAA is AANonNullCallSiteArgument. - if (NonNullAA) { - ImmutableCallSite ICS(&NonNullAA->getAnchoredValue()); - if (ICS && CS.getInstruction() == ICS.getInstruction()) - return NonNullAA->isAssumedNonNull(); - return false; - } +/// ---------------------------------------------------------------------------- +/// Attributor +/// ---------------------------------------------------------------------------- - if (CS.paramHasAttr(ArgNo, Attribute::NonNull)) - return true; +bool Attributor::isAssumedDead(const AbstractAttribute &AA, + const AAIsDead *LivenessAA) { + const Instruction *CtxI = AA.getIRPosition().getCtxI(); + if (!CtxI) + return false; - Value *V = CS.getArgOperand(ArgNo); - if (isKnownNonZero(V, getAnchorScope().getParent()->getDataLayout())) - return true; + if (!LivenessAA) + LivenessAA = + &getAAFor(AA, IRPosition::function(*CtxI->getFunction()), + /* TrackDependence */ false); + // Don't check liveness for AAIsDead. + if (&AA == LivenessAA) return false; - }; - if (!A.checkForAllCallSites(F, CallSiteCheck, true)) { - indicatePessimisticFixpoint(); - return ChangeStatus::CHANGED; - } - return ChangeStatus::UNCHANGED; -} -ChangeStatus AANonNullCallSiteArgument::updateImpl(Attributor &A) { - // NOTE: Never look at the argument of the callee in this method. - // If we do this, "nonnull" is always deduced because of the assumption. + if (!LivenessAA->isAssumedDead(CtxI)) + return false; - Value &V = *getAssociatedValue(); + // We actually used liveness information so we have to record a dependence. + recordDependence(*LivenessAA, AA); - auto *NonNullAA = A.getAAFor(*this, V); + return true; +} - if (!NonNullAA || !NonNullAA->isAssumedNonNull()) { - indicatePessimisticFixpoint(); - return ChangeStatus::CHANGED; +bool Attributor::checkForAllCallSites( + const function_ref &Pred, + const AbstractAttribute &QueryingAA, bool RequireAllCallSites) { + // We can try to determine information from + // the call sites. 
However, this is only possible all call sites are known, + // hence the function has internal linkage. + const IRPosition &IRP = QueryingAA.getIRPosition(); + const Function *AssociatedFunction = IRP.getAssociatedFunction(); + if (!AssociatedFunction) { + LLVM_DEBUG(dbgs() << "[Attributor] No function associated with " << IRP + << "\n"); + return false; } - return ChangeStatus::UNCHANGED; + return checkForAllCallSites(Pred, *AssociatedFunction, RequireAllCallSites, + &QueryingAA); } -/// ------------------------ Will-Return Attributes ---------------------------- +bool Attributor::checkForAllCallSites( + const function_ref &Pred, const Function &Fn, + bool RequireAllCallSites, const AbstractAttribute *QueryingAA) { + if (RequireAllCallSites && !Fn.hasLocalLinkage()) { + LLVM_DEBUG( + dbgs() + << "[Attributor] Function " << Fn.getName() + << " has no internal linkage, hence not all call sites are known\n"); + return false; + } -struct AAWillReturnImpl : public AAWillReturn, BooleanState { + for (const Use &U : Fn.uses()) { + AbstractCallSite ACS(&U); + if (!ACS) { + LLVM_DEBUG(dbgs() << "[Attributor] Function " + << Fn.getName() + << " has non call site use " << *U.get() << " in " + << *U.getUser() << "\n"); + return false; + } - /// See AbstractAttribute::AbstractAttribute(...). - AAWillReturnImpl(Function &F, InformationCache &InfoCache) - : AAWillReturn(F, InfoCache) {} + Instruction *I = ACS.getInstruction(); + Function *Caller = I->getFunction(); - /// See AAWillReturn::isKnownWillReturn(). - bool isKnownWillReturn() const override { return getKnown(); } + const auto *LivenessAA = + lookupAAFor(IRPosition::function(*Caller), QueryingAA, + /* TrackDependence */ false); - /// See AAWillReturn::isAssumedWillReturn(). - bool isAssumedWillReturn() const override { return getAssumed(); } + // Skip dead calls. + if (LivenessAA && LivenessAA->isAssumedDead(I)) { + // We actually used liveness information so we have to record a + // dependence. + if (QueryingAA) + recordDependence(*LivenessAA, *QueryingAA); + continue; + } - /// See AbstractAttribute::getState(...). - AbstractState &getState() override { return *this; } + const Use *EffectiveUse = + ACS.isCallbackCall() ? &ACS.getCalleeUseForCallback() : &U; + if (!ACS.isCallee(EffectiveUse)) { + if (!RequireAllCallSites) + continue; + LLVM_DEBUG(dbgs() << "[Attributor] User " << EffectiveUse->getUser() + << " is an invalid use of " + << Fn.getName() << "\n"); + return false; + } - /// See AbstractAttribute::getState(...). - const AbstractState &getState() const override { return *this; } + if (Pred(ACS)) + continue; - /// See AbstractAttribute::getAsStr() - const std::string getAsStr() const override { - return getAssumed() ? "willreturn" : "may-noreturn"; + LLVM_DEBUG(dbgs() << "[Attributor] Call site callback failed for " + << *ACS.getInstruction() << "\n"); + return false; } -}; -struct AAWillReturnFunction final : AAWillReturnImpl { - - /// See AbstractAttribute::AbstractAttribute(...). - AAWillReturnFunction(Function &F, InformationCache &InfoCache) - : AAWillReturnImpl(F, InfoCache) {} - - /// See AbstractAttribute::getManifestPosition(). - ManifestPosition getManifestPosition() const override { - return MP_FUNCTION; - } + return true; +} - /// See AbstractAttribute::initialize(...). - void initialize(Attributor &A) override; +bool Attributor::checkForAllReturnedValuesAndReturnInsts( + const function_ref &)> + &Pred, + const AbstractAttribute &QueryingAA) { - /// See AbstractAttribute::updateImpl(...). 
- ChangeStatus updateImpl(Attributor &A) override; -}; + const IRPosition &IRP = QueryingAA.getIRPosition(); + // Since we need to provide return instructions we have to have an exact + // definition. + const Function *AssociatedFunction = IRP.getAssociatedFunction(); + if (!AssociatedFunction) + return false; -// Helper function that checks whether a function has any cycle. -// TODO: Replace with more efficent code -bool containsCycle(Function &F) { - SmallPtrSet Visited; + // If this is a call site query we use the call site specific return values + // and liveness information. + // TODO: use the function scope once we have call site AAReturnedValues. + const IRPosition &QueryIRP = IRPosition::function(*AssociatedFunction); + const auto &AARetVal = getAAFor(QueryingAA, QueryIRP); + if (!AARetVal.getState().isValidState()) + return false; - // Traverse BB by dfs and check whether successor is already visited. - for (BasicBlock *BB : depth_first(&F)) { - Visited.insert(BB); - for (auto *SuccBB : successors(BB)) { - if (Visited.count(SuccBB)) - return true; - } - } - return false; + return AARetVal.checkForAllReturnedValuesAndReturnInsts(Pred); } -// Helper function that checks the function have a loop which might become an -// endless loop -// FIXME: Any cycle is regarded as endless loop for now. -// We have to allow some patterns. -bool containsPossiblyEndlessLoop(Function &F) { return containsCycle(F); } +bool Attributor::checkForAllReturnedValues( + const function_ref &Pred, + const AbstractAttribute &QueryingAA) { -void AAWillReturnFunction::initialize(Attributor &A) { - Function &F = getAnchorScope(); - - if (containsPossiblyEndlessLoop(F)) - indicatePessimisticFixpoint(); -} + const IRPosition &IRP = QueryingAA.getIRPosition(); + const Function *AssociatedFunction = IRP.getAssociatedFunction(); + if (!AssociatedFunction) + return false; -ChangeStatus AAWillReturnFunction::updateImpl(Attributor &A) { - Function &F = getAnchorScope(); + // TODO: use the function scope once we have call site AAReturnedValues. + const IRPosition &QueryIRP = IRPosition::function(*AssociatedFunction); + const auto &AARetVal = getAAFor(QueryingAA, QueryIRP); + if (!AARetVal.getState().isValidState()) + return false; - // The map from instruction opcodes to those instructions in the function. - auto &OpcodeInstMap = InfoCache.getOpcodeInstMapForFunction(F); + return AARetVal.checkForAllReturnedValuesAndReturnInsts( + [&](Value &RV, const SmallSetVector &) { + return Pred(RV); + }); +} - for (unsigned Opcode : - {(unsigned)Instruction::Invoke, (unsigned)Instruction::CallBr, - (unsigned)Instruction::Call}) { +static bool +checkForAllInstructionsImpl(InformationCache::OpcodeInstMapTy &OpcodeInstMap, + const function_ref &Pred, + const AAIsDead *LivenessAA, bool &AnyDead, + const ArrayRef &Opcodes) { + for (unsigned Opcode : Opcodes) { for (Instruction *I : OpcodeInstMap[Opcode]) { - auto ICS = ImmutableCallSite(I); - - if (ICS.hasFnAttr(Attribute::WillReturn)) + // Skip dead instructions. + if (LivenessAA && LivenessAA->isAssumedDead(I)) { + AnyDead = true; continue; - - auto *WillReturnAA = A.getAAFor(*this, *I); - if (!WillReturnAA || !WillReturnAA->isAssumedWillReturn()) { - indicatePessimisticFixpoint(); - return ChangeStatus::CHANGED; } - auto *NoRecurseAA = A.getAAFor(*this, *I); - - // FIXME: (i) Prohibit any recursion for now. - // (ii) AANoRecurse isn't implemented yet so currently any call is - // regarded as having recursion. 
- // Code below should be - // if ((!NoRecurseAA || !NoRecurseAA->isAssumedNoRecurse()) && - if (!NoRecurseAA && !ICS.hasFnAttr(Attribute::NoRecurse)) { - indicatePessimisticFixpoint(); - return ChangeStatus::CHANGED; - } + if (!Pred(*I)) + return false; } } - - return ChangeStatus::UNCHANGED; + return true; } -/// ---------------------------------------------------------------------------- -/// Attributor -/// ---------------------------------------------------------------------------- +bool Attributor::checkForAllInstructions( + const llvm::function_ref &Pred, + const AbstractAttribute &QueryingAA, const ArrayRef &Opcodes) { -bool Attributor::checkForAllCallSites(Function &F, - std::function &Pred, - bool RequireAllCallSites) { - // We can try to determine information from - // the call sites. However, this is only possible all call sites are known, - // hence the function has internal linkage. - if (RequireAllCallSites && !F.hasInternalLinkage()) { - LLVM_DEBUG( - dbgs() - << "Attributor: Function " << F.getName() - << " has no internal linkage, hence not all call sites are known\n"); + const IRPosition &IRP = QueryingAA.getIRPosition(); + // Since we need to provide instructions we have to have an exact definition. + const Function *AssociatedFunction = IRP.getAssociatedFunction(); + if (!AssociatedFunction) return false; - } - for (const Use &U : F.uses()) { + // TODO: use the function scope once we have call site AAReturnedValues. + const IRPosition &QueryIRP = IRPosition::function(*AssociatedFunction); + const auto &LivenessAA = + getAAFor(QueryingAA, QueryIRP, /* TrackDependence */ false); + bool AnyDead = false; - CallSite CS(U.getUser()); - if (!CS || !CS.isCallee(&U) || !CS.getCaller()->hasExactDefinition()) { - if (!RequireAllCallSites) - continue; + auto &OpcodeInstMap = + InfoCache.getOpcodeInstMapForFunction(*AssociatedFunction); + if (!checkForAllInstructionsImpl(OpcodeInstMap, Pred, &LivenessAA, AnyDead, + Opcodes)) + return false; - LLVM_DEBUG(dbgs() << "Attributor: User " << *U.getUser() - << " is an invalid use of " << F.getName() << "\n"); - return false; - } + // If we actually used liveness information so we have to record a dependence. + if (AnyDead) + recordDependence(LivenessAA, QueryingAA); - if (Pred(CS)) - continue; + return true; +} - LLVM_DEBUG(dbgs() << "Attributor: Call site callback failed for " - << *CS.getInstruction() << "\n"); +bool Attributor::checkForAllReadWriteInstructions( + const llvm::function_ref &Pred, + AbstractAttribute &QueryingAA) { + + const Function *AssociatedFunction = + QueryingAA.getIRPosition().getAssociatedFunction(); + if (!AssociatedFunction) return false; + + // TODO: use the function scope once we have call site AAReturnedValues. + const IRPosition &QueryIRP = IRPosition::function(*AssociatedFunction); + const auto &LivenessAA = + getAAFor(QueryingAA, QueryIRP, /* TrackDependence */ false); + bool AnyDead = false; + + for (Instruction *I : + InfoCache.getReadOrWriteInstsForFunction(*AssociatedFunction)) { + // Skip dead instructions. + if (LivenessAA.isAssumedDead(I)) { + AnyDead = true; + continue; + } + + if (!Pred(*I)) + return false; } + // If we actually used liveness information so we have to record a dependence. + if (AnyDead) + recordDependence(LivenessAA, QueryingAA); + return true; } -ChangeStatus Attributor::run() { - // Initialize all abstract attributes. 
- for (AbstractAttribute *AA : AllAbstractAttributes) - AA->initialize(*this); - +ChangeStatus Attributor::run(Module &M) { LLVM_DEBUG(dbgs() << "[Attributor] Identified and initialized " << AllAbstractAttributes.size() << " abstract attributes.\n"); @@ -1370,10 +4470,25 @@ ChangeStatus Attributor::run() { SetVector Worklist; Worklist.insert(AllAbstractAttributes.begin(), AllAbstractAttributes.end()); + bool RecomputeDependences = false; + do { + // Remember the size to determine new attributes. + size_t NumAAs = AllAbstractAttributes.size(); LLVM_DEBUG(dbgs() << "\n\n[Attributor] #Iteration: " << IterationCounter << ", Worklist size: " << Worklist.size() << "\n"); + // If dependences (=QueryMap) are recomputed we have to look at all abstract + // attributes again, regardless of what changed in the last iteration. + if (RecomputeDependences) { + LLVM_DEBUG( + dbgs() << "[Attributor] Run all AAs to recompute dependences\n"); + QueryMap.clear(); + ChangedAAs.clear(); + Worklist.insert(AllAbstractAttributes.begin(), + AllAbstractAttributes.end()); + } + // Add all abstract attributes that are potentially dependent on one that // changed to the work list. for (AbstractAttribute *ChangedAA : ChangedAAs) { @@ -1381,27 +4496,42 @@ ChangeStatus Attributor::run() { Worklist.insert(QuerriedAAs.begin(), QuerriedAAs.end()); } + LLVM_DEBUG(dbgs() << "[Attributor] #Iteration: " << IterationCounter + << ", Worklist+Dependent size: " << Worklist.size() + << "\n"); + // Reset the changed set. ChangedAAs.clear(); // Update all abstract attribute in the work list and record the ones that // changed. for (AbstractAttribute *AA : Worklist) - if (AA->update(*this) == ChangeStatus::CHANGED) - ChangedAAs.push_back(AA); + if (!isAssumedDead(*AA, nullptr)) + if (AA->update(*this) == ChangeStatus::CHANGED) + ChangedAAs.push_back(AA); + + // Check if we recompute the dependences in the next iteration. + RecomputeDependences = (DepRecomputeInterval > 0 && + IterationCounter % DepRecomputeInterval == 0); + + // Add attributes to the changed set if they have been created in the last + // iteration. + ChangedAAs.append(AllAbstractAttributes.begin() + NumAAs, + AllAbstractAttributes.end()); // Reset the work list and repopulate with the changed abstract attributes. // Note that dependent ones are added above. Worklist.clear(); Worklist.insert(ChangedAAs.begin(), ChangedAAs.end()); - } while (!Worklist.empty() && ++IterationCounter < MaxFixpointIterations); + } while (!Worklist.empty() && (IterationCounter++ < MaxFixpointIterations || + VerifyMaxFixpointIterations)); LLVM_DEBUG(dbgs() << "\n[Attributor] Fixpoint iteration done after: " << IterationCounter << "/" << MaxFixpointIterations << " iterations\n"); - bool FinishedAtFixpoint = Worklist.empty(); + size_t NumFinalAAs = AllAbstractAttributes.size(); // Reset abstract arguments not settled in a sound fixpoint by now. This // happens when we stopped the fixpoint iteration early. Note that only the @@ -1448,8 +4578,14 @@ ChangeStatus Attributor::run() { if (!State.isValidState()) continue; + // Skip dead code. + if (isAssumedDead(*AA, nullptr)) + continue; // Manifest the state and record if we changed the IR. 
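+ // Statistics are only tracked for attributes that actually changed the IR + // (and only when statistics are enabled).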
ChangeStatus LocalChange = AA->manifest(*this); + if (LocalChange == ChangeStatus::CHANGED && AreStatisticsEnabled()) + AA->trackStatistics(); + ManifestChange = ManifestChange | LocalChange; NumAtFixpoint++; @@ -1462,69 +4598,92 @@ ChangeStatus Attributor::run() { << " arguments while " << NumAtFixpoint << " were in a valid fixpoint state\n"); - // If verification is requested, we finished this run at a fixpoint, and the - // IR was changed, we re-run the whole fixpoint analysis, starting at - // re-initialization of the arguments. This re-run should not result in an IR - // change. Though, the (virtual) state of attributes at the end of the re-run - // might be more optimistic than the known state or the IR state if the better - // state cannot be manifested. - if (VerifyAttributor && FinishedAtFixpoint && - ManifestChange == ChangeStatus::CHANGED) { - VerifyAttributor = false; - ChangeStatus VerifyStatus = run(); - if (VerifyStatus != ChangeStatus::UNCHANGED) - llvm_unreachable( - "Attributor verification failed, re-run did result in an IR change " - "even after a fixpoint was reached in the original run. (False " - "positives possible!)"); - VerifyAttributor = true; - } - NumAttributesManifested += NumManifested; NumAttributesValidFixpoint += NumAtFixpoint; - return ManifestChange; -} - -void Attributor::identifyDefaultAbstractAttributes( - Function &F, InformationCache &InfoCache, - DenseSet *Whitelist) { + (void)NumFinalAAs; + assert( + NumFinalAAs == AllAbstractAttributes.size() && + "Expected the final number of abstract attributes to remain unchanged!"); + + // Delete stuff at the end to avoid invalid references and a nice order. + { + LLVM_DEBUG(dbgs() << "\n[Attributor] Delete at least " + << ToBeDeletedFunctions.size() << " functions and " + << ToBeDeletedBlocks.size() << " blocks and " + << ToBeDeletedInsts.size() << " instructions\n"); + for (Instruction *I : ToBeDeletedInsts) { + if (!I->use_empty()) + I->replaceAllUsesWith(UndefValue::get(I->getType())); + I->eraseFromParent(); + } - // Every function can be nounwind. - registerAA(*new AANoUnwindFunction(F, InfoCache)); + if (unsigned NumDeadBlocks = ToBeDeletedBlocks.size()) { + SmallVector ToBeDeletedBBs; + ToBeDeletedBBs.reserve(NumDeadBlocks); + ToBeDeletedBBs.append(ToBeDeletedBlocks.begin(), ToBeDeletedBlocks.end()); + DeleteDeadBlocks(ToBeDeletedBBs); + STATS_DECLTRACK(AAIsDead, BasicBlock, + "Number of dead basic blocks deleted."); + } - // Every function might be marked "nosync" - registerAA(*new AANoSyncFunction(F, InfoCache)); + STATS_DECL(AAIsDead, Function, "Number of dead functions deleted."); + for (Function *Fn : ToBeDeletedFunctions) { + Fn->replaceAllUsesWith(UndefValue::get(Fn->getType())); + Fn->eraseFromParent(); + STATS_TRACK(AAIsDead, Function); + } - // Every function might be "no-free". - registerAA(*new AANoFreeFunction(F, InfoCache)); + // Identify dead internal functions and delete them. This happens outside + // the other fixpoint analysis as we might treat potentially dead functions + // as live to lower the number of iterations. If they happen to be dead, the + // below fixpoint loop will identify and eliminate them. + SmallVector InternalFns; + for (Function &F : M) + if (F.hasLocalLinkage()) + InternalFns.push_back(&F); + + bool FoundDeadFn = true; + while (FoundDeadFn) { + FoundDeadFn = false; + for (unsigned u = 0, e = InternalFns.size(); u < e; ++u) { + Function *F = InternalFns[u]; + if (!F) + continue; - // Return attributes are only appropriate if the return type is non void. 
- Type *ReturnType = F.getReturnType(); - if (!ReturnType->isVoidTy()) { - // Argument attribute "returned" --- Create only one per function even - // though it is an argument attribute. - if (!Whitelist || Whitelist->count(AAReturnedValues::ID)) - registerAA(*new AAReturnedValuesImpl(F, InfoCache)); + const auto *LivenessAA = + lookupAAFor(IRPosition::function(*F)); + if (LivenessAA && + !checkForAllCallSites([](AbstractCallSite ACS) { return false; }, + *LivenessAA, true)) + continue; - // Every function with pointer return type might be marked nonnull. - if (ReturnType->isPointerTy() && - (!Whitelist || Whitelist->count(AANonNullReturned::ID))) - registerAA(*new AANonNullReturned(F, InfoCache)); + STATS_TRACK(AAIsDead, Function); + F->replaceAllUsesWith(UndefValue::get(F->getType())); + F->eraseFromParent(); + InternalFns[u] = nullptr; + FoundDeadFn = true; + } + } } - // Every argument with pointer type might be marked nonnull. - for (Argument &Arg : F.args()) { - if (Arg.getType()->isPointerTy()) - registerAA(*new AANonNullArgument(Arg, InfoCache)); + if (VerifyMaxFixpointIterations && + IterationCounter != MaxFixpointIterations) { + errs() << "\n[Attributor] Fixpoint iteration done after: " + << IterationCounter << "/" << MaxFixpointIterations + << " iterations\n"; + llvm_unreachable("The fixpoint was not reached with exactly the number of " + "specified iterations!"); } - // Every function might be "will-return". - registerAA(*new AAWillReturnFunction(F, InfoCache)); + return ManifestChange; +} + +void Attributor::initializeInformationCache(Function &F) { - // Walk all instructions to find more attribute opportunities and also - // interesting instructions that might be queried by abstract attributes - // during their initialization or update. + // Walk all instructions to find interesting instructions that might be + // queried by abstract attributes during their initialization or update. + // This has to happen before we create attributes. auto &ReadOrWriteInsts = InfoCache.FuncRWInstsMap[&F]; auto &InstOpcodeMap = InfoCache.FuncInstOpcodeMap[&F]; @@ -1540,8 +4699,12 @@ void Attributor::identifyDefaultAbstractAttributes( default: assert((!ImmutableCallSite(&I)) && (!isa(&I)) && "New call site/base instruction type needs to be known int the " - "attributor."); + "Attributor."); break; + case Instruction::Load: + // The alignment of a pointer is interesting for loads. + case Instruction::Store: + // The alignment of a pointer is interesting for stores. case Instruction::Call: case Instruction::CallBr: case Instruction::Invoke: @@ -1555,18 +4718,154 @@ void Attributor::identifyDefaultAbstractAttributes( InstOpcodeMap[I.getOpcode()].push_back(&I); if (I.mayReadOrWriteMemory()) ReadOrWriteInsts.push_back(&I); + } +} + +void Attributor::identifyDefaultAbstractAttributes(Function &F) { + if (!VisitedFunctions.insert(&F).second) + return; + + IRPosition FPos = IRPosition::function(F); + + // Check for dead BasicBlocks in every function. + // We need dead instruction detection because we do not want to deal with + // broken IR in which SSA rules do not apply. + getOrCreateAAFor(FPos); + + // Every function might be "will-return". + getOrCreateAAFor(FPos); + + // Every function can be nounwind. + getOrCreateAAFor(FPos); + + // Every function might be marked "nosync" + getOrCreateAAFor(FPos); + + // Every function might be "no-free". + getOrCreateAAFor(FPos); + + // Every function might be "no-return". + getOrCreateAAFor(FPos); + + // Every function might be "no-recurse". 
+ getOrCreateAAFor(FPos); + + // Every function might be "readnone/readonly/writeonly/...". + getOrCreateAAFor(FPos); + + // Every function might be applicable for Heap-To-Stack conversion. + if (EnableHeapToStack) + getOrCreateAAFor(FPos); + + // Return attributes are only appropriate if the return type is non void. + Type *ReturnType = F.getReturnType(); + if (!ReturnType->isVoidTy()) { + // Argument attribute "returned" --- Create only one per function even + // though it is an argument attribute. + getOrCreateAAFor(FPos); + + IRPosition RetPos = IRPosition::returned(F); + + // Every function might be simplified. + getOrCreateAAFor(RetPos); + + if (ReturnType->isPointerTy()) { + + // Every function with pointer return type might be marked align. + getOrCreateAAFor(RetPos); + + // Every function with pointer return type might be marked nonnull. + getOrCreateAAFor(RetPos); + + // Every function with pointer return type might be marked noalias. + getOrCreateAAFor(RetPos); + // Every function with pointer return type might be marked + // dereferenceable. + getOrCreateAAFor(RetPos); + } + } + + for (Argument &Arg : F.args()) { + IRPosition ArgPos = IRPosition::argument(Arg); + + // Every argument might be simplified. + getOrCreateAAFor(ArgPos); + + if (Arg.getType()->isPointerTy()) { + // Every argument with pointer type might be marked nonnull. + getOrCreateAAFor(ArgPos); + + // Every argument with pointer type might be marked noalias. + getOrCreateAAFor(ArgPos); + + // Every argument with pointer type might be marked dereferenceable. + getOrCreateAAFor(ArgPos); + + // Every argument with pointer type might be marked align. + getOrCreateAAFor(ArgPos); + + // Every argument with pointer type might be marked nocapture. + getOrCreateAAFor(ArgPos); + + // Every argument with pointer type might be marked + // "readnone/readonly/writeonly/..." + getOrCreateAAFor(ArgPos); + } + } + + auto CallSitePred = [&](Instruction &I) -> bool { CallSite CS(&I); - if (CS && CS.getCalledFunction()) { + if (CS.getCalledFunction()) { for (int i = 0, e = CS.getCalledFunction()->arg_size(); i < e; i++) { + + IRPosition CSArgPos = IRPosition::callsite_argument(CS, i); + + // Call site argument might be simplified. + getOrCreateAAFor(CSArgPos); + if (!CS.getArgument(i)->getType()->isPointerTy()) continue; // Call site argument attribute "non-null". - registerAA(*new AANonNullCallSiteArgument(CS, i, InfoCache), i); + getOrCreateAAFor(CSArgPos); + + // Call site argument attribute "no-alias". + getOrCreateAAFor(CSArgPos); + + // Call site argument attribute "dereferenceable". + getOrCreateAAFor(CSArgPos); + + // Call site argument attribute "align". 
+ getOrCreateAAFor(CSArgPos); } } - } + return true; + }; + + auto &OpcodeInstMap = InfoCache.getOpcodeInstMapForFunction(F); + bool Success, AnyDead = false; + Success = checkForAllInstructionsImpl( + OpcodeInstMap, CallSitePred, nullptr, AnyDead, + {(unsigned)Instruction::Invoke, (unsigned)Instruction::CallBr, + (unsigned)Instruction::Call}); + (void)Success; + assert(Success && !AnyDead && "Expected the check call to be successful!"); + + auto LoadStorePred = [&](Instruction &I) -> bool { + if (isa(I)) + getOrCreateAAFor( + IRPosition::value(*cast(I).getPointerOperand())); + else + getOrCreateAAFor( + IRPosition::value(*cast(I).getPointerOperand())); + return true; + }; + Success = checkForAllInstructionsImpl( + OpcodeInstMap, LoadStorePred, nullptr, AnyDead, + {(unsigned)Instruction::Load, (unsigned)Instruction::Store}); + (void)Success; + assert(Success && !AnyDead && "Expected the check call to be successful!"); } /// Helpers to ease debugging through output streams and print calls. @@ -1576,21 +4875,39 @@ raw_ostream &llvm::operator<<(raw_ostream &OS, ChangeStatus S) { return OS << (S == ChangeStatus::CHANGED ? "changed" : "unchanged"); } -raw_ostream &llvm::operator<<(raw_ostream &OS, - AbstractAttribute::ManifestPosition AP) { +raw_ostream &llvm::operator<<(raw_ostream &OS, IRPosition::Kind AP) { switch (AP) { - case AbstractAttribute::MP_ARGUMENT: + case IRPosition::IRP_INVALID: + return OS << "inv"; + case IRPosition::IRP_FLOAT: + return OS << "flt"; + case IRPosition::IRP_RETURNED: + return OS << "fn_ret"; + case IRPosition::IRP_CALL_SITE_RETURNED: + return OS << "cs_ret"; + case IRPosition::IRP_FUNCTION: + return OS << "fn"; + case IRPosition::IRP_CALL_SITE: + return OS << "cs"; + case IRPosition::IRP_ARGUMENT: return OS << "arg"; - case AbstractAttribute::MP_CALL_SITE_ARGUMENT: + case IRPosition::IRP_CALL_SITE_ARGUMENT: return OS << "cs_arg"; - case AbstractAttribute::MP_FUNCTION: - return OS << "fn"; - case AbstractAttribute::MP_RETURNED: - return OS << "fn_ret"; } llvm_unreachable("Unknown attribute position!"); } +raw_ostream &llvm::operator<<(raw_ostream &OS, const IRPosition &Pos) { + const Value &AV = Pos.getAssociatedValue(); + return OS << "{" << Pos.getPositionKind() << ":" << AV.getName() << " [" + << Pos.getAnchorValue().getName() << "@" << Pos.getArgNo() << "]}"; +} + +raw_ostream &llvm::operator<<(raw_ostream &OS, const IntegerState &S) { + return OS << "(" << S.getKnown() << "-" << S.getAssumed() << ")" + << static_cast(S); +} + raw_ostream &llvm::operator<<(raw_ostream &OS, const AbstractState &S) { return OS << (!S.isValidState() ? "top" : (S.isAtFixpoint() ? "fix" : "")); } @@ -1601,8 +4918,8 @@ raw_ostream &llvm::operator<<(raw_ostream &OS, const AbstractAttribute &AA) { } void AbstractAttribute::print(raw_ostream &OS) const { - OS << "[" << getManifestPosition() << "][" << getAsStr() << "][" - << AnchoredVal.getName() << "]"; + OS << "[P: " << getIRPosition() << "][" << getAsStr() << "][S: " << getState() + << "]"; } ///} @@ -1610,7 +4927,7 @@ void AbstractAttribute::print(raw_ostream &OS) const { /// Pass (Manager) Boilerplate /// ---------------------------------------------------------------------------- -static bool runAttributorOnModule(Module &M) { +static bool runAttributorOnModule(Module &M, AnalysisGetter &AG) { if (DisableAttributor) return false; @@ -1619,39 +4936,39 @@ static bool runAttributorOnModule(Module &M) { // Create an Attributor and initially empty information cache that is filled // while we identify default attribute opportunities. 
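+ // The information cache is now filled for all functions up front; abstract + // attributes are seeded afterwards, and internal functions whose uses are + // all direct calls are only looked at on demand.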
- Attributor A; - InformationCache InfoCache; + InformationCache InfoCache(M, AG); + Attributor A(InfoCache, DepRecInterval); + + for (Function &F : M) + A.initializeInformationCache(F); for (Function &F : M) { - // TODO: Not all attributes require an exact definition. Find a way to - // enable deduction for some but not all attributes in case the - // definition might be changed at runtime, see also - // http://lists.llvm.org/pipermail/llvm-dev/2018-February/121275.html. - // TODO: We could always determine abstract attributes and if sufficient - // information was found we could duplicate the functions that do not - // have an exact definition. - if (!F.hasExactDefinition()) { + if (F.hasExactDefinition()) + NumFnWithExactDefinition++; + else NumFnWithoutExactDefinition++; - continue; - } - // For now we ignore naked and optnone functions. - if (F.hasFnAttribute(Attribute::Naked) || - F.hasFnAttribute(Attribute::OptimizeNone)) - continue; - - NumFnWithExactDefinition++; + // We look at internal functions only on-demand but if any use is not a + // direct call, we have to do it eagerly. + if (F.hasLocalLinkage()) { + if (llvm::all_of(F.uses(), [](const Use &U) { + return ImmutableCallSite(U.getUser()) && + ImmutableCallSite(U.getUser()).isCallee(&U); + })) + continue; + } // Populate the Attributor with abstract attribute opportunities in the // function and the information cache with IR information. - A.identifyDefaultAbstractAttributes(F, InfoCache); + A.identifyDefaultAbstractAttributes(F); } - return A.run() == ChangeStatus::CHANGED; + return A.run(M) == ChangeStatus::CHANGED; } PreservedAnalyses AttributorPass::run(Module &M, ModuleAnalysisManager &AM) { - if (runAttributorOnModule(M)) { + AnalysisGetter AG(AM); + if (runAttributorOnModule(M, AG)) { // FIXME: Think about passes we will preserve and add them here. return PreservedAnalyses::none(); } @@ -1670,12 +4987,14 @@ struct AttributorLegacyPass : public ModulePass { bool runOnModule(Module &M) override { if (skipModule(M)) return false; - return runAttributorOnModule(M); + + AnalysisGetter AG; + return runAttributorOnModule(M, AG); } void getAnalysisUsage(AnalysisUsage &AU) const override { // FIXME: Think about passes we will preserve and add them here. - AU.setPreservesCFG(); + AU.addRequired(); } }; @@ -1684,7 +5003,147 @@ struct AttributorLegacyPass : public ModulePass { Pass *llvm::createAttributorLegacyPass() { return new AttributorLegacyPass(); } char AttributorLegacyPass::ID = 0; + +const char AAReturnedValues::ID = 0; +const char AANoUnwind::ID = 0; +const char AANoSync::ID = 0; +const char AANoFree::ID = 0; +const char AANonNull::ID = 0; +const char AANoRecurse::ID = 0; +const char AAWillReturn::ID = 0; +const char AANoAlias::ID = 0; +const char AANoReturn::ID = 0; +const char AAIsDead::ID = 0; +const char AADereferenceable::ID = 0; +const char AAAlign::ID = 0; +const char AANoCapture::ID = 0; +const char AAValueSimplify::ID = 0; +const char AAHeapToStack::ID = 0; +const char AAMemoryBehavior::ID = 0; + +// Macro magic to create the static generator function for attributes that +// follow the naming scheme. 
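+// For example, CREATE_FUNCTION_ABSTRACT_ATTRIBUTE_FOR_POSITION(AANoUnwind) +// below expands to AANoUnwind::createForPosition, which switches over the +// position kind, instantiates AANoUnwindFunction or AANoUnwindCallSite, and +// hits llvm_unreachable for position kinds a function attribute cannot have.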
+ +#define SWITCH_PK_INV(CLASS, PK, POS_NAME) \ + case IRPosition::PK: \ + llvm_unreachable("Cannot create " #CLASS " for a " POS_NAME " position!"); + +#define SWITCH_PK_CREATE(CLASS, IRP, PK, SUFFIX) \ + case IRPosition::PK: \ + AA = new CLASS##SUFFIX(IRP); \ + break; + +#define CREATE_FUNCTION_ABSTRACT_ATTRIBUTE_FOR_POSITION(CLASS) \ + CLASS &CLASS::createForPosition(const IRPosition &IRP, Attributor &A) { \ + CLASS *AA = nullptr; \ + switch (IRP.getPositionKind()) { \ + SWITCH_PK_INV(CLASS, IRP_INVALID, "invalid") \ + SWITCH_PK_INV(CLASS, IRP_FLOAT, "floating") \ + SWITCH_PK_INV(CLASS, IRP_ARGUMENT, "argument") \ + SWITCH_PK_INV(CLASS, IRP_RETURNED, "returned") \ + SWITCH_PK_INV(CLASS, IRP_CALL_SITE_RETURNED, "call site returned") \ + SWITCH_PK_INV(CLASS, IRP_CALL_SITE_ARGUMENT, "call site argument") \ + SWITCH_PK_CREATE(CLASS, IRP, IRP_FUNCTION, Function) \ + SWITCH_PK_CREATE(CLASS, IRP, IRP_CALL_SITE, CallSite) \ + } \ + return *AA; \ + } + +#define CREATE_VALUE_ABSTRACT_ATTRIBUTE_FOR_POSITION(CLASS) \ + CLASS &CLASS::createForPosition(const IRPosition &IRP, Attributor &A) { \ + CLASS *AA = nullptr; \ + switch (IRP.getPositionKind()) { \ + SWITCH_PK_INV(CLASS, IRP_INVALID, "invalid") \ + SWITCH_PK_INV(CLASS, IRP_FUNCTION, "function") \ + SWITCH_PK_INV(CLASS, IRP_CALL_SITE, "call site") \ + SWITCH_PK_CREATE(CLASS, IRP, IRP_FLOAT, Floating) \ + SWITCH_PK_CREATE(CLASS, IRP, IRP_ARGUMENT, Argument) \ + SWITCH_PK_CREATE(CLASS, IRP, IRP_RETURNED, Returned) \ + SWITCH_PK_CREATE(CLASS, IRP, IRP_CALL_SITE_RETURNED, CallSiteReturned) \ + SWITCH_PK_CREATE(CLASS, IRP, IRP_CALL_SITE_ARGUMENT, CallSiteArgument) \ + } \ + return *AA; \ + } + +#define CREATE_ALL_ABSTRACT_ATTRIBUTE_FOR_POSITION(CLASS) \ + CLASS &CLASS::createForPosition(const IRPosition &IRP, Attributor &A) { \ + CLASS *AA = nullptr; \ + switch (IRP.getPositionKind()) { \ + SWITCH_PK_INV(CLASS, IRP_INVALID, "invalid") \ + SWITCH_PK_CREATE(CLASS, IRP, IRP_FUNCTION, Function) \ + SWITCH_PK_CREATE(CLASS, IRP, IRP_CALL_SITE, CallSite) \ + SWITCH_PK_CREATE(CLASS, IRP, IRP_FLOAT, Floating) \ + SWITCH_PK_CREATE(CLASS, IRP, IRP_ARGUMENT, Argument) \ + SWITCH_PK_CREATE(CLASS, IRP, IRP_RETURNED, Returned) \ + SWITCH_PK_CREATE(CLASS, IRP, IRP_CALL_SITE_RETURNED, CallSiteReturned) \ + SWITCH_PK_CREATE(CLASS, IRP, IRP_CALL_SITE_ARGUMENT, CallSiteArgument) \ + } \ + return *AA; \ + } + +#define CREATE_FUNCTION_ONLY_ABSTRACT_ATTRIBUTE_FOR_POSITION(CLASS) \ + CLASS &CLASS::createForPosition(const IRPosition &IRP, Attributor &A) { \ + CLASS *AA = nullptr; \ + switch (IRP.getPositionKind()) { \ + SWITCH_PK_INV(CLASS, IRP_INVALID, "invalid") \ + SWITCH_PK_INV(CLASS, IRP_ARGUMENT, "argument") \ + SWITCH_PK_INV(CLASS, IRP_FLOAT, "floating") \ + SWITCH_PK_INV(CLASS, IRP_RETURNED, "returned") \ + SWITCH_PK_INV(CLASS, IRP_CALL_SITE_RETURNED, "call site returned") \ + SWITCH_PK_INV(CLASS, IRP_CALL_SITE_ARGUMENT, "call site argument") \ + SWITCH_PK_INV(CLASS, IRP_CALL_SITE, "call site") \ + SWITCH_PK_CREATE(CLASS, IRP, IRP_FUNCTION, Function) \ + } \ + return *AA; \ + } + +#define CREATE_NON_RET_ABSTRACT_ATTRIBUTE_FOR_POSITION(CLASS) \ + CLASS &CLASS::createForPosition(const IRPosition &IRP, Attributor &A) { \ + CLASS *AA = nullptr; \ + switch (IRP.getPositionKind()) { \ + SWITCH_PK_INV(CLASS, IRP_INVALID, "invalid") \ + SWITCH_PK_INV(CLASS, IRP_RETURNED, "returned") \ + SWITCH_PK_CREATE(CLASS, IRP, IRP_FUNCTION, Function) \ + SWITCH_PK_CREATE(CLASS, IRP, IRP_CALL_SITE, CallSite) \ + SWITCH_PK_CREATE(CLASS, IRP, IRP_FLOAT, Floating) \ + 
SWITCH_PK_CREATE(CLASS, IRP, IRP_ARGUMENT, Argument) \ + SWITCH_PK_CREATE(CLASS, IRP, IRP_CALL_SITE_RETURNED, CallSiteReturned) \ + SWITCH_PK_CREATE(CLASS, IRP, IRP_CALL_SITE_ARGUMENT, CallSiteArgument) \ + } \ + return *AA; \ + } + +CREATE_FUNCTION_ABSTRACT_ATTRIBUTE_FOR_POSITION(AANoUnwind) +CREATE_FUNCTION_ABSTRACT_ATTRIBUTE_FOR_POSITION(AANoSync) +CREATE_FUNCTION_ABSTRACT_ATTRIBUTE_FOR_POSITION(AANoFree) +CREATE_FUNCTION_ABSTRACT_ATTRIBUTE_FOR_POSITION(AANoRecurse) +CREATE_FUNCTION_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAWillReturn) +CREATE_FUNCTION_ABSTRACT_ATTRIBUTE_FOR_POSITION(AANoReturn) +CREATE_FUNCTION_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAIsDead) +CREATE_FUNCTION_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAReturnedValues) + +CREATE_VALUE_ABSTRACT_ATTRIBUTE_FOR_POSITION(AANonNull) +CREATE_VALUE_ABSTRACT_ATTRIBUTE_FOR_POSITION(AANoAlias) +CREATE_VALUE_ABSTRACT_ATTRIBUTE_FOR_POSITION(AADereferenceable) +CREATE_VALUE_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAAlign) +CREATE_VALUE_ABSTRACT_ATTRIBUTE_FOR_POSITION(AANoCapture) + +CREATE_ALL_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAValueSimplify) + +CREATE_FUNCTION_ONLY_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAHeapToStack) + +CREATE_NON_RET_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAMemoryBehavior) + +#undef CREATE_FUNCTION_ONLY_ABSTRACT_ATTRIBUTE_FOR_POSITION +#undef CREATE_FUNCTION_ABSTRACT_ATTRIBUTE_FOR_POSITION +#undef CREATE_NON_RET_ABSTRACT_ATTRIBUTE_FOR_POSITION +#undef CREATE_VALUE_ABSTRACT_ATTRIBUTE_FOR_POSITION +#undef CREATE_ALL_ABSTRACT_ATTRIBUTE_FOR_POSITION +#undef SWITCH_PK_CREATE +#undef SWITCH_PK_INV + INITIALIZE_PASS_BEGIN(AttributorLegacyPass, "attributor", "Deduce and propagate attributes", false, false) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) INITIALIZE_PASS_END(AttributorLegacyPass, "attributor", "Deduce and propagate attributes", false, false) diff --git a/lib/Transforms/IPO/BlockExtractor.cpp b/lib/Transforms/IPO/BlockExtractor.cpp index 6c365f3f3cbe..de80c88c1591 100644 --- a/lib/Transforms/IPO/BlockExtractor.cpp +++ b/lib/Transforms/IPO/BlockExtractor.cpp @@ -119,6 +119,8 @@ void BlockExtractor::loadFile() { /*KeepEmpty=*/false); if (LineSplit.empty()) continue; + if (LineSplit.size()!=2) + report_fatal_error("Invalid line format, expecting lines like: 'funcname bb1[;bb2..]'"); SmallVector BBNames; LineSplit[1].split(BBNames, ';', /*MaxSplit=*/-1, /*KeepEmpty=*/false); @@ -204,7 +206,8 @@ bool BlockExtractor::runOnModule(Module &M) { ++NumExtracted; Changed = true; } - Function *F = CodeExtractor(BlocksToExtractVec).extractCodeRegion(); + CodeExtractorAnalysisCache CEAC(*BBs[0]->getParent()); + Function *F = CodeExtractor(BlocksToExtractVec).extractCodeRegion(CEAC); if (F) LLVM_DEBUG(dbgs() << "Extracted group '" << (*BBs.begin())->getName() << "' in: " << F->getName() << '\n'); diff --git a/lib/Transforms/IPO/ConstantMerge.cpp b/lib/Transforms/IPO/ConstantMerge.cpp index ad877ae1786c..3cf839e397f8 100644 --- a/lib/Transforms/IPO/ConstantMerge.cpp +++ b/lib/Transforms/IPO/ConstantMerge.cpp @@ -48,7 +48,7 @@ static void FindUsedValues(GlobalVariable *LLVMUsed, ConstantArray *Inits = cast(LLVMUsed->getInitializer()); for (unsigned i = 0, e = Inits->getNumOperands(); i != e; ++i) { - Value *Operand = Inits->getOperand(i)->stripPointerCastsNoFollowAliases(); + Value *Operand = Inits->getOperand(i)->stripPointerCasts(); GlobalValue *GV = cast(Operand); UsedValues.insert(GV); } @@ -120,7 +120,7 @@ static void replace(Module &M, GlobalVariable *Old, GlobalVariable *New) { // Bump the alignment if necessary. 
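Note on the "macro magic" above: each CREATE_*_ABSTRACT_ATTRIBUTE_FOR_POSITION invocation stamps out one createForPosition factory whose switch over IRPosition kinds either constructs the position-specific subclass or rejects the position. As a rough stand-alone illustration of that pattern only (the enum, classes, and macro names below are invented for this sketch and are not the Attributor API):

#include <cassert>
#include <cstdio>

// Invented stand-ins for IRPosition kinds and an abstract attribute family.
enum PositionKind { PK_INVALID, PK_FUNCTION, PK_CALL_SITE };

struct MyAttr {
  virtual ~MyAttr() = default;
  virtual const char *name() const = 0;
};
struct MyAttrFunction : MyAttr {
  const char *name() const override { return "MyAttrFunction"; }
};
struct MyAttrCallSite : MyAttr {
  const char *name() const override { return "MyAttrCallSite"; }
};

// Same trick as SWITCH_PK_INV / SWITCH_PK_CREATE: one macro per case, plus a
// macro that stamps out a whole factory for a given class prefix.
#define SWITCH_PK_INV(PK, NAME)                                               \
  case PK:                                                                    \
    assert(false && "cannot create attribute for a " NAME " position");       \
    break;

#define SWITCH_PK_CREATE(CLASS, PK, SUFFIX)                                   \
  case PK:                                                                    \
    AA = new CLASS##SUFFIX();                                                 \
    break;

#define CREATE_FUNCTION_ATTRIBUTE_FACTORY(CLASS)                              \
  MyAttr *create##CLASS(PositionKind K) {                                     \
    MyAttr *AA = nullptr;                                                     \
    switch (K) {                                                              \
      SWITCH_PK_INV(PK_INVALID, "invalid")                                    \
      SWITCH_PK_CREATE(CLASS, PK_FUNCTION, Function)                          \
      SWITCH_PK_CREATE(CLASS, PK_CALL_SITE, CallSite)                         \
    }                                                                         \
    return AA;                                                                \
  }

CREATE_FUNCTION_ATTRIBUTE_FACTORY(MyAttr)

int main() {
  MyAttr *A = createMyAttr(PK_FUNCTION);
  std::printf("%s\n", A->name()); // prints MyAttrFunction
  delete A;
}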
if (Old->getAlignment() || New->getAlignment()) - New->setAlignment(std::max(getAlignment(Old), getAlignment(New))); + New->setAlignment(Align(std::max(getAlignment(Old), getAlignment(New)))); copyDebugLocMetadata(Old, New); Old->replaceAllUsesWith(NewConstant); diff --git a/lib/Transforms/IPO/CrossDSOCFI.cpp b/lib/Transforms/IPO/CrossDSOCFI.cpp index e30b33aa4872..e20159ba0db5 100644 --- a/lib/Transforms/IPO/CrossDSOCFI.cpp +++ b/lib/Transforms/IPO/CrossDSOCFI.cpp @@ -84,13 +84,9 @@ void CrossDSOCFI::buildCFICheck(Module &M) { for (GlobalObject &GO : M.global_objects()) { Types.clear(); GO.getMetadata(LLVMContext::MD_type, Types); - for (MDNode *Type : Types) { - // Sanity check. GO must not be a function declaration. - assert(!isa(&GO) || !cast(&GO)->isDeclaration()); - + for (MDNode *Type : Types) if (ConstantInt *TypeId = extractNumericTypeId(Type)) TypeIds.insert(TypeId->getZExtValue()); - } } NamedMDNode *CfiFunctionsMD = M.getNamedMetadata("cfi.functions"); @@ -108,11 +104,11 @@ void CrossDSOCFI::buildCFICheck(Module &M) { FunctionCallee C = M.getOrInsertFunction( "__cfi_check", Type::getVoidTy(Ctx), Type::getInt64Ty(Ctx), Type::getInt8PtrTy(Ctx), Type::getInt8PtrTy(Ctx)); - Function *F = dyn_cast(C.getCallee()); + Function *F = cast(C.getCallee()); // Take over the existing function. The frontend emits a weak stub so that the // linker knows about the symbol; this pass replaces the function body. F->deleteBody(); - F->setAlignment(4096); + F->setAlignment(Align(4096)); Triple T(M.getTargetTriple()); if (T.isARM() || T.isThumb()) diff --git a/lib/Transforms/IPO/FunctionAttrs.cpp b/lib/Transforms/IPO/FunctionAttrs.cpp index 5ccd8bc4b0fb..b174c63a577b 100644 --- a/lib/Transforms/IPO/FunctionAttrs.cpp +++ b/lib/Transforms/IPO/FunctionAttrs.cpp @@ -78,11 +78,8 @@ STATISTIC(NumNoRecurse, "Number of functions marked as norecurse"); STATISTIC(NumNoUnwind, "Number of functions marked as nounwind"); STATISTIC(NumNoFree, "Number of functions marked as nofree"); -// FIXME: This is disabled by default to avoid exposing security vulnerabilities -// in C/C++ code compiled by clang: -// http://lists.llvm.org/pipermail/cfe-dev/2017-January/052066.html static cl::opt EnableNonnullArgPropagation( - "enable-nonnull-arg-prop", cl::Hidden, + "enable-nonnull-arg-prop", cl::init(true), cl::Hidden, cl::desc("Try to propagate nonnull argument attributes from callsites to " "caller functions.")); @@ -664,6 +661,25 @@ static bool addArgumentAttrsFromCallsites(Function &F) { return Changed; } +static bool addReadAttr(Argument *A, Attribute::AttrKind R) { + assert((R == Attribute::ReadOnly || R == Attribute::ReadNone) + && "Must be a Read attribute."); + assert(A && "Argument must not be null."); + + // If the argument already has the attribute, nothing needs to be done. + if (A->hasAttribute(R)) + return false; + + // Otherwise, remove potentially conflicting attribute, add the new one, + // and update statistics. + A->removeAttr(Attribute::WriteOnly); + A->removeAttr(Attribute::ReadOnly); + A->removeAttr(Attribute::ReadNone); + A->addAttr(R); + R == Attribute::ReadOnly ? ++NumReadOnlyArg : ++NumReadNoneArg; + return true; +} + /// Deduce nocapture attributes for the SCC. 
static bool addArgumentAttrs(const SCCNodeSet &SCCNodes) { bool Changed = false; @@ -732,11 +748,8 @@ static bool addArgumentAttrs(const SCCNodeSet &SCCNodes) { SmallPtrSet Self; Self.insert(&*A); Attribute::AttrKind R = determinePointerReadAttrs(&*A, Self); - if (R != Attribute::None) { - A->addAttr(R); - Changed = true; - R == Attribute::ReadOnly ? ++NumReadOnlyArg : ++NumReadNoneArg; - } + if (R != Attribute::None) + Changed = addReadAttr(A, R); } } } @@ -833,12 +846,7 @@ static bool addArgumentAttrs(const SCCNodeSet &SCCNodes) { if (ReadAttr != Attribute::None) { for (unsigned i = 0, e = ArgumentSCC.size(); i != e; ++i) { Argument *A = ArgumentSCC[i]->Definition; - // Clear out existing readonly/readnone attributes - A->removeAttr(Attribute::ReadOnly); - A->removeAttr(Attribute::ReadNone); - A->addAttr(ReadAttr); - ReadAttr == Attribute::ReadOnly ? ++NumReadOnlyArg : ++NumReadNoneArg; - Changed = true; + Changed = addReadAttr(A, ReadAttr); } } } diff --git a/lib/Transforms/IPO/FunctionImport.cpp b/lib/Transforms/IPO/FunctionImport.cpp index 62c7fbd07223..3f5cc078d75f 100644 --- a/lib/Transforms/IPO/FunctionImport.cpp +++ b/lib/Transforms/IPO/FunctionImport.cpp @@ -450,7 +450,7 @@ static void computeImportForFunction( } else if (PrintImportFailures) { assert(!FailureInfo && "Expected no FailureInfo for newly rejected candidate"); - FailureInfo = llvm::make_unique( + FailureInfo = std::make_unique( VI, Edge.second.getHotness(), Reason, 1); } LLVM_DEBUG( @@ -764,7 +764,7 @@ void llvm::computeDeadSymbols( } // Make value live and add it to the worklist if it was not live before. - auto visit = [&](ValueInfo VI) { + auto visit = [&](ValueInfo VI, bool IsAliasee) { // FIXME: If we knew which edges were created for indirect call profiles, // we could skip them here. Any that are live should be reached via // other edges, e.g. reference edges. Otherwise, using a profile collected @@ -800,12 +800,15 @@ void llvm::computeDeadSymbols( Interposable = true; } - if (!KeepAliveLinkage) - return; + if (!IsAliasee) { + if (!KeepAliveLinkage) + return; - if (Interposable) - report_fatal_error( - "Interposable and available_externally/linkonce_odr/weak_odr symbol"); + if (Interposable) + report_fatal_error( + "Interposable and available_externally/linkonce_odr/weak_odr " + "symbol"); + } } for (auto &S : VI.getSummaryList()) @@ -821,16 +824,16 @@ void llvm::computeDeadSymbols( // If this is an alias, visit the aliasee VI to ensure that all copies // are marked live and it is added to the worklist for further // processing of its references. - visit(AS->getAliaseeVI()); + visit(AS->getAliaseeVI(), true); continue; } Summary->setLive(true); for (auto Ref : Summary->refs()) - visit(Ref); + visit(Ref, false); if (auto *FS = dyn_cast(Summary.get())) for (auto Call : FS->calls()) - visit(Call.first); + visit(Call.first, false); } } Index.setWithGlobalValueDeadStripping(); @@ -892,7 +895,7 @@ std::error_code llvm::EmitImportsFiles( StringRef ModulePath, StringRef OutputFilename, const std::map &ModuleToSummariesForIndex) { std::error_code EC; - raw_fd_ostream ImportsOS(OutputFilename, EC, sys::fs::OpenFlags::F_None); + raw_fd_ostream ImportsOS(OutputFilename, EC, sys::fs::OpenFlags::OF_None); if (EC) return EC; for (auto &ILI : ModuleToSummariesForIndex) @@ -948,23 +951,15 @@ void llvm::thinLTOResolvePrevailingInModule( auto NewLinkage = GS->second->linkage(); if (NewLinkage == GV.getLinkage()) return; - - // Switch the linkage to weakany if asked for, e.g. 
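The addReadAttr helper above centralizes what the two call sites previously did by hand: return early if the attribute is already present, drop any conflicting memory attribute, add the new one, and bump the matching statistic. A minimal stand-alone version of that normalize-then-set pattern (the Attr enum and ArgState struct below are invented for the sketch):

#include <cstdint>
#include <iostream>

// Invented stand-ins: a per-argument mask of the three mutually exclusive
// memory attributes handled above.
enum Attr : uint8_t { ReadNone = 1, ReadOnly = 2, WriteOnly = 4 };

struct ArgState {
  uint8_t Attrs = 0;
  bool has(Attr A) const { return Attrs & A; }
  void remove(Attr A) { Attrs &= static_cast<uint8_t>(~A); }
  void add(Attr A) { Attrs |= A; }
};

static unsigned NumReadNoneArg = 0, NumReadOnlyArg = 0;

// Returns true if the argument actually changed, mirroring the Changed
// bookkeeping in addArgumentAttrs.
static bool addReadAttr(ArgState &A, Attr R) {
  if (A.has(R))
    return false;        // already as strong as requested
  A.remove(WriteOnly);   // clear anything that would conflict
  A.remove(ReadOnly);
  A.remove(ReadNone);
  A.add(R);
  (R == ReadOnly ? NumReadOnlyArg : NumReadNoneArg)++;
  return true;
}

int main() {
  ArgState S;
  S.add(WriteOnly);
  std::cout << addReadAttr(S, ReadOnly) << ' '   // 1: writeonly replaced
            << addReadAttr(S, ReadOnly) << '\n'; // 0: already readonly
}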
we do this for - // linker redefined symbols (via --wrap or --defsym). - // We record that the visibility should be changed here in `addThinLTO` - // as we need access to the resolution vectors for each input file in - // order to find which symbols have been redefined. - // We may consider reorganizing this code and moving the linkage recording - // somewhere else, e.g. in thinLTOResolvePrevailingInIndex. - if (NewLinkage == GlobalValue::WeakAnyLinkage) { - GV.setLinkage(NewLinkage); - return; - } - if (GlobalValue::isLocalLinkage(GV.getLinkage()) || + // Don't internalize anything here, because the code below + // lacks necessary correctness checks. Leave this job to + // LLVM 'internalize' pass. + GlobalValue::isLocalLinkage(NewLinkage) || // In case it was dead and already converted to declaration. GV.isDeclaration()) return; + // Check for a non-prevailing def that has interposable linkage // (e.g. non-odr weak or linkonce). In that case we can't simply // convert to available_externally, since it would lose the diff --git a/lib/Transforms/IPO/GlobalDCE.cpp b/lib/Transforms/IPO/GlobalDCE.cpp index 86b7f3e49ee6..f010f7b703a6 100644 --- a/lib/Transforms/IPO/GlobalDCE.cpp +++ b/lib/Transforms/IPO/GlobalDCE.cpp @@ -17,9 +17,11 @@ #include "llvm/Transforms/IPO/GlobalDCE.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/TypeMetadataUtils.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Module.h" +#include "llvm/IR/Operator.h" #include "llvm/Pass.h" #include "llvm/Transforms/IPO.h" #include "llvm/Transforms/Utils/CtorUtils.h" @@ -29,10 +31,15 @@ using namespace llvm; #define DEBUG_TYPE "globaldce" +static cl::opt + ClEnableVFE("enable-vfe", cl::Hidden, cl::init(true), cl::ZeroOrMore, + cl::desc("Enable virtual function elimination")); + STATISTIC(NumAliases , "Number of global aliases removed"); STATISTIC(NumFunctions, "Number of functions removed"); STATISTIC(NumIFuncs, "Number of indirect functions removed"); STATISTIC(NumVariables, "Number of global variables removed"); +STATISTIC(NumVFuncs, "Number of virtual functions removed"); namespace { class GlobalDCELegacyPass : public ModulePass { @@ -118,6 +125,15 @@ void GlobalDCEPass::UpdateGVDependencies(GlobalValue &GV) { ComputeDependencies(User, Deps); Deps.erase(&GV); // Remove self-reference. for (GlobalValue *GVU : Deps) { + // If this is a dep from a vtable to a virtual function, and we have + // complete information about all virtual call sites which could call + // though this vtable, then skip it, because the call site information will + // be more precise. + if (VFESafeVTables.count(GVU) && isa(&GV)) { + LLVM_DEBUG(dbgs() << "Ignoring dep " << GVU->getName() << " -> " + << GV.getName() << "\n"); + continue; + } GVDependencies[GVU].insert(&GV); } } @@ -132,12 +148,133 @@ void GlobalDCEPass::MarkLive(GlobalValue &GV, if (Updates) Updates->push_back(&GV); if (Comdat *C = GV.getComdat()) { - for (auto &&CM : make_range(ComdatMembers.equal_range(C))) + for (auto &&CM : make_range(ComdatMembers.equal_range(C))) { MarkLive(*CM.second, Updates); // Recursion depth is only two because only // globals in the same comdat are visited. 
+ } + } +} + +void GlobalDCEPass::ScanVTables(Module &M) { + SmallVector Types; + LLVM_DEBUG(dbgs() << "Building type info -> vtable map\n"); + + auto *LTOPostLinkMD = + cast_or_null(M.getModuleFlag("LTOPostLink")); + bool LTOPostLink = + LTOPostLinkMD && + (cast(LTOPostLinkMD->getValue())->getZExtValue() != 0); + + for (GlobalVariable &GV : M.globals()) { + Types.clear(); + GV.getMetadata(LLVMContext::MD_type, Types); + if (GV.isDeclaration() || Types.empty()) + continue; + + // Use the typeid metadata on the vtable to build a mapping from typeids to + // the list of (GV, offset) pairs which are the possible vtables for that + // typeid. + for (MDNode *Type : Types) { + Metadata *TypeID = Type->getOperand(1).get(); + + uint64_t Offset = + cast( + cast(Type->getOperand(0))->getValue()) + ->getZExtValue(); + + TypeIdMap[TypeID].insert(std::make_pair(&GV, Offset)); + } + + // If the type corresponding to the vtable is private to this translation + // unit, we know that we can see all virtual functions which might use it, + // so VFE is safe. + if (auto GO = dyn_cast(&GV)) { + GlobalObject::VCallVisibility TypeVis = GO->getVCallVisibility(); + if (TypeVis == GlobalObject::VCallVisibilityTranslationUnit || + (LTOPostLink && + TypeVis == GlobalObject::VCallVisibilityLinkageUnit)) { + LLVM_DEBUG(dbgs() << GV.getName() << " is safe for VFE\n"); + VFESafeVTables.insert(&GV); + } + } + } +} + +void GlobalDCEPass::ScanVTableLoad(Function *Caller, Metadata *TypeId, + uint64_t CallOffset) { + for (auto &VTableInfo : TypeIdMap[TypeId]) { + GlobalVariable *VTable = VTableInfo.first; + uint64_t VTableOffset = VTableInfo.second; + + Constant *Ptr = + getPointerAtOffset(VTable->getInitializer(), VTableOffset + CallOffset, + *Caller->getParent()); + if (!Ptr) { + LLVM_DEBUG(dbgs() << "can't find pointer in vtable!\n"); + VFESafeVTables.erase(VTable); + return; + } + + auto Callee = dyn_cast(Ptr->stripPointerCasts()); + if (!Callee) { + LLVM_DEBUG(dbgs() << "vtable entry is not function pointer!\n"); + VFESafeVTables.erase(VTable); + return; + } + + LLVM_DEBUG(dbgs() << "vfunc dep " << Caller->getName() << " -> " + << Callee->getName() << "\n"); + GVDependencies[Caller].insert(Callee); } } +void GlobalDCEPass::ScanTypeCheckedLoadIntrinsics(Module &M) { + LLVM_DEBUG(dbgs() << "Scanning type.checked.load intrinsics\n"); + Function *TypeCheckedLoadFunc = + M.getFunction(Intrinsic::getName(Intrinsic::type_checked_load)); + + if (!TypeCheckedLoadFunc) + return; + + for (auto U : TypeCheckedLoadFunc->users()) { + auto CI = dyn_cast(U); + if (!CI) + continue; + + auto *Offset = dyn_cast(CI->getArgOperand(1)); + Value *TypeIdValue = CI->getArgOperand(2); + auto *TypeId = cast(TypeIdValue)->getMetadata(); + + if (Offset) { + ScanVTableLoad(CI->getFunction(), TypeId, Offset->getZExtValue()); + } else { + // type.checked.load with a non-constant offset, so assume every entry in + // every matching vtable is used. 
+ for (auto &VTableInfo : TypeIdMap[TypeId]) { + VFESafeVTables.erase(VTableInfo.first); + } + } + } +} + +void GlobalDCEPass::AddVirtualFunctionDependencies(Module &M) { + if (!ClEnableVFE) + return; + + ScanVTables(M); + + if (VFESafeVTables.empty()) + return; + + ScanTypeCheckedLoadIntrinsics(M); + + LLVM_DEBUG( + dbgs() << "VFE safe vtables:\n"; + for (auto *VTable : VFESafeVTables) + dbgs() << " " << VTable->getName() << "\n"; + ); +} + PreservedAnalyses GlobalDCEPass::run(Module &M, ModuleAnalysisManager &MAM) { bool Changed = false; @@ -163,6 +300,10 @@ PreservedAnalyses GlobalDCEPass::run(Module &M, ModuleAnalysisManager &MAM) { if (Comdat *C = GA.getComdat()) ComdatMembers.insert(std::make_pair(C, &GA)); + // Add dependencies between virtual call sites and the virtual functions they + // might call, if we have that information. + AddVirtualFunctionDependencies(M); + // Loop over the module, adding globals which are obviously necessary. for (GlobalObject &GO : M.global_objects()) { Changed |= RemoveUnusedGlobalValue(GO); @@ -257,8 +398,17 @@ PreservedAnalyses GlobalDCEPass::run(Module &M, ModuleAnalysisManager &MAM) { }; NumFunctions += DeadFunctions.size(); - for (Function *F : DeadFunctions) + for (Function *F : DeadFunctions) { + if (!F->use_empty()) { + // Virtual functions might still be referenced by one or more vtables, + // but if we've proven them to be unused then it's safe to replace the + // virtual function pointers with null, allowing us to remove the + // function itself. + ++NumVFuncs; + F->replaceNonMetadataUsesWith(ConstantPointerNull::get(F->getType())); + } EraseUnusedGlobalValue(F); + } NumVariables += DeadGlobalVars.size(); for (GlobalVariable *GV : DeadGlobalVars) @@ -277,6 +427,8 @@ PreservedAnalyses GlobalDCEPass::run(Module &M, ModuleAnalysisManager &MAM) { ConstantDependenciesCache.clear(); GVDependencies.clear(); ComdatMembers.clear(); + TypeIdMap.clear(); + VFESafeVTables.clear(); if (Changed) return PreservedAnalyses::none(); diff --git a/lib/Transforms/IPO/GlobalOpt.cpp b/lib/Transforms/IPO/GlobalOpt.cpp index c4fb3ce77f6e..819715b9f8da 100644 --- a/lib/Transforms/IPO/GlobalOpt.cpp +++ b/lib/Transforms/IPO/GlobalOpt.cpp @@ -155,7 +155,8 @@ static bool isLeakCheckerRoot(GlobalVariable *GV) { /// Given a value that is stored to a global but never read, determine whether /// it's safe to remove the store and the chain of computation that feeds the /// store. -static bool IsSafeComputationToRemove(Value *V, const TargetLibraryInfo *TLI) { +static bool IsSafeComputationToRemove( + Value *V, function_ref GetTLI) { do { if (isa(V)) return true; @@ -164,7 +165,7 @@ static bool IsSafeComputationToRemove(Value *V, const TargetLibraryInfo *TLI) { if (isa(V) || isa(V) || isa(V) || isa(V)) return false; - if (isAllocationFn(V, TLI)) + if (isAllocationFn(V, GetTLI)) return true; Instruction *I = cast(V); @@ -184,8 +185,9 @@ static bool IsSafeComputationToRemove(Value *V, const TargetLibraryInfo *TLI) { /// This GV is a pointer root. Loop over all users of the global and clean up /// any that obviously don't assign the global a value that isn't dynamically /// allocated. -static bool CleanupPointerRootUsers(GlobalVariable *GV, - const TargetLibraryInfo *TLI) { +static bool +CleanupPointerRootUsers(GlobalVariable *GV, + function_ref GetTLI) { // A brief explanation of leak checkers. The goal is to find bugs where // pointers are forgotten, causing an accumulating growth in memory // usage over time. 
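The virtual function elimination added to GlobalDCE above rests on two pieces of bookkeeping: a map from type-id metadata to the candidate (vtable, offset) pairs, and a set of vtables whose virtual calls are all visible. Whenever a type.checked.load cannot be resolved to a function pointer at the requested offset, the vtable is conservatively dropped from the safe set; otherwise a caller-to-callee dependency edge is recorded. A simplified model of that logic, using plain containers instead of the LLVM types:

#include <cstdint>
#include <iostream>
#include <map>
#include <set>
#include <string>
#include <utility>
#include <vector>

// Invented model: a vtable is a name plus a table of byte offset -> callee
// name, with an empty string standing in for a slot we cannot resolve.
struct VTable {
  std::string Name;
  std::map<uint64_t, std::string> Slots;
};

using TypeIdMapT =
    std::map<std::string, std::vector<std::pair<VTable *, uint64_t>>>;

// Mirrors ScanVTableLoad: for every candidate vtable of TypeId, either record
// a caller -> callee dependency or give up on VFE for that vtable.
void scanVTableLoad(const std::string &Caller, const std::string &TypeId,
                    uint64_t CallOffset, TypeIdMapT &TypeIdMap,
                    std::set<VTable *> &VFESafeVTables,
                    std::multimap<std::string, std::string> &Deps) {
  for (auto &Cand : TypeIdMap[TypeId]) {
    VTable *VT = Cand.first;
    uint64_t Slot = Cand.second + CallOffset;
    auto It = VT->Slots.find(Slot);
    if (It == VT->Slots.end() || It->second.empty()) {
      VFESafeVTables.erase(VT); // cannot prove anything about this vtable
      return;
    }
    Deps.emplace(Caller, It->second); // virtual call edge caller -> callee
  }
}

int main() {
  VTable VT{"_ZTV1A", {{0, "A::foo"}, {8, "A::bar"}}};
  TypeIdMapT TypeIdMap{{"_ZTS1A", {{&VT, 0}}}};
  std::set<VTable *> Safe{&VT};
  std::multimap<std::string, std::string> Deps;
  scanVTableLoad("caller", "_ZTS1A", 8, TypeIdMap, Safe, Deps);
  std::cout << Deps.count("caller") << ' ' << Safe.count(&VT) << '\n'; // 1 1
}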
The common strategy for leak checkers is to whitelist the @@ -241,18 +243,18 @@ static bool CleanupPointerRootUsers(GlobalVariable *GV, C->destroyConstant(); // This could have invalidated UI, start over from scratch. Dead.clear(); - CleanupPointerRootUsers(GV, TLI); + CleanupPointerRootUsers(GV, GetTLI); return true; } } } for (int i = 0, e = Dead.size(); i != e; ++i) { - if (IsSafeComputationToRemove(Dead[i].first, TLI)) { + if (IsSafeComputationToRemove(Dead[i].first, GetTLI)) { Dead[i].second->eraseFromParent(); Instruction *I = Dead[i].first; do { - if (isAllocationFn(I, TLI)) + if (isAllocationFn(I, GetTLI)) break; Instruction *J = dyn_cast(I->getOperand(0)); if (!J) @@ -270,9 +272,9 @@ static bool CleanupPointerRootUsers(GlobalVariable *GV, /// We just marked GV constant. Loop over all users of the global, cleaning up /// the obvious ones. This is largely just a quick scan over the use list to /// clean up the easy and obvious cruft. This returns true if it made a change. -static bool CleanupConstantGlobalUsers(Value *V, Constant *Init, - const DataLayout &DL, - TargetLibraryInfo *TLI) { +static bool CleanupConstantGlobalUsers( + Value *V, Constant *Init, const DataLayout &DL, + function_ref GetTLI) { bool Changed = false; // Note that we need to use a weak value handle for the worklist items. When // we delete a constant array, we may also be holding pointer to one of its @@ -302,12 +304,12 @@ static bool CleanupConstantGlobalUsers(Value *V, Constant *Init, Constant *SubInit = nullptr; if (Init) SubInit = ConstantFoldLoadThroughGEPConstantExpr(Init, CE); - Changed |= CleanupConstantGlobalUsers(CE, SubInit, DL, TLI); + Changed |= CleanupConstantGlobalUsers(CE, SubInit, DL, GetTLI); } else if ((CE->getOpcode() == Instruction::BitCast && CE->getType()->isPointerTy()) || CE->getOpcode() == Instruction::AddrSpaceCast) { // Pointer cast, delete any stores and memsets to the global. - Changed |= CleanupConstantGlobalUsers(CE, nullptr, DL, TLI); + Changed |= CleanupConstantGlobalUsers(CE, nullptr, DL, GetTLI); } if (CE->use_empty()) { @@ -321,7 +323,7 @@ static bool CleanupConstantGlobalUsers(Value *V, Constant *Init, Constant *SubInit = nullptr; if (!isa(GEP->getOperand(0))) { ConstantExpr *CE = dyn_cast_or_null( - ConstantFoldInstruction(GEP, DL, TLI)); + ConstantFoldInstruction(GEP, DL, &GetTLI(*GEP->getFunction()))); if (Init && CE && CE->getOpcode() == Instruction::GetElementPtr) SubInit = ConstantFoldLoadThroughGEPConstantExpr(Init, CE); @@ -331,7 +333,7 @@ static bool CleanupConstantGlobalUsers(Value *V, Constant *Init, if (Init && isa(Init) && GEP->isInBounds()) SubInit = Constant::getNullValue(GEP->getResultElementType()); } - Changed |= CleanupConstantGlobalUsers(GEP, SubInit, DL, TLI); + Changed |= CleanupConstantGlobalUsers(GEP, SubInit, DL, GetTLI); if (GEP->use_empty()) { GEP->eraseFromParent(); @@ -348,7 +350,7 @@ static bool CleanupConstantGlobalUsers(Value *V, Constant *Init, // us, and if they are all dead, nuke them without remorse. if (isSafeToDestroyConstant(C)) { C->destroyConstant(); - CleanupConstantGlobalUsers(V, Init, DL, TLI); + CleanupConstantGlobalUsers(V, Init, DL, GetTLI); return true; } } @@ -495,8 +497,8 @@ static GlobalVariable *SRAGlobal(GlobalVariable *GV, const DataLayout &DL) { // had 256 byte alignment for example, something might depend on that: // propagate info to each field. 
uint64_t FieldOffset = Layout.getElementOffset(i); - unsigned NewAlign = (unsigned)MinAlign(StartAlignment, FieldOffset); - if (NewAlign > DL.getABITypeAlignment(STy->getElementType(i))) + Align NewAlign(MinAlign(StartAlignment, FieldOffset)); + if (NewAlign > Align(DL.getABITypeAlignment(STy->getElementType(i)))) NGV->setAlignment(NewAlign); // Copy over the debug info for the variable. @@ -511,7 +513,7 @@ static GlobalVariable *SRAGlobal(GlobalVariable *GV, const DataLayout &DL) { NewGlobals.reserve(NumElements); auto ElTy = STy->getElementType(); uint64_t EltSize = DL.getTypeAllocSize(ElTy); - unsigned EltAlign = DL.getABITypeAlignment(ElTy); + Align EltAlign(DL.getABITypeAlignment(ElTy)); uint64_t FragmentSizeInBits = DL.getTypeAllocSizeInBits(ElTy); for (unsigned i = 0, e = NumElements; i != e; ++i) { Constant *In = Init->getAggregateElement(i); @@ -530,7 +532,7 @@ static GlobalVariable *SRAGlobal(GlobalVariable *GV, const DataLayout &DL) { // Calculate the known alignment of the field. If the original aggregate // had 256 byte alignment for example, something might depend on that: // propagate info to each field. - unsigned NewAlign = (unsigned)MinAlign(StartAlignment, EltSize*i); + Align NewAlign(MinAlign(StartAlignment, EltSize * i)); if (NewAlign > EltAlign) NGV->setAlignment(NewAlign); transferSRADebugInfo(GV, NGV, FragmentSizeInBits * i, FragmentSizeInBits, @@ -745,9 +747,9 @@ static bool OptimizeAwayTrappingUsesOfValue(Value *V, Constant *NewV) { /// are uses of the loaded value that would trap if the loaded value is /// dynamically null, then we know that they cannot be reachable with a null /// optimize away the load. -static bool OptimizeAwayTrappingUsesOfLoads(GlobalVariable *GV, Constant *LV, - const DataLayout &DL, - TargetLibraryInfo *TLI) { +static bool OptimizeAwayTrappingUsesOfLoads( + GlobalVariable *GV, Constant *LV, const DataLayout &DL, + function_ref GetTLI) { bool Changed = false; // Keep track of whether we are able to remove all the uses of the global @@ -793,10 +795,10 @@ static bool OptimizeAwayTrappingUsesOfLoads(GlobalVariable *GV, Constant *LV, // nor is the global. if (AllNonStoreUsesGone) { if (isLeakCheckerRoot(GV)) { - Changed |= CleanupPointerRootUsers(GV, TLI); + Changed |= CleanupPointerRootUsers(GV, GetTLI); } else { Changed = true; - CleanupConstantGlobalUsers(GV, nullptr, DL, TLI); + CleanupConstantGlobalUsers(GV, nullptr, DL, GetTLI); } if (GV->use_empty()) { LLVM_DEBUG(dbgs() << " *** GLOBAL NOW DEAD!\n"); @@ -889,8 +891,8 @@ OptimizeGlobalAddressOfMalloc(GlobalVariable *GV, CallInst *CI, Type *AllocTy, while (!GV->use_empty()) { if (StoreInst *SI = dyn_cast(GV->user_back())) { // The global is initialized when the store to it occurs. - new StoreInst(ConstantInt::getTrue(GV->getContext()), InitBool, false, 0, - SI->getOrdering(), SI->getSyncScopeID(), SI); + new StoreInst(ConstantInt::getTrue(GV->getContext()), InitBool, false, + None, SI->getOrdering(), SI->getSyncScopeID(), SI); SI->eraseFromParent(); continue; } @@ -907,7 +909,7 @@ OptimizeGlobalAddressOfMalloc(GlobalVariable *GV, CallInst *CI, Type *AllocTy, // Replace the cmp X, 0 with a use of the bool value. // Sink the load to where the compare was, if atomic rules allow us to. Value *LV = new LoadInst(InitBool->getValueType(), InitBool, - InitBool->getName() + ".val", false, 0, + InitBool->getName() + ".val", false, None, LI->getOrdering(), LI->getSyncScopeID(), LI->isUnordered() ? 
(Instruction *)ICI : LI); InitBoolUsed = true; @@ -1562,10 +1564,10 @@ static bool tryToOptimizeStoreOfMallocToGlobal(GlobalVariable *GV, CallInst *CI, // Try to optimize globals based on the knowledge that only one value (besides // its initializer) is ever stored to the global. -static bool optimizeOnceStoredGlobal(GlobalVariable *GV, Value *StoredOnceVal, - AtomicOrdering Ordering, - const DataLayout &DL, - TargetLibraryInfo *TLI) { +static bool +optimizeOnceStoredGlobal(GlobalVariable *GV, Value *StoredOnceVal, + AtomicOrdering Ordering, const DataLayout &DL, + function_ref GetTLI) { // Ignore no-op GEPs and bitcasts. StoredOnceVal = StoredOnceVal->stripPointerCasts(); @@ -1583,9 +1585,10 @@ static bool optimizeOnceStoredGlobal(GlobalVariable *GV, Value *StoredOnceVal, SOVC = ConstantExpr::getBitCast(SOVC, GV->getInitializer()->getType()); // Optimize away any trapping uses of the loaded value. - if (OptimizeAwayTrappingUsesOfLoads(GV, SOVC, DL, TLI)) + if (OptimizeAwayTrappingUsesOfLoads(GV, SOVC, DL, GetTLI)) return true; - } else if (CallInst *CI = extractMallocCall(StoredOnceVal, TLI)) { + } else if (CallInst *CI = extractMallocCall(StoredOnceVal, GetTLI)) { + auto *TLI = &GetTLI(*CI->getFunction()); Type *MallocType = getMallocAllocatedType(CI, TLI); if (MallocType && tryToOptimizeStoreOfMallocToGlobal(GV, CI, MallocType, Ordering, DL, TLI)) @@ -1643,10 +1646,12 @@ static bool TryToShrinkGlobalToBoolean(GlobalVariable *GV, Constant *OtherVal) { // instead of a select to synthesize the desired value. bool IsOneZero = false; bool EmitOneOrZero = true; - if (ConstantInt *CI = dyn_cast(OtherVal)){ + auto *CI = dyn_cast(OtherVal); + if (CI && CI->getValue().getActiveBits() <= 64) { IsOneZero = InitVal->isNullValue() && CI->isOne(); - if (ConstantInt *CIInit = dyn_cast(GV->getInitializer())){ + auto *CIInit = dyn_cast(GV->getInitializer()); + if (CIInit && CIInit->getValue().getActiveBits() <= 64) { uint64_t ValInit = CIInit->getZExtValue(); uint64_t ValOther = CI->getZExtValue(); uint64_t ValMinus = ValOther - ValInit; @@ -1711,7 +1716,7 @@ static bool TryToShrinkGlobalToBoolean(GlobalVariable *GV, Constant *OtherVal) { assert(LI->getOperand(0) == GV && "Not a copy!"); // Insert a new load, to preserve the saved value. StoreVal = new LoadInst(NewGV->getValueType(), NewGV, - LI->getName() + ".b", false, 0, + LI->getName() + ".b", false, None, LI->getOrdering(), LI->getSyncScopeID(), LI); } else { assert((isa(StoredVal) || isa(StoredVal)) && @@ -1721,15 +1726,15 @@ static bool TryToShrinkGlobalToBoolean(GlobalVariable *GV, Constant *OtherVal) { } } StoreInst *NSI = - new StoreInst(StoreVal, NewGV, false, 0, SI->getOrdering(), + new StoreInst(StoreVal, NewGV, false, None, SI->getOrdering(), SI->getSyncScopeID(), SI); NSI->setDebugLoc(SI->getDebugLoc()); } else { // Change the load into a load of bool then a select. LoadInst *LI = cast(UI); - LoadInst *NLI = - new LoadInst(NewGV->getValueType(), NewGV, LI->getName() + ".b", - false, 0, LI->getOrdering(), LI->getSyncScopeID(), LI); + LoadInst *NLI = new LoadInst(NewGV->getValueType(), NewGV, + LI->getName() + ".b", false, None, + LI->getOrdering(), LI->getSyncScopeID(), LI); Instruction *NSI; if (IsOneZero) NSI = new ZExtInst(NLI, LI->getType(), "", LI); @@ -1914,9 +1919,10 @@ static void makeAllConstantUsesInstructions(Constant *C) { /// Analyze the specified global variable and optimize /// it if possible. If we make a change, return true. 
-static bool processInternalGlobal( - GlobalVariable *GV, const GlobalStatus &GS, TargetLibraryInfo *TLI, - function_ref LookupDomTree) { +static bool +processInternalGlobal(GlobalVariable *GV, const GlobalStatus &GS, + function_ref GetTLI, + function_ref LookupDomTree) { auto &DL = GV->getParent()->getDataLayout(); // If this is a first class global and has only one accessing function and // this function is non-recursive, we replace the global with a local alloca @@ -1963,11 +1969,12 @@ static bool processInternalGlobal( bool Changed; if (isLeakCheckerRoot(GV)) { // Delete any constant stores to the global. - Changed = CleanupPointerRootUsers(GV, TLI); + Changed = CleanupPointerRootUsers(GV, GetTLI); } else { // Delete any stores we can find to the global. We may not be able to // make it completely dead though. - Changed = CleanupConstantGlobalUsers(GV, GV->getInitializer(), DL, TLI); + Changed = + CleanupConstantGlobalUsers(GV, GV->getInitializer(), DL, GetTLI); } // If the global is dead now, delete it. @@ -1989,7 +1996,7 @@ static bool processInternalGlobal( GV->setConstant(true); // Clean up any obviously simplifiable users now. - CleanupConstantGlobalUsers(GV, GV->getInitializer(), DL, TLI); + CleanupConstantGlobalUsers(GV, GV->getInitializer(), DL, GetTLI); // If the global is dead now, just nuke it. if (GV->use_empty()) { @@ -2019,7 +2026,7 @@ static bool processInternalGlobal( GV->setInitializer(SOVConstant); // Clean up any obviously simplifiable users now. - CleanupConstantGlobalUsers(GV, GV->getInitializer(), DL, TLI); + CleanupConstantGlobalUsers(GV, GV->getInitializer(), DL, GetTLI); if (GV->use_empty()) { LLVM_DEBUG(dbgs() << " *** Substituting initializer allowed us to " @@ -2033,7 +2040,8 @@ static bool processInternalGlobal( // Try to optimize globals based on the knowledge that only one value // (besides its initializer) is ever stored to the global. - if (optimizeOnceStoredGlobal(GV, GS.StoredOnceValue, GS.Ordering, DL, TLI)) + if (optimizeOnceStoredGlobal(GV, GS.StoredOnceValue, GS.Ordering, DL, + GetTLI)) return true; // Otherwise, if the global was not a boolean, we can shrink it to be a @@ -2054,7 +2062,8 @@ static bool processInternalGlobal( /// Analyze the specified global variable and optimize it if possible. If we /// make a change, return true. static bool -processGlobal(GlobalValue &GV, TargetLibraryInfo *TLI, +processGlobal(GlobalValue &GV, + function_ref GetTLI, function_ref LookupDomTree) { if (GV.getName().startswith("llvm.")) return false; @@ -2086,7 +2095,7 @@ processGlobal(GlobalValue &GV, TargetLibraryInfo *TLI, if (GVar->isConstant() || !GVar->hasInitializer()) return Changed; - return processInternalGlobal(GVar, GS, TLI, LookupDomTree) || Changed; + return processInternalGlobal(GVar, GS, GetTLI, LookupDomTree) || Changed; } /// Walk all of the direct calls of the specified function, changing them to @@ -2234,7 +2243,8 @@ hasOnlyColdCalls(Function &F, } static bool -OptimizeFunctions(Module &M, TargetLibraryInfo *TLI, +OptimizeFunctions(Module &M, + function_ref GetTLI, function_ref GetTTI, function_ref GetBFI, function_ref LookupDomTree, @@ -2275,17 +2285,13 @@ OptimizeFunctions(Module &M, TargetLibraryInfo *TLI, // So, remove unreachable blocks from the function, because a) there's // no point in analyzing them and b) GlobalOpt should otherwise grow // some more complicated logic to break these cycles. - // Removing unreachable blocks might invalidate the dominator so we - // recalculate it. 
if (!F->isDeclaration()) { - if (removeUnreachableBlocks(*F)) { - auto &DT = LookupDomTree(*F); - DT.recalculate(*F); - Changed = true; - } + auto &DT = LookupDomTree(*F); + DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy); + Changed |= removeUnreachableBlocks(*F, &DTU); } - Changed |= processGlobal(*F, TLI, LookupDomTree); + Changed |= processGlobal(*F, GetTLI, LookupDomTree); if (!F->hasLocalLinkage()) continue; @@ -2342,7 +2348,8 @@ OptimizeFunctions(Module &M, TargetLibraryInfo *TLI, } static bool -OptimizeGlobalVars(Module &M, TargetLibraryInfo *TLI, +OptimizeGlobalVars(Module &M, + function_ref GetTLI, function_ref LookupDomTree, SmallPtrSetImpl &NotDiscardableComdats) { bool Changed = false; @@ -2357,7 +2364,10 @@ OptimizeGlobalVars(Module &M, TargetLibraryInfo *TLI, if (GV->hasInitializer()) if (auto *C = dyn_cast(GV->getInitializer())) { auto &DL = M.getDataLayout(); - Constant *New = ConstantFoldConstant(C, DL, TLI); + // TLI is not used in the case of a Constant, so use default nullptr + // for that optional parameter, since we don't have a Function to + // provide GetTLI anyway. + Constant *New = ConstantFoldConstant(C, DL, /*TLI*/ nullptr); if (New && New != C) GV->setInitializer(New); } @@ -2367,7 +2377,7 @@ OptimizeGlobalVars(Module &M, TargetLibraryInfo *TLI, continue; } - Changed |= processGlobal(*GV, TLI, LookupDomTree); + Changed |= processGlobal(*GV, GetTLI, LookupDomTree); } return Changed; } @@ -2581,8 +2591,8 @@ static bool EvaluateStaticConstructor(Function *F, const DataLayout &DL, } static int compareNames(Constant *const *A, Constant *const *B) { - Value *AStripped = (*A)->stripPointerCastsNoFollowAliases(); - Value *BStripped = (*B)->stripPointerCastsNoFollowAliases(); + Value *AStripped = (*A)->stripPointerCasts(); + Value *BStripped = (*B)->stripPointerCasts(); return AStripped->getName().compare(BStripped->getName()); } @@ -2809,7 +2819,14 @@ OptimizeGlobalAliases(Module &M, return Changed; } -static Function *FindCXAAtExit(Module &M, TargetLibraryInfo *TLI) { +static Function * +FindCXAAtExit(Module &M, function_ref GetTLI) { + // Hack to get a default TLI before we have actual Function. + auto FuncIter = M.begin(); + if (FuncIter == M.end()) + return nullptr; + auto *TLI = &GetTLI(*FuncIter); + LibFunc F = LibFunc_cxa_atexit; if (!TLI->has(F)) return nullptr; @@ -2818,6 +2835,9 @@ static Function *FindCXAAtExit(Module &M, TargetLibraryInfo *TLI) { if (!Fn) return nullptr; + // Now get the actual TLI for Fn. + TLI = &GetTLI(*Fn); + // Make sure that the function has the correct prototype. if (!TLI->getLibFunc(*Fn, F) || F != LibFunc_cxa_atexit) return nullptr; @@ -2889,7 +2909,8 @@ static bool OptimizeEmptyGlobalCXXDtors(Function *CXAAtExitFn) { } static bool optimizeGlobalsInModule( - Module &M, const DataLayout &DL, TargetLibraryInfo *TLI, + Module &M, const DataLayout &DL, + function_ref GetTLI, function_ref GetTTI, function_ref GetBFI, function_ref LookupDomTree) { @@ -2914,24 +2935,24 @@ static bool optimizeGlobalsInModule( NotDiscardableComdats.insert(C); // Delete functions that are trivially dead, ccc -> fastcc - LocalChange |= OptimizeFunctions(M, TLI, GetTTI, GetBFI, LookupDomTree, + LocalChange |= OptimizeFunctions(M, GetTLI, GetTTI, GetBFI, LookupDomTree, NotDiscardableComdats); // Optimize global_ctors list. LocalChange |= optimizeGlobalCtorsList(M, [&](Function *F) { - return EvaluateStaticConstructor(F, DL, TLI); + return EvaluateStaticConstructor(F, DL, &GetTLI(*F)); }); // Optimize non-address-taken globals. 
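Most of the GlobalOpt churn above is mechanical: instead of threading a single module-level TargetLibraryInfo* through every helper, the pass now passes a GetTLI callback so each query is answered per function, matching the new TargetLibraryInfoWrapperPass::getTLI(F) interface. The shape of that refactor, sketched with std::function and invented Function/LibraryInfo types rather than llvm::function_ref and the real analysis classes:

#include <functional>
#include <iostream>
#include <string>

// Invented stand-ins for Function and TargetLibraryInfo.
struct Function { std::string Name; };
struct LibraryInfo { bool HasMalloc; };

// Before: helpers took one LibraryInfo* computed once for the whole module.
// After: helpers take a getter so the answer can differ per function
// (for example when functions carry different target features).
using GetLibInfoT = std::function<LibraryInfo &(Function &)>;

bool isAllocationFn(Function &Caller, const GetLibInfoT &GetLI) {
  return GetLI(Caller).HasMalloc; // query lazily, for the right function
}

int main() {
  LibraryInfo WithMalloc{true}, WithoutMalloc{false};
  GetLibInfoT GetLI = [&](Function &F) -> LibraryInfo & {
    return F.Name == "freestanding" ? WithoutMalloc : WithMalloc;
  };
  Function A{"hosted"}, B{"freestanding"};
  std::cout << isAllocationFn(A, GetLI) << ' '
            << isAllocationFn(B, GetLI) << '\n'; // prints: 1 0
}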
- LocalChange |= OptimizeGlobalVars(M, TLI, LookupDomTree, - NotDiscardableComdats); + LocalChange |= + OptimizeGlobalVars(M, GetTLI, LookupDomTree, NotDiscardableComdats); // Resolve aliases, when possible. LocalChange |= OptimizeGlobalAliases(M, NotDiscardableComdats); // Try to remove trivial global destructors if they are not removed // already. - Function *CXAAtExitFn = FindCXAAtExit(M, TLI); + Function *CXAAtExitFn = FindCXAAtExit(M, GetTLI); if (CXAAtExitFn) LocalChange |= OptimizeEmptyGlobalCXXDtors(CXAAtExitFn); @@ -2946,12 +2967,14 @@ static bool optimizeGlobalsInModule( PreservedAnalyses GlobalOptPass::run(Module &M, ModuleAnalysisManager &AM) { auto &DL = M.getDataLayout(); - auto &TLI = AM.getResult(M); auto &FAM = AM.getResult(M).getManager(); auto LookupDomTree = [&FAM](Function &F) -> DominatorTree &{ return FAM.getResult(F); }; + auto GetTLI = [&FAM](Function &F) -> TargetLibraryInfo & { + return FAM.getResult(F); + }; auto GetTTI = [&FAM](Function &F) -> TargetTransformInfo & { return FAM.getResult(F); }; @@ -2960,7 +2983,7 @@ PreservedAnalyses GlobalOptPass::run(Module &M, ModuleAnalysisManager &AM) { return FAM.getResult(F); }; - if (!optimizeGlobalsInModule(M, DL, &TLI, GetTTI, GetBFI, LookupDomTree)) + if (!optimizeGlobalsInModule(M, DL, GetTLI, GetTTI, GetBFI, LookupDomTree)) return PreservedAnalyses::all(); return PreservedAnalyses::none(); } @@ -2979,10 +3002,12 @@ struct GlobalOptLegacyPass : public ModulePass { return false; auto &DL = M.getDataLayout(); - auto *TLI = &getAnalysis().getTLI(); auto LookupDomTree = [this](Function &F) -> DominatorTree & { return this->getAnalysis(F).getDomTree(); }; + auto GetTLI = [this](Function &F) -> TargetLibraryInfo & { + return this->getAnalysis().getTLI(F); + }; auto GetTTI = [this](Function &F) -> TargetTransformInfo & { return this->getAnalysis().getTTI(F); }; @@ -2991,7 +3016,8 @@ struct GlobalOptLegacyPass : public ModulePass { return this->getAnalysis(F).getBFI(); }; - return optimizeGlobalsInModule(M, DL, TLI, GetTTI, GetBFI, LookupDomTree); + return optimizeGlobalsInModule(M, DL, GetTLI, GetTTI, GetBFI, + LookupDomTree); } void getAnalysisUsage(AnalysisUsage &AU) const override { diff --git a/lib/Transforms/IPO/HotColdSplitting.cpp b/lib/Transforms/IPO/HotColdSplitting.cpp index ab1a9a79cad6..cfdcc8db7f50 100644 --- a/lib/Transforms/IPO/HotColdSplitting.cpp +++ b/lib/Transforms/IPO/HotColdSplitting.cpp @@ -85,12 +85,6 @@ static cl::opt "multiple of TCC_Basic)")); namespace { - -/// A sequence of basic blocks. -/// -/// A 0-sized SmallVector is slightly cheaper to move than a std::vector. -using BlockSequence = SmallVector; - // Same as blockEndsInUnreachable in CodeGen/BranchFolding.cpp. Do not modify // this function unless you modify the MBB version as well. 
// @@ -169,31 +163,6 @@ static bool markFunctionCold(Function &F, bool UpdateEntryCount = false) { return Changed; } -class HotColdSplitting { -public: - HotColdSplitting(ProfileSummaryInfo *ProfSI, - function_ref GBFI, - function_ref GTTI, - std::function *GORE, - function_ref LAC) - : PSI(ProfSI), GetBFI(GBFI), GetTTI(GTTI), GetORE(GORE), LookupAC(LAC) {} - bool run(Module &M); - -private: - bool isFunctionCold(const Function &F) const; - bool shouldOutlineFrom(const Function &F) const; - bool outlineColdRegions(Function &F, bool HasProfileSummary); - Function *extractColdRegion(const BlockSequence &Region, DominatorTree &DT, - BlockFrequencyInfo *BFI, TargetTransformInfo &TTI, - OptimizationRemarkEmitter &ORE, - AssumptionCache *AC, unsigned Count); - ProfileSummaryInfo *PSI; - function_ref GetBFI; - function_ref GetTTI; - std::function *GetORE; - function_ref LookupAC; -}; - class HotColdSplittingLegacyPass : public ModulePass { public: static char ID; @@ -321,13 +290,10 @@ static int getOutliningPenalty(ArrayRef Region, return Penalty; } -Function *HotColdSplitting::extractColdRegion(const BlockSequence &Region, - DominatorTree &DT, - BlockFrequencyInfo *BFI, - TargetTransformInfo &TTI, - OptimizationRemarkEmitter &ORE, - AssumptionCache *AC, - unsigned Count) { +Function *HotColdSplitting::extractColdRegion( + const BlockSequence &Region, const CodeExtractorAnalysisCache &CEAC, + DominatorTree &DT, BlockFrequencyInfo *BFI, TargetTransformInfo &TTI, + OptimizationRemarkEmitter &ORE, AssumptionCache *AC, unsigned Count) { assert(!Region.empty()); // TODO: Pass BFI and BPI to update profile information. @@ -349,7 +315,7 @@ Function *HotColdSplitting::extractColdRegion(const BlockSequence &Region, return nullptr; Function *OrigF = Region[0]->getParent(); - if (Function *OutF = CE.extractCodeRegion()) { + if (Function *OutF = CE.extractCodeRegion(CEAC)) { User *U = *OutF->user_begin(); CallInst *CI = cast(U); CallSite CS(CI); @@ -607,9 +573,9 @@ bool HotColdSplitting::outlineColdRegions(Function &F, bool HasProfileSummary) { }); if (!DT) - DT = make_unique(F); + DT = std::make_unique(F); if (!PDT) - PDT = make_unique(F); + PDT = std::make_unique(F); auto Regions = OutliningRegion::create(*BB, *DT, *PDT); for (OutliningRegion &Region : Regions) { @@ -637,9 +603,14 @@ bool HotColdSplitting::outlineColdRegions(Function &F, bool HasProfileSummary) { } } + if (OutliningWorklist.empty()) + return Changed; + // Outline single-entry cold regions, splitting up larger regions as needed. unsigned OutlinedFunctionID = 1; - while (!OutliningWorklist.empty()) { + // Cache and recycle the CodeExtractor analysis to avoid O(n^2) compile-time. 
+ CodeExtractorAnalysisCache CEAC(F); + do { OutliningRegion Region = OutliningWorklist.pop_back_val(); assert(!Region.empty() && "Empty outlining region in worklist"); do { @@ -650,14 +621,14 @@ bool HotColdSplitting::outlineColdRegions(Function &F, bool HasProfileSummary) { BB->dump(); }); - Function *Outlined = extractColdRegion(SubRegion, *DT, BFI, TTI, ORE, AC, - OutlinedFunctionID); + Function *Outlined = extractColdRegion(SubRegion, CEAC, *DT, BFI, TTI, + ORE, AC, OutlinedFunctionID); if (Outlined) { ++OutlinedFunctionID; Changed = true; } } while (!Region.empty()); - } + } while (!OutliningWorklist.empty()); return Changed; } diff --git a/lib/Transforms/IPO/IPO.cpp b/lib/Transforms/IPO/IPO.cpp index 34db75dd8b03..bddf75211599 100644 --- a/lib/Transforms/IPO/IPO.cpp +++ b/lib/Transforms/IPO/IPO.cpp @@ -114,6 +114,10 @@ void LLVMAddIPSCCPPass(LLVMPassManagerRef PM) { unwrap(PM)->add(createIPSCCPPass()); } +void LLVMAddMergeFunctionsPass(LLVMPassManagerRef PM) { + unwrap(PM)->add(createMergeFunctionsPass()); +} + void LLVMAddInternalizePass(LLVMPassManagerRef PM, unsigned AllButMain) { auto PreserveMain = [=](const GlobalValue &GV) { return AllButMain && GV.getName() == "main"; @@ -121,6 +125,15 @@ void LLVMAddInternalizePass(LLVMPassManagerRef PM, unsigned AllButMain) { unwrap(PM)->add(createInternalizePass(PreserveMain)); } +void LLVMAddInternalizePassWithMustPreservePredicate( + LLVMPassManagerRef PM, + void *Context, + LLVMBool (*Pred)(LLVMValueRef, void *)) { + unwrap(PM)->add(createInternalizePass([=](const GlobalValue &GV) { + return Pred(wrap(&GV), Context) == 0 ? false : true; + })); +} + void LLVMAddStripDeadPrototypesPass(LLVMPassManagerRef PM) { unwrap(PM)->add(createStripDeadPrototypesPass()); } diff --git a/lib/Transforms/IPO/InferFunctionAttrs.cpp b/lib/Transforms/IPO/InferFunctionAttrs.cpp index 7f5511e008e1..d1a68b28bd33 100644 --- a/lib/Transforms/IPO/InferFunctionAttrs.cpp +++ b/lib/Transforms/IPO/InferFunctionAttrs.cpp @@ -18,24 +18,28 @@ using namespace llvm; #define DEBUG_TYPE "inferattrs" -static bool inferAllPrototypeAttributes(Module &M, - const TargetLibraryInfo &TLI) { +static bool inferAllPrototypeAttributes( + Module &M, function_ref GetTLI) { bool Changed = false; for (Function &F : M.functions()) // We only infer things using the prototype and the name; we don't need // definitions. if (F.isDeclaration() && !F.hasOptNone()) - Changed |= inferLibFuncAttributes(F, TLI); + Changed |= inferLibFuncAttributes(F, GetTLI(F)); return Changed; } PreservedAnalyses InferFunctionAttrsPass::run(Module &M, ModuleAnalysisManager &AM) { - auto &TLI = AM.getResult(M); + FunctionAnalysisManager &FAM = + AM.getResult(M).getManager(); + auto GetTLI = [&FAM](Function &F) -> TargetLibraryInfo & { + return FAM.getResult(F); + }; - if (!inferAllPrototypeAttributes(M, TLI)) + if (!inferAllPrototypeAttributes(M, GetTLI)) // If we didn't infer anything, preserve all analyses. 
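The new C API entry point LLVMAddInternalizePassWithMustPreservePredicate above lets bindings supply an arbitrary must-preserve callback instead of the fixed all-but-main behaviour. A hedged usage sketch (it assumes the usual llvm-c headers and an existing module; the runInternalize wrapper and the "exported_" naming convention are invented for illustration, and error handling is omitted):

#include <llvm-c/Core.h>
#include <llvm-c/Transforms/IPO.h>
#include <string.h>

// Keep "main" and anything whose name starts with "exported_"; everything
// else may be internalized. The Context pointer is unused here.
static LLVMBool mustPreserve(LLVMValueRef GV, void *Context) {
  size_t Len = 0;
  const char *Name = LLVMGetValueName2(GV, &Len);
  (void)Context;
  return strcmp(Name, "main") == 0 || strncmp(Name, "exported_", 9) == 0;
}

void runInternalize(LLVMModuleRef M) {
  LLVMPassManagerRef PM = LLVMCreatePassManager();
  LLVMAddInternalizePassWithMustPreservePredicate(PM, /*Context=*/NULL,
                                                  mustPreserve);
  LLVMRunPassManager(PM, M);
  LLVMDisposePassManager(PM);
}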
return PreservedAnalyses::all(); @@ -60,8 +64,10 @@ struct InferFunctionAttrsLegacyPass : public ModulePass { if (skipModule(M)) return false; - auto &TLI = getAnalysis().getTLI(); - return inferAllPrototypeAttributes(M, TLI); + auto GetTLI = [this](Function &F) -> TargetLibraryInfo & { + return this->getAnalysis().getTLI(F); + }; + return inferAllPrototypeAttributes(M, GetTLI); } }; } diff --git a/lib/Transforms/IPO/Inliner.cpp b/lib/Transforms/IPO/Inliner.cpp index 945f8affae6e..4b72261131c1 100644 --- a/lib/Transforms/IPO/Inliner.cpp +++ b/lib/Transforms/IPO/Inliner.cpp @@ -239,7 +239,7 @@ static void mergeInlinedArrayAllocas( } if (Align1 > Align2) - AvailableAlloca->setAlignment(AI->getAlignment()); + AvailableAlloca->setAlignment(MaybeAlign(AI->getAlignment())); } AI->eraseFromParent(); @@ -527,7 +527,8 @@ static void setInlineRemark(CallSite &CS, StringRef message) { static bool inlineCallsImpl(CallGraphSCC &SCC, CallGraph &CG, std::function GetAssumptionCache, - ProfileSummaryInfo *PSI, TargetLibraryInfo &TLI, + ProfileSummaryInfo *PSI, + std::function GetTLI, bool InsertLifetime, function_ref GetInlineCost, function_ref AARGetter, @@ -626,7 +627,8 @@ inlineCallsImpl(CallGraphSCC &SCC, CallGraph &CG, Instruction *Instr = CS.getInstruction(); - bool IsTriviallyDead = isInstructionTriviallyDead(Instr, &TLI); + bool IsTriviallyDead = + isInstructionTriviallyDead(Instr, &GetTLI(*Caller)); int InlineHistoryID; if (!IsTriviallyDead) { @@ -757,13 +759,16 @@ bool LegacyInlinerBase::inlineCalls(CallGraphSCC &SCC) { CallGraph &CG = getAnalysis().getCallGraph(); ACT = &getAnalysis(); PSI = &getAnalysis().getPSI(); - auto &TLI = getAnalysis().getTLI(); + auto GetTLI = [&](Function &F) -> TargetLibraryInfo & { + return getAnalysis().getTLI(F); + }; auto GetAssumptionCache = [&](Function &F) -> AssumptionCache & { return ACT->getAssumptionCache(F); }; - return inlineCallsImpl(SCC, CG, GetAssumptionCache, PSI, TLI, InsertLifetime, - [this](CallSite CS) { return getInlineCost(CS); }, - LegacyAARGetter(*this), ImportedFunctionsStats); + return inlineCallsImpl( + SCC, CG, GetAssumptionCache, PSI, GetTLI, InsertLifetime, + [this](CallSite CS) { return getInlineCost(CS); }, LegacyAARGetter(*this), + ImportedFunctionsStats); } /// Remove now-dead linkonce functions at the end of @@ -879,7 +884,7 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC, if (!ImportedFunctionsStats && InlinerFunctionImportStats != InlinerFunctionImportStatsOpts::No) { ImportedFunctionsStats = - llvm::make_unique(); + std::make_unique(); ImportedFunctionsStats->setModuleInfo(M); } diff --git a/lib/Transforms/IPO/LoopExtractor.cpp b/lib/Transforms/IPO/LoopExtractor.cpp index 91c7b5f5f135..add2ae053735 100644 --- a/lib/Transforms/IPO/LoopExtractor.cpp +++ b/lib/Transforms/IPO/LoopExtractor.cpp @@ -141,10 +141,12 @@ bool LoopExtractor::runOnLoop(Loop *L, LPPassManager &LPM) { if (NumLoops == 0) return Changed; --NumLoops; AssumptionCache *AC = nullptr; + Function &Func = *L->getHeader()->getParent(); if (auto *ACT = getAnalysisIfAvailable()) - AC = ACT->lookupAssumptionCache(*L->getHeader()->getParent()); + AC = ACT->lookupAssumptionCache(Func); + CodeExtractorAnalysisCache CEAC(Func); CodeExtractor Extractor(DT, *L, false, nullptr, nullptr, AC); - if (Extractor.extractCodeRegion() != nullptr) { + if (Extractor.extractCodeRegion(CEAC) != nullptr) { Changed = true; // After extraction, the loop is replaced by a function call, so // we shouldn't try to run any more loop passes on it. 
diff --git a/lib/Transforms/IPO/LowerTypeTests.cpp b/lib/Transforms/IPO/LowerTypeTests.cpp index f7371284f47e..2dec366d70e2 100644 --- a/lib/Transforms/IPO/LowerTypeTests.cpp +++ b/lib/Transforms/IPO/LowerTypeTests.cpp @@ -230,6 +230,16 @@ void ByteArrayBuilder::allocate(const std::set &Bits, Bytes[AllocByteOffset + B] |= AllocMask; } +bool lowertypetests::isJumpTableCanonical(Function *F) { + if (F->isDeclarationForLinker()) + return false; + auto *CI = mdconst::extract_or_null( + F->getParent()->getModuleFlag("CFI Canonical Jump Tables")); + if (!CI || CI->getZExtValue() != 0) + return true; + return F->hasFnAttribute("cfi-canonical-jump-table"); +} + namespace { struct ByteArrayInfo { @@ -251,9 +261,12 @@ class GlobalTypeMember final : TrailingObjects { GlobalObject *GO; size_t NTypes; - // For functions: true if this is a definition (either in the merged module or - // in one of the thinlto modules). - bool IsDefinition; + // For functions: true if the jump table is canonical. This essentially means + // whether the canonical address (i.e. the symbol table entry) of the function + // is provided by the local jump table. This is normally the same as whether + // the function is defined locally, but if canonical jump tables are disabled + // by the user then the jump table never provides a canonical definition. + bool IsJumpTableCanonical; // For functions: true if this function is either defined or used in a thinlto // module and its jumptable entry needs to be exported to thinlto backends. @@ -263,13 +276,13 @@ class GlobalTypeMember final : TrailingObjects { public: static GlobalTypeMember *create(BumpPtrAllocator &Alloc, GlobalObject *GO, - bool IsDefinition, bool IsExported, + bool IsJumpTableCanonical, bool IsExported, ArrayRef Types) { auto *GTM = static_cast(Alloc.Allocate( totalSizeToAlloc(Types.size()), alignof(GlobalTypeMember))); GTM->GO = GO; GTM->NTypes = Types.size(); - GTM->IsDefinition = IsDefinition; + GTM->IsJumpTableCanonical = IsJumpTableCanonical; GTM->IsExported = IsExported; std::uninitialized_copy(Types.begin(), Types.end(), GTM->getTrailingObjects()); @@ -280,8 +293,8 @@ public: return GO; } - bool isDefinition() const { - return IsDefinition; + bool isJumpTableCanonical() const { + return IsJumpTableCanonical; } bool isExported() const { @@ -320,6 +333,49 @@ private: size_t NTargets; }; +struct ScopedSaveAliaseesAndUsed { + Module &M; + SmallPtrSet Used, CompilerUsed; + std::vector> FunctionAliases; + + ScopedSaveAliaseesAndUsed(Module &M) : M(M) { + // The users of this class want to replace all function references except + // for aliases and llvm.used/llvm.compiler.used with references to a jump + // table. We avoid replacing aliases in order to avoid introducing a double + // indirection (or an alias pointing to a declaration in ThinLTO mode), and + // we avoid replacing llvm.used/llvm.compiler.used because these global + // variables describe properties of the global, not the jump table (besides, + // offseted references to the jump table in llvm.used are invalid). + // Unfortunately, LLVM doesn't have a "RAUW except for these (possibly + // indirect) users", so what we do is save the list of globals referenced by + // llvm.used/llvm.compiler.used and aliases, erase the used lists, let RAUW + // replace the aliasees and then set them back to their original values at + // the end. 
+ if (GlobalVariable *GV = collectUsedGlobalVariables(M, Used, false)) + GV->eraseFromParent(); + if (GlobalVariable *GV = collectUsedGlobalVariables(M, CompilerUsed, true)) + GV->eraseFromParent(); + + for (auto &GIS : concat(M.aliases(), M.ifuncs())) { + // FIXME: This should look past all aliases not just interposable ones, + // see discussion on D65118. + if (auto *F = + dyn_cast(GIS.getIndirectSymbol()->stripPointerCasts())) + FunctionAliases.push_back({&GIS, F}); + } + } + + ~ScopedSaveAliaseesAndUsed() { + appendToUsed(M, std::vector(Used.begin(), Used.end())); + appendToCompilerUsed(M, std::vector(CompilerUsed.begin(), + CompilerUsed.end())); + + for (auto P : FunctionAliases) + P.first->setIndirectSymbol( + ConstantExpr::getBitCast(P.second, P.first->getType())); + } +}; + class LowerTypeTestsModule { Module &M; @@ -387,7 +443,8 @@ class LowerTypeTestsModule { uint8_t *exportTypeId(StringRef TypeId, const TypeIdLowering &TIL); TypeIdLowering importTypeId(StringRef TypeId); void importTypeTest(CallInst *CI); - void importFunction(Function *F, bool isDefinition); + void importFunction(Function *F, bool isJumpTableCanonical, + std::vector &AliasesToErase); BitSetInfo buildBitSet(Metadata *TypeId, @@ -421,7 +478,8 @@ class LowerTypeTestsModule { ArrayRef Globals, ArrayRef ICallBranchFunnels); - void replaceWeakDeclarationWithJumpTablePtr(Function *F, Constant *JT, bool IsDefinition); + void replaceWeakDeclarationWithJumpTablePtr(Function *F, Constant *JT, + bool IsJumpTableCanonical); void moveInitializerToModuleConstructor(GlobalVariable *GV); void findGlobalVariableUsersOf(Constant *C, SmallSetVector &Out); @@ -433,7 +491,7 @@ class LowerTypeTestsModule { /// the block. 'This's use list is expected to have at least one element. /// Unlike replaceAllUsesWith this function skips blockaddr and direct call /// uses. - void replaceCfiUses(Function *Old, Value *New, bool IsDefinition); + void replaceCfiUses(Function *Old, Value *New, bool IsJumpTableCanonical); /// replaceDirectCalls - Go through the uses list for this definition and /// replace each use, which is a direct function call. @@ -759,43 +817,50 @@ void LowerTypeTestsModule::buildBitSetsFromGlobalVariables( // Build a new global with the combined contents of the referenced globals. // This global is a struct whose even-indexed elements contain the original // contents of the referenced globals and whose odd-indexed elements contain - // any padding required to align the next element to the next power of 2. + // any padding required to align the next element to the next power of 2 plus + // any additional padding required to meet its alignment requirements. 
std::vector GlobalInits; const DataLayout &DL = M.getDataLayout(); + DenseMap GlobalLayout; + Align MaxAlign; + uint64_t CurOffset = 0; + uint64_t DesiredPadding = 0; for (GlobalTypeMember *G : Globals) { - GlobalVariable *GV = cast(G->getGlobal()); + auto *GV = cast(G->getGlobal()); + MaybeAlign Alignment(GV->getAlignment()); + if (!Alignment) + Alignment = Align(DL.getABITypeAlignment(GV->getValueType())); + MaxAlign = std::max(MaxAlign, *Alignment); + uint64_t GVOffset = alignTo(CurOffset + DesiredPadding, *Alignment); + GlobalLayout[G] = GVOffset; + if (GVOffset != 0) { + uint64_t Padding = GVOffset - CurOffset; + GlobalInits.push_back( + ConstantAggregateZero::get(ArrayType::get(Int8Ty, Padding))); + } + GlobalInits.push_back(GV->getInitializer()); uint64_t InitSize = DL.getTypeAllocSize(GV->getValueType()); + CurOffset = GVOffset + InitSize; - // Compute the amount of padding required. - uint64_t Padding = NextPowerOf2(InitSize - 1) - InitSize; + // Compute the amount of padding that we'd like for the next element. + DesiredPadding = NextPowerOf2(InitSize - 1) - InitSize; // Experiments of different caps with Chromium on both x64 and ARM64 // have shown that the 32-byte cap generates the smallest binary on // both platforms while different caps yield similar performance. // (see https://lists.llvm.org/pipermail/llvm-dev/2018-July/124694.html) - if (Padding > 32) - Padding = alignTo(InitSize, 32) - InitSize; - - GlobalInits.push_back( - ConstantAggregateZero::get(ArrayType::get(Int8Ty, Padding))); + if (DesiredPadding > 32) + DesiredPadding = alignTo(InitSize, 32) - InitSize; } - if (!GlobalInits.empty()) - GlobalInits.pop_back(); + Constant *NewInit = ConstantStruct::getAnon(M.getContext(), GlobalInits); auto *CombinedGlobal = new GlobalVariable(M, NewInit->getType(), /*isConstant=*/true, GlobalValue::PrivateLinkage, NewInit); + CombinedGlobal->setAlignment(MaxAlign); StructType *NewTy = cast(NewInit->getType()); - const StructLayout *CombinedGlobalLayout = DL.getStructLayout(NewTy); - - // Compute the offsets of the original globals within the new global. - DenseMap GlobalLayout; - for (unsigned I = 0; I != Globals.size(); ++I) - // Multiply by 2 to account for padding elements. - GlobalLayout[Globals[I]] = CombinedGlobalLayout->getElementOffset(I * 2); - lowerTypeTestCalls(TypeIds, CombinedGlobal, GlobalLayout); // Build aliases pointing to offsets into the combined global for each @@ -975,14 +1040,16 @@ void LowerTypeTestsModule::importTypeTest(CallInst *CI) { } // ThinLTO backend: the function F has a jump table entry; update this module -// accordingly. isDefinition describes the type of the jump table entry. -void LowerTypeTestsModule::importFunction(Function *F, bool isDefinition) { +// accordingly. isJumpTableCanonical describes the type of the jump table entry. +void LowerTypeTestsModule::importFunction( + Function *F, bool isJumpTableCanonical, + std::vector &AliasesToErase) { assert(F->getType()->getAddressSpace() == 0); GlobalValue::VisibilityTypes Visibility = F->getVisibility(); std::string Name = F->getName(); - if (F->isDeclarationForLinker() && isDefinition) { + if (F->isDeclarationForLinker() && isJumpTableCanonical) { // Non-dso_local functions may be overriden at run time, // don't short curcuit them if (F->isDSOLocal()) { @@ -997,12 +1064,13 @@ void LowerTypeTestsModule::importFunction(Function *F, bool isDefinition) { } Function *FDecl; - if (F->isDeclarationForLinker() && !isDefinition) { - // Declaration of an external function. 
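The rewritten layout loop in buildBitSetsFromGlobalVariables above computes each member's offset directly: the next global goes at the current end plus the desired power-of-two padding, rounded up to that global's own alignment, and padding requests above 32 bytes are capped at the next 32-byte boundary. The arithmetic in isolation (alignTo and nextPowerOf2 are reimplemented here; the real code uses the llvm::alignTo and llvm::NextPowerOf2 helpers and the DataLayout ABI alignment):

#include <cstdint>
#include <iostream>
#include <vector>

static uint64_t alignTo(uint64_t X, uint64_t A) { return (X + A - 1) / A * A; }
static uint64_t nextPowerOf2(uint64_t X) {
  uint64_t P = 1;
  while (P <= X)
    P <<= 1;
  return P; // smallest power of two strictly greater than X
}

struct Member { uint64_t Size, Align; };

// Returns the byte offset of every member inside the combined global,
// following the same rules as the loop above.
std::vector<uint64_t> layoutCombinedGlobal(const std::vector<Member> &Members) {
  std::vector<uint64_t> Offsets;
  uint64_t CurOffset = 0, DesiredPadding = 0;
  for (const Member &M : Members) {
    uint64_t Offset = alignTo(CurOffset + DesiredPadding, M.Align);
    Offsets.push_back(Offset);
    CurOffset = Offset + M.Size;
    // Ask for padding up to the next power of two so offsets stay friendly to
    // the type-test encoding, but never request more than a 32-byte boundary.
    DesiredPadding = nextPowerOf2(M.Size - 1) - M.Size;
    if (DesiredPadding > 32)
      DesiredPadding = alignTo(M.Size, 32) - M.Size;
  }
  return Offsets;
}

int main() {
  for (uint64_t O : layoutCombinedGlobal({{24, 8}, {4, 4}, {100, 16}}))
    std::cout << O << ' ';   // prints: 0 32 48
  std::cout << '\n';
}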
+ if (!isJumpTableCanonical) { + // Either a declaration of an external function or a reference to a locally + // defined jump table. FDecl = Function::Create(F->getFunctionType(), GlobalValue::ExternalLinkage, F->getAddressSpace(), Name + ".cfi_jt", &M); FDecl->setVisibility(GlobalValue::HiddenVisibility); - } else if (isDefinition) { + } else { F->setName(Name + ".cfi"); F->setLinkage(GlobalValue::ExternalLinkage); FDecl = Function::Create(F->getFunctionType(), GlobalValue::ExternalLinkage, @@ -1011,8 +1079,8 @@ void LowerTypeTestsModule::importFunction(Function *F, bool isDefinition) { Visibility = GlobalValue::HiddenVisibility; // Delete aliases pointing to this function, they'll be re-created in the - // merged output - SmallVector ToErase; + // merged output. Don't do it yet though because ScopedSaveAliaseesAndUsed + // will want to reset the aliasees first. for (auto &U : F->uses()) { if (auto *A = dyn_cast(U.getUser())) { Function *AliasDecl = Function::Create( @@ -1020,24 +1088,15 @@ void LowerTypeTestsModule::importFunction(Function *F, bool isDefinition) { F->getAddressSpace(), "", &M); AliasDecl->takeName(A); A->replaceAllUsesWith(AliasDecl); - ToErase.push_back(A); + AliasesToErase.push_back(A); } } - for (auto *A : ToErase) - A->eraseFromParent(); - } else { - // Function definition without type metadata, where some other translation - // unit contained a declaration with type metadata. This normally happens - // during mixed CFI + non-CFI compilation. We do nothing with the function - // so that it is treated the same way as a function defined outside of the - // LTO unit. - return; } - if (F->isWeakForLinker()) - replaceWeakDeclarationWithJumpTablePtr(F, FDecl, isDefinition); + if (F->hasExternalWeakLinkage()) + replaceWeakDeclarationWithJumpTablePtr(F, FDecl, isJumpTableCanonical); else - replaceCfiUses(F, FDecl, isDefinition); + replaceCfiUses(F, FDecl, isJumpTableCanonical); // Set visibility late because it's used in replaceCfiUses() to determine // whether uses need to to be replaced. @@ -1225,7 +1284,7 @@ void LowerTypeTestsModule::findGlobalVariableUsersOf( // Replace all uses of F with (F ? JT : 0). void LowerTypeTestsModule::replaceWeakDeclarationWithJumpTablePtr( - Function *F, Constant *JT, bool IsDefinition) { + Function *F, Constant *JT, bool IsJumpTableCanonical) { // The target expression can not appear in a constant initializer on most // (all?) targets. Switch to a runtime initializer. SmallSetVector GlobalVarUsers; @@ -1239,7 +1298,7 @@ void LowerTypeTestsModule::replaceWeakDeclarationWithJumpTablePtr( Function::Create(cast(F->getValueType()), GlobalValue::ExternalWeakLinkage, F->getAddressSpace(), "", &M); - replaceCfiUses(F, PlaceholderFn, IsDefinition); + replaceCfiUses(F, PlaceholderFn, IsJumpTableCanonical); Constant *Target = ConstantExpr::getSelect( ConstantExpr::getICmp(CmpInst::ICMP_NE, F, @@ -1276,8 +1335,9 @@ selectJumpTableArmEncoding(ArrayRef Functions, unsigned ArmCount = 0, ThumbCount = 0; for (const auto GTM : Functions) { - if (!GTM->isDefinition()) { + if (!GTM->isJumpTableCanonical()) { // PLT stubs are always ARM. + // FIXME: This is the wrong heuristic for non-canonical jump tables. ++ArmCount; continue; } @@ -1303,7 +1363,7 @@ void LowerTypeTestsModule::createJumpTable( cast(Functions[I]->getGlobal())); // Align the whole table by entry size. - F->setAlignment(getJumpTableEntrySize()); + F->setAlignment(Align(getJumpTableEntrySize())); // Skip prologue. // Disabled on win32 due to https://llvm.org/bugs/show_bug.cgi?id=28641#c3. 
// Luckily, this function does not get any prologue even without the @@ -1438,47 +1498,53 @@ void LowerTypeTestsModule::buildBitSetsFromFunctionsNative( lowerTypeTestCalls(TypeIds, JumpTable, GlobalLayout); - // Build aliases pointing to offsets into the jump table, and replace - // references to the original functions with references to the aliases. - for (unsigned I = 0; I != Functions.size(); ++I) { - Function *F = cast(Functions[I]->getGlobal()); - bool IsDefinition = Functions[I]->isDefinition(); - - Constant *CombinedGlobalElemPtr = ConstantExpr::getBitCast( - ConstantExpr::getInBoundsGetElementPtr( - JumpTableType, JumpTable, - ArrayRef{ConstantInt::get(IntPtrTy, 0), - ConstantInt::get(IntPtrTy, I)}), - F->getType()); - if (Functions[I]->isExported()) { - if (IsDefinition) { - ExportSummary->cfiFunctionDefs().insert(F->getName()); + { + ScopedSaveAliaseesAndUsed S(M); + + // Build aliases pointing to offsets into the jump table, and replace + // references to the original functions with references to the aliases. + for (unsigned I = 0; I != Functions.size(); ++I) { + Function *F = cast(Functions[I]->getGlobal()); + bool IsJumpTableCanonical = Functions[I]->isJumpTableCanonical(); + + Constant *CombinedGlobalElemPtr = ConstantExpr::getBitCast( + ConstantExpr::getInBoundsGetElementPtr( + JumpTableType, JumpTable, + ArrayRef{ConstantInt::get(IntPtrTy, 0), + ConstantInt::get(IntPtrTy, I)}), + F->getType()); + if (Functions[I]->isExported()) { + if (IsJumpTableCanonical) { + ExportSummary->cfiFunctionDefs().insert(F->getName()); + } else { + GlobalAlias *JtAlias = GlobalAlias::create( + F->getValueType(), 0, GlobalValue::ExternalLinkage, + F->getName() + ".cfi_jt", CombinedGlobalElemPtr, &M); + JtAlias->setVisibility(GlobalValue::HiddenVisibility); + ExportSummary->cfiFunctionDecls().insert(F->getName()); + } + } + if (!IsJumpTableCanonical) { + if (F->hasExternalWeakLinkage()) + replaceWeakDeclarationWithJumpTablePtr(F, CombinedGlobalElemPtr, + IsJumpTableCanonical); + else + replaceCfiUses(F, CombinedGlobalElemPtr, IsJumpTableCanonical); } else { - GlobalAlias *JtAlias = GlobalAlias::create( - F->getValueType(), 0, GlobalValue::ExternalLinkage, - F->getName() + ".cfi_jt", CombinedGlobalElemPtr, &M); - JtAlias->setVisibility(GlobalValue::HiddenVisibility); - ExportSummary->cfiFunctionDecls().insert(F->getName()); + assert(F->getType()->getAddressSpace() == 0); + + GlobalAlias *FAlias = + GlobalAlias::create(F->getValueType(), 0, F->getLinkage(), "", + CombinedGlobalElemPtr, &M); + FAlias->setVisibility(F->getVisibility()); + FAlias->takeName(F); + if (FAlias->hasName()) + F->setName(FAlias->getName() + ".cfi"); + replaceCfiUses(F, FAlias, IsJumpTableCanonical); + if (!F->hasLocalLinkage()) + F->setVisibility(GlobalVariable::HiddenVisibility); } } - if (!IsDefinition) { - if (F->isWeakForLinker()) - replaceWeakDeclarationWithJumpTablePtr(F, CombinedGlobalElemPtr, IsDefinition); - else - replaceCfiUses(F, CombinedGlobalElemPtr, IsDefinition); - } else { - assert(F->getType()->getAddressSpace() == 0); - - GlobalAlias *FAlias = GlobalAlias::create( - F->getValueType(), 0, F->getLinkage(), "", CombinedGlobalElemPtr, &M); - FAlias->setVisibility(F->getVisibility()); - FAlias->takeName(F); - if (FAlias->hasName()) - F->setName(FAlias->getName() + ".cfi"); - replaceCfiUses(F, FAlias, IsDefinition); - if (!F->hasLocalLinkage()) - F->setVisibility(GlobalVariable::HiddenVisibility); - } } createJumpTable(JumpTableFn, Functions); @@ -1623,7 +1689,7 @@ bool LowerTypeTestsModule::runForTesting(Module 
&M) { ExitOnError ExitOnErr("-lowertypetests-write-summary: " + ClWriteSummary + ": "); std::error_code EC; - raw_fd_ostream OS(ClWriteSummary, EC, sys::fs::F_Text); + raw_fd_ostream OS(ClWriteSummary, EC, sys::fs::OF_Text); ExitOnErr(errorCodeToError(EC)); yaml::Output Out(OS); @@ -1643,7 +1709,8 @@ static bool isDirectCall(Use& U) { return false; } -void LowerTypeTestsModule::replaceCfiUses(Function *Old, Value *New, bool IsDefinition) { +void LowerTypeTestsModule::replaceCfiUses(Function *Old, Value *New, + bool IsJumpTableCanonical) { SmallSetVector Constants; auto UI = Old->use_begin(), E = Old->use_end(); for (; UI != E;) { @@ -1655,7 +1722,7 @@ void LowerTypeTestsModule::replaceCfiUses(Function *Old, Value *New, bool IsDefi continue; // Skip direct calls to externally defined or non-dso_local functions - if (isDirectCall(U) && (Old->isDSOLocal() || !IsDefinition)) + if (isDirectCall(U) && (Old->isDSOLocal() || !IsJumpTableCanonical)) continue; // Must handle Constants specially, we cannot call replaceUsesOfWith on a @@ -1678,16 +1745,7 @@ void LowerTypeTestsModule::replaceCfiUses(Function *Old, Value *New, bool IsDefi } void LowerTypeTestsModule::replaceDirectCalls(Value *Old, Value *New) { - auto UI = Old->use_begin(), E = Old->use_end(); - for (; UI != E;) { - Use &U = *UI; - ++UI; - - if (!isDirectCall(U)) - continue; - - U.set(New); - } + Old->replaceUsesWithIf(New, [](Use &U) { return isDirectCall(U); }); } bool LowerTypeTestsModule::lower() { @@ -1734,10 +1792,16 @@ bool LowerTypeTestsModule::lower() { Decls.push_back(&F); } - for (auto F : Defs) - importFunction(F, /*isDefinition*/ true); - for (auto F : Decls) - importFunction(F, /*isDefinition*/ false); + std::vector AliasesToErase; + { + ScopedSaveAliaseesAndUsed S(M); + for (auto F : Defs) + importFunction(F, /*isJumpTableCanonical*/ true, AliasesToErase); + for (auto F : Decls) + importFunction(F, /*isJumpTableCanonical*/ false, AliasesToErase); + } + for (GlobalAlias *GA : AliasesToErase) + GA->eraseFromParent(); return true; } @@ -1823,6 +1887,17 @@ bool LowerTypeTestsModule::lower() { CfiFunctionLinkage Linkage = P.second.Linkage; MDNode *FuncMD = P.second.FuncMD; Function *F = M.getFunction(FunctionName); + if (F && F->hasLocalLinkage()) { + // Locally defined function that happens to have the same name as a + // function defined in a ThinLTO module. Rename it to move it out of + // the way of the external reference that we're about to create. + // Note that setName will find a unique name for the function, so even + // if there is an existing function with the suffix there won't be a + // name collision. + F->setName(F->getName() + ".1"); + F = nullptr; + } + if (!F) F = Function::Create( FunctionType::get(Type::getVoidTy(M.getContext()), false), @@ -1871,24 +1946,26 @@ bool LowerTypeTestsModule::lower() { Types.clear(); GO.getMetadata(LLVMContext::MD_type, Types); - bool IsDefinition = !GO.isDeclarationForLinker(); + bool IsJumpTableCanonical = false; bool IsExported = false; if (Function *F = dyn_cast(&GO)) { + IsJumpTableCanonical = isJumpTableCanonical(F); if (ExportedFunctions.count(F->getName())) { - IsDefinition |= ExportedFunctions[F->getName()].Linkage == CFL_Definition; + IsJumpTableCanonical |= + ExportedFunctions[F->getName()].Linkage == CFL_Definition; IsExported = true; // TODO: The logic here checks only that the function is address taken, // not that the address takers are live. 
This can be updated to check // their liveness and emit fewer jumptable entries once monolithic LTO // builds also emit summaries. } else if (!F->hasAddressTaken()) { - if (!CrossDsoCfi || !IsDefinition || F->hasLocalLinkage()) + if (!CrossDsoCfi || !IsJumpTableCanonical || F->hasLocalLinkage()) continue; } } - auto *GTM = - GlobalTypeMember::create(Alloc, &GO, IsDefinition, IsExported, Types); + auto *GTM = GlobalTypeMember::create(Alloc, &GO, IsJumpTableCanonical, + IsExported, Types); GlobalTypeMembers[&GO] = GTM; for (MDNode *Type : Types) { verifyTypeMDNode(&GO, Type); diff --git a/lib/Transforms/IPO/MergeFunctions.cpp b/lib/Transforms/IPO/MergeFunctions.cpp index 3a08069dcd4a..8b9abaddc84c 100644 --- a/lib/Transforms/IPO/MergeFunctions.cpp +++ b/lib/Transforms/IPO/MergeFunctions.cpp @@ -769,7 +769,7 @@ void MergeFunctions::writeAlias(Function *F, Function *G) { PtrType->getElementType(), PtrType->getAddressSpace(), G->getLinkage(), "", BitcastF, G->getParent()); - F->setAlignment(std::max(F->getAlignment(), G->getAlignment())); + F->setAlignment(MaybeAlign(std::max(F->getAlignment(), G->getAlignment()))); GA->takeName(G); GA->setVisibility(G->getVisibility()); GA->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); @@ -816,7 +816,7 @@ void MergeFunctions::mergeTwoFunctions(Function *F, Function *G) { removeUsers(F); F->replaceAllUsesWith(NewF); - unsigned MaxAlignment = std::max(G->getAlignment(), NewF->getAlignment()); + MaybeAlign MaxAlignment(std::max(G->getAlignment(), NewF->getAlignment())); writeThunkOrAlias(F, G); writeThunkOrAlias(F, NewF); diff --git a/lib/Transforms/IPO/PartialInlining.cpp b/lib/Transforms/IPO/PartialInlining.cpp index 733782e8764d..e193074884af 100644 --- a/lib/Transforms/IPO/PartialInlining.cpp +++ b/lib/Transforms/IPO/PartialInlining.cpp @@ -409,7 +409,7 @@ PartialInlinerImpl::computeOutliningColdRegionsInfo(Function *F, return std::unique_ptr(); std::unique_ptr OutliningInfo = - llvm::make_unique(); + std::make_unique(); auto IsSingleEntry = [](SmallVectorImpl &BlockList) { BasicBlock *Dom = BlockList.front(); @@ -589,7 +589,7 @@ PartialInlinerImpl::computeOutliningInfo(Function *F) { }; std::unique_ptr OutliningInfo = - llvm::make_unique(); + std::make_unique(); BasicBlock *CurrEntry = EntryBlock; bool CandidateFound = false; @@ -966,7 +966,7 @@ PartialInlinerImpl::FunctionCloner::FunctionCloner( Function *F, FunctionOutliningInfo *OI, OptimizationRemarkEmitter &ORE, function_ref LookupAC) : OrigFunc(F), ORE(ORE), LookupAC(LookupAC) { - ClonedOI = llvm::make_unique(); + ClonedOI = std::make_unique(); // Clone the function, so that we can hack away on it. ValueToValueMapTy VMap; @@ -991,7 +991,7 @@ PartialInlinerImpl::FunctionCloner::FunctionCloner( OptimizationRemarkEmitter &ORE, function_ref LookupAC) : OrigFunc(F), ORE(ORE), LookupAC(LookupAC) { - ClonedOMRI = llvm::make_unique(); + ClonedOMRI = std::make_unique(); // Clone the function, so that we can hack away on it. ValueToValueMapTy VMap; @@ -1122,6 +1122,9 @@ bool PartialInlinerImpl::FunctionCloner::doMultiRegionFunctionOutlining() { BranchProbabilityInfo BPI(*ClonedFunc, LI); ClonedFuncBFI.reset(new BlockFrequencyInfo(*ClonedFunc, BPI, LI)); + // Cache and recycle the CodeExtractor analysis to avoid O(n^2) compile-time. 
+ CodeExtractorAnalysisCache CEAC(*ClonedFunc); + SetVector Inputs, Outputs, Sinks; for (FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegionInfo : ClonedOMRI->ORI) { @@ -1148,7 +1151,7 @@ bool PartialInlinerImpl::FunctionCloner::doMultiRegionFunctionOutlining() { if (Outputs.size() > 0 && !ForceLiveExit) continue; - Function *OutlinedFunc = CE.extractCodeRegion(); + Function *OutlinedFunc = CE.extractCodeRegion(CEAC); if (OutlinedFunc) { CallSite OCS = PartialInlinerImpl::getOneCallSiteTo(OutlinedFunc); @@ -1210,11 +1213,12 @@ PartialInlinerImpl::FunctionCloner::doSingleRegionFunctionOutlining() { } // Extract the body of the if. + CodeExtractorAnalysisCache CEAC(*ClonedFunc); Function *OutlinedFunc = CodeExtractor(ToExtract, &DT, /*AggregateArgs*/ false, ClonedFuncBFI.get(), &BPI, LookupAC(*ClonedFunc), /* AllowVarargs */ true) - .extractCodeRegion(); + .extractCodeRegion(CEAC); if (OutlinedFunc) { BasicBlock *OutliningCallBB = @@ -1264,7 +1268,7 @@ std::pair PartialInlinerImpl::unswitchFunction(Function *F) { if (PSI->isFunctionEntryCold(F)) return {false, nullptr}; - if (empty(F->users())) + if (F->users().empty()) return {false, nullptr}; OptimizationRemarkEmitter ORE(F); @@ -1370,7 +1374,7 @@ bool PartialInlinerImpl::tryPartialInline(FunctionCloner &Cloner) { return false; } - assert(empty(Cloner.OrigFunc->users()) && + assert(Cloner.OrigFunc->users().empty() && "F's users should all be replaced!"); std::vector Users(Cloner.ClonedFunc->user_begin(), diff --git a/lib/Transforms/IPO/PassManagerBuilder.cpp b/lib/Transforms/IPO/PassManagerBuilder.cpp index 3ea77f08fd3c..5314a8219b1e 100644 --- a/lib/Transforms/IPO/PassManagerBuilder.cpp +++ b/lib/Transforms/IPO/PassManagerBuilder.cpp @@ -654,6 +654,7 @@ void PassManagerBuilder::populateModulePassManager( MPM.add(createGlobalsAAWrapperPass()); MPM.add(createFloat2IntPass()); + MPM.add(createLowerConstantIntrinsicsPass()); addExtensionsToPM(EP_VectorizerStart, MPM); diff --git a/lib/Transforms/IPO/SCCP.cpp b/lib/Transforms/IPO/SCCP.cpp index 7be3608bd2ec..307690729b14 100644 --- a/lib/Transforms/IPO/SCCP.cpp +++ b/lib/Transforms/IPO/SCCP.cpp @@ -9,16 +9,18 @@ using namespace llvm; PreservedAnalyses IPSCCPPass::run(Module &M, ModuleAnalysisManager &AM) { const DataLayout &DL = M.getDataLayout(); - auto &TLI = AM.getResult(M); auto &FAM = AM.getResult(M).getManager(); + auto GetTLI = [&FAM](Function &F) -> const TargetLibraryInfo & { + return FAM.getResult(F); + }; auto getAnalysis = [&FAM](Function &F) -> AnalysisResultsForFn { DominatorTree &DT = FAM.getResult(F); return { - make_unique(F, DT, FAM.getResult(F)), + std::make_unique(F, DT, FAM.getResult(F)), &DT, FAM.getCachedResult(F)}; }; - if (!runIPSCCP(M, DL, &TLI, getAnalysis)) + if (!runIPSCCP(M, DL, GetTLI, getAnalysis)) return PreservedAnalyses::all(); PreservedAnalyses PA; @@ -47,14 +49,14 @@ public: if (skipModule(M)) return false; const DataLayout &DL = M.getDataLayout(); - const TargetLibraryInfo *TLI = - &getAnalysis().getTLI(); - + auto GetTLI = [this](Function &F) -> const TargetLibraryInfo & { + return this->getAnalysis().getTLI(F); + }; auto getAnalysis = [this](Function &F) -> AnalysisResultsForFn { DominatorTree &DT = this->getAnalysis(F).getDomTree(); return { - make_unique( + std::make_unique( F, DT, this->getAnalysis().getAssumptionCache( F)), @@ -62,7 +64,7 @@ public: nullptr}; // manager, so set them to nullptr. 
}; - return runIPSCCP(M, DL, TLI, getAnalysis); + return runIPSCCP(M, DL, GetTLI, getAnalysis); } void getAnalysisUsage(AnalysisUsage &AU) const override { diff --git a/lib/Transforms/IPO/SampleProfile.cpp b/lib/Transforms/IPO/SampleProfile.cpp index 877d20e72ffc..6184681db8a2 100644 --- a/lib/Transforms/IPO/SampleProfile.cpp +++ b/lib/Transforms/IPO/SampleProfile.cpp @@ -72,6 +72,7 @@ #include "llvm/Transforms/Instrumentation.h" #include "llvm/Transforms/Utils/CallPromotionUtils.h" #include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/Transforms/Utils/MisExpect.h" #include #include #include @@ -79,6 +80,7 @@ #include #include #include +#include #include #include #include @@ -128,6 +130,12 @@ static cl::opt ProfileSampleAccurate( "callsite and function as having 0 samples. Otherwise, treat " "un-sampled callsites and functions conservatively as unknown. ")); +static cl::opt ProfileAccurateForSymsInList( + "profile-accurate-for-symsinlist", cl::Hidden, cl::ZeroOrMore, + cl::init(true), + cl::desc("For symbols in profile symbol list, regard their profiles to " + "be accurate. It may be overriden by profile-sample-accurate. ")); + namespace { using BlockWeightMap = DenseMap; @@ -137,9 +145,11 @@ using EdgeWeightMap = DenseMap; using BlockEdgeMap = DenseMap>; +class SampleProfileLoader; + class SampleCoverageTracker { public: - SampleCoverageTracker() = default; + SampleCoverageTracker(SampleProfileLoader &SPL) : SPLoader(SPL){}; bool markSamplesUsed(const FunctionSamples *FS, uint32_t LineOffset, uint32_t Discriminator, uint64_t Samples); @@ -185,6 +195,76 @@ private: /// keyed by FunctionSamples pointers, but these stats are cleared after /// every function, so we just need to keep a single counter. uint64_t TotalUsedSamples = 0; + + SampleProfileLoader &SPLoader; +}; + +class GUIDToFuncNameMapper { +public: + GUIDToFuncNameMapper(Module &M, SampleProfileReader &Reader, + DenseMap &GUIDToFuncNameMap) + : CurrentReader(Reader), CurrentModule(M), + CurrentGUIDToFuncNameMap(GUIDToFuncNameMap) { + if (CurrentReader.getFormat() != SPF_Compact_Binary) + return; + + for (const auto &F : CurrentModule) { + StringRef OrigName = F.getName(); + CurrentGUIDToFuncNameMap.insert( + {Function::getGUID(OrigName), OrigName}); + + // Local to global var promotion used by optimization like thinlto + // will rename the var and add suffix like ".llvm.xxx" to the + // original local name. In sample profile, the suffixes of function + // names are all stripped. Since it is possible that the mapper is + // built in post-thin-link phase and var promotion has been done, + // we need to add the substring of function name without the suffix + // into the GUIDToFuncNameMap. + StringRef CanonName = FunctionSamples::getCanonicalFnName(F); + if (CanonName != OrigName) + CurrentGUIDToFuncNameMap.insert( + {Function::getGUID(CanonName), CanonName}); + } + + // Update GUIDToFuncNameMap for each function including inlinees. + SetGUIDToFuncNameMapForAll(&CurrentGUIDToFuncNameMap); + } + + ~GUIDToFuncNameMapper() { + if (CurrentReader.getFormat() != SPF_Compact_Binary) + return; + + CurrentGUIDToFuncNameMap.clear(); + + // Reset GUIDToFuncNameMap for of each function as they're no + // longer valid at this point. 
+ SetGUIDToFuncNameMapForAll(nullptr); + } + +private: + void SetGUIDToFuncNameMapForAll(DenseMap *Map) { + std::queue FSToUpdate; + for (auto &IFS : CurrentReader.getProfiles()) { + FSToUpdate.push(&IFS.second); + } + + while (!FSToUpdate.empty()) { + FunctionSamples *FS = FSToUpdate.front(); + FSToUpdate.pop(); + FS->GUIDToFuncNameMap = Map; + for (const auto &ICS : FS->getCallsiteSamples()) { + const FunctionSamplesMap &FSMap = ICS.second; + for (auto &IFS : FSMap) { + FunctionSamples &FS = const_cast(IFS.second); + FSToUpdate.push(&FS); + } + } + } + } + + SampleProfileReader &CurrentReader; + Module &CurrentModule; + DenseMap &CurrentGUIDToFuncNameMap; }; /// Sample profile pass. @@ -199,8 +279,9 @@ public: std::function GetAssumptionCache, std::function GetTargetTransformInfo) : GetAC(std::move(GetAssumptionCache)), - GetTTI(std::move(GetTargetTransformInfo)), Filename(Name), - RemappingFilename(RemapName), IsThinLTOPreLink(IsThinLTOPreLink) {} + GetTTI(std::move(GetTargetTransformInfo)), CoverageTracker(*this), + Filename(Name), RemappingFilename(RemapName), + IsThinLTOPreLink(IsThinLTOPreLink) {} bool doInitialization(Module &M); bool runOnModule(Module &M, ModuleAnalysisManager *AM, @@ -209,6 +290,8 @@ public: void dump() { Reader->dump(); } protected: + friend class SampleCoverageTracker; + bool runOnFunction(Function &F, ModuleAnalysisManager *AM); unsigned getFunctionLoc(Function &F); bool emitAnnotations(Function &F); @@ -237,6 +320,8 @@ protected: bool propagateThroughEdges(Function &F, bool UpdateBlockCount); void computeDominanceAndLoopInfo(Function &F); void clearFunctionData(); + bool callsiteIsHot(const FunctionSamples *CallsiteFS, + ProfileSummaryInfo *PSI); /// Map basic blocks to their computed weights. /// @@ -310,6 +395,10 @@ protected: /// Profile Summary Info computed from sample profile. ProfileSummaryInfo *PSI = nullptr; + /// Profle Symbol list tells whether a function name appears in the binary + /// used to generate the current profile. + std::unique_ptr PSL; + /// Total number of samples collected in this profile. /// /// This is the sum of all the samples collected in all the functions executed @@ -326,6 +415,21 @@ protected: uint64_t entryCount; }; DenseMap notInlinedCallInfo; + + // GUIDToFuncNameMap saves the mapping from GUID to the symbol name, for + // all the function symbols defined or declared in current module. + DenseMap GUIDToFuncNameMap; + + // All the Names used in FunctionSamples including outline function + // names, inline instance names and call target names. + StringSet<> NamesInProfile; + + // For symbol in profile symbol list, whether to regard their profiles + // to be accurate. It is mainly decided by existance of profile symbol + // list and -profile-accurate-for-symsinlist flag, but it can be + // overriden by -profile-sample-accurate or profile-sample-accurate + // attribute. + bool ProfAccForSymsInList; }; class SampleProfileLoaderLegacyPass : public ModulePass { @@ -381,14 +485,23 @@ private: /// To decide whether an inlined callsite is hot, we compare the callsite /// sample count with the hot cutoff computed by ProfileSummaryInfo, it is /// regarded as hot if the count is above the cutoff value. 
-static bool callsiteIsHot(const FunctionSamples *CallsiteFS, - ProfileSummaryInfo *PSI) { +/// +/// When ProfileAccurateForSymsInList is enabled and profile symbol list +/// is present, functions in the profile symbol list but without profile will +/// be regarded as cold and much less inlining will happen in CGSCC inlining +/// pass, so we tend to lower the hot criteria here to allow more early +/// inlining to happen for warm callsites and it is helpful for performance. +bool SampleProfileLoader::callsiteIsHot(const FunctionSamples *CallsiteFS, + ProfileSummaryInfo *PSI) { if (!CallsiteFS) return false; // The callsite was not inlined in the original binary. assert(PSI && "PSI is expected to be non null"); uint64_t CallsiteTotalSamples = CallsiteFS->getTotalSamples(); - return PSI->isHotCount(CallsiteTotalSamples); + if (ProfAccForSymsInList) + return !PSI->isColdCount(CallsiteTotalSamples); + else + return PSI->isHotCount(CallsiteTotalSamples); } /// Mark as used the sample record for the given function samples at @@ -425,7 +538,7 @@ SampleCoverageTracker::countUsedRecords(const FunctionSamples *FS, for (const auto &I : FS->getCallsiteSamples()) for (const auto &J : I.second) { const FunctionSamples *CalleeSamples = &J.second; - if (callsiteIsHot(CalleeSamples, PSI)) + if (SPLoader.callsiteIsHot(CalleeSamples, PSI)) Count += countUsedRecords(CalleeSamples, PSI); } @@ -444,7 +557,7 @@ SampleCoverageTracker::countBodyRecords(const FunctionSamples *FS, for (const auto &I : FS->getCallsiteSamples()) for (const auto &J : I.second) { const FunctionSamples *CalleeSamples = &J.second; - if (callsiteIsHot(CalleeSamples, PSI)) + if (SPLoader.callsiteIsHot(CalleeSamples, PSI)) Count += countBodyRecords(CalleeSamples, PSI); } @@ -465,7 +578,7 @@ SampleCoverageTracker::countBodySamples(const FunctionSamples *FS, for (const auto &I : FS->getCallsiteSamples()) for (const auto &J : I.second) { const FunctionSamples *CalleeSamples = &J.second; - if (callsiteIsHot(CalleeSamples, PSI)) + if (SPLoader.callsiteIsHot(CalleeSamples, PSI)) Total += countBodySamples(CalleeSamples, PSI); } @@ -788,6 +901,14 @@ bool SampleProfileLoader::inlineHotFunctions( Function &F, DenseSet &InlinedGUIDs) { DenseSet PromotedInsns; + // ProfAccForSymsInList is used in callsiteIsHot. The assertion makes sure + // Profile symbol list is ignored when profile-sample-accurate is on. + assert((!ProfAccForSymsInList || + (!ProfileSampleAccurate && + !F.hasFnAttribute("profile-sample-accurate"))) && + "ProfAccForSymsInList should be false when profile-sample-accurate " + "is enabled"); + DenseMap localNotInlinedCallSites; bool Changed = false; while (true) { @@ -1219,17 +1340,12 @@ void SampleProfileLoader::buildEdges(Function &F) { } /// Returns the sorted CallTargetMap \p M by count in descending order. 
-static SmallVector SortCallTargets( - const SampleRecord::CallTargetMap &M) { +static SmallVector GetSortedValueDataFromCallTargets( + const SampleRecord::CallTargetMap & M) { SmallVector R; - for (auto I = M.begin(); I != M.end(); ++I) - R.push_back({FunctionSamples::getGUID(I->getKey()), I->getValue()}); - llvm::sort(R, [](const InstrProfValueData &L, const InstrProfValueData &R) { - if (L.Count == R.Count) - return L.Value > R.Value; - else - return L.Count > R.Count; - }); + for (const auto &I : SampleRecord::SortCallTargets(M)) { + R.emplace_back(InstrProfValueData{FunctionSamples::getGUID(I.first), I.second}); + } return R; } @@ -1324,7 +1440,7 @@ void SampleProfileLoader::propagateWeights(Function &F) { if (!T || T.get().empty()) continue; SmallVector SortedCallTargets = - SortCallTargets(T.get()); + GetSortedValueDataFromCallTargets(T.get()); uint64_t Sum; findIndirectCallFunctionSamples(I, Sum); annotateValueSite(*I.getParent()->getParent()->getParent(), I, @@ -1374,6 +1490,8 @@ void SampleProfileLoader::propagateWeights(Function &F) { } } + misexpect::verifyMisExpect(TI, Weights, TI->getContext()); + uint64_t TempWeight; // Only set weights if there is at least one non-zero weight. // In any other case, let the analyzer set weights. @@ -1557,30 +1675,29 @@ INITIALIZE_PASS_END(SampleProfileLoaderLegacyPass, "sample-profile", bool SampleProfileLoader::doInitialization(Module &M) { auto &Ctx = M.getContext(); - auto ReaderOrErr = SampleProfileReader::create(Filename, Ctx); + + std::unique_ptr RemapReader; + auto ReaderOrErr = + SampleProfileReader::create(Filename, Ctx, RemappingFilename); if (std::error_code EC = ReaderOrErr.getError()) { std::string Msg = "Could not open profile: " + EC.message(); Ctx.diagnose(DiagnosticInfoSampleProfile(Filename, Msg)); return false; } Reader = std::move(ReaderOrErr.get()); - Reader->collectFuncsToUse(M); + Reader->collectFuncsFrom(M); ProfileIsValid = (Reader->read() == sampleprof_error::success); - - if (!RemappingFilename.empty()) { - // Apply profile remappings to the loaded profile data if requested. - // For now, we only support remapping symbols encoded using the Itanium - // C++ ABI's name mangling scheme. - ReaderOrErr = SampleProfileReaderItaniumRemapper::create( - RemappingFilename, Ctx, std::move(Reader)); - if (std::error_code EC = ReaderOrErr.getError()) { - std::string Msg = "Could not open profile remapping file: " + EC.message(); - Ctx.diagnose(DiagnosticInfoSampleProfile(Filename, Msg)); - return false; - } - Reader = std::move(ReaderOrErr.get()); - ProfileIsValid = (Reader->read() == sampleprof_error::success); + PSL = Reader->getProfileSymbolList(); + + // While profile-sample-accurate is on, ignore symbol list. 
+ ProfAccForSymsInList = + ProfileAccurateForSymsInList && PSL && !ProfileSampleAccurate; + if (ProfAccForSymsInList) { + NamesInProfile.clear(); + if (auto NameTable = Reader->getNameTable()) + NamesInProfile.insert(NameTable->begin(), NameTable->end()); } + return true; } @@ -1594,7 +1711,7 @@ ModulePass *llvm::createSampleProfileLoaderPass(StringRef Name) { bool SampleProfileLoader::runOnModule(Module &M, ModuleAnalysisManager *AM, ProfileSummaryInfo *_PSI) { - FunctionSamples::GUIDToFuncNameMapper Mapper(M); + GUIDToFuncNameMapper Mapper(M, *Reader, GUIDToFuncNameMap); if (!ProfileIsValid) return false; @@ -1651,19 +1768,48 @@ bool SampleProfileLoaderLegacyPass::runOnModule(Module &M) { } bool SampleProfileLoader::runOnFunction(Function &F, ModuleAnalysisManager *AM) { - + DILocation2SampleMap.clear(); // By default the entry count is initialized to -1, which will be treated // conservatively by getEntryCount as the same as unknown (None). This is // to avoid newly added code to be treated as cold. If we have samples // this will be overwritten in emitAnnotations. - // If ProfileSampleAccurate is true or F has profile-sample-accurate - // attribute, initialize the entry count to 0 so callsites or functions - // unsampled will be treated as cold. - uint64_t initialEntryCount = - (ProfileSampleAccurate || F.hasFnAttribute("profile-sample-accurate")) - ? 0 - : -1; + uint64_t initialEntryCount = -1; + + ProfAccForSymsInList = ProfileAccurateForSymsInList && PSL; + if (ProfileSampleAccurate || F.hasFnAttribute("profile-sample-accurate")) { + // initialize all the function entry counts to 0. It means all the + // functions without profile will be regarded as cold. + initialEntryCount = 0; + // profile-sample-accurate is a user assertion which has a higher precedence + // than symbol list. When profile-sample-accurate is on, ignore symbol list. + ProfAccForSymsInList = false; + } + + // PSL -- profile symbol list include all the symbols in sampled binary. + // If ProfileAccurateForSymsInList is enabled, PSL is used to treat + // old functions without samples being cold, without having to worry + // about new and hot functions being mistakenly treated as cold. + if (ProfAccForSymsInList) { + // Initialize the entry count to 0 for functions in the list. + if (PSL->contains(F.getName())) + initialEntryCount = 0; + + // Function in the symbol list but without sample will be regarded as + // cold. To minimize the potential negative performance impact it could + // have, we want to be a little conservative here saying if a function + // shows up in the profile, no matter as outline function, inline instance + // or call targets, treat the function as not being cold. This will handle + // the cases such as most callsites of a function are inlined in sampled + // binary but not inlined in current build (because of source code drift, + // imprecise debug information, or the callsites are all cold individually + // but not cold accumulatively...), so the outline function showing up as + // cold in sampled binary will actually not be cold after current build. 
+ StringRef CanonName = FunctionSamples::getCanonicalFnName(F); + if (NamesInProfile.count(CanonName)) + initialEntryCount = -1; + } + F.setEntryCount(ProfileCount(initialEntryCount, Function::PCT_Real)); std::unique_ptr OwnedORE; if (AM) { @@ -1672,7 +1818,7 @@ bool SampleProfileLoader::runOnFunction(Function &F, ModuleAnalysisManager *AM) .getManager(); ORE = &FAM.getResult(F); } else { - OwnedORE = make_unique(&F); + OwnedORE = std::make_unique(&F); ORE = OwnedORE.get(); } Samples = Reader->getSamplesFor(F); diff --git a/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp b/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp index 24c476376c14..690b5e8bf49e 100644 --- a/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp +++ b/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp @@ -24,6 +24,7 @@ #include "llvm/Transforms/IPO.h" #include "llvm/Transforms/IPO/FunctionAttrs.h" #include "llvm/Transforms/IPO/FunctionImport.h" +#include "llvm/Transforms/IPO/LowerTypeTests.h" #include "llvm/Transforms/Utils/Cloning.h" #include "llvm/Transforms/Utils/ModuleUtils.h" using namespace llvm; @@ -218,10 +219,18 @@ void splitAndWriteThinLTOBitcode( promoteTypeIds(M, ModuleId); - // Returns whether a global has attached type metadata. Such globals may - // participate in CFI or whole-program devirtualization, so they need to - // appear in the merged module instead of the thin LTO module. + // Returns whether a global or its associated global has attached type + // metadata. The former may participate in CFI or whole-program + // devirtualization, so they need to appear in the merged module instead of + // the thin LTO module. Similarly, globals that are associated with globals + // with type metadata need to appear in the merged module because they will + // reference the global's section directly. auto HasTypeMetadata = [](const GlobalObject *GO) { + if (MDNode *MD = GO->getMetadata(LLVMContext::MD_associated)) + if (auto *AssocVM = dyn_cast_or_null(MD->getOperand(0))) + if (auto *AssocGO = dyn_cast(AssocVM->getValue())) + if (AssocGO->hasMetadata(LLVMContext::MD_type)) + return true; return GO->hasMetadata(LLVMContext::MD_type); }; @@ -315,9 +324,9 @@ void splitAndWriteThinLTOBitcode( SmallVector Elts; Elts.push_back(MDString::get(Ctx, F.getName())); CfiFunctionLinkage Linkage; - if (!F.isDeclarationForLinker()) + if (lowertypetests::isJumpTableCanonical(&F)) Linkage = CFL_Definition; - else if (F.isWeakForLinker()) + else if (F.hasExternalWeakLinkage()) Linkage = CFL_WeakDeclaration; else Linkage = CFL_Declaration; @@ -457,7 +466,7 @@ void writeThinLTOBitcode(raw_ostream &OS, raw_ostream *ThinLinkOS, // splitAndWriteThinLTOBitcode). Just always build it once via the // buildModuleSummaryIndex when Module(s) are ready. ProfileSummaryInfo PSI(M); - NewIndex = llvm::make_unique( + NewIndex = std::make_unique( buildModuleSummaryIndex(M, nullptr, &PSI)); Index = NewIndex.get(); } diff --git a/lib/Transforms/IPO/WholeProgramDevirt.cpp b/lib/Transforms/IPO/WholeProgramDevirt.cpp index 6b6dd6194e17..f0cf5581ba8a 100644 --- a/lib/Transforms/IPO/WholeProgramDevirt.cpp +++ b/lib/Transforms/IPO/WholeProgramDevirt.cpp @@ -24,12 +24,14 @@ // returns 0, or a single vtable's function returns 1, replace each virtual // call with a comparison of the vptr against that vtable's address. // -// This pass is intended to be used during the regular and thin LTO pipelines. 
+// This pass is intended to be used during the regular and thin LTO pipelines: +// // During regular LTO, the pass determines the best optimization for each // virtual call and applies the resolutions directly to virtual calls that are // eligible for virtual call optimization (i.e. calls that use either of the -// llvm.assume(llvm.type.test) or llvm.type.checked.load intrinsics). During -// ThinLTO, the pass operates in two phases: +// llvm.assume(llvm.type.test) or llvm.type.checked.load intrinsics). +// +// During hybrid Regular/ThinLTO, the pass operates in two phases: // - Export phase: this is run during the thin link over a single merged module // that contains all vtables with !type metadata that participate in the link. // The pass computes a resolution for each virtual call and stores it in the @@ -38,6 +40,14 @@ // modules. The pass applies the resolutions previously computed during the // import phase to each eligible virtual call. // +// During ThinLTO, the pass operates in two phases: +// - Export phase: this is run during the thin link over the index which +// contains a summary of all vtables with !type metadata that participate in +// the link. It computes a resolution for each virtual call and stores it in +// the type identifier summary. Only single implementation devirtualization +// is supported. +// - Import phase: (same as with hybrid case above). +// //===----------------------------------------------------------------------===// #include "llvm/Transforms/IPO/WholeProgramDevirt.h" @@ -117,6 +127,11 @@ static cl::opt cl::desc("Maximum number of call targets per " "call site to enable branch funnels")); +static cl::opt + PrintSummaryDevirt("wholeprogramdevirt-print-index-based", cl::Hidden, + cl::init(false), cl::ZeroOrMore, + cl::desc("Print index-based devirtualization messages")); + // Find the minimum offset that we may store a value of size Size bits at. If // IsAfter is set, look for an offset before the object, otherwise look for an // offset after the object. @@ -265,6 +280,25 @@ template <> struct DenseMapInfo { } }; +template <> struct DenseMapInfo { + static VTableSlotSummary getEmptyKey() { + return {DenseMapInfo::getEmptyKey(), + DenseMapInfo::getEmptyKey()}; + } + static VTableSlotSummary getTombstoneKey() { + return {DenseMapInfo::getTombstoneKey(), + DenseMapInfo::getTombstoneKey()}; + } + static unsigned getHashValue(const VTableSlotSummary &I) { + return DenseMapInfo::getHashValue(I.TypeID) ^ + DenseMapInfo::getHashValue(I.ByteOffset); + } + static bool isEqual(const VTableSlotSummary &LHS, + const VTableSlotSummary &RHS) { + return LHS.TypeID == RHS.TypeID && LHS.ByteOffset == RHS.ByteOffset; + } +}; + } // end namespace llvm namespace { @@ -342,19 +376,21 @@ struct CallSiteInfo { /// pass the vector is non-empty, we will need to add a use of llvm.type.test /// to each of the function summaries in the vector. 
std::vector SummaryTypeCheckedLoadUsers; + std::vector SummaryTypeTestAssumeUsers; bool isExported() const { return SummaryHasTypeTestAssumeUsers || !SummaryTypeCheckedLoadUsers.empty(); } - void markSummaryHasTypeTestAssumeUsers() { - SummaryHasTypeTestAssumeUsers = true; + void addSummaryTypeCheckedLoadUser(FunctionSummary *FS) { + SummaryTypeCheckedLoadUsers.push_back(FS); AllCallSitesDevirted = false; } - void addSummaryTypeCheckedLoadUser(FunctionSummary *FS) { - SummaryTypeCheckedLoadUsers.push_back(FS); + void addSummaryTypeTestAssumeUser(FunctionSummary *FS) { + SummaryTypeTestAssumeUsers.push_back(FS); + SummaryHasTypeTestAssumeUsers = true; AllCallSitesDevirted = false; } @@ -456,7 +492,6 @@ struct DevirtModule { void buildTypeIdentifierMap( std::vector &Bits, DenseMap> &TypeIdMap); - Constant *getPointerAtOffset(Constant *I, uint64_t Offset); bool tryFindVirtualCallTargets(std::vector &TargetsForSlot, const std::set &TypeMemberInfos, @@ -464,7 +499,8 @@ struct DevirtModule { void applySingleImplDevirt(VTableSlotInfo &SlotInfo, Constant *TheFn, bool &IsExported); - bool trySingleImplDevirt(MutableArrayRef TargetsForSlot, + bool trySingleImplDevirt(ModuleSummaryIndex *ExportSummary, + MutableArrayRef TargetsForSlot, VTableSlotInfo &SlotInfo, WholeProgramDevirtResolution *Res); @@ -542,6 +578,38 @@ struct DevirtModule { function_ref LookupDomTree); }; +struct DevirtIndex { + ModuleSummaryIndex &ExportSummary; + // The set in which to record GUIDs exported from their module by + // devirtualization, used by client to ensure they are not internalized. + std::set &ExportedGUIDs; + // A map in which to record the information necessary to locate the WPD + // resolution for local targets in case they are exported by cross module + // importing. + std::map> &LocalWPDTargetsMap; + + MapVector CallSlots; + + DevirtIndex( + ModuleSummaryIndex &ExportSummary, + std::set &ExportedGUIDs, + std::map> &LocalWPDTargetsMap) + : ExportSummary(ExportSummary), ExportedGUIDs(ExportedGUIDs), + LocalWPDTargetsMap(LocalWPDTargetsMap) {} + + bool tryFindVirtualCallTargets(std::vector &TargetsForSlot, + const TypeIdCompatibleVtableInfo TIdInfo, + uint64_t ByteOffset); + + bool trySingleImplDevirt(MutableArrayRef TargetsForSlot, + VTableSlotSummary &SlotSummary, + VTableSlotInfo &SlotInfo, + WholeProgramDevirtResolution *Res, + std::set &DevirtTargets); + + void run(); +}; + struct WholeProgramDevirt : public ModulePass { static char ID; @@ -572,7 +640,7 @@ struct WholeProgramDevirt : public ModulePass { // an optimization remark emitter on the fly, when we need it. std::unique_ptr ORE; auto OREGetter = [&](Function *F) -> OptimizationRemarkEmitter & { - ORE = make_unique(F); + ORE = std::make_unique(F); return *ORE; }; @@ -632,6 +700,41 @@ PreservedAnalyses WholeProgramDevirtPass::run(Module &M, return PreservedAnalyses::none(); } +namespace llvm { +void runWholeProgramDevirtOnIndex( + ModuleSummaryIndex &Summary, std::set &ExportedGUIDs, + std::map> &LocalWPDTargetsMap) { + DevirtIndex(Summary, ExportedGUIDs, LocalWPDTargetsMap).run(); +} + +void updateIndexWPDForExports( + ModuleSummaryIndex &Summary, + function_ref isExported, + std::map> &LocalWPDTargetsMap) { + for (auto &T : LocalWPDTargetsMap) { + auto &VI = T.first; + // This was enforced earlier during trySingleImplDevirt. 
+ assert(VI.getSummaryList().size() == 1 && + "Devirt of local target has more than one copy"); + auto &S = VI.getSummaryList()[0]; + if (!isExported(S->modulePath(), VI.getGUID())) + continue; + + // It's been exported by a cross module import. + for (auto &SlotSummary : T.second) { + auto *TIdSum = Summary.getTypeIdSummary(SlotSummary.TypeID); + assert(TIdSum); + auto WPDRes = TIdSum->WPDRes.find(SlotSummary.ByteOffset); + assert(WPDRes != TIdSum->WPDRes.end()); + WPDRes->second.SingleImplName = ModuleSummaryIndex::getGlobalNameForLocal( + WPDRes->second.SingleImplName, + Summary.getModuleHash(S->modulePath())); + } + } +} + +} // end namespace llvm + bool DevirtModule::runForTesting( Module &M, function_ref AARGetter, function_ref OREGetter, @@ -662,7 +765,7 @@ bool DevirtModule::runForTesting( ExitOnError ExitOnErr( "-wholeprogramdevirt-write-summary: " + ClWriteSummary + ": "); std::error_code EC; - raw_fd_ostream OS(ClWriteSummary, EC, sys::fs::F_Text); + raw_fd_ostream OS(ClWriteSummary, EC, sys::fs::OF_Text); ExitOnErr(errorCodeToError(EC)); yaml::Output Out(OS); @@ -706,38 +809,6 @@ void DevirtModule::buildTypeIdentifierMap( } } -Constant *DevirtModule::getPointerAtOffset(Constant *I, uint64_t Offset) { - if (I->getType()->isPointerTy()) { - if (Offset == 0) - return I; - return nullptr; - } - - const DataLayout &DL = M.getDataLayout(); - - if (auto *C = dyn_cast(I)) { - const StructLayout *SL = DL.getStructLayout(C->getType()); - if (Offset >= SL->getSizeInBytes()) - return nullptr; - - unsigned Op = SL->getElementContainingOffset(Offset); - return getPointerAtOffset(cast(I->getOperand(Op)), - Offset - SL->getElementOffset(Op)); - } - if (auto *C = dyn_cast(I)) { - ArrayType *VTableTy = C->getType(); - uint64_t ElemSize = DL.getTypeAllocSize(VTableTy->getElementType()); - - unsigned Op = Offset / ElemSize; - if (Op >= C->getNumOperands()) - return nullptr; - - return getPointerAtOffset(cast(I->getOperand(Op)), - Offset % ElemSize); - } - return nullptr; -} - bool DevirtModule::tryFindVirtualCallTargets( std::vector &TargetsForSlot, const std::set &TypeMemberInfos, uint64_t ByteOffset) { @@ -746,7 +817,7 @@ bool DevirtModule::tryFindVirtualCallTargets( return false; Constant *Ptr = getPointerAtOffset(TM.Bits->GV->getInitializer(), - TM.Offset + ByteOffset); + TM.Offset + ByteOffset, M); if (!Ptr) return false; @@ -766,6 +837,34 @@ bool DevirtModule::tryFindVirtualCallTargets( return !TargetsForSlot.empty(); } +bool DevirtIndex::tryFindVirtualCallTargets( + std::vector &TargetsForSlot, const TypeIdCompatibleVtableInfo TIdInfo, + uint64_t ByteOffset) { + for (const TypeIdOffsetVtableInfo P : TIdInfo) { + // VTable initializer should have only one summary, or all copies must be + // linkonce/weak ODR. + assert(P.VTableVI.getSummaryList().size() == 1 || + llvm::all_of( + P.VTableVI.getSummaryList(), + [&](const std::unique_ptr &Summary) { + return GlobalValue::isLinkOnceODRLinkage(Summary->linkage()) || + GlobalValue::isWeakODRLinkage(Summary->linkage()); + })); + const auto *VS = cast(P.VTableVI.getSummaryList()[0].get()); + if (!P.VTableVI.getSummaryList()[0]->isLive()) + continue; + for (auto VTP : VS->vTableFuncs()) { + if (VTP.VTableOffset != P.AddressPointOffset + ByteOffset) + continue; + + TargetsForSlot.push_back(VTP.FuncVI); + } + } + + // Give up if we couldn't find any targets. 
+ return !TargetsForSlot.empty(); +} + void DevirtModule::applySingleImplDevirt(VTableSlotInfo &SlotInfo, Constant *TheFn, bool &IsExported) { auto Apply = [&](CallSiteInfo &CSInfo) { @@ -788,9 +887,38 @@ void DevirtModule::applySingleImplDevirt(VTableSlotInfo &SlotInfo, Apply(P.second); } +static bool AddCalls(VTableSlotInfo &SlotInfo, const ValueInfo &Callee) { + // We can't add calls if we haven't seen a definition + if (Callee.getSummaryList().empty()) + return false; + + // Insert calls into the summary index so that the devirtualized targets + // are eligible for import. + // FIXME: Annotate type tests with hotness. For now, mark these as hot + // to better ensure we have the opportunity to inline them. + bool IsExported = false; + auto &S = Callee.getSummaryList()[0]; + CalleeInfo CI(CalleeInfo::HotnessType::Hot, /* RelBF = */ 0); + auto AddCalls = [&](CallSiteInfo &CSInfo) { + for (auto *FS : CSInfo.SummaryTypeCheckedLoadUsers) { + FS->addCall({Callee, CI}); + IsExported |= S->modulePath() != FS->modulePath(); + } + for (auto *FS : CSInfo.SummaryTypeTestAssumeUsers) { + FS->addCall({Callee, CI}); + IsExported |= S->modulePath() != FS->modulePath(); + } + }; + AddCalls(SlotInfo.CSInfo); + for (auto &P : SlotInfo.ConstCSInfo) + AddCalls(P.second); + return IsExported; +} + bool DevirtModule::trySingleImplDevirt( - MutableArrayRef TargetsForSlot, - VTableSlotInfo &SlotInfo, WholeProgramDevirtResolution *Res) { + ModuleSummaryIndex *ExportSummary, + MutableArrayRef TargetsForSlot, VTableSlotInfo &SlotInfo, + WholeProgramDevirtResolution *Res) { // See if the program contains a single implementation of this virtual // function. Function *TheFn = TargetsForSlot[0].Fn; @@ -830,6 +958,10 @@ bool DevirtModule::trySingleImplDevirt( TheFn->setVisibility(GlobalValue::HiddenVisibility); TheFn->setName(NewName); } + if (ValueInfo TheFnVI = ExportSummary->getValueInfo(TheFn->getGUID())) + // Any needed promotion of 'TheFn' has already been done during + // LTO unit split, so we can ignore return value of AddCalls. + AddCalls(SlotInfo, TheFnVI); Res->TheKind = WholeProgramDevirtResolution::SingleImpl; Res->SingleImplName = TheFn->getName(); @@ -837,6 +969,63 @@ bool DevirtModule::trySingleImplDevirt( return true; } +bool DevirtIndex::trySingleImplDevirt(MutableArrayRef TargetsForSlot, + VTableSlotSummary &SlotSummary, + VTableSlotInfo &SlotInfo, + WholeProgramDevirtResolution *Res, + std::set &DevirtTargets) { + // See if the program contains a single implementation of this virtual + // function. + auto TheFn = TargetsForSlot[0]; + for (auto &&Target : TargetsForSlot) + if (TheFn != Target) + return false; + + // Don't devirtualize if we don't have target definition. + auto Size = TheFn.getSummaryList().size(); + if (!Size) + return false; + + // If the summary list contains multiple summaries where at least one is + // a local, give up, as we won't know which (possibly promoted) name to use. + for (auto &S : TheFn.getSummaryList()) + if (GlobalValue::isLocalLinkage(S->linkage()) && Size > 1) + return false; + + // Collect functions devirtualized at least for one call site for stats. + if (PrintSummaryDevirt) + DevirtTargets.insert(TheFn); + + auto &S = TheFn.getSummaryList()[0]; + bool IsExported = AddCalls(SlotInfo, TheFn); + if (IsExported) + ExportedGUIDs.insert(TheFn.getGUID()); + + // Record in summary for use in devirtualization during the ThinLTO import + // step. 
+ Res->TheKind = WholeProgramDevirtResolution::SingleImpl; + if (GlobalValue::isLocalLinkage(S->linkage())) { + if (IsExported) + // If target is a local function and we are exporting it by + // devirtualizing a call in another module, we need to record the + // promoted name. + Res->SingleImplName = ModuleSummaryIndex::getGlobalNameForLocal( + TheFn.name(), ExportSummary.getModuleHash(S->modulePath())); + else { + LocalWPDTargetsMap[TheFn].push_back(SlotSummary); + Res->SingleImplName = TheFn.name(); + } + } else + Res->SingleImplName = TheFn.name(); + + // Name will be empty if this thin link driven off of serialized combined + // index (e.g. llvm-lto). However, WPD is not supported/invoked for the + // legacy LTO API anyway. + assert(!Res->SingleImplName.empty()); + + return true; +} + void DevirtModule::tryICallBranchFunnel( MutableArrayRef TargetsForSlot, VTableSlotInfo &SlotInfo, WholeProgramDevirtResolution *Res, VTableSlot Slot) { @@ -1302,10 +1491,13 @@ void DevirtModule::rebuildGlobal(VTableBits &B) { if (B.Before.Bytes.empty() && B.After.Bytes.empty()) return; - // Align each byte array to pointer width. - unsigned PointerSize = M.getDataLayout().getPointerSize(); - B.Before.Bytes.resize(alignTo(B.Before.Bytes.size(), PointerSize)); - B.After.Bytes.resize(alignTo(B.After.Bytes.size(), PointerSize)); + // Align the before byte array to the global's minimum alignment so that we + // don't break any alignment requirements on the global. + MaybeAlign Alignment(B.GV->getAlignment()); + if (!Alignment) + Alignment = + Align(M.getDataLayout().getABITypeAlignment(B.GV->getValueType())); + B.Before.Bytes.resize(alignTo(B.Before.Bytes.size(), Alignment)); // Before was stored in reverse order; flip it now. for (size_t I = 0, Size = B.Before.Bytes.size(); I != Size / 2; ++I) @@ -1322,6 +1514,7 @@ void DevirtModule::rebuildGlobal(VTableBits &B) { GlobalVariable::PrivateLinkage, NewInit, "", B.GV); NewGV->setSection(B.GV->getSection()); NewGV->setComdat(B.GV->getComdat()); + NewGV->setAlignment(MaybeAlign(B.GV->getAlignment())); // Copy the original vtable's metadata to the anonymous global, adjusting // offsets as required. @@ -1483,8 +1676,11 @@ void DevirtModule::scanTypeCheckedLoadUsers(Function *TypeCheckedLoadFunc) { } void DevirtModule::importResolution(VTableSlot Slot, VTableSlotInfo &SlotInfo) { + auto *TypeId = dyn_cast(Slot.TypeID); + if (!TypeId) + return; const TypeIdSummary *TidSummary = - ImportSummary->getTypeIdSummary(cast(Slot.TypeID)->getString()); + ImportSummary->getTypeIdSummary(TypeId->getString()); if (!TidSummary) return; auto ResI = TidSummary->WPDRes.find(Slot.ByteOffset); @@ -1493,6 +1689,7 @@ void DevirtModule::importResolution(VTableSlot Slot, VTableSlotInfo &SlotInfo) { const WholeProgramDevirtResolution &Res = ResI->second; if (Res.TheKind == WholeProgramDevirtResolution::SingleImpl) { + assert(!Res.SingleImplName.empty()); // The type of the function in the declaration is irrelevant because every // call site will cast it to the correct type. Constant *SingleImpl = @@ -1627,8 +1824,7 @@ bool DevirtModule::run() { // FIXME: Only add live functions. 
for (FunctionSummary::VFuncId VF : FS->type_test_assume_vcalls()) { for (Metadata *MD : MetadataByGUID[VF.GUID]) { - CallSlots[{MD, VF.Offset}] - .CSInfo.markSummaryHasTypeTestAssumeUsers(); + CallSlots[{MD, VF.Offset}].CSInfo.addSummaryTypeTestAssumeUser(FS); } } for (FunctionSummary::VFuncId VF : FS->type_checked_load_vcalls()) { @@ -1641,7 +1837,7 @@ bool DevirtModule::run() { for (Metadata *MD : MetadataByGUID[VC.VFunc.GUID]) { CallSlots[{MD, VC.VFunc.Offset}] .ConstCSInfo[VC.Args] - .markSummaryHasTypeTestAssumeUsers(); + .addSummaryTypeTestAssumeUser(FS); } } for (const FunctionSummary::ConstVCall &VC : @@ -1673,7 +1869,7 @@ bool DevirtModule::run() { cast(S.first.TypeID)->getString()) .WPDRes[S.first.ByteOffset]; - if (!trySingleImplDevirt(TargetsForSlot, S.second, Res)) { + if (!trySingleImplDevirt(ExportSummary, TargetsForSlot, S.second, Res)) { DidVirtualConstProp |= tryVirtualConstProp(TargetsForSlot, S.second, Res, S.first); @@ -1710,7 +1906,7 @@ bool DevirtModule::run() { using namespace ore; OREGetter(F).emit(OptimizationRemark(DEBUG_TYPE, "Devirtualized", F) << "devirtualized " - << NV("FunctionName", F->getName())); + << NV("FunctionName", DT.first)); } } @@ -1722,5 +1918,86 @@ bool DevirtModule::run() { for (VTableBits &B : Bits) rebuildGlobal(B); + // We have lowered or deleted the type checked load intrinsics, so we no + // longer have enough information to reason about the liveness of virtual + // function pointers in GlobalDCE. + for (GlobalVariable &GV : M.globals()) + GV.eraseMetadata(LLVMContext::MD_vcall_visibility); + return true; } + +void DevirtIndex::run() { + if (ExportSummary.typeIdCompatibleVtableMap().empty()) + return; + + DenseMap> NameByGUID; + for (auto &P : ExportSummary.typeIdCompatibleVtableMap()) { + NameByGUID[GlobalValue::getGUID(P.first)].push_back(P.first); + } + + // Collect information from summary about which calls to try to devirtualize. + for (auto &P : ExportSummary) { + for (auto &S : P.second.SummaryList) { + auto *FS = dyn_cast(S.get()); + if (!FS) + continue; + // FIXME: Only add live functions. + for (FunctionSummary::VFuncId VF : FS->type_test_assume_vcalls()) { + for (StringRef Name : NameByGUID[VF.GUID]) { + CallSlots[{Name, VF.Offset}].CSInfo.addSummaryTypeTestAssumeUser(FS); + } + } + for (FunctionSummary::VFuncId VF : FS->type_checked_load_vcalls()) { + for (StringRef Name : NameByGUID[VF.GUID]) { + CallSlots[{Name, VF.Offset}].CSInfo.addSummaryTypeCheckedLoadUser(FS); + } + } + for (const FunctionSummary::ConstVCall &VC : + FS->type_test_assume_const_vcalls()) { + for (StringRef Name : NameByGUID[VC.VFunc.GUID]) { + CallSlots[{Name, VC.VFunc.Offset}] + .ConstCSInfo[VC.Args] + .addSummaryTypeTestAssumeUser(FS); + } + } + for (const FunctionSummary::ConstVCall &VC : + FS->type_checked_load_const_vcalls()) { + for (StringRef Name : NameByGUID[VC.VFunc.GUID]) { + CallSlots[{Name, VC.VFunc.Offset}] + .ConstCSInfo[VC.Args] + .addSummaryTypeCheckedLoadUser(FS); + } + } + } + } + + std::set DevirtTargets; + // For each (type, offset) pair: + for (auto &S : CallSlots) { + // Search each of the members of the type identifier for the virtual + // function implementation at offset S.first.ByteOffset, and add to + // TargetsForSlot. 
+ std::vector TargetsForSlot; + auto TidSummary = ExportSummary.getTypeIdCompatibleVtableSummary(S.first.TypeID); + assert(TidSummary); + if (tryFindVirtualCallTargets(TargetsForSlot, *TidSummary, + S.first.ByteOffset)) { + WholeProgramDevirtResolution *Res = + &ExportSummary.getOrInsertTypeIdSummary(S.first.TypeID) + .WPDRes[S.first.ByteOffset]; + + if (!trySingleImplDevirt(TargetsForSlot, S.first, S.second, Res, + DevirtTargets)) + continue; + } + } + + // Optionally have the thin link print message for each devirtualized + // function. + if (PrintSummaryDevirt) + for (const auto &DT : DevirtTargets) + errs() << "Devirtualized call to " << DT << "\n"; + + return; +} diff --git a/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/lib/Transforms/InstCombine/InstCombineAddSub.cpp index ba15b023f2a3..8bc34825f8a7 100644 --- a/lib/Transforms/InstCombine/InstCombineAddSub.cpp +++ b/lib/Transforms/InstCombine/InstCombineAddSub.cpp @@ -1097,6 +1097,107 @@ static Instruction *foldToUnsignedSaturatedAdd(BinaryOperator &I) { return nullptr; } +Instruction * +InstCombiner::canonicalizeCondSignextOfHighBitExtractToSignextHighBitExtract( + BinaryOperator &I) { + assert((I.getOpcode() == Instruction::Add || + I.getOpcode() == Instruction::Or || + I.getOpcode() == Instruction::Sub) && + "Expecting add/or/sub instruction"); + + // We have a subtraction/addition between a (potentially truncated) *logical* + // right-shift of X and a "select". + Value *X, *Select; + Instruction *LowBitsToSkip, *Extract; + if (!match(&I, m_c_BinOp(m_TruncOrSelf(m_CombineAnd( + m_LShr(m_Value(X), m_Instruction(LowBitsToSkip)), + m_Instruction(Extract))), + m_Value(Select)))) + return nullptr; + + // `add`/`or` is commutative; but for `sub`, "select" *must* be on RHS. + if (I.getOpcode() == Instruction::Sub && I.getOperand(1) != Select) + return nullptr; + + Type *XTy = X->getType(); + bool HadTrunc = I.getType() != XTy; + + // If there was a truncation of extracted value, then we'll need to produce + // one extra instruction, so we need to ensure one instruction will go away. + if (HadTrunc && !match(&I, m_c_BinOp(m_OneUse(m_Value()), m_Value()))) + return nullptr; + + // Extraction should extract high NBits bits, with shift amount calculated as: + // low bits to skip = shift bitwidth - high bits to extract + // The shift amount itself may be extended, and we need to look past zero-ext + // when matching NBits, that will matter for matching later. + Constant *C; + Value *NBits; + if (!match( + LowBitsToSkip, + m_ZExtOrSelf(m_Sub(m_Constant(C), m_ZExtOrSelf(m_Value(NBits))))) || + !match(C, m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_EQ, + APInt(C->getType()->getScalarSizeInBits(), + X->getType()->getScalarSizeInBits())))) + return nullptr; + + // Sign-extending value can be zero-extended if we `sub`tract it, + // or sign-extended otherwise. + auto SkipExtInMagic = [&I](Value *&V) { + if (I.getOpcode() == Instruction::Sub) + match(V, m_ZExtOrSelf(m_Value(V))); + else + match(V, m_SExtOrSelf(m_Value(V))); + }; + + // Now, finally validate the sign-extending magic. + // `select` itself may be appropriately extended, look past that. + SkipExtInMagic(Select); + + ICmpInst::Predicate Pred; + const APInt *Thr; + Value *SignExtendingValue, *Zero; + bool ShouldSignext; + // It must be a select between two values we will later establish to be a + // sign-extending value and a zero constant. The condition guarding the + // sign-extension must be based on a sign bit of the same X we had in `lshr`. 
+ if (!match(Select, m_Select(m_ICmp(Pred, m_Specific(X), m_APInt(Thr)), + m_Value(SignExtendingValue), m_Value(Zero))) || + !isSignBitCheck(Pred, *Thr, ShouldSignext)) + return nullptr; + + // icmp-select pair is commutative. + if (!ShouldSignext) + std::swap(SignExtendingValue, Zero); + + // If we should not perform sign-extension then we must add/or/subtract zero. + if (!match(Zero, m_Zero())) + return nullptr; + // Otherwise, it should be some constant, left-shifted by the same NBits we + // had in `lshr`. Said left-shift can also be appropriately extended. + // Again, we must look past zero-ext when looking for NBits. + SkipExtInMagic(SignExtendingValue); + Constant *SignExtendingValueBaseConstant; + if (!match(SignExtendingValue, + m_Shl(m_Constant(SignExtendingValueBaseConstant), + m_ZExtOrSelf(m_Specific(NBits))))) + return nullptr; + // If we `sub`, then the constant should be one, else it should be all-ones. + if (I.getOpcode() == Instruction::Sub + ? !match(SignExtendingValueBaseConstant, m_One()) + : !match(SignExtendingValueBaseConstant, m_AllOnes())) + return nullptr; + + auto *NewAShr = BinaryOperator::CreateAShr(X, LowBitsToSkip, + Extract->getName() + ".sext"); + NewAShr->copyIRFlags(Extract); // Preserve `exact`-ness. + if (!HadTrunc) + return NewAShr; + + Builder.Insert(NewAShr); + return TruncInst::CreateTruncOrBitCast(NewAShr, I.getType()); +} + Instruction *InstCombiner::visitAdd(BinaryOperator &I) { if (Value *V = SimplifyAddInst(I.getOperand(0), I.getOperand(1), I.hasNoSignedWrap(), I.hasNoUnsignedWrap(), @@ -1302,12 +1403,32 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) { if (Instruction *V = canonicalizeLowbitMask(I, Builder)) return V; + if (Instruction *V = + canonicalizeCondSignextOfHighBitExtractToSignextHighBitExtract(I)) + return V; + if (Instruction *SatAdd = foldToUnsignedSaturatedAdd(I)) return SatAdd; return Changed ? &I : nullptr; } +/// Eliminate an op from a linear interpolation (lerp) pattern. +static Instruction *factorizeLerp(BinaryOperator &I, + InstCombiner::BuilderTy &Builder) { + Value *X, *Y, *Z; + if (!match(&I, m_c_FAdd(m_OneUse(m_c_FMul(m_Value(Y), + m_OneUse(m_FSub(m_FPOne(), + m_Value(Z))))), + m_OneUse(m_c_FMul(m_Value(X), m_Deferred(Z)))))) + return nullptr; + + // (Y * (1.0 - Z)) + (X * Z) --> Y + Z * (X - Y) [8 commuted variants] + Value *XY = Builder.CreateFSubFMF(X, Y, &I); + Value *MulZ = Builder.CreateFMulFMF(Z, XY, &I); + return BinaryOperator::CreateFAddFMF(Y, MulZ, &I); +} + /// Factor a common operand out of fadd/fsub of fmul/fdiv. 
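factorizeLerp, added just above, trades the two-multiply interpolation form Y*(1.0 - Z) + X*Z for Y + Z*(X - Y). The two expressions are equal over the reals but can round differently in IEEE arithmetic, which is why the caller only applies the fold under reassoc/nsz fast-math flags. A small numeric sanity check (plain C++; the helper names lerp2mul/lerp1mul are illustrative, and the comparison is tolerance-based for exactly that rounding reason):

#include <cassert>
#include <cmath>

static double lerp2mul(double x, double y, double z) { return y * (1.0 - z) + x * z; }
static double lerp1mul(double x, double y, double z) { return y + z * (x - y); }

int main() {
  const double xs[] = {0.0, 1.0, -3.5, 42.0, 1e9};
  const double zs[] = {0.0, 0.25, 0.5, 0.75, 1.0};
  for (double x : xs)
    for (double y : xs)
      for (double z : zs) {
        double a = lerp2mul(x, y, z), b = lerp1mul(x, y, z);
        // Algebraically identical; allow a small relative slack for rounding.
        assert(std::fabs(a - b) <= 1e-6 * (1.0 + std::fabs(a)));
      }
  return 0;
}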
static Instruction *factorizeFAddFSub(BinaryOperator &I, InstCombiner::BuilderTy &Builder) { @@ -1315,6 +1436,10 @@ static Instruction *factorizeFAddFSub(BinaryOperator &I, I.getOpcode() == Instruction::FSub) && "Expecting fadd/fsub"); assert(I.hasAllowReassoc() && I.hasNoSignedZeros() && "FP factorization requires FMF"); + + if (Instruction *Lerp = factorizeLerp(I, Builder)) + return Lerp; + Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); Value *X, *Y, *Z; bool IsFMul; @@ -1362,17 +1487,32 @@ Instruction *InstCombiner::visitFAdd(BinaryOperator &I) { if (Instruction *FoldedFAdd = foldBinOpIntoSelectOrPhi(I)) return FoldedFAdd; - Value *LHS = I.getOperand(0), *RHS = I.getOperand(1); - Value *X; // (-X) + Y --> Y - X - if (match(LHS, m_FNeg(m_Value(X)))) - return BinaryOperator::CreateFSubFMF(RHS, X, &I); - // Y + (-X) --> Y - X - if (match(RHS, m_FNeg(m_Value(X)))) - return BinaryOperator::CreateFSubFMF(LHS, X, &I); + Value *X, *Y; + if (match(&I, m_c_FAdd(m_FNeg(m_Value(X)), m_Value(Y)))) + return BinaryOperator::CreateFSubFMF(Y, X, &I); + + // Similar to above, but look through fmul/fdiv for the negated term. + // (-X * Y) + Z --> Z - (X * Y) [4 commuted variants] + Value *Z; + if (match(&I, m_c_FAdd(m_OneUse(m_c_FMul(m_FNeg(m_Value(X)), m_Value(Y))), + m_Value(Z)))) { + Value *XY = Builder.CreateFMulFMF(X, Y, &I); + return BinaryOperator::CreateFSubFMF(Z, XY, &I); + } + // (-X / Y) + Z --> Z - (X / Y) [2 commuted variants] + // (X / -Y) + Z --> Z - (X / Y) [2 commuted variants] + if (match(&I, m_c_FAdd(m_OneUse(m_FDiv(m_FNeg(m_Value(X)), m_Value(Y))), + m_Value(Z))) || + match(&I, m_c_FAdd(m_OneUse(m_FDiv(m_Value(X), m_FNeg(m_Value(Y)))), + m_Value(Z)))) { + Value *XY = Builder.CreateFDivFMF(X, Y, &I); + return BinaryOperator::CreateFSubFMF(Z, XY, &I); + } // Check for (fadd double (sitofp x), y), see if we can merge this into an // integer add followed by a promotion. 
+ Value *LHS = I.getOperand(0), *RHS = I.getOperand(1); if (SIToFPInst *LHSConv = dyn_cast(LHS)) { Value *LHSIntVal = LHSConv->getOperand(0); Type *FPType = LHSConv->getType(); @@ -1631,37 +1771,50 @@ Instruction *InstCombiner::visitSub(BinaryOperator &I) { const APInt *Op0C; if (match(Op0, m_APInt(Op0C))) { - unsigned BitWidth = I.getType()->getScalarSizeInBits(); - // -(X >>u 31) -> (X >>s 31) - // -(X >>s 31) -> (X >>u 31) if (Op0C->isNullValue()) { + Value *Op1Wide; + match(Op1, m_TruncOrSelf(m_Value(Op1Wide))); + bool HadTrunc = Op1Wide != Op1; + bool NoTruncOrTruncIsOneUse = !HadTrunc || Op1->hasOneUse(); + unsigned BitWidth = Op1Wide->getType()->getScalarSizeInBits(); + Value *X; const APInt *ShAmt; - if (match(Op1, m_LShr(m_Value(X), m_APInt(ShAmt))) && + // -(X >>u 31) -> (X >>s 31) + if (NoTruncOrTruncIsOneUse && + match(Op1Wide, m_LShr(m_Value(X), m_APInt(ShAmt))) && *ShAmt == BitWidth - 1) { - Value *ShAmtOp = cast(Op1)->getOperand(1); - return BinaryOperator::CreateAShr(X, ShAmtOp); + Value *ShAmtOp = cast(Op1Wide)->getOperand(1); + Instruction *NewShift = BinaryOperator::CreateAShr(X, ShAmtOp); + NewShift->copyIRFlags(Op1Wide); + if (!HadTrunc) + return NewShift; + Builder.Insert(NewShift); + return TruncInst::CreateTruncOrBitCast(NewShift, Op1->getType()); } - if (match(Op1, m_AShr(m_Value(X), m_APInt(ShAmt))) && + // -(X >>s 31) -> (X >>u 31) + if (NoTruncOrTruncIsOneUse && + match(Op1Wide, m_AShr(m_Value(X), m_APInt(ShAmt))) && *ShAmt == BitWidth - 1) { - Value *ShAmtOp = cast(Op1)->getOperand(1); - return BinaryOperator::CreateLShr(X, ShAmtOp); + Value *ShAmtOp = cast(Op1Wide)->getOperand(1); + Instruction *NewShift = BinaryOperator::CreateLShr(X, ShAmtOp); + NewShift->copyIRFlags(Op1Wide); + if (!HadTrunc) + return NewShift; + Builder.Insert(NewShift); + return TruncInst::CreateTruncOrBitCast(NewShift, Op1->getType()); } - if (Op1->hasOneUse()) { + if (!HadTrunc && Op1->hasOneUse()) { Value *LHS, *RHS; SelectPatternFlavor SPF = matchSelectPattern(Op1, LHS, RHS).Flavor; if (SPF == SPF_ABS || SPF == SPF_NABS) { // This is a negate of an ABS/NABS pattern. Just swap the operands // of the select. - SelectInst *SI = cast(Op1); - Value *TrueVal = SI->getTrueValue(); - Value *FalseVal = SI->getFalseValue(); - SI->setTrueValue(FalseVal); - SI->setFalseValue(TrueVal); + cast(Op1)->swapValues(); // Don't swap prof metadata, we didn't change the branch behavior. 
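Two groups of visitSub folds meet here: the generalization above lets "-(X >>u bitwidth-1) -> (X >>s bitwidth-1)" (and its signed/unsigned dual) look through a truncation of the shifted value, and the hunk just below adds subtractions of bitwise ops such as (A | B) - (A & B) -> A ^ B. Both rest on small bit-level identities: the shift by bitwidth-1 produces only 0/1 (or 0/-1), and A|B carries exactly the bits of A&B plus the disjoint bits of A^B, so the subtractions never borrow across those sets. A standalone exhaustive check at 8 bits (assumes arithmetic right shift for signed values, guaranteed since C++20):

#include <cassert>
#include <cstdint>

int main() {
  // -(X >>u 7) == (X >>s 7) and -(X >>s 7) == (X >>u 7).
  for (unsigned x = 0; x < 256; ++x) {
    uint8_t U = uint8_t(x);
    uint8_t Lshr = uint8_t(U >> 7);              // 0 or 1
    uint8_t Ashr = uint8_t(int8_t(U) >> 7);      // 0 or 0xFF
    assert(uint8_t(-Lshr) == Ashr && uint8_t(-Ashr) == Lshr);
  }
  // Subtraction of bitwise ops (the folds added just below).
  for (unsigned a = 0; a < 256; ++a)
    for (unsigned b = 0; b < 256; ++b) {
      uint8_t A = uint8_t(a), B = uint8_t(b);
      assert(uint8_t((A | B) - (A & B)) == uint8_t(A ^ B));
      assert(uint8_t((A & B) - (A | B)) == uint8_t(-(A ^ B)));
      assert(uint8_t((A | B) - (A ^ B)) == uint8_t(A & B));
      assert(uint8_t((A ^ B) - (A | B)) == uint8_t(-(A & B)));
    }
  return 0;
}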
- return replaceInstUsesWith(I, SI); + return replaceInstUsesWith(I, Op1); } } } @@ -1686,6 +1839,23 @@ Instruction *InstCombiner::visitSub(BinaryOperator &I) { return BinaryOperator::CreateNeg(Y); } + // (sub (or A, B) (and A, B)) --> (xor A, B) + { + Value *A, *B; + if (match(Op1, m_And(m_Value(A), m_Value(B))) && + match(Op0, m_c_Or(m_Specific(A), m_Specific(B)))) + return BinaryOperator::CreateXor(A, B); + } + + // (sub (and A, B) (or A, B)) --> neg (xor A, B) + { + Value *A, *B; + if (match(Op0, m_And(m_Value(A), m_Value(B))) && + match(Op1, m_c_Or(m_Specific(A), m_Specific(B))) && + (Op0->hasOneUse() || Op1->hasOneUse())) + return BinaryOperator::CreateNeg(Builder.CreateXor(A, B)); + } + // (sub (or A, B), (xor A, B)) --> (and A, B) { Value *A, *B; @@ -1694,6 +1864,15 @@ Instruction *InstCombiner::visitSub(BinaryOperator &I) { return BinaryOperator::CreateAnd(A, B); } + // (sub (xor A, B) (or A, B)) --> neg (and A, B) + { + Value *A, *B; + if (match(Op0, m_Xor(m_Value(A), m_Value(B))) && + match(Op1, m_c_Or(m_Specific(A), m_Specific(B))) && + (Op0->hasOneUse() || Op1->hasOneUse())) + return BinaryOperator::CreateNeg(Builder.CreateAnd(A, B)); + } + { Value *Y; // ((X | Y) - X) --> (~X & Y) @@ -1778,7 +1957,7 @@ Instruction *InstCombiner::visitSub(BinaryOperator &I) { std::swap(LHS, RHS); // LHS is now O above and expected to have at least 2 uses (the min/max) // NotA is epected to have 2 uses from the min/max and 1 from the sub. - if (IsFreeToInvert(LHS, !LHS->hasNUsesOrMore(3)) && + if (isFreeToInvert(LHS, !LHS->hasNUsesOrMore(3)) && !NotA->hasNUsesOrMore(4)) { // Note: We don't generate the inverse max/min, just create the not of // it and let other folds do the rest. @@ -1826,6 +2005,10 @@ Instruction *InstCombiner::visitSub(BinaryOperator &I) { return SelectInst::Create(Cmp, Neg, A); } + if (Instruction *V = + canonicalizeCondSignextOfHighBitExtractToSignextHighBitExtract(I)) + return V; + if (Instruction *Ext = narrowMathIfNoOverflow(I)) return Ext; @@ -1865,6 +2048,22 @@ static Instruction *foldFNegIntoConstant(Instruction &I) { return nullptr; } +static Instruction *hoistFNegAboveFMulFDiv(Instruction &I, + InstCombiner::BuilderTy &Builder) { + Value *FNeg; + if (!match(&I, m_FNeg(m_Value(FNeg)))) + return nullptr; + + Value *X, *Y; + if (match(FNeg, m_OneUse(m_FMul(m_Value(X), m_Value(Y))))) + return BinaryOperator::CreateFMulFMF(Builder.CreateFNegFMF(X, &I), Y, &I); + + if (match(FNeg, m_OneUse(m_FDiv(m_Value(X), m_Value(Y))))) + return BinaryOperator::CreateFDivFMF(Builder.CreateFNegFMF(X, &I), Y, &I); + + return nullptr; +} + Instruction *InstCombiner::visitFNeg(UnaryOperator &I) { Value *Op = I.getOperand(0); @@ -1882,6 +2081,9 @@ Instruction *InstCombiner::visitFNeg(UnaryOperator &I) { match(Op, m_OneUse(m_FSub(m_Value(X), m_Value(Y))))) return BinaryOperator::CreateFSubFMF(Y, X, &I); + if (Instruction *R = hoistFNegAboveFMulFDiv(I, Builder)) + return R; + return nullptr; } @@ -1903,6 +2105,9 @@ Instruction *InstCombiner::visitFSub(BinaryOperator &I) { if (Instruction *X = foldFNegIntoConstant(I)) return X; + if (Instruction *R = hoistFNegAboveFMulFDiv(I, Builder)) + return R; + Value *X, *Y; Constant *C; @@ -1944,6 +2149,21 @@ Instruction *InstCombiner::visitFSub(BinaryOperator &I) { if (match(Op1, m_OneUse(m_FPExt(m_FNeg(m_Value(Y)))))) return BinaryOperator::CreateFAddFMF(Op0, Builder.CreateFPExt(Y, Ty), &I); + // Similar to above, but look through fmul/fdiv of the negated value: + // Op0 - (-X * Y) --> Op0 + (X * Y) + // Op0 - (Y * -X) --> Op0 + (X * Y) + if 
(match(Op1, m_OneUse(m_c_FMul(m_FNeg(m_Value(X)), m_Value(Y))))) { + Value *FMul = Builder.CreateFMulFMF(X, Y, &I); + return BinaryOperator::CreateFAddFMF(Op0, FMul, &I); + } + // Op0 - (-X / Y) --> Op0 + (X / Y) + // Op0 - (X / -Y) --> Op0 + (X / Y) + if (match(Op1, m_OneUse(m_FDiv(m_FNeg(m_Value(X)), m_Value(Y)))) || + match(Op1, m_OneUse(m_FDiv(m_Value(X), m_FNeg(m_Value(Y)))))) { + Value *FDiv = Builder.CreateFDivFMF(X, Y, &I); + return BinaryOperator::CreateFAddFMF(Op0, FDiv, &I); + } + // Handle special cases for FSub with selects feeding the operation if (Value *V = SimplifySelectsFeedingBinaryOp(I, Op0, Op1)) return replaceInstUsesWith(I, V); diff --git a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp index 2b9859b602f4..4a30b60ca931 100644 --- a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp +++ b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp @@ -160,16 +160,14 @@ Instruction *InstCombiner::OptAndOp(BinaryOperator *Op, } /// Emit a computation of: (V >= Lo && V < Hi) if Inside is true, otherwise -/// (V < Lo || V >= Hi). This method expects that Lo <= Hi. IsSigned indicates +/// (V < Lo || V >= Hi). This method expects that Lo < Hi. IsSigned indicates /// whether to treat V, Lo, and Hi as signed or not. Value *InstCombiner::insertRangeTest(Value *V, const APInt &Lo, const APInt &Hi, bool isSigned, bool Inside) { - assert((isSigned ? Lo.sle(Hi) : Lo.ule(Hi)) && - "Lo is not <= Hi in range emission code!"); + assert((isSigned ? Lo.slt(Hi) : Lo.ult(Hi)) && + "Lo is not < Hi in range emission code!"); Type *Ty = V->getType(); - if (Lo == Hi) - return Inside ? ConstantInt::getFalse(Ty) : ConstantInt::getTrue(Ty); // V >= Min && V < Hi --> V < Hi // V < Min || V >= Hi --> V >= Hi @@ -1051,9 +1049,103 @@ static Value *foldIsPowerOf2(ICmpInst *Cmp0, ICmpInst *Cmp1, bool JoinedByAnd, return nullptr; } +/// Commuted variants are assumed to be handled by calling this function again +/// with the parameters swapped. +static Value *foldUnsignedUnderflowCheck(ICmpInst *ZeroICmp, + ICmpInst *UnsignedICmp, bool IsAnd, + const SimplifyQuery &Q, + InstCombiner::BuilderTy &Builder) { + Value *ZeroCmpOp; + ICmpInst::Predicate EqPred; + if (!match(ZeroICmp, m_ICmp(EqPred, m_Value(ZeroCmpOp), m_Zero())) || + !ICmpInst::isEquality(EqPred)) + return nullptr; + + auto IsKnownNonZero = [&](Value *V) { + return isKnownNonZero(V, Q.DL, /*Depth=*/0, Q.AC, Q.CxtI, Q.DT); + }; + + ICmpInst::Predicate UnsignedPred; + + Value *A, *B; + if (match(UnsignedICmp, + m_c_ICmp(UnsignedPred, m_Specific(ZeroCmpOp), m_Value(A))) && + match(ZeroCmpOp, m_c_Add(m_Specific(A), m_Value(B))) && + (ZeroICmp->hasOneUse() || UnsignedICmp->hasOneUse())) { + if (UnsignedICmp->getOperand(0) != ZeroCmpOp) + UnsignedPred = ICmpInst::getSwappedPredicate(UnsignedPred); + + auto GetKnownNonZeroAndOther = [&](Value *&NonZero, Value *&Other) { + if (!IsKnownNonZero(NonZero)) + std::swap(NonZero, Other); + return IsKnownNonZero(NonZero); + }; + + // Given ZeroCmpOp = (A + B) + // ZeroCmpOp <= A && ZeroCmpOp != 0 --> (0-B) < A + // ZeroCmpOp > A || ZeroCmpOp == 0 --> (0-B) >= A + // + // ZeroCmpOp < A && ZeroCmpOp != 0 --> (0-X) < Y iff + // ZeroCmpOp >= A || ZeroCmpOp == 0 --> (0-X) >= Y iff + // with X being the value (A/B) that is known to be non-zero, + // and Y being remaining value. 
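The comment above states the add-form rewrites performed by foldUnsignedUnderflowCheck, e.g. "(A+B) u<= A && (A+B) != 0 --> (0-B) u< A" together with its or/== dual. These are plain modular-arithmetic facts and can be verified exhaustively at 8 bits (standalone C++, not part of the pass):

#include <cassert>
#include <cstdint>

int main() {
  for (unsigned a = 0; a < 256; ++a)
    for (unsigned b = 0; b < 256; ++b) {
      uint8_t A = uint8_t(a), B = uint8_t(b);
      uint8_t Sum = uint8_t(A + B);    // (A + B) with unsigned wrap
      uint8_t NegB = uint8_t(-B);      // (0 - B) with unsigned wrap
      // (A+B) u<= A && (A+B) != 0  <=>  (0-B) u< A
      assert(((Sum <= A) && (Sum != 0)) == (NegB < A));
      // (A+B) u> A  || (A+B) == 0  <=>  (0-B) u>= A   (the 'or' dual)
      assert(((Sum > A) || (Sum == 0)) == (NegB >= A));
    }
  return 0;
}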
+ if (UnsignedPred == ICmpInst::ICMP_ULE && EqPred == ICmpInst::ICMP_NE && + IsAnd) + return Builder.CreateICmpULT(Builder.CreateNeg(B), A); + if (UnsignedPred == ICmpInst::ICMP_ULT && EqPred == ICmpInst::ICMP_NE && + IsAnd && GetKnownNonZeroAndOther(B, A)) + return Builder.CreateICmpULT(Builder.CreateNeg(B), A); + if (UnsignedPred == ICmpInst::ICMP_UGT && EqPred == ICmpInst::ICMP_EQ && + !IsAnd) + return Builder.CreateICmpUGE(Builder.CreateNeg(B), A); + if (UnsignedPred == ICmpInst::ICMP_UGE && EqPred == ICmpInst::ICMP_EQ && + !IsAnd && GetKnownNonZeroAndOther(B, A)) + return Builder.CreateICmpUGE(Builder.CreateNeg(B), A); + } + + Value *Base, *Offset; + if (!match(ZeroCmpOp, m_Sub(m_Value(Base), m_Value(Offset)))) + return nullptr; + + if (!match(UnsignedICmp, + m_c_ICmp(UnsignedPred, m_Specific(Base), m_Specific(Offset))) || + !ICmpInst::isUnsigned(UnsignedPred)) + return nullptr; + if (UnsignedICmp->getOperand(0) != Base) + UnsignedPred = ICmpInst::getSwappedPredicate(UnsignedPred); + + // Base >=/> Offset && (Base - Offset) != 0 <--> Base > Offset + // (no overflow and not null) + if ((UnsignedPred == ICmpInst::ICMP_UGE || + UnsignedPred == ICmpInst::ICMP_UGT) && + EqPred == ICmpInst::ICMP_NE && IsAnd) + return Builder.CreateICmpUGT(Base, Offset); + + // Base <=/< Offset || (Base - Offset) == 0 <--> Base <= Offset + // (overflow or null) + if ((UnsignedPred == ICmpInst::ICMP_ULE || + UnsignedPred == ICmpInst::ICMP_ULT) && + EqPred == ICmpInst::ICMP_EQ && !IsAnd) + return Builder.CreateICmpULE(Base, Offset); + + // Base <= Offset && (Base - Offset) != 0 --> Base < Offset + if (UnsignedPred == ICmpInst::ICMP_ULE && EqPred == ICmpInst::ICMP_NE && + IsAnd) + return Builder.CreateICmpULT(Base, Offset); + + // Base > Offset || (Base - Offset) == 0 --> Base >= Offset + if (UnsignedPred == ICmpInst::ICMP_UGT && EqPred == ICmpInst::ICMP_EQ && + !IsAnd) + return Builder.CreateICmpUGE(Base, Offset); + + return nullptr; +} + /// Fold (icmp)&(icmp) if possible. Value *InstCombiner::foldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS, Instruction &CxtI) { + const SimplifyQuery Q = SQ.getWithInstruction(&CxtI); + // Fold (!iszero(A & K1) & !iszero(A & K2)) -> (A & (K1 | K2)) == (K1 | K2) // if K1 and K2 are a one-bit mask. if (Value *V = foldAndOrOfICmpsOfAndWithPow2(LHS, RHS, true, CxtI)) @@ -1096,6 +1188,13 @@ Value *InstCombiner::foldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS, if (Value *V = foldIsPowerOf2(LHS, RHS, true /* JoinedByAnd */, Builder)) return V; + if (Value *X = + foldUnsignedUnderflowCheck(LHS, RHS, /*IsAnd=*/true, Q, Builder)) + return X; + if (Value *X = + foldUnsignedUnderflowCheck(RHS, LHS, /*IsAnd=*/true, Q, Builder)) + return X; + // This only handles icmp of constants: (icmp1 A, C1) & (icmp2 B, C2). 
Value *LHS0 = LHS->getOperand(0), *RHS0 = RHS->getOperand(0); ConstantInt *LHSC = dyn_cast(LHS->getOperand(1)); @@ -1196,16 +1295,22 @@ Value *InstCombiner::foldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS, default: llvm_unreachable("Unknown integer condition code!"); case ICmpInst::ICMP_ULT: - if (LHSC == SubOne(RHSC)) // (X != 13 & X u< 14) -> X < 13 + // (X != 13 & X u< 14) -> X < 13 + if (LHSC->getValue() == (RHSC->getValue() - 1)) return Builder.CreateICmpULT(LHS0, LHSC); - if (LHSC->isZero()) // (X != 0 & X u< 14) -> X-1 u< 13 + if (LHSC->isZero()) // (X != 0 & X u< C) -> X-1 u< C-1 return insertRangeTest(LHS0, LHSC->getValue() + 1, RHSC->getValue(), false, true); break; // (X != 13 & X u< 15) -> no change case ICmpInst::ICMP_SLT: - if (LHSC == SubOne(RHSC)) // (X != 13 & X s< 14) -> X < 13 + // (X != 13 & X s< 14) -> X < 13 + if (LHSC->getValue() == (RHSC->getValue() - 1)) return Builder.CreateICmpSLT(LHS0, LHSC); - break; // (X != 13 & X s< 15) -> no change + // (X != INT_MIN & X s< C) -> X-(INT_MIN+1) u< (C-(INT_MIN+1)) + if (LHSC->isMinValue(true)) + return insertRangeTest(LHS0, LHSC->getValue() + 1, RHSC->getValue(), + true, true); + break; // (X != 13 & X s< 15) -> no change case ICmpInst::ICMP_NE: // Potential folds for this case should already be handled. break; @@ -1216,10 +1321,15 @@ Value *InstCombiner::foldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS, default: llvm_unreachable("Unknown integer condition code!"); case ICmpInst::ICMP_NE: - if (RHSC == AddOne(LHSC)) // (X u> 13 & X != 14) -> X u> 14 + // (X u> 13 & X != 14) -> X u> 14 + if (RHSC->getValue() == (LHSC->getValue() + 1)) return Builder.CreateICmp(PredL, LHS0, RHSC); + // X u> C & X != UINT_MAX -> (X-(C+1)) u< UINT_MAX-(C+1) + if (RHSC->isMaxValue(false)) + return insertRangeTest(LHS0, LHSC->getValue() + 1, RHSC->getValue(), + false, true); break; // (X u> 13 & X != 15) -> no change - case ICmpInst::ICMP_ULT: // (X u> 13 & X u< 15) -> (X-14) 13 & X u< 15) -> (X-14) u< 1 return insertRangeTest(LHS0, LHSC->getValue() + 1, RHSC->getValue(), false, true); } @@ -1229,10 +1339,15 @@ Value *InstCombiner::foldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS, default: llvm_unreachable("Unknown integer condition code!"); case ICmpInst::ICMP_NE: - if (RHSC == AddOne(LHSC)) // (X s> 13 & X != 14) -> X s> 14 + // (X s> 13 & X != 14) -> X s> 14 + if (RHSC->getValue() == (LHSC->getValue() + 1)) return Builder.CreateICmp(PredL, LHS0, RHSC); + // X s> C & X != INT_MAX -> (X-(C+1)) u< INT_MAX-(C+1) + if (RHSC->isMaxValue(true)) + return insertRangeTest(LHS0, LHSC->getValue() + 1, RHSC->getValue(), + true, true); break; // (X s> 13 & X != 15) -> no change - case ICmpInst::ICMP_SLT: // (X s> 13 & X s< 15) -> (X-14) s< 1 + case ICmpInst::ICMP_SLT: // (X s> 13 & X s< 15) -> (X-14) u< 1 return insertRangeTest(LHS0, LHSC->getValue() + 1, RHSC->getValue(), true, true); } @@ -1352,8 +1467,8 @@ static Instruction *matchDeMorgansLaws(BinaryOperator &I, Value *A, *B; if (match(I.getOperand(0), m_OneUse(m_Not(m_Value(A)))) && match(I.getOperand(1), m_OneUse(m_Not(m_Value(B)))) && - !IsFreeToInvert(A, A->hasOneUse()) && - !IsFreeToInvert(B, B->hasOneUse())) { + !isFreeToInvert(A, A->hasOneUse()) && + !isFreeToInvert(B, B->hasOneUse())) { Value *AndOr = Builder.CreateBinOp(Opcode, A, B, I.getName() + ".demorgan"); return BinaryOperator::CreateNot(AndOr); } @@ -1770,13 +1885,13 @@ Instruction *InstCombiner::visitAnd(BinaryOperator &I) { // (A ^ B) & ((B ^ C) ^ A) -> (A ^ B) & ~C if (match(Op0, m_Xor(m_Value(A), m_Value(B)))) if (match(Op1, 
m_Xor(m_Xor(m_Specific(B), m_Value(C)), m_Specific(A)))) - if (Op1->hasOneUse() || IsFreeToInvert(C, C->hasOneUse())) + if (Op1->hasOneUse() || isFreeToInvert(C, C->hasOneUse())) return BinaryOperator::CreateAnd(Op0, Builder.CreateNot(C)); // ((A ^ C) ^ B) & (B ^ A) -> (B ^ A) & ~C if (match(Op0, m_Xor(m_Xor(m_Value(A), m_Value(C)), m_Value(B)))) if (match(Op1, m_Xor(m_Specific(B), m_Specific(A)))) - if (Op0->hasOneUse() || IsFreeToInvert(C, C->hasOneUse())) + if (Op0->hasOneUse() || isFreeToInvert(C, C->hasOneUse())) return BinaryOperator::CreateAnd(Op1, Builder.CreateNot(C)); // (A | B) & ((~A) ^ B) -> (A & B) @@ -1844,6 +1959,20 @@ Instruction *InstCombiner::visitAnd(BinaryOperator &I) { A->getType()->isIntOrIntVectorTy(1)) return SelectInst::Create(A, Op0, Constant::getNullValue(I.getType())); + // and(ashr(subNSW(Y, X), ScalarSizeInBits(Y)-1), X) --> X s> Y ? X : 0. + { + Value *X, *Y; + const APInt *ShAmt; + Type *Ty = I.getType(); + if (match(&I, m_c_And(m_OneUse(m_AShr(m_NSWSub(m_Value(Y), m_Value(X)), + m_APInt(ShAmt))), + m_Deferred(X))) && + *ShAmt == Ty->getScalarSizeInBits() - 1) { + Value *NewICmpInst = Builder.CreateICmpSGT(X, Y); + return SelectInst::Create(NewICmpInst, X, ConstantInt::getNullValue(Ty)); + } + } + return nullptr; } @@ -2057,6 +2186,8 @@ Value *InstCombiner::matchSelectFromAndOr(Value *A, Value *C, Value *B, /// Fold (icmp)|(icmp) if possible. Value *InstCombiner::foldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS, Instruction &CxtI) { + const SimplifyQuery Q = SQ.getWithInstruction(&CxtI); + // Fold (iszero(A & K1) | iszero(A & K2)) -> (A & (K1 | K2)) != (K1 | K2) // if K1 and K2 are a one-bit mask. if (Value *V = foldAndOrOfICmpsOfAndWithPow2(LHS, RHS, false, CxtI)) @@ -2182,6 +2313,13 @@ Value *InstCombiner::foldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS, if (Value *V = foldIsPowerOf2(LHS, RHS, false /* JoinedByAnd */, Builder)) return V; + if (Value *X = + foldUnsignedUnderflowCheck(LHS, RHS, /*IsAnd=*/false, Q, Builder)) + return X; + if (Value *X = + foldUnsignedUnderflowCheck(RHS, LHS, /*IsAnd=*/false, Q, Builder)) + return X; + // This only handles icmp of constants: (icmp1 A, C1) | (icmp2 B, C2). if (!LHSC || !RHSC) return nullptr; @@ -2251,8 +2389,19 @@ Value *InstCombiner::foldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS, case ICmpInst::ICMP_EQ: // Potential folds for this case should already be handled. 
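The new 'and' fold above, and(ashr(subNSW(Y, X), ScalarSizeInBits(Y)-1), X) --> X s> Y ? X : 0, works because a non-overflowing Y - X is negative exactly when X s> Y, so the arithmetic shift by bitwidth-1 yields an all-ones mask in precisely that case; the sibling 'or' fold added later in this patch selects -1 instead of 0 under the same mask. A check over all non-overflowing 8-bit pairs (standalone C++, assumes arithmetic right shift for signed values):

#include <cassert>
#include <cstdint>

int main() {
  for (int x = -128; x < 128; ++x)
    for (int y = -128; y < 128; ++y) {
      int diff = y - x;
      if (diff < -128 || diff > 127)
        continue;                                   // the fold requires 'sub nsw'
      uint8_t mask = uint8_t(int8_t(diff) >> 7);    // 0xFF iff y - x is negative, i.e. x s> y
      // and(mask, x) --> x s> y ? x : 0
      assert(uint8_t(mask & uint8_t(x)) == uint8_t(x > y ? uint8_t(x) : 0));
      // or(mask, x)  --> x s> y ? -1 : x   (the companion fold)
      assert(uint8_t(mask | uint8_t(x)) == uint8_t(x > y ? 0xFF : uint8_t(x)));
    }
  return 0;
}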
break; - case ICmpInst::ICMP_UGT: // (X == 13 | X u> 14) -> no change - case ICmpInst::ICMP_SGT: // (X == 13 | X s> 14) -> no change + case ICmpInst::ICMP_UGT: + // (X == 0 || X u> C) -> (X-1) u>= C + if (LHSC->isMinValue(false)) + return insertRangeTest(LHS0, LHSC->getValue() + 1, RHSC->getValue() + 1, + false, false); + // (X == 13 | X u> 14) -> no change + break; + case ICmpInst::ICMP_SGT: + // (X == INT_MIN || X s> C) -> (X-(INT_MIN+1)) u>= C-INT_MIN + if (LHSC->isMinValue(true)) + return insertRangeTest(LHS0, LHSC->getValue() + 1, RHSC->getValue() + 1, + true, false); + // (X == 13 | X s> 14) -> no change break; } break; @@ -2261,6 +2410,10 @@ Value *InstCombiner::foldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS, default: llvm_unreachable("Unknown integer condition code!"); case ICmpInst::ICMP_EQ: // (X u< 13 | X == 14) -> no change + // (X u< C || X == UINT_MAX) => (X-C) u>= UINT_MAX-C + if (RHSC->isMaxValue(false)) + return insertRangeTest(LHS0, LHSC->getValue(), RHSC->getValue(), + false, false); break; case ICmpInst::ICMP_UGT: // (X u< 13 | X u> 15) -> (X-13) u> 2 assert(!RHSC->isMaxValue(false) && "Missed icmp simplification"); @@ -2272,9 +2425,14 @@ Value *InstCombiner::foldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS, switch (PredR) { default: llvm_unreachable("Unknown integer condition code!"); - case ICmpInst::ICMP_EQ: // (X s< 13 | X == 14) -> no change + case ICmpInst::ICMP_EQ: + // (X s< C || X == INT_MAX) => (X-C) u>= INT_MAX-C + if (RHSC->isMaxValue(true)) + return insertRangeTest(LHS0, LHSC->getValue(), RHSC->getValue(), + true, false); + // (X s< 13 | X == 14) -> no change break; - case ICmpInst::ICMP_SGT: // (X s< 13 | X s> 15) -> (X-13) s> 2 + case ICmpInst::ICMP_SGT: // (X s< 13 | X s> 15) -> (X-13) u> 2 assert(!RHSC->isMaxValue(true) && "Missed icmp simplification"); return insertRangeTest(LHS0, LHSC->getValue(), RHSC->getValue() + 1, true, false); @@ -2552,6 +2710,25 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) { } } + // or(ashr(subNSW(Y, X), ScalarSizeInBits(Y)-1), X) --> X s> Y ? -1 : X. + { + Value *X, *Y; + const APInt *ShAmt; + Type *Ty = I.getType(); + if (match(&I, m_c_Or(m_OneUse(m_AShr(m_NSWSub(m_Value(Y), m_Value(X)), + m_APInt(ShAmt))), + m_Deferred(X))) && + *ShAmt == Ty->getScalarSizeInBits() - 1) { + Value *NewICmpInst = Builder.CreateICmpSGT(X, Y); + return SelectInst::Create(NewICmpInst, ConstantInt::getAllOnesValue(Ty), + X); + } + } + + if (Instruction *V = + canonicalizeCondSignextOfHighBitExtractToSignextHighBitExtract(I)) + return V; + return nullptr; } @@ -2617,7 +2794,11 @@ static Instruction *foldXorToXor(BinaryOperator &I, return nullptr; } -Value *InstCombiner::foldXorOfICmps(ICmpInst *LHS, ICmpInst *RHS) { +Value *InstCombiner::foldXorOfICmps(ICmpInst *LHS, ICmpInst *RHS, + BinaryOperator &I) { + assert(I.getOpcode() == Instruction::Xor && I.getOperand(0) == LHS && + I.getOperand(1) == RHS && "Should be 'xor' with these operands"); + if (predicatesFoldable(LHS->getPredicate(), RHS->getPredicate())) { if (LHS->getOperand(0) == RHS->getOperand(1) && LHS->getOperand(1) == RHS->getOperand(0)) @@ -2672,14 +2853,35 @@ Value *InstCombiner::foldXorOfICmps(ICmpInst *LHS, ICmpInst *RHS) { // TODO: If OrICmp is false, the whole thing is false (InstSimplify?). if (Value *AndICmp = SimplifyBinOp(Instruction::And, LHS, RHS, SQ)) { // TODO: Independently handle cases where the 'and' side is a constant. 
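Several of the new or-of-icmp cases above funnel into insertRangeTest, turning a pair of comparisons into one subtract-and-compare. For instance "(X == 0 || X u> C) -> (X-1) u>= C": subtracting 1 rotates 0 to the top of the unsigned range, so both original cases collapse into a single interval test. The matching and-form from the earlier hunk, "(X != 0 && X u< C) -> (X-1) u< C-1", behaves the same way. Exhaustive 8-bit check (standalone C++):

#include <cassert>
#include <cstdint>

int main() {
  for (unsigned x = 0; x < 256; ++x)
    for (unsigned c = 0; c < 256; ++c) {
      uint8_t X = uint8_t(x), C = uint8_t(c);
      // (X == 0 || X u> C)  <=>  (X - 1) u>= C
      assert(((X == 0) || (X > C)) == (uint8_t(X - 1) >= C));
      // (X != 0 && X u< C)  <=>  (X - 1) u< C - 1   (for a nonzero constant C)
      if (C != 0)
        assert(((X != 0) && (X < C)) == (uint8_t(X - 1) < uint8_t(C - 1)));
    }
  return 0;
}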
- if (OrICmp == LHS && AndICmp == RHS && RHS->hasOneUse()) { - // (LHS | RHS) & !(LHS & RHS) --> LHS & !RHS - RHS->setPredicate(RHS->getInversePredicate()); - return Builder.CreateAnd(LHS, RHS); + ICmpInst *X = nullptr, *Y = nullptr; + if (OrICmp == LHS && AndICmp == RHS) { + // (LHS | RHS) & !(LHS & RHS) --> LHS & !RHS --> X & !Y + X = LHS; + Y = RHS; } - if (OrICmp == RHS && AndICmp == LHS && LHS->hasOneUse()) { - // !(LHS & RHS) & (LHS | RHS) --> !LHS & RHS - LHS->setPredicate(LHS->getInversePredicate()); + if (OrICmp == RHS && AndICmp == LHS) { + // !(LHS & RHS) & (LHS | RHS) --> !LHS & RHS --> !Y & X + X = RHS; + Y = LHS; + } + if (X && Y && (Y->hasOneUse() || canFreelyInvertAllUsersOf(Y, &I))) { + // Invert the predicate of 'Y', thus inverting its output. + Y->setPredicate(Y->getInversePredicate()); + // So, are there other uses of Y? + if (!Y->hasOneUse()) { + // We need to adapt other uses of Y though. Get a value that matches + // the original value of Y before inversion. While this increases + // immediate instruction count, we have just ensured that all the + // users are freely-invertible, so that 'not' *will* get folded away. + BuilderTy::InsertPointGuard Guard(Builder); + // Set insertion point to right after the Y. + Builder.SetInsertPoint(Y->getParent(), ++(Y->getIterator())); + Value *NotY = Builder.CreateNot(Y, Y->getName() + ".not"); + // Replace all uses of Y (excluding the one in NotY!) with NotY. + Y->replaceUsesWithIf(NotY, + [NotY](Use &U) { return U.getUser() != NotY; }); + } + // All done. return Builder.CreateAnd(LHS, RHS); } } @@ -2747,9 +2949,9 @@ static Instruction *sinkNotIntoXor(BinaryOperator &I, return nullptr; // We only want to do the transform if it is free to do. - if (IsFreeToInvert(X, X->hasOneUse())) { + if (isFreeToInvert(X, X->hasOneUse())) { // Ok, good. - } else if (IsFreeToInvert(Y, Y->hasOneUse())) { + } else if (isFreeToInvert(Y, Y->hasOneUse())) { std::swap(X, Y); } else return nullptr; @@ -2827,9 +3029,9 @@ Instruction *InstCombiner::visitXor(BinaryOperator &I) { // Apply DeMorgan's Law when inverts are free: // ~(X & Y) --> (~X | ~Y) // ~(X | Y) --> (~X & ~Y) - if (IsFreeToInvert(NotVal->getOperand(0), + if (isFreeToInvert(NotVal->getOperand(0), NotVal->getOperand(0)->hasOneUse()) && - IsFreeToInvert(NotVal->getOperand(1), + isFreeToInvert(NotVal->getOperand(1), NotVal->getOperand(1)->hasOneUse())) { Value *NotX = Builder.CreateNot(NotVal->getOperand(0), "notlhs"); Value *NotY = Builder.CreateNot(NotVal->getOperand(1), "notrhs"); @@ -3004,7 +3206,7 @@ Instruction *InstCombiner::visitXor(BinaryOperator &I) { if (auto *LHS = dyn_cast(I.getOperand(0))) if (auto *RHS = dyn_cast(I.getOperand(1))) - if (Value *V = foldXorOfICmps(LHS, RHS)) + if (Value *V = foldXorOfICmps(LHS, RHS, I)) return replaceInstUsesWith(I, V); if (Instruction *CastedXor = foldCastedBitwiseLogic(I)) @@ -3052,7 +3254,7 @@ Instruction *InstCombiner::visitXor(BinaryOperator &I) { if (SelectPatternResult::isMinOrMax(SPF)) { // It's possible we get here before the not has been simplified, so make // sure the input to the not isn't freely invertible. 
- if (match(LHS, m_Not(m_Value(X))) && !IsFreeToInvert(X, X->hasOneUse())) { + if (match(LHS, m_Not(m_Value(X))) && !isFreeToInvert(X, X->hasOneUse())) { Value *NotY = Builder.CreateNot(RHS); return SelectInst::Create( Builder.CreateICmp(getInverseMinMaxPred(SPF), X, NotY), X, NotY); @@ -3060,7 +3262,7 @@ Instruction *InstCombiner::visitXor(BinaryOperator &I) { // It's possible we get here before the not has been simplified, so make // sure the input to the not isn't freely invertible. - if (match(RHS, m_Not(m_Value(Y))) && !IsFreeToInvert(Y, Y->hasOneUse())) { + if (match(RHS, m_Not(m_Value(Y))) && !isFreeToInvert(Y, Y->hasOneUse())) { Value *NotX = Builder.CreateNot(LHS); return SelectInst::Create( Builder.CreateICmp(getInverseMinMaxPred(SPF), NotX, Y), NotX, Y); @@ -3068,8 +3270,8 @@ Instruction *InstCombiner::visitXor(BinaryOperator &I) { // If both sides are freely invertible, then we can get rid of the xor // completely. - if (IsFreeToInvert(LHS, !LHS->hasNUsesOrMore(3)) && - IsFreeToInvert(RHS, !RHS->hasNUsesOrMore(3))) { + if (isFreeToInvert(LHS, !LHS->hasNUsesOrMore(3)) && + isFreeToInvert(RHS, !RHS->hasNUsesOrMore(3))) { Value *NotLHS = Builder.CreateNot(LHS); Value *NotRHS = Builder.CreateNot(RHS); return SelectInst::Create( diff --git a/lib/Transforms/InstCombine/InstCombineAtomicRMW.cpp b/lib/Transforms/InstCombine/InstCombineAtomicRMW.cpp index 5f37a00f56cf..825f4b468b0a 100644 --- a/lib/Transforms/InstCombine/InstCombineAtomicRMW.cpp +++ b/lib/Transforms/InstCombine/InstCombineAtomicRMW.cpp @@ -124,7 +124,7 @@ Instruction *InstCombiner::visitAtomicRMWInst(AtomicRMWInst &RMWI) { auto *SI = new StoreInst(RMWI.getValOperand(), RMWI.getPointerOperand(), &RMWI); SI->setAtomic(Ordering, RMWI.getSyncScopeID()); - SI->setAlignment(DL.getABITypeAlignment(RMWI.getType())); + SI->setAlignment(MaybeAlign(DL.getABITypeAlignment(RMWI.getType()))); return eraseInstFromFunction(RMWI); } @@ -154,6 +154,6 @@ Instruction *InstCombiner::visitAtomicRMWInst(AtomicRMWInst &RMWI) { LoadInst *Load = new LoadInst(RMWI.getType(), RMWI.getPointerOperand()); Load->setAtomic(Ordering, RMWI.getSyncScopeID()); - Load->setAlignment(DL.getABITypeAlignment(RMWI.getType())); + Load->setAlignment(MaybeAlign(DL.getABITypeAlignment(RMWI.getType()))); return Load; } diff --git a/lib/Transforms/InstCombine/InstCombineCalls.cpp b/lib/Transforms/InstCombine/InstCombineCalls.cpp index 4b3333affa72..c650d242cd50 100644 --- a/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -185,7 +185,8 @@ Instruction *InstCombiner::SimplifyAnyMemTransfer(AnyMemTransferInst *MI) { Value *Dest = Builder.CreateBitCast(MI->getArgOperand(0), NewDstPtrTy); LoadInst *L = Builder.CreateLoad(IntType, Src); // Alignment from the mem intrinsic will be better, so use it. - L->setAlignment(CopySrcAlign); + L->setAlignment( + MaybeAlign(CopySrcAlign)); // FIXME: Check if we can use Align instead. if (CopyMD) L->setMetadata(LLVMContext::MD_tbaa, CopyMD); MDNode *LoopMemParallelMD = @@ -198,7 +199,8 @@ Instruction *InstCombiner::SimplifyAnyMemTransfer(AnyMemTransferInst *MI) { StoreInst *S = Builder.CreateStore(L, Dest); // Alignment from the mem intrinsic will be better, so use it. - S->setAlignment(CopyDstAlign); + S->setAlignment( + MaybeAlign(CopyDstAlign)); // FIXME: Check if we can use Align instead. 
if (CopyMD) S->setMetadata(LLVMContext::MD_tbaa, CopyMD); if (LoopMemParallelMD) @@ -223,9 +225,10 @@ Instruction *InstCombiner::SimplifyAnyMemTransfer(AnyMemTransferInst *MI) { } Instruction *InstCombiner::SimplifyAnyMemSet(AnyMemSetInst *MI) { - unsigned Alignment = getKnownAlignment(MI->getDest(), DL, MI, &AC, &DT); - if (MI->getDestAlignment() < Alignment) { - MI->setDestAlignment(Alignment); + const unsigned KnownAlignment = + getKnownAlignment(MI->getDest(), DL, MI, &AC, &DT); + if (MI->getDestAlignment() < KnownAlignment) { + MI->setDestAlignment(KnownAlignment); return MI; } @@ -243,13 +246,9 @@ Instruction *InstCombiner::SimplifyAnyMemSet(AnyMemSetInst *MI) { ConstantInt *FillC = dyn_cast(MI->getValue()); if (!LenC || !FillC || !FillC->getType()->isIntegerTy(8)) return nullptr; - uint64_t Len = LenC->getLimitedValue(); - Alignment = MI->getDestAlignment(); + const uint64_t Len = LenC->getLimitedValue(); assert(Len && "0-sized memory setting should be removed already."); - - // Alignment 0 is identity for alignment 1 for memset, but not store. - if (Alignment == 0) - Alignment = 1; + const Align Alignment = assumeAligned(MI->getDestAlignment()); // If it is an atomic and alignment is less than the size then we will // introduce the unaligned memory access which will be later transformed @@ -1060,9 +1059,9 @@ Value *InstCombiner::simplifyMaskedLoad(IntrinsicInst &II) { // If we can unconditionally load from this address, replace with a // load/select idiom. TODO: use DT for context sensitive query - if (isDereferenceableAndAlignedPointer(LoadPtr, II.getType(), Alignment, - II.getModule()->getDataLayout(), - &II, nullptr)) { + if (isDereferenceableAndAlignedPointer( + LoadPtr, II.getType(), MaybeAlign(Alignment), + II.getModule()->getDataLayout(), &II, nullptr)) { Value *LI = Builder.CreateAlignedLoad(II.getType(), LoadPtr, Alignment, "unmaskedload"); return Builder.CreateSelect(II.getArgOperand(2), LI, II.getArgOperand(3)); @@ -1086,7 +1085,8 @@ Instruction *InstCombiner::simplifyMaskedStore(IntrinsicInst &II) { // If the mask is all ones, this is a plain vector store of the 1st argument. if (ConstMask->isAllOnesValue()) { Value *StorePtr = II.getArgOperand(1); - unsigned Alignment = cast(II.getArgOperand(2))->getZExtValue(); + MaybeAlign Alignment( + cast(II.getArgOperand(2))->getZExtValue()); return new StoreInst(II.getArgOperand(0), StorePtr, false, Alignment); } @@ -2234,6 +2234,15 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { return replaceInstUsesWith(*II, Add); } + // Try to simplify the underlying FMul. + if (Value *V = SimplifyFMulInst(II->getArgOperand(0), II->getArgOperand(1), + II->getFastMathFlags(), + SQ.getWithInstruction(II))) { + auto *FAdd = BinaryOperator::CreateFAdd(V, II->getArgOperand(2)); + FAdd->copyFastMathFlags(II); + return FAdd; + } + LLVM_FALLTHROUGH; } case Intrinsic::fma: { @@ -2258,9 +2267,12 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { return II; } - // fma x, 1, z -> fadd x, z - if (match(Src1, m_FPOne())) { - auto *FAdd = BinaryOperator::CreateFAdd(Src0, II->getArgOperand(2)); + // Try to simplify the underlying FMul. We can only apply simplifications + // that do not require rounding. 
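The fmuladd change above, and the fma case that continues immediately below, let the intrinsics first try to simplify their multiply via SimplifyFMulInst / SimplifyFMAFMul instead of only special-casing a constant 1.0 operand; per the comment, only simplifications that introduce no extra rounding are legal. As a concrete instance of why that restriction suffices, x*1.0 is exact, so fma(x, 1.0, z) and x + z perform the same single rounding. A small standalone illustration using std::fma (not the LLVM intrinsic):

#include <cassert>
#include <cmath>

int main() {
  const double vals[] = {0.0, 1.5, -2.25, 1e308, 3.141592653589793};
  for (double x : vals)
    for (double z : vals)
      // x*1.0 needs no rounding, so both sides are round(x + z) and compare bit-equal.
      assert(std::fma(x, 1.0, z) == x + z);
  return 0;
}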
+ if (Value *V = SimplifyFMAFMul(II->getArgOperand(0), II->getArgOperand(1), + II->getFastMathFlags(), + SQ.getWithInstruction(II))) { + auto *FAdd = BinaryOperator::CreateFAdd(V, II->getArgOperand(2)); FAdd->copyFastMathFlags(II); return FAdd; } @@ -2331,7 +2343,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { // Turn PPC VSX loads into normal loads. Value *Ptr = Builder.CreateBitCast(II->getArgOperand(0), PointerType::getUnqual(II->getType())); - return new LoadInst(II->getType(), Ptr, Twine(""), false, 1); + return new LoadInst(II->getType(), Ptr, Twine(""), false, Align::None()); } case Intrinsic::ppc_altivec_stvx: case Intrinsic::ppc_altivec_stvxl: @@ -2349,7 +2361,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { // Turn PPC VSX stores into normal stores. Type *OpPtrTy = PointerType::getUnqual(II->getArgOperand(0)->getType()); Value *Ptr = Builder.CreateBitCast(II->getArgOperand(1), OpPtrTy); - return new StoreInst(II->getArgOperand(0), Ptr, false, 1); + return new StoreInst(II->getArgOperand(0), Ptr, false, Align::None()); } case Intrinsic::ppc_qpx_qvlfs: // Turn PPC QPX qvlfs -> load if the pointer is known aligned. @@ -3885,6 +3897,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { // Asan needs to poison memory to detect invalid access which is possible // even for empty lifetime range. if (II->getFunction()->hasFnAttribute(Attribute::SanitizeAddress) || + II->getFunction()->hasFnAttribute(Attribute::SanitizeMemory) || II->getFunction()->hasFnAttribute(Attribute::SanitizeHWAddress)) break; @@ -3950,10 +3963,21 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { break; } case Intrinsic::experimental_gc_relocate: { + auto &GCR = *cast(II); + + // If we have two copies of the same pointer in the statepoint argument + // list, canonicalize to one. This may let us common gc.relocates. + if (GCR.getBasePtr() == GCR.getDerivedPtr() && + GCR.getBasePtrIndex() != GCR.getDerivedPtrIndex()) { + auto *OpIntTy = GCR.getOperand(2)->getType(); + II->setOperand(2, ConstantInt::get(OpIntTy, GCR.getBasePtrIndex())); + return II; + } + // Translate facts known about a pointer before relocating into // facts about the relocate value, while being careful to // preserve relocation semantics. - Value *DerivedPtr = cast(II)->getDerivedPtr(); + Value *DerivedPtr = GCR.getDerivedPtr(); // Remove the relocation if unused, note that this check is required // to prevent the cases below from looping forever. @@ -4177,10 +4201,58 @@ static IntrinsicInst *findInitTrampoline(Value *Callee) { return nullptr; } +static void annotateAnyAllocSite(CallBase &Call, const TargetLibraryInfo *TLI) { + unsigned NumArgs = Call.getNumArgOperands(); + ConstantInt *Op0C = dyn_cast(Call.getOperand(0)); + ConstantInt *Op1C = + (NumArgs == 1) ? nullptr : dyn_cast(Call.getOperand(1)); + // Bail out if the allocation size is zero. 
+ if ((Op0C && Op0C->isNullValue()) || (Op1C && Op1C->isNullValue())) + return; + + if (isMallocLikeFn(&Call, TLI) && Op0C) { + if (isOpNewLikeFn(&Call, TLI)) + Call.addAttribute(AttributeList::ReturnIndex, + Attribute::getWithDereferenceableBytes( + Call.getContext(), Op0C->getZExtValue())); + else + Call.addAttribute(AttributeList::ReturnIndex, + Attribute::getWithDereferenceableOrNullBytes( + Call.getContext(), Op0C->getZExtValue())); + } else if (isReallocLikeFn(&Call, TLI) && Op1C) { + Call.addAttribute(AttributeList::ReturnIndex, + Attribute::getWithDereferenceableOrNullBytes( + Call.getContext(), Op1C->getZExtValue())); + } else if (isCallocLikeFn(&Call, TLI) && Op0C && Op1C) { + bool Overflow; + const APInt &N = Op0C->getValue(); + APInt Size = N.umul_ov(Op1C->getValue(), Overflow); + if (!Overflow) + Call.addAttribute(AttributeList::ReturnIndex, + Attribute::getWithDereferenceableOrNullBytes( + Call.getContext(), Size.getZExtValue())); + } else if (isStrdupLikeFn(&Call, TLI)) { + uint64_t Len = GetStringLength(Call.getOperand(0)); + if (Len) { + // strdup + if (NumArgs == 1) + Call.addAttribute(AttributeList::ReturnIndex, + Attribute::getWithDereferenceableOrNullBytes( + Call.getContext(), Len)); + // strndup + else if (NumArgs == 2 && Op1C) + Call.addAttribute( + AttributeList::ReturnIndex, + Attribute::getWithDereferenceableOrNullBytes( + Call.getContext(), std::min(Len, Op1C->getZExtValue() + 1))); + } + } +} + /// Improvements for call, callbr and invoke instructions. Instruction *InstCombiner::visitCallBase(CallBase &Call) { - if (isAllocLikeFn(&Call, &TLI)) - return visitAllocSite(Call); + if (isAllocationFn(&Call, &TLI)) + annotateAnyAllocSite(Call, &TLI); bool Changed = false; @@ -4312,6 +4384,9 @@ Instruction *InstCombiner::visitCallBase(CallBase &Call) { if (I) return eraseInstFromFunction(*I); } + if (isAllocLikeFn(&Call, &TLI)) + return visitAllocSite(Call); + return Changed ? &Call : nullptr; } diff --git a/lib/Transforms/InstCombine/InstCombineCasts.cpp b/lib/Transforms/InstCombine/InstCombineCasts.cpp index 2c9ba203fbf3..65aaef28d87a 100644 --- a/lib/Transforms/InstCombine/InstCombineCasts.cpp +++ b/lib/Transforms/InstCombine/InstCombineCasts.cpp @@ -140,7 +140,7 @@ Instruction *InstCombiner::PromoteCastOfAllocation(BitCastInst &CI, } AllocaInst *New = AllocaBuilder.CreateAlloca(CastElTy, Amt); - New->setAlignment(AI.getAlignment()); + New->setAlignment(MaybeAlign(AI.getAlignment())); New->takeName(&AI); New->setUsedWithInAlloca(AI.isUsedWithInAlloca()); @@ -1531,16 +1531,16 @@ Instruction *InstCombiner::visitFPTrunc(FPTruncInst &FPT) { // what we can and cannot do safely varies from operation to operation, and // is explained below in the various case statements. 
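annotateAnyAllocSite, added above, attaches dereferenceable / dereferenceable_or_null byte counts to recognized allocation calls; for calloc-like calls it multiplies the two size operands with APInt::umul_ov and only annotates when the product does not overflow. The same overflow-guarded multiply in portable C++ (checkedAllocBytes is an illustrative helper, not LLVM code):

#include <cassert>
#include <cstdint>
#include <limits>

// Returns true and sets Bytes when Count * Size fits in 64 bits, mirroring the
// umul_ov guard used before annotating a calloc-like call.
static bool checkedAllocBytes(uint64_t Count, uint64_t Size, uint64_t &Bytes) {
  if (Size != 0 && Count > std::numeric_limits<uint64_t>::max() / Size)
    return false;               // product would overflow: do not annotate
  Bytes = Count * Size;
  return true;
}

int main() {
  uint64_t B = 0;
  assert(checkedAllocBytes(10, 16, B) && B == 160);
  assert(!checkedAllocBytes(uint64_t(1) << 33, uint64_t(1) << 33, B));
  return 0;
}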
Type *Ty = FPT.getType(); - BinaryOperator *OpI = dyn_cast(FPT.getOperand(0)); - if (OpI && OpI->hasOneUse()) { - Type *LHSMinType = getMinimumFPType(OpI->getOperand(0)); - Type *RHSMinType = getMinimumFPType(OpI->getOperand(1)); - unsigned OpWidth = OpI->getType()->getFPMantissaWidth(); + auto *BO = dyn_cast(FPT.getOperand(0)); + if (BO && BO->hasOneUse()) { + Type *LHSMinType = getMinimumFPType(BO->getOperand(0)); + Type *RHSMinType = getMinimumFPType(BO->getOperand(1)); + unsigned OpWidth = BO->getType()->getFPMantissaWidth(); unsigned LHSWidth = LHSMinType->getFPMantissaWidth(); unsigned RHSWidth = RHSMinType->getFPMantissaWidth(); unsigned SrcWidth = std::max(LHSWidth, RHSWidth); unsigned DstWidth = Ty->getFPMantissaWidth(); - switch (OpI->getOpcode()) { + switch (BO->getOpcode()) { default: break; case Instruction::FAdd: case Instruction::FSub: @@ -1563,10 +1563,10 @@ Instruction *InstCombiner::visitFPTrunc(FPTruncInst &FPT) { // could be tightened for those cases, but they are rare (the main // case of interest here is (float)((double)float + float)). if (OpWidth >= 2*DstWidth+1 && DstWidth >= SrcWidth) { - Value *LHS = Builder.CreateFPTrunc(OpI->getOperand(0), Ty); - Value *RHS = Builder.CreateFPTrunc(OpI->getOperand(1), Ty); - Instruction *RI = BinaryOperator::Create(OpI->getOpcode(), LHS, RHS); - RI->copyFastMathFlags(OpI); + Value *LHS = Builder.CreateFPTrunc(BO->getOperand(0), Ty); + Value *RHS = Builder.CreateFPTrunc(BO->getOperand(1), Ty); + Instruction *RI = BinaryOperator::Create(BO->getOpcode(), LHS, RHS); + RI->copyFastMathFlags(BO); return RI; } break; @@ -1577,9 +1577,9 @@ Instruction *InstCombiner::visitFPTrunc(FPTruncInst &FPT) { // rounding can possibly occur; we can safely perform the operation // in the destination format if it can represent both sources. if (OpWidth >= LHSWidth + RHSWidth && DstWidth >= SrcWidth) { - Value *LHS = Builder.CreateFPTrunc(OpI->getOperand(0), Ty); - Value *RHS = Builder.CreateFPTrunc(OpI->getOperand(1), Ty); - return BinaryOperator::CreateFMulFMF(LHS, RHS, OpI); + Value *LHS = Builder.CreateFPTrunc(BO->getOperand(0), Ty); + Value *RHS = Builder.CreateFPTrunc(BO->getOperand(1), Ty); + return BinaryOperator::CreateFMulFMF(LHS, RHS, BO); } break; case Instruction::FDiv: @@ -1590,9 +1590,9 @@ Instruction *InstCombiner::visitFPTrunc(FPTruncInst &FPT) { // condition used here is a good conservative first pass. // TODO: Tighten bound via rigorous analysis of the unbalanced case. 
if (OpWidth >= 2*DstWidth && DstWidth >= SrcWidth) { - Value *LHS = Builder.CreateFPTrunc(OpI->getOperand(0), Ty); - Value *RHS = Builder.CreateFPTrunc(OpI->getOperand(1), Ty); - return BinaryOperator::CreateFDivFMF(LHS, RHS, OpI); + Value *LHS = Builder.CreateFPTrunc(BO->getOperand(0), Ty); + Value *RHS = Builder.CreateFPTrunc(BO->getOperand(1), Ty); + return BinaryOperator::CreateFDivFMF(LHS, RHS, BO); } break; case Instruction::FRem: { @@ -1604,14 +1604,14 @@ Instruction *InstCombiner::visitFPTrunc(FPTruncInst &FPT) { break; Value *LHS, *RHS; if (LHSWidth == SrcWidth) { - LHS = Builder.CreateFPTrunc(OpI->getOperand(0), LHSMinType); - RHS = Builder.CreateFPTrunc(OpI->getOperand(1), LHSMinType); + LHS = Builder.CreateFPTrunc(BO->getOperand(0), LHSMinType); + RHS = Builder.CreateFPTrunc(BO->getOperand(1), LHSMinType); } else { - LHS = Builder.CreateFPTrunc(OpI->getOperand(0), RHSMinType); - RHS = Builder.CreateFPTrunc(OpI->getOperand(1), RHSMinType); + LHS = Builder.CreateFPTrunc(BO->getOperand(0), RHSMinType); + RHS = Builder.CreateFPTrunc(BO->getOperand(1), RHSMinType); } - Value *ExactResult = Builder.CreateFRemFMF(LHS, RHS, OpI); + Value *ExactResult = Builder.CreateFRemFMF(LHS, RHS, BO); return CastInst::CreateFPCast(ExactResult, Ty); } } @@ -2338,8 +2338,23 @@ Instruction *InstCombiner::visitBitCast(BitCastInst &CI) { // If we found a path from the src to dest, create the getelementptr now. if (SrcElTy == DstElTy) { SmallVector Idxs(NumZeros + 1, Builder.getInt32(0)); - return GetElementPtrInst::CreateInBounds(SrcPTy->getElementType(), Src, - Idxs); + GetElementPtrInst *GEP = + GetElementPtrInst::Create(SrcPTy->getElementType(), Src, Idxs); + + // If the source pointer is dereferenceable, then assume it points to an + // allocated object and apply "inbounds" to the GEP. + bool CanBeNull; + if (Src->getPointerDereferenceableBytes(DL, CanBeNull)) { + // In a non-default address space (not 0), a null pointer can not be + // assumed inbounds, so ignore that case (dereferenceable_or_null). + // The reason is that 'null' is not treated differently in these address + // spaces, and we consequently ignore the 'gep inbounds' special case + // for 'null' which allows 'inbounds' on 'null' if the indices are + // zeros. + if (SrcPTy->getAddressSpace() == 0 || !CanBeNull) + GEP->setIsInBounds(); + } + return GEP; } } @@ -2391,28 +2406,47 @@ Instruction *InstCombiner::visitBitCast(BitCastInst &CI) { } } - if (ShuffleVectorInst *SVI = dyn_cast(Src)) { + if (auto *Shuf = dyn_cast(Src)) { // Okay, we have (bitcast (shuffle ..)). Check to see if this is // a bitcast to a vector with the same # elts. - if (SVI->hasOneUse() && DestTy->isVectorTy() && - DestTy->getVectorNumElements() == SVI->getType()->getNumElements() && - SVI->getType()->getNumElements() == - SVI->getOperand(0)->getType()->getVectorNumElements()) { + Value *ShufOp0 = Shuf->getOperand(0); + Value *ShufOp1 = Shuf->getOperand(1); + unsigned NumShufElts = Shuf->getType()->getVectorNumElements(); + unsigned NumSrcVecElts = ShufOp0->getType()->getVectorNumElements(); + if (Shuf->hasOneUse() && DestTy->isVectorTy() && + DestTy->getVectorNumElements() == NumShufElts && + NumShufElts == NumSrcVecElts) { BitCastInst *Tmp; // If either of the operands is a cast from CI.getType(), then // evaluating the shuffle in the casted destination's type will allow // us to eliminate at least one cast. 
- if (((Tmp = dyn_cast(SVI->getOperand(0))) && + if (((Tmp = dyn_cast(ShufOp0)) && Tmp->getOperand(0)->getType() == DestTy) || - ((Tmp = dyn_cast(SVI->getOperand(1))) && + ((Tmp = dyn_cast(ShufOp1)) && Tmp->getOperand(0)->getType() == DestTy)) { - Value *LHS = Builder.CreateBitCast(SVI->getOperand(0), DestTy); - Value *RHS = Builder.CreateBitCast(SVI->getOperand(1), DestTy); + Value *LHS = Builder.CreateBitCast(ShufOp0, DestTy); + Value *RHS = Builder.CreateBitCast(ShufOp1, DestTy); // Return a new shuffle vector. Use the same element ID's, as we // know the vector types match #elts. - return new ShuffleVectorInst(LHS, RHS, SVI->getOperand(2)); + return new ShuffleVectorInst(LHS, RHS, Shuf->getOperand(2)); } } + + // A bitcasted-to-scalar and byte-reversing shuffle is better recognized as + // a byte-swap: + // bitcast (shuf X, undef, ) --> bswap (bitcast X) + // TODO: We should match the related pattern for bitreverse. + if (DestTy->isIntegerTy() && + DL.isLegalInteger(DestTy->getScalarSizeInBits()) && + SrcTy->getScalarSizeInBits() == 8 && NumShufElts % 2 == 0 && + Shuf->hasOneUse() && Shuf->isReverse()) { + assert(ShufOp0->getType() == SrcTy && "Unexpected shuffle mask"); + assert(isa(ShufOp1) && "Unexpected shuffle op"); + Function *Bswap = + Intrinsic::getDeclaration(CI.getModule(), Intrinsic::bswap, DestTy); + Value *ScalarX = Builder.CreateBitCast(ShufOp0, DestTy); + return IntrinsicInst::Create(Bswap, { ScalarX }); + } } // Handle the A->B->A cast, and there is an intervening PHI node. diff --git a/lib/Transforms/InstCombine/InstCombineCompares.cpp b/lib/Transforms/InstCombine/InstCombineCompares.cpp index 3a4283ae5406..a9f64feb600c 100644 --- a/lib/Transforms/InstCombine/InstCombineCompares.cpp +++ b/lib/Transforms/InstCombine/InstCombineCompares.cpp @@ -69,34 +69,6 @@ static bool hasBranchUse(ICmpInst &I) { return false; } -/// Given an exploded icmp instruction, return true if the comparison only -/// checks the sign bit. If it only checks the sign bit, set TrueIfSigned if the -/// result of the comparison is true when the input value is signed. -static bool isSignBitCheck(ICmpInst::Predicate Pred, const APInt &RHS, - bool &TrueIfSigned) { - switch (Pred) { - case ICmpInst::ICMP_SLT: // True if LHS s< 0 - TrueIfSigned = true; - return RHS.isNullValue(); - case ICmpInst::ICMP_SLE: // True if LHS s<= RHS and RHS == -1 - TrueIfSigned = true; - return RHS.isAllOnesValue(); - case ICmpInst::ICMP_SGT: // True if LHS s> -1 - TrueIfSigned = false; - return RHS.isAllOnesValue(); - case ICmpInst::ICMP_UGT: - // True if LHS u> RHS and RHS == high-bit-mask - 1 - TrueIfSigned = true; - return RHS.isMaxSignedValue(); - case ICmpInst::ICMP_UGE: - // True if LHS u>= RHS and RHS == high-bit-mask (2^7, 2^15, 2^31, etc) - TrueIfSigned = true; - return RHS.isSignMask(); - default: - return false; - } -} - /// Returns true if the exploded icmp can be expressed as a signed comparison /// to zero and updates the predicate accordingly. /// The signedness of the comparison is preserved. @@ -832,6 +804,10 @@ getAsConstantIndexedAddress(Value *V, const DataLayout &DL) { static Instruction *transformToIndexedCompare(GEPOperator *GEPLHS, Value *RHS, ICmpInst::Predicate Cond, const DataLayout &DL) { + // FIXME: Support vector of pointers. 
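One of the bitcast folds above recognizes a byte-reversing vector shuffle whose result is immediately bitcast to a legal scalar integer and replaces it with a bswap of the bitcast source. The underlying equivalence, reversing the bytes in memory and then reinterpreting them equals byte-swapping the reinterpreted original, holds on both little- and big-endian layouts. A quick standalone check (uses the GCC/Clang __builtin_bswap32 builtin):

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstring>

int main() {
  uint8_t bytes[4] = {0x12, 0x34, 0x56, 0x78};

  uint32_t direct = 0;
  std::memcpy(&direct, bytes, 4);                  // bitcast <4 x i8> X to i32

  uint8_t reversed[4];
  std::reverse_copy(bytes, bytes + 4, reversed);   // shufflevector X, undef, <3,2,1,0>
  uint32_t shuffled = 0;
  std::memcpy(&shuffled, reversed, 4);             // bitcast of the shuffle

  assert(shuffled == __builtin_bswap32(direct));   // == bswap(bitcast X)
  return 0;
}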
+ if (GEPLHS->getType()->isVectorTy()) + return nullptr; + if (!GEPLHS->hasAllConstantIndices()) return nullptr; @@ -882,7 +858,9 @@ Instruction *InstCombiner::foldGEPICmp(GEPOperator *GEPLHS, Value *RHS, RHS = RHS->stripPointerCasts(); Value *PtrBase = GEPLHS->getOperand(0); - if (PtrBase == RHS && GEPLHS->isInBounds()) { + // FIXME: Support vector pointer GEPs. + if (PtrBase == RHS && GEPLHS->isInBounds() && + !GEPLHS->getType()->isVectorTy()) { // ((gep Ptr, OFFSET) cmp Ptr) ---> (OFFSET cmp 0). // This transformation (ignoring the base and scales) is valid because we // know pointers can't overflow since the gep is inbounds. See if we can @@ -894,6 +872,37 @@ Instruction *InstCombiner::foldGEPICmp(GEPOperator *GEPLHS, Value *RHS, Offset = EmitGEPOffset(GEPLHS); return new ICmpInst(ICmpInst::getSignedPredicate(Cond), Offset, Constant::getNullValue(Offset->getType())); + } + + if (GEPLHS->isInBounds() && ICmpInst::isEquality(Cond) && + isa(RHS) && cast(RHS)->isNullValue() && + !NullPointerIsDefined(I.getFunction(), + RHS->getType()->getPointerAddressSpace())) { + // For most address spaces, an allocation can't be placed at null, but null + // itself is treated as a 0 size allocation in the in bounds rules. Thus, + // the only valid inbounds address derived from null, is null itself. + // Thus, we have four cases to consider: + // 1) Base == nullptr, Offset == 0 -> inbounds, null + // 2) Base == nullptr, Offset != 0 -> poison as the result is out of bounds + // 3) Base != nullptr, Offset == (-base) -> poison (crossing allocations) + // 4) Base != nullptr, Offset != (-base) -> nonnull (and possibly poison) + // + // (Note if we're indexing a type of size 0, that simply collapses into one + // of the buckets above.) + // + // In general, we're allowed to make values less poison (i.e. remove + // sources of full UB), so in this case, we just select between the two + // non-poison cases (1 and 4 above). + // + // For vectors, we apply the same reasoning on a per-lane basis. + auto *Base = GEPLHS->getPointerOperand(); + if (GEPLHS->getType()->isVectorTy() && Base->getType()->isPointerTy()) { + int NumElts = GEPLHS->getType()->getVectorNumElements(); + Base = Builder.CreateVectorSplat(NumElts, Base); + } + return new ICmpInst(Cond, Base, + ConstantExpr::getPointerBitCastOrAddrSpaceCast( + cast(RHS), Base->getType())); } else if (GEPOperator *GEPRHS = dyn_cast(RHS)) { // If the base pointers are different, but the indices are the same, just // compare the base pointer. @@ -916,11 +925,13 @@ Instruction *InstCombiner::foldGEPICmp(GEPOperator *GEPLHS, Value *RHS, // If we're comparing GEPs with two base pointers that only differ in type // and both GEPs have only constant indices or just one use, then fold // the compare with the adjusted indices. + // FIXME: Support vector of pointers. if (GEPLHS->isInBounds() && GEPRHS->isInBounds() && (GEPLHS->hasAllConstantIndices() || GEPLHS->hasOneUse()) && (GEPRHS->hasAllConstantIndices() || GEPRHS->hasOneUse()) && PtrBase->stripPointerCasts() == - GEPRHS->getOperand(0)->stripPointerCasts()) { + GEPRHS->getOperand(0)->stripPointerCasts() && + !GEPLHS->getType()->isVectorTy()) { Value *LOffset = EmitGEPOffset(GEPLHS); Value *ROffset = EmitGEPOffset(GEPRHS); @@ -949,12 +960,14 @@ Instruction *InstCombiner::foldGEPICmp(GEPOperator *GEPLHS, Value *RHS, } // If one of the GEPs has all zero indices, recurse. - if (GEPLHS->hasAllZeroIndices()) + // FIXME: Handle vector of pointers. 
+ if (!GEPLHS->getType()->isVectorTy() && GEPLHS->hasAllZeroIndices()) return foldGEPICmp(GEPRHS, GEPLHS->getOperand(0), ICmpInst::getSwappedPredicate(Cond), I); // If the other GEP has all zero indices, recurse. - if (GEPRHS->hasAllZeroIndices()) + // FIXME: Handle vector of pointers. + if (!GEPRHS->getType()->isVectorTy() && GEPRHS->hasAllZeroIndices()) return foldGEPICmp(GEPLHS, GEPRHS->getOperand(0), Cond, I); bool GEPsInBounds = GEPLHS->isInBounds() && GEPRHS->isInBounds(); @@ -964,15 +977,20 @@ Instruction *InstCombiner::foldGEPICmp(GEPOperator *GEPLHS, Value *RHS, unsigned DiffOperand = 0; // The operand that differs. for (unsigned i = 1, e = GEPRHS->getNumOperands(); i != e; ++i) if (GEPLHS->getOperand(i) != GEPRHS->getOperand(i)) { - if (GEPLHS->getOperand(i)->getType()->getPrimitiveSizeInBits() != - GEPRHS->getOperand(i)->getType()->getPrimitiveSizeInBits()) { + Type *LHSType = GEPLHS->getOperand(i)->getType(); + Type *RHSType = GEPRHS->getOperand(i)->getType(); + // FIXME: Better support for vector of pointers. + if (LHSType->getPrimitiveSizeInBits() != + RHSType->getPrimitiveSizeInBits() || + (GEPLHS->getType()->isVectorTy() && + (!LHSType->isVectorTy() || !RHSType->isVectorTy()))) { // Irreconcilable differences. NumDifferences = 2; break; - } else { - if (NumDifferences++) break; - DiffOperand = i; } + + if (NumDifferences++) break; + DiffOperand = i; } if (NumDifferences == 0) // SAME GEP? @@ -1317,6 +1335,59 @@ static Instruction *processUGT_ADDCST_ADD(ICmpInst &I, Value *A, Value *B, return ExtractValueInst::Create(Call, 1, "sadd.overflow"); } +/// If we have: +/// icmp eq/ne (urem/srem %x, %y), 0 +/// iff %y is a power-of-two, we can replace this with a bit test: +/// icmp eq/ne (and %x, (add %y, -1)), 0 +Instruction *InstCombiner::foldIRemByPowerOfTwoToBitTest(ICmpInst &I) { + // This fold is only valid for equality predicates. + if (!I.isEquality()) + return nullptr; + ICmpInst::Predicate Pred; + Value *X, *Y, *Zero; + if (!match(&I, m_ICmp(Pred, m_OneUse(m_IRem(m_Value(X), m_Value(Y))), + m_CombineAnd(m_Zero(), m_Value(Zero))))) + return nullptr; + if (!isKnownToBeAPowerOfTwo(Y, /*OrZero*/ true, 0, &I)) + return nullptr; + // This may increase instruction count, we don't enforce that Y is a constant. + Value *Mask = Builder.CreateAdd(Y, Constant::getAllOnesValue(Y->getType())); + Value *Masked = Builder.CreateAnd(X, Mask); + return ICmpInst::Create(Instruction::ICmp, Pred, Masked, Zero); +} + +/// Fold equality-comparison between zero and any (maybe truncated) right-shift +/// by one-less-than-bitwidth into a sign test on the original value. +Instruction *InstCombiner::foldSignBitTest(ICmpInst &I) { + Instruction *Val; + ICmpInst::Predicate Pred; + if (!I.isEquality() || !match(&I, m_ICmp(Pred, m_Instruction(Val), m_Zero()))) + return nullptr; + + Value *X; + Type *XTy; + + Constant *C; + if (match(Val, m_TruncOrSelf(m_Shr(m_Value(X), m_Constant(C))))) { + XTy = X->getType(); + unsigned XBitWidth = XTy->getScalarSizeInBits(); + if (!match(C, m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_EQ, + APInt(XBitWidth, XBitWidth - 1)))) + return nullptr; + } else if (isa(Val) && + (X = reassociateShiftAmtsOfTwoSameDirectionShifts( + cast(Val), SQ.getWithInstruction(Val), + /*AnalyzeForSignBitExtraction=*/true))) { + XTy = X->getType(); + } else + return nullptr; + + return ICmpInst::Create(Instruction::ICmp, + Pred == ICmpInst::ICMP_EQ ? 
ICmpInst::ICMP_SGE + : ICmpInst::ICMP_SLT, + X, ConstantInt::getNullValue(XTy)); +} + // Handle icmp pred X, 0 Instruction *InstCombiner::foldICmpWithZero(ICmpInst &Cmp) { CmpInst::Predicate Pred = Cmp.getPredicate(); @@ -1335,6 +1406,9 @@ Instruction *InstCombiner::foldICmpWithZero(ICmpInst &Cmp) { } } + if (Instruction *New = foldIRemByPowerOfTwoToBitTest(Cmp)) + return New; + // Given: // icmp eq/ne (urem %x, %y), 0 // Iff %x has 0 or 1 bits set, and %y has at least 2 bits set, omit 'urem': @@ -2179,6 +2253,44 @@ Instruction *InstCombiner::foldICmpShrConstant(ICmpInst &Cmp, return nullptr; } +Instruction *InstCombiner::foldICmpSRemConstant(ICmpInst &Cmp, + BinaryOperator *SRem, + const APInt &C) { + // Match an 'is positive' or 'is negative' comparison of remainder by a + // constant power-of-2 value: + // (X % pow2C) sgt/slt 0 + const ICmpInst::Predicate Pred = Cmp.getPredicate(); + if (Pred != ICmpInst::ICMP_SGT && Pred != ICmpInst::ICMP_SLT) + return nullptr; + + // TODO: The one-use check is standard because we do not typically want to + // create longer instruction sequences, but this might be a special-case + // because srem is not good for analysis or codegen. + if (!SRem->hasOneUse()) + return nullptr; + + const APInt *DivisorC; + if (!C.isNullValue() || !match(SRem->getOperand(1), m_Power2(DivisorC))) + return nullptr; + + // Mask off the sign bit and the modulo bits (low-bits). + Type *Ty = SRem->getType(); + APInt SignMask = APInt::getSignMask(Ty->getScalarSizeInBits()); + Constant *MaskC = ConstantInt::get(Ty, SignMask | (*DivisorC - 1)); + Value *And = Builder.CreateAnd(SRem->getOperand(0), MaskC); + + // For 'is positive?' check that the sign-bit is clear and at least 1 masked + // bit is set. Example: + // (i8 X % 32) s> 0 --> (X & 159) s> 0 + if (Pred == ICmpInst::ICMP_SGT) + return new ICmpInst(ICmpInst::ICMP_SGT, And, ConstantInt::getNullValue(Ty)); + + // For 'is negative?' check that the sign-bit is set and at least 1 masked + // bit is set. Example: + // (i16 X % 4) s< 0 --> (X & 32771) u> 32768 + return new ICmpInst(ICmpInst::ICMP_UGT, And, ConstantInt::get(Ty, SignMask)); +} + /// Fold icmp (udiv X, Y), C. Instruction *InstCombiner::foldICmpUDivConstant(ICmpInst &Cmp, BinaryOperator *UDiv, @@ -2387,6 +2499,11 @@ Instruction *InstCombiner::foldICmpSubConstant(ICmpInst &Cmp, const APInt *C2; APInt SubResult; + // icmp eq/ne (sub C, Y), C -> icmp eq/ne Y, 0 + if (match(X, m_APInt(C2)) && *C2 == C && Cmp.isEquality()) + return new ICmpInst(Cmp.getPredicate(), Y, + ConstantInt::get(Y->getType(), 0)); + // (icmp P (sub nuw|nsw C2, Y), C) -> (icmp swap(P) Y, C2-C) if (match(X, m_APInt(C2)) && ((Cmp.isUnsigned() && Sub->hasNoUnsignedWrap()) || @@ -2509,20 +2626,49 @@ bool InstCombiner::matchThreeWayIntCompare(SelectInst *SI, Value *&LHS, // TODO: Generalize this to work with other comparison idioms or ensure // they get canonicalized into this form. - // select i1 (a == b), i32 Equal, i32 (select i1 (a < b), i32 Less, i32 - // Greater), where Equal, Less and Greater are placeholders for any three - // constants. 
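// Illustrative sketch, not part of the upstream diff: the two examples cited in
// foldICmpSRemConstant above can be verified exhaustively for i8. The masks are
// sign-bit | (divisor - 1), e.g. 159 = 0x80 | 31 and 131 = 0x80 | 3; the sketch
// assumes two's-complement int8_t.
#include <cassert>
#include <cstdint>

int main() {
  for (int x = -128; x <= 127; ++x) {
    // (i8 X % 32) s> 0  -->  (X & 159) s> 0
    assert((static_cast<int8_t>(x) % 32 > 0) ==
           (static_cast<int8_t>(x & 159) > 0));
    // (i8 X % 4) s< 0  -->  (X & 131) u> 128
    assert((static_cast<int8_t>(x) % 4 < 0) ==
           ((static_cast<uint8_t>(x) & 131u) > 128u));
  }
  return 0;
}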
- ICmpInst::Predicate PredA, PredB; - if (match(SI->getTrueValue(), m_ConstantInt(Equal)) && - match(SI->getCondition(), m_ICmp(PredA, m_Value(LHS), m_Value(RHS))) && - PredA == ICmpInst::ICMP_EQ && - match(SI->getFalseValue(), - m_Select(m_ICmp(PredB, m_Specific(LHS), m_Specific(RHS)), - m_ConstantInt(Less), m_ConstantInt(Greater))) && - PredB == ICmpInst::ICMP_SLT) { - return true; + // select i1 (a == b), + // i32 Equal, + // i32 (select i1 (a < b), i32 Less, i32 Greater) + // where Equal, Less and Greater are placeholders for any three constants. + ICmpInst::Predicate PredA; + if (!match(SI->getCondition(), m_ICmp(PredA, m_Value(LHS), m_Value(RHS))) || + !ICmpInst::isEquality(PredA)) + return false; + Value *EqualVal = SI->getTrueValue(); + Value *UnequalVal = SI->getFalseValue(); + // We still can get non-canonical predicate here, so canonicalize. + if (PredA == ICmpInst::ICMP_NE) + std::swap(EqualVal, UnequalVal); + if (!match(EqualVal, m_ConstantInt(Equal))) + return false; + ICmpInst::Predicate PredB; + Value *LHS2, *RHS2; + if (!match(UnequalVal, m_Select(m_ICmp(PredB, m_Value(LHS2), m_Value(RHS2)), + m_ConstantInt(Less), m_ConstantInt(Greater)))) + return false; + // We can get predicate mismatch here, so canonicalize if possible: + // First, ensure that 'LHS' match. + if (LHS2 != LHS) { + // x sgt y <--> y slt x + std::swap(LHS2, RHS2); + PredB = ICmpInst::getSwappedPredicate(PredB); + } + if (LHS2 != LHS) + return false; + // We also need to canonicalize 'RHS'. + if (PredB == ICmpInst::ICMP_SGT && isa(RHS2)) { + // x sgt C-1 <--> x sge C <--> not(x slt C) + auto FlippedStrictness = + getFlippedStrictnessPredicateAndConstant(PredB, cast(RHS2)); + if (!FlippedStrictness) + return false; + assert(FlippedStrictness->first == ICmpInst::ICMP_SGE && "Sanity check"); + RHS2 = FlippedStrictness->second; + // And kind-of perform the result swap. + std::swap(Less, Greater); + PredB = ICmpInst::ICMP_SLT; } - return false; + return PredB == ICmpInst::ICMP_SLT && RHS == RHS2; } Instruction *InstCombiner::foldICmpSelectConstant(ICmpInst &Cmp, @@ -2702,6 +2848,10 @@ Instruction *InstCombiner::foldICmpInstWithConstant(ICmpInst &Cmp) { if (Instruction *I = foldICmpShrConstant(Cmp, BO, *C)) return I; break; + case Instruction::SRem: + if (Instruction *I = foldICmpSRemConstant(Cmp, BO, *C)) + return I; + break; case Instruction::UDiv: if (Instruction *I = foldICmpUDivConstant(Cmp, BO, *C)) return I; @@ -2926,6 +3076,28 @@ Instruction *InstCombiner::foldICmpEqIntrinsicWithConstant(ICmpInst &Cmp, } break; } + + case Intrinsic::uadd_sat: { + // uadd.sat(a, b) == 0 -> (a | b) == 0 + if (C.isNullValue()) { + Value *Or = Builder.CreateOr(II->getArgOperand(0), II->getArgOperand(1)); + return replaceInstUsesWith(Cmp, Builder.CreateICmp( + Cmp.getPredicate(), Or, Constant::getNullValue(Ty))); + + } + break; + } + + case Intrinsic::usub_sat: { + // usub.sat(a, b) == 0 -> a <= b + if (C.isNullValue()) { + ICmpInst::Predicate NewPred = Cmp.getPredicate() == ICmpInst::ICMP_EQ + ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_UGT; + return ICmpInst::Create(Instruction::ICmp, NewPred, + II->getArgOperand(0), II->getArgOperand(1)); + } + break; + } default: break; } @@ -3275,6 +3447,7 @@ foldICmpWithTruncSignExtendedVal(ICmpInst &I, // we should move shifts to the same hand of 'and', i.e. rewrite as // icmp eq/ne (and (x shift (Q+K)), y), 0 iff (Q+K) u< bitwidth(x) // We are only interested in opposite logical shifts here. +// One of the shifts can be truncated. // If we can, we want to end up creating 'lshr' shift. 
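// Illustrative sketch, not part of the upstream diff: the saturating-intrinsic
// equality folds added above (uadd.sat(a,b) == 0 --> (a|b) == 0 and
// usub.sat(a,b) == 0 --> a u<= b) checked over all i8 pairs with hand-rolled
// saturating helpers (names are local to this sketch).
#include <algorithm>
#include <cassert>
#include <cstdint>

static uint8_t uadd_sat8(uint8_t a, uint8_t b) {
  return static_cast<uint8_t>(std::min(unsigned(a) + unsigned(b), 255u));
}
static uint8_t usub_sat8(uint8_t a, uint8_t b) {
  return a > b ? static_cast<uint8_t>(a - b) : static_cast<uint8_t>(0);
}

int main() {
  for (unsigned a = 0; a <= 255; ++a)
    for (unsigned b = 0; b <= 255; ++b) {
      assert((uadd_sat8(uint8_t(a), uint8_t(b)) == 0) == ((a | b) == 0));
      assert((usub_sat8(uint8_t(a), uint8_t(b)) == 0) == (a <= b));
    }
  return 0;
}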
static Value * foldShiftIntoShiftInAnotherHandOfAndInICmp(ICmpInst &I, const SimplifyQuery SQ, @@ -3284,55 +3457,215 @@ foldShiftIntoShiftInAnotherHandOfAndInICmp(ICmpInst &I, const SimplifyQuery SQ, return nullptr; auto m_AnyLogicalShift = m_LogicalShift(m_Value(), m_Value()); - auto m_AnyLShr = m_LShr(m_Value(), m_Value()); - - // Look for an 'and' of two (opposite) logical shifts. - // Pick the single-use shift as XShift. - Value *XShift, *YShift; - if (!match(I.getOperand(0), - m_c_And(m_OneUse(m_CombineAnd(m_AnyLogicalShift, m_Value(XShift))), - m_CombineAnd(m_AnyLogicalShift, m_Value(YShift))))) + + // Look for an 'and' of two logical shifts, one of which may be truncated. + // We use m_TruncOrSelf() on the RHS to correctly handle commutative case. + Instruction *XShift, *MaybeTruncation, *YShift; + if (!match( + I.getOperand(0), + m_c_And(m_CombineAnd(m_AnyLogicalShift, m_Instruction(XShift)), + m_CombineAnd(m_TruncOrSelf(m_CombineAnd( + m_AnyLogicalShift, m_Instruction(YShift))), + m_Instruction(MaybeTruncation))))) return nullptr; - // If YShift is a single-use 'lshr', swap the shifts around. - if (match(YShift, m_OneUse(m_AnyLShr))) + // We potentially looked past 'trunc', but only when matching YShift, + // therefore YShift must have the widest type. + Instruction *WidestShift = YShift; + // Therefore XShift must have the shallowest type. + // Or they both have identical types if there was no truncation. + Instruction *NarrowestShift = XShift; + + Type *WidestTy = WidestShift->getType(); + assert(NarrowestShift->getType() == I.getOperand(0)->getType() && + "We did not look past any shifts while matching XShift though."); + bool HadTrunc = WidestTy != I.getOperand(0)->getType(); + + // If YShift is a 'lshr', swap the shifts around. + if (match(YShift, m_LShr(m_Value(), m_Value()))) std::swap(XShift, YShift); // The shifts must be in opposite directions. - Instruction::BinaryOps XShiftOpcode = - cast(XShift)->getOpcode(); - if (XShiftOpcode == cast(YShift)->getOpcode()) + auto XShiftOpcode = XShift->getOpcode(); + if (XShiftOpcode == YShift->getOpcode()) return nullptr; // Do not care about same-direction shifts here. Value *X, *XShAmt, *Y, *YShAmt; - match(XShift, m_BinOp(m_Value(X), m_Value(XShAmt))); - match(YShift, m_BinOp(m_Value(Y), m_Value(YShAmt))); + match(XShift, m_BinOp(m_Value(X), m_ZExtOrSelf(m_Value(XShAmt)))); + match(YShift, m_BinOp(m_Value(Y), m_ZExtOrSelf(m_Value(YShAmt)))); + + // If one of the values being shifted is a constant, then we will end with + // and+icmp, and [zext+]shift instrs will be constant-folded. If they are not, + // however, we will need to ensure that we won't increase instruction count. + if (!isa(X) && !isa(Y)) { + // At least one of the hands of the 'and' should be one-use shift. + if (!match(I.getOperand(0), + m_c_And(m_OneUse(m_AnyLogicalShift), m_Value()))) + return nullptr; + if (HadTrunc) { + // Due to the 'trunc', we will need to widen X. For that either the old + // 'trunc' or the shift amt in the non-truncated shift should be one-use. + if (!MaybeTruncation->hasOneUse() && + !NarrowestShift->getOperand(1)->hasOneUse()) + return nullptr; + } + } + + // We have two shift amounts from two different shifts. The types of those + // shift amounts may not match. If that's the case let's bailout now. + if (XShAmt->getType() != YShAmt->getType()) + return nullptr; // Can we fold (XShAmt+YShAmt) ? 
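// Illustrative sketch, not part of the upstream diff (the surrounding function
// continues below): the shift-reassociation fold being implemented here relies on
//   ((x << Q) & (y >> K)) == 0  <=>  ((x << (Q+K)) & y) == 0   iff (Q+K) u< bitwidth,
// which a brute-force i8 check confirms.
#include <cassert>
#include <cstdint>

int main() {
  for (unsigned q = 0; q < 8; ++q)
    for (unsigned k = 0; q + k < 8; ++k)          // new shift amount must stay below the bit width
      for (unsigned x = 0; x <= 255; ++x)
        for (unsigned y = 0; y <= 255; ++y) {
          bool lhs = (uint8_t(x << q) & uint8_t(y >> k)) == 0;
          bool rhs = (uint8_t(x << (q + k)) & y) == 0;
          assert(lhs == rhs);
        }
  return 0;
}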
- Value *NewShAmt = SimplifyBinOp(Instruction::BinaryOps::Add, XShAmt, YShAmt, - SQ.getWithInstruction(&I)); + auto *NewShAmt = dyn_cast_or_null( + SimplifyAddInst(XShAmt, YShAmt, /*isNSW=*/false, + /*isNUW=*/false, SQ.getWithInstruction(&I))); if (!NewShAmt) return nullptr; + NewShAmt = ConstantExpr::getZExtOrBitCast(NewShAmt, WidestTy); + unsigned WidestBitWidth = WidestTy->getScalarSizeInBits(); + // Is the new shift amount smaller than the bit width? // FIXME: could also rely on ConstantRange. - unsigned BitWidth = X->getType()->getScalarSizeInBits(); - if (!match(NewShAmt, m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_ULT, - APInt(BitWidth, BitWidth)))) + if (!match(NewShAmt, + m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_ULT, + APInt(WidestBitWidth, WidestBitWidth)))) return nullptr; - // All good, we can do this fold. The shift is the same that was for X. + + // An extra legality check is needed if we had trunc-of-lshr. + if (HadTrunc && match(WidestShift, m_LShr(m_Value(), m_Value()))) { + auto CanFold = [NewShAmt, WidestBitWidth, NarrowestShift, SQ, + WidestShift]() { + // It isn't obvious whether it's worth it to analyze non-constants here. + // Also, let's basically give up on non-splat cases, pessimizing vectors. + // If *any* of these preconditions matches we can perform the fold. + Constant *NewShAmtSplat = NewShAmt->getType()->isVectorTy() + ? NewShAmt->getSplatValue() + : NewShAmt; + // If it's edge-case shift (by 0 or by WidestBitWidth-1) we can fold. + if (NewShAmtSplat && + (NewShAmtSplat->isNullValue() || + NewShAmtSplat->getUniqueInteger() == WidestBitWidth - 1)) + return true; + // We consider *min* leading zeros so a single outlier + // blocks the transform as opposed to allowing it. + if (auto *C = dyn_cast(NarrowestShift->getOperand(0))) { + KnownBits Known = computeKnownBits(C, SQ.DL); + unsigned MinLeadZero = Known.countMinLeadingZeros(); + // If the value being shifted has at most lowest bit set we can fold. + unsigned MaxActiveBits = Known.getBitWidth() - MinLeadZero; + if (MaxActiveBits <= 1) + return true; + // Precondition: NewShAmt u<= countLeadingZeros(C) + if (NewShAmtSplat && NewShAmtSplat->getUniqueInteger().ule(MinLeadZero)) + return true; + } + if (auto *C = dyn_cast(WidestShift->getOperand(0))) { + KnownBits Known = computeKnownBits(C, SQ.DL); + unsigned MinLeadZero = Known.countMinLeadingZeros(); + // If the value being shifted has at most lowest bit set we can fold. + unsigned MaxActiveBits = Known.getBitWidth() - MinLeadZero; + if (MaxActiveBits <= 1) + return true; + // Precondition: ((WidestBitWidth-1)-NewShAmt) u<= countLeadingZeros(C) + if (NewShAmtSplat) { + APInt AdjNewShAmt = + (WidestBitWidth - 1) - NewShAmtSplat->getUniqueInteger(); + if (AdjNewShAmt.ule(MinLeadZero)) + return true; + } + } + return false; // Can't tell if it's ok. + }; + if (!CanFold()) + return nullptr; + } + + // All good, we can do this fold. + X = Builder.CreateZExt(X, WidestTy); + Y = Builder.CreateZExt(Y, WidestTy); + // The shift is the same that was for X. Value *T0 = XShiftOpcode == Instruction::BinaryOps::LShr ? 
Builder.CreateLShr(X, NewShAmt) : Builder.CreateShl(X, NewShAmt); Value *T1 = Builder.CreateAnd(T0, Y); return Builder.CreateICmp(I.getPredicate(), T1, - Constant::getNullValue(X->getType())); + Constant::getNullValue(WidestTy)); +} + +/// Fold +/// (-1 u/ x) u< y +/// ((x * y) u/ x) != y +/// to +/// @llvm.umul.with.overflow(x, y) plus extraction of overflow bit +/// Note that the comparison is commutative, while inverted (u>=, ==) predicate +/// will mean that we are looking for the opposite answer. +Value *InstCombiner::foldUnsignedMultiplicationOverflowCheck(ICmpInst &I) { + ICmpInst::Predicate Pred; + Value *X, *Y; + Instruction *Mul; + bool NeedNegation; + // Look for: (-1 u/ x) u= y + if (!I.isEquality() && + match(&I, m_c_ICmp(Pred, m_OneUse(m_UDiv(m_AllOnes(), m_Value(X))), + m_Value(Y)))) { + Mul = nullptr; + // Canonicalize as-if y was on RHS. + if (I.getOperand(1) != Y) + Pred = I.getSwappedPredicate(); + + // Are we checking that overflow does not happen, or does happen? + switch (Pred) { + case ICmpInst::Predicate::ICMP_ULT: + NeedNegation = false; + break; // OK + case ICmpInst::Predicate::ICMP_UGE: + NeedNegation = true; + break; // OK + default: + return nullptr; // Wrong predicate. + } + } else // Look for: ((x * y) u/ x) !=/== y + if (I.isEquality() && + match(&I, m_c_ICmp(Pred, m_Value(Y), + m_OneUse(m_UDiv(m_CombineAnd(m_c_Mul(m_Deferred(Y), + m_Value(X)), + m_Instruction(Mul)), + m_Deferred(X)))))) { + NeedNegation = Pred == ICmpInst::Predicate::ICMP_EQ; + } else + return nullptr; + + BuilderTy::InsertPointGuard Guard(Builder); + // If the pattern included (x * y), we'll want to insert new instructions + // right before that original multiplication so that we can replace it. + bool MulHadOtherUses = Mul && !Mul->hasOneUse(); + if (MulHadOtherUses) + Builder.SetInsertPoint(Mul); + + Function *F = Intrinsic::getDeclaration( + I.getModule(), Intrinsic::umul_with_overflow, X->getType()); + CallInst *Call = Builder.CreateCall(F, {X, Y}, "umul"); + + // If the multiplication was used elsewhere, to ensure that we don't leave + // "duplicate" instructions, replace uses of that original multiplication + // with the multiplication result from the with.overflow intrinsic. + if (MulHadOtherUses) + replaceInstUsesWith(*Mul, Builder.CreateExtractValue(Call, 0, "umul.val")); + + Value *Res = Builder.CreateExtractValue(Call, 1, "umul.ov"); + if (NeedNegation) // This technically increases instruction count. + Res = Builder.CreateNot(Res, "umul.not.ov"); + + return Res; } /// Try to fold icmp (binop), X or icmp X, (binop). /// TODO: A large part of this logic is duplicated in InstSimplify's /// simplifyICmpWithBinOp(). We should be able to share that and avoid the code /// duplication. -Instruction *InstCombiner::foldICmpBinOp(ICmpInst &I) { +Instruction *InstCombiner::foldICmpBinOp(ICmpInst &I, const SimplifyQuery &SQ) { + const SimplifyQuery Q = SQ.getWithInstruction(&I); Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); // Special logic for binary operators. @@ -3345,13 +3678,13 @@ Instruction *InstCombiner::foldICmpBinOp(ICmpInst &I) { Value *X; // Convert add-with-unsigned-overflow comparisons into a 'not' with compare. 
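// Illustrative sketch, not part of the upstream diff: the two overflow-check
// idioms recognized by foldUnsignedMultiplicationOverflowCheck above, checked
// exhaustively for i8 (x != 0, since udiv by zero is undefined):
//   (-1 u/ x) u< y        <=>  x * y overflows
//   ((x * y) u/ x) != y   <=>  x * y overflows
#include <cassert>
#include <cstdint>

int main() {
  for (unsigned x = 1; x <= 255; ++x)
    for (unsigned y = 0; y <= 255; ++y) {
      bool overflows = x * y > 255;               // true umul.with.overflow bit for i8
      assert(((255u / x) < y) == overflows);
      unsigned truncatedProduct = uint8_t(x * y); // the 8-bit wrapped product
      assert((truncatedProduct / x != y) == overflows);
    }
  return 0;
}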
- // (Op1 + X) ~Op1 u (Op0 + X) --> X >u ~Op0 + // (Op1 + X) u= Op1 --> ~Op1 u= X if (match(Op0, m_OneUse(m_c_Add(m_Specific(Op1), m_Value(X)))) && - Pred == ICmpInst::ICMP_ULT) + (Pred == ICmpInst::ICMP_ULT || Pred == ICmpInst::ICMP_UGE)) return new ICmpInst(Pred, Builder.CreateNot(Op1), X); + // Op0 u>/u<= (Op0 + X) --> X u>/u<= ~Op0 if (match(Op1, m_OneUse(m_c_Add(m_Specific(Op0), m_Value(X)))) && - Pred == ICmpInst::ICMP_UGT) + (Pred == ICmpInst::ICMP_UGT || Pred == ICmpInst::ICMP_ULE)) return new ICmpInst(Pred, X, Builder.CreateNot(Op0)); bool NoOp0WrapProblem = false, NoOp1WrapProblem = false; @@ -3378,21 +3711,21 @@ Instruction *InstCombiner::foldICmpBinOp(ICmpInst &I) { D = BO1->getOperand(1); } - // icmp (X+Y), X -> icmp Y, 0 for equalities or if there is no overflow. + // icmp (A+B), A -> icmp B, 0 for equalities or if there is no overflow. + // icmp (A+B), B -> icmp A, 0 for equalities or if there is no overflow. if ((A == Op1 || B == Op1) && NoOp0WrapProblem) return new ICmpInst(Pred, A == Op1 ? B : A, Constant::getNullValue(Op1->getType())); - // icmp X, (X+Y) -> icmp 0, Y for equalities or if there is no overflow. + // icmp C, (C+D) -> icmp 0, D for equalities or if there is no overflow. + // icmp D, (C+D) -> icmp 0, C for equalities or if there is no overflow. if ((C == Op0 || D == Op0) && NoOp1WrapProblem) return new ICmpInst(Pred, Constant::getNullValue(Op0->getType()), C == Op0 ? D : C); - // icmp (X+Y), (X+Z) -> icmp Y, Z for equalities or if there is no overflow. + // icmp (A+B), (A+D) -> icmp B, D for equalities or if there is no overflow. if (A && C && (A == C || A == D || B == C || B == D) && NoOp0WrapProblem && - NoOp1WrapProblem && - // Try not to increase register pressure. - BO0->hasOneUse() && BO1->hasOneUse()) { + NoOp1WrapProblem) { // Determine Y and Z in the form icmp (X+Y), (X+Z). 
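// Illustrative sketch, not part of the upstream diff: the widened
// add-with-unsigned-overflow fold above, (a + x) u</u>= a <--> ~a u</u>= x,
// is the classic "did the addition wrap" test; an exhaustive i8 check:
#include <cassert>
#include <cstdint>

int main() {
  for (unsigned a = 0; a <= 255; ++a)
    for (unsigned x = 0; x <= 255; ++x) {
      unsigned sum = uint8_t(a + x);   // 8-bit wrapping add
      unsigned nota = uint8_t(~a);     // ~a in 8 bits, i.e. 255 - a
      assert((sum < a) == (nota < x));     // (a + x) u< a   <->  ~a u< x
      assert((sum >= a) == (nota >= x));   // (a + x) u>= a  <->  ~a u>= x
    }
  return 0;
}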
Value *Y, *Z; if (A == C) { @@ -3416,39 +3749,39 @@ Instruction *InstCombiner::foldICmpBinOp(ICmpInst &I) { return new ICmpInst(Pred, Y, Z); } - // icmp slt (X + -1), Y -> icmp sle X, Y + // icmp slt (A + -1), Op1 -> icmp sle A, Op1 if (A && NoOp0WrapProblem && Pred == CmpInst::ICMP_SLT && match(B, m_AllOnes())) return new ICmpInst(CmpInst::ICMP_SLE, A, Op1); - // icmp sge (X + -1), Y -> icmp sgt X, Y + // icmp sge (A + -1), Op1 -> icmp sgt A, Op1 if (A && NoOp0WrapProblem && Pred == CmpInst::ICMP_SGE && match(B, m_AllOnes())) return new ICmpInst(CmpInst::ICMP_SGT, A, Op1); - // icmp sle (X + 1), Y -> icmp slt X, Y + // icmp sle (A + 1), Op1 -> icmp slt A, Op1 if (A && NoOp0WrapProblem && Pred == CmpInst::ICMP_SLE && match(B, m_One())) return new ICmpInst(CmpInst::ICMP_SLT, A, Op1); - // icmp sgt (X + 1), Y -> icmp sge X, Y + // icmp sgt (A + 1), Op1 -> icmp sge A, Op1 if (A && NoOp0WrapProblem && Pred == CmpInst::ICMP_SGT && match(B, m_One())) return new ICmpInst(CmpInst::ICMP_SGE, A, Op1); - // icmp sgt X, (Y + -1) -> icmp sge X, Y + // icmp sgt Op0, (C + -1) -> icmp sge Op0, C if (C && NoOp1WrapProblem && Pred == CmpInst::ICMP_SGT && match(D, m_AllOnes())) return new ICmpInst(CmpInst::ICMP_SGE, Op0, C); - // icmp sle X, (Y + -1) -> icmp slt X, Y + // icmp sle Op0, (C + -1) -> icmp slt Op0, C if (C && NoOp1WrapProblem && Pred == CmpInst::ICMP_SLE && match(D, m_AllOnes())) return new ICmpInst(CmpInst::ICMP_SLT, Op0, C); - // icmp sge X, (Y + 1) -> icmp sgt X, Y + // icmp sge Op0, (C + 1) -> icmp sgt Op0, C if (C && NoOp1WrapProblem && Pred == CmpInst::ICMP_SGE && match(D, m_One())) return new ICmpInst(CmpInst::ICMP_SGT, Op0, C); - // icmp slt X, (Y + 1) -> icmp sle X, Y + // icmp slt Op0, (C + 1) -> icmp sle Op0, C if (C && NoOp1WrapProblem && Pred == CmpInst::ICMP_SLT && match(D, m_One())) return new ICmpInst(CmpInst::ICMP_SLE, Op0, C); @@ -3456,33 +3789,33 @@ Instruction *InstCombiner::foldICmpBinOp(ICmpInst &I) { // canonicalization from (X -nuw 1) to (X + -1) means that the combinations // wouldn't happen even if they were implemented. // - // icmp ult (X - 1), Y -> icmp ule X, Y - // icmp uge (X - 1), Y -> icmp ugt X, Y - // icmp ugt X, (Y - 1) -> icmp uge X, Y - // icmp ule X, (Y - 1) -> icmp ult X, Y + // icmp ult (A - 1), Op1 -> icmp ule A, Op1 + // icmp uge (A - 1), Op1 -> icmp ugt A, Op1 + // icmp ugt Op0, (C - 1) -> icmp uge Op0, C + // icmp ule Op0, (C - 1) -> icmp ult Op0, C - // icmp ule (X + 1), Y -> icmp ult X, Y + // icmp ule (A + 1), Op0 -> icmp ult A, Op1 if (A && NoOp0WrapProblem && Pred == CmpInst::ICMP_ULE && match(B, m_One())) return new ICmpInst(CmpInst::ICMP_ULT, A, Op1); - // icmp ugt (X + 1), Y -> icmp uge X, Y + // icmp ugt (A + 1), Op0 -> icmp uge A, Op1 if (A && NoOp0WrapProblem && Pred == CmpInst::ICMP_UGT && match(B, m_One())) return new ICmpInst(CmpInst::ICMP_UGE, A, Op1); - // icmp uge X, (Y + 1) -> icmp ugt X, Y + // icmp uge Op0, (C + 1) -> icmp ugt Op0, C if (C && NoOp1WrapProblem && Pred == CmpInst::ICMP_UGE && match(D, m_One())) return new ICmpInst(CmpInst::ICMP_UGT, Op0, C); - // icmp ult X, (Y + 1) -> icmp ule X, Y + // icmp ult Op0, (C + 1) -> icmp ule Op0, C if (C && NoOp1WrapProblem && Pred == CmpInst::ICMP_ULT && match(D, m_One())) return new ICmpInst(CmpInst::ICMP_ULE, Op0, C); // if C1 has greater magnitude than C2: - // icmp (X + C1), (Y + C2) -> icmp (X + C3), Y + // icmp (A + C1), (C + C2) -> icmp (A + C3), C // s.t. 
C3 = C1 - C2 // // if C2 has greater magnitude than C1: - // icmp (X + C1), (Y + C2) -> icmp X, (Y + C3) + // icmp (A + C1), (C + C2) -> icmp A, (C + C3) // s.t. C3 = C2 - C1 if (A && C && NoOp0WrapProblem && NoOp1WrapProblem && (BO0->hasOneUse() || BO1->hasOneUse()) && !I.isUnsigned()) @@ -3520,29 +3853,35 @@ Instruction *InstCombiner::foldICmpBinOp(ICmpInst &I) { D = BO1->getOperand(1); } - // icmp (X-Y), X -> icmp 0, Y for equalities or if there is no overflow. + // icmp (A-B), A -> icmp 0, B for equalities or if there is no overflow. if (A == Op1 && NoOp0WrapProblem) return new ICmpInst(Pred, Constant::getNullValue(Op1->getType()), B); - // icmp X, (X-Y) -> icmp Y, 0 for equalities or if there is no overflow. + // icmp C, (C-D) -> icmp D, 0 for equalities or if there is no overflow. if (C == Op0 && NoOp1WrapProblem) return new ICmpInst(Pred, D, Constant::getNullValue(Op0->getType())); - // (A - B) >u A --> A C icmp Y, Z for equalities or if there is no overflow. - if (B && D && B == D && NoOp0WrapProblem && NoOp1WrapProblem && - // Try not to increase register pressure. - BO0->hasOneUse() && BO1->hasOneUse()) + // Convert sub-with-unsigned-overflow comparisons into a comparison of args. + // (A - B) u>/u<= A --> B u>/u<= A + if (A == Op1 && (Pred == ICmpInst::ICMP_UGT || Pred == ICmpInst::ICMP_ULE)) + return new ICmpInst(Pred, B, A); + // C u= (C - D) --> C u= D + if (C == Op0 && (Pred == ICmpInst::ICMP_ULT || Pred == ICmpInst::ICMP_UGE)) + return new ICmpInst(Pred, C, D); + // (A - B) u>=/u< A --> B u>/u<= A iff B != 0 + if (A == Op1 && (Pred == ICmpInst::ICMP_UGE || Pred == ICmpInst::ICMP_ULT) && + isKnownNonZero(B, Q.DL, /*Depth=*/0, Q.AC, Q.CxtI, Q.DT)) + return new ICmpInst(CmpInst::getFlippedStrictnessPredicate(Pred), B, A); + // C u<=/u> (C - D) --> C u= D iff B != 0 + if (C == Op0 && (Pred == ICmpInst::ICMP_ULE || Pred == ICmpInst::ICMP_UGT) && + isKnownNonZero(D, Q.DL, /*Depth=*/0, Q.AC, Q.CxtI, Q.DT)) + return new ICmpInst(CmpInst::getFlippedStrictnessPredicate(Pred), C, D); + + // icmp (A-B), (C-B) -> icmp A, C for equalities or if there is no overflow. + if (B && D && B == D && NoOp0WrapProblem && NoOp1WrapProblem) return new ICmpInst(Pred, A, C); - // icmp (X-Y), (X-Z) -> icmp Z, Y for equalities or if there is no overflow. - if (A && C && A == C && NoOp0WrapProblem && NoOp1WrapProblem && - // Try not to increase register pressure. - BO0->hasOneUse() && BO1->hasOneUse()) + + // icmp (A-B), (A-D) -> icmp D, B for equalities or if there is no overflow. + if (A && C && A == C && NoOp0WrapProblem && NoOp1WrapProblem) return new ICmpInst(Pred, D, B); // icmp (0-X) < cst --> x > -cst @@ -3677,6 +4016,9 @@ Instruction *InstCombiner::foldICmpBinOp(ICmpInst &I) { } } + if (Value *V = foldUnsignedMultiplicationOverflowCheck(I)) + return replaceInstUsesWith(I, V); + if (Value *V = foldICmpWithLowBitMaskedVal(I, Builder)) return replaceInstUsesWith(I, V); @@ -3953,125 +4295,140 @@ Instruction *InstCombiner::foldICmpEquality(ICmpInst &I) { return nullptr; } -/// Handle icmp (cast x to y), (cast/cst). We only handle extending casts so -/// far. -Instruction *InstCombiner::foldICmpWithCastAndCast(ICmpInst &ICmp) { - const CastInst *LHSCI = cast(ICmp.getOperand(0)); - Value *LHSCIOp = LHSCI->getOperand(0); - Type *SrcTy = LHSCIOp->getType(); - Type *DestTy = LHSCI->getType(); - - // Turn icmp (ptrtoint x), (ptrtoint/c) into a compare of the input if the - // integer type is the same size as the pointer type. 
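// Illustrative sketch, not part of the upstream diff: the
// sub-with-unsigned-overflow compares introduced above, checked for i8. The
// strict forms hold unconditionally; the non-strict forms additionally require
// the subtrahend to be known non-zero, exactly as the code comments state.
#include <cassert>
#include <cstdint>

int main() {
  for (unsigned a = 0; a <= 255; ++a)
    for (unsigned b = 0; b <= 255; ++b) {
      unsigned diff = uint8_t(a - b);     // 8-bit wrapping subtract
      assert((diff > a) == (b > a));      // (a - b) u> a   <->  b u> a
      assert((diff <= a) == (b <= a));    // (a - b) u<= a  <->  b u<= a
      if (b != 0)
        assert((diff >= a) == (b > a));   // (a - b) u>= a  <->  b u> a   iff b != 0
    }
  return 0;
}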
- const auto& CompatibleSizes = [&](Type* SrcTy, Type* DestTy) -> bool { - if (isa(SrcTy)) { - SrcTy = cast(SrcTy)->getElementType(); - DestTy = cast(DestTy)->getElementType(); - } - return DL.getPointerTypeSizeInBits(SrcTy) == DestTy->getIntegerBitWidth(); - }; - if (LHSCI->getOpcode() == Instruction::PtrToInt && - CompatibleSizes(SrcTy, DestTy)) { - Value *RHSOp = nullptr; - if (auto *RHSC = dyn_cast(ICmp.getOperand(1))) { - Value *RHSCIOp = RHSC->getOperand(0); - if (RHSCIOp->getType()->getPointerAddressSpace() == - LHSCIOp->getType()->getPointerAddressSpace()) { - RHSOp = RHSC->getOperand(0); - // If the pointer types don't match, insert a bitcast. - if (LHSCIOp->getType() != RHSOp->getType()) - RHSOp = Builder.CreateBitCast(RHSOp, LHSCIOp->getType()); - } - } else if (auto *RHSC = dyn_cast(ICmp.getOperand(1))) { - RHSOp = ConstantExpr::getIntToPtr(RHSC, SrcTy); - } - - if (RHSOp) - return new ICmpInst(ICmp.getPredicate(), LHSCIOp, RHSOp); - } - - // The code below only handles extension cast instructions, so far. - // Enforce this. - if (LHSCI->getOpcode() != Instruction::ZExt && - LHSCI->getOpcode() != Instruction::SExt) +static Instruction *foldICmpWithZextOrSext(ICmpInst &ICmp, + InstCombiner::BuilderTy &Builder) { + assert(isa(ICmp.getOperand(0)) && "Expected cast for operand 0"); + auto *CastOp0 = cast(ICmp.getOperand(0)); + Value *X; + if (!match(CastOp0, m_ZExtOrSExt(m_Value(X)))) return nullptr; - bool isSignedExt = LHSCI->getOpcode() == Instruction::SExt; - bool isSignedCmp = ICmp.isSigned(); - - if (auto *CI = dyn_cast(ICmp.getOperand(1))) { - // Not an extension from the same type? - Value *RHSCIOp = CI->getOperand(0); - if (RHSCIOp->getType() != LHSCIOp->getType()) - return nullptr; - + bool IsSignedExt = CastOp0->getOpcode() == Instruction::SExt; + bool IsSignedCmp = ICmp.isSigned(); + if (auto *CastOp1 = dyn_cast(ICmp.getOperand(1))) { // If the signedness of the two casts doesn't agree (i.e. one is a sext // and the other is a zext), then we can't handle this. - if (CI->getOpcode() != LHSCI->getOpcode()) + // TODO: This is too strict. We can handle some predicates (equality?). + if (CastOp0->getOpcode() != CastOp1->getOpcode()) return nullptr; - // Deal with equality cases early. + // Not an extension from the same type? + Value *Y = CastOp1->getOperand(0); + Type *XTy = X->getType(), *YTy = Y->getType(); + if (XTy != YTy) { + // One of the casts must have one use because we are creating a new cast. + if (!CastOp0->hasOneUse() && !CastOp1->hasOneUse()) + return nullptr; + // Extend the narrower operand to the type of the wider operand. + if (XTy->getScalarSizeInBits() < YTy->getScalarSizeInBits()) + X = Builder.CreateCast(CastOp0->getOpcode(), X, YTy); + else if (YTy->getScalarSizeInBits() < XTy->getScalarSizeInBits()) + Y = Builder.CreateCast(CastOp0->getOpcode(), Y, XTy); + else + return nullptr; + } + + // (zext X) == (zext Y) --> X == Y + // (sext X) == (sext Y) --> X == Y if (ICmp.isEquality()) - return new ICmpInst(ICmp.getPredicate(), LHSCIOp, RHSCIOp); + return new ICmpInst(ICmp.getPredicate(), X, Y); // A signed comparison of sign extended values simplifies into a // signed comparison. - if (isSignedCmp && isSignedExt) - return new ICmpInst(ICmp.getPredicate(), LHSCIOp, RHSCIOp); + if (IsSignedCmp && IsSignedExt) + return new ICmpInst(ICmp.getPredicate(), X, Y); // The other three cases all fold into an unsigned comparison. 
- return new ICmpInst(ICmp.getUnsignedPredicate(), LHSCIOp, RHSCIOp); + return new ICmpInst(ICmp.getUnsignedPredicate(), X, Y); } - // If we aren't dealing with a constant on the RHS, exit early. + // Below here, we are only folding a compare with constant. auto *C = dyn_cast(ICmp.getOperand(1)); if (!C) return nullptr; // Compute the constant that would happen if we truncated to SrcTy then // re-extended to DestTy. + Type *SrcTy = CastOp0->getSrcTy(); + Type *DestTy = CastOp0->getDestTy(); Constant *Res1 = ConstantExpr::getTrunc(C, SrcTy); - Constant *Res2 = ConstantExpr::getCast(LHSCI->getOpcode(), Res1, DestTy); + Constant *Res2 = ConstantExpr::getCast(CastOp0->getOpcode(), Res1, DestTy); // If the re-extended constant didn't change... if (Res2 == C) { - // Deal with equality cases early. if (ICmp.isEquality()) - return new ICmpInst(ICmp.getPredicate(), LHSCIOp, Res1); + return new ICmpInst(ICmp.getPredicate(), X, Res1); // A signed comparison of sign extended values simplifies into a // signed comparison. - if (isSignedExt && isSignedCmp) - return new ICmpInst(ICmp.getPredicate(), LHSCIOp, Res1); + if (IsSignedExt && IsSignedCmp) + return new ICmpInst(ICmp.getPredicate(), X, Res1); // The other three cases all fold into an unsigned comparison. - return new ICmpInst(ICmp.getUnsignedPredicate(), LHSCIOp, Res1); + return new ICmpInst(ICmp.getUnsignedPredicate(), X, Res1); } // The re-extended constant changed, partly changed (in the case of a vector), // or could not be determined to be equal (in the case of a constant // expression), so the constant cannot be represented in the shorter type. - // Consequently, we cannot emit a simple comparison. // All the cases that fold to true or false will have already been handled // by SimplifyICmpInst, so only deal with the tricky case. + if (IsSignedCmp || !IsSignedExt || !isa(C)) + return nullptr; + + // Is source op positive? + // icmp ult (sext X), C --> icmp sgt X, -1 + if (ICmp.getPredicate() == ICmpInst::ICMP_ULT) + return new ICmpInst(CmpInst::ICMP_SGT, X, Constant::getAllOnesValue(SrcTy)); + + // Is source op negative? + // icmp ugt (sext X), C --> icmp slt X, 0 + assert(ICmp.getPredicate() == ICmpInst::ICMP_UGT && "ICmp should be folded!"); + return new ICmpInst(CmpInst::ICMP_SLT, X, Constant::getNullValue(SrcTy)); +} - if (isSignedCmp || !isSignedExt || !isa(C)) +/// Handle icmp (cast x), (cast or constant). +Instruction *InstCombiner::foldICmpWithCastOp(ICmpInst &ICmp) { + auto *CastOp0 = dyn_cast(ICmp.getOperand(0)); + if (!CastOp0) + return nullptr; + if (!isa(ICmp.getOperand(1)) && !isa(ICmp.getOperand(1))) return nullptr; - // Evaluate the comparison for LT (we invert for GT below). LE and GE cases - // should have been folded away previously and not enter in here. + Value *Op0Src = CastOp0->getOperand(0); + Type *SrcTy = CastOp0->getSrcTy(); + Type *DestTy = CastOp0->getDestTy(); - // We're performing an unsigned comp with a sign extended value. - // This is true if the input is >= 0. [aka >s -1] - Constant *NegOne = Constant::getAllOnesValue(SrcTy); - Value *Result = Builder.CreateICmpSGT(LHSCIOp, NegOne, ICmp.getName()); + // Turn icmp (ptrtoint x), (ptrtoint/c) into a compare of the input if the + // integer type is the same size as the pointer type. 
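// Illustrative sketch, not part of the upstream diff: when a sign-extended value
// is compared unsigned against a constant that does not survive the
// trunc-then-sext round trip, the compare collapses to a sign test, as in the
// two folds above. Example with i8 sign-extended to i32 and C = 1000 (which
// truncates to -24 and re-extends to a different value):
#include <cassert>
#include <cstdint>

int main() {
  const uint32_t C = 1000;
  for (int x = -128; x <= 127; ++x) {
    uint32_t ext = uint32_t(int32_t(x));   // sext i8 -> i32, viewed as unsigned
    assert((ext < C) == (x > -1));         // icmp ult (sext X), C  -->  icmp sgt X, -1
    assert((ext > C) == (x < 0));          // icmp ugt (sext X), C  -->  icmp slt X, 0
  }
  return 0;
}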
+ auto CompatibleSizes = [&](Type *SrcTy, Type *DestTy) { + if (isa(SrcTy)) { + SrcTy = cast(SrcTy)->getElementType(); + DestTy = cast(DestTy)->getElementType(); + } + return DL.getPointerTypeSizeInBits(SrcTy) == DestTy->getIntegerBitWidth(); + }; + if (CastOp0->getOpcode() == Instruction::PtrToInt && + CompatibleSizes(SrcTy, DestTy)) { + Value *NewOp1 = nullptr; + if (auto *PtrToIntOp1 = dyn_cast(ICmp.getOperand(1))) { + Value *PtrSrc = PtrToIntOp1->getOperand(0); + if (PtrSrc->getType()->getPointerAddressSpace() == + Op0Src->getType()->getPointerAddressSpace()) { + NewOp1 = PtrToIntOp1->getOperand(0); + // If the pointer types don't match, insert a bitcast. + if (Op0Src->getType() != NewOp1->getType()) + NewOp1 = Builder.CreateBitCast(NewOp1, Op0Src->getType()); + } + } else if (auto *RHSC = dyn_cast(ICmp.getOperand(1))) { + NewOp1 = ConstantExpr::getIntToPtr(RHSC, SrcTy); + } - // Finally, return the value computed. - if (ICmp.getPredicate() == ICmpInst::ICMP_ULT) - return replaceInstUsesWith(ICmp, Result); + if (NewOp1) + return new ICmpInst(ICmp.getPredicate(), Op0Src, NewOp1); + } - assert(ICmp.getPredicate() == ICmpInst::ICMP_UGT && "ICmp should be folded!"); - return BinaryOperator::CreateNot(Result); + return foldICmpWithZextOrSext(ICmp, Builder); } static bool isNeutralValue(Instruction::BinaryOps BinaryOp, Value *RHS) { @@ -4791,41 +5148,35 @@ Instruction *InstCombiner::foldICmpUsingKnownBits(ICmpInst &I) { return nullptr; } -/// If we have an icmp le or icmp ge instruction with a constant operand, turn -/// it into the appropriate icmp lt or icmp gt instruction. This transform -/// allows them to be folded in visitICmpInst. -static ICmpInst *canonicalizeCmpWithConstant(ICmpInst &I) { - ICmpInst::Predicate Pred = I.getPredicate(); - if (Pred != ICmpInst::ICMP_SLE && Pred != ICmpInst::ICMP_SGE && - Pred != ICmpInst::ICMP_ULE && Pred != ICmpInst::ICMP_UGE) - return nullptr; +llvm::Optional> +llvm::getFlippedStrictnessPredicateAndConstant(CmpInst::Predicate Pred, + Constant *C) { + assert(ICmpInst::isRelational(Pred) && ICmpInst::isIntPredicate(Pred) && + "Only for relational integer predicates."); - Value *Op0 = I.getOperand(0); - Value *Op1 = I.getOperand(1); - auto *Op1C = dyn_cast(Op1); - if (!Op1C) - return nullptr; + Type *Type = C->getType(); + bool IsSigned = ICmpInst::isSigned(Pred); + + CmpInst::Predicate UnsignedPred = ICmpInst::getUnsignedPredicate(Pred); + bool WillIncrement = + UnsignedPred == ICmpInst::ICMP_ULE || UnsignedPred == ICmpInst::ICMP_UGT; - // Check if the constant operand can be safely incremented/decremented without - // overflowing/underflowing. For scalars, SimplifyICmpInst has already handled - // the edge cases for us, so we just assert on them. For vectors, we must - // handle the edge cases. - Type *Op1Type = Op1->getType(); - bool IsSigned = I.isSigned(); - bool IsLE = (Pred == ICmpInst::ICMP_SLE || Pred == ICmpInst::ICMP_ULE); - auto *CI = dyn_cast(Op1C); - if (CI) { - // A <= MAX -> TRUE ; A >= MIN -> TRUE - assert(IsLE ? !CI->isMaxValue(IsSigned) : !CI->isMinValue(IsSigned)); - } else if (Op1Type->isVectorTy()) { - // TODO? If the edge cases for vectors were guaranteed to be handled as they - // are for scalar, we could remove the min/max checks. However, to do that, - // we would have to use insertelement/shufflevector to replace edge values. - unsigned NumElts = Op1Type->getVectorNumElements(); + // Check if the constant operand can be safely incremented/decremented + // without overflowing/underflowing. 
+ auto ConstantIsOk = [WillIncrement, IsSigned](ConstantInt *C) { + return WillIncrement ? !C->isMaxValue(IsSigned) : !C->isMinValue(IsSigned); + }; + + if (auto *CI = dyn_cast(C)) { + // Bail out if the constant can't be safely incremented/decremented. + if (!ConstantIsOk(CI)) + return llvm::None; + } else if (Type->isVectorTy()) { + unsigned NumElts = Type->getVectorNumElements(); for (unsigned i = 0; i != NumElts; ++i) { - Constant *Elt = Op1C->getAggregateElement(i); + Constant *Elt = C->getAggregateElement(i); if (!Elt) - return nullptr; + return llvm::None; if (isa(Elt)) continue; @@ -4833,20 +5184,43 @@ static ICmpInst *canonicalizeCmpWithConstant(ICmpInst &I) { // Bail out if we can't determine if this constant is min/max or if we // know that this constant is min/max. auto *CI = dyn_cast(Elt); - if (!CI || (IsLE ? CI->isMaxValue(IsSigned) : CI->isMinValue(IsSigned))) - return nullptr; + if (!CI || !ConstantIsOk(CI)) + return llvm::None; } } else { // ConstantExpr? - return nullptr; + return llvm::None; } - // Increment or decrement the constant and set the new comparison predicate: - // ULE -> ULT ; UGE -> UGT ; SLE -> SLT ; SGE -> SGT - Constant *OneOrNegOne = ConstantInt::get(Op1Type, IsLE ? 1 : -1, true); - CmpInst::Predicate NewPred = IsLE ? ICmpInst::ICMP_ULT: ICmpInst::ICMP_UGT; - NewPred = IsSigned ? ICmpInst::getSignedPredicate(NewPred) : NewPred; - return new ICmpInst(NewPred, Op0, ConstantExpr::getAdd(Op1C, OneOrNegOne)); + CmpInst::Predicate NewPred = CmpInst::getFlippedStrictnessPredicate(Pred); + + // Increment or decrement the constant. + Constant *OneOrNegOne = ConstantInt::get(Type, WillIncrement ? 1 : -1, true); + Constant *NewC = ConstantExpr::getAdd(C, OneOrNegOne); + + return std::make_pair(NewPred, NewC); +} + +/// If we have an icmp le or icmp ge instruction with a constant operand, turn +/// it into the appropriate icmp lt or icmp gt instruction. This transform +/// allows them to be folded in visitICmpInst. +static ICmpInst *canonicalizeCmpWithConstant(ICmpInst &I) { + ICmpInst::Predicate Pred = I.getPredicate(); + if (ICmpInst::isEquality(Pred) || !ICmpInst::isIntPredicate(Pred) || + isCanonicalPredicate(Pred)) + return nullptr; + + Value *Op0 = I.getOperand(0); + Value *Op1 = I.getOperand(1); + auto *Op1C = dyn_cast(Op1); + if (!Op1C) + return nullptr; + + auto FlippedStrictness = getFlippedStrictnessPredicateAndConstant(Pred, Op1C); + if (!FlippedStrictness) + return nullptr; + + return new ICmpInst(FlippedStrictness->first, Op0, FlippedStrictness->second); } /// Integer compare with boolean values can always be turned into bitwise ops. 
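// Illustrative sketch, not part of the upstream diff: the strictness flip
// performed by getFlippedStrictnessPredicateAndConstant is just
//   x <=/>= C  <->  x </> (C +/- 1),
// valid whenever incrementing/decrementing C cannot overflow (the
// ConstantIsOk check above). A spot check for i8 with C = 42:
#include <cassert>
#include <cstdint>

int main() {
  const int8_t C = 42;   // any constant away from the min/max edge cases
  for (int x = -128; x <= 127; ++x) {
    assert((int8_t(x) <= C) == (int8_t(x) < int8_t(C + 1)));               // sle C -> slt C+1
    assert((int8_t(x) >= C) == (int8_t(x) > int8_t(C - 1)));               // sge C -> sgt C-1
    assert((uint8_t(x) <= uint8_t(C)) == (uint8_t(x) < uint8_t(C + 1)));   // ule C -> ult C+1
  }
  return 0;
}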
@@ -5002,6 +5376,7 @@ static Instruction *foldVectorCmp(CmpInst &Cmp, Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { bool Changed = false; + const SimplifyQuery Q = SQ.getWithInstruction(&I); Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); unsigned Op0Cplxity = getComplexity(Op0); unsigned Op1Cplxity = getComplexity(Op1); @@ -5016,8 +5391,7 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { Changed = true; } - if (Value *V = SimplifyICmpInst(I.getPredicate(), Op0, Op1, - SQ.getWithInstruction(&I))) + if (Value *V = SimplifyICmpInst(I.getPredicate(), Op0, Op1, Q)) return replaceInstUsesWith(I, V); // Comparing -val or val with non-zero is the same as just comparing val @@ -5050,6 +5424,9 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { if (Instruction *Res = foldICmpWithDominatingICmp(I)) return Res; + if (Instruction *Res = foldICmpBinOp(I, Q)) + return Res; + if (Instruction *Res = foldICmpUsingKnownBits(I)) return Res; @@ -5098,6 +5475,11 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { if (Instruction *Res = foldICmpInstWithConstant(I)) return Res; + // Try to match comparison as a sign bit test. Intentionally do this after + // foldICmpInstWithConstant() to potentially let other folds to happen first. + if (Instruction *New = foldSignBitTest(I)) + return New; + if (Instruction *Res = foldICmpInstWithConstantNotInt(I)) return Res; @@ -5124,20 +5506,8 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { if (Instruction *Res = foldICmpBitCast(I, Builder)) return Res; - if (isa(Op0)) { - // Handle the special case of: icmp (cast bool to X), - // This comes up when you have code like - // int X = A < B; - // if (X) ... - // For generality, we handle any zero-extension of any operand comparison - // with a constant or another cast from the same type. - if (isa(Op1) || isa(Op1)) - if (Instruction *R = foldICmpWithCastAndCast(I)) - return R; - } - - if (Instruction *Res = foldICmpBinOp(I)) - return Res; + if (Instruction *R = foldICmpWithCastOp(I)) + return R; if (Instruction *Res = foldICmpWithMinMax(I)) return Res; diff --git a/lib/Transforms/InstCombine/InstCombineInternal.h b/lib/Transforms/InstCombine/InstCombineInternal.h index 434b0d591215..1dbc06d92e7a 100644 --- a/lib/Transforms/InstCombine/InstCombineInternal.h +++ b/lib/Transforms/InstCombine/InstCombineInternal.h @@ -113,6 +113,48 @@ static inline bool isCanonicalPredicate(CmpInst::Predicate Pred) { } } +/// Given an exploded icmp instruction, return true if the comparison only +/// checks the sign bit. If it only checks the sign bit, set TrueIfSigned if the +/// result of the comparison is true when the input value is signed. 
+inline bool isSignBitCheck(ICmpInst::Predicate Pred, const APInt &RHS, + bool &TrueIfSigned) { + switch (Pred) { + case ICmpInst::ICMP_SLT: // True if LHS s< 0 + TrueIfSigned = true; + return RHS.isNullValue(); + case ICmpInst::ICMP_SLE: // True if LHS s<= -1 + TrueIfSigned = true; + return RHS.isAllOnesValue(); + case ICmpInst::ICMP_SGT: // True if LHS s> -1 + TrueIfSigned = false; + return RHS.isAllOnesValue(); + case ICmpInst::ICMP_SGE: // True if LHS s>= 0 + TrueIfSigned = false; + return RHS.isNullValue(); + case ICmpInst::ICMP_UGT: + // True if LHS u> RHS and RHS == sign-bit-mask - 1 + TrueIfSigned = true; + return RHS.isMaxSignedValue(); + case ICmpInst::ICMP_UGE: + // True if LHS u>= RHS and RHS == sign-bit-mask (2^7, 2^15, 2^31, etc) + TrueIfSigned = true; + return RHS.isMinSignedValue(); + case ICmpInst::ICMP_ULT: + // True if LHS u< RHS and RHS == sign-bit-mask (2^7, 2^15, 2^31, etc) + TrueIfSigned = false; + return RHS.isMinSignedValue(); + case ICmpInst::ICMP_ULE: + // True if LHS u<= RHS and RHS == sign-bit-mask - 1 + TrueIfSigned = false; + return RHS.isMaxSignedValue(); + default: + return false; + } +} + +llvm::Optional> +getFlippedStrictnessPredicateAndConstant(CmpInst::Predicate Pred, Constant *C); + /// Return the source operand of a potentially bitcasted value while optionally /// checking if it has one use. If there is no bitcast or the one use check is /// not met, return the input value itself. @@ -139,32 +181,17 @@ static inline Constant *SubOne(Constant *C) { /// This happens in cases where the ~ can be eliminated. If WillInvertAllUses /// is true, work under the assumption that the caller intends to remove all /// uses of V and only keep uses of ~V. -static inline bool IsFreeToInvert(Value *V, bool WillInvertAllUses) { +/// +/// See also: canFreelyInvertAllUsersOf() +static inline bool isFreeToInvert(Value *V, bool WillInvertAllUses) { // ~(~(X)) -> X. if (match(V, m_Not(m_Value()))) return true; // Constants can be considered to be not'ed values. - if (isa(V)) + if (match(V, m_AnyIntegralConstant())) return true; - // A vector of constant integers can be inverted easily. - if (V->getType()->isVectorTy() && isa(V)) { - unsigned NumElts = V->getType()->getVectorNumElements(); - for (unsigned i = 0; i != NumElts; ++i) { - Constant *Elt = cast(V)->getAggregateElement(i); - if (!Elt) - return false; - - if (isa(Elt)) - continue; - - if (!isa(Elt)) - return false; - } - return true; - } - // Compares can be inverted if all of their uses are being modified to use the // ~V. if (isa(V)) @@ -185,6 +212,32 @@ static inline bool IsFreeToInvert(Value *V, bool WillInvertAllUses) { return false; } +/// Given i1 V, can every user of V be freely adapted if V is changed to !V ? +/// +/// See also: isFreeToInvert() +static inline bool canFreelyInvertAllUsersOf(Value *V, Value *IgnoredUser) { + // Look at every user of V. + for (User *U : V->users()) { + if (U == IgnoredUser) + continue; // Don't consider this user. + + auto *I = cast(U); + switch (I->getOpcode()) { + case Instruction::Select: + case Instruction::Br: + break; // Free to invert by swapping true/false values/destinations. + case Instruction::Xor: // Can invert 'xor' if it's a 'not', by ignoring it. + if (!match(I, m_Not(m_Value()))) + return false; // Not a 'not'. + break; + default: + return false; // Don't know, likely not freely invertible. + } + // So far all users were free to invert... + } + return true; // Can freely invert all users! 
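// Illustrative sketch, not part of the upstream diff: the unsigned rows of the
// isSignBitCheck table above all reduce to "is the value negative when viewed
// as signed". Exhaustive i8 check (sign-bit mask 0x80):
#include <cassert>
#include <cstdint>

int main() {
  for (int v = -128; v <= 127; ++v) {
    bool negative = int8_t(v) < 0;
    uint8_t u = uint8_t(v);
    assert((u > 0x7F) == negative);    // ICMP_UGT, RHS == sign-bit-mask - 1
    assert((u >= 0x80) == negative);   // ICMP_UGE, RHS == sign-bit-mask
    assert((u < 0x80) == !negative);   // ICMP_ULT, RHS == sign-bit-mask
    assert((u <= 0x7F) == !negative);  // ICMP_ULE, RHS == sign-bit-mask - 1
  }
  return 0;
}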
+} + /// Some binary operators require special handling to avoid poison and undefined /// behavior. If a constant vector has undef elements, replace those undefs with /// identity constants if possible because those are always safe to execute. @@ -337,6 +390,13 @@ public: Instruction *visitOr(BinaryOperator &I); Instruction *visitXor(BinaryOperator &I); Instruction *visitShl(BinaryOperator &I); + Value *reassociateShiftAmtsOfTwoSameDirectionShifts( + BinaryOperator *Sh0, const SimplifyQuery &SQ, + bool AnalyzeForSignBitExtraction = false); + Instruction *canonicalizeCondSignextOfHighBitExtractToSignextHighBitExtract( + BinaryOperator &I); + Instruction *foldVariableSignZeroExtensionOfVariableHighBitExtract( + BinaryOperator &OldAShr); Instruction *visitAShr(BinaryOperator &I); Instruction *visitLShr(BinaryOperator &I); Instruction *commonShiftTransforms(BinaryOperator &I); @@ -541,6 +601,7 @@ private: Instruction *narrowMathIfNoOverflow(BinaryOperator &I); Instruction *narrowRotate(TruncInst &Trunc); Instruction *optimizeBitCastFromPhi(CastInst &CI, PHINode *PN); + Instruction *matchSAddSubSat(SelectInst &MinMax1); /// Determine if a pair of casts can be replaced by a single cast. /// @@ -557,7 +618,7 @@ private: Value *foldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS, Instruction &CxtI); Value *foldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS, Instruction &CxtI); - Value *foldXorOfICmps(ICmpInst *LHS, ICmpInst *RHS); + Value *foldXorOfICmps(ICmpInst *LHS, ICmpInst *RHS, BinaryOperator &I); /// Optimize (fcmp)&(fcmp) or (fcmp)|(fcmp). /// NOTE: Unlike most of instcombine, this returns a Value which should @@ -725,7 +786,7 @@ public: Value *LHS, Value *RHS, Instruction *CxtI) const; /// Maximum size of array considered when transforming. - uint64_t MaxArraySizeForCombine; + uint64_t MaxArraySizeForCombine = 0; private: /// Performs a few simplifications for operators which are associative @@ -798,7 +859,8 @@ private: int DmaskIdx = -1); Value *SimplifyDemandedVectorElts(Value *V, APInt DemandedElts, - APInt &UndefElts, unsigned Depth = 0); + APInt &UndefElts, unsigned Depth = 0, + bool AllowMultipleUsers = false); /// Canonicalize the position of binops relative to shufflevector. 
Instruction *foldVectorBinop(BinaryOperator &Inst); @@ -847,17 +909,21 @@ private: Constant *RHSC); Instruction *foldICmpAddOpConst(Value *X, const APInt &C, ICmpInst::Predicate Pred); - Instruction *foldICmpWithCastAndCast(ICmpInst &ICI); + Instruction *foldICmpWithCastOp(ICmpInst &ICI); Instruction *foldICmpUsingKnownBits(ICmpInst &Cmp); Instruction *foldICmpWithDominatingICmp(ICmpInst &Cmp); Instruction *foldICmpWithConstant(ICmpInst &Cmp); Instruction *foldICmpInstWithConstant(ICmpInst &Cmp); Instruction *foldICmpInstWithConstantNotInt(ICmpInst &Cmp); - Instruction *foldICmpBinOp(ICmpInst &Cmp); + Instruction *foldICmpBinOp(ICmpInst &Cmp, const SimplifyQuery &SQ); Instruction *foldICmpEquality(ICmpInst &Cmp); + Instruction *foldIRemByPowerOfTwoToBitTest(ICmpInst &I); + Instruction *foldSignBitTest(ICmpInst &I); Instruction *foldICmpWithZero(ICmpInst &Cmp); + Value *foldUnsignedMultiplicationOverflowCheck(ICmpInst &Cmp); + Instruction *foldICmpSelectConstant(ICmpInst &Cmp, SelectInst *Select, ConstantInt *C); Instruction *foldICmpTruncConstant(ICmpInst &Cmp, TruncInst *Trunc, @@ -874,6 +940,8 @@ private: const APInt &C); Instruction *foldICmpShrConstant(ICmpInst &Cmp, BinaryOperator *Shr, const APInt &C); + Instruction *foldICmpSRemConstant(ICmpInst &Cmp, BinaryOperator *UDiv, + const APInt &C); Instruction *foldICmpUDivConstant(ICmpInst &Cmp, BinaryOperator *UDiv, const APInt &C); Instruction *foldICmpDivConstant(ICmpInst &Cmp, BinaryOperator *Div, diff --git a/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp index 054fb7da09a2..3a0e05832fcb 100644 --- a/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp +++ b/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp @@ -175,7 +175,7 @@ static bool isDereferenceableForAllocaSize(const Value *V, const AllocaInst *AI, uint64_t AllocaSize = DL.getTypeStoreSize(AI->getAllocatedType()); if (!AllocaSize) return false; - return isDereferenceableAndAlignedPointer(V, AI->getAlignment(), + return isDereferenceableAndAlignedPointer(V, Align(AI->getAlignment()), APInt(64, AllocaSize), DL); } @@ -197,7 +197,7 @@ static Instruction *simplifyAllocaArraySize(InstCombiner &IC, AllocaInst &AI) { if (C->getValue().getActiveBits() <= 64) { Type *NewTy = ArrayType::get(AI.getAllocatedType(), C->getZExtValue()); AllocaInst *New = IC.Builder.CreateAlloca(NewTy, nullptr, AI.getName()); - New->setAlignment(AI.getAlignment()); + New->setAlignment(MaybeAlign(AI.getAlignment())); // Scan to the end of the allocation instructions, to skip over a block of // allocas if possible...also skip interleaved debug info @@ -345,7 +345,8 @@ Instruction *InstCombiner::visitAllocaInst(AllocaInst &AI) { if (AI.getAllocatedType()->isSized()) { // If the alignment is 0 (unspecified), assign it the preferred alignment. if (AI.getAlignment() == 0) - AI.setAlignment(DL.getPrefTypeAlignment(AI.getAllocatedType())); + AI.setAlignment( + MaybeAlign(DL.getPrefTypeAlignment(AI.getAllocatedType()))); // Move all alloca's of zero byte objects to the entry block and merge them // together. Note that we only do this for alloca's, because malloc should @@ -377,12 +378,12 @@ Instruction *InstCombiner::visitAllocaInst(AllocaInst &AI) { // assign it the preferred alignment. 
if (EntryAI->getAlignment() == 0) EntryAI->setAlignment( - DL.getPrefTypeAlignment(EntryAI->getAllocatedType())); + MaybeAlign(DL.getPrefTypeAlignment(EntryAI->getAllocatedType()))); // Replace this zero-sized alloca with the one at the start of the entry // block after ensuring that the address will be aligned enough for both // types. - unsigned MaxAlign = std::max(EntryAI->getAlignment(), - AI.getAlignment()); + const MaybeAlign MaxAlign( + std::max(EntryAI->getAlignment(), AI.getAlignment())); EntryAI->setAlignment(MaxAlign); if (AI.getType() != EntryAI->getType()) return new BitCastInst(EntryAI, AI.getType()); @@ -455,9 +456,6 @@ static LoadInst *combineLoadToNewType(InstCombiner &IC, LoadInst &LI, Type *NewT Value *Ptr = LI.getPointerOperand(); unsigned AS = LI.getPointerAddressSpace(); - SmallVector, 8> MD; - LI.getAllMetadata(MD); - Value *NewPtr = nullptr; if (!(match(Ptr, m_BitCast(m_Value(NewPtr))) && NewPtr->getType()->getPointerElementType() == NewTy && @@ -467,48 +465,7 @@ static LoadInst *combineLoadToNewType(InstCombiner &IC, LoadInst &LI, Type *NewT LoadInst *NewLoad = IC.Builder.CreateAlignedLoad( NewTy, NewPtr, LI.getAlignment(), LI.isVolatile(), LI.getName() + Suffix); NewLoad->setAtomic(LI.getOrdering(), LI.getSyncScopeID()); - MDBuilder MDB(NewLoad->getContext()); - for (const auto &MDPair : MD) { - unsigned ID = MDPair.first; - MDNode *N = MDPair.second; - // Note, essentially every kind of metadata should be preserved here! This - // routine is supposed to clone a load instruction changing *only its type*. - // The only metadata it makes sense to drop is metadata which is invalidated - // when the pointer type changes. This should essentially never be the case - // in LLVM, but we explicitly switch over only known metadata to be - // conservatively correct. If you are adding metadata to LLVM which pertains - // to loads, you almost certainly want to add it here. - switch (ID) { - case LLVMContext::MD_dbg: - case LLVMContext::MD_tbaa: - case LLVMContext::MD_prof: - case LLVMContext::MD_fpmath: - case LLVMContext::MD_tbaa_struct: - case LLVMContext::MD_invariant_load: - case LLVMContext::MD_alias_scope: - case LLVMContext::MD_noalias: - case LLVMContext::MD_nontemporal: - case LLVMContext::MD_mem_parallel_loop_access: - case LLVMContext::MD_access_group: - // All of these directly apply. - NewLoad->setMetadata(ID, N); - break; - - case LLVMContext::MD_nonnull: - copyNonnullMetadata(LI, N, *NewLoad); - break; - case LLVMContext::MD_align: - case LLVMContext::MD_dereferenceable: - case LLVMContext::MD_dereferenceable_or_null: - // These only directly apply if the new type is also a pointer. - if (NewTy->isPointerTy()) - NewLoad->setMetadata(ID, N); - break; - case LLVMContext::MD_range: - copyRangeMetadata(IC.getDataLayout(), LI, N, *NewLoad); - break; - } - } + copyMetadataForLoad(*NewLoad, LI); return NewLoad; } @@ -1004,9 +961,9 @@ Instruction *InstCombiner::visitLoadInst(LoadInst &LI) { LoadAlign != 0 ? LoadAlign : DL.getABITypeAlignment(LI.getType()); if (KnownAlign > EffectiveLoadAlign) - LI.setAlignment(KnownAlign); + LI.setAlignment(MaybeAlign(KnownAlign)); else if (LoadAlign == 0) - LI.setAlignment(EffectiveLoadAlign); + LI.setAlignment(MaybeAlign(EffectiveLoadAlign)); // Replace GEP indices if possible. if (Instruction *NewGEPI = replaceGEPIdxWithZero(*this, Op, LI)) { @@ -1063,11 +1020,11 @@ Instruction *InstCombiner::visitLoadInst(LoadInst &LI) { // if (SelectInst *SI = dyn_cast(Op)) { // load (select (Cond, &V1, &V2)) --> select(Cond, load &V1, load &V2). 
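// Illustrative sketch, not part of the upstream diff: a C-level analogue of the
// load-of-select fold referenced in the comment above. The IR transform emits
// both loads unconditionally, which is why it is guarded by
// isSafeToLoadUnconditionally on both pointers.
#include <cassert>

int main() {
  int v1 = 10, v2 = 20;
  for (bool cond : {false, true}) {
    int original = *(cond ? &v1 : &v2);   // load (select Cond, &V1, &V2)
    int loaded1 = v1, loaded2 = v2;       // both loads hoisted unconditionally
    int viaSelect = cond ? loaded1 : loaded2;
    assert(original == viaSelect);
  }
  return 0;
}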
- unsigned Align = LI.getAlignment(); - if (isSafeToLoadUnconditionally(SI->getOperand(1), LI.getType(), Align, - DL, SI) && - isSafeToLoadUnconditionally(SI->getOperand(2), LI.getType(), Align, - DL, SI)) { + const MaybeAlign Alignment(LI.getAlignment()); + if (isSafeToLoadUnconditionally(SI->getOperand(1), LI.getType(), + Alignment, DL, SI) && + isSafeToLoadUnconditionally(SI->getOperand(2), LI.getType(), + Alignment, DL, SI)) { LoadInst *V1 = Builder.CreateLoad(LI.getType(), SI->getOperand(1), SI->getOperand(1)->getName() + ".val"); @@ -1075,9 +1032,9 @@ Instruction *InstCombiner::visitLoadInst(LoadInst &LI) { Builder.CreateLoad(LI.getType(), SI->getOperand(2), SI->getOperand(2)->getName() + ".val"); assert(LI.isUnordered() && "implied by above"); - V1->setAlignment(Align); + V1->setAlignment(Alignment); V1->setAtomic(LI.getOrdering(), LI.getSyncScopeID()); - V2->setAlignment(Align); + V2->setAlignment(Alignment); V2->setAtomic(LI.getOrdering(), LI.getSyncScopeID()); return SelectInst::Create(SI->getCondition(), V1, V2); } @@ -1399,15 +1356,15 @@ Instruction *InstCombiner::visitStoreInst(StoreInst &SI) { return eraseInstFromFunction(SI); // Attempt to improve the alignment. - unsigned KnownAlign = getOrEnforceKnownAlignment( - Ptr, DL.getPrefTypeAlignment(Val->getType()), DL, &SI, &AC, &DT); - unsigned StoreAlign = SI.getAlignment(); - unsigned EffectiveStoreAlign = - StoreAlign != 0 ? StoreAlign : DL.getABITypeAlignment(Val->getType()); + const Align KnownAlign = Align(getOrEnforceKnownAlignment( + Ptr, DL.getPrefTypeAlignment(Val->getType()), DL, &SI, &AC, &DT)); + const MaybeAlign StoreAlign = MaybeAlign(SI.getAlignment()); + const Align EffectiveStoreAlign = + StoreAlign ? *StoreAlign : Align(DL.getABITypeAlignment(Val->getType())); if (KnownAlign > EffectiveStoreAlign) SI.setAlignment(KnownAlign); - else if (StoreAlign == 0) + else if (!StoreAlign) SI.setAlignment(EffectiveStoreAlign); // Try to canonicalize the stored type. @@ -1622,8 +1579,8 @@ bool InstCombiner::mergeStoreIntoSuccessor(StoreInst &SI) { // Advance to a place where it is safe to insert the new store and insert it. BBI = DestBB->getFirstInsertionPt(); - StoreInst *NewSI = new StoreInst(MergedVal, SI.getOperand(1), - SI.isVolatile(), SI.getAlignment(), + StoreInst *NewSI = new StoreInst(MergedVal, SI.getOperand(1), SI.isVolatile(), + MaybeAlign(SI.getAlignment()), SI.getOrdering(), SI.getSyncScopeID()); InsertNewInstBefore(NewSI, *BBI); NewSI->setDebugLoc(MergedLoc); diff --git a/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp index cc753ce05313..0b9128a9f5a1 100644 --- a/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp +++ b/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp @@ -124,6 +124,50 @@ static Constant *getLogBase2(Type *Ty, Constant *C) { return ConstantVector::get(Elts); } +// TODO: This is a specific form of a much more general pattern. +// We could detect a select with any binop identity constant, or we +// could use SimplifyBinOp to see if either arm of the select reduces. +// But that needs to be done carefully and/or while removing potential +// reverse canonicalizations as in InstCombiner::foldSelectIntoOp(). 
+static Value *foldMulSelectToNegate(BinaryOperator &I, + InstCombiner::BuilderTy &Builder) { + Value *Cond, *OtherOp; + + // mul (select Cond, 1, -1), OtherOp --> select Cond, OtherOp, -OtherOp + // mul OtherOp, (select Cond, 1, -1) --> select Cond, OtherOp, -OtherOp + if (match(&I, m_c_Mul(m_OneUse(m_Select(m_Value(Cond), m_One(), m_AllOnes())), + m_Value(OtherOp)))) + return Builder.CreateSelect(Cond, OtherOp, Builder.CreateNeg(OtherOp)); + + // mul (select Cond, -1, 1), OtherOp --> select Cond, -OtherOp, OtherOp + // mul OtherOp, (select Cond, -1, 1) --> select Cond, -OtherOp, OtherOp + if (match(&I, m_c_Mul(m_OneUse(m_Select(m_Value(Cond), m_AllOnes(), m_One())), + m_Value(OtherOp)))) + return Builder.CreateSelect(Cond, Builder.CreateNeg(OtherOp), OtherOp); + + // fmul (select Cond, 1.0, -1.0), OtherOp --> select Cond, OtherOp, -OtherOp + // fmul OtherOp, (select Cond, 1.0, -1.0) --> select Cond, OtherOp, -OtherOp + if (match(&I, m_c_FMul(m_OneUse(m_Select(m_Value(Cond), m_SpecificFP(1.0), + m_SpecificFP(-1.0))), + m_Value(OtherOp)))) { + IRBuilder<>::FastMathFlagGuard FMFGuard(Builder); + Builder.setFastMathFlags(I.getFastMathFlags()); + return Builder.CreateSelect(Cond, OtherOp, Builder.CreateFNeg(OtherOp)); + } + + // fmul (select Cond, -1.0, 1.0), OtherOp --> select Cond, -OtherOp, OtherOp + // fmul OtherOp, (select Cond, -1.0, 1.0) --> select Cond, -OtherOp, OtherOp + if (match(&I, m_c_FMul(m_OneUse(m_Select(m_Value(Cond), m_SpecificFP(-1.0), + m_SpecificFP(1.0))), + m_Value(OtherOp)))) { + IRBuilder<>::FastMathFlagGuard FMFGuard(Builder); + Builder.setFastMathFlags(I.getFastMathFlags()); + return Builder.CreateSelect(Cond, Builder.CreateFNeg(OtherOp), OtherOp); + } + + return nullptr; +} + Instruction *InstCombiner::visitMul(BinaryOperator &I) { if (Value *V = SimplifyMulInst(I.getOperand(0), I.getOperand(1), SQ.getWithInstruction(&I))) @@ -213,6 +257,9 @@ Instruction *InstCombiner::visitMul(BinaryOperator &I) { if (Instruction *FoldedMul = foldBinOpIntoSelectOrPhi(I)) return FoldedMul; + if (Value *FoldedMul = foldMulSelectToNegate(I, Builder)) + return replaceInstUsesWith(I, FoldedMul); + // Simplify mul instructions with a constant RHS. if (isa(Op1)) { // Canonicalize (X+C1)*CI -> X*CI+C1*CI. @@ -358,6 +405,9 @@ Instruction *InstCombiner::visitFMul(BinaryOperator &I) { if (Instruction *FoldedMul = foldBinOpIntoSelectOrPhi(I)) return FoldedMul; + if (Value *FoldedMul = foldMulSelectToNegate(I, Builder)) + return replaceInstUsesWith(I, FoldedMul); + // X * -1.0 --> -X Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); if (match(Op1, m_SpecificFP(-1.0))) @@ -373,16 +423,6 @@ Instruction *InstCombiner::visitFMul(BinaryOperator &I) { if (match(Op0, m_FNeg(m_Value(X))) && match(Op1, m_Constant(C))) return BinaryOperator::CreateFMulFMF(X, ConstantExpr::getFNeg(C), &I); - // Sink negation: -X * Y --> -(X * Y) - // But don't transform constant expressions because there's an inverse fold. - if (match(Op0, m_OneUse(m_FNeg(m_Value(X)))) && !isa(Op0)) - return BinaryOperator::CreateFNegFMF(Builder.CreateFMulFMF(X, Op1, &I), &I); - - // Sink negation: Y * -X --> -(X * Y) - // But don't transform constant expressions because there's an inverse fold. 
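// Illustrative sketch, not part of the upstream diff: the integer cases of
// foldMulSelectToNegate above, i.e. multiplying by a select of 1/-1 becomes a
// select between the other operand and its negation (the fmul 1.0/-1.0 cases
// are the floating-point analogue).
#include <cassert>

int main() {
  for (int other = -5; other <= 5; ++other)
    for (bool cond : {false, true}) {
      int sel = cond ? 1 : -1;
      assert(sel * other == (cond ? other : -other));   // mul (select C, 1, -1), X -> select C, X, -X
      int sel2 = cond ? -1 : 1;
      assert(sel2 * other == (cond ? -other : other));  // mul (select C, -1, 1), X -> select C, -X, X
    }
  return 0;
}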
- if (match(Op1, m_OneUse(m_FNeg(m_Value(X)))) && !isa(Op1)) - return BinaryOperator::CreateFNegFMF(Builder.CreateFMulFMF(X, Op0, &I), &I); - // fabs(X) * fabs(X) -> X * X if (Op0 == Op1 && match(Op0, m_Intrinsic(m_Value(X)))) return BinaryOperator::CreateFMulFMF(X, X, &I); @@ -1211,8 +1251,8 @@ Instruction *InstCombiner::visitFDiv(BinaryOperator &I) { !IsTan && match(Op0, m_Intrinsic(m_Value(X))) && match(Op1, m_Intrinsic(m_Specific(X))); - if ((IsTan || IsCot) && hasUnaryFloatFn(&TLI, I.getType(), LibFunc_tan, - LibFunc_tanf, LibFunc_tanl)) { + if ((IsTan || IsCot) && + hasFloatFn(&TLI, I.getType(), LibFunc_tan, LibFunc_tanf, LibFunc_tanl)) { IRBuilder<> B(&I); IRBuilder<>::FastMathFlagGuard FMFGuard(B); B.setFastMathFlags(I.getFastMathFlags()); @@ -1244,6 +1284,17 @@ Instruction *InstCombiner::visitFDiv(BinaryOperator &I) { return &I; } + // X / fabs(X) -> copysign(1.0, X) + // fabs(X) / X -> copysign(1.0, X) + if (I.hasNoNaNs() && I.hasNoInfs() && + (match(&I, + m_FDiv(m_Value(X), m_Intrinsic(m_Deferred(X)))) || + match(&I, m_FDiv(m_Intrinsic(m_Value(X)), + m_Deferred(X))))) { + Value *V = Builder.CreateBinaryIntrinsic( + Intrinsic::copysign, ConstantFP::get(I.getType(), 1.0), X, &I); + return replaceInstUsesWith(I, V); + } return nullptr; } @@ -1309,6 +1360,8 @@ Instruction *InstCombiner::visitURem(BinaryOperator &I) { Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); Type *Ty = I.getType(); if (isKnownToBeAPowerOfTwo(Op1, /*OrZero*/ true, 0, &I)) { + // This may increase instruction count, we don't enforce that Y is a + // constant. Constant *N1 = Constant::getAllOnesValue(Ty); Value *Add = Builder.CreateAdd(Op1, N1); return BinaryOperator::CreateAnd(Op0, Add); diff --git a/lib/Transforms/InstCombine/InstCombinePHI.cpp b/lib/Transforms/InstCombine/InstCombinePHI.cpp index 5820ab726637..e0376b7582f3 100644 --- a/lib/Transforms/InstCombine/InstCombinePHI.cpp +++ b/lib/Transforms/InstCombine/InstCombinePHI.cpp @@ -542,7 +542,7 @@ Instruction *InstCombiner::FoldPHIArgLoadIntoPHI(PHINode &PN) { // visitLoadInst will propagate an alignment onto the load when TD is around, // and if TD isn't around, we can't handle the mixed case. bool isVolatile = FirstLI->isVolatile(); - unsigned LoadAlignment = FirstLI->getAlignment(); + MaybeAlign LoadAlignment(FirstLI->getAlignment()); unsigned LoadAddrSpace = FirstLI->getPointerAddressSpace(); // We can't sink the load if the loaded value could be modified between the @@ -574,10 +574,10 @@ Instruction *InstCombiner::FoldPHIArgLoadIntoPHI(PHINode &PN) { // If some of the loads have an alignment specified but not all of them, // we can't do the transformation. - if ((LoadAlignment != 0) != (LI->getAlignment() != 0)) + if ((LoadAlignment.hasValue()) != (LI->getAlignment() != 0)) return nullptr; - LoadAlignment = std::min(LoadAlignment, LI->getAlignment()); + LoadAlignment = std::min(LoadAlignment, MaybeAlign(LI->getAlignment())); // If the PHI is of volatile loads and the load block has multiple // successors, sinking it would remove a load of the volatile value from diff --git a/lib/Transforms/InstCombine/InstCombineSelect.cpp b/lib/Transforms/InstCombine/InstCombineSelect.cpp index aefaf5af1750..9fc871e49b30 100644 --- a/lib/Transforms/InstCombine/InstCombineSelect.cpp +++ b/lib/Transforms/InstCombine/InstCombineSelect.cpp @@ -785,6 +785,41 @@ static Value *canonicalizeSaturatedAdd(ICmpInst *Cmp, Value *TVal, Value *FVal, return nullptr; } +/// Fold the following code sequence: +/// \code +/// int a = ctlz(x & -x); +// x ? 
31 - a : a; +/// \code +/// +/// into: +/// cttz(x) +static Instruction *foldSelectCtlzToCttz(ICmpInst *ICI, Value *TrueVal, + Value *FalseVal, + InstCombiner::BuilderTy &Builder) { + unsigned BitWidth = TrueVal->getType()->getScalarSizeInBits(); + if (!ICI->isEquality() || !match(ICI->getOperand(1), m_Zero())) + return nullptr; + + if (ICI->getPredicate() == ICmpInst::ICMP_NE) + std::swap(TrueVal, FalseVal); + + if (!match(FalseVal, + m_Xor(m_Deferred(TrueVal), m_SpecificInt(BitWidth - 1)))) + return nullptr; + + if (!match(TrueVal, m_Intrinsic())) + return nullptr; + + Value *X = ICI->getOperand(0); + auto *II = cast(TrueVal); + if (!match(II->getOperand(0), m_c_And(m_Specific(X), m_Neg(m_Specific(X))))) + return nullptr; + + Function *F = Intrinsic::getDeclaration(II->getModule(), Intrinsic::cttz, + II->getType()); + return CallInst::Create(F, {X, II->getArgOperand(1)}); +} + /// Attempt to fold a cttz/ctlz followed by a icmp plus select into a single /// call to cttz/ctlz with flag 'is_zero_undef' cleared. /// @@ -973,8 +1008,7 @@ canonicalizeMinMaxWithConstant(SelectInst &Sel, ICmpInst &Cmp, // If we are swapping the select operands, swap the metadata too. assert(Sel.getTrueValue() == RHS && Sel.getFalseValue() == LHS && "Unexpected results from matchSelectPattern"); - Sel.setTrueValue(LHS); - Sel.setFalseValue(RHS); + Sel.swapValues(); Sel.swapProfMetadata(); return &Sel; } @@ -1056,17 +1090,293 @@ static Instruction *canonicalizeAbsNabs(SelectInst &Sel, ICmpInst &Cmp, } // We are swapping the select operands, so swap the metadata too. - Sel.setTrueValue(FVal); - Sel.setFalseValue(TVal); + Sel.swapValues(); Sel.swapProfMetadata(); return &Sel; } +static Value *simplifyWithOpReplaced(Value *V, Value *Op, Value *ReplaceOp, + const SimplifyQuery &Q) { + // If this is a binary operator, try to simplify it with the replaced op + // because we know Op and ReplaceOp are equivalant. + // For example: V = X + 1, Op = X, ReplaceOp = 42 + // Simplifies as: add(42, 1) --> 43 + if (auto *BO = dyn_cast(V)) { + if (BO->getOperand(0) == Op) + return SimplifyBinOp(BO->getOpcode(), ReplaceOp, BO->getOperand(1), Q); + if (BO->getOperand(1) == Op) + return SimplifyBinOp(BO->getOpcode(), BO->getOperand(0), ReplaceOp, Q); + } + + return nullptr; +} + +/// If we have a select with an equality comparison, then we know the value in +/// one of the arms of the select. See if substituting this value into an arm +/// and simplifying the result yields the same value as the other arm. +/// +/// To make this transform safe, we must drop poison-generating flags +/// (nsw, etc) if we simplified to a binop because the select may be guarding +/// that poison from propagating. If the existing binop already had no +/// poison-generating flags, then this transform can be done by instsimplify. +/// +/// Consider: +/// %cmp = icmp eq i32 %x, 2147483647 +/// %add = add nsw i32 %x, 1 +/// %sel = select i1 %cmp, i32 -2147483648, i32 %add +/// +/// We can't replace %sel with %add unless we strip away the flags. +/// TODO: Wrapping flags could be preserved in some cases with better analysis. +static Value *foldSelectValueEquivalence(SelectInst &Sel, ICmpInst &Cmp, + const SimplifyQuery &Q) { + if (!Cmp.isEquality()) + return nullptr; + + // Canonicalize the pattern to ICMP_EQ by swapping the select operands. + Value *TrueVal = Sel.getTrueValue(), *FalseVal = Sel.getFalseValue(); + if (Cmp.getPredicate() == ICmpInst::ICMP_NE) + std::swap(TrueVal, FalseVal); + + // Try each equivalence substitution possibility. 
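A concrete C++ rendering of the INT_MAX example in the comment above shows why the fold has to drop the add's nsw flag: once the select is gone, the add must be allowed to wrap. Standalone sketch, illustrative only; the wrap is modelled with unsigned arithmetic:

#include <cassert>
#include <cstdint>

// %cmp = icmp eq i32 %x, 2147483647
// %add = add nsw i32 %x, 1
// %sel = select i1 %cmp, i32 -2147483648, i32 %add
static int32_t selectForm(int32_t x) {
  return x == INT32_MAX ? INT32_MIN : x + 1; // the add never overflows on this path
}

// After the fold only the add remains, so it must wrap (nsw dropped).
// Unsigned arithmetic models the two's complement wrap (the conversion back
// to int32_t is modular; well-defined since C++20).
static int32_t foldedForm(int32_t x) {
  return static_cast<int32_t>(static_cast<uint32_t>(x) + 1u);
}

int main() {
  for (int32_t x : {0, 41, -7, INT32_MAX, INT32_MIN})
    assert(selectForm(x) == foldedForm(x));
  return 0;
}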
+ // We have an 'EQ' comparison, so the select's false value will propagate. + // Example: + // (X == 42) ? 43 : (X + 1) --> (X == 42) ? (X + 1) : (X + 1) --> X + 1 + // (X == 42) ? (X + 1) : 43 --> (X == 42) ? (42 + 1) : 43 --> 43 + Value *CmpLHS = Cmp.getOperand(0), *CmpRHS = Cmp.getOperand(1); + if (simplifyWithOpReplaced(FalseVal, CmpLHS, CmpRHS, Q) == TrueVal || + simplifyWithOpReplaced(FalseVal, CmpRHS, CmpLHS, Q) == TrueVal || + simplifyWithOpReplaced(TrueVal, CmpLHS, CmpRHS, Q) == FalseVal || + simplifyWithOpReplaced(TrueVal, CmpRHS, CmpLHS, Q) == FalseVal) { + if (auto *FalseInst = dyn_cast(FalseVal)) + FalseInst->dropPoisonGeneratingFlags(); + return FalseVal; + } + return nullptr; +} + +// See if this is a pattern like: +// %old_cmp1 = icmp slt i32 %x, C2 +// %old_replacement = select i1 %old_cmp1, i32 %target_low, i32 %target_high +// %old_x_offseted = add i32 %x, C1 +// %old_cmp0 = icmp ult i32 %old_x_offseted, C0 +// %r = select i1 %old_cmp0, i32 %x, i32 %old_replacement +// This can be rewritten as more canonical pattern: +// %new_cmp1 = icmp slt i32 %x, -C1 +// %new_cmp2 = icmp sge i32 %x, C0-C1 +// %new_clamped_low = select i1 %new_cmp1, i32 %target_low, i32 %x +// %r = select i1 %new_cmp2, i32 %target_high, i32 %new_clamped_low +// Iff -C1 s<= C2 s<= C0-C1 +// Also ULT predicate can also be UGT iff C0 != -1 (+invert result) +// SLT predicate can also be SGT iff C2 != INT_MAX (+invert res.) +static Instruction *canonicalizeClampLike(SelectInst &Sel0, ICmpInst &Cmp0, + InstCombiner::BuilderTy &Builder) { + Value *X = Sel0.getTrueValue(); + Value *Sel1 = Sel0.getFalseValue(); + + // First match the condition of the outermost select. + // Said condition must be one-use. + if (!Cmp0.hasOneUse()) + return nullptr; + Value *Cmp00 = Cmp0.getOperand(0); + Constant *C0; + if (!match(Cmp0.getOperand(1), + m_CombineAnd(m_AnyIntegralConstant(), m_Constant(C0)))) + return nullptr; + // Canonicalize Cmp0 into the form we expect. + // FIXME: we shouldn't care about lanes that are 'undef' in the end? + switch (Cmp0.getPredicate()) { + case ICmpInst::Predicate::ICMP_ULT: + break; // Great! + case ICmpInst::Predicate::ICMP_ULE: + // We'd have to increment C0 by one, and for that it must not have all-ones + // element, but then it would have been canonicalized to 'ult' before + // we get here. So we can't do anything useful with 'ule'. + return nullptr; + case ICmpInst::Predicate::ICMP_UGT: + // We want to canonicalize it to 'ult', so we'll need to increment C0, + // which again means it must not have any all-ones elements. + if (!match(C0, + m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_NE, + APInt::getAllOnesValue( + C0->getType()->getScalarSizeInBits())))) + return nullptr; // Can't do, have all-ones element[s]. + C0 = AddOne(C0); + std::swap(X, Sel1); + break; + case ICmpInst::Predicate::ICMP_UGE: + // The only way we'd get this predicate if this `icmp` has extra uses, + // but then we won't be able to do this fold. + return nullptr; + default: + return nullptr; // Unknown predicate. + } + + // Now that we've canonicalized the ICmp, we know the X we expect; + // the select in other hand should be one-use. + if (!Sel1->hasOneUse()) + return nullptr; + + // We now can finish matching the condition of the outermost select: + // it should either be the X itself, or an addition of some constant to X. 
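An exhaustive 8-bit check of the clamp-like rewrite described above, for one constant choice that satisfies the precondition -C1 s<= C2 s<= C0-C1 (here C0=32, C1=16, C2=0, clamping to [-16, 15]); standalone sketch, illustrative only:

#include <cassert>
#include <cstdint>

int main() {
  const int32_t C0 = 32, C1 = 16, C2 = 0;
  const int8_t TargetLow = -16, TargetHigh = 15;
  for (int x = -128; x <= 127; ++x) {
    // Original pattern:
    //   %r = (x + C1) u< C0 ? x : (x s< C2 ? TargetLow : TargetHigh)
    uint8_t xOff = static_cast<uint8_t>(x + C1);        // add (wraps)
    int8_t oldRepl = x < C2 ? TargetLow : TargetHigh;   // inner select
    int8_t rOld = xOff < static_cast<uint8_t>(C0) ? int8_t(x) : oldRepl;

    // Canonical form:
    //   %low = x s< -C1 ? TargetLow : x
    //   %r   = x s>= C0-C1 ? TargetHigh : %low
    int8_t clampedLow = x < -C1 ? TargetLow : int8_t(x);
    int8_t rNew = x >= C0 - C1 ? TargetHigh : clampedLow;

    assert(rOld == rNew);
  }
  return 0;
}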
+ Constant *C1; + if (Cmp00 == X) + C1 = ConstantInt::getNullValue(Sel0.getType()); + else if (!match(Cmp00, + m_Add(m_Specific(X), + m_CombineAnd(m_AnyIntegralConstant(), m_Constant(C1))))) + return nullptr; + + Value *Cmp1; + ICmpInst::Predicate Pred1; + Constant *C2; + Value *ReplacementLow, *ReplacementHigh; + if (!match(Sel1, m_Select(m_Value(Cmp1), m_Value(ReplacementLow), + m_Value(ReplacementHigh))) || + !match(Cmp1, + m_ICmp(Pred1, m_Specific(X), + m_CombineAnd(m_AnyIntegralConstant(), m_Constant(C2))))) + return nullptr; + + if (!Cmp1->hasOneUse() && (Cmp00 == X || !Cmp00->hasOneUse())) + return nullptr; // Not enough one-use instructions for the fold. + // FIXME: this restriction could be relaxed if Cmp1 can be reused as one of + // two comparisons we'll need to build. + + // Canonicalize Cmp1 into the form we expect. + // FIXME: we shouldn't care about lanes that are 'undef' in the end? + switch (Pred1) { + case ICmpInst::Predicate::ICMP_SLT: + break; + case ICmpInst::Predicate::ICMP_SLE: + // We'd have to increment C2 by one, and for that it must not have signed + // max element, but then it would have been canonicalized to 'slt' before + // we get here. So we can't do anything useful with 'sle'. + return nullptr; + case ICmpInst::Predicate::ICMP_SGT: + // We want to canonicalize it to 'slt', so we'll need to increment C2, + // which again means it must not have any signed max elements. + if (!match(C2, + m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_NE, + APInt::getSignedMaxValue( + C2->getType()->getScalarSizeInBits())))) + return nullptr; // Can't do, have signed max element[s]. + C2 = AddOne(C2); + LLVM_FALLTHROUGH; + case ICmpInst::Predicate::ICMP_SGE: + // Also non-canonical, but here we don't need to change C2, + // so we don't have any restrictions on C2, so we can just handle it. + std::swap(ReplacementLow, ReplacementHigh); + break; + default: + return nullptr; // Unknown predicate. + } + + // The thresholds of this clamp-like pattern. + auto *ThresholdLowIncl = ConstantExpr::getNeg(C1); + auto *ThresholdHighExcl = ConstantExpr::getSub(C0, C1); + + // The fold has a precondition 1: C2 s>= ThresholdLow + auto *Precond1 = ConstantExpr::getICmp(ICmpInst::Predicate::ICMP_SGE, C2, + ThresholdLowIncl); + if (!match(Precond1, m_One())) + return nullptr; + // The fold has a precondition 2: C2 s<= ThresholdHigh + auto *Precond2 = ConstantExpr::getICmp(ICmpInst::Predicate::ICMP_SLE, C2, + ThresholdHighExcl); + if (!match(Precond2, m_One())) + return nullptr; + + // All good, finally emit the new pattern. + Value *ShouldReplaceLow = Builder.CreateICmpSLT(X, ThresholdLowIncl); + Value *ShouldReplaceHigh = Builder.CreateICmpSGE(X, ThresholdHighExcl); + Value *MaybeReplacedLow = + Builder.CreateSelect(ShouldReplaceLow, ReplacementLow, X); + Instruction *MaybeReplacedHigh = + SelectInst::Create(ShouldReplaceHigh, ReplacementHigh, MaybeReplacedLow); + + return MaybeReplacedHigh; +} + +// If we have +// %cmp = icmp [canonical predicate] i32 %x, C0 +// %r = select i1 %cmp, i32 %y, i32 C1 +// Where C0 != C1 and %x may be different from %y, see if the constant that we +// will have if we flip the strictness of the predicate (i.e. without changing +// the result) is identical to the C1 in select. If it matches we can change +// original comparison to one with swapped predicate, reuse the constant, +// and swap the hands of select. 
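A worked instance of the comment above (standalone C++ sketch, illustrative constants): with %cmp = icmp ult i32 %x, 44 and %r = select i1 %cmp, i32 %y, i32 43, flipping the strictness of the predicate yields ule 43, whose constant already appears in the select, so the compare can be rewritten with the swapped predicate ugt 43 and the select hands swapped:

#include <cassert>
#include <cstdint>

// Before: %r = (x u< 44) ? y : 43
static uint32_t beforeFold(uint32_t x, uint32_t y) { return x < 44u ? y : 43u; }
// After:  %r = (x u> 43) ? 43 : y   (swapped predicate, reused constant)
static uint32_t afterFold(uint32_t x, uint32_t y) { return x > 43u ? 43u : y; }

int main() {
  for (uint32_t x : {0u, 42u, 43u, 44u, 45u, 0xFFFFFFFFu})
    for (uint32_t y : {0u, 43u, 1000u})
      assert(beforeFold(x, y) == afterFold(x, y));
  return 0;
}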
+static Instruction * +tryToReuseConstantFromSelectInComparison(SelectInst &Sel, ICmpInst &Cmp, + InstCombiner::BuilderTy &Builder) { + ICmpInst::Predicate Pred; + Value *X; + Constant *C0; + if (!match(&Cmp, m_OneUse(m_ICmp( + Pred, m_Value(X), + m_CombineAnd(m_AnyIntegralConstant(), m_Constant(C0)))))) + return nullptr; + + // If comparison predicate is non-relational, we won't be able to do anything. + if (ICmpInst::isEquality(Pred)) + return nullptr; + + // If comparison predicate is non-canonical, then we certainly won't be able + // to make it canonical; canonicalizeCmpWithConstant() already tried. + if (!isCanonicalPredicate(Pred)) + return nullptr; + + // If the [input] type of comparison and select type are different, lets abort + // for now. We could try to compare constants with trunc/[zs]ext though. + if (C0->getType() != Sel.getType()) + return nullptr; + + // FIXME: are there any magic icmp predicate+constant pairs we must not touch? + + Value *SelVal0, *SelVal1; // We do not care which one is from where. + match(&Sel, m_Select(m_Value(), m_Value(SelVal0), m_Value(SelVal1))); + // At least one of these values we are selecting between must be a constant + // else we'll never succeed. + if (!match(SelVal0, m_AnyIntegralConstant()) && + !match(SelVal1, m_AnyIntegralConstant())) + return nullptr; + + // Does this constant C match any of the `select` values? + auto MatchesSelectValue = [SelVal0, SelVal1](Constant *C) { + return C->isElementWiseEqual(SelVal0) || C->isElementWiseEqual(SelVal1); + }; + + // If C0 *already* matches true/false value of select, we are done. + if (MatchesSelectValue(C0)) + return nullptr; + + // Check the constant we'd have with flipped-strictness predicate. + auto FlippedStrictness = getFlippedStrictnessPredicateAndConstant(Pred, C0); + if (!FlippedStrictness) + return nullptr; + + // If said constant doesn't match either, then there is no hope, + if (!MatchesSelectValue(FlippedStrictness->second)) + return nullptr; + + // It matched! Lets insert the new comparison just before select. + InstCombiner::BuilderTy::InsertPointGuard Guard(Builder); + Builder.SetInsertPoint(&Sel); + + Pred = ICmpInst::getSwappedPredicate(Pred); // Yes, swapped. + Value *NewCmp = Builder.CreateICmp(Pred, X, FlippedStrictness->second, + Cmp.getName() + ".inv"); + Sel.setCondition(NewCmp); + Sel.swapValues(); + Sel.swapProfMetadata(); + + return &Sel; +} + /// Visit a SelectInst that has an ICmpInst as its first operand. 
Instruction *InstCombiner::foldSelectInstWithICmp(SelectInst &SI, ICmpInst *ICI) { - Value *TrueVal = SI.getTrueValue(); - Value *FalseVal = SI.getFalseValue(); + if (Value *V = foldSelectValueEquivalence(SI, *ICI, SQ)) + return replaceInstUsesWith(SI, V); if (Instruction *NewSel = canonicalizeMinMaxWithConstant(SI, *ICI, Builder)) return NewSel; @@ -1074,12 +1384,21 @@ Instruction *InstCombiner::foldSelectInstWithICmp(SelectInst &SI, if (Instruction *NewAbs = canonicalizeAbsNabs(SI, *ICI, Builder)) return NewAbs; + if (Instruction *NewAbs = canonicalizeClampLike(SI, *ICI, Builder)) + return NewAbs; + + if (Instruction *NewSel = + tryToReuseConstantFromSelectInComparison(SI, *ICI, Builder)) + return NewSel; + bool Changed = adjustMinMax(SI, *ICI); if (Value *V = foldSelectICmpAnd(SI, ICI, Builder)) return replaceInstUsesWith(SI, V); // NOTE: if we wanted to, this is where to detect integer MIN/MAX + Value *TrueVal = SI.getTrueValue(); + Value *FalseVal = SI.getFalseValue(); ICmpInst::Predicate Pred = ICI->getPredicate(); Value *CmpLHS = ICI->getOperand(0); Value *CmpRHS = ICI->getOperand(1); @@ -1149,6 +1468,9 @@ Instruction *InstCombiner::foldSelectInstWithICmp(SelectInst &SI, foldSelectICmpAndAnd(SI.getType(), ICI, TrueVal, FalseVal, Builder)) return V; + if (Instruction *V = foldSelectCtlzToCttz(ICI, TrueVal, FalseVal, Builder)) + return V; + if (Value *V = foldSelectICmpAndOr(ICI, TrueVal, FalseVal, Builder)) return replaceInstUsesWith(SI, V); @@ -1253,6 +1575,16 @@ Instruction *InstCombiner::foldSPFofSPF(Instruction *Inner, } } + // max(max(A, B), min(A, B)) --> max(A, B) + // min(min(A, B), max(A, B)) --> min(A, B) + // TODO: This could be done in instsimplify. + if (SPF1 == SPF2 && + ((SPF1 == SPF_UMIN && match(C, m_c_UMax(m_Specific(A), m_Specific(B)))) || + (SPF1 == SPF_SMIN && match(C, m_c_SMax(m_Specific(A), m_Specific(B)))) || + (SPF1 == SPF_UMAX && match(C, m_c_UMin(m_Specific(A), m_Specific(B)))) || + (SPF1 == SPF_SMAX && match(C, m_c_SMin(m_Specific(A), m_Specific(B)))))) + return replaceInstUsesWith(Outer, Inner); + // ABS(ABS(X)) -> ABS(X) // NABS(NABS(X)) -> NABS(X) // TODO: This could be done in instsimplify. @@ -1280,7 +1612,7 @@ Instruction *InstCombiner::foldSPFofSPF(Instruction *Inner, return true; } - if (IsFreeToInvert(V, !V->hasNUsesOrMore(3))) { + if (isFreeToInvert(V, !V->hasNUsesOrMore(3))) { NotV = nullptr; return true; } @@ -1492,6 +1824,30 @@ static Instruction *canonicalizeSelectToShuffle(SelectInst &SI) { ConstantVector::get(Mask)); } +/// If we have a select of vectors with a scalar condition, try to convert that +/// to a vector select by splatting the condition. A splat may get folded with +/// other operations in IR and having all operands of a select be vector types +/// is likely better for vector codegen. +static Instruction *canonicalizeScalarSelectOfVecs( + SelectInst &Sel, InstCombiner::BuilderTy &Builder) { + Type *Ty = Sel.getType(); + if (!Ty->isVectorTy()) + return nullptr; + + // We can replace a single-use extract with constant index. + Value *Cond = Sel.getCondition(); + if (!match(Cond, m_OneUse(m_ExtractElement(m_Value(), m_ConstantInt())))) + return nullptr; + + // select (extelt V, Index), T, F --> select (splat V, Index), T, F + // Splatting the extracted condition reduces code (we could directly create a + // splat shuffle of the source vector to eliminate the intermediate step). 
+ unsigned NumElts = Ty->getVectorNumElements(); + Value *SplatCond = Builder.CreateVectorSplat(NumElts, Cond); + Sel.setCondition(SplatCond); + return &Sel; +} + /// Reuse bitcasted operands between a compare and select: /// select (cmp (bitcast C), (bitcast D)), (bitcast' C), (bitcast' D) --> /// bitcast (select (cmp (bitcast C), (bitcast D)), (bitcast C), (bitcast D)) @@ -1648,6 +2004,71 @@ static Instruction *moveAddAfterMinMax(SelectPatternFlavor SPF, Value *X, return nullptr; } +/// Match a sadd_sat or ssub_sat which is using min/max to clamp the value. +Instruction *InstCombiner::matchSAddSubSat(SelectInst &MinMax1) { + Type *Ty = MinMax1.getType(); + + // We are looking for a tree of: + // max(INT_MIN, min(INT_MAX, add(sext(A), sext(B)))) + // Where the min and max could be reversed + Instruction *MinMax2; + BinaryOperator *AddSub; + const APInt *MinValue, *MaxValue; + if (match(&MinMax1, m_SMin(m_Instruction(MinMax2), m_APInt(MaxValue)))) { + if (!match(MinMax2, m_SMax(m_BinOp(AddSub), m_APInt(MinValue)))) + return nullptr; + } else if (match(&MinMax1, + m_SMax(m_Instruction(MinMax2), m_APInt(MinValue)))) { + if (!match(MinMax2, m_SMin(m_BinOp(AddSub), m_APInt(MaxValue)))) + return nullptr; + } else + return nullptr; + + // Check that the constants clamp a saturate, and that the new type would be + // sensible to convert to. + if (!(*MaxValue + 1).isPowerOf2() || -*MinValue != *MaxValue + 1) + return nullptr; + // In what bitwidth can this be treated as saturating arithmetics? + unsigned NewBitWidth = (*MaxValue + 1).logBase2() + 1; + // FIXME: This isn't quite right for vectors, but using the scalar type is a + // good first approximation for what should be done there. + if (!shouldChangeType(Ty->getScalarType()->getIntegerBitWidth(), NewBitWidth)) + return nullptr; + + // Also make sure that the number of uses is as expected. The "3"s are for the + // the two items of min/max (the compare and the select). + if (MinMax2->hasNUsesOrMore(3) || AddSub->hasNUsesOrMore(3)) + return nullptr; + + // Create the new type (which can be a vector type) + Type *NewTy = Ty->getWithNewBitWidth(NewBitWidth); + // Match the two extends from the add/sub + Value *A, *B; + if(!match(AddSub, m_BinOp(m_SExt(m_Value(A)), m_SExt(m_Value(B))))) + return nullptr; + // And check the incoming values are of a type smaller than or equal to the + // size of the saturation. Otherwise the higher bits can cause different + // results. + if (A->getType()->getScalarSizeInBits() > NewBitWidth || + B->getType()->getScalarSizeInBits() > NewBitWidth) + return nullptr; + + Intrinsic::ID IntrinsicID; + if (AddSub->getOpcode() == Instruction::Add) + IntrinsicID = Intrinsic::sadd_sat; + else if (AddSub->getOpcode() == Instruction::Sub) + IntrinsicID = Intrinsic::ssub_sat; + else + return nullptr; + + // Finally create and return the sat intrinsic, truncated to the new type + Function *F = Intrinsic::getDeclaration(MinMax1.getModule(), IntrinsicID, NewTy); + Value *AT = Builder.CreateSExt(A, NewTy); + Value *BT = Builder.CreateSExt(B, NewTy); + Value *Sat = Builder.CreateCall(F, {AT, BT}); + return CastInst::Create(Instruction::SExt, Sat, Ty); +} + /// Reduce a sequence of min/max with a common operand. 
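The matchSAddSubSat helper above recognizes a sign-extend, add, and clamp back to the narrow signed range, and turns it into a saturating add (or subtract) on the narrow type. An exhaustive i8 check of the value identity the add case relies on (standalone sketch, illustrative only):

#include <algorithm>
#include <cassert>
#include <cstdint>

// The matched pattern: widen, add, clamp to the narrow range, truncate.
static int8_t clampedAdd(int8_t a, int8_t b) {
  int wide = int(a) + int(b);   // sext + add
  wide = std::min(wide, 127);   // smin with INT8_MAX
  wide = std::max(wide, -128);  // smax with INT8_MIN
  return int8_t(wide);          // trunc
}

// What the combine emits: sadd.sat semantics on i8.
static int8_t saddSat8(int8_t a, int8_t b) {
  int s = int(a) + int(b);
  if (s > INT8_MAX) return INT8_MAX;
  if (s < INT8_MIN) return INT8_MIN;
  return int8_t(s);
}

int main() {
  for (int a = -128; a <= 127; ++a)
    for (int b = -128; b <= 127; ++b)
      assert(clampedAdd(int8_t(a), int8_t(b)) == saddSat8(int8_t(a), int8_t(b)));
  return 0;
}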
static Instruction *factorizeMinMaxTree(SelectPatternFlavor SPF, Value *LHS, Value *RHS, @@ -1788,6 +2209,9 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) { if (Instruction *I = canonicalizeSelectToShuffle(SI)) return I; + if (Instruction *I = canonicalizeScalarSelectOfVecs(SI, Builder)) + return I; + // Canonicalize a one-use integer compare with a non-canonical predicate by // inverting the predicate and swapping the select operands. This matches a // compare canonicalization for conditional branches. @@ -2013,16 +2437,17 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) { (LHS->getType()->isFPOrFPVectorTy() && ((CmpLHS != LHS && CmpLHS != RHS) || (CmpRHS != LHS && CmpRHS != RHS)))) { - CmpInst::Predicate Pred = getMinMaxPred(SPF, SPR.Ordered); + CmpInst::Predicate MinMaxPred = getMinMaxPred(SPF, SPR.Ordered); Value *Cmp; - if (CmpInst::isIntPredicate(Pred)) { - Cmp = Builder.CreateICmp(Pred, LHS, RHS); + if (CmpInst::isIntPredicate(MinMaxPred)) { + Cmp = Builder.CreateICmp(MinMaxPred, LHS, RHS); } else { IRBuilder<>::FastMathFlagGuard FMFG(Builder); - auto FMF = cast(SI.getCondition())->getFastMathFlags(); + auto FMF = + cast(SI.getCondition())->getFastMathFlags(); Builder.setFastMathFlags(FMF); - Cmp = Builder.CreateFCmp(Pred, LHS, RHS); + Cmp = Builder.CreateFCmp(MinMaxPred, LHS, RHS); } Value *NewSI = Builder.CreateSelect(Cmp, LHS, RHS, SI.getName(), &SI); @@ -2040,9 +2465,9 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) { auto moveNotAfterMinMax = [&](Value *X, Value *Y) -> Instruction * { Value *A; if (match(X, m_Not(m_Value(A))) && !X->hasNUsesOrMore(3) && - !IsFreeToInvert(A, A->hasOneUse()) && + !isFreeToInvert(A, A->hasOneUse()) && // Passing false to only consider m_Not and constants. - IsFreeToInvert(Y, false)) { + isFreeToInvert(Y, false)) { Value *B = Builder.CreateNot(Y); Value *NewMinMax = createMinMax(Builder, getInverseMinMaxFlavor(SPF), A, B); @@ -2070,6 +2495,8 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) { if (Instruction *I = factorizeMinMaxTree(SPF, LHS, RHS, Builder)) return I; + if (Instruction *I = matchSAddSubSat(SI)) + return I; } } diff --git a/lib/Transforms/InstCombine/InstCombineShifts.cpp b/lib/Transforms/InstCombine/InstCombineShifts.cpp index c821292400cd..64294838644f 100644 --- a/lib/Transforms/InstCombine/InstCombineShifts.cpp +++ b/lib/Transforms/InstCombine/InstCombineShifts.cpp @@ -25,50 +25,275 @@ using namespace PatternMatch; // we should rewrite it as // x shiftopcode (Q+K) iff (Q+K) u< bitwidth(x) // This is valid for any shift, but they must be identical. -static Instruction * -reassociateShiftAmtsOfTwoSameDirectionShifts(BinaryOperator *Sh0, - const SimplifyQuery &SQ) { - // Look for: (x shiftopcode ShAmt0) shiftopcode ShAmt1 - Value *X, *ShAmt1, *ShAmt0; +// +// AnalyzeForSignBitExtraction indicates that we will only analyze whether this +// pattern has any 2 right-shifts that sum to 1 less than original bit width. +Value *InstCombiner::reassociateShiftAmtsOfTwoSameDirectionShifts( + BinaryOperator *Sh0, const SimplifyQuery &SQ, + bool AnalyzeForSignBitExtraction) { + // Look for a shift of some instruction, ignore zext of shift amount if any. + Instruction *Sh0Op0; + Value *ShAmt0; + if (!match(Sh0, + m_Shift(m_Instruction(Sh0Op0), m_ZExtOrSelf(m_Value(ShAmt0))))) + return nullptr; + + // If there is a truncation between the two shifts, we must make note of it + // and look through it. The truncation imposes additional constraints on the + // transform. 
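Leaving the new trunc handling and sign-bit-extraction analysis aside, the core identity behind reassociateShiftAmtsOfTwoSameDirectionShifts is that two same-direction shifts add up, provided the summed amount stays below the bit width. A quick standalone check (illustrative only):

#include <cassert>
#include <cstdint>

int main() {
  for (uint32_t x : {0u, 1u, 0xDEADBEEFu, 0xFFFFFFFFu})
    for (unsigned a = 0; a < 32; ++a)
      for (unsigned b = 0; a + b < 32; ++b) {
        assert(((x >> a) >> b) == (x >> (a + b))); // lshr of lshr
        assert(((x << a) << b) == (x << (a + b))); // shl of shl
      }
  return 0;
}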
Instruction *Sh1; - if (!match(Sh0, m_Shift(m_CombineAnd(m_Shift(m_Value(X), m_Value(ShAmt1)), - m_Instruction(Sh1)), - m_Value(ShAmt0)))) + Value *Trunc = nullptr; + match(Sh0Op0, + m_CombineOr(m_CombineAnd(m_Trunc(m_Instruction(Sh1)), m_Value(Trunc)), + m_Instruction(Sh1))); + + // Inner shift: (x shiftopcode ShAmt1) + // Like with other shift, ignore zext of shift amount if any. + Value *X, *ShAmt1; + if (!match(Sh1, m_Shift(m_Value(X), m_ZExtOrSelf(m_Value(ShAmt1))))) + return nullptr; + + // We have two shift amounts from two different shifts. The types of those + // shift amounts may not match. If that's the case let's bailout now.. + if (ShAmt0->getType() != ShAmt1->getType()) + return nullptr; + + // We are only looking for signbit extraction if we have two right shifts. + bool HadTwoRightShifts = match(Sh0, m_Shr(m_Value(), m_Value())) && + match(Sh1, m_Shr(m_Value(), m_Value())); + // ... and if it's not two right-shifts, we know the answer already. + if (AnalyzeForSignBitExtraction && !HadTwoRightShifts) return nullptr; - // The shift opcodes must be identical. + // The shift opcodes must be identical, unless we are just checking whether + // this pattern can be interpreted as a sign-bit-extraction. Instruction::BinaryOps ShiftOpcode = Sh0->getOpcode(); - if (ShiftOpcode != Sh1->getOpcode()) + bool IdenticalShOpcodes = Sh0->getOpcode() == Sh1->getOpcode(); + if (!IdenticalShOpcodes && !AnalyzeForSignBitExtraction) return nullptr; + + // If we saw truncation, we'll need to produce extra instruction, + // and for that one of the operands of the shift must be one-use, + // unless of course we don't actually plan to produce any instructions here. + if (Trunc && !AnalyzeForSignBitExtraction && + !match(Sh0, m_c_BinOp(m_OneUse(m_Value()), m_Value()))) + return nullptr; + // Can we fold (ShAmt0+ShAmt1) ? - Value *NewShAmt = SimplifyBinOp(Instruction::BinaryOps::Add, ShAmt0, ShAmt1, - SQ.getWithInstruction(Sh0)); + auto *NewShAmt = dyn_cast_or_null( + SimplifyAddInst(ShAmt0, ShAmt1, /*isNSW=*/false, /*isNUW=*/false, + SQ.getWithInstruction(Sh0))); if (!NewShAmt) return nullptr; // Did not simplify. - // Is the new shift amount smaller than the bit width? - // FIXME: could also rely on ConstantRange. - unsigned BitWidth = X->getType()->getScalarSizeInBits(); + unsigned NewShAmtBitWidth = NewShAmt->getType()->getScalarSizeInBits(); + unsigned XBitWidth = X->getType()->getScalarSizeInBits(); + // Is the new shift amount smaller than the bit width of inner/new shift? if (!match(NewShAmt, m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_ULT, - APInt(BitWidth, BitWidth)))) - return nullptr; + APInt(NewShAmtBitWidth, XBitWidth)))) + return nullptr; // FIXME: could perform constant-folding. + + // If there was a truncation, and we have a right-shift, we can only fold if + // we are left with the original sign bit. Likewise, if we were just checking + // that this is a sighbit extraction, this is the place to check it. + // FIXME: zero shift amount is also legal here, but we can't *easily* check + // more than one predicate so it's not really worth it. + if (HadTwoRightShifts && (Trunc || AnalyzeForSignBitExtraction)) { + // If it's not a sign bit extraction, then we're done. + if (!match(NewShAmt, + m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_EQ, + APInt(NewShAmtBitWidth, XBitWidth - 1)))) + return nullptr; + // If it is, and that was the question, return the base value. 
+ if (AnalyzeForSignBitExtraction) + return X; + } + + assert(IdenticalShOpcodes && "Should not get here with different shifts."); + // All good, we can do this fold. + NewShAmt = ConstantExpr::getZExtOrBitCast(NewShAmt, X->getType()); + BinaryOperator *NewShift = BinaryOperator::Create(ShiftOpcode, X, NewShAmt); - // If both of the original shifts had the same flag set, preserve the flag. - if (ShiftOpcode == Instruction::BinaryOps::Shl) { - NewShift->setHasNoUnsignedWrap(Sh0->hasNoUnsignedWrap() && - Sh1->hasNoUnsignedWrap()); - NewShift->setHasNoSignedWrap(Sh0->hasNoSignedWrap() && - Sh1->hasNoSignedWrap()); - } else { - NewShift->setIsExact(Sh0->isExact() && Sh1->isExact()); + + // The flags can only be propagated if there wasn't a trunc. + if (!Trunc) { + // If the pattern did not involve trunc, and both of the original shifts + // had the same flag set, preserve the flag. + if (ShiftOpcode == Instruction::BinaryOps::Shl) { + NewShift->setHasNoUnsignedWrap(Sh0->hasNoUnsignedWrap() && + Sh1->hasNoUnsignedWrap()); + NewShift->setHasNoSignedWrap(Sh0->hasNoSignedWrap() && + Sh1->hasNoSignedWrap()); + } else { + NewShift->setIsExact(Sh0->isExact() && Sh1->isExact()); + } + } + + Instruction *Ret = NewShift; + if (Trunc) { + Builder.Insert(NewShift); + Ret = CastInst::Create(Instruction::Trunc, NewShift, Sh0->getType()); + } + + return Ret; +} + +// Try to replace `undef` constants in C with Replacement. +static Constant *replaceUndefsWith(Constant *C, Constant *Replacement) { + if (C && match(C, m_Undef())) + return Replacement; + + if (auto *CV = dyn_cast(C)) { + llvm::SmallVector NewOps(CV->getNumOperands()); + for (unsigned i = 0, NumElts = NewOps.size(); i != NumElts; ++i) { + Constant *EltC = CV->getOperand(i); + NewOps[i] = EltC && match(EltC, m_Undef()) ? Replacement : EltC; + } + return ConstantVector::get(NewOps); + } + + // Don't know how to deal with this constant. + return C; +} + +// If we have some pattern that leaves only some low bits set, and then performs +// left-shift of those bits, if none of the bits that are left after the final +// shift are modified by the mask, we can omit the mask. +// +// There are many variants to this pattern: +// a) (x & ((1 << MaskShAmt) - 1)) << ShiftShAmt +// b) (x & (~(-1 << MaskShAmt))) << ShiftShAmt +// c) (x & (-1 >> MaskShAmt)) << ShiftShAmt +// d) (x & ((-1 << MaskShAmt) >> MaskShAmt)) << ShiftShAmt +// e) ((x << MaskShAmt) l>> MaskShAmt) << ShiftShAmt +// f) ((x << MaskShAmt) a>> MaskShAmt) << ShiftShAmt +// All these patterns can be simplified to just: +// x << ShiftShAmt +// iff: +// a,b) (MaskShAmt+ShiftShAmt) u>= bitwidth(x) +// c,d,e,f) (ShiftShAmt-MaskShAmt) s>= 0 (i.e. ShiftShAmt u>= MaskShAmt) +static Instruction * +dropRedundantMaskingOfLeftShiftInput(BinaryOperator *OuterShift, + const SimplifyQuery &Q, + InstCombiner::BuilderTy &Builder) { + assert(OuterShift->getOpcode() == Instruction::BinaryOps::Shl && + "The input must be 'shl'!"); + + Value *Masked, *ShiftShAmt; + match(OuterShift, m_Shift(m_Value(Masked), m_Value(ShiftShAmt))); + + Type *NarrowestTy = OuterShift->getType(); + Type *WidestTy = Masked->getType(); + // The mask must be computed in a type twice as wide to ensure + // that no bits are lost if the sum-of-shifts is wider than the base type. 
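For variants a) and b) listed above, the mask can be dropped exactly when MaskShAmt + ShiftShAmt u>= bitwidth(x): every bit cleared by the mask is shifted out anyway. A standalone spot check at 32 bits with MaskShAmt = 20 and ShiftShAmt = 16 (illustrative only):

#include <cassert>
#include <cstdint>

int main() {
  const unsigned MaskShAmt = 20, ShiftShAmt = 16; // 20 + 16 >= 32
  for (uint32_t x : {0u, 1u, 0x12345678u, 0xDEADBEEFu, 0xFFFFFFFFu}) {
    uint32_t masked = (x & ((1u << MaskShAmt) - 1)) << ShiftShAmt; // variant a)
    uint32_t plain  = x << ShiftShAmt;                             // simplified form
    assert(masked == plain);
  }
  return 0;
}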
+ Type *ExtendedTy = WidestTy->getExtendedType(); + + Value *MaskShAmt; + + // ((1 << MaskShAmt) - 1) + auto MaskA = m_Add(m_Shl(m_One(), m_Value(MaskShAmt)), m_AllOnes()); + // (~(-1 << maskNbits)) + auto MaskB = m_Xor(m_Shl(m_AllOnes(), m_Value(MaskShAmt)), m_AllOnes()); + // (-1 >> MaskShAmt) + auto MaskC = m_Shr(m_AllOnes(), m_Value(MaskShAmt)); + // ((-1 << MaskShAmt) >> MaskShAmt) + auto MaskD = + m_Shr(m_Shl(m_AllOnes(), m_Value(MaskShAmt)), m_Deferred(MaskShAmt)); + + Value *X; + Constant *NewMask; + + if (match(Masked, m_c_And(m_CombineOr(MaskA, MaskB), m_Value(X)))) { + // Can we simplify (MaskShAmt+ShiftShAmt) ? + auto *SumOfShAmts = dyn_cast_or_null(SimplifyAddInst( + MaskShAmt, ShiftShAmt, /*IsNSW=*/false, /*IsNUW=*/false, Q)); + if (!SumOfShAmts) + return nullptr; // Did not simplify. + // In this pattern SumOfShAmts correlates with the number of low bits + // that shall remain in the root value (OuterShift). + + // An extend of an undef value becomes zero because the high bits are never + // completely unknown. Replace the the `undef` shift amounts with final + // shift bitwidth to ensure that the value remains undef when creating the + // subsequent shift op. + SumOfShAmts = replaceUndefsWith( + SumOfShAmts, ConstantInt::get(SumOfShAmts->getType()->getScalarType(), + ExtendedTy->getScalarSizeInBits())); + auto *ExtendedSumOfShAmts = ConstantExpr::getZExt(SumOfShAmts, ExtendedTy); + // And compute the mask as usual: ~(-1 << (SumOfShAmts)) + auto *ExtendedAllOnes = ConstantExpr::getAllOnesValue(ExtendedTy); + auto *ExtendedInvertedMask = + ConstantExpr::getShl(ExtendedAllOnes, ExtendedSumOfShAmts); + NewMask = ConstantExpr::getNot(ExtendedInvertedMask); + } else if (match(Masked, m_c_And(m_CombineOr(MaskC, MaskD), m_Value(X))) || + match(Masked, m_Shr(m_Shl(m_Value(X), m_Value(MaskShAmt)), + m_Deferred(MaskShAmt)))) { + // Can we simplify (ShiftShAmt-MaskShAmt) ? + auto *ShAmtsDiff = dyn_cast_or_null(SimplifySubInst( + ShiftShAmt, MaskShAmt, /*IsNSW=*/false, /*IsNUW=*/false, Q)); + if (!ShAmtsDiff) + return nullptr; // Did not simplify. + // In this pattern ShAmtsDiff correlates with the number of high bits that + // shall be unset in the root value (OuterShift). + + // An extend of an undef value becomes zero because the high bits are never + // completely unknown. Replace the the `undef` shift amounts with negated + // bitwidth of innermost shift to ensure that the value remains undef when + // creating the subsequent shift op. + unsigned WidestTyBitWidth = WidestTy->getScalarSizeInBits(); + ShAmtsDiff = replaceUndefsWith( + ShAmtsDiff, ConstantInt::get(ShAmtsDiff->getType()->getScalarType(), + -WidestTyBitWidth)); + auto *ExtendedNumHighBitsToClear = ConstantExpr::getZExt( + ConstantExpr::getSub(ConstantInt::get(ShAmtsDiff->getType(), + WidestTyBitWidth, + /*isSigned=*/false), + ShAmtsDiff), + ExtendedTy); + // And compute the mask as usual: (-1 l>> (NumHighBitsToClear)) + auto *ExtendedAllOnes = ConstantExpr::getAllOnesValue(ExtendedTy); + NewMask = + ConstantExpr::getLShr(ExtendedAllOnes, ExtendedNumHighBitsToClear); + } else + return nullptr; // Don't know anything about this pattern. + + NewMask = ConstantExpr::getTrunc(NewMask, NarrowestTy); + + // Does this mask has any unset bits? If not then we can just not apply it. + bool NeedMask = !match(NewMask, m_AllOnes()); + + // If we need to apply a mask, there are several more restrictions we have. + if (NeedMask) { + // The old masking instruction must go away. 
+ if (!Masked->hasOneUse()) + return nullptr; + // The original "masking" instruction must not have been`ashr`. + if (match(Masked, m_AShr(m_Value(), m_Value()))) + return nullptr; } - return NewShift; + + // No 'NUW'/'NSW'! We no longer know that we won't shift-out non-0 bits. + auto *NewShift = BinaryOperator::Create(OuterShift->getOpcode(), X, + OuterShift->getOperand(1)); + + if (!NeedMask) + return NewShift; + + Builder.Insert(NewShift); + return BinaryOperator::Create(Instruction::And, NewShift, NewMask); } Instruction *InstCombiner::commonShiftTransforms(BinaryOperator &I) { Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); assert(Op0->getType() == Op1->getType()); + // If the shift amount is a one-use `sext`, we can demote it to `zext`. + Value *Y; + if (match(Op1, m_OneUse(m_SExt(m_Value(Y))))) { + Value *NewExt = Builder.CreateZExt(Y, I.getType(), Op1->getName()); + return BinaryOperator::Create(I.getOpcode(), Op0, NewExt); + } + // See if we can fold away this shift. if (SimplifyDemandedInstructionBits(I)) return &I; @@ -83,8 +308,8 @@ Instruction *InstCombiner::commonShiftTransforms(BinaryOperator &I) { if (Instruction *Res = FoldShiftByConstant(Op0, CUI, I)) return Res; - if (Instruction *NewShift = - reassociateShiftAmtsOfTwoSameDirectionShifts(&I, SQ)) + if (auto *NewShift = cast_or_null( + reassociateShiftAmtsOfTwoSameDirectionShifts(&I, SQ))) return NewShift; // (C1 shift (A add C2)) -> (C1 shift C2) shift A) @@ -618,9 +843,10 @@ Instruction *InstCombiner::FoldShiftByConstant(Value *Op0, Constant *Op1, } Instruction *InstCombiner::visitShl(BinaryOperator &I) { + const SimplifyQuery Q = SQ.getWithInstruction(&I); + if (Value *V = SimplifyShlInst(I.getOperand(0), I.getOperand(1), - I.hasNoSignedWrap(), I.hasNoUnsignedWrap(), - SQ.getWithInstruction(&I))) + I.hasNoSignedWrap(), I.hasNoUnsignedWrap(), Q)) return replaceInstUsesWith(I, V); if (Instruction *X = foldVectorBinop(I)) @@ -629,6 +855,9 @@ Instruction *InstCombiner::visitShl(BinaryOperator &I) { if (Instruction *V = commonShiftTransforms(I)) return V; + if (Instruction *V = dropRedundantMaskingOfLeftShiftInput(&I, Q, Builder)) + return V; + Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); Type *Ty = I.getType(); unsigned BitWidth = Ty->getScalarSizeInBits(); @@ -636,12 +865,11 @@ Instruction *InstCombiner::visitShl(BinaryOperator &I) { const APInt *ShAmtAPInt; if (match(Op1, m_APInt(ShAmtAPInt))) { unsigned ShAmt = ShAmtAPInt->getZExtValue(); - unsigned BitWidth = Ty->getScalarSizeInBits(); // shl (zext X), ShAmt --> zext (shl X, ShAmt) // This is only valid if X would have zeros shifted out. 
Value *X; - if (match(Op0, m_ZExt(m_Value(X)))) { + if (match(Op0, m_OneUse(m_ZExt(m_Value(X))))) { unsigned SrcWidth = X->getType()->getScalarSizeInBits(); if (ShAmt < SrcWidth && MaskedValueIsZero(X, APInt::getHighBitsSet(SrcWidth, ShAmt), 0, &I)) @@ -719,6 +947,12 @@ Instruction *InstCombiner::visitShl(BinaryOperator &I) { // (X * C2) << C1 --> X * (C2 << C1) if (match(Op0, m_Mul(m_Value(X), m_Constant(C2)))) return BinaryOperator::CreateMul(X, ConstantExpr::getShl(C2, C1)); + + // shl (zext i1 X), C1 --> select (X, 1 << C1, 0) + if (match(Op0, m_ZExt(m_Value(X))) && X->getType()->isIntOrIntVectorTy(1)) { + auto *NewC = ConstantExpr::getShl(ConstantInt::get(Ty, 1), C1); + return SelectInst::Create(X, NewC, ConstantInt::getNullValue(Ty)); + } } // (1 << (C - x)) -> ((1 << C) >> x) if C is bitwidth - 1 @@ -859,6 +1093,75 @@ Instruction *InstCombiner::visitLShr(BinaryOperator &I) { return nullptr; } +Instruction * +InstCombiner::foldVariableSignZeroExtensionOfVariableHighBitExtract( + BinaryOperator &OldAShr) { + assert(OldAShr.getOpcode() == Instruction::AShr && + "Must be called with arithmetic right-shift instruction only."); + + // Check that constant C is a splat of the element-wise bitwidth of V. + auto BitWidthSplat = [](Constant *C, Value *V) { + return match( + C, m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_EQ, + APInt(C->getType()->getScalarSizeInBits(), + V->getType()->getScalarSizeInBits()))); + }; + + // It should look like variable-length sign-extension on the outside: + // (Val << (bitwidth(Val)-Nbits)) a>> (bitwidth(Val)-Nbits) + Value *NBits; + Instruction *MaybeTrunc; + Constant *C1, *C2; + if (!match(&OldAShr, + m_AShr(m_Shl(m_Instruction(MaybeTrunc), + m_ZExtOrSelf(m_Sub(m_Constant(C1), + m_ZExtOrSelf(m_Value(NBits))))), + m_ZExtOrSelf(m_Sub(m_Constant(C2), + m_ZExtOrSelf(m_Deferred(NBits)))))) || + !BitWidthSplat(C1, &OldAShr) || !BitWidthSplat(C2, &OldAShr)) + return nullptr; + + // There may or may not be a truncation after outer two shifts. + Instruction *HighBitExtract; + match(MaybeTrunc, m_TruncOrSelf(m_Instruction(HighBitExtract))); + bool HadTrunc = MaybeTrunc != HighBitExtract; + + // And finally, the innermost part of the pattern must be a right-shift. + Value *X, *NumLowBitsToSkip; + if (!match(HighBitExtract, m_Shr(m_Value(X), m_Value(NumLowBitsToSkip)))) + return nullptr; + + // Said right-shift must extract high NBits bits - C0 must be it's bitwidth. + Constant *C0; + if (!match(NumLowBitsToSkip, + m_ZExtOrSelf( + m_Sub(m_Constant(C0), m_ZExtOrSelf(m_Specific(NBits))))) || + !BitWidthSplat(C0, HighBitExtract)) + return nullptr; + + // Since the NBits is identical for all shifts, if the outermost and + // innermost shifts are identical, then outermost shifts are redundant. + // If we had truncation, do keep it though. + if (HighBitExtract->getOpcode() == OldAShr.getOpcode()) + return replaceInstUsesWith(OldAShr, MaybeTrunc); + + // Else, if there was a truncation, then we need to ensure that one + // instruction will go away. + if (HadTrunc && !match(&OldAShr, m_c_BinOp(m_OneUse(m_Value()), m_Value()))) + return nullptr; + + // Finally, bypass two innermost shifts, and perform the outermost shift on + // the operands of the innermost shift. + Instruction *NewAShr = + BinaryOperator::Create(OldAShr.getOpcode(), X, NumLowBitsToSkip); + NewAShr->copyIRFlags(HighBitExtract); // We can preserve 'exact'-ness. 
+ if (!HadTrunc) + return NewAShr; + + Builder.Insert(NewAShr); + return TruncInst::CreateTruncOrBitCast(NewAShr, OldAShr.getType()); +} + Instruction *InstCombiner::visitAShr(BinaryOperator &I) { if (Value *V = SimplifyAShrInst(I.getOperand(0), I.getOperand(1), I.isExact(), SQ.getWithInstruction(&I))) @@ -933,6 +1236,9 @@ Instruction *InstCombiner::visitAShr(BinaryOperator &I) { } } + if (Instruction *R = foldVariableSignZeroExtensionOfVariableHighBitExtract(I)) + return R; + // See if we can turn a signed shr into an unsigned shr. if (MaskedValueIsZero(Op0, APInt::getSignMask(BitWidth), 0, &I)) return BinaryOperator::CreateLShr(Op0, Op1); diff --git a/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp index e0d85c4b49ae..d30ab8001897 100644 --- a/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp +++ b/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp @@ -971,6 +971,13 @@ InstCombiner::simplifyShrShlDemandedBits(Instruction *Shr, const APInt &ShrOp1, Value *InstCombiner::simplifyAMDGCNMemoryIntrinsicDemanded(IntrinsicInst *II, APInt DemandedElts, int DMaskIdx) { + + // FIXME: Allow v3i16/v3f16 in buffer intrinsics when the types are fully supported. + if (DMaskIdx < 0 && + II->getType()->getScalarSizeInBits() != 32 && + DemandedElts.getActiveBits() == 3) + return nullptr; + unsigned VWidth = II->getType()->getVectorNumElements(); if (VWidth == 1) return nullptr; @@ -1067,16 +1074,22 @@ Value *InstCombiner::simplifyAMDGCNMemoryIntrinsicDemanded(IntrinsicInst *II, } /// The specified value produces a vector with any number of elements. +/// This method analyzes which elements of the operand are undef and returns +/// that information in UndefElts. +/// /// DemandedElts contains the set of elements that are actually used by the -/// caller. This method analyzes which elements of the operand are undef and -/// returns that information in UndefElts. +/// caller, and by default (AllowMultipleUsers equals false) the value is +/// simplified only if it has a single caller. If AllowMultipleUsers is set +/// to true, DemandedElts refers to the union of sets of elements that are +/// used by all callers. /// /// If the information about demanded elements can be used to simplify the /// operation, the operation is simplified, then the resultant value is /// returned. This returns null if no change was made. Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts, APInt &UndefElts, - unsigned Depth) { + unsigned Depth, + bool AllowMultipleUsers) { unsigned VWidth = V->getType()->getVectorNumElements(); APInt EltMask(APInt::getAllOnesValue(VWidth)); assert((DemandedElts & ~EltMask) == 0 && "Invalid DemandedElts!"); @@ -1130,19 +1143,21 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts, if (Depth == 10) return nullptr; - // If multiple users are using the root value, proceed with - // simplification conservatively assuming that all elements - // are needed. - if (!V->hasOneUse()) { - // Quit if we find multiple users of a non-root value though. - // They'll be handled when it's their turn to be visited by - // the main instcombine process. - if (Depth != 0) - // TODO: Just compute the UndefElts information recursively. - return nullptr; + if (!AllowMultipleUsers) { + // If multiple users are using the root value, proceed with + // simplification conservatively assuming that all elements + // are needed. 
+ if (!V->hasOneUse()) { + // Quit if we find multiple users of a non-root value though. + // They'll be handled when it's their turn to be visited by + // the main instcombine process. + if (Depth != 0) + // TODO: Just compute the UndefElts information recursively. + return nullptr; - // Conservatively assume that all elements are needed. - DemandedElts = EltMask; + // Conservatively assume that all elements are needed. + DemandedElts = EltMask; + } } Instruction *I = dyn_cast(V); @@ -1674,8 +1689,11 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts, case Intrinsic::amdgcn_buffer_load_format: case Intrinsic::amdgcn_raw_buffer_load: case Intrinsic::amdgcn_raw_buffer_load_format: + case Intrinsic::amdgcn_raw_tbuffer_load: case Intrinsic::amdgcn_struct_buffer_load: case Intrinsic::amdgcn_struct_buffer_load_format: + case Intrinsic::amdgcn_struct_tbuffer_load: + case Intrinsic::amdgcn_tbuffer_load: return simplifyAMDGCNMemoryIntrinsicDemanded(II, DemandedElts); default: { if (getAMDGPUImageDMaskIntrinsic(II->getIntrinsicID())) diff --git a/lib/Transforms/InstCombine/InstCombineVectorOps.cpp b/lib/Transforms/InstCombine/InstCombineVectorOps.cpp index dc9abdd7f47a..9c890748e5ab 100644 --- a/lib/Transforms/InstCombine/InstCombineVectorOps.cpp +++ b/lib/Transforms/InstCombine/InstCombineVectorOps.cpp @@ -253,6 +253,69 @@ static Instruction *foldBitcastExtElt(ExtractElementInst &Ext, return nullptr; } +/// Find elements of V demanded by UserInstr. +static APInt findDemandedEltsBySingleUser(Value *V, Instruction *UserInstr) { + unsigned VWidth = V->getType()->getVectorNumElements(); + + // Conservatively assume that all elements are needed. + APInt UsedElts(APInt::getAllOnesValue(VWidth)); + + switch (UserInstr->getOpcode()) { + case Instruction::ExtractElement: { + ExtractElementInst *EEI = cast(UserInstr); + assert(EEI->getVectorOperand() == V); + ConstantInt *EEIIndexC = dyn_cast(EEI->getIndexOperand()); + if (EEIIndexC && EEIIndexC->getValue().ult(VWidth)) { + UsedElts = APInt::getOneBitSet(VWidth, EEIIndexC->getZExtValue()); + } + break; + } + case Instruction::ShuffleVector: { + ShuffleVectorInst *Shuffle = cast(UserInstr); + unsigned MaskNumElts = UserInstr->getType()->getVectorNumElements(); + + UsedElts = APInt(VWidth, 0); + for (unsigned i = 0; i < MaskNumElts; i++) { + unsigned MaskVal = Shuffle->getMaskValue(i); + if (MaskVal == -1u || MaskVal >= 2 * VWidth) + continue; + if (Shuffle->getOperand(0) == V && (MaskVal < VWidth)) + UsedElts.setBit(MaskVal); + if (Shuffle->getOperand(1) == V && + ((MaskVal >= VWidth) && (MaskVal < 2 * VWidth))) + UsedElts.setBit(MaskVal - VWidth); + } + break; + } + default: + break; + } + return UsedElts; +} + +/// Find union of elements of V demanded by all its users. +/// If it is known by querying findDemandedEltsBySingleUser that +/// no user demands an element of V, then the corresponding bit +/// remains unset in the returned value. 
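A simplified model of the union computed by findDemandedEltsByAllUsers (defined just below): every extractelement user with a known in-range constant index contributes one lane, and any user that cannot be analyzed forces all lanes to be treated as demanded. Standalone sketch with a plain bitmask standing in for APInt (illustrative only):

#include <cstdint>
#include <vector>

// A user either extracts one known lane or is something we cannot analyze.
struct VecUse {
  bool IsConstIdxExtract;
  unsigned Index; // only meaningful when IsConstIdxExtract is true
};

static uint32_t demandedEltsUnion(const std::vector<VecUse> &Uses,
                                  unsigned VWidth) {
  const uint32_t AllOnes = VWidth >= 32 ? ~0u : (1u << VWidth) - 1;
  uint32_t Union = 0;
  for (const VecUse &U : Uses) {
    if (!U.IsConstIdxExtract || U.Index >= VWidth)
      return AllOnes; // conservatively demand every element
    Union |= 1u << U.Index;
    if (Union == AllOnes)
      break;          // already as wide as it can get
  }
  return Union;
}

int main() {
  // Two extracts of lanes 0 and 2 from a 4-wide vector demand only those lanes.
  return demandedEltsUnion({{true, 0}, {true, 2}}, 4) == 0b0101u ? 0 : 1;
}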
+static APInt findDemandedEltsByAllUsers(Value *V) { + unsigned VWidth = V->getType()->getVectorNumElements(); + + APInt UnionUsedElts(VWidth, 0); + for (const Use &U : V->uses()) { + if (Instruction *I = dyn_cast(U.getUser())) { + UnionUsedElts |= findDemandedEltsBySingleUser(V, I); + } else { + UnionUsedElts = APInt::getAllOnesValue(VWidth); + break; + } + + if (UnionUsedElts.isAllOnesValue()) + break; + } + + return UnionUsedElts; +} + Instruction *InstCombiner::visitExtractElementInst(ExtractElementInst &EI) { Value *SrcVec = EI.getVectorOperand(); Value *Index = EI.getIndexOperand(); @@ -271,19 +334,35 @@ Instruction *InstCombiner::visitExtractElementInst(ExtractElementInst &EI) { return nullptr; // This instruction only demands the single element from the input vector. - // If the input vector has a single use, simplify it based on this use - // property. - if (SrcVec->hasOneUse() && NumElts != 1) { - APInt UndefElts(NumElts, 0); - APInt DemandedElts(NumElts, 0); - DemandedElts.setBit(IndexC->getZExtValue()); - if (Value *V = SimplifyDemandedVectorElts(SrcVec, DemandedElts, - UndefElts)) { - EI.setOperand(0, V); - return &EI; + if (NumElts != 1) { + // If the input vector has a single use, simplify it based on this use + // property. + if (SrcVec->hasOneUse()) { + APInt UndefElts(NumElts, 0); + APInt DemandedElts(NumElts, 0); + DemandedElts.setBit(IndexC->getZExtValue()); + if (Value *V = + SimplifyDemandedVectorElts(SrcVec, DemandedElts, UndefElts)) { + EI.setOperand(0, V); + return &EI; + } + } else { + // If the input vector has multiple uses, simplify it based on a union + // of all elements used. + APInt DemandedElts = findDemandedEltsByAllUsers(SrcVec); + if (!DemandedElts.isAllOnesValue()) { + APInt UndefElts(NumElts, 0); + if (Value *V = SimplifyDemandedVectorElts( + SrcVec, DemandedElts, UndefElts, 0 /* Depth */, + true /* AllowMultipleUsers */)) { + if (V != SrcVec) { + SrcVec->replaceAllUsesWith(V); + return &EI; + } + } + } } } - if (Instruction *I = foldBitcastExtElt(EI, Builder, DL.isBigEndian())) return I; @@ -766,6 +845,55 @@ static Instruction *foldInsEltIntoSplat(InsertElementInst &InsElt) { return new ShuffleVectorInst(Op0, UndefValue::get(Op0->getType()), NewMask); } +/// Try to fold an extract+insert element into an existing identity shuffle by +/// changing the shuffle's mask to include the index of this insert element. +static Instruction *foldInsEltIntoIdentityShuffle(InsertElementInst &InsElt) { + // Check if the vector operand of this insert is an identity shuffle. + auto *Shuf = dyn_cast(InsElt.getOperand(0)); + if (!Shuf || !isa(Shuf->getOperand(1)) || + !(Shuf->isIdentityWithExtract() || Shuf->isIdentityWithPadding())) + return nullptr; + + // Check for a constant insertion index. + uint64_t IdxC; + if (!match(InsElt.getOperand(2), m_ConstantInt(IdxC))) + return nullptr; + + // Check if this insert's scalar op is extracted from the identity shuffle's + // input vector. + Value *Scalar = InsElt.getOperand(1); + Value *X = Shuf->getOperand(0); + if (!match(Scalar, m_ExtractElement(m_Specific(X), m_SpecificInt(IdxC)))) + return nullptr; + + // Replace the shuffle mask element at the index of this extract+insert with + // that same index value. 
+ // For example: + // inselt (shuf X, IdMask), (extelt X, IdxC), IdxC --> shuf X, IdMask' + unsigned NumMaskElts = Shuf->getType()->getVectorNumElements(); + SmallVector NewMaskVec(NumMaskElts); + Type *I32Ty = IntegerType::getInt32Ty(Shuf->getContext()); + Constant *NewMaskEltC = ConstantInt::get(I32Ty, IdxC); + Constant *OldMask = Shuf->getMask(); + for (unsigned i = 0; i != NumMaskElts; ++i) { + if (i != IdxC) { + // All mask elements besides the inserted element remain the same. + NewMaskVec[i] = OldMask->getAggregateElement(i); + } else if (OldMask->getAggregateElement(i) == NewMaskEltC) { + // If the mask element was already set, there's nothing to do + // (demanded elements analysis may unset it later). + return nullptr; + } else { + assert(isa(OldMask->getAggregateElement(i)) && + "Unexpected shuffle mask element for identity shuffle"); + NewMaskVec[i] = NewMaskEltC; + } + } + + Constant *NewMask = ConstantVector::get(NewMaskVec); + return new ShuffleVectorInst(X, Shuf->getOperand(1), NewMask); +} + /// If we have an insertelement instruction feeding into another insertelement /// and the 2nd is inserting a constant into the vector, canonicalize that /// constant insertion before the insertion of a variable: @@ -987,6 +1115,9 @@ Instruction *InstCombiner::visitInsertElementInst(InsertElementInst &IE) { if (Instruction *Splat = foldInsEltIntoSplat(IE)) return Splat; + if (Instruction *IdentityShuf = foldInsEltIntoIdentityShuffle(IE)) + return IdentityShuf; + return nullptr; } @@ -1009,17 +1140,23 @@ static bool canEvaluateShuffled(Value *V, ArrayRef Mask, if (Depth == 0) return false; switch (I->getOpcode()) { + case Instruction::UDiv: + case Instruction::SDiv: + case Instruction::URem: + case Instruction::SRem: + // Propagating an undefined shuffle mask element to integer div/rem is not + // allowed because those opcodes can create immediate undefined behavior + // from an undefined element in an operand. + if (llvm::any_of(Mask, [](int M){ return M == -1; })) + return false; + LLVM_FALLTHROUGH; case Instruction::Add: case Instruction::FAdd: case Instruction::Sub: case Instruction::FSub: case Instruction::Mul: case Instruction::FMul: - case Instruction::UDiv: - case Instruction::SDiv: case Instruction::FDiv: - case Instruction::URem: - case Instruction::SRem: case Instruction::FRem: case Instruction::Shl: case Instruction::LShr: @@ -1040,9 +1177,7 @@ static bool canEvaluateShuffled(Value *V, ArrayRef Mask, case Instruction::FPExt: case Instruction::GetElementPtr: { // Bail out if we would create longer vector ops. We could allow creating - // longer vector ops, but that may result in more expensive codegen. We - // would also need to limit the transform to avoid undefined behavior for - // integer div/rem. + // longer vector ops, but that may result in more expensive codegen. Type *ITy = I->getType(); if (ITy->isVectorTy() && Mask.size() > ITy->getVectorNumElements()) return false; diff --git a/lib/Transforms/InstCombine/InstructionCombining.cpp b/lib/Transforms/InstCombine/InstructionCombining.cpp index 385f4926b845..ecb486c544e0 100644 --- a/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -200,8 +200,8 @@ bool InstCombiner::shouldChangeType(Type *From, Type *To) const { // where both B and C should be ConstantInts, results in a constant that does // not overflow. This function only handles the Add and Sub opcodes. For // all other opcodes, the function conservatively returns false. 
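The maintainNoSignedWrap helper being renamed just below keeps nsw through the reassociation only when folding the two constants cannot itself overflow. A standalone illustration of that condition using the GCC/Clang overflow builtin (illustrative only):

#include <cassert>
#include <climits>

// Reassociating "add nsw (add nsw X, B), C" into "add nsw X, (B + C)" is only
// safe if the folded constant B + C does not overflow; otherwise nsw is dropped.
static bool constantFoldKeepsNSW(int B, int C) {
  int Folded;
  return !__builtin_add_overflow(B, C, &Folded);
}

int main() {
  assert(constantFoldKeepsNSW(100, 23));     // 123: nsw can be kept
  assert(!constantFoldKeepsNSW(INT_MAX, 1)); // would overflow: drop nsw
  return 0;
}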
-static bool MaintainNoSignedWrap(BinaryOperator &I, Value *B, Value *C) { - OverflowingBinaryOperator *OBO = dyn_cast(&I); +static bool maintainNoSignedWrap(BinaryOperator &I, Value *B, Value *C) { + auto *OBO = dyn_cast(&I); if (!OBO || !OBO->hasNoSignedWrap()) return false; @@ -224,10 +224,15 @@ static bool MaintainNoSignedWrap(BinaryOperator &I, Value *B, Value *C) { } static bool hasNoUnsignedWrap(BinaryOperator &I) { - OverflowingBinaryOperator *OBO = dyn_cast(&I); + auto *OBO = dyn_cast(&I); return OBO && OBO->hasNoUnsignedWrap(); } +static bool hasNoSignedWrap(BinaryOperator &I) { + auto *OBO = dyn_cast(&I); + return OBO && OBO->hasNoSignedWrap(); +} + /// Conservatively clears subclassOptionalData after a reassociation or /// commutation. We preserve fast-math flags when applicable as they can be /// preserved. @@ -332,22 +337,21 @@ bool InstCombiner::SimplifyAssociativeOrCommutative(BinaryOperator &I) { // It simplifies to V. Form "A op V". I.setOperand(0, A); I.setOperand(1, V); - // Conservatively clear the optional flags, since they may not be - // preserved by the reassociation. bool IsNUW = hasNoUnsignedWrap(I) && hasNoUnsignedWrap(*Op0); - bool IsNSW = MaintainNoSignedWrap(I, B, C); + bool IsNSW = maintainNoSignedWrap(I, B, C) && hasNoSignedWrap(*Op0); + // Conservatively clear all optional flags since they may not be + // preserved by the reassociation. Reset nsw/nuw based on the above + // analysis. ClearSubclassDataAfterReassociation(I); + // Note: this is only valid because SimplifyBinOp doesn't look at + // the operands to Op0. if (IsNUW) I.setHasNoUnsignedWrap(true); - if (IsNSW && - (!Op0 || (isa(Op0) && Op0->hasNoSignedWrap()))) { - // Note: this is only valid because SimplifyBinOp doesn't look at - // the operands to Op0. + if (IsNSW) I.setHasNoSignedWrap(true); - } Changed = true; ++NumReassoc; @@ -610,7 +614,6 @@ Value *InstCombiner::tryFactorization(BinaryOperator &I, HasNUW &= ROBO->hasNoUnsignedWrap(); } - const APInt *CInt; if (TopLevelOpcode == Instruction::Add && InnerOpcode == Instruction::Mul) { // We can propagate 'nsw' if we know that @@ -620,6 +623,7 @@ Value *InstCombiner::tryFactorization(BinaryOperator &I, // %Z = mul nsw i16 %X, C+1 // // iff C+1 isn't INT_MIN + const APInt *CInt; if (match(V, m_APInt(CInt))) { if (!CInt->isMinSignedValue()) BO->setHasNoSignedWrap(HasNSW); @@ -763,12 +767,16 @@ Value *InstCombiner::SimplifySelectsFeedingBinaryOp(BinaryOperator &I, if (match(LHS, m_Select(m_Value(A), m_Value(B), m_Value(C))) && match(RHS, m_Select(m_Specific(A), m_Value(D), m_Value(E)))) { bool SelectsHaveOneUse = LHS->hasOneUse() && RHS->hasOneUse(); + + FastMathFlags FMF; BuilderTy::FastMathFlagGuard Guard(Builder); - if (isa(&I)) - Builder.setFastMathFlags(I.getFastMathFlags()); + if (isa(&I)) { + FMF = I.getFastMathFlags(); + Builder.setFastMathFlags(FMF); + } - Value *V1 = SimplifyBinOp(Opcode, C, E, SQ.getWithInstruction(&I)); - Value *V2 = SimplifyBinOp(Opcode, B, D, SQ.getWithInstruction(&I)); + Value *V1 = SimplifyBinOp(Opcode, C, E, FMF, SQ.getWithInstruction(&I)); + Value *V2 = SimplifyBinOp(Opcode, B, D, FMF, SQ.getWithInstruction(&I)); if (V1 && V2) SI = Builder.CreateSelect(A, V2, V1); else if (V2 && SelectsHaveOneUse) @@ -1659,7 +1667,7 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { // to an index of zero, so replace it with zero if it is not zero already. 
Type *EltTy = GTI.getIndexedType(); if (EltTy->isSized() && DL.getTypeAllocSize(EltTy) == 0) - if (!isa(*I) || !cast(*I)->isNullValue()) { + if (!isa(*I) || !match(I->get(), m_Zero())) { *I = Constant::getNullValue(NewIndexType); MadeChange = true; } @@ -2549,9 +2557,7 @@ Instruction *InstCombiner::visitReturnInst(ReturnInst &RI) { Instruction *InstCombiner::visitBranchInst(BranchInst &BI) { // Change br (not X), label True, label False to: br X, label False, True Value *X = nullptr; - BasicBlock *TrueDest; - BasicBlock *FalseDest; - if (match(&BI, m_Br(m_Not(m_Value(X)), TrueDest, FalseDest)) && + if (match(&BI, m_Br(m_Not(m_Value(X)), m_BasicBlock(), m_BasicBlock())) && !isa(X)) { // Swap Destinations and condition... BI.setCondition(X); @@ -2569,8 +2575,8 @@ Instruction *InstCombiner::visitBranchInst(BranchInst &BI) { // Canonicalize, for example, icmp_ne -> icmp_eq or fcmp_one -> fcmp_oeq. CmpInst::Predicate Pred; - if (match(&BI, m_Br(m_OneUse(m_Cmp(Pred, m_Value(), m_Value())), TrueDest, - FalseDest)) && + if (match(&BI, m_Br(m_OneUse(m_Cmp(Pred, m_Value(), m_Value())), + m_BasicBlock(), m_BasicBlock())) && !isCanonicalPredicate(Pred)) { // Swap destinations and condition. CmpInst *Cond = cast(BI.getCondition()); @@ -3156,6 +3162,21 @@ static bool TryToSinkInstruction(Instruction *I, BasicBlock *DestBlock) { findDbgUsers(DbgUsers, I); for (auto *DII : reverse(DbgUsers)) { if (DII->getParent() == SrcBlock) { + if (isa(DII)) { + // A dbg.declare instruction should not be cloned, since there can only be + // one per variable fragment. It should be left in the original place since + // sunk instruction is not an alloca(otherwise we could not be here). + // But we need to update arguments of dbg.declare instruction, so that it + // would not point into sunk instruction. + if (!isa(I)) + continue; // dbg.declare points at something it shouldn't + + DII->setOperand( + 0, MetadataAsValue::get(I->getContext(), + ValueAsMetadata::get(I->getOperand(0)))); + continue; + } + // dbg.value is in the same basic block as the sunk inst, see if we can // salvage it. Clone a new copy of the instruction: on success we need // both salvaged and unsalvaged copies. @@ -3580,7 +3601,7 @@ bool InstructionCombiningPass::runOnFunction(Function &F) { // Required analyses. auto AA = &getAnalysis().getAAResults(); auto &AC = getAnalysis().getAssumptionCache(F); - auto &TLI = getAnalysis().getTLI(); + auto &TLI = getAnalysis().getTLI(F); auto &DT = getAnalysis().getDomTree(); auto &ORE = getAnalysis().getORE(); diff --git a/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/lib/Transforms/Instrumentation/AddressSanitizer.cpp index 6821e214e921..d92ee11c2e1a 100644 --- a/lib/Transforms/Instrumentation/AddressSanitizer.cpp +++ b/lib/Transforms/Instrumentation/AddressSanitizer.cpp @@ -129,6 +129,8 @@ static const uintptr_t kRetiredStackFrameMagic = 0x45E0360E; static const char *const kAsanModuleCtorName = "asan.module_ctor"; static const char *const kAsanModuleDtorName = "asan.module_dtor"; static const uint64_t kAsanCtorAndDtorPriority = 1; +// On Emscripten, the system needs more than one priorities for constructors. 
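The visitBranchInst hunk above keeps the existing br (not X) canonicalization but matches the destinations with m_BasicBlock() since they are no longer needed as captures; the swap itself is unchanged. A tiny standalone model of that rewrite (BranchModel is illustrative, not an LLVM type):

#include <utility>

struct BranchModel {
  bool CondIsNegated; // models the "br (not X), T, F" form
  int TrueDest, FalseDest;
};

static void foldNegatedCondition(BranchModel &BI) {
  if (!BI.CondIsNegated)
    return;
  BI.CondIsNegated = false;             // branch on X directly instead of !X
  std::swap(BI.TrueDest, BI.FalseDest); // swap successors to compensate
}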
+static const uint64_t kAsanEmscriptenCtorAndDtorPriority = 50; static const char *const kAsanReportErrorTemplate = "__asan_report_"; static const char *const kAsanRegisterGlobalsName = "__asan_register_globals"; static const char *const kAsanUnregisterGlobalsName = @@ -191,6 +193,11 @@ static cl::opt ClRecover( cl::desc("Enable recovery mode (continue-after-error)."), cl::Hidden, cl::init(false)); +static cl::opt ClInsertVersionCheck( + "asan-guard-against-version-mismatch", + cl::desc("Guard against compiler/runtime version mismatch."), + cl::Hidden, cl::init(true)); + // This flag may need to be replaced with -f[no-]asan-reads. static cl::opt ClInstrumentReads("asan-instrument-reads", cl::desc("instrument read instructions"), @@ -530,6 +537,14 @@ static size_t RedzoneSizeForScale(int MappingScale) { return std::max(32U, 1U << MappingScale); } +static uint64_t GetCtorAndDtorPriority(Triple &TargetTriple) { + if (TargetTriple.isOSEmscripten()) { + return kAsanEmscriptenCtorAndDtorPriority; + } else { + return kAsanCtorAndDtorPriority; + } +} + namespace { /// Module analysis for getting various metadata about the module. @@ -565,10 +580,10 @@ char ASanGlobalsMetadataWrapperPass::ID = 0; /// AddressSanitizer: instrument the code in module to find memory bugs. struct AddressSanitizer { - AddressSanitizer(Module &M, GlobalsMetadata &GlobalsMD, + AddressSanitizer(Module &M, const GlobalsMetadata *GlobalsMD, bool CompileKernel = false, bool Recover = false, bool UseAfterScope = false) - : UseAfterScope(UseAfterScope || ClUseAfterScope), GlobalsMD(GlobalsMD) { + : UseAfterScope(UseAfterScope || ClUseAfterScope), GlobalsMD(*GlobalsMD) { this->Recover = ClRecover.getNumOccurrences() > 0 ? ClRecover : Recover; this->CompileKernel = ClEnableKasan.getNumOccurrences() > 0 ? ClEnableKasan : CompileKernel; @@ -677,7 +692,7 @@ private: FunctionCallee AsanMemmove, AsanMemcpy, AsanMemset; InlineAsm *EmptyAsm; Value *LocalDynamicShadow = nullptr; - GlobalsMetadata GlobalsMD; + const GlobalsMetadata &GlobalsMD; DenseMap ProcessedAllocas; }; @@ -706,8 +721,8 @@ public: GlobalsMetadata &GlobalsMD = getAnalysis().getGlobalsMD(); const TargetLibraryInfo *TLI = - &getAnalysis().getTLI(); - AddressSanitizer ASan(*F.getParent(), GlobalsMD, CompileKernel, Recover, + &getAnalysis().getTLI(F); + AddressSanitizer ASan(*F.getParent(), &GlobalsMD, CompileKernel, Recover, UseAfterScope); return ASan.instrumentFunction(F, TLI); } @@ -720,10 +735,10 @@ private: class ModuleAddressSanitizer { public: - ModuleAddressSanitizer(Module &M, GlobalsMetadata &GlobalsMD, + ModuleAddressSanitizer(Module &M, const GlobalsMetadata *GlobalsMD, bool CompileKernel = false, bool Recover = false, bool UseGlobalsGC = true, bool UseOdrIndicator = false) - : GlobalsMD(GlobalsMD), UseGlobalsGC(UseGlobalsGC && ClUseGlobalsGC), + : GlobalsMD(*GlobalsMD), UseGlobalsGC(UseGlobalsGC && ClUseGlobalsGC), // Enable aliases as they should have no downside with ODR indicators. 
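A standalone sketch of the priority selection that the new kAsanEmscriptenCtorAndDtorPriority constant and the GetCtorAndDtorPriority() helper (defined further down in this hunk) implement: Emscripten needs the sanitizer ctor/dtor registered at priority 50 rather than the default 1, so the priority becomes a per-target query instead of a single constant. The OSKind enum is an illustrative stand-in for querying llvm::Triple::isOSEmscripten():

#include <cstdint>

enum class OSKind { Emscripten, Other };

static constexpr uint64_t kDefaultCtorDtorPriority = 1;
static constexpr uint64_t kEmscriptenCtorDtorPriority = 50;

static uint64_t ctorAndDtorPriority(OSKind OS) {
  return OS == OSKind::Emscripten ? kEmscriptenCtorDtorPriority
                                  : kDefaultCtorDtorPriority;
}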
UsePrivateAlias(UseOdrIndicator || ClUsePrivateAlias), UseOdrIndicator(UseOdrIndicator || ClUseOdrIndicator), @@ -783,7 +798,7 @@ private: } int GetAsanVersion(const Module &M) const; - GlobalsMetadata GlobalsMD; + const GlobalsMetadata &GlobalsMD; bool CompileKernel; bool Recover; bool UseGlobalsGC; @@ -830,7 +845,7 @@ public: bool runOnModule(Module &M) override { GlobalsMetadata &GlobalsMD = getAnalysis().getGlobalsMD(); - ModuleAddressSanitizer ASanModule(M, GlobalsMD, CompileKernel, Recover, + ModuleAddressSanitizer ASanModule(M, &GlobalsMD, CompileKernel, Recover, UseGlobalGC, UseOdrIndicator); return ASanModule.instrumentModule(M); } @@ -1033,7 +1048,7 @@ struct FunctionStackPoisoner : public InstVisitor { if (!II.isLifetimeStartOrEnd()) return; // Found lifetime intrinsic, add ASan instrumentation if necessary. - ConstantInt *Size = dyn_cast(II.getArgOperand(0)); + auto *Size = cast(II.getArgOperand(0)); // If size argument is undefined, don't do anything. if (Size->isMinusOne()) return; // Check that size doesn't saturate uint64_t and can @@ -1156,7 +1171,7 @@ PreservedAnalyses AddressSanitizerPass::run(Function &F, Module &M = *F.getParent(); if (auto *R = MAM.getCachedResult(M)) { const TargetLibraryInfo *TLI = &AM.getResult(F); - AddressSanitizer Sanitizer(M, *R, CompileKernel, Recover, UseAfterScope); + AddressSanitizer Sanitizer(M, R, CompileKernel, Recover, UseAfterScope); if (Sanitizer.instrumentFunction(F, TLI)) return PreservedAnalyses::none(); return PreservedAnalyses::all(); @@ -1178,7 +1193,7 @@ ModuleAddressSanitizerPass::ModuleAddressSanitizerPass(bool CompileKernel, PreservedAnalyses ModuleAddressSanitizerPass::run(Module &M, AnalysisManager &AM) { GlobalsMetadata &GlobalsMD = AM.getResult(M); - ModuleAddressSanitizer Sanitizer(M, GlobalsMD, CompileKernel, Recover, + ModuleAddressSanitizer Sanitizer(M, &GlobalsMD, CompileKernel, Recover, UseGlobalGC, UseOdrIndicator); if (Sanitizer.instrumentModule(M)) return PreservedAnalyses::none(); @@ -1331,7 +1346,7 @@ Value *AddressSanitizer::isInterestingMemoryAccess(Instruction *I, unsigned *Alignment, Value **MaybeMask) { // Skip memory accesses inserted by another instrumentation. - if (I->getMetadata("nosanitize")) return nullptr; + if (I->hasMetadata("nosanitize")) return nullptr; // Do not instrument the load fetching the dynamic shadow address. if (LocalDynamicShadow == I) @@ -1775,9 +1790,10 @@ void ModuleAddressSanitizer::createInitializerPoisonCalls( // Must have a function or null ptr. if (Function *F = dyn_cast(CS->getOperand(1))) { if (F->getName() == kAsanModuleCtorName) continue; - ConstantInt *Priority = dyn_cast(CS->getOperand(0)); + auto *Priority = cast(CS->getOperand(0)); // Don't instrument CTORs that will run before asan.module_ctor. 
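Several hunks above switch AddressSanitizer and ModuleAddressSanitizer from taking GlobalsMetadata by value to taking a pointer and storing a const reference, so a per-function sanitizer instance no longer copies the analysis result. A standalone sketch of that ownership shape; GlobalsInfo and SanitizerModel are illustrative stand-ins, not the LLVM classes:

#include <cstddef>
#include <map>
#include <string>

struct GlobalsInfo {
  std::map<std::string, int> Entries; // stands in for the per-global metadata
};

class SanitizerModel {
  const GlobalsInfo &GlobalsMD; // borrowed from the analysis, not copied

public:
  explicit SanitizerModel(const GlobalsInfo *MD) : GlobalsMD(*MD) {}
  std::size_t numKnownGlobals() const { return GlobalsMD.Entries.size(); }
};

The analysis (legacy wrapper pass or new-pass-manager result) keeps owning the data; the sanitizer only borrows it for the duration of instrumentation.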
- if (Priority->getLimitedValue() <= kAsanCtorAndDtorPriority) continue; + if (Priority->getLimitedValue() <= GetCtorAndDtorPriority(TargetTriple)) + continue; poisonOneInitializer(*F, ModuleName); } } @@ -1919,7 +1935,12 @@ StringRef ModuleAddressSanitizer::getGlobalMetadataSection() const { case Triple::COFF: return ".ASAN$GL"; case Triple::ELF: return "asan_globals"; case Triple::MachO: return "__DATA,__asan_globals,regular"; - default: break; + case Triple::Wasm: + case Triple::XCOFF: + report_fatal_error( + "ModuleAddressSanitizer not implemented for object file format."); + case Triple::UnknownObjectFormat: + break; } llvm_unreachable("unsupported object format"); } @@ -2033,7 +2054,7 @@ void ModuleAddressSanitizer::InstrumentGlobalsCOFF( unsigned SizeOfGlobalStruct = DL.getTypeAllocSize(Initializer->getType()); assert(isPowerOf2_32(SizeOfGlobalStruct) && "global metadata will not be padded appropriately"); - Metadata->setAlignment(SizeOfGlobalStruct); + Metadata->setAlignment(assumeAligned(SizeOfGlobalStruct)); SetComdatForGlobalMetadata(G, Metadata, ""); } @@ -2170,7 +2191,7 @@ void ModuleAddressSanitizer::InstrumentGlobalsWithMetadataArray( M, ArrayOfGlobalStructTy, false, GlobalVariable::InternalLinkage, ConstantArray::get(ArrayOfGlobalStructTy, MetadataInitializers), ""); if (Mapping.Scale > 3) - AllGlobals->setAlignment(1ULL << Mapping.Scale); + AllGlobals->setAlignment(Align(1ULL << Mapping.Scale)); IRB.CreateCall(AsanRegisterGlobals, {IRB.CreatePointerCast(AllGlobals, IntptrTy), @@ -2270,7 +2291,7 @@ bool ModuleAddressSanitizer::InstrumentGlobals(IRBuilder<> &IRB, Module &M, "", G, G->getThreadLocalMode()); NewGlobal->copyAttributesFrom(G); NewGlobal->setComdat(G->getComdat()); - NewGlobal->setAlignment(MinRZ); + NewGlobal->setAlignment(MaybeAlign(MinRZ)); // Don't fold globals with redzones. ODR violation detector and redzone // poisoning implicitly creates a dependence on the global's address, so it // is no longer valid for it to be marked unnamed_addr. @@ -2338,7 +2359,7 @@ bool ModuleAddressSanitizer::InstrumentGlobals(IRBuilder<> &IRB, Module &M, // Set meaningful attributes for indicator symbol. ODRIndicatorSym->setVisibility(NewGlobal->getVisibility()); ODRIndicatorSym->setDLLStorageClass(NewGlobal->getDLLStorageClass()); - ODRIndicatorSym->setAlignment(1); + ODRIndicatorSym->setAlignment(Align::None()); ODRIndicator = ODRIndicatorSym; } @@ -2410,39 +2431,39 @@ bool ModuleAddressSanitizer::instrumentModule(Module &M) { // Create a module constructor. A destructor is created lazily because not all // platforms, and not all modules need it. + std::string AsanVersion = std::to_string(GetAsanVersion(M)); std::string VersionCheckName = - kAsanVersionCheckNamePrefix + std::to_string(GetAsanVersion(M)); + ClInsertVersionCheck ? (kAsanVersionCheckNamePrefix + AsanVersion) : ""; std::tie(AsanCtorFunction, std::ignore) = createSanitizerCtorAndInitFunctions( M, kAsanModuleCtorName, kAsanInitName, /*InitArgTypes=*/{}, /*InitArgs=*/{}, VersionCheckName); bool CtorComdat = true; - bool Changed = false; // TODO(glider): temporarily disabled globals instrumentation for KASan. if (ClGlobals) { IRBuilder<> IRB(AsanCtorFunction->getEntryBlock().getTerminator()); - Changed |= InstrumentGlobals(IRB, M, &CtorComdat); + InstrumentGlobals(IRB, M, &CtorComdat); } + const uint64_t Priority = GetCtorAndDtorPriority(TargetTriple); + // Put the constructor and destructor in comdat if both // (1) global instrumentation is not TU-specific // (2) target is ELF. 
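The setAlignment() changes above are part of the migration away from raw integers for alignments: an Align always holds a known power of two (Align::None() being an alignment of one, replacing explicit setAlignment(1)), while a MaybeAlign may be unset, replacing the old convention of 0 meaning "unspecified". A rough standalone model of the distinction using std::optional; these types are stand-ins, not the ones from llvm/Support/Alignment.h:

#include <cassert>
#include <cstdint>
#include <optional>

struct Align {
  uint64_t Value;
  explicit Align(uint64_t V) : Value(V) {
    assert(V != 0 && (V & (V - 1)) == 0 && "alignment must be a power of two");
  }
};

using MaybeAlign = std::optional<Align>;

// The old convention encoded "unknown" as 0; the optional makes that case
// explicit instead of overloading the integer.
static MaybeAlign fromLegacy(uint64_t OldValue) {
  return OldValue == 0 ? MaybeAlign() : MaybeAlign(Align(OldValue));
}

Call sites that know the value is a non-zero power of two construct an Align directly; call sites that may still see 0 wrap the value as shown.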
if (UseCtorComdat && TargetTriple.isOSBinFormatELF() && CtorComdat) { AsanCtorFunction->setComdat(M.getOrInsertComdat(kAsanModuleCtorName)); - appendToGlobalCtors(M, AsanCtorFunction, kAsanCtorAndDtorPriority, - AsanCtorFunction); + appendToGlobalCtors(M, AsanCtorFunction, Priority, AsanCtorFunction); if (AsanDtorFunction) { AsanDtorFunction->setComdat(M.getOrInsertComdat(kAsanModuleDtorName)); - appendToGlobalDtors(M, AsanDtorFunction, kAsanCtorAndDtorPriority, - AsanDtorFunction); + appendToGlobalDtors(M, AsanDtorFunction, Priority, AsanDtorFunction); } } else { - appendToGlobalCtors(M, AsanCtorFunction, kAsanCtorAndDtorPriority); + appendToGlobalCtors(M, AsanCtorFunction, Priority); if (AsanDtorFunction) - appendToGlobalDtors(M, AsanDtorFunction, kAsanCtorAndDtorPriority); + appendToGlobalDtors(M, AsanDtorFunction, Priority); } - return Changed; + return true; } void AddressSanitizer::initializeCallbacks(Module &M) { @@ -2664,7 +2685,7 @@ bool AddressSanitizer::instrumentFunction(Function &F, if (CS) { // A call inside BB. TempsToInstrument.clear(); - if (CS.doesNotReturn() && !CS->getMetadata("nosanitize")) + if (CS.doesNotReturn() && !CS->hasMetadata("nosanitize")) NoReturnCalls.push_back(CS.getInstruction()); } if (CallInst *CI = dyn_cast(&Inst)) @@ -2877,18 +2898,19 @@ void FunctionStackPoisoner::copyArgsPassedByValToAllocas() { for (Argument &Arg : F.args()) { if (Arg.hasByValAttr()) { Type *Ty = Arg.getType()->getPointerElementType(); - unsigned Align = Arg.getParamAlignment(); - if (Align == 0) Align = DL.getABITypeAlignment(Ty); + unsigned Alignment = Arg.getParamAlignment(); + if (Alignment == 0) + Alignment = DL.getABITypeAlignment(Ty); AllocaInst *AI = IRB.CreateAlloca( Ty, nullptr, (Arg.hasName() ? Arg.getName() : "Arg" + Twine(Arg.getArgNo())) + ".byval"); - AI->setAlignment(Align); + AI->setAlignment(Align(Alignment)); Arg.replaceAllUsesWith(AI); uint64_t AllocSize = DL.getTypeAllocSize(Ty); - IRB.CreateMemCpy(AI, Align, &Arg, Align, AllocSize); + IRB.CreateMemCpy(AI, Alignment, &Arg, Alignment, AllocSize); } } } @@ -2919,7 +2941,7 @@ Value *FunctionStackPoisoner::createAllocaForLayout( } assert((ClRealignStack & (ClRealignStack - 1)) == 0); size_t FrameAlignment = std::max(L.FrameAlignment, (size_t)ClRealignStack); - Alloca->setAlignment(FrameAlignment); + Alloca->setAlignment(MaybeAlign(FrameAlignment)); return IRB.CreatePointerCast(Alloca, IntptrTy); } @@ -2928,7 +2950,7 @@ void FunctionStackPoisoner::createDynamicAllocasInitStorage() { IRBuilder<> IRB(dyn_cast(FirstBB.begin())); DynamicAllocaLayout = IRB.CreateAlloca(IntptrTy, nullptr); IRB.CreateStore(Constant::getNullValue(IntptrTy), DynamicAllocaLayout); - DynamicAllocaLayout->setAlignment(32); + DynamicAllocaLayout->setAlignment(Align(32)); } void FunctionStackPoisoner::processDynamicAllocas() { @@ -3275,7 +3297,7 @@ void FunctionStackPoisoner::handleDynamicAllocaCall(AllocaInst *AI) { // Insert new alloca with new NewSize and Align params. 
AllocaInst *NewAlloca = IRB.CreateAlloca(IRB.getInt8Ty(), NewSize); - NewAlloca->setAlignment(Align); + NewAlloca->setAlignment(MaybeAlign(Align)); // NewAddress = Address + Align Value *NewAddress = IRB.CreateAdd(IRB.CreatePtrToInt(NewAlloca, IntptrTy), diff --git a/lib/Transforms/Instrumentation/BoundsChecking.cpp b/lib/Transforms/Instrumentation/BoundsChecking.cpp index 4dc9b611c156..ae34be986537 100644 --- a/lib/Transforms/Instrumentation/BoundsChecking.cpp +++ b/lib/Transforms/Instrumentation/BoundsChecking.cpp @@ -224,7 +224,7 @@ struct BoundsCheckingLegacyPass : public FunctionPass { } bool runOnFunction(Function &F) override { - auto &TLI = getAnalysis().getTLI(); + auto &TLI = getAnalysis().getTLI(F); auto &SE = getAnalysis().getSE(); return addBoundsChecking(F, TLI, SE); } diff --git a/lib/Transforms/Instrumentation/CFGMST.h b/lib/Transforms/Instrumentation/CFGMST.h index 971e00041762..8bb6f47c4846 100644 --- a/lib/Transforms/Instrumentation/CFGMST.h +++ b/lib/Transforms/Instrumentation/CFGMST.h @@ -257,13 +257,13 @@ public: std::tie(Iter, Inserted) = BBInfos.insert(std::make_pair(Src, nullptr)); if (Inserted) { // Newly inserted, update the real info. - Iter->second = std::move(llvm::make_unique(Index)); + Iter->second = std::move(std::make_unique(Index)); Index++; } std::tie(Iter, Inserted) = BBInfos.insert(std::make_pair(Dest, nullptr)); if (Inserted) // Newly inserted, update the real info. - Iter->second = std::move(llvm::make_unique(Index)); + Iter->second = std::move(std::make_unique(Index)); AllEdges.emplace_back(new Edge(Src, Dest, W)); return *AllEdges.back(); } diff --git a/lib/Transforms/Instrumentation/ControlHeightReduction.cpp b/lib/Transforms/Instrumentation/ControlHeightReduction.cpp index 3f4f9bc7145d..55c64fa4b727 100644 --- a/lib/Transforms/Instrumentation/ControlHeightReduction.cpp +++ b/lib/Transforms/Instrumentation/ControlHeightReduction.cpp @@ -512,30 +512,38 @@ static bool isHoistable(Instruction *I, DominatorTree &DT) { // first-region entry block) or the (hoistable or unhoistable) base values that // are defined outside (including the first-region entry block) of the // scope. The returned set doesn't include constants. -static std::set getBaseValues(Value *V, - DominatorTree &DT) { +static std::set getBaseValues( + Value *V, DominatorTree &DT, + DenseMap> &Visited) { + if (Visited.count(V)) { + return Visited[V]; + } std::set Result; if (auto *I = dyn_cast(V)) { // We don't stop at a block that's not in the Scope because we would miss some // instructions that are based on the same base values if we stop there. if (!isHoistable(I, DT)) { Result.insert(I); + Visited.insert(std::make_pair(V, Result)); return Result; } // I is hoistable above the Scope. for (Value *Op : I->operands()) { - std::set OpResult = getBaseValues(Op, DT); + std::set OpResult = getBaseValues(Op, DT, Visited); Result.insert(OpResult.begin(), OpResult.end()); } + Visited.insert(std::make_pair(V, Result)); return Result; } if (isa(V)) { Result.insert(V); + Visited.insert(std::make_pair(V, Result)); return Result; } // We don't include others like constants because those won't lead to any // chance of folding of conditions (eg two bit checks merged into one check) // after CHR. + Visited.insert(std::make_pair(V, Result)); return Result; // empty } @@ -1078,12 +1086,13 @@ static bool shouldSplit(Instruction *InsertPoint, if (!PrevConditionValues.empty() && !ConditionValues.empty()) { // Use std::set as DenseSet doesn't work with set_intersection. 
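The getBaseValues() change above threads a Visited cache through the recursion so shared subexpressions are computed once instead of being re-walked from every user. A simplified standalone model of that memoization over an expression DAG; Node stands in for llvm::Value, and leaves play the role of unhoistable instructions and arguments:

#include <map>
#include <set>
#include <vector>

struct Node {
  bool IsLeaf = false;
  std::vector<const Node *> Operands;
};

static std::set<const Node *>
baseValues(const Node *N,
           std::map<const Node *, std::set<const Node *>> &Visited) {
  auto It = Visited.find(N);
  if (It != Visited.end())
    return It->second;                 // reuse the cached result
  std::set<const Node *> Result;
  if (N->IsLeaf) {
    Result.insert(N);
  } else {
    for (const Node *Op : N->Operands) {
      std::set<const Node *> OpResult = baseValues(Op, Visited);
      Result.insert(OpResult.begin(), OpResult.end());
    }
  }
  Visited[N] = Result;                 // cache before returning
  return Result;
}

Without the cache, a walk like this can become exponential on heavily shared DAGs, which is the compile-time problem the added Visited map guards against.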
std::set PrevBases, Bases; + DenseMap> Visited; for (Value *V : PrevConditionValues) { - std::set BaseValues = getBaseValues(V, DT); + std::set BaseValues = getBaseValues(V, DT, Visited); PrevBases.insert(BaseValues.begin(), BaseValues.end()); } for (Value *V : ConditionValues) { - std::set BaseValues = getBaseValues(V, DT); + std::set BaseValues = getBaseValues(V, DT, Visited); Bases.insert(BaseValues.begin(), BaseValues.end()); } CHR_DEBUG( @@ -1538,10 +1547,7 @@ static bool negateICmpIfUsedByBranchOrSelectOnly(ICmpInst *ICmp, } if (auto *SI = dyn_cast(U)) { // Swap operands - Value *TrueValue = SI->getTrueValue(); - Value *FalseValue = SI->getFalseValue(); - SI->setTrueValue(FalseValue); - SI->setFalseValue(TrueValue); + SI->swapValues(); SI->swapProfMetadata(); if (Scope->TrueBiasedSelects.count(SI)) { assert(Scope->FalseBiasedSelects.count(SI) == 0 && @@ -2073,7 +2079,7 @@ bool ControlHeightReductionLegacyPass::runOnFunction(Function &F) { getAnalysis().getPSI(); RegionInfo &RI = getAnalysis().getRegionInfo(); std::unique_ptr OwnedORE = - llvm::make_unique(&F); + std::make_unique(&F); return CHR(F, BFI, DT, PSI, RI, *OwnedORE.get()).run(); } diff --git a/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp b/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp index 2279c1bcb6a8..c0353cba0b2f 100644 --- a/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp +++ b/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp @@ -1212,7 +1212,7 @@ Value *DFSanFunction::loadShadow(Value *Addr, uint64_t Size, uint64_t Align, return DFS.ZeroShadow; case 1: { LoadInst *LI = new LoadInst(DFS.ShadowTy, ShadowAddr, "", Pos); - LI->setAlignment(ShadowAlign); + LI->setAlignment(MaybeAlign(ShadowAlign)); return LI; } case 2: { diff --git a/lib/Transforms/Instrumentation/GCOVProfiling.cpp b/lib/Transforms/Instrumentation/GCOVProfiling.cpp index 59950ffc4e9a..ac6082441eae 100644 --- a/lib/Transforms/Instrumentation/GCOVProfiling.cpp +++ b/lib/Transforms/Instrumentation/GCOVProfiling.cpp @@ -86,7 +86,9 @@ public: ReversedVersion[3] = Options.Version[0]; ReversedVersion[4] = '\0'; } - bool runOnModule(Module &M, const TargetLibraryInfo &TLI); + bool + runOnModule(Module &M, + std::function GetTLI); private: // Create the .gcno files for the Module based on DebugInfo. @@ -102,9 +104,9 @@ private: std::vector &Regexes); // Get pointers to the functions in the runtime library. 
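From here on, several passes (GCOVProfiler, InstrProfiling, and the legacy sanitizer wrappers) stop holding one module-wide TargetLibraryInfo and instead carry a callback that produces the per-function analysis on demand. A standalone sketch of that shape; Analysis, Fn, and ProfilerModel are illustrative stand-ins, not LLVM classes:

#include <functional>
#include <string>
#include <utility>

struct Analysis { /* per-function target library info */ };
struct Fn { std::string Name; };

class ProfilerModel {
  std::function<Analysis &(Fn &)> GetAnalysis; // queried lazily, per function

public:
  explicit ProfilerModel(std::function<Analysis &(Fn &)> Getter)
      : GetAnalysis(std::move(Getter)) {}

  void visit(Fn &F) {
    Analysis &A = GetAnalysis(F); // no module-wide analysis object is kept
    (void)A;
  }
};

A legacy pass builds the getter from getAnalysis on the wrapper pass, while the new pass manager builds it from the FunctionAnalysisManager obtained through the module proxy, which is exactly what the GCOV and InstrProfiling hunks below do.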
- FunctionCallee getStartFileFunc(); - FunctionCallee getEmitFunctionFunc(); - FunctionCallee getEmitArcsFunc(); + FunctionCallee getStartFileFunc(const TargetLibraryInfo *TLI); + FunctionCallee getEmitFunctionFunc(const TargetLibraryInfo *TLI); + FunctionCallee getEmitArcsFunc(const TargetLibraryInfo *TLI); FunctionCallee getSummaryInfoFunc(); FunctionCallee getEndFileFunc(); @@ -127,7 +129,7 @@ private: SmallVector FileChecksums; Module *M; - const TargetLibraryInfo *TLI; + std::function GetTLI; LLVMContext *Ctx; SmallVector, 16> Funcs; std::vector FilterRe; @@ -147,8 +149,9 @@ public: StringRef getPassName() const override { return "GCOV Profiler"; } bool runOnModule(Module &M) override { - auto &TLI = getAnalysis().getTLI(); - return Profiler.runOnModule(M, TLI); + return Profiler.runOnModule(M, [this](Function &F) -> TargetLibraryInfo & { + return getAnalysis().getTLI(F); + }); } void getAnalysisUsage(AnalysisUsage &AU) const override { @@ -555,9 +558,10 @@ std::string GCOVProfiler::mangleName(const DICompileUnit *CU, return CurPath.str(); } -bool GCOVProfiler::runOnModule(Module &M, const TargetLibraryInfo &TLI) { +bool GCOVProfiler::runOnModule( + Module &M, std::function GetTLI) { this->M = &M; - this->TLI = &TLI; + this->GetTLI = std::move(GetTLI); Ctx = &M.getContext(); AddFlushBeforeForkAndExec(); @@ -574,9 +578,12 @@ PreservedAnalyses GCOVProfilerPass::run(Module &M, ModuleAnalysisManager &AM) { GCOVProfiler Profiler(GCOVOpts); + FunctionAnalysisManager &FAM = + AM.getResult(M).getManager(); - auto &TLI = AM.getResult(M); - if (!Profiler.runOnModule(M, TLI)) + if (!Profiler.runOnModule(M, [&](Function &F) -> TargetLibraryInfo & { + return FAM.getResult(F); + })) return PreservedAnalyses::all(); return PreservedAnalyses::none(); @@ -624,6 +631,7 @@ static bool shouldKeepInEntry(BasicBlock::iterator It) { void GCOVProfiler::AddFlushBeforeForkAndExec() { SmallVector ForkAndExecs; for (auto &F : M->functions()) { + auto *TLI = &GetTLI(F); for (auto &I : instructions(F)) { if (CallInst *CI = dyn_cast(&I)) { if (Function *Callee = CI->getCalledFunction()) { @@ -669,7 +677,8 @@ void GCOVProfiler::emitProfileNotes() { continue; std::error_code EC; - raw_fd_ostream out(mangleName(CU, GCovFileType::GCNO), EC, sys::fs::F_None); + raw_fd_ostream out(mangleName(CU, GCovFileType::GCNO), EC, + sys::fs::OF_None); if (EC) { Ctx->emitError(Twine("failed to open coverage notes file for writing: ") + EC.message()); @@ -695,7 +704,7 @@ void GCOVProfiler::emitProfileNotes() { ++It; EntryBlock.splitBasicBlock(It); - Funcs.push_back(make_unique(SP, &F, &out, FunctionIdent++, + Funcs.push_back(std::make_unique(SP, &F, &out, FunctionIdent++, Options.UseCfgChecksum, Options.ExitBlockBeforeBody)); GCOVFunction &Func = *Funcs.back(); @@ -873,7 +882,7 @@ bool GCOVProfiler::emitProfileArcs() { return Result; } -FunctionCallee GCOVProfiler::getStartFileFunc() { +FunctionCallee GCOVProfiler::getStartFileFunc(const TargetLibraryInfo *TLI) { Type *Args[] = { Type::getInt8PtrTy(*Ctx), // const char *orig_filename Type::getInt8PtrTy(*Ctx), // const char version[4] @@ -887,7 +896,7 @@ FunctionCallee GCOVProfiler::getStartFileFunc() { return Res; } -FunctionCallee GCOVProfiler::getEmitFunctionFunc() { +FunctionCallee GCOVProfiler::getEmitFunctionFunc(const TargetLibraryInfo *TLI) { Type *Args[] = { Type::getInt32Ty(*Ctx), // uint32_t ident Type::getInt8PtrTy(*Ctx), // const char *function_name @@ -906,7 +915,7 @@ FunctionCallee GCOVProfiler::getEmitFunctionFunc() { return 
M->getOrInsertFunction("llvm_gcda_emit_function", FTy); } -FunctionCallee GCOVProfiler::getEmitArcsFunc() { +FunctionCallee GCOVProfiler::getEmitArcsFunc(const TargetLibraryInfo *TLI) { Type *Args[] = { Type::getInt32Ty(*Ctx), // uint32_t num_counters Type::getInt64PtrTy(*Ctx), // uint64_t *counters @@ -943,9 +952,11 @@ Function *GCOVProfiler::insertCounterWriteout( BasicBlock *BB = BasicBlock::Create(*Ctx, "entry", WriteoutF); IRBuilder<> Builder(BB); - FunctionCallee StartFile = getStartFileFunc(); - FunctionCallee EmitFunction = getEmitFunctionFunc(); - FunctionCallee EmitArcs = getEmitArcsFunc(); + auto *TLI = &GetTLI(*WriteoutF); + + FunctionCallee StartFile = getStartFileFunc(TLI); + FunctionCallee EmitFunction = getEmitFunctionFunc(TLI); + FunctionCallee EmitArcs = getEmitArcsFunc(TLI); FunctionCallee SummaryInfo = getSummaryInfoFunc(); FunctionCallee EndFile = getEndFileFunc(); diff --git a/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp b/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp index 90a9f4955a4b..f87132ee4758 100644 --- a/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp +++ b/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp @@ -12,10 +12,12 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Instrumentation/HWAddressSanitizer.h" +#include "llvm/ADT/MapVector.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/Triple.h" +#include "llvm/BinaryFormat/ELF.h" #include "llvm/IR/Attributes.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constant.h" @@ -52,7 +54,10 @@ using namespace llvm; #define DEBUG_TYPE "hwasan" static const char *const kHwasanModuleCtorName = "hwasan.module_ctor"; +static const char *const kHwasanNoteName = "hwasan.note"; static const char *const kHwasanInitName = "__hwasan_init"; +static const char *const kHwasanPersonalityThunkName = + "__hwasan_personality_thunk"; static const char *const kHwasanShadowMemoryDynamicAddress = "__hwasan_shadow_memory_dynamic_address"; @@ -112,6 +117,9 @@ static cl::opt ClGenerateTagsWithCalls( cl::desc("generate new tags with runtime library calls"), cl::Hidden, cl::init(false)); +static cl::opt ClGlobals("hwasan-globals", cl::desc("Instrument globals"), + cl::Hidden, cl::init(false)); + static cl::opt ClMatchAllTag( "hwasan-match-all-tag", cl::desc("don't report bad accesses via pointers with this tag"), @@ -155,8 +163,18 @@ static cl::opt static cl::opt ClInstrumentLandingPads("hwasan-instrument-landing-pads", - cl::desc("instrument landing pads"), cl::Hidden, - cl::init(true)); + cl::desc("instrument landing pads"), cl::Hidden, + cl::init(false), cl::ZeroOrMore); + +static cl::opt ClUseShortGranules( + "hwasan-use-short-granules", + cl::desc("use short granules in allocas and outlined checks"), cl::Hidden, + cl::init(false), cl::ZeroOrMore); + +static cl::opt ClInstrumentPersonalityFunctions( + "hwasan-instrument-personality-functions", + cl::desc("instrument personality functions"), cl::Hidden, cl::init(false), + cl::ZeroOrMore); static cl::opt ClInlineAllChecks("hwasan-inline-all-checks", cl::desc("inline all checks"), @@ -169,16 +187,16 @@ namespace { class HWAddressSanitizer { public: explicit HWAddressSanitizer(Module &M, bool CompileKernel = false, - bool Recover = false) { + bool Recover = false) : M(M) { this->Recover = ClRecover.getNumOccurrences() > 0 ? ClRecover : Recover; this->CompileKernel = ClEnableKhwasan.getNumOccurrences() > 0 ? 
ClEnableKhwasan : CompileKernel; - initializeModule(M); + initializeModule(); } bool sanitizeFunction(Function &F); - void initializeModule(Module &M); + void initializeModule(); void initializeCallbacks(Module &M); @@ -216,9 +234,14 @@ public: Value *getHwasanThreadSlotPtr(IRBuilder<> &IRB, Type *Ty); void emitPrologue(IRBuilder<> &IRB, bool WithFrameRecord); + void instrumentGlobal(GlobalVariable *GV, uint8_t Tag); + void instrumentGlobals(); + + void instrumentPersonalityFunctions(); + private: LLVMContext *C; - std::string CurModuleUniqueId; + Module &M; Triple TargetTriple; FunctionCallee HWAsanMemmove, HWAsanMemcpy, HWAsanMemset; FunctionCallee HWAsanHandleVfork; @@ -238,17 +261,21 @@ private: bool InTls; void init(Triple &TargetTriple); - unsigned getAllocaAlignment() const { return 1U << Scale; } + unsigned getObjectAlignment() const { return 1U << Scale; } }; ShadowMapping Mapping; + Type *VoidTy = Type::getVoidTy(M.getContext()); Type *IntptrTy; Type *Int8PtrTy; Type *Int8Ty; Type *Int32Ty; + Type *Int64Ty = Type::getInt64Ty(M.getContext()); bool CompileKernel; bool Recover; + bool UseShortGranules; + bool InstrumentLandingPads; Function *HwasanCtorFunction; @@ -278,7 +305,7 @@ public: StringRef getPassName() const override { return "HWAddressSanitizer"; } bool doInitialization(Module &M) override { - HWASan = llvm::make_unique(M, CompileKernel, Recover); + HWASan = std::make_unique(M, CompileKernel, Recover); return true; } @@ -333,7 +360,7 @@ PreservedAnalyses HWAddressSanitizerPass::run(Module &M, /// Module-level initialization. /// /// inserts a call to __hwasan_init to the module's constructor list. -void HWAddressSanitizer::initializeModule(Module &M) { +void HWAddressSanitizer::initializeModule() { LLVM_DEBUG(dbgs() << "Init " << M.getName() << "\n"); auto &DL = M.getDataLayout(); @@ -342,7 +369,6 @@ void HWAddressSanitizer::initializeModule(Module &M) { Mapping.init(TargetTriple); C = &(M.getContext()); - CurModuleUniqueId = getUniqueModuleId(&M); IRBuilder<> IRB(*C); IntptrTy = IRB.getIntPtrTy(DL); Int8PtrTy = IRB.getInt8PtrTy(); @@ -350,6 +376,21 @@ void HWAddressSanitizer::initializeModule(Module &M) { Int32Ty = IRB.getInt32Ty(); HwasanCtorFunction = nullptr; + + // Older versions of Android do not have the required runtime support for + // short granules, global or personality function instrumentation. On other + // platforms we currently require using the latest version of the runtime. + bool NewRuntime = + !TargetTriple.isAndroid() || !TargetTriple.isAndroidVersionLT(30); + + UseShortGranules = + ClUseShortGranules.getNumOccurrences() ? ClUseShortGranules : NewRuntime; + + // If we don't have personality function support, fall back to landing pads. + InstrumentLandingPads = ClInstrumentLandingPads.getNumOccurrences() + ? ClInstrumentLandingPads + : !NewRuntime; + if (!CompileKernel) { std::tie(HwasanCtorFunction, std::ignore) = getOrCreateSanitizerCtorAndInitFunctions( @@ -363,6 +404,18 @@ void HWAddressSanitizer::initializeModule(Module &M) { Ctor->setComdat(CtorComdat); appendToGlobalCtors(M, Ctor, 0, Ctor); }); + + bool InstrumentGlobals = + ClGlobals.getNumOccurrences() ? ClGlobals : NewRuntime; + if (InstrumentGlobals) + instrumentGlobals(); + + bool InstrumentPersonalityFunctions = + ClInstrumentPersonalityFunctions.getNumOccurrences() + ? 
ClInstrumentPersonalityFunctions + : NewRuntime; + if (InstrumentPersonalityFunctions) + instrumentPersonalityFunctions(); } if (!TargetTriple.isAndroid()) { @@ -456,7 +509,7 @@ Value *HWAddressSanitizer::isInterestingMemoryAccess(Instruction *I, unsigned *Alignment, Value **MaybeMask) { // Skip memory accesses inserted by another instrumentation. - if (I->getMetadata("nosanitize")) return nullptr; + if (I->hasMetadata("nosanitize")) return nullptr; // Do not instrument the load fetching the dynamic shadow address. if (LocalDynamicShadow == I) @@ -564,9 +617,11 @@ void HWAddressSanitizer::instrumentMemAccessInline(Value *Ptr, bool IsWrite, TargetTriple.isOSBinFormatELF() && !Recover) { Module *M = IRB.GetInsertBlock()->getParent()->getParent(); Ptr = IRB.CreateBitCast(Ptr, Int8PtrTy); - IRB.CreateCall( - Intrinsic::getDeclaration(M, Intrinsic::hwasan_check_memaccess), - {shadowBase(), Ptr, ConstantInt::get(Int32Ty, AccessInfo)}); + IRB.CreateCall(Intrinsic::getDeclaration( + M, UseShortGranules + ? Intrinsic::hwasan_check_memaccess_shortgranules + : Intrinsic::hwasan_check_memaccess), + {shadowBase(), Ptr, ConstantInt::get(Int32Ty, AccessInfo)}); return; } @@ -718,7 +773,9 @@ static uint64_t getAllocaSizeInBytes(const AllocaInst &AI) { bool HWAddressSanitizer::tagAlloca(IRBuilder<> &IRB, AllocaInst *AI, Value *Tag, size_t Size) { - size_t AlignedSize = alignTo(Size, Mapping.getAllocaAlignment()); + size_t AlignedSize = alignTo(Size, Mapping.getObjectAlignment()); + if (!UseShortGranules) + Size = AlignedSize; Value *JustTag = IRB.CreateTrunc(Tag, IRB.getInt8Ty()); if (ClInstrumentWithCalls) { @@ -738,7 +795,7 @@ bool HWAddressSanitizer::tagAlloca(IRBuilder<> &IRB, AllocaInst *AI, IRB.CreateMemSet(ShadowPtr, JustTag, ShadowSize, /*Align=*/1); if (Size != AlignedSize) { IRB.CreateStore( - ConstantInt::get(Int8Ty, Size % Mapping.getAllocaAlignment()), + ConstantInt::get(Int8Ty, Size % Mapping.getObjectAlignment()), IRB.CreateConstGEP1_32(Int8Ty, ShadowPtr, ShadowSize)); IRB.CreateStore(JustTag, IRB.CreateConstGEP1_32( Int8Ty, IRB.CreateBitCast(AI, Int8PtrTy), @@ -778,8 +835,9 @@ Value *HWAddressSanitizer::getStackBaseTag(IRBuilder<> &IRB) { // FIXME: use addressofreturnaddress (but implement it in aarch64 backend // first). Module *M = IRB.GetInsertBlock()->getParent()->getParent(); - auto GetStackPointerFn = - Intrinsic::getDeclaration(M, Intrinsic::frameaddress); + auto GetStackPointerFn = Intrinsic::getDeclaration( + M, Intrinsic::frameaddress, + IRB.getInt8PtrTy(M->getDataLayout().getAllocaAddrSpace())); Value *StackPointer = IRB.CreateCall( GetStackPointerFn, {Constant::getNullValue(IRB.getInt32Ty())}); @@ -912,8 +970,10 @@ void HWAddressSanitizer::emitPrologue(IRBuilder<> &IRB, bool WithFrameRecord) { PC = readRegister(IRB, "pc"); else PC = IRB.CreatePtrToInt(F, IntptrTy); - auto GetStackPointerFn = - Intrinsic::getDeclaration(F->getParent(), Intrinsic::frameaddress); + Module *M = F->getParent(); + auto GetStackPointerFn = Intrinsic::getDeclaration( + M, Intrinsic::frameaddress, + IRB.getInt8PtrTy(M->getDataLayout().getAllocaAddrSpace())); Value *SP = IRB.CreatePtrToInt( IRB.CreateCall(GetStackPointerFn, {Constant::getNullValue(IRB.getInt32Ty())}), @@ -999,11 +1059,8 @@ bool HWAddressSanitizer::instrumentStack( AI->hasName() ? AI->getName().str() : "alloca." 
+ itostr(N); Replacement->setName(Name + ".hwasan"); - for (auto UI = AI->use_begin(), UE = AI->use_end(); UI != UE;) { - Use &U = *UI++; - if (U.getUser() != AILong) - U.set(Replacement); - } + AI->replaceUsesWithIf(Replacement, + [AILong](Use &U) { return U.getUser() != AILong; }); for (auto *DDI : AllocaDeclareMap.lookup(AI)) { DIExpression *OldExpr = DDI->getExpression(); @@ -1020,7 +1077,7 @@ bool HWAddressSanitizer::instrumentStack( // Re-tag alloca memory with the special UAR tag. Value *Tag = getUARTag(IRB, StackTag); - tagAlloca(IRB, AI, Tag, alignTo(Size, Mapping.getAllocaAlignment())); + tagAlloca(IRB, AI, Tag, alignTo(Size, Mapping.getObjectAlignment())); } } @@ -1074,7 +1131,7 @@ bool HWAddressSanitizer::sanitizeFunction(Function &F) { if (auto *Alloca = dyn_cast_or_null(DDI->getAddress())) AllocaDeclareMap[Alloca].push_back(DDI); - if (ClInstrumentLandingPads && isa(Inst)) + if (InstrumentLandingPads && isa(Inst)) LandingPadVec.push_back(&Inst); Value *MaybeMask = nullptr; @@ -1093,6 +1150,13 @@ bool HWAddressSanitizer::sanitizeFunction(Function &F) { if (!LandingPadVec.empty()) instrumentLandingPads(LandingPadVec); + if (AllocasToInstrument.empty() && F.hasPersonalityFn() && + F.getPersonalityFn()->getName() == kHwasanPersonalityThunkName) { + // __hwasan_personality_thunk is a no-op for functions without an + // instrumented stack, so we can drop it. + F.setPersonalityFn(nullptr); + } + if (AllocasToInstrument.empty() && ToInstrument.empty()) return false; @@ -1118,8 +1182,9 @@ bool HWAddressSanitizer::sanitizeFunction(Function &F) { DenseMap AllocaToPaddedAllocaMap; for (AllocaInst *AI : AllocasToInstrument) { uint64_t Size = getAllocaSizeInBytes(*AI); - uint64_t AlignedSize = alignTo(Size, Mapping.getAllocaAlignment()); - AI->setAlignment(std::max(AI->getAlignment(), 16u)); + uint64_t AlignedSize = alignTo(Size, Mapping.getObjectAlignment()); + AI->setAlignment( + MaybeAlign(std::max(AI->getAlignment(), Mapping.getObjectAlignment()))); if (Size != AlignedSize) { Type *AllocatedType = AI->getAllocatedType(); if (AI->isArrayAllocation()) { @@ -1132,7 +1197,7 @@ bool HWAddressSanitizer::sanitizeFunction(Function &F) { auto *NewAI = new AllocaInst( TypeWithPadding, AI->getType()->getAddressSpace(), nullptr, "", AI); NewAI->takeName(AI); - NewAI->setAlignment(AI->getAlignment()); + NewAI->setAlignment(MaybeAlign(AI->getAlignment())); NewAI->setUsedWithInAlloca(AI->isUsedWithInAlloca()); NewAI->setSwiftError(AI->isSwiftError()); NewAI->copyMetadata(*AI); @@ -1179,6 +1244,257 @@ bool HWAddressSanitizer::sanitizeFunction(Function &F) { return Changed; } +void HWAddressSanitizer::instrumentGlobal(GlobalVariable *GV, uint8_t Tag) { + Constant *Initializer = GV->getInitializer(); + uint64_t SizeInBytes = + M.getDataLayout().getTypeAllocSize(Initializer->getType()); + uint64_t NewSize = alignTo(SizeInBytes, Mapping.getObjectAlignment()); + if (SizeInBytes != NewSize) { + // Pad the initializer out to the next multiple of 16 bytes and add the + // required short granule tag. 
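instrumentGlobal (continued just below) pads each tagged global out to a whole number of 16-byte granules and, when the final granule is only partially used, writes the tag into the last padding byte. This matches the short-granule convention used for stack objects: the shadow byte holds the used size, and the granule's last in-memory byte holds the real tag. A standalone sketch of the padding step; the byte vector stands in for the global's initializer:

#include <cstdint>
#include <vector>

static const uint64_t kGranule = 16;

static std::vector<uint8_t> padInitializer(std::vector<uint8_t> Data,
                                           uint8_t Tag) {
  uint64_t Size = Data.size();
  uint64_t NewSize = (Size + kGranule - 1) / kGranule * kGranule;
  if (NewSize != Size) {
    Data.resize(NewSize, 0); // zero padding up to the granule boundary
    Data.back() = Tag;       // short granule: last byte carries the real tag
  }
  return Data;
}

Globals whose size is already a multiple of the granule are left unpadded, mirroring the SizeInBytes != NewSize check in the hunk below.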
+ std::vector Init(NewSize - SizeInBytes, 0); + Init.back() = Tag; + Constant *Padding = ConstantDataArray::get(*C, Init); + Initializer = ConstantStruct::getAnon({Initializer, Padding}); + } + + auto *NewGV = new GlobalVariable(M, Initializer->getType(), GV->isConstant(), + GlobalValue::ExternalLinkage, Initializer, + GV->getName() + ".hwasan"); + NewGV->copyAttributesFrom(GV); + NewGV->setLinkage(GlobalValue::PrivateLinkage); + NewGV->copyMetadata(GV, 0); + NewGV->setAlignment( + MaybeAlign(std::max(GV->getAlignment(), Mapping.getObjectAlignment()))); + + // It is invalid to ICF two globals that have different tags. In the case + // where the size of the global is a multiple of the tag granularity the + // contents of the globals may be the same but the tags (i.e. symbol values) + // may be different, and the symbols are not considered during ICF. In the + // case where the size is not a multiple of the granularity, the short granule + // tags would discriminate two globals with different tags, but there would + // otherwise be nothing stopping such a global from being incorrectly ICF'd + // with an uninstrumented (i.e. tag 0) global that happened to have the short + // granule tag in the last byte. + NewGV->setUnnamedAddr(GlobalValue::UnnamedAddr::None); + + // Descriptor format (assuming little-endian): + // bytes 0-3: relative address of global + // bytes 4-6: size of global (16MB ought to be enough for anyone, but in case + // it isn't, we create multiple descriptors) + // byte 7: tag + auto *DescriptorTy = StructType::get(Int32Ty, Int32Ty); + const uint64_t MaxDescriptorSize = 0xfffff0; + for (uint64_t DescriptorPos = 0; DescriptorPos < SizeInBytes; + DescriptorPos += MaxDescriptorSize) { + auto *Descriptor = + new GlobalVariable(M, DescriptorTy, true, GlobalValue::PrivateLinkage, + nullptr, GV->getName() + ".hwasan.descriptor"); + auto *GVRelPtr = ConstantExpr::getTrunc( + ConstantExpr::getAdd( + ConstantExpr::getSub( + ConstantExpr::getPtrToInt(NewGV, Int64Ty), + ConstantExpr::getPtrToInt(Descriptor, Int64Ty)), + ConstantInt::get(Int64Ty, DescriptorPos)), + Int32Ty); + uint32_t Size = std::min(SizeInBytes - DescriptorPos, MaxDescriptorSize); + auto *SizeAndTag = ConstantInt::get(Int32Ty, Size | (uint32_t(Tag) << 24)); + Descriptor->setComdat(NewGV->getComdat()); + Descriptor->setInitializer(ConstantStruct::getAnon({GVRelPtr, SizeAndTag})); + Descriptor->setSection("hwasan_globals"); + Descriptor->setMetadata(LLVMContext::MD_associated, + MDNode::get(*C, ValueAsMetadata::get(NewGV))); + appendToCompilerUsed(M, Descriptor); + } + + Constant *Aliasee = ConstantExpr::getIntToPtr( + ConstantExpr::getAdd( + ConstantExpr::getPtrToInt(NewGV, Int64Ty), + ConstantInt::get(Int64Ty, uint64_t(Tag) << kPointerTagShift)), + GV->getType()); + auto *Alias = GlobalAlias::create(GV->getValueType(), GV->getAddressSpace(), + GV->getLinkage(), "", Aliasee, &M); + Alias->setVisibility(GV->getVisibility()); + Alias->takeName(GV); + GV->replaceAllUsesWith(Alias); + GV->eraseFromParent(); +} + +void HWAddressSanitizer::instrumentGlobals() { + // Start by creating a note that contains pointers to the list of global + // descriptors. Adding a note to the output file will cause the linker to + // create a PT_NOTE program header pointing to the note that we can use to + // find the descriptor list starting from the program headers. A function + // provided by the runtime initializes the shadow memory for the globals by + // accessing the descriptor list via the note. 
The dynamic loader needs to + // call this function whenever a library is loaded. + // + // The reason why we use a note for this instead of a more conventional + // approach of having a global constructor pass a descriptor list pointer to + // the runtime is because of an order of initialization problem. With + // constructors we can encounter the following problematic scenario: + // + // 1) library A depends on library B and also interposes one of B's symbols + // 2) B's constructors are called before A's (as required for correctness) + // 3) during construction, B accesses one of its "own" globals (actually + // interposed by A) and triggers a HWASAN failure due to the initialization + // for A not having happened yet + // + // Even without interposition it is possible to run into similar situations in + // cases where two libraries mutually depend on each other. + // + // We only need one note per binary, so put everything for the note in a + // comdat. + Comdat *NoteComdat = M.getOrInsertComdat(kHwasanNoteName); + + Type *Int8Arr0Ty = ArrayType::get(Int8Ty, 0); + auto Start = + new GlobalVariable(M, Int8Arr0Ty, true, GlobalVariable::ExternalLinkage, + nullptr, "__start_hwasan_globals"); + Start->setVisibility(GlobalValue::HiddenVisibility); + Start->setDSOLocal(true); + auto Stop = + new GlobalVariable(M, Int8Arr0Ty, true, GlobalVariable::ExternalLinkage, + nullptr, "__stop_hwasan_globals"); + Stop->setVisibility(GlobalValue::HiddenVisibility); + Stop->setDSOLocal(true); + + // Null-terminated so actually 8 bytes, which are required in order to align + // the note properly. + auto *Name = ConstantDataArray::get(*C, "LLVM\0\0\0"); + + auto *NoteTy = StructType::get(Int32Ty, Int32Ty, Int32Ty, Name->getType(), + Int32Ty, Int32Ty); + auto *Note = + new GlobalVariable(M, NoteTy, /*isConstantGlobal=*/true, + GlobalValue::PrivateLinkage, nullptr, kHwasanNoteName); + Note->setSection(".note.hwasan.globals"); + Note->setComdat(NoteComdat); + Note->setAlignment(Align(4)); + Note->setDSOLocal(true); + + // The pointers in the note need to be relative so that the note ends up being + // placed in rodata, which is the standard location for notes. + auto CreateRelPtr = [&](Constant *Ptr) { + return ConstantExpr::getTrunc( + ConstantExpr::getSub(ConstantExpr::getPtrToInt(Ptr, Int64Ty), + ConstantExpr::getPtrToInt(Note, Int64Ty)), + Int32Ty); + }; + Note->setInitializer(ConstantStruct::getAnon( + {ConstantInt::get(Int32Ty, 8), // n_namesz + ConstantInt::get(Int32Ty, 8), // n_descsz + ConstantInt::get(Int32Ty, ELF::NT_LLVM_HWASAN_GLOBALS), // n_type + Name, CreateRelPtr(Start), CreateRelPtr(Stop)})); + appendToCompilerUsed(M, Note); + + // Create a zero-length global in hwasan_globals so that the linker will + // always create start and stop symbols. + auto Dummy = new GlobalVariable( + M, Int8Arr0Ty, /*isConstantGlobal*/ true, GlobalVariable::PrivateLinkage, + Constant::getNullValue(Int8Arr0Ty), "hwasan.dummy.global"); + Dummy->setSection("hwasan_globals"); + Dummy->setComdat(NoteComdat); + Dummy->setMetadata(LLVMContext::MD_associated, + MDNode::get(*C, ValueAsMetadata::get(Note))); + appendToCompilerUsed(M, Dummy); + + std::vector Globals; + for (GlobalVariable &GV : M.globals()) { + if (GV.isDeclarationForLinker() || GV.getName().startswith("llvm.") || + GV.isThreadLocal()) + continue; + + // Common symbols can't have aliases point to them, so they can't be tagged. 
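The note built above has a fixed layout that the runtime (and the dynamic-loader hook it installs) parses to locate the hwasan_globals section bounds. An illustrative C++ rendering of that layout, mirroring the initializer constructed above; this struct is documentation only, not a type that exists in the runtime:

#include <cstdint>

struct HwasanGlobalsNote {
  uint32_t n_namesz;  // 8: "LLVM" padded with NULs to 8 bytes
  uint32_t n_descsz;  // 8: the two relative pointers below
  uint32_t n_type;    // ELF::NT_LLVM_HWASAN_GLOBALS
  char name[8];       // "LLVM\0\0\0\0"
  int32_t begin_rel;  // __start_hwasan_globals, relative to this note
  int32_t end_rel;    // __stop_hwasan_globals, relative to this note
};

Keeping the two pointers relative to the note itself lets the note live in read-only data, as the comment above explains.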
+ if (GV.hasCommonLinkage()) + continue; + + // Globals with custom sections may be used in __start_/__stop_ enumeration, + // which would be broken both by adding tags and potentially by the extra + // padding/alignment that we insert. + if (GV.hasSection()) + continue; + + Globals.push_back(&GV); + } + + MD5 Hasher; + Hasher.update(M.getSourceFileName()); + MD5::MD5Result Hash; + Hasher.final(Hash); + uint8_t Tag = Hash[0]; + + for (GlobalVariable *GV : Globals) { + // Skip tag 0 in order to avoid collisions with untagged memory. + if (Tag == 0) + Tag = 1; + instrumentGlobal(GV, Tag++); + } +} + +void HWAddressSanitizer::instrumentPersonalityFunctions() { + // We need to untag stack frames as we unwind past them. That is the job of + // the personality function wrapper, which either wraps an existing + // personality function or acts as a personality function on its own. Each + // function that has a personality function or that can be unwound past has + // its personality function changed to a thunk that calls the personality + // function wrapper in the runtime. + MapVector> PersonalityFns; + for (Function &F : M) { + if (F.isDeclaration() || !F.hasFnAttribute(Attribute::SanitizeHWAddress)) + continue; + + if (F.hasPersonalityFn()) { + PersonalityFns[F.getPersonalityFn()->stripPointerCasts()].push_back(&F); + } else if (!F.hasFnAttribute(Attribute::NoUnwind)) { + PersonalityFns[nullptr].push_back(&F); + } + } + + if (PersonalityFns.empty()) + return; + + FunctionCallee HwasanPersonalityWrapper = M.getOrInsertFunction( + "__hwasan_personality_wrapper", Int32Ty, Int32Ty, Int32Ty, Int64Ty, + Int8PtrTy, Int8PtrTy, Int8PtrTy, Int8PtrTy, Int8PtrTy); + FunctionCallee UnwindGetGR = M.getOrInsertFunction("_Unwind_GetGR", VoidTy); + FunctionCallee UnwindGetCFA = M.getOrInsertFunction("_Unwind_GetCFA", VoidTy); + + for (auto &P : PersonalityFns) { + std::string ThunkName = kHwasanPersonalityThunkName; + if (P.first) + ThunkName += ("." + P.first->getName()).str(); + FunctionType *ThunkFnTy = FunctionType::get( + Int32Ty, {Int32Ty, Int32Ty, Int64Ty, Int8PtrTy, Int8PtrTy}, false); + bool IsLocal = P.first && (!isa(P.first) || + cast(P.first)->hasLocalLinkage()); + auto *ThunkFn = Function::Create(ThunkFnTy, + IsLocal ? GlobalValue::InternalLinkage + : GlobalValue::LinkOnceODRLinkage, + ThunkName, &M); + if (!IsLocal) { + ThunkFn->setVisibility(GlobalValue::HiddenVisibility); + ThunkFn->setComdat(M.getOrInsertComdat(ThunkName)); + } + + auto *BB = BasicBlock::Create(*C, "entry", ThunkFn); + IRBuilder<> IRB(BB); + CallInst *WrapperCall = IRB.CreateCall( + HwasanPersonalityWrapper, + {ThunkFn->getArg(0), ThunkFn->getArg(1), ThunkFn->getArg(2), + ThunkFn->getArg(3), ThunkFn->getArg(4), + P.first ? 
IRB.CreateBitCast(P.first, Int8PtrTy) + : Constant::getNullValue(Int8PtrTy), + IRB.CreateBitCast(UnwindGetGR.getCallee(), Int8PtrTy), + IRB.CreateBitCast(UnwindGetCFA.getCallee(), Int8PtrTy)}); + WrapperCall->setTailCall(); + IRB.CreateRet(WrapperCall); + + for (Function *F : P.second) + F->setPersonalityFn(ThunkFn); + } +} + void HWAddressSanitizer::ShadowMapping::init(Triple &TargetTriple) { Scale = kDefaultShadowScale; if (ClMappingOffset.getNumOccurrences() > 0) { diff --git a/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp b/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp index c7371f567ff3..74d6e76eceb6 100644 --- a/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp +++ b/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp @@ -403,7 +403,7 @@ static bool promoteIndirectCalls(Module &M, ProfileSummaryInfo *PSI, AM->getResult(M).getManager(); ORE = &FAM.getResult(F); } else { - OwnedORE = llvm::make_unique(&F); + OwnedORE = std::make_unique(&F); ORE = OwnedORE.get(); } diff --git a/lib/Transforms/Instrumentation/InstrOrderFile.cpp b/lib/Transforms/Instrumentation/InstrOrderFile.cpp index a2c1ddfd279e..93d3a8a14d5c 100644 --- a/lib/Transforms/Instrumentation/InstrOrderFile.cpp +++ b/lib/Transforms/Instrumentation/InstrOrderFile.cpp @@ -100,7 +100,8 @@ public: if (!ClOrderFileWriteMapping.empty()) { std::lock_guard LogLock(MappingMutex); std::error_code EC; - llvm::raw_fd_ostream OS(ClOrderFileWriteMapping, EC, llvm::sys::fs::F_Append); + llvm::raw_fd_ostream OS(ClOrderFileWriteMapping, EC, + llvm::sys::fs::OF_Append); if (EC) { report_fatal_error(Twine("Failed to open ") + ClOrderFileWriteMapping + " to save mapping file for order file instrumentation\n"); diff --git a/lib/Transforms/Instrumentation/InstrProfiling.cpp b/lib/Transforms/Instrumentation/InstrProfiling.cpp index 63c2b8078967..1f092a5f3103 100644 --- a/lib/Transforms/Instrumentation/InstrProfiling.cpp +++ b/lib/Transforms/Instrumentation/InstrProfiling.cpp @@ -157,7 +157,10 @@ public: } bool runOnModule(Module &M) override { - return InstrProf.run(M, getAnalysis().getTLI()); + auto GetTLI = [this](Function &F) -> TargetLibraryInfo & { + return this->getAnalysis().getTLI(F); + }; + return InstrProf.run(M, GetTLI); } void getAnalysisUsage(AnalysisUsage &AU) const override { @@ -370,8 +373,12 @@ private: } // end anonymous namespace PreservedAnalyses InstrProfiling::run(Module &M, ModuleAnalysisManager &AM) { - auto &TLI = AM.getResult(M); - if (!run(M, TLI)) + FunctionAnalysisManager &FAM = + AM.getResult(M).getManager(); + auto GetTLI = [&FAM](Function &F) -> TargetLibraryInfo & { + return FAM.getResult(F); + }; + if (!run(M, GetTLI)) return PreservedAnalyses::all(); return PreservedAnalyses::none(); @@ -441,7 +448,7 @@ void InstrProfiling::promoteCounterLoadStores(Function *F) { std::unique_ptr BFI; if (Options.UseBFIInPromotion) { std::unique_ptr BPI; - BPI.reset(new BranchProbabilityInfo(*F, LI, TLI)); + BPI.reset(new BranchProbabilityInfo(*F, LI, &GetTLI(*F))); BFI.reset(new BlockFrequencyInfo(*F, *BPI, LI)); } @@ -482,9 +489,10 @@ static bool containsProfilingIntrinsics(Module &M) { return false; } -bool InstrProfiling::run(Module &M, const TargetLibraryInfo &TLI) { +bool InstrProfiling::run( + Module &M, std::function GetTLI) { this->M = &M; - this->TLI = &TLI; + this->GetTLI = std::move(GetTLI); NamesVar = nullptr; NamesSize = 0; ProfileDataMap.clear(); @@ -601,6 +609,7 @@ void InstrProfiling::lowerValueProfileInst(InstrProfValueProfileInst *Ind) { bool IsRange = 
(Ind->getValueKind()->getZExtValue() == llvm::InstrProfValueKind::IPVK_MemOPSize); CallInst *Call = nullptr; + auto *TLI = &GetTLI(*Ind->getFunction()); if (!IsRange) { Value *Args[3] = {Ind->getTargetValue(), Builder.CreateBitCast(DataVar, Builder.getInt8PtrTy()), @@ -731,9 +740,8 @@ InstrProfiling::getOrCreateRegionCounters(InstrProfIncrementInst *Inc) { PD = It->second; } - // Match the linkage and visibility of the name global, except on COFF, where - // the linkage must be local and consequentially the visibility must be - // default. + // Match the linkage and visibility of the name global. COFF supports using + // comdats with internal symbols, so do that if we can. Function *Fn = Inc->getParent()->getParent(); GlobalValue::LinkageTypes Linkage = NamePtr->getLinkage(); GlobalValue::VisibilityTypes Visibility = NamePtr->getVisibility(); @@ -749,19 +757,21 @@ InstrProfiling::getOrCreateRegionCounters(InstrProfIncrementInst *Inc) { // new comdat group for the counters and profiling data. If we use the comdat // of the parent function, that will result in relocations against discarded // sections. - Comdat *Cmdt = nullptr; - GlobalValue::LinkageTypes CounterLinkage = Linkage; - if (needsComdatForCounter(*Fn, *M)) { - StringRef CmdtPrefix = getInstrProfComdatPrefix(); + bool NeedComdat = needsComdatForCounter(*Fn, *M); + if (NeedComdat) { if (TT.isOSBinFormatCOFF()) { - // For COFF, the comdat group name must be the name of a symbol in the - // group. Use the counter variable name, and upgrade its linkage to - // something externally visible, like linkonce_odr. - CmdtPrefix = getInstrProfCountersVarPrefix(); - CounterLinkage = GlobalValue::LinkOnceODRLinkage; + // For COFF, put the counters, data, and values each into their own + // comdats. We can't use a group because the Visual C++ linker will + // report duplicate symbol errors if there are multiple external symbols + // with the same name marked IMAGE_COMDAT_SELECT_ASSOCIATIVE. 
+ Linkage = GlobalValue::LinkOnceODRLinkage; + Visibility = GlobalValue::HiddenVisibility; } - Cmdt = M->getOrInsertComdat(getVarName(Inc, CmdtPrefix)); } + auto MaybeSetComdat = [=](GlobalVariable *GV) { + if (NeedComdat) + GV->setComdat(M->getOrInsertComdat(GV->getName())); + }; uint64_t NumCounters = Inc->getNumCounters()->getZExtValue(); LLVMContext &Ctx = M->getContext(); @@ -775,9 +785,9 @@ InstrProfiling::getOrCreateRegionCounters(InstrProfIncrementInst *Inc) { CounterPtr->setVisibility(Visibility); CounterPtr->setSection( getInstrProfSectionName(IPSK_cnts, TT.getObjectFormat())); - CounterPtr->setAlignment(8); - CounterPtr->setComdat(Cmdt); - CounterPtr->setLinkage(CounterLinkage); + CounterPtr->setAlignment(Align(8)); + MaybeSetComdat(CounterPtr); + CounterPtr->setLinkage(Linkage); auto *Int8PtrTy = Type::getInt8PtrTy(Ctx); // Allocate statically the array of pointers to value profile nodes for @@ -797,8 +807,8 @@ InstrProfiling::getOrCreateRegionCounters(InstrProfIncrementInst *Inc) { ValuesVar->setVisibility(Visibility); ValuesVar->setSection( getInstrProfSectionName(IPSK_vals, TT.getObjectFormat())); - ValuesVar->setAlignment(8); - ValuesVar->setComdat(Cmdt); + ValuesVar->setAlignment(Align(8)); + MaybeSetComdat(ValuesVar); ValuesPtrExpr = ConstantExpr::getBitCast(ValuesVar, Type::getInt8PtrTy(Ctx)); } @@ -830,8 +840,9 @@ InstrProfiling::getOrCreateRegionCounters(InstrProfIncrementInst *Inc) { getVarName(Inc, getInstrProfDataVarPrefix())); Data->setVisibility(Visibility); Data->setSection(getInstrProfSectionName(IPSK_data, TT.getObjectFormat())); - Data->setAlignment(INSTR_PROF_DATA_ALIGNMENT); - Data->setComdat(Cmdt); + Data->setAlignment(Align(INSTR_PROF_DATA_ALIGNMENT)); + MaybeSetComdat(Data); + Data->setLinkage(Linkage); PD.RegionCounters = CounterPtr; PD.DataVar = Data; @@ -920,7 +931,7 @@ void InstrProfiling::emitNameData() { // On COFF, it's important to reduce the alignment down to 1 to prevent the // linker from inserting padding before the start of the names section or // between names entries. - NamesVar->setAlignment(1); + NamesVar->setAlignment(Align::None()); UsedVars.push_back(NamesVar); for (auto *NamePtr : ReferencedNames) diff --git a/lib/Transforms/Instrumentation/Instrumentation.cpp b/lib/Transforms/Instrumentation/Instrumentation.cpp index f56a1bd91b89..a6c2c9b464b6 100644 --- a/lib/Transforms/Instrumentation/Instrumentation.cpp +++ b/lib/Transforms/Instrumentation/Instrumentation.cpp @@ -68,7 +68,8 @@ GlobalVariable *llvm::createPrivateGlobalForString(Module &M, StringRef Str, GlobalValue::PrivateLinkage, StrConst, NamePrefix); if (AllowMerging) GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); - GV->setAlignment(1); // Strings may not be merged w/o setting align 1. + GV->setAlignment(Align::None()); // Strings may not be merged w/o setting + // alignment explicitly. 
return GV; } @@ -116,7 +117,7 @@ void llvm::initializeInstrumentation(PassRegistry &Registry) { initializeMemorySanitizerLegacyPassPass(Registry); initializeHWAddressSanitizerLegacyPassPass(Registry); initializeThreadSanitizerLegacyPassPass(Registry); - initializeSanitizerCoverageModulePass(Registry); + initializeModuleSanitizerCoverageLegacyPassPass(Registry); initializeDataFlowSanitizerPass(Registry); } diff --git a/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/lib/Transforms/Instrumentation/MemorySanitizer.cpp index b25cbed1bb02..69c9020e060b 100644 --- a/lib/Transforms/Instrumentation/MemorySanitizer.cpp +++ b/lib/Transforms/Instrumentation/MemorySanitizer.cpp @@ -462,16 +462,9 @@ namespace { /// the module. class MemorySanitizer { public: - MemorySanitizer(Module &M, MemorySanitizerOptions Options) { - this->CompileKernel = - ClEnableKmsan.getNumOccurrences() > 0 ? ClEnableKmsan : Options.Kernel; - if (ClTrackOrigins.getNumOccurrences() > 0) - this->TrackOrigins = ClTrackOrigins; - else - this->TrackOrigins = this->CompileKernel ? 2 : Options.TrackOrigins; - this->Recover = ClKeepGoing.getNumOccurrences() > 0 - ? ClKeepGoing - : (this->CompileKernel | Options.Recover); + MemorySanitizer(Module &M, MemorySanitizerOptions Options) + : CompileKernel(Options.Kernel), TrackOrigins(Options.TrackOrigins), + Recover(Options.Recover) { initializeModule(M); } @@ -594,10 +587,26 @@ private: /// An empty volatile inline asm that prevents callback merge. InlineAsm *EmptyAsm; - - Function *MsanCtorFunction; }; +void insertModuleCtor(Module &M) { + getOrCreateSanitizerCtorAndInitFunctions( + M, kMsanModuleCtorName, kMsanInitName, + /*InitArgTypes=*/{}, + /*InitArgs=*/{}, + // This callback is invoked when the functions are created the first + // time. Hook them into the global ctors list in that case: + [&](Function *Ctor, FunctionCallee) { + if (!ClWithComdat) { + appendToGlobalCtors(M, Ctor, 0); + return; + } + Comdat *MsanCtorComdat = M.getOrInsertComdat(kMsanModuleCtorName); + Ctor->setComdat(MsanCtorComdat); + appendToGlobalCtors(M, Ctor, 0, Ctor); + }); +} + /// A legacy function pass for msan instrumentation. /// /// Instruments functions to detect unitialized reads. @@ -615,7 +624,7 @@ struct MemorySanitizerLegacyPass : public FunctionPass { bool runOnFunction(Function &F) override { return MSan->sanitizeFunction( - F, getAnalysis().getTLI()); + F, getAnalysis().getTLI(F)); } bool doInitialization(Module &M) override; @@ -623,8 +632,17 @@ struct MemorySanitizerLegacyPass : public FunctionPass { MemorySanitizerOptions Options; }; +template T getOptOrDefault(const cl::opt &Opt, T Default) { + return (Opt.getNumOccurrences() > 0) ? Opt : Default; +} + } // end anonymous namespace +MemorySanitizerOptions::MemorySanitizerOptions(int TO, bool R, bool K) + : Kernel(getOptOrDefault(ClEnableKmsan, K)), + TrackOrigins(getOptOrDefault(ClTrackOrigins, Kernel ? 
2 : TO)), + Recover(getOptOrDefault(ClKeepGoing, Kernel || R)) {} + PreservedAnalyses MemorySanitizerPass::run(Function &F, FunctionAnalysisManager &FAM) { MemorySanitizer Msan(*F.getParent(), Options); @@ -633,6 +651,14 @@ PreservedAnalyses MemorySanitizerPass::run(Function &F, return PreservedAnalyses::all(); } +PreservedAnalyses MemorySanitizerPass::run(Module &M, + ModuleAnalysisManager &AM) { + if (Options.Kernel) + return PreservedAnalyses::all(); + insertModuleCtor(M); + return PreservedAnalyses::none(); +} + char MemorySanitizerLegacyPass::ID = 0; INITIALIZE_PASS_BEGIN(MemorySanitizerLegacyPass, "msan", @@ -918,23 +944,6 @@ void MemorySanitizer::initializeModule(Module &M) { OriginStoreWeights = MDBuilder(*C).createBranchWeights(1, 1000); if (!CompileKernel) { - std::tie(MsanCtorFunction, std::ignore) = - getOrCreateSanitizerCtorAndInitFunctions( - M, kMsanModuleCtorName, kMsanInitName, - /*InitArgTypes=*/{}, - /*InitArgs=*/{}, - // This callback is invoked when the functions are created the first - // time. Hook them into the global ctors list in that case: - [&](Function *Ctor, FunctionCallee) { - if (!ClWithComdat) { - appendToGlobalCtors(M, Ctor, 0); - return; - } - Comdat *MsanCtorComdat = M.getOrInsertComdat(kMsanModuleCtorName); - Ctor->setComdat(MsanCtorComdat); - appendToGlobalCtors(M, Ctor, 0, Ctor); - }); - if (TrackOrigins) M.getOrInsertGlobal("__msan_track_origins", IRB.getInt32Ty(), [&] { return new GlobalVariable( @@ -952,6 +961,8 @@ void MemorySanitizer::initializeModule(Module &M) { } bool MemorySanitizerLegacyPass::doInitialization(Module &M) { + if (!Options.Kernel) + insertModuleCtor(M); MSan.emplace(M, Options); return true; } @@ -2562,6 +2573,11 @@ struct MemorySanitizerVisitor : public InstVisitor { return false; } + void handleInvariantGroup(IntrinsicInst &I) { + setShadow(&I, getShadow(&I, 0)); + setOrigin(&I, getOrigin(&I, 0)); + } + void handleLifetimeStart(IntrinsicInst &I) { if (!PoisonStack) return; @@ -2993,6 +3009,10 @@ struct MemorySanitizerVisitor : public InstVisitor { case Intrinsic::lifetime_start: handleLifetimeStart(I); break; + case Intrinsic::launder_invariant_group: + case Intrinsic::strip_invariant_group: + handleInvariantGroup(I); + break; case Intrinsic::bswap: handleBswap(I); break; @@ -3627,10 +3647,10 @@ struct MemorySanitizerVisitor : public InstVisitor { int getNumOutputArgs(InlineAsm *IA, CallBase *CB) { int NumRetOutputs = 0; int NumOutputs = 0; - Type *RetTy = dyn_cast(CB)->getType(); + Type *RetTy = cast(CB)->getType(); if (!RetTy->isVoidTy()) { // Register outputs are returned via the CallInst return value. - StructType *ST = dyn_cast_or_null(RetTy); + auto *ST = dyn_cast(RetTy); if (ST) NumRetOutputs = ST->getNumElements(); else @@ -3667,7 +3687,7 @@ struct MemorySanitizerVisitor : public InstVisitor { // corresponding CallInst has nO+nI+1 operands (the last operand is the // function to be called). const DataLayout &DL = F.getParent()->getDataLayout(); - CallBase *CB = dyn_cast(&I); + CallBase *CB = cast(&I); IRBuilder<> IRB(&I); InlineAsm *IA = cast(CB->getCalledValue()); int OutputArgs = getNumOutputArgs(IA, CB); @@ -4567,8 +4587,9 @@ static VarArgHelper *CreateVarArgHelper(Function &Func, MemorySanitizer &Msan, } bool MemorySanitizer::sanitizeFunction(Function &F, TargetLibraryInfo &TLI) { - if (!CompileKernel && (&F == MsanCtorFunction)) + if (!CompileKernel && F.getName() == kMsanModuleCtorName) return false; + MemorySanitizerVisitor Visitor(F, *this, TLI); // Clear out readonly/readnone attributes. 
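The MemorySanitizer changes above fold the "command line beats constructor argument" logic into the MemorySanitizerOptions constructor through a small getOptOrDefault helper. The sketch below shows that pattern in isolation; the flag name example-track-things and the ExampleOptions struct are invented stand-ins for the ClTrackOrigins/ClKeepGoing options and MemorySanitizerOptions.

#include "llvm/Support/CommandLine.h"

using namespace llvm;

// Hypothetical flag, standing in for the cl::opt declarations in the pass.
static cl::opt<bool> ClTrackThings("example-track-things",
                                   cl::desc("illustrative flag"),
                                   cl::init(false), cl::Hidden);

template <class T>
static T getOptOrDefault(const cl::opt<T> &Opt, T Default) {
  // A flag that occurs on the command line wins; otherwise use the value the
  // pass builder (or legacy pass) supplied programmatically.
  return (Opt.getNumOccurrences() > 0) ? Opt : Default;
}

struct ExampleOptions {
  bool TrackThings;
  explicit ExampleOptions(bool Track)
      : TrackThings(getOptOrDefault(ClTrackThings, Track)) {}
};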
diff --git a/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/lib/Transforms/Instrumentation/PGOInstrumentation.cpp index 6fec3c9c79ee..ca1bb62389e9 100644 --- a/lib/Transforms/Instrumentation/PGOInstrumentation.cpp +++ b/lib/Transforms/Instrumentation/PGOInstrumentation.cpp @@ -48,6 +48,7 @@ //===----------------------------------------------------------------------===// #include "CFGMST.h" +#include "ValueProfileCollector.h" #include "llvm/ADT/APInt.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/STLExtras.h" @@ -61,7 +62,6 @@ #include "llvm/Analysis/BlockFrequencyInfo.h" #include "llvm/Analysis/BranchProbabilityInfo.h" #include "llvm/Analysis/CFG.h" -#include "llvm/Analysis/IndirectCallVisitor.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/Analysis/ProfileSummaryInfo.h" @@ -96,6 +96,7 @@ #include "llvm/ProfileData/InstrProf.h" #include "llvm/ProfileData/InstrProfReader.h" #include "llvm/Support/BranchProbability.h" +#include "llvm/Support/CRC.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/DOTGraphTraits.h" @@ -103,11 +104,11 @@ #include "llvm/Support/Error.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/GraphWriter.h" -#include "llvm/Support/JamCRC.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Instrumentation.h" #include "llvm/Transforms/Instrumentation/PGOInstrumentation.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/MisExpect.h" #include #include #include @@ -120,6 +121,7 @@ using namespace llvm; using ProfileCount = Function::ProfileCount; +using VPCandidateInfo = ValueProfileCollector::CandidateInfo; #define DEBUG_TYPE "pgo-instrumentation" @@ -286,6 +288,11 @@ static std::string getBranchCondString(Instruction *TI) { return result; } +static const char *ValueProfKindDescr[] = { +#define VALUE_PROF_KIND(Enumerator, Value, Descr) Descr, +#include "llvm/ProfileData/InstrProfData.inc" +}; + namespace { /// The select instruction visitor plays three roles specified @@ -348,50 +355,6 @@ struct SelectInstVisitor : public InstVisitor { unsigned getNumOfSelectInsts() const { return NSIs; } }; -/// Instruction Visitor class to visit memory intrinsic calls. -struct MemIntrinsicVisitor : public InstVisitor { - Function &F; - unsigned NMemIs = 0; // Number of memIntrinsics instrumented. - VisitMode Mode = VM_counting; // Visiting mode. - unsigned CurCtrId = 0; // Current counter index. - unsigned TotalNumCtrs = 0; // Total number of counters - GlobalVariable *FuncNameVar = nullptr; - uint64_t FuncHash = 0; - PGOUseFunc *UseFunc = nullptr; - std::vector Candidates; - - MemIntrinsicVisitor(Function &Func) : F(Func) {} - - void countMemIntrinsics(Function &Func) { - NMemIs = 0; - Mode = VM_counting; - visit(Func); - } - - void instrumentMemIntrinsics(Function &Func, unsigned TotalNC, - GlobalVariable *FNV, uint64_t FHash) { - Mode = VM_instrument; - TotalNumCtrs = TotalNC; - FuncHash = FHash; - FuncNameVar = FNV; - visit(Func); - } - - std::vector findMemIntrinsics(Function &Func) { - Candidates.clear(); - Mode = VM_annotate; - visit(Func); - return Candidates; - } - - // Visit the IR stream and annotate all mem intrinsic call instructions. - void instrumentOneMemIntrinsic(MemIntrinsic &MI); - - // Visit \p MI instruction and perform tasks according to visit mode. 
- void visitMemIntrinsic(MemIntrinsic &SI); - - unsigned getNumOfMemIntrinsics() const { return NMemIs; } -}; class PGOInstrumentationGenLegacyPass : public ModulePass { public: @@ -563,13 +526,14 @@ private: // A map that stores the Comdat group in function F. std::unordered_multimap &ComdatMembers; + ValueProfileCollector VPC; + void computeCFGHash(); void renameComdatFunction(); public: - std::vector> ValueSites; + std::vector> ValueSites; SelectInstVisitor SIVisitor; - MemIntrinsicVisitor MIVisitor; std::string FuncName; GlobalVariable *FuncNameVar; @@ -604,23 +568,21 @@ public: std::unordered_multimap &ComdatMembers, bool CreateGlobalVar = false, BranchProbabilityInfo *BPI = nullptr, BlockFrequencyInfo *BFI = nullptr, bool IsCS = false) - : F(Func), IsCS(IsCS), ComdatMembers(ComdatMembers), - ValueSites(IPVK_Last + 1), SIVisitor(Func), MIVisitor(Func), - MST(F, BPI, BFI) { + : F(Func), IsCS(IsCS), ComdatMembers(ComdatMembers), VPC(Func), + ValueSites(IPVK_Last + 1), SIVisitor(Func), MST(F, BPI, BFI) { // This should be done before CFG hash computation. SIVisitor.countSelects(Func); - MIVisitor.countMemIntrinsics(Func); + ValueSites[IPVK_MemOPSize] = VPC.get(IPVK_MemOPSize); if (!IsCS) { NumOfPGOSelectInsts += SIVisitor.getNumOfSelectInsts(); - NumOfPGOMemIntrinsics += MIVisitor.getNumOfMemIntrinsics(); + NumOfPGOMemIntrinsics += ValueSites[IPVK_MemOPSize].size(); NumOfPGOBB += MST.BBInfos.size(); - ValueSites[IPVK_IndirectCallTarget] = findIndirectCalls(Func); + ValueSites[IPVK_IndirectCallTarget] = VPC.get(IPVK_IndirectCallTarget); } else { NumOfCSPGOSelectInsts += SIVisitor.getNumOfSelectInsts(); - NumOfCSPGOMemIntrinsics += MIVisitor.getNumOfMemIntrinsics(); + NumOfCSPGOMemIntrinsics += ValueSites[IPVK_MemOPSize].size(); NumOfCSPGOBB += MST.BBInfos.size(); } - ValueSites[IPVK_MemOPSize] = MIVisitor.findMemIntrinsics(Func); FuncName = getPGOFuncName(F); computeCFGHash(); @@ -647,7 +609,7 @@ public: // value of each BB in the CFG. The higher 32 bits record the number of edges. template void FuncPGOInstrumentation::computeCFGHash() { - std::vector Indexes; + std::vector Indexes; JamCRC JC; for (auto &BB : F) { const Instruction *TI = BB.getTerminator(); @@ -658,7 +620,7 @@ void FuncPGOInstrumentation::computeCFGHash() { continue; uint32_t Index = BI->Index; for (int J = 0; J < 4; J++) - Indexes.push_back((char)(Index >> (J * 8))); + Indexes.push_back((uint8_t)(Index >> (J * 8))); } } JC.update(Indexes); @@ -874,28 +836,36 @@ static void instrumentOneFunc( if (DisableValueProfiling) return; - unsigned NumIndirectCalls = 0; - for (auto &I : FuncInfo.ValueSites[IPVK_IndirectCallTarget]) { - CallSite CS(I); - Value *Callee = CS.getCalledValue(); - LLVM_DEBUG(dbgs() << "Instrument one indirect call: CallSite Index = " - << NumIndirectCalls << "\n"); - IRBuilder<> Builder(I); - assert(Builder.GetInsertPoint() != I->getParent()->end() && - "Cannot get the Instrumentation point"); - Builder.CreateCall( - Intrinsic::getDeclaration(M, Intrinsic::instrprof_value_profile), - {ConstantExpr::getBitCast(FuncInfo.FuncNameVar, I8PtrTy), - Builder.getInt64(FuncInfo.FunctionHash), - Builder.CreatePtrToInt(Callee, Builder.getInt64Ty()), - Builder.getInt32(IPVK_IndirectCallTarget), - Builder.getInt32(NumIndirectCalls++)}); - } - NumOfPGOICall += NumIndirectCalls; + NumOfPGOICall += FuncInfo.ValueSites[IPVK_IndirectCallTarget].size(); - // Now instrument memop intrinsic calls. 
- FuncInfo.MIVisitor.instrumentMemIntrinsics( - F, NumCounters, FuncInfo.FuncNameVar, FuncInfo.FunctionHash); + // For each VP Kind, walk the VP candidates and instrument each one. + for (uint32_t Kind = IPVK_First; Kind <= IPVK_Last; ++Kind) { + unsigned SiteIndex = 0; + if (Kind == IPVK_MemOPSize && !PGOInstrMemOP) + continue; + + for (VPCandidateInfo Cand : FuncInfo.ValueSites[Kind]) { + LLVM_DEBUG(dbgs() << "Instrument one VP " << ValueProfKindDescr[Kind] + << " site: CallSite Index = " << SiteIndex << "\n"); + + IRBuilder<> Builder(Cand.InsertPt); + assert(Builder.GetInsertPoint() != Cand.InsertPt->getParent()->end() && + "Cannot get the Instrumentation point"); + + Value *ToProfile = nullptr; + if (Cand.V->getType()->isIntegerTy()) + ToProfile = Builder.CreateZExtOrTrunc(Cand.V, Builder.getInt64Ty()); + else if (Cand.V->getType()->isPointerTy()) + ToProfile = Builder.CreatePtrToInt(Cand.V, Builder.getInt64Ty()); + assert(ToProfile && "value profiling Value is of unexpected type"); + + Builder.CreateCall( + Intrinsic::getDeclaration(M, Intrinsic::instrprof_value_profile), + {ConstantExpr::getBitCast(FuncInfo.FuncNameVar, I8PtrTy), + Builder.getInt64(FuncInfo.FunctionHash), ToProfile, + Builder.getInt32(Kind), Builder.getInt32(SiteIndex++)}); + } + } // IPVK_First <= Kind <= IPVK_Last } namespace { @@ -984,9 +954,9 @@ class PGOUseFunc { public: PGOUseFunc(Function &Func, Module *Modu, std::unordered_multimap &ComdatMembers, - BranchProbabilityInfo *BPI = nullptr, - BlockFrequencyInfo *BFIin = nullptr, bool IsCS = false) - : F(Func), M(Modu), BFI(BFIin), + BranchProbabilityInfo *BPI, BlockFrequencyInfo *BFIin, + ProfileSummaryInfo *PSI, bool IsCS) + : F(Func), M(Modu), BFI(BFIin), PSI(PSI), FuncInfo(Func, ComdatMembers, false, BPI, BFIin, IsCS), FreqAttr(FFA_Normal), IsCS(IsCS) {} @@ -1041,6 +1011,7 @@ private: Function &F; Module *M; BlockFrequencyInfo *BFI; + ProfileSummaryInfo *PSI; // This member stores the shared information with class PGOGenFunc. FuncPGOInstrumentation FuncInfo; @@ -1078,15 +1049,9 @@ private: // FIXME: This function should be removed once the functionality in // the inliner is implemented. void markFunctionAttributes(uint64_t EntryCount, uint64_t MaxCount) { - if (ProgramMaxCount == 0) - return; - // Threshold of the hot functions. - const BranchProbability HotFunctionThreshold(1, 100); - // Threshold of the cold functions. 
- const BranchProbability ColdFunctionThreshold(2, 10000); - if (EntryCount >= HotFunctionThreshold.scale(ProgramMaxCount)) + if (PSI->isHotCount(EntryCount)) FreqAttr = FFA_Hot; - else if (MaxCount <= ColdFunctionThreshold.scale(ProgramMaxCount)) + else if (PSI->isColdCount(MaxCount)) FreqAttr = FFA_Cold; } }; @@ -1433,43 +1398,6 @@ void SelectInstVisitor::visitSelectInst(SelectInst &SI) { llvm_unreachable("Unknown visiting mode"); } -void MemIntrinsicVisitor::instrumentOneMemIntrinsic(MemIntrinsic &MI) { - Module *M = F.getParent(); - IRBuilder<> Builder(&MI); - Type *Int64Ty = Builder.getInt64Ty(); - Type *I8PtrTy = Builder.getInt8PtrTy(); - Value *Length = MI.getLength(); - assert(!isa(Length)); - Builder.CreateCall( - Intrinsic::getDeclaration(M, Intrinsic::instrprof_value_profile), - {ConstantExpr::getBitCast(FuncNameVar, I8PtrTy), - Builder.getInt64(FuncHash), Builder.CreateZExtOrTrunc(Length, Int64Ty), - Builder.getInt32(IPVK_MemOPSize), Builder.getInt32(CurCtrId)}); - ++CurCtrId; -} - -void MemIntrinsicVisitor::visitMemIntrinsic(MemIntrinsic &MI) { - if (!PGOInstrMemOP) - return; - Value *Length = MI.getLength(); - // Not instrument constant length calls. - if (dyn_cast(Length)) - return; - - switch (Mode) { - case VM_counting: - NMemIs++; - return; - case VM_instrument: - instrumentOneMemIntrinsic(MI); - return; - case VM_annotate: - Candidates.push_back(&MI); - return; - } - llvm_unreachable("Unknown visiting mode"); -} - // Traverse all valuesites and annotate the instructions for all value kind. void PGOUseFunc::annotateValueSites() { if (DisableValueProfiling) @@ -1482,11 +1410,6 @@ void PGOUseFunc::annotateValueSites() { annotateValueSites(Kind); } -static const char *ValueProfKindDescr[] = { -#define VALUE_PROF_KIND(Enumerator, Value, Descr) Descr, -#include "llvm/ProfileData/InstrProfData.inc" -}; - // Annotate the instructions for a specific value kind. void PGOUseFunc::annotateValueSites(uint32_t Kind) { assert(Kind <= IPVK_Last); @@ -1505,11 +1428,11 @@ void PGOUseFunc::annotateValueSites(uint32_t Kind) { return; } - for (auto &I : ValueSites) { + for (VPCandidateInfo &I : ValueSites) { LLVM_DEBUG(dbgs() << "Read one value site profile (kind = " << Kind << "): Index = " << ValueSiteIndex << " out of " << NumValueSites << "\n"); - annotateValueSite(*M, *I, ProfileRecord, + annotateValueSite(*M, *I.AnnotatedInst, ProfileRecord, static_cast(Kind), ValueSiteIndex, Kind == IPVK_MemOPSize ? MaxNumMemOPAnnotations : MaxNumAnnotations); @@ -1595,7 +1518,8 @@ PreservedAnalyses PGOInstrumentationGen::run(Module &M, static bool annotateAllFunctions( Module &M, StringRef ProfileFileName, StringRef ProfileRemappingFileName, function_ref LookupBPI, - function_ref LookupBFI, bool IsCS) { + function_ref LookupBFI, + ProfileSummaryInfo *PSI, bool IsCS) { LLVM_DEBUG(dbgs() << "Read in profile counters: "); auto &Ctx = M.getContext(); // Read the counter array from file. @@ -1626,6 +1550,13 @@ static bool annotateAllFunctions( return false; } + // Add the profile summary (read from the header of the indexed summary) here + // so that we can use it below when reading counters (which checks if the + // function should be marked with a cold or inlinehint attribute). + M.setProfileSummary(PGOReader->getSummary(IsCS).getMD(M.getContext()), + IsCS ? 
ProfileSummary::PSK_CSInstr + : ProfileSummary::PSK_Instr); + std::unordered_multimap ComdatMembers; collectComdatMembers(M, ComdatMembers); std::vector HotFunctions; @@ -1638,7 +1569,7 @@ static bool annotateAllFunctions( // Split indirectbr critical edges here before computing the MST rather than // later in getInstrBB() to avoid invalidating it. SplitIndirectBrCriticalEdges(F, BPI, BFI); - PGOUseFunc Func(F, &M, ComdatMembers, BPI, BFI, IsCS); + PGOUseFunc Func(F, &M, ComdatMembers, BPI, BFI, PSI, IsCS); bool AllZeros = false; if (!Func.readCounters(PGOReader.get(), AllZeros)) continue; @@ -1662,9 +1593,9 @@ static bool annotateAllFunctions( F.getName().equals(ViewBlockFreqFuncName))) { LoopInfo LI{DominatorTree(F)}; std::unique_ptr NewBPI = - llvm::make_unique(F, LI); + std::make_unique(F, LI); std::unique_ptr NewBFI = - llvm::make_unique(F, *NewBPI, LI); + std::make_unique(F, *NewBPI, LI); if (PGOViewCounts == PGOVCT_Graph) NewBFI->view(); else if (PGOViewCounts == PGOVCT_Text) { @@ -1686,9 +1617,6 @@ static bool annotateAllFunctions( } } } - M.setProfileSummary(PGOReader->getSummary(IsCS).getMD(M.getContext()), - IsCS ? ProfileSummary::PSK_CSInstr - : ProfileSummary::PSK_Instr); // Set function hotness attribute from the profile. // We have to apply these attributes at the end because their presence @@ -1730,8 +1658,10 @@ PreservedAnalyses PGOInstrumentationUse::run(Module &M, return &FAM.getResult(F); }; + auto *PSI = &AM.getResult(M); + if (!annotateAllFunctions(M, ProfileFileName, ProfileRemappingFileName, - LookupBPI, LookupBFI, IsCS)) + LookupBPI, LookupBFI, PSI, IsCS)) return PreservedAnalyses::all(); return PreservedAnalyses::none(); @@ -1748,7 +1678,8 @@ bool PGOInstrumentationUseLegacyPass::runOnModule(Module &M) { return &this->getAnalysis(F).getBFI(); }; - return annotateAllFunctions(M, ProfileFileName, "", LookupBPI, LookupBFI, + auto *PSI = &getAnalysis().getPSI(); + return annotateAllFunctions(M, ProfileFileName, "", LookupBPI, LookupBFI, PSI, IsCS); } @@ -1776,6 +1707,9 @@ void llvm::setProfMetadata(Module *M, Instruction *TI, : Weights) { dbgs() << W << " "; } dbgs() << "\n";); + + misexpect::verifyMisExpect(TI, Weights, TI->getContext()); + TI->setMetadata(LLVMContext::MD_prof, MDB.createBranchWeights(Weights)); if (EmitBranchProbability) { std::string BrCondStr = getBranchCondString(TI); diff --git a/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp b/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp index 188f95b4676b..9f81bb16d0a7 100644 --- a/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp +++ b/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp @@ -138,7 +138,7 @@ public: OptimizationRemarkEmitter &ORE, DominatorTree *DT) : Func(Func), BFI(BFI), ORE(ORE), DT(DT), Changed(false) { ValueDataArray = - llvm::make_unique(MemOPMaxVersion + 2); + std::make_unique(MemOPMaxVersion + 2); // Get the MemOPSize range information from option MemOPSizeRange, getMemOPSizeRangeFromOption(MemOPSizeRange, PreciseRangeStart, PreciseRangeLast); @@ -374,8 +374,8 @@ bool MemOPSizeOpt::perform(MemIntrinsic *MI) { Ctx, Twine("MemOP.Case.") + Twine(SizeId), &Func, DefaultBB); Instruction *NewInst = MI->clone(); // Fix the argument. 
- MemIntrinsic * MemI = dyn_cast(NewInst); - IntegerType *SizeType = dyn_cast(MemI->getLength()->getType()); + auto *MemI = cast(NewInst); + auto *SizeType = dyn_cast(MemI->getLength()->getType()); assert(SizeType && "Expected integer type size argument."); ConstantInt *CaseSizeId = ConstantInt::get(SizeType, SizeId); MemI->setLength(CaseSizeId); diff --git a/lib/Transforms/Instrumentation/SanitizerCoverage.cpp b/lib/Transforms/Instrumentation/SanitizerCoverage.cpp index ca0cb4bdbe84..f8fa9cad03b8 100644 --- a/lib/Transforms/Instrumentation/SanitizerCoverage.cpp +++ b/lib/Transforms/Instrumentation/SanitizerCoverage.cpp @@ -10,6 +10,7 @@ // //===----------------------------------------------------------------------===// +#include "llvm/Transforms/Instrumentation/SanitizerCoverage.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/SmallVector.h" #include "llvm/Analysis/EHPersonalities.h" @@ -176,24 +177,21 @@ SanitizerCoverageOptions OverrideFromCL(SanitizerCoverageOptions Options) { return Options; } -class SanitizerCoverageModule : public ModulePass { +using DomTreeCallback = function_ref; +using PostDomTreeCallback = + function_ref; + +class ModuleSanitizerCoverage { public: - SanitizerCoverageModule( + ModuleSanitizerCoverage( const SanitizerCoverageOptions &Options = SanitizerCoverageOptions()) - : ModulePass(ID), Options(OverrideFromCL(Options)) { - initializeSanitizerCoverageModulePass(*PassRegistry::getPassRegistry()); - } - bool runOnModule(Module &M) override; - bool runOnFunction(Function &F); - static char ID; // Pass identification, replacement for typeid - StringRef getPassName() const override { return "SanitizerCoverageModule"; } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired(); - AU.addRequired(); - } + : Options(OverrideFromCL(Options)) {} + bool instrumentModule(Module &M, DomTreeCallback DTCallback, + PostDomTreeCallback PDTCallback); private: + void instrumentFunction(Function &F, DomTreeCallback DTCallback, + PostDomTreeCallback PDTCallback); void InjectCoverageForIndirectCalls(Function &F, ArrayRef IndirCalls); void InjectTraceForCmp(Function &F, ArrayRef CmpTraceTargets); @@ -252,10 +250,57 @@ private: SanitizerCoverageOptions Options; }; +class ModuleSanitizerCoverageLegacyPass : public ModulePass { +public: + ModuleSanitizerCoverageLegacyPass( + const SanitizerCoverageOptions &Options = SanitizerCoverageOptions()) + : ModulePass(ID), Options(Options) { + initializeModuleSanitizerCoverageLegacyPassPass( + *PassRegistry::getPassRegistry()); + } + bool runOnModule(Module &M) override { + ModuleSanitizerCoverage ModuleSancov(Options); + auto DTCallback = [this](Function &F) -> const DominatorTree * { + return &this->getAnalysis(F).getDomTree(); + }; + auto PDTCallback = [this](Function &F) -> const PostDominatorTree * { + return &this->getAnalysis(F) + .getPostDomTree(); + }; + return ModuleSancov.instrumentModule(M, DTCallback, PDTCallback); + } + + static char ID; // Pass identification, replacement for typeid + StringRef getPassName() const override { return "ModuleSanitizerCoverage"; } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired(); + AU.addRequired(); + } + +private: + SanitizerCoverageOptions Options; +}; + } // namespace +PreservedAnalyses ModuleSanitizerCoveragePass::run(Module &M, + ModuleAnalysisManager &MAM) { + ModuleSanitizerCoverage ModuleSancov(Options); + auto &FAM = MAM.getResult(M).getManager(); + auto DTCallback = [&FAM](Function &F) -> const DominatorTree * { + return 
&FAM.getResult(F); + }; + auto PDTCallback = [&FAM](Function &F) -> const PostDominatorTree * { + return &FAM.getResult(F); + }; + if (ModuleSancov.instrumentModule(M, DTCallback, PDTCallback)) + return PreservedAnalyses::none(); + return PreservedAnalyses::all(); +} + std::pair -SanitizerCoverageModule::CreateSecStartEnd(Module &M, const char *Section, +ModuleSanitizerCoverage::CreateSecStartEnd(Module &M, const char *Section, Type *Ty) { GlobalVariable *SecStart = new GlobalVariable(M, Ty, false, GlobalVariable::ExternalLinkage, nullptr, @@ -278,7 +323,7 @@ SanitizerCoverageModule::CreateSecStartEnd(Module &M, const char *Section, return std::make_pair(IRB.CreatePointerCast(GEP, Ty), SecEndPtr); } -Function *SanitizerCoverageModule::CreateInitCallsForSections( +Function *ModuleSanitizerCoverage::CreateInitCallsForSections( Module &M, const char *CtorName, const char *InitFunctionName, Type *Ty, const char *Section) { auto SecStartEnd = CreateSecStartEnd(M, Section, Ty); @@ -310,7 +355,8 @@ Function *SanitizerCoverageModule::CreateInitCallsForSections( return CtorFunc; } -bool SanitizerCoverageModule::runOnModule(Module &M) { +bool ModuleSanitizerCoverage::instrumentModule( + Module &M, DomTreeCallback DTCallback, PostDomTreeCallback PDTCallback) { if (Options.CoverageType == SanitizerCoverageOptions::SCK_None) return false; C = &(M.getContext()); @@ -403,7 +449,7 @@ bool SanitizerCoverageModule::runOnModule(Module &M) { M.getOrInsertFunction(SanCovTracePCGuardName, VoidTy, Int32PtrTy); for (auto &F : M) - runOnFunction(F); + instrumentFunction(F, DTCallback, PDTCallback); Function *Ctor = nullptr; @@ -518,29 +564,30 @@ static bool IsInterestingCmp(ICmpInst *CMP, const DominatorTree *DT, return true; } -bool SanitizerCoverageModule::runOnFunction(Function &F) { +void ModuleSanitizerCoverage::instrumentFunction( + Function &F, DomTreeCallback DTCallback, PostDomTreeCallback PDTCallback) { if (F.empty()) - return false; + return; if (F.getName().find(".module_ctor") != std::string::npos) - return false; // Should not instrument sanitizer init functions. + return; // Should not instrument sanitizer init functions. if (F.getName().startswith("__sanitizer_")) - return false; // Don't instrument __sanitizer_* callbacks. + return; // Don't instrument __sanitizer_* callbacks. // Don't touch available_externally functions, their actual body is elewhere. if (F.getLinkage() == GlobalValue::AvailableExternallyLinkage) - return false; + return; // Don't instrument MSVC CRT configuration helpers. They may run before normal // initialization. if (F.getName() == "__local_stdio_printf_options" || F.getName() == "__local_stdio_scanf_options") - return false; + return; if (isa(F.getEntryBlock().getTerminator())) - return false; + return; // Don't instrument functions using SEH for now. Splitting basic blocks like // we do for coverage breaks WinEHPrepare. // FIXME: Remove this when SEH no longer uses landingpad pattern matching. 
if (F.hasPersonalityFn() && isAsynchronousEHPersonality(classifyEHPersonality(F.getPersonalityFn()))) - return false; + return; if (Options.CoverageType >= SanitizerCoverageOptions::SCK_Edge) SplitAllCriticalEdges(F, CriticalEdgeSplittingOptions().setIgnoreUnreachableDests()); SmallVector IndirCalls; @@ -550,10 +597,8 @@ bool SanitizerCoverageModule::runOnFunction(Function &F) { SmallVector DivTraceTargets; SmallVector GepTraceTargets; - const DominatorTree *DT = - &getAnalysis(F).getDomTree(); - const PostDominatorTree *PDT = - &getAnalysis(F).getPostDomTree(); + const DominatorTree *DT = DTCallback(F); + const PostDominatorTree *PDT = PDTCallback(F); bool IsLeafFunc = true; for (auto &BB : F) { @@ -593,10 +638,9 @@ bool SanitizerCoverageModule::runOnFunction(Function &F) { InjectTraceForSwitch(F, SwitchTraceTargets); InjectTraceForDiv(F, DivTraceTargets); InjectTraceForGep(F, GepTraceTargets); - return true; } -GlobalVariable *SanitizerCoverageModule::CreateFunctionLocalArrayInSection( +GlobalVariable *ModuleSanitizerCoverage::CreateFunctionLocalArrayInSection( size_t NumElements, Function &F, Type *Ty, const char *Section) { ArrayType *ArrayTy = ArrayType::get(Ty, NumElements); auto Array = new GlobalVariable( @@ -608,8 +652,9 @@ GlobalVariable *SanitizerCoverageModule::CreateFunctionLocalArrayInSection( GetOrCreateFunctionComdat(F, TargetTriple, CurModuleUniqueId)) Array->setComdat(Comdat); Array->setSection(getSectionName(Section)); - Array->setAlignment(Ty->isPointerTy() ? DL->getPointerSize() - : Ty->getPrimitiveSizeInBits() / 8); + Array->setAlignment(Align(Ty->isPointerTy() + ? DL->getPointerSize() + : Ty->getPrimitiveSizeInBits() / 8)); GlobalsToAppendToUsed.push_back(Array); GlobalsToAppendToCompilerUsed.push_back(Array); MDNode *MD = MDNode::get(F.getContext(), ValueAsMetadata::get(&F)); @@ -619,7 +664,7 @@ GlobalVariable *SanitizerCoverageModule::CreateFunctionLocalArrayInSection( } GlobalVariable * -SanitizerCoverageModule::CreatePCArray(Function &F, +ModuleSanitizerCoverage::CreatePCArray(Function &F, ArrayRef AllBlocks) { size_t N = AllBlocks.size(); assert(N); @@ -646,7 +691,7 @@ SanitizerCoverageModule::CreatePCArray(Function &F, return PCArray; } -void SanitizerCoverageModule::CreateFunctionLocalArrays( +void ModuleSanitizerCoverage::CreateFunctionLocalArrays( Function &F, ArrayRef AllBlocks) { if (Options.TracePCGuard) FunctionGuardArray = CreateFunctionLocalArrayInSection( @@ -660,7 +705,7 @@ void SanitizerCoverageModule::CreateFunctionLocalArrays( FunctionPCsArray = CreatePCArray(F, AllBlocks); } -bool SanitizerCoverageModule::InjectCoverage(Function &F, +bool ModuleSanitizerCoverage::InjectCoverage(Function &F, ArrayRef AllBlocks, bool IsLeafFunc) { if (AllBlocks.empty()) return false; @@ -677,7 +722,7 @@ bool SanitizerCoverageModule::InjectCoverage(Function &F, // The cache is used to speed up recording the caller-callee pairs. // The address of the caller is passed implicitly via caller PC. // CacheSize is encoded in the name of the run-time function. -void SanitizerCoverageModule::InjectCoverageForIndirectCalls( +void ModuleSanitizerCoverage::InjectCoverageForIndirectCalls( Function &F, ArrayRef IndirCalls) { if (IndirCalls.empty()) return; @@ -696,7 +741,7 @@ void SanitizerCoverageModule::InjectCoverageForIndirectCalls( // __sanitizer_cov_trace_switch(CondValue, // {NumCases, ValueSizeInBits, Case0Value, Case1Value, Case2Value, ... 
}) -void SanitizerCoverageModule::InjectTraceForSwitch( +void ModuleSanitizerCoverage::InjectTraceForSwitch( Function &, ArrayRef SwitchTraceTargets) { for (auto I : SwitchTraceTargets) { if (SwitchInst *SI = dyn_cast(I)) { @@ -735,7 +780,7 @@ void SanitizerCoverageModule::InjectTraceForSwitch( } } -void SanitizerCoverageModule::InjectTraceForDiv( +void ModuleSanitizerCoverage::InjectTraceForDiv( Function &, ArrayRef DivTraceTargets) { for (auto BO : DivTraceTargets) { IRBuilder<> IRB(BO); @@ -753,7 +798,7 @@ void SanitizerCoverageModule::InjectTraceForDiv( } } -void SanitizerCoverageModule::InjectTraceForGep( +void ModuleSanitizerCoverage::InjectTraceForGep( Function &, ArrayRef GepTraceTargets) { for (auto GEP : GepTraceTargets) { IRBuilder<> IRB(GEP); @@ -764,7 +809,7 @@ void SanitizerCoverageModule::InjectTraceForGep( } } -void SanitizerCoverageModule::InjectTraceForCmp( +void ModuleSanitizerCoverage::InjectTraceForCmp( Function &, ArrayRef CmpTraceTargets) { for (auto I : CmpTraceTargets) { if (ICmpInst *ICMP = dyn_cast(I)) { @@ -799,7 +844,7 @@ void SanitizerCoverageModule::InjectTraceForCmp( } } -void SanitizerCoverageModule::InjectCoverageAtBlock(Function &F, BasicBlock &BB, +void ModuleSanitizerCoverage::InjectCoverageAtBlock(Function &F, BasicBlock &BB, size_t Idx, bool IsLeafFunc) { BasicBlock::iterator IP = BB.getFirstInsertionPt(); @@ -842,8 +887,10 @@ void SanitizerCoverageModule::InjectCoverageAtBlock(Function &F, BasicBlock &BB, } if (Options.StackDepth && IsEntryBB && !IsLeafFunc) { // Check stack depth. If it's the deepest so far, record it. - Function *GetFrameAddr = - Intrinsic::getDeclaration(F.getParent(), Intrinsic::frameaddress); + Module *M = F.getParent(); + Function *GetFrameAddr = Intrinsic::getDeclaration( + M, Intrinsic::frameaddress, + IRB.getInt8PtrTy(M->getDataLayout().getAllocaAddrSpace())); auto FrameAddrPtr = IRB.CreateCall(GetFrameAddr, {Constant::getNullValue(Int32Ty)}); auto FrameAddrInt = IRB.CreatePtrToInt(FrameAddrPtr, IntptrTy); @@ -858,7 +905,7 @@ void SanitizerCoverageModule::InjectCoverageAtBlock(Function &F, BasicBlock &BB, } std::string -SanitizerCoverageModule::getSectionName(const std::string &Section) const { +ModuleSanitizerCoverage::getSectionName(const std::string &Section) const { if (TargetTriple.isOSBinFormatCOFF()) { if (Section == SanCovCountersSectionName) return ".SCOV$CM"; @@ -872,32 +919,29 @@ SanitizerCoverageModule::getSectionName(const std::string &Section) const { } std::string -SanitizerCoverageModule::getSectionStart(const std::string &Section) const { +ModuleSanitizerCoverage::getSectionStart(const std::string &Section) const { if (TargetTriple.isOSBinFormatMachO()) return "\1section$start$__DATA$__" + Section; return "__start___" + Section; } std::string -SanitizerCoverageModule::getSectionEnd(const std::string &Section) const { +ModuleSanitizerCoverage::getSectionEnd(const std::string &Section) const { if (TargetTriple.isOSBinFormatMachO()) return "\1section$end$__DATA$__" + Section; return "__stop___" + Section; } - -char SanitizerCoverageModule::ID = 0; -INITIALIZE_PASS_BEGIN(SanitizerCoverageModule, "sancov", - "SanitizerCoverage: TODO." 
- "ModulePass", - false, false) +char ModuleSanitizerCoverageLegacyPass::ID = 0; +INITIALIZE_PASS_BEGIN(ModuleSanitizerCoverageLegacyPass, "sancov", + "Pass for instrumenting coverage on functions", false, + false) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass) -INITIALIZE_PASS_END(SanitizerCoverageModule, "sancov", - "SanitizerCoverage: TODO." - "ModulePass", - false, false) -ModulePass *llvm::createSanitizerCoverageModulePass( +INITIALIZE_PASS_END(ModuleSanitizerCoverageLegacyPass, "sancov", + "Pass for instrumenting coverage on functions", false, + false) +ModulePass *llvm::createModuleSanitizerCoverageLegacyPassPass( const SanitizerCoverageOptions &Options) { - return new SanitizerCoverageModule(Options); + return new ModuleSanitizerCoverageLegacyPass(Options); } diff --git a/lib/Transforms/Instrumentation/ThreadSanitizer.cpp b/lib/Transforms/Instrumentation/ThreadSanitizer.cpp index 5be13fa745cb..ac274a155a80 100644 --- a/lib/Transforms/Instrumentation/ThreadSanitizer.cpp +++ b/lib/Transforms/Instrumentation/ThreadSanitizer.cpp @@ -92,11 +92,10 @@ namespace { /// ensures the __tsan_init function is in the list of global constructors for /// the module. struct ThreadSanitizer { - ThreadSanitizer(Module &M); bool sanitizeFunction(Function &F, const TargetLibraryInfo &TLI); private: - void initializeCallbacks(Module &M); + void initialize(Module &M); bool instrumentLoadOrStore(Instruction *I, const DataLayout &DL); bool instrumentAtomic(Instruction *I, const DataLayout &DL); bool instrumentMemIntrinsic(Instruction *I); @@ -108,8 +107,6 @@ private: void InsertRuntimeIgnores(Function &F); Type *IntptrTy; - IntegerType *OrdTy; - // Callbacks to run-time library are computed in doInitialization. FunctionCallee TsanFuncEntry; FunctionCallee TsanFuncExit; FunctionCallee TsanIgnoreBegin; @@ -130,7 +127,6 @@ private: FunctionCallee TsanVptrUpdate; FunctionCallee TsanVptrLoad; FunctionCallee MemmoveFn, MemcpyFn, MemsetFn; - Function *TsanCtorFunction; }; struct ThreadSanitizerLegacyPass : FunctionPass { @@ -143,16 +139,32 @@ struct ThreadSanitizerLegacyPass : FunctionPass { private: Optional TSan; }; + +void insertModuleCtor(Module &M) { + getOrCreateSanitizerCtorAndInitFunctions( + M, kTsanModuleCtorName, kTsanInitName, /*InitArgTypes=*/{}, + /*InitArgs=*/{}, + // This callback is invoked when the functions are created the first + // time. 
Hook them into the global ctors list in that case: + [&](Function *Ctor, FunctionCallee) { appendToGlobalCtors(M, Ctor, 0); }); +} + } // namespace PreservedAnalyses ThreadSanitizerPass::run(Function &F, FunctionAnalysisManager &FAM) { - ThreadSanitizer TSan(*F.getParent()); + ThreadSanitizer TSan; if (TSan.sanitizeFunction(F, FAM.getResult(F))) return PreservedAnalyses::none(); return PreservedAnalyses::all(); } +PreservedAnalyses ThreadSanitizerPass::run(Module &M, + ModuleAnalysisManager &MAM) { + insertModuleCtor(M); + return PreservedAnalyses::none(); +} + char ThreadSanitizerLegacyPass::ID = 0; INITIALIZE_PASS_BEGIN(ThreadSanitizerLegacyPass, "tsan", "ThreadSanitizer: detects data races.", false, false) @@ -169,12 +181,13 @@ void ThreadSanitizerLegacyPass::getAnalysisUsage(AnalysisUsage &AU) const { } bool ThreadSanitizerLegacyPass::doInitialization(Module &M) { - TSan.emplace(M); + insertModuleCtor(M); + TSan.emplace(); return true; } bool ThreadSanitizerLegacyPass::runOnFunction(Function &F) { - auto &TLI = getAnalysis().getTLI(); + auto &TLI = getAnalysis().getTLI(F); TSan->sanitizeFunction(F, TLI); return true; } @@ -183,7 +196,10 @@ FunctionPass *llvm::createThreadSanitizerLegacyPassPass() { return new ThreadSanitizerLegacyPass(); } -void ThreadSanitizer::initializeCallbacks(Module &M) { +void ThreadSanitizer::initialize(Module &M) { + const DataLayout &DL = M.getDataLayout(); + IntptrTy = DL.getIntPtrType(M.getContext()); + IRBuilder<> IRB(M.getContext()); AttributeList Attr; Attr = Attr.addAttribute(M.getContext(), AttributeList::FunctionIndex, @@ -197,7 +213,7 @@ void ThreadSanitizer::initializeCallbacks(Module &M) { IRB.getVoidTy()); TsanIgnoreEnd = M.getOrInsertFunction("__tsan_ignore_thread_end", Attr, IRB.getVoidTy()); - OrdTy = IRB.getInt32Ty(); + IntegerType *OrdTy = IRB.getInt32Ty(); for (size_t i = 0; i < kNumberOfAccessSizes; ++i) { const unsigned ByteSize = 1U << i; const unsigned BitSize = ByteSize * 8; @@ -280,20 +296,6 @@ void ThreadSanitizer::initializeCallbacks(Module &M) { IRB.getInt8PtrTy(), IRB.getInt32Ty(), IntptrTy); } -ThreadSanitizer::ThreadSanitizer(Module &M) { - const DataLayout &DL = M.getDataLayout(); - IntptrTy = DL.getIntPtrType(M.getContext()); - std::tie(TsanCtorFunction, std::ignore) = - getOrCreateSanitizerCtorAndInitFunctions( - M, kTsanModuleCtorName, kTsanInitName, /*InitArgTypes=*/{}, - /*InitArgs=*/{}, - // This callback is invoked when the functions are created the first - // time. Hook them into the global ctors list in that case: - [&](Function *Ctor, FunctionCallee) { - appendToGlobalCtors(M, Ctor, 0); - }); -} - static bool isVtableAccess(Instruction *I) { if (MDNode *Tag = I->getMetadata(LLVMContext::MD_tbaa)) return Tag->isTBAAVtableAccess(); @@ -436,9 +438,9 @@ bool ThreadSanitizer::sanitizeFunction(Function &F, const TargetLibraryInfo &TLI) { // This is required to prevent instrumenting call to __tsan_init from within // the module constructor. 
- if (&F == TsanCtorFunction) + if (F.getName() == kTsanModuleCtorName) return false; - initializeCallbacks(*F.getParent()); + initialize(*F.getParent()); SmallVector AllLoadsAndStores; SmallVector LocalLoadsAndStores; SmallVector AtomicAccesses; diff --git a/lib/Transforms/Instrumentation/ValueProfileCollector.cpp b/lib/Transforms/Instrumentation/ValueProfileCollector.cpp new file mode 100644 index 000000000000..604726d4f40f --- /dev/null +++ b/lib/Transforms/Instrumentation/ValueProfileCollector.cpp @@ -0,0 +1,78 @@ +//===- ValueProfileCollector.cpp - determine what to value profile --------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// The implementation of the ValueProfileCollector via ValueProfileCollectorImpl +// +//===----------------------------------------------------------------------===// + +#include "ValueProfilePlugins.inc" +#include "llvm/IR/InstIterator.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/InitializePasses.h" + +#include + +using namespace llvm; + +namespace { + +/// A plugin-based class that takes an arbitrary number of Plugin types. +/// Each plugin type must satisfy the following API: +/// 1) the constructor must take a `Function &f`. Typically, the plugin would +/// scan the function looking for candidates. +/// 2) contain a member function with the following signature and name: +/// void run(std::vector &Candidates); +/// such that the plugin would append its result into the vector parameter. +/// +/// Plugins are defined in ValueProfilePlugins.inc +template class PluginChain; + +/// The type PluginChainFinal is the final chain of plugins that will be used by +/// ValueProfileCollectorImpl. +using PluginChainFinal = PluginChain; + +template <> class PluginChain<> { +public: + PluginChain(Function &F) {} + void get(InstrProfValueKind K, std::vector &Candidates) {} +}; + +template +class PluginChain : public PluginChain { + PluginT Plugin; + using Base = PluginChain; + +public: + PluginChain(Function &F) : PluginChain(F), Plugin(F) {} + + void get(InstrProfValueKind K, std::vector &Candidates) { + if (K == PluginT::Kind) + Plugin.run(Candidates); + Base::get(K, Candidates); + } +}; + +} // end anonymous namespace + +/// ValueProfileCollectorImpl inherits the API of PluginChainFinal. +class ValueProfileCollector::ValueProfileCollectorImpl : public PluginChainFinal { +public: + using PluginChainFinal::PluginChainFinal; +}; + +ValueProfileCollector::ValueProfileCollector(Function &F) + : PImpl(new ValueProfileCollectorImpl(F)) {} + +ValueProfileCollector::~ValueProfileCollector() = default; + +std::vector +ValueProfileCollector::get(InstrProfValueKind Kind) const { + std::vector Result; + PImpl->get(Kind, Result); + return Result; +} diff --git a/lib/Transforms/Instrumentation/ValueProfileCollector.h b/lib/Transforms/Instrumentation/ValueProfileCollector.h new file mode 100644 index 000000000000..ff883c8d0c77 --- /dev/null +++ b/lib/Transforms/Instrumentation/ValueProfileCollector.h @@ -0,0 +1,79 @@ +//===- ValueProfileCollector.h - determine what to value profile ----------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains a utility class, ValueProfileCollector, that is used to +// determine what kind of llvm::Value's are worth value-profiling, at which +// point in the program, and which instruction holds the Value Profile metadata. +// Currently, the only users of this utility is the PGOInstrumentation[Gen|Use] +// passes. +//===----------------------------------------------------------------------===// + +#ifndef LLVM_ANALYSIS_PROFILE_GEN_ANALYSIS_H +#define LLVM_ANALYSIS_PROFILE_GEN_ANALYSIS_H + +#include "llvm/IR/Function.h" +#include "llvm/IR/PassManager.h" +#include "llvm/Pass.h" +#include "llvm/ProfileData/InstrProf.h" + +namespace llvm { + +/// Utility analysis that determines what values are worth profiling. +/// The actual logic is inside the ValueProfileCollectorImpl, whose job is to +/// populate the Candidates vector. +/// +/// Value profiling an expression means to track the values that this expression +/// takes at runtime and the frequency of each value. +/// It is important to distinguish between two sets of value profiles for a +/// particular expression: +/// 1) The set of values at the point of evaluation. +/// 2) The set of values at the point of use. +/// In some cases, the two sets are identical, but it's not unusual for the two +/// to differ. +/// +/// To elaborate more, consider this C code, and focus on the expression `nn`: +/// void foo(int nn, bool b) { +/// if (b) memcpy(x, y, nn); +/// } +/// The point of evaluation can be as early as the start of the function, and +/// let's say the value profile for `nn` is: +/// total=100; (value,freq) set = {(8,10), (32,50)} +/// The point of use is right before we call memcpy, and since we execute the +/// memcpy conditionally, the value profile of `nn` can be: +/// total=15; (value,freq) set = {(8,10), (4,5)} +/// +/// For this reason, a plugin is responsible for computing the insertion point +/// for each value to be profiled. The `CandidateInfo` structure encapsulates +/// all the information needed for each value profile site. +class ValueProfileCollector { +public: + struct CandidateInfo { + Value *V; // The value to profile. + Instruction *InsertPt; // Insert the VP lib call before this instr. + Instruction *AnnotatedInst; // Where metadata is attached. + }; + + ValueProfileCollector(Function &Fn); + ValueProfileCollector(ValueProfileCollector &&) = delete; + ValueProfileCollector &operator=(ValueProfileCollector &&) = delete; + + ValueProfileCollector(const ValueProfileCollector &) = delete; + ValueProfileCollector &operator=(const ValueProfileCollector &) = delete; + ~ValueProfileCollector(); + + /// returns a list of value profiling candidates of the given kind + std::vector get(InstrProfValueKind Kind) const; + +private: + class ValueProfileCollectorImpl; + std::unique_ptr PImpl; +}; + +} // namespace llvm + +#endif diff --git a/lib/Transforms/Instrumentation/ValueProfilePlugins.inc b/lib/Transforms/Instrumentation/ValueProfilePlugins.inc new file mode 100644 index 000000000000..4cc4c6c848c3 --- /dev/null +++ b/lib/Transforms/Instrumentation/ValueProfilePlugins.inc @@ -0,0 +1,75 @@ +//=== ValueProfilePlugins.inc - set of plugins used by ValueProfileCollector =// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains a set of plugin classes used in ValueProfileCollectorImpl. +// Each plugin is responsible for collecting Value Profiling candidates for a +// particular optimization. +// Each plugin must satisfy the interface described in ValueProfileCollector.cpp +// +//===----------------------------------------------------------------------===// + +#include "ValueProfileCollector.h" +#include "llvm/Analysis/IndirectCallVisitor.h" +#include "llvm/IR/InstVisitor.h" + +using namespace llvm; +using CandidateInfo = ValueProfileCollector::CandidateInfo; + +///--------------------------- MemIntrinsicPlugin ------------------------------ +class MemIntrinsicPlugin : public InstVisitor { + Function &F; + std::vector *Candidates; + +public: + static constexpr InstrProfValueKind Kind = IPVK_MemOPSize; + + MemIntrinsicPlugin(Function &Fn) : F(Fn), Candidates(nullptr) {} + + void run(std::vector &Cs) { + Candidates = &Cs; + visit(F); + Candidates = nullptr; + } + void visitMemIntrinsic(MemIntrinsic &MI) { + Value *Length = MI.getLength(); + // Not instrument constant length calls. + if (dyn_cast(Length)) + return; + + Instruction *InsertPt = &MI; + Instruction *AnnotatedInst = &MI; + Candidates->emplace_back(CandidateInfo{Length, InsertPt, AnnotatedInst}); + } +}; + +///------------------------ IndirectCallPromotionPlugin ------------------------ +class IndirectCallPromotionPlugin { + Function &F; + +public: + static constexpr InstrProfValueKind Kind = IPVK_IndirectCallTarget; + + IndirectCallPromotionPlugin(Function &Fn) : F(Fn) {} + + void run(std::vector &Candidates) { + std::vector Result = findIndirectCalls(F); + for (Instruction *I : Result) { + Value *Callee = CallSite(I).getCalledValue(); + Instruction *InsertPt = I; + Instruction *AnnotatedInst = I; + Candidates.emplace_back(CandidateInfo{Callee, InsertPt, AnnotatedInst}); + } + } +}; + +///----------------------- Registration of the plugins ------------------------- +/// For now, registering a plugin with the ValueProfileCollector is done by +/// adding the plugin type to the VP_PLUGIN_LIST macro. 
+#define VP_PLUGIN_LIST \ + MemIntrinsicPlugin, \ + IndirectCallPromotionPlugin diff --git a/lib/Transforms/ObjCARC/PtrState.cpp b/lib/Transforms/ObjCARC/PtrState.cpp index 3243481dee0d..26dd416d6184 100644 --- a/lib/Transforms/ObjCARC/PtrState.cpp +++ b/lib/Transforms/ObjCARC/PtrState.cpp @@ -275,6 +275,10 @@ void BottomUpPtrState::HandlePotentialUse(BasicBlock *BB, Instruction *Inst, } else { InsertAfter = std::next(Inst->getIterator()); } + + if (InsertAfter != BB->end()) + InsertAfter = skipDebugIntrinsics(InsertAfter); + InsertReverseInsertPt(&*InsertAfter); }; diff --git a/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp b/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp index de9a62e88c27..0e9f03a06061 100644 --- a/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp +++ b/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp @@ -93,9 +93,7 @@ static unsigned getNewAlignmentDiff(const SCEV *DiffSCEV, const SCEV *AlignSCEV, ScalarEvolution *SE) { // DiffUnits = Diff % int64_t(Alignment) - const SCEV *DiffAlignDiv = SE->getUDivExpr(DiffSCEV, AlignSCEV); - const SCEV *DiffAlign = SE->getMulExpr(DiffAlignDiv, AlignSCEV); - const SCEV *DiffUnitsSCEV = SE->getMinusSCEV(DiffAlign, DiffSCEV); + const SCEV *DiffUnitsSCEV = SE->getURemExpr(DiffSCEV, AlignSCEV); LLVM_DEBUG(dbgs() << "\talignment relative to " << *AlignSCEV << " is " << *DiffUnitsSCEV << " (diff: " << *DiffSCEV << ")\n"); @@ -323,7 +321,7 @@ bool AlignmentFromAssumptionsPass::processAssumption(CallInst *ACall) { LI->getPointerOperand(), SE); if (NewAlignment > LI->getAlignment()) { - LI->setAlignment(NewAlignment); + LI->setAlignment(MaybeAlign(NewAlignment)); ++NumLoadAlignChanged; } } else if (StoreInst *SI = dyn_cast(J)) { @@ -331,7 +329,7 @@ bool AlignmentFromAssumptionsPass::processAssumption(CallInst *ACall) { SI->getPointerOperand(), SE); if (NewAlignment > SI->getAlignment()) { - SI->setAlignment(NewAlignment); + SI->setAlignment(MaybeAlign(NewAlignment)); ++NumStoreAlignChanged; } } else if (MemIntrinsic *MI = dyn_cast(J)) { diff --git a/lib/Transforms/Scalar/CallSiteSplitting.cpp b/lib/Transforms/Scalar/CallSiteSplitting.cpp index 3519b000a33f..c3fba923104f 100644 --- a/lib/Transforms/Scalar/CallSiteSplitting.cpp +++ b/lib/Transforms/Scalar/CallSiteSplitting.cpp @@ -562,7 +562,7 @@ struct CallSiteSplittingLegacyPass : public FunctionPass { if (skipFunction(F)) return false; - auto &TLI = getAnalysis().getTLI(); + auto &TLI = getAnalysis().getTLI(F); auto &TTI = getAnalysis().getTTI(F); auto &DT = getAnalysis().getDomTree(); return doCallSiteSplitting(F, TLI, TTI, DT); diff --git a/lib/Transforms/Scalar/ConstantHoisting.cpp b/lib/Transforms/Scalar/ConstantHoisting.cpp index 98243a23f1ef..9f340afbf7c2 100644 --- a/lib/Transforms/Scalar/ConstantHoisting.cpp +++ b/lib/Transforms/Scalar/ConstantHoisting.cpp @@ -204,7 +204,7 @@ Instruction *ConstantHoistingPass::findMatInsertPt(Instruction *Inst, /// set found in \p BBs. static void findBestInsertionSet(DominatorTree &DT, BlockFrequencyInfo &BFI, BasicBlock *Entry, - SmallPtrSet &BBs) { + SetVector &BBs) { assert(!BBs.count(Entry) && "Assume Entry is not in BBs"); // Nodes on the current path to the root. SmallPtrSet Path; @@ -257,7 +257,7 @@ static void findBestInsertionSet(DominatorTree &DT, BlockFrequencyInfo &BFI, // Visit Orders in bottom-up order. using InsertPtsCostPair = - std::pair, BlockFrequency>; + std::pair, BlockFrequency>; // InsertPtsMap is a map from a BB to the best insertion points for the // subtree of BB (subtree not including the BB itself). 
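The ConstantHoisting hunks above swap SmallPtrSet for SetVector when collecting candidate blocks and insertion points. Besides enabling the pop_back_val() calls used further down, SetVector keeps a deterministic, insertion-ordered view of its elements; the stand-alone snippet below (not from the patch) shows that behavior.

#include "llvm/ADT/SetVector.h"
#include <cstdio>

int main() {
  llvm::SetVector<int> Blocks;
  for (int V : {3, 1, 3, 2})
    Blocks.insert(V);                 // duplicates are silently ignored
  int Last = Blocks.pop_back_val();   // removes and returns the last element (2)
  std::printf("popped %d, %zu left:", Last, Blocks.size());
  for (int V : Blocks)
    std::printf(" %d", V);            // prints 3 1, in insertion order
  std::printf("\n");
  return 0;
}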
@@ -266,7 +266,7 @@ static void findBestInsertionSet(DominatorTree &DT, BlockFrequencyInfo &BFI, for (auto RIt = Orders.rbegin(); RIt != Orders.rend(); RIt++) { BasicBlock *Node = *RIt; bool NodeInBBs = BBs.count(Node); - SmallPtrSet &InsertPts = InsertPtsMap[Node].first; + auto &InsertPts = InsertPtsMap[Node].first; BlockFrequency &InsertPtsFreq = InsertPtsMap[Node].second; // Return the optimal insert points in BBs. @@ -283,7 +283,7 @@ static void findBestInsertionSet(DominatorTree &DT, BlockFrequencyInfo &BFI, BasicBlock *Parent = DT.getNode(Node)->getIDom()->getBlock(); // Initially, ParentInsertPts is empty and ParentPtsFreq is 0. Every child // will update its parent's ParentInsertPts and ParentPtsFreq. - SmallPtrSet &ParentInsertPts = InsertPtsMap[Parent].first; + auto &ParentInsertPts = InsertPtsMap[Parent].first; BlockFrequency &ParentPtsFreq = InsertPtsMap[Parent].second; // Choose to insert in Node or in subtree of Node. // Don't hoist to EHPad because we may not find a proper place to insert @@ -305,12 +305,12 @@ static void findBestInsertionSet(DominatorTree &DT, BlockFrequencyInfo &BFI, } /// Find an insertion point that dominates all uses. -SmallPtrSet ConstantHoistingPass::findConstantInsertionPoint( +SetVector ConstantHoistingPass::findConstantInsertionPoint( const ConstantInfo &ConstInfo) const { assert(!ConstInfo.RebasedConstants.empty() && "Invalid constant info entry."); // Collect all basic blocks. - SmallPtrSet BBs; - SmallPtrSet InsertPts; + SetVector BBs; + SetVector InsertPts; for (auto const &RCI : ConstInfo.RebasedConstants) for (auto const &U : RCI.Uses) BBs.insert(findMatInsertPt(U.Inst, U.OpndIdx)->getParent()); @@ -333,15 +333,13 @@ SmallPtrSet ConstantHoistingPass::findConstantInsertionPoint( while (BBs.size() >= 2) { BasicBlock *BB, *BB1, *BB2; - BB1 = *BBs.begin(); - BB2 = *std::next(BBs.begin()); + BB1 = BBs.pop_back_val(); + BB2 = BBs.pop_back_val(); BB = DT->findNearestCommonDominator(BB1, BB2); if (BB == Entry) { InsertPts.insert(&Entry->front()); return InsertPts; } - BBs.erase(BB1); - BBs.erase(BB2); BBs.insert(BB); } assert((BBs.size() == 1) && "Expected only one element."); @@ -403,7 +401,7 @@ void ConstantHoistingPass::collectConstantCandidates( return; // Get offset from the base GV. - PointerType *GVPtrTy = dyn_cast(BaseGV->getType()); + PointerType *GVPtrTy = cast(BaseGV->getType()); IntegerType *PtrIntTy = DL->getIntPtrType(*Ctx, GVPtrTy->getAddressSpace()); APInt Offset(DL->getTypeSizeInBits(PtrIntTy), /*val*/0, /*isSigned*/true); auto *GEPO = cast(ConstExpr); @@ -830,7 +828,7 @@ bool ConstantHoistingPass::emitBaseConstants(GlobalVariable *BaseGV) { SmallVectorImpl &ConstInfoVec = BaseGV ? ConstGEPInfoMap[BaseGV] : ConstIntInfoVec; for (auto const &ConstInfo : ConstInfoVec) { - SmallPtrSet IPSet = findConstantInsertionPoint(ConstInfo); + SetVector IPSet = findConstantInsertionPoint(ConstInfo); // We can have an empty set if the function contains unreachable blocks. 
if (IPSet.empty()) continue; diff --git a/lib/Transforms/Scalar/ConstantProp.cpp b/lib/Transforms/Scalar/ConstantProp.cpp index 770321c740a0..e9e6afe3fdd4 100644 --- a/lib/Transforms/Scalar/ConstantProp.cpp +++ b/lib/Transforms/Scalar/ConstantProp.cpp @@ -82,7 +82,7 @@ bool ConstantPropagation::runOnFunction(Function &F) { bool Changed = false; const DataLayout &DL = F.getParent()->getDataLayout(); TargetLibraryInfo *TLI = - &getAnalysis().getTLI(); + &getAnalysis().getTLI(F); while (!WorkList.empty()) { SmallVector NewWorkListVec; diff --git a/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp b/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp index 89497177524f..2ef85268df48 100644 --- a/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp +++ b/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp @@ -62,6 +62,23 @@ STATISTIC(NumSDivs, "Number of sdiv converted to udiv"); STATISTIC(NumUDivs, "Number of udivs whose width was decreased"); STATISTIC(NumAShrs, "Number of ashr converted to lshr"); STATISTIC(NumSRems, "Number of srem converted to urem"); +STATISTIC(NumSExt, "Number of sext converted to zext"); +STATISTIC(NumAnd, "Number of ands removed"); +STATISTIC(NumNW, "Number of no-wrap deductions"); +STATISTIC(NumNSW, "Number of no-signed-wrap deductions"); +STATISTIC(NumNUW, "Number of no-unsigned-wrap deductions"); +STATISTIC(NumAddNW, "Number of no-wrap deductions for add"); +STATISTIC(NumAddNSW, "Number of no-signed-wrap deductions for add"); +STATISTIC(NumAddNUW, "Number of no-unsigned-wrap deductions for add"); +STATISTIC(NumSubNW, "Number of no-wrap deductions for sub"); +STATISTIC(NumSubNSW, "Number of no-signed-wrap deductions for sub"); +STATISTIC(NumSubNUW, "Number of no-unsigned-wrap deductions for sub"); +STATISTIC(NumMulNW, "Number of no-wrap deductions for mul"); +STATISTIC(NumMulNSW, "Number of no-signed-wrap deductions for mul"); +STATISTIC(NumMulNUW, "Number of no-unsigned-wrap deductions for mul"); +STATISTIC(NumShlNW, "Number of no-wrap deductions for shl"); +STATISTIC(NumShlNSW, "Number of no-signed-wrap deductions for shl"); +STATISTIC(NumShlNUW, "Number of no-unsigned-wrap deductions for shl"); STATISTIC(NumOverflows, "Number of overflow checks removed"); STATISTIC(NumSaturating, "Number of saturating arithmetics converted to normal arithmetics"); @@ -85,6 +102,7 @@ namespace { AU.addRequired(); AU.addPreserved(); AU.addPreserved(); + AU.addPreserved(); } }; @@ -416,37 +434,96 @@ static bool willNotOverflow(BinaryOpIntrinsic *BO, LazyValueInfo *LVI) { return NWRegion.contains(LRange); } -static void processOverflowIntrinsic(WithOverflowInst *WO) { - IRBuilder<> B(WO); - Value *NewOp = B.CreateBinOp( - WO->getBinaryOp(), WO->getLHS(), WO->getRHS(), WO->getName()); - // Constant-folding could have happened. 
- if (auto *Inst = dyn_cast(NewOp)) { - if (WO->isSigned()) +static void setDeducedOverflowingFlags(Value *V, Instruction::BinaryOps Opcode, + bool NewNSW, bool NewNUW) { + Statistic *OpcNW, *OpcNSW, *OpcNUW; + switch (Opcode) { + case Instruction::Add: + OpcNW = &NumAddNW; + OpcNSW = &NumAddNSW; + OpcNUW = &NumAddNUW; + break; + case Instruction::Sub: + OpcNW = &NumSubNW; + OpcNSW = &NumSubNSW; + OpcNUW = &NumSubNUW; + break; + case Instruction::Mul: + OpcNW = &NumMulNW; + OpcNSW = &NumMulNSW; + OpcNUW = &NumMulNUW; + break; + case Instruction::Shl: + OpcNW = &NumShlNW; + OpcNSW = &NumShlNSW; + OpcNUW = &NumShlNUW; + break; + default: + llvm_unreachable("Will not be called with other binops"); + } + + auto *Inst = dyn_cast(V); + if (NewNSW) { + ++NumNW; + ++*OpcNW; + ++NumNSW; + ++*OpcNSW; + if (Inst) Inst->setHasNoSignedWrap(); - else + } + if (NewNUW) { + ++NumNW; + ++*OpcNW; + ++NumNUW; + ++*OpcNUW; + if (Inst) Inst->setHasNoUnsignedWrap(); } +} - Value *NewI = B.CreateInsertValue(UndefValue::get(WO->getType()), NewOp, 0); - NewI = B.CreateInsertValue(NewI, ConstantInt::getFalse(WO->getContext()), 1); +static bool processBinOp(BinaryOperator *BinOp, LazyValueInfo *LVI); + +// Rewrite this with.overflow intrinsic as non-overflowing. +static void processOverflowIntrinsic(WithOverflowInst *WO, LazyValueInfo *LVI) { + IRBuilder<> B(WO); + Instruction::BinaryOps Opcode = WO->getBinaryOp(); + bool NSW = WO->isSigned(); + bool NUW = !WO->isSigned(); + + Value *NewOp = + B.CreateBinOp(Opcode, WO->getLHS(), WO->getRHS(), WO->getName()); + setDeducedOverflowingFlags(NewOp, Opcode, NSW, NUW); + + StructType *ST = cast(WO->getType()); + Constant *Struct = ConstantStruct::get(ST, + { UndefValue::get(ST->getElementType(0)), + ConstantInt::getFalse(ST->getElementType(1)) }); + Value *NewI = B.CreateInsertValue(Struct, NewOp, 0); WO->replaceAllUsesWith(NewI); WO->eraseFromParent(); ++NumOverflows; + + // See if we can infer the other no-wrap too. + if (auto *BO = dyn_cast(NewOp)) + processBinOp(BO, LVI); } -static void processSaturatingInst(SaturatingInst *SI) { +static void processSaturatingInst(SaturatingInst *SI, LazyValueInfo *LVI) { + Instruction::BinaryOps Opcode = SI->getBinaryOp(); + bool NSW = SI->isSigned(); + bool NUW = !SI->isSigned(); BinaryOperator *BinOp = BinaryOperator::Create( - SI->getBinaryOp(), SI->getLHS(), SI->getRHS(), SI->getName(), SI); + Opcode, SI->getLHS(), SI->getRHS(), SI->getName(), SI); BinOp->setDebugLoc(SI->getDebugLoc()); - if (SI->isSigned()) - BinOp->setHasNoSignedWrap(); - else - BinOp->setHasNoUnsignedWrap(); + setDeducedOverflowingFlags(BinOp, Opcode, NSW, NUW); SI->replaceAllUsesWith(BinOp); SI->eraseFromParent(); ++NumSaturating; + + // See if we can infer the other no-wrap too. + if (auto *BO = dyn_cast(BinOp)) + processBinOp(BO, LVI); } /// Infer nonnull attributes for the arguments at the specified callsite. 
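The no-wrap deductions above, like the processBinOp logic that follows, rest on ConstantRange::makeGuaranteedNoWrapRegion: when the range of one operand lies entirely inside the region guaranteed not to wrap for the other operand's range, the nsw/nuw flag can be added. The following is a minimal, self-contained sketch of that check using made-up ranges in place of LazyValueInfo results.

#include "llvm/ADT/APInt.h"
#include "llvm/IR/ConstantRange.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Operator.h"
#include <cstdio>

int main() {
  using namespace llvm;
  using OBO = OverflowingBinaryOperator;
  ConstantRange LRange(APInt(32, 0), APInt(32, 100)); // LHS known in [0, 100)
  ConstantRange RRange(APInt(32, 1), APInt(32, 10));  // RHS known in [1, 10)
  ConstantRange NSWRegion = ConstantRange::makeGuaranteedNoWrapRegion(
      Instruction::Add, RRange, OBO::NoSignedWrap);
  // If every possible LHS value sits inside the no-wrap region, the add can
  // safely be marked nsw.
  std::printf("add is provably nsw: %d\n", NSWRegion.contains(LRange) ? 1 : 0);
  return 0;
}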
@@ -456,14 +533,14 @@ static bool processCallSite(CallSite CS, LazyValueInfo *LVI) { if (auto *WO = dyn_cast(CS.getInstruction())) { if (WO->getLHS()->getType()->isIntegerTy() && willNotOverflow(WO, LVI)) { - processOverflowIntrinsic(WO); + processOverflowIntrinsic(WO, LVI); return true; } } if (auto *SI = dyn_cast(CS.getInstruction())) { if (SI->getType()->isIntegerTy() && willNotOverflow(SI, LVI)) { - processSaturatingInst(SI); + processSaturatingInst(SI, LVI); return true; } } @@ -632,6 +709,27 @@ static bool processAShr(BinaryOperator *SDI, LazyValueInfo *LVI) { return true; } +static bool processSExt(SExtInst *SDI, LazyValueInfo *LVI) { + if (SDI->getType()->isVectorTy()) + return false; + + Value *Base = SDI->getOperand(0); + + Constant *Zero = ConstantInt::get(Base->getType(), 0); + if (LVI->getPredicateAt(ICmpInst::ICMP_SGE, Base, Zero, SDI) != + LazyValueInfo::True) + return false; + + ++NumSExt; + auto *ZExt = + CastInst::CreateZExtOrBitCast(Base, SDI->getType(), SDI->getName(), SDI); + ZExt->setDebugLoc(SDI->getDebugLoc()); + SDI->replaceAllUsesWith(ZExt); + SDI->eraseFromParent(); + + return true; +} + static bool processBinOp(BinaryOperator *BinOp, LazyValueInfo *LVI) { using OBO = OverflowingBinaryOperator; @@ -648,6 +746,7 @@ static bool processBinOp(BinaryOperator *BinOp, LazyValueInfo *LVI) { BasicBlock *BB = BinOp->getParent(); + Instruction::BinaryOps Opcode = BinOp->getOpcode(); Value *LHS = BinOp->getOperand(0); Value *RHS = BinOp->getOperand(1); @@ -655,24 +754,48 @@ static bool processBinOp(BinaryOperator *BinOp, LazyValueInfo *LVI) { ConstantRange RRange = LVI->getConstantRange(RHS, BB, BinOp); bool Changed = false; + bool NewNUW = false, NewNSW = false; if (!NUW) { ConstantRange NUWRange = ConstantRange::makeGuaranteedNoWrapRegion( - BinOp->getOpcode(), RRange, OBO::NoUnsignedWrap); - bool NewNUW = NUWRange.contains(LRange); - BinOp->setHasNoUnsignedWrap(NewNUW); + Opcode, RRange, OBO::NoUnsignedWrap); + NewNUW = NUWRange.contains(LRange); Changed |= NewNUW; } if (!NSW) { ConstantRange NSWRange = ConstantRange::makeGuaranteedNoWrapRegion( - BinOp->getOpcode(), RRange, OBO::NoSignedWrap); - bool NewNSW = NSWRange.contains(LRange); - BinOp->setHasNoSignedWrap(NewNSW); + Opcode, RRange, OBO::NoSignedWrap); + NewNSW = NSWRange.contains(LRange); Changed |= NewNSW; } + setDeducedOverflowingFlags(BinOp, Opcode, NewNSW, NewNUW); + return Changed; } +static bool processAnd(BinaryOperator *BinOp, LazyValueInfo *LVI) { + if (BinOp->getType()->isVectorTy()) + return false; + + // Pattern match (and lhs, C) where C includes a superset of bits which might + // be set in lhs. This is a common truncation idiom created by instcombine. 
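A standalone C++ sketch of the check processAnd performs: when the right-hand side is a low-bit mask and the known unsigned maximum of the left-hand side already fits under it, the `and` is a no-op and can be removed (the helper name and the knownUMax parameter are illustrative stand-ins for the LVI query):

#include <cassert>
#include <cstdint>

// If `mask` has the form 2^k - 1 and x's unsigned maximum is provably at
// most `mask`, then `x & mask` is just `x`, so the and can be dropped.
bool andIsRedundant(uint64_t knownUMax, uint64_t mask) {
  bool isLowBitMask = mask != 0 && ((mask & (mask + 1)) == 0);
  return isLowBitMask && knownUMax <= mask;
}

int main() {
  // x known to be < 256, masked with 0xFF: the and changes nothing.
  assert(andIsRedundant(/*knownUMax=*/255, /*mask=*/0xFF));
  // x may be as large as 300: the and really does truncate.
  assert(!andIsRedundant(/*knownUMax=*/300, /*mask=*/0xFF));
  return 0;
}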
+ BasicBlock *BB = BinOp->getParent(); + Value *LHS = BinOp->getOperand(0); + ConstantInt *RHS = dyn_cast(BinOp->getOperand(1)); + if (!RHS || !RHS->getValue().isMask()) + return false; + + ConstantRange LRange = LVI->getConstantRange(LHS, BB, BinOp); + if (!LRange.getUnsignedMax().ule(RHS->getValue())) + return false; + + BinOp->replaceAllUsesWith(LHS); + BinOp->eraseFromParent(); + NumAnd++; + return true; +} + + static Constant *getConstantAt(Value *V, Instruction *At, LazyValueInfo *LVI) { if (Constant *C = LVI->getConstant(V, At->getParent(), At)) return C; @@ -740,10 +863,18 @@ static bool runImpl(Function &F, LazyValueInfo *LVI, DominatorTree *DT, case Instruction::AShr: BBChanged |= processAShr(cast(II), LVI); break; + case Instruction::SExt: + BBChanged |= processSExt(cast(II), LVI); + break; case Instruction::Add: case Instruction::Sub: + case Instruction::Mul: + case Instruction::Shl: BBChanged |= processBinOp(cast(II), LVI); break; + case Instruction::And: + BBChanged |= processAnd(cast(II), LVI); + break; } } @@ -796,5 +927,6 @@ CorrelatedValuePropagationPass::run(Function &F, FunctionAnalysisManager &AM) { PreservedAnalyses PA; PA.preserve(); PA.preserve(); + PA.preserve(); return PA; } diff --git a/lib/Transforms/Scalar/DCE.cpp b/lib/Transforms/Scalar/DCE.cpp index 479e0ed74074..a79d775aa7f3 100644 --- a/lib/Transforms/Scalar/DCE.cpp +++ b/lib/Transforms/Scalar/DCE.cpp @@ -38,17 +38,19 @@ namespace { //===--------------------------------------------------------------------===// // DeadInstElimination pass implementation // - struct DeadInstElimination : public BasicBlockPass { - static char ID; // Pass identification, replacement for typeid - DeadInstElimination() : BasicBlockPass(ID) { - initializeDeadInstEliminationPass(*PassRegistry::getPassRegistry()); - } - bool runOnBasicBlock(BasicBlock &BB) override { - if (skipBasicBlock(BB)) - return false; - auto *TLIP = getAnalysisIfAvailable(); - TargetLibraryInfo *TLI = TLIP ? &TLIP->getTLI() : nullptr; - bool Changed = false; +struct DeadInstElimination : public FunctionPass { + static char ID; // Pass identification, replacement for typeid + DeadInstElimination() : FunctionPass(ID) { + initializeDeadInstEliminationPass(*PassRegistry::getPassRegistry()); + } + bool runOnFunction(Function &F) override { + if (skipFunction(F)) + return false; + auto *TLIP = getAnalysisIfAvailable(); + TargetLibraryInfo *TLI = TLIP ? &TLIP->getTLI(F) : nullptr; + + bool Changed = false; + for (auto &BB : F) { for (BasicBlock::iterator DI = BB.begin(); DI != BB.end(); ) { Instruction *Inst = &*DI++; if (isInstructionTriviallyDead(Inst, TLI)) { @@ -60,13 +62,14 @@ namespace { ++DIEEliminated; } } - return Changed; } + return Changed; + } void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); } - }; +}; } char DeadInstElimination::ID = 0; @@ -154,7 +157,7 @@ struct DCELegacyPass : public FunctionPass { return false; auto *TLIP = getAnalysisIfAvailable(); - TargetLibraryInfo *TLI = TLIP ? &TLIP->getTLI() : nullptr; + TargetLibraryInfo *TLI = TLIP ? 
&TLIP->getTLI(F) : nullptr; return eliminateDeadCode(F, TLI); } diff --git a/lib/Transforms/Scalar/DeadStoreElimination.cpp b/lib/Transforms/Scalar/DeadStoreElimination.cpp index a81645745b48..685de82810ed 100644 --- a/lib/Transforms/Scalar/DeadStoreElimination.cpp +++ b/lib/Transforms/Scalar/DeadStoreElimination.cpp @@ -1254,8 +1254,9 @@ static bool eliminateDeadStores(BasicBlock &BB, AliasAnalysis *AA, auto *SI = new StoreInst( ConstantInt::get(Earlier->getValueOperand()->getType(), Merged), - Earlier->getPointerOperand(), false, Earlier->getAlignment(), - Earlier->getOrdering(), Earlier->getSyncScopeID(), DepWrite); + Earlier->getPointerOperand(), false, + MaybeAlign(Earlier->getAlignment()), Earlier->getOrdering(), + Earlier->getSyncScopeID(), DepWrite); unsigned MDToKeep[] = {LLVMContext::MD_dbg, LLVMContext::MD_tbaa, LLVMContext::MD_alias_scope, @@ -1361,7 +1362,7 @@ public: MemoryDependenceResults *MD = &getAnalysis().getMemDep(); const TargetLibraryInfo *TLI = - &getAnalysis().getTLI(); + &getAnalysis().getTLI(F); return eliminateDeadStores(F, AA, MD, DT, TLI); } diff --git a/lib/Transforms/Scalar/DivRemPairs.cpp b/lib/Transforms/Scalar/DivRemPairs.cpp index 876681b4f9de..934853507478 100644 --- a/lib/Transforms/Scalar/DivRemPairs.cpp +++ b/lib/Transforms/Scalar/DivRemPairs.cpp @@ -1,4 +1,4 @@ -//===- DivRemPairs.cpp - Hoist/decompose division and remainder -*- C++ -*-===// +//===- DivRemPairs.cpp - Hoist/[dr]ecompose division and remainder --------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// // -// This pass hoists and/or decomposes integer division and remainder +// This pass hoists and/or decomposes/recomposes integer division and remainder // instructions to enable CFG improvements and better codegen. // //===----------------------------------------------------------------------===// @@ -19,37 +19,105 @@ #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" +#include "llvm/IR/PatternMatch.h" #include "llvm/Pass.h" #include "llvm/Support/DebugCounter.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/BypassSlowDivision.h" + using namespace llvm; +using namespace llvm::PatternMatch; #define DEBUG_TYPE "div-rem-pairs" STATISTIC(NumPairs, "Number of div/rem pairs"); +STATISTIC(NumRecomposed, "Number of instructions recomposed"); STATISTIC(NumHoisted, "Number of instructions hoisted"); STATISTIC(NumDecomposed, "Number of instructions decomposed"); DEBUG_COUNTER(DRPCounter, "div-rem-pairs-transform", "Controls transformations in div-rem-pairs pass"); -/// Find matching pairs of integer div/rem ops (they have the same numerator, -/// denominator, and signedness). If they exist in different basic blocks, bring -/// them together by hoisting or replace the common division operation that is -/// implicit in the remainder: -/// X % Y <--> X - ((X / Y) * Y). -/// -/// We can largely ignore the normal safety and cost constraints on speculation -/// of these ops when we find a matching pair. This is because we are already -/// guaranteed that any exceptions and most cost are already incurred by the -/// first member of the pair. 
-/// -/// Note: This transform could be an oddball enhancement to EarlyCSE, GVN, or -/// SimplifyCFG, but it's split off on its own because it's different enough -/// that it doesn't quite match the stated objectives of those passes. -static bool optimizeDivRem(Function &F, const TargetTransformInfo &TTI, - const DominatorTree &DT) { - bool Changed = false; +namespace { +struct ExpandedMatch { + DivRemMapKey Key; + Instruction *Value; +}; +} // namespace + +/// See if we can match: (which is the form we expand into) +/// X - ((X ?/ Y) * Y) +/// which is equivalent to: +/// X ?% Y +static llvm::Optional matchExpandedRem(Instruction &I) { + Value *Dividend, *XroundedDownToMultipleOfY; + if (!match(&I, m_Sub(m_Value(Dividend), m_Value(XroundedDownToMultipleOfY)))) + return llvm::None; + + Value *Divisor; + Instruction *Div; + // Look for ((X / Y) * Y) + if (!match( + XroundedDownToMultipleOfY, + m_c_Mul(m_CombineAnd(m_IDiv(m_Specific(Dividend), m_Value(Divisor)), + m_Instruction(Div)), + m_Deferred(Divisor)))) + return llvm::None; + + ExpandedMatch M; + M.Key.SignedOp = Div->getOpcode() == Instruction::SDiv; + M.Key.Dividend = Dividend; + M.Key.Divisor = Divisor; + M.Value = &I; + return M; +} + +/// A thin wrapper to store two values that we matched as div-rem pair. +/// We want this extra indirection to avoid dealing with RAUW'ing the map keys. +struct DivRemPairWorklistEntry { + /// The actual udiv/sdiv instruction. Source of truth. + AssertingVH DivInst; + + /// The instruction that we have matched as a remainder instruction. + /// Should only be used as Value, don't introspect it. + AssertingVH RemInst; + + DivRemPairWorklistEntry(Instruction *DivInst_, Instruction *RemInst_) + : DivInst(DivInst_), RemInst(RemInst_) { + assert((DivInst->getOpcode() == Instruction::UDiv || + DivInst->getOpcode() == Instruction::SDiv) && + "Not a division."); + assert(DivInst->getType() == RemInst->getType() && "Types should match."); + // We can't check anything else about remainder instruction, + // it's not strictly required to be a urem/srem. + } + /// The type for this pair, identical for both the div and rem. + Type *getType() const { return DivInst->getType(); } + + /// Is this pair signed or unsigned? + bool isSigned() const { return DivInst->getOpcode() == Instruction::SDiv; } + + /// In this pair, what are the divident and divisor? + Value *getDividend() const { return DivInst->getOperand(0); } + Value *getDivisor() const { return DivInst->getOperand(1); } + + bool isRemExpanded() const { + switch (RemInst->getOpcode()) { + case Instruction::SRem: + case Instruction::URem: + return false; // single 'rem' instruction - unexpanded form. + default: + return true; // anything else means we have remainder in expanded form. + } + } +}; +using DivRemWorklistTy = SmallVector; + +/// Find matching pairs of integer div/rem ops (they have the same numerator, +/// denominator, and signedness). Place those pairs into a worklist for further +/// processing. This indirection is needed because we have to use TrackingVH<> +/// because we will be doing RAUW, and if one of the rem instructions we change +/// happens to be an input to another div/rem in the maps, we'd have problems. +static DivRemWorklistTy getWorklist(Function &F) { // Insert all divide and remainder instructions into maps keyed by their // operands and opcode (signed or unsigned). 
DenseMap DivMap; @@ -66,9 +134,14 @@ static bool optimizeDivRem(Function &F, const TargetTransformInfo &TTI, RemMap[DivRemMapKey(true, I.getOperand(0), I.getOperand(1))] = &I; else if (I.getOpcode() == Instruction::URem) RemMap[DivRemMapKey(false, I.getOperand(0), I.getOperand(1))] = &I; + else if (auto Match = matchExpandedRem(I)) + RemMap[Match->Key] = Match->Value; } } + // We'll accumulate the matching pairs of div-rem instructions here. + DivRemWorklistTy Worklist; + // We can iterate over either map because we are only looking for matched // pairs. Choose remainders for efficiency because they are usually even more // rare than division. @@ -78,12 +151,77 @@ static bool optimizeDivRem(Function &F, const TargetTransformInfo &TTI, if (!DivInst) continue; - // We have a matching pair of div/rem instructions. If one dominates the - // other, hoist and/or replace one. + // We have a matching pair of div/rem instructions. NumPairs++; Instruction *RemInst = RemPair.second; - bool IsSigned = DivInst->getOpcode() == Instruction::SDiv; - bool HasDivRemOp = TTI.hasDivRemOp(DivInst->getType(), IsSigned); + + // Place it in the worklist. + Worklist.emplace_back(DivInst, RemInst); + } + + return Worklist; +} + +/// Find matching pairs of integer div/rem ops (they have the same numerator, +/// denominator, and signedness). If they exist in different basic blocks, bring +/// them together by hoisting or replace the common division operation that is +/// implicit in the remainder: +/// X % Y <--> X - ((X / Y) * Y). +/// +/// We can largely ignore the normal safety and cost constraints on speculation +/// of these ops when we find a matching pair. This is because we are already +/// guaranteed that any exceptions and most cost are already incurred by the +/// first member of the pair. +/// +/// Note: This transform could be an oddball enhancement to EarlyCSE, GVN, or +/// SimplifyCFG, but it's split off on its own because it's different enough +/// that it doesn't quite match the stated objectives of those passes. +static bool optimizeDivRem(Function &F, const TargetTransformInfo &TTI, + const DominatorTree &DT) { + bool Changed = false; + + // Get the matching pairs of div-rem instructions. We want this extra + // indirection to avoid dealing with having to RAUW the keys of the maps. + DivRemWorklistTy Worklist = getWorklist(F); + + // Process each entry in the worklist. + for (DivRemPairWorklistEntry &E : Worklist) { + if (!DebugCounter::shouldExecute(DRPCounter)) + continue; + + bool HasDivRemOp = TTI.hasDivRemOp(E.getType(), E.isSigned()); + + auto &DivInst = E.DivInst; + auto &RemInst = E.RemInst; + + const bool RemOriginallyWasInExpandedForm = E.isRemExpanded(); + (void)RemOriginallyWasInExpandedForm; // suppress unused variable warning + + if (HasDivRemOp && E.isRemExpanded()) { + // The target supports div+rem but the rem is expanded. + // We should recompose it first. + Value *X = E.getDividend(); + Value *Y = E.getDivisor(); + Instruction *RealRem = E.isSigned() ? BinaryOperator::CreateSRem(X, Y) + : BinaryOperator::CreateURem(X, Y); + // Note that we place it right next to the original expanded instruction, + // and letting further handling to move it if needed. + RealRem->setName(RemInst->getName() + ".recomposed"); + RealRem->insertAfter(RemInst); + Instruction *OrigRemInst = RemInst; + // Update AssertingVH<> with new instruction so it doesn't assert. + RemInst = RealRem; + // And replace the original instruction with the new one. 
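The recomposition relies on the identity X - ((X / Y) * Y) == X % Y; a quick standalone C++ check of that identity for both signednesses (assuming a non-zero divisor and, for the signed case, excluding the INT_MIN / -1 corner):

#include <cassert>
#include <cstdint>

// X - ((X / Y) * Y) is exactly X % Y, which is what lets the pass recompose
// the expanded form back into a single rem when the target has a combined
// div/rem operation, or expand a rem when it does not.
int32_t expandedSRem(int32_t x, int32_t y) { return x - (x / y) * y; }
uint32_t expandedURem(uint32_t x, uint32_t y) { return x - (x / y) * y; }

int main() {
  assert(expandedSRem(7, 3) == 7 % 3);
  assert(expandedSRem(-7, 3) == -7 % 3); // rounds toward zero, like srem
  assert(expandedURem(7u, 3u) == 7u % 3u);
  return 0;
}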
+ OrigRemInst->replaceAllUsesWith(RealRem); + OrigRemInst->eraseFromParent(); + NumRecomposed++; + // Note that we have left ((X / Y) * Y) around. + // If it had other uses we could rewrite it as X - X % Y + } + + assert((!E.isRemExpanded() || !HasDivRemOp) && + "*If* the target supports div-rem, then by now the RemInst *is* " + "Instruction::[US]Rem."); // If the target supports div+rem and the instructions are in the same block // already, there's nothing to do. The backend should handle this. If the @@ -92,10 +230,16 @@ static bool optimizeDivRem(Function &F, const TargetTransformInfo &TTI, continue; bool DivDominates = DT.dominates(DivInst, RemInst); - if (!DivDominates && !DT.dominates(RemInst, DivInst)) + if (!DivDominates && !DT.dominates(RemInst, DivInst)) { + // We have matching div-rem pair, but they are in two different blocks, + // neither of which dominates one another. + // FIXME: We could hoist both ops to the common predecessor block? continue; + } - if (!DebugCounter::shouldExecute(DRPCounter)) + // The target does not have a single div/rem operation, + // and the rem is already in expanded form. Nothing to do. + if (!HasDivRemOp && E.isRemExpanded()) continue; if (HasDivRemOp) { @@ -107,11 +251,17 @@ static bool optimizeDivRem(Function &F, const TargetTransformInfo &TTI, DivInst->moveAfter(RemInst); NumHoisted++; } else { - // The target does not have a single div/rem operation. Decompose the - // remainder calculation as: + // The target does not have a single div/rem operation, + // and the rem is *not* in a already-expanded form. + // Decompose the remainder calculation as: // X % Y --> X - ((X / Y) * Y). - Value *X = RemInst->getOperand(0); - Value *Y = RemInst->getOperand(1); + + assert(!RemOriginallyWasInExpandedForm && + "We should not be expanding if the rem was in expanded form to " + "begin with."); + + Value *X = E.getDividend(); + Value *Y = E.getDivisor(); Instruction *Mul = BinaryOperator::CreateMul(DivInst, Y); Instruction *Sub = BinaryOperator::CreateSub(X, Mul); @@ -152,8 +302,13 @@ static bool optimizeDivRem(Function &F, const TargetTransformInfo &TTI, // Now kill the explicit remainder. We have replaced it with: // (sub X, (mul (div X, Y), Y) - RemInst->replaceAllUsesWith(Sub); - RemInst->eraseFromParent(); + Sub->setName(RemInst->getName() + ".decomposed"); + Instruction *OrigRemInst = RemInst; + // Update AssertingVH<> with new instruction so it doesn't assert. + RemInst = Sub; + // And replace the original instruction with the new one. + OrigRemInst->replaceAllUsesWith(Sub); + OrigRemInst->eraseFromParent(); NumDecomposed++; } Changed = true; @@ -188,7 +343,7 @@ struct DivRemPairsLegacyPass : public FunctionPass { return optimizeDivRem(F, TTI, DT); } }; -} +} // namespace char DivRemPairsLegacyPass::ID = 0; INITIALIZE_PASS_BEGIN(DivRemPairsLegacyPass, "div-rem-pairs", diff --git a/lib/Transforms/Scalar/EarlyCSE.cpp b/lib/Transforms/Scalar/EarlyCSE.cpp index f1f075257020..ce540683dae2 100644 --- a/lib/Transforms/Scalar/EarlyCSE.cpp +++ b/lib/Transforms/Scalar/EarlyCSE.cpp @@ -108,11 +108,12 @@ struct SimpleValue { // This can only handle non-void readnone functions. 
if (CallInst *CI = dyn_cast(Inst)) return CI->doesNotAccessMemory() && !CI->getType()->isVoidTy(); - return isa(Inst) || isa(Inst) || - isa(Inst) || isa(Inst) || - isa(Inst) || isa(Inst) || - isa(Inst) || isa(Inst) || - isa(Inst) || isa(Inst); + return isa(Inst) || isa(Inst) || + isa(Inst) || isa(Inst) || + isa(Inst) || isa(Inst) || + isa(Inst) || isa(Inst) || + isa(Inst) || isa(Inst) || + isa(Inst); } }; @@ -240,7 +241,7 @@ static unsigned getHashValueImpl(SimpleValue Val) { assert((isa(Inst) || isa(Inst) || isa(Inst) || isa(Inst) || - isa(Inst)) && + isa(Inst) || isa(Inst)) && "Invalid/unknown instruction"); // Mix in the opcode. @@ -526,7 +527,7 @@ public: const TargetTransformInfo &TTI, DominatorTree &DT, AssumptionCache &AC, MemorySSA *MSSA) : TLI(TLI), TTI(TTI), DT(DT), AC(AC), SQ(DL, &TLI, &DT, &AC), MSSA(MSSA), - MSSAUpdater(llvm::make_unique(MSSA)) {} + MSSAUpdater(std::make_unique(MSSA)) {} bool run(); @@ -651,7 +652,7 @@ private: bool isInvariantLoad() const { if (auto *LI = dyn_cast(Inst)) - return LI->getMetadata(LLVMContext::MD_invariant_load) != nullptr; + return LI->hasMetadata(LLVMContext::MD_invariant_load); return false; } @@ -790,7 +791,7 @@ bool EarlyCSE::isOperatingOnInvariantMemAt(Instruction *I, unsigned GenAt) { // A location loaded from with an invariant_load is assumed to *never* change // within the visible scope of the compilation. if (auto *LI = dyn_cast(I)) - if (LI->getMetadata(LLVMContext::MD_invariant_load)) + if (LI->hasMetadata(LLVMContext::MD_invariant_load)) return true; auto MemLocOpt = MemoryLocation::getOrNone(I); @@ -1359,7 +1360,7 @@ public: if (skipFunction(F)) return false; - auto &TLI = getAnalysis().getTLI(); + auto &TLI = getAnalysis().getTLI(F); auto &TTI = getAnalysis().getTTI(F); auto &DT = getAnalysis().getDomTree(); auto &AC = getAnalysis().getAssumptionCache(F); @@ -1381,6 +1382,7 @@ public: AU.addPreserved(); } AU.addPreserved(); + AU.addPreserved(); AU.setPreservesCFG(); } }; diff --git a/lib/Transforms/Scalar/FlattenCFGPass.cpp b/lib/Transforms/Scalar/FlattenCFGPass.cpp index 31670b1464e4..e6abf1ceb026 100644 --- a/lib/Transforms/Scalar/FlattenCFGPass.cpp +++ b/lib/Transforms/Scalar/FlattenCFGPass.cpp @@ -11,10 +11,12 @@ //===----------------------------------------------------------------------===// #include "llvm/Analysis/AliasAnalysis.h" -#include "llvm/Transforms/Utils/Local.h" #include "llvm/IR/CFG.h" +#include "llvm/IR/ValueHandle.h" #include "llvm/Pass.h" #include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/Local.h" + using namespace llvm; #define DEBUG_TYPE "flattencfg" @@ -52,15 +54,23 @@ FunctionPass *llvm::createFlattenCFGPass() { return new FlattenCFGPass(); } static bool iterativelyFlattenCFG(Function &F, AliasAnalysis *AA) { bool Changed = false; bool LocalChange = true; + + // Use block handles instead of iterating over function blocks directly + // to avoid using iterators invalidated by erasing blocks. + std::vector Blocks; + Blocks.reserve(F.size()); + for (auto &BB : F) + Blocks.push_back(&BB); + while (LocalChange) { LocalChange = false; - // Loop over all of the basic blocks and remove them if they are unneeded... - // - for (Function::iterator BBIt = F.begin(); BBIt != F.end();) { - if (FlattenCFG(&*BBIt++, AA)) { - LocalChange = true; - } + // Loop over all of the basic blocks and try to flatten them. + for (WeakVH &BlockHandle : Blocks) { + // Skip blocks erased by FlattenCFG. 
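The snapshot-and-skip pattern used above can be shown in isolation; a standalone C++ analogy in which a raw-pointer vector plus explicit nulling stands in for WeakVH (which nulls itself when the underlying block is deleted):

#include <algorithm>
#include <list>
#include <vector>

struct Block { int id; };

int main() {
  std::list<Block> func = {{0}, {1}, {2}, {3}};

  // Snapshot the blocks up front and iterate the snapshot, so erasing
  // blocks mid-walk cannot invalidate the iteration itself.
  std::vector<Block *> handles;
  for (Block &b : func)
    handles.push_back(&b);

  // Stand-in for WeakVH: when a block is erased, its handle is nulled.
  auto eraseBlock = [&](Block *b) {
    *std::find(handles.begin(), handles.end(), b) = nullptr;
    func.remove_if([b](const Block &x) { return &x == b; });
  };

  for (Block *h : handles) {
    if (!h)
      continue; // skip blocks a previous step already erased
    if (h->id == 1) {
      // A step that erases a *different* block than the current one.
      Block *victim = nullptr;
      for (Block &b : func)
        if (b.id == 2)
          victim = &b;
      if (victim)
        eraseBlock(victim);
    }
  }
  return 0;
}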
+ if (auto *BB = cast_or_null(BlockHandle)) + if (FlattenCFG(BB, AA)) + LocalChange = true; } Changed |= LocalChange; } diff --git a/lib/Transforms/Scalar/Float2Int.cpp b/lib/Transforms/Scalar/Float2Int.cpp index 4f83e869b303..4d2eac0451df 100644 --- a/lib/Transforms/Scalar/Float2Int.cpp +++ b/lib/Transforms/Scalar/Float2Int.cpp @@ -60,11 +60,13 @@ namespace { if (skipFunction(F)) return false; - return Impl.runImpl(F); + const DominatorTree &DT = getAnalysis().getDomTree(); + return Impl.runImpl(F, DT); } void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); + AU.addRequired(); AU.addPreserved(); } @@ -116,21 +118,29 @@ static Instruction::BinaryOps mapBinOpcode(unsigned Opcode) { // Find the roots - instructions that convert from the FP domain to // integer domain. -void Float2IntPass::findRoots(Function &F, SmallPtrSet &Roots) { - for (auto &I : instructions(F)) { - if (isa(I.getType())) +void Float2IntPass::findRoots(Function &F, const DominatorTree &DT, + SmallPtrSet &Roots) { + for (BasicBlock &BB : F) { + // Unreachable code can take on strange forms that we are not prepared to + // handle. For example, an instruction may have itself as an operand. + if (!DT.isReachableFromEntry(&BB)) continue; - switch (I.getOpcode()) { - default: break; - case Instruction::FPToUI: - case Instruction::FPToSI: - Roots.insert(&I); - break; - case Instruction::FCmp: - if (mapFCmpPred(cast(&I)->getPredicate()) != - CmpInst::BAD_ICMP_PREDICATE) + + for (Instruction &I : BB) { + if (isa(I.getType())) + continue; + switch (I.getOpcode()) { + default: break; + case Instruction::FPToUI: + case Instruction::FPToSI: Roots.insert(&I); - break; + break; + case Instruction::FCmp: + if (mapFCmpPred(cast(&I)->getPredicate()) != + CmpInst::BAD_ICMP_PREDICATE) + Roots.insert(&I); + break; + } } } } @@ -503,7 +513,7 @@ void Float2IntPass::cleanup() { I.first->eraseFromParent(); } -bool Float2IntPass::runImpl(Function &F) { +bool Float2IntPass::runImpl(Function &F, const DominatorTree &DT) { LLVM_DEBUG(dbgs() << "F2I: Looking at function " << F.getName() << "\n"); // Clear out all state. 
ECs = EquivalenceClasses(); @@ -513,7 +523,7 @@ bool Float2IntPass::runImpl(Function &F) { Ctx = &F.getParent()->getContext(); - findRoots(F, Roots); + findRoots(F, DT, Roots); walkBackwards(Roots); walkForwards(); @@ -527,8 +537,9 @@ bool Float2IntPass::runImpl(Function &F) { namespace llvm { FunctionPass *createFloat2IntPass() { return new Float2IntLegacyPass(); } -PreservedAnalyses Float2IntPass::run(Function &F, FunctionAnalysisManager &) { - if (!runImpl(F)) +PreservedAnalyses Float2IntPass::run(Function &F, FunctionAnalysisManager &AM) { + const DominatorTree &DT = AM.getResult(F); + if (!runImpl(F, DT)) return PreservedAnalyses::all(); PreservedAnalyses PA; diff --git a/lib/Transforms/Scalar/GVN.cpp b/lib/Transforms/Scalar/GVN.cpp index 1a02e9d33f49..743353eaea22 100644 --- a/lib/Transforms/Scalar/GVN.cpp +++ b/lib/Transforms/Scalar/GVN.cpp @@ -70,6 +70,7 @@ #include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Utils.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/SSAUpdater.h" @@ -626,6 +627,8 @@ PreservedAnalyses GVN::run(Function &F, FunctionAnalysisManager &AM) { PA.preserve(); PA.preserve(); PA.preserve(); + if (LI) + PA.preserve(); return PA; } @@ -1161,15 +1164,30 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock, // Do PHI translation to get its value in the predecessor if necessary. The // returned pointer (if non-null) is guaranteed to dominate UnavailablePred. + // We do the translation for each edge we skipped by going from LI's block + // to LoadBB, otherwise we might miss pieces needing translation. // If all preds have a single successor, then we know it is safe to insert // the load on the pred (?!?), so we can insert code to materialize the // pointer if it is not available. - PHITransAddr Address(LI->getPointerOperand(), DL, AC); - Value *LoadPtr = nullptr; - LoadPtr = Address.PHITranslateWithInsertion(LoadBB, UnavailablePred, - *DT, NewInsts); + Value *LoadPtr = LI->getPointerOperand(); + BasicBlock *Cur = LI->getParent(); + while (Cur != LoadBB) { + PHITransAddr Address(LoadPtr, DL, AC); + LoadPtr = Address.PHITranslateWithInsertion( + Cur, Cur->getSinglePredecessor(), *DT, NewInsts); + if (!LoadPtr) { + CanDoPRE = false; + break; + } + Cur = Cur->getSinglePredecessor(); + } + if (LoadPtr) { + PHITransAddr Address(LoadPtr, DL, AC); + LoadPtr = Address.PHITranslateWithInsertion(LoadBB, UnavailablePred, *DT, + NewInsts); + } // If we couldn't find or insert a computation of this phi translated value, // we fail PRE. if (!LoadPtr) { @@ -1184,8 +1202,12 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock, if (!CanDoPRE) { while (!NewInsts.empty()) { - Instruction *I = NewInsts.pop_back_val(); - markInstructionForDeletion(I); + // Erase instructions generated by the failed PHI translation before + // trying to number them. PHI translation might insert instructions + // in basic blocks other than the current one, and we delete them + // directly, as markInstructionForDeletion only allows removing from the + // current basic block. + NewInsts.pop_back_val()->eraseFromParent(); } // HINT: Don't revert the edge-splitting as following transformation may // also need to split these critical edges. 
@@ -1219,10 +1241,10 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock, BasicBlock *UnavailablePred = PredLoad.first; Value *LoadPtr = PredLoad.second; - auto *NewLoad = - new LoadInst(LI->getType(), LoadPtr, LI->getName() + ".pre", - LI->isVolatile(), LI->getAlignment(), LI->getOrdering(), - LI->getSyncScopeID(), UnavailablePred->getTerminator()); + auto *NewLoad = new LoadInst( + LI->getType(), LoadPtr, LI->getName() + ".pre", LI->isVolatile(), + MaybeAlign(LI->getAlignment()), LI->getOrdering(), LI->getSyncScopeID(), + UnavailablePred->getTerminator()); NewLoad->setDebugLoc(LI->getDebugLoc()); // Transfer the old load's AA tags to the new load. @@ -1365,6 +1387,14 @@ bool GVN::processNonLocalLoad(LoadInst *LI) { return PerformLoadPRE(LI, ValuesPerBlock, UnavailableBlocks); } +static bool hasUsersIn(Value *V, BasicBlock *BB) { + for (User *U : V->users()) + if (isa(U) && + cast(U)->getParent() == BB) + return true; + return false; +} + bool GVN::processAssumeIntrinsic(IntrinsicInst *IntrinsicI) { assert(IntrinsicI->getIntrinsicID() == Intrinsic::assume && "This function can only be called with llvm.assume intrinsic"); @@ -1403,12 +1433,23 @@ bool GVN::processAssumeIntrinsic(IntrinsicInst *IntrinsicI) { // We can replace assume value with true, which covers cases like this: // call void @llvm.assume(i1 %cmp) // br i1 %cmp, label %bb1, label %bb2 ; will change %cmp to true - ReplaceWithConstMap[V] = True; - - // If one of *cmp *eq operand is const, adding it to map will cover this: + ReplaceOperandsWithMap[V] = True; + + // If we find an equality fact, canonicalize all dominated uses in this block + // to one of the two values. We heuristically choice the "oldest" of the + // two where age is determined by value number. (Note that propagateEquality + // above handles the cross block case.) + // + // Key case to cover are: + // 1) // %cmp = fcmp oeq float 3.000000e+00, %0 ; const on lhs could happen // call void @llvm.assume(i1 %cmp) // ret float %0 ; will change it to ret float 3.000000e+00 + // 2) + // %load = load float, float* %addr + // %cmp = fcmp oeq float %load, %0 + // call void @llvm.assume(i1 %cmp) + // ret float %load ; will change it to ret float %0 if (auto *CmpI = dyn_cast(V)) { if (CmpI->getPredicate() == CmpInst::Predicate::ICMP_EQ || CmpI->getPredicate() == CmpInst::Predicate::FCMP_OEQ || @@ -1416,13 +1457,50 @@ bool GVN::processAssumeIntrinsic(IntrinsicInst *IntrinsicI) { CmpI->getFastMathFlags().noNaNs())) { Value *CmpLHS = CmpI->getOperand(0); Value *CmpRHS = CmpI->getOperand(1); - if (isa(CmpLHS)) + // Heuristically pick the better replacement -- the choice of heuristic + // isn't terribly important here, but the fact we canonicalize on some + // replacement is for exposing other simplifications. + // TODO: pull this out as a helper function and reuse w/existing + // (slightly different) logic. + if (isa(CmpLHS) && !isa(CmpRHS)) std::swap(CmpLHS, CmpRHS); - auto *RHSConst = dyn_cast(CmpRHS); + if (!isa(CmpLHS) && isa(CmpRHS)) + std::swap(CmpLHS, CmpRHS); + if ((isa(CmpLHS) && isa(CmpRHS)) || + (isa(CmpLHS) && isa(CmpRHS))) { + // Move the 'oldest' value to the right-hand side, using the value + // number as a proxy for age. + uint32_t LVN = VN.lookupOrAdd(CmpLHS); + uint32_t RVN = VN.lookupOrAdd(CmpRHS); + if (LVN < RVN) + std::swap(CmpLHS, CmpRHS); + } - // If only one operand is constant. 
- if (RHSConst != nullptr && !isa(CmpLHS)) - ReplaceWithConstMap[CmpLHS] = RHSConst; + // Handle degenerate case where we either haven't pruned a dead path or a + // removed a trivial assume yet. + if (isa(CmpLHS) && isa(CmpRHS)) + return Changed; + + // +0.0 and -0.0 compare equal, but do not imply equivalence. Unless we + // can prove equivalence, bail. + if (CmpRHS->getType()->isFloatTy() && + (!isa(CmpRHS) || cast(CmpRHS)->isZero())) + return Changed; + + LLVM_DEBUG(dbgs() << "Replacing dominated uses of " + << *CmpLHS << " with " + << *CmpRHS << " in block " + << IntrinsicI->getParent()->getName() << "\n"); + + + // Setup the replacement map - this handles uses within the same block + if (hasUsersIn(CmpLHS, IntrinsicI->getParent())) + ReplaceOperandsWithMap[CmpLHS] = CmpRHS; + + // NOTE: The non-block local cases are handled by the call to + // propagateEquality above; this block is just about handling the block + // local cases. TODO: There's a bunch of logic in propagateEqualiy which + // isn't duplicated for the block local case, can we share it somehow? } } return Changed; @@ -1522,6 +1600,41 @@ uint32_t GVN::ValueTable::phiTranslate(const BasicBlock *Pred, return NewNum; } +// Return true if the value number \p Num and NewNum have equal value. +// Return false if the result is unknown. +bool GVN::ValueTable::areCallValsEqual(uint32_t Num, uint32_t NewNum, + const BasicBlock *Pred, + const BasicBlock *PhiBlock, GVN &Gvn) { + CallInst *Call = nullptr; + LeaderTableEntry *Vals = &Gvn.LeaderTable[Num]; + while (Vals) { + Call = dyn_cast(Vals->Val); + if (Call && Call->getParent() == PhiBlock) + break; + Vals = Vals->Next; + } + + if (AA->doesNotAccessMemory(Call)) + return true; + + if (!MD || !AA->onlyReadsMemory(Call)) + return false; + + MemDepResult local_dep = MD->getDependency(Call); + if (!local_dep.isNonLocal()) + return false; + + const MemoryDependenceResults::NonLocalDepInfo &deps = + MD->getNonLocalCallDependency(Call); + + // Check to see if the Call has no function local clobber. + for (unsigned i = 0; i < deps.size(); i++) { + if (deps[i].getResult().isNonFuncLocal()) + return true; + } + return false; +} + /// Translate value number \p Num using phis, so that it has the values of /// the phis in BB. uint32_t GVN::ValueTable::phiTranslateImpl(const BasicBlock *Pred, @@ -1568,8 +1681,11 @@ uint32_t GVN::ValueTable::phiTranslateImpl(const BasicBlock *Pred, } } - if (uint32_t NewNum = expressionNumbering[Exp]) + if (uint32_t NewNum = expressionNumbering[Exp]) { + if (Exp.opcode == Instruction::Call && NewNum != Num) + return areCallValsEqual(Num, NewNum, Pred, PhiBlock, Gvn) ? NewNum : Num; return NewNum; + } return Num; } @@ -1637,16 +1753,12 @@ void GVN::assignBlockRPONumber(Function &F) { InvalidBlockRPONumbers = false; } -// Tries to replace instruction with const, using information from -// ReplaceWithConstMap. 
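A standalone C++ sketch of the canonicalization heuristic described above: assign value numbers in first-seen order, treat the smaller number as "older", and rewrite in-block uses of the younger value to the older one (the string-keyed table is an illustrative stand-in for GVN's ValueTable):

#include <cassert>
#include <map>
#include <string>
#include <utility>
#include <vector>

// Assign value numbers in first-seen order; a smaller number means "older".
struct ValueNumbering {
  std::map<std::string, unsigned> vn;
  unsigned lookupOrAdd(const std::string &v) {
    auto it = vn.find(v);
    if (it != vn.end())
      return it->second;
    unsigned n = static_cast<unsigned>(vn.size());
    vn[v] = n;
    return n;
  }
};

int main() {
  ValueNumbering VN;
  VN.lookupOrAdd("%load"); // value number 0 ("older")
  VN.lookupOrAdd("%x");    // value number 1 ("younger")

  // An assume established %x == %load inside this block. Move the older
  // value to the right-hand side and rewrite in-block uses of the younger
  // one, mirroring ReplaceOperandsWithMap[CmpLHS] = CmpRHS above.
  std::string lhs = "%x", rhs = "%load";
  if (VN.lookupOrAdd(lhs) < VN.lookupOrAdd(rhs))
    std::swap(lhs, rhs);

  std::vector<std::string> blockUses = {"%x", "%y", "%x"};
  for (auto &use : blockUses)
    if (use == lhs)
      use = rhs;
  assert(blockUses[0] == "%load" && blockUses[1] == "%y" &&
         blockUses[2] == "%load");
  return 0;
}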
-bool GVN::replaceOperandsWithConsts(Instruction *Instr) const { +bool GVN::replaceOperandsForInBlockEquality(Instruction *Instr) const { bool Changed = false; for (unsigned OpNum = 0; OpNum < Instr->getNumOperands(); ++OpNum) { - Value *Operand = Instr->getOperand(OpNum); - auto it = ReplaceWithConstMap.find(Operand); - if (it != ReplaceWithConstMap.end()) { - assert(!isa(Operand) && - "Replacing constants with constants is invalid"); + Value *Operand = Instr->getOperand(OpNum); + auto it = ReplaceOperandsWithMap.find(Operand); + if (it != ReplaceOperandsWithMap.end()) { LLVM_DEBUG(dbgs() << "GVN replacing: " << *Operand << " with " << *it->second << " in instruction " << *Instr << '\n'); Instr->setOperand(OpNum, it->second); @@ -1976,6 +2088,7 @@ bool GVN::runImpl(Function &F, AssumptionCache &RunAC, DominatorTree &RunDT, MD = RunMD; ImplicitControlFlowTracking ImplicitCFT(DT); ICF = &ImplicitCFT; + this->LI = LI; VN.setMemDep(MD); ORE = RunORE; InvalidBlockRPONumbers = true; @@ -2037,13 +2150,13 @@ bool GVN::processBlock(BasicBlock *BB) { return false; // Clearing map before every BB because it can be used only for single BB. - ReplaceWithConstMap.clear(); + ReplaceOperandsWithMap.clear(); bool ChangedFunction = false; for (BasicBlock::iterator BI = BB->begin(), BE = BB->end(); BI != BE;) { - if (!ReplaceWithConstMap.empty()) - ChangedFunction |= replaceOperandsWithConsts(&*BI); + if (!ReplaceOperandsWithMap.empty()) + ChangedFunction |= replaceOperandsForInBlockEquality(&*BI); ChangedFunction |= processInstruction(&*BI); if (InstrsToErase.empty()) { @@ -2335,7 +2448,7 @@ bool GVN::performPRE(Function &F) { /// the block inserted to the critical edge. BasicBlock *GVN::splitCriticalEdges(BasicBlock *Pred, BasicBlock *Succ) { BasicBlock *BB = - SplitCriticalEdge(Pred, Succ, CriticalEdgeSplittingOptions(DT)); + SplitCriticalEdge(Pred, Succ, CriticalEdgeSplittingOptions(DT, LI)); if (MD) MD->invalidateCachedPredecessors(); InvalidBlockRPONumbers = true; @@ -2350,7 +2463,7 @@ bool GVN::splitCriticalEdges() { do { std::pair Edge = toSplit.pop_back_val(); SplitCriticalEdge(Edge.first, Edge.second, - CriticalEdgeSplittingOptions(DT)); + CriticalEdgeSplittingOptions(DT, LI)); } while (!toSplit.empty()); if (MD) MD->invalidateCachedPredecessors(); InvalidBlockRPONumbers = true; @@ -2456,18 +2569,26 @@ void GVN::addDeadBlock(BasicBlock *BB) { if (DeadBlocks.count(B)) continue; + // First, split the critical edges. This might also create additional blocks + // to preserve LoopSimplify form and adjust edges accordingly. SmallVector Preds(pred_begin(B), pred_end(B)); for (BasicBlock *P : Preds) { if (!DeadBlocks.count(P)) continue; - if (isCriticalEdge(P->getTerminator(), GetSuccessorNumber(P, B))) { + if (llvm::any_of(successors(P), + [B](BasicBlock *Succ) { return Succ == B; }) && + isCriticalEdge(P->getTerminator(), B)) { if (BasicBlock *S = splitCriticalEdges(P, B)) DeadBlocks.insert(P = S); } + } - for (BasicBlock::iterator II = B->begin(); isa(II); ++II) { - PHINode &Phi = cast(*II); + // Now undef the incoming values from the dead predecessors. + for (BasicBlock *P : predecessors(B)) { + if (!DeadBlocks.count(P)) + continue; + for (PHINode &Phi : B->phis()) { Phi.setIncomingValueForBlock(P, UndefValue::get(Phi.getType())); if (MD) MD->invalidateCachedPointerInfo(&Phi); @@ -2544,10 +2665,11 @@ public: return Impl.runImpl( F, getAnalysis().getAssumptionCache(F), getAnalysis().getDomTree(), - getAnalysis().getTLI(), + getAnalysis().getTLI(F), getAnalysis().getAAResults(), - NoMemDepAnalysis ? 
nullptr - : &getAnalysis().getMemDep(), + NoMemDepAnalysis + ? nullptr + : &getAnalysis().getMemDep(), LIWP ? &LIWP->getLoopInfo() : nullptr, &getAnalysis().getORE()); } @@ -2556,6 +2678,7 @@ public: AU.addRequired(); AU.addRequired(); AU.addRequired(); + AU.addRequired(); if (!NoMemDepAnalysis) AU.addRequired(); AU.addRequired(); @@ -2563,6 +2686,8 @@ public: AU.addPreserved(); AU.addPreserved(); AU.addPreserved(); + AU.addPreserved(); + AU.addPreservedID(LoopSimplifyID); AU.addRequired(); } diff --git a/lib/Transforms/Scalar/GVNHoist.cpp b/lib/Transforms/Scalar/GVNHoist.cpp index 7614599653c4..c87e41484b13 100644 --- a/lib/Transforms/Scalar/GVNHoist.cpp +++ b/lib/Transforms/Scalar/GVNHoist.cpp @@ -257,7 +257,7 @@ public: GVNHoist(DominatorTree *DT, PostDominatorTree *PDT, AliasAnalysis *AA, MemoryDependenceResults *MD, MemorySSA *MSSA) : DT(DT), PDT(PDT), AA(AA), MD(MD), MSSA(MSSA), - MSSAUpdater(llvm::make_unique(MSSA)) {} + MSSAUpdater(std::make_unique(MSSA)) {} bool run(Function &F) { NumFuncArgs = F.arg_size(); @@ -539,7 +539,7 @@ private: // Check for unsafe hoistings due to side effects. if (K == InsKind::Store) { - if (hasEHOrLoadsOnPath(NewPt, dyn_cast(U), NBBsOnAllPaths)) + if (hasEHOrLoadsOnPath(NewPt, cast(U), NBBsOnAllPaths)) return false; } else if (hasEHOnPath(NewBB, OldBB, NBBsOnAllPaths)) return false; @@ -889,19 +889,18 @@ private: void updateAlignment(Instruction *I, Instruction *Repl) { if (auto *ReplacementLoad = dyn_cast(Repl)) { - ReplacementLoad->setAlignment( - std::min(ReplacementLoad->getAlignment(), - cast(I)->getAlignment())); + ReplacementLoad->setAlignment(MaybeAlign(std::min( + ReplacementLoad->getAlignment(), cast(I)->getAlignment()))); ++NumLoadsRemoved; } else if (auto *ReplacementStore = dyn_cast(Repl)) { ReplacementStore->setAlignment( - std::min(ReplacementStore->getAlignment(), - cast(I)->getAlignment())); + MaybeAlign(std::min(ReplacementStore->getAlignment(), + cast(I)->getAlignment()))); ++NumStoresRemoved; } else if (auto *ReplacementAlloca = dyn_cast(Repl)) { ReplacementAlloca->setAlignment( - std::max(ReplacementAlloca->getAlignment(), - cast(I)->getAlignment())); + MaybeAlign(std::max(ReplacementAlloca->getAlignment(), + cast(I)->getAlignment()))); } else if (isa(Repl)) { ++NumCallsRemoved; } diff --git a/lib/Transforms/Scalar/GuardWidening.cpp b/lib/Transforms/Scalar/GuardWidening.cpp index e14f44bb7069..2697d7809568 100644 --- a/lib/Transforms/Scalar/GuardWidening.cpp +++ b/lib/Transforms/Scalar/GuardWidening.cpp @@ -591,7 +591,7 @@ bool GuardWideningImpl::widenCondCommon(Value *Cond0, Value *Cond1, else Result = RC.getCheckInst(); } - + assert(Result && "Failed to find result value"); Result->setName("wide.chk"); } return true; diff --git a/lib/Transforms/Scalar/IndVarSimplify.cpp b/lib/Transforms/Scalar/IndVarSimplify.cpp index f9fc698a4a9b..5519a00c12c9 100644 --- a/lib/Transforms/Scalar/IndVarSimplify.cpp +++ b/lib/Transforms/Scalar/IndVarSimplify.cpp @@ -124,6 +124,11 @@ static cl::opt DisableLFTR("disable-lftr", cl::Hidden, cl::init(false), cl::desc("Disable Linear Function Test Replace optimization")); +static cl::opt +LoopPredication("indvars-predicate-loops", cl::Hidden, cl::init(false), + cl::desc("Predicate conditions in read only loops")); + + namespace { struct RewritePhi; @@ -144,7 +149,11 @@ class IndVarSimplify { bool rewriteNonIntegerIVs(Loop *L); bool simplifyAndExtend(Loop *L, SCEVExpander &Rewriter, LoopInfo *LI); - bool optimizeLoopExits(Loop *L); + /// Try to eliminate loop exits based on analyzeable exit counts + bool 
optimizeLoopExits(Loop *L, SCEVExpander &Rewriter); + /// Try to form loop invariant tests for loop exits by changing how many + /// iterations of the loop run when that is unobservable. + bool predicateLoopExits(Loop *L, SCEVExpander &Rewriter); bool canLoopBeDeleted(Loop *L, SmallVector &RewritePhiSet); bool rewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter); @@ -628,12 +637,30 @@ bool IndVarSimplify::rewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter) { // Okay, this instruction has a user outside of the current loop // and varies predictably *inside* the loop. Evaluate the value it - // contains when the loop exits, if possible. + // contains when the loop exits, if possible. We prefer to start with + // expressions which are true for all exits (so as to maximize + // expression reuse by the SCEVExpander), but resort to per-exit + // evaluation if that fails. const SCEV *ExitValue = SE->getSCEVAtScope(Inst, L->getParentLoop()); - if (!SE->isLoopInvariant(ExitValue, L) || - !isSafeToExpand(ExitValue, *SE)) - continue; - + if (isa(ExitValue) || + !SE->isLoopInvariant(ExitValue, L) || + !isSafeToExpand(ExitValue, *SE)) { + // TODO: This should probably be sunk into SCEV in some way; maybe a + // getSCEVForExit(SCEV*, L, ExitingBB)? It can be generalized for + // most SCEV expressions and other recurrence types (e.g. shift + // recurrences). Is there existing code we can reuse? + const SCEV *ExitCount = SE->getExitCount(L, PN->getIncomingBlock(i)); + if (isa(ExitCount)) + continue; + if (auto *AddRec = dyn_cast(SE->getSCEV(Inst))) + if (AddRec->getLoop() == L) + ExitValue = AddRec->evaluateAtIteration(ExitCount, *SE); + if (isa(ExitValue) || + !SE->isLoopInvariant(ExitValue, L) || + !isSafeToExpand(ExitValue, *SE)) + continue; + } + // Computing the value outside of the loop brings no benefit if it is // definitely used inside the loop in a way which can not be optimized // away. Avoid doing so unless we know we have a value which computes @@ -804,7 +831,7 @@ bool IndVarSimplify::canLoopBeDeleted( L->getExitingBlocks(ExitingBlocks); SmallVector ExitBlocks; L->getUniqueExitBlocks(ExitBlocks); - if (ExitBlocks.size() > 1 || ExitingBlocks.size() > 1) + if (ExitBlocks.size() != 1 || ExitingBlocks.size() != 1) return false; BasicBlock *ExitBlock = ExitBlocks[0]; @@ -1654,6 +1681,10 @@ Instruction *WidenIV::widenIVUse(NarrowIVDefUse DU, SCEVExpander &Rewriter) { return nullptr; } + // if we reached this point then we are going to replace + // DU.NarrowUse with WideUse. Reattach DbgValue then. + replaceAllDbgUsesWith(*DU.NarrowUse, *WideUse, *WideUse, *DT); + ExtendKindMap[DU.NarrowUse] = WideAddRec.second; // Returning WideUse pushes it on the worklist. return WideUse; @@ -1779,14 +1810,9 @@ PHINode *WidenIV::createWideIV(SCEVExpander &Rewriter) { DeadInsts.emplace_back(DU.NarrowDef); } - // Attach any debug information to the new PHI. Since OrigPhi and WidePHI - // evaluate the same recurrence, we can just copy the debug info over. - SmallVector DbgValues; - llvm::findDbgValues(DbgValues, OrigPhi); - auto *MDPhi = MetadataAsValue::get(WidePhi->getContext(), - ValueAsMetadata::get(WidePhi)); - for (auto &DbgValue : DbgValues) - DbgValue->setOperand(0, MDPhi); + // Attach any debug information to the new PHI. 
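The exit-value rewrite earlier in this hunk falls back to evaluating an add-recurrence at a specific exit count; for a first-order (affine) recurrence {start,+,step} that evaluation is just start + step * n. A standalone C++ check of that closed form (SCEV's evaluateAtIteration also handles higher-order recurrences, which this sketch does not):

#include <cassert>
#include <cstdint>

// Value of the affine recurrence {start,+,step} at iteration n.
uint64_t affineAt(uint64_t start, uint64_t step, uint64_t n) {
  return start + step * n;
}

int main() {
  // Cross-check against literally running the recurrence {7,+,3}.
  uint64_t iv = 7;
  for (uint64_t i = 0; i < 10; ++i) {
    assert(affineAt(7, 3, i) == iv);
    iv += 3;
  }
  return 0;
}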
+ replaceAllDbgUsesWith(*OrigPhi, *WidePhi, *WidePhi, *DT); + return WidePhi; } @@ -1817,8 +1843,8 @@ void WidenIV::calculatePostIncRange(Instruction *NarrowDef, auto CmpRHSRange = SE->getSignedRange(SE->getSCEV(CmpRHS)); auto CmpConstrainedLHSRange = ConstantRange::makeAllowedICmpRegion(P, CmpRHSRange); - auto NarrowDefRange = - CmpConstrainedLHSRange.addWithNoSignedWrap(*NarrowDefRHS); + auto NarrowDefRange = CmpConstrainedLHSRange.addWithNoWrap( + *NarrowDefRHS, OverflowingBinaryOperator::NoSignedWrap); updatePostIncRangeInfo(NarrowDef, NarrowUser, NarrowDefRange); }; @@ -2242,8 +2268,8 @@ static PHINode *FindLoopCounter(Loop *L, BasicBlock *ExitingBB, if (BECount->getType()->isPointerTy() && !Phi->getType()->isPointerTy()) continue; - const auto *AR = dyn_cast(SE->getSCEV(Phi)); - + const auto *AR = cast(SE->getSCEV(Phi)); + // AR may be a pointer type, while BECount is an integer type. // AR may be wider than BECount. With eq/ne tests overflow is immaterial. // AR may not be a narrower type, or we may never exit. @@ -2624,74 +2650,125 @@ bool IndVarSimplify::sinkUnusedInvariants(Loop *L) { return MadeAnyChanges; } -bool IndVarSimplify::optimizeLoopExits(Loop *L) { +/// Return a symbolic upper bound for the backedge taken count of the loop. +/// This is more general than getConstantMaxBackedgeTakenCount as it returns +/// an arbitrary expression as opposed to only constants. +/// TODO: Move into the ScalarEvolution class. +static const SCEV* getMaxBackedgeTakenCount(ScalarEvolution &SE, + DominatorTree &DT, Loop *L) { SmallVector ExitingBlocks; L->getExitingBlocks(ExitingBlocks); // Form an expression for the maximum exit count possible for this loop. We // merge the max and exact information to approximate a version of - // getMaxBackedgeTakenInfo which isn't restricted to just constants. - // TODO: factor this out as a version of getMaxBackedgeTakenCount which - // isn't guaranteed to return a constant. + // getConstantMaxBackedgeTakenCount which isn't restricted to just constants. SmallVector ExitCounts; - const SCEV *MaxConstEC = SE->getMaxBackedgeTakenCount(L); + const SCEV *MaxConstEC = SE.getConstantMaxBackedgeTakenCount(L); if (!isa(MaxConstEC)) ExitCounts.push_back(MaxConstEC); for (BasicBlock *ExitingBB : ExitingBlocks) { - const SCEV *ExitCount = SE->getExitCount(L, ExitingBB); + const SCEV *ExitCount = SE.getExitCount(L, ExitingBB); if (!isa(ExitCount)) { - assert(DT->dominates(ExitingBB, L->getLoopLatch()) && + assert(DT.dominates(ExitingBB, L->getLoopLatch()) && "We should only have known counts for exiting blocks that " "dominate latch!"); ExitCounts.push_back(ExitCount); } } if (ExitCounts.empty()) - return false; - const SCEV *MaxExitCount = SE->getUMinFromMismatchedTypes(ExitCounts); + return SE.getCouldNotCompute(); + return SE.getUMinFromMismatchedTypes(ExitCounts); +} - bool Changed = false; - for (BasicBlock *ExitingBB : ExitingBlocks) { +bool IndVarSimplify::optimizeLoopExits(Loop *L, SCEVExpander &Rewriter) { + SmallVector ExitingBlocks; + L->getExitingBlocks(ExitingBlocks); + + // Remove all exits which aren't both rewriteable and analyzeable. + auto NewEnd = llvm::remove_if(ExitingBlocks, + [&](BasicBlock *ExitingBB) { // If our exitting block exits multiple loops, we can only rewrite the // innermost one. Otherwise, we're changing how many times the innermost // loop runs before it exits. if (LI->getLoopFor(ExitingBB) != L) - continue; + return true; // Can't rewrite non-branch yet. 
BranchInst *BI = dyn_cast(ExitingBB->getTerminator()); if (!BI) - continue; + return true; // If already constant, nothing to do. if (isa(BI->getCondition())) - continue; + return true; const SCEV *ExitCount = SE->getExitCount(L, ExitingBB); if (isa(ExitCount)) - continue; + return true; + return false; + }); + ExitingBlocks.erase(NewEnd, ExitingBlocks.end()); + + if (ExitingBlocks.empty()) + return false; + + // Get a symbolic upper bound on the loop backedge taken count. + const SCEV *MaxExitCount = getMaxBackedgeTakenCount(*SE, *DT, L); + if (isa(MaxExitCount)) + return false; + + // Visit our exit blocks in order of dominance. We know from the fact that + // all exits (left) are analyzeable that the must be a total dominance order + // between them as each must dominate the latch. The visit order only + // matters for the provably equal case. + llvm::sort(ExitingBlocks, + [&](BasicBlock *A, BasicBlock *B) { + // std::sort sorts in ascending order, so we want the inverse of + // the normal dominance relation. + if (DT->properlyDominates(A, B)) return true; + if (DT->properlyDominates(B, A)) return false; + llvm_unreachable("expected total dominance order!"); + }); +#ifdef ASSERT + for (unsigned i = 1; i < ExitingBlocks.size(); i++) { + assert(DT->dominates(ExitingBlocks[i-1], ExitingBlocks[i])); + } +#endif + + auto FoldExit = [&](BasicBlock *ExitingBB, bool IsTaken) { + BranchInst *BI = cast(ExitingBB->getTerminator()); + bool ExitIfTrue = !L->contains(*succ_begin(ExitingBB)); + auto *OldCond = BI->getCondition(); + auto *NewCond = ConstantInt::get(OldCond->getType(), + IsTaken ? ExitIfTrue : !ExitIfTrue); + BI->setCondition(NewCond); + if (OldCond->use_empty()) + DeadInsts.push_back(OldCond); + }; + bool Changed = false; + SmallSet DominatingExitCounts; + for (BasicBlock *ExitingBB : ExitingBlocks) { + const SCEV *ExitCount = SE->getExitCount(L, ExitingBB); + assert(!isa(ExitCount) && "checked above"); + // If we know we'd exit on the first iteration, rewrite the exit to // reflect this. This does not imply the loop must exit through this // exit; there may be an earlier one taken on the first iteration. // TODO: Given we know the backedge can't be taken, we should go ahead // and break it. Or at least, kill all the header phis and simplify. if (ExitCount->isZero()) { - bool ExitIfTrue = !L->contains(*succ_begin(ExitingBB)); - auto *OldCond = BI->getCondition(); - auto *NewCond = ExitIfTrue ? ConstantInt::getTrue(OldCond->getType()) : - ConstantInt::getFalse(OldCond->getType()); - BI->setCondition(NewCond); - if (OldCond->use_empty()) - DeadInsts.push_back(OldCond); + FoldExit(ExitingBB, true); Changed = true; continue; } - // If we end up with a pointer exit count, bail. + // If we end up with a pointer exit count, bail. Note that we can end up + // with a pointer exit count for one exiting block, and not for another in + // the same loop. if (!ExitCount->getType()->isIntegerTy() || !MaxExitCount->getType()->isIntegerTy()) - return false; + continue; Type *WiderType = SE->getWiderType(MaxExitCount->getType(), ExitCount->getType()); @@ -2700,35 +2777,198 @@ bool IndVarSimplify::optimizeLoopExits(Loop *L) { assert(MaxExitCount->getType() == ExitCount->getType()); // Can we prove that some other exit must be taken strictly before this - // one? TODO: handle cases where ule is known, and equality is covered - // by a dominating exit + // one? 
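A standalone source-level C++ sketch of the folding this enables: when another exit must be taken strictly earlier, the later exit's branch condition can be replaced by a constant, exactly as FoldExit does above (the loop and counts are made up for illustration):

#include <cassert>

// Two analyzable exits; the second can only trigger on iteration 100, but
// the first always fires on iteration 5, so the second exit is dead and its
// condition can be folded to "never exit".
int original(void) {
  int sum = 0;
  for (int i = 0;; ++i) {
    sum += i;
    if (i == 5)
      return sum;   // exit count 5
    if (i == 100)
      return -1;    // exit count 100 > max backedge-taken count (5)
  }
}

int folded(void) {
  int sum = 0;
  for (int i = 0;; ++i) {
    sum += i;
    if (i == 5)
      return sum;
    if (false)      // branch condition folded: this exit is provably dead
      return -1;
  }
}

int main() {
  assert(original() == folded());
  return 0;
}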
if (SE->isLoopEntryGuardedByCond(L, CmpInst::ICMP_ULT, MaxExitCount, ExitCount)) { - bool ExitIfTrue = !L->contains(*succ_begin(ExitingBB)); - auto *OldCond = BI->getCondition(); - auto *NewCond = ExitIfTrue ? ConstantInt::getFalse(OldCond->getType()) : - ConstantInt::getTrue(OldCond->getType()); - BI->setCondition(NewCond); - if (OldCond->use_empty()) - DeadInsts.push_back(OldCond); + FoldExit(ExitingBB, false); Changed = true; continue; } - // TODO: If we can prove that the exiting iteration is equal to the exit - // count for this exit and that no previous exit oppurtunities exist within - // the loop, then we can discharge all other exits. (May fall out of - // previous TODO.) - - // TODO: If we can't prove any relation between our exit count and the - // loops exit count, but taking this exit doesn't require actually running - // the loop (i.e. no side effects, no computed values used in exit), then - // we can replace the exit test with a loop invariant test which exits on - // the first iteration. + // As we run, keep track of which exit counts we've encountered. If we + // find a duplicate, we've found an exit which would have exited on the + // exiting iteration, but (from the visit order) strictly follows another + // which does the same and is thus dead. + if (!DominatingExitCounts.insert(ExitCount).second) { + FoldExit(ExitingBB, false); + Changed = true; + continue; + } + + // TODO: There might be another oppurtunity to leverage SCEV's reasoning + // here. If we kept track of the min of dominanting exits so far, we could + // discharge exits with EC >= MDEC. This is less powerful than the existing + // transform (since later exits aren't considered), but potentially more + // powerful for any case where SCEV can prove a >=u b, but neither a == b + // or a >u b. Such a case is not currently known. } return Changed; } +bool IndVarSimplify::predicateLoopExits(Loop *L, SCEVExpander &Rewriter) { + SmallVector ExitingBlocks; + L->getExitingBlocks(ExitingBlocks); + + bool Changed = false; + + // Finally, see if we can rewrite our exit conditions into a loop invariant + // form. If we have a read-only loop, and we can tell that we must exit down + // a path which does not need any of the values computed within the loop, we + // can rewrite the loop to exit on the first iteration. Note that this + // doesn't either a) tell us the loop exits on the first iteration (unless + // *all* exits are predicateable) or b) tell us *which* exit might be taken. + // This transformation looks a lot like a restricted form of dead loop + // elimination, but restricted to read-only loops and without neccesssarily + // needing to kill the loop entirely. + if (!LoopPredication) + return Changed; + + if (!SE->hasLoopInvariantBackedgeTakenCount(L)) + return Changed; + + // Note: ExactBTC is the exact backedge taken count *iff* the loop exits + // through *explicit* control flow. We have to eliminate the possibility of + // implicit exits (see below) before we know it's truly exact. + const SCEV *ExactBTC = SE->getBackedgeTakenCount(L); + if (isa(ExactBTC) || + !SE->isLoopInvariant(ExactBTC, L) || + !isSafeToExpand(ExactBTC, *SE)) + return Changed; + + auto BadExit = [&](BasicBlock *ExitingBB) { + // If our exiting block exits multiple loops, we can only rewrite the + // innermost one. Otherwise, we're changing how many times the innermost + // loop runs before it exits. + if (LI->getLoopFor(ExitingBB) != L) + return true; + + // Can't rewrite non-branch yet. 
+ BranchInst *BI = dyn_cast(ExitingBB->getTerminator()); + if (!BI) + return true; + + // If already constant, nothing to do. + if (isa(BI->getCondition())) + return true; + + // If the exit block has phis, we need to be able to compute the values + // within the loop which contains them. This assumes trivially lcssa phis + // have already been removed; TODO: generalize + BasicBlock *ExitBlock = + BI->getSuccessor(L->contains(BI->getSuccessor(0)) ? 1 : 0); + if (!ExitBlock->phis().empty()) + return true; + + const SCEV *ExitCount = SE->getExitCount(L, ExitingBB); + assert(!isa(ExactBTC) && "implied by having exact trip count"); + if (!SE->isLoopInvariant(ExitCount, L) || + !isSafeToExpand(ExitCount, *SE)) + return true; + + return false; + }; + + // If we have any exits which can't be predicated themselves, than we can't + // predicate any exit which isn't guaranteed to execute before it. Consider + // two exits (a) and (b) which would both exit on the same iteration. If we + // can predicate (b), but not (a), and (a) preceeds (b) along some path, then + // we could convert a loop from exiting through (a) to one exiting through + // (b). Note that this problem exists only for exits with the same exit + // count, and we could be more aggressive when exit counts are known inequal. + llvm::sort(ExitingBlocks, + [&](BasicBlock *A, BasicBlock *B) { + // std::sort sorts in ascending order, so we want the inverse of + // the normal dominance relation, plus a tie breaker for blocks + // unordered by dominance. + if (DT->properlyDominates(A, B)) return true; + if (DT->properlyDominates(B, A)) return false; + return A->getName() < B->getName(); + }); + // Check to see if our exit blocks are a total order (i.e. a linear chain of + // exits before the backedge). If they aren't, reasoning about reachability + // is complicated and we choose not to for now. + for (unsigned i = 1; i < ExitingBlocks.size(); i++) + if (!DT->dominates(ExitingBlocks[i-1], ExitingBlocks[i])) + return Changed; + + // Given our sorted total order, we know that exit[j] must be evaluated + // after all exit[i] such j > i. + for (unsigned i = 0, e = ExitingBlocks.size(); i < e; i++) + if (BadExit(ExitingBlocks[i])) { + ExitingBlocks.resize(i); + break; + } + + if (ExitingBlocks.empty()) + return Changed; + + // We rely on not being able to reach an exiting block on a later iteration + // then it's statically compute exit count. The implementaton of + // getExitCount currently has this invariant, but assert it here so that + // breakage is obvious if this ever changes.. + assert(llvm::all_of(ExitingBlocks, [&](BasicBlock *ExitingBB) { + return DT->dominates(ExitingBB, L->getLoopLatch()); + })); + + // At this point, ExitingBlocks consists of only those blocks which are + // predicatable. Given that, we know we have at least one exit we can + // predicate if the loop is doesn't have side effects and doesn't have any + // implicit exits (because then our exact BTC isn't actually exact). + // @Reviewers - As structured, this is O(I^2) for loop nests. Any + // suggestions on how to improve this? I can obviously bail out for outer + // loops, but that seems less than ideal. MemorySSA can find memory writes, + // is that enough for *all* side effects? + for (BasicBlock *BB : L->blocks()) + for (auto &I : *BB) + // TODO:isGuaranteedToTransfer + if (I.mayHaveSideEffects() || I.mayThrow()) + return Changed; + + // Finally, do the actual predication for all predicatable blocks. 
A couple + // of notes here: + // 1) We don't bother to constant fold dominated exits with identical exit + // counts; that's simply a form of CSE/equality propagation and we leave + // it for dedicated passes. + // 2) We insert the comparison at the branch. Hoisting introduces additional + // legality constraints and we leave that to dedicated logic. We want to + // predicate even if we can't insert a loop invariant expression as + // peeling or unrolling will likely reduce the cost of the otherwise loop + // varying check. + Rewriter.setInsertPoint(L->getLoopPreheader()->getTerminator()); + IRBuilder<> B(L->getLoopPreheader()->getTerminator()); + Value *ExactBTCV = nullptr; //lazy generated if needed + for (BasicBlock *ExitingBB : ExitingBlocks) { + const SCEV *ExitCount = SE->getExitCount(L, ExitingBB); + + auto *BI = cast(ExitingBB->getTerminator()); + Value *NewCond; + if (ExitCount == ExactBTC) { + NewCond = L->contains(BI->getSuccessor(0)) ? + B.getFalse() : B.getTrue(); + } else { + Value *ECV = Rewriter.expandCodeFor(ExitCount); + if (!ExactBTCV) + ExactBTCV = Rewriter.expandCodeFor(ExactBTC); + Value *RHS = ExactBTCV; + if (ECV->getType() != RHS->getType()) { + Type *WiderTy = SE->getWiderType(ECV->getType(), RHS->getType()); + ECV = B.CreateZExt(ECV, WiderTy); + RHS = B.CreateZExt(RHS, WiderTy); + } + auto Pred = L->contains(BI->getSuccessor(0)) ? + ICmpInst::ICMP_NE : ICmpInst::ICMP_EQ; + NewCond = B.CreateICmp(Pred, ECV, RHS); + } + Value *OldCond = BI->getCondition(); + BI->setCondition(NewCond); + if (OldCond->use_empty()) + DeadInsts.push_back(OldCond); + Changed = true; + } + + return Changed; +} + //===----------------------------------------------------------------------===// // IndVarSimplify driver. Manage several subpasses of IV simplification. //===----------------------------------------------------------------------===// @@ -2755,7 +2995,10 @@ bool IndVarSimplify::run(Loop *L) { // transform them to use integer recurrences. Changed |= rewriteNonIntegerIVs(L); +#ifndef NDEBUG + // Used below for a consistency check only const SCEV *BackedgeTakenCount = SE->getBackedgeTakenCount(L); +#endif // Create a rewriter object which we'll use to transform the code with. SCEVExpander Rewriter(*SE, DL, "indvars"); @@ -2772,20 +3015,22 @@ bool IndVarSimplify::run(Loop *L) { Rewriter.disableCanonicalMode(); Changed |= simplifyAndExtend(L, Rewriter, LI); - // Check to see if this loop has a computable loop-invariant execution count. - // If so, this means that we can compute the final value of any expressions + // Check to see if we can compute the final value of any expressions // that are recurrent in the loop, and substitute the exit values from the - // loop into any instructions outside of the loop that use the final values of - // the current expressions. - // - if (ReplaceExitValue != NeverRepl && - !isa(BackedgeTakenCount)) + // loop into any instructions outside of the loop that use the final values + // of the current expressions. + if (ReplaceExitValue != NeverRepl) Changed |= rewriteLoopExitValues(L, Rewriter); // Eliminate redundant IV cycles. NumElimIV += Rewriter.replaceCongruentIVs(L, DT, DeadInsts); - Changed |= optimizeLoopExits(L); + // Try to eliminate loop exits based on analyzeable exit counts + Changed |= optimizeLoopExits(L, Rewriter); + + // Try to form loop invariant tests for loop exits by changing how many + // iterations of the loop run when that is unobservable. 
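The expansion above turns each remaining exit test into a loop-invariant compare of that exit's count against the exact backedge-taken count, widening with zext when the two counts have different types. A stylized source-level sketch of the effect follows (contains_before, contains_after, and first_match are hypothetical; first_match stands in for whatever expression the SCEV expander would materialize for the early exit's count). Because the loop is read-only and the exit paths use none of its values, running fewer iterations is unobservable as long as control still leaves through the same exit.

    #include <cstddef>

    // Before: a read-only search loop with two exits.
    bool contains_before(const int *a, size_t n, int key) {
      for (size_t i = 0; i < n; ++i)   // exit count: n
        if (a[i] == key)               // exit count: index of first match
          return true;
      return false;
    }

    // After (conceptual shape): each exit condition is a loop-invariant
    // compare against the exact backedge-taken count, so whichever exit
    // "owns" that count is taken on the very first iteration.
    bool contains_after(size_t n, size_t first_match) {
      size_t exact_btc = first_match < n ? first_match : n;
      for (;;) {
        if (n == exact_btc)            // was: i < n        (bounds exit)
          return false;
        if (first_match == exact_btc)  // was: a[i] == key  (match exit)
          return true;
      }
    }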
+ Changed |= predicateLoopExits(L, Rewriter); // If we have a trip count expression, rewrite the loop's exit condition // using it. @@ -2825,7 +3070,7 @@ bool IndVarSimplify::run(Loop *L) { // that our definition of "high cost" is not exactly principled. if (Rewriter.isHighCostExpansion(ExitCount, L)) continue; - + // Check preconditions for proper SCEVExpander operation. SCEV does not // express SCEVExpander's dependencies, such as LoopSimplify. Instead // any pass that uses the SCEVExpander must do it. This does not work @@ -2924,7 +3169,7 @@ struct IndVarSimplifyLegacyPass : public LoopPass { auto *SE = &getAnalysis().getSE(); auto *DT = &getAnalysis().getDomTree(); auto *TLIP = getAnalysisIfAvailable(); - auto *TLI = TLIP ? &TLIP->getTLI() : nullptr; + auto *TLI = TLIP ? &TLIP->getTLI(*L->getHeader()->getParent()) : nullptr; auto *TTIP = getAnalysisIfAvailable(); auto *TTI = TTIP ? &TTIP->getTTI(*L->getHeader()->getParent()) : nullptr; const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); diff --git a/lib/Transforms/Scalar/InferAddressSpaces.cpp b/lib/Transforms/Scalar/InferAddressSpaces.cpp index 5f0e2001c73d..e7e73a132fbe 100644 --- a/lib/Transforms/Scalar/InferAddressSpaces.cpp +++ b/lib/Transforms/Scalar/InferAddressSpaces.cpp @@ -141,6 +141,8 @@ using ValueToAddrSpaceMapTy = DenseMap; /// InferAddressSpaces class InferAddressSpaces : public FunctionPass { + const TargetTransformInfo *TTI; + /// Target specific address space which uses of should be replaced if /// possible. unsigned FlatAddrSpace; @@ -264,17 +266,6 @@ bool InferAddressSpaces::rewriteIntrinsicOperands(IntrinsicInst *II, Module *M = II->getParent()->getParent()->getParent(); switch (II->getIntrinsicID()) { - case Intrinsic::amdgcn_atomic_inc: - case Intrinsic::amdgcn_atomic_dec: - case Intrinsic::amdgcn_ds_fadd: - case Intrinsic::amdgcn_ds_fmin: - case Intrinsic::amdgcn_ds_fmax: { - const ConstantInt *IsVolatile = dyn_cast(II->getArgOperand(4)); - if (!IsVolatile || !IsVolatile->isZero()) - return false; - - LLVM_FALLTHROUGH; - } case Intrinsic::objectsize: { Type *DestTy = II->getType(); Type *SrcTy = NewV->getType(); @@ -285,25 +276,27 @@ bool InferAddressSpaces::rewriteIntrinsicOperands(IntrinsicInst *II, return true; } default: - return false; + return TTI->rewriteIntrinsicWithAddressSpace(II, OldV, NewV); } } -// TODO: Move logic to TTI? 
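With the guard relaxed above, rewriteLoopExitValues runs whenever exit-value replacement is enabled rather than only when the whole loop's backedge-taken count is known. The transform it performs can be shown with a hypothetical pair of functions (not from this patch): a value that lives out of the loop is replaced by a closed-form expression, which often leaves the loop dead.

    // Before: the final value of i is used after the loop.
    int count_before(int n) {
      int i = 0;
      for (; i < n; ++i) { /* no other uses of i outside the loop */ }
      return i;
    }

    // After exit-value rewriting (conceptually): the out-of-loop use is fed
    // by a closed form of the trip count instead of the loop-carried value.
    int count_after(int n) {
      for (int i = 0; i < n; ++i) { }
      return n > 0 ? n : 0;   // SCEV-computed exit value of i
    }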
void InferAddressSpaces::collectRewritableIntrinsicOperands( IntrinsicInst *II, std::vector> &PostorderStack, DenseSet &Visited) const { - switch (II->getIntrinsicID()) { + auto IID = II->getIntrinsicID(); + switch (IID) { case Intrinsic::objectsize: - case Intrinsic::amdgcn_atomic_inc: - case Intrinsic::amdgcn_atomic_dec: - case Intrinsic::amdgcn_ds_fadd: - case Intrinsic::amdgcn_ds_fmin: - case Intrinsic::amdgcn_ds_fmax: appendsFlatAddressExpressionToPostorderStack(II->getArgOperand(0), PostorderStack, Visited); break; default: + SmallVector OpIndexes; + if (TTI->collectFlatAddressOperands(OpIndexes, IID)) { + for (int Idx : OpIndexes) { + appendsFlatAddressExpressionToPostorderStack(II->getArgOperand(Idx), + PostorderStack, Visited); + } + } break; } } @@ -631,11 +624,10 @@ bool InferAddressSpaces::runOnFunction(Function &F) { if (skipFunction(F)) return false; - const TargetTransformInfo &TTI = - getAnalysis().getTTI(F); + TTI = &getAnalysis().getTTI(F); if (FlatAddrSpace == UninitializedAddressSpace) { - FlatAddrSpace = TTI.getFlatAddressSpace(); + FlatAddrSpace = TTI->getFlatAddressSpace(); if (FlatAddrSpace == UninitializedAddressSpace) return false; } @@ -650,7 +642,7 @@ bool InferAddressSpaces::runOnFunction(Function &F) { // Changes the address spaces of the flat address expressions who are inferred // to point to a specific address space. - return rewriteWithNewAddressSpaces(TTI, Postorder, InferredAddrSpace, &F); + return rewriteWithNewAddressSpaces(*TTI, Postorder, InferredAddrSpace, &F); } // Constants need to be tracked through RAUW to handle cases with nested diff --git a/lib/Transforms/Scalar/InstSimplifyPass.cpp b/lib/Transforms/Scalar/InstSimplifyPass.cpp index 6616364ab203..ec28f790f252 100644 --- a/lib/Transforms/Scalar/InstSimplifyPass.cpp +++ b/lib/Transforms/Scalar/InstSimplifyPass.cpp @@ -33,37 +33,39 @@ static bool runImpl(Function &F, const SimplifyQuery &SQ, bool Changed = false; do { - for (BasicBlock *BB : depth_first(&F.getEntryBlock())) { - // Here be subtlety: the iterator must be incremented before the loop - // body (not sure why), so a range-for loop won't work here. - for (BasicBlock::iterator BI = BB->begin(), BE = BB->end(); BI != BE;) { - Instruction *I = &*BI++; - // The first time through the loop ToSimplify is empty and we try to - // simplify all instructions. On later iterations ToSimplify is not + for (BasicBlock &BB : F) { + // Unreachable code can take on strange forms that we are not prepared to + // handle. For example, an instruction may have itself as an operand. + if (!SQ.DT->isReachableFromEntry(&BB)) + continue; + + SmallVector DeadInstsInBB; + for (Instruction &I : BB) { + // The first time through the loop, ToSimplify is empty and we try to + // simplify all instructions. On later iterations, ToSimplify is not // empty and we only bother simplifying instructions that are in it. - if (!ToSimplify->empty() && !ToSimplify->count(I)) + if (!ToSimplify->empty() && !ToSimplify->count(&I)) continue; - // Don't waste time simplifying unused instructions. - if (!I->use_empty()) { - if (Value *V = SimplifyInstruction(I, SQ, ORE)) { + // Don't waste time simplifying dead/unused instructions. + if (isInstructionTriviallyDead(&I)) { + DeadInstsInBB.push_back(&I); + Changed = true; + } else if (!I.use_empty()) { + if (Value *V = SimplifyInstruction(&I, SQ, ORE)) { // Mark all uses for resimplification next time round the loop. 
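The hunk above removes the hard-coded AMDGPU intrinsics and instead asks TTI which operands of an unknown intrinsic are flat pointers (collectFlatAddressOperands) and whether the target can rewrite it (rewriteIntrinsicWithAddressSpace). A minimal sketch of that delegation shape, using toy types rather than the real pass or TTI classes (illustration only):

    #include <vector>

    struct ToyIntrinsic { int ID; std::vector<int> PtrOperands; };

    // Target hook interface: the generic pass no longer lists any target's
    // intrinsics; the target reports which operands to walk and performs
    // the rewrite itself when it can.
    struct ToyTTI {
      virtual ~ToyTTI() = default;
      virtual bool collectFlatAddressOperands(std::vector<int> &OpIndexes,
                                              int IID) const = 0;
      virtual bool rewriteIntrinsicWithAddressSpace(ToyIntrinsic &I) const = 0;
    };

    bool rewriteIntrinsicOperands(ToyIntrinsic &I, const ToyTTI &TTI) {
      switch (I.ID) {
      case 1: // e.g. objectsize, still handled generically
        return true;
      default:
        return TTI.rewriteIntrinsicWithAddressSpace(I); // target decides
      }
    }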
- for (User *U : I->users()) + for (User *U : I.users()) Next->insert(cast(U)); - I->replaceAllUsesWith(V); + I.replaceAllUsesWith(V); ++NumSimplified; Changed = true; + // A call can get simplified, but it may not be trivially dead. + if (isInstructionTriviallyDead(&I)) + DeadInstsInBB.push_back(&I); } } - if (RecursivelyDeleteTriviallyDeadInstructions(I, SQ.TLI)) { - // RecursivelyDeleteTriviallyDeadInstruction can remove more than one - // instruction, so simply incrementing the iterator does not work. - // When instructions get deleted re-iterate instead. - BI = BB->begin(); - BE = BB->end(); - Changed = true; - } } + RecursivelyDeleteTriviallyDeadInstructions(DeadInstsInBB, SQ.TLI); } // Place the list of instructions to simplify on the next loop iteration @@ -90,7 +92,7 @@ struct InstSimplifyLegacyPass : public FunctionPass { AU.addRequired(); } - /// runOnFunction - Remove instructions that simplify. + /// Remove instructions that simplify. bool runOnFunction(Function &F) override { if (skipFunction(F)) return false; @@ -98,7 +100,7 @@ struct InstSimplifyLegacyPass : public FunctionPass { const DominatorTree *DT = &getAnalysis().getDomTree(); const TargetLibraryInfo *TLI = - &getAnalysis().getTLI(); + &getAnalysis().getTLI(F); AssumptionCache *AC = &getAnalysis().getAssumptionCache(F); OptimizationRemarkEmitter *ORE = diff --git a/lib/Transforms/Scalar/JumpThreading.cpp b/lib/Transforms/Scalar/JumpThreading.cpp index b86bf2fefbe5..0cf00baaa24a 100644 --- a/lib/Transforms/Scalar/JumpThreading.cpp +++ b/lib/Transforms/Scalar/JumpThreading.cpp @@ -224,13 +224,21 @@ static void updatePredecessorProfileMetadata(PHINode *PN, BasicBlock *BB) { BasicBlock *PhiBB) -> std::pair { auto *PredBB = IncomingBB; auto *SuccBB = PhiBB; + SmallPtrSet Visited; while (true) { BranchInst *PredBr = dyn_cast(PredBB->getTerminator()); if (PredBr && PredBr->isConditional()) return {PredBB, SuccBB}; + Visited.insert(PredBB); auto *SinglePredBB = PredBB->getSinglePredecessor(); if (!SinglePredBB) return {nullptr, nullptr}; + + // Stop searching when SinglePredBB has been visited. It means we see + // an unreachable loop. + if (Visited.count(SinglePredBB)) + return {nullptr, nullptr}; + SuccBB = PredBB; PredBB = SinglePredBB; } @@ -253,7 +261,9 @@ static void updatePredecessorProfileMetadata(PHINode *PN, BasicBlock *BB) { return; BasicBlock *PredBB = PredOutEdge.first; - BranchInst *PredBr = cast(PredBB->getTerminator()); + BranchInst *PredBr = dyn_cast(PredBB->getTerminator()); + if (!PredBr) + return; uint64_t PredTrueWeight, PredFalseWeight; // FIXME: We currently only set the profile data when it is missing. @@ -286,7 +296,7 @@ static void updatePredecessorProfileMetadata(PHINode *PN, BasicBlock *BB) { bool JumpThreading::runOnFunction(Function &F) { if (skipFunction(F)) return false; - auto TLI = &getAnalysis().getTLI(); + auto TLI = &getAnalysis().getTLI(F); // Get DT analysis before LVI. When LVI is initialized it conditionally adds // DT if it's available. 
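The InstSimplify rewrite above replaces the old restart-the-block-iterator dance with a single pass that remembers dead instructions and erases them afterwards. The same collect-then-erase shape, over a plain std::list (a toy example, not the pass itself):

    #include <list>
    #include <vector>

    void simplify(std::list<int> &Block) {
      std::vector<std::list<int>::iterator> Dead;
      for (auto It = Block.begin(); It != Block.end(); ++It)
        if (*It == 0)          // stand-in for "trivially dead instruction"
          Dead.push_back(It);  // defer erasure; the walk stays simple and O(n)
      for (auto It : Dead)
        Block.erase(It);       // safe: erase only invalidates the erased node
    }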
auto DT = &getAnalysis().getDomTree(); @@ -1461,7 +1471,7 @@ bool JumpThreadingPass::SimplifyPartiallyRedundantLoad(LoadInst *LoadI) { "Can't handle critical edge here!"); LoadInst *NewVal = new LoadInst( LoadI->getType(), LoadedPtr->DoPHITranslation(LoadBB, UnavailablePred), - LoadI->getName() + ".pr", false, LoadI->getAlignment(), + LoadI->getName() + ".pr", false, MaybeAlign(LoadI->getAlignment()), LoadI->getOrdering(), LoadI->getSyncScopeID(), UnavailablePred->getTerminator()); NewVal->setDebugLoc(LoadI->getDebugLoc()); @@ -2423,7 +2433,7 @@ void JumpThreadingPass::UnfoldSelectInstr(BasicBlock *Pred, BasicBlock *BB, // |----- // v // BB - BranchInst *PredTerm = dyn_cast(Pred->getTerminator()); + BranchInst *PredTerm = cast(Pred->getTerminator()); BasicBlock *NewBB = BasicBlock::Create(BB->getContext(), "select.unfold", BB->getParent(), BB); // Move the unconditional branch to NewBB. diff --git a/lib/Transforms/Scalar/LICM.cpp b/lib/Transforms/Scalar/LICM.cpp index d9dda4cef2d2..6ce4831a7359 100644 --- a/lib/Transforms/Scalar/LICM.cpp +++ b/lib/Transforms/Scalar/LICM.cpp @@ -220,7 +220,8 @@ struct LegacyLICMPass : public LoopPass { &getAnalysis().getAAResults(), &getAnalysis().getLoopInfo(), &getAnalysis().getDomTree(), - &getAnalysis().getTLI(), + &getAnalysis().getTLI( + *L->getHeader()->getParent()), &getAnalysis().getTTI( *L->getHeader()->getParent()), SE ? &SE->getSE() : nullptr, MSSA, &ORE, false); @@ -294,7 +295,7 @@ PreservedAnalyses LICMPass::run(Loop &L, LoopAnalysisManager &AM, PA.preserve(); PA.preserve(); - if (EnableMSSALoopDependency) + if (AR.MSSA) PA.preserve(); return PA; @@ -330,6 +331,12 @@ bool LoopInvariantCodeMotion::runOnLoop( assert(L->isLCSSAForm(*DT) && "Loop is not in LCSSA form."); + // If this loop has metadata indicating that LICM is not to be performed then + // just exit. + if (hasDisableLICMTransformsHint(L)) { + return false; + } + std::unique_ptr CurAST; std::unique_ptr MSSAU; bool NoOfMemAccTooLarge = false; @@ -340,7 +347,7 @@ bool LoopInvariantCodeMotion::runOnLoop( CurAST = collectAliasInfoForLoop(L, LI, AA); } else { LLVM_DEBUG(dbgs() << "LICM: Using MemorySSA.\n"); - MSSAU = make_unique(MSSA); + MSSAU = std::make_unique(MSSA); unsigned AccessCapCount = 0; for (auto *BB : L->getBlocks()) { @@ -956,7 +963,7 @@ bool llvm::hoistRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI, // Now that we've finished hoisting make sure that LI and DT are still // valid. -#ifndef NDEBUG +#ifdef EXPENSIVE_CHECKS if (Changed) { assert(DT->verify(DominatorTree::VerificationLevel::Fast) && "Dominator tree verification failed"); @@ -1026,7 +1033,8 @@ namespace { bool isHoistableAndSinkableInst(Instruction &I) { // Only these instructions are hoistable/sinkable. return (isa(I) || isa(I) || isa(I) || - isa(I) || isa(I) || isa(I) || + isa(I) || isa(I) || + isa(I) || isa(I) || isa(I) || isa(I) || isa(I) || isa(I) || isa(I) || isa(I) || isa(I) || @@ -1092,7 +1100,7 @@ bool llvm::canSinkOrHoistInst(Instruction &I, AAResults *AA, DominatorTree *DT, // in the same alias set as something that ends up being modified. if (AA->pointsToConstantMemory(LI->getOperand(0))) return true; - if (LI->getMetadata(LLVMContext::MD_invariant_load)) + if (LI->hasMetadata(LLVMContext::MD_invariant_load)) return true; if (LI->isAtomic() && !TargetExecutesOncePerLoop) @@ -1240,12 +1248,22 @@ bool llvm::canSinkOrHoistInst(Instruction &I, AAResults *AA, DominatorTree *DT, // FIXME: More precise: no Uses that alias SI. 
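The JumpThreading fix above guards the walk up single-predecessor chains with a Visited set so that an unreachable loop cannot spin forever. A standalone sketch of the same guard on a toy node type (not LLVM API):

    #include <unordered_set>

    struct Node { Node *SinglePred = nullptr; bool HasCondBranch = false; };

    Node *findConditionalAncestor(Node *N) {
      std::unordered_set<Node *> Visited;
      while (N) {
        if (N->HasCondBranch)
          return N;                 // found the predecessor we wanted
        if (!Visited.insert(N).second)
          return nullptr;           // revisited a node: unreachable cycle
        N = N->SinglePred;          // keep climbing the single-pred chain
      }
      return nullptr;               // chain ended without a conditional branch
    }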
if (!Flags->IsSink && !MSSA->dominates(SIMD, MU)) return false; - } else if (const auto *MD = dyn_cast(&MA)) + } else if (const auto *MD = dyn_cast(&MA)) { if (auto *LI = dyn_cast(MD->getMemoryInst())) { (void)LI; // Silence warning. assert(!LI->isUnordered() && "Expected unordered load"); return false; } + // Any call, while it may not be clobbering SI, it may be a use. + if (auto *CI = dyn_cast(MD->getMemoryInst())) { + // Check if the call may read from the memory locattion written + // to by SI. Check CI's attributes and arguments; the number of + // such checks performed is limited above by NoOfMemAccTooLarge. + ModRefInfo MRI = AA->getModRefInfo(CI, MemoryLocation::get(SI)); + if (isModOrRefSet(MRI)) + return false; + } + } } auto *Source = MSSA->getSkipSelfWalker()->getClobberingMemoryAccess(SI); @@ -1375,8 +1393,7 @@ static Instruction *CloneInstructionInExitBlock( if (!I.getName().empty()) New->setName(I.getName() + ".le"); - MemoryAccess *OldMemAcc; - if (MSSAU && (OldMemAcc = MSSAU->getMemorySSA()->getMemoryAccess(&I))) { + if (MSSAU && MSSAU->getMemorySSA()->getMemoryAccess(&I)) { // Create a new MemoryAccess and let MemorySSA set its defining access. MemoryAccess *NewMemAcc = MSSAU->createMemoryAccessInBB( New, nullptr, New->getParent(), MemorySSA::Beginning); @@ -1385,7 +1402,7 @@ static Instruction *CloneInstructionInExitBlock( MSSAU->insertDef(MemDef, /*RenameUses=*/true); else { auto *MemUse = cast(NewMemAcc); - MSSAU->insertUse(MemUse); + MSSAU->insertUse(MemUse, /*RenameUses=*/true); } } } @@ -1783,7 +1800,7 @@ public: StoreInst *NewSI = new StoreInst(LiveInValue, Ptr, InsertPos); if (UnorderedAtomic) NewSI->setOrdering(AtomicOrdering::Unordered); - NewSI->setAlignment(Alignment); + NewSI->setAlignment(MaybeAlign(Alignment)); NewSI->setDebugLoc(DL); if (AATags) NewSI->setAAMetadata(AATags); @@ -2016,7 +2033,8 @@ bool llvm::promoteLoopAccessesToScalars( if (!DereferenceableInPH) { DereferenceableInPH = isDereferenceableAndAlignedPointer( Store->getPointerOperand(), Store->getValueOperand()->getType(), - Store->getAlignment(), MDL, Preheader->getTerminator(), DT); + MaybeAlign(Store->getAlignment()), MDL, + Preheader->getTerminator(), DT); } } else return false; // Not a load or store. @@ -2101,20 +2119,21 @@ bool llvm::promoteLoopAccessesToScalars( SomePtr->getName() + ".promoted", Preheader->getTerminator()); if (SawUnorderedAtomic) PreheaderLoad->setOrdering(AtomicOrdering::Unordered); - PreheaderLoad->setAlignment(Alignment); + PreheaderLoad->setAlignment(MaybeAlign(Alignment)); PreheaderLoad->setDebugLoc(DL); if (AATags) PreheaderLoad->setAAMetadata(AATags); SSA.AddAvailableValue(Preheader, PreheaderLoad); - MemoryAccess *PreheaderLoadMemoryAccess; if (MSSAU) { - PreheaderLoadMemoryAccess = MSSAU->createMemoryAccessInBB( + MemoryAccess *PreheaderLoadMemoryAccess = MSSAU->createMemoryAccessInBB( PreheaderLoad, nullptr, PreheaderLoad->getParent(), MemorySSA::End); MemoryUse *NewMemUse = cast(PreheaderLoadMemoryAccess); - MSSAU->insertUse(NewMemUse); + MSSAU->insertUse(NewMemUse, /*RenameUses=*/true); } + if (MSSAU && VerifyMemorySSA) + MSSAU->getMemorySSA()->verifyMemorySSA(); // Rewrite all the loads in the loop and remember all the definitions from // stores in the loop. Promoter.run(LoopUses); @@ -2161,7 +2180,7 @@ LoopInvariantCodeMotion::collectAliasInfoForLoop(Loop *L, LoopInfo *LI, LoopToAliasSetMap.erase(MapI); } if (!CurAST) - CurAST = make_unique(*AA); + CurAST = std::make_unique(*AA); // Add everything from the sub loops that are no longer directly available. 
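The extra MemoryUse handling above asks alias analysis whether a call might read the location a store writes; if it may, the store cannot be moved past it. A hypothetical C++ illustration of why (logger and p are made up for the example, not taken from the patch):

    void logger();          // unknown external call; may read globals
    extern int *p;

    void f(int n) {
      for (int i = 0; i < n; ++i) {
        *p = i;             // candidate for sinking/promotion by LICM
        logger();           // may read *p, so moving the store would change
                            // what logger() observes; LICM must give up
      }
    }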
for (Loop *InnerL : RecomputeLoops) @@ -2180,7 +2199,7 @@ std::unique_ptr LoopInvariantCodeMotion::collectAliasInfoForLoopWithMSSA( Loop *L, AliasAnalysis *AA, MemorySSAUpdater *MSSAU) { auto *MSSA = MSSAU->getMemorySSA(); - auto CurAST = make_unique(*AA, MSSA, L); + auto CurAST = std::make_unique(*AA, MSSA, L); CurAST->addAllInstructionsInLoopUsingMSSA(); return CurAST; } diff --git a/lib/Transforms/Scalar/LoopDataPrefetch.cpp b/lib/Transforms/Scalar/LoopDataPrefetch.cpp index 1fcf1315a177..a972d6fa2fcd 100644 --- a/lib/Transforms/Scalar/LoopDataPrefetch.cpp +++ b/lib/Transforms/Scalar/LoopDataPrefetch.cpp @@ -312,8 +312,8 @@ bool LoopDataPrefetch::runOnLoop(Loop *L) { IRBuilder<> Builder(MemI); Module *M = BB->getParent()->getParent(); Type *I32 = Type::getInt32Ty(BB->getContext()); - Function *PrefetchFunc = - Intrinsic::getDeclaration(M, Intrinsic::prefetch); + Function *PrefetchFunc = Intrinsic::getDeclaration( + M, Intrinsic::prefetch, PrefPtrValue->getType()); Builder.CreateCall( PrefetchFunc, {PrefPtrValue, diff --git a/lib/Transforms/Scalar/LoopDeletion.cpp b/lib/Transforms/Scalar/LoopDeletion.cpp index 8371367e24e7..cee197cf8354 100644 --- a/lib/Transforms/Scalar/LoopDeletion.cpp +++ b/lib/Transforms/Scalar/LoopDeletion.cpp @@ -191,7 +191,7 @@ static LoopDeletionResult deleteLoopIfDead(Loop *L, DominatorTree &DT, // Don't remove loops for which we can't solve the trip count. // They could be infinite, in which case we'd be changing program behavior. - const SCEV *S = SE.getMaxBackedgeTakenCount(L); + const SCEV *S = SE.getConstantMaxBackedgeTakenCount(L); if (isa(S)) { LLVM_DEBUG(dbgs() << "Could not compute SCEV MaxBackedgeTakenCount.\n"); return Changed ? LoopDeletionResult::Modified diff --git a/lib/Transforms/Scalar/LoopFuse.cpp b/lib/Transforms/Scalar/LoopFuse.cpp index 0bc2bcff2ae1..9f93c68e6128 100644 --- a/lib/Transforms/Scalar/LoopFuse.cpp +++ b/lib/Transforms/Scalar/LoopFuse.cpp @@ -66,7 +66,7 @@ using namespace llvm; #define DEBUG_TYPE "loop-fusion" -STATISTIC(FuseCounter, "Count number of loop fusions performed"); +STATISTIC(FuseCounter, "Loops fused"); STATISTIC(NumFusionCandidates, "Number of candidates for loop fusion"); STATISTIC(InvalidPreheader, "Loop has invalid preheader"); STATISTIC(InvalidHeader, "Loop has invalid header"); @@ -79,12 +79,15 @@ STATISTIC(MayThrowException, "Loop may throw an exception"); STATISTIC(ContainsVolatileAccess, "Loop contains a volatile access"); STATISTIC(NotSimplifiedForm, "Loop is not in simplified form"); STATISTIC(InvalidDependencies, "Dependencies prevent fusion"); -STATISTIC(InvalidTripCount, - "Loop does not have invariant backedge taken count"); +STATISTIC(UnknownTripCount, "Loop has unknown trip count"); STATISTIC(UncomputableTripCount, "SCEV cannot compute trip count of loop"); -STATISTIC(NonEqualTripCount, "Candidate trip counts are not the same"); -STATISTIC(NonAdjacent, "Candidates are not adjacent"); -STATISTIC(NonEmptyPreheader, "Candidate has a non-empty preheader"); +STATISTIC(NonEqualTripCount, "Loop trip counts are not the same"); +STATISTIC(NonAdjacent, "Loops are not adjacent"); +STATISTIC(NonEmptyPreheader, "Loop has a non-empty preheader"); +STATISTIC(FusionNotBeneficial, "Fusion is not beneficial"); +STATISTIC(NonIdenticalGuards, "Candidates have different guards"); +STATISTIC(NonEmptyExitBlock, "Candidate has a non-empty exit block"); +STATISTIC(NonEmptyGuardBlock, "Candidate has a non-empty guard block"); enum FusionDependenceAnalysisChoice { FUSION_DEPENDENCE_ANALYSIS_SCEV, @@ -110,6 +113,7 @@ static 
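The LoopDeletion change above switches to the renamed getConstantMaxBackedgeTakenCount, keeping the rule that a loop with no computable bound must not be deleted even if its results are unused, because removing it could turn a non-terminating program into a terminating one. A hypothetical pair of loops showing the distinction (not from this patch):

    // SCEV cannot bound this loop, so it stays even though nothing uses it.
    void collatz_like(unsigned x) {
      while (x != 1)
        x = (x % 2) ? 3 * x + 1 : x / 2;   // unknown max backedge-taken count
    }

    // Constant max trip count and no side effects: deletable once dead.
    void provably_finite() {
      for (int i = 0; i < 100; ++i) { }
    }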
cl::opt cl::Hidden, cl::init(false), cl::ZeroOrMore); #endif +namespace { /// This class is used to represent a candidate for loop fusion. When it is /// constructed, it checks the conditions for loop fusion to ensure that it /// represents a valid candidate. It caches several parts of a loop that are @@ -143,6 +147,8 @@ struct FusionCandidate { SmallVector MemWrites; /// Are all of the members of this fusion candidate still valid bool Valid; + /// Guard branch of the loop, if it exists + BranchInst *GuardBranch; /// Dominator and PostDominator trees are needed for the /// FusionCandidateCompare function, required by FusionCandidateSet to @@ -151,11 +157,20 @@ struct FusionCandidate { const DominatorTree *DT; const PostDominatorTree *PDT; + OptimizationRemarkEmitter &ORE; + FusionCandidate(Loop *L, const DominatorTree *DT, - const PostDominatorTree *PDT) + const PostDominatorTree *PDT, OptimizationRemarkEmitter &ORE) : Preheader(L->getLoopPreheader()), Header(L->getHeader()), ExitingBlock(L->getExitingBlock()), ExitBlock(L->getExitBlock()), - Latch(L->getLoopLatch()), L(L), Valid(true), DT(DT), PDT(PDT) { + Latch(L->getLoopLatch()), L(L), Valid(true), GuardBranch(nullptr), + DT(DT), PDT(PDT), ORE(ORE) { + + // TODO: This is temporary while we fuse both rotated and non-rotated + // loops. Once we switch to only fusing rotated loops, the initialization of + // GuardBranch can be moved into the initialization list above. + if (isRotated()) + GuardBranch = L->getLoopGuardBranch(); // Walk over all blocks in the loop and check for conditions that may // prevent fusion. For each block, walk over all instructions and collect @@ -163,28 +178,28 @@ struct FusionCandidate { // found, invalidate this object and return. for (BasicBlock *BB : L->blocks()) { if (BB->hasAddressTaken()) { - AddressTakenBB++; invalidate(); + reportInvalidCandidate(AddressTakenBB); return; } for (Instruction &I : *BB) { if (I.mayThrow()) { - MayThrowException++; invalidate(); + reportInvalidCandidate(MayThrowException); return; } if (StoreInst *SI = dyn_cast(&I)) { if (SI->isVolatile()) { - ContainsVolatileAccess++; invalidate(); + reportInvalidCandidate(ContainsVolatileAccess); return; } } if (LoadInst *LI = dyn_cast(&I)) { if (LI->isVolatile()) { - ContainsVolatileAccess++; invalidate(); + reportInvalidCandidate(ContainsVolatileAccess); return; } } @@ -214,19 +229,96 @@ struct FusionCandidate { assert(Latch == L->getLoopLatch() && "Latch is out of sync"); } + /// Get the entry block for this fusion candidate. + /// + /// If this fusion candidate represents a guarded loop, the entry block is the + /// loop guard block. If it represents an unguarded loop, the entry block is + /// the preheader of the loop. + BasicBlock *getEntryBlock() const { + if (GuardBranch) + return GuardBranch->getParent(); + else + return Preheader; + } + + /// Given a guarded loop, get the successor of the guard that is not in the + /// loop. + /// + /// This method returns the successor of the loop guard that is not located + /// within the loop (i.e., the successor of the guard that is not the + /// preheader). + /// This method is only valid for guarded loops. + BasicBlock *getNonLoopBlock() const { + assert(GuardBranch && "Only valid on guarded loops."); + assert(GuardBranch->isConditional() && + "Expecting guard to be a conditional branch."); + return (GuardBranch->getSuccessor(0) == Preheader) + ? 
GuardBranch->getSuccessor(1) + : GuardBranch->getSuccessor(0); + } + + bool isRotated() const { + assert(L && "Expecting loop to be valid."); + assert(Latch && "Expecting latch to be valid."); + return L->isLoopExiting(Latch); + } + #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) LLVM_DUMP_METHOD void dump() const { - dbgs() << "\tPreheader: " << (Preheader ? Preheader->getName() : "nullptr") + dbgs() << "\tGuardBranch: " + << (GuardBranch ? GuardBranch->getName() : "nullptr") << "\n" + << "\tPreheader: " << (Preheader ? Preheader->getName() : "nullptr") << "\n" << "\tHeader: " << (Header ? Header->getName() : "nullptr") << "\n" << "\tExitingBB: " << (ExitingBlock ? ExitingBlock->getName() : "nullptr") << "\n" << "\tExitBB: " << (ExitBlock ? ExitBlock->getName() : "nullptr") << "\n" - << "\tLatch: " << (Latch ? Latch->getName() : "nullptr") << "\n"; + << "\tLatch: " << (Latch ? Latch->getName() : "nullptr") << "\n" + << "\tEntryBlock: " + << (getEntryBlock() ? getEntryBlock()->getName() : "nullptr") + << "\n"; } #endif + /// Determine if a fusion candidate (representing a loop) is eligible for + /// fusion. Note that this only checks whether a single loop can be fused - it + /// does not check whether it is *legal* to fuse two loops together. + bool isEligibleForFusion(ScalarEvolution &SE) const { + if (!isValid()) { + LLVM_DEBUG(dbgs() << "FC has invalid CFG requirements!\n"); + if (!Preheader) + ++InvalidPreheader; + if (!Header) + ++InvalidHeader; + if (!ExitingBlock) + ++InvalidExitingBlock; + if (!ExitBlock) + ++InvalidExitBlock; + if (!Latch) + ++InvalidLatch; + if (L->isInvalid()) + ++InvalidLoop; + + return false; + } + + // Require ScalarEvolution to be able to determine a trip count. + if (!SE.hasLoopInvariantBackedgeTakenCount(L)) { + LLVM_DEBUG(dbgs() << "Loop " << L->getName() + << " trip count not computable!\n"); + return reportInvalidCandidate(UnknownTripCount); + } + + if (!L->isLoopSimplifyForm()) { + LLVM_DEBUG(dbgs() << "Loop " << L->getName() + << " is not in simplified form!\n"); + return reportInvalidCandidate(NotSimplifiedForm); + } + + return true; + } + private: // This is only used internally for now, to clear the MemWrites and MemReads // list and setting Valid to false. I can't envision other uses of this right @@ -239,17 +331,18 @@ private: MemReads.clear(); Valid = false; } -}; -inline llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, - const FusionCandidate &FC) { - if (FC.isValid()) - OS << FC.Preheader->getName(); - else - OS << ""; - - return OS; -} + bool reportInvalidCandidate(llvm::Statistic &Stat) const { + using namespace ore; + assert(L && Preheader && "Fusion candidate not initialized properly!"); + ++Stat; + ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, Stat.getName(), + L->getStartLoc(), Preheader) + << "[" << Preheader->getParent()->getName() << "]: " + << "Loop is not a candidate for fusion: " << Stat.getDesc()); + return false; + } +}; struct FusionCandidateCompare { /// Comparison functor to sort two Control Flow Equivalent fusion candidates @@ -260,21 +353,24 @@ struct FusionCandidateCompare { const FusionCandidate &RHS) const { const DominatorTree *DT = LHS.DT; + BasicBlock *LHSEntryBlock = LHS.getEntryBlock(); + BasicBlock *RHSEntryBlock = RHS.getEntryBlock(); + // Do not save PDT to local variable as it is only used in asserts and thus // will trigger an unused variable warning if building without asserts. assert(DT && LHS.PDT && "Expecting valid dominator tree"); // Do this compare first so if LHS == RHS, function returns false. 
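The new GuardBranch, getEntryBlock, and getNonLoopBlock members map onto the usual shape of a guarded, rotated loop. A hypothetical source-level view of which blocks the fusion candidate caches (illustration only):

    void guarded(int *a, int n) {
      if (n > 0) {               // guard branch: GuardBranch, getEntryBlock()
        int i = 0;               // preheader
        do {                     // header
          a[i] = i;
          ++i;
        } while (i < n);         // latch == exiting block (rotated loop)
      }                          // guard's other successor: getNonLoopBlock()
      // exit block falls through to here
    }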
- if (DT->dominates(RHS.Preheader, LHS.Preheader)) { + if (DT->dominates(RHSEntryBlock, LHSEntryBlock)) { // RHS dominates LHS // Verify LHS post-dominates RHS - assert(LHS.PDT->dominates(LHS.Preheader, RHS.Preheader)); + assert(LHS.PDT->dominates(LHSEntryBlock, RHSEntryBlock)); return false; } - if (DT->dominates(LHS.Preheader, RHS.Preheader)) { + if (DT->dominates(LHSEntryBlock, RHSEntryBlock)) { // Verify RHS Postdominates LHS - assert(LHS.PDT->dominates(RHS.Preheader, LHS.Preheader)); + assert(LHS.PDT->dominates(RHSEntryBlock, LHSEntryBlock)); return true; } @@ -286,7 +382,6 @@ struct FusionCandidateCompare { } }; -namespace { using LoopVector = SmallVector; // Set of Control Flow Equivalent (CFE) Fusion Candidates, sorted in dominance @@ -301,17 +396,26 @@ using LoopVector = SmallVector; // keeps the FusionCandidateSet sorted will also simplify the implementation. using FusionCandidateSet = std::set; using FusionCandidateCollection = SmallVector; -} // namespace -inline llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, +#if !defined(NDEBUG) +static llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, + const FusionCandidate &FC) { + if (FC.isValid()) + OS << FC.Preheader->getName(); + else + OS << ""; + + return OS; +} + +static llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const FusionCandidateSet &CandSet) { - for (auto IT : CandSet) - OS << IT << "\n"; + for (const FusionCandidate &FC : CandSet) + OS << FC << '\n'; return OS; } -#if !defined(NDEBUG) static void printFusionCandidates(const FusionCandidateCollection &FusionCandidates) { dbgs() << "Fusion Candidates: \n"; @@ -391,16 +495,6 @@ static void printLoopVector(const LoopVector &LV) { } #endif -static void reportLoopFusion(const FusionCandidate &FC0, - const FusionCandidate &FC1, - OptimizationRemarkEmitter &ORE) { - using namespace ore; - ORE.emit( - OptimizationRemark(DEBUG_TYPE, "LoopFusion", FC0.Preheader->getParent()) - << "Fused " << NV("Cand1", StringRef(FC0.Preheader->getName())) - << " with " << NV("Cand2", StringRef(FC1.Preheader->getName()))); -} - struct LoopFuser { private: // Sets of control flow equivalent fusion candidates for a given nest level. @@ -497,53 +591,16 @@ private: const FusionCandidate &FC1) const { assert(FC0.Preheader && FC1.Preheader && "Expecting valid preheaders"); - if (DT.dominates(FC0.Preheader, FC1.Preheader)) - return PDT.dominates(FC1.Preheader, FC0.Preheader); + BasicBlock *FC0EntryBlock = FC0.getEntryBlock(); + BasicBlock *FC1EntryBlock = FC1.getEntryBlock(); - if (DT.dominates(FC1.Preheader, FC0.Preheader)) - return PDT.dominates(FC0.Preheader, FC1.Preheader); + if (DT.dominates(FC0EntryBlock, FC1EntryBlock)) + return PDT.dominates(FC1EntryBlock, FC0EntryBlock); - return false; - } - - /// Determine if a fusion candidate (representing a loop) is eligible for - /// fusion. Note that this only checks whether a single loop can be fused - it - /// does not check whether it is *legal* to fuse two loops together. 
- bool eligibleForFusion(const FusionCandidate &FC) const { - if (!FC.isValid()) { - LLVM_DEBUG(dbgs() << "FC " << FC << " has invalid CFG requirements!\n"); - if (!FC.Preheader) - InvalidPreheader++; - if (!FC.Header) - InvalidHeader++; - if (!FC.ExitingBlock) - InvalidExitingBlock++; - if (!FC.ExitBlock) - InvalidExitBlock++; - if (!FC.Latch) - InvalidLatch++; - if (FC.L->isInvalid()) - InvalidLoop++; + if (DT.dominates(FC1EntryBlock, FC0EntryBlock)) + return PDT.dominates(FC0EntryBlock, FC1EntryBlock); - return false; - } - - // Require ScalarEvolution to be able to determine a trip count. - if (!SE.hasLoopInvariantBackedgeTakenCount(FC.L)) { - LLVM_DEBUG(dbgs() << "Loop " << FC.L->getName() - << " trip count not computable!\n"); - InvalidTripCount++; - return false; - } - - if (!FC.L->isLoopSimplifyForm()) { - LLVM_DEBUG(dbgs() << "Loop " << FC.L->getName() - << " is not in simplified form!\n"); - NotSimplifiedForm++; - return false; - } - - return true; + return false; } /// Iterate over all loops in the given loop set and identify the loops that @@ -551,8 +608,8 @@ private: /// Flow Equivalent sets, sorted by dominance. void collectFusionCandidates(const LoopVector &LV) { for (Loop *L : LV) { - FusionCandidate CurrCand(L, &DT, &PDT); - if (!eligibleForFusion(CurrCand)) + FusionCandidate CurrCand(L, &DT, &PDT, ORE); + if (!CurrCand.isEligibleForFusion(SE)) continue; // Go through each list in FusionCandidates and determine if L is control @@ -664,31 +721,64 @@ private: if (!identicalTripCounts(*FC0, *FC1)) { LLVM_DEBUG(dbgs() << "Fusion candidates do not have identical trip " "counts. Not fusing.\n"); - NonEqualTripCount++; + reportLoopFusion(*FC0, *FC1, + NonEqualTripCount); continue; } if (!isAdjacent(*FC0, *FC1)) { LLVM_DEBUG(dbgs() << "Fusion candidates are not adjacent. Not fusing.\n"); - NonAdjacent++; + reportLoopFusion(*FC0, *FC1, NonAdjacent); continue; } - // For now we skip fusing if the second candidate has any instructions - // in the preheader. This is done because we currently do not have the - // safety checks to determine if it is save to move the preheader of - // the second candidate past the body of the first candidate. Once - // these checks are added, this condition can be removed. + // Ensure that FC0 and FC1 have identical guards. + // If one (or both) are not guarded, this check is not necessary. + if (FC0->GuardBranch && FC1->GuardBranch && + !haveIdenticalGuards(*FC0, *FC1)) { + LLVM_DEBUG(dbgs() << "Fusion candidates do not have identical " + "guards. Not Fusing.\n"); + reportLoopFusion(*FC0, *FC1, + NonIdenticalGuards); + continue; + } + + // The following three checks look for empty blocks in FC0 and FC1. If + // any of these blocks are non-empty, we do not fuse. This is done + // because we currently do not have the safety checks to determine if + // it is safe to move the blocks past other blocks in the loop. Once + // these checks are added, these conditions can be relaxed. if (!isEmptyPreheader(*FC1)) { LLVM_DEBUG(dbgs() << "Fusion candidate does not have empty " "preheader. Not fusing.\n"); - NonEmptyPreheader++; + reportLoopFusion(*FC0, *FC1, + NonEmptyPreheader); + continue; + } + + if (FC0->GuardBranch && !isEmptyExitBlock(*FC0)) { + LLVM_DEBUG(dbgs() << "Fusion candidate does not have empty exit " + "block. Not fusing.\n"); + reportLoopFusion(*FC0, *FC1, + NonEmptyExitBlock); + continue; + } + + if (FC1->GuardBranch && !isEmptyGuardBlock(*FC1)) { + LLVM_DEBUG(dbgs() << "Fusion candidate does not have empty guard " + "block. 
Not fusing.\n"); + reportLoopFusion(*FC0, *FC1, + NonEmptyGuardBlock); continue; } + // Check the dependencies across the loops and do not fuse if it would + // violate them. if (!dependencesAllowFusion(*FC0, *FC1)) { LLVM_DEBUG(dbgs() << "Memory dependencies do not allow fusion!\n"); + reportLoopFusion(*FC0, *FC1, + InvalidDependencies); continue; } @@ -696,9 +786,11 @@ private: LLVM_DEBUG(dbgs() << "\tFusion appears to be " << (BeneficialToFuse ? "" : "un") << "profitable!\n"); - if (!BeneficialToFuse) + if (!BeneficialToFuse) { + reportLoopFusion(*FC0, *FC1, + FusionNotBeneficial); continue; - + } // All analysis has completed and has determined that fusion is legal // and profitable. At this point, start transforming the code and // perform fusion. @@ -710,15 +802,14 @@ private: // Note this needs to be done *before* performFusion because // performFusion will change the original loops, making it not // possible to identify them after fusion is complete. - reportLoopFusion(*FC0, *FC1, ORE); + reportLoopFusion(*FC0, *FC1, FuseCounter); - FusionCandidate FusedCand(performFusion(*FC0, *FC1), &DT, &PDT); + FusionCandidate FusedCand(performFusion(*FC0, *FC1), &DT, &PDT, ORE); FusedCand.verify(); - assert(eligibleForFusion(FusedCand) && + assert(FusedCand.isEligibleForFusion(SE) && "Fused candidate should be eligible for fusion!"); // Notify the loop-depth-tree that these loops are not valid objects - // anymore. LDT.removeLoop(FC1->L); CandidateSet.erase(FC0); @@ -889,7 +980,7 @@ private: LLVM_DEBUG(dbgs() << "Check if " << FC0 << " can be fused with " << FC1 << "\n"); assert(FC0.L->getLoopDepth() == FC1.L->getLoopDepth()); - assert(DT.dominates(FC0.Preheader, FC1.Preheader)); + assert(DT.dominates(FC0.getEntryBlock(), FC1.getEntryBlock())); for (Instruction *WriteL0 : FC0.MemWrites) { for (Instruction *WriteL1 : FC1.MemWrites) @@ -939,18 +1030,89 @@ private: return true; } - /// Determine if the exit block of \p FC0 is the preheader of \p FC1. In this - /// case, there is no code in between the two fusion candidates, thus making - /// them adjacent. + /// Determine if two fusion candidates are adjacent in the CFG. + /// + /// This method will determine if there are additional basic blocks in the CFG + /// between the exit of \p FC0 and the entry of \p FC1. + /// If the two candidates are guarded loops, then it checks whether the + /// non-loop successor of the \p FC0 guard branch is the entry block of \p + /// FC1. If not, then the loops are not adjacent. If the two candidates are + /// not guarded loops, then it checks whether the exit block of \p FC0 is the + /// preheader of \p FC1. bool isAdjacent(const FusionCandidate &FC0, const FusionCandidate &FC1) const { - return FC0.ExitBlock == FC1.Preheader; + // If the successor of the guard branch is FC1, then the loops are adjacent + if (FC0.GuardBranch) + return FC0.getNonLoopBlock() == FC1.getEntryBlock(); + else + return FC0.ExitBlock == FC1.getEntryBlock(); + } + + /// Determine if two fusion candidates have identical guards + /// + /// This method will determine if two fusion candidates have the same guards. + /// The guards are considered the same if: + /// 1. The instructions to compute the condition used in the compare are + /// identical. + /// 2. The successors of the guard have the same flow into/around the loop. + /// If the compare instructions are identical, then the first successor of the + /// guard must go to the same place (either the preheader of the loop or the + /// NonLoopBlock). 
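Taken together, the checks above (identical trip counts, adjacency, identical guards, empty preheader/exit/guard blocks, and legal dependences) establish when two guarded loops can be fused. A hypothetical before/after of the overall effect (not from the patch):

    void before(int *a, int *b, int n) {
      if (n > 0)                                 // FC0 guard
        for (int i = 0; i < n; ++i) a[i] = i;    // FC0
      if (n > 0)                                 // FC1 guard, identical
        for (int i = 0; i < n; ++i) b[i] = a[i]; // FC1, adjacent to FC0
    }

    void after(int *a, int *b, int n) {
      if (n > 0)                                 // FC0's guard now guards both
        for (int i = 0; i < n; ++i) {
          a[i] = i;                              // FC0 body
          b[i] = a[i];                           // FC1 body, fused in
        }
    }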
In other words, the the first successor of both loops must + /// both go into the loop (i.e., the preheader) or go around the loop (i.e., + /// the NonLoopBlock). The same must be true for the second successor. + bool haveIdenticalGuards(const FusionCandidate &FC0, + const FusionCandidate &FC1) const { + assert(FC0.GuardBranch && FC1.GuardBranch && + "Expecting FC0 and FC1 to be guarded loops."); + + if (auto FC0CmpInst = + dyn_cast(FC0.GuardBranch->getCondition())) + if (auto FC1CmpInst = + dyn_cast(FC1.GuardBranch->getCondition())) + if (!FC0CmpInst->isIdenticalTo(FC1CmpInst)) + return false; + + // The compare instructions are identical. + // Now make sure the successor of the guards have the same flow into/around + // the loop + if (FC0.GuardBranch->getSuccessor(0) == FC0.Preheader) + return (FC1.GuardBranch->getSuccessor(0) == FC1.Preheader); + else + return (FC1.GuardBranch->getSuccessor(1) == FC1.Preheader); + } + + /// Check that the guard for \p FC *only* contains the cmp/branch for the + /// guard. + /// Once we are able to handle intervening code, any code in the guard block + /// for FC1 will need to be treated as intervening code and checked whether + /// it can safely move around the loops. + bool isEmptyGuardBlock(const FusionCandidate &FC) const { + assert(FC.GuardBranch && "Expecting a fusion candidate with guard branch."); + if (auto *CmpInst = dyn_cast(FC.GuardBranch->getCondition())) { + auto *GuardBlock = FC.GuardBranch->getParent(); + // If the generation of the cmp value is in GuardBlock, then the size of + // the guard block should be 2 (cmp + branch). If the generation of the + // cmp value is in a different block, then the size of the guard block + // should only be 1. + if (CmpInst->getParent() == GuardBlock) + return GuardBlock->size() == 2; + else + return GuardBlock->size() == 1; + } + + return false; } bool isEmptyPreheader(const FusionCandidate &FC) const { + assert(FC.Preheader && "Expecting a valid preheader"); return FC.Preheader->size() == 1; } + bool isEmptyExitBlock(const FusionCandidate &FC) const { + assert(FC.ExitBlock && "Expecting a valid exit block"); + return FC.ExitBlock->size() == 1; + } + /// Fuse two fusion candidates, creating a new fused loop. /// /// This method contains the mechanics of fusing two loops, represented by \p @@ -987,6 +1149,12 @@ private: LLVM_DEBUG(dbgs() << "Fusion Candidate 0: \n"; FC0.dump(); dbgs() << "Fusion Candidate 1: \n"; FC1.dump();); + // Fusing guarded loops is handled slightly differently than non-guarded + // loops and has been broken out into a separate method instead of trying to + // intersperse the logic within a single method. + if (FC0.GuardBranch) + return fuseGuardedLoops(FC0, FC1); + assert(FC1.Preheader == FC0.ExitBlock); assert(FC1.Preheader->size() == 1 && FC1.Preheader->getSingleSuccessor() == FC1.Header); @@ -1131,7 +1299,258 @@ private: SE.verify(); #endif - FuseCounter++; + LLVM_DEBUG(dbgs() << "Fusion done:\n"); + + return FC0.L; + } + + /// Report details on loop fusion opportunities. + /// + /// This template function can be used to report both successful and missed + /// loop fusion opportunities, based on the RemarkKind. The RemarkKind should + /// be one of: + /// - OptimizationRemarkMissed to report when loop fusion is unsuccessful + /// given two valid fusion candidates. + /// - OptimizationRemark to report successful fusion of two fusion + /// candidates. 
+ /// The remarks will be printed using the form: + /// ::: []: + /// and : + template + void reportLoopFusion(const FusionCandidate &FC0, const FusionCandidate &FC1, + llvm::Statistic &Stat) { + assert(FC0.Preheader && FC1.Preheader && + "Expecting valid fusion candidates"); + using namespace ore; + ++Stat; + ORE.emit(RemarkKind(DEBUG_TYPE, Stat.getName(), FC0.L->getStartLoc(), + FC0.Preheader) + << "[" << FC0.Preheader->getParent()->getName() + << "]: " << NV("Cand1", StringRef(FC0.Preheader->getName())) + << " and " << NV("Cand2", StringRef(FC1.Preheader->getName())) + << ": " << Stat.getDesc()); + } + + /// Fuse two guarded fusion candidates, creating a new fused loop. + /// + /// Fusing guarded loops is handled much the same way as fusing non-guarded + /// loops. The rewiring of the CFG is slightly different though, because of + /// the presence of the guards around the loops and the exit blocks after the + /// loop body. As such, the new loop is rewired as follows: + /// 1. Keep the guard branch from FC0 and use the non-loop block target + /// from the FC1 guard branch. + /// 2. Remove the exit block from FC0 (this exit block should be empty + /// right now). + /// 3. Remove the guard branch for FC1 + /// 4. Remove the preheader for FC1. + /// The exit block successor for the latch of FC0 is updated to be the header + /// of FC1 and the non-exit block successor of the latch of FC1 is updated to + /// be the header of FC0, thus creating the fused loop. + Loop *fuseGuardedLoops(const FusionCandidate &FC0, + const FusionCandidate &FC1) { + assert(FC0.GuardBranch && FC1.GuardBranch && "Expecting guarded loops"); + + BasicBlock *FC0GuardBlock = FC0.GuardBranch->getParent(); + BasicBlock *FC1GuardBlock = FC1.GuardBranch->getParent(); + BasicBlock *FC0NonLoopBlock = FC0.getNonLoopBlock(); + BasicBlock *FC1NonLoopBlock = FC1.getNonLoopBlock(); + + assert(FC0NonLoopBlock == FC1GuardBlock && "Loops are not adjacent"); + + SmallVector TreeUpdates; + + //////////////////////////////////////////////////////////////////////////// + // Update the Loop Guard + //////////////////////////////////////////////////////////////////////////// + // The guard for FC0 is updated to guard both FC0 and FC1. This is done by + // changing the NonLoopGuardBlock for FC0 to the NonLoopGuardBlock for FC1. + // Thus, one path from the guard goes to the preheader for FC0 (and thus + // executes the new fused loop) and the other path goes to the NonLoopBlock + // for FC1 (where FC1 guard would have gone if FC1 was not executed). + FC0.GuardBranch->replaceUsesOfWith(FC0NonLoopBlock, FC1NonLoopBlock); + FC0.ExitBlock->getTerminator()->replaceUsesOfWith(FC1GuardBlock, + FC1.Header); + + // The guard of FC1 is not necessary anymore. 
+ FC1.GuardBranch->eraseFromParent(); + new UnreachableInst(FC1GuardBlock->getContext(), FC1GuardBlock); + + TreeUpdates.emplace_back(DominatorTree::UpdateType( + DominatorTree::Delete, FC1GuardBlock, FC1.Preheader)); + TreeUpdates.emplace_back(DominatorTree::UpdateType( + DominatorTree::Delete, FC1GuardBlock, FC1NonLoopBlock)); + TreeUpdates.emplace_back(DominatorTree::UpdateType( + DominatorTree::Delete, FC0GuardBlock, FC1GuardBlock)); + TreeUpdates.emplace_back(DominatorTree::UpdateType( + DominatorTree::Insert, FC0GuardBlock, FC1NonLoopBlock)); + + assert(pred_begin(FC1GuardBlock) == pred_end(FC1GuardBlock) && + "Expecting guard block to have no predecessors"); + assert(succ_begin(FC1GuardBlock) == succ_end(FC1GuardBlock) && + "Expecting guard block to have no successors"); + + // Remember the phi nodes originally in the header of FC0 in order to rewire + // them later. However, this is only necessary if the new loop carried + // values might not dominate the exiting branch. While we do not generally + // test if this is the case but simply insert intermediate phi nodes, we + // need to make sure these intermediate phi nodes have different + // predecessors. To this end, we filter the special case where the exiting + // block is the latch block of the first loop. Nothing needs to be done + // anyway as all loop carried values dominate the latch and thereby also the + // exiting branch. + // KB: This is no longer necessary because FC0.ExitingBlock == FC0.Latch + // (because the loops are rotated. Thus, nothing will ever be added to + // OriginalFC0PHIs. + SmallVector OriginalFC0PHIs; + if (FC0.ExitingBlock != FC0.Latch) + for (PHINode &PHI : FC0.Header->phis()) + OriginalFC0PHIs.push_back(&PHI); + + assert(OriginalFC0PHIs.empty() && "Expecting OriginalFC0PHIs to be empty!"); + + // Replace incoming blocks for header PHIs first. + FC1.Preheader->replaceSuccessorsPhiUsesWith(FC0.Preheader); + FC0.Latch->replaceSuccessorsPhiUsesWith(FC1.Latch); + + // The old exiting block of the first loop (FC0) has to jump to the header + // of the second as we need to execute the code in the second header block + // regardless of the trip count. That is, if the trip count is 0, so the + // back edge is never taken, we still have to execute both loop headers, + // especially (but not only!) if the second is a do-while style loop. + // However, doing so might invalidate the phi nodes of the first loop as + // the new values do only need to dominate their latch and not the exiting + // predicate. To remedy this potential problem we always introduce phi + // nodes in the header of the second loop later that select the loop carried + // value, if the second header was reached through an old latch of the + // first, or undef otherwise. This is sound as exiting the first implies the + // second will exit too, __without__ taking the back-edge (their + // trip-counts are equal after all). + FC0.ExitingBlock->getTerminator()->replaceUsesOfWith(FC0.ExitBlock, + FC1.Header); + + TreeUpdates.emplace_back(DominatorTree::UpdateType( + DominatorTree::Delete, FC0.ExitingBlock, FC0.ExitBlock)); + TreeUpdates.emplace_back(DominatorTree::UpdateType( + DominatorTree::Insert, FC0.ExitingBlock, FC1.Header)); + + // Remove FC0 Exit Block + // The exit block for FC0 is no longer needed since control will flow + // directly to the header of FC1. Since it is an empty block, it can be + // removed at this point. 
+ // TODO: In the future, we can handle non-empty exit blocks my merging any + // instructions from FC0 exit block into FC1 exit block prior to removing + // the block. + assert(pred_begin(FC0.ExitBlock) == pred_end(FC0.ExitBlock) && + "Expecting exit block to be empty"); + FC0.ExitBlock->getTerminator()->eraseFromParent(); + new UnreachableInst(FC0.ExitBlock->getContext(), FC0.ExitBlock); + + // Remove FC1 Preheader + // The pre-header of L1 is not necessary anymore. + assert(pred_begin(FC1.Preheader) == pred_end(FC1.Preheader)); + FC1.Preheader->getTerminator()->eraseFromParent(); + new UnreachableInst(FC1.Preheader->getContext(), FC1.Preheader); + TreeUpdates.emplace_back(DominatorTree::UpdateType( + DominatorTree::Delete, FC1.Preheader, FC1.Header)); + + // Moves the phi nodes from the second to the first loops header block. + while (PHINode *PHI = dyn_cast(&FC1.Header->front())) { + if (SE.isSCEVable(PHI->getType())) + SE.forgetValue(PHI); + if (PHI->hasNUsesOrMore(1)) + PHI->moveBefore(&*FC0.Header->getFirstInsertionPt()); + else + PHI->eraseFromParent(); + } + + // Introduce new phi nodes in the second loop header to ensure + // exiting the first and jumping to the header of the second does not break + // the SSA property of the phis originally in the first loop. See also the + // comment above. + Instruction *L1HeaderIP = &FC1.Header->front(); + for (PHINode *LCPHI : OriginalFC0PHIs) { + int L1LatchBBIdx = LCPHI->getBasicBlockIndex(FC1.Latch); + assert(L1LatchBBIdx >= 0 && + "Expected loop carried value to be rewired at this point!"); + + Value *LCV = LCPHI->getIncomingValue(L1LatchBBIdx); + + PHINode *L1HeaderPHI = PHINode::Create( + LCV->getType(), 2, LCPHI->getName() + ".afterFC0", L1HeaderIP); + L1HeaderPHI->addIncoming(LCV, FC0.Latch); + L1HeaderPHI->addIncoming(UndefValue::get(LCV->getType()), + FC0.ExitingBlock); + + LCPHI->setIncomingValue(L1LatchBBIdx, L1HeaderPHI); + } + + // Update the latches + + // Replace latch terminator destinations. + FC0.Latch->getTerminator()->replaceUsesOfWith(FC0.Header, FC1.Header); + FC1.Latch->getTerminator()->replaceUsesOfWith(FC1.Header, FC0.Header); + + // If FC0.Latch and FC0.ExitingBlock are the same then we have already + // performed the updates above. + if (FC0.Latch != FC0.ExitingBlock) + TreeUpdates.emplace_back(DominatorTree::UpdateType( + DominatorTree::Insert, FC0.Latch, FC1.Header)); + + TreeUpdates.emplace_back(DominatorTree::UpdateType(DominatorTree::Delete, + FC0.Latch, FC0.Header)); + TreeUpdates.emplace_back(DominatorTree::UpdateType(DominatorTree::Insert, + FC1.Latch, FC0.Header)); + TreeUpdates.emplace_back(DominatorTree::UpdateType(DominatorTree::Delete, + FC1.Latch, FC1.Header)); + + // All done + // Apply the updates to the Dominator Tree and cleanup. + + assert(succ_begin(FC1GuardBlock) == succ_end(FC1GuardBlock) && + "FC1GuardBlock has successors!!"); + assert(pred_begin(FC1GuardBlock) == pred_end(FC1GuardBlock) && + "FC1GuardBlock has predecessors!!"); + + // Update DT/PDT + DTU.applyUpdates(TreeUpdates); + + LI.removeBlock(FC1.Preheader); + DTU.deleteBB(FC1.Preheader); + DTU.deleteBB(FC0.ExitBlock); + DTU.flush(); + + // Is there a way to keep SE up-to-date so we don't need to forget the loops + // and rebuild the information in subsequent passes of fusion? + SE.forgetLoop(FC1.L); + SE.forgetLoop(FC0.L); + + // Merge the loops. 
+ SmallVector Blocks(FC1.L->block_begin(), + FC1.L->block_end()); + for (BasicBlock *BB : Blocks) { + FC0.L->addBlockEntry(BB); + FC1.L->removeBlockFromLoop(BB); + if (LI.getLoopFor(BB) != FC1.L) + continue; + LI.changeLoopFor(BB, FC0.L); + } + while (!FC1.L->empty()) { + const auto &ChildLoopIt = FC1.L->begin(); + Loop *ChildLoop = *ChildLoopIt; + FC1.L->removeChildLoop(ChildLoopIt); + FC0.L->addChildLoop(ChildLoop); + } + + // Delete the now empty loop L1. + LI.erase(FC1.L); + +#ifndef NDEBUG + assert(!verifyFunction(*FC0.Header->getParent(), &errs())); + assert(DT.verify(DominatorTree::VerificationLevel::Fast)); + assert(PDT.verify()); + LI.verify(DT); + SE.verify(); +#endif LLVM_DEBUG(dbgs() << "Fusion done:\n"); @@ -1177,6 +1596,7 @@ struct LoopFuseLegacy : public FunctionPass { return LF.fuseLoops(F); } }; +} // namespace PreservedAnalyses LoopFusePass::run(Function &F, FunctionAnalysisManager &AM) { auto &LI = AM.getResult(F); diff --git a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp index e561494f19cf..dd477e800693 100644 --- a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp +++ b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp @@ -41,6 +41,7 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/MapVector.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" @@ -77,16 +78,20 @@ #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" #include "llvm/IR/PassManager.h" +#include "llvm/IR/PatternMatch.h" #include "llvm/IR/Type.h" #include "llvm/IR/User.h" #include "llvm/IR/Value.h" #include "llvm/IR/ValueHandle.h" +#include "llvm/IR/Verifier.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Scalar/LoopPassManager.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/BuildLibCalls.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/LoopUtils.h" @@ -102,6 +107,7 @@ using namespace llvm; STATISTIC(NumMemSet, "Number of memset's formed from loop stores"); STATISTIC(NumMemCpy, "Number of memcpy's formed from loop load+stores"); +STATISTIC(NumBCmp, "Number of memcmp's formed from loop 2xload+eq-compare"); static cl::opt UseLIRCodeSizeHeurs( "use-lir-code-size-heurs", @@ -111,6 +117,26 @@ static cl::opt UseLIRCodeSizeHeurs( namespace { +// FIXME: reinventing the wheel much? Is there a cleaner solution? 
+struct PMAbstraction { + virtual void markLoopAsDeleted(Loop *L) = 0; + virtual ~PMAbstraction() = default; +}; +struct LegacyPMAbstraction : PMAbstraction { + LPPassManager &LPM; + LegacyPMAbstraction(LPPassManager &LPM) : LPM(LPM) {} + virtual ~LegacyPMAbstraction() = default; + void markLoopAsDeleted(Loop *L) override { LPM.markLoopAsDeleted(*L); } +}; +struct NewPMAbstraction : PMAbstraction { + LPMUpdater &Updater; + NewPMAbstraction(LPMUpdater &Updater) : Updater(Updater) {} + virtual ~NewPMAbstraction() = default; + void markLoopAsDeleted(Loop *L) override { + Updater.markLoopAsDeleted(*L, L->getName()); + } +}; + class LoopIdiomRecognize { Loop *CurLoop = nullptr; AliasAnalysis *AA; @@ -120,6 +146,7 @@ class LoopIdiomRecognize { TargetLibraryInfo *TLI; const TargetTransformInfo *TTI; const DataLayout *DL; + PMAbstraction &LoopDeleter; OptimizationRemarkEmitter &ORE; bool ApplyCodeSizeHeuristics; @@ -128,9 +155,10 @@ public: LoopInfo *LI, ScalarEvolution *SE, TargetLibraryInfo *TLI, const TargetTransformInfo *TTI, - const DataLayout *DL, + const DataLayout *DL, PMAbstraction &LoopDeleter, OptimizationRemarkEmitter &ORE) - : AA(AA), DT(DT), LI(LI), SE(SE), TLI(TLI), TTI(TTI), DL(DL), ORE(ORE) {} + : AA(AA), DT(DT), LI(LI), SE(SE), TLI(TLI), TTI(TTI), DL(DL), + LoopDeleter(LoopDeleter), ORE(ORE) {} bool runOnLoop(Loop *L); @@ -144,6 +172,8 @@ private: bool HasMemset; bool HasMemsetPattern; bool HasMemcpy; + bool HasMemCmp; + bool HasBCmp; /// Return code for isLegalStore() enum LegalStoreKind { @@ -186,6 +216,32 @@ private: bool runOnNoncountableLoop(); + struct CmpLoopStructure { + Value *BCmpValue, *LatchCmpValue; + BasicBlock *HeaderBrEqualBB, *HeaderBrUnequalBB; + BasicBlock *LatchBrFinishBB, *LatchBrContinueBB; + }; + bool matchBCmpLoopStructure(CmpLoopStructure &CmpLoop) const; + struct CmpOfLoads { + ICmpInst::Predicate BCmpPred; + Value *LoadSrcA, *LoadSrcB; + Value *LoadA, *LoadB; + }; + bool matchBCmpOfLoads(Value *BCmpValue, CmpOfLoads &CmpOfLoads) const; + bool recognizeBCmpLoopControlFlow(const CmpOfLoads &CmpOfLoads, + CmpLoopStructure &CmpLoop) const; + bool recognizeBCmpLoopSCEV(uint64_t BCmpTyBytes, CmpOfLoads &CmpOfLoads, + const SCEV *&SrcA, const SCEV *&SrcB, + const SCEV *&Iterations) const; + bool detectBCmpIdiom(ICmpInst *&BCmpInst, CmpInst *&LatchCmpInst, + LoadInst *&LoadA, LoadInst *&LoadB, const SCEV *&SrcA, + const SCEV *&SrcB, const SCEV *&NBytes) const; + BasicBlock *transformBCmpControlFlow(ICmpInst *ComparedEqual); + void transformLoopToBCmp(ICmpInst *BCmpInst, CmpInst *LatchCmpInst, + LoadInst *LoadA, LoadInst *LoadB, const SCEV *SrcA, + const SCEV *SrcB, const SCEV *NBytes); + bool recognizeBCmp(); + bool recognizePopcount(); void transformLoopToPopcount(BasicBlock *PreCondBB, Instruction *CntInst, PHINode *CntPhi, Value *Var); @@ -217,18 +273,20 @@ public: LoopInfo *LI = &getAnalysis().getLoopInfo(); ScalarEvolution *SE = &getAnalysis().getSE(); TargetLibraryInfo *TLI = - &getAnalysis().getTLI(); + &getAnalysis().getTLI( + *L->getHeader()->getParent()); const TargetTransformInfo *TTI = &getAnalysis().getTTI( *L->getHeader()->getParent()); const DataLayout *DL = &L->getHeader()->getModule()->getDataLayout(); + LegacyPMAbstraction LoopDeleter(LPM); // For the old PM, we can't use OptimizationRemarkEmitter as an analysis // pass. Function analyses need to be preserved across loop transformations // but ORE cannot be preserved (see comment before the pass definition). 
OptimizationRemarkEmitter ORE(L->getHeader()->getParent()); - LoopIdiomRecognize LIR(AA, DT, LI, SE, TLI, TTI, DL, ORE); + LoopIdiomRecognize LIR(AA, DT, LI, SE, TLI, TTI, DL, LoopDeleter, ORE); return LIR.runOnLoop(L); } @@ -247,7 +305,7 @@ char LoopIdiomRecognizeLegacyPass::ID = 0; PreservedAnalyses LoopIdiomRecognizePass::run(Loop &L, LoopAnalysisManager &AM, LoopStandardAnalysisResults &AR, - LPMUpdater &) { + LPMUpdater &Updater) { const auto *DL = &L.getHeader()->getModule()->getDataLayout(); const auto &FAM = @@ -261,8 +319,9 @@ PreservedAnalyses LoopIdiomRecognizePass::run(Loop &L, LoopAnalysisManager &AM, "LoopIdiomRecognizePass: OptimizationRemarkEmitterAnalysis not cached " "at a higher level"); + NewPMAbstraction LoopDeleter(Updater); LoopIdiomRecognize LIR(&AR.AA, &AR.DT, &AR.LI, &AR.SE, &AR.TLI, &AR.TTI, DL, - *ORE); + LoopDeleter, *ORE); if (!LIR.runOnLoop(&L)) return PreservedAnalyses::all(); @@ -299,7 +358,8 @@ bool LoopIdiomRecognize::runOnLoop(Loop *L) { // Disable loop idiom recognition if the function's name is a common idiom. StringRef Name = L->getHeader()->getParent()->getName(); - if (Name == "memset" || Name == "memcpy") + if (Name == "memset" || Name == "memcpy" || Name == "memcmp" || + Name == "bcmp") return false; // Determine if code size heuristics need to be applied. @@ -309,8 +369,10 @@ bool LoopIdiomRecognize::runOnLoop(Loop *L) { HasMemset = TLI->has(LibFunc_memset); HasMemsetPattern = TLI->has(LibFunc_memset_pattern16); HasMemcpy = TLI->has(LibFunc_memcpy); + HasMemCmp = TLI->has(LibFunc_memcmp); + HasBCmp = TLI->has(LibFunc_bcmp); - if (HasMemset || HasMemsetPattern || HasMemcpy) + if (HasMemset || HasMemsetPattern || HasMemcpy || HasMemCmp || HasBCmp) if (SE->hasLoopInvariantBackedgeTakenCount(L)) return runOnCountableLoop(); @@ -961,7 +1023,7 @@ bool LoopIdiomRecognize::processLoopStridedStore( GlobalValue::PrivateLinkage, PatternValue, ".memset_pattern"); GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); // Ok to merge these. - GV->setAlignment(16); + GV->setAlignment(Align(16)); Value *PatternPtr = ConstantExpr::getBitCast(GV, Int8PtrTy); NewCall = Builder.CreateCall(MSP, {BasePtr, PatternPtr, NumBytes}); } @@ -1149,7 +1211,7 @@ bool LoopIdiomRecognize::runOnNoncountableLoop() { << "] Noncountable Loop %" << CurLoop->getHeader()->getName() << "\n"); - return recognizePopcount() || recognizeAndInsertFFS(); + return recognizeBCmp() || recognizePopcount() || recognizeAndInsertFFS(); } /// Check if the given conditional branch is based on the comparison between @@ -1823,3 +1885,811 @@ void LoopIdiomRecognize::transformLoopToPopcount(BasicBlock *PreCondBB, // loop. The loop would otherwise not be deleted even if it becomes empty. SE->forgetLoop(CurLoop); } + +bool LoopIdiomRecognize::matchBCmpLoopStructure( + CmpLoopStructure &CmpLoop) const { + ICmpInst::Predicate BCmpPred; + + // We are looking for the following basic layout: + // PreheaderBB: ; preds = ??? + // <...> + // br label %LoopHeaderBB + // LoopHeaderBB: ; preds = %PreheaderBB,%LoopLatchBB + // <...> + // %BCmpValue = icmp <...> + // br i1 %BCmpValue, label %LoopLatchBB, label %Successor0 + // LoopLatchBB: ; preds = %LoopHeaderBB + // <...> + // %LatchCmpValue = + // br i1 %LatchCmpValue, label %Successor1, label %LoopHeaderBB + // Successor0: ; preds = %LoopHeaderBB + // <...> + // Successor1: ; preds = %LoopLatchBB + // <...> + // + // Successor0 and Successor1 may or may not be the same basic block. + + // Match basic frame-work of this supposedly-comparison loop. 
+ using namespace PatternMatch; + if (!match(CurLoop->getHeader()->getTerminator(), + m_Br(m_CombineAnd(m_ICmp(BCmpPred, m_Value(), m_Value()), + m_Value(CmpLoop.BCmpValue)), + CmpLoop.HeaderBrEqualBB, CmpLoop.HeaderBrUnequalBB)) || + !match(CurLoop->getLoopLatch()->getTerminator(), + m_Br(m_CombineAnd(m_Cmp(), m_Value(CmpLoop.LatchCmpValue)), + CmpLoop.LatchBrFinishBB, CmpLoop.LatchBrContinueBB))) { + LLVM_DEBUG(dbgs() << "Basic control-flow layout unrecognized.\n"); + return false; + } + LLVM_DEBUG(dbgs() << "Recognized basic control-flow layout.\n"); + return true; +} + +bool LoopIdiomRecognize::matchBCmpOfLoads(Value *BCmpValue, + CmpOfLoads &CmpOfLoads) const { + using namespace PatternMatch; + LLVM_DEBUG(dbgs() << "Analyzing header icmp " << *BCmpValue + << " as bcmp pattern.\n"); + + // Match bcmp-style loop header cmp. It must be an eq-icmp of loads. Example: + // %v0 = load <...>, <...>* %LoadSrcA + // %v1 = load <...>, <...>* %LoadSrcB + // %CmpLoop.BCmpValue = icmp eq <...> %v0, %v1 + // There won't be any no-op bitcasts between load and icmp, + // they would have been transformed into a load of bitcast. + // FIXME: {b,mem}cmp() calls have the same semantics as icmp. Match them too. + if (!match(BCmpValue, + m_ICmp(CmpOfLoads.BCmpPred, + m_CombineAnd(m_Load(m_Value(CmpOfLoads.LoadSrcA)), + m_Value(CmpOfLoads.LoadA)), + m_CombineAnd(m_Load(m_Value(CmpOfLoads.LoadSrcB)), + m_Value(CmpOfLoads.LoadB)))) || + !ICmpInst::isEquality(CmpOfLoads.BCmpPred)) { + LLVM_DEBUG(dbgs() << "Loop header icmp did not match bcmp pattern.\n"); + return false; + } + LLVM_DEBUG(dbgs() << "Recognized header icmp as bcmp pattern with loads:\n\t" + << *CmpOfLoads.LoadA << "\n\t" << *CmpOfLoads.LoadB + << "\n"); + // FIXME: handle memcmp pattern? + return true; +} + +bool LoopIdiomRecognize::recognizeBCmpLoopControlFlow( + const CmpOfLoads &CmpOfLoads, CmpLoopStructure &CmpLoop) const { + BasicBlock *LoopHeaderBB = CurLoop->getHeader(); + BasicBlock *LoopLatchBB = CurLoop->getLoopLatch(); + + // Be wary, comparisons can be inverted, canonicalize order. + // If this 'element' comparison passed, we expect to proceed to the next elt. + if (CmpOfLoads.BCmpPred != ICmpInst::Predicate::ICMP_EQ) + std::swap(CmpLoop.HeaderBrEqualBB, CmpLoop.HeaderBrUnequalBB); + // The predicate on loop latch does not matter, just canonicalize some order. + if (CmpLoop.LatchBrContinueBB != LoopHeaderBB) + std::swap(CmpLoop.LatchBrFinishBB, CmpLoop.LatchBrContinueBB); + + SmallVector ExitBlocks; + + CurLoop->getUniqueExitBlocks(ExitBlocks); + assert(ExitBlocks.size() <= 2U && "Can't have more than two exit blocks."); + + // Check that control-flow between blocks is as expected. + if (CmpLoop.HeaderBrEqualBB != LoopLatchBB || + CmpLoop.LatchBrContinueBB != LoopHeaderBB || + !is_contained(ExitBlocks, CmpLoop.HeaderBrUnequalBB) || + !is_contained(ExitBlocks, CmpLoop.LatchBrFinishBB)) { + LLVM_DEBUG(dbgs() << "Loop control-flow not recognized.\n"); + return false; + } + + assert(!is_contained(ExitBlocks, CmpLoop.HeaderBrEqualBB) && + !is_contained(ExitBlocks, CmpLoop.LatchBrContinueBB) && + "Unexpected exit edges."); + + LLVM_DEBUG(dbgs() << "Recognized loop control-flow.\n"); + + LLVM_DEBUG(dbgs() << "Performing side-effect analysis on the loop.\n"); + assert(CurLoop->isLCSSAForm(*DT) && "Should only get LCSSA-form loops here."); + // No loop instructions must be used outside of the loop. 
Since we are in + // LCSSA form, we only need to check successor block's PHI nodes's incoming + // values for incoming blocks that are the loop basic blocks. + for (const BasicBlock *ExitBB : ExitBlocks) { + for (const PHINode &PHI : ExitBB->phis()) { + for (const BasicBlock *LoopBB : + make_filter_range(PHI.blocks(), [this](BasicBlock *PredecessorBB) { + return CurLoop->contains(PredecessorBB); + })) { + const auto *I = + dyn_cast(PHI.getIncomingValueForBlock(LoopBB)); + if (I && CurLoop->contains(I)) { + LLVM_DEBUG(dbgs() + << "Loop contains instruction " << *I + << " which is used outside of the loop in basic block " + << ExitBB->getName() << " in phi node " << PHI << "\n"); + return false; + } + } + } + } + // Similarly, the loop should not have any other observable side-effects + // other than the final comparison result. + for (BasicBlock *LoopBB : CurLoop->blocks()) { + for (Instruction &I : *LoopBB) { + if (isa(I)) // Ignore dbginfo. + continue; // FIXME: anything else? lifetime info? + if ((I.mayHaveSideEffects() || I.isAtomic() || I.isFenceLike()) && + &I != CmpOfLoads.LoadA && &I != CmpOfLoads.LoadB) { + LLVM_DEBUG( + dbgs() << "Loop contains instruction with potential side-effects: " + << I << "\n"); + return false; + } + } + } + LLVM_DEBUG(dbgs() << "No loop instructions deemed to have side-effects.\n"); + return true; +} + +bool LoopIdiomRecognize::recognizeBCmpLoopSCEV(uint64_t BCmpTyBytes, + CmpOfLoads &CmpOfLoads, + const SCEV *&SrcA, + const SCEV *&SrcB, + const SCEV *&Iterations) const { + // Try to compute SCEV of the loads, for this loop's scope. + const auto *ScevForSrcA = dyn_cast( + SE->getSCEVAtScope(CmpOfLoads.LoadSrcA, CurLoop)); + const auto *ScevForSrcB = dyn_cast( + SE->getSCEVAtScope(CmpOfLoads.LoadSrcB, CurLoop)); + if (!ScevForSrcA || !ScevForSrcB) { + LLVM_DEBUG(dbgs() << "Failed to get SCEV expressions for load sources.\n"); + return false; + } + + LLVM_DEBUG(dbgs() << "Got SCEV expressions (at loop scope) for loads:\n\t" + << *ScevForSrcA << "\n\t" << *ScevForSrcB << "\n"); + + // Loads must have folloving SCEV exprs: {%ptr,+,BCmpTyBytes}<%LoopHeaderBB> + const SCEV *RecStepForA = ScevForSrcA->getStepRecurrence(*SE); + const SCEV *RecStepForB = ScevForSrcB->getStepRecurrence(*SE); + if (!ScevForSrcA->isAffine() || !ScevForSrcB->isAffine() || + ScevForSrcA->getLoop() != CurLoop || ScevForSrcB->getLoop() != CurLoop || + RecStepForA != RecStepForB || !isa(RecStepForA) || + cast(RecStepForA)->getAPInt() != BCmpTyBytes) { + LLVM_DEBUG(dbgs() << "Unsupported SCEV expressions for loads. Only support " + "affine SCEV expressions originating in the loop we " + "are analysing with identical constant positive step, " + "equal to the count of bytes compared. Got:\n\t" + << *RecStepForA << "\n\t" << *RecStepForB << "\n"); + return false; + // FIXME: can support BCmpTyBytes > Step. + // But will need to account for the extra bytes compared at the end. + } + + SrcA = ScevForSrcA->getStart(); + SrcB = ScevForSrcB->getStart(); + LLVM_DEBUG(dbgs() << "Got SCEV expressions for load sources:\n\t" << *SrcA + << "\n\t" << *SrcB << "\n"); + + // The load sources must be loop-invants that dominate the loop header. 
+ if (SrcA == SE->getCouldNotCompute() || SrcB == SE->getCouldNotCompute() || + !SE->isAvailableAtLoopEntry(SrcA, CurLoop) || + !SE->isAvailableAtLoopEntry(SrcB, CurLoop)) { + LLVM_DEBUG(dbgs() << "Unsupported SCEV expressions for loads, unavaliable " + "prior to loop header.\n"); + return false; + } + + LLVM_DEBUG(dbgs() << "SCEV expressions for loads are acceptable.\n"); + + // bcmp / memcmp take length argument as size_t, so let's conservatively + // assume that the iteration count should be not wider than that. + Type *CmpFuncSizeTy = DL->getIntPtrType(SE->getContext()); + + // For how many iterations is loop guaranteed not to exit via LoopLatch? + // This is one less than the maximal number of comparisons,and is: n + -1 + const SCEV *LoopExitCount = + SE->getExitCount(CurLoop, CurLoop->getLoopLatch()); + LLVM_DEBUG(dbgs() << "Got SCEV expression for loop latch exit count: " + << *LoopExitCount << "\n"); + // Exit count, similarly, must be loop-invant that dominates the loop header. + if (LoopExitCount == SE->getCouldNotCompute() || + !LoopExitCount->getType()->isIntOrPtrTy() || + LoopExitCount->getType()->getScalarSizeInBits() > + CmpFuncSizeTy->getScalarSizeInBits() || + !SE->isAvailableAtLoopEntry(LoopExitCount, CurLoop)) { + LLVM_DEBUG(dbgs() << "Unsupported SCEV expression for loop latch exit.\n"); + return false; + } + + // LoopExitCount is always one less than the actual count of iterations. + // Do this before cast, else we will be stuck with 1 + zext(-1 + n) + Iterations = SE->getAddExpr( + LoopExitCount, SE->getOne(LoopExitCount->getType()), SCEV::FlagNUW); + assert(Iterations != SE->getCouldNotCompute() && + "Shouldn't fail to increment by one."); + + LLVM_DEBUG(dbgs() << "Computed iteration count: " << *Iterations << "\n"); + return true; +} + +/// Return true iff the bcmp idiom is detected in the loop. +/// +/// Additionally: +/// 1) \p BCmpInst is set to the root byte-comparison instruction. +/// 2) \p LatchCmpInst is set to the comparison that controls the latch. +/// 3) \p LoadA is set to the first LoadInst. +/// 4) \p LoadB is set to the second LoadInst. +/// 5) \p SrcA is set to the first source location that is being compared. +/// 6) \p SrcB is set to the second source location that is being compared. +/// 7) \p NBytes is set to the number of bytes to compare. +bool LoopIdiomRecognize::detectBCmpIdiom(ICmpInst *&BCmpInst, + CmpInst *&LatchCmpInst, + LoadInst *&LoadA, LoadInst *&LoadB, + const SCEV *&SrcA, const SCEV *&SrcB, + const SCEV *&NBytes) const { + LLVM_DEBUG(dbgs() << "Recognizing bcmp idiom\n"); + + // Give up if the loop is not in normal form, or has more than 2 blocks. + if (!CurLoop->isLoopSimplifyForm() || CurLoop->getNumBlocks() > 2) { + LLVM_DEBUG(dbgs() << "Basic loop structure unrecognized.\n"); + return false; + } + LLVM_DEBUG(dbgs() << "Recognized basic loop structure.\n"); + + CmpLoopStructure CmpLoop; + if (!matchBCmpLoopStructure(CmpLoop)) + return false; + + CmpOfLoads CmpOfLoads; + if (!matchBCmpOfLoads(CmpLoop.BCmpValue, CmpOfLoads)) + return false; + + if (!recognizeBCmpLoopControlFlow(CmpOfLoads, CmpLoop)) + return false; + + BCmpInst = cast(CmpLoop.BCmpValue); // FIXME: is there no + LatchCmpInst = cast(CmpLoop.LatchCmpValue); // way to combine + LoadA = cast(CmpOfLoads.LoadA); // these cast with + LoadB = cast(CmpOfLoads.LoadB); // m_Value() matcher? 
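(Editorial aside; not part of the upstream patch.) For orientation, a source-level loop of the shape the recognizer above is matching, and the intended net effect of the rewrite, might look roughly like the sketch below. The function and parameter names are hypothetical; the arithmetic mirrors the checks above for 32-bit elements: the SCEV step is 4 bytes, the latch exit count is N - 1, the iteration count is N, and the total byte count computed below is 4 * N.

    // Editorial sketch with hypothetical names; illustrates the idiom only.
    #include <cstring>

    // "Before": an element-wise equality loop of roughly the matched shape:
    // an eq-compare of two loads in the header, a counting compare in the
    // latch, and two exits ("unequal" vs. "all elements compared equal").
    static bool equalWords(const int *A, const int *B, unsigned long N) {
      for (unsigned long I = 0; I != N; ++I) // latch compare; exit count N - 1
        if (A[I] != B[I])                    // header icmp eq of two loads
          return false;                      // "unequal" exit
      return true;                           // "all equal" exit
    }

    // "After": the intended net effect once the loop is replaced by a single
    // call emitted in the preheader (bcmp() when available, else memcmp()),
    // compared against zero.
    static bool equalWordsRewritten(const int *A, const int *B,
                                    unsigned long N) {
      return std::memcmp(A, B, N * sizeof(int)) == 0;
    }
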
+ + Type *BCmpValTy = BCmpInst->getOperand(0)->getType(); + LLVMContext &Context = BCmpValTy->getContext(); + uint64_t BCmpTyBits = DL->getTypeSizeInBits(BCmpValTy); + static constexpr uint64_t ByteTyBits = 8; + + LLVM_DEBUG(dbgs() << "Got comparison between values of type " << *BCmpValTy + << " of size " << BCmpTyBits + << " bits (while byte = " << ByteTyBits << " bits).\n"); + // bcmp()/memcmp() minimal unit of work is a byte. Therefore we must check + // that we are dealing with a multiple of a byte here. + if (BCmpTyBits % ByteTyBits != 0) { + LLVM_DEBUG(dbgs() << "Value size is not a multiple of byte.\n"); + return false; + // FIXME: could still be done under a run-time check that the total bit + // count is a multiple of a byte i guess? Or handle remainder separately? + } + + // Each comparison is done on this many bytes. + uint64_t BCmpTyBytes = BCmpTyBits / ByteTyBits; + LLVM_DEBUG(dbgs() << "Size is exactly " << BCmpTyBytes + << " bytes, eligible for bcmp conversion.\n"); + + const SCEV *Iterations; + if (!recognizeBCmpLoopSCEV(BCmpTyBytes, CmpOfLoads, SrcA, SrcB, Iterations)) + return false; + + // bcmp / memcmp take length argument as size_t, do promotion now. + Type *CmpFuncSizeTy = DL->getIntPtrType(Context); + Iterations = SE->getNoopOrZeroExtend(Iterations, CmpFuncSizeTy); + assert(Iterations != SE->getCouldNotCompute() && "Promotion failed."); + // Note that it didn't do ptrtoint cast, we will need to do it manually. + + // We will be comparing *bytes*, not BCmpTy, we need to recalculate size. + // It's a multiplication, and it *could* overflow. But for it to overflow + // we'd want to compare more bytes than could be represented by size_t, But + // allocation functions also take size_t. So how'd you produce such buffer? + // FIXME: we likely need to actually check that we know this won't overflow, + // via llvm::computeOverflowForUnsignedMul(). + NBytes = SE->getMulExpr( + Iterations, SE->getConstant(CmpFuncSizeTy, BCmpTyBytes), SCEV::FlagNUW); + assert(NBytes != SE->getCouldNotCompute() && + "Shouldn't fail to increment by one."); + + LLVM_DEBUG(dbgs() << "Computed total byte count: " << *NBytes << "\n"); + + if (LoadA->getPointerAddressSpace() != LoadB->getPointerAddressSpace() || + LoadA->getPointerAddressSpace() != 0 || !LoadA->isSimple() || + !LoadB->isSimple()) { + StringLiteral L("Unsupported loads in idiom - only support identical, " + "simple loads from address space 0.\n"); + LLVM_DEBUG(dbgs() << L); + ORE.emit([&]() { + return OptimizationRemarkMissed(DEBUG_TYPE, "BCmpIdiomUnsupportedLoads", + BCmpInst->getDebugLoc(), + CurLoop->getHeader()) + << L; + }); + return false; // FIXME: support non-simple loads. + } + + LLVM_DEBUG(dbgs() << "Recognized bcmp idiom\n"); + ORE.emit([&]() { + return OptimizationRemarkAnalysis(DEBUG_TYPE, "RecognizedBCmpIdiom", + CurLoop->getStartLoc(), + CurLoop->getHeader()) + << "Loop recognized as a bcmp idiom"; + }); + + return true; +} + +BasicBlock * +LoopIdiomRecognize::transformBCmpControlFlow(ICmpInst *ComparedEqual) { + LLVM_DEBUG(dbgs() << "Transforming control-flow.\n"); + SmallVector DTUpdates; + + BasicBlock *PreheaderBB = CurLoop->getLoopPreheader(); + BasicBlock *HeaderBB = CurLoop->getHeader(); + BasicBlock *LoopLatchBB = CurLoop->getLoopLatch(); + SmallString<32> LoopName = CurLoop->getName(); + Function *Func = PreheaderBB->getParent(); + LLVMContext &Context = Func->getContext(); + + // Before doing anything, drop SCEV info. + SE->forgetLoop(CurLoop); + + // Here we start with: (0/6) + // PreheaderBB: ; preds = ??? 
+ // <...> + // %memcmp = call i32 @memcmp(i8* %LoadSrcA, i8* %LoadSrcB, i64 %Nbytes) + // %ComparedEqual = icmp eq <...> %memcmp, 0 + // br label %LoopHeaderBB + // LoopHeaderBB: ; preds = %PreheaderBB,%LoopLatchBB + // <...> + // br i1 %<...>, label %LoopLatchBB, label %Successor0BB + // LoopLatchBB: ; preds = %LoopHeaderBB + // <...> + // br i1 %<...>, label %Successor1BB, label %LoopHeaderBB + // Successor0BB: ; preds = %LoopHeaderBB + // %S0PHI = phi <...> [ <...>, %LoopHeaderBB ] + // <...> + // Successor1BB: ; preds = %LoopLatchBB + // %S1PHI = phi <...> [ <...>, %LoopLatchBB ] + // <...> + // + // Successor0 and Successor1 may or may not be the same basic block. + + // Decouple the edge between loop preheader basic block and loop header basic + // block. Thus the loop has become unreachable. + assert(cast(PreheaderBB->getTerminator())->isUnconditional() && + PreheaderBB->getTerminator()->getSuccessor(0) == HeaderBB && + "Preheader bb must end with an unconditional branch to header bb."); + PreheaderBB->getTerminator()->eraseFromParent(); + DTUpdates.push_back({DominatorTree::Delete, PreheaderBB, HeaderBB}); + + // Create a new preheader basic block before loop header basic block. + auto *PhonyPreheaderBB = BasicBlock::Create( + Context, LoopName + ".phonypreheaderbb", Func, HeaderBB); + // And insert an unconditional branch from phony preheader basic block to + // loop header basic block. + IRBuilder<>(PhonyPreheaderBB).CreateBr(HeaderBB); + DTUpdates.push_back({DominatorTree::Insert, PhonyPreheaderBB, HeaderBB}); + + // Create a *single* new empty block that we will substitute as a + // successor basic block for the loop's exits. This one is temporary. + // Much like phony preheader basic block, it is not connected. + auto *PhonySuccessorBB = + BasicBlock::Create(Context, LoopName + ".phonysuccessorbb", Func, + LoopLatchBB->getNextNode()); + // That block must have *some* non-PHI instruction, or else deleteDeadLoop() + // will mess up cleanup of dbginfo, and verifier will complain. + IRBuilder<>(PhonySuccessorBB).CreateUnreachable(); + + // Create two new empty blocks that we will use to preserve the original + // loop exit control-flow, and preserve the incoming values in the PHI nodes + // in loop's successor exit blocks. These will live one. + auto *ComparedUnequalBB = + BasicBlock::Create(Context, ComparedEqual->getName() + ".unequalbb", Func, + PhonySuccessorBB->getNextNode()); + auto *ComparedEqualBB = + BasicBlock::Create(Context, ComparedEqual->getName() + ".equalbb", Func, + PhonySuccessorBB->getNextNode()); + + // By now we have: (1/6) + // PreheaderBB: ; preds = ??? + // <...> + // %memcmp = call i32 @memcmp(i8* %LoadSrcA, i8* %LoadSrcB, i64 %Nbytes) + // %ComparedEqual = icmp eq <...> %memcmp, 0 + // [no terminator instruction!] + // PhonyPreheaderBB: ; No preds, UNREACHABLE! + // br label %LoopHeaderBB + // LoopHeaderBB: ; preds = %PhonyPreheaderBB, %LoopLatchBB + // <...> + // br i1 %<...>, label %LoopLatchBB, label %Successor0BB + // LoopLatchBB: ; preds = %LoopHeaderBB + // <...> + // br i1 %<...>, label %Successor1BB, label %LoopHeaderBB + // PhonySuccessorBB: ; No preds, UNREACHABLE! + // unreachable + // EqualBB: ; No preds, UNREACHABLE! + // [no terminator instruction!] + // UnequalBB: ; No preds, UNREACHABLE! + // [no terminator instruction!] 
+ // Successor0BB: ; preds = %LoopHeaderBB + // %S0PHI = phi <...> [ <...>, %LoopHeaderBB ] + // <...> + // Successor1BB: ; preds = %LoopLatchBB + // %S1PHI = phi <...> [ <...>, %LoopLatchBB ] + // <...> + + // What is the mapping/replacement basic block for exiting out of the loop + // from either of old's loop basic blocks? + auto GetReplacementBB = [this, ComparedEqualBB, + ComparedUnequalBB](const BasicBlock *OldBB) { + assert(CurLoop->contains(OldBB) && "Only for loop's basic blocks."); + if (OldBB == CurLoop->getLoopLatch()) // "all elements compared equal". + return ComparedEqualBB; + if (OldBB == CurLoop->getHeader()) // "element compared unequal". + return ComparedUnequalBB; + llvm_unreachable("Only had two basic blocks in loop."); + }; + + // What are the exits out of this loop? + SmallVector LoopExitEdges; + CurLoop->getExitEdges(LoopExitEdges); + assert(LoopExitEdges.size() == 2 && "Should have only to two exit edges."); + + // Populate new basic blocks, update the exiting control-flow, PHI nodes. + for (const Loop::Edge &Edge : LoopExitEdges) { + auto *OldLoopBB = const_cast(Edge.first); + auto *SuccessorBB = const_cast(Edge.second); + assert(CurLoop->contains(OldLoopBB) && !CurLoop->contains(SuccessorBB) && + "Unexpected edge."); + + // If we would exit the loop from this loop's basic block, + // what semantically would that mean? Did comparison succeed or fail? + BasicBlock *NewBB = GetReplacementBB(OldLoopBB); + assert(NewBB->empty() && "Should not get same new basic block here twice."); + IRBuilder<> Builder(NewBB); + Builder.SetCurrentDebugLocation(OldLoopBB->getTerminator()->getDebugLoc()); + Builder.CreateBr(SuccessorBB); + DTUpdates.push_back({DominatorTree::Insert, NewBB, SuccessorBB}); + // Also, be *REALLY* careful with PHI nodes in successor basic block, + // update them to recieve the same input value, but not from current loop's + // basic block, but from new basic block instead. + SuccessorBB->replacePhiUsesWith(OldLoopBB, NewBB); + // Also, change loop control-flow. This loop's basic block shall no longer + // exit from the loop to it's original successor basic block, but to our new + // phony successor basic block. Note that new successor will be unique exit. + OldLoopBB->getTerminator()->replaceSuccessorWith(SuccessorBB, + PhonySuccessorBB); + DTUpdates.push_back({DominatorTree::Delete, OldLoopBB, SuccessorBB}); + DTUpdates.push_back({DominatorTree::Insert, OldLoopBB, PhonySuccessorBB}); + } + + // Inform DomTree about edge changes. Note that LoopInfo is still out-of-date. + assert(DTUpdates.size() == 8 && "Update count prediction failed."); + DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager); + DTU.applyUpdates(DTUpdates); + DTUpdates.clear(); + + // By now we have: (2/6) + // PreheaderBB: ; preds = ??? + // <...> + // %memcmp = call i32 @memcmp(i8* %LoadSrcA, i8* %LoadSrcB, i64 %Nbytes) + // %ComparedEqual = icmp eq <...> %memcmp, 0 + // [no terminator instruction!] + // PhonyPreheaderBB: ; No preds, UNREACHABLE! + // br label %LoopHeaderBB + // LoopHeaderBB: ; preds = %PhonyPreheaderBB, %LoopLatchBB + // <...> + // br i1 %<...>, label %LoopLatchBB, label %PhonySuccessorBB + // LoopLatchBB: ; preds = %LoopHeaderBB + // <...> + // br i1 %<...>, label %PhonySuccessorBB, label %LoopHeaderBB + // PhonySuccessorBB: ; preds = %LoopHeaderBB, %LoopLatchBB + // unreachable + // EqualBB: ; No preds, UNREACHABLE! + // br label %Successor1BB + // UnequalBB: ; No preds, UNREACHABLE! 
+ // br label %Successor0BB + // Successor0BB: ; preds = %UnequalBB + // %S0PHI = phi <...> [ <...>, %UnequalBB ] + // <...> + // Successor1BB: ; preds = %EqualBB + // %S0PHI = phi <...> [ <...>, %EqualBB ] + // <...> + + // *Finally*, zap the original loop. Record it's parent loop though. + Loop *ParentLoop = CurLoop->getParentLoop(); + LLVM_DEBUG(dbgs() << "Deleting old loop.\n"); + LoopDeleter.markLoopAsDeleted(CurLoop); // Mark as deleted *BEFORE* deleting! + deleteDeadLoop(CurLoop, DT, SE, LI); // And actually delete the loop. + CurLoop = nullptr; + + // By now we have: (3/6) + // PreheaderBB: ; preds = ??? + // <...> + // %memcmp = call i32 @memcmp(i8* %LoadSrcA, i8* %LoadSrcB, i64 %Nbytes) + // %ComparedEqual = icmp eq <...> %memcmp, 0 + // [no terminator instruction!] + // PhonyPreheaderBB: ; No preds, UNREACHABLE! + // br label %PhonySuccessorBB + // PhonySuccessorBB: ; preds = %PhonyPreheaderBB + // unreachable + // EqualBB: ; No preds, UNREACHABLE! + // br label %Successor1BB + // UnequalBB: ; No preds, UNREACHABLE! + // br label %Successor0BB + // Successor0BB: ; preds = %UnequalBB + // %S0PHI = phi <...> [ <...>, %UnequalBB ] + // <...> + // Successor1BB: ; preds = %EqualBB + // %S0PHI = phi <...> [ <...>, %EqualBB ] + // <...> + + // Now, actually restore the CFG. + + // Insert an unconditional branch from an actual preheader basic block to + // phony preheader basic block. + IRBuilder<>(PreheaderBB).CreateBr(PhonyPreheaderBB); + DTUpdates.push_back({DominatorTree::Insert, PhonyPreheaderBB, HeaderBB}); + // Insert proper conditional branch from phony successor basic block to the + // "dispatch" basic blocks, which were used to preserve incoming values in + // original loop's successor basic blocks. + assert(isa(PhonySuccessorBB->getTerminator()) && + "Yep, that's the one we created to keep deleteDeadLoop() happy."); + PhonySuccessorBB->getTerminator()->eraseFromParent(); + { + IRBuilder<> Builder(PhonySuccessorBB); + Builder.SetCurrentDebugLocation(ComparedEqual->getDebugLoc()); + Builder.CreateCondBr(ComparedEqual, ComparedEqualBB, ComparedUnequalBB); + } + DTUpdates.push_back( + {DominatorTree::Insert, PhonySuccessorBB, ComparedEqualBB}); + DTUpdates.push_back( + {DominatorTree::Insert, PhonySuccessorBB, ComparedUnequalBB}); + + BasicBlock *DispatchBB = PhonySuccessorBB; + DispatchBB->setName(LoopName + ".bcmpdispatchbb"); + + assert(DTUpdates.size() == 3 && "Update count prediction failed."); + DTU.applyUpdates(DTUpdates); + DTUpdates.clear(); + + // By now we have: (4/6) + // PreheaderBB: ; preds = ??? + // <...> + // %memcmp = call i32 @memcmp(i8* %LoadSrcA, i8* %LoadSrcB, i64 %Nbytes) + // %ComparedEqual = icmp eq <...> %memcmp, 0 + // br label %PhonyPreheaderBB + // PhonyPreheaderBB: ; preds = %PreheaderBB + // br label %DispatchBB + // DispatchBB: ; preds = %PhonyPreheaderBB + // br i1 %ComparedEqual, label %EqualBB, label %UnequalBB + // EqualBB: ; preds = %DispatchBB + // br label %Successor1BB + // UnequalBB: ; preds = %DispatchBB + // br label %Successor0BB + // Successor0BB: ; preds = %UnequalBB + // %S0PHI = phi <...> [ <...>, %UnequalBB ] + // <...> + // Successor1BB: ; preds = %EqualBB + // %S0PHI = phi <...> [ <...>, %EqualBB ] + // <...> + + // The basic CFG has been restored! Now let's merge redundant basic blocks. + + // Merge phony successor basic block into it's only predecessor, + // phony preheader basic block. It is fully pointlessly redundant. 
+ MergeBasicBlockIntoOnlyPred(DispatchBB, &DTU); + + // By now we have: (5/6) + // PreheaderBB: ; preds = ??? + // <...> + // %memcmp = call i32 @memcmp(i8* %LoadSrcA, i8* %LoadSrcB, i64 %Nbytes) + // %ComparedEqual = icmp eq <...> %memcmp, 0 + // br label %DispatchBB + // DispatchBB: ; preds = %PreheaderBB + // br i1 %ComparedEqual, label %EqualBB, label %UnequalBB + // EqualBB: ; preds = %DispatchBB + // br label %Successor1BB + // UnequalBB: ; preds = %DispatchBB + // br label %Successor0BB + // Successor0BB: ; preds = %UnequalBB + // %S0PHI = phi <...> [ <...>, %UnequalBB ] + // <...> + // Successor1BB: ; preds = %EqualBB + // %S0PHI = phi <...> [ <...>, %EqualBB ] + // <...> + + // Was this loop nested? + if (!ParentLoop) { + // If the loop was *NOT* nested, then let's also merge phony successor + // basic block into it's only predecessor, preheader basic block. + // Also, here we need to update LoopInfo. + LI->removeBlock(PreheaderBB); + MergeBasicBlockIntoOnlyPred(DispatchBB, &DTU); + + // By now we have: (6/6) + // DispatchBB: ; preds = ??? + // <...> + // %memcmp = call i32 @memcmp(i8* %LoadSrcA, i8* %LoadSrcB, i64 %Nbytes) + // %ComparedEqual = icmp eq <...> %memcmp, 0 + // br i1 %ComparedEqual, label %EqualBB, label %UnequalBB + // EqualBB: ; preds = %DispatchBB + // br label %Successor1BB + // UnequalBB: ; preds = %DispatchBB + // br label %Successor0BB + // Successor0BB: ; preds = %UnequalBB + // %S0PHI = phi <...> [ <...>, %UnequalBB ] + // <...> + // Successor1BB: ; preds = %EqualBB + // %S0PHI = phi <...> [ <...>, %EqualBB ] + // <...> + + return DispatchBB; + } + + // Otherwise, we need to "preserve" the LoopSimplify form of the deleted loop. + // To achieve that, we shall keep the preheader basic block (mainly so that + // the loop header block will be guaranteed to have a predecessor outside of + // the loop), and create a phony loop with all these new three basic blocks. + Loop *PhonyLoop = LI->AllocateLoop(); + ParentLoop->addChildLoop(PhonyLoop); + PhonyLoop->addBasicBlockToLoop(DispatchBB, *LI); + PhonyLoop->addBasicBlockToLoop(ComparedEqualBB, *LI); + PhonyLoop->addBasicBlockToLoop(ComparedUnequalBB, *LI); + + // But we only have a preheader basic block, a header basic block block and + // two exiting basic blocks. For a proper loop we also need a backedge from + // non-header basic block to header bb. + // Let's just add a never-taken branch from both of the exiting basic blocks. + for (BasicBlock *BB : {ComparedEqualBB, ComparedUnequalBB}) { + BranchInst *OldTerminator = cast(BB->getTerminator()); + assert(OldTerminator->isUnconditional() && "That's the one we created."); + BasicBlock *SuccessorBB = OldTerminator->getSuccessor(0); + + IRBuilder<> Builder(OldTerminator); + Builder.SetCurrentDebugLocation(OldTerminator->getDebugLoc()); + Builder.CreateCondBr(ConstantInt::getTrue(Context), SuccessorBB, + DispatchBB); + OldTerminator->eraseFromParent(); + // Yes, the backedge will never be taken. The control-flow is redundant. + // If it can be simplified further, other passes will take care. + DTUpdates.push_back({DominatorTree::Delete, BB, SuccessorBB}); + DTUpdates.push_back({DominatorTree::Insert, BB, SuccessorBB}); + DTUpdates.push_back({DominatorTree::Insert, BB, DispatchBB}); + } + assert(DTUpdates.size() == 6 && "Update count prediction failed."); + DTU.applyUpdates(DTUpdates); + DTUpdates.clear(); + + // By now we have: (6/6) + // PreheaderBB: ; preds = ??? 
+ // <...> + // %memcmp = call i32 @memcmp(i8* %LoadSrcA, i8* %LoadSrcB, i64 %Nbytes) + // %ComparedEqual = icmp eq <...> %memcmp, 0 + // br label %BCmpDispatchBB + // BCmpDispatchBB:
; preds = %PreheaderBB + // br i1 %ComparedEqual, label %EqualBB, label %UnequalBB + // EqualBB: ; preds = %BCmpDispatchBB + // br i1 %true, label %Successor1BB, label %BCmpDispatchBB + // UnequalBB: ; preds = %BCmpDispatchBB + // br i1 %true, label %Successor0BB, label %BCmpDispatchBB + // Successor0BB: ; preds = %UnequalBB + // %S0PHI = phi <...> [ <...>, %UnequalBB ] + // <...> + // Successor1BB: ; preds = %EqualBB + // %S0PHI = phi <...> [ <...>, %EqualBB ] + // <...> + + // Finally fully DONE! + return DispatchBB; +} + +void LoopIdiomRecognize::transformLoopToBCmp(ICmpInst *BCmpInst, + CmpInst *LatchCmpInst, + LoadInst *LoadA, LoadInst *LoadB, + const SCEV *SrcA, const SCEV *SrcB, + const SCEV *NBytes) { + // We will be inserting before the terminator instruction of preheader block. + IRBuilder<> Builder(CurLoop->getLoopPreheader()->getTerminator()); + + LLVM_DEBUG(dbgs() << "Transforming bcmp loop idiom into a call.\n"); + LLVM_DEBUG(dbgs() << "Emitting new instructions.\n"); + + // Expand the SCEV expressions for both sources to compare, and produce value + // for the byte len (beware of Iterations potentially being a pointer, and + // account for element size being BCmpTyBytes bytes, which may be not 1 byte) + Value *PtrA, *PtrB, *Len; + { + SCEVExpander SExp(*SE, *DL, "LoopToBCmp"); + SExp.setInsertPoint(&*Builder.GetInsertPoint()); + + auto HandlePtr = [&SExp](LoadInst *Load, const SCEV *Src) { + SExp.SetCurrentDebugLocation(DebugLoc()); + // If the pointer operand of original load had dbgloc - use it. + if (const auto *I = dyn_cast(Load->getPointerOperand())) + SExp.SetCurrentDebugLocation(I->getDebugLoc()); + return SExp.expandCodeFor(Src); + }; + PtrA = HandlePtr(LoadA, SrcA); + PtrB = HandlePtr(LoadB, SrcB); + + // For len calculation let's use dbgloc for the loop's latch condition. + Builder.SetCurrentDebugLocation(LatchCmpInst->getDebugLoc()); + SExp.SetCurrentDebugLocation(LatchCmpInst->getDebugLoc()); + Len = SExp.expandCodeFor(NBytes); + + Type *CmpFuncSizeTy = DL->getIntPtrType(Builder.getContext()); + assert(SE->getTypeSizeInBits(Len->getType()) == + DL->getTypeSizeInBits(CmpFuncSizeTy) && + "Len should already have the correct size."); + + // Make sure that iteration count is a number, insert ptrtoint cast if not. + if (Len->getType()->isPointerTy()) + Len = Builder.CreatePtrToInt(Len, CmpFuncSizeTy); + assert(Len->getType() == CmpFuncSizeTy && "Should have correct type now."); + + Len->setName(Len->getName() + ".bytecount"); + + // There is no legality check needed. We want to compare that the memory + // regions [PtrA, PtrA+Len) and [PtrB, PtrB+Len) are fully identical, equal. + // For them to be fully equal, they must match bit-by-bit. And likewise, + // for them to *NOT* be fully equal, they have to differ just by one bit. + // The step of comparison (bits compared at once) simply does not matter. + } + + // For the rest of new instructions, dbgloc should point at the value cmp. + Builder.SetCurrentDebugLocation(BCmpInst->getDebugLoc()); + + // Emit the comparison itself. + auto *CmpCall = + cast(HasBCmp ? emitBCmp(PtrA, PtrB, Len, Builder, *DL, TLI) + : emitMemCmp(PtrA, PtrB, Len, Builder, *DL, TLI)); + // FIXME: add {B,Mem}CmpInst with MemoryCompareInst + // (based on MemIntrinsicBase) as base? + // FIXME: propagate metadata from loads? (alignments, AS, TBAA, ...) + + // {b,mem}cmp returned 0 if they were equal, or non-zero if not equal. 
+ auto *ComparedEqual = cast(Builder.CreateICmpEQ( + CmpCall, ConstantInt::get(CmpCall->getType(), 0), + PtrA->getName() + ".vs." + PtrB->getName() + ".eqcmp")); + + BasicBlock *BB = transformBCmpControlFlow(ComparedEqual); + Builder.ClearInsertionPoint(); + + // We're done. + LLVM_DEBUG(dbgs() << "Transformed loop bcmp idiom into a call.\n"); + ORE.emit([&]() { + return OptimizationRemark(DEBUG_TYPE, "TransformedBCmpIdiomToCall", + CmpCall->getDebugLoc(), BB) + << "Transformed bcmp idiom into a call to " + << ore::NV("NewFunction", CmpCall->getCalledFunction()) + << "() function"; + }); + ++NumBCmp; +} + +/// Recognizes a bcmp idiom in a non-countable loop. +/// +/// If detected, transforms the relevant code to issue the bcmp (or memcmp) +/// intrinsic function call, and returns true; otherwise, returns false. +bool LoopIdiomRecognize::recognizeBCmp() { + if (!HasMemCmp && !HasBCmp) + return false; + + ICmpInst *BCmpInst; + CmpInst *LatchCmpInst; + LoadInst *LoadA, *LoadB; + const SCEV *SrcA, *SrcB, *NBytes; + if (!detectBCmpIdiom(BCmpInst, LatchCmpInst, LoadA, LoadB, SrcA, SrcB, + NBytes)) { + LLVM_DEBUG(dbgs() << "bcmp idiom recognition failed.\n"); + return false; + } + + transformLoopToBCmp(BCmpInst, LatchCmpInst, LoadA, LoadB, SrcA, SrcB, NBytes); + return true; +} diff --git a/lib/Transforms/Scalar/LoopInstSimplify.cpp b/lib/Transforms/Scalar/LoopInstSimplify.cpp index 31191b52895c..368b9d4e8df1 100644 --- a/lib/Transforms/Scalar/LoopInstSimplify.cpp +++ b/lib/Transforms/Scalar/LoopInstSimplify.cpp @@ -192,7 +192,8 @@ public: getAnalysis().getAssumptionCache( *L->getHeader()->getParent()); const TargetLibraryInfo &TLI = - getAnalysis().getTLI(); + getAnalysis().getTLI( + *L->getHeader()->getParent()); MemorySSA *MSSA = nullptr; Optional MSSAU; if (EnableMSSALoopDependency) { @@ -233,7 +234,7 @@ PreservedAnalyses LoopInstSimplifyPass::run(Loop &L, LoopAnalysisManager &AM, auto PA = getLoopPassPreservedAnalyses(); PA.preserveSet(); - if (EnableMSSALoopDependency) + if (AR.MSSA) PA.preserve(); return PA; } diff --git a/lib/Transforms/Scalar/LoopInterchange.cpp b/lib/Transforms/Scalar/LoopInterchange.cpp index 9a42365adc1b..1af4b21b432e 100644 --- a/lib/Transforms/Scalar/LoopInterchange.cpp +++ b/lib/Transforms/Scalar/LoopInterchange.cpp @@ -410,8 +410,6 @@ public: void removeChildLoop(Loop *OuterLoop, Loop *InnerLoop); private: - void splitInnerLoopLatch(Instruction *); - void splitInnerLoopHeader(); bool adjustLoopLinks(); void adjustLoopPreheaders(); bool adjustLoopBranches(); @@ -1226,7 +1224,7 @@ bool LoopInterchangeTransform::transform() { if (InnerLoop->getSubLoops().empty()) { BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader(); - LLVM_DEBUG(dbgs() << "Calling Split Inner Loop\n"); + LLVM_DEBUG(dbgs() << "Splitting the inner loop latch\n"); PHINode *InductionPHI = getInductionVariable(InnerLoop, SE); if (!InductionPHI) { LLVM_DEBUG(dbgs() << "Failed to find the point to split loop latch \n"); @@ -1242,11 +1240,55 @@ bool LoopInterchangeTransform::transform() { if (&InductionPHI->getParent()->front() != InductionPHI) InductionPHI->moveBefore(&InductionPHI->getParent()->front()); - // Split at the place were the induction variable is - // incremented/decremented. - // TODO: This splitting logic may not work always. Fix this. - splitInnerLoopLatch(InnerIndexVar); - LLVM_DEBUG(dbgs() << "splitInnerLoopLatch done\n"); + // Create a new latch block for the inner loop. 
We split at the + // current latch's terminator and then move the condition and all + // operands that are not either loop-invariant or the induction PHI into the + // new latch block. + BasicBlock *NewLatch = + SplitBlock(InnerLoop->getLoopLatch(), + InnerLoop->getLoopLatch()->getTerminator(), DT, LI); + + SmallSetVector WorkList; + unsigned i = 0; + auto MoveInstructions = [&i, &WorkList, this, InductionPHI, NewLatch]() { + for (; i < WorkList.size(); i++) { + // Duplicate instruction and move it the new latch. Update uses that + // have been moved. + Instruction *NewI = WorkList[i]->clone(); + NewI->insertBefore(NewLatch->getFirstNonPHI()); + assert(!NewI->mayHaveSideEffects() && + "Moving instructions with side-effects may change behavior of " + "the loop nest!"); + for (auto UI = WorkList[i]->use_begin(), UE = WorkList[i]->use_end(); + UI != UE;) { + Use &U = *UI++; + Instruction *UserI = cast(U.getUser()); + if (!InnerLoop->contains(UserI->getParent()) || + UserI->getParent() == NewLatch || UserI == InductionPHI) + U.set(NewI); + } + // Add operands of moved instruction to the worklist, except if they are + // outside the inner loop or are the induction PHI. + for (Value *Op : WorkList[i]->operands()) { + Instruction *OpI = dyn_cast(Op); + if (!OpI || + this->LI->getLoopFor(OpI->getParent()) != this->InnerLoop || + OpI == InductionPHI) + continue; + WorkList.insert(OpI); + } + } + }; + + // FIXME: Should we interchange when we have a constant condition? + Instruction *CondI = dyn_cast( + cast(InnerLoop->getLoopLatch()->getTerminator()) + ->getCondition()); + if (CondI) + WorkList.insert(CondI); + MoveInstructions(); + WorkList.insert(cast(InnerIndexVar)); + MoveInstructions(); // Splits the inner loops phi nodes out into a separate basic block. 
BasicBlock *InnerLoopHeader = InnerLoop->getHeader(); @@ -1263,10 +1305,6 @@ bool LoopInterchangeTransform::transform() { return true; } -void LoopInterchangeTransform::splitInnerLoopLatch(Instruction *Inc) { - SplitBlock(InnerLoop->getLoopLatch(), Inc, DT, LI); -} - /// \brief Move all instructions except the terminator from FromBB right before /// InsertBefore static void moveBBContents(BasicBlock *FromBB, Instruction *InsertBefore) { diff --git a/lib/Transforms/Scalar/LoopLoadElimination.cpp b/lib/Transforms/Scalar/LoopLoadElimination.cpp index 2b3d5e0ce9b7..e8dc879a184b 100644 --- a/lib/Transforms/Scalar/LoopLoadElimination.cpp +++ b/lib/Transforms/Scalar/LoopLoadElimination.cpp @@ -435,7 +435,8 @@ public: PH->getTerminator()); Value *Initial = new LoadInst( Cand.Load->getType(), InitialPtr, "load_initial", - /* isVolatile */ false, Cand.Load->getAlignment(), PH->getTerminator()); + /* isVolatile */ false, MaybeAlign(Cand.Load->getAlignment()), + PH->getTerminator()); PHINode *PHI = PHINode::Create(Initial->getType(), 2, "store_forwarded", &L->getHeader()->front()); diff --git a/lib/Transforms/Scalar/LoopPredication.cpp b/lib/Transforms/Scalar/LoopPredication.cpp index 507a1e251ca6..885c0e8f4b8b 100644 --- a/lib/Transforms/Scalar/LoopPredication.cpp +++ b/lib/Transforms/Scalar/LoopPredication.cpp @@ -543,7 +543,7 @@ bool LoopPredication::isLoopInvariantValue(const SCEV* S) { if (const auto *LI = dyn_cast(U->getValue())) if (LI->isUnordered() && L->hasLoopInvariantOperands(LI)) if (AA->pointsToConstantMemory(LI->getOperand(0)) || - LI->getMetadata(LLVMContext::MD_invariant_load) != nullptr) + LI->hasMetadata(LLVMContext::MD_invariant_load)) return true; return false; } diff --git a/lib/Transforms/Scalar/LoopRerollPass.cpp b/lib/Transforms/Scalar/LoopRerollPass.cpp index 166b57f20b43..96e2c2a3ac6b 100644 --- a/lib/Transforms/Scalar/LoopRerollPass.cpp +++ b/lib/Transforms/Scalar/LoopRerollPass.cpp @@ -1644,7 +1644,8 @@ bool LoopReroll::runOnLoop(Loop *L, LPPassManager &LPM) { AA = &getAnalysis().getAAResults(); LI = &getAnalysis().getLoopInfo(); SE = &getAnalysis().getSE(); - TLI = &getAnalysis().getTLI(); + TLI = &getAnalysis().getTLI( + *L->getHeader()->getParent()); DT = &getAnalysis().getDomTree(); PreserveLCSSA = mustPreserveAnalysisID(LCSSAID); diff --git a/lib/Transforms/Scalar/LoopRotation.cpp b/lib/Transforms/Scalar/LoopRotation.cpp index e009947690af..94517996df39 100644 --- a/lib/Transforms/Scalar/LoopRotation.cpp +++ b/lib/Transforms/Scalar/LoopRotation.cpp @@ -55,7 +55,7 @@ PreservedAnalyses LoopRotatePass::run(Loop &L, LoopAnalysisManager &AM, AR.MSSA->verifyMemorySSA(); auto PA = getLoopPassPreservedAnalyses(); - if (EnableMSSALoopDependency) + if (AR.MSSA) PA.preserve(); return PA; } @@ -94,17 +94,15 @@ public: auto *LI = &getAnalysis().getLoopInfo(); const auto *TTI = &getAnalysis().getTTI(F); auto *AC = &getAnalysis().getAssumptionCache(F); - auto *DTWP = getAnalysisIfAvailable(); - auto *DT = DTWP ? &DTWP->getDomTree() : nullptr; - auto *SEWP = getAnalysisIfAvailable(); - auto *SE = SEWP ? &SEWP->getSE() : nullptr; + auto &DT = getAnalysis().getDomTree(); + auto &SE = getAnalysis().getSE(); const SimplifyQuery SQ = getBestSimplifyQuery(*this, F); Optional MSSAU; if (EnableMSSALoopDependency) { MemorySSA *MSSA = &getAnalysis().getMSSA(); MSSAU = MemorySSAUpdater(MSSA); } - return LoopRotation(L, LI, TTI, AC, DT, SE, + return LoopRotation(L, LI, TTI, AC, &DT, &SE, MSSAU.hasValue() ? 
MSSAU.getPointer() : nullptr, SQ, false, MaxHeaderSize, false); } diff --git a/lib/Transforms/Scalar/LoopSimplifyCFG.cpp b/lib/Transforms/Scalar/LoopSimplifyCFG.cpp index 046f4c8af492..299f3fc5fb19 100644 --- a/lib/Transforms/Scalar/LoopSimplifyCFG.cpp +++ b/lib/Transforms/Scalar/LoopSimplifyCFG.cpp @@ -690,7 +690,7 @@ PreservedAnalyses LoopSimplifyCFGPass::run(Loop &L, LoopAnalysisManager &AM, LoopStandardAnalysisResults &AR, LPMUpdater &LPMU) { Optional MSSAU; - if (EnableMSSALoopDependency && AR.MSSA) + if (AR.MSSA) MSSAU = MemorySSAUpdater(AR.MSSA); bool DeleteCurrentLoop = false; if (!simplifyLoopCFG(L, AR.DT, AR.LI, AR.SE, @@ -702,7 +702,7 @@ PreservedAnalyses LoopSimplifyCFGPass::run(Loop &L, LoopAnalysisManager &AM, LPMU.markLoopAsDeleted(L, "loop-simplifycfg"); auto PA = getLoopPassPreservedAnalyses(); - if (EnableMSSALoopDependency) + if (AR.MSSA) PA.preserve(); return PA; } diff --git a/lib/Transforms/Scalar/LoopSink.cpp b/lib/Transforms/Scalar/LoopSink.cpp index 975452e13f09..65e0dee0225a 100644 --- a/lib/Transforms/Scalar/LoopSink.cpp +++ b/lib/Transforms/Scalar/LoopSink.cpp @@ -230,12 +230,9 @@ static bool sinkInstruction(Loop &L, Instruction &I, IC->setName(I.getName()); IC->insertBefore(&*N->getFirstInsertionPt()); // Replaces uses of I with IC in N - for (Value::use_iterator UI = I.use_begin(), UE = I.use_end(); UI != UE;) { - Use &U = *UI++; - auto *I = cast(U.getUser()); - if (I->getParent() == N) - U.set(IC); - } + I.replaceUsesWithIf(IC, [N](Use &U) { + return cast(U.getUser())->getParent() == N; + }); // Replaces uses of I with IC in blocks dominated by N replaceDominatedUsesWith(&I, IC, DT, N); LLVM_DEBUG(dbgs() << "Sinking a clone of " << I << " To: " << N->getName() diff --git a/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/lib/Transforms/Scalar/LoopStrengthReduce.cpp index 59a387a186b8..7f119175c4a8 100644 --- a/lib/Transforms/Scalar/LoopStrengthReduce.cpp +++ b/lib/Transforms/Scalar/LoopStrengthReduce.cpp @@ -1386,7 +1386,9 @@ void Cost::RateFormula(const Formula &F, // Treat every new register that exceeds TTI.getNumberOfRegisters() - 1 as // additional instruction (at least fill). - unsigned TTIRegNum = TTI->getNumberOfRegisters(false) - 1; + // TODO: Need distinguish register class? + unsigned TTIRegNum = TTI->getNumberOfRegisters( + TTI->getRegisterClassForType(false, F.getType())) - 1; if (C.NumRegs > TTIRegNum) { // Cost already exceeded TTIRegNum, then only newly added register can add // new instructions. @@ -3165,6 +3167,7 @@ void LSRInstance::GenerateIVChain(const IVChain &Chain, SCEVExpander &Rewriter, LLVM_DEBUG(dbgs() << "Concealed chain head: " << *Head.UserInst << "\n"); return; } + assert(IVSrc && "Failed to find IV chain source"); LLVM_DEBUG(dbgs() << "Generate chain at: " << *IVSrc << "\n"); Type *IVTy = IVSrc->getType(); @@ -3265,12 +3268,12 @@ void LSRInstance::CollectFixupsAndInitialFormulae() { // requirements for both N and i at the same time. Limiting this code to // equality icmps is not a problem because all interesting loops use // equality icmps, thanks to IndVarSimplify. - if (ICmpInst *CI = dyn_cast(UserInst)) + if (ICmpInst *CI = dyn_cast(UserInst)) { + // If CI can be saved in some target, like replaced inside hardware loop + // in PowerPC, no need to generate initial formulae for it. + if (SaveCmp && CI == dyn_cast(ExitBranch->getCondition())) + continue; if (CI->isEquality()) { - // If CI can be saved in some target, like replaced inside hardware loop - // in PowerPC, no need to generate initial formulae for it. 
- if (SaveCmp && CI == dyn_cast(ExitBranch->getCondition())) - continue; // Swap the operands if needed to put the OperandValToReplace on the // left, for consistency. Value *NV = CI->getOperand(1); @@ -3298,6 +3301,7 @@ void LSRInstance::CollectFixupsAndInitialFormulae() { Factors.insert(-(uint64_t)Factors[i]); Factors.insert(-1); } + } // Get or create an LSRUse. std::pair P = getUse(S, Kind, AccessTy); @@ -4834,6 +4838,7 @@ void LSRInstance::NarrowSearchSpaceByPickingWinnerRegs() { } } } + assert(Best && "Failed to find best LSRUse candidate"); LLVM_DEBUG(dbgs() << "Narrowing the search space by assuming " << *Best << " will yield profitable reuse.\n"); @@ -5740,7 +5745,8 @@ bool LoopStrengthReduce::runOnLoop(Loop *L, LPPassManager & /*LPM*/) { *L->getHeader()->getParent()); auto &AC = getAnalysis().getAssumptionCache( *L->getHeader()->getParent()); - auto &LibInfo = getAnalysis().getTLI(); + auto &LibInfo = getAnalysis().getTLI( + *L->getHeader()->getParent()); return ReduceLoopStrength(L, IU, SE, DT, LI, TTI, AC, LibInfo); } diff --git a/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp b/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp index 86891eb451bb..8d88be420314 100644 --- a/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp +++ b/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp @@ -166,7 +166,7 @@ static bool computeUnrollAndJamCount( bool UseUpperBound = false; bool ExplicitUnroll = computeUnrollCount( L, TTI, DT, LI, SE, EphValues, ORE, OuterTripCount, MaxTripCount, - OuterTripMultiple, OuterLoopSize, UP, UseUpperBound); + /*MaxOrZero*/ false, OuterTripMultiple, OuterLoopSize, UP, UseUpperBound); if (ExplicitUnroll || UseUpperBound) { // If the user explicitly set the loop as unrolled, dont UnJ it. Leave it // for the unroller instead. @@ -293,9 +293,9 @@ tryToUnrollAndJamLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, if (Latch != Exit || SubLoopLatch != SubLoopExit) return LoopUnrollResult::Unmodified; - TargetTransformInfo::UnrollingPreferences UP = gatherUnrollingPreferences( - L, SE, TTI, nullptr, nullptr, OptLevel, - None, None, None, None, None, None); + TargetTransformInfo::UnrollingPreferences UP = + gatherUnrollingPreferences(L, SE, TTI, nullptr, nullptr, OptLevel, None, + None, None, None, None, None, None, None); if (AllowUnrollAndJam.getNumOccurrences() > 0) UP.UnrollAndJam = AllowUnrollAndJam; if (UnrollAndJamThreshold.getNumOccurrences() > 0) diff --git a/lib/Transforms/Scalar/LoopUnrollPass.cpp b/lib/Transforms/Scalar/LoopUnrollPass.cpp index 2fa7436213dd..a6d4164c3645 100644 --- a/lib/Transforms/Scalar/LoopUnrollPass.cpp +++ b/lib/Transforms/Scalar/LoopUnrollPass.cpp @@ -178,7 +178,9 @@ TargetTransformInfo::UnrollingPreferences llvm::gatherUnrollingPreferences( BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, int OptLevel, Optional UserThreshold, Optional UserCount, Optional UserAllowPartial, Optional UserRuntime, - Optional UserUpperBound, Optional UserAllowPeeling) { + Optional UserUpperBound, Optional UserAllowPeeling, + Optional UserAllowProfileBasedPeeling, + Optional UserFullUnrollMaxCount) { TargetTransformInfo::UnrollingPreferences UP; // Set up the defaults @@ -202,6 +204,7 @@ TargetTransformInfo::UnrollingPreferences llvm::gatherUnrollingPreferences( UP.UpperBound = false; UP.AllowPeeling = true; UP.UnrollAndJam = false; + UP.PeelProfiledIterations = true; UP.UnrollAndJamInnerLoopThreshold = 60; // Override with any target specific settings @@ -257,6 +260,10 @@ TargetTransformInfo::UnrollingPreferences llvm::gatherUnrollingPreferences( UP.UpperBound = 
*UserUpperBound; if (UserAllowPeeling.hasValue()) UP.AllowPeeling = *UserAllowPeeling; + if (UserAllowProfileBasedPeeling.hasValue()) + UP.PeelProfiledIterations = *UserAllowProfileBasedPeeling; + if (UserFullUnrollMaxCount.hasValue()) + UP.FullUnrollMaxCount = *UserFullUnrollMaxCount; return UP; } @@ -730,7 +737,7 @@ bool llvm::computeUnrollCount( Loop *L, const TargetTransformInfo &TTI, DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE, const SmallPtrSetImpl &EphValues, OptimizationRemarkEmitter *ORE, unsigned &TripCount, unsigned MaxTripCount, - unsigned &TripMultiple, unsigned LoopSize, + bool MaxOrZero, unsigned &TripMultiple, unsigned LoopSize, TargetTransformInfo::UnrollingPreferences &UP, bool &UseUpperBound) { // Check for explicit Count. @@ -781,18 +788,34 @@ bool llvm::computeUnrollCount( // Also we need to check if we exceed FullUnrollMaxCount. // If using the upper bound to unroll, TripMultiple should be set to 1 because // we do not know when loop may exit. - // MaxTripCount and ExactTripCount cannot both be non zero since we only + + // We can unroll by the upper bound amount if it's generally allowed or if + // we know that the loop is executed either the upper bound or zero times. + // (MaxOrZero unrolling keeps only the first loop test, so the number of + // loop tests remains the same compared to the non-unrolled version, whereas + // the generic upper bound unrolling keeps all but the last loop test so the + // number of loop tests goes up which may end up being worse on targets with + // constrained branch predictor resources so is controlled by an option.) + // In addition we only unroll small upper bounds. + unsigned FullUnrollMaxTripCount = MaxTripCount; + if (!(UP.UpperBound || MaxOrZero) || + FullUnrollMaxTripCount > UnrollMaxUpperBound) + FullUnrollMaxTripCount = 0; + + // UnrollByMaxCount and ExactTripCount cannot both be non zero since we only // compute the former when the latter is zero. unsigned ExactTripCount = TripCount; - assert((ExactTripCount == 0 || MaxTripCount == 0) && - "ExtractTripCount and MaxTripCount cannot both be non zero."); - unsigned FullUnrollTripCount = ExactTripCount ? ExactTripCount : MaxTripCount; + assert((ExactTripCount == 0 || FullUnrollMaxTripCount == 0) && + "ExtractTripCount and UnrollByMaxCount cannot both be non zero."); + + unsigned FullUnrollTripCount = + ExactTripCount ? ExactTripCount : FullUnrollMaxTripCount; UP.Count = FullUnrollTripCount; if (FullUnrollTripCount && FullUnrollTripCount <= UP.FullUnrollMaxCount) { // When computing the unrolled size, note that BEInsns are not replicated // like the rest of the loop body. if (getUnrolledLoopSize(LoopSize, UP) < UP.Threshold) { - UseUpperBound = (MaxTripCount == FullUnrollTripCount); + UseUpperBound = (FullUnrollMaxTripCount == FullUnrollTripCount); TripCount = FullUnrollTripCount; TripMultiple = UP.UpperBound ? 1 : TripMultiple; return ExplicitUnroll; @@ -806,7 +829,7 @@ bool llvm::computeUnrollCount( unsigned Boost = getFullUnrollBoostingFactor(*Cost, UP.MaxPercentThresholdBoost); if (Cost->UnrolledCost < UP.Threshold * Boost / 100) { - UseUpperBound = (MaxTripCount == FullUnrollTripCount); + UseUpperBound = (FullUnrollMaxTripCount == FullUnrollTripCount); TripCount = FullUnrollTripCount; TripMultiple = UP.UpperBound ? 
1 : TripMultiple; return ExplicitUnroll; @@ -882,6 +905,8 @@ bool llvm::computeUnrollCount( "because " "unrolled size is too large."; }); + LLVM_DEBUG(dbgs() << " partially unrolling with count: " << UP.Count + << "\n"); return ExplicitUnroll; } assert(TripCount == 0 && @@ -903,6 +928,12 @@ bool llvm::computeUnrollCount( return false; } + // Don't unroll a small upper bound loop unless user or TTI asked to do so. + if (MaxTripCount && !UP.Force && MaxTripCount < UnrollMaxUpperBound) { + UP.Count = 0; + return false; + } + // Check if the runtime trip count is too small when profile is available. if (L->getHeader()->getParent()->hasProfileData()) { if (auto ProfileTripCount = getLoopEstimatedTripCount(L)) { @@ -966,7 +997,11 @@ bool llvm::computeUnrollCount( if (UP.Count > UP.MaxCount) UP.Count = UP.MaxCount; - LLVM_DEBUG(dbgs() << " partially unrolling with count: " << UP.Count + + if (MaxTripCount && UP.Count > MaxTripCount) + UP.Count = MaxTripCount; + + LLVM_DEBUG(dbgs() << " runtime unrolling with count: " << UP.Count << "\n"); if (UP.Count < 2) UP.Count = 0; @@ -976,13 +1011,14 @@ bool llvm::computeUnrollCount( static LoopUnrollResult tryToUnrollLoop( Loop *L, DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE, const TargetTransformInfo &TTI, AssumptionCache &AC, - OptimizationRemarkEmitter &ORE, - BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, - bool PreserveLCSSA, int OptLevel, + OptimizationRemarkEmitter &ORE, BlockFrequencyInfo *BFI, + ProfileSummaryInfo *PSI, bool PreserveLCSSA, int OptLevel, bool OnlyWhenForced, bool ForgetAllSCEV, Optional ProvidedCount, Optional ProvidedThreshold, Optional ProvidedAllowPartial, Optional ProvidedRuntime, Optional ProvidedUpperBound, - Optional ProvidedAllowPeeling) { + Optional ProvidedAllowPeeling, + Optional ProvidedAllowProfileBasedPeeling, + Optional ProvidedFullUnrollMaxCount) { LLVM_DEBUG(dbgs() << "Loop Unroll: F[" << L->getHeader()->getParent()->getName() << "] Loop %" << L->getHeader()->getName() << "\n"); @@ -1007,7 +1043,8 @@ static LoopUnrollResult tryToUnrollLoop( TargetTransformInfo::UnrollingPreferences UP = gatherUnrollingPreferences( L, SE, TTI, BFI, PSI, OptLevel, ProvidedThreshold, ProvidedCount, ProvidedAllowPartial, ProvidedRuntime, ProvidedUpperBound, - ProvidedAllowPeeling); + ProvidedAllowPeeling, ProvidedAllowProfileBasedPeeling, + ProvidedFullUnrollMaxCount); // Exit early if unrolling is disabled. For OptForSize, we pick the loop size // as threshold later on. @@ -1028,10 +1065,10 @@ static LoopUnrollResult tryToUnrollLoop( return LoopUnrollResult::Unmodified; } - // When optimizing for size, use LoopSize as threshold, to (fully) unroll - // loops, if it does not increase code size. + // When optimizing for size, use LoopSize + 1 as threshold (we use < Threshold + // later), to (fully) unroll loops, if it does not increase code size. if (OptForSize) - UP.Threshold = std::max(UP.Threshold, LoopSize); + UP.Threshold = std::max(UP.Threshold, LoopSize + 1); if (NumInlineCandidates != 0) { LLVM_DEBUG(dbgs() << " Not unrolling loop with inlinable calls.\n"); @@ -1040,7 +1077,6 @@ static LoopUnrollResult tryToUnrollLoop( // Find trip count and trip multiple if count is not available unsigned TripCount = 0; - unsigned MaxTripCount = 0; unsigned TripMultiple = 1; // If there are multiple exiting blocks but one of them is the latch, use the // latch for the trip count estimation. 
Otherwise insist on a single exiting @@ -1070,28 +1106,18 @@ static LoopUnrollResult tryToUnrollLoop( // Try to find the trip count upper bound if we cannot find the exact trip // count. + unsigned MaxTripCount = 0; bool MaxOrZero = false; if (!TripCount) { MaxTripCount = SE.getSmallConstantMaxTripCount(L); MaxOrZero = SE.isBackedgeTakenCountMaxOrZero(L); - // We can unroll by the upper bound amount if it's generally allowed or if - // we know that the loop is executed either the upper bound or zero times. - // (MaxOrZero unrolling keeps only the first loop test, so the number of - // loop tests remains the same compared to the non-unrolled version, whereas - // the generic upper bound unrolling keeps all but the last loop test so the - // number of loop tests goes up which may end up being worse on targets with - // constrained branch predictor resources so is controlled by an option.) - // In addition we only unroll small upper bounds. - if (!(UP.UpperBound || MaxOrZero) || MaxTripCount > UnrollMaxUpperBound) { - MaxTripCount = 0; - } } // computeUnrollCount() decides whether it is beneficial to use upper bound to // fully unroll the loop. bool UseUpperBound = false; bool IsCountSetExplicitly = computeUnrollCount( - L, TTI, DT, LI, SE, EphValues, &ORE, TripCount, MaxTripCount, + L, TTI, DT, LI, SE, EphValues, &ORE, TripCount, MaxTripCount, MaxOrZero, TripMultiple, LoopSize, UP, UseUpperBound); if (!UP.Count) return LoopUnrollResult::Unmodified; @@ -1139,7 +1165,7 @@ static LoopUnrollResult tryToUnrollLoop( // If the loop was peeled, we already "used up" the profile information // we had, so we don't want to unroll or peel again. if (UnrollResult != LoopUnrollResult::FullyUnrolled && - (IsCountSetExplicitly || UP.PeelCount)) + (IsCountSetExplicitly || (UP.PeelProfiledIterations && UP.PeelCount))) L->setLoopAlreadyUnrolled(); return UnrollResult; @@ -1169,18 +1195,24 @@ public: Optional ProvidedRuntime; Optional ProvidedUpperBound; Optional ProvidedAllowPeeling; + Optional ProvidedAllowProfileBasedPeeling; + Optional ProvidedFullUnrollMaxCount; LoopUnroll(int OptLevel = 2, bool OnlyWhenForced = false, bool ForgetAllSCEV = false, Optional Threshold = None, Optional Count = None, Optional AllowPartial = None, Optional Runtime = None, Optional UpperBound = None, - Optional AllowPeeling = None) + Optional AllowPeeling = None, + Optional AllowProfileBasedPeeling = None, + Optional ProvidedFullUnrollMaxCount = None) : LoopPass(ID), OptLevel(OptLevel), OnlyWhenForced(OnlyWhenForced), ForgetAllSCEV(ForgetAllSCEV), ProvidedCount(std::move(Count)), ProvidedThreshold(Threshold), ProvidedAllowPartial(AllowPartial), ProvidedRuntime(Runtime), ProvidedUpperBound(UpperBound), - ProvidedAllowPeeling(AllowPeeling) { + ProvidedAllowPeeling(AllowPeeling), + ProvidedAllowProfileBasedPeeling(AllowProfileBasedPeeling), + ProvidedFullUnrollMaxCount(ProvidedFullUnrollMaxCount) { initializeLoopUnrollPass(*PassRegistry::getPassRegistry()); } @@ -1203,10 +1235,11 @@ public: bool PreserveLCSSA = mustPreserveAnalysisID(LCSSAID); LoopUnrollResult Result = tryToUnrollLoop( - L, DT, LI, SE, TTI, AC, ORE, nullptr, nullptr, - PreserveLCSSA, OptLevel, OnlyWhenForced, - ForgetAllSCEV, ProvidedCount, ProvidedThreshold, ProvidedAllowPartial, - ProvidedRuntime, ProvidedUpperBound, ProvidedAllowPeeling); + L, DT, LI, SE, TTI, AC, ORE, nullptr, nullptr, PreserveLCSSA, OptLevel, + OnlyWhenForced, ForgetAllSCEV, ProvidedCount, ProvidedThreshold, + ProvidedAllowPartial, ProvidedRuntime, ProvidedUpperBound, + ProvidedAllowPeeling, 
ProvidedAllowProfileBasedPeeling, + ProvidedFullUnrollMaxCount); if (Result == LoopUnrollResult::FullyUnrolled) LPM.markLoopAsDeleted(*L); @@ -1283,14 +1316,16 @@ PreservedAnalyses LoopFullUnrollPass::run(Loop &L, LoopAnalysisManager &AM, std::string LoopName = L.getName(); - bool Changed = - tryToUnrollLoop(&L, AR.DT, &AR.LI, AR.SE, AR.TTI, AR.AC, *ORE, - /*BFI*/ nullptr, /*PSI*/ nullptr, - /*PreserveLCSSA*/ true, OptLevel, OnlyWhenForced, - ForgetSCEV, /*Count*/ None, - /*Threshold*/ None, /*AllowPartial*/ false, - /*Runtime*/ false, /*UpperBound*/ false, - /*AllowPeeling*/ false) != LoopUnrollResult::Unmodified; + bool Changed = tryToUnrollLoop(&L, AR.DT, &AR.LI, AR.SE, AR.TTI, AR.AC, *ORE, + /*BFI*/ nullptr, /*PSI*/ nullptr, + /*PreserveLCSSA*/ true, OptLevel, + OnlyWhenForced, ForgetSCEV, /*Count*/ None, + /*Threshold*/ None, /*AllowPartial*/ false, + /*Runtime*/ false, /*UpperBound*/ false, + /*AllowPeeling*/ false, + /*AllowProfileBasedPeeling*/ false, + /*FullUnrollMaxCount*/ None) != + LoopUnrollResult::Unmodified; if (!Changed) return PreservedAnalyses::all(); @@ -1430,7 +1465,8 @@ PreservedAnalyses LoopUnrollPass::run(Function &F, /*PreserveLCSSA*/ true, UnrollOpts.OptLevel, UnrollOpts.OnlyWhenForced, UnrollOpts.ForgetSCEV, /*Count*/ None, /*Threshold*/ None, UnrollOpts.AllowPartial, UnrollOpts.AllowRuntime, - UnrollOpts.AllowUpperBound, LocalAllowPeeling); + UnrollOpts.AllowUpperBound, LocalAllowPeeling, + UnrollOpts.AllowProfileBasedPeeling, UnrollOpts.FullUnrollMaxCount); Changed |= Result != LoopUnrollResult::Unmodified; // The parent must not be damaged by unrolling! diff --git a/lib/Transforms/Scalar/LoopUnswitch.cpp b/lib/Transforms/Scalar/LoopUnswitch.cpp index b5b8e720069c..b410df0c5f68 100644 --- a/lib/Transforms/Scalar/LoopUnswitch.cpp +++ b/lib/Transforms/Scalar/LoopUnswitch.cpp @@ -420,7 +420,8 @@ enum OperatorChain { /// cost of creating an entirely new loop. static Value *FindLIVLoopCondition(Value *Cond, Loop *L, bool &Changed, OperatorChain &ParentChain, - DenseMap &Cache) { + DenseMap &Cache, + MemorySSAUpdater *MSSAU) { auto CacheIt = Cache.find(Cond); if (CacheIt != Cache.end()) return CacheIt->second; @@ -438,7 +439,7 @@ static Value *FindLIVLoopCondition(Value *Cond, Loop *L, bool &Changed, // TODO: Handle: br (VARIANT|INVARIANT). // Hoist simple values out. - if (L->makeLoopInvariant(Cond, Changed)) { + if (L->makeLoopInvariant(Cond, Changed, nullptr, MSSAU)) { Cache[Cond] = Cond; return Cond; } @@ -478,7 +479,7 @@ static Value *FindLIVLoopCondition(Value *Cond, Loop *L, bool &Changed, // which will cause the branch to go away in one loop and the condition to // simplify in the other one. if (Value *LHS = FindLIVLoopCondition(BO->getOperand(0), L, Changed, - ParentChain, Cache)) { + ParentChain, Cache, MSSAU)) { Cache[Cond] = LHS; return LHS; } @@ -486,7 +487,7 @@ static Value *FindLIVLoopCondition(Value *Cond, Loop *L, bool &Changed, // operand(1). ParentChain = NewChain; if (Value *RHS = FindLIVLoopCondition(BO->getOperand(1), L, Changed, - ParentChain, Cache)) { + ParentChain, Cache, MSSAU)) { Cache[Cond] = RHS; return RHS; } @@ -500,12 +501,12 @@ static Value *FindLIVLoopCondition(Value *Cond, Loop *L, bool &Changed, /// Cond is a condition that occurs in L. If it is invariant in the loop, or has /// an invariant piece, return the invariant along with the operator chain type. /// Otherwise, return null. 
-static std::pair FindLIVLoopCondition(Value *Cond, - Loop *L, - bool &Changed) { +static std::pair +FindLIVLoopCondition(Value *Cond, Loop *L, bool &Changed, + MemorySSAUpdater *MSSAU) { DenseMap Cache; OperatorChain OpChain = OC_OpChainNone; - Value *FCond = FindLIVLoopCondition(Cond, L, Changed, OpChain, Cache); + Value *FCond = FindLIVLoopCondition(Cond, L, Changed, OpChain, Cache, MSSAU); // In case we do find a LIV, it can not be obtained by walking up a mixed // operator chain. @@ -525,7 +526,7 @@ bool LoopUnswitch::runOnLoop(Loop *L, LPPassManager &LPM_Ref) { DT = &getAnalysis().getDomTree(); if (EnableMSSALoopDependency) { MSSA = &getAnalysis().getMSSA(); - MSSAU = make_unique(MSSA); + MSSAU = std::make_unique(MSSA); assert(DT && "Cannot update MemorySSA without a valid DomTree."); } currentLoop = L; @@ -694,8 +695,9 @@ bool LoopUnswitch::processCurrentLoop() { } for (IntrinsicInst *Guard : Guards) { - Value *LoopCond = - FindLIVLoopCondition(Guard->getOperand(0), currentLoop, Changed).first; + Value *LoopCond = FindLIVLoopCondition(Guard->getOperand(0), currentLoop, + Changed, MSSAU.get()) + .first; if (LoopCond && UnswitchIfProfitable(LoopCond, ConstantInt::getTrue(Context))) { // NB! Unswitching (if successful) could have erased some of the @@ -735,8 +737,9 @@ bool LoopUnswitch::processCurrentLoop() { if (BI->isConditional()) { // See if this, or some part of it, is loop invariant. If so, we can // unswitch on it if we desire. - Value *LoopCond = FindLIVLoopCondition(BI->getCondition(), - currentLoop, Changed).first; + Value *LoopCond = FindLIVLoopCondition(BI->getCondition(), currentLoop, + Changed, MSSAU.get()) + .first; if (LoopCond && !EqualityPropUnSafe(*LoopCond) && UnswitchIfProfitable(LoopCond, ConstantInt::getTrue(Context), TI)) { ++NumBranches; @@ -748,7 +751,7 @@ bool LoopUnswitch::processCurrentLoop() { Value *LoopCond; OperatorChain OpChain; std::tie(LoopCond, OpChain) = - FindLIVLoopCondition(SC, currentLoop, Changed); + FindLIVLoopCondition(SC, currentLoop, Changed, MSSAU.get()); unsigned NumCases = SI->getNumCases(); if (LoopCond && NumCases) { @@ -808,8 +811,9 @@ bool LoopUnswitch::processCurrentLoop() { for (BasicBlock::iterator BBI = (*I)->begin(), E = (*I)->end(); BBI != E; ++BBI) if (SelectInst *SI = dyn_cast(BBI)) { - Value *LoopCond = FindLIVLoopCondition(SI->getCondition(), - currentLoop, Changed).first; + Value *LoopCond = FindLIVLoopCondition(SI->getCondition(), currentLoop, + Changed, MSSAU.get()) + .first; if (LoopCond && UnswitchIfProfitable(LoopCond, ConstantInt::getTrue(Context))) { ++NumSelects; @@ -1123,8 +1127,9 @@ bool LoopUnswitch::TryTrivialLoopUnswitch(bool &Changed) { if (!BI->isConditional()) return false; - Value *LoopCond = FindLIVLoopCondition(BI->getCondition(), - currentLoop, Changed).first; + Value *LoopCond = FindLIVLoopCondition(BI->getCondition(), currentLoop, + Changed, MSSAU.get()) + .first; // Unswitch only if the trivial condition itself is an LIV (not // partial LIV which could occur in and/or) @@ -1157,8 +1162,9 @@ bool LoopUnswitch::TryTrivialLoopUnswitch(bool &Changed) { return true; } else if (SwitchInst *SI = dyn_cast(CurrentTerm)) { // If this isn't switching on an invariant condition, we can't unswitch it. 
- Value *LoopCond = FindLIVLoopCondition(SI->getCondition(), - currentLoop, Changed).first; + Value *LoopCond = FindLIVLoopCondition(SI->getCondition(), currentLoop, + Changed, MSSAU.get()) + .first; // Unswitch only if the trivial condition itself is an LIV (not // partial LIV which could occur in and/or) @@ -1240,6 +1246,9 @@ void LoopUnswitch::UnswitchNontrivialCondition(Value *LIC, Constant *Val, LoopBlocks.clear(); NewBlocks.clear(); + if (MSSAU && VerifyMemorySSA) + MSSA->verifyMemorySSA(); + // First step, split the preheader and exit blocks, and add these blocks to // the LoopBlocks list. BasicBlock *NewPreheader = @@ -1607,36 +1616,30 @@ void LoopUnswitch::SimplifyCode(std::vector &Worklist, Loop *L) { // If BI's parent is the only pred of the successor, fold the two blocks // together. BasicBlock *Pred = BI->getParent(); + (void)Pred; BasicBlock *Succ = BI->getSuccessor(0); BasicBlock *SinglePred = Succ->getSinglePredecessor(); if (!SinglePred) continue; // Nothing to do. assert(SinglePred == Pred && "CFG broken"); - LLVM_DEBUG(dbgs() << "Merging blocks: " << Pred->getName() << " <- " - << Succ->getName() << "\n"); - - // Resolve any single entry PHI nodes in Succ. - while (PHINode *PN = dyn_cast(Succ->begin())) - ReplaceUsesOfWith(PN, PN->getIncomingValue(0), Worklist, L, LPM, - MSSAU.get()); - - // If Succ has any successors with PHI nodes, update them to have - // entries coming from Pred instead of Succ. - Succ->replaceAllUsesWith(Pred); - - // Move all of the successor contents from Succ to Pred. - Pred->getInstList().splice(BI->getIterator(), Succ->getInstList(), - Succ->begin(), Succ->end()); - if (MSSAU) - MSSAU->moveAllAfterMergeBlocks(Succ, Pred, BI); + // Make the LPM and Worklist updates specific to LoopUnswitch. LPM->deleteSimpleAnalysisValue(BI, L); RemoveFromWorklist(BI, Worklist); - BI->eraseFromParent(); - - // Remove Succ from the loop tree. - LI->removeBlock(Succ); LPM->deleteSimpleAnalysisValue(Succ, L); - Succ->eraseFromParent(); + auto SuccIt = Succ->begin(); + while (PHINode *PN = dyn_cast(SuccIt++)) { + for (unsigned It = 0, E = PN->getNumOperands(); It != E; ++It) + if (Instruction *Use = dyn_cast(PN->getOperand(It))) + Worklist.push_back(Use); + for (User *U : PN->users()) + Worklist.push_back(cast(U)); + LPM->deleteSimpleAnalysisValue(PN, L); + RemoveFromWorklist(PN, Worklist); + ++NumSimplify; + } + // Merge the block and make the remaining analyses updates. + DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager); + MergeBlockIntoPredecessor(Succ, &DTU, LI, MSSAU.get()); ++NumSimplify; continue; } diff --git a/lib/Transforms/Scalar/LoopVersioningLICM.cpp b/lib/Transforms/Scalar/LoopVersioningLICM.cpp index 896dd8bcb922..2ccb7cae3079 100644 --- a/lib/Transforms/Scalar/LoopVersioningLICM.cpp +++ b/lib/Transforms/Scalar/LoopVersioningLICM.cpp @@ -112,37 +112,6 @@ static cl::opt LVLoopDepthThreshold( "LoopVersioningLICM's threshold for maximum allowed loop nest/depth"), cl::init(2), cl::Hidden); -/// Create MDNode for input string. -static MDNode *createStringMetadata(Loop *TheLoop, StringRef Name, unsigned V) { - LLVMContext &Context = TheLoop->getHeader()->getContext(); - Metadata *MDs[] = { - MDString::get(Context, Name), - ConstantAsMetadata::get(ConstantInt::get(Type::getInt32Ty(Context), V))}; - return MDNode::get(Context, MDs); -} - -/// Set input string into loop metadata by keeping other values intact. 
-void llvm::addStringMetadataToLoop(Loop *TheLoop, const char *MDString, - unsigned V) { - SmallVector MDs(1); - // If the loop already has metadata, retain it. - MDNode *LoopID = TheLoop->getLoopID(); - if (LoopID) { - for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { - MDNode *Node = cast(LoopID->getOperand(i)); - MDs.push_back(Node); - } - } - // Add new metadata. - MDs.push_back(createStringMetadata(TheLoop, MDString, V)); - // Replace current metadata node with new one. - LLVMContext &Context = TheLoop->getHeader()->getContext(); - MDNode *NewLoopID = MDNode::get(Context, MDs); - // Set operand 0 to refer to the loop id itself. - NewLoopID->replaceOperandWith(0, NewLoopID); - TheLoop->setLoopID(NewLoopID); -} - namespace { struct LoopVersioningLICM : public LoopPass { diff --git a/lib/Transforms/Scalar/LowerConstantIntrinsics.cpp b/lib/Transforms/Scalar/LowerConstantIntrinsics.cpp new file mode 100644 index 000000000000..d0fcf38b5a7b --- /dev/null +++ b/lib/Transforms/Scalar/LowerConstantIntrinsics.cpp @@ -0,0 +1,170 @@ +//===- LowerConstantIntrinsics.cpp - Lower constant intrinsic calls -------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This pass lowers all remaining 'objectsize' 'is.constant' intrinsic calls +// and provides constant propagation and basic CFG cleanup on the result. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Scalar/LowerConstantIntrinsics.h" +#include "llvm/ADT/PostOrderIterator.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/InstructionSimplify.h" +#include "llvm/Analysis/MemoryBuiltins.h" +#include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/PatternMatch.h" +#include "llvm/Pass.h" +#include "llvm/Support/Debug.h" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/Local.h" + +using namespace llvm; +using namespace llvm::PatternMatch; + +#define DEBUG_TYPE "lower-is-constant-intrinsic" + +STATISTIC(IsConstantIntrinsicsHandled, + "Number of 'is.constant' intrinsic calls handled"); +STATISTIC(ObjectSizeIntrinsicsHandled, + "Number of 'objectsize' intrinsic calls handled"); + +static Value *lowerIsConstantIntrinsic(IntrinsicInst *II) { + Value *Op = II->getOperand(0); + + return isa(Op) ? 
ConstantInt::getTrue(II->getType()) + : ConstantInt::getFalse(II->getType()); +} + +static bool replaceConditionalBranchesOnConstant(Instruction *II, + Value *NewValue) { + bool HasDeadBlocks = false; + SmallSetVector Worklist; + replaceAndRecursivelySimplify(II, NewValue, nullptr, nullptr, nullptr, + &Worklist); + for (auto I : Worklist) { + BranchInst *BI = dyn_cast(I); + if (!BI) + continue; + if (BI->isUnconditional()) + continue; + + BasicBlock *Target, *Other; + if (match(BI->getOperand(0), m_Zero())) { + Target = BI->getSuccessor(1); + Other = BI->getSuccessor(0); + } else if (match(BI->getOperand(0), m_One())) { + Target = BI->getSuccessor(0); + Other = BI->getSuccessor(1); + } else { + Target = nullptr; + Other = nullptr; + } + if (Target && Target != Other) { + BasicBlock *Source = BI->getParent(); + Other->removePredecessor(Source); + BI->eraseFromParent(); + BranchInst::Create(Target, Source); + if (pred_begin(Other) == pred_end(Other)) + HasDeadBlocks = true; + } + } + return HasDeadBlocks; +} + +static bool lowerConstantIntrinsics(Function &F, const TargetLibraryInfo *TLI) { + bool HasDeadBlocks = false; + const auto &DL = F.getParent()->getDataLayout(); + SmallVector Worklist; + + ReversePostOrderTraversal RPOT(&F); + for (BasicBlock *BB : RPOT) { + for (Instruction &I: *BB) { + IntrinsicInst *II = dyn_cast(&I); + if (!II) + continue; + switch (II->getIntrinsicID()) { + default: + break; + case Intrinsic::is_constant: + case Intrinsic::objectsize: + Worklist.push_back(WeakTrackingVH(&I)); + break; + } + } + } + for (WeakTrackingVH &VH: Worklist) { + // Items on the worklist can be mutated by earlier recursive replaces. + // This can remove the intrinsic as dead (VH == null), but also replace + // the intrinsic in place. + if (!VH) + continue; + IntrinsicInst *II = dyn_cast(&*VH); + if (!II) + continue; + Value *NewValue; + switch (II->getIntrinsicID()) { + default: + continue; + case Intrinsic::is_constant: + NewValue = lowerIsConstantIntrinsic(II); + IsConstantIntrinsicsHandled++; + break; + case Intrinsic::objectsize: + NewValue = lowerObjectSizeCall(II, DL, TLI, true); + ObjectSizeIntrinsicsHandled++; + break; + } + HasDeadBlocks |= replaceConditionalBranchesOnConstant(II, NewValue); + } + if (HasDeadBlocks) + removeUnreachableBlocks(F); + return !Worklist.empty(); +} + +PreservedAnalyses +LowerConstantIntrinsicsPass::run(Function &F, FunctionAnalysisManager &AM) { + if (lowerConstantIntrinsics(F, AM.getCachedResult(F))) + return PreservedAnalyses::none(); + + return PreservedAnalyses::all(); +} + +namespace { +/// Legacy pass for lowering is.constant intrinsics out of the IR. +/// +/// When this pass is run over a function it converts is.constant intrinsics +/// into 'true' or 'false'. This is completements the normal constand folding +/// to 'true' as part of Instruction Simplify passes. +class LowerConstantIntrinsics : public FunctionPass { +public: + static char ID; + LowerConstantIntrinsics() : FunctionPass(ID) { + initializeLowerConstantIntrinsicsPass(*PassRegistry::getPassRegistry()); + } + + bool runOnFunction(Function &F) override { + auto *TLIP = getAnalysisIfAvailable(); + const TargetLibraryInfo *TLI = TLIP ? 
&TLIP->getTLI(F) : nullptr; + return lowerConstantIntrinsics(F, TLI); + } +}; +} // namespace + +char LowerConstantIntrinsics::ID = 0; +INITIALIZE_PASS(LowerConstantIntrinsics, "lower-constant-intrinsics", + "Lower constant intrinsics", false, false) + +FunctionPass *llvm::createLowerConstantIntrinsicsPass() { + return new LowerConstantIntrinsics(); +} diff --git a/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp b/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp index 0d67c0d740ec..d85f20b3f80c 100644 --- a/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp +++ b/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp @@ -26,6 +26,7 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/MisExpect.h" using namespace llvm; @@ -71,15 +72,20 @@ static bool handleSwitchExpect(SwitchInst &SI) { unsigned n = SI.getNumCases(); // +1 for default case. SmallVector Weights(n + 1, UnlikelyBranchWeight); - if (Case == *SI.case_default()) - Weights[0] = LikelyBranchWeight; - else - Weights[Case.getCaseIndex() + 1] = LikelyBranchWeight; + uint64_t Index = (Case == *SI.case_default()) ? 0 : Case.getCaseIndex() + 1; + Weights[Index] = LikelyBranchWeight; + + SI.setMetadata( + LLVMContext::MD_misexpect, + MDBuilder(CI->getContext()) + .createMisExpect(Index, LikelyBranchWeight, UnlikelyBranchWeight)); + + SI.setCondition(ArgValue); + misexpect::checkFrontendInstrumentation(SI); SI.setMetadata(LLVMContext::MD_prof, MDBuilder(CI->getContext()).createBranchWeights(Weights)); - SI.setCondition(ArgValue); return true; } @@ -155,7 +161,7 @@ static void handlePhiDef(CallInst *Expect) { return Result; }; - auto *PhiDef = dyn_cast(V); + auto *PhiDef = cast(V); // Get the first dominating conditional branch of the operand // i's incoming block. @@ -280,19 +286,28 @@ template static bool handleBrSelExpect(BrSelInst &BSI) { MDBuilder MDB(CI->getContext()); MDNode *Node; + MDNode *ExpNode; if ((ExpectedValue->getZExtValue() == ValueComparedTo) == - (Predicate == CmpInst::ICMP_EQ)) + (Predicate == CmpInst::ICMP_EQ)) { Node = MDB.createBranchWeights(LikelyBranchWeight, UnlikelyBranchWeight); - else + ExpNode = MDB.createMisExpect(0, LikelyBranchWeight, UnlikelyBranchWeight); + } else { Node = MDB.createBranchWeights(UnlikelyBranchWeight, LikelyBranchWeight); + ExpNode = MDB.createMisExpect(1, LikelyBranchWeight, UnlikelyBranchWeight); + } - BSI.setMetadata(LLVMContext::MD_prof, Node); + BSI.setMetadata(LLVMContext::MD_misexpect, ExpNode); if (CmpI) CmpI->setOperand(0, ArgValue); else BSI.setCondition(ArgValue); + + misexpect::checkFrontendInstrumentation(BSI); + + BSI.setMetadata(LLVMContext::MD_prof, Node); + return true; } diff --git a/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/lib/Transforms/Scalar/MemCpyOptimizer.cpp index 5a055139be4f..2364748efb05 100644 --- a/lib/Transforms/Scalar/MemCpyOptimizer.cpp +++ b/lib/Transforms/Scalar/MemCpyOptimizer.cpp @@ -69,90 +69,6 @@ STATISTIC(NumMemSetInfer, "Number of memsets inferred"); STATISTIC(NumMoveToCpy, "Number of memmoves converted to memcpy"); STATISTIC(NumCpyToSet, "Number of memcpys converted to memset"); -static int64_t GetOffsetFromIndex(const GEPOperator *GEP, unsigned Idx, - bool &VariableIdxFound, - const DataLayout &DL) { - // Skip over the first indices. - gep_type_iterator GTI = gep_type_begin(GEP); - for (unsigned i = 1; i != Idx; ++i, ++GTI) - /*skip along*/; - - // Compute the offset implied by the rest of the indices. 
- int64_t Offset = 0; - for (unsigned i = Idx, e = GEP->getNumOperands(); i != e; ++i, ++GTI) { - ConstantInt *OpC = dyn_cast(GEP->getOperand(i)); - if (!OpC) - return VariableIdxFound = true; - if (OpC->isZero()) continue; // No offset. - - // Handle struct indices, which add their field offset to the pointer. - if (StructType *STy = GTI.getStructTypeOrNull()) { - Offset += DL.getStructLayout(STy)->getElementOffset(OpC->getZExtValue()); - continue; - } - - // Otherwise, we have a sequential type like an array or vector. Multiply - // the index by the ElementSize. - uint64_t Size = DL.getTypeAllocSize(GTI.getIndexedType()); - Offset += Size*OpC->getSExtValue(); - } - - return Offset; -} - -/// Return true if Ptr1 is provably equal to Ptr2 plus a constant offset, and -/// return that constant offset. For example, Ptr1 might be &A[42], and Ptr2 -/// might be &A[40]. In this case offset would be -8. -static bool IsPointerOffset(Value *Ptr1, Value *Ptr2, int64_t &Offset, - const DataLayout &DL) { - Ptr1 = Ptr1->stripPointerCasts(); - Ptr2 = Ptr2->stripPointerCasts(); - - // Handle the trivial case first. - if (Ptr1 == Ptr2) { - Offset = 0; - return true; - } - - GEPOperator *GEP1 = dyn_cast(Ptr1); - GEPOperator *GEP2 = dyn_cast(Ptr2); - - bool VariableIdxFound = false; - - // If one pointer is a GEP and the other isn't, then see if the GEP is a - // constant offset from the base, as in "P" and "gep P, 1". - if (GEP1 && !GEP2 && GEP1->getOperand(0)->stripPointerCasts() == Ptr2) { - Offset = -GetOffsetFromIndex(GEP1, 1, VariableIdxFound, DL); - return !VariableIdxFound; - } - - if (GEP2 && !GEP1 && GEP2->getOperand(0)->stripPointerCasts() == Ptr1) { - Offset = GetOffsetFromIndex(GEP2, 1, VariableIdxFound, DL); - return !VariableIdxFound; - } - - // Right now we handle the case when Ptr1/Ptr2 are both GEPs with an identical - // base. After that base, they may have some number of common (and - // potentially variable) indices. After that they handle some constant - // offset, which determines their offset from each other. At this point, we - // handle no other case. - if (!GEP1 || !GEP2 || GEP1->getOperand(0) != GEP2->getOperand(0)) - return false; - - // Skip any common indices and track the GEP types. - unsigned Idx = 1; - for (; Idx != GEP1->getNumOperands() && Idx != GEP2->getNumOperands(); ++Idx) - if (GEP1->getOperand(Idx) != GEP2->getOperand(Idx)) - break; - - int64_t Offset1 = GetOffsetFromIndex(GEP1, Idx, VariableIdxFound, DL); - int64_t Offset2 = GetOffsetFromIndex(GEP2, Idx, VariableIdxFound, DL); - if (VariableIdxFound) return false; - - Offset = Offset2-Offset1; - return true; -} - namespace { /// Represents a range of memset'd bytes with the ByteVal value. @@ -419,12 +335,12 @@ Instruction *MemCpyOptPass::tryMergingIntoMemset(Instruction *StartInst, break; // Check to see if this store is to a constant offset from the start ptr. - int64_t Offset; - if (!IsPointerOffset(StartPtr, NextStore->getPointerOperand(), Offset, - DL)) + Optional Offset = + isPointerOffset(StartPtr, NextStore->getPointerOperand(), DL); + if (!Offset) break; - Ranges.addStore(Offset, NextStore); + Ranges.addStore(*Offset, NextStore); } else { MemSetInst *MSI = cast(BI); @@ -433,11 +349,11 @@ Instruction *MemCpyOptPass::tryMergingIntoMemset(Instruction *StartInst, break; // Check to see if this store is to a constant offset from the start ptr. 
- int64_t Offset; - if (!IsPointerOffset(StartPtr, MSI->getDest(), Offset, DL)) + Optional Offset = isPointerOffset(StartPtr, MSI->getDest(), DL); + if (!Offset) break; - Ranges.addMemSet(Offset, MSI); + Ranges.addMemSet(*Offset, MSI); } } @@ -597,9 +513,13 @@ static bool moveUp(AliasAnalysis &AA, StoreInst *SI, Instruction *P, ToLift.push_back(C); for (unsigned k = 0, e = C->getNumOperands(); k != e; ++k) - if (auto *A = dyn_cast(C->getOperand(k))) - if (A->getParent() == SI->getParent()) + if (auto *A = dyn_cast(C->getOperand(k))) { + if (A->getParent() == SI->getParent()) { + // Cannot hoist user of P above P + if(A == P) return false; Args.insert(A); + } + } } // We made it, we need to lift @@ -979,7 +899,7 @@ bool MemCpyOptPass::performCallSlotOptzn(Instruction *cpy, Value *cpyDest, // If the destination wasn't sufficiently aligned then increase its alignment. if (!isDestSufficientlyAligned) { assert(isa(cpyDest) && "Can only increase alloca alignment!"); - cast(cpyDest)->setAlignment(srcAlign); + cast(cpyDest)->setAlignment(MaybeAlign(srcAlign)); } // Drop any cached information about the call, because we may have changed @@ -1516,7 +1436,7 @@ bool MemCpyOptLegacyPass::runOnFunction(Function &F) { return false; auto *MD = &getAnalysis().getMemDep(); - auto *TLI = &getAnalysis().getTLI(); + auto *TLI = &getAnalysis().getTLI(F); auto LookupAliasAnalysis = [this]() -> AliasAnalysis & { return getAnalysis().getAAResults(); diff --git a/lib/Transforms/Scalar/MergeICmps.cpp b/lib/Transforms/Scalar/MergeICmps.cpp index 3d047a193267..98a45b391319 100644 --- a/lib/Transforms/Scalar/MergeICmps.cpp +++ b/lib/Transforms/Scalar/MergeICmps.cpp @@ -897,7 +897,7 @@ public: bool runOnFunction(Function &F) override { if (skipFunction(F)) return false; - const auto &TLI = getAnalysis().getTLI(); + const auto &TLI = getAnalysis().getTLI(F); const auto &TTI = getAnalysis().getTTI(F); // MergeICmps does not need the DominatorTree, but we update it if it's // already available. diff --git a/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp b/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp index 30645f4400e3..9799ea7960ec 100644 --- a/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp +++ b/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp @@ -14,9 +14,11 @@ // diamond (hammock) and merges them into a single load in the header. Similar // it sinks and merges two stores to the tail block (footer). The algorithm // iterates over the instructions of one side of the diamond and attempts to -// find a matching load/store on the other side. It hoists / sinks when it -// thinks it safe to do so. This optimization helps with eg. hiding load -// latencies, triggering if-conversion, and reducing static code size. +// find a matching load/store on the other side. New tail/footer block may be +// insterted if the tail/footer block has more predecessors (not only the two +// predecessors that are forming the diamond). It hoists / sinks when it thinks +// it safe to do so. This optimization helps with eg. hiding load latencies, +// triggering if-conversion, and reducing static code size. // // NOTE: This code no longer performs load hoisting, it is subsumed by GVNHoist. // @@ -103,7 +105,9 @@ class MergedLoadStoreMotion { // Control is enforced by the check Size0 * Size1 < MagicCompileTimeControl. 
const int MagicCompileTimeControl = 250; + const bool SplitFooterBB; public: + MergedLoadStoreMotion(bool SplitFooterBB) : SplitFooterBB(SplitFooterBB) {} bool run(Function &F, AliasAnalysis &AA); private: @@ -114,7 +118,9 @@ private: PHINode *getPHIOperand(BasicBlock *BB, StoreInst *S0, StoreInst *S1); bool isStoreSinkBarrierInRange(const Instruction &Start, const Instruction &End, MemoryLocation Loc); - bool sinkStore(BasicBlock *BB, StoreInst *SinkCand, StoreInst *ElseInst); + bool canSinkStoresAndGEPs(StoreInst *S0, StoreInst *S1) const; + void sinkStoresAndGEPs(BasicBlock *BB, StoreInst *SinkCand, + StoreInst *ElseInst); bool mergeStores(BasicBlock *BB); }; } // end anonymous namespace @@ -216,75 +222,83 @@ PHINode *MergedLoadStoreMotion::getPHIOperand(BasicBlock *BB, StoreInst *S0, return NewPN; } +/// +/// Check if 2 stores can be sunk together with corresponding GEPs +/// +bool MergedLoadStoreMotion::canSinkStoresAndGEPs(StoreInst *S0, + StoreInst *S1) const { + auto *A0 = dyn_cast(S0->getPointerOperand()); + auto *A1 = dyn_cast(S1->getPointerOperand()); + return A0 && A1 && A0->isIdenticalTo(A1) && A0->hasOneUse() && + (A0->getParent() == S0->getParent()) && A1->hasOneUse() && + (A1->getParent() == S1->getParent()) && isa(A0); +} + /// /// Merge two stores to same address and sink into \p BB /// /// Also sinks GEP instruction computing the store address /// -bool MergedLoadStoreMotion::sinkStore(BasicBlock *BB, StoreInst *S0, - StoreInst *S1) { +void MergedLoadStoreMotion::sinkStoresAndGEPs(BasicBlock *BB, StoreInst *S0, + StoreInst *S1) { // Only one definition? auto *A0 = dyn_cast(S0->getPointerOperand()); auto *A1 = dyn_cast(S1->getPointerOperand()); - if (A0 && A1 && A0->isIdenticalTo(A1) && A0->hasOneUse() && - (A0->getParent() == S0->getParent()) && A1->hasOneUse() && - (A1->getParent() == S1->getParent()) && isa(A0)) { - LLVM_DEBUG(dbgs() << "Sink Instruction into BB \n"; BB->dump(); - dbgs() << "Instruction Left\n"; S0->dump(); dbgs() << "\n"; - dbgs() << "Instruction Right\n"; S1->dump(); dbgs() << "\n"); - // Hoist the instruction. - BasicBlock::iterator InsertPt = BB->getFirstInsertionPt(); - // Intersect optional metadata. - S0->andIRFlags(S1); - S0->dropUnknownNonDebugMetadata(); - - // Create the new store to be inserted at the join point. - StoreInst *SNew = cast(S0->clone()); - Instruction *ANew = A0->clone(); - SNew->insertBefore(&*InsertPt); - ANew->insertBefore(SNew); - - assert(S0->getParent() == A0->getParent()); - assert(S1->getParent() == A1->getParent()); - - // New PHI operand? Use it. - if (PHINode *NewPN = getPHIOperand(BB, S0, S1)) - SNew->setOperand(0, NewPN); - S0->eraseFromParent(); - S1->eraseFromParent(); - A0->replaceAllUsesWith(ANew); - A0->eraseFromParent(); - A1->replaceAllUsesWith(ANew); - A1->eraseFromParent(); - return true; - } - return false; + LLVM_DEBUG(dbgs() << "Sink Instruction into BB \n"; BB->dump(); + dbgs() << "Instruction Left\n"; S0->dump(); dbgs() << "\n"; + dbgs() << "Instruction Right\n"; S1->dump(); dbgs() << "\n"); + // Hoist the instruction. + BasicBlock::iterator InsertPt = BB->getFirstInsertionPt(); + // Intersect optional metadata. + S0->andIRFlags(S1); + S0->dropUnknownNonDebugMetadata(); + + // Create the new store to be inserted at the join point. + StoreInst *SNew = cast(S0->clone()); + Instruction *ANew = A0->clone(); + SNew->insertBefore(&*InsertPt); + ANew->insertBefore(SNew); + + assert(S0->getParent() == A0->getParent()); + assert(S1->getParent() == A1->getParent()); + + // New PHI operand? Use it. 
+ if (PHINode *NewPN = getPHIOperand(BB, S0, S1)) + SNew->setOperand(0, NewPN); + S0->eraseFromParent(); + S1->eraseFromParent(); + A0->replaceAllUsesWith(ANew); + A0->eraseFromParent(); + A1->replaceAllUsesWith(ANew); + A1->eraseFromParent(); } /// /// True when two stores are equivalent and can sink into the footer /// -/// Starting from a diamond tail block, iterate over the instructions in one -/// predecessor block and try to match a store in the second predecessor. +/// Starting from a diamond head block, iterate over the instructions in one +/// successor block and try to match a store in the second successor. /// -bool MergedLoadStoreMotion::mergeStores(BasicBlock *T) { +bool MergedLoadStoreMotion::mergeStores(BasicBlock *HeadBB) { bool MergedStores = false; - assert(T && "Footer of a diamond cannot be empty"); - - pred_iterator PI = pred_begin(T), E = pred_end(T); - assert(PI != E); - BasicBlock *Pred0 = *PI; - ++PI; - BasicBlock *Pred1 = *PI; - ++PI; + BasicBlock *TailBB = getDiamondTail(HeadBB); + BasicBlock *SinkBB = TailBB; + assert(SinkBB && "Footer of a diamond cannot be empty"); + + succ_iterator SI = succ_begin(HeadBB); + assert(SI != succ_end(HeadBB) && "Diamond head cannot have zero successors"); + BasicBlock *Pred0 = *SI; + ++SI; + assert(SI != succ_end(HeadBB) && "Diamond head cannot have single successor"); + BasicBlock *Pred1 = *SI; // tail block of a diamond/hammock? if (Pred0 == Pred1) return false; // No. - if (PI != E) - return false; // No. More than 2 predecessors. - - // #Instructions in Succ1 for Compile Time Control + // bail out early if we can not merge into the footer BB + if (!SplitFooterBB && TailBB->hasNPredecessorsOrMore(3)) + return false; + // #Instructions in Pred1 for Compile Time Control auto InstsNoDbg = Pred1->instructionsWithoutDebug(); int Size1 = std::distance(InstsNoDbg.begin(), InstsNoDbg.end()); int NStores = 0; @@ -304,14 +318,23 @@ bool MergedLoadStoreMotion::mergeStores(BasicBlock *T) { if (NStores * Size1 >= MagicCompileTimeControl) break; if (StoreInst *S1 = canSinkFromBlock(Pred1, S0)) { - bool Res = sinkStore(T, S0, S1); - MergedStores |= Res; - // Don't attempt to sink below stores that had to stick around - // But after removal of a store and some of its feeding - // instruction search again from the beginning since the iterator - // is likely stale at this point. - if (!Res) + if (!canSinkStoresAndGEPs(S0, S1)) + // Don't attempt to sink below stores that had to stick around + // But after removal of a store and some of its feeding + // instruction search again from the beginning since the iterator + // is likely stale at this point. break; + + if (SinkBB == TailBB && TailBB->hasNPredecessorsOrMore(3)) { + // We have more than 2 predecessors. Insert a new block + // postdominating 2 predecessors we're going to sink from. + SinkBB = SplitBlockPredecessors(TailBB, {Pred0, Pred1}, ".sink.split"); + if (!SinkBB) + break; + } + + MergedStores = true; + sinkStoresAndGEPs(SinkBB, S0, S1); RBI = Pred0->rbegin(); RBE = Pred0->rend(); LLVM_DEBUG(dbgs() << "Search again\n"; Instruction *I = &*RBI; I->dump()); @@ -328,13 +351,15 @@ bool MergedLoadStoreMotion::run(Function &F, AliasAnalysis &AA) { // Merge unconditional branches, allowing PRE to catch more // optimization opportunities. + // This loop doesn't care about newly inserted/split blocks + // since they never will be diamond heads. 
for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE;) { BasicBlock *BB = &*FI++; // Hoist equivalent loads and sink stores // outside diamonds when possible if (isDiamondHead(BB)) { - Changed |= mergeStores(getDiamondTail(BB)); + Changed |= mergeStores(BB); } } return Changed; @@ -342,9 +367,11 @@ bool MergedLoadStoreMotion::run(Function &F, AliasAnalysis &AA) { namespace { class MergedLoadStoreMotionLegacyPass : public FunctionPass { + const bool SplitFooterBB; public: static char ID; // Pass identification, replacement for typeid - MergedLoadStoreMotionLegacyPass() : FunctionPass(ID) { + MergedLoadStoreMotionLegacyPass(bool SplitFooterBB = false) + : FunctionPass(ID), SplitFooterBB(SplitFooterBB) { initializeMergedLoadStoreMotionLegacyPassPass( *PassRegistry::getPassRegistry()); } @@ -355,13 +382,14 @@ public: bool runOnFunction(Function &F) override { if (skipFunction(F)) return false; - MergedLoadStoreMotion Impl; + MergedLoadStoreMotion Impl(SplitFooterBB); return Impl.run(F, getAnalysis().getAAResults()); } private: void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.setPreservesCFG(); + if (!SplitFooterBB) + AU.setPreservesCFG(); AU.addRequired(); AU.addPreserved(); } @@ -373,8 +401,8 @@ char MergedLoadStoreMotionLegacyPass::ID = 0; /// /// createMergedLoadStoreMotionPass - The public interface to this file. /// -FunctionPass *llvm::createMergedLoadStoreMotionPass() { - return new MergedLoadStoreMotionLegacyPass(); +FunctionPass *llvm::createMergedLoadStoreMotionPass(bool SplitFooterBB) { + return new MergedLoadStoreMotionLegacyPass(SplitFooterBB); } INITIALIZE_PASS_BEGIN(MergedLoadStoreMotionLegacyPass, "mldst-motion", @@ -385,13 +413,14 @@ INITIALIZE_PASS_END(MergedLoadStoreMotionLegacyPass, "mldst-motion", PreservedAnalyses MergedLoadStoreMotionPass::run(Function &F, FunctionAnalysisManager &AM) { - MergedLoadStoreMotion Impl; + MergedLoadStoreMotion Impl(Options.SplitFooterBB); auto &AA = AM.getResult(F); if (!Impl.run(F, AA)) return PreservedAnalyses::all(); PreservedAnalyses PA; - PA.preserveSet(); + if (!Options.SplitFooterBB) + PA.preserveSet(); PA.preserve(); return PA; } diff --git a/lib/Transforms/Scalar/NaryReassociate.cpp b/lib/Transforms/Scalar/NaryReassociate.cpp index 94436b55752a..1260bd39cdee 100644 --- a/lib/Transforms/Scalar/NaryReassociate.cpp +++ b/lib/Transforms/Scalar/NaryReassociate.cpp @@ -170,7 +170,7 @@ bool NaryReassociateLegacyPass::runOnFunction(Function &F) { auto *AC = &getAnalysis().getAssumptionCache(F); auto *DT = &getAnalysis().getDomTree(); auto *SE = &getAnalysis().getSE(); - auto *TLI = &getAnalysis().getTLI(); + auto *TLI = &getAnalysis().getTLI(F); auto *TTI = &getAnalysis().getTTI(F); return Impl.runImpl(F, AC, DT, SE, TLI, TTI); diff --git a/lib/Transforms/Scalar/NewGVN.cpp b/lib/Transforms/Scalar/NewGVN.cpp index 08ac2b666fce..b213264de557 100644 --- a/lib/Transforms/Scalar/NewGVN.cpp +++ b/lib/Transforms/Scalar/NewGVN.cpp @@ -89,6 +89,7 @@ #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/LLVMContext.h" +#include "llvm/IR/PatternMatch.h" #include "llvm/IR/Type.h" #include "llvm/IR/Use.h" #include "llvm/IR/User.h" @@ -122,6 +123,7 @@ using namespace llvm; using namespace llvm::GVNExpression; using namespace llvm::VNCoercion; +using namespace llvm::PatternMatch; #define DEBUG_TYPE "newgvn" @@ -656,7 +658,7 @@ public: TargetLibraryInfo *TLI, AliasAnalysis *AA, MemorySSA *MSSA, const DataLayout &DL) : F(F), DT(DT), TLI(TLI), AA(AA), MSSA(MSSA), DL(DL), - PredInfo(make_unique(F, *DT, 
*AC)), + PredInfo(std::make_unique(F, *DT, *AC)), SQ(DL, TLI, DT, AC, /*CtxI=*/nullptr, /*UseInstrInfo=*/false) {} bool runGVN(); @@ -1332,7 +1334,7 @@ LoadExpression *NewGVN::createLoadExpression(Type *LoadType, Value *PointerOp, E->setOpcode(0); E->op_push_back(PointerOp); if (LI) - E->setAlignment(LI->getAlignment()); + E->setAlignment(MaybeAlign(LI->getAlignment())); // TODO: Value number heap versions. We may be able to discover // things alias analysis can't on it's own (IE that a store and a @@ -1637,8 +1639,11 @@ const Expression *NewGVN::performSymbolicCallEvaluation(Instruction *I) const { if (AA->doesNotAccessMemory(CI)) { return createCallExpression(CI, TOPClass->getMemoryLeader()); } else if (AA->onlyReadsMemory(CI)) { - MemoryAccess *DefiningAccess = MSSAWalker->getClobberingMemoryAccess(CI); - return createCallExpression(CI, DefiningAccess); + if (auto *MA = MSSA->getMemoryAccess(CI)) { + auto *DefiningAccess = MSSAWalker->getClobberingMemoryAccess(MA); + return createCallExpression(CI, DefiningAccess); + } else // MSSA determined that CI does not access memory. + return createCallExpression(CI, TOPClass->getMemoryLeader()); } return nullptr; } @@ -1754,7 +1759,7 @@ NewGVN::performSymbolicPHIEvaluation(ArrayRef PHIOps, return true; }); // If we are left with no operands, it's dead. - if (empty(Filtered)) { + if (Filtered.empty()) { // If it has undef at this point, it means there are no-non-undef arguments, // and thus, the value of the phi node must be undef. if (HasUndef) { @@ -2464,9 +2469,9 @@ Value *NewGVN::findConditionEquivalence(Value *Cond) const { // Process the outgoing edges of a block for reachability. void NewGVN::processOutgoingEdges(Instruction *TI, BasicBlock *B) { // Evaluate reachability of terminator instruction. - BranchInst *BR; - if ((BR = dyn_cast(TI)) && BR->isConditional()) { - Value *Cond = BR->getCondition(); + Value *Cond; + BasicBlock *TrueSucc, *FalseSucc; + if (match(TI, m_Br(m_Value(Cond), TrueSucc, FalseSucc))) { Value *CondEvaluated = findConditionEquivalence(Cond); if (!CondEvaluated) { if (auto *I = dyn_cast(Cond)) { @@ -2479,8 +2484,6 @@ void NewGVN::processOutgoingEdges(Instruction *TI, BasicBlock *B) { } } ConstantInt *CI; - BasicBlock *TrueSucc = BR->getSuccessor(0); - BasicBlock *FalseSucc = BR->getSuccessor(1); if (CondEvaluated && (CI = dyn_cast(CondEvaluated))) { if (CI->isOne()) { LLVM_DEBUG(dbgs() << "Condition for Terminator " << *TI @@ -4196,7 +4199,7 @@ bool NewGVNLegacyPass::runOnFunction(Function &F) { return false; return NewGVN(F, &getAnalysis().getDomTree(), &getAnalysis().getAssumptionCache(F), - &getAnalysis().getTLI(), + &getAnalysis().getTLI(F), &getAnalysis().getAAResults(), &getAnalysis().getMSSA(), F.getParent()->getDataLayout()) diff --git a/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp b/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp index 039123218544..68a0f5151ad5 100644 --- a/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp +++ b/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp @@ -161,7 +161,7 @@ public: return false; TargetLibraryInfo *TLI = - &getAnalysis().getTLI(); + &getAnalysis().getTLI(F); const TargetTransformInfo *TTI = &getAnalysis().getTTI(F); return runPartiallyInlineLibCalls(F, TLI, TTI); diff --git a/lib/Transforms/Scalar/PlaceSafepoints.cpp b/lib/Transforms/Scalar/PlaceSafepoints.cpp index b544f0a39ea8..beb299272ed8 100644 --- a/lib/Transforms/Scalar/PlaceSafepoints.cpp +++ b/lib/Transforms/Scalar/PlaceSafepoints.cpp @@ -131,7 +131,7 @@ struct PlaceBackedgeSafepointsImpl : public 
FunctionPass { SE = &getAnalysis().getSE(); DT = &getAnalysis().getDomTree(); LI = &getAnalysis().getLoopInfo(); - TLI = &getAnalysis().getTLI(); + TLI = &getAnalysis().getTLI(F); for (Loop *I : *LI) { runOnLoopAndSubLoops(I); } @@ -240,7 +240,7 @@ static bool containsUnconditionalCallSafepoint(Loop *L, BasicBlock *Header, static bool mustBeFiniteCountedLoop(Loop *L, ScalarEvolution *SE, BasicBlock *Pred) { // A conservative bound on the loop as a whole. - const SCEV *MaxTrips = SE->getMaxBackedgeTakenCount(L); + const SCEV *MaxTrips = SE->getConstantMaxBackedgeTakenCount(L); if (MaxTrips != SE->getCouldNotCompute() && SE->getUnsignedRange(MaxTrips).getUnsignedMax().isIntN( CountedLoopTripWidth)) @@ -478,7 +478,7 @@ bool PlaceSafepoints::runOnFunction(Function &F) { return false; const TargetLibraryInfo &TLI = - getAnalysis().getTLI(); + getAnalysis().getTLI(F); bool Modified = false; diff --git a/lib/Transforms/Scalar/Reassociate.cpp b/lib/Transforms/Scalar/Reassociate.cpp index fa8c9e2a5fe4..124f625ef7b6 100644 --- a/lib/Transforms/Scalar/Reassociate.cpp +++ b/lib/Transforms/Scalar/Reassociate.cpp @@ -861,7 +861,7 @@ static Value *NegateValue(Value *V, Instruction *BI, // this use. We do this by moving it to the entry block (if it is a // non-instruction value) or right after the definition. These negates will // be zapped by reassociate later, so we don't need much finesse here. - BinaryOperator *TheNeg = cast(U); + Instruction *TheNeg = cast(U); // Verify that the negate is in this function, V might be a constant expr. if (TheNeg->getParent()->getParent() != BI->getParent()->getParent()) @@ -1938,88 +1938,132 @@ void ReassociatePass::EraseInst(Instruction *I) { MadeChange = true; } -// Canonicalize expressions of the following form: -// x + (-Constant * y) -> x - (Constant * y) -// x - (-Constant * y) -> x + (Constant * y) -Instruction *ReassociatePass::canonicalizeNegConstExpr(Instruction *I) { - if (!I->hasOneUse() || I->getType()->isVectorTy()) - return nullptr; - - // Must be a fmul or fdiv instruction. - unsigned Opcode = I->getOpcode(); - if (Opcode != Instruction::FMul && Opcode != Instruction::FDiv) - return nullptr; - - auto *C0 = dyn_cast(I->getOperand(0)); - auto *C1 = dyn_cast(I->getOperand(1)); - - // Both operands are constant, let it get constant folded away. - if (C0 && C1) - return nullptr; - - ConstantFP *CF = C0 ? C0 : C1; - - // Must have one constant operand. - if (!CF) - return nullptr; +/// Recursively analyze an expression to build a list of instructions that have +/// negative floating-point constant operands. The caller can then transform +/// the list to create positive constants for better reassociation and CSE. +static void getNegatibleInsts(Value *V, + SmallVectorImpl &Candidates) { + // Handle only one-use instructions. Combining negations does not justify + // replicating instructions. + Instruction *I; + if (!match(V, m_OneUse(m_Instruction(I)))) + return; - // Must be a negative ConstantFP. - if (!CF->isNegative()) - return nullptr; + // Handle expressions of multiplications and divisions. + // TODO: This could look through floating-point casts. + const APFloat *C; + switch (I->getOpcode()) { + case Instruction::FMul: + // Not expecting non-canonical code here. Bail out and wait. + if (match(I->getOperand(0), m_Constant())) + break; - // User must be a binary operator with one or more uses. 
- Instruction *User = I->user_back(); - if (!isa(User) || User->use_empty()) - return nullptr; + if (match(I->getOperand(1), m_APFloat(C)) && C->isNegative()) { + Candidates.push_back(I); + LLVM_DEBUG(dbgs() << "FMul with negative constant: " << *I << '\n'); + } + getNegatibleInsts(I->getOperand(0), Candidates); + getNegatibleInsts(I->getOperand(1), Candidates); + break; + case Instruction::FDiv: + // Not expecting non-canonical code here. Bail out and wait. + if (match(I->getOperand(0), m_Constant()) && + match(I->getOperand(1), m_Constant())) + break; - unsigned UserOpcode = User->getOpcode(); - if (UserOpcode != Instruction::FAdd && UserOpcode != Instruction::FSub) - return nullptr; + if ((match(I->getOperand(0), m_APFloat(C)) && C->isNegative()) || + (match(I->getOperand(1), m_APFloat(C)) && C->isNegative())) { + Candidates.push_back(I); + LLVM_DEBUG(dbgs() << "FDiv with negative constant: " << *I << '\n'); + } + getNegatibleInsts(I->getOperand(0), Candidates); + getNegatibleInsts(I->getOperand(1), Candidates); + break; + default: + break; + } +} - // Subtraction is not commutative. Explicitly, the following transform is - // not valid: (-Constant * y) - x -> x + (Constant * y) - if (!User->isCommutative() && User->getOperand(1) != I) +/// Given an fadd/fsub with an operand that is a one-use instruction +/// (the fadd/fsub), try to change negative floating-point constants into +/// positive constants to increase potential for reassociation and CSE. +Instruction *ReassociatePass::canonicalizeNegFPConstantsForOp(Instruction *I, + Instruction *Op, + Value *OtherOp) { + assert((I->getOpcode() == Instruction::FAdd || + I->getOpcode() == Instruction::FSub) && "Expected fadd/fsub"); + + // Collect instructions with negative FP constants from the subtree that ends + // in Op. + SmallVector Candidates; + getNegatibleInsts(Op, Candidates); + if (Candidates.empty()) return nullptr; // Don't canonicalize x + (-Constant * y) -> x - (Constant * y), if the // resulting subtract will be broken up later. This can get us into an // infinite loop during reassociation. - if (UserOpcode == Instruction::FAdd && ShouldBreakUpSubtract(User)) + bool IsFSub = I->getOpcode() == Instruction::FSub; + bool NeedsSubtract = !IsFSub && Candidates.size() % 2 == 1; + if (NeedsSubtract && ShouldBreakUpSubtract(I)) return nullptr; - // Change the sign of the constant. - APFloat Val = CF->getValueAPF(); - Val.changeSign(); - I->setOperand(C0 ? 0 : 1, ConstantFP::get(CF->getContext(), Val)); - - // Canonicalize I to RHS to simplify the next bit of logic. E.g., - // ((-Const*y) + x) -> (x + (-Const*y)). 
- if (User->getOperand(0) == I && User->isCommutative()) - cast(User)->swapOperands(); - - Value *Op0 = User->getOperand(0); - Value *Op1 = User->getOperand(1); - BinaryOperator *NI; - switch (UserOpcode) { - default: - llvm_unreachable("Unexpected Opcode!"); - case Instruction::FAdd: - NI = BinaryOperator::CreateFSub(Op0, Op1); - NI->setFastMathFlags(cast(User)->getFastMathFlags()); - break; - case Instruction::FSub: - NI = BinaryOperator::CreateFAdd(Op0, Op1); - NI->setFastMathFlags(cast(User)->getFastMathFlags()); - break; + for (Instruction *Negatible : Candidates) { + const APFloat *C; + if (match(Negatible->getOperand(0), m_APFloat(C))) { + assert(!match(Negatible->getOperand(1), m_Constant()) && + "Expecting only 1 constant operand"); + assert(C->isNegative() && "Expected negative FP constant"); + Negatible->setOperand(0, ConstantFP::get(Negatible->getType(), abs(*C))); + MadeChange = true; + } + if (match(Negatible->getOperand(1), m_APFloat(C))) { + assert(!match(Negatible->getOperand(0), m_Constant()) && + "Expecting only 1 constant operand"); + assert(C->isNegative() && "Expected negative FP constant"); + Negatible->setOperand(1, ConstantFP::get(Negatible->getType(), abs(*C))); + MadeChange = true; + } } + assert(MadeChange == true && "Negative constant candidate was not changed"); - NI->insertBefore(User); - NI->setName(User->getName()); - User->replaceAllUsesWith(NI); - NI->setDebugLoc(I->getDebugLoc()); + // Negations cancelled out. + if (Candidates.size() % 2 == 0) + return I; + + // Negate the final operand in the expression by flipping the opcode of this + // fadd/fsub. + assert(Candidates.size() % 2 == 1 && "Expected odd number"); + IRBuilder<> Builder(I); + Value *NewInst = IsFSub ? Builder.CreateFAddFMF(OtherOp, Op, I) + : Builder.CreateFSubFMF(OtherOp, Op, I); + I->replaceAllUsesWith(NewInst); RedoInsts.insert(I); - MadeChange = true; - return NI; + return dyn_cast(NewInst); +} + +/// Canonicalize expressions that contain a negative floating-point constant +/// of the following form: +/// OtherOp + (subtree) -> OtherOp {+/-} (canonical subtree) +/// (subtree) + OtherOp -> OtherOp {+/-} (canonical subtree) +/// OtherOp - (subtree) -> OtherOp {+/-} (canonical subtree) +/// +/// The fadd/fsub opcode may be switched to allow folding a negation into the +/// input instruction. +Instruction *ReassociatePass::canonicalizeNegFPConstants(Instruction *I) { + LLVM_DEBUG(dbgs() << "Combine negations for: " << *I << '\n'); + Value *X; + Instruction *Op; + if (match(I, m_FAdd(m_Value(X), m_OneUse(m_Instruction(Op))))) + if (Instruction *R = canonicalizeNegFPConstantsForOp(I, Op, X)) + I = R; + if (match(I, m_FAdd(m_OneUse(m_Instruction(Op)), m_Value(X)))) + if (Instruction *R = canonicalizeNegFPConstantsForOp(I, Op, X)) + I = R; + if (match(I, m_FSub(m_Value(X), m_OneUse(m_Instruction(Op))))) + if (Instruction *R = canonicalizeNegFPConstantsForOp(I, Op, X)) + I = R; + return I; } /// Inspect and optimize the given instruction. Note that erasing @@ -2042,16 +2086,16 @@ void ReassociatePass::OptimizeInst(Instruction *I) { I = NI; } - // Canonicalize negative constants out of expressions. - if (Instruction *Res = canonicalizeNegConstExpr(I)) - I = Res; - // Commute binary operators, to canonicalize the order of their operands. // This can potentially expose more CSE opportunities, and makes writing other // transformations simpler. if (I->isCommutative()) canonicalizeOperands(I); + // Canonicalize negative constants out of expressions. 
+ if (Instruction *Res = canonicalizeNegFPConstants(I)) + I = Res; + // Don't optimize floating-point instructions unless they are 'fast'. if (I->getType()->isFPOrFPVectorTy() && !I->isFast()) return; diff --git a/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp b/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp index c358258d24cf..48bbdd8d1b33 100644 --- a/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp +++ b/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp @@ -172,8 +172,6 @@ public: bool runOnModule(Module &M) override { bool Changed = false; - const TargetLibraryInfo &TLI = - getAnalysis().getTLI(); for (Function &F : M) { // Nothing to do for declarations. if (F.isDeclaration() || F.empty()) @@ -186,6 +184,8 @@ public: TargetTransformInfo &TTI = getAnalysis().getTTI(F); + const TargetLibraryInfo &TLI = + getAnalysis().getTLI(F); auto &DT = getAnalysis(F).getDomTree(); Changed |= Impl.runOnFunction(F, DT, TTI, TLI); @@ -2530,7 +2530,7 @@ bool RewriteStatepointsForGC::runOnFunction(Function &F, DominatorTree &DT, // statepoints surviving this pass. This makes testing easier and the // resulting IR less confusing to human readers. DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy); - bool MadeChange = removeUnreachableBlocks(F, nullptr, &DTU); + bool MadeChange = removeUnreachableBlocks(F, &DTU); // Flush the Dominator Tree. DTU.getDomTree(); diff --git a/lib/Transforms/Scalar/SCCP.cpp b/lib/Transforms/Scalar/SCCP.cpp index 4093e50ce899..10fbdc8aacd2 100644 --- a/lib/Transforms/Scalar/SCCP.cpp +++ b/lib/Transforms/Scalar/SCCP.cpp @@ -191,7 +191,7 @@ public: /// class SCCPSolver : public InstVisitor { const DataLayout &DL; - const TargetLibraryInfo *TLI; + std::function GetTLI; SmallPtrSet BBExecutable; // The BBs that are executable. DenseMap ValueState; // The state each value is in. // The state each parameter is in. @@ -268,8 +268,9 @@ public: return {A->second.DT, A->second.PDT, DomTreeUpdater::UpdateStrategy::Lazy}; } - SCCPSolver(const DataLayout &DL, const TargetLibraryInfo *tli) - : DL(DL), TLI(tli) {} + SCCPSolver(const DataLayout &DL, + std::function GetTLI) + : DL(DL), GetTLI(std::move(GetTLI)) {} /// MarkBlockExecutable - This method can be used by clients to mark all of /// the blocks that are known to be intrinsically live in the processed unit. @@ -1290,7 +1291,7 @@ CallOverdefined: // If we can constant fold this, mark the result of the call as a // constant. if (Constant *C = ConstantFoldCall(cast(CS.getInstruction()), F, - Operands, TLI)) { + Operands, &GetTLI(*F))) { // call -> undef. if (isa(C)) return; @@ -1465,7 +1466,24 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) { } LatticeVal &LV = getValueState(&I); - if (!LV.isUnknown()) continue; + if (!LV.isUnknown()) + continue; + + // There are two reasons a call can have an undef result + // 1. It could be tracked. + // 2. It could be constant-foldable. + // Because of the way we solve return values, tracked calls must + // never be marked overdefined in ResolvedUndefsIn. + if (CallSite CS = CallSite(&I)) { + if (Function *F = CS.getCalledFunction()) + if (TrackedRetVals.count(F)) + continue; + + // If the call is constant-foldable, we mark it overdefined because + // we do not know what return values are valid. + markOverdefined(&I); + return true; + } // extractvalue is safe; check here because the argument is a struct. 
if (isa(I)) @@ -1638,19 +1656,7 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) { case Instruction::Call: case Instruction::Invoke: case Instruction::CallBr: - // There are two reasons a call can have an undef result - // 1. It could be tracked. - // 2. It could be constant-foldable. - // Because of the way we solve return values, tracked calls must - // never be marked overdefined in ResolvedUndefsIn. - if (Function *F = CallSite(&I).getCalledFunction()) - if (TrackedRetVals.count(F)) - break; - - // If the call is constant-foldable, we mark it overdefined because - // we do not know what return values are valid. - markOverdefined(&I); - return true; + llvm_unreachable("Call-like instructions should have be handled early"); default: // If we don't know what should happen here, conservatively mark it // overdefined. @@ -1751,7 +1757,7 @@ static bool tryToReplaceWithConstant(SCCPSolver &Solver, Value *V) { [](const LatticeVal &LV) { return LV.isOverdefined(); })) return false; std::vector ConstVals; - auto *ST = dyn_cast(V->getType()); + auto *ST = cast(V->getType()); for (unsigned i = 0, e = ST->getNumElements(); i != e; ++i) { LatticeVal V = IVs[i]; ConstVals.push_back(V.isConstant() @@ -1796,7 +1802,8 @@ static bool tryToReplaceWithConstant(SCCPSolver &Solver, Value *V) { static bool runSCCP(Function &F, const DataLayout &DL, const TargetLibraryInfo *TLI) { LLVM_DEBUG(dbgs() << "SCCP on function '" << F.getName() << "'\n"); - SCCPSolver Solver(DL, TLI); + SCCPSolver Solver( + DL, [TLI](Function &F) -> const TargetLibraryInfo & { return *TLI; }); // Mark the first block of the function as being executable. Solver.MarkBlockExecutable(&F.front()); @@ -1891,7 +1898,7 @@ public: return false; const DataLayout &DL = F.getParent()->getDataLayout(); const TargetLibraryInfo *TLI = - &getAnalysis().getTLI(); + &getAnalysis().getTLI(F); return runSCCP(F, DL, TLI); } }; @@ -1924,6 +1931,27 @@ static void findReturnsToZap(Function &F, return; } + assert( + all_of(F.users(), + [&Solver](User *U) { + if (isa(U) && + !Solver.isBlockExecutable(cast(U)->getParent())) + return true; + // Non-callsite uses are not impacted by zapping. Also, constant + // uses (like blockaddresses) could stuck around, without being + // used in the underlying IR, meaning we do not have lattice + // values for them. + if (!CallSite(U)) + return true; + if (U->getType()->isStructTy()) { + return all_of( + Solver.getStructLatticeValueFor(U), + [](const LatticeVal &LV) { return !LV.isOverdefined(); }); + } + return !Solver.getLatticeValueFor(U).isOverdefined(); + }) && + "We can only zap functions where all live users have a concrete value"); + for (BasicBlock &BB : F) { if (CallInst *CI = BB.getTerminatingMustTailCall()) { LLVM_DEBUG(dbgs() << "Can't zap return of the block due to present " @@ -1974,9 +2002,10 @@ static void forceIndeterminateEdge(Instruction* I, SCCPSolver &Solver) { } bool llvm::runIPSCCP( - Module &M, const DataLayout &DL, const TargetLibraryInfo *TLI, + Module &M, const DataLayout &DL, + std::function GetTLI, function_ref getAnalysis) { - SCCPSolver Solver(DL, TLI); + SCCPSolver Solver(DL, GetTLI); // Loop over all functions, marking arguments to those with their addresses // taken or that are external as overdefined. 
diff --git a/lib/Transforms/Scalar/SROA.cpp b/lib/Transforms/Scalar/SROA.cpp index 33f90d0b01e4..74b8ff913050 100644 --- a/lib/Transforms/Scalar/SROA.cpp +++ b/lib/Transforms/Scalar/SROA.cpp @@ -959,14 +959,16 @@ private: std::tie(UsedI, I) = Uses.pop_back_val(); if (LoadInst *LI = dyn_cast(I)) { - Size = std::max(Size, DL.getTypeStoreSize(LI->getType())); + Size = std::max(Size, + DL.getTypeStoreSize(LI->getType()).getFixedSize()); continue; } if (StoreInst *SI = dyn_cast(I)) { Value *Op = SI->getOperand(0); if (Op == UsedI) return SI; - Size = std::max(Size, DL.getTypeStoreSize(Op->getType())); + Size = std::max(Size, + DL.getTypeStoreSize(Op->getType()).getFixedSize()); continue; } @@ -1197,7 +1199,7 @@ static bool isSafePHIToSpeculate(PHINode &PN) { // TODO: Allow recursive phi users. // TODO: Allow stores. BasicBlock *BB = PN.getParent(); - unsigned MaxAlign = 0; + MaybeAlign MaxAlign; uint64_t APWidth = DL.getIndexTypeSizeInBits(PN.getType()); APInt MaxSize(APWidth, 0); bool HaveLoad = false; @@ -1218,8 +1220,8 @@ static bool isSafePHIToSpeculate(PHINode &PN) { if (BBI->mayWriteToMemory()) return false; - uint64_t Size = DL.getTypeStoreSizeInBits(LI->getType()); - MaxAlign = std::max(MaxAlign, LI->getAlignment()); + uint64_t Size = DL.getTypeStoreSize(LI->getType()); + MaxAlign = std::max(MaxAlign, MaybeAlign(LI->getAlignment())); MaxSize = MaxSize.ult(Size) ? APInt(APWidth, Size) : MaxSize; HaveLoad = true; } @@ -1266,11 +1268,11 @@ static void speculatePHINodeLoads(PHINode &PN) { PHINode *NewPN = PHIBuilder.CreatePHI(LoadTy, PN.getNumIncomingValues(), PN.getName() + ".sroa.speculated"); - // Get the AA tags and alignment to use from one of the loads. It doesn't + // Get the AA tags and alignment to use from one of the loads. It does not // matter which one we get and if any differ. AAMDNodes AATags; SomeLoad->getAAMetadata(AATags); - unsigned Align = SomeLoad->getAlignment(); + const MaybeAlign Align = MaybeAlign(SomeLoad->getAlignment()); // Rewrite all loads of the PN to use the new PHI. while (!PN.use_empty()) { @@ -1338,11 +1340,11 @@ static bool isSafeSelectToSpeculate(SelectInst &SI) { // Both operands to the select need to be dereferenceable, either // absolutely (e.g. allocas) or at this point because we can see other // accesses to it. - if (!isSafeToLoadUnconditionally(TValue, LI->getType(), LI->getAlignment(), - DL, LI)) + if (!isSafeToLoadUnconditionally(TValue, LI->getType(), + MaybeAlign(LI->getAlignment()), DL, LI)) return false; - if (!isSafeToLoadUnconditionally(FValue, LI->getType(), LI->getAlignment(), - DL, LI)) + if (!isSafeToLoadUnconditionally(FValue, LI->getType(), + MaybeAlign(LI->getAlignment()), DL, LI)) return false; } @@ -1368,8 +1370,8 @@ static void speculateSelectInstLoads(SelectInst &SI) { NumLoadsSpeculated += 2; // Transfer alignment and AA info if present. - TL->setAlignment(LI->getAlignment()); - FL->setAlignment(LI->getAlignment()); + TL->setAlignment(MaybeAlign(LI->getAlignment())); + FL->setAlignment(MaybeAlign(LI->getAlignment())); AAMDNodes Tags; LI->getAAMetadata(Tags); @@ -1888,6 +1890,14 @@ static VectorType *isVectorPromotionViable(Partition &P, const DataLayout &DL) { bool HaveCommonEltTy = true; auto CheckCandidateType = [&](Type *Ty) { if (auto *VTy = dyn_cast(Ty)) { + // Return if bitcast to vectors is different for total size in bits. 
+ if (!CandidateTys.empty()) { + VectorType *V = CandidateTys[0]; + if (DL.getTypeSizeInBits(VTy) != DL.getTypeSizeInBits(V)) { + CandidateTys.clear(); + return; + } + } CandidateTys.push_back(VTy); if (!CommonEltTy) CommonEltTy = VTy->getElementType(); @@ -3110,7 +3120,7 @@ private: unsigned LoadAlign = LI->getAlignment(); if (!LoadAlign) LoadAlign = DL.getABITypeAlignment(LI->getType()); - LI->setAlignment(std::min(LoadAlign, getSliceAlign())); + LI->setAlignment(MaybeAlign(std::min(LoadAlign, getSliceAlign()))); continue; } if (StoreInst *SI = dyn_cast(I)) { @@ -3119,7 +3129,7 @@ private: Value *Op = SI->getOperand(0); StoreAlign = DL.getABITypeAlignment(Op->getType()); } - SI->setAlignment(std::min(StoreAlign, getSliceAlign())); + SI->setAlignment(MaybeAlign(std::min(StoreAlign, getSliceAlign()))); continue; } diff --git a/lib/Transforms/Scalar/Scalar.cpp b/lib/Transforms/Scalar/Scalar.cpp index 869cf00e0a89..1d2e40bf62be 100644 --- a/lib/Transforms/Scalar/Scalar.cpp +++ b/lib/Transforms/Scalar/Scalar.cpp @@ -79,6 +79,7 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) { initializeLoopVersioningLICMPass(Registry); initializeLoopIdiomRecognizeLegacyPassPass(Registry); initializeLowerAtomicLegacyPassPass(Registry); + initializeLowerConstantIntrinsicsPass(Registry); initializeLowerExpectIntrinsicPass(Registry); initializeLowerGuardIntrinsicLegacyPassPass(Registry); initializeLowerWidenableConditionLegacyPassPass(Registry); @@ -123,6 +124,10 @@ void LLVMAddAggressiveDCEPass(LLVMPassManagerRef PM) { unwrap(PM)->add(createAggressiveDCEPass()); } +void LLVMAddDCEPass(LLVMPassManagerRef PM) { + unwrap(PM)->add(createDeadCodeEliminationPass()); +} + void LLVMAddBitTrackingDCEPass(LLVMPassManagerRef PM) { unwrap(PM)->add(createBitTrackingDCEPass()); } @@ -280,6 +285,10 @@ void LLVMAddBasicAliasAnalysisPass(LLVMPassManagerRef PM) { unwrap(PM)->add(createBasicAAWrapperPass()); } +void LLVMAddLowerConstantIntrinsicsPass(LLVMPassManagerRef PM) { + unwrap(PM)->add(createLowerConstantIntrinsicsPass()); +} + void LLVMAddLowerExpectIntrinsicPass(LLVMPassManagerRef PM) { unwrap(PM)->add(createLowerExpectIntrinsicPass()); } diff --git a/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp b/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp index f6a12fb13142..41554fccdf08 100644 --- a/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp +++ b/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp @@ -1121,7 +1121,7 @@ bool SeparateConstOffsetFromGEP::runOnFunction(Function &F) { DT = &getAnalysis().getDomTree(); SE = &getAnalysis().getSE(); LI = &getAnalysis().getLoopInfo(); - TLI = &getAnalysis().getTLI(); + TLI = &getAnalysis().getTLI(F); bool Changed = false; for (BasicBlock &B : F) { for (BasicBlock::iterator I = B.begin(), IE = B.end(); I != IE;) diff --git a/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp b/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp index aeac6f548b32..ac832b9b4567 100644 --- a/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp +++ b/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp @@ -1909,7 +1909,7 @@ static void unswitchNontrivialInvariants( // We can only unswitch switches, conditional branches with an invariant // condition, or combining invariant conditions with an instruction. 
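Note on the new guard in isVectorPromotionViable: every candidate vector type must have the same total size in bits, otherwise a bitcast between the partition's uses would be ill-typed. A condensed sketch of that check; the helper name is illustrative.

    #include "llvm/ADT/SmallVector.h"
    #include "llvm/IR/DataLayout.h"
    #include "llvm/IR/DerivedTypes.h"

    using namespace llvm;

    static void addCandidateType(SmallVectorImpl<VectorType *> &CandidateTys,
                                 VectorType *VTy, const DataLayout &DL) {
      if (!CandidateTys.empty() &&
          DL.getTypeSizeInBits(VTy) != DL.getTypeSizeInBits(CandidateTys[0])) {
        // Mixed total sizes: vector promotion of this partition is abandoned.
        CandidateTys.clear();
        return;
      }
      CandidateTys.push_back(VTy);
    }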
- assert((SI || BI->isConditional()) && + assert((SI || (BI && BI->isConditional())) && "Can only unswitch switches and conditional branch!"); bool FullUnswitch = SI || BI->getCondition() == Invariants[0]; if (FullUnswitch) @@ -2141,17 +2141,21 @@ static void unswitchNontrivialInvariants( buildPartialUnswitchConditionalBranch(*SplitBB, Invariants, Direction, *ClonedPH, *LoopPH); DTUpdates.push_back({DominatorTree::Insert, SplitBB, ClonedPH}); + + if (MSSAU) { + DT.applyUpdates(DTUpdates); + DTUpdates.clear(); + + // Perform MSSA cloning updates. + for (auto &VMap : VMaps) + MSSAU->updateForClonedLoop(LBRPO, ExitBlocks, *VMap, + /*IgnoreIncomingWithNoClones=*/true); + MSSAU->updateExitBlocksForClonedLoop(ExitBlocks, VMaps, DT); + } } // Apply the updates accumulated above to get an up-to-date dominator tree. DT.applyUpdates(DTUpdates); - if (!FullUnswitch && MSSAU) { - // Update MSSA for partial unswitch, after DT update. - SmallVector Updates; - Updates.push_back( - {cfg::UpdateKind::Insert, SplitBB, ClonedPHs.begin()->second}); - MSSAU->applyInsertUpdates(Updates, DT); - } // Now that we have an accurate dominator tree, first delete the dead cloned // blocks so that we can accurately build any cloned loops. It is important to @@ -2720,7 +2724,7 @@ unswitchBestCondition(Loop &L, DominatorTree &DT, LoopInfo &LI, return Cost * (SuccessorsCount - 1); }; Instruction *BestUnswitchTI = nullptr; - int BestUnswitchCost; + int BestUnswitchCost = 0; ArrayRef BestUnswitchInvariants; for (auto &TerminatorAndInvariants : UnswitchCandidates) { Instruction &TI = *TerminatorAndInvariants.first; @@ -2752,6 +2756,7 @@ unswitchBestCondition(Loop &L, DominatorTree &DT, LoopInfo &LI, BestUnswitchInvariants = Invariants; } } + assert(BestUnswitchTI && "Failed to find loop unswitch candidate"); if (BestUnswitchCost >= UnswitchThreshold) { LLVM_DEBUG(dbgs() << "Cannot unswitch, lowest cost found: " @@ -2880,7 +2885,7 @@ PreservedAnalyses SimpleLoopUnswitchPass::run(Loop &L, LoopAnalysisManager &AM, assert(AR.DT.verify(DominatorTree::VerificationLevel::Fast)); auto PA = getLoopPassPreservedAnalyses(); - if (EnableMSSALoopDependency) + if (AR.MSSA) PA.preserve(); return PA; } diff --git a/lib/Transforms/Scalar/SpeculateAroundPHIs.cpp b/lib/Transforms/Scalar/SpeculateAroundPHIs.cpp index c13fb3e04516..e6db11f47ead 100644 --- a/lib/Transforms/Scalar/SpeculateAroundPHIs.cpp +++ b/lib/Transforms/Scalar/SpeculateAroundPHIs.cpp @@ -777,8 +777,10 @@ static bool tryToSpeculatePHIs(SmallVectorImpl &PNs, // speculation if the predecessor is an invoke. This doesn't seem // fundamental and we should probably be splitting critical edges // differently. - if (isa(PredBB->getTerminator()) || - isa(PredBB->getTerminator())) { + const auto *TermInst = PredBB->getTerminator(); + if (isa(TermInst) || + isa(TermInst) || + isa(TermInst)) { LLVM_DEBUG(dbgs() << " Invalid: predecessor terminator: " << PredBB->getName() << "\n"); return false; diff --git a/lib/Transforms/Scalar/StructurizeCFG.cpp b/lib/Transforms/Scalar/StructurizeCFG.cpp index e5400676c7e8..9791cf41f621 100644 --- a/lib/Transforms/Scalar/StructurizeCFG.cpp +++ b/lib/Transforms/Scalar/StructurizeCFG.cpp @@ -65,7 +65,7 @@ static cl::opt ForceSkipUniformRegions( static cl::opt RelaxedUniformRegions("structurizecfg-relaxed-uniform-regions", cl::Hidden, cl::desc("Allow relaxed uniform region checks"), - cl::init(false)); + cl::init(true)); // Definition of the complex types used in this pass. 
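Note on the SpeculateAroundPHIs hunk: the list of predecessor terminators that block speculation grows to include callbr alongside invoke and indirectbr, since edges leaving those terminators cannot be split here. A hedged sketch of that predicate in isolation; the function name is illustrative.

    #include "llvm/IR/BasicBlock.h"
    #include "llvm/IR/Instructions.h"

    using namespace llvm;

    static bool predecessorAllowsSpeculation(const BasicBlock &Pred) {
      const Instruction *TI = Pred.getTerminator();
      // Edges leaving these terminators cannot be split for speculation.
      return !isa<IndirectBrInst>(TI) && !isa<InvokeInst>(TI) &&
             !isa<CallBrInst>(TI);
    }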
diff --git a/lib/Transforms/Scalar/TailRecursionElimination.cpp b/lib/Transforms/Scalar/TailRecursionElimination.cpp index f0b79079d817..b27a36b67d62 100644 --- a/lib/Transforms/Scalar/TailRecursionElimination.cpp +++ b/lib/Transforms/Scalar/TailRecursionElimination.cpp @@ -341,7 +341,7 @@ static bool canMoveAboveCall(Instruction *I, CallInst *CI, AliasAnalysis *AA) { const DataLayout &DL = L->getModule()->getDataLayout(); if (isModSet(AA->getModRefInfo(CI, MemoryLocation::get(L))) || !isSafeToLoadUnconditionally(L->getPointerOperand(), L->getType(), - L->getAlignment(), DL, L)) + MaybeAlign(L->getAlignment()), DL, L)) return false; } } diff --git a/lib/Transforms/Utils/BasicBlockUtils.cpp b/lib/Transforms/Utils/BasicBlockUtils.cpp index 5fa371377c85..d85cc40c372a 100644 --- a/lib/Transforms/Utils/BasicBlockUtils.cpp +++ b/lib/Transforms/Utils/BasicBlockUtils.cpp @@ -170,7 +170,8 @@ bool llvm::DeleteDeadPHIs(BasicBlock *BB, const TargetLibraryInfo *TLI) { bool llvm::MergeBlockIntoPredecessor(BasicBlock *BB, DomTreeUpdater *DTU, LoopInfo *LI, MemorySSAUpdater *MSSAU, - MemoryDependenceResults *MemDep) { + MemoryDependenceResults *MemDep, + bool PredecessorWithTwoSuccessors) { if (BB->hasAddressTaken()) return false; @@ -185,9 +186,24 @@ bool llvm::MergeBlockIntoPredecessor(BasicBlock *BB, DomTreeUpdater *DTU, return false; // Can't merge if there are multiple distinct successors. - if (PredBB->getUniqueSuccessor() != BB) + if (!PredecessorWithTwoSuccessors && PredBB->getUniqueSuccessor() != BB) return false; + // Currently only allow PredBB to have two predecessors, one being BB. + // Update BI to branch to BB's only successor instead of BB. + BranchInst *PredBB_BI; + BasicBlock *NewSucc = nullptr; + unsigned FallThruPath; + if (PredecessorWithTwoSuccessors) { + if (!(PredBB_BI = dyn_cast(PredBB->getTerminator()))) + return false; + BranchInst *BB_JmpI = dyn_cast(BB->getTerminator()); + if (!BB_JmpI || !BB_JmpI->isUnconditional()) + return false; + NewSucc = BB_JmpI->getSuccessor(0); + FallThruPath = PredBB_BI->getSuccessor(0) == BB ? 0 : 1; + } + // Can't merge if there is PHI loop. for (PHINode &PN : BB->phis()) for (Value *IncValue : PN.incoming_values()) @@ -227,18 +243,39 @@ bool llvm::MergeBlockIntoPredecessor(BasicBlock *BB, DomTreeUpdater *DTU, Updates.push_back({DominatorTree::Delete, PredBB, BB}); } - if (MSSAU) - MSSAU->moveAllAfterMergeBlocks(BB, PredBB, &*(BB->begin())); + Instruction *PTI = PredBB->getTerminator(); + Instruction *STI = BB->getTerminator(); + Instruction *Start = &*BB->begin(); + // If there's nothing to move, mark the starting instruction as the last + // instruction in the block. + if (Start == STI) + Start = PTI; + + // Move all definitions in the successor to the predecessor... + PredBB->getInstList().splice(PTI->getIterator(), BB->getInstList(), + BB->begin(), STI->getIterator()); - // Delete the unconditional branch from the predecessor... - PredBB->getInstList().pop_back(); + if (MSSAU) + MSSAU->moveAllAfterMergeBlocks(BB, PredBB, Start); // Make all PHI nodes that referred to BB now refer to Pred as their // source... BB->replaceAllUsesWith(PredBB); - // Move all definitions in the successor to the predecessor... - PredBB->getInstList().splice(PredBB->end(), BB->getInstList()); + if (PredecessorWithTwoSuccessors) { + // Delete the unconditional branch from BB. + BB->getInstList().pop_back(); + + // Update branch in the predecessor. + PredBB_BI->setSuccessor(FallThruPath, NewSucc); + } else { + // Delete the unconditional branch from the predecessor. 
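Note on the new PredecessorWithTwoSuccessors mode of MergeBlockIntoPredecessor: it handles the shape where the predecessor ends in a conditional branch with two successors, one of them being BB, and BB itself ends in an unconditional branch; the merge retargets the fall-through edge at BB's single successor. A sketch of a caller, mirroring the loop-rotation use later in this patch; the wrapper name is illustrative and the header paths are assumed for this revision.

    #include "llvm/Analysis/DomTreeUpdater.h"
    #include "llvm/Analysis/LoopInfo.h"
    #include "llvm/Analysis/MemorySSAUpdater.h"
    #include "llvm/IR/Dominators.h"
    #include "llvm/Transforms/Utils/BasicBlockUtils.h"

    using namespace llvm;

    static bool foldLatchIntoExit(BasicBlock *Latch, DominatorTree *DT,
                                  LoopInfo *LI, MemorySSAUpdater *MSSAU) {
      DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
      // Latch's predecessor keeps its conditional branch; the branch operand
      // that pointed at Latch is rewritten to Latch's lone successor.
      return MergeBlockIntoPredecessor(Latch, &DTU, LI, MSSAU,
                                       /*MemDep=*/nullptr,
                                       /*PredecessorWithTwoSuccessors=*/true);
    }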
+ PredBB->getInstList().pop_back(); + + // Move terminator instruction. + PredBB->getInstList().splice(PredBB->end(), BB->getInstList()); + } + // Add unreachable to now empty BB. new UnreachableInst(BB->getContext(), BB); // Eliminate duplicate dbg.values describing the entry PHI node post-splice. @@ -274,11 +311,10 @@ bool llvm::MergeBlockIntoPredecessor(BasicBlock *BB, DomTreeUpdater *DTU, "applying corresponding DTU updates."); DTU->applyUpdatesPermissive(Updates); DTU->deleteBB(BB); - } - - else { + } else { BB->eraseFromParent(); // Nuke BB if DTU is nullptr. } + return true; } @@ -365,11 +401,13 @@ llvm::SplitAllCriticalEdges(Function &F, BasicBlock *llvm::SplitBlock(BasicBlock *Old, Instruction *SplitPt, DominatorTree *DT, LoopInfo *LI, - MemorySSAUpdater *MSSAU) { + MemorySSAUpdater *MSSAU, const Twine &BBName) { BasicBlock::iterator SplitIt = SplitPt->getIterator(); while (isa(SplitIt) || SplitIt->isEHPad()) ++SplitIt; - BasicBlock *New = Old->splitBasicBlock(SplitIt, Old->getName()+".split"); + std::string Name = BBName.str(); + BasicBlock *New = Old->splitBasicBlock( + SplitIt, Name.empty() ? Old->getName() + ".split" : Name); // The new block lives in whichever loop the old one did. This preserves // LCSSA as well, because we force the split point to be after any PHI nodes. diff --git a/lib/Transforms/Utils/BuildLibCalls.cpp b/lib/Transforms/Utils/BuildLibCalls.cpp index 27f110e24f9c..71316ce8f758 100644 --- a/lib/Transforms/Utils/BuildLibCalls.cpp +++ b/lib/Transforms/Utils/BuildLibCalls.cpp @@ -88,6 +88,14 @@ static bool setDoesNotCapture(Function &F, unsigned ArgNo) { return true; } +static bool setDoesNotAlias(Function &F, unsigned ArgNo) { + if (F.hasParamAttribute(ArgNo, Attribute::NoAlias)) + return false; + F.addParamAttr(ArgNo, Attribute::NoAlias); + ++NumNoAlias; + return true; +} + static bool setOnlyReadsMemory(Function &F, unsigned ArgNo) { if (F.hasParamAttribute(ArgNo, Attribute::ReadOnly)) return false; @@ -175,6 +183,9 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) { return Changed; case LibFunc_strcpy: case LibFunc_strncpy: + Changed |= setDoesNotAlias(F, 0); + Changed |= setDoesNotAlias(F, 1); + LLVM_FALLTHROUGH; case LibFunc_strcat: case LibFunc_strncat: Changed |= setReturnedArg(F, 0); @@ -249,12 +260,14 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) { case LibFunc_sprintf: Changed |= setDoesNotThrow(F); Changed |= setDoesNotCapture(F, 0); + Changed |= setDoesNotAlias(F, 0); Changed |= setDoesNotCapture(F, 1); Changed |= setOnlyReadsMemory(F, 1); return Changed; case LibFunc_snprintf: Changed |= setDoesNotThrow(F); Changed |= setDoesNotCapture(F, 0); + Changed |= setDoesNotAlias(F, 0); Changed |= setDoesNotCapture(F, 2); Changed |= setOnlyReadsMemory(F, 2); return Changed; @@ -291,11 +304,23 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) { Changed |= setDoesNotCapture(F, 1); return Changed; case LibFunc_memcpy: + Changed |= setDoesNotAlias(F, 0); + Changed |= setDoesNotAlias(F, 1); + Changed |= setReturnedArg(F, 0); + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 1); + Changed |= setOnlyReadsMemory(F, 1); + return Changed; case LibFunc_memmove: Changed |= setReturnedArg(F, 0); - LLVM_FALLTHROUGH; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 1); + Changed |= setOnlyReadsMemory(F, 1); + return Changed; case LibFunc_mempcpy: case LibFunc_memccpy: + Changed |= setDoesNotAlias(F, 0); + Changed |= setDoesNotAlias(F, 1); 
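Note on the BuildLibCalls hunks: setDoesNotAlias follows the same idempotent pattern as the other attribute helpers, reporting a change only when the attribute was actually added; the noalias markings on strcpy/strncpy, sprintf, snprintf and the mem* copy routines rely on the C standard's requirement that these argument ranges not overlap. A compressed sketch of the helper, without the statistics counter.

    #include "llvm/IR/Attributes.h"
    #include "llvm/IR/Function.h"

    using namespace llvm;

    static bool setDoesNotAliasArg(Function &F, unsigned ArgNo) {
      if (F.hasParamAttribute(ArgNo, Attribute::NoAlias))
        return false;                      // already inferred, nothing changed
      F.addParamAttr(ArgNo, Attribute::NoAlias);
      return true;                         // caller ORs this into Changed
    }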
Changed |= setDoesNotThrow(F); Changed |= setDoesNotCapture(F, 1); Changed |= setOnlyReadsMemory(F, 1); @@ -760,9 +785,8 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) { } } -bool llvm::hasUnaryFloatFn(const TargetLibraryInfo *TLI, Type *Ty, - LibFunc DoubleFn, LibFunc FloatFn, - LibFunc LongDoubleFn) { +bool llvm::hasFloatFn(const TargetLibraryInfo *TLI, Type *Ty, + LibFunc DoubleFn, LibFunc FloatFn, LibFunc LongDoubleFn) { switch (Ty->getTypeID()) { case Type::HalfTyID: return false; @@ -775,10 +799,10 @@ bool llvm::hasUnaryFloatFn(const TargetLibraryInfo *TLI, Type *Ty, } } -StringRef llvm::getUnaryFloatFn(const TargetLibraryInfo *TLI, Type *Ty, - LibFunc DoubleFn, LibFunc FloatFn, - LibFunc LongDoubleFn) { - assert(hasUnaryFloatFn(TLI, Ty, DoubleFn, FloatFn, LongDoubleFn) && +StringRef llvm::getFloatFnName(const TargetLibraryInfo *TLI, Type *Ty, + LibFunc DoubleFn, LibFunc FloatFn, + LibFunc LongDoubleFn) { + assert(hasFloatFn(TLI, Ty, DoubleFn, FloatFn, LongDoubleFn) && "Cannot get name for unavailable function!"); switch (Ty->getTypeID()) { @@ -827,6 +851,12 @@ Value *llvm::emitStrLen(Value *Ptr, IRBuilder<> &B, const DataLayout &DL, B.getInt8PtrTy(), castToCStr(Ptr, B), B, TLI); } +Value *llvm::emitStrDup(Value *Ptr, IRBuilder<> &B, + const TargetLibraryInfo *TLI) { + return emitLibCall(LibFunc_strdup, B.getInt8PtrTy(), B.getInt8PtrTy(), + castToCStr(Ptr, B), B, TLI); +} + Value *llvm::emitStrChr(Value *Ptr, char C, IRBuilder<> &B, const TargetLibraryInfo *TLI) { Type *I8Ptr = B.getInt8PtrTy(); @@ -1045,24 +1075,28 @@ Value *llvm::emitUnaryFloatFnCall(Value *Op, const TargetLibraryInfo *TLI, LibFunc LongDoubleFn, IRBuilder<> &B, const AttributeList &Attrs) { // Get the name of the function according to TLI. - StringRef Name = getUnaryFloatFn(TLI, Op->getType(), - DoubleFn, FloatFn, LongDoubleFn); + StringRef Name = getFloatFnName(TLI, Op->getType(), + DoubleFn, FloatFn, LongDoubleFn); return emitUnaryFloatFnCallHelper(Op, Name, B, Attrs); } -Value *llvm::emitBinaryFloatFnCall(Value *Op1, Value *Op2, StringRef Name, - IRBuilder<> &B, const AttributeList &Attrs) { +static Value *emitBinaryFloatFnCallHelper(Value *Op1, Value *Op2, + StringRef Name, IRBuilder<> &B, + const AttributeList &Attrs) { assert((Name != "") && "Must specify Name to emitBinaryFloatFnCall"); - SmallString<20> NameBuffer; - appendTypeSuffix(Op1, Name, NameBuffer); - Module *M = B.GetInsertBlock()->getModule(); - FunctionCallee Callee = M->getOrInsertFunction( - Name, Op1->getType(), Op1->getType(), Op2->getType()); - CallInst *CI = B.CreateCall(Callee, {Op1, Op2}, Name); - CI->setAttributes(Attrs); + FunctionCallee Callee = M->getOrInsertFunction(Name, Op1->getType(), + Op1->getType(), Op2->getType()); + CallInst *CI = B.CreateCall(Callee, { Op1, Op2 }, Name); + + // The incoming attribute set may have come from a speculatable intrinsic, but + // is being replaced with a library call which is not allowed to be + // speculatable. 
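Note on the renames above: hasUnaryFloatFn/getUnaryFloatFn become hasFloatFn/getFloatFnName so the same pair can also serve binary libm routines. A hedged usage sketch that picks the exp variant matching a value's type and emits the call; the wrapper and the choice of exp are illustrative, the signatures are those shown in the hunks.

    #include "llvm/Analysis/TargetLibraryInfo.h"
    #include "llvm/IR/Attributes.h"
    #include "llvm/IR/IRBuilder.h"
    #include "llvm/Transforms/Utils/BuildLibCalls.h"

    using namespace llvm;

    static Value *emitExpForType(Value *Op, const TargetLibraryInfo *TLI,
                                 IRBuilder<> &B) {
      if (!hasFloatFn(TLI, Op->getType(), LibFunc_exp, LibFunc_expf,
                      LibFunc_expl))
        return nullptr;   // no suitable exp/expf/expl for this FP type
      return emitUnaryFloatFnCall(Op, TLI, LibFunc_exp, LibFunc_expf,
                                  LibFunc_expl, B, AttributeList());
    }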
+ CI->setAttributes(Attrs.removeAttribute(B.getContext(), + AttributeList::FunctionIndex, + Attribute::Speculatable)); if (const Function *F = dyn_cast(Callee.getCallee()->stripPointerCasts())) CI->setCallingConv(F->getCallingConv()); @@ -1070,6 +1104,28 @@ Value *llvm::emitBinaryFloatFnCall(Value *Op1, Value *Op2, StringRef Name, return CI; } +Value *llvm::emitBinaryFloatFnCall(Value *Op1, Value *Op2, StringRef Name, + IRBuilder<> &B, const AttributeList &Attrs) { + assert((Name != "") && "Must specify Name to emitBinaryFloatFnCall"); + + SmallString<20> NameBuffer; + appendTypeSuffix(Op1, Name, NameBuffer); + + return emitBinaryFloatFnCallHelper(Op1, Op2, Name, B, Attrs); +} + +Value *llvm::emitBinaryFloatFnCall(Value *Op1, Value *Op2, + const TargetLibraryInfo *TLI, + LibFunc DoubleFn, LibFunc FloatFn, + LibFunc LongDoubleFn, IRBuilder<> &B, + const AttributeList &Attrs) { + // Get the name of the function according to TLI. + StringRef Name = getFloatFnName(TLI, Op1->getType(), + DoubleFn, FloatFn, LongDoubleFn); + + return emitBinaryFloatFnCallHelper(Op1, Op2, Name, B, Attrs); +} + Value *llvm::emitPutChar(Value *Char, IRBuilder<> &B, const TargetLibraryInfo *TLI) { if (!TLI->has(LibFunc_putchar)) diff --git a/lib/Transforms/Utils/BypassSlowDivision.cpp b/lib/Transforms/Utils/BypassSlowDivision.cpp index df299f673f65..9a6761040bd8 100644 --- a/lib/Transforms/Utils/BypassSlowDivision.cpp +++ b/lib/Transforms/Utils/BypassSlowDivision.cpp @@ -448,13 +448,17 @@ bool llvm::bypassSlowDivision(BasicBlock *BB, DivCacheTy PerBBDivCache; bool MadeChange = false; - Instruction* Next = &*BB->begin(); + Instruction *Next = &*BB->begin(); while (Next != nullptr) { // We may add instructions immediately after I, but we want to skip over // them. - Instruction* I = Next; + Instruction *I = Next; Next = Next->getNextNode(); + // Ignore dead code to save time and avoid bugs. + if (I->hasNUses(0)) + continue; + FastDivInsertionTask Task(I, BypassWidths); if (Value *Replacement = Task.getReplacement(PerBBDivCache)) { I->replaceAllUsesWith(Replacement); diff --git a/lib/Transforms/Utils/CanonicalizeAliases.cpp b/lib/Transforms/Utils/CanonicalizeAliases.cpp index 455fcbb1cf98..3c7c8d872595 100644 --- a/lib/Transforms/Utils/CanonicalizeAliases.cpp +++ b/lib/Transforms/Utils/CanonicalizeAliases.cpp @@ -33,6 +33,7 @@ #include "llvm/IR/Operator.h" #include "llvm/IR/ValueHandle.h" +#include "llvm/Pass.h" using namespace llvm; diff --git a/lib/Transforms/Utils/CloneFunction.cpp b/lib/Transforms/Utils/CloneFunction.cpp index 1026c9d37038..75e8963303c2 100644 --- a/lib/Transforms/Utils/CloneFunction.cpp +++ b/lib/Transforms/Utils/CloneFunction.cpp @@ -210,6 +210,21 @@ void llvm::CloneFunctionInto(Function *NewFunc, const Function *OldFunc, RemapInstruction(&II, VMap, ModuleLevelChanges ? RF_None : RF_NoModuleLevelChanges, TypeMapper, Materializer); + + // Register all DICompileUnits of the old parent module in the new parent module + auto* OldModule = OldFunc->getParent(); + auto* NewModule = NewFunc->getParent(); + if (OldModule && NewModule && OldModule != NewModule && DIFinder.compile_unit_count()) { + auto* NMD = NewModule->getOrInsertNamedMetadata("llvm.dbg.cu"); + // Avoid multiple insertions of the same DICompileUnit to NMD. 
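Note on the attribute scrub above: when an attribute set inherited from a speculatable intrinsic is transplanted onto a libcall, speculatable must be dropped, because a call to an external library function may not carry it. The scrub in isolation; names are illustrative.

    #include "llvm/IR/Attributes.h"
    #include "llvm/IR/Instructions.h"

    using namespace llvm;

    static void transferIntrinsicAttrs(CallInst *CI, const AttributeList &Attrs) {
      CI->setAttributes(Attrs.removeAttribute(CI->getContext(),
                                              AttributeList::FunctionIndex,
                                              Attribute::Speculatable));
    }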
+ SmallPtrSet Visited; + for (auto* Operand : NMD->operands()) + Visited.insert(Operand); + for (auto* Unit : DIFinder.compile_units()) + // VMap.MD()[Unit] == Unit + if (Visited.insert(Unit).second) + NMD->addOperand(Unit); + } } /// Return a copy of the specified function and add it to that function's diff --git a/lib/Transforms/Utils/CloneModule.cpp b/lib/Transforms/Utils/CloneModule.cpp index 7ddf59becba9..2c8c3abb2922 100644 --- a/lib/Transforms/Utils/CloneModule.cpp +++ b/lib/Transforms/Utils/CloneModule.cpp @@ -48,7 +48,7 @@ std::unique_ptr llvm::CloneModule( function_ref ShouldCloneDefinition) { // First off, we need to create the new module. std::unique_ptr New = - llvm::make_unique(M.getModuleIdentifier(), M.getContext()); + std::make_unique(M.getModuleIdentifier(), M.getContext()); New->setSourceFileName(M.getSourceFileName()); New->setDataLayout(M.getDataLayout()); New->setTargetTriple(M.getTargetTriple()); @@ -181,13 +181,25 @@ std::unique_ptr llvm::CloneModule( } // And named metadata.... + const auto* LLVM_DBG_CU = M.getNamedMetadata("llvm.dbg.cu"); for (Module::const_named_metadata_iterator I = M.named_metadata_begin(), E = M.named_metadata_end(); I != E; ++I) { const NamedMDNode &NMD = *I; NamedMDNode *NewNMD = New->getOrInsertNamedMetadata(NMD.getName()); - for (unsigned i = 0, e = NMD.getNumOperands(); i != e; ++i) - NewNMD->addOperand(MapMetadata(NMD.getOperand(i), VMap)); + if (&NMD == LLVM_DBG_CU) { + // Do not insert duplicate operands. + SmallPtrSet Visited; + for (const auto* Operand : NewNMD->operands()) + Visited.insert(Operand); + for (const auto* Operand : NMD.operands()) { + auto* MappedOperand = MapMetadata(Operand, VMap); + if (Visited.insert(MappedOperand).second) + NewNMD->addOperand(MappedOperand); + } + } else + for (unsigned i = 0, e = NMD.getNumOperands(); i != e; ++i) + NewNMD->addOperand(MapMetadata(NMD.getOperand(i), VMap)); } return New; diff --git a/lib/Transforms/Utils/CodeExtractor.cpp b/lib/Transforms/Utils/CodeExtractor.cpp index fa6d3f8ae873..0298ff9a395f 100644 --- a/lib/Transforms/Utils/CodeExtractor.cpp +++ b/lib/Transforms/Utils/CodeExtractor.cpp @@ -293,10 +293,8 @@ static BasicBlock *getCommonExitBlock(const SetVector &Blocks) { CommonExitBlock = Succ; continue; } - if (CommonExitBlock == Succ) - continue; - - return true; + if (CommonExitBlock != Succ) + return true; } return false; }; @@ -307,52 +305,79 @@ static BasicBlock *getCommonExitBlock(const SetVector &Blocks) { return CommonExitBlock; } -bool CodeExtractor::isLegalToShrinkwrapLifetimeMarkers( - Instruction *Addr) const { - AllocaInst *AI = cast(Addr->stripInBoundsConstantOffsets()); - Function *Func = (*Blocks.begin())->getParent(); - for (BasicBlock &BB : *Func) { - if (Blocks.count(&BB)) - continue; - for (Instruction &II : BB) { - if (isa(II)) - continue; +CodeExtractorAnalysisCache::CodeExtractorAnalysisCache(Function &F) { + for (BasicBlock &BB : F) { + for (Instruction &II : BB.instructionsWithoutDebug()) + if (auto *AI = dyn_cast(&II)) + Allocas.push_back(AI); - unsigned Opcode = II.getOpcode(); - Value *MemAddr = nullptr; - switch (Opcode) { - case Instruction::Store: - case Instruction::Load: { - if (Opcode == Instruction::Store) { - StoreInst *SI = cast(&II); - MemAddr = SI->getPointerOperand(); - } else { - LoadInst *LI = cast(&II); - MemAddr = LI->getPointerOperand(); - } - // Global variable can not be aliased with locals. 
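Note on the CloneFunctionInto and CloneModule hunks: both guard llvm.dbg.cu against duplicate compile-unit operands by remembering what is already attached before appending. A stripped-down sketch of that dedup step; the function name is illustrative.

    #include "llvm/ADT/SmallPtrSet.h"
    #include "llvm/IR/Metadata.h"
    #include "llvm/IR/Module.h"

    using namespace llvm;

    static void addCompileUnitOnce(Module &M, MDNode *Unit) {
      NamedMDNode *NMD = M.getOrInsertNamedMetadata("llvm.dbg.cu");
      SmallPtrSet<const MDNode *, 8> Seen;
      for (const MDNode *Op : NMD->operands())
        Seen.insert(Op);
      if (Seen.insert(Unit).second)   // second == true: not seen before
        NMD->addOperand(Unit);
    }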
- if (dyn_cast(MemAddr)) - break; - Value *Base = MemAddr->stripInBoundsConstantOffsets(); - if (!isa(Base) || Base == AI) - return false; + findSideEffectInfoForBlock(BB); + } +} + +void CodeExtractorAnalysisCache::findSideEffectInfoForBlock(BasicBlock &BB) { + for (Instruction &II : BB.instructionsWithoutDebug()) { + unsigned Opcode = II.getOpcode(); + Value *MemAddr = nullptr; + switch (Opcode) { + case Instruction::Store: + case Instruction::Load: { + if (Opcode == Instruction::Store) { + StoreInst *SI = cast(&II); + MemAddr = SI->getPointerOperand(); + } else { + LoadInst *LI = cast(&II); + MemAddr = LI->getPointerOperand(); + } + // Global variable can not be aliased with locals. + if (dyn_cast(MemAddr)) break; + Value *Base = MemAddr->stripInBoundsConstantOffsets(); + if (!isa(Base)) { + SideEffectingBlocks.insert(&BB); + return; } - default: { - IntrinsicInst *IntrInst = dyn_cast(&II); - if (IntrInst) { - if (IntrInst->isLifetimeStartOrEnd()) - break; - return false; - } - // Treat all the other cases conservatively if it has side effects. - if (II.mayHaveSideEffects()) - return false; + BaseMemAddrs[&BB].insert(Base); + break; + } + default: { + IntrinsicInst *IntrInst = dyn_cast(&II); + if (IntrInst) { + if (IntrInst->isLifetimeStartOrEnd()) + break; + SideEffectingBlocks.insert(&BB); + return; } + // Treat all the other cases conservatively if it has side effects. + if (II.mayHaveSideEffects()) { + SideEffectingBlocks.insert(&BB); + return; } } + } } +} +bool CodeExtractorAnalysisCache::doesBlockContainClobberOfAddr( + BasicBlock &BB, AllocaInst *Addr) const { + if (SideEffectingBlocks.count(&BB)) + return true; + auto It = BaseMemAddrs.find(&BB); + if (It != BaseMemAddrs.end()) + return It->second.count(Addr); + return false; +} + +bool CodeExtractor::isLegalToShrinkwrapLifetimeMarkers( + const CodeExtractorAnalysisCache &CEAC, Instruction *Addr) const { + AllocaInst *AI = cast(Addr->stripInBoundsConstantOffsets()); + Function *Func = (*Blocks.begin())->getParent(); + for (BasicBlock &BB : *Func) { + if (Blocks.count(&BB)) + continue; + if (CEAC.doesBlockContainClobberOfAddr(BB, AI)) + return false; + } return true; } @@ -415,7 +440,8 @@ CodeExtractor::findOrCreateBlockForHoisting(BasicBlock *CommonExitBlock) { // outline region. If there are not other untracked uses of the address, return // the pair of markers if found; otherwise return a pair of nullptr. CodeExtractor::LifetimeMarkerInfo -CodeExtractor::getLifetimeMarkers(Instruction *Addr, +CodeExtractor::getLifetimeMarkers(const CodeExtractorAnalysisCache &CEAC, + Instruction *Addr, BasicBlock *ExitBlock) const { LifetimeMarkerInfo Info; @@ -447,7 +473,7 @@ CodeExtractor::getLifetimeMarkers(Instruction *Addr, Info.HoistLifeEnd = !definedInRegion(Blocks, Info.LifeEnd); // Do legality check. if ((Info.SinkLifeStart || Info.HoistLifeEnd) && - !isLegalToShrinkwrapLifetimeMarkers(Addr)) + !isLegalToShrinkwrapLifetimeMarkers(CEAC, Addr)) return {}; // Check to see if we have a place to do hoisting, if not, bail. 
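Note on the rewritten shrink-wrap legality check: it now consults a per-function cache instead of re-scanning every block for each alloca; a block is either flagged as having arbitrary side effects, or it carries the set of alloca base addresses it may store through. A data-structure sketch of that lookup; the struct and member names are illustrative, the logic mirrors doesBlockContainClobberOfAddr.

    #include "llvm/ADT/DenseMap.h"
    #include "llvm/ADT/SmallPtrSet.h"
    #include "llvm/IR/BasicBlock.h"
    #include "llvm/IR/Instructions.h"

    using namespace llvm;

    struct BlockClobberInfo {
      SmallPtrSet<const BasicBlock *, 4> SideEffectingBlocks;
      DenseMap<const BasicBlock *, SmallPtrSet<const Value *, 4>> BaseMemAddrs;

      bool mayClobber(const BasicBlock &BB, const AllocaInst *Addr) const {
        if (SideEffectingBlocks.count(&BB))
          return true;               // unknown side effects: stay conservative
        auto It = BaseMemAddrs.find(&BB);
        return It != BaseMemAddrs.end() && It->second.count(Addr);
      }
    };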
@@ -457,7 +483,8 @@ CodeExtractor::getLifetimeMarkers(Instruction *Addr, return Info; } -void CodeExtractor::findAllocas(ValueSet &SinkCands, ValueSet &HoistCands, +void CodeExtractor::findAllocas(const CodeExtractorAnalysisCache &CEAC, + ValueSet &SinkCands, ValueSet &HoistCands, BasicBlock *&ExitBlock) const { Function *Func = (*Blocks.begin())->getParent(); ExitBlock = getCommonExitBlock(Blocks); @@ -478,74 +505,104 @@ void CodeExtractor::findAllocas(ValueSet &SinkCands, ValueSet &HoistCands, return true; }; - for (BasicBlock &BB : *Func) { - if (Blocks.count(&BB)) + // Look up allocas in the original function in CodeExtractorAnalysisCache, as + // this is much faster than walking all the instructions. + for (AllocaInst *AI : CEAC.getAllocas()) { + BasicBlock *BB = AI->getParent(); + if (Blocks.count(BB)) continue; - for (Instruction &II : BB) { - auto *AI = dyn_cast(&II); - if (!AI) - continue; - LifetimeMarkerInfo MarkerInfo = getLifetimeMarkers(AI, ExitBlock); - bool Moved = moveOrIgnoreLifetimeMarkers(MarkerInfo); - if (Moved) { - LLVM_DEBUG(dbgs() << "Sinking alloca: " << *AI << "\n"); - SinkCands.insert(AI); - continue; - } + // As a prior call to extractCodeRegion() may have shrinkwrapped the alloca, + // check whether it is actually still in the original function. + Function *AIFunc = BB->getParent(); + if (AIFunc != Func) + continue; - // Follow any bitcasts. - SmallVector Bitcasts; - SmallVector BitcastLifetimeInfo; - for (User *U : AI->users()) { - if (U->stripInBoundsConstantOffsets() == AI) { - Instruction *Bitcast = cast(U); - LifetimeMarkerInfo LMI = getLifetimeMarkers(Bitcast, ExitBlock); - if (LMI.LifeStart) { - Bitcasts.push_back(Bitcast); - BitcastLifetimeInfo.push_back(LMI); - continue; - } - } + LifetimeMarkerInfo MarkerInfo = getLifetimeMarkers(CEAC, AI, ExitBlock); + bool Moved = moveOrIgnoreLifetimeMarkers(MarkerInfo); + if (Moved) { + LLVM_DEBUG(dbgs() << "Sinking alloca: " << *AI << "\n"); + SinkCands.insert(AI); + continue; + } - // Found unknown use of AI. - if (!definedInRegion(Blocks, U)) { - Bitcasts.clear(); - break; + // Follow any bitcasts. + SmallVector Bitcasts; + SmallVector BitcastLifetimeInfo; + for (User *U : AI->users()) { + if (U->stripInBoundsConstantOffsets() == AI) { + Instruction *Bitcast = cast(U); + LifetimeMarkerInfo LMI = getLifetimeMarkers(CEAC, Bitcast, ExitBlock); + if (LMI.LifeStart) { + Bitcasts.push_back(Bitcast); + BitcastLifetimeInfo.push_back(LMI); + continue; } } - // Either no bitcasts reference the alloca or there are unknown uses. - if (Bitcasts.empty()) - continue; + // Found unknown use of AI. + if (!definedInRegion(Blocks, U)) { + Bitcasts.clear(); + break; + } + } - LLVM_DEBUG(dbgs() << "Sinking alloca (via bitcast): " << *AI << "\n"); - SinkCands.insert(AI); - for (unsigned I = 0, E = Bitcasts.size(); I != E; ++I) { - Instruction *BitcastAddr = Bitcasts[I]; - const LifetimeMarkerInfo &LMI = BitcastLifetimeInfo[I]; - assert(LMI.LifeStart && - "Unsafe to sink bitcast without lifetime markers"); - moveOrIgnoreLifetimeMarkers(LMI); - if (!definedInRegion(Blocks, BitcastAddr)) { - LLVM_DEBUG(dbgs() << "Sinking bitcast-of-alloca: " << *BitcastAddr - << "\n"); - SinkCands.insert(BitcastAddr); - } + // Either no bitcasts reference the alloca or there are unknown uses. 
+ if (Bitcasts.empty()) + continue; + + LLVM_DEBUG(dbgs() << "Sinking alloca (via bitcast): " << *AI << "\n"); + SinkCands.insert(AI); + for (unsigned I = 0, E = Bitcasts.size(); I != E; ++I) { + Instruction *BitcastAddr = Bitcasts[I]; + const LifetimeMarkerInfo &LMI = BitcastLifetimeInfo[I]; + assert(LMI.LifeStart && + "Unsafe to sink bitcast without lifetime markers"); + moveOrIgnoreLifetimeMarkers(LMI); + if (!definedInRegion(Blocks, BitcastAddr)) { + LLVM_DEBUG(dbgs() << "Sinking bitcast-of-alloca: " << *BitcastAddr + << "\n"); + SinkCands.insert(BitcastAddr); } } } } +bool CodeExtractor::isEligible() const { + if (Blocks.empty()) + return false; + BasicBlock *Header = *Blocks.begin(); + Function *F = Header->getParent(); + + // For functions with varargs, check that varargs handling is only done in the + // outlined function, i.e vastart and vaend are only used in outlined blocks. + if (AllowVarArgs && F->getFunctionType()->isVarArg()) { + auto containsVarArgIntrinsic = [](const Instruction &I) { + if (const CallInst *CI = dyn_cast(&I)) + if (const Function *Callee = CI->getCalledFunction()) + return Callee->getIntrinsicID() == Intrinsic::vastart || + Callee->getIntrinsicID() == Intrinsic::vaend; + return false; + }; + + for (auto &BB : *F) { + if (Blocks.count(&BB)) + continue; + if (llvm::any_of(BB, containsVarArgIntrinsic)) + return false; + } + } + return true; +} + void CodeExtractor::findInputsOutputs(ValueSet &Inputs, ValueSet &Outputs, const ValueSet &SinkCands) const { for (BasicBlock *BB : Blocks) { // If a used value is defined outside the region, it's an input. If an // instruction is used outside the region, it's an output. for (Instruction &II : *BB) { - for (User::op_iterator OI = II.op_begin(), OE = II.op_end(); OI != OE; - ++OI) { - Value *V = *OI; + for (auto &OI : II.operands()) { + Value *V = OI; if (!SinkCands.count(V) && definedInCaller(Blocks, V)) Inputs.insert(V); } @@ -904,12 +961,12 @@ Function *CodeExtractor::constructFunction(const ValueSet &inputs, // within the new function. This must be done before we lose track of which // blocks were originally in the code region. std::vector Users(header->user_begin(), header->user_end()); - for (unsigned i = 0, e = Users.size(); i != e; ++i) + for (auto &U : Users) // The BasicBlock which contains the branch is not in the region // modify the branch target to a new block - if (Instruction *I = dyn_cast(Users[i])) - if (I->isTerminator() && !Blocks.count(I->getParent()) && - I->getParent()->getParent() == oldFunction) + if (Instruction *I = dyn_cast(U)) + if (I->isTerminator() && I->getFunction() == oldFunction && + !Blocks.count(I->getParent())) I->replaceUsesOfWith(header, newHeader); return newFunction; @@ -1277,13 +1334,6 @@ void CodeExtractor::moveCodeToFunction(Function *newFunction) { // Insert this basic block into the new function newBlocks.push_back(Block); - - // Remove @llvm.assume calls that were moved to the new function from the - // old function's assumption cache. 
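Note on isEligible above: it now also owns the varargs rule that used to live in extractCodeRegion; extracting from a varargs function is only legal if va_start/va_end occur exclusively inside the outlined blocks. The intrinsic test, pulled out as a standalone sketch (name illustrative).

    #include "llvm/IR/Function.h"
    #include "llvm/IR/IntrinsicInst.h"

    using namespace llvm;

    static bool isVAHandlingCall(const Instruction &I) {
      if (const auto *CI = dyn_cast<CallInst>(&I))
        if (const Function *Callee = CI->getCalledFunction())
          return Callee->getIntrinsicID() == Intrinsic::vastart ||
                 Callee->getIntrinsicID() == Intrinsic::vaend;
      return false;
    }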
- if (AC) - for (auto &I : *Block) - if (match(&I, m_Intrinsic())) - AC->unregisterAssumption(cast(&I)); } } @@ -1332,7 +1382,8 @@ void CodeExtractor::calculateNewCallTerminatorWeights( MDBuilder(TI->getContext()).createBranchWeights(BranchWeights)); } -Function *CodeExtractor::extractCodeRegion() { +Function * +CodeExtractor::extractCodeRegion(const CodeExtractorAnalysisCache &CEAC) { if (!isEligible()) return nullptr; @@ -1341,27 +1392,6 @@ Function *CodeExtractor::extractCodeRegion() { BasicBlock *header = *Blocks.begin(); Function *oldFunction = header->getParent(); - // For functions with varargs, check that varargs handling is only done in the - // outlined function, i.e vastart and vaend are only used in outlined blocks. - if (AllowVarArgs && oldFunction->getFunctionType()->isVarArg()) { - auto containsVarArgIntrinsic = [](Instruction &I) { - if (const CallInst *CI = dyn_cast(&I)) - if (const Function *F = CI->getCalledFunction()) - return F->getIntrinsicID() == Intrinsic::vastart || - F->getIntrinsicID() == Intrinsic::vaend; - return false; - }; - - for (auto &BB : *oldFunction) { - if (Blocks.count(&BB)) - continue; - if (llvm::any_of(BB, containsVarArgIntrinsic)) - return nullptr; - } - } - ValueSet inputs, outputs, SinkingCands, HoistingCands; - BasicBlock *CommonExit = nullptr; - // Calculate the entry frequency of the new function before we change the root // block. BlockFrequency EntryFreq; @@ -1375,6 +1405,15 @@ Function *CodeExtractor::extractCodeRegion() { } } + if (AC) { + // Remove @llvm.assume calls that were moved to the new function from the + // old function's assumption cache. + for (BasicBlock *Block : Blocks) + for (auto &I : *Block) + if (match(&I, m_Intrinsic())) + AC->unregisterAssumption(cast(&I)); + } + // If we have any return instructions in the region, split those blocks so // that the return is not in the region. splitReturnBlocks(); @@ -1428,7 +1467,9 @@ Function *CodeExtractor::extractCodeRegion() { } newFuncRoot->getInstList().push_back(BranchI); - findAllocas(SinkingCands, HoistingCands, CommonExit); + ValueSet inputs, outputs, SinkingCands, HoistingCands; + BasicBlock *CommonExit = nullptr; + findAllocas(CEAC, SinkingCands, HoistingCands, CommonExit); assert(HoistingCands.empty() || CommonExit); // Find inputs to, outputs from the code region. 
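Note on the signature change above: with extractCodeRegion taking the analysis cache as a parameter, the intended pattern is to build one CodeExtractorAnalysisCache per function and reuse it for every region extracted from that function. A hedged sketch of such a driver; the function, the region container, and the defaulted CodeExtractor arguments are illustrative.

    #include "llvm/ADT/ArrayRef.h"
    #include "llvm/ADT/SmallVector.h"
    #include "llvm/IR/Function.h"
    #include "llvm/Transforms/Utils/CodeExtractor.h"

    using namespace llvm;

    static void extractAllRegions(Function &F,
                                  ArrayRef<SmallVector<BasicBlock *, 8>> Regions) {
      CodeExtractorAnalysisCache CEAC(F);   // whole-function scan, done once
      for (const auto &Blocks : Regions) {
        CodeExtractor CE(Blocks);
        if (CE.isEligible())
          CE.extractCodeRegion(CEAC);
      }
    }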
@@ -1563,5 +1604,17 @@ Function *CodeExtractor::extractCodeRegion() { }); LLVM_DEBUG(if (verifyFunction(*oldFunction)) report_fatal_error("verification of oldFunction failed!")); + LLVM_DEBUG(if (AC && verifyAssumptionCache(*oldFunction, AC)) + report_fatal_error("Stale Asumption cache for old Function!")); return newFunction; } + +bool CodeExtractor::verifyAssumptionCache(const Function& F, + AssumptionCache *AC) { + for (auto AssumeVH : AC->assumptions()) { + CallInst *I = cast(AssumeVH); + if (I->getFunction() != &F) + return true; + } + return false; +} diff --git a/lib/Transforms/Utils/EntryExitInstrumenter.cpp b/lib/Transforms/Utils/EntryExitInstrumenter.cpp index 4aa40eeadda4..57e2ff0251a9 100644 --- a/lib/Transforms/Utils/EntryExitInstrumenter.cpp +++ b/lib/Transforms/Utils/EntryExitInstrumenter.cpp @@ -24,7 +24,7 @@ static void insertCall(Function &CurFn, StringRef Func, if (Func == "mcount" || Func == ".mcount" || - Func == "\01__gnu_mcount_nc" || + Func == "llvm.arm.gnu.eabi.mcount" || Func == "\01_mcount" || Func == "\01mcount" || Func == "__mcount" || diff --git a/lib/Transforms/Utils/Evaluator.cpp b/lib/Transforms/Utils/Evaluator.cpp index 0e203f4e075d..ad36790b8c6a 100644 --- a/lib/Transforms/Utils/Evaluator.cpp +++ b/lib/Transforms/Utils/Evaluator.cpp @@ -469,7 +469,7 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst, return false; // Cannot handle array allocs. } Type *Ty = AI->getAllocatedType(); - AllocaTmps.push_back(llvm::make_unique( + AllocaTmps.push_back(std::make_unique( Ty, false, GlobalValue::InternalLinkage, UndefValue::get(Ty), AI->getName(), /*TLMode=*/GlobalValue::NotThreadLocal, AI->getType()->getPointerAddressSpace())); diff --git a/lib/Transforms/Utils/FlattenCFG.cpp b/lib/Transforms/Utils/FlattenCFG.cpp index 0c52e6f3703b..893f23eb6048 100644 --- a/lib/Transforms/Utils/FlattenCFG.cpp +++ b/lib/Transforms/Utils/FlattenCFG.cpp @@ -67,7 +67,7 @@ public: /// Before: /// ...... /// %cmp10 = fcmp une float %tmp1, %tmp2 -/// br i1 %cmp1, label %if.then, label %lor.rhs +/// br i1 %cmp10, label %if.then, label %lor.rhs /// /// lor.rhs: /// ...... @@ -251,8 +251,8 @@ bool FlattenCFGOpt::FlattenParallelAndOr(BasicBlock *BB, IRBuilder<> &Builder) { bool EverChanged = false; for (; CurrBlock != FirstCondBlock; CurrBlock = CurrBlock->getSinglePredecessor()) { - BranchInst *BI = dyn_cast(CurrBlock->getTerminator()); - CmpInst *CI = dyn_cast(BI->getCondition()); + auto *BI = cast(CurrBlock->getTerminator()); + auto *CI = dyn_cast(BI->getCondition()); if (!CI) continue; @@ -278,7 +278,7 @@ bool FlattenCFGOpt::FlattenParallelAndOr(BasicBlock *BB, IRBuilder<> &Builder) { // Do the transformation. 
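Note on the new verifyAssumptionCache debug check: it pairs with the earlier move of unregisterAssumption into extractCodeRegion; any @llvm.assume that migrated into the outlined function but is still registered against the old function marks the cache as stale. Essentially the same walk, shown standalone (name illustrative).

    #include "llvm/Analysis/AssumptionCache.h"
    #include "llvm/IR/Instructions.h"

    using namespace llvm;

    static bool assumptionCacheIsStale(const Function &F, AssumptionCache &AC) {
      for (auto AssumeVH : AC.assumptions())
        if (cast<CallInst>(AssumeVH)->getFunction() != &F)
          return true;   // an assume left F without being unregistered
      return false;
    }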
BasicBlock *CB; - BranchInst *PBI = dyn_cast(FirstCondBlock->getTerminator()); + BranchInst *PBI = cast(FirstCondBlock->getTerminator()); bool Iteration = true; IRBuilder<>::InsertPointGuard Guard(Builder); Value *PC = PBI->getCondition(); @@ -444,7 +444,7 @@ bool FlattenCFGOpt::MergeIfRegion(BasicBlock *BB, IRBuilder<> &Builder) { FirstEntryBlock->getInstList().pop_back(); FirstEntryBlock->getInstList() .splice(FirstEntryBlock->end(), SecondEntryBlock->getInstList()); - BranchInst *PBI = dyn_cast(FirstEntryBlock->getTerminator()); + BranchInst *PBI = cast(FirstEntryBlock->getTerminator()); Value *CC = PBI->getCondition(); BasicBlock *SaveInsertBB = Builder.GetInsertBlock(); BasicBlock::iterator SaveInsertPt = Builder.GetInsertPoint(); @@ -453,6 +453,16 @@ bool FlattenCFGOpt::MergeIfRegion(BasicBlock *BB, IRBuilder<> &Builder) { PBI->replaceUsesOfWith(CC, NC); Builder.SetInsertPoint(SaveInsertBB, SaveInsertPt); + // Handle PHI node to replace its predecessors to FirstEntryBlock. + for (BasicBlock *Succ : successors(PBI)) { + for (PHINode &Phi : Succ->phis()) { + for (unsigned i = 0, e = Phi.getNumIncomingValues(); i != e; ++i) { + if (Phi.getIncomingBlock(i) == SecondEntryBlock) + Phi.setIncomingBlock(i, FirstEntryBlock); + } + } + } + // Remove IfTrue1 if (IfTrue1 != FirstEntryBlock) { IfTrue1->dropAllReferences(); diff --git a/lib/Transforms/Utils/FunctionImportUtils.cpp b/lib/Transforms/Utils/FunctionImportUtils.cpp index c9cc0990f237..76b4635ad501 100644 --- a/lib/Transforms/Utils/FunctionImportUtils.cpp +++ b/lib/Transforms/Utils/FunctionImportUtils.cpp @@ -210,7 +210,7 @@ void FunctionImportGlobalProcessing::processGlobalForThinLTO(GlobalValue &GV) { if (Function *F = dyn_cast(&GV)) { if (!F->isDeclaration()) { for (auto &S : VI.getSummaryList()) { - FunctionSummary *FS = dyn_cast(S->getBaseObject()); + auto *FS = cast(S->getBaseObject()); if (FS->modulePath() == M.getModuleIdentifier()) { F->setEntryCount(Function::ProfileCount(FS->entryCount(), Function::PCT_Synthetic)); diff --git a/lib/Transforms/Utils/ImportedFunctionsInliningStatistics.cpp b/lib/Transforms/Utils/ImportedFunctionsInliningStatistics.cpp index 8041e66e6c4c..ea93f99d69e3 100644 --- a/lib/Transforms/Utils/ImportedFunctionsInliningStatistics.cpp +++ b/lib/Transforms/Utils/ImportedFunctionsInliningStatistics.cpp @@ -25,8 +25,8 @@ ImportedFunctionsInliningStatistics::createInlineGraphNode(const Function &F) { auto &ValueLookup = NodesMap[F.getName()]; if (!ValueLookup) { - ValueLookup = llvm::make_unique(); - ValueLookup->Imported = F.getMetadata("thinlto_src_module") != nullptr; + ValueLookup = std::make_unique(); + ValueLookup->Imported = F.hasMetadata("thinlto_src_module"); } return *ValueLookup; } @@ -64,7 +64,7 @@ void ImportedFunctionsInliningStatistics::setModuleInfo(const Module &M) { if (F.isDeclaration()) continue; AllFunctions++; - ImportedFunctions += int(F.getMetadata("thinlto_src_module") != nullptr); + ImportedFunctions += int(F.hasMetadata("thinlto_src_module")); } } static std::string getStatString(const char *Msg, int32_t Fraction, int32_t All, diff --git a/lib/Transforms/Utils/LibCallsShrinkWrap.cpp b/lib/Transforms/Utils/LibCallsShrinkWrap.cpp index 8c67d1dc6eb3..ed28fffc22b5 100644 --- a/lib/Transforms/Utils/LibCallsShrinkWrap.cpp +++ b/lib/Transforms/Utils/LibCallsShrinkWrap.cpp @@ -533,7 +533,7 @@ static bool runImpl(Function &F, const TargetLibraryInfo &TLI, } bool LibCallsShrinkWrapLegacyPass::runOnFunction(Function &F) { - auto &TLI = getAnalysis().getTLI(); + auto &TLI = 
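Note on the MergeIfRegion fix: PHI incoming blocks are retargeted after the second entry block is folded into the first, otherwise successors of the merged branch would still name the deleted block. A sketch of that fixup on its own; the helper name is illustrative, the traversal matches the hunk.

    #include "llvm/IR/BasicBlock.h"
    #include "llvm/IR/CFG.h"
    #include "llvm/IR/Instructions.h"

    using namespace llvm;

    static void retargetPHIIncomingBlocks(Instruction *MergedTerm,
                                          BasicBlock *OldPred,
                                          BasicBlock *NewPred) {
      for (BasicBlock *Succ : successors(MergedTerm))
        for (PHINode &Phi : Succ->phis())
          for (unsigned i = 0, e = Phi.getNumIncomingValues(); i != e; ++i)
            if (Phi.getIncomingBlock(i) == OldPred)
              Phi.setIncomingBlock(i, NewPred);
    }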
getAnalysis().getTLI(F); auto *DTWP = getAnalysisIfAvailable(); auto *DT = DTWP ? &DTWP->getDomTree() : nullptr; return runImpl(F, TLI, DT); diff --git a/lib/Transforms/Utils/Local.cpp b/lib/Transforms/Utils/Local.cpp index 39b6b889f91c..5bcd05757ec1 100644 --- a/lib/Transforms/Utils/Local.cpp +++ b/lib/Transforms/Utils/Local.cpp @@ -324,8 +324,14 @@ bool llvm::ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions, Value *Address = IBI->getAddress(); IBI->eraseFromParent(); if (DeleteDeadConditions) + // Delete pointer cast instructions. RecursivelyDeleteTriviallyDeadInstructions(Address, TLI); + // Also zap the blockaddress constant if there are no users remaining, + // otherwise the destination is still marked as having its address taken. + if (BA->use_empty()) + BA->destroyConstant(); + // If we didn't find our destination in the IBI successor list, then we // have undefined behavior. Replace the unconditional branch with an // 'unreachable' instruction. @@ -633,17 +639,6 @@ bool llvm::SimplifyInstructionsInBlock(BasicBlock *BB, // Control Flow Graph Restructuring. // -/// RemovePredecessorAndSimplify - Like BasicBlock::removePredecessor, this -/// method is called when we're about to delete Pred as a predecessor of BB. If -/// BB contains any PHI nodes, this drops the entries in the PHI nodes for Pred. -/// -/// Unlike the removePredecessor method, this attempts to simplify uses of PHI -/// nodes that collapse into identity values. For example, if we have: -/// x = phi(1, 0, 0, 0) -/// y = and x, z -/// -/// .. and delete the predecessor corresponding to the '1', this will attempt to -/// recursively fold the and to 0. void llvm::RemovePredecessorAndSimplify(BasicBlock *BB, BasicBlock *Pred, DomTreeUpdater *DTU) { // This only adjusts blocks with PHI nodes. @@ -672,10 +667,6 @@ void llvm::RemovePredecessorAndSimplify(BasicBlock *BB, BasicBlock *Pred, DTU->applyUpdatesPermissive({{DominatorTree::Delete, Pred, BB}}); } -/// MergeBasicBlockIntoOnlyPred - DestBB is a block with one predecessor and its -/// predecessor is known to have one successor (DestBB!). Eliminate the edge -/// between them, moving the instructions in the predecessor into DestBB and -/// deleting the predecessor block. void llvm::MergeBasicBlockIntoOnlyPred(BasicBlock *DestBB, DomTreeUpdater *DTU) { @@ -755,15 +746,14 @@ void llvm::MergeBasicBlockIntoOnlyPred(BasicBlock *DestBB, } } -/// CanMergeValues - Return true if we can choose one of these values to use -/// in place of the other. Note that we will always choose the non-undef -/// value to keep. +/// Return true if we can choose one of these values to use in place of the +/// other. Note that we will always choose the non-undef value to keep. static bool CanMergeValues(Value *First, Value *Second) { return First == Second || isa(First) || isa(Second); } -/// CanPropagatePredecessorsForPHIs - Return true if we can fold BB, an -/// almost-empty BB ending in an unconditional branch to Succ, into Succ. +/// Return true if we can fold BB, an almost-empty BB ending in an unconditional +/// branch to Succ, into Succ. /// /// Assumption: Succ is the single successor for BB. 
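Two notes on the Local.cpp hunks above: the local Align variables are renamed Alignment so they no longer shadow the new llvm::Align type, and ConstantFoldTerminator now destroys a blockaddress constant once the folded indirectbr was its last user, so the target block stops being reported as address-taken. The latter as a minimal sketch; the caller is assumed to have already erased the indirectbr.

    #include "llvm/IR/Constants.h"

    using namespace llvm;

    static void dropDeadBlockAddress(BlockAddress *BA) {
      // With no remaining users, the constant itself keeps the block marked
      // as having its address taken; destroying it clears that state.
      if (BA->use_empty())
        BA->destroyConstant();
    }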
static bool CanPropagatePredecessorsForPHIs(BasicBlock *BB, BasicBlock *Succ) { @@ -956,11 +946,6 @@ static void redirectValuesFromPredecessorsToPhi(BasicBlock *BB, replaceUndefValuesInPhi(PN, IncomingValues); } -/// TryToSimplifyUncondBranchFromEmptyBlock - BB is known to contain an -/// unconditional branch, and contains no instructions other than PHI nodes, -/// potential side-effect free intrinsics and the branch. If possible, -/// eliminate BB by rewriting all the predecessors to branch to the successor -/// block and return true. If we can't transform, return false. bool llvm::TryToSimplifyUncondBranchFromEmptyBlock(BasicBlock *BB, DomTreeUpdater *DTU) { assert(BB != &BB->getParent()->getEntryBlock() && @@ -1088,10 +1073,6 @@ bool llvm::TryToSimplifyUncondBranchFromEmptyBlock(BasicBlock *BB, return true; } -/// EliminateDuplicatePHINodes - Check for and eliminate duplicate PHI -/// nodes in this block. This doesn't try to be clever about PHI nodes -/// which differ only in the order of the incoming values, but instcombine -/// orders them so it usually won't matter. bool llvm::EliminateDuplicatePHINodes(BasicBlock *BB) { // This implementation doesn't currently consider undef operands // specially. Theoretically, two phis which are identical except for @@ -1151,10 +1132,10 @@ bool llvm::EliminateDuplicatePHINodes(BasicBlock *BB) { /// often possible though. If alignment is important, a more reliable approach /// is to simply align all global variables and allocation instructions to /// their preferred alignment from the beginning. -static unsigned enforceKnownAlignment(Value *V, unsigned Align, +static unsigned enforceKnownAlignment(Value *V, unsigned Alignment, unsigned PrefAlign, const DataLayout &DL) { - assert(PrefAlign > Align); + assert(PrefAlign > Alignment); V = V->stripPointerCasts(); @@ -1165,36 +1146,36 @@ static unsigned enforceKnownAlignment(Value *V, unsigned Align, // stripPointerCasts recurses through infinite layers of bitcasts, // while computeKnownBits is not allowed to traverse more than 6 // levels. - Align = std::max(AI->getAlignment(), Align); - if (PrefAlign <= Align) - return Align; + Alignment = std::max(AI->getAlignment(), Alignment); + if (PrefAlign <= Alignment) + return Alignment; // If the preferred alignment is greater than the natural stack alignment // then don't round up. This avoids dynamic stack realignment. - if (DL.exceedsNaturalStackAlignment(PrefAlign)) - return Align; - AI->setAlignment(PrefAlign); + if (DL.exceedsNaturalStackAlignment(Align(PrefAlign))) + return Alignment; + AI->setAlignment(MaybeAlign(PrefAlign)); return PrefAlign; } if (auto *GO = dyn_cast(V)) { // TODO: as above, this shouldn't be necessary. - Align = std::max(GO->getAlignment(), Align); - if (PrefAlign <= Align) - return Align; + Alignment = std::max(GO->getAlignment(), Alignment); + if (PrefAlign <= Alignment) + return Alignment; // If there is a large requested alignment and we can, bump up the alignment // of the global. If the memory we set aside for the global may not be the // memory used by the final program then it is impossible for us to reliably // enforce the preferred alignment. 
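Note on replaceOneDbgValueForAlloca: the offset expression is now built with DIExpression::prepend rather than spliced by hand, and, per the updated comment, the offset is placed before the leading DW_OP_deref. Reduced to a sketch; the helper name is illustrative, the call shape is taken from the hunk.

    #include "llvm/IR/DebugInfoMetadata.h"

    using namespace llvm;

    static DIExpression *addConstantOffset(DIExpression *Expr, int64_t Offset) {
      // Flags = 0: no extra deref or stack-value marker, just the offset.
      return Offset ? DIExpression::prepend(Expr, /*Flags=*/0, Offset) : Expr;
    }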
if (!GO->canIncreaseAlignment()) - return Align; + return Alignment; - GO->setAlignment(PrefAlign); + GO->setAlignment(MaybeAlign(PrefAlign)); return PrefAlign; } - return Align; + return Alignment; } unsigned llvm::getOrEnforceKnownAlignment(Value *V, unsigned PrefAlign, @@ -1397,7 +1378,12 @@ void llvm::ConvertDebugDeclareToDebugValue(DbgVariableIntrinsic *DII, /// Determine whether this alloca is either a VLA or an array. static bool isArray(AllocaInst *AI) { return AI->isArrayAllocation() || - AI->getType()->getElementType()->isArrayTy(); + (AI->getAllocatedType() && AI->getAllocatedType()->isArrayTy()); +} + +/// Determine whether this alloca is a structure. +static bool isStructure(AllocaInst *AI) { + return AI->getAllocatedType() && AI->getAllocatedType()->isStructTy(); } /// LowerDbgDeclare - Lowers llvm.dbg.declare intrinsics into appropriate set @@ -1422,7 +1408,7 @@ bool llvm::LowerDbgDeclare(Function &F) { // stored on the stack, while the dbg.declare can only describe // the stack slot (and at a lexical-scope granularity). Later // passes will attempt to elide the stack slot. - if (!AI || isArray(AI)) + if (!AI || isArray(AI) || isStructure(AI)) continue; // A volatile load/store means that the alloca can't be elided anyway. @@ -1591,15 +1577,10 @@ static void replaceOneDbgValueForAlloca(DbgValueInst *DVI, Value *NewAddress, DIExpr->getElement(0) != dwarf::DW_OP_deref) return; - // Insert the offset immediately after the first deref. + // Insert the offset before the first deref. // We could just change the offset argument of dbg.value, but it's unsigned... - if (Offset) { - SmallVector Ops; - Ops.push_back(dwarf::DW_OP_deref); - DIExpression::appendOffset(Ops, Offset); - Ops.append(DIExpr->elements_begin() + 1, DIExpr->elements_end()); - DIExpr = Builder.createExpression(Ops); - } + if (Offset) + DIExpr = DIExpression::prepend(DIExpr, 0, Offset); Builder.insertDbgValueIntrinsic(NewAddress, DIVar, DIExpr, Loc, DVI); DVI->eraseFromParent(); @@ -1957,18 +1938,24 @@ unsigned llvm::changeToUnreachable(Instruction *I, bool UseLLVMTrap, return NumInstrsRemoved; } -/// changeToCall - Convert the specified invoke into a normal call. -static void changeToCall(InvokeInst *II, DomTreeUpdater *DTU = nullptr) { - SmallVector Args(II->arg_begin(), II->arg_end()); +CallInst *llvm::createCallMatchingInvoke(InvokeInst *II) { + SmallVector Args(II->arg_begin(), II->arg_end()); SmallVector OpBundles; II->getOperandBundlesAsDefs(OpBundles); - CallInst *NewCall = CallInst::Create( - II->getFunctionType(), II->getCalledValue(), Args, OpBundles, "", II); - NewCall->takeName(II); + CallInst *NewCall = CallInst::Create(II->getFunctionType(), + II->getCalledValue(), Args, OpBundles); NewCall->setCallingConv(II->getCallingConv()); NewCall->setAttributes(II->getAttributes()); NewCall->setDebugLoc(II->getDebugLoc()); NewCall->copyMetadata(*II); + return NewCall; +} + +/// changeToCall - Convert the specified invoke into a normal call. +void llvm::changeToCall(InvokeInst *II, DomTreeUpdater *DTU) { + CallInst *NewCall = createCallMatchingInvoke(II); + NewCall->takeName(II); + NewCall->insertBefore(II); II->replaceAllUsesWith(NewCall); // Follow the call by a branch to the normal destination. @@ -2223,12 +2210,10 @@ void llvm::removeUnwindEdge(BasicBlock *BB, DomTreeUpdater *DTU) { /// removeUnreachableBlocks - Remove blocks that are not reachable, even /// if they are in a dead cycle. Return true if a change was made, false -/// otherwise. 
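Note on the invoke-to-call split above: changeToCall is now built on a separately exposed createCallMatchingInvoke, which clones the invoke's callee, arguments, operand bundles, attributes and debug location into an unplaced call, letting a caller stage the replacement before deciding how to rewrite the CFG. A small hedged sketch, assuming the declaration lives alongside changeToCall in Transforms/Utils/Local.h.

    #include "llvm/IR/Instructions.h"
    #include "llvm/Transforms/Utils/Local.h"

    using namespace llvm;

    static CallInst *stageCallForInvoke(InvokeInst *II) {
      // The returned call is not inserted anywhere yet; the caller picks the
      // insertion point and is responsible for retiring the invoke, e.g. by
      // letting changeToCall(II) perform the full rewrite instead.
      return createCallMatchingInvoke(II);
    }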
If `LVI` is passed, this function preserves LazyValueInfo -/// after modifying the CFG. -bool llvm::removeUnreachableBlocks(Function &F, LazyValueInfo *LVI, - DomTreeUpdater *DTU, +/// otherwise. +bool llvm::removeUnreachableBlocks(Function &F, DomTreeUpdater *DTU, MemorySSAUpdater *MSSAU) { - SmallPtrSet Reachable; + SmallPtrSet Reachable; bool Changed = markAliveBlocks(F, Reachable, DTU); // If there are unreachable blocks in the CFG... @@ -2236,21 +2221,21 @@ bool llvm::removeUnreachableBlocks(Function &F, LazyValueInfo *LVI, return Changed; assert(Reachable.size() < F.size()); - NumRemoved += F.size()-Reachable.size(); + NumRemoved += F.size() - Reachable.size(); SmallSetVector DeadBlockSet; - for (Function::iterator I = ++F.begin(), E = F.end(); I != E; ++I) { - auto *BB = &*I; - if (Reachable.count(BB)) + for (BasicBlock &BB : F) { + // Skip reachable basic blocks + if (Reachable.find(&BB) != Reachable.end()) continue; - DeadBlockSet.insert(BB); + DeadBlockSet.insert(&BB); } if (MSSAU) MSSAU->removeBlocks(DeadBlockSet); // Loop over all of the basic blocks that are not reachable, dropping all of - // their internal references. Update DTU and LVI if available. + // their internal references. Update DTU if available. std::vector Updates; for (auto *BB : DeadBlockSet) { for (BasicBlock *Successor : successors(BB)) { @@ -2259,26 +2244,18 @@ bool llvm::removeUnreachableBlocks(Function &F, LazyValueInfo *LVI, if (DTU) Updates.push_back({DominatorTree::Delete, BB, Successor}); } - if (LVI) - LVI->eraseBlock(BB); BB->dropAllReferences(); - } - for (Function::iterator I = ++F.begin(); I != F.end();) { - auto *BB = &*I; - if (Reachable.count(BB)) { - ++I; - continue; - } if (DTU) { - // Remove the terminator of BB to clear the successor list of BB. - if (BB->getTerminator()) - BB->getInstList().pop_back(); + Instruction *TI = BB->getTerminator(); + assert(TI && "Basic block should have a terminator"); + // Terminators like invoke can have users. We have to replace their users, + // before removing them. + if (!TI->use_empty()) + TI->replaceAllUsesWith(UndefValue::get(TI->getType())); + TI->eraseFromParent(); new UnreachableInst(BB->getContext(), BB); assert(succ_empty(BB) && "The successor list of BB isn't empty before " "applying corresponding DTU updates."); - ++I; - } else { - I = F.getBasicBlockList().erase(I); } } @@ -2294,7 +2271,11 @@ bool llvm::removeUnreachableBlocks(Function &F, LazyValueInfo *LVI, } if (!Deleted) return false; + } else { + for (auto *BB : DeadBlockSet) + BB->eraseFromParent(); } + return true; } @@ -2363,6 +2344,9 @@ void llvm::combineMetadata(Instruction *K, const Instruction *J, K->setMetadata(Kind, MDNode::getMostGenericAlignmentOrDereferenceable(JMD, KMD)); break; + case LLVMContext::MD_preserve_access_index: + // Preserve !preserve.access.index in K. + break; } } // Set !invariant.group from J if J has it. 
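Note on removeUnreachableBlocks: it no longer takes a LazyValueInfo parameter (callers handle LVI invalidation themselves) and tears dead blocks down in a single pass. One subtlety the rewrite makes explicit: a terminator such as invoke can itself have users, so it is replaced with undef before being erased. That teardown step in isolation (name illustrative).

    #include "llvm/IR/BasicBlock.h"
    #include "llvm/IR/Constants.h"
    #include "llvm/IR/Instructions.h"

    using namespace llvm;

    static void detachDeadBlock(BasicBlock *BB) {
      Instruction *TI = BB->getTerminator();
      if (!TI->use_empty())
        TI->replaceAllUsesWith(UndefValue::get(TI->getType()));
      TI->eraseFromParent();
      // Leave a well-formed, successor-free block behind for later deletion.
      new UnreachableInst(BB->getContext(), BB);
    }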
If both instructions have it @@ -2385,10 +2369,61 @@ void llvm::combineMetadataForCSE(Instruction *K, const Instruction *J, LLVMContext::MD_invariant_group, LLVMContext::MD_align, LLVMContext::MD_dereferenceable, LLVMContext::MD_dereferenceable_or_null, - LLVMContext::MD_access_group}; + LLVMContext::MD_access_group, LLVMContext::MD_preserve_access_index}; combineMetadata(K, J, KnownIDs, KDominatesJ); } +void llvm::copyMetadataForLoad(LoadInst &Dest, const LoadInst &Source) { + SmallVector, 8> MD; + Source.getAllMetadata(MD); + MDBuilder MDB(Dest.getContext()); + Type *NewType = Dest.getType(); + const DataLayout &DL = Source.getModule()->getDataLayout(); + for (const auto &MDPair : MD) { + unsigned ID = MDPair.first; + MDNode *N = MDPair.second; + // Note, essentially every kind of metadata should be preserved here! This + // routine is supposed to clone a load instruction changing *only its type*. + // The only metadata it makes sense to drop is metadata which is invalidated + // when the pointer type changes. This should essentially never be the case + // in LLVM, but we explicitly switch over only known metadata to be + // conservatively correct. If you are adding metadata to LLVM which pertains + // to loads, you almost certainly want to add it here. + switch (ID) { + case LLVMContext::MD_dbg: + case LLVMContext::MD_tbaa: + case LLVMContext::MD_prof: + case LLVMContext::MD_fpmath: + case LLVMContext::MD_tbaa_struct: + case LLVMContext::MD_invariant_load: + case LLVMContext::MD_alias_scope: + case LLVMContext::MD_noalias: + case LLVMContext::MD_nontemporal: + case LLVMContext::MD_mem_parallel_loop_access: + case LLVMContext::MD_access_group: + // All of these directly apply. + Dest.setMetadata(ID, N); + break; + + case LLVMContext::MD_nonnull: + copyNonnullMetadata(Source, N, Dest); + break; + + case LLVMContext::MD_align: + case LLVMContext::MD_dereferenceable: + case LLVMContext::MD_dereferenceable_or_null: + // These only directly apply if the new type is also a pointer. + if (NewType->isPointerTy()) + Dest.setMetadata(ID, N); + break; + + case LLVMContext::MD_range: + copyRangeMetadata(DL, Source, N, Dest); + break; + } + } +} + void llvm::patchReplacementInstruction(Instruction *I, Value *Repl) { auto *ReplInst = dyn_cast(Repl); if (!ReplInst) @@ -2417,7 +2452,7 @@ void llvm::patchReplacementInstruction(Instruction *I, Value *Repl) { LLVMContext::MD_noalias, LLVMContext::MD_range, LLVMContext::MD_fpmath, LLVMContext::MD_invariant_load, LLVMContext::MD_invariant_group, LLVMContext::MD_nonnull, - LLVMContext::MD_access_group}; + LLVMContext::MD_access_group, LLVMContext::MD_preserve_access_index}; combineMetadata(ReplInst, I, KnownIDs, false); } diff --git a/lib/Transforms/Utils/LoopRotationUtils.cpp b/lib/Transforms/Utils/LoopRotationUtils.cpp index 37389a695b45..889ea5ca9970 100644 --- a/lib/Transforms/Utils/LoopRotationUtils.cpp +++ b/lib/Transforms/Utils/LoopRotationUtils.cpp @@ -615,30 +615,9 @@ bool LoopRotate::simplifyLoopLatch(Loop *L) { LLVM_DEBUG(dbgs() << "Folding loop latch " << Latch->getName() << " into " << LastExit->getName() << "\n"); - // Hoist the instructions from Latch into LastExit. - Instruction *FirstLatchInst = &*(Latch->begin()); - LastExit->getInstList().splice(BI->getIterator(), Latch->getInstList(), - Latch->begin(), Jmp->getIterator()); - - // Update MemorySSA - if (MSSAU) - MSSAU->moveAllAfterMergeBlocks(Latch, LastExit, FirstLatchInst); - - unsigned FallThruPath = BI->getSuccessor(0) == Latch ? 
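Note on the new copyMetadataForLoad: it centralizes which metadata kinds survive when a load is re-emitted at a different type; most kinds transfer directly, !nonnull and !range go through their dedicated copy helpers, and the dereferenceability and alignment kinds transfer only if the new type is still a pointer. A hedged usage sketch; the bitcast-based re-load below is illustrative and copies the original alignment explicitly.

    #include "llvm/IR/IRBuilder.h"
    #include "llvm/IR/Instructions.h"
    #include "llvm/Support/Alignment.h"
    #include "llvm/Transforms/Utils/Local.h"

    using namespace llvm;

    static LoadInst *reloadAtType(LoadInst &OldLoad, Type *NewTy, IRBuilder<> &B) {
      Value *Ptr = B.CreateBitCast(
          OldLoad.getPointerOperand(),
          NewTy->getPointerTo(OldLoad.getPointerAddressSpace()));
      LoadInst *NewLoad = B.CreateLoad(NewTy, Ptr, OldLoad.getName() + ".cast");
      NewLoad->setAlignment(MaybeAlign(OldLoad.getAlignment()));
      copyMetadataForLoad(*NewLoad, OldLoad);   // type-aware metadata transfer
      return NewLoad;
    }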
0 : 1; - BasicBlock *Header = Jmp->getSuccessor(0); - assert(Header == L->getHeader() && "expected a backward branch"); - - // Remove Latch from the CFG so that LastExit becomes the new Latch. - BI->setSuccessor(FallThruPath, Header); - Latch->replaceSuccessorsPhiUsesWith(LastExit); - Jmp->eraseFromParent(); - - // Nuke the Latch block. - assert(Latch->empty() && "unable to evacuate Latch"); - LI->removeBlock(Latch); - if (DT) - DT->eraseNode(Latch); - Latch->eraseFromParent(); + DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager); + MergeBlockIntoPredecessor(Latch, &DTU, LI, MSSAU, nullptr, + /*PredecessorWithTwoSuccessors=*/true); if (MSSAU && VerifyMemorySSA) MSSAU->getMemorySSA()->verifyMemorySSA(); diff --git a/lib/Transforms/Utils/LoopSimplify.cpp b/lib/Transforms/Utils/LoopSimplify.cpp index 7e6da02d5707..d0f89dc54bfb 100644 --- a/lib/Transforms/Utils/LoopSimplify.cpp +++ b/lib/Transforms/Utils/LoopSimplify.cpp @@ -808,7 +808,7 @@ bool LoopSimplify::runOnFunction(Function &F) { auto *MSSAAnalysis = getAnalysisIfAvailable(); if (MSSAAnalysis) { MSSA = &MSSAAnalysis->getMSSA(); - MSSAU = make_unique(MSSA); + MSSAU = std::make_unique(MSSA); } } @@ -835,12 +835,19 @@ PreservedAnalyses LoopSimplifyPass::run(Function &F, DominatorTree *DT = &AM.getResult(F); ScalarEvolution *SE = AM.getCachedResult(F); AssumptionCache *AC = &AM.getResult(F); + auto *MSSAAnalysis = AM.getCachedResult(F); + std::unique_ptr MSSAU; + if (MSSAAnalysis) { + auto *MSSA = &MSSAAnalysis->getMSSA(); + MSSAU = std::make_unique(MSSA); + } + // Note that we don't preserve LCSSA in the new PM, if you need it run LCSSA - // after simplifying the loops. MemorySSA is not preserved either. + // after simplifying the loops. MemorySSA is preserved if it exists. for (LoopInfo::iterator I = LI->begin(), E = LI->end(); I != E; ++I) Changed |= - simplifyLoop(*I, DT, LI, SE, AC, nullptr, /*PreserveLCSSA*/ false); + simplifyLoop(*I, DT, LI, SE, AC, MSSAU.get(), /*PreserveLCSSA*/ false); if (!Changed) return PreservedAnalyses::all(); @@ -853,6 +860,8 @@ PreservedAnalyses LoopSimplifyPass::run(Function &F, PA.preserve(); PA.preserve(); PA.preserve(); + if (MSSAAnalysis) + PA.preserve(); // BPI maps conditional terminators to probabilities, LoopSimplify can insert // blocks, but it does so only by splitting existing blocks and edges. This // results in the interesting property that all new terminators inserted are diff --git a/lib/Transforms/Utils/LoopUnroll.cpp b/lib/Transforms/Utils/LoopUnroll.cpp index e39ade523714..a7590fc32545 100644 --- a/lib/Transforms/Utils/LoopUnroll.cpp +++ b/lib/Transforms/Utils/LoopUnroll.cpp @@ -711,7 +711,7 @@ LoopUnrollResult llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI, auto setDest = [LoopExit, ContinueOnTrue](BasicBlock *Src, BasicBlock *Dest, ArrayRef NextBlocks, - BasicBlock *CurrentHeader, + BasicBlock *BlockInLoop, bool NeedConditional) { auto *Term = cast(Src->getTerminator()); if (NeedConditional) { @@ -723,7 +723,9 @@ LoopUnrollResult llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI, if (Dest != LoopExit) { BasicBlock *BB = Src; for (BasicBlock *Succ : successors(BB)) { - if (Succ == CurrentHeader) + // Preserve the incoming value from BB if we are jumping to the block + // in the current loop. + if (Succ == BlockInLoop) continue; for (PHINode &Phi : Succ->phis()) Phi.removeIncomingValue(BB, false); @@ -794,7 +796,7 @@ LoopUnrollResult llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI, // unconditional branch for some iterations. 
NeedConditional = false; - setDest(Headers[i], Dest, Headers, Headers[i], NeedConditional); + setDest(Headers[i], Dest, Headers, HeaderSucc[i], NeedConditional); } // Set up latches to branch to the new header in the unrolled iterations or @@ -868,7 +870,7 @@ LoopUnrollResult llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI, assert(!DT || !UnrollVerifyDomtree || DT->verify(DominatorTree::VerificationLevel::Fast)); - DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager); + DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy); // Merge adjacent basic blocks, if possible. for (BasicBlock *Latch : Latches) { BranchInst *Term = dyn_cast(Latch->getTerminator()); @@ -888,6 +890,8 @@ LoopUnrollResult llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI, } } } + // Apply updates to the DomTree. + DT = &DTU.getDomTree(); // At this point, the code is well formed. We now simplify the unrolled loop, // doing constant propagation and dead code elimination as we go. diff --git a/lib/Transforms/Utils/LoopUnrollAndJam.cpp b/lib/Transforms/Utils/LoopUnrollAndJam.cpp index ff49d83f25c5..bf2e87b0d49f 100644 --- a/lib/Transforms/Utils/LoopUnrollAndJam.cpp +++ b/lib/Transforms/Utils/LoopUnrollAndJam.cpp @@ -517,6 +517,7 @@ LoopUnrollResult llvm::UnrollAndJamLoop( movePHIs(AftBlocksFirst[It], AftBlocksFirst[0]); } + DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy); // Dominator Tree. Remove the old links between Fore, Sub and Aft, adding the // new ones required. if (Count != 1) { @@ -530,7 +531,7 @@ LoopUnrollResult llvm::UnrollAndJamLoop( ForeBlocksLast.back(), SubLoopBlocksFirst[0]); DTUpdates.emplace_back(DominatorTree::UpdateKind::Insert, SubLoopBlocksLast.back(), AftBlocksFirst[0]); - DT->applyUpdates(DTUpdates); + DTU.applyUpdatesPermissive(DTUpdates); } // Merge adjacent basic blocks, if possible. @@ -538,7 +539,6 @@ LoopUnrollResult llvm::UnrollAndJamLoop( MergeBlocks.insert(ForeBlocksLast.begin(), ForeBlocksLast.end()); MergeBlocks.insert(SubLoopBlocksLast.begin(), SubLoopBlocksLast.end()); MergeBlocks.insert(AftBlocksLast.begin(), AftBlocksLast.end()); - DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager); while (!MergeBlocks.empty()) { BasicBlock *BB = *MergeBlocks.begin(); BranchInst *Term = dyn_cast(BB->getTerminator()); @@ -555,6 +555,8 @@ LoopUnrollResult llvm::UnrollAndJamLoop( } else MergeBlocks.erase(BB); } + // Apply updates to the DomTree. + DT = &DTU.getDomTree(); // At this point, the code is well formed. We now do a quick sweep over the // inserted code, doing constant propagation and dead code elimination as we diff --git a/lib/Transforms/Utils/LoopUnrollPeel.cpp b/lib/Transforms/Utils/LoopUnrollPeel.cpp index 005306cf1898..58e42074f963 100644 --- a/lib/Transforms/Utils/LoopUnrollPeel.cpp +++ b/lib/Transforms/Utils/LoopUnrollPeel.cpp @@ -62,9 +62,11 @@ static cl::opt UnrollForcePeelCount( cl::desc("Force a peel count regardless of profiling information.")); static cl::opt UnrollPeelMultiDeoptExit( - "unroll-peel-multi-deopt-exit", cl::init(false), cl::Hidden, + "unroll-peel-multi-deopt-exit", cl::init(true), cl::Hidden, cl::desc("Allow peeling of loops with multiple deopt exits.")); +static const char *PeeledCountMetaData = "llvm.loop.peeled.count"; + // Designates that a Phi is estimated to become invariant after an "infinite" // number of loop iterations (i.e. only may become an invariant if the loop is // fully unrolled). 
@@ -275,6 +277,7 @@ void llvm::computePeelCount(Loop *L, unsigned LoopSize, LLVM_DEBUG(dbgs() << "Force-peeling first " << UnrollForcePeelCount << " iterations.\n"); UP.PeelCount = UnrollForcePeelCount; + UP.PeelProfiledIterations = true; return; }
@@ -282,6 +285,13 @@ void llvm::computePeelCount(Loop *L, unsigned LoopSize, if (!UP.AllowPeeling) return; + unsigned AlreadyPeeled = 0; + if (auto Peeled = getOptionalIntLoopAttribute(L, PeeledCountMetaData)) + AlreadyPeeled = *Peeled; + // Stop if we already peeled off the maximum number of iterations. + if (AlreadyPeeled >= UnrollPeelMaxCount) + return; + // Here we try to get rid of Phis which become invariants after 1, 2, ..., N // iterations of the loop. For this we compute the number of iterations after // which every Phi is guaranteed to become an invariant, and try to peel the
@@ -317,11 +327,14 @@ void llvm::computePeelCount(Loop *L, unsigned LoopSize, DesiredPeelCount = std::min(DesiredPeelCount, MaxPeelCount); // Consider max peel count limitation. assert(DesiredPeelCount > 0 && "Wrong loop size estimation?"); - LLVM_DEBUG(dbgs() << "Peel " << DesiredPeelCount - << " iteration(s) to turn" - << " some Phis into invariants.\n"); - UP.PeelCount = DesiredPeelCount; - return; + if (DesiredPeelCount + AlreadyPeeled <= UnrollPeelMaxCount) { + LLVM_DEBUG(dbgs() << "Peel " << DesiredPeelCount + << " iteration(s) to turn" + << " some Phis into invariants.\n"); + UP.PeelCount = DesiredPeelCount; + UP.PeelProfiledIterations = false; + return; + } } }
@@ -330,6 +343,9 @@ void llvm::computePeelCount(Loop *L, unsigned LoopSize, if (TripCount) return; + // Do not apply profile-based peeling if it is disabled. + if (!UP.PeelProfiledIterations) + return; // If we don't know the trip count, but have reason to believe the average // trip count is low, peeling should be beneficial, since we will usually // hit the peeled section.
@@ -344,7 +360,7 @@ void llvm::computePeelCount(Loop *L, unsigned LoopSize, << "\n"); if (*PeelCount) { - if ((*PeelCount <= UnrollPeelMaxCount) && + if ((*PeelCount + AlreadyPeeled <= UnrollPeelMaxCount) && (LoopSize * (*PeelCount + 1) <= UP.Threshold)) { LLVM_DEBUG(dbgs() << "Peeling first " << *PeelCount << " iterations.\n");
@@ -352,6 +368,7 @@ void llvm::computePeelCount(Loop *L, unsigned LoopSize, return; } LLVM_DEBUG(dbgs() << "Requested peel count: " << *PeelCount << "\n"); + LLVM_DEBUG(dbgs() << "Already peel count: " << AlreadyPeeled << "\n"); LLVM_DEBUG(dbgs() << "Max peel count: " << UnrollPeelMaxCount << "\n"); LLVM_DEBUG(dbgs() << "Peel cost: " << LoopSize * (*PeelCount + 1) << "\n");
@@ -364,88 +381,77 @@ void llvm::computePeelCount(Loop *L, unsigned LoopSize, /// iteration. /// This sets the branch weights for the latch of the recently peeled off loop /// iteration correctly. -/// Our goal is to make sure that: -/// a) The total weight of all the copies of the loop body is preserved. -/// b) The total weight of the loop exit is preserved. -/// c) The body weight is reasonably distributed between the peeled iterations. +/// Let F be the weight of the edge from the latch to the header. +/// Let E be the weight of the edge from the latch to the exit. +/// F/(F+E) is the probability of staying in the loop and E/(F+E) is the +/// probability of exiting. +/// Then the estimated TripCount = F / E. +/// For the I-th (counting from 0) peeled-off iteration we set the weights for +/// the peeled latch to (TC - I, 1). This gives a reasonable distribution: +/// the probability of exiting, 1/(TC - I), increases.
At the same time, +/// the estimated trip count of the remaining loop decreases by I. +/// To avoid dealing with division rounding, we can simply multiply both parts +/// of the weights by E and use the weights (F - I * E, E). /// /// \param Header The copy of the header block that belongs to the next iteration. /// \param LatchBR The copy of the latch branch that belongs to this iteration. -/// \param IterNumber The serial number of the iteration that was just -/// peeled off. -/// \param AvgIters The average number of iterations we expect the loop to have. -/// \param[in,out] PeeledHeaderWeight The total number of dynamic loop -/// iterations that are unaccounted for. As an input, it represents the number -/// of times we expect to enter the header of the iteration currently being -/// peeled off. The output is the number of times we expect to enter the -/// header of the next iteration. +/// \param[in,out] FallThroughWeight The weight of the edge from latch to +/// header before peeling (in) and after peeling off one iteration (out). static void updateBranchWeights(BasicBlock *Header, BranchInst *LatchBR, - unsigned IterNumber, unsigned AvgIters, - uint64_t &PeeledHeaderWeight) { - if (!PeeledHeaderWeight) + uint64_t ExitWeight, + uint64_t &FallThroughWeight) { + // A FallThroughWeight of 0 means that there are no branch weights on the + // original latch block or the estimated trip count is zero. + if (!FallThroughWeight) return; - // FIXME: Pick a more realistic distribution. - // Currently the proportion of weight we assign to the fall-through - // side of the branch drops linearly with the iteration number, and we use - // a 0.9 fudge factor to make the drop-off less sharp... - uint64_t FallThruWeight = - PeeledHeaderWeight * ((float)(AvgIters - IterNumber) / AvgIters * 0.9); - uint64_t ExitWeight = PeeledHeaderWeight - FallThruWeight; - PeeledHeaderWeight -= ExitWeight; unsigned HeaderIdx = (LatchBR->getSuccessor(0) == Header ? 0 : 1); MDBuilder MDB(LatchBR->getContext()); MDNode *WeightNode = - HeaderIdx ? MDB.createBranchWeights(ExitWeight, FallThruWeight) - : MDB.createBranchWeights(FallThruWeight, ExitWeight) + HeaderIdx ? MDB.createBranchWeights(ExitWeight, FallThroughWeight) + : MDB.createBranchWeights(FallThroughWeight, ExitWeight); LatchBR->setMetadata(LLVMContext::MD_prof, WeightNode); + FallThroughWeight = + FallThroughWeight > ExitWeight ? FallThroughWeight - ExitWeight : 1; } /// Initialize the weights. /// /// \param Header The header block. /// \param LatchBR The latch branch. -/// \param AvgIters The average number of iterations we expect the loop to have. -/// \param[out] ExitWeight The # of times the edge from Latch to Exit is taken. -/// \param[out] CurHeaderWeight The # of times the header is executed. +/// \param[out] ExitWeight The weight of the edge from Latch to Exit. +/// \param[out] FallThroughWeight The weight of the edge from Latch to Header. static void initBranchWeights(BasicBlock *Header, BranchInst *LatchBR, - unsigned AvgIters, uint64_t &ExitWeight, - uint64_t &CurHeaderWeight) { + uint64_t &ExitWeight, + uint64_t &FallThroughWeight) { uint64_t TrueWeight, FalseWeight; if (!LatchBR->extractProfMetadata(TrueWeight, FalseWeight)) return; unsigned HeaderIdx = LatchBR->getSuccessor(0) == Header ? 0 : 1; ExitWeight = HeaderIdx ? TrueWeight : FalseWeight; - // The # of times the loop body executes is the sum of the exit block - // is taken and the # of times the backedges are taken. - CurHeaderWeight = TrueWeight + FalseWeight; + FallThroughWeight = HeaderIdx ?
FalseWeight : TrueWeight; } /// Update the weights of the original Latch block after peeling off all iterations. /// /// \param Header The header block. /// \param LatchBR The latch branch. -/// \param ExitWeight The weight of the edge from Latch to Exit block. -/// \param CurHeaderWeight The # of time the header is executed. +/// \param ExitWeight The weight of the edge from Latch to Exit. +/// \param FallThroughWeight The weight of the edge from Latch to Header. static void fixupBranchWeights(BasicBlock *Header, BranchInst *LatchBR, - uint64_t ExitWeight, uint64_t CurHeaderWeight) { - // Adjust the branch weights on the loop exit. - if (!ExitWeight) + uint64_t ExitWeight, + uint64_t FallThroughWeight) { + // A FallThroughWeight of 0 means that there are no branch weights on the + // original latch block or the estimated trip count is zero. + if (!FallThroughWeight) return; - // The backedge count is the difference of current header weight and - // current loop exit weight. If the current header weight is smaller than - // the current loop exit weight, we mark the loop backedge weight as 1. - uint64_t BackEdgeWeight = 0; - if (ExitWeight < CurHeaderWeight) - BackEdgeWeight = CurHeaderWeight - ExitWeight; - else - BackEdgeWeight = 1; + // Set the branch weights on the loop exit. MDBuilder MDB(LatchBR->getContext()); unsigned HeaderIdx = LatchBR->getSuccessor(0) == Header ? 0 : 1; MDNode *WeightNode = - HeaderIdx ? MDB.createBranchWeights(ExitWeight, BackEdgeWeight) - : MDB.createBranchWeights(BackEdgeWeight, ExitWeight) + HeaderIdx ? MDB.createBranchWeights(ExitWeight, FallThroughWeight) + : MDB.createBranchWeights(FallThroughWeight, ExitWeight); LatchBR->setMetadata(LLVMContext::MD_prof, WeightNode); }
@@ -586,11 +592,30 @@ bool llvm::peelLoop(Loop *L, unsigned PeelCount, LoopInfo *LI, DenseMap ExitIDom; if (DT) { + // We'd like to determine the idom of the exit block after peeling one + // iteration. + // Let Exit be the exit block. + // Let ExitingSet be the set of predecessors of Exit; these are the exiting + // blocks. + // Let Latch' and ExitingSet' be their copies after peeling. + // We'd like to find idom'(Exit), the idom of Exit after peeling. + // It is evident that idom'(Exit) will be the nearest common dominator + // of ExitingSet and ExitingSet'. + // idom(Exit) is the nearest common dominator of ExitingSet. + // idom(Exit)' is the nearest common dominator of ExitingSet'. + // Taking into account that we have a single Latch, Latch' will dominate + // Header and idom(Exit). + // So idom'(Exit) is the nearest common dominator of idom(Exit)' and Latch'. + // All these basic blocks are in the same loop, so what we find is + // (the nearest common dominator of idom(Exit) and Latch)'. + // In the loop below we remember the nearest common dominator of idom(Exit) + // and Latch to update the idom of Exit later. assert(L->hasDedicatedExits() && "No dedicated exits?"); for (auto Edge : ExitEdges) { if (ExitIDom.count(Edge.second)) continue; - BasicBlock *BB = DT->getNode(Edge.second)->getIDom()->getBlock(); + BasicBlock *BB = DT->findNearestCommonDominator( + DT->getNode(Edge.second)->getIDom()->getBlock(), Latch); assert(L->contains(BB) && "IDom is not in a loop"); ExitIDom[Edge.second] = BB; }
@@ -659,23 +684,14 @@ bool llvm::peelLoop(Loop *L, unsigned PeelCount, LoopInfo *LI, // newly created branches.
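// Illustrative example (editorial sketch, not part of the upstream patch):
// suppose the original latch has profile weights F = 100 to the header and
// E = 1 to the exit, i.e. an estimated trip count of F / E = 100. With the
// scheme described above, the first peeled latch copy gets weights (100, 1);
// each call to updateBranchWeights then subtracts E from FallThroughWeight,
// so the second copy gets (99, 1), the third (98, 1), and after peeling
// PeelCount iterations fixupBranchWeights leaves the remaining latch with
// (100 - PeelCount, 1).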
BranchInst *LatchBR = cast(cast(Latch)->getTerminator()); - uint64_t ExitWeight = 0, CurHeaderWeight = 0; - initBranchWeights(Header, LatchBR, PeelCount, ExitWeight, CurHeaderWeight); + uint64_t ExitWeight = 0, FallThroughWeight = 0; + initBranchWeights(Header, LatchBR, ExitWeight, FallThroughWeight); // For each peeled-off iteration, make a copy of the loop. for (unsigned Iter = 0; Iter < PeelCount; ++Iter) { SmallVector NewBlocks; ValueToValueMapTy VMap; - // Subtract the exit weight from the current header weight -- the exit - // weight is exactly the weight of the previous iteration's header. - // FIXME: due to the way the distribution is constructed, we need a - // guard here to make sure we don't end up with non-positive weights. - if (ExitWeight < CurHeaderWeight) - CurHeaderWeight -= ExitWeight; - else - CurHeaderWeight = 1; - cloneLoopBlocks(L, Iter, InsertTop, InsertBot, ExitEdges, NewBlocks, LoopBlocks, VMap, LVMap, DT, LI); @@ -697,8 +713,7 @@ bool llvm::peelLoop(Loop *L, unsigned PeelCount, LoopInfo *LI, } auto *LatchBRCopy = cast(VMap[LatchBR]); - updateBranchWeights(InsertBot, LatchBRCopy, Iter, - PeelCount, ExitWeight); + updateBranchWeights(InsertBot, LatchBRCopy, ExitWeight, FallThroughWeight); // Remove Loop metadata from the latch branch instruction // because it is not the Loop's latch branch anymore. LatchBRCopy->setMetadata(LLVMContext::MD_loop, nullptr); @@ -724,7 +739,13 @@ bool llvm::peelLoop(Loop *L, unsigned PeelCount, LoopInfo *LI, PHI->setIncomingValueForBlock(NewPreHeader, NewVal); } - fixupBranchWeights(Header, LatchBR, ExitWeight, CurHeaderWeight); + fixupBranchWeights(Header, LatchBR, ExitWeight, FallThroughWeight); + + // Update Metadata for count of peeled off iterations. + unsigned AlreadyPeeled = 0; + if (auto Peeled = getOptionalIntLoopAttribute(L, PeeledCountMetaData)) + AlreadyPeeled = *Peeled; + addStringMetadataToLoop(L, PeeledCountMetaData, AlreadyPeeled + PeelCount); if (Loop *ParentLoop = L->getParentLoop()) L = ParentLoop; diff --git a/lib/Transforms/Utils/LoopUtils.cpp b/lib/Transforms/Utils/LoopUtils.cpp index ec226e65f650..b4d7f35d2d9a 100644 --- a/lib/Transforms/Utils/LoopUtils.cpp +++ b/lib/Transforms/Utils/LoopUtils.cpp @@ -19,6 +19,7 @@ #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopPass.h" +#include "llvm/Analysis/MemorySSA.h" #include "llvm/Analysis/MemorySSAUpdater.h" #include "llvm/Analysis/MustExecute.h" #include "llvm/Analysis/ScalarEvolution.h" @@ -45,6 +46,7 @@ using namespace llvm::PatternMatch; #define DEBUG_TYPE "loop-utils" static const char *LLVMLoopDisableNonforced = "llvm.loop.disable_nonforced"; +static const char *LLVMLoopDisableLICM = "llvm.licm.disable"; bool llvm::formDedicatedExitBlocks(Loop *L, DominatorTree *DT, LoopInfo *LI, MemorySSAUpdater *MSSAU, @@ -169,6 +171,8 @@ void llvm::getLoopAnalysisUsage(AnalysisUsage &AU) { AU.addPreserved(); AU.addRequired(); AU.addPreserved(); + // FIXME: When all loop passes preserve MemorySSA, it can be required and + // preserved here instead of the individual handling in each pass. } /// Manually defined generic "LoopPass" dependency initialization. This is used @@ -189,6 +193,54 @@ void llvm::initializeLoopPassPass(PassRegistry &Registry) { INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) INITIALIZE_PASS_DEPENDENCY(SCEVAAWrapperPass) INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) + INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass) +} + +/// Create MDNode for input string. 
+static MDNode *createStringMetadata(Loop *TheLoop, StringRef Name, unsigned V) { + LLVMContext &Context = TheLoop->getHeader()->getContext(); + Metadata *MDs[] = { + MDString::get(Context, Name), + ConstantAsMetadata::get(ConstantInt::get(Type::getInt32Ty(Context), V))}; + return MDNode::get(Context, MDs); +} + +/// Set input string into loop metadata by keeping other values intact. +/// If the string is already in loop metadata update value if it is +/// different. +void llvm::addStringMetadataToLoop(Loop *TheLoop, const char *StringMD, + unsigned V) { + SmallVector MDs(1); + // If the loop already has metadata, retain it. + MDNode *LoopID = TheLoop->getLoopID(); + if (LoopID) { + for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { + MDNode *Node = cast(LoopID->getOperand(i)); + // If it is of form key = value, try to parse it. + if (Node->getNumOperands() == 2) { + MDString *S = dyn_cast(Node->getOperand(0)); + if (S && S->getString().equals(StringMD)) { + ConstantInt *IntMD = + mdconst::extract_or_null(Node->getOperand(1)); + if (IntMD && IntMD->getSExtValue() == V) + // It is already in place. Do nothing. + return; + // We need to update the value, so just skip it here and it will + // be added after copying other existed nodes. + continue; + } + } + MDs.push_back(Node); + } + } + // Add new metadata. + MDs.push_back(createStringMetadata(TheLoop, StringMD, V)); + // Replace current metadata node with new one. + LLVMContext &Context = TheLoop->getHeader()->getContext(); + MDNode *NewLoopID = MDNode::get(Context, MDs); + // Set operand 0 to refer to the loop id itself. + NewLoopID->replaceOperandWith(0, NewLoopID); + TheLoop->setLoopID(NewLoopID); } /// Find string metadata for loop @@ -332,6 +384,10 @@ bool llvm::hasDisableAllTransformsHint(const Loop *L) { return getBooleanLoopAttribute(L, LLVMLoopDisableNonforced); } +bool llvm::hasDisableLICMTransformsHint(const Loop *L) { + return getBooleanLoopAttribute(L, LLVMLoopDisableLICM); +} + TransformationMode llvm::hasUnrollTransformation(Loop *L) { if (getBooleanLoopAttribute(L, "llvm.loop.unroll.disable")) return TM_SuppressedByUser; diff --git a/lib/Transforms/Utils/LoopVersioning.cpp b/lib/Transforms/Utils/LoopVersioning.cpp index a9a480a4b7f9..5d7759056c7d 100644 --- a/lib/Transforms/Utils/LoopVersioning.cpp +++ b/lib/Transforms/Utils/LoopVersioning.cpp @@ -92,8 +92,8 @@ void LoopVersioning::versionLoop( // Create empty preheader for the loop (and after cloning for the // non-versioned loop). BasicBlock *PH = - SplitBlock(RuntimeCheckBB, RuntimeCheckBB->getTerminator(), DT, LI); - PH->setName(VersionedLoop->getHeader()->getName() + ".ph"); + SplitBlock(RuntimeCheckBB, RuntimeCheckBB->getTerminator(), DT, LI, + nullptr, VersionedLoop->getHeader()->getName() + ".ph"); // Clone the loop including the preheader. // diff --git a/lib/Transforms/Utils/MetaRenamer.cpp b/lib/Transforms/Utils/MetaRenamer.cpp index c0b7edc547fd..60bb2775a194 100644 --- a/lib/Transforms/Utils/MetaRenamer.cpp +++ b/lib/Transforms/Utils/MetaRenamer.cpp @@ -121,15 +121,14 @@ namespace { } // Rename all functions - const TargetLibraryInfo &TLI = - getAnalysis().getTLI(); for (auto &F : M) { StringRef Name = F.getName(); LibFunc Tmp; // Leave library functions alone because their presence or absence could // affect the behavior of other passes. if (Name.startswith("llvm.") || (!Name.empty() && Name[0] == 1) || - TLI.getLibFunc(F, Tmp)) + getAnalysis().getTLI(F).getLibFunc( + F, Tmp)) continue; // Leave @main alone. 
The output of -metarenamer might be passed to 
diff --git a/lib/Transforms/Utils/MisExpect.cpp b/lib/Transforms/Utils/MisExpect.cpp new file mode 100644 index 000000000000..26d3402bd279 --- /dev/null +++ b/lib/Transforms/Utils/MisExpect.cpp
@@ -0,0 +1,177 @@ +//===--- MisExpect.cpp - Check the use of llvm.expect with PGO data -------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This contains code to emit warnings for potentially incorrect usage of the +// llvm.expect intrinsic. This utility extracts the threshold values from +// metadata associated with the instrumented Branch or Switch instruction. The +// threshold values are then used to determine if a warning should be emitted. +// +// MisExpect metadata is generated when llvm.expect intrinsics are lowered; see +// LowerExpectIntrinsic.cpp. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Utils/MisExpect.h" +#include "llvm/ADT/Twine.h" +#include "llvm/Analysis/OptimizationRemarkEmitter.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DiagnosticInfo.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/Support/BranchProbability.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/FormatVariadic.h" +#include +#include +#include + +#define DEBUG_TYPE "misexpect" + +using namespace llvm; +using namespace misexpect; + +namespace llvm { + +// Command line option to enable/disable the warning when profile data suggests +// a mismatch with the use of the llvm.expect intrinsic. +static cl::opt PGOWarnMisExpect( + "pgo-warn-misexpect", cl::init(false), cl::Hidden, + cl::desc("Use this option to turn on/off " + "warnings about incorrect usage of llvm.expect intrinsics.")); + +} // namespace llvm + +namespace { + +Instruction *getOprndOrInst(Instruction *I) { + assert(I != nullptr && "MisExpect target Instruction cannot be nullptr"); + Instruction *Ret = nullptr; + if (auto *B = dyn_cast(I)) { + Ret = dyn_cast(B->getCondition()); + } + // TODO: Find a way to resolve condition location for switches + // Using the condition of the switch seems to often resolve to an earlier + // point in the program, i.e. the calculation of the switch condition, rather + // than the switch's location in the source code. Thus, we should use the + // instruction to get source code locations rather than the condition to + // improve diagnostic output, such as the caret. If the same problem exists + // for branch instructions, then we should remove this function and directly + // use the instruction. + // + // else if (auto S = dyn_cast(I)) { + // Ret = I; + //} + return Ret ?
Ret : I; +} + +void emitMisexpectDiagnostic(Instruction *I, LLVMContext &Ctx, + uint64_t ProfCount, uint64_t TotalCount) { + double PercentageCorrect = (double)ProfCount / TotalCount; + auto PerString = + formatv("{0:P} ({1} / {2})", PercentageCorrect, ProfCount, TotalCount); + auto RemStr = formatv( + "Potential performance regression from use of the llvm.expect intrinsic: " + "Annotation was correct on {0} of profiled executions.", + PerString); + Twine Msg(PerString); + Instruction *Cond = getOprndOrInst(I); + if (PGOWarnMisExpect) + Ctx.diagnose(DiagnosticInfoMisExpect(Cond, Msg)); + OptimizationRemarkEmitter ORE(I->getParent()->getParent()); + ORE.emit(OptimizationRemark(DEBUG_TYPE, "misexpect", Cond) << RemStr.str()); +} + +} // namespace + +namespace llvm { +namespace misexpect { + +void verifyMisExpect(Instruction *I, const SmallVector &Weights, + LLVMContext &Ctx) { + if (auto *MisExpectData = I->getMetadata(LLVMContext::MD_misexpect)) { + auto *MisExpectDataName = dyn_cast(MisExpectData->getOperand(0)); + if (MisExpectDataName && + MisExpectDataName->getString().equals("misexpect")) { + LLVM_DEBUG(llvm::dbgs() << "------------------\n"); + LLVM_DEBUG(llvm::dbgs() + << "Function: " << I->getFunction()->getName() << "\n"); + LLVM_DEBUG(llvm::dbgs() << "Instruction: " << *I << ":\n"); + LLVM_DEBUG(for (int Idx = 0, Size = Weights.size(); Idx < Size; ++Idx) { + llvm::dbgs() << "Weights[" << Idx << "] = " << Weights[Idx] << "\n"; + }); + + // extract values from misexpect metadata + const auto *IndexCint = + mdconst::dyn_extract(MisExpectData->getOperand(1)); + const auto *LikelyCInt = + mdconst::dyn_extract(MisExpectData->getOperand(2)); + const auto *UnlikelyCInt = + mdconst::dyn_extract(MisExpectData->getOperand(3)); + + if (!IndexCint || !LikelyCInt || !UnlikelyCInt) + return; + + const uint64_t Index = IndexCint->getZExtValue(); + const uint64_t LikelyBranchWeight = LikelyCInt->getZExtValue(); + const uint64_t UnlikelyBranchWeight = UnlikelyCInt->getZExtValue(); + const uint64_t ProfileCount = Weights[Index]; + const uint64_t CaseTotal = std::accumulate( + Weights.begin(), Weights.end(), (uint64_t)0, std::plus()); + const uint64_t NumUnlikelyTargets = Weights.size() - 1; + + const uint64_t TotalBranchWeight = + LikelyBranchWeight + (UnlikelyBranchWeight * NumUnlikelyTargets); + + const llvm::BranchProbability LikelyThreshold(LikelyBranchWeight, + TotalBranchWeight); + uint64_t ScaledThreshold = LikelyThreshold.scale(CaseTotal); + + LLVM_DEBUG(llvm::dbgs() + << "Unlikely Targets: " << NumUnlikelyTargets << ":\n"); + LLVM_DEBUG(llvm::dbgs() << "Profile Count: " << ProfileCount << ":\n"); + LLVM_DEBUG(llvm::dbgs() + << "Scaled Threshold: " << ScaledThreshold << ":\n"); + LLVM_DEBUG(llvm::dbgs() << "------------------\n"); + if (ProfileCount < ScaledThreshold) + emitMisexpectDiagnostic(I, Ctx, ProfileCount, CaseTotal); + } + } +} + +void checkFrontendInstrumentation(Instruction &I) { + if (auto *MD = I.getMetadata(LLVMContext::MD_prof)) { + unsigned NOps = MD->getNumOperands(); + + // Only emit misexpect diagnostics if at least 2 branch weights are present. + // Less than 2 branch weights means that the profiling metadata is: + // 1) incorrect/corrupted + // 2) not branch weight metadata + // 3) completely deterministic + // In these cases we should not emit any diagnostic related to misexpect. 
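// Illustrative example (editorial sketch, not part of the upstream patch):
// a branch annotated by llvm.expect lowering typically carries metadata like
//   !prof !{!"branch_weights", i32 2000, i32 1}
// That node has NOps == 3 (the "branch_weights" tag plus two weights), so it
// passes the check below, and RealWeights becomes {2000, 1} before being
// handed to verifyMisExpect() along with the instruction's context.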
+ if (NOps < 3) + return; + + // Operand 0 is a string tag "branch_weights" + if (MDString *Tag = cast(MD->getOperand(0))) { + if (Tag->getString().equals("branch_weights")) { + SmallVector RealWeights(NOps - 1); + for (unsigned i = 1; i < NOps; i++) { + ConstantInt *Value = + mdconst::dyn_extract(MD->getOperand(i)); + RealWeights[i - 1] = Value->getZExtValue(); + } + verifyMisExpect(&I, RealWeights, I.getContext()); + } + } + } +} + +} // namespace misexpect +} // namespace llvm +#undef DEBUG_TYPE diff --git a/lib/Transforms/Utils/ModuleUtils.cpp b/lib/Transforms/Utils/ModuleUtils.cpp index c84beceee191..1ef3757017a8 100644 --- a/lib/Transforms/Utils/ModuleUtils.cpp +++ b/lib/Transforms/Utils/ModuleUtils.cpp @@ -73,7 +73,7 @@ static void appendToUsedList(Module &M, StringRef Name, ArrayRef SmallPtrSet InitAsSet; SmallVector Init; if (GV) { - ConstantArray *CA = dyn_cast(GV->getInitializer()); + auto *CA = cast(GV->getInitializer()); for (auto &Op : CA->operands()) { Constant *C = cast_or_null(Op); if (InitAsSet.insert(C).second) diff --git a/lib/Transforms/Utils/PredicateInfo.cpp b/lib/Transforms/Utils/PredicateInfo.cpp index bdf24d80bd17..44859eafb9c1 100644 --- a/lib/Transforms/Utils/PredicateInfo.cpp +++ b/lib/Transforms/Utils/PredicateInfo.cpp @@ -125,8 +125,10 @@ static bool valueComesBefore(OrderedInstructions &OI, const Value *A, // necessary to compare uses/defs in the same block. Doing so allows us to walk // the minimum number of instructions necessary to compute our def/use ordering. struct ValueDFS_Compare { + DominatorTree &DT; OrderedInstructions &OI; - ValueDFS_Compare(OrderedInstructions &OI) : OI(OI) {} + ValueDFS_Compare(DominatorTree &DT, OrderedInstructions &OI) + : DT(DT), OI(OI) {} bool operator()(const ValueDFS &A, const ValueDFS &B) const { if (&A == &B) @@ -136,7 +138,9 @@ struct ValueDFS_Compare { // comesbefore to see what the real ordering is, because they are in the // same basic block. - bool SameBlock = std::tie(A.DFSIn, A.DFSOut) == std::tie(B.DFSIn, B.DFSOut); + assert((A.DFSIn != B.DFSIn || A.DFSOut == B.DFSOut) && + "Equal DFS-in numbers imply equal out numbers"); + bool SameBlock = A.DFSIn == B.DFSIn; // We want to put the def that will get used for a given set of phi uses, // before those phi uses. @@ -145,9 +149,11 @@ struct ValueDFS_Compare { if (SameBlock && A.LocalNum == LN_Last && B.LocalNum == LN_Last) return comparePHIRelated(A, B); + bool isADef = A.Def; + bool isBDef = B.Def; if (!SameBlock || A.LocalNum != LN_Middle || B.LocalNum != LN_Middle) - return std::tie(A.DFSIn, A.DFSOut, A.LocalNum, A.Def, A.U) < - std::tie(B.DFSIn, B.DFSOut, B.LocalNum, B.Def, B.U); + return std::tie(A.DFSIn, A.LocalNum, isADef) < + std::tie(B.DFSIn, B.LocalNum, isBDef); return localComesBefore(A, B); } @@ -164,10 +170,35 @@ struct ValueDFS_Compare { // For two phi related values, return the ordering. bool comparePHIRelated(const ValueDFS &A, const ValueDFS &B) const { - auto &ABlockEdge = getBlockEdge(A); - auto &BBlockEdge = getBlockEdge(B); - // Now sort by block edge and then defs before uses. - return std::tie(ABlockEdge, A.Def, A.U) < std::tie(BBlockEdge, B.Def, B.U); + BasicBlock *ASrc, *ADest, *BSrc, *BDest; + std::tie(ASrc, ADest) = getBlockEdge(A); + std::tie(BSrc, BDest) = getBlockEdge(B); + +#ifndef NDEBUG + // This function should only be used for values in the same BB, check that. 
+ DomTreeNode *DomASrc = DT.getNode(ASrc); + DomTreeNode *DomBSrc = DT.getNode(BSrc); + assert(DomASrc->getDFSNumIn() == (unsigned)A.DFSIn && + "DFS numbers for A should match the ones of the source block"); + assert(DomBSrc->getDFSNumIn() == (unsigned)B.DFSIn && + "DFS numbers for B should match the ones of the source block"); + assert(A.DFSIn == B.DFSIn && "Values must be in the same block"); +#endif + (void)ASrc; + (void)BSrc; + + // Use DFS numbers to compare destination blocks, to guarantee a + // deterministic order. + DomTreeNode *DomADest = DT.getNode(ADest); + DomTreeNode *DomBDest = DT.getNode(BDest); + unsigned AIn = DomADest->getDFSNumIn(); + unsigned BIn = DomBDest->getDFSNumIn(); + bool isADef = A.Def; + bool isBDef = B.Def; + assert((!A.Def || !A.U) && (!B.Def || !B.U) && + "Def and U cannot be set at the same time"); + // Now sort by edge destination and then defs before uses. + return std::tie(AIn, isADef) < std::tie(BIn, isBDef); } // Get the definition of an instruction that occurs in the middle of a block. @@ -306,10 +337,11 @@ void collectCmpOps(CmpInst *Comparison, SmallVectorImpl &CmpOperands) { } // Add Op, PB to the list of value infos for Op, and mark Op to be renamed. -void PredicateInfo::addInfoFor(SmallPtrSetImpl &OpsToRename, Value *Op, +void PredicateInfo::addInfoFor(SmallVectorImpl &OpsToRename, Value *Op, PredicateBase *PB) { - OpsToRename.insert(Op); auto &OperandInfo = getOrCreateValueInfo(Op); + if (OperandInfo.Infos.empty()) + OpsToRename.push_back(Op); AllInfos.push_back(PB); OperandInfo.Infos.push_back(PB); } @@ -317,7 +349,7 @@ void PredicateInfo::addInfoFor(SmallPtrSetImpl &OpsToRename, Value *Op, // Process an assume instruction and place relevant operations we want to rename // into OpsToRename. void PredicateInfo::processAssume(IntrinsicInst *II, BasicBlock *AssumeBB, - SmallPtrSetImpl &OpsToRename) { + SmallVectorImpl &OpsToRename) { // See if we have a comparison we support SmallVector CmpOperands; SmallVector ConditionsToProcess; @@ -357,7 +389,7 @@ void PredicateInfo::processAssume(IntrinsicInst *II, BasicBlock *AssumeBB, // Process a block terminating branch, and place relevant operations to be // renamed into OpsToRename. void PredicateInfo::processBranch(BranchInst *BI, BasicBlock *BranchBB, - SmallPtrSetImpl &OpsToRename) { + SmallVectorImpl &OpsToRename) { BasicBlock *FirstBB = BI->getSuccessor(0); BasicBlock *SecondBB = BI->getSuccessor(1); SmallVector SuccsToProcess; @@ -427,7 +459,7 @@ void PredicateInfo::processBranch(BranchInst *BI, BasicBlock *BranchBB, // Process a block terminating switch, and place relevant operations to be // renamed into OpsToRename. void PredicateInfo::processSwitch(SwitchInst *SI, BasicBlock *BranchBB, - SmallPtrSetImpl &OpsToRename) { + SmallVectorImpl &OpsToRename) { Value *Op = SI->getCondition(); if ((!isa(Op) && !isa(Op)) || Op->hasOneUse()) return; @@ -457,7 +489,7 @@ void PredicateInfo::buildPredicateInfo() { DT.updateDFSNumbers(); // Collect operands to rename from all conditional branch terminators, as well // as assume statements. 
- SmallPtrSet OpsToRename; + SmallVector OpsToRename; for (auto DTN : depth_first(DT.getRootNode())) { BasicBlock *BranchBB = DTN->getBlock(); if (auto *BI = dyn_cast(BranchBB->getTerminator())) { @@ -524,7 +556,7 @@ Value *PredicateInfo::materializeStack(unsigned int &Counter, if (isa(ValInfo)) { IRBuilder<> B(getBranchTerminator(ValInfo)); Function *IF = getCopyDeclaration(F.getParent(), Op->getType()); - if (empty(IF->users())) + if (IF->users().empty()) CreatedDeclarations.insert(IF); CallInst *PIC = B.CreateCall(IF, Op, Op->getName() + "." + Twine(Counter++)); @@ -536,7 +568,7 @@ Value *PredicateInfo::materializeStack(unsigned int &Counter, "Should not have gotten here without it being an assume"); IRBuilder<> B(PAssume->AssumeInst); Function *IF = getCopyDeclaration(F.getParent(), Op->getType()); - if (empty(IF->users())) + if (IF->users().empty()) CreatedDeclarations.insert(IF); CallInst *PIC = B.CreateCall(IF, Op); PredicateMap.insert({PIC, ValInfo}); @@ -565,14 +597,8 @@ Value *PredicateInfo::materializeStack(unsigned int &Counter, // // TODO: Use this algorithm to perform fast single-variable renaming in // promotememtoreg and memoryssa. -void PredicateInfo::renameUses(SmallPtrSetImpl &OpSet) { - // Sort OpsToRename since we are going to iterate it. - SmallVector OpsToRename(OpSet.begin(), OpSet.end()); - auto Comparator = [&](const Value *A, const Value *B) { - return valueComesBefore(OI, A, B); - }; - llvm::sort(OpsToRename, Comparator); - ValueDFS_Compare Compare(OI); +void PredicateInfo::renameUses(SmallVectorImpl &OpsToRename) { + ValueDFS_Compare Compare(DT, OI); // Compute liveness, and rename in O(uses) per Op. for (auto *Op : OpsToRename) { LLVM_DEBUG(dbgs() << "Visiting " << *Op << "\n"); @@ -772,7 +798,7 @@ static void replaceCreatedSSACopys(PredicateInfo &PredInfo, Function &F) { bool PredicateInfoPrinterLegacyPass::runOnFunction(Function &F) { auto &DT = getAnalysis().getDomTree(); auto &AC = getAnalysis().getAssumptionCache(F); - auto PredInfo = make_unique(F, DT, AC); + auto PredInfo = std::make_unique(F, DT, AC); PredInfo->print(dbgs()); if (VerifyPredicateInfo) PredInfo->verifyPredicateInfo(); @@ -786,7 +812,7 @@ PreservedAnalyses PredicateInfoPrinterPass::run(Function &F, auto &DT = AM.getResult(F); auto &AC = AM.getResult(F); OS << "PredicateInfo for function: " << F.getName() << "\n"; - auto PredInfo = make_unique(F, DT, AC); + auto PredInfo = std::make_unique(F, DT, AC); PredInfo->print(OS); replaceCreatedSSACopys(*PredInfo, F); @@ -845,7 +871,7 @@ PreservedAnalyses PredicateInfoVerifierPass::run(Function &F, FunctionAnalysisManager &AM) { auto &DT = AM.getResult(F); auto &AC = AM.getResult(F); - make_unique(F, DT, AC)->verifyPredicateInfo(); + std::make_unique(F, DT, AC)->verifyPredicateInfo(); return PreservedAnalyses::all(); } diff --git a/lib/Transforms/Utils/SimplifyCFG.cpp b/lib/Transforms/Utils/SimplifyCFG.cpp index 11651d040dc0..3a5e3293ed4f 100644 --- a/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/lib/Transforms/Utils/SimplifyCFG.cpp @@ -94,6 +94,12 @@ static cl::opt PHINodeFoldingThreshold( cl::desc( "Control the amount of phi node folding to perform (default = 2)")); +static cl::opt TwoEntryPHINodeFoldingThreshold( + "two-entry-phi-node-folding-threshold", cl::Hidden, cl::init(4), + cl::desc("Control the maximal total instruction cost that we are willing " + "to speculatively execute to fold a 2-entry PHI node into a " + "select (default = 4)")); + static cl::opt DupRet( "simplifycfg-dup-ret", cl::Hidden, cl::init(false), cl::desc("Duplicate return 
instructions into unconditional branches")); @@ -332,7 +338,7 @@ static unsigned ComputeSpeculationCost(const User *I, /// CostRemaining, false is returned and CostRemaining is undefined. static bool DominatesMergePoint(Value *V, BasicBlock *BB, SmallPtrSetImpl &AggressiveInsts, - unsigned &CostRemaining, + int &BudgetRemaining, const TargetTransformInfo &TTI, unsigned Depth = 0) { // It is possible to hit a zero-cost cycle (phi/gep instructions for example), @@ -375,7 +381,7 @@ static bool DominatesMergePoint(Value *V, BasicBlock *BB, if (!isSafeToSpeculativelyExecute(I)) return false; - unsigned Cost = ComputeSpeculationCost(I, TTI); + BudgetRemaining -= ComputeSpeculationCost(I, TTI); // Allow exactly one instruction to be speculated regardless of its cost // (as long as it is safe to do so). @@ -383,17 +389,14 @@ static bool DominatesMergePoint(Value *V, BasicBlock *BB, // or other expensive operation. The speculation of an expensive instruction // is expected to be undone in CodeGenPrepare if the speculation has not // enabled further IR optimizations. - if (Cost > CostRemaining && + if (BudgetRemaining < 0 && (!SpeculateOneExpensiveInst || !AggressiveInsts.empty() || Depth > 0)) return false; - // Avoid unsigned wrap. - CostRemaining = (Cost > CostRemaining) ? 0 : CostRemaining - Cost; - // Okay, we can only really hoist these out if their operands do // not take us over the cost threshold. for (User::op_iterator i = I->op_begin(), e = I->op_end(); i != e; ++i) - if (!DominatesMergePoint(*i, BB, AggressiveInsts, CostRemaining, TTI, + if (!DominatesMergePoint(*i, BB, AggressiveInsts, BudgetRemaining, TTI, Depth + 1)) return false; // Okay, it's safe to do this! Remember this instruction. @@ -629,8 +632,7 @@ private: /// vector. /// One "Extra" case is allowed to differ from the other. void gather(Value *V) { - Instruction *I = dyn_cast(V); - bool isEQ = (I->getOpcode() == Instruction::Or); + bool isEQ = (cast(V)->getOpcode() == Instruction::Or); // Keep a stack (SmallVector for efficiency) for depth-first traversal SmallVector DFT; @@ -1313,7 +1315,8 @@ static bool HoistThenElseCodeToIf(BranchInst *BI, LLVMContext::MD_dereferenceable, LLVMContext::MD_dereferenceable_or_null, LLVMContext::MD_mem_parallel_loop_access, - LLVMContext::MD_access_group}; + LLVMContext::MD_access_group, + LLVMContext::MD_preserve_access_index}; combineMetadata(I1, I2, KnownIDs, true); // I1 and I2 are being combined into a single instruction. Its debug @@ -1420,6 +1423,20 @@ HoistTerminator: return true; } +// Check lifetime markers. +static bool isLifeTimeMarker(const Instruction *I) { + if (auto II = dyn_cast(I)) { + switch (II->getIntrinsicID()) { + default: + break; + case Intrinsic::lifetime_start: + case Intrinsic::lifetime_end: + return true; + } + } + return false; +} + // All instructions in Insts belong to different blocks that all unconditionally // branch to a common successor. Analyze each instruction and return true if it // would be possible to sink them into their successor, creating one common @@ -1474,20 +1491,25 @@ static bool canSinkInstructions( return false; } - // Because SROA can't handle speculating stores of selects, try not - // to sink loads or stores of allocas when we'd have to create a PHI for - // the address operand. Also, because it is likely that loads or stores - // of allocas will disappear when Mem2Reg/SROA is run, don't sink them. 
+ // Because SROA can't handle speculating stores of selects, try not to sink + // loads, stores or lifetime markers of allocas when we'd have to create a + // PHI for the address operand. Also, because it is likely that loads or + // stores of allocas will disappear when Mem2Reg/SROA is run, don't sink + // them. // This can cause code churn which can have unintended consequences down // the line - see https://llvm.org/bugs/show_bug.cgi?id=30244. // FIXME: This is a workaround for a deficiency in SROA - see // https://llvm.org/bugs/show_bug.cgi?id=30188 if (isa(I0) && any_of(Insts, [](const Instruction *I) { - return isa(I->getOperand(1)); + return isa(I->getOperand(1)->stripPointerCasts()); })) return false; if (isa(I0) && any_of(Insts, [](const Instruction *I) { - return isa(I->getOperand(0)); + return isa(I->getOperand(0)->stripPointerCasts()); + })) + return false; + if (isLifeTimeMarker(I0) && any_of(Insts, [](const Instruction *I) { + return isa(I->getOperand(1)->stripPointerCasts()); })) return false; @@ -1959,7 +1981,7 @@ static bool SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *ThenBB, SmallVector SpeculatedDbgIntrinsics; - unsigned SpeculationCost = 0; + unsigned SpeculatedInstructions = 0; Value *SpeculatedStoreValue = nullptr; StoreInst *SpeculatedStore = nullptr; for (BasicBlock::iterator BBI = ThenBB->begin(), @@ -1974,8 +1996,8 @@ static bool SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *ThenBB, // Only speculatively execute a single instruction (not counting the // terminator) for now. - ++SpeculationCost; - if (SpeculationCost > 1) + ++SpeculatedInstructions; + if (SpeculatedInstructions > 1) return false; // Don't hoist the instruction if it's unsafe or expensive. @@ -2012,8 +2034,8 @@ static bool SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *ThenBB, E = SinkCandidateUseCounts.end(); I != E; ++I) if (I->first->hasNUses(I->second)) { - ++SpeculationCost; - if (SpeculationCost > 1) + ++SpeculatedInstructions; + if (SpeculatedInstructions > 1) return false; } @@ -2053,8 +2075,8 @@ static bool SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *ThenBB, // getting expanded into Instructions. // FIXME: This doesn't account for how many operations are combined in the // constant expression. - ++SpeculationCost; - if (SpeculationCost > 1) + ++SpeculatedInstructions; + if (SpeculatedInstructions > 1) return false; } @@ -2302,10 +2324,8 @@ static bool FoldTwoEntryPHINode(PHINode *PN, const TargetTransformInfo &TTI, // instructions. While we are at it, keep track of the instructions // that need to be moved to the dominating block. SmallPtrSet AggressiveInsts; - unsigned MaxCostVal0 = PHINodeFoldingThreshold, - MaxCostVal1 = PHINodeFoldingThreshold; - MaxCostVal0 *= TargetTransformInfo::TCC_Basic; - MaxCostVal1 *= TargetTransformInfo::TCC_Basic; + int BudgetRemaining = + TwoEntryPHINodeFoldingThreshold * TargetTransformInfo::TCC_Basic; for (BasicBlock::iterator II = BB->begin(); isa(II);) { PHINode *PN = cast(II++); @@ -2316,9 +2336,9 @@ static bool FoldTwoEntryPHINode(PHINode *PN, const TargetTransformInfo &TTI, } if (!DominatesMergePoint(PN->getIncomingValue(0), BB, AggressiveInsts, - MaxCostVal0, TTI) || + BudgetRemaining, TTI) || !DominatesMergePoint(PN->getIncomingValue(1), BB, AggressiveInsts, - MaxCostVal1, TTI)) + BudgetRemaining, TTI)) return false; } @@ -2328,12 +2348,24 @@ static bool FoldTwoEntryPHINode(PHINode *PN, const TargetTransformInfo &TTI, if (!PN) return true; - // Don't fold i1 branches on PHIs which contain binary operators. 
These can - // often be turned into switches and other things. + // Return true if at least one of these is a 'not', and another is either + // a 'not' too, or a constant. + auto CanHoistNotFromBothValues = [](Value *V0, Value *V1) { + if (!match(V0, m_Not(m_Value()))) + std::swap(V0, V1); + auto Invertible = m_CombineOr(m_Not(m_Value()), m_AnyIntegralConstant()); + return match(V0, m_Not(m_Value())) && match(V1, Invertible); + }; + + // Don't fold i1 branches on PHIs which contain binary operators, unless one + // of the incoming values is an 'not' and another one is freely invertible. + // These can often be turned into switches and other things. if (PN->getType()->isIntegerTy(1) && (isa(PN->getIncomingValue(0)) || isa(PN->getIncomingValue(1)) || - isa(IfCond))) + isa(IfCond)) && + !CanHoistNotFromBothValues(PN->getIncomingValue(0), + PN->getIncomingValue(1))) return false; // If all PHI nodes are promotable, check to make sure that all instructions @@ -2368,6 +2400,7 @@ static bool FoldTwoEntryPHINode(PHINode *PN, const TargetTransformInfo &TTI, return false; } } + assert(DomBlock && "Failed to find root DomBlock"); LLVM_DEBUG(dbgs() << "FOUND IF CONDITION! " << *IfCond << " T: " << IfTrue->getName() @@ -2913,42 +2946,8 @@ static bool mergeConditionalStoreToAddress(BasicBlock *PTB, BasicBlock *PFB, BasicBlock *QTB, BasicBlock *QFB, BasicBlock *PostBB, Value *Address, bool InvertPCond, bool InvertQCond, - const DataLayout &DL) { - auto IsaBitcastOfPointerType = [](const Instruction &I) { - return Operator::getOpcode(&I) == Instruction::BitCast && - I.getType()->isPointerTy(); - }; - - // If we're not in aggressive mode, we only optimize if we have some - // confidence that by optimizing we'll allow P and/or Q to be if-converted. - auto IsWorthwhile = [&](BasicBlock *BB) { - if (!BB) - return true; - // Heuristic: if the block can be if-converted/phi-folded and the - // instructions inside are all cheap (arithmetic/GEPs), it's worthwhile to - // thread this store. - unsigned N = 0; - for (auto &I : BB->instructionsWithoutDebug()) { - // Cheap instructions viable for folding. - if (isa(I) || isa(I) || - isa(I)) - ++N; - // Free instructions. - else if (I.isTerminator() || IsaBitcastOfPointerType(I)) - continue; - else - return false; - } - // The store we want to merge is counted in N, so add 1 to make sure - // we're counting the instructions that would be left. - return N <= (PHINodeFoldingThreshold + 1); - }; - - if (!MergeCondStoresAggressively && - (!IsWorthwhile(PTB) || !IsWorthwhile(PFB) || !IsWorthwhile(QTB) || - !IsWorthwhile(QFB))) - return false; - + const DataLayout &DL, + const TargetTransformInfo &TTI) { // For every pointer, there must be exactly two stores, one coming from // PTB or PFB, and the other from QTB or QFB. We don't support more than one // store (to any address) in PTB,PFB or QTB,QFB. @@ -2989,6 +2988,46 @@ static bool mergeConditionalStoreToAddress(BasicBlock *PTB, BasicBlock *PFB, if (&*I != PStore && I->mayReadOrWriteMemory()) return false; + // If we're not in aggressive mode, we only optimize if we have some + // confidence that by optimizing we'll allow P and/or Q to be if-converted. + auto IsWorthwhile = [&](BasicBlock *BB, ArrayRef FreeStores) { + if (!BB) + return true; + // Heuristic: if the block can be if-converted/phi-folded and the + // instructions inside are all cheap (arithmetic/GEPs), it's worthwhile to + // thread this store. 
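// Worked example (editorial sketch, not part of the upstream patch; it assumes
// the default PHINodeFoldingThreshold of 2 and TCC_Basic == 1): the starting
// budget below is 2 * TCC_Basic == 2. The terminator and the two stores being
// merged (PStore/QStore) are treated as free; every other allowed instruction
// (arithmetic or GEP) typically costs TCC_Basic, so a block containing the
// store plus two adds ends the loop with BudgetRemaining == 0 and is still
// considered worthwhile, while one additional non-free instruction drives the
// budget negative and the fold is refused.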
+ int BudgetRemaining = + PHINodeFoldingThreshold * TargetTransformInfo::TCC_Basic; + for (auto &I : BB->instructionsWithoutDebug()) { + // Consider the terminator instruction to be free. + if (I.isTerminator()) + continue; + // If this is one of the stores that we want to speculate out of this BB, + // then don't count its cost; consider it to be free. + if (auto *S = dyn_cast(&I)) + if (llvm::find(FreeStores, S) != FreeStores.end()) + continue; + // Else, we have a white-list of instructions that we are okay speculating. + if (!isa(I) && !isa(I)) + return false; // Not in white-list - not worthwhile folding. + // And finally, if this is a non-free instruction that we are okay + // speculating, ensure that we consider the speculation budget. + BudgetRemaining -= TTI.getUserCost(&I); + if (BudgetRemaining < 0) + return false; // Eagerly refuse to fold as soon as we're out of budget. + } + assert(BudgetRemaining >= 0 && + "When we run out of budget we will eagerly return from within the " + "per-instruction loop."); + return true; + }; + + const SmallVector FreeStores = {PStore, QStore}; + if (!MergeCondStoresAggressively && + (!IsWorthwhile(PTB, FreeStores) || !IsWorthwhile(PFB, FreeStores) || + !IsWorthwhile(QTB, FreeStores) || !IsWorthwhile(QFB, FreeStores))) + return false; + // If PostBB has more than two predecessors, we need to split it so we can // sink the store. if (std::next(pred_begin(PostBB), 2) != pred_end(PostBB)) {
@@ -3048,15 +3087,15 @@ static bool mergeConditionalStoreToAddress(BasicBlock *PTB, BasicBlock *PFB, // store that doesn't execute. if (MinAlignment != 0) { // Choose the minimum of all non-zero alignments. - SI->setAlignment(MinAlignment); + SI->setAlignment(Align(MinAlignment)); } else if (MaxAlignment != 0) { // Choose the minimal alignment between the non-zero alignment and the ABI // default alignment for the type of the stored value. - SI->setAlignment(std::min(MaxAlignment, TypeAlignment)); + SI->setAlignment(Align(std::min(MaxAlignment, TypeAlignment))); } else { // If both alignments are zero, use ABI default alignment for the type of // the stored value. - SI->setAlignment(TypeAlignment); + SI->setAlignment(Align(TypeAlignment)); } QStore->eraseFromParent();
@@ -3066,7 +3105,8 @@ static bool mergeConditionalStoreToAddress(BasicBlock *PTB, BasicBlock *PFB, } static bool mergeConditionalStores(BranchInst *PBI, BranchInst *QBI, - const DataLayout &DL) { + const DataLayout &DL, + const TargetTransformInfo &TTI) { // The intention here is to find diamonds or triangles (see below) where each // conditional block contains a store to the same address. Both of these // stores are conditional, so they can't be unconditionally sunk. But it may
@@ -3168,7 +3208,7 @@ static bool mergeConditionalStores(BranchInst *PBI, BranchInst *QBI, bool Changed = false; for (auto *Address : CommonAddresses) Changed |= mergeConditionalStoreToAddress( - PTB, PFB, QTB, QFB, PostBB, Address, InvertPCond, InvertQCond, DL); + PTB, PFB, QTB, QFB, PostBB, Address, InvertPCond, InvertQCond, DL, TTI); return Changed; }
@@ -3177,7 +3217,8 @@ static bool mergeConditionalStores(BranchInst *PBI, BranchInst *QBI, /// that PBI and BI are both conditional branches, and BI is in one of the /// successor blocks of PBI - PBI branches to BI.
static bool SimplifyCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI, - const DataLayout &DL) { + const DataLayout &DL, + const TargetTransformInfo &TTI) { assert(PBI->isConditional() && BI->isConditional()); BasicBlock *BB = BI->getParent(); @@ -3233,7 +3274,7 @@ static bool SimplifyCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI, // If both branches are conditional and both contain stores to the same // address, remove the stores from the conditionals and create a conditional // merged store at the end. - if (MergeCondStores && mergeConditionalStores(PBI, BI, DL)) + if (MergeCondStores && mergeConditionalStores(PBI, BI, DL, TTI)) return true; // If this is a conditional branch in an empty block, and if any @@ -3697,12 +3738,17 @@ static bool SimplifyBranchOnICmpChain(BranchInst *BI, IRBuilder<> &Builder, BasicBlock *BB = BI->getParent(); + // MSAN does not like undefs as branch condition which can be introduced + // with "explicit branch". + if (ExtraCase && BB->getParent()->hasFnAttribute(Attribute::SanitizeMemory)) + return false; + LLVM_DEBUG(dbgs() << "Converting 'icmp' chain with " << Values.size() << " cases into SWITCH. BB is:\n" << *BB); // If there are any extra values that couldn't be folded into the switch - // then we evaluate them with an explicit branch first. Split the block + // then we evaluate them with an explicit branch first. Split the block // right before the condbr to handle it. if (ExtraCase) { BasicBlock *NewBB = @@ -3851,7 +3897,7 @@ bool SimplifyCFGOpt::SimplifyCommonResume(ResumeInst *RI) { // Simplify resume that is only used by a single (non-phi) landing pad. bool SimplifyCFGOpt::SimplifySingleResume(ResumeInst *RI) { BasicBlock *BB = RI->getParent(); - LandingPadInst *LPInst = dyn_cast(BB->getFirstNonPHI()); + auto *LPInst = cast(BB->getFirstNonPHI()); assert(RI->getValue() == LPInst && "Resume must unwind the exception that caused control to here"); @@ -4178,23 +4224,22 @@ bool SimplifyCFGOpt::SimplifyUnreachable(UnreachableInst *UI) { IRBuilder<> Builder(TI); if (auto *BI = dyn_cast(TI)) { if (BI->isUnconditional()) { - if (BI->getSuccessor(0) == BB) { - new UnreachableInst(TI->getContext(), TI); - TI->eraseFromParent(); - Changed = true; - } + assert(BI->getSuccessor(0) == BB && "Incorrect CFG"); + new UnreachableInst(TI->getContext(), TI); + TI->eraseFromParent(); + Changed = true; } else { Value* Cond = BI->getCondition(); if (BI->getSuccessor(0) == BB) { Builder.CreateAssumption(Builder.CreateNot(Cond)); Builder.CreateBr(BI->getSuccessor(1)); - EraseTerminatorAndDCECond(BI); - } else if (BI->getSuccessor(1) == BB) { + } else { + assert(BI->getSuccessor(1) == BB && "Incorrect CFG"); Builder.CreateAssumption(Cond); Builder.CreateBr(BI->getSuccessor(0)); - EraseTerminatorAndDCECond(BI); - Changed = true; } + EraseTerminatorAndDCECond(BI); + Changed = true; } } else if (auto *SI = dyn_cast(TI)) { SwitchInstProfUpdateWrapper SU(*SI); @@ -4276,6 +4321,17 @@ static bool CasesAreContiguous(SmallVectorImpl &Cases) { return true; } +static void createUnreachableSwitchDefault(SwitchInst *Switch) { + LLVM_DEBUG(dbgs() << "SimplifyCFG: switch default is dead.\n"); + BasicBlock *NewDefaultBlock = + SplitBlockPredecessors(Switch->getDefaultDest(), Switch->getParent(), ""); + Switch->setDefaultDest(&*NewDefaultBlock); + SplitBlock(&*NewDefaultBlock, &NewDefaultBlock->front()); + auto *NewTerminator = NewDefaultBlock->getTerminator(); + new UnreachableInst(Switch->getContext(), NewTerminator); + EraseTerminatorAndDCECond(NewTerminator); +} + /// Turn a 
switch with two reachable destinations into an integer range /// comparison and branch. static bool TurnSwitchRangeIntoICmp(SwitchInst *SI, IRBuilder<> &Builder) { @@ -4384,6 +4440,11 @@ static bool TurnSwitchRangeIntoICmp(SwitchInst *SI, IRBuilder<> &Builder) { cast(BBI)->removeIncomingValue(SI->getParent()); } + // Clean up the default block - it may have phis or other instructions before + // the unreachable terminator. + if (!HasDefault) + createUnreachableSwitchDefault(SI); + // Drop the switch. SI->eraseFromParent(); @@ -4428,14 +4489,7 @@ static bool eliminateDeadSwitchCases(SwitchInst *SI, AssumptionCache *AC, if (HasDefault && DeadCases.empty() && NumUnknownBits < 64 /* avoid overflow */ && SI->getNumCases() == (1ULL << NumUnknownBits)) { - LLVM_DEBUG(dbgs() << "SimplifyCFG: switch default is dead.\n"); - BasicBlock *NewDefault = - SplitBlockPredecessors(SI->getDefaultDest(), SI->getParent(), ""); - SI->setDefaultDest(&*NewDefault); - SplitBlock(&*NewDefault, &NewDefault->front()); - auto *OldTI = NewDefault->getTerminator(); - new UnreachableInst(SI->getContext(), OldTI); - EraseTerminatorAndDCECond(OldTI); + createUnreachableSwitchDefault(SI); return true; } @@ -5031,7 +5085,7 @@ SwitchLookupTable::SwitchLookupTable( Array->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); // Set the alignment to that of an array items. We will be only loading one // value out of it. - Array->setAlignment(DL.getPrefTypeAlignment(ValueType)); + Array->setAlignment(Align(DL.getPrefTypeAlignment(ValueType))); Kind = ArrayKind; } @@ -5260,7 +5314,7 @@ static bool SwitchToLookupTable(SwitchInst *SI, IRBuilder<> &Builder, // Figure out the corresponding result for each case value and phi node in the // common destination, as well as the min and max case values. - assert(!empty(SI->cases())); + assert(!SI->cases().empty()); SwitchInst::CaseIt CI = SI->case_begin(); ConstantInt *MinCaseVal = CI->getCaseValue(); ConstantInt *MaxCaseVal = CI->getCaseValue(); @@ -5892,7 +5946,7 @@ bool SimplifyCFGOpt::SimplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder) { for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) if (BranchInst *PBI = dyn_cast((*PI)->getTerminator())) if (PBI != BI && PBI->isConditional()) - if (SimplifyCondBranchToCondBranch(PBI, BI, DL)) + if (SimplifyCondBranchToCondBranch(PBI, BI, DL, TTI)) return requestResimplify(); // Look for diamond patterns. 
@@ -5900,7 +5954,7 @@ bool SimplifyCFGOpt::SimplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder) { if (BasicBlock *PrevBB = allPredecessorsComeFromSameSource(BB)) if (BranchInst *PBI = dyn_cast(PrevBB->getTerminator())) if (PBI != BI && PBI->isConditional()) - if (mergeConditionalStores(PBI, BI, DL)) + if (mergeConditionalStores(PBI, BI, DL, TTI)) return requestResimplify(); return false; diff --git a/lib/Transforms/Utils/SimplifyLibCalls.cpp b/lib/Transforms/Utils/SimplifyLibCalls.cpp index e0def81d5eee..0324993a8203 100644 --- a/lib/Transforms/Utils/SimplifyLibCalls.cpp +++ b/lib/Transforms/Utils/SimplifyLibCalls.cpp @@ -35,6 +35,7 @@ #include "llvm/IR/PatternMatch.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/KnownBits.h" +#include "llvm/Support/MathExtras.h" #include "llvm/Transforms/Utils/BuildLibCalls.h" #include "llvm/Transforms/Utils/SizeOpts.h" @@ -47,7 +48,6 @@ static cl::opt cl::desc("Enable unsafe double to float " "shrinking for math lib calls")); - //===----------------------------------------------------------------------===// // Helper Functions //===----------------------------------------------------------------------===// @@ -177,7 +177,8 @@ static bool canTransformToMemCmp(CallInst *CI, Value *Str, uint64_t Len, if (!isOnlyUsedInComparisonWithZero(CI)) return false; - if (!isDereferenceableAndAlignedPointer(Str, 1, APInt(64, Len), DL)) + if (!isDereferenceableAndAlignedPointer(Str, Align::None(), APInt(64, Len), + DL)) return false; if (CI->getFunction()->hasFnAttribute(Attribute::SanitizeMemory)) @@ -186,6 +187,67 @@ static bool canTransformToMemCmp(CallInst *CI, Value *Str, uint64_t Len, return true; } +static void annotateDereferenceableBytes(CallInst *CI, + ArrayRef ArgNos, + uint64_t DereferenceableBytes) { + const Function *F = CI->getCaller(); + if (!F) + return; + for (unsigned ArgNo : ArgNos) { + uint64_t DerefBytes = DereferenceableBytes; + unsigned AS = CI->getArgOperand(ArgNo)->getType()->getPointerAddressSpace(); + if (!llvm::NullPointerIsDefined(F, AS) || + CI->paramHasAttr(ArgNo, Attribute::NonNull)) + DerefBytes = std::max(CI->getDereferenceableOrNullBytes( + ArgNo + AttributeList::FirstArgIndex), + DereferenceableBytes); + + if (CI->getDereferenceableBytes(ArgNo + AttributeList::FirstArgIndex) < + DerefBytes) { + CI->removeParamAttr(ArgNo, Attribute::Dereferenceable); + if (!llvm::NullPointerIsDefined(F, AS) || + CI->paramHasAttr(ArgNo, Attribute::NonNull)) + CI->removeParamAttr(ArgNo, Attribute::DereferenceableOrNull); + CI->addParamAttr(ArgNo, Attribute::getWithDereferenceableBytes( + CI->getContext(), DerefBytes)); + } + } +} + +static void annotateNonNullBasedOnAccess(CallInst *CI, + ArrayRef ArgNos) { + Function *F = CI->getCaller(); + if (!F) + return; + + for (unsigned ArgNo : ArgNos) { + if (CI->paramHasAttr(ArgNo, Attribute::NonNull)) + continue; + unsigned AS = CI->getArgOperand(ArgNo)->getType()->getPointerAddressSpace(); + if (llvm::NullPointerIsDefined(F, AS)) + continue; + + CI->addParamAttr(ArgNo, Attribute::NonNull); + annotateDereferenceableBytes(CI, ArgNo, 1); + } +} + +static void annotateNonNullAndDereferenceable(CallInst *CI, ArrayRef ArgNos, + Value *Size, const DataLayout &DL) { + if (ConstantInt *LenC = dyn_cast(Size)) { + annotateNonNullBasedOnAccess(CI, ArgNos); + annotateDereferenceableBytes(CI, ArgNos, LenC->getZExtValue()); + } else if (isKnownNonZero(Size, DL)) { + annotateNonNullBasedOnAccess(CI, ArgNos); + const APInt *X, *Y; + uint64_t DerefMin = 1; + if (match(Size, m_Select(m_Value(), m_APInt(X), 
m_APInt(Y)))) { + DerefMin = std::min(X->getZExtValue(), Y->getZExtValue()); + annotateDereferenceableBytes(CI, ArgNos, DerefMin); + } + } +} + //===----------------------------------------------------------------------===// // String and Memory Library Call Optimizations //===----------------------------------------------------------------------===// @@ -194,10 +256,13 @@ Value *LibCallSimplifier::optimizeStrCat(CallInst *CI, IRBuilder<> &B) { // Extract some information from the instruction Value *Dst = CI->getArgOperand(0); Value *Src = CI->getArgOperand(1); + annotateNonNullBasedOnAccess(CI, {0, 1}); // See if we can get the length of the input string. uint64_t Len = GetStringLength(Src); - if (Len == 0) + if (Len) + annotateDereferenceableBytes(CI, 1, Len); + else return nullptr; --Len; // Unbias length. @@ -232,24 +297,34 @@ Value *LibCallSimplifier::optimizeStrNCat(CallInst *CI, IRBuilder<> &B) { // Extract some information from the instruction. Value *Dst = CI->getArgOperand(0); Value *Src = CI->getArgOperand(1); + Value *Size = CI->getArgOperand(2); uint64_t Len; + annotateNonNullBasedOnAccess(CI, 0); + if (isKnownNonZero(Size, DL)) + annotateNonNullBasedOnAccess(CI, 1); // We don't do anything if length is not constant. - if (ConstantInt *LengthArg = dyn_cast(CI->getArgOperand(2))) + ConstantInt *LengthArg = dyn_cast(Size); + if (LengthArg) { Len = LengthArg->getZExtValue(); - else + // strncat(x, c, 0) -> x + if (!Len) + return Dst; + } else { return nullptr; + } // See if we can get the length of the input string. uint64_t SrcLen = GetStringLength(Src); - if (SrcLen == 0) + if (SrcLen) { + annotateDereferenceableBytes(CI, 1, SrcLen); + --SrcLen; // Unbias length. + } else { return nullptr; - --SrcLen; // Unbias length. + } - // Handle the simple, do-nothing cases: // strncat(x, "", c) -> x - // strncat(x, c, 0) -> x - if (SrcLen == 0 || Len == 0) + if (SrcLen == 0) return Dst; // We don't optimize this case. @@ -265,13 +340,18 @@ Value *LibCallSimplifier::optimizeStrChr(CallInst *CI, IRBuilder<> &B) { Function *Callee = CI->getCalledFunction(); FunctionType *FT = Callee->getFunctionType(); Value *SrcStr = CI->getArgOperand(0); + annotateNonNullBasedOnAccess(CI, 0); // If the second operand is non-constant, see if we can compute the length // of the input string and turn this into memchr. ConstantInt *CharC = dyn_cast(CI->getArgOperand(1)); if (!CharC) { uint64_t Len = GetStringLength(SrcStr); - if (Len == 0 || !FT->getParamType(1)->isIntegerTy(32)) // memchr needs i32. + if (Len) + annotateDereferenceableBytes(CI, 0, Len); + else + return nullptr; + if (!FT->getParamType(1)->isIntegerTy(32)) // memchr needs i32. return nullptr; return emitMemChr(SrcStr, CI->getArgOperand(1), // include nul. @@ -304,6 +384,7 @@ Value *LibCallSimplifier::optimizeStrChr(CallInst *CI, IRBuilder<> &B) { Value *LibCallSimplifier::optimizeStrRChr(CallInst *CI, IRBuilder<> &B) { Value *SrcStr = CI->getArgOperand(0); ConstantInt *CharC = dyn_cast(CI->getArgOperand(1)); + annotateNonNullBasedOnAccess(CI, 0); // Cannot fold anything if we're not looking for a constant. 
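// A minimal standalone sketch of the rule implemented by the
// annotateNonNullBasedOnAccess / annotateDereferenceableBytes helpers above
// (simplified model, not the LLVM API): an argument that a libcall
// unconditionally accesses for a known number of bytes may be marked nonnull
// and dereferenceable(bytes), unless null is a valid address in that
// argument's address space.

#include <algorithm>
#include <cstdint>

struct ArgFacts {
  bool NonNull = false;
  uint64_t DerefBytes = 0; // bytes known to be dereferenceable
};

// NullIsDefined models llvm::NullPointerIsDefined for the address space.
inline void noteAccess(ArgFacts &A, uint64_t AccessedBytes, bool NullIsDefined) {
  if (!NullIsDefined)
    A.NonNull = true;                                   // an access implies nonnull
  A.DerefBytes = std::max(A.DerefBytes, AccessedBytes); // keep the larger bound
}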
if (!CharC) @@ -351,7 +432,12 @@ Value *LibCallSimplifier::optimizeStrCmp(CallInst *CI, IRBuilder<> &B) { // strcmp(P, "x") -> memcmp(P, "x", 2) uint64_t Len1 = GetStringLength(Str1P); + if (Len1) + annotateDereferenceableBytes(CI, 0, Len1); uint64_t Len2 = GetStringLength(Str2P); + if (Len2) + annotateDereferenceableBytes(CI, 1, Len2); + if (Len1 && Len2) { return emitMemCmp(Str1P, Str2P, ConstantInt::get(DL.getIntPtrType(CI->getContext()), @@ -374,17 +460,22 @@ Value *LibCallSimplifier::optimizeStrCmp(CallInst *CI, IRBuilder<> &B) { TLI); } + annotateNonNullBasedOnAccess(CI, {0, 1}); return nullptr; } Value *LibCallSimplifier::optimizeStrNCmp(CallInst *CI, IRBuilder<> &B) { - Value *Str1P = CI->getArgOperand(0), *Str2P = CI->getArgOperand(1); + Value *Str1P = CI->getArgOperand(0); + Value *Str2P = CI->getArgOperand(1); + Value *Size = CI->getArgOperand(2); if (Str1P == Str2P) // strncmp(x,x,n) -> 0 return ConstantInt::get(CI->getType(), 0); + if (isKnownNonZero(Size, DL)) + annotateNonNullBasedOnAccess(CI, {0, 1}); // Get the length argument if it is constant. uint64_t Length; - if (ConstantInt *LengthArg = dyn_cast(CI->getArgOperand(2))) + if (ConstantInt *LengthArg = dyn_cast(Size)) Length = LengthArg->getZExtValue(); else return nullptr; @@ -393,7 +484,7 @@ Value *LibCallSimplifier::optimizeStrNCmp(CallInst *CI, IRBuilder<> &B) { return ConstantInt::get(CI->getType(), 0); if (Length == 1) // strncmp(x,y,1) -> memcmp(x,y,1) - return emitMemCmp(Str1P, Str2P, CI->getArgOperand(2), B, DL, TLI); + return emitMemCmp(Str1P, Str2P, Size, B, DL, TLI); StringRef Str1, Str2; bool HasStr1 = getConstantStringInfo(Str1P, Str1); @@ -415,7 +506,11 @@ Value *LibCallSimplifier::optimizeStrNCmp(CallInst *CI, IRBuilder<> &B) { CI->getType()); uint64_t Len1 = GetStringLength(Str1P); + if (Len1) + annotateDereferenceableBytes(CI, 0, Len1); uint64_t Len2 = GetStringLength(Str2P); + if (Len2) + annotateDereferenceableBytes(CI, 1, Len2); // strncmp to memcmp if (!HasStr1 && HasStr2) { @@ -437,20 +532,38 @@ Value *LibCallSimplifier::optimizeStrNCmp(CallInst *CI, IRBuilder<> &B) { return nullptr; } +Value *LibCallSimplifier::optimizeStrNDup(CallInst *CI, IRBuilder<> &B) { + Value *Src = CI->getArgOperand(0); + ConstantInt *Size = dyn_cast(CI->getArgOperand(1)); + uint64_t SrcLen = GetStringLength(Src); + if (SrcLen && Size) { + annotateDereferenceableBytes(CI, 0, SrcLen); + if (SrcLen <= Size->getZExtValue() + 1) + return emitStrDup(Src, B, TLI); + } + + return nullptr; +} + Value *LibCallSimplifier::optimizeStrCpy(CallInst *CI, IRBuilder<> &B) { Value *Dst = CI->getArgOperand(0), *Src = CI->getArgOperand(1); if (Dst == Src) // strcpy(x,x) -> x return Src; - + + annotateNonNullBasedOnAccess(CI, {0, 1}); // See if we can get the length of the input string. uint64_t Len = GetStringLength(Src); - if (Len == 0) + if (Len) + annotateDereferenceableBytes(CI, 1, Len); + else return nullptr; // We have enough information to now generate the memcpy call to do the // copy for us. Make a memcpy to copy the nul byte with align = 1. - B.CreateMemCpy(Dst, 1, Src, 1, - ConstantInt::get(DL.getIntPtrType(CI->getContext()), Len)); + CallInst *NewCI = + B.CreateMemCpy(Dst, 1, Src, 1, + ConstantInt::get(DL.getIntPtrType(CI->getContext()), Len)); + NewCI->setAttributes(CI->getAttributes()); return Dst; } @@ -464,7 +577,9 @@ Value *LibCallSimplifier::optimizeStpCpy(CallInst *CI, IRBuilder<> &B) { // See if we can get the length of the input string. 
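// Worked example of the strcpy fold above, assuming the source length is a
// compile-time constant: the call is rewritten as a fixed-size memcpy that
// also copies the terminating nul byte.

#include <cstring>

void copyKnown(char *dst /* assumed >= 4 bytes */) {
  // strcpy(dst, "abc");        // GetStringLength() == 4 (includes the nul)
  std::memcpy(dst, "abc", 4);   // equivalent lowering, length = strlen + 1
}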
uint64_t Len = GetStringLength(Src); - if (Len == 0) + if (Len) + annotateDereferenceableBytes(CI, 1, Len); + else return nullptr; Type *PT = Callee->getFunctionType()->getParamType(0); @@ -474,7 +589,8 @@ Value *LibCallSimplifier::optimizeStpCpy(CallInst *CI, IRBuilder<> &B) { // We have enough information to now generate the memcpy call to do the // copy for us. Make a memcpy to copy the nul byte with align = 1. - B.CreateMemCpy(Dst, 1, Src, 1, LenV); + CallInst *NewCI = B.CreateMemCpy(Dst, 1, Src, 1, LenV); + NewCI->setAttributes(CI->getAttributes()); return DstEnd; } @@ -482,37 +598,47 @@ Value *LibCallSimplifier::optimizeStrNCpy(CallInst *CI, IRBuilder<> &B) { Function *Callee = CI->getCalledFunction(); Value *Dst = CI->getArgOperand(0); Value *Src = CI->getArgOperand(1); - Value *LenOp = CI->getArgOperand(2); + Value *Size = CI->getArgOperand(2); + annotateNonNullBasedOnAccess(CI, 0); + if (isKnownNonZero(Size, DL)) + annotateNonNullBasedOnAccess(CI, 1); + + uint64_t Len; + if (ConstantInt *LengthArg = dyn_cast(Size)) + Len = LengthArg->getZExtValue(); + else + return nullptr; + + // strncpy(x, y, 0) -> x + if (Len == 0) + return Dst; // See if we can get the length of the input string. uint64_t SrcLen = GetStringLength(Src); - if (SrcLen == 0) + if (SrcLen) { + annotateDereferenceableBytes(CI, 1, SrcLen); + --SrcLen; // Unbias length. + } else { return nullptr; - --SrcLen; + } if (SrcLen == 0) { // strncpy(x, "", y) -> memset(align 1 x, '\0', y) - B.CreateMemSet(Dst, B.getInt8('\0'), LenOp, 1); + CallInst *NewCI = B.CreateMemSet(Dst, B.getInt8('\0'), Size, 1); + AttrBuilder ArgAttrs(CI->getAttributes().getParamAttributes(0)); + NewCI->setAttributes(NewCI->getAttributes().addParamAttributes( + CI->getContext(), 0, ArgAttrs)); return Dst; } - uint64_t Len; - if (ConstantInt *LengthArg = dyn_cast(LenOp)) - Len = LengthArg->getZExtValue(); - else - return nullptr; - - if (Len == 0) - return Dst; // strncpy(x, y, 0) -> x - // Let strncpy handle the zero padding if (Len > SrcLen + 1) return nullptr; Type *PT = Callee->getFunctionType()->getParamType(0); // strncpy(x, s, c) -> memcpy(align 1 x, align 1 s, c) [s and c are constant] - B.CreateMemCpy(Dst, 1, Src, 1, ConstantInt::get(DL.getIntPtrType(PT), Len)); - + CallInst *NewCI = B.CreateMemCpy(Dst, 1, Src, 1, ConstantInt::get(DL.getIntPtrType(PT), Len)); + NewCI->setAttributes(CI->getAttributes()); return Dst; } @@ -608,7 +734,10 @@ Value *LibCallSimplifier::optimizeStringLength(CallInst *CI, IRBuilder<> &B, } Value *LibCallSimplifier::optimizeStrLen(CallInst *CI, IRBuilder<> &B) { - return optimizeStringLength(CI, B, 8); + if (Value *V = optimizeStringLength(CI, B, 8)) + return V; + annotateNonNullBasedOnAccess(CI, 0); + return nullptr; } Value *LibCallSimplifier::optimizeWcslen(CallInst *CI, IRBuilder<> &B) { @@ -756,21 +885,35 @@ Value *LibCallSimplifier::optimizeStrStr(CallInst *CI, IRBuilder<> &B) { Value *StrChr = emitStrChr(CI->getArgOperand(0), ToFindStr[0], B, TLI); return StrChr ? 
B.CreateBitCast(StrChr, CI->getType()) : nullptr; } + + annotateNonNullBasedOnAccess(CI, {0, 1}); + return nullptr; +} + +Value *LibCallSimplifier::optimizeMemRChr(CallInst *CI, IRBuilder<> &B) { + if (isKnownNonZero(CI->getOperand(2), DL)) + annotateNonNullBasedOnAccess(CI, 0); return nullptr; } Value *LibCallSimplifier::optimizeMemChr(CallInst *CI, IRBuilder<> &B) { Value *SrcStr = CI->getArgOperand(0); + Value *Size = CI->getArgOperand(2); + annotateNonNullAndDereferenceable(CI, 0, Size, DL); ConstantInt *CharC = dyn_cast(CI->getArgOperand(1)); - ConstantInt *LenC = dyn_cast(CI->getArgOperand(2)); + ConstantInt *LenC = dyn_cast(Size); // memchr(x, y, 0) -> null - if (LenC && LenC->isZero()) - return Constant::getNullValue(CI->getType()); + if (LenC) { + if (LenC->isZero()) + return Constant::getNullValue(CI->getType()); + } else { + // From now on we need at least constant length and string. + return nullptr; + } - // From now on we need at least constant length and string. StringRef Str; - if (!LenC || !getConstantStringInfo(SrcStr, Str, 0, /*TrimAtNul=*/false)) + if (!getConstantStringInfo(SrcStr, Str, 0, /*TrimAtNul=*/false)) return nullptr; // Truncate the string to LenC. If Str is smaller than LenC we will still only @@ -913,6 +1056,7 @@ static Value *optimizeMemCmpConstantSize(CallInst *CI, Value *LHS, Value *RHS, Ret = 1; return ConstantInt::get(CI->getType(), Ret); } + return nullptr; } @@ -925,12 +1069,19 @@ Value *LibCallSimplifier::optimizeMemCmpBCmpCommon(CallInst *CI, if (LHS == RHS) // memcmp(s,s,x) -> 0 return Constant::getNullValue(CI->getType()); + annotateNonNullAndDereferenceable(CI, {0, 1}, Size, DL); // Handle constant lengths. - if (ConstantInt *LenC = dyn_cast(Size)) - if (Value *Res = optimizeMemCmpConstantSize(CI, LHS, RHS, - LenC->getZExtValue(), B, DL)) - return Res; + ConstantInt *LenC = dyn_cast(Size); + if (!LenC) + return nullptr; + // memcmp(d,s,0) -> 0 + if (LenC->getZExtValue() == 0) + return Constant::getNullValue(CI->getType()); + + if (Value *Res = + optimizeMemCmpConstantSize(CI, LHS, RHS, LenC->getZExtValue(), B, DL)) + return Res; return nullptr; } @@ -939,9 +1090,9 @@ Value *LibCallSimplifier::optimizeMemCmp(CallInst *CI, IRBuilder<> &B) { return V; // memcmp(x, y, Len) == 0 -> bcmp(x, y, Len) == 0 - // `bcmp` can be more efficient than memcmp because it only has to know that - // there is a difference, not where it is. - if (isOnlyUsedInZeroEqualityComparison(CI) && TLI->has(LibFunc_bcmp)) { + // bcmp can be more efficient than memcmp because it only has to know that + // there is a difference, not how different one is to the other. 
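// Example of the memcmp -> bcmp rewrite described above. It is only valid
// because the result is used purely in an equality test against zero;
// bcmp's nonzero return value carries no ordering, unlike memcmp's.

#include <cstring>
#include <strings.h> // bcmp (POSIX)

bool equalViaMemcmp(const void *a, const void *b, std::size_t n) {
  return std::memcmp(a, b, n) == 0;
}

bool equalViaBcmp(const void *a, const void *b, std::size_t n) {
  return bcmp(a, b, n) == 0; // same truth value, potentially cheaper
}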
+ if (TLI->has(LibFunc_bcmp) && isOnlyUsedInZeroEqualityComparison(CI)) { Value *LHS = CI->getArgOperand(0); Value *RHS = CI->getArgOperand(1); Value *Size = CI->getArgOperand(2); @@ -956,16 +1107,37 @@ Value *LibCallSimplifier::optimizeBCmp(CallInst *CI, IRBuilder<> &B) { } Value *LibCallSimplifier::optimizeMemCpy(CallInst *CI, IRBuilder<> &B) { + Value *Size = CI->getArgOperand(2); + annotateNonNullAndDereferenceable(CI, {0, 1}, Size, DL); + if (isa(CI)) + return nullptr; + // memcpy(x, y, n) -> llvm.memcpy(align 1 x, align 1 y, n) - B.CreateMemCpy(CI->getArgOperand(0), 1, CI->getArgOperand(1), 1, - CI->getArgOperand(2)); + CallInst *NewCI = + B.CreateMemCpy(CI->getArgOperand(0), 1, CI->getArgOperand(1), 1, Size); + NewCI->setAttributes(CI->getAttributes()); return CI->getArgOperand(0); } +Value *LibCallSimplifier::optimizeMemPCpy(CallInst *CI, IRBuilder<> &B) { + Value *Dst = CI->getArgOperand(0); + Value *N = CI->getArgOperand(2); + // mempcpy(x, y, n) -> llvm.memcpy(align 1 x, align 1 y, n), x + n + CallInst *NewCI = B.CreateMemCpy(Dst, 1, CI->getArgOperand(1), 1, N); + NewCI->setAttributes(CI->getAttributes()); + return B.CreateInBoundsGEP(B.getInt8Ty(), Dst, N); +} + Value *LibCallSimplifier::optimizeMemMove(CallInst *CI, IRBuilder<> &B) { + Value *Size = CI->getArgOperand(2); + annotateNonNullAndDereferenceable(CI, {0, 1}, Size, DL); + if (isa(CI)) + return nullptr; + // memmove(x, y, n) -> llvm.memmove(align 1 x, align 1 y, n) - B.CreateMemMove(CI->getArgOperand(0), 1, CI->getArgOperand(1), 1, - CI->getArgOperand(2)); + CallInst *NewCI = + B.CreateMemMove(CI->getArgOperand(0), 1, CI->getArgOperand(1), 1, Size); + NewCI->setAttributes(CI->getAttributes()); return CI->getArgOperand(0); } @@ -1003,25 +1175,29 @@ Value *LibCallSimplifier::foldMallocMemset(CallInst *Memset, IRBuilder<> &B) { B.SetInsertPoint(Malloc->getParent(), ++Malloc->getIterator()); const DataLayout &DL = Malloc->getModule()->getDataLayout(); IntegerType *SizeType = DL.getIntPtrType(B.GetInsertBlock()->getContext()); - Value *Calloc = emitCalloc(ConstantInt::get(SizeType, 1), - Malloc->getArgOperand(0), Malloc->getAttributes(), - B, *TLI); - if (!Calloc) - return nullptr; - - Malloc->replaceAllUsesWith(Calloc); - eraseFromParent(Malloc); + if (Value *Calloc = emitCalloc(ConstantInt::get(SizeType, 1), + Malloc->getArgOperand(0), + Malloc->getAttributes(), B, *TLI)) { + substituteInParent(Malloc, Calloc); + return Calloc; + } - return Calloc; + return nullptr; } Value *LibCallSimplifier::optimizeMemSet(CallInst *CI, IRBuilder<> &B) { + Value *Size = CI->getArgOperand(2); + annotateNonNullAndDereferenceable(CI, 0, Size, DL); + if (isa(CI)) + return nullptr; + if (auto *Calloc = foldMallocMemset(CI, B)) return Calloc; // memset(p, v, n) -> llvm.memset(align 1 p, v, n) Value *Val = B.CreateIntCast(CI->getArgOperand(1), B.getInt8Ty(), false); - B.CreateMemSet(CI->getArgOperand(0), Val, CI->getArgOperand(2), 1); + CallInst *NewCI = B.CreateMemSet(CI->getArgOperand(0), Val, Size, 1); + NewCI->setAttributes(CI->getAttributes()); return CI->getArgOperand(0); } @@ -1096,21 +1272,18 @@ static Value *optimizeDoubleFP(CallInst *CI, IRBuilder<> &B, if (!V[0] || (isBinary && !V[1])) return nullptr; - StringRef CalleeNm = CalleeFn->getName(); - AttributeList CalleeAt = CalleeFn->getAttributes(); - bool CalleeIn = CalleeFn->isIntrinsic(); - // If call isn't an intrinsic, check that it isn't within a function with the // same name as the float version of this call, otherwise the result is an // infinite loop. 
For example, from MinGW-w64: // // float expf(float val) { return (float) exp((double) val); } - if (!CalleeIn) { - const Function *Fn = CI->getFunction(); - StringRef FnName = Fn->getName(); - if (FnName.back() == 'f' && - FnName.size() == (CalleeNm.size() + 1) && - FnName.startswith(CalleeNm)) + StringRef CalleeName = CalleeFn->getName(); + bool IsIntrinsic = CalleeFn->isIntrinsic(); + if (!IsIntrinsic) { + StringRef CallerName = CI->getFunction()->getName(); + if (!CallerName.empty() && CallerName.back() == 'f' && + CallerName.size() == (CalleeName.size() + 1) && + CallerName.startswith(CalleeName)) return nullptr; } @@ -1120,16 +1293,16 @@ static Value *optimizeDoubleFP(CallInst *CI, IRBuilder<> &B, // g((double) float) -> (double) gf(float) Value *R; - if (CalleeIn) { + if (IsIntrinsic) { Module *M = CI->getModule(); Intrinsic::ID IID = CalleeFn->getIntrinsicID(); Function *Fn = Intrinsic::getDeclaration(M, IID, B.getFloatTy()); R = isBinary ? B.CreateCall(Fn, V) : B.CreateCall(Fn, V[0]); + } else { + AttributeList CalleeAttrs = CalleeFn->getAttributes(); + R = isBinary ? emitBinaryFloatFnCall(V[0], V[1], CalleeName, B, CalleeAttrs) + : emitUnaryFloatFnCall(V[0], CalleeName, B, CalleeAttrs); } - else - R = isBinary ? emitBinaryFloatFnCall(V[0], V[1], CalleeNm, B, CalleeAt) - : emitUnaryFloatFnCall(V[0], CalleeNm, B, CalleeAt); - return B.CreateFPExt(R, B.getDoubleTy()); } @@ -1234,9 +1407,25 @@ static Value *getPow(Value *InnerChain[33], unsigned Exp, IRBuilder<> &B) { return InnerChain[Exp]; } +// Return a properly extended 32-bit integer if the operation is an itofp. +static Value *getIntToFPVal(Value *I2F, IRBuilder<> &B) { + if (isa(I2F) || isa(I2F)) { + Value *Op = cast(I2F)->getOperand(0); + // Make sure that the exponent fits inside an int32_t, + // thus avoiding any range issues that FP has not. + unsigned BitWidth = Op->getType()->getPrimitiveSizeInBits(); + if (BitWidth < 32 || + (BitWidth == 32 && isa(I2F))) + return isa(I2F) ? B.CreateSExt(Op, B.getInt32Ty()) + : B.CreateZExt(Op, B.getInt32Ty()); + } + + return nullptr; +} + /// Use exp{,2}(x * y) for pow(exp{,2}(x), y); -/// exp2(n * x) for pow(2.0 ** n, x); exp10(x) for pow(10.0, x); -/// exp2(log2(n) * x) for pow(n, x). +/// ldexp(1.0, x) for pow(2.0, itofp(x)); exp2(n * x) for pow(2.0 ** n, x); +/// exp10(x) for pow(10.0, x); exp2(log2(n) * x) for pow(n, x). Value *LibCallSimplifier::replacePowWithExp(CallInst *Pow, IRBuilder<> &B) { Value *Base = Pow->getArgOperand(0), *Expo = Pow->getArgOperand(1); AttributeList Attrs = Pow->getCalledFunction()->getAttributes(); @@ -1269,9 +1458,7 @@ Value *LibCallSimplifier::replacePowWithExp(CallInst *Pow, IRBuilder<> &B) { StringRef ExpName; Intrinsic::ID ID; Value *ExpFn; - LibFunc LibFnFloat; - LibFunc LibFnDouble; - LibFunc LibFnLongDouble; + LibFunc LibFnFloat, LibFnDouble, LibFnLongDouble; switch (LibFn) { default: @@ -1305,9 +1492,7 @@ Value *LibCallSimplifier::replacePowWithExp(CallInst *Pow, IRBuilder<> &B) { // elimination cannot be trusted to remove it, since it may have side // effects (e.g., errno). When the only consumer for the original // exp{,2}() is pow(), then it has to be explicitly erased. 
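// Quick numerical illustration of the identity this block exploits (both
// calls must be fast-math; bitwise-exact equality is not guaranteed under
// strict FP semantics):
//   pow(exp(x), y)  ->  exp(x * y)
//   pow(exp2(x), y) ->  exp2(x * y)

#include <cmath>
#include <cstdio>

int main() {
  double x = 1.7, y = 2.5;
  std::printf("%.17g\n%.17g\n", std::pow(std::exp(x), y), std::exp(x * y));
  return 0;
}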
- BaseFn->replaceAllUsesWith(ExpFn); - eraseFromParent(BaseFn); - + substituteInParent(BaseFn, ExpFn); return ExpFn; } } @@ -1318,8 +1503,18 @@ Value *LibCallSimplifier::replacePowWithExp(CallInst *Pow, IRBuilder<> &B) { if (!match(Pow->getArgOperand(0), m_APFloat(BaseF))) return nullptr; + // pow(2.0, itofp(x)) -> ldexp(1.0, x) + if (match(Base, m_SpecificFP(2.0)) && + (isa(Expo) || isa(Expo)) && + hasFloatFn(TLI, Ty, LibFunc_ldexp, LibFunc_ldexpf, LibFunc_ldexpl)) { + if (Value *ExpoI = getIntToFPVal(Expo, B)) + return emitBinaryFloatFnCall(ConstantFP::get(Ty, 1.0), ExpoI, TLI, + LibFunc_ldexp, LibFunc_ldexpf, LibFunc_ldexpl, + B, Attrs); + } + // pow(2.0 ** n, x) -> exp2(n * x) - if (hasUnaryFloatFn(TLI, Ty, LibFunc_exp2, LibFunc_exp2f, LibFunc_exp2l)) { + if (hasFloatFn(TLI, Ty, LibFunc_exp2, LibFunc_exp2f, LibFunc_exp2l)) { APFloat BaseR = APFloat(1.0); BaseR.convert(BaseF->getSemantics(), APFloat::rmTowardZero, &Ignored); BaseR = BaseR / *BaseF; @@ -1344,7 +1539,7 @@ Value *LibCallSimplifier::replacePowWithExp(CallInst *Pow, IRBuilder<> &B) { // pow(10.0, x) -> exp10(x) // TODO: There is no exp10() intrinsic yet, but some day there shall be one. if (match(Base, m_SpecificFP(10.0)) && - hasUnaryFloatFn(TLI, Ty, LibFunc_exp10, LibFunc_exp10f, LibFunc_exp10l)) + hasFloatFn(TLI, Ty, LibFunc_exp10, LibFunc_exp10f, LibFunc_exp10l)) return emitUnaryFloatFnCall(Expo, TLI, LibFunc_exp10, LibFunc_exp10f, LibFunc_exp10l, B, Attrs); @@ -1359,17 +1554,15 @@ Value *LibCallSimplifier::replacePowWithExp(CallInst *Pow, IRBuilder<> &B) { if (Log) { Value *FMul = B.CreateFMul(Log, Expo, "mul"); - if (Pow->doesNotAccessMemory()) { + if (Pow->doesNotAccessMemory()) return B.CreateCall(Intrinsic::getDeclaration(Mod, Intrinsic::exp2, Ty), FMul, "exp2"); - } else { - if (hasUnaryFloatFn(TLI, Ty, LibFunc_exp2, LibFunc_exp2f, - LibFunc_exp2l)) - return emitUnaryFloatFnCall(FMul, TLI, LibFunc_exp2, LibFunc_exp2f, - LibFunc_exp2l, B, Attrs); - } + else if (hasFloatFn(TLI, Ty, LibFunc_exp2, LibFunc_exp2f, LibFunc_exp2l)) + return emitUnaryFloatFnCall(FMul, TLI, LibFunc_exp2, LibFunc_exp2f, + LibFunc_exp2l, B, Attrs); } } + return nullptr; } @@ -1384,8 +1577,7 @@ static Value *getSqrtCall(Value *V, AttributeList Attrs, bool NoErrno, } // Otherwise, use the libcall for sqrt(). - if (hasUnaryFloatFn(TLI, V->getType(), LibFunc_sqrt, LibFunc_sqrtf, - LibFunc_sqrtl)) + if (hasFloatFn(TLI, V->getType(), LibFunc_sqrt, LibFunc_sqrtf, LibFunc_sqrtl)) // TODO: We also should check that the target can in fact lower the sqrt() // libcall. We currently have no way to ask this question, so we ask if // the target has a sqrt() libcall, which is not exactly the same. @@ -1452,7 +1644,7 @@ Value *LibCallSimplifier::optimizePow(CallInst *Pow, IRBuilder<> &B) { bool Ignored; // Bail out if simplifying libcalls to pow() is disabled. - if (!hasUnaryFloatFn(TLI, Ty, LibFunc_pow, LibFunc_powf, LibFunc_powl)) + if (!hasFloatFn(TLI, Ty, LibFunc_pow, LibFunc_powf, LibFunc_powl)) return nullptr; // Propagate the math semantics from the call to any created instructions. 
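// Worked example of the new pow -> ldexp rewrite introduced in this hunk
// (the integer exponent must fit in 32 bits, matching getIntToFPVal above):

#include <cmath>

double pow2Before(int n) { return std::pow(2.0, static_cast<double>(n)); }
double pow2After(int n)  { return std::ldexp(1.0, n); } // 1.0 * 2^n, same result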
@@ -1480,8 +1672,8 @@ Value *LibCallSimplifier::optimizePow(CallInst *Pow, IRBuilder<> &B) { if (match(Expo, m_SpecificFP(-1.0))) return B.CreateFDiv(ConstantFP::get(Ty, 1.0), Base, "reciprocal"); - // pow(x, 0.0) -> 1.0 - if (match(Expo, m_SpecificFP(0.0))) + // pow(x, +/-0.0) -> 1.0 + if (match(Expo, m_AnyZeroFP())) return ConstantFP::get(Ty, 1.0); // pow(x, 1.0) -> x @@ -1558,16 +1750,8 @@ Value *LibCallSimplifier::optimizePow(CallInst *Pow, IRBuilder<> &B) { // powf(x, itofp(y)) -> powi(x, y) if (AllowApprox && (isa(Expo) || isa(Expo))) { - Value *IntExpo = cast(Expo)->getOperand(0); - Value *NewExpo = nullptr; - unsigned BitWidth = IntExpo->getType()->getPrimitiveSizeInBits(); - if (isa(Expo) && BitWidth == 32) - NewExpo = IntExpo; - else if (BitWidth < 32) - NewExpo = isa(Expo) ? B.CreateSExt(IntExpo, B.getInt32Ty()) - : B.CreateZExt(IntExpo, B.getInt32Ty()); - if (NewExpo) - return createPowWithIntegerExponent(Base, NewExpo, M, B); + if (Value *ExpoI = getIntToFPVal(Expo, B)) + return createPowWithIntegerExponent(Base, ExpoI, M, B); } return Shrunk; @@ -1575,45 +1759,25 @@ Value *LibCallSimplifier::optimizePow(CallInst *Pow, IRBuilder<> &B) { Value *LibCallSimplifier::optimizeExp2(CallInst *CI, IRBuilder<> &B) { Function *Callee = CI->getCalledFunction(); - Value *Ret = nullptr; StringRef Name = Callee->getName(); - if (UnsafeFPShrink && Name == "exp2" && hasFloatVersion(Name)) + Value *Ret = nullptr; + if (UnsafeFPShrink && Name == TLI->getName(LibFunc_exp2) && + hasFloatVersion(Name)) Ret = optimizeUnaryDoubleFP(CI, B, true); + Type *Ty = CI->getType(); Value *Op = CI->getArgOperand(0); + // Turn exp2(sitofp(x)) -> ldexp(1.0, sext(x)) if sizeof(x) <= 32 // Turn exp2(uitofp(x)) -> ldexp(1.0, zext(x)) if sizeof(x) < 32 - LibFunc LdExp = LibFunc_ldexpl; - if (Op->getType()->isFloatTy()) - LdExp = LibFunc_ldexpf; - else if (Op->getType()->isDoubleTy()) - LdExp = LibFunc_ldexp; - - if (TLI->has(LdExp)) { - Value *LdExpArg = nullptr; - if (SIToFPInst *OpC = dyn_cast(Op)) { - if (OpC->getOperand(0)->getType()->getPrimitiveSizeInBits() <= 32) - LdExpArg = B.CreateSExt(OpC->getOperand(0), B.getInt32Ty()); - } else if (UIToFPInst *OpC = dyn_cast(Op)) { - if (OpC->getOperand(0)->getType()->getPrimitiveSizeInBits() < 32) - LdExpArg = B.CreateZExt(OpC->getOperand(0), B.getInt32Ty()); - } - - if (LdExpArg) { - Constant *One = ConstantFP::get(CI->getContext(), APFloat(1.0f)); - if (!Op->getType()->isFloatTy()) - One = ConstantExpr::getFPExtend(One, Op->getType()); - - Module *M = CI->getModule(); - FunctionCallee NewCallee = M->getOrInsertFunction( - TLI->getName(LdExp), Op->getType(), Op->getType(), B.getInt32Ty()); - CallInst *CI = B.CreateCall(NewCallee, {One, LdExpArg}); - if (const Function *F = dyn_cast(Callee->stripPointerCasts())) - CI->setCallingConv(F->getCallingConv()); - - return CI; - } + if ((isa(Op) || isa(Op)) && + hasFloatFn(TLI, Ty, LibFunc_ldexp, LibFunc_ldexpf, LibFunc_ldexpl)) { + if (Value *Exp = getIntToFPVal(Op, B)) + return emitBinaryFloatFnCall(ConstantFP::get(Ty, 1.0), Exp, TLI, + LibFunc_ldexp, LibFunc_ldexpf, LibFunc_ldexpl, + B, CI->getCalledFunction()->getAttributes()); } + return Ret; } @@ -1644,48 +1808,155 @@ Value *LibCallSimplifier::optimizeFMinFMax(CallInst *CI, IRBuilder<> &B) { return B.CreateCall(F, { CI->getArgOperand(0), CI->getArgOperand(1) }); } -Value *LibCallSimplifier::optimizeLog(CallInst *CI, IRBuilder<> &B) { - Function *Callee = CI->getCalledFunction(); +Value *LibCallSimplifier::optimizeLog(CallInst *Log, IRBuilder<> &B) { + Function *LogFn = 
Log->getCalledFunction(); + AttributeList Attrs = LogFn->getAttributes(); + StringRef LogNm = LogFn->getName(); + Intrinsic::ID LogID = LogFn->getIntrinsicID(); + Module *Mod = Log->getModule(); + Type *Ty = Log->getType(); Value *Ret = nullptr; - StringRef Name = Callee->getName(); - if (UnsafeFPShrink && hasFloatVersion(Name)) - Ret = optimizeUnaryDoubleFP(CI, B, true); - if (!CI->isFast()) - return Ret; - Value *Op1 = CI->getArgOperand(0); - auto *OpC = dyn_cast(Op1); + if (UnsafeFPShrink && hasFloatVersion(LogNm)) + Ret = optimizeUnaryDoubleFP(Log, B, true); // The earlier call must also be 'fast' in order to do these transforms. - if (!OpC || !OpC->isFast()) + CallInst *Arg = dyn_cast(Log->getArgOperand(0)); + if (!Log->isFast() || !Arg || !Arg->isFast() || !Arg->hasOneUse()) return Ret; - // log(pow(x,y)) -> y*log(x) - // This is only applicable to log, log2, log10. - if (Name != "log" && Name != "log2" && Name != "log10") + LibFunc LogLb, ExpLb, Exp2Lb, Exp10Lb, PowLb; + + // This is only applicable to log(), log2(), log10(). + if (TLI->getLibFunc(LogNm, LogLb)) + switch (LogLb) { + case LibFunc_logf: + LogID = Intrinsic::log; + ExpLb = LibFunc_expf; + Exp2Lb = LibFunc_exp2f; + Exp10Lb = LibFunc_exp10f; + PowLb = LibFunc_powf; + break; + case LibFunc_log: + LogID = Intrinsic::log; + ExpLb = LibFunc_exp; + Exp2Lb = LibFunc_exp2; + Exp10Lb = LibFunc_exp10; + PowLb = LibFunc_pow; + break; + case LibFunc_logl: + LogID = Intrinsic::log; + ExpLb = LibFunc_expl; + Exp2Lb = LibFunc_exp2l; + Exp10Lb = LibFunc_exp10l; + PowLb = LibFunc_powl; + break; + case LibFunc_log2f: + LogID = Intrinsic::log2; + ExpLb = LibFunc_expf; + Exp2Lb = LibFunc_exp2f; + Exp10Lb = LibFunc_exp10f; + PowLb = LibFunc_powf; + break; + case LibFunc_log2: + LogID = Intrinsic::log2; + ExpLb = LibFunc_exp; + Exp2Lb = LibFunc_exp2; + Exp10Lb = LibFunc_exp10; + PowLb = LibFunc_pow; + break; + case LibFunc_log2l: + LogID = Intrinsic::log2; + ExpLb = LibFunc_expl; + Exp2Lb = LibFunc_exp2l; + Exp10Lb = LibFunc_exp10l; + PowLb = LibFunc_powl; + break; + case LibFunc_log10f: + LogID = Intrinsic::log10; + ExpLb = LibFunc_expf; + Exp2Lb = LibFunc_exp2f; + Exp10Lb = LibFunc_exp10f; + PowLb = LibFunc_powf; + break; + case LibFunc_log10: + LogID = Intrinsic::log10; + ExpLb = LibFunc_exp; + Exp2Lb = LibFunc_exp2; + Exp10Lb = LibFunc_exp10; + PowLb = LibFunc_pow; + break; + case LibFunc_log10l: + LogID = Intrinsic::log10; + ExpLb = LibFunc_expl; + Exp2Lb = LibFunc_exp2l; + Exp10Lb = LibFunc_exp10l; + PowLb = LibFunc_powl; + break; + default: + return Ret; + } + else if (LogID == Intrinsic::log || LogID == Intrinsic::log2 || + LogID == Intrinsic::log10) { + if (Ty->getScalarType()->isFloatTy()) { + ExpLb = LibFunc_expf; + Exp2Lb = LibFunc_exp2f; + Exp10Lb = LibFunc_exp10f; + PowLb = LibFunc_powf; + } else if (Ty->getScalarType()->isDoubleTy()) { + ExpLb = LibFunc_exp; + Exp2Lb = LibFunc_exp2; + Exp10Lb = LibFunc_exp10; + PowLb = LibFunc_pow; + } else + return Ret; + } else return Ret; IRBuilder<>::FastMathFlagGuard Guard(B); - FastMathFlags FMF; - FMF.setFast(); - B.setFastMathFlags(FMF); + B.setFastMathFlags(FastMathFlags::getFast()); + + Intrinsic::ID ArgID = Arg->getIntrinsicID(); + LibFunc ArgLb = NotLibFunc; + TLI->getLibFunc(Arg, ArgLb); + + // log(pow(x,y)) -> y*log(x) + if (ArgLb == PowLb || ArgID == Intrinsic::pow) { + Value *LogX = + Log->doesNotAccessMemory() + ? 
B.CreateCall(Intrinsic::getDeclaration(Mod, LogID, Ty), + Arg->getOperand(0), "log") + : emitUnaryFloatFnCall(Arg->getOperand(0), LogNm, B, Attrs); + Value *MulY = B.CreateFMul(Arg->getArgOperand(1), LogX, "mul"); + // Since pow() may have side effects, e.g. errno, + // dead code elimination may not be trusted to remove it. + substituteInParent(Arg, MulY); + return MulY; + } + + // log(exp{,2,10}(y)) -> y*log({e,2,10}) + // TODO: There is no exp10() intrinsic yet. + if (ArgLb == ExpLb || ArgLb == Exp2Lb || ArgLb == Exp10Lb || + ArgID == Intrinsic::exp || ArgID == Intrinsic::exp2) { + Constant *Eul; + if (ArgLb == ExpLb || ArgID == Intrinsic::exp) + // FIXME: Add more precise value of e for long double. + Eul = ConstantFP::get(Log->getType(), numbers::e); + else if (ArgLb == Exp2Lb || ArgID == Intrinsic::exp2) + Eul = ConstantFP::get(Log->getType(), 2.0); + else + Eul = ConstantFP::get(Log->getType(), 10.0); + Value *LogE = Log->doesNotAccessMemory() + ? B.CreateCall(Intrinsic::getDeclaration(Mod, LogID, Ty), + Eul, "log") + : emitUnaryFloatFnCall(Eul, LogNm, B, Attrs); + Value *MulY = B.CreateFMul(Arg->getArgOperand(0), LogE, "mul"); + // Since exp() may have side effects, e.g. errno, + // dead code elimination may not be trusted to remove it. + substituteInParent(Arg, MulY); + return MulY; + } - LibFunc Func; - Function *F = OpC->getCalledFunction(); - if (F && ((TLI->getLibFunc(F->getName(), Func) && TLI->has(Func) && - Func == LibFunc_pow) || F->getIntrinsicID() == Intrinsic::pow)) - return B.CreateFMul(OpC->getArgOperand(1), - emitUnaryFloatFnCall(OpC->getOperand(0), Callee->getName(), B, - Callee->getAttributes()), "mul"); - - // log(exp2(y)) -> y*log(2) - if (F && Name == "log" && TLI->getLibFunc(F->getName(), Func) && - TLI->has(Func) && Func == LibFunc_exp2) - return B.CreateFMul( - OpC->getArgOperand(0), - emitUnaryFloatFnCall(ConstantFP::get(CI->getType(), 2.0), - Callee->getName(), B, Callee->getAttributes()), - "logmul"); return Ret; } @@ -2137,6 +2408,7 @@ Value *LibCallSimplifier::optimizePrintF(CallInst *CI, IRBuilder<> &B) { return New; } + annotateNonNullBasedOnAccess(CI, 0); return nullptr; } @@ -2231,21 +2503,21 @@ Value *LibCallSimplifier::optimizeSPrintF(CallInst *CI, IRBuilder<> &B) { return New; } + annotateNonNullBasedOnAccess(CI, {0, 1}); return nullptr; } Value *LibCallSimplifier::optimizeSnPrintFString(CallInst *CI, IRBuilder<> &B) { - // Check for a fixed format string. - StringRef FormatStr; - if (!getConstantStringInfo(CI->getArgOperand(2), FormatStr)) - return nullptr; - // Check for size ConstantInt *Size = dyn_cast(CI->getArgOperand(1)); if (!Size) return nullptr; uint64_t N = Size->getZExtValue(); + // Check for a fixed format string. + StringRef FormatStr; + if (!getConstantStringInfo(CI->getArgOperand(2), FormatStr)) + return nullptr; // If we just have a format string (nothing else crazy) transform it. 
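// Illustrative example of the snprintf fold performed below, assuming the
// format string is a known literal with no '%' conversions and the size is
// a known constant large enough to hold it (see the checks that follow):

#include <cstring>

int formatKnown(char *buf /* assumed >= 6 bytes */) {
  // snprintf(buf, 32, "hello");
  std::memcpy(buf, "hello", 6); // copy the literal plus its nul terminator
  return 5;                     // snprintf's return value: strlen("hello")
}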
if (CI->getNumArgOperands() == 3) { @@ -2318,6 +2590,8 @@ Value *LibCallSimplifier::optimizeSnPrintF(CallInst *CI, IRBuilder<> &B) { return V; } + if (isKnownNonZero(CI->getOperand(1), DL)) + annotateNonNullBasedOnAccess(CI, 0); return nullptr; } @@ -2503,6 +2777,7 @@ Value *LibCallSimplifier::optimizeFRead(CallInst *CI, IRBuilder<> &B) { } Value *LibCallSimplifier::optimizePuts(CallInst *CI, IRBuilder<> &B) { + annotateNonNullBasedOnAccess(CI, 0); if (!CI->use_empty()) return nullptr; @@ -2515,6 +2790,12 @@ Value *LibCallSimplifier::optimizePuts(CallInst *CI, IRBuilder<> &B) { return nullptr; } +Value *LibCallSimplifier::optimizeBCopy(CallInst *CI, IRBuilder<> &B) { + // bcopy(src, dst, n) -> llvm.memmove(dst, src, n) + return B.CreateMemMove(CI->getArgOperand(1), 1, CI->getArgOperand(0), 1, + CI->getArgOperand(2)); +} + bool LibCallSimplifier::hasFloatVersion(StringRef FuncName) { LibFunc Func; SmallString<20> FloatFuncName = FuncName; @@ -2557,6 +2838,8 @@ Value *LibCallSimplifier::optimizeStringMemoryLibCall(CallInst *CI, return optimizeStrLen(CI, Builder); case LibFunc_strpbrk: return optimizeStrPBrk(CI, Builder); + case LibFunc_strndup: + return optimizeStrNDup(CI, Builder); case LibFunc_strtol: case LibFunc_strtod: case LibFunc_strtof: @@ -2573,12 +2856,16 @@ Value *LibCallSimplifier::optimizeStringMemoryLibCall(CallInst *CI, return optimizeStrStr(CI, Builder); case LibFunc_memchr: return optimizeMemChr(CI, Builder); + case LibFunc_memrchr: + return optimizeMemRChr(CI, Builder); case LibFunc_bcmp: return optimizeBCmp(CI, Builder); case LibFunc_memcmp: return optimizeMemCmp(CI, Builder); case LibFunc_memcpy: return optimizeMemCpy(CI, Builder); + case LibFunc_mempcpy: + return optimizeMemPCpy(CI, Builder); case LibFunc_memmove: return optimizeMemMove(CI, Builder); case LibFunc_memset: @@ -2587,6 +2874,8 @@ Value *LibCallSimplifier::optimizeStringMemoryLibCall(CallInst *CI, return optimizeRealloc(CI, Builder); case LibFunc_wcslen: return optimizeWcslen(CI, Builder); + case LibFunc_bcopy: + return optimizeBCopy(CI, Builder); default: break; } @@ -2626,11 +2915,21 @@ Value *LibCallSimplifier::optimizeFloatingPointLibCall(CallInst *CI, case LibFunc_sqrt: case LibFunc_sqrtl: return optimizeSqrt(CI, Builder); + case LibFunc_logf: case LibFunc_log: + case LibFunc_logl: + case LibFunc_log10f: case LibFunc_log10: + case LibFunc_log10l: + case LibFunc_log1pf: case LibFunc_log1p: + case LibFunc_log1pl: + case LibFunc_log2f: case LibFunc_log2: + case LibFunc_log2l: + case LibFunc_logbf: case LibFunc_logb: + case LibFunc_logbl: return optimizeLog(CI, Builder); case LibFunc_tan: case LibFunc_tanf: @@ -2721,10 +3020,18 @@ Value *LibCallSimplifier::optimizeCall(CallInst *CI) { case Intrinsic::exp2: return optimizeExp2(CI, Builder); case Intrinsic::log: + case Intrinsic::log2: + case Intrinsic::log10: return optimizeLog(CI, Builder); case Intrinsic::sqrt: return optimizeSqrt(CI, Builder); // TODO: Use foldMallocMemset() with memset intrinsic. + case Intrinsic::memset: + return optimizeMemSet(CI, Builder); + case Intrinsic::memcpy: + return optimizeMemCpy(CI, Builder); + case Intrinsic::memmove: + return optimizeMemMove(CI, Builder); default: return nullptr; } @@ -2740,8 +3047,7 @@ Value *LibCallSimplifier::optimizeCall(CallInst *CI) { IRBuilder<> TmpBuilder(SimplifiedCI); if (Value *V = optimizeStringMemoryLibCall(SimplifiedCI, TmpBuilder)) { // If we were able to further simplify, remove the now redundant call. 
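// Example of the mempcpy handling registered above (mempcpy is a GNU
// extension also provided by other libcs): it copies like memcpy but
// returns a pointer one past the last byte written, which is what the
// x + n GEP built in optimizeMemPCpy produces.

#include <cstring>

void *mempcpyEquivalent(void *dst, const void *src, std::size_t n) {
  std::memcpy(dst, src, n);
  return static_cast<char *>(dst) + n; // == mempcpy(dst, src, n)
}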
- SimplifiedCI->replaceAllUsesWith(V); - eraseFromParent(SimplifiedCI); + substituteInParent(SimplifiedCI, V); return V; } } @@ -2898,7 +3204,9 @@ FortifiedLibCallSimplifier::isFortifiedCallFoldable(CallInst *CI, uint64_t Len = GetStringLength(CI->getArgOperand(*StrOp)); // If the length is 0 we don't know how long it is and so we can't // remove the check. - if (Len == 0) + if (Len) + annotateDereferenceableBytes(CI, *StrOp, Len); + else return false; return ObjSizeCI->getZExtValue() >= Len; } @@ -2915,8 +3223,9 @@ FortifiedLibCallSimplifier::isFortifiedCallFoldable(CallInst *CI, Value *FortifiedLibCallSimplifier::optimizeMemCpyChk(CallInst *CI, IRBuilder<> &B) { if (isFortifiedCallFoldable(CI, 3, 2)) { - B.CreateMemCpy(CI->getArgOperand(0), 1, CI->getArgOperand(1), 1, - CI->getArgOperand(2)); + CallInst *NewCI = B.CreateMemCpy( + CI->getArgOperand(0), 1, CI->getArgOperand(1), 1, CI->getArgOperand(2)); + NewCI->setAttributes(CI->getAttributes()); return CI->getArgOperand(0); } return nullptr; @@ -2925,8 +3234,9 @@ Value *FortifiedLibCallSimplifier::optimizeMemCpyChk(CallInst *CI, Value *FortifiedLibCallSimplifier::optimizeMemMoveChk(CallInst *CI, IRBuilder<> &B) { if (isFortifiedCallFoldable(CI, 3, 2)) { - B.CreateMemMove(CI->getArgOperand(0), 1, CI->getArgOperand(1), 1, - CI->getArgOperand(2)); + CallInst *NewCI = B.CreateMemMove( + CI->getArgOperand(0), 1, CI->getArgOperand(1), 1, CI->getArgOperand(2)); + NewCI->setAttributes(CI->getAttributes()); return CI->getArgOperand(0); } return nullptr; @@ -2938,7 +3248,9 @@ Value *FortifiedLibCallSimplifier::optimizeMemSetChk(CallInst *CI, if (isFortifiedCallFoldable(CI, 3, 2)) { Value *Val = B.CreateIntCast(CI->getArgOperand(1), B.getInt8Ty(), false); - B.CreateMemSet(CI->getArgOperand(0), Val, CI->getArgOperand(2), 1); + CallInst *NewCI = + B.CreateMemSet(CI->getArgOperand(0), Val, CI->getArgOperand(2), 1); + NewCI->setAttributes(CI->getAttributes()); return CI->getArgOperand(0); } return nullptr; @@ -2974,7 +3286,9 @@ Value *FortifiedLibCallSimplifier::optimizeStrpCpyChk(CallInst *CI, // Maybe we can stil fold __st[rp]cpy_chk to __memcpy_chk. 
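// Sketch of the fortified-call fold above: when the recorded object size is
// a known constant no smaller than the copy length, the runtime check can
// never fail and __memcpy_chk degenerates to a plain memcpy (the new code
// also copies the call-site attributes onto the generated intrinsic call).

#include <cstring>

void foldedChk(void *dst /* assumed >= 64 bytes */, const void *src) {
  // __memcpy_chk(dst, src, 16, 64);  // object size 64 >= length 16
  std::memcpy(dst, src, 16);          // equivalent, check elided
}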
uint64_t Len = GetStringLength(Src); - if (Len == 0) + if (Len) + annotateDereferenceableBytes(CI, 1, Len); + else return nullptr; Type *SizeTTy = DL.getIntPtrType(CI->getContext()); diff --git a/lib/Transforms/Utils/SymbolRewriter.cpp b/lib/Transforms/Utils/SymbolRewriter.cpp index 456724779b43..5d380dcf231c 100644 --- a/lib/Transforms/Utils/SymbolRewriter.cpp +++ b/lib/Transforms/Utils/SymbolRewriter.cpp @@ -380,11 +380,11 @@ parseRewriteFunctionDescriptor(yaml::Stream &YS, yaml::ScalarNode *K, // TODO see if there is a more elegant solution to selecting the rewrite // descriptor type if (!Target.empty()) - DL->push_back(llvm::make_unique( + DL->push_back(std::make_unique( Source, Target, Naked)); else DL->push_back( - llvm::make_unique(Source, Transform)); + std::make_unique(Source, Transform)); return true; } @@ -442,11 +442,11 @@ parseRewriteGlobalVariableDescriptor(yaml::Stream &YS, yaml::ScalarNode *K, } if (!Target.empty()) - DL->push_back(llvm::make_unique( + DL->push_back(std::make_unique( Source, Target, /*Naked*/ false)); else - DL->push_back(llvm::make_unique( + DL->push_back(std::make_unique( Source, Transform)); return true; @@ -505,11 +505,11 @@ parseRewriteGlobalAliasDescriptor(yaml::Stream &YS, yaml::ScalarNode *K, } if (!Target.empty()) - DL->push_back(llvm::make_unique( + DL->push_back(std::make_unique( Source, Target, /*Naked*/ false)); else - DL->push_back(llvm::make_unique( + DL->push_back(std::make_unique( Source, Transform)); return true; diff --git a/lib/Transforms/Utils/VNCoercion.cpp b/lib/Transforms/Utils/VNCoercion.cpp index a77bf50fe10b..591e1fd2dbee 100644 --- a/lib/Transforms/Utils/VNCoercion.cpp +++ b/lib/Transforms/Utils/VNCoercion.cpp @@ -431,7 +431,7 @@ Value *getLoadValueForLoad(LoadInst *SrcVal, unsigned Offset, Type *LoadTy, PtrVal = Builder.CreateBitCast(PtrVal, DestPTy); LoadInst *NewLoad = Builder.CreateLoad(DestTy, PtrVal); NewLoad->takeName(SrcVal); - NewLoad->setAlignment(SrcVal->getAlignment()); + NewLoad->setAlignment(MaybeAlign(SrcVal->getAlignment())); LLVM_DEBUG(dbgs() << "GVN WIDENED LOAD: " << *SrcVal << "\n"); LLVM_DEBUG(dbgs() << "TO: " << *NewLoad << "\n"); diff --git a/lib/Transforms/Utils/ValueMapper.cpp b/lib/Transforms/Utils/ValueMapper.cpp index fbc3407c301f..da68d3713b40 100644 --- a/lib/Transforms/Utils/ValueMapper.cpp +++ b/lib/Transforms/Utils/ValueMapper.cpp @@ -27,8 +27,8 @@ #include "llvm/IR/DebugInfoMetadata.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" -#include "llvm/IR/GlobalAlias.h" #include "llvm/IR/GlobalObject.h" +#include "llvm/IR/GlobalIndirectSymbol.h" #include "llvm/IR/GlobalVariable.h" #include "llvm/IR/InlineAsm.h" #include "llvm/IR/Instruction.h" @@ -66,7 +66,7 @@ struct WorklistEntry { enum EntryKind { MapGlobalInit, MapAppendingVar, - MapGlobalAliasee, + MapGlobalIndirectSymbol, RemapFunction }; struct GVInitTy { @@ -77,9 +77,9 @@ struct WorklistEntry { GlobalVariable *GV; Constant *InitPrefix; }; - struct GlobalAliaseeTy { - GlobalAlias *GA; - Constant *Aliasee; + struct GlobalIndirectSymbolTy { + GlobalIndirectSymbol *GIS; + Constant *Target; }; unsigned Kind : 2; @@ -89,7 +89,7 @@ struct WorklistEntry { union { GVInitTy GVInit; AppendingGVTy AppendingGV; - GlobalAliaseeTy GlobalAliasee; + GlobalIndirectSymbolTy GlobalIndirectSymbol; Function *RemapF; } Data; }; @@ -161,8 +161,8 @@ public: bool IsOldCtorDtor, ArrayRef NewMembers, unsigned MCID); - void scheduleMapGlobalAliasee(GlobalAlias &GA, Constant &Aliasee, - unsigned MCID); + void 
scheduleMapGlobalIndirectSymbol(GlobalIndirectSymbol &GIS, Constant &Target, + unsigned MCID); void scheduleRemapFunction(Function &F, unsigned MCID); void flush(); @@ -172,7 +172,7 @@ private: void mapAppendingVariable(GlobalVariable &GV, Constant *InitPrefix, bool IsOldCtorDtor, ArrayRef NewMembers); - void mapGlobalAliasee(GlobalAlias &GA, Constant &Aliasee); + void mapGlobalIndirectSymbol(GlobalIndirectSymbol &GIS, Constant &Target); void remapFunction(Function &F, ValueToValueMapTy &VM); ValueToValueMapTy &getVM() { return *MCs[CurrentMCID].VM; } @@ -774,20 +774,6 @@ Metadata *MDNodeMapper::mapTopLevelUniquedNode(const MDNode &FirstN) { return *getMappedOp(&FirstN); } -namespace { - -struct MapMetadataDisabler { - ValueToValueMapTy &VM; - - MapMetadataDisabler(ValueToValueMapTy &VM) : VM(VM) { - VM.disableMapMetadata(); - } - - ~MapMetadataDisabler() { VM.enableMapMetadata(); } -}; - -} // end anonymous namespace - Optional Mapper::mapSimpleMetadata(const Metadata *MD) { // If the value already exists in the map, use it. if (Optional NewMD = getVM().getMappedMD(MD)) @@ -802,9 +788,6 @@ Optional Mapper::mapSimpleMetadata(const Metadata *MD) { return const_cast(MD); if (auto *CMD = dyn_cast(MD)) { - // Disallow recursion into metadata mapping through mapValue. - MapMetadataDisabler MMD(getVM()); - // Don't memoize ConstantAsMetadata. Instead of lasting until the // LLVMContext is destroyed, they can be deleted when the GlobalValue they // reference is destructed. These aren't super common, so the extra @@ -846,9 +829,9 @@ void Mapper::flush() { AppendingInits.resize(PrefixSize); break; } - case WorklistEntry::MapGlobalAliasee: - E.Data.GlobalAliasee.GA->setAliasee( - mapConstant(E.Data.GlobalAliasee.Aliasee)); + case WorklistEntry::MapGlobalIndirectSymbol: + E.Data.GlobalIndirectSymbol.GIS->setIndirectSymbol( + mapConstant(E.Data.GlobalIndirectSymbol.Target)); break; case WorklistEntry::RemapFunction: remapFunction(*E.Data.RemapF); @@ -1041,16 +1024,16 @@ void Mapper::scheduleMapAppendingVariable(GlobalVariable &GV, AppendingInits.append(NewMembers.begin(), NewMembers.end()); } -void Mapper::scheduleMapGlobalAliasee(GlobalAlias &GA, Constant &Aliasee, - unsigned MCID) { - assert(AlreadyScheduled.insert(&GA).second && "Should not reschedule"); +void Mapper::scheduleMapGlobalIndirectSymbol(GlobalIndirectSymbol &GIS, + Constant &Target, unsigned MCID) { + assert(AlreadyScheduled.insert(&GIS).second && "Should not reschedule"); assert(MCID < MCs.size() && "Invalid mapping context"); WorklistEntry WE; - WE.Kind = WorklistEntry::MapGlobalAliasee; + WE.Kind = WorklistEntry::MapGlobalIndirectSymbol; WE.MCID = MCID; - WE.Data.GlobalAliasee.GA = &GA; - WE.Data.GlobalAliasee.Aliasee = &Aliasee; + WE.Data.GlobalIndirectSymbol.GIS = &GIS; + WE.Data.GlobalIndirectSymbol.Target = &Target; Worklist.push_back(WE); } @@ -1147,9 +1130,10 @@ void ValueMapper::scheduleMapAppendingVariable(GlobalVariable &GV, GV, InitPrefix, IsOldCtorDtor, NewMembers, MCID); } -void ValueMapper::scheduleMapGlobalAliasee(GlobalAlias &GA, Constant &Aliasee, - unsigned MCID) { - getAsMapper(pImpl)->scheduleMapGlobalAliasee(GA, Aliasee, MCID); +void ValueMapper::scheduleMapGlobalIndirectSymbol(GlobalIndirectSymbol &GIS, + Constant &Target, + unsigned MCID) { + getAsMapper(pImpl)->scheduleMapGlobalIndirectSymbol(GIS, Target, MCID); } void ValueMapper::scheduleRemapFunction(Function &F, unsigned MCID) { diff --git a/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp b/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp index 
4273080ddd91..f44976c723ec 100644 --- a/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp +++ b/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp @@ -147,7 +147,7 @@ private: static const unsigned MaxDepth = 3; bool isConsecutiveAccess(Value *A, Value *B); - bool areConsecutivePointers(Value *PtrA, Value *PtrB, const APInt &PtrDelta, + bool areConsecutivePointers(Value *PtrA, Value *PtrB, APInt PtrDelta, unsigned Depth = 0) const; bool lookThroughComplexAddresses(Value *PtrA, Value *PtrB, APInt PtrDelta, unsigned Depth) const; @@ -336,14 +336,29 @@ bool Vectorizer::isConsecutiveAccess(Value *A, Value *B) { } bool Vectorizer::areConsecutivePointers(Value *PtrA, Value *PtrB, - const APInt &PtrDelta, - unsigned Depth) const { + APInt PtrDelta, unsigned Depth) const { unsigned PtrBitWidth = DL.getPointerTypeSizeInBits(PtrA->getType()); APInt OffsetA(PtrBitWidth, 0); APInt OffsetB(PtrBitWidth, 0); PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA); PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetB); + unsigned NewPtrBitWidth = DL.getTypeStoreSizeInBits(PtrA->getType()); + + if (NewPtrBitWidth != DL.getTypeStoreSizeInBits(PtrB->getType())) + return false; + + // In case if we have to shrink the pointer + // stripAndAccumulateInBoundsConstantOffsets should properly handle a + // possible overflow and the value should fit into a smallest data type + // used in the cast/gep chain. + assert(OffsetA.getMinSignedBits() <= NewPtrBitWidth && + OffsetB.getMinSignedBits() <= NewPtrBitWidth); + + OffsetA = OffsetA.sextOrTrunc(NewPtrBitWidth); + OffsetB = OffsetB.sextOrTrunc(NewPtrBitWidth); + PtrDelta = PtrDelta.sextOrTrunc(NewPtrBitWidth); + APInt OffsetDelta = OffsetB - OffsetA; // Check if they are based on the same pointer. That makes the offsets @@ -650,7 +665,7 @@ Vectorizer::getVectorizablePrefix(ArrayRef Chain) { // We can ignore the alias if the we have a load store pair and the load // is known to be invariant. The load cannot be clobbered by the store. auto IsInvariantLoad = [](const LoadInst *LI) -> bool { - return LI->getMetadata(LLVMContext::MD_invariant_load); + return LI->hasMetadata(LLVMContext::MD_invariant_load); }; // We can ignore the alias as long as the load comes before the store, @@ -1077,7 +1092,7 @@ bool Vectorizer::vectorizeLoadChain( LoadInst *L0 = cast(Chain[0]); // If the vector has an int element, default to int for the whole load. - Type *LoadTy; + Type *LoadTy = nullptr; for (const auto &V : Chain) { LoadTy = cast(V)->getType(); if (LoadTy->isIntOrIntVectorTy()) @@ -1089,6 +1104,7 @@ bool Vectorizer::vectorizeLoadChain( break; } } + assert(LoadTy && "Can't determine LoadInst type from chain"); unsigned Sz = DL.getTypeSizeInBits(LoadTy); unsigned AS = L0->getPointerAddressSpace(); diff --git a/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp index 6ef8dc2d3cd7..f43842be5357 100644 --- a/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp +++ b/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp @@ -13,7 +13,10 @@ // pass. It should be easy to create an analysis pass around it if there // is a need (but D45420 needs to happen first). 
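// A minimal standalone sketch of the consecutiveness test the vectorizer
// changes above are concerned with (simplified model, not the LLVM routine):
// after constant GEP offsets are stripped and normalized to the same bit
// width, two accesses are consecutive when they share a base pointer and
// their offsets differ by exactly the access size.

#include <cstdint>

struct StrippedPointer {
  const void *Base;
  int64_t Offset; // accumulated constant offset, in bytes
};

inline bool areConsecutive(const StrippedPointer &A, const StrippedPointer &B,
                           int64_t AccessSizeInBytes) {
  return A.Base == B.Base && B.Offset - A.Offset == AccessSizeInBytes;
}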
// +#include "llvm/Transforms/Vectorize/LoopVectorize.h" #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h" +#include "llvm/Analysis/Loads.h" +#include "llvm/Analysis/ValueTracking.h" #include "llvm/Analysis/VectorUtils.h" #include "llvm/IR/IntrinsicInst.h" @@ -47,38 +50,6 @@ static const unsigned MaxInterleaveFactor = 16; namespace llvm { -#ifndef NDEBUG -static void debugVectorizationFailure(const StringRef DebugMsg, - Instruction *I) { - dbgs() << "LV: Not vectorizing: " << DebugMsg; - if (I != nullptr) - dbgs() << " " << *I; - else - dbgs() << '.'; - dbgs() << '\n'; -} -#endif - -OptimizationRemarkAnalysis createLVMissedAnalysis(const char *PassName, - StringRef RemarkName, - Loop *TheLoop, - Instruction *I) { - Value *CodeRegion = TheLoop->getHeader(); - DebugLoc DL = TheLoop->getStartLoc(); - - if (I) { - CodeRegion = I->getParent(); - // If there is no debug location attached to the instruction, revert back to - // using the loop's. - if (I->getDebugLoc()) - DL = I->getDebugLoc(); - } - - OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion); - R << "loop not vectorized: "; - return R; -} - bool LoopVectorizeHints::Hint::validate(unsigned Val) { switch (Kind) { case HK_WIDTH: @@ -88,6 +59,7 @@ bool LoopVectorizeHints::Hint::validate(unsigned Val) { case HK_FORCE: return (Val <= 1); case HK_ISVECTORIZED: + case HK_PREDICATE: return (Val == 0 || Val == 1); } return false; @@ -99,7 +71,9 @@ LoopVectorizeHints::LoopVectorizeHints(const Loop *L, : Width("vectorize.width", VectorizerParams::VectorizationFactor, HK_WIDTH), Interleave("interleave.count", InterleaveOnlyWhenForced, HK_UNROLL), Force("vectorize.enable", FK_Undefined, HK_FORCE), - IsVectorized("isvectorized", 0, HK_ISVECTORIZED), TheLoop(L), ORE(ORE) { + IsVectorized("isvectorized", 0, HK_ISVECTORIZED), + Predicate("vectorize.predicate.enable", 0, HK_PREDICATE), TheLoop(L), + ORE(ORE) { // Populate values with existing loop metadata. getHintsFromMetadata(); @@ -250,7 +224,7 @@ void LoopVectorizeHints::setHint(StringRef Name, Metadata *Arg) { return; unsigned Val = C->getZExtValue(); - Hint *Hints[] = {&Width, &Interleave, &Force, &IsVectorized}; + Hint *Hints[] = {&Width, &Interleave, &Force, &IsVectorized, &Predicate}; for (auto H : Hints) { if (Name == H->Name) { if (H->validate(Val)) @@ -435,7 +409,8 @@ int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) { const ValueToValueMap &Strides = getSymbolicStrides() ? 
*getSymbolicStrides() : ValueToValueMap(); - int Stride = getPtrStride(PSE, Ptr, TheLoop, Strides, true, false); + bool CanAddPredicate = !TheLoop->getHeader()->getParent()->hasOptSize(); + int Stride = getPtrStride(PSE, Ptr, TheLoop, Strides, CanAddPredicate, false); if (Stride == 1 || Stride == -1) return Stride; return 0; @@ -445,14 +420,6 @@ bool LoopVectorizationLegality::isUniform(Value *V) { return LAI->isUniform(V); } -void LoopVectorizationLegality::reportVectorizationFailure( - const StringRef DebugMsg, const StringRef OREMsg, - const StringRef ORETag, Instruction *I) const { - LLVM_DEBUG(debugVectorizationFailure(DebugMsg, I)); - ORE->emit(createLVMissedAnalysis(Hints->vectorizeAnalysisPassName(), - ORETag, TheLoop, I) << OREMsg); -} - bool LoopVectorizationLegality::canVectorizeOuterLoop() { assert(!TheLoop->empty() && "We are not vectorizing an outer loop."); // Store the result and return it at the end instead of exiting early, in case @@ -467,7 +434,7 @@ bool LoopVectorizationLegality::canVectorizeOuterLoop() { if (!Br) { reportVectorizationFailure("Unsupported basic block terminator", "loop control flow is not understood by vectorizer", - "CFGNotUnderstood"); + "CFGNotUnderstood", ORE, TheLoop); if (DoExtraAnalysis) Result = false; else @@ -486,7 +453,7 @@ bool LoopVectorizationLegality::canVectorizeOuterLoop() { !LI->isLoopHeader(Br->getSuccessor(1))) { reportVectorizationFailure("Unsupported conditional branch", "loop control flow is not understood by vectorizer", - "CFGNotUnderstood"); + "CFGNotUnderstood", ORE, TheLoop); if (DoExtraAnalysis) Result = false; else @@ -500,7 +467,7 @@ bool LoopVectorizationLegality::canVectorizeOuterLoop() { TheLoop /*context outer loop*/)) { reportVectorizationFailure("Outer loop contains divergent loops", "loop control flow is not understood by vectorizer", - "CFGNotUnderstood"); + "CFGNotUnderstood", ORE, TheLoop); if (DoExtraAnalysis) Result = false; else @@ -511,7 +478,7 @@ bool LoopVectorizationLegality::canVectorizeOuterLoop() { if (!setupOuterLoopInductions()) { reportVectorizationFailure("Unsupported outer loop Phi(s)", "Unsupported outer loop Phi(s)", - "UnsupportedPhi"); + "UnsupportedPhi", ORE, TheLoop); if (DoExtraAnalysis) Result = false; else @@ -618,7 +585,7 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { !PhiTy->isPointerTy()) { reportVectorizationFailure("Found a non-int non-pointer PHI", "loop control flow is not understood by vectorizer", - "CFGNotUnderstood"); + "CFGNotUnderstood", ORE, TheLoop); return false; } @@ -631,6 +598,7 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { // Unsafe cyclic dependencies with header phis are identified during // legalization for reduction, induction and first order // recurrences. 
+ AllowedExit.insert(&I); continue; } @@ -638,7 +606,7 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { if (Phi->getNumIncomingValues() != 2) { reportVectorizationFailure("Found an invalid PHI", "loop control flow is not understood by vectorizer", - "CFGNotUnderstood", Phi); + "CFGNotUnderstood", ORE, TheLoop, Phi); return false; } @@ -690,7 +658,7 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { reportVectorizationFailure("Found an unidentified PHI", "value that could not be identified as " "reduction is used outside the loop", - "NonReductionValueUsedOutsideLoop", Phi); + "NonReductionValueUsedOutsideLoop", ORE, TheLoop, Phi); return false; } // end of PHI handling @@ -721,11 +689,11 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { "library call cannot be vectorized. " "Try compiling with -fno-math-errno, -ffast-math, " "or similar flags", - "CantVectorizeLibcall", CI); + "CantVectorizeLibcall", ORE, TheLoop, CI); } else { reportVectorizationFailure("Found a non-intrinsic callsite", "call instruction cannot be vectorized", - "CantVectorizeLibcall", CI); + "CantVectorizeLibcall", ORE, TheLoop, CI); } return false; } @@ -740,7 +708,7 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { if (!SE->isLoopInvariant(PSE.getSCEV(CI->getOperand(i)), TheLoop)) { reportVectorizationFailure("Found unvectorizable intrinsic", "intrinsic instruction cannot be vectorized", - "CantVectorizeIntrinsic", CI); + "CantVectorizeIntrinsic", ORE, TheLoop, CI); return false; } } @@ -753,7 +721,7 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { isa(I)) { reportVectorizationFailure("Found unvectorizable type", "instruction return type cannot be vectorized", - "CantVectorizeInstructionReturnType", &I); + "CantVectorizeInstructionReturnType", ORE, TheLoop, &I); return false; } @@ -763,7 +731,7 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { if (!VectorType::isValidElementType(T)) { reportVectorizationFailure("Store instruction cannot be vectorized", "store instruction cannot be vectorized", - "CantVectorizeStore", ST); + "CantVectorizeStore", ORE, TheLoop, ST); return false; } @@ -773,12 +741,13 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { // Arbitrarily try a vector of 2 elements. Type *VecTy = VectorType::get(T, /*NumElements=*/2); assert(VecTy && "did not find vectorized version of stored type"); - unsigned Alignment = getLoadStoreAlignment(ST); - if (!TTI->isLegalNTStore(VecTy, Alignment)) { + const MaybeAlign Alignment = getLoadStoreAlignment(ST); + assert(Alignment && "Alignment should be set"); + if (!TTI->isLegalNTStore(VecTy, *Alignment)) { reportVectorizationFailure( "nontemporal store instruction cannot be vectorized", "nontemporal store instruction cannot be vectorized", - "CantVectorizeNontemporalStore", ST); + "CantVectorizeNontemporalStore", ORE, TheLoop, ST); return false; } } @@ -789,12 +758,13 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { // supported on the target (arbitrarily try a vector of 2 elements). 
Type *VecTy = VectorType::get(I.getType(), /*NumElements=*/2); assert(VecTy && "did not find vectorized version of load type"); - unsigned Alignment = getLoadStoreAlignment(LD); - if (!TTI->isLegalNTLoad(VecTy, Alignment)) { + const MaybeAlign Alignment = getLoadStoreAlignment(LD); + assert(Alignment && "Alignment should be set"); + if (!TTI->isLegalNTLoad(VecTy, *Alignment)) { reportVectorizationFailure( "nontemporal load instruction cannot be vectorized", "nontemporal load instruction cannot be vectorized", - "CantVectorizeNontemporalLoad", LD); + "CantVectorizeNontemporalLoad", ORE, TheLoop, LD); return false; } } @@ -823,7 +793,7 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { } reportVectorizationFailure("Value cannot be used outside the loop", "value cannot be used outside the loop", - "ValueUsedOutsideLoop", &I); + "ValueUsedOutsideLoop", ORE, TheLoop, &I); return false; } } // next instr. @@ -833,12 +803,12 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { if (Inductions.empty()) { reportVectorizationFailure("Did not find one integer induction var", "loop induction variable could not be identified", - "NoInductionVariable"); + "NoInductionVariable", ORE, TheLoop); return false; } else if (!WidestIndTy) { reportVectorizationFailure("Did not find one integer induction var", "integer loop induction variable could not be identified", - "NoIntegerInductionVariable"); + "NoIntegerInductionVariable", ORE, TheLoop); return false; } else { LLVM_DEBUG(dbgs() << "LV: Did not find one integer induction var.\n"); @@ -869,7 +839,7 @@ bool LoopVectorizationLegality::canVectorizeMemory() { if (LAI->hasDependenceInvolvingLoopInvariantAddress()) { reportVectorizationFailure("Stores to a uniform address", "write to a loop invariant address could not be vectorized", - "CantVectorizeStoreToLoopInvariantAddress"); + "CantVectorizeStoreToLoopInvariantAddress", ORE, TheLoop); return false; } Requirements->addRuntimePointerChecks(LAI->getNumRuntimePointerChecks()); @@ -905,7 +875,7 @@ bool LoopVectorizationLegality::blockNeedsPredication(BasicBlock *BB) { } bool LoopVectorizationLegality::blockCanBePredicated( - BasicBlock *BB, SmallPtrSetImpl &SafePtrs) { + BasicBlock *BB, SmallPtrSetImpl &SafePtrs, bool PreserveGuards) { const bool IsAnnotatedParallel = TheLoop->isAnnotatedParallel(); for (Instruction &I : *BB) { @@ -924,7 +894,7 @@ bool LoopVectorizationLegality::blockCanBePredicated( // !llvm.mem.parallel_loop_access implies if-conversion safety. // Otherwise, record that the load needs (real or emulated) masking // and let the cost model decide. - if (!IsAnnotatedParallel) + if (!IsAnnotatedParallel || PreserveGuards) MaskedOp.insert(LI); continue; } @@ -953,23 +923,41 @@ bool LoopVectorizationLegality::canVectorizeWithIfConvert() { if (!EnableIfConversion) { reportVectorizationFailure("If-conversion is disabled", "if-conversion is disabled", - "IfConversionDisabled"); + "IfConversionDisabled", + ORE, TheLoop); return false; } assert(TheLoop->getNumBlocks() > 1 && "Single block loops are vectorizable"); - // A list of pointers that we can safely read and write to. + // A list of pointers which are known to be dereferenceable within scope of + // the loop body for each iteration of the loop which executes. That is, + // the memory pointed to can be dereferenced (with the access size implied by + // the value's type) unconditionally within the loop header without + // introducing a new fault. SmallPtrSet SafePointes; // Collect safe addresses. 
for (BasicBlock *BB : TheLoop->blocks()) { - if (blockNeedsPredication(BB)) + if (!blockNeedsPredication(BB)) { + for (Instruction &I : *BB) + if (auto *Ptr = getLoadStorePointerOperand(&I)) + SafePointes.insert(Ptr); continue; + } - for (Instruction &I : *BB) - if (auto *Ptr = getLoadStorePointerOperand(&I)) - SafePointes.insert(Ptr); + // For a block which requires predication, a address may be safe to access + // in the loop w/o predication if we can prove dereferenceability facts + // sufficient to ensure it'll never fault within the loop. For the moment, + // we restrict this to loads; stores are more complicated due to + // concurrency restrictions. + ScalarEvolution &SE = *PSE.getSE(); + for (Instruction &I : *BB) { + LoadInst *LI = dyn_cast(&I); + if (LI && !mustSuppressSpeculation(*LI) && + isDereferenceableAndAlignedInLoop(LI, TheLoop, SE, *DT)) + SafePointes.insert(LI->getPointerOperand()); + } } // Collect the blocks that need predication. @@ -979,7 +967,8 @@ bool LoopVectorizationLegality::canVectorizeWithIfConvert() { if (!isa(BB->getTerminator())) { reportVectorizationFailure("Loop contains a switch statement", "loop contains a switch statement", - "LoopContainsSwitch", BB->getTerminator()); + "LoopContainsSwitch", ORE, TheLoop, + BB->getTerminator()); return false; } @@ -989,14 +978,16 @@ bool LoopVectorizationLegality::canVectorizeWithIfConvert() { reportVectorizationFailure( "Control flow cannot be substituted for a select", "control flow cannot be substituted for a select", - "NoCFGForSelect", BB->getTerminator()); + "NoCFGForSelect", ORE, TheLoop, + BB->getTerminator()); return false; } } else if (BB != Header && !canIfConvertPHINodes(BB)) { reportVectorizationFailure( "Control flow cannot be substituted for a select", "control flow cannot be substituted for a select", - "NoCFGForSelect", BB->getTerminator()); + "NoCFGForSelect", ORE, TheLoop, + BB->getTerminator()); return false; } } @@ -1026,7 +1017,7 @@ bool LoopVectorizationLegality::canVectorizeLoopCFG(Loop *Lp, if (!Lp->getLoopPreheader()) { reportVectorizationFailure("Loop doesn't have a legal pre-header", "loop control flow is not understood by vectorizer", - "CFGNotUnderstood"); + "CFGNotUnderstood", ORE, TheLoop); if (DoExtraAnalysis) Result = false; else @@ -1037,7 +1028,7 @@ bool LoopVectorizationLegality::canVectorizeLoopCFG(Loop *Lp, if (Lp->getNumBackEdges() != 1) { reportVectorizationFailure("The loop must have a single backedge", "loop control flow is not understood by vectorizer", - "CFGNotUnderstood"); + "CFGNotUnderstood", ORE, TheLoop); if (DoExtraAnalysis) Result = false; else @@ -1048,7 +1039,7 @@ bool LoopVectorizationLegality::canVectorizeLoopCFG(Loop *Lp, if (!Lp->getExitingBlock()) { reportVectorizationFailure("The loop must have an exiting block", "loop control flow is not understood by vectorizer", - "CFGNotUnderstood"); + "CFGNotUnderstood", ORE, TheLoop); if (DoExtraAnalysis) Result = false; else @@ -1061,7 +1052,7 @@ bool LoopVectorizationLegality::canVectorizeLoopCFG(Loop *Lp, if (Lp->getExitingBlock() != Lp->getLoopLatch()) { reportVectorizationFailure("The exiting block is not the loop latch", "loop control flow is not understood by vectorizer", - "CFGNotUnderstood"); + "CFGNotUnderstood", ORE, TheLoop); if (DoExtraAnalysis) Result = false; else @@ -1124,7 +1115,8 @@ bool LoopVectorizationLegality::canVectorize(bool UseVPlanNativePath) { if (!canVectorizeOuterLoop()) { reportVectorizationFailure("Unsupported outer loop", "unsupported outer loop", - "UnsupportedOuterLoop"); + 
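Restated as a minimal standalone sketch of the safe-pointer collection added above, with plain structs and booleans standing in for blockNeedsPredication(), mustSuppressSpeculation() and isDereferenceableAndAlignedInLoop(): unconditionally executed blocks contribute every load/store pointer, while predicated blocks only contribute loads that are provably dereferenceable on every iteration.

#include <set>
#include <string>
#include <vector>

// Stand-in model of the collection loop above; the flags are assumptions
// replacing the LLVM analyses named in the lead-in.
struct MemAccess {
  std::string Ptr;
  bool IsLoad;
  bool SpeculationSuppressed;   // e.g. access that must not be speculated
  bool DereferenceableInLoop;   // proven safe for every loop iteration
};

struct Block {
  bool NeedsPredication;
  std::vector<MemAccess> Accesses;
};

std::set<std::string> collectSafePointers(const std::vector<Block> &Loop) {
  std::set<std::string> Safe;
  for (const Block &BB : Loop) {
    if (!BB.NeedsPredication) {
      // Executed on every iteration: all accessed pointers are safe.
      for (const MemAccess &A : BB.Accesses)
        Safe.insert(A.Ptr);
      continue;
    }
    // Predicated block: only loads with proven dereferenceability are safe;
    // stores are excluded because of concurrency constraints.
    for (const MemAccess &A : BB.Accesses)
      if (A.IsLoad && !A.SpeculationSuppressed && A.DereferenceableInLoop)
        Safe.insert(A.Ptr);
  }
  return Safe;
}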
"UnsupportedOuterLoop", + ORE, TheLoop); // TODO: Implement DoExtraAnalysis when subsequent legal checks support // outer loops. return false; @@ -1176,7 +1168,7 @@ bool LoopVectorizationLegality::canVectorize(bool UseVPlanNativePath) { if (PSE.getUnionPredicate().getComplexity() > SCEVThreshold) { reportVectorizationFailure("Too many SCEV checks needed", "Too many SCEV assumptions need to be made and checked at runtime", - "TooManySCEVRunTimeChecks"); + "TooManySCEVRunTimeChecks", ORE, TheLoop); if (DoExtraAnalysis) Result = false; else @@ -1190,7 +1182,7 @@ bool LoopVectorizationLegality::canVectorize(bool UseVPlanNativePath) { return Result; } -bool LoopVectorizationLegality::canFoldTailByMasking() { +bool LoopVectorizationLegality::prepareToFoldTailByMasking() { LLVM_DEBUG(dbgs() << "LV: checking if tail can be folded by masking.\n"); @@ -1199,22 +1191,21 @@ bool LoopVectorizationLegality::canFoldTailByMasking() { "No primary induction, cannot fold tail by masking", "Missing a primary induction variable in the loop, which is " "needed in order to fold tail by masking as required.", - "NoPrimaryInduction"); + "NoPrimaryInduction", ORE, TheLoop); return false; } - // TODO: handle reductions when tail is folded by masking. - if (!Reductions.empty()) { - reportVectorizationFailure( - "Loop has reductions, cannot fold tail by masking", - "Cannot fold tail by masking in the presence of reductions.", - "ReductionFoldingTailByMasking"); - return false; - } + SmallPtrSet ReductionLiveOuts; - // TODO: handle outside users when tail is folded by masking. + for (auto &Reduction : *getReductionVars()) + ReductionLiveOuts.insert(Reduction.second.getLoopExitInstr()); + + // TODO: handle non-reduction outside users when tail is folded by masking. for (auto *AE : AllowedExit) { - // Check that all users of allowed exit values are inside the loop. + // Check that all users of allowed exit values are inside the loop or + // are the live-out of a reduction. + if (ReductionLiveOuts.count(AE)) + continue; for (User *U : AE->users()) { Instruction *UI = cast(U); if (TheLoop->contains(UI)) @@ -1222,7 +1213,7 @@ bool LoopVectorizationLegality::canFoldTailByMasking() { reportVectorizationFailure( "Cannot fold tail by masking, loop has an outside user for", "Cannot fold tail by masking in the presence of live outs.", - "LiveOutFoldingTailByMasking", UI); + "LiveOutFoldingTailByMasking", ORE, TheLoop, UI); return false; } } @@ -1233,11 +1224,12 @@ bool LoopVectorizationLegality::canFoldTailByMasking() { // Check and mark all blocks for predication, including those that ordinarily // do not need predication such as the header block. for (BasicBlock *BB : TheLoop->blocks()) { - if (!blockCanBePredicated(BB, SafePointers)) { + if (!blockCanBePredicated(BB, SafePointers, /* MaskAllLoads= */ true)) { reportVectorizationFailure( "Cannot fold tail by masking as required", "control flow cannot be substituted for a select", - "NoCFGForSelect", BB->getTerminator()); + "NoCFGForSelect", ORE, TheLoop, + BB->getTerminator()); return false; } } diff --git a/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/lib/Transforms/Vectorize/LoopVectorizationPlanner.h index 97077cce83e3..a5e85f27fabf 100644 --- a/lib/Transforms/Vectorize/LoopVectorizationPlanner.h +++ b/lib/Transforms/Vectorize/LoopVectorizationPlanner.h @@ -228,11 +228,11 @@ public: /// Plan how to best vectorize, return the best VF and its cost, or None if /// vectorization and interleaving should be avoided up front. 
- Optional plan(bool OptForSize, unsigned UserVF); + Optional plan(unsigned UserVF); /// Use the VPlan-native path to plan how to best vectorize, return the best /// VF and its cost. - VectorizationFactor planInVPlanNativePath(bool OptForSize, unsigned UserVF); + VectorizationFactor planInVPlanNativePath(unsigned UserVF); /// Finalize the best decision and dispose of all other VPlans. void setBestPlan(unsigned VF, unsigned UF); diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index 46265e3f3e13..8f0bf70f873c 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -177,6 +177,14 @@ static cl::opt TinyTripCountVectorThreshold( "value are vectorized only if no scalar iteration overheads " "are incurred.")); +// Indicates that an epilogue is undesired, predication is preferred. +// This means that the vectorizer will try to fold the loop-tail (epilogue) +// into the loop and predicate the loop body accordingly. +static cl::opt PreferPredicateOverEpilog( + "prefer-predicate-over-epilog", cl::init(false), cl::Hidden, + cl::desc("Indicate that an epilogue is undesired, predication should be " + "used instead.")); + static cl::opt MaximizeBandwidth( "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden, cl::desc("Maximize bandwidth when selecting vectorization factor which " @@ -347,6 +355,29 @@ static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) { : ConstantFP::get(Ty, C); } +/// Returns "best known" trip count for the specified loop \p L as defined by +/// the following procedure: +/// 1) Returns exact trip count if it is known. +/// 2) Returns expected trip count according to profile data if any. +/// 3) Returns upper bound estimate if it is known. +/// 4) Returns None if all of the above failed. +static Optional getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) { + // Check if exact trip count is known. + if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L)) + return ExpectedTC; + + // Check if there is an expected trip count available from profile data. + if (LoopVectorizeWithBlockFrequency) + if (auto EstimatedTC = getLoopEstimatedTripCount(L)) + return EstimatedTC; + + // Check if upper bound estimate is known. + if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L)) + return ExpectedTC; + + return None; +} + namespace llvm { /// InnerLoopVectorizer vectorizes loops which contain only one basic @@ -795,6 +826,59 @@ void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) B.SetCurrentDebugLocation(DebugLoc()); } +/// Write a record \p DebugMsg about vectorization failure to the debug +/// output stream. If \p I is passed, it is an instruction that prevents +/// vectorization. +#ifndef NDEBUG +static void debugVectorizationFailure(const StringRef DebugMsg, + Instruction *I) { + dbgs() << "LV: Not vectorizing: " << DebugMsg; + if (I != nullptr) + dbgs() << " " << *I; + else + dbgs() << '.'; + dbgs() << '\n'; +} +#endif + +/// Create an analysis remark that explains why vectorization failed +/// +/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p +/// RemarkName is the identifier for the remark. If \p I is passed it is an +/// instruction that prevents vectorization. Otherwise \p TheLoop is used for +/// the location of the remark. \return the remark object that can be +/// streamed to. 
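The selection order of the new getSmallBestKnownTC helper above, shown as a self-contained sketch: std::optional stands in for llvm::Optional, and the three callbacks are assumptions modelling SE.getSmallConstantTripCount, getLoopEstimatedTripCount and SE.getSmallConstantMaxTripCount (0 or nullopt meaning "unknown").

#include <functional>
#include <optional>

// Sketch of the "best known" trip-count procedure: exact count first, then
// the profile-based estimate, then the constant upper bound, then nothing.
std::optional<unsigned>
smallBestKnownTC(const std::function<unsigned()> &ExactTC,
                 const std::function<std::optional<unsigned>()> &ProfileTC,
                 const std::function<unsigned()> &MaxTC,
                 bool UseProfileData = true) {
  if (unsigned TC = ExactTC())
    return TC;                 // 1) exact trip count is known
  if (UseProfileData)
    if (auto TC = ProfileTC())
      return TC;               // 2) expected trip count from profile data
  if (unsigned TC = MaxTC())
    return TC;                 // 3) constant upper-bound estimate
  return std::nullopt;         // 4) nothing known
}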
+static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName, + StringRef RemarkName, Loop *TheLoop, Instruction *I) { + Value *CodeRegion = TheLoop->getHeader(); + DebugLoc DL = TheLoop->getStartLoc(); + + if (I) { + CodeRegion = I->getParent(); + // If there is no debug location attached to the instruction, revert back to + // using the loop's. + if (I->getDebugLoc()) + DL = I->getDebugLoc(); + } + + OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion); + R << "loop not vectorized: "; + return R; +} + +namespace llvm { + +void reportVectorizationFailure(const StringRef DebugMsg, + const StringRef OREMsg, const StringRef ORETag, + OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I) { + LLVM_DEBUG(debugVectorizationFailure(DebugMsg, I)); + LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE); + ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(), + ORETag, TheLoop, I) << OREMsg); +} + +} // end namespace llvm + #ifndef NDEBUG /// \return string containing a file name and a line # for the given loop. static std::string getDebugLocString(const Loop *L) { @@ -836,6 +920,26 @@ void InnerLoopVectorizer::addMetadata(ArrayRef To, namespace llvm { +// Loop vectorization cost-model hints how the scalar epilogue loop should be +// lowered. +enum ScalarEpilogueLowering { + + // The default: allowing scalar epilogues. + CM_ScalarEpilogueAllowed, + + // Vectorization with OptForSize: don't allow epilogues. + CM_ScalarEpilogueNotAllowedOptSize, + + // A special case of vectorisation with OptForSize: loops with a very small + // trip count are considered for vectorization under OptForSize, thereby + // making sure the cost of their loop body is dominant, free of runtime + // guards and scalar iteration overheads. + CM_ScalarEpilogueNotAllowedLowTripLoop, + + // Loop hint predicate indicating an epilogue is undesired. + CM_ScalarEpilogueNotNeededUsePredicate +}; + /// LoopVectorizationCostModel - estimates the expected speedups due to /// vectorization. /// In many cases vectorization is not profitable. This can happen because of @@ -845,20 +949,26 @@ namespace llvm { /// different operations. class LoopVectorizationCostModel { public: - LoopVectorizationCostModel(Loop *L, PredicatedScalarEvolution &PSE, - LoopInfo *LI, LoopVectorizationLegality *Legal, + LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L, + PredicatedScalarEvolution &PSE, LoopInfo *LI, + LoopVectorizationLegality *Legal, const TargetTransformInfo &TTI, const TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, const Function *F, const LoopVectorizeHints *Hints, InterleavedAccessInfo &IAI) - : TheLoop(L), PSE(PSE), LI(LI), Legal(Legal), TTI(TTI), TLI(TLI), DB(DB), - AC(AC), ORE(ORE), TheFunction(F), Hints(Hints), InterleaveInfo(IAI) {} + : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal), + TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F), + Hints(Hints), InterleaveInfo(IAI) {} /// \return An upper bound for the vectorization factor, or None if /// vectorization and interleaving should be avoided up front. - Optional computeMaxVF(bool OptForSize); + Optional computeMaxVF(); + + /// \return True if runtime checks are required for vectorization, and false + /// otherwise. + bool runtimeChecksRequired(); /// \return The most profitable vectorization factor and the cost of that VF. /// This method checks every power of two up to MaxVF. 
If UserVF is not ZERO @@ -881,8 +991,7 @@ public: /// If interleave count has been specified by metadata it will be returned. /// Otherwise, the interleave count is computed and returned. VF and LoopCost /// are the selected vectorization factor and the cost of the selected VF. - unsigned selectInterleaveCount(bool OptForSize, unsigned VF, - unsigned LoopCost); + unsigned selectInterleaveCount(unsigned VF, unsigned LoopCost); /// Memory access instruction may be vectorized in more than one way. /// Form of instruction after vectorization depends on cost. @@ -897,10 +1006,11 @@ public: /// of a loop. struct RegisterUsage { /// Holds the number of loop invariant values that are used in the loop. - unsigned LoopInvariantRegs; - + /// The key is ClassID of target-provided register class. + SmallMapVector LoopInvariantRegs; /// Holds the maximum number of concurrent live intervals in the loop. - unsigned MaxLocalUsers; + /// The key is ClassID of target-provided register class. + SmallMapVector MaxLocalUsers; }; /// \return Returns information about the register usages of the loop for the @@ -1080,14 +1190,16 @@ public: /// Returns true if the target machine supports masked store operation /// for the given \p DataType and kind of access to \p Ptr. - bool isLegalMaskedStore(Type *DataType, Value *Ptr) { - return Legal->isConsecutivePtr(Ptr) && TTI.isLegalMaskedStore(DataType); + bool isLegalMaskedStore(Type *DataType, Value *Ptr, MaybeAlign Alignment) { + return Legal->isConsecutivePtr(Ptr) && + TTI.isLegalMaskedStore(DataType, Alignment); } /// Returns true if the target machine supports masked load operation /// for the given \p DataType and kind of access to \p Ptr. - bool isLegalMaskedLoad(Type *DataType, Value *Ptr) { - return Legal->isConsecutivePtr(Ptr) && TTI.isLegalMaskedLoad(DataType); + bool isLegalMaskedLoad(Type *DataType, Value *Ptr, MaybeAlign Alignment) { + return Legal->isConsecutivePtr(Ptr) && + TTI.isLegalMaskedLoad(DataType, Alignment); } /// Returns true if the target machine supports masked scatter operation @@ -1157,11 +1269,14 @@ public: /// to handle accesses with gaps, and there is nothing preventing us from /// creating a scalar epilogue. bool requiresScalarEpilogue() const { - return IsScalarEpilogueAllowed && InterleaveInfo.requiresScalarEpilogue(); + return isScalarEpilogueAllowed() && InterleaveInfo.requiresScalarEpilogue(); } - /// Returns true if a scalar epilogue is not allowed due to optsize. - bool isScalarEpilogueAllowed() const { return IsScalarEpilogueAllowed; } + /// Returns true if a scalar epilogue is not allowed due to optsize or a + /// loop hint annotation. + bool isScalarEpilogueAllowed() const { + return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed; + } /// Returns true if all loop blocks should be masked to fold tail loop. bool foldTailByMasking() const { return FoldTailByMasking; } @@ -1187,7 +1302,7 @@ private: /// \return An upper bound for the vectorization factor, larger than zero. /// One is returned if vectorization should best be avoided due to cost. - unsigned computeFeasibleMaxVF(bool OptForSize, unsigned ConstTripCount); + unsigned computeFeasibleMaxVF(unsigned ConstTripCount); /// The vectorization cost is a combination of the cost itself and a boolean /// indicating whether any of the contributing operations will actually @@ -1246,15 +1361,6 @@ private: /// should be used. 
bool useEmulatedMaskMemRefHack(Instruction *I); - /// Create an analysis remark that explains why vectorization failed - /// - /// \p RemarkName is the identifier for the remark. \return the remark object - /// that can be streamed to. - OptimizationRemarkAnalysis createMissedAnalysis(StringRef RemarkName) { - return createLVMissedAnalysis(Hints->vectorizeAnalysisPassName(), - RemarkName, TheLoop); - } - /// Map of scalar integer values to the smallest bitwidth they can be legally /// represented as. The vector equivalents of these values should be truncated /// to this type. @@ -1270,13 +1376,13 @@ private: SmallPtrSet PredicatedBBsAfterVectorization; /// Records whether it is allowed to have the original scalar loop execute at - /// least once. This may be needed as a fallback loop in case runtime + /// least once. This may be needed as a fallback loop in case runtime /// aliasing/dependence checks fail, or to handle the tail/remainder /// iterations when the trip count is unknown or doesn't divide by the VF, /// or as a peel-loop to handle gaps in interleave-groups. /// Under optsize and when the trip count is very small we don't allow any /// iterations to execute in the scalar loop. - bool IsScalarEpilogueAllowed = true; + ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; /// All blocks of loop are to be masked to fold tail of scalar iterations. bool FoldTailByMasking = false; @@ -1496,7 +1602,7 @@ struct LoopVectorize : public FunctionPass { auto *DT = &getAnalysis().getDomTree(); auto *BFI = &getAnalysis().getBFI(); auto *TLIP = getAnalysisIfAvailable(); - auto *TLI = TLIP ? &TLIP->getTLI() : nullptr; + auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr; auto *AA = &getAnalysis().getAAResults(); auto *AC = &getAnalysis().getAssumptionCache(F); auto *LAA = &getAnalysis(); @@ -2253,12 +2359,11 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr, Type *ScalarDataTy = getMemInstValueType(Instr); Type *DataTy = VectorType::get(ScalarDataTy, VF); Value *Ptr = getLoadStorePointerOperand(Instr); - unsigned Alignment = getLoadStoreAlignment(Instr); // An alignment of 0 means target abi alignment. We need to use the scalar's // target abi alignment in such a case. const DataLayout &DL = Instr->getModule()->getDataLayout(); - if (!Alignment) - Alignment = DL.getABITypeAlignment(ScalarDataTy); + const Align Alignment = + DL.getValueOrABITypeAlignment(getLoadStoreAlignment(Instr), ScalarDataTy); unsigned AddressSpace = getLoadStoreAddressSpace(Instr); // Determine if the pointer operand of the access is either consecutive or @@ -2322,8 +2427,8 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr, if (CreateGatherScatter) { Value *MaskPart = isMaskRequired ? 
Mask[Part] : nullptr; Value *VectorGep = getOrCreateVectorValue(Ptr, Part); - NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment, - MaskPart); + NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, + Alignment.value(), MaskPart); } else { if (Reverse) { // If we store to reverse consecutive memory locations, then we need @@ -2334,10 +2439,11 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr, } auto *VecPtr = CreateVecPtr(Part, Ptr); if (isMaskRequired) - NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment, - Mask[Part]); + NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, + Alignment.value(), Mask[Part]); else - NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment); + NewSI = + Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment.value()); } addMetadata(NewSI, SI); } @@ -2352,18 +2458,18 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr, if (CreateGatherScatter) { Value *MaskPart = isMaskRequired ? Mask[Part] : nullptr; Value *VectorGep = getOrCreateVectorValue(Ptr, Part); - NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart, + NewLI = Builder.CreateMaskedGather(VectorGep, Alignment.value(), MaskPart, nullptr, "wide.masked.gather"); addMetadata(NewLI, LI); } else { auto *VecPtr = CreateVecPtr(Part, Ptr); if (isMaskRequired) - NewLI = Builder.CreateMaskedLoad(VecPtr, Alignment, Mask[Part], + NewLI = Builder.CreateMaskedLoad(VecPtr, Alignment.value(), Mask[Part], UndefValue::get(DataTy), "wide.masked.load"); else - NewLI = - Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load"); + NewLI = Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment.value(), + "wide.load"); // Add metadata to the load, but setVectorValue to the reverse shuffle. addMetadata(NewLI, LI); @@ -2615,8 +2721,9 @@ void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) { if (C->isZero()) return; - assert(!Cost->foldTailByMasking() && - "Cannot SCEV check stride or overflow when folding tail"); + assert(!BB->getParent()->hasOptSize() && + "Cannot SCEV check stride or overflow when optimizing for size"); + // Create a new block containing the stride check. BB->setName("vector.scevcheck"); auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph"); @@ -2649,7 +2756,20 @@ void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) { if (!MemRuntimeCheck) return; - assert(!Cost->foldTailByMasking() && "Cannot check memory when folding tail"); + if (BB->getParent()->hasOptSize()) { + assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled && + "Cannot emit memory checks when optimizing for size, unless forced " + "to vectorize."); + ORE->emit([&]() { + return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize", + L->getStartLoc(), L->getHeader()) + << "Code-size may be reduced by not forcing " + "vectorization, or by source-code modifications " + "eliminating the need for runtime checks " + "(e.g., adding 'restrict')."; + }); + } + // Create a new block containing the memory check. BB->setName("vector.memcheck"); auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph"); @@ -2666,7 +2786,7 @@ void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) { // We currently don't use LoopVersioning for the actual loop cloning but we // still use it to add the noalias metadata. 
- LVer = llvm::make_unique(*Legal->getLAI(), OrigLoop, LI, DT, + LVer = std::make_unique(*Legal->getLAI(), OrigLoop, LI, DT, PSE.getSE()); LVer->prepareNoAliasMetadata(); } @@ -3598,6 +3718,26 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) { setDebugLocFromInst(Builder, LoopExitInst); + // If tail is folded by masking, the vector value to leave the loop should be + // a Select choosing between the vectorized LoopExitInst and vectorized Phi, + // instead of the former. + if (Cost->foldTailByMasking()) { + for (unsigned Part = 0; Part < UF; ++Part) { + Value *VecLoopExitInst = + VectorLoopValueMap.getVectorValue(LoopExitInst, Part); + Value *Sel = nullptr; + for (User *U : VecLoopExitInst->users()) { + if (isa(U)) { + assert(!Sel && "Reduction exit feeding two selects"); + Sel = U; + } else + assert(isa(U) && "Reduction exit must feed Phi's or select"); + } + assert(Sel && "Reduction exit feeds no select"); + VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, Sel); + } + } + // If the vector reduction can be performed in a smaller type, we truncate // then extend the loop exit value to enable InstCombine to evaluate the // entire expression in the smaller type. @@ -4064,7 +4204,7 @@ void InnerLoopVectorizer::widenInstruction(Instruction &I) { case Instruction::FCmp: { // Widen compares. Generate vector compares. bool FCmp = (I.getOpcode() == Instruction::FCmp); - auto *Cmp = dyn_cast(&I); + auto *Cmp = cast(&I); setDebugLocFromInst(Builder, Cmp); for (unsigned Part = 0; Part < UF; ++Part) { Value *A = getOrCreateVectorValue(Cmp->getOperand(0), Part); @@ -4097,7 +4237,7 @@ void InnerLoopVectorizer::widenInstruction(Instruction &I) { case Instruction::Trunc: case Instruction::FPTrunc: case Instruction::BitCast: { - auto *CI = dyn_cast(&I); + auto *CI = cast(&I); setDebugLocFromInst(Builder, CI); /// Vectorize casts. @@ -4421,9 +4561,10 @@ bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, unsigne "Widening decision should be ready at this moment"); return WideningDecision == CM_Scalarize; } + const MaybeAlign Alignment = getLoadStoreAlignment(I); return isa(I) ? - !(isLegalMaskedLoad(Ty, Ptr) || isLegalMaskedGather(Ty)) - : !(isLegalMaskedStore(Ty, Ptr) || isLegalMaskedScatter(Ty)); + !(isLegalMaskedLoad(Ty, Ptr, Alignment) || isLegalMaskedGather(Ty)) + : !(isLegalMaskedStore(Ty, Ptr, Alignment) || isLegalMaskedScatter(Ty)); } case Instruction::UDiv: case Instruction::SDiv: @@ -4452,10 +4593,10 @@ bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(Instruction *I, // Check if masking is required. // A Group may need masking for one of two reasons: it resides in a block that // needs predication, or it was decided to use masking to deal with gaps. - bool PredicatedAccessRequiresMasking = + bool PredicatedAccessRequiresMasking = Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I); - bool AccessWithGapsRequiresMasking = - Group->requiresScalarEpilogue() && !IsScalarEpilogueAllowed; + bool AccessWithGapsRequiresMasking = + Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed(); if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking) return true; @@ -4466,8 +4607,9 @@ bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(Instruction *I, "Masked interleave-groups for predicated accesses are not enabled."); auto *Ty = getMemInstValueType(I); - return isa(I) ? TTI.isLegalMaskedLoad(Ty) - : TTI.isLegalMaskedStore(Ty); + const MaybeAlign Alignment = getLoadStoreAlignment(I); + return isa(I) ? 
TTI.isLegalMaskedLoad(Ty, Alignment) + : TTI.isLegalMaskedStore(Ty, Alignment); } bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(Instruction *I, @@ -4675,82 +4817,96 @@ void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) { Uniforms[VF].insert(Worklist.begin(), Worklist.end()); } -Optional LoopVectorizationCostModel::computeMaxVF(bool OptForSize) { - if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) { - // TODO: It may by useful to do since it's still likely to be dynamically - // uniform if the target can skip. - LLVM_DEBUG( - dbgs() << "LV: Not inserting runtime ptr check for divergent target"); - - ORE->emit( - createMissedAnalysis("CantVersionLoopWithDivergentTarget") - << "runtime pointer checks needed. Not enabled for divergent target"); - - return None; - } - - unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop); - if (!OptForSize) // Remaining checks deal with scalar loop when OptForSize. - return computeFeasibleMaxVF(OptForSize, TC); +bool LoopVectorizationCostModel::runtimeChecksRequired() { + LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n"); if (Legal->getRuntimePointerChecking()->Need) { - ORE->emit(createMissedAnalysis("CantVersionLoopWithOptForSize") - << "runtime pointer checks needed. Enable vectorization of this " - "loop with '#pragma clang loop vectorize(enable)' when " - "compiling with -Os/-Oz"); - LLVM_DEBUG( - dbgs() - << "LV: Aborting. Runtime ptr check is required with -Os/-Oz.\n"); - return None; + reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz", + "runtime pointer checks needed. Enable vectorization of this " + "loop with '#pragma clang loop vectorize(enable)' when " + "compiling with -Os/-Oz", + "CantVersionLoopWithOptForSize", ORE, TheLoop); + return true; } if (!PSE.getUnionPredicate().getPredicates().empty()) { - ORE->emit(createMissedAnalysis("CantVersionLoopWithOptForSize") - << "runtime SCEV checks needed. Enable vectorization of this " - "loop with '#pragma clang loop vectorize(enable)' when " - "compiling with -Os/-Oz"); - LLVM_DEBUG( - dbgs() - << "LV: Aborting. Runtime SCEV check is required with -Os/-Oz.\n"); - return None; + reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz", + "runtime SCEV checks needed. Enable vectorization of this " + "loop with '#pragma clang loop vectorize(enable)' when " + "compiling with -Os/-Oz", + "CantVersionLoopWithOptForSize", ORE, TheLoop); + return true; } // FIXME: Avoid specializing for stride==1 instead of bailing out. if (!Legal->getLAI()->getSymbolicStrides().empty()) { - ORE->emit(createMissedAnalysis("CantVersionLoopWithOptForSize") - << "runtime stride == 1 checks needed. Enable vectorization of " - "this loop with '#pragma clang loop vectorize(enable)' when " - "compiling with -Os/-Oz"); - LLVM_DEBUG( - dbgs() - << "LV: Aborting. Runtime stride check is required with -Os/-Oz.\n"); + reportVectorizationFailure("Runtime stride check is required with -Os/-Oz", + "runtime stride == 1 checks needed. Enable vectorization of " + "this loop with '#pragma clang loop vectorize(enable)' when " + "compiling with -Os/-Oz", + "CantVersionLoopWithOptForSize", ORE, TheLoop); + return true; + } + + return false; +} + +Optional LoopVectorizationCostModel::computeMaxVF() { + if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) { + // TODO: It may by useful to do since it's still likely to be dynamically + // uniform if the target can skip. 
+ reportVectorizationFailure( + "Not inserting runtime ptr check for divergent target", + "runtime pointer checks needed. Not enabled for divergent target", + "CantVersionLoopWithDivergentTarget", ORE, TheLoop); return None; } - // If we optimize the program for size, avoid creating the tail loop. + unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop); LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n'); - if (TC == 1) { - ORE->emit(createMissedAnalysis("SingleIterationLoop") - << "loop trip count is one, irrelevant for vectorization"); - LLVM_DEBUG(dbgs() << "LV: Aborting, single iteration (non) loop.\n"); + reportVectorizationFailure("Single iteration (non) loop", + "loop trip count is one, irrelevant for vectorization", + "SingleIterationLoop", ORE, TheLoop); return None; } - // Record that scalar epilogue is not allowed. - LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n"); + switch (ScalarEpilogueStatus) { + case CM_ScalarEpilogueAllowed: + return computeFeasibleMaxVF(TC); + case CM_ScalarEpilogueNotNeededUsePredicate: + LLVM_DEBUG( + dbgs() << "LV: vector predicate hint/switch found.\n" + << "LV: Not allowing scalar epilogue, creating predicated " + << "vector loop.\n"); + break; + case CM_ScalarEpilogueNotAllowedLowTripLoop: + // fallthrough as a special case of OptForSize + case CM_ScalarEpilogueNotAllowedOptSize: + if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize) + LLVM_DEBUG( + dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n"); + else + LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip " + << "count.\n"); + + // Bail if runtime checks are required, which are not good when optimising + // for size. + if (runtimeChecksRequired()) + return None; + break; + } - IsScalarEpilogueAllowed = !OptForSize; + // Now try the tail folding - // We don't create an epilogue when optimizing for size. // Invalidate interleave groups that require an epilogue if we can't mask // the interleave-group. - if (!useMaskedInterleavedAccesses(TTI)) + if (!useMaskedInterleavedAccesses(TTI)) InterleaveInfo.invalidateGroupsRequiringScalarEpilogue(); - unsigned MaxVF = computeFeasibleMaxVF(OptForSize, TC); - + unsigned MaxVF = computeFeasibleMaxVF(TC); if (TC > 0 && TC % MaxVF == 0) { + // Accept MaxVF if we do not have a tail. LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n"); return MaxVF; } @@ -4759,28 +4915,30 @@ Optional LoopVectorizationCostModel::computeMaxVF(bool OptForSize) { // found modulo the vectorization factor is not zero, try to fold the tail // by masking. // FIXME: look for a smaller MaxVF that does divide TC rather than masking. - if (Legal->canFoldTailByMasking()) { + if (Legal->prepareToFoldTailByMasking()) { FoldTailByMasking = true; return MaxVF; } if (TC == 0) { - ORE->emit( - createMissedAnalysis("UnknownLoopCountComplexCFG") - << "unable to calculate the loop count due to complex control flow"); + reportVectorizationFailure( + "Unable to calculate the loop count due to complex control flow", + "unable to calculate the loop count due to complex control flow", + "UnknownLoopCountComplexCFG", ORE, TheLoop); return None; } - ORE->emit(createMissedAnalysis("NoTailLoopWithOptForSize") - << "cannot optimize for size and vectorize at the same time. 
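The reorganised computeMaxVF flow above, condensed into a standalone decision sketch; the enum values and parameter names are illustrative stand-ins for the cost-model state, not the LLVM types.

#include <optional>

enum class EpilogueMode { Allowed, NotAllowedOptSize, NotAllowedLowTrip, UsePredicate };

// Condensed sketch: with a scalar epilogue allowed, the feasible maximum VF
// is returned directly; otherwise the tail must either be absent (TC divides
// evenly), foldable by masking, or vectorization is abandoned.
std::optional<unsigned> computeMaxVFSketch(EpilogueMode Mode, unsigned TC,
                                           unsigned FeasibleMaxVF,
                                           bool NeedRuntimeChecks,
                                           bool CanFoldTailByMasking,
                                           bool &FoldTail) {
  FoldTail = false;
  if (Mode == EpilogueMode::Allowed)
    return FeasibleMaxVF;
  // The OptForSize and low-trip-count paths refuse runtime guards up front.
  if (Mode != EpilogueMode::UsePredicate && NeedRuntimeChecks)
    return std::nullopt;
  if (TC > 0 && TC % FeasibleMaxVF == 0)
    return FeasibleMaxVF;        // no tail remains for this VF
  if (CanFoldTailByMasking) {
    FoldTail = true;             // predicate the body instead of an epilogue
    return FeasibleMaxVF;
  }
  return std::nullopt;           // unknown or awkward TC and no masking: bail
}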
" - "Enable vectorization of this loop with '#pragma clang loop " - "vectorize(enable)' when compiling with -Os/-Oz"); + reportVectorizationFailure( + "Cannot optimize for size and vectorize at the same time.", + "cannot optimize for size and vectorize at the same time. " + "Enable vectorization of this loop with '#pragma clang loop " + "vectorize(enable)' when compiling with -Os/-Oz", + "NoTailLoopWithOptForSize", ORE, TheLoop); return None; } unsigned -LoopVectorizationCostModel::computeFeasibleMaxVF(bool OptForSize, - unsigned ConstTripCount) { +LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount) { MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); unsigned SmallestType, WidestType; std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes(); @@ -4818,8 +4976,8 @@ LoopVectorizationCostModel::computeFeasibleMaxVF(bool OptForSize, } unsigned MaxVF = MaxVectorSize; - if (TTI.shouldMaximizeVectorBandwidth(OptForSize) || - (MaximizeBandwidth && !OptForSize)) { + if (TTI.shouldMaximizeVectorBandwidth(!isScalarEpilogueAllowed()) || + (MaximizeBandwidth && isScalarEpilogueAllowed())) { // Collect all viable vectorization factors larger than the default MaxVF // (i.e. MaxVectorSize). SmallVector VFs; @@ -4832,9 +4990,14 @@ LoopVectorizationCostModel::computeFeasibleMaxVF(bool OptForSize, // Select the largest VF which doesn't require more registers than existing // ones. - unsigned TargetNumRegisters = TTI.getNumberOfRegisters(true); for (int i = RUs.size() - 1; i >= 0; --i) { - if (RUs[i].MaxLocalUsers <= TargetNumRegisters) { + bool Selected = true; + for (auto& pair : RUs[i].MaxLocalUsers) { + unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); + if (pair.second > TargetNumRegisters) + Selected = false; + } + if (Selected) { MaxVF = VFs[i]; break; } @@ -4886,10 +5049,9 @@ LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) { } if (!EnableCondStoresVectorization && NumPredStores) { - ORE->emit(createMissedAnalysis("ConditionalStore") - << "store that is conditionally executed prevents vectorization"); - LLVM_DEBUG( - dbgs() << "LV: No vectorization. There are conditional stores.\n"); + reportVectorizationFailure("There are conditional stores.", + "store that is conditionally executed prevents vectorization", + "ConditionalStore", ORE, TheLoop); Width = 1; Cost = ScalarCost; } @@ -4958,8 +5120,7 @@ LoopVectorizationCostModel::getSmallestAndWidestTypes() { return {MinWidth, MaxWidth}; } -unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize, - unsigned VF, +unsigned LoopVectorizationCostModel::selectInterleaveCount(unsigned VF, unsigned LoopCost) { // -- The interleave heuristics -- // We interleave the loop in order to expose ILP and reduce the loop overhead. @@ -4975,8 +5136,7 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize, // 3. We don't interleave if we think that we will spill registers to memory // due to the increased register pressure. - // When we optimize for size, we don't interleave. - if (OptForSize) + if (!isScalarEpilogueAllowed()) return 1; // We used the distance for the interleave count. 
@@ -4988,22 +5148,12 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize, if (TC > 1 && TC < TinyTripCountInterleaveThreshold) return 1; - unsigned TargetNumRegisters = TTI.getNumberOfRegisters(VF > 1); - LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters - << " registers\n"); - - if (VF == 1) { - if (ForceTargetNumScalarRegs.getNumOccurrences() > 0) - TargetNumRegisters = ForceTargetNumScalarRegs; - } else { - if (ForceTargetNumVectorRegs.getNumOccurrences() > 0) - TargetNumRegisters = ForceTargetNumVectorRegs; - } - RegisterUsage R = calculateRegisterUsage({VF})[0]; // We divide by these constants so assume that we have at least one // instruction that uses at least one register. - R.MaxLocalUsers = std::max(R.MaxLocalUsers, 1U); + for (auto& pair : R.MaxLocalUsers) { + pair.second = std::max(pair.second, 1U); + } // We calculate the interleave count using the following formula. // Subtract the number of loop invariants from the number of available @@ -5016,13 +5166,35 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize, // We also want power of two interleave counts to ensure that the induction // variable of the vector loop wraps to zero, when tail is folded by masking; // this currently happens when OptForSize, in which case IC is set to 1 above. - unsigned IC = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs) / - R.MaxLocalUsers); + unsigned IC = UINT_MAX; - // Don't count the induction variable as interleaved. - if (EnableIndVarRegisterHeur) - IC = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs - 1) / - std::max(1U, (R.MaxLocalUsers - 1))); + for (auto& pair : R.MaxLocalUsers) { + unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); + LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters + << " registers of " + << TTI.getRegisterClassName(pair.first) << " register class\n"); + if (VF == 1) { + if (ForceTargetNumScalarRegs.getNumOccurrences() > 0) + TargetNumRegisters = ForceTargetNumScalarRegs; + } else { + if (ForceTargetNumVectorRegs.getNumOccurrences() > 0) + TargetNumRegisters = ForceTargetNumVectorRegs; + } + unsigned MaxLocalUsers = pair.second; + unsigned LoopInvariantRegs = 0; + if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end()) + LoopInvariantRegs = R.LoopInvariantRegs[pair.first]; + + unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers); + // Don't count the induction variable as interleaved. + if (EnableIndVarRegisterHeur) { + TmpIC = + PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) / + std::max(1U, (MaxLocalUsers - 1))); + } + + IC = std::min(IC, TmpIC); + } // Clamp the interleave ranges to reasonable counts. unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF); @@ -5036,6 +5208,14 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize, MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor; } + // If the trip count is constant, limit the interleave count to be less than + // the trip count divided by VF. + if (TC > 0) { + assert(TC >= VF && "VF exceeds trip count?"); + if ((TC / VF) < MaxInterleaveCount) + MaxInterleaveCount = (TC / VF); + } + // If we did not calculate the cost for VF (because the user selected the VF) // then we calculate the cost of VF here. 
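The core of the per-register-class interleave-count formula above, as a worked self-contained sketch: PowerOf2Floor is replaced by a small helper, the SmallMapVector by std::map, and the most constrained register class wins. For example, 32 vector registers, 2 loop invariants and 5 simultaneously live values give floor((32-2)/5) = 6, rounded down to the power of two 4.

#include <algorithm>
#include <climits>
#include <map>

// Largest power of two <= X (for X > 0), mirroring llvm::PowerOf2Floor.
static unsigned powerOf2Floor(unsigned X) {
  unsigned P = 1;
  while (P * 2 != 0 && P * 2 <= X)
    P *= 2;
  return P;
}

// Sketch: per register class, (available - loop invariants) / max live
// values, optionally reserving one register for the induction variable,
// keeping the minimum across classes and clamping by the trip count.
unsigned selectICSketch(const std::map<unsigned, unsigned> &MaxLocalUsers,
                        const std::map<unsigned, unsigned> &LoopInvariantRegs,
                        const std::map<unsigned, unsigned> &TargetNumRegs,
                        bool ReserveIndVarReg, unsigned MaxInterleaveCount,
                        unsigned TC, unsigned VF) {
  unsigned IC = UINT_MAX;
  for (const auto &KV : MaxLocalUsers) {
    unsigned Avail = TargetNumRegs.at(KV.first);
    unsigned Invariant = 0;
    auto It = LoopInvariantRegs.find(KV.first);
    if (It != LoopInvariantRegs.end())
      Invariant = It->second;
    unsigned Users = std::max(KV.second, 1u);
    unsigned TmpIC = powerOf2Floor((Avail - Invariant) / Users);
    if (ReserveIndVarReg)
      TmpIC = powerOf2Floor((Avail - Invariant - 1) / std::max(1u, Users - 1));
    IC = std::min(IC, TmpIC);
  }
  // A constant trip count additionally caps the count at TC / VF
  // (the real code asserts TC >= VF here).
  if (TC > 0)
    MaxInterleaveCount = std::min(MaxInterleaveCount, TC / VF);
  return std::max(1u, std::min(IC, MaxInterleaveCount));
}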
if (LoopCost == 0) @@ -5044,7 +5224,7 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize, assert(LoopCost && "Non-zero loop cost expected"); // Clamp the calculated IC to be between the 1 and the max interleave count - // that the target allows. + // that the target and trip count allows. if (IC > MaxInterleaveCount) IC = MaxInterleaveCount; else if (IC < 1) @@ -5196,7 +5376,7 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef VFs) { const DataLayout &DL = TheFunction->getParent()->getDataLayout(); SmallVector RUs(VFs.size()); - SmallVector MaxUsages(VFs.size(), 0); + SmallVector, 8> MaxUsages(VFs.size()); LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n"); @@ -5226,21 +5406,45 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef VFs) { // For each VF find the maximum usage of registers. for (unsigned j = 0, e = VFs.size(); j < e; ++j) { + // Count the number of live intervals. + SmallMapVector RegUsage; + if (VFs[j] == 1) { - MaxUsages[j] = std::max(MaxUsages[j], OpenIntervals.size()); - continue; + for (auto Inst : OpenIntervals) { + unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); + if (RegUsage.find(ClassID) == RegUsage.end()) + RegUsage[ClassID] = 1; + else + RegUsage[ClassID] += 1; + } + } else { + collectUniformsAndScalars(VFs[j]); + for (auto Inst : OpenIntervals) { + // Skip ignored values for VF > 1. + if (VecValuesToIgnore.find(Inst) != VecValuesToIgnore.end()) + continue; + if (isScalarAfterVectorization(Inst, VFs[j])) { + unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); + if (RegUsage.find(ClassID) == RegUsage.end()) + RegUsage[ClassID] = 1; + else + RegUsage[ClassID] += 1; + } else { + unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType()); + if (RegUsage.find(ClassID) == RegUsage.end()) + RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]); + else + RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]); + } + } } - collectUniformsAndScalars(VFs[j]); - // Count the number of live intervals. - unsigned RegUsage = 0; - for (auto Inst : OpenIntervals) { - // Skip ignored values for VF > 1. - if (VecValuesToIgnore.find(Inst) != VecValuesToIgnore.end() || - isScalarAfterVectorization(Inst, VFs[j])) - continue; - RegUsage += GetRegUsage(Inst->getType(), VFs[j]); + + for (auto& pair : RegUsage) { + if (MaxUsages[j].find(pair.first) != MaxUsages[j].end()) + MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second); + else + MaxUsages[j][pair.first] = pair.second; } - MaxUsages[j] = std::max(MaxUsages[j], RegUsage); } LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " @@ -5251,18 +5455,34 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef VFs) { } for (unsigned i = 0, e = VFs.size(); i < e; ++i) { - unsigned Invariant = 0; - if (VFs[i] == 1) - Invariant = LoopInvariants.size(); - else { - for (auto Inst : LoopInvariants) - Invariant += GetRegUsage(Inst->getType(), VFs[i]); + SmallMapVector Invariant; + + for (auto Inst : LoopInvariants) { + unsigned Usage = VFs[i] == 1 ? 
1 : GetRegUsage(Inst->getType(), VFs[i]); + unsigned ClassID = TTI.getRegisterClassForType(VFs[i] > 1, Inst->getType()); + if (Invariant.find(ClassID) == Invariant.end()) + Invariant[ClassID] = Usage; + else + Invariant[ClassID] += Usage; } - LLVM_DEBUG(dbgs() << "LV(REG): VF = " << VFs[i] << '\n'); - LLVM_DEBUG(dbgs() << "LV(REG): Found max usage: " << MaxUsages[i] << '\n'); - LLVM_DEBUG(dbgs() << "LV(REG): Found invariant usage: " << Invariant - << '\n'); + LLVM_DEBUG({ + dbgs() << "LV(REG): VF = " << VFs[i] << '\n'; + dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size() + << " item\n"; + for (const auto &pair : MaxUsages[i]) { + dbgs() << "LV(REG): RegisterClass: " + << TTI.getRegisterClassName(pair.first) << ", " << pair.second + << " registers\n"; + } + dbgs() << "LV(REG): Found invariant usage: " << Invariant.size() + << " item\n"; + for (const auto &pair : Invariant) { + dbgs() << "LV(REG): RegisterClass: " + << TTI.getRegisterClassName(pair.first) << ", " << pair.second + << " registers\n"; + } + }); RU.LoopInvariantRegs = Invariant; RU.MaxLocalUsers = MaxUsages[i]; @@ -5511,7 +5731,6 @@ unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, Type *ValTy = getMemInstValueType(I); auto SE = PSE.getSE(); - unsigned Alignment = getLoadStoreAlignment(I); unsigned AS = getLoadStoreAddressSpace(I); Value *Ptr = getLoadStorePointerOperand(I); Type *PtrTy = ToVectorTy(Ptr->getType(), VF); @@ -5525,9 +5744,9 @@ unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, // Don't pass *I here, since it is scalar but will actually be part of a // vectorized loop where the user of it is a vectorized instruction. - Cost += VF * - TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment, - AS); + const MaybeAlign Alignment = getLoadStoreAlignment(I); + Cost += VF * TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), + Alignment ? Alignment->value() : 0, AS); // Get the overhead of the extractelement and insertelement instructions // we might create due to scalarization. @@ -5552,18 +5771,20 @@ unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, unsigned VF) { Type *ValTy = getMemInstValueType(I); Type *VectorTy = ToVectorTy(ValTy, VF); - unsigned Alignment = getLoadStoreAlignment(I); Value *Ptr = getLoadStorePointerOperand(I); unsigned AS = getLoadStoreAddressSpace(I); int ConsecutiveStride = Legal->isConsecutivePtr(Ptr); assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && "Stride should be 1 or -1 for consecutive memory access"); + const MaybeAlign Alignment = getLoadStoreAlignment(I); unsigned Cost = 0; if (Legal->isMaskRequired(I)) - Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS); + Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, + Alignment ? Alignment->value() : 0, AS); else - Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, I); + Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, + Alignment ? 
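The switch above from a single scalar register count to per-register-class maps, sketched with std::map; the class ID and per-value register count are assumptions standing in for TTI.getRegisterClassForType and the GetRegUsage lambda.

#include <algorithm>
#include <map>
#include <vector>

// Stand-in for one value live at a program point.
struct LiveValue {
  unsigned RegClassID;   // assumed result of the register-class query
  unsigned RegsNeeded;   // 1 at VF == 1, the widened usage otherwise
};

// Accumulate register pressure per register class and fold it into the
// running maximum, mirroring the RegUsage / MaxUsages bookkeeping above.
void recordUsage(const std::vector<LiveValue> &OpenIntervals,
                 std::map<unsigned, unsigned> &MaxUsage) {
  std::map<unsigned, unsigned> Usage;
  for (const LiveValue &V : OpenIntervals)
    Usage[V.RegClassID] += V.RegsNeeded;   // operator[] default-initializes to 0
  for (const auto &KV : Usage)
    MaxUsage[KV.first] = std::max(MaxUsage[KV.first], KV.second);
}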
Alignment->value() : 0, AS, I); bool Reverse = ConsecutiveStride < 0; if (Reverse) @@ -5575,33 +5796,37 @@ unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, unsigned VF) { Type *ValTy = getMemInstValueType(I); Type *VectorTy = ToVectorTy(ValTy, VF); - unsigned Alignment = getLoadStoreAlignment(I); + const MaybeAlign Alignment = getLoadStoreAlignment(I); unsigned AS = getLoadStoreAddressSpace(I); if (isa(I)) { return TTI.getAddressComputationCost(ValTy) + - TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS) + + TTI.getMemoryOpCost(Instruction::Load, ValTy, + Alignment ? Alignment->value() : 0, AS) + TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy); } StoreInst *SI = cast(I); bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand()); return TTI.getAddressComputationCost(ValTy) + - TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS) + - (isLoopInvariantStoreValue ? 0 : TTI.getVectorInstrCost( - Instruction::ExtractElement, - VectorTy, VF - 1)); + TTI.getMemoryOpCost(Instruction::Store, ValTy, + Alignment ? Alignment->value() : 0, AS) + + (isLoopInvariantStoreValue + ? 0 + : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy, + VF - 1)); } unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I, unsigned VF) { Type *ValTy = getMemInstValueType(I); Type *VectorTy = ToVectorTy(ValTy, VF); - unsigned Alignment = getLoadStoreAlignment(I); + const MaybeAlign Alignment = getLoadStoreAlignment(I); Value *Ptr = getLoadStorePointerOperand(I); return TTI.getAddressComputationCost(VectorTy) + TTI.getGatherScatterOpCost(I->getOpcode(), VectorTy, Ptr, - Legal->isMaskRequired(I), Alignment); + Legal->isMaskRequired(I), + Alignment ? Alignment->value() : 0); } unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, @@ -5626,8 +5851,8 @@ unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, } // Calculate the cost of the whole interleaved group. - bool UseMaskForGaps = - Group->requiresScalarEpilogue() && !IsScalarEpilogueAllowed; + bool UseMaskForGaps = + Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed(); unsigned Cost = TTI.getInterleavedMemoryOpCost( I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlignment(), AS, Legal->isMaskRequired(I), UseMaskForGaps); @@ -5648,11 +5873,12 @@ unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, // moment. if (VF == 1) { Type *ValTy = getMemInstValueType(I); - unsigned Alignment = getLoadStoreAlignment(I); + const MaybeAlign Alignment = getLoadStoreAlignment(I); unsigned AS = getLoadStoreAddressSpace(I); return TTI.getAddressComputationCost(ValTy) + - TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, I); + TTI.getMemoryOpCost(I->getOpcode(), ValTy, + Alignment ? Alignment->value() : 0, AS, I); } return getWideningCost(I, VF); } @@ -6167,8 +6393,7 @@ static unsigned determineVPlanVF(const unsigned WidestVectorRegBits, } VectorizationFactor -LoopVectorizationPlanner::planInVPlanNativePath(bool OptForSize, - unsigned UserVF) { +LoopVectorizationPlanner::planInVPlanNativePath(unsigned UserVF) { unsigned VF = UserVF; // Outer loop handling: They may require CFG and instruction level // transformations before even evaluating whether vectorization is profitable. 
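The recurring "Alignment ? Alignment->value() : 0" pattern above converts the new MaybeAlign result back to the unsigned convention of the cost interfaces, where 0 still means "unknown, fall back to the ABI alignment". A minimal illustration with std::optional standing in for MaybeAlign:

#include <cstdint>
#include <optional>

// Stand-in for llvm::MaybeAlign: nullopt == no explicit alignment recorded.
using MaybeAlignment = std::optional<uint64_t>;

// Cost-model style interfaces in this patch still take a plain unsigned,
// with 0 as the sentinel for "unknown / ABI alignment".
uint64_t toCostModelAlignment(MaybeAlignment A) {
  return A ? *A : 0;
}

// Mirrors the DL.getValueOrABITypeAlignment(...) use earlier in the patch:
// fall back to the ABI alignment of the accessed type when none is set.
uint64_t resolveAlignment(MaybeAlignment A, uint64_t ABITypeAlignment) {
  return A ? *A : ABITypeAlignment;
}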
@@ -6207,10 +6432,9 @@ LoopVectorizationPlanner::planInVPlanNativePath(bool OptForSize, return VectorizationFactor::Disabled(); } -Optional LoopVectorizationPlanner::plan(bool OptForSize, - unsigned UserVF) { +Optional LoopVectorizationPlanner::plan(unsigned UserVF) { assert(OrigLoop->empty() && "Inner loop expected."); - Optional MaybeMaxVF = CM.computeMaxVF(OptForSize); + Optional MaybeMaxVF = CM.computeMaxVF(); if (!MaybeMaxVF) // Cases that should not to be vectorized nor interleaved. return None; @@ -6840,8 +7064,15 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(unsigned MinVF, // If the tail is to be folded by masking, the primary induction variable // needs to be represented in VPlan for it to model early-exit masking. - if (CM.foldTailByMasking()) + // Also, both the Phi and the live-out instruction of each reduction are + // required in order to introduce a select between them in VPlan. + if (CM.foldTailByMasking()) { NeedDef.insert(Legal->getPrimaryInduction()); + for (auto &Reduction : *Legal->getReductionVars()) { + NeedDef.insert(Reduction.first); + NeedDef.insert(Reduction.second.getLoopExitInstr()); + } + } // Collect instructions from the original loop that will become trivially dead // in the vectorized loop. We don't need to vectorize these instructions. For @@ -6873,7 +7104,7 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( // Create a dummy pre-entry VPBasicBlock to start building the VPlan. VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry"); - auto Plan = llvm::make_unique(VPBB); + auto Plan = std::make_unique(VPBB); VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, Builder); // Represent values that will have defs inside VPlan. @@ -6968,6 +7199,18 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( VPBlockUtils::disconnectBlocks(PreEntry, Entry); delete PreEntry; + // Finally, if tail is folded by masking, introduce selects between the phi + // and the live-out instruction of each reduction, at the end of the latch. 
+ if (CM.foldTailByMasking()) { + Builder.setInsertPoint(VPBB); + auto *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan); + for (auto &Reduction : *Legal->getReductionVars()) { + VPValue *Phi = Plan->getVPValue(Reduction.first); + VPValue *Red = Plan->getVPValue(Reduction.second.getLoopExitInstr()); + Builder.createNaryOp(Instruction::Select, {Cond, Red, Phi}); + } + } + std::string PlanName; raw_string_ostream RSO(PlanName); unsigned VF = Range.Start; @@ -6993,7 +7236,7 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); // Create new empty VPlan - auto Plan = llvm::make_unique(); + auto Plan = std::make_unique(); // Build hierarchical CFG VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan); @@ -7199,6 +7442,20 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { State.ILV->vectorizeMemoryInstruction(&Instr, &MaskValues); } +static ScalarEpilogueLowering +getScalarEpilogueLowering(Function *F, Loop *L, LoopVectorizeHints &Hints, + ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI) { + ScalarEpilogueLowering SEL = CM_ScalarEpilogueAllowed; + if (Hints.getForce() != LoopVectorizeHints::FK_Enabled && + (F->hasOptSize() || + llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI))) + SEL = CM_ScalarEpilogueNotAllowedOptSize; + else if (PreferPredicateOverEpilog || Hints.getPredicate()) + SEL = CM_ScalarEpilogueNotNeededUsePredicate; + + return SEL; +} + // Process the loop in the VPlan-native vectorization path. This path builds // VPlan upfront in the vectorization pipeline, which allows to apply // VPlan-to-VPlan transformations from the very beginning without modifying the @@ -7213,7 +7470,9 @@ static bool processLoopInVPlanNativePath( assert(EnableVPlanNativePath && "VPlan-native path is disabled."); Function *F = L->getHeader()->getParent(); InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI()); - LoopVectorizationCostModel CM(L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F, + ScalarEpilogueLowering SEL = getScalarEpilogueLowering(F, L, Hints, PSI, BFI); + + LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F, &Hints, IAI); // Use the planner for outer loop vectorization. // TODO: CM is not used at this point inside the planner. Turn CM into an @@ -7223,15 +7482,8 @@ static bool processLoopInVPlanNativePath( // Get user vectorization factor. const unsigned UserVF = Hints.getWidth(); - // Check the function attributes and profiles to find out if this function - // should be optimized for size. - bool OptForSize = - Hints.getForce() != LoopVectorizeHints::FK_Enabled && - (F->hasOptSize() || - llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI)); - // Plan how to best vectorize, return the best VF and its cost. - const VectorizationFactor VF = LVP.planInVPlanNativePath(OptForSize, UserVF); + const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF); // If we are stress testing VPlan builds, do not attempt to generate vector // code. Masked vector code generation support will follow soon. @@ -7310,10 +7562,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { // Check the function attributes and profiles to find out if this function // should be optimized for size. 
- bool OptForSize = - Hints.getForce() != LoopVectorizeHints::FK_Enabled && - (F->hasOptSize() || - llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI)); + ScalarEpilogueLowering SEL = getScalarEpilogueLowering(F, L, Hints, PSI, BFI); // Entrance to the VPlan-native vectorization path. Outer loops are processed // here. They may require CFG and instruction level transformations before @@ -7325,36 +7574,11 @@ bool LoopVectorizePass::processLoop(Loop *L) { ORE, BFI, PSI, Hints); assert(L->empty() && "Inner loop expected."); + // Check the loop for a trip count threshold: vectorize loops with a tiny trip // count by optimizing for size, to minimize overheads. - // Prefer constant trip counts over profile data, over upper bound estimate. - unsigned ExpectedTC = 0; - bool HasExpectedTC = false; - if (const SCEVConstant *ConstExits = - dyn_cast(SE->getBackedgeTakenCount(L))) { - const APInt &ExitsCount = ConstExits->getAPInt(); - // We are interested in small values for ExpectedTC. Skip over those that - // can't fit an unsigned. - if (ExitsCount.ult(std::numeric_limits::max())) { - ExpectedTC = static_cast(ExitsCount.getZExtValue()) + 1; - HasExpectedTC = true; - } - } - // ExpectedTC may be large because it's bound by a variable. Check - // profiling information to validate we should vectorize. - if (!HasExpectedTC && LoopVectorizeWithBlockFrequency) { - auto EstimatedTC = getLoopEstimatedTripCount(L); - if (EstimatedTC) { - ExpectedTC = *EstimatedTC; - HasExpectedTC = true; - } - } - if (!HasExpectedTC) { - ExpectedTC = SE->getSmallConstantMaxTripCount(L); - HasExpectedTC = (ExpectedTC > 0); - } - - if (HasExpectedTC && ExpectedTC < TinyTripCountVectorThreshold) { + auto ExpectedTC = getSmallBestKnownTC(*SE, L); + if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) { LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " << "This loop is worth vectorizing only if no scalar " << "iteration overheads are incurred."); @@ -7362,10 +7586,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n"); else { LLVM_DEBUG(dbgs() << "\n"); - // Loops with a very small trip count are considered for vectorization - // under OptForSize, thereby making sure the cost of their loop body is - // dominant, free of runtime guards and scalar iteration overheads. - OptForSize = true; + SEL = CM_ScalarEpilogueNotAllowedLowTripLoop; } } @@ -7374,11 +7595,10 @@ bool LoopVectorizePass::processLoop(Loop *L) { // an integer loop and the vector instructions selected are purely integer // vector instructions? if (F->hasFnAttribute(Attribute::NoImplicitFloat)) { - LLVM_DEBUG(dbgs() << "LV: Can't vectorize when the NoImplicitFloat" - "attribute is used.\n"); - ORE->emit(createLVMissedAnalysis(Hints.vectorizeAnalysisPassName(), - "NoImplicitFloat", L) - << "loop not vectorized due to NoImplicitFloat attribute"); + reportVectorizationFailure( + "Can't vectorize when the NoImplicitFloat attribute is used", + "loop not vectorized due to NoImplicitFloat attribute", + "NoImplicitFloat", ORE, L); Hints.emitRemarkWithHints(); return false; } @@ -7389,11 +7609,10 @@ bool LoopVectorizePass::processLoop(Loop *L) { // additional fp-math flags can help. 
if (Hints.isPotentiallyUnsafe() && TTI->isFPVectorizationPotentiallyUnsafe()) { - LLVM_DEBUG( - dbgs() << "LV: Potentially unsafe FP op prevents vectorization.\n"); - ORE->emit( - createLVMissedAnalysis(Hints.vectorizeAnalysisPassName(), "UnsafeFP", L) - << "loop not vectorized due to unsafe FP support."); + reportVectorizationFailure( + "Potentially unsafe FP op prevents vectorization", + "loop not vectorized due to unsafe FP support.", + "UnsafeFP", ORE, L); Hints.emitRemarkWithHints(); return false; } @@ -7411,8 +7630,8 @@ bool LoopVectorizePass::processLoop(Loop *L) { } // Use the cost model. - LoopVectorizationCostModel CM(L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE, F, - &Hints, IAI); + LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE, + F, &Hints, IAI); CM.collectValuesToIgnore(); // Use the planner for vectorization. @@ -7422,7 +7641,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { unsigned UserVF = Hints.getWidth(); // Plan how to best vectorize, return the best VF and its cost. - Optional MaybeVF = LVP.plan(OptForSize, UserVF); + Optional MaybeVF = LVP.plan(UserVF); VectorizationFactor VF = VectorizationFactor::Disabled(); unsigned IC = 1; @@ -7431,7 +7650,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { if (MaybeVF) { VF = *MaybeVF; // Select the interleave count. - IC = CM.selectInterleaveCount(OptForSize, VF.Width, VF.Cost); + IC = CM.selectInterleaveCount(VF.Width, VF.Cost); } // Identify the diagnostic messages that should be produced. @@ -7609,7 +7828,8 @@ bool LoopVectorizePass::runImpl( // The second condition is necessary because, even if the target has no // vector registers, loop vectorization may still enable scalar // interleaving. - if (!TTI->getNumberOfRegisters(true) && TTI->getMaxInterleaveFactor(1) < 2) + if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) && + TTI->getMaxInterleaveFactor(1) < 2) return false; bool Changed = false; diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp index 27a86c0bca91..974eff9974d9 100644 --- a/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -194,10 +194,13 @@ static bool allSameBlock(ArrayRef VL) { return true; } -/// \returns True if all of the values in \p VL are constants. +/// \returns True if all of the values in \p VL are constants (but not +/// globals/constant expressions). static bool allConstant(ArrayRef VL) { + // Constant expressions and globals can't be vectorized like normal integer/FP + // constants. for (Value *i : VL) - if (!isa(i)) + if (!isa(i) || isa(i) || isa(i)) return false; return true; } @@ -486,6 +489,7 @@ namespace slpvectorizer { /// Bottom Up SLP Vectorizer. class BoUpSLP { struct TreeEntry; + struct ScheduleData; public: using ValueList = SmallVector; @@ -614,6 +618,15 @@ public: /// vectorizable. We do not vectorize such trees. bool isTreeTinyAndNotFullyVectorizable() const; + /// Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values + /// can be load combined in the backend. Load combining may not be allowed in + /// the IR optimizer, so we do not want to alter the pattern. For example, + /// partially transforming a scalar bswap() pattern into vector code is + /// effectively impossible for the backend to undo. + /// TODO: If load combining is allowed in the IR optimizer, this analysis + /// may not be necessary. 
+ bool isLoadCombineReductionCandidate(unsigned ReductionOpcode) const; + OptimizationRemarkEmitter *getORE() { return ORE; } /// This structure holds any data we need about the edges being traversed @@ -1117,6 +1130,14 @@ public: #endif }; + /// Checks if the instruction is marked for deletion. + bool isDeleted(Instruction *I) const { return DeletedInstructions.count(I); } + + /// Marks values operands for later deletion by replacing them with Undefs. + void eraseInstructions(ArrayRef AV); + + ~BoUpSLP(); + private: /// Checks if all users of \p I are the part of the vectorization tree. bool areAllUsersVectorized(Instruction *I) const; @@ -1153,8 +1174,7 @@ private: /// Set the Builder insert point to one after the last instruction in /// the bundle - void setInsertPointAfterBundle(ArrayRef VL, - const InstructionsState &S); + void setInsertPointAfterBundle(TreeEntry *E); /// \returns a vector from a collection of scalars in \p VL. Value *Gather(ArrayRef VL, VectorType *Ty); @@ -1220,27 +1240,37 @@ private: /// reordering of operands during buildTree_rec() and vectorizeTree(). SmallVector Operands; + /// The main/alternate instruction. + Instruction *MainOp = nullptr; + Instruction *AltOp = nullptr; + public: /// Set this bundle's \p OpIdx'th operand to \p OpVL. - void setOperand(unsigned OpIdx, ArrayRef OpVL, - ArrayRef ReuseShuffleIndices) { + void setOperand(unsigned OpIdx, ArrayRef OpVL) { if (Operands.size() < OpIdx + 1) Operands.resize(OpIdx + 1); assert(Operands[OpIdx].size() == 0 && "Already resized?"); Operands[OpIdx].resize(Scalars.size()); for (unsigned Lane = 0, E = Scalars.size(); Lane != E; ++Lane) - Operands[OpIdx][Lane] = (!ReuseShuffleIndices.empty()) - ? OpVL[ReuseShuffleIndices[Lane]] - : OpVL[Lane]; - } - - /// If there is a user TreeEntry, then set its operand. - void trySetUserTEOperand(const EdgeInfo &UserTreeIdx, - ArrayRef OpVL, - ArrayRef ReuseShuffleIndices) { - if (UserTreeIdx.UserTE) - UserTreeIdx.UserTE->setOperand(UserTreeIdx.EdgeIdx, OpVL, - ReuseShuffleIndices); + Operands[OpIdx][Lane] = OpVL[Lane]; + } + + /// Set the operands of this bundle in their original order. + void setOperandsInOrder() { + assert(Operands.empty() && "Already initialized?"); + auto *I0 = cast(Scalars[0]); + Operands.resize(I0->getNumOperands()); + unsigned NumLanes = Scalars.size(); + for (unsigned OpIdx = 0, NumOperands = I0->getNumOperands(); + OpIdx != NumOperands; ++OpIdx) { + Operands[OpIdx].resize(NumLanes); + for (unsigned Lane = 0; Lane != NumLanes; ++Lane) { + auto *I = cast(Scalars[Lane]); + assert(I->getNumOperands() == NumOperands && + "Expected same number of operands"); + Operands[OpIdx][Lane] = I->getOperand(OpIdx); + } + } } /// \returns the \p OpIdx operand of this TreeEntry. @@ -1249,6 +1279,9 @@ private: return Operands[OpIdx]; } + /// \returns the number of operands. + unsigned getNumOperands() const { return Operands.size(); } + /// \return the single \p OpIdx operand. Value *getSingleOperand(unsigned OpIdx) const { assert(OpIdx < Operands.size() && "Off bounds"); @@ -1256,6 +1289,58 @@ private: return Operands[OpIdx][0]; } + /// Some of the instructions in the list have alternate opcodes. + bool isAltShuffle() const { + return getOpcode() != getAltOpcode(); + } + + bool isOpcodeOrAlt(Instruction *I) const { + unsigned CheckedOpcode = I->getOpcode(); + return (getOpcode() == CheckedOpcode || + getAltOpcode() == CheckedOpcode); + } + + /// Chooses the correct key for scheduling data. 
If \p Op has the same (or + /// alternate) opcode as \p OpValue, the key is \p Op. Otherwise the key is + /// \p OpValue. + Value *isOneOf(Value *Op) const { + auto *I = dyn_cast(Op); + if (I && isOpcodeOrAlt(I)) + return Op; + return MainOp; + } + + void setOperations(const InstructionsState &S) { + MainOp = S.MainOp; + AltOp = S.AltOp; + } + + Instruction *getMainOp() const { + return MainOp; + } + + Instruction *getAltOp() const { + return AltOp; + } + + /// The main/alternate opcodes for the list of instructions. + unsigned getOpcode() const { + return MainOp ? MainOp->getOpcode() : 0; + } + + unsigned getAltOpcode() const { + return AltOp ? AltOp->getOpcode() : 0; + } + + /// Update operations state of this entry if reorder occurred. + bool updateStateIfReorder() { + if (ReorderIndices.empty()) + return false; + InstructionsState S = getSameOpcode(Scalars, ReorderIndices.front()); + setOperations(S); + return true; + } + #ifndef NDEBUG /// Debug printer. LLVM_DUMP_METHOD void dump() const { @@ -1269,6 +1354,8 @@ private: for (Value *V : Scalars) dbgs().indent(2) << *V << "\n"; dbgs() << "NeedToGather: " << NeedToGather << "\n"; + dbgs() << "MainOp: " << *MainOp << "\n"; + dbgs() << "AltOp: " << *AltOp << "\n"; dbgs() << "VectorizedValue: "; if (VectorizedValue) dbgs() << *VectorizedValue; @@ -1279,12 +1366,12 @@ private: if (ReuseShuffleIndices.empty()) dbgs() << "Emtpy"; else - for (unsigned Idx : ReuseShuffleIndices) - dbgs() << Idx << ", "; + for (unsigned ReuseIdx : ReuseShuffleIndices) + dbgs() << ReuseIdx << ", "; dbgs() << "\n"; dbgs() << "ReorderIndices: "; - for (unsigned Idx : ReorderIndices) - dbgs() << Idx << ", "; + for (unsigned ReorderIdx : ReorderIndices) + dbgs() << ReorderIdx << ", "; dbgs() << "\n"; dbgs() << "UserTreeIndices: "; for (const auto &EInfo : UserTreeIndices) @@ -1295,11 +1382,13 @@ private: }; /// Create a new VectorizableTree entry. - TreeEntry *newTreeEntry(ArrayRef VL, bool Vectorized, + TreeEntry *newTreeEntry(ArrayRef VL, Optional Bundle, + const InstructionsState &S, const EdgeInfo &UserTreeIdx, ArrayRef ReuseShuffleIndices = None, ArrayRef ReorderIndices = None) { - VectorizableTree.push_back(llvm::make_unique(VectorizableTree)); + bool Vectorized = (bool)Bundle; + VectorizableTree.push_back(std::make_unique(VectorizableTree)); TreeEntry *Last = VectorizableTree.back().get(); Last->Idx = VectorizableTree.size() - 1; Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end()); @@ -1307,11 +1396,22 @@ private: Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(), ReuseShuffleIndices.end()); Last->ReorderIndices = ReorderIndices; + Last->setOperations(S); if (Vectorized) { for (int i = 0, e = VL.size(); i != e; ++i) { assert(!getTreeEntry(VL[i]) && "Scalar already in tree!"); - ScalarToTreeEntry[VL[i]] = Last->Idx; - } + ScalarToTreeEntry[VL[i]] = Last; + } + // Update the scheduler bundle to point to this TreeEntry. 
+ unsigned Lane = 0; + for (ScheduleData *BundleMember = Bundle.getValue(); BundleMember; + BundleMember = BundleMember->NextInBundle) { + BundleMember->TE = Last; + BundleMember->Lane = Lane; + ++Lane; + } + assert((!Bundle.getValue() || Lane == VL.size()) && + "Bundle and VL out of sync"); } else { MustGather.insert(VL.begin(), VL.end()); } @@ -1319,7 +1419,6 @@ private: if (UserTreeIdx.UserTE) Last->UserTreeIndices.push_back(UserTreeIdx); - Last->trySetUserTEOperand(UserTreeIdx, VL, ReuseShuffleIndices); return Last; } @@ -1340,19 +1439,19 @@ private: TreeEntry *getTreeEntry(Value *V) { auto I = ScalarToTreeEntry.find(V); if (I != ScalarToTreeEntry.end()) - return VectorizableTree[I->second].get(); + return I->second; return nullptr; } const TreeEntry *getTreeEntry(Value *V) const { auto I = ScalarToTreeEntry.find(V); if (I != ScalarToTreeEntry.end()) - return VectorizableTree[I->second].get(); + return I->second; return nullptr; } /// Maps a specific scalar to its tree entry. - SmallDenseMap ScalarToTreeEntry; + SmallDenseMap ScalarToTreeEntry; /// A list of scalars that we found that we need to keep as scalars. ValueSet MustGather; @@ -1408,15 +1507,14 @@ private: /// This is required to ensure that there are no incorrect collisions in the /// AliasCache, which can happen if a new instruction is allocated at the /// same address as a previously deleted instruction. - void eraseInstruction(Instruction *I) { - I->removeFromParent(); - I->dropAllReferences(); - DeletedInstructions.emplace_back(I); + void eraseInstruction(Instruction *I, bool ReplaceOpsWithUndef = false) { + auto It = DeletedInstructions.try_emplace(I, ReplaceOpsWithUndef).first; + It->getSecond() = It->getSecond() && ReplaceOpsWithUndef; } /// Temporary store for deleted instructions. Instructions will be deleted /// eventually when the BoUpSLP is destructed. - SmallVector DeletedInstructions; + DenseMap DeletedInstructions; /// A list of values that need to extracted out of the tree. /// This list holds pairs of (Internal Scalar : External User). External User @@ -1453,6 +1551,8 @@ private: UnscheduledDepsInBundle = UnscheduledDeps; clearDependencies(); OpValue = OpVal; + TE = nullptr; + Lane = -1; } /// Returns true if the dependency information has been calculated. @@ -1559,6 +1659,12 @@ private: /// Opcode of the current instruction in the schedule data. Value *OpValue = nullptr; + + /// The TreeEntry that this instruction corresponds to. + TreeEntry *TE = nullptr; + + /// The lane of this node in the TreeEntry. + int Lane = -1; }; #ifndef NDEBUG @@ -1633,10 +1739,9 @@ private: continue; } // Handle the def-use chain dependencies. - for (Use &U : BundleMember->Inst->operands()) { - auto *I = dyn_cast(U.get()); - if (!I) - continue; + + // Decrement the unscheduled counter and insert to ready list if ready. + auto &&DecrUnsched = [this, &ReadyList](Instruction *I) { doForAllOpcodes(I, [&ReadyList](ScheduleData *OpDef) { if (OpDef && OpDef->hasValidDependencies() && OpDef->incrementUnscheduledDeps(-1) == 0) { @@ -1651,6 +1756,24 @@ private: << "SLP: gets ready (def): " << *DepBundle << "\n"); } }); + }; + + // If BundleMember is a vector bundle, its operands may have been + // reordered duiring buildTree(). We therefore need to get its operands + // through the TreeEntry. 
+ if (TreeEntry *TE = BundleMember->TE) { + int Lane = BundleMember->Lane; + assert(Lane >= 0 && "Lane not set"); + for (unsigned OpIdx = 0, NumOperands = TE->getNumOperands(); + OpIdx != NumOperands; ++OpIdx) + if (auto *I = dyn_cast(TE->getOperand(OpIdx)[Lane])) + DecrUnsched(I); + } else { + // If BundleMember is a stand-alone instruction, no operand reordering + // has taken place, so we directly access its operands. + for (Use &U : BundleMember->Inst->operands()) + if (auto *I = dyn_cast(U.get())) + DecrUnsched(I); } // Handle the memory dependencies. for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) { @@ -1697,8 +1820,11 @@ private: /// Checks if a bundle of instructions can be scheduled, i.e. has no /// cyclic dependencies. This is only a dry-run, no instructions are /// actually moved at this stage. - bool tryScheduleBundle(ArrayRef VL, BoUpSLP *SLP, - const InstructionsState &S); + /// \returns the scheduling bundle. The returned Optional value is non-None + /// if \p VL is allowed to be scheduled. + Optional + tryScheduleBundle(ArrayRef VL, BoUpSLP *SLP, + const InstructionsState &S); /// Un-bundles a group of instructions. void cancelScheduling(ArrayRef VL, Value *OpValue); @@ -1945,6 +2071,30 @@ template <> struct DOTGraphTraits : public DefaultDOTGraphTraits { } // end namespace llvm +BoUpSLP::~BoUpSLP() { + for (const auto &Pair : DeletedInstructions) { + // Replace operands of ignored instructions with Undefs in case if they were + // marked for deletion. + if (Pair.getSecond()) { + Value *Undef = UndefValue::get(Pair.getFirst()->getType()); + Pair.getFirst()->replaceAllUsesWith(Undef); + } + Pair.getFirst()->dropAllReferences(); + } + for (const auto &Pair : DeletedInstructions) { + assert(Pair.getFirst()->use_empty() && + "trying to erase instruction with users."); + Pair.getFirst()->eraseFromParent(); + } +} + +void BoUpSLP::eraseInstructions(ArrayRef AV) { + for (auto *V : AV) { + if (auto *I = dyn_cast(V)) + eraseInstruction(I, /*ReplaceWithUndef=*/true); + }; +} + void BoUpSLP::buildTree(ArrayRef Roots, ArrayRef UserIgnoreLst) { ExtraValueToDebugLocsMap ExternallyUsedValues; @@ -2026,28 +2176,28 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, InstructionsState S = getSameOpcode(VL); if (Depth == RecursionMaxDepth) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx); return; } // Don't handle vectors. if (S.OpValue->getType()->isVectorTy()) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to vector type.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx); return; } if (StoreInst *SI = dyn_cast(S.OpValue)) if (SI->getValueOperand()->getType()->isVectorTy()) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx); return; } // If all of the operands are identical or constant we have a simple solution. if (allConstant(VL) || isSplat(VL) || !allSameBlock(VL) || !S.getOpcode()) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. \n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx); return; } @@ -2055,11 +2205,11 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, // the same block. // Don't vectorize ephemeral values. 
- for (unsigned i = 0, e = VL.size(); i != e; ++i) { - if (EphValues.count(VL[i])) { - LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] + for (Value *V : VL) { + if (EphValues.count(V)) { + LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V << ") is ephemeral.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx); return; } } @@ -2069,7 +2219,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.OpValue << ".\n"); if (!E->isSame(VL)) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx); return; } // Record the reuse of the tree node. FIXME, currently this is only used to @@ -2077,19 +2227,18 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, E->UserTreeIndices.push_back(UserTreeIdx); LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.OpValue << ".\n"); - E->trySetUserTEOperand(UserTreeIdx, VL, None); return; } // Check that none of the instructions in the bundle are already in the tree. - for (unsigned i = 0, e = VL.size(); i != e; ++i) { - auto *I = dyn_cast(VL[i]); + for (Value *V : VL) { + auto *I = dyn_cast(V); if (!I) continue; if (getTreeEntry(I)) { - LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] + LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V << ") is already in tree.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx); return; } } @@ -2097,10 +2246,10 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, // If any of the scalars is marked as a value that needs to stay scalar, then // we need to gather the scalars. // The reduction nodes (stored in UserIgnoreList) also should stay scalar. - for (unsigned i = 0, e = VL.size(); i != e; ++i) { - if (MustGather.count(VL[i]) || is_contained(UserIgnoreList, VL[i])) { + for (Value *V : VL) { + if (MustGather.count(V) || is_contained(UserIgnoreList, V)) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx); return; } } @@ -2114,7 +2263,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, // Don't go into unreachable blocks. They may contain instructions with // dependency cycles which confuse the final scheduling. 
LLVM_DEBUG(dbgs() << "SLP: bundle in unreachable block.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx); return; } @@ -2128,13 +2277,15 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, if (Res.second) UniqueValues.emplace_back(V); } - if (UniqueValues.size() == VL.size()) { + size_t NumUniqueScalarValues = UniqueValues.size(); + if (NumUniqueScalarValues == VL.size()) { ReuseShuffleIndicies.clear(); } else { LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n"); - if (UniqueValues.size() <= 1 || !llvm::isPowerOf2_32(UniqueValues.size())) { + if (NumUniqueScalarValues <= 1 || + !llvm::isPowerOf2_32(NumUniqueScalarValues)) { LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx); return; } VL = UniqueValues; @@ -2142,16 +2293,18 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, auto &BSRef = BlocksSchedules[BB]; if (!BSRef) - BSRef = llvm::make_unique(BB); + BSRef = std::make_unique(BB); BlockScheduling &BS = *BSRef.get(); - if (!BS.tryScheduleBundle(VL, this, S)) { + Optional Bundle = BS.tryScheduleBundle(VL, this, S); + if (!Bundle) { LLVM_DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n"); assert((!BS.getScheduleData(VL0) || !BS.getScheduleData(VL0)->isPartOfBundle()) && "tryScheduleBundle should cancelScheduling on failure"); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); return; } LLVM_DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n"); @@ -2160,7 +2313,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, (unsigned) Instruction::ShuffleVector : S.getOpcode(); switch (ShuffleOrOp) { case Instruction::PHI: { - PHINode *PH = dyn_cast(VL0); + auto *PH = cast(VL0); // Check for terminator values (e.g. invoke). for (unsigned j = 0; j < VL.size(); ++j) @@ -2172,23 +2325,29 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, LLVM_DEBUG(dbgs() << "SLP: Need to swizzle PHINodes (terminator use).\n"); BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); return; } } - auto *TE = newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + TreeEntry *TE = + newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n"); + // Keeps the reordered operands to avoid code duplication. + SmallVector OperandsVec; for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) { ValueList Operands; // Prepare the operand vector. 
for (Value *j : VL) Operands.push_back(cast(j)->getIncomingValueForBlock( PH->getIncomingBlock(i))); - - buildTree_rec(Operands, Depth + 1, {TE, i}); + TE->setOperand(i, Operands); + OperandsVec.push_back(Operands); } + for (unsigned OpIdx = 0, OpE = OperandsVec.size(); OpIdx != OpE; ++OpIdx) + buildTree_rec(OperandsVec[OpIdx], Depth + 1, {TE, OpIdx}); return; } case Instruction::ExtractValue: @@ -2198,13 +2357,13 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, if (Reuse) { LLVM_DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n"); ++NumOpsWantToKeepOriginalOrder; - newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx, + newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, ReuseShuffleIndicies); // This is a special case, as it does not gather, but at the same time // we are not extending buildTree_rec() towards the operands. ValueList Op0; Op0.assign(VL.size(), VL0->getOperand(0)); - VectorizableTree.back()->setOperand(0, Op0, ReuseShuffleIndicies); + VectorizableTree.back()->setOperand(0, Op0); return; } if (!CurrentOrder.empty()) { @@ -2220,17 +2379,19 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, auto StoredCurrentOrderAndNum = NumOpsWantToKeepOrder.try_emplace(CurrentOrder).first; ++StoredCurrentOrderAndNum->getSecond(); - newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx, ReuseShuffleIndicies, + newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies, StoredCurrentOrderAndNum->getFirst()); // This is a special case, as it does not gather, but at the same time // we are not extending buildTree_rec() towards the operands. ValueList Op0; Op0.assign(VL.size(), VL0->getOperand(0)); - VectorizableTree.back()->setOperand(0, Op0, ReuseShuffleIndicies); + VectorizableTree.back()->setOperand(0, Op0); return; } LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n"); - newTreeEntry(VL, /*Vectorized=*/false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); BS.cancelScheduling(VL, VL0); return; } @@ -2246,7 +2407,8 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, if (DL->getTypeSizeInBits(ScalarTy) != DL->getTypeAllocSizeInBits(ScalarTy)) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n"); return; } @@ -2259,7 +2421,8 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, auto *L = cast(V); if (!L->isSimple()) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n"); return; } @@ -2289,15 +2452,18 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, if (CurrentOrder.empty()) { // Original loads are consecutive and does not require reordering. ++NumOpsWantToKeepOriginalOrder; - newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx, - ReuseShuffleIndicies); + TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, + UserTreeIdx, ReuseShuffleIndicies); + TE->setOperandsInOrder(); LLVM_DEBUG(dbgs() << "SLP: added a vector of loads.\n"); } else { // Need to reorder. 
auto I = NumOpsWantToKeepOrder.try_emplace(CurrentOrder).first; ++I->getSecond(); - newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx, - ReuseShuffleIndicies, I->getFirst()); + TreeEntry *TE = + newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies, I->getFirst()); + TE->setOperandsInOrder(); LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled loads.\n"); } return; @@ -2306,7 +2472,8 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n"); BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); return; } case Instruction::ZExt: @@ -2322,24 +2489,27 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, case Instruction::FPTrunc: case Instruction::BitCast: { Type *SrcTy = VL0->getOperand(0)->getType(); - for (unsigned i = 0; i < VL.size(); ++i) { - Type *Ty = cast(VL[i])->getOperand(0)->getType(); + for (Value *V : VL) { + Type *Ty = cast(V)->getOperand(0)->getType(); if (Ty != SrcTy || !isValidElementType(Ty)) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: Gathering casts with different src types.\n"); return; } } - auto *TE = newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a vector of casts.\n"); + TE->setOperandsInOrder(); for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { ValueList Operands; // Prepare the operand vector. - for (Value *j : VL) - Operands.push_back(cast(j)->getOperand(i)); + for (Value *V : VL) + Operands.push_back(cast(V)->getOperand(i)); buildTree_rec(Operands, Depth + 1, {TE, i}); } @@ -2351,19 +2521,21 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, CmpInst::Predicate P0 = cast(VL0)->getPredicate(); CmpInst::Predicate SwapP0 = CmpInst::getSwappedPredicate(P0); Type *ComparedTy = VL0->getOperand(0)->getType(); - for (unsigned i = 1, e = VL.size(); i < e; ++i) { - CmpInst *Cmp = cast(VL[i]); + for (Value *V : VL) { + CmpInst *Cmp = cast(V); if ((Cmp->getPredicate() != P0 && Cmp->getPredicate() != SwapP0) || Cmp->getOperand(0)->getType() != ComparedTy) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n"); return; } } - auto *TE = newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a vector of compares.\n"); ValueList Left, Right; @@ -2384,7 +2556,8 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, Right.push_back(RHS); } } - + TE->setOperand(0, Left); + TE->setOperand(1, Right); buildTree_rec(Left, Depth + 1, {TE, 0}); buildTree_rec(Right, Depth + 1, {TE, 1}); return; @@ -2409,7 +2582,8 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, case Instruction::And: case Instruction::Or: case Instruction::Xor: { - auto *TE = newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); 
LLVM_DEBUG(dbgs() << "SLP: added a vector of un/bin op.\n"); // Sort operands of the instructions so that each side is more likely to @@ -2417,11 +2591,14 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, if (isa(VL0) && VL0->isCommutative()) { ValueList Left, Right; reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE); + TE->setOperand(0, Left); + TE->setOperand(1, Right); buildTree_rec(Left, Depth + 1, {TE, 0}); buildTree_rec(Right, Depth + 1, {TE, 1}); return; } + TE->setOperandsInOrder(); for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { ValueList Operands; // Prepare the operand vector. @@ -2434,11 +2611,12 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, } case Instruction::GetElementPtr: { // We don't combine GEPs with complicated (nested) indexing. - for (unsigned j = 0; j < VL.size(); ++j) { - if (cast(VL[j])->getNumOperands() != 2) { + for (Value *V : VL) { + if (cast(V)->getNumOperands() != 2) { LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n"); BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); return; } } @@ -2446,58 +2624,64 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, // We can't combine several GEPs into one vector if they operate on // different types. Type *Ty0 = VL0->getOperand(0)->getType(); - for (unsigned j = 0; j < VL.size(); ++j) { - Type *CurTy = cast(VL[j])->getOperand(0)->getType(); + for (Value *V : VL) { + Type *CurTy = cast(V)->getOperand(0)->getType(); if (Ty0 != CurTy) { LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n"); BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); return; } } // We don't combine GEPs with non-constant indexes. - for (unsigned j = 0; j < VL.size(); ++j) { - auto Op = cast(VL[j])->getOperand(1); + for (Value *V : VL) { + auto Op = cast(V)->getOperand(1); if (!isa(Op)) { LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n"); BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); return; } } - auto *TE = newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a vector of GEPs.\n"); + TE->setOperandsInOrder(); for (unsigned i = 0, e = 2; i < e; ++i) { ValueList Operands; // Prepare the operand vector. - for (Value *j : VL) - Operands.push_back(cast(j)->getOperand(i)); + for (Value *V : VL) + Operands.push_back(cast(V)->getOperand(i)); buildTree_rec(Operands, Depth + 1, {TE, i}); } return; } case Instruction::Store: { - // Check if the stores are consecutive or of we need to swizzle them. + // Check if the stores are consecutive or if we need to swizzle them. 
for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) if (!isConsecutiveAccess(VL[i], VL[i + 1], *DL, *SE)) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: Non-consecutive store.\n"); return; } - auto *TE = newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a vector of stores.\n"); ValueList Operands; - for (Value *j : VL) - Operands.push_back(cast(j)->getOperand(0)); - + for (Value *V : VL) + Operands.push_back(cast(V)->getOperand(0)); + TE->setOperandsInOrder(); buildTree_rec(Operands, Depth + 1, {TE, 0}); return; } @@ -2509,7 +2693,8 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); if (!isTriviallyVectorizable(ID)) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: Non-vectorizable call.\n"); return; } @@ -2519,14 +2704,15 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, for (unsigned j = 0; j != NumArgs; ++j) if (hasVectorInstrinsicScalarOpd(ID, j)) ScalarArgs[j] = CI->getArgOperand(j); - for (unsigned i = 1, e = VL.size(); i != e; ++i) { - CallInst *CI2 = dyn_cast(VL[i]); + for (Value *V : VL) { + CallInst *CI2 = dyn_cast(V); if (!CI2 || CI2->getCalledFunction() != Int || getVectorIntrinsicIDForCall(CI2, TLI) != ID || !CI->hasIdenticalOperandBundleSchema(*CI2)) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); - LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *VL[i] + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); + LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *V << "\n"); return; } @@ -2537,7 +2723,8 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, Value *A1J = CI2->getArgOperand(j); if (ScalarArgs[j] != A1J) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: mismatched arguments in call:" << *CI << " argument " << ScalarArgs[j] << "!=" << A1J << "\n"); @@ -2551,19 +2738,22 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, CI->op_begin() + CI->getBundleOperandsEndIndex(), CI2->op_begin() + CI2->getBundleOperandsStartIndex())) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" - << *CI << "!=" << *VL[i] << '\n'); + << *CI << "!=" << *V << '\n'); return; } } - auto *TE = newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); + TE->setOperandsInOrder(); for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) { ValueList Operands; // Prepare the operand vector. 
- for (Value *j : VL) { - CallInst *CI2 = dyn_cast(j); + for (Value *V : VL) { + auto *CI2 = cast(V); Operands.push_back(CI2->getArgOperand(i)); } buildTree_rec(Operands, Depth + 1, {TE, i}); @@ -2575,27 +2765,32 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, // then do not vectorize this instruction. if (!S.isAltShuffle()) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n"); return; } - auto *TE = newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n"); // Reorder operands if reordering would enable vectorization. if (isa(VL0)) { ValueList Left, Right; reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE); + TE->setOperand(0, Left); + TE->setOperand(1, Right); buildTree_rec(Left, Depth + 1, {TE, 0}); buildTree_rec(Right, Depth + 1, {TE, 1}); return; } + TE->setOperandsInOrder(); for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { ValueList Operands; // Prepare the operand vector. - for (Value *j : VL) - Operands.push_back(cast(j)->getOperand(i)); + for (Value *V : VL) + Operands.push_back(cast(V)->getOperand(i)); buildTree_rec(Operands, Depth + 1, {TE, i}); } @@ -2603,7 +2798,8 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, } default: BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n"); return; } @@ -2738,7 +2934,7 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { return ReuseShuffleCost + TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, 0); } - if (getSameOpcode(VL).getOpcode() == Instruction::ExtractElement && + if (E->getOpcode() == Instruction::ExtractElement && allSameType(VL) && allSameBlock(VL)) { Optional ShuffleKind = isShuffle(VL); if (ShuffleKind.hasValue()) { @@ -2761,11 +2957,10 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { } return ReuseShuffleCost + getGatherCost(VL); } - InstructionsState S = getSameOpcode(VL); - assert(S.getOpcode() && allSameType(VL) && allSameBlock(VL) && "Invalid VL"); - Instruction *VL0 = cast(S.OpValue); - unsigned ShuffleOrOp = S.isAltShuffle() ? - (unsigned) Instruction::ShuffleVector : S.getOpcode(); + assert(E->getOpcode() && allSameType(VL) && allSameBlock(VL) && "Invalid VL"); + Instruction *VL0 = E->getMainOp(); + unsigned ShuffleOrOp = + E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode(); switch (ShuffleOrOp) { case Instruction::PHI: return 0; @@ -2851,7 +3046,7 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { case Instruction::BitCast: { Type *SrcTy = VL0->getOperand(0)->getType(); int ScalarEltCost = - TTI->getCastInstrCost(S.getOpcode(), ScalarTy, SrcTy, VL0); + TTI->getCastInstrCost(E->getOpcode(), ScalarTy, SrcTy, VL0); if (NeedToShuffleReuses) { ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; } @@ -2864,7 +3059,7 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { // Check if the values are candidates to demote. 
if (!MinBWs.count(VL0) || VecTy != SrcVecTy) { VecCost = ReuseShuffleCost + - TTI->getCastInstrCost(S.getOpcode(), VecTy, SrcVecTy, VL0); + TTI->getCastInstrCost(E->getOpcode(), VecTy, SrcVecTy, VL0); } return VecCost - ScalarCost; } @@ -2872,14 +3067,14 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { case Instruction::ICmp: case Instruction::Select: { // Calculate the cost of this instruction. - int ScalarEltCost = TTI->getCmpSelInstrCost(S.getOpcode(), ScalarTy, + int ScalarEltCost = TTI->getCmpSelInstrCost(E->getOpcode(), ScalarTy, Builder.getInt1Ty(), VL0); if (NeedToShuffleReuses) { ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; } VectorType *MaskTy = VectorType::get(Builder.getInt1Ty(), VL.size()); int ScalarCost = VecTy->getNumElements() * ScalarEltCost; - int VecCost = TTI->getCmpSelInstrCost(S.getOpcode(), VecTy, MaskTy, VL0); + int VecCost = TTI->getCmpSelInstrCost(E->getOpcode(), VecTy, MaskTy, VL0); return ReuseShuffleCost + VecCost - ScalarCost; } case Instruction::FNeg: @@ -2940,12 +3135,12 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { SmallVector Operands(VL0->operand_values()); int ScalarEltCost = TTI->getArithmeticInstrCost( - S.getOpcode(), ScalarTy, Op1VK, Op2VK, Op1VP, Op2VP, Operands); + E->getOpcode(), ScalarTy, Op1VK, Op2VK, Op1VP, Op2VP, Operands); if (NeedToShuffleReuses) { ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; } int ScalarCost = VecTy->getNumElements() * ScalarEltCost; - int VecCost = TTI->getArithmeticInstrCost(S.getOpcode(), VecTy, Op1VK, + int VecCost = TTI->getArithmeticInstrCost(E->getOpcode(), VecTy, Op1VK, Op2VK, Op1VP, Op2VP, Operands); return ReuseShuffleCost + VecCost - ScalarCost; } @@ -3027,11 +3222,11 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { return ReuseShuffleCost + VecCallCost - ScalarCallCost; } case Instruction::ShuffleVector: { - assert(S.isAltShuffle() && - ((Instruction::isBinaryOp(S.getOpcode()) && - Instruction::isBinaryOp(S.getAltOpcode())) || - (Instruction::isCast(S.getOpcode()) && - Instruction::isCast(S.getAltOpcode()))) && + assert(E->isAltShuffle() && + ((Instruction::isBinaryOp(E->getOpcode()) && + Instruction::isBinaryOp(E->getAltOpcode())) || + (Instruction::isCast(E->getOpcode()) && + Instruction::isCast(E->getAltOpcode()))) && "Invalid Shuffle Vector Operand"); int ScalarCost = 0; if (NeedToShuffleReuses) { @@ -3046,25 +3241,25 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { I, TargetTransformInfo::TCK_RecipThroughput); } } - for (Value *i : VL) { - Instruction *I = cast(i); - assert(S.isOpcodeOrAlt(I) && "Unexpected main/alternate opcode"); + for (Value *V : VL) { + Instruction *I = cast(V); + assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode"); ScalarCost += TTI->getInstructionCost( I, TargetTransformInfo::TCK_RecipThroughput); } // VecCost is equal to sum of the cost of creating 2 vectors // and the cost of creating shuffle. 
int VecCost = 0; - if (Instruction::isBinaryOp(S.getOpcode())) { - VecCost = TTI->getArithmeticInstrCost(S.getOpcode(), VecTy); - VecCost += TTI->getArithmeticInstrCost(S.getAltOpcode(), VecTy); + if (Instruction::isBinaryOp(E->getOpcode())) { + VecCost = TTI->getArithmeticInstrCost(E->getOpcode(), VecTy); + VecCost += TTI->getArithmeticInstrCost(E->getAltOpcode(), VecTy); } else { - Type *Src0SclTy = S.MainOp->getOperand(0)->getType(); - Type *Src1SclTy = S.AltOp->getOperand(0)->getType(); + Type *Src0SclTy = E->getMainOp()->getOperand(0)->getType(); + Type *Src1SclTy = E->getAltOp()->getOperand(0)->getType(); VectorType *Src0Ty = VectorType::get(Src0SclTy, VL.size()); VectorType *Src1Ty = VectorType::get(Src1SclTy, VL.size()); - VecCost = TTI->getCastInstrCost(S.getOpcode(), VecTy, Src0Ty); - VecCost += TTI->getCastInstrCost(S.getAltOpcode(), VecTy, Src1Ty); + VecCost = TTI->getCastInstrCost(E->getOpcode(), VecTy, Src0Ty); + VecCost += TTI->getCastInstrCost(E->getAltOpcode(), VecTy, Src1Ty); } VecCost += TTI->getShuffleCost(TargetTransformInfo::SK_Select, VecTy, 0); return ReuseShuffleCost + VecCost - ScalarCost; @@ -3098,6 +3293,43 @@ bool BoUpSLP::isFullyVectorizableTinyTree() const { return true; } +bool BoUpSLP::isLoadCombineReductionCandidate(unsigned RdxOpcode) const { + if (RdxOpcode != Instruction::Or) + return false; + + unsigned NumElts = VectorizableTree[0]->Scalars.size(); + Value *FirstReduced = VectorizableTree[0]->Scalars[0]; + + // Look past the reduction to find a source value. Arbitrarily follow the + // path through operand 0 of any 'or'. Also, peek through optional + // shift-left-by-constant. + Value *ZextLoad = FirstReduced; + while (match(ZextLoad, m_Or(m_Value(), m_Value())) || + match(ZextLoad, m_Shl(m_Value(), m_Constant()))) + ZextLoad = cast(ZextLoad)->getOperand(0); + + // Check if the input to the reduction is an extended load. + Value *LoadPtr; + if (!match(ZextLoad, m_ZExt(m_Load(m_Value(LoadPtr))))) + return false; + + // Require that the total load bit width is a legal integer type. + // For example, <8 x i8> --> i64 is a legal integer on a 64-bit target. + // But <16 x i8> --> i128 is not, so the backend probably can't reduce it. + Type *SrcTy = LoadPtr->getType()->getPointerElementType(); + unsigned LoadBitWidth = SrcTy->getIntegerBitWidth() * NumElts; + LLVMContext &Context = FirstReduced->getContext(); + if (!TTI->isTypeLegal(IntegerType::get(Context, LoadBitWidth))) + return false; + + // Everything matched - assume that we can fold the whole sequence using + // load combining. + LLVM_DEBUG(dbgs() << "SLP: Assume load combining for scalar reduction of " + << *(cast(FirstReduced)) << "\n"); + + return true; +} + bool BoUpSLP::isTreeTinyAndNotFullyVectorizable() const { // We can vectorize the tree if its size is greater than or equal to the // minimum size specified by the MinTreeSize command line option. @@ -3319,16 +3551,16 @@ void BoUpSLP::reorderInputsAccordingToOpcode( Right = Ops.getVL(1); } -void BoUpSLP::setInsertPointAfterBundle(ArrayRef VL, - const InstructionsState &S) { +void BoUpSLP::setInsertPointAfterBundle(TreeEntry *E) { // Get the basic block this bundle is in. All instructions in the bundle // should be in this block. 
- auto *Front = cast(S.OpValue); + auto *Front = E->getMainOp(); auto *BB = Front->getParent(); - assert(llvm::all_of(make_range(VL.begin(), VL.end()), [=](Value *V) -> bool { - auto *I = cast(V); - return !S.isOpcodeOrAlt(I) || I->getParent() == BB; - })); + assert(llvm::all_of(make_range(E->Scalars.begin(), E->Scalars.end()), + [=](Value *V) -> bool { + auto *I = cast(V); + return !E->isOpcodeOrAlt(I) || I->getParent() == BB; + })); // The last instruction in the bundle in program order. Instruction *LastInst = nullptr; @@ -3339,7 +3571,7 @@ void BoUpSLP::setInsertPointAfterBundle(ArrayRef VL, // bundle. The end of the bundle is marked by null ScheduleData. if (BlocksSchedules.count(BB)) { auto *Bundle = - BlocksSchedules[BB]->getScheduleData(isOneOf(S, VL.back())); + BlocksSchedules[BB]->getScheduleData(E->isOneOf(E->Scalars.back())); if (Bundle && Bundle->isPartOfBundle()) for (; Bundle; Bundle = Bundle->NextInBundle) if (Bundle->OpValue == Bundle->Inst) @@ -3365,14 +3597,15 @@ void BoUpSLP::setInsertPointAfterBundle(ArrayRef VL, // we both exit early from buildTree_rec and that the bundle be out-of-order // (causing us to iterate all the way to the end of the block). if (!LastInst) { - SmallPtrSet Bundle(VL.begin(), VL.end()); + SmallPtrSet Bundle(E->Scalars.begin(), E->Scalars.end()); for (auto &I : make_range(BasicBlock::iterator(Front), BB->end())) { - if (Bundle.erase(&I) && S.isOpcodeOrAlt(&I)) + if (Bundle.erase(&I) && E->isOpcodeOrAlt(&I)) LastInst = &I; if (Bundle.empty()) break; } } + assert(LastInst && "Failed to find last instruction in bundle"); // Set the insertion point after the last instruction in the bundle. Set the // debug location to Front. @@ -3385,7 +3618,7 @@ Value *BoUpSLP::Gather(ArrayRef VL, VectorType *Ty) { // Generate the 'InsertElement' instruction. for (unsigned i = 0; i < Ty->getNumElements(); ++i) { Vec = Builder.CreateInsertElement(Vec, VL[i], Builder.getInt32(i)); - if (Instruction *Insrt = dyn_cast(Vec)) { + if (auto *Insrt = dyn_cast(Vec)) { GatherSeq.insert(Insrt); CSEBlocks.insert(Insrt->getParent()); @@ -3494,8 +3727,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { return E->VectorizedValue; } - InstructionsState S = getSameOpcode(E->Scalars); - Instruction *VL0 = cast(S.OpValue); + Instruction *VL0 = E->getMainOp(); Type *ScalarTy = VL0->getType(); if (StoreInst *SI = dyn_cast(VL0)) ScalarTy = SI->getValueOperand()->getType(); @@ -3504,7 +3736,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty(); if (E->NeedToGather) { - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E); auto *V = Gather(E->Scalars, VecTy); if (NeedToShuffleReuses) { V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), @@ -3518,11 +3750,11 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { return V; } - unsigned ShuffleOrOp = S.isAltShuffle() ? - (unsigned) Instruction::ShuffleVector : S.getOpcode(); + unsigned ShuffleOrOp = + E->isAltShuffle() ? 
(unsigned)Instruction::ShuffleVector : E->getOpcode(); switch (ShuffleOrOp) { case Instruction::PHI: { - PHINode *PH = dyn_cast(VL0); + auto *PH = cast(VL0); Builder.SetInsertPoint(PH->getParent()->getFirstNonPHI()); Builder.SetCurrentDebugLocation(PH->getDebugLoc()); PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues()); @@ -3577,7 +3809,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { E->VectorizedValue = V; return V; } - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E); auto *V = Gather(E->Scalars, VecTy); if (NeedToShuffleReuses) { V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), @@ -3612,7 +3844,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { E->VectorizedValue = NewV; return NewV; } - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E); auto *V = Gather(E->Scalars, VecTy); if (NeedToShuffleReuses) { V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), @@ -3637,7 +3869,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { case Instruction::Trunc: case Instruction::FPTrunc: case Instruction::BitCast: { - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E); Value *InVec = vectorizeTree(E->getOperand(0)); @@ -3646,7 +3878,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { return E->VectorizedValue; } - CastInst *CI = dyn_cast(VL0); + auto *CI = cast(VL0); Value *V = Builder.CreateCast(CI->getOpcode(), InVec, VecTy); if (NeedToShuffleReuses) { V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), @@ -3658,7 +3890,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { } case Instruction::FCmp: case Instruction::ICmp: { - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E); Value *L = vectorizeTree(E->getOperand(0)); Value *R = vectorizeTree(E->getOperand(1)); @@ -3670,7 +3902,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { CmpInst::Predicate P0 = cast(VL0)->getPredicate(); Value *V; - if (S.getOpcode() == Instruction::FCmp) + if (E->getOpcode() == Instruction::FCmp) V = Builder.CreateFCmp(P0, L, R); else V = Builder.CreateICmp(P0, L, R); @@ -3685,7 +3917,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { return V; } case Instruction::Select: { - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E); Value *Cond = vectorizeTree(E->getOperand(0)); Value *True = vectorizeTree(E->getOperand(1)); @@ -3706,7 +3938,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { return V; } case Instruction::FNeg: { - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E); Value *Op = vectorizeTree(E->getOperand(0)); @@ -3716,7 +3948,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { } Value *V = Builder.CreateUnOp( - static_cast(S.getOpcode()), Op); + static_cast(E->getOpcode()), Op); propagateIRFlags(V, E->Scalars, VL0); if (auto *I = dyn_cast(V)) V = propagateMetadata(I, E->Scalars); @@ -3748,7 +3980,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { case Instruction::And: case Instruction::Or: case Instruction::Xor: { - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E); Value *LHS = vectorizeTree(E->getOperand(0)); Value *RHS = vectorizeTree(E->getOperand(1)); @@ -3759,7 +3991,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { } Value *V = Builder.CreateBinOp( - static_cast(S.getOpcode()), LHS, RHS); + static_cast(E->getOpcode()), LHS, + RHS); propagateIRFlags(V, E->Scalars, VL0); if (auto *I = dyn_cast(V)) V = propagateMetadata(I, E->Scalars); @@ -3776,12 +4009,10 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { case 
Instruction::Load: { // Loads are inserted at the head of the tree because we don't want to // sink them all the way down past store instructions. - bool IsReorder = !E->ReorderIndices.empty(); - if (IsReorder) { - S = getSameOpcode(E->Scalars, E->ReorderIndices.front()); - VL0 = cast(S.OpValue); - } - setInsertPointAfterBundle(E->Scalars, S); + bool IsReorder = E->updateStateIfReorder(); + if (IsReorder) + VL0 = E->getMainOp(); + setInsertPointAfterBundle(E); LoadInst *LI = cast(VL0); Type *ScalarLoadTy = LI->getType(); @@ -3797,11 +4028,10 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { if (getTreeEntry(PO)) ExternalUses.push_back(ExternalUser(PO, cast(VecPtr), 0)); - unsigned Alignment = LI->getAlignment(); + MaybeAlign Alignment = MaybeAlign(LI->getAlignment()); LI = Builder.CreateLoad(VecTy, VecPtr); - if (!Alignment) { - Alignment = DL->getABITypeAlignment(ScalarLoadTy); - } + if (!Alignment) + Alignment = MaybeAlign(DL->getABITypeAlignment(ScalarLoadTy)); LI->setAlignment(Alignment); Value *V = propagateMetadata(LI, E->Scalars); if (IsReorder) { @@ -3824,7 +4054,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { unsigned Alignment = SI->getAlignment(); unsigned AS = SI->getPointerAddressSpace(); - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E); Value *VecValue = vectorizeTree(E->getOperand(0)); Value *ScalarPtr = SI->getPointerOperand(); @@ -3840,7 +4070,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { if (!Alignment) Alignment = DL->getABITypeAlignment(SI->getValueOperand()->getType()); - ST->setAlignment(Alignment); + ST->setAlignment(Align(Alignment)); Value *V = propagateMetadata(ST, E->Scalars); if (NeedToShuffleReuses) { V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), @@ -3851,7 +4081,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { return V; } case Instruction::GetElementPtr: { - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E); Value *Op0 = vectorizeTree(E->getOperand(0)); @@ -3878,13 +4108,13 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { } case Instruction::Call: { CallInst *CI = cast(VL0); - setInsertPointAfterBundle(E->Scalars, S); - Function *FI; + setInsertPointAfterBundle(E); + Intrinsic::ID IID = Intrinsic::not_intrinsic; - Value *ScalarArg = nullptr; - if (CI && (FI = CI->getCalledFunction())) { + if (Function *FI = CI->getCalledFunction()) IID = FI->getIntrinsicID(); - } + + Value *ScalarArg = nullptr; std::vector OpVecs; for (int j = 0, e = CI->getNumArgOperands(); j < e; ++j) { ValueList OpVL; @@ -3926,20 +4156,20 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { return V; } case Instruction::ShuffleVector: { - assert(S.isAltShuffle() && - ((Instruction::isBinaryOp(S.getOpcode()) && - Instruction::isBinaryOp(S.getAltOpcode())) || - (Instruction::isCast(S.getOpcode()) && - Instruction::isCast(S.getAltOpcode()))) && + assert(E->isAltShuffle() && + ((Instruction::isBinaryOp(E->getOpcode()) && + Instruction::isBinaryOp(E->getAltOpcode())) || + (Instruction::isCast(E->getOpcode()) && + Instruction::isCast(E->getAltOpcode()))) && "Invalid Shuffle Vector Operand"); - Value *LHS, *RHS; - if (Instruction::isBinaryOp(S.getOpcode())) { - setInsertPointAfterBundle(E->Scalars, S); + Value *LHS = nullptr, *RHS = nullptr; + if (Instruction::isBinaryOp(E->getOpcode())) { + setInsertPointAfterBundle(E); LHS = vectorizeTree(E->getOperand(0)); RHS = vectorizeTree(E->getOperand(1)); } else { - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E); LHS = vectorizeTree(E->getOperand(0)); } @@ 
-3949,16 +4179,16 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { } Value *V0, *V1; - if (Instruction::isBinaryOp(S.getOpcode())) { + if (Instruction::isBinaryOp(E->getOpcode())) { V0 = Builder.CreateBinOp( - static_cast(S.getOpcode()), LHS, RHS); + static_cast(E->getOpcode()), LHS, RHS); V1 = Builder.CreateBinOp( - static_cast(S.getAltOpcode()), LHS, RHS); + static_cast(E->getAltOpcode()), LHS, RHS); } else { V0 = Builder.CreateCast( - static_cast(S.getOpcode()), LHS, VecTy); + static_cast(E->getOpcode()), LHS, VecTy); V1 = Builder.CreateCast( - static_cast(S.getAltOpcode()), LHS, VecTy); + static_cast(E->getAltOpcode()), LHS, VecTy); } // Create shuffle to take alternate operations from the vector. @@ -3969,8 +4199,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { SmallVector Mask(e); for (unsigned i = 0; i < e; ++i) { auto *OpInst = cast(E->Scalars[i]); - assert(S.isOpcodeOrAlt(OpInst) && "Unexpected main/alternate opcode"); - if (OpInst->getOpcode() == S.getAltOpcode()) { + assert(E->isOpcodeOrAlt(OpInst) && "Unexpected main/alternate opcode"); + if (OpInst->getOpcode() == E->getAltOpcode()) { Mask[i] = Builder.getInt32(e + i); AltScalars.push_back(E->Scalars[i]); } else { @@ -4136,20 +4366,18 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) { for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) { Value *Scalar = Entry->Scalars[Lane]; +#ifndef NDEBUG Type *Ty = Scalar->getType(); if (!Ty->isVoidTy()) { -#ifndef NDEBUG for (User *U : Scalar->users()) { LLVM_DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n"); - // It is legal to replace users in the ignorelist by undef. + // It is legal to delete users in the ignorelist. assert((getTreeEntry(U) || is_contained(UserIgnoreList, U)) && - "Replacing out-of-tree value with undef"); + "Deleting out-of-tree value"); } -#endif - Value *Undef = UndefValue::get(Ty); - Scalar->replaceAllUsesWith(Undef); } +#endif LLVM_DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n"); eraseInstruction(cast(Scalar)); } @@ -4165,7 +4393,7 @@ void BoUpSLP::optimizeGatherSequence() { << " gather sequences instructions.\n"); // LICM InsertElementInst sequences. for (Instruction *I : GatherSeq) { - if (!isa(I) && !isa(I)) + if (isDeleted(I)) continue; // Check if this block is inside a loop. @@ -4219,6 +4447,8 @@ void BoUpSLP::optimizeGatherSequence() { // For all instructions in blocks containing gather sequences: for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e;) { Instruction *In = &*it++; + if (isDeleted(In)) + continue; if (!isa(In) && !isa(In)) continue; @@ -4245,11 +4475,11 @@ void BoUpSLP::optimizeGatherSequence() { // Groups the instructions to a bundle (which is then a single scheduling entity) // and schedules instructions until the bundle gets ready. -bool BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef VL, - BoUpSLP *SLP, - const InstructionsState &S) { +Optional +BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef VL, BoUpSLP *SLP, + const InstructionsState &S) { if (isa(S.OpValue)) - return true; + return nullptr; // Initialize the instruction bundle. Instruction *OldScheduleEnd = ScheduleEnd; @@ -4262,7 +4492,7 @@ bool BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef VL, // instructions of the bundle. 
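// The hunk above changes tryScheduleBundle() from returning bool to
// returning Optional<ScheduleData *>, so callers can tell three outcomes
// apart: "nothing needed scheduling" (a non-error nullptr, e.g. for PHIs),
// "scheduling failed" (None), and "here is the scheduled bundle" (a real
// pointer).  A minimal, self-contained sketch of that tri-state contract,
// using std::optional as a stand-in for llvm::Optional and a hypothetical
// ScheduleItem instead of the real ScheduleData:
#include <cstdio>
#include <optional>
#include <vector>

struct ScheduleItem { int FirstValue; };

static std::optional<ScheduleItem *>
trySchedule(const std::vector<int> &Bundle, ScheduleItem &Storage) {
  if (Bundle.empty())
    return nullptr;        // Nothing to schedule; not a failure.
  if (Bundle.front() < 0)
    return std::nullopt;   // Scheduling failed; caller must give up on VL.
  Storage.FirstValue = Bundle.front();
  return &Storage;         // Success: hand the bundle back.
}

int main() {
  ScheduleItem Storage{0};
  std::vector<std::vector<int>> Tests = {{}, {-1, 2}, {3, 4}};
  for (const std::vector<int> &B : Tests) {
    std::optional<ScheduleItem *> Res = trySchedule(B, Storage);
    if (!Res)
      std::puts("scheduling failed");
    else if (*Res == nullptr)
      std::puts("nothing to schedule");
    else
      std::printf("bundle starts at %d\n", (*Res)->FirstValue);
  }
  return 0;
}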
for (Value *V : VL) { if (!extendSchedulingRegion(V, S)) - return false; + return None; } for (Value *V : VL) { @@ -4308,6 +4538,7 @@ bool BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef VL, resetSchedule(); initialFillReadyList(ReadyInsts); } + assert(Bundle && "Failed to find schedule bundle"); LLVM_DEBUG(dbgs() << "SLP: try schedule bundle " << *Bundle << " in block " << BB->getName() << "\n"); @@ -4329,9 +4560,9 @@ bool BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef VL, } if (!Bundle->isReady()) { cancelScheduling(VL, S.OpValue); - return false; + return None; } - return true; + return Bundle; } void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef VL, @@ -4364,7 +4595,7 @@ void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef VL, BoUpSLP::ScheduleData *BoUpSLP::BlockScheduling::allocateScheduleDataChunks() { // Allocate a new ScheduleData for the instruction. if (ChunkPos >= ChunkSize) { - ScheduleDataChunks.push_back(llvm::make_unique(ChunkSize)); + ScheduleDataChunks.push_back(std::make_unique(ChunkSize)); ChunkPos = 0; } return &(ScheduleDataChunks.back()[ChunkPos++]); @@ -4977,7 +5208,7 @@ struct SLPVectorizer : public FunctionPass { auto *SE = &getAnalysis().getSE(); auto *TTI = &getAnalysis().getTTI(F); auto *TLIP = getAnalysisIfAvailable(); - auto *TLI = TLIP ? &TLIP->getTLI() : nullptr; + auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr; auto *AA = &getAnalysis().getAAResults(); auto *LI = &getAnalysis().getLoopInfo(); auto *DT = &getAnalysis().getDomTree(); @@ -5052,7 +5283,7 @@ bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_, // If the target claims to have no vector registers don't attempt // vectorization. - if (!TTI->getNumberOfRegisters(true)) + if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true))) return false; // Don't vectorize when the attribute NoImplicitFloat is used. @@ -5100,19 +5331,6 @@ bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_, return Changed; } -/// Check that the Values in the slice in VL array are still existent in -/// the WeakTrackingVH array. -/// Vectorization of part of the VL array may cause later values in the VL array -/// to become invalid. We track when this has happened in the WeakTrackingVH -/// array. -static bool hasValueBeenRAUWed(ArrayRef VL, - ArrayRef VH, unsigned SliceBegin, - unsigned SliceSize) { - VL = VL.slice(SliceBegin, SliceSize); - VH = VH.slice(SliceBegin, SliceSize); - return !std::equal(VL.begin(), VL.end(), VH.begin()); -} - bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef Chain, BoUpSLP &R, unsigned VecRegSize) { const unsigned ChainLen = Chain.size(); @@ -5124,20 +5342,20 @@ bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef Chain, BoUpSLP &R, if (!isPowerOf2_32(Sz) || VF < 2) return false; - // Keep track of values that were deleted by vectorizing in the loop below. - const SmallVector TrackValues(Chain.begin(), Chain.end()); - bool Changed = false; // Look for profitable vectorizable trees at all offsets, starting at zero. for (unsigned i = 0, e = ChainLen; i + VF <= e; ++i) { + ArrayRef Operands = Chain.slice(i, VF); // Check that a previous iteration of this loop did not delete the Value. 
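// The hunks above and below retire the WeakTrackingVH-based
// hasValueBeenRAUWed() helper: instead of watching for handles that went
// null, the pass now simply asks whether any instruction in the candidate
// slice has already been erased by an earlier vectorization
// (R.isDeleted(I)) and skips the slice if so.  A self-contained sketch of
// that predicate -- DeletedTracker/sliceHasDeleted are illustrative names,
// not the real BoUpSLP interface:
#include <algorithm>
#include <cstdio>
#include <unordered_set>
#include <vector>

struct Instr { int Id; };

struct DeletedTracker {
  std::unordered_set<const Instr *> Erased;
  bool isDeleted(const Instr *I) const { return Erased.count(I) != 0; }
};

// True if any element of Chain[Begin, Begin + Len) was already erased.
static bool sliceHasDeleted(const std::vector<Instr *> &Chain, size_t Begin,
                            size_t Len, const DeletedTracker &R) {
  return std::any_of(Chain.begin() + Begin, Chain.begin() + Begin + Len,
                     [&R](const Instr *I) { return R.isDeleted(I); });
}

int main() {
  Instr A{0}, B{1}, C{2}, D{3};
  std::vector<Instr *> Chain = {&A, &B, &C, &D};
  DeletedTracker R;
  R.Erased.insert(&C);   // Pretend C was consumed by a previous tree.
  std::printf("slice at 0 (len 2): %s\n",
              sliceHasDeleted(Chain, 0, 2, R) ? "skip" : "analyze");
  std::printf("slice at 2 (len 2): %s\n",
              sliceHasDeleted(Chain, 2, 2, R) ? "skip" : "analyze");
  return 0;
}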
- if (hasValueBeenRAUWed(Chain, TrackValues, i, VF)) + if (llvm::any_of(Operands, [&R](Value *V) { + auto *I = dyn_cast(V); + return I && R.isDeleted(I); + })) continue; LLVM_DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << i << "\n"); - ArrayRef Operands = Chain.slice(i, VF); R.buildTree(Operands); if (R.isTreeTinyAndNotFullyVectorizable()) @@ -5329,12 +5547,8 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef VL, BoUpSLP &R, bool CandidateFound = false; int MinCost = SLPCostThreshold; - // Keep track of values that were deleted by vectorizing in the loop below. - SmallVector TrackValues(VL.begin(), VL.end()); - unsigned NextInst = 0, MaxInst = VL.size(); - for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF; - VF /= 2) { + for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF; VF /= 2) { // No actual vectorization should happen, if number of parts is the same as // provided vectorization factor (i.e. the scalar type is used for vector // code during codegen). @@ -5352,13 +5566,16 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef VL, BoUpSLP &R, if (!isPowerOf2_32(OpsWidth) || OpsWidth < 2) break; + ArrayRef Ops = VL.slice(I, OpsWidth); // Check that a previous iteration of this loop did not delete the Value. - if (hasValueBeenRAUWed(VL, TrackValues, I, OpsWidth)) + if (llvm::any_of(Ops, [&R](Value *V) { + auto *I = dyn_cast(V); + return I && R.isDeleted(I); + })) continue; LLVM_DEBUG(dbgs() << "SLP: Analyzing " << OpsWidth << " operations " << "\n"); - ArrayRef Ops = VL.slice(I, OpsWidth); R.buildTree(Ops); Optional> Order = R.bestOrder(); @@ -5571,7 +5788,7 @@ class HorizontalReduction { Value *createOp(IRBuilder<> &Builder, const Twine &Name) const { assert(isVectorizable() && "Expected add|fadd or min/max reduction operation."); - Value *Cmp; + Value *Cmp = nullptr; switch (Kind) { case RK_Arithmetic: return Builder.CreateBinOp((Instruction::BinaryOps)Opcode, LHS, RHS, @@ -5579,23 +5796,23 @@ class HorizontalReduction { case RK_Min: Cmp = Opcode == Instruction::ICmp ? Builder.CreateICmpSLT(LHS, RHS) : Builder.CreateFCmpOLT(LHS, RHS); - break; + return Builder.CreateSelect(Cmp, LHS, RHS, Name); case RK_Max: Cmp = Opcode == Instruction::ICmp ? Builder.CreateICmpSGT(LHS, RHS) : Builder.CreateFCmpOGT(LHS, RHS); - break; + return Builder.CreateSelect(Cmp, LHS, RHS, Name); case RK_UMin: assert(Opcode == Instruction::ICmp && "Expected integer types."); Cmp = Builder.CreateICmpULT(LHS, RHS); - break; + return Builder.CreateSelect(Cmp, LHS, RHS, Name); case RK_UMax: assert(Opcode == Instruction::ICmp && "Expected integer types."); Cmp = Builder.CreateICmpUGT(LHS, RHS); - break; + return Builder.CreateSelect(Cmp, LHS, RHS, Name); case RK_None: - llvm_unreachable("Unknown reduction operation."); + break; } - return Builder.CreateSelect(Cmp, LHS, RHS, Name); + llvm_unreachable("Unknown reduction operation."); } public: @@ -6203,6 +6420,8 @@ public: } if (V.isTreeTinyAndNotFullyVectorizable()) break; + if (V.isLoadCombineReductionCandidate(ReductionData.getOpcode())) + break; V.computeMinimumValueSizes(); @@ -6275,6 +6494,9 @@ public: } // Update users. ReductionRoot->replaceAllUsesWith(VectorizedTree); + // Mark all scalar reduction ops for deletion, they are replaced by the + // vector reductions. + V.eraseInstructions(IgnoreList); } return VectorizedTree != nullptr; } @@ -6323,7 +6545,7 @@ private: IsPairwiseReduction = PairwiseRdxCost < SplittingRdxCost; int VecReduxCost = IsPairwiseReduction ? 
PairwiseRdxCost : SplittingRdxCost; - int ScalarReduxCost; + int ScalarReduxCost = 0; switch (ReductionData.getKind()) { case RK_Arithmetic: ScalarReduxCost = @@ -6429,10 +6651,9 @@ static bool findBuildVector(InsertElementInst *LastInsertElem, /// \return true if it matches. static bool findBuildAggregate(InsertValueInst *IV, SmallVectorImpl &BuildVectorOpds) { - Value *V; do { BuildVectorOpds.push_back(IV->getInsertedValueOperand()); - V = IV->getAggregateOperand(); + Value *V = IV->getAggregateOperand(); if (isa(V)) break; IV = dyn_cast(V); @@ -6530,18 +6751,13 @@ static bool tryToVectorizeHorReductionOrInstOperands( // horizontal reduction. // Interrupt the process if the Root instruction itself was vectorized or all // sub-trees not higher that RecursionMaxDepth were analyzed/vectorized. - SmallVector, 8> Stack(1, {Root, 0}); + SmallVector, 8> Stack(1, {Root, 0}); SmallPtrSet VisitedInstrs; bool Res = false; while (!Stack.empty()) { - Value *V; + Instruction *Inst; unsigned Level; - std::tie(V, Level) = Stack.pop_back_val(); - if (!V) - continue; - auto *Inst = dyn_cast(V); - if (!Inst) - continue; + std::tie(Inst, Level) = Stack.pop_back_val(); auto *BI = dyn_cast(Inst); auto *SI = dyn_cast(Inst); if (BI || SI) { @@ -6582,8 +6798,8 @@ static bool tryToVectorizeHorReductionOrInstOperands( for (auto *Op : Inst->operand_values()) if (VisitedInstrs.insert(Op).second) if (auto *I = dyn_cast(Op)) - if (!isa(I) && I->getParent() == BB) - Stack.emplace_back(Op, Level); + if (!isa(I) && !R.isDeleted(I) && I->getParent() == BB) + Stack.emplace_back(I, Level); } return Res; } @@ -6652,11 +6868,10 @@ bool SLPVectorizerPass::vectorizeCmpInst(CmpInst *CI, BasicBlock *BB, } bool SLPVectorizerPass::vectorizeSimpleInstructions( - SmallVectorImpl &Instructions, BasicBlock *BB, BoUpSLP &R) { + SmallVectorImpl &Instructions, BasicBlock *BB, BoUpSLP &R) { bool OpsChanged = false; - for (auto &VH : reverse(Instructions)) { - auto *I = dyn_cast_or_null(VH); - if (!I) + for (auto *I : reverse(Instructions)) { + if (R.isDeleted(I)) continue; if (auto *LastInsertValue = dyn_cast(I)) OpsChanged |= vectorizeInsertValueInst(LastInsertValue, BB, R); @@ -6685,7 +6900,7 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { if (!P) break; - if (!VisitedInstrs.count(P)) + if (!VisitedInstrs.count(P) && !R.isDeleted(P)) Incoming.push_back(P); } @@ -6729,9 +6944,12 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { VisitedInstrs.clear(); - SmallVector PostProcessInstructions; + SmallVector PostProcessInstructions; SmallDenseSet KeyNodes; for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { + // Skip instructions marked for the deletion. + if (R.isDeleted(&*it)) + continue; // We may go through BB multiple times so skip the one we have checked. if (!VisitedInstrs.insert(&*it).second) { if (it->use_empty() && KeyNodes.count(&*it) > 0 && @@ -6811,10 +7029,16 @@ bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) { LLVM_DEBUG(dbgs() << "SLP: Analyzing a getelementptr list of length " << Entry.second.size() << ".\n"); - // We process the getelementptr list in chunks of 16 (like we do for - // stores) to minimize compile-time. - for (unsigned BI = 0, BE = Entry.second.size(); BI < BE; BI += 16) { - auto Len = std::min(BE - BI, 16); + // Process the GEP list in chunks suitable for the target's supported + // vector size. If a vector register can't hold 1 element, we are done. 
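// The replacement code right below sizes the per-iteration GEP chunk from
// the target's widest vector register instead of the old hard-coded 16:
// bail out if even one element does not fit, otherwise process
// MaxVecRegSize / EltSize candidates at a time.  A small sketch of just
// that arithmetic and slicing, with example bit widths standing in for
// what R.getMaxVecRegSize()/R.getVectorElementSize() would report:
#include <algorithm>
#include <cstdio>
#include <vector>

static void processInChunks(const std::vector<int> &Entries,
                            unsigned MaxVecRegSizeBits, unsigned EltSizeBits) {
  if (MaxVecRegSizeBits < EltSizeBits)
    return;                                   // Register can't hold 1 element.
  unsigned MaxElts = MaxVecRegSizeBits / EltSizeBits;
  for (size_t BI = 0, BE = Entries.size(); BI < BE; BI += MaxElts) {
    size_t Len = std::min<size_t>(BE - BI, MaxElts);
    std::printf("chunk at %zu with %zu candidates\n", BI, Len);
    // ... seed the candidate set from Entries[BI .. BI + Len) here ...
  }
}

int main() {
  std::vector<int> GEPs(10, 0);                 // 10 candidate getelementptrs.
  processInChunks(GEPs, /*MaxVecRegSizeBits=*/128, /*EltSizeBits=*/64);
  return 0;
}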
+ unsigned MaxVecRegSize = R.getMaxVecRegSize(); + unsigned EltSize = R.getVectorElementSize(Entry.second[0]); + if (MaxVecRegSize < EltSize) + continue; + + unsigned MaxElts = MaxVecRegSize / EltSize; + for (unsigned BI = 0, BE = Entry.second.size(); BI < BE; BI += MaxElts) { + auto Len = std::min(BE - BI, MaxElts); auto GEPList = makeArrayRef(&Entry.second[BI], Len); // Initialize a set a candidate getelementptrs. Note that we use a @@ -6824,10 +7048,10 @@ bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) { SetVector Candidates(GEPList.begin(), GEPList.end()); // Some of the candidates may have already been vectorized after we - // initially collected them. If so, the WeakTrackingVHs will have - // nullified the - // values, so remove them from the set of candidates. - Candidates.remove(nullptr); + // initially collected them. If so, they are marked as deleted, so remove + // them from the set of candidates. + Candidates.remove_if( + [&R](Value *I) { return R.isDeleted(cast(I)); }); // Remove from the set of candidates all pairs of getelementptrs with // constant differences. Such getelementptrs are likely not good @@ -6835,18 +7059,18 @@ bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) { // computed from the other. We also ensure all candidate getelementptr // indices are unique. for (int I = 0, E = GEPList.size(); I < E && Candidates.size() > 1; ++I) { - auto *GEPI = cast(GEPList[I]); + auto *GEPI = GEPList[I]; if (!Candidates.count(GEPI)) continue; auto *SCEVI = SE->getSCEV(GEPList[I]); for (int J = I + 1; J < E && Candidates.size() > 1; ++J) { - auto *GEPJ = cast(GEPList[J]); + auto *GEPJ = GEPList[J]; auto *SCEVJ = SE->getSCEV(GEPList[J]); if (isa(SE->getMinusSCEV(SCEVI, SCEVJ))) { - Candidates.remove(GEPList[I]); - Candidates.remove(GEPList[J]); + Candidates.remove(GEPI); + Candidates.remove(GEPJ); } else if (GEPI->idx_begin()->get() == GEPJ->idx_begin()->get()) { - Candidates.remove(GEPList[J]); + Candidates.remove(GEPJ); } } } diff --git a/lib/Transforms/Vectorize/VPlan.cpp b/lib/Transforms/Vectorize/VPlan.cpp index 517d759d7bfc..4b80d1fb20aa 100644 --- a/lib/Transforms/Vectorize/VPlan.cpp +++ b/lib/Transforms/Vectorize/VPlan.cpp @@ -283,6 +283,12 @@ iplist::iterator VPRecipeBase::eraseFromParent() { return getParent()->getRecipeList().erase(getIterator()); } +void VPRecipeBase::moveAfter(VPRecipeBase *InsertPos) { + InsertPos->getParent()->getRecipeList().splice( + std::next(InsertPos->getIterator()), getParent()->getRecipeList(), + getIterator()); +} + void VPInstruction::generateInstruction(VPTransformState &State, unsigned Part) { IRBuilder<> &Builder = State.Builder; @@ -309,6 +315,14 @@ void VPInstruction::generateInstruction(VPTransformState &State, State.set(this, V, Part); break; } + case Instruction::Select: { + Value *Cond = State.get(getOperand(0), Part); + Value *Op1 = State.get(getOperand(1), Part); + Value *Op2 = State.get(getOperand(2), Part); + Value *V = Builder.CreateSelect(Cond, Op1, Op2); + State.set(this, V, Part); + break; + } default: llvm_unreachable("Unsupported opcode for instruction"); } @@ -728,7 +742,7 @@ void VPInterleavedAccessInfo::visitBlock(VPBlockBase *Block, Old2NewTy &Old2New, auto NewIGIter = Old2New.find(IG); if (NewIGIter == Old2New.end()) Old2New[IG] = new InterleaveGroup( - IG->getFactor(), IG->isReverse(), IG->getAlignment()); + IG->getFactor(), IG->isReverse(), Align(IG->getAlignment())); if (Inst == IG->getInsertPos()) Old2New[IG]->setInsertPos(VPInst); @@ -736,7 +750,8 @@ void 
VPInterleavedAccessInfo::visitBlock(VPBlockBase *Block, Old2NewTy &Old2New, InterleaveGroupMap[VPInst] = Old2New[IG]; InterleaveGroupMap[VPInst]->insertMember( VPInst, IG->getIndex(Inst), - IG->isReverse() ? (-1) * int(IG->getFactor()) : IG->getFactor()); + Align(IG->isReverse() ? (-1) * int(IG->getFactor()) + : IG->getFactor())); } } else if (VPRegionBlock *Region = dyn_cast(Block)) visitRegion(Region, Old2New, IAI); diff --git a/lib/Transforms/Vectorize/VPlan.h b/lib/Transforms/Vectorize/VPlan.h index 8a06412ad590..44d8a198f27e 100644 --- a/lib/Transforms/Vectorize/VPlan.h +++ b/lib/Transforms/Vectorize/VPlan.h @@ -615,6 +615,10 @@ public: /// the specified recipe. void insertBefore(VPRecipeBase *InsertPos); + /// Unlink this recipe from its current VPBasicBlock and insert it into + /// the VPBasicBlock that MovePos lives in, right after MovePos. + void moveAfter(VPRecipeBase *MovePos); + /// This method unlinks 'this' from the containing basic block and deletes it. /// /// \returns an iterator pointing to the element after the erased one diff --git a/lib/Transforms/Vectorize/VPlanHCFGTransforms.cpp b/lib/Transforms/Vectorize/VPlanHCFGTransforms.cpp index 7ed7d21b6caa..b22d3190d654 100644 --- a/lib/Transforms/Vectorize/VPlanHCFGTransforms.cpp +++ b/lib/Transforms/Vectorize/VPlanHCFGTransforms.cpp @@ -21,7 +21,7 @@ void VPlanHCFGTransforms::VPInstructionsToVPRecipes( LoopVectorizationLegality::InductionList *Inductions, SmallPtrSetImpl &DeadInstructions) { - VPRegionBlock *TopRegion = dyn_cast(Plan->getEntry()); + auto *TopRegion = cast(Plan->getEntry()); ReversePostOrderTraversal RPOT(TopRegion->getEntry()); // Condition bit VPValues get deleted during transformation to VPRecipes. diff --git a/lib/Transforms/Vectorize/VPlanSLP.cpp b/lib/Transforms/Vectorize/VPlanSLP.cpp index e5ab24e52df6..9019ed15ec5f 100644 --- a/lib/Transforms/Vectorize/VPlanSLP.cpp +++ b/lib/Transforms/Vectorize/VPlanSLP.cpp @@ -346,11 +346,14 @@ SmallVector VPlanSlp::reorderMultiNodeOps() { void VPlanSlp::dumpBundle(ArrayRef Values) { dbgs() << " Ops: "; - for (auto Op : Values) - if (auto *Instr = cast_or_null(Op)->getUnderlyingInstr()) - dbgs() << *Instr << " | "; - else - dbgs() << " nullptr | "; + for (auto Op : Values) { + if (auto *VPInstr = cast_or_null(Op)) + if (auto *Instr = VPInstr->getUnderlyingInstr()) { + dbgs() << *Instr << " | "; + continue; + } + dbgs() << " nullptr | "; + } dbgs() << "\n"; } diff --git a/lib/WindowsManifest/WindowsManifestMerger.cpp b/lib/WindowsManifest/WindowsManifestMerger.cpp index d092ab493c9b..031a963cd3b0 100644 --- a/lib/WindowsManifest/WindowsManifestMerger.cpp +++ b/lib/WindowsManifest/WindowsManifestMerger.cpp @@ -58,7 +58,7 @@ private: #if LLVM_LIBXML2_ENABLED -static const std::pair MtNsHrefsPrefixes[] = { +static constexpr std::pair MtNsHrefsPrefixes[] = { {"urn:schemas-microsoft-com:asm.v1", "ms_asmv1"}, {"urn:schemas-microsoft-com:asm.v2", "ms_asmv2"}, {"urn:schemas-microsoft-com:asm.v3", "ms_asmv3"}, @@ -704,7 +704,7 @@ bool windows_manifest::isAvailable() { return false; } #endif WindowsManifestMerger::WindowsManifestMerger() - : Impl(make_unique()) {} + : Impl(std::make_unique()) {} WindowsManifestMerger::~WindowsManifestMerger() {} diff --git a/lib/XRay/FDRRecordProducer.cpp b/lib/XRay/FDRRecordProducer.cpp index 452bc6c55fb8..479b710444be 100644 --- a/lib/XRay/FDRRecordProducer.cpp +++ b/lib/XRay/FDRRecordProducer.cpp @@ -40,32 +40,32 @@ metadataRecordType(const XRayFileHeader &Header, uint8_t T) { "Invalid metadata record type: %d", T); switch (T) { case 
MetadataRecordKinds::NewBufferKind: - return make_unique(); + return std::make_unique(); case MetadataRecordKinds::EndOfBufferKind: if (Header.Version >= 2) return createStringError( std::make_error_code(std::errc::executable_format_error), "End of buffer records are no longer supported starting version " "2 of the log."); - return make_unique(); + return std::make_unique(); case MetadataRecordKinds::NewCPUIdKind: - return make_unique(); + return std::make_unique(); case MetadataRecordKinds::TSCWrapKind: - return make_unique(); + return std::make_unique(); case MetadataRecordKinds::WalltimeMarkerKind: - return make_unique(); + return std::make_unique(); case MetadataRecordKinds::CustomEventMarkerKind: if (Header.Version >= 5) - return make_unique(); - return make_unique(); + return std::make_unique(); + return std::make_unique(); case MetadataRecordKinds::CallArgumentKind: - return make_unique(); + return std::make_unique(); case MetadataRecordKinds::BufferExtentsKind: - return make_unique(); + return std::make_unique(); case MetadataRecordKinds::TypedEventMarkerKind: - return make_unique(); + return std::make_unique(); case MetadataRecordKinds::PidKind: - return make_unique(); + return std::make_unique(); case MetadataRecordKinds::EnumEndMarker: llvm_unreachable("Invalid MetadataRecordKind"); } @@ -89,7 +89,7 @@ FileBasedRecordProducer::findNextBufferExtent() { if (OffsetPtr == PreReadOffset) return createStringError( std::make_error_code(std::errc::executable_format_error), - "Failed reading one byte from offset %d.", OffsetPtr); + "Failed reading one byte from offset %" PRId64 ".", OffsetPtr); if (isMetadataIntroducer(FirstByte)) { auto LoadedType = FirstByte >> 1; @@ -130,7 +130,7 @@ Expected> FileBasedRecordProducer::produce() { R = std::move(BufferExtentsOrError.get()); assert(R != nullptr); assert(isa(R.get())); - auto BE = dyn_cast(R.get()); + auto BE = cast(R.get()); CurrentBufferBytes = BE->size(); return std::move(R); } @@ -151,7 +151,7 @@ Expected> FileBasedRecordProducer::produce() { if (OffsetPtr == PreReadOffset) return createStringError( std::make_error_code(std::errc::executable_format_error), - "Failed reading one byte from offset %d.", OffsetPtr); + "Failed reading one byte from offset %" PRId64 ".", OffsetPtr); // For metadata records, handle especially here. 
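// Throughout these XRay hunks the reader offsets widen to 64 bits, so
// every "%d" that used to print an offset becomes the <cinttypes> PRId64
// macro, spliced into the literal via string concatenation so the
// conversion always matches a 64-bit argument regardless of platform.
// A tiny stand-alone illustration of that idiom (the message text is just
// an example, not one of the patch's diagnostics):
#include <cinttypes>
#include <cstdint>
#include <cstdio>

int main() {
  int64_t OffsetPtr = 4096;
  // The preprocessor joins the pieces into a single format string whose
  // length modifier is correct for this target's 64-bit integer type.
  std::printf("Failed reading one byte from offset %" PRId64 ".\n", OffsetPtr);
  return 0;
}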
if (isMetadataIntroducer(FirstByte)) { @@ -162,11 +162,12 @@ Expected> FileBasedRecordProducer::produce() { MetadataRecordOrErr.takeError(), createStringError( std::make_error_code(std::errc::executable_format_error), - "Encountered an unsupported metadata record (%d) at offset %d.", + "Encountered an unsupported metadata record (%d) " + "at offset %" PRId64 ".", LoadedType, PreReadOffset)); R = std::move(MetadataRecordOrErr.get()); } else { - R = llvm::make_unique(); + R = std::make_unique(); } RecordInitializer RI(E, OffsetPtr); @@ -182,8 +183,8 @@ Expected> FileBasedRecordProducer::produce() { if (OffsetPtr - PreReadOffset > CurrentBufferBytes) return createStringError( std::make_error_code(std::errc::executable_format_error), - "Buffer over-read at offset %d (over-read by %d bytes); Record Type " - "= %s.", + "Buffer over-read at offset %" PRId64 " (over-read by %" PRId64 + " bytes); Record Type = %s.", OffsetPtr, (OffsetPtr - PreReadOffset) - CurrentBufferBytes, Record::kindToString(R->getRecordType()).data()); diff --git a/lib/XRay/FileHeaderReader.cpp b/lib/XRay/FileHeaderReader.cpp index 3fb021906a6f..6b6daf9deba5 100644 --- a/lib/XRay/FileHeaderReader.cpp +++ b/lib/XRay/FileHeaderReader.cpp @@ -12,7 +12,7 @@ namespace xray { // Populates the FileHeader reference by reading the first 32 bytes of the file. Expected readBinaryFormatHeader(DataExtractor &HeaderExtractor, - uint32_t &OffsetPtr) { + uint64_t &OffsetPtr) { // FIXME: Maybe deduce whether the data is little or big-endian using some // magic bytes in the beginning of the file? @@ -30,21 +30,24 @@ Expected readBinaryFormatHeader(DataExtractor &HeaderExtractor, if (OffsetPtr == PreReadOffset) return createStringError( std::make_error_code(std::errc::invalid_argument), - "Failed reading version from file header at offset %d.", OffsetPtr); + "Failed reading version from file header at offset %" PRId64 ".", + OffsetPtr); PreReadOffset = OffsetPtr; FileHeader.Type = HeaderExtractor.getU16(&OffsetPtr); if (OffsetPtr == PreReadOffset) return createStringError( std::make_error_code(std::errc::invalid_argument), - "Failed reading file type from file header at offset %d.", OffsetPtr); + "Failed reading file type from file header at offset %" PRId64 ".", + OffsetPtr); PreReadOffset = OffsetPtr; uint32_t Bitfield = HeaderExtractor.getU32(&OffsetPtr); if (OffsetPtr == PreReadOffset) return createStringError( std::make_error_code(std::errc::invalid_argument), - "Failed reading flag bits from file header at offset %d.", OffsetPtr); + "Failed reading flag bits from file header at offset %" PRId64 ".", + OffsetPtr); FileHeader.ConstantTSC = Bitfield & 1uL; FileHeader.NonstopTSC = Bitfield & 1uL << 1; @@ -53,7 +56,8 @@ Expected readBinaryFormatHeader(DataExtractor &HeaderExtractor, if (OffsetPtr == PreReadOffset) return createStringError( std::make_error_code(std::errc::invalid_argument), - "Failed reading cycle frequency from file header at offset %d.", + "Failed reading cycle frequency from file header at offset %" PRId64 + ".", OffsetPtr); std::memcpy(&FileHeader.FreeFormData, diff --git a/lib/XRay/InstrumentationMap.cpp b/lib/XRay/InstrumentationMap.cpp index fe5e941f7ea6..7453613c7038 100644 --- a/lib/XRay/InstrumentationMap.cpp +++ b/lib/XRay/InstrumentationMap.cpp @@ -67,10 +67,11 @@ loadObj(StringRef Filename, object::OwningBinary &ObjFile, StringRef Contents = ""; const auto &Sections = ObjFile.getBinary()->sections(); auto I = llvm::find_if(Sections, [&](object::SectionRef Section) { - StringRef Name = ""; - if 
(Section.getName(Name)) - return false; - return Name == "xray_instr_map"; + Expected NameOrErr = Section.getName(); + if (NameOrErr) + return *NameOrErr == "xray_instr_map"; + consumeError(NameOrErr.takeError()); + return false; }); if (I == Sections.end()) @@ -118,7 +119,7 @@ loadObj(StringRef Filename, object::OwningBinary &ObjFile, "an XRay sled entry in ELF64."), std::make_error_code(std::errc::executable_format_error)); - auto RelocateOrElse = [&](uint32_t Offset, uint64_t Address) { + auto RelocateOrElse = [&](uint64_t Offset, uint64_t Address) { if (!Address) { uint64_t A = I->getAddress() + C - Contents.bytes_begin() + Offset; RelocMap::const_iterator R = Relocs.find(A); @@ -136,10 +137,10 @@ loadObj(StringRef Filename, object::OwningBinary &ObjFile, 8); Sleds.push_back({}); auto &Entry = Sleds.back(); - uint32_t OffsetPtr = 0; - uint32_t AddrOff = OffsetPtr; + uint64_t OffsetPtr = 0; + uint64_t AddrOff = OffsetPtr; Entry.Address = RelocateOrElse(AddrOff, Extractor.getU64(&OffsetPtr)); - uint32_t FuncOff = OffsetPtr; + uint64_t FuncOff = OffsetPtr; Entry.Function = RelocateOrElse(FuncOff, Extractor.getU64(&OffsetPtr)); auto Kind = Extractor.getU8(&OffsetPtr); static constexpr SledEntry::FunctionKinds Kinds[] = { diff --git a/lib/XRay/Profile.cpp b/lib/XRay/Profile.cpp index e34b182f2e02..c1a43632b600 100644 --- a/lib/XRay/Profile.cpp +++ b/lib/XRay/Profile.cpp @@ -49,9 +49,9 @@ struct BlockHeader { }; static Expected readBlockHeader(DataExtractor &Extractor, - uint32_t &Offset) { + uint64_t &Offset) { BlockHeader H; - uint32_t CurrentOffset = Offset; + uint64_t CurrentOffset = Offset; H.Size = Extractor.getU32(&Offset); if (Offset == CurrentOffset) return make_error( @@ -76,7 +76,7 @@ static Expected readBlockHeader(DataExtractor &Extractor, } static Expected> readPath(DataExtractor &Extractor, - uint32_t &Offset) { + uint64_t &Offset) { // We're reading a sequence of int32_t's until we find a 0. 
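// readPath() above pulls 32-bit function ids off the buffer until it sees
// a 0 sentinel, and treats "the offset did not advance" as a malformed
// profile.  A self-contained sketch of the same loop over a plain byte
// buffer -- the hand-rolled readI32() is a simplified stand-in for
// llvm::DataExtractor, not the real API:
#include <cstdint>
#include <cstdio>
#include <optional>
#include <vector>

// Reads a little-endian int32_t; refuses to advance when fewer than four
// bytes remain, which is the error condition the real code checks for.
static bool readI32(const std::vector<uint8_t> &Buf, uint64_t &Offset,
                    int32_t &Out) {
  if (Offset + 4 > Buf.size())
    return false;
  uint32_t V = (uint32_t)Buf[Offset] | ((uint32_t)Buf[Offset + 1] << 8) |
               ((uint32_t)Buf[Offset + 2] << 16) |
               ((uint32_t)Buf[Offset + 3] << 24);
  Out = (int32_t)V;
  Offset += 4;
  return true;
}

static std::optional<std::vector<int32_t>>
readPath(const std::vector<uint8_t> &Buf, uint64_t &Offset) {
  std::vector<int32_t> Path;
  for (;;) {
    int32_t FuncId = 0;
    if (!readI32(Buf, Offset, FuncId))
      return std::nullopt;        // Ran out of bytes: malformed block.
    if (FuncId == 0)
      return Path;                // Sentinel terminates the path.
    Path.push_back(FuncId);
  }
}

int main() {
  // Encodes the path {7, 3} followed by the 0 terminator.
  std::vector<uint8_t> Buf = {7, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0};
  uint64_t Offset = 0;
  if (std::optional<std::vector<int32_t>> Path = readPath(Buf, Offset))
    std::printf("read %zu ids, offset now %llu\n", Path->size(),
                (unsigned long long)Offset);
  return 0;
}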
std::vector Path; auto CurrentOffset = Offset; @@ -94,7 +94,7 @@ static Expected> readPath(DataExtractor &Extractor, } static Expected readData(DataExtractor &Extractor, - uint32_t &Offset) { + uint64_t &Offset) { // We expect a certain number of elements for Data: // - A 64-bit CallCount // - A 64-bit CumulativeLocalTime counter @@ -280,7 +280,7 @@ Expected loadProfile(StringRef Filename) { StringRef Data(MappedFile.data(), MappedFile.size()); Profile P; - uint32_t Offset = 0; + uint64_t Offset = 0; DataExtractor Extractor(Data, true, 8); // For each block we get from the file: diff --git a/lib/XRay/RecordInitializer.cpp b/lib/XRay/RecordInitializer.cpp index 78163031a8cc..68ab3db06208 100644 --- a/lib/XRay/RecordInitializer.cpp +++ b/lib/XRay/RecordInitializer.cpp @@ -12,15 +12,15 @@ namespace xray { Error RecordInitializer::visit(BufferExtents &R) { if (!E.isValidOffsetForDataOfSize(OffsetPtr, sizeof(uint64_t))) - return createStringError(std::make_error_code(std::errc::bad_address), - "Invalid offset for a buffer extent (%d).", - OffsetPtr); + return createStringError( + std::make_error_code(std::errc::bad_address), + "Invalid offset for a buffer extent (%" PRId64 ").", OffsetPtr); auto PreReadOffset = OffsetPtr; R.Size = E.getU64(&OffsetPtr); if (PreReadOffset == OffsetPtr) return createStringError(std::make_error_code(std::errc::invalid_argument), - "Cannot read buffer extent at offset %d.", + "Cannot read buffer extent at offset %" PRId64 ".", OffsetPtr); OffsetPtr += MetadataRecord::kMetadataBodySize - (OffsetPtr - PreReadOffset); @@ -30,23 +30,25 @@ Error RecordInitializer::visit(BufferExtents &R) { Error RecordInitializer::visit(WallclockRecord &R) { if (!E.isValidOffsetForDataOfSize(OffsetPtr, MetadataRecord::kMetadataBodySize)) - return createStringError(std::make_error_code(std::errc::bad_address), - "Invalid offset for a wallclock record (%d).", - OffsetPtr); + return createStringError( + std::make_error_code(std::errc::bad_address), + "Invalid offset for a wallclock record (%" PRId64 ").", OffsetPtr); auto BeginOffset = OffsetPtr; auto PreReadOffset = OffsetPtr; R.Seconds = E.getU64(&OffsetPtr); if (OffsetPtr == PreReadOffset) return createStringError( std::make_error_code(std::errc::invalid_argument), - "Cannot read wall clock 'seconds' field at offset %d.", OffsetPtr); + "Cannot read wall clock 'seconds' field at offset %" PRId64 ".", + OffsetPtr); PreReadOffset = OffsetPtr; R.Nanos = E.getU32(&OffsetPtr); if (OffsetPtr == PreReadOffset) return createStringError( std::make_error_code(std::errc::invalid_argument), - "Cannot read wall clock 'nanos' field at offset %d.", OffsetPtr); + "Cannot read wall clock 'nanos' field at offset %" PRId64 ".", + OffsetPtr); // Align to metadata record size boundary. 
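// Every FDR metadata record above owns a fixed-size body
// (MetadataRecord::kMetadataBodySize); after decoding the fields a given
// record actually uses, the visitor pads OffsetPtr forward to the end of
// that body so the next record starts on the expected boundary.  A minimal
// sketch of that bookkeeping with an assumed 16-byte body -- the constant
// and the field sizes are illustrative, not the real layout:
#include <cassert>
#include <cstdint>
#include <cstdio>

static constexpr uint64_t kBodySize = 16;

// Assume an 8-byte and a 4-byte field were decoded starting at BeginOffset.
static uint64_t skipToRecordEnd(uint64_t BeginOffset, uint64_t OffsetPtr) {
  assert(OffsetPtr - BeginOffset <= kBodySize &&
         "Decoded past the fixed-size record body");
  return OffsetPtr + (kBodySize - (OffsetPtr - BeginOffset));
}

int main() {
  uint64_t BeginOffset = 32;                  // Record body starts here.
  uint64_t OffsetPtr = BeginOffset + 8 + 4;   // 12 of 16 body bytes consumed.
  OffsetPtr = skipToRecordEnd(BeginOffset, OffsetPtr);
  std::printf("next record at offset %llu\n",
              (unsigned long long)OffsetPtr);  // Prints 48.
  return 0;
}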
assert(OffsetPtr - BeginOffset <= MetadataRecord::kMetadataBodySize); @@ -57,21 +59,23 @@ Error RecordInitializer::visit(WallclockRecord &R) { Error RecordInitializer::visit(NewCPUIDRecord &R) { if (!E.isValidOffsetForDataOfSize(OffsetPtr, MetadataRecord::kMetadataBodySize)) - return createStringError(std::make_error_code(std::errc::bad_address), - "Invalid offset for a new cpu id record (%d).", - OffsetPtr); + return createStringError( + std::make_error_code(std::errc::bad_address), + "Invalid offset for a new cpu id record (%" PRId64 ").", OffsetPtr); auto BeginOffset = OffsetPtr; auto PreReadOffset = OffsetPtr; R.CPUId = E.getU16(&OffsetPtr); if (OffsetPtr == PreReadOffset) return createStringError(std::make_error_code(std::errc::invalid_argument), - "Cannot read CPU id at offset %d.", OffsetPtr); + "Cannot read CPU id at offset %" PRId64 ".", + OffsetPtr); PreReadOffset = OffsetPtr; R.TSC = E.getU64(&OffsetPtr); if (OffsetPtr == PreReadOffset) return createStringError(std::make_error_code(std::errc::invalid_argument), - "Cannot read CPU TSC at offset %d.", OffsetPtr); + "Cannot read CPU TSC at offset %" PRId64 ".", + OffsetPtr); OffsetPtr += MetadataRecord::kMetadataBodySize - (OffsetPtr - BeginOffset); return Error::success(); @@ -80,16 +84,16 @@ Error RecordInitializer::visit(NewCPUIDRecord &R) { Error RecordInitializer::visit(TSCWrapRecord &R) { if (!E.isValidOffsetForDataOfSize(OffsetPtr, MetadataRecord::kMetadataBodySize)) - return createStringError(std::make_error_code(std::errc::bad_address), - "Invalid offset for a new TSC wrap record (%d).", - OffsetPtr); + return createStringError( + std::make_error_code(std::errc::bad_address), + "Invalid offset for a new TSC wrap record (%" PRId64 ").", OffsetPtr); auto PreReadOffset = OffsetPtr; R.BaseTSC = E.getU64(&OffsetPtr); if (PreReadOffset == OffsetPtr) - return createStringError(std::make_error_code(std::errc::invalid_argument), - "Cannot read TSC wrap record at offset %d.", - OffsetPtr); + return createStringError( + std::make_error_code(std::errc::invalid_argument), + "Cannot read TSC wrap record at offset %" PRId64 ".", OffsetPtr); OffsetPtr += MetadataRecord::kMetadataBodySize - (OffsetPtr - PreReadOffset); return Error::success(); @@ -98,9 +102,9 @@ Error RecordInitializer::visit(TSCWrapRecord &R) { Error RecordInitializer::visit(CustomEventRecord &R) { if (!E.isValidOffsetForDataOfSize(OffsetPtr, MetadataRecord::kMetadataBodySize)) - return createStringError(std::make_error_code(std::errc::bad_address), - "Invalid offset for a custom event record (%d).", - OffsetPtr); + return createStringError( + std::make_error_code(std::errc::bad_address), + "Invalid offset for a custom event record (%" PRId64 ").", OffsetPtr); auto BeginOffset = OffsetPtr; auto PreReadOffset = OffsetPtr; @@ -108,20 +112,22 @@ Error RecordInitializer::visit(CustomEventRecord &R) { if (PreReadOffset == OffsetPtr) return createStringError( std::make_error_code(std::errc::invalid_argument), - "Cannot read a custom event record size field offset %d.", OffsetPtr); + "Cannot read a custom event record size field offset %" PRId64 ".", + OffsetPtr); if (R.Size <= 0) return createStringError( std::make_error_code(std::errc::bad_address), - "Invalid size for custom event (size = %d) at offset %d.", R.Size, - OffsetPtr); + "Invalid size for custom event (size = %d) at offset %" PRId64 ".", + R.Size, OffsetPtr); PreReadOffset = OffsetPtr; R.TSC = E.getU64(&OffsetPtr); if (PreReadOffset == OffsetPtr) return createStringError( 
std::make_error_code(std::errc::invalid_argument), - "Cannot read a custom event TSC field at offset %d.", OffsetPtr); + "Cannot read a custom event TSC field at offset %" PRId64 ".", + OffsetPtr); // For version 4 onwards, of the FDR log, we want to also capture the CPU ID // of the custom event. @@ -131,7 +137,7 @@ Error RecordInitializer::visit(CustomEventRecord &R) { if (PreReadOffset == OffsetPtr) return createStringError( std::make_error_code(std::errc::invalid_argument), - "Missing CPU field at offset %d", OffsetPtr); + "Missing CPU field at offset %" PRId64 ".", OffsetPtr); } assert(OffsetPtr > BeginOffset && @@ -142,8 +148,8 @@ Error RecordInitializer::visit(CustomEventRecord &R) { if (!E.isValidOffsetForDataOfSize(OffsetPtr, R.Size)) return createStringError( std::make_error_code(std::errc::bad_address), - "Cannot read %d bytes of custom event data from offset %d.", R.Size, - OffsetPtr); + "Cannot read %d bytes of custom event data from offset %" PRId64 ".", + R.Size, OffsetPtr); std::vector Buffer; Buffer.resize(R.Size); @@ -151,15 +157,15 @@ Error RecordInitializer::visit(CustomEventRecord &R) { if (E.getU8(&OffsetPtr, Buffer.data(), R.Size) != Buffer.data()) return createStringError( std::make_error_code(std::errc::invalid_argument), - "Failed reading data into buffer of size %d at offset %d.", R.Size, - OffsetPtr); + "Failed reading data into buffer of size %d at offset %" PRId64 ".", + R.Size, OffsetPtr); assert(OffsetPtr >= PreReadOffset); if (OffsetPtr - PreReadOffset != static_cast(R.Size)) return createStringError( std::make_error_code(std::errc::invalid_argument), - "Failed reading enough bytes for the custom event payload -- read %d " - "expecting %d bytes at offset %d.", + "Failed reading enough bytes for the custom event payload -- read " + "%" PRId64 " expecting %d bytes at offset %" PRId64 ".", OffsetPtr - PreReadOffset, R.Size, PreReadOffset); R.Data.assign(Buffer.begin(), Buffer.end()); @@ -169,9 +175,9 @@ Error RecordInitializer::visit(CustomEventRecord &R) { Error RecordInitializer::visit(CustomEventRecordV5 &R) { if (!E.isValidOffsetForDataOfSize(OffsetPtr, MetadataRecord::kMetadataBodySize)) - return createStringError(std::make_error_code(std::errc::bad_address), - "Invalid offset for a custom event record (%d).", - OffsetPtr); + return createStringError( + std::make_error_code(std::errc::bad_address), + "Invalid offset for a custom event record (%" PRId64 ").", OffsetPtr); auto BeginOffset = OffsetPtr; auto PreReadOffset = OffsetPtr; @@ -180,20 +186,22 @@ Error RecordInitializer::visit(CustomEventRecordV5 &R) { if (PreReadOffset == OffsetPtr) return createStringError( std::make_error_code(std::errc::invalid_argument), - "Cannot read a custom event record size field offset %d.", OffsetPtr); + "Cannot read a custom event record size field offset %" PRId64 ".", + OffsetPtr); if (R.Size <= 0) return createStringError( std::make_error_code(std::errc::bad_address), - "Invalid size for custom event (size = %d) at offset %d.", R.Size, - OffsetPtr); + "Invalid size for custom event (size = %d) at offset %" PRId64 ".", + R.Size, OffsetPtr); PreReadOffset = OffsetPtr; R.Delta = E.getSigned(&OffsetPtr, sizeof(int32_t)); if (PreReadOffset == OffsetPtr) return createStringError( std::make_error_code(std::errc::invalid_argument), - "Cannot read a custom event record TSC delta field at offset %d.", + "Cannot read a custom event record TSC delta field at offset " + "%" PRId64 ".", OffsetPtr); assert(OffsetPtr > BeginOffset && @@ -204,8 +212,8 @@ Error 
RecordInitializer::visit(CustomEventRecordV5 &R) { if (!E.isValidOffsetForDataOfSize(OffsetPtr, R.Size)) return createStringError( std::make_error_code(std::errc::bad_address), - "Cannot read %d bytes of custom event data from offset %d.", R.Size, - OffsetPtr); + "Cannot read %d bytes of custom event data from offset %" PRId64 ".", + R.Size, OffsetPtr); std::vector Buffer; Buffer.resize(R.Size); @@ -213,15 +221,15 @@ Error RecordInitializer::visit(CustomEventRecordV5 &R) { if (E.getU8(&OffsetPtr, Buffer.data(), R.Size) != Buffer.data()) return createStringError( std::make_error_code(std::errc::invalid_argument), - "Failed reading data into buffer of size %d at offset %d.", R.Size, - OffsetPtr); + "Failed reading data into buffer of size %d at offset %" PRId64 ".", + R.Size, OffsetPtr); assert(OffsetPtr >= PreReadOffset); if (OffsetPtr - PreReadOffset != static_cast(R.Size)) return createStringError( std::make_error_code(std::errc::invalid_argument), - "Failed reading enough bytes for the custom event payload -- read %d " - "expecting %d bytes at offset %d.", + "Failed reading enough bytes for the custom event payload -- read " + "%" PRId64 " expecting %d bytes at offset %" PRId64 ".", OffsetPtr - PreReadOffset, R.Size, PreReadOffset); R.Data.assign(Buffer.begin(), Buffer.end()); @@ -231,9 +239,9 @@ Error RecordInitializer::visit(CustomEventRecordV5 &R) { Error RecordInitializer::visit(TypedEventRecord &R) { if (!E.isValidOffsetForDataOfSize(OffsetPtr, MetadataRecord::kMetadataBodySize)) - return createStringError(std::make_error_code(std::errc::bad_address), - "Invalid offset for a typed event record (%d).", - OffsetPtr); + return createStringError( + std::make_error_code(std::errc::bad_address), + "Invalid offset for a typed event record (%" PRId64 ").", OffsetPtr); auto BeginOffset = OffsetPtr; auto PreReadOffset = OffsetPtr; @@ -242,20 +250,22 @@ Error RecordInitializer::visit(TypedEventRecord &R) { if (PreReadOffset == OffsetPtr) return createStringError( std::make_error_code(std::errc::invalid_argument), - "Cannot read a typed event record size field offset %d.", OffsetPtr); + "Cannot read a typed event record size field offset %" PRId64 ".", + OffsetPtr); if (R.Size <= 0) return createStringError( std::make_error_code(std::errc::bad_address), - "Invalid size for typed event (size = %d) at offset %d.", R.Size, - OffsetPtr); + "Invalid size for typed event (size = %d) at offset %" PRId64 ".", + R.Size, OffsetPtr); PreReadOffset = OffsetPtr; R.Delta = E.getSigned(&OffsetPtr, sizeof(int32_t)); if (PreReadOffset == OffsetPtr) return createStringError( std::make_error_code(std::errc::invalid_argument), - "Cannot read a typed event record TSC delta field at offset %d.", + "Cannot read a typed event record TSC delta field at offset " + "%" PRId64 ".", OffsetPtr); PreReadOffset = OffsetPtr; @@ -263,7 +273,8 @@ Error RecordInitializer::visit(TypedEventRecord &R) { if (PreReadOffset == OffsetPtr) return createStringError( std::make_error_code(std::errc::invalid_argument), - "Cannot read a typed event record type field at offset %d.", OffsetPtr); + "Cannot read a typed event record type field at offset %" PRId64 ".", + OffsetPtr); assert(OffsetPtr > BeginOffset && OffsetPtr - BeginOffset <= MetadataRecord::kMetadataBodySize); @@ -273,8 +284,8 @@ Error RecordInitializer::visit(TypedEventRecord &R) { if (!E.isValidOffsetForDataOfSize(OffsetPtr, R.Size)) return createStringError( std::make_error_code(std::errc::bad_address), - "Cannot read %d bytes of custom event data from offset %d.", R.Size, - 
OffsetPtr); + "Cannot read %d bytes of custom event data from offset %" PRId64 ".", + R.Size, OffsetPtr); std::vector Buffer; Buffer.resize(R.Size); @@ -282,15 +293,15 @@ Error RecordInitializer::visit(TypedEventRecord &R) { if (E.getU8(&OffsetPtr, Buffer.data(), R.Size) != Buffer.data()) return createStringError( std::make_error_code(std::errc::invalid_argument), - "Failed reading data into buffer of size %d at offset %d.", R.Size, - OffsetPtr); + "Failed reading data into buffer of size %d at offset %" PRId64 ".", + R.Size, OffsetPtr); assert(OffsetPtr >= PreReadOffset); if (OffsetPtr - PreReadOffset != static_cast(R.Size)) return createStringError( std::make_error_code(std::errc::invalid_argument), - "Failed reading enough bytes for the typed event payload -- read %d " - "expecting %d bytes at offset %d.", + "Failed reading enough bytes for the typed event payload -- read " + "%" PRId64 " expecting %d bytes at offset %" PRId64 ".", OffsetPtr - PreReadOffset, R.Size, PreReadOffset); R.Data.assign(Buffer.begin(), Buffer.end()); @@ -300,16 +311,17 @@ Error RecordInitializer::visit(TypedEventRecord &R) { Error RecordInitializer::visit(CallArgRecord &R) { if (!E.isValidOffsetForDataOfSize(OffsetPtr, MetadataRecord::kMetadataBodySize)) - return createStringError(std::make_error_code(std::errc::bad_address), - "Invalid offset for a call argument record (%d).", - OffsetPtr); + return createStringError( + std::make_error_code(std::errc::bad_address), + "Invalid offset for a call argument record (%" PRId64 ").", + OffsetPtr); auto PreReadOffset = OffsetPtr; R.Arg = E.getU64(&OffsetPtr); if (PreReadOffset == OffsetPtr) - return createStringError(std::make_error_code(std::errc::invalid_argument), - "Cannot read a call arg record at offset %d.", - OffsetPtr); + return createStringError( + std::make_error_code(std::errc::invalid_argument), + "Cannot read a call arg record at offset %" PRId64 ".", OffsetPtr); OffsetPtr += MetadataRecord::kMetadataBodySize - (OffsetPtr - PreReadOffset); return Error::success(); @@ -318,16 +330,16 @@ Error RecordInitializer::visit(CallArgRecord &R) { Error RecordInitializer::visit(PIDRecord &R) { if (!E.isValidOffsetForDataOfSize(OffsetPtr, MetadataRecord::kMetadataBodySize)) - return createStringError(std::make_error_code(std::errc::bad_address), - "Invalid offset for a process ID record (%d).", - OffsetPtr); + return createStringError( + std::make_error_code(std::errc::bad_address), + "Invalid offset for a process ID record (%" PRId64 ").", OffsetPtr); auto PreReadOffset = OffsetPtr; R.PID = E.getSigned(&OffsetPtr, 4); if (PreReadOffset == OffsetPtr) - return createStringError(std::make_error_code(std::errc::invalid_argument), - "Cannot read a process ID record at offset %d.", - OffsetPtr); + return createStringError( + std::make_error_code(std::errc::invalid_argument), + "Cannot read a process ID record at offset %" PRId64 ".", OffsetPtr); OffsetPtr += MetadataRecord::kMetadataBodySize - (OffsetPtr - PreReadOffset); return Error::success(); @@ -336,16 +348,16 @@ Error RecordInitializer::visit(PIDRecord &R) { Error RecordInitializer::visit(NewBufferRecord &R) { if (!E.isValidOffsetForDataOfSize(OffsetPtr, MetadataRecord::kMetadataBodySize)) - return createStringError(std::make_error_code(std::errc::bad_address), - "Invalid offset for a new buffer record (%d).", - OffsetPtr); + return createStringError( + std::make_error_code(std::errc::bad_address), + "Invalid offset for a new buffer record (%" PRId64 ").", OffsetPtr); auto PreReadOffset = OffsetPtr; R.TID = 
E.getSigned(&OffsetPtr, sizeof(int32_t)); if (PreReadOffset == OffsetPtr) - return createStringError(std::make_error_code(std::errc::invalid_argument), - "Cannot read a new buffer record at offset %d.", - OffsetPtr); + return createStringError( + std::make_error_code(std::errc::invalid_argument), + "Cannot read a new buffer record at offset %" PRId64 ".", OffsetPtr); OffsetPtr += MetadataRecord::kMetadataBodySize - (OffsetPtr - PreReadOffset); return Error::success(); @@ -354,9 +366,10 @@ Error RecordInitializer::visit(NewBufferRecord &R) { Error RecordInitializer::visit(EndBufferRecord &R) { if (!E.isValidOffsetForDataOfSize(OffsetPtr, MetadataRecord::kMetadataBodySize)) - return createStringError(std::make_error_code(std::errc::bad_address), - "Invalid offset for an end-of-buffer record (%d).", - OffsetPtr); + return createStringError( + std::make_error_code(std::errc::bad_address), + "Invalid offset for an end-of-buffer record (%" PRId64 ").", + OffsetPtr); OffsetPtr += MetadataRecord::kMetadataBodySize; return Error::success(); @@ -373,17 +386,17 @@ Error RecordInitializer::visit(FunctionRecord &R) { // if (OffsetPtr == 0 || !E.isValidOffsetForDataOfSize( --OffsetPtr, FunctionRecord::kFunctionRecordSize)) - return createStringError(std::make_error_code(std::errc::bad_address), - "Invalid offset for a function record (%d).", - OffsetPtr); + return createStringError( + std::make_error_code(std::errc::bad_address), + "Invalid offset for a function record (%" PRId64 ").", OffsetPtr); auto BeginOffset = OffsetPtr; auto PreReadOffset = BeginOffset; uint32_t Buffer = E.getU32(&OffsetPtr); if (PreReadOffset == OffsetPtr) - return createStringError(std::make_error_code(std::errc::bad_address), - "Cannot read function id field from offset %d.", - OffsetPtr); + return createStringError( + std::make_error_code(std::errc::bad_address), + "Cannot read function id field from offset %" PRId64 ".", OffsetPtr); // To get the function record type, we shift the buffer one to the right // (truncating the function record indicator) then take the three bits @@ -397,18 +410,19 @@ Error RecordInitializer::visit(FunctionRecord &R) { R.Kind = static_cast(FunctionType); break; default: - return createStringError(std::make_error_code(std::errc::invalid_argument), - "Unknown function record type '%d' at offset %d.", - FunctionType, BeginOffset); + return createStringError( + std::make_error_code(std::errc::invalid_argument), + "Unknown function record type '%d' at offset %" PRId64 ".", + FunctionType, BeginOffset); } R.FuncId = Buffer >> 4; PreReadOffset = OffsetPtr; R.Delta = E.getU32(&OffsetPtr); if (OffsetPtr == PreReadOffset) - return createStringError(std::make_error_code(std::errc::invalid_argument), - "Failed reading TSC delta from offset %d.", - OffsetPtr); + return createStringError( + std::make_error_code(std::errc::invalid_argument), + "Failed reading TSC delta from offset %" PRId64 ".", OffsetPtr); assert(FunctionRecord::kFunctionRecordSize == (OffsetPtr - BeginOffset)); return Error::success(); } diff --git a/lib/XRay/Trace.cpp b/lib/XRay/Trace.cpp index b9b67c561c66..4f107e1059cc 100644 --- a/lib/XRay/Trace.cpp +++ b/lib/XRay/Trace.cpp @@ -47,7 +47,7 @@ Error loadNaiveFormatLog(StringRef Data, bool IsLittleEndian, std::make_error_code(std::errc::invalid_argument)); DataExtractor Reader(Data, IsLittleEndian, 8); - uint32_t OffsetPtr = 0; + uint64_t OffsetPtr = 0; auto FileHeaderOrError = readBinaryFormatHeader(Reader, OffsetPtr); if (!FileHeaderOrError) return FileHeaderOrError.takeError(); @@ -67,13 
+67,14 @@ Error loadNaiveFormatLog(StringRef Data, bool IsLittleEndian, if (!Reader.isValidOffsetForDataOfSize(OffsetPtr, 32)) return createStringError( std::make_error_code(std::errc::executable_format_error), - "Not enough bytes to read a full record at offset %d.", OffsetPtr); + "Not enough bytes to read a full record at offset %" PRId64 ".", + OffsetPtr); auto PreReadOffset = OffsetPtr; auto RecordType = Reader.getU16(&OffsetPtr); if (OffsetPtr == PreReadOffset) return createStringError( std::make_error_code(std::errc::executable_format_error), - "Failed reading record type at offset %d.", OffsetPtr); + "Failed reading record type at offset %" PRId64 ".", OffsetPtr); switch (RecordType) { case 0: { // Normal records. @@ -86,14 +87,15 @@ Error loadNaiveFormatLog(StringRef Data, bool IsLittleEndian, if (OffsetPtr == PreReadOffset) return createStringError( std::make_error_code(std::errc::executable_format_error), - "Failed reading CPU field at offset %d.", OffsetPtr); + "Failed reading CPU field at offset %" PRId64 ".", OffsetPtr); PreReadOffset = OffsetPtr; auto Type = Reader.getU8(&OffsetPtr); if (OffsetPtr == PreReadOffset) return createStringError( std::make_error_code(std::errc::executable_format_error), - "Failed reading record type field at offset %d.", OffsetPtr); + "Failed reading record type field at offset %" PRId64 ".", + OffsetPtr); switch (Type) { case 0: @@ -111,7 +113,7 @@ Error loadNaiveFormatLog(StringRef Data, bool IsLittleEndian, default: return createStringError( std::make_error_code(std::errc::executable_format_error), - "Unknown record type '%d' at offset %d.", Type, OffsetPtr); + "Unknown record type '%d' at offset %" PRId64 ".", Type, OffsetPtr); } PreReadOffset = OffsetPtr; @@ -119,28 +121,29 @@ Error loadNaiveFormatLog(StringRef Data, bool IsLittleEndian, if (OffsetPtr == PreReadOffset) return createStringError( std::make_error_code(std::errc::executable_format_error), - "Failed reading function id field at offset %d.", OffsetPtr); + "Failed reading function id field at offset %" PRId64 ".", + OffsetPtr); PreReadOffset = OffsetPtr; Record.TSC = Reader.getU64(&OffsetPtr); if (OffsetPtr == PreReadOffset) return createStringError( std::make_error_code(std::errc::executable_format_error), - "Failed reading TSC field at offset %d.", OffsetPtr); + "Failed reading TSC field at offset %" PRId64 ".", OffsetPtr); PreReadOffset = OffsetPtr; Record.TId = Reader.getU32(&OffsetPtr); if (OffsetPtr == PreReadOffset) return createStringError( std::make_error_code(std::errc::executable_format_error), - "Failed reading thread id field at offset %d.", OffsetPtr); + "Failed reading thread id field at offset %" PRId64 ".", OffsetPtr); PreReadOffset = OffsetPtr; Record.PId = Reader.getU32(&OffsetPtr); if (OffsetPtr == PreReadOffset) return createStringError( std::make_error_code(std::errc::executable_format_error), - "Failed reading process id at offset %d.", OffsetPtr); + "Failed reading process id at offset %" PRId64 ".", OffsetPtr); break; } @@ -155,21 +158,23 @@ Error loadNaiveFormatLog(StringRef Data, bool IsLittleEndian, if (OffsetPtr == PreReadOffset) return createStringError( std::make_error_code(std::errc::executable_format_error), - "Failed reading function id field at offset %d.", OffsetPtr); + "Failed reading function id field at offset %" PRId64 ".", + OffsetPtr); PreReadOffset = OffsetPtr; auto TId = Reader.getU32(&OffsetPtr); if (OffsetPtr == PreReadOffset) return createStringError( std::make_error_code(std::errc::executable_format_error), - "Failed reading thread id 
field at offset %d.", OffsetPtr); + "Failed reading thread id field at offset %" PRId64 ".", OffsetPtr); PreReadOffset = OffsetPtr; auto PId = Reader.getU32(&OffsetPtr); if (OffsetPtr == PreReadOffset) return createStringError( std::make_error_code(std::errc::executable_format_error), - "Failed reading process id field at offset %d.", OffsetPtr); + "Failed reading process id field at offset %" PRId64 ".", + OffsetPtr); // Make a check for versions above 3 for the Pid field if (Record.FuncId != FuncId || Record.TId != TId || @@ -178,7 +183,7 @@ Error loadNaiveFormatLog(StringRef Data, bool IsLittleEndian, std::make_error_code(std::errc::executable_format_error), "Corrupted log, found arg payload following non-matching " "function+thread record. Record for function %d != %d at offset " - "%d", + "%" PRId64 ".", Record.FuncId, FuncId, OffsetPtr); PreReadOffset = OffsetPtr; @@ -186,7 +191,8 @@ Error loadNaiveFormatLog(StringRef Data, bool IsLittleEndian, if (OffsetPtr == PreReadOffset) return createStringError( std::make_error_code(std::errc::executable_format_error), - "Failed reading argument payload at offset %d.", OffsetPtr); + "Failed reading argument payload at offset %" PRId64 ".", + OffsetPtr); Record.CallArgs.push_back(Arg); break; @@ -194,7 +200,8 @@ Error loadNaiveFormatLog(StringRef Data, bool IsLittleEndian, default: return createStringError( std::make_error_code(std::errc::executable_format_error), - "Unknown record type '%d' at offset %d.", RecordType, OffsetPtr); + "Unknown record type '%d' at offset %" PRId64 ".", RecordType, + OffsetPtr); } // Advance the offset pointer enough bytes to align to 32-byte records for // basic mode logs. @@ -265,7 +272,7 @@ Error loadFDRLog(StringRef Data, bool IsLittleEndian, "Not enough bytes for an XRay FDR log."); DataExtractor DE(Data, IsLittleEndian, 8); - uint32_t OffsetPtr = 0; + uint64_t OffsetPtr = 0; auto FileHeaderOrError = readBinaryFormatHeader(DE, OffsetPtr); if (!FileHeaderOrError) return FileHeaderOrError.takeError(); @@ -424,7 +431,7 @@ Expected llvm::xray::loadTrace(const DataExtractor &DE, bool Sort) { // Only if we can't load either the binary or the YAML format will we yield an // error. DataExtractor HeaderExtractor(DE.getData(), DE.isLittleEndian(), 8); - uint32_t OffsetPtr = 0; + uint64_t OffsetPtr = 0; uint16_t Version = HeaderExtractor.getU16(&OffsetPtr); uint16_t Type = HeaderExtractor.getU16(&OffsetPtr); diff --git a/tools/bugpoint/BugDriver.h b/tools/bugpoint/BugDriver.h index 75f166b21b2c..fe5201eb2e6c 100644 --- a/tools/bugpoint/BugDriver.h +++ b/tools/bugpoint/BugDriver.h @@ -217,8 +217,7 @@ public: /// returning the transformed module on success, or a null pointer on failure. std::unique_ptr runPassesOn(Module *M, const std::vector &Passes, - unsigned NumExtraArgs = 0, - const char *const *ExtraArgs = nullptr); + ArrayRef ExtraArgs = {}); /// runPasses - Run the specified passes on Program, outputting a bitcode /// file and writting the filename into OutputFile if successful. 
If the @@ -231,8 +230,8 @@ public: /// bool runPasses(Module &Program, const std::vector &PassesToRun, std::string &OutputFilename, bool DeleteOutput = false, - bool Quiet = false, unsigned NumExtraArgs = 0, - const char *const *ExtraArgs = nullptr) const; + bool Quiet = false, + ArrayRef ExtraArgs = {}) const; /// runPasses - Just like the method above, but this just returns true or /// false indicating whether or not the optimizer crashed on the specified diff --git a/tools/bugpoint/ExtractFunction.cpp b/tools/bugpoint/ExtractFunction.cpp index 105702de3f1d..d9047acd30e1 100644 --- a/tools/bugpoint/ExtractFunction.cpp +++ b/tools/bugpoint/ExtractFunction.cpp @@ -407,11 +407,10 @@ BugDriver::extractMappedBlocksFromModule(const std::vector &BBs, std::string uniqueFN = "--extract-blocks-file="; uniqueFN += Temp->TmpName; - const char *ExtraArg = uniqueFN.c_str(); std::vector PI; PI.push_back("extract-blocks"); - std::unique_ptr Ret = runPassesOn(M, PI, 1, &ExtraArg); + std::unique_ptr Ret = runPassesOn(M, PI, {uniqueFN}); if (!Ret) { outs() << "*** Basic Block extraction failed, please report a bug!\n"; diff --git a/tools/bugpoint/OptimizerDriver.cpp b/tools/bugpoint/OptimizerDriver.cpp index 562de7952388..64af81fcc8a1 100644 --- a/tools/bugpoint/OptimizerDriver.cpp +++ b/tools/bugpoint/OptimizerDriver.cpp @@ -79,7 +79,7 @@ bool BugDriver::writeProgramToFile(int FD, const Module &M) const { bool BugDriver::writeProgramToFile(const std::string &Filename, const Module &M) const { std::error_code EC; - ToolOutputFile Out(Filename, EC, sys::fs::F_None); + ToolOutputFile Out(Filename, EC, sys::fs::OF_None); if (!EC) return writeProgramToFileAux(Out, M); return true; @@ -130,8 +130,7 @@ static cl::list OptArgs("opt-args", cl::Positional, bool BugDriver::runPasses(Module &Program, const std::vector &Passes, std::string &OutputFilename, bool DeleteOutput, - bool Quiet, unsigned NumExtraArgs, - const char *const *ExtraArgs) const { + bool Quiet, ArrayRef ExtraArgs) const { // setup the output file name outs().flush(); SmallString<128> UniqueFilename; @@ -223,8 +222,7 @@ bool BugDriver::runPasses(Module &Program, I != E; ++I) Args.push_back(I->c_str()); Args.push_back(Temp->TmpName.c_str()); - for (unsigned i = 0; i < NumExtraArgs; ++i) - Args.push_back(*ExtraArgs); + Args.append(ExtraArgs.begin(), ExtraArgs.end()); LLVM_DEBUG(errs() << "\nAbout to run:\t"; for (unsigned i = 0, e = Args.size() - 1; i != e; ++i) errs() @@ -268,10 +266,10 @@ bool BugDriver::runPasses(Module &Program, std::unique_ptr BugDriver::runPassesOn(Module *M, const std::vector &Passes, - unsigned NumExtraArgs, const char *const *ExtraArgs) { + ArrayRef ExtraArgs) { std::string BitcodeResult; if (runPasses(*M, Passes, BitcodeResult, false /*delete*/, true /*quiet*/, - NumExtraArgs, ExtraArgs)) { + ExtraArgs)) { return nullptr; } diff --git a/tools/bugpoint/ToolRunner.cpp b/tools/bugpoint/ToolRunner.cpp index da4244345e3b..19b2ea2c0181 100644 --- a/tools/bugpoint/ToolRunner.cpp +++ b/tools/bugpoint/ToolRunner.cpp @@ -170,7 +170,7 @@ Expected LLI::ExecuteProgram(const std::string &Bitcode, const std::vector &SharedLibs, unsigned Timeout, unsigned MemoryLimit) { std::vector LLIArgs; - LLIArgs.push_back(LLIPath.c_str()); + LLIArgs.push_back(LLIPath); LLIArgs.push_back("-force-interpreter=true"); for (std::vector::const_iterator i = SharedLibs.begin(), @@ -266,15 +266,15 @@ Error CustomCompiler::compileProgram(const std::string &Bitcode, unsigned Timeout, unsigned MemoryLimit) { std::vector ProgramArgs; - 
ProgramArgs.push_back(CompilerCommand.c_str()); + ProgramArgs.push_back(CompilerCommand); - for (std::size_t i = 0; i < CompilerArgs.size(); ++i) - ProgramArgs.push_back(CompilerArgs.at(i).c_str()); + for (const auto &Arg : CompilerArgs) + ProgramArgs.push_back(Arg); ProgramArgs.push_back(Bitcode); // Add optional parameters to the running program from Argv - for (unsigned i = 0, e = CompilerArgs.size(); i != e; ++i) - ProgramArgs.push_back(CompilerArgs[i].c_str()); + for (const auto &Arg : CompilerArgs) + ProgramArgs.push_back(Arg); if (RunProgramWithTimeout(CompilerCommand, ProgramArgs, "", "", "", Timeout, MemoryLimit)) @@ -559,7 +559,7 @@ Expected JIT::ExecuteProgram(const std::string &Bitcode, unsigned Timeout, unsigned MemoryLimit) { // Construct a vector of parameters, incorporating those from the command-line std::vector JITArgs; - JITArgs.push_back(LLIPath.c_str()); + JITArgs.push_back(LLIPath); JITArgs.push_back("-force-interpreter=false"); // Add any extra LLI args. @@ -570,7 +570,7 @@ Expected JIT::ExecuteProgram(const std::string &Bitcode, JITArgs.push_back("-load"); JITArgs.push_back(SharedLibs[i]); } - JITArgs.push_back(Bitcode.c_str()); + JITArgs.push_back(Bitcode); // Add optional parameters to the running program from Argv for (unsigned i = 0, e = Args.size(); i != e; ++i) JITArgs.push_back(Args[i]); diff --git a/tools/bugpoint/bugpoint.cpp b/tools/bugpoint/bugpoint.cpp index 2d5322a351ad..c7644e75ae4b 100644 --- a/tools/bugpoint/bugpoint.cpp +++ b/tools/bugpoint/bugpoint.cpp @@ -80,6 +80,10 @@ static cl::opt OptLevelOs( cl::desc( "Like -O2 with extra optimizations for size. Similar to clang -Os")); +static cl::opt +OptLevelOz("Oz", + cl::desc("Like -Os but reduces code size further. Similar to clang -Oz")); + static cl::opt OptLevelO3("O3", cl::desc("Optimization level 3. Identical to 'opt -O3'")); @@ -109,6 +113,26 @@ public: }; } +// This routine adds optimization passes based on selected optimization level, +// OptLevel. +// +// OptLevel - Optimization Level +static void AddOptimizationPasses(legacy::FunctionPassManager &FPM, + unsigned OptLevel, + unsigned SizeLevel) { + PassManagerBuilder Builder; + Builder.OptLevel = OptLevel; + Builder.SizeLevel = SizeLevel; + + if (OptLevel > 1) + Builder.Inliner = createFunctionInliningPass(OptLevel, SizeLevel, false); + else + Builder.Inliner = createAlwaysInlinerLegacyPass(); + + Builder.populateFunctionPassManager(FPM); + Builder.populateModulePassManager(FPM); +} + #ifdef LINK_POLLY_INTO_TOOLS namespace polly { void initializePollyPasses(llvm::PassRegistry &Registry); @@ -189,18 +213,16 @@ int main(int argc, char **argv) { Builder.populateLTOPassManager(PM); } - if (OptLevelO1 || OptLevelO2 || OptLevelO3) { - PassManagerBuilder Builder; - if (OptLevelO1) - Builder.Inliner = createAlwaysInlinerLegacyPass(); - else if (OptLevelOs || OptLevelO2) - Builder.Inliner = createFunctionInliningPass( - 2, OptLevelOs ? 
1 : 0, false); - else - Builder.Inliner = createFunctionInliningPass(275); - Builder.populateFunctionPassManager(PM); - Builder.populateModulePassManager(PM); - } + if (OptLevelO1) + AddOptimizationPasses(PM, 1, 0); + else if (OptLevelO2) + AddOptimizationPasses(PM, 2, 0); + else if (OptLevelO3) + AddOptimizationPasses(PM, 3, 0); + else if (OptLevelOs) + AddOptimizationPasses(PM, 2, 1); + else if (OptLevelOz) + AddOptimizationPasses(PM, 2, 2); for (const PassInfo *PI : PassList) D.addPass(PI->getPassArgument()); diff --git a/tools/llc/llc.cpp b/tools/llc/llc.cpp index 76da843f065e..574b15b399c3 100644 --- a/tools/llc/llc.cpp +++ b/tools/llc/llc.cpp @@ -239,10 +239,10 @@ static std::unique_ptr GetOutputStream(const char *TargetName, // Open the file. std::error_code EC; - sys::fs::OpenFlags OpenFlags = sys::fs::F_None; + sys::fs::OpenFlags OpenFlags = sys::fs::OF_None; if (!Binary) - OpenFlags |= sys::fs::F_Text; - auto FDOut = llvm::make_unique(OutputFilename, EC, OpenFlags); + OpenFlags |= sys::fs::OF_Text; + auto FDOut = std::make_unique(OutputFilename, EC, OpenFlags); if (EC) { WithColor::error() << EC.message() << '\n'; return nullptr; @@ -329,7 +329,7 @@ int main(int argc, char **argv) { // Set a diagnostic handler that doesn't exit on the first error bool HasError = false; Context.setDiagnosticHandler( - llvm::make_unique(&HasError)); + std::make_unique(&HasError)); Context.setInlineAsmDiagnosticHandler(InlineAsmDiagHandler, &HasError); Expected> RemarksFileOrErr = @@ -479,8 +479,8 @@ static int compileModule(char **argv, LLVMContext &Context) { std::unique_ptr DwoOut; if (!SplitDwarfOutputFile.empty()) { std::error_code EC; - DwoOut = llvm::make_unique(SplitDwarfOutputFile, EC, - sys::fs::F_None); + DwoOut = std::make_unique(SplitDwarfOutputFile, EC, + sys::fs::OF_None); if (EC) { WithColor::error(errs(), argv[0]) << EC.message() << '\n'; return 1; @@ -533,13 +533,14 @@ static int compileModule(char **argv, LLVMContext &Context) { if ((FileType != TargetMachine::CGFT_AssemblyFile && !Out->os().supportsSeeking()) || CompileTwice) { - BOS = make_unique(Buffer); + BOS = std::make_unique(Buffer); OS = BOS.get(); } const char *argv0 = argv[0]; - LLVMTargetMachine &LLVMTM = static_cast(*Target); - MachineModuleInfo *MMI = new MachineModuleInfo(&LLVMTM); + LLVMTargetMachine &LLVMTM = static_cast(*Target); + MachineModuleInfoWrapperPass *MMIWP = + new MachineModuleInfoWrapperPass(&LLVMTM); // Construct a custom pass pipeline that starts after instruction // selection. @@ -559,7 +560,7 @@ static int compileModule(char **argv, LLVMContext &Context) { TPC.setDisableVerify(NoVerify); PM.add(&TPC); - PM.add(MMI); + PM.add(MMIWP); TPC.printAndVerify(""); for (const std::string &RunPassName : *RunPassNames) { if (addPass(PM, argv0, RunPassName, TPC)) @@ -570,7 +571,7 @@ static int compileModule(char **argv, LLVMContext &Context) { PM.add(createFreeMachineFunctionPass()); } else if (Target->addPassesToEmitFile(PM, *OS, DwoOut ? 
&DwoOut->os() : nullptr, - FileType, NoVerify, MMI)) { + FileType, NoVerify, MMIWP)) { WithColor::warning(errs(), argv[0]) << "target does not support generation of this" << " file type!\n"; @@ -578,8 +579,8 @@ static int compileModule(char **argv, LLVMContext &Context) { } if (MIR) { - assert(MMI && "Forgot to create MMI?"); - if (MIR->parseMachineFunctions(*M, *MMI)) + assert(MMIWP && "Forgot to create MMIWP?"); + if (MIR->parseMachineFunctions(*M, MMIWP->getMMI())) return 1; } diff --git a/tools/lli/lli.cpp b/tools/lli/lli.cpp index 8c8cd88c9711..ccad06721414 100644 --- a/tools/lli/lli.cpp +++ b/tools/lli/lli.cpp @@ -251,7 +251,7 @@ public: sys::fs::create_directories(Twine(dir)); } std::error_code EC; - raw_fd_ostream outfile(CacheName, EC, sys::fs::F_None); + raw_fd_ostream outfile(CacheName, EC, sys::fs::OF_None); outfile.write(Obj.getBufferStart(), Obj.getBufferSize()); outfile.close(); } @@ -308,7 +308,7 @@ static void addCygMingExtraModule(ExecutionEngine &EE, LLVMContext &Context, Triple TargetTriple(TargetTripleStr); // Create a new module. - std::unique_ptr M = make_unique("CygMingHelper", Context); + std::unique_ptr M = std::make_unique("CygMingHelper", Context); M->setTargetTriple(TargetTripleStr); // Create an empty function named "__main". @@ -695,18 +695,16 @@ int main(int argc, char **argv, char * const *envp) { return Result; } -static orc::IRTransformLayer::TransformFunction createDebugDumper() { +static std::function createDebugDumper() { switch (OrcDumpKind) { case DumpKind::NoDump: - return [](orc::ThreadSafeModule TSM, - const orc::MaterializationResponsibility &R) { return TSM; }; + return [](Module &M) {}; case DumpKind::DumpFuncsToStdOut: - return [](orc::ThreadSafeModule TSM, - const orc::MaterializationResponsibility &R) { + return [](Module &M) { printf("[ "); - for (const auto &F : *TSM.getModule()) { + for (const auto &F : M) { if (F.isDeclaration()) continue; @@ -718,31 +716,23 @@ static orc::IRTransformLayer::TransformFunction createDebugDumper() { } printf("]\n"); - return TSM; }; case DumpKind::DumpModsToStdOut: - return [](orc::ThreadSafeModule TSM, - const orc::MaterializationResponsibility &R) { - outs() << "----- Module Start -----\n" - << *TSM.getModule() << "----- Module End -----\n"; - - return TSM; + return [](Module &M) { + outs() << "----- Module Start -----\n" << M << "----- Module End -----\n"; }; case DumpKind::DumpModsToDisk: - return [](orc::ThreadSafeModule TSM, - const orc::MaterializationResponsibility &R) { + return [](Module &M) { std::error_code EC; - raw_fd_ostream Out(TSM.getModule()->getModuleIdentifier() + ".ll", EC, - sys::fs::F_Text); + raw_fd_ostream Out(M.getModuleIdentifier() + ".ll", EC, sys::fs::OF_Text); if (EC) { - errs() << "Couldn't open " << TSM.getModule()->getModuleIdentifier() + errs() << "Couldn't open " << M.getModuleIdentifier() << " for dumping.\nError:" << EC.message() << "\n"; exit(1); } - Out << *TSM.getModule(); - return TSM; + Out << M; }; } llvm_unreachable("Unknown DumpKind"); @@ -754,14 +744,13 @@ int runOrcLazyJIT(const char *ProgName) { // Start setting up the JIT environment. // Parse the main module. 
- orc::ThreadSafeContext TSCtx(llvm::make_unique()); + orc::ThreadSafeContext TSCtx(std::make_unique()); SMDiagnostic Err; - auto MainModule = orc::ThreadSafeModule( - parseIRFile(InputFile, Err, *TSCtx.getContext()), TSCtx); + auto MainModule = parseIRFile(InputFile, Err, *TSCtx.getContext()); if (!MainModule) reportError(Err, ProgName); - const auto &TT = MainModule.getModule()->getTargetTriple(); + const auto &TT = MainModule->getTargetTriple(); orc::LLLazyJITBuilder Builder; Builder.setJITTargetMachineBuilder( @@ -794,13 +783,16 @@ int runOrcLazyJIT(const char *ProgName) { J->setLazyCompileTransform([&](orc::ThreadSafeModule TSM, const orc::MaterializationResponsibility &R) { - if (verifyModule(*TSM.getModule(), &dbgs())) { - dbgs() << "Bad module: " << *TSM.getModule() << "\n"; - exit(1); - } - return Dump(std::move(TSM), R); + TSM.withModuleDo([&](Module &M) { + if (verifyModule(M, &dbgs())) { + dbgs() << "Bad module: " << &M << "\n"; + exit(1); + } + Dump(M); + }); + return TSM; }); - J->getMainJITDylib().setGenerator( + J->getMainJITDylib().addGenerator( ExitOnErr(orc::DynamicLibrarySearchGenerator::GetForCurrentProcess( J->getDataLayout().getGlobalPrefix()))); @@ -809,7 +801,8 @@ int runOrcLazyJIT(const char *ProgName) { ExitOnErr(CXXRuntimeOverrides.enable(J->getMainJITDylib(), Mangle)); // Add the main module. - ExitOnErr(J->addLazyIRModule(std::move(MainModule))); + ExitOnErr( + J->addLazyIRModule(orc::ThreadSafeModule(std::move(MainModule), TSCtx))); // Create JITDylibs and add any extra modules. { @@ -839,6 +832,16 @@ int runOrcLazyJIT(const char *ProgName) { ExitOnErr( J->addLazyIRModule(JD, orc::ThreadSafeModule(std::move(M), TSCtx))); } + + for (auto EAItr = ExtraArchives.begin(), EAEnd = ExtraArchives.end(); + EAItr != EAEnd; ++EAItr) { + auto EAIdx = ExtraArchives.getPosition(EAItr - ExtraArchives.begin()); + assert(EAIdx != 0 && "ExtraArchive should have index > 0"); + auto JDItr = std::prev(IdxToDylib.lower_bound(EAIdx)); + auto &JD = *JDItr->second; + JD.addGenerator(ExitOnErr(orc::StaticLibraryDefinitionGenerator::Load( + J->getObjLinkingLayer(), EAItr->c_str()))); + } } // Add the objects. @@ -959,6 +962,6 @@ std::unique_ptr launchRemote() { close(PipeFD[1][1]); // Return an RPC channel connected to our end of the pipes. - return llvm::make_unique(PipeFD[1][0], PipeFD[0][1]); + return std::make_unique(PipeFD[1][0], PipeFD[0][1]); #endif } diff --git a/tools/llvm-ar/llvm-ar.cpp b/tools/llvm-ar/llvm-ar.cpp index 91746d0fab37..c9cf217f7688 100644 --- a/tools/llvm-ar/llvm-ar.cpp +++ b/tools/llvm-ar/llvm-ar.cpp @@ -43,6 +43,11 @@ #include #endif +#ifdef _WIN32 +#define WIN32_LEAN_AND_MEAN +#include +#endif + using namespace llvm; // The name this program was invoked as. 
@@ -70,14 +75,14 @@ USAGE: llvm-ar [options] [-][modifiers] [relpos] [count] [f llvm-ar -M [ - Ignored for compatibility - --help - Display available options - --version - Display the version of this program + --plugin= - ignored for compatibility + -h --help - display this help and exit + --version - print the version and exit @ - read options from OPERATIONS: @@ -95,11 +100,13 @@ MODIFIERS: [b] - put [files] before [relpos] (same as [i]) [c] - do not warn if archive had to be created [D] - use zero for timestamps and uids/gids (default) + [h] - display this help and exit [i] - put [files] before [relpos] (same as [b]) [l] - ignored for compatibility [L] - add archive's contents [N] - use instance [count] of name [o] - preserve original dates + [O] - display member offsets [P] - use full names when matching (implied for thin archives) [s] - create an archive index (cf. ranlib) [S] - do not build a symbol table @@ -107,6 +114,7 @@ MODIFIERS: [u] - update only [files] newer than archive contents [U] - use actual timestamps and uids/gids [v] - be verbose about actions taken + [V] - display the version and exit )"; void printHelpMessage() { @@ -116,10 +124,19 @@ void printHelpMessage() { outs() << ArHelp; } +static unsigned MRILineNumber; +static bool ParsingMRIScript; + // Show the error message and exit. LLVM_ATTRIBUTE_NORETURN static void fail(Twine Error) { - WithColor::error(errs(), ToolName) << Error << ".\n"; - printHelpMessage(); + if (ParsingMRIScript) { + WithColor::error(errs(), ToolName) + << "script line " << MRILineNumber << ": " << Error << "\n"; + } else { + WithColor::error(errs(), ToolName) << Error << "\n"; + printHelpMessage(); + } + exit(1); } @@ -171,17 +188,18 @@ enum ArchiveOperation { }; // Modifiers to follow operation to vary behavior -static bool AddAfter = false; ///< 'a' modifier -static bool AddBefore = false; ///< 'b' modifier -static bool Create = false; ///< 'c' modifier -static bool OriginalDates = false; ///< 'o' modifier -static bool CompareFullPath = false; ///< 'P' modifier -static bool OnlyUpdate = false; ///< 'u' modifier -static bool Verbose = false; ///< 'v' modifier -static bool Symtab = true; ///< 's' modifier -static bool Deterministic = true; ///< 'D' and 'U' modifiers -static bool Thin = false; ///< 'T' modifier -static bool AddLibrary = false; ///< 'L' modifier +static bool AddAfter = false; ///< 'a' modifier +static bool AddBefore = false; ///< 'b' modifier +static bool Create = false; ///< 'c' modifier +static bool OriginalDates = false; ///< 'o' modifier +static bool DisplayMemberOffsets = false; ///< 'O' modifier +static bool CompareFullPath = false; ///< 'P' modifier +static bool OnlyUpdate = false; ///< 'u' modifier +static bool Verbose = false; ///< 'v' modifier +static bool Symtab = true; ///< 's' modifier +static bool Deterministic = true; ///< 'D' and 'U' modifiers +static bool Thin = false; ///< 'T' modifier +static bool AddLibrary = false; ///< 'L' modifier // Relative Positional Argument (for insert/move). This variable holds // the name of the archive member to which the 'a', 'b' or 'i' modifier @@ -198,6 +216,9 @@ static int CountParam = 0; // command line. static std::string ArchiveName; +static std::vector> ArchiveBuffers; +static std::vector> Archives; + // This variable holds the list of member files to proecess, as given // on the command line. 
static std::vector Members; @@ -209,7 +230,7 @@ static BumpPtrAllocator Alloc; // associated with a, b, and i modifiers static void getRelPos() { if (PositionalArgs.empty()) - fail("Expected [relpos] for a, b, or i modifier"); + fail("expected [relpos] for 'a', 'b', or 'i' modifier"); RelPos = PositionalArgs[0]; PositionalArgs.erase(PositionalArgs.begin()); } @@ -218,40 +239,31 @@ static void getRelPos() { // associated with the N modifier static void getCountParam() { if (PositionalArgs.empty()) - fail("Expected [count] for N modifier"); + fail("expected [count] for 'N' modifier"); auto CountParamArg = StringRef(PositionalArgs[0]); if (CountParamArg.getAsInteger(10, CountParam)) - fail("Value for [count] must be numeric, got: " + CountParamArg); + fail("value for [count] must be numeric, got: " + CountParamArg); if (CountParam < 1) - fail("Value for [count] must be positive, got: " + CountParamArg); + fail("value for [count] must be positive, got: " + CountParamArg); PositionalArgs.erase(PositionalArgs.begin()); } // Get the archive file name from the command line static void getArchive() { if (PositionalArgs.empty()) - fail("An archive name must be specified"); + fail("an archive name must be specified"); ArchiveName = PositionalArgs[0]; PositionalArgs.erase(PositionalArgs.begin()); } -// Copy over remaining items in PositionalArgs to our Members vector -static void getMembers() { - for (auto &Arg : PositionalArgs) - Members.push_back(Arg); -} - -std::vector> ArchiveBuffers; -std::vector> Archives; - static object::Archive &readLibrary(const Twine &Library) { auto BufOrErr = MemoryBuffer::getFile(Library, -1, false); - failIfError(BufOrErr.getError(), "Could not open library " + Library); + failIfError(BufOrErr.getError(), "could not open library " + Library); ArchiveBuffers.push_back(std::move(*BufOrErr)); auto LibOrErr = object::Archive::create(ArchiveBuffers.back()->getMemBufferRef()); failIfError(errorToErrorCode(LibOrErr.takeError()), - "Could not parse library"); + "could not parse library"); Archives.push_back(std::move(*LibOrErr)); return *Archives.back(); } @@ -264,7 +276,7 @@ static void runMRIScript(); static ArchiveOperation parseCommandLine() { if (MRI) { if (!PositionalArgs.empty() || !Options.empty()) - fail("Cannot mix -M and other options"); + fail("cannot mix -M and other options"); runMRIScript(); } @@ -319,6 +331,9 @@ static ArchiveOperation parseCommandLine() { case 'o': OriginalDates = true; break; + case 'O': + DisplayMemberOffsets = true; + break; case 'P': CompareFullPath = true; break; @@ -367,6 +382,12 @@ static ArchiveOperation parseCommandLine() { case 'L': AddLibrary = true; break; + case 'V': + cl::PrintVersionMessage(); + exit(0); + case 'h': + printHelpMessage(); + exit(0); default: fail(std::string("unknown option ") + Options[i]); } @@ -377,37 +398,37 @@ static ArchiveOperation parseCommandLine() { getArchive(); // Everything on the command line at this point is a member. - getMembers(); + Members.assign(PositionalArgs.begin(), PositionalArgs.end()); if (NumOperations == 0 && MaybeJustCreateSymTab) { NumOperations = 1; Operation = CreateSymTab; if (!Members.empty()) - fail("The s operation takes only an archive as argument"); + fail("the 's' operation takes only an archive as argument"); } // Perform various checks on the operation/modifier specification // to make sure we are dealing with a legal request. 
if (NumOperations == 0) - fail("You must specify at least one of the operations"); + fail("you must specify at least one of the operations"); if (NumOperations > 1) - fail("Only one operation may be specified"); + fail("only one operation may be specified"); if (NumPositional > 1) - fail("You may only specify one of a, b, and i modifiers"); + fail("you may only specify one of 'a', 'b', and 'i' modifiers"); if (AddAfter || AddBefore) if (Operation != Move && Operation != ReplaceOrInsert) - fail("The 'a', 'b' and 'i' modifiers can only be specified with " + fail("the 'a', 'b' and 'i' modifiers can only be specified with " "the 'm' or 'r' operations"); if (CountParam) if (Operation != Extract && Operation != Delete) - fail("The 'N' modifier can only be specified with the 'x' or 'd' " + fail("the 'N' modifier can only be specified with the 'x' or 'd' " "operations"); if (OriginalDates && Operation != Extract) - fail("The 'o' modifier is only applicable to the 'x' operation"); + fail("the 'o' modifier is only applicable to the 'x' operation"); if (OnlyUpdate && Operation != ReplaceOrInsert) - fail("The 'u' modifier is only applicable to the 'r' operation"); + fail("the 'u' modifier is only applicable to the 'r' operation"); if (AddLibrary && Operation != QuickAppend) - fail("The 'L' modifier is only applicable to the 'q' operation"); + fail("the 'L' modifier is only applicable to the 'q' operation"); // Return the parsed operation to the caller return Operation; @@ -470,12 +491,35 @@ static void doDisplayTable(StringRef Name, const object::Archive::Child &C) { if (!ParentDir.empty()) outs() << sys::path::convert_to_slash(ParentDir) << '/'; } + outs() << Name; + } else { + outs() << Name; + if (DisplayMemberOffsets) + outs() << " 0x" << utohexstr(C.getDataOffset(), true); } - outs() << Name << "\n"; + outs() << '\n'; } -static StringRef normalizePath(StringRef Path) { - return CompareFullPath ? Path : sys::path::filename(Path); +static std::string normalizePath(StringRef Path) { + return CompareFullPath ? sys::path::convert_to_slash(Path) + : std::string(sys::path::filename(Path)); +} + +static bool comparePaths(StringRef Path1, StringRef Path2) { +// When on Windows this function calls CompareStringOrdinal +// as Windows file paths are case-insensitive. +// CompareStringOrdinal compares two Unicode strings for +// binary equivalence and allows for case insensitivity. +#ifdef _WIN32 + SmallVector WPath1, WPath2; + failIfError(sys::path::widenPath(normalizePath(Path1), WPath1)); + failIfError(sys::path::widenPath(normalizePath(Path2), WPath2)); + + return CompareStringOrdinal(WPath1.data(), WPath1.size(), WPath2.data(), + WPath2.size(), true) == CSTR_EQUAL; +#else + return normalizePath(Path1) == normalizePath(Path2); +#endif } // Implement the 'x' operation. 
This function extracts files back to the file @@ -489,7 +533,7 @@ static void doExtract(StringRef Name, const object::Archive::Child &C) { int FD; failIfError(sys::fs::openFileForWrite(sys::path::filename(Name), FD, sys::fs::CD_CreateAlways, - sys::fs::F_None, Mode), + sys::fs::OF_None, Mode), Name); { @@ -551,7 +595,7 @@ static void performReadOperation(ArchiveOperation Operation, if (Filter) { auto I = find_if(Members, [Name](StringRef Path) { - return Name == normalizePath(Path); + return comparePaths(Name, Path); }); if (I == Members.end()) continue; @@ -588,7 +632,7 @@ static void addChildMember(std::vector &Members, const object::Archive::Child &M, bool FlattenArchive = false) { if (Thin && !M.getParent()->isThin()) - fail("Cannot convert a regular archive to a thin one"); + fail("cannot convert a regular archive to a thin one"); Expected NMOrErr = NewArchiveMember::getOldMember(M, Deterministic); failIfError(NMOrErr.takeError()); @@ -681,7 +725,7 @@ static InsertAction computeInsertAction(ArchiveOperation Operation, if (Operation == QuickAppend || Members.empty()) return IA_AddOldMember; auto MI = find_if( - Members, [Name](StringRef Path) { return Name == normalizePath(Path); }); + Members, [Name](StringRef Path) { return comparePaths(Name, Path); }); if (MI == Members.end()) return IA_AddOldMember; @@ -698,9 +742,8 @@ static InsertAction computeInsertAction(ArchiveOperation Operation, return IA_MoveOldMember; if (Operation == ReplaceOrInsert) { - StringRef PosName = normalizePath(RelPos); if (!OnlyUpdate) { - if (PosName.empty()) + if (RelPos.empty()) return IA_AddNewMember; return IA_MoveNewMember; } @@ -712,12 +755,12 @@ static InsertAction computeInsertAction(ArchiveOperation Operation, auto ModTimeOrErr = Member.getLastModified(); failIfError(ModTimeOrErr.takeError()); if (Status.getLastModificationTime() < ModTimeOrErr.get()) { - if (PosName.empty()) + if (RelPos.empty()) return IA_AddOldMember; return IA_MoveOldMember; } - if (PosName.empty()) + if (RelPos.empty()) return IA_AddNewMember; return IA_MoveNewMember; } @@ -732,7 +775,6 @@ computeNewArchiveMembers(ArchiveOperation Operation, std::vector Ret; std::vector Moved; int InsertPos = -1; - StringRef PosName = normalizePath(RelPos); if (OldArchive) { Error Err = Error::success(); StringMap MemberCount; @@ -740,8 +782,8 @@ computeNewArchiveMembers(ArchiveOperation Operation, int Pos = Ret.size(); Expected NameOrErr = Child.getName(); failIfError(NameOrErr.takeError()); - StringRef Name = NameOrErr.get(); - if (Name == PosName) { + std::string Name = NameOrErr.get(); + if (comparePaths(Name, RelPos)) { assert(AddAfter || AddBefore); if (AddBefore) InsertPos = Pos; @@ -783,7 +825,7 @@ computeNewArchiveMembers(ArchiveOperation Operation, return Ret; if (!RelPos.empty() && InsertPos == -1) - fail("Insertion point not found"); + fail("insertion point not found"); if (RelPos.empty()) InsertPos = Ret.size(); @@ -859,12 +901,12 @@ static void performWriteOperation(ArchiveOperation Operation, break; case BSD: if (Thin) - fail("Only the gnu format has a thin mode"); + fail("only the gnu format has a thin mode"); Kind = object::Archive::K_BSD; break; case DARWIN: if (Thin) - fail("Only the gnu format has a thin mode"); + fail("only the gnu format has a thin mode"); Kind = object::Archive::K_DARWIN; break; case Unknown: @@ -922,14 +964,12 @@ static int performOperation(ArchiveOperation Operation, MemoryBuffer::getFile(ArchiveName, -1, false); std::error_code EC = Buf.getError(); if (EC && EC != errc::no_such_file_or_directory) - 
fail("error opening '" + ArchiveName + "': " + EC.message() + "!"); + fail("error opening '" + ArchiveName + "': " + EC.message()); if (!EC) { Error Err = Error::success(); object::Archive Archive(Buf.get()->getMemBufferRef(), Err); - EC = errorToErrorCode(std::move(Err)); - failIfError(EC, - "error loading '" + ArchiveName + "': " + EC.message() + "!"); + failIfError(std::move(Err), "unable to load '" + ArchiveName + "'"); if (Archive.isThin()) CompareFullPath = true; performOperation(Operation, &Archive, std::move(Buf.get()), NewMembers); @@ -960,8 +1000,10 @@ static void runMRIScript() { const MemoryBuffer &Ref = *Buf.get(); bool Saved = false; std::vector NewMembers; + ParsingMRIScript = true; for (line_iterator I(Ref, /*SkipBlanks*/ false), E; I != E; ++I) { + ++MRILineNumber; StringRef Line = *I; Line = Line.split(';').first; Line = Line.split('*').first; @@ -1003,15 +1045,15 @@ static void runMRIScript() { case MRICommand::Create: Create = true; if (!ArchiveName.empty()) - fail("Editing multiple archives not supported"); + fail("editing multiple archives not supported"); if (Saved) - fail("File already saved"); + fail("file already saved"); ArchiveName = Rest; break; case MRICommand::Delete: { - StringRef Name = normalizePath(Rest); - llvm::erase_if(NewMembers, - [=](NewArchiveMember &M) { return M.MemberName == Name; }); + llvm::erase_if(NewMembers, [=](NewArchiveMember &M) { + return comparePaths(M.MemberName, Rest); + }); break; } case MRICommand::Save: @@ -1020,10 +1062,12 @@ static void runMRIScript() { case MRICommand::End: break; case MRICommand::Invalid: - fail("Unknown command: " + CommandStr); + fail("unknown command: " + CommandStr); } } - + + ParsingMRIScript = false; + // Nothing to do if not saved. if (Saved) performOperation(ReplaceOrInsert, &NewMembers); @@ -1108,7 +1152,7 @@ static int ranlib_main(int argc, char **argv) { return 0; } else { if (ArchiveSpecified) - fail("Exactly one archive should be specified"); + fail("exactly one archive should be specified"); ArchiveSpecified = true; ArchiveName = argv[i]; } @@ -1136,5 +1180,5 @@ int main(int argc, char **argv) { if (Stem.contains_lower("ar")) return ar_main(argc, argv); - fail("Not ranlib, ar, lib or dlltool!"); + fail("not ranlib, ar, lib or dlltool"); } diff --git a/tools/llvm-as/llvm-as.cpp b/tools/llvm-as/llvm-as.cpp index 234fef907a38..c9f50e38fc61 100644 --- a/tools/llvm-as/llvm-as.cpp +++ b/tools/llvm-as/llvm-as.cpp @@ -82,7 +82,7 @@ static void WriteOutputFile(const Module *M, const ModuleSummaryIndex *Index) { std::error_code EC; std::unique_ptr Out( - new ToolOutputFile(OutputFilename, EC, sys::fs::F_None)); + new ToolOutputFile(OutputFilename, EC, sys::fs::OF_None)); if (EC) { errs() << EC.message() << '\n'; exit(1); diff --git a/tools/llvm-cov/CodeCoverage.cpp b/tools/llvm-cov/CodeCoverage.cpp index f707e3c7ab53..7151cfb032f3 100644 --- a/tools/llvm-cov/CodeCoverage.cpp +++ b/tools/llvm-cov/CodeCoverage.cpp @@ -712,15 +712,15 @@ int CodeCoverageTool::run(Command Cmd, int argc, const char **argv) { // Create the function filters if (!NameFilters.empty() || NameWhitelist || !NameRegexFilters.empty()) { - auto NameFilterer = llvm::make_unique(); + auto NameFilterer = std::make_unique(); for (const auto &Name : NameFilters) - NameFilterer->push_back(llvm::make_unique(Name)); + NameFilterer->push_back(std::make_unique(Name)); if (NameWhitelist) NameFilterer->push_back( - llvm::make_unique(*NameWhitelist)); + std::make_unique(*NameWhitelist)); for (const auto &Regex : NameRegexFilters) 
NameFilterer->push_back( - llvm::make_unique(Regex)); + std::make_unique(Regex)); Filters.push_back(std::move(NameFilterer)); } @@ -728,18 +728,18 @@ int CodeCoverageTool::run(Command Cmd, int argc, const char **argv) { RegionCoverageGtFilter.getNumOccurrences() || LineCoverageLtFilter.getNumOccurrences() || LineCoverageGtFilter.getNumOccurrences()) { - auto StatFilterer = llvm::make_unique(); + auto StatFilterer = std::make_unique(); if (RegionCoverageLtFilter.getNumOccurrences()) - StatFilterer->push_back(llvm::make_unique( + StatFilterer->push_back(std::make_unique( RegionCoverageFilter::LessThan, RegionCoverageLtFilter)); if (RegionCoverageGtFilter.getNumOccurrences()) - StatFilterer->push_back(llvm::make_unique( + StatFilterer->push_back(std::make_unique( RegionCoverageFilter::GreaterThan, RegionCoverageGtFilter)); if (LineCoverageLtFilter.getNumOccurrences()) - StatFilterer->push_back(llvm::make_unique( + StatFilterer->push_back(std::make_unique( LineCoverageFilter::LessThan, LineCoverageLtFilter)); if (LineCoverageGtFilter.getNumOccurrences()) - StatFilterer->push_back(llvm::make_unique( + StatFilterer->push_back(std::make_unique( RegionCoverageFilter::GreaterThan, LineCoverageGtFilter)); Filters.push_back(std::move(StatFilterer)); } @@ -747,7 +747,7 @@ int CodeCoverageTool::run(Command Cmd, int argc, const char **argv) { // Create the ignore filename filters. for (const auto &RE : IgnoreFilenameRegexFilters) IgnoreFilenameFilters.push_back( - llvm::make_unique(RE)); + std::make_unique(RE)); if (!Arches.empty()) { for (const std::string &Arch : Arches) { @@ -1040,7 +1040,7 @@ int CodeCoverageTool::doExport(int argc, const char **argv, switch (ViewOpts.Format) { case CoverageViewOptions::OutputFormat::Text: - Exporter = llvm::make_unique(*Coverage.get(), + Exporter = std::make_unique(*Coverage.get(), ViewOpts, outs()); break; case CoverageViewOptions::OutputFormat::HTML: @@ -1048,7 +1048,7 @@ int CodeCoverageTool::doExport(int argc, const char **argv, // above. llvm_unreachable("Export in HTML is not supported!"); case CoverageViewOptions::OutputFormat::Lcov: - Exporter = llvm::make_unique(*Coverage.get(), + Exporter = std::make_unique(*Coverage.get(), ViewOpts, outs()); break; } diff --git a/tools/llvm-cov/SourceCoverageView.cpp b/tools/llvm-cov/SourceCoverageView.cpp index 616f667e2c84..0e20ea63cd6f 100644 --- a/tools/llvm-cov/SourceCoverageView.cpp +++ b/tools/llvm-cov/SourceCoverageView.cpp @@ -76,9 +76,9 @@ std::unique_ptr CoveragePrinter::create(const CoverageViewOptions &Opts) { switch (Opts.Format) { case CoverageViewOptions::OutputFormat::Text: - return llvm::make_unique(Opts); + return std::make_unique(Opts); case CoverageViewOptions::OutputFormat::HTML: - return llvm::make_unique(Opts); + return std::make_unique(Opts); case CoverageViewOptions::OutputFormat::Lcov: // Unreachable because CodeCoverage.cpp should terminate with an error // before we get here. 
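The CoveragePrinter::create and SourceCoverageView::create hunks above follow a shape that recurs throughout this patch: a factory keyed on an output-format enum that returns an owning std::unique_ptr, now built with the C++14 std::make_unique instead of the llvm::make_unique helper. A minimal standalone sketch of that shape, using hypothetical Printer/TextPrinter/HtmlPrinter names rather than the real llvm-cov classes:

  #include <iostream>
  #include <memory>

  // Hypothetical stand-ins for the printer hierarchy in the hunks above.
  struct Printer {
    virtual ~Printer() = default;
    virtual void print() = 0;
  };
  struct TextPrinter : Printer { void print() override { std::cout << "text\n"; } };
  struct HtmlPrinter : Printer { void print() override { std::cout << "html\n"; } };

  enum class OutputFormat { Text, HTML };

  // Factory keyed on the format enum; std::make_unique (C++14) is used
  // directly, as in the migrated call sites above.
  std::unique_ptr<Printer> createPrinter(OutputFormat F) {
    switch (F) {
    case OutputFormat::Text:
      return std::make_unique<TextPrinter>();
    case OutputFormat::HTML:
      return std::make_unique<HtmlPrinter>();
    }
    return nullptr; // not reached for valid enum values
  }

  int main() { createPrinter(OutputFormat::Text)->print(); }

The only difference at the call sites above is the namespace of make_unique; the ownership semantics of the returned pointer are unchanged.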
@@ -141,10 +141,10 @@ SourceCoverageView::create(StringRef SourceName, const MemoryBuffer &File, CoverageData &&CoverageInfo) { switch (Options.Format) { case CoverageViewOptions::OutputFormat::Text: - return llvm::make_unique( + return std::make_unique( SourceName, File, Options, std::move(CoverageInfo)); case CoverageViewOptions::OutputFormat::HTML: - return llvm::make_unique( + return std::make_unique( SourceName, File, Options, std::move(CoverageInfo)); case CoverageViewOptions::OutputFormat::Lcov: // Unreachable because CodeCoverage.cpp should terminate with an error diff --git a/tools/llvm-cov/TestingSupport.cpp b/tools/llvm-cov/TestingSupport.cpp index 3ee318c9c640..b99bd83157d0 100644 --- a/tools/llvm-cov/TestingSupport.cpp +++ b/tools/llvm-cov/TestingSupport.cpp @@ -8,6 +8,7 @@ #include "llvm/Object/ObjectFile.h" #include "llvm/ProfileData/InstrProf.h" +#include "llvm/Support/Alignment.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/LEB128.h" #include "llvm/Support/raw_ostream.h" @@ -50,8 +51,13 @@ int convertForTestingMain(int argc, const char *argv[]) { auto ObjFormat = OF->getTripleObjectFormat(); for (const auto &Section : OF->sections()) { StringRef Name; - if (Section.getName(Name)) + if (Expected NameOrErr = Section.getName()) { + Name = *NameOrErr; + } else { + consumeError(NameOrErr.takeError()); return 1; + } + if (Name == llvm::getInstrProfSectionName(IPSK_name, ObjFormat, /*AddSegmentInfo=*/false)) { ProfileNames = Section; @@ -94,7 +100,7 @@ int convertForTestingMain(int argc, const char *argv[]) { encodeULEB128(ProfileNamesAddress, OS); OS << ProfileNamesData; // Coverage mapping data is expected to have an alignment of 8. - for (unsigned Pad = OffsetToAlignment(OS.tell(), 8); Pad; --Pad) + for (unsigned Pad = offsetToAlignment(OS.tell(), Align(8)); Pad; --Pad) OS.write(uint8_t(0)); OS << CoverageMappingData; diff --git a/tools/llvm-cxxdump/llvm-cxxdump.cpp b/tools/llvm-cxxdump/llvm-cxxdump.cpp index 833312655788..03e1bab9417e 100644 --- a/tools/llvm-cxxdump/llvm-cxxdump.cpp +++ b/tools/llvm-cxxdump/llvm-cxxdump.cpp @@ -174,7 +174,11 @@ static void dumpCXXData(const ObjectFile *Obj) { SectionRelocMap.clear(); for (const SectionRef &Section : Obj->sections()) { - section_iterator Sec2 = Section.getRelocatedSection(); + Expected ErrOrSec = Section.getRelocatedSection(); + if (!ErrOrSec) + error(ErrOrSec.takeError()); + + section_iterator Sec2 = *ErrOrSec; if (Sec2 != Obj->section_end()) SectionRelocMap[*Sec2].push_back(Section); } diff --git a/tools/llvm-cxxmap/llvm-cxxmap.cpp b/tools/llvm-cxxmap/llvm-cxxmap.cpp index 87d4d06bbc96..b53a6364c89e 100644 --- a/tools/llvm-cxxmap/llvm-cxxmap.cpp +++ b/tools/llvm-cxxmap/llvm-cxxmap.cpp @@ -145,7 +145,7 @@ int main(int argc, const char *argv[]) { exitWithErrorCode(RemappingBufOrError.getError(), RemappingFile); std::error_code EC; - raw_fd_ostream OS(OutputFilename.data(), EC, sys::fs::F_Text); + raw_fd_ostream OS(OutputFilename.data(), EC, sys::fs::OF_Text); if (EC) exitWithErrorCode(EC, OutputFilename); diff --git a/tools/llvm-dis/llvm-dis.cpp b/tools/llvm-dis/llvm-dis.cpp index 3f337b874b16..d66299cbf767 100644 --- a/tools/llvm-dis/llvm-dis.cpp +++ b/tools/llvm-dis/llvm-dis.cpp @@ -153,7 +153,7 @@ int main(int argc, char **argv) { LLVMContext Context; Context.setDiagnosticHandler( - llvm::make_unique(argv[0])); + std::make_unique(argv[0])); cl::ParseCommandLineOptions(argc, argv, "llvm .bc -> .ll disassembler\n"); std::unique_ptr MB = @@ -186,7 +186,7 @@ int main(int argc, char **argv) { std::error_code EC; 
std::unique_ptr Out( - new ToolOutputFile(OutputFilename, EC, sys::fs::F_None)); + new ToolOutputFile(OutputFilename, EC, sys::fs::OF_Text)); if (EC) { errs() << EC.message() << '\n'; return 1; diff --git a/tools/llvm-dwarfdump/Statistics.cpp b/tools/llvm-dwarfdump/Statistics.cpp index f26369b935cb..c29ad783a9e6 100644 --- a/tools/llvm-dwarfdump/Statistics.cpp +++ b/tools/llvm-dwarfdump/Statistics.cpp @@ -5,11 +5,18 @@ #include "llvm/DebugInfo/DWARF/DWARFContext.h" #include "llvm/DebugInfo/DWARF/DWARFDebugLoc.h" #include "llvm/Object/ObjectFile.h" +#include "llvm/Support/JSON.h" #define DEBUG_TYPE "dwarfdump" using namespace llvm; using namespace object; +/// This represents the number of categories of debug location coverage being +/// calculated. The first category is the number of variables with 0% location +/// coverage, but the last category is the number of variables with 100% +/// location coverage. +constexpr int NumOfCoverageCategories = 12; + /// Holds statistics for one function (or other entity that has a PC range and /// contains variables, such as a compile unit). struct PerFunctionStats { @@ -43,9 +50,9 @@ struct PerFunctionStats { unsigned NumVars = 0; /// Number of variables with source location. unsigned NumVarSourceLocations = 0; - /// Number of variables wtih type. + /// Number of variables with type. unsigned NumVarTypes = 0; - /// Number of variables wtih DW_AT_location. + /// Number of variables with DW_AT_location. unsigned NumVarLocations = 0; }; @@ -56,16 +63,74 @@ struct GlobalStats { /// Total number of PC range bytes in each variable's enclosing scope, /// starting from the first definition of the variable. unsigned ScopeBytesFromFirstDefinition = 0; - /// Total number of call site entries (DW_TAG_call_site) or - /// (DW_AT_call_file & DW_AT_call_line). + /// Total number of PC range bytes covered by DW_AT_locations with + /// the debug entry values (DW_OP_entry_value). + unsigned ScopeEntryValueBytesCovered = 0; + /// Total number of PC range bytes covered by DW_AT_locations of + /// formal parameters. + unsigned ParamScopeBytesCovered = 0; + /// Total number of PC range bytes in each variable's enclosing scope, + /// starting from the first definition of the variable (only for parameters). + unsigned ParamScopeBytesFromFirstDefinition = 0; + /// Total number of PC range bytes covered by DW_AT_locations with + /// the debug entry values (DW_OP_entry_value) (only for parameters). + unsigned ParamScopeEntryValueBytesCovered = 0; + /// Total number of PC range bytes covered by DW_AT_locations (only for local + /// variables). + unsigned VarScopeBytesCovered = 0; + /// Total number of PC range bytes in each variable's enclosing scope, + /// starting from the first definition of the variable (only for local + /// variables). + unsigned VarScopeBytesFromFirstDefinition = 0; + /// Total number of PC range bytes covered by DW_AT_locations with + /// the debug entry values (DW_OP_entry_value) (only for local variables). + unsigned VarScopeEntryValueBytesCovered = 0; + /// Total number of call site entries (DW_AT_call_file & DW_AT_call_line). unsigned CallSiteEntries = 0; + /// Total number of call site DIEs (DW_TAG_call_site). + unsigned CallSiteDIEs = 0; + /// Total number of call site parameter DIEs (DW_TAG_call_site_parameter). + unsigned CallSiteParamDIEs = 0; /// Total byte size of concrete functions. This byte size includes /// inline functions contained in the concrete functions. 
- uint64_t FunctionSize = 0; + unsigned FunctionSize = 0; /// Total byte size of inlined functions. This is the total number of bytes /// for the top inline functions within concrete functions. This can help /// tune the inline settings when compiling to match user expectations. - uint64_t InlineFunctionSize = 0; + unsigned InlineFunctionSize = 0; +}; + +/// Holds accumulated debug location statistics about local variables and +/// formal parameters. +struct LocationStats { + /// Map the scope coverage decile to the number of variables in the decile. + /// The first element of the array (at the index zero) represents the number + /// of variables with the no debug location at all, but the last element + /// in the vector represents the number of fully covered variables within + /// its scope. + std::vector VarParamLocStats{ + std::vector(NumOfCoverageCategories, 0)}; + /// Map non debug entry values coverage. + std::vector VarParamNonEntryValLocStats{ + std::vector(NumOfCoverageCategories, 0)}; + /// The debug location statistics for formal parameters. + std::vector ParamLocStats{ + std::vector(NumOfCoverageCategories, 0)}; + /// Map non debug entry values coverage for formal parameters. + std::vector ParamNonEntryValLocStats{ + std::vector(NumOfCoverageCategories, 0)}; + /// The debug location statistics for local variables. + std::vector VarLocStats{ + std::vector(NumOfCoverageCategories, 0)}; + /// Map non debug entry values coverage for local variables. + std::vector VarNonEntryValLocStats{ + std::vector(NumOfCoverageCategories, 0)}; + /// Total number of local variables and function parameters processed. + unsigned NumVarParam = 0; + /// Total number of formal parameters processed. + unsigned NumParam = 0; + /// Total number of local variables processed. + unsigned NumVar = 0; }; /// Extract the low pc from a Die. @@ -81,27 +146,66 @@ static uint64_t getLowPC(DWARFDie Die) { return dwarf::toAddress(Die.find(dwarf::DW_AT_low_pc), 0); } +/// Collect debug location statistics for one DIE. +static void collectLocStats(uint64_t BytesCovered, uint64_t BytesInScope, + std::vector &VarParamLocStats, + std::vector &ParamLocStats, + std::vector &VarLocStats, bool IsParam, + bool IsLocalVar) { + auto getCoverageBucket = [BytesCovered, BytesInScope]() -> unsigned { + unsigned LocBucket = 100 * (double)BytesCovered / BytesInScope; + if (LocBucket == 0) { + // No debug location at all for the variable. + return 0; + } else if (LocBucket == 100 || BytesCovered > BytesInScope) { + // Fully covered variable within its scope. + return NumOfCoverageCategories - 1; + } else { + // Get covered range (e.g. 20%-29%). + LocBucket /= 10; + return LocBucket + 1; + } + }; + + unsigned CoverageBucket = getCoverageBucket(); + VarParamLocStats[CoverageBucket]++; + if (IsParam) + ParamLocStats[CoverageBucket]++; + else if (IsLocalVar) + VarLocStats[CoverageBucket]++; +} + /// Collect debug info quality metrics for one DIE. 
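The LocationStats fields and the getCoverageBucket lambda above implement a fixed bucketing of debug-location coverage: twelve categories, where the first bucket holds variables whose coverage rounds down to 0%, the last holds variables that are fully covered (or whose location ranges extend past the scope), and the buckets in between hold 1-9%, 10-19%, ..., 90-99%. A standalone restatement of that arithmetic with a few worked cases; the function name coverageBucket is illustrative, not from the patch:

  #include <cassert>
  #include <cstdint>

  // Mirrors the getCoverageBucket lambda above. Callers in the patch only
  // invoke it when BytesInScope is non-zero, so no division-by-zero guard.
  constexpr int NumOfCoverageCategories = 12;

  unsigned coverageBucket(uint64_t BytesCovered, uint64_t BytesInScope) {
    unsigned Pct = 100 * (double)BytesCovered / BytesInScope;
    if (Pct == 0)
      return 0;                            // coverage rounds down to 0%
    if (Pct == 100 || BytesCovered > BytesInScope)
      return NumOfCoverageCategories - 1;  // fully covered, or over-covered
    return Pct / 10 + 1;                   // e.g. 20-29% lands in bucket 3
  }

  int main() {
    assert(coverageBucket(0, 64) == 0);
    assert(coverageBucket(5, 100) == 1);   // 1-9%
    assert(coverageBucket(25, 100) == 3);  // 20-29%
    assert(coverageBucket(80, 64) == 11);  // range extends past the scope
    return 0;
  }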
-static void collectStatsForDie(DWARFDie Die, std::string FnPrefix, +static void collectStatsForDie(DWARFDie Die, uint64_t UnitLowPC, std::string FnPrefix, std::string VarPrefix, uint64_t ScopeLowPC, uint64_t BytesInScope, uint32_t InlineDepth, StringMap &FnStatMap, - GlobalStats &GlobalStats) { + GlobalStats &GlobalStats, + LocationStats &LocStats) { bool HasLoc = false; bool HasSrcLoc = false; bool HasType = false; bool IsArtificial = false; uint64_t BytesCovered = 0; + uint64_t BytesEntryValuesCovered = 0; uint64_t OffsetToFirstDefinition = 0; + auto &FnStats = FnStatMap[FnPrefix]; + bool IsParam = Die.getTag() == dwarf::DW_TAG_formal_parameter; + bool IsLocalVar = Die.getTag() == dwarf::DW_TAG_variable; + + if (Die.getTag() == dwarf::DW_TAG_call_site || + Die.getTag() == dwarf::DW_TAG_GNU_call_site) { + GlobalStats.CallSiteDIEs++; + return; + } - if (Die.getTag() == dwarf::DW_TAG_call_site) { - GlobalStats.CallSiteEntries++; + if (Die.getTag() == dwarf::DW_TAG_call_site_parameter || + Die.getTag() == dwarf::DW_TAG_GNU_call_site_parameter) { + GlobalStats.CallSiteParamDIEs++; return; } - if (Die.getTag() != dwarf::DW_TAG_formal_parameter && - Die.getTag() != dwarf::DW_TAG_variable && - Die.getTag() != dwarf::DW_TAG_member) { + if (!IsParam && !IsLocalVar && Die.getTag() != dwarf::DW_TAG_member) { // Not a variable or constant member. return; } @@ -116,6 +220,19 @@ static void collectStatsForDie(DWARFDie Die, std::string FnPrefix, if (Die.find(dwarf::DW_AT_artificial)) IsArtificial = true; + auto IsEntryValue = [&](ArrayRef D) -> bool { + DWARFUnit *U = Die.getDwarfUnit(); + DataExtractor Data(toStringRef(D), + Die.getDwarfUnit()->getContext().isLittleEndian(), 0); + DWARFExpression Expression(Data, U->getVersion(), U->getAddressByteSize()); + // Consider the expression containing the DW_OP_entry_value as + // an entry value. + return llvm::any_of(Expression, [](DWARFExpression::Operation &Op) { + return Op.getCode() == dwarf::DW_OP_entry_value || + Op.getCode() == dwarf::DW_OP_GNU_entry_value; + }); + }; + if (Die.find(dwarf::DW_AT_const_value)) { // This catches constant members *and* variables. HasLoc = true; @@ -133,11 +250,15 @@ static void collectStatsForDie(DWARFDie Die, std::string FnPrefix, if (auto DebugLocOffset = FormValue->getAsSectionOffset()) { auto *DebugLoc = Die.getDwarfUnit()->getContext().getDebugLoc(); if (auto List = DebugLoc->getLocationListAtOffset(*DebugLocOffset)) { - for (auto Entry : List->Entries) - BytesCovered += Entry.End - Entry.Begin; + for (auto Entry : List->Entries) { + uint64_t BytesEntryCovered = Entry.End - Entry.Begin; + BytesCovered += BytesEntryCovered; + if (IsEntryValue(Entry.Loc)) + BytesEntryValuesCovered += BytesEntryCovered; + } if (List->Entries.size()) { uint64_t FirstDef = List->Entries[0].Begin; - uint64_t UnitOfs = getLowPC(Die.getDwarfUnit()->getUnitDIE()); + uint64_t UnitOfs = UnitLowPC; // Ranges sometimes start before the lexical scope. if (UnitOfs + FirstDef >= ScopeLowPC) OffsetToFirstDefinition = UnitOfs + FirstDef - ScopeLowPC; @@ -154,8 +275,25 @@ static void collectStatsForDie(DWARFDie Die, std::string FnPrefix, } } + // Calculate the debug location statistics. + if (BytesInScope) { + LocStats.NumVarParam++; + if (IsParam) + LocStats.NumParam++; + else if (IsLocalVar) + LocStats.NumVar++; + + collectLocStats(BytesCovered, BytesInScope, LocStats.VarParamLocStats, + LocStats.ParamLocStats, LocStats.VarLocStats, IsParam, + IsLocalVar); + // Non debug entry values coverage statistics. 
+ collectLocStats(BytesCovered - BytesEntryValuesCovered, BytesInScope, + LocStats.VarParamNonEntryValLocStats, + LocStats.ParamNonEntryValLocStats, + LocStats.VarNonEntryValLocStats, IsParam, IsLocalVar); + } + // Collect PC range coverage data. - auto &FnStats = FnStatMap[FnPrefix]; if (DWARFDie D = Die.getAttributeValueAsReferencedDie(dwarf::DW_AT_abstract_origin)) Die = D; @@ -171,6 +309,17 @@ static void collectStatsForDie(DWARFDie Die, std::string FnPrefix, // Turns out we have a lot of ranges that extend past the lexical scope. GlobalStats.ScopeBytesCovered += std::min(BytesInScope, BytesCovered); GlobalStats.ScopeBytesFromFirstDefinition += BytesInScope; + GlobalStats.ScopeEntryValueBytesCovered += BytesEntryValuesCovered; + if (IsParam) { + GlobalStats.ParamScopeBytesCovered += + std::min(BytesInScope, BytesCovered); + GlobalStats.ParamScopeBytesFromFirstDefinition += BytesInScope; + GlobalStats.ParamScopeEntryValueBytesCovered += BytesEntryValuesCovered; + } else if (IsLocalVar) { + GlobalStats.VarScopeBytesCovered += std::min(BytesInScope, BytesCovered); + GlobalStats.VarScopeBytesFromFirstDefinition += BytesInScope; + GlobalStats.VarScopeEntryValueBytesCovered += BytesEntryValuesCovered; + } assert(GlobalStats.ScopeBytesCovered <= GlobalStats.ScopeBytesFromFirstDefinition); } else if (Die.getTag() == dwarf::DW_TAG_member) { @@ -179,7 +328,7 @@ static void collectStatsForDie(DWARFDie Die, std::string FnPrefix, FnStats.TotalVarWithLoc += (unsigned)HasLoc; } if (!IsArtificial) { - if (Die.getTag() == dwarf::DW_TAG_formal_parameter) { + if (IsParam) { FnStats.NumParams++; if (HasType) FnStats.NumParamTypes++; @@ -187,7 +336,7 @@ static void collectStatsForDie(DWARFDie Die, std::string FnPrefix, FnStats.NumParamSourceLocations++; if (HasLoc) FnStats.NumParamLocations++; - } else if (Die.getTag() == dwarf::DW_TAG_variable) { + } else if (IsLocalVar) { FnStats.NumVars++; if (HasType) FnStats.NumVarTypes++; @@ -200,11 +349,12 @@ static void collectStatsForDie(DWARFDie Die, std::string FnPrefix, } /// Recursively collect debug info quality metrics. -static void collectStatsRecursive(DWARFDie Die, std::string FnPrefix, +static void collectStatsRecursive(DWARFDie Die, uint64_t UnitLowPC, std::string FnPrefix, std::string VarPrefix, uint64_t ScopeLowPC, uint64_t BytesInScope, uint32_t InlineDepth, StringMap &FnStatMap, - GlobalStats &GlobalStats) { + GlobalStats &GlobalStats, + LocationStats &LocStats) { // Handle any kind of lexical scope. const dwarf::Tag Tag = Die.getTag(); const bool IsFunction = Tag == dwarf::DW_TAG_subprogram; @@ -272,8 +422,8 @@ static void collectStatsRecursive(DWARFDie Die, std::string FnPrefix, } } else { // Not a scope, visit the Die itself. It could be a variable. 
- collectStatsForDie(Die, FnPrefix, VarPrefix, ScopeLowPC, BytesInScope, - InlineDepth, FnStatMap, GlobalStats); + collectStatsForDie(Die, UnitLowPC, FnPrefix, VarPrefix, ScopeLowPC, BytesInScope, + InlineDepth, FnStatMap, GlobalStats, LocStats); } // Set InlineDepth correctly for child recursion @@ -290,8 +440,9 @@ static void collectStatsRecursive(DWARFDie Die, std::string FnPrefix, if (Child.getTag() == dwarf::DW_TAG_lexical_block) ChildVarPrefix += toHex(LexicalBlockIndex++) + '.'; - collectStatsRecursive(Child, FnPrefix, ChildVarPrefix, ScopeLowPC, - BytesInScope, InlineDepth, FnStatMap, GlobalStats); + collectStatsRecursive(Child, UnitLowPC, FnPrefix, ChildVarPrefix, ScopeLowPC, + BytesInScope, InlineDepth, FnStatMap, GlobalStats, + LocStats); Child = Child.getSibling(); } } @@ -299,14 +450,33 @@ static void collectStatsRecursive(DWARFDie Die, std::string FnPrefix, /// Print machine-readable output. /// The machine-readable format is single-line JSON output. /// \{ -static void printDatum(raw_ostream &OS, const char *Key, StringRef Value) { - OS << ",\"" << Key << "\":\"" << Value << '"'; - LLVM_DEBUG(llvm::dbgs() << Key << ": " << Value << '\n'); -} -static void printDatum(raw_ostream &OS, const char *Key, uint64_t Value) { +static void printDatum(raw_ostream &OS, const char *Key, json::Value Value) { OS << ",\"" << Key << "\":" << Value; LLVM_DEBUG(llvm::dbgs() << Key << ": " << Value << '\n'); } +static void printLocationStats(raw_ostream &OS, + const char *Key, + std::vector &LocationStats) { + OS << ",\"" << Key << " with 0% of its scope covered\":" + << LocationStats[0]; + LLVM_DEBUG(llvm::dbgs() << Key << " with 0% of its scope covered: " + << LocationStats[0] << '\n'); + OS << ",\"" << Key << " with 1-9% of its scope covered\":" + << LocationStats[1]; + LLVM_DEBUG(llvm::dbgs() << Key << " with 1-9% of its scope covered: " + << LocationStats[1] << '\n'); + for (unsigned i = 2; i < NumOfCoverageCategories - 1; ++i) { + OS << ",\"" << Key << " with " << (i - 1) * 10 << "-" << i * 10 - 1 + << "% of its scope covered\":" << LocationStats[i]; + LLVM_DEBUG(llvm::dbgs() + << Key << " with " << (i - 1) * 10 << "-" << i * 10 - 1 + << "% of its scope covered: " << LocationStats[i]); + } + OS << ",\"" << Key << " with 100% of its scope covered\":" + << LocationStats[NumOfCoverageCategories - 1]; + LLVM_DEBUG(llvm::dbgs() << Key << " with 100% of its scope covered: " + << LocationStats[NumOfCoverageCategories - 1]); +} /// \} /// Collect debug info quality metrics for an entire DIContext. @@ -321,10 +491,12 @@ bool collectStatsForObjectFile(ObjectFile &Obj, DWARFContext &DICtx, Twine Filename, raw_ostream &OS) { StringRef FormatName = Obj.getFileFormatName(); GlobalStats GlobalStats; + LocationStats LocStats; StringMap Statistics; for (const auto &CU : static_cast(&DICtx)->compile_units()) if (DWARFDie CUDie = CU->getNonSkeletonUnitDIE(false)) - collectStatsRecursive(CUDie, "/", "g", 0, 0, 0, Statistics, GlobalStats); + collectStatsRecursive(CUDie, getLowPC(CUDie), "/", "g", 0, 0, 0, + Statistics, GlobalStats, LocStats); /// The version number should be increased every time the algorithm is changed /// (including bug fixes). 
New metrics may be added without increasing the @@ -387,9 +559,24 @@ bool collectStatsForObjectFile(ObjectFile &Obj, DWARFContext &DICtx, printDatum(OS, "source variables", VarParamTotal); printDatum(OS, "variables with location", VarParamWithLoc); printDatum(OS, "call site entries", GlobalStats.CallSiteEntries); + printDatum(OS, "call site DIEs", GlobalStats.CallSiteDIEs); + printDatum(OS, "call site parameter DIEs", GlobalStats.CallSiteParamDIEs); printDatum(OS, "scope bytes total", GlobalStats.ScopeBytesFromFirstDefinition); printDatum(OS, "scope bytes covered", GlobalStats.ScopeBytesCovered); + printDatum(OS, "entry value scope bytes covered", + GlobalStats.ScopeEntryValueBytesCovered); + printDatum(OS, "formal params scope bytes total", + GlobalStats.ParamScopeBytesFromFirstDefinition); + printDatum(OS, "formal params scope bytes covered", + GlobalStats.ParamScopeBytesCovered); + printDatum(OS, "formal params entry value scope bytes covered", + GlobalStats.ParamScopeEntryValueBytesCovered); + printDatum(OS, "vars scope bytes total", + GlobalStats.VarScopeBytesFromFirstDefinition); + printDatum(OS, "vars scope bytes covered", GlobalStats.VarScopeBytesCovered); + printDatum(OS, "vars entry value scope bytes covered", + GlobalStats.VarScopeEntryValueBytesCovered); printDatum(OS, "total function size", GlobalStats.FunctionSize); printDatum(OS, "total inlined function size", GlobalStats.InlineFunctionSize); printDatum(OS, "total formal params", ParamTotal); @@ -400,6 +587,20 @@ bool collectStatsForObjectFile(ObjectFile &Obj, DWARFContext &DICtx, printDatum(OS, "vars with source location", VarWithSrcLoc); printDatum(OS, "vars with type", VarWithType); printDatum(OS, "vars with binary location", VarWithLoc); + printDatum(OS, "total variables procesed by location statistics", + LocStats.NumVarParam); + printLocationStats(OS, "variables", LocStats.VarParamLocStats); + printLocationStats(OS, "variables (excluding the debug entry values)", + LocStats.VarParamNonEntryValLocStats); + printDatum(OS, "total params procesed by location statistics", + LocStats.NumParam); + printLocationStats(OS, "params", LocStats.ParamLocStats); + printLocationStats(OS, "params (excluding the debug entry values)", + LocStats.ParamNonEntryValLocStats); + printDatum(OS, "total vars procesed by location statistics", LocStats.NumVar); + printLocationStats(OS, "vars", LocStats.VarLocStats); + printLocationStats(OS, "vars (excluding the debug entry values)", + LocStats.VarNonEntryValLocStats); OS << "}\n"; LLVM_DEBUG( llvm::dbgs() << "Total Availability: " diff --git a/tools/llvm-dwarfdump/llvm-dwarfdump.cpp b/tools/llvm-dwarfdump/llvm-dwarfdump.cpp index 05a7aef67ece..e20f6041f98d 100644 --- a/tools/llvm-dwarfdump/llvm-dwarfdump.cpp +++ b/tools/llvm-dwarfdump/llvm-dwarfdump.cpp @@ -584,7 +584,7 @@ int main(int argc, char **argv) { } std::error_code EC; - ToolOutputFile OutputFile(OutputFilename, EC, sys::fs::OF_None); + ToolOutputFile OutputFile(OutputFilename, EC, sys::fs::OF_Text); error("Unable to open output file" + OutputFilename, EC); // Don't remove output file if we exit with an error. OutputFile.keep(); diff --git a/tools/llvm-extract/llvm-extract.cpp b/tools/llvm-extract/llvm-extract.cpp index 300bc0b4bd52..dddc0d9baa08 100644 --- a/tools/llvm-extract/llvm-extract.cpp +++ b/tools/llvm-extract/llvm-extract.cpp @@ -74,8 +74,18 @@ static cl::list // ExtractBlocks - The blocks to extract from the module. 
static cl::list ExtractBlocks( - "bb", cl::desc("Specify pairs to extract"), - cl::ZeroOrMore, cl::value_desc("function:bb"), cl::cat(ExtractCat)); + "bb", + cl::desc( + "Specify pairs to extract.\n" + "Each pair will create a function.\n" + "If multiple basic blocks are specified in one pair,\n" + "the first block in the sequence should dominate the rest.\n" + "eg:\n" + " --bb=f:bb1;bb2 will extract one function with both bb1 and bb2;\n" + " --bb=f:bb1 --bb=f:bb2 will extract two functions, one with bb1, one " + "with bb2."), + cl::ZeroOrMore, cl::value_desc("function:bb1[;bb2...]"), + cl::cat(ExtractCat)); // ExtractAlias - The alias to extract from the module. static cl::list @@ -350,7 +360,7 @@ int main(int argc, char **argv) { Passes.add(createStripDeadPrototypesPass()); // Remove dead func decls std::error_code EC; - ToolOutputFile Out(OutputFilename, EC, sys::fs::F_None); + ToolOutputFile Out(OutputFilename, EC, sys::fs::OF_None); if (EC) { errs() << EC.message() << '\n'; return 1; diff --git a/tools/llvm-ifs/CMakeLists.txt b/tools/llvm-ifs/CMakeLists.txt new file mode 100644 index 000000000000..544b0e41a5ed --- /dev/null +++ b/tools/llvm-ifs/CMakeLists.txt @@ -0,0 +1,10 @@ +set(LLVM_LINK_COMPONENTS + Object + Support + TextAPI + ObjectYAML + ) + +add_llvm_tool(llvm-ifs + llvm-ifs.cpp + ) diff --git a/tools/llvm-ifs/LLVMBuild.txt b/tools/llvm-ifs/LLVMBuild.txt new file mode 100644 index 000000000000..10dc6bd8f550 --- /dev/null +++ b/tools/llvm-ifs/LLVMBuild.txt @@ -0,0 +1,21 @@ +;===- ./tools/llvm-ifs/LLVMBuild.txt ---------------------------*- Conf -*--===; +; +; Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +; See https://llvm.org/LICENSE.txt for license information. +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +; +;===------------------------------------------------------------------------===; +; +; This is an LLVMBuild description file for the components in this subdirectory. +; +; For more information on the LLVMBuild system, please see: +; +; http://llvm.org/docs/LLVMBuild.html +; +;===------------------------------------------------------------------------===; + +[component_0] +type = Tool +name = llvm-ifs +parent = Tools +required_libraries = Object Support TextAPI diff --git a/tools/llvm-ifs/llvm-ifs.cpp b/tools/llvm-ifs/llvm-ifs.cpp new file mode 100644 index 000000000000..f329b4633632 --- /dev/null +++ b/tools/llvm-ifs/llvm-ifs.cpp @@ -0,0 +1,532 @@ +//===- llvm-ifs.cpp -------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
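The extended --bb help text above describes specifications of the form function:bb1[;bb2...]. A rough sketch of splitting such a specification with the StringRef helpers llvm-extract already links against (parseBlockSpec is hypothetical and not the tool's actual parser):

    #include "llvm/ADT/SmallVector.h"
    #include "llvm/ADT/StringRef.h"

    // Split "f:bb1;bb2" into the function name and the list of basic block
    // names; per the help text, the first block should dominate the rest.
    static void parseBlockSpec(llvm::StringRef Spec, llvm::StringRef &Func,
                               llvm::SmallVectorImpl<llvm::StringRef> &Blocks) {
      auto FuncAndBlocks = Spec.split(':');
      Func = FuncAndBlocks.first;
      FuncAndBlocks.second.split(Blocks, ';', /*MaxSplit=*/-1,
                                 /*KeepEmpty=*/false);
    }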
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===-----------------------------------------------------------------------===/ + +#include "llvm/ADT/StringRef.h" +#include "llvm/ADT/StringSwitch.h" +#include "llvm/ADT/Triple.h" +#include "llvm/ObjectYAML/yaml2obj.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/Errc.h" +#include "llvm/Support/Error.h" +#include "llvm/Support/FileOutputBuffer.h" +#include "llvm/Support/MemoryBuffer.h" +#include "llvm/Support/Path.h" +#include "llvm/Support/VersionTuple.h" +#include "llvm/Support/WithColor.h" +#include "llvm/Support/YAMLTraits.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/TextAPI/MachO/InterfaceFile.h" +#include "llvm/TextAPI/MachO/TextAPIReader.h" +#include "llvm/TextAPI/MachO/TextAPIWriter.h" +#include +#include + +using namespace llvm; +using namespace llvm::yaml; +using namespace llvm::MachO; + +#define DEBUG_TYPE "llvm-ifs" + +namespace { +const VersionTuple IFSVersionCurrent(1, 2); +} + +static cl::opt Action("action", cl::desc(""), + cl::value_desc("write-ifs | write-bin"), + cl::init("write-ifs")); + +static cl::opt ForceFormat("force-format", + cl::desc(""), + cl::value_desc("ELF | TBD"), + cl::init("")); + +static cl::list InputFilenames(cl::Positional, + cl::desc(""), + cl::ZeroOrMore); + +static cl::opt OutputFilename("o", cl::desc(""), + cl::value_desc("path")); + +enum class IFSSymbolType { + NoType = 0, + Object, + Func, + // Type information is 4 bits, so 16 is safely out of range. + Unknown = 16, +}; + +std::string getTypeName(IFSSymbolType Type) { + switch (Type) { + case IFSSymbolType::NoType: + return "NoType"; + case IFSSymbolType::Func: + return "Func"; + case IFSSymbolType::Object: + return "Object"; + case IFSSymbolType::Unknown: + return "Unknown"; + } + llvm_unreachable("Unexpected ifs symbol type."); +} + +struct IFSSymbol { + IFSSymbol(std::string SymbolName) : Name(SymbolName) {} + std::string Name; + uint64_t Size; + IFSSymbolType Type; + bool Weak; + Optional Warning; + bool operator<(const IFSSymbol &RHS) const { return Name < RHS.Name; } +}; + +namespace llvm { +namespace yaml { +/// YAML traits for IFSSymbolType. +template <> struct ScalarEnumerationTraits { + static void enumeration(IO &IO, IFSSymbolType &SymbolType) { + IO.enumCase(SymbolType, "NoType", IFSSymbolType::NoType); + IO.enumCase(SymbolType, "Func", IFSSymbolType::Func); + IO.enumCase(SymbolType, "Object", IFSSymbolType::Object); + IO.enumCase(SymbolType, "Unknown", IFSSymbolType::Unknown); + // Treat other symbol types as noise, and map to Unknown. + if (!IO.outputting() && IO.matchEnumFallback()) + SymbolType = IFSSymbolType::Unknown; + } +}; + +template <> struct ScalarTraits { + static void output(const VersionTuple &Value, void *, + llvm::raw_ostream &Out) { + Out << Value.getAsString(); + } + + static StringRef input(StringRef Scalar, void *, VersionTuple &Value) { + if (Value.tryParse(Scalar)) + return StringRef("Can't parse version: invalid version format."); + + if (Value > IFSVersionCurrent) + return StringRef("Unsupported IFS version."); + + // Returning empty StringRef indicates successful parse. + return StringRef(); + } + + // Don't place quotation marks around version value. + static QuotingType mustQuote(StringRef) { return QuotingType::None; } +}; + +/// YAML traits for IFSSymbol. 
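The ScalarTraits<VersionTuple> specialization below leans on VersionTuple::tryParse, which returns true on failure. A small stand-alone sketch of the same check the trait performs (isAcceptableIfsVersion is hypothetical and the literals are only examples):

    #include "llvm/ADT/StringRef.h"
    #include "llvm/Support/VersionTuple.h"

    // Mirror the validation done in ScalarTraits<VersionTuple>::input: reject
    // malformed strings and versions newer than the tool supports.
    static bool isAcceptableIfsVersion(llvm::StringRef Text,
                                       const llvm::VersionTuple &Current) {
      llvm::VersionTuple V;
      if (V.tryParse(Text)) // true means the string did not parse
        return false;
      return V <= Current;  // e.g. "1.0" and "1.2" pass against (1, 2)
    }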
+template <> struct MappingTraits { + static void mapping(IO &IO, IFSSymbol &Symbol) { + IO.mapRequired("Type", Symbol.Type); + // The need for symbol size depends on the symbol type. + if (Symbol.Type == IFSSymbolType::NoType) + IO.mapOptional("Size", Symbol.Size, (uint64_t)0); + else if (Symbol.Type == IFSSymbolType::Func) + Symbol.Size = 0; + else + IO.mapRequired("Size", Symbol.Size); + IO.mapOptional("Weak", Symbol.Weak, false); + IO.mapOptional("Warning", Symbol.Warning); + } + + // Compacts symbol information into a single line. + static const bool flow = true; +}; + +/// YAML traits for set of IFSSymbols. +template <> struct CustomMappingTraits> { + static void inputOne(IO &IO, StringRef Key, std::set &Set) { + std::string Name = Key.str(); + IFSSymbol Sym(Name); + IO.mapRequired(Name.c_str(), Sym); + Set.insert(Sym); + } + + static void output(IO &IO, std::set &Set) { + for (auto &Sym : Set) + IO.mapRequired(Sym.Name.c_str(), const_cast(Sym)); + } +}; +} // namespace yaml +} // namespace llvm + +// A cumulative representation of ELF stubs. +// Both textual and binary stubs will read into and write from this object. +class IFSStub { + // TODO: Add support for symbol versioning. +public: + VersionTuple IfsVersion; + std::string Triple; + std::string ObjectFileFormat; + Optional SOName; + std::vector NeededLibs; + std::set Symbols; + + IFSStub() = default; + IFSStub(const IFSStub &Stub) + : IfsVersion(Stub.IfsVersion), Triple(Stub.Triple), + ObjectFileFormat(Stub.ObjectFileFormat), SOName(Stub.SOName), + NeededLibs(Stub.NeededLibs), Symbols(Stub.Symbols) {} + IFSStub(IFSStub &&Stub) + : IfsVersion(std::move(Stub.IfsVersion)), Triple(std::move(Stub.Triple)), + ObjectFileFormat(std::move(Stub.ObjectFileFormat)), + SOName(std::move(Stub.SOName)), NeededLibs(std::move(Stub.NeededLibs)), + Symbols(std::move(Stub.Symbols)) {} +}; + +namespace llvm { +namespace yaml { +/// YAML traits for IFSStub objects. +template <> struct MappingTraits { + static void mapping(IO &IO, IFSStub &Stub) { + if (!IO.mapTag("!experimental-ifs-v1", true)) + IO.setError("Not a .ifs YAML file."); + IO.mapRequired("IfsVersion", Stub.IfsVersion); + IO.mapOptional("Triple", Stub.Triple); + IO.mapOptional("ObjectFileFormat", Stub.ObjectFileFormat); + IO.mapOptional("SOName", Stub.SOName); + IO.mapOptional("NeededLibs", Stub.NeededLibs); + IO.mapRequired("Symbols", Stub.Symbols); + } +}; +} // namespace yaml +} // namespace llvm + +static Expected> readInputFile(StringRef FilePath) { + // Read in file. + ErrorOr> BufOrError = + MemoryBuffer::getFileOrSTDIN(FilePath); + if (!BufOrError) + return createStringError(BufOrError.getError(), "Could not open `%s`", + FilePath.data()); + + std::unique_ptr FileReadBuffer = std::move(*BufOrError); + yaml::Input YamlIn(FileReadBuffer->getBuffer()); + std::unique_ptr Stub(new IFSStub()); + YamlIn >> *Stub; + + if (std::error_code Err = YamlIn.error()) + return createStringError(Err, "Failed reading Interface Stub File."); + + return std::move(Stub); +} + +int writeTbdStub(const llvm::Triple &T, const std::set &Symbols, + const StringRef Format, raw_ostream &Out) { + + auto PlatformKindOrError = + [](const llvm::Triple &T) -> llvm::Expected { + if (T.isMacOSX()) + return llvm::MachO::PlatformKind::macOS; + if (T.isTvOS()) + return llvm::MachO::PlatformKind::tvOS; + if (T.isWatchOS()) + return llvm::MachO::PlatformKind::watchOS; + // Note: put isiOS last because tvOS and watchOS are also iOS according + // to the Triple. 
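To make the traits above concrete, here is a minimal, hedged driver that feeds an !experimental-ifs-v1 document (the same textual layout documented in the comment before main() further down) through yaml::Input. The triple and symbol name are invented for illustration, and the function assumes the IFSStub definition and YAML traits from this file:

    #include "llvm/Support/YAMLTraits.h"
    #include "llvm/Support/raw_ostream.h"

    // Parse a tiny in-memory interface stub using the traits defined above.
    static void parseExampleStub() {
      static const char Doc[] = "--- !experimental-ifs-v1\n"
                                "IfsVersion: 1.0\n"
                                "Triple: x86_64-unknown-linux-gnu\n"
                                "ObjectFileFormat: ELF\n"
                                "Symbols:\n"
                                "  foo: { Type: Func }\n"
                                "...\n";
      IFSStub Stub;
      llvm::yaml::Input YamlIn(Doc);
      YamlIn >> Stub;
      if (YamlIn.error())
        llvm::errs() << "failed to parse the example stub\n";
    }

Note that a Func symbol needs no Size key, since MappingTraits<IFSSymbol> forces Size to zero for functions.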
+ if (T.isiOS()) + return llvm::MachO::PlatformKind::iOS; + + // TODO: Add an option for ForceTriple, but keep ForceFormat for now. + if (ForceFormat == "TBD") + return llvm::MachO::PlatformKind::macOS; + + return createStringError(errc::not_supported, "Invalid Platform.\n"); + }(T); + + if (!PlatformKindOrError) + return -1; + + PlatformKind Plat = PlatformKindOrError.get(); + TargetList Targets({Target(llvm::MachO::mapToArchitecture(T), Plat)}); + + InterfaceFile File; + File.setFileType(FileType::TBD_V3); // Only supporting v3 for now. + File.addTargets(Targets); + + for (const auto &Symbol : Symbols) { + auto Name = Symbol.Name; + auto Kind = SymbolKind::GlobalSymbol; + switch (Symbol.Type) { + default: + case IFSSymbolType::NoType: + Kind = SymbolKind::GlobalSymbol; + break; + case IFSSymbolType::Object: + Kind = SymbolKind::GlobalSymbol; + break; + case IFSSymbolType::Func: + Kind = SymbolKind::GlobalSymbol; + break; + } + if (Symbol.Weak) + File.addSymbol(Kind, Name, Targets, SymbolFlags::WeakDefined); + else + File.addSymbol(Kind, Name, Targets); + } + + SmallString<4096> Buffer; + raw_svector_ostream OS(Buffer); + if (Error Result = TextAPIWriter::writeToStream(OS, File)) + return -1; + Out << OS.str(); + return 0; +} + +int writeElfStub(const llvm::Triple &T, const std::set &Symbols, + const StringRef Format, raw_ostream &Out) { + SmallString<0> Storage; + Storage.clear(); + raw_svector_ostream OS(Storage); + + OS << "--- !ELF\n"; + OS << "FileHeader:\n"; + OS << " Class: ELFCLASS"; + OS << (T.isArch64Bit() ? "64" : "32"); + OS << "\n"; + OS << " Data: ELFDATA2"; + OS << (T.isLittleEndian() ? "LSB" : "MSB"); + OS << "\n"; + OS << " Type: ET_DYN\n"; + OS << " Machine: " + << llvm::StringSwitch(T.getArchName()) + .Case("x86_64", "EM_X86_64") + .Case("i386", "EM_386") + .Case("i686", "EM_386") + .Case("aarch64", "EM_AARCH64") + .Case("amdgcn", "EM_AMDGPU") + .Case("r600", "EM_AMDGPU") + .Case("arm", "EM_ARM") + .Case("thumb", "EM_ARM") + .Case("avr", "EM_AVR") + .Case("mips", "EM_MIPS") + .Case("mipsel", "EM_MIPS") + .Case("mips64", "EM_MIPS") + .Case("mips64el", "EM_MIPS") + .Case("msp430", "EM_MSP430") + .Case("ppc", "EM_PPC") + .Case("ppc64", "EM_PPC64") + .Case("ppc64le", "EM_PPC64") + .Case("x86", T.isOSIAMCU() ? "EM_IAMCU" : "EM_386") + .Case("x86_64", "EM_X86_64") + .Default("EM_NONE") + << "\nSections:" + << "\n - Name: .text" + << "\n Type: SHT_PROGBITS" + << "\n - Name: .data" + << "\n Type: SHT_PROGBITS" + << "\n - Name: .rodata" + << "\n Type: SHT_PROGBITS" + << "\nSymbols:\n"; + for (const auto &Symbol : Symbols) { + OS << " - Name: " << Symbol.Name << "\n" + << " Type: STT_"; + switch (Symbol.Type) { + default: + case IFSSymbolType::NoType: + OS << "NOTYPE"; + break; + case IFSSymbolType::Object: + OS << "OBJECT"; + break; + case IFSSymbolType::Func: + OS << "FUNC"; + break; + } + OS << "\n Section: .text" + << "\n Binding: STB_" << (Symbol.Weak ? "WEAK" : "GLOBAL") + << "\n"; + } + OS << "...\n"; + + std::string YamlStr = OS.str(); + + // Only or debugging. Not an offical format. + LLVM_DEBUG({ + if (ForceFormat == "ELFOBJYAML") { + Out << YamlStr; + return 0; + } + }); + + yaml::Input YIn(YamlStr); + auto ErrHandler = [](const Twine &Msg) { + WithColor::error(errs(), "llvm-ifs") << Msg << "\n"; + }; + return convertYAML(YIn, Out, ErrHandler) ? 
0 : 1; +} + +int writeIfso(const IFSStub &Stub, bool IsWriteIfs, raw_ostream &Out) { + if (IsWriteIfs) { + yaml::Output YamlOut(Out, NULL, /*WrapColumn =*/0); + YamlOut << const_cast(Stub); + return 0; + } + + std::string ObjectFileFormat = + ForceFormat.empty() ? Stub.ObjectFileFormat : ForceFormat; + + if (ObjectFileFormat == "ELF" || ForceFormat == "ELFOBJYAML") + return writeElfStub(llvm::Triple(Stub.Triple), Stub.Symbols, + Stub.ObjectFileFormat, Out); + if (ObjectFileFormat == "TBD") + return writeTbdStub(llvm::Triple(Stub.Triple), Stub.Symbols, + Stub.ObjectFileFormat, Out); + + WithColor::error() + << "Invalid ObjectFileFormat: Only ELF and TBD are supported.\n"; + return -1; +} + +// New Interface Stubs Yaml Format: +// --- !experimental-ifs-v1 +// IfsVersion: 1.0 +// Triple: +// ObjectFileFormat: +// Symbols: +// _ZSymbolName: { Type: } +// ... + +int main(int argc, char *argv[]) { + // Parse arguments. + cl::ParseCommandLineOptions(argc, argv); + + if (InputFilenames.empty()) + InputFilenames.push_back("-"); + + IFSStub Stub; + std::map SymbolMap; + + std::string PreviousInputFilePath = ""; + for (const std::string &InputFilePath : InputFilenames) { + Expected> StubOrErr = readInputFile(InputFilePath); + if (!StubOrErr) { + WithColor::error() << StubOrErr.takeError() << "\n"; + return -1; + } + std::unique_ptr TargetStub = std::move(StubOrErr.get()); + + if (Stub.Triple.empty()) { + PreviousInputFilePath = InputFilePath; + Stub.IfsVersion = TargetStub->IfsVersion; + Stub.Triple = TargetStub->Triple; + Stub.ObjectFileFormat = TargetStub->ObjectFileFormat; + Stub.SOName = TargetStub->SOName; + Stub.NeededLibs = TargetStub->NeededLibs; + } else { + if (Stub.IfsVersion != TargetStub->IfsVersion) { + if (Stub.IfsVersion.getMajor() != IFSVersionCurrent.getMajor()) { + WithColor::error() + << "Interface Stub: IfsVersion Mismatch." + << "\nFilenames: " << PreviousInputFilePath << " " + << InputFilePath << "\nIfsVersion Values: " << Stub.IfsVersion + << " " << TargetStub->IfsVersion << "\n"; + return -1; + } + if (TargetStub->IfsVersion > Stub.IfsVersion) + Stub.IfsVersion = TargetStub->IfsVersion; + } + if (Stub.ObjectFileFormat != TargetStub->ObjectFileFormat) { + WithColor::error() << "Interface Stub: ObjectFileFormat Mismatch." + << "\nFilenames: " << PreviousInputFilePath << " " + << InputFilePath << "\nObjectFileFormat Values: " + << Stub.ObjectFileFormat << " " + << TargetStub->ObjectFileFormat << "\n"; + return -1; + } + if (Stub.Triple != TargetStub->Triple) { + WithColor::error() << "Interface Stub: Triple Mismatch." + << "\nFilenames: " << PreviousInputFilePath << " " + << InputFilePath + << "\nTriple Values: " << Stub.Triple << " " + << TargetStub->Triple << "\n"; + return -1; + } + if (Stub.SOName != TargetStub->SOName) { + WithColor::error() << "Interface Stub: SOName Mismatch." + << "\nFilenames: " << PreviousInputFilePath << " " + << InputFilePath + << "\nSOName Values: " << Stub.SOName << " " + << TargetStub->SOName << "\n"; + return -1; + } + if (Stub.NeededLibs != TargetStub->NeededLibs) { + WithColor::error() << "Interface Stub: NeededLibs Mismatch." 
+ << "\nFilenames: " << PreviousInputFilePath << " " + << InputFilePath << "\n"; + return -1; + } + } + + for (auto Symbol : TargetStub->Symbols) { + auto SI = SymbolMap.find(Symbol.Name); + if (SI == SymbolMap.end()) { + SymbolMap.insert( + std::pair(Symbol.Name, Symbol)); + continue; + } + + assert(Symbol.Name == SI->second.Name && "Symbol Names Must Match."); + + // Check conflicts: + if (Symbol.Type != SI->second.Type) { + WithColor::error() << "Interface Stub: Type Mismatch for " + << Symbol.Name << ".\nFilename: " << InputFilePath + << "\nType Values: " << getTypeName(SI->second.Type) + << " " << getTypeName(Symbol.Type) << "\n"; + + return -1; + } + if (Symbol.Size != SI->second.Size) { + WithColor::error() << "Interface Stub: Size Mismatch for " + << Symbol.Name << ".\nFilename: " << InputFilePath + << "\nSize Values: " << SI->second.Size << " " + << Symbol.Size << "\n"; + + return -1; + } + if (Symbol.Weak != SI->second.Weak) { + // TODO: Add conflict resolution for Weak vs non-Weak. + WithColor::error() << "Interface Stub: Weak Mismatch for " + << Symbol.Name << ".\nFilename: " << InputFilePath + << "\nWeak Values: " << SI->second.Weak << " " + << Symbol.Weak << "\n"; + + return -1; + } + // TODO: Not checking Warning. Will be dropped. + } + + PreviousInputFilePath = InputFilePath; + } + + if (Stub.IfsVersion != IFSVersionCurrent) + if (Stub.IfsVersion.getMajor() != IFSVersionCurrent.getMajor()) { + WithColor::error() << "Interface Stub: Bad IfsVersion: " + << Stub.IfsVersion << ", llvm-ifs supported version: " + << IFSVersionCurrent << ".\n"; + return -1; + } + + for (auto &Entry : SymbolMap) + Stub.Symbols.insert(Entry.second); + + std::error_code SysErr; + + // Open file for writing. + raw_fd_ostream Out(OutputFilename, SysErr); + if (SysErr) { + WithColor::error() << "Couldn't open " << OutputFilename + << " for writing.\n"; + return -1; + } + + return writeIfso(Stub, (Action == "write-ifs"), Out); +} diff --git a/tools/llvm-link/llvm-link.cpp b/tools/llvm-link/llvm-link.cpp index 50ba57178d02..fa36e083b6f8 100644 --- a/tools/llvm-link/llvm-link.cpp +++ b/tools/llvm-link/llvm-link.cpp @@ -351,13 +351,13 @@ int main(int argc, char **argv) { LLVMContext Context; Context.setDiagnosticHandler( - llvm::make_unique(), true); + std::make_unique(), true); cl::ParseCommandLineOptions(argc, argv, "llvm linker\n"); if (!DisableDITypeMap) Context.enableDebugTypeODRUniquing(); - auto Composite = make_unique("llvm-link", Context); + auto Composite = std::make_unique("llvm-link", Context); Linker L(*Composite); unsigned Flags = Linker::Flags::None; @@ -381,7 +381,7 @@ int main(int argc, char **argv) { errs() << "Here's the assembly:\n" << *Composite; std::error_code EC; - ToolOutputFile Out(OutputFilename, EC, sys::fs::F_None); + ToolOutputFile Out(OutputFilename, EC, sys::fs::OF_None); if (EC) { WithColor::error() << EC.message() << '\n'; return 1; diff --git a/tools/llvm-lto/llvm-lto.cpp b/tools/llvm-lto/llvm-lto.cpp index 585207b25185..b47e68e82850 100644 --- a/tools/llvm-lto/llvm-lto.cpp +++ b/tools/llvm-lto/llvm-lto.cpp @@ -315,8 +315,8 @@ getLocalLTOModule(StringRef Path, std::unique_ptr &Buffer, error(BufferOrErr, "error loading file '" + Path + "'"); Buffer = std::move(BufferOrErr.get()); CurrentActivity = ("loading file '" + Path + "'").str(); - std::unique_ptr Context = llvm::make_unique(); - Context->setDiagnosticHandler(llvm::make_unique(), + std::unique_ptr Context = std::make_unique(); + Context->setDiagnosticHandler(std::make_unique(), true); ErrorOr> Ret = 
LTOModule::createInLocalContext( std::move(Context), Buffer->getBufferStart(), Buffer->getBufferSize(), @@ -420,7 +420,7 @@ static void createCombinedModuleSummaryIndex() { std::error_code EC; assert(!OutputFilename.empty()); raw_fd_ostream OS(OutputFilename + ".thinlto.bc", EC, - sys::fs::OpenFlags::F_None); + sys::fs::OpenFlags::OF_None); error(EC, "error opening the file '" + OutputFilename + ".thinlto.bc'"); WriteIndexToFile(CombinedIndex, OS); OS.close(); @@ -510,7 +510,7 @@ static std::unique_ptr loadModuleFromInput(lto::InputFile &File, static void writeModuleToFile(Module &TheModule, StringRef Filename) { std::error_code EC; - raw_fd_ostream OS(Filename, EC, sys::fs::OpenFlags::F_None); + raw_fd_ostream OS(Filename, EC, sys::fs::OpenFlags::OF_None); error(EC, "error opening the file '" + Filename + "'"); maybeVerifyModule(TheModule); WriteBitcodeToFile(TheModule, OS, /* ShouldPreserveUseListOrder */ true); @@ -581,7 +581,7 @@ private: if (!CombinedIndex) report_fatal_error("ThinLink didn't create an index"); std::error_code EC; - raw_fd_ostream OS(OutputFilename, EC, sys::fs::OpenFlags::F_None); + raw_fd_ostream OS(OutputFilename, EC, sys::fs::OpenFlags::OF_None); error(EC, "error opening the file '" + OutputFilename + "'"); WriteIndexToFile(*CombinedIndex, OS); } @@ -619,7 +619,7 @@ private: } OutputName = getThinLTOOutputFile(OutputName, OldPrefix, NewPrefix); std::error_code EC; - raw_fd_ostream OS(OutputName, EC, sys::fs::OpenFlags::F_None); + raw_fd_ostream OS(OutputName, EC, sys::fs::OpenFlags::OF_None); error(EC, "error opening the file '" + OutputName + "'"); WriteIndexToFile(*Index, OS, &ModuleToSummariesForIndex); } @@ -802,7 +802,7 @@ private: } std::error_code EC; - raw_fd_ostream OS(OutputName, EC, sys::fs::OpenFlags::F_None); + raw_fd_ostream OS(OutputName, EC, sys::fs::OpenFlags::OF_None); error(EC, "error opening the file '" + OutputName + "'"); OS << std::get<0>(BinName)->getBuffer(); } @@ -848,7 +848,7 @@ private: for (unsigned BufID = 0; BufID < Binaries.size(); ++BufID) { auto OutputName = InputFilenames[BufID] + ".thinlto.o"; std::error_code EC; - raw_fd_ostream OS(OutputName, EC, sys::fs::OpenFlags::F_None); + raw_fd_ostream OS(OutputName, EC, sys::fs::OpenFlags::OF_None); error(EC, "error opening the file '" + OutputName + "'"); OS << Binaries[BufID]->getBuffer(); } @@ -921,7 +921,7 @@ int main(int argc, char **argv) { unsigned BaseArg = 0; LLVMContext Context; - Context.setDiagnosticHandler(llvm::make_unique(), + Context.setDiagnosticHandler(std::make_unique(), true); LTOCodeGenerator CodeGen(Context); @@ -1020,7 +1020,7 @@ int main(int argc, char **argv) { if (Parallelism != 1) PartFilename += "." + utostr(I); std::error_code EC; - OSs.emplace_back(PartFilename, EC, sys::fs::F_None); + OSs.emplace_back(PartFilename, EC, sys::fs::OF_None); if (EC) error("error opening the file '" + PartFilename + "': " + EC.message()); OSPtrs.push_back(&OSs.back().os()); diff --git a/tools/llvm-lto2/llvm-lto2.cpp b/tools/llvm-lto2/llvm-lto2.cpp index 0bd9289dc938..5e3b3dcb6c31 100644 --- a/tools/llvm-lto2/llvm-lto2.cpp +++ b/tools/llvm-lto2/llvm-lto2.cpp @@ -291,6 +291,14 @@ static int run(int argc, char **argv) { std::vector Res; for (const InputFile::Symbol &Sym : Input->symbols()) { auto I = CommandLineResolutions.find({F, Sym.getName()}); + // If it isn't found, look for "$", which would have been added + // (followed by a hash) when the symbol was promoted during module + // splitting if it was defined in one part and used in the other. 
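The llvm-lto2 fallback introduced here strips a promotion suffix before retrying the resolution lookup. A hypothetical helper showing the same StringRef::rsplit behaviour in isolation (the example names are made up):

    #include "llvm/ADT/StringRef.h"

    // "foo$0123abcd" -> "foo". Names without '$' come back unchanged, because
    // rsplit leaves the whole string in .first when the separator is absent.
    static llvm::StringRef baseSymbolName(llvm::StringRef PromotedName) {
      return PromotedName.rsplit("$").first;
    }

CommandLineResolutions would then be probed with the stripped name, exactly as the code does after the first lookup misses.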
+ // Try looking up the symbol name before the "$". + if (I == CommandLineResolutions.end()) { + auto SplitName = Sym.getName().rsplit("$"); + I = CommandLineResolutions.find({F, SplitName.first}); + } if (I == CommandLineResolutions.end()) { llvm::errs() << argv[0] << ": missing symbol resolution for " << F << ',' << Sym.getName() << '\n'; @@ -325,9 +333,9 @@ static int run(int argc, char **argv) { std::string Path = OutputFilename + "." + utostr(Task); std::error_code EC; - auto S = llvm::make_unique(Path, EC, sys::fs::F_None); + auto S = std::make_unique(Path, EC, sys::fs::OF_None); check(EC, Path); - return llvm::make_unique(std::move(S)); + return std::make_unique(std::move(S)); }; auto AddBuffer = [&](size_t Task, std::unique_ptr MB) { diff --git a/tools/llvm-mc/Disassembler.cpp b/tools/llvm-mc/Disassembler.cpp index e2af2e7f2e32..1ddbddfa1846 100644 --- a/tools/llvm-mc/Disassembler.cpp +++ b/tools/llvm-mc/Disassembler.cpp @@ -17,6 +17,7 @@ #include "llvm/MC/MCContext.h" #include "llvm/MC/MCDisassembler/MCDisassembler.h" #include "llvm/MC/MCInst.h" +#include "llvm/MC/MCObjectFileInfo.h" #include "llvm/MC/MCRegisterInfo.h" #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSubtargetInfo.h" @@ -129,13 +130,10 @@ static bool ByteArrayFromString(ByteArrayTy &ByteArray, return false; } -int Disassembler::disassemble(const Target &T, - const std::string &Triple, - MCSubtargetInfo &STI, - MCStreamer &Streamer, - MemoryBuffer &Buffer, - SourceMgr &SM, - raw_ostream &Out) { +int Disassembler::disassemble(const Target &T, const std::string &Triple, + MCSubtargetInfo &STI, MCStreamer &Streamer, + MemoryBuffer &Buffer, SourceMgr &SM, + MCContext &Ctx, raw_ostream &Out) { std::unique_ptr MRI(T.createMCRegInfo(Triple)); if (!MRI) { @@ -149,9 +147,6 @@ int Disassembler::disassemble(const Target &T, return -1; } - // Set up the MCContext for creating symbols and MCExpr's. 
- MCContext Ctx(MAI.get(), MRI.get(), nullptr); - std::unique_ptr DisAsm( T.createMCDisassembler(STI, Ctx)); if (!DisAsm) { diff --git a/tools/llvm-mc/Disassembler.h b/tools/llvm-mc/Disassembler.h index 11b685233abc..dcd8c279c91a 100644 --- a/tools/llvm-mc/Disassembler.h +++ b/tools/llvm-mc/Disassembler.h @@ -22,17 +22,15 @@ class MemoryBuffer; class Target; class raw_ostream; class SourceMgr; +class MCContext; class MCSubtargetInfo; class MCStreamer; class Disassembler { public: - static int disassemble(const Target &T, - const std::string &Triple, - MCSubtargetInfo &STI, - MCStreamer &Streamer, - MemoryBuffer &Buffer, - SourceMgr &SM, + static int disassemble(const Target &T, const std::string &Triple, + MCSubtargetInfo &STI, MCStreamer &Streamer, + MemoryBuffer &Buffer, SourceMgr &SM, MCContext &Ctx, raw_ostream &Out); }; diff --git a/tools/llvm-mc/llvm-mc.cpp b/tools/llvm-mc/llvm-mc.cpp index ec189c297860..c23740a3094d 100644 --- a/tools/llvm-mc/llvm-mc.cpp +++ b/tools/llvm-mc/llvm-mc.cpp @@ -209,9 +209,10 @@ static const Target *GetTarget(const char *ProgName) { return TheTarget; } -static std::unique_ptr GetOutputStream(StringRef Path) { +static std::unique_ptr GetOutputStream(StringRef Path, + sys::fs::OpenFlags Flags) { std::error_code EC; - auto Out = llvm::make_unique(Path, EC, sys::fs::F_None); + auto Out = std::make_unique(Path, EC, Flags); if (EC) { WithColor::error() << EC.message() << '\n'; return nullptr; @@ -279,7 +280,7 @@ static int fillCommandLineSymbols(MCAsmParser &Parser) { static int AssembleInput(const char *ProgName, const Target *TheTarget, SourceMgr &SrcMgr, MCContext &Ctx, MCStreamer &Str, MCAsmInfo &MAI, MCSubtargetInfo &STI, - MCInstrInfo &MCII, MCTargetOptions &MCOptions) { + MCInstrInfo &MCII, MCTargetOptions const &MCOptions) { std::unique_ptr Parser( createMCAsmParser(SrcMgr, Ctx, Str, MAI)); std::unique_ptr TAP( @@ -316,7 +317,7 @@ int main(int argc, char **argv) { cl::AddExtraVersionPrinter(TargetRegistry::printRegisteredTargetsForVersion); cl::ParseCommandLineOptions(argc, argv, "llvm machine code playground\n"); - MCTargetOptions MCOptions = InitMCTargetOptionsFromFlags(); + const MCTargetOptions MCOptions = InitMCTargetOptionsFromFlags(); setDwarfDebugFlags(argc, argv); setDwarfDebugProducer(); @@ -368,7 +369,7 @@ int main(int argc, char **argv) { // FIXME: This is not pretty. MCContext has a ptr to MCObjectFileInfo and // MCObjectFileInfo needs a MCContext reference in order to initialize itself. MCObjectFileInfo MOFI; - MCContext Ctx(MAI.get(), MRI.get(), &MOFI, &SrcMgr); + MCContext Ctx(MAI.get(), MRI.get(), &MOFI, &SrcMgr, &MCOptions); MOFI.InitMCObjectFileInfo(TheTriple, PIC, Ctx, LargeCodeModel); if (SaveTempLabels) @@ -413,7 +414,9 @@ int main(int argc, char **argv) { FeaturesStr = Features.getString(); } - std::unique_ptr Out = GetOutputStream(OutputFilename); + sys::fs::OpenFlags Flags = (FileType == OFT_AssemblyFile) ? 
sys::fs::OF_Text + : sys::fs::OF_None; + std::unique_ptr Out = GetOutputStream(OutputFilename, Flags); if (!Out) return 1; @@ -423,7 +426,7 @@ int main(int argc, char **argv) { WithColor::error() << "dwo output only supported with object files\n"; return 1; } - DwoOut = GetOutputStream(SplitDwarfFile); + DwoOut = GetOutputStream(SplitDwarfFile, sys::fs::OF_None); if (!DwoOut) return 1; } @@ -459,7 +462,7 @@ int main(int argc, char **argv) { std::unique_ptr MAB( TheTarget->createMCAsmBackend(*STI, *MRI, MCOptions)); - auto FOut = llvm::make_unique(*OS); + auto FOut = std::make_unique(*OS); Str.reset( TheTarget->createAsmStreamer(Ctx, std::move(FOut), /*asmverbose*/ true, /*useDwarfDirectory*/ true, IP, @@ -474,7 +477,7 @@ int main(int argc, char **argv) { Ctx.setUseNamesOnTempLabels(false); if (!Out->os().supportsSeeking()) { - BOS = make_unique(Out->os()); + BOS = std::make_unique(Out->os()); OS = BOS.get(); } @@ -506,7 +509,7 @@ int main(int argc, char **argv) { break; case AC_MDisassemble: assert(IP && "Expected assembly output"); - IP->setUseMarkup(1); + IP->setUseMarkup(true); disassemble = true; break; case AC_Disassemble: @@ -514,8 +517,8 @@ int main(int argc, char **argv) { break; } if (disassemble) - Res = Disassembler::disassemble(*TheTarget, TripleName, *STI, *Str, - *Buffer, SrcMgr, Out->os()); + Res = Disassembler::disassemble(*TheTarget, TripleName, *STI, *Str, *Buffer, + SrcMgr, Ctx, Out->os()); // Keep output if no errors. if (Res == 0) { diff --git a/tools/llvm-mca/CodeRegion.cpp b/tools/llvm-mca/CodeRegion.cpp index bf592f67245e..e05517c1ac95 100644 --- a/tools/llvm-mca/CodeRegion.cpp +++ b/tools/llvm-mca/CodeRegion.cpp @@ -18,7 +18,7 @@ namespace mca { CodeRegions::CodeRegions(llvm::SourceMgr &S) : SM(S), FoundErrors(false) { // Create a default region for the input code sequence. - Regions.emplace_back(make_unique("", SMLoc())); + Regions.emplace_back(std::make_unique("", SMLoc())); } bool CodeRegion::isLocInRange(SMLoc Loc) const { @@ -36,7 +36,7 @@ void CodeRegions::beginRegion(StringRef Description, SMLoc Loc) { if (Regions.size() == 1 && !Regions[0]->startLoc().isValid() && !Regions[0]->endLoc().isValid()) { ActiveRegions[Description] = 0; - Regions[0] = make_unique(Description, Loc); + Regions[0] = std::make_unique(Description, Loc); return; } } else { @@ -62,7 +62,7 @@ void CodeRegions::beginRegion(StringRef Description, SMLoc Loc) { } ActiveRegions[Description] = Regions.size(); - Regions.emplace_back(make_unique(Description, Loc)); + Regions.emplace_back(std::make_unique(Description, Loc)); return; } diff --git a/tools/llvm-mca/CodeRegionGenerator.cpp b/tools/llvm-mca/CodeRegionGenerator.cpp index c793169e64e0..8ddcd2f4abe2 100644 --- a/tools/llvm-mca/CodeRegionGenerator.cpp +++ b/tools/llvm-mca/CodeRegionGenerator.cpp @@ -118,6 +118,8 @@ Expected AsmCodeRegionGenerator::parseCodeRegions() { MCAsmLexer &Lexer = Parser->getLexer(); MCACommentConsumer CC(Regions); Lexer.setCommentConsumer(&CC); + // Enable support for MASM literal numbers (example: 05h, 101b). 
+ Lexer.setLexMasmIntegers(true); std::unique_ptr TAP( TheTarget.createMCAsmParser(STI, *Parser, MCII, Opts)); diff --git a/tools/llvm-mca/Views/BottleneckAnalysis.cpp b/tools/llvm-mca/Views/BottleneckAnalysis.cpp index 560c6c6e8a33..feff0cd6d524 100644 --- a/tools/llvm-mca/Views/BottleneckAnalysis.cpp +++ b/tools/llvm-mca/Views/BottleneckAnalysis.cpp @@ -165,10 +165,33 @@ void DependencyGraph::dumpDependencyEdge(raw_ostream &OS, "Unsupported dependency type!"); OS << " - RESOURCE MASK: " << DE.ResourceOrRegID; } - OS << " - CYCLES: " << DE.Cost << '\n'; + OS << " - COST: " << DE.Cost << '\n'; } #endif // NDEBUG +void DependencyGraph::pruneEdges(unsigned Iterations) { + for (DGNode &N : Nodes) { + unsigned NumPruned = 0; + const unsigned Size = N.OutgoingEdges.size(); + // Use a cut-off threshold to prune edges with a low frequency. + for (unsigned I = 0, E = Size; I < E; ++I) { + DependencyEdge &Edge = N.OutgoingEdges[I]; + if (Edge.Frequency == Iterations) + continue; + double Factor = (double)Edge.Frequency / Iterations; + if (0.10 < Factor) + continue; + Nodes[Edge.ToIID].NumPredecessors--; + std::swap(Edge, N.OutgoingEdges[E - 1]); + --E; + ++NumPruned; + } + + if (NumPruned) + N.OutgoingEdges.resize(Size - NumPruned); + } +} + void DependencyGraph::initializeRootSet( SmallVectorImpl &RootSet) const { for (unsigned I = 0, E = Nodes.size(); I < E; ++I) { @@ -179,7 +202,7 @@ void DependencyGraph::initializeRootSet( } void DependencyGraph::propagateThroughEdges( - SmallVectorImpl &RootSet) { + SmallVectorImpl &RootSet, unsigned Iterations) { SmallVector ToVisit; // A critical sequence is computed as the longest path from a node of the @@ -189,6 +212,10 @@ void DependencyGraph::propagateThroughEdges( // Each node of the graph starts with an initial default cost of zero. The // cost of a node is a measure of criticality: the higher the cost, the bigger // is the performance impact. + // For register and memory dependencies, the cost is a function of the write + // latency as well as the actual delay (in cycles) caused to users. + // For processor resource dependencies, the cost is a function of the resource + // pressure. Resource interferences with low frequency values are ignored. // // This algorithm is very similar to a (reverse) Dijkstra. Every iteration of // the inner loop selects (i.e. visits) a node N from a set of `unvisited @@ -277,6 +304,10 @@ static void printInstruction(formatted_raw_ostream &FOS, } void BottleneckAnalysis::printCriticalSequence(raw_ostream &OS) const { + // Early exit if no bottlenecks were found during the simulation. 
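Restating the cut-off that the new pruneEdges applies, as a stand-alone predicate (keepEdge is hypothetical; the 10% threshold and the every-iteration exception are taken from the loop above):

    // An edge survives pruning if it occurs on every simulated iteration, or
    // on strictly more than 10% of them. Iterations is assumed non-zero, as
    // in the caller.
    static bool keepEdge(unsigned EdgeFrequency, unsigned Iterations) {
      if (EdgeFrequency == Iterations)
        return true;
      return (double)EdgeFrequency / Iterations > 0.10;
    }

This pruning replaces the earlier `Cost *= Iterations / 2` scaling of loop-carried edges, which the hunks below remove.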
+ if (!SeenStallCycles || !BPI.PressureIncreaseCycles) + return; + SmallVector Seq; DG.getCriticalSequence(Seq); if (Seq.empty()) @@ -432,7 +463,6 @@ void BottleneckAnalysis::addRegisterDep(unsigned From, unsigned To, bool IsLoopCarried = From >= To; unsigned SourceSize = Source.size(); if (IsLoopCarried) { - Cost *= Iterations / 2; DG.addRegisterDep(From, To + SourceSize, RegID, Cost); DG.addRegisterDep(From + SourceSize, To + (SourceSize * 2), RegID, Cost); return; @@ -445,7 +475,6 @@ void BottleneckAnalysis::addMemoryDep(unsigned From, unsigned To, bool IsLoopCarried = From >= To; unsigned SourceSize = Source.size(); if (IsLoopCarried) { - Cost *= Iterations / 2; DG.addMemoryDep(From, To + SourceSize, Cost); DG.addMemoryDep(From + SourceSize, To + (SourceSize * 2), Cost); return; @@ -458,7 +487,6 @@ void BottleneckAnalysis::addResourceDep(unsigned From, unsigned To, bool IsLoopCarried = From >= To; unsigned SourceSize = Source.size(); if (IsLoopCarried) { - Cost *= Iterations / 2; DG.addResourceDep(From, To + SourceSize, Mask, Cost); DG.addResourceDep(From + SourceSize, To + (SourceSize * 2), Mask, Cost); return; @@ -514,7 +542,7 @@ void BottleneckAnalysis::onEvent(const HWInstructionEvent &Event) { // Check if this is the last simulated instruction. if (IID == ((Iterations * Source.size()) - 1)) - DG.finalizeGraph(); + DG.finalizeGraph(Iterations); } void BottleneckAnalysis::onEvent(const HWPressureEvent &Event) { diff --git a/tools/llvm-mca/Views/BottleneckAnalysis.h b/tools/llvm-mca/Views/BottleneckAnalysis.h index 7564b1a48206..9e3bd5978f09 100644 --- a/tools/llvm-mca/Views/BottleneckAnalysis.h +++ b/tools/llvm-mca/Views/BottleneckAnalysis.h @@ -236,8 +236,9 @@ class DependencyGraph { void addDependency(unsigned From, unsigned To, DependencyEdge::Dependency &&DE); + void pruneEdges(unsigned Iterations); void initializeRootSet(SmallVectorImpl &RootSet) const; - void propagateThroughEdges(SmallVectorImpl &RootSet); + void propagateThroughEdges(SmallVectorImpl &RootSet, unsigned Iterations); #ifndef NDEBUG void dumpDependencyEdge(raw_ostream &OS, const DependencyEdge &DE, @@ -263,10 +264,11 @@ public: // Called by the bottleneck analysis at the end of simulation to propagate // costs through the edges of the graph, and compute a critical path. 
- void finalizeGraph() { + void finalizeGraph(unsigned Iterations) { SmallVector RootSet; + pruneEdges(Iterations); initializeRootSet(RootSet); - propagateThroughEdges(RootSet); + propagateThroughEdges(RootSet, Iterations); } // Returns a sequence of edges representing the critical sequence based on the diff --git a/tools/llvm-mca/Views/InstructionInfoView.cpp b/tools/llvm-mca/Views/InstructionInfoView.cpp index 1fbffa3e5b69..a6f9153b4945 100644 --- a/tools/llvm-mca/Views/InstructionInfoView.cpp +++ b/tools/llvm-mca/Views/InstructionInfoView.cpp @@ -12,6 +12,7 @@ //===----------------------------------------------------------------------===// #include "Views/InstructionInfoView.h" +#include "llvm/Support/FormattedStream.h" namespace llvm { namespace mca { @@ -26,10 +27,17 @@ void InstructionInfoView::printView(raw_ostream &OS) const { TempStream << "\n\nInstruction Info:\n"; TempStream << "[1]: #uOps\n[2]: Latency\n[3]: RThroughput\n" - << "[4]: MayLoad\n[5]: MayStore\n[6]: HasSideEffects (U)\n\n"; + << "[4]: MayLoad\n[5]: MayStore\n[6]: HasSideEffects (U)\n"; + if (PrintEncodings) { + TempStream << "[7]: Encoding Size\n"; + TempStream << "\n[1] [2] [3] [4] [5] [6] [7] " + << "Encodings: Instructions:\n"; + } else { + TempStream << "\n[1] [2] [3] [4] [5] [6] Instructions:\n"; + } - TempStream << "[1] [2] [3] [4] [5] [6] Instructions:\n"; - for (const MCInst &Inst : Source) { + for (unsigned I = 0, E = Source.size(); I < E; ++I) { + const MCInst &Inst = Source[I]; const MCInstrDesc &MCDesc = MCII.get(Inst.getOpcode()); // Obtain the scheduling class information from the instruction. @@ -72,7 +80,20 @@ void InstructionInfoView::printView(raw_ostream &OS) const { } TempStream << (MCDesc.mayLoad() ? " * " : " "); TempStream << (MCDesc.mayStore() ? " * " : " "); - TempStream << (MCDesc.hasUnmodeledSideEffects() ? " U " : " "); + TempStream << (MCDesc.hasUnmodeledSideEffects() ? " U " : " "); + + if (PrintEncodings) { + StringRef Encoding(CE.getEncoding(I)); + unsigned EncodingSize = Encoding.size(); + TempStream << " " << EncodingSize + << (EncodingSize < 10 ? " " : " "); + TempStream.flush(); + formatted_raw_ostream FOS(TempStream); + for (unsigned i = 0, e = Encoding.size(); i != e; ++i) + FOS << format("%02x ", (uint8_t)Encoding[i]); + FOS.PadToColumn(30); + FOS.flush(); + } MCIP.printInst(&Inst, InstrStream, "", STI); InstrStream.flush(); @@ -80,7 +101,7 @@ void InstructionInfoView::printView(raw_ostream &OS) const { // Consume any tabs or spaces at the beginning of the string. 
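The encoding column added to InstructionInfoView boils down to hex-formatting each byte and padding before the mnemonic. A reduced sketch using the same libSupport facilities (printEncodingColumn is hypothetical; the column width of 30 matches the view above):

    #include "llvm/ADT/StringRef.h"
    #include "llvm/Support/Format.h"
    #include "llvm/Support/FormattedStream.h"

    // Print every byte of an instruction encoding as two hex digits, then pad
    // the column so the disassembly text starts at a fixed offset.
    static void printEncodingColumn(llvm::formatted_raw_ostream &FOS,
                                    llvm::StringRef Encoding) {
      for (unsigned I = 0, E = Encoding.size(); I != E; ++I)
        FOS << llvm::format("%02x ", (uint8_t)Encoding[I]);
      FOS.PadToColumn(30);
    }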
StringRef Str(Instruction); Str = Str.ltrim(); - TempStream << " " << Str << '\n'; + TempStream << Str << '\n'; Instruction = ""; } diff --git a/tools/llvm-mca/Views/InstructionInfoView.h b/tools/llvm-mca/Views/InstructionInfoView.h index 640d87383436..0e948304119f 100644 --- a/tools/llvm-mca/Views/InstructionInfoView.h +++ b/tools/llvm-mca/Views/InstructionInfoView.h @@ -40,6 +40,7 @@ #include "llvm/MC/MCInstPrinter.h" #include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/MCA/CodeEmitter.h" #include "llvm/Support/raw_ostream.h" #define DEBUG_TYPE "llvm-mca" @@ -51,14 +52,18 @@ namespace mca { class InstructionInfoView : public View { const llvm::MCSubtargetInfo &STI; const llvm::MCInstrInfo &MCII; + CodeEmitter &CE; + bool PrintEncodings; llvm::ArrayRef Source; llvm::MCInstPrinter &MCIP; public: - InstructionInfoView(const llvm::MCSubtargetInfo &sti, - const llvm::MCInstrInfo &mcii, - llvm::ArrayRef S, llvm::MCInstPrinter &IP) - : STI(sti), MCII(mcii), Source(S), MCIP(IP) {} + InstructionInfoView(const llvm::MCSubtargetInfo &ST, + const llvm::MCInstrInfo &II, CodeEmitter &C, + bool ShouldPrintEncodings, llvm::ArrayRef S, + llvm::MCInstPrinter &IP) + : STI(ST), MCII(II), CE(C), PrintEncodings(ShouldPrintEncodings), + Source(S), MCIP(IP) {} void printView(llvm::raw_ostream &OS) const override; }; diff --git a/tools/llvm-mca/Views/TimelineView.cpp b/tools/llvm-mca/Views/TimelineView.cpp index fe3f16ba344c..1e7caa297ac6 100644 --- a/tools/llvm-mca/Views/TimelineView.cpp +++ b/tools/llvm-mca/Views/TimelineView.cpp @@ -12,6 +12,7 @@ //===----------------------------------------------------------------------===// #include "Views/TimelineView.h" +#include namespace llvm { namespace mca { @@ -132,25 +133,38 @@ void TimelineView::printWaitTimeEntry(formatted_raw_ostream &OS, const WaitTimeEntry &Entry, unsigned SourceIndex, unsigned Executions) const { - OS << SourceIndex << '.'; + bool PrintingTotals = SourceIndex == Source.size(); + unsigned CumulativeExecutions = PrintingTotals ? Timeline.size() : Executions; + + if (!PrintingTotals) + OS << SourceIndex << '.'; + OS.PadToColumn(7); double AverageTime1, AverageTime2, AverageTime3; - AverageTime1 = (double)Entry.CyclesSpentInSchedulerQueue / Executions; - AverageTime2 = (double)Entry.CyclesSpentInSQWhileReady / Executions; - AverageTime3 = (double)Entry.CyclesSpentAfterWBAndBeforeRetire / Executions; + AverageTime1 = + (double)Entry.CyclesSpentInSchedulerQueue / CumulativeExecutions; + AverageTime2 = (double)Entry.CyclesSpentInSQWhileReady / CumulativeExecutions; + AverageTime3 = + (double)Entry.CyclesSpentAfterWBAndBeforeRetire / CumulativeExecutions; OS << Executions; OS.PadToColumn(13); - int BufferSize = UsedBuffer[SourceIndex].second; - tryChangeColor(OS, Entry.CyclesSpentInSchedulerQueue, Executions, BufferSize); + + int BufferSize = PrintingTotals ? 
0 : UsedBuffer[SourceIndex].second; + if (!PrintingTotals) + tryChangeColor(OS, Entry.CyclesSpentInSchedulerQueue, CumulativeExecutions, + BufferSize); OS << format("%.1f", floor((AverageTime1 * 10) + 0.5) / 10); OS.PadToColumn(20); - tryChangeColor(OS, Entry.CyclesSpentInSQWhileReady, Executions, BufferSize); + if (!PrintingTotals) + tryChangeColor(OS, Entry.CyclesSpentInSQWhileReady, CumulativeExecutions, + BufferSize); OS << format("%.1f", floor((AverageTime2 * 10) + 0.5) / 10); OS.PadToColumn(27); - tryChangeColor(OS, Entry.CyclesSpentAfterWBAndBeforeRetire, Executions, - STI.getSchedModel().MicroOpBufferSize); + if (!PrintingTotals) + tryChangeColor(OS, Entry.CyclesSpentAfterWBAndBeforeRetire, + CumulativeExecutions, STI.getSchedModel().MicroOpBufferSize); OS << format("%.1f", floor((AverageTime3 * 10) + 0.5) / 10); if (OS.has_colors()) @@ -190,6 +204,24 @@ void TimelineView::printAverageWaitTimes(raw_ostream &OS) const { ++IID; } + + // If the timeline contains more than one instruction, + // let's also print global averages. + if (Source.size() != 1) { + WaitTimeEntry TotalWaitTime = std::accumulate( + WaitTime.begin(), WaitTime.end(), WaitTimeEntry{0, 0, 0}, + [](const WaitTimeEntry &A, const WaitTimeEntry &B) { + return WaitTimeEntry{ + A.CyclesSpentInSchedulerQueue + B.CyclesSpentInSchedulerQueue, + A.CyclesSpentInSQWhileReady + B.CyclesSpentInSQWhileReady, + A.CyclesSpentAfterWBAndBeforeRetire + + B.CyclesSpentAfterWBAndBeforeRetire}; + }); + printWaitTimeEntry(FOS, TotalWaitTime, IID, Executions); + FOS << " " + << "" << '\n'; + InstrStream.flush(); + } } void TimelineView::printTimelineViewEntry(formatted_raw_ostream &OS, diff --git a/tools/llvm-mca/Views/TimelineView.h b/tools/llvm-mca/Views/TimelineView.h index b63b234293cd..9bec3b87db45 100644 --- a/tools/llvm-mca/Views/TimelineView.h +++ b/tools/llvm-mca/Views/TimelineView.h @@ -84,6 +84,7 @@ /// 3. 2 1.5 0.5 1.0 vaddss %xmm1, %xmm0, %xmm3 /// 4. 2 3.5 0.0 0.0 vaddss %xmm3, %xmm2, %xmm4 /// 5. 2 6.5 0.0 0.0 vaddss %xmm4, %xmm5, %xmm6 +/// 2 2.4 0.6 1.6 /// /// By comparing column [2] with column [1], we get an idea about how many /// cycles were spent in the scheduler's queue due to data dependencies. 
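The new global-average row in the timeline view is an element-wise sum over the per-instruction wait times, divided by the number of timeline entries and rounded to one decimal when printed. A simplified, self-contained sketch (the Waits struct is a stand-in for WaitTimeEntry, not part of the patch):

    #include <cmath>
    #include <numeric>
    #include <vector>

    struct Waits { unsigned InQueue, ReadyInQueue, AfterWB; };

    // Accumulate the per-instruction wait times, as the totals row does.
    static Waits totalWaits(const std::vector<Waits> &PerInstr) {
      return std::accumulate(PerInstr.begin(), PerInstr.end(), Waits{0, 0, 0},
                             [](const Waits &A, const Waits &B) {
                               return Waits{A.InQueue + B.InQueue,
                                            A.ReadyInQueue + B.ReadyInQueue,
                                            A.AfterWB + B.AfterWB};
                             });
    }

    // One-decimal rounding applied when the averages are printed.
    static double roundOneDecimal(double X) {
      return std::floor(X * 10 + 0.5) / 10;
    }

Dividing each summed field by the timeline size then yields the "<total>" row shown in the updated TimelineView.h example.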
diff --git a/tools/llvm-mca/llvm-mca.cpp b/tools/llvm-mca/llvm-mca.cpp index b3590b5910ec..99c45eebdd88 100644 --- a/tools/llvm-mca/llvm-mca.cpp +++ b/tools/llvm-mca/llvm-mca.cpp @@ -32,11 +32,17 @@ #include "Views/SchedulerStatistics.h" #include "Views/SummaryView.h" #include "Views/TimelineView.h" +#include "llvm/MC/MCAsmBackend.h" #include "llvm/MC/MCAsmInfo.h" +#include "llvm/MC/MCCodeEmitter.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCObjectFileInfo.h" #include "llvm/MC/MCRegisterInfo.h" +#include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/MC/MCTargetOptionsCommandFlags.inc" +#include "llvm/MCA/CodeEmitter.h" #include "llvm/MCA/Context.h" +#include "llvm/MCA/InstrBuilder.h" #include "llvm/MCA/Pipeline.h" #include "llvm/MCA/Stages/EntryStage.h" #include "llvm/MCA/Stages/InstructionTables.h" @@ -83,11 +89,20 @@ static cl::opt cl::desc("Target a specific cpu type (-mcpu=help for details)"), cl::value_desc("cpu-name"), cl::cat(ToolOptions), cl::init("native")); +static cl::opt + MATTR("mattr", + cl::desc("Additional target features."), + cl::cat(ToolOptions)); + static cl::opt OutputAsmVariant("output-asm-variant", cl::desc("Syntax variant to use for output printing"), cl::cat(ToolOptions), cl::init(-1)); +static cl::opt + PrintImmHex("print-imm-hex", cl::cat(ToolOptions), cl::init(false), + cl::desc("Prefer hex format when printing immediate values")); + static cl::opt Iterations("iterations", cl::desc("Number of iterations to run"), cl::cat(ToolOptions), cl::init(0)); @@ -193,6 +208,11 @@ static cl::opt EnableBottleneckAnalysis( cl::desc("Enable bottleneck analysis (disabled by default)"), cl::cat(ViewOptions), cl::init(false)); +static cl::opt ShowEncoding( + "show-encoding", + cl::desc("Print encoding information in the instruction info view"), + cl::cat(ViewOptions), cl::init(false)); + namespace { const Target *getTarget(const char *ProgName) { @@ -218,7 +238,7 @@ ErrorOr> getOutputStream() { OutputFilename = "-"; std::error_code EC; auto Out = - llvm::make_unique(OutputFilename, EC, sys::fs::F_None); + std::make_unique(OutputFilename, EC, sys::fs::OF_Text); if (!EC) return std::move(Out); return EC; @@ -303,33 +323,11 @@ int main(int argc, char **argv) { // Apply overrides to llvm-mca specific options. processViewOptions(); - SourceMgr SrcMgr; - - // Tell SrcMgr about this buffer, which is what the parser will pick up. 
- SrcMgr.AddNewSourceBuffer(std::move(*BufferPtr), SMLoc()); - - std::unique_ptr MRI(TheTarget->createMCRegInfo(TripleName)); - assert(MRI && "Unable to create target register info!"); - - std::unique_ptr MAI(TheTarget->createMCAsmInfo(*MRI, TripleName)); - assert(MAI && "Unable to create target asm info!"); - - MCObjectFileInfo MOFI; - MCContext Ctx(MAI.get(), MRI.get(), &MOFI, &SrcMgr); - MOFI.InitMCObjectFileInfo(TheTriple, /* PIC= */ false, Ctx); - - std::unique_ptr BOS; - - std::unique_ptr MCII(TheTarget->createMCInstrInfo()); - - std::unique_ptr MCIA( - TheTarget->createMCInstrAnalysis(MCII.get())); - if (!MCPU.compare("native")) MCPU = llvm::sys::getHostCPUName(); std::unique_ptr STI( - TheTarget->createMCSubtargetInfo(TripleName, MCPU, /* FeaturesStr */ "")); + TheTarget->createMCSubtargetInfo(TripleName, MCPU, MATTR)); if (!STI->isCPUStringValid(MCPU)) return 1; @@ -352,6 +350,29 @@ int main(int argc, char **argv) { return 1; } + std::unique_ptr MRI(TheTarget->createMCRegInfo(TripleName)); + assert(MRI && "Unable to create target register info!"); + + std::unique_ptr MAI(TheTarget->createMCAsmInfo(*MRI, TripleName)); + assert(MAI && "Unable to create target asm info!"); + + MCObjectFileInfo MOFI; + SourceMgr SrcMgr; + + // Tell SrcMgr about this buffer, which is what the parser will pick up. + SrcMgr.AddNewSourceBuffer(std::move(*BufferPtr), SMLoc()); + + MCContext Ctx(MAI.get(), MRI.get(), &MOFI, &SrcMgr); + + MOFI.InitMCObjectFileInfo(TheTriple, /* PIC= */ false, Ctx); + + std::unique_ptr BOS; + + std::unique_ptr MCII(TheTarget->createMCInstrInfo()); + + std::unique_ptr MCIA( + TheTarget->createMCInstrAnalysis(MCII.get())); + // Parse the input and create CodeRegions that llvm-mca can analyze. mca::AsmCodeRegionGenerator CRG(*TheTarget, SrcMgr, Ctx, *MAI, *STI, *MCII); Expected RegionsOrErr = CRG.parseCodeRegions(); @@ -396,6 +417,9 @@ int main(int argc, char **argv) { return 1; } + // Set the display preference for hex vs. decimal immediates. + IP->setPrintImmHex(PrintImmHex); + std::unique_ptr TOF = std::move(*OF); const MCSchedModel &SM = STI->getSchedModel(); @@ -413,6 +437,12 @@ int main(int argc, char **argv) { // Number each region in the sequence. unsigned RegionIdx = 0; + std::unique_ptr MCE( + TheTarget->createMCCodeEmitter(*MCII, *MRI, Ctx)); + + std::unique_ptr MAB(TheTarget->createMCAsmBackend( + *STI, *MRI, InitMCTargetOptionsFromFlags())); + for (const std::unique_ptr &Region : Regions) { // Skip empty code regions. if (Region->empty()) @@ -430,6 +460,7 @@ int main(int argc, char **argv) { // Lower the MCInst sequence into an mca::Instruction sequence. ArrayRef Insts = Region->getInstructions(); + mca::CodeEmitter CE(*STI, *MAB, *MCE, Insts); std::vector> LoweredSequence; for (const MCInst &MCI : Insts) { Expected> Inst = @@ -459,18 +490,18 @@ int main(int argc, char **argv) { if (PrintInstructionTables) { // Create a pipeline, stages, and a printer. - auto P = llvm::make_unique(); - P->appendStage(llvm::make_unique(S)); - P->appendStage(llvm::make_unique(SM)); + auto P = std::make_unique(); + P->appendStage(std::make_unique(S)); + P->appendStage(std::make_unique(SM)); mca::PipelinePrinter Printer(*P); // Create the views for this pipeline, execute, and emit a report. 
if (PrintInstructionInfoView) { - Printer.addView(llvm::make_unique( - *STI, *MCII, Insts, *IP)); + Printer.addView(std::make_unique( + *STI, *MCII, CE, ShowEncoding, Insts, *IP)); } Printer.addView( - llvm::make_unique(*STI, *IP, Insts)); + std::make_unique(*STI, *IP, Insts)); if (!runPipeline(*P)) return 1; @@ -480,42 +511,42 @@ int main(int argc, char **argv) { } // Create a basic pipeline simulating an out-of-order backend. - auto P = MCA.createDefaultPipeline(PO, IB, S); + auto P = MCA.createDefaultPipeline(PO, S); mca::PipelinePrinter Printer(*P); if (PrintSummaryView) Printer.addView( - llvm::make_unique(SM, Insts, DispatchWidth)); + std::make_unique(SM, Insts, DispatchWidth)); if (EnableBottleneckAnalysis) { - Printer.addView(llvm::make_unique( + Printer.addView(std::make_unique( *STI, *IP, Insts, S.getNumIterations())); } if (PrintInstructionInfoView) - Printer.addView( - llvm::make_unique(*STI, *MCII, Insts, *IP)); + Printer.addView(std::make_unique( + *STI, *MCII, CE, ShowEncoding, Insts, *IP)); if (PrintDispatchStats) - Printer.addView(llvm::make_unique()); + Printer.addView(std::make_unique()); if (PrintSchedulerStats) - Printer.addView(llvm::make_unique(*STI)); + Printer.addView(std::make_unique(*STI)); if (PrintRetireStats) - Printer.addView(llvm::make_unique(SM)); + Printer.addView(std::make_unique(SM)); if (PrintRegisterFileStats) - Printer.addView(llvm::make_unique(*STI)); + Printer.addView(std::make_unique(*STI)); if (PrintResourcePressureView) Printer.addView( - llvm::make_unique(*STI, *IP, Insts)); + std::make_unique(*STI, *IP, Insts)); if (PrintTimelineView) { unsigned TimelineIterations = TimelineMaxIterations ? TimelineMaxIterations : 10; - Printer.addView(llvm::make_unique( + Printer.addView(std::make_unique( *STI, *IP, Insts, std::min(TimelineIterations, S.getNumIterations()), TimelineMaxCycles)); } diff --git a/tools/llvm-modextract/llvm-modextract.cpp b/tools/llvm-modextract/llvm-modextract.cpp index 3adefc5f0d3e..7c4099625842 100644 --- a/tools/llvm-modextract/llvm-modextract.cpp +++ b/tools/llvm-modextract/llvm-modextract.cpp @@ -54,7 +54,7 @@ int main(int argc, char **argv) { std::error_code EC; std::unique_ptr Out( - new ToolOutputFile(OutputFilename, EC, sys::fs::F_None)); + new ToolOutputFile(OutputFilename, EC, sys::fs::OF_None)); ExitOnErr(errorCodeToError(EC)); if (BinaryExtract) { diff --git a/tools/llvm-nm/llvm-nm.cpp b/tools/llvm-nm/llvm-nm.cpp index aa62e6f0209b..ee55722dc139 100644 --- a/tools/llvm-nm/llvm-nm.cpp +++ b/tools/llvm-nm/llvm-nm.cpp @@ -711,17 +711,21 @@ static void sortAndPrintSymbolList(SymbolicFile &Obj, bool printName, const std::string &ArchiveName, const std::string &ArchitectureName) { if (!NoSort) { - std::function Cmp; + using Comparator = bool (*)(const NMSymbol &, const NMSymbol &); + Comparator Cmp; if (NumericSort) - Cmp = compareSymbolAddress; + Cmp = &compareSymbolAddress; else if (SizeSort) - Cmp = compareSymbolSize; + Cmp = &compareSymbolSize; else - Cmp = compareSymbolName; + Cmp = &compareSymbolName; if (ReverseSort) - Cmp = [=](const NMSymbol &A, const NMSymbol &B) { return Cmp(B, A); }; - llvm::sort(SymbolList, Cmp); + llvm::sort(SymbolList, [=](const NMSymbol &A, const NMSymbol &B) -> bool { + return Cmp(B, A); + }); + else + llvm::sort(SymbolList, Cmp); } if (!PrintFileName) { @@ -913,10 +917,12 @@ static char getSymbolNMTypeChar(ELFObjectFileBase &Obj, if (Flags & ELF::SHF_ALLOC) return Flags & ELF::SHF_WRITE ? 
'd' : 'r'; - StringRef SecName; - if (SecI->getName(SecName)) + auto NameOrErr = SecI->getName(); + if (!NameOrErr) { + consumeError(NameOrErr.takeError()); return '?'; - if (SecName.startswith(".debug")) + } + if ((*NameOrErr).startswith(".debug")) return 'N'; if (!(Flags & ELF::SHF_WRITE)) return 'n'; @@ -1076,7 +1082,7 @@ static StringRef getNMTypeName(SymbolicFile &Obj, basic_symbol_iterator I) { static char getNMSectionTagAndName(SymbolicFile &Obj, basic_symbol_iterator I, StringRef &SecName) { uint32_t Symflags = I->getFlags(); - if (isa(&Obj)) { + if (ELFObjectFileBase *ELFObj = dyn_cast(&Obj)) { if (Symflags & object::SymbolRef::SF_Absolute) SecName = "*ABS*"; else if (Symflags & object::SymbolRef::SF_Common) @@ -1090,8 +1096,16 @@ static char getNMSectionTagAndName(SymbolicFile &Obj, basic_symbol_iterator I, consumeError(SecIOrErr.takeError()); return '?'; } - elf_section_iterator secT = *SecIOrErr; - secT->getName(SecName); + + if (*SecIOrErr == ELFObj->section_end()) + return '?'; + + Expected NameOrErr = (*SecIOrErr)->getName(); + if (!NameOrErr) { + consumeError(NameOrErr.takeError()); + return '?'; + } + SecName = *NameOrErr; } } @@ -1347,7 +1361,12 @@ dumpSymbolNamesFromObject(SymbolicFile &Obj, bool printName, StringRef SectionName = StringRef(); for (const SectionRef &Section : MachO->sections()) { S.NSect++; - Section.getName(SectionName); + + if (Expected NameOrErr = Section.getName()) + SectionName = *NameOrErr; + else + consumeError(NameOrErr.takeError()); + SegmentName = MachO->getSectionFinalSegmentName( Section.getRawDataRefImpl()); if (S.Address >= Section.getAddress() && @@ -1667,7 +1686,11 @@ dumpSymbolNamesFromObject(SymbolicFile &Obj, bool printName, StringRef SegmentName = StringRef(); StringRef SectionName = StringRef(); for (const SectionRef &Section : MachO->sections()) { - Section.getName(SectionName); + if (Expected NameOrErr = Section.getName()) + SectionName = *NameOrErr; + else + consumeError(NameOrErr.takeError()); + SegmentName = MachO->getSectionFinalSegmentName( Section.getRawDataRefImpl()); F.NSect++; diff --git a/tools/llvm-objcopy/COFF/COFFObjcopy.cpp b/tools/llvm-objcopy/COFF/COFFObjcopy.cpp index 4ae46851a66f..2a8d816e6f3c 100644 --- a/tools/llvm-objcopy/COFF/COFFObjcopy.cpp +++ b/tools/llvm-objcopy/COFF/COFFObjcopy.cpp @@ -16,8 +16,8 @@ #include "llvm/Object/Binary.h" #include "llvm/Object/COFF.h" +#include "llvm/Support/CRC.h" #include "llvm/Support/Errc.h" -#include "llvm/Support/JamCRC.h" #include "llvm/Support/Path.h" #include @@ -40,22 +40,13 @@ static uint64_t getNextRVA(const Object &Obj) { Obj.IsPE ? Obj.PeHeader.SectionAlignment : 1); } -static uint32_t getCRC32(StringRef Data) { - JamCRC CRC; - CRC.update(ArrayRef(Data.data(), Data.size())); - // The CRC32 value needs to be complemented because the JamCRC dosn't - // finalize the CRC32 value. It also dosn't negate the initial CRC32 value - // but it starts by default at 0xFFFFFFFF which is the complement of zero. 
- return ~CRC.getCRC(); -} - static std::vector createGnuDebugLinkSectionContents(StringRef File) { ErrorOr> LinkTargetOrErr = MemoryBuffer::getFile(File); if (!LinkTargetOrErr) error("'" + File + "': " + LinkTargetOrErr.getError().message()); auto LinkTarget = std::move(*LinkTargetOrErr); - uint32_t CRC32 = getCRC32(LinkTarget->getBuffer()); + uint32_t CRC32 = llvm::crc32(arrayRefFromStringRef(LinkTarget->getBuffer())); StringRef FileName = sys::path::filename(File); size_t CRCPos = alignTo(FileName.size() + 1, 4); @@ -65,26 +56,37 @@ static std::vector createGnuDebugLinkSectionContents(StringRef File) { return Data; } -static void addGnuDebugLink(Object &Obj, StringRef DebugLinkFile) { - uint32_t StartRVA = getNextRVA(Obj); +// Adds named section with given contents to the object. +static void addSection(Object &Obj, StringRef Name, ArrayRef Contents, + uint32_t Characteristics) { + bool NeedVA = Characteristics & (IMAGE_SCN_MEM_EXECUTE | IMAGE_SCN_MEM_READ | + IMAGE_SCN_MEM_WRITE); - std::vector
Sections; Section Sec; - Sec.setOwnedContents(createGnuDebugLinkSectionContents(DebugLinkFile)); - Sec.Name = ".gnu_debuglink"; - Sec.Header.VirtualSize = Sec.getContents().size(); - Sec.Header.VirtualAddress = StartRVA; - Sec.Header.SizeOfRawData = alignTo(Sec.Header.VirtualSize, - Obj.IsPE ? Obj.PeHeader.FileAlignment : 1); + Sec.setOwnedContents(Contents); + Sec.Name = Name; + Sec.Header.VirtualSize = NeedVA ? Sec.getContents().size() : 0u; + Sec.Header.VirtualAddress = NeedVA ? getNextRVA(Obj) : 0u; + Sec.Header.SizeOfRawData = + NeedVA ? alignTo(Sec.Header.VirtualSize, + Obj.IsPE ? Obj.PeHeader.FileAlignment : 1) + : Sec.getContents().size(); // Sec.Header.PointerToRawData is filled in by the writer. Sec.Header.PointerToRelocations = 0; Sec.Header.PointerToLinenumbers = 0; // Sec.Header.NumberOfRelocations is filled in by the writer. Sec.Header.NumberOfLinenumbers = 0; - Sec.Header.Characteristics = IMAGE_SCN_CNT_INITIALIZED_DATA | - IMAGE_SCN_MEM_READ | IMAGE_SCN_MEM_DISCARDABLE; - Sections.push_back(Sec); - Obj.addSections(Sections); + Sec.Header.Characteristics = Characteristics; + + Obj.addSections(Sec); +} + +static void addGnuDebugLink(Object &Obj, StringRef DebugLinkFile) { + std::vector Contents = + createGnuDebugLinkSectionContents(DebugLinkFile); + addSection(Obj, ".gnu_debuglink", Contents, + IMAGE_SCN_CNT_INITIALIZED_DATA | IMAGE_SCN_MEM_READ | + IMAGE_SCN_MEM_DISCARDABLE); } static Error handleArgs(const CopyConfig &Config, Object &Obj) { @@ -92,8 +94,7 @@ static Error handleArgs(const CopyConfig &Config, Object &Obj) { Obj.removeSections([&Config](const Section &Sec) { // Contrary to --only-keep-debug, --only-section fully removes sections that // aren't mentioned. - if (!Config.OnlySection.empty() && - !is_contained(Config.OnlySection, Sec.Name)) + if (!Config.OnlySection.empty() && !Config.OnlySection.matches(Sec.Name)) return true; if (Config.StripDebug || Config.StripAll || Config.StripAllGNU || @@ -103,7 +104,7 @@ static Error handleArgs(const CopyConfig &Config, Object &Obj) { return true; } - if (is_contained(Config.ToRemove, Sec.Name)) + if (Config.ToRemove.matches(Sec.Name)) return true; return false; @@ -137,7 +138,7 @@ static Error handleArgs(const CopyConfig &Config, Object &Obj) { if (Config.StripAll || Config.StripAllGNU) return true; - if (is_contained(Config.SymbolsToRemove, Sym.Name)) { + if (Config.SymbolsToRemove.matches(Sym.Name)) { // Explicitly removing a referenced symbol is an error. 
if (Sym.Referenced) reportError(Config.OutputFilename, @@ -156,7 +157,7 @@ static Error handleArgs(const CopyConfig &Config, Object &Obj) { if (Sym.Sym.StorageClass == IMAGE_SYM_CLASS_STATIC || Sym.Sym.SectionNumber == 0) if (Config.StripUnneeded || - is_contained(Config.UnneededSymbolsToRemove, Sym.Name)) + Config.UnneededSymbolsToRemove.matches(Sym.Name)) return true; // GNU objcopy keeps referenced local symbols and external symbols @@ -171,21 +172,38 @@ static Error handleArgs(const CopyConfig &Config, Object &Obj) { return false; }); + for (const auto &Flag : Config.AddSection) { + StringRef SecName, FileName; + std::tie(SecName, FileName) = Flag.split("="); + + auto BufOrErr = MemoryBuffer::getFile(FileName); + if (!BufOrErr) + return createFileError(FileName, errorCodeToError(BufOrErr.getError())); + auto Buf = std::move(*BufOrErr); + + addSection( + Obj, SecName, + makeArrayRef(reinterpret_cast(Buf->getBufferStart()), + Buf->getBufferSize()), + IMAGE_SCN_CNT_INITIALIZED_DATA | IMAGE_SCN_ALIGN_1BYTES); + } + if (!Config.AddGnuDebugLink.empty()) addGnuDebugLink(Obj, Config.AddGnuDebugLink); if (Config.AllowBrokenLinks || !Config.BuildIdLinkDir.empty() || Config.BuildIdLinkInput || Config.BuildIdLinkOutput || !Config.SplitDWO.empty() || !Config.SymbolsPrefix.empty() || - !Config.AllocSectionsPrefix.empty() || !Config.AddSection.empty() || - !Config.DumpSection.empty() || !Config.KeepSection.empty() || + !Config.AllocSectionsPrefix.empty() || !Config.DumpSection.empty() || + !Config.KeepSection.empty() || Config.NewSymbolVisibility || !Config.SymbolsToGlobalize.empty() || !Config.SymbolsToKeep.empty() || !Config.SymbolsToLocalize.empty() || !Config.SymbolsToWeaken.empty() || !Config.SymbolsToKeepGlobal.empty() || !Config.SectionsToRename.empty() || - !Config.SetSectionFlags.empty() || !Config.SymbolsToRename.empty() || - Config.ExtractDWO || Config.KeepFileSymbols || Config.LocalizeHidden || - Config.PreserveDates || Config.StripDWO || Config.StripNonAlloc || - Config.StripSections || Config.Weaken || Config.DecompressDebugSections || + !Config.SetSectionAlignment.empty() || !Config.SetSectionFlags.empty() || + !Config.SymbolsToRename.empty() || Config.ExtractDWO || + Config.KeepFileSymbols || Config.LocalizeHidden || Config.PreserveDates || + Config.StripDWO || Config.StripNonAlloc || Config.StripSections || + Config.Weaken || Config.DecompressDebugSections || Config.DiscardMode == DiscardType::Locals || !Config.SymbolsToAdd.empty() || Config.EntryExpr) { return createStringError(llvm::errc::invalid_argument, diff --git a/tools/llvm-objcopy/COFF/Reader.cpp b/tools/llvm-objcopy/COFF/Reader.cpp index 1f0ec9fa9691..2fcec0057c03 100644 --- a/tools/llvm-objcopy/COFF/Reader.cpp +++ b/tools/llvm-objcopy/COFF/Reader.cpp @@ -36,14 +36,9 @@ Error COFFReader::readExecutableHeaders(Object &Obj) const { DH->AddressOfNewExeHeader - sizeof(*DH)); if (COFFObj.is64()) { - const pe32plus_header *PE32Plus = nullptr; - if (auto EC = COFFObj.getPE32PlusHeader(PE32Plus)) - return errorCodeToError(EC); - Obj.PeHeader = *PE32Plus; + Obj.PeHeader = *COFFObj.getPE32PlusHeader(); } else { - const pe32_header *PE32 = nullptr; - if (auto EC = COFFObj.getPE32Header(PE32)) - return errorCodeToError(EC); + const pe32_header *PE32 = COFFObj.getPE32Header(); copyPeHeader(Obj.PeHeader, *PE32); // The pe32plus_header (stored in Object) lacks the BaseOfData field. 
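The Reader.cpp hunk above relies on getPE32Header() and getPE32PlusHeader() now returning the header pointer directly instead of filling an out-parameter and returning a std::error_code. A hedged sketch of using those accessors (the helper itself is illustrative, not from the patch):

#include "llvm/Object/COFF.h"

// Image base of a PE/COFF image, or 0 if there is no optional header.
static uint64_t getImageBase(const llvm::object::COFFObjectFile &Obj) {
  if (const llvm::object::pe32plus_header *PE32Plus = Obj.getPE32PlusHeader())
    return PE32Plus->ImageBase;
  if (const llvm::object::pe32_header *PE32 = Obj.getPE32Header())
    return PE32->ImageBase;
  return 0;
}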
Obj.BaseOfData = PE32->BaseOfData; @@ -196,16 +191,13 @@ Error COFFReader::setSymbolTargets(Object &Obj) const { } Expected> COFFReader::create() const { - auto Obj = llvm::make_unique(); + auto Obj = std::make_unique(); - const coff_file_header *CFH = nullptr; - const coff_bigobj_file_header *CBFH = nullptr; - COFFObj.getCOFFHeader(CFH); - COFFObj.getCOFFBigObjHeader(CBFH); bool IsBigObj = false; - if (CFH) { + if (const coff_file_header *CFH = COFFObj.getCOFFHeader()) { Obj->CoffFileHeader = *CFH; } else { + const coff_bigobj_file_header *CBFH = COFFObj.getCOFFBigObjHeader(); if (!CBFH) return createStringError(object_error::parse_failed, "no COFF file header returned"); diff --git a/tools/llvm-objcopy/COFF/Writer.cpp b/tools/llvm-objcopy/COFF/Writer.cpp index f3bb1ce331f2..6db37435fd96 100644 --- a/tools/llvm-objcopy/COFF/Writer.cpp +++ b/tools/llvm-objcopy/COFF/Writer.cpp @@ -120,12 +120,12 @@ size_t COFFWriter::finalizeStringTable() { StrTabBuilder.finalize(); for (auto &S : Obj.getMutableSections()) { + memset(S.Header.Name, 0, sizeof(S.Header.Name)); if (S.Name.size() > COFF::NameSize) { - memset(S.Header.Name, 0, sizeof(S.Header.Name)); snprintf(S.Header.Name, sizeof(S.Header.Name), "/%d", (int)StrTabBuilder.getOffset(S.Name)); } else { - strncpy(S.Header.Name, S.Name.data(), COFF::NameSize); + memcpy(S.Header.Name, S.Name.data(), S.Name.size()); } } for (auto &S : Obj.getMutableSymbols()) { diff --git a/tools/llvm-objcopy/CommonOpts.td b/tools/llvm-objcopy/CommonOpts.td new file mode 100644 index 000000000000..e8c092b44431 --- /dev/null +++ b/tools/llvm-objcopy/CommonOpts.td @@ -0,0 +1,123 @@ +include "llvm/Option/OptParser.td" + +multiclass Eq { + def NAME : Separate<["--"], name>; + def NAME #_eq : Joined<["--"], name #"=">, + Alias(NAME)>, + HelpText; +} + +def help : Flag<["--"], "help">; +def h : Flag<["-"], "h">, Alias; + +def allow_broken_links + : Flag<["--"], "allow-broken-links">, + HelpText<"Allow the tool to remove sections even if it would leave " + "invalid section references. The appropriate sh_link fields " + "will be set to zero.">; + +def enable_deterministic_archives + : Flag<["--"], "enable-deterministic-archives">, + HelpText<"Enable deterministic mode when operating on archives (use " + "zero for UIDs, GIDs, and timestamps).">; +def D : Flag<["-"], "D">, + Alias, + HelpText<"Alias for --enable-deterministic-archives">; + +def disable_deterministic_archives + : Flag<["--"], "disable-deterministic-archives">, + HelpText<"Disable deterministic mode when operating on archives (use " + "real values for UIDs, GIDs, and timestamps).">; +def U : Flag<["-"], "U">, + Alias, + HelpText<"Alias for --disable-deterministic-archives">; + +def preserve_dates : Flag<["--"], "preserve-dates">, + HelpText<"Preserve access and modification timestamps">; +def p : Flag<["-"], "p">, + Alias, + HelpText<"Alias for --preserve-dates">; + +def strip_all : Flag<["--"], "strip-all">, + HelpText<"Remove non-allocated sections outside segments. " + ".gnu.warning* sections are not removed">; + +def strip_all_gnu + : Flag<["--"], "strip-all-gnu">, + HelpText<"Compatible with GNU's --strip-all">; + +def strip_debug : Flag<["--"], "strip-debug">, + HelpText<"Remove all debug sections">; +def g : Flag<["-"], "g">, + Alias, + HelpText<"Alias for --strip-debug">; + +def strip_unneeded : Flag<["--"], "strip-unneeded">, + HelpText<"Remove all symbols not needed by relocations">; + +defm remove_section : Eq<"remove-section", "Remove
">, + MetaVarName<"section">; +def R : JoinedOrSeparate<["-"], "R">, + Alias, + HelpText<"Alias for --remove-section">; + +def strip_sections + : Flag<["--"], "strip-sections">, + HelpText<"Remove all section headers and all sections not in segments">; + +defm strip_symbol : Eq<"strip-symbol", "Strip ">, + MetaVarName<"symbol">; +def N : JoinedOrSeparate<["-"], "N">, + Alias, + HelpText<"Alias for --strip-symbol">; + +defm keep_section : Eq<"keep-section", "Keep
">, + MetaVarName<"section">; + +defm keep_symbol : Eq<"keep-symbol", "Do not remove symbol ">, + MetaVarName<"symbol">; +def K : JoinedOrSeparate<["-"], "K">, + Alias, + HelpText<"Alias for --keep-symbol">; + +def keep_file_symbols : Flag<["--"], "keep-file-symbols">, + HelpText<"Do not remove file symbols">; + +def only_keep_debug + : Flag<["--"], "only-keep-debug">, + HelpText<"Clear sections that would not be stripped by --strip-debug. " + "Currently only implemented for COFF.">; + +def discard_locals : Flag<["--"], "discard-locals">, + HelpText<"Remove compiler-generated local symbols, (e.g. " + "symbols starting with .L)">; +def X : Flag<["-"], "X">, + Alias, + HelpText<"Alias for --discard-locals">; + +def discard_all + : Flag<["--"], "discard-all">, + HelpText<"Remove all local symbols except file and section symbols">; +def x : Flag<["-"], "x">, + Alias, + HelpText<"Alias for --discard-all">; + +def regex + : Flag<["--"], "regex">, + HelpText<"Permit regular expressions in name comparison">; + +def version : Flag<["--"], "version">, + HelpText<"Print the version and exit.">; +def V : Flag<["-"], "V">, + Alias, + HelpText<"Alias for --version">; + +def wildcard + : Flag<["--"], "wildcard">, + HelpText<"Allow wildcard syntax for symbol-related flags. Incompatible " + "with --regex. Allows using '*' to match any number of " + "characters, '?' to match any single character, '\' to escape " + "special characters, and '[]' to define character classes. " + "Wildcards beginning with '!' will prevent a match, for example " + "\"-N '*' -N '!x'\" will strip all symbols except for \"x\".">; +def w : Flag<["-"], "w">, Alias, HelpText<"Alias for --wildcard">; diff --git a/tools/llvm-objcopy/CopyConfig.cpp b/tools/llvm-objcopy/CopyConfig.cpp index 8d6431b3044f..d707bec20c49 100644 --- a/tools/llvm-objcopy/CopyConfig.cpp +++ b/tools/llvm-objcopy/CopyConfig.cpp @@ -14,10 +14,10 @@ #include "llvm/ADT/StringSet.h" #include "llvm/Option/Arg.h" #include "llvm/Option/ArgList.h" +#include "llvm/Support/CRC.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Compression.h" #include "llvm/Support/Errc.h" -#include "llvm/Support/JamCRC.h" #include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/StringSaver.h" #include @@ -155,6 +155,25 @@ static Expected parseRenameSectionValue(StringRef FlagValue) { return SR; } +static Expected> +parseSetSectionAlignment(StringRef FlagValue) { + if (!FlagValue.contains('=')) + return createStringError( + errc::invalid_argument, + "bad format for --set-section-alignment: missing '='"); + auto Split = StringRef(FlagValue).split('='); + if (Split.first.empty()) + return createStringError( + errc::invalid_argument, + "bad format for --set-section-alignment: missing section name"); + uint64_t NewAlign; + if (Split.second.getAsInteger(0, NewAlign)) + return createStringError(errc::invalid_argument, + "invalid alignment for --set-section-alignment: '%s'", + Split.second.str().c_str()); + return std::make_pair(Split.first, NewAlign); +} + static Expected parseSetSectionFlagValue(StringRef FlagValue) { if (!StringRef(FlagValue).contains('=')) @@ -177,106 +196,6 @@ parseSetSectionFlagValue(StringRef FlagValue) { return SFU; } -static Expected parseNewSymbolInfo(StringRef FlagValue) { - // Parse value given with --add-symbol option and create the - // new symbol if possible. The value format for --add-symbol is: - // - // =[
<section>:]<value>[,<flags>] - // - // where: - // <name> - symbol name, can be empty string - // <section>
- optional section name. If not given ABS symbol is created - // - symbol value, can be decimal or hexadecimal number prefixed - // with 0x. - // - optional flags affecting symbol type, binding or visibility: - // The following are currently supported: - // - // global, local, weak, default, hidden, file, section, object, - // indirect-function. - // - // The following flags are ignored and provided for GNU - // compatibility only: - // - // warning, debug, constructor, indirect, synthetic, - // unique-object, before=. - NewSymbolInfo SI; - StringRef Value; - std::tie(SI.SymbolName, Value) = FlagValue.split('='); - if (Value.empty()) - return createStringError( - errc::invalid_argument, - "bad format for --add-symbol, missing '=' after '%s'", - SI.SymbolName.str().c_str()); - - if (Value.contains(':')) { - std::tie(SI.SectionName, Value) = Value.split(':'); - if (SI.SectionName.empty() || Value.empty()) - return createStringError( - errc::invalid_argument, - "bad format for --add-symbol, missing section name or symbol value"); - } - - SmallVector Flags; - Value.split(Flags, ','); - if (Flags[0].getAsInteger(0, SI.Value)) - return createStringError(errc::invalid_argument, "bad symbol value: '%s'", - Flags[0].str().c_str()); - - using Functor = std::function; - SmallVector UnsupportedFlags; - for (size_t I = 1, NumFlags = Flags.size(); I < NumFlags; ++I) - static_cast( - StringSwitch(Flags[I]) - .CaseLower("global", [&SI] { SI.Bind = ELF::STB_GLOBAL; }) - .CaseLower("local", [&SI] { SI.Bind = ELF::STB_LOCAL; }) - .CaseLower("weak", [&SI] { SI.Bind = ELF::STB_WEAK; }) - .CaseLower("default", [&SI] { SI.Visibility = ELF::STV_DEFAULT; }) - .CaseLower("hidden", [&SI] { SI.Visibility = ELF::STV_HIDDEN; }) - .CaseLower("file", [&SI] { SI.Type = ELF::STT_FILE; }) - .CaseLower("section", [&SI] { SI.Type = ELF::STT_SECTION; }) - .CaseLower("object", [&SI] { SI.Type = ELF::STT_OBJECT; }) - .CaseLower("function", [&SI] { SI.Type = ELF::STT_FUNC; }) - .CaseLower("indirect-function", - [&SI] { SI.Type = ELF::STT_GNU_IFUNC; }) - .CaseLower("debug", [] {}) - .CaseLower("constructor", [] {}) - .CaseLower("warning", [] {}) - .CaseLower("indirect", [] {}) - .CaseLower("synthetic", [] {}) - .CaseLower("unique-object", [] {}) - .StartsWithLower("before", [] {}) - .Default([&] { UnsupportedFlags.push_back(Flags[I]); }))(); - if (!UnsupportedFlags.empty()) - return createStringError(errc::invalid_argument, - "unsupported flag%s for --add-symbol: '%s'", - UnsupportedFlags.size() > 1 ? 
"s" : "", - join(UnsupportedFlags, "', '").c_str()); - return SI; -} - -static const StringMap ArchMap{ - // Name, {EMachine, 64bit, LittleEndian} - {"aarch64", {ELF::EM_AARCH64, true, true}}, - {"arm", {ELF::EM_ARM, false, true}}, - {"i386", {ELF::EM_386, false, true}}, - {"i386:x86-64", {ELF::EM_X86_64, true, true}}, - {"mips", {ELF::EM_MIPS, false, false}}, - {"powerpc:common64", {ELF::EM_PPC64, true, true}}, - {"riscv:rv32", {ELF::EM_RISCV, false, true}}, - {"riscv:rv64", {ELF::EM_RISCV, true, true}}, - {"sparc", {ELF::EM_SPARC, false, false}}, - {"sparcel", {ELF::EM_SPARC, false, true}}, - {"x86-64", {ELF::EM_X86_64, true, true}}, -}; - -static Expected getMachineInfo(StringRef Arch) { - auto Iter = ArchMap.find(Arch); - if (Iter == std::end(ArchMap)) - return createStringError(errc::invalid_argument, - "invalid architecture: '%s'", Arch.str().c_str()); - return Iter->getValue(); -} - struct TargetInfo { FileFormat Format; MachineInfo Machine; @@ -341,9 +260,10 @@ getOutputTargetInfoByTargetName(StringRef TargetName) { return {TargetInfo{Format, MI}}; } -static Error addSymbolsFromFile(std::vector &Symbols, - BumpPtrAllocator &Alloc, StringRef Filename, - bool UseRegex) { +static Error +addSymbolsFromFile(NameMatcher &Symbols, BumpPtrAllocator &Alloc, + StringRef Filename, MatchStyle MS, + llvm::function_ref ErrorCallback) { StringSaver Saver(Alloc); SmallVector Lines; auto BufOrErr = MemoryBuffer::getFile(Filename); @@ -356,21 +276,47 @@ static Error addSymbolsFromFile(std::vector &Symbols, // it's not empty. auto TrimmedLine = Line.split('#').first.trim(); if (!TrimmedLine.empty()) - Symbols.emplace_back(Saver.save(TrimmedLine), UseRegex); + if (Error E = Symbols.addMatcher(NameOrPattern::create( + Saver.save(TrimmedLine), MS, ErrorCallback))) + return E; } return Error::success(); } -NameOrRegex::NameOrRegex(StringRef Pattern, bool IsRegex) { - if (!IsRegex) { - Name = Pattern; - return; - } +Expected +NameOrPattern::create(StringRef Pattern, MatchStyle MS, + llvm::function_ref ErrorCallback) { + switch (MS) { + case MatchStyle::Literal: + return NameOrPattern(Pattern); + case MatchStyle::Wildcard: { + SmallVector Data; + bool IsPositiveMatch = true; + if (Pattern[0] == '!') { + IsPositiveMatch = false; + Pattern = Pattern.drop_front(); + } + Expected GlobOrErr = GlobPattern::create(Pattern); + + // If we couldn't create it as a glob, report the error, but try again with + // a literal if the error reporting is non-fatal. + if (!GlobOrErr) { + if (Error E = ErrorCallback(GlobOrErr.takeError())) + return std::move(E); + return create(Pattern, MatchStyle::Literal, ErrorCallback); + } - SmallVector Data; - R = std::make_shared( - ("^" + Pattern.ltrim('^').rtrim('$') + "$").toStringRef(Data)); + return NameOrPattern(std::make_shared(*GlobOrErr), + IsPositiveMatch); + } + case MatchStyle::Regex: { + SmallVector Data; + return NameOrPattern(std::make_shared( + ("^" + Pattern.ltrim('^').rtrim('$') + "$").toStringRef(Data))); + } + } + llvm_unreachable("Unhandled llvm.objcopy.MatchStyle enum"); } static Error addSymbolsToRenameFromFile(StringMap &SymbolsToRename, @@ -407,10 +353,22 @@ template static ErrorOr getAsInteger(StringRef Val) { return Result; } +static void printHelp(const opt::OptTable &OptTable, raw_ostream &OS, + StringRef ToolName) { + OptTable.PrintHelp(OS, (ToolName + " input [output]").str().c_str(), + (ToolName + " tool").str().c_str()); + // TODO: Replace this with libOption call once it adds extrahelp support. 
+ // The CommandLine library has a cl::extrahelp class to support this, + // but libOption does not have that yet. + OS << "\nPass @FILE as argument to read options from FILE.\n"; +} + // ParseObjcopyOptions returns the config and sets the input arguments. If a // help flag is set then ParseObjcopyOptions will print the help messege and // exit. -Expected parseObjcopyOptions(ArrayRef ArgsArr) { +Expected +parseObjcopyOptions(ArrayRef ArgsArr, + llvm::function_ref ErrorCallback) { DriverConfig DC; ObjcopyOptTable T; unsigned MissingArgumentIndex, MissingArgumentCount; @@ -418,12 +376,12 @@ Expected parseObjcopyOptions(ArrayRef ArgsArr) { T.ParseArgs(ArgsArr, MissingArgumentIndex, MissingArgumentCount); if (InputArgs.size() == 0) { - T.PrintHelp(errs(), "llvm-objcopy input [output]", "objcopy tool"); + printHelp(T, errs(), "llvm-objcopy"); exit(1); } if (InputArgs.hasArg(OBJCOPY_help)) { - T.PrintHelp(outs(), "llvm-objcopy input [output]", "objcopy tool"); + printHelp(T, outs(), "llvm-objcopy"); exit(0); } @@ -459,7 +417,18 @@ Expected parseObjcopyOptions(ArrayRef ArgsArr) { errc::invalid_argument, "--target cannot be used with --input-target or --output-target"); - bool UseRegex = InputArgs.hasArg(OBJCOPY_regex); + if (InputArgs.hasArg(OBJCOPY_regex) && InputArgs.hasArg(OBJCOPY_wildcard)) + return createStringError(errc::invalid_argument, + "--regex and --wildcard are incompatible"); + + MatchStyle SectionMatchStyle = InputArgs.hasArg(OBJCOPY_regex) + ? MatchStyle::Regex + : MatchStyle::Wildcard; + MatchStyle SymbolMatchStyle = InputArgs.hasArg(OBJCOPY_regex) + ? MatchStyle::Regex + : InputArgs.hasArg(OBJCOPY_wildcard) + ? MatchStyle::Wildcard + : MatchStyle::Literal; StringRef InputFormat, OutputFormat; if (InputArgs.hasArg(OBJCOPY_target)) { InputFormat = InputArgs.getLastArgValue(OBJCOPY_target); @@ -476,28 +445,26 @@ Expected parseObjcopyOptions(ArrayRef ArgsArr) { .Case("binary", FileFormat::Binary) .Case("ihex", FileFormat::IHex) .Default(FileFormat::Unspecified); - if (Config.InputFormat == FileFormat::Binary) { - auto BinaryArch = InputArgs.getLastArgValue(OBJCOPY_binary_architecture); - if (BinaryArch.empty()) - return createStringError( - errc::invalid_argument, - "specified binary input without specifiying an architecture"); - Expected MI = getMachineInfo(BinaryArch); - if (!MI) - return MI.takeError(); - Config.BinaryArch = *MI; - } + + if (InputArgs.hasArg(OBJCOPY_new_symbol_visibility)) + Config.NewSymbolVisibility = + InputArgs.getLastArgValue(OBJCOPY_new_symbol_visibility); Config.OutputFormat = StringSwitch(OutputFormat) .Case("binary", FileFormat::Binary) .Case("ihex", FileFormat::IHex) .Default(FileFormat::Unspecified); - if (Config.OutputFormat == FileFormat::Unspecified && !OutputFormat.empty()) { - Expected Target = getOutputTargetInfoByTargetName(OutputFormat); - if (!Target) - return Target.takeError(); - Config.OutputFormat = Target->Format; - Config.OutputArch = Target->Machine; + if (Config.OutputFormat == FileFormat::Unspecified) { + if (OutputFormat.empty()) { + Config.OutputFormat = Config.InputFormat; + } else { + Expected Target = + getOutputTargetInfoByTargetName(OutputFormat); + if (!Target) + return Target.takeError(); + Config.OutputFormat = Target->Format; + Config.OutputArch = Target->Machine; + } } if (auto Arg = InputArgs.getLastArg(OBJCOPY_compress_debug_sections, @@ -535,12 +502,8 @@ Expected parseObjcopyOptions(ArrayRef ArgsArr) { if (!DebugOrErr) return createFileError(Config.AddGnuDebugLink, DebugOrErr.getError()); auto Debug = 
std::move(*DebugOrErr); - JamCRC CRC; - CRC.update( - ArrayRef(Debug->getBuffer().data(), Debug->getBuffer().size())); - // The CRC32 value needs to be complemented because the JamCRC doesn't - // finalize the CRC32 value. - Config.GnuDebugLinkCRC32 = ~CRC.getCRC(); + Config.GnuDebugLinkCRC32 = + llvm::crc32(arrayRefFromStringRef(Debug->getBuffer())); } Config.BuildIdLinkDir = InputArgs.getLastArgValue(OBJCOPY_build_id_link_dir); if (InputArgs.hasArg(OBJCOPY_build_id_link_input)) @@ -582,6 +545,13 @@ Expected parseObjcopyOptions(ArrayRef ArgsArr) { "multiple renames of section '%s'", SR->OriginalName.str().c_str()); } + for (auto Arg : InputArgs.filtered(OBJCOPY_set_section_alignment)) { + Expected> NameAndAlign = + parseSetSectionAlignment(Arg->getValue()); + if (!NameAndAlign) + return NameAndAlign.takeError(); + Config.SetSectionAlignment[NameAndAlign->first] = NameAndAlign->second; + } for (auto Arg : InputArgs.filtered(OBJCOPY_set_section_flags)) { Expected SFU = parseSetSectionFlagValue(Arg->getValue()); @@ -612,13 +582,28 @@ Expected parseObjcopyOptions(ArrayRef ArgsArr) { } for (auto Arg : InputArgs.filtered(OBJCOPY_remove_section)) - Config.ToRemove.emplace_back(Arg->getValue(), UseRegex); + if (Error E = Config.ToRemove.addMatcher(NameOrPattern::create( + Arg->getValue(), SectionMatchStyle, ErrorCallback))) + return std::move(E); for (auto Arg : InputArgs.filtered(OBJCOPY_keep_section)) - Config.KeepSection.emplace_back(Arg->getValue(), UseRegex); + if (Error E = Config.KeepSection.addMatcher(NameOrPattern::create( + Arg->getValue(), SectionMatchStyle, ErrorCallback))) + return std::move(E); for (auto Arg : InputArgs.filtered(OBJCOPY_only_section)) - Config.OnlySection.emplace_back(Arg->getValue(), UseRegex); - for (auto Arg : InputArgs.filtered(OBJCOPY_add_section)) - Config.AddSection.push_back(Arg->getValue()); + if (Error E = Config.OnlySection.addMatcher(NameOrPattern::create( + Arg->getValue(), SectionMatchStyle, ErrorCallback))) + return std::move(E); + for (auto Arg : InputArgs.filtered(OBJCOPY_add_section)) { + StringRef ArgValue(Arg->getValue()); + if (!ArgValue.contains('=')) + return createStringError(errc::invalid_argument, + "bad format for --add-section: missing '='"); + if (ArgValue.split("=").second.empty()) + return createStringError( + errc::invalid_argument, + "bad format for --add-section: missing file name"); + Config.AddSection.push_back(ArgValue); + } for (auto Arg : InputArgs.filtered(OBJCOPY_dump_section)) Config.DumpSection.push_back(Arg->getValue()); Config.StripAll = InputArgs.hasArg(OBJCOPY_strip_all); @@ -645,53 +630,71 @@ Expected parseObjcopyOptions(ArrayRef ArgsArr) { if (Config.DiscardMode == DiscardType::All) Config.StripDebug = true; for (auto Arg : InputArgs.filtered(OBJCOPY_localize_symbol)) - Config.SymbolsToLocalize.emplace_back(Arg->getValue(), UseRegex); + if (Error E = Config.SymbolsToLocalize.addMatcher(NameOrPattern::create( + Arg->getValue(), SymbolMatchStyle, ErrorCallback))) + return std::move(E); for (auto Arg : InputArgs.filtered(OBJCOPY_localize_symbols)) if (Error E = addSymbolsFromFile(Config.SymbolsToLocalize, DC.Alloc, - Arg->getValue(), UseRegex)) + Arg->getValue(), SymbolMatchStyle, + ErrorCallback)) return std::move(E); for (auto Arg : InputArgs.filtered(OBJCOPY_keep_global_symbol)) - Config.SymbolsToKeepGlobal.emplace_back(Arg->getValue(), UseRegex); + if (Error E = Config.SymbolsToKeepGlobal.addMatcher(NameOrPattern::create( + Arg->getValue(), SymbolMatchStyle, ErrorCallback))) + return std::move(E); for (auto Arg : 
InputArgs.filtered(OBJCOPY_keep_global_symbols)) if (Error E = addSymbolsFromFile(Config.SymbolsToKeepGlobal, DC.Alloc, - Arg->getValue(), UseRegex)) + Arg->getValue(), SymbolMatchStyle, + ErrorCallback)) return std::move(E); for (auto Arg : InputArgs.filtered(OBJCOPY_globalize_symbol)) - Config.SymbolsToGlobalize.emplace_back(Arg->getValue(), UseRegex); + if (Error E = Config.SymbolsToGlobalize.addMatcher(NameOrPattern::create( + Arg->getValue(), SymbolMatchStyle, ErrorCallback))) + return std::move(E); for (auto Arg : InputArgs.filtered(OBJCOPY_globalize_symbols)) if (Error E = addSymbolsFromFile(Config.SymbolsToGlobalize, DC.Alloc, - Arg->getValue(), UseRegex)) + Arg->getValue(), SymbolMatchStyle, + ErrorCallback)) return std::move(E); for (auto Arg : InputArgs.filtered(OBJCOPY_weaken_symbol)) - Config.SymbolsToWeaken.emplace_back(Arg->getValue(), UseRegex); + if (Error E = Config.SymbolsToWeaken.addMatcher(NameOrPattern::create( + Arg->getValue(), SymbolMatchStyle, ErrorCallback))) + return std::move(E); for (auto Arg : InputArgs.filtered(OBJCOPY_weaken_symbols)) if (Error E = addSymbolsFromFile(Config.SymbolsToWeaken, DC.Alloc, - Arg->getValue(), UseRegex)) + Arg->getValue(), SymbolMatchStyle, + ErrorCallback)) return std::move(E); for (auto Arg : InputArgs.filtered(OBJCOPY_strip_symbol)) - Config.SymbolsToRemove.emplace_back(Arg->getValue(), UseRegex); + if (Error E = Config.SymbolsToRemove.addMatcher(NameOrPattern::create( + Arg->getValue(), SymbolMatchStyle, ErrorCallback))) + return std::move(E); for (auto Arg : InputArgs.filtered(OBJCOPY_strip_symbols)) if (Error E = addSymbolsFromFile(Config.SymbolsToRemove, DC.Alloc, - Arg->getValue(), UseRegex)) + Arg->getValue(), SymbolMatchStyle, + ErrorCallback)) return std::move(E); for (auto Arg : InputArgs.filtered(OBJCOPY_strip_unneeded_symbol)) - Config.UnneededSymbolsToRemove.emplace_back(Arg->getValue(), UseRegex); + if (Error E = + Config.UnneededSymbolsToRemove.addMatcher(NameOrPattern::create( + Arg->getValue(), SymbolMatchStyle, ErrorCallback))) + return std::move(E); for (auto Arg : InputArgs.filtered(OBJCOPY_strip_unneeded_symbols)) if (Error E = addSymbolsFromFile(Config.UnneededSymbolsToRemove, DC.Alloc, - Arg->getValue(), UseRegex)) + Arg->getValue(), SymbolMatchStyle, + ErrorCallback)) return std::move(E); for (auto Arg : InputArgs.filtered(OBJCOPY_keep_symbol)) - Config.SymbolsToKeep.emplace_back(Arg->getValue(), UseRegex); + if (Error E = Config.SymbolsToKeep.addMatcher(NameOrPattern::create( + Arg->getValue(), SymbolMatchStyle, ErrorCallback))) + return std::move(E); for (auto Arg : InputArgs.filtered(OBJCOPY_keep_symbols)) - if (Error E = addSymbolsFromFile(Config.SymbolsToKeep, DC.Alloc, - Arg->getValue(), UseRegex)) + if (Error E = + addSymbolsFromFile(Config.SymbolsToKeep, DC.Alloc, Arg->getValue(), + SymbolMatchStyle, ErrorCallback)) return std::move(E); - for (auto Arg : InputArgs.filtered(OBJCOPY_add_symbol)) { - Expected NSI = parseNewSymbolInfo(Arg->getValue()); - if (!NSI) - return NSI.takeError(); - Config.SymbolsToAdd.push_back(*NSI); - } + for (auto Arg : InputArgs.filtered(OBJCOPY_add_symbol)) + Config.SymbolsToAdd.push_back(Arg->getValue()); Config.AllowBrokenLinks = InputArgs.hasArg(OBJCOPY_allow_broken_links); @@ -754,19 +757,19 @@ Expected parseObjcopyOptions(ArrayRef ArgsArr) { // exit. 
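The --set-section-alignment handling a few hunks above splits the flag on '=' and parses the value with StringRef::getAsInteger(0, ...), which auto-detects decimal, octal, and 0x-prefixed hex and returns true on failure. A standalone sketch of that parse (names are illustrative):

#include "llvm/ADT/StringRef.h"
#include <cstdint>

// Parse "<section>=<align>"; returns false on a malformed flag.
static bool parseNameAndAlignment(llvm::StringRef Flag, llvm::StringRef &Name,
                                  uint64_t &Align) {
  std::pair<llvm::StringRef, llvm::StringRef> Split = Flag.split('=');
  Name = Split.first;
  return Flag.contains('=') && !Name.empty() &&
         !Split.second.getAsInteger(0, Align);
}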
Expected parseStripOptions(ArrayRef ArgsArr, - std::function ErrorCallback) { + llvm::function_ref ErrorCallback) { StripOptTable T; unsigned MissingArgumentIndex, MissingArgumentCount; llvm::opt::InputArgList InputArgs = T.ParseArgs(ArgsArr, MissingArgumentIndex, MissingArgumentCount); if (InputArgs.size() == 0) { - T.PrintHelp(errs(), "llvm-strip [options] file...", "strip tool"); + printHelp(T, errs(), "llvm-strip"); exit(1); } if (InputArgs.hasArg(STRIP_help)) { - T.PrintHelp(outs(), "llvm-strip [options] file...", "strip tool"); + printHelp(T, outs(), "llvm-strip"); exit(0); } @@ -792,7 +795,17 @@ parseStripOptions(ArrayRef ArgsArr, "multiple input files cannot be used in combination with -o"); CopyConfig Config; - bool UseRegexp = InputArgs.hasArg(STRIP_regex); + + if (InputArgs.hasArg(STRIP_regex) && InputArgs.hasArg(STRIP_wildcard)) + return createStringError(errc::invalid_argument, + "--regex and --wildcard are incompatible"); + MatchStyle SectionMatchStyle = + InputArgs.hasArg(STRIP_regex) ? MatchStyle::Regex : MatchStyle::Wildcard; + MatchStyle SymbolMatchStyle = InputArgs.hasArg(STRIP_regex) + ? MatchStyle::Regex + : InputArgs.hasArg(STRIP_wildcard) + ? MatchStyle::Wildcard + : MatchStyle::Literal; Config.AllowBrokenLinks = InputArgs.hasArg(STRIP_allow_broken_links); Config.StripDebug = InputArgs.hasArg(STRIP_strip_debug); @@ -801,6 +814,7 @@ parseStripOptions(ArrayRef ArgsArr, InputArgs.hasFlag(STRIP_discard_all, STRIP_discard_locals) ? DiscardType::All : DiscardType::Locals; + Config.StripSections = InputArgs.hasArg(STRIP_strip_sections); Config.StripUnneeded = InputArgs.hasArg(STRIP_strip_unneeded); if (auto Arg = InputArgs.getLastArg(STRIP_strip_all, STRIP_no_strip_all)) Config.StripAll = Arg->getOption().getID() == STRIP_strip_all; @@ -809,16 +823,24 @@ parseStripOptions(ArrayRef ArgsArr, Config.KeepFileSymbols = InputArgs.hasArg(STRIP_keep_file_symbols); for (auto Arg : InputArgs.filtered(STRIP_keep_section)) - Config.KeepSection.emplace_back(Arg->getValue(), UseRegexp); + if (Error E = Config.KeepSection.addMatcher(NameOrPattern::create( + Arg->getValue(), SectionMatchStyle, ErrorCallback))) + return std::move(E); for (auto Arg : InputArgs.filtered(STRIP_remove_section)) - Config.ToRemove.emplace_back(Arg->getValue(), UseRegexp); + if (Error E = Config.ToRemove.addMatcher(NameOrPattern::create( + Arg->getValue(), SectionMatchStyle, ErrorCallback))) + return std::move(E); for (auto Arg : InputArgs.filtered(STRIP_strip_symbol)) - Config.SymbolsToRemove.emplace_back(Arg->getValue(), UseRegexp); + if (Error E = Config.SymbolsToRemove.addMatcher(NameOrPattern::create( + Arg->getValue(), SymbolMatchStyle, ErrorCallback))) + return std::move(E); for (auto Arg : InputArgs.filtered(STRIP_keep_symbol)) - Config.SymbolsToKeep.emplace_back(Arg->getValue(), UseRegexp); + if (Error E = Config.SymbolsToKeep.addMatcher(NameOrPattern::create( + Arg->getValue(), SymbolMatchStyle, ErrorCallback))) + return std::move(E); if (!InputArgs.hasArg(STRIP_no_strip_all) && !Config.StripDebug && !Config.StripUnneeded && Config.DiscardMode == DiscardType::None && diff --git a/tools/llvm-objcopy/CopyConfig.h b/tools/llvm-objcopy/CopyConfig.h index aff3631a487c..55a55d3a2bc2 100644 --- a/tools/llvm-objcopy/CopyConfig.h +++ b/tools/llvm-objcopy/CopyConfig.h @@ -9,6 +9,7 @@ #ifndef LLVM_TOOLS_LLVM_OBJCOPY_COPY_CONFIG_H #define LLVM_TOOLS_LLVM_OBJCOPY_COPY_CONFIG_H +#include "ELF/ELFConfig.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/BitmaskEnum.h" #include "llvm/ADT/Optional.h" @@ -18,6 +19,7 @@ 
#include "llvm/Object/ELFTypes.h" #include "llvm/Support/Allocator.h" #include "llvm/Support/Error.h" +#include "llvm/Support/GlobPattern.h" #include "llvm/Support/Regex.h" // Necessary for llvm::DebugCompressionType::None #include "llvm/Target/TargetOptions.h" @@ -87,36 +89,71 @@ enum class DiscardType { Locals, // --discard-locals (-X) }; -class NameOrRegex { +enum class MatchStyle { + Literal, // Default for symbols. + Wildcard, // Default for sections, or enabled with --wildcard (-w). + Regex, // Enabled with --regex. +}; + +class NameOrPattern { StringRef Name; // Regex is shared between multiple CopyConfig instances. std::shared_ptr R; + std::shared_ptr G; + bool IsPositiveMatch = true; + + NameOrPattern(StringRef N) : Name(N) {} + NameOrPattern(std::shared_ptr R) : R(R) {} + NameOrPattern(std::shared_ptr G, bool IsPositiveMatch) + : G(G), IsPositiveMatch(IsPositiveMatch) {} public: - NameOrRegex(StringRef Pattern, bool IsRegex); - bool operator==(StringRef S) const { return R ? R->match(S) : Name == S; } + // ErrorCallback is used to handle recoverable errors. An Error returned + // by the callback aborts the parsing and is then returned by this function. + static Expected + create(StringRef Pattern, MatchStyle MS, + llvm::function_ref ErrorCallback); + + bool isPositiveMatch() const { return IsPositiveMatch; } + bool operator==(StringRef S) const { + return R ? R->match(S) : G ? G->match(S) : Name == S; + } bool operator!=(StringRef S) const { return !operator==(S); } }; -struct NewSymbolInfo { - StringRef SymbolName; - StringRef SectionName; - uint64_t Value = 0; - uint8_t Type = ELF::STT_NOTYPE; - uint8_t Bind = ELF::STB_GLOBAL; - uint8_t Visibility = ELF::STV_DEFAULT; +// Matcher that checks symbol or section names against the command line flags +// provided for that option. +class NameMatcher { + std::vector PosMatchers; + std::vector NegMatchers; + +public: + Error addMatcher(Expected Matcher) { + if (!Matcher) + return Matcher.takeError(); + if (Matcher->isPositiveMatch()) + PosMatchers.push_back(std::move(*Matcher)); + else + NegMatchers.push_back(std::move(*Matcher)); + return Error::success(); + } + bool matches(StringRef S) const { + return is_contained(PosMatchers, S) && !is_contained(NegMatchers, S); + } + bool empty() const { return PosMatchers.empty() && NegMatchers.empty(); } }; // Configuration for copying/stripping a single file. struct CopyConfig { + // Format-specific options to be initialized lazily when needed. + Optional ELF; + // Main input/output options StringRef InputFilename; FileFormat InputFormat; StringRef OutputFilename; FileFormat OutputFormat; - // Only applicable for --input-format=binary - MachineInfo BinaryArch; // Only applicable when --output-format!=binary (e.g. elf64-x86-64). 
Optional OutputArch; @@ -132,24 +169,30 @@ struct CopyConfig { StringRef SymbolsPrefix; StringRef AllocSectionsPrefix; DiscardType DiscardMode = DiscardType::None; + Optional NewSymbolVisibility; // Repeated options std::vector AddSection; std::vector DumpSection; - std::vector SymbolsToAdd; - std::vector KeepSection; - std::vector OnlySection; - std::vector SymbolsToGlobalize; - std::vector SymbolsToKeep; - std::vector SymbolsToLocalize; - std::vector SymbolsToRemove; - std::vector UnneededSymbolsToRemove; - std::vector SymbolsToWeaken; - std::vector ToRemove; - std::vector SymbolsToKeepGlobal; + std::vector SymbolsToAdd; + + // Section matchers + NameMatcher KeepSection; + NameMatcher OnlySection; + NameMatcher ToRemove; + + // Symbol matchers + NameMatcher SymbolsToGlobalize; + NameMatcher SymbolsToKeep; + NameMatcher SymbolsToLocalize; + NameMatcher SymbolsToRemove; + NameMatcher UnneededSymbolsToRemove; + NameMatcher SymbolsToWeaken; + NameMatcher SymbolsToKeepGlobal; // Map options StringMap SectionsToRename; + StringMap SetSectionAlignment; StringMap SetSectionFlags; StringMap SymbolsToRename; @@ -178,6 +221,18 @@ struct CopyConfig { bool Weaken = false; bool DecompressDebugSections = false; DebugCompressionType CompressionType = DebugCompressionType::None; + + // parseELFConfig performs ELF-specific command-line parsing. Fills `ELF` on + // success or returns an Error otherwise. + Error parseELFConfig() { + if (!ELF) { + Expected ELFConfig = elf::parseConfig(*this); + if (!ELFConfig) + return ELFConfig.takeError(); + ELF = *ELFConfig; + } + return Error::success(); + } }; // Configuration for the overall invocation of this tool. When invoked as @@ -190,8 +245,11 @@ struct DriverConfig { // ParseObjcopyOptions returns the config and sets the input arguments. If a // help flag is set then ParseObjcopyOptions will print the help messege and -// exit. -Expected parseObjcopyOptions(ArrayRef ArgsArr); +// exit. ErrorCallback is used to handle recoverable errors. An Error returned +// by the callback aborts the parsing and is then returned by this function. +Expected +parseObjcopyOptions(ArrayRef ArgsArr, + llvm::function_ref ErrorCallback); // ParseStripOptions returns the config and sets the input arguments. If a // help flag is set then ParseStripOptions will print the help messege and @@ -199,7 +257,7 @@ Expected parseObjcopyOptions(ArrayRef ArgsArr); // by the callback aborts the parsing and is then returned by this function. Expected parseStripOptions(ArrayRef ArgsArr, - std::function ErrorCallback); + llvm::function_ref ErrorCallback); } // namespace objcopy } // namespace llvm diff --git a/tools/llvm-objcopy/ELF/ELFConfig.cpp b/tools/llvm-objcopy/ELF/ELFConfig.cpp new file mode 100644 index 000000000000..40993760add7 --- /dev/null +++ b/tools/llvm-objcopy/ELF/ELFConfig.cpp @@ -0,0 +1,133 @@ +//===- ELFConfig.cpp ------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "CopyConfig.h" +#include "llvm/ADT/Optional.h" +#include "llvm/ADT/StringSwitch.h" +#include "llvm/Support/Errc.h" +#include "llvm/Support/Error.h" + +namespace llvm { +namespace objcopy { +namespace elf { + +static Expected parseNewSymbolInfo(StringRef FlagValue, + uint8_t DefaultVisibility) { + // Parse value given with --add-symbol option and create the + // new symbol if possible. The value format for --add-symbol is: + // + // =[
<section>:]<value>[,<flags>] + // + // where: + // <name> - symbol name, can be empty string + // <section>
- optional section name. If not given ABS symbol is created + // - symbol value, can be decimal or hexadecimal number prefixed + // with 0x. + // - optional flags affecting symbol type, binding or visibility: + // The following are currently supported: + // + // global, local, weak, default, hidden, file, section, object, + // indirect-function. + // + // The following flags are ignored and provided for GNU + // compatibility only: + // + // warning, debug, constructor, indirect, synthetic, + // unique-object, before=. + NewSymbolInfo SI; + StringRef Value; + std::tie(SI.SymbolName, Value) = FlagValue.split('='); + if (Value.empty()) + return createStringError( + errc::invalid_argument, + "bad format for --add-symbol, missing '=' after '%s'", + SI.SymbolName.str().c_str()); + + if (Value.contains(':')) { + std::tie(SI.SectionName, Value) = Value.split(':'); + if (SI.SectionName.empty() || Value.empty()) + return createStringError( + errc::invalid_argument, + "bad format for --add-symbol, missing section name or symbol value"); + } + + SmallVector Flags; + Value.split(Flags, ','); + if (Flags[0].getAsInteger(0, SI.Value)) + return createStringError(errc::invalid_argument, "bad symbol value: '%s'", + Flags[0].str().c_str()); + + SI.Visibility = DefaultVisibility; + + using Functor = std::function; + SmallVector UnsupportedFlags; + for (size_t I = 1, NumFlags = Flags.size(); I < NumFlags; ++I) + static_cast( + StringSwitch(Flags[I]) + .CaseLower("global", [&SI] { SI.Bind = ELF::STB_GLOBAL; }) + .CaseLower("local", [&SI] { SI.Bind = ELF::STB_LOCAL; }) + .CaseLower("weak", [&SI] { SI.Bind = ELF::STB_WEAK; }) + .CaseLower("default", [&SI] { SI.Visibility = ELF::STV_DEFAULT; }) + .CaseLower("hidden", [&SI] { SI.Visibility = ELF::STV_HIDDEN; }) + .CaseLower("protected", + [&SI] { SI.Visibility = ELF::STV_PROTECTED; }) + .CaseLower("file", [&SI] { SI.Type = ELF::STT_FILE; }) + .CaseLower("section", [&SI] { SI.Type = ELF::STT_SECTION; }) + .CaseLower("object", [&SI] { SI.Type = ELF::STT_OBJECT; }) + .CaseLower("function", [&SI] { SI.Type = ELF::STT_FUNC; }) + .CaseLower("indirect-function", + [&SI] { SI.Type = ELF::STT_GNU_IFUNC; }) + .CaseLower("debug", [] {}) + .CaseLower("constructor", [] {}) + .CaseLower("warning", [] {}) + .CaseLower("indirect", [] {}) + .CaseLower("synthetic", [] {}) + .CaseLower("unique-object", [] {}) + .StartsWithLower("before", [] {}) + .Default([&] { UnsupportedFlags.push_back(Flags[I]); }))(); + if (!UnsupportedFlags.empty()) + return createStringError(errc::invalid_argument, + "unsupported flag%s for --add-symbol: '%s'", + UnsupportedFlags.size() > 1 ? 
"s" : "", + join(UnsupportedFlags, "', '").c_str()); + return SI; +} + +Expected parseConfig(const CopyConfig &Config) { + ELFCopyConfig ELFConfig; + if (Config.NewSymbolVisibility) { + const uint8_t Invalid = 0xff; + ELFConfig.NewSymbolVisibility = + StringSwitch(*Config.NewSymbolVisibility) + .Case("default", ELF::STV_DEFAULT) + .Case("hidden", ELF::STV_HIDDEN) + .Case("internal", ELF::STV_INTERNAL) + .Case("protected", ELF::STV_PROTECTED) + .Default(Invalid); + + if (ELFConfig.NewSymbolVisibility == Invalid) + return createStringError(errc::invalid_argument, + "'%s' is not a valid symbol visibility", + Config.NewSymbolVisibility->str().c_str()); + } + + for (StringRef Arg : Config.SymbolsToAdd) { + Expected NSI = parseNewSymbolInfo( + Arg, + ELFConfig.NewSymbolVisibility.getValueOr((uint8_t)ELF::STV_DEFAULT)); + if (!NSI) + return NSI.takeError(); + ELFConfig.SymbolsToAdd.push_back(*NSI); + } + + return ELFConfig; +} + +} // end namespace elf +} // end namespace objcopy +} // end namespace llvm diff --git a/tools/llvm-objcopy/ELF/ELFConfig.h b/tools/llvm-objcopy/ELF/ELFConfig.h new file mode 100644 index 000000000000..977efbc4166f --- /dev/null +++ b/tools/llvm-objcopy/ELF/ELFConfig.h @@ -0,0 +1,44 @@ +//===- ELFConfig.h ----------------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_OBJCOPY_ELFCONFIG_H +#define LLVM_TOOLS_OBJCOPY_ELFCONFIG_H + +#include "llvm/ADT/Optional.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Object/ELFTypes.h" +#include "llvm/Support/Error.h" +#include + +namespace llvm { +namespace objcopy { +struct CopyConfig; + +namespace elf { + +struct NewSymbolInfo { + StringRef SymbolName; + StringRef SectionName; + uint64_t Value = 0; + uint8_t Type = ELF::STT_NOTYPE; + uint8_t Bind = ELF::STB_GLOBAL; + uint8_t Visibility = ELF::STV_DEFAULT; +}; + +struct ELFCopyConfig { + Optional NewSymbolVisibility; + std::vector SymbolsToAdd; +}; + +Expected parseConfig(const CopyConfig &Config); + +} // namespace elf +} // namespace objcopy +} // namespace llvm + +#endif diff --git a/tools/llvm-objcopy/ELF/ELFObjcopy.cpp b/tools/llvm-objcopy/ELF/ELFObjcopy.cpp index b366c6e55987..8bf7e0f88010 100644 --- a/tools/llvm-objcopy/ELF/ELFObjcopy.cpp +++ b/tools/llvm-objcopy/ELF/ELFObjcopy.cpp @@ -136,16 +136,16 @@ static std::unique_ptr createELFWriter(const CopyConfig &Config, // Depending on the initial ELFT and OutputFormat we need a different Writer. 
switch (OutputElfType) { case ELFT_ELF32LE: - return llvm::make_unique>(Obj, Buf, + return std::make_unique>(Obj, Buf, !Config.StripSections); case ELFT_ELF64LE: - return llvm::make_unique>(Obj, Buf, + return std::make_unique>(Obj, Buf, !Config.StripSections); case ELFT_ELF32BE: - return llvm::make_unique>(Obj, Buf, + return std::make_unique>(Obj, Buf, !Config.StripSections); case ELFT_ELF64BE: - return llvm::make_unique>(Obj, Buf, + return std::make_unique>(Obj, Buf, !Config.StripSections); } llvm_unreachable("Invalid output format"); @@ -156,9 +156,9 @@ static std::unique_ptr createWriter(const CopyConfig &Config, ElfType OutputElfType) { switch (Config.OutputFormat) { case FileFormat::Binary: - return llvm::make_unique(Obj, Buf); + return std::make_unique(Obj, Buf); case FileFormat::IHex: - return llvm::make_unique(Obj, Buf); + return std::make_unique(Obj, Buf); default: return createELFWriter(Config, Obj, Buf, OutputElfType); } @@ -263,7 +263,7 @@ static Error linkToBuildIdDir(const CopyConfig &Config, StringRef ToLink, static Error splitDWOToFile(const CopyConfig &Config, const Reader &Reader, StringRef File, ElfType OutputElfType) { - auto DWOFile = Reader.create(); + auto DWOFile = Reader.create(false); auto OnlyKeepDWOPred = [&DWOFile](const SectionBase &Sec) { return onlyKeepDWOPred(*DWOFile, Sec); }; @@ -305,9 +305,9 @@ static Error dumpSectionToFile(StringRef SecName, StringRef Filename, SecName.str().c_str()); } -static bool isCompressable(const SectionBase &Section) { - return !(Section.Flags & ELF::SHF_COMPRESSED) && - StringRef(Section.Name).startswith(".debug"); +static bool isCompressable(const SectionBase &Sec) { + return !(Sec.Flags & ELF::SHF_COMPRESSED) && + StringRef(Sec.Name).startswith(".debug"); } static void replaceDebugSections( @@ -356,7 +356,7 @@ static Error updateAndRemoveSymbols(const CopyConfig &Config, Object &Obj) { if (!Sym.isCommon() && Sym.getShndx() != SHN_UNDEF && ((Config.LocalizeHidden && (Sym.Visibility == STV_HIDDEN || Sym.Visibility == STV_INTERNAL)) || - is_contained(Config.SymbolsToLocalize, Sym.Name))) + Config.SymbolsToLocalize.matches(Sym.Name))) Sym.Binding = STB_LOCAL; // Note: these two globalize flags have very similar names but different @@ -370,16 +370,15 @@ static Error updateAndRemoveSymbols(const CopyConfig &Config, Object &Obj) { // --keep-global-symbol. Because of that, make sure to check // --globalize-symbol second. if (!Config.SymbolsToKeepGlobal.empty() && - !is_contained(Config.SymbolsToKeepGlobal, Sym.Name) && + !Config.SymbolsToKeepGlobal.matches(Sym.Name) && Sym.getShndx() != SHN_UNDEF) Sym.Binding = STB_LOCAL; - if (is_contained(Config.SymbolsToGlobalize, Sym.Name) && + if (Config.SymbolsToGlobalize.matches(Sym.Name) && Sym.getShndx() != SHN_UNDEF) Sym.Binding = STB_GLOBAL; - if (is_contained(Config.SymbolsToWeaken, Sym.Name) && - Sym.Binding == STB_GLOBAL) + if (Config.SymbolsToWeaken.matches(Sym.Name) && Sym.Binding == STB_GLOBAL) Sym.Binding = STB_WEAK; if (Config.Weaken && Sym.Binding == STB_GLOBAL && @@ -399,12 +398,12 @@ static Error updateAndRemoveSymbols(const CopyConfig &Config, Object &Obj) { // symbols are still 'needed' and which are not. 
if (Config.StripUnneeded || !Config.UnneededSymbolsToRemove.empty() || !Config.OnlySection.empty()) { - for (auto &Section : Obj.sections()) - Section.markSymbols(); + for (SectionBase &Sec : Obj.sections()) + Sec.markSymbols(); } auto RemoveSymbolsPred = [&](const Symbol &Sym) { - if (is_contained(Config.SymbolsToKeep, Sym.Name) || + if (Config.SymbolsToKeep.matches(Sym.Name) || (Config.KeepFileSymbols && Sym.Type == STT_FILE)) return false; @@ -418,12 +417,12 @@ static Error updateAndRemoveSymbols(const CopyConfig &Config, Object &Obj) { if (Config.StripAll || Config.StripAllGNU) return true; - if (is_contained(Config.SymbolsToRemove, Sym.Name)) + if (Config.SymbolsToRemove.matches(Sym.Name)) return true; if ((Config.StripUnneeded || - is_contained(Config.UnneededSymbolsToRemove, Sym.Name)) && - isUnneededSymbol(Sym)) + Config.UnneededSymbolsToRemove.matches(Sym.Name)) && + (!Obj.isRelocatable() || isUnneededSymbol(Sym))) return true; // We want to remove undefined symbols if all references have been stripped. @@ -443,7 +442,7 @@ static Error replaceAndRemoveSections(const CopyConfig &Config, Object &Obj) { // Removes: if (!Config.ToRemove.empty()) { RemovePred = [&Config](const SectionBase &Sec) { - return is_contained(Config.ToRemove, Sec.Name); + return Config.ToRemove.matches(Sec.Name); }; } @@ -481,7 +480,7 @@ static Error replaceAndRemoveSections(const CopyConfig &Config, Object &Obj) { }; } - if (Config.StripDebug) { + if (Config.StripDebug || Config.StripUnneeded) { RemovePred = [RemovePred](const SectionBase &Sec) { return RemovePred(Sec) || isDebugSection(Sec); }; @@ -523,7 +522,7 @@ static Error replaceAndRemoveSections(const CopyConfig &Config, Object &Obj) { if (!Config.OnlySection.empty()) { RemovePred = [&Config, RemovePred, &Obj](const SectionBase &Sec) { // Explicitly keep these sections regardless of previous removes. - if (is_contained(Config.OnlySection, Sec.Name)) + if (Config.OnlySection.matches(Sec.Name)) return false; // Allow all implicit removes. @@ -545,7 +544,7 @@ static Error replaceAndRemoveSections(const CopyConfig &Config, Object &Obj) { if (!Config.KeepSection.empty()) { RemovePred = [&Config, RemovePred](const SectionBase &Sec) { // Explicitly keep these sections regardless of previous removes. - if (is_contained(Config.KeepSection, Sec.Name)) + if (Config.KeepSection.matches(Sec.Name)) return false; // Otherwise defer to RemovePred. return RemovePred(Sec); @@ -614,9 +613,8 @@ static Error handleArgs(const CopyConfig &Config, Object &Obj, if (Error E = updateAndRemoveSymbols(Config, Obj)) return E; - if (!Config.SectionsToRename.empty() || !Config.AllocSectionsPrefix.empty()) { - DenseSet PrefixedSections; - for (auto &Sec : Obj.sections()) { + if (!Config.SectionsToRename.empty()) { + for (SectionBase &Sec : Obj.sections()) { const auto Iter = Config.SectionsToRename.find(Sec.Name); if (Iter != Config.SectionsToRename.end()) { const SectionRename &SR = Iter->second; @@ -624,63 +622,62 @@ static Error handleArgs(const CopyConfig &Config, Object &Obj, if (SR.NewFlags.hasValue()) setSectionFlagsAndType(Sec, SR.NewFlags.getValue()); } + } + } - // Add a prefix to allocated sections and their relocation sections. This - // should be done after renaming the section by Config.SectionToRename to - // imitate the GNU objcopy behavior. 
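The replaceAndRemoveSections logic above grows the removal condition by capturing the previous predicate inside a new lambda rather than rebuilding one large condition. A reduced, illustrative sketch of that chaining pattern (not the patch's own types):

#include <functional>
#include <string>

using SectionPred = std::function<bool(const std::string &)>;

// Wrap an existing predicate so that .debug* sections are also removed.
static SectionPred alsoRemoveDebugSections(SectionPred Prev) {
  return [Prev](const std::string &Name) {
    return Prev(Name) || Name.compare(0, 6, ".debug") == 0;
  };
}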
- if (!Config.AllocSectionsPrefix.empty()) { - if (Sec.Flags & SHF_ALLOC) { - Sec.Name = (Config.AllocSectionsPrefix + Sec.Name).str(); - PrefixedSections.insert(&Sec); - - // Rename relocation sections associated to the allocated sections. - // For example, if we rename .text to .prefix.text, we also rename - // .rel.text to .rel.prefix.text. - // - // Dynamic relocation sections (SHT_REL[A] with SHF_ALLOC) are handled - // above, e.g., .rela.plt is renamed to .prefix.rela.plt, not - // .rela.prefix.plt since GNU objcopy does so. - } else if (auto *RelocSec = dyn_cast(&Sec)) { - auto *TargetSec = RelocSec->getSection(); - if (TargetSec && (TargetSec->Flags & SHF_ALLOC)) { - StringRef prefix; - switch (Sec.Type) { - case SHT_REL: - prefix = ".rel"; - break; - case SHT_RELA: - prefix = ".rela"; - break; - default: - continue; - } - - // If the relocation section comes *after* the target section, we - // don't add Config.AllocSectionsPrefix because we've already added - // the prefix to TargetSec->Name. Otherwise, if the relocation - // section comes *before* the target section, we add the prefix. - if (PrefixedSections.count(TargetSec)) { - Sec.Name = (prefix + TargetSec->Name).str(); - } else { - const auto Iter = Config.SectionsToRename.find(TargetSec->Name); - if (Iter != Config.SectionsToRename.end()) { - // Both `--rename-section` and `--prefix-alloc-sections` are - // given but the target section is not yet renamed. - Sec.Name = - (prefix + Config.AllocSectionsPrefix + Iter->second.NewName) - .str(); - } else { - Sec.Name = - (prefix + Config.AllocSectionsPrefix + TargetSec->Name) - .str(); - } - } + // Add a prefix to allocated sections and their relocation sections. This + // should be done after renaming the section by Config.SectionToRename to + // imitate the GNU objcopy behavior. + if (!Config.AllocSectionsPrefix.empty()) { + DenseSet PrefixedSections; + for (SectionBase &Sec : Obj.sections()) { + if (Sec.Flags & SHF_ALLOC) { + Sec.Name = (Config.AllocSectionsPrefix + Sec.Name).str(); + PrefixedSections.insert(&Sec); + } else if (auto *RelocSec = dyn_cast(&Sec)) { + // Rename relocation sections associated to the allocated sections. + // For example, if we rename .text to .prefix.text, we also rename + // .rel.text to .rel.prefix.text. + // + // Dynamic relocation sections (SHT_REL[A] with SHF_ALLOC) are handled + // above, e.g., .rela.plt is renamed to .prefix.rela.plt, not + // .rela.prefix.plt since GNU objcopy does so. + const SectionBase *TargetSec = RelocSec->getSection(); + if (TargetSec && (TargetSec->Flags & SHF_ALLOC)) { + StringRef prefix; + switch (Sec.Type) { + case SHT_REL: + prefix = ".rel"; + break; + case SHT_RELA: + prefix = ".rela"; + break; + default: + llvm_unreachable("not a relocation section"); } + + // If the relocation section comes *after* the target section, we + // don't add Config.AllocSectionsPrefix because we've already added + // the prefix to TargetSec->Name. Otherwise, if the relocation + // section comes *before* the target section, we add the prefix. 
+ if (PrefixedSections.count(TargetSec)) + Sec.Name = (prefix + TargetSec->Name).str(); + else + Sec.Name = + (prefix + Config.AllocSectionsPrefix + TargetSec->Name).str(); } } } } + if (!Config.SetSectionAlignment.empty()) { + for (SectionBase &Sec : Obj.sections()) { + auto I = Config.SetSectionAlignment.find(Sec.Name); + if (I != Config.SetSectionAlignment.end()) + Sec.Align = I->second; + } + } + if (!Config.SetSectionFlags.empty()) { for (auto &Sec : Obj.sections()) { const auto Iter = Config.SetSectionFlags.find(Sec.Name); @@ -721,7 +718,7 @@ static Error handleArgs(const CopyConfig &Config, Object &Obj, Obj.addSection(Config.AddGnuDebugLink, Config.GnuDebugLinkCRC32); - for (const NewSymbolInfo &SI : Config.SymbolsToAdd) { + for (const NewSymbolInfo &SI : Config.ELF->SymbolsToAdd) { SectionBase *Sec = Obj.findSection(SI.SectionName); uint64_t Value = Sec ? Sec->Addr + SI.Value : SI.Value; Obj.SymbolTable->addSymbol( @@ -746,9 +743,9 @@ static Error writeOutput(const CopyConfig &Config, Object &Obj, Buffer &Out, Error executeObjcopyOnIHex(const CopyConfig &Config, MemoryBuffer &In, Buffer &Out) { IHexReader Reader(&In); - std::unique_ptr Obj = Reader.create(); + std::unique_ptr Obj = Reader.create(true); const ElfType OutputElfType = - getOutputElfType(Config.OutputArch.getValueOr(Config.BinaryArch)); + getOutputElfType(Config.OutputArch.getValueOr(MachineInfo())); if (Error E = handleArgs(Config, *Obj, Reader, OutputElfType)) return E; return writeOutput(Config, *Obj, Out, OutputElfType); @@ -756,13 +753,15 @@ Error executeObjcopyOnIHex(const CopyConfig &Config, MemoryBuffer &In, Error executeObjcopyOnRawBinary(const CopyConfig &Config, MemoryBuffer &In, Buffer &Out) { - BinaryReader Reader(Config.BinaryArch, &In); - std::unique_ptr Obj = Reader.create(); + uint8_t NewSymbolVisibility = + Config.ELF->NewSymbolVisibility.getValueOr((uint8_t)ELF::STV_DEFAULT); + BinaryReader Reader(&In, NewSymbolVisibility); + std::unique_ptr Obj = Reader.create(true); // Prefer OutputArch (-O) if set, otherwise fallback to BinaryArch // (-B). const ElfType OutputElfType = - getOutputElfType(Config.OutputArch.getValueOr(Config.BinaryArch)); + getOutputElfType(Config.OutputArch.getValueOr(MachineInfo())); if (Error E = handleArgs(Config, *Obj, Reader, OutputElfType)) return E; return writeOutput(Config, *Obj, Out, OutputElfType); @@ -771,7 +770,7 @@ Error executeObjcopyOnRawBinary(const CopyConfig &Config, MemoryBuffer &In, Error executeObjcopyOnBinary(const CopyConfig &Config, object::ELFObjectFileBase &In, Buffer &Out) { ELFReader Reader(&In, Config.ExtractPartition); - std::unique_ptr Obj = Reader.create(); + std::unique_ptr Obj = Reader.create(!Config.SymbolsToAdd.empty()); // Prefer OutputArch (-O) if set, otherwise infer it from the input. const ElfType OutputElfType = Config.OutputArch ? 
getOutputElfType(Config.OutputArch.getValue()) diff --git a/tools/llvm-objcopy/ELF/Object.cpp b/tools/llvm-objcopy/ELF/Object.cpp index fa696380e17c..74145dad6e6b 100644 --- a/tools/llvm-objcopy/ELF/Object.cpp +++ b/tools/llvm-objcopy/ELF/Object.cpp @@ -397,7 +397,7 @@ void SectionWriter::visit(const OwnedDataSection &Sec) { llvm::copy(Sec.Data, Out.getBufferStart() + Sec.Offset); } -static const std::vector ZlibGnuMagic = {'Z', 'L', 'I', 'B'}; +static constexpr std::array ZlibGnuMagic = {{'Z', 'L', 'I', 'B'}}; static bool isDataGnuCompressed(ArrayRef Data) { return Data.size() > ZlibGnuMagic.size() && @@ -665,7 +665,7 @@ void SymbolTableSection::addSymbol(Twine Name, uint8_t Bind, uint8_t Type, Sym.Visibility = Visibility; Sym.Size = SymbolSize; Sym.Index = Symbols.size(); - Symbols.emplace_back(llvm::make_unique(Sym)); + Symbols.emplace_back(std::make_unique(Sym)); Size += this->EntrySize; } @@ -1055,29 +1055,28 @@ void GroupSection::accept(MutableSectionVisitor &Visitor) { } // Returns true IFF a section is wholly inside the range of a segment -static bool sectionWithinSegment(const SectionBase &Section, - const Segment &Segment) { +static bool sectionWithinSegment(const SectionBase &Sec, const Segment &Seg) { // If a section is empty it should be treated like it has a size of 1. This is // to clarify the case when an empty section lies on a boundary between two // segments and ensures that the section "belongs" to the second segment and // not the first. - uint64_t SecSize = Section.Size ? Section.Size : 1; + uint64_t SecSize = Sec.Size ? Sec.Size : 1; - if (Section.Type == SHT_NOBITS) { - if (!(Section.Flags & SHF_ALLOC)) + if (Sec.Type == SHT_NOBITS) { + if (!(Sec.Flags & SHF_ALLOC)) return false; - bool SectionIsTLS = Section.Flags & SHF_TLS; - bool SegmentIsTLS = Segment.Type == PT_TLS; + bool SectionIsTLS = Sec.Flags & SHF_TLS; + bool SegmentIsTLS = Seg.Type == PT_TLS; if (SectionIsTLS != SegmentIsTLS) return false; - return Segment.VAddr <= Section.Addr && - Segment.VAddr + Segment.MemSize >= Section.Addr + SecSize; + return Seg.VAddr <= Sec.Addr && + Seg.VAddr + Seg.MemSize >= Sec.Addr + SecSize; } - return Segment.Offset <= Section.OriginalOffset && - Segment.Offset + Segment.FileSize >= Section.OriginalOffset + SecSize; + return Seg.Offset <= Sec.OriginalOffset && + Seg.Offset + Seg.FileSize >= Sec.OriginalOffset + SecSize; } // Returns true IFF a segment's original offset is inside of another segment's @@ -1113,7 +1112,7 @@ void BasicELFBuilder::initFileHeader() { Obj->OSABI = ELFOSABI_NONE; Obj->ABIVersion = 0; Obj->Entry = 0x0; - Obj->Machine = EMachine; + Obj->Machine = EM_NONE; Obj->Version = 1; } @@ -1141,8 +1140,8 @@ SymbolTableSection *BasicELFBuilder::addSymTab(StringTableSection *StrTab) { } void BasicELFBuilder::initSections() { - for (auto &Section : Obj->sections()) - Section.initialize(Obj->sections()); + for (SectionBase &Sec : Obj->sections()) + Sec.initialize(Obj->sections()); } void BinaryELFBuilder::addData(SymbolTableSection *SymTab) { @@ -1161,11 +1160,12 @@ void BinaryELFBuilder::addData(SymbolTableSection *SymTab) { Twine Prefix = Twine("_binary_") + SanitizedFilename; SymTab->addSymbol(Prefix + "_start", STB_GLOBAL, STT_NOTYPE, &DataSection, - /*Value=*/0, STV_DEFAULT, 0, 0); + /*Value=*/0, NewSymbolVisibility, 0, 0); SymTab->addSymbol(Prefix + "_end", STB_GLOBAL, STT_NOTYPE, &DataSection, - /*Value=*/DataSection.Size, STV_DEFAULT, 0, 0); + /*Value=*/DataSection.Size, NewSymbolVisibility, 0, 0); SymTab->addSymbol(Prefix + "_size", STB_GLOBAL, 
STT_NOTYPE, nullptr, - /*Value=*/DataSection.Size, STV_DEFAULT, SHN_ABS, 0); + /*Value=*/DataSection.Size, NewSymbolVisibility, SHN_ABS, + 0); } std::unique_ptr BinaryELFBuilder::build() { @@ -1255,10 +1255,9 @@ template void ELFBuilder::findEhdrOffset() { if (!ExtractPartition) return; - for (const SectionBase &Section : Obj.sections()) { - if (Section.Type == SHT_LLVM_PART_EHDR && - Section.Name == *ExtractPartition) { - EhdrOffset = Section.Offset; + for (const SectionBase &Sec : Obj.sections()) { + if (Sec.Type == SHT_LLVM_PART_EHDR && Sec.Name == *ExtractPartition) { + EhdrOffset = Sec.Offset; return; } } @@ -1287,15 +1286,12 @@ void ELFBuilder::readProgramHeaders(const ELFFile &HeadersFile) { Seg.MemSize = Phdr.p_memsz; Seg.Align = Phdr.p_align; Seg.Index = Index++; - for (SectionBase &Section : Obj.sections()) { - if (sectionWithinSegment(Section, Seg)) { - Seg.addSection(&Section); - if (!Section.ParentSegment || - Section.ParentSegment->Offset > Seg.Offset) { - Section.ParentSegment = &Seg; - } + for (SectionBase &Sec : Obj.sections()) + if (sectionWithinSegment(Sec, Seg)) { + Seg.addSection(&Sec); + if (!Sec.ParentSegment || Sec.ParentSegment->Offset > Seg.Offset) + Sec.ParentSegment = &Seg; } - } } auto &ElfHdr = Obj.ElfHdrSegment; @@ -1531,7 +1527,7 @@ template void ELFBuilder::readSectionHeaders() { } } -template void ELFBuilder::readSections() { +template void ELFBuilder::readSections(bool EnsureSymtab) { // If a section index table exists we'll need to initialize it before we // initialize the symbol table because the symbol table might need to // reference it. @@ -1544,16 +1540,37 @@ template void ELFBuilder::readSections() { if (Obj.SymbolTable) { Obj.SymbolTable->initialize(Obj.sections()); initSymbolTable(Obj.SymbolTable); + } else if (EnsureSymtab) { + // Reuse the existing SHT_STRTAB section if exists. + StringTableSection *StrTab = nullptr; + for (auto &Sec : Obj.sections()) { + if (Sec.Type == ELF::SHT_STRTAB && !(Sec.Flags & SHF_ALLOC)) { + StrTab = static_cast(&Sec); + + // Prefer .strtab to .shstrtab. + if (Obj.SectionNames != &Sec) + break; + } + } + if (!StrTab) + StrTab = &Obj.addSection(); + + SymbolTableSection &SymTab = Obj.addSection(); + SymTab.Name = ".symtab"; + SymTab.Link = StrTab->Index; + SymTab.initialize(Obj.sections()); + SymTab.addSymbol("", 0, 0, nullptr, 0, 0, 0, 0); + Obj.SymbolTable = &SymTab; } // Now that all sections and symbols have been added we can add // relocations that reference symbols and set the link and info fields for // relocation sections. 
- for (auto &Section : Obj.sections()) { - if (&Section == Obj.SymbolTable) + for (auto &Sec : Obj.sections()) { + if (&Sec == Obj.SymbolTable) continue; - Section.initialize(Obj.sections()); - if (auto RelSec = dyn_cast(&Section)) { + Sec.initialize(Obj.sections()); + if (auto RelSec = dyn_cast(&Sec)) { auto Shdr = unwrapOrError(ElfFile.sections()).begin() + RelSec->Index; if (RelSec->Type == SHT_REL) initRelocations(RelSec, Obj.SymbolTable, @@ -1561,7 +1578,7 @@ template void ELFBuilder::readSections() { else initRelocations(RelSec, Obj.SymbolTable, unwrapOrError(ElfFile.relas(Shdr))); - } else if (auto GroupSec = dyn_cast(&Section)) { + } else if (auto GroupSec = dyn_cast(&Sec)) { initGroupSection(GroupSec); } } @@ -1582,7 +1599,7 @@ template void ELFBuilder::readSections() { " is not a string table"); } -template void ELFBuilder::build() { +template void ELFBuilder::build(bool EnsureSymtab) { readSectionHeaders(); findEhdrOffset(); @@ -1601,7 +1618,7 @@ template void ELFBuilder::build() { Obj.Entry = Ehdr.e_entry; Obj.Flags = Ehdr.e_flags; - readSections(); + readSections(EnsureSymtab); readProgramHeaders(HeadersFile); } @@ -1609,8 +1626,8 @@ Writer::~Writer() {} Reader::~Reader() {} -std::unique_ptr BinaryReader::create() const { - return BinaryELFBuilder(MInfo.EMachine, MemBuf).build(); +std::unique_ptr BinaryReader::create(bool /*EnsureSymtab*/) const { + return BinaryELFBuilder(MemBuf, NewSymbolVisibility).build(); } Expected> IHexReader::parse() const { @@ -1639,28 +1656,28 @@ Expected> IHexReader::parse() const { return std::move(Records); } -std::unique_ptr IHexReader::create() const { +std::unique_ptr IHexReader::create(bool /*EnsureSymtab*/) const { std::vector Records = unwrapOrError(parse()); return IHexELFBuilder(Records).build(); } -std::unique_ptr ELFReader::create() const { - auto Obj = llvm::make_unique(); +std::unique_ptr ELFReader::create(bool EnsureSymtab) const { + auto Obj = std::make_unique(); if (auto *O = dyn_cast>(Bin)) { ELFBuilder Builder(*O, *Obj, ExtractPartition); - Builder.build(); + Builder.build(EnsureSymtab); return Obj; } else if (auto *O = dyn_cast>(Bin)) { ELFBuilder Builder(*O, *Obj, ExtractPartition); - Builder.build(); + Builder.build(EnsureSymtab); return Obj; } else if (auto *O = dyn_cast>(Bin)) { ELFBuilder Builder(*O, *Obj, ExtractPartition); - Builder.build(); + Builder.build(EnsureSymtab); return Obj; } else if (auto *O = dyn_cast>(Bin)) { ELFBuilder Builder(*O, *Obj, ExtractPartition); - Builder.build(); + Builder.build(EnsureSymtab); return Obj; } error("invalid file type"); @@ -1693,7 +1710,7 @@ template void ELFWriter::writeEhdr() { Ehdr.e_ehsize = sizeof(Elf_Ehdr); if (WriteSectionHeaders && Obj.sections().size() != 0) { Ehdr.e_shentsize = sizeof(Elf_Shdr); - Ehdr.e_shoff = Obj.SHOffset; + Ehdr.e_shoff = Obj.SHOff; // """ // If the number of sections is greater than or equal to // SHN_LORESERVE (0xff00), this member has the value zero and the actual @@ -1732,7 +1749,7 @@ template void ELFWriter::writeShdrs() { // This reference serves to write the dummy section header at the begining // of the file. It is not used for anything else Elf_Shdr &Shdr = - *reinterpret_cast(Buf.getBufferStart() + Obj.SHOffset); + *reinterpret_cast(Buf.getBufferStart() + Obj.SHOff); Shdr.sh_name = 0; Shdr.sh_type = SHT_NULL; Shdr.sh_flags = 0; @@ -1862,26 +1879,13 @@ void Object::sortSections() { }); } -static uint64_t alignToAddr(uint64_t Offset, uint64_t Addr, uint64_t Align) { - // Calculate Diff such that (Offset + Diff) & -Align == Addr & -Align. 
- if (Align == 0) - Align = 1; - auto Diff = - static_cast(Addr % Align) - static_cast(Offset % Align); - // We only want to add to Offset, however, so if Diff < 0 we can add Align and - // (Offset + Diff) & -Align == Addr & -Align will still hold. - if (Diff < 0) - Diff += Align; - return Offset + Diff; -} - // Orders segments such that if x = y->ParentSegment then y comes before x. static void orderSegments(std::vector &Segments) { llvm::stable_sort(Segments, compareSegmentsByOffset); } // This function finds a consistent layout for a list of segments starting from -// an Offset. It assumes that Segments have been sorted by OrderSegments and +// an Offset. It assumes that Segments have been sorted by orderSegments and // returns an Offset one past the end of the last segment. static uint64_t layoutSegments(std::vector &Segments, uint64_t Offset) { @@ -1902,8 +1906,8 @@ static uint64_t layoutSegments(std::vector &Segments, Seg->Offset = Parent->Offset + Seg->OriginalOffset - Parent->OriginalOffset; } else { - Offset = alignToAddr(Offset, Seg->VAddr, Seg->Align); - Seg->Offset = Offset; + Seg->Offset = + alignTo(Offset, std::max(Seg->Align, 1), Seg->VAddr); } Offset = std::max(Offset, Seg->Offset + Seg->FileSize); } @@ -1925,17 +1929,17 @@ static uint64_t layoutSections(Range Sections, uint64_t Offset) { // of the segment we can assign a new offset to the section. For sections not // covered by segments we can just bump Offset to the next valid location. uint32_t Index = 1; - for (auto &Section : Sections) { - Section.Index = Index++; - if (Section.ParentSegment != nullptr) { - auto Segment = *Section.ParentSegment; - Section.Offset = - Segment.Offset + (Section.OriginalOffset - Segment.OriginalOffset); + for (auto &Sec : Sections) { + Sec.Index = Index++; + if (Sec.ParentSegment != nullptr) { + auto Segment = *Sec.ParentSegment; + Sec.Offset = + Segment.Offset + (Sec.OriginalOffset - Segment.OriginalOffset); } else { - Offset = alignTo(Offset, Section.Align == 0 ? 1 : Section.Align); - Section.Offset = Offset; - if (Section.Type != SHT_NOBITS) - Offset += Section.Size; + Offset = alignTo(Offset, Sec.Align == 0 ? 1 : Sec.Align); + Sec.Offset = Offset; + if (Sec.Type != SHT_NOBITS) + Offset += Sec.Size; } } return Offset; @@ -1971,16 +1975,16 @@ template void ELFWriter::assignOffsets() { // Offset so that SHOffset is valid. if (WriteSectionHeaders) Offset = alignTo(Offset, sizeof(Elf_Addr)); - Obj.SHOffset = Offset; + Obj.SHOff = Offset; } template size_t ELFWriter::totalSize() const { // We already have the section header offset so we can calculate the total // size by just adding up the size of each section header. if (!WriteSectionHeaders) - return Obj.SHOffset; + return Obj.SHOff; size_t ShdrCount = Obj.sections().size() + 1; // Includes null shdr. - return Obj.SHOffset + ShdrCount * sizeof(Elf_Shdr); + return Obj.SHOff + ShdrCount * sizeof(Elf_Shdr); } template Error ELFWriter::write() { @@ -1995,6 +1999,25 @@ template Error ELFWriter::write() { return Buf.commit(); } +static Error removeUnneededSections(Object &Obj) { + // We can remove an empty symbol table from non-relocatable objects. + // Relocatable objects typically have relocation sections whose + // sh_link field points to .symtab, so we can't remove .symtab + // even if it is empty. + if (Obj.isRelocatable() || Obj.SymbolTable == nullptr || + !Obj.SymbolTable->empty()) + return Error::success(); + + // .strtab can be used for section names. In such a case we shouldn't + // remove it. 
+ auto *StrTab = Obj.SymbolTable->getStrTab() == Obj.SectionNames + ? nullptr + : Obj.SymbolTable->getStrTab(); + return Obj.removeSections(false, [&](const SectionBase &Sec) { + return &Sec == Obj.SymbolTable || &Sec == StrTab; + }); +} + template Error ELFWriter::finalize() { // It could happen that SectionNames has been removed and yet the user wants // a section header table output. We need to throw an error if a user tries @@ -2004,6 +2027,8 @@ template Error ELFWriter::finalize() { "cannot write section header table because " "section header string table was removed"); + if (Error E = removeUnneededSections(Obj)) + return E; Obj.sortSections(); // We need to assign indexes before we perform layout because we need to know @@ -2045,9 +2070,8 @@ template Error ELFWriter::finalize() { // Make sure we add the names of all the sections. Importantly this must be // done after we decide to add or remove SectionIndexes. if (Obj.SectionNames != nullptr) - for (const auto &Section : Obj.sections()) { - Obj.SectionNames->addString(Section.Name); - } + for (const SectionBase &Sec : Obj.sections()) + Obj.SectionNames->addString(Sec.Name); initEhdrSegment(); @@ -2055,8 +2079,8 @@ template Error ELFWriter::finalize() { // Also, the output arch may not be the same as the input arch, so fix up // size-related fields before doing layout calculations. uint64_t Index = 0; - auto SecSizer = llvm::make_unique>(); - for (auto &Sec : Obj.sections()) { + auto SecSizer = std::make_unique>(); + for (SectionBase &Sec : Obj.sections()) { Sec.Index = Index++; Sec.accept(*SecSizer); } @@ -2082,40 +2106,36 @@ template Error ELFWriter::finalize() { // Finally now that all offsets and indexes have been set we can finalize any // remaining issues. - uint64_t Offset = Obj.SHOffset + sizeof(Elf_Shdr); - for (SectionBase &Section : Obj.sections()) { - Section.HeaderOffset = Offset; + uint64_t Offset = Obj.SHOff + sizeof(Elf_Shdr); + for (SectionBase &Sec : Obj.sections()) { + Sec.HeaderOffset = Offset; Offset += sizeof(Elf_Shdr); if (WriteSectionHeaders) - Section.NameIndex = Obj.SectionNames->findIndex(Section.Name); - Section.finalize(); + Sec.NameIndex = Obj.SectionNames->findIndex(Sec.Name); + Sec.finalize(); } if (Error E = Buf.allocate(totalSize())) return E; - SecWriter = llvm::make_unique>(Buf); + SecWriter = std::make_unique>(Buf); return Error::success(); } Error BinaryWriter::write() { - for (auto &Section : Obj.sections()) - if (Section.Flags & SHF_ALLOC) - Section.accept(*SecWriter); + for (const SectionBase &Sec : Obj.allocSections()) + Sec.accept(*SecWriter); return Buf.commit(); } Error BinaryWriter::finalize() { - // TODO: Create a filter range to construct OrderedSegments from so that this - // code can be deduped with assignOffsets above. This should also solve the - // todo below for LayoutSections. // We need a temporary list of segments that has a special order to it // so that we know that anytime ->ParentSegment is set that segment has // already had it's offset properly set. We only want to consider the segments // that will affect layout of allocated sections so we only add those. 
std::vector OrderedSegments; - for (SectionBase &Section : Obj.sections()) - if ((Section.Flags & SHF_ALLOC) != 0 && Section.ParentSegment != nullptr) - OrderedSegments.push_back(Section.ParentSegment); + for (const SectionBase &Sec : Obj.allocSections()) + if (Sec.ParentSegment != nullptr) + OrderedSegments.push_back(Sec.ParentSegment); // For binary output, we're going to use physical addresses instead of // virtual addresses, since a binary output is used for cases like ROM @@ -2130,7 +2150,7 @@ Error BinaryWriter::finalize() { llvm::stable_sort(OrderedSegments, compareSegmentsByPAddr); // Because we add a ParentSegment for each section we might have duplicate - // segments in OrderedSegments. If there were duplicates then LayoutSegments + // segments in OrderedSegments. If there were duplicates then layoutSegments // would do very strange things. auto End = std::unique(std::begin(OrderedSegments), std::end(OrderedSegments)); @@ -2158,28 +2178,20 @@ Error BinaryWriter::finalize() { } } - // TODO: generalize LayoutSections to take a range. Pass a special range - // constructed from an iterator that skips values for which a predicate does - // not hold. Then pass such a range to LayoutSections instead of constructing - // AllocatedSections here. - std::vector AllocatedSections; - for (SectionBase &Section : Obj.sections()) - if (Section.Flags & SHF_ALLOC) - AllocatedSections.push_back(&Section); - layoutSections(make_pointee_range(AllocatedSections), Offset); + layoutSections(Obj.allocSections(), Offset); // Now that every section has been laid out we just need to compute the total // file size. This might not be the same as the offset returned by - // LayoutSections, because we want to truncate the last segment to the end of + // layoutSections, because we want to truncate the last segment to the end of // its last section, to match GNU objcopy's behaviour. TotalSize = 0; - for (SectionBase *Section : AllocatedSections) - if (Section->Type != SHT_NOBITS) - TotalSize = std::max(TotalSize, Section->Offset + Section->Size); + for (const SectionBase &Sec : Obj.allocSections()) + if (Sec.Type != SHT_NOBITS) + TotalSize = std::max(TotalSize, Sec.Offset + Sec.Size); if (Error E = Buf.allocate(TotalSize)) return E; - SecWriter = llvm::make_unique(Buf); + SecWriter = std::make_unique(Buf); return Error::success(); } @@ -2259,17 +2271,17 @@ Error IHexWriter::finalize() { // If any section we're to write has segment then we // switch to using physical addresses. Otherwise we // use section virtual address. 
- for (auto &Section : Obj.sections()) - if (ShouldWrite(Section) && IsInPtLoad(Section)) { + for (const SectionBase &Sec : Obj.sections()) + if (ShouldWrite(Sec) && IsInPtLoad(Sec)) { UseSegments = true; break; } - for (auto &Section : Obj.sections()) - if (ShouldWrite(Section) && (!UseSegments || IsInPtLoad(Section))) { - if (Error E = checkSection(Section)) + for (const SectionBase &Sec : Obj.sections()) + if (ShouldWrite(Sec) && (!UseSegments || IsInPtLoad(Sec))) { + if (Error E = checkSection(Sec)) return E; - Sections.insert(&Section); + Sections.insert(&Sec); } IHexSectionWriterBase LengthCalc(Buf); diff --git a/tools/llvm-objcopy/ELF/Object.h b/tools/llvm-objcopy/ELF/Object.h index f3df93b9662f..eeacb014e4dc 100644 --- a/tools/llvm-objcopy/ELF/Object.h +++ b/tools/llvm-objcopy/ELF/Object.h @@ -57,8 +57,8 @@ public: : Sections(Secs) {} SectionTableRef(const SectionTableRef &) = default; - iterator begin() { return iterator(Sections.data()); } - iterator end() { return iterator(Sections.data() + Sections.size()); } + iterator begin() const { return iterator(Sections.data()); } + iterator end() const { return iterator(Sections.data() + Sections.size()); } size_t size() const { return Sections.size(); } SectionBase *getSection(uint32_t Index, Twine ErrMsg); @@ -863,7 +863,7 @@ public: class Reader { public: virtual ~Reader(); - virtual std::unique_ptr create() const = 0; + virtual std::unique_ptr create(bool EnsureSymtab) const = 0; }; using object::Binary; @@ -873,7 +873,6 @@ using object::OwningBinary; class BasicELFBuilder { protected: - uint16_t EMachine; std::unique_ptr Obj; void initFileHeader(); @@ -883,17 +882,18 @@ protected: void initSections(); public: - BasicELFBuilder(uint16_t EM) - : EMachine(EM), Obj(llvm::make_unique()) {} + BasicELFBuilder() : Obj(std::make_unique()) {} }; class BinaryELFBuilder : public BasicELFBuilder { MemoryBuffer *MemBuf; + uint8_t NewSymbolVisibility; void addData(SymbolTableSection *SymTab); public: - BinaryELFBuilder(uint16_t EM, MemoryBuffer *MB) - : BasicELFBuilder(EM), MemBuf(MB) {} + BinaryELFBuilder(MemoryBuffer *MB, uint8_t NewSymbolVisibility) + : BasicELFBuilder(), MemBuf(MB), + NewSymbolVisibility(NewSymbolVisibility) {} std::unique_ptr build(); }; @@ -905,7 +905,7 @@ class IHexELFBuilder : public BasicELFBuilder { public: IHexELFBuilder(const std::vector &Records) - : BasicELFBuilder(ELF::EM_386), Records(Records) {} + : BasicELFBuilder(), Records(Records) {} std::unique_ptr build(); }; @@ -926,7 +926,7 @@ private: void initGroupSection(GroupSection *GroupSec); void initSymbolTable(SymbolTableSection *SymTab); void readSectionHeaders(); - void readSections(); + void readSections(bool EnsureSymtab); void findEhdrOffset(); SectionBase &makeSection(const Elf_Shdr &Shdr); @@ -936,17 +936,17 @@ public: : ElfFile(*ElfObj.getELFFile()), Obj(Obj), ExtractPartition(ExtractPartition) {} - void build(); + void build(bool EnsureSymtab); }; class BinaryReader : public Reader { - const MachineInfo &MInfo; MemoryBuffer *MemBuf; + uint8_t NewSymbolVisibility; public: - BinaryReader(const MachineInfo &MI, MemoryBuffer *MB) - : MInfo(MI), MemBuf(MB) {} - std::unique_ptr create() const override; + BinaryReader(MemoryBuffer *MB, const uint8_t NewSymbolVisibility) + : MemBuf(MB), NewSymbolVisibility(NewSymbolVisibility) {} + std::unique_ptr create(bool EnsureSymtab) const override; }; class IHexReader : public Reader { @@ -968,7 +968,7 @@ class IHexReader : public Reader { public: IHexReader(MemoryBuffer *MB) : MemBuf(MB) {} - std::unique_ptr create() 
const override; + std::unique_ptr create(bool EnsureSymtab) const override; }; class ELFReader : public Reader { @@ -976,7 +976,7 @@ class ELFReader : public Reader { Optional ExtractPartition; public: - std::unique_ptr create() const override; + std::unique_ptr create(bool EnsureSymtab) const override; explicit ELFReader(Binary *B, Optional ExtractPartition) : Bin(B), ExtractPartition(ExtractPartition) {} }; @@ -990,6 +990,10 @@ private: std::vector Segments; std::vector RemovedSections; + static bool sectionIsAlloc(const SectionBase &Sec) { + return Sec.Flags & ELF::SHF_ALLOC; + }; + public: template using Range = iterator_range< @@ -1011,13 +1015,14 @@ public: uint8_t OSABI; uint8_t ABIVersion; uint64_t Entry; - uint64_t SHOffset; + uint64_t SHOff; uint32_t Type; uint32_t Machine; uint32_t Version; uint32_t Flags; bool HadShdrs = true; + bool MustBeRelocatable = false; StringTableSection *SectionNames = nullptr; SymbolTableSection *SymbolTable = nullptr; SectionIndexSection *SectionIndexTable = nullptr; @@ -1027,6 +1032,13 @@ public: ConstRange sections() const { return make_pointee_range(Sections); } + iterator_range< + filter_iterator::const_iterator>, + decltype(§ionIsAlloc)>> + allocSections() const { + return make_filter_range(make_pointee_range(Sections), sectionIsAlloc); + } + SectionBase *findSection(StringRef Name) { auto SecIt = find_if(Sections, [&](const SecPtr &Sec) { return Sec->Name == Name; }); @@ -1041,16 +1053,20 @@ public: std::function ToRemove); Error removeSymbols(function_ref ToRemove); template T &addSection(Ts &&... Args) { - auto Sec = llvm::make_unique(std::forward(Args)...); + auto Sec = std::make_unique(std::forward(Args)...); auto Ptr = Sec.get(); + MustBeRelocatable |= isa(*Ptr); Sections.emplace_back(std::move(Sec)); Ptr->Index = Sections.size(); return *Ptr; } Segment &addSegment(ArrayRef Data) { - Segments.emplace_back(llvm::make_unique(Data)); + Segments.emplace_back(std::make_unique(Data)); return *Segments.back(); } + bool isRelocatable() const { + return (Type != ELF::ET_DYN && Type != ELF::ET_EXEC) || MustBeRelocatable; + } }; } // end namespace elf diff --git a/tools/llvm-objcopy/MachO/MachOLayoutBuilder.cpp b/tools/llvm-objcopy/MachO/MachOLayoutBuilder.cpp new file mode 100644 index 000000000000..f621f3aa09cf --- /dev/null +++ b/tools/llvm-objcopy/MachO/MachOLayoutBuilder.cpp @@ -0,0 +1,350 @@ +//===- MachOLayoutBuilder.cpp -----------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "MachOLayoutBuilder.h" +#include "llvm/Support/Alignment.h" +#include "llvm/Support/Errc.h" +#include "llvm/Support/ErrorHandling.h" + +namespace llvm { +namespace objcopy { +namespace macho { + +uint32_t MachOLayoutBuilder::computeSizeOfCmds() const { + uint32_t Size = 0; + for (const auto &LC : O.LoadCommands) { + const MachO::macho_load_command &MLC = LC.MachOLoadCommand; + auto cmd = MLC.load_command_data.cmd; + switch (cmd) { + case MachO::LC_SEGMENT: + Size += sizeof(MachO::segment_command) + + sizeof(MachO::section) * LC.Sections.size(); + continue; + case MachO::LC_SEGMENT_64: + Size += sizeof(MachO::segment_command_64) + + sizeof(MachO::section_64) * LC.Sections.size(); + continue; + } + + switch (cmd) { +#define HANDLE_LOAD_COMMAND(LCName, LCValue, LCStruct) \ + case MachO::LCName: \ + Size += sizeof(MachO::LCStruct) + LC.Payload.size(); \ + break; +#include "llvm/BinaryFormat/MachO.def" +#undef HANDLE_LOAD_COMMAND + } + } + + return Size; +} + +void MachOLayoutBuilder::constructStringTable() { + for (std::unique_ptr &Sym : O.SymTable.Symbols) + StrTableBuilder.add(Sym->Name); + StrTableBuilder.finalize(); +} + +void MachOLayoutBuilder::updateSymbolIndexes() { + uint32_t Index = 0; + for (auto &Symbol : O.SymTable.Symbols) + Symbol->Index = Index++; +} + +// Updates the index and the number of local/external/undefined symbols. +void MachOLayoutBuilder::updateDySymTab(MachO::macho_load_command &MLC) { + assert(MLC.load_command_data.cmd == MachO::LC_DYSYMTAB); + // Make sure that nlist entries in the symbol table are sorted by the those + // types. The order is: local < defined external < undefined external. + assert(std::is_sorted(O.SymTable.Symbols.begin(), O.SymTable.Symbols.end(), + [](const std::unique_ptr &A, + const std::unique_ptr &B) { + return (A->isLocalSymbol() && !B->isLocalSymbol()) || + (!A->isUndefinedSymbol() && + B->isUndefinedSymbol()); + }) && + "Symbols are not sorted by their types."); + + uint32_t NumLocalSymbols = 0; + auto Iter = O.SymTable.Symbols.begin(); + auto End = O.SymTable.Symbols.end(); + for (; Iter != End; ++Iter) { + if ((*Iter)->isExternalSymbol()) + break; + + ++NumLocalSymbols; + } + + uint32_t NumExtDefSymbols = 0; + for (; Iter != End; ++Iter) { + if ((*Iter)->isUndefinedSymbol()) + break; + + ++NumExtDefSymbols; + } + + MLC.dysymtab_command_data.ilocalsym = 0; + MLC.dysymtab_command_data.nlocalsym = NumLocalSymbols; + MLC.dysymtab_command_data.iextdefsym = NumLocalSymbols; + MLC.dysymtab_command_data.nextdefsym = NumExtDefSymbols; + MLC.dysymtab_command_data.iundefsym = NumLocalSymbols + NumExtDefSymbols; + MLC.dysymtab_command_data.nundefsym = + O.SymTable.Symbols.size() - (NumLocalSymbols + NumExtDefSymbols); +} + +// Recomputes and updates offset and size fields in load commands and sections +// since they could be modified. +uint64_t MachOLayoutBuilder::layoutSegments() { + auto HeaderSize = + Is64Bit ? sizeof(MachO::mach_header_64) : sizeof(MachO::mach_header); + const bool IsObjectFile = + O.Header.FileType == MachO::HeaderFileType::MH_OBJECT; + uint64_t Offset = IsObjectFile ? 
(HeaderSize + O.Header.SizeOfCmds) : 0; + for (auto &LC : O.LoadCommands) { + auto &MLC = LC.MachOLoadCommand; + StringRef Segname; + uint64_t SegmentVmAddr; + uint64_t SegmentVmSize; + switch (MLC.load_command_data.cmd) { + case MachO::LC_SEGMENT: + SegmentVmAddr = MLC.segment_command_data.vmaddr; + SegmentVmSize = MLC.segment_command_data.vmsize; + Segname = StringRef(MLC.segment_command_data.segname, + strnlen(MLC.segment_command_data.segname, + sizeof(MLC.segment_command_data.segname))); + break; + case MachO::LC_SEGMENT_64: + SegmentVmAddr = MLC.segment_command_64_data.vmaddr; + SegmentVmSize = MLC.segment_command_64_data.vmsize; + Segname = StringRef(MLC.segment_command_64_data.segname, + strnlen(MLC.segment_command_64_data.segname, + sizeof(MLC.segment_command_64_data.segname))); + break; + default: + continue; + } + + if (Segname == "__LINKEDIT") { + // We update the __LINKEDIT segment later (in layoutTail). + assert(LC.Sections.empty() && "__LINKEDIT segment has sections"); + LinkEditLoadCommand = &MLC; + continue; + } + + // Update file offsets and sizes of sections. + uint64_t SegOffset = Offset; + uint64_t SegFileSize = 0; + uint64_t VMSize = 0; + for (auto &Sec : LC.Sections) { + if (IsObjectFile) { + if (Sec.isVirtualSection()) { + Sec.Offset = 0; + } else { + uint64_t PaddingSize = + offsetToAlignment(SegFileSize, Align(1ull << Sec.Align)); + Sec.Offset = SegOffset + SegFileSize + PaddingSize; + Sec.Size = Sec.Content.size(); + SegFileSize += PaddingSize + Sec.Size; + } + VMSize = std::max(VMSize, Sec.Addr + Sec.Size); + } else { + if (Sec.isVirtualSection()) { + Sec.Offset = 0; + VMSize += Sec.Size; + } else { + uint32_t SectOffset = Sec.Addr - SegmentVmAddr; + Sec.Offset = SegOffset + SectOffset; + Sec.Size = Sec.Content.size(); + SegFileSize = std::max(SegFileSize, SectOffset + Sec.Size); + VMSize = std::max(VMSize, SegFileSize); + } + } + } + + if (IsObjectFile) { + Offset += SegFileSize; + } else { + Offset = alignTo(Offset + SegFileSize, PageSize); + SegFileSize = alignTo(SegFileSize, PageSize); + // Use the original vmsize if the segment is __PAGEZERO. + VMSize = + Segname == "__PAGEZERO" ? SegmentVmSize : alignTo(VMSize, PageSize); + } + + switch (MLC.load_command_data.cmd) { + case MachO::LC_SEGMENT: + MLC.segment_command_data.cmdsize = + sizeof(MachO::segment_command) + + sizeof(MachO::section) * LC.Sections.size(); + MLC.segment_command_data.nsects = LC.Sections.size(); + MLC.segment_command_data.fileoff = SegOffset; + MLC.segment_command_data.vmsize = VMSize; + MLC.segment_command_data.filesize = SegFileSize; + break; + case MachO::LC_SEGMENT_64: + MLC.segment_command_64_data.cmdsize = + sizeof(MachO::segment_command_64) + + sizeof(MachO::section_64) * LC.Sections.size(); + MLC.segment_command_64_data.nsects = LC.Sections.size(); + MLC.segment_command_64_data.fileoff = SegOffset; + MLC.segment_command_64_data.vmsize = VMSize; + MLC.segment_command_64_data.filesize = SegFileSize; + break; + } + } + + return Offset; +} + +uint64_t MachOLayoutBuilder::layoutRelocations(uint64_t Offset) { + for (auto &LC : O.LoadCommands) + for (auto &Sec : LC.Sections) { + Sec.RelOff = Sec.Relocations.empty() ? 
0 : Offset; + Sec.NReloc = Sec.Relocations.size(); + Offset += sizeof(MachO::any_relocation_info) * Sec.NReloc; + } + + return Offset; +} + +Error MachOLayoutBuilder::layoutTail(uint64_t Offset) { + // The order of LINKEDIT elements is as follows: + // rebase info, binding info, weak binding info, lazy binding info, export + // trie, data-in-code, symbol table, indirect symbol table, symbol table + // strings. + uint64_t NListSize = Is64Bit ? sizeof(MachO::nlist_64) : sizeof(MachO::nlist); + uint64_t StartOfLinkEdit = Offset; + uint64_t StartOfRebaseInfo = StartOfLinkEdit; + uint64_t StartOfBindingInfo = StartOfRebaseInfo + O.Rebases.Opcodes.size(); + uint64_t StartOfWeakBindingInfo = StartOfBindingInfo + O.Binds.Opcodes.size(); + uint64_t StartOfLazyBindingInfo = + StartOfWeakBindingInfo + O.WeakBinds.Opcodes.size(); + uint64_t StartOfExportTrie = + StartOfLazyBindingInfo + O.LazyBinds.Opcodes.size(); + uint64_t StartOfFunctionStarts = StartOfExportTrie + O.Exports.Trie.size(); + uint64_t StartOfDataInCode = + StartOfFunctionStarts + O.FunctionStarts.Data.size(); + uint64_t StartOfSymbols = StartOfDataInCode + O.DataInCode.Data.size(); + uint64_t StartOfIndirectSymbols = + StartOfSymbols + NListSize * O.SymTable.Symbols.size(); + uint64_t StartOfSymbolStrings = + StartOfIndirectSymbols + + sizeof(uint32_t) * O.IndirectSymTable.Symbols.size(); + uint64_t LinkEditSize = + (StartOfSymbolStrings + StrTableBuilder.getSize()) - StartOfLinkEdit; + + // Now we have determined the layout of the contents of the __LINKEDIT + // segment. Update its load command. + if (LinkEditLoadCommand) { + MachO::macho_load_command *MLC = LinkEditLoadCommand; + switch (LinkEditLoadCommand->load_command_data.cmd) { + case MachO::LC_SEGMENT: + MLC->segment_command_data.cmdsize = sizeof(MachO::segment_command); + MLC->segment_command_data.fileoff = StartOfLinkEdit; + MLC->segment_command_data.vmsize = alignTo(LinkEditSize, PageSize); + MLC->segment_command_data.filesize = LinkEditSize; + break; + case MachO::LC_SEGMENT_64: + MLC->segment_command_64_data.cmdsize = sizeof(MachO::segment_command_64); + MLC->segment_command_64_data.fileoff = StartOfLinkEdit; + MLC->segment_command_64_data.vmsize = alignTo(LinkEditSize, PageSize); + MLC->segment_command_64_data.filesize = LinkEditSize; + break; + } + } + + for (auto &LC : O.LoadCommands) { + auto &MLC = LC.MachOLoadCommand; + auto cmd = MLC.load_command_data.cmd; + switch (cmd) { + case MachO::LC_SYMTAB: + MLC.symtab_command_data.symoff = StartOfSymbols; + MLC.symtab_command_data.nsyms = O.SymTable.Symbols.size(); + MLC.symtab_command_data.stroff = StartOfSymbolStrings; + MLC.symtab_command_data.strsize = StrTableBuilder.getSize(); + break; + case MachO::LC_DYSYMTAB: { + if (MLC.dysymtab_command_data.ntoc != 0 || + MLC.dysymtab_command_data.nmodtab != 0 || + MLC.dysymtab_command_data.nextrefsyms != 0 || + MLC.dysymtab_command_data.nlocrel != 0 || + MLC.dysymtab_command_data.nextrel != 0) + return createStringError(llvm::errc::not_supported, + "shared library is not yet supported"); + + if (!O.IndirectSymTable.Symbols.empty()) { + MLC.dysymtab_command_data.indirectsymoff = StartOfIndirectSymbols; + MLC.dysymtab_command_data.nindirectsyms = + O.IndirectSymTable.Symbols.size(); + } + + updateDySymTab(MLC); + break; + } + case MachO::LC_DATA_IN_CODE: + MLC.linkedit_data_command_data.dataoff = StartOfDataInCode; + MLC.linkedit_data_command_data.datasize = O.DataInCode.Data.size(); + break; + case MachO::LC_FUNCTION_STARTS: + MLC.linkedit_data_command_data.dataoff = 
StartOfFunctionStarts; + MLC.linkedit_data_command_data.datasize = O.FunctionStarts.Data.size(); + break; + case MachO::LC_DYLD_INFO: + case MachO::LC_DYLD_INFO_ONLY: + MLC.dyld_info_command_data.rebase_off = + O.Rebases.Opcodes.empty() ? 0 : StartOfRebaseInfo; + MLC.dyld_info_command_data.rebase_size = O.Rebases.Opcodes.size(); + MLC.dyld_info_command_data.bind_off = + O.Binds.Opcodes.empty() ? 0 : StartOfBindingInfo; + MLC.dyld_info_command_data.bind_size = O.Binds.Opcodes.size(); + MLC.dyld_info_command_data.weak_bind_off = + O.WeakBinds.Opcodes.empty() ? 0 : StartOfWeakBindingInfo; + MLC.dyld_info_command_data.weak_bind_size = O.WeakBinds.Opcodes.size(); + MLC.dyld_info_command_data.lazy_bind_off = + O.LazyBinds.Opcodes.empty() ? 0 : StartOfLazyBindingInfo; + MLC.dyld_info_command_data.lazy_bind_size = O.LazyBinds.Opcodes.size(); + MLC.dyld_info_command_data.export_off = + O.Exports.Trie.empty() ? 0 : StartOfExportTrie; + MLC.dyld_info_command_data.export_size = O.Exports.Trie.size(); + break; + case MachO::LC_LOAD_DYLINKER: + case MachO::LC_MAIN: + case MachO::LC_RPATH: + case MachO::LC_SEGMENT: + case MachO::LC_SEGMENT_64: + case MachO::LC_VERSION_MIN_MACOSX: + case MachO::LC_BUILD_VERSION: + case MachO::LC_ID_DYLIB: + case MachO::LC_LOAD_DYLIB: + case MachO::LC_UUID: + case MachO::LC_SOURCE_VERSION: + // Nothing to update. + break; + default: + // Abort if it's unsupported in order to prevent corrupting the object. + return createStringError(llvm::errc::not_supported, + "unsupported load command (cmd=0x%x)", cmd); + } + } + + return Error::success(); +} + +Error MachOLayoutBuilder::layout() { + O.Header.NCmds = O.LoadCommands.size(); + O.Header.SizeOfCmds = computeSizeOfCmds(); + constructStringTable(); + updateSymbolIndexes(); + uint64_t Offset = layoutSegments(); + Offset = layoutRelocations(Offset); + return layoutTail(Offset); +} + +} // end namespace macho +} // end namespace objcopy +} // end namespace llvm diff --git a/tools/llvm-objcopy/MachO/MachOLayoutBuilder.h b/tools/llvm-objcopy/MachO/MachOLayoutBuilder.h new file mode 100644 index 000000000000..21cbe56605de --- /dev/null +++ b/tools/llvm-objcopy/MachO/MachOLayoutBuilder.h @@ -0,0 +1,50 @@ +//===- MachOLayoutBuilder.h -------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_OBJCOPY_MACHO_MACHOLAYOUTBUILDER_H +#define LLVM_OBJCOPY_MACHO_MACHOLAYOUTBUILDER_H + +#include "MachOObjcopy.h" +#include "Object.h" + +namespace llvm { +namespace objcopy { +namespace macho { + +class MachOLayoutBuilder { + Object &O; + bool Is64Bit; + uint64_t PageSize; + + // Points to the __LINKEDIT segment if it exists. + MachO::macho_load_command *LinkEditLoadCommand = nullptr; + StringTableBuilder StrTableBuilder{StringTableBuilder::MachO}; + + uint32_t computeSizeOfCmds() const; + void constructStringTable(); + void updateSymbolIndexes(); + void updateDySymTab(MachO::macho_load_command &MLC); + uint64_t layoutSegments(); + uint64_t layoutRelocations(uint64_t Offset); + Error layoutTail(uint64_t Offset); + +public: + MachOLayoutBuilder(Object &O, bool Is64Bit, uint64_t PageSize) + : O(O), Is64Bit(Is64Bit), PageSize(PageSize) {} + + // Recomputes and updates fields in the given object such as file offsets. 
+ Error layout(); + + StringTableBuilder &getStringTableBuilder() { return StrTableBuilder; } +}; + +} // end namespace macho +} // end namespace objcopy +} // end namespace llvm + +#endif // LLVM_OBJCOPY_MACHO_MACHOLAYOUTBUILDER_H diff --git a/tools/llvm-objcopy/MachO/MachOObjcopy.cpp b/tools/llvm-objcopy/MachO/MachOObjcopy.cpp index 19343b65dd1e..6d586e7d73f1 100644 --- a/tools/llvm-objcopy/MachO/MachOObjcopy.cpp +++ b/tools/llvm-objcopy/MachO/MachOObjcopy.cpp @@ -25,18 +25,20 @@ static Error handleArgs(const CopyConfig &Config, Object &Obj) { !Config.SplitDWO.empty() || !Config.SymbolsPrefix.empty() || !Config.AllocSectionsPrefix.empty() || !Config.AddSection.empty() || !Config.DumpSection.empty() || !Config.KeepSection.empty() || - !Config.OnlySection.empty() || !Config.SymbolsToGlobalize.empty() || - !Config.SymbolsToKeep.empty() || !Config.SymbolsToLocalize.empty() || - !Config.SymbolsToWeaken.empty() || !Config.SymbolsToKeepGlobal.empty() || - !Config.SectionsToRename.empty() || !Config.SymbolsToRename.empty() || + Config.NewSymbolVisibility || !Config.OnlySection.empty() || + !Config.SymbolsToGlobalize.empty() || !Config.SymbolsToKeep.empty() || + !Config.SymbolsToLocalize.empty() || !Config.SymbolsToWeaken.empty() || + !Config.SymbolsToKeepGlobal.empty() || !Config.SectionsToRename.empty() || + !Config.SymbolsToRename.empty() || !Config.UnneededSymbolsToRemove.empty() || - !Config.SetSectionFlags.empty() || !Config.ToRemove.empty() || - Config.ExtractDWO || Config.KeepFileSymbols || Config.LocalizeHidden || - Config.PreserveDates || Config.StripDWO || Config.StripNonAlloc || - Config.StripSections || Config.Weaken || Config.DecompressDebugSections || - Config.StripDebug || Config.StripNonAlloc || Config.StripSections || - Config.StripUnneeded || Config.DiscardMode != DiscardType::None || - !Config.SymbolsToAdd.empty() || Config.EntryExpr) { + !Config.SetSectionAlignment.empty() || !Config.SetSectionFlags.empty() || + !Config.ToRemove.empty() || Config.ExtractDWO || Config.KeepFileSymbols || + Config.LocalizeHidden || Config.PreserveDates || Config.StripDWO || + Config.StripNonAlloc || Config.StripSections || Config.Weaken || + Config.DecompressDebugSections || Config.StripDebug || + Config.StripNonAlloc || Config.StripSections || Config.StripUnneeded || + Config.DiscardMode != DiscardType::None || !Config.SymbolsToAdd.empty() || + Config.EntryExpr) { return createStringError(llvm::errc::invalid_argument, "option not supported by llvm-objcopy for MachO"); } @@ -57,7 +59,11 @@ Error executeObjcopyOnBinary(const CopyConfig &Config, if (Error E = handleArgs(Config, *O)) return createFileError(Config.InputFilename, std::move(E)); - MachOWriter Writer(*O, In.is64Bit(), In.isLittleEndian(), Out); + // TODO: Support 16KB pages which are employed in iOS arm64 binaries: + // https://github.com/llvm/llvm-project/commit/1bebb2832ee312d3b0316dacff457a7a29435edb + const uint64_t PageSize = 4096; + + MachOWriter Writer(*O, In.is64Bit(), In.isLittleEndian(), PageSize, Out); if (auto E = Writer.finalize()) return E; return Writer.write(); diff --git a/tools/llvm-objcopy/MachO/MachOReader.cpp b/tools/llvm-objcopy/MachO/MachOReader.cpp index d31293034608..b48a0d8952d0 100644 --- a/tools/llvm-objcopy/MachO/MachOReader.cpp +++ b/tools/llvm-objcopy/MachO/MachOReader.cpp @@ -129,10 +129,19 @@ void MachOReader::readLoadCommands(Object &O) const { case MachO::LC_SYMTAB: O.SymTabCommandIndex = O.LoadCommands.size(); break; + case MachO::LC_DYSYMTAB: + O.DySymTabCommandIndex = O.LoadCommands.size(); + 
break; case MachO::LC_DYLD_INFO: case MachO::LC_DYLD_INFO_ONLY: O.DyLdInfoCommandIndex = O.LoadCommands.size(); break; + case MachO::LC_DATA_IN_CODE: + O.DataInCodeCommandIndex = O.LoadCommands.size(); + break; + case MachO::LC_FUNCTION_STARTS: + O.FunctionStartsCommandIndex = O.LoadCommands.size(); + break; } #define HANDLE_LOAD_COMMAND(LCName, LCValue, LCStruct) \ case MachO::LCName: \ @@ -188,7 +197,7 @@ void MachOReader::readSymbolTable(Object &O) const { StrTable, MachOObj.getSymbolTableEntry(Symbol.getRawDataRefImpl()))); - O.SymTable.Symbols.push_back(llvm::make_unique(SE)); + O.SymTable.Symbols.push_back(std::make_unique(SE)); } } @@ -222,8 +231,37 @@ void MachOReader::readExportInfo(Object &O) const { O.Exports.Trie = MachOObj.getDyldInfoExportsTrie(); } +void MachOReader::readDataInCodeData(Object &O) const { + if (!O.DataInCodeCommandIndex) + return; + const MachO::linkedit_data_command &LDC = + O.LoadCommands[*O.DataInCodeCommandIndex] + .MachOLoadCommand.linkedit_data_command_data; + + O.DataInCode.Data = arrayRefFromStringRef( + MachOObj.getData().substr(LDC.dataoff, LDC.datasize)); +} + +void MachOReader::readFunctionStartsData(Object &O) const { + if (!O.FunctionStartsCommandIndex) + return; + const MachO::linkedit_data_command &LDC = + O.LoadCommands[*O.FunctionStartsCommandIndex] + .MachOLoadCommand.linkedit_data_command_data; + + O.FunctionStarts.Data = arrayRefFromStringRef( + MachOObj.getData().substr(LDC.dataoff, LDC.datasize)); +} + +void MachOReader::readIndirectSymbolTable(Object &O) const { + MachO::dysymtab_command DySymTab = MachOObj.getDysymtabLoadCommand(); + for (uint32_t i = 0; i < DySymTab.nindirectsyms; ++i) + O.IndirectSymTable.Symbols.push_back( + MachOObj.getIndirectSymbolTableEntry(DySymTab, i)); +} + std::unique_ptr MachOReader::create() const { - auto Obj = llvm::make_unique(); + auto Obj = std::make_unique(); readHeader(*Obj); readLoadCommands(*Obj); readSymbolTable(*Obj); @@ -233,6 +271,9 @@ std::unique_ptr MachOReader::create() const { readWeakBindInfo(*Obj); readLazyBindInfo(*Obj); readExportInfo(*Obj); + readDataInCodeData(*Obj); + readFunctionStartsData(*Obj); + readIndirectSymbolTable(*Obj); return Obj; } diff --git a/tools/llvm-objcopy/MachO/MachOReader.h b/tools/llvm-objcopy/MachO/MachOReader.h index 795e5cc2363d..00c8f0d55f61 100644 --- a/tools/llvm-objcopy/MachO/MachOReader.h +++ b/tools/llvm-objcopy/MachO/MachOReader.h @@ -36,6 +36,9 @@ class MachOReader : public Reader { void readWeakBindInfo(Object &O) const; void readLazyBindInfo(Object &O) const; void readExportInfo(Object &O) const; + void readDataInCodeData(Object &O) const; + void readFunctionStartsData(Object &O) const; + void readIndirectSymbolTable(Object &O) const; public: explicit MachOReader(const object::MachOObjectFile &Obj) : MachOObj(Obj) {} diff --git a/tools/llvm-objcopy/MachO/MachOWriter.cpp b/tools/llvm-objcopy/MachO/MachOWriter.cpp index 74200c5aa62a..4ec91cc9eb7a 100644 --- a/tools/llvm-objcopy/MachO/MachOWriter.cpp +++ b/tools/llvm-objcopy/MachO/MachOWriter.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "MachOWriter.h" +#include "MachOLayoutBuilder.h" #include "Object.h" #include "llvm/ADT/STLExtras.h" #include "llvm/BinaryFormat/MachO.h" @@ -40,16 +41,10 @@ size_t MachOWriter::totalSize() const { const MachO::symtab_command &SymTabCommand = O.LoadCommands[*O.SymTabCommandIndex] .MachOLoadCommand.symtab_command_data; - if (SymTabCommand.symoff) { - assert((SymTabCommand.nsyms == 
O.SymTable.Symbols.size()) && - "Incorrect number of symbols"); + if (SymTabCommand.symoff) Ends.push_back(SymTabCommand.symoff + symTableSize()); - } - if (SymTabCommand.stroff) { - assert((SymTabCommand.strsize == StrTableBuilder.getSize()) && - "Incorrect string table size"); + if (SymTabCommand.stroff) Ends.push_back(SymTabCommand.stroff + SymTabCommand.strsize); - } } if (O.DyLdInfoCommandIndex) { const MachO::dyld_info_command &DyLdInfoCommand = @@ -84,6 +79,36 @@ size_t MachOWriter::totalSize() const { } } + if (O.DySymTabCommandIndex) { + const MachO::dysymtab_command &DySymTabCommand = + O.LoadCommands[*O.DySymTabCommandIndex] + .MachOLoadCommand.dysymtab_command_data; + + if (DySymTabCommand.indirectsymoff) + Ends.push_back(DySymTabCommand.indirectsymoff + + sizeof(uint32_t) * O.IndirectSymTable.Symbols.size()); + } + + if (O.DataInCodeCommandIndex) { + const MachO::linkedit_data_command &LinkEditDataCommand = + O.LoadCommands[*O.DataInCodeCommandIndex] + .MachOLoadCommand.linkedit_data_command_data; + + if (LinkEditDataCommand.dataoff) + Ends.push_back(LinkEditDataCommand.dataoff + + LinkEditDataCommand.datasize); + } + + if (O.FunctionStartsCommandIndex) { + const MachO::linkedit_data_command &LinkEditDataCommand = + O.LoadCommands[*O.FunctionStartsCommandIndex] + .MachOLoadCommand.linkedit_data_command_data; + + if (LinkEditDataCommand.dataoff) + Ends.push_back(LinkEditDataCommand.dataoff + + LinkEditDataCommand.datasize); + } + // Otherwise, use the last section / reloction. for (const auto &LC : O.LoadCommands) for (const auto &S : LC.Sections) { @@ -120,14 +145,6 @@ void MachOWriter::writeHeader() { memcpy(B.getBufferStart(), &Header, HeaderSize); } -void MachOWriter::updateSymbolIndexes() { - uint32_t Index = 0; - for (auto &Symbol : O.SymTable.Symbols) { - Symbol->Index = Index; - Index++; - } -} - void MachOWriter::writeLoadCommands() { uint8_t *Begin = B.getBufferStart() + headerSize(); for (const auto &LC : O.LoadCommands) { @@ -253,7 +270,7 @@ void writeNListEntry(const SymbolEntry &SE, bool IsLittleEndian, char *&Out, Out += sizeof(NListType); } -void MachOWriter::writeSymbolTable() { +void MachOWriter::writeStringTable() { if (!O.SymTabCommandIndex) return; const MachO::symtab_command &SymTabCommand = @@ -261,10 +278,10 @@ void MachOWriter::writeSymbolTable() { .MachOLoadCommand.symtab_command_data; uint8_t *StrTable = (uint8_t *)B.getBufferStart() + SymTabCommand.stroff; - StrTableBuilder.write(StrTable); + LayoutBuilder.getStringTableBuilder().write(StrTable); } -void MachOWriter::writeStringTable() { +void MachOWriter::writeSymbolTable() { if (!O.SymTabCommandIndex) return; const MachO::symtab_command &SymTabCommand = @@ -275,7 +292,7 @@ void MachOWriter::writeStringTable() { for (auto Iter = O.SymTable.Symbols.begin(), End = O.SymTable.Symbols.end(); Iter != End; Iter++) { SymbolEntry *Sym = Iter->get(); - auto Nstrx = StrTableBuilder.getOffset(Sym->Name); + uint32_t Nstrx = LayoutBuilder.getStringTableBuilder().getOffset(Sym->Name); if (Is64Bit) writeNListEntry(*Sym, IsLittleEndian, SymTable, Nstrx); @@ -344,6 +361,45 @@ void MachOWriter::writeExportInfo() { memcpy(Out, O.Exports.Trie.data(), O.Exports.Trie.size()); } +void MachOWriter::writeIndirectSymbolTable() { + if (!O.DySymTabCommandIndex) + return; + + const MachO::dysymtab_command &DySymTabCommand = + O.LoadCommands[*O.DySymTabCommandIndex] + .MachOLoadCommand.dysymtab_command_data; + + char *Out = (char *)B.getBufferStart() + DySymTabCommand.indirectsymoff; + assert((DySymTabCommand.nindirectsyms == 
O.IndirectSymTable.Symbols.size()) && + "Incorrect indirect symbol table size"); + memcpy(Out, O.IndirectSymTable.Symbols.data(), + sizeof(uint32_t) * O.IndirectSymTable.Symbols.size()); +} + +void MachOWriter::writeDataInCodeData() { + if (!O.DataInCodeCommandIndex) + return; + const MachO::linkedit_data_command &LinkEditDataCommand = + O.LoadCommands[*O.DataInCodeCommandIndex] + .MachOLoadCommand.linkedit_data_command_data; + char *Out = (char *)B.getBufferStart() + LinkEditDataCommand.dataoff; + assert((LinkEditDataCommand.datasize == O.DataInCode.Data.size()) && + "Incorrect data in code data size"); + memcpy(Out, O.DataInCode.Data.data(), O.DataInCode.Data.size()); +} + +void MachOWriter::writeFunctionStartsData() { + if (!O.FunctionStartsCommandIndex) + return; + const MachO::linkedit_data_command &LinkEditDataCommand = + O.LoadCommands[*O.FunctionStartsCommandIndex] + .MachOLoadCommand.linkedit_data_command_data; + char *Out = (char *)B.getBufferStart() + LinkEditDataCommand.dataoff; + assert((LinkEditDataCommand.datasize == O.FunctionStarts.Data.size()) && + "Incorrect function starts data size"); + memcpy(Out, O.FunctionStarts.Data.data(), O.FunctionStarts.Data.size()); +} + void MachOWriter::writeTail() { typedef void (MachOWriter::*WriteHandlerType)(void); typedef std::pair WriteOperation; @@ -379,206 +435,51 @@ void MachOWriter::writeTail() { {DyLdInfoCommand.export_off, &MachOWriter::writeExportInfo}); } - llvm::sort(Queue, [](const WriteOperation &LHS, const WriteOperation &RHS) { - return LHS.first < RHS.first; - }); - - for (auto WriteOp : Queue) - (this->*WriteOp.second)(); -} - -void MachOWriter::updateSizeOfCmds() { - auto Size = 0; - for (const auto &LC : O.LoadCommands) { - auto &MLC = LC.MachOLoadCommand; - auto cmd = MLC.load_command_data.cmd; - - switch (cmd) { - case MachO::LC_SEGMENT: - Size += sizeof(MachO::segment_command) + - sizeof(MachO::section) * LC.Sections.size(); - continue; - case MachO::LC_SEGMENT_64: - Size += sizeof(MachO::segment_command_64) + - sizeof(MachO::section_64) * LC.Sections.size(); - continue; - } - - switch (cmd) { -#define HANDLE_LOAD_COMMAND(LCName, LCValue, LCStruct) \ - case MachO::LCName: \ - Size += sizeof(MachO::LCStruct); \ - break; -#include "llvm/BinaryFormat/MachO.def" -#undef HANDLE_LOAD_COMMAND - } - } - - O.Header.SizeOfCmds = Size; -} - -// Updates the index and the number of local/external/undefined symbols. Here we -// assume that MLC is a LC_DYSYMTAB and the nlist entries in the symbol table -// are already sorted by the those types. 
-void MachOWriter::updateDySymTab(MachO::macho_load_command &MLC) { - uint32_t NumLocalSymbols = 0; - auto Iter = O.SymTable.Symbols.begin(); - auto End = O.SymTable.Symbols.end(); - for (; Iter != End; Iter++) { - if ((*Iter)->n_type & (MachO::N_EXT | MachO::N_PEXT)) - break; + if (O.DySymTabCommandIndex) { + const MachO::dysymtab_command &DySymTabCommand = + O.LoadCommands[*O.DySymTabCommandIndex] + .MachOLoadCommand.dysymtab_command_data; - NumLocalSymbols++; + if (DySymTabCommand.indirectsymoff) + Queue.emplace_back(DySymTabCommand.indirectsymoff, + &MachOWriter::writeIndirectSymbolTable); } - uint32_t NumExtDefSymbols = 0; - for (; Iter != End; Iter++) { - if (((*Iter)->n_type & MachO::N_TYPE) == MachO::N_UNDF) - break; - - NumExtDefSymbols++; - } - - MLC.dysymtab_command_data.ilocalsym = 0; - MLC.dysymtab_command_data.nlocalsym = NumLocalSymbols; - MLC.dysymtab_command_data.iextdefsym = NumLocalSymbols; - MLC.dysymtab_command_data.nextdefsym = NumExtDefSymbols; - MLC.dysymtab_command_data.iundefsym = NumLocalSymbols + NumExtDefSymbols; - MLC.dysymtab_command_data.nundefsym = - O.SymTable.Symbols.size() - (NumLocalSymbols + NumExtDefSymbols); -} - -// Recomputes and updates offset and size fields in load commands and sections -// since they could be modified. -Error MachOWriter::layout() { - auto SizeOfCmds = loadCommandsSize(); - auto Offset = headerSize() + SizeOfCmds; - O.Header.NCmds = O.LoadCommands.size(); - O.Header.SizeOfCmds = SizeOfCmds; - - // Lay out sections. - for (auto &LC : O.LoadCommands) { - uint64_t FileOff = Offset; - uint64_t VMSize = 0; - uint64_t FileOffsetInSegment = 0; - for (auto &Sec : LC.Sections) { - if (!Sec.isVirtualSection()) { - auto FilePaddingSize = - OffsetToAlignment(FileOffsetInSegment, 1ull << Sec.Align); - Sec.Offset = Offset + FileOffsetInSegment + FilePaddingSize; - Sec.Size = Sec.Content.size(); - FileOffsetInSegment += FilePaddingSize + Sec.Size; - } - - VMSize = std::max(VMSize, Sec.Addr + Sec.Size); - } - - // TODO: Handle the __PAGEZERO segment. - auto &MLC = LC.MachOLoadCommand; - switch (MLC.load_command_data.cmd) { - case MachO::LC_SEGMENT: - MLC.segment_command_data.cmdsize = - sizeof(MachO::segment_command) + - sizeof(MachO::section) * LC.Sections.size(); - MLC.segment_command_data.nsects = LC.Sections.size(); - MLC.segment_command_data.fileoff = FileOff; - MLC.segment_command_data.vmsize = VMSize; - MLC.segment_command_data.filesize = FileOffsetInSegment; - break; - case MachO::LC_SEGMENT_64: - MLC.segment_command_64_data.cmdsize = - sizeof(MachO::segment_command_64) + - sizeof(MachO::section_64) * LC.Sections.size(); - MLC.segment_command_64_data.nsects = LC.Sections.size(); - MLC.segment_command_64_data.fileoff = FileOff; - MLC.segment_command_64_data.vmsize = VMSize; - MLC.segment_command_64_data.filesize = FileOffsetInSegment; - break; - } + if (O.DataInCodeCommandIndex) { + const MachO::linkedit_data_command &LinkEditDataCommand = + O.LoadCommands[*O.DataInCodeCommandIndex] + .MachOLoadCommand.linkedit_data_command_data; - Offset += FileOffsetInSegment; + if (LinkEditDataCommand.dataoff) + Queue.emplace_back(LinkEditDataCommand.dataoff, + &MachOWriter::writeDataInCodeData); } - // Lay out relocations. - for (auto &LC : O.LoadCommands) - for (auto &Sec : LC.Sections) { - Sec.RelOff = Sec.Relocations.empty() ? 
0 : Offset; - Sec.NReloc = Sec.Relocations.size(); - Offset += sizeof(MachO::any_relocation_info) * Sec.NReloc; - } + if (O.FunctionStartsCommandIndex) { + const MachO::linkedit_data_command &LinkEditDataCommand = + O.LoadCommands[*O.FunctionStartsCommandIndex] + .MachOLoadCommand.linkedit_data_command_data; - // Lay out tail stuff. - auto NListSize = Is64Bit ? sizeof(MachO::nlist_64) : sizeof(MachO::nlist); - for (auto &LC : O.LoadCommands) { - auto &MLC = LC.MachOLoadCommand; - auto cmd = MLC.load_command_data.cmd; - switch (cmd) { - case MachO::LC_SYMTAB: - MLC.symtab_command_data.nsyms = O.SymTable.Symbols.size(); - MLC.symtab_command_data.strsize = StrTableBuilder.getSize(); - MLC.symtab_command_data.symoff = Offset; - Offset += NListSize * MLC.symtab_command_data.nsyms; - MLC.symtab_command_data.stroff = Offset; - Offset += MLC.symtab_command_data.strsize; - break; - case MachO::LC_DYSYMTAB: { - if (MLC.dysymtab_command_data.ntoc != 0 || - MLC.dysymtab_command_data.nmodtab != 0 || - MLC.dysymtab_command_data.nextrefsyms != 0 || - MLC.dysymtab_command_data.nlocrel != 0 || - MLC.dysymtab_command_data.nextrel != 0) - return createStringError(llvm::errc::not_supported, - "shared library is not yet supported"); - - if (MLC.dysymtab_command_data.nindirectsyms != 0) - return createStringError(llvm::errc::not_supported, - "indirect symbol table is not yet supported"); - - updateDySymTab(MLC); - break; - } - case MachO::LC_SEGMENT: - case MachO::LC_SEGMENT_64: - case MachO::LC_VERSION_MIN_MACOSX: - case MachO::LC_BUILD_VERSION: - case MachO::LC_ID_DYLIB: - case MachO::LC_LOAD_DYLIB: - case MachO::LC_UUID: - case MachO::LC_SOURCE_VERSION: - // Nothing to update. - break; - default: - // Abort if it's unsupported in order to prevent corrupting the object. 
- return createStringError(llvm::errc::not_supported, - "unsupported load command (cmd=0x%x)", cmd); - } + if (LinkEditDataCommand.dataoff) + Queue.emplace_back(LinkEditDataCommand.dataoff, + &MachOWriter::writeFunctionStartsData); } - return Error::success(); -} + llvm::sort(Queue, [](const WriteOperation &LHS, const WriteOperation &RHS) { + return LHS.first < RHS.first; + }); -void MachOWriter::constructStringTable() { - for (std::unique_ptr &Sym : O.SymTable.Symbols) - StrTableBuilder.add(Sym->Name); - StrTableBuilder.finalize(); + for (auto WriteOp : Queue) + (this->*WriteOp.second)(); } -Error MachOWriter::finalize() { - updateSizeOfCmds(); - constructStringTable(); - - if (auto E = layout()) - return E; - - return Error::success(); -} +Error MachOWriter::finalize() { return LayoutBuilder.layout(); } Error MachOWriter::write() { if (Error E = B.allocate(totalSize())) return E; memset(B.getBufferStart(), 0, totalSize()); writeHeader(); - updateSymbolIndexes(); writeLoadCommands(); writeSections(); writeTail(); diff --git a/tools/llvm-objcopy/MachO/MachOWriter.h b/tools/llvm-objcopy/MachO/MachOWriter.h index ecf12d62de2c..22abbad56f41 100644 --- a/tools/llvm-objcopy/MachO/MachOWriter.h +++ b/tools/llvm-objcopy/MachO/MachOWriter.h @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "../Buffer.h" +#include "MachOLayoutBuilder.h" #include "MachOObjcopy.h" #include "Object.h" #include "llvm/BinaryFormat/MachO.h" @@ -22,20 +23,15 @@ class MachOWriter { Object &O; bool Is64Bit; bool IsLittleEndian; + uint64_t PageSize; Buffer &B; - StringTableBuilder StrTableBuilder{StringTableBuilder::MachO}; + MachOLayoutBuilder LayoutBuilder; size_t headerSize() const; size_t loadCommandsSize() const; size_t symTableSize() const; size_t strTableSize() const; - void updateDySymTab(MachO::macho_load_command &MLC); - void updateSizeOfCmds(); - void updateSymbolIndexes(); - void constructStringTable(); - Error layout(); - void writeHeader(); void writeLoadCommands(); template @@ -48,11 +44,16 @@ class MachOWriter { void writeWeakBindInfo(); void writeLazyBindInfo(); void writeExportInfo(); + void writeIndirectSymbolTable(); + void writeDataInCodeData(); + void writeFunctionStartsData(); void writeTail(); public: - MachOWriter(Object &O, bool Is64Bit, bool IsLittleEndian, Buffer &B) - : O(O), Is64Bit(Is64Bit), IsLittleEndian(IsLittleEndian), B(B) {} + MachOWriter(Object &O, bool Is64Bit, bool IsLittleEndian, uint64_t PageSize, + Buffer &B) + : O(O), Is64Bit(Is64Bit), IsLittleEndian(IsLittleEndian), + PageSize(PageSize), B(B), LayoutBuilder(O, Is64Bit, PageSize) {} size_t totalSize() const; Error finalize(); diff --git a/tools/llvm-objcopy/MachO/Object.h b/tools/llvm-objcopy/MachO/Object.h index ed85fcbc47f7..1cebf8253d19 100644 --- a/tools/llvm-objcopy/MachO/Object.h +++ b/tools/llvm-objcopy/MachO/Object.h @@ -90,6 +90,16 @@ struct SymbolEntry { uint8_t n_sect; uint16_t n_desc; uint64_t n_value; + + bool isExternalSymbol() const { + return n_type & ((MachO::N_EXT | MachO::N_PEXT)); + } + + bool isLocalSymbol() const { return !isExternalSymbol(); } + + bool isUndefinedSymbol() const { + return (n_type & MachO::N_TYPE) == MachO::N_UNDF; + } }; /// The location of the symbol table inside the binary is described by LC_SYMTAB @@ -100,6 +110,10 @@ struct SymbolTable { const SymbolEntry *getSymbolByIndex(uint32_t Index) const; }; +struct IndirectSymbolTable { + std::vector Symbols; +}; + /// The location of the string table inside the binary is described by 
LC_SYMTAB /// load command. struct StringTable { @@ -206,6 +220,10 @@ struct ExportInfo { ArrayRef Trie; }; +struct LinkData { + ArrayRef Data; +}; + struct Object { MachHeader Header; std::vector LoadCommands; @@ -218,11 +236,20 @@ struct Object { WeakBindInfo WeakBinds; LazyBindInfo LazyBinds; ExportInfo Exports; + IndirectSymbolTable IndirectSymTable; + LinkData DataInCode; + LinkData FunctionStarts; /// The index of LC_SYMTAB load command if present. Optional SymTabCommandIndex; /// The index of LC_DYLD_INFO or LC_DYLD_INFO_ONLY load command if present. Optional DyLdInfoCommandIndex; + /// The index LC_DYSYMTAB load comamnd if present. + Optional DySymTabCommandIndex; + /// The index LC_DATA_IN_CODE load comamnd if present. + Optional DataInCodeCommandIndex; + /// The index LC_FUNCTION_STARTS load comamnd if present. + Optional FunctionStartsCommandIndex; }; } // end namespace macho diff --git a/tools/llvm-objcopy/ObjcopyOpts.td b/tools/llvm-objcopy/ObjcopyOpts.td index 5fce4fbde539..9e6b6f0005cd 100644 --- a/tools/llvm-objcopy/ObjcopyOpts.td +++ b/tools/llvm-objcopy/ObjcopyOpts.td @@ -1,37 +1,33 @@ -include "llvm/Option/OptParser.td" - -multiclass Eq { - def NAME : Separate<["--"], name>; - def NAME #_eq : Joined<["--"], name #"=">, - Alias(NAME)>, - HelpText; -} - -def help : Flag<["--"], "help">; -def h : Flag<["-"], "h">, Alias; - -def allow_broken_links - : Flag<["--"], "allow-broken-links">, - HelpText<"Allow llvm-objcopy to remove sections even if it would leave " - "invalid section references. The appropriate sh_link fields " - "will be set to zero.">; +include "CommonOpts.td" defm binary_architecture - : Eq<"binary-architecture", "Used when transforming an architecture-less " - "format (such as binary) to another format">; -def B : JoinedOrSeparate<["-"], "B">, Alias; + : Eq<"binary-architecture", "Ignored for compatibility">; +def B : JoinedOrSeparate<["-"], "B">, + Alias, + HelpText<"Alias for --binary-architecture">; defm target : Eq<"target", "Format of the input and output file">, Values<"binary">; -def F : JoinedOrSeparate<["-"], "F">, Alias; +def F : JoinedOrSeparate<["-"], "F">, + Alias, + HelpText<"Alias for --target">; defm input_target : Eq<"input-target", "Format of the input file">, Values<"binary">; -def I : JoinedOrSeparate<["-"], "I">, Alias; +def I : JoinedOrSeparate<["-"], "I">, + Alias, + HelpText<"Alias for --input-target">; defm output_target : Eq<"output-target", "Format of the output file">, Values<"binary">; -def O : JoinedOrSeparate<["-"], "O">, Alias; +def O : JoinedOrSeparate<["-"], "O">, + Alias, + HelpText<"Alias for --output-target">; + +defm new_symbol_visibility : Eq<"new-symbol-visibility", "Visibility of " + "symbols generated for binary input or added" + " with --add-symbol unless otherwise" + " specified. 
The default value is 'default'.">; def compress_debug_sections : Flag<["--"], "compress-debug-sections">; def compress_debug_sections_eq @@ -46,34 +42,10 @@ defm split_dwo ", then strip-dwo on the input file">, MetaVarName<"dwo-file">; -def enable_deterministic_archives - : Flag<["--"], "enable-deterministic-archives">, - HelpText<"Enable deterministic mode when copying archives (use zero for " - "UIDs, GIDs, and timestamps).">; -def D : Flag<["-"], "D">, - Alias, - HelpText<"Alias for --enable-deterministic-archives">; - -def disable_deterministic_archives - : Flag<["--"], "disable-deterministic-archives">, - HelpText<"Disable deterministic mode when copying archives (use real " - "values for UIDs, GIDs, and timestamps).">; -def U : Flag<["-"], "U">, - Alias, - HelpText<"Alias for --disable-deterministic-archives">; - -def preserve_dates : Flag<["--"], "preserve-dates">, - HelpText<"Preserve access and modification timestamps">; -def p : Flag<["-"], "p">, Alias; - defm add_gnu_debuglink : Eq<"add-gnu-debuglink", "Add a .gnu_debuglink for ">, MetaVarName<"debug-file">; -defm remove_section : Eq<"remove-section", "Remove
">, - MetaVarName<"section">; -def R : JoinedOrSeparate<["-"], "R">, Alias; - defm rename_section : Eq<"rename-section", "Renames a section from old to new, optionally with specified flags. " @@ -93,16 +65,20 @@ defm redefine_symbols "symbols from many files.">, MetaVarName<"filename">; -defm keep_section : Eq<"keep-section", "Keep
">, - MetaVarName<"section">; defm only_section : Eq<"only-section", "Remove all but
">, MetaVarName<"section">; -def j : JoinedOrSeparate<["-"], "j">, Alias; +def j : JoinedOrSeparate<["-"], "j">, + Alias, + HelpText<"Alias for --only-section">; defm add_section : Eq<"add-section", "Make a section named
with the contents of .">, MetaVarName<"section=file">; +defm set_section_alignment + : Eq<"set-section-alignment", "Set alignment for a given section.">, + MetaVarName<"section=align">; + defm set_section_flags : Eq<"set-section-flags", "Set section flags for a given section. Flags supported for GNU " @@ -110,26 +86,14 @@ defm set_section_flags "rom, share, contents, merge, strings.">, MetaVarName<"section=flag1[,flag2,...]">; -def strip_all : Flag<["--"], "strip-all">, - HelpText<"Remove non-allocated sections outside segments. " - ".gnu.warning* sections are not removed">; -def S : Flag<["-"], "S">, Alias; -def strip_all_gnu : Flag<["--"], "strip-all-gnu">, - HelpText<"Compatible with GNU objcopy's --strip-all">; -def strip_debug : Flag<["--"], "strip-debug">, - HelpText<"Remove all debug information">; -def g : Flag<["-"], "g">, Alias, - HelpText<"Alias for --strip-debug">; +def S : Flag<["-"], "S">, + Alias, + HelpText<"Alias for --strip-all">; def strip_dwo : Flag<["--"], "strip-dwo">, HelpText<"Remove all DWARF .dwo sections from file">; -def strip_sections - : Flag<["--"], "strip-sections">, - HelpText<"Remove all section headers and all sections not in segments">; def strip_non_alloc : Flag<["--"], "strip-non-alloc">, HelpText<"Remove all non-allocated sections outside segments">; -def strip_unneeded : Flag<["--"], "strip-unneeded">, - HelpText<"Remove all symbols not needed by relocations">; defm strip_unneeded_symbol : Eq<"strip-unneeded-symbol", "Remove symbol if it is not needed by relocations">, @@ -163,7 +127,9 @@ defm localize_symbols "Reads a list of symbols from and marks them local.">, MetaVarName<"filename">; -def L : JoinedOrSeparate<["-"], "L">, Alias; +def L : JoinedOrSeparate<["-"], "L">, + Alias, + HelpText<"Alias for --localize-symbol">; defm globalize_symbol : Eq<"globalize-symbol", "Mark as global">, MetaVarName<"symbol">; @@ -178,7 +144,9 @@ defm keep_global_symbol "Convert all symbols except to local. May be repeated to " "convert all except a set of symbols to local.">, MetaVarName<"symbol">; -def G : JoinedOrSeparate<["-"], "G">, Alias; +def G : JoinedOrSeparate<["-"], "G">, + Alias, + HelpText<"Alias for --keep-global-symbol">; defm keep_global_symbols : Eq<"keep-global-symbols", @@ -196,31 +164,17 @@ defm weaken_symbols "Reads a list of symbols from and marks them weak.">, MetaVarName<"filename">; -def W : JoinedOrSeparate<["-"], "W">, Alias; +def W : JoinedOrSeparate<["-"], "W">, + Alias, + HelpText<"Alias for --weaken-symbol">; def weaken : Flag<["--"], "weaken">, HelpText<"Mark all global symbols as weak">; -def discard_locals : Flag<["--"], "discard-locals">, - HelpText<"Remove compiler-generated local symbols, (e.g. 
" - "symbols starting with .L)">; -def X : Flag<["-"], "X">, Alias; - -def discard_all - : Flag<["--"], "discard-all">, - HelpText<"Remove all local symbols except file and section symbols">; -def x : Flag<["-"], "x">, Alias; -defm strip_symbol : Eq<"strip-symbol", "Remove symbol ">, - MetaVarName<"symbol">; defm strip_symbols : Eq<"strip-symbols", "Reads a list of symbols from and removes them.">, MetaVarName<"filename">; -def N : JoinedOrSeparate<["-"], "N">, Alias; -defm keep_symbol : Eq<"keep-symbol", "Do not remove symbol ">, - MetaVarName<"symbol">; -def K : JoinedOrSeparate<["-"], "K">, Alias; - defm keep_symbols : Eq<"keep-symbols", "Reads a list of symbols from and runs as if " @@ -230,13 +184,6 @@ defm keep_symbols "be repeated to read symbols from many files.">, MetaVarName<"filename">; -def only_keep_debug - : Flag<["--"], "only-keep-debug">, - HelpText<"Clear sections that would not be stripped by --strip-debug. " - "Currently only implemented for COFF.">; - -def keep_file_symbols : Flag<["--"], "keep-file-symbols">, - HelpText<"Do not remove file symbols">; defm dump_section : Eq<"dump-section", "Dump contents of section named
into file ">, @@ -249,9 +196,6 @@ defm prefix_alloc_sections : Eq<"prefix-alloc-sections", "Add to the start of every allocated section name">, MetaVarName<"prefix">; -def version : Flag<["--"], "version">, - HelpText<"Print the version and exit.">; -def V : Flag<["-"], "V">, Alias; defm build_id_link_dir : Eq<"build-id-link-dir", "Set directory for --build-id-link-input and " "--build-id-link-output to ">, @@ -265,10 +209,6 @@ defm build_id_link_output "name derived from hex build ID">, MetaVarName<"suffix">; -def regex - : Flag<["--"], "regex">, - HelpText<"Permit regular expressions in name comparison">; - defm set_start : Eq<"set-start", "Set the start address to . Overrides " "any previous --change-start or --adjust-start values.">, MetaVarName<"addr">; @@ -277,11 +217,12 @@ defm change_start : Eq<"change-start", "Add to the start address. Can be "cumulatively.">, MetaVarName<"incr">; def adjust_start : JoinedOrSeparate<["--"], "adjust-start">, - Alias; + Alias, + HelpText<"Alias for --change-start">; defm add_symbol : Eq<"add-symbol", "Add new symbol to .symtab. Accepted flags: " - "global, local, weak, default, hidden, file, section, object, " + "global, local, weak, default, hidden, protected, file, section, object, " "function, indirect-function. Accepted but ignored for " "compatibility: debug, constructor, warning, indirect, synthetic, " "unique-object, before.">, diff --git a/tools/llvm-objcopy/StripOpts.td b/tools/llvm-objcopy/StripOpts.td index 1d06bb3dfb38..cd02cffae673 100644 --- a/tools/llvm-objcopy/StripOpts.td +++ b/tools/llvm-objcopy/StripOpts.td @@ -1,96 +1,17 @@ -include "llvm/Option/OptParser.td" +include "CommonOpts.td" -multiclass Eq { - def NAME : Separate<["--"], name>; - def NAME #_eq : Joined<["--"], name #"=">, - Alias(NAME)>, - HelpText; -} +def output : JoinedOrSeparate<["-"], "o">, HelpText<"Write output to ">, + MetaVarName<"">; -def help : Flag<["--"], "help">; -def h : Flag<["-"], "h">, Alias; - -def allow_broken_links - : Flag<["--"], "allow-broken-links">, - HelpText<"Allow llvm-strip to remove sections even if it would leave " - "invalid section references. The appropriate sh_link fields " - "will be set to zero.">; - -def enable_deterministic_archives - : Flag<["--"], "enable-deterministic-archives">, - HelpText<"Enable deterministic mode when stripping archives (use zero " - "for UIDs, GIDs, and timestamps).">; -def D : Flag<["-"], "D">, - Alias, - HelpText<"Alias for --enable-deterministic-archives">; - -def disable_deterministic_archives - : Flag<["--"], "disable-deterministic-archives">, - HelpText<"Disable deterministic mode when stripping archives (use real " - "values for UIDs, GIDs, and timestamps).">; -def U : Flag<["-"], "U">, - Alias, - HelpText<"Alias for --disable-deterministic-archives">; - -def output : JoinedOrSeparate<["-"], "o">, HelpText<"Write output to ">; - -def preserve_dates : Flag<["--"], "preserve-dates">, - HelpText<"Preserve access and modification timestamps">; -def p : Flag<["-"], "p">, Alias; - -def strip_all : Flag<["--"], "strip-all">, - HelpText<"Remove non-allocated sections outside segments. 
" - ".gnu.warning* sections are not removed">; -def s : Flag<["-"], "s">, Alias; +def s : Flag<["-"], "s">, + Alias, + HelpText<"Alias for --strip-all">; def no_strip_all : Flag<["--"], "no-strip-all">, HelpText<"Disable --strip-all">; -def strip_all_gnu : Flag<["--"], "strip-all-gnu">, - HelpText<"Compatible with GNU strip's --strip-all">; -def strip_debug : Flag<["--"], "strip-debug">, - HelpText<"Remove debugging symbols only">; -def d : Flag<["-"], "d">, Alias; -def g : Flag<["-"], "g">, Alias; -def S : Flag<["-"], "S">, Alias; -def strip_unneeded : Flag<["--"], "strip-unneeded">, - HelpText<"Remove all symbols not needed by relocations">; - -defm remove_section : Eq<"remove-section", "Remove
">, - MetaVarName<"section">; -def R : JoinedOrSeparate<["-"], "R">, Alias; - -defm strip_symbol : Eq<"strip-symbol", "Strip ">, - MetaVarName<"symbol">; -def N : JoinedOrSeparate<["-"], "N">, Alias; - -defm keep_section : Eq<"keep-section", "Keep
">, - MetaVarName<"section">; -defm keep_symbol : Eq<"keep-symbol", "Do not remove symbol ">, - MetaVarName<"symbol">; -def keep_file_symbols : Flag<["--"], "keep-file-symbols">, - HelpText<"Do not remove file symbols">; - -def K : JoinedOrSeparate<["-"], "K">, Alias; - -def only_keep_debug - : Flag<["--"], "only-keep-debug">, - HelpText<"Clear sections that would not be stripped by --strip-debug. " - "Currently only implemented for COFF.">; - -def discard_locals : Flag<["--"], "discard-locals">, - HelpText<"Remove compiler-generated local symbols, (e.g. " - "symbols starting with .L)">; -def X : Flag<["-"], "X">, Alias; - -def discard_all - : Flag<["--"], "discard-all">, - HelpText<"Remove all local symbols except file and section symbols">; -def x : Flag<["-"], "x">, Alias; - -def regex - : Flag<["--"], "regex">, - HelpText<"Permit regular expressions in name comparison">; - -def version : Flag<["--"], "version">, - HelpText<"Print the version and exit.">; -def V : Flag<["-"], "V">, Alias; +def d : Flag<["-"], "d">, + Alias, + HelpText<"Alias for --strip-debug">; +def S : Flag<["-"], "S">, + Alias, + HelpText<"Alias for --strip-debug">; diff --git a/tools/llvm-objcopy/llvm-objcopy.cpp b/tools/llvm-objcopy/llvm-objcopy.cpp index e9372176e43b..a68210f3fdd3 100644 --- a/tools/llvm-objcopy/llvm-objcopy.cpp +++ b/tools/llvm-objcopy/llvm-objcopy.cpp @@ -29,6 +29,7 @@ #include "llvm/Option/ArgList.h" #include "llvm/Option/Option.h" #include "llvm/Support/Casting.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/Error.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/ErrorOr.h" @@ -36,6 +37,7 @@ #include "llvm/Support/Memory.h" #include "llvm/Support/Path.h" #include "llvm/Support/Process.h" +#include "llvm/Support/StringSaver.h" #include "llvm/Support/WithColor.h" #include "llvm/Support/raw_ostream.h" #include @@ -84,7 +86,7 @@ LLVM_ATTRIBUTE_NORETURN void reportError(StringRef File, Error E) { ErrorSuccess reportWarning(Error E) { assert(E); - WithColor::warning(errs(), ToolName) << toString(std::move(E)); + WithColor::warning(errs(), ToolName) << toString(std::move(E)) << '\n'; return Error::success(); } @@ -130,16 +132,18 @@ static Error deepWriteArchive(StringRef ArcName, /// The function executeObjcopyOnIHex does the dispatch based on the format /// of the output specified by the command line options. -static Error executeObjcopyOnIHex(const CopyConfig &Config, MemoryBuffer &In, +static Error executeObjcopyOnIHex(CopyConfig &Config, MemoryBuffer &In, Buffer &Out) { // TODO: support output formats other than ELF. + if (Error E = Config.parseELFConfig()) + return E; return elf::executeObjcopyOnIHex(Config, In, Out); } /// The function executeObjcopyOnRawBinary does the dispatch based on the format /// of the output specified by the command line options. 
-static Error executeObjcopyOnRawBinary(const CopyConfig &Config, - MemoryBuffer &In, Buffer &Out) { +static Error executeObjcopyOnRawBinary(CopyConfig &Config, MemoryBuffer &In, + Buffer &Out) { switch (Config.OutputFormat) { case FileFormat::ELF: // FIXME: Currently, we call elf::executeObjcopyOnRawBinary even if the @@ -148,6 +152,8 @@ static Error executeObjcopyOnRawBinary(const CopyConfig &Config, case FileFormat::Binary: case FileFormat::IHex: case FileFormat::Unspecified: + if (Error E = Config.parseELFConfig()) + return E; return elf::executeObjcopyOnRawBinary(Config, In, Out); } @@ -156,11 +162,13 @@ static Error executeObjcopyOnRawBinary(const CopyConfig &Config, /// The function executeObjcopyOnBinary does the dispatch based on the format /// of the input binary (ELF, MachO or COFF). -static Error executeObjcopyOnBinary(const CopyConfig &Config, - object::Binary &In, Buffer &Out) { - if (auto *ELFBinary = dyn_cast(&In)) +static Error executeObjcopyOnBinary(CopyConfig &Config, object::Binary &In, + Buffer &Out) { + if (auto *ELFBinary = dyn_cast(&In)) { + if (Error E = Config.parseELFConfig()) + return E; return elf::executeObjcopyOnBinary(Config, *ELFBinary, Out); - else if (auto *COFFBinary = dyn_cast(&In)) + } else if (auto *COFFBinary = dyn_cast(&In)) return coff::executeObjcopyOnBinary(Config, *COFFBinary, Out); else if (auto *MachOBinary = dyn_cast(&In)) return macho::executeObjcopyOnBinary(Config, *MachOBinary, Out); @@ -169,8 +177,7 @@ static Error executeObjcopyOnBinary(const CopyConfig &Config, "unsupported object file format"); } -static Error executeObjcopyOnArchive(const CopyConfig &Config, - const Archive &Ar) { +static Error executeObjcopyOnArchive(CopyConfig &Config, const Archive &Ar) { std::vector NewArchiveMembers; Error Err = Error::success(); for (const Archive::Child &Child : Ar.children(Err)) { @@ -246,7 +253,7 @@ static Error restoreStatOnFile(StringRef Filename, /// The function executeObjcopy does the higher level dispatch based on the type /// of input (raw binary, archive or single object file) and takes care of the /// format-agnostic modifications, i.e. preserving dates. -static Error executeObjcopy(const CopyConfig &Config) { +static Error executeObjcopy(CopyConfig &Config) { sys::fs::file_status Stat; if (Config.InputFilename != "-") { if (auto EC = sys::fs::status(Config.InputFilename, Stat)) @@ -255,7 +262,7 @@ static Error executeObjcopy(const CopyConfig &Config) { Stat.permissions(static_cast(0777)); } - typedef Error (*ProcessRawFn)(const CopyConfig &, MemoryBuffer &, Buffer &); + using ProcessRawFn = Error (*)(CopyConfig &, MemoryBuffer &, Buffer &); ProcessRawFn ProcessRaw; switch (Config.InputFormat) { case FileFormat::Binary: @@ -310,15 +317,31 @@ int main(int argc, char **argv) { InitLLVM X(argc, argv); ToolName = argv[0]; bool IsStrip = sys::path::stem(ToolName).contains("strip"); + + // Expand response files. + // TODO: Move these lines, which are copied from lib/Support/CommandLine.cpp, + // into a separate function in the CommandLine library and call that function + // here. This is duplicated code. + SmallVector NewArgv(argv, argv + argc); + BumpPtrAllocator A; + StringSaver Saver(A); + cl::ExpandResponseFiles(Saver, + Triple(sys::getProcessTriple()).isOSWindows() + ? cl::TokenizeWindowsCommandLine + : cl::TokenizeGNUCommandLine, + NewArgv); + + auto Args = makeArrayRef(NewArgv).drop_front(); + Expected DriverConfig = - IsStrip ? 
parseStripOptions(makeArrayRef(argv + 1, argc), reportWarning) - : parseObjcopyOptions(makeArrayRef(argv + 1, argc)); + IsStrip ? parseStripOptions(Args, reportWarning) + : parseObjcopyOptions(Args, reportWarning); if (!DriverConfig) { logAllUnhandledErrors(DriverConfig.takeError(), WithColor::error(errs(), ToolName)); return 1; } - for (const CopyConfig &CopyConfig : DriverConfig->CopyConfigs) { + for (CopyConfig &CopyConfig : DriverConfig->CopyConfigs) { if (Error E = executeObjcopy(CopyConfig)) { logAllUnhandledErrors(std::move(E), WithColor::error(errs(), ToolName)); return 1; diff --git a/tools/llvm-objdump/COFFDump.cpp b/tools/llvm-objdump/COFFDump.cpp index 1ba0a68902c9..60b0f5a3cbd1 100644 --- a/tools/llvm-objdump/COFFDump.cpp +++ b/tools/llvm-objdump/COFFDump.cpp @@ -234,15 +234,14 @@ printSEHTable(const COFFObjectFile *Obj, uint32_t TableVA, int Count) { if (Count == 0) return; - const pe32_header *PE32Header; - error(Obj->getPE32Header(PE32Header)); - uint32_t ImageBase = PE32Header->ImageBase; uintptr_t IntPtr = 0; - error(Obj->getVaPtr(TableVA, IntPtr)); + if (std::error_code EC = Obj->getVaPtr(TableVA, IntPtr)) + reportError(errorCodeToError(EC), Obj->getFileName()); + const support::ulittle32_t *P = (const support::ulittle32_t *)IntPtr; outs() << "SEH Table:"; for (int I = 0; I < Count; ++I) - outs() << format(" 0x%x", P[I] + ImageBase); + outs() << format(" 0x%x", P[I] + Obj->getPE32Header()->ImageBase); outs() << "\n\n"; } @@ -268,22 +267,24 @@ static void printTLSDirectoryT(const coff_tls_directory *TLSDir) { } static void printTLSDirectory(const COFFObjectFile *Obj) { - const pe32_header *PE32Header; - error(Obj->getPE32Header(PE32Header)); - - const pe32plus_header *PE32PlusHeader; - error(Obj->getPE32PlusHeader(PE32PlusHeader)); + const pe32_header *PE32Header = Obj->getPE32Header(); + const pe32plus_header *PE32PlusHeader = Obj->getPE32PlusHeader(); // Skip if it's not executable. if (!PE32Header && !PE32PlusHeader) return; const data_directory *DataDir; - error(Obj->getDataDirectory(COFF::TLS_TABLE, DataDir)); - uintptr_t IntPtr = 0; + if (std::error_code EC = Obj->getDataDirectory(COFF::TLS_TABLE, DataDir)) + reportError(errorCodeToError(EC), Obj->getFileName()); + if (DataDir->RelativeVirtualAddress == 0) return; - error(Obj->getRvaPtr(DataDir->RelativeVirtualAddress, IntPtr)); + + uintptr_t IntPtr = 0; + if (std::error_code EC = + Obj->getRvaPtr(DataDir->RelativeVirtualAddress, IntPtr)) + reportError(errorCodeToError(EC), Obj->getFileName()); if (PE32Header) { auto *TLSDir = reinterpret_cast(IntPtr); @@ -298,9 +299,7 @@ static void printTLSDirectory(const COFFObjectFile *Obj) { static void printLoadConfiguration(const COFFObjectFile *Obj) { // Skip if it's not executable. 
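The response-file expansion added to llvm-objcopy's main() just above amounts to three pieces: a BumpPtrAllocator-backed StringSaver that owns the expanded strings, a tokenizer chosen by host OS, and cl::ExpandResponseFiles rewriting the argument vector in place. A standalone sketch; the inline SmallVector size and the surrounding main() are illustrative only:

// Illustrative framing of the expansion step; the ExpandResponseFiles call
// and tokenizer choice mirror the patch.
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Triple.h"
#include "llvm/Support/Allocator.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Host.h"
#include "llvm/Support/StringSaver.h"

using namespace llvm;

int main(int argc, char **argv) {
  // @file arguments are expanded in place before option parsing.
  SmallVector<const char *, 256> NewArgv(argv, argv + argc);
  BumpPtrAllocator A;
  StringSaver Saver(A); // keeps the expanded strings alive
  cl::ExpandResponseFiles(Saver,
                          Triple(sys::getProcessTriple()).isOSWindows()
                              ? cl::TokenizeWindowsCommandLine
                              : cl::TokenizeGNUCommandLine,
                          NewArgv);
  // makeArrayRef(NewArgv).drop_front() is then handed to the option parser.
  return 0;
}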
- const pe32_header *PE32Header; - error(Obj->getPE32Header(PE32Header)); - if (!PE32Header) + if (!Obj->getPE32Header()) return; // Currently only x86 is supported @@ -308,11 +307,18 @@ static void printLoadConfiguration(const COFFObjectFile *Obj) { return; const data_directory *DataDir; - error(Obj->getDataDirectory(COFF::LOAD_CONFIG_TABLE, DataDir)); + + if (std::error_code EC = + Obj->getDataDirectory(COFF::LOAD_CONFIG_TABLE, DataDir)) + reportError(errorCodeToError(EC), Obj->getFileName()); + uintptr_t IntPtr = 0; if (DataDir->RelativeVirtualAddress == 0) return; - error(Obj->getRvaPtr(DataDir->RelativeVirtualAddress, IntPtr)); + + if (std::error_code EC = + Obj->getRvaPtr(DataDir->RelativeVirtualAddress, IntPtr)) + reportError(errorCodeToError(EC), Obj->getFileName()); auto *LoadConf = reinterpret_cast(IntPtr); outs() << "Load configuration:" @@ -442,8 +448,7 @@ static bool getPDataSection(const COFFObjectFile *Obj, std::vector &Rels, const RuntimeFunction *&RFStart, int &NumRFs) { for (const SectionRef &Section : Obj->sections()) { - StringRef Name; - error(Section.getName(Name)); + StringRef Name = unwrapOrError(Section.getName(), Obj->getFileName()); if (Name != ".pdata") continue; @@ -455,7 +460,9 @@ static bool getPDataSection(const COFFObjectFile *Obj, llvm::sort(Rels, isRelocAddressLess); ArrayRef Contents; - error(Obj->getSectionContents(Pdata, Contents)); + if (Error E = Obj->getSectionContents(Pdata, Contents)) + reportError(std::move(E), Obj->getFileName()); + if (Contents.empty()) continue; @@ -571,10 +578,12 @@ static void printRuntimeFunctionRels(const COFFObjectFile *Obj, ArrayRef XContents; uint64_t UnwindInfoOffset = 0; - error(getSectionContents( - Obj, Rels, SectionOffset + - /*offsetof(RuntimeFunction, UnwindInfoOffset)*/ 8, - XContents, UnwindInfoOffset)); + if (Error E = getSectionContents( + Obj, Rels, + SectionOffset + + /*offsetof(RuntimeFunction, UnwindInfoOffset)*/ 8, + XContents, UnwindInfoOffset)) + reportError(std::move(E), Obj->getFileName()); if (XContents.empty()) return; @@ -650,9 +659,12 @@ void printCOFFSymbolTable(const object::COFFImportFile *i) { void printCOFFSymbolTable(const COFFObjectFile *coff) { for (unsigned SI = 0, SE = coff->getNumberOfSymbols(); SI != SE; ++SI) { Expected Symbol = coff->getSymbol(SI); + if (!Symbol) + reportError(Symbol.takeError(), coff->getFileName()); + StringRef Name; - error(Symbol.takeError()); - error(coff->getSymbolName(*Symbol, Name)); + if (std::error_code EC = coff->getSymbolName(*Symbol, Name)) + reportError(errorCodeToError(EC), coff->getFileName()); outs() << "[" << format("%2d", SI) << "]" << "(sec " << format("%2d", int(Symbol->getSectionNumber())) << ")" @@ -682,7 +694,9 @@ void printCOFFSymbolTable(const COFFObjectFile *coff) { for (unsigned AI = 0, AE = Symbol->getNumberOfAuxSymbols(); AI < AE; ++AI, ++SI) { if (Symbol->isSectionDefinition()) { const coff_aux_section_definition *asd; - error(coff->getAuxSymbol(SI + 1, asd)); + if (std::error_code EC = + coff->getAuxSymbol(SI + 1, asd)) + reportError(errorCodeToError(EC), coff->getFileName()); int32_t AuxNumber = asd->getNumber(Symbol->isBigObj()); @@ -697,7 +711,8 @@ void printCOFFSymbolTable(const COFFObjectFile *coff) { , unsigned(asd->Selection)); } else if (Symbol->isFileRecord()) { const char *FileName; - error(coff->getAuxSymbol(SI + 1, FileName)); + if (std::error_code EC = coff->getAuxSymbol(SI + 1, FileName)) + reportError(errorCodeToError(EC), coff->getFileName()); StringRef Name(FileName, Symbol->getNumberOfAuxSymbols() * 
coff->getSymbolTableEntrySize()); @@ -707,7 +722,9 @@ void printCOFFSymbolTable(const COFFObjectFile *coff) { break; } else if (Symbol->isWeakExternal()) { const coff_aux_weak_external *awe; - error(coff->getAuxSymbol(SI + 1, awe)); + if (std::error_code EC = + coff->getAuxSymbol(SI + 1, awe)) + reportError(errorCodeToError(EC), coff->getFileName()); outs() << "AUX " << format("indx %d srch %d\n", static_cast(awe->TagIndex), diff --git a/tools/llvm-objdump/ELFDump.cpp b/tools/llvm-objdump/ELFDump.cpp index 9c4d67d0f1bd..93d070eee16c 100644 --- a/tools/llvm-objdump/ELFDump.cpp +++ b/tools/llvm-objdump/ELFDump.cpp @@ -178,7 +178,7 @@ void printDynamicSection(const ELFFile *Elf, StringRef Filename) { outs() << (Data + Dyn.d_un.d_val) << "\n"; continue; } - warn(toString(StrTabOrErr.takeError())); + reportWarning(toString(StrTabOrErr.takeError()), Filename); consumeError(StrTabOrErr.takeError()); } outs() << format(Fmt, (uint64_t)Dyn.d_un.d_val); diff --git a/tools/llvm-objdump/MachODump.cpp b/tools/llvm-objdump/MachODump.cpp index 58ff7be4543c..e4684d0f1601 100644 --- a/tools/llvm-objdump/MachODump.cpp +++ b/tools/llvm-objdump/MachODump.cpp @@ -236,11 +236,11 @@ struct SymbolSorter { bool operator()(const SymbolRef &A, const SymbolRef &B) { Expected ATypeOrErr = A.getType(); if (!ATypeOrErr) - report_error(ATypeOrErr.takeError(), A.getObject()->getFileName()); + reportError(ATypeOrErr.takeError(), A.getObject()->getFileName()); SymbolRef::Type AType = *ATypeOrErr; Expected BTypeOrErr = B.getType(); if (!BTypeOrErr) - report_error(BTypeOrErr.takeError(), B.getObject()->getFileName()); + reportError(BTypeOrErr.takeError(), B.getObject()->getFileName()); SymbolRef::Type BType = *BTypeOrErr; uint64_t AAddr = (AType != SymbolRef::ST_Function) ? 0 : A.getValue(); uint64_t BAddr = (BType != SymbolRef::ST_Function) ? 0 : B.getValue(); @@ -371,11 +371,8 @@ static void getSectionsAndSymbols(MachOObjectFile *MachOObj, Symbols.push_back(Symbol); } - for (const SectionRef &Section : MachOObj->sections()) { - StringRef SectName; - Section.getName(SectName); + for (const SectionRef &Section : MachOObj->sections()) Sections.push_back(Section); - } bool BaseSegmentAddressSet = false; for (const auto &Command : MachOObj->load_commands()) { @@ -393,10 +390,40 @@ static void getSectionsAndSymbols(MachOObjectFile *MachOObj, BaseSegmentAddressSet = true; BaseSegmentAddress = SLC.vmaddr; } + } else if (Command.C.cmd == MachO::LC_SEGMENT_64) { + MachO::segment_command_64 SLC = MachOObj->getSegment64LoadCommand(Command); + StringRef SegName = SLC.segname; + if (!BaseSegmentAddressSet && SegName != "__PAGEZERO") { + BaseSegmentAddressSet = true; + BaseSegmentAddress = SLC.vmaddr; + } } } } +static bool DumpAndSkipDataInCode(uint64_t PC, const uint8_t *bytes, + DiceTable &Dices, uint64_t &InstSize) { + // Check the data in code table here to see if this is data not an + // instruction to be disassembled. 
+ DiceTable Dice; + Dice.push_back(std::make_pair(PC, DiceRef())); + dice_table_iterator DTI = + std::search(Dices.begin(), Dices.end(), Dice.begin(), Dice.end(), + compareDiceTableEntries); + if (DTI != Dices.end()) { + uint16_t Length; + DTI->second.getLength(Length); + uint16_t Kind; + DTI->second.getKind(Kind); + InstSize = DumpDataInCode(bytes, Length, Kind); + if ((Kind == MachO::DICE_KIND_JUMP_TABLE8) && + (PC == (DTI->first + Length - 1)) && (Length & 1)) + InstSize++; + return true; + } + return false; +} + static void printRelocationTargetName(const MachOObjectFile *O, const MachO::any_relocation_info &RE, raw_string_ostream &Fmt) { @@ -419,13 +446,11 @@ static void printRelocationTargetName(const MachOObjectFile *O, // If we couldn't find a symbol that this relocation refers to, try // to find a section beginning instead. for (const SectionRef &Section : ToolSectionFilter(*O)) { - StringRef Name; uint64_t Addr = Section.getAddress(); if (Addr != Val) continue; - if (std::error_code EC = Section.getName(Name)) - report_error(errorCodeToError(EC), O->getFileName()); - Fmt << Name; + StringRef NameOrErr = unwrapOrError(Section.getName(), O->getFileName()); + Fmt << NameOrErr; return; } @@ -458,10 +483,14 @@ static void printRelocationTargetName(const MachOObjectFile *O, --I; advance(SI, 1); } - if (SI == O->section_end()) + if (SI == O->section_end()) { Fmt << Val << " (?,?)"; - else - SI->getName(S); + } else { + if (Expected NameOrErr = SI->getName()) + S = *NameOrErr; + else + consumeError(NameOrErr.takeError()); + } } Fmt << S; @@ -504,8 +533,8 @@ Error getMachORelocationValueString(const MachOObjectFile *Obj, // NOTE: Scattered relocations don't exist on x86_64. unsigned RType = Obj->getAnyRelocationType(RENext); if (RType != MachO::X86_64_RELOC_UNSIGNED) - report_error(Obj->getFileName(), "Expected X86_64_RELOC_UNSIGNED after " - "X86_64_RELOC_SUBTRACTOR."); + reportError(Obj->getFileName(), "Expected X86_64_RELOC_UNSIGNED after " + "X86_64_RELOC_SUBTRACTOR."); // The X86_64_RELOC_UNSIGNED contains the minuend symbol; // X86_64_RELOC_SUBTRACTOR contains the subtrahend. @@ -553,8 +582,8 @@ Error getMachORelocationValueString(const MachOObjectFile *Obj, unsigned RType = Obj->getAnyRelocationType(RENext); if (RType != MachO::GENERIC_RELOC_PAIR) - report_error(Obj->getFileName(), "Expected GENERIC_RELOC_PAIR after " - "GENERIC_RELOC_SECTDIFF."); + reportError(Obj->getFileName(), "Expected GENERIC_RELOC_PAIR after " + "GENERIC_RELOC_SECTDIFF."); printRelocationTargetName(Obj, RE, Fmt); Fmt << "-"; @@ -574,8 +603,8 @@ Error getMachORelocationValueString(const MachOObjectFile *Obj, // GENERIC_RELOC_PAIR. unsigned RType = Obj->getAnyRelocationType(RENext); if (RType != MachO::GENERIC_RELOC_PAIR) - report_error(Obj->getFileName(), "Expected GENERIC_RELOC_PAIR after " - "GENERIC_RELOC_LOCAL_SECTDIFF."); + reportError(Obj->getFileName(), "Expected GENERIC_RELOC_PAIR after " + "GENERIC_RELOC_LOCAL_SECTDIFF."); printRelocationTargetName(Obj, RE, Fmt); Fmt << "-"; @@ -614,8 +643,8 @@ Error getMachORelocationValueString(const MachOObjectFile *Obj, // ARM_RELOC_PAIR. 
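Much of the llvm-objdump churn in this patch follows from SectionRef::getName() now returning Expected<StringRef>. The two consumption patterns used throughout, tolerant fall-back versus report-and-exit in the style of unwrapOrError, look roughly like this (the tool-name prefix and the exit behaviour are assumptions of the sketch):

// Sketch of the two Expected<StringRef> handling styles used above.
#include "llvm/Object/ObjectFile.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/WithColor.h"
#include "llvm/Support/raw_ostream.h"
#include <cstdlib>

using namespace llvm;
using namespace llvm::object;

// Tolerant form: fall back to an empty name and swallow the error.
static StringRef sectionNameOrEmpty(const SectionRef &Sec) {
  Expected<StringRef> NameOrErr = Sec.getName();
  if (NameOrErr)
    return *NameOrErr;
  consumeError(NameOrErr.takeError()); // Expected errors must be consumed
  return StringRef();
}

// Strict form: report and bail out, in the spirit of unwrapOrError(...).
static StringRef sectionNameOrDie(const SectionRef &Sec, StringRef FileName) {
  Expected<StringRef> NameOrErr = Sec.getName();
  if (!NameOrErr) {
    WithColor::error(errs(), "llvm-objdump")
        << "'" << FileName << "': " << toString(NameOrErr.takeError()) << "\n";
    exit(1);
  }
  return *NameOrErr;
}

The returned StringRef points into the object file's memory, so both helpers stay valid after the Expected goes out of scope.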
unsigned RType = Obj->getAnyRelocationType(RENext); if (RType != MachO::ARM_RELOC_PAIR) - report_error(Obj->getFileName(), "Expected ARM_RELOC_PAIR after " - "ARM_RELOC_HALF"); + reportError(Obj->getFileName(), "Expected ARM_RELOC_PAIR after " + "ARM_RELOC_HALF"); // NOTE: The half of the target virtual address is stashed in the // address field of the secondary relocation, but we can't reverse @@ -1501,7 +1530,12 @@ static void DumpLiteralPointerSection(MachOObjectFile *O, uint64_t SectSize = Sect->getSize(); StringRef SectName; - Sect->getName(SectName); + Expected SectNameOrErr = Sect->getName(); + if (SectNameOrErr) + SectName = *SectNameOrErr; + else + consumeError(SectNameOrErr.takeError()); + DataRefImpl Ref = Sect->getRawDataRefImpl(); StringRef SegmentName = O->getSectionFinalSegmentName(Ref); outs() << SegmentName << ":" << SectName << ":"; @@ -1713,7 +1747,12 @@ static void DumpSectionContents(StringRef Filename, MachOObjectFile *O, } for (const SectionRef &Section : O->sections()) { StringRef SectName; - Section.getName(SectName); + Expected SecNameOrErr = Section.getName(); + if (SecNameOrErr) + SectName = *SecNameOrErr; + else + consumeError(SecNameOrErr.takeError()); + DataRefImpl Ref = Section.getRawDataRefImpl(); StringRef SegName = O->getSectionFinalSegmentName(Ref); if ((DumpSegName.empty() || SegName == DumpSegName) && @@ -1809,7 +1848,12 @@ static void DumpInfoPlistSectionContents(StringRef Filename, MachOObjectFile *O) { for (const SectionRef &Section : O->sections()) { StringRef SectName; - Section.getName(SectName); + Expected SecNameOrErr = Section.getName(); + if (SecNameOrErr) + SectName = *SecNameOrErr; + else + consumeError(SecNameOrErr.takeError()); + DataRefImpl Ref = Section.getRawDataRefImpl(); StringRef SegName = O->getSectionFinalSegmentName(Ref); if (SegName == "__TEXT" && SectName == "__info_plist") { @@ -1901,12 +1945,16 @@ static void ProcessMachO(StringRef Name, MachOObjectFile *MachOOF, // the error message. 
if (Disassemble || IndirectSymbols || !FilterSections.empty() || UnwindInfo) if (Error Err = MachOOF->checkSymbolTable()) - report_error(std::move(Err), ArchiveName, FileName, ArchitectureName); + reportError(std::move(Err), FileName, ArchiveName, ArchitectureName); if (DisassembleAll) { for (const SectionRef &Section : MachOOF->sections()) { StringRef SectName; - Section.getName(SectName); + if (Expected NameOrErr = Section.getName()) + SectName = *NameOrErr; + else + consumeError(NameOrErr.takeError()); + if (SectName.equals("__text")) { DataRefImpl Ref = Section.getRawDataRefImpl(); StringRef SegName = MachOOF->getSectionFinalSegmentName(Ref); @@ -2151,7 +2199,7 @@ static void printMachOUniversalHeaders(const object::MachOUniversalBinary *UB, outs() << " offset " << OFA.getOffset(); if (OFA.getOffset() > size) outs() << " (past end of file)"; - if (OFA.getOffset() % (1 << OFA.getAlign()) != 0) + if (OFA.getOffset() % (1ull << OFA.getAlign()) != 0) outs() << " (not aligned on it's alignment (2^" << OFA.getAlign() << ")"; outs() << "\n"; outs() << " size " << OFA.getSize(); @@ -2165,12 +2213,14 @@ static void printMachOUniversalHeaders(const object::MachOUniversalBinary *UB, } static void printArchiveChild(StringRef Filename, const Archive::Child &C, - bool verbose, bool print_offset, + size_t ChildIndex, bool verbose, + bool print_offset, StringRef ArchitectureName = StringRef()) { if (print_offset) outs() << C.getChildOffset() << "\t"; sys::fs::perms Mode = - unwrapOrError(C.getAccessMode(), Filename, C, ArchitectureName); + unwrapOrError(C.getAccessMode(), getFileNameForError(C, ChildIndex), + Filename, ArchitectureName); if (verbose) { // FIXME: this first dash, "-", is for (Mode & S_IFMT) == S_IFREG. // But there is nothing in sys::fs::perms for S_IFMT or S_IFREG. 
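printArchiveChild and the archive loops now carry an explicit child index so that a member whose name cannot be read can still be identified by position in diagnostics. A sketch of that idiom, modeled on the patch's getFileNameForError; the exact fallback string is illustrative:

// Indexed archive-child iteration with a positional fallback name.
#include "llvm/Object/Archive.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/raw_ostream.h"
#include <string>

using namespace llvm;
using namespace llvm::object;

// Modeled on getFileNameForError() from the patch.
static std::string memberNameForError(const Archive::Child &C,
                                      unsigned Index) {
  Expected<StringRef> NameOrErr = C.getName();
  if (NameOrErr)
    return NameOrErr->str();
  // Already in an error path, so the name error itself is dropped.
  consumeError(NameOrErr.takeError());
  return "<member " + std::to_string(Index) + ">"; // illustrative fallback
}

static Error listMembers(const Archive &A) {
  Error Err = Error::success();
  unsigned Index = 0;
  for (const Archive::Child &C : A.children(Err))
    outs() << memberNameForError(C, Index++) << "\n";
  // children() reports iteration failures through Err after the loop.
  return Err;
}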
@@ -2188,11 +2238,14 @@ static void printArchiveChild(StringRef Filename, const Archive::Child &C, outs() << format("0%o ", Mode); } - outs() << format( - "%3d/%-3d %5" PRId64 " ", - unwrapOrError(C.getUID(), Filename, C, ArchitectureName), - unwrapOrError(C.getGID(), Filename, C, ArchitectureName), - unwrapOrError(C.getRawSize(), Filename, C, ArchitectureName)); + outs() << format("%3d/%-3d %5" PRId64 " ", + unwrapOrError(C.getUID(), getFileNameForError(C, ChildIndex), + Filename, ArchitectureName), + unwrapOrError(C.getGID(), getFileNameForError(C, ChildIndex), + Filename, ArchitectureName), + unwrapOrError(C.getRawSize(), + getFileNameForError(C, ChildIndex), Filename, + ArchitectureName)); StringRef RawLastModified = C.getRawLastModified(); if (verbose) { @@ -2215,14 +2268,17 @@ static void printArchiveChild(StringRef Filename, const Archive::Child &C, Expected NameOrErr = C.getName(); if (!NameOrErr) { consumeError(NameOrErr.takeError()); - outs() << unwrapOrError(C.getRawName(), Filename, C, ArchitectureName) + outs() << unwrapOrError(C.getRawName(), + getFileNameForError(C, ChildIndex), Filename, + ArchitectureName) << "\n"; } else { StringRef Name = NameOrErr.get(); outs() << Name << "\n"; } } else { - outs() << unwrapOrError(C.getRawName(), Filename, C, ArchitectureName) + outs() << unwrapOrError(C.getRawName(), getFileNameForError(C, ChildIndex), + Filename, ArchitectureName) << "\n"; } } @@ -2231,11 +2287,13 @@ static void printArchiveHeaders(StringRef Filename, Archive *A, bool verbose, bool print_offset, StringRef ArchitectureName = StringRef()) { Error Err = Error::success(); + size_t I = 0; for (const auto &C : A->children(Err, false)) - printArchiveChild(Filename, C, verbose, print_offset, ArchitectureName); + printArchiveChild(Filename, C, I++, verbose, print_offset, + ArchitectureName); if (Err) - report_error(std::move(Err), StringRef(), Filename, ArchitectureName); + reportError(std::move(Err), Filename, "", ArchitectureName); } static bool ValidateArchFlags() { @@ -2267,7 +2325,7 @@ void parseInputMachO(StringRef Filename) { Expected> BinaryOrErr = createBinary(Filename); if (!BinaryOrErr) { if (Error E = isNotObjectErrorInvalidFileType(BinaryOrErr.takeError())) - report_error(std::move(E), Filename); + reportError(std::move(E), Filename); else outs() << Filename << ": is not an object file\n"; return; @@ -2280,11 +2338,13 @@ void parseInputMachO(StringRef Filename) { printArchiveHeaders(Filename, A, !NonVerbose, ArchiveMemberOffsets); Error Err = Error::success(); + unsigned I = -1; for (auto &C : A->children(Err)) { + ++I; Expected> ChildOrErr = C.getAsBinary(); if (!ChildOrErr) { if (Error E = isNotObjectErrorInvalidFileType(ChildOrErr.takeError())) - report_error(std::move(E), Filename, C); + reportError(std::move(E), getFileNameForError(C, I), Filename); continue; } if (MachOObjectFile *O = dyn_cast(&*ChildOrErr.get())) { @@ -2294,7 +2354,7 @@ void parseInputMachO(StringRef Filename) { } } if (Err) - report_error(std::move(Err), Filename); + reportError(std::move(Err), Filename); return; } if (MachOUniversalBinary *UB = dyn_cast(&Bin)) { @@ -2346,7 +2406,7 @@ void parseInputMachO(MachOUniversalBinary *UB) { ProcessMachO(Filename, MachOOF, "", ArchitectureName); } else if (Error E = isNotObjectErrorInvalidFileType( ObjOrErr.takeError())) { - report_error(std::move(E), Filename, StringRef(), ArchitectureName); + reportError(std::move(E), "", Filename, ArchitectureName); continue; } else if (Expected> AOrErr = I->getAsArchive()) { @@ -2359,11 +2419,15 @@ void 
parseInputMachO(MachOUniversalBinary *UB) { printArchiveHeaders(Filename, A.get(), !NonVerbose, ArchiveMemberOffsets, ArchitectureName); Error Err = Error::success(); + unsigned I = -1; for (auto &C : A->children(Err)) { + ++I; Expected> ChildOrErr = C.getAsBinary(); if (!ChildOrErr) { - if (Error E = isNotObjectErrorInvalidFileType(ChildOrErr.takeError())) - report_error(std::move(E), Filename, C, ArchitectureName); + if (Error E = + isNotObjectErrorInvalidFileType(ChildOrErr.takeError())) + reportError(std::move(E), getFileNameForError(C, I), Filename, + ArchitectureName); continue; } if (MachOObjectFile *O = @@ -2371,12 +2435,13 @@ void parseInputMachO(MachOUniversalBinary *UB) { ProcessMachO(Filename, O, O->getFileName(), ArchitectureName); } if (Err) - report_error(std::move(Err), Filename); + reportError(std::move(Err), Filename); } else { consumeError(AOrErr.takeError()); - error("Mach-O universal file: " + Filename + " for " + - "architecture " + StringRef(I->getArchFlagName()) + - " is not a Mach-O file or an archive file"); + reportError(Filename, + "Mach-O universal file for architecture " + + StringRef(I->getArchFlagName()) + + " is not a Mach-O file or an archive file"); } } } @@ -2406,7 +2471,7 @@ void parseInputMachO(MachOUniversalBinary *UB) { ProcessMachO(Filename, MachOOF); } else if (Error E = isNotObjectErrorInvalidFileType(ObjOrErr.takeError())) { - report_error(std::move(E), Filename); + reportError(std::move(E), Filename); } else if (Expected> AOrErr = I->getAsArchive()) { std::unique_ptr &A = *AOrErr; @@ -2415,12 +2480,14 @@ void parseInputMachO(MachOUniversalBinary *UB) { printArchiveHeaders(Filename, A.get(), !NonVerbose, ArchiveMemberOffsets); Error Err = Error::success(); + unsigned I = -1; for (auto &C : A->children(Err)) { + ++I; Expected> ChildOrErr = C.getAsBinary(); if (!ChildOrErr) { if (Error E = isNotObjectErrorInvalidFileType(ChildOrErr.takeError())) - report_error(std::move(E), Filename, C); + reportError(std::move(E), getFileNameForError(C, I), Filename); continue; } if (MachOObjectFile *O = @@ -2428,12 +2495,12 @@ void parseInputMachO(MachOUniversalBinary *UB) { ProcessMachO(Filename, O, O->getFileName()); } if (Err) - report_error(std::move(Err), Filename); + reportError(std::move(Err), Filename); } else { consumeError(AOrErr.takeError()); - error("Mach-O universal file: " + Filename + " for architecture " + - StringRef(I->getArchFlagName()) + - " is not a Mach-O file or an archive file"); + reportError(Filename, "Mach-O universal file for architecture " + + StringRef(I->getArchFlagName()) + + " is not a Mach-O file or an archive file"); } return; } @@ -2455,7 +2522,7 @@ void parseInputMachO(MachOUniversalBinary *UB) { ProcessMachO(Filename, MachOOF, "", ArchitectureName); } else if (Error E = isNotObjectErrorInvalidFileType(ObjOrErr.takeError())) { - report_error(std::move(E), StringRef(), Filename, ArchitectureName); + reportError(std::move(E), Filename, "", ArchitectureName); } else if (Expected> AOrErr = I->getAsArchive()) { std::unique_ptr &A = *AOrErr; outs() << "Archive : " << Filename; @@ -2466,11 +2533,14 @@ void parseInputMachO(MachOUniversalBinary *UB) { printArchiveHeaders(Filename, A.get(), !NonVerbose, ArchiveMemberOffsets, ArchitectureName); Error Err = Error::success(); + unsigned I = -1; for (auto &C : A->children(Err)) { + ++I; Expected> ChildOrErr = C.getAsBinary(); if (!ChildOrErr) { if (Error E = isNotObjectErrorInvalidFileType(ChildOrErr.takeError())) - report_error(std::move(E), Filename, C, ArchitectureName); + 
reportError(std::move(E), getFileNameForError(C, I), Filename, + ArchitectureName); continue; } if (MachOObjectFile *O = @@ -2481,12 +2551,12 @@ void parseInputMachO(MachOUniversalBinary *UB) { } } if (Err) - report_error(std::move(Err), Filename); + reportError(std::move(Err), Filename); } else { consumeError(AOrErr.takeError()); - error("Mach-O universal file: " + Filename + " for architecture " + - StringRef(I->getArchFlagName()) + - " is not a Mach-O file or an archive file"); + reportError(Filename, "Mach-O universal file for architecture " + + StringRef(I->getArchFlagName()) + + " is not a Mach-O file or an archive file"); } } } @@ -3083,7 +3153,7 @@ static void method_reference(struct DisassembleInfo *info, if (strcmp(*ReferenceName, "_objc_msgSend") == 0) { if (info->selector_name != nullptr) { if (info->class_name != nullptr) { - info->method = llvm::make_unique( + info->method = std::make_unique( 5 + strlen(info->class_name) + strlen(info->selector_name)); char *method = info->method.get(); if (method != nullptr) { @@ -3097,7 +3167,7 @@ static void method_reference(struct DisassembleInfo *info, } } else { info->method = - llvm::make_unique(9 + strlen(info->selector_name)); + std::make_unique(9 + strlen(info->selector_name)); char *method = info->method.get(); if (method != nullptr) { if (Arch == Triple::x86_64) @@ -3117,7 +3187,7 @@ static void method_reference(struct DisassembleInfo *info, } else if (strcmp(*ReferenceName, "_objc_msgSendSuper2") == 0) { if (info->selector_name != nullptr) { info->method = - llvm::make_unique(17 + strlen(info->selector_name)); + std::make_unique(17 + strlen(info->selector_name)); char *method = info->method.get(); if (method != nullptr) { if (Arch == Triple::x86_64) @@ -3217,7 +3287,13 @@ static const char *get_pointer_64(uint64_t Address, uint32_t &offset, continue; if (objc_only) { StringRef SectName; - ((*(info->Sections))[SectIdx]).getName(SectName); + Expected SecNameOrErr = + ((*(info->Sections))[SectIdx]).getName(); + if (SecNameOrErr) + SectName = *SecNameOrErr; + else + consumeError(SecNameOrErr.takeError()); + DataRefImpl Ref = ((*(info->Sections))[SectIdx]).getRawDataRefImpl(); StringRef SegName = info->O->getSectionFinalSegmentName(Ref); if (SegName != "__OBJC" && SectName != "__cstring") @@ -4009,7 +4085,12 @@ static const SectionRef get_section(MachOObjectFile *O, const char *segname, const char *sectname) { for (const SectionRef &Section : O->sections()) { StringRef SectName; - Section.getName(SectName); + Expected SecNameOrErr = Section.getName(); + if (SecNameOrErr) + SectName = *SecNameOrErr; + else + consumeError(SecNameOrErr.takeError()); + DataRefImpl Ref = Section.getRawDataRefImpl(); StringRef SegName = O->getSectionFinalSegmentName(Ref); if (SegName == segname && SectName == sectname) @@ -4026,7 +4107,12 @@ walk_pointer_list_64(const char *listname, const SectionRef S, return; StringRef SectName; - S.getName(SectName); + Expected SecNameOrErr = S.getName(); + if (SecNameOrErr) + SectName = *SecNameOrErr; + else + consumeError(SecNameOrErr.takeError()); + DataRefImpl Ref = S.getRawDataRefImpl(); StringRef SegName = O->getSectionFinalSegmentName(Ref); outs() << "Contents of (" << SegName << "," << SectName << ") section\n"; @@ -4075,8 +4161,7 @@ walk_pointer_list_32(const char *listname, const SectionRef S, if (S == SectionRef()) return; - StringRef SectName; - S.getName(SectName); + StringRef SectName = unwrapOrError(S.getName(), O->getFileName()); DataRefImpl Ref = S.getRawDataRefImpl(); StringRef SegName = 
O->getSectionFinalSegmentName(Ref); outs() << "Contents of (" << SegName << "," << SectName << ") section\n"; @@ -5750,7 +5835,12 @@ static void print_message_refs64(SectionRef S, struct DisassembleInfo *info) { return; StringRef SectName; - S.getName(SectName); + Expected SecNameOrErr = S.getName(); + if (SecNameOrErr) + SectName = *SecNameOrErr; + else + consumeError(SecNameOrErr.takeError()); + DataRefImpl Ref = S.getRawDataRefImpl(); StringRef SegName = info->O->getSectionFinalSegmentName(Ref); outs() << "Contents of (" << SegName << "," << SectName << ") section\n"; @@ -5813,7 +5903,12 @@ static void print_message_refs32(SectionRef S, struct DisassembleInfo *info) { return; StringRef SectName; - S.getName(SectName); + Expected SecNameOrErr = S.getName(); + if (SecNameOrErr) + SectName = *SecNameOrErr; + else + consumeError(SecNameOrErr.takeError()); + DataRefImpl Ref = S.getRawDataRefImpl(); StringRef SegName = info->O->getSectionFinalSegmentName(Ref); outs() << "Contents of (" << SegName << "," << SectName << ") section\n"; @@ -5859,7 +5954,12 @@ static void print_image_info64(SectionRef S, struct DisassembleInfo *info) { return; StringRef SectName; - S.getName(SectName); + Expected SecNameOrErr = S.getName(); + if (SecNameOrErr) + SectName = *SecNameOrErr; + else + consumeError(SecNameOrErr.takeError()); + DataRefImpl Ref = S.getRawDataRefImpl(); StringRef SegName = info->O->getSectionFinalSegmentName(Ref); outs() << "Contents of (" << SegName << "," << SectName << ") section\n"; @@ -5916,7 +6016,12 @@ static void print_image_info32(SectionRef S, struct DisassembleInfo *info) { return; StringRef SectName; - S.getName(SectName); + Expected SecNameOrErr = S.getName(); + if (SecNameOrErr) + SectName = *SecNameOrErr; + else + consumeError(SecNameOrErr.takeError()); + DataRefImpl Ref = S.getRawDataRefImpl(); StringRef SegName = info->O->getSectionFinalSegmentName(Ref); outs() << "Contents of (" << SegName << "," << SectName << ") section\n"; @@ -5966,7 +6071,12 @@ static void print_image_info(SectionRef S, struct DisassembleInfo *info) { const char *r; StringRef SectName; - S.getName(SectName); + Expected SecNameOrErr = S.getName(); + if (SecNameOrErr) + SectName = *SecNameOrErr; + else + consumeError(SecNameOrErr.takeError()); + DataRefImpl Ref = S.getRawDataRefImpl(); StringRef SegName = info->O->getSectionFinalSegmentName(Ref); outs() << "Contents of (" << SegName << "," << SectName << ") section\n"; @@ -6001,11 +6111,8 @@ static void printObjc2_64bit_MetaData(MachOObjectFile *O, bool verbose) { CreateSymbolAddressMap(O, &AddrMap); std::vector Sections; - for (const SectionRef &Section : O->sections()) { - StringRef SectName; - Section.getName(SectName); + for (const SectionRef &Section : O->sections()) Sections.push_back(Section); - } struct DisassembleInfo info(O, &AddrMap, &Sections, verbose); @@ -6086,11 +6193,8 @@ static void printObjc2_32bit_MetaData(MachOObjectFile *O, bool verbose) { CreateSymbolAddressMap(O, &AddrMap); std::vector Sections; - for (const SectionRef &Section : O->sections()) { - StringRef SectName; - Section.getName(SectName); + for (const SectionRef &Section : O->sections()) Sections.push_back(Section); - } struct DisassembleInfo info(O, &AddrMap, &Sections, verbose); @@ -6184,11 +6288,8 @@ static bool printObjc1_32bit_MetaData(MachOObjectFile *O, bool verbose) { CreateSymbolAddressMap(O, &AddrMap); std::vector Sections; - for (const SectionRef &Section : O->sections()) { - StringRef SectName; - Section.getName(SectName); + for (const SectionRef &Section : 
O->sections()) Sections.push_back(Section); - } struct DisassembleInfo info(O, &AddrMap, &Sections, verbose); @@ -6345,11 +6446,8 @@ static void DumpProtocolSection(MachOObjectFile *O, const char *sect, CreateSymbolAddressMap(O, &AddrMap); std::vector Sections; - for (const SectionRef &Section : O->sections()) { - StringRef SectName; - Section.getName(SectName); + for (const SectionRef &Section : O->sections()) Sections.push_back(Section); - } struct DisassembleInfo info(O, &AddrMap, &Sections, true); @@ -7203,7 +7301,7 @@ static void DisassembleMachO(StringRef Filename, MachOObjectFile *MachOOF, std::vector Sections; std::vector Symbols; SmallVector FoundFns; - uint64_t BaseSegmentAddress; + uint64_t BaseSegmentAddress = 0; getSectionsAndSymbols(MachOOF, Sections, Symbols, FoundFns, BaseSegmentAddress); @@ -7242,10 +7340,24 @@ static void DisassembleMachO(StringRef Filename, MachOObjectFile *MachOOF, // A separate DSym file path was specified, parse it as a macho file, // get the sections and supply it to the section name parsing machinery. if (!DSYMFile.empty()) { + std::string DSYMPath(DSYMFile); + + // If DSYMPath is a .dSYM directory, append the Mach-O file. + if (llvm::sys::fs::is_directory(DSYMPath) && + llvm::sys::path::extension(DSYMPath) == ".dSYM") { + SmallString<128> ShortName(llvm::sys::path::filename(DSYMPath)); + llvm::sys::path::replace_extension(ShortName, ""); + SmallString<1024> FullPath(DSYMPath); + llvm::sys::path::append(FullPath, "Contents", "Resources", "DWARF", + ShortName); + DSYMPath = FullPath.str(); + } + + // Load the file. ErrorOr> BufOrErr = - MemoryBuffer::getFileOrSTDIN(DSYMFile); + MemoryBuffer::getFileOrSTDIN(DSYMPath); if (std::error_code EC = BufOrErr.getError()) { - report_error(errorCodeToError(EC), DSYMFile); + reportError(errorCodeToError(EC), DSYMPath); return; } @@ -7255,13 +7367,12 @@ static void DisassembleMachO(StringRef Filename, MachOObjectFile *MachOOF, Expected> BinaryOrErr = createBinary(DSYMBuf.get()->getMemBufferRef()); if (!BinaryOrErr) { - report_error(BinaryOrErr.takeError(), DSYMFile); + reportError(BinaryOrErr.takeError(), DSYMPath); return; } - // We need to keep the Binary elive with the buffer + // We need to keep the Binary alive with the buffer DSYMBinary = std::move(BinaryOrErr.get()); - if (ObjectFile *O = dyn_cast(DSYMBinary.get())) { // this is a Mach-O object file, use it if (MachOObjectFile *MachDSYM = dyn_cast(&*O)) { @@ -7269,7 +7380,7 @@ static void DisassembleMachO(StringRef Filename, MachOObjectFile *MachOOF, } else { WithColor::error(errs(), "llvm-objdump") - << DSYMFile << " is not a Mach-O file type.\n"; + << DSYMPath << " is not a Mach-O file type.\n"; return; } } @@ -7289,19 +7400,19 @@ static void DisassembleMachO(StringRef Filename, MachOObjectFile *MachOOF, Triple T = MachOObjectFile::getArchTriple(CPUType, CPUSubType, nullptr, &ArchFlag); Expected> MachDSYM = - UB->getObjectForArch(ArchFlag); + UB->getMachOObjectForArch(ArchFlag); if (!MachDSYM) { - report_error(MachDSYM.takeError(), DSYMFile); + reportError(MachDSYM.takeError(), DSYMPath); return; } - // We need to keep the Binary elive with the buffer + // We need to keep the Binary alive with the buffer DbgObj = &*MachDSYM.get(); DSYMBinary = std::move(*MachDSYM); } else { WithColor::error(errs(), "llvm-objdump") - << DSYMFile << " is not a Mach-O or Universal file type.\n"; + << DSYMPath << " is not a Mach-O or Universal file type.\n"; return; } } @@ -7314,8 +7425,12 @@ static void DisassembleMachO(StringRef Filename, MachOObjectFile *MachOOF, outs() 
<< "(" << DisSegName << "," << DisSectName << ") section\n"; for (unsigned SectIdx = 0; SectIdx != Sections.size(); SectIdx++) { - StringRef SectName; - if (Sections[SectIdx].getName(SectName) || SectName != DisSectName) + Expected SecNameOrErr = Sections[SectIdx].getName(); + if (!SecNameOrErr) { + consumeError(SecNameOrErr.takeError()); + continue; + } + if (*SecNameOrErr != DisSectName) continue; DataRefImpl DR = Sections[SectIdx].getRawDataRefImpl(); @@ -7496,24 +7611,8 @@ static void DisassembleMachO(StringRef Filename, MachOObjectFile *MachOOF, if (!NoShowRawInsn || Arch == Triple::arm) outs() << "\t"; - // Check the data in code table here to see if this is data not an - // instruction to be disassembled. - DiceTable Dice; - Dice.push_back(std::make_pair(PC, DiceRef())); - dice_table_iterator DTI = - std::search(Dices.begin(), Dices.end(), Dice.begin(), Dice.end(), - compareDiceTableEntries); - if (DTI != Dices.end()) { - uint16_t Length; - DTI->second.getLength(Length); - uint16_t Kind; - DTI->second.getKind(Kind); - Size = DumpDataInCode(Bytes.data() + Index, Length, Kind); - if ((Kind == MachO::DICE_KIND_JUMP_TABLE8) && - (PC == (DTI->first + Length - 1)) && (Length & 1)) - Size++; + if (DumpAndSkipDataInCode(PC, Bytes.data() + Index, Dices, Size)) continue; - } SmallVector AnnotationsBytes; raw_svector_ostream Annotations(AnnotationsBytes); @@ -7588,6 +7687,10 @@ static void DisassembleMachO(StringRef Filename, MachOObjectFile *MachOOF, MCInst Inst; uint64_t PC = SectAddress + Index; + + if (DumpAndSkipDataInCode(PC, Bytes.data() + Index, Dices, InstSize)) + continue; + SmallVector AnnotationsBytes; raw_svector_ostream Annotations(AnnotationsBytes); if (DisAsm->getInstruction(Inst, InstSize, Bytes.slice(Index), PC, @@ -7724,8 +7827,12 @@ static void findUnwindRelocNameAddend(const MachOObjectFile *Obj, auto Sym = Symbols.upper_bound(Addr); if (Sym == Symbols.begin()) { // The first symbol in the object is after this reference, the best we can - // do is section-relative notation. - RelocSection.getName(Name); + // do is section-relative notation. + if (Expected NameOrErr = RelocSection.getName()) + Name = *NameOrErr; + else + consumeError(NameOrErr.takeError()); + Addend = Addr - SectionAddr; return; } @@ -7744,7 +7851,11 @@ static void findUnwindRelocNameAddend(const MachOObjectFile *Obj, // There is a symbol before this reference, but it's in a different // section. Probably not helpful to mention it, so use the section name. 
- RelocSection.getName(Name); + if (Expected NameOrErr = RelocSection.getName()) + Name = *NameOrErr; + else + consumeError(NameOrErr.takeError()); + Addend = Addr - SectionAddr; } @@ -8109,7 +8220,11 @@ void printMachOUnwindInfo(const MachOObjectFile *Obj) { for (const SectionRef &Section : Obj->sections()) { StringRef SectName; - Section.getName(SectName); + if (Expected NameOrErr = Section.getName()) + SectName = *NameOrErr; + else + consumeError(NameOrErr.takeError()); + if (SectName == "__compact_unwind") printMachOCompactUnwindSection(Obj, Symbols, Section); else if (SectName == "__unwind_info") @@ -10191,7 +10306,7 @@ void printMachOExportsTrie(const object::MachOObjectFile *Obj) { outs() << "\n"; } if (Err) - report_error(std::move(Err), Obj->getFileName()); + reportError(std::move(Err), Obj->getFileName()); } //===----------------------------------------------------------------------===// @@ -10212,7 +10327,7 @@ void printMachORebaseTable(object::MachOObjectFile *Obj) { Address, Entry.typeName().str().c_str()); } if (Err) - report_error(std::move(Err), Obj->getFileName()); + reportError(std::move(Err), Obj->getFileName()); } static StringRef ordinalName(const object::MachOObjectFile *Obj, int Ordinal) { @@ -10264,7 +10379,7 @@ void printMachOBindTable(object::MachOObjectFile *Obj) { << Entry.symbolName() << Attr << "\n"; } if (Err) - report_error(std::move(Err), Obj->getFileName()); + reportError(std::move(Err), Obj->getFileName()); } //===----------------------------------------------------------------------===// @@ -10289,7 +10404,7 @@ void printMachOLazyBindTable(object::MachOObjectFile *Obj) { << Entry.symbolName() << "\n"; } if (Err) - report_error(std::move(Err), Obj->getFileName()); + reportError(std::move(Err), Obj->getFileName()); } //===----------------------------------------------------------------------===// @@ -10321,7 +10436,7 @@ void printMachOWeakBindTable(object::MachOObjectFile *Obj) { << "\n"; } if (Err) - report_error(std::move(Err), Obj->getFileName()); + reportError(std::move(Err), Obj->getFileName()); } // get_dyld_bind_info_symbolname() is used for disassembly and passed an @@ -10331,7 +10446,7 @@ void printMachOWeakBindTable(object::MachOObjectFile *Obj) { static const char *get_dyld_bind_info_symbolname(uint64_t ReferenceValue, struct DisassembleInfo *info) { if (info->bindtable == nullptr) { - info->bindtable = llvm::make_unique(); + info->bindtable = std::make_unique(); Error Err = Error::success(); for (const object::MachOBindEntry &Entry : info->O->bindTable(Err)) { uint64_t Address = Entry.address(); @@ -10340,7 +10455,7 @@ static const char *get_dyld_bind_info_symbolname(uint64_t ReferenceValue, (*info->bindtable)[Address] = name; } if (Err) - report_error(std::move(Err), info->O->getFileName()); + reportError(std::move(Err), info->O->getFileName()); } auto name = info->bindtable->lookup(ReferenceValue); return !name.empty() ? 
name.data() : nullptr; diff --git a/tools/llvm-objdump/llvm-objdump.cpp b/tools/llvm-objdump/llvm-objdump.cpp index 58981203c59e..34a44b3b7fa9 100644 --- a/tools/llvm-objdump/llvm-objdump.cpp +++ b/tools/llvm-objdump/llvm-objdump.cpp @@ -51,6 +51,7 @@ #include "llvm/Support/Errc.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/Format.h" +#include "llvm/Support/FormatVariadic.h" #include "llvm/Support/GraphWriter.h" #include "llvm/Support/Host.h" #include "llvm/Support/InitLLVM.h" @@ -341,78 +342,84 @@ static StringRef ToolName; typedef std::vector> SectionSymbolsTy; -static bool shouldKeep(object::SectionRef S) { +namespace { +struct FilterResult { + // True if the section should not be skipped. + bool Keep; + + // True if the index counter should be incremented, even if the section should + // be skipped. For example, sections may be skipped if they are not included + // in the --section flag, but we still want those to count toward the section + // count. + bool IncrementIndex; +}; +} // namespace + +static FilterResult checkSectionFilter(object::SectionRef S) { if (FilterSections.empty()) - return true; - StringRef SecName; - std::error_code error = S.getName(SecName); - if (error) - return false; + return {/*Keep=*/true, /*IncrementIndex=*/true}; + + Expected SecNameOrErr = S.getName(); + if (!SecNameOrErr) { + consumeError(SecNameOrErr.takeError()); + return {/*Keep=*/false, /*IncrementIndex=*/false}; + } + StringRef SecName = *SecNameOrErr; + // StringSet does not allow empty key so avoid adding sections with // no name (such as the section with index 0) here. if (!SecName.empty()) FoundSectionSet.insert(SecName); - return is_contained(FilterSections, SecName); -} -SectionFilter ToolSectionFilter(object::ObjectFile const &O) { - return SectionFilter([](object::SectionRef S) { return shouldKeep(S); }, O); -} - -void error(std::error_code EC) { - if (!EC) - return; - WithColor::error(errs(), ToolName) - << "reading file: " << EC.message() << ".\n"; - errs().flush(); - exit(1); -} - -void error(Error E) { - if (!E) - return; - WithColor::error(errs(), ToolName) << toString(std::move(E)); - exit(1); + // Only show the section if it's in the FilterSections list, but always + // increment so the indexing is stable. + return {/*Keep=*/is_contained(FilterSections, SecName), + /*IncrementIndex=*/true}; } -LLVM_ATTRIBUTE_NORETURN void error(Twine Message) { - WithColor::error(errs(), ToolName) << Message << ".\n"; - errs().flush(); - exit(1); +SectionFilter ToolSectionFilter(object::ObjectFile const &O, uint64_t *Idx) { + // Start at UINT64_MAX so that the first index returned after an increment is + // zero (after the unsigned wrap). + if (Idx) + *Idx = UINT64_MAX; + return SectionFilter( + [Idx](object::SectionRef S) { + FilterResult Result = checkSectionFilter(S); + if (Idx != nullptr && Result.IncrementIndex) + *Idx += 1; + return Result.Keep; + }, + O); } -void warn(StringRef Message) { - WithColor::warning(errs(), ToolName) << Message << ".\n"; - errs().flush(); +std::string getFileNameForError(const object::Archive::Child &C, + unsigned Index) { + Expected NameOrErr = C.getName(); + if (NameOrErr) + return NameOrErr.get(); + // If we have an error getting the name then we print the index of the archive + // member. Since we are already in an error state, we just ignore this error. 
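ToolSectionFilter gains an optional index counter: the counter starts at UINT64_MAX so the first counted section wraps to index 0, and sections hidden by --section can still advance it so numbering stays stable. A stripped-down sketch with plain placeholders instead of object::SectionRef and the tool's SectionFilter type:

// Counting filter sketch; the predicate and section type are placeholders.
#include <cstdint>
#include <functional>

struct FilterResult {
  bool Keep;           // print this section
  bool IncrementIndex; // still counts toward the section index
};

using SectionPredicate = std::function<FilterResult(unsigned SectionNo)>;

static std::function<bool(unsigned)>
makeCountingFilter(SectionPredicate Check, uint64_t *Idx) {
  if (Idx)
    *Idx = UINT64_MAX; // first increment wraps to 0
  return [Check, Idx](unsigned SectionNo) {
    FilterResult Result = Check(SectionNo);
    if (Idx && Result.IncrementIndex)
      *Idx += 1;
    return Result.Keep;
  };
}

Keeping Keep and IncrementIndex separate is what lets unnamed or filtered-out sections be skipped without disturbing the indices of the sections that follow.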
+ consumeError(NameOrErr.takeError()); + return ""; } -static void warn(Twine Message) { +void reportWarning(Twine Message, StringRef File) { // Output order between errs() and outs() matters especially for archive // files where the output is per member object. outs().flush(); - WithColor::warning(errs(), ToolName) << Message << "\n"; + WithColor::warning(errs(), ToolName) + << "'" << File << "': " << Message << "\n"; errs().flush(); } -LLVM_ATTRIBUTE_NORETURN void report_error(StringRef File, Twine Message) { - WithColor::error(errs(), ToolName) - << "'" << File << "': " << Message << ".\n"; - exit(1); -} - -LLVM_ATTRIBUTE_NORETURN void report_error(Error E, StringRef File) { - assert(E); - std::string Buf; - raw_string_ostream OS(Buf); - logAllUnhandledErrors(std::move(E), OS); - OS.flush(); - WithColor::error(errs(), ToolName) << "'" << File << "': " << Buf; +LLVM_ATTRIBUTE_NORETURN void reportError(StringRef File, Twine Message) { + WithColor::error(errs(), ToolName) << "'" << File << "': " << Message << "\n"; exit(1); } -LLVM_ATTRIBUTE_NORETURN void report_error(Error E, StringRef ArchiveName, - StringRef FileName, - StringRef ArchitectureName) { +LLVM_ATTRIBUTE_NORETURN void reportError(Error E, StringRef FileName, + StringRef ArchiveName, + StringRef ArchitectureName) { assert(E); WithColor::error(errs(), ToolName); if (ArchiveName != "") @@ -429,18 +436,13 @@ LLVM_ATTRIBUTE_NORETURN void report_error(Error E, StringRef ArchiveName, exit(1); } -LLVM_ATTRIBUTE_NORETURN void report_error(Error E, StringRef ArchiveName, - const object::Archive::Child &C, - StringRef ArchitectureName) { - Expected NameOrErr = C.getName(); - // TODO: if we have a error getting the name then it would be nice to print - // the index of which archive member this is and or its offset in the - // archive instead of "???" as the name. - if (!NameOrErr) { - consumeError(NameOrErr.takeError()); - report_error(std::move(E), ArchiveName, "???", ArchitectureName); - } else - report_error(std::move(E), ArchiveName, NameOrErr.get(), ArchitectureName); +static void reportCmdLineWarning(Twine Message) { + WithColor::warning(errs(), ToolName) << Message << "\n"; +} + +LLVM_ATTRIBUTE_NORETURN static void reportCmdLineError(Twine Message) { + WithColor::error(errs(), ToolName) << Message << "\n"; + exit(1); } static void warnOnNoMatchForSections() { @@ -455,37 +457,29 @@ static void warnOnNoMatchForSections() { // Warn only if no section in FilterSections is matched. for (StringRef S : MissingSections) - warn("section '" + S + "' mentioned in a -j/--section option, but not " - "found in any input file"); + reportCmdLineWarning("section '" + S + + "' mentioned in a -j/--section option, but not " + "found in any input file"); } -static const Target *getTarget(const ObjectFile *Obj = nullptr) { +static const Target *getTarget(const ObjectFile *Obj) { // Figure out the target triple. Triple TheTriple("unknown-unknown-unknown"); if (TripleName.empty()) { - if (Obj) - TheTriple = Obj->makeTriple(); + TheTriple = Obj->makeTriple(); } else { TheTriple.setTriple(Triple::normalize(TripleName)); - - // Use the triple, but also try to combine with ARM build attributes. - if (Obj) { - auto Arch = Obj->getArch(); - if (Arch == Triple::arm || Arch == Triple::armeb) - Obj->setARMSubArch(TheTriple); - } + auto Arch = Obj->getArch(); + if (Arch == Triple::arm || Arch == Triple::armeb) + Obj->setARMSubArch(TheTriple); } // Get the target specific parser. 
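
The once-per-file warning added to SourcePrinter relies on StringSet::insert reporting whether the key was newly added. A standalone sketch of that idiom follows; the names and message text are illustrative.

  #include "llvm/ADT/StringSet.h"
  #include "llvm/Support/raw_ostream.h"

  static llvm::StringSet<> ReportedMissing; // stand-in for MissingSources

  static void warnMissingSourceOnce(llvm::StringRef File) {
    // insert(...).second is true only the first time a given path is seen.
    if (ReportedMissing.insert(File).second)
      llvm::errs() << "warning: failed to find source " << File << "\n";
  }
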
std::string Error; const Target *TheTarget = TargetRegistry::lookupTarget(ArchName, TheTriple, Error); - if (!TheTarget) { - if (Obj) - report_error(Obj->getFileName(), "can't find target: " + Error); - else - error("can't find target: " + Error); - } + if (!TheTarget) + reportError(Obj->getFileName(), "can't find target: " + Error); // Update the triple name and return the found target. TripleName = TheTriple.getTriple(); @@ -548,17 +542,22 @@ protected: DILineInfo OldLineInfo; const ObjectFile *Obj = nullptr; std::unique_ptr Symbolizer; - // File name to file contents of source + // File name to file contents of source. std::unordered_map> SourceCache; - // Mark the line endings of the cached source + // Mark the line endings of the cached source. std::unordered_map> LineCache; + // Keep track of missing sources. + StringSet<> MissingSources; + // Only emit 'no debug info' warning once. + bool WarnedNoDebugInfo; private: bool cacheSource(const DILineInfo& LineInfoFile); public: SourcePrinter() = default; - SourcePrinter(const ObjectFile *Obj, StringRef DefaultArch) : Obj(Obj) { + SourcePrinter(const ObjectFile *Obj, StringRef DefaultArch) + : Obj(Obj), WarnedNoDebugInfo(false) { symbolize::LLVMSymbolizer::Options SymbolizerOpts; SymbolizerOpts.PrintFunctions = DILineInfoSpecifier::FunctionNameKind::None; SymbolizerOpts.Demangle = false; @@ -568,6 +567,7 @@ public: virtual ~SourcePrinter() = default; virtual void printSourceLine(raw_ostream &OS, object::SectionedAddress Address, + StringRef ObjectFilename, StringRef Delimiter = "; "); }; @@ -577,8 +577,12 @@ bool SourcePrinter::cacheSource(const DILineInfo &LineInfo) { Buffer = MemoryBuffer::getMemBuffer(*LineInfo.Source); } else { auto BufferOrError = MemoryBuffer::getFile(LineInfo.FileName); - if (!BufferOrError) + if (!BufferOrError) { + if (MissingSources.insert(LineInfo.FileName).second) + reportWarning("failed to find source " + LineInfo.FileName, + Obj->getFileName()); return false; + } Buffer = std::move(*BufferOrError); } // Chomp the file to get lines @@ -599,20 +603,33 @@ bool SourcePrinter::cacheSource(const DILineInfo &LineInfo) { void SourcePrinter::printSourceLine(raw_ostream &OS, object::SectionedAddress Address, + StringRef ObjectFilename, StringRef Delimiter) { if (!Symbolizer) return; DILineInfo LineInfo = DILineInfo(); auto ExpectedLineInfo = Symbolizer->symbolizeCode(*Obj, Address); + std::string ErrorMessage; if (!ExpectedLineInfo) - consumeError(ExpectedLineInfo.takeError()); + ErrorMessage = toString(ExpectedLineInfo.takeError()); else LineInfo = *ExpectedLineInfo; - if ((LineInfo.FileName == "") || LineInfo.Line == 0 || - ((OldLineInfo.Line == LineInfo.Line) && - (OldLineInfo.FileName == LineInfo.FileName))) + if (LineInfo.FileName == DILineInfo::BadString) { + if (!WarnedNoDebugInfo) { + std::string Warning = + "failed to parse debug information for " + ObjectFilename.str(); + if (!ErrorMessage.empty()) + Warning += ": " + ErrorMessage; + reportWarning(Warning, ObjectFilename); + WarnedNoDebugInfo = true; + } + return; + } + + if (LineInfo.Line == 0 || ((OldLineInfo.Line == LineInfo.Line) && + (OldLineInfo.FileName == LineInfo.FileName))) return; if (PrintLines) @@ -623,8 +640,14 @@ void SourcePrinter::printSourceLine(raw_ostream &OS, return; auto LineBuffer = LineCache.find(LineInfo.FileName); if (LineBuffer != LineCache.end()) { - if (LineInfo.Line > LineBuffer->second.size()) + if (LineInfo.Line > LineBuffer->second.size()) { + reportWarning( + formatv( + "debug info line number {0} exceeds the number of lines 
in {1}", + LineInfo.Line, LineInfo.FileName), + ObjectFilename); return; + } // Vector begins at 0, line numbers are non-zero OS << Delimiter << LineBuffer->second[LineInfo.Line - 1] << '\n'; } @@ -646,13 +669,14 @@ static bool hasMappingSymbols(const ObjectFile *Obj) { return isArmElf(Obj) || isAArch64Elf(Obj); } -static void printRelocation(const RelocationRef &Rel, uint64_t Address, - bool Is64Bits) { +static void printRelocation(StringRef FileName, const RelocationRef &Rel, + uint64_t Address, bool Is64Bits) { StringRef Fmt = Is64Bits ? "\t\t%016" PRIx64 ": " : "\t\t\t%08" PRIx64 ": "; SmallString<16> Name; SmallString<32> Val; Rel.getTypeName(Name); - error(getRelocationValueString(Rel, Val)); + if (Error E = getRelocationValueString(Rel, Val)) + reportError(std::move(E), FileName); outs() << format(Fmt.data(), Address) << Name << "\t" << Val << "\n"; } @@ -663,29 +687,25 @@ public: ArrayRef Bytes, object::SectionedAddress Address, raw_ostream &OS, StringRef Annot, MCSubtargetInfo const &STI, - SourcePrinter *SP, + SourcePrinter *SP, StringRef ObjectFilename, std::vector *Rels = nullptr) { if (SP && (PrintSource || PrintLines)) - SP->printSourceLine(OS, Address); + SP->printSourceLine(OS, Address, ObjectFilename); - { - formatted_raw_ostream FOS(OS); - if (!NoLeadingAddr) - FOS << format("%8" PRIx64 ":", Address.Address); - if (!NoShowRawInsn) { - FOS << ' '; - dumpBytes(Bytes, FOS); - } - FOS.flush(); - // The output of printInst starts with a tab. Print some spaces so that - // the tab has 1 column and advances to the target tab stop. - unsigned TabStop = NoShowRawInsn ? 16 : 40; - unsigned Column = FOS.getColumn(); - FOS.indent(Column < TabStop - 1 ? TabStop - 1 - Column : 7 - Column % 8); - - // The dtor calls flush() to ensure the indent comes before printInst(). + size_t Start = OS.tell(); + if (!NoLeadingAddr) + OS << format("%8" PRIx64 ":", Address.Address); + if (!NoShowRawInsn) { + OS << ' '; + dumpBytes(Bytes, OS); } + // The output of printInst starts with a tab. Print some spaces so that + // the tab has 1 column and advances to the target tab stop. + unsigned TabStop = NoShowRawInsn ? 16 : 40; + unsigned Column = OS.tell() - Start; + OS.indent(Column < TabStop - 1 ? 
TabStop - 1 - Column : 7 - Column % 8); + if (MI) IP.printInst(MI, OS, "", STI); else @@ -711,9 +731,10 @@ public: void printInst(MCInstPrinter &IP, const MCInst *MI, ArrayRef Bytes, object::SectionedAddress Address, raw_ostream &OS, StringRef Annot, MCSubtargetInfo const &STI, SourcePrinter *SP, + StringRef ObjectFilename, std::vector *Rels) override { if (SP && (PrintSource || PrintLines)) - SP->printSourceLine(OS, Address, ""); + SP->printSourceLine(OS, Address, ObjectFilename, ""); if (!MI) { printLead(Bytes, Address.Address, OS); OS << " "; @@ -739,7 +760,7 @@ public: auto PrintReloc = [&]() -> void { while ((RelCur != RelEnd) && (RelCur->getOffset() <= Address.Address)) { if (RelCur->getOffset() == Address.Address) { - printRelocation(*RelCur, Address.Address, false); + printRelocation(ObjectFilename, *RelCur, Address.Address, false); return; } ++RelCur; @@ -750,7 +771,7 @@ public: OS << Separator; Separator = "\n"; if (SP && (PrintSource || PrintLines)) - SP->printSourceLine(OS, Address, ""); + SP->printSourceLine(OS, Address, ObjectFilename, ""); printLead(Bytes, Address.Address, OS); OS << Preamble; Preamble = " "; @@ -780,9 +801,10 @@ public: void printInst(MCInstPrinter &IP, const MCInst *MI, ArrayRef Bytes, object::SectionedAddress Address, raw_ostream &OS, StringRef Annot, MCSubtargetInfo const &STI, SourcePrinter *SP, + StringRef ObjectFilename, std::vector *Rels) override { if (SP && (PrintSource || PrintLines)) - SP->printSourceLine(OS, Address); + SP->printSourceLine(OS, Address, ObjectFilename); if (MI) { SmallString<40> InstStr; @@ -831,9 +853,10 @@ public: void printInst(MCInstPrinter &IP, const MCInst *MI, ArrayRef Bytes, object::SectionedAddress Address, raw_ostream &OS, StringRef Annot, MCSubtargetInfo const &STI, SourcePrinter *SP, + StringRef ObjectFilename, std::vector *Rels) override { if (SP && (PrintSource || PrintLines)) - SP->printSourceLine(OS, Address); + SP->printSourceLine(OS, Address, ObjectFilename); if (!NoLeadingAddr) OS << format("%8" PRId64 ":", Address.Address / 8); if (!NoShowRawInsn) { @@ -924,10 +947,12 @@ static void addPltEntries(const ObjectFile *Obj, StringSaver &Saver) { Optional Plt = None; for (const SectionRef &Section : Obj->sections()) { - StringRef Name; - if (Section.getName(Name)) + Expected SecNameOrErr = Section.getName(); + if (!SecNameOrErr) { + consumeError(SecNameOrErr.takeError()); continue; - if (Name == ".plt") + } + if (*SecNameOrErr == ".plt") Plt = Section; } if (!Plt) @@ -968,9 +993,18 @@ static size_t countSkippableZeroBytes(ArrayRef Buf) { static std::map> getRelocsMap(object::ObjectFile const &Obj) { std::map> Ret; + uint64_t I = (uint64_t)-1; for (SectionRef Sec : Obj.sections()) { - section_iterator Relocated = Sec.getRelocatedSection(); - if (Relocated == Obj.section_end() || !shouldKeep(*Relocated)) + ++I; + Expected RelocatedOrErr = Sec.getRelocatedSection(); + if (!RelocatedOrErr) + reportError(Obj.getFileName(), + "section (" + Twine(I) + + "): failed to get a relocated section: " + + toString(RelocatedOrErr.takeError())); + + section_iterator Relocated = *RelocatedOrErr; + if (Relocated == Obj.section_end() || !checkSectionFilter(*Relocated).Keep) continue; std::vector &V = Ret[*Relocated]; for (const RelocationRef &R : Sec.relocations()) @@ -1137,11 +1171,14 @@ static void disassembleObject(const Target *TheTarget, const ObjectFile *Obj, if (const auto *COFFObj = dyn_cast(Obj)) { for (const auto &ExportEntry : COFFObj->export_directories()) { StringRef Name; - error(ExportEntry.getSymbolName(Name)); + if 
(std::error_code EC = ExportEntry.getSymbolName(Name)) + reportError(errorCodeToError(EC), Obj->getFileName()); if (Name.empty()) continue; + uint32_t RVA; - error(ExportEntry.getExportRVA(RVA)); + if (std::error_code EC = ExportEntry.getExportRVA(RVA)) + reportError(errorCodeToError(EC), Obj->getFileName()); uint64_t VA = COFFObj->getImageBase() + RVA; auto Sec = partition_point( @@ -1210,9 +1247,8 @@ static void disassembleObject(const Target *TheTarget, const ObjectFile *Obj, DataRefImpl DR = Section.getRawDataRefImpl(); SegmentName = MachO->getSectionFinalSegmentName(DR); } - StringRef SectionName; - error(Section.getName(SectionName)); + StringRef SectionName = unwrapOrError(Section.getName(), Obj->getFileName()); // If the section has no symbol at the start, just insert a dummy one. if (Symbols.empty() || std::get<0>(Symbols[0]) != 0) { Symbols.insert( @@ -1381,10 +1417,10 @@ static void disassembleObject(const Target *TheTarget, const ObjectFile *Obj, if (Size == 0) Size = 1; - PIP.printInst( - *IP, Disassembled ? &Inst : nullptr, Bytes.slice(Index, Size), - {SectionAddr + Index + VMAAdjustment, Section.getIndex()}, outs(), - "", *STI, &SP, &Rels); + PIP.printInst(*IP, Disassembled ? &Inst : nullptr, + Bytes.slice(Index, Size), + {SectionAddr + Index + VMAAdjustment, Section.getIndex()}, + outs(), "", *STI, &SP, Obj->getFileName(), &Rels); outs() << CommentStream.str(); Comments.clear(); @@ -1470,7 +1506,8 @@ static void disassembleObject(const Target *TheTarget, const ObjectFile *Obj, Offset += AdjustVMA; } - printRelocation(*RelCur, SectionAddr + Offset, Is64Bits); + printRelocation(Obj->getFileName(), *RelCur, SectionAddr + Offset, + Is64Bits); ++RelCur; } } @@ -1482,7 +1519,8 @@ static void disassembleObject(const Target *TheTarget, const ObjectFile *Obj, StringSet<> MissingDisasmFuncsSet = set_difference(DisasmFuncsSet, FoundDisasmFuncsSet); for (StringRef MissingDisasmFunc : MissingDisasmFuncsSet.keys()) - warn("failed to disassemble missing function " + MissingDisasmFunc); + reportWarning("failed to disassemble missing function " + MissingDisasmFunc, + FileName); } static void disassembleObject(const ObjectFile *Obj, bool InlineRelocs) { @@ -1497,24 +1535,24 @@ static void disassembleObject(const ObjectFile *Obj, bool InlineRelocs) { std::unique_ptr MRI( TheTarget->createMCRegInfo(TripleName)); if (!MRI) - report_error(Obj->getFileName(), - "no register info for target " + TripleName); + reportError(Obj->getFileName(), + "no register info for target " + TripleName); // Set up disassembler. 
std::unique_ptr AsmInfo( TheTarget->createMCAsmInfo(*MRI, TripleName)); if (!AsmInfo) - report_error(Obj->getFileName(), - "no assembly info for target " + TripleName); + reportError(Obj->getFileName(), + "no assembly info for target " + TripleName); std::unique_ptr STI( TheTarget->createMCSubtargetInfo(TripleName, MCPU, Features.getString())); if (!STI) - report_error(Obj->getFileName(), - "no subtarget info for target " + TripleName); + reportError(Obj->getFileName(), + "no subtarget info for target " + TripleName); std::unique_ptr MII(TheTarget->createMCInstrInfo()); if (!MII) - report_error(Obj->getFileName(), - "no instruction info for target " + TripleName); + reportError(Obj->getFileName(), + "no instruction info for target " + TripleName); MCObjectFileInfo MOFI; MCContext Ctx(AsmInfo.get(), MRI.get(), &MOFI); // FIXME: for now initialize MCObjectFileInfo with default values @@ -1523,8 +1561,7 @@ static void disassembleObject(const ObjectFile *Obj, bool InlineRelocs) { std::unique_ptr DisAsm( TheTarget->createMCDisassembler(*STI, Ctx)); if (!DisAsm) - report_error(Obj->getFileName(), - "no disassembler for target " + TripleName); + reportError(Obj->getFileName(), "no disassembler for target " + TripleName); // If we have an ARM object file, we need a second disassembler, because // ARM CPUs have two different instruction sets: ARM mode, and Thumb mode. @@ -1549,8 +1586,8 @@ static void disassembleObject(const ObjectFile *Obj, bool InlineRelocs) { std::unique_ptr IP(TheTarget->createMCInstPrinter( Triple(TripleName), AsmPrinterVariant, *AsmInfo, *MII, *MRI)); if (!IP) - report_error(Obj->getFileName(), - "no instruction printer for target " + TripleName); + reportError(Obj->getFileName(), + "no instruction printer for target " + TripleName); IP->setPrintImmHex(PrintImmHex); PrettyPrinter &PIP = selectPrettyPrinter(Triple(TripleName)); @@ -1558,7 +1595,8 @@ static void disassembleObject(const ObjectFile *Obj, bool InlineRelocs) { for (StringRef Opt : DisassemblerOptions) if (!IP->applyTargetSpecificCLOption(Opt)) - error("Unrecognized disassembler option: " + Opt); + reportError(Obj->getFileName(), + "Unrecognized disassembler option: " + Opt); disassembleObject(TheTarget, Obj, Ctx, DisAsm.get(), SecondaryDisAsm.get(), MIA.get(), IP.get(), STI.get(), SecondarySTI.get(), PIP, @@ -1577,16 +1615,21 @@ void printRelocations(const ObjectFile *Obj) { // sections. Usually, there is an only one relocation section for // each relocated section. 
MapVector> SecToRelSec; - for (const SectionRef &Section : ToolSectionFilter(*Obj)) { + uint64_t Ndx; + for (const SectionRef &Section : ToolSectionFilter(*Obj, &Ndx)) { if (Section.relocation_begin() == Section.relocation_end()) continue; - const SectionRef TargetSec = *Section.getRelocatedSection(); - SecToRelSec[TargetSec].push_back(Section); + Expected SecOrErr = Section.getRelocatedSection(); + if (!SecOrErr) + reportError(Obj->getFileName(), + "section (" + Twine(Ndx) + + "): unable to get a relocation target: " + + toString(SecOrErr.takeError())); + SecToRelSec[**SecOrErr].push_back(Section); } for (std::pair> &P : SecToRelSec) { - StringRef SecName; - error(P.first.getName(SecName)); + StringRef SecName = unwrapOrError(P.first.getName(), Obj->getFileName()); outs() << "RELOCATION RECORDS FOR [" << SecName << "]:\n"; for (SectionRef Section : P.second) { @@ -1597,7 +1640,9 @@ void printRelocations(const ObjectFile *Obj) { if (Address < StartAddress || Address > StopAddress || getHidden(Reloc)) continue; Reloc.getTypeName(RelocName); - error(getRelocationValueString(Reloc, ValueStr)); + if (Error E = getRelocationValueString(Reloc, ValueStr)) + reportError(std::move(E), Obj->getFileName()); + outs() << format(Fmt.data(), Address) << " " << RelocName << " " << ValueStr << "\n"; } @@ -1613,7 +1658,7 @@ void printDynamicRelocations(const ObjectFile *Obj) { const auto *Elf = dyn_cast(Obj); if (!Elf || Elf->getEType() != ELF::ET_DYN) { - error("not a dynamic object"); + reportError(Obj->getFileName(), "not a dynamic object"); return; } @@ -1629,7 +1674,8 @@ void printDynamicRelocations(const ObjectFile *Obj) { SmallString<32> RelocName; SmallString<32> ValueStr; Reloc.getTypeName(RelocName); - error(getRelocationValueString(Reloc, ValueStr)); + if (Error E = getRelocationValueString(Reloc, ValueStr)) + reportError(std::move(E), Obj->getFileName()); outs() << format(Fmt.data(), Address) << " " << RelocName << " " << ValueStr << "\n"; } @@ -1647,47 +1693,64 @@ static bool shouldDisplayLMA(const ObjectFile *Obj) { return ShowLMA; } +static size_t getMaxSectionNameWidth(const ObjectFile *Obj) { + // Default column width for names is 13 even if no names are that long. 
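
The reworked section-header printing pads columns with left_justify and prints addresses with format_hex_no_prefix. A small standalone sketch of those two formatters, with illustrative widths and values:

  #include "llvm/Support/Format.h"
  #include "llvm/Support/raw_ostream.h"

  int main() {
    using namespace llvm;
    unsigned NameWidth = 13; // at least 13, or the longest section name
    unsigned AddrWidth = 16; // 2 * bytes-in-address, e.g. 16 for ELF64
    outs() << "Idx " << left_justify("Name", NameWidth) << " Size "
           << left_justify("VMA", AddrWidth) << " Type\n";
    outs() << format("%3u %-*s %08x ", 0u, (int)NameWidth, ".text", 0x42u)
           << format_hex_no_prefix(0x400000, AddrWidth) << " TEXT\n";
  }
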
+ size_t MaxWidth = 13; + for (const SectionRef &Section : ToolSectionFilter(*Obj)) { + StringRef Name = unwrapOrError(Section.getName(), Obj->getFileName()); + MaxWidth = std::max(MaxWidth, Name.size()); + } + return MaxWidth; +} + void printSectionHeaders(const ObjectFile *Obj) { + size_t NameWidth = getMaxSectionNameWidth(Obj); + size_t AddressWidth = 2 * Obj->getBytesInAddress(); bool HasLMAColumn = shouldDisplayLMA(Obj); if (HasLMAColumn) outs() << "Sections:\n" - "Idx Name Size VMA LMA " - "Type\n"; + "Idx " + << left_justify("Name", NameWidth) << " Size " + << left_justify("VMA", AddressWidth) << " " + << left_justify("LMA", AddressWidth) << " Type\n"; else outs() << "Sections:\n" - "Idx Name Size VMA Type\n"; + "Idx " + << left_justify("Name", NameWidth) << " Size " + << left_justify("VMA", AddressWidth) << " Type\n"; - for (const SectionRef &Section : ToolSectionFilter(*Obj)) { - StringRef Name; - error(Section.getName(Name)); + uint64_t Idx; + for (const SectionRef &Section : ToolSectionFilter(*Obj, &Idx)) { + StringRef Name = unwrapOrError(Section.getName(), Obj->getFileName()); uint64_t VMA = Section.getAddress(); if (shouldAdjustVA(Section)) VMA += AdjustVMA; uint64_t Size = Section.getSize(); - bool Text = Section.isText(); - bool Data = Section.isData(); - bool BSS = Section.isBSS(); - std::string Type = (std::string(Text ? "TEXT " : "") + - (Data ? "DATA " : "") + (BSS ? "BSS" : "")); + + std::string Type = Section.isText() ? "TEXT" : ""; + if (Section.isData()) + Type += Type.empty() ? "DATA" : " DATA"; + if (Section.isBSS()) + Type += Type.empty() ? "BSS" : " BSS"; if (HasLMAColumn) - outs() << format("%3d %-13s %08" PRIx64 " %016" PRIx64 " %016" PRIx64 - " %s\n", - (unsigned)Section.getIndex(), Name.str().c_str(), Size, - VMA, getELFSectionLMA(Section), Type.c_str()); + outs() << format("%3" PRIu64 " %-*s %08" PRIx64 " ", Idx, NameWidth, + Name.str().c_str(), Size) + << format_hex_no_prefix(VMA, AddressWidth) << " " + << format_hex_no_prefix(getELFSectionLMA(Section), AddressWidth) + << " " << Type << "\n"; else - outs() << format("%3d %-13s %08" PRIx64 " %016" PRIx64 " %s\n", - (unsigned)Section.getIndex(), Name.str().c_str(), Size, - VMA, Type.c_str()); + outs() << format("%3" PRIu64 " %-*s %08" PRIx64 " ", Idx, NameWidth, + Name.str().c_str(), Size) + << format_hex_no_prefix(VMA, AddressWidth) << " " << Type << "\n"; } outs() << "\n"; } void printSectionContents(const ObjectFile *Obj) { for (const SectionRef &Section : ToolSectionFilter(*Obj)) { - StringRef Name; - error(Section.getName(Name)); + StringRef Name = unwrapOrError(Section.getName(), Obj->getFileName()); uint64_t BaseAddr = Section.getAddress(); uint64_t Size = Section.getSize(); if (!Size) @@ -1741,21 +1804,26 @@ void printSymbolTable(const ObjectFile *O, StringRef ArchiveName, const StringRef FileName = O->getFileName(); for (auto I = O->symbol_begin(), E = O->symbol_end(); I != E; ++I) { const SymbolRef &Symbol = *I; - uint64_t Address = unwrapOrError(Symbol.getAddress(), ArchiveName, FileName, + uint64_t Address = unwrapOrError(Symbol.getAddress(), FileName, ArchiveName, ArchitectureName); if ((Address < StartAddress) || (Address > StopAddress)) continue; - SymbolRef::Type Type = unwrapOrError(Symbol.getType(), ArchiveName, - FileName, ArchitectureName); + SymbolRef::Type Type = unwrapOrError(Symbol.getType(), FileName, + ArchiveName, ArchitectureName); uint32_t Flags = Symbol.getFlags(); - section_iterator Section = unwrapOrError(Symbol.getSection(), ArchiveName, - FileName, ArchitectureName); + 
section_iterator Section = unwrapOrError(Symbol.getSection(), FileName, + ArchiveName, ArchitectureName); StringRef Name; - if (Type == SymbolRef::ST_Debug && Section != O->section_end()) - Section->getName(Name); - else - Name = unwrapOrError(Symbol.getName(), ArchiveName, FileName, + if (Type == SymbolRef::ST_Debug && Section != O->section_end()) { + if (Expected NameOrErr = Section->getName()) + Name = *NameOrErr; + else + consumeError(NameOrErr.takeError()); + + } else { + Name = unwrapOrError(Symbol.getName(), FileName, ArchiveName, ArchitectureName); + } bool Global = Flags & SymbolRef::SF_Global; bool Weak = Flags & SymbolRef::SF_Weak; @@ -1801,8 +1869,8 @@ void printSymbolTable(const ObjectFile *O, StringRef ArchiveName, StringRef SegmentName = MachO->getSectionFinalSegmentName(DR); outs() << SegmentName << ","; } - StringRef SectionName; - error(Section->getName(SectionName)); + StringRef SectionName = + unwrapOrError(Section->getName(), O->getFileName()); outs() << SectionName; } @@ -1875,7 +1943,11 @@ void printRawClangAST(const ObjectFile *Obj) { Optional ClangASTSection; for (auto Sec : ToolSectionFilter(*Obj)) { StringRef Name; - Sec.getName(Name); + if (Expected NameOrErr = Sec.getName()) + Name = *NameOrErr; + else + consumeError(NameOrErr.takeError()); + if (Name == ClangASTSectionName) { ClangASTSection = Sec; break; @@ -1907,7 +1979,11 @@ static void printFaultMaps(const ObjectFile *Obj) { for (auto Sec : ToolSectionFilter(*Obj)) { StringRef Name; - Sec.getName(Name); + if (Expected NameOrErr = Sec.getName()) + Name = *NameOrErr; + else + consumeError(NameOrErr.takeError()); + if (Name == FaultMapSectionName) { FaultMapSection = Sec; break; @@ -1946,12 +2022,12 @@ static void printPrivateFileHeaders(const ObjectFile *O, bool OnlyFirst) { printMachOLoadCommands(O); return; } - report_error(O->getFileName(), "Invalid/Unsupported object file format"); + reportError(O->getFileName(), "Invalid/Unsupported object file format"); } static void printFileHeaders(const ObjectFile *O) { if (!O->isELF() && !O->isCOFF()) - report_error(O->getFileName(), "Invalid/Unsupported object file format"); + reportError(O->getFileName(), "Invalid/Unsupported object file format"); Triple::ArchType AT = O->getArch(); outs() << "architecture: " << Triple::getArchTypeName(AT) << "\n"; @@ -2010,6 +2086,43 @@ static void printArchiveChild(StringRef Filename, const Archive::Child &C) { outs() << Name << "\n"; } +// For ELF only now. 
+static bool shouldWarnForInvalidStartStopAddress(ObjectFile *Obj) { + if (const auto *Elf = dyn_cast(Obj)) { + if (Elf->getEType() != ELF::ET_REL) + return true; + } + return false; +} + +static void checkForInvalidStartStopAddress(ObjectFile *Obj, + uint64_t Start, uint64_t Stop) { + if (!shouldWarnForInvalidStartStopAddress(Obj)) + return; + + for (const SectionRef &Section : Obj->sections()) + if (ELFSectionRef(Section).getFlags() & ELF::SHF_ALLOC) { + uint64_t BaseAddr = Section.getAddress(); + uint64_t Size = Section.getSize(); + if ((Start < BaseAddr + Size) && Stop > BaseAddr) + return; + } + + if (StartAddress.getNumOccurrences() == 0) + reportWarning("no section has address less than 0x" + + Twine::utohexstr(Stop) + " specified by --stop-address", + Obj->getFileName()); + else if (StopAddress.getNumOccurrences() == 0) + reportWarning("no section has address greater than or equal to 0x" + + Twine::utohexstr(Start) + " specified by --start-address", + Obj->getFileName()); + else + reportWarning("no section overlaps the range [0x" + + Twine::utohexstr(Start) + ",0x" + Twine::utohexstr(Stop) + + ") specified by --start-address/--stop-address", + Obj->getFileName()); +} + static void dumpObject(ObjectFile *O, const Archive *A = nullptr, const Archive::Child *C = nullptr) { // Avoid other output when using a raw option. @@ -2022,27 +2135,40 @@ static void dumpObject(ObjectFile *O, const Archive *A = nullptr, outs() << ":\tfile format " << O->getFileFormatName() << "\n\n"; } + if (StartAddress.getNumOccurrences() || StopAddress.getNumOccurrences()) + checkForInvalidStartStopAddress(O, StartAddress, StopAddress); + + // Note: the order here matches GNU objdump for compatability. StringRef ArchiveName = A ? A->getFileName() : ""; - if (FileHeaders) - printFileHeaders(O); if (ArchiveHeaders && !MachOOpt && C) printArchiveChild(ArchiveName, *C); - if (Disassemble) - disassembleObject(O, Relocations); + if (FileHeaders) + printFileHeaders(O); + if (PrivateHeaders || FirstPrivateHeader) + printPrivateFileHeaders(O, FirstPrivateHeader); + if (SectionHeaders) + printSectionHeaders(O); + if (SymbolTable) + printSymbolTable(O, ArchiveName); + if (DwarfDumpType != DIDT_Null) { + std::unique_ptr DICtx = DWARFContext::create(*O); + // Dump the complete DWARF structure. + DIDumpOptions DumpOpts; + DumpOpts.DumpType = DwarfDumpType; + DICtx->dump(outs(), DumpOpts); + } if (Relocations && !Disassemble) printRelocations(O); if (DynamicRelocations) printDynamicRelocations(O); - if (SectionHeaders) - printSectionHeaders(O); if (SectionContents) printSectionContents(O); - if (SymbolTable) - printSymbolTable(O, ArchiveName); + if (Disassemble) + disassembleObject(O, Relocations); if (UnwindInfo) printUnwindInfo(O); - if (PrivateHeaders || FirstPrivateHeader) - printPrivateFileHeaders(O, FirstPrivateHeader); + + // Mach-O specific options: if (ExportsTrie) printExportsTrie(O); if (Rebase) @@ -2053,17 +2179,12 @@ static void dumpObject(ObjectFile *O, const Archive *A = nullptr, printLazyBindTable(O); if (WeakBind) printWeakBindTable(O); + + // Other special sections: if (RawClangAST) printRawClangAST(O); if (FaultMapSection) printFaultMaps(O); - if (DwarfDumpType != DIDT_Null) { - std::unique_ptr DICtx = DWARFContext::create(*O); - // Dump the complete DWARF structure. 
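
The new start/stop validation treats the requested range as half-open and accepts a section when Start < BaseAddr + Size and Stop > BaseAddr. A tiny sketch of that test with illustrative numbers:

  #include <cstdint>

  // Same half-open overlap test as checkForInvalidStartStopAddress.
  static bool overlapsRange(uint64_t BaseAddr, uint64_t Size, uint64_t Start,
                            uint64_t Stop) {
    return (Start < BaseAddr + Size) && (Stop > BaseAddr);
  }
  // overlapsRange(0x1000, 0x200, 0x11f0, 0x1300) -> true
  // overlapsRange(0x1000, 0x200, 0x1200, 0x1300) -> false: a range beginning
  // exactly at the section end does not count as overlapping.
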
- DIDumpOptions DumpOpts; - DumpOpts.DumpType = DwarfDumpType; - DICtx->dump(outs(), DumpOpts); - } } static void dumpObject(const COFFImportFile *I, const Archive *A, @@ -2086,11 +2207,13 @@ static void dumpObject(const COFFImportFile *I, const Archive *A, /// Dump each object file in \a a; static void dumpArchive(const Archive *A) { Error Err = Error::success(); + unsigned I = -1; for (auto &C : A->children(Err)) { + ++I; Expected> ChildOrErr = C.getAsBinary(); if (!ChildOrErr) { if (auto E = isNotObjectErrorInvalidFileType(ChildOrErr.takeError())) - report_error(std::move(E), A->getFileName(), C); + reportError(std::move(E), getFileNameForError(C, I), A->getFileName()); continue; } if (ObjectFile *O = dyn_cast(&*ChildOrErr.get())) @@ -2098,11 +2221,11 @@ static void dumpArchive(const Archive *A) { else if (COFFImportFile *I = dyn_cast(&*ChildOrErr.get())) dumpObject(I, A, &C); else - report_error(errorCodeToError(object_error::invalid_file_type), - A->getFileName()); + reportError(errorCodeToError(object_error::invalid_file_type), + A->getFileName()); } if (Err) - report_error(std::move(Err), A->getFileName()); + reportError(std::move(Err), A->getFileName()); } /// Open file and figure out how to dump it. @@ -2126,7 +2249,7 @@ static void dumpInput(StringRef file) { else if (MachOUniversalBinary *UB = dyn_cast(&Binary)) parseInputMachO(UB); else - report_error(errorCodeToError(object_error::invalid_file_type), file); + reportError(errorCodeToError(object_error::invalid_file_type), file); } } // namespace llvm @@ -2147,7 +2270,7 @@ int main(int argc, char **argv) { cl::ParseCommandLineOptions(argc, argv, "llvm object file dumper\n"); if (StartAddress >= StopAddress) - error("start address should be less than stop address"); + reportCmdLineError("start address should be less than stop address"); ToolName = argv[0]; diff --git a/tools/llvm-objdump/llvm-objdump.h b/tools/llvm-objdump/llvm-objdump.h index e58d4a05c2e6..43ce02ae0bc2 100644 --- a/tools/llvm-objdump/llvm-objdump.h +++ b/tools/llvm-objdump/llvm-objdump.h @@ -31,6 +31,8 @@ extern cl::opt Demangle; typedef std::function FilterPredicate; +/// A filtered iterator for SectionRefs that skips sections based on some given +/// predicate. class SectionFilterIterator { public: SectionFilterIterator(FilterPredicate P, @@ -60,6 +62,8 @@ private: llvm::object::section_iterator End; }; +/// Creates an iterator range of SectionFilterIterators for a given Object and +/// predicate. class SectionFilter { public: SectionFilter(FilterPredicate P, llvm::object::ObjectFile const &O) @@ -79,7 +83,15 @@ private: }; // Various helper functions. -SectionFilter ToolSectionFilter(llvm::object::ObjectFile const &O); + +/// Creates a SectionFilter with a standard predicate that conditionally skips +/// sections when the --section objdump flag is provided. +/// +/// Idx is an optional output parameter that keeps track of which section index +/// this is. This may be different than the actual section number, as some +/// sections may be filtered (e.g. symbol tables). 
+SectionFilter ToolSectionFilter(llvm::object::ObjectFile const &O, + uint64_t *Idx = nullptr); Error getELFRelocationValueString(const object::ELFObjectFileBase *Obj, const object::RelocationRef &Rel, @@ -96,8 +108,6 @@ Error getMachORelocationValueString(const object::MachOObjectFile *Obj, uint64_t getELFSectionLMA(const object::ELFSectionRef& Sec); -void error(std::error_code ec); -void error(Error E); bool isRelocAddressLess(object::RelocationRef A, object::RelocationRef B); void parseInputMachO(StringRef Filename); void parseInputMachO(object::MachOUniversalBinary *UB); @@ -129,24 +139,22 @@ void printSectionHeaders(const object::ObjectFile *O); void printSectionContents(const object::ObjectFile *O); void printSymbolTable(const object::ObjectFile *O, StringRef ArchiveName, StringRef ArchitectureName = StringRef()); -void warn(StringRef Message); -LLVM_ATTRIBUTE_NORETURN void error(Twine Message); -LLVM_ATTRIBUTE_NORETURN void report_error(StringRef File, Twine Message); -LLVM_ATTRIBUTE_NORETURN void report_error(Error E, StringRef File); -LLVM_ATTRIBUTE_NORETURN void -report_error(Error E, StringRef FileName, StringRef ArchiveName, - StringRef ArchitectureName = StringRef()); -LLVM_ATTRIBUTE_NORETURN void -report_error(Error E, StringRef ArchiveName, const object::Archive::Child &C, - StringRef ArchitectureName = StringRef()); +LLVM_ATTRIBUTE_NORETURN void reportError(StringRef File, Twine Message); +LLVM_ATTRIBUTE_NORETURN void reportError(Error E, StringRef FileName, + StringRef ArchiveName = "", + StringRef ArchitectureName = ""); +void reportWarning(Twine Message, StringRef File); template T unwrapOrError(Expected EO, Ts &&... Args) { if (EO) return std::move(*EO); - report_error(EO.takeError(), std::forward(Args)...); + reportError(EO.takeError(), std::forward(Args)...); } +std::string getFileNameForError(const object::Archive::Child &C, + unsigned Index); + } // end namespace llvm #endif diff --git a/tools/llvm-pdbutil/BytesOutputStyle.cpp b/tools/llvm-pdbutil/BytesOutputStyle.cpp index 162d12c120b4..ffc907e09f11 100644 --- a/tools/llvm-pdbutil/BytesOutputStyle.cpp +++ b/tools/llvm-pdbutil/BytesOutputStyle.cpp @@ -457,7 +457,7 @@ BytesOutputStyle::initializeTypes(uint32_t StreamIdx) { uint32_t Count = Tpi->getNumTypeRecords(); auto Offsets = Tpi->getTypeIndexOffsets(); TypeCollection = - llvm::make_unique(Types, Count, Offsets); + std::make_unique(Types, Count, Offsets); return *TypeCollection; } diff --git a/tools/llvm-pdbutil/DumpOutputStyle.cpp b/tools/llvm-pdbutil/DumpOutputStyle.cpp index 962d4cf88a8a..4d82e0fd9174 100644 --- a/tools/llvm-pdbutil/DumpOutputStyle.cpp +++ b/tools/llvm-pdbutil/DumpOutputStyle.cpp @@ -1369,9 +1369,10 @@ Error DumpOutputStyle::dumpTypesFromObjectFile() { LazyRandomTypeCollection Types(100); for (const auto &S : getObj().sections()) { - StringRef SectionName; - if (auto EC = S.getName(SectionName)) - return errorCodeToError(EC); + Expected NameOrErr = S.getName(); + if (!NameOrErr) + return NameOrErr.takeError(); + StringRef SectionName = *NameOrErr; // .debug$T is a standard CodeView type section, while .debug$P is the same // format but used for MSVC precompiled header object files. 
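
Most hunks in this patch move callers from the old error_code-based Section.getName(Name) to the Expected-returning overload. A minimal sketch of the recurring handling pattern; the helper name is hypothetical.

  #include "llvm/Object/ObjectFile.h"
  #include "llvm/Support/Error.h"

  // Take the name if available; otherwise consume the error (these callers are
  // on best-effort paths) and fall back to an empty name.
  static llvm::StringRef sectionNameOrEmpty(const llvm::object::SectionRef &Sec) {
    llvm::Expected<llvm::StringRef> NameOrErr = Sec.getName();
    if (!NameOrErr) {
      llvm::consumeError(NameOrErr.takeError());
      return "";
    }
    return *NameOrErr;
  }
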
@@ -1551,7 +1552,7 @@ Error DumpOutputStyle::dumpModuleSymsForObj() { Dumper.setSymbolGroup(&Strings); for (auto Symbol : Symbols) { if (auto EC = Visitor.visitSymbolRecord(Symbol)) { - SymbolError = llvm::make_unique(std::move(EC)); + SymbolError = std::make_unique(std::move(EC)); return; } } diff --git a/tools/llvm-pdbutil/ExplainOutputStyle.cpp b/tools/llvm-pdbutil/ExplainOutputStyle.cpp index 94faa0463981..3d2490509c03 100644 --- a/tools/llvm-pdbutil/ExplainOutputStyle.cpp +++ b/tools/llvm-pdbutil/ExplainOutputStyle.cpp @@ -64,7 +64,7 @@ Error ExplainOutputStyle::explainPdbFile() { Error ExplainOutputStyle::explainBinaryFile() { std::unique_ptr Stream = - llvm::make_unique(File.unknown().getBuffer(), + std::make_unique(File.unknown().getBuffer(), llvm::support::little); switch (opts::explain::InputType) { case opts::explain::InputFileType::DBIStream: { diff --git a/tools/llvm-pdbutil/InputFile.cpp b/tools/llvm-pdbutil/InputFile.cpp index bd23bfdbe31a..b316882de64d 100644 --- a/tools/llvm-pdbutil/InputFile.cpp +++ b/tools/llvm-pdbutil/InputFile.cpp @@ -66,12 +66,13 @@ getModuleDebugStream(PDBFile &File, StringRef &ModuleName, uint32_t Index) { static inline bool isCodeViewDebugSubsection(object::SectionRef Section, StringRef Name, BinaryStreamReader &Reader) { - StringRef SectionName; - if (Section.getName(SectionName)) - return false; - - if (SectionName != Name) + if (Expected NameOrErr = Section.getName()) { + if (*NameOrErr != Name) + return false; + } else { + consumeError(NameOrErr.takeError()); return false; + } Expected ContentsOrErr = Section.getContents(); if (!ContentsOrErr) { @@ -384,7 +385,7 @@ InputFile::getOrCreateTypeCollection(TypeCollectionKind Kind) { uint32_t Count = Stream.getNumTypeRecords(); auto Offsets = Stream.getTypeIndexOffsets(); Collection = - llvm::make_unique(Array, Count, Offsets); + std::make_unique(Array, Count, Offsets); return *Collection; } @@ -397,11 +398,11 @@ InputFile::getOrCreateTypeCollection(TypeCollectionKind Kind) { if (!isDebugTSection(Section, Records)) continue; - Types = llvm::make_unique(Records, 100); + Types = std::make_unique(Records, 100); return *Types; } - Types = llvm::make_unique(100); + Types = std::make_unique(100); return *Types; } diff --git a/tools/llvm-pdbutil/MinimalSymbolDumper.cpp b/tools/llvm-pdbutil/MinimalSymbolDumper.cpp index e5ae47050678..ebfa50625e76 100644 --- a/tools/llvm-pdbutil/MinimalSymbolDumper.cpp +++ b/tools/llvm-pdbutil/MinimalSymbolDumper.cpp @@ -569,8 +569,9 @@ Error MinimalSymbolDumper::visitKnownRecord( Error MinimalSymbolDumper::visitKnownRecord(CVSymbol &CVR, DefRangeFramePointerRelSym &Def) { AutoIndent Indent(P, 7); - P.formatLine("offset = {0}, range = {1}", Def.Offset, formatRange(Def.Range)); - P.formatLine("gaps = {2}", Def.Offset, + P.formatLine("offset = {0}, range = {1}", Def.Hdr.Offset, + formatRange(Def.Range)); + P.formatLine("gaps = {2}", Def.Hdr.Offset, formatGaps(P.getIndentLevel() + 9, Def.Gaps)); return Error::success(); } diff --git a/tools/llvm-pdbutil/PrettyTypeDumper.cpp b/tools/llvm-pdbutil/PrettyTypeDumper.cpp index e8f8e5aa62c9..2f7a39803ca5 100644 --- a/tools/llvm-pdbutil/PrettyTypeDumper.cpp +++ b/tools/llvm-pdbutil/PrettyTypeDumper.cpp @@ -117,7 +117,7 @@ filterAndSortClassDefs(LinePrinter &Printer, Enumerator &E, continue; } - auto Layout = llvm::make_unique(std::move(Class)); + auto Layout = std::make_unique(std::move(Class)); if (Layout->deepPaddingSize() < opts::pretty::PaddingThreshold) { ++Discarded; continue; @@ -259,7 +259,7 @@ void TypeDumper::start(const 
PDBSymbolExe &Exe) { continue; } - auto Layout = llvm::make_unique(std::move(Class)); + auto Layout = std::make_unique(std::move(Class)); if (Layout->deepPaddingSize() < opts::pretty::PaddingThreshold) continue; diff --git a/tools/llvm-pdbutil/llvm-pdbutil.cpp b/tools/llvm-pdbutil/llvm-pdbutil.cpp index 785a98086791..9307300861d4 100644 --- a/tools/llvm-pdbutil/llvm-pdbutil.cpp +++ b/tools/llvm-pdbutil/llvm-pdbutil.cpp @@ -863,8 +863,8 @@ static void pdb2Yaml(StringRef Path) { std::unique_ptr Session; auto &File = loadPDB(Path, Session); - auto O = llvm::make_unique(File); - O = llvm::make_unique(File); + auto O = std::make_unique(File); + O = std::make_unique(File); ExitOnErr(O->dump()); } @@ -872,7 +872,7 @@ static void pdb2Yaml(StringRef Path) { static void dumpRaw(StringRef Path) { InputFile IF = ExitOnErr(InputFile::open(Path)); - auto O = llvm::make_unique(IF); + auto O = std::make_unique(IF); ExitOnErr(O->dump()); } @@ -880,7 +880,7 @@ static void dumpBytes(StringRef Path) { std::unique_ptr Session; auto &File = loadPDB(Path, Session); - auto O = llvm::make_unique(File); + auto O = std::make_unique(File); ExitOnErr(O->dump()); } @@ -1347,7 +1347,7 @@ static void explain() { ExitOnErr(InputFile::open(opts::explain::InputFilename.front(), true)); for (uint64_t Off : opts::explain::Offsets) { - auto O = llvm::make_unique(IF, Off); + auto O = std::make_unique(IF, Off); ExitOnErr(O->dump()); } diff --git a/tools/llvm-profdata/llvm-profdata.cpp b/tools/llvm-profdata/llvm-profdata.cpp index 16d3ebe3fcbc..41e9abb82b1f 100644 --- a/tools/llvm-profdata/llvm-profdata.cpp +++ b/tools/llvm-profdata/llvm-profdata.cpp @@ -26,6 +26,7 @@ #include "llvm/Support/InitLLVM.h" #include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/Path.h" +#include "llvm/Support/Threading.h" #include "llvm/Support/ThreadPool.h" #include "llvm/Support/WithColor.h" #include "llvm/Support/raw_ostream.h" @@ -37,6 +38,7 @@ enum ProfileFormat { PF_None = 0, PF_Text, PF_Compact_Binary, + PF_Ext_Binary, PF_GCC, PF_Binary }; @@ -84,6 +86,15 @@ static void exitWithErrorCode(std::error_code EC, StringRef Whence = "") { namespace { enum ProfileKinds { instr, sample }; +enum FailureMode { failIfAnyAreInvalid, failIfAllAreInvalid }; +} + +static void warnOrExitGivenError(FailureMode FailMode, std::error_code EC, + StringRef Whence = "") { + if (FailMode == failIfAnyAreInvalid) + exitWithErrorCode(EC, Whence); + else + warn(EC.message(), Whence); } static void handleMergeWriterError(Error E, StringRef WhenceFile = "", @@ -136,7 +147,7 @@ public: if (!BufOrError) exitWithErrorCode(BufOrError.getError(), InputFile); - auto Remapper = llvm::make_unique(); + auto Remapper = std::make_unique(); Remapper->File = std::move(BufOrError.get()); for (line_iterator LineIt(*Remapper->File, /*SkipBlanks=*/true, '#'); @@ -173,33 +184,16 @@ typedef SmallVector WeightedFileVector; struct WriterContext { std::mutex Lock; InstrProfWriter Writer; - Error Err; - std::string ErrWhence; + std::vector> Errors; std::mutex &ErrLock; SmallSet &WriterErrorCodes; WriterContext(bool IsSparse, std::mutex &ErrLock, SmallSet &WriterErrorCodes) - : Lock(), Writer(IsSparse), Err(Error::success()), ErrWhence(""), - ErrLock(ErrLock), WriterErrorCodes(WriterErrorCodes) {} + : Lock(), Writer(IsSparse), Errors(), ErrLock(ErrLock), + WriterErrorCodes(WriterErrorCodes) {} }; -/// Determine whether an error is fatal for profile merging. 
-static bool isFatalError(instrprof_error IPE) { - switch (IPE) { - default: - return true; - case instrprof_error::success: - case instrprof_error::eof: - case instrprof_error::unknown_function: - case instrprof_error::hash_mismatch: - case instrprof_error::count_mismatch: - case instrprof_error::counter_overflow: - case instrprof_error::value_site_count_mismatch: - return false; - } -} - /// Computer the overlap b/w profile BaseFilename and TestFileName, /// and store the program level result to Overlap. static void overlapInput(const std::string &BaseFilename, @@ -212,7 +206,7 @@ static void overlapInput(const std::string &BaseFilename, // Skip the empty profiles by returning sliently. instrprof_error IPE = InstrProfError::take(std::move(E)); if (IPE != instrprof_error::empty_raw_profile) - WC->Err = make_error(IPE); + WC->Errors.emplace_back(make_error(IPE), TestFilename); return; } @@ -231,21 +225,17 @@ static void loadInput(const WeightedFile &Input, SymbolRemapper *Remapper, WriterContext *WC) { std::unique_lock CtxGuard{WC->Lock}; - // If there's a pending hard error, don't do more work. - if (WC->Err) - return; - // Copy the filename, because llvm::ThreadPool copied the input "const // WeightedFile &" by value, making a reference to the filename within it // invalid outside of this packaged task. - WC->ErrWhence = Input.Filename; + std::string Filename = Input.Filename; auto ReaderOrErr = InstrProfReader::create(Input.Filename); if (Error E = ReaderOrErr.takeError()) { // Skip the empty profiles by returning sliently. instrprof_error IPE = InstrProfError::take(std::move(E)); if (IPE != instrprof_error::empty_raw_profile) - WC->Err = make_error(IPE); + WC->Errors.emplace_back(make_error(IPE), Filename); return; } @@ -253,9 +243,11 @@ static void loadInput(const WeightedFile &Input, SymbolRemapper *Remapper, bool IsIRProfile = Reader->isIRLevelProfile(); bool HasCSIRProfile = Reader->hasCSIRLevelProfile(); if (WC->Writer.setIsIRLevelProfile(IsIRProfile, HasCSIRProfile)) { - WC->Err = make_error( - "Merge IR generated profile with Clang generated profile.", - std::error_code()); + WC->Errors.emplace_back( + make_error( + "Merge IR generated profile with Clang generated profile.", + std::error_code()), + Filename); return; } @@ -278,30 +270,23 @@ static void loadInput(const WeightedFile &Input, SymbolRemapper *Remapper, FuncName, firstTime); }); } - if (Reader->hasError()) { - if (Error E = Reader->getError()) { - instrprof_error IPE = InstrProfError::take(std::move(E)); - if (isFatalError(IPE)) - WC->Err = make_error(IPE); - } - } + if (Reader->hasError()) + if (Error E = Reader->getError()) + WC->Errors.emplace_back(std::move(E), Filename); } /// Merge the \p Src writer context into \p Dst. static void mergeWriterContexts(WriterContext *Dst, WriterContext *Src) { - // If we've already seen a hard error, continuing with the merge would - // clobber it. 
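
The merge path now defers problems as (Error, filename) pairs on each WriterContext instead of keeping a single pending Err. A simplified sketch of recording and draining such a list; the struct is trimmed to the relevant field and the names are illustrative.

  #include "llvm/ADT/StringRef.h"
  #include "llvm/Support/Error.h"
  #include "llvm/Support/raw_ostream.h"
  #include <string>
  #include <utility>
  #include <vector>

  struct MiniWriterContext {
    std::vector<std::pair<llvm::Error, std::string>> Errors;
  };

  static void recordError(MiniWriterContext &WC, llvm::Error E,
                          llvm::StringRef Filename) {
    WC.Errors.emplace_back(std::move(E), Filename.str());
  }

  static unsigned drainAndCount(MiniWriterContext &WC) {
    unsigned NumErrors = 0;
    for (auto &P : WC.Errors) {
      ++NumErrors;
      llvm::errs() << "warning: " << P.second << ": "
                   << llvm::toString(std::move(P.first)) << "\n";
    }
    WC.Errors.clear();
    return NumErrors;
  }
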
- if (Dst->Err || Src->Err) - return; + for (auto &ErrorPair : Src->Errors) + Dst->Errors.push_back(std::move(ErrorPair)); + Src->Errors.clear(); - bool Reported = false; Dst->Writer.mergeRecordsFromWriter(std::move(Src->Writer), [&](Error E) { - if (Reported) { - consumeError(std::move(E)); - return; - } - Reported = true; - Dst->Err = std::move(E); + instrprof_error IPE = InstrProfError::take(std::move(E)); + std::unique_lock ErrGuard{Dst->ErrLock}; + bool firstTime = Dst->WriterErrorCodes.insert(IPE).second; + if (firstTime) + warn(toString(make_error(IPE))); }); } @@ -309,12 +294,12 @@ static void mergeInstrProfile(const WeightedFileVector &Inputs, SymbolRemapper *Remapper, StringRef OutputFilename, ProfileFormat OutputFormat, bool OutputSparse, - unsigned NumThreads) { + unsigned NumThreads, FailureMode FailMode) { if (OutputFilename.compare("-") == 0) exitWithError("Cannot write indexed profdata format to stdout."); if (OutputFormat != PF_Binary && OutputFormat != PF_Compact_Binary && - OutputFormat != PF_Text) + OutputFormat != PF_Ext_Binary && OutputFormat != PF_Text) exitWithError("Unknown format is specified."); std::mutex ErrorLock; @@ -328,7 +313,7 @@ static void mergeInstrProfile(const WeightedFileVector &Inputs, // Initialize the writer contexts. SmallVector, 4> Contexts; for (unsigned I = 0; I < NumThreads; ++I) - Contexts.emplace_back(llvm::make_unique( + Contexts.emplace_back(std::make_unique( OutputSparse, ErrorLock, WriterErrorCodes)); if (NumThreads == 1) { @@ -364,23 +349,21 @@ static void mergeInstrProfile(const WeightedFileVector &Inputs, } while (Mid > 0); } - // Handle deferred hard errors encountered during merging. + // Handle deferred errors encountered during merging. If the number of errors + // is equal to the number of inputs the merge failed. 
+ unsigned NumErrors = 0; for (std::unique_ptr &WC : Contexts) { - if (!WC->Err) - continue; - if (!WC->Err.isA()) - exitWithError(std::move(WC->Err), WC->ErrWhence); - - instrprof_error IPE = InstrProfError::take(std::move(WC->Err)); - if (isFatalError(IPE)) - exitWithError(make_error(IPE), WC->ErrWhence); - else - warn(toString(make_error(IPE)), - WC->ErrWhence); + for (auto &ErrorPair : WC->Errors) { + ++NumErrors; + warn(toString(std::move(ErrorPair.first)), ErrorPair.second); + } } + if (NumErrors == Inputs.size() || + (NumErrors > 0 && FailMode == failIfAnyAreInvalid)) + exitWithError("No profiles could be merged."); std::error_code EC; - raw_fd_ostream Output(OutputFilename.data(), EC, sys::fs::F_None); + raw_fd_ostream Output(OutputFilename.data(), EC, sys::fs::OF_None); if (EC) exitWithErrorCode(EC, OutputFilename); @@ -425,21 +408,78 @@ remapSamples(const sampleprof::FunctionSamples &Samples, } static sampleprof::SampleProfileFormat FormatMap[] = { - sampleprof::SPF_None, sampleprof::SPF_Text, sampleprof::SPF_Compact_Binary, - sampleprof::SPF_GCC, sampleprof::SPF_Binary}; + sampleprof::SPF_None, + sampleprof::SPF_Text, + sampleprof::SPF_Compact_Binary, + sampleprof::SPF_Ext_Binary, + sampleprof::SPF_GCC, + sampleprof::SPF_Binary}; + +static std::unique_ptr +getInputFileBuf(const StringRef &InputFile) { + if (InputFile == "") + return {}; + + auto BufOrError = MemoryBuffer::getFileOrSTDIN(InputFile); + if (!BufOrError) + exitWithErrorCode(BufOrError.getError(), InputFile); + + return std::move(*BufOrError); +} + +static void populateProfileSymbolList(MemoryBuffer *Buffer, + sampleprof::ProfileSymbolList &PSL) { + if (!Buffer) + return; + + SmallVector SymbolVec; + StringRef Data = Buffer->getBuffer(); + Data.split(SymbolVec, '\n', /*MaxSplit=*/-1, /*KeepEmpty=*/false); + + for (StringRef symbol : SymbolVec) + PSL.add(symbol); +} + +static void handleExtBinaryWriter(sampleprof::SampleProfileWriter &Writer, + ProfileFormat OutputFormat, + MemoryBuffer *Buffer, + sampleprof::ProfileSymbolList &WriterList, + bool CompressAllSections) { + populateProfileSymbolList(Buffer, WriterList); + if (WriterList.size() > 0 && OutputFormat != PF_Ext_Binary) + warn("Profile Symbol list is not empty but the output format is not " + "ExtBinary format. The list will be lost in the output. "); + + Writer.setProfileSymbolList(&WriterList); + + if (CompressAllSections) { + if (OutputFormat != PF_Ext_Binary) { + warn("-compress-all-section is ignored. 
Specify -extbinary to enable it"); + } else { + auto ExtBinaryWriter = + static_cast(&Writer); + ExtBinaryWriter->setToCompressAllSections(); + } + } +} static void mergeSampleProfile(const WeightedFileVector &Inputs, SymbolRemapper *Remapper, StringRef OutputFilename, - ProfileFormat OutputFormat) { + ProfileFormat OutputFormat, + StringRef ProfileSymbolListFile, + bool CompressAllSections, FailureMode FailMode) { using namespace sampleprof; StringMap ProfileMap; SmallVector, 5> Readers; LLVMContext Context; + sampleprof::ProfileSymbolList WriterList; for (const auto &Input : Inputs) { auto ReaderOrErr = SampleProfileReader::create(Input.Filename, Context); - if (std::error_code EC = ReaderOrErr.getError()) - exitWithErrorCode(EC, Input.Filename); + if (std::error_code EC = ReaderOrErr.getError()) { + warnOrExitGivenError(FailMode, EC, Input.Filename); + continue; + } // We need to keep the readers around until after all the files are // read so that we do not lose the function names stored in each @@ -447,8 +487,11 @@ static void mergeSampleProfile(const WeightedFileVector &Inputs, // merged profile map. Readers.push_back(std::move(ReaderOrErr.get())); const auto Reader = Readers.back().get(); - if (std::error_code EC = Reader->read()) - exitWithErrorCode(EC, Input.Filename); + if (std::error_code EC = Reader->read()) { + warnOrExitGivenError(FailMode, EC, Input.Filename); + Readers.pop_back(); + continue; + } StringMap &Profiles = Reader->getProfiles(); for (StringMap::iterator I = Profiles.begin(), @@ -466,6 +509,11 @@ static void mergeSampleProfile(const WeightedFileVector &Inputs, handleMergeWriterError(errorCodeToError(EC), Input.Filename, FName); } } + + std::unique_ptr ReaderList = + Reader->getProfileSymbolList(); + if (ReaderList) + WriterList.merge(*ReaderList); } auto WriterOrErr = SampleProfileWriter::create(OutputFilename, FormatMap[OutputFormat]); @@ -473,6 +521,11 @@ static void mergeSampleProfile(const WeightedFileVector &Inputs, exitWithErrorCode(EC, OutputFilename); auto Writer = std::move(WriterOrErr.get()); + // WriterList will have StringRef refering to string in Buffer. + // Make sure Buffer lives as long as WriterList. 
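
populateProfileSymbolList splits the whole buffer on newlines and drops empty entries. A standalone sketch of that StringRef::split call with illustrative data:

  #include "llvm/ADT/SmallVector.h"
  #include "llvm/ADT/StringRef.h"

  void splitExample() {
    llvm::StringRef Data = "main\n_Zfoo\n\n_Zbar\n"; // illustrative contents
    llvm::SmallVector<llvm::StringRef, 8> Symbols;
    // KeepEmpty=false drops the blank line; MaxSplit=-1 means split everywhere.
    Data.split(Symbols, '\n', /*MaxSplit=*/-1, /*KeepEmpty=*/false);
    // Symbols now holds {"main", "_Zfoo", "_Zbar"}.
  }
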
+ auto Buffer = getInputFileBuf(ProfileSymbolListFile); + handleExtBinaryWriter(*Writer, OutputFormat, Buffer.get(), WriterList, + CompressAllSections); Writer->write(ProfileMap); } @@ -487,18 +540,6 @@ static WeightedFile parseWeightedFile(const StringRef &WeightedFilename) { return {FileName, Weight}; } -static std::unique_ptr -getInputFilenamesFileBuf(const StringRef &InputFilenamesFile) { - if (InputFilenamesFile == "") - return {}; - - auto BufOrError = MemoryBuffer::getFileOrSTDIN(InputFilenamesFile); - if (!BufOrError) - exitWithErrorCode(BufOrError.getError(), InputFilenamesFile); - - return std::move(*BufOrError); -} - static void addWeightedInput(WeightedFileVector &WNI, const WeightedFile &WF) { StringRef Filename = WF.Filename; uint64_t Weight = WF.Weight; @@ -583,12 +624,20 @@ static int merge_main(int argc, const char *argv[]) { clEnumVal(sample, "Sample profile"))); cl::opt OutputFormat( cl::desc("Format of output profile"), cl::init(PF_Binary), - cl::values(clEnumValN(PF_Binary, "binary", "Binary encoding (default)"), - clEnumValN(PF_Compact_Binary, "compbinary", - "Compact binary encoding"), - clEnumValN(PF_Text, "text", "Text encoding"), - clEnumValN(PF_GCC, "gcc", - "GCC encoding (only meaningful for -sample)"))); + cl::values( + clEnumValN(PF_Binary, "binary", "Binary encoding (default)"), + clEnumValN(PF_Compact_Binary, "compbinary", + "Compact binary encoding"), + clEnumValN(PF_Ext_Binary, "extbinary", "Extensible binary encoding"), + clEnumValN(PF_Text, "text", "Text encoding"), + clEnumValN(PF_GCC, "gcc", + "GCC encoding (only meaningful for -sample)"))); + cl::opt FailureMode( + "failure-mode", cl::init(failIfAnyAreInvalid), cl::desc("Failure mode:"), + cl::values(clEnumValN(failIfAnyAreInvalid, "any", + "Fail if any profile is invalid."), + clEnumValN(failIfAllAreInvalid, "all", + "Fail only if all profiles are invalid."))); cl::opt OutputSparse("sparse", cl::init(false), cl::desc("Generate a sparse profile (only meaningful for -instr)")); cl::opt NumThreads( @@ -596,6 +645,14 @@ static int merge_main(int argc, const char *argv[]) { cl::desc("Number of merge threads to use (default: autodetect)")); cl::alias NumThreadsA("j", cl::desc("Alias for --num-threads"), cl::aliasopt(NumThreads)); + cl::opt ProfileSymbolListFile( + "prof-sym-list", cl::init(""), + cl::desc("Path to file containing the list of function symbols " + "used to populate profile symbol list")); + cl::opt CompressAllSections( + "compress-all-sections", cl::init(false), cl::Hidden, + cl::desc("Compress all sections when writing the profile (only " + "meaningful for -extbinary)")); cl::ParseCommandLineOptions(argc, argv, "LLVM profile data merger\n"); @@ -607,7 +664,7 @@ static int merge_main(int argc, const char *argv[]) { // Make sure that the file buffer stays alive for the duration of the // weighted input vector's lifetime. 
- auto Buffer = getInputFilenamesFileBuf(InputFilenamesFile); + auto Buffer = getInputFileBuf(InputFilenamesFile); parseInputFilenamesFile(Buffer.get(), WeightedInputs); if (WeightedInputs.empty()) @@ -626,10 +683,11 @@ static int merge_main(int argc, const char *argv[]) { if (ProfileKind == instr) mergeInstrProfile(WeightedInputs, Remapper.get(), OutputFilename, - OutputFormat, OutputSparse, NumThreads); + OutputFormat, OutputSparse, NumThreads, FailureMode); else mergeSampleProfile(WeightedInputs, Remapper.get(), OutputFilename, - OutputFormat); + OutputFormat, ProfileSymbolListFile, CompressAllSections, + FailureMode); return 0; } @@ -644,7 +702,7 @@ static void overlapInstrProfile(const std::string &BaseFilename, WriterContext Context(false, ErrorLock, WriterErrorCodes); WeightedFile WeightedInput{BaseFilename, 1}; OverlapStats Overlap; - Error E = Overlap.accumuateCounts(BaseFilename, TestFilename, IsCS); + Error E = Overlap.accumulateCounts(BaseFilename, TestFilename, IsCS); if (E) exitWithError(std::move(E), "Error in getting profile count sums"); if (Overlap.Base.CountSum < 1.0f) { @@ -682,7 +740,7 @@ static int overlap_main(int argc, const char *argv[]) { cl::ParseCommandLineOptions(argc, argv, "LLVM profile data overlap tool\n"); std::error_code EC; - raw_fd_ostream OS(Output.data(), EC, sys::fs::F_Text); + raw_fd_ostream OS(Output.data(), EC, sys::fs::OF_Text); if (EC) exitWithErrorCode(EC, Output); @@ -944,10 +1002,21 @@ static int showInstrProfile(const std::string &Filename, bool ShowCounts, return 0; } +static void showSectionInfo(sampleprof::SampleProfileReader *Reader, + raw_fd_ostream &OS) { + if (!Reader->dumpSectionInfo(OS)) { + WithColor::warning() << "-show-sec-info-only is only supported for " + << "sample profile in extbinary format and is " + << "ignored for other formats.\n"; + return; + } +} + static int showSampleProfile(const std::string &Filename, bool ShowCounts, bool ShowAllFunctions, const std::string &ShowFunction, - raw_fd_ostream &OS) { + bool ShowProfileSymbolList, + bool ShowSectionInfoOnly, raw_fd_ostream &OS) { using namespace sampleprof; LLVMContext Context; auto ReaderOrErr = SampleProfileReader::create(Filename, Context); @@ -955,6 +1024,12 @@ static int showSampleProfile(const std::string &Filename, bool ShowCounts, exitWithErrorCode(EC, Filename); auto Reader = std::move(ReaderOrErr.get()); + + if (ShowSectionInfoOnly) { + showSectionInfo(Reader.get(), OS); + return 0; + } + if (std::error_code EC = Reader->read()) exitWithErrorCode(EC, Filename); @@ -963,6 +1038,12 @@ static int showSampleProfile(const std::string &Filename, bool ShowCounts, else Reader->dumpFunctionProfile(ShowFunction, OS); + if (ShowProfileSymbolList) { + std::unique_ptr ReaderList = + Reader->getProfileSymbolList(); + ReaderList->dump(OS); + } + return 0; } @@ -1015,6 +1096,15 @@ static int show_main(int argc, const char *argv[]) { "list-below-cutoff", cl::init(false), cl::desc("Only output names of functions whose max count values are " "below the cutoff value")); + cl::opt ShowProfileSymbolList( + "show-prof-sym-list", cl::init(false), + cl::desc("Show profile symbol list if it exists in the profile. ")); + cl::opt ShowSectionInfoOnly( + "show-sec-info-only", cl::init(false), + cl::desc("Show the information of each section in the sample profile. 
" + "The flag is only usable when the sample profile is in " + "extbinary format")); + cl::ParseCommandLineOptions(argc, argv, "LLVM profile data summary\n"); if (OutputFilename.empty()) @@ -1027,7 +1117,7 @@ static int show_main(int argc, const char *argv[]) { } std::error_code EC; - raw_fd_ostream OS(OutputFilename.data(), EC, sys::fs::F_Text); + raw_fd_ostream OS(OutputFilename.data(), EC, sys::fs::OF_Text); if (EC) exitWithErrorCode(EC, OutputFilename); @@ -1042,7 +1132,8 @@ static int show_main(int argc, const char *argv[]) { OnlyListBelow, ShowFunction, TextFormat, OS); else return showSampleProfile(Filename, ShowCounts, ShowAllFunctions, - ShowFunction, OS); + ShowFunction, ShowProfileSymbolList, + ShowSectionInfoOnly, OS); } int main(int argc, const char *argv[]) { diff --git a/tools/llvm-readobj/ARMEHABIPrinter.h b/tools/llvm-readobj/ARMEHABIPrinter.h index 11f9d6166a59..2c0912038c31 100644 --- a/tools/llvm-readobj/ARMEHABIPrinter.h +++ b/tools/llvm-readobj/ARMEHABIPrinter.h @@ -329,6 +329,7 @@ class PrinterContext { ScopedPrinter &SW; const object::ELFFile *ELF; + StringRef FileName; const Elf_Shdr *Symtab; ArrayRef ShndxTable; @@ -352,8 +353,8 @@ class PrinterContext { public: PrinterContext(ScopedPrinter &SW, const object::ELFFile *ELF, - const Elf_Shdr *Symtab) - : SW(SW), ELF(ELF), Symtab(Symtab) {} + StringRef FileName, const Elf_Shdr *Symtab) + : SW(SW), ELF(ELF), FileName(FileName), Symtab(Symtab) {} void PrintUnwindInformation() const; }; @@ -369,10 +370,10 @@ PrinterContext::FunctionAtAddress(unsigned Section, return readobj_error::unknown_symbol; auto StrTableOrErr = ELF->getStringTableForSymtab(*Symtab); if (!StrTableOrErr) - error(StrTableOrErr.takeError()); + reportError(StrTableOrErr.takeError(), FileName); StringRef StrTable = *StrTableOrErr; - for (const Elf_Sym &Sym : unwrapOrError(ELF->symbols(Symtab))) + for (const Elf_Sym &Sym : unwrapOrError(FileName, ELF->symbols(Symtab))) if (Sym.st_shndx == Section && Sym.st_value == Address && Sym.getType() == ELF::STT_FUNC) { auto NameOrErr = Sym.getName(StrTable); @@ -398,16 +399,16 @@ PrinterContext::FindExceptionTable(unsigned IndexSectionIndex, /// handling table. Use this symbol to recover the actual exception handling /// table. 
- for (const Elf_Shdr &Sec : unwrapOrError(ELF->sections())) { + for (const Elf_Shdr &Sec : unwrapOrError(FileName, ELF->sections())) { if (Sec.sh_type != ELF::SHT_REL || Sec.sh_info != IndexSectionIndex) continue; auto SymTabOrErr = ELF->getSection(Sec.sh_link); if (!SymTabOrErr) - error(SymTabOrErr.takeError()); + reportError(SymTabOrErr.takeError(), FileName); const Elf_Shdr *SymTab = *SymTabOrErr; - for (const Elf_Rel &R : unwrapOrError(ELF->rels(&Sec))) { + for (const Elf_Rel &R : unwrapOrError(FileName, ELF->rels(&Sec))) { if (R.r_offset != static_cast(IndexTableOffset)) continue; @@ -417,7 +418,7 @@ PrinterContext::FindExceptionTable(unsigned IndexSectionIndex, RelA.r_addend = 0; const Elf_Sym *Symbol = - unwrapOrError(ELF->getRelocationSymbol(&RelA, SymTab)); + unwrapOrError(FileName, ELF->getRelocationSymbol(&RelA, SymTab)); auto Ret = ELF->getSection(Symbol, SymTab, ShndxTable); if (!Ret) @@ -570,7 +571,7 @@ void PrinterContext::PrintUnwindInformation() const { DictScope UI(SW, "UnwindInformation"); int SectionIndex = 0; - for (const Elf_Shdr &Sec : unwrapOrError(ELF->sections())) { + for (const Elf_Shdr &Sec : unwrapOrError(FileName, ELF->sections())) { if (Sec.sh_type == ELF::SHT_ARM_EXIDX) { DictScope UIT(SW, "UnwindIndexTable"); diff --git a/tools/llvm-readobj/ARMWinEHPrinter.cpp b/tools/llvm-readobj/ARMWinEHPrinter.cpp index 4de14e2e78d5..3e026f58871b 100644 --- a/tools/llvm-readobj/ARMWinEHPrinter.cpp +++ b/tools/llvm-readobj/ARMWinEHPrinter.cpp @@ -842,8 +842,10 @@ bool Decoder::dumpXDataRecord(const COFFObjectFile &COFF, if ((int64_t)(Contents.size() - Offset - 4 * HeaderWords(XData) - (XData.E() ? 0 : XData.EpilogueCount() * 4) - - (XData.X() ? 8 : 0)) < (int64_t)ByteCodeLength) + (XData.X() ? 8 : 0)) < (int64_t)ByteCodeLength) { + SW.flush(); report_fatal_error("Malformed unwind data"); + } if (XData.E()) { ArrayRef UC = XData.UnwindByteCode(); @@ -1039,10 +1041,7 @@ bool Decoder::dumpPackedEntry(const object::COFFObjectFile &COFF, } FunctionAddress = *FunctionAddressOrErr; } else { - const pe32_header *PEHeader; - if (COFF.getPE32Header(PEHeader)) - return false; - FunctionAddress = PEHeader->ImageBase + RF.BeginAddress; + FunctionAddress = COFF.getPE32Header()->ImageBase + RF.BeginAddress; } SW.printString("Function", formatSymbol(FunctionName, FunctionAddress)); diff --git a/tools/llvm-readobj/COFFDumper.cpp b/tools/llvm-readobj/COFFDumper.cpp index 4c2e39dfa3cc..9b2c6adb9d93 100644 --- a/tools/llvm-readobj/COFFDumper.cpp +++ b/tools/llvm-readobj/COFFDumper.cpp @@ -60,6 +60,10 @@ using namespace llvm::codeview; using namespace llvm::support; using namespace llvm::Win64EH; +static inline Error createError(const Twine &Err) { + return make_error(Err, object_error::parse_failed); +} + namespace { struct LoadConfigTables { @@ -167,9 +171,6 @@ private: void printDelayImportedSymbols( const DelayImportDirectoryEntryRef &I, iterator_range Range); - ErrorOr - getResourceDirectoryTableEntry(const coff_resource_dir_table &Table, - uint32_t Index); typedef DenseMap > RelocMapTy; @@ -627,14 +628,10 @@ void COFFDumper::printFileHeaders() { // Print PE header. This header does not exist if this is an object file and // not an executable. 
- const pe32_header *PEHeader = nullptr; - error(Obj->getPE32Header(PEHeader)); - if (PEHeader) + if (const pe32_header *PEHeader = Obj->getPE32Header()) printPEHeader(PEHeader); - const pe32plus_header *PEPlusHeader = nullptr; - error(Obj->getPE32PlusHeader(PEPlusHeader)); - if (PEPlusHeader) + if (const pe32plus_header *PEPlusHeader = Obj->getPE32PlusHeader()) printPEHeader(PEPlusHeader); if (const dos_header *DH = Obj->getDOSHeader()) @@ -728,7 +725,9 @@ void COFFDumper::printCOFFDebugDirectory() { if (D.Type == COFF::IMAGE_DEBUG_TYPE_CODEVIEW) { const codeview::DebugInfo *DebugInfo; StringRef PDBFileName; - error(Obj->getDebugPDBInfo(&D, DebugInfo, PDBFileName)); + if (std::error_code EC = Obj->getDebugPDBInfo(&D, DebugInfo, PDBFileName)) + reportError(errorCodeToError(EC), Obj->getFileName()); + DictScope PDBScope(W, "PDBInfo"); W.printHex("PDBSignature", DebugInfo->Signature.CVSignature); if (DebugInfo->Signature.CVSignature == OMF::Signature::PDB70) { @@ -740,8 +739,9 @@ void COFFDumper::printCOFFDebugDirectory() { // FIXME: Type values of 12 and 13 are commonly observed but are not in // the documented type enum. Figure out what they mean. ArrayRef RawData; - error( - Obj->getRvaAndSizeAsBytes(D.AddressOfRawData, D.SizeOfData, RawData)); + if (std::error_code EC = Obj->getRvaAndSizeAsBytes(D.AddressOfRawData, + D.SizeOfData, RawData)) + reportError(errorCodeToError(EC), Obj->getFileName()); W.printBinaryBlock("RawData", RawData); } } @@ -750,8 +750,11 @@ void COFFDumper::printCOFFDebugDirectory() { void COFFDumper::printRVATable(uint64_t TableVA, uint64_t Count, uint64_t EntrySize, PrintExtraCB PrintExtra) { uintptr_t TableStart, TableEnd; - error(Obj->getVaPtr(TableVA, TableStart)); - error(Obj->getVaPtr(TableVA + Count * EntrySize - 1, TableEnd)); + if (std::error_code EC = Obj->getVaPtr(TableVA, TableStart)) + reportError(errorCodeToError(EC), Obj->getFileName()); + if (std::error_code EC = + Obj->getVaPtr(TableVA + Count * EntrySize - 1, TableEnd)) + reportError(errorCodeToError(EC), Obj->getFileName()); TableEnd++; for (uintptr_t I = TableStart; I < TableEnd; I += EntrySize) { uint32_t RVA = *reinterpret_cast(I); @@ -887,16 +890,14 @@ void COFFDumper::printBaseOfDataField(const pe32plus_header *) {} void COFFDumper::printCodeViewDebugInfo() { // Print types first to build CVUDTNames, then print symbols. for (const SectionRef &S : Obj->sections()) { - StringRef SectionName; - error(S.getName(SectionName)); + StringRef SectionName = unwrapOrError(Obj->getFileName(), S.getName()); // .debug$T is a standard CodeView type section, while .debug$P is the same // format but used for MSVC precompiled header object files. 
if (SectionName == ".debug$T" || SectionName == ".debug$P") printCodeViewTypeSection(SectionName, S); } for (const SectionRef &S : Obj->sections()) { - StringRef SectionName; - error(S.getName(SectionName)); + StringRef SectionName = unwrapOrError(Obj->getFileName(), S.getName()); if (SectionName == ".debug$S") printCodeViewSymbolSection(SectionName, S); } @@ -908,32 +909,40 @@ void COFFDumper::initializeFileAndStringTables(BinaryStreamReader &Reader) { // The section consists of a number of subsection in the following format: // |SubSectionType|SubSectionSize|Contents...| uint32_t SubType, SubSectionSize; - error(Reader.readInteger(SubType)); - error(Reader.readInteger(SubSectionSize)); + + if (Error E = Reader.readInteger(SubType)) + reportError(std::move(E), Obj->getFileName()); + if (Error E = Reader.readInteger(SubSectionSize)) + reportError(std::move(E), Obj->getFileName()); StringRef Contents; - error(Reader.readFixedString(Contents, SubSectionSize)); + if (Error E = Reader.readFixedString(Contents, SubSectionSize)) + reportError(std::move(E), Obj->getFileName()); BinaryStreamRef ST(Contents, support::little); switch (DebugSubsectionKind(SubType)) { case DebugSubsectionKind::FileChecksums: - error(CVFileChecksumTable.initialize(ST)); + if (Error E = CVFileChecksumTable.initialize(ST)) + reportError(std::move(E), Obj->getFileName()); break; case DebugSubsectionKind::StringTable: - error(CVStringTable.initialize(ST)); + if (Error E = CVStringTable.initialize(ST)) + reportError(std::move(E), Obj->getFileName()); break; default: break; } uint32_t PaddedSize = alignTo(SubSectionSize, 4); - error(Reader.skip(PaddedSize - SubSectionSize)); + if (Error E = Reader.skip(PaddedSize - SubSectionSize)) + reportError(std::move(E), Obj->getFileName()); } } void COFFDumper::printCodeViewSymbolSection(StringRef SectionName, const SectionRef &Section) { - StringRef SectionContents = unwrapOrError(Section.getContents()); + StringRef SectionContents = + unwrapOrError(Obj->getFileName(), Section.getContents()); StringRef Data = SectionContents; SmallVector FunctionNames; @@ -944,10 +953,13 @@ void COFFDumper::printCodeViewSymbolSection(StringRef SectionName, W.printNumber("Section", SectionName, Obj->getSectionID(Section)); uint32_t Magic; - error(consume(Data, Magic)); + if (Error E = consume(Data, Magic)) + reportError(std::move(E), Obj->getFileName()); + W.printHex("Magic", Magic); if (Magic != COFF::DEBUG_SECTION_MAGIC) - return error(object_error::parse_failed); + reportError(errorCodeToError(object_error::parse_failed), + Obj->getFileName()); BinaryStreamReader FSReader(Data, support::little); initializeFileAndStringTables(FSReader); @@ -957,8 +969,10 @@ void COFFDumper::printCodeViewSymbolSection(StringRef SectionName, // The section consists of a number of subsection in the following format: // |SubSectionType|SubSectionSize|Contents...| uint32_t SubType, SubSectionSize; - error(consume(Data, SubType)); - error(consume(Data, SubSectionSize)); + if (Error E = consume(Data, SubType)) + reportError(std::move(E), Obj->getFileName()); + if (Error E = consume(Data, SubSectionSize)) + reportError(std::move(E), Obj->getFileName()); ListScope S(W, "Subsection"); // Dump the subsection as normal even if the ignore bit is set. @@ -971,7 +985,8 @@ void COFFDumper::printCodeViewSymbolSection(StringRef SectionName, // Get the contents of the subsection. 
if (SubSectionSize > Data.size()) - return error(object_error::parse_failed); + return reportError(errorCodeToError(object_error::parse_failed), + Obj->getFileName()); StringRef Contents = Data.substr(0, SubSectionSize); // Add SubSectionSize to the current offset and align that offset to find @@ -980,7 +995,8 @@ void COFFDumper::printCodeViewSymbolSection(StringRef SectionName, size_t NextOffset = SectionOffset + SubSectionSize; NextOffset = alignTo(NextOffset, 4); if (NextOffset > SectionContents.size()) - return error(object_error::parse_failed); + return reportError(errorCodeToError(object_error::parse_failed), + Obj->getFileName()); Data = SectionContents.drop_front(NextOffset); // Optionally print the subsection bytes in case our parsing gets confused @@ -1010,17 +1026,21 @@ void COFFDumper::printCodeViewSymbolSection(StringRef SectionName, if (SubSectionSize < 12) { // There should be at least three words to store two function // relocations and size of the code. - error(object_error::parse_failed); + reportError(errorCodeToError(object_error::parse_failed), + Obj->getFileName()); return; } StringRef LinkageName; - error(resolveSymbolName(Obj->getCOFFSection(Section), SectionOffset, - LinkageName)); + if (std::error_code EC = resolveSymbolName(Obj->getCOFFSection(Section), + SectionOffset, LinkageName)) + reportError(errorCodeToError(EC), Obj->getFileName()); + W.printString("LinkageName", LinkageName); if (FunctionLineTables.count(LinkageName) != 0) { // Saw debug info for this function already? - error(object_error::parse_failed); + reportError(errorCodeToError(object_error::parse_failed), + Obj->getFileName()); return; } @@ -1033,17 +1053,21 @@ void COFFDumper::printCodeViewSymbolSection(StringRef SectionName, BinaryStreamReader SR(Contents, llvm::support::little); DebugFrameDataSubsectionRef FrameData; - error(FrameData.initialize(SR)); + if (Error E = FrameData.initialize(SR)) + reportError(std::move(E), Obj->getFileName()); StringRef LinkageName; - error(resolveSymbolName(Obj->getCOFFSection(Section), SectionContents, - FrameData.getRelocPtr(), LinkageName)); + if (std::error_code EC = + resolveSymbolName(Obj->getCOFFSection(Section), SectionContents, + FrameData.getRelocPtr(), LinkageName)) + reportError(errorCodeToError(EC), Obj->getFileName()); W.printString("LinkageName", LinkageName); // To find the active frame description, search this array for the // smallest PC range that includes the current PC. 
for (const auto &FD : FrameData) { - StringRef FrameFunc = error(CVStringTable.getString(FD.FrameFunc)); + StringRef FrameFunc = unwrapOrError( + Obj->getFileName(), CVStringTable.getString(FD.FrameFunc)); DictScope S(W, "FrameData"); W.printHex("RvaStart", FD.RvaStart); @@ -1094,7 +1118,8 @@ void COFFDumper::printCodeViewSymbolSection(StringRef SectionName, BinaryStreamReader Reader(FunctionLineTables[Name], support::little); DebugLinesSubsectionRef LineInfo; - error(LineInfo.initialize(Reader)); + if (Error E = LineInfo.initialize(Reader)) + reportError(std::move(E), Obj->getFileName()); W.printHex("Flags", LineInfo.header()->Flags); W.printHex("CodeSize", LineInfo.header()->CodeSize); @@ -1105,7 +1130,8 @@ void COFFDumper::printCodeViewSymbolSection(StringRef SectionName, uint32_t ColumnIndex = 0; for (const auto &Line : Entry.LineNumbers) { if (Line.Offset >= LineInfo.header()->CodeSize) { - error(object_error::parse_failed); + reportError(errorCodeToError(object_error::parse_failed), + Obj->getFileName()); return; } @@ -1136,21 +1162,20 @@ void COFFDumper::printCodeViewSymbolsSubsection(StringRef Subsection, StringRef SectionContents) { ArrayRef BinaryData(Subsection.bytes_begin(), Subsection.bytes_end()); - auto CODD = llvm::make_unique(*this, Section, Obj, + auto CODD = std::make_unique(*this, Section, Obj, SectionContents); CVSymbolDumper CVSD(W, Types, CodeViewContainer::ObjectFile, std::move(CODD), CompilationCPUType, opts::CodeViewSubsectionBytes); CVSymbolArray Symbols; BinaryStreamReader Reader(BinaryData, llvm::support::little); - if (auto EC = Reader.readArray(Symbols, Reader.getLength())) { - consumeError(std::move(EC)); + if (Error E = Reader.readArray(Symbols, Reader.getLength())) { W.flush(); - error(object_error::parse_failed); + reportError(std::move(E), Obj->getFileName()); } - if (auto EC = CVSD.dump(Symbols)) { + if (Error E = CVSD.dump(Symbols)) { W.flush(); - error(std::move(EC)); + reportError(std::move(E), Obj->getFileName()); } CompilationCPUType = CVSD.getCompilationCPUType(); W.flush(); @@ -1159,12 +1184,14 @@ void COFFDumper::printCodeViewSymbolsSubsection(StringRef Subsection, void COFFDumper::printCodeViewFileChecksums(StringRef Subsection) { BinaryStreamRef Stream(Subsection, llvm::support::little); DebugChecksumsSubsectionRef Checksums; - error(Checksums.initialize(Stream)); + if (Error E = Checksums.initialize(Stream)) + reportError(std::move(E), Obj->getFileName()); for (auto &FC : Checksums) { DictScope S(W, "FileChecksum"); - StringRef Filename = error(CVStringTable.getString(FC.FileNameOffset)); + StringRef Filename = unwrapOrError( + Obj->getFileName(), CVStringTable.getString(FC.FileNameOffset)); W.printHex("Filename", Filename, FC.FileNameOffset); W.printHex("ChecksumSize", FC.Checksum.size()); W.printEnum("ChecksumKind", uint8_t(FC.Kind), @@ -1177,7 +1204,8 @@ void COFFDumper::printCodeViewFileChecksums(StringRef Subsection) { void COFFDumper::printCodeViewInlineeLines(StringRef Subsection) { BinaryStreamReader SR(Subsection, llvm::support::little); DebugInlineeLinesSubsectionRef Lines; - error(Lines.initialize(SR)); + if (Error E = Lines.initialize(SR)) + reportError(std::move(E), Obj->getFileName()); for (auto &Line : Lines) { DictScope S(W, "InlineeSourceLine"); @@ -1198,15 +1226,18 @@ void COFFDumper::printCodeViewInlineeLines(StringRef Subsection) { StringRef COFFDumper::getFileNameForFileOffset(uint32_t FileOffset) { // The file checksum subsection should precede all references to it. 
if (!CVFileChecksumTable.valid() || !CVStringTable.valid()) - error(object_error::parse_failed); + reportError(errorCodeToError(object_error::parse_failed), + Obj->getFileName()); auto Iter = CVFileChecksumTable.getArray().at(FileOffset); // Check if the file checksum table offset is valid. if (Iter == CVFileChecksumTable.end()) - error(object_error::parse_failed); + reportError(errorCodeToError(object_error::parse_failed), + Obj->getFileName()); - return error(CVStringTable.getString(Iter->FileNameOffset)); + return unwrapOrError(Obj->getFileName(), + CVStringTable.getString(Iter->FileNameOffset)); } void COFFDumper::printFileNameForOffset(StringRef Label, uint32_t FileOffset) { @@ -1219,35 +1250,38 @@ void COFFDumper::mergeCodeViewTypes(MergingTypeTableBuilder &CVIDs, GlobalTypeTableBuilder &GlobalCVTypes, bool GHash) { for (const SectionRef &S : Obj->sections()) { - StringRef SectionName; - error(S.getName(SectionName)); + StringRef SectionName = unwrapOrError(Obj->getFileName(), S.getName()); if (SectionName == ".debug$T") { - StringRef Data = unwrapOrError(S.getContents()); + StringRef Data = unwrapOrError(Obj->getFileName(), S.getContents()); uint32_t Magic; - error(consume(Data, Magic)); + if (Error E = consume(Data, Magic)) + reportError(std::move(E), Obj->getFileName()); + if (Magic != 4) - error(object_error::parse_failed); + reportError(errorCodeToError(object_error::parse_failed), + Obj->getFileName()); CVTypeArray Types; BinaryStreamReader Reader(Data, llvm::support::little); if (auto EC = Reader.readArray(Types, Reader.getLength())) { consumeError(std::move(EC)); W.flush(); - error(object_error::parse_failed); + reportError(errorCodeToError(object_error::parse_failed), + Obj->getFileName()); } SmallVector SourceToDest; Optional PCHSignature; if (GHash) { std::vector Hashes = GloballyHashedType::hashTypes(Types); - if (auto EC = + if (Error E = mergeTypeAndIdRecords(GlobalCVIDs, GlobalCVTypes, SourceToDest, Types, Hashes, PCHSignature)) - return error(std::move(EC)); + return reportError(std::move(E), Obj->getFileName()); } else { - if (auto EC = mergeTypeAndIdRecords(CVIDs, CVTypes, SourceToDest, Types, + if (Error E = mergeTypeAndIdRecords(CVIDs, CVTypes, SourceToDest, Types, PCHSignature)) - return error(std::move(EC)); + return reportError(std::move(E), Obj->getFileName()); } } } @@ -1258,20 +1292,25 @@ void COFFDumper::printCodeViewTypeSection(StringRef SectionName, ListScope D(W, "CodeViewTypes"); W.printNumber("Section", SectionName, Obj->getSectionID(Section)); - StringRef Data = unwrapOrError(Section.getContents()); + StringRef Data = unwrapOrError(Obj->getFileName(), Section.getContents()); if (opts::CodeViewSubsectionBytes) W.printBinaryBlock("Data", Data); uint32_t Magic; - error(consume(Data, Magic)); + if (Error E = consume(Data, Magic)) + reportError(std::move(E), Obj->getFileName()); + W.printHex("Magic", Magic); if (Magic != COFF::DEBUG_SECTION_MAGIC) - return error(object_error::parse_failed); + reportError(errorCodeToError(object_error::parse_failed), + Obj->getFileName()); Types.reset(Data, 100); TypeDumpVisitor TDV(Types, &W, opts::CodeViewSubsectionBytes); - error(codeview::visitTypeStream(Types, TDV)); + if (Error E = codeview::visitTypeStream(Types, TDV)) + reportError(std::move(E), Obj->getFileName()); + W.flush(); } @@ -1282,8 +1321,7 @@ void COFFDumper::printSectionHeaders() { ++SectionNumber; const coff_section *Section = Obj->getCOFFSection(Sec); - StringRef Name; - error(Sec.getName(Name)); + StringRef Name = unwrapOrError(Obj->getFileName(), 
Sec.getName()); DictScope D(W, "Section"); W.printNumber("Number", SectionNumber); @@ -1318,7 +1356,7 @@ void COFFDumper::printSectionHeaders() { if (opts::SectionData && !(Section->Characteristics & COFF::IMAGE_SCN_CNT_UNINITIALIZED_DATA)) { - StringRef Data = unwrapOrError(Sec.getContents()); + StringRef Data = unwrapOrError(Obj->getFileName(), Sec.getContents()); W.printBinaryBlock("SectionData", Data); } } @@ -1330,8 +1368,7 @@ void COFFDumper::printRelocations() { int SectionNumber = 0; for (const SectionRef &Section : Obj->sections()) { ++SectionNumber; - StringRef Name; - error(Section.getName(Name)); + StringRef Name = unwrapOrError(Obj->getFileName(), Section.getName()); bool PrintedGroup = false; for (const RelocationRef &Reloc : Section.relocations()) { @@ -1362,7 +1399,9 @@ void COFFDumper::printRelocation(const SectionRef &Section, int64_t SymbolIndex = -1; if (Symbol != Obj->symbol_end()) { Expected SymbolNameOrErr = Symbol->getName(); - error(errorToErrorCode(SymbolNameOrErr.takeError())); + if (!SymbolNameOrErr) + reportError(SymbolNameOrErr.takeError(), Obj->getFileName()); + SymbolName = *SymbolNameOrErr; SymbolIndex = Obj->getSymbolIndex(Obj->getCOFFSymbol(*Symbol)); } @@ -1439,7 +1478,8 @@ void COFFDumper::printSymbol(const SymbolRef &Sym) { for (uint8_t I = 0; I < Symbol.getNumberOfAuxSymbols(); ++I) { if (Symbol.isFunctionDefinition()) { const coff_aux_function_definition *Aux; - error(getSymbolAuxData(Obj, Symbol, I, Aux)); + if (std::error_code EC = getSymbolAuxData(Obj, Symbol, I, Aux)) + reportError(errorCodeToError(EC), Obj->getFileName()); DictScope AS(W, "AuxFunctionDef"); W.printNumber("TagIndex", Aux->TagIndex); @@ -1449,15 +1489,16 @@ void COFFDumper::printSymbol(const SymbolRef &Sym) { } else if (Symbol.isAnyUndefined()) { const coff_aux_weak_external *Aux; - error(getSymbolAuxData(Obj, Symbol, I, Aux)); + if (std::error_code EC = getSymbolAuxData(Obj, Symbol, I, Aux)) + reportError(errorCodeToError(EC), Obj->getFileName()); Expected Linked = Obj->getSymbol(Aux->TagIndex); + if (!Linked) + reportError(Linked.takeError(), Obj->getFileName()); + StringRef LinkedName; - std::error_code EC = errorToErrorCode(Linked.takeError()); - if (EC || (EC = Obj->getSymbolName(*Linked, LinkedName))) { - LinkedName = ""; - error(EC); - } + if (std::error_code EC = Obj->getSymbolName(*Linked, LinkedName)) + reportError(errorCodeToError(EC), Obj->getFileName()); DictScope AS(W, "AuxWeakExternal"); W.printNumber("Linked", LinkedName, Aux->TagIndex); @@ -1466,8 +1507,8 @@ void COFFDumper::printSymbol(const SymbolRef &Sym) { } else if (Symbol.isFileRecord()) { const char *FileName; - error(getSymbolAuxData(Obj, Symbol, I, FileName)); - + if (std::error_code EC = getSymbolAuxData(Obj, Symbol, I, FileName)) + reportError(errorCodeToError(EC), Obj->getFileName()); DictScope AS(W, "AuxFileRecord"); StringRef Name(FileName, Symbol.getNumberOfAuxSymbols() * @@ -1476,7 +1517,8 @@ void COFFDumper::printSymbol(const SymbolRef &Sym) { break; } else if (Symbol.isSectionDefinition()) { const coff_aux_section_definition *Aux; - error(getSymbolAuxData(Obj, Symbol, I, Aux)); + if (std::error_code EC = getSymbolAuxData(Obj, Symbol, I, Aux)) + reportError(errorCodeToError(EC), Obj->getFileName()); int32_t AuxNumber = Aux->getNumber(Symbol.isBigObj()); @@ -1493,26 +1535,27 @@ void COFFDumper::printSymbol(const SymbolRef &Sym) { const coff_section *Assoc; StringRef AssocName = ""; if (std::error_code EC = Obj->getSection(AuxNumber, Assoc)) - error(EC); + reportError(errorCodeToError(EC), 
Obj->getFileName()); Expected Res = getSectionName(Obj, AuxNumber, Assoc); if (!Res) - error(Res.takeError()); + reportError(Res.takeError(), Obj->getFileName()); AssocName = *Res; W.printNumber("AssocSection", AssocName, AuxNumber); } } else if (Symbol.isCLRToken()) { const coff_aux_clr_token *Aux; - error(getSymbolAuxData(Obj, Symbol, I, Aux)); + if (std::error_code EC = getSymbolAuxData(Obj, Symbol, I, Aux)) + reportError(errorCodeToError(EC), Obj->getFileName()); Expected ReferredSym = Obj->getSymbol(Aux->SymbolTableIndex); + if (!ReferredSym) + reportError(ReferredSym.takeError(), Obj->getFileName()); + StringRef ReferredName; - std::error_code EC = errorToErrorCode(ReferredSym.takeError()); - if (EC || (EC = Obj->getSymbolName(*ReferredSym, ReferredName))) { - ReferredName = ""; - error(EC); - } + if (std::error_code EC = Obj->getSymbolName(*ReferredSym, ReferredName)) + reportError(errorCodeToError(EC), Obj->getFileName()); DictScope AS(W, "AuxCLRToken"); W.printNumber("AuxType", Aux->AuxType); @@ -1578,9 +1621,11 @@ void COFFDumper::printImportedSymbols( iterator_range Range) { for (const ImportedSymbolRef &I : Range) { StringRef Sym; - error(I.getSymbolName(Sym)); + if (std::error_code EC = I.getSymbolName(Sym)) + reportError(errorCodeToError(EC), Obj->getFileName()); uint16_t Ordinal; - error(I.getOrdinal(Ordinal)); + if (std::error_code EC = I.getOrdinal(Ordinal)) + reportError(errorCodeToError(EC), Obj->getFileName()); W.printNumber("Symbol", Sym, Ordinal); } } @@ -1592,12 +1637,17 @@ void COFFDumper::printDelayImportedSymbols( for (const ImportedSymbolRef &S : Range) { DictScope Import(W, "Import"); StringRef Sym; - error(S.getSymbolName(Sym)); + if (std::error_code EC = S.getSymbolName(Sym)) + reportError(errorCodeToError(EC), Obj->getFileName()); + uint16_t Ordinal; - error(S.getOrdinal(Ordinal)); + if (std::error_code EC = S.getOrdinal(Ordinal)) + reportError(errorCodeToError(EC), Obj->getFileName()); W.printNumber("Symbol", Sym, Ordinal); + uint64_t Addr; - error(I.getImportAddress(Index++, Addr)); + if (std::error_code EC = I.getImportAddress(Index++, Addr)) + reportError(errorCodeToError(EC), Obj->getFileName()); W.printHex("Address", Addr); } } @@ -1607,13 +1657,16 @@ void COFFDumper::printCOFFImports() { for (const ImportDirectoryEntryRef &I : Obj->import_directories()) { DictScope Import(W, "Import"); StringRef Name; - error(I.getName(Name)); + if (std::error_code EC = I.getName(Name)) + reportError(errorCodeToError(EC), Obj->getFileName()); W.printString("Name", Name); uint32_t ILTAddr; - error(I.getImportLookupTableRVA(ILTAddr)); + if (std::error_code EC = I.getImportLookupTableRVA(ILTAddr)) + reportError(errorCodeToError(EC), Obj->getFileName()); W.printHex("ImportLookupTableRVA", ILTAddr); uint32_t IATAddr; - error(I.getImportAddressTableRVA(IATAddr)); + if (std::error_code EC = I.getImportAddressTableRVA(IATAddr)) + reportError(errorCodeToError(EC), Obj->getFileName()); W.printHex("ImportAddressTableRVA", IATAddr); // The import lookup table can be missing with certain older linkers, so // fall back to the import address table in that case. 
@@ -1627,10 +1680,12 @@ void COFFDumper::printCOFFImports() { for (const DelayImportDirectoryEntryRef &I : Obj->delay_import_directories()) { DictScope Import(W, "DelayImport"); StringRef Name; - error(I.getName(Name)); + if (std::error_code EC = I.getName(Name)) + reportError(errorCodeToError(EC), Obj->getFileName()); W.printString("Name", Name); const delay_import_directory_table_entry *Table; - error(I.getDelayImportTable(Table)); + if (std::error_code EC = I.getDelayImportTable(Table)) + reportError(errorCodeToError(EC), Obj->getFileName()); W.printHex("Attributes", Table->Attributes); W.printHex("ModuleHandle", Table->ModuleHandle); W.printHex("ImportAddressTable", Table->DelayImportAddressTable); @@ -1648,9 +1703,12 @@ void COFFDumper::printCOFFExports() { StringRef Name; uint32_t Ordinal, RVA; - error(E.getSymbolName(Name)); - error(E.getOrdinal(Ordinal)); - error(E.getExportRVA(RVA)); + if (std::error_code EC = E.getSymbolName(Name)) + reportError(errorCodeToError(EC), Obj->getFileName()); + if (std::error_code EC = E.getOrdinal(Ordinal)) + reportError(errorCodeToError(EC), Obj->getFileName()); + if (std::error_code EC = E.getExportRVA(RVA)) + reportError(errorCodeToError(EC), Obj->getFileName()); W.printNumber("Ordinal", Ordinal); W.printString("Name", Name); @@ -1660,13 +1718,12 @@ void COFFDumper::printCOFFExports() { void COFFDumper::printCOFFDirectives() { for (const SectionRef &Section : Obj->sections()) { - StringRef Name; - - error(Section.getName(Name)); + StringRef Name = unwrapOrError(Obj->getFileName(), Section.getName()); if (Name != ".drectve") continue; - StringRef Contents = unwrapOrError(Section.getContents()); + StringRef Contents = + unwrapOrError(Obj->getFileName(), Section.getContents()); W.printString("Directive(s)", Contents); } } @@ -1689,8 +1746,10 @@ void COFFDumper::printCOFFBaseReloc() { for (const BaseRelocRef &I : Obj->base_relocs()) { uint8_t Type; uint32_t RVA; - error(I.getRVA(RVA)); - error(I.getType(Type)); + if (std::error_code EC = I.getRVA(RVA)) + reportError(errorCodeToError(EC), Obj->getFileName()); + if (std::error_code EC = I.getType(Type)) + reportError(errorCodeToError(EC), Obj->getFileName()); DictScope Import(W, "Entry"); W.printString("Type", getBaseRelocTypeName(Type)); W.printHex("Address", RVA); @@ -1700,16 +1759,18 @@ void COFFDumper::printCOFFBaseReloc() { void COFFDumper::printCOFFResources() { ListScope ResourcesD(W, "Resources"); for (const SectionRef &S : Obj->sections()) { - StringRef Name; - error(S.getName(Name)); + StringRef Name = unwrapOrError(Obj->getFileName(), S.getName()); if (!Name.startswith(".rsrc")) continue; - StringRef Ref = unwrapOrError(S.getContents()); + StringRef Ref = unwrapOrError(Obj->getFileName(), S.getContents()); if ((Name == ".rsrc") || (Name == ".rsrc$01")) { - ResourceSectionRef RSF(Ref); - auto &BaseTable = unwrapOrError(RSF.getBaseTable()); + ResourceSectionRef RSF; + Error E = RSF.load(Obj, S); + if (E) + reportError(std::move(E), Obj->getFileName()); + auto &BaseTable = unwrapOrError(Obj->getFileName(), RSF.getBaseTable()); W.printNumber("Total Number of Resources", countTotalTableEntries(RSF, BaseTable, "Type")); W.printHex("Base Table Address", @@ -1729,14 +1790,15 @@ COFFDumper::countTotalTableEntries(ResourceSectionRef RSF, uint32_t TotalEntries = 0; for (int i = 0; i < Table.NumberOfNameEntries + Table.NumberOfIDEntries; i++) { - auto Entry = unwrapOrError(getResourceDirectoryTableEntry(Table, i)); + auto Entry = unwrapOrError(Obj->getFileName(), RSF.getTableEntry(Table, i)); if 
(Entry.Offset.isSubDir()) { StringRef NextLevel; if (Level == "Name") NextLevel = "Language"; else NextLevel = "Name"; - auto &NextTable = unwrapOrError(RSF.getEntrySubDir(Entry)); + auto &NextTable = + unwrapOrError(Obj->getFileName(), RSF.getEntrySubDir(Entry)); TotalEntries += countTotalTableEntries(RSF, NextTable, NextLevel); } else { TotalEntries += 1; @@ -1755,13 +1817,13 @@ void COFFDumper::printResourceDirectoryTable( // Iterate through level in resource directory tree. for (int i = 0; i < Table.NumberOfNameEntries + Table.NumberOfIDEntries; i++) { - auto Entry = unwrapOrError(getResourceDirectoryTableEntry(Table, i)); + auto Entry = unwrapOrError(Obj->getFileName(), RSF.getTableEntry(Table, i)); StringRef Name; SmallString<20> IDStr; raw_svector_ostream OS(IDStr); if (i < Table.NumberOfNameEntries) { ArrayRef RawEntryNameString = - unwrapOrError(RSF.getEntryNameString(Entry)); + unwrapOrError(Obj->getFileName(), RSF.getEntryNameString(Entry)); std::vector EndianCorrectedNameString; if (llvm::sys::IsBigEndianHost) { EndianCorrectedNameString.resize(RawEntryNameString.size() + 1); @@ -1772,14 +1834,14 @@ void COFFDumper::printResourceDirectoryTable( } std::string EntryNameString; if (!llvm::convertUTF16ToUTF8String(RawEntryNameString, EntryNameString)) - error(object_error::parse_failed); + reportError(errorCodeToError(object_error::parse_failed), + Obj->getFileName()); OS << ": "; OS << EntryNameString; } else { if (Level == "Type") { OS << ": "; printResourceTypeName(Entry.Identifier.ID, OS); - IDStr = IDStr.slice(0, IDStr.find_first_of(")", 0) + 1); } else { OS << ": (ID " << Entry.Identifier.ID << ")"; } @@ -1793,7 +1855,8 @@ void COFFDumper::printResourceDirectoryTable( NextLevel = "Language"; else NextLevel = "Name"; - auto &NextTable = unwrapOrError(RSF.getEntrySubDir(Entry)); + auto &NextTable = + unwrapOrError(Obj->getFileName(), RSF.getEntrySubDir(Entry)); printResourceDirectoryTable(RSF, NextTable, NextLevel); } else { W.printHex("Entry Offset", Entry.Offset.value()); @@ -1804,24 +1867,29 @@ void COFFDumper::printResourceDirectoryTable( W.printNumber("Major Version", Table.MajorVersion); W.printNumber("Minor Version", Table.MinorVersion); W.printNumber("Characteristics", Table.Characteristics); + ListScope DataScope(W, "Data"); + auto &DataEntry = + unwrapOrError(Obj->getFileName(), RSF.getEntryData(Entry)); + W.printHex("DataRVA", DataEntry.DataRVA); + W.printNumber("DataSize", DataEntry.DataSize); + W.printNumber("Codepage", DataEntry.Codepage); + W.printNumber("Reserved", DataEntry.Reserved); + StringRef Contents = + unwrapOrError(Obj->getFileName(), RSF.getContents(DataEntry)); + W.printBinaryBlock("Data", Contents); } } } -ErrorOr -COFFDumper::getResourceDirectoryTableEntry(const coff_resource_dir_table &Table, - uint32_t Index) { - if (Index >= (uint32_t)(Table.NumberOfNameEntries + Table.NumberOfIDEntries)) - return object_error::parse_failed; - auto TablePtr = reinterpret_cast(&Table + 1); - return TablePtr[Index]; -} - void COFFDumper::printStackMap() const { object::SectionRef StackMapSection; for (auto Sec : Obj->sections()) { StringRef Name; - Sec.getName(Name); + if (Expected NameOrErr = Sec.getName()) + Name = *NameOrErr; + else + consumeError(NameOrErr.takeError()); + if (Name == ".llvm_stackmaps") { StackMapSection = Sec; break; @@ -1831,7 +1899,8 @@ void COFFDumper::printStackMap() const { if (StackMapSection == object::SectionRef()) return; - StringRef StackMapContents = unwrapOrError(StackMapSection.getContents()); + StringRef StackMapContents = + 
unwrapOrError(Obj->getFileName(), StackMapSection.getContents()); ArrayRef StackMapContentsArray = arrayRefFromStringRef(StackMapContents); @@ -1847,7 +1916,11 @@ void COFFDumper::printAddrsig() { object::SectionRef AddrsigSection; for (auto Sec : Obj->sections()) { StringRef Name; - Sec.getName(Name); + if (Expected NameOrErr = Sec.getName()) + Name = *NameOrErr; + else + consumeError(NameOrErr.takeError()); + if (Name == ".llvm_addrsig") { AddrsigSection = Sec; break; @@ -1857,7 +1930,8 @@ void COFFDumper::printAddrsig() { if (AddrsigSection == object::SectionRef()) return; - StringRef AddrsigContents = unwrapOrError(AddrsigSection.getContents()); + StringRef AddrsigContents = + unwrapOrError(Obj->getFileName(), AddrsigSection.getContents()); ArrayRef AddrsigContentsArray(AddrsigContents.bytes_begin(), AddrsigContents.size()); @@ -1869,15 +1943,15 @@ void COFFDumper::printAddrsig() { const char *Err; uint64_t SymIndex = decodeULEB128(Cur, &Size, End, &Err); if (Err) - reportError(Err); + reportError(createError(Err), Obj->getFileName()); Expected Sym = Obj->getSymbol(SymIndex); + if (!Sym) + reportError(Sym.takeError(), Obj->getFileName()); + StringRef SymName; - std::error_code EC = errorToErrorCode(Sym.takeError()); - if (EC || (EC = Obj->getSymbolName(*Sym, SymName))) { - SymName = ""; - error(EC); - } + if (std::error_code EC = Obj->getSymbolName(*Sym, SymName)) + reportError(errorCodeToError(EC), Obj->getFileName()); W.printNumber("Sym", SymName, SymIndex); Cur += Size; @@ -1891,7 +1965,8 @@ void llvm::dumpCodeViewMergedTypes(ScopedPrinter &Writer, { ListScope S(Writer, "MergedTypeStream"); TypeDumpVisitor TDV(TpiTypes, &Writer, opts::CodeViewSubsectionBytes); - error(codeview::visitTypeStream(TpiTypes, TDV)); + if (Error Err = codeview::visitTypeStream(TpiTypes, TDV)) + reportError(std::move(Err), ""); Writer.flush(); } @@ -1902,7 +1977,8 @@ void llvm::dumpCodeViewMergedTypes(ScopedPrinter &Writer, ListScope S(Writer, "MergedIDStream"); TypeDumpVisitor TDV(TpiTypes, &Writer, opts::CodeViewSubsectionBytes); TDV.setIpiTypes(IpiTypes); - error(codeview::visitTypeStream(IpiTypes, TDV)); + if (Error Err = codeview::visitTypeStream(IpiTypes, TDV)) + reportError(std::move(Err), ""); Writer.flush(); } } diff --git a/tools/llvm-readobj/DwarfCFIEHPrinter.h b/tools/llvm-readobj/DwarfCFIEHPrinter.h index 7055510ef2f2..0a365d4fe72a 100644 --- a/tools/llvm-readobj/DwarfCFIEHPrinter.h +++ b/tools/llvm-readobj/DwarfCFIEHPrinter.h @@ -44,12 +44,12 @@ public: void printUnwindInformation() const; }; -template -static const typename ELFO::Elf_Shdr *findSectionByAddress(const ELFO *Obj, - uint64_t Addr) { - auto Sections = Obj->sections(); +template +static const typename object::ELFObjectFile::Elf_Shdr * +findSectionByAddress(const object::ELFObjectFile *ObjF, uint64_t Addr) { + auto Sections = ObjF->getELFFile()->sections(); if (Error E = Sections.takeError()) - reportError(toString(std::move(E))); + reportError(std::move(E), ObjF->getFileName()); for (const auto &Shdr : *Sections) if (Shdr.sh_addr == Addr) @@ -64,13 +64,15 @@ void PrinterContext::printUnwindInformation() const { auto PHs = Obj->program_headers(); if (Error E = PHs.takeError()) - reportError(toString(std::move(E))); + reportError(std::move(E), ObjF->getFileName()); for (const auto &Phdr : *PHs) { if (Phdr.p_type == ELF::PT_GNU_EH_FRAME) { EHFramePhdr = &Phdr; if (Phdr.p_memsz != Phdr.p_filesz) - reportError("p_memsz does not match p_filesz for GNU_EH_FRAME"); + reportError(object::createError( + "p_memsz does not match p_filesz for 
GNU_EH_FRAME"), + ObjF->getFileName()); break; } } @@ -81,12 +83,12 @@ void PrinterContext::printUnwindInformation() const { auto Sections = Obj->sections(); if (Error E = Sections.takeError()) - reportError(toString(std::move(E))); + reportError(std::move(E), ObjF->getFileName()); for (const auto &Shdr : *Sections) { auto SectionName = Obj->getSectionName(&Shdr); if (Error E = SectionName.takeError()) - reportError(toString(std::move(E))); + reportError(std::move(E), ObjF->getFileName()); if (*SectionName == ".eh_frame") printEHFrame(&Shdr); @@ -97,49 +99,52 @@ template void PrinterContext::printEHFrameHdr(uint64_t EHFrameHdrOffset, uint64_t EHFrameHdrAddress, uint64_t EHFrameHdrSize) const { - ListScope L(W, "EH_FRAME Header"); + DictScope L(W, "EHFrameHeader"); W.startLine() << format("Address: 0x%" PRIx64 "\n", EHFrameHdrAddress); W.startLine() << format("Offset: 0x%" PRIx64 "\n", EHFrameHdrOffset); W.startLine() << format("Size: 0x%" PRIx64 "\n", EHFrameHdrSize); const object::ELFFile *Obj = ObjF->getELFFile(); - const auto *EHFrameHdrShdr = findSectionByAddress(Obj, EHFrameHdrAddress); + const auto *EHFrameHdrShdr = findSectionByAddress(ObjF, EHFrameHdrAddress); if (EHFrameHdrShdr) { auto SectionName = Obj->getSectionName(EHFrameHdrShdr); if (Error E = SectionName.takeError()) - reportError(toString(std::move(E))); + reportError(std::move(E), ObjF->getFileName()); W.printString("Corresponding Section", *SectionName); } - DataExtractor DE( - StringRef(reinterpret_cast(Obj->base()) + EHFrameHdrOffset, - EHFrameHdrSize), - ELFT::TargetEndianness == support::endianness::little, - ELFT::Is64Bits ? 8 : 4); + DataExtractor DE(makeArrayRef(Obj->base() + EHFrameHdrOffset, EHFrameHdrSize), + ELFT::TargetEndianness == support::endianness::little, + ELFT::Is64Bits ? 
8 : 4); DictScope D(W, "Header"); - uint32_t Offset = 0; + uint64_t Offset = 0; auto Version = DE.getU8(&Offset); W.printNumber("version", Version); if (Version != 1) - reportError("only version 1 of .eh_frame_hdr is supported"); + reportError( + object::createError("only version 1 of .eh_frame_hdr is supported"), + ObjF->getFileName()); uint64_t EHFramePtrEnc = DE.getU8(&Offset); W.startLine() << format("eh_frame_ptr_enc: 0x%" PRIx64 "\n", EHFramePtrEnc); if (EHFramePtrEnc != (dwarf::DW_EH_PE_pcrel | dwarf::DW_EH_PE_sdata4)) - reportError("unexpected encoding eh_frame_ptr_enc"); + reportError(object::createError("unexpected encoding eh_frame_ptr_enc"), + ObjF->getFileName()); uint64_t FDECountEnc = DE.getU8(&Offset); W.startLine() << format("fde_count_enc: 0x%" PRIx64 "\n", FDECountEnc); if (FDECountEnc != dwarf::DW_EH_PE_udata4) - reportError("unexpected encoding fde_count_enc"); + reportError(object::createError("unexpected encoding fde_count_enc"), + ObjF->getFileName()); uint64_t TableEnc = DE.getU8(&Offset); W.startLine() << format("table_enc: 0x%" PRIx64 "\n", TableEnc); if (TableEnc != (dwarf::DW_EH_PE_datarel | dwarf::DW_EH_PE_sdata4)) - reportError("unexpected encoding table_enc"); + reportError(object::createError("unexpected encoding table_enc"), + ObjF->getFileName()); auto EHFramePtr = DE.getSigned(&Offset, 4) + EHFrameHdrAddress + 4; W.startLine() << format("eh_frame_ptr: 0x%" PRIx64 "\n", EHFramePtr); @@ -158,7 +163,8 @@ void PrinterContext::printEHFrameHdr(uint64_t EHFrameHdrOffset, W.startLine() << format("address: 0x%" PRIx64 "\n", Address); if (InitialPC < PrevPC) - reportError("initial_location is out of order"); + reportError(object::createError("initial_location is out of order"), + ObjF->getFileName()); PrevPC = InitialPC; ++NumEntries; @@ -178,7 +184,7 @@ void PrinterContext::printEHFrame( const object::ELFFile *Obj = ObjF->getELFFile(); auto Result = Obj->getSectionContents(EHFrameShdr); if (Error E = Result.takeError()) - reportError(toString(std::move(E))); + reportError(std::move(E), ObjF->getFileName()); auto Contents = Result.get(); DWARFDataExtractor DE( diff --git a/tools/llvm-readobj/ELFDumper.cpp b/tools/llvm-readobj/ELFDumper.cpp index 4e1cb7d544e7..57144882c4b4 100644 --- a/tools/llvm-readobj/ELFDumper.cpp +++ b/tools/llvm-readobj/ELFDumper.cpp @@ -20,6 +20,7 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/MapVector.h" #include "llvm/ADT/Optional.h" #include "llvm/ADT/PointerIntPair.h" #include "llvm/ADT/STLExtras.h" @@ -36,6 +37,7 @@ #include "llvm/Object/ELFTypes.h" #include "llvm/Object/Error.h" #include "llvm/Object/ObjectFile.h" +#include "llvm/Object/RelocationResolver.h" #include "llvm/Object/StackMapParser.h" #include "llvm/Support/AMDGPUMetadata.h" #include "llvm/Support/ARMAttributeParser.h" @@ -61,6 +63,7 @@ #include #include #include +#include #include using namespace llvm; @@ -119,9 +122,9 @@ template class DumpStyle; /// the size, entity size and virtual address are different entries in arbitrary /// order (DT_REL, DT_RELSZ, DT_RELENT for example). struct DynRegionInfo { - DynRegionInfo() = default; - DynRegionInfo(const void *A, uint64_t S, uint64_t ES) - : Addr(A), Size(S), EntSize(ES) {} + DynRegionInfo(StringRef ObjName) : FileName(ObjName) {} + DynRegionInfo(const void *A, uint64_t S, uint64_t ES, StringRef ObjName) + : Addr(A), Size(S), EntSize(ES), FileName(ObjName) {} /// Address in current address space. 
const void *Addr = nullptr; @@ -130,14 +133,18 @@ struct DynRegionInfo { /// Size of each entity in the region. uint64_t EntSize = 0; + /// Name of the file. Used for error reporting. + StringRef FileName; + template ArrayRef getAsArrayRef() const { const Type *Start = reinterpret_cast(Addr); if (!Start) return {Start, Start}; if (EntSize != sizeof(Type) || Size % EntSize) { // TODO: Add a section index to this warning. - reportWarning("invalid section size (" + Twine(Size) + - ") or entity size (" + Twine(EntSize) + ")"); + reportWarning(createError("invalid section size (" + Twine(Size) + + ") or entity size (" + Twine(EntSize) + ")"), + FileName); return {Start, Start}; } return {Start, Start + (Size / EntSize)}; @@ -166,11 +173,7 @@ public: void printVersionInfo() override; void printGroupSections() override; - void printAttributes() override; - void printMipsPLTGOT() override; - void printMipsABIFlags() override; - void printMipsReginfo() override; - void printMipsOptions() override; + void printArchSpecificInfo() override; void printStackMap() const override; @@ -182,6 +185,7 @@ public: void printNotes() override; void printELFLinkerOptions() override; + void printStackSizes() override; const object::ELFObjectFile *getElfObject() const { return ObjF; }; @@ -195,20 +199,27 @@ private: if (DRI.Addr < Obj->base() || reinterpret_cast(DRI.Addr) + DRI.Size > Obj->base() + Obj->getBufSize()) - error(llvm::object::object_error::parse_failed); + reportError(errorCodeToError(llvm::object::object_error::parse_failed), + ObjF->getFileName()); return DRI; } DynRegionInfo createDRIFrom(const Elf_Phdr *P, uintX_t EntSize) { - return checkDRI( - {ObjF->getELFFile()->base() + P->p_offset, P->p_filesz, EntSize}); + return checkDRI({ObjF->getELFFile()->base() + P->p_offset, P->p_filesz, + EntSize, ObjF->getFileName()}); } DynRegionInfo createDRIFrom(const Elf_Shdr *S) { - return checkDRI( - {ObjF->getELFFile()->base() + S->sh_offset, S->sh_size, S->sh_entsize}); + return checkDRI({ObjF->getELFFile()->base() + S->sh_offset, S->sh_size, + S->sh_entsize, ObjF->getFileName()}); } + void printAttributes(); + void printMipsReginfo(); + void printMipsOptions(); + + std::pair + findDynamic(const ELFFile *Obj); void loadDynamicTable(const ELFFile *Obj); void parseDynamicTable(); @@ -226,7 +237,7 @@ private: DynRegionInfo DynSymRegion; DynRegionInfo DynamicTable; StringRef DynamicStringTable; - StringRef SOName = ""; + std::string SOName = ""; const Elf_Hash *HashTable = nullptr; const Elf_GnuHash *GnuHashTable = nullptr; const Elf_Shdr *DotSymtabSec = nullptr; @@ -291,7 +302,8 @@ public: void getSectionNameIndex(const Elf_Sym *Symbol, const Elf_Sym *FirstSym, StringRef &SectionName, unsigned &SectionIndex) const; - std::string getStaticSymbolName(uint32_t Index) const; + Expected getStaticSymbolName(uint32_t Index) const; + std::string getDynamicString(uint64_t Value) const; StringRef getSymbolVersionByIndex(StringRef StrTab, uint32_t VersionSymbolIndex, bool &IsDefault) const; @@ -328,16 +340,27 @@ void ELFDumper::printSymbolsHelper(bool IsDynamic) const { } else { if (!DotSymtabSec) return; - StrTable = unwrapOrError(Obj->getStringTableForSymtab(*DotSymtabSec)); - Syms = unwrapOrError(Obj->symbols(DotSymtabSec)); - SymtabName = unwrapOrError(Obj->getSectionName(DotSymtabSec)); + StrTable = unwrapOrError(ObjF->getFileName(), + Obj->getStringTableForSymtab(*DotSymtabSec)); + Syms = unwrapOrError(ObjF->getFileName(), Obj->symbols(DotSymtabSec)); + SymtabName = + unwrapOrError(ObjF->getFileName(), 
Obj->getSectionName(DotSymtabSec)); Entries = DotSymtabSec->getEntityCount(); } if (Syms.begin() == Syms.end()) return; - ELFDumperStyle->printSymtabMessage(Obj, SymtabName, Entries); + + // The st_other field has 2 logical parts. The first two bits hold the symbol + // visibility (STV_*) and the remainder hold other platform-specific values. + bool NonVisibilityBitsUsed = llvm::find_if(Syms, [](const Elf_Sym &S) { + return S.st_other & ~0x3; + }) != Syms.end(); + + ELFDumperStyle->printSymtabMessage(Obj, SymtabName, Entries, + NonVisibilityBitsUsed); for (const auto &Sym : Syms) - ELFDumperStyle->printSymbol(Obj, &Sym, Syms.begin(), StrTable, IsDynamic); + ELFDumperStyle->printSymbol(Obj, &Sym, Syms.begin(), StrTable, IsDynamic, + NonVisibilityBitsUsed); } template class MipsGOTParser; @@ -346,8 +369,20 @@ template class DumpStyle { public: using Elf_Shdr = typename ELFT::Shdr; using Elf_Sym = typename ELFT::Sym; + using Elf_Addr = typename ELFT::Addr; + + DumpStyle(ELFDumper *Dumper) : Dumper(Dumper) { + FileName = this->Dumper->getElfObject()->getFileName(); + + // Dumper reports all non-critical errors as warnings. + // It does not print the same warning more than once. + WarningHandler = [this](const Twine &Msg) { + if (Warnings.insert(Msg.str()).second) + reportWarning(createError(Msg), FileName); + return Error::success(); + }; + } - DumpStyle(ELFDumper *Dumper) : Dumper(Dumper) {} virtual ~DumpStyle() = default; virtual void printFileHeaders(const ELFFile *Obj) = 0; @@ -360,10 +395,10 @@ public: virtual void printDynamic(const ELFFile *Obj) {} virtual void printDynamicRelocations(const ELFFile *Obj) = 0; virtual void printSymtabMessage(const ELFFile *Obj, StringRef Name, - size_t Offset) {} + size_t Offset, bool NonVisibilityBitsUsed) {} virtual void printSymbol(const ELFFile *Obj, const Elf_Sym *Symbol, const Elf_Sym *FirstSym, StringRef StrTable, - bool IsDynamic) = 0; + bool IsDynamic, bool NonVisibilityBitsUsed) = 0; virtual void printProgramHeaders(const ELFFile *Obj, bool PrintProgramHeaders, cl::boolOrDefault PrintSectionMapping) = 0; @@ -378,11 +413,31 @@ public: virtual void printAddrsig(const ELFFile *Obj) = 0; virtual void printNotes(const ELFFile *Obj) = 0; virtual void printELFLinkerOptions(const ELFFile *Obj) = 0; + virtual void printStackSizes(const ELFObjectFile *Obj) = 0; + void printNonRelocatableStackSizes(const ELFObjectFile *Obj, + std::function PrintHeader); + void printRelocatableStackSizes(const ELFObjectFile *Obj, + std::function PrintHeader); + void printFunctionStackSize(const ELFObjectFile *Obj, uint64_t SymValue, + SectionRef FunctionSec, + const StringRef SectionName, DataExtractor Data, + uint64_t *Offset); + void printStackSize(const ELFObjectFile *Obj, RelocationRef Rel, + SectionRef FunctionSec, + const StringRef &StackSizeSectionName, + const RelocationResolver &Resolver, DataExtractor Data); + virtual void printStackSizeEntry(uint64_t Size, StringRef FuncName) = 0; virtual void printMipsGOT(const MipsGOTParser &Parser) = 0; virtual void printMipsPLT(const MipsGOTParser &Parser) = 0; + virtual void printMipsABIFlags(const ELFObjectFile *Obj) = 0; const ELFDumper *dumper() const { return Dumper; } +protected: + std::function WarningHandler; + StringRef FileName; + private: + std::unordered_set Warnings; const ELFDumper *Dumper; }; @@ -407,8 +462,8 @@ public: void printHashSymbols(const ELFO *Obj) override; void printDynamic(const ELFFile *Obj) override; void printDynamicRelocations(const ELFO *Obj) override; - void printSymtabMessage(const ELFO 
*Obj, StringRef Name, - size_t Offset) override; + void printSymtabMessage(const ELFO *Obj, StringRef Name, size_t Offset, + bool NonVisibilityBitsUsed) override; void printProgramHeaders(const ELFO *Obj, bool PrintProgramHeaders, cl::boolOrDefault PrintSectionMapping) override; void printVersionSymbolSection(const ELFFile *Obj, @@ -422,8 +477,11 @@ public: void printAddrsig(const ELFFile *Obj) override; void printNotes(const ELFFile *Obj) override; void printELFLinkerOptions(const ELFFile *Obj) override; + void printStackSizes(const ELFObjectFile *Obj) override; + void printStackSizeEntry(uint64_t Size, StringRef FuncName) override; void printMipsGOT(const MipsGOTParser &Parser) override; void printMipsPLT(const MipsGOTParser &Parser) override; + void printMipsABIFlags(const ELFObjectFile *Obj) override; private: struct Field { @@ -484,7 +542,8 @@ private: void printRelocation(const ELFO *Obj, const Elf_Sym *Sym, StringRef SymbolName, const Elf_Rela &R, bool IsRela); void printSymbol(const ELFO *Obj, const Elf_Sym *Symbol, const Elf_Sym *First, - StringRef StrTable, bool IsDynamic) override; + StringRef StrTable, bool IsDynamic, + bool NonVisibilityBitsUsed) override; std::string getSymbolSectionNdx(const ELFO *Obj, const Elf_Sym *Symbol, const Elf_Sym *FirstSym); void printDynamicRelocation(const ELFO *Obj, Elf_Rela R, bool IsRela); @@ -525,8 +584,11 @@ public: void printAddrsig(const ELFFile *Obj) override; void printNotes(const ELFFile *Obj) override; void printELFLinkerOptions(const ELFFile *Obj) override; + void printStackSizes(const ELFObjectFile *Obj) override; + void printStackSizeEntry(uint64_t Size, StringRef FuncName) override; void printMipsGOT(const MipsGOTParser &Parser) override; void printMipsPLT(const MipsGOTParser &Parser) override; + void printMipsABIFlags(const ELFObjectFile *Obj) override; private: void printRelocation(const ELFO *Obj, Elf_Rela Rel, const Elf_Shdr *SymTab); @@ -534,7 +596,8 @@ private: void printSymbols(const ELFO *Obj); void printDynamicSymbols(const ELFO *Obj); void printSymbol(const ELFO *Obj, const Elf_Sym *Symbol, const Elf_Sym *First, - StringRef StrTable, bool IsDynamic) override; + StringRef StrTable, bool IsDynamic, + bool /*NonVisibilityBitsUsed*/) override; void printProgramHeaders(const ELFO *Obj); void printSectionMapping(const ELFO *Obj) {} @@ -680,9 +743,9 @@ StringRef ELFDumper::getSymbolVersion(StringRef StrTab, sizeof(Elf_Sym); // Get the corresponding version index entry. 
- const Elf_Versym *Versym = - unwrapOrError(ObjF->getELFFile()->template getEntry( - SymbolVersionSection, EntryIndex)); + const Elf_Versym *Versym = unwrapOrError( + ObjF->getFileName(), ObjF->getELFFile()->template getEntry( + SymbolVersionSection, EntryIndex)); return this->getSymbolVersionByIndex(StrTab, Versym->vs_index, IsDefault); } @@ -691,15 +754,22 @@ static std::string maybeDemangle(StringRef Name) { } template -std::string ELFDumper::getStaticSymbolName(uint32_t Index) const { +Expected +ELFDumper::getStaticSymbolName(uint32_t Index) const { const ELFFile *Obj = ObjF->getELFFile(); - StringRef StrTable = - unwrapOrError(Obj->getStringTableForSymtab(*DotSymtabSec)); - Elf_Sym_Range Syms = unwrapOrError(Obj->symbols(DotSymtabSec)); - if (Index >= Syms.size()) - reportError("Invalid symbol index"); - const Elf_Sym *Sym = &Syms[Index]; - return maybeDemangle(unwrapOrError(Sym->getName(StrTable))); + Expected SymOrErr = + Obj->getSymbol(DotSymtabSec, Index); + if (!SymOrErr) + return SymOrErr.takeError(); + + Expected StrTabOrErr = Obj->getStringTableForSymtab(*DotSymtabSec); + if (!StrTabOrErr) + return StrTabOrErr.takeError(); + + Expected NameOrErr = (*SymOrErr)->getName(*StrTabOrErr); + if (!NameOrErr) + return NameOrErr.takeError(); + return maybeDemangle(*NameOrErr); } template @@ -717,7 +787,7 @@ StringRef ELFDumper::getSymbolVersionByIndex(StringRef StrTab, // Lookup this symbol in the version table. LoadVersionMap(); if (VersionIndex >= VersionMap.size() || VersionMap[VersionIndex].isNull()) - reportError("Invalid version entry"); + reportError(createError("Invalid version entry"), ObjF->getFileName()); const VersionMapEntry &Entry = VersionMap[VersionIndex]; // Get the version name string. @@ -731,7 +801,7 @@ StringRef ELFDumper::getSymbolVersionByIndex(StringRef StrTab, IsDefault = false; } if (NameOffset >= StrTab.size()) - reportError("Invalid string offset"); + reportError(createError("Invalid string offset"), ObjF->getFileName()); return StrTab.data() + NameOffset; } @@ -739,14 +809,14 @@ template std::string ELFDumper::getFullSymbolName(const Elf_Sym *Symbol, StringRef StrTable, bool IsDynamic) const { - std::string SymbolName = - maybeDemangle(unwrapOrError(Symbol->getName(StrTable))); + std::string SymbolName = maybeDemangle( + unwrapOrError(ObjF->getFileName(), Symbol->getName(StrTable))); if (SymbolName.empty() && Symbol->getType() == ELF::STT_SECTION) { unsigned SectionIndex; StringRef SectionName; - Elf_Sym_Range Syms = - unwrapOrError(ObjF->getELFFile()->symbols(DotSymtabSec)); + Elf_Sym_Range Syms = unwrapOrError( + ObjF->getFileName(), ObjF->getELFFile()->symbols(DotSymtabSec)); getSectionNameIndex(Symbol, Syms.begin(), SectionName, SectionIndex); return SectionName; } @@ -783,31 +853,32 @@ void ELFDumper::getSectionNameIndex(const Elf_Sym *Symbol, SectionName = "Reserved"; else { if (SectionIndex == SHN_XINDEX) - SectionIndex = unwrapOrError(object::getExtendedSymbolTableIndex( - Symbol, FirstSym, ShndxTable)); + SectionIndex = unwrapOrError(ObjF->getFileName(), + object::getExtendedSymbolTableIndex( + Symbol, FirstSym, ShndxTable)); const ELFFile *Obj = ObjF->getELFFile(); const typename ELFT::Shdr *Sec = - unwrapOrError(Obj->getSection(SectionIndex)); - SectionName = unwrapOrError(Obj->getSectionName(Sec)); + unwrapOrError(ObjF->getFileName(), Obj->getSection(SectionIndex)); + SectionName = unwrapOrError(ObjF->getFileName(), Obj->getSectionName(Sec)); } } template static const typename ELFO::Elf_Shdr * -findNotEmptySectionByAddress(const ELFO *Obj, 
uint64_t Addr) { - for (const auto &Shdr : unwrapOrError(Obj->sections())) +findNotEmptySectionByAddress(const ELFO *Obj, StringRef FileName, + uint64_t Addr) { + for (const auto &Shdr : unwrapOrError(FileName, Obj->sections())) if (Shdr.sh_addr == Addr && Shdr.sh_size > 0) return &Shdr; return nullptr; } template -static const typename ELFO::Elf_Shdr *findSectionByName(const ELFO &Obj, - StringRef Name) { - for (const auto &Shdr : unwrapOrError(Obj.sections())) { - if (Name == unwrapOrError(Obj.getSectionName(&Shdr))) +static const typename ELFO::Elf_Shdr * +findSectionByName(const ELFO &Obj, StringRef FileName, StringRef Name) { + for (const auto &Shdr : unwrapOrError(FileName, Obj.sections())) + if (Name == unwrapOrError(FileName, Obj.getSectionName(&Shdr))) return &Shdr; - } return nullptr; } @@ -1356,10 +1427,12 @@ static const char *getElfMipsOptionsOdkType(unsigned Odk) { } template -void ELFDumper::loadDynamicTable(const ELFFile *Obj) { +std::pair +ELFDumper::findDynamic(const ELFFile *Obj) { // Try to locate the PT_DYNAMIC header. const Elf_Phdr *DynamicPhdr = nullptr; - for (const Elf_Phdr &Phdr : unwrapOrError(Obj->program_headers())) { + for (const Elf_Phdr &Phdr : + unwrapOrError(ObjF->getFileName(), Obj->program_headers())) { if (Phdr.p_type != ELF::PT_DYNAMIC) continue; DynamicPhdr = &Phdr; @@ -1368,61 +1441,132 @@ void ELFDumper::loadDynamicTable(const ELFFile *Obj) { // Try to locate the .dynamic section in the sections header table. const Elf_Shdr *DynamicSec = nullptr; - for (const Elf_Shdr &Sec : unwrapOrError(Obj->sections())) { + for (const Elf_Shdr &Sec : + unwrapOrError(ObjF->getFileName(), Obj->sections())) { if (Sec.sh_type != ELF::SHT_DYNAMIC) continue; DynamicSec = &Sec; break; } - // Information in the section header has priority over the information - // in a PT_DYNAMIC header. + if (DynamicPhdr && DynamicPhdr->p_offset + DynamicPhdr->p_filesz > + ObjF->getMemoryBufferRef().getBufferSize()) { + reportWarning( + createError( + "PT_DYNAMIC segment offset + size exceeds the size of the file"), + ObjF->getFileName()); + // Don't use the broken dynamic header. + DynamicPhdr = nullptr; + } + + if (DynamicPhdr && DynamicSec) { + StringRef Name = + unwrapOrError(ObjF->getFileName(), Obj->getSectionName(DynamicSec)); + if (DynamicSec->sh_addr + DynamicSec->sh_size > + DynamicPhdr->p_vaddr + DynamicPhdr->p_memsz || + DynamicSec->sh_addr < DynamicPhdr->p_vaddr) + reportWarning(createError("The SHT_DYNAMIC section '" + Name + + "' is not contained within the " + "PT_DYNAMIC segment"), + ObjF->getFileName()); + + if (DynamicSec->sh_addr != DynamicPhdr->p_vaddr) + reportWarning(createError("The SHT_DYNAMIC section '" + Name + + "' is not at the start of " + "PT_DYNAMIC segment"), + ObjF->getFileName()); + } + + return std::make_pair(DynamicPhdr, DynamicSec); +} + +template +void ELFDumper::loadDynamicTable(const ELFFile *Obj) { + const Elf_Phdr *DynamicPhdr; + const Elf_Shdr *DynamicSec; + std::tie(DynamicPhdr, DynamicSec) = findDynamic(Obj); + if (!DynamicPhdr && !DynamicSec) + return; + + DynRegionInfo FromPhdr(ObjF->getFileName()); + bool IsPhdrTableValid = false; + if (DynamicPhdr) { + FromPhdr = createDRIFrom(DynamicPhdr, sizeof(Elf_Dyn)); + IsPhdrTableValid = !FromPhdr.getAsArrayRef().empty(); + } + + // Locate the dynamic table described in a section header. // Ignore sh_entsize and use the expected value for entry size explicitly. 
- // This allows us to dump the dynamic sections with a broken sh_entsize + // This allows us to dump dynamic sections with a broken sh_entsize // field. + DynRegionInfo FromSec(ObjF->getFileName()); + bool IsSecTableValid = false; if (DynamicSec) { - DynamicTable = checkDRI({ObjF->getELFFile()->base() + DynamicSec->sh_offset, - DynamicSec->sh_size, sizeof(Elf_Dyn)}); - parseDynamicTable(); + FromSec = + checkDRI({ObjF->getELFFile()->base() + DynamicSec->sh_offset, + DynamicSec->sh_size, sizeof(Elf_Dyn), ObjF->getFileName()}); + IsSecTableValid = !FromSec.getAsArrayRef().empty(); } - // If we have a PT_DYNAMIC header, we will either check the found dynamic - // section or take the dynamic table data directly from the header. - if (!DynamicPhdr) + // When we only have information from one of the SHT_DYNAMIC section header or + // PT_DYNAMIC program header, just use that. + if (!DynamicPhdr || !DynamicSec) { + if ((DynamicPhdr && IsPhdrTableValid) || (DynamicSec && IsSecTableValid)) { + DynamicTable = DynamicPhdr ? FromPhdr : FromSec; + parseDynamicTable(); + } else { + reportWarning(createError("no valid dynamic table was found"), + ObjF->getFileName()); + } return; + } - if (DynamicPhdr->p_offset + DynamicPhdr->p_filesz > - ObjF->getMemoryBufferRef().getBufferSize()) - reportError( - "PT_DYNAMIC segment offset + size exceeds the size of the file"); + // At this point we have tables found from the section header and from the + // dynamic segment. Usually they match, but we have to do sanity checks to + // verify that. - if (!DynamicSec) { - DynamicTable = createDRIFrom(DynamicPhdr, sizeof(Elf_Dyn)); - parseDynamicTable(); + if (FromPhdr.Addr != FromSec.Addr) + reportWarning(createError("SHT_DYNAMIC section header and PT_DYNAMIC " + "program header disagree about " + "the location of the dynamic table"), + ObjF->getFileName()); + + if (!IsPhdrTableValid && !IsSecTableValid) { + reportWarning(createError("no valid dynamic table was found"), + ObjF->getFileName()); return; } - StringRef Name = unwrapOrError(Obj->getSectionName(DynamicSec)); - if (DynamicSec->sh_addr + DynamicSec->sh_size > - DynamicPhdr->p_vaddr + DynamicPhdr->p_memsz || - DynamicSec->sh_addr < DynamicPhdr->p_vaddr) - reportWarning("The SHT_DYNAMIC section '" + Name + - "' is not contained within the " - "PT_DYNAMIC segment"); + // Information in the PT_DYNAMIC program header has priority over the information + // in a section header. 
+ if (IsPhdrTableValid) { + if (!IsSecTableValid) + reportWarning( + createError( + "SHT_DYNAMIC dynamic table is invalid: PT_DYNAMIC will be used"), + ObjF->getFileName()); + DynamicTable = FromPhdr; + } else { + reportWarning( + createError( + "PT_DYNAMIC dynamic table is invalid: SHT_DYNAMIC will be used"), + ObjF->getFileName()); + DynamicTable = FromSec; + } - if (DynamicSec->sh_addr != DynamicPhdr->p_vaddr) - reportWarning("The SHT_DYNAMIC section '" + Name + - "' is not at the start of " - "PT_DYNAMIC segment"); + parseDynamicTable(); } template ELFDumper::ELFDumper(const object::ELFObjectFile *ObjF, - ScopedPrinter &Writer) - : ObjDumper(Writer), ObjF(ObjF) { + ScopedPrinter &Writer) + : ObjDumper(Writer), ObjF(ObjF), DynRelRegion(ObjF->getFileName()), + DynRelaRegion(ObjF->getFileName()), DynRelrRegion(ObjF->getFileName()), + DynPLTRelRegion(ObjF->getFileName()), DynSymRegion(ObjF->getFileName()), + DynamicTable(ObjF->getFileName()) { const ELFFile *Obj = ObjF->getELFFile(); - - for (const Elf_Shdr &Sec : unwrapOrError(Obj->sections())) { + for (const Elf_Shdr &Sec : + unwrapOrError(ObjF->getFileName(), Obj->sections())) { switch (Sec.sh_type) { case ELF::SHT_SYMTAB: if (!DotSymtabSec) @@ -1433,16 +1577,17 @@ ELFDumper::ELFDumper(const object::ELFObjectFile *ObjF, DynSymRegion = createDRIFrom(&Sec); // This is only used (if Elf_Shdr present)for naming section in GNU // style - DynSymtabName = unwrapOrError(Obj->getSectionName(&Sec)); + DynSymtabName = + unwrapOrError(ObjF->getFileName(), Obj->getSectionName(&Sec)); if (Expected E = Obj->getStringTableForSymtab(Sec)) DynamicStringTable = *E; else - warn(E.takeError()); + reportWarning(E.takeError(), ObjF->getFileName()); } break; case ELF::SHT_SYMTAB_SHNDX: - ShndxTable = unwrapOrError(Obj->getSHNDXTable(Sec)); + ShndxTable = unwrapOrError(ObjF->getFileName(), Obj->getSHNDXTable(Sec)); break; case ELF::SHT_GNU_versym: if (!SymbolVersionSection) @@ -1547,10 +1692,13 @@ template void ELFDumper::parseDynamicTable() { auto toMappedAddr = [&](uint64_t Tag, uint64_t VAddr) -> const uint8_t * { auto MappedAddrOrError = ObjF->getELFFile()->toMappedAddr(VAddr); if (!MappedAddrOrError) { - reportWarning("Unable to parse DT_" + - Twine(getTypeString( - ObjF->getELFFile()->getHeader()->e_machine, Tag)) + - ": " + llvm::toString(MappedAddrOrError.takeError())); + Error Err = + createError("Unable to parse DT_" + + Twine(getTypeString( + ObjF->getELFFile()->getHeader()->e_machine, Tag)) + + ": " + llvm::toString(MappedAddrOrError.takeError())); + + reportWarning(std::move(Err), ObjF->getFileName()); return nullptr; } return MappedAddrOrError.get(); @@ -1576,10 +1724,29 @@ template void ELFDumper::parseDynamicTable() { case ELF::DT_STRSZ: StringTableSize = Dyn.getVal(); break; - case ELF::DT_SYMTAB: - DynSymRegion.Addr = toMappedAddr(Dyn.getTag(), Dyn.getPtr()); - DynSymRegion.EntSize = sizeof(Elf_Sym); + case ELF::DT_SYMTAB: { + // Often we find the information about the dynamic symbol table + // location in the SHT_DYNSYM section header. However, the value in + // DT_SYMTAB has priority, because it is used by dynamic loaders to + // locate .dynsym at runtime. The location we find in the section header + // and the location we find here should match. If we can't map the + // DT_SYMTAB value to an address (e.g. when there are no program headers), we + // ignore its value. + if (const uint8_t *VA = toMappedAddr(Dyn.getTag(), Dyn.getPtr())) { + // EntSize is non-zero if the dynamic symbol table has been found via a + // section header. 
+ if (DynSymRegion.EntSize && VA != DynSymRegion.Addr) + reportWarning( + createError( + "SHT_DYNSYM section header and DT_SYMTAB disagree about " + "the location of the dynamic symbol table"), + ObjF->getFileName()); + + DynSymRegion.Addr = VA; + DynSymRegion.EntSize = sizeof(Elf_Sym); + } break; + } case ELF::DT_RELA: DynRelaRegion.Addr = toMappedAddr(Dyn.getTag(), Dyn.getPtr()); break; @@ -1619,8 +1786,9 @@ template void ELFDumper::parseDynamicTable() { else if (Dyn.getVal() == DT_RELA) DynPLTRelRegion.EntSize = sizeof(Elf_Rela); else - reportError(Twine("unknown DT_PLTREL value of ") + - Twine((uint64_t)Dyn.getVal())); + reportError(createError(Twine("unknown DT_PLTREL value of ") + + Twine((uint64_t)Dyn.getVal())), + ObjF->getFileName()); break; case ELF::DT_JMPREL: DynPLTRelRegion.Addr = toMappedAddr(Dyn.getTag(), Dyn.getPtr()); @@ -1632,8 +1800,7 @@ template void ELFDumper::parseDynamicTable() { } if (StringTableBegin) DynamicStringTable = StringRef(StringTableBegin, StringTableSize); - if (SONameOffset && SONameOffset < DynamicStringTable.size()) - SOName = DynamicStringTable.data() + SONameOffset; + SOName = getDynamicString(SONameOffset); } template @@ -1715,6 +1882,10 @@ template void ELFDumper::printELFLinkerOptions() { ELFDumperStyle->printELFLinkerOptions(ObjF->getELFFile()); } +template void ELFDumper::printStackSizes() { + ELFDumperStyle->printStackSizes(ObjF); +} + #define LLVM_READOBJ_DT_FLAG_ENT(prefix, enum) \ { #enum, prefix##_##enum } @@ -1953,13 +2124,7 @@ void ELFDumper::printDynamicEntry(raw_ostream &OS, uint64_t Type, {DT_RPATH, "Library rpath"}, {DT_RUNPATH, "Library runpath"}, }; - OS << TagNames.at(Type) << ": "; - if (DynamicStringTable.empty()) - OS << " "; - else if (Value < DynamicStringTable.size()) - OS << "[" << StringRef(DynamicStringTable.data() + Value) << "]"; - else - OS << ""; + OS << TagNames.at(Type) << ": [" << getDynamicString(Value) << "]"; break; } case DT_FLAGS: @@ -1974,6 +2139,15 @@ void ELFDumper::printDynamicEntry(raw_ostream &OS, uint64_t Type, } } +template +std::string ELFDumper::getDynamicString(uint64_t Value) const { + if (DynamicStringTable.empty()) + return ""; + if (Value < DynamicStringTable.size()) + return DynamicStringTable.data() + Value; + return Twine("").str(); +} + template void ELFDumper::printUnwindInfo() { DwarfCFIEH::PrinterContext Ctx(W, ObjF); Ctx.printUnwindInformation(); @@ -1985,7 +2159,8 @@ template <> void ELFDumper::printUnwindInfo() { const ELFFile *Obj = ObjF->getELFFile(); const unsigned Machine = Obj->getHeader()->e_machine; if (Machine == EM_ARM) { - ARM::EHABI::PrinterContext Ctx(W, Obj, DotSymtabSec); + ARM::EHABI::PrinterContext Ctx(W, Obj, ObjF->getFileName(), + DotSymtabSec); Ctx.PrintUnwindInformation(); } DwarfCFIEH::PrinterContext Ctx(W, ObjF); @@ -2001,17 +2176,10 @@ template void ELFDumper::printDynamicTable() { template void ELFDumper::printNeededLibraries() { ListScope D(W, "NeededLibraries"); - using LibsTy = std::vector; - LibsTy Libs; - + std::vector Libs; for (const auto &Entry : dynamic_table()) - if (Entry.d_tag == ELF::DT_NEEDED) { - uint64_t Value = Entry.d_un.d_val; - if (Value < DynamicStringTable.size()) - Libs.push_back(StringRef(DynamicStringTable.data() + Value)); - else - Libs.push_back(""); - } + if (Entry.d_tag == ELF::DT_NEEDED) + Libs.push_back(getDynamicString(Entry.d_un.d_val)); llvm::stable_sort(Libs); @@ -2042,7 +2210,7 @@ template void ELFDumper::printGnuHashTable() { Elf_Sym_Range Syms = dynamic_symbols(); unsigned NumSyms = std::distance(Syms.begin(), Syms.end()); 
if (!NumSyms) - reportError("No dynamic symbol section"); + reportError(createError("No dynamic symbol section"), ObjF->getFileName()); W.printHexList("Values", GnuHashTable->values(NumSyms)); } @@ -2050,6 +2218,30 @@ template void ELFDumper::printLoadName() { W.printString("LoadName", SOName); } +template void ELFDumper::printArchSpecificInfo() { + const ELFFile *Obj = ObjF->getELFFile(); + switch (Obj->getHeader()->e_machine) { + case EM_ARM: + printAttributes(); + break; + case EM_MIPS: { + ELFDumperStyle->printMipsABIFlags(ObjF); + printMipsOptions(); + printMipsReginfo(); + + MipsGOTParser Parser(Obj, ObjF->getFileName(), dynamic_table(), + dynamic_symbols()); + if (Parser.hasGot()) + ELFDumperStyle->printMipsGOT(Parser); + if (Parser.hasPlt()) + ELFDumperStyle->printMipsPLT(Parser); + break; + } + default: + break; + } +} + template void ELFDumper::printAttributes() { W.startLine() << "Attributes not implemented.\n"; } @@ -2064,11 +2256,13 @@ template <> void ELFDumper::printAttributes() { } DictScope BA(W, "BuildAttributes"); - for (const ELFO::Elf_Shdr &Sec : unwrapOrError(Obj->sections())) { + for (const ELFO::Elf_Shdr &Sec : + unwrapOrError(ObjF->getFileName(), Obj->sections())) { if (Sec.sh_type != ELF::SHT_ARM_ATTRIBUTES) continue; - ArrayRef Contents = unwrapOrError(Obj->getSectionContents(&Sec)); + ArrayRef Contents = + unwrapOrError(ObjF->getFileName(), Obj->getSectionContents(&Sec)); if (Contents[0] != ARMBuildAttrs::Format_Version) { errs() << "unrecognised FormatVersion: 0x" << Twine::utohexstr(Contents[0]) << '\n'; @@ -2092,7 +2286,8 @@ public: const bool IsStatic; const ELFO * const Obj; - MipsGOTParser(const ELFO *Obj, Elf_Dyn_Range DynTable, Elf_Sym_Range DynSyms); + MipsGOTParser(const ELFO *Obj, StringRef FileName, Elf_Dyn_Range DynTable, + Elf_Sym_Range DynSyms); bool hasGot() const { return !GotEntries.empty(); } bool hasPlt() const { return !PltEntries.empty(); } @@ -2126,6 +2321,8 @@ private: const Elf_Shdr *PltSec; const Elf_Shdr *PltRelSec; const Elf_Shdr *PltSymTable; + StringRef FileName; + Elf_Sym_Range GotDynSyms; StringRef PltStrTable; @@ -2136,21 +2333,24 @@ private: } // end anonymous namespace template -MipsGOTParser::MipsGOTParser(const ELFO *Obj, Elf_Dyn_Range DynTable, +MipsGOTParser::MipsGOTParser(const ELFO *Obj, StringRef FileName, + Elf_Dyn_Range DynTable, Elf_Sym_Range DynSyms) : IsStatic(DynTable.empty()), Obj(Obj), GotSec(nullptr), LocalNum(0), - GlobalNum(0), PltSec(nullptr), PltRelSec(nullptr), PltSymTable(nullptr) { + GlobalNum(0), PltSec(nullptr), PltRelSec(nullptr), PltSymTable(nullptr), + FileName(FileName) { // See "Global Offset Table" in Chapter 5 in the following document // for detailed GOT description. // ftp://www.linux-mips.org/pub/linux/mips/doc/ABI/mipsabi.pdf // Find static GOT secton. 
if (IsStatic) { - GotSec = findSectionByName(*Obj, ".got"); + GotSec = findSectionByName(*Obj, FileName, ".got"); if (!GotSec) - reportError("Cannot find .got section"); + return; - ArrayRef Content = unwrapOrError(Obj->getSectionContents(GotSec)); + ArrayRef Content = + unwrapOrError(FileName, Obj->getSectionContents(GotSec)); GotEntries = Entries(reinterpret_cast(Content.data()), Content.size() / sizeof(Entry)); LocalNum = GotEntries.size(); @@ -2194,17 +2394,21 @@ MipsGOTParser::MipsGOTParser(const ELFO *Obj, Elf_Dyn_Range DynTable, size_t DynSymTotal = DynSyms.size(); if (*DtGotSym > DynSymTotal) - reportError("MIPS_GOTSYM exceeds a number of dynamic symbols"); + reportError( + createError("MIPS_GOTSYM exceeds a number of dynamic symbols"), + FileName); - GotSec = findNotEmptySectionByAddress(Obj, *DtPltGot); + GotSec = findNotEmptySectionByAddress(Obj, FileName, *DtPltGot); if (!GotSec) - reportError("There is no not empty GOT section at 0x" + - Twine::utohexstr(*DtPltGot)); + reportError(createError("There is no not empty GOT section at 0x" + + Twine::utohexstr(*DtPltGot)), + FileName); LocalNum = *DtLocalGotNum; GlobalNum = DynSymTotal - *DtGotSym; - ArrayRef Content = unwrapOrError(Obj->getSectionContents(GotSec)); + ArrayRef Content = + unwrapOrError(FileName, Obj->getSectionContents(GotSec)); GotEntries = Entries(reinterpret_cast(Content.data()), Content.size() / sizeof(Entry)); GotDynSyms = DynSyms.drop_front(*DtGotSym); @@ -2217,23 +2421,24 @@ MipsGOTParser::MipsGOTParser(const ELFO *Obj, Elf_Dyn_Range DynTable, if (!DtJmpRel) report_fatal_error("Cannot find JMPREL dynamic table tag."); - PltSec = findNotEmptySectionByAddress(Obj, *DtMipsPltGot); + PltSec = findNotEmptySectionByAddress(Obj, FileName, * DtMipsPltGot); if (!PltSec) report_fatal_error("There is no not empty PLTGOT section at 0x " + Twine::utohexstr(*DtMipsPltGot)); - PltRelSec = findNotEmptySectionByAddress(Obj, *DtJmpRel); + PltRelSec = findNotEmptySectionByAddress(Obj, FileName, * DtJmpRel); if (!PltRelSec) report_fatal_error("There is no not empty RELPLT section at 0x" + Twine::utohexstr(*DtJmpRel)); ArrayRef PltContent = - unwrapOrError(Obj->getSectionContents(PltSec)); + unwrapOrError(FileName, Obj->getSectionContents(PltSec)); PltEntries = Entries(reinterpret_cast(PltContent.data()), PltContent.size() / sizeof(Entry)); - PltSymTable = unwrapOrError(Obj->getSection(PltRelSec->sh_link)); - PltStrTable = unwrapOrError(Obj->getStringTableForSymtab(*PltSymTable)); + PltSymTable = unwrapOrError(FileName, Obj->getSection(PltRelSec->sh_link)); + PltStrTable = + unwrapOrError(FileName, Obj->getStringTableForSymtab(*PltSymTable)); } } @@ -2334,26 +2539,16 @@ const typename MipsGOTParser::Elf_Sym * MipsGOTParser::getPltSym(const Entry *E) const { int64_t Offset = std::distance(getPltEntries().data(), E); if (PltRelSec->sh_type == ELF::SHT_REL) { - Elf_Rel_Range Rels = unwrapOrError(Obj->rels(PltRelSec)); - return unwrapOrError(Obj->getRelocationSymbol(&Rels[Offset], PltSymTable)); + Elf_Rel_Range Rels = unwrapOrError(FileName, Obj->rels(PltRelSec)); + return unwrapOrError(FileName, + Obj->getRelocationSymbol(&Rels[Offset], PltSymTable)); } else { - Elf_Rela_Range Rels = unwrapOrError(Obj->relas(PltRelSec)); - return unwrapOrError(Obj->getRelocationSymbol(&Rels[Offset], PltSymTable)); + Elf_Rela_Range Rels = unwrapOrError(FileName, Obj->relas(PltRelSec)); + return unwrapOrError(FileName, + Obj->getRelocationSymbol(&Rels[Offset], PltSymTable)); } } -template void ELFDumper::printMipsPLTGOT() { - const ELFFile *Obj = 
ObjF->getELFFile(); - if (Obj->getHeader()->e_machine != EM_MIPS) - reportError("MIPS PLT GOT is available for MIPS targets only"); - - MipsGOTParser Parser(Obj, dynamic_table(), dynamic_symbols()); - if (Parser.hasGot()) - ELFDumperStyle->printMipsGOT(Parser); - if (Parser.hasPlt()) - ELFDumperStyle->printMipsPLT(Parser); -} - static const EnumEntry ElfMipsISAExtType[] = { {"None", Mips::AFL_EXT_NONE}, {"Broadcom SB-1", Mips::AFL_EXT_SB1}, @@ -2427,41 +2622,6 @@ static int getMipsRegisterSize(uint8_t Flag) { } } -template void ELFDumper::printMipsABIFlags() { - const ELFFile *Obj = ObjF->getELFFile(); - const Elf_Shdr *Shdr = findSectionByName(*Obj, ".MIPS.abiflags"); - if (!Shdr) { - W.startLine() << "There is no .MIPS.abiflags section in the file.\n"; - return; - } - ArrayRef Sec = unwrapOrError(Obj->getSectionContents(Shdr)); - if (Sec.size() != sizeof(Elf_Mips_ABIFlags)) { - W.startLine() << "The .MIPS.abiflags section has a wrong size.\n"; - return; - } - - auto *Flags = reinterpret_cast *>(Sec.data()); - - raw_ostream &OS = W.getOStream(); - DictScope GS(W, "MIPS ABI Flags"); - - W.printNumber("Version", Flags->version); - W.startLine() << "ISA: "; - if (Flags->isa_rev <= 1) - OS << format("MIPS%u", Flags->isa_level); - else - OS << format("MIPS%ur%u", Flags->isa_level, Flags->isa_rev); - OS << "\n"; - W.printEnum("ISA Extension", Flags->isa_ext, makeArrayRef(ElfMipsISAExtType)); - W.printFlags("ASEs", Flags->ases, makeArrayRef(ElfMipsASEFlags)); - W.printEnum("FP ABI", Flags->fp_abi, makeArrayRef(ElfMipsFpABIType)); - W.printNumber("GPR size", getMipsRegisterSize(Flags->gpr_size)); - W.printNumber("CPR1 size", getMipsRegisterSize(Flags->cpr1_size)); - W.printNumber("CPR2 size", getMipsRegisterSize(Flags->cpr2_size)); - W.printFlags("Flags 1", Flags->flags1, makeArrayRef(ElfMipsFlags1)); - W.printHex("Flags 2", Flags->flags2); -} - template static void printMipsReginfoData(ScopedPrinter &W, const Elf_Mips_RegInfo &Reginfo) { @@ -2475,12 +2635,13 @@ static void printMipsReginfoData(ScopedPrinter &W, template void ELFDumper::printMipsReginfo() { const ELFFile *Obj = ObjF->getELFFile(); - const Elf_Shdr *Shdr = findSectionByName(*Obj, ".reginfo"); + const Elf_Shdr *Shdr = findSectionByName(*Obj, ObjF->getFileName(), ".reginfo"); if (!Shdr) { W.startLine() << "There is no .reginfo section in the file.\n"; return; } - ArrayRef Sec = unwrapOrError(Obj->getSectionContents(Shdr)); + ArrayRef Sec = + unwrapOrError(ObjF->getFileName(), Obj->getSectionContents(Shdr)); if (Sec.size() != sizeof(Elf_Mips_RegInfo)) { W.startLine() << "The .reginfo section has a wrong size.\n"; return; @@ -2493,7 +2654,8 @@ template void ELFDumper::printMipsReginfo() { template void ELFDumper::printMipsOptions() { const ELFFile *Obj = ObjF->getELFFile(); - const Elf_Shdr *Shdr = findSectionByName(*Obj, ".MIPS.options"); + const Elf_Shdr *Shdr = + findSectionByName(*Obj, ObjF->getFileName(), ".MIPS.options"); if (!Shdr) { W.startLine() << "There is no .MIPS.options section in the file.\n"; return; @@ -2501,7 +2663,8 @@ template void ELFDumper::printMipsOptions() { DictScope GS(W, "MIPS Options"); - ArrayRef Sec = unwrapOrError(Obj->getSectionContents(Shdr)); + ArrayRef Sec = + unwrapOrError(ObjF->getFileName(), Obj->getSectionContents(Shdr)); while (!Sec.empty()) { if (Sec.size() < sizeof(Elf_Mips_Options)) { W.startLine() << "The .MIPS.options section has a wrong size.\n"; @@ -2524,8 +2687,9 @@ template void ELFDumper::printMipsOptions() { template void ELFDumper::printStackMap() const { const ELFFile *Obj = 
ObjF->getELFFile(); const Elf_Shdr *StackMapSection = nullptr; - for (const auto &Sec : unwrapOrError(Obj->sections())) { - StringRef Name = unwrapOrError(Obj->getSectionName(&Sec)); + for (const auto &Sec : unwrapOrError(ObjF->getFileName(), Obj->sections())) { + StringRef Name = + unwrapOrError(ObjF->getFileName(), Obj->getSectionName(&Sec)); if (Name == ".llvm_stackmaps") { StackMapSection = &Sec; break; @@ -2535,8 +2699,8 @@ template void ELFDumper::printStackMap() const { if (!StackMapSection) return; - ArrayRef StackMapContentsArray = - unwrapOrError(Obj->getSectionContents(StackMapSection)); + ArrayRef StackMapContentsArray = unwrapOrError( + ObjF->getFileName(), Obj->getSectionContents(StackMapSection)); prettyPrintStackMap( W, StackMapParser(StackMapContentsArray)); @@ -2560,24 +2724,26 @@ static inline void printFields(formatted_raw_ostream &OS, StringRef Str1, } template -static std::string getSectionHeadersNumString(const ELFFile *Obj) { +static std::string getSectionHeadersNumString(const ELFFile *Obj, + StringRef FileName) { const typename ELFT::Ehdr *ElfHeader = Obj->getHeader(); if (ElfHeader->e_shnum != 0) return to_string(ElfHeader->e_shnum); - ArrayRef Arr = unwrapOrError(Obj->sections()); + ArrayRef Arr = unwrapOrError(FileName, Obj->sections()); if (Arr.empty()) return "0"; return "0 (" + to_string(Arr[0].sh_size) + ")"; } template -static std::string getSectionHeaderTableIndexString(const ELFFile *Obj) { +static std::string getSectionHeaderTableIndexString(const ELFFile *Obj, + StringRef FileName) { const typename ELFT::Ehdr *ElfHeader = Obj->getHeader(); if (ElfHeader->e_shstrndx != SHN_XINDEX) return to_string(ElfHeader->e_shstrndx); - ArrayRef Arr = unwrapOrError(Obj->sections()); + ArrayRef Arr = unwrapOrError(FileName, Obj->sections()); if (Arr.empty()) return "65535 (corrupt: out of range)"; return to_string(ElfHeader->e_shstrndx) + " (" + to_string(Arr[0].sh_link) + @@ -2639,9 +2805,9 @@ template void GNUStyle::printFileHeaders(const ELFO *Obj) { printFields(OS, "Number of program headers:", Str); Str = to_string(e->e_shentsize) + " (bytes)"; printFields(OS, "Size of section headers:", Str); - Str = getSectionHeadersNumString(Obj); + Str = getSectionHeadersNumString(Obj, this->FileName); printFields(OS, "Number of section headers:", Str); - Str = getSectionHeaderTableIndexString(Obj); + Str = getSectionHeaderTableIndexString(Obj, this->FileName); printFields(OS, "Section header string table index:", Str); } @@ -2663,26 +2829,29 @@ struct GroupSection { }; template -std::vector getGroups(const ELFFile *Obj) { +std::vector getGroups(const ELFFile *Obj, + StringRef FileName) { using Elf_Shdr = typename ELFT::Shdr; using Elf_Sym = typename ELFT::Sym; using Elf_Word = typename ELFT::Word; std::vector Ret; uint64_t I = 0; - for (const Elf_Shdr &Sec : unwrapOrError(Obj->sections())) { + for (const Elf_Shdr &Sec : unwrapOrError(FileName, Obj->sections())) { ++I; if (Sec.sh_type != ELF::SHT_GROUP) continue; - const Elf_Shdr *Symtab = unwrapOrError(Obj->getSection(Sec.sh_link)); - StringRef StrTable = unwrapOrError(Obj->getStringTableForSymtab(*Symtab)); - const Elf_Sym *Sym = - unwrapOrError(Obj->template getEntry(Symtab, Sec.sh_info)); - auto Data = - unwrapOrError(Obj->template getSectionContentsAsArray(&Sec)); + const Elf_Shdr *Symtab = + unwrapOrError(FileName, Obj->getSection(Sec.sh_link)); + StringRef StrTable = + unwrapOrError(FileName, Obj->getStringTableForSymtab(*Symtab)); + const Elf_Sym *Sym = unwrapOrError( + FileName, Obj->template getEntry(Symtab, 
Sec.sh_info)); + auto Data = unwrapOrError( + FileName, Obj->template getSectionContentsAsArray(&Sec)); - StringRef Name = unwrapOrError(Obj->getSectionName(&Sec)); + StringRef Name = unwrapOrError(FileName, Obj->getSectionName(&Sec)); StringRef Signature = StrTable.data() + Sym->st_name; Ret.push_back({Name, maybeDemangle(Signature), @@ -2695,8 +2864,8 @@ std::vector getGroups(const ELFFile *Obj) { std::vector &GM = Ret.back().Members; for (uint32_t Ndx : Data.slice(1)) { - auto Sec = unwrapOrError(Obj->getSection(Ndx)); - const StringRef Name = unwrapOrError(Obj->getSectionName(Sec)); + auto Sec = unwrapOrError(FileName, Obj->getSection(Ndx)); + const StringRef Name = unwrapOrError(FileName, Obj->getSectionName(Sec)); GM.push_back({Name, Ndx}); } } @@ -2715,7 +2884,7 @@ mapSectionsToGroups(ArrayRef Groups) { } // namespace template void GNUStyle::printGroupSections(const ELFO *Obj) { - std::vector V = getGroups(Obj); + std::vector V = getGroups(Obj, this->FileName); DenseMap Map = mapSectionsToGroups(V); for (const GroupSection &G : V) { OS << "\n" @@ -2745,14 +2914,17 @@ template void GNUStyle::printGroupSections(const ELFO *Obj) { template void GNUStyle::printRelocation(const ELFO *Obj, const Elf_Shdr *SymTab, const Elf_Rela &R, bool IsRela) { - const Elf_Sym *Sym = unwrapOrError(Obj->getRelocationSymbol(&R, SymTab)); + const Elf_Sym *Sym = + unwrapOrError(this->FileName, Obj->getRelocationSymbol(&R, SymTab)); std::string TargetName; if (Sym && Sym->getType() == ELF::STT_SECTION) { const Elf_Shdr *Sec = unwrapOrError( + this->FileName, Obj->getSection(Sym, SymTab, this->dumper()->getShndxTable())); - TargetName = unwrapOrError(Obj->getSectionName(Sec)); + TargetName = unwrapOrError(this->FileName, Obj->getSectionName(Sec)); } else if (Sym) { - StringRef StrTable = unwrapOrError(Obj->getStringTableForSymtab(*SymTab)); + StringRef StrTable = + unwrapOrError(this->FileName, Obj->getStringTableForSymtab(*SymTab)); TargetName = this->dumper()->getFullSymbolName( Sym, StrTable, SymTab->sh_type == SHT_DYNSYM /* IsDynamic */); } @@ -2821,21 +2993,21 @@ template void GNUStyle::printRelocHeader(unsigned SType) { template void GNUStyle::printRelocations(const ELFO *Obj) { bool HasRelocSections = false; - for (const Elf_Shdr &Sec : unwrapOrError(Obj->sections())) { + for (const Elf_Shdr &Sec : unwrapOrError(this->FileName, Obj->sections())) { if (Sec.sh_type != ELF::SHT_REL && Sec.sh_type != ELF::SHT_RELA && Sec.sh_type != ELF::SHT_RELR && Sec.sh_type != ELF::SHT_ANDROID_REL && Sec.sh_type != ELF::SHT_ANDROID_RELA && Sec.sh_type != ELF::SHT_ANDROID_RELR) continue; HasRelocSections = true; - StringRef Name = unwrapOrError(Obj->getSectionName(&Sec)); + StringRef Name = unwrapOrError(this->FileName, Obj->getSectionName(&Sec)); unsigned Entries = Sec.getEntityCount(); std::vector AndroidRelas; if (Sec.sh_type == ELF::SHT_ANDROID_REL || Sec.sh_type == ELF::SHT_ANDROID_RELA) { // Android's packed relocation section needs to be unpacked first // to get the actual number of entries. - AndroidRelas = unwrapOrError(Obj->android_relas(&Sec)); + AndroidRelas = unwrapOrError(this->FileName, Obj->android_relas(&Sec)); Entries = AndroidRelas.size(); } std::vector RelrRelas; @@ -2843,8 +3015,8 @@ template void GNUStyle::printRelocations(const ELFO *Obj) { Sec.sh_type == ELF::SHT_ANDROID_RELR)) { // .relr.dyn relative relocation section needs to be unpacked first // to get the actual number of entries. 
- Elf_Relr_Range Relrs = unwrapOrError(Obj->relrs(&Sec)); - RelrRelas = unwrapOrError(Obj->decode_relrs(Relrs)); + Elf_Relr_Range Relrs = unwrapOrError(this->FileName, Obj->relrs(&Sec)); + RelrRelas = unwrapOrError(this->FileName, Obj->decode_relrs(Relrs)); Entries = RelrRelas.size(); } uintX_t Offset = Sec.sh_offset; @@ -2852,10 +3024,11 @@ template void GNUStyle::printRelocations(const ELFO *Obj) { << to_hexString(Offset, false) << " contains " << Entries << " entries:\n"; printRelocHeader(Sec.sh_type); - const Elf_Shdr *SymTab = unwrapOrError(Obj->getSection(Sec.sh_link)); + const Elf_Shdr *SymTab = + unwrapOrError(this->FileName, Obj->getSection(Sec.sh_link)); switch (Sec.sh_type) { case ELF::SHT_REL: - for (const auto &R : unwrapOrError(Obj->rels(&Sec))) { + for (const auto &R : unwrapOrError(this->FileName, Obj->rels(&Sec))) { Elf_Rela Rela; Rela.r_offset = R.r_offset; Rela.r_info = R.r_info; @@ -2864,13 +3037,13 @@ template void GNUStyle::printRelocations(const ELFO *Obj) { } break; case ELF::SHT_RELA: - for (const auto &R : unwrapOrError(Obj->relas(&Sec))) + for (const auto &R : unwrapOrError(this->FileName, Obj->relas(&Sec))) printRelocation(Obj, SymTab, R, true); break; case ELF::SHT_RELR: case ELF::SHT_ANDROID_RELR: if (opts::RawRelr) - for (const auto &R : unwrapOrError(Obj->relrs(&Sec))) + for (const auto &R : unwrapOrError(this->FileName, Obj->relrs(&Sec))) OS << to_string(format_hex_no_prefix(R, ELFT::Is64Bits ? 16 : 8)) << "\n"; else @@ -2992,6 +3165,12 @@ static std::string getSectionTypeString(unsigned Arch, unsigned Type) { return "LLVM_ADDRSIG"; case SHT_LLVM_DEPENDENT_LIBRARIES: return "LLVM_DEPENDENT_LIBRARIES"; + case SHT_LLVM_SYMPART: + return "LLVM_SYMPART"; + case SHT_LLVM_PART_EHDR: + return "LLVM_PART_EHDR"; + case SHT_LLVM_PART_PHDR: + return "LLVM_PART_PHDR"; // FIXME: Parse processor specific GNU attributes case SHT_GNU_ATTRIBUTES: return "ATTRIBUTES"; @@ -3009,30 +3188,10 @@ static std::string getSectionTypeString(unsigned Arch, unsigned Type) { return ""; } -template -static StringRef getSectionName(const typename ELFT::Shdr &Sec, - const ELFObjectFile &ElfObj, - ArrayRef Sections) { - const ELFFile &Obj = *ElfObj.getELFFile(); - uint32_t Index = Obj.getHeader()->e_shstrndx; - if (Index == ELF::SHN_XINDEX) - Index = Sections[0].sh_link; - if (!Index) // no section string table. - return ""; - // TODO: Test a case when the sh_link of the section with index 0 is broken. - if (Index >= Sections.size()) - reportError(ElfObj.getFileName(), - createError("section header string table index " + - Twine(Index) + " does not exist")); - StringRef Data = toStringRef(unwrapOrError( - Obj.template getSectionContentsAsArray(&Sections[Index]))); - return unwrapOrError(Obj.getSectionName(&Sec, Data)); -} - template void GNUStyle::printSectionHeaders(const ELFO *Obj) { unsigned Bias = ELFT::Is64Bits ? 
0 : 8; - ArrayRef Sections = unwrapOrError(Obj->sections()); + ArrayRef Sections = unwrapOrError(this->FileName, Obj->sections()); OS << "There are " << to_string(Sections.size()) << " section headers, starting at offset " << "0x" << to_hexString(Obj->getHeader()->e_shoff, false) << ":\n\n"; @@ -3050,7 +3209,8 @@ void GNUStyle::printSectionHeaders(const ELFO *Obj) { size_t SectionIndex = 0; for (const Elf_Shdr &Sec : Sections) { Fields[0].Str = to_string(SectionIndex); - Fields[1].Str = getSectionName(Sec, *ElfObj, Sections); + Fields[1].Str = unwrapOrError( + ElfObj->getFileName(), Obj->getSectionName(&Sec, this->WarningHandler)); Fields[2].Str = getSectionTypeString(Obj->getHeader()->e_machine, Sec.sh_type); Fields[3].Str = @@ -3089,7 +3249,8 @@ void GNUStyle::printSectionHeaders(const ELFO *Obj) { template void GNUStyle::printSymtabMessage(const ELFO *Obj, StringRef Name, - size_t Entries) { + size_t Entries, + bool NonVisibilityBitsUsed) { if (!Name.empty()) OS << "\nSymbol table '" << Name << "' contains " << Entries << " entries:\n"; @@ -3097,9 +3258,13 @@ void GNUStyle::printSymtabMessage(const ELFO *Obj, StringRef Name, OS << "\n Symbol table for image:\n"; if (ELFT::Is64Bits) - OS << " Num: Value Size Type Bind Vis Ndx Name\n"; + OS << " Num: Value Size Type Bind Vis"; else - OS << " Num: Value Size Type Bind Vis Ndx Name\n"; + OS << " Num: Value Size Type Bind Vis"; + + if (NonVisibilityBitsUsed) + OS << " "; + OS << " Ndx Name\n"; } template @@ -3115,10 +3280,11 @@ std::string GNUStyle::getSymbolSectionNdx(const ELFO *Obj, case ELF::SHN_COMMON: return "COM"; case ELF::SHN_XINDEX: - return to_string( - format_decimal(unwrapOrError(object::getExtendedSymbolTableIndex( - Symbol, FirstSym, this->dumper()->getShndxTable())), - 3)); + return to_string(format_decimal( + unwrapOrError(this->FileName, + object::getExtendedSymbolTableIndex( + Symbol, FirstSym, this->dumper()->getShndxTable())), + 3)); default: // Find if: // Processor specific @@ -3142,7 +3308,7 @@ std::string GNUStyle::getSymbolSectionNdx(const ELFO *Obj, template void GNUStyle::printSymbol(const ELFO *Obj, const Elf_Sym *Symbol, const Elf_Sym *FirstSym, StringRef StrTable, - bool IsDynamic) { + bool IsDynamic, bool NonVisibilityBitsUsed) { static int Idx = 0; static bool Dynamic = true; @@ -3156,7 +3322,7 @@ void GNUStyle::printSymbol(const ELFO *Obj, const Elf_Sym *Symbol, unsigned Bias = ELFT::Is64Bits ? 8 : 0; Field Fields[8] = {0, 8, 17 + Bias, 23 + Bias, - 31 + Bias, 38 + Bias, 47 + Bias, 51 + Bias}; + 31 + Bias, 38 + Bias, 48 + Bias, 51 + Bias}; Fields[0].Str = to_string(format_decimal(Idx++, 6)) + ":"; Fields[1].Str = to_string( format_hex_no_prefix(Symbol->st_value, ELFT::Is64Bits ? 16 : 8)); @@ -3173,7 +3339,13 @@ void GNUStyle::printSymbol(const ELFO *Obj, const Elf_Sym *Symbol, printEnum(Symbol->getBinding(), makeArrayRef(ElfSymbolBindings)); Fields[5].Str = printEnum(Symbol->getVisibility(), makeArrayRef(ElfSymbolVisibilities)); + if (Symbol->st_other & ~0x3) + Fields[5].Str += + " [st_other, 2)) + ">]"; + + Fields[6].Column += NonVisibilityBitsUsed ? 13 : 0; Fields[6].Str = getSymbolSectionNdx(Obj, Symbol, FirstSym); + Fields[7].Str = this->dumper()->getFullSymbolName(Symbol, StrTable, IsDynamic); for (auto &Entry : Fields) @@ -3193,7 +3365,7 @@ void GNUStyle::printHashedSymbol(const ELFO *Obj, const Elf_Sym *FirstSym, const auto Symbol = FirstSym + Sym; Fields[2].Str = to_string( - format_hex_no_prefix(Symbol->st_value, ELFT::Is64Bits ? 18 : 8)); + format_hex_no_prefix(Symbol->st_value, ELFT::Is64Bits ? 
16 : 8)); Fields[3].Str = to_string(format_decimal(Symbol->st_size, 5)); unsigned char SymbolType = Symbol->getType(); @@ -3246,10 +3418,21 @@ template void GNUStyle::printHashSymbols(const ELFO *Obj) { for (uint32_t Buc = 0; Buc < SysVHash->nbucket; Buc++) { if (Buckets[Buc] == ELF::STN_UNDEF) continue; + std::vector Visited(SysVHash->nchain); for (uint32_t Ch = Buckets[Buc]; Ch < SysVHash->nchain; Ch = Chains[Ch]) { if (Ch == ELF::STN_UNDEF) break; + + if (Visited[Ch]) { + reportWarning( + createError(".hash section is invalid: bucket " + Twine(Ch) + + ": a cycle was detected in the linked chain"), + this->FileName); + break; + } + printHashedSymbol(Obj, &DynSyms[0], Ch, StringTable, Buc); + Visited[Ch] = true; } } } @@ -3380,7 +3563,8 @@ void GNUStyle::printProgramHeaders(const ELFO *Obj) { unsigned Width = ELFT::Is64Bits ? 18 : 10; unsigned SizeWidth = ELFT::Is64Bits ? 8 : 7; - for (const auto &Phdr : unwrapOrError(Obj->program_headers())) { + for (const auto &Phdr : + unwrapOrError(this->FileName, Obj->program_headers())) { Fields[0].Str = getElfPtType(Header->e_machine, Phdr.p_type); Fields[1].Str = to_string(format_hex(Phdr.p_offset, 8)); Fields[2].Str = to_string(format_hex(Phdr.p_vaddr, Width)); @@ -3404,10 +3588,11 @@ void GNUStyle::printSectionMapping(const ELFO *Obj) { OS << "\n Section to Segment mapping:\n Segment Sections...\n"; DenseSet BelongsToSegment; int Phnum = 0; - for (const Elf_Phdr &Phdr : unwrapOrError(Obj->program_headers())) { + for (const Elf_Phdr &Phdr : + unwrapOrError(this->FileName, Obj->program_headers())) { std::string Sections; OS << format(" %2.2d ", Phnum++); - for (const Elf_Shdr &Sec : unwrapOrError(Obj->sections())) { + for (const Elf_Shdr &Sec : unwrapOrError(this->FileName, Obj->sections())) { // Check if each section is in a segment and then print mapping. // readelf additionally makes sure it does not print zero sized sections // at end of segments and for PT_DYNAMIC both start and end of section @@ -3418,7 +3603,9 @@ void GNUStyle::printSectionMapping(const ELFO *Obj) { if (!TbssInNonTLS && checkTLSSections(Phdr, Sec) && checkoffsets(Phdr, Sec) && checkVMA(Phdr, Sec) && checkPTDynamic(Phdr, Sec) && (Sec.sh_type != ELF::SHT_NULL)) { - Sections += unwrapOrError(Obj->getSectionName(&Sec)).str() + " "; + Sections += + unwrapOrError(this->FileName, Obj->getSectionName(&Sec)).str() + + " "; BelongsToSegment.insert(&Sec); } } @@ -3428,9 +3615,10 @@ void GNUStyle::printSectionMapping(const ELFO *Obj) { // Display sections that do not belong to a segment. 
std::string Sections; - for (const Elf_Shdr &Sec : unwrapOrError(Obj->sections())) { + for (const Elf_Shdr &Sec : unwrapOrError(this->FileName, Obj->sections())) { if (BelongsToSegment.find(&Sec) == BelongsToSegment.end()) - Sections += unwrapOrError(Obj->getSectionName(&Sec)).str() + ' '; + Sections += + unwrapOrError(this->FileName, Obj->getSectionName(&Sec)).str() + ' '; } if (!Sections.empty()) { OS << " None " << Sections << '\n'; @@ -3438,14 +3626,40 @@ void GNUStyle::printSectionMapping(const ELFO *Obj) { } } +namespace { +template struct RelSymbol { + const typename ELFT::Sym *Sym; + std::string Name; +}; + +template +RelSymbol getSymbolForReloc(const ELFFile *Obj, StringRef FileName, + const ELFDumper *Dumper, + const typename ELFT::Rela &Reloc) { + uint32_t SymIndex = Reloc.getSymbol(Obj->isMips64EL()); + const typename ELFT::Sym *Sym = Dumper->dynamic_symbols().begin() + SymIndex; + Expected ErrOrName = Sym->getName(Dumper->getDynamicStringTable()); + + std::string Name; + if (ErrOrName) { + Name = maybeDemangle(*ErrOrName); + } else { + reportWarning( + createError("unable to get name of the dynamic symbol with index " + + Twine(SymIndex) + ": " + toString(ErrOrName.takeError())), + FileName); + Name = ""; + } + + return {Sym, std::move(Name)}; +} +} // namespace + template void GNUStyle::printDynamicRelocation(const ELFO *Obj, Elf_Rela R, bool IsRela) { - uint32_t SymIndex = R.getSymbol(Obj->isMips64EL()); - const Elf_Sym *Sym = this->dumper()->dynamic_symbols().begin() + SymIndex; - std::string SymbolName = maybeDemangle( - unwrapOrError(Sym->getName(this->dumper()->getDynamicStringTable()))); - printRelocation(Obj, Sym, SymbolName, R, IsRela); + RelSymbol S = getSymbolForReloc(Obj, this->FileName, this->dumper(), R); + printRelocation(Obj, S.Sym, S.Name, R, IsRela); } template void GNUStyle::printDynamic(const ELFO *Obj) { @@ -3518,7 +3732,8 @@ void GNUStyle::printDynamicRelocations(const ELFO *Obj) { << " contains " << DynRelrRegion.Size << " bytes:\n"; printRelocHeader(ELF::SHT_REL); Elf_Relr_Range Relrs = this->dumper()->dyn_relrs(); - std::vector RelrRelas = unwrapOrError(Obj->decode_relrs(Relrs)); + std::vector RelrRelas = + unwrapOrError(this->FileName, Obj->decode_relrs(Relrs)); for (const Elf_Rela &Rela : RelrRelas) { printDynamicRelocation(Obj, Rela, false); } @@ -3550,14 +3765,15 @@ template static void printGNUVersionSectionProlog(formatted_raw_ostream &OS, const Twine &Name, unsigned EntriesNum, const ELFFile *Obj, - const typename ELFT::Shdr *Sec) { - StringRef SecName = unwrapOrError(Obj->getSectionName(Sec)); + const typename ELFT::Shdr *Sec, + StringRef FileName) { + StringRef SecName = unwrapOrError(FileName, Obj->getSectionName(Sec)); OS << Name << " section '" << SecName << "' " << "contains " << EntriesNum << " entries:\n"; const typename ELFT::Shdr *SymTab = - unwrapOrError(Obj->getSection(Sec->sh_link)); - StringRef SymTabName = unwrapOrError(Obj->getSectionName(SymTab)); + unwrapOrError(FileName, Obj->getSection(Sec->sh_link)); + StringRef SymTabName = unwrapOrError(FileName, Obj->getSectionName(SymTab)); OS << " Addr: " << format_hex_no_prefix(Sec->sh_addr, 16) << " Offset: " << format_hex(Sec->sh_offset, 8) << " Link: " << Sec->sh_link << " (" << SymTabName << ")\n"; @@ -3570,7 +3786,8 @@ void GNUStyle::printVersionSymbolSection(const ELFFile *Obj, return; unsigned Entries = Sec->sh_size / sizeof(Elf_Versym); - printGNUVersionSectionProlog(OS, "Version symbols", Entries, Obj, Sec); + printGNUVersionSectionProlog(OS, "Version symbols", Entries, Obj, 
Sec, + this->FileName); const uint8_t *VersymBuf = reinterpret_cast(Obj->base() + Sec->sh_offset); @@ -3642,14 +3859,17 @@ void GNUStyle::printVersionDefinitionSection(const ELFFile *Obj, return; unsigned VerDefsNum = Sec->sh_info; - printGNUVersionSectionProlog(OS, "Version definition", VerDefsNum, Obj, Sec); + printGNUVersionSectionProlog(OS, "Version definition", VerDefsNum, Obj, Sec, + this->FileName); - const Elf_Shdr *StrTabSec = unwrapOrError(Obj->getSection(Sec->sh_link)); + const Elf_Shdr *StrTabSec = + unwrapOrError(this->FileName, Obj->getSection(Sec->sh_link)); StringRef StringTable( reinterpret_cast(Obj->base() + StrTabSec->sh_offset), (size_t)StrTabSec->sh_size); - const uint8_t *VerdefBuf = unwrapOrError(Obj->getSectionContents(Sec)).data(); + const uint8_t *VerdefBuf = + unwrapOrError(this->FileName, Obj->getSectionContents(Sec)).data(); const uint8_t *Begin = VerdefBuf; while (VerDefsNum--) { @@ -3684,11 +3904,14 @@ void GNUStyle::printVersionDependencySection(const ELFFile *Obj, return; unsigned VerneedNum = Sec->sh_info; - printGNUVersionSectionProlog(OS, "Version needs", VerneedNum, Obj, Sec); + printGNUVersionSectionProlog(OS, "Version needs", VerneedNum, Obj, Sec, + this->FileName); - ArrayRef SecData = unwrapOrError(Obj->getSectionContents(Sec)); + ArrayRef SecData = + unwrapOrError(this->FileName, Obj->getSectionContents(Sec)); - const Elf_Shdr *StrTabSec = unwrapOrError(Obj->getSection(Sec->sh_link)); + const Elf_Shdr *StrTabSec = + unwrapOrError(this->FileName, Obj->getSection(Sec->sh_link)); StringRef StringTable = { reinterpret_cast(Obj->base() + StrTabSec->sh_offset), (size_t)StrTabSec->sh_size}; @@ -3745,9 +3968,21 @@ void GNUStyle::printHashHistogram(const ELFFile *Obj) { // Go over all buckets and and note chain lengths of each bucket (total // unique chain lengths). 
for (size_t B = 0; B < NBucket; B++) { - for (size_t C = Buckets[B]; C > 0 && C < NChain; C = Chains[C]) + std::vector Visited(NChain); + for (size_t C = Buckets[B]; C < NChain; C = Chains[C]) { + if (C == ELF::STN_UNDEF) + break; + if (Visited[C]) { + reportWarning( + createError(".hash section is invalid: bucket " + Twine(C) + + ": a cycle was detected in the linked chain"), + this->FileName); + break; + } + Visited[C] = true; if (MaxChain <= ++ChainLen[B]) MaxChain++; + } TotalSyms += ChainLen[B]; } @@ -3829,7 +4064,7 @@ void GNUStyle::printCGProfile(const ELFFile *Obj) { template void GNUStyle::printAddrsig(const ELFFile *Obj) { - OS << "GNUStyle::printAddrsig not implemented\n"; + reportError(createError("--addrsig: not implemented"), this->FileName); } static StringRef getGenericNoteTypeName(const uint32_t NT) { @@ -3850,6 +4085,86 @@ static StringRef getGenericNoteTypeName(const uint32_t NT) { return ""; } +static StringRef getCoreNoteTypeName(const uint32_t NT) { + static const struct { + uint32_t ID; + const char *Name; + } Notes[] = { + {ELF::NT_PRSTATUS, "NT_PRSTATUS (prstatus structure)"}, + {ELF::NT_FPREGSET, "NT_FPREGSET (floating point registers)"}, + {ELF::NT_PRPSINFO, "NT_PRPSINFO (prpsinfo structure)"}, + {ELF::NT_TASKSTRUCT, "NT_TASKSTRUCT (task structure)"}, + {ELF::NT_AUXV, "NT_AUXV (auxiliary vector)"}, + {ELF::NT_PSTATUS, "NT_PSTATUS (pstatus structure)"}, + {ELF::NT_FPREGS, "NT_FPREGS (floating point registers)"}, + {ELF::NT_PSINFO, "NT_PSINFO (psinfo structure)"}, + {ELF::NT_LWPSTATUS, "NT_LWPSTATUS (lwpstatus_t structure)"}, + {ELF::NT_LWPSINFO, "NT_LWPSINFO (lwpsinfo_t structure)"}, + {ELF::NT_WIN32PSTATUS, "NT_WIN32PSTATUS (win32_pstatus structure)"}, + + {ELF::NT_PPC_VMX, "NT_PPC_VMX (ppc Altivec registers)"}, + {ELF::NT_PPC_VSX, "NT_PPC_VSX (ppc VSX registers)"}, + {ELF::NT_PPC_TAR, "NT_PPC_TAR (ppc TAR register)"}, + {ELF::NT_PPC_PPR, "NT_PPC_PPR (ppc PPR register)"}, + {ELF::NT_PPC_DSCR, "NT_PPC_DSCR (ppc DSCR register)"}, + {ELF::NT_PPC_EBB, "NT_PPC_EBB (ppc EBB registers)"}, + {ELF::NT_PPC_PMU, "NT_PPC_PMU (ppc PMU registers)"}, + {ELF::NT_PPC_TM_CGPR, "NT_PPC_TM_CGPR (ppc checkpointed GPR registers)"}, + {ELF::NT_PPC_TM_CFPR, + "NT_PPC_TM_CFPR (ppc checkpointed floating point registers)"}, + {ELF::NT_PPC_TM_CVMX, + "NT_PPC_TM_CVMX (ppc checkpointed Altivec registers)"}, + {ELF::NT_PPC_TM_CVSX, "NT_PPC_TM_CVSX (ppc checkpointed VSX registers)"}, + {ELF::NT_PPC_TM_SPR, "NT_PPC_TM_SPR (ppc TM special purpose registers)"}, + {ELF::NT_PPC_TM_CTAR, "NT_PPC_TM_CTAR (ppc checkpointed TAR register)"}, + {ELF::NT_PPC_TM_CPPR, "NT_PPC_TM_CPPR (ppc checkpointed PPR register)"}, + {ELF::NT_PPC_TM_CDSCR, + "NT_PPC_TM_CDSCR (ppc checkpointed DSCR register)"}, + + {ELF::NT_386_TLS, "NT_386_TLS (x86 TLS information)"}, + {ELF::NT_386_IOPERM, "NT_386_IOPERM (x86 I/O permissions)"}, + {ELF::NT_X86_XSTATE, "NT_X86_XSTATE (x86 XSAVE extended state)"}, + + {ELF::NT_S390_HIGH_GPRS, + "NT_S390_HIGH_GPRS (s390 upper register halves)"}, + {ELF::NT_S390_TIMER, "NT_S390_TIMER (s390 timer register)"}, + {ELF::NT_S390_TODCMP, "NT_S390_TODCMP (s390 TOD comparator register)"}, + {ELF::NT_S390_TODPREG, + "NT_S390_TODPREG (s390 TOD programmable register)"}, + {ELF::NT_S390_CTRS, "NT_S390_CTRS (s390 control registers)"}, + {ELF::NT_S390_PREFIX, "NT_S390_PREFIX (s390 prefix register)"}, + {ELF::NT_S390_LAST_BREAK, + "NT_S390_LAST_BREAK (s390 last breaking event address)"}, + {ELF::NT_S390_SYSTEM_CALL, + "NT_S390_SYSTEM_CALL (s390 system call restart data)"}, + {ELF::NT_S390_TDB, 
"NT_S390_TDB (s390 transaction diagnostic block)"}, + {ELF::NT_S390_VXRS_LOW, + "NT_S390_VXRS_LOW (s390 vector registers 0-15 upper half)"}, + {ELF::NT_S390_VXRS_HIGH, + "NT_S390_VXRS_HIGH (s390 vector registers 16-31)"}, + {ELF::NT_S390_GS_CB, "NT_S390_GS_CB (s390 guarded-storage registers)"}, + {ELF::NT_S390_GS_BC, + "NT_S390_GS_BC (s390 guarded-storage broadcast control)"}, + + {ELF::NT_ARM_VFP, "NT_ARM_VFP (arm VFP registers)"}, + {ELF::NT_ARM_TLS, "NT_ARM_TLS (AArch TLS registers)"}, + {ELF::NT_ARM_HW_BREAK, + "NT_ARM_HW_BREAK (AArch hardware breakpoint registers)"}, + {ELF::NT_ARM_HW_WATCH, + "NT_ARM_HW_WATCH (AArch hardware watchpoint registers)"}, + + {ELF::NT_FILE, "NT_FILE (mapped files)"}, + {ELF::NT_PRXFPREG, "NT_PRXFPREG (user_xfpregs structure)"}, + {ELF::NT_SIGINFO, "NT_SIGINFO (siginfo_t data)"}, + }; + + for (const auto &Note : Notes) + if (Note.ID == NT) + return Note.Name; + + return ""; +} + static std::string getGNUNoteTypeName(const uint32_t NT) { static const struct { uint32_t ID; @@ -4207,13 +4522,85 @@ static AMDGPUNote getAMDGPUNote(uint32_t NoteType, ArrayRef Desc) { } } +struct CoreFileMapping { + uint64_t Start, End, Offset; + StringRef Filename; +}; + +struct CoreNote { + uint64_t PageSize; + std::vector Mappings; +}; + +static Expected readCoreNote(DataExtractor Desc) { + // Expected format of the NT_FILE note description: + // 1. # of file mappings (call it N) + // 2. Page size + // 3. N (start, end, offset) triples + // 4. N packed filenames (null delimited) + // Each field is an Elf_Addr, except for filenames which are char* strings. + + CoreNote Ret; + const int Bytes = Desc.getAddressSize(); + + if (!Desc.isValidOffsetForAddress(2)) + return createStringError(object_error::parse_failed, + "malformed note: header too short"); + if (Desc.getData().back() != 0) + return createStringError(object_error::parse_failed, + "malformed note: not NUL terminated"); + + uint64_t DescOffset = 0; + uint64_t FileCount = Desc.getAddress(&DescOffset); + Ret.PageSize = Desc.getAddress(&DescOffset); + + if (!Desc.isValidOffsetForAddress(3 * FileCount * Bytes)) + return createStringError(object_error::parse_failed, + "malformed note: too short for number of files"); + + uint64_t FilenamesOffset = 0; + DataExtractor Filenames( + Desc.getData().drop_front(DescOffset + 3 * FileCount * Bytes), + Desc.isLittleEndian(), Desc.getAddressSize()); + + Ret.Mappings.resize(FileCount); + for (CoreFileMapping &Mapping : Ret.Mappings) { + if (!Filenames.isValidOffsetForDataOfSize(FilenamesOffset, 1)) + return createStringError(object_error::parse_failed, + "malformed note: too few filenames"); + Mapping.Start = Desc.getAddress(&DescOffset); + Mapping.End = Desc.getAddress(&DescOffset); + Mapping.Offset = Desc.getAddress(&DescOffset); + Mapping.Filename = Filenames.getCStrRef(&FilenamesOffset); + } + + return Ret; +} + +template +static void printCoreNote(raw_ostream &OS, const CoreNote &Note) { + // Length of "0x
" string. + const int FieldWidth = ELFT::Is64Bits ? 18 : 10; + + OS << " Page size: " << format_decimal(Note.PageSize, 0) << '\n'; + OS << " " << right_justify("Start", FieldWidth) << " " + << right_justify("End", FieldWidth) << " " + << right_justify("Page Offset", FieldWidth) << '\n'; + for (const CoreFileMapping &Mapping : Note.Mappings) { + OS << " " << format_hex(Mapping.Start, FieldWidth) << " " + << format_hex(Mapping.End, FieldWidth) << " " + << format_hex(Mapping.Offset, FieldWidth) << "\n " + << Mapping.Filename << '\n'; + } +} + template void GNUStyle::printNotes(const ELFFile *Obj) { auto PrintHeader = [&](const typename ELFT::Off Offset, const typename ELFT::Addr Size) { OS << "Displaying notes found at file offset " << format_hex(Offset, 10) << " with length " << format_hex(Size, 10) << ":\n" - << " Owner Data size\tDescription\n"; + << " Owner Data size \tDescription\n"; }; auto ProcessNote = [&](const Elf_Note &Note) { @@ -4221,55 +4608,81 @@ void GNUStyle::printNotes(const ELFFile *Obj) { ArrayRef Descriptor = Note.getDesc(); Elf_Word Type = Note.getType(); - OS << " " << Name << std::string(22 - Name.size(), ' ') + // Print the note owner/type. + OS << " " << left_justify(Name, 20) << ' ' << format_hex(Descriptor.size(), 10) << '\t'; - if (Name == "GNU") { OS << getGNUNoteTypeName(Type) << '\n'; - printGNUNote(OS, Type, Descriptor); } else if (Name == "FreeBSD") { OS << getFreeBSDNoteTypeName(Type) << '\n'; } else if (Name == "AMD") { OS << getAMDNoteTypeName(Type) << '\n'; + } else if (Name == "AMDGPU") { + OS << getAMDGPUNoteTypeName(Type) << '\n'; + } else { + StringRef NoteType = Obj->getHeader()->e_type == ELF::ET_CORE + ? getCoreNoteTypeName(Type) + : getGenericNoteTypeName(Type); + if (!NoteType.empty()) + OS << NoteType << '\n'; + else + OS << "Unknown note type: (" << format_hex(Type, 10) << ")\n"; + } + + // Print the description, or fallback to printing raw bytes for unknown + // owners. 
+ if (Name == "GNU") { + printGNUNote(OS, Type, Descriptor); + } else if (Name == "AMD") { const AMDNote N = getAMDNote(Type, Descriptor); if (!N.Type.empty()) OS << " " << N.Type << ":\n " << N.Value << '\n'; } else if (Name == "AMDGPU") { - OS << getAMDGPUNoteTypeName(Type) << '\n'; const AMDGPUNote N = getAMDGPUNote(Type, Descriptor); if (!N.Type.empty()) OS << " " << N.Type << ":\n " << N.Value << '\n'; - } else { - StringRef NoteType = getGenericNoteTypeName(Type); - if (!NoteType.empty()) - OS << NoteType; - else - OS << "Unknown note type: (" << format_hex(Type, 10) << ')'; + } else if (Name == "CORE") { + if (Type == ELF::NT_FILE) { + DataExtractor DescExtractor(Descriptor, + ELFT::TargetEndianness == support::little, + sizeof(Elf_Addr)); + Expected Note = readCoreNote(DescExtractor); + if (Note) + printCoreNote(OS, *Note); + else + reportWarning(Note.takeError(), this->FileName); + } + } else if (!Descriptor.empty()) { + OS << " description data:"; + for (uint8_t B : Descriptor) + OS << " " << format("%02x", B); + OS << '\n'; } - OS << '\n'; }; - if (Obj->getHeader()->e_type == ELF::ET_CORE) { - for (const auto &P : unwrapOrError(Obj->program_headers())) { - if (P.p_type != PT_NOTE) + ArrayRef Sections = unwrapOrError(this->FileName, Obj->sections()); + if (Obj->getHeader()->e_type != ELF::ET_CORE && !Sections.empty()) { + for (const auto &S : Sections) { + if (S.sh_type != SHT_NOTE) continue; - PrintHeader(P.p_offset, P.p_filesz); + PrintHeader(S.sh_offset, S.sh_size); Error Err = Error::success(); - for (const auto &Note : Obj->notes(P, Err)) + for (const auto &Note : Obj->notes(S, Err)) ProcessNote(Note); if (Err) - error(std::move(Err)); + reportError(std::move(Err), this->FileName); } } else { - for (const auto &S : unwrapOrError(Obj->sections())) { - if (S.sh_type != SHT_NOTE) + for (const auto &P : + unwrapOrError(this->FileName, Obj->program_headers())) { + if (P.p_type != PT_NOTE) continue; - PrintHeader(S.sh_offset, S.sh_size); + PrintHeader(P.p_offset, P.p_filesz); Error Err = Error::success(); - for (const auto &Note : Obj->notes(S, Err)) + for (const auto &Note : Obj->notes(P, Err)) ProcessNote(Note); if (Err) - error(std::move(Err)); + reportError(std::move(Err), this->FileName); } } } @@ -4279,6 +4692,294 @@ void GNUStyle::printELFLinkerOptions(const ELFFile *Obj) { OS << "printELFLinkerOptions not implemented!\n"; } +// Used for printing section names in places where possible errors can be +// ignored. +static StringRef getSectionName(const SectionRef &Sec) { + Expected NameOrErr = Sec.getName(); + if (NameOrErr) + return *NameOrErr; + consumeError(NameOrErr.takeError()); + return ""; +} + +// Used for printing symbol names in places where possible errors can be +// ignored. +static std::string getSymbolName(const ELFSymbolRef &Sym) { + Expected NameOrErr = Sym.getName(); + if (NameOrErr) + return maybeDemangle(*NameOrErr); + consumeError(NameOrErr.takeError()); + return ""; +} + +template +void DumpStyle::printFunctionStackSize( + const ELFObjectFile *Obj, uint64_t SymValue, SectionRef FunctionSec, + const StringRef SectionName, DataExtractor Data, uint64_t *Offset) { + // This function ignores potentially erroneous input, unless it is directly + // related to stack size reporting. 
+ SymbolRef FuncSym; + for (const ELFSymbolRef &Symbol : Obj->symbols()) { + Expected SymAddrOrErr = Symbol.getAddress(); + if (!SymAddrOrErr) { + consumeError(SymAddrOrErr.takeError()); + continue; + } + if (Symbol.getELFType() == ELF::STT_FUNC && *SymAddrOrErr == SymValue) { + // Check if the symbol is in the right section. + if (FunctionSec.containsSymbol(Symbol)) { + FuncSym = Symbol; + break; + } + } + } + + std::string FuncName = "?"; + // A valid SymbolRef has a non-null object file pointer. + if (FuncSym.BasicSymbolRef::getObject()) + FuncName = getSymbolName(FuncSym); + else + reportWarning( + createError("could not identify function symbol for stack size entry"), + Obj->getFileName()); + + // Extract the size. The expectation is that Offset is pointing to the right + // place, i.e. past the function address. + uint64_t PrevOffset = *Offset; + uint64_t StackSize = Data.getULEB128(Offset); + // getULEB128() does not advance Offset if it is not able to extract a valid + // integer. + if (*Offset == PrevOffset) + reportError( + createStringError(object_error::parse_failed, + "could not extract a valid stack size in section %s", + SectionName.data()), + Obj->getFileName()); + + printStackSizeEntry(StackSize, FuncName); +} + +template +void GNUStyle::printStackSizeEntry(uint64_t Size, StringRef FuncName) { + OS.PadToColumn(2); + OS << format_decimal(Size, 11); + OS.PadToColumn(18); + OS << FuncName << "\n"; +} + +template +void DumpStyle::printStackSize(const ELFObjectFile *Obj, + RelocationRef Reloc, + SectionRef FunctionSec, + const StringRef &StackSizeSectionName, + const RelocationResolver &Resolver, + DataExtractor Data) { + // This function ignores potentially erroneous input, unless it is directly + // related to stack size reporting. + object::symbol_iterator RelocSym = Reloc.getSymbol(); + uint64_t RelocSymValue = 0; + StringRef FileStr = Obj->getFileName(); + if (RelocSym != Obj->symbol_end()) { + // Ensure that the relocation symbol is in the function section, i.e. the + // section where the functions whose stack sizes we are reporting are + // located. + auto SectionOrErr = RelocSym->getSection(); + if (!SectionOrErr) { + reportWarning( + createError("cannot identify the section for relocation symbol '" + + getSymbolName(*RelocSym) + "'"), + FileStr); + consumeError(SectionOrErr.takeError()); + } else if (*SectionOrErr != FunctionSec) { + reportWarning(createError("relocation symbol '" + + getSymbolName(*RelocSym) + + "' is not in the expected section"), + FileStr); + // Pretend that the symbol is in the correct section and report its + // stack size anyway. 
+ FunctionSec = **SectionOrErr; + } + + Expected RelocSymValueOrErr = RelocSym->getValue(); + if (RelocSymValueOrErr) + RelocSymValue = *RelocSymValueOrErr; + else + consumeError(RelocSymValueOrErr.takeError()); + } + + uint64_t Offset = Reloc.getOffset(); + if (!Data.isValidOffsetForDataOfSize(Offset, sizeof(Elf_Addr) + 1)) + reportError( + createStringError(object_error::parse_failed, + "found invalid relocation offset into section %s " + "while trying to extract a stack size entry", + StackSizeSectionName.data()), + FileStr); + + uint64_t Addend = Data.getAddress(&Offset); + uint64_t SymValue = Resolver(Reloc, RelocSymValue, Addend); + this->printFunctionStackSize(Obj, SymValue, FunctionSec, StackSizeSectionName, + Data, &Offset); +} + +template +void DumpStyle::printNonRelocatableStackSizes( + const ELFObjectFile *Obj, std::function PrintHeader) { + // This function ignores potentially erroneous input, unless it is directly + // related to stack size reporting. + const ELFFile *EF = Obj->getELFFile(); + StringRef FileStr = Obj->getFileName(); + for (const SectionRef &Sec : Obj->sections()) { + StringRef SectionName = getSectionName(Sec); + if (SectionName != ".stack_sizes") + continue; + PrintHeader(); + const Elf_Shdr *ElfSec = Obj->getSection(Sec.getRawDataRefImpl()); + ArrayRef Contents = + unwrapOrError(this->FileName, EF->getSectionContents(ElfSec)); + DataExtractor Data(Contents, Obj->isLittleEndian(), sizeof(Elf_Addr)); + // A .stack_sizes section header's sh_link field is supposed to point + // to the section that contains the functions whose stack sizes are + // described in it. + const Elf_Shdr *FunctionELFSec = + unwrapOrError(this->FileName, EF->getSection(ElfSec->sh_link)); + uint64_t Offset = 0; + while (Offset < Contents.size()) { + // The function address is followed by a ULEB representing the stack + // size. Check for an extra byte before we try to process the entry. + if (!Data.isValidOffsetForDataOfSize(Offset, sizeof(Elf_Addr) + 1)) { + reportError( + createStringError( + object_error::parse_failed, + "section %s ended while trying to extract a stack size entry", + SectionName.data()), + FileStr); + } + uint64_t SymValue = Data.getAddress(&Offset); + printFunctionStackSize(Obj, SymValue, Obj->toSectionRef(FunctionELFSec), + SectionName, Data, &Offset); + } + } +} + +template +void DumpStyle::printRelocatableStackSizes( + const ELFObjectFile *Obj, std::function PrintHeader) { + const ELFFile *EF = Obj->getELFFile(); + + // Build a map between stack size sections and their corresponding relocation + // sections. + llvm::MapVector StackSizeRelocMap; + const SectionRef NullSection{}; + + for (const SectionRef &Sec : Obj->sections()) { + StringRef SectionName; + if (Expected NameOrErr = Sec.getName()) + SectionName = *NameOrErr; + else + consumeError(NameOrErr.takeError()); + + // A stack size section that we haven't encountered yet is mapped to the + // null section until we find its corresponding relocation section. + if (SectionName == ".stack_sizes") + if (StackSizeRelocMap.count(Sec) == 0) { + StackSizeRelocMap[Sec] = NullSection; + continue; + } + + // Check relocation sections if they are relocating contents of a + // stack sizes section. 
+ const Elf_Shdr *ElfSec = Obj->getSection(Sec.getRawDataRefImpl()); + uint32_t SectionType = ElfSec->sh_type; + if (SectionType != ELF::SHT_RELA && SectionType != ELF::SHT_REL) + continue; + + Expected RelSecOrErr = Sec.getRelocatedSection(); + if (!RelSecOrErr) + reportError(createStringError(object_error::parse_failed, + "%s: failed to get a relocated section: %s", + SectionName.data(), + toString(RelSecOrErr.takeError()).c_str()), + Obj->getFileName()); + + const Elf_Shdr *ContentsSec = + Obj->getSection((*RelSecOrErr)->getRawDataRefImpl()); + Expected ContentsSectionNameOrErr = + EF->getSectionName(ContentsSec); + if (!ContentsSectionNameOrErr) { + consumeError(ContentsSectionNameOrErr.takeError()); + continue; + } + if (*ContentsSectionNameOrErr != ".stack_sizes") + continue; + // Insert a mapping from the stack sizes section to its relocation section. + StackSizeRelocMap[Obj->toSectionRef(ContentsSec)] = Sec; + } + + for (const auto &StackSizeMapEntry : StackSizeRelocMap) { + PrintHeader(); + const SectionRef &StackSizesSec = StackSizeMapEntry.first; + const SectionRef &RelocSec = StackSizeMapEntry.second; + + // Warn about stack size sections without a relocation section. + StringRef StackSizeSectionName = getSectionName(StackSizesSec); + if (RelocSec == NullSection) { + reportWarning(createError("section " + StackSizeSectionName + + " does not have a corresponding " + "relocation section"), + Obj->getFileName()); + continue; + } + + // A .stack_sizes section header's sh_link field is supposed to point + // to the section that contains the functions whose stack sizes are + // described in it. + const Elf_Shdr *StackSizesELFSec = + Obj->getSection(StackSizesSec.getRawDataRefImpl()); + const SectionRef FunctionSec = Obj->toSectionRef(unwrapOrError( + this->FileName, EF->getSection(StackSizesELFSec->sh_link))); + + bool (*IsSupportedFn)(uint64_t); + RelocationResolver Resolver; + std::tie(IsSupportedFn, Resolver) = getRelocationResolver(*Obj); + auto Contents = unwrapOrError(this->FileName, StackSizesSec.getContents()); + DataExtractor Data(Contents, Obj->isLittleEndian(), sizeof(Elf_Addr)); + for (const RelocationRef &Reloc : RelocSec.relocations()) { + if (!IsSupportedFn || !IsSupportedFn(Reloc.getType())) + reportError(createStringError( + object_error::parse_failed, + "unsupported relocation type in section %s: %s", + getSectionName(RelocSec).data(), + EF->getRelocationTypeName(Reloc.getType()).data()), + Obj->getFileName()); + this->printStackSize(Obj, Reloc, FunctionSec, StackSizeSectionName, + Resolver, Data); + } + } +} + +template +void GNUStyle::printStackSizes(const ELFObjectFile *Obj) { + bool HeaderHasBeenPrinted = false; + auto PrintHeader = [&]() { + if (HeaderHasBeenPrinted) + return; + OS << "\nStack Sizes:\n"; + OS.PadToColumn(9); + OS << "Size"; + OS.PadToColumn(18); + OS << "Function\n"; + HeaderHasBeenPrinted = true; + }; + + // For non-relocatable objects, look directly for sections whose name starts + // with .stack_sizes and process the contents. + if (Obj->isRelocatableObject()) + this->printRelocatableStackSizes(Obj, PrintHeader); + else + this->printNonRelocatableStackSizes(Obj, PrintHeader); +} + template void GNUStyle::printMipsGOT(const MipsGOTParser &Parser) { size_t Bias = ELFT::Is64Bits ? 
8 : 0; @@ -4402,6 +5103,45 @@ void GNUStyle::printMipsPLT(const MipsGOTParser &Parser) { } } +template +void GNUStyle::printMipsABIFlags(const ELFObjectFile *ObjF) { + const ELFFile *Obj = ObjF->getELFFile(); + const Elf_Shdr *Shdr = + findSectionByName(*Obj, ObjF->getFileName(), ".MIPS.abiflags"); + if (!Shdr) + return; + + ArrayRef Sec = + unwrapOrError(ObjF->getFileName(), Obj->getSectionContents(Shdr)); + if (Sec.size() != sizeof(Elf_Mips_ABIFlags)) + reportError(createError(".MIPS.abiflags section has a wrong size"), + ObjF->getFileName()); + + auto *Flags = reinterpret_cast *>(Sec.data()); + + OS << "MIPS ABI Flags Version: " << Flags->version << "\n\n"; + OS << "ISA: MIPS" << int(Flags->isa_level); + if (Flags->isa_rev > 1) + OS << "r" << int(Flags->isa_rev); + OS << "\n"; + OS << "GPR size: " << getMipsRegisterSize(Flags->gpr_size) << "\n"; + OS << "CPR1 size: " << getMipsRegisterSize(Flags->cpr1_size) << "\n"; + OS << "CPR2 size: " << getMipsRegisterSize(Flags->cpr2_size) << "\n"; + OS << "FP ABI: " << printEnum(Flags->fp_abi, makeArrayRef(ElfMipsFpABIType)) + << "\n"; + OS << "ISA Extension: " + << printEnum(Flags->isa_ext, makeArrayRef(ElfMipsISAExtType)) << "\n"; + if (Flags->ases == 0) + OS << "ASEs: None\n"; + else + // FIXME: Print each flag on a separate line. + OS << "ASEs: " << printFlags(Flags->ases, makeArrayRef(ElfMipsASEFlags)) + << "\n"; + OS << "FLAGS 1: " << format_hex_no_prefix(Flags->flags1, 8, false) << "\n"; + OS << "FLAGS 2: " << format_hex_no_prefix(Flags->flags2, 8, false) << "\n"; + OS << "\n"; +} + template void LLVMStyle::printFileHeaders(const ELFO *Obj) { const Elf_Ehdr *E = Obj->getHeader(); { @@ -4455,16 +5195,17 @@ template void LLVMStyle::printFileHeaders(const ELFO *Obj) { W.printNumber("ProgramHeaderEntrySize", E->e_phentsize); W.printNumber("ProgramHeaderCount", E->e_phnum); W.printNumber("SectionHeaderEntrySize", E->e_shentsize); - W.printString("SectionHeaderCount", getSectionHeadersNumString(Obj)); + W.printString("SectionHeaderCount", + getSectionHeadersNumString(Obj, this->FileName)); W.printString("StringTableSectionIndex", - getSectionHeaderTableIndexString(Obj)); + getSectionHeaderTableIndexString(Obj, this->FileName)); } } template void LLVMStyle::printGroupSections(const ELFO *Obj) { DictScope Lists(W, "Groups"); - std::vector V = getGroups(Obj); + std::vector V = getGroups(Obj, this->FileName); DenseMap Map = mapSectionsToGroups(V); for (const GroupSection &G : V) { DictScope D(W, "Group"); @@ -4499,7 +5240,7 @@ template void LLVMStyle::printRelocations(const ELFO *Obj) { ListScope D(W, "Relocations"); int SectionNumber = -1; - for (const Elf_Shdr &Sec : unwrapOrError(Obj->sections())) { + for (const Elf_Shdr &Sec : unwrapOrError(this->FileName, Obj->sections())) { ++SectionNumber; if (Sec.sh_type != ELF::SHT_REL && Sec.sh_type != ELF::SHT_RELA && @@ -4508,7 +5249,7 @@ template void LLVMStyle::printRelocations(const ELFO *Obj) { Sec.sh_type != ELF::SHT_ANDROID_RELR) continue; - StringRef Name = unwrapOrError(Obj->getSectionName(&Sec)); + StringRef Name = unwrapOrError(this->FileName, Obj->getSectionName(&Sec)); W.startLine() << "Section (" << SectionNumber << ") " << Name << " {\n"; W.indent(); @@ -4522,11 +5263,12 @@ template void LLVMStyle::printRelocations(const ELFO *Obj) { template void LLVMStyle::printRelocations(const Elf_Shdr *Sec, const ELFO *Obj) { - const Elf_Shdr *SymTab = unwrapOrError(Obj->getSection(Sec->sh_link)); + const Elf_Shdr *SymTab = + unwrapOrError(this->FileName, Obj->getSection(Sec->sh_link)); switch 
(Sec->sh_type) { case ELF::SHT_REL: - for (const Elf_Rel &R : unwrapOrError(Obj->rels(Sec))) { + for (const Elf_Rel &R : unwrapOrError(this->FileName, Obj->rels(Sec))) { Elf_Rela Rela; Rela.r_offset = R.r_offset; Rela.r_info = R.r_info; @@ -4535,17 +5277,18 @@ void LLVMStyle::printRelocations(const Elf_Shdr *Sec, const ELFO *Obj) { } break; case ELF::SHT_RELA: - for (const Elf_Rela &R : unwrapOrError(Obj->relas(Sec))) + for (const Elf_Rela &R : unwrapOrError(this->FileName, Obj->relas(Sec))) printRelocation(Obj, R, SymTab); break; case ELF::SHT_RELR: case ELF::SHT_ANDROID_RELR: { - Elf_Relr_Range Relrs = unwrapOrError(Obj->relrs(Sec)); + Elf_Relr_Range Relrs = unwrapOrError(this->FileName, Obj->relrs(Sec)); if (opts::RawRelr) { for (const Elf_Relr &R : Relrs) W.startLine() << W.hex(R) << "\n"; } else { - std::vector RelrRelas = unwrapOrError(Obj->decode_relrs(Relrs)); + std::vector RelrRelas = + unwrapOrError(this->FileName, Obj->decode_relrs(Relrs)); for (const Elf_Rela &R : RelrRelas) printRelocation(Obj, R, SymTab); } @@ -4553,7 +5296,8 @@ void LLVMStyle::printRelocations(const Elf_Shdr *Sec, const ELFO *Obj) { } case ELF::SHT_ANDROID_REL: case ELF::SHT_ANDROID_RELA: - for (const Elf_Rela &R : unwrapOrError(Obj->android_relas(Sec))) + for (const Elf_Rela &R : + unwrapOrError(this->FileName, Obj->android_relas(Sec))) printRelocation(Obj, R, SymTab); break; } @@ -4565,13 +5309,16 @@ void LLVMStyle::printRelocation(const ELFO *Obj, Elf_Rela Rel, SmallString<32> RelocName; Obj->getRelocationTypeName(Rel.getType(Obj->isMips64EL()), RelocName); std::string TargetName; - const Elf_Sym *Sym = unwrapOrError(Obj->getRelocationSymbol(&Rel, SymTab)); + const Elf_Sym *Sym = + unwrapOrError(this->FileName, Obj->getRelocationSymbol(&Rel, SymTab)); if (Sym && Sym->getType() == ELF::STT_SECTION) { const Elf_Shdr *Sec = unwrapOrError( + this->FileName, Obj->getSection(Sym, SymTab, this->dumper()->getShndxTable())); - TargetName = unwrapOrError(Obj->getSectionName(Sec)); + TargetName = unwrapOrError(this->FileName, Obj->getSectionName(Sec)); } else if (Sym) { - StringRef StrTable = unwrapOrError(Obj->getStringTableForSymtab(*SymTab)); + StringRef StrTable = + unwrapOrError(this->FileName, Obj->getStringTableForSymtab(*SymTab)); TargetName = this->dumper()->getFullSymbolName( Sym, StrTable, SymTab->sh_type == SHT_DYNSYM /* IsDynamic */); } @@ -4596,10 +5343,11 @@ void LLVMStyle::printSectionHeaders(const ELFO *Obj) { ListScope SectionsD(W, "Sections"); int SectionIndex = -1; - ArrayRef Sections = unwrapOrError(Obj->sections()); + ArrayRef Sections = unwrapOrError(this->FileName, Obj->sections()); const ELFObjectFile *ElfObj = this->dumper()->getElfObject(); for (const Elf_Shdr &Sec : Sections) { - StringRef Name = getSectionName(Sec, *ElfObj, Sections); + StringRef Name = unwrapOrError( + ElfObj->getFileName(), Obj->getSectionName(&Sec, this->WarningHandler)); DictScope SectionD(W, "Section"); W.printNumber("Index", ++SectionIndex); W.printNumber("Name", Name, Sec.sh_name); @@ -4652,19 +5400,25 @@ void LLVMStyle::printSectionHeaders(const ELFO *Obj) { if (opts::SectionSymbols) { ListScope D(W, "Symbols"); const Elf_Shdr *Symtab = this->dumper()->getDotSymtabSec(); - StringRef StrTable = unwrapOrError(Obj->getStringTableForSymtab(*Symtab)); + StringRef StrTable = + unwrapOrError(this->FileName, Obj->getStringTableForSymtab(*Symtab)); - for (const Elf_Sym &Sym : unwrapOrError(Obj->symbols(Symtab))) { + for (const Elf_Sym &Sym : + unwrapOrError(this->FileName, Obj->symbols(Symtab))) { const Elf_Shdr *SymSec = 
unwrapOrError( + this->FileName, Obj->getSection(&Sym, Symtab, this->dumper()->getShndxTable())); if (SymSec == &Sec) - printSymbol(Obj, &Sym, unwrapOrError(Obj->symbols(Symtab)).begin(), - StrTable, false); + printSymbol( + Obj, &Sym, + unwrapOrError(this->FileName, Obj->symbols(Symtab)).begin(), + StrTable, false, false); } } if (opts::SectionData && Sec.sh_type != ELF::SHT_NOBITS) { - ArrayRef Data = unwrapOrError(Obj->getSectionContents(&Sec)); + ArrayRef Data = + unwrapOrError(this->FileName, Obj->getSectionContents(&Sec)); W.printBinaryBlock( "SectionData", StringRef(reinterpret_cast(Data.data()), Data.size())); @@ -4675,7 +5429,8 @@ void LLVMStyle::printSectionHeaders(const ELFO *Obj) { template void LLVMStyle::printSymbol(const ELFO *Obj, const Elf_Sym *Symbol, const Elf_Sym *First, StringRef StrTable, - bool IsDynamic) { + bool IsDynamic, + bool /*NonVisibilityBitsUsed*/) { unsigned SectionIndex = 0; StringRef SectionName; this->dumper()->getSectionNameIndex(Symbol, First, SectionName, SectionIndex); @@ -4786,7 +5541,8 @@ void LLVMStyle::printDynamicRelocations(const ELFO *Obj) { } if (DynRelrRegion.Size > 0) { Elf_Relr_Range Relrs = this->dumper()->dyn_relrs(); - std::vector RelrRelas = unwrapOrError(Obj->decode_relrs(Relrs)); + std::vector RelrRelas = + unwrapOrError(this->FileName, Obj->decode_relrs(Relrs)); for (const Elf_Rela &Rela : RelrRelas) printDynamicRelocation(Obj, Rela); } @@ -4809,11 +5565,9 @@ template void LLVMStyle::printDynamicRelocation(const ELFO *Obj, Elf_Rela Rel) { SmallString<32> RelocName; Obj->getRelocationTypeName(Rel.getType(Obj->isMips64EL()), RelocName); - std::string SymbolName; - uint32_t SymIndex = Rel.getSymbol(Obj->isMips64EL()); - const Elf_Sym *Sym = this->dumper()->dynamic_symbols().begin() + SymIndex; - SymbolName = maybeDemangle( - unwrapOrError(Sym->getName(this->dumper()->getDynamicStringTable()))); + std::string SymbolName = + getSymbolForReloc(Obj, this->FileName, this->dumper(), Rel).Name; + if (opts::ExpandRelocs) { DictScope Group(W, "Relocation"); W.printHex("Offset", Rel.r_offset); @@ -4842,7 +5596,8 @@ template void LLVMStyle::printProgramHeaders(const ELFO *Obj) { ListScope L(W, "ProgramHeaders"); - for (const Elf_Phdr &Phdr : unwrapOrError(Obj->program_headers())) { + for (const Elf_Phdr &Phdr : + unwrapOrError(this->FileName, Obj->program_headers())) { DictScope P(W, "ProgramHeader"); W.printHex("Type", getElfSegmentType(Obj->getHeader()->e_machine, Phdr.p_type), @@ -4860,23 +5615,16 @@ void LLVMStyle::printProgramHeaders(const ELFO *Obj) { template void LLVMStyle::printVersionSymbolSection(const ELFFile *Obj, const Elf_Shdr *Sec) { - DictScope SS(W, "Version symbols"); + ListScope SS(W, "VersionSymbols"); if (!Sec) return; - StringRef SecName = unwrapOrError(Obj->getSectionName(Sec)); - W.printNumber("Section Name", SecName, Sec->sh_name); - W.printHex("Address", Sec->sh_addr); - W.printHex("Offset", Sec->sh_offset); - W.printNumber("Link", Sec->sh_link); - const uint8_t *VersymBuf = reinterpret_cast(Obj->base() + Sec->sh_offset); const ELFDumper *Dumper = this->dumper(); StringRef StrTable = Dumper->getDynamicStringTable(); // Same number of entries in the dynamic symbol table (DT_SYMTAB). 
- ListScope Syms(W, "Symbols"); for (const Elf_Sym &Sym : Dumper->dynamic_symbols()) { DictScope S(W, "Symbol"); const Elf_Versym *Versym = reinterpret_cast(VersymBuf); @@ -4891,7 +5639,7 @@ void LLVMStyle::printVersionSymbolSection(const ELFFile *Obj, template void LLVMStyle::printVersionDefinitionSection(const ELFFile *Obj, const Elf_Shdr *Sec) { - DictScope SD(W, "SHT_GNU_verdef"); + ListScope SD(W, "VersionDefinitions"); if (!Sec) return; @@ -4899,7 +5647,8 @@ void LLVMStyle::printVersionDefinitionSection(const ELFFile *Obj, reinterpret_cast(Obj->base() + Sec->sh_offset); const uint8_t *SecEndAddress = SecStartAddress + Sec->sh_size; const uint8_t *VerdefBuf = SecStartAddress; - const Elf_Shdr *StrTab = unwrapOrError(Obj->getSection(Sec->sh_link)); + const Elf_Shdr *StrTab = + unwrapOrError(this->FileName, Obj->getSection(Sec->sh_link)); unsigned VerDefsNum = Sec->sh_info; while (VerDefsNum--) { @@ -4938,13 +5687,14 @@ void LLVMStyle::printVersionDefinitionSection(const ELFFile *Obj, template void LLVMStyle::printVersionDependencySection(const ELFFile *Obj, const Elf_Shdr *Sec) { - DictScope SD(W, "SHT_GNU_verneed"); + ListScope SD(W, "VersionRequirements"); if (!Sec) return; const uint8_t *SecData = reinterpret_cast(Obj->base() + Sec->sh_offset); - const Elf_Shdr *StrTab = unwrapOrError(Obj->getSection(Sec->sh_link)); + const Elf_Shdr *StrTab = + unwrapOrError(this->FileName, Obj->getSection(Sec->sh_link)); const uint8_t *VerneedBuf = SecData; unsigned VerneedNum = Sec->sh_info; @@ -4986,37 +5736,62 @@ void LLVMStyle::printCGProfile(const ELFFile *Obj) { ListScope L(W, "CGProfile"); if (!this->dumper()->getDotCGProfileSec()) return; - auto CGProfile = - unwrapOrError(Obj->template getSectionContentsAsArray( - this->dumper()->getDotCGProfileSec())); + auto CGProfile = unwrapOrError( + this->FileName, Obj->template getSectionContentsAsArray( + this->dumper()->getDotCGProfileSec())); for (const Elf_CGProfile &CGPE : CGProfile) { DictScope D(W, "CGProfileEntry"); - W.printNumber("From", this->dumper()->getStaticSymbolName(CGPE.cgp_from), - CGPE.cgp_from); - W.printNumber("To", this->dumper()->getStaticSymbolName(CGPE.cgp_to), - CGPE.cgp_to); + W.printNumber( + "From", + unwrapOrError(this->FileName, + this->dumper()->getStaticSymbolName(CGPE.cgp_from)), + CGPE.cgp_from); + W.printNumber( + "To", + unwrapOrError(this->FileName, + this->dumper()->getStaticSymbolName(CGPE.cgp_to)), + CGPE.cgp_to); W.printNumber("Weight", CGPE.cgp_weight); } } +static Expected> toULEB128Array(ArrayRef Data) { + std::vector Ret; + const uint8_t *Cur = Data.begin(); + const uint8_t *End = Data.end(); + while (Cur != End) { + unsigned Size; + const char *Err; + Ret.push_back(decodeULEB128(Cur, &Size, End, &Err)); + if (Err) + return createError(Err); + Cur += Size; + } + return Ret; +} + template void LLVMStyle::printAddrsig(const ELFFile *Obj) { ListScope L(W, "Addrsig"); if (!this->dumper()->getDotAddrsigSec()) return; ArrayRef Contents = unwrapOrError( + this->FileName, Obj->getSectionContents(this->dumper()->getDotAddrsigSec())); - const uint8_t *Cur = Contents.begin(); - const uint8_t *End = Contents.end(); - while (Cur != End) { - unsigned Size; - const char *Err; - uint64_t SymIndex = decodeULEB128(Cur, &Size, End, &Err); - if (Err) - reportError(Err); - W.printNumber("Sym", this->dumper()->getStaticSymbolName(SymIndex), - SymIndex); - Cur += Size; + Expected> V = toULEB128Array(Contents); + if (!V) { + reportWarning(V.takeError(), this->FileName); + return; + } + + for (uint64_t Sym : *V) { + Expected 
NameOrErr = this->dumper()->getStaticSymbolName(Sym); + if (NameOrErr) { + W.printNumber("Sym", *NameOrErr, Sym); + continue; + } + reportWarning(NameOrErr.takeError(), this->FileName); + W.printNumber("Sym", "", Sym); } } @@ -5051,6 +5826,17 @@ static void printGNUNoteLLVMStyle(uint32_t NoteType, ArrayRef Desc, } } +static void printCoreNoteLLVMStyle(const CoreNote &Note, ScopedPrinter &W) { + W.printNumber("Page Size", Note.PageSize); + for (const CoreFileMapping &Mapping : Note.Mappings) { + ListScope D(W, "Mapping"); + W.printHex("Start", Mapping.Start); + W.printHex("End", Mapping.End); + W.printHex("Offset", Mapping.Offset); + W.printString("Filename", Mapping.Filename); + } +} + template void LLVMStyle::printNotes(const ELFFile *Obj) { ListScope L(W, "Notes"); @@ -5067,56 +5853,81 @@ void LLVMStyle::printNotes(const ELFFile *Obj) { ArrayRef Descriptor = Note.getDesc(); Elf_Word Type = Note.getType(); + // Print the note owner/type. W.printString("Owner", Name); W.printHex("Data size", Descriptor.size()); if (Name == "GNU") { W.printString("Type", getGNUNoteTypeName(Type)); - printGNUNoteLLVMStyle(Type, Descriptor, W); } else if (Name == "FreeBSD") { W.printString("Type", getFreeBSDNoteTypeName(Type)); } else if (Name == "AMD") { W.printString("Type", getAMDNoteTypeName(Type)); - const AMDNote N = getAMDNote(Type, Descriptor); - if (!N.Type.empty()) - W.printString(N.Type, N.Value); } else if (Name == "AMDGPU") { W.printString("Type", getAMDGPUNoteTypeName(Type)); - const AMDGPUNote N = getAMDGPUNote(Type, Descriptor); - if (!N.Type.empty()) - W.printString(N.Type, N.Value); } else { - StringRef NoteType = getGenericNoteTypeName(Type); + StringRef NoteType = Obj->getHeader()->e_type == ELF::ET_CORE + ? getCoreNoteTypeName(Type) + : getGenericNoteTypeName(Type); if (!NoteType.empty()) W.printString("Type", NoteType); else W.printString("Type", "Unknown (" + to_string(format_hex(Type, 10)) + ")"); } + + // Print the description, or fallback to printing raw bytes for unknown + // owners. 
+ if (Name == "GNU") { + printGNUNoteLLVMStyle(Type, Descriptor, W); + } else if (Name == "AMD") { + const AMDNote N = getAMDNote(Type, Descriptor); + if (!N.Type.empty()) + W.printString(N.Type, N.Value); + } else if (Name == "AMDGPU") { + const AMDGPUNote N = getAMDGPUNote(Type, Descriptor); + if (!N.Type.empty()) + W.printString(N.Type, N.Value); + } else if (Name == "CORE") { + if (Type == ELF::NT_FILE) { + DataExtractor DescExtractor(Descriptor, + ELFT::TargetEndianness == support::little, + sizeof(Elf_Addr)); + Expected Note = readCoreNote(DescExtractor); + if (Note) + printCoreNoteLLVMStyle(*Note, W); + else + reportWarning(Note.takeError(), this->FileName); + } + } else if (!Descriptor.empty()) { + W.printBinaryBlock("Description data", Descriptor); + } }; - if (Obj->getHeader()->e_type == ELF::ET_CORE) { - for (const auto &P : unwrapOrError(Obj->program_headers())) { - if (P.p_type != PT_NOTE) + ArrayRef Sections = unwrapOrError(this->FileName, Obj->sections()); + if (Obj->getHeader()->e_type != ELF::ET_CORE && !Sections.empty()) { + for (const auto &S : Sections) { + if (S.sh_type != SHT_NOTE) continue; DictScope D(W, "NoteSection"); - PrintHeader(P.p_offset, P.p_filesz); + PrintHeader(S.sh_offset, S.sh_size); Error Err = Error::success(); - for (const auto &Note : Obj->notes(P, Err)) + for (const auto &Note : Obj->notes(S, Err)) ProcessNote(Note); if (Err) - error(std::move(Err)); + reportError(std::move(Err), this->FileName); } } else { - for (const auto &S : unwrapOrError(Obj->sections())) { - if (S.sh_type != SHT_NOTE) + for (const auto &P : + unwrapOrError(this->FileName, Obj->program_headers())) { + if (P.p_type != PT_NOTE) continue; DictScope D(W, "NoteSection"); - PrintHeader(S.sh_offset, S.sh_size); + PrintHeader(P.p_offset, P.p_filesz); Error Err = Error::success(); - for (const auto &Note : Obj->notes(S, Err)) + for (const auto &Note : Obj->notes(P, Err)) ProcessNote(Note); if (Err) - error(std::move(Err)); + reportError(std::move(Err), this->FileName); } } } @@ -5125,11 +5936,12 @@ template void LLVMStyle::printELFLinkerOptions(const ELFFile *Obj) { ListScope L(W, "LinkerOptions"); - for (const Elf_Shdr &Shdr : unwrapOrError(Obj->sections())) { + for (const Elf_Shdr &Shdr : unwrapOrError(this->FileName, Obj->sections())) { if (Shdr.sh_type != ELF::SHT_LLVM_LINKER_OPTIONS) continue; - ArrayRef Contents = unwrapOrError(Obj->getSectionContents(&Shdr)); + ArrayRef Contents = + unwrapOrError(this->FileName, Obj->getSectionContents(&Shdr)); for (const uint8_t *P = Contents.begin(), *E = Contents.end(); P < E; ) { StringRef Key = StringRef(reinterpret_cast(P)); StringRef Value = @@ -5142,6 +5954,22 @@ void LLVMStyle::printELFLinkerOptions(const ELFFile *Obj) { } } +template +void LLVMStyle::printStackSizes(const ELFObjectFile *Obj) { + ListScope L(W, "StackSizes"); + if (Obj->isRelocatableObject()) + this->printRelocatableStackSizes(Obj, []() {}); + else + this->printNonRelocatableStackSizes(Obj, []() {}); +} + +template +void LLVMStyle::printStackSizeEntry(uint64_t Size, StringRef FuncName) { + DictScope D(W, "Entry"); + W.printString("Function", FuncName); + W.printHex("Size", Size); +} + template void LLVMStyle::printMipsGOT(const MipsGOTParser &Parser) { auto PrintEntry = [&](const Elf_Addr *E) { @@ -5252,3 +6080,41 @@ void LLVMStyle::printMipsPLT(const MipsGOTParser &Parser) { } } } + +template +void LLVMStyle::printMipsABIFlags(const ELFObjectFile *ObjF) { + const ELFFile *Obj = ObjF->getELFFile(); + const Elf_Shdr *Shdr = + findSectionByName(*Obj, ObjF->getFileName(), 
".MIPS.abiflags"); + if (!Shdr) { + W.startLine() << "There is no .MIPS.abiflags section in the file.\n"; + return; + } + ArrayRef Sec = + unwrapOrError(ObjF->getFileName(), Obj->getSectionContents(Shdr)); + if (Sec.size() != sizeof(Elf_Mips_ABIFlags)) { + W.startLine() << "The .MIPS.abiflags section has a wrong size.\n"; + return; + } + + auto *Flags = reinterpret_cast *>(Sec.data()); + + raw_ostream &OS = W.getOStream(); + DictScope GS(W, "MIPS ABI Flags"); + + W.printNumber("Version", Flags->version); + W.startLine() << "ISA: "; + if (Flags->isa_rev <= 1) + OS << format("MIPS%u", Flags->isa_level); + else + OS << format("MIPS%ur%u", Flags->isa_level, Flags->isa_rev); + OS << "\n"; + W.printEnum("ISA Extension", Flags->isa_ext, makeArrayRef(ElfMipsISAExtType)); + W.printFlags("ASEs", Flags->ases, makeArrayRef(ElfMipsASEFlags)); + W.printEnum("FP ABI", Flags->fp_abi, makeArrayRef(ElfMipsFpABIType)); + W.printNumber("GPR size", getMipsRegisterSize(Flags->gpr_size)); + W.printNumber("CPR1 size", getMipsRegisterSize(Flags->cpr1_size)); + W.printNumber("CPR2 size", getMipsRegisterSize(Flags->cpr2_size)); + W.printFlags("Flags 1", Flags->flags1, makeArrayRef(ElfMipsFlags1)); + W.printHex("Flags 2", Flags->flags2); +} diff --git a/tools/llvm-readobj/MachODumper.cpp b/tools/llvm-readobj/MachODumper.cpp index 32a3866eb2f2..20a60b3df699 100644 --- a/tools/llvm-readobj/MachODumper.cpp +++ b/tools/llvm-readobj/MachODumper.cpp @@ -214,6 +214,31 @@ static const EnumEntry MachOHeaderFlags[] = { LLVM_READOBJ_ENUM_ENT(MachO, MH_APP_EXTENSION_SAFE), }; +static const EnumEntry MachOSectionTypes[] = { + { "Regular" , MachO::S_REGULAR }, + { "ZeroFill" , MachO::S_ZEROFILL }, + { "CStringLiterals" , MachO::S_CSTRING_LITERALS }, + { "4ByteLiterals" , MachO::S_4BYTE_LITERALS }, + { "8ByteLiterals" , MachO::S_8BYTE_LITERALS }, + { "LiteralPointers" , MachO::S_LITERAL_POINTERS }, + { "NonLazySymbolPointers" , MachO::S_NON_LAZY_SYMBOL_POINTERS }, + { "LazySymbolPointers" , MachO::S_LAZY_SYMBOL_POINTERS }, + { "SymbolStubs" , MachO::S_SYMBOL_STUBS }, + { "ModInitFuncPointers" , MachO::S_MOD_INIT_FUNC_POINTERS }, + { "ModTermFuncPointers" , MachO::S_MOD_TERM_FUNC_POINTERS }, + { "Coalesced" , MachO::S_COALESCED }, + { "GBZeroFill" , MachO::S_GB_ZEROFILL }, + { "Interposing" , MachO::S_INTERPOSING }, + { "16ByteLiterals" , MachO::S_16BYTE_LITERALS }, + { "DTraceDOF" , MachO::S_DTRACE_DOF }, + { "LazyDylibSymbolPointers" , MachO::S_LAZY_DYLIB_SYMBOL_POINTERS }, + { "ThreadLocalRegular" , MachO::S_THREAD_LOCAL_REGULAR }, + { "ThreadLocalZerofill" , MachO::S_THREAD_LOCAL_ZEROFILL }, + { "ThreadLocalVariables" , MachO::S_THREAD_LOCAL_VARIABLES }, + { "ThreadLocalVariablePointers" , MachO::S_THREAD_LOCAL_VARIABLE_POINTERS }, + { "ThreadLocalInitFunctionPointers", MachO::S_THREAD_LOCAL_INIT_FUNCTION_POINTERS } +}; + static const EnumEntry MachOSectionAttributes[] = { { "LocReloc" , 1 << 0 /*S_ATTR_LOC_RELOC */ }, { "ExtReloc" , 1 << 1 /*S_ATTR_EXT_RELOC */ }, @@ -440,10 +465,7 @@ void MachODumper::printSectionHeaders(const MachOObjectFile *Obj) { MachOSection MOSection; getSection(Obj, Section.getRawDataRefImpl(), MOSection); DataRefImpl DR = Section.getRawDataRefImpl(); - - StringRef Name; - error(Section.getName(Name)); - + StringRef Name = unwrapOrError(Obj->getFileName(), Section.getName()); ArrayRef RawName = Obj->getSectionRawName(DR); StringRef SegmentName = Obj->getSectionFinalSegmentName(DR); ArrayRef RawSegmentName = Obj->getSectionRawFinalSegmentName(DR); @@ -459,7 +481,7 @@ void 
MachODumper::printSectionHeaders(const MachOObjectFile *Obj) { W.printHex("RelocationOffset", MOSection.RelocationTableOffset); W.printNumber("RelocationCount", MOSection.NumRelocationTableEntries); W.printEnum("Type", MOSection.Flags & 0xFF, - makeArrayRef(MachOSectionAttributes)); + makeArrayRef(MachOSectionTypes)); W.printFlags("Attributes", MOSection.Flags >> 8, makeArrayRef(MachOSectionAttributes)); W.printHex("Reserved1", MOSection.Reserved1); @@ -484,7 +506,8 @@ void MachODumper::printSectionHeaders(const MachOObjectFile *Obj) { } if (opts::SectionData && !Section.isBSS()) - W.printBinaryBlock("SectionData", unwrapOrError(Section.getContents())); + W.printBinaryBlock("SectionData", unwrapOrError(Obj->getFileName(), + Section.getContents())); } } @@ -493,9 +516,7 @@ void MachODumper::printRelocations() { std::error_code EC; for (const SectionRef &Section : Obj->sections()) { - StringRef Name; - error(Section.getName(Name)); - + StringRef Name = unwrapOrError(Obj->getFileName(), Section.getName()); bool PrintedGroup = false; for (const RelocationRef &Reloc : Section.relocations()) { if (!PrintedGroup) { @@ -535,14 +556,13 @@ void MachODumper::printRelocation(const MachOObjectFile *Obj, if (Symbol != Obj->symbol_end()) { Expected TargetNameOrErr = Symbol->getName(); if (!TargetNameOrErr) - error(errorToErrorCode(TargetNameOrErr.takeError())); + reportError(TargetNameOrErr.takeError(), Obj->getFileName()); TargetName = *TargetNameOrErr; } } else if (!IsScattered) { section_iterator SecI = Obj->getRelocationSection(DR); - if (SecI != Obj->section_end()) { - error(SecI->getName(TargetName)); - } + if (SecI != Obj->section_end()) + TargetName = unwrapOrError(Obj->getFileName(), SecI->getName()); } if (TargetName.empty()) TargetName = "-"; @@ -610,10 +630,12 @@ void MachODumper::printSymbol(const SymbolRef &Symbol) { StringRef SectionName = ""; Expected SecIOrErr = Symbol.getSection(); - error(errorToErrorCode(SecIOrErr.takeError())); + if (!SecIOrErr) + reportError(SecIOrErr.takeError(), Obj->getFileName()); + section_iterator SecI = *SecIOrErr; if (SecI != Obj->section_end()) - error(SecI->getName(SectionName)); + SectionName = unwrapOrError(Obj->getFileName(), SecI->getName()); DictScope D(W, "Symbol"); W.printNumber("Name", SymbolName, MOSymbol.StringIndex); @@ -643,7 +665,11 @@ void MachODumper::printStackMap() const { object::SectionRef StackMapSection; for (auto Sec : Obj->sections()) { StringRef Name; - Sec.getName(Name); + if (Expected NameOrErr = Sec.getName()) + Name = *NameOrErr; + else + consumeError(NameOrErr.takeError()); + if (Name == "__llvm_stackmaps") { StackMapSection = Sec; break; @@ -653,7 +679,8 @@ void MachODumper::printStackMap() const { if (StackMapSection == object::SectionRef()) return; - StringRef StackMapContents = unwrapOrError(StackMapSection.getContents()); + StringRef StackMapContents = + unwrapOrError(Obj->getFileName(), StackMapSection.getContents()); ArrayRef StackMapContentsArray = arrayRefFromStringRef(StackMapContents); diff --git a/tools/llvm-readobj/ObjDumper.cpp b/tools/llvm-readobj/ObjDumper.cpp index 0a9e22c8a71c..9e5ebd99ac37 100644 --- a/tools/llvm-readobj/ObjDumper.cpp +++ b/tools/llvm-readobj/ObjDumper.cpp @@ -23,6 +23,10 @@ namespace llvm { +static inline Error createError(const Twine &Msg) { + return createStringError(object::object_error::parse_failed, Msg); +} + ObjDumper::ObjDumper(ScopedPrinter &Writer) : W(Writer) {} ObjDumper::~ObjDumper() { @@ -49,8 +53,7 @@ getSectionRefsByNameOrIndex(const object::ObjectFile *Obj, SecIndex = 
Obj->isELF() ? 0 : 1; for (object::SectionRef SecRef : Obj->sections()) { - StringRef SecName; - error(SecRef.getName(SecName)); + StringRef SecName = unwrapOrError(Obj->getFileName(), SecRef.getName()); auto NameIt = SecNames.find(SecName); if (NameIt != SecNames.end()) NameIt->second = true; @@ -64,10 +67,15 @@ getSectionRefsByNameOrIndex(const object::ObjectFile *Obj, for (const std::pair &S : SecNames) if (!S.second) - reportWarning(formatv("could not find section '{0}'", S.first).str()); + reportWarning( + createError(formatv("could not find section '{0}'", S.first).str()), + Obj->getFileName()); + for (std::pair S : SecIndices) if (!S.second) - reportWarning(formatv("could not find section {0}", S.first).str()); + reportWarning( + createError(formatv("could not find section {0}", S.first).str()), + Obj->getFileName()); return Ret; } @@ -77,14 +85,16 @@ void ObjDumper::printSectionsAsString(const object::ObjectFile *Obj, bool First = true; for (object::SectionRef Section : getSectionRefsByNameOrIndex(Obj, Sections)) { - StringRef SectionName; - error(Section.getName(SectionName)); + StringRef SectionName = + unwrapOrError(Obj->getFileName(), Section.getName()); + if (!First) W.startLine() << '\n'; First = false; W.startLine() << "String dump of section '" << SectionName << "':\n"; - StringRef SectionContent = unwrapOrError(Section.getContents()); + StringRef SectionContent = + unwrapOrError(Obj->getFileName(), Section.getContents()); const uint8_t *SecContent = SectionContent.bytes_begin(); const uint8_t *CurrentWord = SecContent; @@ -110,14 +120,16 @@ void ObjDumper::printSectionsAsHex(const object::ObjectFile *Obj, bool First = true; for (object::SectionRef Section : getSectionRefsByNameOrIndex(Obj, Sections)) { - StringRef SectionName; - error(Section.getName(SectionName)); + StringRef SectionName = + unwrapOrError(Obj->getFileName(), Section.getName()); + if (!First) W.startLine() << '\n'; First = false; W.startLine() << "Hex dump of section '" << SectionName << "':\n"; - StringRef SectionContent = unwrapOrError(Section.getContents()); + StringRef SectionContent = + unwrapOrError(Obj->getFileName(), Section.getContents()); const uint8_t *SecContent = SectionContent.bytes_begin(); const uint8_t *SecEnd = SecContent + SectionContent.size(); diff --git a/tools/llvm-readobj/ObjDumper.h b/tools/llvm-readobj/ObjDumper.h index aaabfa2ca2e8..2ba441342499 100644 --- a/tools/llvm-readobj/ObjDumper.h +++ b/tools/llvm-readobj/ObjDumper.h @@ -68,15 +68,8 @@ public: virtual void printAddrsig() {} virtual void printNotes() {} virtual void printELFLinkerOptions() {} - - // Only implemented for ARM ELF at this time. - virtual void printAttributes() { } - - // Only implemented for MIPS ELF at this time. - virtual void printMipsPLTGOT() { } - virtual void printMipsABIFlags() { } - virtual void printMipsReginfo() { } - virtual void printMipsOptions() { } + virtual void printStackSizes() {} + virtual void printArchSpecificInfo() { } // Only implemented for PE/COFF. 
virtual void printCOFFImports() { } diff --git a/tools/llvm-readobj/WasmDumper.cpp b/tools/llvm-readobj/WasmDumper.cpp index 041a9a15bdb6..dfab9f40d71b 100644 --- a/tools/llvm-readobj/WasmDumper.cpp +++ b/tools/llvm-readobj/WasmDumper.cpp @@ -51,6 +51,7 @@ static const EnumEntry WasmSymbolFlags[] = { ENUM_ENTRY(UNDEFINED), ENUM_ENTRY(EXPORTED), ENUM_ENTRY(EXPLICIT_NAME), + ENUM_ENTRY(NO_STRIP), #undef ENUM_ENTRY }; @@ -90,7 +91,7 @@ void WasmDumper::printRelocation(const SectionRef &Section, StringRef SymName; symbol_iterator SI = Reloc.getSymbol(); if (SI != Obj->symbol_end()) - SymName = error(SI->getName()); + SymName = unwrapOrError(Obj->getFileName(), SI->getName()); bool HasAddend = false; switch (RelocType) { @@ -133,8 +134,8 @@ void WasmDumper::printRelocations() { int SectionNumber = 0; for (const SectionRef &Section : Obj->sections()) { bool PrintedGroup = false; - StringRef Name; - error(Section.getName(Name)); + StringRef Name = unwrapOrError(Obj->getFileName(), Section.getName()); + ++SectionNumber; for (const RelocationRef &Reloc : Section.relocations()) { diff --git a/tools/llvm-readobj/Win64EHDumper.cpp b/tools/llvm-readobj/Win64EHDumper.cpp index e64b8f157180..fa268ce9d434 100644 --- a/tools/llvm-readobj/Win64EHDumper.cpp +++ b/tools/llvm-readobj/Win64EHDumper.cpp @@ -289,7 +289,9 @@ void Dumper::printRuntimeFunction(const Context &Ctx, resolveRelocation(Ctx, Section, SectionOffset + 8, XData, Offset); ArrayRef Contents; - error(Ctx.COFF.getSectionContents(XData, Contents)); + if (Error E = Ctx.COFF.getSectionContents(XData, Contents)) + reportError(std::move(E), Ctx.COFF.getFileName()); + if (Contents.empty()) return; @@ -304,14 +306,19 @@ void Dumper::printRuntimeFunction(const Context &Ctx, void Dumper::printData(const Context &Ctx) { for (const auto &Section : Ctx.COFF.sections()) { StringRef Name; - Section.getName(Name); + if (Expected NameOrErr = Section.getName()) + Name = *NameOrErr; + else + consumeError(NameOrErr.takeError()); if (Name != ".pdata" && !Name.startswith(".pdata$")) continue; const coff_section *PData = Ctx.COFF.getCOFFSection(Section); ArrayRef Contents; - error(Ctx.COFF.getSectionContents(PData, Contents)); + + if (Error E = Ctx.COFF.getSectionContents(PData, Contents)) + reportError(std::move(E), Ctx.COFF.getFileName()); if (Contents.empty()) continue; diff --git a/tools/llvm-readobj/WindowsResourceDumper.cpp b/tools/llvm-readobj/WindowsResourceDumper.cpp index 13989f696d9d..a2fb6aac3f93 100644 --- a/tools/llvm-readobj/WindowsResourceDumper.cpp +++ b/tools/llvm-readobj/WindowsResourceDumper.cpp @@ -56,8 +56,12 @@ void Dumper::printEntry(const ResourceEntryRef &Ref) { if (Ref.checkTypeString()) { auto NarrowStr = stripUTF16(Ref.getTypeString()); SW.printString("Resource type (string)", NarrowStr); - } else - SW.printNumber("Resource type (int)", Ref.getTypeID()); + } else { + SmallString<20> IDStr; + raw_svector_ostream OS(IDStr); + printResourceTypeName(Ref.getTypeID(), OS); + SW.printString("Resource type (int)", IDStr); + } if (Ref.checkNameString()) { auto NarrowStr = stripUTF16(Ref.getNameString()); diff --git a/tools/llvm-readobj/XCOFFDumper.cpp b/tools/llvm-readobj/XCOFFDumper.cpp index 6f260f91537f..fe95b6d1b494 100644 --- a/tools/llvm-readobj/XCOFFDumper.cpp +++ b/tools/llvm-readobj/XCOFFDumper.cpp @@ -22,6 +22,12 @@ using namespace object; namespace { class XCOFFDumper : public ObjDumper { + enum { + SymbolTypeMask = 0x07, + SymbolAlignmentMask = 0xF8, + SymbolAlignmentBitOffset = 3 + }; + public: XCOFFDumper(const XCOFFObjectFile &Obj, 
ScopedPrinter &Writer) : ObjDumper(Writer), Obj(Obj) {} @@ -37,11 +43,21 @@ public: private: template void printSectionHeaders(ArrayRef Sections); - - const XCOFFObjectFile &Obj; + template void printGenericSectionHeader(T &Sec) const; + template void printOverflowSectionHeader(T &Sec) const; + void printFileAuxEnt(const XCOFFFileAuxEnt *AuxEntPtr); + void printCsectAuxEnt32(const XCOFFCsectAuxEnt32 *AuxEntPtr); + void printSectAuxEntForStat(const XCOFFSectAuxEntForStat *AuxEntPtr); + void printSymbol(const SymbolRef &); // Least significant 3 bits are reserved. static constexpr unsigned SectionFlagsReservedMask = 0x7; + + // The low order 16 bits of section flags denotes the section type. + static constexpr unsigned SectionFlagsTypeMask = 0xffffu; + + void printRelocations(ArrayRef Sections); + const XCOFFObjectFile &Obj; }; } // anonymous namespace @@ -100,11 +116,315 @@ void XCOFFDumper::printSectionHeaders() { } void XCOFFDumper::printRelocations() { - llvm_unreachable("Unimplemented functionality for XCOFFDumper"); + if (Obj.is64Bit()) + llvm_unreachable("64-bit relocation output not implemented!"); + else + printRelocations(Obj.sections32()); +} + +static const EnumEntry RelocationTypeNameclass[] = { +#define ECase(X) \ + { #X, XCOFF::X } + ECase(R_POS), ECase(R_RL), ECase(R_RLA), ECase(R_NEG), + ECase(R_REL), ECase(R_TOC), ECase(R_TRL), ECase(R_TRLA), + ECase(R_GL), ECase(R_TCL), ECase(R_REF), ECase(R_BA), + ECase(R_BR), ECase(R_RBA), ECase(R_RBR), ECase(R_TLS), + ECase(R_TLS_IE), ECase(R_TLS_LD), ECase(R_TLS_LE), ECase(R_TLSM), + ECase(R_TLSML), ECase(R_TOCU), ECase(R_TOCL) +#undef ECase +}; + +void XCOFFDumper::printRelocations(ArrayRef Sections) { + if (!opts::ExpandRelocs) + report_fatal_error("Unexpanded relocation output not implemented."); + + ListScope LS(W, "Relocations"); + uint16_t Index = 0; + for (const auto &Sec : Sections) { + ++Index; + // Only the .text, .data, .tdata, and STYP_DWARF sections have relocation. + if (Sec.Flags != XCOFF::STYP_TEXT && Sec.Flags != XCOFF::STYP_DATA && + Sec.Flags != XCOFF::STYP_TDATA && Sec.Flags != XCOFF::STYP_DWARF) + continue; + auto Relocations = unwrapOrError(Obj.getFileName(), Obj.relocations(Sec)); + if (Relocations.empty()) + continue; + + W.startLine() << "Section (index: " << Index << ") " << Sec.getName() + << " {\n"; + for (auto Reloc : Relocations) { + StringRef SymbolName = unwrapOrError( + Obj.getFileName(), Obj.getSymbolNameByIndex(Reloc.SymbolIndex)); + + DictScope RelocScope(W, "Relocation"); + W.printHex("Virtual Address", Reloc.VirtualAddress); + W.printNumber("Symbol", SymbolName, Reloc.SymbolIndex); + W.printString("IsSigned", Reloc.isRelocationSigned() ? "Yes" : "No"); + W.printNumber("FixupBitValue", Reloc.isFixupIndicated() ? 
1 : 0); + W.printNumber("Length", Reloc.getRelocatedLength()); + W.printEnum("Type", (uint8_t)Reloc.Type, + makeArrayRef(RelocationTypeNameclass)); + } + W.unindent(); + W.startLine() << "}\n"; + } +} + +static const EnumEntry FileStringType[] = { +#define ECase(X) \ + { #X, XCOFF::X } + ECase(XFT_FN), ECase(XFT_CT), ECase(XFT_CV), ECase(XFT_CD) +#undef ECase +}; + +void XCOFFDumper::printFileAuxEnt(const XCOFFFileAuxEnt *AuxEntPtr) { + if (Obj.is64Bit()) + report_fatal_error( + "Printing for File Auxiliary Entry in 64-bit is unimplemented."); + StringRef FileName = + unwrapOrError(Obj.getFileName(), Obj.getCFileName(AuxEntPtr)); + DictScope SymDs(W, "File Auxiliary Entry"); + W.printNumber("Index", + Obj.getSymbolIndex(reinterpret_cast(AuxEntPtr))); + W.printString("Name", FileName); + W.printEnum("Type", static_cast(AuxEntPtr->Type), + makeArrayRef(FileStringType)); +} + +static const EnumEntry CsectStorageMappingClass[] = + { +#define ECase(X) \ + { #X, XCOFF::X } + ECase(XMC_PR), ECase(XMC_RO), ECase(XMC_DB), + ECase(XMC_GL), ECase(XMC_XO), ECase(XMC_SV), + ECase(XMC_SV64), ECase(XMC_SV3264), ECase(XMC_TI), + ECase(XMC_TB), ECase(XMC_RW), ECase(XMC_TC0), + ECase(XMC_TC), ECase(XMC_TD), ECase(XMC_DS), + ECase(XMC_UA), ECase(XMC_BS), ECase(XMC_UC), + ECase(XMC_TL), ECase(XMC_TE) +#undef ECase +}; + +static const EnumEntry CsectSymbolTypeClass[] = { +#define ECase(X) \ + { #X, XCOFF::X } + ECase(XTY_ER), ECase(XTY_SD), ECase(XTY_LD), ECase(XTY_CM) +#undef ECase +}; + +void XCOFFDumper::printCsectAuxEnt32(const XCOFFCsectAuxEnt32 *AuxEntPtr) { + assert(!Obj.is64Bit() && "32-bit interface called on 64-bit object file."); + + DictScope SymDs(W, "CSECT Auxiliary Entry"); + W.printNumber("Index", + Obj.getSymbolIndex(reinterpret_cast(AuxEntPtr))); + if ((AuxEntPtr->SymbolAlignmentAndType & SymbolTypeMask) == XCOFF::XTY_LD) + W.printNumber("ContainingCsectSymbolIndex", AuxEntPtr->SectionOrLength); + else + W.printNumber("SectionLen", AuxEntPtr->SectionOrLength); + W.printHex("ParameterHashIndex", AuxEntPtr->ParameterHashIndex); + W.printHex("TypeChkSectNum", AuxEntPtr->TypeChkSectNum); + // Print out symbol alignment and type. + W.printNumber("SymbolAlignmentLog2", + (AuxEntPtr->SymbolAlignmentAndType & SymbolAlignmentMask) >> + SymbolAlignmentBitOffset); + W.printEnum("SymbolType", AuxEntPtr->SymbolAlignmentAndType & SymbolTypeMask, + makeArrayRef(CsectSymbolTypeClass)); + W.printEnum("StorageMappingClass", + static_cast(AuxEntPtr->StorageMappingClass), + makeArrayRef(CsectStorageMappingClass)); + W.printHex("StabInfoIndex", AuxEntPtr->StabInfoIndex); + W.printHex("StabSectNum", AuxEntPtr->StabSectNum); +} + +void XCOFFDumper::printSectAuxEntForStat( + const XCOFFSectAuxEntForStat *AuxEntPtr) { + assert(!Obj.is64Bit() && "32-bit interface called on 64-bit object file."); + + DictScope SymDs(W, "Sect Auxiliary Entry For Stat"); + W.printNumber("Index", + Obj.getSymbolIndex(reinterpret_cast(AuxEntPtr))); + W.printNumber("SectionLength", AuxEntPtr->SectionLength); + + // Unlike the corresponding fields in the section header, NumberOfRelocEnt + // and NumberOfLineNum do not handle values greater than 65535. 
+ W.printNumber("NumberOfRelocEnt", AuxEntPtr->NumberOfRelocEnt); + W.printNumber("NumberOfLineNum", AuxEntPtr->NumberOfLineNum); +} + +static const EnumEntry SymStorageClass[] = { +#define ECase(X) \ + { #X, XCOFF::X } + ECase(C_NULL), ECase(C_AUTO), ECase(C_EXT), ECase(C_STAT), + ECase(C_REG), ECase(C_EXTDEF), ECase(C_LABEL), ECase(C_ULABEL), + ECase(C_MOS), ECase(C_ARG), ECase(C_STRTAG), ECase(C_MOU), + ECase(C_UNTAG), ECase(C_TPDEF), ECase(C_USTATIC), ECase(C_ENTAG), + ECase(C_MOE), ECase(C_REGPARM), ECase(C_FIELD), ECase(C_BLOCK), + ECase(C_FCN), ECase(C_EOS), ECase(C_FILE), ECase(C_LINE), + ECase(C_ALIAS), ECase(C_HIDDEN), ECase(C_HIDEXT), ECase(C_BINCL), + ECase(C_EINCL), ECase(C_INFO), ECase(C_WEAKEXT), ECase(C_DWARF), + ECase(C_GSYM), ECase(C_LSYM), ECase(C_PSYM), ECase(C_RSYM), + ECase(C_RPSYM), ECase(C_STSYM), ECase(C_TCSYM), ECase(C_BCOMM), + ECase(C_ECOML), ECase(C_ECOMM), ECase(C_DECL), ECase(C_ENTRY), + ECase(C_FUN), ECase(C_BSTAT), ECase(C_ESTAT), ECase(C_GTLS), + ECase(C_STTLS), ECase(C_EFCN) +#undef ECase +}; + +static StringRef GetSymbolValueName(XCOFF::StorageClass SC) { + switch (SC) { + case XCOFF::C_EXT: + case XCOFF::C_WEAKEXT: + case XCOFF::C_HIDEXT: + case XCOFF::C_STAT: + return "Value (RelocatableAddress)"; + case XCOFF::C_FILE: + return "Value (SymbolTableIndex)"; + case XCOFF::C_FCN: + case XCOFF::C_BLOCK: + case XCOFF::C_FUN: + case XCOFF::C_STSYM: + case XCOFF::C_BINCL: + case XCOFF::C_EINCL: + case XCOFF::C_INFO: + case XCOFF::C_BSTAT: + case XCOFF::C_LSYM: + case XCOFF::C_PSYM: + case XCOFF::C_RPSYM: + case XCOFF::C_RSYM: + case XCOFF::C_ECOML: + case XCOFF::C_DWARF: + assert(false && "This StorageClass for the symbol is not yet implemented."); + return ""; + default: + return "Value"; + } +} + +static const EnumEntry CFileLangIdClass[] = { +#define ECase(X) \ + { #X, XCOFF::X } + ECase(TB_C), ECase(TB_CPLUSPLUS) +#undef ECase +}; + +static const EnumEntry CFileCpuIdClass[] = { +#define ECase(X) \ + { #X, XCOFF::X } + ECase(TCPU_PPC64), ECase(TCPU_COM), ECase(TCPU_970) +#undef ECase +}; + +void XCOFFDumper::printSymbol(const SymbolRef &S) { + if (Obj.is64Bit()) + report_fatal_error("64-bit support is unimplemented."); + + DataRefImpl SymbolDRI = S.getRawDataRefImpl(); + const XCOFFSymbolEntry *SymbolEntPtr = Obj.toSymbolEntry(SymbolDRI); + + XCOFFSymbolRef XCOFFSymRef(SymbolDRI, &Obj); + uint8_t NumberOfAuxEntries = XCOFFSymRef.getNumberOfAuxEntries(); + + DictScope SymDs(W, "Symbol"); + + StringRef SymbolName = + unwrapOrError(Obj.getFileName(), Obj.getSymbolName(SymbolDRI)); + + W.printNumber("Index", + Obj.getSymbolIndex(reinterpret_cast(SymbolEntPtr))); + W.printString("Name", SymbolName); + W.printHex(GetSymbolValueName(SymbolEntPtr->StorageClass), + SymbolEntPtr->Value); + + StringRef SectionName = + unwrapOrError(Obj.getFileName(), Obj.getSymbolSectionName(SymbolEntPtr)); + + W.printString("Section", SectionName); + if (XCOFFSymRef.getStorageClass() == XCOFF::C_FILE) { + W.printEnum("Source Language ID", + SymbolEntPtr->CFileLanguageIdAndTypeId.LanguageId, + makeArrayRef(CFileLangIdClass)); + W.printEnum("CPU Version ID", + SymbolEntPtr->CFileLanguageIdAndTypeId.CpuTypeId, + makeArrayRef(CFileCpuIdClass)); + } else + W.printHex("Type", SymbolEntPtr->SymbolType); + + W.printEnum("StorageClass", static_cast(SymbolEntPtr->StorageClass), + makeArrayRef(SymStorageClass)); + W.printNumber("NumberOfAuxEntries", SymbolEntPtr->NumberOfAuxEntries); + + if (NumberOfAuxEntries == 0) + return; + + switch (XCOFFSymRef.getStorageClass()) { + case XCOFF::C_FILE: + 
// If the symbol is C_FILE and has auxiliary entries... + for (int i = 1; i <= NumberOfAuxEntries; i++) { + const XCOFFFileAuxEnt *FileAuxEntPtr = + reinterpret_cast(SymbolEntPtr + i); +#ifndef NDEBUG + Obj.checkSymbolEntryPointer(reinterpret_cast(FileAuxEntPtr)); +#endif + printFileAuxEnt(FileAuxEntPtr); + } + break; + case XCOFF::C_EXT: + case XCOFF::C_WEAKEXT: + case XCOFF::C_HIDEXT: + // If the symbol is for a function, and it has more than 1 auxiliary entry, + // then one of them must be function auxiliary entry which we do not + // support yet. + if (XCOFFSymRef.isFunction() && NumberOfAuxEntries >= 2) + report_fatal_error("Function auxiliary entry printing is unimplemented."); + + // If there is more than 1 auxiliary entry, instead of printing out + // error information, print out the raw Auxiliary entry from 1st till + // the last - 1. The last one must be a CSECT Auxiliary Entry. + for (int i = 1; i < NumberOfAuxEntries; i++) { + W.startLine() << "!Unexpected raw auxiliary entry data:\n"; + W.startLine() << format_bytes( + ArrayRef(reinterpret_cast(SymbolEntPtr + i), + XCOFF::SymbolTableEntrySize)); + } + + // The symbol's last auxiliary entry is a CSECT Auxiliary Entry. + printCsectAuxEnt32(XCOFFSymRef.getXCOFFCsectAuxEnt32()); + break; + case XCOFF::C_STAT: + if (NumberOfAuxEntries > 1) + report_fatal_error( + "C_STAT symbol should not have more than 1 auxiliary entry."); + + const XCOFFSectAuxEntForStat *StatAuxEntPtr; + StatAuxEntPtr = + reinterpret_cast(SymbolEntPtr + 1); +#ifndef NDEBUG + Obj.checkSymbolEntryPointer(reinterpret_cast(StatAuxEntPtr)); +#endif + printSectAuxEntForStat(StatAuxEntPtr); + break; + case XCOFF::C_DWARF: + case XCOFF::C_BLOCK: + case XCOFF::C_FCN: + report_fatal_error("Symbol table entry printing for this storage class " + "type is unimplemented."); + break; + default: + for (int i = 1; i <= NumberOfAuxEntries; i++) { + W.startLine() << "!Unexpected raw auxiliary entry data:\n"; + W.startLine() << format_bytes( + ArrayRef(reinterpret_cast(SymbolEntPtr + i), + XCOFF::SymbolTableEntrySize)); + } + break; + } } void XCOFFDumper::printSymbols() { - llvm_unreachable("Unimplemented functionality for XCOFFDumper"); + ListScope Group(W, "Symbols"); + for (const SymbolRef &S : Obj.symbols()) + printSymbol(S); } void XCOFFDumper::printDynamicSymbols() { @@ -134,6 +454,39 @@ static const EnumEntry SectionTypeFlagsNames[] = { #undef ECase }; +template +void XCOFFDumper::printOverflowSectionHeader(T &Sec) const { + if (Obj.is64Bit()) { + reportWarning(make_error("An 64-bit XCOFF object file may not " + "contain an overflow section header.", + object_error::parse_failed), + Obj.getFileName()); + } + + W.printString("Name", Sec.getName()); + W.printNumber("NumberOfRelocations", Sec.PhysicalAddress); + W.printNumber("NumberOfLineNumbers", Sec.VirtualAddress); + W.printHex("Size", Sec.SectionSize); + W.printHex("RawDataOffset", Sec.FileOffsetToRawData); + W.printHex("RelocationPointer", Sec.FileOffsetToRelocationInfo); + W.printHex("LineNumberPointer", Sec.FileOffsetToLineNumberInfo); + W.printNumber("IndexOfSectionOverflowed", Sec.NumberOfRelocations); + W.printNumber("IndexOfSectionOverflowed", Sec.NumberOfLineNumbers); +} + +template +void XCOFFDumper::printGenericSectionHeader(T &Sec) const { + W.printString("Name", Sec.getName()); + W.printHex("PhysicalAddress", Sec.PhysicalAddress); + W.printHex("VirtualAddress", Sec.VirtualAddress); + W.printHex("Size", Sec.SectionSize); + W.printHex("RawDataOffset", Sec.FileOffsetToRawData); + W.printHex("RelocationPointer", 
Sec.FileOffsetToRelocationInfo); + W.printHex("LineNumberPointer", Sec.FileOffsetToLineNumberInfo); + W.printNumber("NumberOfRelocations", Sec.NumberOfRelocations); + W.printNumber("NumberOfLineNumbers", Sec.NumberOfLineNumbers); +} + template void XCOFFDumper::printSectionHeaders(ArrayRef Sections) { ListScope Group(W, "Sections"); @@ -143,27 +496,28 @@ void XCOFFDumper::printSectionHeaders(ArrayRef Sections) { DictScope SecDS(W, "Section"); W.printNumber("Index", Index++); - W.printString("Name", Sec.getName()); - - W.printHex("PhysicalAddress", Sec.PhysicalAddress); - W.printHex("VirtualAddress", Sec.VirtualAddress); - W.printHex("Size", Sec.SectionSize); - W.printHex("RawDataOffset", Sec.FileOffsetToRawData); - W.printHex("RelocationPointer", Sec.FileOffsetToRelocationInfo); - W.printHex("LineNumberPointer", Sec.FileOffsetToLineNumberInfo); - - // TODO Need to add overflow handling when NumberOfX == _OVERFLOW_MARKER - // in 32-bit object files. - W.printNumber("NumberOfRelocations", Sec.NumberOfRelocations); - W.printNumber("NumberOfLineNumbers", Sec.NumberOfLineNumbers); - - // The most significant 16-bits represent the DWARF section subtype. For - // now we just dump the section type flags. - uint16_t Flags = Sec.Flags & 0xffffu; - if (Flags & SectionFlagsReservedMask) - W.printHex("Flags", "Reserved", Flags); + + uint16_t SectionType = Sec.Flags & SectionFlagsTypeMask; + switch (SectionType) { + case XCOFF::STYP_OVRFLO: + printOverflowSectionHeader(Sec); + break; + case XCOFF::STYP_LOADER: + case XCOFF::STYP_EXCEPT: + case XCOFF::STYP_TYPCHK: + // TODO The interpretation of loader, exception and type check section + // headers are different from that of generic section headers. We will + // implement them later. We interpret them as generic section headers for + // now. + default: + printGenericSectionHeader(Sec); + break; + } + // For now we just dump the section type portion of the flags. 
+ if (SectionType & SectionFlagsReservedMask) + W.printHex("Flags", "Reserved", SectionType); else - W.printEnum("Type", Flags, makeArrayRef(SectionTypeFlagsNames)); + W.printEnum("Type", SectionType, makeArrayRef(SectionTypeFlagsNames)); } if (opts::SectionRelocations) diff --git a/tools/llvm-readobj/llvm-readobj.cpp b/tools/llvm-readobj/llvm-readobj.cpp index 1bd5bb74bf29..4db13897879d 100644 --- a/tools/llvm-readobj/llvm-readobj.cpp +++ b/tools/llvm-readobj/llvm-readobj.cpp @@ -231,26 +231,11 @@ namespace opts { "codeview-subsection-bytes", cl::desc("Dump raw contents of codeview debug sections and records")); - // --arm-attributes - cl::opt ARMAttributes("arm-attributes", - cl::desc("Display the ARM attributes section")); - - // --mips-plt-got - cl::opt - MipsPLTGOT("mips-plt-got", - cl::desc("Display the MIPS GOT and PLT GOT sections")); - - // --mips-abi-flags - cl::opt MipsABIFlags("mips-abi-flags", - cl::desc("Display the MIPS.abiflags section")); - - // --mips-reginfo - cl::opt MipsReginfo("mips-reginfo", - cl::desc("Display the MIPS .reginfo section")); - - // --mips-options - cl::opt MipsOptions("mips-options", - cl::desc("Display the MIPS .MIPS.options section")); + // --arch-specific + cl::opt ArchSpecificInfo("arch-specific", + cl::desc("Displays architecture-specific information, if there is any.")); + cl::alias ArchSpecifcInfoShort("A", cl::desc("Alias for --arch-specific"), + cl::aliasopt(ArchSpecificInfo), cl::NotHidden); // --coff-imports cl::opt @@ -324,6 +309,11 @@ namespace opts { PrintStackMap("stackmap", cl::desc("Display contents of stackmap section")); + // --stack-sizes + cl::opt + PrintStackSizes("stack-sizes", + cl::desc("Display contents of all stack sizes sections")); + // --version-info, -V cl::opt VersionInfo("version-info", @@ -368,63 +358,45 @@ namespace opts { HelpResponse("\nPass @FILE as argument to read options from FILE.\n"); } // namespace opts +static StringRef ToolName; + namespace llvm { -LLVM_ATTRIBUTE_NORETURN void reportError(Twine Msg) { +LLVM_ATTRIBUTE_NORETURN static void error(Twine Msg) { + // Flush the standard output to print the error at a + // proper place. fouts().flush(); errs() << "\n"; - WithColor::error(errs()) << Msg << "\n"; + WithColor::error(errs(), ToolName) << Msg << "\n"; exit(1); } -void reportError(StringRef Input, Error Err) { +LLVM_ATTRIBUTE_NORETURN void reportError(Error Err, StringRef Input) { + assert(Err); if (Input == "-") Input = ""; - error(createFileError(Input, std::move(Err))); + handleAllErrors(createFileError(Input, std::move(Err)), + [&](const ErrorInfoBase &EI) { error(EI.message()); }); + llvm_unreachable("error() call should never return"); } -void reportWarning(Twine Msg) { - fouts().flush(); - errs() << "\n"; - WithColor::warning(errs()) << Msg << "\n"; -} - -void warn(Error Err) { - handleAllErrors(std::move(Err), [&](const ErrorInfoBase &EI) { - reportWarning(EI.message()); - }); -} - -void error(Error EC) { - if (!EC) - return; - handleAllErrors(std::move(EC), - [&](const ErrorInfoBase &EI) { reportError(EI.message()); }); -} +void reportWarning(Error Err, StringRef Input) { + assert(Err); + if (Input == "-") + Input = ""; -void error(std::error_code EC) { - if (!EC) - return; - reportError(EC.message()); + // Flush the standard output to print the warning at a + // proper place. 
+ fouts().flush(); + handleAllErrors( + createFileError(Input, std::move(Err)), [&](const ErrorInfoBase &EI) { + errs() << "\n"; + WithColor::warning(errs(), ToolName) << EI.message() << "\n"; + }); } } // namespace llvm -static void reportError(StringRef Input, std::error_code EC) { - reportError(Input, errorCodeToError(EC)); -} - -static bool isMipsArch(unsigned Arch) { - switch (Arch) { - case llvm::Triple::mips: - case llvm::Triple::mipsel: - case llvm::Triple::mips64: - case llvm::Triple::mips64el: - return true; - default: - return false; - } -} namespace { struct ReadObjTypeTableBuilder { ReadObjTypeTableBuilder() @@ -471,19 +443,19 @@ static void dumpObject(const ObjectFile *Obj, ScopedPrinter &Writer, std::unique_ptr Dumper; if (std::error_code EC = createDumper(Obj, Writer, Dumper)) - reportError(FileStr, EC); + reportError(errorCodeToError(EC), FileStr); - Writer.startLine() << "\n"; - if (opts::Output == opts::LLVM) { + if (opts::Output == opts::LLVM || opts::InputFilenames.size() > 1 || A) { + Writer.startLine() << "\n"; Writer.printString("File", FileStr); + } + if (opts::Output == opts::LLVM) { Writer.printString("Format", Obj->getFileFormatName()); Writer.printString("Arch", Triple::getArchTypeName( (llvm::Triple::ArchType)Obj->getArch())); Writer.printString("AddressSize", formatv("{0}bit", 8 * Obj->getBytesInAddress())); Dumper->printLoadName(); - } else if (opts::Output == opts::GNU && A) { - Writer.printString("File", FileStr); } if (opts::FileHeaders) @@ -519,19 +491,8 @@ static void dumpObject(const ObjectFile *Obj, ScopedPrinter &Writer, if (Obj->isELF()) { if (opts::ELFLinkerOptions) Dumper->printELFLinkerOptions(); - if (Obj->getArch() == llvm::Triple::arm) - if (opts::ARMAttributes) - Dumper->printAttributes(); - if (isMipsArch(Obj->getArch())) { - if (opts::MipsPLTGOT) - Dumper->printMipsPLTGOT(); - if (opts::MipsABIFlags) - Dumper->printMipsABIFlags(); - if (opts::MipsReginfo) - Dumper->printMipsReginfo(); - if (opts::MipsOptions) - Dumper->printMipsOptions(); - } + if (opts::ArchSpecificInfo) + Dumper->printArchSpecificInfo(); if (opts::SectionGroups) Dumper->printGroupSections(); if (opts::HashHistogram) @@ -583,6 +544,8 @@ static void dumpObject(const ObjectFile *Obj, ScopedPrinter &Writer, } if (opts::PrintStackMap) Dumper->printStackMap(); + if (opts::PrintStackSizes) + Dumper->printStackSizes(); } /// Dumps each object file in \a Arc; @@ -591,9 +554,8 @@ static void dumpArchive(const Archive *Arc, ScopedPrinter &Writer) { for (auto &Child : Arc->children(Err)) { Expected> ChildOrErr = Child.getAsBinary(); if (!ChildOrErr) { - if (auto E = isNotObjectErrorInvalidFileType(ChildOrErr.takeError())) { - reportError(Arc->getFileName(), std::move(E)); - } + if (auto E = isNotObjectErrorInvalidFileType(ChildOrErr.takeError())) + reportError(std::move(E), Arc->getFileName()); continue; } if (ObjectFile *Obj = dyn_cast(&*ChildOrErr.get())) @@ -601,10 +563,11 @@ static void dumpArchive(const Archive *Arc, ScopedPrinter &Writer) { else if (COFFImportFile *Imp = dyn_cast(&*ChildOrErr.get())) dumpCOFFImportFile(Imp, Writer); else - reportError(Arc->getFileName(), readobj_error::unrecognized_file_format); + reportError(errorCodeToError(readobj_error::unrecognized_file_format), + Arc->getFileName()); } if (Err) - reportError(Arc->getFileName(), std::move(Err)); + reportError(std::move(Err), Arc->getFileName()); } /// Dumps each object file in \a MachO Universal Binary; @@ -614,9 +577,8 @@ static void dumpMachOUniversalBinary(const MachOUniversalBinary *UBinary, Expected> 
ObjOrErr = Obj.getAsObjectFile(); if (ObjOrErr) dumpObject(&*ObjOrErr.get(), Writer); - else if (auto E = isNotObjectErrorInvalidFileType(ObjOrErr.takeError())) { - reportError(UBinary->getFileName(), ObjOrErr.takeError()); - } + else if (auto E = isNotObjectErrorInvalidFileType(ObjOrErr.takeError())) + reportError(ObjOrErr.takeError(), UBinary->getFileName()); else if (Expected> AOrErr = Obj.getAsArchive()) dumpArchive(&*AOrErr.get(), Writer); } @@ -627,7 +589,7 @@ static void dumpWindowsResourceFile(WindowsResource *WinRes, ScopedPrinter &Printer) { WindowsRes::Dumper Dumper(WinRes, Printer); if (auto Err = Dumper.printData()) - reportError(WinRes->getFileName(), std::move(Err)); + reportError(std::move(Err), WinRes->getFileName()); } @@ -636,7 +598,7 @@ static void dumpInput(StringRef File, ScopedPrinter &Writer) { // Attempt to open the binary. Expected> BinaryOrErr = createBinary(File); if (!BinaryOrErr) - reportError(File, BinaryOrErr.takeError()); + reportError(BinaryOrErr.takeError(), File); Binary &Binary = *BinaryOrErr.get().getBinary(); if (Archive *Arc = dyn_cast(&Binary)) @@ -651,7 +613,8 @@ static void dumpInput(StringRef File, ScopedPrinter &Writer) { else if (WindowsResource *WinRes = dyn_cast(&Binary)) dumpWindowsResourceFile(WinRes, Writer); else - reportError(File, readobj_error::unrecognized_file_format); + reportError(errorCodeToError(readobj_error::unrecognized_file_format), + File); CVTypes.Binaries.push_back(std::move(*BinaryOrErr)); } @@ -702,6 +665,7 @@ static void registerReadelfAliases() { int main(int argc, const char *argv[]) { InitLLVM X(argc, argv); + ToolName = argv[0]; // Register the target printer for --version. cl::AddExtraVersionPrinter(TargetRegistry::printRegisteredTargetsForVersion); @@ -727,6 +691,10 @@ int main(int argc, const char *argv[]) { opts::UnwindInfo = true; opts::SectionGroups = true; opts::HashHistogram = true; + if (opts::Output == opts::LLVM) { + opts::Addrsig = true; + opts::PrintStackSizes = true; + } } if (opts::Headers) { diff --git a/tools/llvm-readobj/llvm-readobj.h b/tools/llvm-readobj/llvm-readobj.h index 0e02da4cb847..d9813f5dea62 100644 --- a/tools/llvm-readobj/llvm-readobj.h +++ b/tools/llvm-readobj/llvm-readobj.h @@ -21,30 +21,13 @@ namespace llvm { } // Various helper functions. 
- LLVM_ATTRIBUTE_NORETURN void reportError(Twine Msg); - void reportError(StringRef Input, Error Err); - void reportWarning(Twine Msg); - void warn(llvm::Error Err); - void error(std::error_code EC); - void error(llvm::Error EC); - template T error(llvm::Expected &&E) { - error(E.takeError()); - return std::move(*E); - } + LLVM_ATTRIBUTE_NORETURN void reportError(Error Err, StringRef Input); + void reportWarning(Error Err, StringRef Input); - template T unwrapOrError(ErrorOr EO) { - if (EO) - return *EO; - reportError(EO.getError().message()); - } - template T unwrapOrError(Expected EO) { + template T unwrapOrError(StringRef Input, Expected EO) { if (EO) return *EO; - std::string Buf; - raw_string_ostream OS(Buf); - logAllUnhandledErrors(EO.takeError(), OS); - OS.flush(); - reportError(Buf); + reportError(EO.takeError(), Input); } } // namespace llvm diff --git a/tools/llvm-reduce/CMakeLists.txt b/tools/llvm-reduce/CMakeLists.txt new file mode 100644 index 000000000000..48de0ffa78a1 --- /dev/null +++ b/tools/llvm-reduce/CMakeLists.txt @@ -0,0 +1,26 @@ +set(LLVM_LINK_COMPONENTS + AllTargetsAsmParsers + AllTargetsCodeGens + AllTargetsDescs + AllTargetsInfos + Core + IRReader + Support + Target + TransformUtils + ) + +add_llvm_tool(llvm-reduce + llvm-reduce.cpp + TestRunner.cpp + deltas/Delta.cpp + deltas/ReduceFunctions.cpp + deltas/ReduceGlobalVars.cpp + deltas/ReduceMetadata.cpp + deltas/ReduceArguments.cpp + deltas/ReduceBasicBlocks.cpp + deltas/ReduceInstructions.cpp + + DEPENDS + intrinsics_gen + ) diff --git a/tools/llvm-reduce/DeltaManager.h b/tools/llvm-reduce/DeltaManager.h new file mode 100644 index 000000000000..2309c3adf4e6 --- /dev/null +++ b/tools/llvm-reduce/DeltaManager.h @@ -0,0 +1,36 @@ +//===- DeltaManager.h - Runs Delta Passes to reduce Input -----------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file calls each specialized Delta pass in order to reduce the input IR +// file. +// +//===----------------------------------------------------------------------===// + +#include "TestRunner.h" +#include "deltas/Delta.h" +#include "deltas/ReduceArguments.h" +#include "deltas/ReduceBasicBlocks.h" +#include "deltas/ReduceFunctions.h" +#include "deltas/ReduceGlobalVars.h" +#include "deltas/ReduceMetadata.h" +#include "deltas/ReduceInstructions.h" + +namespace llvm { + +// TODO: Add CLI option to run only specified Passes (for unit tests) +inline void runDeltaPasses(TestRunner &Tester) { + reduceFunctionsDeltaPass(Tester); + reduceBasicBlocksDeltaPass(Tester); + reduceGlobalsDeltaPass(Tester); + reduceMetadataDeltaPass(Tester); + reduceArgumentsDeltaPass(Tester); + reduceInstructionsDeltaPass(Tester); + // TODO: Implement the remaining Delta Passes +} + +} // namespace llvm diff --git a/tools/llvm-reduce/LLVMBuild.txt b/tools/llvm-reduce/LLVMBuild.txt new file mode 100644 index 000000000000..7928f0503283 --- /dev/null +++ b/tools/llvm-reduce/LLVMBuild.txt @@ -0,0 +1,24 @@ +;===- ./tools/llvm-reduce/LLVMBuild.txt ------------------------*- Conf -*--===; +; +; Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +; See https://llvm.org/LICENSE.txt for license information. 
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +; +;===------------------------------------------------------------------------===; +; +; This is an LLVMBuild description file for the components in this subdirectory. +; +; For more information on the LLVMBuild system, please see: +; +; http://llvm.org/docs/LLVMBuild.html +; +;===------------------------------------------------------------------------===; + +[component_0] +type = Tool +name = llvm-reduce +parent = Tools +required_libraries = + BitReader + IRReader + all-targets diff --git a/tools/llvm-reduce/TestRunner.cpp b/tools/llvm-reduce/TestRunner.cpp new file mode 100644 index 000000000000..d0e195d5697c --- /dev/null +++ b/tools/llvm-reduce/TestRunner.cpp @@ -0,0 +1,42 @@ +//===-- TestRunner.cpp ----------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "TestRunner.h" + +using namespace llvm; + +TestRunner::TestRunner(StringRef TestName, const std::vector &TestArgs) + : TestName(TestName), TestArgs(TestArgs) { +} + +/// Runs the interestingness test, passes file to be tested as first argument +/// and other specified test arguments after that. +int TestRunner::run(StringRef Filename) { + std::vector ProgramArgs; + ProgramArgs.push_back(TestName); + + for (const auto &Arg : TestArgs) + ProgramArgs.push_back(Arg); + + ProgramArgs.push_back(Filename); + + std::string ErrMsg; + int Result = sys::ExecuteAndWait( + TestName, ProgramArgs, /*Env=*/None, /*Redirects=*/None, + /*SecondsToWait=*/0, /*MemoryLimit=*/0, &ErrMsg); + + if (Result < 0) { + Error E = make_error("Error running interesting-ness test: " + + ErrMsg, + inconvertibleErrorCode()); + errs() << toString(std::move(E)); + exit(1); + } + + return !Result; +} diff --git a/tools/llvm-reduce/TestRunner.h b/tools/llvm-reduce/TestRunner.h new file mode 100644 index 000000000000..2270d6bd90b2 --- /dev/null +++ b/tools/llvm-reduce/TestRunner.h @@ -0,0 +1,46 @@ +//===-- tools/llvm-reduce/TestRunner.h ---------------------------*- C++ -*-===/ +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVMREDUCE_TESTRUNNER_H +#define LLVM_TOOLS_LLVMREDUCE_TESTRUNNER_H + +#include "llvm/ADT/SmallString.h" +#include "llvm/IR/Module.h" +#include "llvm/Support/Error.h" +#include "llvm/Support/FileSystem.h" +#include "llvm/Support/Path.h" +#include "llvm/Support/Program.h" +#include + +namespace llvm { + +// This class contains all the info necessary for running the provided +// interesting-ness test, as well as the most reduced module and its +// respective filename. 
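An aside from the editor, not part of the imported patch: the class described above is easiest to understand from the caller's side. The sketch below shows one plausible way to drive it, mirroring what llvm-reduce.cpp later in this patch does; the script name, file name, and flag are invented for illustration, and the element type of the argument vector is assumed.

// Sketch only: driving TestRunner by hand (hypothetical names throughout).
#include "TestRunner.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IRReader/IRReader.h"
#include "llvm/Support/SourceMgr.h"
#include <string>
#include <vector>

static bool candidateStillInteresting(llvm::LLVMContext &Ctx) {
  // TestRunner stores a reference to the argument vector, so keep it alive
  // for as long as the runner is used (assumed to be std::vector<std::string>).
  std::vector<std::string> Args = {"--timeout=10"};  // hypothetical flag
  llvm::TestRunner Tester("./interesting.sh", Args); // hypothetical script

  llvm::SMDiagnostic Err;
  Tester.setProgram(llvm::parseIRFile("candidate.ll", Err, Ctx));
  // Delta.cpp (below) treats a non-zero return from run() as "the candidate
  // is still interesting".
  return Tester.run("candidate.ll");
}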
+class TestRunner { +public: + TestRunner(StringRef TestName, const std::vector &TestArgs); + + /// Runs the interesting-ness test for the specified file + /// @returns 0 if test was successful, 1 if otherwise + int run(StringRef Filename); + + /// Returns the most reduced version of the original testcase + Module *getProgram() const { return Program.get(); } + + void setProgram(std::unique_ptr P) { Program = std::move(P); } + +private: + StringRef TestName; + const std::vector &TestArgs; + std::unique_ptr Program; +}; + +} // namespace llvm + +#endif diff --git a/tools/llvm-reduce/deltas/Delta.cpp b/tools/llvm-reduce/deltas/Delta.cpp new file mode 100644 index 000000000000..0642241ddebd --- /dev/null +++ b/tools/llvm-reduce/deltas/Delta.cpp @@ -0,0 +1,162 @@ +//===- Delta.cpp - Delta Debugging Algorithm Implementation ---------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains the implementation for the Delta Debugging Algorithm: +// it splits a given set of Targets (i.e. Functions, Instructions, BBs, etc.) +// into chunks and tries to reduce the number chunks that are interesting. +// +//===----------------------------------------------------------------------===// + +#include "Delta.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/Support/ToolOutputFile.h" +#include "llvm/Transforms/Utils/Cloning.h" +#include +#include + +using namespace llvm; + +bool IsReduced(Module &M, TestRunner &Test, SmallString<128> &CurrentFilepath) { + // Write Module to tmp file + int FD; + std::error_code EC = + sys::fs::createTemporaryFile("llvm-reduce", "ll", FD, CurrentFilepath); + if (EC) { + errs() << "Error making unique filename: " << EC.message() << "!\n"; + exit(1); + } + + ToolOutputFile Out(CurrentFilepath, FD); + M.print(Out.os(), /*AnnotationWriter=*/nullptr); + Out.os().close(); + if (Out.os().has_error()) { + errs() << "Error emitting bitcode to file '" << CurrentFilepath << "'!\n"; + exit(1); + } + + // Current Chunks aren't interesting + return Test.run(CurrentFilepath); +} + +/// Counts the amount of lines for a given file +static int getLines(StringRef Filepath) { + int Lines = 0; + std::string CurrLine; + std::ifstream FileStream(Filepath); + + while (std::getline(FileStream, CurrLine)) + ++Lines; + + return Lines; +} + +/// Splits Chunks in half and prints them. +/// If unable to split (when chunk size is 1) returns false. +static bool increaseGranularity(std::vector &Chunks) { + errs() << "Increasing granularity..."; + std::vector NewChunks; + bool SplitOne = false; + + for (auto &C : Chunks) { + if (C.end - C.begin == 0) + NewChunks.push_back(C); + else { + int Half = (C.begin + C.end) / 2; + NewChunks.push_back({C.begin, Half}); + NewChunks.push_back({Half + 1, C.end}); + SplitOne = true; + } + } + if (SplitOne) { + Chunks = NewChunks; + errs() << "Success! New Chunks:\n"; + for (auto C : Chunks) { + errs() << '\t'; + C.print(); + errs() << '\n'; + } + } + return SplitOne; +} + +/// Runs the Delta Debugging algorithm, splits the code into chunks and +/// reduces the amount of chunks that are considered interesting by the +/// given test. 
+void llvm::runDeltaPass( + TestRunner &Test, int Targets, + std::function &, Module *)> + ExtractChunksFromModule) { + assert(Targets >= 0); + if (!Targets) { + errs() << "\nNothing to reduce\n"; + return; + } + + if (Module *Program = Test.getProgram()) { + SmallString<128> CurrentFilepath; + if (!IsReduced(*Program, Test, CurrentFilepath)) { + errs() << "\nInput isn't interesting! Verify interesting-ness test\n"; + exit(1); + } + } + + std::vector Chunks = {{1, Targets}}; + std::set UninterestingChunks; + std::unique_ptr ReducedProgram; + + if (!increaseGranularity(Chunks)) { + errs() << "\nAlready at minimum size. Cannot reduce anymore.\n"; + return; + } + + do { + UninterestingChunks = {}; + for (int I = Chunks.size() - 1; I >= 0; --I) { + std::vector CurrentChunks; + + for (auto C : Chunks) + if (!UninterestingChunks.count(C) && C != Chunks[I]) + CurrentChunks.push_back(C); + + if (CurrentChunks.empty()) + continue; + + // Clone module before hacking it up.. + std::unique_ptr Clone = CloneModule(*Test.getProgram()); + // Generate Module with only Targets inside Current Chunks + ExtractChunksFromModule(CurrentChunks, Clone.get()); + + errs() << "Ignoring: "; + Chunks[I].print(); + for (auto C : UninterestingChunks) + C.print(); + + + + SmallString<128> CurrentFilepath; + if (!IsReduced(*Clone, Test, CurrentFilepath)) { + errs() << "\n"; + continue; + } + + UninterestingChunks.insert(Chunks[I]); + ReducedProgram = std::move(Clone); + errs() << " **** SUCCESS | lines: " << getLines(CurrentFilepath) << "\n"; + } + // Delete uninteresting chunks + erase_if(Chunks, [&UninterestingChunks](const Chunk &C) { + return UninterestingChunks.count(C); + }); + + } while (!UninterestingChunks.empty() || increaseGranularity(Chunks)); + + // If we reduced the testcase replace it + if (ReducedProgram) + Test.setProgram(std::move(ReducedProgram)); + errs() << "Couldn't increase anymore.\n"; +} diff --git a/tools/llvm-reduce/deltas/Delta.h b/tools/llvm-reduce/deltas/Delta.h new file mode 100644 index 000000000000..dbb18e4bd07f --- /dev/null +++ b/tools/llvm-reduce/deltas/Delta.h @@ -0,0 +1,76 @@ +//===- Delta.h - Delta Debugging Algorithm Implementation -----------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains the implementation for the Delta Debugging Algorithm: +// it splits a given set of Targets (i.e. Functions, Instructions, BBs, etc.) +// into chunks and tries to reduce the number chunks that are interesting. 
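For illustration only, and again not part of the patch itself: the splitting step described above is easiest to see on concrete numbers. This standalone sketch reproduces the arithmetic of increaseGranularity() for ten hypothetical targets and depends on nothing from LLVM; the local Chunk struct simply mirrors the one declared in Delta.h.

// Sketch only: how chunk granularity increases round by round.
#include <cstdio>
#include <vector>

struct Chunk {
  int begin, end;
};

static std::vector<Chunk> splitChunks(const std::vector<Chunk> &Chunks) {
  std::vector<Chunk> Out;
  for (const Chunk &C : Chunks) {
    if (C.end - C.begin == 0) { // single target: cannot be split further
      Out.push_back(C);
      continue;
    }
    int Half = (C.begin + C.end) / 2;
    Out.push_back({C.begin, Half});
    Out.push_back({Half + 1, C.end});
  }
  return Out;
}

int main() {
  // Ten targets, e.g. ten functions in a module, start as one chunk [1,10].
  std::vector<Chunk> Chunks = {{1, 10}};
  for (int Round = 1; Round <= 3; ++Round) {
    Chunks = splitChunks(Chunks);
    std::printf("round %d:", Round);
    for (const Chunk &C : Chunks)
      std::printf(" [%d,%d]", C.begin, C.end);
    std::printf("\n");
  }
  // Prints:
  // round 1: [1,5] [6,10]
  // round 2: [1,3] [4,5] [6,8] [9,10]
  // round 3: [1,2] [3,3] [4,4] [5,5] [6,7] [8,8] [9,9] [10,10]
}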
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVMREDUCE_LLVMREDUCE_DELTA_H +#define LLVM_TOOLS_LLVMREDUCE_LLVMREDUCE_DELTA_H + +#include "TestRunner.h" +#include +#include +#include + +namespace llvm { + +struct Chunk { + int begin; + int end; + + /// Helper function to verify if a given Target-index is inside the Chunk + bool contains(int Index) const { return Index >= begin && Index <= end; } + + void print() const { + errs() << "[" << begin; + if (end - begin != 0) + errs() << "," << end; + errs() << "]"; + } + + /// Operator when populating CurrentChunks in Generic Delta Pass + friend bool operator!=(const Chunk &C1, const Chunk &C2) { + return C1.begin != C2.begin || C1.end != C2.end; + } + + /// Operator used for sets + friend bool operator<(const Chunk &C1, const Chunk &C2) { + return std::tie(C1.begin, C1.end) < std::tie(C2.begin, C2.end); + } +}; + +/// This function implements the Delta Debugging algorithm, it receives a +/// number of Targets (e.g. Functions, Instructions, Basic Blocks, etc.) and +/// splits them in half; these chunks of targets are then tested while ignoring +/// one chunk, if a chunk is proven to be uninteresting (i.e. fails the test) +/// it is removed from consideration. The algorithm will attempt to split the +/// Chunks in half and start the process again until it can't split chunks +/// anymore. +/// +/// This function is intended to be called by each specialized delta pass (e.g. +/// RemoveFunctions) and receives three key parameters: +/// * Test: The main TestRunner instance which is used to run the provided +/// interesting-ness test, as well as to store and access the reduced Program. +/// * Targets: The amount of Targets that are going to be reduced by the +/// algorithm, for example, the RemoveGlobalVars pass would send the amount of +/// initialized GVs. +/// * ExtractChunksFromModule: A function used to tailor the main program so it +/// only contains Targets that are inside Chunks of the given iteration. +/// Note: This function is implemented by each specialized Delta pass +/// +/// Other implementations of the Delta Debugging algorithm can also be found in +/// the CReduce, Delta, and Lithium projects. +void runDeltaPass(TestRunner &Test, int Targets, + std::function &, Module *)> + ExtractChunksFromModule); +} // namespace llvm + +#endif diff --git a/tools/llvm-reduce/deltas/ReduceArguments.cpp b/tools/llvm-reduce/deltas/ReduceArguments.cpp new file mode 100644 index 000000000000..f5f14b83f42c --- /dev/null +++ b/tools/llvm-reduce/deltas/ReduceArguments.cpp @@ -0,0 +1,125 @@ +//===- ReduceArguments.cpp - Specialized Delta Pass -----------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements a function which calls the Generic Delta pass in order +// to reduce uninteresting Arguments from defined functions. 
+// +//===----------------------------------------------------------------------===// + +#include "ReduceArguments.h" +#include "Delta.h" +#include "llvm/ADT/SmallVector.h" +#include +#include + +using namespace llvm; + +/// Goes over OldF calls and replaces them with a call to NewF +static void replaceFunctionCalls(Function &OldF, Function &NewF, + const std::set &ArgIndexesToKeep) { + const auto &Users = OldF.users(); + for (auto I = Users.begin(), E = Users.end(); I != E; ) + if (auto *CI = dyn_cast(*I++)) { + SmallVector Args; + for (auto ArgI = CI->arg_begin(), E = CI->arg_end(); ArgI != E; ++ArgI) + if (ArgIndexesToKeep.count(ArgI - CI->arg_begin())) + Args.push_back(*ArgI); + + CallInst *NewCI = CallInst::Create(&NewF, Args); + NewCI->setCallingConv(NewF.getCallingConv()); + if (!CI->use_empty()) + CI->replaceAllUsesWith(NewCI); + ReplaceInstWithInst(CI, NewCI); + } +} + +/// Removes out-of-chunk arguments from functions, and modifies their calls +/// accordingly. It also removes allocations of out-of-chunk arguments. +static void extractArgumentsFromModule(std::vector ChunksToKeep, + Module *Program) { + int I = 0, ArgCount = 0; + std::set ArgsToKeep; + std::vector Funcs; + // Get inside-chunk arguments, as well as their parent function + for (auto &F : *Program) + if (!F.isDeclaration()) { + Funcs.push_back(&F); + for (auto &A : F.args()) + if (I < (int)ChunksToKeep.size()) { + if (ChunksToKeep[I].contains(++ArgCount)) + ArgsToKeep.insert(&A); + if (ChunksToKeep[I].end == ArgCount) + ++I; + } + } + + for (auto *F : Funcs) { + ValueToValueMapTy VMap; + std::vector InstToDelete; + for (auto &A : F->args()) + if (!ArgsToKeep.count(&A)) { + // By adding undesired arguments to the VMap, CloneFunction will remove + // them from the resulting Function + VMap[&A] = UndefValue::get(A.getType()); + for (auto *U : A.users()) + if (auto *I = dyn_cast(*&U)) + InstToDelete.push_back(I); + } + // Delete any instruction that uses the argument + for (auto *I : InstToDelete) { + I->replaceAllUsesWith(UndefValue::get(I->getType())); + I->eraseFromParent(); + } + + // No arguments to reduce + if (VMap.empty()) + continue; + + std::set ArgIndexesToKeep; + int ArgI = 0; + for (auto &Arg : F->args()) + if (ArgsToKeep.count(&Arg)) + ArgIndexesToKeep.insert(++ArgI); + + auto *ClonedFunc = CloneFunction(F, VMap); + // In order to preserve function order, we move Clone after old Function + ClonedFunc->removeFromParent(); + Program->getFunctionList().insertAfter(F->getIterator(), ClonedFunc); + + replaceFunctionCalls(*F, *ClonedFunc, ArgIndexesToKeep); + // Rename Cloned Function to Old's name + std::string FName = F->getName(); + F->eraseFromParent(); + ClonedFunc->setName(FName); + } +} + +/// Counts the amount of arguments in non-declaration functions and prints their +/// respective name, index, and parent function name +static int countArguments(Module *Program) { + // TODO: Silence index with --quiet flag + outs() << "----------------------------\n"; + outs() << "Param Index Reference:\n"; + int ArgsCount = 0; + for (auto &F : *Program) + if (!F.isDeclaration() && F.arg_size()) { + outs() << " " << F.getName() << "\n"; + for (auto &A : F.args()) + outs() << "\t" << ++ArgsCount << ": " << A.getName() << "\n"; + + outs() << "----------------------------\n"; + } + + return ArgsCount; +} + +void llvm::reduceArgumentsDeltaPass(TestRunner &Test) { + outs() << "*** Reducing Arguments...\n"; + int ArgCount = countArguments(Test.getProgram()); + runDeltaPass(Test, ArgCount, extractArgumentsFromModule); +} diff 
--git a/tools/llvm-reduce/deltas/ReduceArguments.h b/tools/llvm-reduce/deltas/ReduceArguments.h new file mode 100644 index 000000000000..d9682b44f74d --- /dev/null +++ b/tools/llvm-reduce/deltas/ReduceArguments.h @@ -0,0 +1,21 @@ +//===- ReduceArguments.h - Specialized Delta Pass -------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements a function which calls the Generic Delta pass in order +// to reduce uninteresting Arguments from defined functions. +// +//===----------------------------------------------------------------------===// + +#include "Delta.h" +#include "llvm/IR/Argument.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Cloning.h" + +namespace llvm { +void reduceArgumentsDeltaPass(TestRunner &Test); +} // namespace llvm diff --git a/tools/llvm-reduce/deltas/ReduceBasicBlocks.cpp b/tools/llvm-reduce/deltas/ReduceBasicBlocks.cpp new file mode 100644 index 000000000000..03c3962d2fd9 --- /dev/null +++ b/tools/llvm-reduce/deltas/ReduceBasicBlocks.cpp @@ -0,0 +1,146 @@ +//===- ReduceArguments.cpp - Specialized Delta Pass -----------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements a function which calls the Generic Delta pass in order +// to reduce uninteresting Arguments from defined functions. 
+//
+//===----------------------------------------------------------------------===//
+
+#include "ReduceBasicBlocks.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/raw_ostream.h"
+#include <vector>
+
+using namespace llvm;
+
+/// Replaces BB Terminator with one that only contains Chunk BBs
+static void replaceBranchTerminator(BasicBlock &BB,
+                                    std::set<BasicBlock *> BBsToKeep) {
+  auto Term = BB.getTerminator();
+  std::vector<BasicBlock *> ChunkSuccessors;
+  for (auto Succ : successors(&BB))
+    if (BBsToKeep.count(Succ))
+      ChunkSuccessors.push_back(Succ);
+
+  // BB only references Chunk BBs
+  if (ChunkSuccessors.size() == Term->getNumSuccessors())
+    return;
+
+  bool IsBranch = isa<BranchInst>(Term);
+  Value *Address = nullptr;
+  if (auto IndBI = dyn_cast<IndirectBrInst>(Term))
+    Address = IndBI->getAddress();
+
+  Term->eraseFromParent();
+
+  if (ChunkSuccessors.empty()) {
+    ReturnInst::Create(BB.getContext(), nullptr, &BB);
+    return;
+  }
+
+  if (IsBranch)
+    BranchInst::Create(ChunkSuccessors[0], &BB);
+
+  if (Address) {
+    auto NewIndBI =
+        IndirectBrInst::Create(Address, ChunkSuccessors.size(), &BB);
+    for (auto Dest : ChunkSuccessors)
+      NewIndBI->addDestination(Dest);
+  }
+}
+
+/// Removes uninteresting BBs from a switch; if the default case ends up being
+/// uninteresting, the switch is replaced with a void return (since it has to
+/// be replaced with something).
+static void removeUninterestingBBsFromSwitch(SwitchInst &SwInst,
+                                             std::set<BasicBlock *> BBsToKeep) {
+  if (!BBsToKeep.count(SwInst.getDefaultDest())) {
+    ReturnInst::Create(SwInst.getContext(), nullptr, SwInst.getParent());
+    SwInst.eraseFromParent();
+  } else
+    for (int I = 0, E = SwInst.getNumCases(); I != E; ++I) {
+      auto Case = SwInst.case_begin() + I;
+      if (!BBsToKeep.count(Case->getCaseSuccessor())) {
+        SwInst.removeCase(Case);
+        --I;
+        --E;
+      }
+    }
+}
+
+/// Removes out-of-chunk basic blocks from functions, and updates the
+/// branches, switches, and phi nodes that reference them.
+static void extractBasicBlocksFromModule(std::vector ChunksToKeep, + Module *Program) { + int I = 0, BBCount = 0; + std::set BBsToKeep; + + for (auto &F : *Program) + for (auto &BB : F) + if (I < (int)ChunksToKeep.size()) { + if (ChunksToKeep[I].contains(++BBCount)) + BBsToKeep.insert(&BB); + if (ChunksToKeep[I].end == BBCount) + ++I; + } + + std::vector BBsToDelete; + for (auto &F : *Program) + for (auto &BB : F) { + if (!BBsToKeep.count(&BB)) { + BBsToDelete.push_back(&BB); + // Remove out-of-chunk BB from successor phi nodes + for (auto *Succ : successors(&BB)) + Succ->removePredecessor(&BB); + } + } + + // Replace terminators that reference out-of-chunk BBs + for (auto &F : *Program) + for (auto &BB : F) { + if (auto *SwInst = dyn_cast(BB.getTerminator())) + removeUninterestingBBsFromSwitch(*SwInst, BBsToKeep); + else + replaceBranchTerminator(BB, BBsToKeep); + } + + // Replace out-of-chunk switch uses + for (auto &BB : BBsToDelete) { + // Instructions might be referenced in other BBs + for (auto &I : *BB) + I.replaceAllUsesWith(UndefValue::get(I.getType())); + BB->eraseFromParent(); + } +} + +/// Counts the amount of basic blocks and prints their name & respective index +static int countBasicBlocks(Module *Program) { + // TODO: Silence index with --quiet flag + outs() << "----------------------------\n"; + int BBCount = 0; + for (auto &F : *Program) + for (auto &BB : F) { + if (BB.hasName()) + outs() << "\t" << ++BBCount << ": " << BB.getName() << "\n"; + else + outs() << "\t" << ++BBCount << ": Unnamed\n"; + } + + return BBCount; +} + +void llvm::reduceBasicBlocksDeltaPass(TestRunner &Test) { + outs() << "*** Reducing Basic Blocks...\n"; + int BBCount = countBasicBlocks(Test.getProgram()); + runDeltaPass(Test, BBCount, extractBasicBlocksFromModule); +} diff --git a/tools/llvm-reduce/deltas/ReduceBasicBlocks.h b/tools/llvm-reduce/deltas/ReduceBasicBlocks.h new file mode 100644 index 000000000000..cf76a0abbcd7 --- /dev/null +++ b/tools/llvm-reduce/deltas/ReduceBasicBlocks.h @@ -0,0 +1,20 @@ +//===- ReduceArguments.h - Specialized Delta Pass -------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements a function which calls the Generic Delta pass in order +// to reduce uninteresting Arguments from defined functions. +// +//===----------------------------------------------------------------------===// + +#include "Delta.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Cloning.h" + +namespace llvm { +void reduceBasicBlocksDeltaPass(TestRunner &Test); +} // namespace llvm diff --git a/tools/llvm-reduce/deltas/ReduceFunctions.cpp b/tools/llvm-reduce/deltas/ReduceFunctions.cpp new file mode 100644 index 000000000000..3382f35a945a --- /dev/null +++ b/tools/llvm-reduce/deltas/ReduceFunctions.cpp @@ -0,0 +1,77 @@ +//===- ReduceFunctions.cpp - Specialized Delta Pass -----------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements a function which calls the Generic Delta pass in order +// to reduce functions (and any instruction that calls it) in the provided +// Module. +// +//===----------------------------------------------------------------------===// + +#include "ReduceFunctions.h" +#include "Delta.h" +#include "llvm/ADT/SetVector.h" +#include + +using namespace llvm; + +/// Removes all the Defined Functions (as well as their calls) +/// that aren't inside any of the desired Chunks. +static void extractFunctionsFromModule(const std::vector &ChunksToKeep, + Module *Program) { + // Get functions inside desired chunks + std::set FuncsToKeep; + int I = 0, FunctionCount = 0; + for (auto &F : *Program) + if (I < (int)ChunksToKeep.size()) { + if (ChunksToKeep[I].contains(++FunctionCount)) + FuncsToKeep.insert(&F); + if (FunctionCount == ChunksToKeep[I].end) + ++I; + } + + // Delete out-of-chunk functions, and replace their calls with undef + std::vector FuncsToRemove; + SetVector CallsToRemove; + for (auto &F : *Program) + if (!FuncsToKeep.count(&F)) { + for (auto U : F.users()) + if (auto *Call = dyn_cast(U)) { + Call->replaceAllUsesWith(UndefValue::get(Call->getType())); + CallsToRemove.insert(Call); + } + F.replaceAllUsesWith(UndefValue::get(F.getType())); + FuncsToRemove.push_back(&F); + } + + for (auto *C : CallsToRemove) + C->eraseFromParent(); + + for (auto *F : FuncsToRemove) + F->eraseFromParent(); +} + +/// Counts the amount of non-declaration functions and prints their +/// respective name & index +static int countFunctions(Module *Program) { + // TODO: Silence index with --quiet flag + errs() << "----------------------------\n"; + errs() << "Function Index Reference:\n"; + int FunctionCount = 0; + for (auto &F : *Program) + errs() << "\t" << ++FunctionCount << ": " << F.getName() << "\n"; + + errs() << "----------------------------\n"; + return FunctionCount; +} + +void llvm::reduceFunctionsDeltaPass(TestRunner &Test) { + errs() << "*** Reducing Functions...\n"; + int Functions = countFunctions(Test.getProgram()); + runDeltaPass(Test, Functions, extractFunctionsFromModule); + errs() << "----------------------------\n"; +} diff --git a/tools/llvm-reduce/deltas/ReduceFunctions.h b/tools/llvm-reduce/deltas/ReduceFunctions.h new file mode 100644 index 000000000000..7c2cd3f33e9f --- /dev/null +++ b/tools/llvm-reduce/deltas/ReduceFunctions.h @@ -0,0 +1,20 @@ +//===- ReduceFunctions.h - Specialized Delta Pass -------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements a function which calls the Generic Delta pass in order +// to reduce functions (and any instruction that calls it) in the provided +// Module. 
+// +//===----------------------------------------------------------------------===// + +#include "Delta.h" +#include "llvm/Transforms/Utils/Cloning.h" + +namespace llvm { +void reduceFunctionsDeltaPass(TestRunner &Test); +} // namespace llvm diff --git a/tools/llvm-reduce/deltas/ReduceGlobalVars.cpp b/tools/llvm-reduce/deltas/ReduceGlobalVars.cpp new file mode 100644 index 000000000000..5732208ee0a9 --- /dev/null +++ b/tools/llvm-reduce/deltas/ReduceGlobalVars.cpp @@ -0,0 +1,74 @@ +//===- ReduceGlobalVars.cpp - Specialized Delta Pass ----------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements a function which calls the Generic Delta pass in order +// to reduce initialized Global Variables in the provided Module. +// +//===----------------------------------------------------------------------===// + +#include "ReduceGlobalVars.h" +#include + +using namespace llvm; + +/// Removes all the Initialized GVs that aren't inside the desired Chunks. +static void extractGVsFromModule(std::vector ChunksToKeep, + Module *Program) { + // Get GVs inside desired chunks + std::set GVsToKeep; + int I = 0, GVCount = 0; + for (auto &GV : Program->globals()) + if (GV.hasInitializer() && I < (int)ChunksToKeep.size()) { + if (ChunksToKeep[I].contains(++GVCount)) + GVsToKeep.insert(&GV); + if (GVCount == ChunksToKeep[I].end) + ++I; + } + + // Delete out-of-chunk GVs and their uses + std::vector ToRemove; + std::vector InstToRemove; + for (auto &GV : Program->globals()) + if (GV.hasInitializer() && !GVsToKeep.count(&GV)) { + for (auto U : GV.users()) + if (auto *Inst = dyn_cast(U)) + InstToRemove.push_back(Inst); + + GV.replaceAllUsesWith(UndefValue::get(GV.getType())); + ToRemove.push_back(&GV); + } + + // Delete Instruction uses of unwanted GVs + for (auto *Inst : InstToRemove) { + Inst->replaceAllUsesWith(UndefValue::get(Inst->getType())); + Inst->eraseFromParent(); + } + + for (auto *GV : ToRemove) + GV->eraseFromParent(); +} + +/// Counts the amount of initialized GVs and displays their +/// respective name & index +static int countGVs(Module *Program) { + // TODO: Silence index with --quiet flag + outs() << "----------------------------\n"; + outs() << "GlobalVariable Index Reference:\n"; + int GVCount = 0; + for (auto &GV : Program->globals()) + if (GV.hasInitializer()) + outs() << "\t" << ++GVCount << ": " << GV.getName() << "\n"; + outs() << "----------------------------\n"; + return GVCount; +} + +void llvm::reduceGlobalsDeltaPass(TestRunner &Test) { + outs() << "*** Reducing GVs...\n"; + int GVCount = countGVs(Test.getProgram()); + runDeltaPass(Test, GVCount, extractGVsFromModule); +} diff --git a/tools/llvm-reduce/deltas/ReduceGlobalVars.h b/tools/llvm-reduce/deltas/ReduceGlobalVars.h new file mode 100644 index 000000000000..d4a870aded58 --- /dev/null +++ b/tools/llvm-reduce/deltas/ReduceGlobalVars.h @@ -0,0 +1,20 @@ +//===- ReduceGlobalVars.h - Specialized Delta Pass ------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a function which calls the Generic Delta pass in order
+// to reduce initialized Global Variables in the provided Module.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Delta.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+
+namespace llvm {
+void reduceGlobalsDeltaPass(TestRunner &Test);
+} // namespace llvm
diff --git a/tools/llvm-reduce/deltas/ReduceInstructions.cpp b/tools/llvm-reduce/deltas/ReduceInstructions.cpp
new file mode 100644
index 000000000000..b3497ad2dc02
--- /dev/null
+++ b/tools/llvm-reduce/deltas/ReduceInstructions.cpp
@@ -0,0 +1,65 @@
+//===- ReduceInstructions.cpp - Specialized Delta Pass --------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a function which calls the Generic Delta pass in order
+// to reduce uninteresting Instructions from defined functions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ReduceInstructions.h"
+
+using namespace llvm;
+
+/// Removes out-of-chunk instructions from the module, replacing any remaining
+/// uses with undef.
+static void extractInstrFromModule(std::vector<Chunk> ChunksToKeep,
+                                   Module *Program) {
+  int I = 0, InstCount = 0;
+  std::set<Instruction *> InstToKeep;
+
+  for (auto &F : *Program)
+    for (auto &BB : F)
+      for (auto &Inst : BB)
+        if (I < (int)ChunksToKeep.size()) {
+          if (ChunksToKeep[I].contains(++InstCount))
+            InstToKeep.insert(&Inst);
+          if (ChunksToKeep[I].end == InstCount)
+            ++I;
+        }
+
+  std::vector<Instruction *> InstToDelete;
+  for (auto &F : *Program)
+    for (auto &BB : F)
+      for (auto &Inst : BB)
+        if (!InstToKeep.count(&Inst)) {
+          Inst.replaceAllUsesWith(UndefValue::get(Inst.getType()));
+          InstToDelete.push_back(&Inst);
+        }
+
+  for (auto &I : InstToDelete)
+    I->eraseFromParent();
+}
+
+/// Counts the number of instructions in the module and prints the total
+static unsigned countInstructions(Module *Program) {
+  // TODO: Silence index with --quiet flag
+  outs() << "----------------------------\n";
+  int InstCount = 0;
+  for (auto &F : *Program)
+    for (auto &BB : F)
+      InstCount += BB.getInstList().size();
+  outs() << "Number of instructions: " << InstCount << "\n";
+
+  return InstCount;
+}
+
+void llvm::reduceInstructionsDeltaPass(TestRunner &Test) {
+  outs() << "*** Reducing Instructions...\n";
+  unsigned InstCount = countInstructions(Test.getProgram());
+  runDeltaPass(Test, InstCount, extractInstrFromModule);
+}
diff --git a/tools/llvm-reduce/deltas/ReduceInstructions.h b/tools/llvm-reduce/deltas/ReduceInstructions.h
new file mode 100644
index 000000000000..a9266acd051a
--- /dev/null
+++ b/tools/llvm-reduce/deltas/ReduceInstructions.h
@@ -0,0 +1,20 @@
+//===- ReduceInstructions.h - Specialized Delta Pass ----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements a function which calls the Generic Delta pass in order +// to reduce uninteresting Arguments from defined functions. +// +//===----------------------------------------------------------------------===// + +#include "Delta.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Cloning.h" + +namespace llvm { +void reduceInstructionsDeltaPass(TestRunner &Test); +} // namespace llvm diff --git a/tools/llvm-reduce/deltas/ReduceMetadata.cpp b/tools/llvm-reduce/deltas/ReduceMetadata.cpp new file mode 100644 index 000000000000..4ea223546efa --- /dev/null +++ b/tools/llvm-reduce/deltas/ReduceMetadata.cpp @@ -0,0 +1,138 @@ +//===- ReduceMetadata.cpp - Specialized Delta Pass ------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements two functions used by the Generic Delta Debugging +// Algorithm, which are used to reduce Metadata nodes. +// +//===----------------------------------------------------------------------===// + +#include "ReduceMetadata.h" +#include "Delta.h" +#include "llvm/ADT/SmallVector.h" +#include +#include + +using namespace llvm; + +/// Adds all Unnamed Metadata Nodes that are inside desired Chunks to set +template +static void getChunkMetadataNodes(T &MDUser, int &I, + const std::vector &ChunksToKeep, + std::set &SeenNodes, + std::set &NodesToKeep) { + SmallVector, 4> MDs; + MDUser.getAllMetadata(MDs); + for (auto &MD : MDs) { + SeenNodes.insert(MD.second); + if (I < (int)ChunksToKeep.size()) { + if (ChunksToKeep[I].contains(SeenNodes.size())) + NodesToKeep.insert(MD.second); + if (ChunksToKeep[I].end == (int)SeenNodes.size()) + ++I; + } + } +} + +/// Erases out-of-chunk unnamed metadata nodes from its user +template +static void eraseMetadataIfOutsideChunk(T &MDUser, + const std::set &NodesToKeep) { + SmallVector, 4> MDs; + MDUser.getAllMetadata(MDs); + for (int I = 0, E = MDs.size(); I != E; ++I) + if (!NodesToKeep.count(MDs[I].second)) + MDUser.setMetadata(I, NULL); +} + +/// Removes all the Named and Unnamed Metadata Nodes, as well as any debug +/// functions that aren't inside the desired Chunks. 
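One more illustrative aside, not part of the patch: the two templated helpers above are thin wrappers around the metadata-attachment API. The sketch below shows that API in isolation on a single instruction; the helper name is invented. Note that setMetadata() takes the metadata kind ID carried in the pair returned by getAllMetadata(), and passing nullptr detaches the node.

// Sketch only: enumerating and detaching metadata attachments.
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Metadata.h"
#include <set>
#include <utility>

static void dropAttachmentsNotIn(llvm::Instruction &I,
                                 const std::set<llvm::MDNode *> &NodesToKeep) {
  llvm::SmallVector<std::pair<unsigned, llvm::MDNode *>, 4> MDs;
  I.getAllMetadata(MDs); // (kind ID, node) pairs attached to this instruction
  for (const auto &KindAndNode : MDs)
    if (!NodesToKeep.count(KindAndNode.second))
      I.setMetadata(KindAndNode.first, nullptr); // detach this attachment
}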
+static void extractMetadataFromModule(const std::vector &ChunksToKeep, + Module *Program) { + std::set SeenNodes; + std::set NodesToKeep; + int I = 0; + + // Add chunk MDNodes used by GVs, Functions, and Instructions to set + for (auto &GV : Program->globals()) + getChunkMetadataNodes(GV, I, ChunksToKeep, SeenNodes, NodesToKeep); + + for (auto &F : *Program) { + getChunkMetadataNodes(F, I, ChunksToKeep, SeenNodes, NodesToKeep); + for (auto &BB : F) + for (auto &Inst : BB) + getChunkMetadataNodes(Inst, I, ChunksToKeep, SeenNodes, NodesToKeep); + } + + // Once more, go over metadata nodes, but deleting the ones outside chunks + for (auto &GV : Program->globals()) + eraseMetadataIfOutsideChunk(GV, NodesToKeep); + + for (auto &F : *Program) { + eraseMetadataIfOutsideChunk(F, NodesToKeep); + for (auto &BB : F) + for (auto &Inst : BB) + eraseMetadataIfOutsideChunk(Inst, NodesToKeep); + } + + + // Get out-of-chunk Named metadata nodes + unsigned MetadataCount = SeenNodes.size(); + std::vector NamedNodesToDelete; + for (auto &MD : Program->named_metadata()) { + if (I < (int)ChunksToKeep.size()) { + if (!ChunksToKeep[I].contains(++MetadataCount)) + NamedNodesToDelete.push_back(&MD); + if (ChunksToKeep[I].end == (int)SeenNodes.size()) + ++I; + } else + NamedNodesToDelete.push_back(&MD); + } + + for (auto *NN : NamedNodesToDelete) { + for (int I = 0, E = NN->getNumOperands(); I != E; ++I) + NN->setOperand(I, NULL); + NN->eraseFromParent(); + } +} + +// Gets unnamed metadata nodes used by a given instruction/GV/function and adds +// them to the set of seen nodes +template +static void addMetadataToSet(T &MDUser, std::set &UnnamedNodes) { + SmallVector, 4> MDs; + MDUser.getAllMetadata(MDs); + for (auto &MD : MDs) + UnnamedNodes.insert(MD.second); +} + +/// Returns the amount of Named and Unnamed Metadata Nodes +static int countMetadataTargets(Module *Program) { + std::set UnnamedNodes; + int NamedMetadataNodes = Program->named_metadata_size(); + + // Get metadata nodes used by globals + for (auto &GV : Program->globals()) + addMetadataToSet(GV, UnnamedNodes); + + // Do the same for nodes used by functions & instructions + for (auto &F : *Program) { + addMetadataToSet(F, UnnamedNodes); + for (auto &BB : F) + for (auto &I : BB) + addMetadataToSet(I, UnnamedNodes); + } + + return UnnamedNodes.size() + NamedMetadataNodes; +} + +void llvm::reduceMetadataDeltaPass(TestRunner &Test) { + outs() << "*** Reducing Metadata...\n"; + int MDCount = countMetadataTargets(Test.getProgram()); + runDeltaPass(Test, MDCount, extractMetadataFromModule); + outs() << "----------------------------\n"; +} diff --git a/tools/llvm-reduce/deltas/ReduceMetadata.h b/tools/llvm-reduce/deltas/ReduceMetadata.h new file mode 100644 index 000000000000..275b44c2aa7d --- /dev/null +++ b/tools/llvm-reduce/deltas/ReduceMetadata.h @@ -0,0 +1,18 @@ +//===- ReduceMetadata.h - Specialized Delta Pass ------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements two functions used by the Generic Delta Debugging +// Algorithm, which are used to reduce Metadata nodes. 
+// +//===----------------------------------------------------------------------===// + +#include "TestRunner.h" + +namespace llvm { +void reduceMetadataDeltaPass(TestRunner &Test); +} // namespace llvm diff --git a/tools/llvm-reduce/llvm-reduce.cpp b/tools/llvm-reduce/llvm-reduce.cpp new file mode 100644 index 000000000000..83dcf980a786 --- /dev/null +++ b/tools/llvm-reduce/llvm-reduce.cpp @@ -0,0 +1,114 @@ +//===- llvm-reduce.cpp - The LLVM Delta Reduction utility -----------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This program tries to reduce an IR test case for a given interesting-ness +// test. It runs multiple delta debugging passes in order to minimize the input +// file. It's worth noting that this is a part of the bugpoint redesign +// proposal, and thus a *temporary* tool that will eventually be integrated +// into the bugpoint tool itself. +// +//===----------------------------------------------------------------------===// + +#include "DeltaManager.h" +#include "llvm/ADT/SmallString.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Verifier.h" +#include "llvm/IRReader/IRReader.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/InitLLVM.h" +#include "llvm/Support/SourceMgr.h" +#include "llvm/Support/raw_ostream.h" +#include +#include + +using namespace llvm; + +static cl::opt Help("h", cl::desc("Alias for -help"), cl::Hidden); +static cl::opt Version("v", cl::desc("Alias for -version"), cl::Hidden); + +static cl::opt InputFilename(cl::Positional, cl::Required, + cl::desc("")); + +static cl::opt + TestFilename("test", cl::Required, + cl::desc("Name of the interesting-ness test to be run")); + +static cl::list + TestArguments("test-arg", cl::ZeroOrMore, + cl::desc("Arguments passed onto the interesting-ness test")); + +static cl::opt + OutputFilename("output", + cl::desc("Specify the output file. 
default: reduced.ll")); +static cl::alias OutputFileAlias("o", cl::desc("Alias for -output"), + cl::aliasopt(OutputFilename)); + +static cl::opt + ReplaceInput("in-place", + cl::desc("WARNING: This option will replace your input file" + "with the reduced version!")); + +// Parses IR into a Module and verifies it +static std::unique_ptr parseInputFile(StringRef Filename, + LLVMContext &Ctxt) { + SMDiagnostic Err; + std::unique_ptr Result = parseIRFile(Filename, Err, Ctxt); + if (!Result) { + Err.print("llvm-reduce", errs()); + return Result; + } + + if (verifyModule(*Result, &errs())) { + errs() << "Error: " << Filename << " - input module is broken!\n"; + return std::unique_ptr(); + } + + return Result; +} + +int main(int argc, char **argv) { + InitLLVM X(argc, argv); + + cl::ParseCommandLineOptions(argc, argv, "LLVM automatic testcase reducer.\n"); + + LLVMContext Context; + std::unique_ptr OriginalProgram = + parseInputFile(InputFilename, Context); + + // Initialize test environment + TestRunner Tester(TestFilename, TestArguments); + Tester.setProgram(std::move(OriginalProgram)); + + // Try to reduce code + runDeltaPasses(Tester); + + if (!Tester.getProgram()) { + errs() << "\nCouldnt reduce input :/\n"; + } else { + // Print reduced file to STDOUT + if (OutputFilename == "-") + Tester.getProgram()->print(outs(), nullptr); + else { + if (ReplaceInput) // In-place + OutputFilename = InputFilename.c_str(); + else if (OutputFilename.empty()) + OutputFilename = "reduced.ll"; + + std::error_code EC; + raw_fd_ostream Out(OutputFilename, EC); + if (EC) { + errs() << "Error opening output file: " << EC.message() << "!\n"; + exit(1); + } + Tester.getProgram()->print(Out, /*AnnotationWriter=*/nullptr); + errs() << "\nDone reducing! Reduced testcase: " << OutputFilename << "\n"; + } + } + + return 0; +} diff --git a/tools/llvm-rtdyld/llvm-rtdyld.cpp b/tools/llvm-rtdyld/llvm-rtdyld.cpp index a7cc1deb8cf6..3a36e7709483 100644 --- a/tools/llvm-rtdyld/llvm-rtdyld.cpp +++ b/tools/llvm-rtdyld/llvm-rtdyld.cpp @@ -27,12 +27,13 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/DynamicLibrary.h" #include "llvm/Support/InitLLVM.h" +#include "llvm/Support/MSVCErrorWorkarounds.h" #include "llvm/Support/Memory.h" #include "llvm/Support/MemoryBuffer.h" -#include "llvm/Support/MSVCErrorWorkarounds.h" #include "llvm/Support/Path.h" #include "llvm/Support/TargetRegistry.h" #include "llvm/Support/TargetSelect.h" +#include "llvm/Support/Timer.h" #include "llvm/Support/raw_ostream.h" #include @@ -138,8 +139,21 @@ PrintAllocationRequests("print-alloc-requests", "manager by RuntimeDyld"), cl::Hidden); +static cl::opt ShowTimes("show-times", + cl::desc("Show times for llvm-rtdyld phases"), + cl::init(false)); + ExitOnError ExitOnErr; +struct RTDyldTimers { + TimerGroup RTDyldTG{"llvm-rtdyld timers", "timers for llvm-rtdyld phases"}; + Timer LoadObjectsTimer{"load", "time to load/add object files", RTDyldTG}; + Timer LinkTimer{"link", "time to link object files", RTDyldTG}; + Timer RunTimer{"run", "time to execute jitlink'd code", RTDyldTG}; +}; + +std::unique_ptr Timers; + /* *** */ using SectionIDMap = StringMap; @@ -441,8 +455,6 @@ static int printLineInfoForInput(bool LoadObjects, bool UseDebugObj) { continue; } object::section_iterator Sec = *SecOrErr; - StringRef SecName; - Sec->getName(SecName); Address.SectionIndex = Sec->getIndex(); uint64_t SectionLoadAddress = LoadedObjInfo->getSectionLoadAddress(*Sec); @@ -491,35 +503,41 @@ static int executeInput() { // If we don't have any input files, read from 
stdin. if (!InputFileList.size()) InputFileList.push_back("-"); - for (auto &File : InputFileList) { - // Load the input memory buffer. - ErrorOr> InputBuffer = - MemoryBuffer::getFileOrSTDIN(File); - if (std::error_code EC = InputBuffer.getError()) - ErrorAndExit("unable to read input: '" + EC.message() + "'"); - Expected> MaybeObj( - ObjectFile::createObjectFile((*InputBuffer)->getMemBufferRef())); - - if (!MaybeObj) { - std::string Buf; - raw_string_ostream OS(Buf); - logAllUnhandledErrors(MaybeObj.takeError(), OS); - OS.flush(); - ErrorAndExit("unable to create object file: '" + Buf + "'"); - } + { + TimeRegion TR(Timers ? &Timers->LoadObjectsTimer : nullptr); + for (auto &File : InputFileList) { + // Load the input memory buffer. + ErrorOr> InputBuffer = + MemoryBuffer::getFileOrSTDIN(File); + if (std::error_code EC = InputBuffer.getError()) + ErrorAndExit("unable to read input: '" + EC.message() + "'"); + Expected> MaybeObj( + ObjectFile::createObjectFile((*InputBuffer)->getMemBufferRef())); + + if (!MaybeObj) { + std::string Buf; + raw_string_ostream OS(Buf); + logAllUnhandledErrors(MaybeObj.takeError(), OS); + OS.flush(); + ErrorAndExit("unable to create object file: '" + Buf + "'"); + } - ObjectFile &Obj = **MaybeObj; + ObjectFile &Obj = **MaybeObj; - // Load the object file - Dyld.loadObject(Obj); - if (Dyld.hasError()) { - ErrorAndExit(Dyld.getErrorString()); + // Load the object file + Dyld.loadObject(Obj); + if (Dyld.hasError()) { + ErrorAndExit(Dyld.getErrorString()); + } } } - // Resove all the relocations we can. - // FIXME: Error out if there are unresolved relocations. - Dyld.resolveRelocations(); + { + TimeRegion TR(Timers ? &Timers->LinkTimer : nullptr); + // Resove all the relocations we can. + // FIXME: Error out if there are unresolved relocations. + Dyld.resolveRelocations(); + } // Get the address of the entry point (_main by default). void *MainAddress = Dyld.getSymbolLocalAddress(EntryPoint); @@ -551,7 +569,13 @@ static int executeInput() { for (auto &Arg : InputArgv) Argv.push_back(Arg.data()); Argv.push_back(nullptr); - return Main(Argv.size() - 1, Argv.data()); + int Result = 0; + { + TimeRegion TR(Timers ? &Timers->RunTimer : nullptr); + Result = Main(Argv.size() - 1, Argv.data()); + } + + return Result; } static int checkAllExpressions(RuntimeDyldChecker &Checker) { @@ -891,7 +915,7 @@ static int linkAndVerify() { ObjectFile &Obj = **MaybeObj; if (!Checker) - Checker = llvm::make_unique( + Checker = std::make_unique( IsSymbolValid, GetSymbolInfo, GetSectionInfo, GetStubInfo, GetStubInfo, Obj.isLittleEndian() ? support::little : support::big, Disassembler.get(), InstPrinter.get(), dbgs()); @@ -937,16 +961,28 @@ int main(int argc, char **argv) { ExitOnErr.setBanner(std::string(argv[0]) + ": "); + Timers = ShowTimes ? 
std::make_unique() : nullptr; + + int Result; switch (Action) { case AC_Execute: - return executeInput(); + Result = executeInput(); + break; case AC_PrintDebugLineInfo: - return printLineInfoForInput(/* LoadObjects */ true,/* UseDebugObj */ true); + Result = + printLineInfoForInput(/* LoadObjects */ true, /* UseDebugObj */ true); + break; case AC_PrintLineInfo: - return printLineInfoForInput(/* LoadObjects */ true,/* UseDebugObj */false); + Result = + printLineInfoForInput(/* LoadObjects */ true, /* UseDebugObj */ false); + break; case AC_PrintObjectLineInfo: - return printLineInfoForInput(/* LoadObjects */false,/* UseDebugObj */false); + Result = + printLineInfoForInput(/* LoadObjects */ false, /* UseDebugObj */ false); + break; case AC_Verify: - return linkAndVerify(); + Result = linkAndVerify(); + break; } + return Result; } diff --git a/tools/llvm-stress/llvm-stress.cpp b/tools/llvm-stress/llvm-stress.cpp index a455bf13fe7b..5f36a785332b 100644 --- a/tools/llvm-stress/llvm-stress.cpp +++ b/tools/llvm-stress/llvm-stress.cpp @@ -735,7 +735,7 @@ int main(int argc, char **argv) { cl::ParseCommandLineOptions(argc, argv, "llvm codegen stress-tester\n"); llvm_shutdown_obj Y; - auto M = llvm::make_unique("/tmp/autogen.bc", Context); + auto M = std::make_unique("/tmp/autogen.bc", Context); Function *F = GenEmptyFunction(M.get()); // Pick an initial seed value @@ -752,7 +752,7 @@ int main(int argc, char **argv) { OutputFilename = "-"; std::error_code EC; - Out.reset(new ToolOutputFile(OutputFilename, EC, sys::fs::F_None)); + Out.reset(new ToolOutputFile(OutputFilename, EC, sys::fs::OF_None)); if (EC) { errs() << EC.message() << '\n'; return 1; diff --git a/tools/llvm-symbolizer/llvm-symbolizer.cpp b/tools/llvm-symbolizer/llvm-symbolizer.cpp index ea94cf9b69a1..54ce87d47979 100644 --- a/tools/llvm-symbolizer/llvm-symbolizer.cpp +++ b/tools/llvm-symbolizer/llvm-symbolizer.cpp @@ -55,6 +55,10 @@ static cl::opt cl::desc("Interpret addresses as relative addresses"), cl::ReallyHidden); +static cl::opt ClUntagAddresses( + "untag-addresses", cl::init(true), + cl::desc("Remove memory tags from addresses before symbolization")); + static cl::opt ClPrintInlining("inlining", cl::init(true), cl::desc("Print all inlined frames for a given address")); @@ -274,6 +278,7 @@ int main(int argc, char **argv) { ClDemangle.setInitialValue(false); ClPrintFunctions.setInitialValue(FunctionNameKind::None); ClPrintInlining.setInitialValue(false); + ClUntagAddresses.setInitialValue(false); ClOutputStyle.setInitialValue(DIPrinter::OutputStyle::GNU); } @@ -290,6 +295,7 @@ int main(int argc, char **argv) { Opts.UseSymbolTable = ClUseSymbolTable; Opts.Demangle = ClDemangle; Opts.RelativeAddresses = ClUseRelativeAddress; + Opts.UntagAddresses = ClUntagAddresses; Opts.DefaultArch = ClDefaultArch; Opts.FallbackDebugPath = ClFallbackDebugPath; Opts.DWPName = ClDwpName; diff --git a/tools/llvm-xray/func-id-helper.cpp b/tools/llvm-xray/func-id-helper.cpp index dc821a420c67..afc912a6398e 100644 --- a/tools/llvm-xray/func-id-helper.cpp +++ b/tools/llvm-xray/func-id-helper.cpp @@ -36,7 +36,7 @@ std::string FuncIdConversionHelper::SymbolOrNumber(int32_t FuncId) const { ModuleAddress.SectionIndex = object::SectionedAddress::UndefSection; if (auto ResOrErr = Symbolizer.symbolizeCode(BinaryInstrMap, ModuleAddress)) { auto &DI = *ResOrErr; - if (DI.FunctionName == "") + if (DI.FunctionName == DILineInfo::BadString) F << "@(" << std::hex << It->second << ")"; else F << DI.FunctionName; diff --git a/tools/llvm-xray/xray-account.cpp 
b/tools/llvm-xray/xray-account.cpp index 2b49a311d7e3..e37cd212377a 100644 --- a/tools/llvm-xray/xray-account.cpp +++ b/tools/llvm-xray/xray-account.cpp @@ -421,7 +421,7 @@ static CommandRegistration Unused(&Account, []() -> Error { } std::error_code EC; - raw_fd_ostream OS(AccountOutput, EC, sys::fs::OpenFlags::F_Text); + raw_fd_ostream OS(AccountOutput, EC, sys::fs::OpenFlags::OF_Text); if (EC) return make_error( Twine("Cannot open file '") + AccountOutput + "' for writing.", EC); diff --git a/tools/llvm-xray/xray-converter.cpp b/tools/llvm-xray/xray-converter.cpp index dfc757e0f276..7258245b95cc 100644 --- a/tools/llvm-xray/xray-converter.cpp +++ b/tools/llvm-xray/xray-converter.cpp @@ -387,8 +387,8 @@ static CommandRegistration Unused(&Convert, []() -> Error { std::error_code EC; raw_fd_ostream OS(ConvertOutput, EC, ConvertOutputFormat == ConvertFormats::BINARY - ? sys::fs::OpenFlags::F_None - : sys::fs::OpenFlags::F_Text); + ? sys::fs::OpenFlags::OF_None + : sys::fs::OpenFlags::OF_Text); if (EC) return make_error( Twine("Cannot open file '") + ConvertOutput + "' for writing.", EC); diff --git a/tools/llvm-xray/xray-extract.cpp b/tools/llvm-xray/xray-extract.cpp index 7c7d26b5a389..7800b88d9eeb 100644 --- a/tools/llvm-xray/xray-extract.cpp +++ b/tools/llvm-xray/xray-extract.cpp @@ -80,7 +80,7 @@ static CommandRegistration Unused(&Extract, []() -> Error { InstrumentationMapOrError.takeError()); std::error_code EC; - raw_fd_ostream OS(ExtractOutput, EC, sys::fs::OpenFlags::F_Text); + raw_fd_ostream OS(ExtractOutput, EC, sys::fs::OpenFlags::OF_Text); if (EC) return make_error( Twine("Cannot open file '") + ExtractOutput + "' for writing.", EC); diff --git a/tools/llvm-xray/xray-fdr-dump.cpp b/tools/llvm-xray/xray-fdr-dump.cpp index 81a93cac57c4..295f7a78765f 100644 --- a/tools/llvm-xray/xray-fdr-dump.cpp +++ b/tools/llvm-xray/xray-fdr-dump.cpp @@ -51,7 +51,7 @@ static CommandRegistration Unused(&Dump, []() -> Error { sys::fs::closeFile(*FDOrErr); DataExtractor DE(StringRef(MappedFile.data(), MappedFile.size()), true, 8); - uint32_t OffsetPtr = 0; + uint64_t OffsetPtr = 0; auto FileHeaderOrError = readBinaryFormatHeader(DE, OffsetPtr); if (!FileHeaderOrError) diff --git a/tools/llvm-xray/xray-graph-diff.cpp b/tools/llvm-xray/xray-graph-diff.cpp index a514be97f40b..116aa6869ec1 100644 --- a/tools/llvm-xray/xray-graph-diff.cpp +++ b/tools/llvm-xray/xray-graph-diff.cpp @@ -470,7 +470,7 @@ static CommandRegistration Unused(&GraphDiff, []() -> Error { auto &GDR = *GDROrErr; std::error_code EC; - raw_fd_ostream OS(GraphDiffOutput, EC, sys::fs::OpenFlags::F_Text); + raw_fd_ostream OS(GraphDiffOutput, EC, sys::fs::OpenFlags::OF_Text); if (EC) return make_error( Twine("Cannot open file '") + GraphDiffOutput + "' for writing.", EC); diff --git a/tools/llvm-xray/xray-graph.cpp b/tools/llvm-xray/xray-graph.cpp index c09357fcb502..0be511219c1a 100644 --- a/tools/llvm-xray/xray-graph.cpp +++ b/tools/llvm-xray/xray-graph.cpp @@ -506,7 +506,7 @@ static CommandRegistration Unused(&GraphC, []() -> Error { auto &GR = *GROrError; std::error_code EC; - raw_fd_ostream OS(GraphOutput, EC, sys::fs::OpenFlags::F_Text); + raw_fd_ostream OS(GraphOutput, EC, sys::fs::OpenFlags::OF_Text); if (EC) return make_error( Twine("Cannot open file '") + GraphOutput + "' for writing.", EC); diff --git a/tools/opt/opt.cpp b/tools/opt/opt.cpp index ccf8b073b82b..15495a511d06 100644 --- a/tools/opt/opt.cpp +++ b/tools/opt/opt.cpp @@ -523,7 +523,6 @@ int main(int argc, char **argv) { initializeDwarfEHPreparePass(Registry); 
initializeSafeStackLegacyPassPass(Registry); initializeSjLjEHPreparePass(Registry); - initializeStackProtectorPass(Registry); initializePreISelIntrinsicLoweringLegacyPassPass(Registry); initializeGlobalMergePass(Registry); initializeIndirectBrExpandPassPass(Registry); @@ -612,7 +611,9 @@ int main(int argc, char **argv) { OutputFilename = "-"; std::error_code EC; - Out.reset(new ToolOutputFile(OutputFilename, EC, sys::fs::F_None)); + sys::fs::OpenFlags Flags = OutputAssembly ? sys::fs::OF_Text + : sys::fs::OF_None; + Out.reset(new ToolOutputFile(OutputFilename, EC, Flags)); if (EC) { errs() << EC.message() << '\n'; return 1; @@ -620,7 +621,7 @@ int main(int argc, char **argv) { if (!ThinLinkBitcodeFile.empty()) { ThinLinkOut.reset( - new ToolOutputFile(ThinLinkBitcodeFile, EC, sys::fs::F_None)); + new ToolOutputFile(ThinLinkBitcodeFile, EC, sys::fs::OF_None)); if (EC) { errs() << EC.message() << '\n'; return 1; @@ -720,8 +721,8 @@ int main(int argc, char **argv) { OutputFilename = "-"; std::error_code EC; - Out = llvm::make_unique(OutputFilename, EC, - sys::fs::F_None); + Out = std::make_unique(OutputFilename, EC, + sys::fs::OF_None); if (EC) { errs() << EC.message() << '\n'; return 1; @@ -867,7 +868,7 @@ int main(int argc, char **argv) { assert(Out); OS = &Out->os(); if (RunTwice) { - BOS = make_unique(Buffer); + BOS = std::make_unique(Buffer); OS = BOS.get(); } if (OutputAssembly) { diff --git a/tools/vfabi-demangle-fuzzer/CMakeLists.txt b/tools/vfabi-demangle-fuzzer/CMakeLists.txt new file mode 100644 index 000000000000..908364690f5e --- /dev/null +++ b/tools/vfabi-demangle-fuzzer/CMakeLists.txt @@ -0,0 +1,7 @@ +set(LLVM_LINK_COMPONENTS + Analysis + Support +) +add_llvm_fuzzer(vfabi-demangler-fuzzer + vfabi-demangler-fuzzer.cpp +) diff --git a/tools/vfabi-demangle-fuzzer/vfabi-demangler-fuzzer.cpp b/tools/vfabi-demangle-fuzzer/vfabi-demangler-fuzzer.cpp new file mode 100644 index 000000000000..13657effbbeb --- /dev/null +++ b/tools/vfabi-demangle-fuzzer/vfabi-demangler-fuzzer.cpp @@ -0,0 +1,26 @@ +//===-- vfabi-demangler-fuzzer.cpp - Fuzzer VFABI using lib/Fuzzer ------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Build tool to fuzz the demangler for the vector function ABI names. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Analysis/VectorUtils.h" + +using namespace llvm; + +extern "C" int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size) { + const StringRef MangledName((const char *)Data, Size); + const auto Info = VFABI::tryDemangleForVFABI(MangledName); + + // Do not optimize away the return value. 
Inspired by + // https://github.com/google/benchmark/blob/master/include/benchmark/benchmark.h#L307-L345 + asm volatile("" : : "r,m"(Info) : "memory"); + + return 0; +} diff --git a/utils/TableGen/AsmMatcherEmitter.cpp b/utils/TableGen/AsmMatcherEmitter.cpp index 146d10835b8d..1d39b300091f 100644 --- a/utils/TableGen/AsmMatcherEmitter.cpp +++ b/utils/TableGen/AsmMatcherEmitter.cpp @@ -1111,6 +1111,7 @@ static std::string getEnumNameForToken(StringRef Str) { case '<': Res += "_LT_"; break; case '>': Res += "_GT_"; break; case '-': Res += "_MINUS_"; break; + case '#': Res += "_HASH_"; break; default: if ((*it >= 'A' && *it <= 'Z') || (*it >= 'a' && *it <= 'z') || @@ -1439,7 +1440,7 @@ void AsmMatcherInfo::buildOperandMatchInfo() { /// Map containing a mask with all operands indices that can be found for /// that class inside a instruction. - typedef std::map> OpClassMaskTy; + typedef std::map>> OpClassMaskTy; OpClassMaskTy OpClassMask; for (const auto &MI : Matchables) { @@ -1515,7 +1516,7 @@ void AsmMatcherInfo::buildInfo() { if (!V.empty() && V != Variant.Name) continue; - auto II = llvm::make_unique(*CGI); + auto II = std::make_unique(*CGI); II->initialize(*this, SingletonRegisters, Variant, HasMnemonicFirst); @@ -1532,7 +1533,7 @@ void AsmMatcherInfo::buildInfo() { std::vector AllInstAliases = Records.getAllDerivedDefinitions("InstAlias"); for (unsigned i = 0, e = AllInstAliases.size(); i != e; ++i) { - auto Alias = llvm::make_unique(AllInstAliases[i], + auto Alias = std::make_unique(AllInstAliases[i], Target); // If the tblgen -match-prefix option is specified (for tblgen hackers), @@ -1546,7 +1547,7 @@ void AsmMatcherInfo::buildInfo() { if (!V.empty() && V != Variant.Name) continue; - auto II = llvm::make_unique(std::move(Alias)); + auto II = std::make_unique(std::move(Alias)); II->initialize(*this, SingletonRegisters, Variant, HasMnemonicFirst); @@ -1615,7 +1616,7 @@ void AsmMatcherInfo::buildInfo() { II->TheDef->getValueAsString("TwoOperandAliasConstraint"); if (Constraint != "") { // Start by making a copy of the original matchable. - auto AliasII = llvm::make_unique(*II); + auto AliasII = std::make_unique(*II); // Adjust it to be a two-operand alias. 
AliasII->formTwoOperandAlias(Constraint); @@ -2381,7 +2382,7 @@ static void emitMatchClassEnumeration(CodeGenTarget &Target, OS << " NumMatchClassKinds\n"; OS << "};\n\n"; - OS << "}\n\n"; + OS << "} // end anonymous namespace\n\n"; } /// emitMatchClassDiagStrings - Emit a function to get the diagnostic text to be @@ -2866,7 +2867,7 @@ static void emitCustomOperandParsing(raw_ostream &OS, CodeGenTarget &Target, OS << " }\n"; OS << " };\n"; - OS << "} // end anonymous namespace.\n\n"; + OS << "} // end anonymous namespace\n\n"; OS << "static const OperandMatchEntry OperandMatchTable[" << Info.OperandMatchInfo.size() << "] = {\n"; @@ -3366,7 +3367,7 @@ void AsmMatcherEmitter::run(raw_ostream &OS) { OS << " " << getNameForFeatureBitset(FeatureBitset) << ",\n"; } OS << "};\n\n" - << "const static FeatureBitset FeatureBitsets[] {\n" + << "static constexpr FeatureBitset FeatureBitsets[] = {\n" << " {}, // AMFBS_None\n"; for (const auto &FeatureBitset : FeatureBitsets) { if (FeatureBitset.empty()) @@ -3422,7 +3423,7 @@ void AsmMatcherEmitter::run(raw_ostream &OS) { OS << " }\n"; OS << " };\n"; - OS << "} // end anonymous namespace.\n\n"; + OS << "} // end anonymous namespace\n\n"; unsigned VariantCount = Target.getAsmParserVariantCount(); for (unsigned VC = 0; VC != VariantCount; ++VC) { diff --git a/utils/TableGen/AsmWriterEmitter.cpp b/utils/TableGen/AsmWriterEmitter.cpp index 05d81f133505..b5c7f35be0e5 100644 --- a/utils/TableGen/AsmWriterEmitter.cpp +++ b/utils/TableGen/AsmWriterEmitter.cpp @@ -784,8 +784,7 @@ void AsmWriterEmitter::EmitPrintAliasInstruction(raw_ostream &O) { continue; // Aliases with priority 0 are never emitted. const DagInit *DI = R->getValueAsDag("ResultInst"); - const DefInit *Op = cast(DI->getOperator()); - AliasMap[getQualifiedName(Op->getDef())].insert( + AliasMap[getQualifiedName(DI->getOperatorAsDef(R->getLoc()))].insert( std::make_pair(CodeGenInstAlias(R, Target), Priority)); } diff --git a/utils/TableGen/CallingConvEmitter.cpp b/utils/TableGen/CallingConvEmitter.cpp index de5044e24d49..9eabb44d9004 100644 --- a/utils/TableGen/CallingConvEmitter.cpp +++ b/utils/TableGen/CallingConvEmitter.cpp @@ -264,6 +264,10 @@ void CallingConvEmitter::EmitAction(Record *Action, Record *DestTy = Action->getValueAsDef("DestTy"); O << IndentStr << "LocVT = " << getEnumName(getValueType(DestTy)) <<";\n"; O << IndentStr << "LocInfo = CCValAssign::BCvt;\n"; + } else if (Action->isSubClassOf("CCTruncToType")) { + Record *DestTy = Action->getValueAsDef("DestTy"); + O << IndentStr << "LocVT = " << getEnumName(getValueType(DestTy)) <<";\n"; + O << IndentStr << "LocInfo = CCValAssign::Trunc;\n"; } else if (Action->isSubClassOf("CCPassIndirect")) { Record *DestTy = Action->getValueAsDef("DestTy"); O << IndentStr << "LocVT = " << getEnumName(getValueType(DestTy)) <<";\n"; diff --git a/utils/TableGen/CodeEmitterGen.cpp b/utils/TableGen/CodeEmitterGen.cpp index da65763905a8..42f69cb253d2 100644 --- a/utils/TableGen/CodeEmitterGen.cpp +++ b/utils/TableGen/CodeEmitterGen.cpp @@ -16,6 +16,7 @@ #include "CodeGenTarget.h" #include "SubtargetFeatureInfo.h" #include "Types.h" +#include "llvm/ADT/APInt.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/StringExtras.h" #include "llvm/Support/Casting.h" @@ -45,12 +46,19 @@ public: private: int getVariableBit(const std::string &VarName, BitsInit *BI, int bit); std::string getInstructionCase(Record *R, CodeGenTarget &Target); + std::string getInstructionCaseForEncoding(Record *R, Record *EncodingDef, + CodeGenTarget &Target); void 
AddCodeToMergeInOperand(Record *R, BitsInit *BI, const std::string &VarName, unsigned &NumberedOp, std::set &NamedOpIndices, std::string &Case, CodeGenTarget &Target); + void emitInstructionBaseValues( + raw_ostream &o, ArrayRef NumberedInstructions, + CodeGenTarget &Target, int HwMode = -1); + unsigned BitWidth; + bool UseAPInt; }; // If the VarBitInit at position 'bit' matches the specified variable then @@ -126,7 +134,10 @@ AddCodeToMergeInOperand(Record *R, BitsInit *BI, const std::string &VarName, std::pair SO = CGI.Operands.getSubOperandNumber(OpIdx); std::string &EncoderMethodName = CGI.Operands[SO.first].EncoderMethodName; - + + if (UseAPInt) + Case += " op.clearAllBits();\n"; + // If the source operand has a custom encoder, use it. This will // get the encoding for all of the suboperands. if (!EncoderMethodName.empty()) { @@ -134,18 +145,54 @@ AddCodeToMergeInOperand(Record *R, BitsInit *BI, const std::string &VarName, // sub-operands, if there are more than one, so only // query the encoder once per source operand. if (SO.second == 0) { - Case += " // op: " + VarName + "\n" + - " op = " + EncoderMethodName + "(MI, " + utostr(OpIdx); - Case += ", Fixups, STI"; - Case += ");\n"; + Case += " // op: " + VarName + "\n"; + if (UseAPInt) { + Case += " " + EncoderMethodName + "(MI, " + utostr(OpIdx); + Case += ", op"; + } else { + Case += " op = " + EncoderMethodName + "(MI, " + utostr(OpIdx); + } + Case += ", Fixups, STI);\n"; } } else { - Case += " // op: " + VarName + "\n" + - " op = getMachineOpValue(MI, MI.getOperand(" + utostr(OpIdx) + ")"; - Case += ", Fixups, STI"; + Case += " // op: " + VarName + "\n"; + if (UseAPInt) { + Case += " getMachineOpValue(MI, MI.getOperand(" + utostr(OpIdx) + ")"; + Case += ", op, Fixups, STI"; + } else { + Case += " op = getMachineOpValue(MI, MI.getOperand(" + utostr(OpIdx) + ")"; + Case += ", Fixups, STI"; + } Case += ");\n"; } - + + // Precalculate the number of lits this variable contributes to in the + // operand. If there is a single lit (consecutive range of bits) we can use a + // destructive sequence on APInt that reduces memory allocations. + int numOperandLits = 0; + for (int tmpBit = bit; tmpBit >= 0;) { + int varBit = getVariableBit(VarName, BI, tmpBit); + + // If this bit isn't from a variable, skip it. + if (varBit == -1) { + --tmpBit; + continue; + } + + // Figure out the consecutive range of bits covered by this operand, in + // order to generate better encoding code. 
+ int beginVarBit = varBit; + int N = 1; + for (--tmpBit; tmpBit >= 0;) { + varBit = getVariableBit(VarName, BI, tmpBit); + if (varBit == -1 || varBit != (beginVarBit - N)) + break; + ++N; + --tmpBit; + } + ++numOperandLits; + } + for (; bit >= 0; ) { int varBit = getVariableBit(VarName, BI, bit); @@ -166,20 +213,52 @@ AddCodeToMergeInOperand(Record *R, BitsInit *BI, const std::string &VarName, ++N; --bit; } - - uint64_t opMask = ~(uint64_t)0 >> (64-N); - int opShift = beginVarBit - N + 1; - opMask <<= opShift; - opShift = beginInstBit - beginVarBit; - - if (opShift > 0) { - Case += " Value |= (op & UINT64_C(" + utostr(opMask) + ")) << " + - itostr(opShift) + ";\n"; - } else if (opShift < 0) { - Case += " Value |= (op & UINT64_C(" + utostr(opMask) + ")) >> " + - itostr(-opShift) + ";\n"; + + std::string maskStr; + int opShift; + + unsigned loBit = beginVarBit - N + 1; + unsigned hiBit = loBit + N; + unsigned loInstBit = beginInstBit - N + 1; + if (UseAPInt) { + std::string extractStr; + if (N >= 64) { + extractStr = "op.extractBits(" + itostr(hiBit - loBit) + ", " + + itostr(loBit) + ")"; + Case += " Value.insertBits(" + extractStr + ", " + + itostr(loInstBit) + ");\n"; + } else { + extractStr = "op.extractBitsAsZExtValue(" + itostr(hiBit - loBit) + + ", " + itostr(loBit) + ")"; + Case += " Value.insertBits(" + extractStr + ", " + + itostr(loInstBit) + ", " + itostr(hiBit - loBit) + ");\n"; + } } else { - Case += " Value |= op & UINT64_C(" + utostr(opMask) + ");\n"; + uint64_t opMask = ~(uint64_t)0 >> (64 - N); + opShift = beginVarBit - N + 1; + opMask <<= opShift; + maskStr = "UINT64_C(" + utostr(opMask) + ")"; + opShift = beginInstBit - beginVarBit; + + if (numOperandLits == 1) { + Case += " op &= " + maskStr + ";\n"; + if (opShift > 0) { + Case += " op <<= " + itostr(opShift) + ";\n"; + } else if (opShift < 0) { + Case += " op >>= " + itostr(-opShift) + ";\n"; + } + Case += " Value |= op;\n"; + } else { + if (opShift > 0) { + Case += " Value |= (op & " + maskStr + ") << " + + itostr(opShift) + ";\n"; + } else if (opShift < 0) { + Case += " Value |= (op & " + maskStr + ") >> " + + itostr(-opShift) + ";\n"; + } else { + Case += " Value |= (op & " + maskStr + ");\n"; + } + } } } } @@ -187,7 +266,29 @@ AddCodeToMergeInOperand(Record *R, BitsInit *BI, const std::string &VarName, std::string CodeEmitterGen::getInstructionCase(Record *R, CodeGenTarget &Target) { std::string Case; - BitsInit *BI = R->getValueAsBitsInit("Inst"); + if (const RecordVal *RV = R->getValue("EncodingInfos")) { + if (auto *DI = dyn_cast_or_null(RV->getValue())) { + const CodeGenHwModes &HWM = Target.getHwModes(); + EncodingInfoByHwMode EBM(DI->getDef(), HWM); + Case += " switch (HwMode) {\n"; + Case += " default: llvm_unreachable(\"Unhandled HwMode\");\n"; + for (auto &KV : EBM.Map) { + Case += " case " + itostr(KV.first) + ": {\n"; + Case += getInstructionCaseForEncoding(R, KV.second, Target); + Case += " break;\n"; + Case += " }\n"; + } + Case += " }\n"; + return Case; + } + } + return getInstructionCaseForEncoding(R, R, Target); +} + +std::string CodeEmitterGen::getInstructionCaseForEncoding(Record *R, Record *EncodingDef, + CodeGenTarget &Target) { + std::string Case; + BitsInit *BI = EncodingDef->getValueAsBitsInit("Inst"); unsigned NumberedOp = 0; std::set NamedOpIndices; @@ -207,7 +308,7 @@ std::string CodeEmitterGen::getInstructionCase(Record *R, // Loop over all of the fields in the instruction, determining which are the // operands to the instruction. 
- for (const RecordVal &RV : R->getValues()) { + for (const RecordVal &RV : EncodingDef->getValues()) { // Ignore fixed fields in the record, we're looking for values like: // bits<5> RST = { ?, ?, ?, ?, ? }; if (RV.getPrefix() || RV.getValue()->isComplete()) @@ -237,6 +338,54 @@ getNameForFeatureBitset(const std::vector &FeatureBitset) { return Name; } +static void emitInstBits(raw_ostream &OS, const APInt &Bits) { + for (unsigned I = 0; I < Bits.getNumWords(); ++I) + OS << ((I > 0) ? ", " : "") << "UINT64_C(" << utostr(Bits.getRawData()[I]) + << ")"; +} + +void CodeEmitterGen::emitInstructionBaseValues( + raw_ostream &o, ArrayRef NumberedInstructions, + CodeGenTarget &Target, int HwMode) { + const CodeGenHwModes &HWM = Target.getHwModes(); + if (HwMode == -1) + o << " static const uint64_t InstBits[] = {\n"; + else + o << " static const uint64_t InstBits_" << HWM.getMode(HwMode).Name + << "[] = {\n"; + + for (const CodeGenInstruction *CGI : NumberedInstructions) { + Record *R = CGI->TheDef; + + if (R->getValueAsString("Namespace") == "TargetOpcode" || + R->getValueAsBit("isPseudo")) { + o << " "; emitInstBits(o, APInt(BitWidth, 0)); o << ",\n"; + continue; + } + + Record *EncodingDef = R; + if (const RecordVal *RV = R->getValue("EncodingInfos")) { + if (auto *DI = dyn_cast_or_null(RV->getValue())) { + EncodingInfoByHwMode EBM(DI->getDef(), HWM); + if (EBM.hasMode(HwMode)) + EncodingDef = EBM.get(HwMode); + } + } + BitsInit *BI = EncodingDef->getValueAsBitsInit("Inst"); + + // Start by filling in fixed values. + APInt Value(BitWidth, 0); + for (unsigned i = 0, e = BI->getNumBits(); i != e; ++i) { + if (BitInit *B = dyn_cast(BI->getBit(e - i - 1))) + Value |= APInt(BitWidth, (uint64_t)B->getValue()) << (e - i - 1); + } + o << " "; + emitInstBits(o, Value); + o << "," << '\t' << "// " << R->getName() << "\n"; + } + o << " UINT64_C(0)\n };\n"; +} + void CodeEmitterGen::run(raw_ostream &o) { CodeGenTarget Target(Records); std::vector Insts = Records.getAllDerivedDefinitions("Instruction"); @@ -247,34 +396,66 @@ void CodeEmitterGen::run(raw_ostream &o) { ArrayRef NumberedInstructions = Target.getInstructionsByEnumValue(); - // Emit function declaration - o << "uint64_t " << Target.getName(); - o << "MCCodeEmitter::getBinaryCodeForInstr(const MCInst &MI,\n" - << " SmallVectorImpl &Fixups,\n" - << " const MCSubtargetInfo &STI) const {\n"; - - // Emit instruction base values - o << " static const uint64_t InstBits[] = {\n"; + const CodeGenHwModes &HWM = Target.getHwModes(); + // The set of HwModes used by instruction encodings. 
+ std::set HwModes; + BitWidth = 0; for (const CodeGenInstruction *CGI : NumberedInstructions) { Record *R = CGI->TheDef; - if (R->getValueAsString("Namespace") == "TargetOpcode" || - R->getValueAsBit("isPseudo")) { - o << " UINT64_C(0),\n"; + R->getValueAsBit("isPseudo")) continue; - } + if (const RecordVal *RV = R->getValue("EncodingInfos")) { + if (DefInit *DI = dyn_cast_or_null(RV->getValue())) { + EncodingInfoByHwMode EBM(DI->getDef(), HWM); + for (auto &KV : EBM.Map) { + BitsInit *BI = KV.second->getValueAsBitsInit("Inst"); + BitWidth = std::max(BitWidth, BI->getNumBits()); + HwModes.insert(KV.first); + } + continue; + } + } BitsInit *BI = R->getValueAsBitsInit("Inst"); + BitWidth = std::max(BitWidth, BI->getNumBits()); + } + UseAPInt = BitWidth > 64; + + // Emit function declaration + if (UseAPInt) { + o << "void " << Target.getName() + << "MCCodeEmitter::getBinaryCodeForInstr(const MCInst &MI,\n" + << " SmallVectorImpl &Fixups,\n" + << " APInt &Inst,\n" + << " APInt &Scratch,\n" + << " const MCSubtargetInfo &STI) const {\n"; + } else { + o << "uint64_t " << Target.getName(); + o << "MCCodeEmitter::getBinaryCodeForInstr(const MCInst &MI,\n" + << " SmallVectorImpl &Fixups,\n" + << " const MCSubtargetInfo &STI) const {\n"; + } + + // Emit instruction base values + if (HwModes.empty()) { + emitInstructionBaseValues(o, NumberedInstructions, Target, -1); + } else { + for (unsigned HwMode : HwModes) + emitInstructionBaseValues(o, NumberedInstructions, Target, (int)HwMode); + } - // Start by filling in fixed values. - uint64_t Value = 0; - for (unsigned i = 0, e = BI->getNumBits(); i != e; ++i) { - if (BitInit *B = dyn_cast(BI->getBit(e-i-1))) - Value |= (uint64_t)B->getValue() << (e-i-1); + if (!HwModes.empty()) { + o << " const uint64_t *InstBits;\n"; + o << " unsigned HwMode = STI.getHwMode();\n"; + o << " switch (HwMode) {\n"; + o << " default: llvm_unreachable(\"Unknown hardware mode!\"); break;\n"; + for (unsigned I : HwModes) { + o << " case " << I << ": InstBits = InstBits_" << HWM.getMode(I).Name + << "; break;\n"; } - o << " UINT64_C(" << Value << ")," << '\t' << "// " << R->getName() << "\n"; + o << " };\n"; } - o << " UINT64_C(0)\n };\n"; // Map to accumulate all the cases. 
std::map> CaseMap; @@ -294,11 +475,26 @@ void CodeEmitterGen::run(raw_ostream &o) { } // Emit initial function code - o << " const unsigned opcode = MI.getOpcode();\n" - << " uint64_t Value = InstBits[opcode];\n" - << " uint64_t op = 0;\n" - << " (void)op; // suppress warning\n" - << " switch (opcode) {\n"; + if (UseAPInt) { + int NumWords = APInt::getNumWords(BitWidth); + int NumBytes = (BitWidth + 7) / 8; + o << " const unsigned opcode = MI.getOpcode();\n" + << " if (Inst.getBitWidth() != " << BitWidth << ")\n" + << " Inst = Inst.zext(" << BitWidth << ");\n" + << " if (Scratch.getBitWidth() != " << BitWidth << ")\n" + << " Scratch = Scratch.zext(" << BitWidth << ");\n" + << " LoadIntFromMemory(Inst, (uint8_t*)&InstBits[opcode * " << NumWords + << "], " << NumBytes << ");\n" + << " APInt &Value = Inst;\n" + << " APInt &op = Scratch;\n" + << " switch (opcode) {\n"; + } else { + o << " const unsigned opcode = MI.getOpcode();\n" + << " uint64_t Value = InstBits[opcode];\n" + << " uint64_t op = 0;\n" + << " (void)op; // suppress warning\n" + << " switch (opcode) {\n"; + } // Emit each case statement std::map>::iterator IE, EE; @@ -322,9 +518,12 @@ void CodeEmitterGen::run(raw_ostream &o) { << " raw_string_ostream Msg(msg);\n" << " Msg << \"Not supported instr: \" << MI;\n" << " report_fatal_error(Msg.str());\n" - << " }\n" - << " return Value;\n" - << "}\n\n"; + << " }\n"; + if (UseAPInt) + o << " Inst = Value;\n"; + else + o << " return Value;\n"; + o << "}\n\n"; const auto &All = SubtargetFeatureInfo::getAll(Records); std::map SubtargetFeatures; @@ -385,8 +584,8 @@ void CodeEmitterGen::run(raw_ostream &o) { o << " " << getNameForFeatureBitset(FeatureBitset) << ",\n"; } o << "};\n\n" - << "const static FeatureBitset FeatureBitsets[] {\n" - << " {}, // CEFBS_None\n"; + << "static constexpr FeatureBitset FeatureBitsets[] = {\n" + << " {}, // CEFBS_None\n"; for (const auto &FeatureBitset : FeatureBitsets) { if (FeatureBitset.empty()) continue; diff --git a/utils/TableGen/CodeGenDAGPatterns.cpp b/utils/TableGen/CodeGenDAGPatterns.cpp index c8f710d66a03..46f986ca0176 100644 --- a/utils/TableGen/CodeGenDAGPatterns.cpp +++ b/utils/TableGen/CodeGenDAGPatterns.cpp @@ -769,7 +769,10 @@ void TypeInfer::expandOverloads(TypeSetByHwMode::SetType &Out, for (MVT T : MVT::integer_valuetypes()) if (Legal.count(T)) Out.insert(T); - for (MVT T : MVT::integer_vector_valuetypes()) + for (MVT T : MVT::integer_fixedlen_vector_valuetypes()) + if (Legal.count(T)) + Out.insert(T); + for (MVT T : MVT::integer_scalable_vector_valuetypes()) if (Legal.count(T)) Out.insert(T); return; @@ -777,7 +780,10 @@ void TypeInfer::expandOverloads(TypeSetByHwMode::SetType &Out, for (MVT T : MVT::fp_valuetypes()) if (Legal.count(T)) Out.insert(T); - for (MVT T : MVT::fp_vector_valuetypes()) + for (MVT T : MVT::fp_fixedlen_vector_valuetypes()) + if (Legal.count(T)) + Out.insert(T); + for (MVT T : MVT::fp_scalable_vector_valuetypes()) if (Legal.count(T)) Out.insert(T); return; @@ -883,7 +889,8 @@ std::string TreePredicateFn::getPredCode() const { if (isLoad()) { if (!isUnindexed() && !isNonExtLoad() && !isAnyExtLoad() && !isSignExtLoad() && !isZeroExtLoad() && getMemoryVT() == nullptr && - getScalarMemoryVT() == nullptr) + getScalarMemoryVT() == nullptr && getAddressSpaces() == nullptr && + getMinAlignment() < 1) PrintFatalError(getOrigPatFragRecord()->getRecord()->getLoc(), "IsLoad cannot be used by itself"); } else { @@ -903,7 +910,8 @@ std::string TreePredicateFn::getPredCode() const { if (isStore()) { if (!isUnindexed() && 
!isTruncStore() && !isNonTruncStore() && - getMemoryVT() == nullptr && getScalarMemoryVT() == nullptr) + getMemoryVT() == nullptr && getScalarMemoryVT() == nullptr && + getAddressSpaces() == nullptr && getMinAlignment() < 1) PrintFatalError(getOrigPatFragRecord()->getRecord()->getLoc(), "IsStore cannot be used by itself"); } else { @@ -917,6 +925,7 @@ std::string TreePredicateFn::getPredCode() const { if (isAtomic()) { if (getMemoryVT() == nullptr && !isAtomicOrderingMonotonic() && + getAddressSpaces() == nullptr && !isAtomicOrderingAcquire() && !isAtomicOrderingRelease() && !isAtomicOrderingAcquireRelease() && !isAtomicOrderingSequentiallyConsistent() && @@ -977,6 +986,13 @@ std::string TreePredicateFn::getPredCode() const { Code += ")\nreturn false;\n"; } + int64_t MinAlign = getMinAlignment(); + if (MinAlign > 0) { + Code += "if (cast(N)->getAlignment() < "; + Code += utostr(MinAlign); + Code += ")\nreturn false;\n"; + } + Record *MemoryVT = getMemoryVT(); if (MemoryVT) @@ -1177,6 +1193,13 @@ ListInit *TreePredicateFn::getAddressSpaces() const { return R->getValueAsListInit("AddressSpaces"); } +int64_t TreePredicateFn::getMinAlignment() const { + Record *R = getOrigPatFragRecord()->getRecord(); + if (R->isValueUnset("MinAlignment")) + return 0; + return R->getValueAsInt("MinAlignment"); +} + Record *TreePredicateFn::getScalarMemoryVT() const { Record *R = getOrigPatFragRecord()->getRecord(); if (R->isValueUnset("ScalarMemoryVT")) @@ -1373,9 +1396,11 @@ getPatternComplexity(const CodeGenDAGPatterns &CGP) const { /// std::string PatternToMatch::getPredicateCheck() const { SmallVector PredList; - for (const Predicate &P : Predicates) - PredList.push_back(&P); - llvm::sort(PredList, deref()); + for (const Predicate &P : Predicates) { + if (!P.getCondString().empty()) + PredList.push_back(&P); + } + llvm::sort(PredList, deref>()); std::string Check; for (unsigned i = 0, e = PredList.size(); i != e; ++i) { @@ -2772,6 +2797,7 @@ TreePatternNodePtr TreePattern::ParseTreePattern(Init *TheInit, if (Operator->isSubClassOf("SDNode") && Operator->getName() != "imm" && + Operator->getName() != "timm" && Operator->getName() != "fpimm" && Operator->getName() != "tglobaltlsaddr" && Operator->getName() != "tconstpool" && @@ -3083,7 +3109,7 @@ void CodeGenDAGPatterns::ParsePatternFragments(bool OutFrags) { ListInit *LI = Frag->getValueAsListInit("Fragments"); TreePattern *P = - (PatternFragments[Frag] = llvm::make_unique( + (PatternFragments[Frag] = std::make_unique( Frag, LI, !Frag->isSubClassOf("OutPatFrag"), *this)).get(); diff --git a/utils/TableGen/CodeGenDAGPatterns.h b/utils/TableGen/CodeGenDAGPatterns.h index 2b49a64c3f1d..80fc932a7a50 100644 --- a/utils/TableGen/CodeGenDAGPatterns.h +++ b/utils/TableGen/CodeGenDAGPatterns.h @@ -594,6 +594,7 @@ public: Record *getScalarMemoryVT() const; ListInit *getAddressSpaces() const; + int64_t getMinAlignment() const; // If true, indicates that GlobalISel-based C++ code was supplied. bool hasGISelPredicateCode() const; @@ -1075,8 +1076,11 @@ public: std::string C = IsHwMode ? std::string("MF->getSubtarget().checkFeatures(\"" + Features + "\")") : std::string(Def->getValueAsString("CondString")); + if (C.empty()) + return ""; return IfCond ? 
C : "!("+C+')'; } + bool operator==(const Predicate &P) const { return IfCond == P.IfCond && IsHwMode == P.IsHwMode && Def == P.Def; } diff --git a/utils/TableGen/CodeGenInstruction.cpp b/utils/TableGen/CodeGenInstruction.cpp index 2463824469ab..fde946d06589 100644 --- a/utils/TableGen/CodeGenInstruction.cpp +++ b/utils/TableGen/CodeGenInstruction.cpp @@ -363,6 +363,7 @@ CodeGenInstruction::CodeGenInstruction(Record *R) Namespace = R->getValueAsString("Namespace"); AsmString = R->getValueAsString("AsmString"); + isPreISelOpcode = R->getValueAsBit("isPreISelOpcode"); isReturn = R->getValueAsBit("isReturn"); isEHScopeReturn = R->getValueAsBit("isEHScopeReturn"); isBranch = R->getValueAsBit("isBranch"); diff --git a/utils/TableGen/CodeGenInstruction.h b/utils/TableGen/CodeGenInstruction.h index bb5b1369649f..2cb28425df7a 100644 --- a/utils/TableGen/CodeGenInstruction.h +++ b/utils/TableGen/CodeGenInstruction.h @@ -231,6 +231,7 @@ template class ArrayRef; std::vector ImplicitDefs, ImplicitUses; // Various boolean values we track for the instruction. + bool isPreISelOpcode : 1; bool isReturn : 1; bool isEHScopeReturn : 1; bool isBranch : 1; diff --git a/utils/TableGen/CodeGenIntrinsics.h b/utils/TableGen/CodeGenIntrinsics.h index 7b74bb07d6e0..83e780671b43 100644 --- a/utils/TableGen/CodeGenIntrinsics.h +++ b/utils/TableGen/CodeGenIntrinsics.h @@ -141,6 +141,7 @@ struct CodeGenIntrinsic { enum ArgAttribute { NoCapture, + NoAlias, Returned, ReadOnly, WriteOnly, @@ -154,6 +155,13 @@ struct CodeGenIntrinsic { return Properties & (1 << Prop); } + /// Returns true if the parameter at \p ParamIdx is a pointer type. Returns + /// false if the parameter is not a pointer, or \p ParamIdx is greater than + /// the size of \p IS.ParamVTs. + /// + /// Note that this requires that \p IS.ParamVTs is available. + bool isParamAPointer(unsigned ParamIdx) const; + CodeGenIntrinsic(Record *R); }; diff --git a/utils/TableGen/CodeGenMapTable.cpp b/utils/TableGen/CodeGenMapTable.cpp index b1774b01ba8c..793bb61481e7 100644 --- a/utils/TableGen/CodeGenMapTable.cpp +++ b/utils/TableGen/CodeGenMapTable.cpp @@ -132,7 +132,7 @@ public: MapRec->getName() + "' has empty " + "`ValueCols' field!"); for (Init *I : ColValList->getValues()) { - ListInit *ColI = dyn_cast(I); + auto *ColI = cast(I); // Make sure that all the sub-lists in 'ValueCols' have same number of // elements as the fields in 'ColFields'. @@ -168,7 +168,7 @@ public: return ValueCols; } }; -} // End anonymous namespace. +} // end anonymous namespace //===----------------------------------------------------------------------===// @@ -226,7 +226,7 @@ public: void emitMapFuncBody(raw_ostream &OS, unsigned TableSize); }; -} // End anonymous namespace. +} // end anonymous namespace //===----------------------------------------------------------------------===// @@ -521,7 +521,7 @@ static void emitEnums(raw_ostream &OS, RecordKeeper &Records) { unsigned ListSize = List->size(); for (unsigned j = 0; j < ListSize; j++) { - ListInit *ListJ = dyn_cast(List->getElement(j)); + auto *ListJ = cast(List->getElement(j)); if (ListJ->size() != ColFields->size()) PrintFatalError("Record `" + CurMap->getName() + "', field " @@ -604,8 +604,8 @@ void EmitMapTable(RecordKeeper &Records, raw_ostream &OS) { // Emit map tables and the functions to query them. 
IMap.emitTablesWithFunc(OS); } - OS << "} // End " << NameSpace << " namespace\n"; - OS << "} // End llvm namespace\n"; + OS << "} // end namespace " << NameSpace << "\n"; + OS << "} // end namespace llvm\n"; OS << "#endif // GET_INSTRMAP_INFO\n\n"; } diff --git a/utils/TableGen/CodeGenRegisters.cpp b/utils/TableGen/CodeGenRegisters.cpp index f87c6d6c945a..6153c759b123 100644 --- a/utils/TableGen/CodeGenRegisters.cpp +++ b/utils/TableGen/CodeGenRegisters.cpp @@ -639,7 +639,8 @@ struct TupleExpander : SetTheory::Expander { // Precompute some types. Record *RegisterCl = Def->getRecords().getClass("Register"); RecTy *RegisterRecTy = RecordRecTy::get(RegisterCl); - StringInit *BlankName = StringInit::get(""); + std::vector RegNames = + Def->getValueAsListOfStrings("RegAsmNames"); // Zip them up. for (unsigned n = 0; n != Length; ++n) { @@ -656,11 +657,20 @@ struct TupleExpander : SetTheory::Expander { unsigned(Reg->getValueAsInt("CostPerUse"))); } + StringInit *AsmName = StringInit::get(""); + if (!RegNames.empty()) { + if (RegNames.size() <= n) + PrintFatalError(Def->getLoc(), + "Register tuple definition missing name for '" + + Name + "'."); + AsmName = StringInit::get(RegNames[n]); + } + // Create a new Record representing the synthesized register. This record // is only for consumption by CodeGenRegister, it is not added to the // RecordKeeper. SynthDefs.emplace_back( - llvm::make_unique(Name, Def->getLoc(), Def->getRecords())); + std::make_unique(Name, Def->getLoc(), Def->getRecords())); Record *NewReg = SynthDefs.back().get(); Elts.insert(NewReg); @@ -683,9 +693,8 @@ struct TupleExpander : SetTheory::Expander { if (Field == "SubRegs") RV.setValue(ListInit::get(Tuple, RegisterRecTy)); - // Provide a blank AsmName. MC hacks are required anyway. if (Field == "AsmName") - RV.setValue(BlankName); + RV.setValue(AsmName); // CostPerUse is aggregated from all Tuple members. if (Field == "CostPerUse") @@ -725,8 +734,8 @@ struct TupleExpander : SetTheory::Expander { //===----------------------------------------------------------------------===// static void sortAndUniqueRegisters(CodeGenRegister::Vec &M) { - llvm::sort(M, deref()); - M.erase(std::unique(M.begin(), M.end(), deref()), M.end()); + llvm::sort(M, deref>()); + M.erase(std::unique(M.begin(), M.end(), deref>()), M.end()); } CodeGenRegisterClass::CodeGenRegisterClass(CodeGenRegBank &RegBank, Record *R) @@ -851,7 +860,7 @@ void CodeGenRegisterClass::inheritProperties(CodeGenRegBank &RegBank) { bool CodeGenRegisterClass::contains(const CodeGenRegister *Reg) const { return std::binary_search(Members.begin(), Members.end(), Reg, - deref()); + deref>()); } namespace llvm { @@ -887,7 +896,7 @@ static bool testSubClass(const CodeGenRegisterClass *A, return A->RSI.isSubClassOf(B->RSI) && std::includes(A->getMembers().begin(), A->getMembers().end(), B->getMembers().begin(), B->getMembers().end(), - deref()); + deref>()); } /// Sorting predicate for register classes. This provides a topological @@ -1089,7 +1098,7 @@ CodeGenRegBank::CodeGenRegBank(RecordKeeper &Records, Sets.addFieldExpander("RegisterClass", "MemberList"); Sets.addFieldExpander("CalleeSavedRegs", "SaveList"); Sets.addExpander("RegisterTuples", - llvm::make_unique(SynthDefs)); + std::make_unique(SynthDefs)); // Read in the user-defined (named) sub-register indices. // More indices will be synthesized later. 
@@ -2131,9 +2140,10 @@ void CodeGenRegBank::inferCommonSubClass(CodeGenRegisterClass *RC) { const CodeGenRegister::Vec &Memb1 = RC1->getMembers(); const CodeGenRegister::Vec &Memb2 = RC2->getMembers(); CodeGenRegister::Vec Intersection; - std::set_intersection( - Memb1.begin(), Memb1.end(), Memb2.begin(), Memb2.end(), - std::inserter(Intersection, Intersection.begin()), deref()); + std::set_intersection(Memb1.begin(), Memb1.end(), Memb2.begin(), + Memb2.end(), + std::inserter(Intersection, Intersection.begin()), + deref>()); // Skip disjoint class pairs. if (Intersection.empty()) @@ -2158,7 +2168,8 @@ void CodeGenRegBank::inferCommonSubClass(CodeGenRegisterClass *RC) { void CodeGenRegBank::inferSubClassWithSubReg(CodeGenRegisterClass *RC) { // Map SubRegIndex to set of registers in RC supporting that SubRegIndex. typedef std::map> SubReg2SetMap; + deref>> + SubReg2SetMap; // Compute the set of registers supporting each SubRegIndex. SubReg2SetMap SRSets; @@ -2357,6 +2368,21 @@ CodeGenRegBank::getRegClassForRegister(Record *R) { return FoundRC; } +const CodeGenRegisterClass * +CodeGenRegBank::getMinimalPhysRegClass(Record *RegRecord, + ValueTypeByHwMode *VT) { + const CodeGenRegister *Reg = getReg(RegRecord); + const CodeGenRegisterClass *BestRC = nullptr; + for (const auto &RC : getRegClasses()) { + if ((!VT || RC.hasType(*VT)) && + RC.contains(Reg) && (!BestRC || BestRC->hasSubClass(&RC))) + BestRC = &RC; + } + + assert(BestRC && "Couldn't find the register class"); + return BestRC; +} + BitVector CodeGenRegBank::computeCoveredRegisters(ArrayRef Regs) { SetVector Set; diff --git a/utils/TableGen/CodeGenRegisters.h b/utils/TableGen/CodeGenRegisters.h index f04a90f8fde5..6d933baec2ae 100644 --- a/utils/TableGen/CodeGenRegisters.h +++ b/utils/TableGen/CodeGenRegisters.h @@ -93,7 +93,8 @@ namespace llvm { // Map of composite subreg indices. typedef std::map> CompMap; + deref>> + CompMap; // Returns the subreg index that results from composing this with Idx. // Returns NULL if this and Idx don't compose. @@ -137,15 +138,14 @@ namespace llvm { /// list of subregisters they are composed of (if any). Do this recursively. void computeConcatTransitiveClosure(); + bool operator<(const CodeGenSubRegIndex &RHS) const { + return this->EnumValue < RHS.EnumValue; + } + private: CompMap Composed; }; - inline bool operator<(const CodeGenSubRegIndex &A, - const CodeGenSubRegIndex &B) { - return A.EnumValue < B.EnumValue; - } - /// CodeGenRegister - Represents a register definition. struct CodeGenRegister { Record *TheDef; @@ -156,7 +156,8 @@ namespace llvm { bool Artificial; // Map SubRegIndex -> Register. - typedef std::map> + typedef std::map>> SubRegMap; CodeGenRegister(Record *R, unsigned Enum); @@ -347,6 +348,10 @@ namespace llvm { ArrayRef getValueTypes() const { return VTs; } unsigned getNumValueTypes() const { return VTs.size(); } + bool hasType(const ValueTypeByHwMode &VT) const { + return std::find(VTs.begin(), VTs.end(), VT) != VTs.end(); + } + const ValueTypeByHwMode &getValueTypeNum(unsigned VTNum) const { if (VTNum < VTs.size()) return VTs[VTNum]; @@ -708,6 +713,13 @@ namespace llvm { /// return the superclass. Otherwise return null. const CodeGenRegisterClass* getRegClassForRegister(Record *R); + // Analog of TargetRegisterInfo::getMinimalPhysRegClass. Unlike + // getRegClassForRegister, this tries to find the smallest class containing + // the physical register. 
If \p VT is specified, it will only find classes + // with a matching type + const CodeGenRegisterClass * + getMinimalPhysRegClass(Record *RegRecord, ValueTypeByHwMode *VT = nullptr); + // Get the sum of unit weights. unsigned getRegUnitSetWeight(const std::vector &Units) const { unsigned Weight = 0; diff --git a/utils/TableGen/CodeGenSchedule.cpp b/utils/TableGen/CodeGenSchedule.cpp index fd007044a16e..f12d7d484a8e 100644 --- a/utils/TableGen/CodeGenSchedule.cpp +++ b/utils/TableGen/CodeGenSchedule.cpp @@ -172,8 +172,8 @@ CodeGenSchedModels::CodeGenSchedModels(RecordKeeper &RK, // Allow Set evaluation to recognize the dags used in InstRW records: // (instrs Op1, Op1...) - Sets.addOperator("instrs", llvm::make_unique()); - Sets.addOperator("instregex", llvm::make_unique(Target)); + Sets.addOperator("instrs", std::make_unique()); + Sets.addOperator("instregex", std::make_unique(Target)); // Instantiate a CodeGenProcModel for each SchedMachineModel with the values // that are explicitly referenced in tablegen records. Resources associated @@ -1083,9 +1083,13 @@ void CodeGenSchedModels::createInstRWClass(Record *InstRWDef) { if (RWD->getValueAsDef("SchedModel") == RWModelDef && RWModelDef->getValueAsBit("FullInstRWOverlapCheck")) { for (Record *Inst : InstDefs) { - PrintFatalError(InstRWDef->getLoc(), "Overlapping InstRW def " + - Inst->getName() + " also matches " + - RWD->getValue("Instrs")->getValue()->getAsString()); + PrintFatalError + (InstRWDef->getLoc(), + "Overlapping InstRW definition for \"" + + Inst->getName() + + "\" also matches previous \"" + + RWD->getValue("Instrs")->getValue()->getAsString() + + "\"."); } } } @@ -1115,9 +1119,13 @@ void CodeGenSchedModels::createInstRWClass(Record *InstRWDef) { for (Record *OldRWDef : SchedClasses[OldSCIdx].InstRWs) { if (OldRWDef->getValueAsDef("SchedModel") == RWModelDef) { for (Record *InstDef : InstDefs) { - PrintFatalError(OldRWDef->getLoc(), "Overlapping InstRW def " + - InstDef->getName() + " also matches " + - OldRWDef->getValue("Instrs")->getValue()->getAsString()); + PrintFatalError + (InstRWDef->getLoc(), + "Overlapping InstRW definition for \"" + + InstDef->getName() + + "\" also matches previous \"" + + OldRWDef->getValue("Instrs")->getValue()->getAsString() + + "\"."); } } assert(OldRWDef != InstRWDef && diff --git a/utils/TableGen/CodeGenTarget.cpp b/utils/TableGen/CodeGenTarget.cpp index b65e1b6af791..fa8b842c97f9 100644 --- a/utils/TableGen/CodeGenTarget.cpp +++ b/utils/TableGen/CodeGenTarget.cpp @@ -98,6 +98,7 @@ StringRef llvm::getEnumName(MVT::SimpleValueType T) { case MVT::v256i8: return "MVT::v256i8"; case MVT::v1i16: return "MVT::v1i16"; case MVT::v2i16: return "MVT::v2i16"; + case MVT::v3i16: return "MVT::v3i16"; case MVT::v4i16: return "MVT::v4i16"; case MVT::v8i16: return "MVT::v8i16"; case MVT::v16i16: return "MVT::v16i16"; @@ -126,8 +127,11 @@ StringRef llvm::getEnumName(MVT::SimpleValueType T) { case MVT::v32i64: return "MVT::v32i64"; case MVT::v1i128: return "MVT::v1i128"; case MVT::v2f16: return "MVT::v2f16"; + case MVT::v3f16: return "MVT::v3f16"; case MVT::v4f16: return "MVT::v4f16"; case MVT::v8f16: return "MVT::v8f16"; + case MVT::v16f16: return "MVT::v16f16"; + case MVT::v32f16: return "MVT::v32f16"; case MVT::v1f32: return "MVT::v1f32"; case MVT::v2f32: return "MVT::v2f32"; case MVT::v3f32: return "MVT::v3f32"; @@ -289,10 +293,57 @@ Record *CodeGenTarget::getAsmWriter() const { CodeGenRegBank &CodeGenTarget::getRegBank() const { if (!RegBank) - RegBank = llvm::make_unique(Records, getHwModes()); + RegBank 
= std::make_unique(Records, getHwModes()); return *RegBank; } +Optional +CodeGenTarget::getSuperRegForSubReg(const ValueTypeByHwMode &ValueTy, + CodeGenRegBank &RegBank, + const CodeGenSubRegIndex *SubIdx) const { + std::vector Candidates; + auto &RegClasses = RegBank.getRegClasses(); + + // Try to find a register class which supports ValueTy, and also contains + // SubIdx. + for (CodeGenRegisterClass &RC : RegClasses) { + // Is there a subclass of this class which contains this subregister index? + CodeGenRegisterClass *SubClassWithSubReg = RC.getSubClassWithSubReg(SubIdx); + if (!SubClassWithSubReg) + continue; + + // We have a class. Check if it supports this value type. + if (llvm::none_of(SubClassWithSubReg->VTs, + [&ValueTy](const ValueTypeByHwMode &ClassVT) { + return ClassVT == ValueTy; + })) + continue; + + // We have a register class which supports both the value type and + // subregister index. Remember it. + Candidates.push_back(SubClassWithSubReg); + } + + // If we didn't find anything, we're done. + if (Candidates.empty()) + return None; + + // Find and return the largest of our candidate classes. + llvm::stable_sort(Candidates, [&](const CodeGenRegisterClass *A, + const CodeGenRegisterClass *B) { + if (A->getMembers().size() > B->getMembers().size()) + return true; + + if (A->getMembers().size() < B->getMembers().size()) + return false; + + // Order by name as a tie-breaker. + return StringRef(A->getName()) < B->getName(); + }); + + return Candidates[0]; +} + void CodeGenTarget::ReadRegAltNameIndices() const { RegAltNameIndices = Records.getAllDerivedDefinitions("RegAltNameIndex"); llvm::sort(RegAltNameIndices, LessRecord()); @@ -339,7 +390,7 @@ void CodeGenTarget::ReadLegalValueTypes() const { CodeGenSchedModels &CodeGenTarget::getSchedModels() const { if (!SchedModels) - SchedModels = llvm::make_unique(Records, *this); + SchedModels = std::make_unique(Records, *this); return *SchedModels; } @@ -352,7 +403,7 @@ void CodeGenTarget::ReadInstructions() const { // Parse the instructions defined in the .td file. for (unsigned i = 0, e = Insts.size(); i != e; ++i) - Instructions[Insts[i]] = llvm::make_unique(Insts[i]); + Instructions[Insts[i]] = std::make_unique(Insts[i]); } static const CodeGenInstruction * @@ -427,7 +478,8 @@ void CodeGenTarget::reverseBitsForLittleEndianEncoding() { if (!isLittleEndianEncoding()) return; - std::vector Insts = Records.getAllDerivedDefinitions("Instruction"); + std::vector Insts = + Records.getAllDerivedDefinitions("InstructionEncoding"); for (Record *R : Insts) { if (R->getValueAsString("Namespace") == "TargetOpcode" || R->getValueAsBit("isPseudo")) @@ -733,6 +785,9 @@ CodeGenIntrinsic::CodeGenIntrinsic(Record *R) { else if (Property->isSubClassOf("NoCapture")) { unsigned ArgNo = Property->getValueAsInt("ArgNo"); ArgumentAttributes.push_back(std::make_pair(ArgNo, NoCapture)); + } else if (Property->isSubClassOf("NoAlias")) { + unsigned ArgNo = Property->getValueAsInt("ArgNo"); + ArgumentAttributes.push_back(std::make_pair(ArgNo, NoAlias)); } else if (Property->isSubClassOf("Returned")) { unsigned ArgNo = Property->getValueAsInt("ArgNo"); ArgumentAttributes.push_back(std::make_pair(ArgNo, Returned)); @@ -758,3 +813,10 @@ CodeGenIntrinsic::CodeGenIntrinsic(Record *R) { // Sort the argument attributes for later benefit. 
llvm::sort(ArgumentAttributes); } + +bool CodeGenIntrinsic::isParamAPointer(unsigned ParamIdx) const { + if (ParamIdx >= IS.ParamVTs.size()) + return false; + MVT ParamType = MVT(IS.ParamVTs[ParamIdx]); + return ParamType == MVT::iPTR || ParamType == MVT::iPTRAny; +} diff --git a/utils/TableGen/CodeGenTarget.h b/utils/TableGen/CodeGenTarget.h index 1ab2de269c76..d52ffac4ce6c 100644 --- a/utils/TableGen/CodeGenTarget.h +++ b/utils/TableGen/CodeGenTarget.h @@ -103,6 +103,12 @@ public: /// getRegBank - Return the register bank description. CodeGenRegBank &getRegBank() const; + /// Return the largest register class on \p RegBank which supports \p Ty and + /// covers \p SubIdx if it exists. + Optional + getSuperRegForSubReg(const ValueTypeByHwMode &Ty, CodeGenRegBank &RegBank, + const CodeGenSubRegIndex *SubIdx) const; + /// getRegisterByName - If there is a register with the specific AsmName, /// return it. const CodeGenRegister *getRegisterByName(StringRef Name) const; diff --git a/utils/TableGen/DAGISelEmitter.cpp b/utils/TableGen/DAGISelEmitter.cpp index fb0c6faa5295..d8e78ce55c7b 100644 --- a/utils/TableGen/DAGISelEmitter.cpp +++ b/utils/TableGen/DAGISelEmitter.cpp @@ -173,7 +173,7 @@ void DAGISelEmitter::run(raw_ostream &OS) { } std::unique_ptr TheMatcher = - llvm::make_unique(PatternMatchers); + std::make_unique(PatternMatchers); OptimizeMatcher(TheMatcher, CGP); //Matcher->dump(); diff --git a/utils/TableGen/DAGISelMatcher.h b/utils/TableGen/DAGISelMatcher.h index 0a782e84a372..223513fc8d38 100644 --- a/utils/TableGen/DAGISelMatcher.h +++ b/utils/TableGen/DAGISelMatcher.h @@ -932,13 +932,15 @@ private: /// class EmitCopyToRegMatcher : public Matcher { unsigned SrcSlot; // Value to copy into the physreg. - Record *DestPhysReg; + const CodeGenRegister *DestPhysReg; + public: - EmitCopyToRegMatcher(unsigned srcSlot, Record *destPhysReg) + EmitCopyToRegMatcher(unsigned srcSlot, + const CodeGenRegister *destPhysReg) : Matcher(EmitCopyToReg), SrcSlot(srcSlot), DestPhysReg(destPhysReg) {} unsigned getSrcSlot() const { return SrcSlot; } - Record *getDestPhysReg() const { return DestPhysReg; } + const CodeGenRegister *getDestPhysReg() const { return DestPhysReg; } static bool classof(const Matcher *N) { return N->getKind() == EmitCopyToReg; diff --git a/utils/TableGen/DAGISelMatcherEmitter.cpp b/utils/TableGen/DAGISelMatcherEmitter.cpp index cecbc6cccdff..e9f1fb93d516 100644 --- a/utils/TableGen/DAGISelMatcherEmitter.cpp +++ b/utils/TableGen/DAGISelMatcherEmitter.cpp @@ -670,12 +670,22 @@ EmitMatcher(const Matcher *N, unsigned Indent, unsigned CurrentIdx, OS << '\n'; return 2+MN->getNumNodes(); } - case Matcher::EmitCopyToReg: - OS << "OPC_EmitCopyToReg, " - << cast(N)->getSrcSlot() << ", " - << getQualifiedName(cast(N)->getDestPhysReg()) - << ",\n"; - return 3; + case Matcher::EmitCopyToReg: { + const auto *C2RMatcher = cast(N); + int Bytes = 3; + const CodeGenRegister *Reg = C2RMatcher->getDestPhysReg(); + if (Reg->EnumValue > 255) { + assert(isUInt<16>(Reg->EnumValue) && "not handled"); + OS << "OPC_EmitCopyToReg2, " << C2RMatcher->getSrcSlot() << ", " + << "TARGET_VAL(" << getQualifiedName(Reg->TheDef) << "),\n"; + ++Bytes; + } else { + OS << "OPC_EmitCopyToReg, " << C2RMatcher->getSrcSlot() << ", " + << getQualifiedName(Reg->TheDef) << ",\n"; + } + + return Bytes; + } case Matcher::EmitNodeXForm: { const EmitNodeXFormMatcher *XF = cast(N); OS << "OPC_EmitNodeXForm, " << getNodeXFormID(XF->getNodeXForm()) << ", " diff --git a/utils/TableGen/DAGISelMatcherGen.cpp 
b/utils/TableGen/DAGISelMatcherGen.cpp index 8f54beeba65b..49c09c7d195e 100644 --- a/utils/TableGen/DAGISelMatcherGen.cpp +++ b/utils/TableGen/DAGISelMatcherGen.cpp @@ -141,7 +141,7 @@ namespace { SmallVectorImpl &ResultOps); }; -} // end anon namespace. +} // end anonymous namespace MatcherGen::MatcherGen(const PatternToMatch &pattern, const CodeGenDAGPatterns &cgp) @@ -867,9 +867,13 @@ EmitResultInstructionAsOperand(const TreePatternNode *N, if (isRoot && !PhysRegInputs.empty()) { // Emit all of the CopyToReg nodes for the input physical registers. These // occur in patterns like (mul:i8 AL:i8, GR8:i8:$src). - for (unsigned i = 0, e = PhysRegInputs.size(); i != e; ++i) + for (unsigned i = 0, e = PhysRegInputs.size(); i != e; ++i) { + const CodeGenRegister *Reg = + CGP.getTargetInfo().getRegBank().getReg(PhysRegInputs[i].first); AddMatcher(new EmitCopyToRegMatcher(PhysRegInputs[i].second, - PhysRegInputs[i].first)); + Reg)); + } + // Even if the node has no other glue inputs, the resultant node must be // glued to the CopyFromReg nodes we just generated. TreeHasInGlue = true; diff --git a/utils/TableGen/DAGISelMatcherOpt.cpp b/utils/TableGen/DAGISelMatcherOpt.cpp index 7d51b0769372..6746fdd676a7 100644 --- a/utils/TableGen/DAGISelMatcherOpt.cpp +++ b/utils/TableGen/DAGISelMatcherOpt.cpp @@ -409,13 +409,14 @@ static void FactorNodes(std::unique_ptr &InputMatcherPtr) { DenseMap TypeEntry; SmallVector, 8> Cases; for (unsigned i = 0, e = NewOptionsToMatch.size(); i != e; ++i) { - CheckTypeMatcher *CTM = - cast_or_null(FindNodeWithKind(NewOptionsToMatch[i], - Matcher::CheckType)); + Matcher* M = FindNodeWithKind(NewOptionsToMatch[i], Matcher::CheckType); + assert(M && isa(M) && "Unknown Matcher type"); + + auto *CTM = cast(M); Matcher *MatcherWithoutCTM = NewOptionsToMatch[i]->unlinkNode(CTM); MVT::SimpleValueType CTMTy = CTM->getType(); delete CTM; - + unsigned &Entry = TypeEntry[CTMTy]; if (Entry != 0) { // If we have unfactored duplicate types, then we should factor them. diff --git a/utils/TableGen/DFAEmitter.cpp b/utils/TableGen/DFAEmitter.cpp new file mode 100644 index 000000000000..dd3db7c150ba --- /dev/null +++ b/utils/TableGen/DFAEmitter.cpp @@ -0,0 +1,394 @@ +//===- DFAEmitter.cpp - Finite state automaton emitter --------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This class can produce a generic deterministic finite state automaton (DFA), +// given a set of possible states and transitions. +// +// The input transitions can be nondeterministic - this class will produce the +// deterministic equivalent state machine. +// +// The generated code can run the DFA and produce an accepted / not accepted +// state and also produce, given a sequence of transitions that results in an +// accepted state, the sequence of intermediate states. This is useful if the +// initial automaton was nondeterministic - it allows mapping back from the DFA +// to the NFA. 
+// +//===----------------------------------------------------------------------===// +#define DEBUG_TYPE "dfa-emitter" + +#include "DFAEmitter.h" +#include "CodeGenTarget.h" +#include "SequenceToOffsetTable.h" +#include "TableGenBackends.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/UniqueVector.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/TableGen/Record.h" +#include "llvm/TableGen/TableGenBackend.h" +#include +#include +#include +#include +#include +#include + +using namespace llvm; + +//===----------------------------------------------------------------------===// +// DfaEmitter implementation. This is independent of the GenAutomaton backend. +//===----------------------------------------------------------------------===// + +void DfaEmitter::addTransition(state_type From, state_type To, action_type A) { + Actions.insert(A); + NfaStates.insert(From); + NfaStates.insert(To); + NfaTransitions[{From, A}].push_back(To); + ++NumNfaTransitions; +} + +void DfaEmitter::visitDfaState(DfaState DS) { + // For every possible action... + auto FromId = DfaStates.idFor(DS); + for (action_type A : Actions) { + DfaState NewStates; + DfaTransitionInfo TI; + // For every represented state, word pair in the original NFA... + for (state_type &FromState : DS) { + // If this action is possible from this state add the transitioned-to + // states to NewStates. + auto I = NfaTransitions.find({FromState, A}); + if (I == NfaTransitions.end()) + continue; + for (state_type &ToState : I->second) { + NewStates.push_back(ToState); + TI.emplace_back(FromState, ToState); + } + } + if (NewStates.empty()) + continue; + // Sort and unique. + sort(NewStates); + NewStates.erase(std::unique(NewStates.begin(), NewStates.end()), + NewStates.end()); + sort(TI); + TI.erase(std::unique(TI.begin(), TI.end()), TI.end()); + unsigned ToId = DfaStates.insert(NewStates); + DfaTransitions.emplace(std::make_pair(FromId, A), std::make_pair(ToId, TI)); + } +} + +void DfaEmitter::constructDfa() { + DfaState Initial(1, /*NFA initial state=*/0); + DfaStates.insert(Initial); + + // Note that UniqueVector starts indices at 1, not zero. + unsigned DfaStateId = 1; + while (DfaStateId <= DfaStates.size()) + visitDfaState(DfaStates[DfaStateId++]); +} + +void DfaEmitter::emit(StringRef Name, raw_ostream &OS) { + constructDfa(); + + OS << "// Input NFA has " << NfaStates.size() << " states with " + << NumNfaTransitions << " transitions.\n"; + OS << "// Generated DFA has " << DfaStates.size() << " states with " + << DfaTransitions.size() << " transitions.\n\n"; + + // Implementation note: We don't bake a simple std::pair<> here as it requires + // significantly more effort to parse. A simple test with a large array of + // struct-pairs (N=100000) took clang-10 6s to parse. The same array of + // std::pair took 242s. Instead we allow the user to + // define the pair type. + // + // FIXME: It may make sense to emit these as ULEB sequences instead of + // pairs of uint64_t. + OS << "// A zero-terminated sequence of NFA state transitions. Every DFA\n"; + OS << "// transition implies a set of NFA transitions. 
These are referred\n"; + OS << "// to by index in " << Name << "Transitions[].\n"; + + SequenceToOffsetTable Table; + std::map EmittedIndices; + for (auto &T : DfaTransitions) + Table.add(T.second.second); + Table.layout(); + OS << "std::array " << Name + << "TransitionInfo = {{\n"; + Table.emit( + OS, + [](raw_ostream &OS, std::pair P) { + OS << "{" << P.first << ", " << P.second << "}"; + }, + "{0ULL, 0ULL}"); + + OS << "}};\n\n"; + + OS << "// A transition in the generated " << Name << " DFA.\n"; + OS << "struct " << Name << "Transition {\n"; + OS << " unsigned FromDfaState; // The transitioned-from DFA state.\n"; + OS << " "; + printActionType(OS); + OS << " Action; // The input symbol that causes this transition.\n"; + OS << " unsigned ToDfaState; // The transitioned-to DFA state.\n"; + OS << " unsigned InfoIdx; // Start index into " << Name + << "TransitionInfo.\n"; + OS << "};\n\n"; + + OS << "// A table of DFA transitions, ordered by {FromDfaState, Action}.\n"; + OS << "// The initial state is 1, not zero.\n"; + OS << "std::array<" << Name << "Transition, " << DfaTransitions.size() << "> " + << Name << "Transitions = {{\n"; + for (auto &KV : DfaTransitions) { + dfa_state_type From = KV.first.first; + dfa_state_type To = KV.second.first; + action_type A = KV.first.second; + unsigned InfoIdx = Table.get(KV.second.second); + OS << " {" << From << ", "; + printActionValue(A, OS); + OS << ", " << To << ", " << InfoIdx << "},\n"; + } + OS << "\n}};\n\n"; +} + +void DfaEmitter::printActionType(raw_ostream &OS) { OS << "uint64_t"; } + +void DfaEmitter::printActionValue(action_type A, raw_ostream &OS) { OS << A; } + +//===----------------------------------------------------------------------===// +// AutomatonEmitter implementation +//===----------------------------------------------------------------------===// + +namespace { +// FIXME: This entire discriminated union could be removed with c++17: +// using Action = std::variant; +struct Action { + Record *R = nullptr; + unsigned I = 0; + std::string S = nullptr; + + Action() = default; + Action(Record *R, unsigned I, std::string S) : R(R), I(I), S(S) {} + + void print(raw_ostream &OS) const { + if (R) + OS << R->getName(); + else if (!S.empty()) + OS << '"' << S << '"'; + else + OS << I; + } + bool operator<(const Action &Other) const { + return std::make_tuple(R, I, S) < + std::make_tuple(Other.R, Other.I, Other.S); + } +}; + +using ActionTuple = std::vector; +class Automaton; + +class Transition { + uint64_t NewState; + // The tuple of actions that causes this transition. + ActionTuple Actions; + // The types of the actions; this is the same across all transitions. + SmallVector Types; + +public: + Transition(Record *R, Automaton *Parent); + const ActionTuple &getActions() { return Actions; } + SmallVector getTypes() { return Types; } + + bool canTransitionFrom(uint64_t State); + uint64_t transitionFrom(uint64_t State); +}; + +class Automaton { + RecordKeeper &Records; + Record *R; + std::vector Transitions; + /// All possible action tuples, uniqued. + UniqueVector Actions; + /// The fields within each Transition object to find the action symbols. + std::vector ActionSymbolFields; + +public: + Automaton(RecordKeeper &Records, Record *R); + void emit(raw_ostream &OS); + + ArrayRef getActionSymbolFields() { return ActionSymbolFields; } + /// If the type of action A has been overridden (there exists a field + /// "TypeOf_A") return that, otherwise return the empty string. 
+ StringRef getActionSymbolType(StringRef A); +}; + +class AutomatonEmitter { + RecordKeeper &Records; + +public: + AutomatonEmitter(RecordKeeper &R) : Records(R) {} + void run(raw_ostream &OS); +}; + +/// A DfaEmitter implementation that can print our variant action type. +class CustomDfaEmitter : public DfaEmitter { + const UniqueVector &Actions; + std::string TypeName; + +public: + CustomDfaEmitter(const UniqueVector &Actions, StringRef TypeName) + : Actions(Actions), TypeName(TypeName) {} + + void printActionType(raw_ostream &OS) override; + void printActionValue(action_type A, raw_ostream &OS) override; +}; +} // namespace + +void AutomatonEmitter::run(raw_ostream &OS) { + for (Record *R : Records.getAllDerivedDefinitions("GenericAutomaton")) { + Automaton A(Records, R); + OS << "#ifdef GET_" << R->getName() << "_DECL\n"; + A.emit(OS); + OS << "#endif // GET_" << R->getName() << "_DECL\n"; + } +} + +Automaton::Automaton(RecordKeeper &Records, Record *R) + : Records(Records), R(R) { + LLVM_DEBUG(dbgs() << "Emitting automaton for " << R->getName() << "\n"); + ActionSymbolFields = R->getValueAsListOfStrings("SymbolFields"); +} + +void Automaton::emit(raw_ostream &OS) { + StringRef TransitionClass = R->getValueAsString("TransitionClass"); + for (Record *T : Records.getAllDerivedDefinitions(TransitionClass)) { + assert(T->isSubClassOf("Transition")); + Transitions.emplace_back(T, this); + Actions.insert(Transitions.back().getActions()); + } + + LLVM_DEBUG(dbgs() << " Action alphabet cardinality: " << Actions.size() + << "\n"); + LLVM_DEBUG(dbgs() << " Each state has " << Transitions.size() + << " potential transitions.\n"); + + StringRef Name = R->getName(); + + CustomDfaEmitter Emitter(Actions, std::string(Name) + "Action"); + // Starting from the initial state, build up a list of possible states and + // transitions. 
+ std::deque Worklist(1, 0); + std::set SeenStates; + unsigned NumTransitions = 0; + SeenStates.insert(Worklist.front()); + while (!Worklist.empty()) { + uint64_t State = Worklist.front(); + Worklist.pop_front(); + for (Transition &T : Transitions) { + if (!T.canTransitionFrom(State)) + continue; + uint64_t NewState = T.transitionFrom(State); + if (SeenStates.emplace(NewState).second) + Worklist.emplace_back(NewState); + ++NumTransitions; + Emitter.addTransition(State, NewState, Actions.idFor(T.getActions())); + } + } + LLVM_DEBUG(dbgs() << " NFA automaton has " << SeenStates.size() + << " states with " << NumTransitions << " transitions.\n"); + + const auto &ActionTypes = Transitions.back().getTypes(); + OS << "// The type of an action in the " << Name << " automaton.\n"; + if (ActionTypes.size() == 1) { + OS << "using " << Name << "Action = " << ActionTypes[0] << ";\n"; + } else { + OS << "using " << Name << "Action = std::tuple<" << join(ActionTypes, ", ") + << ">;\n"; + } + OS << "\n"; + + Emitter.emit(Name, OS); +} + +StringRef Automaton::getActionSymbolType(StringRef A) { + Twine Ty = "TypeOf_" + A; + if (!R->getValue(Ty.str())) + return ""; + return R->getValueAsString(Ty.str()); +} + +Transition::Transition(Record *R, Automaton *Parent) { + BitsInit *NewStateInit = R->getValueAsBitsInit("NewState"); + NewState = 0; + assert(NewStateInit->getNumBits() <= sizeof(uint64_t) * 8 && + "State cannot be represented in 64 bits!"); + for (unsigned I = 0; I < NewStateInit->getNumBits(); ++I) { + if (auto *Bit = dyn_cast(NewStateInit->getBit(I))) { + if (Bit->getValue()) + NewState |= 1ULL << I; + } + } + + for (StringRef A : Parent->getActionSymbolFields()) { + RecordVal *SymbolV = R->getValue(A); + if (auto *Ty = dyn_cast(SymbolV->getType())) { + Actions.emplace_back(R->getValueAsDef(A), 0, ""); + Types.emplace_back(Ty->getAsString()); + } else if (isa(SymbolV->getType())) { + Actions.emplace_back(nullptr, R->getValueAsInt(A), ""); + Types.emplace_back("unsigned"); + } else if (isa(SymbolV->getType()) || + isa(SymbolV->getType())) { + Actions.emplace_back(nullptr, 0, R->getValueAsString(A)); + Types.emplace_back("std::string"); + } else { + report_fatal_error("Unhandled symbol type!"); + } + + StringRef TypeOverride = Parent->getActionSymbolType(A); + if (!TypeOverride.empty()) + Types.back() = TypeOverride; + } +} + +bool Transition::canTransitionFrom(uint64_t State) { + if ((State & NewState) == 0) + // The bits we want to set are not set; + return true; + return false; +} + +uint64_t Transition::transitionFrom(uint64_t State) { + return State | NewState; +} + +void CustomDfaEmitter::printActionType(raw_ostream &OS) { OS << TypeName; } + +void CustomDfaEmitter::printActionValue(action_type A, raw_ostream &OS) { + const ActionTuple &AT = Actions[A]; + if (AT.size() > 1) + OS << "std::make_tuple("; + bool First = true; + for (const auto &SingleAction : AT) { + if (!First) + OS << ", "; + First = false; + SingleAction.print(OS); + } + if (AT.size() > 1) + OS << ")"; +} + +namespace llvm { + +void EmitAutomata(RecordKeeper &RK, raw_ostream &OS) { + AutomatonEmitter(RK).run(OS); +} + +} // namespace llvm diff --git a/utils/TableGen/DFAEmitter.h b/utils/TableGen/DFAEmitter.h new file mode 100644 index 000000000000..76de8f72cd88 --- /dev/null +++ b/utils/TableGen/DFAEmitter.h @@ -0,0 +1,107 @@ +//===--------------------- DfaEmitter.h -----------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// Defines a generic automaton builder. This takes a set of transitions and +// states that represent a nondeterministic finite state automaton (NFA) and +// emits a determinized DFA in a form that include/llvm/Support/Automaton.h can +// drive. +// +// See file llvm/TableGen/Automaton.td for the TableGen API definition. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_UTILS_TABLEGEN_DFAEMITTER_H +#define LLVM_UTILS_TABLEGEN_DFAEMITTER_H + +#include "llvm/ADT/StringRef.h" +#include "llvm/ADT/UniqueVector.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/TableGen/Record.h" +#include +#include + +namespace llvm { + +class raw_ostream; +/// Construct a deterministic finite state automaton from possible +/// nondeterministic state and transition data. +/// +/// The state type is a 64-bit unsigned integer. The generated automaton is +/// invariant to the sparsity of the state representation - its size is only +/// a function of the cardinality of the set of states. +/// +/// The inputs to this emitter are considered to define a nondeterministic +/// finite state automaton (NFA). This is then converted to a DFA during +/// emission. The emitted tables can be used to by +/// include/llvm/Support/Automaton.h. +class DfaEmitter { +public: + // The type of an NFA state. The initial state is always zero. + using state_type = uint64_t; + // The type of an action. + using action_type = uint64_t; + + DfaEmitter() = default; + virtual ~DfaEmitter() = default; + + void addTransition(state_type From, state_type To, action_type A); + void emit(StringRef Name, raw_ostream &OS); + +protected: + /// Emit the C++ type of an action to OS. + virtual void printActionType(raw_ostream &OS); + /// Emit the C++ value of an action A to OS. + virtual void printActionValue(action_type A, raw_ostream &OS); + +private: + /// The state type of deterministic states. These are only used internally to + /// this class. This is an ID into the DfaStates UniqueVector. + using dfa_state_type = unsigned; + + /// The actual representation of a DFA state, which is a union of one or more + /// NFA states. + using DfaState = SmallVector; + + /// A DFA transition consists of a set of NFA states transitioning to a + /// new set of NFA states. The DfaTransitionInfo tracks, for every + /// transitioned-from NFA state, a set of valid transitioned-to states. + /// + /// Emission of this transition relation allows algorithmic determination of + /// the possible candidate NFA paths taken under a given input sequence to + /// reach a given DFA state. + using DfaTransitionInfo = SmallVector, 4>; + + /// The set of all possible actions. + std::set Actions; + + /// The set of nondeterministic transitions. A state-action pair can + /// transition to multiple target states. + std::map, std::vector> + NfaTransitions; + std::set NfaStates; + unsigned NumNfaTransitions = 0; + + /// The set of deterministic states. DfaStates.getId(DfaState) returns an ID, + /// which is dfa_state_type. Note that because UniqueVector reserves state + /// zero, the initial DFA state is always 1. + UniqueVector DfaStates; + /// The set of deterministic transitions. A state-action pair has only a + /// single target state. 
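The members above (NfaTransitions, NfaStates, DfaStates) are the ingredients of a classic subset construction, which constructDfa() and visitDfaState(), declared just below, implement. The following is a rough, self-contained model of that algorithm, not the actual implementation, using standard containers in place of the LLVM ones:

#include <cstdint>
#include <deque>
#include <map>
#include <set>
#include <utility>
#include <vector>

using NfaState = uint64_t;
using Action = uint64_t;
using DfaState = std::vector<NfaState>; // sorted set of NFA states

// Determinize an NFA given as (state, action) -> possible successor states.
std::map<std::pair<DfaState, Action>, DfaState>
determinize(const std::map<std::pair<NfaState, Action>,
                           std::vector<NfaState>> &Nfa) {
  std::map<std::pair<DfaState, Action>, DfaState> Dfa;
  DfaState Initial{0};                 // the NFA initial state is always zero
  std::deque<DfaState> Worklist{Initial};
  std::set<DfaState> Seen{Initial};
  while (!Worklist.empty()) {
    DfaState DS = Worklist.front();
    Worklist.pop_front();
    // Union, per action, of every NFA successor reachable from DS.
    std::map<Action, std::set<NfaState>> ByAction;
    for (NfaState S : DS)
      for (const auto &KV : Nfa)
        if (KV.first.first == S)
          ByAction[KV.first.second].insert(KV.second.begin(), KV.second.end());
    // Each distinct successor set becomes (or reuses) a DFA state.
    for (const auto &KV : ByAction) {
      DfaState Next(KV.second.begin(), KV.second.end());
      Dfa[{DS, KV.first}] = Next;
      if (Seen.insert(Next).second)
        Worklist.push_back(Next);
    }
  }
  return Dfa;
}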
+ std::map, + std::pair> + DfaTransitions; + + /// Visit all NFA states and construct the DFA. + void constructDfa(); + /// Visit a single DFA state and construct all possible transitions to new DFA + /// states. + void visitDfaState(DfaState DS); +}; + +} // namespace llvm + +#endif diff --git a/utils/TableGen/DFAPacketizerEmitter.cpp b/utils/TableGen/DFAPacketizerEmitter.cpp index dabcc8f8ed55..ccb4ef1b9678 100644 --- a/utils/TableGen/DFAPacketizerEmitter.cpp +++ b/utils/TableGen/DFAPacketizerEmitter.cpp @@ -17,6 +17,7 @@ #define DEBUG_TYPE "dfa-emitter" #include "CodeGenTarget.h" +#include "DFAEmitter.h" #include "llvm/ADT/DenseSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringExtras.h" @@ -29,6 +30,7 @@ #include #include #include +#include #include using namespace llvm; @@ -154,121 +156,13 @@ public: int &maxStages, raw_ostream &OS); - void run(raw_ostream &OS); -}; - -// -// State represents the usage of machine resources if the packet contains -// a set of instruction classes. -// -// Specifically, currentState is a set of bit-masks. -// The nth bit in a bit-mask indicates whether the nth resource is being used -// by this state. The set of bit-masks in a state represent the different -// possible outcomes of transitioning to this state. -// For example: consider a two resource architecture: resource L and resource M -// with three instruction classes: L, M, and L_or_M. -// From the initial state (currentState = 0x00), if we add instruction class -// L_or_M we will transition to a state with currentState = [0x01, 0x10]. This -// represents the possible resource states that can result from adding a L_or_M -// instruction -// -// Another way of thinking about this transition is we are mapping a NDFA with -// two states [0x01] and [0x10] into a DFA with a single state [0x01, 0x10]. -// -// A State instance also contains a collection of transitions from that state: -// a map from inputs to new states. -// -class State { - public: - static int currentStateNum; - // stateNum is the only member used for equality/ordering, all other members - // can be mutated even in const State objects. - const int stateNum; - mutable bool isInitial; - mutable std::set stateInfo; - typedef std::map, const State *> TransitionMap; - mutable TransitionMap Transitions; - - State(); - - bool operator<(const State &s) const { - return stateNum < s.stateNum; - } - - // - // canMaybeAddInsnClass - Quickly verifies if an instruction of type InsnClass - // may be a valid transition from this state i.e., can an instruction of type - // InsnClass be added to the packet represented by this state. - // - // Note that for multiple stages, this quick check does not take into account - // any possible resource competition between the stages themselves. That is - // enforced in AddInsnClassStages which checks the cross product of all - // stages for resource availability (which is a more involved check). - // - bool canMaybeAddInsnClass(std::vector &InsnClass, - std::map &ComboBitToBitsMap) const; - - // - // AddInsnClass - Return all combinations of resource reservation - // which are possible from this state (PossibleStates). - // - // PossibleStates is the set of valid resource states that ensue from valid - // transitions. 
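To make the DfaEmitter interface declared in DFAEmitter.h above concrete, here is a minimal hypothetical user. The WidgetDfaEmitter class, the action values and the "Widget" name are invented for the example; only addTransition(), emit() and the two print hooks come from the header.

#include "DFAEmitter.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;

// Our actions are plain unsigned values in this example.
class WidgetDfaEmitter : public DfaEmitter {
  void printActionType(raw_ostream &OS) override { OS << "unsigned"; }
  void printActionValue(action_type A, raw_ostream &OS) override { OS << A; }
};

void emitWidgetAutomaton(raw_ostream &OS) {
  WidgetDfaEmitter E;
  // An NFA: from state 0, action 1 may land in state 1 or state 2 ...
  E.addTransition(0, 1, /*Action=*/1);
  E.addTransition(0, 2, /*Action=*/1);
  // ... and action 2 joins both back into state 3.
  E.addTransition(1, 3, /*Action=*/2);
  E.addTransition(2, 3, /*Action=*/2);
  // Determinizes and prints WidgetTransitions[] / WidgetTransitionInfo.
  E.emit("Widget", OS);
}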
- // - void AddInsnClass(std::vector &InsnClass, - std::map &ComboBitToBitsMap, - std::set &PossibleStates) const; - - // - // AddInsnClassStages - Return all combinations of resource reservation - // resulting from the cross product of all stages for this InsnClass - // which are possible from this state (PossibleStates). - // - void AddInsnClassStages(std::vector &InsnClass, - std::map &ComboBitToBitsMap, - unsigned chkstage, unsigned numstages, - unsigned prevState, unsigned origState, - DenseSet &VisitedResourceStates, - std::set &PossibleStates) const; - - // - // addTransition - Add a transition from this state given the input InsnClass - // - void addTransition(std::vector InsnClass, const State *To) const; - - // - // hasTransition - Returns true if there is a transition from this state - // given the input InsnClass - // - bool hasTransition(std::vector InsnClass) const; -}; - -// -// class DFA: deterministic finite automaton for processor resource tracking. -// -class DFA { -public: - DFA() = default; - - // Set of states. Need to keep this sorted to emit the transition table. - typedef std::set StateSet; - StateSet states; + // Emit code for a subset of itineraries. + void emitForItineraries(raw_ostream &OS, + std::vector &ProcItinList, + std::string DFAName); - State *currentState = nullptr; - - // - // Modify the DFA. - // - const State &newState(); - - // - // writeTable: Print out a table representing the DFA. - // - void writeTableAndAPI(raw_ostream &OS, const std::string &ClassName, - int numInsnClasses = 0, - int maxResources = 0, int numCombos = 0, int maxStages = 0); + void run(raw_ostream &OS); }; - } // end anonymous namespace #ifndef NDEBUG @@ -288,22 +182,6 @@ void dbgsInsnClass(const std::vector &InsnClass) { LLVM_DEBUG(dbgs() << " (input: 0x" << Twine::utohexstr(InsnInput) << ")"); } -// -// dbgsStateInfo - When debugging, print the set of state info. -// -void dbgsStateInfo(const std::set &stateInfo) { - LLVM_DEBUG(dbgs() << "StateInfo: "); - unsigned i = 0; - for (std::set::iterator SI = stateInfo.begin(); - SI != stateInfo.end(); ++SI, ++i) { - unsigned thisState = *SI; - if (i > 0) { - LLVM_DEBUG(dbgs() << ", "); - } - LLVM_DEBUG(dbgs() << "0x" << Twine::utohexstr(thisState)); - } -} - // // dbgsIndent - When debugging, indent by the specified amount. // @@ -314,335 +192,9 @@ void dbgsIndent(unsigned indent) { } #endif // NDEBUG -// -// Constructors and destructors for State and DFA -// -State::State() : - stateNum(currentStateNum++), isInitial(false) {} - -// -// addTransition - Add a transition from this state given the input InsnClass -// -void State::addTransition(std::vector InsnClass, const State *To) - const { - assert(!Transitions.count(InsnClass) && - "Cannot have multiple transitions for the same input"); - Transitions[InsnClass] = To; -} - -// -// hasTransition - Returns true if there is a transition from this state -// given the input InsnClass -// -bool State::hasTransition(std::vector InsnClass) const { - return Transitions.count(InsnClass) > 0; -} - -// -// AddInsnClass - Return all combinations of resource reservation -// which are possible from this state (PossibleStates). -// -// PossibleStates is the set of valid resource states that ensue from valid -// transitions. -// -void State::AddInsnClass(std::vector &InsnClass, - std::map &ComboBitToBitsMap, - std::set &PossibleStates) const { - // - // Iterate over all resource states in currentState. 
- // - unsigned numstages = InsnClass.size(); - assert((numstages > 0) && "InsnClass has no stages"); - - for (std::set::iterator SI = stateInfo.begin(); - SI != stateInfo.end(); ++SI) { - unsigned thisState = *SI; - - DenseSet VisitedResourceStates; - - LLVM_DEBUG(dbgs() << " thisState: 0x" << Twine::utohexstr(thisState) - << "\n"); - AddInsnClassStages(InsnClass, ComboBitToBitsMap, - numstages - 1, numstages, - thisState, thisState, - VisitedResourceStates, PossibleStates); - } -} - -void State::AddInsnClassStages(std::vector &InsnClass, - std::map &ComboBitToBitsMap, - unsigned chkstage, unsigned numstages, - unsigned prevState, unsigned origState, - DenseSet &VisitedResourceStates, - std::set &PossibleStates) const { - assert((chkstage < numstages) && "AddInsnClassStages: stage out of range"); - unsigned thisStage = InsnClass[chkstage]; - - LLVM_DEBUG({ - dbgsIndent((1 + numstages - chkstage) << 1); - dbgs() << "AddInsnClassStages " << chkstage << " (0x" - << Twine::utohexstr(thisStage) << ") from "; - dbgsInsnClass(InsnClass); - dbgs() << "\n"; - }); - - // - // Iterate over all possible resources used in thisStage. - // For ex: for thisStage = 0x11, all resources = {0x01, 0x10}. - // - for (unsigned int j = 0; j < DFA_MAX_RESOURCES; ++j) { - unsigned resourceMask = (0x1 << j); - if (resourceMask & thisStage) { - unsigned combo = ComboBitToBitsMap[resourceMask]; - if (combo && ((~prevState & combo) != combo)) { - LLVM_DEBUG(dbgs() << "\tSkipped Add 0x" << Twine::utohexstr(prevState) - << " - combo op 0x" << Twine::utohexstr(resourceMask) - << " (0x" << Twine::utohexstr(combo) - << ") cannot be scheduled\n"); - continue; - } - // - // For each possible resource used in thisStage, generate the - // resource state if that resource was used. - // - unsigned ResultingResourceState = prevState | resourceMask | combo; - LLVM_DEBUG({ - dbgsIndent((2 + numstages - chkstage) << 1); - dbgs() << "0x" << Twine::utohexstr(prevState) << " | 0x" - << Twine::utohexstr(resourceMask); - if (combo) - dbgs() << " | 0x" << Twine::utohexstr(combo); - dbgs() << " = 0x" << Twine::utohexstr(ResultingResourceState) << " "; - }); - - // - // If this is the final stage for this class - // - if (chkstage == 0) { - // - // Check if the resulting resource state can be accommodated in this - // packet. - // We compute resource OR prevState (originally started as origState). - // If the result of the OR is different than origState, it implies - // that there is at least one resource that can be used to schedule - // thisStage in the current packet. - // Insert ResultingResourceState into PossibleStates only if we haven't - // processed ResultingResourceState before. - // - if (ResultingResourceState != prevState) { - if (VisitedResourceStates.count(ResultingResourceState) == 0) { - VisitedResourceStates.insert(ResultingResourceState); - PossibleStates.insert(ResultingResourceState); - LLVM_DEBUG(dbgs() - << "\tResultingResourceState: 0x" - << Twine::utohexstr(ResultingResourceState) << "\n"); - } else { - LLVM_DEBUG(dbgs() << "\tSkipped Add - state already seen\n"); - } - } else { - LLVM_DEBUG(dbgs() - << "\tSkipped Add - no final resources available\n"); - } - } else { - // - // If the current resource can be accommodated, check the next - // stage in InsnClass for available resources. 
- // - if (ResultingResourceState != prevState) { - LLVM_DEBUG(dbgs() << "\n"); - AddInsnClassStages(InsnClass, ComboBitToBitsMap, - chkstage - 1, numstages, - ResultingResourceState, origState, - VisitedResourceStates, PossibleStates); - } else { - LLVM_DEBUG(dbgs() << "\tSkipped Add - no resources available\n"); - } - } - } - } -} - -// -// canMaybeAddInsnClass - Quickly verifies if an instruction of type InsnClass -// may be a valid transition from this state i.e., can an instruction of type -// InsnClass be added to the packet represented by this state. -// -// Note that this routine is performing conservative checks that can be -// quickly executed acting as a filter before calling AddInsnClassStages. -// Any cases allowed through here will be caught later in AddInsnClassStages -// which performs the more expensive exact check. -// -bool State::canMaybeAddInsnClass(std::vector &InsnClass, - std::map &ComboBitToBitsMap) const { - for (std::set::const_iterator SI = stateInfo.begin(); - SI != stateInfo.end(); ++SI) { - // Check to see if all required resources are available. - bool available = true; - - // Inspect each stage independently. - // note: This is a conservative check as we aren't checking for - // possible resource competition between the stages themselves - // The full cross product is examined later in AddInsnClass. - for (unsigned i = 0; i < InsnClass.size(); ++i) { - unsigned resources = *SI; - if ((~resources & InsnClass[i]) == 0) { - available = false; - break; - } - // Make sure _all_ resources for a combo function are available. - // note: This is a quick conservative check as it won't catch an - // unscheduleable combo if this stage is an OR expression - // containing a combo. - // These cases are caught later in AddInsnClass. - unsigned combo = ComboBitToBitsMap[InsnClass[i]]; - if (combo && ((~resources & combo) != combo)) { - LLVM_DEBUG(dbgs() << "\tSkipped canMaybeAdd 0x" - << Twine::utohexstr(resources) << " - combo op 0x" - << Twine::utohexstr(InsnClass[i]) << " (0x" - << Twine::utohexstr(combo) - << ") cannot be scheduled\n"); - available = false; - break; - } - } - - if (available) { - return true; - } - } - return false; -} - -const State &DFA::newState() { - auto IterPair = states.insert(State()); - assert(IterPair.second && "State already exists"); - return *IterPair.first; -} - -int State::currentStateNum = 0; - DFAPacketizerEmitter::DFAPacketizerEmitter(RecordKeeper &R): TargetName(CodeGenTarget(R).getName()), Records(R) {} -// -// writeTableAndAPI - Print out a table representing the DFA and the -// associated API to create a DFA packetizer. -// -// Format: -// DFAStateInputTable[][2] = pairs of for all valid -// transitions. -// DFAStateEntryTable[i] = Index of the first entry in DFAStateInputTable for -// the ith state. 
-// -// -void DFA::writeTableAndAPI(raw_ostream &OS, const std::string &TargetName, - int numInsnClasses, - int maxResources, int numCombos, int maxStages) { - unsigned numStates = states.size(); - - LLVM_DEBUG(dbgs() << "-------------------------------------------------------" - "----------------------\n"); - LLVM_DEBUG(dbgs() << "writeTableAndAPI\n"); - LLVM_DEBUG(dbgs() << "Total states: " << numStates << "\n"); - - OS << "namespace llvm {\n"; - - OS << "\n// Input format:\n"; - OS << "#define DFA_MAX_RESTERMS " << DFA_MAX_RESTERMS - << "\t// maximum AND'ed resource terms\n"; - OS << "#define DFA_MAX_RESOURCES " << DFA_MAX_RESOURCES - << "\t// maximum resource bits in one term\n"; - - OS << "\n// " << TargetName << "DFAStateInputTable[][2] = " - << "pairs of for all valid\n"; - OS << "// transitions.\n"; - OS << "// " << numStates << "\tstates\n"; - OS << "// " << numInsnClasses << "\tinstruction classes\n"; - OS << "// " << maxResources << "\tresources max\n"; - OS << "// " << numCombos << "\tcombo resources\n"; - OS << "// " << maxStages << "\tstages max\n"; - OS << "const " << DFA_TBLTYPE << " " - << TargetName << "DFAStateInputTable[][2] = {\n"; - - // This table provides a map to the beginning of the transitions for State s - // in DFAStateInputTable. - std::vector StateEntry(numStates+1); - static const std::string SentinelEntry = "{-1, -1}"; - - // Tracks the total valid transitions encountered so far. It is used - // to construct the StateEntry table. - int ValidTransitions = 0; - DFA::StateSet::iterator SI = states.begin(); - for (unsigned i = 0; i < numStates; ++i, ++SI) { - assert ((SI->stateNum == (int) i) && "Mismatch in state numbers"); - StateEntry[i] = ValidTransitions; - for (State::TransitionMap::iterator - II = SI->Transitions.begin(), IE = SI->Transitions.end(); - II != IE; ++II) { - OS << "{0x" << Twine::utohexstr(getDFAInsnInput(II->first)) << ", " - << II->second->stateNum << "},\t"; - } - ValidTransitions += SI->Transitions.size(); - - // If there are no valid transitions from this stage, we need a sentinel - // transition. - if (ValidTransitions == StateEntry[i]) { - OS << SentinelEntry << ",\t"; - ++ValidTransitions; - } - - OS << " // state " << i << ": " << StateEntry[i]; - if (StateEntry[i] != (ValidTransitions-1)) { // More than one transition. - OS << "-" << (ValidTransitions-1); - } - OS << "\n"; - } - - // Print out a sentinel entry at the end of the StateInputTable. This is - // needed to iterate over StateInputTable in DFAPacketizer::ReadTable() - OS << SentinelEntry << "\t"; - OS << " // state " << numStates << ": " << ValidTransitions; - OS << "\n"; - - OS << "};\n\n"; - OS << "// " << TargetName << "DFAStateEntryTable[i] = " - << "Index of the first entry in DFAStateInputTable for\n"; - OS << "// " - << "the ith state.\n"; - OS << "// " << numStates << " states\n"; - OS << "const unsigned int " << TargetName << "DFAStateEntryTable[] = {\n"; - - // Multiply i by 2 since each entry in DFAStateInputTable is a set of - // two numbers. - unsigned lastState = 0; - for (unsigned i = 0; i < numStates; ++i) { - if (i && ((i % 10) == 0)) { - lastState = i-1; - OS << " // states " << (i-10) << ":" << lastState << "\n"; - } - OS << StateEntry[i] << ", "; - } - - // Print out the index to the sentinel entry in StateInputTable - OS << ValidTransitions << ", "; - OS << " // states " << (lastState+1) << ":" << numStates << "\n"; - - OS << "};\n"; - OS << "} // namespace\n"; - - // - // Emit DFA Packetizer tables if the target is a VLIW machine. 
- // - std::string SubTargetClassName = TargetName + "GenSubtargetInfo"; - OS << "\n" << "#include \"llvm/CodeGen/DFAPacketizer.h\"\n"; - OS << "namespace llvm {\n"; - OS << "DFAPacketizer *" << SubTargetClassName << "::" - << "createDFAPacketizer(const InstrItineraryData *IID) const {\n" - << " return new DFAPacketizer(IID, " << TargetName - << "DFAStateInputTable, " << TargetName << "DFAStateEntryTable);\n}\n\n"; - OS << "} // End llvm namespace \n"; -} - // // collectAllFuncUnits - Construct a map of function unit names to bits. // @@ -837,10 +389,32 @@ int DFAPacketizerEmitter::collectAllInsnClasses(const std::string &ProcName, // Run the worklist algorithm to generate the DFA. // void DFAPacketizerEmitter::run(raw_ostream &OS) { + OS << "\n" + << "#include \"llvm/CodeGen/DFAPacketizer.h\"\n"; + OS << "namespace llvm {\n"; + + OS << "\n// Input format:\n"; + OS << "#define DFA_MAX_RESTERMS " << DFA_MAX_RESTERMS + << "\t// maximum AND'ed resource terms\n"; + OS << "#define DFA_MAX_RESOURCES " << DFA_MAX_RESOURCES + << "\t// maximum resource bits in one term\n"; + // Collect processor iteraries. std::vector ProcItinList = Records.getAllDerivedDefinitions("ProcessorItineraries"); + std::unordered_map> ItinsByNamespace; + for (Record *R : ProcItinList) + ItinsByNamespace[R->getValueAsString("PacketizerNamespace")].push_back(R); + + for (auto &KV : ItinsByNamespace) + emitForItineraries(OS, KV.second, KV.first); + OS << "} // end namespace llvm\n"; +} + +void DFAPacketizerEmitter::emitForItineraries( + raw_ostream &OS, std::vector &ProcItinList, + std::string DFAName) { // // Collect the Functional units. // @@ -855,8 +429,7 @@ void DFAPacketizerEmitter::run(raw_ostream &OS) { std::map ComboBitToBitsMap; std::vector ComboFuncList = Records.getAllDerivedDefinitions("ComboFuncUnits"); - int numCombos = collectAllComboFuncs(ComboFuncList, - FUNameToBitsMap, ComboBitToBitsMap, OS); + collectAllComboFuncs(ComboFuncList, FUNameToBitsMap, ComboBitToBitsMap, OS); // // Collect the itineraries. @@ -887,103 +460,89 @@ void DFAPacketizerEmitter::run(raw_ostream &OS) { FUNameToBitsMap, ItinDataList, maxStages, OS); } - // - // Run a worklist algorithm to generate the DFA. - // - DFA D; - const State *Initial = &D.newState(); - Initial->isInitial = true; - Initial->stateInfo.insert(0x0); - SmallVector WorkList; - std::map, const State*> Visited; - - WorkList.push_back(Initial); - - // - // Worklist algorithm to create a DFA for processor resource tracking. - // C = {set of InsnClasses} - // Begin with initial node in worklist. 
Initial node does not have - // any consumed resources, - // ResourceState = 0x0 - // Visited = {} - // While worklist != empty - // S = first element of worklist - // For every instruction class C - // if we can accommodate C in S: - // S' = state with resource states = {S Union C} - // Add a new transition: S x C -> S' - // If S' is not in Visited: - // Add S' to worklist - // Add S' to Visited - // - while (!WorkList.empty()) { - const State *current = WorkList.pop_back_val(); - LLVM_DEBUG({ - dbgs() << "---------------------\n"; - dbgs() << "Processing state: " << current->stateNum << " - "; - dbgsStateInfo(current->stateInfo); - dbgs() << "\n"; - }); - for (unsigned i = 0; i < allInsnClasses.size(); i++) { - std::vector InsnClass = allInsnClasses[i]; - LLVM_DEBUG({ - dbgs() << i << " "; - dbgsInsnClass(InsnClass); - dbgs() << "\n"; - }); - - std::set NewStateResources; - // - // If we haven't already created a transition for this input - // and the state can accommodate this InsnClass, create a transition. - // - if (!current->hasTransition(InsnClass) && - current->canMaybeAddInsnClass(InsnClass, ComboBitToBitsMap)) { - const State *NewState = nullptr; - current->AddInsnClass(InsnClass, ComboBitToBitsMap, NewStateResources); - if (NewStateResources.empty()) { - LLVM_DEBUG(dbgs() << " Skipped - no new states generated\n"); - continue; - } - - LLVM_DEBUG({ - dbgs() << "\t"; - dbgsStateInfo(NewStateResources); - dbgs() << "\n"; - }); - - // - // If we have seen this state before, then do not create a new state. - // - auto VI = Visited.find(NewStateResources); - if (VI != Visited.end()) { - NewState = VI->second; - LLVM_DEBUG({ - dbgs() << "\tFound existing state: " << NewState->stateNum - << " - "; - dbgsStateInfo(NewState->stateInfo); - dbgs() << "\n"; - }); - } else { - NewState = &D.newState(); - NewState->stateInfo = NewStateResources; - Visited[NewStateResources] = NewState; - WorkList.push_back(NewState); - LLVM_DEBUG({ - dbgs() << "\tAccepted new state: " << NewState->stateNum << " - "; - dbgsStateInfo(NewState->stateInfo); - dbgs() << "\n"; - }); + // The type of a state in the nondeterministic automaton we're defining. + using NfaStateTy = unsigned; + + // Given a resource state, return all resource states by applying + // InsnClass. + auto applyInsnClass = [&](ArrayRef InsnClass, + NfaStateTy State) -> std::deque { + std::deque V(1, State); + // Apply every stage in the class individually. + for (unsigned Stage : InsnClass) { + // Apply this stage to every existing member of V in turn. + size_t Sz = V.size(); + for (unsigned I = 0; I < Sz; ++I) { + unsigned S = V.front(); + V.pop_front(); + + // For this stage, state combination, try all possible resources. + for (unsigned J = 0; J < DFA_MAX_RESOURCES; ++J) { + unsigned ResourceMask = 1U << J; + if ((ResourceMask & Stage) == 0) + // This resource isn't required by this stage. + continue; + unsigned Combo = ComboBitToBitsMap[ResourceMask]; + if (Combo && ((~S & Combo) != Combo)) + // This combo units bits are not available. + continue; + unsigned ResultingResourceState = S | ResourceMask | Combo; + if (ResultingResourceState == S) + continue; + V.push_back(ResultingResourceState); } - - current->addTransition(InsnClass, NewState); + } + } + return V; + }; + + // Given a resource state, return a quick (conservative) guess as to whether + // InsnClass can be applied. This is a filter for the more heavyweight + // applyInsnClass. 
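The applyInsnClass lambda above does the fan-out that the deleted State::AddInsnClassStages used to do: each stage of an instruction class turns one resource state into one successor per usable resource, folding in any combo units. The quick canApplyInsnClass filter it refers to comes right after this. Below is a self-contained sketch of the expansion, reusing the two-resource example from the deleted comments (L = 0x1, M = 0x2, a one-stage class L_or_M = 0x3); the function and map names are invented and 32 stands in for DFA_MAX_RESOURCES.

#include <cstdint>
#include <deque>
#include <map>
#include <vector>

// Expand State by one instruction class, one stage at a time.
std::deque<unsigned> applyClass(const std::vector<unsigned> &InsnClass,
                                unsigned State,
                                std::map<unsigned, unsigned> &ComboBits) {
  std::deque<unsigned> V(1, State);
  for (unsigned Stage : InsnClass) {
    size_t Sz = V.size();
    for (size_t I = 0; I < Sz; ++I) {
      unsigned S = V.front();
      V.pop_front();
      for (unsigned J = 0; J < 32; ++J) {
        unsigned Mask = 1U << J;
        if ((Mask & Stage) == 0)
          continue; // this resource is not named by the stage
        unsigned Combo = ComboBits[Mask];
        if (Combo && (~S & Combo) != Combo)
          continue; // the combo's underlying units are partly busy
        unsigned Next = S | Mask | Combo;
        if (Next != S)
          V.push_back(Next); // one successor per resource actually consumed
      }
    }
  }
  return V;
}

// With ComboBits empty, applyClass({0x3}, 0, ComboBits) yields {0x1, 0x2}:
// the packet may claim either L or M, exactly the NFA-to-DFA example from
// the deleted State comments.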
+ auto canApplyInsnClass = [](ArrayRef InsnClass, + NfaStateTy State) -> bool { + for (unsigned Resources : InsnClass) { + if ((State | Resources) == State) + return false; + } + return true; + }; + + DfaEmitter Emitter; + std::deque Worklist(1, 0); + std::set SeenStates; + SeenStates.insert(Worklist.front()); + while (!Worklist.empty()) { + NfaStateTy State = Worklist.front(); + Worklist.pop_front(); + for (unsigned i = 0; i < allInsnClasses.size(); i++) { + const std::vector &InsnClass = allInsnClasses[i]; + if (!canApplyInsnClass(InsnClass, State)) + continue; + for (unsigned NewState : applyInsnClass(InsnClass, State)) { + if (SeenStates.emplace(NewState).second) + Worklist.emplace_back(NewState); + Emitter.addTransition(State, NewState, getDFAInsnInput(InsnClass)); } } } - // Print out the table. - D.writeTableAndAPI(OS, TargetName, - numInsnClasses, maxResources, numCombos, maxStages); + OS << "} // end namespace llvm\n\n"; + OS << "namespace {\n"; + std::string TargetAndDFAName = TargetName + DFAName; + Emitter.emit(TargetAndDFAName, OS); + OS << "} // end anonymous namespace\n\n"; + + std::string SubTargetClassName = TargetName + "GenSubtargetInfo"; + OS << "namespace llvm {\n"; + OS << "DFAPacketizer *" << SubTargetClassName << "::" + << "create" << DFAName + << "DFAPacketizer(const InstrItineraryData *IID) const {\n" + << " static Automaton A(ArrayRef<" << TargetAndDFAName + << "Transition>(" << TargetAndDFAName << "Transitions), " + << TargetAndDFAName << "TransitionInfo);\n" + << " return new DFAPacketizer(IID, A);\n" + << "\n}\n\n"; } namespace llvm { diff --git a/utils/TableGen/DisassemblerEmitter.cpp b/utils/TableGen/DisassemblerEmitter.cpp index 9e75c7fba77b..0002b0e14db6 100644 --- a/utils/TableGen/DisassemblerEmitter.cpp +++ b/utils/TableGen/DisassemblerEmitter.cpp @@ -153,4 +153,4 @@ void EmitDisassembler(RecordKeeper &Records, raw_ostream &OS) { "MCDisassembler::Success", "MCDisassembler::Fail", ""); } -} // End llvm namespace +} // end namespace llvm diff --git a/utils/TableGen/FixedLenDecoderEmitter.cpp b/utils/TableGen/FixedLenDecoderEmitter.cpp index f5e975d2e5ae..ac69b431607d 100644 --- a/utils/TableGen/FixedLenDecoderEmitter.cpp +++ b/utils/TableGen/FixedLenDecoderEmitter.cpp @@ -13,6 +13,7 @@ #include "CodeGenInstruction.h" #include "CodeGenTarget.h" +#include "InfoByHwMode.h" #include "llvm/ADT/APInt.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/CachedHashString.h" @@ -64,9 +65,10 @@ struct OperandInfo { std::vector Fields; std::string Decoder; bool HasCompleteDecoder; + uint64_t InitValue; OperandInfo(std::string D, bool HCD) - : Decoder(std::move(D)), HasCompleteDecoder(HCD) {} + : Decoder(std::move(D)), HasCompleteDecoder(HCD), InitValue(0) {} void addField(unsigned Base, unsigned Width, unsigned Offset) { Fields.push_back(EncodingField(Base, Width, Offset)); @@ -96,9 +98,11 @@ struct DecoderTableInfo { struct EncodingAndInst { const Record *EncodingDef; const CodeGenInstruction *Inst; + StringRef HwModeName; - EncodingAndInst(const Record *EncodingDef, const CodeGenInstruction *Inst) - : EncodingDef(EncodingDef), Inst(Inst) {} + EncodingAndInst(const Record *EncodingDef, const CodeGenInstruction *Inst, + StringRef HwModeName = "") + : EncodingDef(EncodingDef), Inst(Inst), HwModeName(HwModeName) {} }; struct EncodingIDAndOpcode { @@ -599,7 +603,7 @@ void Filter::recurse() { // Delegates to an inferior filter chooser for further processing on this // group of instructions whose segment values are variable. 
FilterChooserMap.insert( - std::make_pair(-1U, llvm::make_unique( + std::make_pair(-1U, std::make_unique( Owner->AllInstructions, VariableInstructions, Owner->Operands, BitValueArray, *Owner))); } @@ -625,7 +629,7 @@ void Filter::recurse() { // Delegates to an inferior filter chooser for further processing on this // category of instructions. FilterChooserMap.insert(std::make_pair( - Inst.first, llvm::make_unique( + Inst.first, std::make_unique( Owner->AllInstructions, Inst.second, Owner->Operands, BitValueArray, *Owner))); } @@ -1103,12 +1107,15 @@ void FilterChooser::emitBinaryParser(raw_ostream &o, unsigned &Indentation, bool &OpHasCompleteDecoder) const { const std::string &Decoder = OpInfo.Decoder; - if (OpInfo.numFields() != 1) - o.indent(Indentation) << "tmp = 0;\n"; + if (OpInfo.numFields() != 1 || OpInfo.InitValue != 0) { + o.indent(Indentation) << "tmp = 0x"; + o.write_hex(OpInfo.InitValue); + o << ";\n"; + } for (const EncodingField &EF : OpInfo) { o.indent(Indentation) << "tmp "; - if (OpInfo.numFields() != 1) o << '|'; + if (OpInfo.numFields() != 1 || OpInfo.InitValue != 0) o << '|'; o << "= fieldFromInstruction" << "(insn, " << EF.Base << ", " << EF.Width << ')'; if (OpInfo.numFields() != 1 || EF.Offset != 0) @@ -2026,6 +2033,16 @@ populateInstruction(CodeGenTarget &Target, const Record &EncodingDef, HasCompleteDecoderBit->getValue() : true; OperandInfo OpInfo(Decoder, HasCompleteDecoder); + + // Some bits of the operand may be required to be 1 depending on the + // instruction's encoding. Collect those bits. + if (const RecordVal *EncodedValue = EncodingDef.getValue(Op.second)) + if (const BitsInit *OpBits = dyn_cast(EncodedValue->getValue())) + for (unsigned I = 0; I < OpBits->getNumBits(); ++I) + if (const BitInit *OpBit = dyn_cast(OpBits->getBit(I))) + if (OpBit->getValue()) + OpInfo.InitValue |= 1ULL << I; + unsigned Base = ~0U; unsigned Width = 0; unsigned Offset = 0; @@ -2368,12 +2385,50 @@ void FixedLenDecoderEmitter::run(raw_ostream &o) { Target.reverseBitsForLittleEndianEncoding(); // Parameterize the decoders based on namespace and instruction width. + std::set HwModeNames; const auto &NumberedInstructions = Target.getInstructionsByEnumValue(); NumberedEncodings.reserve(NumberedInstructions.size()); DenseMap IndexOfInstruction; + // First, collect all HwModes referenced by the target. for (const auto &NumberedInstruction : NumberedInstructions) { IndexOfInstruction[NumberedInstruction->TheDef] = NumberedEncodings.size(); - NumberedEncodings.emplace_back(NumberedInstruction->TheDef, NumberedInstruction); + + if (const RecordVal *RV = + NumberedInstruction->TheDef->getValue("EncodingInfos")) { + if (auto *DI = dyn_cast_or_null(RV->getValue())) { + const CodeGenHwModes &HWM = Target.getHwModes(); + EncodingInfoByHwMode EBM(DI->getDef(), HWM); + for (auto &KV : EBM.Map) + HwModeNames.insert(HWM.getMode(KV.first).Name); + } + } + } + + // If HwModeNames is empty, add the empty string so we always have one HwMode. 
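The InitValue handling above changes what emitBinaryParser() writes into a generated decoder: encoding bits of an operand that are hardwired to 1 now seed tmp before the variable fields are OR'd in. Below is a self-contained sketch with made-up numbers (bit 2 hardwired to 1, the remaining two bits read from insn[5:4]); fieldFromInstruction() here is only a local stand-in for the helper the generated code calls.

#include <cstdint>

// Local stand-in for the generated helper.
static uint64_t fieldFromInstruction(uint64_t insn, unsigned Base,
                                     unsigned Width) {
  return (insn >> Base) & ((1ULL << Width) - 1);
}

// What emitBinaryParser() now produces for such an operand:
//   tmp = 0x4;
//   tmp |= fieldFromInstruction(insn, 4, 2);
// (previously the constant bit was simply not part of tmp).
uint64_t decodeExampleOperand(uint64_t insn) {
  uint64_t tmp = 0x4;
  tmp |= fieldFromInstruction(insn, 4, 2);
  return tmp;
}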
+ if (HwModeNames.empty()) + HwModeNames.insert(""); + + for (const auto &NumberedInstruction : NumberedInstructions) { + IndexOfInstruction[NumberedInstruction->TheDef] = NumberedEncodings.size(); + + if (const RecordVal *RV = + NumberedInstruction->TheDef->getValue("EncodingInfos")) { + if (DefInit *DI = dyn_cast_or_null(RV->getValue())) { + const CodeGenHwModes &HWM = Target.getHwModes(); + EncodingInfoByHwMode EBM(DI->getDef(), HWM); + for (auto &KV : EBM.Map) { + NumberedEncodings.emplace_back(KV.second, NumberedInstruction, + HWM.getMode(KV.first).Name); + HwModeNames.insert(HWM.getMode(KV.first).Name); + } + continue; + } + } + // This instruction is encoded the same on all HwModes. Emit it for all + // HwModes. + for (StringRef HwModeName : HwModeNames) + NumberedEncodings.emplace_back(NumberedInstruction->TheDef, + NumberedInstruction, HwModeName); } for (const auto &NumberedAlias : RK.getAllDerivedDefinitions("AdditionalEncoding")) NumberedEncodings.emplace_back( @@ -2401,13 +2456,19 @@ void FixedLenDecoderEmitter::run(raw_ostream &o) { NumInstructions++; NumEncodings++; - StringRef DecoderNamespace = EncodingDef->getValueAsString("DecoderNamespace"); + if (!Size) + continue; - if (Size) { - if (populateInstruction(Target, *EncodingDef, *Inst, i, Operands)) { - OpcMap[std::make_pair(DecoderNamespace, Size)].emplace_back(i, IndexOfInstruction.find(Def)->second); - } else - NumEncodingsOmitted++; + if (populateInstruction(Target, *EncodingDef, *Inst, i, Operands)) { + std::string DecoderNamespace = + EncodingDef->getValueAsString("DecoderNamespace"); + if (!NumberedEncodings[i].HwModeName.empty()) + DecoderNamespace += + std::string("_") + NumberedEncodings[i].HwModeName.str(); + OpcMap[std::make_pair(DecoderNamespace, Size)].emplace_back( + i, IndexOfInstruction.find(Def)->second); + } else { + NumEncodingsOmitted++; } } @@ -2451,7 +2512,7 @@ void FixedLenDecoderEmitter::run(raw_ostream &o) { // Emit the main entry point for the decoder, decodeInstruction(). emitDecodeInstruction(OS); - OS << "\n} // End llvm namespace\n"; + OS << "\n} // end namespace llvm\n"; } namespace llvm { diff --git a/utils/TableGen/GICombinerEmitter.cpp b/utils/TableGen/GICombinerEmitter.cpp new file mode 100644 index 000000000000..5dc4d6b07740 --- /dev/null +++ b/utils/TableGen/GICombinerEmitter.cpp @@ -0,0 +1,452 @@ +//===- GlobalCombinerEmitter.cpp - Generate a combiner --------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// \file Generate a combiner implementation for GlobalISel from a declarative +/// syntax +/// +//===----------------------------------------------------------------------===// + +#include "llvm/ADT/Statistic.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Timer.h" +#include "llvm/TableGen/Error.h" +#include "llvm/TableGen/StringMatcher.h" +#include "llvm/TableGen/TableGenBackend.h" +#include "CodeGenTarget.h" +#include "GlobalISel/CodeExpander.h" +#include "GlobalISel/CodeExpansions.h" + +using namespace llvm; + +#define DEBUG_TYPE "gicombiner-emitter" + +// FIXME: Use ALWAYS_ENABLED_STATISTIC once it's available. 
+unsigned NumPatternTotal = 0; +STATISTIC(NumPatternTotalStatistic, "Total number of patterns"); + +cl::OptionCategory + GICombinerEmitterCat("Options for -gen-global-isel-combiner"); +static cl::list + SelectedCombiners("combiners", cl::desc("Emit the specified combiners"), + cl::cat(GICombinerEmitterCat), cl::CommaSeparated); +static cl::opt ShowExpansions( + "gicombiner-show-expansions", + cl::desc("Use C++ comments to indicate occurence of code expansion"), + cl::cat(GICombinerEmitterCat)); + +namespace { +typedef uint64_t RuleID; + +class RootInfo { + StringRef PatternSymbol; + +public: + RootInfo(StringRef PatternSymbol) : PatternSymbol(PatternSymbol) {} + + StringRef getPatternSymbol() const { return PatternSymbol; } +}; + +class CombineRule { +protected: + /// A unique ID for this rule + /// ID's are used for debugging and run-time disabling of rules among other + /// things. + RuleID ID; + + /// The record defining this rule. + const Record &TheDef; + + /// The roots of a match. These are the leaves of the DAG that are closest to + /// the end of the function. I.e. the nodes that are encountered without + /// following any edges of the DAG described by the pattern as we work our way + /// from the bottom of the function to the top. + std::vector Roots; + + /// A block of arbitrary C++ to finish testing the match. + /// FIXME: This is a temporary measure until we have actual pattern matching + const CodeInit *MatchingFixupCode = nullptr; +public: + CombineRule(const CodeGenTarget &Target, RuleID ID, const Record &R) + : ID(ID), TheDef(R) {} + bool parseDefs(); + bool parseMatcher(const CodeGenTarget &Target); + + RuleID getID() const { return ID; } + StringRef getName() const { return TheDef.getName(); } + const Record &getDef() const { return TheDef; } + const CodeInit *getMatchingFixupCode() const { return MatchingFixupCode; } + size_t getNumRoots() const { return Roots.size(); } + + using const_root_iterator = std::vector::const_iterator; + const_root_iterator roots_begin() const { return Roots.begin(); } + const_root_iterator roots_end() const { return Roots.end(); } + iterator_range roots() const { + return llvm::make_range(Roots.begin(), Roots.end()); + } +}; + +/// A convenience function to check that an Init refers to a specific def. This +/// is primarily useful for testing for defs and similar in DagInit's since +/// DagInit's support any type inside them. +static bool isSpecificDef(const Init &N, StringRef Def) { + if (const DefInit *OpI = dyn_cast(&N)) + if (OpI->getDef()->getName() == Def) + return true; + return false; +} + +/// A convenience function to check that an Init refers to a def that is a +/// subclass of the given class and coerce it to a def if it is. This is +/// primarily useful for testing for subclasses of GIMatchKind and similar in +/// DagInit's since DagInit's support any type inside them. 
+static Record *getDefOfSubClass(const Init &N, StringRef Cls) { + if (const DefInit *OpI = dyn_cast(&N)) + if (OpI->getDef()->isSubClassOf(Cls)) + return OpI->getDef(); + return nullptr; +} + +bool CombineRule::parseDefs() { + NamedRegionTimer T("parseDefs", "Time spent parsing the defs", "Rule Parsing", + "Time spent on rule parsing", TimeRegions); + DagInit *Defs = TheDef.getValueAsDag("Defs"); + + if (Defs->getOperatorAsDef(TheDef.getLoc())->getName() != "defs") { + PrintError(TheDef.getLoc(), "Expected defs operator"); + return false; + } + + for (unsigned I = 0, E = Defs->getNumArgs(); I < E; ++I) { + // Roots should be collected into Roots + if (isSpecificDef(*Defs->getArg(I), "root")) { + Roots.emplace_back(Defs->getArgNameStr(I)); + continue; + } + + // Otherwise emit an appropriate error message. + if (getDefOfSubClass(*Defs->getArg(I), "GIDefKind")) + PrintError(TheDef.getLoc(), + "This GIDefKind not implemented in tablegen"); + else if (getDefOfSubClass(*Defs->getArg(I), "GIDefKindWithArgs")) + PrintError(TheDef.getLoc(), + "This GIDefKindWithArgs not implemented in tablegen"); + else + PrintError(TheDef.getLoc(), + "Expected a subclass of GIDefKind or a sub-dag whose " + "operator is of type GIDefKindWithArgs"); + return false; + } + + if (Roots.empty()) { + PrintError(TheDef.getLoc(), "Combine rules must have at least one root"); + return false; + } + return true; +} + +bool CombineRule::parseMatcher(const CodeGenTarget &Target) { + NamedRegionTimer T("parseMatcher", "Time spent parsing the matcher", + "Rule Parsing", "Time spent on rule parsing", TimeRegions); + DagInit *Matchers = TheDef.getValueAsDag("Match"); + + if (Matchers->getOperatorAsDef(TheDef.getLoc())->getName() != "match") { + PrintError(TheDef.getLoc(), "Expected match operator"); + return false; + } + + if (Matchers->getNumArgs() == 0) { + PrintError(TheDef.getLoc(), "Matcher is empty"); + return false; + } + + // The match section consists of a list of matchers and predicates. Parse each + // one and add the equivalent GIMatchDag nodes, predicates, and edges. + for (unsigned I = 0; I < Matchers->getNumArgs(); ++I) { + + // Parse arbitrary C++ code we have in lieu of supporting MIR matching + if (const CodeInit *CodeI = dyn_cast(Matchers->getArg(I))) { + assert(!MatchingFixupCode && + "Only one block of arbitrary code is currently permitted"); + MatchingFixupCode = CodeI; + continue; + } + + PrintError(TheDef.getLoc(), + "Expected a subclass of GIMatchKind or a sub-dag whose " + "operator is either of a GIMatchKindWithArgs or Instruction"); + PrintNote("Pattern was `" + Matchers->getArg(I)->getAsString() + "'"); + return false; + } + return true; +} + +class GICombinerEmitter { + StringRef Name; + const CodeGenTarget &Target; + Record *Combiner; + std::vector> Rules; + std::unique_ptr makeCombineRule(const Record &R); + + void gatherRules(std::vector> &ActiveRules, + const std::vector &&RulesAndGroups); + +public: + explicit GICombinerEmitter(RecordKeeper &RK, const CodeGenTarget &Target, + StringRef Name, Record *Combiner); + ~GICombinerEmitter() {} + + StringRef getClassName() const { + return Combiner->getValueAsString("Classname"); + } + void run(raw_ostream &OS); + + /// Emit the name matcher (guarded by #ifndef NDEBUG) used to disable rules in + /// response to the generated cl::opt. 
+ void emitNameMatcher(raw_ostream &OS) const; + void generateCodeForRule(raw_ostream &OS, const CombineRule *Rule, + StringRef Indent) const; +}; + +GICombinerEmitter::GICombinerEmitter(RecordKeeper &RK, + const CodeGenTarget &Target, + StringRef Name, Record *Combiner) + : Name(Name), Target(Target), Combiner(Combiner) {} + +void GICombinerEmitter::emitNameMatcher(raw_ostream &OS) const { + std::vector> Cases; + Cases.reserve(Rules.size()); + + for (const CombineRule &EnumeratedRule : make_pointee_range(Rules)) { + std::string Code; + raw_string_ostream SS(Code); + SS << "return " << EnumeratedRule.getID() << ";\n"; + Cases.push_back(std::make_pair(EnumeratedRule.getName(), SS.str())); + } + + OS << "static Optional getRuleIdxForIdentifier(StringRef " + "RuleIdentifier) {\n" + << " uint64_t I;\n" + << " // getAtInteger(...) returns false on success\n" + << " bool Parsed = !RuleIdentifier.getAsInteger(0, I);\n" + << " if (Parsed)\n" + << " return I;\n\n" + << "#ifndef NDEBUG\n"; + StringMatcher Matcher("RuleIdentifier", Cases, OS); + Matcher.Emit(); + OS << "#endif // ifndef NDEBUG\n\n" + << " return None;\n" + << "}\n"; +} + +std::unique_ptr +GICombinerEmitter::makeCombineRule(const Record &TheDef) { + std::unique_ptr Rule = + std::make_unique(Target, NumPatternTotal, TheDef); + + if (!Rule->parseDefs()) + return nullptr; + if (!Rule->parseMatcher(Target)) + return nullptr; + // For now, don't support multi-root rules. We'll come back to this later + // once we have the algorithm changes to support it. + if (Rule->getNumRoots() > 1) { + PrintError(TheDef.getLoc(), "Multi-root matches are not supported (yet)"); + return nullptr; + } + return Rule; +} + +/// Recurse into GICombineGroup's and flatten the ruleset into a simple list. +void GICombinerEmitter::gatherRules( + std::vector> &ActiveRules, + const std::vector &&RulesAndGroups) { + for (Record *R : RulesAndGroups) { + if (R->isValueUnset("Rules")) { + std::unique_ptr Rule = makeCombineRule(*R); + if (Rule == nullptr) { + PrintError(R->getLoc(), "Failed to parse rule"); + continue; + } + ActiveRules.emplace_back(std::move(Rule)); + ++NumPatternTotal; + } else + gatherRules(ActiveRules, R->getValueAsListOfDefs("Rules")); + } +} + +void GICombinerEmitter::generateCodeForRule(raw_ostream &OS, + const CombineRule *Rule, + StringRef Indent) const { + { + const Record &RuleDef = Rule->getDef(); + + OS << Indent << "// Rule: " << RuleDef.getName() << "\n" + << Indent << "if (!isRuleDisabled(" << Rule->getID() << ")) {\n"; + + CodeExpansions Expansions; + for (const RootInfo &Root : Rule->roots()) { + Expansions.declare(Root.getPatternSymbol(), "MI"); + } + DagInit *Applyer = RuleDef.getValueAsDag("Apply"); + if (Applyer->getOperatorAsDef(RuleDef.getLoc())->getName() != + "apply") { + PrintError(RuleDef.getLoc(), "Expected apply operator"); + return; + } + + OS << Indent << " if (1\n"; + + if (Rule->getMatchingFixupCode() && + !Rule->getMatchingFixupCode()->getValue().empty()) { + // FIXME: Single-use lambda's like this are a serious compile-time + // performance and memory issue. It's convenient for this early stage to + // defer some work to successive patches but we need to eliminate this + // before the ruleset grows to small-moderate size. Last time, it became + // a big problem for low-mem systems around the 500 rule mark but by the + // time we grow that large we should have merged the ISel match table + // mechanism with the Combiner. 
+ OS << Indent << " && [&]() {\n" + << Indent << " " + << CodeExpander(Rule->getMatchingFixupCode()->getValue(), Expansions, + Rule->getMatchingFixupCode()->getLoc(), ShowExpansions) + << "\n" + << Indent << " return true;\n" + << Indent << " }()"; + } + OS << ") {\n" << Indent << " "; + + if (const CodeInit *Code = dyn_cast(Applyer->getArg(0))) { + OS << CodeExpander(Code->getAsUnquotedString(), Expansions, + Code->getLoc(), ShowExpansions) + << "\n" + << Indent << " return true;\n" + << Indent << " }\n"; + } else { + PrintError(RuleDef.getLoc(), "Expected apply code block"); + return; + } + + OS << Indent << "}\n"; + } +} + +void GICombinerEmitter::run(raw_ostream &OS) { + gatherRules(Rules, Combiner->getValueAsListOfDefs("Rules")); + NamedRegionTimer T("Emit", "Time spent emitting the combiner", + "Code Generation", "Time spent generating code", + TimeRegions); + OS << "#ifdef " << Name.upper() << "_GENCOMBINERHELPER_DEPS\n" + << "#include \"llvm/ADT/SparseBitVector.h\"\n" + << "namespace llvm {\n" + << "extern cl::OptionCategory GICombinerOptionCategory;\n" + << "} // end namespace llvm\n" + << "#endif // ifdef " << Name.upper() << "_GENCOMBINERHELPER_DEPS\n\n"; + + OS << "#ifdef " << Name.upper() << "_GENCOMBINERHELPER_H\n" + << "class " << getClassName() << " {\n" + << " SparseBitVector<> DisabledRules;\n" + << "\n" + << "public:\n" + << " bool parseCommandLineOption();\n" + << " bool isRuleDisabled(unsigned ID) const;\n" + << " bool setRuleDisabled(StringRef RuleIdentifier);\n" + << "\n" + << " bool tryCombineAll(\n" + << " GISelChangeObserver &Observer,\n" + << " MachineInstr &MI,\n" + << " MachineIRBuilder &B) const;\n" + << "};\n\n"; + + emitNameMatcher(OS); + + OS << "bool " << getClassName() + << "::setRuleDisabled(StringRef RuleIdentifier) {\n" + << " std::pair RangePair = " + "RuleIdentifier.split('-');\n" + << " if (!RangePair.second.empty()) {\n" + << " const auto First = getRuleIdxForIdentifier(RangePair.first);\n" + << " const auto Last = getRuleIdxForIdentifier(RangePair.second);\n" + << " if (!First.hasValue() || !Last.hasValue())\n" + << " return false;\n" + << " if (First >= Last)\n" + << " report_fatal_error(\"Beginning of range should be before end of " + "range\");\n" + << " for (auto I = First.getValue(); I < Last.getValue(); ++I)\n" + << " DisabledRules.set(I);\n" + << " return true;\n" + << " } else {\n" + << " const auto I = getRuleIdxForIdentifier(RangePair.first);\n" + << " if (!I.hasValue())\n" + << " return false;\n" + << " DisabledRules.set(I.getValue());\n" + << " return true;\n" + << " }\n" + << " return false;\n" + << "}\n"; + + OS << "bool " << getClassName() + << "::isRuleDisabled(unsigned RuleID) const {\n" + << " return DisabledRules.test(RuleID);\n" + << "}\n"; + OS << "#endif // ifdef " << Name.upper() << "_GENCOMBINERHELPER_H\n\n"; + + OS << "#ifdef " << Name.upper() << "_GENCOMBINERHELPER_CPP\n" + << "\n" + << "cl::list " << Name << "Option(\n" + << " \"" << Name.lower() << "-disable-rule\",\n" + << " cl::desc(\"Disable one or more combiner rules temporarily in " + << "the " << Name << " pass\"),\n" + << " cl::CommaSeparated,\n" + << " cl::Hidden,\n" + << " cl::cat(GICombinerOptionCategory));\n" + << "\n" + << "bool " << getClassName() << "::parseCommandLineOption() {\n" + << " for (const auto &Identifier : " << Name << "Option)\n" + << " if (!setRuleDisabled(Identifier))\n" + << " return false;\n" + << " return true;\n" + << "}\n\n"; + + OS << "bool " << getClassName() << "::tryCombineAll(\n" + << " GISelChangeObserver &Observer,\n" + << " 
MachineInstr &MI,\n" + << " MachineIRBuilder &B) const {\n" + << " CombinerHelper Helper(Observer, B);\n" + << " MachineBasicBlock *MBB = MI.getParent();\n" + << " MachineFunction *MF = MBB->getParent();\n" + << " MachineRegisterInfo &MRI = MF->getRegInfo();\n" + << " (void)MBB; (void)MF; (void)MRI;\n\n"; + + for (const auto &Rule : Rules) + generateCodeForRule(OS, Rule.get(), " "); + OS << "\n return false;\n" + << "}\n" + << "#endif // ifdef " << Name.upper() << "_GENCOMBINERHELPER_CPP\n"; +} + +} // end anonymous namespace + +//===----------------------------------------------------------------------===// + +namespace llvm { +void EmitGICombiner(RecordKeeper &RK, raw_ostream &OS) { + CodeGenTarget Target(RK); + emitSourceFileHeader("Global Combiner", OS); + + if (SelectedCombiners.empty()) + PrintFatalError("No combiners selected with -combiners"); + for (const auto &Combiner : SelectedCombiners) { + Record *CombinerDef = RK.getDef(Combiner); + if (!CombinerDef) + PrintFatalError("Could not find " + Combiner); + GICombinerEmitter(RK, Target, Combiner, CombinerDef).run(OS); + } + NumPatternTotalStatistic = NumPatternTotal; +} + +} // namespace llvm diff --git a/utils/TableGen/GlobalISel/CMakeLists.txt b/utils/TableGen/GlobalISel/CMakeLists.txt new file mode 100644 index 000000000000..2f74d1087bcd --- /dev/null +++ b/utils/TableGen/GlobalISel/CMakeLists.txt @@ -0,0 +1,7 @@ +set(LLVM_LINK_COMPONENTS + Support + ) + +llvm_add_library(LLVMTableGenGlobalISel STATIC DISABLE_LLVM_LINK_LLVM_DYLIB + CodeExpander.cpp + ) diff --git a/utils/TableGen/GlobalISel/CodeExpander.cpp b/utils/TableGen/GlobalISel/CodeExpander.cpp new file mode 100644 index 000000000000..d59a9b8e3b65 --- /dev/null +++ b/utils/TableGen/GlobalISel/CodeExpander.cpp @@ -0,0 +1,93 @@ +//===- CodeExpander.cpp - Expand variables in a string --------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// \file Expand the variables in a string. +// +//===----------------------------------------------------------------------===// + +#include "CodeExpander.h" +#include "CodeExpansions.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/TableGen/Error.h" + +using namespace llvm; + +void CodeExpander::emit(raw_ostream &OS) const { + StringRef Current = Code; + + while (!Current.empty()) { + size_t Pos = Current.find_first_of("$\n\\"); + if (Pos == StringRef::npos) { + OS << Current; + Current = ""; + continue; + } + + OS << Current.substr(0, Pos); + Current = Current.substr(Pos); + + if (Current.startswith("\n")) { + OS << "\n" << Indent; + Current = Current.drop_front(1); + continue; + } + + if (Current.startswith("\\$") || Current.startswith("\\\\")) { + OS << Current[1]; + Current = Current.drop_front(2); + continue; + } + + if (Current.startswith("\\")) { + Current = Current.drop_front(1); + continue; + } + + if (Current.startswith("${")) { + StringRef StartVar = Current; + Current = Current.drop_front(2); + StringRef Var; + std::tie(Var, Current) = Current.split("}"); + + // Warn if we split because no terminator was found. + StringRef EndVar = StartVar.drop_front(2 /* ${ */ + Var.size()); + if (EndVar.empty()) { + size_t LocOffset = StartVar.data() - Code.data(); + PrintWarning( + Loc.size() > 0 && Loc[0].isValid() + ? 
SMLoc::getFromPointer(Loc[0].getPointer() + LocOffset) + : SMLoc(), + "Unterminated expansion"); + } + + auto ValueI = Expansions.find(Var); + if (ValueI == Expansions.end()) { + size_t LocOffset = StartVar.data() - Code.data(); + PrintError(Loc.size() > 0 && Loc[0].isValid() + ? SMLoc::getFromPointer(Loc[0].getPointer() + LocOffset) + : SMLoc(), + "Attempting to expand an undeclared variable " + Var); + } + if (ShowExpansions) + OS << "/*$" << Var << "{*/"; + OS << Expansions.lookup(Var); + if (ShowExpansions) + OS << "/*}*/"; + continue; + } + + size_t LocOffset = Current.data() - Code.data(); + PrintWarning(Loc.size() > 0 && Loc[0].isValid() + ? SMLoc::getFromPointer(Loc[0].getPointer() + LocOffset) + : SMLoc(), + "Assuming missing escape character"); + OS << "$"; + Current = Current.drop_front(1); + } +} diff --git a/utils/TableGen/GlobalISel/CodeExpander.h b/utils/TableGen/GlobalISel/CodeExpander.h new file mode 100644 index 000000000000..bd6946de5925 --- /dev/null +++ b/utils/TableGen/GlobalISel/CodeExpander.h @@ -0,0 +1,55 @@ +//===- CodeExpander.h - Expand variables in a string ----------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// \file Expand the variables in a string. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_UTILS_TABLEGEN_CODEEXPANDER_H +#define LLVM_UTILS_TABLEGEN_CODEEXPANDER_H + +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/SMLoc.h" + +namespace llvm { +class CodeExpansions; +class raw_ostream; + +/// Emit the given code with all '${foo}' placeholders expanded to their +/// replacements. +/// +/// It's an error to use an undefined expansion and expansion-like output that +/// needs to be emitted verbatim can be escaped as '\${foo}' +/// +/// The emitted code can be given a custom indent to enable both indentation by +/// an arbitrary amount of whitespace and emission of the code as a comment. +class CodeExpander { + StringRef Code; + const CodeExpansions &Expansions; + const ArrayRef &Loc; + bool ShowExpansions; + StringRef Indent; + +public: + CodeExpander(StringRef Code, const CodeExpansions &Expansions, + const ArrayRef &Loc, bool ShowExpansions, + StringRef Indent = " ") + : Code(Code), Expansions(Expansions), Loc(Loc), + ShowExpansions(ShowExpansions), Indent(Indent) {} + + void emit(raw_ostream &OS) const; +}; + +inline raw_ostream &operator<<(raw_ostream &OS, const CodeExpander &Expander) { + Expander.emit(OS); + return OS; +} +} // end namespace llvm + +#endif // ifndef LLVM_UTILS_TABLEGEN_CODEEXPANDER_H diff --git a/utils/TableGen/GlobalISel/CodeExpansions.h b/utils/TableGen/GlobalISel/CodeExpansions.h new file mode 100644 index 000000000000..bb890ec8f57e --- /dev/null +++ b/utils/TableGen/GlobalISel/CodeExpansions.h @@ -0,0 +1,43 @@ +//===- CodeExpansions.h - Record expansions for CodeExpander --------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// \file Record the expansions to use in a CodeExpander. 
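CodeExpansions, whose definition follows, holds the variable bindings that CodeExpander substitutes for '${...}' placeholders according to the escape rules in emit() above. A small hypothetical use, in the spirit of how GICombinerEmitter binds each pattern root to "MI"; the variable names and the snippet being expanded are invented:

#include "GlobalISel/CodeExpander.h"
#include "GlobalISel/CodeExpansions.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/Support/SMLoc.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;

void expandExample(raw_ostream &OS) {
  CodeExpansions Vars;
  Vars.declare("root", "MI");
  Vars.declare("builder", "B");
  ArrayRef<SMLoc> NoLoc; // no source location available for diagnostics here
  // Prints "B.setInstr(*MI);" and then "return true;" indented by two spaces.
  CodeExpander E("${builder}.setInstr(*${root});\nreturn true;", Vars, NoLoc,
                 /*ShowExpansions=*/false, /*Indent=*/"  ");
  OS << E;
}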
+// +//===----------------------------------------------------------------------===// + +#include "llvm/ADT/StringMap.h" + +#ifndef LLVM_UTILS_TABLEGEN_CODEEXPANSIONS_H +#define LLVM_UTILS_TABLEGEN_CODEEXPANSIONS_H +namespace llvm { +class CodeExpansions { +public: + using const_iterator = StringMap::const_iterator; + +protected: + StringMap Expansions; + +public: + void declare(StringRef Name, StringRef Expansion) { + bool Inserted = Expansions.try_emplace(Name, Expansion).second; + assert(Inserted && "Declared variable twice"); + (void)Inserted; + } + + std::string lookup(StringRef Variable) const { + return Expansions.lookup(Variable); + } + + const_iterator begin() const { return Expansions.begin(); } + const_iterator end() const { return Expansions.end(); } + const_iterator find(StringRef Variable) const { + return Expansions.find(Variable); + } +}; +} // end namespace llvm +#endif // ifndef LLVM_UTILS_TABLEGEN_CODEEXPANSIONS_H diff --git a/utils/TableGen/GlobalISelEmitter.cpp b/utils/TableGen/GlobalISelEmitter.cpp index f1c02134198b..d8d4c9f4f55c 100644 --- a/utils/TableGen/GlobalISelEmitter.cpp +++ b/utils/TableGen/GlobalISelEmitter.cpp @@ -249,6 +249,10 @@ static std::string explainPredicates(const TreePatternNode *N) { OS << ']'; } + int64_t MinAlign = P.getMinAlignment(); + if (MinAlign > 0) + Explanation += " MinAlign=" + utostr(MinAlign); + if (P.isAtomicOrderingMonotonic()) Explanation += " monotonic"; if (P.isAtomicOrderingAcquire()) @@ -329,6 +333,9 @@ static Error isTrivialOperatorNode(const TreePatternNode *N) { const ListInit *AddrSpaces = Predicate.getAddressSpaces(); if (AddrSpaces && !AddrSpaces->empty()) continue; + + if (Predicate.getMinAlignment() > 0) + continue; } if (Predicate.isAtomic() && Predicate.getMemoryVT()) @@ -822,6 +829,10 @@ protected: /// the renderers. StringMap DefinedOperands; + /// A map of anonymous physical register operands defined by the matchers that + /// may be referenced by the renderers. 
+ DenseMap PhysRegOperands; + /// ID for the next instruction variable defined with implicitlyDefineInsnVar() unsigned NextInsnVarID; @@ -904,6 +915,8 @@ public: void defineOperand(StringRef SymbolicName, OperandMatcher &OM); + void definePhysRegOperand(Record *Reg, OperandMatcher &OM); + Error defineComplexSubOperand(StringRef SymbolicName, Record *ComplexPattern, unsigned RendererID, unsigned SubOperandID) { if (ComplexSubOperands.count(SymbolicName)) @@ -927,6 +940,7 @@ public: InstructionMatcher &getInstructionMatcher(StringRef SymbolicName) const; const OperandMatcher &getOperandMatcher(StringRef Name) const; + const OperandMatcher &getPhysRegOperandMatcher(Record *) const; void optimize() override; void emit(MatchTable &Table) override; @@ -1048,14 +1062,17 @@ public: IPM_Opcode, IPM_NumOperands, IPM_ImmPredicate, + IPM_Imm, IPM_AtomicOrderingMMO, IPM_MemoryLLTSize, IPM_MemoryVsLLTSize, IPM_MemoryAddressSpace, + IPM_MemoryAlignment, IPM_GenericPredicate, OPM_SameOperand, OPM_ComplexPattern, OPM_IntrinsicID, + OPM_CmpPredicate, OPM_Instruction, OPM_Int, OPM_LiteralInt, @@ -1324,6 +1341,23 @@ public: } }; +class ImmOperandMatcher : public OperandPredicateMatcher { +public: + ImmOperandMatcher(unsigned InsnVarID, unsigned OpIdx) + : OperandPredicateMatcher(IPM_Imm, InsnVarID, OpIdx) {} + + static bool classof(const PredicateMatcher *P) { + return P->getKind() == IPM_Imm; + } + + void emitPredicateOpcodes(MatchTable &Table, + RuleMatcher &Rule) const override { + Table << MatchTable::Opcode("GIM_CheckIsImm") << MatchTable::Comment("MI") + << MatchTable::IntValue(InsnVarID) << MatchTable::Comment("Op") + << MatchTable::IntValue(OpIdx) << MatchTable::LineBreak; + } +}; + /// Generates code to check that an operand is a G_CONSTANT with a particular /// int. class ConstantIntOperandMatcher : public OperandPredicateMatcher { @@ -1381,6 +1415,36 @@ public: } }; +/// Generates code to check that an operand is an CmpInst predicate +class CmpPredicateOperandMatcher : public OperandPredicateMatcher { +protected: + std::string PredName; + +public: + CmpPredicateOperandMatcher(unsigned InsnVarID, unsigned OpIdx, + std::string P) + : OperandPredicateMatcher(OPM_CmpPredicate, InsnVarID, OpIdx), PredName(P) {} + + bool isIdentical(const PredicateMatcher &B) const override { + return OperandPredicateMatcher::isIdentical(B) && + PredName == cast(&B)->PredName; + } + + static bool classof(const PredicateMatcher *P) { + return P->getKind() == OPM_CmpPredicate; + } + + void emitPredicateOpcodes(MatchTable &Table, + RuleMatcher &Rule) const override { + Table << MatchTable::Opcode("GIM_CheckCmpPredicate") + << MatchTable::Comment("MI") << MatchTable::IntValue(InsnVarID) + << MatchTable::Comment("Op") << MatchTable::IntValue(OpIdx) + << MatchTable::Comment("Predicate") + << MatchTable::NamedValue("CmpInst", PredName) + << MatchTable::LineBreak; + } +}; + /// Generates code to check that an operand is an intrinsic ID. class IntrinsicIDOperandMatcher : public OperandPredicateMatcher { protected: @@ -1442,7 +1506,7 @@ public: Optional addPredicate(Args &&... 
args) { if (isSameAsAnotherOperand()) return None; - Predicates.emplace_back(llvm::make_unique( + Predicates.emplace_back(std::make_unique( getInsnVarID(), getOpIdx(), std::forward(args)...)); return static_cast(Predicates.back().get()); } @@ -1849,6 +1913,40 @@ public: } }; +class MemoryAlignmentPredicateMatcher : public InstructionPredicateMatcher { +protected: + unsigned MMOIdx; + int MinAlign; + +public: + MemoryAlignmentPredicateMatcher(unsigned InsnVarID, unsigned MMOIdx, + int MinAlign) + : InstructionPredicateMatcher(IPM_MemoryAlignment, InsnVarID), + MMOIdx(MMOIdx), MinAlign(MinAlign) { + assert(MinAlign > 0); + } + + static bool classof(const PredicateMatcher *P) { + return P->getKind() == IPM_MemoryAlignment; + } + + bool isIdentical(const PredicateMatcher &B) const override { + if (!InstructionPredicateMatcher::isIdentical(B)) + return false; + auto *Other = cast(&B); + return MMOIdx == Other->MMOIdx && MinAlign == Other->MinAlign; + } + + void emitPredicateOpcodes(MatchTable &Table, + RuleMatcher &Rule) const override { + Table << MatchTable::Opcode("GIM_CheckMemoryAlignment") + << MatchTable::Comment("MI") << MatchTable::IntValue(InsnVarID) + << MatchTable::Comment("MMO") << MatchTable::IntValue(MMOIdx) + << MatchTable::Comment("MinAlign") << MatchTable::IntValue(MinAlign) + << MatchTable::LineBreak; + } +}; + /// Generates code to check that the size of an MMO is less-than, equal-to, or /// greater than a given LLT. class MemoryVsLLTSizePredicateMatcher : public InstructionPredicateMatcher { @@ -1945,6 +2043,11 @@ protected: std::string SymbolicName; unsigned InsnVarID; + /// PhysRegInputs - List list has an entry for each explicitly specified + /// physreg input to the pattern. The first elt is the Register node, the + /// second is the recorded slot number the input pattern match saved it in. + SmallVector, 2> PhysRegInputs; + public: InstructionMatcher(RuleMatcher &Rule, StringRef SymbolicName) : Rule(Rule), SymbolicName(SymbolicName) { @@ -1957,7 +2060,7 @@ public: template Optional addPredicate(Args &&... args) { Predicates.emplace_back( - llvm::make_unique(getInsnVarID(), std::forward(args)...)); + std::make_unique(getInsnVarID(), std::forward(args)...)); return static_cast(Predicates.back().get()); } @@ -1986,6 +2089,20 @@ public: llvm_unreachable("Failed to lookup operand"); } + OperandMatcher &addPhysRegInput(Record *Reg, unsigned OpIdx, + unsigned TempOpIdx) { + assert(SymbolicName.empty()); + OperandMatcher *OM = new OperandMatcher(*this, OpIdx, "", TempOpIdx); + Operands.emplace_back(OM); + Rule.definePhysRegOperand(Reg, *OM); + PhysRegInputs.emplace_back(Reg, OpIdx); + return *OM; + } + + ArrayRef> getPhysRegInputs() const { + return PhysRegInputs; + } + StringRef getSymbolicName() const { return SymbolicName; } unsigned getNumOperands() const { return Operands.size(); } OperandVec::iterator operands_begin() { return Operands.begin(); } @@ -2193,9 +2310,11 @@ public: OR_Copy, OR_CopyOrAddZeroReg, OR_CopySubReg, + OR_CopyPhysReg, OR_CopyConstantAsImm, OR_CopyFConstantAsFPImm, OR_Imm, + OR_SubRegIndex, OR_Register, OR_TempRegister, OR_ComplexPattern, @@ -2247,6 +2366,38 @@ public: } }; +/// A CopyRenderer emits code to copy a virtual register to a specific physical +/// register. 
+class CopyPhysRegRenderer : public OperandRenderer { +protected: + unsigned NewInsnID; + Record *PhysReg; + +public: + CopyPhysRegRenderer(unsigned NewInsnID, Record *Reg) + : OperandRenderer(OR_CopyPhysReg), NewInsnID(NewInsnID), + PhysReg(Reg) { + assert(PhysReg); + } + + static bool classof(const OperandRenderer *R) { + return R->getKind() == OR_CopyPhysReg; + } + + Record *getPhysReg() const { return PhysReg; } + + void emitRenderOpcodes(MatchTable &Table, RuleMatcher &Rule) const override { + const OperandMatcher &Operand = Rule.getPhysRegOperandMatcher(PhysReg); + unsigned OldInsnVarID = Rule.getInsnVarID(Operand.getInstructionMatcher()); + Table << MatchTable::Opcode("GIR_Copy") << MatchTable::Comment("NewInsnID") + << MatchTable::IntValue(NewInsnID) << MatchTable::Comment("OldInsnID") + << MatchTable::IntValue(OldInsnVarID) << MatchTable::Comment("OpIdx") + << MatchTable::IntValue(Operand.getOpIdx()) + << MatchTable::Comment(PhysReg->getName()) + << MatchTable::LineBreak; + } +}; + /// A CopyOrAddZeroRegRenderer emits code to copy a single operand from an /// existing instruction to the one being built. If the operand turns out to be /// a 'G_CONSTANT 0' then it replaces the operand with a zero register. @@ -2393,11 +2544,13 @@ class AddRegisterRenderer : public OperandRenderer { protected: unsigned InsnID; const Record *RegisterDef; + bool IsDef; public: - AddRegisterRenderer(unsigned InsnID, const Record *RegisterDef) - : OperandRenderer(OR_Register), InsnID(InsnID), RegisterDef(RegisterDef) { - } + AddRegisterRenderer(unsigned InsnID, const Record *RegisterDef, + bool IsDef = false) + : OperandRenderer(OR_Register), InsnID(InsnID), RegisterDef(RegisterDef), + IsDef(IsDef) {} static bool classof(const OperandRenderer *R) { return R->getKind() == OR_Register; @@ -2411,7 +2564,16 @@ public: ? RegisterDef->getValueAsString("Namespace") : ""), RegisterDef->getName()) - << MatchTable::LineBreak; + << MatchTable::Comment("AddRegisterRegFlags"); + + // TODO: This is encoded as a 64-bit element, but only 16 or 32-bits are + // really needed for a physical register reference. We can pack the + // register and flags in a single field. + if (IsDef) + Table << MatchTable::NamedValue("RegState::Define"); + else + Table << MatchTable::IntValue(0); + Table << MatchTable::LineBreak; } }; @@ -2467,6 +2629,28 @@ public: } }; +/// Adds an enum value for a subreg index to the instruction being built. +class SubRegIndexRenderer : public OperandRenderer { +protected: + unsigned InsnID; + const CodeGenSubRegIndex *SubRegIdx; + +public: + SubRegIndexRenderer(unsigned InsnID, const CodeGenSubRegIndex *SRI) + : OperandRenderer(OR_SubRegIndex), InsnID(InsnID), SubRegIdx(SRI) {} + + static bool classof(const OperandRenderer *R) { + return R->getKind() == OR_SubRegIndex; + } + + void emitRenderOpcodes(MatchTable &Table, RuleMatcher &Rule) const override { + Table << MatchTable::Opcode("GIR_AddImm") << MatchTable::Comment("InsnID") + << MatchTable::IntValue(InsnID) << MatchTable::Comment("SubRegIndex") + << MatchTable::IntValue(SubRegIdx->EnumValue) + << MatchTable::LineBreak; + } +}; + /// Adds operands by calling a renderer function supplied by the ComplexPattern /// matcher function. class RenderComplexPatternOperand : public OperandRenderer { @@ -2620,7 +2804,7 @@ public: template Kind &addRenderer(Args&&... 
args) { OperandRenderers.emplace_back( - llvm::make_unique(InsnID, std::forward(args)...)); + std::make_unique(InsnID, std::forward(args)...)); return *static_cast(OperandRenderers.back().get()); } @@ -2747,7 +2931,9 @@ private: public: MakeTempRegisterAction(const LLTCodeGen &Ty, unsigned TempRegID) - : Ty(Ty), TempRegID(TempRegID) {} + : Ty(Ty), TempRegID(TempRegID) { + KnownTypes.insert(Ty); + } void emitActionOpcodes(MatchTable &Table, RuleMatcher &Rule) const override { Table << MatchTable::Opcode("GIR_MakeTempReg") @@ -2781,7 +2967,7 @@ const std::vector &RuleMatcher::getRequiredFeatures() const { // iterator. template Kind &RuleMatcher::addAction(Args &&... args) { - Actions.emplace_back(llvm::make_unique(std::forward(args)...)); + Actions.emplace_back(std::make_unique(std::forward(args)...)); return *static_cast(Actions.back().get()); } @@ -2796,7 +2982,7 @@ template action_iterator RuleMatcher::insertAction(action_iterator InsertPt, Args &&... args) { return Actions.emplace(InsertPt, - llvm::make_unique(std::forward(args)...)); + std::make_unique(std::forward(args)...)); } unsigned RuleMatcher::implicitlyDefineInsnVar(InstructionMatcher &Matcher) { @@ -2823,6 +3009,13 @@ void RuleMatcher::defineOperand(StringRef SymbolicName, OperandMatcher &OM) { OM.addPredicate(OM.getSymbolicName()); } +void RuleMatcher::definePhysRegOperand(Record *Reg, OperandMatcher &OM) { + if (PhysRegOperands.find(Reg) == PhysRegOperands.end()) { + PhysRegOperands[Reg] = &OM; + return; + } +} + InstructionMatcher & RuleMatcher::getInstructionMatcher(StringRef SymbolicName) const { for (const auto &I : InsnVariableIDs) @@ -2832,6 +3025,18 @@ RuleMatcher::getInstructionMatcher(StringRef SymbolicName) const { ("Failed to lookup instruction " + SymbolicName).str().c_str()); } +const OperandMatcher & +RuleMatcher::getPhysRegOperandMatcher(Record *Reg) const { + const auto &I = PhysRegOperands.find(Reg); + + if (I == PhysRegOperands.end()) { + PrintFatalError(SrcLoc, "Register " + Reg->getName() + + " was not declared in matcher"); + } + + return *I->second; +} + const OperandMatcher & RuleMatcher::getOperandMatcher(StringRef Name) const { const auto &I = DefinedOperands.find(Name); @@ -3079,9 +3284,9 @@ private: bool OperandIsAPointer, unsigned OpIdx, unsigned &TempOpIdx); - Expected - createAndImportInstructionRenderer(RuleMatcher &M, - const TreePatternNode *Dst); + Expected createAndImportInstructionRenderer( + RuleMatcher &M, InstructionMatcher &InsnMatcher, + const TreePatternNode *Src, const TreePatternNode *Dst); Expected createAndImportSubInstructionRenderer( action_iterator InsertPt, RuleMatcher &M, const TreePatternNode *Dst, unsigned TempReg); @@ -3089,6 +3294,7 @@ private: createInstructionRenderer(action_iterator InsertPt, RuleMatcher &M, const TreePatternNode *Dst); void importExplicitDefRenderers(BuildMIAction &DstMIBuilder); + Expected importExplicitUseRenderers(action_iterator InsertPt, RuleMatcher &M, BuildMIAction &DstMIBuilder, @@ -3122,6 +3328,32 @@ private: MatchTable buildMatchTable(MutableArrayRef Rules, bool Optimize, bool WithCoverage); + /// Infer a CodeGenRegisterClass for the type of \p SuperRegNode. The returned + /// CodeGenRegisterClass will support the CodeGenRegisterClass of + /// \p SubRegNode, and the subregister index defined by \p SubRegIdxNode. + /// If no register class is found, return None. 
+ Optional + inferSuperRegisterClassForNode(const TypeSetByHwMode &Ty, + TreePatternNode *SuperRegNode, + TreePatternNode *SubRegIdxNode); + Optional + inferSubRegIndexForNode(TreePatternNode *SubRegIdxNode); + + /// Infer a CodeGenRegisterClass which suppoorts \p Ty and \p SubRegIdxNode. + /// Return None if no such class exists. + Optional + inferSuperRegisterClass(const TypeSetByHwMode &Ty, + TreePatternNode *SubRegIdxNode); + + /// Return the CodeGenRegisterClass associated with \p Leaf if it has one. + Optional + getRegClassFromLeaf(TreePatternNode *Leaf); + + /// Return a CodeGenRegisterClass for \p N if one can be found. Return None + /// otherwise. + Optional + inferRegClassFromPattern(TreePatternNode *N); + public: /// Takes a sequence of \p Rules and group them based on the predicates /// they share. \p MatcherStorage is used as a memory container @@ -3190,6 +3422,13 @@ Record *GlobalISelEmitter::findNodeEquiv(Record *N) const { const CodeGenInstruction * GlobalISelEmitter::getEquivNode(Record &Equiv, const TreePatternNode *N) const { + if (N->getNumChildren() >= 1) { + // setcc operation maps to two different G_* instructions based on the type. + if (!Equiv.isValueUnset("IfFloatingPoint") && + MVT(N->getChild(0)->getSimpleType(0)).isFloatingPoint()) + return &Target.getInstruction(Equiv.getValueAsDef("IfFloatingPoint")); + } + for (const TreePredicateCall &Call : N->getPredicateCalls()) { const TreePredicateFn &Predicate = Call.Fn; if (!Equiv.isValueUnset("IfSignExtend") && Predicate.isLoad() && @@ -3199,6 +3438,7 @@ GlobalISelEmitter::getEquivNode(Record &Equiv, const TreePatternNode *N) const { Predicate.isZeroExtLoad()) return &Target.getInstruction(Equiv.getValueAsDef("IfZeroExtend")); } + return &Target.getInstruction(Equiv.getValueAsDef("I")); } @@ -3212,7 +3452,7 @@ Error GlobalISelEmitter::importRulePredicates(RuleMatcher &M, ArrayRef Predicates) { for (const Predicate &P : Predicates) { - if (!P.Def) + if (!P.Def || P.getCondString().empty()) continue; declareSubtargetFeature(P.Def); M.addRequiredFeature(P.Def); @@ -3287,6 +3527,10 @@ Expected GlobalISelEmitter::createAndImportSelDAGMatcher( 0, ParsedAddrSpaces); } } + + int64_t MinAlign = Predicate.getMinAlignment(); + if (MinAlign > 0) + InsnMatcher.addPredicate(0, MinAlign); } // G_LOAD is used for both non-extending and any-extending loads. @@ -3301,11 +3545,19 @@ Expected GlobalISelEmitter::createAndImportSelDAGMatcher( continue; } - if (Predicate.isStore() && Predicate.isTruncStore()) { - // FIXME: If MemoryVT is set, we end up with 2 checks for the MMO size. - InsnMatcher.addPredicate( - 0, MemoryVsLLTSizePredicateMatcher::LessThan, 0); - continue; + if (Predicate.isStore()) { + if (Predicate.isTruncStore()) { + // FIXME: If MemoryVT is set, we end up with 2 checks for the MMO size. + InsnMatcher.addPredicate( + 0, MemoryVsLLTSizePredicateMatcher::LessThan, 0); + continue; + } + if (Predicate.isNonTruncStore()) { + // We need to check the sizes match here otherwise we could incorrectly + // match truncating stores with non-truncating ones. + InsnMatcher.addPredicate( + 0, MemoryVsLLTSizePredicateMatcher::EqualTo, 0); + } } // No check required. We already did it by swapping the opcode. 
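// ---[ Editorial aside: illustrative sketch, not part of this patch ]---------
// Roughly what the predicates imported above amount to once the generated
// selector runs. The interpreter for GIM_CheckMemoryAlignment and the
// MemoryVsLLTSize checks lives in InstructionSelectorImpl.h, not in this
// patch, so this is a paraphrase; MI, MRI and MinAlign are assumed in scope.
//
//   const MachineMemOperand &MMO = **MI.memoperands_begin();
//   if (MMO.getAlignment() < uint64_t(MinAlign))
//     return false;                  // minimum-alignment predicate
//   if (MMO.getSize() * 8 !=
//       MRI.getType(MI.getOperand(0).getReg()).getSizeInBits())
//     return false;                  // non-truncating store: MMO size must
//                                    // equal the stored value's size
// ----------------------------------------------------------------------------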
@@ -3405,6 +3657,10 @@ Expected GlobalISelEmitter::createAndImportSelDAGMatcher( } if (SrcGIEquivOrNull && SrcGIEquivOrNull->getValueAsBit("CheckMMOIsNonAtomic")) InsnMatcher.addPredicate("NotAtomic"); + else if (SrcGIEquivOrNull && SrcGIEquivOrNull->getValueAsBit("CheckMMOIsAtomic")) { + InsnMatcher.addPredicate( + "Unordered", AtomicOrderingMMOPredicateMatcher::AO_OrStronger); + } if (Src->isLeaf()) { Init *SrcInit = Src->getLeafValue(); @@ -3427,8 +3683,43 @@ Expected GlobalISelEmitter::createAndImportSelDAGMatcher( return InsnMatcher; } + // Special case because the operand order is changed from setcc. The + // predicate operand needs to be swapped from the last operand to the first + // source. + + unsigned NumChildren = Src->getNumChildren(); + bool IsFCmp = SrcGIOrNull->TheDef->getName() == "G_FCMP"; + + if (IsFCmp || SrcGIOrNull->TheDef->getName() == "G_ICMP") { + TreePatternNode *SrcChild = Src->getChild(NumChildren - 1); + if (SrcChild->isLeaf()) { + DefInit *DI = dyn_cast(SrcChild->getLeafValue()); + Record *CCDef = DI ? DI->getDef() : nullptr; + if (!CCDef || !CCDef->isSubClassOf("CondCode")) + return failedImport("Unable to handle CondCode"); + + OperandMatcher &OM = + InsnMatcher.addOperand(OpIdx++, SrcChild->getName(), TempOpIdx); + StringRef PredType = IsFCmp ? CCDef->getValueAsString("FCmpPredicate") : + CCDef->getValueAsString("ICmpPredicate"); + + if (!PredType.empty()) { + OM.addPredicate(PredType); + // Process the other 2 operands normally. + --NumChildren; + } + } + } + // Match the used operands (i.e. the children of the operator). - for (unsigned i = 0, e = Src->getNumChildren(); i != e; ++i) { + bool IsIntrinsic = + SrcGIOrNull->TheDef->getName() == "G_INTRINSIC" || + SrcGIOrNull->TheDef->getName() == "G_INTRINSIC_W_SIDE_EFFECTS"; + const CodeGenIntrinsic *II = Src->getIntrinsicInfo(CGP); + if (IsIntrinsic && !II) + return failedImport("Expected IntInit containing intrinsic ID)"); + + for (unsigned i = 0; i != NumChildren; ++i) { TreePatternNode *SrcChild = Src->getChild(i); // SelectionDAG allows pointers to be represented with iN since it doesn't @@ -3436,19 +3727,21 @@ Expected GlobalISelEmitter::createAndImportSelDAGMatcher( // Coerce integers to pointers to address space 0 if the context indicates a pointer. bool OperandIsAPointer = SrcGIOrNull->isOperandAPointer(i); - // For G_INTRINSIC/G_INTRINSIC_W_SIDE_EFFECTS, the operand immediately - // following the defs is an intrinsic ID. - if ((SrcGIOrNull->TheDef->getName() == "G_INTRINSIC" || - SrcGIOrNull->TheDef->getName() == "G_INTRINSIC_W_SIDE_EFFECTS") && - i == 0) { - if (const CodeGenIntrinsic *II = Src->getIntrinsicInfo(CGP)) { + if (IsIntrinsic) { + // For G_INTRINSIC/G_INTRINSIC_W_SIDE_EFFECTS, the operand immediately + // following the defs is an intrinsic ID. + if (i == 0) { OperandMatcher &OM = InsnMatcher.addOperand(OpIdx++, SrcChild->getName(), TempOpIdx); OM.addPredicate(II); continue; } - return failedImport("Expected IntInit containing instrinsic ID)"); + // We have to check intrinsics for llvm_anyptr_ty parameters. + // + // Note that we have to look at the i-1th parameter, because we don't + // have the intrinsic ID in the intrinsic's parameter list. + OperandIsAPointer |= II->isParamAPointer(i - 1); } if (auto Error = @@ -3473,14 +3766,37 @@ Error GlobalISelEmitter::importComplexPatternOperandMatcher( return Error::success(); } +// Get the name to use for a pattern operand. For an anonymous physical register +// input, this should use the register name. 
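// ---[ Editorial aside: illustrative sketch, not part of this patch ]---------
// Net effect of the physical-register plumbing added in this patch
// (addPhysRegInput, definePhysRegOperand, CopyPhysRegRenderer, and the COPY
// built per PhysRegInputs entry): before the destination instruction is
// emitted, the matched value is copied into the fixed register. Hand-written,
// that is roughly (MyTarget::SPECIAL_REG and MatchedReg are invented names;
// MI and TII are assumed in scope):
//
//   BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
//           TII.get(TargetOpcode::COPY))
//       .addReg(MyTarget::SPECIAL_REG, RegState::Define)  // GIR_AddRegister
//       .addReg(MatchedReg);                              // GIR_Copy
// ----------------------------------------------------------------------------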
+static StringRef getSrcChildName(const TreePatternNode *SrcChild, + Record *&PhysReg) { + StringRef SrcChildName = SrcChild->getName(); + if (SrcChildName.empty() && SrcChild->isLeaf()) { + if (auto *ChildDefInit = dyn_cast(SrcChild->getLeafValue())) { + auto *ChildRec = ChildDefInit->getDef(); + if (ChildRec->isSubClassOf("Register")) { + SrcChildName = ChildRec->getName(); + PhysReg = ChildRec; + } + } + } + + return SrcChildName; +} + Error GlobalISelEmitter::importChildMatcher(RuleMatcher &Rule, InstructionMatcher &InsnMatcher, const TreePatternNode *SrcChild, bool OperandIsAPointer, unsigned OpIdx, unsigned &TempOpIdx) { - OperandMatcher &OM = - InsnMatcher.addOperand(OpIdx, SrcChild->getName(), TempOpIdx); + + Record *PhysReg = nullptr; + StringRef SrcChildName = getSrcChildName(SrcChild, PhysReg); + + OperandMatcher &OM = PhysReg ? + InsnMatcher.addPhysRegInput(PhysReg, OpIdx, TempOpIdx) : + InsnMatcher.addOperand(OpIdx, SrcChildName, TempOpIdx); if (OM.isSameAsAnotherOperand()) return Error::success(); @@ -3496,6 +3812,10 @@ Error GlobalISelEmitter::importChildMatcher(RuleMatcher &Rule, OM.addPredicate(); return Error::success(); } + if (SrcChild->getOperator()->getName() == "timm") { + OM.addPredicate(); + return Error::success(); + } } } @@ -3569,6 +3889,20 @@ Error GlobalISelEmitter::importChildMatcher(RuleMatcher &Rule, return Error::success(); } + if (ChildRec->isSubClassOf("Register")) { + // This just be emitted as a copy to the specific register. + ValueTypeByHwMode VT = ChildTypes.front().getValueTypeByHwMode(); + const CodeGenRegisterClass *RC + = CGRegs.getMinimalPhysRegClass(ChildRec, &VT); + if (!RC) { + return failedImport( + "Could not determine physical register class of pattern source"); + } + + OM.addPredicate(*RC); + return Error::success(); + } + // Check for ValueType. if (ChildRec->isSubClassOf("ValueType")) { // We already added a type check as standard practice so this doesn't need @@ -3631,7 +3965,10 @@ Expected GlobalISelEmitter::importExplicitUseRenderer( // rendered as operands. // FIXME: The target should be able to choose sign-extended when appropriate // (e.g. on Mips). 
- if (DstChild->getOperator()->getName() == "imm") { + if (DstChild->getOperator()->getName() == "timm") { + DstMIBuilder.addRenderer(DstChild->getName()); + return InsertPt; + } else if (DstChild->getOperator()->getName() == "imm") { DstMIBuilder.addRenderer(DstChild->getName()); return InsertPt; } else if (DstChild->getOperator()->getName() == "fpimm") { @@ -3708,6 +4045,12 @@ Expected GlobalISelEmitter::importExplicitUseRenderer( return InsertPt; } + if (ChildRec->isSubClassOf("SubRegIndex")) { + CodeGenSubRegIndex *SubIdx = CGRegs.getSubRegIdx(ChildRec); + DstMIBuilder.addRenderer(SubIdx->EnumValue); + return InsertPt; + } + if (ChildRec->isSubClassOf("ComplexPattern")) { const auto &ComplexPattern = ComplexPatternEquivs.find(ChildRec); if (ComplexPattern == ComplexPatternEquivs.end()) @@ -3729,7 +4072,8 @@ Expected GlobalISelEmitter::importExplicitUseRenderer( } Expected GlobalISelEmitter::createAndImportInstructionRenderer( - RuleMatcher &M, const TreePatternNode *Dst) { + RuleMatcher &M, InstructionMatcher &InsnMatcher, const TreePatternNode *Src, + const TreePatternNode *Dst) { auto InsertPtOrError = createInstructionRenderer(M.actions_end(), M, Dst); if (auto Error = InsertPtOrError.takeError()) return std::move(Error); @@ -3737,6 +4081,17 @@ Expected GlobalISelEmitter::createAndImportInstructionRenderer( action_iterator InsertPt = InsertPtOrError.get(); BuildMIAction &DstMIBuilder = *static_cast(InsertPt->get()); + for (auto PhysInput : InsnMatcher.getPhysRegInputs()) { + InsertPt = M.insertAction( + InsertPt, M.allocateOutputInsnID(), + &Target.getInstruction(RK.getDef("COPY"))); + BuildMIAction &CopyToPhysRegMIBuilder = + *static_cast(InsertPt->get()); + CopyToPhysRegMIBuilder.addRenderer(PhysInput.first, + true); + CopyToPhysRegMIBuilder.addRenderer(PhysInput.first); + } + importExplicitDefRenderers(DstMIBuilder); if (auto Error = importExplicitUseRenderers(InsertPt, M, DstMIBuilder, Dst) @@ -3768,6 +4123,78 @@ GlobalISelEmitter::createAndImportSubInstructionRenderer( if (auto Error = InsertPtOrError.takeError()) return std::move(Error); + // We need to make sure that when we import an INSERT_SUBREG as a + // subinstruction that it ends up being constrained to the correct super + // register and subregister classes. + auto OpName = Target.getInstruction(Dst->getOperator()).TheDef->getName(); + if (OpName == "INSERT_SUBREG") { + auto SubClass = inferRegClassFromPattern(Dst->getChild(1)); + if (!SubClass) + return failedImport( + "Cannot infer register class from INSERT_SUBREG operand #1"); + Optional SuperClass = + inferSuperRegisterClassForNode(Dst->getExtType(0), Dst->getChild(0), + Dst->getChild(2)); + if (!SuperClass) + return failedImport( + "Cannot infer register class for INSERT_SUBREG operand #0"); + // The destination and the super register source of an INSERT_SUBREG must + // be the same register class. + M.insertAction( + InsertPt, DstMIBuilder.getInsnID(), 0, **SuperClass); + M.insertAction( + InsertPt, DstMIBuilder.getInsnID(), 1, **SuperClass); + M.insertAction( + InsertPt, DstMIBuilder.getInsnID(), 2, **SubClass); + return InsertPtOrError.get(); + } + + if (OpName == "EXTRACT_SUBREG") { + // EXTRACT_SUBREG selects into a subregister COPY but unlike most + // instructions, the result register class is controlled by the + // subregisters of the operand. As a result, we must constrain the result + // class rather than check that it's already the right one. 
+ auto SuperClass = inferRegClassFromPattern(Dst->getChild(0)); + if (!SuperClass) + return failedImport( + "Cannot infer register class from EXTRACT_SUBREG operand #0"); + + auto SubIdx = inferSubRegIndexForNode(Dst->getChild(1)); + if (!SubIdx) + return failedImport("EXTRACT_SUBREG child #1 is not a subreg index"); + + const auto &SrcRCDstRCPair = + (*SuperClass)->getMatchingSubClassWithSubRegs(CGRegs, *SubIdx); + assert(SrcRCDstRCPair->second && "Couldn't find a matching subclass"); + M.insertAction( + InsertPt, DstMIBuilder.getInsnID(), 0, *SrcRCDstRCPair->second); + M.insertAction( + InsertPt, DstMIBuilder.getInsnID(), 1, *SrcRCDstRCPair->first); + + // We're done with this pattern! It's eligible for GISel emission; return + // it. + return InsertPtOrError.get(); + } + + // Similar to INSERT_SUBREG, we also have to handle SUBREG_TO_REG as a + // subinstruction. + if (OpName == "SUBREG_TO_REG") { + auto SubClass = inferRegClassFromPattern(Dst->getChild(1)); + if (!SubClass) + return failedImport( + "Cannot infer register class from SUBREG_TO_REG child #1"); + auto SuperClass = inferSuperRegisterClass(Dst->getExtType(0), + Dst->getChild(2)); + if (!SuperClass) + return failedImport( + "Cannot infer register class for SUBREG_TO_REG operand #0"); + M.insertAction( + InsertPt, DstMIBuilder.getInsnID(), 0, **SuperClass); + M.insertAction( + InsertPt, DstMIBuilder.getInsnID(), 2, **SubClass); + return InsertPtOrError.get(); + } + M.insertAction(InsertPt, DstMIBuilder.getInsnID()); return InsertPtOrError.get(); @@ -3786,12 +4213,9 @@ Expected GlobalISelEmitter::createInstructionRenderer( // COPY_TO_REGCLASS is just a copy with a ConstrainOperandToRegClassAction // attached. Similarly for EXTRACT_SUBREG except that's a subregister copy. - if (DstI->TheDef->getName() == "COPY_TO_REGCLASS") - DstI = &Target.getInstruction(RK.getDef("COPY")); - else if (DstI->TheDef->getName() == "EXTRACT_SUBREG") + StringRef Name = DstI->TheDef->getName(); + if (Name == "COPY_TO_REGCLASS" || Name == "EXTRACT_SUBREG") DstI = &Target.getInstruction(RK.getDef("COPY")); - else if (DstI->TheDef->getName() == "REG_SEQUENCE") - return failedImport("Unable to emit REG_SEQUENCE"); return M.insertAction(InsertPt, M.allocateOutputInsnID(), DstI); @@ -3812,8 +4236,11 @@ Expected GlobalISelEmitter::importExplicitUseRenderers( const CodeGenInstruction *DstI = DstMIBuilder.getCGI(); CodeGenInstruction *OrigDstI = &Target.getInstruction(Dst->getOperator()); + StringRef Name = OrigDstI->TheDef->getName(); + unsigned ExpectedDstINumUses = Dst->getNumChildren(); + // EXTRACT_SUBREG needs to use a subregister COPY. 
- if (OrigDstI->TheDef->getName() == "EXTRACT_SUBREG") { + if (Name == "EXTRACT_SUBREG") { if (!Dst->getChild(0)->isLeaf()) return failedImport("EXTRACT_SUBREG child #1 is not a leaf"); @@ -3843,10 +4270,41 @@ Expected GlobalISelEmitter::importExplicitUseRenderers( return failedImport("EXTRACT_SUBREG child #1 is not a subreg index"); } + if (Name == "REG_SEQUENCE") { + if (!Dst->getChild(0)->isLeaf()) + return failedImport("REG_SEQUENCE child #0 is not a leaf"); + + Record *RCDef = getInitValueAsRegClass(Dst->getChild(0)->getLeafValue()); + if (!RCDef) + return failedImport("REG_SEQUENCE child #0 could not " + "be coerced to a register class"); + + if ((ExpectedDstINumUses - 1) % 2 != 0) + return failedImport("Malformed REG_SEQUENCE"); + + for (unsigned I = 1; I != ExpectedDstINumUses; I += 2) { + TreePatternNode *ValChild = Dst->getChild(I); + TreePatternNode *SubRegChild = Dst->getChild(I + 1); + + if (DefInit *SubRegInit = + dyn_cast(SubRegChild->getLeafValue())) { + CodeGenSubRegIndex *SubIdx = CGRegs.getSubRegIdx(SubRegInit->getDef()); + + auto InsertPtOrError = + importExplicitUseRenderer(InsertPt, M, DstMIBuilder, ValChild); + if (auto Error = InsertPtOrError.takeError()) + return std::move(Error); + InsertPt = InsertPtOrError.get(); + DstMIBuilder.addRenderer(SubIdx); + } + } + + return InsertPt; + } + // Render the explicit uses. unsigned DstINumUses = OrigDstI->Operands.size() - OrigDstI->Operands.NumDefs; - unsigned ExpectedDstINumUses = Dst->getNumChildren(); - if (OrigDstI->TheDef->getName() == "COPY_TO_REGCLASS") { + if (Name == "COPY_TO_REGCLASS") { DstINumUses--; // Ignore the class constraint. ExpectedDstINumUses--; } @@ -3945,6 +4403,126 @@ Error GlobalISelEmitter::importImplicitDefRenderers( return Error::success(); } +Optional +GlobalISelEmitter::getRegClassFromLeaf(TreePatternNode *Leaf) { + assert(Leaf && "Expected node?"); + assert(Leaf->isLeaf() && "Expected leaf?"); + Record *RCRec = getInitValueAsRegClass(Leaf->getLeafValue()); + if (!RCRec) + return None; + CodeGenRegisterClass *RC = CGRegs.getRegClass(RCRec); + if (!RC) + return None; + return RC; +} + +Optional +GlobalISelEmitter::inferRegClassFromPattern(TreePatternNode *N) { + if (!N) + return None; + + if (N->isLeaf()) + return getRegClassFromLeaf(N); + + // We don't have a leaf node, so we have to try and infer something. Check + // that we have an instruction that we an infer something from. + + // Only handle things that produce a single type. + if (N->getNumTypes() != 1) + return None; + Record *OpRec = N->getOperator(); + + // We only want instructions. + if (!OpRec->isSubClassOf("Instruction")) + return None; + + // Don't want to try and infer things when there could potentially be more + // than one candidate register class. + auto &Inst = Target.getInstruction(OpRec); + if (Inst.Operands.NumDefs > 1) + return None; + + // Handle any special-case instructions which we can safely infer register + // classes from. + StringRef InstName = Inst.TheDef->getName(); + bool IsRegSequence = InstName == "REG_SEQUENCE"; + if (IsRegSequence || InstName == "COPY_TO_REGCLASS") { + // If we have a COPY_TO_REGCLASS, then we need to handle it specially. It + // has the desired register class as the first child. + TreePatternNode *RCChild = N->getChild(IsRegSequence ? 0 : 1); + if (!RCChild->isLeaf()) + return None; + return getRegClassFromLeaf(RCChild); + } + + // Handle destination record types that we can safely infer a register class + // from. 
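// ---[ Editorial aside: examples, not part of this patch ]--------------------
// Destination operators the special cases above can infer a class from
// (register-class and subregister-index names are invented, target-style):
//
//   (COPY_TO_REGCLASS GPR32:$val, FPR32)                        // child #1
//   (REG_SEQUENCE DPair, GPR32:$lo, sub_lo, GPR32:$hi, sub_hi)  // child #0
//
// Anything else is resolved from the instruction's first definition operand,
// as handled just below.
// ----------------------------------------------------------------------------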
+ const auto &DstIOperand = Inst.Operands[0]; + Record *DstIOpRec = DstIOperand.Rec; + if (DstIOpRec->isSubClassOf("RegisterOperand")) { + DstIOpRec = DstIOpRec->getValueAsDef("RegClass"); + const CodeGenRegisterClass &RC = Target.getRegisterClass(DstIOpRec); + return &RC; + } + + if (DstIOpRec->isSubClassOf("RegisterClass")) { + const CodeGenRegisterClass &RC = Target.getRegisterClass(DstIOpRec); + return &RC; + } + + return None; +} + +Optional +GlobalISelEmitter::inferSuperRegisterClass(const TypeSetByHwMode &Ty, + TreePatternNode *SubRegIdxNode) { + assert(SubRegIdxNode && "Expected subregister index node!"); + // We need a ValueTypeByHwMode for getSuperRegForSubReg. + if (!Ty.isValueTypeByHwMode(false)) + return None; + if (!SubRegIdxNode->isLeaf()) + return None; + DefInit *SubRegInit = dyn_cast(SubRegIdxNode->getLeafValue()); + if (!SubRegInit) + return None; + CodeGenSubRegIndex *SubIdx = CGRegs.getSubRegIdx(SubRegInit->getDef()); + + // Use the information we found above to find a minimal register class which + // supports the subregister and type we want. + auto RC = + Target.getSuperRegForSubReg(Ty.getValueTypeByHwMode(), CGRegs, SubIdx); + if (!RC) + return None; + return *RC; +} + +Optional +GlobalISelEmitter::inferSuperRegisterClassForNode( + const TypeSetByHwMode &Ty, TreePatternNode *SuperRegNode, + TreePatternNode *SubRegIdxNode) { + assert(SuperRegNode && "Expected super register node!"); + // Check if we already have a defined register class for the super register + // node. If we do, then we should preserve that rather than inferring anything + // from the subregister index node. We can assume that whoever wrote the + // pattern in the first place made sure that the super register and + // subregister are compatible. + if (Optional SuperRegisterClass = + inferRegClassFromPattern(SuperRegNode)) + return *SuperRegisterClass; + return inferSuperRegisterClass(Ty, SubRegIdxNode); +} + +Optional +GlobalISelEmitter::inferSubRegIndexForNode(TreePatternNode *SubRegIdxNode) { + if (!SubRegIdxNode->isLeaf()) + return None; + + DefInit *SubRegInit = dyn_cast(SubRegIdxNode->getLeafValue()); + if (!SubRegInit) + return None; + return CGRegs.getSubRegIdx(SubRegInit->getDef()); +} + Expected GlobalISelEmitter::runOnPattern(const PatternToMatch &P) { // Keep track of the matchers and actions to emit. 
int Score = P.getPatternComplexity(CGP); @@ -4035,6 +4613,8 @@ Expected GlobalISelEmitter::runOnPattern(const PatternToMatch &P) { return failedImport("Pattern operator isn't an instruction"); auto &DstI = Target.getInstruction(DstOp); + StringRef DstIName = DstI.TheDef->getName(); + if (DstI.Operands.NumDefs != Src->getExtTypes().size()) return failedImport("Src pattern results and dst MI defs are different (" + to_string(Src->getExtTypes().size()) + " def(s) vs " + @@ -4048,13 +4628,17 @@ Expected GlobalISelEmitter::runOnPattern(const PatternToMatch &P) { const auto &DstIOperand = DstI.Operands[OpIdx]; Record *DstIOpRec = DstIOperand.Rec; - if (DstI.TheDef->getName() == "COPY_TO_REGCLASS") { + if (DstIName == "COPY_TO_REGCLASS") { DstIOpRec = getInitValueAsRegClass(Dst->getChild(1)->getLeafValue()); if (DstIOpRec == nullptr) return failedImport( "COPY_TO_REGCLASS operand #1 isn't a register class"); - } else if (DstI.TheDef->getName() == "EXTRACT_SUBREG") { + } else if (DstIName == "REG_SEQUENCE") { + DstIOpRec = getInitValueAsRegClass(Dst->getChild(0)->getLeafValue()); + if (DstIOpRec == nullptr) + return failedImport("REG_SEQUENCE operand #0 isn't a register class"); + } else if (DstIName == "EXTRACT_SUBREG") { if (!Dst->getChild(0)->isLeaf()) return failedImport("EXTRACT_SUBREG operand #0 isn't a leaf"); @@ -4063,8 +4647,33 @@ Expected GlobalISelEmitter::runOnPattern(const PatternToMatch &P) { DstIOpRec = getInitValueAsRegClass(Dst->getChild(0)->getLeafValue()); if (DstIOpRec == nullptr) + return failedImport("EXTRACT_SUBREG operand #0 isn't a register class"); + } else if (DstIName == "INSERT_SUBREG") { + auto MaybeSuperClass = inferSuperRegisterClassForNode( + VTy, Dst->getChild(0), Dst->getChild(2)); + if (!MaybeSuperClass) return failedImport( - "EXTRACT_SUBREG operand #0 isn't a register class"); + "Cannot infer register class for INSERT_SUBREG operand #0"); + // Move to the next pattern here, because the register class we found + // doesn't necessarily have a record associated with it. So, we can't + // set DstIOpRec using this. + OperandMatcher &OM = InsnMatcher.getOperand(OpIdx); + OM.setSymbolicName(DstIOperand.Name); + M.defineOperand(OM.getSymbolicName(), OM); + OM.addPredicate(**MaybeSuperClass); + ++OpIdx; + continue; + } else if (DstIName == "SUBREG_TO_REG") { + auto MaybeRegClass = inferSuperRegisterClass(VTy, Dst->getChild(2)); + if (!MaybeRegClass) + return failedImport( + "Cannot infer register class for SUBREG_TO_REG operand #0"); + OperandMatcher &OM = InsnMatcher.getOperand(OpIdx); + OM.setSymbolicName(DstIOperand.Name); + M.defineOperand(OM.getSymbolicName(), OM); + OM.addPredicate(**MaybeRegClass); + ++OpIdx; + continue; } else if (DstIOpRec->isSubClassOf("RegisterOperand")) DstIOpRec = DstIOpRec->getValueAsDef("RegClass"); else if (!DstIOpRec->isSubClassOf("RegisterClass")) @@ -4079,7 +4688,8 @@ Expected GlobalISelEmitter::runOnPattern(const PatternToMatch &P) { ++OpIdx; } - auto DstMIBuilderOrError = createAndImportInstructionRenderer(M, Dst); + auto DstMIBuilderOrError = + createAndImportInstructionRenderer(M, InsnMatcher, Src, Dst); if (auto Error = DstMIBuilderOrError.takeError()) return std::move(Error); BuildMIAction &DstMIBuilder = DstMIBuilderOrError.get(); @@ -4093,7 +4703,7 @@ Expected GlobalISelEmitter::runOnPattern(const PatternToMatch &P) { // Constrain the registers to classes. This is normally derived from the // emitted instruction but a few instructions require special handling. 
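// ---[ Editorial aside: illustrative sketch, not part of this patch ]---------
// Each constrain-to-register-class action added below amounts, at selection
// time, to pinning the virtual register behind the given operand to the
// chosen class, roughly (NewMI, OpIdx and RC are placeholders):
//
//   Register R = NewMI.getOperand(OpIdx).getReg();
//   MRI.constrainRegClass(R, &RC);
//
// The generated table expresses this with a GIR_* opcode whose interpreter is
// in InstructionSelectorImpl.h, outside this patch.
// ----------------------------------------------------------------------------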
- if (DstI.TheDef->getName() == "COPY_TO_REGCLASS") { + if (DstIName == "COPY_TO_REGCLASS") { // COPY_TO_REGCLASS does not provide operand constraints itself but the // result is constrained to the class given by the second child. Record *DstIOpRec = @@ -4111,28 +4721,16 @@ Expected GlobalISelEmitter::runOnPattern(const PatternToMatch &P) { return std::move(M); } - if (DstI.TheDef->getName() == "EXTRACT_SUBREG") { - // EXTRACT_SUBREG selects into a subregister COPY but unlike most - // instructions, the result register class is controlled by the - // subregisters of the operand. As a result, we must constrain the result - // class rather than check that it's already the right one. - if (!Dst->getChild(0)->isLeaf()) - return failedImport("EXTRACT_SUBREG child #1 is not a leaf"); + if (DstIName == "EXTRACT_SUBREG") { + auto SuperClass = inferRegClassFromPattern(Dst->getChild(0)); + if (!SuperClass) + return failedImport( + "Cannot infer register class from EXTRACT_SUBREG operand #0"); - DefInit *SubRegInit = dyn_cast(Dst->getChild(1)->getLeafValue()); - if (!SubRegInit) + auto SubIdx = inferSubRegIndexForNode(Dst->getChild(1)); + if (!SubIdx) return failedImport("EXTRACT_SUBREG child #1 is not a subreg index"); - // Constrain the result to the same register bank as the operand. - Record *DstIOpRec = - getInitValueAsRegClass(Dst->getChild(0)->getLeafValue()); - - if (DstIOpRec == nullptr) - return failedImport("EXTRACT_SUBREG operand #1 isn't a register class"); - - CodeGenSubRegIndex *SubIdx = CGRegs.getSubRegIdx(SubRegInit->getDef()); - CodeGenRegisterClass *SrcRC = CGRegs.getRegClass(DstIOpRec); - // It would be nice to leave this constraint implicit but we're required // to pick a register class so constrain the result to a register class // that can hold the correct MVT. @@ -4143,7 +4741,7 @@ Expected GlobalISelEmitter::runOnPattern(const PatternToMatch &P) { "Expected Src of EXTRACT_SUBREG to have one result type"); const auto &SrcRCDstRCPair = - SrcRC->getMatchingSubClassWithSubRegs(CGRegs, SubIdx); + (*SuperClass)->getMatchingSubClassWithSubRegs(CGRegs, *SubIdx); assert(SrcRCDstRCPair->second && "Couldn't find a matching subclass"); M.addAction(0, 0, *SrcRCDstRCPair->second); M.addAction(0, 1, *SrcRCDstRCPair->first); @@ -4154,6 +4752,51 @@ Expected GlobalISelEmitter::runOnPattern(const PatternToMatch &P) { return std::move(M); } + if (DstIName == "INSERT_SUBREG") { + assert(Src->getExtTypes().size() == 1 && + "Expected Src of INSERT_SUBREG to have one result type"); + // We need to constrain the destination, a super regsister source, and a + // subregister source. + auto SubClass = inferRegClassFromPattern(Dst->getChild(1)); + if (!SubClass) + return failedImport( + "Cannot infer register class from INSERT_SUBREG operand #1"); + auto SuperClass = inferSuperRegisterClassForNode( + Src->getExtType(0), Dst->getChild(0), Dst->getChild(2)); + if (!SuperClass) + return failedImport( + "Cannot infer register class for INSERT_SUBREG operand #0"); + M.addAction(0, 0, **SuperClass); + M.addAction(0, 1, **SuperClass); + M.addAction(0, 2, **SubClass); + ++NumPatternImported; + return std::move(M); + } + + if (DstIName == "SUBREG_TO_REG") { + // We need to constrain the destination and subregister source. + assert(Src->getExtTypes().size() == 1 && + "Expected Src of SUBREG_TO_REG to have one result type"); + + // Attempt to infer the subregister source from the first child. If it has + // an explicitly given register class, we'll use that. Otherwise, we will + // fail. 
+ auto SubClass = inferRegClassFromPattern(Dst->getChild(1)); + if (!SubClass) + return failedImport( + "Cannot infer register class from SUBREG_TO_REG child #1"); + // We don't have a child to look at that might have a super register node. + auto SuperClass = + inferSuperRegisterClass(Src->getExtType(0), Dst->getChild(2)); + if (!SuperClass) + return failedImport( + "Cannot infer register class for SUBREG_TO_REG operand #0"); + M.addAction(0, 0, **SuperClass); + M.addAction(0, 2, **SubClass); + ++NumPatternImported; + return std::move(M); + } + M.addAction(0); // We're done with this pattern! It's eligible for GISel emission; return it. @@ -4235,7 +4878,7 @@ std::vector GlobalISelEmitter::optimizeRules( std::vector> &MatcherStorage) { std::vector OptRules; - std::unique_ptr CurrentGroup = make_unique(); + std::unique_ptr CurrentGroup = std::make_unique(); assert(CurrentGroup->empty() && "Newly created group isn't empty!"); unsigned NumGroups = 0; @@ -4256,7 +4899,7 @@ std::vector GlobalISelEmitter::optimizeRules( MatcherStorage.emplace_back(std::move(CurrentGroup)); ++NumGroups; } - CurrentGroup = make_unique(); + CurrentGroup = std::make_unique(); }; for (Matcher *Rule : Rules) { // Greedily add as many matchers as possible to the current group: diff --git a/utils/TableGen/InfoByHwMode.cpp b/utils/TableGen/InfoByHwMode.cpp index d9662889a5db..7cd1b0f08132 100644 --- a/utils/TableGen/InfoByHwMode.cpp +++ b/utils/TableGen/InfoByHwMode.cpp @@ -192,6 +192,17 @@ void RegSizeInfoByHwMode::writeToStream(raw_ostream &OS) const { OS << '}'; } +EncodingInfoByHwMode::EncodingInfoByHwMode(Record *R, const CodeGenHwModes &CGH) { + const HwModeSelect &MS = CGH.getHwModeSelect(R); + for (const HwModeSelect::PairType &P : MS.Items) { + assert(P.second && P.second->isSubClassOf("InstructionEncoding") && + "Encoding must subclass InstructionEncoding"); + auto I = Map.insert({P.first, P.second}); + assert(I.second && "Duplicate entry?"); + (void)I; + } +} + namespace llvm { raw_ostream &operator<<(raw_ostream &OS, const ValueTypeByHwMode &T) { T.writeToStream(OS); diff --git a/utils/TableGen/InfoByHwMode.h b/utils/TableGen/InfoByHwMode.h index 9e5cc3d5f2a4..d92e5901a7f3 100644 --- a/utils/TableGen/InfoByHwMode.h +++ b/utils/TableGen/InfoByHwMode.h @@ -184,6 +184,11 @@ raw_ostream &operator<<(raw_ostream &OS, const ValueTypeByHwMode &T); raw_ostream &operator<<(raw_ostream &OS, const RegSizeInfo &T); raw_ostream &operator<<(raw_ostream &OS, const RegSizeInfoByHwMode &T); +struct EncodingInfoByHwMode : public InfoByHwMode { + EncodingInfoByHwMode(Record *R, const CodeGenHwModes &CGH); + EncodingInfoByHwMode() = default; +}; + } // namespace llvm #endif // LLVM_UTILS_TABLEGEN_INFOBYHWMODE_H diff --git a/utils/TableGen/InstrDocsEmitter.cpp b/utils/TableGen/InstrDocsEmitter.cpp index 91c457ba08fd..45fa936b9574 100644 --- a/utils/TableGen/InstrDocsEmitter.cpp +++ b/utils/TableGen/InstrDocsEmitter.cpp @@ -231,4 +231,4 @@ void EmitInstrDocs(RecordKeeper &RK, raw_ostream &OS) { } } -} // end llvm namespace +} // end namespace llvm diff --git a/utils/TableGen/InstrInfoEmitter.cpp b/utils/TableGen/InstrInfoEmitter.cpp index 2d367f538b71..300ba36a7007 100644 --- a/utils/TableGen/InstrInfoEmitter.cpp +++ b/utils/TableGen/InstrInfoEmitter.cpp @@ -332,6 +332,10 @@ void InstrInfoEmitter::emitOperandTypeMappings( StringRef Namespace = Target.getInstNamespace(); std::vector Operands = Records.getAllDerivedDefinitions("Operand"); + std::vector RegisterOperands = + Records.getAllDerivedDefinitions("RegisterOperand"); + 
std::vector RegisterClasses = + Records.getAllDerivedDefinitions("RegisterClass"); OS << "#ifdef GET_INSTRINFO_OPERAND_TYPES_ENUM\n"; OS << "#undef GET_INSTRINFO_OPERAND_TYPES_ENUM\n"; @@ -341,10 +345,13 @@ void InstrInfoEmitter::emitOperandTypeMappings( OS << "enum OperandType {\n"; unsigned EnumVal = 0; - for (const Record *Op : Operands) { - if (!Op->isAnonymous()) - OS << " " << Op->getName() << " = " << EnumVal << ",\n"; - ++EnumVal; + for (const std::vector *RecordsToAdd : + {&Operands, &RegisterOperands, &RegisterClasses}) { + for (const Record *Op : *RecordsToAdd) { + if (!Op->isAnonymous()) + OS << " " << Op->getName() << " = " << EnumVal << ",\n"; + ++EnumVal; + } } OS << " OPERAND_TYPE_LIST_END" << "\n};\n"; @@ -358,7 +365,8 @@ void InstrInfoEmitter::emitOperandTypeMappings( OS << "namespace llvm {\n"; OS << "namespace " << Namespace << " {\n"; OS << "LLVM_READONLY\n"; - OS << "int getOperandType(uint16_t Opcode, uint16_t OpIdx) {\n"; + OS << "static int getOperandType(uint16_t Opcode, uint16_t OpIdx) {\n"; + // TODO: Factor out instructions with same operands to compress the tables. if (!NumberedInstructions.empty()) { std::vector OperandOffsets; std::vector OperandRecords; @@ -399,7 +407,10 @@ void InstrInfoEmitter::emitOperandTypeMappings( OS << "/**/\n "; } Record *OpR = OperandRecords[I]; - if (OpR->isSubClassOf("Operand") && !OpR->isAnonymous()) + if ((OpR->isSubClassOf("Operand") || + OpR->isSubClassOf("RegisterOperand") || + OpR->isSubClassOf("RegisterClass")) && + !OpR->isAnonymous()) OS << "OpTypes::" << OpR->getName(); else OS << -1; @@ -414,7 +425,7 @@ void InstrInfoEmitter::emitOperandTypeMappings( OS << "}\n"; OS << "} // end namespace " << Namespace << "\n"; OS << "} // end namespace llvm\n"; - OS << "#endif //GET_INSTRINFO_OPERAND_TYPE\n\n"; + OS << "#endif // GET_INSTRINFO_OPERAND_TYPE\n\n"; } void InstrInfoEmitter::emitMCIIHelperMethods(raw_ostream &OS, @@ -436,8 +447,8 @@ void InstrInfoEmitter::emitMCIIHelperMethods(raw_ostream &OS, << "(const MCInst &MI);\n"; } - OS << "\n} // end " << TargetName << "_MC namespace\n"; - OS << "} // end llvm namespace\n\n"; + OS << "\n} // end namespace " << TargetName << "_MC\n"; + OS << "} // end namespace llvm\n\n"; OS << "#endif // GET_INSTRINFO_MC_HELPER_DECLS\n\n"; @@ -459,8 +470,8 @@ void InstrInfoEmitter::emitMCIIHelperMethods(raw_ostream &OS, OS << "\n}\n\n"; } - OS << "} // end " << TargetName << "_MC namespace\n"; - OS << "} // end llvm namespace\n\n"; + OS << "} // end namespace " << TargetName << "_MC\n"; + OS << "} // end namespace llvm\n\n"; OS << "#endif // GET_GENISTRINFO_MC_HELPERS\n"; } @@ -576,7 +587,7 @@ void InstrInfoEmitter::run(raw_ostream &OS) { << TargetName << "InstrNameIndices, " << TargetName << "InstrNameData, " << NumberedInstructions.size() << ");\n}\n\n"; - OS << "} // end llvm namespace\n"; + OS << "} // end namespace llvm\n"; OS << "#endif // GET_INSTRINFO_MC_DESC\n\n"; @@ -592,7 +603,7 @@ void InstrInfoEmitter::run(raw_ostream &OS) { << " ~" << ClassName << "() override = default;\n"; - OS << "\n};\n} // end llvm namespace\n"; + OS << "\n};\n} // end namespace llvm\n"; OS << "#endif // GET_INSTRINFO_HEADER\n\n"; @@ -620,7 +631,7 @@ void InstrInfoEmitter::run(raw_ostream &OS) { << " InitMCInstrInfo(" << TargetName << "Insts, " << TargetName << "InstrNameIndices, " << TargetName << "InstrNameData, " << NumberedInstructions.size() << ");\n}\n"; - OS << "} // end llvm namespace\n"; + OS << "} // end namespace llvm\n"; OS << "#endif // GET_INSTRINFO_CTOR_DTOR\n\n"; @@ -651,6 +662,7 @@ void 
InstrInfoEmitter::emitRecord(const CodeGenInstruction &Inst, unsigned Num, CodeGenTarget &Target = CDP.getTargetInfo(); // Emit all of the target independent flags... + if (Inst.isPreISelOpcode) OS << "|(1ULL<TheDef->getName() << "\t= " << Num++ << ",\n"; OS << " INSTRUCTION_LIST_END = " << Num << "\n"; OS << " };\n\n"; - OS << "} // end " << Namespace << " namespace\n"; - OS << "} // end llvm namespace\n"; + OS << "} // end namespace " << Namespace << "\n"; + OS << "} // end namespace llvm\n"; OS << "#endif // GET_INSTRINFO_ENUM\n\n"; OS << "#ifdef GET_INSTRINFO_SCHED_ENUM\n"; @@ -780,9 +792,9 @@ void InstrInfoEmitter::emitEnums(raw_ostream &OS) { OS << " " << Class.Name << "\t= " << Num++ << ",\n"; OS << " SCHED_LIST_END = " << Num << "\n"; OS << " };\n"; - OS << "} // end Sched namespace\n"; - OS << "} // end " << Namespace << " namespace\n"; - OS << "} // end llvm namespace\n"; + OS << "} // end namespace Sched\n"; + OS << "} // end namespace " << Namespace << "\n"; + OS << "} // end namespace llvm\n"; OS << "#endif // GET_INSTRINFO_SCHED_ENUM\n\n"; } @@ -794,4 +806,4 @@ void EmitInstrInfo(RecordKeeper &RK, raw_ostream &OS) { EmitMapTable(RK, OS); } -} // end llvm namespace +} // end namespace llvm diff --git a/utils/TableGen/IntrinsicEmitter.cpp b/utils/TableGen/IntrinsicEmitter.cpp index 979af98f6768..e01f91c20456 100644 --- a/utils/TableGen/IntrinsicEmitter.cpp +++ b/utils/TableGen/IntrinsicEmitter.cpp @@ -220,7 +220,11 @@ enum IIT_Info { IIT_STRUCT7 = 39, IIT_STRUCT8 = 40, IIT_F128 = 41, - IIT_VEC_ELEMENT = 42 + IIT_VEC_ELEMENT = 42, + IIT_SCALABLE_VEC = 43, + IIT_SUBDIVIDE2_ARG = 44, + IIT_SUBDIVIDE4_ARG = 45, + IIT_VEC_OF_BITCASTS_TO_INT = 46 }; static void EncodeFixedValueType(MVT::SimpleValueType VT, @@ -292,6 +296,12 @@ static void EncodeFixedType(Record *R, std::vector &ArgCodes, Sig.push_back(IIT_PTR_TO_ELT); else if (R->isSubClassOf("LLVMVectorElementType")) Sig.push_back(IIT_VEC_ELEMENT); + else if (R->isSubClassOf("LLVMSubdivide2VectorType")) + Sig.push_back(IIT_SUBDIVIDE2_ARG); + else if (R->isSubClassOf("LLVMSubdivide4VectorType")) + Sig.push_back(IIT_SUBDIVIDE4_ARG); + else if (R->isSubClassOf("LLVMVectorOfBitcastsToInt")) + Sig.push_back(IIT_VEC_OF_BITCASTS_TO_INT); else Sig.push_back(IIT_ARG); return Sig.push_back((Number << 3) | 7 /*IITDescriptor::AK_MatchType*/); @@ -339,6 +349,8 @@ static void EncodeFixedType(Record *R, std::vector &ArgCodes, if (MVT(VT).isVector()) { MVT VVT = VT; + if (VVT.isScalableVector()) + Sig.push_back(IIT_SCALABLE_VEC); switch (VVT.getVectorNumElements()) { default: PrintFatalError("unhandled vector type width in intrinsic!"); case 1: Sig.push_back(IIT_V1); break; @@ -647,6 +659,12 @@ void IntrinsicEmitter::EmitAttributes(const CodeGenIntrinsicTable &Ints, OS << "Attribute::NoCapture"; addComma = true; break; + case CodeGenIntrinsic::NoAlias: + if (addComma) + OS << ","; + OS << "Attribute::NoAlias"; + addComma = true; + break; case CodeGenIntrinsic::Returned: if (addComma) OS << ","; diff --git a/utils/TableGen/RISCVCompressInstEmitter.cpp b/utils/TableGen/RISCVCompressInstEmitter.cpp index e62f528ebc2e..2f1d3898f182 100644 --- a/utils/TableGen/RISCVCompressInstEmitter.cpp +++ b/utils/TableGen/RISCVCompressInstEmitter.cpp @@ -411,12 +411,8 @@ void RISCVCompressInstEmitter::evaluateCompressPat(Record *Rec) { assert(SourceDag && "Missing 'Input' in compress pattern!"); LLVM_DEBUG(dbgs() << "Input: " << *SourceDag << "\n"); - DefInit *OpDef = dyn_cast(SourceDag->getOperator()); - if (!OpDef) - PrintFatalError(Rec->getLoc(), - Rec->getName() 
+ " has unexpected operator type!"); // Checking we are transforming from compressed to uncompressed instructions. - Record *Operator = OpDef->getDef(); + Record *Operator = SourceDag->getOperatorAsDef(Rec->getLoc()); if (!Operator->isSubClassOf("RVInst")) PrintFatalError(Rec->getLoc(), "Input instruction '" + Operator->getName() + "' is not a 32 bit wide instruction!"); @@ -428,12 +424,7 @@ void RISCVCompressInstEmitter::evaluateCompressPat(Record *Rec) { assert(DestDag && "Missing 'Output' in compress pattern!"); LLVM_DEBUG(dbgs() << "Output: " << *DestDag << "\n"); - DefInit *DestOpDef = dyn_cast(DestDag->getOperator()); - if (!DestOpDef) - PrintFatalError(Rec->getLoc(), - Rec->getName() + " has unexpected operator type!"); - - Record *DestOperator = DestOpDef->getDef(); + Record *DestOperator = DestDag->getOperatorAsDef(Rec->getLoc()); if (!DestOperator->isSubClassOf("RVInst16")) PrintFatalError(Rec->getLoc(), "Output instruction '" + DestOperator->getName() + diff --git a/utils/TableGen/RegisterInfoEmitter.cpp b/utils/TableGen/RegisterInfoEmitter.cpp index 1b619072c814..513cd14e0fab 100644 --- a/utils/TableGen/RegisterInfoEmitter.cpp +++ b/utils/TableGen/RegisterInfoEmitter.cpp @@ -888,7 +888,7 @@ RegisterInfoEmitter::runMCDesc(raw_ostream &OS, CodeGenTarget &Target, // Keep track of sub-register names as well. These are not differentially // encoded. typedef SmallVector SubRegIdxVec; - SequenceToOffsetTable> SubRegIdxSeqs; + SequenceToOffsetTable>> SubRegIdxSeqs; SmallVector SubRegIdxLists(Regs.size()); SequenceToOffsetTable RegStrings; @@ -1315,7 +1315,7 @@ RegisterInfoEmitter::runTargetDesc(raw_ostream &OS, CodeGenTarget &Target, // Compress the sub-reg index lists. typedef std::vector IdxList; SmallVector SuperRegIdxLists(RegisterClasses.size()); - SequenceToOffsetTable> SuperRegIdxSeqs; + SequenceToOffsetTable>> SuperRegIdxSeqs; BitVector MaskBV(RegisterClasses.size()); for (const auto &RC : RegisterClasses) { diff --git a/utils/TableGen/SearchableTableEmitter.cpp b/utils/TableGen/SearchableTableEmitter.cpp index 954b63e7253c..f08f8aa01956 100644 --- a/utils/TableGen/SearchableTableEmitter.cpp +++ b/utils/TableGen/SearchableTableEmitter.cpp @@ -134,7 +134,7 @@ private: CodeGenIntrinsic &getIntrinsic(Init *I) { std::unique_ptr &Intr = Intrinsics[I]; if (!Intr) - Intr = make_unique(cast(I)->getDef()); + Intr = std::make_unique(cast(I)->getDef()); return *Intr; } @@ -496,7 +496,7 @@ void SearchableTableEmitter::emitGenericTable(const GenericTable &Table, emitIfdef((Twine("GET_") + Table.PreprocessorGuard + "_IMPL").str(), OS); // The primary data table contains all the fields defined for this map. 
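// ---[ Editorial aside: shape of the emitted output, not part of this patch ]-
// With the change below the primary table is emitted as a constexpr array,
// e.g. (type name and entries invented):
//
//   constexpr MySysReg MySysRegsList[] = {
//     { "REG_A", 0x001 }, // 0
//     { "REG_B", 0x002 }, // 1
//   };
//
// Compared with const, constexpr additionally lets the table be referenced
// from constant expressions in the including translation unit.
// ----------------------------------------------------------------------------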
- OS << "const " << Table.CppTypeName << " " << Table.Name << "[] = {\n"; + OS << "constexpr " << Table.CppTypeName << " " << Table.Name << "[] = {\n"; for (unsigned i = 0; i < Table.Entries.size(); ++i) { Record *Entry = Table.Entries[i]; OS << " { "; @@ -541,7 +541,7 @@ std::unique_ptr SearchableTableEmitter::parseSearchIndex(GenericTable &Table, StringRef Name, const std::vector &Key, bool EarlyOut) { - auto Index = llvm::make_unique(); + auto Index = std::make_unique(); Index->Name = Name; Index->EarlyOut = EarlyOut; @@ -577,7 +577,7 @@ void SearchableTableEmitter::collectEnumEntries( if (!ValueField.empty()) Value = getInt(EntryRec, ValueField); - Enum.Entries.push_back(llvm::make_unique(Name, Value)); + Enum.Entries.push_back(std::make_unique(Name, Value)); Enum.EntryMap.insert(std::make_pair(EntryRec, Enum.Entries.back().get())); } @@ -647,7 +647,7 @@ void SearchableTableEmitter::run(raw_ostream &OS) { if (!EnumRec->isValueUnset("ValueField")) ValueField = EnumRec->getValueAsString("ValueField"); - auto Enum = llvm::make_unique(); + auto Enum = std::make_unique(); Enum->Name = EnumRec->getName(); Enum->PreprocessorGuard = EnumRec->getName(); @@ -664,7 +664,7 @@ void SearchableTableEmitter::run(raw_ostream &OS) { } for (auto TableRec : Records.getAllDerivedDefinitions("GenericTable")) { - auto Table = llvm::make_unique(); + auto Table = std::make_unique(); Table->Name = TableRec->getName(); Table->PreprocessorGuard = TableRec->getName(); Table->CppTypeName = TableRec->getValueAsString("CppTypeName"); @@ -733,7 +733,7 @@ void SearchableTableEmitter::run(raw_ostream &OS) { if (!Class->isValueUnset("EnumValueField")) ValueField = Class->getValueAsString("EnumValueField"); - auto Enum = llvm::make_unique(); + auto Enum = std::make_unique(); Enum->Name = (Twine(Class->getName()) + "Values").str(); Enum->PreprocessorGuard = Class->getName().upper(); Enum->Class = Class; @@ -743,7 +743,7 @@ void SearchableTableEmitter::run(raw_ostream &OS) { Enums.emplace_back(std::move(Enum)); } - auto Table = llvm::make_unique(); + auto Table = std::make_unique(); Table->Name = (Twine(Class->getName()) + "sList").str(); Table->PreprocessorGuard = Class->getName().upper(); Table->CppTypeName = Class->getName(); diff --git a/utils/TableGen/SubtargetEmitter.cpp b/utils/TableGen/SubtargetEmitter.cpp index 9ce2b3b275c8..9b094adb7d5c 100644 --- a/utils/TableGen/SubtargetEmitter.cpp +++ b/utils/TableGen/SubtargetEmitter.cpp @@ -1057,6 +1057,7 @@ void SubtargetEmitter::GenSchedClassTables(const CodeGenProcModel &ProcModel, LLVM_DEBUG(dbgs() << ProcModel.ModelName << " does not have resources for class " << SC.Name << '\n'); + SCDesc.NumMicroOps = MCSchedClassDesc::InvalidNumMicroOps; } } // Sum resources across all operand writes. 
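// ---[ Editorial aside: consumer-side sketch, not part of this patch ]--------
// Marking the class with InvalidNumMicroOps makes it look invalid to clients
// of the generated scheduling model, so a typical consumer guards on it
// (sketch; SchedModel and SchedClass are assumed in scope):
//
//   const MCSchedClassDesc *SCDesc = SchedModel.getSchedClassDesc(SchedClass);
//   if (!SCDesc->isValid())
//     return;   // no resource data for this opcode on this processor model
// ----------------------------------------------------------------------------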
@@ -1728,7 +1729,7 @@ void SubtargetEmitter::emitGenMCSubtargetInfo(raw_ostream &OS) { << " const MCInst *MI, unsigned CPUID) {\n"; emitSchedModelHelpersImpl(OS, /* OnlyExpandMCPredicates */ true); OS << "}\n"; - OS << "} // end of namespace " << Target << "_MC\n\n"; + OS << "} // end namespace " << Target << "_MC\n\n"; OS << "struct " << Target << "GenMCSubtargetInfo : public MCSubtargetInfo {\n"; @@ -1746,7 +1747,10 @@ void SubtargetEmitter::emitGenMCSubtargetInfo(raw_ostream &OS) { << " return " << Target << "_MC" << "::resolveVariantSchedClassImpl(SchedClass, MI, CPUID); \n"; OS << " }\n"; + if (TGT.getHwModes().getNumModeIds() > 1) + OS << " unsigned getHwMode() const override;\n"; OS << "};\n"; + EmitHwModeCheck(Target + "GenMCSubtargetInfo", OS); } void SubtargetEmitter::EmitMCInstrAnalysisPredicateFunctions(raw_ostream &OS) { @@ -1858,7 +1862,7 @@ void SubtargetEmitter::run(raw_ostream &OS) { OS << "namespace " << Target << "_MC {\n" << "unsigned resolveVariantSchedClassImpl(unsigned SchedClass," << " const MCInst *MI, unsigned CPUID);\n" - << "}\n\n"; + << "} // end namespace " << Target << "_MC\n\n"; OS << "struct " << ClassName << " : public TargetSubtargetInfo {\n" << " explicit " << ClassName << "(const Triple &TT, StringRef CPU, " << "StringRef FS);\n" diff --git a/utils/TableGen/SubtargetFeatureInfo.cpp b/utils/TableGen/SubtargetFeatureInfo.cpp index edf0b4a01c6d..5430f73d5e09 100644 --- a/utils/TableGen/SubtargetFeatureInfo.cpp +++ b/utils/TableGen/SubtargetFeatureInfo.cpp @@ -38,6 +38,10 @@ SubtargetFeatureInfo::getAll(const RecordKeeper &Records) { if (Pred->getName().empty()) PrintFatalError(Pred->getLoc(), "Predicate has no name!"); + // Ignore always true predicates. + if (Pred->getValueAsString("CondString").empty()) + continue; + SubtargetFeatures.emplace_back( Pred, SubtargetFeatureInfo(Pred, SubtargetFeatures.size())); } @@ -95,9 +99,11 @@ void SubtargetFeatureInfo::emitComputeAvailableFeatures( OS << " PredicateBitset Features;\n"; for (const auto &SF : SubtargetFeatures) { const SubtargetFeatureInfo &SFI = SF.second; + StringRef CondStr = SFI.TheDef->getValueAsString("CondString"); + assert(!CondStr.empty() && "true predicate should have been filtered"); - OS << " if (" << SFI.TheDef->getValueAsString("CondString") << ")\n"; - OS << " Features[" << SFI.getEnumBitName() << "] = 1;\n"; + OS << " if (" << CondStr << ")\n"; + OS << " Features.set(" << SFI.getEnumBitName() << ");\n"; } OS << " return Features;\n"; OS << "}\n\n"; @@ -142,7 +148,7 @@ void SubtargetFeatureInfo::emitComputeAssemblerAvailableFeatures( } while (true); OS << ")\n"; - OS << " Features[" << SFI.getEnumBitName() << "] = 1;\n"; + OS << " Features.set(" << SFI.getEnumBitName() << ");\n"; } OS << " return Features;\n"; OS << "}\n\n"; diff --git a/utils/TableGen/TableGen.cpp b/utils/TableGen/TableGen.cpp index c485ed2feb7a..f730d91160ad 100644 --- a/utils/TableGen/TableGen.cpp +++ b/utils/TableGen/TableGen.cpp @@ -49,10 +49,12 @@ enum ActionType { GenAttributes, GenSearchableTables, GenGlobalISel, + GenGICombiner, GenX86EVEX2VEXTables, GenX86FoldTables, GenRegisterBank, GenExegesis, + GenAutomata, }; namespace llvm { @@ -62,75 +64,75 @@ bool TimeRegions = false; } // end namespace llvm namespace { - cl::opt - Action(cl::desc("Action to perform:"), - cl::values(clEnumValN(PrintRecords, "print-records", - "Print all records to stdout (default)"), - clEnumValN(DumpJSON, "dump-json", - "Dump all records as machine-readable JSON"), - clEnumValN(GenEmitter, "gen-emitter", - "Generate machine code 
emitter"), - clEnumValN(GenRegisterInfo, "gen-register-info", - "Generate registers and register classes info"), - clEnumValN(GenInstrInfo, "gen-instr-info", - "Generate instruction descriptions"), - clEnumValN(GenInstrDocs, "gen-instr-docs", - "Generate instruction documentation"), - clEnumValN(GenCallingConv, "gen-callingconv", - "Generate calling convention descriptions"), - clEnumValN(GenAsmWriter, "gen-asm-writer", - "Generate assembly writer"), - clEnumValN(GenDisassembler, "gen-disassembler", - "Generate disassembler"), - clEnumValN(GenPseudoLowering, "gen-pseudo-lowering", - "Generate pseudo instruction lowering"), - clEnumValN(GenCompressInst, "gen-compress-inst-emitter", - "Generate RISCV compressed instructions."), - clEnumValN(GenAsmMatcher, "gen-asm-matcher", - "Generate assembly instruction matcher"), - clEnumValN(GenDAGISel, "gen-dag-isel", - "Generate a DAG instruction selector"), - clEnumValN(GenDFAPacketizer, "gen-dfa-packetizer", - "Generate DFA Packetizer for VLIW targets"), - clEnumValN(GenFastISel, "gen-fast-isel", - "Generate a \"fast\" instruction selector"), - clEnumValN(GenSubtarget, "gen-subtarget", - "Generate subtarget enumerations"), - clEnumValN(GenIntrinsicEnums, "gen-intrinsic-enums", - "Generate intrinsic enums"), - clEnumValN(GenIntrinsicImpl, "gen-intrinsic-impl", - "Generate intrinsic information"), - clEnumValN(GenTgtIntrinsicEnums, "gen-tgt-intrinsic-enums", - "Generate target intrinsic enums"), - clEnumValN(GenTgtIntrinsicImpl, "gen-tgt-intrinsic-impl", - "Generate target intrinsic information"), - clEnumValN(PrintEnums, "print-enums", - "Print enum values for a class"), - clEnumValN(PrintSets, "print-sets", - "Print expanded sets for testing DAG exprs"), - clEnumValN(GenOptParserDefs, "gen-opt-parser-defs", - "Generate option definitions"), - clEnumValN(GenCTags, "gen-ctags", - "Generate ctags-compatible index"), - clEnumValN(GenAttributes, "gen-attrs", - "Generate attributes"), - clEnumValN(GenSearchableTables, "gen-searchable-tables", - "Generate generic binary-searchable table"), - clEnumValN(GenGlobalISel, "gen-global-isel", - "Generate GlobalISel selector"), - clEnumValN(GenX86EVEX2VEXTables, "gen-x86-EVEX2VEX-tables", - "Generate X86 EVEX to VEX compress tables"), - clEnumValN(GenX86FoldTables, "gen-x86-fold-tables", - "Generate X86 fold tables"), - clEnumValN(GenRegisterBank, "gen-register-bank", - "Generate registers bank descriptions"), - clEnumValN(GenExegesis, "gen-exegesis", - "Generate llvm-exegesis tables"))); +cl::opt Action( + cl::desc("Action to perform:"), + cl::values( + clEnumValN(PrintRecords, "print-records", + "Print all records to stdout (default)"), + clEnumValN(DumpJSON, "dump-json", + "Dump all records as machine-readable JSON"), + clEnumValN(GenEmitter, "gen-emitter", "Generate machine code emitter"), + clEnumValN(GenRegisterInfo, "gen-register-info", + "Generate registers and register classes info"), + clEnumValN(GenInstrInfo, "gen-instr-info", + "Generate instruction descriptions"), + clEnumValN(GenInstrDocs, "gen-instr-docs", + "Generate instruction documentation"), + clEnumValN(GenCallingConv, "gen-callingconv", + "Generate calling convention descriptions"), + clEnumValN(GenAsmWriter, "gen-asm-writer", "Generate assembly writer"), + clEnumValN(GenDisassembler, "gen-disassembler", + "Generate disassembler"), + clEnumValN(GenPseudoLowering, "gen-pseudo-lowering", + "Generate pseudo instruction lowering"), + clEnumValN(GenCompressInst, "gen-compress-inst-emitter", + "Generate RISCV compressed instructions."), + 
clEnumValN(GenAsmMatcher, "gen-asm-matcher", + "Generate assembly instruction matcher"), + clEnumValN(GenDAGISel, "gen-dag-isel", + "Generate a DAG instruction selector"), + clEnumValN(GenDFAPacketizer, "gen-dfa-packetizer", + "Generate DFA Packetizer for VLIW targets"), + clEnumValN(GenFastISel, "gen-fast-isel", + "Generate a \"fast\" instruction selector"), + clEnumValN(GenSubtarget, "gen-subtarget", + "Generate subtarget enumerations"), + clEnumValN(GenIntrinsicEnums, "gen-intrinsic-enums", + "Generate intrinsic enums"), + clEnumValN(GenIntrinsicImpl, "gen-intrinsic-impl", + "Generate intrinsic information"), + clEnumValN(GenTgtIntrinsicEnums, "gen-tgt-intrinsic-enums", + "Generate target intrinsic enums"), + clEnumValN(GenTgtIntrinsicImpl, "gen-tgt-intrinsic-impl", + "Generate target intrinsic information"), + clEnumValN(PrintEnums, "print-enums", "Print enum values for a class"), + clEnumValN(PrintSets, "print-sets", + "Print expanded sets for testing DAG exprs"), + clEnumValN(GenOptParserDefs, "gen-opt-parser-defs", + "Generate option definitions"), + clEnumValN(GenCTags, "gen-ctags", "Generate ctags-compatible index"), + clEnumValN(GenAttributes, "gen-attrs", "Generate attributes"), + clEnumValN(GenSearchableTables, "gen-searchable-tables", + "Generate generic binary-searchable table"), + clEnumValN(GenGlobalISel, "gen-global-isel", + "Generate GlobalISel selector"), + clEnumValN(GenGICombiner, "gen-global-isel-combiner", + "Generate GlobalISel combiner"), + clEnumValN(GenX86EVEX2VEXTables, "gen-x86-EVEX2VEX-tables", + "Generate X86 EVEX to VEX compress tables"), + clEnumValN(GenX86FoldTables, "gen-x86-fold-tables", + "Generate X86 fold tables"), + clEnumValN(GenRegisterBank, "gen-register-bank", + "Generate registers bank descriptions"), + clEnumValN(GenExegesis, "gen-exegesis", + "Generate llvm-exegesis tables"), + clEnumValN(GenAutomata, "gen-automata", + "Generate generic automata"))); - cl::OptionCategory PrintEnumsCat("Options for -print-enums"); - cl::opt - Class("class", cl::desc("Print Enum list for this class"), - cl::value_desc("class name"), cl::cat(PrintEnumsCat)); +cl::OptionCategory PrintEnumsCat("Options for -print-enums"); +cl::opt Class("class", cl::desc("Print Enum list for this class"), + cl::value_desc("class name"), + cl::cat(PrintEnumsCat)); cl::opt TimeRegionsOpt("time-regions", @@ -235,6 +237,9 @@ bool LLVMTableGenMain(raw_ostream &OS, RecordKeeper &Records) { case GenGlobalISel: EmitGlobalISel(Records, OS); break; + case GenGICombiner: + EmitGICombiner(Records, OS); + break; case GenRegisterBank: EmitRegisterBank(Records, OS); break; @@ -247,6 +252,9 @@ bool LLVMTableGenMain(raw_ostream &OS, RecordKeeper &Records) { case GenExegesis: EmitExegesis(Records, OS); break; + case GenAutomata: + EmitAutomata(Records, OS); + break; } return false; @@ -263,11 +271,16 @@ int main(int argc, char **argv) { return TableGenMain(argv[0], &LLVMTableGenMain); } -#ifdef __has_feature -#if __has_feature(address_sanitizer) +#ifndef __has_feature +#define __has_feature(x) 0 +#endif + +#if __has_feature(address_sanitizer) || defined(__SANITIZE_ADDRESS__) || \ + __has_feature(leak_sanitizer) + #include // Disable LeakSanitizer for this binary as it has too many leaks that are not // very interesting to fix. See compiler-rt/include/sanitizer/lsan_interface.h . 
LLVM_ATTRIBUTE_USED int __lsan_is_turned_off() { return 1; } -#endif // __has_feature(address_sanitizer) -#endif // defined(__has_feature) + +#endif diff --git a/utils/TableGen/TableGenBackends.h b/utils/TableGen/TableGenBackends.h index 135ec65c0f95..8c067dd51b3b 100644 --- a/utils/TableGen/TableGenBackends.h +++ b/utils/TableGen/TableGenBackends.h @@ -85,10 +85,12 @@ void EmitCTags(RecordKeeper &RK, raw_ostream &OS); void EmitAttributes(RecordKeeper &RK, raw_ostream &OS); void EmitSearchableTables(RecordKeeper &RK, raw_ostream &OS); void EmitGlobalISel(RecordKeeper &RK, raw_ostream &OS); +void EmitGICombiner(RecordKeeper &RK, raw_ostream &OS); void EmitX86EVEX2VEXTables(RecordKeeper &RK, raw_ostream &OS); void EmitX86FoldTables(RecordKeeper &RK, raw_ostream &OS); void EmitRegisterBank(RecordKeeper &RK, raw_ostream &OS); void EmitExegesis(RecordKeeper &RK, raw_ostream &OS); +void EmitAutomata(RecordKeeper &RK, raw_ostream &OS); } // End llvm namespace diff --git a/utils/TableGen/WebAssemblyDisassemblerEmitter.cpp b/utils/TableGen/WebAssemblyDisassemblerEmitter.cpp index 365cba5a60ca..54aa5a8164f2 100644 --- a/utils/TableGen/WebAssemblyDisassemblerEmitter.cpp +++ b/utils/TableGen/WebAssemblyDisassemblerEmitter.cpp @@ -167,7 +167,7 @@ void emitWebAssemblyDisassemblerTables( OS << " },\n"; } OS << " { 0, nullptr }\n};\n\n"; - OS << "} // End llvm namespace\n"; + OS << "} // end namespace llvm\n"; } } // namespace llvm diff --git a/utils/TableGen/X86DisassemblerTables.cpp b/utils/TableGen/X86DisassemblerTables.cpp index 8036aecc4f4b..14bce4c29446 100644 --- a/utils/TableGen/X86DisassemblerTables.cpp +++ b/utils/TableGen/X86DisassemblerTables.cpp @@ -651,7 +651,7 @@ static const char* stringForDecisionType(ModRMDecisionType dt) { DisassemblerTables::DisassemblerTables() { for (unsigned i = 0; i < array_lengthof(Tables); i++) - Tables[i] = llvm::make_unique(); + Tables[i] = std::make_unique(); HasConflicts = false; } diff --git a/utils/TableGen/X86EVEX2VEXTablesEmitter.cpp b/utils/TableGen/X86EVEX2VEXTablesEmitter.cpp index 3df14f40e4a9..6dc7e31e0dab 100644 --- a/utils/TableGen/X86EVEX2VEXTablesEmitter.cpp +++ b/utils/TableGen/X86EVEX2VEXTablesEmitter.cpp @@ -98,6 +98,7 @@ public: bool EVEX_W1_VEX_W0 = RecE->getValueAsBit("EVEX_W1_VEX_W0"); if (RecV->getValueAsDef("OpEnc")->getName().str() != "EncVEX" || + RecV->getValueAsBit("isCodeGenOnly") != RecE->getValueAsBit("isCodeGenOnly") || // VEX/EVEX fields RecV->getValueAsDef("OpPrefix") != RecE->getValueAsDef("OpPrefix") || RecV->getValueAsDef("OpMap") != RecE->getValueAsDef("OpMap") || diff --git a/utils/TableGen/X86RecognizableInstr.cpp b/utils/TableGen/X86RecognizableInstr.cpp index ab8a8855c478..33dc6f3f9e23 100644 --- a/utils/TableGen/X86RecognizableInstr.cpp +++ b/utils/TableGen/X86RecognizableInstr.cpp @@ -749,7 +749,7 @@ void RecognizableInstr::emitDecodePath(DisassemblerTables &tables) const { case X86Local::RawFrmImm8: case X86Local::RawFrmImm16: case X86Local::AddCCFrm: - filter = llvm::make_unique(); + filter = std::make_unique(); break; case X86Local::MRMDestReg: case X86Local::MRMSrcReg: @@ -758,7 +758,7 @@ void RecognizableInstr::emitDecodePath(DisassemblerTables &tables) const { case X86Local::MRMSrcRegCC: case X86Local::MRMXrCC: case X86Local::MRMXr: - filter = llvm::make_unique(true); + filter = std::make_unique(true); break; case X86Local::MRMDestMem: case X86Local::MRMSrcMem: @@ -767,22 +767,22 @@ void RecognizableInstr::emitDecodePath(DisassemblerTables &tables) const { case X86Local::MRMSrcMemCC: case X86Local::MRMXmCC: case 
X86Local::MRMXm: - filter = llvm::make_unique(false); + filter = std::make_unique(false); break; case X86Local::MRM0r: case X86Local::MRM1r: case X86Local::MRM2r: case X86Local::MRM3r: case X86Local::MRM4r: case X86Local::MRM5r: case X86Local::MRM6r: case X86Local::MRM7r: - filter = llvm::make_unique(true, Form - X86Local::MRM0r); + filter = std::make_unique(true, Form - X86Local::MRM0r); break; case X86Local::MRM0m: case X86Local::MRM1m: case X86Local::MRM2m: case X86Local::MRM3m: case X86Local::MRM4m: case X86Local::MRM5m: case X86Local::MRM6m: case X86Local::MRM7m: - filter = llvm::make_unique(false, Form - X86Local::MRM0m); + filter = std::make_unique(false, Form - X86Local::MRM0m); break; X86_INSTR_MRM_MAPPING - filter = llvm::make_unique(0xC0 + Form - X86Local::MRM_C0); + filter = std::make_unique(0xC0 + Form - X86Local::MRM_C0); break; } // switch (Form) @@ -854,6 +854,7 @@ OperandType RecognizableInstr::typeFromString(const std::string &s, TYPE("GR64", TYPE_R64) TYPE("i8mem", TYPE_M) TYPE("i8imm", TYPE_IMM) + TYPE("u4imm", TYPE_UIMM8) TYPE("u8imm", TYPE_UIMM8) TYPE("i16u8imm", TYPE_UIMM8) TYPE("i32u8imm", TYPE_UIMM8) @@ -973,6 +974,7 @@ RecognizableInstr::immediateEncodingFromString(const std::string &s, ENCODING("i64i32imm", ENCODING_ID) ENCODING("i64i8imm", ENCODING_IB) ENCODING("i8imm", ENCODING_IB) + ENCODING("u4imm", ENCODING_IB) ENCODING("u8imm", ENCODING_IB) ENCODING("i16u8imm", ENCODING_IB) ENCODING("i32u8imm", ENCODING_IB) diff --git a/utils/add_argument_names.py b/utils/add_argument_names.py new file mode 100755 index 000000000000..38dde2599794 --- /dev/null +++ b/utils/add_argument_names.py @@ -0,0 +1,82 @@ +#!/usr/bin/env python3 +import re, sys + +def fix_string(s): + TYPE = re.compile('\s*(i[0-9]+|float|double|x86_fp80|fp128|ppc_fp128|\[\[.*?\]\]|\[2 x \[\[[A-Z_0-9]+\]\]\]|<.*?>|{.*?}|\[[0-9]+ x .*?\]|%["a-z:A-Z0-9._]+({{.*?}})?|%{{.*?}}|{{.*?}}|\[\[.*?\]\])(\s*(\*|addrspace\(.*?\)|dereferenceable\(.*?\)|byval\(.*?\)|sret|zeroext|inreg|returned|signext|nocapture|align \d+|swiftself|swifterror|readonly|noalias|inalloca|nocapture))*\s*') + + counter = 0 + if 'i32{{.*}}' in s: + counter = 1 + + at_pos = s.find('@') + if at_pos == -1: + at_pos = 0 + + annoying_pos = s.find('{{[^(]+}}') + if annoying_pos != -1: + at_pos = annoying_pos + 9 + + paren_pos = s.find('(', at_pos) + if paren_pos == -1: + return s + + res = s[:paren_pos+1] + s = s[paren_pos+1:] + + m = TYPE.match(s) + while m: + res += m.group() + s = s[m.end():] + if s.startswith(',') or s.startswith(')'): + res += f' %{counter}' + counter += 1 + + next_arg = s.find(',') + if next_arg == -1: + break + + res += s[:next_arg+1] + s = s[next_arg+1:] + m = TYPE.match(s) + + return res+s + +def process_file(contents): + PREFIX = re.compile(r'check-prefix(es)?(=|\s+)([a-zA-Z0-9,]+)') + check_prefixes = ['CHECK'] + result = '' + for line in contents.split('\n'): + if 'FileCheck' in line: + m = PREFIX.search(line) + if m: + check_prefixes.extend(m.group(3).split(',')) + + found_check = False + for prefix in check_prefixes: + if prefix in line: + found_check = True + break + + if not found_check or 'define' not in line: + result += line + '\n' + continue + + # We have a check for a function definition. Number the args. 
+ line = fix_string(line) + result += line + '\n' + return result + +def main(): + print(f'Processing {sys.argv[1]}') + f = open(sys.argv[1]) + content = f.read() + f.close() + + content = process_file(content) + + f = open(sys.argv[1], 'w') + f.write(content) + f.close() + +if __name__ == '__main__': + main() diff --git a/utils/llvm-locstats/CMakeLists.txt b/utils/llvm-locstats/CMakeLists.txt new file mode 100644 index 000000000000..a919023e141e --- /dev/null +++ b/utils/llvm-locstats/CMakeLists.txt @@ -0,0 +1,12 @@ +if (LLVM_BUILD_UTILS AND LLVM_BUILD_TOOLS) + add_custom_command( + OUTPUT ${LLVM_TOOLS_BINARY_DIR}/llvm-locstats + DEPENDS ${LLVM_MAIN_SRC_DIR}/utils/llvm-locstats/llvm-locstats.py + COMMAND ${CMAKE_COMMAND} -E copy ${LLVM_MAIN_SRC_DIR}/utils/llvm-locstats/llvm-locstats.py ${LLVM_TOOLS_BINARY_DIR}/llvm-locstats + COMMENT "Copying llvm-locstats into ${LLVM_TOOLS_BINARY_DIR}" + ) + add_custom_target(llvm-locstats ALL + DEPENDS ${LLVM_TOOLS_BINARY_DIR}/llvm-locstats + ) + set_target_properties(llvm-locstats PROPERTIES FOLDER "Tools") +endif() diff --git a/utils/llvm-locstats/llvm-locstats.py b/utils/llvm-locstats/llvm-locstats.py new file mode 100755 index 000000000000..4df525ed1a96 --- /dev/null +++ b/utils/llvm-locstats/llvm-locstats.py @@ -0,0 +1,209 @@ +#!/usr/bin/env python +# +# This is a tool that works like debug location coverage calculator. +# It parses the llvm-dwarfdump --statistics output by reporting it +# in a more human readable way. +# + +from __future__ import print_function +import argparse +import os +import sys +from json import loads +from math import ceil +from subprocess import Popen, PIPE + +def coverage_buckets(): + yield '0%' + yield '1-9%' + for start in range(10, 91, 10): + yield '{0}-{1}%'.format(start, start + 9) + yield '100%' + +def locstats_output( + variables_total, + variables_total_locstats, + variables_with_loc, + scope_bytes_covered, + scope_bytes_from_first_def, + variables_coverage_map + ): + + pc_ranges_covered = int(ceil(scope_bytes_covered * 100.0) + / scope_bytes_from_first_def) + variables_coverage_per_map = {} + for cov_bucket in coverage_buckets(): + variables_coverage_per_map[cov_bucket] = \ + int(ceil(variables_coverage_map[cov_bucket] * 100.0) \ + / variables_total_locstats) + + print (' =================================================') + print (' Debug Location Statistics ') + print (' =================================================') + print (' cov% samples percentage(~) ') + print (' -------------------------------------------------') + for cov_bucket in coverage_buckets(): + print (' {0:6} {1:8d} {2:3d}%'. \ + format(cov_bucket, variables_coverage_map[cov_bucket], \ + variables_coverage_per_map[cov_bucket])) + print (' =================================================') + print (' -the number of debug variables processed: ' \ + + str(variables_total_locstats)) + print (' -PC ranges covered: ' + str(pc_ranges_covered) + '%') + + # Only if we are processing all the variables output the total + # availability. 
+ if variables_total and variables_with_loc: + total_availability = int(ceil(variables_with_loc * 100.0) \ + / variables_total) + print (' -------------------------------------------------') + print (' -total availability: ' + str(total_availability) + '%') + print (' =================================================') + +def parse_program_args(parser): + parser.add_argument('-only-variables', action='store_true', + default=False, + help='calculate the location statistics only for ' + 'local variables' + ) + parser.add_argument('-only-formal-parameters', action='store_true', + default=False, + help='calculate the location statistics only for ' + 'formal parameters' + ) + parser.add_argument('-ignore-debug-entry-values', action='store_true', + default=False, + help='ignore the location statistics on locations with ' + 'entry values' + ) + parser.add_argument('file_name', type=str, help='file to process') + return parser.parse_args() + + +def Main(): + parser = argparse.ArgumentParser() + results = parse_program_args(parser) + + if len(sys.argv) < 2: + print ('error: Too few arguments.') + parser.print_help() + sys.exit(1) + + if results.only_variables and results.only_formal_parameters: + print ('error: Please use just one only* option.') + parser.print_help() + sys.exit(1) + + # These will be different due to different options enabled. + variables_total = None + variables_total_locstats = None + variables_with_loc = None + variables_scope_bytes_covered = None + variables_scope_bytes_from_first_def = None + variables_scope_bytes_entry_values = None + variables_coverage_map = {} + binary = results.file_name + + # Get the directory of the LLVM tools. + llvm_dwarfdump_cmd = os.path.join(os.path.dirname(__file__), \ + "llvm-dwarfdump") + # The statistics llvm-dwarfdump option. + llvm_dwarfdump_stats_opt = "--statistics" + + subproc = Popen([llvm_dwarfdump_cmd, llvm_dwarfdump_stats_opt, binary], \ + stdin=PIPE, stdout=PIPE, stderr=PIPE, \ + universal_newlines = True) + cmd_stdout, cmd_stderr = subproc.communicate() + + # Get the JSON and parse it. + json_parsed = None + + try: + json_parsed = loads(cmd_stdout) + except: + print ('error: No valid llvm-dwarfdump statistics found.') + sys.exit(1) + + if results.only_variables: + # Read the JSON only for local variables. + variables_total_locstats = \ + json_parsed['total vars procesed by location statistics'] + variables_scope_bytes_covered = \ + json_parsed['vars scope bytes covered'] + variables_scope_bytes_from_first_def = \ + json_parsed['vars scope bytes total'] + if not results.ignore_debug_entry_values: + for cov_bucket in coverage_buckets(): + cov_category = "vars with {} of its scope covered".format(cov_bucket) + variables_coverage_map[cov_bucket] = json_parsed[cov_category] + else: + variables_scope_bytes_entry_values = \ + json_parsed['vars entry value scope bytes covered'] + variables_scope_bytes_covered = variables_scope_bytes_covered \ + - variables_scope_bytes_entry_values + for cov_bucket in coverage_buckets(): + cov_category = \ + "vars (excluding the debug entry values) " \ + "with {} of its scope covered".format(cov_bucket) + variables_coverage_map[cov_bucket] = json_parsed[cov_category] + elif results.only_formal_parameters: + # Read the JSON only for formal parameters. 
+ variables_total_locstats = \ + json_parsed['total params procesed by location statistics'] + variables_scope_bytes_covered = \ + json_parsed['formal params scope bytes covered'] + variables_scope_bytes_from_first_def = \ + json_parsed['formal params scope bytes total'] + if not results.ignore_debug_entry_values: + for cov_bucket in coverage_buckets(): + cov_category = "params with {} of its scope covered".format(cov_bucket) + variables_coverage_map[cov_bucket] = json_parsed[cov_category] + else: + variables_scope_bytes_entry_values = \ + json_parsed['formal params entry value scope bytes covered'] + variables_scope_bytes_covered = variables_scope_bytes_covered \ + - variables_scope_bytes_entry_values + for cov_bucket in coverage_buckets(): + cov_category = \ + "params (excluding the debug entry values) " \ + "with {} of its scope covered".format(cov_bucket) + variables_coverage_map[cov_bucket] = json_parsed[cov_category] + else: + # Read the JSON for both local variables and formal parameters. + variables_total = \ + json_parsed['source variables'] + variables_with_loc = json_parsed['variables with location'] + variables_total_locstats = \ + json_parsed['total variables procesed by location statistics'] + variables_scope_bytes_covered = \ + json_parsed['scope bytes covered'] + variables_scope_bytes_from_first_def = \ + json_parsed['scope bytes total'] + if not results.ignore_debug_entry_values: + for cov_bucket in coverage_buckets(): + cov_category = "variables with {} of its scope covered". \ + format(cov_bucket) + variables_coverage_map[cov_bucket] = json_parsed[cov_category] + else: + variables_scope_bytes_entry_values = \ + json_parsed['entry value scope bytes covered'] + variables_scope_bytes_covered = variables_scope_bytes_covered \ + - variables_scope_bytes_entry_values + for cov_bucket in coverage_buckets(): + cov_category = "variables (excluding the debug entry values) " \ + "with {} of its scope covered". format(cov_bucket) + variables_coverage_map[cov_bucket] = json_parsed[cov_category] + + # Pretty print collected info. + locstats_output( + variables_total, + variables_total_locstats, + variables_with_loc, + variables_scope_bytes_covered, + variables_scope_bytes_from_first_def, + variables_coverage_map + ) + +if __name__ == '__main__': + Main() + sys.exit(0) -- cgit v1.2.3
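For readers skimming the new llvm-locstats.py above, a minimal standalone sketch of its core arithmetic follows. This is not part of the patch; the helper name and sample counts are illustrative only, but the formula mirrors the one used in locstats_output(), where each coverage bucket's sample count from the llvm-dwarfdump --statistics JSON is scaled against the total number of variables processed.

    #!/usr/bin/env python3
    # Illustrative sketch (assumed, not from the patch): reproduce the bucket
    # percentage arithmetic llvm-locstats.py applies to the per-bucket sample
    # counts parsed from `llvm-dwarfdump --statistics` JSON output.
    from math import ceil

    def bucket_percentages(coverage_map, total_vars):
        # Mirrors locstats_output(): int(ceil(count * 100.0) / total_vars)
        return {bucket: int(ceil(count * 100.0) / total_vars)
                for bucket, count in coverage_map.items()}

    if __name__ == '__main__':
        # Hypothetical sample counts for three of the buckets the tool prints.
        print(bucket_percentages({'0%': 3, '90-99%': 12, '100%': 85}, 120))
        # -> {'0%': 2, '90-99%': 10, '100%': 70}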